Merge branch 'users/chapuni/cov/single/nextcount-base' into users/chapuni/cov/single/nextcountusers/chapuni/cov/single/nextcount

author: NAKAMURA Takumi <geek4civic@gmail.com> 2025-01-09 17:50:40 +0900
committer: NAKAMURA Takumi <geek4civic@gmail.com> 2025-01-09 17:50:40 +0900
commit: fea7da1b00cc97d742faede2df96c7d327950f49 (patch)
tree: 4de1d6b4ddc69f4f32daabb11ad5c71ab0cf895e
parent: 9b99dde0d47102625d93c5d1cbbc04951025a6c9 (diff)
parent: 0aa930a41f2d1ebf1fa90ec42da8f96d15a4dcbb (diff)
download: llvm-users/chapuni/cov/single/nextcount.zip
llvm-users/chapuni/cov/single/nextcount.tar.gz
llvm-users/chapuni/cov/single/nextcount.tar.bz2
2818 files changed, 115253 insertions, 49811 deletions
diff --git a/.github/new-prs-labeler.yml b/.github/new-prs-labeler.yml
index 0aa05cd..566308b 100644
--- a/.github/new-prs-labeler.yml
+++ b/.github/new-prs-labeler.yml
@@ -661,6 +661,11 @@ backend:DirectX:
 
 backend:SPIR-V:
   - clang/lib/Driver/ToolChains/SPIRV.*
+  - clang/lib/Sema/SemaSPIRV.cpp
+  - clang/include/clang/Sema/SemaSPIRV.h
+  - clang/include/clang/Basic/BuiltinsSPIRV.td
+  - clang/test/CodeGenSPIRV/**
+  - clang/test/SemaSPIRV/**
   - llvm/lib/Target/SPIRV/**
   - llvm/test/CodeGen/SPIRV/**
   - llvm/test/Frontend/HLSL/**
diff --git a/.github/workflows/build-ci-container.yml b/.github/workflows/build-ci-container.yml
index 50729e0..4fa0713 100644
--- a/.github/workflows/build-ci-container.yml
+++ b/.github/workflows/build-ci-container.yml
@@ -59,8 +59,9 @@ jobs:
 
       - name: Test Container
         run: |
-          for image in ${{ steps.vars.outputs.container-name-tag }} ${{  steps.vars.outputs.container-name }}; do
-            podman run --rm -it $image /usr/bin/bash -x -c 'cd $HOME && printf '\''#include <iostream>\nint main(int argc, char **argv) { std::cout << "Hello\\n"; }'\'' | clang++ -x c++ - && ./a.out | grep Hello'
+          for image in ${{ steps.vars.outputs.container-name-tag }}; do
+            # Use --pull=never to ensure we are testing the just built image.
+            podman run --pull=never --rm -it $image /usr/bin/bash -x -c 'cd $HOME && printf '\''#include <iostream>\nint main(int argc, char **argv) { std::cout << "Hello\\n"; }'\'' | clang++ -x c++ - && ./a.out | grep Hello'
           done
 
   push-ci-container:
diff --git a/.github/workflows/commit-access-review.py b/.github/workflows/commit-access-review.py
index 91d3a61..4f539fe 100644
--- a/.github/workflows/commit-access-review.py
+++ b/.github/workflows/commit-access-review.py
@@ -67,39 +67,47 @@ def check_manual_requests(
 ) -> list[str]:
     """
     Return a list of users who have been asked since ``start_date`` if they
-    want to keep their commit access.
+    want to keep their commit access or if they have applied for commit
+    access since ``start_date``
     """
+
     query = """
-        query ($query: String!) {
-          search(query: $query, type: ISSUE, first: 100) {
+        query ($query: String!, $after: String) {
+          search(query: $query, type: ISSUE, first: 100, after: $after) {
             nodes {
               ... on Issue {
-                body
-                comments (first: 100) {
-                  nodes {
-                    author {
-                      login
-                    }
-                  }
+                author {
+                  login
                 }
+                body
               }
             }
+            pageInfo {
+              hasNextPage
+              endCursor
+            }
           }
         }
         """
     formatted_start_date = start_date.strftime("%Y-%m-%dT%H:%M:%S")
     variables = {
-        "query": f"type:issue created:>{formatted_start_date} org:llvm repo:llvm-project label:infra:commit-access"
+        "query": f"type:issue created:>{formatted_start_date} org:llvm repo:llvm-project label:infra:commit-access,infra:commit-access-request"
     }
 
-    res_header, res_data = gh._Github__requester.graphql_query(
-        query=query, variables=variables
-    )
-    data = res_data["data"]
+    has_next_page = True
     users = []
-    for issue in data["search"]["nodes"]:
-        users.extend([user[1:] for user in re.findall("@[^ ,\n]+", issue["body"])])
-
+    while has_next_page:
+        res_header, res_data = gh._Github__requester.graphql_query(
+            query=query, variables=variables
+        )
+        data = res_data["data"]
+        for issue in data["search"]["nodes"]:
+            users.extend([user[1:] for user in re.findall("@[^ ,\n]+", issue["body"])])
+            if issue["author"]:
+                users.append(issue["author"]["login"])
+        has_next_page = data["search"]["pageInfo"]["hasNextPage"]
+        if has_next_page:
+            variables["after"] = data["search"]["pageInfo"]["endCursor"]
     return users
 
 
diff --git a/.github/workflows/containers/github-action-ci/Dockerfile b/.github/workflows/containers/github-action-ci/Dockerfile
index 58355d2..3757e60 100644
--- a/.github/workflows/containers/github-action-ci/Dockerfile
+++ b/.github/workflows/containers/github-action-ci/Dockerfile
@@ -57,6 +57,7 @@ RUN apt-get update && \
     nodejs \
     perl-modules \
     python3-psutil \
+    sudo \
 
     # These are needed by the premerge pipeline. Pip is used to install
     # dependent python packages and ccache is used for build caching. File and
@@ -66,6 +67,16 @@ RUN apt-get update && \
     file \
     tzdata
 
+# Install sccache as it is needed by most of the project test workflows and
+# cannot be installed by the ccache action when executing as a non-root user.
+# TODO(boomanaiden154): This should be switched to being installed with apt
+# once we bump to Ubuntu 24.04.
+RUN curl -L 'https://github.com/mozilla/sccache/releases/download/v0.7.6/sccache-v0.7.6-x86_64-unknown-linux-musl.tar.gz' > /tmp/sccache.tar.gz && \
+    echo "2902a5e44c3342132f07b62e70cca75d9b23252922faf3b924f449808cc1ae58 /tmp/sccache.tar.gz" | sha256sum -c && \
+    tar xzf /tmp/sccache.tar.gz -O --wildcards '*/sccache' > '/usr/local/bin/sccache' && \
+    rm /tmp/sccache.tar.gz && \
+    chmod +x /usr/local/bin/sccache
+
 ENV LLVM_SYSROOT=$LLVM_SYSROOT
 ENV PATH=${LLVM_SYSROOT}/bin:${PATH}
 
@@ -73,5 +84,11 @@ ENV PATH=${LLVM_SYSROOT}/bin:${PATH}
 # permissions issues in some tests. Set the user id to 1001 as that is the
 # user id that Github Actions uses to perform the checkout action.
 RUN useradd gha -u 1001 -m -s /bin/bash
+
+# Also add the user to passwordless sudoers so that we can install software
+# later on without having to rebuild the container.
+RUN adduser gha sudo
+RUN echo '%sudo ALL=(ALL) NOPASSWD:ALL' >> /etc/sudoers
+
 USER gha
 
diff --git a/.github/workflows/new-issues.yml b/.github/workflows/new-issues.yml
index ed15fdb..3cac57e 100644
--- a/.github/workflows/new-issues.yml
+++ b/.github/workflows/new-issues.yml
@@ -15,7 +15,7 @@ jobs:
     steps:
       - uses: llvm/actions/issue-labeler@main
         with:
-          repo-token: ${{ secrets.GITHUB_TOKEN }}
+          repo-token: ${{ secrets.ISSUE_SUBSCRIBER_TOKEN }}
           configuration-path: .github/new-issues-labeler.yml
           include-title: 1
           include-body: 0
diff --git a/.github/workflows/pr-code-format.yml b/.github/workflows/pr-code-format.yml
index f2bb373..0e6180a 100644
--- a/.github/workflows/pr-code-format.yml
+++ b/.github/workflows/pr-code-format.yml
@@ -60,7 +60,7 @@ jobs:
       - name: Install clang-format
         uses: aminya/setup-cpp@v1
         with:
-          clangformat: 18.1.7
+          clangformat: 19.1.6
 
       - name: Setup Python env
         uses: actions/setup-python@v5
diff --git a/.github/workflows/premerge.yaml b/.github/workflows/premerge.yaml
index 7a97628..261dc8b 100644
--- a/.github/workflows/premerge.yaml
+++ b/.github/workflows/premerge.yaml
@@ -31,6 +31,12 @@ jobs:
       - name: Setup ccache
         uses: hendrikmuhs/ccache-action@v1.2.14
       - name: Build and Test
+        # Mark the job as a success even if the step fails so that people do
+        # not get notified while the new premerge pipeline is in an
+        # experimental state.
+        # TODO(boomanaiden154): Remove this once the pipeline is stable and we
+        # are ready for people to start recieving notifications.
+        continue-on-error: true
         run: |
           git config --global --add safe.directory '*'
 
diff --git a/.github/workflows/release-binaries.yml b/.github/workflows/release-binaries.yml
index 1cde628..fc5431c 100644
--- a/.github/workflows/release-binaries.yml
+++ b/.github/workflows/release-binaries.yml
@@ -83,7 +83,7 @@ jobs:
         USER_TOKEN: ${{ secrets.RELEASE_TASKS_USER_TOKEN }}
       shell: bash
       run: |
-        ./llvm/utils/release/./github-upload-release.py --token "$GITHUB_TOKEN" --user ${{ github.actor }} --user-token "$USER_TOKEN" check-permissions
+        ./llvm/utils/release/./github-upload-release.py --token "$GITHUB_TOKEN" --user "$GITHUB_ACTOR" --user-token "$USER_TOKEN" check-permissions
 
     - name: Collect Variables
       id: vars
@@ -102,8 +102,8 @@ jobs:
           release_version="$trimmed"
           ref="llvmorg-$release_version"
         else
-          release_version="${{ (github.event_name == 'pull_request' && format('PR{0}', github.event.pull_request.number)) || 'CI'}}-${{ github.sha }}"
-          ref=${{ github.sha }}
+          release_version="${{ (github.event_name == 'pull_request' && format('PR{0}', github.event.pull_request.number)) || 'CI'}}-$GITHUB_SHA"
+          ref="$GITHUB_SHA"
         fi
         if [ -n "${{ inputs.upload }}" ]; then
           upload="${{ inputs.upload }}"
@@ -114,20 +114,20 @@ jobs:
         echo "ref=$ref" >> $GITHUB_OUTPUT
         echo "upload=$upload" >> $GITHUB_OUTPUT
 
-        release_binary_basename="LLVM-$release_version-${{ runner.os }}-${{ runner.arch }}"
+        release_binary_basename="LLVM-$release_version-$RUNNER_OS-$RUNNER_ARCH"
         echo "release-binary-basename=$release_binary_basename" >> $GITHUB_OUTPUT
         echo "release-binary-filename=$release_binary_basename.tar.xz" >> $GITHUB_OUTPUT
 
         # Detect necessary CMake flags
-        target="${{ runner.os }}-${{ runner.arch }}"
+        target="$RUNNER_OS-$RUNNER_ARCH"
         echo "enable-pgo=false" >> $GITHUB_OUTPUT
         target_cmake_flags="-DLLVM_RELEASE_ENABLE_PGO=OFF"
         # The macOS builds try to cross compile some libraries so we need to
         # add extra CMake args to disable them.
         # See https://github.com/llvm/llvm-project/issues/99767
-        if [ "${{ runner.os }}" = "macOS" ]; then
+        if [ "$RUNNER_OS" = "macOS" ]; then
           target_cmake_flags="$target_cmake_flags -DBOOTSTRAP_COMPILER_RT_ENABLE_IOS=OFF"
-          if [ "${{ runner.arch }}" = "ARM64" ]; then
+          if [ "$RUNNER_ARCH" = "ARM64" ]; then
             arches=arm64
           else
             arches=x86_64
@@ -137,7 +137,7 @@ jobs:
 
         build_flang="true"
 
-        if [ "${{ runner.os }}" = "Windows" ]; then
+        if [ "$RUNNER_OS" = "Windows" ]; then
           # The build times out on Windows, so we need to disable LTO.
           target_cmake_flags="$target_cmake_flags -DLLVM_RELEASE_ENABLE_LTO=OFF"
         fi
diff --git a/bolt/docs/CommandLineArgumentReference.md b/bolt/docs/CommandLineArgumentReference.md
index 91918d6..f3881c9 100644
--- a/bolt/docs/CommandLineArgumentReference.md
+++ b/bolt/docs/CommandLineArgumentReference.md
@@ -931,15 +931,6 @@
 
   Remove redundant Address-Size override prefix
 
-### BOLT options in relocation mode:
-
-- `--align-macro-fusion=<value>`
-
-  Fix instruction alignment for macro-fusion (x86 relocation mode)
-  - `none`: do not insert alignment no-ops for macro-fusion
-  - `hot`: only insert alignment no-ops on hot execution paths (default)
-  - `all`: always align instructions to allow macro-fusion
-
 ### BOLT instrumentation options:
 
 `llvm-bolt <executable> -instrument [-o outputfile] <instrumented-executable>`
diff --git a/bolt/include/bolt/Core/BinaryData.h b/bolt/include/bolt/Core/BinaryData.h
index 6a773c4..4ab6280 100644
--- a/bolt/include/bolt/Core/BinaryData.h
+++ b/bolt/include/bolt/Core/BinaryData.h
@@ -169,6 +169,11 @@ public:
     return Parent && (Parent == BD || Parent->isAncestorOf(BD));
   }
 
+  void updateSize(uint64_t N) {
+    if (N > Size)
+      Size = N;
+  }
+
   void setIsMoveable(bool Flag) { IsMoveable = Flag; }
   void setSection(BinarySection &NewSection);
   void setOutputSection(BinarySection &NewSection) {
diff --git a/bolt/lib/Core/BinaryContext.cpp b/bolt/lib/Core/BinaryContext.cpp
index f88e34b..f5e1135 100644
--- a/bolt/lib/Core/BinaryContext.cpp
+++ b/bolt/lib/Core/BinaryContext.cpp
@@ -1076,6 +1076,7 @@ MCSymbol *BinaryContext::registerNameAtAddress(StringRef Name, uint64_t Address,
     BD = GAI->second;
     if (!BD->hasName(Name)) {
       GlobalSymbols[Name] = BD;
+      BD->updateSize(Size);
       BD->Symbols.push_back(Symbol);
     }
   }
diff --git a/bolt/lib/Core/BinaryEmitter.cpp b/bolt/lib/Core/BinaryEmitter.cpp
index 5019cf3..1aad252 100644
--- a/bolt/lib/Core/BinaryEmitter.cpp
+++ b/bolt/lib/Core/BinaryEmitter.cpp
@@ -46,13 +46,13 @@ BreakFunctionNames("break-funcs",
   cl::Hidden,
   cl::cat(BoltCategory));
 
-cl::list<std::string>
+static cl::list<std::string>
     FunctionPadSpec("pad-funcs", cl::CommaSeparated,
                     cl::desc("list of functions to pad with amount of bytes"),
                     cl::value_desc("func1:pad1,func2:pad2,func3:pad3,..."),
                     cl::Hidden, cl::cat(BoltCategory));
 
-cl::list<std::string> FunctionPadBeforeSpec(
+static cl::list<std::string> FunctionPadBeforeSpec(
     "pad-funcs-before", cl::CommaSeparated,
     cl::desc("list of functions to pad with amount of bytes"),
     cl::value_desc("func1:pad1,func2:pad2,func3:pad3,..."), cl::Hidden,
@@ -74,10 +74,9 @@ X86AlignBranchBoundaryHotOnly("x86-align-branch-boundary-hot-only",
   cl::init(true),
   cl::cat(BoltOptCategory));
 
-size_t padFunction(const cl::list<std::string> &Spec,
+size_t padFunction(std::map<std::string, size_t> &FunctionPadding,
+                   const cl::list<std::string> &Spec,
                    const BinaryFunction &Function) {
-  static std::map<std::string, size_t> FunctionPadding;
-
   if (FunctionPadding.empty() && !Spec.empty()) {
     for (const std::string &Spec : Spec) {
       size_t N = Spec.find(':');
@@ -99,6 +98,15 @@ size_t padFunction(const cl::list<std::string> &Spec,
   return 0;
 }
 
+size_t padFunctionBefore(const BinaryFunction &Function) {
+  static std::map<std::string, size_t> CacheFunctionPadding;
+  return padFunction(CacheFunctionPadding, FunctionPadBeforeSpec, Function);
+}
+size_t padFunctionAfter(const BinaryFunction &Function) {
+  static std::map<std::string, size_t> CacheFunctionPadding;
+  return padFunction(CacheFunctionPadding, FunctionPadSpec, Function);
+}
+
 } // namespace opts
 
 namespace {
@@ -324,8 +332,7 @@ bool BinaryEmitter::emitFunction(BinaryFunction &Function,
     Streamer.emitCodeAlignment(Function.getAlign(), &*BC.STI);
   }
 
-  if (size_t Padding =
-          opts::padFunction(opts::FunctionPadBeforeSpec, Function)) {
+  if (size_t Padding = opts::padFunctionBefore(Function)) {
     // Handle padFuncsBefore after the above alignment logic but before
     // symbol addresses are decided.
     if (!BC.HasRelocations) {
@@ -404,7 +411,7 @@ bool BinaryEmitter::emitFunction(BinaryFunction &Function,
   emitFunctionBody(Function, FF, /*EmitCodeOnly=*/false);
 
   // Emit padding if requested.
-  if (size_t Padding = opts::padFunction(opts::FunctionPadSpec, Function)) {
+  if (size_t Padding = opts::padFunctionAfter(Function)) {
     LLVM_DEBUG(dbgs() << "BOLT-DEBUG: padding function " << Function << " with "
                       << Padding << " bytes\n");
     Streamer.emitFill(Padding, MAI->getTextAlignFillValue());
diff --git a/bolt/lib/Core/CMakeLists.txt b/bolt/lib/Core/CMakeLists.txt
index bb58667..8c1f5d0 100644
--- a/bolt/lib/Core/CMakeLists.txt
+++ b/bolt/lib/Core/CMakeLists.txt
@@ -35,6 +35,7 @@ add_llvm_library(LLVMBOLTCore
   ParallelUtilities.cpp
   Relocation.cpp
 
+  NO_EXPORT
   DISABLE_LLVM_LINK_LLVM_DYLIB
   LINK_LIBS
   ${LLVM_PTHREAD_LIB}
diff --git a/bolt/lib/Passes/CMakeLists.txt b/bolt/lib/Passes/CMakeLists.txt
index 1c1273b..1e32894 100644
--- a/bolt/lib/Passes/CMakeLists.txt
+++ b/bolt/lib/Passes/CMakeLists.txt
@@ -46,6 +46,7 @@ add_llvm_library(LLVMBOLTPasses
   VeneerElimination.cpp
   RetpolineInsertion.cpp
 
+  NO_EXPORT
   DISABLE_LLVM_LINK_LLVM_DYLIB
 
   LINK_LIBS
diff --git a/bolt/lib/Passes/ReorderFunctions.cpp b/bolt/lib/Passes/ReorderFunctions.cpp
index f8f6a01..35c5acf 100644
--- a/bolt/lib/Passes/ReorderFunctions.cpp
+++ b/bolt/lib/Passes/ReorderFunctions.cpp
@@ -28,9 +28,8 @@ extern cl::OptionCategory BoltOptCategory;
 extern cl::opt<unsigned> Verbosity;
 extern cl::opt<uint32_t> RandomSeed;
 
-extern size_t padFunction(const cl::list<std::string> &Spec,
-                          const bolt::BinaryFunction &Function);
-extern cl::list<std::string> FunctionPadSpec, FunctionPadBeforeSpec;
+extern size_t padFunctionBefore(const bolt::BinaryFunction &Function);
+extern size_t padFunctionAfter(const bolt::BinaryFunction &Function);
 
 extern cl::opt<bolt::ReorderFunctions::ReorderType> ReorderFunctions;
 cl::opt<bolt::ReorderFunctions::ReorderType> ReorderFunctions(
@@ -306,12 +305,10 @@ Error ReorderFunctions::runOnFunctions(BinaryContext &BC) {
                           return false;
                         if (B->isIgnored())
                           return true;
-                        const size_t PadA =
-                            opts::padFunction(opts::FunctionPadSpec, *A) +
-                            opts::padFunction(opts::FunctionPadBeforeSpec, *A);
-                        const size_t PadB =
-                            opts::padFunction(opts::FunctionPadSpec, *B) +
-                            opts::padFunction(opts::FunctionPadBeforeSpec, *B);
+                        const size_t PadA = opts::padFunctionBefore(*A) +
+                                            opts::padFunctionAfter(*A);
+                        const size_t PadB = opts::padFunctionBefore(*B) +
+                                            opts::padFunctionAfter(*B);
                         if (!PadA || !PadB) {
                           if (PadA)
                             return true;
diff --git a/bolt/lib/Profile/CMakeLists.txt b/bolt/lib/Profile/CMakeLists.txt
index 9aa4ba0..a2bb4aa 100644
--- a/bolt/lib/Profile/CMakeLists.txt
+++ b/bolt/lib/Profile/CMakeLists.txt
@@ -7,6 +7,7 @@ add_llvm_library(LLVMBOLTProfile
   YAMLProfileReader.cpp
   YAMLProfileWriter.cpp
 
+  NO_EXPORT
   DISABLE_LLVM_LINK_LLVM_DYLIB
 
   LINK_COMPONENTS
diff --git a/bolt/lib/Rewrite/CMakeLists.txt b/bolt/lib/Rewrite/CMakeLists.txt
index 5d11492..c83cf36 100644
--- a/bolt/lib/Rewrite/CMakeLists.txt
+++ b/bolt/lib/Rewrite/CMakeLists.txt
@@ -25,6 +25,7 @@ add_llvm_library(LLVMBOLTRewrite
   RewriteInstance.cpp
   SDTRewriter.cpp
 
+  NO_EXPORT
   DISABLE_LLVM_LINK_LLVM_DYLIB
 
   LINK_LIBS
diff --git a/bolt/lib/Rewrite/LinuxKernelRewriter.cpp b/bolt/lib/Rewrite/LinuxKernelRewriter.cpp
index 0532468..5a5e044 100644
--- a/bolt/lib/Rewrite/LinuxKernelRewriter.cpp
+++ b/bolt/lib/Rewrite/LinuxKernelRewriter.cpp
@@ -21,6 +21,8 @@
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/Errc.h"
+#include "llvm/Support/ErrorOr.h"
+#include <regex>
 
 #define DEBUG_TYPE "bolt-linux"
 
@@ -89,6 +91,34 @@ static cl::opt<bool>
 
 } // namespace opts
 
+/// Linux kernel version
+struct LKVersion {
+  LKVersion() {}
+  LKVersion(unsigned Major, unsigned Minor, unsigned Rev)
+      : Major(Major), Minor(Minor), Rev(Rev) {}
+
+  bool operator<(const LKVersion &Other) const {
+    return std::make_tuple(Major, Minor, Rev) <
+           std::make_tuple(Other.Major, Other.Minor, Other.Rev);
+  }
+
+  bool operator>(const LKVersion &Other) const { return Other < *this; }
+
+  bool operator<=(const LKVersion &Other) const { return !(*this > Other); }
+
+  bool operator>=(const LKVersion &Other) const { return !(*this < Other); }
+
+  bool operator==(const LKVersion &Other) const {
+    return Major == Other.Major && Minor == Other.Minor && Rev == Other.Rev;
+  }
+
+  bool operator!=(const LKVersion &Other) const { return !(*this == Other); }
+
+  unsigned Major{0};
+  unsigned Minor{0};
+  unsigned Rev{0};
+};
+
 /// Linux Kernel supports stack unwinding using ORC (oops rewind capability).
 /// ORC state at every IP can be described by the following data structure.
 struct ORCState {
@@ -148,6 +178,8 @@ public:
 };
 
 class LinuxKernelRewriter final : public MetadataRewriter {
+  LKVersion LinuxKernelVersion;
+
   /// Information required for updating metadata referencing an instruction.
   struct InstructionFixup {
     BinarySection &Section; // Section referencing the instruction.
@@ -249,6 +281,8 @@ class LinuxKernelRewriter final : public MetadataRewriter {
   ErrorOr<BinarySection &> PCIFixupSection = std::errc::bad_address;
   static constexpr size_t PCI_FIXUP_ENTRY_SIZE = 16;
 
+  Error detectLinuxKernelVersion();
+
   /// Process linux kernel special sections and their relocations.
   void processLKSections();
 
@@ -314,6 +348,9 @@ public:
       : MetadataRewriter("linux-kernel-rewriter", BC) {}
 
   Error preCFGInitializer() override {
+    if (Error E = detectLinuxKernelVersion())
+      return E;
+
     processLKSections();
 
     if (Error E = processSMPLocks())
@@ -394,6 +431,28 @@ public:
   }
 };
 
+Error LinuxKernelRewriter::detectLinuxKernelVersion() {
+  if (BinaryData *BD = BC.getBinaryDataByName("linux_banner")) {
+    const BinarySection &Section = BD->getSection();
+    const std::string S =
+        Section.getContents().substr(BD->getOffset(), BD->getSize()).str();
+
+    const std::regex Re(R"---(Linux version ((\d+)\.(\d+)(\.(\d+))?))---");
+    std::smatch Match;
+    if (std::regex_search(S, Match, Re)) {
+      const unsigned Major = std::stoi(Match[2].str());
+      const unsigned Minor = std::stoi(Match[3].str());
+      const unsigned Rev = Match[5].matched ? std::stoi(Match[5].str()) : 0;
+      LinuxKernelVersion = LKVersion(Major, Minor, Rev);
+      BC.outs() << "BOLT-INFO: Linux kernel version is " << Match[1].str()
+                << "\n";
+      return Error::success();
+    }
+  }
+  return createStringError(errc::executable_format_error,
+                           "Linux kernel version is unknown");
+}
+
 void LinuxKernelRewriter::processLKSections() {
   processLKKSymtab();
   processLKKSymtab(true);
diff --git a/bolt/lib/RuntimeLibs/CMakeLists.txt b/bolt/lib/RuntimeLibs/CMakeLists.txt
index d3ac71d..b8db7e4 100644
--- a/bolt/lib/RuntimeLibs/CMakeLists.txt
+++ b/bolt/lib/RuntimeLibs/CMakeLists.txt
@@ -11,6 +11,7 @@ add_llvm_library(LLVMBOLTRuntimeLibs
   HugifyRuntimeLibrary.cpp
   InstrumentationRuntimeLibrary.cpp
 
+  NO_EXPORT
   DISABLE_LLVM_LINK_LLVM_DYLIB
   )
 
diff --git a/bolt/lib/Target/AArch64/CMakeLists.txt b/bolt/lib/Target/AArch64/CMakeLists.txt
index 7e2d33e..8435ea7 100644
--- a/bolt/lib/Target/AArch64/CMakeLists.txt
+++ b/bolt/lib/Target/AArch64/CMakeLists.txt
@@ -19,6 +19,7 @@ endif()
 add_llvm_library(LLVMBOLTTargetAArch64
   AArch64MCPlusBuilder.cpp
 
+  NO_EXPORT
   DISABLE_LLVM_LINK_LLVM_DYLIB
 
   DEPENDS
diff --git a/bolt/lib/Target/RISCV/CMakeLists.txt b/bolt/lib/Target/RISCV/CMakeLists.txt
index 5d19d38..6c3a196f 100644
--- a/bolt/lib/Target/RISCV/CMakeLists.txt
+++ b/bolt/lib/Target/RISCV/CMakeLists.txt
@@ -20,6 +20,7 @@ endif()
 add_llvm_library(LLVMBOLTTargetRISCV
   RISCVMCPlusBuilder.cpp
 
+  NO_EXPORT
   DISABLE_LLVM_LINK_LLVM_DYLIB
 
   DEPENDS
diff --git a/bolt/lib/Target/X86/CMakeLists.txt b/bolt/lib/Target/X86/CMakeLists.txt
index b274716..6d1accb 100644
--- a/bolt/lib/Target/X86/CMakeLists.txt
+++ b/bolt/lib/Target/X86/CMakeLists.txt
@@ -21,6 +21,7 @@ add_llvm_library(LLVMBOLTTargetX86
   X86MCPlusBuilder.cpp
   X86MCSymbolizer.cpp
 
+  NO_EXPORT
   DISABLE_LLVM_LINK_LLVM_DYLIB
 
   DEPENDS
diff --git a/bolt/lib/Utils/CMakeLists.txt b/bolt/lib/Utils/CMakeLists.txt
index c452c1f..efba6d5 100644
--- a/bolt/lib/Utils/CMakeLists.txt
+++ b/bolt/lib/Utils/CMakeLists.txt
@@ -29,6 +29,8 @@ add_llvm_library(LLVMBOLTUtils
   CommandLineOpts.cpp
   Utils.cpp
   ${version_inc}
+
+  NO_EXPORT
   DISABLE_LLVM_LINK_LLVM_DYLIB
 
   LINK_LIBS
diff --git a/bolt/test/AArch64/pad-before-funcs.s b/bolt/test/AArch64/pad-before-funcs.s
index 3ce0ee5..f3e8a23 100644
--- a/bolt/test/AArch64/pad-before-funcs.s
+++ b/bolt/test/AArch64/pad-before-funcs.s
@@ -2,11 +2,18 @@
 # It should be able to introduce a configurable offset for the _start symbol.
 # It should reject requests which don't obey the code alignment requirement.
 
+# Tests check inserting padding before _start; and additionally a test where
+# padding is inserted after start. In each case, check that the following
+# symbol ends up in the expected place as well.
+
+
 # RUN: llvm-mc -filetype=obj -triple aarch64-unknown-unknown %s -o %t.o
 # RUN: %clang %cflags %t.o -o %t.exe -Wl,-q -Wl,--section-start=.text=0x4000
 # RUN: llvm-bolt %t.exe -o %t.bolt.0 --pad-funcs-before=_start:0
 # RUN: llvm-bolt %t.exe -o %t.bolt.4 --pad-funcs-before=_start:4
 # RUN: llvm-bolt %t.exe -o %t.bolt.8 --pad-funcs-before=_start:8
+# RUN: llvm-bolt %t.exe -o %t.bolt.4.4 --pad-funcs-before=_start:4 --pad-funcs=_start:4
+# RUN: llvm-bolt %t.exe -o %t.bolt.4.8 --pad-funcs-before=_start:4 --pad-funcs=_start:8
 
 # RUN: not llvm-bolt %t.exe -o %t.bolt.8 --pad-funcs-before=_start:1 2>&1 | FileCheck --check-prefix=CHECK-BAD-ALIGN %s
 
@@ -15,15 +22,27 @@
 # RUN: llvm-objdump --section=.text --disassemble %t.bolt.0 | FileCheck --check-prefix=CHECK-0 %s
 # RUN: llvm-objdump --section=.text --disassemble %t.bolt.4 | FileCheck --check-prefix=CHECK-4 %s
 # RUN: llvm-objdump --section=.text --disassemble %t.bolt.8 | FileCheck --check-prefix=CHECK-8 %s
+# RUN: llvm-objdump --section=.text --disassemble %t.bolt.4.4 | FileCheck --check-prefix=CHECK-4-4 %s
+# RUN: llvm-objdump --section=.text --disassemble %t.bolt.4.8 | FileCheck --check-prefix=CHECK-4-8 %s
 
 # Trigger relocation mode in bolt.
 .reloc 0, R_AARCH64_NONE
 
 .section .text
-.globl _start
 
 # CHECK-0: 0000000000400000 <_start>
 # CHECK-4: 0000000000400004 <_start>
+# CHECK-4-4: 0000000000400004 <_start>
 # CHECK-8: 0000000000400008 <_start>
+.globl _start
 _start:
     ret
+
+# CHECK-0: 0000000000400004 <_subsequent>
+# CHECK-4: 0000000000400008 <_subsequent>
+# CHECK-4-4: 000000000040000c <_subsequent>
+# CHECK-4-8: 0000000000400010 <_subsequent>
+# CHECK-8: 000000000040000c <_subsequent>
+.globl _subsequent
+_subsequent:
+    ret
diff --git a/bolt/test/X86/linux-alt-instruction.s b/bolt/test/X86/linux-alt-instruction.s
index fe3abbf..83d2cd0 100644
--- a/bolt/test/X86/linux-alt-instruction.s
+++ b/bolt/test/X86/linux-alt-instruction.s
@@ -142,6 +142,15 @@ _start:
   .section .orc_unwind_ip
   .long .L0 + 2 - .
 
+## Linux kernel version
+  .rodata
+  .align 16
+  .globl linux_banner
+  .type  linux_banner, @object
+linux_banner:
+  .string  "Linux version 6.6.61\n"
+  .size  linux_banner, . - linux_banner
+
 ## Fake Linux Kernel sections.
   .section __ksymtab,"a",@progbits
   .section __ksymtab_gpl,"a",@progbits
diff --git a/bolt/test/X86/linux-bug-table.s b/bolt/test/X86/linux-bug-table.s
index 07a4729..2965daa 100644
--- a/bolt/test/X86/linux-bug-table.s
+++ b/bolt/test/X86/linux-bug-table.s
@@ -56,6 +56,15 @@ _start:
   .long .L1 - .  # instruction
   .org 2b + 12
 
+## Linux kernel version
+  .rodata
+  .align 16
+  .globl linux_banner
+  .type  linux_banner, @object
+linux_banner:
+  .string  "Linux version 6.6.61\n"
+  .size  linux_banner, . - linux_banner
+
 ## Fake Linux Kernel sections.
   .section __ksymtab,"a",@progbits
   .section __ksymtab_gpl,"a",@progbits
diff --git a/bolt/test/X86/linux-exceptions.s b/bolt/test/X86/linux-exceptions.s
index 20b8c96..b0e7641 100644
--- a/bolt/test/X86/linux-exceptions.s
+++ b/bolt/test/X86/linux-exceptions.s
@@ -59,6 +59,15 @@ foo:
   .long .LF0 - . # fixup
   .long 0        # data
 
+## Linux kernel version
+  .rodata
+  .align 16
+  .globl linux_banner
+  .type  linux_banner, @object
+linux_banner:
+  .string  "Linux version 6.6.61\n"
+  .size  linux_banner, . - linux_banner
+
 ## Fake Linux Kernel sections.
   .section __ksymtab,"a",@progbits
   .section __ksymtab_gpl,"a",@progbits
diff --git a/bolt/test/X86/linux-orc.s b/bolt/test/X86/linux-orc.s
index 1b0e681..133b0df 100644
--- a/bolt/test/X86/linux-orc.s
+++ b/bolt/test/X86/linux-orc.s
@@ -157,6 +157,15 @@ bar:
   .section .orc_unwind_ip
   .long .L4 - .
 
+## Linux kernel version
+  .rodata
+  .align 16
+  .globl linux_banner
+  .type  linux_banner, @object
+linux_banner:
+  .string  "Linux version 6.6.61\n"
+  .size  linux_banner, . - linux_banner
+
 ## Fake Linux Kernel sections.
   .section __ksymtab,"a",@progbits
   .section __ksymtab_gpl,"a",@progbits
diff --git a/bolt/test/X86/linux-parainstructions.s b/bolt/test/X86/linux-parainstructions.s
index 07fca6b..facfcb16 100644
--- a/bolt/test/X86/linux-parainstructions.s
+++ b/bolt/test/X86/linux-parainstructions.s
@@ -49,6 +49,15 @@ _start:
   .byte 1        # type
   .byte 7        # length
 
+## Linux kernel version
+  .rodata
+  .align 16
+  .globl linux_banner
+  .type  linux_banner, @object
+linux_banner:
+  .string  "Linux version 6.6.61\n"
+  .size  linux_banner, . - linux_banner
+
 ## Fake Linux Kernel sections.
   .section __ksymtab,"a",@progbits
   .section __ksymtab_gpl,"a",@progbits
diff --git a/bolt/test/X86/linux-pci-fixup.s b/bolt/test/X86/linux-pci-fixup.s
index 42504c1..d8df91a 100644
--- a/bolt/test/X86/linux-pci-fixup.s
+++ b/bolt/test/X86/linux-pci-fixup.s
@@ -36,6 +36,15 @@ _start:
   .long 0x0         # class shift
   .long .L0 - .     # fixup
 
+## Linux kernel version
+  .rodata
+  .align 16
+  .globl linux_banner
+  .type  linux_banner, @object
+linux_banner:
+  .string  "Linux version 6.6.61\n"
+  .size  linux_banner, . - linux_banner
+
 ## Fake Linux Kernel sections.
   .section __ksymtab,"a",@progbits
   .section __ksymtab_gpl,"a",@progbits
diff --git a/bolt/test/X86/linux-smp-locks.s b/bolt/test/X86/linux-smp-locks.s
index 50d9e63..2fc136f 100644
--- a/bolt/test/X86/linux-smp-locks.s
+++ b/bolt/test/X86/linux-smp-locks.s
@@ -35,6 +35,15 @@ _start:
   .long .L0 - .
   .long .L1 - .
 
+## Linux kernel version
+  .rodata
+  .align 16
+  .globl linux_banner
+  .type  linux_banner, @object
+linux_banner:
+  .string  "Linux version 6.6.61\n"
+  .size  linux_banner, . - linux_banner
+
 ## Fake Linux Kernel sections.
   .section __ksymtab,"a",@progbits
   .section __ksymtab_gpl,"a",@progbits
diff --git a/bolt/test/X86/linux-static-calls.s b/bolt/test/X86/linux-static-calls.s
index ce90f4b..758e139 100644
--- a/bolt/test/X86/linux-static-calls.s
+++ b/bolt/test/X86/linux-static-calls.s
@@ -54,6 +54,15 @@ __start_static_call_sites:
   .type __stop_static_call_sites, %object
 __stop_static_call_sites:
 
+## Linux kernel version
+  .rodata
+  .align 16
+  .globl linux_banner
+  .type linux_banner, @object
+linux_banner:
+  .string "Linux version 6.6.61\n"
+  .size linux_banner, . - linux_banner
+
 ## Fake Linux Kernel sections.
   .section __ksymtab,"a",@progbits
   .section __ksymtab_gpl,"a",@progbits
diff --git a/bolt/test/X86/linux-static-keys.s b/bolt/test/X86/linux-static-keys.s
index d34dd64..2e4457e 100644
--- a/bolt/test/X86/linux-static-keys.s
+++ b/bolt/test/X86/linux-static-keys.s
@@ -85,6 +85,15 @@ __stop___jump_table:
 fake_static_key:
   .quad 0
 
+## Linux kernel version
+  .rodata
+  .align 16
+  .globl linux_banner
+  .type  linux_banner, @object
+linux_banner:
+  .string  "Linux version 6.6.61\n"
+  .size  linux_banner, . - linux_banner
+
 ## Fake Linux Kernel sections.
   .section __ksymtab,"a",@progbits
   .section __ksymtab_gpl,"a",@progbits
diff --git a/bolt/test/X86/linux-version.S b/bolt/test/X86/linux-version.S
new file mode 100644
index 0000000..e680d0d
--- /dev/null
+++ b/bolt/test/X86/linux-version.S
@@ -0,0 +1,53 @@
+# REQUIRES: system-linux
+
+## Check that BOLT correctly detects the Linux kernel version
+
+# RUN: %clang -DA -target x86_64-unknown-unknown \
+# RUN:   %cflags -nostdlib %s -o %t.exe \
+# RUN:   -Wl,--image-base=0xffffffff80000000,--no-dynamic-linker,--no-eh-frame-hdr
+# RUN: llvm-bolt %t.exe -o %t.out 2>&1 | FileCheck --check-prefix=CHECK-A %s
+
+# RUN: %clang -DB -target x86_64-unknown-unknown \
+# RUN:   %cflags -nostdlib %s -o %t.exe \
+# RUN:   -Wl,--image-base=0xffffffff80000000,--no-dynamic-linker,--no-eh-frame-hdr
+# RUN: llvm-bolt %t.exe -o %t.out 2>&1 | FileCheck --check-prefix=CHECK-B %s
+
+# RUN: %clang -DC -target x86_64-unknown-unknown \
+# RUN:   %cflags -nostdlib %s -o %t.exe \
+# RUN:   -Wl,--image-base=0xffffffff80000000,--no-dynamic-linker,--no-eh-frame-hdr
+# RUN: llvm-bolt %t.exe -o %t.out 2>&1 | FileCheck --check-prefix=CHECK-C %s
+
+  .text
+  .globl foo
+  .type foo, %function
+foo:
+  ret
+  .size foo, .-foo
+
+## Linux kernel version
+  .rodata
+  .align 16
+  .globl linux_banner
+  .type  linux_banner, @object
+linux_banner:
+
+#ifdef A
+  .string  "Linux version 6.6.61\n"
+#endif
+# CHECK-A: BOLT-INFO: Linux kernel version is 6.6.61
+
+#ifdef B
+  .string  "Linux version 6.6.50-rc4\n"
+#endif
+# CHECK-B: BOLT-INFO: Linux kernel version is 6.6.50
+
+#ifdef C
+  .string  "Linux version 6.6\n"
+#endif
+# CHECK-C: BOLT-INFO: Linux kernel version is 6.6
+
+  .size  linux_banner, . - linux_banner
+
+## Fake Linux Kernel sections.
+  .section __ksymtab,"a",@progbits
+  .section __ksymtab_gpl,"a",@progbits
diff --git a/clang-tools-extra/clang-tidy/ClangTidyCheck.cpp b/clang-tools-extra/clang-tidy/ClangTidyCheck.cpp
index 6028bb2..4aa9fe2 100644
--- a/clang-tools-extra/clang-tidy/ClangTidyCheck.cpp
+++ b/clang-tools-extra/clang-tidy/ClangTidyCheck.cpp
@@ -7,11 +7,11 @@
 //===----------------------------------------------------------------------===//
 
 #include "ClangTidyCheck.h"
-#include "llvm/ADT/SmallString.h"
 #include "llvm/ADT/StringRef.h"
-#include "llvm/Support/Error.h"
+#include "llvm/ADT/StringSet.h"
 #include "llvm/Support/YAMLParser.h"
 #include <optional>
+#include <string>
 
 namespace clang::tidy {
 
@@ -62,16 +62,29 @@ ClangTidyCheck::OptionsView::get(StringRef LocalName) const {
   return std::nullopt;
 }
 
+static const llvm::StringSet<> DeprecatedGlobalOptions{
+    "StrictMode",
+    "IgnoreMacros",
+};
+
 static ClangTidyOptions::OptionMap::const_iterator
 findPriorityOption(const ClangTidyOptions::OptionMap &Options,
                    StringRef NamePrefix, StringRef LocalName,
-                   llvm::StringSet<> *Collector) {
+                   ClangTidyContext *Context) {
+  llvm::StringSet<> *Collector = Context->getOptionsCollector();
   if (Collector) {
     Collector->insert((NamePrefix + LocalName).str());
     Collector->insert(LocalName);
   }
   auto IterLocal = Options.find((NamePrefix + LocalName).str());
   auto IterGlobal = Options.find(LocalName);
+  // FIXME: temporary solution for deprecation warnings, should be removed
+  // after 22.x. Warn configuration deps on deprecation global options.
+  if (IterLocal == Options.end() && IterGlobal != Options.end() &&
+      DeprecatedGlobalOptions.contains(LocalName))
+    Context->configurationDiag(
+        "global option '%0' is deprecated, please use '%1%0' instead.")
+        << LocalName << NamePrefix;
   if (IterLocal == Options.end())
     return IterGlobal;
   if (IterGlobal == Options.end())
@@ -83,8 +96,7 @@ findPriorityOption(const ClangTidyOptions::OptionMap &Options,
 
 std::optional<StringRef>
 ClangTidyCheck::OptionsView::getLocalOrGlobal(StringRef LocalName) const {
-  auto Iter = findPriorityOption(CheckOptions, NamePrefix, LocalName,
-                                 Context->getOptionsCollector());
+  auto Iter = findPriorityOption(CheckOptions, NamePrefix, LocalName, Context);
   if (Iter != CheckOptions.end())
     return StringRef(Iter->getValue().Value);
   return std::nullopt;
@@ -117,8 +129,7 @@ ClangTidyCheck::OptionsView::get<bool>(StringRef LocalName) const {
 template <>
 std::optional<bool>
 ClangTidyCheck::OptionsView::getLocalOrGlobal<bool>(StringRef LocalName) const {
-  auto Iter = findPriorityOption(CheckOptions, NamePrefix, LocalName,
-                                 Context->getOptionsCollector());
+  auto Iter = findPriorityOption(CheckOptions, NamePrefix, LocalName, Context);
   if (Iter != CheckOptions.end()) {
     if (auto Result = getAsBool(Iter->getValue().Value, Iter->getKey()))
       return Result;
@@ -157,10 +168,9 @@ std::optional<int64_t> ClangTidyCheck::OptionsView::getEnumInt(
     bool IgnoreCase) const {
   if (!CheckGlobal && Context->getOptionsCollector())
     Context->getOptionsCollector()->insert((NamePrefix + LocalName).str());
-  auto Iter = CheckGlobal
-                  ? findPriorityOption(CheckOptions, NamePrefix, LocalName,
-                                       Context->getOptionsCollector())
-                  : CheckOptions.find((NamePrefix + LocalName).str());
+  auto Iter = CheckGlobal ? findPriorityOption(CheckOptions, NamePrefix,
+                                               LocalName, Context)
+                          : CheckOptions.find((NamePrefix + LocalName).str());
   if (Iter == CheckOptions.end())
     return std::nullopt;
 
diff --git a/clang-tools-extra/clang-tidy/bugprone/BugproneTidyModule.cpp b/clang-tools-extra/clang-tidy/bugprone/BugproneTidyModule.cpp
index 33ac65e..b27616f 100644
--- a/clang-tools-extra/clang-tidy/bugprone/BugproneTidyModule.cpp
+++ b/clang-tools-extra/clang-tidy/bugprone/BugproneTidyModule.cpp
@@ -9,7 +9,6 @@
 #include "../ClangTidy.h"
 #include "../ClangTidyModule.h"
 #include "../ClangTidyModuleRegistry.h"
-#include "../cppcoreguidelines/NarrowingConversionsCheck.h"
 #include "ArgumentCommentCheck.h"
 #include "AssertSideEffectCheck.h"
 #include "AssignmentInIfConditionCheck.h"
@@ -47,6 +46,7 @@
 #include "MultiLevelImplicitPointerConversionCheck.h"
 #include "MultipleNewInOneExpressionCheck.h"
 #include "MultipleStatementMacroCheck.h"
+#include "NarrowingConversionsCheck.h"
 #include "NoEscapeCheck.h"
 #include "NonZeroEnumToBoolConversionCheck.h"
 #include "NondeterministicPointerIterationOrderCheck.h"
@@ -183,7 +183,7 @@ public:
         "bugprone-pointer-arithmetic-on-polymorphic-object");
     CheckFactories.registerCheck<RedundantBranchConditionCheck>(
         "bugprone-redundant-branch-condition");
-    CheckFactories.registerCheck<cppcoreguidelines::NarrowingConversionsCheck>(
+    CheckFactories.registerCheck<NarrowingConversionsCheck>(
         "bugprone-narrowing-conversions");
     CheckFactories.registerCheck<NoEscapeCheck>("bugprone-no-escape");
     CheckFactories.registerCheck<NonZeroEnumToBoolConversionCheck>(
diff --git a/clang-tools-extra/clang-tidy/bugprone/CMakeLists.txt b/clang-tools-extra/clang-tidy/bugprone/CMakeLists.txt
index 13adad7..8bd5646 100644
--- a/clang-tools-extra/clang-tidy/bugprone/CMakeLists.txt
+++ b/clang-tools-extra/clang-tidy/bugprone/CMakeLists.txt
@@ -42,6 +42,7 @@ add_clang_library(clangTidyBugproneModule STATIC
   MultiLevelImplicitPointerConversionCheck.cpp
   MultipleNewInOneExpressionCheck.cpp
   MultipleStatementMacroCheck.cpp
+  NarrowingConversionsCheck.cpp
   NoEscapeCheck.cpp
   NonZeroEnumToBoolConversionCheck.cpp
   NondeterministicPointerIterationOrderCheck.cpp
@@ -95,7 +96,6 @@ add_clang_library(clangTidyBugproneModule STATIC
 
   LINK_LIBS
   clangTidy
-  clangTidyCppCoreGuidelinesModule
   clangTidyUtils
 
   DEPENDS
diff --git a/clang-tools-extra/clang-tidy/cppcoreguidelines/NarrowingConversionsCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/NarrowingConversionsCheck.cpp
index 45fef94..a950704 100644
--- a/clang-tools-extra/clang-tidy/cppcoreguidelines/NarrowingConversionsCheck.cpp
+++ b/clang-tools-extra/clang-tidy/bugprone/NarrowingConversionsCheck.cpp
@@ -22,7 +22,7 @@
 
 using namespace clang::ast_matchers;
 
-namespace clang::tidy::cppcoreguidelines {
+namespace clang::tidy::bugprone {
 
 namespace {
 
@@ -614,4 +614,4 @@ void NarrowingConversionsCheck::check(const MatchFinder::MatchResult &Result) {
     return handleImplicitCast(*Result.Context, *Cast);
   llvm_unreachable("must be binary operator or cast expression");
 }
-} // namespace clang::tidy::cppcoreguidelines
+} // namespace clang::tidy::bugprone
diff --git a/clang-tools-extra/clang-tidy/cppcoreguidelines/NarrowingConversionsCheck.h b/clang-tools-extra/clang-tidy/bugprone/NarrowingConversionsCheck.h
index 1add40b..20403f9 100644
--- a/clang-tools-extra/clang-tidy/cppcoreguidelines/NarrowingConversionsCheck.h
+++ b/clang-tools-extra/clang-tidy/bugprone/NarrowingConversionsCheck.h
@@ -6,19 +6,19 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CPPCOREGUIDELINES_NARROWING_CONVERSIONS_H
-#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CPPCOREGUIDELINES_NARROWING_CONVERSIONS_H
+#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_NARROWING_CONVERSIONS_H
+#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_NARROWING_CONVERSIONS_H
 
 #include "../ClangTidyCheck.h"
 
-namespace clang::tidy::cppcoreguidelines {
+namespace clang::tidy::bugprone {
 
 /// Checks for narrowing conversions, e.g:
 ///   int i = 0;
 ///   i += 0.1;
 ///
 /// For the user-facing documentation see:
-/// http://clang.llvm.org/extra/clang-tidy/checks/cppcoreguidelines/narrowing-conversions.html
+/// http://clang.llvm.org/extra/clang-tidy/checks/bugprone/narrowing-conversions.html
 class NarrowingConversionsCheck : public ClangTidyCheck {
 public:
   NarrowingConversionsCheck(StringRef Name, ClangTidyContext *Context);
@@ -104,6 +104,6 @@ private:
   const bool PedanticMode;
 };
 
-} // namespace clang::tidy::cppcoreguidelines
+} // namespace clang::tidy::bugprone
 
-#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CPPCOREGUIDELINES_NARROWING_CONVERSIONS_H
+#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_NARROWING_CONVERSIONS_H
diff --git a/clang-tools-extra/clang-tidy/bugprone/UnhandledSelfAssignmentCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/UnhandledSelfAssignmentCheck.cpp
index 8121a36..1f432c4 100644
--- a/clang-tools-extra/clang-tidy/bugprone/UnhandledSelfAssignmentCheck.cpp
+++ b/clang-tools-extra/clang-tidy/bugprone/UnhandledSelfAssignmentCheck.cpp
@@ -74,9 +74,11 @@ void UnhandledSelfAssignmentCheck::registerMatchers(MatchFinder *Finder) {
     // Matcher for standard smart pointers.
     const auto SmartPointerType = qualType(hasUnqualifiedDesugaredType(
         recordType(hasDeclaration(classTemplateSpecializationDecl(
-            hasAnyName("::std::shared_ptr", "::std::unique_ptr",
-                       "::std::weak_ptr", "::std::auto_ptr"),
-            templateArgumentCountIs(1))))));
+            anyOf(allOf(hasAnyName("::std::shared_ptr", "::std::weak_ptr",
+                                   "::std::auto_ptr"),
+                        templateArgumentCountIs(1)),
+                  allOf(hasName("::std::unique_ptr"),
+                        templateArgumentCountIs(2))))))));
 
     // We will warn only if the class has a pointer or a C array field which
     // probably causes a problem during self-assignment (e.g. first resetting
diff --git a/clang-tools-extra/clang-tidy/bugprone/UnusedLocalNonTrivialVariableCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/UnusedLocalNonTrivialVariableCheck.cpp
index 37baae7..f15fd4d 100644
--- a/clang-tools-extra/clang-tidy/bugprone/UnusedLocalNonTrivialVariableCheck.cpp
+++ b/clang-tools-extra/clang-tidy/bugprone/UnusedLocalNonTrivialVariableCheck.cpp
@@ -29,6 +29,12 @@ static constexpr StringRef DefaultIncludeTypeRegex =
 
 AST_MATCHER(VarDecl, isLocalVarDecl) { return Node.isLocalVarDecl(); }
 AST_MATCHER(VarDecl, isReferenced) { return Node.isReferenced(); }
+AST_MATCHER_P(VarDecl, explicitMarkUnused, LangOptions, LangOpts) {
+  // Implementations should not emit a warning that a name-independent
+  // declaration is used or unused.
+  return Node.hasAttr<UnusedAttr>() ||
+         (LangOpts.CPlusPlus26 && Node.isPlaceholderVar(LangOpts));
+}
 AST_MATCHER(Type, isReferenceType) { return Node.isReferenceType(); }
 AST_MATCHER(QualType, isTrivial) {
   return Node.isTrivialType(Finder->getASTContext()) ||
@@ -60,7 +66,7 @@ void UnusedLocalNonTrivialVariableCheck::registerMatchers(MatchFinder *Finder) {
       varDecl(isLocalVarDecl(), unless(isReferenced()),
               unless(isExceptionVariable()), hasLocalStorage(), isDefinition(),
               unless(hasType(isReferenceType())), unless(hasType(isTrivial())),
-              unless(hasAttr(attr::Kind::Unused)),
+              unless(explicitMarkUnused(getLangOpts())),
               hasType(hasUnqualifiedDesugaredType(
                   anyOf(recordType(hasDeclaration(namedDecl(
                             matchesAnyListedName(IncludeTypes),
diff --git a/clang-tools-extra/clang-tidy/cppcoreguidelines/CMakeLists.txt b/clang-tools-extra/clang-tidy/cppcoreguidelines/CMakeLists.txt
index 07bb89e..b023f76 100644
--- a/clang-tools-extra/clang-tidy/cppcoreguidelines/CMakeLists.txt
+++ b/clang-tools-extra/clang-tidy/cppcoreguidelines/CMakeLists.txt
@@ -16,7 +16,6 @@ add_clang_library(clangTidyCppCoreGuidelinesModule STATIC
   MacroUsageCheck.cpp
   MisleadingCaptureDefaultByValueCheck.cpp
   MissingStdForwardCheck.cpp
-  NarrowingConversionsCheck.cpp
   NoMallocCheck.cpp
   NoSuspendWithLockCheck.cpp
   OwningMemoryCheck.cpp
@@ -38,6 +37,7 @@ add_clang_library(clangTidyCppCoreGuidelinesModule STATIC
 
   LINK_LIBS
   clangTidy
+  clangTidyBugproneModule
   clangTidyMiscModule
   clangTidyModernizeModule
   clangTidyPerformanceModule
diff --git a/clang-tools-extra/clang-tidy/cppcoreguidelines/CppCoreGuidelinesTidyModule.cpp b/clang-tools-extra/clang-tidy/cppcoreguidelines/CppCoreGuidelinesTidyModule.cpp
index e9f020161..6adef04 100644
--- a/clang-tools-extra/clang-tidy/cppcoreguidelines/CppCoreGuidelinesTidyModule.cpp
+++ b/clang-tools-extra/clang-tidy/cppcoreguidelines/CppCoreGuidelinesTidyModule.cpp
@@ -9,6 +9,7 @@
 #include "../ClangTidy.h"
 #include "../ClangTidyModule.h"
 #include "../ClangTidyModuleRegistry.h"
+#include "../bugprone/NarrowingConversionsCheck.h"
 #include "../misc/NonPrivateMemberVariablesInClassesCheck.h"
 #include "../misc/UnconventionalAssignOperatorCheck.h"
 #include "../modernize/AvoidCArraysCheck.h"
@@ -30,7 +31,6 @@
 #include "MacroUsageCheck.h"
 #include "MisleadingCaptureDefaultByValueCheck.h"
 #include "MissingStdForwardCheck.h"
-#include "NarrowingConversionsCheck.h"
 #include "NoMallocCheck.h"
 #include "NoSuspendWithLockCheck.h"
 #include "OwningMemoryCheck.h"
@@ -87,7 +87,7 @@ public:
         "cppcoreguidelines-misleading-capture-default-by-value");
     CheckFactories.registerCheck<MissingStdForwardCheck>(
         "cppcoreguidelines-missing-std-forward");
-    CheckFactories.registerCheck<NarrowingConversionsCheck>(
+    CheckFactories.registerCheck<bugprone::NarrowingConversionsCheck>(
         "cppcoreguidelines-narrowing-conversions");
     CheckFactories.registerCheck<NoMallocCheck>("cppcoreguidelines-no-malloc");
     CheckFactories.registerCheck<NoSuspendWithLockCheck>(
diff --git a/clang-tools-extra/clang-tidy/tool/CMakeLists.txt b/clang-tools-extra/clang-tidy/tool/CMakeLists.txt
index 81fba3b..0d4501d 100644
--- a/clang-tools-extra/clang-tidy/tool/CMakeLists.txt
+++ b/clang-tools-extra/clang-tidy/tool/CMakeLists.txt
@@ -3,6 +3,7 @@ set(LLVM_LINK_COMPONENTS
   AllTargetsDescs
   AllTargetsInfos
   FrontendOpenMP
+  TargetParser
   support
   )
 
diff --git a/clang-tools-extra/clang-tidy/tool/ClangTidyMain.cpp b/clang-tools-extra/clang-tidy/tool/ClangTidyMain.cpp
index b8d843c..fa8887e 100644
--- a/clang-tools-extra/clang-tidy/tool/ClangTidyMain.cpp
+++ b/clang-tools-extra/clang-tidy/tool/ClangTidyMain.cpp
@@ -20,12 +20,14 @@
 #include "../GlobList.h"
 #include "clang/Tooling/CommonOptionsParser.h"
 #include "llvm/ADT/StringSet.h"
+#include "llvm/Support/CommandLine.h"
 #include "llvm/Support/InitLLVM.h"
 #include "llvm/Support/PluginLoader.h"
 #include "llvm/Support/Process.h"
 #include "llvm/Support/Signals.h"
 #include "llvm/Support/TargetSelect.h"
 #include "llvm/Support/WithColor.h"
+#include "llvm/TargetParser/Host.h"
 #include <optional>
 
 using namespace clang::tooling;
@@ -36,6 +38,11 @@ static cl::desc desc(StringRef description) { return {description.ltrim()}; }
 static cl::OptionCategory ClangTidyCategory("clang-tidy options");
 
 static cl::extrahelp CommonHelp(CommonOptionsParser::HelpMessage);
+static cl::extrahelp ClangTidyParameterFileHelp(R"(
+Parameters files:
+  A large number of options or source files can be passed as parameter files
+  by use '@parameter-file' in the command line.
+)");
 static cl::extrahelp ClangTidyHelp(R"(
 Configuration files:
   clang-tidy attempts to read configuration for each source file from a
@@ -54,12 +61,12 @@ Configuration files:
                                  globs can be specified as a list instead of a
                                  string.
   ExcludeHeaderFilterRegex     - Same as '--exclude-header-filter'.
-  ExtraArgs                    - Same as '--extra-args'.
-  ExtraArgsBefore              - Same as '--extra-args-before'.
+  ExtraArgs                    - Same as '--extra-arg'.
+  ExtraArgsBefore              - Same as '--extra-arg-before'.
   FormatStyle                  - Same as '--format-style'.
   HeaderFileExtensions         - File extensions to consider to determine if a
                                  given diagnostic is located in a header file.
-  HeaderFilterRegex            - Same as '--header-filter-regex'.
+  HeaderFilterRegex            - Same as '--header-filter'.
   ImplementationFileExtensions - File extensions to consider to determine if a
                                  given diagnostic is located in an
                                  implementation file.
@@ -571,6 +578,21 @@ static llvm::IntrusiveRefCntPtr<vfs::OverlayFileSystem> createBaseFS() {
 
 int clangTidyMain(int argc, const char **argv) {
   llvm::InitLLVM X(argc, argv);
+  SmallVector<const char *> Args{argv, argv + argc};
+
+  // expand parameters file to argc and argv.
+  llvm::BumpPtrAllocator Alloc;
+  llvm::cl::TokenizerCallback Tokenizer =
+      llvm::Triple(llvm::sys::getProcessTriple()).isOSWindows()
+          ? llvm::cl::TokenizeWindowsCommandLine
+          : llvm::cl::TokenizeGNUCommandLine;
+  llvm::cl::ExpansionContext ECtx(Alloc, Tokenizer);
+  if (llvm::Error Err = ECtx.expandResponseFiles(Args)) {
+    llvm::WithColor::error() << llvm::toString(std::move(Err)) << "\n";
+    return 1;
+  }
+  argc = static_cast<int>(Args.size());
+  argv = Args.data();
 
   // Enable help for -load option, if plugins are enabled.
   if (cl::Option *LoadOpt = cl::getRegisteredOptions().lookup("load"))
diff --git a/clang-tools-extra/clangd/CodeComplete.cpp b/clang-tools-extra/clangd/CodeComplete.cpp
index 2c2d5f0..fb39b7b 100644
--- a/clang-tools-extra/clangd/CodeComplete.cpp
+++ b/clang-tools-extra/clangd/CodeComplete.cpp
@@ -807,8 +807,8 @@ SpecifiedScope getQueryScopes(CodeCompletionContext &CCContext,
   llvm::StringRef SpelledSpecifier = Lexer::getSourceText(
       CharSourceRange::getCharRange(SemaSpecifier->getRange()),
       CCSema.SourceMgr, clang::LangOptions());
-  if (SpelledSpecifier.consume_front("::")) 
-      Scopes.QueryScopes = {""};
+  if (SpelledSpecifier.consume_front("::"))
+    Scopes.QueryScopes = {""};
   Scopes.UnresolvedQualifier = std::string(SpelledSpecifier);
   // Sema excludes the trailing "::".
   if (!Scopes.UnresolvedQualifier->empty())
@@ -1604,7 +1604,7 @@ class CodeCompleteFlow {
   CompletionPrefix HeuristicPrefix;
   std::optional<FuzzyMatcher> Filter; // Initialized once Sema runs.
   Range ReplacedRange;
-  std::vector<std::string> QueryScopes; // Initialized once Sema runs.
+  std::vector<std::string> QueryScopes;      // Initialized once Sema runs.
   std::vector<std::string> AccessibleScopes; // Initialized once Sema runs.
   // Initialized once QueryScopes is initialized, if there are scopes.
   std::optional<ScopeDistance> ScopeProximity;
@@ -1663,7 +1663,9 @@ public:
       Inserter.emplace(
           SemaCCInput.FileName, SemaCCInput.ParseInput.Contents, Style,
           SemaCCInput.ParseInput.CompileCommand.Directory,
-          &Recorder->CCSema->getPreprocessor().getHeaderSearchInfo());
+          &Recorder->CCSema->getPreprocessor().getHeaderSearchInfo(),
+          Config::current().Style.QuotedHeaders,
+          Config::current().Style.AngledHeaders);
       for (const auto &Inc : Includes.MainFileIncludes)
         Inserter->addExisting(Inc);
 
@@ -1746,7 +1748,9 @@ public:
     auto Style = getFormatStyleForFile(FileName, Content, TFS, false);
     // This will only insert verbatim headers.
     Inserter.emplace(FileName, Content, Style,
-                     /*BuildDir=*/"", /*HeaderSearchInfo=*/nullptr);
+                     /*BuildDir=*/"", /*HeaderSearchInfo=*/nullptr,
+                     Config::current().Style.QuotedHeaders,
+                     Config::current().Style.AngledHeaders);
 
     auto Identifiers = collectIdentifiers(Content, Style);
     std::vector<RawIdentifier> IdentifierResults;
diff --git a/clang-tools-extra/clangd/Config.h b/clang-tools-extra/clangd/Config.h
index e174f7f..586d031 100644
--- a/clang-tools-extra/clangd/Config.h
+++ b/clang-tools-extra/clangd/Config.h
@@ -124,6 +124,10 @@ struct Config {
     // declarations, always spell out the whole name (with or without leading
     // ::). All nested namespaces are affected as well.
     std::vector<std::string> FullyQualifiedNamespaces;
+
+    // List of matcher functions for inserting certain headers with <> or "".
+    std::vector<std::function<bool(llvm::StringRef)>> QuotedHeaders;
+    std::vector<std::function<bool(llvm::StringRef)>> AngledHeaders;
   } Style;
 
   /// controls the completion options for argument lists.
diff --git a/clang-tools-extra/clangd/ConfigCompile.cpp b/clang-tools-extra/clangd/ConfigCompile.cpp
index fb76929..aa2561e 100644
--- a/clang-tools-extra/clangd/ConfigCompile.cpp
+++ b/clang-tools-extra/clangd/ConfigCompile.cpp
@@ -482,6 +482,55 @@ struct FragmentCompiler {
             FullyQualifiedNamespaces.begin(), FullyQualifiedNamespaces.end());
       });
     }
+    auto QuotedFilter = compileHeaderRegexes(F.QuotedHeaders);
+    if (QuotedFilter.has_value()) {
+      Out.Apply.push_back(
+          [QuotedFilter = *QuotedFilter](const Params &, Config &C) {
+            C.Style.QuotedHeaders.emplace_back(QuotedFilter);
+          });
+    }
+    auto AngledFilter = compileHeaderRegexes(F.AngledHeaders);
+    if (AngledFilter.has_value()) {
+      Out.Apply.push_back(
+          [AngledFilter = *AngledFilter](const Params &, Config &C) {
+            C.Style.AngledHeaders.emplace_back(AngledFilter);
+          });
+    }
+  }
+
+  auto compileHeaderRegexes(llvm::ArrayRef<Located<std::string>> HeaderPatterns)
+      -> std::optional<std::function<bool(llvm::StringRef)>> {
+    // TODO: Share this code with Diagnostics.Includes.IgnoreHeader
+#ifdef CLANGD_PATH_CASE_INSENSITIVE
+    static llvm::Regex::RegexFlags Flags = llvm::Regex::IgnoreCase;
+#else
+    static llvm::Regex::RegexFlags Flags = llvm::Regex::NoFlags;
+#endif
+    auto Filters = std::make_shared<std::vector<llvm::Regex>>();
+    for (auto &HeaderPattern : HeaderPatterns) {
+      // Anchor on the right.
+      std::string AnchoredPattern = "(" + *HeaderPattern + ")$";
+      llvm::Regex CompiledRegex(AnchoredPattern, Flags);
+      std::string RegexError;
+      if (!CompiledRegex.isValid(RegexError)) {
+        diag(Warning,
+             llvm::formatv("Invalid regular expression '{0}': {1}",
+                           *HeaderPattern, RegexError)
+                 .str(),
+             HeaderPattern.Range);
+        continue;
+      }
+      Filters->push_back(std::move(CompiledRegex));
+    }
+    if (Filters->empty())
+      return std::nullopt;
+    auto Filter = [Filters](llvm::StringRef Path) {
+      for (auto &Regex : *Filters)
+        if (Regex.match(Path))
+          return true;
+      return false;
+    };
+    return Filter;
   }
 
   void appendTidyCheckSpec(std::string &CurSpec,
diff --git a/clang-tools-extra/clangd/ConfigFragment.h b/clang-tools-extra/clangd/ConfigFragment.h
index 36f7d04..9535b202 100644
--- a/clang-tools-extra/clangd/ConfigFragment.h
+++ b/clang-tools-extra/clangd/ConfigFragment.h
@@ -301,6 +301,23 @@ struct Fragment {
     // ::). All nested namespaces are affected as well.
     // Affects availability of the AddUsing tweak.
     std::vector<Located<std::string>> FullyQualifiedNamespaces;
+
+    /// List of regexes for headers that should always be included with a
+    /// ""-style include. By default, and in case of a conflict with
+    /// AngledHeaders (i.e. a header matches a regex in both QuotedHeaders and
+    /// AngledHeaders), system headers use <> and non-system headers use "".
+    /// These can match any suffix of the header file in question.
+    /// Matching is performed against the header text, not its absolute path
+    /// within the project.
+    std::vector<Located<std::string>> QuotedHeaders;
+    /// List of regexes for headers that should always be included with a
+    /// <>-style include. By default, and in case of a conflict with
+    /// AngledHeaders (i.e. a header matches a regex in both QuotedHeaders and
+    /// AngledHeaders), system headers use <> and non-system headers use "".
+    /// These can match any suffix of the header file in question.
+    /// Matching is performed against the header text, not its absolute path
+    /// within the project.
+    std::vector<Located<std::string>> AngledHeaders;
   };
   StyleBlock Style;
 
diff --git a/clang-tools-extra/clangd/ConfigYAML.cpp b/clang-tools-extra/clangd/ConfigYAML.cpp
index 32e0289..95cc5c1 100644
--- a/clang-tools-extra/clangd/ConfigYAML.cpp
+++ b/clang-tools-extra/clangd/ConfigYAML.cpp
@@ -116,6 +116,14 @@ private:
       if (auto Values = scalarValues(N))
         F.FullyQualifiedNamespaces = std::move(*Values);
     });
+    Dict.handle("QuotedHeaders", [&](Node &N) {
+      if (auto Values = scalarValues(N))
+        F.QuotedHeaders = std::move(*Values);
+    });
+    Dict.handle("AngledHeaders", [&](Node &N) {
+      if (auto Values = scalarValues(N))
+        F.AngledHeaders = std::move(*Values);
+    });
     Dict.parse(N);
   }
 
diff --git a/clang-tools-extra/clangd/Headers.cpp b/clang-tools-extra/clangd/Headers.cpp
index b537417..0ffd9ee 100644
--- a/clang-tools-extra/clangd/Headers.cpp
+++ b/clang-tools-extra/clangd/Headers.cpp
@@ -9,6 +9,7 @@
 #include "Headers.h"
 #include "Preamble.h"
 #include "SourceCode.h"
+#include "support/Logger.h"
 #include "clang/Basic/SourceLocation.h"
 #include "clang/Basic/SourceManager.h"
 #include "clang/Frontend/CompilerInstance.h"
@@ -30,8 +31,7 @@ namespace clangd {
 class IncludeStructure::RecordHeaders : public PPCallbacks {
 public:
   RecordHeaders(const CompilerInstance &CI, IncludeStructure *Out)
-      : SM(CI.getSourceManager()),
-        Out(Out) {}
+      : SM(CI.getSourceManager()), Out(Out) {}
 
   // Record existing #includes - both written and resolved paths. Only #includes
   // in the main file are collected.
@@ -287,11 +287,11 @@ IncludeInserter::calculateIncludePath(const HeaderFile &InsertedHeader,
   assert(InsertedHeader.valid());
   if (InsertedHeader.Verbatim)
     return InsertedHeader.File;
-  bool IsAngled = false;
+  bool IsAngledByDefault = false;
   std::string Suggested;
   if (HeaderSearchInfo) {
     Suggested = HeaderSearchInfo->suggestPathToFileForDiagnostics(
-        InsertedHeader.File, BuildDir, IncludingFile, &IsAngled);
+        InsertedHeader.File, BuildDir, IncludingFile, &IsAngledByDefault);
   } else {
     // Calculate include relative to including file only.
     StringRef IncludingDir = llvm::sys::path::parent_path(IncludingFile);
@@ -304,9 +304,33 @@ IncludeInserter::calculateIncludePath(const HeaderFile &InsertedHeader,
   // FIXME: should we allow (some limited number of) "../header.h"?
   if (llvm::sys::path::is_absolute(Suggested))
     return std::nullopt;
+  bool IsAngled = false;
+  for (auto Filter : AngledHeaders) {
+    if (Filter(Suggested)) {
+      IsAngled = true;
+      break;
+    }
+  }
+  bool IsQuoted = false;
+  for (auto Filter : QuotedHeaders) {
+    if (Filter(Suggested)) {
+      IsQuoted = true;
+      break;
+    }
+  }
+  // No filters apply, or both filters apply (a bug), use system default.
+  if (IsAngled == IsQuoted) {
+    // Probably a bug in the config regex.
+    if (IsAngled && IsQuoted) {
+      elog("Header '{0}' matches both quoted and angled regexes, default will "
+           "be used.",
+           Suggested);
+    }
+    IsAngled = IsAngledByDefault;
+  }
   if (IsAngled)
     Suggested = "<" + Suggested + ">";
-  else
+  else // if (IsQuoted)
     Suggested = "\"" + Suggested + "\"";
   return Suggested;
 }
diff --git a/clang-tools-extra/clangd/Headers.h b/clang-tools-extra/clangd/Headers.h
index 41cf3de..b91179d 100644
--- a/clang-tools-extra/clangd/Headers.h
+++ b/clang-tools-extra/clangd/Headers.h
@@ -33,6 +33,8 @@
 namespace clang {
 namespace clangd {
 
+using HeaderFilter = llvm::ArrayRef<std::function<bool(llvm::StringRef)>>;
+
 /// Returns true if \p Include is literal include like "path" or <path>.
 bool isLiteralInclude(llvm::StringRef Include);
 
@@ -211,10 +213,12 @@ public:
   // include path of non-verbatim header will not be shortened.
   IncludeInserter(StringRef FileName, StringRef Code,
                   const format::FormatStyle &Style, StringRef BuildDir,
-                  HeaderSearch *HeaderSearchInfo)
+                  HeaderSearch *HeaderSearchInfo, HeaderFilter QuotedHeaders,
+                  HeaderFilter AngledHeaders)
       : FileName(FileName), Code(Code), BuildDir(BuildDir),
         HeaderSearchInfo(HeaderSearchInfo),
-        Inserter(FileName, Code, Style.IncludeStyle) {}
+        Inserter(FileName, Code, Style.IncludeStyle),
+        QuotedHeaders(QuotedHeaders), AngledHeaders(AngledHeaders) {}
 
   void addExisting(const Inclusion &Inc);
 
@@ -258,6 +262,8 @@ private:
   HeaderSearch *HeaderSearchInfo = nullptr;
   llvm::StringSet<> IncludedHeaders; // Both written and resolved.
   tooling::HeaderIncludes Inserter;  // Computers insertion replacement.
+  HeaderFilter QuotedHeaders;
+  HeaderFilter AngledHeaders;
 };
 
 } // namespace clangd
diff --git a/clang-tools-extra/clangd/HeuristicResolver.h b/clang-tools-extra/clangd/HeuristicResolver.h
index dcc063b..c130e06 100644
--- a/clang-tools-extra/clangd/HeuristicResolver.h
+++ b/clang-tools-extra/clangd/HeuristicResolver.h
@@ -26,13 +26,14 @@ class UnresolvedUsingValueDecl;
 
 namespace clangd {
 
-// This class heuristic resolution of declarations and types in template code.
+// This class handles heuristic resolution of declarations and types in template
+// code.
 //
 // As a compiler, clang only needs to perform certain types of processing on
 // template code (such as resolving dependent names to declarations, or
 // resolving the type of a dependent expression) after instantiation. Indeed,
 // C++ language features such as template specialization mean such resolution
-// cannot be done accurately before instantiation
+// cannot be done accurately before instantiation.
 //
 // However, template code is written and read in uninstantiated form, and clangd
 // would like to provide editor features like go-to-definition in template code
diff --git a/clang-tools-extra/clangd/IncludeCleaner.h b/clang-tools-extra/clangd/IncludeCleaner.h
index a01146d..3f6e3b2 100644
--- a/clang-tools-extra/clangd/IncludeCleaner.h
+++ b/clang-tools-extra/clangd/IncludeCleaner.h
@@ -57,7 +57,6 @@ IncludeCleanerFindings
 computeIncludeCleanerFindings(ParsedAST &AST,
                               bool AnalyzeAngledIncludes = false);
 
-using HeaderFilter = llvm::ArrayRef<std::function<bool(llvm::StringRef)>>;
 std::vector<Diag>
 issueIncludeCleanerDiagnostics(ParsedAST &AST, llvm::StringRef Code,
                                const IncludeCleanerFindings &Findings,
diff --git a/clang-tools-extra/clangd/ParsedAST.cpp b/clang-tools-extra/clangd/ParsedAST.cpp
index 045d32a..725cbeb 100644
--- a/clang-tools-extra/clangd/ParsedAST.cpp
+++ b/clang-tools-extra/clangd/ParsedAST.cpp
@@ -639,7 +639,8 @@ ParsedAST::build(llvm::StringRef Filename, const ParseInputs &Inputs,
           getFormatStyleForFile(Filename, Inputs.Contents, *Inputs.TFS, false);
       auto Inserter = std::make_shared<IncludeInserter>(
           Filename, Inputs.Contents, Style, BuildDir.get(),
-          &Clang->getPreprocessor().getHeaderSearchInfo());
+          &Clang->getPreprocessor().getHeaderSearchInfo(),
+          Cfg.Style.QuotedHeaders, Cfg.Style.AngledHeaders);
       ArrayRef<Inclusion> MainFileIncludes;
       if (Preamble) {
         MainFileIncludes = Preamble->Includes.MainFileIncludes;
diff --git a/clang-tools-extra/clangd/tool/ClangdMain.cpp b/clang-tools-extra/clangd/tool/ClangdMain.cpp
index 80a0653..7148917 100644
--- a/clang-tools-extra/clangd/tool/ClangdMain.cpp
+++ b/clang-tools-extra/clangd/tool/ClangdMain.cpp
@@ -242,13 +242,13 @@ opt<std::string> FallbackStyle{
     init(clang::format::DefaultFallbackStyle),
 };
 
-opt<int> EnableFunctionArgSnippets{
+opt<std::string> EnableFunctionArgSnippets{
     "function-arg-placeholders",
     cat(Features),
     desc("When disabled (0), completions contain only parentheses for "
          "function calls. When enabled (1), completions also contain "
          "placeholders for method parameters"),
-    init(-1),
+    init("-1"),
 };
 
 opt<CodeCompleteOptions::IncludeInsertion> HeaderInsertion{
@@ -636,6 +636,22 @@ loadExternalIndex(const Config::ExternalIndexSpec &External,
   llvm_unreachable("Invalid ExternalIndexKind.");
 }
 
+std::optional<bool> shouldEnableFunctionArgSnippets() {
+  std::string Val = EnableFunctionArgSnippets;
+  // Accept the same values that a bool option parser would, but also accept
+  // -1 to indicate "unspecified", in which case the ArgumentListsPolicy
+  // config option will be respected.
+  if (Val == "1" || Val == "true" || Val == "True" || Val == "TRUE")
+    return true;
+  if (Val == "0" || Val == "false" || Val == "False" || Val == "FALSE")
+    return false;
+  if (Val != "-1")
+    elog("Value specified by --function-arg-placeholders is invalid. Provide a "
+         "boolean value or leave unspecified to use ArgumentListsPolicy from "
+         "config instead.");
+  return std::nullopt;
+}
+
 class FlagsConfigProvider : public config::Provider {
 private:
   config::CompiledFragment Frag;
@@ -696,10 +712,9 @@ public:
       BGPolicy = Config::BackgroundPolicy::Skip;
     }
 
-    if (EnableFunctionArgSnippets >= 0) {
-      ArgumentLists = EnableFunctionArgSnippets
-                          ? Config::ArgumentListsPolicy::FullPlaceholders
-                          : Config::ArgumentListsPolicy::Delimiters;
+    if (std::optional<bool> Enable = shouldEnableFunctionArgSnippets()) {
+      ArgumentLists = *Enable ? Config::ArgumentListsPolicy::FullPlaceholders
+                              : Config::ArgumentListsPolicy::Delimiters;
     }
 
     Frag = [=](const config::Params &, Config &C) {
diff --git a/clang-tools-extra/clangd/unittests/CMakeLists.txt b/clang-tools-extra/clangd/unittests/CMakeLists.txt
index dffdcd5..8dba808 100644
--- a/clang-tools-extra/clangd/unittests/CMakeLists.txt
+++ b/clang-tools-extra/clangd/unittests/CMakeLists.txt
@@ -64,6 +64,7 @@ add_unittest(ClangdUnitTests ClangdTests
   GlobalCompilationDatabaseTests.cpp
   HeadersTests.cpp
   HeaderSourceSwitchTests.cpp
+  HeuristicResolverTests.cpp
   HoverTests.cpp
   IncludeCleanerTests.cpp
   IndexActionTests.cpp
diff --git a/clang-tools-extra/clangd/unittests/ClangdLSPServerTests.cpp b/clang-tools-extra/clangd/unittests/ClangdLSPServerTests.cpp
index 49a9404..2c7f50d 100644
--- a/clang-tools-extra/clangd/unittests/ClangdLSPServerTests.cpp
+++ b/clang-tools-extra/clangd/unittests/ClangdLSPServerTests.cpp
@@ -208,12 +208,13 @@ TEST_F(LSPTest, ClangTidyRename) {
   Annotations Source(R"cpp(
     void [[foo]]() {}
   )cpp");
-  Opts.ClangTidyProvider = [](tidy::ClangTidyOptions &ClangTidyOpts,
-                              llvm::StringRef) {
+  constexpr auto ClangTidyProvider = [](tidy::ClangTidyOptions &ClangTidyOpts,
+                                        llvm::StringRef) {
     ClangTidyOpts.Checks = {"-*,readability-identifier-naming"};
     ClangTidyOpts.CheckOptions["readability-identifier-naming.FunctionCase"] =
         "CamelCase";
   };
+  Opts.ClangTidyProvider = ClangTidyProvider;
   auto &Client = start();
   Client.didOpen("foo.hpp", Header.code());
   Client.didOpen("foo.cpp", Source.code());
@@ -266,10 +267,11 @@ TEST_F(LSPTest, ClangTidyCrash_Issue109367) {
   // This test requires clang-tidy checks to be linked in.
   if (!CLANGD_TIDY_CHECKS)
     return;
-  Opts.ClangTidyProvider = [](tidy::ClangTidyOptions &ClangTidyOpts,
-                              llvm::StringRef) {
+  constexpr auto ClangTidyProvider = [](tidy::ClangTidyOptions &ClangTidyOpts,
+                                        llvm::StringRef) {
     ClangTidyOpts.Checks = {"-*,boost-use-ranges"};
   };
+  Opts.ClangTidyProvider = ClangTidyProvider;
   // Check that registering the boost-use-ranges checker's matchers
   // on two different threads does not cause a crash.
   auto &Client = start();
diff --git a/clang-tools-extra/clangd/unittests/CodeCompleteTests.cpp b/clang-tools-extra/clangd/unittests/CodeCompleteTests.cpp
index 3acacf4..9d48a6e 100644
--- a/clang-tools-extra/clangd/unittests/CodeCompleteTests.cpp
+++ b/clang-tools-extra/clangd/unittests/CodeCompleteTests.cpp
@@ -920,6 +920,41 @@ TEST(CompletionTest, NoIncludeInsertionWhenDeclFoundInFile) {
                           AllOf(named("Y"), Not(insertInclude()))));
 }
 
+TEST(CompletionTest, IncludeInsertionRespectsQuotedAngledConfig) {
+  TestTU TU;
+  TU.ExtraArgs.push_back("-I" + testPath("sub"));
+  TU.AdditionalFiles["sub/bar.h"] = "";
+  auto BarURI = URI::create(testPath("sub/bar.h")).toString();
+
+  Symbol Sym = cls("ns::X");
+  Sym.CanonicalDeclaration.FileURI = BarURI.c_str();
+  Sym.IncludeHeaders.emplace_back(BarURI, 1, Symbol::Include);
+  Annotations Test("int main() { ns::^ }");
+  TU.Code = Test.code().str();
+  auto Results = completions(TU, Test.point(), {Sym});
+  // Default for a local path is quoted include
+  EXPECT_THAT(Results.Completions,
+              ElementsAre(AllOf(named("X"), insertInclude("\"bar.h\""))));
+  {
+    Config C;
+    C.Style.AngledHeaders.push_back(
+        [](auto header) { return header == "bar.h"; });
+    WithContextValue WithCfg(Config::Key, std::move(C));
+    Results = completions(TU, Test.point(), {Sym});
+    EXPECT_THAT(Results.Completions,
+                ElementsAre(AllOf(named("X"), insertInclude("<bar.h>"))));
+  }
+  {
+    Config C;
+    C.Style.QuotedHeaders.push_back(
+        [](auto header) { return header == "bar.h"; });
+    WithContextValue WithCfg(Config::Key, std::move(C));
+    Results = completions(TU, Test.point(), {Sym});
+    EXPECT_THAT(Results.Completions,
+                ElementsAre(AllOf(named("X"), insertInclude("\"bar.h\""))));
+  }
+}
+
 TEST(CompletionTest, IndexSuppressesPreambleCompletions) {
   Annotations Test(R"cpp(
       #include "bar.h"
@@ -1138,8 +1173,8 @@ TEST(CodeCompleteTest, NoColonColonAtTheEnd) {
 }
 
 TEST(CompletionTests, EmptySnippetDoesNotCrash) {
-    // See https://github.com/clangd/clangd/issues/1216
-    auto Results = completions(R"cpp(
+  // See https://github.com/clangd/clangd/issues/1216
+  auto Results = completions(R"cpp(
         int main() {
           auto w = [&](auto &&f) { return f(f); };
           auto f = w([&](auto &&f) {
@@ -1155,18 +1190,18 @@ TEST(CompletionTests, EmptySnippetDoesNotCrash) {
 }
 
 TEST(CompletionTest, Issue1427Crash) {
-    // Need to provide main file signals to ensure that the branch in
-    // SymbolRelevanceSignals::computeASTSignals() that tries to
-    // compute a symbol ID is taken.
-    ASTSignals MainFileSignals;
-    CodeCompleteOptions Opts;
-    Opts.MainFileSignals = &MainFileSignals;
-    completions(R"cpp(
+  // Need to provide main file signals to ensure that the branch in
+  // SymbolRelevanceSignals::computeASTSignals() that tries to
+  // compute a symbol ID is taken.
+  ASTSignals MainFileSignals;
+  CodeCompleteOptions Opts;
+  Opts.MainFileSignals = &MainFileSignals;
+  completions(R"cpp(
       auto f = []() {
         1.0_^
       };
     )cpp",
-                {}, Opts);
+              {}, Opts);
 }
 
 TEST(CompletionTest, BacktrackCrashes) {
diff --git a/clang-tools-extra/clangd/unittests/ConfigCompileTests.cpp b/clang-tools-extra/clangd/unittests/ConfigCompileTests.cpp
index 4ecfdf0..179960a 100644
--- a/clang-tools-extra/clangd/unittests/ConfigCompileTests.cpp
+++ b/clang-tools-extra/clangd/unittests/ConfigCompileTests.cpp
@@ -545,6 +545,44 @@ TEST_F(ConfigCompileTests, Style) {
   Frag.Style.FullyQualifiedNamespaces.push_back(std::string("bar"));
   EXPECT_TRUE(compileAndApply());
   EXPECT_THAT(Conf.Style.FullyQualifiedNamespaces, ElementsAre("foo", "bar"));
+
+  {
+    Frag = {};
+    EXPECT_TRUE(Conf.Style.QuotedHeaders.empty())
+        << Conf.Style.QuotedHeaders.size();
+    Frag.Style.QuotedHeaders.push_back(Located<std::string>("foo.h"));
+    Frag.Style.QuotedHeaders.push_back(Located<std::string>(".*inc"));
+    EXPECT_TRUE(compileAndApply());
+    auto HeaderFilter = [this](llvm::StringRef Path) {
+      for (auto &Filter : Conf.Style.QuotedHeaders) {
+        if (Filter(Path))
+          return true;
+      }
+      return false;
+    };
+    EXPECT_TRUE(HeaderFilter("foo.h"));
+    EXPECT_TRUE(HeaderFilter("prefix/foo.h"));
+    EXPECT_FALSE(HeaderFilter("bar.h"));
+    EXPECT_FALSE(HeaderFilter("foo.h/bar.h"));
+  }
+
+  {
+    Frag = {};
+    EXPECT_TRUE(Conf.Style.AngledHeaders.empty())
+        << Conf.Style.AngledHeaders.size();
+    Frag.Style.AngledHeaders.push_back(Located<std::string>("foo.h"));
+    Frag.Style.AngledHeaders.push_back(Located<std::string>(".*inc"));
+    EXPECT_TRUE(compileAndApply());
+    auto HeaderFilter = [this](llvm::StringRef Path) {
+      for (auto &Filter : Conf.Style.AngledHeaders) {
+        if (Filter(Path))
+          return true;
+      }
+      return false;
+    };
+    EXPECT_TRUE(HeaderFilter("foo.h"));
+    EXPECT_FALSE(HeaderFilter("bar.h"));
+  }
 }
 } // namespace
 } // namespace config
diff --git a/clang-tools-extra/clangd/unittests/ConfigYAMLTests.cpp b/clang-tools-extra/clangd/unittests/ConfigYAMLTests.cpp
index 10d67de..979d7254 100644
--- a/clang-tools-extra/clangd/unittests/ConfigYAMLTests.cpp
+++ b/clang-tools-extra/clangd/unittests/ConfigYAMLTests.cpp
@@ -297,13 +297,19 @@ TEST(ParseYAML, Style) {
   CapturedDiags Diags;
   Annotations YAML(R"yaml(
 Style:
-  FullyQualifiedNamespaces: [foo, bar])yaml");
+  FullyQualifiedNamespaces: [foo, bar]
+  AngledHeaders: ["foo", "bar"]
+  QuotedHeaders: ["baz", "baar"])yaml");
   auto Results =
       Fragment::parseYAML(YAML.code(), "config.yaml", Diags.callback());
   ASSERT_THAT(Diags.Diagnostics, IsEmpty());
   ASSERT_EQ(Results.size(), 1u);
   EXPECT_THAT(Results[0].Style.FullyQualifiedNamespaces,
               ElementsAre(val("foo"), val("bar")));
+  EXPECT_THAT(Results[0].Style.AngledHeaders,
+              ElementsAre(val("foo"), val("bar")));
+  EXPECT_THAT(Results[0].Style.QuotedHeaders,
+              ElementsAre(val("baz"), val("baar")));
 }
 } // namespace
 } // namespace config
diff --git a/clang-tools-extra/clangd/unittests/HeadersTests.cpp b/clang-tools-extra/clangd/unittests/HeadersTests.cpp
index dc6adae..751383e3 100644
--- a/clang-tools-extra/clangd/unittests/HeadersTests.cpp
+++ b/clang-tools-extra/clangd/unittests/HeadersTests.cpp
@@ -107,7 +107,8 @@ protected:
 
     IncludeInserter Inserter(MainFile, /*Code=*/"", format::getLLVMStyle(),
                              CDB.getCompileCommand(MainFile)->Directory,
-                             &Clang->getPreprocessor().getHeaderSearchInfo());
+                             &Clang->getPreprocessor().getHeaderSearchInfo(),
+                             QuotedHeaders, AngledHeaders);
     for (const auto &Inc : Inclusions)
       Inserter.addExisting(Inc);
     auto Inserted = ToHeaderFile(Preferred);
@@ -127,7 +128,8 @@ protected:
 
     IncludeInserter Inserter(MainFile, /*Code=*/"", format::getLLVMStyle(),
                              CDB.getCompileCommand(MainFile)->Directory,
-                             &Clang->getPreprocessor().getHeaderSearchInfo());
+                             &Clang->getPreprocessor().getHeaderSearchInfo(),
+                             QuotedHeaders, AngledHeaders);
     auto Edit = Inserter.insert(VerbatimHeader, Directive);
     Action.EndSourceFile();
     return Edit;
@@ -139,6 +141,8 @@ protected:
   std::string Subdir = testPath("sub");
   std::string SearchDirArg = (llvm::Twine("-I") + Subdir).str();
   IgnoringDiagConsumer IgnoreDiags;
+  std::vector<std::function<bool(llvm::StringRef)>> QuotedHeaders;
+  std::vector<std::function<bool(llvm::StringRef)>> AngledHeaders;
   std::unique_ptr<CompilerInstance> Clang;
 };
 
@@ -304,6 +308,9 @@ TEST_F(HeadersTest, InsertInclude) {
   std::string Path = testPath("sub/bar.h");
   FS.Files[Path] = "";
   EXPECT_EQ(calculate(Path), "\"bar.h\"");
+
+  AngledHeaders.push_back([](auto Path) { return true; });
+  EXPECT_EQ(calculate(Path), "<bar.h>");
 }
 
 TEST_F(HeadersTest, DoNotInsertIfInSameFile) {
@@ -326,6 +333,17 @@ TEST_F(HeadersTest, ShortenIncludesInSearchPath) {
   EXPECT_EQ(calculate(BarHeader), "\"sub/bar.h\"");
 }
 
+TEST_F(HeadersTest, ShortenIncludesInSearchPathBracketed) {
+  AngledHeaders.push_back([](auto Path) { return true; });
+  std::string BarHeader = testPath("sub/bar.h");
+  EXPECT_EQ(calculate(BarHeader), "<bar.h>");
+
+  SearchDirArg = (llvm::Twine("-I") + Subdir + "/..").str();
+  CDB.ExtraClangFlags = {SearchDirArg.c_str()};
+  BarHeader = testPath("sub/bar.h");
+  EXPECT_EQ(calculate(BarHeader), "<sub/bar.h>");
+}
+
 TEST_F(HeadersTest, ShortenedIncludeNotInSearchPath) {
   std::string BarHeader =
       llvm::sys::path::convert_to_slash(testPath("sub-2/bar.h"));
@@ -338,6 +356,10 @@ TEST_F(HeadersTest, PreferredHeader) {
 
   std::string BazHeader = testPath("sub/baz.h");
   EXPECT_EQ(calculate(BarHeader, BazHeader), "\"baz.h\"");
+
+  AngledHeaders.push_back([](auto Path) { return true; });
+  std::string BiffHeader = testPath("sub/biff.h");
+  EXPECT_EQ(calculate(BarHeader, BiffHeader), "<biff.h>");
 }
 
 TEST_F(HeadersTest, DontInsertDuplicatePreferred) {
@@ -370,7 +392,8 @@ TEST_F(HeadersTest, PreferInserted) {
 TEST(Headers, NoHeaderSearchInfo) {
   std::string MainFile = testPath("main.cpp");
   IncludeInserter Inserter(MainFile, /*Code=*/"", format::getLLVMStyle(),
-                           /*BuildDir=*/"", /*HeaderSearchInfo=*/nullptr);
+                           /*BuildDir=*/"", /*HeaderSearchInfo=*/nullptr,
+                           /*QuotedHeaders=*/{}, /*AngledHeaders=*/{});
 
   auto HeaderPath = testPath("sub/bar.h");
   auto Inserting = HeaderFile{HeaderPath, /*Verbatim=*/false};
diff --git a/clang-tools-extra/clangd/unittests/HeuristicResolverTests.cpp b/clang-tools-extra/clangd/unittests/HeuristicResolverTests.cpp
new file mode 100644
index 0000000..e4b3822
--- /dev/null
+++ b/clang-tools-extra/clangd/unittests/HeuristicResolverTests.cpp
@@ -0,0 +1,542 @@
+//===-- HeuristicResolverTests.cpp --------------------------*- C++ -*-----===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+#include "HeuristicResolver.h"
+#include "clang/ASTMatchers/ASTMatchFinder.h"
+#include "clang/ASTMatchers/ASTMatchers.h"
+#include "clang/Tooling/Tooling.h"
+#include "gmock/gmock-matchers.h"
+#include "gtest/gtest.h"
+
+using namespace clang::ast_matchers;
+using clang::clangd::HeuristicResolver;
+using testing::ElementsAre;
+
+namespace clang {
+namespace {
+
+// Helper for matching a sequence of elements with a variadic list of matchers.
+// Usage: `ElementsAre(matchAdapter(Vs, MatchFunction)...)`, where `Vs...` is
+//        a variadic list of matchers.
+// For each `V` in `Vs`, this will match the corresponding element `E` if
+// `MatchFunction(V, E)` is true.
+MATCHER_P2(matchAdapter, MatcherForElement, MatchFunction, "matchAdapter") {
+  return MatchFunction(MatcherForElement, arg);
+}
+
+template <typename InputNode>
+using ResolveFnT = std::function<std::vector<const NamedDecl *>(
+    const HeuristicResolver *, const InputNode *)>;
+
+// Test heuristic resolution on `Code` using the resolution procedure
+// `ResolveFn`, which takes a `HeuristicResolver` and an input AST node of type
+// `InputNode` and returns a `std::vector<const NamedDecl *>`.
+// `InputMatcher` should be an AST matcher that matches a single node to pass as
+// input to `ResolveFn`, bound to the ID "input". `OutputMatchers` should be AST
+// matchers that each match a single node, bound to the ID "output".
+template <typename InputNode, typename InputMatcher, typename... OutputMatchers>
+void expectResolution(llvm::StringRef Code, ResolveFnT<InputNode> ResolveFn,
+                      const InputMatcher &IM, const OutputMatchers &...OMS) {
+  auto TU = tooling::buildASTFromCodeWithArgs(Code, {"-std=c++20"});
+  auto &Ctx = TU->getASTContext();
+  auto InputMatches = match(IM, Ctx);
+  ASSERT_EQ(1u, InputMatches.size());
+  const auto *Input = InputMatches[0].template getNodeAs<InputNode>("input");
+  ASSERT_TRUE(Input);
+
+  auto OutputNodeMatches = [&](auto &OutputMatcher, auto &Actual) {
+    auto OutputMatches = match(OutputMatcher, Ctx);
+    if (OutputMatches.size() != 1u)
+      return false;
+    const auto *ExpectedOutput =
+        OutputMatches[0].template getNodeAs<NamedDecl>("output");
+    if (!ExpectedOutput)
+      return false;
+    return ExpectedOutput == Actual;
+  };
+
+  HeuristicResolver H(Ctx);
+  auto Results = ResolveFn(&H, Input);
+  EXPECT_THAT(Results, ElementsAre(matchAdapter(OMS, OutputNodeMatches)...));
+}
+
+// Wrapper for the above that accepts a HeuristicResolver member function
+// pointer directly.
+template <typename InputNode, typename InputMatcher, typename... OutputMatchers>
+void expectResolution(llvm::StringRef Code,
+                      std::vector<const NamedDecl *> (
+                          HeuristicResolver::*ResolveFn)(const InputNode *)
+                          const,
+                      const InputMatcher &IM, const OutputMatchers &...OMS) {
+  expectResolution(Code, ResolveFnT<InputNode>(std::mem_fn(ResolveFn)), IM,
+                   OMS...);
+}
+
+TEST(HeuristicResolver, MemberExpr) {
+  std::string Code = R"cpp(
+    template <typename T>
+    struct S {
+      void bar() {}
+    };
+
+    template <typename T>
+    void foo(S<T> arg) {
+      arg.bar();
+    }
+  )cpp";
+  // Test resolution of "bar" in "arg.bar()".
+  expectResolution(
+      Code, &HeuristicResolver::resolveMemberExpr,
+      cxxDependentScopeMemberExpr(hasMemberName("bar")).bind("input"),
+      cxxMethodDecl(hasName("bar")).bind("output"));
+}
+
+TEST(HeuristicResolver, MemberExpr_Overloads) {
+  std::string Code = R"cpp(
+    template <typename T>
+    struct S {
+      void bar(int);
+      void bar(float);
+    };
+
+    template <typename T, typename U>
+    void foo(S<T> arg, U u) {
+      arg.bar(u);
+    }
+  )cpp";
+  // Test resolution of "bar" in "arg.bar(u)". Both overloads should be found.
+  expectResolution(
+      Code, &HeuristicResolver::resolveMemberExpr,
+      cxxDependentScopeMemberExpr(hasMemberName("bar")).bind("input"),
+      cxxMethodDecl(hasName("bar"), hasParameter(0, hasType(asString("int"))))
+          .bind("output"),
+      cxxMethodDecl(hasName("bar"), hasParameter(0, hasType(asString("float"))))
+          .bind("output"));
+}
+
+TEST(HeuristicResolver, MemberExpr_SmartPointer) {
+  std::string Code = R"cpp(
+    template <typename> struct S { void foo() {} };
+    template <typename T> struct unique_ptr {
+      T* operator->();
+    };
+    template <typename T>
+    void test(unique_ptr<S<T>>& v) {
+      v->foo();
+    }
+  )cpp";
+  // Test resolution of "foo" in "v->foo()".
+  expectResolution(
+      Code, &HeuristicResolver::resolveMemberExpr,
+      cxxDependentScopeMemberExpr(hasMemberName("foo")).bind("input"),
+      cxxMethodDecl(hasName("foo")).bind("output"));
+}
+
+TEST(HeuristicResolver, MemberExpr_Chained) {
+  std::string Code = R"cpp(
+    struct A { void foo() {} };
+    template <typename T>
+    struct B {
+      A func(int);
+      void bar() {
+        func(1).foo();
+      }
+    };
+  )cpp";
+  // Test resolution of "foo" in "func(1).foo()".
+  expectResolution(
+      Code, &HeuristicResolver::resolveMemberExpr,
+      cxxDependentScopeMemberExpr(hasMemberName("foo")).bind("input"),
+      cxxMethodDecl(hasName("foo")).bind("output"));
+}
+
+TEST(HeuristicResolver, MemberExpr_TemplateArgs) {
+  std::string Code = R"cpp(
+    struct Foo {
+      static Foo k(int);
+      template <typename T> T convert();
+    };
+    template <typename T>
+    void test() {
+      Foo::k(T()).template convert<T>();
+    }
+  )cpp";
+  // Test resolution of "convert" in "Foo::k(T()).template convert<T>()".
+  expectResolution(
+      Code, &HeuristicResolver::resolveMemberExpr,
+      cxxDependentScopeMemberExpr(hasMemberName("convert")).bind("input"),
+      functionTemplateDecl(hasName("convert")).bind("output"));
+}
+
+TEST(HeuristicResolver, MemberExpr_TypeAlias) {
+  std::string Code = R"cpp(
+    template <typename T>
+    struct Waldo {
+      void find();
+    };
+    template <typename T>
+    using Wally = Waldo<T>;
+    template <typename T>
+    void foo(Wally<T> w) {
+      w.find();
+    }
+  )cpp";
+  // Test resolution of "find" in "w.find()".
+  expectResolution(
+      Code, &HeuristicResolver::resolveMemberExpr,
+      cxxDependentScopeMemberExpr(hasMemberName("find")).bind("input"),
+      cxxMethodDecl(hasName("find")).bind("output"));
+}
+
+TEST(HeuristicResolver, MemberExpr_BaseClass_TypeAlias) {
+  std::string Code = R"cpp(
+    template <typename T>
+    struct Waldo {
+      void find();
+    };
+    template <typename T>
+    using Wally = Waldo<T>;
+    template <typename T>
+    struct S : Wally<T> {
+      void foo() {
+        this->find();
+      }
+    };
+  )cpp";
+  // Test resolution of "find" in "this->find()".
+  expectResolution(
+      Code, &HeuristicResolver::resolveMemberExpr,
+      cxxDependentScopeMemberExpr(hasMemberName("find")).bind("input"),
+      cxxMethodDecl(hasName("find")).bind("output"));
+}
+
+TEST(HeuristicResolver, MemberExpr_Metafunction) {
+  std::string Code = R"cpp(
+    template <typename T>
+    struct Waldo {
+      void find();
+    };
+    template <typename T>
+    struct MetaWaldo {
+      using Type = Waldo<T>;
+    };
+    template <typename T>
+    void foo(typename MetaWaldo<T>::Type w) {
+      w.find();
+    }
+  )cpp";
+  // Test resolution of "find" in "w.find()".
+  expectResolution(
+      Code, &HeuristicResolver::resolveMemberExpr,
+      cxxDependentScopeMemberExpr(hasMemberName("find")).bind("input"),
+      cxxMethodDecl(hasName("find")).bind("output"));
+}
+
+TEST(HeuristicResolver, MemberExpr_DeducedNonTypeTemplateParameter) {
+  std::string Code = R"cpp(
+    template <int N>
+    struct Waldo {
+      const int found = N;
+    };
+    template <Waldo W>
+    int foo() {
+      return W.found;
+    }
+  )cpp";
+  // Test resolution of "found" in "W.found".
+  expectResolution(
+      Code, &HeuristicResolver::resolveMemberExpr,
+      cxxDependentScopeMemberExpr(hasMemberName("found")).bind("input"),
+      fieldDecl(hasName("found")).bind("output"));
+}
+
+TEST(HeuristicResolver, DeclRefExpr_StaticMethod) {
+  std::string Code = R"cpp(
+    template <typename T>
+    struct S {
+      static void bar() {}
+    };
+
+    template <typename T>
+    void foo() {
+      S<T>::bar();
+    }
+  )cpp";
+  // Test resolution of "bar" in "S<T>::bar()".
+  expectResolution(
+      Code, &HeuristicResolver::resolveDeclRefExpr,
+      dependentScopeDeclRefExpr(hasDependentName("bar")).bind("input"),
+      cxxMethodDecl(hasName("bar")).bind("output"));
+}
+
+TEST(HeuristicResolver, DeclRefExpr_StaticOverloads) {
+  std::string Code = R"cpp(
+    template <typename T>
+    struct S {
+      static void bar(int);
+      static void bar(float);
+    };
+
+    template <typename T, typename U>
+    void foo(U u) {
+      S<T>::bar(u);
+    }
+  )cpp";
+  // Test resolution of "bar" in "S<T>::bar(u)". Both overloads should be found.
+  expectResolution(
+      Code, &HeuristicResolver::resolveDeclRefExpr,
+      dependentScopeDeclRefExpr(hasDependentName("bar")).bind("input"),
+      cxxMethodDecl(hasName("bar"), hasParameter(0, hasType(asString("int"))))
+          .bind("output"),
+      cxxMethodDecl(hasName("bar"), hasParameter(0, hasType(asString("float"))))
+          .bind("output"));
+}
+
+TEST(HeuristicResolver, DeclRefExpr_Enumerator) {
+  std::string Code = R"cpp(
+    template <typename T>
+    struct Foo {
+      enum class E { A, B };
+      E e = E::A;
+    };
+  )cpp";
+  // Test resolution of "A" in "E::A".
+  expectResolution(
+      Code, &HeuristicResolver::resolveDeclRefExpr,
+      dependentScopeDeclRefExpr(hasDependentName("A")).bind("input"),
+      enumConstantDecl(hasName("A")).bind("output"));
+}
+
+TEST(HeuristicResolver, DeclRefExpr_RespectScope) {
+  std::string Code = R"cpp(
+    template <typename Info>
+    struct PointerIntPair {
+      void *getPointer() const { return Info::getPointer(); }
+    };
+  )cpp";
+  // Test resolution of "getPointer" in "Info::getPointer()".
+  // Here, we are testing that we do not incorrectly get the enclosing
+  // getPointer() function as a result.
+  expectResolution(
+      Code, &HeuristicResolver::resolveDeclRefExpr,
+      dependentScopeDeclRefExpr(hasDependentName("getPointer")).bind("input"));
+}
+
+TEST(HeuristicResolver, DependentNameType) {
+  std::string Code = R"cpp(
+    template <typename>
+    struct A {
+      struct B {};
+    };
+    template <typename T>
+    void foo(typename A<T>::B);
+  )cpp";
+  // Tests resolution of "B" in "A<T>::B".
+  expectResolution(
+      Code, &HeuristicResolver::resolveDependentNameType,
+      functionDecl(hasParameter(0, hasType(dependentNameType().bind("input")))),
+      classTemplateDecl(
+          has(cxxRecordDecl(has(cxxRecordDecl(hasName("B")).bind("output"))))));
+}
+
+TEST(HeuristicResolver, DependentNameType_Nested) {
+  std::string Code = R"cpp(
+    template <typename>
+    struct A {
+      struct B {
+        struct C {};
+      };
+    };
+    template <typename T>
+    void foo(typename A<T>::B::C);
+  )cpp";
+  // Tests resolution of "C" in "A<T>::B::C".
+  expectResolution(
+      Code, &HeuristicResolver::resolveDependentNameType,
+      functionDecl(hasParameter(0, hasType(dependentNameType().bind("input")))),
+      classTemplateDecl(has(cxxRecordDecl(has(
+          cxxRecordDecl(has(cxxRecordDecl(hasName("C")).bind("output"))))))));
+}
+
+TEST(HeuristicResolver, DependentNameType_Recursion) {
+  std::string Code = R"cpp(
+    template <int N>
+    struct Waldo {
+      using Type = typename Waldo<N - 1>::Type::Next;
+    };
+  )cpp";
+  // Test resolution of "Next" in "typename Waldo<N - 1>::Type::Next".
+  // Here, we are testing that we do not get into an infinite recursion.
+  expectResolution(Code, &HeuristicResolver::resolveDependentNameType,
+                   typeAliasDecl(hasType(dependentNameType().bind("input"))));
+}
+
+TEST(HeuristicResolver, DependentNameType_MutualRecursion) {
+  std::string Code = R"cpp(
+    template <int N>
+    struct Odd;
+    template <int N>
+    struct Even {
+      using Type = typename Odd<N - 1>::Type::Next;
+    };
+    template <int N>
+    struct Odd {
+      using Type = typename Even<N - 1>::Type::Next;
+    };
+  )cpp";
+  // Test resolution of "Next" in "typename Even<N - 1>::Type::Next".
+  // Similar to the above but we have two mutually recursive templates.
+  expectResolution(
+      Code, &HeuristicResolver::resolveDependentNameType,
+      classTemplateDecl(hasName("Odd"),
+                        has(cxxRecordDecl(has(typeAliasDecl(
+                            hasType(dependentNameType().bind("input"))))))));
+}
+
+TEST(HeuristicResolver, NestedNameSpecifier) {
+  // Test resolution of "B" in "A<T>::B::C".
+  // Unlike the "C", the "B" does not get its own DependentNameTypeLoc node,
+  // so the resolution uses the NestedNameSpecifier as input.
+  std::string Code = R"cpp(
+    template <typename>
+    struct A {
+      struct B {
+        struct C {};
+      };
+    };
+    template <typename T>
+    void foo(typename A<T>::B::C);
+  )cpp";
+  // Adapt the call to resolveNestedNameSpecifierToType() to the interface
+  // expected by expectResolution() (returning a vector of decls).
+  ResolveFnT<NestedNameSpecifier> ResolveFn =
+      [](const HeuristicResolver *H,
+         const NestedNameSpecifier *NNS) -> std::vector<const NamedDecl *> {
+    return {H->resolveNestedNameSpecifierToType(NNS)->getAsCXXRecordDecl()};
+  };
+  expectResolution(Code, ResolveFn,
+                   nestedNameSpecifier(hasPrefix(specifiesType(hasDeclaration(
+                                           classTemplateDecl(hasName("A"))))))
+                       .bind("input"),
+                   classTemplateDecl(has(cxxRecordDecl(
+                       has(cxxRecordDecl(hasName("B")).bind("output"))))));
+}
+
+TEST(HeuristicResolver, TemplateSpecializationType) {
+  std::string Code = R"cpp(
+    template <typename>
+    struct A {
+      template <typename>
+      struct B {};
+    };
+    template <typename T>
+    void foo(typename A<T>::template B<int>);
+  )cpp";
+  // Test resolution of "B" in "A<T>::template B<int>".
+  expectResolution(Code, &HeuristicResolver::resolveTemplateSpecializationType,
+                   functionDecl(hasParameter(0, hasType(type().bind("input")))),
+                   classTemplateDecl(has(cxxRecordDecl(
+                       has(classTemplateDecl(hasName("B")).bind("output"))))));
+}
+
+TEST(HeuristicResolver, DependentCall_NonMember) {
+  std::string Code = R"cpp(
+    template <typename T>
+    void nonmember(T);
+    template <typename T>
+    void bar(T t) {
+      nonmember(t);
+    }
+  )cpp";
+  // Test resolution of "nonmember" in "nonmember(t)".
+  expectResolution(Code, &HeuristicResolver::resolveCalleeOfCallExpr,
+                   callExpr(callee(unresolvedLookupExpr(hasAnyDeclaration(
+                                functionTemplateDecl(hasName("nonmember"))))))
+                       .bind("input"),
+                   functionTemplateDecl(hasName("nonmember")).bind("output"));
+}
+
+TEST(HeuristicResolver, DependentCall_Member) {
+  std::string Code = R"cpp(
+    template <typename T>
+    struct A {
+      void member(T);
+    };
+    template <typename T>
+    void bar(A<T> a, T t) {
+      a.member(t);
+    }
+  )cpp";
+  // Test resolution of "member" in "a.member(t)".
+  expectResolution(
+      Code, &HeuristicResolver::resolveCalleeOfCallExpr,
+      callExpr(callee(cxxDependentScopeMemberExpr(hasMemberName("member"))))
+          .bind("input"),
+      cxxMethodDecl(hasName("member")).bind("output"));
+}
+
+TEST(HeuristicResolver, DependentCall_StaticMember) {
+  std::string Code = R"cpp(
+    template <typename T>
+    struct A {
+      static void static_member(T);
+    };
+    template <typename T>
+    void bar(T t) {
+      A<T>::static_member(t);
+    }
+  )cpp";
+  // Test resolution of "static_member" in "A<T>::static_member(t)".
+  expectResolution(Code, &HeuristicResolver::resolveCalleeOfCallExpr,
+                   callExpr(callee(dependentScopeDeclRefExpr(
+                                hasDependentName("static_member"))))
+                       .bind("input"),
+                   cxxMethodDecl(hasName("static_member")).bind("output"));
+}
+
+TEST(HeuristicResolver, DependentCall_Overload) {
+  std::string Code = R"cpp(
+    void overload(int);
+    void overload(double);
+    template <typename T>
+    void bar(T t) {
+      overload(t);
+    }
+  )cpp";
+  // Test resolution of "overload" in "overload(t)". Both overload should be
+  // found.
+  expectResolution(Code, &HeuristicResolver::resolveCalleeOfCallExpr,
+                   callExpr(callee(unresolvedLookupExpr(hasAnyDeclaration(
+                                functionDecl(hasName("overload"))))))
+                       .bind("input"),
+                   functionDecl(hasName("overload"),
+                                hasParameter(0, hasType(asString("double"))))
+                       .bind("output"),
+                   functionDecl(hasName("overload"),
+                                hasParameter(0, hasType(asString("int"))))
+                       .bind("output"));
+}
+
+TEST(HeuristicResolver, UsingValueDecl) {
+  std::string Code = R"cpp(
+    template <typename T>
+    struct Base {
+      void waldo();
+    };
+    template <typename T>
+    struct Derived : Base<T> {
+      using Base<T>::waldo;
+    };
+  )cpp";
+  // Test resolution of "waldo" in "Base<T>::waldo".
+  expectResolution(Code, &HeuristicResolver::resolveUsingValueDecl,
+                   unresolvedUsingValueDecl(hasName("waldo")).bind("input"),
+                   cxxMethodDecl(hasName("waldo")).bind("output"));
+}
+
+} // namespace
+} // namespace clang
diff --git a/clang-tools-extra/clangd/unittests/Matchers.h b/clang-tools-extra/clangd/unittests/Matchers.h
index 0fbd93b..17d18dd 100644
--- a/clang-tools-extra/clangd/unittests/Matchers.h
+++ b/clang-tools-extra/clangd/unittests/Matchers.h
@@ -127,74 +127,6 @@ PolySubsequenceMatcher<Args...> HasSubsequence(Args &&... M) {
     llvm::consumeError(ComputedValue.takeError());                             \
   } while (false)
 
-// Implements the HasValue(m) matcher for matching an Optional whose
-// value matches matcher m.
-template <typename InnerMatcher> class OptionalMatcher {
-public:
-  explicit OptionalMatcher(const InnerMatcher &matcher) : matcher_(matcher) {}
-  OptionalMatcher(const OptionalMatcher&) = default;
-  OptionalMatcher &operator=(const OptionalMatcher&) = delete;
-
-  // This type conversion operator template allows Optional(m) to be
-  // used as a matcher for any Optional type whose value type is
-  // compatible with the inner matcher.
-  //
-  // The reason we do this instead of relying on
-  // MakePolymorphicMatcher() is that the latter is not flexible
-  // enough for implementing the DescribeTo() method of Optional().
-  template <typename Optional> operator Matcher<Optional>() const {
-    return MakeMatcher(new Impl<Optional>(matcher_));
-  }
-
-private:
-  // The monomorphic implementation that works for a particular optional type.
-  template <typename Optional>
-  class Impl : public ::testing::MatcherInterface<Optional> {
-  public:
-    using Value = typename std::remove_const<
-        typename std::remove_reference<Optional>::type>::type::value_type;
-
-    explicit Impl(const InnerMatcher &matcher)
-        : matcher_(::testing::MatcherCast<const Value &>(matcher)) {}
-
-    Impl(const Impl&) = default;
-    Impl &operator=(const Impl&) = delete;
-
-    virtual void DescribeTo(::std::ostream *os) const {
-      *os << "has a value that ";
-      matcher_.DescribeTo(os);
-    }
-
-    virtual void DescribeNegationTo(::std::ostream *os) const {
-      *os << "does not have a value that ";
-      matcher_.DescribeTo(os);
-    }
-
-    virtual bool
-    MatchAndExplain(Optional optional,
-                    ::testing::MatchResultListener *listener) const {
-      if (!optional)
-        return false;
-
-      *listener << "which has a value ";
-      return MatchPrintAndExplain(*optional, matcher_, listener);
-    }
-
-  private:
-    const Matcher<const Value &> matcher_;
-  };
-
-  const InnerMatcher matcher_;
-};
-
-// Creates a matcher that matches an Optional that has a value
-// that matches inner_matcher.
-template <typename InnerMatcher>
-inline OptionalMatcher<InnerMatcher>
-HasValue(const InnerMatcher &inner_matcher) {
-  return OptionalMatcher<InnerMatcher>(inner_matcher);
-}
-
 } // namespace clangd
 } // namespace clang
 #endif
diff --git a/clang-tools-extra/clangd/unittests/TypeHierarchyTests.cpp b/clang-tools-extra/clangd/unittests/TypeHierarchyTests.cpp
index 15158d8..406a842 100644
--- a/clang-tools-extra/clangd/unittests/TypeHierarchyTests.cpp
+++ b/clang-tools-extra/clangd/unittests/TypeHierarchyTests.cpp
@@ -28,6 +28,7 @@ using ::testing::ElementsAre;
 using ::testing::Field;
 using ::testing::IsEmpty;
 using ::testing::Matcher;
+using ::testing::Optional;
 using ::testing::SizeIs;
 using ::testing::UnorderedElementsAre;
 
@@ -38,12 +39,12 @@ MATCHER_P(selectionRangeIs, R, "") { return arg.selectionRange == R; }
 template <class... ParentMatchers>
 ::testing::Matcher<TypeHierarchyItem> parents(ParentMatchers... ParentsM) {
   return Field(&TypeHierarchyItem::parents,
-               HasValue(UnorderedElementsAre(ParentsM...)));
+               Optional(UnorderedElementsAre(ParentsM...)));
 }
 template <class... ChildMatchers>
 ::testing::Matcher<TypeHierarchyItem> children(ChildMatchers... ChildrenM) {
   return Field(&TypeHierarchyItem::children,
-               HasValue(UnorderedElementsAre(ChildrenM...)));
+               Optional(UnorderedElementsAre(ChildrenM...)));
 }
 // Note: "not resolved" is different from "resolved but empty"!
 MATCHER(parentsNotResolved, "") { return !arg.parents; }
@@ -790,7 +791,7 @@ struct Child : Parent1, Parent2 {};
       Children,
       UnorderedElementsAre(
           AllOf(withName("Child"),
-                withResolveParents(HasValue(UnorderedElementsAre(withResolveID(
+                withResolveParents(Optional(UnorderedElementsAre(withResolveID(
                     getSymbolID(&findDecl(AST, "Parent1")).str())))))));
 }
 
@@ -810,9 +811,9 @@ struct Chil^d : Parent {};
   ASSERT_THAT(Result, SizeIs(1));
   auto Parents = superTypes(Result.front(), Index.get());
 
-  EXPECT_THAT(Parents, HasValue(UnorderedElementsAre(
+  EXPECT_THAT(Parents, Optional(UnorderedElementsAre(
                            AllOf(withName("Parent"),
-                                 withResolveParents(HasValue(IsEmpty()))))));
+                                 withResolveParents(Optional(IsEmpty()))))));
 }
 } // namespace
 } // namespace clangd
diff --git a/clang-tools-extra/docs/ReleaseNotes.rst b/clang-tools-extra/docs/ReleaseNotes.rst
index fa3a8e5..94e1563 100644
--- a/clang-tools-extra/docs/ReleaseNotes.rst
+++ b/clang-tools-extra/docs/ReleaseNotes.rst
@@ -115,9 +115,12 @@ Improvements to clang-tidy
 - Improved :program:`run-clang-tidy.py` script. Fixed minor shutdown noise
   happening on certain platforms when interrupting the script.
 
+- Improved :program:`clang-tidy` by accepting parameters file in command line.
+
 - Removed :program:`clang-tidy`'s global options for most of checks. All options
   are changed to local options except `IncludeStyle`, `StrictMode` and
-  `IgnoreMacros`.
+  `IgnoreMacros`. Global scoped `StrictMode` and `IgnoreMacros` are deprecated
+  and will be removed in further releases.
 
 .. csv-table::
   :header: "Check", "Options removed from global option"
@@ -230,10 +233,18 @@ Changes in existing checks
   `bsl::optional` and `bdlb::NullableValue` from
   <https://github.com/bloomberg/bde>_.
 
+- Improved :doc:`bugprone-unhandled-self-assignment
+  <clang-tidy/checks/bugprone/unhandled-self-assignment>` check by fixing smart
+  pointer check against std::unique_ptr type.
+
 - Improved :doc:`bugprone-unsafe-functions
   <clang-tidy/checks/bugprone/unsafe-functions>` check to allow specifying
   additional functions to match.
 
+- Improved :doc:`bugprone-unused-local-non-trivial-variable
+  <clang-tidy/checks/bugprone/unused-local-non-trivial-variable>` check to avoid
+  false positives when using name-independent variables after C++26.
+
 - Improved :doc:`bugprone-use-after-move
   <clang-tidy/checks/bugprone/use-after-move>` to avoid triggering on
   ``reset()`` calls on moved-from ``std::optional`` and ``std::any`` objects,
@@ -359,6 +370,13 @@ Removed checks
 Miscellaneous
 ^^^^^^^^^^^^^
 
+- The :doc:`bugprone-narrowing-conversions <clang-tidy/checks/bugprone/narrowing-conversions>`
+  check is no longer an alias of :doc:`cppcoreguidelines-narrowing-conversions
+  <clang-tidy/checks/cppcoreguidelines/narrowing-conversions>`. Instead,
+  :doc:`cppcoreguidelines-narrowing-conversions
+  <clang-tidy/checks/cppcoreguidelines/narrowing-conversions>` is now an alias
+  of :doc:`bugprone-narrowing-conversions <clang-tidy/checks/bugprone/narrowing-conversions>`.
+
 Improvements to include-fixer
 -----------------------------
 
diff --git a/clang-tools-extra/docs/clang-tidy/checks/bugprone/narrowing-conversions.rst b/clang-tools-extra/docs/clang-tidy/checks/bugprone/narrowing-conversions.rst
index f4bb40b..1a1217e 100644
--- a/clang-tools-extra/docs/clang-tidy/checks/bugprone/narrowing-conversions.rst
+++ b/clang-tools-extra/docs/clang-tidy/checks/bugprone/narrowing-conversions.rst
@@ -1,10 +1,126 @@
 .. title:: clang-tidy - bugprone-narrowing-conversions
-.. meta::
-   :http-equiv=refresh: 5;URL=../cppcoreguidelines/narrowing-conversions.html
 
 bugprone-narrowing-conversions
 ==============================
 
-The bugprone-narrowing-conversions check is an alias, please see
-:doc:`cppcoreguidelines-narrowing-conversions <../cppcoreguidelines/narrowing-conversions>`
-for more information.
+`cppcoreguidelines-narrowing-conversions` redirects here as an alias for this check.
+
+Checks for silent narrowing conversions, e.g: ``int i = 0; i += 0.1;``. While
+the issue is obvious in this former example, it might not be so in the
+following: ``void MyClass::f(double d) { int_member_ += d; }``.
+
+We flag narrowing conversions from:
+ - an integer to a narrower integer (e.g. ``char`` to ``unsigned char``)
+   if WarnOnIntegerNarrowingConversion Option is set,
+ - an integer to a narrower floating-point (e.g. ``uint64_t`` to ``float``)
+   if WarnOnIntegerToFloatingPointNarrowingConversion Option is set,
+ - a floating-point to an integer (e.g. ``double`` to ``int``),
+ - a floating-point to a narrower floating-point (e.g. ``double`` to ``float``)
+   if WarnOnFloatingPointNarrowingConversion Option is set.
+
+This check will flag:
+ - All narrowing conversions that are not marked by an explicit cast (c-style or
+   ``static_cast``). For example: ``int i = 0; i += 0.1;``,
+   ``void f(int); f(0.1);``,
+ - All applications of binary operators with a narrowing conversions.
+   For example: ``int i; i+= 0.1;``.
+
+Arithmetic with smaller integer types than ``int`` trigger implicit conversions,
+as explained under `"Integral Promotion" on cppreference.com
+<https://en.cppreference.com/w/cpp/language/implicit_conversion>`_.
+This check diagnoses more instances of narrowing than the compiler warning
+`-Wconversion` does. The example below demonstrates this behavior.
+
+.. code-block:: c++
+
+  // The following function definition demonstrates usage of arithmetic with
+  // integer types smaller than `int` and how the narrowing conversion happens
+  // implicitly.
+  void computation(short argument1, short argument2) {
+    // Arithmetic written by humans:
+    short result = argument1 + argument2;
+    // Arithmetic actually performed by C++:
+    short result = static_cast<short>(static_cast<int>(argument1) + static_cast<int>(argument2));
+  }
+
+  void recommended_resolution(short argument1, short argument2) {
+    short result = argument1 + argument2;
+    //           ^ warning: narrowing conversion from 'int' to signed type 'short' is implementation-defined
+
+    // The cppcoreguidelines recommend to resolve this issue by using the GSL
+    // in one of two ways. Either by a cast that throws if a loss of precision
+    // would occur.
+    short result = gsl::narrow<short>(argument1 + argument2);
+    // Or it can be resolved without checking the result risking invalid results.
+    short result = gsl::narrow_cast<short>(argument1 + argument2);
+
+    // A classical `static_cast` will silence the warning as well if the GSL
+    // is not available.
+    short result = static_cast<short>(argument1 + argument2);
+  }
+
+Options
+-------
+
+.. option:: WarnOnIntegerNarrowingConversion
+
+    When `true`, the check will warn on narrowing integer conversion
+    (e.g. ``int`` to ``size_t``). `true` by default.
+
+.. option:: WarnOnIntegerToFloatingPointNarrowingConversion
+
+    When `true`, the check will warn on narrowing integer to floating-point
+    conversion (e.g. ``size_t`` to ``double``). `true` by default.
+
+.. option:: WarnOnFloatingPointNarrowingConversion
+
+    When `true`, the check will warn on narrowing floating point conversion
+    (e.g. ``double`` to ``float``). `true` by default.
+
+.. option:: WarnWithinTemplateInstantiation
+
+    When `true`, the check will warn on narrowing conversions within template
+    instantiations. `false` by default.
+
+.. option:: WarnOnEquivalentBitWidth
+
+    When `true`, the check will warn on narrowing conversions that arise from
+    casting between types of equivalent bit width. (e.g.
+    `int n = uint(0);` or `long long n = double(0);`) `true` by default.
+
+.. option:: IgnoreConversionFromTypes
+
+   Narrowing conversions from any type in this semicolon-separated list will be
+   ignored. This may be useful to weed out commonly occurring, but less commonly
+   problematic assignments such as `int n = std::vector<char>().size();` or
+   `int n = std::difference(it1, it2);`. The default list is empty, but one
+   suggested list for a legacy codebase would be
+   `size_t;ptrdiff_t;size_type;difference_type`.
+
+.. option:: PedanticMode
+
+    When `true`, the check will warn on assigning a floating point constant
+    to an integer value even if the floating point value is exactly
+    representable in the destination type (e.g. ``int i = 1.0;``).
+    `false` by default.
+
+FAQ
+---
+
+ - What does "narrowing conversion from 'int' to 'float'" mean?
+
+An IEEE754 Floating Point number can represent all integer values in the range
+[-2^PrecisionBits, 2^PrecisionBits] where PrecisionBits is the number of bits in
+the mantissa.
+
+For ``float`` this would be [-2^23, 2^23], where ``int`` can represent values in
+the range [-2^31, 2^31-1].
+
+ - What does "implementation-defined" mean?
+
+You may have encountered messages like "narrowing conversion from 'unsigned int'
+to signed type 'int' is implementation-defined".
+The C/C++ standard does not mandate two's complement for signed integers, and so
+the compiler is free to define what the semantics are for converting an unsigned
+integer to signed integer. Clang's implementation uses the two's complement
+format.
diff --git a/clang-tools-extra/docs/clang-tidy/checks/bugprone/unhandled-self-assignment.rst b/clang-tools-extra/docs/clang-tidy/checks/bugprone/unhandled-self-assignment.rst
index dee1398..d3cdd5a 100644
--- a/clang-tools-extra/docs/clang-tidy/checks/bugprone/unhandled-self-assignment.rst
+++ b/clang-tools-extra/docs/clang-tidy/checks/bugprone/unhandled-self-assignment.rst
@@ -120,5 +120,7 @@ temporary object into ``this`` (needs a move assignment operator):
 
 .. option:: WarnOnlyIfThisHasSuspiciousField
 
-  When `true`, the check will warn only if the container class of the copy assignment operator
-  has any suspicious fields (pointer or C array). This option is set to `true` by default.
+  When `true`, the check will warn only if the container class of the copy
+  assignment operator has any suspicious fields (pointer, C array and C++ smart
+  pointer).
+  This option is set to `true` by default.
diff --git a/clang-tools-extra/docs/clang-tidy/checks/bugprone/unused-local-non-trivial-variable.rst b/clang-tools-extra/docs/clang-tidy/checks/bugprone/unused-local-non-trivial-variable.rst
index 9f283de..672eab6 100644
--- a/clang-tools-extra/docs/clang-tidy/checks/bugprone/unused-local-non-trivial-variable.rst
+++ b/clang-tools-extra/docs/clang-tidy/checks/bugprone/unused-local-non-trivial-variable.rst
@@ -12,6 +12,7 @@ The following types of variables are excluded from this check:
 * static or thread local
 * structured bindings
 * variables with ``[[maybe_unused]]`` attribute
+* name-independent variables
 
 This check can be configured to warn on all non-trivial variables by setting
 `IncludeTypes` to `.*`, and excluding specific types using `ExcludeTypes`.
diff --git a/clang-tools-extra/docs/clang-tidy/checks/cppcoreguidelines/narrowing-conversions.rst b/clang-tools-extra/docs/clang-tidy/checks/cppcoreguidelines/narrowing-conversions.rst
index 7cc0b28..ea24e87 100644
--- a/clang-tools-extra/docs/clang-tidy/checks/cppcoreguidelines/narrowing-conversions.rst
+++ b/clang-tools-extra/docs/clang-tidy/checks/cppcoreguidelines/narrowing-conversions.rst
@@ -1,129 +1,14 @@
 .. title:: clang-tidy - cppcoreguidelines-narrowing-conversions
+.. meta::
+   :http-equiv=refresh: 5;URL=../bugprone/narrowing-conversions.html
 
 cppcoreguidelines-narrowing-conversions
 =======================================
 
-Checks for silent narrowing conversions, e.g: ``int i = 0; i += 0.1;``. While
-the issue is obvious in this former example, it might not be so in the
-following: ``void MyClass::f(double d) { int_member_ += d; }``.
-
-This check implements `ES.46
+This check implements part of  `ES.46
 <https://isocpp.github.io/CppCoreGuidelines/CppCoreGuidelines#es46-avoid-lossy-narrowing-truncating-arithmetic-conversions>`_
 from the C++ Core Guidelines.
 
-We enforce only part of the guideline, more specifically, we flag narrowing conversions from:
- - an integer to a narrower integer (e.g. ``char`` to ``unsigned char``)
-   if WarnOnIntegerNarrowingConversion Option is set,
- - an integer to a narrower floating-point (e.g. ``uint64_t`` to ``float``)
-   if WarnOnIntegerToFloatingPointNarrowingConversion Option is set,
- - a floating-point to an integer (e.g. ``double`` to ``int``),
- - a floating-point to a narrower floating-point (e.g. ``double`` to ``float``)
-   if WarnOnFloatingPointNarrowingConversion Option is set.
-
-This check will flag:
- - All narrowing conversions that are not marked by an explicit cast (c-style or
-   ``static_cast``). For example: ``int i = 0; i += 0.1;``,
-   ``void f(int); f(0.1);``,
- - All applications of binary operators with a narrowing conversions.
-   For example: ``int i; i+= 0.1;``.
-
-Arithmetic with smaller integer types than ``int`` trigger implicit conversions,
-as explained under `"Integral Promotion" on cppreference.com
-<https://en.cppreference.com/w/cpp/language/implicit_conversion>`_.
-This check diagnoses more instances of narrowing than the compiler warning
-`-Wconversion` does. The example below demonstrates this behavior.
-
-.. code-block:: c++
-
-   // The following function definition demonstrates usage of arithmetic with
-   // integer types smaller than `int` and how the narrowing conversion happens
-   // implicitly.
-   void computation(short argument1, short argument2) {
-     // Arithmetic written by humans:
-     short result = argument1 + argument2;
-     // Arithmetic actually performed by C++:
-     short result = static_cast<short>(static_cast<int>(argument1) + static_cast<int>(argument2));
-   }
-
-   void recommended_resolution(short argument1, short argument2) {
-     short result = argument1 + argument2;
-     //           ^ warning: narrowing conversion from 'int' to signed type 'short' is implementation-defined
-
-     // The cppcoreguidelines recommend to resolve this issue by using the GSL
-     // in one of two ways. Either by a cast that throws if a loss of precision
-     // would occur.
-     short result = gsl::narrow<short>(argument1 + argument2);
-     // Or it can be resolved without checking the result risking invalid results.
-     short result = gsl::narrow_cast<short>(argument1 + argument2);
-
-     // A classical `static_cast` will silence the warning as well if the GSL
-     // is not available.
-     short result = static_cast<short>(argument1 + argument2);
-   }
-
-
-Options
--------
-
-.. option:: WarnOnIntegerNarrowingConversion
-
-    When `true`, the check will warn on narrowing integer conversion
-    (e.g. ``int`` to ``size_t``). `true` by default.
-
-.. option:: WarnOnIntegerToFloatingPointNarrowingConversion
-
-    When `true`, the check will warn on narrowing integer to floating-point
-    conversion (e.g. ``size_t`` to ``double``). `true` by default.
-
-.. option:: WarnOnFloatingPointNarrowingConversion
-
-    When `true`, the check will warn on narrowing floating point conversion
-    (e.g. ``double`` to ``float``). `true` by default.
-
-.. option:: WarnWithinTemplateInstantiation
-
-    When `true`, the check will warn on narrowing conversions within template
-    instantiations. `false` by default.
-
-.. option:: WarnOnEquivalentBitWidth
-
-    When `true`, the check will warn on narrowing conversions that arise from
-    casting between types of equivalent bit width. (e.g.
-    `int n = uint(0);` or `long long n = double(0);`) `true` by default.
-
-.. option:: IgnoreConversionFromTypes
-
-   Narrowing conversions from any type in this semicolon-separated list will be
-   ignored. This may be useful to weed out commonly occurring, but less commonly
-   problematic assignments such as `int n = std::vector<char>().size();` or
-   `int n = std::difference(it1, it2);`. The default list is empty, but one
-   suggested list for a legacy codebase would be
-   `size_t;ptrdiff_t;size_type;difference_type`.
-
-.. option:: PedanticMode
-
-    When `true`, the check will warn on assigning a floating point constant
-    to an integer value even if the floating point value is exactly
-    representable in the destination type (e.g. ``int i = 1.0;``).
-    `false` by default.
-
-FAQ
----
-
- - What does "narrowing conversion from 'int' to 'float'" mean?
-
-An IEEE754 Floating Point number can represent all integer values in the range
-[-2^PrecisionBits, 2^PrecisionBits] where PrecisionBits is the number of bits in
-the mantissa.
-
-For ``float`` this would be [-2^23, 2^23], where ``int`` can represent values in
-the range [-2^31, 2^31-1].
-
- - What does "implementation-defined" mean?
-
-You may have encountered messages like "narrowing conversion from 'unsigned int'
-to signed type 'int' is implementation-defined".
-The C/C++ standard does not mandate two's complement for signed integers, and so
-the compiler is free to define what the semantics are for converting an unsigned
-integer to signed integer. Clang's implementation uses the two's complement
-format.
+The cppcoreguidelines-narrowing-conversions check is an alias, please see
+:doc:`bugprone-narrowing-conversions <../bugprone/narrowing-conversions>`
+for more information.
diff --git a/clang-tools-extra/docs/clang-tidy/checks/list.rst b/clang-tools-extra/docs/clang-tidy/checks/list.rst
index 4d8853a..e8f9b4e 100644
--- a/clang-tools-extra/docs/clang-tidy/checks/list.rst
+++ b/clang-tools-extra/docs/clang-tidy/checks/list.rst
@@ -114,6 +114,7 @@ Clang-Tidy Checks
    :doc:`bugprone-multi-level-implicit-pointer-conversion <bugprone/multi-level-implicit-pointer-conversion>`,
    :doc:`bugprone-multiple-new-in-one-expression <bugprone/multiple-new-in-one-expression>`,
    :doc:`bugprone-multiple-statement-macro <bugprone/multiple-statement-macro>`,
+   :doc:`bugprone-narrowing-conversions <bugprone/narrowing-conversions>`,
    :doc:`bugprone-no-escape <bugprone/no-escape>`,
    :doc:`bugprone-non-zero-enum-to-bool-conversion <bugprone/non-zero-enum-to-bool-conversion>`,
    :doc:`bugprone-nondeterministic-pointer-iteration-order <bugprone/nondeterministic-pointer-iteration-order>`,
@@ -190,7 +191,6 @@ Clang-Tidy Checks
    :doc:`cppcoreguidelines-macro-usage <cppcoreguidelines/macro-usage>`,
    :doc:`cppcoreguidelines-misleading-capture-default-by-value <cppcoreguidelines/misleading-capture-default-by-value>`, "Yes"
    :doc:`cppcoreguidelines-missing-std-forward <cppcoreguidelines/missing-std-forward>`,
-   :doc:`cppcoreguidelines-narrowing-conversions <cppcoreguidelines/narrowing-conversions>`,
    :doc:`cppcoreguidelines-no-malloc <cppcoreguidelines/no-malloc>`,
    :doc:`cppcoreguidelines-no-suspend-with-lock <cppcoreguidelines/no-suspend-with-lock>`,
    :doc:`cppcoreguidelines-owning-memory <cppcoreguidelines/owning-memory>`,
@@ -411,7 +411,6 @@ Check aliases
 .. csv-table::
    :header: "Name", "Redirect", "Offers fixes"
 
-   :doc:`bugprone-narrowing-conversions <bugprone/narrowing-conversions>`, :doc:`cppcoreguidelines-narrowing-conversions <cppcoreguidelines/narrowing-conversions>`,
    :doc:`cert-arr39-c <cert/arr39-c>`, :doc:`bugprone-sizeof-expression <bugprone/sizeof-expression>`,
    :doc:`cert-con36-c <cert/con36-c>`, :doc:`bugprone-spuriously-wake-up-functions <bugprone/spuriously-wake-up-functions>`,
    :doc:`cert-con54-cpp <cert/con54-cpp>`, :doc:`bugprone-spuriously-wake-up-functions <bugprone/spuriously-wake-up-functions>`,
@@ -541,6 +540,7 @@ Check aliases
    :doc:`cppcoreguidelines-c-copy-assignment-signature <cppcoreguidelines/c-copy-assignment-signature>`, :doc:`misc-unconventional-assign-operator <misc/unconventional-assign-operator>`,
    :doc:`cppcoreguidelines-explicit-virtual-functions <cppcoreguidelines/explicit-virtual-functions>`, :doc:`modernize-use-override <modernize/use-override>`, "Yes"
    :doc:`cppcoreguidelines-macro-to-enum <cppcoreguidelines/macro-to-enum>`, :doc:`modernize-macro-to-enum <modernize/macro-to-enum>`, "Yes"
+   :doc:`cppcoreguidelines-narrowing-conversions <cppcoreguidelines/narrowing-conversions>`, :doc:`bugprone-narrowing-conversions <bugprone/narrowing-conversions>`,
    :doc:`cppcoreguidelines-noexcept-destructor <cppcoreguidelines/noexcept-destructor>`, :doc:`performance-noexcept-destructor <performance/noexcept-destructor>`, "Yes"
    :doc:`cppcoreguidelines-noexcept-move-operations <cppcoreguidelines/noexcept-move-operations>`, :doc:`performance-noexcept-move-constructor <performance/noexcept-move-constructor>`, "Yes"
    :doc:`cppcoreguidelines-noexcept-swap <cppcoreguidelines/noexcept-swap>`, :doc:`performance-noexcept-swap <performance/noexcept-swap>`, "Yes"
diff --git a/clang-tools-extra/docs/clang-tidy/index.rst b/clang-tools-extra/docs/clang-tidy/index.rst
index f053e57..b7a366e 100644
--- a/clang-tools-extra/docs/clang-tidy/index.rst
+++ b/clang-tools-extra/docs/clang-tidy/index.rst
@@ -33,6 +33,14 @@ compilation options on the command line after ``--``:
 
   $ clang-tidy test.cpp -- -Imy_project/include -DMY_DEFINES ...
 
+If there are too many options or source files to specify on the command line,
+you can store them in a parameter file, and use :program:`clang-tidy` with that
+parameters file:
+
+.. code-block:: console
+
+  $ clang-tidy @parameters_file
+
 :program:`clang-tidy` has its own checks and can also run Clang Static Analyzer
 checks. Each check has a name and the checks to run can be chosen using the
 ``-checks=`` option, which specifies a comma-separated list of positive and
@@ -264,6 +272,9 @@ An overview of all the command-line options:
     automatically removed, but the rest of a relative path must be a
     suffix of a path in the compile command database.
 
+  Parameters files:
+    A large number of options or source files can be passed as parameter files
+    by use '@parameter-file' in the command line.
 
   Configuration files:
     clang-tidy attempts to read configuration for each source file from a
@@ -282,8 +293,8 @@ An overview of all the command-line options:
                                    globs can be specified as a list instead of a
                                    string.
     ExcludeHeaderFilterRegex     - Same as '--exclude-header-filter'.
-    ExtraArgs                    - Same as '--extra-args'.
-    ExtraArgsBefore              - Same as '--extra-args-before'.
+    ExtraArgs                    - Same as '--extra-arg'.
+    ExtraArgsBefore              - Same as '--extra-arg-before'.
     FormatStyle                  - Same as '--format-style'.
     HeaderFileExtensions         - File extensions to consider to determine if a
                                    given diagnostic is located in a header file.
diff --git a/clang-tools-extra/include-cleaner/lib/Analysis.cpp b/clang-tools-extra/include-cleaner/lib/Analysis.cpp
index 16013f5..e3a4834 100644
--- a/clang-tools-extra/include-cleaner/lib/Analysis.cpp
+++ b/clang-tools-extra/include-cleaner/lib/Analysis.cpp
@@ -85,8 +85,9 @@ analyze(llvm::ArrayRef<Decl *> ASTRoots,
   const auto MainFile = *SM.getFileEntryRefForID(SM.getMainFileID());
   llvm::DenseSet<const Include *> Used;
   llvm::StringMap<Header> Missing;
+  constexpr auto DefaultHeaderFilter = [](llvm::StringRef) { return false; };
   if (!HeaderFilter)
-    HeaderFilter = [](llvm::StringRef) { return false; };
+    HeaderFilter = DefaultHeaderFilter;
   OptionalDirectoryEntryRef ResourceDir =
       PP.getHeaderSearchInfo().getModuleMap().getBuiltinDir();
   walkUsed(ASTRoots, MacroRefs, PI, PP,
diff --git a/clang-tools-extra/test/clang-tidy/checkers/bugprone/branch-clone-2.cpp b/clang-tools-extra/test/clang-tidy/checkers/bugprone/branch-clone-2.cpp
new file mode 100644
index 0000000..b91ac6a
--- /dev/null
+++ b/clang-tools-extra/test/clang-tidy/checkers/bugprone/branch-clone-2.cpp
@@ -0,0 +1,768 @@
+// RUN: %check_clang_tidy %s bugprone-branch-clone %t --
+
+/* Only one expected warning per function allowed at the very end. */
+
+int func(void)
+{
+  return 0;
+}
+
+int func2(void)
+{
+  return 0;
+}
+
+int funcParam(int a)
+{
+  return 0;
+}
+
+/* '!=' operator*/
+
+
+/* '!=' with int pointer */
+
+int checkNotEqualIntPointerLiteralCompare1(void) {
+  int* p = 0;
+  return (p != 0); // no warning
+}
+
+int checkNotEqualIntPointerLiteralCompare2(void) {
+  return (6 != 7); // no warning
+}
+
+int checkNotEqualIntPointerDeclCompare1(void) {
+  int k = 3;
+  int* f = &k;
+  int* g = &k;
+  return (f != g); // no warning
+}
+
+int checkNotEqualCastIntPointerDeclCompare11(void) {
+  int k = 7;
+  int* f = &k;
+  return ((int*)f != (int*)f);
+}
+int checkNotEqualCastIntPointerDeclCompare12(void) {
+  int k = 7;
+  int* f = &k;
+  return ((int*)((char*)f) != (int*)f); // no warning
+}
+int checkNotEqualBinaryOpIntPointerCompare1(void) {
+  int k = 7;
+  int res;
+  int* f= &k;
+  res = (f + 4 != f + 4);
+  return (0);
+}
+int checkNotEqualBinaryOpIntPointerCompare2(void) {
+  int k = 7;
+  int* f = &k;
+  int* g = &k;
+  return (f + 4 != g + 4); // no warning
+}
+
+
+int checkNotEqualBinaryOpIntPointerCompare3(void) {
+  int k = 7;
+  int res;
+  int* f= &k;
+  res = ((int*)f + 4 != (int*)f + 4);
+  return (0);
+}
+int checkNotEqualBinaryOpIntPointerCompare4(void) {
+  int k = 7;
+  int res;
+  int* f= &k;
+  res = ((int*)f + 4 != (int*)((char*)f) + 4);  // no warning
+  return (0);
+}
+
+int checkNotEqualNestedBinaryOpIntPointerCompare1(void) {
+  int res;
+  int k = 7;
+  int t= 1;
+  int* u= &k+2;
+  int* f= &k+3;
+  res = ((f + (3)*t) != (f + (3)*t));
+  return (0);
+}
+
+int checkNotEqualNestedBinaryOpIntPointerCompare2(void) {
+  int res;
+  int k = 7;
+  int t= 1;
+  int* u= &k+2;
+  int* f= &k+3;
+  res = (((3)*t + f) != (f + (3)*t));  // no warning
+  return (0);
+}
+/*   end '!=' int*          */
+
+/* '!=' with function*/
+
+int checkNotEqualSameFunction() {
+  unsigned a = 0;
+  unsigned b = 1;
+  int res = (a+func() != a+func());  // no warning
+  return (0);
+}
+
+int checkNotEqualDifferentFunction() {
+  unsigned a = 0;
+  unsigned b = 1;
+  int res = (a+func() != a+func2());  // no warning
+  return (0);
+}
+
+int checkNotEqualSameFunctionSameParam() {
+  unsigned a = 0;
+  unsigned b = 1;
+  int res = (a+funcParam(a) != a+funcParam(a));  // no warning
+  return (0);
+}
+
+int checkNotEqualSameFunctionDifferentParam() {
+  unsigned a = 0;
+  unsigned b = 1;
+  int res = (a+funcParam(a) != a+funcParam(b));  // no warning
+  return (0);
+}
+
+/*   end '!=' with function*/
+
+/*   end '!=' */
+
+
+/* Checking use of identical expressions in conditional operator*/
+
+unsigned test_unsigned(unsigned a) {
+  unsigned b = 1;
+  a = a > 5 ? b : b;
+// CHECK-MESSAGES: :[[@LINE-1]]:13: warning: conditional operator with identical true and false expressions [bugprone-branch-clone]
+  return a;
+}
+
+void test_signed() {
+  int a = 0;
+  a = a > 5 ? a : a;
+// CHECK-MESSAGES: :[[@LINE-1]]:13: warning: conditional operator with identical true and false expressions [bugprone-branch-clone]
+}
+
+void test_bool(bool a) {
+  a = a > 0 ? a : a;
+// CHECK-MESSAGES: :[[@LINE-1]]:13: warning: conditional operator with identical true and false expressions [bugprone-branch-clone]
+}
+
+void test_float() {
+  float a = 0;
+  float b = 0;
+  a = a > 5 ? a : a;
+// CHECK-MESSAGES: :[[@LINE-1]]:13: warning: conditional operator with identical true and false expressions [bugprone-branch-clone]
+}
+
+const char *test_string() {
+  float a = 0;
+  return a > 5 ? "abc" : "abc";
+// CHECK-MESSAGES: :[[@LINE-1]]:16: warning: conditional operator with identical true and false expressions [bugprone-branch-clone]
+}
+
+void test_unsigned_expr() {
+  unsigned a = 0;
+  unsigned b = 0;
+  a = a > 5 ? a+b : a+b;
+// CHECK-MESSAGES: :[[@LINE-1]]:13: warning: conditional operator with identical true and false expressions [bugprone-branch-clone]
+}
+
+void test_signed_expr() {
+  int a = 0;
+  int b = 1;
+  a = a > 5 ? a+b : a+b;
+// CHECK-MESSAGES: :[[@LINE-1]]:13: warning: conditional operator with identical true and false expressions [bugprone-branch-clone]
+}
+
+void test_bool_expr(bool a) {
+  bool b = 0;
+  a = a > 0 ? a&&b : a&&b;
+// CHECK-MESSAGES: :[[@LINE-1]]:13: warning: conditional operator with identical true and false expressions [bugprone-branch-clone]
+}
+
+void test_unsigned_expr_negative() {
+  unsigned a = 0;
+  unsigned b = 0;
+  a = a > 5 ? a+b : b+a; // no warning
+}
+
+void test_signed_expr_negative() {
+  int a = 0;
+  int b = 1;
+  a = a > 5 ? b+a : a+b; // no warning
+}
+
+void test_bool_expr_negative(bool a) {
+  bool b = 0;
+  a = a > 0 ? a&&b : b&&a; // no warning
+}
+
+void test_float_expr_positive() {
+  float a = 0;
+  float b = 0;
+  a = a > 5 ? a+b : a+b;
+// CHECK-MESSAGES: :[[@LINE-1]]:13: warning: conditional operator with identical true and false expressions [bugprone-branch-clone]
+}
+
+void test_expr_positive_func() {
+  unsigned a = 0;
+  unsigned b = 1;
+  a = a > 5 ? a+func() : a+func();
+// CHECK-MESSAGES: :[[@LINE-1]]:13: warning: conditional operator with identical true and false expressions [bugprone-branch-clone]
+}
+
+void test_expr_negative_func() {
+  unsigned a = 0;
+  unsigned b = 1;
+  a = a > 5 ? a+func() : a+func2(); // no warning
+}
+
+void test_expr_positive_funcParam() {
+  unsigned a = 0;
+  unsigned b = 1;
+  a = a > 5 ? a+funcParam(b) : a+funcParam(b);
+// CHECK-MESSAGES: :[[@LINE-1]]:13: warning: conditional operator with identical true and false expressions [bugprone-branch-clone]
+}
+
+void test_expr_negative_funcParam() {
+  unsigned a = 0;
+  unsigned b = 1;
+  a = a > 5 ? a+funcParam(a) : a+funcParam(b); // no warning
+}
+
+void test_expr_positive_inc() {
+  unsigned a = 0;
+  unsigned b = 1;
+  a = a > 5 ? a++ : a++;
+// CHECK-MESSAGES: :[[@LINE-1]]:13: warning: conditional operator with identical true and false expressions [bugprone-branch-clone]
+}
+
+void test_expr_negative_inc() {
+  unsigned a = 0;
+  unsigned b = 1;
+  a = a > 5 ? a++ : b++; // no warning
+}
+
+void test_expr_positive_assign() {
+  unsigned a = 0;
+  unsigned b = 1;
+  a = a > 5 ? a=1 : a=1;
+// CHECK-MESSAGES: :[[@LINE-1]]:13: warning: conditional operator with identical true and false expressions [bugprone-branch-clone]
+}
+
+void test_expr_negative_assign() {
+  unsigned a = 0;
+  unsigned b = 1;
+  a = a > 5 ? a=1 : a=2; // no warning
+}
+
+void test_signed_nested_expr() {
+  int a = 0;
+  int b = 1;
+  int c = 3;
+  a = a > 5 ? a+b+(c+a)*(a + b*(c+a)) : a+b+(c+a)*(a + b*(c+a));
+// CHECK-MESSAGES: :[[@LINE-1]]:13: warning: conditional operator with identical true and false expressions [bugprone-branch-clone]
+}
+
+void test_signed_nested_expr_negative() {
+  int a = 0;
+  int b = 1;
+  int c = 3;
+  a = a > 5 ? a+b+(c+a)*(a + b*(c+a)) : a+b+(c+a)*(a + b*(a+c)); // no warning
+}
+
+void test_signed_nested_cond_expr_negative() {
+  int a = 0;
+  int b = 1;
+  int c = 3;
+  a = a > 5 ? (b > 5 ? 1 : 4) : (b > 5 ? 2 : 4); // no warning
+}
+
+void test_signed_nested_cond_expr() {
+  int a = 0;
+  int b = 1;
+  int c = 3;
+  a = a > 5 ? (b > 5 ? 1 : 4) : (b > 5 ? 4 : 4);
+// CHECK-MESSAGES: :[[@LINE-1]]:40: warning: conditional operator with identical true and false expressions [bugprone-branch-clone]
+}
+
+void test_identical_branches1(bool b) {
+  int i = 0;
+  if (b) {
+// CHECK-MESSAGES: :[[@LINE-1]]:3: warning: if with identical then and else branches [bugprone-branch-clone]
+    ++i;
+  } else {
+// CHECK-MESSAGES: :[[@LINE-1]]:5: note: else branch starts here
+    ++i;
+  }
+}
+
+void test_identical_branches2(bool b) {
+  int i = 0;
+  if (b) {
+// CHECK-MESSAGES: :[[@LINE-1]]:3: warning: if with identical then and else branches [bugprone-branch-clone]
+    ++i;
+  } else
+// CHECK-MESSAGES: :[[@LINE-1]]:5: note: else branch starts here
+    ++i;
+}
+
+void test_identical_branches3(bool b) {
+  int i = 0;
+  if (b) { // no warning
+    ++i;
+  } else {
+    i++;
+  }
+}
+
+void test_identical_branches4(bool b) {
+  int i = 0;
+  if (b) {
+// CHECK-MESSAGES: :[[@LINE-1]]:3: warning: if with identical then and else branches [bugprone-branch-clone]
+  } else {
+// CHECK-MESSAGES: :[[@LINE-1]]:5: note: else branch starts here
+  }
+}
+
+void test_identical_branches_break(bool b) {
+  while (true) {
+    if (b)
+// CHECK-MESSAGES: :[[@LINE-1]]:5: warning: if with identical then and else branches [bugprone-branch-clone]
+      break;
+    else
+// CHECK-MESSAGES: :[[@LINE-1]]:5: note: else branch starts here
+      break;
+  }
+}
+
+void test_identical_branches_continue(bool b) {
+  while (true) {
+    if (b)
+// CHECK-MESSAGES: :[[@LINE-1]]:5: warning: if with identical then and else branches [bugprone-branch-clone]
+      continue;
+    else
+// CHECK-MESSAGES: :[[@LINE-1]]:5: note: else branch starts here
+      continue;
+  }
+}
+
+void test_identical_branches_func(bool b) {
+  if (b)
+// CHECK-MESSAGES: :[[@LINE-1]]:3: warning: if with identical then and else branches [bugprone-branch-clone]
+    func();
+  else
+// CHECK-MESSAGES: :[[@LINE-1]]:3: note: else branch starts here
+    func();
+}
+
+void test_identical_branches_func_arguments(bool b) {
+  if (b) // no-warning
+    funcParam(1);
+  else
+    funcParam(2);
+}
+
+void test_identical_branches_cast1(bool b) {
+  long v = -7;
+  if (b) // no-warning
+    v = (signed int) v;
+  else
+    v = (unsigned int) v;
+}
+
+void test_identical_branches_cast2(bool b) {
+  long v = -7;
+  if (b)
+// CHECK-MESSAGES: :[[@LINE-1]]:3: warning: if with identical then and else branches [bugprone-branch-clone]
+    v = (signed int) v;
+  else
+// CHECK-MESSAGES: :[[@LINE-1]]:3: note: else branch starts here
+    v = (signed int) v;
+}
+
+int test_identical_branches_return_int(bool b) {
+  int i = 0;
+  if (b) {
+// CHECK-MESSAGES: :[[@LINE-1]]:3: warning: if with identical then and else branches [bugprone-branch-clone]
+    i++;
+    return i;
+  } else {
+// CHECK-MESSAGES: :[[@LINE-1]]:5: note: else branch starts here
+    i++;
+    return i;
+  }
+}
+
+int test_identical_branches_return_func(bool b) {
+  if (b) {
+// CHECK-MESSAGES: :[[@LINE-1]]:3: warning: if with identical then and else branches [bugprone-branch-clone]
+    return func();
+  } else {
+// CHECK-MESSAGES: :[[@LINE-1]]:5: note: else branch starts here
+    return func();
+  }
+}
+
+void test_identical_branches_for(bool b) {
+  int i;
+  int j;
+  if (b) {
+// CHECK-MESSAGES: :[[@LINE-1]]:3: warning: if with identical then and else branches [bugprone-branch-clone]
+    for (i = 0, j = 0; i < 10; i++)
+      j += 4;
+  } else {
+// CHECK-MESSAGES: :[[@LINE-1]]:5: note: else branch starts here
+    for (i = 0, j = 0; i < 10; i++)
+      j += 4;
+  }
+}
+
+void test_identical_branches_while(bool b) {
+  int i = 10;
+  if (b) {
+// CHECK-MESSAGES: :[[@LINE-1]]:3: warning: if with identical then and else branches [bugprone-branch-clone]
+    while (func())
+      i--;
+  } else {
+// CHECK-MESSAGES: :[[@LINE-1]]:5: note: else branch starts here
+    while (func())
+      i--;
+  }
+}
+
+void test_identical_branches_while_2(bool b) {
+  int i = 10;
+  if (b) { // no-warning
+    while (func())
+      i--;
+  } else {
+    while (func())
+      i++;
+  }
+}
+
+void test_identical_branches_do_while(bool b) {
+  int i = 10;
+  if (b) {
+// CHECK-MESSAGES: :[[@LINE-1]]:3: warning: if with identical then and else branches [bugprone-branch-clone]
+    do {
+      i--;
+    } while (func());
+  } else {
+// CHECK-MESSAGES: :[[@LINE-1]]:5: note: else branch starts here
+    do {
+      i--;
+    } while (func());
+  }
+}
+
+void test_identical_branches_if(bool b, int i) {
+  if (b) {
+// CHECK-MESSAGES: :[[@LINE-1]]:3: warning: if with identical then and else branches [bugprone-branch-clone]
+    if (i < 5)
+      i += 10;
+  } else {
+// CHECK-MESSAGES: :[[@LINE-1]]:5: note: else branch starts here
+    if (i < 5)
+      i += 10;
+  }
+}
+
+void test_identical_bitwise1() {
+  int a = 5 | 5; // no-warning
+}
+
+void test_identical_bitwise2() {
+  int a = 5;
+  int b = a | a; // no-warning
+}
+
+void test_identical_bitwise3() {
+  int a = 5;
+  int b = (a | a); // no-warning
+}
+
+void test_identical_bitwise4() {
+  int a = 4;
+  int b = a | 4; // no-warning
+}
+
+void test_identical_bitwise5() {
+  int a = 4;
+  int b = 4;
+  int c = a | b; // no-warning
+}
+
+void test_identical_bitwise6() {
+  int a = 5;
+  int b = a | 4 | a;
+}
+
+void test_identical_bitwise7() {
+  int a = 5;
+  int b = func() | func();
+}
+
+void test_identical_logical1(int a) {
+  if (a == 4 && a == 4)
+    ;
+}
+
+void test_identical_logical2(int a) {
+  if (a == 4 || a == 5 || a == 4)
+    ;
+}
+
+void test_identical_logical3(int a) {
+  if (a == 4 || a == 5 || a == 6) // no-warning
+    ;
+}
+
+void test_identical_logical4(int a) {
+  if (a == func() || a == func()) // no-warning
+    ;
+}
+
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wlogical-op-parentheses"
+void test_identical_logical5(int x, int y) {
+  if (x == 4 && y == 5 || x == 4 && y == 6) // no-warning
+    ;
+}
+
+void test_identical_logical6(int x, int y) {
+  if (x == 4 && y == 5 || x == 4 && y == 5)
+    ;
+}
+
+void test_identical_logical7(int x, int y) {
+  // FIXME: We should warn here
+  if (x == 4 && y == 5 || x == 4)
+    ;
+}
+
+void test_identical_logical8(int x, int y) {
+  // FIXME: We should warn here
+  if (x == 4 || y == 5 && x == 4)
+    ;
+}
+
+void test_identical_logical9(int x, int y) {
+  // FIXME: We should warn here
+  if (x == 4 || x == 4 && y == 5)
+    ;
+}
+#pragma clang diagnostic pop
+
+void test_warn_chained_if_stmts_1(int x) {
+  if (x == 1)
+    ;
+// CHECK-MESSAGES: :[[@LINE-1]]:5: warning: repeated branch body in conditional chain [bugprone-branch-clone]
+// CHECK-MESSAGES: :[[@LINE-2]]:6: note: end of the original
+  else if (x == 1)
+    ;
+// CHECK-MESSAGES: :[[@LINE-1]]:5: note: clone 1 starts here
+}
+
+void test_warn_chained_if_stmts_2(int x) {
+  if (x == 1)
+    ;
+// CHECK-MESSAGES: :[[@LINE-1]]:5: warning: repeated branch body in conditional chain [bugprone-branch-clone]
+// CHECK-MESSAGES: :[[@LINE-2]]:6: note: end of the original
+  else if (x == 1)
+    ;
+// CHECK-MESSAGES: :[[@LINE-1]]:5: note: clone 1 starts here
+  else if (x == 1)
+    ;
+// CHECK-MESSAGES: :[[@LINE-1]]:5: note: clone 2 starts here
+}
+
+void test_warn_chained_if_stmts_3(int x) {
+  if (x == 1)
+    ;
+// CHECK-MESSAGES: :[[@LINE-1]]:5: warning: repeated branch body in conditional chain [bugprone-branch-clone]
+// CHECK-MESSAGES: :[[@LINE-2]]:6: note: end of the original
+  else if (x == 2)
+    ;
+// CHECK-MESSAGES: :[[@LINE-1]]:5: note: clone 1 starts here
+  else if (x == 1)
+    ;
+// CHECK-MESSAGES: :[[@LINE-1]]:5: note: clone 2 starts here
+}
+
+void test_warn_chained_if_stmts_4(int x) {
+  if (x == 1)
+    ;
+// CHECK-MESSAGES: :[[@LINE-1]]:5: warning: repeated branch body in conditional chain [bugprone-branch-clone]
+// CHECK-MESSAGES: :[[@LINE-2]]:6: note: end of the original
+  else if (func())
+    ;
+// CHECK-MESSAGES: :[[@LINE-1]]:5: note: clone 1 starts here
+  else if (x == 1)
+    ;
+// CHECK-MESSAGES: :[[@LINE-1]]:5: note: clone 2 starts here
+}
+
+void test_warn_chained_if_stmts_5(int x) {
+  if (x & 1)
+    ;
+// CHECK-MESSAGES: :[[@LINE-1]]:5: warning: repeated branch body in conditional chain [bugprone-branch-clone]
+// CHECK-MESSAGES: :[[@LINE-2]]:6: note: end of the original
+  else if (x & 1)
+    ;
+// CHECK-MESSAGES: :[[@LINE-1]]:5: note: clone 1 starts here
+}
+
+void test_warn_chained_if_stmts_6(int x) {
+  if (x == 1)
+    ;
+// CHECK-MESSAGES: :[[@LINE-1]]:5: warning: repeated branch body in conditional chain [bugprone-branch-clone]
+// CHECK-MESSAGES: :[[@LINE-2]]:6: note: end of the original
+  else if (x == 2)
+    ;
+// CHECK-MESSAGES: :[[@LINE-1]]:5: note: clone 1 starts here
+  else if (x == 2)
+    ;
+// CHECK-MESSAGES: :[[@LINE-1]]:5: note: clone 2 starts here
+  else if (x == 3)
+    ;
+}
+
+void test_warn_chained_if_stmts_7(int x) {
+  if (x == 1)
+    ;
+// CHECK-MESSAGES: :[[@LINE-1]]:5: warning: repeated branch body in conditional chain [bugprone-branch-clone]
+// CHECK-MESSAGES: :[[@LINE-2]]:6: note: end of the original
+  else if (x == 2)
+    ;
+// CHECK-MESSAGES: :[[@LINE-1]]:5: note: clone 1 starts here
+  else if (x == 3)
+    ;
+// CHECK-MESSAGES: :[[@LINE-1]]:5: note: clone 2 starts here
+  else if (x == 2)
+    ;
+// CHECK-MESSAGES: :[[@LINE-1]]:5: note: clone 3 starts here
+  else if (x == 5)
+    ;
+// CHECK-MESSAGES: :[[@LINE-1]]:5: note: clone 4 starts here
+}
+
+void test_warn_chained_if_stmts_8(int x) {
+  if (x == 1)
+    ;
+// CHECK-MESSAGES: :[[@LINE-1]]:5: warning: repeated branch body in conditional chain [bugprone-branch-clone]
+// CHECK-MESSAGES: :[[@LINE-2]]:6: note: end of the original
+  else if (x == 2)
+    ;
+// CHECK-MESSAGES: :[[@LINE-1]]:5: note: clone 1 starts here
+  else if (x == 3)
+    ;
+// CHECK-MESSAGES: :[[@LINE-1]]:5: note: clone 2 starts here
+  else if (x == 2)
+    ;
+// CHECK-MESSAGES: :[[@LINE-1]]:5: note: clone 3 starts here
+  else if (x == 5)
+    ;
+// CHECK-MESSAGES: :[[@LINE-1]]:5: note: clone 4 starts here
+  else if (x == 3)
+    ;
+// CHECK-MESSAGES: :[[@LINE-1]]:5: note: clone 5 starts here
+  else if (x == 7)
+    ;
+// CHECK-MESSAGES: :[[@LINE-1]]:5: note: clone 6 starts here
+}
+
+void test_nowarn_chained_if_stmts_1(int x) {
+  if (func())
+    ;
+// CHECK-MESSAGES: :[[@LINE-1]]:5: warning: repeated branch body in conditional chain [bugprone-branch-clone]
+// CHECK-MESSAGES: :[[@LINE-2]]:6: note: end of the original
+  else if (func())
+    ;
+// CHECK-MESSAGES: :[[@LINE-1]]:5: note: clone 1 starts here
+}
+
+void test_nowarn_chained_if_stmts_2(int x) {
+  if (func())
+    ;
+// CHECK-MESSAGES: :[[@LINE-1]]:5: warning: repeated branch body in conditional chain [bugprone-branch-clone]
+// CHECK-MESSAGES: :[[@LINE-2]]:6: note: end of the original
+  else if (x == 1)
+    ;
+// CHECK-MESSAGES: :[[@LINE-1]]:5: note: clone 1 starts here
+  else if (func())
+    ;
+// CHECK-MESSAGES: :[[@LINE-1]]:5: note: clone 2 starts here
+}
+
+void test_nowarn_chained_if_stmts_3(int x) {
+  if (x++)
+    ;
+// CHECK-MESSAGES: :[[@LINE-1]]:5: warning: repeated branch body in conditional chain [bugprone-branch-clone]
+// CHECK-MESSAGES: :[[@LINE-2]]:6: note: end of the original
+  else if (x++)
+    ;
+// CHECK-MESSAGES: :[[@LINE-1]]:5: note: clone 1 starts here
+}
+
+void test_warn_wchar() {
+  const wchar_t * a = 0 ? L"Warning" : L"Warning";
+// CHECK-MESSAGES: :[[@LINE-1]]:25: warning: conditional operator with identical true and false expressions [bugprone-branch-clone]
+}
+void test_nowarn_wchar() {
+  const wchar_t * a = 0 ? L"No" : L"Warning";
+}
+
+void test_nowarn_long() {
+  int a = 0, b = 0;
+  long c;
+  if (0) {
+    b -= a;
+    c = 0;
+  } else {
+    b -= a;
+    c = 0LL;
+  }
+}
+
+// Identical inner conditions
+
+void test_warn_inner_if_1(int x) {
+  if (x == 1) {
+// CHECK-MESSAGES: :[[@LINE-1]]:3: warning: if with identical inner if statement [bugprone-branch-clone]
+    if (x == 1)
+// CHECK-MESSAGES: :[[@LINE-1]]:5: note: inner if starts here
+      ;
+  }
+
+  // FIXME: Should warn here. The warning is currently not emitted because there
+  // is code between the conditions.
+  if (x == 1) {
+    int y = x;
+    if (x == 1)
+      ;
+  }
+}
+
+void test_nowarn_inner_if_1(int x) {
+  // Don't warn when condition has side effects.
+  if (x++ == 1) {
+    if (x++ == 1)
+      ;
+  }
+
+  // Don't warn when x is changed before inner condition.
+  if (x < 10) {
+    x++;
+    if (x < 10)
+      ;
+  }
+}
diff --git a/clang-tools-extra/test/clang-tidy/checkers/cppcoreguidelines/narrowing-conversions-bitfields.cpp b/clang-tools-extra/test/clang-tidy/checkers/bugprone/narrowing-conversions-bitfields.cpp
index 36fde38..a7bb3c8 100644
--- a/clang-tools-extra/test/clang-tidy/checkers/cppcoreguidelines/narrowing-conversions-bitfields.cpp
+++ b/clang-tools-extra/test/clang-tidy/checkers/bugprone/narrowing-conversions-bitfields.cpp
@@ -1,4 +1,4 @@
-// RUN: %check_clang_tidy %s cppcoreguidelines-narrowing-conversions %t \
+// RUN: %check_clang_tidy %s bugprone-narrowing-conversions %t \
 // RUN:   -std=c++17 -- -target x86_64-unknown-linux
 
 #define CHAR_BITS 8
@@ -31,7 +31,7 @@ struct CompleteBitfield {
 };
 
 int example_warning(unsigned x) {
-  // CHECK-MESSAGES: :[[@LINE+1]]:10: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [cppcoreguidelines-narrowing-conversions]
+  // CHECK-MESSAGES: :[[@LINE+1]]:10: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
   return x;
 }
 
diff --git a/clang-tools-extra/test/clang-tidy/checkers/cppcoreguidelines/narrowing-conversions-equivalentbitwidth-option.cpp b/clang-tools-extra/test/clang-tidy/checkers/bugprone/narrowing-conversions-equivalentbitwidth-option.cpp
index fb5c7e3..0deb006 100644
--- a/clang-tools-extra/test/clang-tidy/checkers/cppcoreguidelines/narrowing-conversions-equivalentbitwidth-option.cpp
+++ b/clang-tools-extra/test/clang-tidy/checkers/bugprone/narrowing-conversions-equivalentbitwidth-option.cpp
@@ -1,35 +1,35 @@
 // RUN: %check_clang_tidy -check-suffix=DEFAULT %s \
-// RUN: cppcoreguidelines-narrowing-conversions %t -- 
+// RUN: bugprone-narrowing-conversions %t -- 
 
 // RUN: %check_clang_tidy -check-suffix=DISABLED %s \
-// RUN: cppcoreguidelines-narrowing-conversions %t -- \
+// RUN: bugprone-narrowing-conversions %t -- \
 // RUN: -config='{CheckOptions: { \
-// RUN:   cppcoreguidelines-narrowing-conversions.WarnOnEquivalentBitWidth: 0}}'
+// RUN:   bugprone-narrowing-conversions.WarnOnEquivalentBitWidth: 0}}'
 
 void narrowing_equivalent_bitwidth() {
   int i;
   unsigned int ui;
   i = ui;
-  // CHECK-MESSAGES-DEFAULT: :[[@LINE-1]]:7: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [cppcoreguidelines-narrowing-conversions]
+  // CHECK-MESSAGES-DEFAULT: :[[@LINE-1]]:7: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
   // DISABLED: Warning disabled with WarnOnEquivalentBitWidth=0.
 
   float f;
   i = f;
-  // CHECK-MESSAGES-DEFAULT: :[[@LINE-1]]:7: warning: narrowing conversion from 'float' to 'int' [cppcoreguidelines-narrowing-conversions]
+  // CHECK-MESSAGES-DEFAULT: :[[@LINE-1]]:7: warning: narrowing conversion from 'float' to 'int' [bugprone-narrowing-conversions]
   // DISABLED: Warning disabled with WarnOnEquivalentBitWidth=0.
 
   f = i;
-  // CHECK-MESSAGES-DEFAULT: :[[@LINE-1]]:7: warning: narrowing conversion from 'int' to 'float' [cppcoreguidelines-narrowing-conversions]
+  // CHECK-MESSAGES-DEFAULT: :[[@LINE-1]]:7: warning: narrowing conversion from 'int' to 'float' [bugprone-narrowing-conversions]
   // DISABLED: Warning disabled with WarnOnEquivalentBitWidth=0.
 
   long long ll;
   double d;
   ll = d;
-  // CHECK-MESSAGES-DEFAULT: :[[@LINE-1]]:8: warning: narrowing conversion from 'double' to 'long long' [cppcoreguidelines-narrowing-conversions]
+  // CHECK-MESSAGES-DEFAULT: :[[@LINE-1]]:8: warning: narrowing conversion from 'double' to 'long long' [bugprone-narrowing-conversions]
   // DISABLED: Warning disabled with WarnOnEquivalentBitWidth=0.
 
   d = ll;
-  // CHECK-MESSAGES-DEFAULT: :[[@LINE-1]]:7: warning: narrowing conversion from 'long long' to 'double' [cppcoreguidelines-narrowing-conversions]
+  // CHECK-MESSAGES-DEFAULT: :[[@LINE-1]]:7: warning: narrowing conversion from 'long long' to 'double' [bugprone-narrowing-conversions]
   // DISABLED: Warning disabled with WarnOnEquivalentBitWidth=0.
 }
 
@@ -37,6 +37,6 @@ void most_narrowing_is_not_ok() {
   int i;
   long long ui;
   i = ui;
-  // CHECK-MESSAGES-DEFAULT: :[[@LINE-1]]:7: warning: narrowing conversion from 'long long' to signed type 'int' is implementation-defined [cppcoreguidelines-narrowing-conversions]
-  // CHECK-MESSAGES-DISABLED: :[[@LINE-2]]:7: warning: narrowing conversion from 'long long' to signed type 'int' is implementation-defined [cppcoreguidelines-narrowing-conversions]
+  // CHECK-MESSAGES-DEFAULT: :[[@LINE-1]]:7: warning: narrowing conversion from 'long long' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
+  // CHECK-MESSAGES-DISABLED: :[[@LINE-2]]:7: warning: narrowing conversion from 'long long' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
 }
diff --git a/clang-tools-extra/test/clang-tidy/checkers/cppcoreguidelines/narrowing-conversions-ignoreconversionfromtypes-option.cpp b/clang-tools-extra/test/clang-tidy/checkers/bugprone/narrowing-conversions-ignoreconversionfromtypes-option.cpp
index 91e908f..6d93f5d 100644
--- a/clang-tools-extra/test/clang-tidy/checkers/cppcoreguidelines/narrowing-conversions-ignoreconversionfromtypes-option.cpp
+++ b/clang-tools-extra/test/clang-tidy/checkers/bugprone/narrowing-conversions-ignoreconversionfromtypes-option.cpp
@@ -1,10 +1,10 @@
 // RUN: %check_clang_tidy -check-suffix=DEFAULT %s \
-// RUN: cppcoreguidelines-narrowing-conversions %t --
+// RUN: bugprone-narrowing-conversions %t --
 
 // RUN: %check_clang_tidy -check-suffix=IGNORED %s \
-// RUN: cppcoreguidelines-narrowing-conversions %t -- \
+// RUN: bugprone-narrowing-conversions %t -- \
 // RUN: -config='{CheckOptions: { \
-// RUN:   cppcoreguidelines-narrowing-conversions.IgnoreConversionFromTypes: "global_size_t;nested_size_type;long" \
+// RUN:   bugprone-narrowing-conversions.IgnoreConversionFromTypes: "global_size_t;nested_size_type;long" \
 // RUN: }}'
 
 // We use global_size_t instead of 'size_t' because windows predefines size_t.
@@ -20,7 +20,7 @@ void narrowing_global_size_t() {
   int i;
   global_size_t j;
   i = j;
-  // CHECK-MESSAGES-DEFAULT: :[[@LINE-1]]:7: warning: narrowing conversion from 'global_size_t' (aka 'long long') to signed type 'int' is implementation-defined [cppcoreguidelines-narrowing-conversions]
+  // CHECK-MESSAGES-DEFAULT: :[[@LINE-1]]:7: warning: narrowing conversion from 'global_size_t' (aka 'long long') to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
   // IGNORED: Warning is disabled with IgnoreConversionFromTypes=global_size_t.
 }
 
@@ -28,7 +28,7 @@ void narrowing_size_type() {
   int i;
   vector::nested_size_type j;
   i = j;
-  // CHECK-MESSAGES-DEFAULT: :[[@LINE-1]]:7: warning: narrowing conversion from 'vector::nested_size_type' (aka 'long long') to signed type 'int' is implementation-defined [cppcoreguidelines-narrowing-conversions]
+  // CHECK-MESSAGES-DEFAULT: :[[@LINE-1]]:7: warning: narrowing conversion from 'vector::nested_size_type' (aka 'long long') to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
   // IGNORED: Warning is disabled with IgnoreConversionFromTypes=nested_size_type.
 }
 
@@ -36,11 +36,11 @@ void narrowing_size_method() {
   vector v;
   int i, j;
   i = v.size();
-  // CHECK-MESSAGES-DEFAULT: :[[@LINE-1]]:7: warning: narrowing conversion from 'global_size_t' (aka 'long long') to signed type 'int' is implementation-defined [cppcoreguidelines-narrowing-conversions]
+  // CHECK-MESSAGES-DEFAULT: :[[@LINE-1]]:7: warning: narrowing conversion from 'global_size_t' (aka 'long long') to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
   // IGNORED: Warning is disabled with IgnoreConversionFromTypes=global_size_t.
 
   i = j + v.size();
-  // CHECK-MESSAGES-DEFAULT: :[[@LINE-1]]:7: warning: narrowing conversion from 'global_size_t' (aka 'long long') to signed type 'int' is implementation-defined [cppcoreguidelines-narrowing-conversions]
+  // CHECK-MESSAGES-DEFAULT: :[[@LINE-1]]:7: warning: narrowing conversion from 'global_size_t' (aka 'long long') to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
   // IGNORED: Warning is disabled with IgnoreConversionFromTypes=global_size_t.
 }
 
@@ -49,7 +49,7 @@ void narrowing_size_method_binary_expr() {
   int j;
   vector v;
   i = j + v.size();
-  // CHECK-MESSAGES-DEFAULT: :[[@LINE-1]]:7: warning: narrowing conversion from 'global_size_t' (aka 'long long') to signed type 'int' is implementation-defined [cppcoreguidelines-narrowing-conversions]
+  // CHECK-MESSAGES-DEFAULT: :[[@LINE-1]]:7: warning: narrowing conversion from 'global_size_t' (aka 'long long') to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
   // IGNORED: Warning is disabled with IgnoreConversionFromTypes=global_size_t.
 }
 
@@ -57,11 +57,11 @@ void narrowing_size_method_binary_op() {
   int i, j;
   vector v;
   i += v.size();
-  // CHECK-MESSAGES-DEFAULT: :[[@LINE-1]]:8: warning: narrowing conversion from 'global_size_t' (aka 'long long') to signed type 'int' is implementation-defined [cppcoreguidelines-narrowing-conversions]
+  // CHECK-MESSAGES-DEFAULT: :[[@LINE-1]]:8: warning: narrowing conversion from 'global_size_t' (aka 'long long') to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
   // IGNORED: Warning is disabled with IgnoreConversionFromTypes=global_size_t.
 
   i += j + v.size();
-  // CHECK-MESSAGES-DEFAULT: :[[@LINE-1]]:8: warning: narrowing conversion from 'global_size_t' (aka 'long long') to signed type 'int' is implementation-defined [cppcoreguidelines-narrowing-conversions]
+  // CHECK-MESSAGES-DEFAULT: :[[@LINE-1]]:8: warning: narrowing conversion from 'global_size_t' (aka 'long long') to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
   // IGNORED: Warning is disabled with IgnoreConversionFromTypes=global_size_t.
 }
 
@@ -69,13 +69,13 @@ void most_narrowing_is_not_ok() {
   int i;
   long long j;
   i = j;
-  // CHECK-MESSAGES-DEFAULT: :[[@LINE-1]]:7: warning: narrowing conversion from 'long long' to signed type 'int' is implementation-defined [cppcoreguidelines-narrowing-conversions]
-  // CHECK-MESSAGES-IGNORED: :[[@LINE-2]]:7: warning: narrowing conversion from 'long long' to signed type 'int' is implementation-defined [cppcoreguidelines-narrowing-conversions]
+  // CHECK-MESSAGES-DEFAULT: :[[@LINE-1]]:7: warning: narrowing conversion from 'long long' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
+  // CHECK-MESSAGES-IGNORED: :[[@LINE-2]]:7: warning: narrowing conversion from 'long long' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
 }
 
 void test_ignore_builtin_type_pr58809() {
   long x = 123;
   short y = x;
-  // CHECK-MESSAGES-DEFAULT: :[[@LINE-1]]:13: warning: narrowing conversion from 'long' to signed type 'short' is implementation-defined [cppcoreguidelines-narrowing-conversions]
-  // CHECK-MESSAGES-NOT-IGNORED: :[[@LINE-2]]:13: warning: narrowing conversion from 'long' to signed type 'short' is implementation-defined [cppcoreguidelines-narrowing-conversions]
+  // CHECK-MESSAGES-DEFAULT: :[[@LINE-1]]:13: warning: narrowing conversion from 'long' to signed type 'short' is implementation-defined [bugprone-narrowing-conversions]
+  // CHECK-MESSAGES-NOT-IGNORED: :[[@LINE-2]]:13: warning: narrowing conversion from 'long' to signed type 'short' is implementation-defined [bugprone-narrowing-conversions]
 }
diff --git a/clang-tools-extra/test/clang-tidy/checkers/cppcoreguidelines/narrowing-conversions-intemplates-option.cpp b/clang-tools-extra/test/clang-tidy/checkers/bugprone/narrowing-conversions-intemplates-option.cpp
index cb19ed7..625dc45 100644
--- a/clang-tools-extra/test/clang-tidy/checkers/cppcoreguidelines/narrowing-conversions-intemplates-option.cpp
+++ b/clang-tools-extra/test/clang-tidy/checkers/bugprone/narrowing-conversions-intemplates-option.cpp
@@ -1,10 +1,10 @@
 // RUN: %check_clang_tidy -check-suffix=DEFAULT %s \
-// RUN: cppcoreguidelines-narrowing-conversions %t --
+// RUN: bugprone-narrowing-conversions %t --
 
 // RUN: %check_clang_tidy -check-suffix=WARN %s \
-// RUN: cppcoreguidelines-narrowing-conversions %t -- \
+// RUN: bugprone-narrowing-conversions %t -- \
 // RUN: -config='{CheckOptions: { \
-// RUN:   cppcoreguidelines-narrowing-conversions.WarnWithinTemplateInstantiation: 1 \
+// RUN:   bugprone-narrowing-conversions.WarnWithinTemplateInstantiation: 1 \
 // RUN: }}'
 
 template <typename OrigType>
@@ -12,7 +12,7 @@ void assign_in_template(OrigType jj) {
   int ii;
   ii = jj;
   // DEFAULT: Warning disabled because WarnWithinTemplateInstantiation=0.
-  // CHECK-MESSAGES-WARN: :[[@LINE-2]]:8: warning: narrowing conversion from 'long long' to signed type 'int' is implementation-defined [cppcoreguidelines-narrowing-conversions]
+  // CHECK-MESSAGES-WARN: :[[@LINE-2]]:8: warning: narrowing conversion from 'long long' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
 }
 
 void narrow_inside_template_not_ok() {
@@ -23,8 +23,8 @@ void narrow_inside_template_not_ok() {
 void assign_outside_template(long long jj) {
   int ii;
   ii = jj;
-  // CHECK-MESSAGES-DEFAULT: :[[@LINE-1]]:8: warning: narrowing conversion from 'long long' to signed type 'int' is implementation-defined [cppcoreguidelines-narrowing-conversions]
-  // CHECK-MESSAGES-WARN: :[[@LINE-2]]:8: warning: narrowing conversion from 'long long' to signed type 'int' is implementation-defined [cppcoreguidelines-narrowing-conversions]
+  // CHECK-MESSAGES-DEFAULT: :[[@LINE-1]]:8: warning: narrowing conversion from 'long long' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
+  // CHECK-MESSAGES-WARN: :[[@LINE-2]]:8: warning: narrowing conversion from 'long long' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
 }
 
 void narrow_outside_template_not_ok() {
diff --git a/clang-tools-extra/test/clang-tidy/checkers/cppcoreguidelines/narrowing-conversions-long-is-32bits.cpp b/clang-tools-extra/test/clang-tidy/checkers/bugprone/narrowing-conversions-long-is-32bits.cpp
index dcf1848..8e801a0 100644
--- a/clang-tools-extra/test/clang-tidy/checkers/cppcoreguidelines/narrowing-conversions-long-is-32bits.cpp
+++ b/clang-tools-extra/test/clang-tidy/checkers/bugprone/narrowing-conversions-long-is-32bits.cpp
@@ -1,4 +1,4 @@
-// RUN: %check_clang_tidy %s cppcoreguidelines-narrowing-conversions %t \
+// RUN: %check_clang_tidy %s bugprone-narrowing-conversions %t \
 // RUN: -- -- -target x86_64-unknown-linux -m32
 
 static_assert(sizeof(int) * 8 == 32, "int is 32-bits");
@@ -16,8 +16,8 @@ void narrow_integer_to_signed_integer_is_not_ok() {
 
   i = l;  // int and long are the same type.
   i = ll; // int64_t does not fit in an int32_t
-  // CHECK-MESSAGES: :[[@LINE-1]]:7: warning: narrowing conversion from 'long long' to signed type 'int' is implementation-defined [cppcoreguidelines-narrowing-conversions]
+  // CHECK-MESSAGES: :[[@LINE-1]]:7: warning: narrowing conversion from 'long long' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
   ll = ul;  // uint32_t fits into int64_t
   ll = ull; // uint64_t does not fit in an int64_t
-  // CHECK-MESSAGES: :[[@LINE-1]]:8: warning: narrowing conversion from 'unsigned long long' to signed type 'long long' is implementation-defined [cppcoreguidelines-narrowing-conversions]
+  // CHECK-MESSAGES: :[[@LINE-1]]:8: warning: narrowing conversion from 'unsigned long long' to signed type 'long long' is implementation-defined [bugprone-narrowing-conversions]
 }
diff --git a/clang-tools-extra/test/clang-tidy/checkers/cppcoreguidelines/narrowing-conversions-narrowingfloatingpoint-option.cpp b/clang-tools-extra/test/clang-tidy/checkers/bugprone/narrowing-conversions-narrowingfloatingpoint-option.cpp
index 6cad320..9ded2f0 100644
--- a/clang-tools-extra/test/clang-tidy/checkers/cppcoreguidelines/narrowing-conversions-narrowingfloatingpoint-option.cpp
+++ b/clang-tools-extra/test/clang-tidy/checkers/bugprone/narrowing-conversions-narrowingfloatingpoint-option.cpp
@@ -1,4 +1,4 @@
-// RUN: %check_clang_tidy %s cppcoreguidelines-narrowing-conversions %t \
+// RUN: %check_clang_tidy %s bugprone-narrowing-conversions %t \
 // RUN: -- -- -target x86_64-unknown-linux -fsigned-char
 
 namespace floats {
@@ -6,15 +6,15 @@ namespace floats {
 void narrow_constant_floating_point_to_int_not_ok(double d) {
   int i = 0;
   i += 0.5;
-  // CHECK-MESSAGES: :[[@LINE-1]]:8: warning: narrowing conversion from constant 'double' to 'int' [cppcoreguidelines-narrowing-conversions]
+  // CHECK-MESSAGES: :[[@LINE-1]]:8: warning: narrowing conversion from constant 'double' to 'int' [bugprone-narrowing-conversions]
   i += 0.5f;
-  // CHECK-MESSAGES: :[[@LINE-1]]:8: warning: narrowing conversion from constant 'float' to 'int' [cppcoreguidelines-narrowing-conversions]
+  // CHECK-MESSAGES: :[[@LINE-1]]:8: warning: narrowing conversion from constant 'float' to 'int' [bugprone-narrowing-conversions]
   i *= 0.5f;
-  // CHECK-MESSAGES: :[[@LINE-1]]:8: warning: narrowing conversion from constant 'float' to 'int' [cppcoreguidelines-narrowing-conversions]
+  // CHECK-MESSAGES: :[[@LINE-1]]:8: warning: narrowing conversion from constant 'float' to 'int' [bugprone-narrowing-conversions]
   i /= 0.5f;
-  // CHECK-MESSAGES: :[[@LINE-1]]:8: warning: narrowing conversion from constant 'float' to 'int' [cppcoreguidelines-narrowing-conversions]
+  // CHECK-MESSAGES: :[[@LINE-1]]:8: warning: narrowing conversion from constant 'float' to 'int' [bugprone-narrowing-conversions]
   i += (double)0.5f;
-  // CHECK-MESSAGES: :[[@LINE-1]]:8: warning: narrowing conversion from constant 'double' to 'int' [cppcoreguidelines-narrowing-conversions]
+  // CHECK-MESSAGES: :[[@LINE-1]]:8: warning: narrowing conversion from constant 'double' to 'int' [bugprone-narrowing-conversions]
   i += 2.0;
   i += 2.0f;
 }
@@ -28,11 +28,11 @@ float narrow_double_to_float_return() {
 void narrow_double_to_float_not_ok(double d) {
   float f;
   f = d;
-  // CHECK-MESSAGES: :[[@LINE-1]]:7: warning: narrowing conversion from 'double' to 'float' [cppcoreguidelines-narrowing-conversions]
+  // CHECK-MESSAGES: :[[@LINE-1]]:7: warning: narrowing conversion from 'double' to 'float' [bugprone-narrowing-conversions]
   f = 15_double;
-  // CHECK-MESSAGES: :[[@LINE-1]]:7: warning: narrowing conversion from 'double' to 'float' [cppcoreguidelines-narrowing-conversions]
+  // CHECK-MESSAGES: :[[@LINE-1]]:7: warning: narrowing conversion from 'double' to 'float' [bugprone-narrowing-conversions]
   f += d;
-  // CHECK-MESSAGES: :[[@LINE-1]]:8: warning: narrowing conversion from 'double' to 'float' [cppcoreguidelines-narrowing-conversions]
+  // CHECK-MESSAGES: :[[@LINE-1]]:8: warning: narrowing conversion from 'double' to 'float' [bugprone-narrowing-conversions]
   f = narrow_double_to_float_return();
 }
 
@@ -46,11 +46,11 @@ void narrow_fp_constants() {
   f = __builtin_nanf("0");    // float NaN is not narrowing.
 
   f = __builtin_huge_val(); // max double is not within-range of float.
-  // CHECK-MESSAGES: :[[@LINE-1]]:7: warning: narrowing conversion from constant 'double' to 'float' [cppcoreguidelines-narrowing-conversions]
+  // CHECK-MESSAGES: :[[@LINE-1]]:7: warning: narrowing conversion from constant 'double' to 'float' [bugprone-narrowing-conversions]
   f = -__builtin_huge_val(); // -max double is not within-range of float.
-  // CHECK-MESSAGES: :[[@LINE-1]]:7: warning: narrowing conversion from constant 'double' to 'float' [cppcoreguidelines-narrowing-conversions]
+  // CHECK-MESSAGES: :[[@LINE-1]]:7: warning: narrowing conversion from constant 'double' to 'float' [bugprone-narrowing-conversions]
   f = __builtin_inf(); // double infinity is not within-range of float.
-  // CHECK-MESSAGES: :[[@LINE-1]]:7: warning: narrowing conversion from constant 'double' to 'float' [cppcoreguidelines-narrowing-conversions]
+  // CHECK-MESSAGES: :[[@LINE-1]]:7: warning: narrowing conversion from constant 'double' to 'float' [bugprone-narrowing-conversions]
   f = __builtin_nan("0"); // double NaN is not narrowing.
 }
 
diff --git a/clang-tools-extra/test/clang-tidy/checkers/cppcoreguidelines/narrowing-conversions-narrowinginteger-option.cpp b/clang-tools-extra/test/clang-tidy/checkers/bugprone/narrowing-conversions-narrowinginteger-option.cpp
index f58de65..fce90ec 100644
--- a/clang-tools-extra/test/clang-tidy/checkers/cppcoreguidelines/narrowing-conversions-narrowinginteger-option.cpp
+++ b/clang-tools-extra/test/clang-tidy/checkers/bugprone/narrowing-conversions-narrowinginteger-option.cpp
@@ -1,23 +1,23 @@
 // RUN: %check_clang_tidy -check-suffix=DEFAULT %s \
-// RUN: cppcoreguidelines-narrowing-conversions %t -- \
-// RUN: -config='{CheckOptions: {cppcoreguidelines-narrowing-conversions.WarnOnIntegerNarrowingConversion: true}}'
+// RUN: bugprone-narrowing-conversions %t -- \
+// RUN: -config='{CheckOptions: {bugprone-narrowing-conversions.WarnOnIntegerNarrowingConversion: true}}'
 
 // RUN: %check_clang_tidy -check-suffix=DISABLED %s \
-// RUN: cppcoreguidelines-narrowing-conversions %t -- \
-// RUN: -config='{CheckOptions: {cppcoreguidelines-narrowing-conversions.WarnOnIntegerNarrowingConversion: false}}'
+// RUN: bugprone-narrowing-conversions %t -- \
+// RUN: -config='{CheckOptions: {bugprone-narrowing-conversions.WarnOnIntegerNarrowingConversion: false}}'
 
 void foo(unsigned long long value) {
   int a = value;
-  // CHECK-MESSAGES-DEFAULT: :[[@LINE-1]]:11: warning: narrowing conversion from 'unsigned long long' to signed type 'int' is implementation-defined [cppcoreguidelines-narrowing-conversions]
+  // CHECK-MESSAGES-DEFAULT: :[[@LINE-1]]:11: warning: narrowing conversion from 'unsigned long long' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
   // DISABLED: No warning for integer narrowing conversions when WarnOnIntegerNarrowingConversion = false.
   long long b = value;
-  // CHECK-MESSAGES-DEFAULT: :[[@LINE-1]]:17: warning: narrowing conversion from 'unsigned long long' to signed type 'long long' is implementation-defined [cppcoreguidelines-narrowing-conversions]
+  // CHECK-MESSAGES-DEFAULT: :[[@LINE-1]]:17: warning: narrowing conversion from 'unsigned long long' to signed type 'long long' is implementation-defined [bugprone-narrowing-conversions]
   // DISABLED: No warning for integer narrowing conversions when WarnOnIntegerNarrowingConversion = false.
 }
 
 void casting_float_to_bool_is_still_operational_when_integer_narrowing_is_disabled(float f) {
   if (f) {
-    // CHECK-MESSAGES-DEFAULT: :[[@LINE-1]]:7: warning: narrowing conversion from 'float' to 'bool' [cppcoreguidelines-narrowing-conversions]
-    // CHECK-MESSAGES-DISABLED: :[[@LINE-2]]:7: warning: narrowing conversion from 'float' to 'bool' [cppcoreguidelines-narrowing-conversions]
+    // CHECK-MESSAGES-DEFAULT: :[[@LINE-1]]:7: warning: narrowing conversion from 'float' to 'bool' [bugprone-narrowing-conversions]
+    // CHECK-MESSAGES-DISABLED: :[[@LINE-2]]:7: warning: narrowing conversion from 'float' to 'bool' [bugprone-narrowing-conversions]
   }
 }
diff --git a/clang-tools-extra/test/clang-tidy/checkers/bugprone/narrowing-conversions-narrowingintegertofloatingpoint-option.cpp b/clang-tools-extra/test/clang-tidy/checkers/bugprone/narrowing-conversions-narrowingintegertofloatingpoint-option.cpp
new file mode 100644
index 0000000..704d24d
--- /dev/null
+++ b/clang-tools-extra/test/clang-tidy/checkers/bugprone/narrowing-conversions-narrowingintegertofloatingpoint-option.cpp
@@ -0,0 +1,19 @@
+// RUN: %check_clang_tidy -check-suffix=DEFAULT %s \
+// RUN: bugprone-narrowing-conversions %t -- \
+// RUN: -config='{CheckOptions: {bugprone-narrowing-conversions.WarnOnIntegerToFloatingPointNarrowingConversion: true}}'
+
+// RUN: %check_clang_tidy -check-suffix=DISABLED %s \
+// RUN: bugprone-narrowing-conversions %t -- \
+// RUN: -config='{CheckOptions: {bugprone-narrowing-conversions.WarnOnIntegerToFloatingPointNarrowingConversion: false}}'
+
+void foo(unsigned long long value) {
+  double a = value;
+  // CHECK-MESSAGES-DEFAULT: :[[@LINE-1]]:14: warning: narrowing conversion from 'unsigned long long' to 'double' [bugprone-narrowing-conversions]
+  // DISABLED: No warning for integer to floating-point narrowing conversions when WarnOnIntegerToFloatingPointNarrowingConversion = false.
+}
+
+void floating_point_to_integer_is_still_not_ok(double f) {
+  int a = f;
+  // CHECK-MESSAGES-DEFAULT: :[[@LINE-1]]:11: warning: narrowing conversion from 'double' to 'int' [bugprone-narrowing-conversions]
+  // CHECK-MESSAGES-DISABLED: :[[@LINE-2]]:11: warning: narrowing conversion from 'double' to 'int' [bugprone-narrowing-conversions]
+}
diff --git a/clang-tools-extra/test/clang-tidy/checkers/cppcoreguidelines/narrowing-conversions-pedanticmode-option.cpp b/clang-tools-extra/test/clang-tidy/checkers/bugprone/narrowing-conversions-pedanticmode-option.cpp
index eb1a5a6..d2e2ead 100644
--- a/clang-tools-extra/test/clang-tidy/checkers/cppcoreguidelines/narrowing-conversions-pedanticmode-option.cpp
+++ b/clang-tools-extra/test/clang-tidy/checkers/bugprone/narrowing-conversions-pedanticmode-option.cpp
@@ -1,22 +1,22 @@
-// RUN: %check_clang_tidy %s cppcoreguidelines-narrowing-conversions %t \
+// RUN: %check_clang_tidy %s bugprone-narrowing-conversions %t \
 // RUN: -config="{CheckOptions: { \
-// RUN:   cppcoreguidelines-narrowing-conversions.PedanticMode: true}}" \
+// RUN:   bugprone-narrowing-conversions.PedanticMode: true}}" \
 // RUN: -- -target x86_64-unknown-linux -fsigned-char
 
 namespace floats {
 
 void triggers_wrong_constant_type_warning(double d) {
   int i = 0.0;
-  // CHECK-MESSAGES: :[[@LINE-1]]:11: warning: constant value should be of type of type 'int' instead of 'double' [cppcoreguidelines-narrowing-conversions]
+  // CHECK-MESSAGES: :[[@LINE-1]]:11: warning: constant value should be of type of type 'int' instead of 'double' [bugprone-narrowing-conversions]
   i += 2.0;
-  // CHECK-MESSAGES: :[[@LINE-1]]:8: warning: constant value should be of type of type 'int' instead of 'double' [cppcoreguidelines-narrowing-conversions]
+  // CHECK-MESSAGES: :[[@LINE-1]]:8: warning: constant value should be of type of type 'int' instead of 'double' [bugprone-narrowing-conversions]
   i += 2.0f;
-  // CHECK-MESSAGES: :[[@LINE-1]]:8: warning: constant value should be of type of type 'int' instead of 'float' [cppcoreguidelines-narrowing-conversions]
+  // CHECK-MESSAGES: :[[@LINE-1]]:8: warning: constant value should be of type of type 'int' instead of 'float' [bugprone-narrowing-conversions]
 }
 
 void triggers_narrowing_warning_when_overflowing() {
   unsigned short us = 65537.0;
-  // CHECK-MESSAGES: :[[@LINE-1]]:23: warning: narrowing conversion from constant 'double' to 'unsigned short' [cppcoreguidelines-narrowing-conversions]
+  // CHECK-MESSAGES: :[[@LINE-1]]:23: warning: narrowing conversion from constant 'double' to 'unsigned short' [bugprone-narrowing-conversions]
 }
 
 } // namespace floats
diff --git a/clang-tools-extra/test/clang-tidy/checkers/cppcoreguidelines/narrowing-conversions-unsigned-char.cpp b/clang-tools-extra/test/clang-tidy/checkers/bugprone/narrowing-conversions-unsigned-char.cpp
index 6bd437f..6a544b4 100644
--- a/clang-tools-extra/test/clang-tidy/checkers/cppcoreguidelines/narrowing-conversions-unsigned-char.cpp
+++ b/clang-tools-extra/test/clang-tidy/checkers/bugprone/narrowing-conversions-unsigned-char.cpp
@@ -1,4 +1,4 @@
-// RUN: %check_clang_tidy %s cppcoreguidelines-narrowing-conversions %t \
+// RUN: %check_clang_tidy %s bugprone-narrowing-conversions %t \
 // RUN: -- -- -target x86_64-unknown-linux -funsigned-char
 
 void narrow_integer_to_unsigned_integer_is_ok() {
@@ -42,24 +42,24 @@ void narrow_integer_to_signed_integer_is_not_ok() {
 
   sc = sc;
   sc = s;
-  // CHECK-MESSAGES: :[[@LINE-1]]:8: warning: narrowing conversion from 'short' to signed type 'signed char' is implementation-defined [cppcoreguidelines-narrowing-conversions]
+  // CHECK-MESSAGES: :[[@LINE-1]]:8: warning: narrowing conversion from 'short' to signed type 'signed char' is implementation-defined [bugprone-narrowing-conversions]
   sc = i;
-  // CHECK-MESSAGES: :[[@LINE-1]]:8: warning: narrowing conversion from 'int' to signed type 'signed char' is implementation-defined [cppcoreguidelines-narrowing-conversions]
+  // CHECK-MESSAGES: :[[@LINE-1]]:8: warning: narrowing conversion from 'int' to signed type 'signed char' is implementation-defined [bugprone-narrowing-conversions]
   sc = l;
-  // CHECK-MESSAGES: :[[@LINE-1]]:8: warning: narrowing conversion from 'long' to signed type 'signed char' is implementation-defined [cppcoreguidelines-narrowing-conversions]
+  // CHECK-MESSAGES: :[[@LINE-1]]:8: warning: narrowing conversion from 'long' to signed type 'signed char' is implementation-defined [bugprone-narrowing-conversions]
   sc = ll;
-  // CHECK-MESSAGES: :[[@LINE-1]]:8: warning: narrowing conversion from 'long long' to signed type 'signed char' is implementation-defined [cppcoreguidelines-narrowing-conversions]
+  // CHECK-MESSAGES: :[[@LINE-1]]:8: warning: narrowing conversion from 'long long' to signed type 'signed char' is implementation-defined [bugprone-narrowing-conversions]
 
   sc = c;
-  // CHECK-MESSAGES: :[[@LINE-1]]:8: warning: narrowing conversion from 'char' to signed type 'signed char' is implementation-defined [cppcoreguidelines-narrowing-conversions]
+  // CHECK-MESSAGES: :[[@LINE-1]]:8: warning: narrowing conversion from 'char' to signed type 'signed char' is implementation-defined [bugprone-narrowing-conversions]
   sc = us;
-  // CHECK-MESSAGES: :[[@LINE-1]]:8: warning: narrowing conversion from 'unsigned short' to signed type 'signed char' is implementation-defined [cppcoreguidelines-narrowing-conversions]
+  // CHECK-MESSAGES: :[[@LINE-1]]:8: warning: narrowing conversion from 'unsigned short' to signed type 'signed char' is implementation-defined [bugprone-narrowing-conversions]
   sc = ui;
-  // CHECK-MESSAGES: :[[@LINE-1]]:8: warning: narrowing conversion from 'unsigned int' to signed type 'signed char' is implementation-defined [cppcoreguidelines-narrowing-conversions]
+  // CHECK-MESSAGES: :[[@LINE-1]]:8: warning: narrowing conversion from 'unsigned int' to signed type 'signed char' is implementation-defined [bugprone-narrowing-conversions]
   sc = ul;
-  // CHECK-MESSAGES: :[[@LINE-1]]:8: warning: narrowing conversion from 'unsigned long' to signed type 'signed char' is implementation-defined [cppcoreguidelines-narrowing-conversions]
+  // CHECK-MESSAGES: :[[@LINE-1]]:8: warning: narrowing conversion from 'unsigned long' to signed type 'signed char' is implementation-defined [bugprone-narrowing-conversions]
   sc = ull;
-  // CHECK-MESSAGES: :[[@LINE-1]]:8: warning: narrowing conversion from 'unsigned long long' to signed type 'signed char' is implementation-defined [cppcoreguidelines-narrowing-conversions]
+  // CHECK-MESSAGES: :[[@LINE-1]]:8: warning: narrowing conversion from 'unsigned long long' to signed type 'signed char' is implementation-defined [bugprone-narrowing-conversions]
 }
 
 void narrow_constant_to_unsigned_integer_is_ok() {
@@ -72,7 +72,7 @@ void narrow_constant_to_unsigned_integer_is_ok() {
   unsigned char uc3 = -1;  // unsigned dst type is well defined.
   unsigned char uc4 = 256; // unsigned dst type is well defined.
   signed char sc = 128;
-  // CHECK-MESSAGES: :[[@LINE-1]]:20: warning: narrowing conversion from constant value 128 (0x00000080) of type 'int' to signed type 'signed char' is implementation-defined [cppcoreguidelines-narrowing-conversions]
+  // CHECK-MESSAGES: :[[@LINE-1]]:20: warning: narrowing conversion from constant value 128 (0x00000080) of type 'int' to signed type 'signed char' is implementation-defined [bugprone-narrowing-conversions]
 }
 
 void narrow_conditional_operator_contant_to_unsigned_is_ok(bool b) {
diff --git a/clang-tools-extra/test/clang-tidy/checkers/cppcoreguidelines/narrowing-conversions.cpp b/clang-tools-extra/test/clang-tidy/checkers/bugprone/narrowing-conversions.cpp
index 29b38e7..3987526 100644
--- a/clang-tools-extra/test/clang-tidy/checkers/cppcoreguidelines/narrowing-conversions.cpp
+++ b/clang-tools-extra/test/clang-tidy/checkers/bugprone/narrowing-conversions.cpp
@@ -1,6 +1,6 @@
-// RUN: %check_clang_tidy %s cppcoreguidelines-narrowing-conversions %t \
+// RUN: %check_clang_tidy %s bugprone-narrowing-conversions %t \
 // RUN: -config="{CheckOptions: { \
-// RUN:   cppcoreguidelines-narrowing-conversions.WarnOnFloatingPointNarrowingConversion: false}}" \
+// RUN:   bugprone-narrowing-conversions.WarnOnFloatingPointNarrowingConversion: false}}" \
 // RUN: -- -target x86_64-unknown-linux -fsigned-char
 
 float ceil(float);
@@ -20,27 +20,27 @@ float operator"" _float(unsigned long long);
 void narrow_fp_to_int_not_ok(double d) {
   int i = 0;
   i = d;
-  // CHECK-MESSAGES: :[[@LINE-1]]:7: warning: narrowing conversion from 'double' to 'int' [cppcoreguidelines-narrowing-conversions]
+  // CHECK-MESSAGES: :[[@LINE-1]]:7: warning: narrowing conversion from 'double' to 'int' [bugprone-narrowing-conversions]
   i = 0.5f;
-  // CHECK-MESSAGES: :[[@LINE-1]]:7: warning: narrowing conversion from constant 'float' to 'int' [cppcoreguidelines-narrowing-conversions]
+  // CHECK-MESSAGES: :[[@LINE-1]]:7: warning: narrowing conversion from constant 'float' to 'int' [bugprone-narrowing-conversions]
   i = static_cast<float>(d);
-  // CHECK-MESSAGES: :[[@LINE-1]]:7: warning: narrowing conversion from 'float' to 'int' [cppcoreguidelines-narrowing-conversions]
+  // CHECK-MESSAGES: :[[@LINE-1]]:7: warning: narrowing conversion from 'float' to 'int' [bugprone-narrowing-conversions]
   i = ConvertsToFloat();
-  // CHECK-MESSAGES: :[[@LINE-1]]:7: warning: narrowing conversion from 'float' to 'int' [cppcoreguidelines-narrowing-conversions]
+  // CHECK-MESSAGES: :[[@LINE-1]]:7: warning: narrowing conversion from 'float' to 'int' [bugprone-narrowing-conversions]
   i = 15_float;
-  // CHECK-MESSAGES: :[[@LINE-1]]:7: warning: narrowing conversion from 'float' to 'int' [cppcoreguidelines-narrowing-conversions]
+  // CHECK-MESSAGES: :[[@LINE-1]]:7: warning: narrowing conversion from 'float' to 'int' [bugprone-narrowing-conversions]
   i += d;
-  // CHECK-MESSAGES: :[[@LINE-1]]:8: warning: narrowing conversion from 'double' to 'int' [cppcoreguidelines-narrowing-conversions]
+  // CHECK-MESSAGES: :[[@LINE-1]]:8: warning: narrowing conversion from 'double' to 'int' [bugprone-narrowing-conversions]
   i += 0.5;
-  // CHECK-MESSAGES: :[[@LINE-1]]:8: warning: narrowing conversion from constant 'double' to 'int' [cppcoreguidelines-narrowing-conversions]
+  // CHECK-MESSAGES: :[[@LINE-1]]:8: warning: narrowing conversion from constant 'double' to 'int' [bugprone-narrowing-conversions]
   i += 0.5f;
-  // CHECK-MESSAGES: :[[@LINE-1]]:8: warning: narrowing conversion from constant 'float' to 'int' [cppcoreguidelines-narrowing-conversions]
+  // CHECK-MESSAGES: :[[@LINE-1]]:8: warning: narrowing conversion from constant 'float' to 'int' [bugprone-narrowing-conversions]
   i *= 0.5f;
-  // CHECK-MESSAGES: :[[@LINE-1]]:8: warning: narrowing conversion from constant 'float' to 'int' [cppcoreguidelines-narrowing-conversions]
+  // CHECK-MESSAGES: :[[@LINE-1]]:8: warning: narrowing conversion from constant 'float' to 'int' [bugprone-narrowing-conversions]
   i /= 0.5f;
-  // CHECK-MESSAGES: :[[@LINE-1]]:8: warning: narrowing conversion from constant 'float' to 'int' [cppcoreguidelines-narrowing-conversions]
+  // CHECK-MESSAGES: :[[@LINE-1]]:8: warning: narrowing conversion from constant 'float' to 'int' [bugprone-narrowing-conversions]
   i += (double)0.5f;
-  // CHECK-MESSAGES: :[[@LINE-1]]:8: warning: narrowing conversion from constant 'double' to 'int' [cppcoreguidelines-narrowing-conversions]
+  // CHECK-MESSAGES: :[[@LINE-1]]:8: warning: narrowing conversion from constant 'double' to 'int' [bugprone-narrowing-conversions]
   i += 2.0;
   i += 2.0f;
 }
@@ -84,29 +84,29 @@ void narrow_double_to_float_not_ok_binary_ops(double d) {
 
 void narrow_fp_constant_to_bool_not_ok() {
   bool b1 = 1.0;
-  // CHECK-MESSAGES: :[[@LINE-1]]:13: warning: narrowing conversion from constant 'double' to 'bool' [cppcoreguidelines-narrowing-conversions]
+  // CHECK-MESSAGES: :[[@LINE-1]]:13: warning: narrowing conversion from constant 'double' to 'bool' [bugprone-narrowing-conversions]
   bool b2 = 1.0f;
-  // CHECK-MESSAGES: :[[@LINE-1]]:13: warning: narrowing conversion from constant 'float' to 'bool' [cppcoreguidelines-narrowing-conversions]
+  // CHECK-MESSAGES: :[[@LINE-1]]:13: warning: narrowing conversion from constant 'float' to 'bool' [bugprone-narrowing-conversions]
 }
 
 void narrow_integer_to_floating() {
   {
     long long ll; // 64 bits
     float f = ll; // doesn't fit in 24 bits
-    // CHECK-MESSAGES: :[[@LINE-1]]:15: warning: narrowing conversion from 'long long' to 'float' [cppcoreguidelines-narrowing-conversions]
+    // CHECK-MESSAGES: :[[@LINE-1]]:15: warning: narrowing conversion from 'long long' to 'float' [bugprone-narrowing-conversions]
     double d = ll; // doesn't fit in 53 bits.
-    // CHECK-MESSAGES: :[[@LINE-1]]:16: warning: narrowing conversion from 'long long' to 'double' [cppcoreguidelines-narrowing-conversions]
+    // CHECK-MESSAGES: :[[@LINE-1]]:16: warning: narrowing conversion from 'long long' to 'double' [bugprone-narrowing-conversions]
   }
   {
     int i;       // 32 bits
     float f = i; // doesn't fit in 24 bits
-    // CHECK-MESSAGES: :[[@LINE-1]]:15: warning: narrowing conversion from 'int' to 'float' [cppcoreguidelines-narrowing-conversions]
+    // CHECK-MESSAGES: :[[@LINE-1]]:15: warning: narrowing conversion from 'int' to 'float' [bugprone-narrowing-conversions]
     double d = i; // fits in 53 bits.
   }
   {
     short n1, n2;
     float f = n1 + n2; // 'n1 + n2' is of type 'int' because of integer rules
-    // CHECK-MESSAGES: :[[@LINE-1]]:15: warning: narrowing conversion from 'int' to 'float' [cppcoreguidelines-narrowing-conversions]
+    // CHECK-MESSAGES: :[[@LINE-1]]:15: warning: narrowing conversion from 'int' to 'float' [bugprone-narrowing-conversions]
   }
   {
     short s;      // 16 bits
@@ -156,41 +156,41 @@ void narrow_integer_to_signed_integer_is_not_ok() {
 
   c = c;
   c = s;
-  // CHECK-MESSAGES: :[[@LINE-1]]:7: warning: narrowing conversion from 'short' to signed type 'char' is implementation-defined [cppcoreguidelines-narrowing-conversions]
+  // CHECK-MESSAGES: :[[@LINE-1]]:7: warning: narrowing conversion from 'short' to signed type 'char' is implementation-defined [bugprone-narrowing-conversions]
   c = i;
-  // CHECK-MESSAGES: :[[@LINE-1]]:7: warning: narrowing conversion from 'int' to signed type 'char' is implementation-defined [cppcoreguidelines-narrowing-conversions]
+  // CHECK-MESSAGES: :[[@LINE-1]]:7: warning: narrowing conversion from 'int' to signed type 'char' is implementation-defined [bugprone-narrowing-conversions]
   c = l;
-  // CHECK-MESSAGES: :[[@LINE-1]]:7: warning: narrowing conversion from 'long' to signed type 'char' is implementation-defined [cppcoreguidelines-narrowing-conversions]
+  // CHECK-MESSAGES: :[[@LINE-1]]:7: warning: narrowing conversion from 'long' to signed type 'char' is implementation-defined [bugprone-narrowing-conversions]
   c = ll;
-  // CHECK-MESSAGES: :[[@LINE-1]]:7: warning: narrowing conversion from 'long long' to signed type 'char' is implementation-defined [cppcoreguidelines-narrowing-conversions]
+  // CHECK-MESSAGES: :[[@LINE-1]]:7: warning: narrowing conversion from 'long long' to signed type 'char' is implementation-defined [bugprone-narrowing-conversions]
 
   c = uc;
-  // CHECK-MESSAGES: :[[@LINE-1]]:7: warning: narrowing conversion from 'unsigned char' to signed type 'char' is implementation-defined [cppcoreguidelines-narrowing-conversions]
+  // CHECK-MESSAGES: :[[@LINE-1]]:7: warning: narrowing conversion from 'unsigned char' to signed type 'char' is implementation-defined [bugprone-narrowing-conversions]
   c = us;
-  // CHECK-MESSAGES: :[[@LINE-1]]:7: warning: narrowing conversion from 'unsigned short' to signed type 'char' is implementation-defined [cppcoreguidelines-narrowing-conversions]
+  // CHECK-MESSAGES: :[[@LINE-1]]:7: warning: narrowing conversion from 'unsigned short' to signed type 'char' is implementation-defined [bugprone-narrowing-conversions]
   c = ui;
-  // CHECK-MESSAGES: :[[@LINE-1]]:7: warning: narrowing conversion from 'unsigned int' to signed type 'char' is implementation-defined [cppcoreguidelines-narrowing-conversions]
+  // CHECK-MESSAGES: :[[@LINE-1]]:7: warning: narrowing conversion from 'unsigned int' to signed type 'char' is implementation-defined [bugprone-narrowing-conversions]
   c = ul;
-  // CHECK-MESSAGES: :[[@LINE-1]]:7: warning: narrowing conversion from 'unsigned long' to signed type 'char' is implementation-defined [cppcoreguidelines-narrowing-conversions]
+  // CHECK-MESSAGES: :[[@LINE-1]]:7: warning: narrowing conversion from 'unsigned long' to signed type 'char' is implementation-defined [bugprone-narrowing-conversions]
   c = ull;
-  // CHECK-MESSAGES: :[[@LINE-1]]:7: warning: narrowing conversion from 'unsigned long long' to signed type 'char' is implementation-defined [cppcoreguidelines-narrowing-conversions]
+  // CHECK-MESSAGES: :[[@LINE-1]]:7: warning: narrowing conversion from 'unsigned long long' to signed type 'char' is implementation-defined [bugprone-narrowing-conversions]
 
   i = c;
   i = s;
   i = i;
   i = l;
-  // CHECK-MESSAGES: :[[@LINE-1]]:7: warning: narrowing conversion from 'long' to signed type 'int' is implementation-defined [cppcoreguidelines-narrowing-conversions]
+  // CHECK-MESSAGES: :[[@LINE-1]]:7: warning: narrowing conversion from 'long' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
   i = ll;
-  // CHECK-MESSAGES: :[[@LINE-1]]:7: warning: narrowing conversion from 'long long' to signed type 'int' is implementation-defined [cppcoreguidelines-narrowing-conversions]
+  // CHECK-MESSAGES: :[[@LINE-1]]:7: warning: narrowing conversion from 'long long' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
 
   i = uc;
   i = us;
   i = ui;
-  // CHECK-MESSAGES: :[[@LINE-1]]:7: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [cppcoreguidelines-narrowing-conversions]
+  // CHECK-MESSAGES: :[[@LINE-1]]:7: warning: narrowing conversion from 'unsigned int' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
   i = ul;
-  // CHECK-MESSAGES: :[[@LINE-1]]:7: warning: narrowing conversion from 'unsigned long' to signed type 'int' is implementation-defined [cppcoreguidelines-narrowing-conversions]
+  // CHECK-MESSAGES: :[[@LINE-1]]:7: warning: narrowing conversion from 'unsigned long' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
   i = ull;
-  // CHECK-MESSAGES: :[[@LINE-1]]:7: warning: narrowing conversion from 'unsigned long long' to signed type 'int' is implementation-defined [cppcoreguidelines-narrowing-conversions]
+  // CHECK-MESSAGES: :[[@LINE-1]]:7: warning: narrowing conversion from 'unsigned long long' to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
 
   ll = c;
   ll = s;
@@ -202,9 +202,9 @@ void narrow_integer_to_signed_integer_is_not_ok() {
   ll = us;
   ll = ui;
   ll = ul;
-  // CHECK-MESSAGES: :[[@LINE-1]]:8: warning: narrowing conversion from 'unsigned long' to signed type 'long long' is implementation-defined [cppcoreguidelines-narrowing-conversions]
+  // CHECK-MESSAGES: :[[@LINE-1]]:8: warning: narrowing conversion from 'unsigned long' to signed type 'long long' is implementation-defined [bugprone-narrowing-conversions]
   ll = ull;
-  // CHECK-MESSAGES: :[[@LINE-1]]:8: warning: narrowing conversion from 'unsigned long long' to signed type 'long long' is implementation-defined [cppcoreguidelines-narrowing-conversions]
+  // CHECK-MESSAGES: :[[@LINE-1]]:8: warning: narrowing conversion from 'unsigned long long' to signed type 'long long' is implementation-defined [bugprone-narrowing-conversions]
 }
 
 void narrow_constant_to_unsigned_integer_is_ok() {
@@ -222,16 +222,16 @@ void narrow_constant_to_signed_integer_is_not_ok() {
   char c1 = -128;
   char c2 = 127;
   char c3 = -129;
-  // CHECK-MESSAGES: :[[@LINE-1]]:13: warning: narrowing conversion from constant value -129 (0xFFFFFF7F) of type 'int' to signed type 'char' is implementation-defined [cppcoreguidelines-narrowing-conversions]
+  // CHECK-MESSAGES: :[[@LINE-1]]:13: warning: narrowing conversion from constant value -129 (0xFFFFFF7F) of type 'int' to signed type 'char' is implementation-defined [bugprone-narrowing-conversions]
   char c4 = 128;
-  // CHECK-MESSAGES: :[[@LINE-1]]:13: warning: narrowing conversion from constant value 128 (0x00000080) of type 'int' to signed type 'char' is implementation-defined [cppcoreguidelines-narrowing-conversions]
+  // CHECK-MESSAGES: :[[@LINE-1]]:13: warning: narrowing conversion from constant value 128 (0x00000080) of type 'int' to signed type 'char' is implementation-defined [bugprone-narrowing-conversions]
 
   short s1 = -32768;
   short s2 = 32767;
   short s3 = -32769;
-  // CHECK-MESSAGES: :[[@LINE-1]]:14: warning: narrowing conversion from constant value -32769 (0xFFFF7FFF) of type 'int' to signed type 'short' is implementation-defined [cppcoreguidelines-narrowing-conversions]
+  // CHECK-MESSAGES: :[[@LINE-1]]:14: warning: narrowing conversion from constant value -32769 (0xFFFF7FFF) of type 'int' to signed type 'short' is implementation-defined [bugprone-narrowing-conversions]
   short s4 = 32768;
-  // CHECK-MESSAGES: :[[@LINE-1]]:14: warning: narrowing conversion from constant value 32768 (0x00008000) of type 'int' to signed type 'short' is implementation-defined [cppcoreguidelines-narrowing-conversions]
+  // CHECK-MESSAGES: :[[@LINE-1]]:14: warning: narrowing conversion from constant value 32768 (0x00008000) of type 'int' to signed type 'short' is implementation-defined [bugprone-narrowing-conversions]
 }
 
 void narrow_conditional_operator_contant_to_unsigned_is_ok(bool b) {
@@ -244,22 +244,22 @@ void narrow_conditional_operator_contant_to_unsigned_is_ok(bool b) {
 void narrow_conditional_operator_contant_to_signed_is_not_ok(bool b) {
   char uc1 = b ? 1 : 0;
   char uc2 = b ? 1 : 128;
-  // CHECK-MESSAGES: :[[@LINE-1]]:22: warning: narrowing conversion from constant value 128 (0x00000080) of type 'int' to signed type 'char' is implementation-defined [cppcoreguidelines-narrowing-conversions]
+  // CHECK-MESSAGES: :[[@LINE-1]]:22: warning: narrowing conversion from constant value 128 (0x00000080) of type 'int' to signed type 'char' is implementation-defined [bugprone-narrowing-conversions]
   char uc3 = b ? -129 : 0;
-  // CHECK-MESSAGES: :[[@LINE-1]]:18: warning: narrowing conversion from constant value -129 (0xFFFFFF7F) of type 'int' to signed type 'char' is implementation-defined [cppcoreguidelines-narrowing-conversions]
+  // CHECK-MESSAGES: :[[@LINE-1]]:18: warning: narrowing conversion from constant value -129 (0xFFFFFF7F) of type 'int' to signed type 'char' is implementation-defined [bugprone-narrowing-conversions]
   unsigned long long ysize;
   long long mirror = b ? -1 : ysize - 1;
-  // CHECK-MESSAGES: :[[@LINE-1]]:26: warning: narrowing conversion from constant value 18446744073709551615 (0xFFFFFFFFFFFFFFFF) of type 'unsigned long long' to signed type 'long long' is implementation-defined [cppcoreguidelines-narrowing-conversions]
-  // CHECK-MESSAGES: :[[@LINE-2]]:37: warning: narrowing conversion from 'unsigned long long' to signed type 'long long' is implementation-defined [cppcoreguidelines-narrowing-conversions]
+  // CHECK-MESSAGES: :[[@LINE-1]]:26: warning: narrowing conversion from constant value 18446744073709551615 (0xFFFFFFFFFFFFFFFF) of type 'unsigned long long' to signed type 'long long' is implementation-defined [bugprone-narrowing-conversions]
+  // CHECK-MESSAGES: :[[@LINE-2]]:37: warning: narrowing conversion from 'unsigned long long' to signed type 'long long' is implementation-defined [bugprone-narrowing-conversions]
 }
 
 void narrow_constant_to_floating_point() {
   float f_ok = 1ULL << 24;              // fits in 24 bits mantissa.
   float f_not_ok = (1ULL << 24) + 1ULL; // doesn't fit in 24 bits mantissa.
-  // CHECK-MESSAGES: :[[@LINE-1]]:20: warning: narrowing conversion from constant value 16777217 of type 'unsigned long long' to 'float' [cppcoreguidelines-narrowing-conversions]
+  // CHECK-MESSAGES: :[[@LINE-1]]:20: warning: narrowing conversion from constant value 16777217 of type 'unsigned long long' to 'float' [bugprone-narrowing-conversions]
   double d_ok = 1ULL << 53;              // fits in 53 bits mantissa.
   double d_not_ok = (1ULL << 53) + 1ULL; // doesn't fit in 53 bits mantissa.
-  // CHECK-MESSAGES: :[[@LINE-1]]:21: warning: narrowing conversion from constant value 9007199254740993 of type 'unsigned long long' to 'double' [cppcoreguidelines-narrowing-conversions]
+  // CHECK-MESSAGES: :[[@LINE-1]]:21: warning: narrowing conversion from constant value 9007199254740993 of type 'unsigned long long' to 'double' [bugprone-narrowing-conversions]
 }
 
 void casting_integer_to_bool_is_ok() {
@@ -275,13 +275,13 @@ void casting_integer_to_bool_is_ok() {
 void casting_float_to_bool_is_not_ok() {
   float f;
   while (f) {
-    // CHECK-MESSAGES: :[[@LINE-1]]:10: warning: narrowing conversion from 'float' to 'bool' [cppcoreguidelines-narrowing-conversions]
+    // CHECK-MESSAGES: :[[@LINE-1]]:10: warning: narrowing conversion from 'float' to 'bool' [bugprone-narrowing-conversions]
   }
   for (; f;) {
-    // CHECK-MESSAGES: :[[@LINE-1]]:10: warning: narrowing conversion from 'float' to 'bool' [cppcoreguidelines-narrowing-conversions]
+    // CHECK-MESSAGES: :[[@LINE-1]]:10: warning: narrowing conversion from 'float' to 'bool' [bugprone-narrowing-conversions]
   }
   if (f) {
-    // CHECK-MESSAGES: :[[@LINE-1]]:7: warning: narrowing conversion from 'float' to 'bool' [cppcoreguidelines-narrowing-conversions]
+    // CHECK-MESSAGES: :[[@LINE-1]]:7: warning: narrowing conversion from 'float' to 'bool' [bugprone-narrowing-conversions]
   }
 }
 
@@ -352,7 +352,7 @@ void typedef_context() {
   i64 = i;   // Okay, no narrowing.
 
   i = i64;
-  // CHECK-MESSAGES: :[[@LINE-1]]:7: warning: narrowing conversion from 'myint64_t' (aka 'long long') to signed type 'int' is implementation-defined [cppcoreguidelines-narrowing-conversions]
+  // CHECK-MESSAGES: :[[@LINE-1]]:7: warning: narrowing conversion from 'myint64_t' (aka 'long long') to signed type 'int' is implementation-defined [bugprone-narrowing-conversions]
 }
 
 } // namespace floats
diff --git a/clang-tools-extra/test/clang-tidy/checkers/bugprone/unhandled-self-assignment.cpp b/clang-tools-extra/test/clang-tidy/checkers/bugprone/unhandled-self-assignment.cpp
index 14d2785..8610393 100644
--- a/clang-tools-extra/test/clang-tidy/checkers/bugprone/unhandled-self-assignment.cpp
+++ b/clang-tools-extra/test/clang-tidy/checkers/bugprone/unhandled-self-assignment.cpp
@@ -10,7 +10,9 @@ template <class T>
 T &&move(T &x) {
 }
 
-template <class T>
+template <typename T> class default_delete {};
+
+template <class T, typename Deleter = std::default_delete<T>>
 class unique_ptr {
 };
 
diff --git a/clang-tools-extra/test/clang-tidy/checkers/bugprone/unused-local-non-trivial-variable-name-independence.cpp b/clang-tools-extra/test/clang-tidy/checkers/bugprone/unused-local-non-trivial-variable-name-independence.cpp
new file mode 100644
index 0000000..bcc8b810
--- /dev/null
+++ b/clang-tools-extra/test/clang-tidy/checkers/bugprone/unused-local-non-trivial-variable-name-independence.cpp
@@ -0,0 +1,21 @@
+// RUN: %check_clang_tidy -std=c++23 -check-suffixes=,CXX23 %s bugprone-unused-local-non-trivial-variable %t -- \
+// RUN:       -config="{CheckOptions: {bugprone-unused-local-non-trivial-variable.IncludeTypes: '::async::Foo'}}" \
+// RUN:       --
+// RUN: %check_clang_tidy -std=c++26 %s bugprone-unused-local-non-trivial-variable %t -- \
+// RUN:       -config="{CheckOptions: {bugprone-unused-local-non-trivial-variable.IncludeTypes: '::async::Foo'}}" \
+// RUN:       --
+
+namespace async {
+class Foo {
+  public:
+    ~Foo();
+  private:
+};
+} // namespace async
+
+void check() {
+  async::Foo C;
+  // CHECK-MESSAGES: :[[@LINE-1]]:14: warning: unused local variable 'C' of type 'async::Foo' [bugprone-unused-local-non-trivial-variable]
+  async::Foo _;
+  // CHECK-MESSAGES-CXX23: :[[@LINE-1]]:14: warning: unused local variable '_' of type 'async::Foo' [bugprone-unused-local-non-trivial-variable]
+}
diff --git a/clang-tools-extra/test/clang-tidy/checkers/cppcoreguidelines/narrowing-conversions-narrowingintegertofloatingpoint-option.cpp b/clang-tools-extra/test/clang-tidy/checkers/cppcoreguidelines/narrowing-conversions-narrowingintegertofloatingpoint-option.cpp
deleted file mode 100644
index 35ca61b..0000000
--- a/clang-tools-extra/test/clang-tidy/checkers/cppcoreguidelines/narrowing-conversions-narrowingintegertofloatingpoint-option.cpp
+++ /dev/null
@@ -1,19 +0,0 @@
-// RUN: %check_clang_tidy -check-suffix=DEFAULT %s \
-// RUN: cppcoreguidelines-narrowing-conversions %t -- \
-// RUN: -config='{CheckOptions: {cppcoreguidelines-narrowing-conversions.WarnOnIntegerToFloatingPointNarrowingConversion: true}}'
-
-// RUN: %check_clang_tidy -check-suffix=DISABLED %s \
-// RUN: cppcoreguidelines-narrowing-conversions %t -- \
-// RUN: -config='{CheckOptions: {cppcoreguidelines-narrowing-conversions.WarnOnIntegerToFloatingPointNarrowingConversion: false}}'
-
-void foo(unsigned long long value) {
-  double a = value;
-  // CHECK-MESSAGES-DEFAULT: :[[@LINE-1]]:14: warning: narrowing conversion from 'unsigned long long' to 'double' [cppcoreguidelines-narrowing-conversions]
-  // DISABLED: No warning for integer to floating-point narrowing conversions when WarnOnIntegerToFloatingPointNarrowingConversion = false.
-}
-
-void floating_point_to_integer_is_still_not_ok(double f) {
-  int a = f;
-  // CHECK-MESSAGES-DEFAULT: :[[@LINE-1]]:11: warning: narrowing conversion from 'double' to 'int' [cppcoreguidelines-narrowing-conversions]
-  // CHECK-MESSAGES-DISABLED: :[[@LINE-2]]:11: warning: narrowing conversion from 'double' to 'int' [cppcoreguidelines-narrowing-conversions]
-}
diff --git a/clang-tools-extra/test/clang-tidy/checkers/bugprone/alpha-core-identicalexpr.cpp b/clang-tools-extra/test/clang-tidy/checkers/misc/redundant-expression-2.cpp
index 8eff3eb..8dcef30 100644
--- a/clang-tools-extra/test/clang-tidy/checkers/bugprone/alpha-core-identicalexpr.cpp
+++ b/clang-tools-extra/test/clang-tidy/checkers/misc/redundant-expression-2.cpp
@@ -1,5 +1,4 @@
-// RUN: clang-tidy %s -checks="-*,misc-redundant-expression" -- 2>&1 | FileCheck %s --check-prefix=CHECK-MESSAGES-IDENTEXPR
-// RUN: clang-tidy %s -checks="-*,bugprone-branch-clone" -- 2>&1 | FileCheck %s --check-prefix=CHECK-MESSAGES-BUGPRONEBRANCH
+// RUN: %check_clang_tidy %s misc-redundant-expression -check-suffix=IDENTEXPR %t
 
 /* Only one expected warning per function allowed at the very end. */
 
@@ -77,6 +76,7 @@ int checkNotEqualBinaryOpFloatCompare1(void) {
   int res;
   float f= 3.14F;
   res = (f + 3.14F != f + 3.14F);  // no warning
+// CHECK-MESSAGES-IDENTEXPR: :[[@LINE-1]]:20: warning: both sides of operator are equivalent [misc-redundant-expression]
   return (0);
 }
 int checkNotEqualBinaryOpFloatCompare2(void) {
@@ -88,6 +88,7 @@ int checkNotEqualBinaryOpFloatCompare3(void) {
   int res;
   float f= 3.14F;
   res = ((int)f + 3.14F != (int)f + 3.14F);  // no warning
+// CHECK-MESSAGES-IDENTEXPR: :[[@LINE-1]]:25: warning: both sides of operator are equivalent [misc-redundant-expression]
   return (0);
 }
 int checkNotEqualBinaryOpFloatCompare4(void) {
@@ -103,6 +104,7 @@ int checkNotEqualNestedBinaryOpFloatCompare1(void) {
   int u= 2;
   float f= 3.14F;
   res = (((int)f + (3.14F - u)*t) != ((int)f + (3.14F - u)*t));  // no warning
+// CHECK-MESSAGES-IDENTEXPR: :[[@LINE-1]]:35: warning: both sides of operator are equivalent [misc-redundant-expression]
   return (0);
 }
 
@@ -121,12 +123,11 @@ int checkNotEqualNestedBinaryOpFloatCompare3(void) {
   int u= 2;
   float f= 3.14F;
   res = (((int)f + (u - 3.14F)*t) != ((int)f + (3.14F - u)*(f + t != f + t)));  // no warning
+// CHECK-MESSAGES-IDENTEXPR: :[[@LINE-1]]:67: warning: both sides of operator are equivalent [misc-redundant-expression]
   return (0);
 }
 
 
-
-
 /* end '!=' with float*/
 
 /* '!=' with int*/
@@ -238,8 +239,6 @@ int checkNotEqualNestedBinaryOpIntCompare3(void) {
 
 /*   end '!=' int          */
 
-
-
 /* '!=' with int pointer */
 
 int checkNotEqualIntPointerLiteralCompare1(void) {
@@ -329,6 +328,7 @@ int checkNotEqualSameFunction() {
   unsigned a = 0;
   unsigned b = 1;
   int res = (a+func() != a+func());  // no warning
+// CHECK-MESSAGES-IDENTEXPR: :[[@LINE-1]]:23: warning: both sides of operator are equivalent [misc-redundant-expression]
   return (0);
 }
 
@@ -343,6 +343,7 @@ int checkNotEqualSameFunctionSameParam() {
   unsigned a = 0;
   unsigned b = 1;
   int res = (a+funcParam(a) != a+funcParam(a));  // no warning
+// CHECK-MESSAGES-IDENTEXPR: :[[@LINE-1]]:29: warning: both sides of operator are equivalent [misc-redundant-expression]
   return (0);
 }
 
@@ -434,7 +435,8 @@ int checkEqualCastFloatDeclCompare12(void) {
 int checkEqualBinaryOpFloatCompare1(void) {
   int res;
   float f= 3.14F;
-  res = (f + 3.14F == f + 3.14F);  // no warning
+  res = (f + 3.14F == f + 3.14F);
+// CHECK-MESSAGES-IDENTEXPR: :[[@LINE-1]]:20: warning: both sides of operator are equivalent [misc-redundant-expression]
   return (0);
 }
 int checkEqualBinaryOpFloatCompare2(void) {
@@ -445,7 +447,8 @@ int checkEqualBinaryOpFloatCompare2(void) {
 int checkEqualBinaryOpFloatCompare3(void) {
   int res;
   float f= 3.14F;
-  res = ((int)f + 3.14F == (int)f + 3.14F);  // no warning
+  res = ((int)f + 3.14F == (int)f + 3.14F);
+// CHECK-MESSAGES-IDENTEXPR: :[[@LINE-1]]:25: warning: both sides of operator are equivalent [misc-redundant-expression]
   return (0);
 }
 int checkEqualBinaryOpFloatCompare4(void) {
@@ -460,7 +463,8 @@ int checkEqualNestedBinaryOpFloatCompare1(void) {
   int t= 1;
   int u= 2;
   float f= 3.14F;
-  res = (((int)f + (3.14F - u)*t) == ((int)f + (3.14F - u)*t));  // no warning
+  res = (((int)f + (3.14F - u)*t) == ((int)f + (3.14F - u)*t));
+// CHECK-MESSAGES-IDENTEXPR: :[[@LINE-1]]:35: warning: both sides of operator are equivalent [misc-redundant-expression]
   return (0);
 }
 
@@ -478,14 +482,11 @@ int checkEqualNestedBinaryOpFloatCompare3(void) {
   int t= 1;
   int u= 2;
   float f= 3.14F;
-  res = (((int)f + (u - 3.14F)*t) == ((int)f + (3.14F - u)*(f + t == f + t)));  // no warning
+  res = (((int)f + (u - 3.14F)*t) == ((int)f + (3.14F - u)*(f + t == f + t)));
+// CHECK-MESSAGES-IDENTEXPR: :[[@LINE-1]]:67: warning: both sides of operator are equivalent [misc-redundant-expression]
   return (0);
 }
 
-
-
-
-
 /* Equal with int*/
 
 int checkEqualIntLiteralCompare1(void) {
@@ -600,7 +601,8 @@ int checkEqualNestedBinaryOpIntCompare3(void) {
 int checkEqualSameFunction() {
   unsigned a = 0;
   unsigned b = 1;
-  int res = (a+func() == a+func());  // no warning
+  int res = (a+func() == a+func());
+// CHECK-MESSAGES-IDENTEXPR: :[[@LINE-1]]:23: warning: both sides of operator are equivalent [misc-redundant-expression]
   return (0);
 }
 
@@ -614,7 +616,8 @@ int checkEqualDifferentFunction() {
 int checkEqualSameFunctionSameParam() {
   unsigned a = 0;
   unsigned b = 1;
-  int res = (a+funcParam(a) == a+funcParam(a));  // no warning
+  int res = (a+funcParam(a) == a+funcParam(a));
+// CHECK-MESSAGES-IDENTEXPR: :[[@LINE-1]]:29: warning: both sides of operator are equivalent [misc-redundant-expression]
   return (0);
 }
 
@@ -692,7 +695,8 @@ int checkLessThanCastFloatDeclCompare12(void) {
 int checkLessThanBinaryOpFloatCompare1(void) {
   int res;
   float f= 3.14F;
-  res = (f + 3.14F < f + 3.14F);  // no warning
+  res = (f + 3.14F < f + 3.14F);
+// CHECK-MESSAGES-IDENTEXPR: :[[@LINE-1]]:20: warning: both sides of operator are equivalent [misc-redundant-expression]
   return (0);
 }
 int checkLessThanBinaryOpFloatCompare2(void) {
@@ -703,7 +707,8 @@ int checkLessThanBinaryOpFloatCompare2(void) {
 int checkLessThanBinaryOpFloatCompare3(void) {
   int res;
   float f= 3.14F;
-  res = ((int)f + 3.14F < (int)f + 3.14F);  // no warning
+  res = ((int)f + 3.14F < (int)f + 3.14F);
+// CHECK-MESSAGES-IDENTEXPR: :[[@LINE-1]]:25: warning: both sides of operator are equivalent [misc-redundant-expression]
   return (0);
 }
 int checkLessThanBinaryOpFloatCompare4(void) {
@@ -718,7 +723,8 @@ int checkLessThanNestedBinaryOpFloatCompare1(void) {
   int t= 1;
   int u= 2;
   float f= 3.14F;
-  res = (((int)f + (3.14F - u)*t) < ((int)f + (3.14F - u)*t));  // no warning
+  res = (((int)f + (3.14F - u)*t) < ((int)f + (3.14F - u)*t));
+// CHECK-MESSAGES-IDENTEXPR: :[[@LINE-1]]:35: warning: both sides of operator are equivalent [misc-redundant-expression]
   return (0);
 }
 
@@ -736,7 +742,8 @@ int checkLessThanNestedBinaryOpFloatCompare3(void) {
   int t= 1;
   int u= 2;
   float f= 3.14F;
-  res = (((int)f + (u - 3.14F)*t) < ((int)f + (3.14F - u)*(f + t < f + t)));  // no warning
+  res = (((int)f + (u - 3.14F)*t) < ((int)f + (3.14F - u)*(f + t < f + t)));
+// CHECK-MESSAGES-IDENTEXPR: :[[@LINE-1]]:66: warning: both sides of operator are equivalent [misc-redundant-expression]
   return (0);
 }
 
@@ -912,7 +919,8 @@ int checkGreaterThanCastFloatDeclCompare12(void) {
 int checkGreaterThanBinaryOpFloatCompare1(void) {
   int res;
   float f= 3.14F;
-  res = (f + 3.14F > f + 3.14F);  // no warning
+  res = (f + 3.14F > f + 3.14F);
+// CHECK-MESSAGES-IDENTEXPR: :[[@LINE-1]]:20: warning: both sides of operator are equivalent [misc-redundant-expression]
   return (0);
 }
 int checkGreaterThanBinaryOpFloatCompare2(void) {
@@ -923,7 +931,8 @@ int checkGreaterThanBinaryOpFloatCompare2(void) {
 int checkGreaterThanBinaryOpFloatCompare3(void) {
   int res;
   float f= 3.14F;
-  res = ((int)f + 3.14F > (int)f + 3.14F);  // no warning
+  res = ((int)f + 3.14F > (int)f + 3.14F);
+// CHECK-MESSAGES-IDENTEXPR: :[[@LINE-1]]:25: warning: both sides of operator are equivalent [misc-redundant-expression]
   return (0);
 }
 int checkGreaterThanBinaryOpFloatCompare4(void) {
@@ -938,7 +947,8 @@ int checkGreaterThanNestedBinaryOpFloatCompare1(void) {
   int t= 1;
   int u= 2;
   float f= 3.14F;
-  res = (((int)f + (3.14F - u)*t) > ((int)f + (3.14F - u)*t));  // no warning
+  res = (((int)f + (3.14F - u)*t) > ((int)f + (3.14F - u)*t));
+// CHECK-MESSAGES-IDENTEXPR: :[[@LINE-1]]:35: warning: both sides of operator are equivalent [misc-redundant-expression]
   return (0);
 }
 
@@ -957,6 +967,7 @@ int checkGreaterThanNestedBinaryOpFloatCompare3(void) {
   int u= 2;
   float f= 3.14F;
   res = (((int)f + (u - 3.14F)*t) > ((int)f + (3.14F - u)*(f + t > f + t)));  // no warning
+// CHECK-MESSAGES-IDENTEXPR: :[[@LINE-1]]:66: warning: both sides of operator are equivalent [misc-redundant-expression]
   return (0);
 }
 
@@ -1066,7 +1077,6 @@ unsigned test_unsigned(unsigned a) {
   unsigned b = 1;
   a = a > 5 ? b : b;
 // CHECK-MESSAGES-IDENTEXPR: :[[@LINE-1]]:17: warning: 'true' and 'false' expressions are equivalent [misc-redundant-expression]
-// CHECK-MESSAGES-BUGPRONEBRANCH: :[[@LINE-2]]:13: warning: conditional operator with identical true and false expressions [bugprone-branch-clone]
   return a;
 }
 
@@ -1074,13 +1084,11 @@ void test_signed() {
   int a = 0;
   a = a > 5 ? a : a;
 // CHECK-MESSAGES-IDENTEXPR: :[[@LINE-1]]:17: warning: 'true' and 'false' expressions are equivalent [misc-redundant-expression]
-// CHECK-MESSAGES-BUGPRONEBRANCH: :[[@LINE-2]]:13: warning: conditional operator with identical true and false expressions [bugprone-branch-clone]
 }
 
 void test_bool(bool a) {
   a = a > 0 ? a : a;
 // CHECK-MESSAGES-IDENTEXPR: :[[@LINE-1]]:17: warning: 'true' and 'false' expressions are equivalent [misc-redundant-expression]
-// CHECK-MESSAGES-BUGPRONEBRANCH: :[[@LINE-2]]:13: warning: conditional operator with identical true and false expressions [bugprone-branch-clone]
 }
 
 void test_float() {
@@ -1088,14 +1096,12 @@ void test_float() {
   float b = 0;
   a = a > 5 ? a : a;
 // CHECK-MESSAGES-IDENTEXPR: :[[@LINE-1]]:17: warning: 'true' and 'false' expressions are equivalent [misc-redundant-expression]
-// CHECK-MESSAGES-BUGPRONEBRANCH: :[[@LINE-2]]:13: warning: conditional operator with identical true and false expressions [bugprone-branch-clone]
 }
 
 const char *test_string() {
   float a = 0;
   return a > 5 ? "abc" : "abc";
 // CHECK-MESSAGES-IDENTEXPR: :[[@LINE-1]]:24: warning: 'true' and 'false' expressions are equivalent [misc-redundant-expression]
-// CHECK-MESSAGES-BUGPRONEBRANCH: :[[@LINE-2]]:16: warning: conditional operator with identical true and false expressions [bugprone-branch-clone]
 }
 
 void test_unsigned_expr() {
@@ -1103,7 +1109,6 @@ void test_unsigned_expr() {
   unsigned b = 0;
   a = a > 5 ? a+b : a+b;
 // CHECK-MESSAGES-IDENTEXPR: :[[@LINE-1]]:19: warning: 'true' and 'false' expressions are equivalent [misc-redundant-expression]
-// CHECK-MESSAGES-BUGPRONEBRANCH: :[[@LINE-2]]:13: warning: conditional operator with identical true and false expressions [bugprone-branch-clone]
 }
 
 void test_signed_expr() {
@@ -1111,14 +1116,12 @@ void test_signed_expr() {
   int b = 1;
   a = a > 5 ? a+b : a+b;
 // CHECK-MESSAGES-IDENTEXPR: :[[@LINE-1]]:19: warning: 'true' and 'false' expressions are equivalent [misc-redundant-expression]
-// CHECK-MESSAGES-BUGPRONEBRANCH: :[[@LINE-2]]:13: warning: conditional operator with identical true and false expressions [bugprone-branch-clone]
 }
 
 void test_bool_expr(bool a) {
   bool b = 0;
   a = a > 0 ? a&&b : a&&b;
 // CHECK-MESSAGES-IDENTEXPR: :[[@LINE-1]]:20: warning: 'true' and 'false' expressions are equivalent [misc-redundant-expression]
-// CHECK-MESSAGES-BUGPRONEBRANCH: :[[@LINE-2]]:13: warning: conditional operator with identical true and false expressions [bugprone-branch-clone]
 }
 
 void test_unsigned_expr_negative() {
@@ -1143,7 +1146,6 @@ void test_float_expr_positive() {
   float b = 0;
   a = a > 5 ? a+b : a+b;
 // CHECK-MESSAGES-IDENTEXPR: :[[@LINE-1]]:19: warning: 'true' and 'false' expressions are equivalent [misc-redundant-expression]
-// CHECK-MESSAGES-BUGPRONEBRANCH: :[[@LINE-2]]:13: warning: conditional operator with identical true and false expressions [bugprone-branch-clone]
 }
 
 void test_expr_positive_func() {
@@ -1151,7 +1153,6 @@ void test_expr_positive_func() {
   unsigned b = 1;
   a = a > 5 ? a+func() : a+func();
 // CHECK-MESSAGES-IDENTEXPR: :[[@LINE-1]]:24: warning: 'true' and 'false' expressions are equivalent [misc-redundant-expression]
-// CHECK-MESSAGES-BUGPRONEBRANCH: :[[@LINE-2]]:13: warning: conditional operator with identical true and false expressions [bugprone-branch-clone]
 }
 
 void test_expr_negative_func() {
@@ -1165,7 +1166,6 @@ void test_expr_positive_funcParam() {
   unsigned b = 1;
   a = a > 5 ? a+funcParam(b) : a+funcParam(b);
 // CHECK-MESSAGES-IDENTEXPR: :[[@LINE-1]]:30: warning: 'true' and 'false' expressions are equivalent [misc-redundant-expression]
-// CHECK-MESSAGES-BUGPRONEBRANCH: :[[@LINE-2]]:13: warning: conditional operator with identical true and false expressions [bugprone-branch-clone]
 }
 
 void test_expr_negative_funcParam() {
@@ -1174,26 +1174,12 @@ void test_expr_negative_funcParam() {
   a = a > 5 ? a+funcParam(a) : a+funcParam(b); // no warning
 }
 
-void test_expr_positive_inc() {
-  unsigned a = 0;
-  unsigned b = 1;
-  a = a > 5 ? a++ : a++;
-// CHECK-MESSAGES-BUGPRONEBRANCH: :[[@LINE-1]]:13: warning: conditional operator with identical true and false expressions [bugprone-branch-clone]
-}
-
 void test_expr_negative_inc() {
   unsigned a = 0;
   unsigned b = 1;
   a = a > 5 ? a++ : b++; // no warning
 }
 
-void test_expr_positive_assign() {
-  unsigned a = 0;
-  unsigned b = 1;
-  a = a > 5 ? a=1 : a=1;
-// CHECK-MESSAGES-BUGPRONEBRANCH: :[[@LINE-1]]:13: warning: conditional operator with identical true and false expressions [bugprone-branch-clone]
-}
-
 void test_expr_negative_assign() {
   unsigned a = 0;
   unsigned b = 1;
@@ -1206,7 +1192,6 @@ void test_signed_nested_expr() {
   int c = 3;
   a = a > 5 ? a+b+(c+a)*(a + b*(c+a)) : a+b+(c+a)*(a + b*(c+a));
 // CHECK-MESSAGES-IDENTEXPR: :[[@LINE-1]]:39: warning: 'true' and 'false' expressions are equivalent [misc-redundant-expression]
-// CHECK-MESSAGES-BUGPRONEBRANCH: :[[@LINE-2]]:13: warning: conditional operator with identical true and false expressions [bugprone-branch-clone]
 }
 
 void test_signed_nested_expr_negative() {
@@ -1229,190 +1214,6 @@ void test_signed_nested_cond_expr() {
   int c = 3;
   a = a > 5 ? (b > 5 ? 1 : 4) : (b > 5 ? 4 : 4);
 // CHECK-MESSAGES-IDENTEXPR: :[[@LINE-1]]:44: warning: 'true' and 'false' expressions are equivalent [misc-redundant-expression]
-// CHECK-MESSAGES-BUGPRONEBRANCH: :[[@LINE-2]]:40: warning: conditional operator with identical true and false expressions [bugprone-branch-clone]
-}
-
-void test_identical_branches1(bool b) {
-  int i = 0;
-  if (b) {
-// CHECK-MESSAGES-BUGPRONEBRANCH: :[[@LINE-1]]:3: warning: if with identical then and else branches [bugprone-branch-clone]
-    ++i;
-  } else {
-// CHECK-MESSAGES-BUGPRONEBRANCH: :[[@LINE-1]]:5: note: else branch starts here
-    ++i;
-  }
-}
-
-void test_identical_branches2(bool b) {
-  int i = 0;
-  if (b) {
-// CHECK-MESSAGES-BUGPRONEBRANCH: :[[@LINE-1]]:3: warning: if with identical then and else branches [bugprone-branch-clone]
-    ++i;
-  } else
-// CHECK-MESSAGES-BUGPRONEBRANCH: :[[@LINE-1]]:5: note: else branch starts here
-    ++i;
-}
-
-void test_identical_branches3(bool b) {
-  int i = 0;
-  if (b) { // no warning
-    ++i;
-  } else {
-    i++;
-  }
-}
-
-void test_identical_branches4(bool b) {
-  int i = 0;
-  if (b) {
-// CHECK-MESSAGES-BUGPRONEBRANCH: :[[@LINE-1]]:3: warning: if with identical then and else branches [bugprone-branch-clone]
-  } else {
-// CHECK-MESSAGES-BUGPRONEBRANCH: :[[@LINE-1]]:5: note: else branch starts here
-  }
-}
-
-void test_identical_branches_break(bool b) {
-  while (true) {
-    if (b)
-// CHECK-MESSAGES-BUGPRONEBRANCH: :[[@LINE-1]]:5: warning: if with identical then and else branches [bugprone-branch-clone]
-      break;
-    else
-// CHECK-MESSAGES-BUGPRONEBRANCH: :[[@LINE-1]]:5: note: else branch starts here
-      break;
-  }
-}
-
-void test_identical_branches_continue(bool b) {
-  while (true) {
-    if (b)
-// CHECK-MESSAGES-BUGPRONEBRANCH: :[[@LINE-1]]:5: warning: if with identical then and else branches [bugprone-branch-clone]
-      continue;
-    else
-// CHECK-MESSAGES-BUGPRONEBRANCH: :[[@LINE-1]]:5: note: else branch starts here
-      continue;
-  }
-}
-
-void test_identical_branches_func(bool b) {
-  if (b)
-// CHECK-MESSAGES-BUGPRONEBRANCH: :[[@LINE-1]]:3: warning: if with identical then and else branches [bugprone-branch-clone]
-    func();
-  else
-// CHECK-MESSAGES-BUGPRONEBRANCH: :[[@LINE-1]]:3: note: else branch starts here
-    func();
-}
-
-void test_identical_branches_func_arguments(bool b) {
-  if (b) // no-warning
-    funcParam(1);
-  else
-    funcParam(2);
-}
-
-void test_identical_branches_cast1(bool b) {
-  long v = -7;
-  if (b) // no-warning
-    v = (signed int) v;
-  else
-    v = (unsigned int) v;
-}
-
-void test_identical_branches_cast2(bool b) {
-  long v = -7;
-  if (b)
-// CHECK-MESSAGES-BUGPRONEBRANCH: :[[@LINE-1]]:3: warning: if with identical then and else branches [bugprone-branch-clone]
-    v = (signed int) v;
-  else
-// CHECK-MESSAGES-BUGPRONEBRANCH: :[[@LINE-1]]:3: note: else branch starts here
-    v = (signed int) v;
-}
-
-int test_identical_branches_return_int(bool b) {
-  int i = 0;
-  if (b) {
-// CHECK-MESSAGES-BUGPRONEBRANCH: :[[@LINE-1]]:3: warning: if with identical then and else branches [bugprone-branch-clone]
-    i++;
-    return i;
-  } else {
-// CHECK-MESSAGES-BUGPRONEBRANCH: :[[@LINE-1]]:5: note: else branch starts here
-    i++;
-    return i;
-  }
-}
-
-int test_identical_branches_return_func(bool b) {
-  if (b) {
-// CHECK-MESSAGES-BUGPRONEBRANCH: :[[@LINE-1]]:3: warning: if with identical then and else branches [bugprone-branch-clone]
-    return func();
-  } else {
-// CHECK-MESSAGES-BUGPRONEBRANCH: :[[@LINE-1]]:5: note: else branch starts here
-    return func();
-  }
-}
-
-void test_identical_branches_for(bool b) {
-  int i;
-  int j;
-  if (b) {
-// CHECK-MESSAGES-BUGPRONEBRANCH: :[[@LINE-1]]:3: warning: if with identical then and else branches [bugprone-branch-clone]
-    for (i = 0, j = 0; i < 10; i++)
-      j += 4;
-  } else {
-// CHECK-MESSAGES-BUGPRONEBRANCH: :[[@LINE-1]]:5: note: else branch starts here
-    for (i = 0, j = 0; i < 10; i++)
-      j += 4;
-  }
-}
-
-void test_identical_branches_while(bool b) {
-  int i = 10;
-  if (b) {
-// CHECK-MESSAGES-BUGPRONEBRANCH: :[[@LINE-1]]:3: warning: if with identical then and else branches [bugprone-branch-clone]
-    while (func())
-      i--;
-  } else {
-// CHECK-MESSAGES-BUGPRONEBRANCH: :[[@LINE-1]]:5: note: else branch starts here
-    while (func())
-      i--;
-  }
-}
-
-void test_identical_branches_while_2(bool b) {
-  int i = 10;
-  if (b) { // no-warning
-    while (func())
-      i--;
-  } else {
-    while (func())
-      i++;
-  }
-}
-
-void test_identical_branches_do_while(bool b) {
-  int i = 10;
-  if (b) {
-// CHECK-MESSAGES-BUGPRONEBRANCH: :[[@LINE-1]]:3: warning: if with identical then and else branches [bugprone-branch-clone]
-    do {
-      i--;
-    } while (func());
-  } else {
-// CHECK-MESSAGES-BUGPRONEBRANCH: :[[@LINE-1]]:5: note: else branch starts here
-    do {
-      i--;
-    } while (func());
-  }
-}
-
-void test_identical_branches_if(bool b, int i) {
-  if (b) {
-// CHECK-MESSAGES-BUGPRONEBRANCH: :[[@LINE-1]]:3: warning: if with identical then and else branches [bugprone-branch-clone]
-    if (i < 5)
-      i += 10;
-  } else {
-// CHECK-MESSAGES-BUGPRONEBRANCH: :[[@LINE-1]]:5: note: else branch starts here
-    if (i < 5)
-      i += 10;
-  }
 }
 
 void test_identical_bitwise1() {
@@ -1473,7 +1274,8 @@ void test_identical_logical3(int a) {
 }
 
 void test_identical_logical4(int a) {
-  if (a == func() || a == func()) // no-warning
+  if (a == func() || a == func())
+// CHECK-MESSAGES-IDENTEXPR: :[[@LINE-1]]:19: warning: both sides of operator are equivalent [misc-redundant-expression]
     ;
 }
 
@@ -1508,208 +1310,3 @@ void test_identical_logical9(int x, int y) {
     ;
 }
 #pragma clang diagnostic pop
-
-void test_warn_chained_if_stmts_1(int x) {
-  if (x == 1)
-    ;
-// CHECK-MESSAGES-BUGPRONEBRANCH: :[[@LINE-1]]:5: warning: repeated branch body in conditional chain [bugprone-branch-clone]
-// CHECK-MESSAGES-BUGPRONEBRANCH: :[[@LINE-2]]:6: note: end of the original
-  else if (x == 1)
-    ;
-// CHECK-MESSAGES-BUGPRONEBRANCH: :[[@LINE-1]]:5: note: clone 1 starts here
-}
-
-void test_warn_chained_if_stmts_2(int x) {
-  if (x == 1)
-    ;
-// CHECK-MESSAGES-BUGPRONEBRANCH: :[[@LINE-1]]:5: warning: repeated branch body in conditional chain [bugprone-branch-clone]
-// CHECK-MESSAGES-BUGPRONEBRANCH: :[[@LINE-2]]:6: note: end of the original
-  else if (x == 1)
-    ;
-// CHECK-MESSAGES-BUGPRONEBRANCH: :[[@LINE-1]]:5: note: clone 1 starts here
-  else if (x == 1)
-    ;
-// CHECK-MESSAGES-BUGPRONEBRANCH: :[[@LINE-1]]:5: note: clone 2 starts here
-}
-
-void test_warn_chained_if_stmts_3(int x) {
-  if (x == 1)
-    ;
-// CHECK-MESSAGES-BUGPRONEBRANCH: :[[@LINE-1]]:5: warning: repeated branch body in conditional chain [bugprone-branch-clone]
-// CHECK-MESSAGES-BUGPRONEBRANCH: :[[@LINE-2]]:6: note: end of the original
-  else if (x == 2)
-    ;
-// CHECK-MESSAGES-BUGPRONEBRANCH: :[[@LINE-1]]:5: note: clone 1 starts here
-  else if (x == 1)
-    ;
-// CHECK-MESSAGES-BUGPRONEBRANCH: :[[@LINE-1]]:5: note: clone 2 starts here
-}
-
-void test_warn_chained_if_stmts_4(int x) {
-  if (x == 1)
-    ;
-// CHECK-MESSAGES-BUGPRONEBRANCH: :[[@LINE-1]]:5: warning: repeated branch body in conditional chain [bugprone-branch-clone]
-// CHECK-MESSAGES-BUGPRONEBRANCH: :[[@LINE-2]]:6: note: end of the original
-  else if (func())
-    ;
-// CHECK-MESSAGES-BUGPRONEBRANCH: :[[@LINE-1]]:5: note: clone 1 starts here
-  else if (x == 1)
-    ;
-// CHECK-MESSAGES-BUGPRONEBRANCH: :[[@LINE-1]]:5: note: clone 2 starts here
-}
-
-void test_warn_chained_if_stmts_5(int x) {
-  if (x & 1)
-    ;
-// CHECK-MESSAGES-BUGPRONEBRANCH: :[[@LINE-1]]:5: warning: repeated branch body in conditional chain [bugprone-branch-clone]
-// CHECK-MESSAGES-BUGPRONEBRANCH: :[[@LINE-2]]:6: note: end of the original
-  else if (x & 1)
-    ;
-// CHECK-MESSAGES-BUGPRONEBRANCH: :[[@LINE-1]]:5: note: clone 1 starts here
-}
-
-void test_warn_chained_if_stmts_6(int x) {
-  if (x == 1)
-    ;
-// CHECK-MESSAGES-BUGPRONEBRANCH: :[[@LINE-1]]:5: warning: repeated branch body in conditional chain [bugprone-branch-clone]
-// CHECK-MESSAGES-BUGPRONEBRANCH: :[[@LINE-2]]:6: note: end of the original
-  else if (x == 2)
-    ;
-// CHECK-MESSAGES-BUGPRONEBRANCH: :[[@LINE-1]]:5: note: clone 1 starts here
-  else if (x == 2)
-    ;
-// CHECK-MESSAGES-BUGPRONEBRANCH: :[[@LINE-1]]:5: note: clone 2 starts here
-  else if (x == 3)
-    ;
-}
-
-void test_warn_chained_if_stmts_7(int x) {
-  if (x == 1)
-    ;
-// CHECK-MESSAGES-BUGPRONEBRANCH: :[[@LINE-1]]:5: warning: repeated branch body in conditional chain [bugprone-branch-clone]
-// CHECK-MESSAGES-BUGPRONEBRANCH: :[[@LINE-2]]:6: note: end of the original
-  else if (x == 2)
-    ;
-// CHECK-MESSAGES-BUGPRONEBRANCH: :[[@LINE-1]]:5: note: clone 1 starts here
-  else if (x == 3)
-    ;
-// CHECK-MESSAGES-BUGPRONEBRANCH: :[[@LINE-1]]:5: note: clone 2 starts here
-  else if (x == 2)
-    ;
-// CHECK-MESSAGES-BUGPRONEBRANCH: :[[@LINE-1]]:5: note: clone 3 starts here
-  else if (x == 5)
-    ;
-// CHECK-MESSAGES-BUGPRONEBRANCH: :[[@LINE-1]]:5: note: clone 4 starts here
-}
-
-void test_warn_chained_if_stmts_8(int x) {
-  if (x == 1)
-    ;
-// CHECK-MESSAGES-BUGPRONEBRANCH: :[[@LINE-1]]:5: warning: repeated branch body in conditional chain [bugprone-branch-clone]
-// CHECK-MESSAGES-BUGPRONEBRANCH: :[[@LINE-2]]:6: note: end of the original
-  else if (x == 2)
-    ;
-// CHECK-MESSAGES-BUGPRONEBRANCH: :[[@LINE-1]]:5: note: clone 1 starts here
-  else if (x == 3)
-    ;
-// CHECK-MESSAGES-BUGPRONEBRANCH: :[[@LINE-1]]:5: note: clone 2 starts here
-  else if (x == 2)
-    ;
-// CHECK-MESSAGES-BUGPRONEBRANCH: :[[@LINE-1]]:5: note: clone 3 starts here
-  else if (x == 5)
-    ;
-// CHECK-MESSAGES-BUGPRONEBRANCH: :[[@LINE-1]]:5: note: clone 4 starts here
-  else if (x == 3)
-    ;
-// CHECK-MESSAGES-BUGPRONEBRANCH: :[[@LINE-1]]:5: note: clone 5 starts here
-  else if (x == 7)
-    ;
-// CHECK-MESSAGES-BUGPRONEBRANCH: :[[@LINE-1]]:5: note: clone 6 starts here
-}
-
-void test_nowarn_chained_if_stmts_1(int x) {
-  if (func())
-    ;
-// CHECK-MESSAGES-BUGPRONEBRANCH: :[[@LINE-1]]:5: warning: repeated branch body in conditional chain [bugprone-branch-clone]
-// CHECK-MESSAGES-BUGPRONEBRANCH: :[[@LINE-2]]:6: note: end of the original
-  else if (func())
-    ;
-// CHECK-MESSAGES-BUGPRONEBRANCH: :[[@LINE-1]]:5: note: clone 1 starts here
-}
-
-void test_nowarn_chained_if_stmts_2(int x) {
-  if (func())
-    ;
-// CHECK-MESSAGES-BUGPRONEBRANCH: :[[@LINE-1]]:5: warning: repeated branch body in conditional chain [bugprone-branch-clone]
-// CHECK-MESSAGES-BUGPRONEBRANCH: :[[@LINE-2]]:6: note: end of the original
-  else if (x == 1)
-    ;
-// CHECK-MESSAGES-BUGPRONEBRANCH: :[[@LINE-1]]:5: note: clone 1 starts here
-  else if (func())
-    ;
-// CHECK-MESSAGES-BUGPRONEBRANCH: :[[@LINE-1]]:5: note: clone 2 starts here
-}
-
-void test_nowarn_chained_if_stmts_3(int x) {
-  if (x++)
-    ;
-// CHECK-MESSAGES-BUGPRONEBRANCH: :[[@LINE-1]]:5: warning: repeated branch body in conditional chain [bugprone-branch-clone]
-// CHECK-MESSAGES-BUGPRONEBRANCH: :[[@LINE-2]]:6: note: end of the original
-  else if (x++)
-    ;
-// CHECK-MESSAGES-BUGPRONEBRANCH: :[[@LINE-1]]:5: note: clone 1 starts here
-}
-
-void test_warn_wchar() {
-  const wchar_t * a = 0 ? L"Warning" : L"Warning";
-// CHECK-MESSAGES-BUGPRONEBRANCH: :[[@LINE-1]]:25: warning: conditional operator with identical true and false expressions [bugprone-branch-clone]
-}
-void test_nowarn_wchar() {
-  const wchar_t * a = 0 ? L"No" : L"Warning";
-}
-
-void test_nowarn_long() {
-  int a = 0, b = 0;
-  long c;
-  if (0) {
-    b -= a;
-    c = 0;
-  } else {
-    b -= a;
-    c = 0LL;
-  }
-}
-
-// Identical inner conditions
-
-void test_warn_inner_if_1(int x) {
-  if (x == 1) {
-// CHECK-MESSAGES-BUGPRONEBRANCH: :[[@LINE-1]]:3: warning: if with identical inner if statement [bugprone-branch-clone]
-    if (x == 1)
-// CHECK-MESSAGES-BUGPRONEBRANCH: :[[@LINE-1]]:5: note: inner if starts here
-      ;
-  }
-
-  // FIXME: Should warn here. The warning is currently not emitted because there
-  // is code between the conditions.
-  if (x == 1) {
-    int y = x;
-    if (x == 1)
-      ;
-  }
-}
-
-void test_nowarn_inner_if_1(int x) {
-  // Don't warn when condition has side effects.
-  if (x++ == 1) {
-    if (x++ == 1)
-      ;
-  }
-
-  // Don't warn when x is changed before inner condition.
-  if (x < 10) {
-    x++;
-    if (x < 10)
-      ;
-  }
-}
diff --git a/clang-tools-extra/test/clang-tidy/checkers/modernize/use-std-format-fmt.cpp b/clang-tools-extra/test/clang-tidy/checkers/modernize/use-std-format-fmt.cpp
index 1eaf18a..71c8af1 100644
--- a/clang-tools-extra/test/clang-tidy/checkers/modernize/use-std-format-fmt.cpp
+++ b/clang-tools-extra/test/clang-tidy/checkers/modernize/use-std-format-fmt.cpp
@@ -1,6 +1,6 @@
 // RUN: %check_clang_tidy %s modernize-use-std-format %t -- \
 // RUN:   -config="{CheckOptions: { \
-// RUN:              StrictMode: true, \
+// RUN:              modernize-use-std-format.StrictMode: true, \
 // RUN:              modernize-use-std-format.StrFormatLikeFunctions: 'fmt::sprintf', \
 // RUN:              modernize-use-std-format.ReplacementFormatFunction: 'fmt::format', \
 // RUN:              modernize-use-std-format.FormatHeader: '<fmt/core.h>' \
diff --git a/clang-tools-extra/test/clang-tidy/infrastructure/Inputs/param/parameters.txt b/clang-tools-extra/test/clang-tidy/infrastructure/Inputs/param/parameters.txt
new file mode 100644
index 0000000..a6d8fa7e
--- /dev/null
+++ b/clang-tools-extra/test/clang-tidy/infrastructure/Inputs/param/parameters.txt
@@ -0,0 +1,2 @@
+-checks='-*,llvm-namespace-comment'
+--warnings-as-errors=llvm-namespace-comment
diff --git a/clang-tools-extra/test/clang-tidy/infrastructure/deprecation-global-option.cpp b/clang-tools-extra/test/clang-tidy/infrastructure/deprecation-global-option.cpp
new file mode 100644
index 0000000..4c9854d
--- /dev/null
+++ b/clang-tools-extra/test/clang-tidy/infrastructure/deprecation-global-option.cpp
@@ -0,0 +1,3 @@
+// RUN: clang-tidy %s --config="{CheckOptions:{StrictMode: true}}" -checks="-*,modernize-use-std-format" | FileCheck %s 
+
+// CHECK: warning: global option 'StrictMode' is deprecated, please use 'modernize-use-std-format.StrictMode' instead. [clang-tidy-config]
diff --git a/clang-tools-extra/test/clang-tidy/infrastructure/read-parameters-from-file-error.cpp b/clang-tools-extra/test/clang-tidy/infrastructure/read-parameters-from-file-error.cpp
new file mode 100644
index 0000000..183f443
--- /dev/null
+++ b/clang-tools-extra/test/clang-tidy/infrastructure/read-parameters-from-file-error.cpp
@@ -0,0 +1,3 @@
+// RUN: echo @%t.param > %t.param && not clang-tidy %s @%t.param -- 2>&1 | FileCheck %s
+
+// CHECK: recursive expansion of
diff --git a/clang-tools-extra/test/clang-tidy/infrastructure/read-parameters-from-file.cpp b/clang-tools-extra/test/clang-tidy/infrastructure/read-parameters-from-file.cpp
new file mode 100644
index 0000000..9d8c40a
--- /dev/null
+++ b/clang-tools-extra/test/clang-tidy/infrastructure/read-parameters-from-file.cpp
@@ -0,0 +1,5 @@
+// RUN: not clang-tidy %s @%S/Inputs/param/parameters.txt -- | FileCheck %s
+
+namespace i {
+}
+// CHECK: error: namespace 'i' not terminated with a closing comment [llvm-namespace-comment,-warnings-as-errors]
diff --git a/clang/bindings/python/clang/cindex.py b/clang/bindings/python/clang/cindex.py
index f8a20a1..b6e0d71 100644
--- a/clang/bindings/python/clang/cindex.py
+++ b/clang/bindings/python/clang/cindex.py
@@ -2125,12 +2125,26 @@ class Cursor(Structure):
 
     def is_anonymous(self):
         """
-        Check if the record is anonymous.
+        Check whether this is a record type without a name, or a field where
+        the type is a record type without a name.
+
+        Use is_anonymous_record_decl to check whether a record is an
+        "anonymous union" as defined in the C/C++ standard.
         """
         if self.kind == CursorKind.FIELD_DECL:
             return self.type.get_declaration().is_anonymous()
         return conf.lib.clang_Cursor_isAnonymous(self)  # type: ignore [no-any-return]
 
+    def is_anonymous_record_decl(self):
+        """
+        Check if the record is an anonymous union as defined in the C/C++ standard
+        (or an "anonymous struct", the corresponding non-standard extension for
+        structs).
+        """
+        if self.kind == CursorKind.FIELD_DECL:
+            return self.type.get_declaration().is_anonymous_record_decl()
+        return conf.lib.clang_Cursor_isAnonymousRecordDecl(self)  # type: ignore [no-any-return]
+
     def is_bitfield(self):
         """
         Check if the field is a bitfield.
@@ -3902,12 +3916,13 @@ functionList: list[LibFunc] = [
     ("clang_Cursor_getTemplateArgumentType", [Cursor, c_uint], Type),
     ("clang_Cursor_getTemplateArgumentValue", [Cursor, c_uint], c_longlong),
     ("clang_Cursor_getTemplateArgumentUnsignedValue", [Cursor, c_uint], c_ulonglong),
-    ("clang_Cursor_isAnonymous", [Cursor], bool),
-    ("clang_Cursor_isBitField", [Cursor], bool),
     ("clang_Cursor_getBinaryOpcode", [Cursor], c_int),
     ("clang_Cursor_getBriefCommentText", [Cursor], _CXString),
     ("clang_Cursor_getRawCommentText", [Cursor], _CXString),
     ("clang_Cursor_getOffsetOfField", [Cursor], c_longlong),
+    ("clang_Cursor_isAnonymous", [Cursor], bool),
+    ("clang_Cursor_isAnonymousRecordDecl", [Cursor], bool),
+    ("clang_Cursor_isBitField", [Cursor], bool),
     ("clang_Location_isInSystemHeader", [SourceLocation], bool),
     ("clang_Type_getAlignOf", [Type], c_longlong),
     ("clang_Type_getClassType", [Type], Type),
diff --git a/clang/bindings/python/tests/cindex/test_type.py b/clang/bindings/python/tests/cindex/test_type.py
index ce05fdb..e1d8c2a 100644
--- a/clang/bindings/python/tests/cindex/test_type.py
+++ b/clang/bindings/python/tests/cindex/test_type.py
@@ -463,8 +463,11 @@ class A
             self.assertNotEqual(children[0].spelling, "typeanon")
             self.assertEqual(children[1].spelling, "typeanon")
             self.assertEqual(fields[0].kind, CursorKind.FIELD_DECL)
+            self.assertTrue(fields[0].is_anonymous())
+            self.assertFalse(fields[0].is_anonymous_record_decl())
             self.assertEqual(fields[1].kind, CursorKind.FIELD_DECL)
             self.assertTrue(fields[1].is_anonymous())
+            self.assertTrue(fields[1].is_anonymous_record_decl())
             self.assertEqual(teststruct.type.get_offset("typeanon"), f1)
             self.assertEqual(teststruct.type.get_offset("bariton"), bariton)
             self.assertEqual(teststruct.type.get_offset("foo"), foo)
diff --git a/clang/docs/ClangFormat.rst b/clang/docs/ClangFormat.rst
index c8f1d7f..e1f6771 100644
--- a/clang/docs/ClangFormat.rst
+++ b/clang/docs/ClangFormat.rst
@@ -150,6 +150,7 @@ names. It has the following format:
 * Patterns follow the rules specified in `POSIX 2.13.1, 2.13.2, and Rule 1 of
   2.13.3 <https://pubs.opengroup.org/onlinepubs/9699919799/utilities/
   V3_chap02.html#tag_18_13>`_.
+* Bash globstar (``**``) is supported.
 * A pattern is negated if it starts with a bang (``!``).
 
 To match all files in a directory, use e.g. ``foo/bar/*``. To match all files in
diff --git a/clang/docs/ClangFormatStyleOptions.rst b/clang/docs/ClangFormatStyleOptions.rst
index 4be4481..637ec23 100644
--- a/clang/docs/ClangFormatStyleOptions.rst
+++ b/clang/docs/ClangFormatStyleOptions.rst
@@ -2088,6 +2088,11 @@ the configuration (without a prefix: ``Auto``).
   If ``true``, ``while (true) continue;`` can be put on a single
   line.
 
+.. _AllowShortNamespacesOnASingleLine:
+
+**AllowShortNamespacesOnASingleLine** (``Boolean``) :versionbadge:`clang-format 20` :ref:`¶ <AllowShortNamespacesOnASingleLine>`
+  If ``true``, ``namespace a { class b; }`` can be put on a single line.
+
 .. _AlwaysBreakAfterDefinitionReturnType:
 
 **AlwaysBreakAfterDefinitionReturnType** (``DefinitionReturnTypeBreakingStyle``) :versionbadge:`clang-format 3.7` :ref:`¶ <AlwaysBreakAfterDefinitionReturnType>`
@@ -4656,12 +4661,13 @@ the configuration (without a prefix: ``Auto``).
 .. _KeepEmptyLinesAtEOF:
 
 **KeepEmptyLinesAtEOF** (``Boolean``) :versionbadge:`clang-format 17` :ref:`¶ <KeepEmptyLinesAtEOF>`
-  This option is deprecated. See ``AtEndOfFile`` of ``KeepEmptyLines``.
+  This option is **deprecated**. See ``AtEndOfFile`` of ``KeepEmptyLines``.
 
 .. _KeepEmptyLinesAtTheStartOfBlocks:
 
 **KeepEmptyLinesAtTheStartOfBlocks** (``Boolean``) :versionbadge:`clang-format 3.7` :ref:`¶ <KeepEmptyLinesAtTheStartOfBlocks>`
-  This option is deprecated. See ``AtStartOfBlock`` of ``KeepEmptyLines``.
+  This option is **deprecated**. See ``AtStartOfBlock`` of
+  ``KeepEmptyLines``.
 
 .. _KeepFormFeed:
 
@@ -6725,8 +6731,8 @@ the configuration (without a prefix: ``Auto``).
 .. _TemplateNames:
 
 **TemplateNames** (``List of Strings``) :versionbadge:`clang-format 20` :ref:`¶ <TemplateNames>`
-  A vector of non-keyword identifiers that should be interpreted as
-  template names.
+  A vector of non-keyword identifiers that should be interpreted as template
+  names.
 
   A ``<`` after a template name is annotated as a template opener instead of
   a binary operator.
@@ -6793,6 +6799,15 @@ the configuration (without a prefix: ``Auto``).
 
 
 
+.. _VariableTemplates:
+
+**VariableTemplates** (``List of Strings``) :versionbadge:`clang-format 20` :ref:`¶ <VariableTemplates>`
+  A vector of non-keyword identifiers that should be interpreted as variable
+  template names.
+
+  A ``)`` after a variable template instantiation is **not** annotated as
+  the closing parenthesis of C-style cast operator.
+
 .. _VerilogBreakBetweenInstancePorts:
 
 **VerilogBreakBetweenInstancePorts** (``Boolean``) :versionbadge:`clang-format 17` :ref:`¶ <VerilogBreakBetweenInstancePorts>`
@@ -6829,6 +6844,45 @@ the configuration (without a prefix: ``Auto``).
 
   For example: BOOST_PP_STRINGIZE
 
+.. _WrapNamespaceBodyWithEmptyLines:
+
+**WrapNamespaceBodyWithEmptyLines** (``WrapNamespaceBodyWithEmptyLinesStyle``) :versionbadge:`clang-format 20` :ref:`¶ <WrapNamespaceBodyWithEmptyLines>`
+  Wrap namespace body with empty lines.
+
+  Possible values:
+
+  * ``WNBWELS_Never`` (in configuration: ``Never``)
+    Remove all empty lines at the beginning and the end of namespace body.
+
+    .. code-block:: c++
+
+      namespace N1 {
+      namespace N2
+      function();
+      }
+      }
+
+  * ``WNBWELS_Always`` (in configuration: ``Always``)
+    Always have at least one empty line at the beginning and the end of
+    namespace body except that the number of empty lines between consecutive
+    nested namespace definitions is not increased.
+
+    .. code-block:: c++
+
+      namespace N1 {
+      namespace N2 {
+
+      function();
+
+      }
+      }
+
+  * ``WNBWELS_Leave`` (in configuration: ``Leave``)
+    Keep existing newlines at the beginning and the end of namespace body.
+    ``MaxEmptyLinesToKeep`` still applies.
+
+
+
 .. END_FORMAT_STYLE_OPTIONS
 
 Adding additional style options
diff --git a/clang/docs/LanguageExtensions.rst b/clang/docs/LanguageExtensions.rst
index cc5f1d4..e020710 100644
--- a/clang/docs/LanguageExtensions.rst
+++ b/clang/docs/LanguageExtensions.rst
@@ -3641,7 +3641,7 @@ program location should be executed. It is expected to be used to implement
 <https://llvm.org/docs/LangRef.html#llvm-allow-runtime-check-intrinsic>`_
 intrinsic.
 
-The ``__builtin_allow_runtime_check()`` can be used within constrol structures
+The ``__builtin_allow_runtime_check()`` can be used within control structures
 like ``if`` to guard expensive runtime checks. The return value is determined
 by the following compiler options and may differ per call site:
 
diff --git a/clang/docs/LibASTMatchersReference.html b/clang/docs/LibASTMatchersReference.html
index f18e9cf..48dfd9c 100644
--- a/clang/docs/LibASTMatchersReference.html
+++ b/clang/docs/LibASTMatchersReference.html
@@ -1842,6 +1842,12 @@ Example matches x in if (x)
   if (x) {}
 </pre></td></tr>
 
+<tr><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1Stmt.html">Stmt</a>&gt;</td><td class="name" onclick="toggle('dependentScopeDeclRefExpr0')"><a name="dependentScopeDeclRefExpr0Anchor">dependentScopeDeclRefExpr</a></td><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1DependentScopeDeclRefExpr.html">DependentScopeDeclRefExpr</a>&gt;...</td></tr>
+<tr><td colspan="4" class="doc" id="dependentScopeDeclRefExpr0"><pre>Matches expressions that refer to dependent scope declarations.
+
+Example matches T::v
+   template <class T> class X : T { void f() { T::v; } };
+</pre></td></tr>
 
 <tr><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1Stmt.html">Stmt</a>&gt;</td><td class="name" onclick="toggle('declStmt0')"><a name="declStmt0Anchor">declStmt</a></td><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1DeclStmt.html">DeclStmt</a>&gt;...</td></tr>
 <tr><td colspan="4" class="doc" id="declStmt0"><pre>Matches declaration statements.
@@ -2530,6 +2536,26 @@ decltypeType()
   matches "decltype(i + j)"
 </pre></td></tr>
 
+<tr><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1Type.html">Type</a>&gt;</td><td class="name" onclick="toggle('dependentNameType0')"><a name="dependentNameType0Anchor">dependentNameType</a></td><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1DependentNameType.html">DependentNameType</a>&gt;...</td></tr>
+<tr><td colspan="4" class="doc" id="dependentNameType0"><pre>Matches a dependent name type.
+
+Example matches T::type
+
+  template <typename T> struct declToImport {
+    typedef typename T::type dependent_name;
+  };
+</pre></td></tr>
+
+<tr><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1Type.html">Type</a>&gt;</td><td class="name" onclick="toggle('dependentTemplateSpecializationType0')"><a name="dependentTemplateSpecializationType0Anchor">dependentTemplateSpecializationType</a></td><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1DependentTemplateSpecializationType.html">DependentTemplateSpecializationType</a>&gt;...</td></tr>
+<tr><td colspan="4" class="doc" id="dependentTemplateSpecializationType0"><pre>Matches a dependent template specialization type.
+
+Example matches A<T>::template B<T>
+
+  template<typename T> struct A;
+  template<typename T> struct declToImport {
+    typename A<T>::template B<T> a;
+  };
+</pre></td></tr>
 
 <tr><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1Type.html">Type</a>&gt;</td><td class="name" onclick="toggle('deducedTemplateSpecializationType0')"><a name="deducedTemplateSpecializationType0Anchor">deducedTemplateSpecializationType</a></td><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1DeducedTemplateSpecializationType.html">DeducedTemplateSpecializationType</a>&gt;...</td></tr>
 <tr><td colspan="4" class="doc" id="deducedTemplateSpecializationType0"><pre>Matches C++17 deduced template specialization types, e.g. deduced class
@@ -3423,6 +3449,34 @@ unresolvedMemberExpr(isArrow())
 </pre></td></tr>
 
 
+<tr><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1DependentScopeDeclRefExpr.html">DependentScopeDeclRefExpr</a>&gt;</td><td class="name" onclick="toggle('hasDependentName0')"><a name="hasDependentName0Anchor">hasDependentName</a></td><td>std::string N</td></tr>
+<tr><td colspan="4" class="doc" id="hasDependentName0"><pre>Matches the dependent name of a DependentScopeDeclRefExpr.
+
+Matches the dependent name of a DependentScopeDeclRefExpr
+
+Given:
+
+  template &lt;class T&lt; class X : T { void f() { T::v; } };
+
+dependentScopeDeclRefExpr(hasDependentName("v")) matches `T::v`
+</pre></td></tr>
+
+
+<tr><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1DependentNameType.html">DependentNameType</a>&gt;</td><td class="name" onclick="toggle('hasDependentName1')"><a name="hasDependentName1Anchor">hasDependentName</a></td><td>std::string N</td></tr>
+<tr><td colspan="4" class="doc" id="hasDependentName1"><pre>Matches the dependent name of a DependentNameType.
+
+Matches the dependent name of a DependentNameType
+
+Given:
+
+  template &lt;typename T&lt; struct declToImport {
+    typedef typename T::type dependent_name;
+  };
+
+dependentNameType(hasDependentName("type")) matches `T::type`
+</pre></td></tr>
+
+
 <tr><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1CXXDependentScopeMemberExpr.html">CXXDependentScopeMemberExpr</a>&gt;</td><td class="name" onclick="toggle('memberHasSameNameAsBoundNode0')"><a name="memberHasSameNameAsBoundNode0Anchor">memberHasSameNameAsBoundNode</a></td><td>std::string BindingID</td></tr>
 <tr><td colspan="4" class="doc" id="memberHasSameNameAsBoundNode0"><pre>Matches template-dependent, but known, member names against an already-bound
 node
diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index 8b984ec..2258452 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -704,6 +704,16 @@ Improvements to Clang's diagnostics
       return ptr + index < ptr; // warning
     }
 
+- Clang now emits a ``-Wvarargs`` diagnostic when the second argument
+  to ``va_arg`` is of array type, which is an undefined behavior (#GH119360).
+
+  .. code-block:: c++
+
+    void test() {
+      va_list va;
+      va_arg(va, int[10]); // warning
+    }
+
 - Fix -Wdangling false positives on conditional operators (#120206).
 
 - Fixed a bug where Clang hung on an unsupported optional scope specifier ``::`` when parsing
@@ -754,6 +764,7 @@ Bug Fixes in This Version
   the unsupported type instead of the ``register`` keyword (#GH109776).
 - Fixed a crash when emit ctor for global variant with flexible array init (#GH113187).
 - Fixed a crash when GNU statement expression contains invalid statement (#GH113468).
+- Fixed a crash when passing the variable length array type to ``va_arg`` (#GH119360).
 - Fixed a failed assertion when using ``__attribute__((noderef))`` on an
   ``_Atomic``-qualified type (#GH116124).
 - No longer return ``false`` for ``noexcept`` expressions involving a
@@ -885,6 +896,12 @@ Bug Fixes to C++ Support
 - Fixed recognition of ``std::initializer_list`` when it's surrounded with ``extern "C++"`` and exported
   out of a module (which is the case e.g. in MSVC's implementation of ``std`` module). (#GH118218)
 - Fixed a pack expansion issue in checking unexpanded parameter sizes. (#GH17042)
+- Fixed a bug where captured structured bindings were modifiable inside non-mutable lambda (#GH95081)
+- Passing incomplete types to ``__is_base_of`` and other builtin type traits for which the corresponding
+  standard type trait mandates a complete type is now a hard (non-sfinae-friendly) error
+  (`LWG3929 <https://wg21.link/LWG3929>`__.) (#GH121278)
+- Clang now identifies unexpanded parameter packs within the type constraint on a non-type template parameter. (#GH88866)
+- Fixed an issue while resolving type of expression indexing into a pack of values of non-dependent type (#GH121242)
 
 Bug Fixes to AST Handling
 ^^^^^^^^^^^^^^^^^^^^^^^^^
@@ -1005,6 +1022,12 @@ Arm and AArch64 Support
   in leaf functions after enabling ``-fno-omit-frame-pointer``, you can do so by adding
   the ``-momit-leaf-frame-pointer`` option.
 
+- Support has been added for the following processors (-mcpu identifiers in parenthesis):
+
+  For AArch64:
+
+  * FUJITSU-MONAKA (fujitsu-monaka)
+
 Android Support
 ^^^^^^^^^^^^^^^
 
@@ -1101,6 +1124,14 @@ AST Matchers
 
 - Ensure ``pointee`` matches Objective-C pointer types.
 
+- Add ``dependentScopeDeclRefExpr`` matcher to match expressions that refer to dependent scope declarations.
+
+- Add ``dependentNameType`` matcher to match a dependent name type.
+
+- Add ``dependentTemplateSpecializationType`` matcher to match a dependent template specialization type.
+
+- Add ``hasDependentName`` matcher to match the dependent name of a DependentScopeDeclRefExpr or DependentNameType.
+
 clang-format
 ------------
 
@@ -1112,6 +1143,10 @@ clang-format
   ``Never``, and ``true`` to ``Always``.
 - Adds ``RemoveEmptyLinesInUnwrappedLines`` option.
 - Adds ``KeepFormFeed`` option and set it to ``true`` for ``GNU`` style.
+- Adds ``AllowShortNamespacesOnASingleLine`` option.
+- Adds ``VariableTemplates`` option.
+- Adds support for bash globstar in ``.clang-format-ignore``.
+- Adds ``WrapNamespaceBodyWithEmptyLines`` option.
 
 libclang
 --------
@@ -1142,6 +1177,13 @@ New features
 Crash and bug fixes
 ^^^^^^^^^^^^^^^^^^^
 
+- In loops where the loop condition is opaque (i.e. the analyzer cannot
+  determine whether it's true or false), the analyzer will no longer assume
+  execution paths that perform more that two iterations. These unjustified
+  assumptions caused false positive reports (e.g. 100+ out-of-bounds reports in
+  the FFMPEG codebase) in loops where the programmer intended only two or three
+  steps but the analyzer wasn't able to understand that the loop is limited.
+
 Improvements
 ^^^^^^^^^^^^
 
@@ -1244,12 +1286,17 @@ Sanitizers
 Python Binding Changes
 ----------------------
 - Fixed an issue that led to crashes when calling ``Type.get_exception_specification_kind``.
+- Added binding for ``clang_Cursor_isAnonymousRecordDecl``, which allows checking if
+  a declaration is an anonymous union or anonymous struct.
 
 OpenMP Support
 --------------
 - Added support for 'omp assume' directive.
 - Added support for 'omp scope' directive.
 - Added support for allocator-modifier in 'allocate' clause.
+- Changed the OpenMP DeviceRTL to use 'generic' IR. The
+  ``LIBOMPTARGET_DEVICE_ARCHITECTURES`` CMake argument is now unused and will
+  always build support for AMDGPU and NVPTX targets.
 
 Improvements
 ^^^^^^^^^^^^
diff --git a/clang/docs/analyzer/checkers.rst b/clang/docs/analyzer/checkers.rst
index 29d5e1f..e093b2d 100644
--- a/clang/docs/analyzer/checkers.rst
+++ b/clang/docs/analyzer/checkers.rst
@@ -476,6 +476,9 @@ cplusplus.NewDelete (C++)
 """""""""""""""""""""""""
 Check for double-free and use-after-free problems. Traces memory managed by new/delete.
 
+Custom allocation/deallocation functions can be defined using
+:ref:`ownership attributes<analyzer-ownership-attrs>`.
+
 .. literalinclude:: checkers/newdelete_example.cpp
     :language: cpp
 
@@ -485,6 +488,9 @@ cplusplus.NewDeleteLeaks (C++)
 """"""""""""""""""""""""""""""
 Check for memory leaks. Traces memory managed by new/delete.
 
+Custom allocation/deallocation functions can be defined using
+:ref:`ownership attributes<analyzer-ownership-attrs>`.
+
 .. code-block:: cpp
 
  void test() {
@@ -1263,6 +1269,9 @@ You can silence this warning either by bound checking the ``size`` parameter, or
 by explicitly marking the ``size`` parameter as sanitized. See the
 :ref:`optin-taint-GenericTaint` checker for an example.
 
+Custom allocation/deallocation functions can be defined using
+:ref:`ownership attributes<analyzer-ownership-attrs>`.
+
 .. code-block:: c
 
   void vulnerable(void) {
@@ -1857,6 +1866,9 @@ unix.Malloc (C)
 """""""""""""""
 Check for memory leaks, double free, and use-after-free problems. Traces memory managed by malloc()/free().
 
+Custom allocation/deallocation functions can be defined using
+:ref:`ownership attributes<analyzer-ownership-attrs>`.
+
 .. literalinclude:: checkers/unix_malloc_example.c
     :language: c
 
@@ -1866,6 +1878,9 @@ unix.MallocSizeof (C)
 """""""""""""""""""""
 Check for dubious ``malloc`` arguments involving ``sizeof``.
 
+Custom allocation/deallocation functions can be defined using
+:ref:`ownership attributes<analyzer-ownership-attrs>`.
+
 .. code-block:: c
 
  void test() {
@@ -1881,6 +1896,9 @@ unix.MismatchedDeallocator (C, C++)
 """""""""""""""""""""""""""""""""""
 Check for mismatched deallocators.
 
+Custom allocation/deallocation functions can be defined using
+:ref:`ownership attributes<analyzer-ownership-attrs>`.
+
 .. literalinclude:: checkers/mismatched_deallocator_example.cpp
     :language: c
 
diff --git a/clang/include/clang-c/Index.h b/clang/include/clang-c/Index.h
index dfc562d..63d266d 100644
--- a/clang/include/clang-c/Index.h
+++ b/clang/include/clang-c/Index.h
@@ -2198,7 +2198,15 @@ enum CXCursorKind {
    */
   CXCursor_OpenACCShutdownConstruct = 329,
 
-  CXCursor_LastStmt = CXCursor_OpenACCShutdownConstruct,
+  /** OpenACC set Construct.
+   */
+  CXCursor_OpenACCSetConstruct = 330,
+
+  /** OpenACC update Construct.
+   */
+  CXCursor_OpenACCUpdateConstruct = 331,
+
+  CXCursor_LastStmt = CXCursor_OpenACCUpdateConstruct,
 
   /**
    * Cursor that represents the translation unit itself.
diff --git a/clang/include/clang/AST/OpenACCClause.h b/clang/include/clang/AST/OpenACCClause.h
index b4747c6..4e4dd34 100644
--- a/clang/include/clang/AST/OpenACCClause.h
+++ b/clang/include/clang/AST/OpenACCClause.h
@@ -327,18 +327,89 @@ public:
                                  SourceLocation EndLoc);
 };
 
-/// A 'self' clause, which has an optional condition expression.
-class OpenACCSelfClause : public OpenACCClauseWithCondition {
+/// A 'self' clause, which has an optional condition expression, or, in the
+/// event of an 'update' directive, contains a 'VarList'.
+class OpenACCSelfClause final
+    : public OpenACCClauseWithParams,
+      private llvm::TrailingObjects<OpenACCSelfClause, Expr *> {
+  friend TrailingObjects;
+  // Holds whether this HAS a condition expression. Lacks a value if this is NOT
+  // a condition-expr self clause.
+  std::optional<bool> HasConditionExpr;
+  // Holds the number of stored expressions.  In the case of a condition-expr
+  // self clause, this is expected to be ONE (and there to be 1 trailing
+  // object), whether or not that is null.
+  unsigned NumExprs;
+
   OpenACCSelfClause(SourceLocation BeginLoc, SourceLocation LParenLoc,
                     Expr *ConditionExpr, SourceLocation EndLoc);
+  OpenACCSelfClause(SourceLocation BeginLoc, SourceLocation LParenLoc,
+                    ArrayRef<Expr *> VarList, SourceLocation EndLoc);
+
+  // Intentionally internal, meant to be an implementation detail of everything
+  // else. All non-internal uses should go through getConditionExpr/getVarList.
+  llvm::ArrayRef<Expr *> getExprs() const {
+    return {getTrailingObjects<Expr *>(), NumExprs};
+  }
 
 public:
   static bool classof(const OpenACCClause *C) {
     return C->getClauseKind() == OpenACCClauseKind::Self;
   }
+
+  bool isConditionExprClause() const { return HasConditionExpr.has_value(); }
+
+  bool hasConditionExpr() const {
+    assert(HasConditionExpr.has_value() &&
+           "VarList Self Clause asked about condition expression");
+    return *HasConditionExpr;
+  }
+
+  const Expr *getConditionExpr() const {
+    assert(HasConditionExpr.has_value() &&
+           "VarList Self Clause asked about condition expression");
+    assert(getExprs().size() == 1 &&
+           "ConditionExpr Self Clause with too many Exprs");
+    return getExprs()[0];
+  }
+
+  Expr *getConditionExpr() {
+    assert(HasConditionExpr.has_value() &&
+           "VarList Self Clause asked about condition expression");
+    assert(getExprs().size() == 1 &&
+           "ConditionExpr Self Clause with too many Exprs");
+    return getExprs()[0];
+  }
+
+  ArrayRef<Expr *> getVarList() {
+    assert(!HasConditionExpr.has_value() &&
+           "Condition Expr self clause asked about var list");
+    return getExprs();
+  }
+  ArrayRef<Expr *> getVarList() const {
+    assert(!HasConditionExpr.has_value() &&
+           "Condition Expr self clause asked about var list");
+    return getExprs();
+  }
+
+  child_range children() {
+    return child_range(
+        reinterpret_cast<Stmt **>(getTrailingObjects<Expr *>()),
+        reinterpret_cast<Stmt **>(getTrailingObjects<Expr *>() + NumExprs));
+  }
+
+  const_child_range children() const {
+    child_range Children = const_cast<OpenACCSelfClause *>(this)->children();
+    return const_child_range(Children.begin(), Children.end());
+  }
+
   static OpenACCSelfClause *Create(const ASTContext &C, SourceLocation BeginLoc,
                                    SourceLocation LParenLoc,
                                    Expr *ConditionExpr, SourceLocation EndLoc);
+  static OpenACCSelfClause *Create(const ASTContext &C, SourceLocation BeginLoc,
+                                   SourceLocation LParenLoc,
+                                   ArrayRef<Expr *> ConditionExpr,
+                                   SourceLocation EndLoc);
 };
 
 /// Represents a clause that has one or more expressions associated with it.
@@ -633,6 +704,19 @@ public:
                                         SourceLocation EndLoc);
 };
 
+class OpenACCDefaultAsyncClause : public OpenACCClauseWithSingleIntExpr {
+  OpenACCDefaultAsyncClause(SourceLocation BeginLoc, SourceLocation LParenLoc,
+                            Expr *IntExpr, SourceLocation EndLoc);
+
+public:
+  static bool classof(const OpenACCClause *C) {
+    return C->getClauseKind() == OpenACCClauseKind::DefaultAsync;
+  }
+  static OpenACCDefaultAsyncClause *
+  Create(const ASTContext &C, SourceLocation BeginLoc, SourceLocation LParenLoc,
+         Expr *IntExpr, SourceLocation EndLoc);
+};
+
 /// Represents a 'collapse' clause on a 'loop' construct. This clause takes an
 /// integer constant expression 'N' that represents how deep to collapse the
 /// construct. It also takes an optional 'force' tag that permits intervening
diff --git a/clang/include/clang/AST/RecursiveASTVisitor.h b/clang/include/clang/AST/RecursiveASTVisitor.h
index f5b32ed..d500f4e 100644
--- a/clang/include/clang/AST/RecursiveASTVisitor.h
+++ b/clang/include/clang/AST/RecursiveASTVisitor.h
@@ -4080,6 +4080,10 @@ DEF_TRAVERSE_STMT(OpenACCInitConstruct,
                   { TRY_TO(VisitOpenACCClauseList(S->clauses())); })
 DEF_TRAVERSE_STMT(OpenACCShutdownConstruct,
                   { TRY_TO(VisitOpenACCClauseList(S->clauses())); })
+DEF_TRAVERSE_STMT(OpenACCSetConstruct,
+                  { TRY_TO(VisitOpenACCClauseList(S->clauses())); })
+DEF_TRAVERSE_STMT(OpenACCUpdateConstruct,
+                  { TRY_TO(VisitOpenACCClauseList(S->clauses())); })
 
 // Traverse HLSL: Out argument expression
 DEF_TRAVERSE_STMT(HLSLOutArgExpr, {})
diff --git a/clang/include/clang/AST/StmtOpenACC.h b/clang/include/clang/AST/StmtOpenACC.h
index e311ede..ebbee152 100644
--- a/clang/include/clang/AST/StmtOpenACC.h
+++ b/clang/include/clang/AST/StmtOpenACC.h
@@ -672,5 +672,84 @@ public:
          SourceLocation End, ArrayRef<const OpenACCClause *> Clauses);
 };
 
+// This class represents a 'set' construct, which has just a clause list.
+class OpenACCSetConstruct final
+    : public OpenACCConstructStmt,
+      private llvm::TrailingObjects<OpenACCSetConstruct,
+                                    const OpenACCClause *> {
+  friend TrailingObjects;
+  OpenACCSetConstruct(unsigned NumClauses)
+      : OpenACCConstructStmt(OpenACCSetConstructClass,
+                             OpenACCDirectiveKind::Set, SourceLocation{},
+                             SourceLocation{}, SourceLocation{}) {
+    std::uninitialized_value_construct(
+        getTrailingObjects<const OpenACCClause *>(),
+        getTrailingObjects<const OpenACCClause *>() + NumClauses);
+    setClauseList(MutableArrayRef(getTrailingObjects<const OpenACCClause *>(),
+                                  NumClauses));
+  }
+
+  OpenACCSetConstruct(SourceLocation Start, SourceLocation DirectiveLoc,
+                      SourceLocation End,
+                      ArrayRef<const OpenACCClause *> Clauses)
+      : OpenACCConstructStmt(OpenACCSetConstructClass,
+                             OpenACCDirectiveKind::Set, Start, DirectiveLoc,
+                             End) {
+    std::uninitialized_copy(Clauses.begin(), Clauses.end(),
+                            getTrailingObjects<const OpenACCClause *>());
+    setClauseList(MutableArrayRef(getTrailingObjects<const OpenACCClause *>(),
+                                  Clauses.size()));
+  }
+
+public:
+  static bool classof(const Stmt *T) {
+    return T->getStmtClass() == OpenACCSetConstructClass;
+  }
+  static OpenACCSetConstruct *CreateEmpty(const ASTContext &C,
+                                          unsigned NumClauses);
+  static OpenACCSetConstruct *Create(const ASTContext &C, SourceLocation Start,
+                                     SourceLocation DirectiveLoc,
+                                     SourceLocation End,
+                                     ArrayRef<const OpenACCClause *> Clauses);
+};
+// This class represents an 'update' construct, which has just a clause list.
+class OpenACCUpdateConstruct final
+    : public OpenACCConstructStmt,
+      private llvm::TrailingObjects<OpenACCUpdateConstruct,
+                                    const OpenACCClause *> {
+  friend TrailingObjects;
+  OpenACCUpdateConstruct(unsigned NumClauses)
+      : OpenACCConstructStmt(OpenACCUpdateConstructClass,
+                             OpenACCDirectiveKind::Update, SourceLocation{},
+                             SourceLocation{}, SourceLocation{}) {
+    std::uninitialized_value_construct(
+        getTrailingObjects<const OpenACCClause *>(),
+        getTrailingObjects<const OpenACCClause *>() + NumClauses);
+    setClauseList(MutableArrayRef(getTrailingObjects<const OpenACCClause *>(),
+                                  NumClauses));
+  }
+
+  OpenACCUpdateConstruct(SourceLocation Start, SourceLocation DirectiveLoc,
+                         SourceLocation End,
+                         ArrayRef<const OpenACCClause *> Clauses)
+      : OpenACCConstructStmt(OpenACCUpdateConstructClass,
+                             OpenACCDirectiveKind::Update, Start, DirectiveLoc,
+                             End) {
+    std::uninitialized_copy(Clauses.begin(), Clauses.end(),
+                            getTrailingObjects<const OpenACCClause *>());
+    setClauseList(MutableArrayRef(getTrailingObjects<const OpenACCClause *>(),
+                                  Clauses.size()));
+  }
+
+public:
+  static bool classof(const Stmt *T) {
+    return T->getStmtClass() == OpenACCUpdateConstructClass;
+  }
+  static OpenACCUpdateConstruct *CreateEmpty(const ASTContext &C,
+                                             unsigned NumClauses);
+  static OpenACCUpdateConstruct *
+  Create(const ASTContext &C, SourceLocation Start, SourceLocation DirectiveLoc,
+         SourceLocation End, ArrayRef<const OpenACCClause *> Clauses);
+};
 } // namespace clang
 #endif // LLVM_CLANG_AST_STMTOPENACC_H
diff --git a/clang/include/clang/AST/TextNodeDumper.h b/clang/include/clang/AST/TextNodeDumper.h
index 5383b53..4aaae48 100644
--- a/clang/include/clang/AST/TextNodeDumper.h
+++ b/clang/include/clang/AST/TextNodeDumper.h
@@ -417,7 +417,9 @@ public:
   void VisitOpenACCHostDataConstruct(const OpenACCHostDataConstruct *S);
   void VisitOpenACCWaitConstruct(const OpenACCWaitConstruct *S);
   void VisitOpenACCInitConstruct(const OpenACCInitConstruct *S);
+  void VisitOpenACCSetConstruct(const OpenACCSetConstruct *S);
   void VisitOpenACCShutdownConstruct(const OpenACCShutdownConstruct *S);
+  void VisitOpenACCUpdateConstruct(const OpenACCUpdateConstruct *S);
   void VisitOpenACCAsteriskSizeExpr(const OpenACCAsteriskSizeExpr *S);
   void VisitEmbedExpr(const EmbedExpr *S);
   void VisitAtomicExpr(const AtomicExpr *AE);
diff --git a/clang/include/clang/ASTMatchers/ASTMatchers.h b/clang/include/clang/ASTMatchers/ASTMatchers.h
index 897aa25..f32170c 100644
--- a/clang/include/clang/ASTMatchers/ASTMatchers.h
+++ b/clang/include/clang/ASTMatchers/ASTMatchers.h
@@ -2125,6 +2125,16 @@ extern const internal::VariadicDynCastAllOfMatcher<Stmt, Expr> expr;
 extern const internal::VariadicDynCastAllOfMatcher<Stmt, DeclRefExpr>
     declRefExpr;
 
+/// Matches expressions that refer to dependent scope declarations.
+///
+/// example matches T::v;
+/// \code
+///  template <class T> class X : T { void f() { T::v; } };
+/// \endcode
+extern const internal::VariadicDynCastAllOfMatcher<Stmt,
+                                                   DependentScopeDeclRefExpr>
+    dependentScopeDeclRefExpr;
+
 /// Matches a reference to an ObjCIvar.
 ///
 /// Example: matches "a" in "init" method:
@@ -3247,6 +3257,29 @@ AST_MATCHER_P(CXXDependentScopeMemberExpr, memberHasSameNameAsBoundNode,
       });
 }
 
+/// Matches the dependent name of a DependentScopeDeclRefExpr or
+/// DependentNameType
+///
+/// Given:
+/// \code
+///  template <class T> class X : T { void f() { T::v; } };
+/// \endcode
+/// \c dependentScopeDeclRefExpr(hasDependentName("v")) matches `T::v`
+///
+/// Given:
+/// \code
+///  template <typename T> struct declToImport {
+///    typedef typename T::type dependent_name;
+///  };
+/// \endcode
+/// \c dependentNameType(hasDependentName("type")) matches `T::type`
+AST_POLYMORPHIC_MATCHER_P(hasDependentName,
+                          AST_POLYMORPHIC_SUPPORTED_TYPES(
+                              DependentScopeDeclRefExpr, DependentNameType),
+                          std::string, N) {
+  return internal::getDependentName(Node) == N;
+}
+
 /// Matches C++ classes that are directly or indirectly derived from a class
 /// matching \c Base, or Objective-C classes that directly or indirectly
 /// subclass a class matching \c Base.
@@ -7701,6 +7734,28 @@ AST_MATCHER_P(DecayedType, hasDecayedType, internal::Matcher<QualType>,
   return InnerType.matches(Node.getDecayedType(), Finder, Builder);
 }
 
+/// Matches a dependent name type
+///
+/// Example matches T::type
+/// \code
+///  template <typename T> struct declToImport {
+///    typedef typename T::type dependent_name;
+///  };
+/// \endcode
+extern const AstTypeMatcher<DependentNameType> dependentNameType;
+
+/// Matches a dependent template specialization type
+///
+/// Example matches A<T>::template B<T>
+/// \code
+///   template<typename T> struct A;
+///   template<typename T> struct declToImport {
+///     typename A<T>::template B<T> a;
+///   };
+/// \endcode
+extern const AstTypeMatcher<DependentTemplateSpecializationType>
+    dependentTemplateSpecializationType;
+
 /// Matches declarations whose declaration context, interpreted as a
 /// Decl, matches \c InnerMatcher.
 ///
diff --git a/clang/include/clang/ASTMatchers/ASTMatchersInternal.h b/clang/include/clang/ASTMatchers/ASTMatchersInternal.h
index 04804d5..1f7b5e7 100644
--- a/clang/include/clang/ASTMatchers/ASTMatchersInternal.h
+++ b/clang/include/clang/ASTMatchers/ASTMatchersInternal.h
@@ -2343,6 +2343,14 @@ MatchTemplateArgLocAt(const TemplateSpecializationTypeLoc &Node,
          InnerMatcher.matches(Node.getArgLoc(Index), Finder, Builder);
 }
 
+inline std::string getDependentName(const DependentScopeDeclRefExpr &node) {
+  return node.getDeclName().getAsString();
+}
+
+inline std::string getDependentName(const DependentNameType &node) {
+  return node.getIdentifier()->getName().str();
+}
+
 } // namespace internal
 
 } // namespace ast_matchers
diff --git a/clang/include/clang/Analysis/FlowSensitive/CachedConstAccessorsLattice.h b/clang/include/clang/Analysis/FlowSensitive/CachedConstAccessorsLattice.h
index 48c5287..aaf89f4 100644
--- a/clang/include/clang/Analysis/FlowSensitive/CachedConstAccessorsLattice.h
+++ b/clang/include/clang/Analysis/FlowSensitive/CachedConstAccessorsLattice.h
@@ -13,7 +13,9 @@
 #ifndef LLVM_CLANG_ANALYSIS_FLOWSENSITIVE_CACHED_CONST_ACCESSORS_LATTICE_H
 #define LLVM_CLANG_ANALYSIS_FLOWSENSITIVE_CACHED_CONST_ACCESSORS_LATTICE_H
 
+#include "clang/AST/Decl.h"
 #include "clang/AST/Expr.h"
+#include "clang/AST/Type.h"
 #include "clang/Analysis/FlowSensitive/DataflowEnvironment.h"
 #include "clang/Analysis/FlowSensitive/DataflowLattice.h"
 #include "clang/Analysis/FlowSensitive/StorageLocation.h"
@@ -71,10 +73,27 @@ public:
   /// Requirements:
   ///
   ///  - `CE` should return a location (GLValue or a record type).
+  ///
+  /// DEPRECATED: switch users to the below overload which takes Callee and Type
+  /// directly.
   StorageLocation *getOrCreateConstMethodReturnStorageLocation(
       const RecordStorageLocation &RecordLoc, const CallExpr *CE,
       Environment &Env, llvm::function_ref<void(StorageLocation &)> Initialize);
 
+  /// Creates or returns a previously created `StorageLocation` associated with
+  /// a const method call `obj.getFoo()` where `RecordLoc` is the
+  /// `RecordStorageLocation` of `obj`, `Callee` is the decl for `getFoo`.
+  ///
+  /// The callback `Initialize` runs on the storage location if newly created.
+  ///
+  /// Requirements:
+  ///
+  ///  - `Callee` should return a location (return type is a reference type or a
+  ///     record type).
+  StorageLocation &getOrCreateConstMethodReturnStorageLocation(
+      const RecordStorageLocation &RecordLoc, const FunctionDecl *Callee,
+      Environment &Env, llvm::function_ref<void(StorageLocation &)> Initialize);
+
   void clearConstMethodReturnValues(const RecordStorageLocation &RecordLoc) {
     ConstMethodReturnValues.erase(&RecordLoc);
   }
@@ -212,6 +231,27 @@ CachedConstAccessorsLattice<Base>::getOrCreateConstMethodReturnStorageLocation(
   return &Loc;
 }
 
+template <typename Base>
+StorageLocation &
+CachedConstAccessorsLattice<Base>::getOrCreateConstMethodReturnStorageLocation(
+    const RecordStorageLocation &RecordLoc, const FunctionDecl *Callee,
+    Environment &Env, llvm::function_ref<void(StorageLocation &)> Initialize) {
+  assert(Callee != nullptr);
+  QualType Type = Callee->getReturnType();
+  assert(!Type.isNull());
+  assert(Type->isReferenceType() || Type->isRecordType());
+  auto &ObjMap = ConstMethodReturnStorageLocations[&RecordLoc];
+  auto it = ObjMap.find(Callee);
+  if (it != ObjMap.end())
+    return *it->second;
+
+  StorageLocation &Loc = Env.createStorageLocation(Type.getNonReferenceType());
+  Initialize(Loc);
+
+  ObjMap.insert({Callee, &Loc});
+  return Loc;
+}
+
 } // namespace dataflow
 } // namespace clang
 
diff --git a/clang/include/clang/Analysis/FlowSensitive/Models/UncheckedOptionalAccessModel.h b/clang/include/clang/Analysis/FlowSensitive/Models/UncheckedOptionalAccessModel.h
index 7134941..fb11c2e 100644
--- a/clang/include/clang/Analysis/FlowSensitive/Models/UncheckedOptionalAccessModel.h
+++ b/clang/include/clang/Analysis/FlowSensitive/Models/UncheckedOptionalAccessModel.h
@@ -37,14 +37,13 @@ struct UncheckedOptionalAccessModelOptions {
   /// can't identify when their results are used safely (across calls),
   /// resulting in false positives in all such cases. Note: this option does not
   /// cover access through `operator[]`.
-  /// FIXME: we currently cache and equate the result of const accessors
-  /// returning pointers, so cover the case of operator-> followed by
-  /// operator->, which covers the common case of smart pointers. We also cover
-  /// some limited cases of returning references (if return type is an optional
-  /// type), so cover some cases of operator* followed by operator*. We don't
-  /// cover mixing operator-> and operator*. Once we are confident in this const
-  /// accessor caching, we shouldn't need the IgnoreSmartPointerDereference
-  /// option anymore.
+  ///
+  /// FIXME: we now cache and equate the result of const accessors
+  /// that look like unique_ptr, have both `->` (returning a pointer type) and
+  /// `*` (returning a reference type). This includes mixing `->` and
+  /// `*` in a sequence of calls as long as the object is not modified. Once we
+  /// are confident in this const accessor caching, we shouldn't need the
+  /// IgnoreSmartPointerDereference option anymore.
   bool IgnoreSmartPointerDereference = false;
 };
 
diff --git a/clang/include/clang/Analysis/FlowSensitive/SmartPointerAccessorCaching.h b/clang/include/clang/Analysis/FlowSensitive/SmartPointerAccessorCaching.h
index 3e40165..1b116a0 100644
--- a/clang/include/clang/Analysis/FlowSensitive/SmartPointerAccessorCaching.h
+++ b/clang/include/clang/Analysis/FlowSensitive/SmartPointerAccessorCaching.h
@@ -27,8 +27,13 @@
 #include <cassert>
 
 #include "clang/AST/Decl.h"
+#include "clang/AST/Expr.h"
 #include "clang/AST/Stmt.h"
 #include "clang/ASTMatchers/ASTMatchers.h"
+#include "clang/Analysis/FlowSensitive/MatchSwitch.h"
+#include "clang/Analysis/FlowSensitive/StorageLocation.h"
+#include "clang/Analysis/FlowSensitive/Value.h"
+#include "llvm/ADT/STLFunctionalExtras.h"
 
 namespace clang::dataflow {
 
@@ -58,6 +63,107 @@ ast_matchers::StatementMatcher isSmartPointerLikeOperatorArrow();
 ast_matchers::StatementMatcher isSmartPointerLikeValueMethodCall();
 ast_matchers::StatementMatcher isSmartPointerLikeGetMethodCall();
 
+// Common transfer functions.
+
+/// Returns the "canonical" callee for smart pointer operators (`*` and `->`)
+/// as a key for caching.
+///
+/// We choose `*` as the canonical one, since it needs a
+/// StorageLocation anyway.
+///
+/// Note: there may be multiple `operator*` (one const, one non-const).
+/// We pick the const one, which the above provided matchers require to exist.
+const FunctionDecl *
+getCanonicalSmartPointerLikeOperatorCallee(const CallExpr *CE);
+
+/// A transfer function for `operator*` (and `value`) calls that can be
+/// cached. Runs the `InitializeLoc` callback to initialize any new
+/// StorageLocations.
+///
+/// Requirements:
+///
+/// - LatticeT should use the `CachedConstAccessorsLattice` mixin.
+template <typename LatticeT>
+void transferSmartPointerLikeCachedDeref(
+    const CallExpr *DerefExpr, RecordStorageLocation *SmartPointerLoc,
+    TransferState<LatticeT> &State,
+    llvm::function_ref<void(StorageLocation &)> InitializeLoc);
+
+/// A transfer function for `operator->` (and `get`) calls that can be cached.
+/// Runs the `InitializeLoc` callback to initialize any new StorageLocations.
+///
+/// Requirements:
+///
+/// - LatticeT should use the `CachedConstAccessorsLattice` mixin.
+template <typename LatticeT>
+void transferSmartPointerLikeCachedGet(
+    const CallExpr *GetExpr, RecordStorageLocation *SmartPointerLoc,
+    TransferState<LatticeT> &State,
+    llvm::function_ref<void(StorageLocation &)> InitializeLoc);
+
+template <typename LatticeT>
+void transferSmartPointerLikeCachedDeref(
+    const CallExpr *DerefExpr, RecordStorageLocation *SmartPointerLoc,
+    TransferState<LatticeT> &State,
+    llvm::function_ref<void(StorageLocation &)> InitializeLoc) {
+  if (State.Env.getStorageLocation(*DerefExpr) != nullptr)
+    return;
+  if (SmartPointerLoc == nullptr)
+    return;
+
+  const FunctionDecl *Callee = DerefExpr->getDirectCallee();
+  if (Callee == nullptr)
+    return;
+  const FunctionDecl *CanonicalCallee =
+      getCanonicalSmartPointerLikeOperatorCallee(DerefExpr);
+  // This shouldn't happen, as we should at least find `Callee` itself.
+  assert(CanonicalCallee != nullptr);
+  if (CanonicalCallee != Callee) {
+    // When using the provided matchers, we should always get a reference to
+    // the same type.
+    assert(CanonicalCallee->getReturnType()->isReferenceType() &&
+           Callee->getReturnType()->isReferenceType());
+    assert(CanonicalCallee->getReturnType()
+               .getNonReferenceType()
+               ->getCanonicalTypeUnqualified() ==
+           Callee->getReturnType()
+               .getNonReferenceType()
+               ->getCanonicalTypeUnqualified());
+  }
+
+  StorageLocation &LocForValue =
+      State.Lattice.getOrCreateConstMethodReturnStorageLocation(
+          *SmartPointerLoc, CanonicalCallee, State.Env, InitializeLoc);
+  State.Env.setStorageLocation(*DerefExpr, LocForValue);
+}
+
+template <typename LatticeT>
+void transferSmartPointerLikeCachedGet(
+    const CallExpr *GetExpr, RecordStorageLocation *SmartPointerLoc,
+    TransferState<LatticeT> &State,
+    llvm::function_ref<void(StorageLocation &)> InitializeLoc) {
+  if (SmartPointerLoc == nullptr)
+    return;
+
+  const FunctionDecl *CanonicalCallee =
+      getCanonicalSmartPointerLikeOperatorCallee(GetExpr);
+
+  if (CanonicalCallee != nullptr) {
+    auto &LocForValue =
+        State.Lattice.getOrCreateConstMethodReturnStorageLocation(
+            *SmartPointerLoc, CanonicalCallee, State.Env, InitializeLoc);
+    State.Env.setValue(*GetExpr,
+                       State.Env.template create<PointerValue>(LocForValue));
+  } else {
+    // Otherwise, just cache the pointer value as if it was a const accessor.
+    Value *Val = State.Lattice.getOrCreateConstMethodReturnValue(
+        *SmartPointerLoc, GetExpr, State.Env);
+    if (Val == nullptr)
+      return;
+    State.Env.setValue(*GetExpr, *Val);
+  }
+}
+
 } // namespace clang::dataflow
 
 #endif // LLVM_CLANG_ANALYSIS_FLOWSENSITIVE_SMARTPOINTERACCESSORCACHING_H
diff --git a/clang/include/clang/Basic/Attr.td b/clang/include/clang/Basic/Attr.td
index 52ad72e..12faf06 100644
--- a/clang/include/clang/Basic/Attr.td
+++ b/clang/include/clang/Basic/Attr.td
@@ -2776,7 +2776,7 @@ def Ownership : InheritableAttr {
   let Args = [IdentifierArgument<"Module">,
               VariadicParamIdxArgument<"Args">];
   let Subjects = SubjectList<[HasFunctionProto]>;
-  let Documentation = [Undocumented];
+  let Documentation = [OwnershipDocs];
 }
 
 def Packed : InheritableAttr {
diff --git a/clang/include/clang/Basic/AttrDocs.td b/clang/include/clang/Basic/AttrDocs.td
index fdad4c9..b8d702e 100644
--- a/clang/include/clang/Basic/AttrDocs.td
+++ b/clang/include/clang/Basic/AttrDocs.td
@@ -1389,6 +1389,82 @@ Query for this attribute with ``__has_attribute(overloadable)``.
   }];
 }
 
+def OwnershipDocs : Documentation {
+  let Heading = "ownership_holds, ownership_returns, ownership_takes (Clang "
+                "Static Analyzer)";
+  let Category = DocCatFunction;
+  let Label = "analyzer-ownership-attrs";
+  let Content = [{
+
+.. note::
+
+  In order for the Clang Static Analyzer to acknowledge these attributes, the
+  ``Optimistic`` config needs to be set to true for the checker
+  ``unix.DynamicMemoryModeling``:
+
+  ``-Xclang -analyzer-config -Xclang unix.DynamicMemoryModeling:Optimistic=true``
+
+These attributes are used by the Clang Static Analyzer's dynamic memory modeling
+facilities to mark custom allocating/deallocating functions.
+
+All 3 attributes' first parameter of type string is the type of the allocation:
+``malloc``, ``new``, etc. to allow for catching :ref:`mismatched deallocation
+<unix-MismatchedDeallocator>` bugs. The allocation type can be any string, e.g.
+a function annotated with
+returning a piece of memory of type ``lasagna`` but freed with a function
+annotated to release ``cheese`` typed memory will result in mismatched
+deallocation warning.
+
+The (currently) only allocation type having special meaning is ``malloc`` --
+the Clang Static Analyzer makes sure that allocating functions annotated with
+``malloc`` are treated like they used the standard ``malloc()``, and can be
+safely deallocated with the standard ``free()``.
+
+* Use ``ownership_returns`` to mark a function as an allocating function. Takes
+  1 parameter to denote the allocation type.
+* Use ``ownership_takes`` to mark a function as a deallocating function. Takes 2
+  parameters: the allocation type, and the index of the parameter that is being
+  deallocated (counting from 1).
+* Use ``ownership_holds`` to mark that a function takes over the ownership of a
+  piece of memory and will free it at some unspecified point in the future. Like
+  ``ownership_takes``, this takes 2 parameters: the allocation type, and the
+  index of the parameter whose ownership will be taken over (counting from 1).
+
+The annotations ``ownership_takes`` and ``ownership_holds`` both prevent memory
+leak reports (concerning the specified argument); the difference between them
+is that using taken memory is a use-after-free error, while using held memory
+is assumed to be legitimate.
+
+Example:
+
+.. code-block:: c
+
+  // Denotes that my_malloc will return with a dynamically allocated piece of
+  // memory using malloc().
+  void __attribute((ownership_returns(malloc))) *my_malloc(size_t);
+
+  // Denotes that my_free will deallocate its parameter using free().
+  void __attribute((ownership_takes(malloc, 1))) my_free(void *);
+
+  // Denotes that my_hold will take over the ownership of its parameter that was
+  // allocated via malloc().
+  void __attribute((ownership_holds(malloc, 1))) my_hold(void *);
+
+Further reading about dynamic memory modeling in the Clang Static Analyzer is
+found in these checker docs:
+:ref:`unix.Malloc <unix-Malloc>`, :ref:`unix.MallocSizeof <unix-MallocSizeof>`,
+:ref:`unix.MismatchedDeallocator <unix-MismatchedDeallocator>`,
+:ref:`cplusplus.NewDelete <cplusplus-NewDelete>`,
+:ref:`cplusplus.NewDeleteLeaks <cplusplus-NewDeleteLeaks>`,
+:ref:`optin.taint.TaintedAlloc <optin-taint-TaintedAlloc>`.
+Mind that many more checkers are affected by dynamic memory modeling changes to
+some extent.
+
+Further reading for other annotations:
+`Source Annotations in the Clang Static Analyzer <https://clang-analyzer.llvm.org/annotations.html>`_.
+  }];
+}
+
 def ObjCMethodFamilyDocs : Documentation {
   let Category = DocCatFunction;
   let Content = [{
diff --git a/clang/include/clang/Basic/Builtins.h b/clang/include/clang/Basic/Builtins.h
index e27d8cc..63559d9 100644
--- a/clang/include/clang/Basic/Builtins.h
+++ b/clang/include/clang/Basic/Builtins.h
@@ -102,6 +102,9 @@ public:
   /// e.g. "__builtin_abs".
   llvm::StringRef getName(unsigned ID) const { return getRecord(ID).Name; }
 
+  /// Return a quoted name for the specified builtin for use in diagnostics.
+  std::string getQuotedName(unsigned ID) const;
+
   /// Get the type descriptor string for the specified builtin.
   const char *getTypeString(unsigned ID) const { return getRecord(ID).Type; }
 
diff --git a/clang/include/clang/Basic/Builtins.td b/clang/include/clang/Basic/Builtins.td
index b5b47ae..468c160 100644
--- a/clang/include/clang/Basic/Builtins.td
+++ b/clang/include/clang/Basic/Builtins.td
@@ -3568,6 +3568,19 @@ def Frexp : FPMathTemplate, LibBuiltin<"math.h"> {
   let AddBuiltinPrefixedAlias = 1;
 }
 
+def Sincos : FPMathTemplate, GNULibBuiltin<"math.h"> {
+  let Spellings = ["sincos"];
+  let Attributes = [NoThrow];
+  let Prototype = "void(T, T*, T*)";
+  let AddBuiltinPrefixedAlias = 1;
+}
+
+def SincosF16F128 : F16F128MathTemplate, Builtin {
+  let Spellings = ["__builtin_sincos"];
+  let Attributes = [FunctionWithBuiltinPrefix, NoThrow];
+  let Prototype = "void(T, T*, T*)";
+}
+
 def Ldexp : FPMathTemplate, LibBuiltin<"math.h"> {
   let Spellings = ["ldexp"];
   let Attributes = [NoThrow, ConstIgnoringErrnoAndExceptions];
diff --git a/clang/include/clang/Basic/BuiltinsBase.td b/clang/include/clang/Basic/BuiltinsBase.td
index cff182f..1a1096d 100644
--- a/clang/include/clang/Basic/BuiltinsBase.td
+++ b/clang/include/clang/Basic/BuiltinsBase.td
@@ -88,6 +88,8 @@ class Builtin {
   // On some platforms, some functions are actually macros. In that case we need
   // to #undef them.
   bit RequiresUndef = 0;
+  // Enables builtins to generate `long long` outside of OpenCL and `long` inside.
+  bit EnableOpenCLLong = 0;
 }
 
 class CustomEntry {
@@ -95,9 +97,6 @@ class CustomEntry {
 }
 
 class AtomicBuiltin : Builtin;
-class TargetBuiltin : Builtin {
-  string Features = "";
-}
 
 class LibBuiltin<string header, string languages = "ALL_LANGUAGES"> : Builtin {
   string Header = header;
@@ -122,6 +121,14 @@ class OCL_DSELangBuiltin : LangBuiltin<"OCL_DSE">;
 class OCL_GASLangBuiltin : LangBuiltin<"OCL_GAS">;
 class OCLLangBuiltin : LangBuiltin<"ALL_OCL_LANGUAGES">;
 
+class TargetBuiltin : Builtin {
+  string Features = "";
+}
+class TargetLibBuiltin : TargetBuiltin {
+  string Header;
+  string Languages = "ALL_LANGUAGES";
+}
+
 class Template<list<string> substitutions,
                list<string> affixes,
                bit as_prefix = 0> {
diff --git a/clang/include/clang/Basic/BuiltinsSPIRV.td b/clang/include/clang/Basic/BuiltinsSPIRV.td
new file mode 100644
index 0000000..1e66939
--- /dev/null
+++ b/clang/include/clang/Basic/BuiltinsSPIRV.td
@@ -0,0 +1,15 @@
+//===--- BuiltinsSPIRV.td - SPIRV Builtin function database ---------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+include "clang/Basic/BuiltinsBase.td"
+
+def SPIRVDistance : Builtin {
+  let Spellings = ["__builtin_spirv_distance"];
+  let Attributes = [NoThrow, Const];
+  let Prototype = "void(...)";
+}
diff --git a/clang/include/clang/Basic/BuiltinsX86.def b/clang/include/clang/Basic/BuiltinsX86.def
deleted file mode 100644
index 352b3a9..0000000
--- a/clang/include/clang/Basic/BuiltinsX86.def
+++ /dev/null
@@ -1,2225 +0,0 @@
-//===--- BuiltinsX86.def - X86 Builtin function database --------*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file defines the X86-specific builtin function database.  Users of
-// this file must define the BUILTIN macro to make use of this information.
-//
-//===----------------------------------------------------------------------===//
-
-// The format of this database matches clang/Basic/Builtins.def.
-
-// FIXME: Ideally we would be able to pull this information from what
-// LLVM already knows about X86 builtins. We need to match the LLVM
-// definition anyway, since code generation will lower to the
-// intrinsic if one exists.
-
-#if defined(BUILTIN) && !defined(TARGET_BUILTIN)
-#   define TARGET_BUILTIN(ID, TYPE, ATTRS, FEATURE) BUILTIN(ID, TYPE, ATTRS)
-#endif
-
-#if defined(BUILTIN) && !defined(TARGET_HEADER_BUILTIN)
-#  define TARGET_HEADER_BUILTIN(ID, TYPE, ATTRS, HEADER, LANG, FEATURE) BUILTIN(ID, TYPE, ATTRS)
-#endif
-
-// MMX
-//
-// All MMX instructions will be generated via builtins. Any MMX vector
-// types (<1 x i64>, <2 x i32>, etc.) that aren't used by these builtins will be
-// expanded by the back-end.
-// FIXME: _mm_prefetch must be a built-in because it takes a compile-time constant
-// argument and our prior approach of using a #define to the current built-in
-// doesn't work in the presence of re-declaration of _mm_prefetch for windows.
-TARGET_BUILTIN(_mm_prefetch, "vcC*i", "nc", "mmx")
-
-// SSE intrinsics.
-
-TARGET_BUILTIN(__builtin_ia32_ldmxcsr, "vUi", "n", "sse")
-TARGET_HEADER_BUILTIN(_mm_setcsr, "vUi", "nh",XMMINTRIN_H, ALL_LANGUAGES, "sse")
-TARGET_BUILTIN(__builtin_ia32_stmxcsr, "Ui", "n", "sse")
-TARGET_HEADER_BUILTIN(_mm_getcsr, "Ui", "nh", XMMINTRIN_H, ALL_LANGUAGES, "sse")
-TARGET_BUILTIN(__builtin_ia32_cvtss2si, "iV4f", "ncV:128:", "sse")
-TARGET_BUILTIN(__builtin_ia32_cvttss2si, "iV4f", "ncV:128:", "sse")
-TARGET_BUILTIN(__builtin_ia32_movmskps, "iV4f", "nV:128:", "sse")
-TARGET_BUILTIN(__builtin_ia32_sfence, "v", "n", "sse")
-TARGET_HEADER_BUILTIN(_mm_sfence, "v", "nh", XMMINTRIN_H, ALL_LANGUAGES, "sse")
-TARGET_BUILTIN(__builtin_ia32_rcpps, "V4fV4f", "ncV:128:", "sse")
-TARGET_BUILTIN(__builtin_ia32_rcpss, "V4fV4f", "ncV:128:", "sse")
-TARGET_BUILTIN(__builtin_ia32_rsqrtps, "V4fV4f", "ncV:128:", "sse")
-TARGET_BUILTIN(__builtin_ia32_rsqrtss, "V4fV4f", "ncV:128:", "sse")
-TARGET_BUILTIN(__builtin_ia32_sqrtps, "V4fV4f", "ncV:128:", "sse")
-TARGET_BUILTIN(__builtin_ia32_sqrtss, "V4fV4f", "ncV:128:", "sse")
-TARGET_BUILTIN(__builtin_ia32_shufps, "V4fV4fV4fIi", "ncV:128:", "sse")
-
-TARGET_BUILTIN(__builtin_ia32_maskmovdqu, "vV16cV16cc*", "nV:128:", "sse2")
-TARGET_BUILTIN(__builtin_ia32_movmskpd, "iV2d", "ncV:128:", "sse2")
-TARGET_BUILTIN(__builtin_ia32_pmovmskb128, "iV16c", "ncV:128:", "sse2")
-TARGET_BUILTIN(__builtin_ia32_movnti, "vi*i", "n", "sse2")
-TARGET_BUILTIN(__builtin_ia32_pshufd, "V4iV4iIi", "ncV:128:", "sse2")
-TARGET_BUILTIN(__builtin_ia32_pshuflw, "V8sV8sIi", "ncV:128:", "sse2")
-TARGET_BUILTIN(__builtin_ia32_pshufhw, "V8sV8sIi", "ncV:128:", "sse2")
-TARGET_BUILTIN(__builtin_ia32_psadbw128, "V2OiV16cV16c", "ncV:128:", "sse2")
-TARGET_BUILTIN(__builtin_ia32_sqrtpd, "V2dV2d", "ncV:128:", "sse2")
-TARGET_BUILTIN(__builtin_ia32_sqrtsd, "V2dV2d", "ncV:128:", "sse2")
-TARGET_BUILTIN(__builtin_ia32_shufpd, "V2dV2dV2dIi", "ncV:128:", "sse2")
-TARGET_BUILTIN(__builtin_ia32_cvtpd2dq, "V2OiV2d", "ncV:128:", "sse2")
-TARGET_BUILTIN(__builtin_ia32_cvtpd2ps, "V4fV2d", "ncV:128:", "sse2")
-TARGET_BUILTIN(__builtin_ia32_cvttpd2dq, "V4iV2d", "ncV:128:", "sse2")
-TARGET_BUILTIN(__builtin_ia32_cvtsd2si, "iV2d", "ncV:128:", "sse2")
-TARGET_BUILTIN(__builtin_ia32_cvttsd2si, "iV2d", "ncV:128:", "sse2")
-TARGET_BUILTIN(__builtin_ia32_cvtsd2ss, "V4fV4fV2d", "ncV:128:", "sse2")
-TARGET_BUILTIN(__builtin_ia32_cvtps2dq, "V4iV4f", "ncV:128:", "sse2")
-TARGET_BUILTIN(__builtin_ia32_cvttps2dq, "V4iV4f", "ncV:128:", "sse2")
-TARGET_BUILTIN(__builtin_ia32_clflush, "vvC*", "n", "sse2")
-TARGET_HEADER_BUILTIN(_mm_clflush, "vvC*", "nh", EMMINTRIN_H, ALL_LANGUAGES, "sse2")
-TARGET_BUILTIN(__builtin_ia32_lfence, "v", "n", "sse2")
-TARGET_HEADER_BUILTIN(_mm_lfence, "v", "nh", EMMINTRIN_H, ALL_LANGUAGES, "sse2")
-TARGET_BUILTIN(__builtin_ia32_mfence, "v", "n", "sse2")
-TARGET_HEADER_BUILTIN(_mm_mfence, "v", "nh", EMMINTRIN_H, ALL_LANGUAGES, "sse2")
-TARGET_BUILTIN(__builtin_ia32_pause, "v", "n", "")
-TARGET_HEADER_BUILTIN(_mm_pause, "v", "nh", EMMINTRIN_H, ALL_LANGUAGES, "")
-TARGET_BUILTIN(__builtin_ia32_pmuludq128, "V2OiV4iV4i", "ncV:128:", "sse2")
-TARGET_BUILTIN(__builtin_ia32_psraw128, "V8sV8sV8s", "ncV:128:", "sse2")
-TARGET_BUILTIN(__builtin_ia32_psrad128, "V4iV4iV4i", "ncV:128:", "sse2")
-TARGET_BUILTIN(__builtin_ia32_psrlw128, "V8sV8sV8s", "ncV:128:", "sse2")
-TARGET_BUILTIN(__builtin_ia32_psrld128, "V4iV4iV4i", "ncV:128:", "sse2")
-TARGET_BUILTIN(__builtin_ia32_psrlq128, "V2OiV2OiV2Oi", "ncV:128:", "sse2")
-TARGET_BUILTIN(__builtin_ia32_psllw128, "V8sV8sV8s", "ncV:128:", "sse2")
-TARGET_BUILTIN(__builtin_ia32_pslld128, "V4iV4iV4i", "ncV:128:", "sse2")
-TARGET_BUILTIN(__builtin_ia32_psllq128, "V2OiV2OiV2Oi", "ncV:128:", "sse2")
-TARGET_BUILTIN(__builtin_ia32_psllwi128, "V8sV8si", "ncV:128:", "sse2")
-TARGET_BUILTIN(__builtin_ia32_pslldi128, "V4iV4ii", "ncV:128:", "sse2")
-TARGET_BUILTIN(__builtin_ia32_psllqi128, "V2OiV2Oii", "ncV:128:", "sse2")
-TARGET_BUILTIN(__builtin_ia32_psrlwi128, "V8sV8si", "ncV:128:", "sse2")
-TARGET_BUILTIN(__builtin_ia32_psrldi128, "V4iV4ii", "ncV:128:", "sse2")
-TARGET_BUILTIN(__builtin_ia32_psrlqi128, "V2OiV2Oii", "ncV:128:", "sse2")
-TARGET_BUILTIN(__builtin_ia32_psrawi128, "V8sV8si", "ncV:128:", "sse2")
-TARGET_BUILTIN(__builtin_ia32_psradi128, "V4iV4ii", "ncV:128:", "sse2")
-TARGET_BUILTIN(__builtin_ia32_pmaddwd128, "V4iV8sV8s", "ncV:128:", "sse2")
-TARGET_BUILTIN(__builtin_ia32_pslldqi128_byteshift, "V2OiV2OiIi", "ncV:128:", "sse2")
-TARGET_BUILTIN(__builtin_ia32_psrldqi128_byteshift, "V2OiV2OiIi", "ncV:128:", "sse2")
-
-TARGET_BUILTIN(__builtin_ia32_monitor, "vvC*UiUi", "n", "sse3")
-TARGET_BUILTIN(__builtin_ia32_mwait, "vUiUi", "n", "sse3")
-TARGET_BUILTIN(__builtin_ia32_lddqu, "V16ccC*", "nV:128:", "sse3")
-
-TARGET_BUILTIN(__builtin_ia32_palignr128, "V16cV16cV16cIi", "ncV:128:", "ssse3")
-
-TARGET_BUILTIN(__builtin_ia32_insertps128, "V4fV4fV4fIc", "ncV:128:", "sse4.1")
-TARGET_BUILTIN(__builtin_ia32_pblendvb128, "V16cV16cV16cV16c", "ncV:128:", "sse4.1")
-TARGET_BUILTIN(__builtin_ia32_pblendw128, "V8sV8sV8sIi", "ncV:128:", "sse4.1")
-TARGET_BUILTIN(__builtin_ia32_blendpd, "V2dV2dV2dIi", "ncV:128:", "sse4.1")
-TARGET_BUILTIN(__builtin_ia32_blendps, "V4fV4fV4fIi", "ncV:128:", "sse4.1")
-TARGET_BUILTIN(__builtin_ia32_blendvpd, "V2dV2dV2dV2d", "ncV:128:", "sse4.1")
-TARGET_BUILTIN(__builtin_ia32_blendvps, "V4fV4fV4fV4f", "ncV:128:", "sse4.1")
-TARGET_BUILTIN(__builtin_ia32_packusdw128, "V8sV4iV4i", "ncV:128:", "sse4.1")
-
-TARGET_BUILTIN(__builtin_ia32_pmuldq128, "V2OiV4iV4i", "ncV:128:", "sse4.1")
-TARGET_BUILTIN(__builtin_ia32_roundps, "V4fV4fIi", "ncV:128:", "sse4.1")
-TARGET_BUILTIN(__builtin_ia32_roundss, "V4fV4fV4fIi", "ncV:128:", "sse4.1")
-TARGET_BUILTIN(__builtin_ia32_roundsd, "V2dV2dV2dIi", "ncV:128:", "sse4.1")
-TARGET_BUILTIN(__builtin_ia32_roundpd, "V2dV2dIi", "ncV:128:", "sse4.1")
-TARGET_BUILTIN(__builtin_ia32_dpps, "V4fV4fV4fIc", "ncV:128:", "sse4.1")
-TARGET_BUILTIN(__builtin_ia32_dppd, "V2dV2dV2dIc", "ncV:128:", "sse4.1")
-TARGET_BUILTIN(__builtin_ia32_ptestz128, "iV2OiV2Oi", "ncV:128:", "sse4.1")
-TARGET_BUILTIN(__builtin_ia32_ptestc128, "iV2OiV2Oi", "ncV:128:", "sse4.1")
-TARGET_BUILTIN(__builtin_ia32_ptestnzc128, "iV2OiV2Oi", "ncV:128:", "sse4.1")
-TARGET_BUILTIN(__builtin_ia32_mpsadbw128, "V16cV16cV16cIc", "ncV:128:", "sse4.1")
-TARGET_BUILTIN(__builtin_ia32_phminposuw128, "V8sV8s", "ncV:128:", "sse4.1")
-TARGET_BUILTIN(__builtin_ia32_vec_ext_v16qi, "cV16cIi", "ncV:128:", "sse4.1")
-TARGET_BUILTIN(__builtin_ia32_vec_set_v16qi, "V16cV16ccIi", "ncV:128:", "sse4.1")
-TARGET_BUILTIN(__builtin_ia32_vec_set_v4si, "V4iV4iiIi", "ncV:128:", "sse4.1")
-
-// SSE 4.2
-TARGET_BUILTIN(__builtin_ia32_pcmpistrm128, "V16cV16cV16cIc", "ncV:128:", "sse4.2")
-TARGET_BUILTIN(__builtin_ia32_pcmpistri128, "iV16cV16cIc", "ncV:128:", "sse4.2")
-TARGET_BUILTIN(__builtin_ia32_pcmpestrm128, "V16cV16ciV16ciIc", "ncV:128:", "sse4.2")
-TARGET_BUILTIN(__builtin_ia32_pcmpestri128, "iV16ciV16ciIc","ncV:128:", "sse4.2")
-
-TARGET_BUILTIN(__builtin_ia32_pcmpistria128, "iV16cV16cIc","ncV:128:", "sse4.2")
-TARGET_BUILTIN(__builtin_ia32_pcmpistric128, "iV16cV16cIc","ncV:128:", "sse4.2")
-TARGET_BUILTIN(__builtin_ia32_pcmpistrio128, "iV16cV16cIc","ncV:128:", "sse4.2")
-TARGET_BUILTIN(__builtin_ia32_pcmpistris128, "iV16cV16cIc","ncV:128:", "sse4.2")
-TARGET_BUILTIN(__builtin_ia32_pcmpistriz128, "iV16cV16cIc","ncV:128:", "sse4.2")
-TARGET_BUILTIN(__builtin_ia32_pcmpestria128, "iV16ciV16ciIc","ncV:128:", "sse4.2")
-TARGET_BUILTIN(__builtin_ia32_pcmpestric128, "iV16ciV16ciIc","ncV:128:", "sse4.2")
-TARGET_BUILTIN(__builtin_ia32_pcmpestrio128, "iV16ciV16ciIc","ncV:128:", "sse4.2")
-TARGET_BUILTIN(__builtin_ia32_pcmpestris128, "iV16ciV16ciIc","ncV:128:", "sse4.2")
-TARGET_BUILTIN(__builtin_ia32_pcmpestriz128, "iV16ciV16ciIc","ncV:128:", "sse4.2")
-
-TARGET_BUILTIN(__builtin_ia32_crc32qi, "UiUiUc", "nc", "crc32")
-TARGET_BUILTIN(__builtin_ia32_crc32hi, "UiUiUs", "nc", "crc32")
-TARGET_BUILTIN(__builtin_ia32_crc32si, "UiUiUi", "nc", "crc32")
-
-// SSE4a
-TARGET_BUILTIN(__builtin_ia32_extrqi, "V2OiV2OiIcIc", "ncV:128:", "sse4a")
-TARGET_BUILTIN(__builtin_ia32_extrq, "V2OiV2OiV16c", "ncV:128:", "sse4a")
-TARGET_BUILTIN(__builtin_ia32_insertqi, "V2OiV2OiV2OiIcIc", "ncV:128:", "sse4a")
-TARGET_BUILTIN(__builtin_ia32_insertq, "V2OiV2OiV2Oi", "ncV:128:", "sse4a")
-TARGET_BUILTIN(__builtin_ia32_movntsd, "vd*V2d", "nV:128:", "sse4a")
-TARGET_BUILTIN(__builtin_ia32_movntss, "vf*V4f", "nV:128:", "sse4a")
-
-// AES
-TARGET_BUILTIN(__builtin_ia32_aesenc128, "V2OiV2OiV2Oi", "ncV:128:", "aes")
-TARGET_BUILTIN(__builtin_ia32_aesenclast128, "V2OiV2OiV2Oi", "ncV:128:", "aes")
-TARGET_BUILTIN(__builtin_ia32_aesdec128, "V2OiV2OiV2Oi", "ncV:128:", "aes")
-TARGET_BUILTIN(__builtin_ia32_aesdeclast128, "V2OiV2OiV2Oi", "ncV:128:", "aes")
-TARGET_BUILTIN(__builtin_ia32_aesimc128, "V2OiV2Oi", "ncV:128:", "aes")
-TARGET_BUILTIN(__builtin_ia32_aeskeygenassist128, "V2OiV2OiIc", "ncV:128:", "aes")
-
-// VAES
-TARGET_BUILTIN(__builtin_ia32_aesenc256, "V4OiV4OiV4Oi", "ncV:256:", "vaes")
-TARGET_BUILTIN(__builtin_ia32_aesenc512, "V8OiV8OiV8Oi", "ncV:512:", "avx512f,evex512,vaes")
-TARGET_BUILTIN(__builtin_ia32_aesenclast256, "V4OiV4OiV4Oi", "ncV:256:", "vaes")
-TARGET_BUILTIN(__builtin_ia32_aesenclast512, "V8OiV8OiV8Oi", "ncV:512:", "avx512f,evex512,vaes")
-TARGET_BUILTIN(__builtin_ia32_aesdec256, "V4OiV4OiV4Oi", "ncV:256:", "vaes")
-TARGET_BUILTIN(__builtin_ia32_aesdec512, "V8OiV8OiV8Oi", "ncV:512:", "avx512f,evex512,vaes")
-TARGET_BUILTIN(__builtin_ia32_aesdeclast256, "V4OiV4OiV4Oi", "ncV:256:", "vaes")
-TARGET_BUILTIN(__builtin_ia32_aesdeclast512, "V8OiV8OiV8Oi", "ncV:512:", "avx512f,evex512,vaes")
-
-// GFNI
-TARGET_BUILTIN(__builtin_ia32_vgf2p8affineinvqb_v16qi, "V16cV16cV16cIc", "ncV:128:", "gfni")
-TARGET_BUILTIN(__builtin_ia32_vgf2p8affineinvqb_v32qi, "V32cV32cV32cIc", "ncV:256:", "avx,gfni")
-TARGET_BUILTIN(__builtin_ia32_vgf2p8affineinvqb_v64qi, "V64cV64cV64cIc", "ncV:512:", "avx512f,evex512,gfni")
-TARGET_BUILTIN(__builtin_ia32_vgf2p8affineqb_v16qi, "V16cV16cV16cIc", "ncV:128:", "gfni")
-TARGET_BUILTIN(__builtin_ia32_vgf2p8affineqb_v32qi, "V32cV32cV32cIc", "ncV:256:", "avx,gfni")
-TARGET_BUILTIN(__builtin_ia32_vgf2p8affineqb_v64qi, "V64cV64cV64cIc", "ncV:512:", "avx512f,evex512,gfni")
-TARGET_BUILTIN(__builtin_ia32_vgf2p8mulb_v16qi, "V16cV16cV16c", "ncV:128:", "gfni")
-TARGET_BUILTIN(__builtin_ia32_vgf2p8mulb_v32qi, "V32cV32cV32c", "ncV:256:", "avx,gfni")
-TARGET_BUILTIN(__builtin_ia32_vgf2p8mulb_v64qi, "V64cV64cV64c", "ncV:512:", "avx512f,evex512,gfni")
-
-// CLMUL
-TARGET_BUILTIN(__builtin_ia32_pclmulqdq128, "V2OiV2OiV2OiIc", "ncV:128:", "pclmul")
-
-// VPCLMULQDQ
-TARGET_BUILTIN(__builtin_ia32_pclmulqdq256, "V4OiV4OiV4OiIc", "ncV:256:", "vpclmulqdq")
-TARGET_BUILTIN(__builtin_ia32_pclmulqdq512, "V8OiV8OiV8OiIc", "ncV:512:", "avx512f,evex512,vpclmulqdq")
-
-// AVX
-TARGET_BUILTIN(__builtin_ia32_vpermilvarpd, "V2dV2dV2Oi", "ncV:256:", "avx")
-TARGET_BUILTIN(__builtin_ia32_vpermilvarps, "V4fV4fV4i", "ncV:256:", "avx")
-TARGET_BUILTIN(__builtin_ia32_vpermilvarpd256, "V4dV4dV4Oi", "ncV:256:", "avx")
-TARGET_BUILTIN(__builtin_ia32_vpermilvarps256, "V8fV8fV8i", "ncV:256:", "avx")
-TARGET_BUILTIN(__builtin_ia32_blendpd256, "V4dV4dV4dIi", "ncV:256:", "avx")
-TARGET_BUILTIN(__builtin_ia32_blendps256, "V8fV8fV8fIi", "ncV:256:", "avx")
-TARGET_BUILTIN(__builtin_ia32_blendvpd256, "V4dV4dV4dV4d", "ncV:256:", "avx")
-TARGET_BUILTIN(__builtin_ia32_blendvps256, "V8fV8fV8fV8f", "ncV:256:", "avx")
-TARGET_BUILTIN(__builtin_ia32_shufpd256, "V4dV4dV4dIi", "ncV:256:", "avx")
-TARGET_BUILTIN(__builtin_ia32_shufps256, "V8fV8fV8fIi", "ncV:256:", "avx")
-TARGET_BUILTIN(__builtin_ia32_dpps256, "V8fV8fV8fIc", "ncV:256:", "avx")
-TARGET_BUILTIN(__builtin_ia32_cmppd256, "V4dV4dV4dIc", "ncV:256:", "avx")
-TARGET_BUILTIN(__builtin_ia32_cmpps256, "V8fV8fV8fIc", "ncV:256:", "avx")
-TARGET_BUILTIN(__builtin_ia32_vextractf128_pd256, "V2dV4dIi", "ncV:256:", "avx")
-TARGET_BUILTIN(__builtin_ia32_vextractf128_ps256, "V4fV8fIi", "ncV:256:", "avx")
-TARGET_BUILTIN(__builtin_ia32_vextractf128_si256, "V4iV8iIi", "ncV:256:", "avx")
-TARGET_BUILTIN(__builtin_ia32_cvtpd2ps256, "V4fV4d", "ncV:256:", "avx")
-TARGET_BUILTIN(__builtin_ia32_cvtps2dq256, "V8iV8f", "ncV:256:", "avx")
-TARGET_BUILTIN(__builtin_ia32_cvttpd2dq256, "V4iV4d", "ncV:256:", "avx")
-TARGET_BUILTIN(__builtin_ia32_cvtpd2dq256, "V4iV4d", "ncV:256:", "avx")
-TARGET_BUILTIN(__builtin_ia32_cvttps2dq256, "V8iV8f", "ncV:256:", "avx")
-TARGET_BUILTIN(__builtin_ia32_vperm2f128_pd256, "V4dV4dV4dIi", "ncV:256:", "avx")
-TARGET_BUILTIN(__builtin_ia32_vperm2f128_ps256, "V8fV8fV8fIi", "ncV:256:", "avx")
-TARGET_BUILTIN(__builtin_ia32_vperm2f128_si256, "V8iV8iV8iIi", "ncV:256:", "avx")
-TARGET_BUILTIN(__builtin_ia32_vpermilpd, "V2dV2dIi", "ncV:128:", "avx")
-TARGET_BUILTIN(__builtin_ia32_vpermilps, "V4fV4fIi", "ncV:128:", "avx")
-TARGET_BUILTIN(__builtin_ia32_vpermilpd256, "V4dV4dIi", "ncV:256:", "avx")
-TARGET_BUILTIN(__builtin_ia32_vpermilps256, "V8fV8fIi", "ncV:256:", "avx")
-TARGET_BUILTIN(__builtin_ia32_vinsertf128_pd256, "V4dV4dV2dIi", "ncV:256:", "avx")
-TARGET_BUILTIN(__builtin_ia32_vinsertf128_ps256, "V8fV8fV4fIi", "ncV:256:", "avx")
-TARGET_BUILTIN(__builtin_ia32_vinsertf128_si256, "V8iV8iV4iIi", "ncV:256:", "avx")
-TARGET_BUILTIN(__builtin_ia32_sqrtpd256, "V4dV4d", "ncV:256:", "avx")
-TARGET_BUILTIN(__builtin_ia32_sqrtps256, "V8fV8f", "ncV:256:", "avx")
-TARGET_BUILTIN(__builtin_ia32_rsqrtps256, "V8fV8f", "ncV:256:", "avx")
-TARGET_BUILTIN(__builtin_ia32_rcpps256, "V8fV8f", "ncV:256:", "avx")
-TARGET_BUILTIN(__builtin_ia32_roundpd256, "V4dV4dIi", "ncV:256:", "avx")
-TARGET_BUILTIN(__builtin_ia32_roundps256, "V8fV8fIi", "ncV:256:", "avx")
-TARGET_BUILTIN(__builtin_ia32_vtestzpd, "iV2dV2d", "ncV:128:", "avx")
-TARGET_BUILTIN(__builtin_ia32_vtestcpd, "iV2dV2d", "ncV:128:", "avx")
-TARGET_BUILTIN(__builtin_ia32_vtestnzcpd, "iV2dV2d", "ncV:128:", "avx")
-TARGET_BUILTIN(__builtin_ia32_vtestzps, "iV4fV4f", "ncV:128:", "avx")
-TARGET_BUILTIN(__builtin_ia32_vtestcps, "iV4fV4f", "ncV:128:", "avx")
-TARGET_BUILTIN(__builtin_ia32_vtestnzcps, "iV4fV4f", "ncV:128:", "avx")
-TARGET_BUILTIN(__builtin_ia32_vtestzpd256, "iV4dV4d", "ncV:256:", "avx")
-TARGET_BUILTIN(__builtin_ia32_vtestcpd256, "iV4dV4d", "ncV:256:", "avx")
-TARGET_BUILTIN(__builtin_ia32_vtestnzcpd256, "iV4dV4d", "ncV:256:", "avx")
-TARGET_BUILTIN(__builtin_ia32_vtestzps256, "iV8fV8f", "ncV:256:", "avx")
-TARGET_BUILTIN(__builtin_ia32_vtestcps256, "iV8fV8f", "ncV:256:", "avx")
-TARGET_BUILTIN(__builtin_ia32_vtestnzcps256, "iV8fV8f", "ncV:256:", "avx")
-TARGET_BUILTIN(__builtin_ia32_ptestz256, "iV4OiV4Oi", "ncV:256:", "avx")
-TARGET_BUILTIN(__builtin_ia32_ptestc256, "iV4OiV4Oi", "ncV:256:", "avx")
-TARGET_BUILTIN(__builtin_ia32_ptestnzc256, "iV4OiV4Oi", "ncV:256:", "avx")
-TARGET_BUILTIN(__builtin_ia32_movmskpd256, "iV4d", "ncV:256:", "avx")
-TARGET_BUILTIN(__builtin_ia32_movmskps256, "iV8f", "ncV:256:", "avx")
-TARGET_BUILTIN(__builtin_ia32_vzeroall, "v", "n", "avx")
-TARGET_BUILTIN(__builtin_ia32_vzeroupper, "v", "n", "avx")
-TARGET_BUILTIN(__builtin_ia32_lddqu256, "V32ccC*", "nV:256:", "avx")
-TARGET_BUILTIN(__builtin_ia32_maskloadpd, "V2dV2dC*V2Oi", "nV:128:", "avx")
-TARGET_BUILTIN(__builtin_ia32_maskloadps, "V4fV4fC*V4i", "nV:128:", "avx")
-TARGET_BUILTIN(__builtin_ia32_maskloadpd256, "V4dV4dC*V4Oi", "nV:256:", "avx")
-TARGET_BUILTIN(__builtin_ia32_maskloadps256, "V8fV8fC*V8i", "nV:256:", "avx")
-TARGET_BUILTIN(__builtin_ia32_maskstorepd, "vV2d*V2OiV2d", "nV:128:", "avx")
-TARGET_BUILTIN(__builtin_ia32_maskstoreps, "vV4f*V4iV4f", "nV:128:", "avx")
-TARGET_BUILTIN(__builtin_ia32_maskstorepd256, "vV4d*V4OiV4d", "nV:256:", "avx")
-TARGET_BUILTIN(__builtin_ia32_maskstoreps256, "vV8f*V8iV8f", "nV:256:", "avx")
-TARGET_BUILTIN(__builtin_ia32_vec_ext_v32qi, "cV32cIi", "ncV:256:", "avx")
-TARGET_BUILTIN(__builtin_ia32_vec_ext_v16hi, "sV16sIi", "ncV:256:", "avx")
-TARGET_BUILTIN(__builtin_ia32_vec_ext_v8si, "iV8iIi", "ncV:256:", "avx")
-TARGET_BUILTIN(__builtin_ia32_vec_set_v32qi, "V32cV32ccIi", "ncV:256:", "avx")
-TARGET_BUILTIN(__builtin_ia32_vec_set_v16hi, "V16sV16ssIi", "ncV:256:", "avx")
-TARGET_BUILTIN(__builtin_ia32_vec_set_v8si, "V8iV8iiIi", "ncV:256:", "avx")
-
-// AVX2
-TARGET_BUILTIN(__builtin_ia32_mpsadbw256, "V32cV32cV32cIc", "ncV:256:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_packsswb256, "V32cV16sV16s", "ncV:256:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_packssdw256, "V16sV8iV8i", "ncV:256:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_packuswb256, "V32cV16sV16s", "ncV:256:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_packusdw256, "V16sV8iV8i", "ncV:256:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_palignr256, "V32cV32cV32cIi", "ncV:256:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_pavgb256, "V32cV32cV32c", "ncV:256:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_pavgw256, "V16sV16sV16s", "ncV:256:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_pblendvb256, "V32cV32cV32cV32c", "ncV:256:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_pblendw256, "V16sV16sV16sIi", "ncV:256:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_phaddw256, "V16sV16sV16s", "ncV:256:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_phaddd256, "V8iV8iV8i", "ncV:256:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_phaddsw256, "V16sV16sV16s", "ncV:256:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_phsubw256, "V16sV16sV16s", "ncV:256:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_phsubd256, "V8iV8iV8i", "ncV:256:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_phsubsw256, "V16sV16sV16s", "ncV:256:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_pmaddubsw256, "V16sV32cV32c", "ncV:256:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_pmaddwd256, "V8iV16sV16s", "ncV:256:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_pmovmskb256, "iV32c", "ncV:256:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_pmuldq256, "V4OiV8iV8i", "ncV:256:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_pmulhrsw256, "V16sV16sV16s", "ncV:256:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_pmulhuw256, "V16sV16sV16s", "ncV:256:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_pmulhw256, "V16sV16sV16s", "ncV:256:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_pmuludq256, "V4OiV8iV8i", "ncV:256:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_psadbw256, "V4OiV32cV32c", "ncV:256:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_pshufb256, "V32cV32cV32c", "ncV:256:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_pshufd256, "V8iV8iIi", "ncV:256:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_pshuflw256, "V16sV16sIi", "ncV:256:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_pshufhw256, "V16sV16sIi", "ncV:256:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_psignb256, "V32cV32cV32c", "ncV:256:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_psignw256, "V16sV16sV16s", "ncV:256:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_psignd256, "V8iV8iV8i", "ncV:256:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_psllwi256, "V16sV16si", "ncV:256:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_psllw256, "V16sV16sV8s", "ncV:256:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_pslldqi256_byteshift, "V4OiV4OiIi", "ncV:256:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_pslldi256, "V8iV8ii", "ncV:256:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_pslld256, "V8iV8iV4i", "ncV:256:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_psllqi256, "V4OiV4Oii", "ncV:256:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_psllq256, "V4OiV4OiV2Oi", "ncV:256:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_psrawi256, "V16sV16si", "ncV:256:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_psraw256, "V16sV16sV8s", "ncV:256:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_psradi256, "V8iV8ii", "ncV:256:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_psrad256, "V8iV8iV4i", "ncV:256:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_psrldqi256_byteshift, "V4OiV4OiIi", "ncV:256:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_psrlwi256, "V16sV16si", "ncV:256:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_psrlw256, "V16sV16sV8s", "ncV:256:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_psrldi256, "V8iV8ii", "ncV:256:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_psrld256, "V8iV8iV4i", "ncV:256:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_psrlqi256, "V4OiV4Oii", "ncV:256:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_psrlq256, "V4OiV4OiV2Oi", "ncV:256:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_pblendd128, "V4iV4iV4iIi", "ncV:256:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_pblendd256, "V8iV8iV8iIi", "ncV:256:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_permvarsi256, "V8iV8iV8i", "ncV:256:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_permdf256, "V4dV4dIi", "ncV:256:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_permvarsf256, "V8fV8fV8i", "ncV:256:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_permti256, "V4OiV4OiV4OiIi", "ncV:256:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_permdi256, "V4OiV4OiIi", "ncV:256:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_extract128i256, "V2OiV4OiIi", "ncV:256:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_insert128i256, "V4OiV4OiV2OiIi", "ncV:256:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_maskloadd256, "V8iV8iC*V8i", "nV:256:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_maskloadq256, "V4OiV4OiC*V4Oi", "nV:256:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_maskloadd, "V4iV4iC*V4i", "nV:128:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_maskloadq, "V2OiV2OiC*V2Oi", "nV:128:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_maskstored256, "vV8i*V8iV8i", "nV:256:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_maskstoreq256, "vV4Oi*V4OiV4Oi", "nV:256:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_maskstored, "vV4i*V4iV4i", "nV:128:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_maskstoreq, "vV2Oi*V2OiV2Oi", "nV:128:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_psllv8si, "V8iV8iV8i", "ncV:256:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_psllv4si, "V4iV4iV4i", "ncV:128:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_psllv4di, "V4OiV4OiV4Oi", "ncV:256:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_psllv2di, "V2OiV2OiV2Oi", "ncV:128:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_psrav8si, "V8iV8iV8i", "ncV:256:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_psrav4si, "V4iV4iV4i", "ncV:128:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_psrlv8si, "V8iV8iV8i", "ncV:256:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_psrlv4si, "V4iV4iV4i", "ncV:128:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_psrlv4di, "V4OiV4OiV4Oi", "ncV:256:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_psrlv2di, "V2OiV2OiV2Oi", "ncV:128:", "avx2")
-
-// GATHER
-TARGET_BUILTIN(__builtin_ia32_gatherd_pd, "V2dV2ddC*V4iV2dIc", "nV:128:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_gatherd_pd256, "V4dV4ddC*V4iV4dIc", "nV:256:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_gatherq_pd, "V2dV2ddC*V2OiV2dIc", "nV:128:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_gatherq_pd256, "V4dV4ddC*V4OiV4dIc", "nV:256:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_gatherd_ps, "V4fV4ffC*V4iV4fIc", "nV:128:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_gatherd_ps256, "V8fV8ffC*V8iV8fIc", "nV:256:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_gatherq_ps, "V4fV4ffC*V2OiV4fIc", "nV:128:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_gatherq_ps256, "V4fV4ffC*V4OiV4fIc", "nV:256:", "avx2")
-
-TARGET_BUILTIN(__builtin_ia32_gatherd_q, "V2OiV2OiOiC*V4iV2OiIc", "nV:128:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_gatherd_q256, "V4OiV4OiOiC*V4iV4OiIc", "nV:256:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_gatherq_q, "V2OiV2OiOiC*V2OiV2OiIc", "nV:128:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_gatherq_q256, "V4OiV4OiOiC*V4OiV4OiIc", "nV:256:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_gatherd_d, "V4iV4iiC*V4iV4iIc", "nV:128:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_gatherd_d256, "V8iV8iiC*V8iV8iIc", "nV:256:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_gatherq_d, "V4iV4iiC*V2OiV4iIc", "nV:128:", "avx2")
-TARGET_BUILTIN(__builtin_ia32_gatherq_d256, "V4iV4iiC*V4OiV4iIc", "nV:256:", "avx2")
-
-// F16C
-TARGET_BUILTIN(__builtin_ia32_vcvtps2ph, "V8sV4fIi", "ncV:128:", "f16c")
-TARGET_BUILTIN(__builtin_ia32_vcvtps2ph256, "V8sV8fIi", "ncV:256:", "f16c")
-TARGET_BUILTIN(__builtin_ia32_vcvtph2ps, "V4fV8s", "ncV:128:", "f16c")
-TARGET_BUILTIN(__builtin_ia32_vcvtph2ps256, "V8fV8s", "ncV:256:", "f16c")
-
-// RDRAND
-TARGET_BUILTIN(__builtin_ia32_rdrand16_step, "UiUs*", "n", "rdrnd")
-TARGET_BUILTIN(__builtin_ia32_rdrand32_step, "UiUi*", "n", "rdrnd")
-
-// FXSR
-TARGET_BUILTIN(__builtin_ia32_fxrstor, "vv*", "n", "fxsr")
-TARGET_BUILTIN(__builtin_ia32_fxsave, "vv*", "n", "fxsr")
-
-// XSAVE
-TARGET_BUILTIN(__builtin_ia32_xsave, "vv*UOi", "n", "xsave")
-TARGET_BUILTIN(__builtin_ia32_xrstor, "vv*UOi", "n", "xsave")
-TARGET_BUILTIN(__builtin_ia32_xgetbv, "UOiUi", "n", "xsave")
-TARGET_HEADER_BUILTIN(_xgetbv, "UWiUi", "nh", IMMINTRIN_H, ALL_MS_LANGUAGES, "")
-TARGET_BUILTIN(__builtin_ia32_xsetbv, "vUiUOi", "n", "xsave")
-TARGET_HEADER_BUILTIN(_xsetbv, "vUiUWi", "nh", IMMINTRIN_H, ALL_MS_LANGUAGES, "")
-TARGET_BUILTIN(__builtin_ia32_xsaveopt, "vv*UOi", "n", "xsaveopt")
-TARGET_BUILTIN(__builtin_ia32_xrstors, "vv*UOi", "n", "xsaves")
-TARGET_BUILTIN(__builtin_ia32_xsavec, "vv*UOi", "n", "xsavec")
-TARGET_BUILTIN(__builtin_ia32_xsaves, "vv*UOi", "n", "xsaves")
-
-// SHSTK
-TARGET_BUILTIN(__builtin_ia32_incsspd, "vUi", "n", "shstk")
-TARGET_BUILTIN(__builtin_ia32_rdsspd, "UiUi", "n", "shstk")
-TARGET_BUILTIN(__builtin_ia32_saveprevssp, "v", "n", "shstk")
-TARGET_BUILTIN(__builtin_ia32_rstorssp, "vv*", "n", "shstk")
-TARGET_BUILTIN(__builtin_ia32_wrssd, "vUiv*", "n", "shstk")
-TARGET_BUILTIN(__builtin_ia32_wrussd, "vUiv*", "n", "shstk")
-TARGET_BUILTIN(__builtin_ia32_setssbsy, "v", "n", "shstk")
-TARGET_BUILTIN(__builtin_ia32_clrssbsy, "vv*", "n", "shstk")
-
-//CLFLUSHOPT
-TARGET_BUILTIN(__builtin_ia32_clflushopt, "vvC*", "n", "clflushopt")
-
-//CLWB
-TARGET_BUILTIN(__builtin_ia32_clwb, "vvC*", "n", "clwb")
-
-//WB[NO]INVD
-TARGET_BUILTIN(__builtin_ia32_wbinvd, "v", "n", "")
-TARGET_BUILTIN(__builtin_ia32_wbnoinvd, "v", "n", "wbnoinvd")
-
-// ADX
-TARGET_BUILTIN(__builtin_ia32_addcarryx_u32, "UcUcUiUiUi*", "nE", "")
-TARGET_BUILTIN(__builtin_ia32_subborrow_u32, "UcUcUiUiUi*", "nE", "")
-
-// RDSEED
-TARGET_BUILTIN(__builtin_ia32_rdseed16_step, "UiUs*", "n", "rdseed")
-TARGET_BUILTIN(__builtin_ia32_rdseed32_step, "UiUi*", "n", "rdseed")
-
-// LZCNT
-TARGET_BUILTIN(__builtin_ia32_lzcnt_u16, "UsUs", "ncE", "lzcnt")
-TARGET_BUILTIN(__builtin_ia32_lzcnt_u32, "UiUi", "ncE", "lzcnt")
-
-// BMI
-TARGET_BUILTIN(__builtin_ia32_bextr_u32, "UiUiUi", "ncE", "bmi")
-TARGET_BUILTIN(__builtin_ia32_tzcnt_u16, "UsUs", "ncE", "")
-TARGET_BUILTIN(__builtin_ia32_tzcnt_u32, "UiUi", "ncE", "")
-
-// BMI2
-TARGET_BUILTIN(__builtin_ia32_bzhi_si, "UiUiUi", "ncE", "bmi2")
-TARGET_BUILTIN(__builtin_ia32_pdep_si, "UiUiUi", "ncE", "bmi2")
-TARGET_BUILTIN(__builtin_ia32_pext_si, "UiUiUi", "ncE", "bmi2")
-
-// TBM
-TARGET_BUILTIN(__builtin_ia32_bextri_u32, "UiUiIUi", "ncE", "tbm")
-
-// LWP
-TARGET_BUILTIN(__builtin_ia32_llwpcb, "vv*", "n", "lwp")
-TARGET_BUILTIN(__builtin_ia32_slwpcb, "v*", "n", "lwp")
-TARGET_BUILTIN(__builtin_ia32_lwpins32, "UcUiUiIUi", "n", "lwp")
-TARGET_BUILTIN(__builtin_ia32_lwpval32, "vUiUiIUi", "n", "lwp")
-
-// SHA
-TARGET_BUILTIN(__builtin_ia32_sha1rnds4, "V4iV4iV4iIc", "ncV:128:", "sha")
-TARGET_BUILTIN(__builtin_ia32_sha1nexte, "V4iV4iV4i", "ncV:128:", "sha")
-TARGET_BUILTIN(__builtin_ia32_sha1msg1, "V4iV4iV4i", "ncV:128:", "sha")
-TARGET_BUILTIN(__builtin_ia32_sha1msg2, "V4iV4iV4i", "ncV:128:", "sha")
-TARGET_BUILTIN(__builtin_ia32_sha256rnds2, "V4iV4iV4iV4i", "ncV:128:", "sha")
-TARGET_BUILTIN(__builtin_ia32_sha256msg1, "V4iV4iV4i", "ncV:128:", "sha")
-TARGET_BUILTIN(__builtin_ia32_sha256msg2, "V4iV4iV4i", "ncV:128:", "sha")
-
-// FMA
-TARGET_BUILTIN(__builtin_ia32_vfmaddps, "V4fV4fV4fV4f", "ncV:128:", "fma|fma4")
-TARGET_BUILTIN(__builtin_ia32_vfmaddpd, "V2dV2dV2dV2d", "ncV:128:", "fma|fma4")
-TARGET_BUILTIN(__builtin_ia32_vfmaddss3, "V4fV4fV4fV4f", "ncV:128:", "fma")
-TARGET_BUILTIN(__builtin_ia32_vfmaddsd3, "V2dV2dV2dV2d", "ncV:128:", "fma")
-TARGET_BUILTIN(__builtin_ia32_vfmaddss, "V4fV4fV4fV4f", "ncV:128:", "fma4")
-TARGET_BUILTIN(__builtin_ia32_vfmaddsd, "V2dV2dV2dV2d", "ncV:128:", "fma4")
-TARGET_BUILTIN(__builtin_ia32_vfmaddsubps, "V4fV4fV4fV4f", "ncV:128:", "fma|fma4")
-TARGET_BUILTIN(__builtin_ia32_vfmaddsubpd, "V2dV2dV2dV2d", "ncV:128:", "fma|fma4")
-TARGET_BUILTIN(__builtin_ia32_vfmaddps256, "V8fV8fV8fV8f", "ncV:256:", "fma|fma4")
-TARGET_BUILTIN(__builtin_ia32_vfmaddpd256, "V4dV4dV4dV4d", "ncV:256:", "fma|fma4")
-TARGET_BUILTIN(__builtin_ia32_vfmaddsubps256, "V8fV8fV8fV8f", "ncV:256:", "fma|fma4")
-TARGET_BUILTIN(__builtin_ia32_vfmaddsubpd256, "V4dV4dV4dV4d", "ncV:256:", "fma|fma4")
-
-TARGET_BUILTIN(__builtin_ia32_vfmaddpd512_mask, "V8dV8dV8dV8dUcIi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_vfmaddpd512_maskz, "V8dV8dV8dV8dUcIi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_vfmaddpd512_mask3, "V8dV8dV8dV8dUcIi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_vfmsubpd512_mask3, "V8dV8dV8dV8dUcIi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_vfmaddps512_mask, "V16fV16fV16fV16fUsIi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_vfmaddps512_maskz, "V16fV16fV16fV16fUsIi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_vfmaddps512_mask3, "V16fV16fV16fV16fUsIi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_vfmsubps512_mask3, "V16fV16fV16fV16fUsIi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_vfmaddsubpd512_mask, "V8dV8dV8dV8dUcIi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_vfmaddsubpd512_maskz, "V8dV8dV8dV8dUcIi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_vfmaddsubpd512_mask3, "V8dV8dV8dV8dUcIi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_vfmsubaddpd512_mask3, "V8dV8dV8dV8dUcIi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_vfmaddsubps512_mask, "V16fV16fV16fV16fUsIi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_vfmaddsubps512_maskz, "V16fV16fV16fV16fUsIi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_vfmaddsubps512_mask3, "V16fV16fV16fV16fUsIi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_vfmsubaddps512_mask3, "V16fV16fV16fV16fUsIi", "ncV:512:", "avx512f,evex512")
-
-// XOP
-TARGET_BUILTIN(__builtin_ia32_vpmacssww, "V8sV8sV8sV8s", "ncV:128:", "xop")
-TARGET_BUILTIN(__builtin_ia32_vpmacsww, "V8sV8sV8sV8s", "ncV:128:", "xop")
-TARGET_BUILTIN(__builtin_ia32_vpmacsswd, "V4iV8sV8sV4i", "ncV:128:", "xop")
-TARGET_BUILTIN(__builtin_ia32_vpmacswd, "V4iV8sV8sV4i", "ncV:128:", "xop")
-TARGET_BUILTIN(__builtin_ia32_vpmacssdd, "V4iV4iV4iV4i", "ncV:128:", "xop")
-TARGET_BUILTIN(__builtin_ia32_vpmacsdd, "V4iV4iV4iV4i", "ncV:128:", "xop")
-TARGET_BUILTIN(__builtin_ia32_vpmacssdql, "V2OiV4iV4iV2Oi", "ncV:128:", "xop")
-TARGET_BUILTIN(__builtin_ia32_vpmacsdql, "V2OiV4iV4iV2Oi", "ncV:128:", "xop")
-TARGET_BUILTIN(__builtin_ia32_vpmacssdqh, "V2OiV4iV4iV2Oi", "ncV:128:", "xop")
-TARGET_BUILTIN(__builtin_ia32_vpmacsdqh, "V2OiV4iV4iV2Oi", "ncV:128:", "xop")
-TARGET_BUILTIN(__builtin_ia32_vpmadcsswd, "V4iV8sV8sV4i", "ncV:128:", "xop")
-TARGET_BUILTIN(__builtin_ia32_vpmadcswd, "V4iV8sV8sV4i", "ncV:128:", "xop")
-
-TARGET_BUILTIN(__builtin_ia32_vphaddbw, "V8sV16c", "ncV:128:", "xop")
-TARGET_BUILTIN(__builtin_ia32_vphaddbd, "V4iV16c", "ncV:128:", "xop")
-TARGET_BUILTIN(__builtin_ia32_vphaddbq, "V2OiV16c", "ncV:128:", "xop")
-TARGET_BUILTIN(__builtin_ia32_vphaddwd, "V4iV8s", "ncV:128:", "xop")
-TARGET_BUILTIN(__builtin_ia32_vphaddwq, "V2OiV8s", "ncV:128:", "xop")
-TARGET_BUILTIN(__builtin_ia32_vphadddq, "V2OiV4i", "ncV:128:", "xop")
-TARGET_BUILTIN(__builtin_ia32_vphaddubw, "V8sV16c", "ncV:128:", "xop")
-TARGET_BUILTIN(__builtin_ia32_vphaddubd, "V4iV16c", "ncV:128:", "xop")
-TARGET_BUILTIN(__builtin_ia32_vphaddubq, "V2OiV16c", "ncV:128:", "xop")
-TARGET_BUILTIN(__builtin_ia32_vphadduwd, "V4iV8s", "ncV:128:", "xop")
-TARGET_BUILTIN(__builtin_ia32_vphadduwq, "V2OiV8s", "ncV:128:", "xop")
-TARGET_BUILTIN(__builtin_ia32_vphaddudq, "V2OiV4i", "ncV:128:", "xop")
-TARGET_BUILTIN(__builtin_ia32_vphsubbw, "V8sV16c", "ncV:128:", "xop")
-TARGET_BUILTIN(__builtin_ia32_vphsubwd, "V4iV8s", "ncV:128:", "xop")
-TARGET_BUILTIN(__builtin_ia32_vphsubdq, "V2OiV4i", "ncV:128:", "xop")
-TARGET_BUILTIN(__builtin_ia32_vpperm, "V16cV16cV16cV16c", "ncV:128:", "xop")
-TARGET_BUILTIN(__builtin_ia32_vprotb, "V16cV16cV16c", "ncV:128:", "xop")
-TARGET_BUILTIN(__builtin_ia32_vprotw, "V8sV8sV8s", "ncV:128:", "xop")
-TARGET_BUILTIN(__builtin_ia32_vprotd, "V4iV4iV4i", "ncV:128:", "xop")
-TARGET_BUILTIN(__builtin_ia32_vprotq, "V2OiV2OiV2Oi", "ncV:128:", "xop")
-TARGET_BUILTIN(__builtin_ia32_vprotbi, "V16cV16cIc", "ncV:128:", "xop")
-TARGET_BUILTIN(__builtin_ia32_vprotwi, "V8sV8sIc", "ncV:128:", "xop")
-TARGET_BUILTIN(__builtin_ia32_vprotdi, "V4iV4iIc", "ncV:128:", "xop")
-TARGET_BUILTIN(__builtin_ia32_vprotqi, "V2OiV2OiIc", "ncV:128:", "xop")
-TARGET_BUILTIN(__builtin_ia32_vpshlb, "V16cV16cV16c", "ncV:128:", "xop")
-TARGET_BUILTIN(__builtin_ia32_vpshlw, "V8sV8sV8s", "ncV:128:", "xop")
-TARGET_BUILTIN(__builtin_ia32_vpshld, "V4iV4iV4i", "ncV:128:", "xop")
-TARGET_BUILTIN(__builtin_ia32_vpshlq, "V2OiV2OiV2Oi", "ncV:128:", "xop")
-TARGET_BUILTIN(__builtin_ia32_vpshab, "V16cV16cV16c", "ncV:128:", "xop")
-TARGET_BUILTIN(__builtin_ia32_vpshaw, "V8sV8sV8s", "ncV:128:", "xop")
-TARGET_BUILTIN(__builtin_ia32_vpshad, "V4iV4iV4i", "ncV:128:", "xop")
-TARGET_BUILTIN(__builtin_ia32_vpshaq, "V2OiV2OiV2Oi", "ncV:128:", "xop")
-TARGET_BUILTIN(__builtin_ia32_vpcomub, "V16cV16cV16cIc", "ncV:128:", "xop")
-TARGET_BUILTIN(__builtin_ia32_vpcomuw, "V8sV8sV8sIc", "ncV:128:", "xop")
-TARGET_BUILTIN(__builtin_ia32_vpcomud, "V4iV4iV4iIc", "ncV:128:", "xop")
-TARGET_BUILTIN(__builtin_ia32_vpcomuq, "V2OiV2OiV2OiIc", "ncV:128:", "xop")
-TARGET_BUILTIN(__builtin_ia32_vpcomb, "V16cV16cV16cIc", "ncV:128:", "xop")
-TARGET_BUILTIN(__builtin_ia32_vpcomw, "V8sV8sV8sIc", "ncV:128:", "xop")
-TARGET_BUILTIN(__builtin_ia32_vpcomd, "V4iV4iV4iIc", "ncV:128:", "xop")
-TARGET_BUILTIN(__builtin_ia32_vpcomq, "V2OiV2OiV2OiIc", "ncV:128:", "xop")
-TARGET_BUILTIN(__builtin_ia32_vpermil2pd, "V2dV2dV2dV2OiIc", "ncV:128:", "xop")
-TARGET_BUILTIN(__builtin_ia32_vpermil2pd256, "V4dV4dV4dV4OiIc", "ncV:256:", "xop")
-TARGET_BUILTIN(__builtin_ia32_vpermil2ps, "V4fV4fV4fV4iIc", "ncV:128:", "xop")
-TARGET_BUILTIN(__builtin_ia32_vpermil2ps256, "V8fV8fV8fV8iIc", "ncV:256:", "xop")
-TARGET_BUILTIN(__builtin_ia32_vfrczss, "V4fV4f", "ncV:128:", "xop")
-TARGET_BUILTIN(__builtin_ia32_vfrczsd, "V2dV2d", "ncV:128:", "xop")
-TARGET_BUILTIN(__builtin_ia32_vfrczps, "V4fV4f", "ncV:128:", "xop")
-TARGET_BUILTIN(__builtin_ia32_vfrczpd, "V2dV2d", "ncV:128:", "xop")
-TARGET_BUILTIN(__builtin_ia32_vfrczps256, "V8fV8f", "ncV:256:", "xop")
-TARGET_BUILTIN(__builtin_ia32_vfrczpd256, "V4dV4d", "ncV:256:", "xop")
-
-TARGET_BUILTIN(__builtin_ia32_xbegin, "i", "n", "rtm")
-TARGET_BUILTIN(__builtin_ia32_xend, "v", "n", "rtm")
-TARGET_BUILTIN(__builtin_ia32_xabort, "vIc", "n", "rtm")
-TARGET_BUILTIN(__builtin_ia32_xtest, "i", "n", "rtm")
-
-BUILTIN(__builtin_ia32_rdpmc, "UOii", "")
-BUILTIN(__builtin_ia32_rdtsc, "UOi", "")
-BUILTIN(__rdtsc, "UOi", "")
-BUILTIN(__builtin_ia32_rdtscp, "UOiUi*", "")
-
-TARGET_BUILTIN(__builtin_ia32_rdpid, "Ui", "n", "rdpid")
-TARGET_BUILTIN(__builtin_ia32_rdpru, "ULLii", "n", "rdpru")
-
-// PKU
-TARGET_BUILTIN(__builtin_ia32_rdpkru, "Ui", "n", "pku")
-TARGET_BUILTIN(__builtin_ia32_wrpkru, "vUi", "n", "pku")
-
-// AVX-512
-TARGET_BUILTIN(__builtin_ia32_sqrtpd512, "V8dV8dIi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_sqrtps512, "V16fV16fIi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_rsqrt14sd_mask, "V2dV2dV2dV2dUc", "ncV:128:", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_rsqrt14ss_mask, "V4fV4fV4fV4fUc", "ncV:128:", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_rsqrt14pd512_mask, "V8dV8dV8dUc", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_rsqrt14ps512_mask, "V16fV16fV16fUs", "ncV:512:", "avx512f,evex512")
-
-TARGET_BUILTIN(__builtin_ia32_rcp14sd_mask, "V2dV2dV2dV2dUc", "ncV:128:", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_rcp14ss_mask, "V4fV4fV4fV4fUc", "ncV:128:", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_rcp14pd512_mask, "V8dV8dV8dUc", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_rcp14ps512_mask, "V16fV16fV16fUs", "ncV:512:", "avx512f,evex512")
-
-TARGET_BUILTIN(__builtin_ia32_cvttps2dq512_mask, "V16iV16fV16iUsIi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_cvttps2udq512_mask, "V16iV16fV16iUsIi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_cvttpd2dq512_mask, "V8iV8dV8iUcIi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_cvttpd2udq512_mask, "V8iV8dV8iUcIi", "ncV:512:", "avx512f,evex512")
-
-TARGET_BUILTIN(__builtin_ia32_cmpps512_mask,   "UsV16fV16fIiUsIi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_cmpps256_mask,   "UcV8fV8fIiUc", "ncV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_cmpps128_mask,   "UcV4fV4fIiUc", "ncV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_cmppd512_mask, "UcV8dV8dIiUcIi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_cmppd256_mask, "UcV4dV4dIiUc", "ncV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_cmppd128_mask, "UcV2dV2dIiUc", "ncV:128:", "avx512vl")
-
-TARGET_BUILTIN(__builtin_ia32_rndscaleps_mask, "V16fV16fIiV16fUsIi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_rndscalepd_mask, "V8dV8dIiV8dUcIi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_cvtps2dq512_mask, "V16iV16fV16iUsIi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_cvtpd2dq512_mask, "V8iV8dV8iUcIi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_cvtps2udq512_mask, "V16iV16fV16iUsIi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_cvtpd2udq512_mask, "V8iV8dV8iUcIi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_minps512, "V16fV16fV16fIi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_minpd512, "V8dV8dV8dIi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_maxps512, "V16fV16fV16fIi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_maxpd512, "V8dV8dV8dIi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_cvtdq2ps512_mask, "V16fV16iV16fUsIi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_cvtudq2ps512_mask, "V16fV16iV16fUsIi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_cvtpd2ps512_mask, "V8fV8dV8fUcIi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_vcvtps2ph512_mask, "V16sV16fIiV16sUs", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_vcvtph2ps512_mask, "V16fV16sV16fUsIi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_pmuldq512, "V8OiV16iV16i", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_pmuludq512, "V8OiV16iV16i", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_loaddqusi512_mask, "V16iiC*V16iUs", "nV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_loaddqudi512_mask, "V8OiOiC*V8OiUc", "nV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_loadups512_mask, "V16ffC*V16fUs", "nV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_loadaps512_mask, "V16fV16fC*V16fUs", "nV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_loadupd512_mask, "V8ddC*V8dUc", "nV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_loadapd512_mask, "V8dV8dC*V8dUc", "nV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_storedqudi512_mask, "vOi*V8OiUc", "nV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_storedqusi512_mask, "vi*V16iUs", "nV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_storeupd512_mask, "vd*V8dUc", "nV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_storeapd512_mask, "vV8d*V8dUc", "nV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_storeups512_mask, "vf*V16fUs", "nV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_storeaps512_mask, "vV16f*V16fUs", "nV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_alignq512, "V8OiV8OiV8OiIi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_alignd512, "V16iV16iV16iIi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_alignd128, "V4iV4iV4iIi", "ncV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_alignd256, "V8iV8iV8iIi", "ncV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_alignq128, "V2OiV2OiV2OiIi", "ncV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_alignq256, "V4OiV4OiV4OiIi", "ncV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_extractf64x4_mask, "V4dV8dIiV4dUc", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_extractf32x4_mask, "V4fV16fIiV4fUc", "ncV:512:", "avx512f,evex512")
-
-// AVX-VNNI and AVX512-VNNI
-TARGET_BUILTIN(__builtin_ia32_vpdpbusd128, "V4iV4iV4iV4i", "ncV:128:", "avx512vl,avx512vnni|avxvnni")
-TARGET_BUILTIN(__builtin_ia32_vpdpbusd256, "V8iV8iV8iV8i", "ncV:256:", "avx512vl,avx512vnni|avxvnni")
-TARGET_BUILTIN(__builtin_ia32_vpdpbusd512, "V16iV16iV16iV16i", "ncV:512:", "avx512vnni,evex512")
-TARGET_BUILTIN(__builtin_ia32_vpdpbusds128, "V4iV4iV4iV4i", "ncV:128:", "avx512vl,avx512vnni|avxvnni")
-TARGET_BUILTIN(__builtin_ia32_vpdpbusds256, "V8iV8iV8iV8i", "ncV:256:", "avx512vl,avx512vnni|avxvnni")
-TARGET_BUILTIN(__builtin_ia32_vpdpbusds512, "V16iV16iV16iV16i", "ncV:512:", "avx512vnni,evex512")
-TARGET_BUILTIN(__builtin_ia32_vpdpwssd128, "V4iV4iV4iV4i", "ncV:128:", "avx512vl,avx512vnni|avxvnni")
-TARGET_BUILTIN(__builtin_ia32_vpdpwssd256, "V8iV8iV8iV8i", "ncV:256:", "avx512vl,avx512vnni|avxvnni")
-TARGET_BUILTIN(__builtin_ia32_vpdpwssd512, "V16iV16iV16iV16i", "ncV:512:", "avx512vnni,evex512")
-TARGET_BUILTIN(__builtin_ia32_vpdpwssds128, "V4iV4iV4iV4i", "ncV:128:", "avx512vl,avx512vnni|avxvnni")
-TARGET_BUILTIN(__builtin_ia32_vpdpwssds256, "V8iV8iV8iV8i", "ncV:256:", "avx512vl,avx512vnni|avxvnni")
-TARGET_BUILTIN(__builtin_ia32_vpdpwssds512, "V16iV16iV16iV16i", "ncV:512:", "avx512vnni,evex512")
-
-// AVX-VNNI-INT8
-TARGET_BUILTIN(__builtin_ia32_vpdpbssd128, "V4iV4iV4iV4i", "ncV:128:", "avxvnniint8|avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vpdpbssd256, "V8iV8iV8iV8i", "ncV:256:", "avxvnniint8|avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vpdpbssds128, "V4iV4iV4iV4i", "ncV:128:", "avxvnniint8|avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vpdpbssds256, "V8iV8iV8iV8i", "ncV:256:", "avxvnniint8|avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vpdpbsud128, "V4iV4iV4iV4i", "ncV:128:", "avxvnniint8|avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vpdpbsud256, "V8iV8iV8iV8i", "ncV:256:", "avxvnniint8|avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vpdpbsuds128, "V4iV4iV4iV4i", "ncV:128:", "avxvnniint8|avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vpdpbsuds256, "V8iV8iV8iV8i", "ncV:256:", "avxvnniint8|avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vpdpbuud128, "V4iV4iV4iV4i", "ncV:128:", "avxvnniint8|avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vpdpbuud256, "V8iV8iV8iV8i", "ncV:256:", "avxvnniint8|avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vpdpbuuds128, "V4iV4iV4iV4i", "ncV:128:", "avxvnniint8|avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vpdpbuuds256, "V8iV8iV8iV8i", "ncV:256:", "avxvnniint8|avx10.2-256")
-
-// MOVRS
-TARGET_BUILTIN(__builtin_ia32_prefetchrs, "vvC*", "nc", "movrs")
-
-TARGET_BUILTIN(__builtin_ia32_gather3div2df, "V2dV2dvC*V2OiUcIi", "nV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_gather3div2di, "V2OiV2OivC*V2OiUcIi", "nV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_gather3div4df, "V4dV4dvC*V4OiUcIi", "nV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_gather3div4di, "V4OiV4OivC*V4OiUcIi", "nV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_gather3div4sf, "V4fV4fvC*V2OiUcIi", "nV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_gather3div4si, "V4iV4ivC*V2OiUcIi", "nV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_gather3div8sf, "V4fV4fvC*V4OiUcIi", "nV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_gather3div8si, "V4iV4ivC*V4OiUcIi", "nV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_gather3siv2df, "V2dV2dvC*V4iUcIi", "nV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_gather3siv2di, "V2OiV2OivC*V4iUcIi", "nV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_gather3siv4df, "V4dV4dvC*V4iUcIi", "nV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_gather3siv4di, "V4OiV4OivC*V4iUcIi", "nV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_gather3siv4sf, "V4fV4fvC*V4iUcIi", "nV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_gather3siv4si, "V4iV4ivC*V4iUcIi", "nV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_gather3siv8sf, "V8fV8fvC*V8iUcIi", "nV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_gather3siv8si, "V8iV8ivC*V8iUcIi", "nV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_gathersiv8df, "V8dV8dvC*V8iUcIi", "nV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_gathersiv16sf, "V16fV16fvC*V16iUsIi", "nV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_gatherdiv8df, "V8dV8dvC*V8OiUcIi", "nV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_gatherdiv16sf, "V8fV8fvC*V8OiUcIi", "nV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_gathersiv8di, "V8OiV8OivC*V8iUcIi", "nV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_gathersiv16si, "V16iV16ivC*V16iUsIi", "nV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_gatherdiv8di, "V8OiV8OivC*V8OiUcIi", "nV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_gatherdiv16si, "V8iV8ivC*V8OiUcIi", "nV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_scattersiv8df, "vv*UcV8iV8dIi", "nV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_scattersiv16sf, "vv*UsV16iV16fIi", "nV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_scatterdiv8df,  "vv*UcV8OiV8dIi", "nV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_scatterdiv16sf, "vv*UcV8OiV8fIi", "nV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_scattersiv8di,  "vv*UcV8iV8OiIi", "nV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_scattersiv16si, "vv*UsV16iV16iIi", "nV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_scatterdiv8di,  "vv*UcV8OiV8OiIi", "nV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_scatterdiv16si, "vv*UcV8OiV8iIi", "nV:512:", "avx512f,evex512")
-
-TARGET_BUILTIN(__builtin_ia32_knotqi, "UcUc", "nc", "avx512dq")
-TARGET_BUILTIN(__builtin_ia32_knothi, "UsUs", "nc", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_knotsi, "UiUi", "nc", "avx512bw")
-TARGET_BUILTIN(__builtin_ia32_knotdi, "UOiUOi", "nc", "avx512bw")
-
-TARGET_BUILTIN(__builtin_ia32_cmpb128_mask, "UsV16cV16cIiUs", "ncV:128:", "avx512vl,avx512bw")
-TARGET_BUILTIN(__builtin_ia32_cmpd128_mask, "UcV4iV4iIiUc", "ncV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_cmpq128_mask, "UcV2OiV2OiIiUc", "ncV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_cmpw128_mask, "UcV8sV8sIiUc", "ncV:128:", "avx512vl,avx512bw")
-TARGET_BUILTIN(__builtin_ia32_cmpb256_mask, "UiV32cV32cIiUi", "ncV:256:", "avx512vl,avx512bw")
-TARGET_BUILTIN(__builtin_ia32_cmpd256_mask, "UcV8iV8iIiUc", "ncV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_cmpq256_mask, "UcV4OiV4OiIiUc", "ncV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_cmpw256_mask, "UsV16sV16sIiUs", "ncV:256:", "avx512vl,avx512bw")
-TARGET_BUILTIN(__builtin_ia32_cmpb512_mask, "UOiV64cV64cIiUOi", "ncV:512:", "avx512bw,evex512")
-TARGET_BUILTIN(__builtin_ia32_cmpd512_mask, "UsV16iV16iIiUs", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_cmpq512_mask, "UcV8OiV8OiIiUc", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_cmpw512_mask, "UiV32sV32sIiUi", "ncV:512:", "avx512bw,evex512")
-TARGET_BUILTIN(__builtin_ia32_ucmpb128_mask, "UsV16cV16cIiUs", "ncV:128:", "avx512vl,avx512bw")
-TARGET_BUILTIN(__builtin_ia32_ucmpd128_mask, "UcV4iV4iIiUc", "ncV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_ucmpq128_mask, "UcV2OiV2OiIiUc", "ncV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_ucmpw128_mask, "UcV8sV8sIiUc", "ncV:128:", "avx512vl,avx512bw")
-TARGET_BUILTIN(__builtin_ia32_ucmpb256_mask, "UiV32cV32cIiUi", "ncV:256:", "avx512vl,avx512bw")
-TARGET_BUILTIN(__builtin_ia32_ucmpd256_mask, "UcV8iV8iIiUc", "ncV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_ucmpq256_mask, "UcV4OiV4OiIiUc", "ncV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_ucmpw256_mask, "UsV16sV16sIiUs", "ncV:256:", "avx512vl,avx512bw")
-TARGET_BUILTIN(__builtin_ia32_ucmpb512_mask, "UOiV64cV64cIiUOi", "ncV:512:", "avx512bw,evex512")
-TARGET_BUILTIN(__builtin_ia32_ucmpd512_mask, "UsV16iV16iIiUs", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_ucmpq512_mask, "UcV8OiV8OiIiUc", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_ucmpw512_mask, "UiV32sV32sIiUi", "ncV:512:", "avx512bw,evex512")
-
-TARGET_BUILTIN(__builtin_ia32_packssdw512, "V32sV16iV16i", "ncV:512:", "avx512bw,evex512")
-TARGET_BUILTIN(__builtin_ia32_packsswb512, "V64cV32sV32s", "ncV:512:", "avx512bw,evex512")
-TARGET_BUILTIN(__builtin_ia32_packusdw512, "V32sV16iV16i", "ncV:512:", "avx512bw,evex512")
-TARGET_BUILTIN(__builtin_ia32_packuswb512, "V64cV32sV32s", "ncV:512:", "avx512bw,evex512")
-TARGET_BUILTIN(__builtin_ia32_pavgb512, "V64cV64cV64c", "ncV:512:", "avx512bw,evex512")
-TARGET_BUILTIN(__builtin_ia32_pavgw512, "V32sV32sV32s", "ncV:512:", "avx512bw,evex512")
-TARGET_BUILTIN(__builtin_ia32_pshufb512, "V64cV64cV64c", "ncV:512:", "avx512bw,evex512")
-
-TARGET_BUILTIN(__builtin_ia32_vpconflictdi_128, "V2OiV2Oi", "ncV:128:", "avx512cd,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vpconflictdi_256, "V4OiV4Oi", "ncV:256:", "avx512cd,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vpconflictsi_128, "V4iV4i", "ncV:128:", "avx512cd,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vpconflictsi_256, "V8iV8i", "ncV:256:", "avx512cd,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vpconflictdi_512, "V8OiV8Oi", "ncV:512:", "avx512cd,evex512")
-TARGET_BUILTIN(__builtin_ia32_vpconflictsi_512, "V16iV16i", "ncV:512:", "avx512cd,evex512")
-TARGET_BUILTIN(__builtin_ia32_vplzcntd_512, "V16iV16i", "ncV:512:", "avx512cd,evex512")
-TARGET_BUILTIN(__builtin_ia32_vplzcntq_512, "V8OiV8Oi", "ncV:512:", "avx512cd,evex512")
-
-TARGET_BUILTIN(__builtin_ia32_vpshufbitqmb128_mask, "UsV16cV16cUs", "ncV:128:", "avx512vl,avx512bitalg")
-TARGET_BUILTIN(__builtin_ia32_vpshufbitqmb256_mask, "UiV32cV32cUi", "ncV:256:", "avx512vl,avx512bitalg")
-TARGET_BUILTIN(__builtin_ia32_vpshufbitqmb512_mask, "UOiV64cV64cUOi", "ncV:512:", "avx512bitalg,evex512")
-
-TARGET_BUILTIN(__builtin_ia32_pmulhrsw512, "V32sV32sV32s", "ncV:512:", "avx512bw,evex512")
-TARGET_BUILTIN(__builtin_ia32_pmulhuw512, "V32sV32sV32s", "ncV:512:", "avx512bw,evex512")
-TARGET_BUILTIN(__builtin_ia32_pmulhw512, "V32sV32sV32s", "ncV:512:", "avx512bw,evex512")
-
-TARGET_BUILTIN(__builtin_ia32_addpd512, "V8dV8dV8dIi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_addps512, "V16fV16fV16fIi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_divpd512, "V8dV8dV8dIi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_divps512, "V16fV16fV16fIi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_mulpd512, "V8dV8dV8dIi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_mulps512, "V16fV16fV16fIi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_subpd512, "V8dV8dV8dIi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_subps512, "V16fV16fV16fIi", "ncV:512:", "avx512f,evex512")
-
-TARGET_BUILTIN(__builtin_ia32_pmaddubsw512, "V32sV64cV64c", "ncV:512:", "avx512bw,evex512")
-TARGET_BUILTIN(__builtin_ia32_pmaddwd512, "V16iV32sV32s", "ncV:512:", "avx512bw,evex512")
-
-TARGET_BUILTIN(__builtin_ia32_addss_round_mask, "V4fV4fV4fV4fUcIi", "ncV:128:", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_divss_round_mask, "V4fV4fV4fV4fUcIi", "ncV:128:", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_mulss_round_mask, "V4fV4fV4fV4fUcIi", "ncV:128:", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_subss_round_mask, "V4fV4fV4fV4fUcIi", "ncV:128:", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_maxss_round_mask, "V4fV4fV4fV4fUcIi", "ncV:128:", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_minss_round_mask, "V4fV4fV4fV4fUcIi", "ncV:128:", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_addsd_round_mask, "V2dV2dV2dV2dUcIi", "ncV:128:", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_divsd_round_mask, "V2dV2dV2dV2dUcIi", "ncV:128:", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_mulsd_round_mask, "V2dV2dV2dV2dUcIi", "ncV:128:", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_subsd_round_mask, "V2dV2dV2dV2dUcIi", "ncV:128:", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_maxsd_round_mask, "V2dV2dV2dV2dUcIi", "ncV:128:", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_minsd_round_mask, "V2dV2dV2dV2dUcIi", "ncV:128:", "avx512f")
-
-TARGET_BUILTIN(__builtin_ia32_compressdf128_mask, "V2dV2dV2dUc", "ncV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_compressdf256_mask, "V4dV4dV4dUc", "ncV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_compressdi128_mask, "V2OiV2OiV2OiUc", "ncV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_compressdi256_mask, "V4OiV4OiV4OiUc", "ncV:256:", "avx512vl")
-
-TARGET_BUILTIN(__builtin_ia32_compresshi128_mask, "V8sV8sV8sUc", "ncV:128:", "avx512vl,avx512vbmi2")
-TARGET_BUILTIN(__builtin_ia32_compresshi256_mask, "V16sV16sV16sUs", "ncV:256:", "avx512vl,avx512vbmi2")
-TARGET_BUILTIN(__builtin_ia32_compressqi128_mask, "V16cV16cV16cUs", "ncV:128:", "avx512vl,avx512vbmi2")
-TARGET_BUILTIN(__builtin_ia32_compressqi256_mask, "V32cV32cV32cUi", "ncV:256:", "avx512vl,avx512vbmi2")
-
-TARGET_BUILTIN(__builtin_ia32_compresssf128_mask, "V4fV4fV4fUc", "ncV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_compresssf256_mask, "V8fV8fV8fUc", "ncV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_compresssi128_mask, "V4iV4iV4iUc", "ncV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_compresssi256_mask, "V8iV8iV8iUc", "ncV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_compressstoredf128_mask, "vV2d*V2dUc", "nV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_compressstoredf256_mask, "vV4d*V4dUc", "nV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_compressstoredi128_mask, "vV2Oi*V2OiUc", "nV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_compressstoredi256_mask, "vV4Oi*V4OiUc", "nV:256:", "avx512vl")
-
-TARGET_BUILTIN(__builtin_ia32_compressstorehi128_mask, "vV8s*V8sUc", "nV:128:", "avx512vl,avx512vbmi2")
-TARGET_BUILTIN(__builtin_ia32_compressstorehi256_mask, "vV16s*V16sUs", "nV:256:", "avx512vl,avx512vbmi2")
-TARGET_BUILTIN(__builtin_ia32_compressstoreqi128_mask, "vV16c*V16cUs", "nV:128:", "avx512vl,avx512vbmi2")
-TARGET_BUILTIN(__builtin_ia32_compressstoreqi256_mask, "vV32c*V32cUi", "nV:256:", "avx512vl,avx512vbmi2")
-
-TARGET_BUILTIN(__builtin_ia32_compressstoresf128_mask, "vV4f*V4fUc", "nV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_compressstoresf256_mask, "vV8f*V8fUc", "nV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_compressstoresi128_mask, "vV4i*V4iUc", "nV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_compressstoresi256_mask, "vV8i*V8iUc", "nV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_cvtpd2dq128_mask, "V4iV2dV4iUc", "ncV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_cvtpd2ps_mask, "V4fV2dV4fUc", "ncV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_cvtpd2udq128_mask, "V4iV2dV4iUc", "ncV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_cvtpd2udq256_mask, "V4iV4dV4iUc", "ncV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_cvtps2udq128_mask, "V4iV4fV4iUc", "ncV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_cvtps2udq256_mask, "V8iV8fV8iUc", "ncV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_cvttpd2dq128_mask, "V4iV2dV4iUc", "ncV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_cvttpd2udq128_mask, "V4iV2dV4iUc", "ncV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_cvttpd2udq256_mask, "V4iV4dV4iUc", "ncV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_cvttps2udq128_mask, "V4iV4fV4iUc", "ncV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_cvttps2udq256_mask, "V8iV8fV8iUc", "ncV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_expanddf128_mask, "V2dV2dV2dUc", "ncV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_expanddf256_mask, "V4dV4dV4dUc", "ncV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_expanddi128_mask, "V2OiV2OiV2OiUc", "ncV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_expanddi256_mask, "V4OiV4OiV4OiUc", "ncV:256:", "avx512vl")
-
-TARGET_BUILTIN(__builtin_ia32_expandhi128_mask, "V8sV8sV8sUc", "ncV:128:", "avx512vl,avx512vbmi2")
-TARGET_BUILTIN(__builtin_ia32_expandhi256_mask, "V16sV16sV16sUs", "ncV:256:", "avx512vl,avx512vbmi2")
-TARGET_BUILTIN(__builtin_ia32_expandqi128_mask, "V16cV16cV16cUs", "ncV:128:", "avx512vl,avx512vbmi2")
-TARGET_BUILTIN(__builtin_ia32_expandqi256_mask, "V32cV32cV32cUi", "ncV:256:", "avx512vl,avx512vbmi2")
-
-TARGET_BUILTIN(__builtin_ia32_expandloaddf128_mask, "V2dV2dC*V2dUc", "nV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_expandloaddf256_mask, "V4dV4dC*V4dUc", "nV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_expandloaddi128_mask, "V4iV2OiC*V2OiUc", "nV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_expandloaddi256_mask, "V4OiV4OiC*V4OiUc", "nV:256:", "avx512vl")
-
-TARGET_BUILTIN(__builtin_ia32_expandloadhi128_mask, "V8sV8sC*V8sUc", "nV:128:", "avx512vl,avx512vbmi2")
-TARGET_BUILTIN(__builtin_ia32_expandloadhi256_mask, "V16sV16sC*V16sUs", "nV:256:", "avx512vl,avx512vbmi2")
-TARGET_BUILTIN(__builtin_ia32_expandloadqi128_mask, "V16cV16cC*V16cUs", "nV:128:", "avx512vl,avx512vbmi2")
-TARGET_BUILTIN(__builtin_ia32_expandloadqi256_mask, "V32cV32cC*V32cUi", "nV:256:", "avx512vl,avx512vbmi2")
-
-TARGET_BUILTIN(__builtin_ia32_expandloadsf128_mask, "V4fV4fC*V4fUc", "nV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_expandloadsf256_mask, "V8fV8fC*V8fUc", "nV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_expandloadsi128_mask, "V4iV4iC*V4iUc", "nV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_expandloadsi256_mask, "V8iV8iC*V8iUc", "nV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_expandsf128_mask, "V4fV4fV4fUc", "ncV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_expandsf256_mask, "V8fV8fV8fUc", "ncV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_expandsi128_mask, "V4iV4iV4iUc", "ncV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_expandsi256_mask, "V8iV8iV8iUc", "ncV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_getexppd128_mask, "V2dV2dV2dUc", "ncV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_getexppd256_mask, "V4dV4dV4dUc", "ncV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_getexpps128_mask, "V4fV4fV4fUc", "ncV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_getexpps256_mask, "V8fV8fV8fUc", "ncV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_rndscalepd_128_mask, "V2dV2dIiV2dUc", "ncV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_rndscalepd_256_mask, "V4dV4dIiV4dUc", "ncV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_rndscaleps_128_mask, "V4fV4fIiV4fUc", "ncV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_rndscaleps_256_mask, "V8fV8fIiV8fUc", "ncV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_scalefpd128_mask, "V2dV2dV2dV2dUc", "ncV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_scalefpd256_mask, "V4dV4dV4dV4dUc", "ncV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_scalefps128_mask, "V4fV4fV4fV4fUc", "ncV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_scalefps256_mask, "V8fV8fV8fV8fUc", "ncV:256:", "avx512vl")
-
-TARGET_BUILTIN(__builtin_ia32_scatterdiv2df, "vv*UcV2OiV2dIi", "nV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_scatterdiv2di, "vv*UcV2OiV2OiIi", "nV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_scatterdiv4df, "vv*UcV4OiV4dIi", "nV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_scatterdiv4di, "vv*UcV4OiV4OiIi", "nV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_scatterdiv4sf, "vv*UcV2OiV4fIi", "nV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_scatterdiv4si, "vv*UcV2OiV4iIi", "nV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_scatterdiv8sf, "vv*UcV4OiV4fIi", "nV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_scatterdiv8si, "vv*UcV4OiV4iIi", "nV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_scattersiv2df, "vv*UcV4iV2dIi", "nV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_scattersiv2di, "vv*UcV4iV2OiIi", "nV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_scattersiv4df, "vv*UcV4iV4dIi", "nV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_scattersiv4di, "vv*UcV4iV4OiIi", "nV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_scattersiv4sf, "vv*UcV4iV4fIi", "nV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_scattersiv4si, "vv*UcV4iV4iIi", "nV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_scattersiv8sf, "vv*UcV8iV8fIi", "nV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_scattersiv8si, "vv*UcV8iV8iIi", "nV:256:", "avx512vl")
-
-TARGET_BUILTIN(__builtin_ia32_vpermi2vard128, "V4iV4iV4iV4i", "ncV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vpermi2vard256, "V8iV8iV8iV8i", "ncV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vpermi2vard512, "V16iV16iV16iV16i", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_vpermi2varpd128, "V2dV2dV2OiV2d", "ncV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vpermi2varpd256, "V4dV4dV4OiV4d", "ncV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vpermi2varpd512, "V8dV8dV8OiV8d", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_vpermi2varps128, "V4fV4fV4iV4f", "ncV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vpermi2varps256, "V8fV8fV8iV8f", "ncV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vpermi2varps512, "V16fV16fV16iV16f", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_vpermi2varq128, "V2OiV2OiV2OiV2Oi", "ncV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vpermi2varq256, "V4OiV4OiV4OiV4Oi", "ncV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vpermi2varq512, "V8OiV8OiV8OiV8Oi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_vpermi2varqi128, "V16cV16cV16cV16c", "ncV:128:", "avx512vbmi,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vpermi2varqi256, "V32cV32cV32cV32c", "ncV:256:", "avx512vbmi,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vpermi2varqi512, "V64cV64cV64cV64c", "ncV:512:", "avx512vbmi,evex512")
-TARGET_BUILTIN(__builtin_ia32_vpermi2varhi128, "V8sV8sV8sV8s", "ncV:128:", "avx512vl,avx512bw")
-TARGET_BUILTIN(__builtin_ia32_vpermi2varhi256, "V16sV16sV16sV16s", "ncV:256:", "avx512vl,avx512bw")
-TARGET_BUILTIN(__builtin_ia32_vpermi2varhi512, "V32sV32sV32sV32s", "ncV:512:", "avx512bw,evex512")
-
-TARGET_BUILTIN(__builtin_ia32_vpshldd128, "V4iV4iV4iIi", "ncV:128:", "avx512vl,avx512vbmi2")
-TARGET_BUILTIN(__builtin_ia32_vpshldd256, "V8iV8iV8iIi", "ncV:256:", "avx512vl,avx512vbmi2")
-TARGET_BUILTIN(__builtin_ia32_vpshldd512, "V16iV16iV16iIi", "ncV:512:", "avx512vbmi2,evex512")
-TARGET_BUILTIN(__builtin_ia32_vpshldq128, "V2OiV2OiV2OiIi", "ncV:128:", "avx512vl,avx512vbmi2")
-TARGET_BUILTIN(__builtin_ia32_vpshldq256, "V4OiV4OiV4OiIi", "ncV:256:", "avx512vl,avx512vbmi2")
-TARGET_BUILTIN(__builtin_ia32_vpshldq512, "V8OiV8OiV8OiIi", "ncV:512:", "avx512vbmi2,evex512")
-TARGET_BUILTIN(__builtin_ia32_vpshldw128, "V8sV8sV8sIi", "ncV:128:", "avx512vl,avx512vbmi2")
-TARGET_BUILTIN(__builtin_ia32_vpshldw256, "V16sV16sV16sIi", "ncV:256:", "avx512vl,avx512vbmi2")
-TARGET_BUILTIN(__builtin_ia32_vpshldw512, "V32sV32sV32sIi", "ncV:512:", "avx512vbmi2,evex512")
-
-TARGET_BUILTIN(__builtin_ia32_vpshldvd128, "V4iV4iV4iV4i", "ncV:128:", "avx512vl,avx512vbmi2")
-TARGET_BUILTIN(__builtin_ia32_vpshldvd256, "V8iV8iV8iV8i", "ncV:256:", "avx512vl,avx512vbmi2")
-TARGET_BUILTIN(__builtin_ia32_vpshldvd512, "V16iV16iV16iV16i", "ncV:512:", "avx512vbmi2,evex512")
-TARGET_BUILTIN(__builtin_ia32_vpshldvq128, "V2OiV2OiV2OiV2Oi", "ncV:128:", "avx512vl,avx512vbmi2")
-TARGET_BUILTIN(__builtin_ia32_vpshldvq256, "V4OiV4OiV4OiV4Oi", "ncV:256:", "avx512vl,avx512vbmi2")
-TARGET_BUILTIN(__builtin_ia32_vpshldvq512, "V8OiV8OiV8OiV8Oi", "ncV:512:", "avx512vbmi2,evex512")
-TARGET_BUILTIN(__builtin_ia32_vpshldvw128, "V8sV8sV8sV8s", "ncV:128:", "avx512vl,avx512vbmi2")
-TARGET_BUILTIN(__builtin_ia32_vpshldvw256, "V16sV16sV16sV16s", "ncV:256:", "avx512vl,avx512vbmi2")
-TARGET_BUILTIN(__builtin_ia32_vpshldvw512, "V32sV32sV32sV32s", "ncV:512:", "avx512vbmi2,evex512")
-
-TARGET_BUILTIN(__builtin_ia32_vpshrdvd128, "V4iV4iV4iV4i", "ncV:128:", "avx512vl,avx512vbmi2")
-TARGET_BUILTIN(__builtin_ia32_vpshrdvd256, "V8iV8iV8iV8i", "ncV:256:", "avx512vl,avx512vbmi2")
-TARGET_BUILTIN(__builtin_ia32_vpshrdvd512, "V16iV16iV16iV16i", "ncV:512:", "avx512vbmi2,evex512")
-TARGET_BUILTIN(__builtin_ia32_vpshrdvq128, "V2OiV2OiV2OiV2Oi", "ncV:128:", "avx512vl,avx512vbmi2")
-TARGET_BUILTIN(__builtin_ia32_vpshrdvq256, "V4OiV4OiV4OiV4Oi", "ncV:256:", "avx512vl,avx512vbmi2")
-TARGET_BUILTIN(__builtin_ia32_vpshrdvq512, "V8OiV8OiV8OiV8Oi", "ncV:512:", "avx512vbmi2,evex512")
-TARGET_BUILTIN(__builtin_ia32_vpshrdvw128, "V8sV8sV8sV8s", "ncV:128:", "avx512vl,avx512vbmi2")
-TARGET_BUILTIN(__builtin_ia32_vpshrdvw256, "V16sV16sV16sV16s", "ncV:256:", "avx512vl,avx512vbmi2")
-TARGET_BUILTIN(__builtin_ia32_vpshrdvw512, "V32sV32sV32sV32s", "ncV:512:", "avx512vbmi2,evex512")
-
-TARGET_BUILTIN(__builtin_ia32_vpshrdd128, "V4iV4iV4iIi", "ncV:128:", "avx512vl,avx512vbmi2")
-TARGET_BUILTIN(__builtin_ia32_vpshrdd256, "V8iV8iV8iIi", "ncV:256:", "avx512vl,avx512vbmi2")
-TARGET_BUILTIN(__builtin_ia32_vpshrdd512, "V16iV16iV16iIi", "ncV:512:", "avx512vbmi2,evex512")
-TARGET_BUILTIN(__builtin_ia32_vpshrdq128, "V2OiV2OiV2OiIi", "ncV:128:", "avx512vl,avx512vbmi2")
-TARGET_BUILTIN(__builtin_ia32_vpshrdq256, "V4OiV4OiV4OiIi", "ncV:256:", "avx512vl,avx512vbmi2")
-TARGET_BUILTIN(__builtin_ia32_vpshrdq512, "V8OiV8OiV8OiIi", "ncV:512:", "avx512vbmi2,evex512")
-TARGET_BUILTIN(__builtin_ia32_vpshrdw128, "V8sV8sV8sIi", "ncV:128:", "avx512vl,avx512vbmi2")
-TARGET_BUILTIN(__builtin_ia32_vpshrdw256, "V16sV16sV16sIi", "ncV:256:", "avx512vl,avx512vbmi2")
-TARGET_BUILTIN(__builtin_ia32_vpshrdw512, "V32sV32sV32sIi", "ncV:512:", "avx512vbmi2,evex512")
-
-TARGET_BUILTIN(__builtin_ia32_pmovswb512_mask, "V32cV32sV32cUi", "ncV:512:", "avx512bw,evex512")
-TARGET_BUILTIN(__builtin_ia32_pmovuswb512_mask, "V32cV32sV32cUi", "ncV:512:", "avx512bw,evex512")
-TARGET_BUILTIN(__builtin_ia32_pmovwb512_mask, "V32cV32sV32cUi", "ncV:512:", "avx512bw,evex512")
-TARGET_BUILTIN(__builtin_ia32_cvtpd2qq128_mask, "V2OiV2dV2OiUc", "ncV:128:", "avx512vl,avx512dq")
-TARGET_BUILTIN(__builtin_ia32_cvtpd2qq256_mask, "V4OiV4dV4OiUc", "ncV:256:", "avx512vl,avx512dq")
-TARGET_BUILTIN(__builtin_ia32_cvtpd2uqq128_mask, "V2OiV2dV2OiUc", "ncV:128:", "avx512vl,avx512dq")
-TARGET_BUILTIN(__builtin_ia32_cvtpd2uqq256_mask, "V4OiV4dV4OiUc", "ncV:256:", "avx512vl,avx512dq")
-TARGET_BUILTIN(__builtin_ia32_cvtps2qq128_mask, "V2OiV4fV2OiUc", "ncV:128:", "avx512vl,avx512dq")
-TARGET_BUILTIN(__builtin_ia32_cvtps2qq256_mask, "V4OiV4fV4OiUc", "ncV:256:", "avx512vl,avx512dq")
-TARGET_BUILTIN(__builtin_ia32_cvtps2uqq128_mask, "V2OiV4fV2OiUc", "ncV:128:", "avx512vl,avx512dq")
-TARGET_BUILTIN(__builtin_ia32_cvtps2uqq256_mask, "V4OiV4fV4OiUc", "ncV:256:", "avx512vl,avx512dq")
-TARGET_BUILTIN(__builtin_ia32_cvtqq2ps128_mask, "V4fV2OiV4fUc", "ncV:128:", "avx512vl,avx512dq")
-TARGET_BUILTIN(__builtin_ia32_cvttpd2qq128_mask, "V2OiV2dV2OiUc", "ncV:128:", "avx512vl,avx512dq")
-TARGET_BUILTIN(__builtin_ia32_cvttpd2qq256_mask, "V4OiV4dV4OiUc", "ncV:256:", "avx512vl,avx512dq")
-TARGET_BUILTIN(__builtin_ia32_cvttpd2uqq128_mask, "V2OiV2dV2OiUc", "ncV:128:", "avx512vl,avx512dq")
-TARGET_BUILTIN(__builtin_ia32_cvttpd2uqq256_mask, "V4OiV4dV4OiUc", "ncV:256:", "avx512vl,avx512dq")
-TARGET_BUILTIN(__builtin_ia32_cvttps2qq128_mask, "V2OiV4fV2OiUc", "ncV:128:", "avx512vl,avx512dq")
-TARGET_BUILTIN(__builtin_ia32_cvttps2qq256_mask, "V4OiV4fV4OiUc", "ncV:256:", "avx512vl,avx512dq")
-TARGET_BUILTIN(__builtin_ia32_cvttps2uqq128_mask, "V2OiV4fV2OiUc", "ncV:128:", "avx512vl,avx512dq")
-TARGET_BUILTIN(__builtin_ia32_cvttps2uqq256_mask, "V4OiV4fV4OiUc", "ncV:256:", "avx512vl,avx512dq")
-TARGET_BUILTIN(__builtin_ia32_cvtuqq2ps128_mask, "V4fV2OiV4fUc", "ncV:128:", "avx512vl,avx512dq")
-TARGET_BUILTIN(__builtin_ia32_rangepd128_mask, "V2dV2dV2dIiV2dUc", "ncV:128:", "avx512vl,avx512dq")
-TARGET_BUILTIN(__builtin_ia32_rangepd256_mask, "V4dV4dV4dIiV4dUc", "ncV:256:", "avx512vl,avx512dq")
-TARGET_BUILTIN(__builtin_ia32_rangeps128_mask, "V4fV4fV4fIiV4fUc", "ncV:128:", "avx512vl,avx512dq")
-TARGET_BUILTIN(__builtin_ia32_rangeps256_mask, "V8fV8fV8fIiV8fUc", "ncV:256:", "avx512vl,avx512dq")
-TARGET_BUILTIN(__builtin_ia32_rangesd128_round_mask, "V2dV2dV2dV2dUcIiIi", "ncV:128:", "avx512dq")
-TARGET_BUILTIN(__builtin_ia32_rangess128_round_mask, "V4fV4fV4fV4fUcIiIi", "ncV:128:", "avx512dq")
-TARGET_BUILTIN(__builtin_ia32_reducepd128_mask, "V2dV2dIiV2dUc", "ncV:128:", "avx512vl,avx512dq")
-TARGET_BUILTIN(__builtin_ia32_reducepd256_mask, "V4dV4dIiV4dUc", "ncV:256:", "avx512vl,avx512dq")
-TARGET_BUILTIN(__builtin_ia32_reduceps128_mask, "V4fV4fIiV4fUc", "ncV:128:", "avx512vl,avx512dq")
-TARGET_BUILTIN(__builtin_ia32_reduceps256_mask, "V8fV8fIiV8fUc", "ncV:256:", "avx512vl,avx512dq")
-TARGET_BUILTIN(__builtin_ia32_reducesd_mask, "V2dV2dV2dV2dUcIiIi", "ncV:128:", "avx512dq")
-TARGET_BUILTIN(__builtin_ia32_reducess_mask, "V4fV4fV4fV4fUcIiIi", "ncV:128:", "avx512dq")
-TARGET_BUILTIN(__builtin_ia32_pmovswb128_mask, "V16cV8sV16cUc", "ncV:128:", "avx512vl,avx512bw")
-TARGET_BUILTIN(__builtin_ia32_pmovswb256_mask, "V16cV16sV16cUs", "ncV:256:", "avx512vl,avx512bw")
-TARGET_BUILTIN(__builtin_ia32_pmovuswb128_mask, "V16cV8sV16cUc", "ncV:128:", "avx512vl,avx512bw")
-TARGET_BUILTIN(__builtin_ia32_pmovuswb256_mask, "V16cV16sV16cUs", "ncV:256:", "avx512vl,avx512bw")
-TARGET_BUILTIN(__builtin_ia32_pmovwb128_mask, "V16cV8sV16cUc", "ncV:128:", "avx512vl,avx512bw")
-TARGET_BUILTIN(__builtin_ia32_cvtpd2qq512_mask, "V8OiV8dV8OiUcIi", "ncV:512:", "avx512dq,evex512")
-TARGET_BUILTIN(__builtin_ia32_cvtpd2uqq512_mask, "V8OiV8dV8OiUcIi", "ncV:512:", "avx512dq,evex512")
-TARGET_BUILTIN(__builtin_ia32_cvtps2qq512_mask, "V8OiV8fV8OiUcIi", "ncV:512:", "avx512dq,evex512")
-TARGET_BUILTIN(__builtin_ia32_cvtps2uqq512_mask, "V8OiV8fV8OiUcIi", "ncV:512:", "avx512dq,evex512")
-TARGET_BUILTIN(__builtin_ia32_cvtqq2pd512_mask, "V8dV8OiV8dUcIi", "ncV:512:", "avx512dq,evex512")
-TARGET_BUILTIN(__builtin_ia32_cvtqq2ps512_mask, "V8fV8OiV8fUcIi", "ncV:512:", "avx512dq,evex512")
-TARGET_BUILTIN(__builtin_ia32_cvttpd2qq512_mask, "V8OiV8dV8OiUcIi", "ncV:512:", "avx512dq,evex512")
-TARGET_BUILTIN(__builtin_ia32_cvttpd2uqq512_mask, "V8OiV8dV8OiUcIi", "ncV:512:", "avx512dq,evex512")
-TARGET_BUILTIN(__builtin_ia32_cvttps2qq512_mask, "V8OiV8fV8OiUcIi", "ncV:512:", "avx512dq,evex512")
-TARGET_BUILTIN(__builtin_ia32_cvttps2uqq512_mask, "V8OiV8fV8OiUcIi", "ncV:512:", "avx512dq,evex512")
-TARGET_BUILTIN(__builtin_ia32_cvtuqq2pd512_mask, "V8dV8OiV8dUcIi", "ncV:512:", "avx512dq,evex512")
-TARGET_BUILTIN(__builtin_ia32_cvtuqq2ps512_mask, "V8fV8OiV8fUcIi", "ncV:512:", "avx512dq,evex512")
-TARGET_BUILTIN(__builtin_ia32_rangepd512_mask, "V8dV8dV8dIiV8dUcIi", "ncV:512:", "avx512dq,evex512")
-TARGET_BUILTIN(__builtin_ia32_rangeps512_mask, "V16fV16fV16fIiV16fUsIi", "ncV:512:", "avx512dq,evex512")
-TARGET_BUILTIN(__builtin_ia32_reducepd512_mask, "V8dV8dIiV8dUcIi", "ncV:512:", "avx512dq,evex512")
-TARGET_BUILTIN(__builtin_ia32_reduceps512_mask, "V16fV16fIiV16fUsIi", "ncV:512:", "avx512dq,evex512")
-TARGET_BUILTIN(__builtin_ia32_prold512, "V16iV16iIi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_prolq512, "V8OiV8OiIi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_prold128, "V4iV4iIi", "ncV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_prold256, "V8iV8iIi", "ncV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_prolq128, "V2OiV2OiIi", "ncV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_prolq256, "V4OiV4OiIi", "ncV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_prolvd512, "V16iV16iV16i", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_prolvq512, "V8OiV8OiV8Oi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_prord512, "V16iV16iIi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_prorq512, "V8OiV8OiIi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_prolvd128, "V4iV4iV4i", "ncV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_prolvd256, "V8iV8iV8i", "ncV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_prolvq128, "V2OiV2OiV2Oi", "ncV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_prolvq256, "V4OiV4OiV4Oi", "ncV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_prord128, "V4iV4iIi", "ncV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_prord256, "V8iV8iIi", "ncV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_prorq128, "V2OiV2OiIi", "ncV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_prorq256, "V4OiV4OiIi", "ncV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_prorvd512, "V16iV16iV16i", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_prorvq512, "V8OiV8OiV8Oi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_prorvd128, "V4iV4iV4i", "ncV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_prorvd256, "V8iV8iV8i", "ncV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_prorvq128, "V2OiV2OiV2Oi", "ncV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_prorvq256, "V4OiV4OiV4Oi", "ncV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_pshufhw512, "V32sV32sIi", "ncV:512:", "avx512bw,evex512")
-TARGET_BUILTIN(__builtin_ia32_pshuflw512, "V32sV32sIi", "ncV:512:", "avx512bw,evex512")
-TARGET_BUILTIN(__builtin_ia32_psllv32hi, "V32sV32sV32s", "ncV:512:", "avx512bw,evex512")
-TARGET_BUILTIN(__builtin_ia32_psllw512, "V32sV32sV8s", "ncV:512:", "avx512bw,evex512")
-TARGET_BUILTIN(__builtin_ia32_psllwi512, "V32sV32si", "ncV:512:", "avx512bw,evex512")
-TARGET_BUILTIN(__builtin_ia32_psllv16hi, "V16sV16sV16s", "ncV:256:", "avx512bw,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_psllv8hi, "V8sV8sV8s", "ncV:128:", "avx512bw,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_pslldi512, "V16iV16ii", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_psllqi512, "V8OiV8Oii", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_psrlv32hi, "V32sV32sV32s", "ncV:512:", "avx512bw,evex512")
-TARGET_BUILTIN(__builtin_ia32_psrlv16hi, "V16sV16sV16s", "ncV:256:", "avx512bw,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_psrlv8hi, "V8sV8sV8s", "ncV:128:", "avx512bw,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_psrldi512, "V16iV16ii", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_psrlqi512, "V8OiV8Oii", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_psrav32hi, "V32sV32sV32s", "ncV:512:", "avx512bw,evex512")
-TARGET_BUILTIN(__builtin_ia32_psrav16hi, "V16sV16sV16s", "ncV:256:", "avx512bw,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_psrav8hi, "V8sV8sV8s", "ncV:128:", "avx512bw,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_psravq128, "V2OiV2OiV2Oi", "ncV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_psravq256, "V4OiV4OiV4Oi", "ncV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_psraw512, "V32sV32sV8s", "ncV:512:", "avx512bw,evex512")
-TARGET_BUILTIN(__builtin_ia32_psrawi512, "V32sV32si", "ncV:512:", "avx512bw,evex512")
-TARGET_BUILTIN(__builtin_ia32_psrlw512, "V32sV32sV8s", "ncV:512:", "avx512bw,evex512")
-TARGET_BUILTIN(__builtin_ia32_psrlwi512, "V32sV32si", "ncV:512:", "avx512bw,evex512")
-TARGET_BUILTIN(__builtin_ia32_pslldqi512_byteshift, "V8OiV8OiIi", "ncV:512:", "avx512bw,evex512")
-TARGET_BUILTIN(__builtin_ia32_psrldqi512_byteshift, "V8OiV8OiIi", "ncV:512:", "avx512bw,evex512")
-TARGET_BUILTIN(__builtin_ia32_movdqa32load128_mask, "V4iV4iC*V4iUc", "nV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_movdqa32load256_mask, "V8iV8iC*V8iUc", "nV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_movdqa32load512_mask, "V16iV16iC*V16iUs", "nV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_movdqa32store512_mask, "vV16i*V16iUs", "nV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_movdqa64load512_mask, "V8OiV8OiC*V8OiUc", "nV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_movdqa64store512_mask, "vV8Oi*V8OiUc", "nV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_movdqa32store128_mask, "vV4i*V4iUc", "nV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_movdqa32store256_mask, "vV8i*V8iUc", "nV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_movdqa64load128_mask, "V2OiV2OiC*V2OiUc", "nV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_movdqa64load256_mask, "V4OiV4OiC*V4OiUc", "nV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_movdqa64store128_mask, "vV2Oi*V2OiUc", "nV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_movdqa64store256_mask, "vV4Oi*V4OiUc", "nV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vpmadd52huq512, "V8OiV8OiV8OiV8Oi", "ncV:512:", "avx512ifma,evex512")
-TARGET_BUILTIN(__builtin_ia32_vpmadd52luq512, "V8OiV8OiV8OiV8Oi", "ncV:512:", "avx512ifma,evex512")
-TARGET_BUILTIN(__builtin_ia32_vpmadd52huq128, "V2OiV2OiV2OiV2Oi", "ncV:128:", "avx512ifma,avx512vl|avxifma")
-TARGET_BUILTIN(__builtin_ia32_vpmadd52huq256, "V4OiV4OiV4OiV4Oi", "ncV:256:", "avx512ifma,avx512vl|avxifma")
-TARGET_BUILTIN(__builtin_ia32_vpmadd52luq128, "V2OiV2OiV2OiV2Oi", "ncV:128:", "avx512ifma,avx512vl|avxifma")
-TARGET_BUILTIN(__builtin_ia32_vpmadd52luq256, "V4OiV4OiV4OiV4Oi", "ncV:256:", "avx512ifma,avx512vl|avxifma")
-TARGET_BUILTIN(__builtin_ia32_vcomisd, "iV2dV2dIiIi", "ncV:128:", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_vcomiss, "iV4fV4fIiIi", "ncV:128:", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_kunpckdi, "UOiUOiUOi", "nc", "avx512bw")
-TARGET_BUILTIN(__builtin_ia32_kunpcksi, "UiUiUi", "nc", "avx512bw")
-TARGET_BUILTIN(__builtin_ia32_loaddquhi512_mask, "V32sV32sC*V32sUi", "nV:512:", "avx512bw,evex512")
-TARGET_BUILTIN(__builtin_ia32_loaddquqi512_mask, "V64cV64cC*V64cUOi", "nV:512:", "avx512bw,evex512")
-TARGET_BUILTIN(__builtin_ia32_fixupimmpd512_mask, "V8dV8dV8dV8OiIiUcIi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_fixupimmpd512_maskz, "V8dV8dV8dV8OiIiUcIi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_fixupimmps512_mask, "V16fV16fV16fV16iIiUsIi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_fixupimmps512_maskz, "V16fV16fV16fV16iIiUsIi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_fixupimmsd_mask, "V2dV2dV2dV2OiIiUcIi", "ncV:128:", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_fixupimmsd_maskz, "V2dV2dV2dV2OiIiUcIi", "ncV:128:", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_fixupimmss_mask, "V4fV4fV4fV4iIiUcIi", "ncV:128:", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_fixupimmss_maskz, "V4fV4fV4fV4iIiUcIi", "ncV:128:", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_getexpsd128_round_mask, "V2dV2dV2dV2dUcIi", "ncV:128:", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_getexpss128_round_mask, "V4fV4fV4fV4fUcIi", "ncV:128:", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_getmantsd_round_mask, "V2dV2dV2dIiV2dUcIi", "ncV:128:", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_getmantss_round_mask, "V4fV4fV4fIiV4fUcIi", "ncV:128:", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_loaddquhi128_mask, "V8sV8sC*V8sUc", "nV:128:", "avx512bw,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_loaddquhi256_mask, "V16sV16sC*V16sUs", "nV:256:", "avx512bw,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_loaddquqi128_mask, "V16cV16cC*V16cUs", "nV:128:", "avx512bw,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_loaddquqi256_mask, "V32cV32cC*V32cUi", "nV:256:", "avx512bw,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_fixupimmpd128_mask, "V2dV2dV2dV2OiIiUc", "ncV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_fixupimmpd128_maskz, "V2dV2dV2dV2OiIiUc", "ncV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_fixupimmpd256_mask, "V4dV4dV4dV4OiIiUc", "ncV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_fixupimmpd256_maskz, "V4dV4dV4dV4OiIiUc", "ncV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_fixupimmps128_mask, "V4fV4fV4fV4iIiUc", "ncV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_fixupimmps128_maskz, "V4fV4fV4fV4iIiUc", "ncV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_fixupimmps256_mask, "V8fV8fV8fV8iIiUc", "ncV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_fixupimmps256_maskz, "V8fV8fV8fV8iIiUc", "ncV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_loadapd128_mask, "V2dV2dC*V2dUc", "nV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_loadsd128_mask, "V2dV2dC*V2dUc", "nV:128:", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_loadapd256_mask, "V4dV4dC*V4dUc", "nV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_loadaps128_mask, "V4fV4fC*V4fUc", "nV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_loadss128_mask, "V4fV4fC*V4fUc", "nV:128:", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_loadaps256_mask, "V8fV8fC*V8fUc", "nV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_loaddqudi128_mask, "V2OiV2OiC*V2OiUc", "nV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_loaddqudi256_mask, "V4OiV4OiC*V4OiUc", "nV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_loaddqusi128_mask, "V4iV4iC*V4iUc", "nV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_loaddqusi256_mask, "V8iV8iC*V8iUc", "nV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_loadupd128_mask, "V2dV2dC*V2dUc", "nV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_loadupd256_mask, "V4dV4dC*V4dUc", "nV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_loadups128_mask, "V4fV4fC*V4fUc", "nV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_loadups256_mask, "V8fV8fC*V8fUc", "nV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_storedquhi512_mask, "vV32s*V32sUi", "nV:512:", "avx512bw,evex512")
-TARGET_BUILTIN(__builtin_ia32_storedquqi512_mask, "vV64c*V64cUOi", "nV:512:", "avx512bw,evex512")
-TARGET_BUILTIN(__builtin_ia32_storedquhi128_mask, "vV8s*V8sUc", "nV:128:", "avx512vl,avx512bw")
-TARGET_BUILTIN(__builtin_ia32_storedquhi256_mask, "vV16s*V16sUs", "nV:256:", "avx512vl,avx512bw")
-TARGET_BUILTIN(__builtin_ia32_storedquqi128_mask, "vV16c*V16cUs", "nV:128:", "avx512vl,avx512bw")
-TARGET_BUILTIN(__builtin_ia32_storedquqi256_mask, "vV32c*V32cUi", "nV:256:", "avx512vl,avx512bw")
-TARGET_BUILTIN(__builtin_ia32_storeapd128_mask, "vV2d*V2dUc", "nV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_storesd128_mask, "vV2d*V2dUc", "nV:128:", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_storeapd256_mask, "vV4d*V4dUc", "nV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_storeaps128_mask, "vV4f*V4fUc", "nV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_storess128_mask, "vV4f*V4fUc", "nV:128:", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_storeaps256_mask, "vV8f*V8fUc", "nV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_storedqudi128_mask, "vV2Oi*V2OiUc", "nV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_storedqudi256_mask, "vV4Oi*V4OiUc", "nV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_storedqusi128_mask, "vV4i*V4iUc", "nV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_storedqusi256_mask, "vV8i*V8iUc", "nV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_storeupd128_mask, "vV2d*V2dUc", "nV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_storeupd256_mask, "vV4d*V4dUc", "nV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_storeups128_mask, "vV4f*V4fUc", "nV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_storeups256_mask, "vV8f*V8fUc", "nV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_rcp14pd128_mask, "V2dV2dV2dUc", "ncV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_rcp14pd256_mask, "V4dV4dV4dUc", "ncV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_rcp14ps128_mask, "V4fV4fV4fUc", "ncV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_rcp14ps256_mask, "V8fV8fV8fUc", "ncV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vplzcntd_128, "V4iV4i", "ncV:128:", "avx512cd,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vplzcntd_256, "V8iV8i", "ncV:256:", "avx512cd,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vplzcntq_128, "V2OiV2Oi", "ncV:128:", "avx512cd,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vplzcntq_256, "V4OiV4Oi", "ncV:256:", "avx512cd,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vcvtsd2si32, "iV2dIi", "ncV:128:", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_vcvtsd2usi32, "UiV2dIi", "ncV:128:", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_vcvtss2si32, "iV4fIi", "ncV:128:", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_vcvtss2usi32, "UiV4fIi", "ncV:128:", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_vcvttsd2si32, "iV2dIi", "ncV:128:", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_vcvttsd2usi32, "UiV2dIi", "ncV:128:", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_vcvttss2si32, "iV4fIi", "ncV:128:", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_vcvttss2usi32, "UiV4fIi", "ncV:128:", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_vpermilpd512, "V8dV8dIi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_vpermilps512, "V16fV16fIi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_vpermilvarpd512, "V8dV8dV8Oi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_vpermilvarps512, "V16fV16fV16i", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_rndscalesd_round_mask, "V2dV2dV2dV2dUcIiIi", "ncV:128:", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_rndscaless_round_mask, "V4fV4fV4fV4fUcIiIi", "ncV:128:", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_scalefpd512_mask, "V8dV8dV8dV8dUcIi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_scalefps512_mask, "V16fV16fV16fV16fUsIi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_scalefsd_round_mask, "V2dV2dV2dV2dUcIi", "ncV:128:", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_scalefss_round_mask, "V4fV4fV4fV4fUcIi", "ncV:128:", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_psradi512, "V16iV16ii", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_psraqi512, "V8OiV8Oii", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_psraq128, "V2OiV2OiV2Oi", "ncV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_psraq256, "V4OiV4OiV2Oi", "ncV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_psraqi128, "V2OiV2Oii", "ncV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_psraqi256, "V4OiV4Oii", "ncV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_pslld512, "V16iV16iV4i", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_psllq512, "V8OiV8OiV2Oi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_psllv16si, "V16iV16iV16i", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_psllv8di, "V8OiV8OiV8Oi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_psrad512, "V16iV16iV4i", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_psraq512, "V8OiV8OiV2Oi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_psrav16si, "V16iV16iV16i", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_psrav8di, "V8OiV8OiV8Oi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_psrld512, "V16iV16iV4i", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_psrlq512, "V8OiV8OiV2Oi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_psrlv16si, "V16iV16iV16i", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_psrlv8di, "V8OiV8OiV8Oi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_pternlogd512_mask, "V16iV16iV16iV16iIiUs", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_pternlogd512_maskz, "V16iV16iV16iV16iIiUs", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_pternlogq512_mask, "V8OiV8OiV8OiV8OiIiUc", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_pternlogq512_maskz, "V8OiV8OiV8OiV8OiIiUc", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_pternlogd128_mask, "V4iV4iV4iV4iIiUc", "ncV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_pternlogd128_maskz, "V4iV4iV4iV4iIiUc", "ncV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_pternlogd256_mask, "V8iV8iV8iV8iIiUc", "ncV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_pternlogd256_maskz, "V8iV8iV8iV8iIiUc", "ncV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_pternlogq128_mask, "V2OiV2OiV2OiV2OiIiUc", "ncV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_pternlogq128_maskz, "V2OiV2OiV2OiV2OiIiUc", "ncV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_pternlogq256_mask, "V4OiV4OiV4OiV4OiIiUc", "ncV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_pternlogq256_maskz, "V4OiV4OiV4OiV4OiIiUc", "ncV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_shuf_f32x4, "V16fV16fV16fIi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_shuf_f64x2, "V8dV8dV8dIi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_shuf_i32x4, "V16iV16iV16iIi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_shuf_i64x2, "V8OiV8OiV8OiIi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_shufpd512, "V8dV8dV8dIi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_shufps512, "V16fV16fV16fIi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_shuf_f32x4_256, "V8fV8fV8fIi", "ncV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_shuf_f64x2_256, "V4dV4dV4dIi", "ncV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_shuf_i32x4_256, "V8iV8iV8iIi", "ncV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_shuf_i64x2_256, "V4OiV4OiV4OiIi", "ncV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_sqrtsd_round_mask, "V2dV2dV2dV2dUcIi", "ncV:128:", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_sqrtss_round_mask, "V4fV4fV4fV4fUcIi", "ncV:128:", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_rsqrt14pd128_mask, "V2dV2dV2dUc", "ncV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_rsqrt14pd256_mask, "V4dV4dV4dUc", "ncV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_rsqrt14ps128_mask, "V4fV4fV4fUc", "ncV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_rsqrt14ps256_mask, "V8fV8fV8fUc", "ncV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_cvtb2mask512, "UOiV64c", "ncV:512:", "avx512bw,evex512")
-TARGET_BUILTIN(__builtin_ia32_cvtmask2b512, "V64cUOi", "ncV:512:", "avx512bw,evex512")
-TARGET_BUILTIN(__builtin_ia32_cvtmask2w512, "V32sUi", "ncV:512:", "avx512bw,evex512")
-TARGET_BUILTIN(__builtin_ia32_cvtd2mask512, "UsV16i", "ncV:512:", "avx512dq,evex512")
-TARGET_BUILTIN(__builtin_ia32_cvtmask2d512, "V16iUs", "ncV:512:", "avx512dq,evex512")
-TARGET_BUILTIN(__builtin_ia32_cvtmask2q512, "V8OiUc", "ncV:512:", "avx512dq,evex512")
-TARGET_BUILTIN(__builtin_ia32_cvtq2mask512, "UcV8Oi", "ncV:512:", "avx512dq,evex512")
-TARGET_BUILTIN(__builtin_ia32_cvtb2mask128, "UsV16c", "ncV:128:", "avx512bw,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_cvtb2mask256, "UiV32c", "ncV:256:", "avx512bw,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_cvtmask2b128, "V16cUs", "ncV:128:", "avx512bw,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_cvtmask2b256, "V32cUi", "ncV:256:", "avx512bw,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_cvtmask2w128, "V8sUc", "ncV:128:", "avx512bw,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_cvtmask2w256, "V16sUs", "ncV:256:", "avx512bw,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_cvtd2mask128, "UcV4i", "ncV:128:", "avx512dq,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_cvtd2mask256, "UcV8i", "ncV:256:", "avx512dq,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_cvtmask2d128, "V4iUc", "ncV:128:", "avx512dq,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_cvtmask2d256, "V8iUc", "ncV:256:", "avx512dq,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_cvtmask2q128, "V2OiUc", "ncV:128:", "avx512dq,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_cvtmask2q256, "V4OiUc", "ncV:256:", "avx512dq,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_cvtq2mask128, "UcV2Oi", "ncV:128:", "avx512dq,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_cvtq2mask256, "UcV4Oi", "ncV:256:", "avx512dq,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_pmovsdb512_mask, "V16cV16iV16cUs", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_pmovsdb512mem_mask, "vV16c*V16iUs", "nV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_pmovswb512mem_mask, "vV32c*V32sUi", "nV:512:", "avx512bw,evex512")
-TARGET_BUILTIN(__builtin_ia32_pmovsdw512_mask, "V16sV16iV16sUs", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_pmovsdw512mem_mask, "vV16s*V16iUs", "nV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_pmovsqb512_mask, "V16cV8OiV16cUc", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_pmovsqb512mem_mask, "vV16c*V8OiUc", "nV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_pmovsqd512_mask, "V8iV8OiV8iUc", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_pmovsqd512mem_mask, "vV8i*V8OiUc", "nV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_pmovsqw512_mask, "V8sV8OiV8sUc", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_pmovsqw512mem_mask, "vV8s*V8OiUc", "nV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_pmovsdb128_mask, "V16cV4iV16cUc", "ncV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_pmovsdb128mem_mask, "vV16c*V4iUc", "nV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_pmovswb128mem_mask, "vV16c*V8sUc", "nV:128:", "avx512vl,avx512bw")
-TARGET_BUILTIN(__builtin_ia32_pmovsdb256_mask, "V16cV8iV16cUc", "ncV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_pmovsdb256mem_mask, "vV16c*V8iUc", "nV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_pmovswb256mem_mask, "vV16c*V16sUs", "nV:256:", "avx512vl,avx512bw")
-TARGET_BUILTIN(__builtin_ia32_pmovsdw128_mask, "V8sV4iV8sUc", "ncV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_pmovsdw128mem_mask, "vV8s*V4iUc", "nV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_pmovsdw256_mask, "V8sV8iV8sUc", "ncV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_pmovsdw256mem_mask, "vV8s*V8iUc", "nV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_pmovsqb128_mask, "V16cV2OiV16cUc", "ncV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_pmovsqb128mem_mask, "vV16c*V2OiUc", "nV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_pmovsqb256_mask, "V16cV4OiV16cUc", "ncV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_pmovsqb256mem_mask, "vV16c*V4OiUc", "nV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_pmovsqd128_mask, "V4iV2OiV4iUc", "ncV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_pmovsqd128mem_mask, "vV4i*V2OiUc", "nV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_pmovsqd256_mask, "V4iV4OiV4iUc", "ncV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_pmovsqd256mem_mask, "vV4i*V4OiUc", "nV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_pmovsqw128_mask, "V8sV2OiV8sUc", "ncV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_pmovsqw128mem_mask, "vV8s*V2OiUc", "nV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_pmovsqw256_mask, "V8sV4OiV8sUc", "ncV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_pmovsqw256mem_mask, "vV8s*V4OiUc", "nV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_pmovusdb512_mask, "V16cV16iV16cUs", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_pmovusdb512mem_mask, "vV16c*V16iUs", "nV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_pmovuswb512mem_mask, "vV32c*V32sUi", "nV:512:", "avx512bw,evex512")
-TARGET_BUILTIN(__builtin_ia32_pmovusdw512_mask, "V16sV16iV16sUs", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_pmovusdw512mem_mask, "vV16s*V16iUs", "nV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_pmovusqb512_mask, "V16cV8OiV16cUc", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_pmovusqb512mem_mask, "vV16c*V8OiUc", "nV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_pmovusqd512_mask, "V8iV8OiV8iUc", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_pmovusqd512mem_mask, "vV8i*V8OiUc", "nV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_pmovusqw512_mask, "V8sV8OiV8sUc", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_pmovusqw512mem_mask, "vV8s*V8OiUc", "nV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_pmovusdb128_mask, "V16cV4iV16cUc", "ncV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_pmovusdb128mem_mask, "vV16c*V4iUc", "nV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_pmovuswb128mem_mask, "vV16c*V8sUc", "nV:128:", "avx512vl,avx512bw")
-TARGET_BUILTIN(__builtin_ia32_pmovusdb256_mask, "V16cV8iV16cUc", "ncV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_pmovusdb256mem_mask, "vV16c*V8iUc", "nV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_pmovuswb256mem_mask, "vV16c*V16sUs", "nV:256:", "avx512vl,avx512bw")
-TARGET_BUILTIN(__builtin_ia32_pmovusdw128_mask, "V8sV4iV8sUc", "ncV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_pmovusdw128mem_mask, "vV8s*V4iUc", "nV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_pmovusdw256_mask, "V8sV8iV8sUc", "ncV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_pmovusdw256mem_mask, "vV8s*V8iUc", "nV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_pmovusqb128_mask, "V16cV2OiV16cUc", "ncV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_pmovusqb128mem_mask, "vV16c*V2OiUc", "nV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_pmovusqb256_mask, "V16cV4OiV16cUc", "ncV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_pmovusqb256mem_mask, "vV16c*V4OiUc", "nV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_pmovusqd128_mask, "V4iV2OiV4iUc", "ncV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_pmovusqd128mem_mask, "vV4i*V2OiUc", "nV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_pmovusqd256_mask, "V4iV4OiV4iUc", "ncV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_pmovusqd256mem_mask, "vV4i*V4OiUc", "nV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_pmovusqw128_mask, "V8sV2OiV8sUc", "ncV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_pmovusqw128mem_mask, "vV8s*V2OiUc", "nV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_pmovusqw256_mask, "V8sV4OiV8sUc", "ncV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_pmovusqw256mem_mask, "vV8s*V4OiUc", "nV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_pmovdb512_mask, "V16cV16iV16cUs", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_pmovdb512mem_mask, "vV16c*V16iUs", "nV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_pmovwb512mem_mask, "vV32c*V32sUi", "nV:512:", "avx512bw,evex512")
-TARGET_BUILTIN(__builtin_ia32_pmovdw512_mask, "V16sV16iV16sUs", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_pmovdw512mem_mask, "vV16s*V16iUs", "nV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_pmovqb512_mask, "V16cV8OiV16cUc", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_pmovqb512mem_mask, "vV16c*V8OiUc", "nV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_pmovqd512_mask, "V8iV8OiV8iUc", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_pmovqd512mem_mask, "vV8i*V8OiUc", "nV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_pmovqw512_mask, "V8sV8OiV8sUc", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_pmovqw512mem_mask, "vV8s*V8OiUc", "nV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_pmovdb128_mask, "V16cV4iV16cUc", "ncV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_pmovwb128mem_mask, "vV16c*V8sUc", "nV:128:", "avx512vl,avx512bw")
-TARGET_BUILTIN(__builtin_ia32_pmovdb128mem_mask, "vV16c*V4iUc", "nV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_pmovdb256_mask, "V16cV8iV16cUc", "ncV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_pmovdb256mem_mask, "vV16c*V8iUc", "nV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_pmovwb256mem_mask, "vV16c*V16sUs", "nV:256:", "avx512vl,avx512bw")
-TARGET_BUILTIN(__builtin_ia32_pmovdw128_mask, "V8sV4iV8sUc", "ncV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_pmovdw128mem_mask, "vV8s*V4iUc", "nV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_pmovdw256_mask, "V8sV8iV8sUc", "ncV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_pmovdw256mem_mask, "vV8s*V8iUc", "nV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_pmovqb128_mask, "V16cV2OiV16cUc", "ncV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_pmovqb128mem_mask, "vV16c*V2OiUc", "nV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_pmovqb256_mask, "V16cV4OiV16cUc", "ncV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_pmovqb256mem_mask, "vV16c*V4OiUc", "nV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_pmovqd128_mask, "V4iV2OiV4iUc", "ncV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_pmovqd128mem_mask, "vV4i*V2OiUc", "nV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_pmovqd256mem_mask, "vV4i*V4OiUc", "nV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_pmovqw128_mask, "V8sV2OiV8sUc", "ncV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_pmovqw128mem_mask, "vV8s*V2OiUc", "nV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_pmovqw256_mask, "V8sV4OiV8sUc", "ncV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_pmovqw256mem_mask, "vV8s*V4OiUc", "nV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_extractf32x8_mask, "V8fV16fIiV8fUc", "ncV:512:", "avx512dq,evex512")
-TARGET_BUILTIN(__builtin_ia32_extractf64x2_512_mask, "V2dV8dIiV2dUc", "ncV:512:", "avx512dq,evex512")
-TARGET_BUILTIN(__builtin_ia32_extracti32x8_mask, "V8iV16iIiV8iUc", "ncV:512:", "avx512dq,evex512")
-TARGET_BUILTIN(__builtin_ia32_extracti64x2_512_mask, "V2OiV8OiIiV2OiUc", "ncV:512:", "avx512dq,evex512")
-TARGET_BUILTIN(__builtin_ia32_extracti32x4_mask, "V4iV16iIiV4iUc", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_extracti64x4_mask, "V4OiV8OiIiV4OiUc", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_extractf64x2_256_mask, "V2dV4dIiV2dUc", "ncV:256:", "avx512dq,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_extracti64x2_256_mask, "V2OiV4OiIiV2OiUc", "ncV:256:", "avx512dq,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_extractf32x4_256_mask, "V4fV8fIiV4fUc", "ncV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_extracti32x4_256_mask, "V4iV8iIiV4iUc", "ncV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_insertf32x8, "V16fV16fV8fIi", "ncV:512:", "avx512dq,evex512")
-TARGET_BUILTIN(__builtin_ia32_insertf64x2_512, "V8dV8dV2dIi", "ncV:512:", "avx512dq,evex512")
-TARGET_BUILTIN(__builtin_ia32_inserti32x8, "V16iV16iV8iIi", "ncV:512:", "avx512dq,evex512")
-TARGET_BUILTIN(__builtin_ia32_inserti64x2_512, "V8OiV8OiV2OiIi", "ncV:512:", "avx512dq,evex512")
-TARGET_BUILTIN(__builtin_ia32_insertf64x4, "V8dV8dV4dIi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_inserti64x4, "V8OiV8OiV4OiIi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_insertf64x2_256, "V4dV4dV2dIi", "ncV:256:", "avx512dq,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_inserti64x2_256, "V4OiV4OiV2OiIi", "ncV:256:", "avx512dq,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_insertf32x4_256, "V8fV8fV4fIi", "ncV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_inserti32x4_256, "V8iV8iV4iIi", "ncV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_insertf32x4, "V16fV16fV4fIi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_inserti32x4, "V16iV16iV4iIi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_getmantpd128_mask, "V2dV2dIiV2dUc", "ncV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_getmantpd256_mask, "V4dV4dIiV4dUc", "ncV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_getmantps128_mask, "V4fV4fIiV4fUc", "ncV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_getmantps256_mask, "V8fV8fIiV8fUc", "ncV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_getmantpd512_mask, "V8dV8dIiV8dUcIi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_getmantps512_mask, "V16fV16fIiV16fUsIi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_getexppd512_mask, "V8dV8dV8dUcIi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_getexpps512_mask, "V16fV16fV16fUsIi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_vfmaddss3_mask,  "V4fV4fV4fV4fUcIi", "ncV:128:", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_vfmaddss3_maskz, "V4fV4fV4fV4fUcIi", "ncV:128:", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_vfmaddss3_mask3, "V4fV4fV4fV4fUcIi", "ncV:128:", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_vfmaddsd3_mask,  "V2dV2dV2dV2dUcIi", "ncV:128:", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_vfmaddsd3_maskz, "V2dV2dV2dV2dUcIi", "ncV:128:", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_vfmaddsd3_mask3, "V2dV2dV2dV2dUcIi", "ncV:128:", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_vfmsubsd3_mask3, "V2dV2dV2dV2dUcIi", "ncV:128:", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_vfmsubss3_mask3, "V4fV4fV4fV4fUcIi", "ncV:128:", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_permdf512, "V8dV8dIi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_permdi512, "V8OiV8OiIi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_permvarhi512, "V32sV32sV32s", "ncV:512:", "avx512bw,evex512")
-TARGET_BUILTIN(__builtin_ia32_permvardf512, "V8dV8dV8Oi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_permvardi512, "V8OiV8OiV8Oi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_permvarsf512, "V16fV16fV16i", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_permvarsi512, "V16iV16iV16i", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_permvarqi512, "V64cV64cV64c", "ncV:512:", "avx512vbmi,evex512")
-TARGET_BUILTIN(__builtin_ia32_permvarqi128, "V16cV16cV16c", "ncV:128:", "avx512vbmi,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_permvarqi256, "V32cV32cV32c", "ncV:256:", "avx512vbmi,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_permvarhi128, "V8sV8sV8s", "ncV:128:", "avx512bw,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_permvarhi256, "V16sV16sV16s", "ncV:256:", "avx512bw,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_permvardf256, "V4dV4dV4Oi", "ncV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_permvardi256, "V4OiV4OiV4Oi", "ncV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_fpclasspd128_mask, "UcV2dIiUc", "ncV:128:", "avx512dq,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_fpclasspd256_mask, "UcV4dIiUc", "ncV:256:", "avx512dq,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_fpclassps128_mask, "UcV4fIiUc", "ncV:128:", "avx512dq,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_fpclassps256_mask, "UcV8fIiUc", "ncV:256:", "avx512dq,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_fpclassps512_mask, "UsV16fIiUs", "ncV:512:", "avx512dq,evex512")
-TARGET_BUILTIN(__builtin_ia32_fpclasspd512_mask, "UcV8dIiUc", "ncV:512:", "avx512dq,evex512")
-TARGET_BUILTIN(__builtin_ia32_fpclasssd_mask, "UcV2dIiUc", "ncV:128:", "avx512dq")
-TARGET_BUILTIN(__builtin_ia32_fpclassss_mask, "UcV4fIiUc", "ncV:128:", "avx512dq")
-TARGET_BUILTIN(__builtin_ia32_kaddqi, "UcUcUc", "nc", "avx512dq")
-TARGET_BUILTIN(__builtin_ia32_kaddhi, "UsUsUs", "nc", "avx512dq")
-TARGET_BUILTIN(__builtin_ia32_kaddsi, "UiUiUi", "nc", "avx512bw")
-TARGET_BUILTIN(__builtin_ia32_kadddi, "UOiUOiUOi", "nc", "avx512bw")
-TARGET_BUILTIN(__builtin_ia32_kandqi, "UcUcUc", "nc", "avx512dq")
-TARGET_BUILTIN(__builtin_ia32_kandhi, "UsUsUs", "nc", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_kandsi, "UiUiUi", "nc", "avx512bw")
-TARGET_BUILTIN(__builtin_ia32_kanddi, "UOiUOiUOi", "nc", "avx512bw")
-TARGET_BUILTIN(__builtin_ia32_kandnqi, "UcUcUc", "nc", "avx512dq")
-TARGET_BUILTIN(__builtin_ia32_kandnhi, "UsUsUs", "nc", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_kandnsi, "UiUiUi", "nc", "avx512bw")
-TARGET_BUILTIN(__builtin_ia32_kandndi, "UOiUOiUOi", "nc", "avx512bw")
-TARGET_BUILTIN(__builtin_ia32_korqi, "UcUcUc", "nc", "avx512dq")
-TARGET_BUILTIN(__builtin_ia32_korhi, "UsUsUs", "nc", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_korsi, "UiUiUi", "nc", "avx512bw")
-TARGET_BUILTIN(__builtin_ia32_kordi, "UOiUOiUOi", "nc", "avx512bw")
-TARGET_BUILTIN(__builtin_ia32_kortestcqi, "iUcUc", "nc", "avx512dq")
-TARGET_BUILTIN(__builtin_ia32_kortestzqi, "iUcUc", "nc", "avx512dq")
-TARGET_BUILTIN(__builtin_ia32_kortestchi, "iUsUs", "nc", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_kortestzhi, "iUsUs", "nc", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_kortestcsi, "iUiUi", "nc", "avx512bw")
-TARGET_BUILTIN(__builtin_ia32_kortestzsi, "iUiUi", "nc", "avx512bw")
-TARGET_BUILTIN(__builtin_ia32_kortestcdi, "iUOiUOi", "nc", "avx512bw")
-TARGET_BUILTIN(__builtin_ia32_kortestzdi, "iUOiUOi", "nc", "avx512bw")
-TARGET_BUILTIN(__builtin_ia32_ktestcqi, "iUcUc", "nc", "avx512dq")
-TARGET_BUILTIN(__builtin_ia32_ktestzqi, "iUcUc", "nc", "avx512dq")
-TARGET_BUILTIN(__builtin_ia32_ktestchi, "iUsUs", "nc", "avx512dq")
-TARGET_BUILTIN(__builtin_ia32_ktestzhi, "iUsUs", "nc", "avx512dq")
-TARGET_BUILTIN(__builtin_ia32_ktestcsi, "iUiUi", "nc", "avx512bw")
-TARGET_BUILTIN(__builtin_ia32_ktestzsi, "iUiUi", "nc", "avx512bw")
-TARGET_BUILTIN(__builtin_ia32_ktestcdi, "iUOiUOi", "nc", "avx512bw")
-TARGET_BUILTIN(__builtin_ia32_ktestzdi, "iUOiUOi", "nc", "avx512bw")
-TARGET_BUILTIN(__builtin_ia32_kunpckhi, "UsUsUs", "nc", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_kxnorqi, "UcUcUc", "nc", "avx512dq")
-TARGET_BUILTIN(__builtin_ia32_kxnorhi, "UsUsUs", "nc", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_kxnorsi, "UiUiUi", "nc", "avx512bw")
-TARGET_BUILTIN(__builtin_ia32_kxnordi, "UOiUOiUOi", "nc", "avx512bw")
-TARGET_BUILTIN(__builtin_ia32_kxorqi, "UcUcUc", "nc", "avx512dq")
-TARGET_BUILTIN(__builtin_ia32_kxorhi, "UsUsUs", "nc", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_kxorsi, "UiUiUi", "nc", "avx512bw")
-TARGET_BUILTIN(__builtin_ia32_kxordi, "UOiUOiUOi", "nc", "avx512bw")
-TARGET_BUILTIN(__builtin_ia32_kshiftliqi, "UcUcIUi", "nc", "avx512dq")
-TARGET_BUILTIN(__builtin_ia32_kshiftlihi, "UsUsIUi", "nc", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_kshiftlisi, "UiUiIUi", "nc", "avx512bw")
-TARGET_BUILTIN(__builtin_ia32_kshiftlidi, "UOiUOiIUi", "nc", "avx512bw")
-TARGET_BUILTIN(__builtin_ia32_kshiftriqi, "UcUcIUi", "nc", "avx512dq")
-TARGET_BUILTIN(__builtin_ia32_kshiftrihi, "UsUsIUi", "nc", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_kshiftrisi, "UiUiIUi", "nc", "avx512bw")
-TARGET_BUILTIN(__builtin_ia32_kshiftridi, "UOiUOiIUi", "nc", "avx512bw")
-TARGET_BUILTIN(__builtin_ia32_kmovb, "UcUc", "nc", "avx512dq")
-TARGET_BUILTIN(__builtin_ia32_kmovw, "UsUs", "nc", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_kmovd, "UiUi", "nc", "avx512bw")
-TARGET_BUILTIN(__builtin_ia32_kmovq, "UOiUOi", "nc", "avx512bw")
-TARGET_BUILTIN(__builtin_ia32_palignr512, "V64cV64cV64cIi", "ncV:512:", "avx512bw,evex512")
-TARGET_BUILTIN(__builtin_ia32_dbpsadbw128, "V8sV16cV16cIi", "ncV:128:", "avx512bw,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_dbpsadbw256, "V16sV32cV32cIi", "ncV:256:", "avx512bw,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_dbpsadbw512, "V32sV64cV64cIi", "ncV:512:", "avx512bw,evex512")
-TARGET_BUILTIN(__builtin_ia32_psadbw512, "V8OiV64cV64c", "ncV:512:", "avx512bw,evex512")
-TARGET_BUILTIN(__builtin_ia32_compressdf512_mask, "V8dV8dV8dUc", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_compressdi512_mask, "V8OiV8OiV8OiUc", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_compresshi512_mask, "V32sV32sV32sUi", "ncV:512:", "avx512vbmi2,evex512")
-TARGET_BUILTIN(__builtin_ia32_compressqi512_mask, "V64cV64cV64cUOi", "ncV:512:", "avx512vbmi2,evex512")
-TARGET_BUILTIN(__builtin_ia32_compresssf512_mask, "V16fV16fV16fUs", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_compresssi512_mask, "V16iV16iV16iUs", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_cmpsd_mask, "UcV2dV2dIiUcIi", "ncV:128:", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_cmpss_mask, "UcV4fV4fIiUcIi", "ncV:128:", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_pshufd512, "V16iV16iIi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_expanddf512_mask, "V8dV8dV8dUc", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_expanddi512_mask, "V8OiV8OiV8OiUc", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_expandhi512_mask, "V32sV32sV32sUi", "ncV:512:", "avx512vbmi2,evex512")
-TARGET_BUILTIN(__builtin_ia32_expandqi512_mask, "V64cV64cV64cUOi", "ncV:512:", "avx512vbmi2,evex512")
-TARGET_BUILTIN(__builtin_ia32_expandloaddf512_mask, "V8dV8dC*V8dUc", "nV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_expandloaddi512_mask, "V8OiV8OiC*V8OiUc", "nV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_expandloadhi512_mask, "V32sV32sC*V32sUi", "nV:512:", "avx512vbmi2,evex512")
-TARGET_BUILTIN(__builtin_ia32_expandloadqi512_mask, "V64cV64cC*V64cUOi", "nV:512:", "avx512vbmi2,evex512")
-TARGET_BUILTIN(__builtin_ia32_expandloadsf512_mask, "V16fV16fC*V16fUs", "nV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_expandloadsi512_mask, "V16iV16iC*V16iUs", "nV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_expandsf512_mask, "V16fV16fV16fUs", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_expandsi512_mask, "V16iV16iV16iUs", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_cvtps2pd512_mask, "V8dV8fV8dUcIi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_compressstoredf512_mask, "vV8d*V8dUc", "nV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_compressstoredi512_mask, "vV8Oi*V8OiUc", "nV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_compressstorehi512_mask, "vV32s*V32sUi", "nV:512:", "avx512vbmi2,evex512")
-TARGET_BUILTIN(__builtin_ia32_compressstoreqi512_mask, "vV64c*V64cUOi", "nV:512:", "avx512vbmi2,evex512")
-TARGET_BUILTIN(__builtin_ia32_compressstoresf512_mask, "vV16f*V16fUs", "nV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_compressstoresi512_mask, "vV16i*V16iUs", "nV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_vcvtph2ps_mask, "V4fV8sV4fUc", "ncV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vcvtph2ps256_mask, "V8fV8sV8fUc", "ncV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vcvtps2ph_mask, "V8sV4fIiV8sUc", "ncV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vcvtps2ph256_mask, "V8sV8fIiV8sUc", "ncV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_cvtw2mask512, "UiV32s", "ncV:512:", "avx512bw,evex512")
-TARGET_BUILTIN(__builtin_ia32_cvtw2mask128, "UcV8s", "ncV:128:", "avx512bw,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_cvtw2mask256, "UsV16s", "ncV:256:", "avx512bw,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_cvtsd2ss_round_mask, "V4fV4fV2dV4fUcIi", "ncV:128:", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_cvtsi2ss32, "V4fV4fiIi", "ncV:128:", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_cvtss2sd_round_mask, "V2dV2dV4fV2dUcIi", "ncV:128:", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_cvtusi2ss32, "V4fV4fUiIi", "ncV:128:", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_vpmultishiftqb512, "V64cV64cV64c", "ncV:512:", "avx512vbmi,evex512")
-TARGET_BUILTIN(__builtin_ia32_vpmultishiftqb128, "V16cV16cV16c", "ncV:128:", "avx512vbmi,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vpmultishiftqb256, "V32cV32cV32c", "ncV:256:", "avx512vbmi,avx512vl")
-
-// bf16 intrinsics
-TARGET_BUILTIN(__builtin_ia32_cvtne2ps2bf16_128, "V8yV4fV4f", "ncV:128:", "avx512bf16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_cvtne2ps2bf16_256, "V16yV8fV8f", "ncV:256:", "avx512bf16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_cvtne2ps2bf16_512, "V32yV16fV16f", "ncV:512:", "avx512bf16,evex512")
-TARGET_BUILTIN(__builtin_ia32_cvtneps2bf16_128_mask, "V8yV4fV8yUc", "ncV:128:", "avx512bf16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_cvtneps2bf16_256_mask, "V8yV8fV8yUc", "ncV:256:", "avx512bf16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_cvtneps2bf16_512_mask, "V16yV16fV16yUs", "ncV:512:", "avx512bf16,evex512")
-TARGET_BUILTIN(__builtin_ia32_dpbf16ps_128, "V4fV4fV8yV8y", "ncV:128:", "avx512bf16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_dpbf16ps_256, "V8fV8fV16yV16y", "ncV:256:", "avx512bf16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_dpbf16ps_512, "V16fV16fV32yV32y", "ncV:512:", "avx512bf16,evex512")
-TARGET_BUILTIN(__builtin_ia32_cvtsbf162ss_32, "fy", "nc", "avx512bf16")
-
-TARGET_BUILTIN(__builtin_ia32_vp2intersect_q_512, "vV8OiV8OiUc*Uc*", "nV:512:", "avx512vp2intersect,evex512")
-TARGET_BUILTIN(__builtin_ia32_vp2intersect_q_256, "vV4OiV4OiUc*Uc*", "nV:256:", "avx512vp2intersect,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vp2intersect_q_128, "vV2OiV2OiUc*Uc*", "nV:128:", "avx512vp2intersect,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vp2intersect_d_512, "vV16iV16iUs*Us*", "nV:512:", "avx512vp2intersect,evex512")
-TARGET_BUILTIN(__builtin_ia32_vp2intersect_d_256, "vV8iV8iUc*Uc*", "nV:256:", "avx512vp2intersect,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vp2intersect_d_128, "vV4iV4iUc*Uc*", "nV:128:", "avx512vp2intersect,avx512vl")
-
-// AVX512 fp16 intrinsics
-TARGET_BUILTIN(__builtin_ia32_vcomish,       "iV8xV8xIiIi",    "ncV:128:", "avx512fp16")
-TARGET_BUILTIN(__builtin_ia32_addph512,      "V32xV32xV32xIi", "ncV:512:", "avx512fp16,evex512")
-TARGET_BUILTIN(__builtin_ia32_subph512,      "V32xV32xV32xIi", "ncV:512:", "avx512fp16,evex512")
-TARGET_BUILTIN(__builtin_ia32_mulph512,      "V32xV32xV32xIi", "ncV:512:", "avx512fp16,evex512")
-TARGET_BUILTIN(__builtin_ia32_divph512,      "V32xV32xV32xIi", "ncV:512:", "avx512fp16,evex512")
-TARGET_BUILTIN(__builtin_ia32_maxph512,      "V32xV32xV32xIi", "ncV:512:", "avx512fp16,evex512")
-TARGET_BUILTIN(__builtin_ia32_minph512,      "V32xV32xV32xIi", "ncV:512:", "avx512fp16,evex512")
-
-TARGET_BUILTIN(__builtin_ia32_minph256,      "V16xV16xV16x", "ncV:256:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_minph128,      "V8xV8xV8x", "ncV:128:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_maxph256,      "V16xV16xV16x", "ncV:256:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_maxph128,      "V8xV8xV8x", "ncV:128:", "avx512fp16,avx512vl")
-
-TARGET_BUILTIN(__builtin_ia32_addsh_round_mask, "V8xV8xV8xV8xUcIi", "ncV:128:", "avx512fp16")
-TARGET_BUILTIN(__builtin_ia32_divsh_round_mask, "V8xV8xV8xV8xUcIi", "ncV:128:", "avx512fp16")
-TARGET_BUILTIN(__builtin_ia32_mulsh_round_mask, "V8xV8xV8xV8xUcIi", "ncV:128:", "avx512fp16")
-TARGET_BUILTIN(__builtin_ia32_subsh_round_mask, "V8xV8xV8xV8xUcIi", "ncV:128:", "avx512fp16")
-TARGET_BUILTIN(__builtin_ia32_maxsh_round_mask, "V8xV8xV8xV8xUcIi", "ncV:128:", "avx512fp16")
-TARGET_BUILTIN(__builtin_ia32_minsh_round_mask, "V8xV8xV8xV8xUcIi", "ncV:128:", "avx512fp16")
-TARGET_BUILTIN(__builtin_ia32_cmpph512_mask, "UiV32xV32xIiUiIi", "ncV:512:", "avx512fp16,evex512")
-TARGET_BUILTIN(__builtin_ia32_cmpph256_mask, "UsV16xV16xIiUs", "ncV:256:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_cmpph128_mask, "UcV8xV8xIiUc", "ncV:128:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_cmpsh_mask, "UcV8xV8xIiUcIi", "ncV:128:", "avx512fp16")
-TARGET_BUILTIN(__builtin_ia32_loadsh128_mask, "V8xV8xC*V8xUc", "nV:128:", "avx512fp16")
-TARGET_BUILTIN(__builtin_ia32_storesh128_mask, "vV8x*V8xUc", "nV:128:", "avx512fp16")
-
-TARGET_BUILTIN(__builtin_ia32_rcpph128_mask, "V8xV8xV8xUc", "ncV:128:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_rcpph256_mask, "V16xV16xV16xUs", "ncV:256:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_rcpph512_mask, "V32xV32xV32xUi", "ncV:512:", "avx512fp16,evex512")
-TARGET_BUILTIN(__builtin_ia32_rsqrtph128_mask, "V8xV8xV8xUc", "ncV:128:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_rsqrtph256_mask, "V16xV16xV16xUs", "ncV:256:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_rsqrtph512_mask, "V32xV32xV32xUi", "ncV:512:", "avx512fp16,evex512")
-TARGET_BUILTIN(__builtin_ia32_getmantph128_mask, "V8xV8xIiV8xUc", "ncV:128:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_getmantph256_mask, "V16xV16xIiV16xUs", "ncV:256:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_getmantph512_mask, "V32xV32xIiV32xUiIi", "ncV:512:", "avx512fp16,evex512")
-
-TARGET_BUILTIN(__builtin_ia32_getexpph128_mask, "V8xV8xV8xUc", "ncV:128:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_getexpph256_mask, "V16xV16xV16xUs", "ncV:256:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_getexpph512_mask, "V32xV32xV32xUiIi", "ncV:512:", "avx512fp16,evex512")
-
-TARGET_BUILTIN(__builtin_ia32_scalefph128_mask, "V8xV8xV8xV8xUc", "ncV:128:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_scalefph256_mask, "V16xV16xV16xV16xUs", "ncV:256:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_scalefph512_mask, "V32xV32xV32xV32xUiIi", "ncV:512:", "avx512fp16,evex512")
-
-TARGET_BUILTIN(__builtin_ia32_rndscaleph_128_mask, "V8xV8xIiV8xUc", "ncV:128:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_rndscaleph_256_mask, "V16xV16xIiV16xUs", "ncV:256:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_rndscaleph_mask, "V32xV32xIiV32xUiIi", "ncV:512:", "avx512fp16,evex512")
-TARGET_BUILTIN(__builtin_ia32_reduceph128_mask, "V8xV8xIiV8xUc", "ncV:128:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_reduceph256_mask, "V16xV16xIiV16xUs", "ncV:256:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_reduceph512_mask, "V32xV32xIiV32xUiIi", "ncV:512:", "avx512fp16,evex512")
-TARGET_BUILTIN(__builtin_ia32_rcpsh_mask, "V8xV8xV8xV8xUc", "ncV:128:", "avx512fp16")
-TARGET_BUILTIN(__builtin_ia32_rsqrtsh_mask, "V8xV8xV8xV8xUc", "ncV:128:", "avx512fp16")
-TARGET_BUILTIN(__builtin_ia32_getmantsh_round_mask, "V8xV8xV8xIiV8xUcIi", "ncV:128:", "avx512fp16")
-TARGET_BUILTIN(__builtin_ia32_getexpsh128_round_mask, "V8xV8xV8xV8xUcIi", "ncV:128:", "avx512fp16")
-TARGET_BUILTIN(__builtin_ia32_scalefsh_round_mask, "V8xV8xV8xV8xUcIi", "ncV:128:", "avx512fp16")
-TARGET_BUILTIN(__builtin_ia32_rndscalesh_round_mask, "V8xV8xV8xV8xUcIiIi", "ncV:128:", "avx512fp16")
-TARGET_BUILTIN(__builtin_ia32_reducesh_mask, "V8xV8xV8xV8xUcIiIi", "ncV:128:", "avx512fp16")
-
-TARGET_BUILTIN(__builtin_ia32_sqrtph, "V8xV8x", "ncV:128:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_sqrtph256, "V16xV16x", "ncV:256:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_sqrtph512, "V32xV32xIi", "ncV:512:", "avx512fp16,evex512")
-TARGET_BUILTIN(__builtin_ia32_sqrtsh_round_mask, "V8xV8xV8xV8xUcIi", "ncV:128:", "avx512fp16")
-TARGET_BUILTIN(__builtin_ia32_fpclassph128_mask, "UcV8xIiUc", "ncV:128:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_fpclassph256_mask, "UsV16xIiUs", "ncV:256:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_fpclassph512_mask, "UiV32xIiUi", "ncV:512:", "avx512fp16,evex512")
-TARGET_BUILTIN(__builtin_ia32_fpclasssh_mask, "UcV8xIiUc", "ncV:128:", "avx512fp16")
-
-TARGET_BUILTIN(__builtin_ia32_vcvtpd2ph128_mask, "V8xV2dV8xUc", "ncV:128:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vcvtpd2ph256_mask, "V8xV4dV8xUc", "ncV:256:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vcvtpd2ph512_mask, "V8xV8dV8xUcIi", "ncV:512:", "avx512fp16,evex512")
-TARGET_BUILTIN(__builtin_ia32_vcvtph2pd128_mask, "V2dV8xV2dUc", "ncV:128:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vcvtph2pd256_mask, "V4dV8xV4dUc", "ncV:256:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vcvtph2pd512_mask, "V8dV8xV8dUcIi", "ncV:512:", "avx512fp16,evex512")
-TARGET_BUILTIN(__builtin_ia32_vcvtsh2ss_round_mask, "V4fV4fV8xV4fUcIi", "ncV:128:", "avx512fp16")
-TARGET_BUILTIN(__builtin_ia32_vcvtss2sh_round_mask, "V8xV8xV4fV8xUcIi", "ncV:128:", "avx512fp16")
-TARGET_BUILTIN(__builtin_ia32_vcvtsd2sh_round_mask, "V8xV8xV2dV8xUcIi", "ncV:128:", "avx512fp16")
-TARGET_BUILTIN(__builtin_ia32_vcvtsh2sd_round_mask, "V2dV2dV8xV2dUcIi", "ncV:128:", "avx512fp16")
-TARGET_BUILTIN(__builtin_ia32_vcvtph2w128_mask, "V8sV8xV8sUc", "ncV:128:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vcvtph2w256_mask, "V16sV16xV16sUs", "ncV:256:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vcvtph2w512_mask, "V32sV32xV32sUiIi", "ncV:512:", "avx512fp16,evex512")
-TARGET_BUILTIN(__builtin_ia32_vcvttph2w128_mask, "V8sV8xV8sUc", "ncV:128:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vcvttph2w256_mask, "V16sV16xV16sUs", "ncV:256:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vcvttph2w512_mask, "V32sV32xV32sUiIi", "ncV:512:", "avx512fp16,evex512")
-TARGET_BUILTIN(__builtin_ia32_vcvtw2ph128_mask, "V8xV8sV8xUc", "ncV:128:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vcvtw2ph256_mask, "V16xV16sV16xUs", "ncV:256:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vcvtw2ph512_mask, "V32xV32sV32xUiIi", "ncV:512:", "avx512fp16,evex512")
-TARGET_BUILTIN(__builtin_ia32_vcvtph2uw128_mask, "V8UsV8xV8UsUc", "ncV:128:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vcvtph2uw256_mask, "V16UsV16xV16UsUs", "ncV:256:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vcvtph2uw512_mask, "V32UsV32xV32UsUiIi", "ncV:512:", "avx512fp16,evex512")
-TARGET_BUILTIN(__builtin_ia32_vcvttph2uw128_mask, "V8UsV8xV8UsUc", "ncV:128:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vcvttph2uw256_mask, "V16UsV16xV16UsUs", "ncV:256:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vcvttph2uw512_mask, "V32UsV32xV32UsUiIi", "ncV:512:", "avx512fp16,evex512")
-TARGET_BUILTIN(__builtin_ia32_vcvtuw2ph128_mask, "V8xV8UsV8xUc", "ncV:128:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vcvtuw2ph256_mask, "V16xV16UsV16xUs", "ncV:256:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vcvtuw2ph512_mask, "V32xV32UsV32xUiIi", "ncV:512:", "avx512fp16,evex512")
-TARGET_BUILTIN(__builtin_ia32_vcvtph2dq128_mask, "V4iV8xV4iUc", "ncV:128:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vcvtph2dq256_mask, "V8iV8xV8iUc", "ncV:256:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vcvtph2dq512_mask, "V16iV16xV16iUsIi", "ncV:512:", "avx512fp16,evex512")
-TARGET_BUILTIN(__builtin_ia32_vcvtph2udq128_mask, "V4UiV8xV4UiUc", "ncV:128:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vcvtph2udq256_mask, "V8UiV8xV8UiUc", "ncV:256:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vcvtph2udq512_mask, "V16UiV16xV16UiUsIi", "ncV:512:", "avx512fp16,evex512")
-TARGET_BUILTIN(__builtin_ia32_vcvtdq2ph128_mask, "V8xV4iV8xUc", "ncV:128:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vcvtdq2ph256_mask, "V8xV8iV8xUc", "ncV:256:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vcvtdq2ph512_mask, "V16xV16iV16xUsIi", "ncV:512:", "avx512fp16,evex512")
-TARGET_BUILTIN(__builtin_ia32_vcvtudq2ph128_mask, "V8xV4UiV8xUc", "ncV:128:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vcvtudq2ph256_mask, "V8xV8UiV8xUc", "ncV:256:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vcvtudq2ph512_mask, "V16xV16UiV16xUsIi", "ncV:512:", "avx512fp16,evex512")
-TARGET_BUILTIN(__builtin_ia32_vcvttph2dq128_mask, "V4iV8xV4iUc", "ncV:128:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vcvttph2dq256_mask, "V8iV8xV8iUc", "ncV:256:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vcvttph2dq512_mask, "V16iV16xV16iUsIi", "ncV:512:", "avx512fp16,evex512")
-TARGET_BUILTIN(__builtin_ia32_vcvttph2udq128_mask, "V4UiV8xV4UiUc", "ncV:128:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vcvttph2udq256_mask, "V8UiV8xV8UiUc", "ncV:256:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vcvttph2udq512_mask, "V16UiV16xV16UiUsIi", "ncV:512:", "avx512fp16,evex512")
-TARGET_BUILTIN(__builtin_ia32_vcvtqq2ph128_mask, "V8xV2OiV8xUc", "ncV:128:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vcvtqq2ph256_mask, "V8xV4OiV8xUc", "ncV:256:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vcvtqq2ph512_mask, "V8xV8OiV8xUcIi", "ncV:512:", "avx512fp16,evex512")
-TARGET_BUILTIN(__builtin_ia32_vcvtph2qq128_mask, "V2OiV8xV2OiUc", "ncV:128:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vcvtph2qq256_mask, "V4OiV8xV4OiUc", "ncV:256:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vcvtph2qq512_mask, "V8OiV8xV8OiUcIi", "ncV:512:", "avx512fp16,evex512")
-TARGET_BUILTIN(__builtin_ia32_vcvtuqq2ph128_mask, "V8xV2UOiV8xUc", "ncV:128:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vcvtuqq2ph256_mask, "V8xV4UOiV8xUc", "ncV:256:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vcvtuqq2ph512_mask, "V8xV8UOiV8xUcIi", "ncV:512:", "avx512fp16,evex512")
-TARGET_BUILTIN(__builtin_ia32_vcvtph2uqq128_mask, "V2UOiV8xV2UOiUc", "ncV:128:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vcvtph2uqq256_mask, "V4UOiV8xV4UOiUc", "ncV:256:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vcvtph2uqq512_mask, "V8UOiV8xV8UOiUcIi", "ncV:512:", "avx512fp16,evex512")
-TARGET_BUILTIN(__builtin_ia32_vcvttph2qq128_mask, "V2OiV8xV2OiUc", "ncV:128:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vcvttph2qq256_mask, "V4OiV8xV4OiUc", "ncV:256:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vcvttph2qq512_mask, "V8OiV8xV8OiUcIi", "ncV:512:", "avx512fp16,evex512")
-TARGET_BUILTIN(__builtin_ia32_vcvttph2uqq128_mask, "V2UOiV8xV2UOiUc", "ncV:128:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vcvttph2uqq256_mask, "V4UOiV8xV4UOiUc", "ncV:256:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vcvttph2uqq512_mask, "V8UOiV8xV8UOiUcIi", "ncV:512:", "avx512fp16,evex512")
-TARGET_BUILTIN(__builtin_ia32_vcvtsh2si32, "iV8xIi", "ncV:128:", "avx512fp16")
-TARGET_BUILTIN(__builtin_ia32_vcvtsh2usi32, "UiV8xIi", "ncV:128:", "avx512fp16")
-TARGET_BUILTIN(__builtin_ia32_vcvtusi2sh, "V8xV8xUiIi", "ncV:128:", "avx512fp16")
-TARGET_BUILTIN(__builtin_ia32_vcvtsi2sh, "V8xV8xiIi", "ncV:128:", "avx512fp16")
-TARGET_BUILTIN(__builtin_ia32_vcvttsh2si32, "iV8xIi", "ncV:128:", "avx512fp16")
-TARGET_BUILTIN(__builtin_ia32_vcvttsh2usi32, "UiV8xIi", "ncV:128:", "avx512fp16")
-
-TARGET_BUILTIN(__builtin_ia32_vcvtph2psx128_mask, "V4fV8xV4fUc", "ncV:128:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vcvtph2psx256_mask, "V8fV8xV8fUc", "ncV:256:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vcvtph2psx512_mask, "V16fV16xV16fUsIi", "ncV:512:", "avx512fp16,evex512")
-TARGET_BUILTIN(__builtin_ia32_vcvtps2phx128_mask, "V8xV4fV8xUc", "ncV:128:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vcvtps2phx256_mask, "V8xV8fV8xUc", "ncV:256:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vcvtps2phx512_mask, "V16xV16fV16xUsIi", "ncV:512:", "avx512fp16,evex512")
-
-TARGET_BUILTIN(__builtin_ia32_vfmaddph, "V8xV8xV8xV8x", "ncV:128:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vfmaddph256, "V16xV16xV16xV16x", "ncV:256:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vfmaddph512_mask,  "V32xV32xV32xV32xUiIi", "ncV:512:", "avx512fp16,evex512")
-TARGET_BUILTIN(__builtin_ia32_vfmaddph512_mask3, "V32xV32xV32xV32xUiIi", "ncV:512:", "avx512fp16,evex512")
-TARGET_BUILTIN(__builtin_ia32_vfmaddph512_maskz, "V32xV32xV32xV32xUiIi", "ncV:512:", "avx512fp16,evex512")
-TARGET_BUILTIN(__builtin_ia32_vfmaddsubph, "V8xV8xV8xV8x", "ncV:128:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vfmaddsubph256, "V16xV16xV16xV16x", "ncV:256:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vfmaddsubph512_mask, "V32xV32xV32xV32xUiIi", "ncV:512:", "avx512fp16,evex512")
-TARGET_BUILTIN(__builtin_ia32_vfmaddsubph512_maskz, "V32xV32xV32xV32xUiIi", "ncV:512:", "avx512fp16,evex512")
-TARGET_BUILTIN(__builtin_ia32_vfmaddsubph512_mask3, "V32xV32xV32xV32xUiIi", "ncV:512:", "avx512fp16,evex512")
-
-TARGET_BUILTIN(__builtin_ia32_vfmsubaddph512_mask3, "V32xV32xV32xV32xUiIi", "ncV:512:", "avx512fp16,evex512")
-TARGET_BUILTIN(__builtin_ia32_vfmsubph512_mask3, "V32xV32xV32xV32xUiIi", "ncV:512:", "avx512fp16,evex512")
-
-TARGET_BUILTIN(__builtin_ia32_vfmaddsh3_mask, "V8xV8xV8xV8xUcIi", "ncV:128:", "avx512fp16")
-TARGET_BUILTIN(__builtin_ia32_vfmaddsh3_maskz, "V8xV8xV8xV8xUcIi", "ncV:128:", "avx512fp16")
-TARGET_BUILTIN(__builtin_ia32_vfmaddsh3_mask3, "V8xV8xV8xV8xUcIi", "ncV:128:", "avx512fp16")
-TARGET_BUILTIN(__builtin_ia32_vfmsubsh3_mask3, "V8xV8xV8xV8xUcIi", "ncV:128:", "avx512fp16")
-
-TARGET_BUILTIN(__builtin_ia32_vfmaddcph128_mask,  "V4fV4fV4fV4fUc", "ncV:128:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vfmaddcph128_maskz,  "V4fV4fV4fV4fUc", "ncV:128:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vfmaddcph256_mask,  "V8fV8fV8fV8fUc", "ncV:256:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vfmaddcph256_maskz,  "V8fV8fV8fV8fUc", "ncV:256:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vfmaddcph512_mask,  "V16fV16fV16fV16fUsIi", "ncV:512:", "avx512fp16,evex512")
-TARGET_BUILTIN(__builtin_ia32_vfmaddcph512_maskz,  "V16fV16fV16fV16fUsIi", "ncV:512:", "avx512fp16,evex512")
-TARGET_BUILTIN(__builtin_ia32_vfmaddcph512_mask3,  "V16fV16fV16fV16fUsIi", "ncV:512:", "avx512fp16,evex512")
-TARGET_BUILTIN(__builtin_ia32_vfcmaddcph128_mask,  "V4fV4fV4fV4fUc", "ncV:128:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vfcmaddcph128_maskz,  "V4fV4fV4fV4fUc", "ncV:128:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vfcmaddcph256_mask,  "V8fV8fV8fV8fUc", "ncV:256:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vfcmaddcph256_maskz,  "V8fV8fV8fV8fUc", "ncV:256:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vfcmaddcph512_mask,  "V16fV16fV16fV16fUsIi", "ncV:512:", "avx512fp16,evex512")
-TARGET_BUILTIN(__builtin_ia32_vfcmaddcph512_maskz,  "V16fV16fV16fV16fUsIi", "ncV:512:", "avx512fp16,evex512")
-TARGET_BUILTIN(__builtin_ia32_vfcmaddcph512_mask3,  "V16fV16fV16fV16fUsIi", "ncV:512:", "avx512fp16,evex512")
-TARGET_BUILTIN(__builtin_ia32_vfmaddcsh_mask,   "V4fV4fV4fV4fUcIi", "ncV:128:", "avx512fp16")
-TARGET_BUILTIN(__builtin_ia32_vfmaddcsh_maskz,   "V4fV4fV4fV4fUcIi", "ncV:128:", "avx512fp16")
-TARGET_BUILTIN(__builtin_ia32_vfcmaddcsh_mask,  "V4fV4fV4fV4fUcIi", "ncV:128:", "avx512fp16")
-TARGET_BUILTIN(__builtin_ia32_vfcmaddcsh_maskz,  "V4fV4fV4fV4fUcIi", "ncV:128:", "avx512fp16")
-TARGET_BUILTIN(__builtin_ia32_vfmaddcsh_round_mask,  "V4fV4fV4fV4fUcIi", "ncV:128:", "avx512fp16")
-TARGET_BUILTIN(__builtin_ia32_vfmaddcsh_round_mask3,  "V4fV4fV4fV4fUcIi", "ncV:128:", "avx512fp16")
-TARGET_BUILTIN(__builtin_ia32_vfcmaddcsh_round_mask,  "V4fV4fV4fV4fUcIi", "ncV:128:", "avx512fp16")
-TARGET_BUILTIN(__builtin_ia32_vfcmaddcsh_round_mask3,  "V4fV4fV4fV4fUcIi", "ncV:128:", "avx512fp16")
-
-TARGET_BUILTIN(__builtin_ia32_vfmulcsh_mask,   "V4fV4fV4fV4fUcIi", "ncV:128:", "avx512fp16")
-TARGET_BUILTIN(__builtin_ia32_vfcmulcsh_mask,  "V4fV4fV4fV4fUcIi", "ncV:128:", "avx512fp16")
-TARGET_BUILTIN(__builtin_ia32_vfmulcph128_mask,  "V4fV4fV4fV4fUc", "ncV:128:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vfmulcph256_mask,  "V8fV8fV8fV8fUc", "ncV:256:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vfmulcph512_mask,  "V16fV16fV16fV16fUsIi", "ncV:512:", "avx512fp16,evex512")
-TARGET_BUILTIN(__builtin_ia32_vfcmulcph128_mask,  "V4fV4fV4fV4fUc", "ncV:128:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vfcmulcph256_mask,  "V8fV8fV8fV8fUc", "ncV:256:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vfcmulcph512_mask,  "V16fV16fV16fV16fUsIi", "ncV:512:", "avx512fp16,evex512")
-
-// generic select intrinsics
-TARGET_BUILTIN(__builtin_ia32_selectb_128, "V16cUsV16cV16c", "ncV:128:", "avx512bw,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_selectb_256, "V32cUiV32cV32c", "ncV:256:", "avx512bw,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_selectb_512, "V64cUOiV64cV64c", "ncV:512:", "avx512bw,evex512")
-TARGET_BUILTIN(__builtin_ia32_selectw_128, "V8sUcV8sV8s", "ncV:128:", "avx512bw,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_selectw_256, "V16sUsV16sV16s", "ncV:256:", "avx512bw,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_selectw_512, "V32sUiV32sV32s", "ncV:512:", "avx512bw,evex512")
-TARGET_BUILTIN(__builtin_ia32_selectd_128, "V4iUcV4iV4i", "ncV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_selectd_256, "V8iUcV8iV8i", "ncV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_selectd_512, "V16iUsV16iV16i", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_selectph_128, "V8xUcV8xV8x", "ncV:128:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_selectph_256, "V16xUsV16xV16x", "ncV:256:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_selectph_512, "V32xUiV32xV32x", "ncV:512:", "avx512fp16,evex512")
-TARGET_BUILTIN(__builtin_ia32_selectpbf_128, "V8yUcV8yV8y", "ncV:128:", "avx512bf16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_selectpbf_256, "V16yUsV16yV16y", "ncV:256:", "avx512bf16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_selectpbf_512, "V32yUiV32yV32y", "ncV:512:", "avx512bf16,evex512")
-TARGET_BUILTIN(__builtin_ia32_selectq_128, "V2OiUcV2OiV2Oi", "ncV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_selectq_256, "V4OiUcV4OiV4Oi", "ncV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_selectq_512, "V8OiUcV8OiV8Oi", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_selectps_128, "V4fUcV4fV4f", "ncV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_selectps_256, "V8fUcV8fV8f", "ncV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_selectps_512, "V16fUsV16fV16f", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_selectpd_128, "V2dUcV2dV2d", "ncV:128:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_selectpd_256, "V4dUcV4dV4d", "ncV:256:", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_selectpd_512, "V8dUcV8dV8d", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_selectsh_128, "V8xUcV8xV8x", "ncV:128:", "avx512fp16")
-TARGET_BUILTIN(__builtin_ia32_selectsbf_128, "V8yUcV8yV8y", "ncV:128:", "avx512bf16")
-TARGET_BUILTIN(__builtin_ia32_selectss_128, "V4fUcV4fV4f", "ncV:128:", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_selectsd_128, "V2dUcV2dV2d", "ncV:128:", "avx512f")
-
-// generic reduction intrinsics
-TARGET_BUILTIN(__builtin_ia32_reduce_fadd_pd512, "ddV8d", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_reduce_fadd_ps512, "ffV16f", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_reduce_fadd_ph512, "xxV32x", "ncV:512:", "avx512fp16,evex512")
-TARGET_BUILTIN(__builtin_ia32_reduce_fadd_ph256, "xxV16x", "ncV:256:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_reduce_fadd_ph128, "xxV8x", "ncV:128:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_reduce_fmax_pd512, "dV8d", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_reduce_fmax_ps512, "fV16f", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_reduce_fmax_ph512, "xV32x", "ncV:512:", "avx512fp16,evex512")
-TARGET_BUILTIN(__builtin_ia32_reduce_fmax_ph256, "xV16x", "ncV:256:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_reduce_fmax_ph128, "xV8x", "ncV:128:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_reduce_fmin_pd512, "dV8d", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_reduce_fmin_ps512, "fV16f", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_reduce_fmin_ph512, "xV32x", "ncV:512:", "avx512fp16,evex512")
-TARGET_BUILTIN(__builtin_ia32_reduce_fmin_ph256, "xV16x", "ncV:256:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_reduce_fmin_ph128, "xV8x", "ncV:128:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_reduce_fmul_pd512, "ddV8d", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_reduce_fmul_ps512, "ffV16f", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_reduce_fmul_ph512, "xxV32x", "ncV:512:", "avx512fp16,evex512")
-TARGET_BUILTIN(__builtin_ia32_reduce_fmul_ph256, "xxV16x", "ncV:256:", "avx512fp16,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_reduce_fmul_ph128, "xxV8x", "ncV:128:", "avx512fp16,avx512vl")
-
-// MONITORX/MWAITX
-TARGET_BUILTIN(__builtin_ia32_monitorx, "vvC*UiUi", "n", "mwaitx")
-TARGET_BUILTIN(__builtin_ia32_mwaitx, "vUiUiUi", "n", "mwaitx")
-
-// WAITPKG
-TARGET_BUILTIN(__builtin_ia32_umonitor, "vvC*", "n", "waitpkg")
-TARGET_BUILTIN(__builtin_ia32_umwait, "UcUiUiUi", "n", "waitpkg")
-TARGET_BUILTIN(__builtin_ia32_tpause, "UcUiUiUi", "n", "waitpkg")
-
-// CLZERO
-TARGET_BUILTIN(__builtin_ia32_clzero, "vv*", "n", "clzero")
-
-// CLDEMOTE
-TARGET_BUILTIN(__builtin_ia32_cldemote, "vvC*", "n", "cldemote")
-
-// Direct Move
-TARGET_BUILTIN(__builtin_ia32_directstore_u32, "vUi*Ui", "n", "movdiri")
-TARGET_BUILTIN(__builtin_ia32_movdir64b, "vv*vC*", "n", "movdir64b")
-
-// PTWRITE
-TARGET_BUILTIN(__builtin_ia32_ptwrite32, "vUi", "n", "ptwrite")
-
-// INVPCID
-TARGET_BUILTIN(__builtin_ia32_invpcid, "vUiv*", "nc", "invpcid")
-
-// ENQCMD
-TARGET_BUILTIN(__builtin_ia32_enqcmd, "Ucv*vC*", "n", "enqcmd")
-TARGET_BUILTIN(__builtin_ia32_enqcmds, "Ucv*vC*", "n", "enqcmd")
-
-// KEY LOCKER
-TARGET_BUILTIN(__builtin_ia32_loadiwkey, "vV2OiV2OiV2OiUi", "nV:128:", "kl")
-TARGET_BUILTIN(__builtin_ia32_encodekey128_u32, "UiUiV2Oiv*", "nV:128:", "kl")
-TARGET_BUILTIN(__builtin_ia32_encodekey256_u32, "UiUiV2OiV2Oiv*", "nV:128:", "kl")
-TARGET_BUILTIN(__builtin_ia32_aesenc128kl_u8, "UcV2Oi*V2OivC*", "nV:128:", "kl")
-TARGET_BUILTIN(__builtin_ia32_aesenc256kl_u8, "UcV2Oi*V2OivC*", "nV:128:", "kl")
-TARGET_BUILTIN(__builtin_ia32_aesdec128kl_u8, "UcV2Oi*V2OivC*", "nV:128:", "kl")
-TARGET_BUILTIN(__builtin_ia32_aesdec256kl_u8, "UcV2Oi*V2OivC*", "nV:128:", "kl")
-TARGET_BUILTIN(__builtin_ia32_aesencwide128kl_u8, "UcV2Oi*V2OiC*vC*", "nV:128:", "kl,widekl")
-TARGET_BUILTIN(__builtin_ia32_aesencwide256kl_u8, "UcV2Oi*V2OiC*vC*", "nV:128:", "kl,widekl")
-TARGET_BUILTIN(__builtin_ia32_aesdecwide128kl_u8, "UcV2Oi*V2OiC*vC*", "nV:128:", "kl,widekl")
-TARGET_BUILTIN(__builtin_ia32_aesdecwide256kl_u8, "UcV2Oi*V2OiC*vC*", "nV:128:", "kl,widekl")
-
-// SERIALIZE
-TARGET_BUILTIN(__builtin_ia32_serialize, "v", "n", "serialize")
-
-// TSXLDTRK
-TARGET_BUILTIN(__builtin_ia32_xsusldtrk, "v", "n", "tsxldtrk")
-TARGET_BUILTIN(__builtin_ia32_xresldtrk, "v", "n", "tsxldtrk")
-
-// RAO-INT
-TARGET_BUILTIN(__builtin_ia32_aadd32, "vv*Si", "n", "raoint")
-TARGET_BUILTIN(__builtin_ia32_aand32, "vv*Si", "n", "raoint")
-TARGET_BUILTIN(__builtin_ia32_aor32, "vv*Si", "n", "raoint")
-TARGET_BUILTIN(__builtin_ia32_axor32, "vv*Si", "n", "raoint")
-
-// MSVC
-TARGET_HEADER_BUILTIN(_BitScanForward, "UcUNi*UNi", "nh", INTRIN_H, ALL_MS_LANGUAGES, "")
-TARGET_HEADER_BUILTIN(_BitScanReverse, "UcUNi*UNi", "nh", INTRIN_H, ALL_MS_LANGUAGES, "")
-
-TARGET_HEADER_BUILTIN(_ReadWriteBarrier, "v", "nh", INTRIN_H, ALL_MS_LANGUAGES, "")
-TARGET_HEADER_BUILTIN(_ReadBarrier,      "v", "nh", INTRIN_H, ALL_MS_LANGUAGES, "")
-TARGET_HEADER_BUILTIN(_WriteBarrier,     "v", "nh", INTRIN_H, ALL_MS_LANGUAGES, "")
-
-TARGET_HEADER_BUILTIN(__cpuid,   "vi*i",  "nh", INTRIN_H, ALL_MS_LANGUAGES, "")
-TARGET_HEADER_BUILTIN(__cpuidex, "vi*ii", "nh", INTRIN_H, ALL_MS_LANGUAGES, "")
-
-TARGET_HEADER_BUILTIN(__emul,  "LLiii",    "nch", INTRIN_H, ALL_MS_LANGUAGES, "")
-TARGET_HEADER_BUILTIN(__emulu, "ULLiUiUi", "nch", INTRIN_H, ALL_MS_LANGUAGES, "")
-
-TARGET_HEADER_BUILTIN(_AddressOfReturnAddress, "v*", "nh", INTRIN_H, ALL_MS_LANGUAGES, "")
-
-TARGET_HEADER_BUILTIN(__stosb, "vUc*Ucz", "nh", INTRIN_H, ALL_MS_LANGUAGES, "")
-TARGET_HEADER_BUILTIN(__int2c, "v",       "nhr", INTRIN_H, ALL_MS_LANGUAGES, "")
-TARGET_HEADER_BUILTIN(__ud2,   "v",       "nhr", INTRIN_H, ALL_MS_LANGUAGES, "")
-
-TARGET_HEADER_BUILTIN(__readfsbyte,  "UcUNi", "nh", INTRIN_H, ALL_MS_LANGUAGES, "")
-TARGET_HEADER_BUILTIN(__readfsword,  "UsUNi", "nh", INTRIN_H, ALL_MS_LANGUAGES, "")
-TARGET_HEADER_BUILTIN(__readfsdword, "UNiUNi", "nh", INTRIN_H, ALL_MS_LANGUAGES, "")
-TARGET_HEADER_BUILTIN(__readfsqword, "ULLiUNi", "nh", INTRIN_H, ALL_MS_LANGUAGES, "")
-
-TARGET_HEADER_BUILTIN(__readgsbyte,  "UcUNi", "nh", INTRIN_H, ALL_MS_LANGUAGES, "")
-TARGET_HEADER_BUILTIN(__readgsword,  "UsUNi", "nh", INTRIN_H, ALL_MS_LANGUAGES, "")
-TARGET_HEADER_BUILTIN(__readgsdword, "UNiUNi", "nh", INTRIN_H, ALL_MS_LANGUAGES, "")
-TARGET_HEADER_BUILTIN(__readgsqword, "ULLiUNi", "nh", INTRIN_H, ALL_MS_LANGUAGES, "")
-
-// AVX10.2 VNNI FP16
-TARGET_BUILTIN(__builtin_ia32_vdpphps128, "V4fV4fV8xV8x", "ncV:128:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vdpphps256, "V8fV8fV16xV16x", "ncV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vdpphps512, "V16fV16fV32xV32x", "ncV:512:", "avx10.2-512")
-
-// AVX10.2 VNNI INT8
-TARGET_BUILTIN(__builtin_ia32_vpdpbssd512, "V16iV16iV16iV16i", "ncV:512:", "avx10.2-512")
-TARGET_BUILTIN(__builtin_ia32_vpdpbssds512, "V16iV16iV16iV16i", "ncV:512:", "avx10.2-512")
-TARGET_BUILTIN(__builtin_ia32_vpdpbsud512, "V16iV16iV16iV16i", "ncV:512:", "avx10.2-512")
-TARGET_BUILTIN(__builtin_ia32_vpdpbsuds512, "V16iV16iV16iV16i", "ncV:512:", "avx10.2-512")
-TARGET_BUILTIN(__builtin_ia32_vpdpbuud512, "V16iV16iV16iV16i", "ncV:512:", "avx10.2-512")
-TARGET_BUILTIN(__builtin_ia32_vpdpbuuds512, "V16iV16iV16iV16i", "ncV:512:", "avx10.2-512")
-
-// AVX10.2 VNNI INT16
-TARGET_BUILTIN(__builtin_ia32_vpdpwsud512, "V16iV16iV16iV16i", "nV:512:", "avx10.2-512")
-TARGET_BUILTIN(__builtin_ia32_vpdpwsuds512, "V16iV16iV16iV16i", "nV:512:", "avx10.2-512")
-TARGET_BUILTIN(__builtin_ia32_vpdpwusd512, "V16iV16iV16iV16i", "nV:512:", "avx10.2-512")
-TARGET_BUILTIN(__builtin_ia32_vpdpwusds512, "V16iV16iV16iV16i", "nV:512:", "avx10.2-512")
-TARGET_BUILTIN(__builtin_ia32_vpdpwuud512, "V16iV16iV16iV16i", "nV:512:", "avx10.2-512")
-TARGET_BUILTIN(__builtin_ia32_vpdpwuuds512, "V16iV16iV16iV16i", "nV:512:", "avx10.2-512")
-
-// AVX10.2 VMPSADBW
-TARGET_BUILTIN(__builtin_ia32_mpsadbw512, "V32sV64cV64cIc", "ncV:512:", "avx10.2-512")
-
-// AVX10.2 YMM Rounding
-TARGET_BUILTIN(__builtin_ia32_vaddpd256_round, "V4dV4dV4dIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vaddph256_round, "V16xV16xV16xIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vaddps256_round, "V8fV8fV8fIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcmppd256_round_mask, "UcV4dV4dIiUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcmpph256_round_mask, "UsV16xV16xIiUsIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcmpps256_round_mask, "UcV8fV8fIiUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvtdq2ph256_round_mask, "V8xV8iV8xUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvtdq2ps256_round_mask, "V8fV8iV8fUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvtpd2dq256_round_mask, "V4iV4dV4iUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvtpd2ph256_round_mask, "V8xV4dV8xUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvtpd2ps256_round_mask, "V4fV4dV4fUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvtpd2qq256_round_mask, "V4LLiV4dV4LLiUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvtpd2udq256_round_mask, "V4UiV4dV4UiUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvtpd2uqq256_round_mask, "V4ULLiV4dV4ULLiUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvtph2dq256_round_mask, "V8iV8xV8iUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvtph2pd256_round_mask, "V4dV8xV4dUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvtph2psx256_round_mask, "V8fV8xV8fUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvtph2qq256_round_mask, "V4LLiV8xV4LLiUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvtph2udq256_round_mask, "V8UiV8xV8UiUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvtph2uqq256_round_mask, "V4ULLiV8xV4ULLiUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvtph2uw256_round_mask, "V16UsV16xV16UsUsIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvtph2w256_round_mask, "V16sV16xV16sUsIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvtps2dq256_round_mask, "V8iV8fV8iUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvtps2pd256_round_mask, "V4dV4fV4dUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvtps2phx256_round_mask, "V8xV8fV8xUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvtps2qq256_round_mask, "V4LLiV4fV4LLiUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvtps2udq256_round_mask, "V8UiV8fV8UiUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvtps2uqq256_round_mask, "V4ULLiV4fV4ULLiUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvtqq2pd256_round_mask, "V4dV4LLiV4dUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvtqq2ph256_round_mask, "V8xV4LLiV8xUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvtqq2ps256_round_mask, "V4fV4LLiV4fUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvttpd2dq256_round_mask, "V4iV4dV4iUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvttpd2qq256_round_mask, "V4LLiV4dV4LLiUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvttpd2udq256_round_mask, "V4UiV4dV4UiUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvttpd2uqq256_round_mask, "V4ULLiV4dV4ULLiUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvttph2dq256_round_mask, "V8iV8xV8iUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvttph2qq256_round_mask, "V4LLiV8xV4LLiUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvttph2udq256_round_mask, "V8UiV8xV8UiUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvttph2uqq256_round_mask, "V4ULLiV8xV4ULLiUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvttph2uw256_round_mask, "V16UsV16xV16UsUsIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvttph2w256_round_mask, "V16sV16xV16sUsIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvttps2dq256_round_mask, "V8iV8fV8iUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvttps2qq256_round_mask, "V4LLiV4fV4LLiUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvttps2udq256_round_mask, "V8UiV8fV8UiUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvttps2uqq256_round_mask, "V4ULLiV4fV4ULLiUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvtudq2ph256_round_mask, "V8xV8UiV8xUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvtudq2ps256_round_mask, "V8fV8UiV8fUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvtuqq2pd256_round_mask, "V4dV4ULLiV4dUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvtuqq2ph256_round_mask, "V8xV4ULLiV8xUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvtuqq2ps256_round_mask, "V4fV4ULLiV4fUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvtuw2ph256_round_mask, "V16xV16UsV16xUsIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvtw2ph256_round_mask, "V16xV16sV16xUsIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vdivpd256_round, "V4dV4dV4dIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vdivph256_round, "V16xV16xV16xIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vdivps256_round, "V8fV8fV8fIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vfcmaddcph256_round_mask, "V8fV8fV8fV8fUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vfcmaddcph256_round_maskz, "V8fV8fV8fV8fUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vfcmaddcph256_round_mask3, "V8fV8fV8fV8fUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vfcmulcph256_round_mask, "V8fV8fV8fV8fUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vfixupimmpd256_round_mask, "V4dV4dV4dV4LLiIiUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vfixupimmpd256_round_maskz, "V4dV4dV4dV4LLiIiUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vfixupimmps256_round_mask, "V8fV8fV8fV8iIiUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vfixupimmps256_round_maskz, "V8fV8fV8fV8iIiUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vfmaddpd256_round_mask, "V4dV4dV4dV4dUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vfmaddpd256_round_maskz, "V4dV4dV4dV4dUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vfmaddpd256_round_mask3, "V4dV4dV4dV4dUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vfmaddph256_round_mask, "V16xV16xV16xV16xUsIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vfmaddph256_round_maskz, "V16xV16xV16xV16xUsIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vfmaddph256_round_mask3, "V16xV16xV16xV16xUsIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vfmaddps256_round_mask, "V8fV8fV8fV8fUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vfmaddps256_round_maskz, "V8fV8fV8fV8fUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vfmaddps256_round_mask3, "V8fV8fV8fV8fUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vfmaddcph256_round_mask, "V8fV8fV8fV8fUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vfmaddcph256_round_maskz, "V8fV8fV8fV8fUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vfmaddcph256_round_mask3, "V8fV8fV8fV8fUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vfmaddsubpd256_round_mask, "V4dV4dV4dV4dUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vfmaddsubpd256_round_maskz, "V4dV4dV4dV4dUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vfmaddsubpd256_round_mask3, "V4dV4dV4dV4dUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vfmaddsubph256_round_mask, "V16xV16xV16xV16xUsIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vfmaddsubph256_round_maskz, "V16xV16xV16xV16xUsIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vfmaddsubph256_round_mask3, "V16xV16xV16xV16xUsIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vfmaddsubps256_round_mask, "V8fV8fV8fV8fUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vfmaddsubps256_round_maskz, "V8fV8fV8fV8fUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vfmaddsubps256_round_mask3, "V8fV8fV8fV8fUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vfmsubpd256_round_mask3, "V4dV4dV4dV4dUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vfmsubph256_round_mask3, "V16xV16xV16xV16xUsIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vfmsubps256_round_mask3, "V8fV8fV8fV8fUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vfmsubaddpd256_round_mask3, "V4dV4dV4dV4dUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vfmsubaddph256_round_mask3, "V16xV16xV16xV16xUsIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vfmsubaddps256_round_mask3, "V8fV8fV8fV8fUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vfmulcph256_round_mask, "V8fV8fV8fV8fUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vgetexppd256_round_mask, "V4dV4dV4dUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vgetexpph256_round_mask, "V16xV16xV16xUsIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vgetexpps256_round_mask, "V8fV8fV8fUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vgetmantpd256_round_mask, "V4dV4dIiV4dUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vgetmantph256_round_mask, "V16xV16xIiV16xUsIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vgetmantps256_round_mask, "V8fV8fIiV8fUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vmaxpd256_round, "V4dV4dV4dIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vmaxph256_round, "V16xV16xV16xIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vmaxps256_round, "V8fV8fV8fIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vminpd256_round, "V4dV4dV4dIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vminph256_round, "V16xV16xV16xIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vminps256_round, "V8fV8fV8fIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vmulpd256_round, "V4dV4dV4dIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vmulph256_round, "V16xV16xV16xIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vmulps256_round, "V8fV8fV8fIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vrangepd256_round_mask, "V4dV4dV4dIiV4dUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vrangeps256_round_mask, "V8fV8fV8fIiV8fUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vreducepd256_round_mask, "V4dV4dIiV4dUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vreduceph256_round_mask, "V16xV16xIiV16xUsIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vreduceps256_round_mask, "V8fV8fIiV8fUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vrndscalepd256_round_mask, "V4dV4dIiV4dUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vrndscaleph256_round_mask, "V16xV16xIiV16xUsIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vrndscaleps256_round_mask, "V8fV8fIiV8fUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vscalefpd256_round_mask, "V4dV4dV4dV4dUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vscalefph256_round_mask, "V16xV16xV16xV16xUsIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vscalefps256_round_mask, "V8fV8fV8fV8fUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vsqrtpd256_round, "V4dV4dIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vsqrtph256_round, "V16xV16xIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vsqrtps256_round, "V8fV8fIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vsubpd256_round, "V4dV4dV4dIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vsubph256_round, "V16xV16xV16xIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vsubps256_round, "V8fV8fV8fIi", "nV:256:", "avx10.2-256")
-
-// AVX-VNNI-INT16
-TARGET_BUILTIN(__builtin_ia32_vpdpwsud128, "V4iV4iV4iV4i", "nV:128:", "avxvnniint16|avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vpdpwsud256, "V8iV8iV8iV8i", "nV:256:", "avxvnniint16|avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vpdpwsuds128, "V4iV4iV4iV4i", "nV:128:", "avxvnniint16|avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vpdpwsuds256, "V8iV8iV8iV8i", "nV:256:", "avxvnniint16|avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vpdpwusd128, "V4iV4iV4iV4i", "nV:128:", "avxvnniint16|avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vpdpwusd256, "V8iV8iV8iV8i", "nV:256:", "avxvnniint16|avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vpdpwusds128, "V4iV4iV4iV4i", "nV:128:", "avxvnniint16|avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vpdpwusds256, "V8iV8iV8iV8i", "nV:256:", "avxvnniint16|avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vpdpwuud128, "V4iV4iV4iV4i", "nV:128:", "avxvnniint16|avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vpdpwuud256, "V8iV8iV8iV8i", "nV:256:", "avxvnniint16|avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vpdpwuuds128, "V4iV4iV4iV4i", "nV:128:", "avxvnniint16|avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vpdpwuuds256, "V8iV8iV8iV8i", "nV:256:", "avxvnniint16|avx10.2-256")
-
-// AVX10.2 SATCVT-DS
-TARGET_BUILTIN(__builtin_ia32_vcvttsd2sis32, "iV2dIi", "ncV:128:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvttsd2usis32, "UiV2dIi", "ncV:128:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvttss2sis32, "iV4fIi", "ncV:128:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvttss2usis32, "UiV4fIi", "ncV:128:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvttpd2dqs128_mask, "V4iV2dV4iUc", "nV:128:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvttpd2dqs256_round_mask, "V4iV4dV4iUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvttpd2dqs512_round_mask, "V8iV8dV8iUcIi", "nV:512:", "avx10.2-512")
-TARGET_BUILTIN(__builtin_ia32_vcvttpd2udqs128_mask, "V4iV2dV4iUc", "nV:128:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvttpd2udqs256_round_mask, "V4iV4dV4iUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvttpd2udqs512_round_mask, "V8iV8dV8iUcIi", "nV:512:", "avx10.2-512")
-TARGET_BUILTIN(__builtin_ia32_vcvttpd2qqs128_mask,  "V2OiV2dV2OiUc", "nV:128:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvttpd2qqs256_round_mask, "V4OiV4dV4OiUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvttpd2qqs512_round_mask, "V8OiV8dV8OiUcIi", "nV:512:", "avx10.2-512")
-TARGET_BUILTIN(__builtin_ia32_vcvttpd2uqqs128_mask, "V2OiV2dV2OiUc", "nV:128:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvttpd2uqqs256_round_mask, "V4OiV4dV4OiUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvttpd2uqqs512_round_mask, "V8OiV8dV8OiUcIi", "nV:512:", "avx10.2-512")
-TARGET_BUILTIN(__builtin_ia32_vcvttps2dqs128_mask, "V4iV4fV4iUc", "nV:128:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvttps2dqs256_round_mask, "V8iV8fV8iUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvttps2dqs512_round_mask, "V16iV16fV16iUsIi", "nV:512:", "avx10.2-512")
-TARGET_BUILTIN(__builtin_ia32_vcvttps2udqs128_mask, "V4iV4fV4iUc", "nV:128:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvttps2udqs256_round_mask, "V8iV8fV8iUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvttps2udqs512_round_mask, "V16iV16fV16iUsIi", "nV:512:", "avx10.2-512")
-TARGET_BUILTIN(__builtin_ia32_vcvttps2qqs128_mask, "V2OiV4fV2OiUc", "nV:128:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvttps2qqs256_round_mask, "V4OiV4fV4OiUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvttps2qqs512_round_mask, "V8OiV8fV8OiUcIi", "nV:512:", "avx10.2-512")
-TARGET_BUILTIN(__builtin_ia32_vcvttps2uqqs128_mask, "V2OiV4fV2OiUc", "nV:128:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvttps2uqqs256_round_mask, "V4OiV4fV4OiUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvttps2uqqs512_round_mask, "V8OiV8fV8OiUcIi", "nV:512:", "avx10.2-512")
-
-// AVX-NE-CONVERT
-TARGET_BUILTIN(__builtin_ia32_vbcstnebf162ps128, "V4fyC*", "nV:128:", "avxneconvert")
-TARGET_BUILTIN(__builtin_ia32_vbcstnebf162ps256, "V8fyC*", "nV:256:", "avxneconvert")
-TARGET_BUILTIN(__builtin_ia32_vbcstnesh2ps128, "V4fxC*", "nV:128:", "avxneconvert")
-TARGET_BUILTIN(__builtin_ia32_vbcstnesh2ps256, "V8fxC*", "nV:256:", "avxneconvert")
-TARGET_BUILTIN(__builtin_ia32_vcvtneebf162ps128, "V4fV8yC*", "nV:128:", "avxneconvert")
-TARGET_BUILTIN(__builtin_ia32_vcvtneebf162ps256, "V8fV16yC*", "nV:256:", "avxneconvert")
-TARGET_BUILTIN(__builtin_ia32_vcvtneeph2ps128, "V4fV8xC*", "nV:128:", "avxneconvert")
-TARGET_BUILTIN(__builtin_ia32_vcvtneeph2ps256, "V8fV16xC*", "nV:256:", "avxneconvert")
-TARGET_BUILTIN(__builtin_ia32_vcvtneobf162ps128, "V4fV8yC*", "nV:128:", "avxneconvert")
-TARGET_BUILTIN(__builtin_ia32_vcvtneobf162ps256, "V8fV16yC*", "nV:256:", "avxneconvert")
-TARGET_BUILTIN(__builtin_ia32_vcvtneoph2ps128, "V4fV8xC*", "nV:128:", "avxneconvert")
-TARGET_BUILTIN(__builtin_ia32_vcvtneoph2ps256, "V8fV16xC*", "nV:256:", "avxneconvert")
-TARGET_BUILTIN(__builtin_ia32_vcvtneps2bf16128, "V8yV4f", "nV:128:", "avx512bf16,avx512vl|avxneconvert")
-TARGET_BUILTIN(__builtin_ia32_vcvtneps2bf16256, "V8yV8f", "nV:256:", "avx512bf16,avx512vl|avxneconvert")
-
-// SHA512
-TARGET_BUILTIN(__builtin_ia32_vsha512msg1, "V4ULLiV4ULLiV2ULLi", "nV:256:", "sha512")
-TARGET_BUILTIN(__builtin_ia32_vsha512msg2, "V4ULLiV4ULLiV4ULLi", "nV:256:", "sha512")
-TARGET_BUILTIN(__builtin_ia32_vsha512rnds2, "V4ULLiV4ULLiV4ULLiV2ULLi", "nV:256:", "sha512")
-
-TARGET_HEADER_BUILTIN(_InterlockedAnd64,         "WiWiD*Wi", "nh", INTRIN_H, ALL_MS_LANGUAGES, "")
-TARGET_HEADER_BUILTIN(_InterlockedDecrement64,   "WiWiD*",   "nh", INTRIN_H, ALL_MS_LANGUAGES, "")
-TARGET_HEADER_BUILTIN(_InterlockedExchange64,    "WiWiD*Wi", "nh", INTRIN_H, ALL_MS_LANGUAGES, "")
-TARGET_HEADER_BUILTIN(_InterlockedExchangeAdd64, "WiWiD*Wi", "nh", INTRIN_H, ALL_MS_LANGUAGES, "")
-TARGET_HEADER_BUILTIN(_InterlockedExchangeSub64, "WiWiD*Wi", "nh", INTRIN_H, ALL_MS_LANGUAGES, "")
-TARGET_HEADER_BUILTIN(_InterlockedIncrement64,   "WiWiD*",   "nh", INTRIN_H, ALL_MS_LANGUAGES, "")
-TARGET_HEADER_BUILTIN(_InterlockedOr64,          "WiWiD*Wi", "nh", INTRIN_H, ALL_MS_LANGUAGES, "")
-TARGET_HEADER_BUILTIN(_InterlockedXor64,         "WiWiD*Wi", "nh", INTRIN_H, ALL_MS_LANGUAGES, "")
-
-// SM3
-TARGET_BUILTIN(__builtin_ia32_vsm3msg1, "V4UiV4UiV4UiV4Ui", "nV:128:", "sm3")
-TARGET_BUILTIN(__builtin_ia32_vsm3msg2, "V4UiV4UiV4UiV4Ui", "nV:128:", "sm3")
-TARGET_BUILTIN(__builtin_ia32_vsm3rnds2, "V4UiV4UiV4UiV4UiIUi", "nV:128:", "sm3")
-
-// SM4
-TARGET_BUILTIN(__builtin_ia32_vsm4key4128, "V4UiV4UiV4Ui", "nV:128:", "sm4")
-TARGET_BUILTIN(__builtin_ia32_vsm4key4256, "V8UiV8UiV8Ui", "nV:256:", "sm4")
-TARGET_BUILTIN(__builtin_ia32_vsm4rnds4128, "V4UiV4UiV4Ui", "nV:128:", "sm4")
-TARGET_BUILTIN(__builtin_ia32_vsm4rnds4256, "V8UiV8UiV8Ui", "nV:256:", "sm4")
-
-// SM4_EVEX
-TARGET_BUILTIN(__builtin_ia32_vsm4key4512, "V16UiV16UiV16Ui", "nV:512:", "avx10.2-512,sm4")
-TARGET_BUILTIN(__builtin_ia32_vsm4rnds4512, "V16UiV16UiV16Ui", "nV:512:", "avx10.2-512,sm4")
-
-// AVX10 MINMAX
-TARGET_BUILTIN(__builtin_ia32_vminmaxnepbf16128, "V8yV8yV8yIi", "nV:128:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vminmaxnepbf16256, "V16yV16yV16yIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vminmaxnepbf16512, "V32yV32yV32yIi", "nV:512:", "avx10.2-512")
-TARGET_BUILTIN(__builtin_ia32_vminmaxpd128_mask, "V2dV2dV2dIiV2dUc", "nV:128:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vminmaxpd256_round_mask, "V4dV4dV4dIiV4dUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vminmaxpd512_round_mask, "V8dV8dV8dIiV8dUcIi", "nV:512:", "avx10.2-512")
-TARGET_BUILTIN(__builtin_ia32_vminmaxph128_mask, "V8xV8xV8xIiV8xUc", "nV:128:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vminmaxph256_round_mask, "V16xV16xV16xIiV16xUsIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vminmaxph512_round_mask, "V32xV32xV32xIiV32xUiIi", "nV:512:", "avx10.2-512")
-TARGET_BUILTIN(__builtin_ia32_vminmaxps128_mask, "V4fV4fV4fIiV4fUc", "nV:128:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vminmaxps256_round_mask, "V8fV8fV8fIiV8fUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vminmaxps512_round_mask, "V16fV16fV16fIiV16fUsIi", "nV:512:", "avx10.2-512")
-TARGET_BUILTIN(__builtin_ia32_vminmaxsd_round_mask, "V2dV2dV2dIiV2dUcIi", "nV:128:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vminmaxsh_round_mask, "V8xV8xV8xIiV8xUcIi", "nV:128:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vminmaxss_round_mask, "V4fV4fV4fIiV4fUcIi", "nV:128:", "avx10.2-256")
-
-// AVX10.2 SATCVT
-TARGET_BUILTIN(__builtin_ia32_vcvtnebf162ibs128, "V8UsV8y", "nV:128:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvtnebf162ibs256, "V16UsV16y", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvtnebf162ibs512, "V32UsV32y", "nV:512:", "avx10.2-512")
-TARGET_BUILTIN(__builtin_ia32_vcvtnebf162iubs128, "V8UsV8y", "nV:128:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvtnebf162iubs256, "V16UsV16y", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvtnebf162iubs512, "V32UsV32y", "nV:512:", "avx10.2-512")
-TARGET_BUILTIN(__builtin_ia32_vcvtph2ibs128_mask, "V8UsV8xV8UsUc", "nV:128:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvtph2ibs256_mask, "V16UsV16xV16UsUsIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvtph2ibs512_mask, "V32UsV32xV32UsUiIi", "nV:512:", "avx10.2-512")
-TARGET_BUILTIN(__builtin_ia32_vcvtph2iubs128_mask, "V8UsV8xV8UsUc", "nV:128:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvtph2iubs256_mask, "V16UsV16xV16UsUsIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvtph2iubs512_mask, "V32UsV32xV32UsUiIi", "nV:512:", "avx10.2-512")
-TARGET_BUILTIN(__builtin_ia32_vcvtps2ibs128_mask, "V4UiV4fV4UiUc", "nV:128:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvtps2ibs256_mask, "V8UiV8fV8UiUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvtps2ibs512_mask, "V16UiV16fV16UiUsIi", "nV:512:", "avx10.2-512")
-TARGET_BUILTIN(__builtin_ia32_vcvtps2iubs128_mask, "V4UiV4fV4UiUc", "nV:128:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvtps2iubs256_mask, "V8UiV8fV8UiUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvtps2iubs512_mask, "V16UiV16fV16UiUsIi", "nV:512:", "avx10.2-512")
-TARGET_BUILTIN(__builtin_ia32_vcvttnebf162ibs128, "V8UsV8y", "nV:128:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvttnebf162ibs256, "V16UsV16y", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvttnebf162ibs512, "V32UsV32y", "nV:512:", "avx10.2-512")
-TARGET_BUILTIN(__builtin_ia32_vcvttnebf162iubs128, "V8UsV8y", "nV:128:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvttnebf162iubs256, "V16UsV16y", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvttnebf162iubs512, "V32UsV32y", "nV:512:", "avx10.2-512")
-TARGET_BUILTIN(__builtin_ia32_vcvttph2ibs128_mask, "V8UsV8xV8UsUc", "nV:128:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvttph2ibs256_mask, "V16UsV16xV16UsUsIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvttph2ibs512_mask, "V32UsV32xV32UsUiIi", "nV:512:", "avx10.2-512")
-TARGET_BUILTIN(__builtin_ia32_vcvttph2iubs128_mask, "V8UsV8xV8UsUc", "nV:128:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvttph2iubs256_mask, "V16UsV16xV16UsUsIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvttph2iubs512_mask, "V32UsV32xV32UsUiIi", "nV:512:", "avx10.2-512")
-TARGET_BUILTIN(__builtin_ia32_vcvttps2ibs128_mask, "V4UiV4fV4UiUc", "nV:128:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvttps2ibs256_mask, "V8UiV8fV8UiUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvttps2ibs512_mask, "V16UiV16fV16UiUsIi", "nV:512:", "avx10.2-512")
-TARGET_BUILTIN(__builtin_ia32_vcvttps2iubs128_mask, "V4UiV4fV4UiUc", "nV:128:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvttps2iubs256_mask, "V8UiV8fV8UiUcIi", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvttps2iubs512_mask, "V16UiV16fV16UiUsIi", "nV:512:", "avx10.2-512")
-
-// AVX10.2 CONVERT
-TARGET_BUILTIN(__builtin_ia32_vcvt2ps2phx128_mask, "V8xV4fV4fV8xUc", "ncV:128:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvt2ps2phx256_mask, "V16xV8fV8fV16xUsIi", "ncV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvt2ps2phx512_mask, "V32xV16fV16fV32xUiIi", "ncV:512:", "avx10.2-512")
-TARGET_BUILTIN(__builtin_ia32_vcvtbiasph2bf8_128_mask, "V16cV16cV8xV16cUc", "nV:128:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvtbiasph2bf8_256_mask, "V16cV32cV16xV16cUs", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvtbiasph2bf8_512_mask, "V32cV64cV32xV32cUi", "nV:512:", "avx10.2-512")
-TARGET_BUILTIN(__builtin_ia32_vcvtbiasph2bf8s_128_mask, "V16cV16cV8xV16cUc", "nV:128:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvtbiasph2bf8s_256_mask, "V16cV32cV16xV16cUs", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvtbiasph2bf8s_512_mask, "V32cV64cV32xV32cUi", "nV:512:", "avx10.2-512")
-TARGET_BUILTIN(__builtin_ia32_vcvtbiasph2hf8_128_mask, "V16cV16cV8xV16cUc", "nV:128:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvtbiasph2hf8_256_mask, "V16cV32cV16xV16cUs", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvtbiasph2hf8_512_mask, "V32cV64cV32xV32cUi", "nV:512:", "avx10.2-512")
-TARGET_BUILTIN(__builtin_ia32_vcvtbiasph2hf8s_128_mask, "V16cV16cV8xV16cUc", "nV:128:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvtbiasph2hf8s_256_mask, "V16cV32cV16xV16cUs", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvtbiasph2hf8s_512_mask, "V32cV64cV32xV32cUi", "nV:512:", "avx10.2-512")
-TARGET_BUILTIN(__builtin_ia32_vcvtne2ph2bf8_128, "V16cV8xV8x", "nV:128:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvtne2ph2bf8_256, "V32cV16xV16x", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvtne2ph2bf8_512, "V64cV32xV32x", "nV:512:", "avx10.2-512")
-TARGET_BUILTIN(__builtin_ia32_vcvtne2ph2bf8s_128, "V16cV8xV8x", "nV:128:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvtne2ph2bf8s_256, "V32cV16xV16x", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvtne2ph2bf8s_512, "V64cV32xV32x", "nV:512:", "avx10.2-512")
-TARGET_BUILTIN(__builtin_ia32_vcvtne2ph2hf8_128, "V16cV8xV8x", "nV:128:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvtne2ph2hf8_256, "V32cV16xV16x", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvtne2ph2hf8_512, "V64cV32xV32x", "nV:512:", "avx10.2-512")
-TARGET_BUILTIN(__builtin_ia32_vcvtne2ph2hf8s_128, "V16cV8xV8x", "nV:128:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvtne2ph2hf8s_256, "V32cV16xV16x", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvtne2ph2hf8s_512, "V64cV32xV32x", "nV:512:", "avx10.2-512")
-TARGET_BUILTIN(__builtin_ia32_vcvthf8_2ph128_mask, "V8xV16cV8xUc", "nV:128:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvthf8_2ph256_mask, "V16xV16cV16xUs", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvthf8_2ph512_mask, "V32xV32cV32xUi", "nV:512:", "avx10.2-512")
-TARGET_BUILTIN(__builtin_ia32_vcvtneph2bf8_128_mask, "V16cV8xV16cUc", "nV:128:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvtneph2bf8_256_mask, "V16cV16xV16cUs", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvtneph2bf8_512_mask, "V32cV32xV32cUi", "nV:512:", "avx10.2-512")
-TARGET_BUILTIN(__builtin_ia32_vcvtneph2bf8s_128_mask, "V16cV8xV16cUc", "nV:128:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvtneph2bf8s_256_mask, "V16cV16xV16cUs", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvtneph2bf8s_512_mask, "V32cV32xV32cUi", "nV:512:", "avx10.2-512")
-TARGET_BUILTIN(__builtin_ia32_vcvtneph2hf8_128_mask, "V16cV8xV16cUc", "nV:128:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvtneph2hf8_256_mask, "V16cV16xV16cUs", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvtneph2hf8_512_mask, "V32cV32xV32cUi", "nV:512:", "avx10.2-512")
-TARGET_BUILTIN(__builtin_ia32_vcvtneph2hf8s_128_mask, "V16cV8xV16cUc", "nV:128:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvtneph2hf8s_256_mask, "V16cV16xV16cUs", "nV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvtneph2hf8s_512_mask, "V32cV32xV32cUi", "nV:512:", "avx10.2-512")
-
-// AVX10.2 BF16
-TARGET_BUILTIN(__builtin_ia32_loadsbf16128_mask, "V8yV8yC*V8yUc", "nV:128:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_storesbf16128_mask, "vV8y*V8yUc", "nV:128:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vaddnepbf16128, "V8yV8yV8y", "ncV:128:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vaddnepbf16256, "V16yV16yV16y", "ncV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vaddnepbf16512, "V32yV32yV32y", "ncV:512:", "avx10.2-512")
-TARGET_BUILTIN(__builtin_ia32_vdivnepbf16128, "V8yV8yV8y", "ncV:128:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vdivnepbf16256, "V16yV16yV16y", "ncV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vdivnepbf16512, "V32yV32yV32y", "ncV:512:", "avx10.2-512")
-TARGET_BUILTIN(__builtin_ia32_vmaxpbf16128, "V8yV8yV8y", "ncV:128:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vmaxpbf16256, "V16yV16yV16y", "ncV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vmaxpbf16512, "V32yV32yV32y", "ncV:512:", "avx10.2-512")
-TARGET_BUILTIN(__builtin_ia32_vminpbf16128, "V8yV8yV8y", "ncV:128:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vminpbf16256, "V16yV16yV16y", "ncV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vminpbf16512, "V32yV32yV32y", "ncV:512:", "avx10.2-512")
-TARGET_BUILTIN(__builtin_ia32_vmulnepbf16128, "V8yV8yV8y", "ncV:128:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vmulnepbf16256, "V16yV16yV16y", "ncV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vmulnepbf16512, "V32yV32yV32y", "ncV:512:", "avx10.2-512")
-TARGET_BUILTIN(__builtin_ia32_vsubnepbf16128, "V8yV8yV8y", "ncV:128:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vsubnepbf16256, "V16yV16yV16y", "ncV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vsubnepbf16512, "V32yV32yV32y", "ncV:512:", "avx10.2-512")
-TARGET_BUILTIN(__builtin_ia32_vcomsbf16eq, "iV8yV8y", "ncV:128:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcomsbf16lt, "iV8yV8y", "ncV:128:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcomsbf16neq, "iV8yV8y", "ncV:128:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcomsbf16ge, "iV8yV8y", "ncV:128:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcomsbf16gt, "iV8yV8y", "ncV:128:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcomsbf16le, "iV8yV8y", "ncV:128:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcmppbf16512_mask,"UiV32yV32yIiUi", "ncV:512:", "avx10.2-512")
-TARGET_BUILTIN(__builtin_ia32_vcmppbf16256_mask,"UsV16yV16yIiUs", "ncV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcmppbf16128_mask,"UcV8yV8yIiUc", "ncV:128:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vfpclasspbf16128_mask, "UcV8yIiUc", "ncV:128:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vfpclasspbf16256_mask, "UsV16yIiUs", "ncV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vfpclasspbf16512_mask, "UiV32yIiUi", "ncV:512:", "avx10.2-512")
-TARGET_BUILTIN(__builtin_ia32_vscalefpbf16128_mask, "V8yV8yV8yV8yUc", "ncV:128:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vscalefpbf16256_mask, "V16yV16yV16yV16yUs", "ncV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vscalefpbf16512_mask, "V32yV32yV32yV32yUi", "ncV:512:", "avx10.2-512")
-TARGET_BUILTIN(__builtin_ia32_vrcppbf16128_mask, "V8yV8yV8yUc", "ncV:128:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vrcppbf16256_mask, "V16yV16yV16yUs", "ncV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vrcppbf16512_mask, "V32yV32yV32yUi", "ncV:512:", "avx10.2-512")
-TARGET_BUILTIN(__builtin_ia32_vgetexppbf16128_mask, "V8yV8yV8yUc", "ncV:128:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vgetexppbf16256_mask, "V16yV16yV16yUs", "ncV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vgetexppbf16512_mask, "V32yV32yV32yUi", "ncV:512:", "avx10.2-512")
-TARGET_BUILTIN(__builtin_ia32_vrsqrtpbf16128_mask, "V8yV8yV8yUc", "ncV:128:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vrsqrtpbf16256_mask, "V16yV16yV16yUs", "ncV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vrsqrtpbf16512_mask, "V32yV32yV32yUi", "ncV:512:", "avx10.2-512")
-TARGET_BUILTIN(__builtin_ia32_vreducenepbf16128_mask, "V8yV8yIiV8yUc", "ncV:128:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vreducenepbf16256_mask, "V16yV16yIiV16yUs", "ncV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vreducenepbf16512_mask, "V32yV32yIiV32yUi", "ncV:512:", "avx10.2-512")
-TARGET_BUILTIN(__builtin_ia32_vrndscalenepbf16_128_mask, "V8yV8yIiV8yUc", "ncV:128:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vrndscalenepbf16_256_mask, "V16yV16yIiV16yUs", "ncV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vrndscalenepbf16_mask, "V32yV32yIiV32yUi", "ncV:512:", "avx10.2-512")
-TARGET_BUILTIN(__builtin_ia32_vgetmantpbf16128_mask, "V8yV8yIiV8yUc", "ncV:128:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vgetmantpbf16256_mask, "V16yV16yIiV16yUs", "ncV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vgetmantpbf16512_mask, "V32yV32yIiV32yUi", "ncV:512:", "avx10.2-512")
-TARGET_BUILTIN(__builtin_ia32_vsqrtnepbf16, "V8yV8y", "ncV:128:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vsqrtnepbf16256, "V16yV16y", "ncV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vsqrtnepbf16512, "V32yV32y", "ncV:512:", "avx10.2-512")
-TARGET_BUILTIN(__builtin_ia32_vfmaddnepbh512, "V32yV32yV32yV32y", "ncV:512:", "avx10.2-512")
-TARGET_BUILTIN(__builtin_ia32_vfmaddnepbh256, "V16yV16yV16yV16y", "ncV:256:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vfmaddnepbh128, "V8yV8yV8yV8y", "ncV:128:", "avx10.2-256")
-
-#undef BUILTIN
-#undef TARGET_BUILTIN
-#undef TARGET_HEADER_BUILTIN
diff --git a/clang/include/clang/Basic/BuiltinsX86.td b/clang/include/clang/Basic/BuiltinsX86.td
index cf8d277..18fc10e 100644
--- a/clang/include/clang/Basic/BuiltinsX86.td
+++ b/clang/include/clang/Basic/BuiltinsX86.td
@@ -10,12 +10,14 @@
 //
 //===----------------------------------------------------------------------===//
 
-include "clang/Basic/BuiltinsBase.td"
+include "clang/Basic/BuiltinsX86Base.td"
 
-class X86Builtin<string prototype> : TargetBuiltin {
-  let Spellings = ["__builtin_ia32_" # NAME];
-  let Prototype = prototype;
+def rdpmc : X86Builtin<"unsigned long long int(int)">;
+def rdtsc : X86Builtin<"unsigned long long int()">;
+def __rdtsc : X86NoPrefixBuiltin<"unsigned long long int()"> {
+  let EnableOpenCLLong = 1;
 }
+def rdtscp : X86Builtin<"unsigned long long int(unsigned int*)">;
 
 // Undefined Values
 def undef128 : X86Builtin<"_Vector<2, double>()"> {
@@ -135,3 +137,5375 @@ let Attributes = [Const, NoThrow, RequiredVectorWidth<256>], Features = "avx" in
     def Op#ps256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, float>)">;
   }
 }
+
+
+// Mechanically ported builtins from the original `.def` file.
+//
+// TODO: Build structured ways of synthesizing relevant groups and improve the
+// organization of the builtins below this point (and move them above it). The
+// current formulation is based on what was easiest to recognize from the
+// pre-TableGen version.
+
+let Features = "mmx", Attributes = [NoThrow, Const] in {
+  def _mm_prefetch : X86NoPrefixBuiltin<"void(char const *, int)">;
+}
+
+let Features = "sse", Attributes = [NoThrow] in {
+  def ldmxcsr : X86Builtin<"void(unsigned int)">;
+}
+
+let Features = "sse", Header = "xmmintrin.h", Attributes = [NoThrow, RequireDeclaration] in {
+  def _mm_setcsr : X86LibBuiltin<"void(unsigned int)">;
+}
+
+let Features = "sse", Attributes = [NoThrow] in {
+  def stmxcsr : X86Builtin<"unsigned int()">;
+}
+
+let Features = "sse", Header = "xmmintrin.h", Attributes = [NoThrow, RequireDeclaration] in {
+  def _mm_getcsr : X86LibBuiltin<"unsigned int()">;
+}
+
+let Features = "sse", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def cvtss2si : X86Builtin<"int(_Vector<4, float>)">;
+  def cvttss2si : X86Builtin<"int(_Vector<4, float>)">;
+}
+
+let Features = "sse", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def movmskps : X86Builtin<"int(_Vector<4, float>)">;
+}
+
+let Features = "sse", Attributes = [NoThrow] in {
+  def sfence : X86Builtin<"void()">;
+}
+
+let Features = "sse", Header = "xmmintrin.h", Attributes = [NoThrow, RequireDeclaration] in {
+  def _mm_sfence : X86LibBuiltin<"void()">;
+}
+
+let Features = "sse", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def rcpps : X86Builtin<"_Vector<4, float>(_Vector<4, float>)">;
+  def rcpss : X86Builtin<"_Vector<4, float>(_Vector<4, float>)">;
+  def rsqrtps : X86Builtin<"_Vector<4, float>(_Vector<4, float>)">;
+  def rsqrtss : X86Builtin<"_Vector<4, float>(_Vector<4, float>)">;
+  def sqrtps : X86Builtin<"_Vector<4, float>(_Vector<4, float>)">;
+  def sqrtss : X86Builtin<"_Vector<4, float>(_Vector<4, float>)">;
+  def shufps : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Constant int)">;
+}
+
+let Features = "sse2", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def maskmovdqu : X86Builtin<"void(_Vector<16, char>, _Vector<16, char>, char *)">;
+}
+
+let Features = "sse2", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def movmskpd : X86Builtin<"int(_Vector<2, double>)">;
+  def pmovmskb128 : X86Builtin<"int(_Vector<16, char>)">;
+}
+
+let Features = "sse2", Attributes = [NoThrow] in {
+  def movnti : X86Builtin<"void(int *, int)">;
+}
+
+let Features = "sse2", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def pshufd : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Constant int)">;
+  def pshuflw : X86Builtin<"_Vector<8, short>(_Vector<8, short>, _Constant int)">;
+  def pshufhw : X86Builtin<"_Vector<8, short>(_Vector<8, short>, _Constant int)">;
+  def psadbw128 : X86Builtin<"_Vector<2, long long int>(_Vector<16, char>, _Vector<16, char>)">;
+  def sqrtpd : X86Builtin<"_Vector<2, double>(_Vector<2, double>)">;
+  def sqrtsd : X86Builtin<"_Vector<2, double>(_Vector<2, double>)">;
+  def shufpd : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<2, double>, _Constant int)">;
+  def cvtpd2dq : X86Builtin<"_Vector<2, long long int>(_Vector<2, double>)">;
+  def cvtpd2ps : X86Builtin<"_Vector<4, float>(_Vector<2, double>)">;
+  def cvttpd2dq : X86Builtin<"_Vector<4, int>(_Vector<2, double>)">;
+  def cvtsd2si : X86Builtin<"int(_Vector<2, double>)">;
+  def cvttsd2si : X86Builtin<"int(_Vector<2, double>)">;
+  def cvtsd2ss : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<2, double>)">;
+  def cvtps2dq : X86Builtin<"_Vector<4, int>(_Vector<4, float>)">;
+  def cvttps2dq : X86Builtin<"_Vector<4, int>(_Vector<4, float>)">;
+}
+
+let Features = "sse2", Attributes = [NoThrow] in {
+  def clflush : X86Builtin<"void(void const *)">;
+}
+
+let Features = "sse2", Header = "emmintrin.h", Attributes = [NoThrow, RequireDeclaration] in {
+  def _mm_clflush : X86LibBuiltin<"void(void const *)">;
+}
+
+let Features = "sse2", Attributes = [NoThrow] in {
+  def lfence : X86Builtin<"void()">;
+}
+
+let Features = "sse2", Header = "emmintrin.h", Attributes = [NoThrow, RequireDeclaration] in {
+  def _mm_lfence : X86LibBuiltin<"void()">;
+}
+
+let Features = "sse2", Attributes = [NoThrow] in {
+  def mfence : X86Builtin<"void()">;
+}
+
+let Features = "sse2", Header = "emmintrin.h", Attributes = [NoThrow, RequireDeclaration] in {
+  def _mm_mfence : X86LibBuiltin<"void()">;
+}
+
+let Attributes = [NoThrow] in {
+  def pause : X86Builtin<"void()">;
+}
+
+let Header = "emmintrin.h", Attributes = [NoThrow, RequireDeclaration] in {
+  def _mm_pause : X86LibBuiltin<"void()">;
+}
+
+let Features = "sse2", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def pmuludq128 : X86Builtin<"_Vector<2, long long int>(_Vector<4, int>, _Vector<4, int>)">;
+  def psraw128 : X86Builtin<"_Vector<8, short>(_Vector<8, short>, _Vector<8, short>)">;
+  def psrad128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>)">;
+  def psrlw128 : X86Builtin<"_Vector<8, short>(_Vector<8, short>, _Vector<8, short>)">;
+  def psrld128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>)">;
+  def psrlq128 : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>, _Vector<2, long long int>)">;
+  def psllw128 : X86Builtin<"_Vector<8, short>(_Vector<8, short>, _Vector<8, short>)">;
+  def pslld128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>)">;
+  def psllq128 : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>, _Vector<2, long long int>)">;
+  def psllwi128 : X86Builtin<"_Vector<8, short>(_Vector<8, short>, int)">;
+  def pslldi128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, int)">;
+  def psllqi128 : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>, int)">;
+  def psrlwi128 : X86Builtin<"_Vector<8, short>(_Vector<8, short>, int)">;
+  def psrldi128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, int)">;
+  def psrlqi128 : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>, int)">;
+  def psrawi128 : X86Builtin<"_Vector<8, short>(_Vector<8, short>, int)">;
+  def psradi128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, int)">;
+  def pmaddwd128 : X86Builtin<"_Vector<4, int>(_Vector<8, short>, _Vector<8, short>)">;
+  def pslldqi128_byteshift : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>, _Constant int)">;
+  def psrldqi128_byteshift : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>, _Constant int)">;
+}
+
+let Features = "sse3", Attributes = [NoThrow] in {
+  def monitor : X86Builtin<"void(void const *, unsigned int, unsigned int)">;
+  def mwait : X86Builtin<"void(unsigned int, unsigned int)">;
+}
+
+let Features = "sse3", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def lddqu : X86Builtin<"_Vector<16, char>(char const *)">;
+}
+
+let Features = "ssse3", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def palignr128 : X86Builtin<"_Vector<16, char>(_Vector<16, char>, _Vector<16, char>, _Constant int)">;
+}
+
+let Features = "sse4.1", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def insertps128 : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Constant char)">;
+  def pblendvb128 : X86Builtin<"_Vector<16, char>(_Vector<16, char>, _Vector<16, char>, _Vector<16, char>)">;
+  def pblendw128 : X86Builtin<"_Vector<8, short>(_Vector<8, short>, _Vector<8, short>, _Constant int)">;
+  def blendpd : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<2, double>, _Constant int)">;
+  def blendps : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Constant int)">;
+  def blendvpd : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<2, double>, _Vector<2, double>)">;
+  def blendvps : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Vector<4, float>)">;
+  def packusdw128 : X86Builtin<"_Vector<8, short>(_Vector<4, int>, _Vector<4, int>)">;
+  def pmuldq128 : X86Builtin<"_Vector<2, long long int>(_Vector<4, int>, _Vector<4, int>)">;
+  def roundps : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Constant int)">;
+  def roundss : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Constant int)">;
+  def roundsd : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<2, double>, _Constant int)">;
+  def roundpd : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Constant int)">;
+  def dpps : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Constant char)">;
+  def dppd : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<2, double>, _Constant char)">;
+  def ptestz128 : X86Builtin<"int(_Vector<2, long long int>, _Vector<2, long long int>)">;
+  def ptestc128 : X86Builtin<"int(_Vector<2, long long int>, _Vector<2, long long int>)">;
+  def ptestnzc128 : X86Builtin<"int(_Vector<2, long long int>, _Vector<2, long long int>)">;
+  def mpsadbw128 : X86Builtin<"_Vector<16, char>(_Vector<16, char>, _Vector<16, char>, _Constant char)">;
+  def phminposuw128 : X86Builtin<"_Vector<8, short>(_Vector<8, short>)">;
+  def vec_ext_v16qi : X86Builtin<"char(_Vector<16, char>, _Constant int)">;
+  def vec_set_v16qi : X86Builtin<"_Vector<16, char>(_Vector<16, char>, char, _Constant int)">;
+  def vec_set_v4si : X86Builtin<"_Vector<4, int>(_Vector<4, int>, int, _Constant int)">;
+}
+
+let Features = "sse4.2", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def pcmpistrm128 : X86Builtin<"_Vector<16, char>(_Vector<16, char>, _Vector<16, char>, _Constant char)">;
+  def pcmpistri128 : X86Builtin<"int(_Vector<16, char>, _Vector<16, char>, _Constant char)">;
+  def pcmpestrm128 : X86Builtin<"_Vector<16, char>(_Vector<16, char>, int, _Vector<16, char>, int, _Constant char)">;
+  def pcmpestri128 : X86Builtin<"int(_Vector<16, char>, int, _Vector<16, char>, int, _Constant char)">;
+  def pcmpistria128 : X86Builtin<"int(_Vector<16, char>, _Vector<16, char>, _Constant char)">;
+  def pcmpistric128 : X86Builtin<"int(_Vector<16, char>, _Vector<16, char>, _Constant char)">;
+  def pcmpistrio128 : X86Builtin<"int(_Vector<16, char>, _Vector<16, char>, _Constant char)">;
+  def pcmpistris128 : X86Builtin<"int(_Vector<16, char>, _Vector<16, char>, _Constant char)">;
+  def pcmpistriz128 : X86Builtin<"int(_Vector<16, char>, _Vector<16, char>, _Constant char)">;
+  def pcmpestria128 : X86Builtin<"int(_Vector<16, char>, int, _Vector<16, char>, int, _Constant char)">;
+  def pcmpestric128 : X86Builtin<"int(_Vector<16, char>, int, _Vector<16, char>, int, _Constant char)">;
+  def pcmpestrio128 : X86Builtin<"int(_Vector<16, char>, int, _Vector<16, char>, int, _Constant char)">;
+  def pcmpestris128 : X86Builtin<"int(_Vector<16, char>, int, _Vector<16, char>, int, _Constant char)">;
+  def pcmpestriz128 : X86Builtin<"int(_Vector<16, char>, int, _Vector<16, char>, int, _Constant char)">;
+}
+
+let Features = "crc32", Attributes = [NoThrow, Const] in {
+  def crc32qi : X86Builtin<"unsigned int(unsigned int, unsigned char)">;
+  def crc32hi : X86Builtin<"unsigned int(unsigned int, unsigned short)">;
+  def crc32si : X86Builtin<"unsigned int(unsigned int, unsigned int)">;
+}
+
+let Features = "sse4a", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def extrqi : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>, _Constant char, _Constant char)">;
+  def extrq : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>, _Vector<16, char>)">;
+  def insertqi : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>, _Vector<2, long long int>, _Constant char, _Constant char)">;
+  def insertq : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>, _Vector<2, long long int>)">;
+}
+
+let Features = "sse4a", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def movntsd : X86Builtin<"void(double *, _Vector<2, double>)">;
+  def movntss : X86Builtin<"void(float *, _Vector<4, float>)">;
+}
+
+let Features = "aes", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def aesenc128 : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>, _Vector<2, long long int>)">;
+  def aesenclast128 : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>, _Vector<2, long long int>)">;
+  def aesdec128 : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>, _Vector<2, long long int>)">;
+  def aesdeclast128 : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>, _Vector<2, long long int>)">;
+  def aesimc128 : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>)">;
+  def aeskeygenassist128 : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>, _Constant char)">;
+}
+
+let Features = "vaes", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def aesenc256 : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Vector<4, long long int>)">;
+}
+
+let Features = "avx512f,evex512,vaes", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def aesenc512 : X86Builtin<"_Vector<8, long long int>(_Vector<8, long long int>, _Vector<8, long long int>)">;
+}
+
+let Features = "vaes", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def aesenclast256 : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Vector<4, long long int>)">;
+}
+
+let Features = "avx512f,evex512,vaes", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def aesenclast512 : X86Builtin<"_Vector<8, long long int>(_Vector<8, long long int>, _Vector<8, long long int>)">;
+}
+
+let Features = "vaes", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def aesdec256 : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Vector<4, long long int>)">;
+}
+
+let Features = "avx512f,evex512,vaes", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def aesdec512 : X86Builtin<"_Vector<8, long long int>(_Vector<8, long long int>, _Vector<8, long long int>)">;
+}
+
+let Features = "vaes", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def aesdeclast256 : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Vector<4, long long int>)">;
+}
+
+let Features = "avx512f,evex512,vaes", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def aesdeclast512 : X86Builtin<"_Vector<8, long long int>(_Vector<8, long long int>, _Vector<8, long long int>)">;
+}
+
+let Features = "gfni", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vgf2p8affineinvqb_v16qi : X86Builtin<"_Vector<16, char>(_Vector<16, char>, _Vector<16, char>, _Constant char)">;
+}
+
+let Features = "avx,gfni", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vgf2p8affineinvqb_v32qi : X86Builtin<"_Vector<32, char>(_Vector<32, char>, _Vector<32, char>, _Constant char)">;
+}
+
+let Features = "avx512f,evex512,gfni", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def vgf2p8affineinvqb_v64qi : X86Builtin<"_Vector<64, char>(_Vector<64, char>, _Vector<64, char>, _Constant char)">;
+}
+
+let Features = "gfni", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vgf2p8affineqb_v16qi : X86Builtin<"_Vector<16, char>(_Vector<16, char>, _Vector<16, char>, _Constant char)">;
+}
+
+let Features = "avx,gfni", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vgf2p8affineqb_v32qi : X86Builtin<"_Vector<32, char>(_Vector<32, char>, _Vector<32, char>, _Constant char)">;
+}
+
+let Features = "avx512f,evex512,gfni", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def vgf2p8affineqb_v64qi : X86Builtin<"_Vector<64, char>(_Vector<64, char>, _Vector<64, char>, _Constant char)">;
+}
+
+let Features = "gfni", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vgf2p8mulb_v16qi : X86Builtin<"_Vector<16, char>(_Vector<16, char>, _Vector<16, char>)">;
+}
+
+let Features = "avx,gfni", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vgf2p8mulb_v32qi : X86Builtin<"_Vector<32, char>(_Vector<32, char>, _Vector<32, char>)">;
+}
+
+let Features = "avx512f,evex512,gfni", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def vgf2p8mulb_v64qi : X86Builtin<"_Vector<64, char>(_Vector<64, char>, _Vector<64, char>)">;
+}
+
+let Features = "pclmul", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def pclmulqdq128 : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>, _Vector<2, long long int>, _Constant char)">;
+}
+
+let Features = "vpclmulqdq", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def pclmulqdq256 : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Vector<4, long long int>, _Constant char)">;
+}
+
+let Features = "avx512f,evex512,vpclmulqdq", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def pclmulqdq512 : X86Builtin<"_Vector<8, long long int>(_Vector<8, long long int>, _Vector<8, long long int>, _Constant char)">;
+}
+
+let Features = "avx", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vpermilvarpd : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<2, long long int>)">;
+  def vpermilvarps : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, int>)">;
+  def vpermilvarpd256 : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Vector<4, long long int>)">;
+  def vpermilvarps256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, int>)">;
+  def blendpd256 : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Vector<4, double>, _Constant int)">;
+  def blendps256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, float>, _Constant int)">;
+  def blendvpd256 : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Vector<4, double>, _Vector<4, double>)">;
+  def blendvps256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, float>, _Vector<8, float>)">;
+  def shufpd256 : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Vector<4, double>, _Constant int)">;
+  def shufps256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, float>, _Constant int)">;
+  def dpps256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, float>, _Constant char)">;
+  def cmppd256 : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Vector<4, double>, _Constant char)">;
+  def cmpps256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, float>, _Constant char)">;
+  def vextractf128_pd256 : X86Builtin<"_Vector<2, double>(_Vector<4, double>, _Constant int)">;
+  def vextractf128_ps256 : X86Builtin<"_Vector<4, float>(_Vector<8, float>, _Constant int)">;
+  def vextractf128_si256 : X86Builtin<"_Vector<4, int>(_Vector<8, int>, _Constant int)">;
+  def cvtpd2ps256 : X86Builtin<"_Vector<4, float>(_Vector<4, double>)">;
+  def cvtps2dq256 : X86Builtin<"_Vector<8, int>(_Vector<8, float>)">;
+  def cvttpd2dq256 : X86Builtin<"_Vector<4, int>(_Vector<4, double>)">;
+  def cvtpd2dq256 : X86Builtin<"_Vector<4, int>(_Vector<4, double>)">;
+  def cvttps2dq256 : X86Builtin<"_Vector<8, int>(_Vector<8, float>)">;
+  def vperm2f128_pd256 : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Vector<4, double>, _Constant int)">;
+  def vperm2f128_ps256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, float>, _Constant int)">;
+  def vperm2f128_si256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>, _Constant int)">;
+}
+
+let Features = "avx", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vpermilpd : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Constant int)">;
+  def vpermilps : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Constant int)">;
+}
+
+let Features = "avx", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vpermilpd256 : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Constant int)">;
+  def vpermilps256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Constant int)">;
+  def vinsertf128_pd256 : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Vector<2, double>, _Constant int)">;
+  def vinsertf128_ps256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<4, float>, _Constant int)">;
+  def vinsertf128_si256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<4, int>, _Constant int)">;
+  def sqrtpd256 : X86Builtin<"_Vector<4, double>(_Vector<4, double>)">;
+  def sqrtps256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>)">;
+  def rsqrtps256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>)">;
+  def rcpps256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>)">;
+  def roundpd256 : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Constant int)">;
+  def roundps256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Constant int)">;
+}
+
+let Features = "avx", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vtestzpd : X86Builtin<"int(_Vector<2, double>, _Vector<2, double>)">;
+  def vtestcpd : X86Builtin<"int(_Vector<2, double>, _Vector<2, double>)">;
+  def vtestnzcpd : X86Builtin<"int(_Vector<2, double>, _Vector<2, double>)">;
+  def vtestzps : X86Builtin<"int(_Vector<4, float>, _Vector<4, float>)">;
+  def vtestcps : X86Builtin<"int(_Vector<4, float>, _Vector<4, float>)">;
+  def vtestnzcps : X86Builtin<"int(_Vector<4, float>, _Vector<4, float>)">;
+}
+
+let Features = "avx", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vtestzpd256 : X86Builtin<"int(_Vector<4, double>, _Vector<4, double>)">;
+  def vtestcpd256 : X86Builtin<"int(_Vector<4, double>, _Vector<4, double>)">;
+  def vtestnzcpd256 : X86Builtin<"int(_Vector<4, double>, _Vector<4, double>)">;
+  def vtestzps256 : X86Builtin<"int(_Vector<8, float>, _Vector<8, float>)">;
+  def vtestcps256 : X86Builtin<"int(_Vector<8, float>, _Vector<8, float>)">;
+  def vtestnzcps256 : X86Builtin<"int(_Vector<8, float>, _Vector<8, float>)">;
+  def ptestz256 : X86Builtin<"int(_Vector<4, long long int>, _Vector<4, long long int>)">;
+  def ptestc256 : X86Builtin<"int(_Vector<4, long long int>, _Vector<4, long long int>)">;
+  def ptestnzc256 : X86Builtin<"int(_Vector<4, long long int>, _Vector<4, long long int>)">;
+  def movmskpd256 : X86Builtin<"int(_Vector<4, double>)">;
+  def movmskps256 : X86Builtin<"int(_Vector<8, float>)">;
+}
+
+let Features = "avx", Attributes = [NoThrow] in {
+  def vzeroall : X86Builtin<"void()">;
+  def vzeroupper : X86Builtin<"void()">;
+}
+
+let Features = "avx", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def lddqu256 : X86Builtin<"_Vector<32, char>(char const *)">;
+}
+
+let Features = "avx", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def maskloadpd : X86Builtin<"_Vector<2, double>(_Vector<2, double const *>, _Vector<2, long long int>)">;
+  def maskloadps : X86Builtin<"_Vector<4, float>(_Vector<4, float const *>, _Vector<4, int>)">;
+}
+
+let Features = "avx", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def maskloadpd256 : X86Builtin<"_Vector<4, double>(_Vector<4, double const *>, _Vector<4, long long int>)">;
+  def maskloadps256 : X86Builtin<"_Vector<8, float>(_Vector<8, float const *>, _Vector<8, int>)">;
+}
+
+let Features = "avx", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def maskstorepd : X86Builtin<"void(_Vector<2, double *>, _Vector<2, long long int>, _Vector<2, double>)">;
+  def maskstoreps : X86Builtin<"void(_Vector<4, float *>, _Vector<4, int>, _Vector<4, float>)">;
+}
+
+let Features = "avx", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def maskstorepd256 : X86Builtin<"void(_Vector<4, double *>, _Vector<4, long long int>, _Vector<4, double>)">;
+  def maskstoreps256 : X86Builtin<"void(_Vector<8, float *>, _Vector<8, int>, _Vector<8, float>)">;
+}
+
+let Features = "avx", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vec_ext_v32qi : X86Builtin<"char(_Vector<32, char>, _Constant int)">;
+  def vec_ext_v16hi : X86Builtin<"short(_Vector<16, short>, _Constant int)">;
+  def vec_ext_v8si : X86Builtin<"int(_Vector<8, int>, _Constant int)">;
+  def vec_set_v32qi : X86Builtin<"_Vector<32, char>(_Vector<32, char>, char, _Constant int)">;
+  def vec_set_v16hi : X86Builtin<"_Vector<16, short>(_Vector<16, short>, short, _Constant int)">;
+  def vec_set_v8si : X86Builtin<"_Vector<8, int>(_Vector<8, int>, int, _Constant int)">;
+}
+
+let Features = "avx2", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def mpsadbw256 : X86Builtin<"_Vector<32, char>(_Vector<32, char>, _Vector<32, char>, _Constant char)">;
+  def packsswb256 : X86Builtin<"_Vector<32, char>(_Vector<16, short>, _Vector<16, short>)">;
+  def packssdw256 : X86Builtin<"_Vector<16, short>(_Vector<8, int>, _Vector<8, int>)">;
+  def packuswb256 : X86Builtin<"_Vector<32, char>(_Vector<16, short>, _Vector<16, short>)">;
+  def packusdw256 : X86Builtin<"_Vector<16, short>(_Vector<8, int>, _Vector<8, int>)">;
+  def palignr256 : X86Builtin<"_Vector<32, char>(_Vector<32, char>, _Vector<32, char>, _Constant int)">;
+  def pavgb256 : X86Builtin<"_Vector<32, char>(_Vector<32, char>, _Vector<32, char>)">;
+  def pavgw256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Vector<16, short>)">;
+  def pblendvb256 : X86Builtin<"_Vector<32, char>(_Vector<32, char>, _Vector<32, char>, _Vector<32, char>)">;
+  def pblendw256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Vector<16, short>, _Constant int)">;
+  def phaddw256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Vector<16, short>)">;
+  def phaddd256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>)">;
+  def phaddsw256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Vector<16, short>)">;
+  def phsubw256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Vector<16, short>)">;
+  def phsubd256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>)">;
+  def phsubsw256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Vector<16, short>)">;
+  def pmaddubsw256 : X86Builtin<"_Vector<16, short>(_Vector<32, char>, _Vector<32, char>)">;
+  def pmaddwd256 : X86Builtin<"_Vector<8, int>(_Vector<16, short>, _Vector<16, short>)">;
+  def pmovmskb256 : X86Builtin<"int(_Vector<32, char>)">;
+  def pmuldq256 : X86Builtin<"_Vector<4, long long int>(_Vector<8, int>, _Vector<8, int>)">;
+  def pmulhrsw256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Vector<16, short>)">;
+  def pmulhuw256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Vector<16, short>)">;
+  def pmulhw256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Vector<16, short>)">;
+  def pmuludq256 : X86Builtin<"_Vector<4, long long int>(_Vector<8, int>, _Vector<8, int>)">;
+  def psadbw256 : X86Builtin<"_Vector<4, long long int>(_Vector<32, char>, _Vector<32, char>)">;
+  def pshufb256 : X86Builtin<"_Vector<32, char>(_Vector<32, char>, _Vector<32, char>)">;
+  def pshufd256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Constant int)">;
+  def pshuflw256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Constant int)">;
+  def pshufhw256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Constant int)">;
+  def psignb256 : X86Builtin<"_Vector<32, char>(_Vector<32, char>, _Vector<32, char>)">;
+  def psignw256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Vector<16, short>)">;
+  def psignd256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>)">;
+  def psllwi256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, int)">;
+  def psllw256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Vector<8, short>)">;
+  def pslldqi256_byteshift : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Constant int)">;
+  def pslldi256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, int)">;
+  def pslld256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<4, int>)">;
+  def psllqi256 : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, int)">;
+  def psllq256 : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Vector<2, long long int>)">;
+  def psrawi256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, int)">;
+  def psraw256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Vector<8, short>)">;
+  def psradi256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, int)">;
+  def psrad256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<4, int>)">;
+  def psrldqi256_byteshift : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Constant int)">;
+  def psrlwi256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, int)">;
+  def psrlw256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Vector<8, short>)">;
+  def psrldi256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, int)">;
+  def psrld256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<4, int>)">;
+  def psrlqi256 : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, int)">;
+  def psrlq256 : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Vector<2, long long int>)">;
+  def pblendd128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>, _Constant int)">;
+  def pblendd256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>, _Constant int)">;
+  def permvarsi256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>)">;
+  def permdf256 : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Constant int)">;
+  def permvarsf256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, int>)">;
+  def permti256 : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Vector<4, long long int>, _Constant int)">;
+  def permdi256 : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Constant int)">;
+  def extract128i256 : X86Builtin<"_Vector<2, long long int>(_Vector<4, long long int>, _Constant int)">;
+  def insert128i256 : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Vector<2, long long int>, _Constant int)">;
+}
+
+let Features = "avx2", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def maskloadd256 : X86Builtin<"_Vector<8, int>(_Vector<8, int const *>, _Vector<8, int>)">;
+  def maskloadq256 : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int const *>, _Vector<4, long long int>)">;
+}
+
+let Features = "avx2", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def maskloadd : X86Builtin<"_Vector<4, int>(_Vector<4, int const *>, _Vector<4, int>)">;
+  def maskloadq : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int const *>, _Vector<2, long long int>)">;
+}
+
+let Features = "avx2", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def maskstored256 : X86Builtin<"void(_Vector<8, int *>, _Vector<8, int>, _Vector<8, int>)">;
+  def maskstoreq256 : X86Builtin<"void(_Vector<4, long long int *>, _Vector<4, long long int>, _Vector<4, long long int>)">;
+}
+
+let Features = "avx2", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def maskstored : X86Builtin<"void(_Vector<4, int *>, _Vector<4, int>, _Vector<4, int>)">;
+  def maskstoreq : X86Builtin<"void(_Vector<2, long long int *>, _Vector<2, long long int>, _Vector<2, long long int>)">;
+}
+
+let Features = "avx2", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def psllv8si : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>)">;
+}
+
+let Features = "avx2", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def psllv4si : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>)">;
+}
+
+let Features = "avx2", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def psllv4di : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Vector<4, long long int>)">;
+}
+
+let Features = "avx2", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def psllv2di : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>, _Vector<2, long long int>)">;
+}
+
+let Features = "avx2", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def psrav8si : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>)">;
+}
+
+let Features = "avx2", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def psrav4si : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>)">;
+}
+
+let Features = "avx2", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def psrlv8si : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>)">;
+}
+
+let Features = "avx2", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def psrlv4si : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>)">;
+}
+
+let Features = "avx2", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def psrlv4di : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Vector<4, long long int>)">;
+}
+
+let Features = "avx2", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def psrlv2di : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>, _Vector<2, long long int>)">;
+}
+
+let Features = "avx2", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def gatherd_pd : X86Builtin<"_Vector<2, double>(_Vector<2, double>, double const *, _Vector<4, int>, _Vector<2, double>, _Constant char)">;
+}
+
+let Features = "avx2", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def gatherd_pd256 : X86Builtin<"_Vector<4, double>(_Vector<4, double>, double const *, _Vector<4, int>, _Vector<4, double>, _Constant char)">;
+}
+
+let Features = "avx2", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def gatherq_pd : X86Builtin<"_Vector<2, double>(_Vector<2, double>, double const *, _Vector<2, long long int>, _Vector<2, double>, _Constant char)">;
+}
+
+let Features = "avx2", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def gatherq_pd256 : X86Builtin<"_Vector<4, double>(_Vector<4, double>, double const *, _Vector<4, long long int>, _Vector<4, double>, _Constant char)">;
+}
+
+let Features = "avx2", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def gatherd_ps : X86Builtin<"_Vector<4, float>(_Vector<4, float>, float const *, _Vector<4, int>, _Vector<4, float>, _Constant char)">;
+}
+
+let Features = "avx2", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def gatherd_ps256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>, float const *, _Vector<8, int>, _Vector<8, float>, _Constant char)">;
+}
+
+let Features = "avx2", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def gatherq_ps : X86Builtin<"_Vector<4, float>(_Vector<4, float>, float const *, _Vector<2, long long int>, _Vector<4, float>, _Constant char)">;
+}
+
+let Features = "avx2", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def gatherq_ps256 : X86Builtin<"_Vector<4, float>(_Vector<4, float>, float const *, _Vector<4, long long int>, _Vector<4, float>, _Constant char)">;
+}
+
+let Features = "avx2", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def gatherd_q : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>, long long int const *, _Vector<4, int>, _Vector<2, long long int>, _Constant char)">;
+}
+
+let Features = "avx2", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def gatherd_q256 : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, long long int const *, _Vector<4, int>, _Vector<4, long long int>, _Constant char)">;
+}
+
+let Features = "avx2", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def gatherq_q : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>, long long int const *, _Vector<2, long long int>, _Vector<2, long long int>, _Constant char)">;
+}
+
+let Features = "avx2", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def gatherq_q256 : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, long long int const *, _Vector<4, long long int>, _Vector<4, long long int>, _Constant char)">;
+}
+
+let Features = "avx2", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def gatherd_d : X86Builtin<"_Vector<4, int>(_Vector<4, int>, int const *, _Vector<4, int>, _Vector<4, int>, _Constant char)">;
+}
+
+let Features = "avx2", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def gatherd_d256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, int const *, _Vector<8, int>, _Vector<8, int>, _Constant char)">;
+}
+
+let Features = "avx2", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def gatherq_d : X86Builtin<"_Vector<4, int>(_Vector<4, int>, int const *, _Vector<2, long long int>, _Vector<4, int>, _Constant char)">;
+}
+
+let Features = "avx2", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def gatherq_d256 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, int const *, _Vector<4, long long int>, _Vector<4, int>, _Constant char)">;
+}
+
+let Features = "f16c", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vcvtps2ph : X86Builtin<"_Vector<8, short>(_Vector<4, float>, _Constant int)">;
+}
+
+let Features = "f16c", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vcvtps2ph256 : X86Builtin<"_Vector<8, short>(_Vector<8, float>, _Constant int)">;
+}
+
+let Features = "f16c", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vcvtph2ps : X86Builtin<"_Vector<4, float>(_Vector<8, short>)">;
+}
+
+let Features = "f16c", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vcvtph2ps256 : X86Builtin<"_Vector<8, float>(_Vector<8, short>)">;
+}
+
+let Features = "rdrnd", Attributes = [NoThrow] in {
+  def rdrand16_step : X86Builtin<"unsigned int(unsigned short *)">;
+  def rdrand32_step : X86Builtin<"unsigned int(unsigned int *)">;
+}
+
+let Features = "fxsr", Attributes = [NoThrow] in {
+  def fxrstor : X86Builtin<"void(void *)">;
+  def fxsave : X86Builtin<"void(void *)">;
+}
+
+let Features = "xsave", Attributes = [NoThrow] in {
+  def xsave : X86Builtin<"void(void *, unsigned long long int)">;
+  def xrstor : X86Builtin<"void(void *, unsigned long long int)">;
+  def xgetbv : X86Builtin<"unsigned long long int(unsigned int)">;
+}
+
+let Header = "immintrin.h", Languages = "ALL_MS_LANGUAGES", Attributes = [NoThrow, RequireDeclaration] in {
+  def _xgetbv : X86LibBuiltin<"uint64_t(unsigned int)">;
+}
+
+let Features = "xsave", Attributes = [NoThrow] in {
+  def xsetbv : X86Builtin<"void(unsigned int, unsigned long long int)">;
+}
+
+let Header = "immintrin.h", Languages = "ALL_MS_LANGUAGES", Attributes = [NoThrow, RequireDeclaration] in {
+  def _xsetbv : X86LibBuiltin<"void(unsigned int, uint64_t)">;
+}
+
+let Features = "xsaveopt", Attributes = [NoThrow] in {
+  def xsaveopt : X86Builtin<"void(void *, unsigned long long int)">;
+}
+
+let Features = "xsaves", Attributes = [NoThrow] in {
+  def xrstors : X86Builtin<"void(void *, unsigned long long int)">;
+}
+
+let Features = "xsavec", Attributes = [NoThrow] in {
+  def xsavec : X86Builtin<"void(void *, unsigned long long int)">;
+}
+
+let Features = "xsaves", Attributes = [NoThrow] in {
+  def xsaves : X86Builtin<"void(void *, unsigned long long int)">;
+}
+
+let Features = "shstk", Attributes = [NoThrow] in {
+  def incsspd : X86Builtin<"void(unsigned int)">;
+  def rdsspd : X86Builtin<"unsigned int(unsigned int)">;
+  def saveprevssp : X86Builtin<"void()">;
+  def rstorssp : X86Builtin<"void(void *)">;
+  def wrssd : X86Builtin<"void(unsigned int, void *)">;
+  def wrussd : X86Builtin<"void(unsigned int, void *)">;
+  def setssbsy : X86Builtin<"void()">;
+  def clrssbsy : X86Builtin<"void(void *)">;
+}
+
+let Features = "clflushopt", Attributes = [NoThrow] in {
+  def clflushopt : X86Builtin<"void(void const *)">;
+}
+
+let Features = "clwb", Attributes = [NoThrow] in {
+  def clwb : X86Builtin<"void(void const *)">;
+}
+
+let Attributes = [NoThrow] in {
+  def wbinvd : X86Builtin<"void()">;
+}
+
+let Features = "wbnoinvd", Attributes = [NoThrow] in {
+  def wbnoinvd : X86Builtin<"void()">;
+}
+
+let Attributes = [NoThrow, Constexpr] in {
+  def addcarryx_u32 : X86Builtin<"unsigned char(unsigned char, unsigned int, unsigned int, unsigned int *)">;
+  def subborrow_u32 : X86Builtin<"unsigned char(unsigned char, unsigned int, unsigned int, unsigned int *)">;
+}
+
+let Features = "rdseed", Attributes = [NoThrow] in {
+  def rdseed16_step : X86Builtin<"unsigned int(unsigned short *)">;
+  def rdseed32_step : X86Builtin<"unsigned int(unsigned int *)">;
+}
+
+let Features = "lzcnt", Attributes = [NoThrow, Const, Constexpr] in {
+  def lzcnt_u16 : X86Builtin<"unsigned short(unsigned short)">;
+  def lzcnt_u32 : X86Builtin<"unsigned int(unsigned int)">;
+}
+
+let Features = "bmi", Attributes = [NoThrow, Const, Constexpr] in {
+  def bextr_u32 : X86Builtin<"unsigned int(unsigned int, unsigned int)">;
+}
+
+let Attributes = [NoThrow, Const, Constexpr] in {
+  def tzcnt_u16 : X86Builtin<"unsigned short(unsigned short)">;
+  def tzcnt_u32 : X86Builtin<"unsigned int(unsigned int)">;
+}
+
+let Features = "bmi2", Attributes = [NoThrow, Const, Constexpr] in {
+  def bzhi_si : X86Builtin<"unsigned int(unsigned int, unsigned int)">;
+  def pdep_si : X86Builtin<"unsigned int(unsigned int, unsigned int)">;
+  def pext_si : X86Builtin<"unsigned int(unsigned int, unsigned int)">;
+}
+
+let Features = "tbm", Attributes = [NoThrow, Const, Constexpr] in {
+  def bextri_u32 : X86Builtin<"unsigned int(unsigned int, _Constant unsigned int)">;
+}
+
+let Features = "lwp", Attributes = [NoThrow] in {
+  def llwpcb : X86Builtin<"void(void *)">;
+  def slwpcb : X86Builtin<"void *()">;
+  def lwpins32 : X86Builtin<"unsigned char(unsigned int, unsigned int, _Constant unsigned int)">;
+  def lwpval32 : X86Builtin<"void(unsigned int, unsigned int, _Constant unsigned int)">;
+}
+
+let Features = "sha", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def sha1rnds4 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>, _Constant char)">;
+  def sha1nexte : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>)">;
+  def sha1msg1 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>)">;
+  def sha1msg2 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>)">;
+  def sha256rnds2 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>, _Vector<4, int>)">;
+  def sha256msg1 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>)">;
+  def sha256msg2 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>)">;
+}
+
+let Features = "fma|fma4", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vfmaddps : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Vector<4, float>)">;
+  def vfmaddpd : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<2, double>, _Vector<2, double>)">;
+}
+
+let Features = "fma", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vfmaddss3 : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Vector<4, float>)">;
+  def vfmaddsd3 : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<2, double>, _Vector<2, double>)">;
+}
+
+let Features = "fma4", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vfmaddss : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Vector<4, float>)">;
+  def vfmaddsd : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<2, double>, _Vector<2, double>)">;
+}
+
+let Features = "fma|fma4", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vfmaddsubps : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Vector<4, float>)">;
+  def vfmaddsubpd : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<2, double>, _Vector<2, double>)">;
+}
+
+let Features = "fma|fma4", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vfmaddps256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, float>, _Vector<8, float>)">;
+  def vfmaddpd256 : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Vector<4, double>, _Vector<4, double>)">;
+  def vfmaddsubps256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, float>, _Vector<8, float>)">;
+  def vfmaddsubpd256 : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Vector<4, double>, _Vector<4, double>)">;
+}
+
+let Features = "avx512f,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def vfmaddpd512_mask : X86Builtin<"_Vector<8, double>(_Vector<8, double>, _Vector<8, double>, _Vector<8, double>, unsigned char, _Constant int)">;
+  def vfmaddpd512_maskz : X86Builtin<"_Vector<8, double>(_Vector<8, double>, _Vector<8, double>, _Vector<8, double>, unsigned char, _Constant int)">;
+  def vfmaddpd512_mask3 : X86Builtin<"_Vector<8, double>(_Vector<8, double>, _Vector<8, double>, _Vector<8, double>, unsigned char, _Constant int)">;
+  def vfmsubpd512_mask3 : X86Builtin<"_Vector<8, double>(_Vector<8, double>, _Vector<8, double>, _Vector<8, double>, unsigned char, _Constant int)">;
+  def vfmaddps512_mask : X86Builtin<"_Vector<16, float>(_Vector<16, float>, _Vector<16, float>, _Vector<16, float>, unsigned short, _Constant int)">;
+  def vfmaddps512_maskz : X86Builtin<"_Vector<16, float>(_Vector<16, float>, _Vector<16, float>, _Vector<16, float>, unsigned short, _Constant int)">;
+  def vfmaddps512_mask3 : X86Builtin<"_Vector<16, float>(_Vector<16, float>, _Vector<16, float>, _Vector<16, float>, unsigned short, _Constant int)">;
+  def vfmsubps512_mask3 : X86Builtin<"_Vector<16, float>(_Vector<16, float>, _Vector<16, float>, _Vector<16, float>, unsigned short, _Constant int)">;
+  def vfmaddsubpd512_mask : X86Builtin<"_Vector<8, double>(_Vector<8, double>, _Vector<8, double>, _Vector<8, double>, unsigned char, _Constant int)">;
+  def vfmaddsubpd512_maskz : X86Builtin<"_Vector<8, double>(_Vector<8, double>, _Vector<8, double>, _Vector<8, double>, unsigned char, _Constant int)">;
+  def vfmaddsubpd512_mask3 : X86Builtin<"_Vector<8, double>(_Vector<8, double>, _Vector<8, double>, _Vector<8, double>, unsigned char, _Constant int)">;
+  def vfmsubaddpd512_mask3 : X86Builtin<"_Vector<8, double>(_Vector<8, double>, _Vector<8, double>, _Vector<8, double>, unsigned char, _Constant int)">;
+  def vfmaddsubps512_mask : X86Builtin<"_Vector<16, float>(_Vector<16, float>, _Vector<16, float>, _Vector<16, float>, unsigned short, _Constant int)">;
+  def vfmaddsubps512_maskz : X86Builtin<"_Vector<16, float>(_Vector<16, float>, _Vector<16, float>, _Vector<16, float>, unsigned short, _Constant int)">;
+  def vfmaddsubps512_mask3 : X86Builtin<"_Vector<16, float>(_Vector<16, float>, _Vector<16, float>, _Vector<16, float>, unsigned short, _Constant int)">;
+  def vfmsubaddps512_mask3 : X86Builtin<"_Vector<16, float>(_Vector<16, float>, _Vector<16, float>, _Vector<16, float>, unsigned short, _Constant int)">;
+}
+
+let Features = "xop", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vpmacssww : X86Builtin<"_Vector<8, short>(_Vector<8, short>, _Vector<8, short>, _Vector<8, short>)">;
+  def vpmacsww : X86Builtin<"_Vector<8, short>(_Vector<8, short>, _Vector<8, short>, _Vector<8, short>)">;
+  def vpmacsswd : X86Builtin<"_Vector<4, int>(_Vector<8, short>, _Vector<8, short>, _Vector<4, int>)">;
+  def vpmacswd : X86Builtin<"_Vector<4, int>(_Vector<8, short>, _Vector<8, short>, _Vector<4, int>)">;
+  def vpmacssdd : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>, _Vector<4, int>)">;
+  def vpmacsdd : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>, _Vector<4, int>)">;
+  def vpmacssdql : X86Builtin<"_Vector<2, long long int>(_Vector<4, int>, _Vector<4, int>, _Vector<2, long long int>)">;
+  def vpmacsdql : X86Builtin<"_Vector<2, long long int>(_Vector<4, int>, _Vector<4, int>, _Vector<2, long long int>)">;
+  def vpmacssdqh : X86Builtin<"_Vector<2, long long int>(_Vector<4, int>, _Vector<4, int>, _Vector<2, long long int>)">;
+  def vpmacsdqh : X86Builtin<"_Vector<2, long long int>(_Vector<4, int>, _Vector<4, int>, _Vector<2, long long int>)">;
+  def vpmadcsswd : X86Builtin<"_Vector<4, int>(_Vector<8, short>, _Vector<8, short>, _Vector<4, int>)">;
+  def vpmadcswd : X86Builtin<"_Vector<4, int>(_Vector<8, short>, _Vector<8, short>, _Vector<4, int>)">;
+  def vphaddbw : X86Builtin<"_Vector<8, short>(_Vector<16, char>)">;
+  def vphaddbd : X86Builtin<"_Vector<4, int>(_Vector<16, char>)">;
+  def vphaddbq : X86Builtin<"_Vector<2, long long int>(_Vector<16, char>)">;
+  def vphaddwd : X86Builtin<"_Vector<4, int>(_Vector<8, short>)">;
+  def vphaddwq : X86Builtin<"_Vector<2, long long int>(_Vector<8, short>)">;
+  def vphadddq : X86Builtin<"_Vector<2, long long int>(_Vector<4, int>)">;
+  def vphaddubw : X86Builtin<"_Vector<8, short>(_Vector<16, char>)">;
+  def vphaddubd : X86Builtin<"_Vector<4, int>(_Vector<16, char>)">;
+  def vphaddubq : X86Builtin<"_Vector<2, long long int>(_Vector<16, char>)">;
+  def vphadduwd : X86Builtin<"_Vector<4, int>(_Vector<8, short>)">;
+  def vphadduwq : X86Builtin<"_Vector<2, long long int>(_Vector<8, short>)">;
+  def vphaddudq : X86Builtin<"_Vector<2, long long int>(_Vector<4, int>)">;
+  def vphsubbw : X86Builtin<"_Vector<8, short>(_Vector<16, char>)">;
+  def vphsubwd : X86Builtin<"_Vector<4, int>(_Vector<8, short>)">;
+  def vphsubdq : X86Builtin<"_Vector<2, long long int>(_Vector<4, int>)">;
+  def vpperm : X86Builtin<"_Vector<16, char>(_Vector<16, char>, _Vector<16, char>, _Vector<16, char>)">;
+  def vprotb : X86Builtin<"_Vector<16, char>(_Vector<16, char>, _Vector<16, char>)">;
+  def vprotw : X86Builtin<"_Vector<8, short>(_Vector<8, short>, _Vector<8, short>)">;
+  def vprotd : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>)">;
+  def vprotq : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>, _Vector<2, long long int>)">;
+  def vprotbi : X86Builtin<"_Vector<16, char>(_Vector<16, char>, _Constant char)">;
+  def vprotwi : X86Builtin<"_Vector<8, short>(_Vector<8, short>, _Constant char)">;
+  def vprotdi : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Constant char)">;
+  def vprotqi : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>, _Constant char)">;
+  def vpshlb : X86Builtin<"_Vector<16, char>(_Vector<16, char>, _Vector<16, char>)">;
+  def vpshlw : X86Builtin<"_Vector<8, short>(_Vector<8, short>, _Vector<8, short>)">;
+  def vpshld : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>)">;
+  def vpshlq : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>, _Vector<2, long long int>)">;
+  def vpshab : X86Builtin<"_Vector<16, char>(_Vector<16, char>, _Vector<16, char>)">;
+  def vpshaw : X86Builtin<"_Vector<8, short>(_Vector<8, short>, _Vector<8, short>)">;
+  def vpshad : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>)">;
+  def vpshaq : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>, _Vector<2, long long int>)">;
+  def vpcomub : X86Builtin<"_Vector<16, char>(_Vector<16, char>, _Vector<16, char>, _Constant char)">;
+  def vpcomuw : X86Builtin<"_Vector<8, short>(_Vector<8, short>, _Vector<8, short>, _Constant char)">;
+  def vpcomud : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>, _Constant char)">;
+  def vpcomuq : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>, _Vector<2, long long int>, _Constant char)">;
+  def vpcomb : X86Builtin<"_Vector<16, char>(_Vector<16, char>, _Vector<16, char>, _Constant char)">;
+  def vpcomw : X86Builtin<"_Vector<8, short>(_Vector<8, short>, _Vector<8, short>, _Constant char)">;
+  def vpcomd : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>, _Constant char)">;
+  def vpcomq : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>, _Vector<2, long long int>, _Constant char)">;
+  def vpermil2pd : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<2, double>, _Vector<2, long long int>, _Constant char)">;
+}
+
+let Features = "xop", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vpermil2pd256 : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Vector<4, double>, _Vector<4, long long int>, _Constant char)">;
+}
+
+let Features = "xop", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vpermil2ps : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Vector<4, int>, _Constant char)">;
+}
+
+let Features = "xop", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vpermil2ps256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, float>, _Vector<8, int>, _Constant char)">;
+}
+
+let Features = "xop", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vfrczss : X86Builtin<"_Vector<4, float>(_Vector<4, float>)">;
+  def vfrczsd : X86Builtin<"_Vector<2, double>(_Vector<2, double>)">;
+  def vfrczps : X86Builtin<"_Vector<4, float>(_Vector<4, float>)">;
+  def vfrczpd : X86Builtin<"_Vector<2, double>(_Vector<2, double>)">;
+}
+
+let Features = "xop", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vfrczps256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>)">;
+  def vfrczpd256 : X86Builtin<"_Vector<4, double>(_Vector<4, double>)">;
+}
+
+let Features = "rtm", Attributes = [NoThrow] in {
+  def xbegin : X86Builtin<"int()">;
+  def xend : X86Builtin<"void()">;
+  def xabort : X86Builtin<"void(_Constant char)">;
+  def xtest : X86Builtin<"int()">;
+}
+
+let Features = "rdpid", Attributes = [NoThrow] in {
+  def rdpid : X86Builtin<"unsigned int()">;
+}
+
+let Features = "rdpru", Attributes = [NoThrow], EnableOpenCLLong = 0 in {
+  def rdpru : X86Builtin<"unsigned long long int(int)">;
+}
+
+let Features = "pku", Attributes = [NoThrow] in {
+  def rdpkru : X86Builtin<"unsigned int()">;
+  def wrpkru : X86Builtin<"void(unsigned int)">;
+}
+
+let Features = "avx512f,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def sqrtpd512 : X86Builtin<"_Vector<8, double>(_Vector<8, double>, _Constant int)">;
+  def sqrtps512 : X86Builtin<"_Vector<16, float>(_Vector<16, float>, _Constant int)">;
+}
+
+let Features = "avx512f", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def rsqrt14sd_mask : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<2, double>, _Vector<2, double>, unsigned char)">;
+  def rsqrt14ss_mask : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Vector<4, float>, unsigned char)">;
+}
+
+let Features = "avx512f,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def rsqrt14pd512_mask : X86Builtin<"_Vector<8, double>(_Vector<8, double>, _Vector<8, double>, unsigned char)">;
+  def rsqrt14ps512_mask : X86Builtin<"_Vector<16, float>(_Vector<16, float>, _Vector<16, float>, unsigned short)">;
+}
+
+let Features = "avx512f", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def rcp14sd_mask : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<2, double>, _Vector<2, double>, unsigned char)">;
+  def rcp14ss_mask : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Vector<4, float>, unsigned char)">;
+}
+
+let Features = "avx512f,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def rcp14pd512_mask : X86Builtin<"_Vector<8, double>(_Vector<8, double>, _Vector<8, double>, unsigned char)">;
+  def rcp14ps512_mask : X86Builtin<"_Vector<16, float>(_Vector<16, float>, _Vector<16, float>, unsigned short)">;
+  def cvttps2dq512_mask : X86Builtin<"_Vector<16, int>(_Vector<16, float>, _Vector<16, int>, unsigned short, _Constant int)">;
+  def cvttps2udq512_mask : X86Builtin<"_Vector<16, int>(_Vector<16, float>, _Vector<16, int>, unsigned short, _Constant int)">;
+  def cvttpd2dq512_mask : X86Builtin<"_Vector<8, int>(_Vector<8, double>, _Vector<8, int>, unsigned char, _Constant int)">;
+  def cvttpd2udq512_mask : X86Builtin<"_Vector<8, int>(_Vector<8, double>, _Vector<8, int>, unsigned char, _Constant int)">;
+  def cmpps512_mask : X86Builtin<"unsigned short(_Vector<16, float>, _Vector<16, float>, _Constant int, unsigned short, _Constant int)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def cmpps256_mask : X86Builtin<"unsigned char(_Vector<8, float>, _Vector<8, float>, _Constant int, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def cmpps128_mask : X86Builtin<"unsigned char(_Vector<4, float>, _Vector<4, float>, _Constant int, unsigned char)">;
+}
+
+let Features = "avx512f,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def cmppd512_mask : X86Builtin<"unsigned char(_Vector<8, double>, _Vector<8, double>, _Constant int, unsigned char, _Constant int)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def cmppd256_mask : X86Builtin<"unsigned char(_Vector<4, double>, _Vector<4, double>, _Constant int, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def cmppd128_mask : X86Builtin<"unsigned char(_Vector<2, double>, _Vector<2, double>, _Constant int, unsigned char)">;
+}
+
+let Features = "avx512f,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def rndscaleps_mask : X86Builtin<"_Vector<16, float>(_Vector<16, float>, _Constant int, _Vector<16, float>, unsigned short, _Constant int)">;
+  def rndscalepd_mask : X86Builtin<"_Vector<8, double>(_Vector<8, double>, _Constant int, _Vector<8, double>, unsigned char, _Constant int)">;
+  def cvtps2dq512_mask : X86Builtin<"_Vector<16, int>(_Vector<16, float>, _Vector<16, int>, unsigned short, _Constant int)">;
+  def cvtpd2dq512_mask : X86Builtin<"_Vector<8, int>(_Vector<8, double>, _Vector<8, int>, unsigned char, _Constant int)">;
+  def cvtps2udq512_mask : X86Builtin<"_Vector<16, int>(_Vector<16, float>, _Vector<16, int>, unsigned short, _Constant int)">;
+  def cvtpd2udq512_mask : X86Builtin<"_Vector<8, int>(_Vector<8, double>, _Vector<8, int>, unsigned char, _Constant int)">;
+  def minps512 : X86Builtin<"_Vector<16, float>(_Vector<16, float>, _Vector<16, float>, _Constant int)">;
+  def minpd512 : X86Builtin<"_Vector<8, double>(_Vector<8, double>, _Vector<8, double>, _Constant int)">;
+  def maxps512 : X86Builtin<"_Vector<16, float>(_Vector<16, float>, _Vector<16, float>, _Constant int)">;
+  def maxpd512 : X86Builtin<"_Vector<8, double>(_Vector<8, double>, _Vector<8, double>, _Constant int)">;
+  def cvtdq2ps512_mask : X86Builtin<"_Vector<16, float>(_Vector<16, int>, _Vector<16, float>, unsigned short, _Constant int)">;
+  def cvtudq2ps512_mask : X86Builtin<"_Vector<16, float>(_Vector<16, int>, _Vector<16, float>, unsigned short, _Constant int)">;
+  def cvtpd2ps512_mask : X86Builtin<"_Vector<8, float>(_Vector<8, double>, _Vector<8, float>, unsigned char, _Constant int)">;
+  def vcvtps2ph512_mask : X86Builtin<"_Vector<16, short>(_Vector<16, float>, _Constant int, _Vector<16, short>, unsigned short)">;
+  def vcvtph2ps512_mask : X86Builtin<"_Vector<16, float>(_Vector<16, short>, _Vector<16, float>, unsigned short, _Constant int)">;
+  def pmuldq512 : X86Builtin<"_Vector<8, long long int>(_Vector<16, int>, _Vector<16, int>)">;
+  def pmuludq512 : X86Builtin<"_Vector<8, long long int>(_Vector<16, int>, _Vector<16, int>)">;
+}
+
+let Features = "avx512f,evex512", Attributes = [NoThrow, RequiredVectorWidth<512>] in {
+  def loaddqusi512_mask : X86Builtin<"_Vector<16, int>(int const *, _Vector<16, int>, unsigned short)">;
+  def loaddqudi512_mask : X86Builtin<"_Vector<8, long long int>(long long int const *, _Vector<8, long long int>, unsigned char)">;
+  def loadups512_mask : X86Builtin<"_Vector<16, float>(float const *, _Vector<16, float>, unsigned short)">;
+  def loadaps512_mask : X86Builtin<"_Vector<16, float>(_Vector<16, float const *>, _Vector<16, float>, unsigned short)">;
+  def loadupd512_mask : X86Builtin<"_Vector<8, double>(double const *, _Vector<8, double>, unsigned char)">;
+  def loadapd512_mask : X86Builtin<"_Vector<8, double>(_Vector<8, double const *>, _Vector<8, double>, unsigned char)">;
+  def storedqudi512_mask : X86Builtin<"void(long long int *, _Vector<8, long long int>, unsigned char)">;
+  def storedqusi512_mask : X86Builtin<"void(int *, _Vector<16, int>, unsigned short)">;
+  def storeupd512_mask : X86Builtin<"void(double *, _Vector<8, double>, unsigned char)">;
+  def storeapd512_mask : X86Builtin<"void(_Vector<8, double *>, _Vector<8, double>, unsigned char)">;
+  def storeups512_mask : X86Builtin<"void(float *, _Vector<16, float>, unsigned short)">;
+  def storeaps512_mask : X86Builtin<"void(_Vector<16, float *>, _Vector<16, float>, unsigned short)">;
+}
+
+let Features = "avx512f,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def alignq512 : X86Builtin<"_Vector<8, long long int>(_Vector<8, long long int>, _Vector<8, long long int>, _Constant int)">;
+  def alignd512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<16, int>, _Constant int)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def alignd128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>, _Constant int)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def alignd256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>, _Constant int)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def alignq128 : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>, _Vector<2, long long int>, _Constant int)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def alignq256 : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Vector<4, long long int>, _Constant int)">;
+}
+
+let Features = "avx512f,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def extractf64x4_mask : X86Builtin<"_Vector<4, double>(_Vector<8, double>, _Constant int, _Vector<4, double>, unsigned char)">;
+  def extractf32x4_mask : X86Builtin<"_Vector<4, float>(_Vector<16, float>, _Constant int, _Vector<4, float>, unsigned char)">;
+}
+
+let Features = "avx512vl,avx512vnni|avxvnni", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vpdpbusd128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>, _Vector<4, int>)">;
+}
+
+let Features = "avx512vl,avx512vnni|avxvnni", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vpdpbusd256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>, _Vector<8, int>)">;
+}
+
+let Features = "avx512vnni,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def vpdpbusd512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<16, int>, _Vector<16, int>)">;
+}
+
+let Features = "avx512vl,avx512vnni|avxvnni", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vpdpbusds128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>, _Vector<4, int>)">;
+}
+
+let Features = "avx512vl,avx512vnni|avxvnni", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vpdpbusds256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>, _Vector<8, int>)">;
+}
+
+let Features = "avx512vnni,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def vpdpbusds512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<16, int>, _Vector<16, int>)">;
+}
+
+let Features = "avx512vl,avx512vnni|avxvnni", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vpdpwssd128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>, _Vector<4, int>)">;
+}
+
+let Features = "avx512vl,avx512vnni|avxvnni", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vpdpwssd256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>, _Vector<8, int>)">;
+}
+
+let Features = "avx512vnni,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def vpdpwssd512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<16, int>, _Vector<16, int>)">;
+}
+
+let Features = "avx512vl,avx512vnni|avxvnni", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vpdpwssds128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>, _Vector<4, int>)">;
+}
+
+let Features = "avx512vl,avx512vnni|avxvnni", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vpdpwssds256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>, _Vector<8, int>)">;
+}
+
+let Features = "avx512vnni,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def vpdpwssds512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<16, int>, _Vector<16, int>)">;
+}
+
+let Features = "avxvnniint8|avx10.2-256", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vpdpbssd128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>, _Vector<4, int>)">;
+}
+
+let Features = "avxvnniint8|avx10.2-256", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vpdpbssd256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>, _Vector<8, int>)">;
+}
+
+let Features = "avxvnniint8|avx10.2-256", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vpdpbssds128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>, _Vector<4, int>)">;
+}
+
+let Features = "avxvnniint8|avx10.2-256", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vpdpbssds256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>, _Vector<8, int>)">;
+}
+
+let Features = "avxvnniint8|avx10.2-256", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vpdpbsud128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>, _Vector<4, int>)">;
+}
+
+let Features = "avxvnniint8|avx10.2-256", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vpdpbsud256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>, _Vector<8, int>)">;
+}
+
+let Features = "avxvnniint8|avx10.2-256", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vpdpbsuds128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>, _Vector<4, int>)">;
+}
+
+let Features = "avxvnniint8|avx10.2-256", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vpdpbsuds256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>, _Vector<8, int>)">;
+}
+
+let Features = "avxvnniint8|avx10.2-256", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vpdpbuud128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>, _Vector<4, int>)">;
+}
+
+let Features = "avxvnniint8|avx10.2-256", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vpdpbuud256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>, _Vector<8, int>)">;
+}
+
+let Features = "avxvnniint8|avx10.2-256", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vpdpbuuds128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>, _Vector<4, int>)">;
+}
+
+let Features = "avxvnniint8|avx10.2-256", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vpdpbuuds256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>, _Vector<8, int>)">;
+}
+
+let Features = "movrs", Attributes = [NoThrow, Const] in {
+  def prefetchrs : X86Builtin<"void(void const *)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def gather3div2df : X86Builtin<"_Vector<2, double>(_Vector<2, double>, void const *, _Vector<2, long long int>, unsigned char, _Constant int)">;
+  def gather3div2di : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>, void const *, _Vector<2, long long int>, unsigned char, _Constant int)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def gather3div4df : X86Builtin<"_Vector<4, double>(_Vector<4, double>, void const *, _Vector<4, long long int>, unsigned char, _Constant int)">;
+  def gather3div4di : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, void const *, _Vector<4, long long int>, unsigned char, _Constant int)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def gather3div4sf : X86Builtin<"_Vector<4, float>(_Vector<4, float>, void const *, _Vector<2, long long int>, unsigned char, _Constant int)">;
+  def gather3div4si : X86Builtin<"_Vector<4, int>(_Vector<4, int>, void const *, _Vector<2, long long int>, unsigned char, _Constant int)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def gather3div8sf : X86Builtin<"_Vector<4, float>(_Vector<4, float>, void const *, _Vector<4, long long int>, unsigned char, _Constant int)">;
+  def gather3div8si : X86Builtin<"_Vector<4, int>(_Vector<4, int>, void const *, _Vector<4, long long int>, unsigned char, _Constant int)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def gather3siv2df : X86Builtin<"_Vector<2, double>(_Vector<2, double>, void const *, _Vector<4, int>, unsigned char, _Constant int)">;
+  def gather3siv2di : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>, void const *, _Vector<4, int>, unsigned char, _Constant int)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def gather3siv4df : X86Builtin<"_Vector<4, double>(_Vector<4, double>, void const *, _Vector<4, int>, unsigned char, _Constant int)">;
+  def gather3siv4di : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, void const *, _Vector<4, int>, unsigned char, _Constant int)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def gather3siv4sf : X86Builtin<"_Vector<4, float>(_Vector<4, float>, void const *, _Vector<4, int>, unsigned char, _Constant int)">;
+  def gather3siv4si : X86Builtin<"_Vector<4, int>(_Vector<4, int>, void const *, _Vector<4, int>, unsigned char, _Constant int)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def gather3siv8sf : X86Builtin<"_Vector<8, float>(_Vector<8, float>, void const *, _Vector<8, int>, unsigned char, _Constant int)">;
+  def gather3siv8si : X86Builtin<"_Vector<8, int>(_Vector<8, int>, void const *, _Vector<8, int>, unsigned char, _Constant int)">;
+}
+
+let Features = "avx512f,evex512", Attributes = [NoThrow, RequiredVectorWidth<512>] in {
+  def gathersiv8df : X86Builtin<"_Vector<8, double>(_Vector<8, double>, void const *, _Vector<8, int>, unsigned char, _Constant int)">;
+  def gathersiv16sf : X86Builtin<"_Vector<16, float>(_Vector<16, float>, void const *, _Vector<16, int>, unsigned short, _Constant int)">;
+  def gatherdiv8df : X86Builtin<"_Vector<8, double>(_Vector<8, double>, void const *, _Vector<8, long long int>, unsigned char, _Constant int)">;
+  def gatherdiv16sf : X86Builtin<"_Vector<8, float>(_Vector<8, float>, void const *, _Vector<8, long long int>, unsigned char, _Constant int)">;
+  def gathersiv8di : X86Builtin<"_Vector<8, long long int>(_Vector<8, long long int>, void const *, _Vector<8, int>, unsigned char, _Constant int)">;
+  def gathersiv16si : X86Builtin<"_Vector<16, int>(_Vector<16, int>, void const *, _Vector<16, int>, unsigned short, _Constant int)">;
+  def gatherdiv8di : X86Builtin<"_Vector<8, long long int>(_Vector<8, long long int>, void const *, _Vector<8, long long int>, unsigned char, _Constant int)">;
+  def gatherdiv16si : X86Builtin<"_Vector<8, int>(_Vector<8, int>, void const *, _Vector<8, long long int>, unsigned char, _Constant int)">;
+  def scattersiv8df : X86Builtin<"void(void *, unsigned char, _Vector<8, int>, _Vector<8, double>, _Constant int)">;
+  def scattersiv16sf : X86Builtin<"void(void *, unsigned short, _Vector<16, int>, _Vector<16, float>, _Constant int)">;
+  def scatterdiv8df : X86Builtin<"void(void *, unsigned char, _Vector<8, long long int>, _Vector<8, double>, _Constant int)">;
+  def scatterdiv16sf : X86Builtin<"void(void *, unsigned char, _Vector<8, long long int>, _Vector<8, float>, _Constant int)">;
+  def scattersiv8di : X86Builtin<"void(void *, unsigned char, _Vector<8, int>, _Vector<8, long long int>, _Constant int)">;
+  def scattersiv16si : X86Builtin<"void(void *, unsigned short, _Vector<16, int>, _Vector<16, int>, _Constant int)">;
+  def scatterdiv8di : X86Builtin<"void(void *, unsigned char, _Vector<8, long long int>, _Vector<8, long long int>, _Constant int)">;
+  def scatterdiv16si : X86Builtin<"void(void *, unsigned char, _Vector<8, long long int>, _Vector<8, int>, _Constant int)">;
+}
+
+let Features = "avx512dq", Attributes = [NoThrow, Const] in {
+  def knotqi : X86Builtin<"unsigned char(unsigned char)">;
+}
+
+let Features = "avx512f", Attributes = [NoThrow, Const] in {
+  def knothi : X86Builtin<"unsigned short(unsigned short)">;
+}
+
+let Features = "avx512bw", Attributes = [NoThrow, Const] in {
+  def knotsi : X86Builtin<"unsigned int(unsigned int)">;
+  def knotdi : X86Builtin<"unsigned long long int(unsigned long long int)">;
+}
+
+let Features = "avx512vl,avx512bw", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def cmpb128_mask : X86Builtin<"unsigned short(_Vector<16, char>, _Vector<16, char>, _Constant int, unsigned short)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def cmpd128_mask : X86Builtin<"unsigned char(_Vector<4, int>, _Vector<4, int>, _Constant int, unsigned char)">;
+  def cmpq128_mask : X86Builtin<"unsigned char(_Vector<2, long long int>, _Vector<2, long long int>, _Constant int, unsigned char)">;
+}
+
+let Features = "avx512vl,avx512bw", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def cmpw128_mask : X86Builtin<"unsigned char(_Vector<8, short>, _Vector<8, short>, _Constant int, unsigned char)">;
+}
+
+let Features = "avx512vl,avx512bw", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def cmpb256_mask : X86Builtin<"unsigned int(_Vector<32, char>, _Vector<32, char>, _Constant int, unsigned int)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def cmpd256_mask : X86Builtin<"unsigned char(_Vector<8, int>, _Vector<8, int>, _Constant int, unsigned char)">;
+  def cmpq256_mask : X86Builtin<"unsigned char(_Vector<4, long long int>, _Vector<4, long long int>, _Constant int, unsigned char)">;
+}
+
+let Features = "avx512vl,avx512bw", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def cmpw256_mask : X86Builtin<"unsigned short(_Vector<16, short>, _Vector<16, short>, _Constant int, unsigned short)">;
+}
+
+let Features = "avx512bw,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def cmpb512_mask : X86Builtin<"unsigned long long int(_Vector<64, char>, _Vector<64, char>, _Constant int, unsigned long long int)">;
+}
+
+let Features = "avx512f,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def cmpd512_mask : X86Builtin<"unsigned short(_Vector<16, int>, _Vector<16, int>, _Constant int, unsigned short)">;
+  def cmpq512_mask : X86Builtin<"unsigned char(_Vector<8, long long int>, _Vector<8, long long int>, _Constant int, unsigned char)">;
+}
+
+let Features = "avx512bw,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def cmpw512_mask : X86Builtin<"unsigned int(_Vector<32, short>, _Vector<32, short>, _Constant int, unsigned int)">;
+}
+
+let Features = "avx512vl,avx512bw", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def ucmpb128_mask : X86Builtin<"unsigned short(_Vector<16, char>, _Vector<16, char>, _Constant int, unsigned short)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def ucmpd128_mask : X86Builtin<"unsigned char(_Vector<4, int>, _Vector<4, int>, _Constant int, unsigned char)">;
+  def ucmpq128_mask : X86Builtin<"unsigned char(_Vector<2, long long int>, _Vector<2, long long int>, _Constant int, unsigned char)">;
+}
+
+let Features = "avx512vl,avx512bw", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def ucmpw128_mask : X86Builtin<"unsigned char(_Vector<8, short>, _Vector<8, short>, _Constant int, unsigned char)">;
+}
+
+let Features = "avx512vl,avx512bw", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def ucmpb256_mask : X86Builtin<"unsigned int(_Vector<32, char>, _Vector<32, char>, _Constant int, unsigned int)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def ucmpd256_mask : X86Builtin<"unsigned char(_Vector<8, int>, _Vector<8, int>, _Constant int, unsigned char)">;
+  def ucmpq256_mask : X86Builtin<"unsigned char(_Vector<4, long long int>, _Vector<4, long long int>, _Constant int, unsigned char)">;
+}
+
+let Features = "avx512vl,avx512bw", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def ucmpw256_mask : X86Builtin<"unsigned short(_Vector<16, short>, _Vector<16, short>, _Constant int, unsigned short)">;
+}
+
+let Features = "avx512bw,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def ucmpb512_mask : X86Builtin<"unsigned long long int(_Vector<64, char>, _Vector<64, char>, _Constant int, unsigned long long int)">;
+}
+
+let Features = "avx512f,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def ucmpd512_mask : X86Builtin<"unsigned short(_Vector<16, int>, _Vector<16, int>, _Constant int, unsigned short)">;
+  def ucmpq512_mask : X86Builtin<"unsigned char(_Vector<8, long long int>, _Vector<8, long long int>, _Constant int, unsigned char)">;
+}
+
+let Features = "avx512bw,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def ucmpw512_mask : X86Builtin<"unsigned int(_Vector<32, short>, _Vector<32, short>, _Constant int, unsigned int)">;
+  def packssdw512 : X86Builtin<"_Vector<32, short>(_Vector<16, int>, _Vector<16, int>)">;
+  def packsswb512 : X86Builtin<"_Vector<64, char>(_Vector<32, short>, _Vector<32, short>)">;
+  def packusdw512 : X86Builtin<"_Vector<32, short>(_Vector<16, int>, _Vector<16, int>)">;
+  def packuswb512 : X86Builtin<"_Vector<64, char>(_Vector<32, short>, _Vector<32, short>)">;
+  def pavgb512 : X86Builtin<"_Vector<64, char>(_Vector<64, char>, _Vector<64, char>)">;
+  def pavgw512 : X86Builtin<"_Vector<32, short>(_Vector<32, short>, _Vector<32, short>)">;
+  def pshufb512 : X86Builtin<"_Vector<64, char>(_Vector<64, char>, _Vector<64, char>)">;
+}
+
+let Features = "avx512cd,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vpconflictdi_128 : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>)">;
+}
+
+let Features = "avx512cd,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vpconflictdi_256 : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>)">;
+}
+
+let Features = "avx512cd,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vpconflictsi_128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>)">;
+}
+
+let Features = "avx512cd,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vpconflictsi_256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>)">;
+}
+
+let Features = "avx512cd,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def vpconflictdi_512 : X86Builtin<"_Vector<8, long long int>(_Vector<8, long long int>)">;
+  def vpconflictsi_512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>)">;
+  def vplzcntd_512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>)">;
+  def vplzcntq_512 : X86Builtin<"_Vector<8, long long int>(_Vector<8, long long int>)">;
+}
+
+let Features = "avx512vl,avx512bitalg", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vpshufbitqmb128_mask : X86Builtin<"unsigned short(_Vector<16, char>, _Vector<16, char>, unsigned short)">;
+}
+
+let Features = "avx512vl,avx512bitalg", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vpshufbitqmb256_mask : X86Builtin<"unsigned int(_Vector<32, char>, _Vector<32, char>, unsigned int)">;
+}
+
+let Features = "avx512bitalg,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def vpshufbitqmb512_mask : X86Builtin<"unsigned long long int(_Vector<64, char>, _Vector<64, char>, unsigned long long int)">;
+}
+
+let Features = "avx512bw,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def pmulhrsw512 : X86Builtin<"_Vector<32, short>(_Vector<32, short>, _Vector<32, short>)">;
+  def pmulhuw512 : X86Builtin<"_Vector<32, short>(_Vector<32, short>, _Vector<32, short>)">;
+  def pmulhw512 : X86Builtin<"_Vector<32, short>(_Vector<32, short>, _Vector<32, short>)">;
+}
+
+let Features = "avx512f,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def addpd512 : X86Builtin<"_Vector<8, double>(_Vector<8, double>, _Vector<8, double>, _Constant int)">;
+  def addps512 : X86Builtin<"_Vector<16, float>(_Vector<16, float>, _Vector<16, float>, _Constant int)">;
+  def divpd512 : X86Builtin<"_Vector<8, double>(_Vector<8, double>, _Vector<8, double>, _Constant int)">;
+  def divps512 : X86Builtin<"_Vector<16, float>(_Vector<16, float>, _Vector<16, float>, _Constant int)">;
+  def mulpd512 : X86Builtin<"_Vector<8, double>(_Vector<8, double>, _Vector<8, double>, _Constant int)">;
+  def mulps512 : X86Builtin<"_Vector<16, float>(_Vector<16, float>, _Vector<16, float>, _Constant int)">;
+  def subpd512 : X86Builtin<"_Vector<8, double>(_Vector<8, double>, _Vector<8, double>, _Constant int)">;
+  def subps512 : X86Builtin<"_Vector<16, float>(_Vector<16, float>, _Vector<16, float>, _Constant int)">;
+}
+
+let Features = "avx512bw,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def pmaddubsw512 : X86Builtin<"_Vector<32, short>(_Vector<64, char>, _Vector<64, char>)">;
+  def pmaddwd512 : X86Builtin<"_Vector<16, int>(_Vector<32, short>, _Vector<32, short>)">;
+}
+
+let Features = "avx512f", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def addss_round_mask : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Vector<4, float>, unsigned char, _Constant int)">;
+  def divss_round_mask : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Vector<4, float>, unsigned char, _Constant int)">;
+  def mulss_round_mask : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Vector<4, float>, unsigned char, _Constant int)">;
+  def subss_round_mask : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Vector<4, float>, unsigned char, _Constant int)">;
+  def maxss_round_mask : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Vector<4, float>, unsigned char, _Constant int)">;
+  def minss_round_mask : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Vector<4, float>, unsigned char, _Constant int)">;
+  def addsd_round_mask : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<2, double>, _Vector<2, double>, unsigned char, _Constant int)">;
+  def divsd_round_mask : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<2, double>, _Vector<2, double>, unsigned char, _Constant int)">;
+  def mulsd_round_mask : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<2, double>, _Vector<2, double>, unsigned char, _Constant int)">;
+  def subsd_round_mask : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<2, double>, _Vector<2, double>, unsigned char, _Constant int)">;
+  def maxsd_round_mask : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<2, double>, _Vector<2, double>, unsigned char, _Constant int)">;
+  def minsd_round_mask : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<2, double>, _Vector<2, double>, unsigned char, _Constant int)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def compressdf128_mask : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<2, double>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def compressdf256_mask : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Vector<4, double>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def compressdi128_mask : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>, _Vector<2, long long int>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def compressdi256_mask : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Vector<4, long long int>, unsigned char)">;
+}
+
+let Features = "avx512vl,avx512vbmi2", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def compresshi128_mask : X86Builtin<"_Vector<8, short>(_Vector<8, short>, _Vector<8, short>, unsigned char)">;
+}
+
+let Features = "avx512vl,avx512vbmi2", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def compresshi256_mask : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Vector<16, short>, unsigned short)">;
+}
+
+let Features = "avx512vl,avx512vbmi2", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def compressqi128_mask : X86Builtin<"_Vector<16, char>(_Vector<16, char>, _Vector<16, char>, unsigned short)">;
+}
+
+let Features = "avx512vl,avx512vbmi2", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def compressqi256_mask : X86Builtin<"_Vector<32, char>(_Vector<32, char>, _Vector<32, char>, unsigned int)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def compresssf128_mask : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def compresssf256_mask : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, float>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def compresssi128_mask : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def compresssi256_mask : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def compressstoredf128_mask : X86Builtin<"void(_Vector<2, double *>, _Vector<2, double>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def compressstoredf256_mask : X86Builtin<"void(_Vector<4, double *>, _Vector<4, double>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def compressstoredi128_mask : X86Builtin<"void(_Vector<2, long long int *>, _Vector<2, long long int>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def compressstoredi256_mask : X86Builtin<"void(_Vector<4, long long int *>, _Vector<4, long long int>, unsigned char)">;
+}
+
+let Features = "avx512vl,avx512vbmi2", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def compressstorehi128_mask : X86Builtin<"void(_Vector<8, short *>, _Vector<8, short>, unsigned char)">;
+}
+
+let Features = "avx512vl,avx512vbmi2", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def compressstorehi256_mask : X86Builtin<"void(_Vector<16, short *>, _Vector<16, short>, unsigned short)">;
+}
+
+let Features = "avx512vl,avx512vbmi2", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def compressstoreqi128_mask : X86Builtin<"void(_Vector<16, char *>, _Vector<16, char>, unsigned short)">;
+}
+
+let Features = "avx512vl,avx512vbmi2", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def compressstoreqi256_mask : X86Builtin<"void(_Vector<32, char *>, _Vector<32, char>, unsigned int)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def compressstoresf128_mask : X86Builtin<"void(_Vector<4, float *>, _Vector<4, float>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def compressstoresf256_mask : X86Builtin<"void(_Vector<8, float *>, _Vector<8, float>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def compressstoresi128_mask : X86Builtin<"void(_Vector<4, int *>, _Vector<4, int>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def compressstoresi256_mask : X86Builtin<"void(_Vector<8, int *>, _Vector<8, int>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def cvtpd2dq128_mask : X86Builtin<"_Vector<4, int>(_Vector<2, double>, _Vector<4, int>, unsigned char)">;
+  def cvtpd2ps_mask : X86Builtin<"_Vector<4, float>(_Vector<2, double>, _Vector<4, float>, unsigned char)">;
+  def cvtpd2udq128_mask : X86Builtin<"_Vector<4, int>(_Vector<2, double>, _Vector<4, int>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def cvtpd2udq256_mask : X86Builtin<"_Vector<4, int>(_Vector<4, double>, _Vector<4, int>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def cvtps2udq128_mask : X86Builtin<"_Vector<4, int>(_Vector<4, float>, _Vector<4, int>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def cvtps2udq256_mask : X86Builtin<"_Vector<8, int>(_Vector<8, float>, _Vector<8, int>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def cvttpd2dq128_mask : X86Builtin<"_Vector<4, int>(_Vector<2, double>, _Vector<4, int>, unsigned char)">;
+  def cvttpd2udq128_mask : X86Builtin<"_Vector<4, int>(_Vector<2, double>, _Vector<4, int>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def cvttpd2udq256_mask : X86Builtin<"_Vector<4, int>(_Vector<4, double>, _Vector<4, int>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def cvttps2udq128_mask : X86Builtin<"_Vector<4, int>(_Vector<4, float>, _Vector<4, int>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def cvttps2udq256_mask : X86Builtin<"_Vector<8, int>(_Vector<8, float>, _Vector<8, int>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def expanddf128_mask : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<2, double>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def expanddf256_mask : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Vector<4, double>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def expanddi128_mask : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>, _Vector<2, long long int>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def expanddi256_mask : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Vector<4, long long int>, unsigned char)">;
+}
+
+let Features = "avx512vl,avx512vbmi2", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def expandhi128_mask : X86Builtin<"_Vector<8, short>(_Vector<8, short>, _Vector<8, short>, unsigned char)">;
+}
+
+let Features = "avx512vl,avx512vbmi2", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def expandhi256_mask : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Vector<16, short>, unsigned short)">;
+}
+
+let Features = "avx512vl,avx512vbmi2", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def expandqi128_mask : X86Builtin<"_Vector<16, char>(_Vector<16, char>, _Vector<16, char>, unsigned short)">;
+}
+
+let Features = "avx512vl,avx512vbmi2", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def expandqi256_mask : X86Builtin<"_Vector<32, char>(_Vector<32, char>, _Vector<32, char>, unsigned int)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def expandloaddf128_mask : X86Builtin<"_Vector<2, double>(_Vector<2, double const *>, _Vector<2, double>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def expandloaddf256_mask : X86Builtin<"_Vector<4, double>(_Vector<4, double const *>, _Vector<4, double>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def expandloaddi128_mask : X86Builtin<"_Vector<4, int>(_Vector<2, long long int const *>, _Vector<2, long long int>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def expandloaddi256_mask : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int const *>, _Vector<4, long long int>, unsigned char)">;
+}
+
+let Features = "avx512vl,avx512vbmi2", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def expandloadhi128_mask : X86Builtin<"_Vector<8, short>(_Vector<8, short const *>, _Vector<8, short>, unsigned char)">;
+}
+
+let Features = "avx512vl,avx512vbmi2", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def expandloadhi256_mask : X86Builtin<"_Vector<16, short>(_Vector<16, short const *>, _Vector<16, short>, unsigned short)">;
+}
+
+let Features = "avx512vl,avx512vbmi2", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def expandloadqi128_mask : X86Builtin<"_Vector<16, char>(_Vector<16, char const *>, _Vector<16, char>, unsigned short)">;
+}
+
+let Features = "avx512vl,avx512vbmi2", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def expandloadqi256_mask : X86Builtin<"_Vector<32, char>(_Vector<32, char const *>, _Vector<32, char>, unsigned int)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def expandloadsf128_mask : X86Builtin<"_Vector<4, float>(_Vector<4, float const *>, _Vector<4, float>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def expandloadsf256_mask : X86Builtin<"_Vector<8, float>(_Vector<8, float const *>, _Vector<8, float>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def expandloadsi128_mask : X86Builtin<"_Vector<4, int>(_Vector<4, int const *>, _Vector<4, int>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def expandloadsi256_mask : X86Builtin<"_Vector<8, int>(_Vector<8, int const *>, _Vector<8, int>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def expandsf128_mask : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def expandsf256_mask : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, float>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def expandsi128_mask : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def expandsi256_mask : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def getexppd128_mask : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<2, double>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def getexppd256_mask : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Vector<4, double>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def getexpps128_mask : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def getexpps256_mask : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, float>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def rndscalepd_128_mask : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Constant int, _Vector<2, double>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def rndscalepd_256_mask : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Constant int, _Vector<4, double>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def rndscaleps_128_mask : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Constant int, _Vector<4, float>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def rndscaleps_256_mask : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Constant int, _Vector<8, float>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def scalefpd128_mask : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<2, double>, _Vector<2, double>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def scalefpd256_mask : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Vector<4, double>, _Vector<4, double>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def scalefps128_mask : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Vector<4, float>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def scalefps256_mask : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, float>, _Vector<8, float>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def scatterdiv2df : X86Builtin<"void(void *, unsigned char, _Vector<2, long long int>, _Vector<2, double>, _Constant int)">;
+  def scatterdiv2di : X86Builtin<"void(void *, unsigned char, _Vector<2, long long int>, _Vector<2, long long int>, _Constant int)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def scatterdiv4df : X86Builtin<"void(void *, unsigned char, _Vector<4, long long int>, _Vector<4, double>, _Constant int)">;
+  def scatterdiv4di : X86Builtin<"void(void *, unsigned char, _Vector<4, long long int>, _Vector<4, long long int>, _Constant int)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def scatterdiv4sf : X86Builtin<"void(void *, unsigned char, _Vector<2, long long int>, _Vector<4, float>, _Constant int)">;
+  def scatterdiv4si : X86Builtin<"void(void *, unsigned char, _Vector<2, long long int>, _Vector<4, int>, _Constant int)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def scatterdiv8sf : X86Builtin<"void(void *, unsigned char, _Vector<4, long long int>, _Vector<4, float>, _Constant int)">;
+  def scatterdiv8si : X86Builtin<"void(void *, unsigned char, _Vector<4, long long int>, _Vector<4, int>, _Constant int)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def scattersiv2df : X86Builtin<"void(void *, unsigned char, _Vector<4, int>, _Vector<2, double>, _Constant int)">;
+  def scattersiv2di : X86Builtin<"void(void *, unsigned char, _Vector<4, int>, _Vector<2, long long int>, _Constant int)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def scattersiv4df : X86Builtin<"void(void *, unsigned char, _Vector<4, int>, _Vector<4, double>, _Constant int)">;
+  def scattersiv4di : X86Builtin<"void(void *, unsigned char, _Vector<4, int>, _Vector<4, long long int>, _Constant int)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def scattersiv4sf : X86Builtin<"void(void *, unsigned char, _Vector<4, int>, _Vector<4, float>, _Constant int)">;
+  def scattersiv4si : X86Builtin<"void(void *, unsigned char, _Vector<4, int>, _Vector<4, int>, _Constant int)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def scattersiv8sf : X86Builtin<"void(void *, unsigned char, _Vector<8, int>, _Vector<8, float>, _Constant int)">;
+  def scattersiv8si : X86Builtin<"void(void *, unsigned char, _Vector<8, int>, _Vector<8, int>, _Constant int)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vpermi2vard128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>, _Vector<4, int>)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vpermi2vard256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>, _Vector<8, int>)">;
+}
+
+let Features = "avx512f,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def vpermi2vard512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<16, int>, _Vector<16, int>)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vpermi2varpd128 : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<2, long long int>, _Vector<2, double>)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vpermi2varpd256 : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Vector<4, long long int>, _Vector<4, double>)">;
+}
+
+let Features = "avx512f,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def vpermi2varpd512 : X86Builtin<"_Vector<8, double>(_Vector<8, double>, _Vector<8, long long int>, _Vector<8, double>)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vpermi2varps128 : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, int>, _Vector<4, float>)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vpermi2varps256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, int>, _Vector<8, float>)">;
+}
+
+let Features = "avx512f,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def vpermi2varps512 : X86Builtin<"_Vector<16, float>(_Vector<16, float>, _Vector<16, int>, _Vector<16, float>)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vpermi2varq128 : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>, _Vector<2, long long int>, _Vector<2, long long int>)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vpermi2varq256 : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Vector<4, long long int>, _Vector<4, long long int>)">;
+}
+
+let Features = "avx512f,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def vpermi2varq512 : X86Builtin<"_Vector<8, long long int>(_Vector<8, long long int>, _Vector<8, long long int>, _Vector<8, long long int>)">;
+}
+
+let Features = "avx512vbmi,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vpermi2varqi128 : X86Builtin<"_Vector<16, char>(_Vector<16, char>, _Vector<16, char>, _Vector<16, char>)">;
+}
+
+let Features = "avx512vbmi,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vpermi2varqi256 : X86Builtin<"_Vector<32, char>(_Vector<32, char>, _Vector<32, char>, _Vector<32, char>)">;
+}
+
+let Features = "avx512vbmi,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def vpermi2varqi512 : X86Builtin<"_Vector<64, char>(_Vector<64, char>, _Vector<64, char>, _Vector<64, char>)">;
+}
+
+let Features = "avx512vl,avx512bw", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vpermi2varhi128 : X86Builtin<"_Vector<8, short>(_Vector<8, short>, _Vector<8, short>, _Vector<8, short>)">;
+}
+
+let Features = "avx512vl,avx512bw", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vpermi2varhi256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Vector<16, short>, _Vector<16, short>)">;
+}
+
+let Features = "avx512bw,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def vpermi2varhi512 : X86Builtin<"_Vector<32, short>(_Vector<32, short>, _Vector<32, short>, _Vector<32, short>)">;
+}
+
+let Features = "avx512vl,avx512vbmi2", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vpshldd128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>, _Constant int)">;
+}
+
+let Features = "avx512vl,avx512vbmi2", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vpshldd256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>, _Constant int)">;
+}
+
+let Features = "avx512vbmi2,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def vpshldd512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<16, int>, _Constant int)">;
+}
+
+let Features = "avx512vl,avx512vbmi2", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vpshldq128 : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>, _Vector<2, long long int>, _Constant int)">;
+}
+
+let Features = "avx512vl,avx512vbmi2", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vpshldq256 : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Vector<4, long long int>, _Constant int)">;
+}
+
+let Features = "avx512vbmi2,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def vpshldq512 : X86Builtin<"_Vector<8, long long int>(_Vector<8, long long int>, _Vector<8, long long int>, _Constant int)">;
+}
+
+let Features = "avx512vl,avx512vbmi2", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vpshldw128 : X86Builtin<"_Vector<8, short>(_Vector<8, short>, _Vector<8, short>, _Constant int)">;
+}
+
+let Features = "avx512vl,avx512vbmi2", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vpshldw256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Vector<16, short>, _Constant int)">;
+}
+
+let Features = "avx512vbmi2,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def vpshldw512 : X86Builtin<"_Vector<32, short>(_Vector<32, short>, _Vector<32, short>, _Constant int)">;
+}
+
+let Features = "avx512vl,avx512vbmi2", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vpshldvd128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>, _Vector<4, int>)">;
+}
+
+let Features = "avx512vl,avx512vbmi2", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vpshldvd256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>, _Vector<8, int>)">;
+}
+
+let Features = "avx512vbmi2,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def vpshldvd512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<16, int>, _Vector<16, int>)">;
+}
+
+let Features = "avx512vl,avx512vbmi2", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vpshldvq128 : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>, _Vector<2, long long int>, _Vector<2, long long int>)">;
+}
+
+let Features = "avx512vl,avx512vbmi2", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vpshldvq256 : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Vector<4, long long int>, _Vector<4, long long int>)">;
+}
+
+let Features = "avx512vbmi2,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def vpshldvq512 : X86Builtin<"_Vector<8, long long int>(_Vector<8, long long int>, _Vector<8, long long int>, _Vector<8, long long int>)">;
+}
+
+let Features = "avx512vl,avx512vbmi2", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vpshldvw128 : X86Builtin<"_Vector<8, short>(_Vector<8, short>, _Vector<8, short>, _Vector<8, short>)">;
+}
+
+let Features = "avx512vl,avx512vbmi2", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vpshldvw256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Vector<16, short>, _Vector<16, short>)">;
+}
+
+let Features = "avx512vbmi2,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def vpshldvw512 : X86Builtin<"_Vector<32, short>(_Vector<32, short>, _Vector<32, short>, _Vector<32, short>)">;
+}
+
+let Features = "avx512vl,avx512vbmi2", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vpshrdvd128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>, _Vector<4, int>)">;
+}
+
+let Features = "avx512vl,avx512vbmi2", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vpshrdvd256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>, _Vector<8, int>)">;
+}
+
+let Features = "avx512vbmi2,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def vpshrdvd512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<16, int>, _Vector<16, int>)">;
+}
+
+let Features = "avx512vl,avx512vbmi2", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vpshrdvq128 : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>, _Vector<2, long long int>, _Vector<2, long long int>)">;
+}
+
+let Features = "avx512vl,avx512vbmi2", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vpshrdvq256 : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Vector<4, long long int>, _Vector<4, long long int>)">;
+}
+
+let Features = "avx512vbmi2,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def vpshrdvq512 : X86Builtin<"_Vector<8, long long int>(_Vector<8, long long int>, _Vector<8, long long int>, _Vector<8, long long int>)">;
+}
+
+let Features = "avx512vl,avx512vbmi2", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vpshrdvw128 : X86Builtin<"_Vector<8, short>(_Vector<8, short>, _Vector<8, short>, _Vector<8, short>)">;
+}
+
+let Features = "avx512vl,avx512vbmi2", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vpshrdvw256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Vector<16, short>, _Vector<16, short>)">;
+}
+
+let Features = "avx512vbmi2,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def vpshrdvw512 : X86Builtin<"_Vector<32, short>(_Vector<32, short>, _Vector<32, short>, _Vector<32, short>)">;
+}
+
+let Features = "avx512vl,avx512vbmi2", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vpshrdd128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>, _Constant int)">;
+}
+
+let Features = "avx512vl,avx512vbmi2", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vpshrdd256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>, _Constant int)">;
+}
+
+let Features = "avx512vbmi2,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def vpshrdd512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<16, int>, _Constant int)">;
+}
+
+let Features = "avx512vl,avx512vbmi2", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vpshrdq128 : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>, _Vector<2, long long int>, _Constant int)">;
+}
+
+let Features = "avx512vl,avx512vbmi2", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vpshrdq256 : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Vector<4, long long int>, _Constant int)">;
+}
+
+let Features = "avx512vbmi2,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def vpshrdq512 : X86Builtin<"_Vector<8, long long int>(_Vector<8, long long int>, _Vector<8, long long int>, _Constant int)">;
+}
+
+let Features = "avx512vl,avx512vbmi2", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vpshrdw128 : X86Builtin<"_Vector<8, short>(_Vector<8, short>, _Vector<8, short>, _Constant int)">;
+}
+
+let Features = "avx512vl,avx512vbmi2", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vpshrdw256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Vector<16, short>, _Constant int)">;
+}
+
+let Features = "avx512vbmi2,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def vpshrdw512 : X86Builtin<"_Vector<32, short>(_Vector<32, short>, _Vector<32, short>, _Constant int)">;
+}
+
+let Features = "avx512bw,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def pmovswb512_mask : X86Builtin<"_Vector<32, char>(_Vector<32, short>, _Vector<32, char>, unsigned int)">;
+  def pmovuswb512_mask : X86Builtin<"_Vector<32, char>(_Vector<32, short>, _Vector<32, char>, unsigned int)">;
+  def pmovwb512_mask : X86Builtin<"_Vector<32, char>(_Vector<32, short>, _Vector<32, char>, unsigned int)">;
+}
+
+let Features = "avx512vl,avx512dq", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def cvtpd2qq128_mask : X86Builtin<"_Vector<2, long long int>(_Vector<2, double>, _Vector<2, long long int>, unsigned char)">;
+}
+
+let Features = "avx512vl,avx512dq", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def cvtpd2qq256_mask : X86Builtin<"_Vector<4, long long int>(_Vector<4, double>, _Vector<4, long long int>, unsigned char)">;
+}
+
+let Features = "avx512vl,avx512dq", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def cvtpd2uqq128_mask : X86Builtin<"_Vector<2, long long int>(_Vector<2, double>, _Vector<2, long long int>, unsigned char)">;
+}
+
+let Features = "avx512vl,avx512dq", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def cvtpd2uqq256_mask : X86Builtin<"_Vector<4, long long int>(_Vector<4, double>, _Vector<4, long long int>, unsigned char)">;
+}
+
+let Features = "avx512vl,avx512dq", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def cvtps2qq128_mask : X86Builtin<"_Vector<2, long long int>(_Vector<4, float>, _Vector<2, long long int>, unsigned char)">;
+}
+
+let Features = "avx512vl,avx512dq", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def cvtps2qq256_mask : X86Builtin<"_Vector<4, long long int>(_Vector<4, float>, _Vector<4, long long int>, unsigned char)">;
+}
+
+let Features = "avx512vl,avx512dq", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def cvtps2uqq128_mask : X86Builtin<"_Vector<2, long long int>(_Vector<4, float>, _Vector<2, long long int>, unsigned char)">;
+}
+
+let Features = "avx512vl,avx512dq", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def cvtps2uqq256_mask : X86Builtin<"_Vector<4, long long int>(_Vector<4, float>, _Vector<4, long long int>, unsigned char)">;
+}
+
+let Features = "avx512vl,avx512dq", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def cvtqq2ps128_mask : X86Builtin<"_Vector<4, float>(_Vector<2, long long int>, _Vector<4, float>, unsigned char)">;
+  def cvttpd2qq128_mask : X86Builtin<"_Vector<2, long long int>(_Vector<2, double>, _Vector<2, long long int>, unsigned char)">;
+}
+
+let Features = "avx512vl,avx512dq", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def cvttpd2qq256_mask : X86Builtin<"_Vector<4, long long int>(_Vector<4, double>, _Vector<4, long long int>, unsigned char)">;
+}
+
+let Features = "avx512vl,avx512dq", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def cvttpd2uqq128_mask : X86Builtin<"_Vector<2, long long int>(_Vector<2, double>, _Vector<2, long long int>, unsigned char)">;
+}
+
+let Features = "avx512vl,avx512dq", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def cvttpd2uqq256_mask : X86Builtin<"_Vector<4, long long int>(_Vector<4, double>, _Vector<4, long long int>, unsigned char)">;
+}
+
+let Features = "avx512vl,avx512dq", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def cvttps2qq128_mask : X86Builtin<"_Vector<2, long long int>(_Vector<4, float>, _Vector<2, long long int>, unsigned char)">;
+}
+
+let Features = "avx512vl,avx512dq", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def cvttps2qq256_mask : X86Builtin<"_Vector<4, long long int>(_Vector<4, float>, _Vector<4, long long int>, unsigned char)">;
+}
+
+let Features = "avx512vl,avx512dq", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def cvttps2uqq128_mask : X86Builtin<"_Vector<2, long long int>(_Vector<4, float>, _Vector<2, long long int>, unsigned char)">;
+}
+
+let Features = "avx512vl,avx512dq", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def cvttps2uqq256_mask : X86Builtin<"_Vector<4, long long int>(_Vector<4, float>, _Vector<4, long long int>, unsigned char)">;
+}
+
+let Features = "avx512vl,avx512dq", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def cvtuqq2ps128_mask : X86Builtin<"_Vector<4, float>(_Vector<2, long long int>, _Vector<4, float>, unsigned char)">;
+  def rangepd128_mask : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<2, double>, _Constant int, _Vector<2, double>, unsigned char)">;
+}
+
+let Features = "avx512vl,avx512dq", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def rangepd256_mask : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Vector<4, double>, _Constant int, _Vector<4, double>, unsigned char)">;
+}
+
+let Features = "avx512vl,avx512dq", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def rangeps128_mask : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Constant int, _Vector<4, float>, unsigned char)">;
+}
+
+let Features = "avx512vl,avx512dq", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def rangeps256_mask : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, float>, _Constant int, _Vector<8, float>, unsigned char)">;
+}
+
+let Features = "avx512dq", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def rangesd128_round_mask : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<2, double>, _Vector<2, double>, unsigned char, _Constant int, _Constant int)">;
+  def rangess128_round_mask : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Vector<4, float>, unsigned char, _Constant int, _Constant int)">;
+}
+
+let Features = "avx512vl,avx512dq", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def reducepd128_mask : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Constant int, _Vector<2, double>, unsigned char)">;
+}
+
+let Features = "avx512vl,avx512dq", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def reducepd256_mask : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Constant int, _Vector<4, double>, unsigned char)">;
+}
+
+let Features = "avx512vl,avx512dq", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def reduceps128_mask : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Constant int, _Vector<4, float>, unsigned char)">;
+}
+
+let Features = "avx512vl,avx512dq", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def reduceps256_mask : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Constant int, _Vector<8, float>, unsigned char)">;
+}
+
+let Features = "avx512dq", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def reducesd_mask : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<2, double>, _Vector<2, double>, unsigned char, _Constant int, _Constant int)">;
+  def reducess_mask : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Vector<4, float>, unsigned char, _Constant int, _Constant int)">;
+}
+
+let Features = "avx512vl,avx512bw", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def pmovswb128_mask : X86Builtin<"_Vector<16, char>(_Vector<8, short>, _Vector<16, char>, unsigned char)">;
+}
+
+let Features = "avx512vl,avx512bw", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def pmovswb256_mask : X86Builtin<"_Vector<16, char>(_Vector<16, short>, _Vector<16, char>, unsigned short)">;
+}
+
+let Features = "avx512vl,avx512bw", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def pmovuswb128_mask : X86Builtin<"_Vector<16, char>(_Vector<8, short>, _Vector<16, char>, unsigned char)">;
+}
+
+let Features = "avx512vl,avx512bw", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def pmovuswb256_mask : X86Builtin<"_Vector<16, char>(_Vector<16, short>, _Vector<16, char>, unsigned short)">;
+}
+
+let Features = "avx512vl,avx512bw", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def pmovwb128_mask : X86Builtin<"_Vector<16, char>(_Vector<8, short>, _Vector<16, char>, unsigned char)">;
+}
+
+let Features = "avx512dq,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def cvtpd2qq512_mask : X86Builtin<"_Vector<8, long long int>(_Vector<8, double>, _Vector<8, long long int>, unsigned char, _Constant int)">;
+  def cvtpd2uqq512_mask : X86Builtin<"_Vector<8, long long int>(_Vector<8, double>, _Vector<8, long long int>, unsigned char, _Constant int)">;
+  def cvtps2qq512_mask : X86Builtin<"_Vector<8, long long int>(_Vector<8, float>, _Vector<8, long long int>, unsigned char, _Constant int)">;
+  def cvtps2uqq512_mask : X86Builtin<"_Vector<8, long long int>(_Vector<8, float>, _Vector<8, long long int>, unsigned char, _Constant int)">;
+  def cvtqq2pd512_mask : X86Builtin<"_Vector<8, double>(_Vector<8, long long int>, _Vector<8, double>, unsigned char, _Constant int)">;
+  def cvtqq2ps512_mask : X86Builtin<"_Vector<8, float>(_Vector<8, long long int>, _Vector<8, float>, unsigned char, _Constant int)">;
+  def cvttpd2qq512_mask : X86Builtin<"_Vector<8, long long int>(_Vector<8, double>, _Vector<8, long long int>, unsigned char, _Constant int)">;
+  def cvttpd2uqq512_mask : X86Builtin<"_Vector<8, long long int>(_Vector<8, double>, _Vector<8, long long int>, unsigned char, _Constant int)">;
+  def cvttps2qq512_mask : X86Builtin<"_Vector<8, long long int>(_Vector<8, float>, _Vector<8, long long int>, unsigned char, _Constant int)">;
+  def cvttps2uqq512_mask : X86Builtin<"_Vector<8, long long int>(_Vector<8, float>, _Vector<8, long long int>, unsigned char, _Constant int)">;
+  def cvtuqq2pd512_mask : X86Builtin<"_Vector<8, double>(_Vector<8, long long int>, _Vector<8, double>, unsigned char, _Constant int)">;
+  def cvtuqq2ps512_mask : X86Builtin<"_Vector<8, float>(_Vector<8, long long int>, _Vector<8, float>, unsigned char, _Constant int)">;
+  def rangepd512_mask : X86Builtin<"_Vector<8, double>(_Vector<8, double>, _Vector<8, double>, _Constant int, _Vector<8, double>, unsigned char, _Constant int)">;
+  def rangeps512_mask : X86Builtin<"_Vector<16, float>(_Vector<16, float>, _Vector<16, float>, _Constant int, _Vector<16, float>, unsigned short, _Constant int)">;
+  def reducepd512_mask : X86Builtin<"_Vector<8, double>(_Vector<8, double>, _Constant int, _Vector<8, double>, unsigned char, _Constant int)">;
+  def reduceps512_mask : X86Builtin<"_Vector<16, float>(_Vector<16, float>, _Constant int, _Vector<16, float>, unsigned short, _Constant int)">;
+}
+
+let Features = "avx512f,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def prold512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Constant int)">;
+  def prolq512 : X86Builtin<"_Vector<8, long long int>(_Vector<8, long long int>, _Constant int)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def prold128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Constant int)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def prold256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Constant int)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def prolq128 : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>, _Constant int)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def prolq256 : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Constant int)">;
+}
+
+let Features = "avx512f,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def prolvd512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<16, int>)">;
+  def prolvq512 : X86Builtin<"_Vector<8, long long int>(_Vector<8, long long int>, _Vector<8, long long int>)">;
+  def prord512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Constant int)">;
+  def prorq512 : X86Builtin<"_Vector<8, long long int>(_Vector<8, long long int>, _Constant int)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def prolvd128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def prolvd256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def prolvq128 : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>, _Vector<2, long long int>)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def prolvq256 : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Vector<4, long long int>)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def prord128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Constant int)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def prord256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Constant int)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def prorq128 : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>, _Constant int)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def prorq256 : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Constant int)">;
+}
+
+let Features = "avx512f,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def prorvd512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<16, int>)">;
+  def prorvq512 : X86Builtin<"_Vector<8, long long int>(_Vector<8, long long int>, _Vector<8, long long int>)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def prorvd128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def prorvd256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def prorvq128 : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>, _Vector<2, long long int>)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def prorvq256 : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Vector<4, long long int>)">;
+}
+
+let Features = "avx512bw,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def pshufhw512 : X86Builtin<"_Vector<32, short>(_Vector<32, short>, _Constant int)">;
+  def pshuflw512 : X86Builtin<"_Vector<32, short>(_Vector<32, short>, _Constant int)">;
+  def psllv32hi : X86Builtin<"_Vector<32, short>(_Vector<32, short>, _Vector<32, short>)">;
+  def psllw512 : X86Builtin<"_Vector<32, short>(_Vector<32, short>, _Vector<8, short>)">;
+  def psllwi512 : X86Builtin<"_Vector<32, short>(_Vector<32, short>, int)">;
+}
+
+let Features = "avx512bw,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def psllv16hi : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Vector<16, short>)">;
+}
+
+let Features = "avx512bw,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def psllv8hi : X86Builtin<"_Vector<8, short>(_Vector<8, short>, _Vector<8, short>)">;
+}
+
+let Features = "avx512f,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def pslldi512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, int)">;
+  def psllqi512 : X86Builtin<"_Vector<8, long long int>(_Vector<8, long long int>, int)">;
+}
+
+let Features = "avx512bw,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def psrlv32hi : X86Builtin<"_Vector<32, short>(_Vector<32, short>, _Vector<32, short>)">;
+}
+
+let Features = "avx512bw,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def psrlv16hi : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Vector<16, short>)">;
+}
+
+let Features = "avx512bw,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def psrlv8hi : X86Builtin<"_Vector<8, short>(_Vector<8, short>, _Vector<8, short>)">;
+}
+
+let Features = "avx512f,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def psrldi512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, int)">;
+  def psrlqi512 : X86Builtin<"_Vector<8, long long int>(_Vector<8, long long int>, int)">;
+}
+
+let Features = "avx512bw,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def psrav32hi : X86Builtin<"_Vector<32, short>(_Vector<32, short>, _Vector<32, short>)">;
+}
+
+let Features = "avx512bw,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def psrav16hi : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Vector<16, short>)">;
+}
+
+let Features = "avx512bw,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def psrav8hi : X86Builtin<"_Vector<8, short>(_Vector<8, short>, _Vector<8, short>)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def psravq128 : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>, _Vector<2, long long int>)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def psravq256 : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Vector<4, long long int>)">;
+}
+
+let Features = "avx512bw,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def psraw512 : X86Builtin<"_Vector<32, short>(_Vector<32, short>, _Vector<8, short>)">;
+  def psrawi512 : X86Builtin<"_Vector<32, short>(_Vector<32, short>, int)">;
+  def psrlw512 : X86Builtin<"_Vector<32, short>(_Vector<32, short>, _Vector<8, short>)">;
+  def psrlwi512 : X86Builtin<"_Vector<32, short>(_Vector<32, short>, int)">;
+  def pslldqi512_byteshift : X86Builtin<"_Vector<8, long long int>(_Vector<8, long long int>, _Constant int)">;
+  def psrldqi512_byteshift : X86Builtin<"_Vector<8, long long int>(_Vector<8, long long int>, _Constant int)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def movdqa32load128_mask : X86Builtin<"_Vector<4, int>(_Vector<4, int const *>, _Vector<4, int>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def movdqa32load256_mask : X86Builtin<"_Vector<8, int>(_Vector<8, int const *>, _Vector<8, int>, unsigned char)">;
+}
+
+let Features = "avx512f,evex512", Attributes = [NoThrow, RequiredVectorWidth<512>] in {
+  def movdqa32load512_mask : X86Builtin<"_Vector<16, int>(_Vector<16, int const *>, _Vector<16, int>, unsigned short)">;
+  def movdqa32store512_mask : X86Builtin<"void(_Vector<16, int *>, _Vector<16, int>, unsigned short)">;
+  def movdqa64load512_mask : X86Builtin<"_Vector<8, long long int>(_Vector<8, long long int const *>, _Vector<8, long long int>, unsigned char)">;
+  def movdqa64store512_mask : X86Builtin<"void(_Vector<8, long long int *>, _Vector<8, long long int>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def movdqa32store128_mask : X86Builtin<"void(_Vector<4, int *>, _Vector<4, int>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def movdqa32store256_mask : X86Builtin<"void(_Vector<8, int *>, _Vector<8, int>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def movdqa64load128_mask : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int const *>, _Vector<2, long long int>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def movdqa64load256_mask : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int const *>, _Vector<4, long long int>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def movdqa64store128_mask : X86Builtin<"void(_Vector<2, long long int *>, _Vector<2, long long int>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def movdqa64store256_mask : X86Builtin<"void(_Vector<4, long long int *>, _Vector<4, long long int>, unsigned char)">;
+}
+
+let Features = "avx512ifma,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def vpmadd52huq512 : X86Builtin<"_Vector<8, long long int>(_Vector<8, long long int>, _Vector<8, long long int>, _Vector<8, long long int>)">;
+  def vpmadd52luq512 : X86Builtin<"_Vector<8, long long int>(_Vector<8, long long int>, _Vector<8, long long int>, _Vector<8, long long int>)">;
+}
+
+let Features = "avx512ifma,avx512vl|avxifma", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vpmadd52huq128 : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>, _Vector<2, long long int>, _Vector<2, long long int>)">;
+}
+
+let Features = "avx512ifma,avx512vl|avxifma", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vpmadd52huq256 : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Vector<4, long long int>, _Vector<4, long long int>)">;
+}
+
+let Features = "avx512ifma,avx512vl|avxifma", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vpmadd52luq128 : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>, _Vector<2, long long int>, _Vector<2, long long int>)">;
+}
+
+let Features = "avx512ifma,avx512vl|avxifma", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vpmadd52luq256 : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Vector<4, long long int>, _Vector<4, long long int>)">;
+}
+
+let Features = "avx512f", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vcomisd : X86Builtin<"int(_Vector<2, double>, _Vector<2, double>, _Constant int, _Constant int)">;
+  def vcomiss : X86Builtin<"int(_Vector<4, float>, _Vector<4, float>, _Constant int, _Constant int)">;
+}
+
+let Features = "avx512bw", Attributes = [NoThrow, Const] in {
+  def kunpckdi : X86Builtin<"unsigned long long int(unsigned long long int, unsigned long long int)">;
+  def kunpcksi : X86Builtin<"unsigned int(unsigned int, unsigned int)">;
+}
+
+let Features = "avx512bw,evex512", Attributes = [NoThrow, RequiredVectorWidth<512>] in {
+  def loaddquhi512_mask : X86Builtin<"_Vector<32, short>(_Vector<32, short const *>, _Vector<32, short>, unsigned int)">;
+  def loaddquqi512_mask : X86Builtin<"_Vector<64, char>(_Vector<64, char const *>, _Vector<64, char>, unsigned long long int)">;
+}
+
+let Features = "avx512f,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def fixupimmpd512_mask : X86Builtin<"_Vector<8, double>(_Vector<8, double>, _Vector<8, double>, _Vector<8, long long int>, _Constant int, unsigned char, _Constant int)">;
+  def fixupimmpd512_maskz : X86Builtin<"_Vector<8, double>(_Vector<8, double>, _Vector<8, double>, _Vector<8, long long int>, _Constant int, unsigned char, _Constant int)">;
+  def fixupimmps512_mask : X86Builtin<"_Vector<16, float>(_Vector<16, float>, _Vector<16, float>, _Vector<16, int>, _Constant int, unsigned short, _Constant int)">;
+  def fixupimmps512_maskz : X86Builtin<"_Vector<16, float>(_Vector<16, float>, _Vector<16, float>, _Vector<16, int>, _Constant int, unsigned short, _Constant int)">;
+}
+
+let Features = "avx512f", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def fixupimmsd_mask : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<2, double>, _Vector<2, long long int>, _Constant int, unsigned char, _Constant int)">;
+  def fixupimmsd_maskz : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<2, double>, _Vector<2, long long int>, _Constant int, unsigned char, _Constant int)">;
+  def fixupimmss_mask : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Vector<4, int>, _Constant int, unsigned char, _Constant int)">;
+  def fixupimmss_maskz : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Vector<4, int>, _Constant int, unsigned char, _Constant int)">;
+  def getexpsd128_round_mask : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<2, double>, _Vector<2, double>, unsigned char, _Constant int)">;
+  def getexpss128_round_mask : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Vector<4, float>, unsigned char, _Constant int)">;
+  def getmantsd_round_mask : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<2, double>, _Constant int, _Vector<2, double>, unsigned char, _Constant int)">;
+  def getmantss_round_mask : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Constant int, _Vector<4, float>, unsigned char, _Constant int)">;
+}
+
+let Features = "avx512bw,avx512vl", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def loaddquhi128_mask : X86Builtin<"_Vector<8, short>(_Vector<8, short const *>, _Vector<8, short>, unsigned char)">;
+}
+
+let Features = "avx512bw,avx512vl", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def loaddquhi256_mask : X86Builtin<"_Vector<16, short>(_Vector<16, short const *>, _Vector<16, short>, unsigned short)">;
+}
+
+let Features = "avx512bw,avx512vl", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def loaddquqi128_mask : X86Builtin<"_Vector<16, char>(_Vector<16, char const *>, _Vector<16, char>, unsigned short)">;
+}
+
+let Features = "avx512bw,avx512vl", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def loaddquqi256_mask : X86Builtin<"_Vector<32, char>(_Vector<32, char const *>, _Vector<32, char>, unsigned int)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def fixupimmpd128_mask : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<2, double>, _Vector<2, long long int>, _Constant int, unsigned char)">;
+  def fixupimmpd128_maskz : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<2, double>, _Vector<2, long long int>, _Constant int, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def fixupimmpd256_mask : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Vector<4, double>, _Vector<4, long long int>, _Constant int, unsigned char)">;
+  def fixupimmpd256_maskz : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Vector<4, double>, _Vector<4, long long int>, _Constant int, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def fixupimmps128_mask : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Vector<4, int>, _Constant int, unsigned char)">;
+  def fixupimmps128_maskz : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Vector<4, int>, _Constant int, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def fixupimmps256_mask : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, float>, _Vector<8, int>, _Constant int, unsigned char)">;
+  def fixupimmps256_maskz : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, float>, _Vector<8, int>, _Constant int, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def loadapd128_mask : X86Builtin<"_Vector<2, double>(_Vector<2, double const *>, _Vector<2, double>, unsigned char)">;
+}
+
+let Features = "avx512f", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def loadsd128_mask : X86Builtin<"_Vector<2, double>(_Vector<2, double const *>, _Vector<2, double>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def loadapd256_mask : X86Builtin<"_Vector<4, double>(_Vector<4, double const *>, _Vector<4, double>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def loadaps128_mask : X86Builtin<"_Vector<4, float>(_Vector<4, float const *>, _Vector<4, float>, unsigned char)">;
+}
+
+let Features = "avx512f", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def loadss128_mask : X86Builtin<"_Vector<4, float>(_Vector<4, float const *>, _Vector<4, float>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def loadaps256_mask : X86Builtin<"_Vector<8, float>(_Vector<8, float const *>, _Vector<8, float>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def loaddqudi128_mask : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int const *>, _Vector<2, long long int>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def loaddqudi256_mask : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int const *>, _Vector<4, long long int>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def loaddqusi128_mask : X86Builtin<"_Vector<4, int>(_Vector<4, int const *>, _Vector<4, int>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def loaddqusi256_mask : X86Builtin<"_Vector<8, int>(_Vector<8, int const *>, _Vector<8, int>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def loadupd128_mask : X86Builtin<"_Vector<2, double>(_Vector<2, double const *>, _Vector<2, double>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def loadupd256_mask : X86Builtin<"_Vector<4, double>(_Vector<4, double const *>, _Vector<4, double>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def loadups128_mask : X86Builtin<"_Vector<4, float>(_Vector<4, float const *>, _Vector<4, float>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def loadups256_mask : X86Builtin<"_Vector<8, float>(_Vector<8, float const *>, _Vector<8, float>, unsigned char)">;
+}
+
+let Features = "avx512bw,evex512", Attributes = [NoThrow, RequiredVectorWidth<512>] in {
+  def storedquhi512_mask : X86Builtin<"void(_Vector<32, short *>, _Vector<32, short>, unsigned int)">;
+  def storedquqi512_mask : X86Builtin<"void(_Vector<64, char *>, _Vector<64, char>, unsigned long long int)">;
+}
+
+let Features = "avx512vl,avx512bw", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def storedquhi128_mask : X86Builtin<"void(_Vector<8, short *>, _Vector<8, short>, unsigned char)">;
+}
+
+let Features = "avx512vl,avx512bw", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def storedquhi256_mask : X86Builtin<"void(_Vector<16, short *>, _Vector<16, short>, unsigned short)">;
+}
+
+let Features = "avx512vl,avx512bw", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def storedquqi128_mask : X86Builtin<"void(_Vector<16, char *>, _Vector<16, char>, unsigned short)">;
+}
+
+let Features = "avx512vl,avx512bw", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def storedquqi256_mask : X86Builtin<"void(_Vector<32, char *>, _Vector<32, char>, unsigned int)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def storeapd128_mask : X86Builtin<"void(_Vector<2, double *>, _Vector<2, double>, unsigned char)">;
+}
+
+let Features = "avx512f", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def storesd128_mask : X86Builtin<"void(_Vector<2, double *>, _Vector<2, double>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def storeapd256_mask : X86Builtin<"void(_Vector<4, double *>, _Vector<4, double>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def storeaps128_mask : X86Builtin<"void(_Vector<4, float *>, _Vector<4, float>, unsigned char)">;
+}
+
+let Features = "avx512f", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def storess128_mask : X86Builtin<"void(_Vector<4, float *>, _Vector<4, float>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def storeaps256_mask : X86Builtin<"void(_Vector<8, float *>, _Vector<8, float>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def storedqudi128_mask : X86Builtin<"void(_Vector<2, long long int *>, _Vector<2, long long int>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def storedqudi256_mask : X86Builtin<"void(_Vector<4, long long int *>, _Vector<4, long long int>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def storedqusi128_mask : X86Builtin<"void(_Vector<4, int *>, _Vector<4, int>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def storedqusi256_mask : X86Builtin<"void(_Vector<8, int *>, _Vector<8, int>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def storeupd128_mask : X86Builtin<"void(_Vector<2, double *>, _Vector<2, double>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def storeupd256_mask : X86Builtin<"void(_Vector<4, double *>, _Vector<4, double>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def storeups128_mask : X86Builtin<"void(_Vector<4, float *>, _Vector<4, float>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def storeups256_mask : X86Builtin<"void(_Vector<8, float *>, _Vector<8, float>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def rcp14pd128_mask : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<2, double>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def rcp14pd256_mask : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Vector<4, double>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def rcp14ps128_mask : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def rcp14ps256_mask : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, float>, unsigned char)">;
+}
+
+let Features = "avx512cd,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vplzcntd_128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>)">;
+}
+
+let Features = "avx512cd,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vplzcntd_256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>)">;
+}
+
+let Features = "avx512cd,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vplzcntq_128 : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>)">;
+}
+
+let Features = "avx512cd,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vplzcntq_256 : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>)">;
+}
+
+let Features = "avx512f", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vcvtsd2si32 : X86Builtin<"int(_Vector<2, double>, _Constant int)">;
+  def vcvtsd2usi32 : X86Builtin<"unsigned int(_Vector<2, double>, _Constant int)">;
+  def vcvtss2si32 : X86Builtin<"int(_Vector<4, float>, _Constant int)">;
+  def vcvtss2usi32 : X86Builtin<"unsigned int(_Vector<4, float>, _Constant int)">;
+  def vcvttsd2si32 : X86Builtin<"int(_Vector<2, double>, _Constant int)">;
+  def vcvttsd2usi32 : X86Builtin<"unsigned int(_Vector<2, double>, _Constant int)">;
+  def vcvttss2si32 : X86Builtin<"int(_Vector<4, float>, _Constant int)">;
+  def vcvttss2usi32 : X86Builtin<"unsigned int(_Vector<4, float>, _Constant int)">;
+}
+
+let Features = "avx512f,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def vpermilpd512 : X86Builtin<"_Vector<8, double>(_Vector<8, double>, _Constant int)">;
+  def vpermilps512 : X86Builtin<"_Vector<16, float>(_Vector<16, float>, _Constant int)">;
+  def vpermilvarpd512 : X86Builtin<"_Vector<8, double>(_Vector<8, double>, _Vector<8, long long int>)">;
+  def vpermilvarps512 : X86Builtin<"_Vector<16, float>(_Vector<16, float>, _Vector<16, int>)">;
+}
+
+let Features = "avx512f", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def rndscalesd_round_mask : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<2, double>, _Vector<2, double>, unsigned char, _Constant int, _Constant int)">;
+  def rndscaless_round_mask : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Vector<4, float>, unsigned char, _Constant int, _Constant int)">;
+}
+
+let Features = "avx512f,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def scalefpd512_mask : X86Builtin<"_Vector<8, double>(_Vector<8, double>, _Vector<8, double>, _Vector<8, double>, unsigned char, _Constant int)">;
+  def scalefps512_mask : X86Builtin<"_Vector<16, float>(_Vector<16, float>, _Vector<16, float>, _Vector<16, float>, unsigned short, _Constant int)">;
+}
+
+let Features = "avx512f", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def scalefsd_round_mask : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<2, double>, _Vector<2, double>, unsigned char, _Constant int)">;
+  def scalefss_round_mask : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Vector<4, float>, unsigned char, _Constant int)">;
+}
+
+let Features = "avx512f,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def psradi512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, int)">;
+  def psraqi512 : X86Builtin<"_Vector<8, long long int>(_Vector<8, long long int>, int)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def psraq128 : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>, _Vector<2, long long int>)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def psraq256 : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Vector<2, long long int>)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def psraqi128 : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>, int)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def psraqi256 : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, int)">;
+}
+
+let Features = "avx512f,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def pslld512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<4, int>)">;
+  def psllq512 : X86Builtin<"_Vector<8, long long int>(_Vector<8, long long int>, _Vector<2, long long int>)">;
+  def psllv16si : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<16, int>)">;
+  def psllv8di : X86Builtin<"_Vector<8, long long int>(_Vector<8, long long int>, _Vector<8, long long int>)">;
+  def psrad512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<4, int>)">;
+  def psraq512 : X86Builtin<"_Vector<8, long long int>(_Vector<8, long long int>, _Vector<2, long long int>)">;
+  def psrav16si : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<16, int>)">;
+  def psrav8di : X86Builtin<"_Vector<8, long long int>(_Vector<8, long long int>, _Vector<8, long long int>)">;
+  def psrld512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<4, int>)">;
+  def psrlq512 : X86Builtin<"_Vector<8, long long int>(_Vector<8, long long int>, _Vector<2, long long int>)">;
+  def psrlv16si : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<16, int>)">;
+  def psrlv8di : X86Builtin<"_Vector<8, long long int>(_Vector<8, long long int>, _Vector<8, long long int>)">;
+  def pternlogd512_mask : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<16, int>, _Vector<16, int>, _Constant int, unsigned short)">;
+  def pternlogd512_maskz : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<16, int>, _Vector<16, int>, _Constant int, unsigned short)">;
+  def pternlogq512_mask : X86Builtin<"_Vector<8, long long int>(_Vector<8, long long int>, _Vector<8, long long int>, _Vector<8, long long int>, _Constant int, unsigned char)">;
+  def pternlogq512_maskz : X86Builtin<"_Vector<8, long long int>(_Vector<8, long long int>, _Vector<8, long long int>, _Vector<8, long long int>, _Constant int, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def pternlogd128_mask : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>, _Vector<4, int>, _Constant int, unsigned char)">;
+  def pternlogd128_maskz : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>, _Vector<4, int>, _Constant int, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def pternlogd256_mask : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>, _Vector<8, int>, _Constant int, unsigned char)">;
+  def pternlogd256_maskz : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>, _Vector<8, int>, _Constant int, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def pternlogq128_mask : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>, _Vector<2, long long int>, _Vector<2, long long int>, _Constant int, unsigned char)">;
+  def pternlogq128_maskz : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>, _Vector<2, long long int>, _Vector<2, long long int>, _Constant int, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def pternlogq256_mask : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Vector<4, long long int>, _Vector<4, long long int>, _Constant int, unsigned char)">;
+  def pternlogq256_maskz : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Vector<4, long long int>, _Vector<4, long long int>, _Constant int, unsigned char)">;
+}
+
+let Features = "avx512f,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def shuf_f32x4 : X86Builtin<"_Vector<16, float>(_Vector<16, float>, _Vector<16, float>, _Constant int)">;
+  def shuf_f64x2 : X86Builtin<"_Vector<8, double>(_Vector<8, double>, _Vector<8, double>, _Constant int)">;
+  def shuf_i32x4 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<16, int>, _Constant int)">;
+  def shuf_i64x2 : X86Builtin<"_Vector<8, long long int>(_Vector<8, long long int>, _Vector<8, long long int>, _Constant int)">;
+  def shufpd512 : X86Builtin<"_Vector<8, double>(_Vector<8, double>, _Vector<8, double>, _Constant int)">;
+  def shufps512 : X86Builtin<"_Vector<16, float>(_Vector<16, float>, _Vector<16, float>, _Constant int)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def shuf_f32x4_256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, float>, _Constant int)">;
+  def shuf_f64x2_256 : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Vector<4, double>, _Constant int)">;
+  def shuf_i32x4_256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>, _Constant int)">;
+  def shuf_i64x2_256 : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Vector<4, long long int>, _Constant int)">;
+}
+
+let Features = "avx512f", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def sqrtsd_round_mask : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<2, double>, _Vector<2, double>, unsigned char, _Constant int)">;
+  def sqrtss_round_mask : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Vector<4, float>, unsigned char, _Constant int)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def rsqrt14pd128_mask : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<2, double>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def rsqrt14pd256_mask : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Vector<4, double>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def rsqrt14ps128_mask : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def rsqrt14ps256_mask : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, float>, unsigned char)">;
+}
+
+let Features = "avx512bw,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def cvtb2mask512 : X86Builtin<"unsigned long long int(_Vector<64, char>)">;
+  def cvtmask2b512 : X86Builtin<"_Vector<64, char>(unsigned long long int)">;
+  def cvtmask2w512 : X86Builtin<"_Vector<32, short>(unsigned int)">;
+}
+
+let Features = "avx512dq,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def cvtd2mask512 : X86Builtin<"unsigned short(_Vector<16, int>)">;
+  def cvtmask2d512 : X86Builtin<"_Vector<16, int>(unsigned short)">;
+  def cvtmask2q512 : X86Builtin<"_Vector<8, long long int>(unsigned char)">;
+  def cvtq2mask512 : X86Builtin<"unsigned char(_Vector<8, long long int>)">;
+}
+
+let Features = "avx512bw,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def cvtb2mask128 : X86Builtin<"unsigned short(_Vector<16, char>)">;
+}
+
+let Features = "avx512bw,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def cvtb2mask256 : X86Builtin<"unsigned int(_Vector<32, char>)">;
+}
+
+let Features = "avx512bw,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def cvtmask2b128 : X86Builtin<"_Vector<16, char>(unsigned short)">;
+}
+
+let Features = "avx512bw,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def cvtmask2b256 : X86Builtin<"_Vector<32, char>(unsigned int)">;
+}
+
+let Features = "avx512bw,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def cvtmask2w128 : X86Builtin<"_Vector<8, short>(unsigned char)">;
+}
+
+let Features = "avx512bw,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def cvtmask2w256 : X86Builtin<"_Vector<16, short>(unsigned short)">;
+}
+
+let Features = "avx512dq,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def cvtd2mask128 : X86Builtin<"unsigned char(_Vector<4, int>)">;
+}
+
+let Features = "avx512dq,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def cvtd2mask256 : X86Builtin<"unsigned char(_Vector<8, int>)">;
+}
+
+let Features = "avx512dq,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def cvtmask2d128 : X86Builtin<"_Vector<4, int>(unsigned char)">;
+}
+
+let Features = "avx512dq,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def cvtmask2d256 : X86Builtin<"_Vector<8, int>(unsigned char)">;
+}
+
+let Features = "avx512dq,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def cvtmask2q128 : X86Builtin<"_Vector<2, long long int>(unsigned char)">;
+}
+
+let Features = "avx512dq,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def cvtmask2q256 : X86Builtin<"_Vector<4, long long int>(unsigned char)">;
+}
+
+let Features = "avx512dq,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def cvtq2mask128 : X86Builtin<"unsigned char(_Vector<2, long long int>)">;
+}
+
+let Features = "avx512dq,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def cvtq2mask256 : X86Builtin<"unsigned char(_Vector<4, long long int>)">;
+}
+
+let Features = "avx512f,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def pmovsdb512_mask : X86Builtin<"_Vector<16, char>(_Vector<16, int>, _Vector<16, char>, unsigned short)">;
+}
+
+let Features = "avx512f,evex512", Attributes = [NoThrow, RequiredVectorWidth<512>] in {
+  def pmovsdb512mem_mask : X86Builtin<"void(_Vector<16, char *>, _Vector<16, int>, unsigned short)">;
+}
+
+let Features = "avx512bw,evex512", Attributes = [NoThrow, RequiredVectorWidth<512>] in {
+  def pmovswb512mem_mask : X86Builtin<"void(_Vector<32, char *>, _Vector<32, short>, unsigned int)">;
+}
+
+let Features = "avx512f,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def pmovsdw512_mask : X86Builtin<"_Vector<16, short>(_Vector<16, int>, _Vector<16, short>, unsigned short)">;
+}
+
+let Features = "avx512f,evex512", Attributes = [NoThrow, RequiredVectorWidth<512>] in {
+  def pmovsdw512mem_mask : X86Builtin<"void(_Vector<16, short *>, _Vector<16, int>, unsigned short)">;
+}
+
+let Features = "avx512f,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def pmovsqb512_mask : X86Builtin<"_Vector<16, char>(_Vector<8, long long int>, _Vector<16, char>, unsigned char)">;
+}
+
+let Features = "avx512f,evex512", Attributes = [NoThrow, RequiredVectorWidth<512>] in {
+  def pmovsqb512mem_mask : X86Builtin<"void(_Vector<16, char *>, _Vector<8, long long int>, unsigned char)">;
+}
+
+let Features = "avx512f,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def pmovsqd512_mask : X86Builtin<"_Vector<8, int>(_Vector<8, long long int>, _Vector<8, int>, unsigned char)">;
+}
+
+let Features = "avx512f,evex512", Attributes = [NoThrow, RequiredVectorWidth<512>] in {
+  def pmovsqd512mem_mask : X86Builtin<"void(_Vector<8, int *>, _Vector<8, long long int>, unsigned char)">;
+}
+
+let Features = "avx512f,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def pmovsqw512_mask : X86Builtin<"_Vector<8, short>(_Vector<8, long long int>, _Vector<8, short>, unsigned char)">;
+}
+
+let Features = "avx512f,evex512", Attributes = [NoThrow, RequiredVectorWidth<512>] in {
+  def pmovsqw512mem_mask : X86Builtin<"void(_Vector<8, short *>, _Vector<8, long long int>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def pmovsdb128_mask : X86Builtin<"_Vector<16, char>(_Vector<4, int>, _Vector<16, char>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def pmovsdb128mem_mask : X86Builtin<"void(_Vector<16, char *>, _Vector<4, int>, unsigned char)">;
+}
+
+let Features = "avx512vl,avx512bw", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def pmovswb128mem_mask : X86Builtin<"void(_Vector<16, char *>, _Vector<8, short>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def pmovsdb256_mask : X86Builtin<"_Vector<16, char>(_Vector<8, int>, _Vector<16, char>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def pmovsdb256mem_mask : X86Builtin<"void(_Vector<16, char *>, _Vector<8, int>, unsigned char)">;
+}
+
+let Features = "avx512vl,avx512bw", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def pmovswb256mem_mask : X86Builtin<"void(_Vector<16, char *>, _Vector<16, short>, unsigned short)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def pmovsdw128_mask : X86Builtin<"_Vector<8, short>(_Vector<4, int>, _Vector<8, short>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def pmovsdw128mem_mask : X86Builtin<"void(_Vector<8, short *>, _Vector<4, int>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def pmovsdw256_mask : X86Builtin<"_Vector<8, short>(_Vector<8, int>, _Vector<8, short>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def pmovsdw256mem_mask : X86Builtin<"void(_Vector<8, short *>, _Vector<8, int>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def pmovsqb128_mask : X86Builtin<"_Vector<16, char>(_Vector<2, long long int>, _Vector<16, char>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def pmovsqb128mem_mask : X86Builtin<"void(_Vector<16, char *>, _Vector<2, long long int>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def pmovsqb256_mask : X86Builtin<"_Vector<16, char>(_Vector<4, long long int>, _Vector<16, char>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def pmovsqb256mem_mask : X86Builtin<"void(_Vector<16, char *>, _Vector<4, long long int>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def pmovsqd128_mask : X86Builtin<"_Vector<4, int>(_Vector<2, long long int>, _Vector<4, int>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def pmovsqd128mem_mask : X86Builtin<"void(_Vector<4, int *>, _Vector<2, long long int>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def pmovsqd256_mask : X86Builtin<"_Vector<4, int>(_Vector<4, long long int>, _Vector<4, int>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def pmovsqd256mem_mask : X86Builtin<"void(_Vector<4, int *>, _Vector<4, long long int>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def pmovsqw128_mask : X86Builtin<"_Vector<8, short>(_Vector<2, long long int>, _Vector<8, short>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def pmovsqw128mem_mask : X86Builtin<"void(_Vector<8, short *>, _Vector<2, long long int>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def pmovsqw256_mask : X86Builtin<"_Vector<8, short>(_Vector<4, long long int>, _Vector<8, short>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def pmovsqw256mem_mask : X86Builtin<"void(_Vector<8, short *>, _Vector<4, long long int>, unsigned char)">;
+}
+
+let Features = "avx512f,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def pmovusdb512_mask : X86Builtin<"_Vector<16, char>(_Vector<16, int>, _Vector<16, char>, unsigned short)">;
+}
+
+let Features = "avx512f,evex512", Attributes = [NoThrow, RequiredVectorWidth<512>] in {
+  def pmovusdb512mem_mask : X86Builtin<"void(_Vector<16, char *>, _Vector<16, int>, unsigned short)">;
+}
+
+let Features = "avx512bw,evex512", Attributes = [NoThrow, RequiredVectorWidth<512>] in {
+  def pmovuswb512mem_mask : X86Builtin<"void(_Vector<32, char *>, _Vector<32, short>, unsigned int)">;
+}
+
+let Features = "avx512f,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def pmovusdw512_mask : X86Builtin<"_Vector<16, short>(_Vector<16, int>, _Vector<16, short>, unsigned short)">;
+}
+
+let Features = "avx512f,evex512", Attributes = [NoThrow, RequiredVectorWidth<512>] in {
+  def pmovusdw512mem_mask : X86Builtin<"void(_Vector<16, short *>, _Vector<16, int>, unsigned short)">;
+}
+
+let Features = "avx512f,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def pmovusqb512_mask : X86Builtin<"_Vector<16, char>(_Vector<8, long long int>, _Vector<16, char>, unsigned char)">;
+}
+
+let Features = "avx512f,evex512", Attributes = [NoThrow, RequiredVectorWidth<512>] in {
+  def pmovusqb512mem_mask : X86Builtin<"void(_Vector<16, char *>, _Vector<8, long long int>, unsigned char)">;
+}
+
+let Features = "avx512f,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def pmovusqd512_mask : X86Builtin<"_Vector<8, int>(_Vector<8, long long int>, _Vector<8, int>, unsigned char)">;
+}
+
+let Features = "avx512f,evex512", Attributes = [NoThrow, RequiredVectorWidth<512>] in {
+  def pmovusqd512mem_mask : X86Builtin<"void(_Vector<8, int *>, _Vector<8, long long int>, unsigned char)">;
+}
+
+let Features = "avx512f,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def pmovusqw512_mask : X86Builtin<"_Vector<8, short>(_Vector<8, long long int>, _Vector<8, short>, unsigned char)">;
+}
+
+let Features = "avx512f,evex512", Attributes = [NoThrow, RequiredVectorWidth<512>] in {
+  def pmovusqw512mem_mask : X86Builtin<"void(_Vector<8, short *>, _Vector<8, long long int>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def pmovusdb128_mask : X86Builtin<"_Vector<16, char>(_Vector<4, int>, _Vector<16, char>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def pmovusdb128mem_mask : X86Builtin<"void(_Vector<16, char *>, _Vector<4, int>, unsigned char)">;
+}
+
+let Features = "avx512vl,avx512bw", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def pmovuswb128mem_mask : X86Builtin<"void(_Vector<16, char *>, _Vector<8, short>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def pmovusdb256_mask : X86Builtin<"_Vector<16, char>(_Vector<8, int>, _Vector<16, char>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def pmovusdb256mem_mask : X86Builtin<"void(_Vector<16, char *>, _Vector<8, int>, unsigned char)">;
+}
+
+let Features = "avx512vl,avx512bw", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def pmovuswb256mem_mask : X86Builtin<"void(_Vector<16, char *>, _Vector<16, short>, unsigned short)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def pmovusdw128_mask : X86Builtin<"_Vector<8, short>(_Vector<4, int>, _Vector<8, short>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def pmovusdw128mem_mask : X86Builtin<"void(_Vector<8, short *>, _Vector<4, int>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def pmovusdw256_mask : X86Builtin<"_Vector<8, short>(_Vector<8, int>, _Vector<8, short>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def pmovusdw256mem_mask : X86Builtin<"void(_Vector<8, short *>, _Vector<8, int>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def pmovusqb128_mask : X86Builtin<"_Vector<16, char>(_Vector<2, long long int>, _Vector<16, char>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def pmovusqb128mem_mask : X86Builtin<"void(_Vector<16, char *>, _Vector<2, long long int>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def pmovusqb256_mask : X86Builtin<"_Vector<16, char>(_Vector<4, long long int>, _Vector<16, char>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def pmovusqb256mem_mask : X86Builtin<"void(_Vector<16, char *>, _Vector<4, long long int>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def pmovusqd128_mask : X86Builtin<"_Vector<4, int>(_Vector<2, long long int>, _Vector<4, int>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def pmovusqd128mem_mask : X86Builtin<"void(_Vector<4, int *>, _Vector<2, long long int>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def pmovusqd256_mask : X86Builtin<"_Vector<4, int>(_Vector<4, long long int>, _Vector<4, int>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def pmovusqd256mem_mask : X86Builtin<"void(_Vector<4, int *>, _Vector<4, long long int>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def pmovusqw128_mask : X86Builtin<"_Vector<8, short>(_Vector<2, long long int>, _Vector<8, short>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def pmovusqw128mem_mask : X86Builtin<"void(_Vector<8, short *>, _Vector<2, long long int>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def pmovusqw256_mask : X86Builtin<"_Vector<8, short>(_Vector<4, long long int>, _Vector<8, short>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def pmovusqw256mem_mask : X86Builtin<"void(_Vector<8, short *>, _Vector<4, long long int>, unsigned char)">;
+}
+
+let Features = "avx512f,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def pmovdb512_mask : X86Builtin<"_Vector<16, char>(_Vector<16, int>, _Vector<16, char>, unsigned short)">;
+}
+
+let Features = "avx512f,evex512", Attributes = [NoThrow, RequiredVectorWidth<512>] in {
+  def pmovdb512mem_mask : X86Builtin<"void(_Vector<16, char *>, _Vector<16, int>, unsigned short)">;
+}
+
+let Features = "avx512bw,evex512", Attributes = [NoThrow, RequiredVectorWidth<512>] in {
+  def pmovwb512mem_mask : X86Builtin<"void(_Vector<32, char *>, _Vector<32, short>, unsigned int)">;
+}
+
+let Features = "avx512f,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def pmovdw512_mask : X86Builtin<"_Vector<16, short>(_Vector<16, int>, _Vector<16, short>, unsigned short)">;
+}
+
+let Features = "avx512f,evex512", Attributes = [NoThrow, RequiredVectorWidth<512>] in {
+  def pmovdw512mem_mask : X86Builtin<"void(_Vector<16, short *>, _Vector<16, int>, unsigned short)">;
+}
+
+let Features = "avx512f,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def pmovqb512_mask : X86Builtin<"_Vector<16, char>(_Vector<8, long long int>, _Vector<16, char>, unsigned char)">;
+}
+
+let Features = "avx512f,evex512", Attributes = [NoThrow, RequiredVectorWidth<512>] in {
+  def pmovqb512mem_mask : X86Builtin<"void(_Vector<16, char *>, _Vector<8, long long int>, unsigned char)">;
+}
+
+let Features = "avx512f,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def pmovqd512_mask : X86Builtin<"_Vector<8, int>(_Vector<8, long long int>, _Vector<8, int>, unsigned char)">;
+}
+
+let Features = "avx512f,evex512", Attributes = [NoThrow, RequiredVectorWidth<512>] in {
+  def pmovqd512mem_mask : X86Builtin<"void(_Vector<8, int *>, _Vector<8, long long int>, unsigned char)">;
+}
+
+let Features = "avx512f,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def pmovqw512_mask : X86Builtin<"_Vector<8, short>(_Vector<8, long long int>, _Vector<8, short>, unsigned char)">;
+}
+
+let Features = "avx512f,evex512", Attributes = [NoThrow, RequiredVectorWidth<512>] in {
+  def pmovqw512mem_mask : X86Builtin<"void(_Vector<8, short *>, _Vector<8, long long int>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def pmovdb128_mask : X86Builtin<"_Vector<16, char>(_Vector<4, int>, _Vector<16, char>, unsigned char)">;
+}
+
+let Features = "avx512vl,avx512bw", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def pmovwb128mem_mask : X86Builtin<"void(_Vector<16, char *>, _Vector<8, short>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def pmovdb128mem_mask : X86Builtin<"void(_Vector<16, char *>, _Vector<4, int>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def pmovdb256_mask : X86Builtin<"_Vector<16, char>(_Vector<8, int>, _Vector<16, char>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def pmovdb256mem_mask : X86Builtin<"void(_Vector<16, char *>, _Vector<8, int>, unsigned char)">;
+}
+
+let Features = "avx512vl,avx512bw", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def pmovwb256mem_mask : X86Builtin<"void(_Vector<16, char *>, _Vector<16, short>, unsigned short)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def pmovdw128_mask : X86Builtin<"_Vector<8, short>(_Vector<4, int>, _Vector<8, short>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def pmovdw128mem_mask : X86Builtin<"void(_Vector<8, short *>, _Vector<4, int>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def pmovdw256_mask : X86Builtin<"_Vector<8, short>(_Vector<8, int>, _Vector<8, short>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def pmovdw256mem_mask : X86Builtin<"void(_Vector<8, short *>, _Vector<8, int>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def pmovqb128_mask : X86Builtin<"_Vector<16, char>(_Vector<2, long long int>, _Vector<16, char>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def pmovqb128mem_mask : X86Builtin<"void(_Vector<16, char *>, _Vector<2, long long int>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def pmovqb256_mask : X86Builtin<"_Vector<16, char>(_Vector<4, long long int>, _Vector<16, char>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def pmovqb256mem_mask : X86Builtin<"void(_Vector<16, char *>, _Vector<4, long long int>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def pmovqd128_mask : X86Builtin<"_Vector<4, int>(_Vector<2, long long int>, _Vector<4, int>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def pmovqd128mem_mask : X86Builtin<"void(_Vector<4, int *>, _Vector<2, long long int>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def pmovqd256mem_mask : X86Builtin<"void(_Vector<4, int *>, _Vector<4, long long int>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def pmovqw128_mask : X86Builtin<"_Vector<8, short>(_Vector<2, long long int>, _Vector<8, short>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def pmovqw128mem_mask : X86Builtin<"void(_Vector<8, short *>, _Vector<2, long long int>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def pmovqw256_mask : X86Builtin<"_Vector<8, short>(_Vector<4, long long int>, _Vector<8, short>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def pmovqw256mem_mask : X86Builtin<"void(_Vector<8, short *>, _Vector<4, long long int>, unsigned char)">;
+}
+
+let Features = "avx512dq,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def extractf32x8_mask : X86Builtin<"_Vector<8, float>(_Vector<16, float>, _Constant int, _Vector<8, float>, unsigned char)">;
+  def extractf64x2_512_mask : X86Builtin<"_Vector<2, double>(_Vector<8, double>, _Constant int, _Vector<2, double>, unsigned char)">;
+  def extracti32x8_mask : X86Builtin<"_Vector<8, int>(_Vector<16, int>, _Constant int, _Vector<8, int>, unsigned char)">;
+  def extracti64x2_512_mask : X86Builtin<"_Vector<2, long long int>(_Vector<8, long long int>, _Constant int, _Vector<2, long long int>, unsigned char)">;
+}
+
+let Features = "avx512f,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def extracti32x4_mask : X86Builtin<"_Vector<4, int>(_Vector<16, int>, _Constant int, _Vector<4, int>, unsigned char)">;
+  def extracti64x4_mask : X86Builtin<"_Vector<4, long long int>(_Vector<8, long long int>, _Constant int, _Vector<4, long long int>, unsigned char)">;
+}
+
+let Features = "avx512dq,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def extractf64x2_256_mask : X86Builtin<"_Vector<2, double>(_Vector<4, double>, _Constant int, _Vector<2, double>, unsigned char)">;
+  def extracti64x2_256_mask : X86Builtin<"_Vector<2, long long int>(_Vector<4, long long int>, _Constant int, _Vector<2, long long int>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def extractf32x4_256_mask : X86Builtin<"_Vector<4, float>(_Vector<8, float>, _Constant int, _Vector<4, float>, unsigned char)">;
+  def extracti32x4_256_mask : X86Builtin<"_Vector<4, int>(_Vector<8, int>, _Constant int, _Vector<4, int>, unsigned char)">;
+}
+
+let Features = "avx512dq,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def insertf32x8 : X86Builtin<"_Vector<16, float>(_Vector<16, float>, _Vector<8, float>, _Constant int)">;
+  def insertf64x2_512 : X86Builtin<"_Vector<8, double>(_Vector<8, double>, _Vector<2, double>, _Constant int)">;
+  def inserti32x8 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<8, int>, _Constant int)">;
+  def inserti64x2_512 : X86Builtin<"_Vector<8, long long int>(_Vector<8, long long int>, _Vector<2, long long int>, _Constant int)">;
+}
+
+let Features = "avx512f,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def insertf64x4 : X86Builtin<"_Vector<8, double>(_Vector<8, double>, _Vector<4, double>, _Constant int)">;
+  def inserti64x4 : X86Builtin<"_Vector<8, long long int>(_Vector<8, long long int>, _Vector<4, long long int>, _Constant int)">;
+}
+
+let Features = "avx512dq,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def insertf64x2_256 : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Vector<2, double>, _Constant int)">;
+  def inserti64x2_256 : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Vector<2, long long int>, _Constant int)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def insertf32x4_256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<4, float>, _Constant int)">;
+  def inserti32x4_256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<4, int>, _Constant int)">;
+}
+
+let Features = "avx512f,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def insertf32x4 : X86Builtin<"_Vector<16, float>(_Vector<16, float>, _Vector<4, float>, _Constant int)">;
+  def inserti32x4 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<4, int>, _Constant int)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def getmantpd128_mask : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Constant int, _Vector<2, double>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def getmantpd256_mask : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Constant int, _Vector<4, double>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def getmantps128_mask : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Constant int, _Vector<4, float>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def getmantps256_mask : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Constant int, _Vector<8, float>, unsigned char)">;
+}
+
+let Features = "avx512f,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def getmantpd512_mask : X86Builtin<"_Vector<8, double>(_Vector<8, double>, _Constant int, _Vector<8, double>, unsigned char, _Constant int)">;
+  def getmantps512_mask : X86Builtin<"_Vector<16, float>(_Vector<16, float>, _Constant int, _Vector<16, float>, unsigned short, _Constant int)">;
+  def getexppd512_mask : X86Builtin<"_Vector<8, double>(_Vector<8, double>, _Vector<8, double>, unsigned char, _Constant int)">;
+  def getexpps512_mask : X86Builtin<"_Vector<16, float>(_Vector<16, float>, _Vector<16, float>, unsigned short, _Constant int)">;
+}
+
+let Features = "avx512f", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vfmaddss3_mask : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Vector<4, float>, unsigned char, _Constant int)">;
+  def vfmaddss3_maskz : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Vector<4, float>, unsigned char, _Constant int)">;
+  def vfmaddss3_mask3 : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Vector<4, float>, unsigned char, _Constant int)">;
+  def vfmaddsd3_mask : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<2, double>, _Vector<2, double>, unsigned char, _Constant int)">;
+  def vfmaddsd3_maskz : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<2, double>, _Vector<2, double>, unsigned char, _Constant int)">;
+  def vfmaddsd3_mask3 : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<2, double>, _Vector<2, double>, unsigned char, _Constant int)">;
+  def vfmsubsd3_mask3 : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<2, double>, _Vector<2, double>, unsigned char, _Constant int)">;
+  def vfmsubss3_mask3 : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Vector<4, float>, unsigned char, _Constant int)">;
+}
+
+let Features = "avx512f,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def permdf512 : X86Builtin<"_Vector<8, double>(_Vector<8, double>, _Constant int)">;
+  def permdi512 : X86Builtin<"_Vector<8, long long int>(_Vector<8, long long int>, _Constant int)">;
+}
+
+let Features = "avx512bw,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def permvarhi512 : X86Builtin<"_Vector<32, short>(_Vector<32, short>, _Vector<32, short>)">;
+}
+
+let Features = "avx512f,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def permvardf512 : X86Builtin<"_Vector<8, double>(_Vector<8, double>, _Vector<8, long long int>)">;
+  def permvardi512 : X86Builtin<"_Vector<8, long long int>(_Vector<8, long long int>, _Vector<8, long long int>)">;
+  def permvarsf512 : X86Builtin<"_Vector<16, float>(_Vector<16, float>, _Vector<16, int>)">;
+  def permvarsi512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<16, int>)">;
+}
+
+let Features = "avx512vbmi,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def permvarqi512 : X86Builtin<"_Vector<64, char>(_Vector<64, char>, _Vector<64, char>)">;
+}
+
+let Features = "avx512vbmi,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def permvarqi128 : X86Builtin<"_Vector<16, char>(_Vector<16, char>, _Vector<16, char>)">;
+}
+
+let Features = "avx512vbmi,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def permvarqi256 : X86Builtin<"_Vector<32, char>(_Vector<32, char>, _Vector<32, char>)">;
+}
+
+let Features = "avx512bw,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def permvarhi128 : X86Builtin<"_Vector<8, short>(_Vector<8, short>, _Vector<8, short>)">;
+}
+
+let Features = "avx512bw,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def permvarhi256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Vector<16, short>)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def permvardf256 : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Vector<4, long long int>)">;
+  def permvardi256 : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Vector<4, long long int>)">;
+}
+
+let Features = "avx512dq,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def fpclasspd128_mask : X86Builtin<"unsigned char(_Vector<2, double>, _Constant int, unsigned char)">;
+}
+
+let Features = "avx512dq,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def fpclasspd256_mask : X86Builtin<"unsigned char(_Vector<4, double>, _Constant int, unsigned char)">;
+}
+
+let Features = "avx512dq,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def fpclassps128_mask : X86Builtin<"unsigned char(_Vector<4, float>, _Constant int, unsigned char)">;
+}
+
+let Features = "avx512dq,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def fpclassps256_mask : X86Builtin<"unsigned char(_Vector<8, float>, _Constant int, unsigned char)">;
+}
+
+let Features = "avx512dq,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def fpclassps512_mask : X86Builtin<"unsigned short(_Vector<16, float>, _Constant int, unsigned short)">;
+  def fpclasspd512_mask : X86Builtin<"unsigned char(_Vector<8, double>, _Constant int, unsigned char)">;
+}
+
+let Features = "avx512dq", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def fpclasssd_mask : X86Builtin<"unsigned char(_Vector<2, double>, _Constant int, unsigned char)">;
+  def fpclassss_mask : X86Builtin<"unsigned char(_Vector<4, float>, _Constant int, unsigned char)">;
+}
+
+let Features = "avx512dq", Attributes = [NoThrow, Const] in {
+  def kaddqi : X86Builtin<"unsigned char(unsigned char, unsigned char)">;
+  def kaddhi : X86Builtin<"unsigned short(unsigned short, unsigned short)">;
+}
+
+let Features = "avx512bw", Attributes = [NoThrow, Const] in {
+  def kaddsi : X86Builtin<"unsigned int(unsigned int, unsigned int)">;
+  def kadddi : X86Builtin<"unsigned long long int(unsigned long long int, unsigned long long int)">;
+}
+
+let Features = "avx512dq", Attributes = [NoThrow, Const] in {
+  def kandqi : X86Builtin<"unsigned char(unsigned char, unsigned char)">;
+}
+
+let Features = "avx512f", Attributes = [NoThrow, Const] in {
+  def kandhi : X86Builtin<"unsigned short(unsigned short, unsigned short)">;
+}
+
+let Features = "avx512bw", Attributes = [NoThrow, Const] in {
+  def kandsi : X86Builtin<"unsigned int(unsigned int, unsigned int)">;
+  def kanddi : X86Builtin<"unsigned long long int(unsigned long long int, unsigned long long int)">;
+}
+
+let Features = "avx512dq", Attributes = [NoThrow, Const] in {
+  def kandnqi : X86Builtin<"unsigned char(unsigned char, unsigned char)">;
+}
+
+let Features = "avx512f", Attributes = [NoThrow, Const] in {
+  def kandnhi : X86Builtin<"unsigned short(unsigned short, unsigned short)">;
+}
+
+let Features = "avx512bw", Attributes = [NoThrow, Const] in {
+  def kandnsi : X86Builtin<"unsigned int(unsigned int, unsigned int)">;
+  def kandndi : X86Builtin<"unsigned long long int(unsigned long long int, unsigned long long int)">;
+}
+
+let Features = "avx512dq", Attributes = [NoThrow, Const] in {
+  def korqi : X86Builtin<"unsigned char(unsigned char, unsigned char)">;
+}
+
+let Features = "avx512f", Attributes = [NoThrow, Const] in {
+  def korhi : X86Builtin<"unsigned short(unsigned short, unsigned short)">;
+}
+
+let Features = "avx512bw", Attributes = [NoThrow, Const] in {
+  def korsi : X86Builtin<"unsigned int(unsigned int, unsigned int)">;
+  def kordi : X86Builtin<"unsigned long long int(unsigned long long int, unsigned long long int)">;
+}
+
+let Features = "avx512dq", Attributes = [NoThrow, Const] in {
+  def kortestcqi : X86Builtin<"int(unsigned char, unsigned char)">;
+  def kortestzqi : X86Builtin<"int(unsigned char, unsigned char)">;
+}
+
+let Features = "avx512f", Attributes = [NoThrow, Const] in {
+  def kortestchi : X86Builtin<"int(unsigned short, unsigned short)">;
+  def kortestzhi : X86Builtin<"int(unsigned short, unsigned short)">;
+}
+
+let Features = "avx512bw", Attributes = [NoThrow, Const] in {
+  def kortestcsi : X86Builtin<"int(unsigned int, unsigned int)">;
+  def kortestzsi : X86Builtin<"int(unsigned int, unsigned int)">;
+  def kortestcdi : X86Builtin<"int(unsigned long long int, unsigned long long int)">;
+  def kortestzdi : X86Builtin<"int(unsigned long long int, unsigned long long int)">;
+}
+
+let Features = "avx512dq", Attributes = [NoThrow, Const] in {
+  def ktestcqi : X86Builtin<"int(unsigned char, unsigned char)">;
+  def ktestzqi : X86Builtin<"int(unsigned char, unsigned char)">;
+  def ktestchi : X86Builtin<"int(unsigned short, unsigned short)">;
+  def ktestzhi : X86Builtin<"int(unsigned short, unsigned short)">;
+}
+
+let Features = "avx512bw", Attributes = [NoThrow, Const] in {
+  def ktestcsi : X86Builtin<"int(unsigned int, unsigned int)">;
+  def ktestzsi : X86Builtin<"int(unsigned int, unsigned int)">;
+  def ktestcdi : X86Builtin<"int(unsigned long long int, unsigned long long int)">;
+  def ktestzdi : X86Builtin<"int(unsigned long long int, unsigned long long int)">;
+}
+
+let Features = "avx512f", Attributes = [NoThrow, Const] in {
+  def kunpckhi : X86Builtin<"unsigned short(unsigned short, unsigned short)">;
+}
+
+let Features = "avx512dq", Attributes = [NoThrow, Const] in {
+  def kxnorqi : X86Builtin<"unsigned char(unsigned char, unsigned char)">;
+}
+
+let Features = "avx512f", Attributes = [NoThrow, Const] in {
+  def kxnorhi : X86Builtin<"unsigned short(unsigned short, unsigned short)">;
+}
+
+let Features = "avx512bw", Attributes = [NoThrow, Const] in {
+  def kxnorsi : X86Builtin<"unsigned int(unsigned int, unsigned int)">;
+  def kxnordi : X86Builtin<"unsigned long long int(unsigned long long int, unsigned long long int)">;
+}
+
+let Features = "avx512dq", Attributes = [NoThrow, Const] in {
+  def kxorqi : X86Builtin<"unsigned char(unsigned char, unsigned char)">;
+}
+
+let Features = "avx512f", Attributes = [NoThrow, Const] in {
+  def kxorhi : X86Builtin<"unsigned short(unsigned short, unsigned short)">;
+}
+
+let Features = "avx512bw", Attributes = [NoThrow, Const] in {
+  def kxorsi : X86Builtin<"unsigned int(unsigned int, unsigned int)">;
+  def kxordi : X86Builtin<"unsigned long long int(unsigned long long int, unsigned long long int)">;
+}
+
+let Features = "avx512dq", Attributes = [NoThrow, Const] in {
+  def kshiftliqi : X86Builtin<"unsigned char(unsigned char, _Constant unsigned int)">;
+}
+
+let Features = "avx512f", Attributes = [NoThrow, Const] in {
+  def kshiftlihi : X86Builtin<"unsigned short(unsigned short, _Constant unsigned int)">;
+}
+
+let Features = "avx512bw", Attributes = [NoThrow, Const] in {
+  def kshiftlisi : X86Builtin<"unsigned int(unsigned int, _Constant unsigned int)">;
+  def kshiftlidi : X86Builtin<"unsigned long long int(unsigned long long int, _Constant unsigned int)">;
+}
+
+let Features = "avx512dq", Attributes = [NoThrow, Const] in {
+  def kshiftriqi : X86Builtin<"unsigned char(unsigned char, _Constant unsigned int)">;
+}
+
+let Features = "avx512f", Attributes = [NoThrow, Const] in {
+  def kshiftrihi : X86Builtin<"unsigned short(unsigned short, _Constant unsigned int)">;
+}
+
+let Features = "avx512bw", Attributes = [NoThrow, Const] in {
+  def kshiftrisi : X86Builtin<"unsigned int(unsigned int, _Constant unsigned int)">;
+  def kshiftridi : X86Builtin<"unsigned long long int(unsigned long long int, _Constant unsigned int)">;
+}
+
+let Features = "avx512dq", Attributes = [NoThrow, Const] in {
+  def kmovb : X86Builtin<"unsigned char(unsigned char)">;
+}
+
+let Features = "avx512f", Attributes = [NoThrow, Const] in {
+  def kmovw : X86Builtin<"unsigned short(unsigned short)">;
+}
+
+let Features = "avx512bw", Attributes = [NoThrow, Const] in {
+  def kmovd : X86Builtin<"unsigned int(unsigned int)">;
+  def kmovq : X86Builtin<"unsigned long long int(unsigned long long int)">;
+}
+
+let Features = "avx512bw,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def palignr512 : X86Builtin<"_Vector<64, char>(_Vector<64, char>, _Vector<64, char>, _Constant int)">;
+}
+
+let Features = "avx512bw,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def dbpsadbw128 : X86Builtin<"_Vector<8, short>(_Vector<16, char>, _Vector<16, char>, _Constant int)">;
+}
+
+let Features = "avx512bw,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def dbpsadbw256 : X86Builtin<"_Vector<16, short>(_Vector<32, char>, _Vector<32, char>, _Constant int)">;
+}
+
+let Features = "avx512bw,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def dbpsadbw512 : X86Builtin<"_Vector<32, short>(_Vector<64, char>, _Vector<64, char>, _Constant int)">;
+  def psadbw512 : X86Builtin<"_Vector<8, long long int>(_Vector<64, char>, _Vector<64, char>)">;
+}
+
+let Features = "avx512f,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def compressdf512_mask : X86Builtin<"_Vector<8, double>(_Vector<8, double>, _Vector<8, double>, unsigned char)">;
+  def compressdi512_mask : X86Builtin<"_Vector<8, long long int>(_Vector<8, long long int>, _Vector<8, long long int>, unsigned char)">;
+}
+
+let Features = "avx512vbmi2,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def compresshi512_mask : X86Builtin<"_Vector<32, short>(_Vector<32, short>, _Vector<32, short>, unsigned int)">;
+  def compressqi512_mask : X86Builtin<"_Vector<64, char>(_Vector<64, char>, _Vector<64, char>, unsigned long long int)">;
+}
+
+let Features = "avx512f,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def compresssf512_mask : X86Builtin<"_Vector<16, float>(_Vector<16, float>, _Vector<16, float>, unsigned short)">;
+  def compresssi512_mask : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<16, int>, unsigned short)">;
+}
+
+let Features = "avx512f", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def cmpsd_mask : X86Builtin<"unsigned char(_Vector<2, double>, _Vector<2, double>, _Constant int, unsigned char, _Constant int)">;
+  def cmpss_mask : X86Builtin<"unsigned char(_Vector<4, float>, _Vector<4, float>, _Constant int, unsigned char, _Constant int)">;
+}
+
+let Features = "avx512f,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def pshufd512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Constant int)">;
+  def expanddf512_mask : X86Builtin<"_Vector<8, double>(_Vector<8, double>, _Vector<8, double>, unsigned char)">;
+  def expanddi512_mask : X86Builtin<"_Vector<8, long long int>(_Vector<8, long long int>, _Vector<8, long long int>, unsigned char)">;
+}
+
+let Features = "avx512vbmi2,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def expandhi512_mask : X86Builtin<"_Vector<32, short>(_Vector<32, short>, _Vector<32, short>, unsigned int)">;
+  def expandqi512_mask : X86Builtin<"_Vector<64, char>(_Vector<64, char>, _Vector<64, char>, unsigned long long int)">;
+}
+
+let Features = "avx512f,evex512", Attributes = [NoThrow, RequiredVectorWidth<512>] in {
+  def expandloaddf512_mask : X86Builtin<"_Vector<8, double>(_Vector<8, double const *>, _Vector<8, double>, unsigned char)">;
+  def expandloaddi512_mask : X86Builtin<"_Vector<8, long long int>(_Vector<8, long long int const *>, _Vector<8, long long int>, unsigned char)">;
+}
+
+let Features = "avx512vbmi2,evex512", Attributes = [NoThrow, RequiredVectorWidth<512>] in {
+  def expandloadhi512_mask : X86Builtin<"_Vector<32, short>(_Vector<32, short const *>, _Vector<32, short>, unsigned int)">;
+  def expandloadqi512_mask : X86Builtin<"_Vector<64, char>(_Vector<64, char const *>, _Vector<64, char>, unsigned long long int)">;
+}
+
+let Features = "avx512f,evex512", Attributes = [NoThrow, RequiredVectorWidth<512>] in {
+  def expandloadsf512_mask : X86Builtin<"_Vector<16, float>(_Vector<16, float const *>, _Vector<16, float>, unsigned short)">;
+  def expandloadsi512_mask : X86Builtin<"_Vector<16, int>(_Vector<16, int const *>, _Vector<16, int>, unsigned short)">;
+}
+
+let Features = "avx512f,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def expandsf512_mask : X86Builtin<"_Vector<16, float>(_Vector<16, float>, _Vector<16, float>, unsigned short)">;
+  def expandsi512_mask : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<16, int>, unsigned short)">;
+  def cvtps2pd512_mask : X86Builtin<"_Vector<8, double>(_Vector<8, float>, _Vector<8, double>, unsigned char, _Constant int)">;
+}
+
+let Features = "avx512f,evex512", Attributes = [NoThrow, RequiredVectorWidth<512>] in {
+  def compressstoredf512_mask : X86Builtin<"void(_Vector<8, double *>, _Vector<8, double>, unsigned char)">;
+  def compressstoredi512_mask : X86Builtin<"void(_Vector<8, long long int *>, _Vector<8, long long int>, unsigned char)">;
+}
+
+let Features = "avx512vbmi2,evex512", Attributes = [NoThrow, RequiredVectorWidth<512>] in {
+  def compressstorehi512_mask : X86Builtin<"void(_Vector<32, short *>, _Vector<32, short>, unsigned int)">;
+  def compressstoreqi512_mask : X86Builtin<"void(_Vector<64, char *>, _Vector<64, char>, unsigned long long int)">;
+}
+
+let Features = "avx512f,evex512", Attributes = [NoThrow, RequiredVectorWidth<512>] in {
+  def compressstoresf512_mask : X86Builtin<"void(_Vector<16, float *>, _Vector<16, float>, unsigned short)">;
+  def compressstoresi512_mask : X86Builtin<"void(_Vector<16, int *>, _Vector<16, int>, unsigned short)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vcvtph2ps_mask : X86Builtin<"_Vector<4, float>(_Vector<8, short>, _Vector<4, float>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vcvtph2ps256_mask : X86Builtin<"_Vector<8, float>(_Vector<8, short>, _Vector<8, float>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vcvtps2ph_mask : X86Builtin<"_Vector<8, short>(_Vector<4, float>, _Constant int, _Vector<8, short>, unsigned char)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vcvtps2ph256_mask : X86Builtin<"_Vector<8, short>(_Vector<8, float>, _Constant int, _Vector<8, short>, unsigned char)">;
+}
+
+let Features = "avx512bw,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def cvtw2mask512 : X86Builtin<"unsigned int(_Vector<32, short>)">;
+}
+
+let Features = "avx512bw,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def cvtw2mask128 : X86Builtin<"unsigned char(_Vector<8, short>)">;
+}
+
+let Features = "avx512bw,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def cvtw2mask256 : X86Builtin<"unsigned short(_Vector<16, short>)">;
+}
+
+let Features = "avx512f", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def cvtsd2ss_round_mask : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<2, double>, _Vector<4, float>, unsigned char, _Constant int)">;
+  def cvtsi2ss32 : X86Builtin<"_Vector<4, float>(_Vector<4, float>, int, _Constant int)">;
+  def cvtss2sd_round_mask : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<4, float>, _Vector<2, double>, unsigned char, _Constant int)">;
+  def cvtusi2ss32 : X86Builtin<"_Vector<4, float>(_Vector<4, float>, unsigned int, _Constant int)">;
+}
+
+let Features = "avx512vbmi,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def vpmultishiftqb512 : X86Builtin<"_Vector<64, char>(_Vector<64, char>, _Vector<64, char>)">;
+}
+
+let Features = "avx512vbmi,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vpmultishiftqb128 : X86Builtin<"_Vector<16, char>(_Vector<16, char>, _Vector<16, char>)">;
+}
+
+let Features = "avx512vbmi,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vpmultishiftqb256 : X86Builtin<"_Vector<32, char>(_Vector<32, char>, _Vector<32, char>)">;
+}
+
+let Features = "avx512bf16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def cvtne2ps2bf16_128 : X86Builtin<"_Vector<8, __bf16>(_Vector<4, float>, _Vector<4, float>)">;
+}
+
+let Features = "avx512bf16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def cvtne2ps2bf16_256 : X86Builtin<"_Vector<16, __bf16>(_Vector<8, float>, _Vector<8, float>)">;
+}
+
+let Features = "avx512bf16,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def cvtne2ps2bf16_512 : X86Builtin<"_Vector<32, __bf16>(_Vector<16, float>, _Vector<16, float>)">;
+}
+
+let Features = "avx512bf16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def cvtneps2bf16_128_mask : X86Builtin<"_Vector<8, __bf16>(_Vector<4, float>, _Vector<8, __bf16>, unsigned char)">;
+}
+
+let Features = "avx512bf16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def cvtneps2bf16_256_mask : X86Builtin<"_Vector<8, __bf16>(_Vector<8, float>, _Vector<8, __bf16>, unsigned char)">;
+}
+
+let Features = "avx512bf16,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def cvtneps2bf16_512_mask : X86Builtin<"_Vector<16, __bf16>(_Vector<16, float>, _Vector<16, __bf16>, unsigned short)">;
+}
+
+let Features = "avx512bf16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def dpbf16ps_128 : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<8, __bf16>, _Vector<8, __bf16>)">;
+}
+
+let Features = "avx512bf16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def dpbf16ps_256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<16, __bf16>, _Vector<16, __bf16>)">;
+}
+
+let Features = "avx512bf16,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def dpbf16ps_512 : X86Builtin<"_Vector<16, float>(_Vector<16, float>, _Vector<32, __bf16>, _Vector<32, __bf16>)">;
+}
+
+let Features = "avx512bf16", Attributes = [NoThrow, Const] in {
+  def cvtsbf162ss_32 : X86Builtin<"float(__bf16)">;
+}
+
+let Features = "avx512vp2intersect,evex512", Attributes = [NoThrow, RequiredVectorWidth<512>] in {
+  def vp2intersect_q_512 : X86Builtin<"void(_Vector<8, long long int>, _Vector<8, long long int>, unsigned char *, unsigned char *)">;
+}
+
+let Features = "avx512vp2intersect,avx512vl", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def vp2intersect_q_256 : X86Builtin<"void(_Vector<4, long long int>, _Vector<4, long long int>, unsigned char *, unsigned char *)">;
+}
+
+let Features = "avx512vp2intersect,avx512vl", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def vp2intersect_q_128 : X86Builtin<"void(_Vector<2, long long int>, _Vector<2, long long int>, unsigned char *, unsigned char *)">;
+}
+
+let Features = "avx512vp2intersect,evex512", Attributes = [NoThrow, RequiredVectorWidth<512>] in {
+  def vp2intersect_d_512 : X86Builtin<"void(_Vector<16, int>, _Vector<16, int>, unsigned short *, unsigned short *)">;
+}
+
+let Features = "avx512vp2intersect,avx512vl", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def vp2intersect_d_256 : X86Builtin<"void(_Vector<8, int>, _Vector<8, int>, unsigned char *, unsigned char *)">;
+}
+
+let Features = "avx512vp2intersect,avx512vl", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def vp2intersect_d_128 : X86Builtin<"void(_Vector<4, int>, _Vector<4, int>, unsigned char *, unsigned char *)">;
+}
+
+let Features = "avx512fp16", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vcomish : X86Builtin<"int(_Vector<8, _Float16>, _Vector<8, _Float16>, _Constant int, _Constant int)">;
+}
+
+let Features = "avx512fp16,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def addph512 : X86Builtin<"_Vector<32, _Float16>(_Vector<32, _Float16>, _Vector<32, _Float16>, _Constant int)">;
+  def subph512 : X86Builtin<"_Vector<32, _Float16>(_Vector<32, _Float16>, _Vector<32, _Float16>, _Constant int)">;
+  def mulph512 : X86Builtin<"_Vector<32, _Float16>(_Vector<32, _Float16>, _Vector<32, _Float16>, _Constant int)">;
+  def divph512 : X86Builtin<"_Vector<32, _Float16>(_Vector<32, _Float16>, _Vector<32, _Float16>, _Constant int)">;
+  def maxph512 : X86Builtin<"_Vector<32, _Float16>(_Vector<32, _Float16>, _Vector<32, _Float16>, _Constant int)">;
+  def minph512 : X86Builtin<"_Vector<32, _Float16>(_Vector<32, _Float16>, _Vector<32, _Float16>, _Constant int)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def minph256 : X86Builtin<"_Vector<16, _Float16>(_Vector<16, _Float16>, _Vector<16, _Float16>)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def minph128 : X86Builtin<"_Vector<8, _Float16>(_Vector<8, _Float16>, _Vector<8, _Float16>)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def maxph256 : X86Builtin<"_Vector<16, _Float16>(_Vector<16, _Float16>, _Vector<16, _Float16>)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def maxph128 : X86Builtin<"_Vector<8, _Float16>(_Vector<8, _Float16>, _Vector<8, _Float16>)">;
+}
+
+let Features = "avx512fp16", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def addsh_round_mask : X86Builtin<"_Vector<8, _Float16>(_Vector<8, _Float16>, _Vector<8, _Float16>, _Vector<8, _Float16>, unsigned char, _Constant int)">;
+  def divsh_round_mask : X86Builtin<"_Vector<8, _Float16>(_Vector<8, _Float16>, _Vector<8, _Float16>, _Vector<8, _Float16>, unsigned char, _Constant int)">;
+  def mulsh_round_mask : X86Builtin<"_Vector<8, _Float16>(_Vector<8, _Float16>, _Vector<8, _Float16>, _Vector<8, _Float16>, unsigned char, _Constant int)">;
+  def subsh_round_mask : X86Builtin<"_Vector<8, _Float16>(_Vector<8, _Float16>, _Vector<8, _Float16>, _Vector<8, _Float16>, unsigned char, _Constant int)">;
+  def maxsh_round_mask : X86Builtin<"_Vector<8, _Float16>(_Vector<8, _Float16>, _Vector<8, _Float16>, _Vector<8, _Float16>, unsigned char, _Constant int)">;
+  def minsh_round_mask : X86Builtin<"_Vector<8, _Float16>(_Vector<8, _Float16>, _Vector<8, _Float16>, _Vector<8, _Float16>, unsigned char, _Constant int)">;
+}
+
+let Features = "avx512fp16,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def cmpph512_mask : X86Builtin<"unsigned int(_Vector<32, _Float16>, _Vector<32, _Float16>, _Constant int, unsigned int, _Constant int)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def cmpph256_mask : X86Builtin<"unsigned short(_Vector<16, _Float16>, _Vector<16, _Float16>, _Constant int, unsigned short)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def cmpph128_mask : X86Builtin<"unsigned char(_Vector<8, _Float16>, _Vector<8, _Float16>, _Constant int, unsigned char)">;
+}
+
+let Features = "avx512fp16", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def cmpsh_mask : X86Builtin<"unsigned char(_Vector<8, _Float16>, _Vector<8, _Float16>, _Constant int, unsigned char, _Constant int)">;
+}
+
+let Features = "avx512fp16", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def loadsh128_mask : X86Builtin<"_Vector<8, _Float16>(_Vector<8, _Float16 const *>, _Vector<8, _Float16>, unsigned char)">;
+  def storesh128_mask : X86Builtin<"void(_Vector<8, _Float16 *>, _Vector<8, _Float16>, unsigned char)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def rcpph128_mask : X86Builtin<"_Vector<8, _Float16>(_Vector<8, _Float16>, _Vector<8, _Float16>, unsigned char)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def rcpph256_mask : X86Builtin<"_Vector<16, _Float16>(_Vector<16, _Float16>, _Vector<16, _Float16>, unsigned short)">;
+}
+
+let Features = "avx512fp16,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def rcpph512_mask : X86Builtin<"_Vector<32, _Float16>(_Vector<32, _Float16>, _Vector<32, _Float16>, unsigned int)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def rsqrtph128_mask : X86Builtin<"_Vector<8, _Float16>(_Vector<8, _Float16>, _Vector<8, _Float16>, unsigned char)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def rsqrtph256_mask : X86Builtin<"_Vector<16, _Float16>(_Vector<16, _Float16>, _Vector<16, _Float16>, unsigned short)">;
+}
+
+let Features = "avx512fp16,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def rsqrtph512_mask : X86Builtin<"_Vector<32, _Float16>(_Vector<32, _Float16>, _Vector<32, _Float16>, unsigned int)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def getmantph128_mask : X86Builtin<"_Vector<8, _Float16>(_Vector<8, _Float16>, _Constant int, _Vector<8, _Float16>, unsigned char)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def getmantph256_mask : X86Builtin<"_Vector<16, _Float16>(_Vector<16, _Float16>, _Constant int, _Vector<16, _Float16>, unsigned short)">;
+}
+
+let Features = "avx512fp16,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def getmantph512_mask : X86Builtin<"_Vector<32, _Float16>(_Vector<32, _Float16>, _Constant int, _Vector<32, _Float16>, unsigned int, _Constant int)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def getexpph128_mask : X86Builtin<"_Vector<8, _Float16>(_Vector<8, _Float16>, _Vector<8, _Float16>, unsigned char)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def getexpph256_mask : X86Builtin<"_Vector<16, _Float16>(_Vector<16, _Float16>, _Vector<16, _Float16>, unsigned short)">;
+}
+
+let Features = "avx512fp16,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def getexpph512_mask : X86Builtin<"_Vector<32, _Float16>(_Vector<32, _Float16>, _Vector<32, _Float16>, unsigned int, _Constant int)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def scalefph128_mask : X86Builtin<"_Vector<8, _Float16>(_Vector<8, _Float16>, _Vector<8, _Float16>, _Vector<8, _Float16>, unsigned char)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def scalefph256_mask : X86Builtin<"_Vector<16, _Float16>(_Vector<16, _Float16>, _Vector<16, _Float16>, _Vector<16, _Float16>, unsigned short)">;
+}
+
+let Features = "avx512fp16,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def scalefph512_mask : X86Builtin<"_Vector<32, _Float16>(_Vector<32, _Float16>, _Vector<32, _Float16>, _Vector<32, _Float16>, unsigned int, _Constant int)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def rndscaleph_128_mask : X86Builtin<"_Vector<8, _Float16>(_Vector<8, _Float16>, _Constant int, _Vector<8, _Float16>, unsigned char)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def rndscaleph_256_mask : X86Builtin<"_Vector<16, _Float16>(_Vector<16, _Float16>, _Constant int, _Vector<16, _Float16>, unsigned short)">;
+}
+
+let Features = "avx512fp16,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def rndscaleph_mask : X86Builtin<"_Vector<32, _Float16>(_Vector<32, _Float16>, _Constant int, _Vector<32, _Float16>, unsigned int, _Constant int)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def reduceph128_mask : X86Builtin<"_Vector<8, _Float16>(_Vector<8, _Float16>, _Constant int, _Vector<8, _Float16>, unsigned char)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def reduceph256_mask : X86Builtin<"_Vector<16, _Float16>(_Vector<16, _Float16>, _Constant int, _Vector<16, _Float16>, unsigned short)">;
+}
+
+let Features = "avx512fp16,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def reduceph512_mask : X86Builtin<"_Vector<32, _Float16>(_Vector<32, _Float16>, _Constant int, _Vector<32, _Float16>, unsigned int, _Constant int)">;
+}
+
+let Features = "avx512fp16", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def rcpsh_mask : X86Builtin<"_Vector<8, _Float16>(_Vector<8, _Float16>, _Vector<8, _Float16>, _Vector<8, _Float16>, unsigned char)">;
+  def rsqrtsh_mask : X86Builtin<"_Vector<8, _Float16>(_Vector<8, _Float16>, _Vector<8, _Float16>, _Vector<8, _Float16>, unsigned char)">;
+  def getmantsh_round_mask : X86Builtin<"_Vector<8, _Float16>(_Vector<8, _Float16>, _Vector<8, _Float16>, _Constant int, _Vector<8, _Float16>, unsigned char, _Constant int)">;
+  def getexpsh128_round_mask : X86Builtin<"_Vector<8, _Float16>(_Vector<8, _Float16>, _Vector<8, _Float16>, _Vector<8, _Float16>, unsigned char, _Constant int)">;
+  def scalefsh_round_mask : X86Builtin<"_Vector<8, _Float16>(_Vector<8, _Float16>, _Vector<8, _Float16>, _Vector<8, _Float16>, unsigned char, _Constant int)">;
+  def rndscalesh_round_mask : X86Builtin<"_Vector<8, _Float16>(_Vector<8, _Float16>, _Vector<8, _Float16>, _Vector<8, _Float16>, unsigned char, _Constant int, _Constant int)">;
+  def reducesh_mask : X86Builtin<"_Vector<8, _Float16>(_Vector<8, _Float16>, _Vector<8, _Float16>, _Vector<8, _Float16>, unsigned char, _Constant int, _Constant int)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def sqrtph : X86Builtin<"_Vector<8, _Float16>(_Vector<8, _Float16>)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def sqrtph256 : X86Builtin<"_Vector<16, _Float16>(_Vector<16, _Float16>)">;
+}
+
+let Features = "avx512fp16,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def sqrtph512 : X86Builtin<"_Vector<32, _Float16>(_Vector<32, _Float16>, _Constant int)">;
+}
+
+let Features = "avx512fp16", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def sqrtsh_round_mask : X86Builtin<"_Vector<8, _Float16>(_Vector<8, _Float16>, _Vector<8, _Float16>, _Vector<8, _Float16>, unsigned char, _Constant int)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def fpclassph128_mask : X86Builtin<"unsigned char(_Vector<8, _Float16>, _Constant int, unsigned char)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def fpclassph256_mask : X86Builtin<"unsigned short(_Vector<16, _Float16>, _Constant int, unsigned short)">;
+}
+
+let Features = "avx512fp16,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def fpclassph512_mask : X86Builtin<"unsigned int(_Vector<32, _Float16>, _Constant int, unsigned int)">;
+}
+
+let Features = "avx512fp16", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def fpclasssh_mask : X86Builtin<"unsigned char(_Vector<8, _Float16>, _Constant int, unsigned char)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vcvtpd2ph128_mask : X86Builtin<"_Vector<8, _Float16>(_Vector<2, double>, _Vector<8, _Float16>, unsigned char)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vcvtpd2ph256_mask : X86Builtin<"_Vector<8, _Float16>(_Vector<4, double>, _Vector<8, _Float16>, unsigned char)">;
+}
+
+let Features = "avx512fp16,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def vcvtpd2ph512_mask : X86Builtin<"_Vector<8, _Float16>(_Vector<8, double>, _Vector<8, _Float16>, unsigned char, _Constant int)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vcvtph2pd128_mask : X86Builtin<"_Vector<2, double>(_Vector<8, _Float16>, _Vector<2, double>, unsigned char)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vcvtph2pd256_mask : X86Builtin<"_Vector<4, double>(_Vector<8, _Float16>, _Vector<4, double>, unsigned char)">;
+}
+
+let Features = "avx512fp16,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def vcvtph2pd512_mask : X86Builtin<"_Vector<8, double>(_Vector<8, _Float16>, _Vector<8, double>, unsigned char, _Constant int)">;
+}
+
+let Features = "avx512fp16", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vcvtsh2ss_round_mask : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<8, _Float16>, _Vector<4, float>, unsigned char, _Constant int)">;
+  def vcvtss2sh_round_mask : X86Builtin<"_Vector<8, _Float16>(_Vector<8, _Float16>, _Vector<4, float>, _Vector<8, _Float16>, unsigned char, _Constant int)">;
+  def vcvtsd2sh_round_mask : X86Builtin<"_Vector<8, _Float16>(_Vector<8, _Float16>, _Vector<2, double>, _Vector<8, _Float16>, unsigned char, _Constant int)">;
+  def vcvtsh2sd_round_mask : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<8, _Float16>, _Vector<2, double>, unsigned char, _Constant int)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vcvtph2w128_mask : X86Builtin<"_Vector<8, short>(_Vector<8, _Float16>, _Vector<8, short>, unsigned char)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vcvtph2w256_mask : X86Builtin<"_Vector<16, short>(_Vector<16, _Float16>, _Vector<16, short>, unsigned short)">;
+}
+
+let Features = "avx512fp16,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def vcvtph2w512_mask : X86Builtin<"_Vector<32, short>(_Vector<32, _Float16>, _Vector<32, short>, unsigned int, _Constant int)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vcvttph2w128_mask : X86Builtin<"_Vector<8, short>(_Vector<8, _Float16>, _Vector<8, short>, unsigned char)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vcvttph2w256_mask : X86Builtin<"_Vector<16, short>(_Vector<16, _Float16>, _Vector<16, short>, unsigned short)">;
+}
+
+let Features = "avx512fp16,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def vcvttph2w512_mask : X86Builtin<"_Vector<32, short>(_Vector<32, _Float16>, _Vector<32, short>, unsigned int, _Constant int)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vcvtw2ph128_mask : X86Builtin<"_Vector<8, _Float16>(_Vector<8, short>, _Vector<8, _Float16>, unsigned char)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vcvtw2ph256_mask : X86Builtin<"_Vector<16, _Float16>(_Vector<16, short>, _Vector<16, _Float16>, unsigned short)">;
+}
+
+let Features = "avx512fp16,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def vcvtw2ph512_mask : X86Builtin<"_Vector<32, _Float16>(_Vector<32, short>, _Vector<32, _Float16>, unsigned int, _Constant int)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vcvtph2uw128_mask : X86Builtin<"_Vector<8, unsigned short>(_Vector<8, _Float16>, _Vector<8, unsigned short>, unsigned char)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vcvtph2uw256_mask : X86Builtin<"_Vector<16, unsigned short>(_Vector<16, _Float16>, _Vector<16, unsigned short>, unsigned short)">;
+}
+
+let Features = "avx512fp16,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def vcvtph2uw512_mask : X86Builtin<"_Vector<32, unsigned short>(_Vector<32, _Float16>, _Vector<32, unsigned short>, unsigned int, _Constant int)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vcvttph2uw128_mask : X86Builtin<"_Vector<8, unsigned short>(_Vector<8, _Float16>, _Vector<8, unsigned short>, unsigned char)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vcvttph2uw256_mask : X86Builtin<"_Vector<16, unsigned short>(_Vector<16, _Float16>, _Vector<16, unsigned short>, unsigned short)">;
+}
+
+let Features = "avx512fp16,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def vcvttph2uw512_mask : X86Builtin<"_Vector<32, unsigned short>(_Vector<32, _Float16>, _Vector<32, unsigned short>, unsigned int, _Constant int)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vcvtuw2ph128_mask : X86Builtin<"_Vector<8, _Float16>(_Vector<8, unsigned short>, _Vector<8, _Float16>, unsigned char)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vcvtuw2ph256_mask : X86Builtin<"_Vector<16, _Float16>(_Vector<16, unsigned short>, _Vector<16, _Float16>, unsigned short)">;
+}
+
+let Features = "avx512fp16,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def vcvtuw2ph512_mask : X86Builtin<"_Vector<32, _Float16>(_Vector<32, unsigned short>, _Vector<32, _Float16>, unsigned int, _Constant int)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vcvtph2dq128_mask : X86Builtin<"_Vector<4, int>(_Vector<8, _Float16>, _Vector<4, int>, unsigned char)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vcvtph2dq256_mask : X86Builtin<"_Vector<8, int>(_Vector<8, _Float16>, _Vector<8, int>, unsigned char)">;
+}
+
+let Features = "avx512fp16,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def vcvtph2dq512_mask : X86Builtin<"_Vector<16, int>(_Vector<16, _Float16>, _Vector<16, int>, unsigned short, _Constant int)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vcvtph2udq128_mask : X86Builtin<"_Vector<4, unsigned int>(_Vector<8, _Float16>, _Vector<4, unsigned int>, unsigned char)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vcvtph2udq256_mask : X86Builtin<"_Vector<8, unsigned int>(_Vector<8, _Float16>, _Vector<8, unsigned int>, unsigned char)">;
+}
+
+let Features = "avx512fp16,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def vcvtph2udq512_mask : X86Builtin<"_Vector<16, unsigned int>(_Vector<16, _Float16>, _Vector<16, unsigned int>, unsigned short, _Constant int)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vcvtdq2ph128_mask : X86Builtin<"_Vector<8, _Float16>(_Vector<4, int>, _Vector<8, _Float16>, unsigned char)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vcvtdq2ph256_mask : X86Builtin<"_Vector<8, _Float16>(_Vector<8, int>, _Vector<8, _Float16>, unsigned char)">;
+}
+
+let Features = "avx512fp16,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def vcvtdq2ph512_mask : X86Builtin<"_Vector<16, _Float16>(_Vector<16, int>, _Vector<16, _Float16>, unsigned short, _Constant int)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vcvtudq2ph128_mask : X86Builtin<"_Vector<8, _Float16>(_Vector<4, unsigned int>, _Vector<8, _Float16>, unsigned char)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vcvtudq2ph256_mask : X86Builtin<"_Vector<8, _Float16>(_Vector<8, unsigned int>, _Vector<8, _Float16>, unsigned char)">;
+}
+
+let Features = "avx512fp16,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def vcvtudq2ph512_mask : X86Builtin<"_Vector<16, _Float16>(_Vector<16, unsigned int>, _Vector<16, _Float16>, unsigned short, _Constant int)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vcvttph2dq128_mask : X86Builtin<"_Vector<4, int>(_Vector<8, _Float16>, _Vector<4, int>, unsigned char)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vcvttph2dq256_mask : X86Builtin<"_Vector<8, int>(_Vector<8, _Float16>, _Vector<8, int>, unsigned char)">;
+}
+
+let Features = "avx512fp16,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def vcvttph2dq512_mask : X86Builtin<"_Vector<16, int>(_Vector<16, _Float16>, _Vector<16, int>, unsigned short, _Constant int)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vcvttph2udq128_mask : X86Builtin<"_Vector<4, unsigned int>(_Vector<8, _Float16>, _Vector<4, unsigned int>, unsigned char)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vcvttph2udq256_mask : X86Builtin<"_Vector<8, unsigned int>(_Vector<8, _Float16>, _Vector<8, unsigned int>, unsigned char)">;
+}
+
+let Features = "avx512fp16,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def vcvttph2udq512_mask : X86Builtin<"_Vector<16, unsigned int>(_Vector<16, _Float16>, _Vector<16, unsigned int>, unsigned short, _Constant int)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vcvtqq2ph128_mask : X86Builtin<"_Vector<8, _Float16>(_Vector<2, long long int>, _Vector<8, _Float16>, unsigned char)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vcvtqq2ph256_mask : X86Builtin<"_Vector<8, _Float16>(_Vector<4, long long int>, _Vector<8, _Float16>, unsigned char)">;
+}
+
+let Features = "avx512fp16,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def vcvtqq2ph512_mask : X86Builtin<"_Vector<8, _Float16>(_Vector<8, long long int>, _Vector<8, _Float16>, unsigned char, _Constant int)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vcvtph2qq128_mask : X86Builtin<"_Vector<2, long long int>(_Vector<8, _Float16>, _Vector<2, long long int>, unsigned char)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vcvtph2qq256_mask : X86Builtin<"_Vector<4, long long int>(_Vector<8, _Float16>, _Vector<4, long long int>, unsigned char)">;
+}
+
+let Features = "avx512fp16,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def vcvtph2qq512_mask : X86Builtin<"_Vector<8, long long int>(_Vector<8, _Float16>, _Vector<8, long long int>, unsigned char, _Constant int)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vcvtuqq2ph128_mask : X86Builtin<"_Vector<8, _Float16>(_Vector<2, unsigned long long int>, _Vector<8, _Float16>, unsigned char)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vcvtuqq2ph256_mask : X86Builtin<"_Vector<8, _Float16>(_Vector<4, unsigned long long int>, _Vector<8, _Float16>, unsigned char)">;
+}
+
+let Features = "avx512fp16,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def vcvtuqq2ph512_mask : X86Builtin<"_Vector<8, _Float16>(_Vector<8, unsigned long long int>, _Vector<8, _Float16>, unsigned char, _Constant int)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vcvtph2uqq128_mask : X86Builtin<"_Vector<2, unsigned long long int>(_Vector<8, _Float16>, _Vector<2, unsigned long long int>, unsigned char)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vcvtph2uqq256_mask : X86Builtin<"_Vector<4, unsigned long long int>(_Vector<8, _Float16>, _Vector<4, unsigned long long int>, unsigned char)">;
+}
+
+let Features = "avx512fp16,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def vcvtph2uqq512_mask : X86Builtin<"_Vector<8, unsigned long long int>(_Vector<8, _Float16>, _Vector<8, unsigned long long int>, unsigned char, _Constant int)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vcvttph2qq128_mask : X86Builtin<"_Vector<2, long long int>(_Vector<8, _Float16>, _Vector<2, long long int>, unsigned char)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vcvttph2qq256_mask : X86Builtin<"_Vector<4, long long int>(_Vector<8, _Float16>, _Vector<4, long long int>, unsigned char)">;
+}
+
+let Features = "avx512fp16,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def vcvttph2qq512_mask : X86Builtin<"_Vector<8, long long int>(_Vector<8, _Float16>, _Vector<8, long long int>, unsigned char, _Constant int)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vcvttph2uqq128_mask : X86Builtin<"_Vector<2, unsigned long long int>(_Vector<8, _Float16>, _Vector<2, unsigned long long int>, unsigned char)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vcvttph2uqq256_mask : X86Builtin<"_Vector<4, unsigned long long int>(_Vector<8, _Float16>, _Vector<4, unsigned long long int>, unsigned char)">;
+}
+
+let Features = "avx512fp16,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def vcvttph2uqq512_mask : X86Builtin<"_Vector<8, unsigned long long int>(_Vector<8, _Float16>, _Vector<8, unsigned long long int>, unsigned char, _Constant int)">;
+}
+
+let Features = "avx512fp16", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vcvtsh2si32 : X86Builtin<"int(_Vector<8, _Float16>, _Constant int)">;
+  def vcvtsh2usi32 : X86Builtin<"unsigned int(_Vector<8, _Float16>, _Constant int)">;
+  def vcvtusi2sh : X86Builtin<"_Vector<8, _Float16>(_Vector<8, _Float16>, unsigned int, _Constant int)">;
+  def vcvtsi2sh : X86Builtin<"_Vector<8, _Float16>(_Vector<8, _Float16>, int, _Constant int)">;
+  def vcvttsh2si32 : X86Builtin<"int(_Vector<8, _Float16>, _Constant int)">;
+  def vcvttsh2usi32 : X86Builtin<"unsigned int(_Vector<8, _Float16>, _Constant int)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vcvtph2psx128_mask : X86Builtin<"_Vector<4, float>(_Vector<8, _Float16>, _Vector<4, float>, unsigned char)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vcvtph2psx256_mask : X86Builtin<"_Vector<8, float>(_Vector<8, _Float16>, _Vector<8, float>, unsigned char)">;
+}
+
+let Features = "avx512fp16,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def vcvtph2psx512_mask : X86Builtin<"_Vector<16, float>(_Vector<16, _Float16>, _Vector<16, float>, unsigned short, _Constant int)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vcvtps2phx128_mask : X86Builtin<"_Vector<8, _Float16>(_Vector<4, float>, _Vector<8, _Float16>, unsigned char)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vcvtps2phx256_mask : X86Builtin<"_Vector<8, _Float16>(_Vector<8, float>, _Vector<8, _Float16>, unsigned char)">;
+}
+
+let Features = "avx512fp16,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def vcvtps2phx512_mask : X86Builtin<"_Vector<16, _Float16>(_Vector<16, float>, _Vector<16, _Float16>, unsigned short, _Constant int)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vfmaddph : X86Builtin<"_Vector<8, _Float16>(_Vector<8, _Float16>, _Vector<8, _Float16>, _Vector<8, _Float16>)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vfmaddph256 : X86Builtin<"_Vector<16, _Float16>(_Vector<16, _Float16>, _Vector<16, _Float16>, _Vector<16, _Float16>)">;
+}
+
+let Features = "avx512fp16,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def vfmaddph512_mask : X86Builtin<"_Vector<32, _Float16>(_Vector<32, _Float16>, _Vector<32, _Float16>, _Vector<32, _Float16>, unsigned int, _Constant int)">;
+  def vfmaddph512_mask3 : X86Builtin<"_Vector<32, _Float16>(_Vector<32, _Float16>, _Vector<32, _Float16>, _Vector<32, _Float16>, unsigned int, _Constant int)">;
+  def vfmaddph512_maskz : X86Builtin<"_Vector<32, _Float16>(_Vector<32, _Float16>, _Vector<32, _Float16>, _Vector<32, _Float16>, unsigned int, _Constant int)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vfmaddsubph : X86Builtin<"_Vector<8, _Float16>(_Vector<8, _Float16>, _Vector<8, _Float16>, _Vector<8, _Float16>)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vfmaddsubph256 : X86Builtin<"_Vector<16, _Float16>(_Vector<16, _Float16>, _Vector<16, _Float16>, _Vector<16, _Float16>)">;
+}
+
+let Features = "avx512fp16,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def vfmaddsubph512_mask : X86Builtin<"_Vector<32, _Float16>(_Vector<32, _Float16>, _Vector<32, _Float16>, _Vector<32, _Float16>, unsigned int, _Constant int)">;
+  def vfmaddsubph512_maskz : X86Builtin<"_Vector<32, _Float16>(_Vector<32, _Float16>, _Vector<32, _Float16>, _Vector<32, _Float16>, unsigned int, _Constant int)">;
+  def vfmaddsubph512_mask3 : X86Builtin<"_Vector<32, _Float16>(_Vector<32, _Float16>, _Vector<32, _Float16>, _Vector<32, _Float16>, unsigned int, _Constant int)">;
+  def vfmsubaddph512_mask3 : X86Builtin<"_Vector<32, _Float16>(_Vector<32, _Float16>, _Vector<32, _Float16>, _Vector<32, _Float16>, unsigned int, _Constant int)">;
+  def vfmsubph512_mask3 : X86Builtin<"_Vector<32, _Float16>(_Vector<32, _Float16>, _Vector<32, _Float16>, _Vector<32, _Float16>, unsigned int, _Constant int)">;
+}
+
+let Features = "avx512fp16", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vfmaddsh3_mask : X86Builtin<"_Vector<8, _Float16>(_Vector<8, _Float16>, _Vector<8, _Float16>, _Vector<8, _Float16>, unsigned char, _Constant int)">;
+  def vfmaddsh3_maskz : X86Builtin<"_Vector<8, _Float16>(_Vector<8, _Float16>, _Vector<8, _Float16>, _Vector<8, _Float16>, unsigned char, _Constant int)">;
+  def vfmaddsh3_mask3 : X86Builtin<"_Vector<8, _Float16>(_Vector<8, _Float16>, _Vector<8, _Float16>, _Vector<8, _Float16>, unsigned char, _Constant int)">;
+  def vfmsubsh3_mask3 : X86Builtin<"_Vector<8, _Float16>(_Vector<8, _Float16>, _Vector<8, _Float16>, _Vector<8, _Float16>, unsigned char, _Constant int)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vfmaddcph128_mask : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Vector<4, float>, unsigned char)">;
+  def vfmaddcph128_maskz : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Vector<4, float>, unsigned char)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vfmaddcph256_mask : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, float>, _Vector<8, float>, unsigned char)">;
+  def vfmaddcph256_maskz : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, float>, _Vector<8, float>, unsigned char)">;
+}
+
+let Features = "avx512fp16,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def vfmaddcph512_mask : X86Builtin<"_Vector<16, float>(_Vector<16, float>, _Vector<16, float>, _Vector<16, float>, unsigned short, _Constant int)">;
+  def vfmaddcph512_maskz : X86Builtin<"_Vector<16, float>(_Vector<16, float>, _Vector<16, float>, _Vector<16, float>, unsigned short, _Constant int)">;
+  def vfmaddcph512_mask3 : X86Builtin<"_Vector<16, float>(_Vector<16, float>, _Vector<16, float>, _Vector<16, float>, unsigned short, _Constant int)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vfcmaddcph128_mask : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Vector<4, float>, unsigned char)">;
+  def vfcmaddcph128_maskz : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Vector<4, float>, unsigned char)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vfcmaddcph256_mask : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, float>, _Vector<8, float>, unsigned char)">;
+  def vfcmaddcph256_maskz : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, float>, _Vector<8, float>, unsigned char)">;
+}
+
+let Features = "avx512fp16,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def vfcmaddcph512_mask : X86Builtin<"_Vector<16, float>(_Vector<16, float>, _Vector<16, float>, _Vector<16, float>, unsigned short, _Constant int)">;
+  def vfcmaddcph512_maskz : X86Builtin<"_Vector<16, float>(_Vector<16, float>, _Vector<16, float>, _Vector<16, float>, unsigned short, _Constant int)">;
+  def vfcmaddcph512_mask3 : X86Builtin<"_Vector<16, float>(_Vector<16, float>, _Vector<16, float>, _Vector<16, float>, unsigned short, _Constant int)">;
+}
+
+let Features = "avx512fp16", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vfmaddcsh_mask : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Vector<4, float>, unsigned char, _Constant int)">;
+  def vfmaddcsh_maskz : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Vector<4, float>, unsigned char, _Constant int)">;
+  def vfcmaddcsh_mask : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Vector<4, float>, unsigned char, _Constant int)">;
+  def vfcmaddcsh_maskz : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Vector<4, float>, unsigned char, _Constant int)">;
+  def vfmaddcsh_round_mask : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Vector<4, float>, unsigned char, _Constant int)">;
+  def vfmaddcsh_round_mask3 : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Vector<4, float>, unsigned char, _Constant int)">;
+  def vfcmaddcsh_round_mask : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Vector<4, float>, unsigned char, _Constant int)">;
+  def vfcmaddcsh_round_mask3 : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Vector<4, float>, unsigned char, _Constant int)">;
+  def vfmulcsh_mask : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Vector<4, float>, unsigned char, _Constant int)">;
+  def vfcmulcsh_mask : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Vector<4, float>, unsigned char, _Constant int)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vfmulcph128_mask : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Vector<4, float>, unsigned char)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vfmulcph256_mask : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, float>, _Vector<8, float>, unsigned char)">;
+}
+
+let Features = "avx512fp16,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def vfmulcph512_mask : X86Builtin<"_Vector<16, float>(_Vector<16, float>, _Vector<16, float>, _Vector<16, float>, unsigned short, _Constant int)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vfcmulcph128_mask : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Vector<4, float>, unsigned char)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vfcmulcph256_mask : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, float>, _Vector<8, float>, unsigned char)">;
+}
+
+let Features = "avx512fp16,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def vfcmulcph512_mask : X86Builtin<"_Vector<16, float>(_Vector<16, float>, _Vector<16, float>, _Vector<16, float>, unsigned short, _Constant int)">;
+}
+
+let Features = "avx512bw,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def selectb_128 : X86Builtin<"_Vector<16, char>(unsigned short, _Vector<16, char>, _Vector<16, char>)">;
+}
+
+let Features = "avx512bw,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def selectb_256 : X86Builtin<"_Vector<32, char>(unsigned int, _Vector<32, char>, _Vector<32, char>)">;
+}
+
+let Features = "avx512bw,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def selectb_512 : X86Builtin<"_Vector<64, char>(unsigned long long int, _Vector<64, char>, _Vector<64, char>)">;
+}
+
+let Features = "avx512bw,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def selectw_128 : X86Builtin<"_Vector<8, short>(unsigned char, _Vector<8, short>, _Vector<8, short>)">;
+}
+
+let Features = "avx512bw,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def selectw_256 : X86Builtin<"_Vector<16, short>(unsigned short, _Vector<16, short>, _Vector<16, short>)">;
+}
+
+let Features = "avx512bw,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def selectw_512 : X86Builtin<"_Vector<32, short>(unsigned int, _Vector<32, short>, _Vector<32, short>)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def selectd_128 : X86Builtin<"_Vector<4, int>(unsigned char, _Vector<4, int>, _Vector<4, int>)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def selectd_256 : X86Builtin<"_Vector<8, int>(unsigned char, _Vector<8, int>, _Vector<8, int>)">;
+}
+
+let Features = "avx512f,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def selectd_512 : X86Builtin<"_Vector<16, int>(unsigned short, _Vector<16, int>, _Vector<16, int>)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def selectph_128 : X86Builtin<"_Vector<8, _Float16>(unsigned char, _Vector<8, _Float16>, _Vector<8, _Float16>)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def selectph_256 : X86Builtin<"_Vector<16, _Float16>(unsigned short, _Vector<16, _Float16>, _Vector<16, _Float16>)">;
+}
+
+let Features = "avx512fp16,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def selectph_512 : X86Builtin<"_Vector<32, _Float16>(unsigned int, _Vector<32, _Float16>, _Vector<32, _Float16>)">;
+}
+
+let Features = "avx512bf16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def selectpbf_128 : X86Builtin<"_Vector<8, __bf16>(unsigned char, _Vector<8, __bf16>, _Vector<8, __bf16>)">;
+}
+
+let Features = "avx512bf16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def selectpbf_256 : X86Builtin<"_Vector<16, __bf16>(unsigned short, _Vector<16, __bf16>, _Vector<16, __bf16>)">;
+}
+
+let Features = "avx512bf16,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def selectpbf_512 : X86Builtin<"_Vector<32, __bf16>(unsigned int, _Vector<32, __bf16>, _Vector<32, __bf16>)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def selectq_128 : X86Builtin<"_Vector<2, long long int>(unsigned char, _Vector<2, long long int>, _Vector<2, long long int>)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def selectq_256 : X86Builtin<"_Vector<4, long long int>(unsigned char, _Vector<4, long long int>, _Vector<4, long long int>)">;
+}
+
+let Features = "avx512f,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def selectq_512 : X86Builtin<"_Vector<8, long long int>(unsigned char, _Vector<8, long long int>, _Vector<8, long long int>)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def selectps_128 : X86Builtin<"_Vector<4, float>(unsigned char, _Vector<4, float>, _Vector<4, float>)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def selectps_256 : X86Builtin<"_Vector<8, float>(unsigned char, _Vector<8, float>, _Vector<8, float>)">;
+}
+
+let Features = "avx512f,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def selectps_512 : X86Builtin<"_Vector<16, float>(unsigned short, _Vector<16, float>, _Vector<16, float>)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def selectpd_128 : X86Builtin<"_Vector<2, double>(unsigned char, _Vector<2, double>, _Vector<2, double>)">;
+}
+
+let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def selectpd_256 : X86Builtin<"_Vector<4, double>(unsigned char, _Vector<4, double>, _Vector<4, double>)">;
+}
+
+let Features = "avx512f,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def selectpd_512 : X86Builtin<"_Vector<8, double>(unsigned char, _Vector<8, double>, _Vector<8, double>)">;
+}
+
+let Features = "avx512fp16", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def selectsh_128 : X86Builtin<"_Vector<8, _Float16>(unsigned char, _Vector<8, _Float16>, _Vector<8, _Float16>)">;
+}
+
+let Features = "avx512bf16", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def selectsbf_128 : X86Builtin<"_Vector<8, __bf16>(unsigned char, _Vector<8, __bf16>, _Vector<8, __bf16>)">;
+}
+
+let Features = "avx512f", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def selectss_128 : X86Builtin<"_Vector<4, float>(unsigned char, _Vector<4, float>, _Vector<4, float>)">;
+  def selectsd_128 : X86Builtin<"_Vector<2, double>(unsigned char, _Vector<2, double>, _Vector<2, double>)">;
+}
+
+let Features = "avx512f,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def reduce_fadd_pd512 : X86Builtin<"double(double, _Vector<8, double>)">;
+  def reduce_fadd_ps512 : X86Builtin<"float(float, _Vector<16, float>)">;
+}
+
+let Features = "avx512fp16,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def reduce_fadd_ph512 : X86Builtin<"_Float16(_Float16, _Vector<32, _Float16>)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def reduce_fadd_ph256 : X86Builtin<"_Float16(_Float16, _Vector<16, _Float16>)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def reduce_fadd_ph128 : X86Builtin<"_Float16(_Float16, _Vector<8, _Float16>)">;
+}
+
+let Features = "avx512f,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def reduce_fmax_pd512 : X86Builtin<"double(_Vector<8, double>)">;
+  def reduce_fmax_ps512 : X86Builtin<"float(_Vector<16, float>)">;
+}
+
+let Features = "avx512fp16,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def reduce_fmax_ph512 : X86Builtin<"_Float16(_Vector<32, _Float16>)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def reduce_fmax_ph256 : X86Builtin<"_Float16(_Vector<16, _Float16>)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def reduce_fmax_ph128 : X86Builtin<"_Float16(_Vector<8, _Float16>)">;
+}
+
+let Features = "avx512f,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def reduce_fmin_pd512 : X86Builtin<"double(_Vector<8, double>)">;
+  def reduce_fmin_ps512 : X86Builtin<"float(_Vector<16, float>)">;
+}
+
+let Features = "avx512fp16,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def reduce_fmin_ph512 : X86Builtin<"_Float16(_Vector<32, _Float16>)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def reduce_fmin_ph256 : X86Builtin<"_Float16(_Vector<16, _Float16>)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def reduce_fmin_ph128 : X86Builtin<"_Float16(_Vector<8, _Float16>)">;
+}
+
+let Features = "avx512f,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def reduce_fmul_pd512 : X86Builtin<"double(double, _Vector<8, double>)">;
+  def reduce_fmul_ps512 : X86Builtin<"float(float, _Vector<16, float>)">;
+}
+
+let Features = "avx512fp16,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def reduce_fmul_ph512 : X86Builtin<"_Float16(_Float16, _Vector<32, _Float16>)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def reduce_fmul_ph256 : X86Builtin<"_Float16(_Float16, _Vector<16, _Float16>)">;
+}
+
+let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def reduce_fmul_ph128 : X86Builtin<"_Float16(_Float16, _Vector<8, _Float16>)">;
+}
+
+let Features = "mwaitx", Attributes = [NoThrow] in {
+  def monitorx : X86Builtin<"void(void const *, unsigned int, unsigned int)">;
+  def mwaitx : X86Builtin<"void(unsigned int, unsigned int, unsigned int)">;
+}
+
+let Features = "waitpkg", Attributes = [NoThrow] in {
+  def umonitor : X86Builtin<"void(void const *)">;
+  def umwait : X86Builtin<"unsigned char(unsigned int, unsigned int, unsigned int)">;
+  def tpause : X86Builtin<"unsigned char(unsigned int, unsigned int, unsigned int)">;
+}
+
+let Features = "clzero", Attributes = [NoThrow] in {
+  def clzero : X86Builtin<"void(void *)">;
+}
+
+let Features = "cldemote", Attributes = [NoThrow] in {
+  def cldemote : X86Builtin<"void(void const *)">;
+}
+
+let Features = "movdiri", Attributes = [NoThrow] in {
+  def directstore_u32 : X86Builtin<"void(unsigned int *, unsigned int)">;
+}
+
+let Features = "movdir64b", Attributes = [NoThrow] in {
+  def movdir64b : X86Builtin<"void(void *, void const *)">;
+}
+
+let Features = "ptwrite", Attributes = [NoThrow] in {
+  def ptwrite32 : X86Builtin<"void(unsigned int)">;
+}
+
+let Features = "invpcid", Attributes = [NoThrow, Const] in {
+  def invpcid : X86Builtin<"void(unsigned int, void *)">;
+}
+
+let Features = "enqcmd", Attributes = [NoThrow] in {
+  def enqcmd : X86Builtin<"unsigned char(void *, void const *)">;
+  def enqcmds : X86Builtin<"unsigned char(void *, void const *)">;
+}
+
+let Features = "kl", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def loadiwkey : X86Builtin<"void(_Vector<2, long long int>, _Vector<2, long long int>, _Vector<2, long long int>, unsigned int)">;
+  def encodekey128_u32 : X86Builtin<"unsigned int(unsigned int, _Vector<2, long long int>, void *)">;
+  def encodekey256_u32 : X86Builtin<"unsigned int(unsigned int, _Vector<2, long long int>, _Vector<2, long long int>, void *)">;
+  def aesenc128kl_u8 : X86Builtin<"unsigned char(_Vector<2, long long int *>, _Vector<2, long long int>, void const *)">;
+  def aesenc256kl_u8 : X86Builtin<"unsigned char(_Vector<2, long long int *>, _Vector<2, long long int>, void const *)">;
+  def aesdec128kl_u8 : X86Builtin<"unsigned char(_Vector<2, long long int *>, _Vector<2, long long int>, void const *)">;
+  def aesdec256kl_u8 : X86Builtin<"unsigned char(_Vector<2, long long int *>, _Vector<2, long long int>, void const *)">;
+}
+
+let Features = "kl,widekl", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def aesencwide128kl_u8 : X86Builtin<"unsigned char(_Vector<2, long long int *>, _Vector<2, long long int const *>, void const *)">;
+  def aesencwide256kl_u8 : X86Builtin<"unsigned char(_Vector<2, long long int *>, _Vector<2, long long int const *>, void const *)">;
+  def aesdecwide128kl_u8 : X86Builtin<"unsigned char(_Vector<2, long long int *>, _Vector<2, long long int const *>, void const *)">;
+  def aesdecwide256kl_u8 : X86Builtin<"unsigned char(_Vector<2, long long int *>, _Vector<2, long long int const *>, void const *)">;
+}
+
+let Features = "serialize", Attributes = [NoThrow] in {
+  def serialize : X86Builtin<"void()">;
+}
+
+let Features = "tsxldtrk", Attributes = [NoThrow] in {
+  def xsusldtrk : X86Builtin<"void()">;
+  def xresldtrk : X86Builtin<"void()">;
+}
+
+let Features = "raoint", Attributes = [NoThrow] in {
+  def aadd32 : X86Builtin<"void(void *, signed int)">;
+  def aand32 : X86Builtin<"void(void *, signed int)">;
+  def aor32 : X86Builtin<"void(void *, signed int)">;
+  def axor32 : X86Builtin<"void(void *, signed int)">;
+}
+
+let Header = "intrin.h", Languages = "ALL_MS_LANGUAGES", Attributes = [NoThrow, RequireDeclaration] in {
+  def _BitScanForward : X86LibBuiltin<"unsigned char(msuint32_t *, msuint32_t)">;
+  def _BitScanReverse : X86LibBuiltin<"unsigned char(msuint32_t *, msuint32_t)">;
+  def _ReadWriteBarrier : X86LibBuiltin<"void()">;
+  def _ReadBarrier : X86LibBuiltin<"void()">;
+  def _WriteBarrier : X86LibBuiltin<"void()">;
+  def __cpuid : X86LibBuiltin<"void(int *, int)">;
+  def __cpuidex : X86LibBuiltin<"void(int *, int, int)">;
+}
+
+let Header = "intrin.h", Languages = "ALL_MS_LANGUAGES", Attributes = [NoThrow, Const, RequireDeclaration] in {
+  def __emul : X86LibBuiltin<"long long int(int, int)">;
+  def __emulu : X86LibBuiltin<"unsigned long long int(unsigned int, unsigned int)">;
+}
+
+let Header = "intrin.h", Languages = "ALL_MS_LANGUAGES", Attributes = [NoThrow, RequireDeclaration] in {
+  def _AddressOfReturnAddress : X86LibBuiltin<"void *()">;
+  def __stosb : X86LibBuiltin<"void(unsigned char *, unsigned char, size_t)">;
+}
+
+let Header = "intrin.h", Languages = "ALL_MS_LANGUAGES", Attributes = [NoThrow, RequireDeclaration, NoReturn] in {
+  def __int2c : X86LibBuiltin<"void()">;
+  def __ud2 : X86LibBuiltin<"void()">;
+}
+
+let Header = "intrin.h", Languages = "ALL_MS_LANGUAGES", Attributes = [NoThrow, RequireDeclaration] in {
+  def __readfsbyte : X86LibBuiltin<"unsigned char(msuint32_t)">;
+  def __readfsword : X86LibBuiltin<"unsigned short(msuint32_t)">;
+  def __readfsdword : X86LibBuiltin<"msuint32_t(msuint32_t)">;
+  def __readfsqword : X86LibBuiltin<"unsigned long long int(msuint32_t)">;
+  def __readgsbyte : X86LibBuiltin<"unsigned char(msuint32_t)">;
+  def __readgsword : X86LibBuiltin<"unsigned short(msuint32_t)">;
+  def __readgsdword : X86LibBuiltin<"msuint32_t(msuint32_t)">;
+  def __readgsqword : X86LibBuiltin<"unsigned long long int(msuint32_t)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vdpphps128 : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<8, _Float16>, _Vector<8, _Float16>)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vdpphps256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<16, _Float16>, _Vector<16, _Float16>)">;
+}
+
+let Features = "avx10.2-512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def vdpphps512 : X86Builtin<"_Vector<16, float>(_Vector<16, float>, _Vector<32, _Float16>, _Vector<32, _Float16>)">;
+  def vpdpbssd512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<16, int>, _Vector<16, int>)">;
+  def vpdpbssds512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<16, int>, _Vector<16, int>)">;
+  def vpdpbsud512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<16, int>, _Vector<16, int>)">;
+  def vpdpbsuds512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<16, int>, _Vector<16, int>)">;
+  def vpdpbuud512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<16, int>, _Vector<16, int>)">;
+  def vpdpbuuds512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<16, int>, _Vector<16, int>)">;
+}
+
+let Features = "avx10.2-512", Attributes = [NoThrow, RequiredVectorWidth<512>] in {
+  def vpdpwsud512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<16, int>, _Vector<16, int>)">;
+  def vpdpwsuds512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<16, int>, _Vector<16, int>)">;
+  def vpdpwusd512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<16, int>, _Vector<16, int>)">;
+  def vpdpwusds512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<16, int>, _Vector<16, int>)">;
+  def vpdpwuud512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<16, int>, _Vector<16, int>)">;
+  def vpdpwuuds512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<16, int>, _Vector<16, int>)">;
+}
+
+let Features = "avx10.2-512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def mpsadbw512 : X86Builtin<"_Vector<32, short>(_Vector<64, char>, _Vector<64, char>, _Constant char)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def vaddpd256_round : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Vector<4, double>, _Constant int)">;
+  def vaddph256_round : X86Builtin<"_Vector<16, _Float16>(_Vector<16, _Float16>, _Vector<16, _Float16>, _Constant int)">;
+  def vaddps256_round : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, float>, _Constant int)">;
+  def vcmppd256_round_mask : X86Builtin<"unsigned char(_Vector<4, double>, _Vector<4, double>, _Constant int, unsigned char, _Constant int)">;
+  def vcmpph256_round_mask : X86Builtin<"unsigned short(_Vector<16, _Float16>, _Vector<16, _Float16>, _Constant int, unsigned short, _Constant int)">;
+  def vcmpps256_round_mask : X86Builtin<"unsigned char(_Vector<8, float>, _Vector<8, float>, _Constant int, unsigned char, _Constant int)">;
+  def vcvtdq2ph256_round_mask : X86Builtin<"_Vector<8, _Float16>(_Vector<8, int>, _Vector<8, _Float16>, unsigned char, _Constant int)">;
+  def vcvtdq2ps256_round_mask : X86Builtin<"_Vector<8, float>(_Vector<8, int>, _Vector<8, float>, unsigned char, _Constant int)">;
+  def vcvtpd2dq256_round_mask : X86Builtin<"_Vector<4, int>(_Vector<4, double>, _Vector<4, int>, unsigned char, _Constant int)">;
+  def vcvtpd2ph256_round_mask : X86Builtin<"_Vector<8, _Float16>(_Vector<4, double>, _Vector<8, _Float16>, unsigned char, _Constant int)">;
+  def vcvtpd2ps256_round_mask : X86Builtin<"_Vector<4, float>(_Vector<4, double>, _Vector<4, float>, unsigned char, _Constant int)">;
+  def vcvtpd2qq256_round_mask : X86Builtin<"_Vector<4, long long int>(_Vector<4, double>, _Vector<4, long long int>, unsigned char, _Constant int)">;
+  def vcvtpd2udq256_round_mask : X86Builtin<"_Vector<4, unsigned int>(_Vector<4, double>, _Vector<4, unsigned int>, unsigned char, _Constant int)">;
+  def vcvtpd2uqq256_round_mask : X86Builtin<"_Vector<4, unsigned long long int>(_Vector<4, double>, _Vector<4, unsigned long long int>, unsigned char, _Constant int)">;
+  def vcvtph2dq256_round_mask : X86Builtin<"_Vector<8, int>(_Vector<8, _Float16>, _Vector<8, int>, unsigned char, _Constant int)">;
+  def vcvtph2pd256_round_mask : X86Builtin<"_Vector<4, double>(_Vector<8, _Float16>, _Vector<4, double>, unsigned char, _Constant int)">;
+  def vcvtph2psx256_round_mask : X86Builtin<"_Vector<8, float>(_Vector<8, _Float16>, _Vector<8, float>, unsigned char, _Constant int)">;
+  def vcvtph2qq256_round_mask : X86Builtin<"_Vector<4, long long int>(_Vector<8, _Float16>, _Vector<4, long long int>, unsigned char, _Constant int)">;
+  def vcvtph2udq256_round_mask : X86Builtin<"_Vector<8, unsigned int>(_Vector<8, _Float16>, _Vector<8, unsigned int>, unsigned char, _Constant int)">;
+  def vcvtph2uqq256_round_mask : X86Builtin<"_Vector<4, unsigned long long int>(_Vector<8, _Float16>, _Vector<4, unsigned long long int>, unsigned char, _Constant int)">;
+  def vcvtph2uw256_round_mask : X86Builtin<"_Vector<16, unsigned short>(_Vector<16, _Float16>, _Vector<16, unsigned short>, unsigned short, _Constant int)">;
+  def vcvtph2w256_round_mask : X86Builtin<"_Vector<16, short>(_Vector<16, _Float16>, _Vector<16, short>, unsigned short, _Constant int)">;
+  def vcvtps2dq256_round_mask : X86Builtin<"_Vector<8, int>(_Vector<8, float>, _Vector<8, int>, unsigned char, _Constant int)">;
+  def vcvtps2pd256_round_mask : X86Builtin<"_Vector<4, double>(_Vector<4, float>, _Vector<4, double>, unsigned char, _Constant int)">;
+  def vcvtps2phx256_round_mask : X86Builtin<"_Vector<8, _Float16>(_Vector<8, float>, _Vector<8, _Float16>, unsigned char, _Constant int)">;
+  def vcvtps2qq256_round_mask : X86Builtin<"_Vector<4, long long int>(_Vector<4, float>, _Vector<4, long long int>, unsigned char, _Constant int)">;
+  def vcvtps2udq256_round_mask : X86Builtin<"_Vector<8, unsigned int>(_Vector<8, float>, _Vector<8, unsigned int>, unsigned char, _Constant int)">;
+  def vcvtps2uqq256_round_mask : X86Builtin<"_Vector<4, unsigned long long int>(_Vector<4, float>, _Vector<4, unsigned long long int>, unsigned char, _Constant int)">;
+  def vcvtqq2pd256_round_mask : X86Builtin<"_Vector<4, double>(_Vector<4, long long int>, _Vector<4, double>, unsigned char, _Constant int)">;
+  def vcvtqq2ph256_round_mask : X86Builtin<"_Vector<8, _Float16>(_Vector<4, long long int>, _Vector<8, _Float16>, unsigned char, _Constant int)">;
+  def vcvtqq2ps256_round_mask : X86Builtin<"_Vector<4, float>(_Vector<4, long long int>, _Vector<4, float>, unsigned char, _Constant int)">;
+  def vcvttpd2dq256_round_mask : X86Builtin<"_Vector<4, int>(_Vector<4, double>, _Vector<4, int>, unsigned char, _Constant int)">;
+  def vcvttpd2qq256_round_mask : X86Builtin<"_Vector<4, long long int>(_Vector<4, double>, _Vector<4, long long int>, unsigned char, _Constant int)">;
+  def vcvttpd2udq256_round_mask : X86Builtin<"_Vector<4, unsigned int>(_Vector<4, double>, _Vector<4, unsigned int>, unsigned char, _Constant int)">;
+  def vcvttpd2uqq256_round_mask : X86Builtin<"_Vector<4, unsigned long long int>(_Vector<4, double>, _Vector<4, unsigned long long int>, unsigned char, _Constant int)">;
+  def vcvttph2dq256_round_mask : X86Builtin<"_Vector<8, int>(_Vector<8, _Float16>, _Vector<8, int>, unsigned char, _Constant int)">;
+  def vcvttph2qq256_round_mask : X86Builtin<"_Vector<4, long long int>(_Vector<8, _Float16>, _Vector<4, long long int>, unsigned char, _Constant int)">;
+  def vcvttph2udq256_round_mask : X86Builtin<"_Vector<8, unsigned int>(_Vector<8, _Float16>, _Vector<8, unsigned int>, unsigned char, _Constant int)">;
+  def vcvttph2uqq256_round_mask : X86Builtin<"_Vector<4, unsigned long long int>(_Vector<8, _Float16>, _Vector<4, unsigned long long int>, unsigned char, _Constant int)">;
+  def vcvttph2uw256_round_mask : X86Builtin<"_Vector<16, unsigned short>(_Vector<16, _Float16>, _Vector<16, unsigned short>, unsigned short, _Constant int)">;
+  def vcvttph2w256_round_mask : X86Builtin<"_Vector<16, short>(_Vector<16, _Float16>, _Vector<16, short>, unsigned short, _Constant int)">;
+  def vcvttps2dq256_round_mask : X86Builtin<"_Vector<8, int>(_Vector<8, float>, _Vector<8, int>, unsigned char, _Constant int)">;
+  def vcvttps2qq256_round_mask : X86Builtin<"_Vector<4, long long int>(_Vector<4, float>, _Vector<4, long long int>, unsigned char, _Constant int)">;
+  def vcvttps2udq256_round_mask : X86Builtin<"_Vector<8, unsigned int>(_Vector<8, float>, _Vector<8, unsigned int>, unsigned char, _Constant int)">;
+  def vcvttps2uqq256_round_mask : X86Builtin<"_Vector<4, unsigned long long int>(_Vector<4, float>, _Vector<4, unsigned long long int>, unsigned char, _Constant int)">;
+  def vcvtudq2ph256_round_mask : X86Builtin<"_Vector<8, _Float16>(_Vector<8, unsigned int>, _Vector<8, _Float16>, unsigned char, _Constant int)">;
+  def vcvtudq2ps256_round_mask : X86Builtin<"_Vector<8, float>(_Vector<8, unsigned int>, _Vector<8, float>, unsigned char, _Constant int)">;
+  def vcvtuqq2pd256_round_mask : X86Builtin<"_Vector<4, double>(_Vector<4, unsigned long long int>, _Vector<4, double>, unsigned char, _Constant int)">;
+  def vcvtuqq2ph256_round_mask : X86Builtin<"_Vector<8, _Float16>(_Vector<4, unsigned long long int>, _Vector<8, _Float16>, unsigned char, _Constant int)">;
+  def vcvtuqq2ps256_round_mask : X86Builtin<"_Vector<4, float>(_Vector<4, unsigned long long int>, _Vector<4, float>, unsigned char, _Constant int)">;
+  def vcvtuw2ph256_round_mask : X86Builtin<"_Vector<16, _Float16>(_Vector<16, unsigned short>, _Vector<16, _Float16>, unsigned short, _Constant int)">;
+  def vcvtw2ph256_round_mask : X86Builtin<"_Vector<16, _Float16>(_Vector<16, short>, _Vector<16, _Float16>, unsigned short, _Constant int)">;
+  def vdivpd256_round : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Vector<4, double>, _Constant int)">;
+  def vdivph256_round : X86Builtin<"_Vector<16, _Float16>(_Vector<16, _Float16>, _Vector<16, _Float16>, _Constant int)">;
+  def vdivps256_round : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, float>, _Constant int)">;
+  def vfcmaddcph256_round_mask : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, float>, _Vector<8, float>, unsigned char, _Constant int)">;
+  def vfcmaddcph256_round_maskz : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, float>, _Vector<8, float>, unsigned char, _Constant int)">;
+  def vfcmaddcph256_round_mask3 : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, float>, _Vector<8, float>, unsigned char, _Constant int)">;
+  def vfcmulcph256_round_mask : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, float>, _Vector<8, float>, unsigned char, _Constant int)">;
+  def vfixupimmpd256_round_mask : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Vector<4, double>, _Vector<4, long long int>, _Constant int, unsigned char, _Constant int)">;
+  def vfixupimmpd256_round_maskz : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Vector<4, double>, _Vector<4, long long int>, _Constant int, unsigned char, _Constant int)">;
+  def vfixupimmps256_round_mask : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, float>, _Vector<8, int>, _Constant int, unsigned char, _Constant int)">;
+  def vfixupimmps256_round_maskz : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, float>, _Vector<8, int>, _Constant int, unsigned char, _Constant int)">;
+  def vfmaddpd256_round_mask : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Vector<4, double>, _Vector<4, double>, unsigned char, _Constant int)">;
+  def vfmaddpd256_round_maskz : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Vector<4, double>, _Vector<4, double>, unsigned char, _Constant int)">;
+  def vfmaddpd256_round_mask3 : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Vector<4, double>, _Vector<4, double>, unsigned char, _Constant int)">;
+  def vfmaddph256_round_mask : X86Builtin<"_Vector<16, _Float16>(_Vector<16, _Float16>, _Vector<16, _Float16>, _Vector<16, _Float16>, unsigned short, _Constant int)">;
+  def vfmaddph256_round_maskz : X86Builtin<"_Vector<16, _Float16>(_Vector<16, _Float16>, _Vector<16, _Float16>, _Vector<16, _Float16>, unsigned short, _Constant int)">;
+  def vfmaddph256_round_mask3 : X86Builtin<"_Vector<16, _Float16>(_Vector<16, _Float16>, _Vector<16, _Float16>, _Vector<16, _Float16>, unsigned short, _Constant int)">;
+  def vfmaddps256_round_mask : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, float>, _Vector<8, float>, unsigned char, _Constant int)">;
+  def vfmaddps256_round_maskz : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, float>, _Vector<8, float>, unsigned char, _Constant int)">;
+  def vfmaddps256_round_mask3 : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, float>, _Vector<8, float>, unsigned char, _Constant int)">;
+  def vfmaddcph256_round_mask : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, float>, _Vector<8, float>, unsigned char, _Constant int)">;
+  def vfmaddcph256_round_maskz : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, float>, _Vector<8, float>, unsigned char, _Constant int)">;
+  def vfmaddcph256_round_mask3 : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, float>, _Vector<8, float>, unsigned char, _Constant int)">;
+  def vfmaddsubpd256_round_mask : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Vector<4, double>, _Vector<4, double>, unsigned char, _Constant int)">;
+  def vfmaddsubpd256_round_maskz : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Vector<4, double>, _Vector<4, double>, unsigned char, _Constant int)">;
+  def vfmaddsubpd256_round_mask3 : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Vector<4, double>, _Vector<4, double>, unsigned char, _Constant int)">;
+  def vfmaddsubph256_round_mask : X86Builtin<"_Vector<16, _Float16>(_Vector<16, _Float16>, _Vector<16, _Float16>, _Vector<16, _Float16>, unsigned short, _Constant int)">;
+  def vfmaddsubph256_round_maskz : X86Builtin<"_Vector<16, _Float16>(_Vector<16, _Float16>, _Vector<16, _Float16>, _Vector<16, _Float16>, unsigned short, _Constant int)">;
+  def vfmaddsubph256_round_mask3 : X86Builtin<"_Vector<16, _Float16>(_Vector<16, _Float16>, _Vector<16, _Float16>, _Vector<16, _Float16>, unsigned short, _Constant int)">;
+  def vfmaddsubps256_round_mask : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, float>, _Vector<8, float>, unsigned char, _Constant int)">;
+  def vfmaddsubps256_round_maskz : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, float>, _Vector<8, float>, unsigned char, _Constant int)">;
+  def vfmaddsubps256_round_mask3 : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, float>, _Vector<8, float>, unsigned char, _Constant int)">;
+  def vfmsubpd256_round_mask3 : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Vector<4, double>, _Vector<4, double>, unsigned char, _Constant int)">;
+  def vfmsubph256_round_mask3 : X86Builtin<"_Vector<16, _Float16>(_Vector<16, _Float16>, _Vector<16, _Float16>, _Vector<16, _Float16>, unsigned short, _Constant int)">;
+  def vfmsubps256_round_mask3 : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, float>, _Vector<8, float>, unsigned char, _Constant int)">;
+  def vfmsubaddpd256_round_mask3 : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Vector<4, double>, _Vector<4, double>, unsigned char, _Constant int)">;
+  def vfmsubaddph256_round_mask3 : X86Builtin<"_Vector<16, _Float16>(_Vector<16, _Float16>, _Vector<16, _Float16>, _Vector<16, _Float16>, unsigned short, _Constant int)">;
+  def vfmsubaddps256_round_mask3 : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, float>, _Vector<8, float>, unsigned char, _Constant int)">;
+  def vfmulcph256_round_mask : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, float>, _Vector<8, float>, unsigned char, _Constant int)">;
+  def vgetexppd256_round_mask : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Vector<4, double>, unsigned char, _Constant int)">;
+  def vgetexpph256_round_mask : X86Builtin<"_Vector<16, _Float16>(_Vector<16, _Float16>, _Vector<16, _Float16>, unsigned short, _Constant int)">;
+  def vgetexpps256_round_mask : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, float>, unsigned char, _Constant int)">;
+  def vgetmantpd256_round_mask : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Constant int, _Vector<4, double>, unsigned char, _Constant int)">;
+  def vgetmantph256_round_mask : X86Builtin<"_Vector<16, _Float16>(_Vector<16, _Float16>, _Constant int, _Vector<16, _Float16>, unsigned short, _Constant int)">;
+  def vgetmantps256_round_mask : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Constant int, _Vector<8, float>, unsigned char, _Constant int)">;
+  def vmaxpd256_round : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Vector<4, double>, _Constant int)">;
+  def vmaxph256_round : X86Builtin<"_Vector<16, _Float16>(_Vector<16, _Float16>, _Vector<16, _Float16>, _Constant int)">;
+  def vmaxps256_round : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, float>, _Constant int)">;
+  def vminpd256_round : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Vector<4, double>, _Constant int)">;
+  def vminph256_round : X86Builtin<"_Vector<16, _Float16>(_Vector<16, _Float16>, _Vector<16, _Float16>, _Constant int)">;
+  def vminps256_round : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, float>, _Constant int)">;
+  def vmulpd256_round : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Vector<4, double>, _Constant int)">;
+  def vmulph256_round : X86Builtin<"_Vector<16, _Float16>(_Vector<16, _Float16>, _Vector<16, _Float16>, _Constant int)">;
+  def vmulps256_round : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, float>, _Constant int)">;
+  def vrangepd256_round_mask : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Vector<4, double>, _Constant int, _Vector<4, double>, unsigned char, _Constant int)">;
+  def vrangeps256_round_mask : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, float>, _Constant int, _Vector<8, float>, unsigned char, _Constant int)">;
+  def vreducepd256_round_mask : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Constant int, _Vector<4, double>, unsigned char, _Constant int)">;
+  def vreduceph256_round_mask : X86Builtin<"_Vector<16, _Float16>(_Vector<16, _Float16>, _Constant int, _Vector<16, _Float16>, unsigned short, _Constant int)">;
+  def vreduceps256_round_mask : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Constant int, _Vector<8, float>, unsigned char, _Constant int)">;
+  def vrndscalepd256_round_mask : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Constant int, _Vector<4, double>, unsigned char, _Constant int)">;
+  def vrndscaleph256_round_mask : X86Builtin<"_Vector<16, _Float16>(_Vector<16, _Float16>, _Constant int, _Vector<16, _Float16>, unsigned short, _Constant int)">;
+  def vrndscaleps256_round_mask : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Constant int, _Vector<8, float>, unsigned char, _Constant int)">;
+  def vscalefpd256_round_mask : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Vector<4, double>, _Vector<4, double>, unsigned char, _Constant int)">;
+  def vscalefph256_round_mask : X86Builtin<"_Vector<16, _Float16>(_Vector<16, _Float16>, _Vector<16, _Float16>, _Vector<16, _Float16>, unsigned short, _Constant int)">;
+  def vscalefps256_round_mask : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, float>, _Vector<8, float>, unsigned char, _Constant int)">;
+  def vsqrtpd256_round : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Constant int)">;
+  def vsqrtph256_round : X86Builtin<"_Vector<16, _Float16>(_Vector<16, _Float16>, _Constant int)">;
+  def vsqrtps256_round : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Constant int)">;
+  def vsubpd256_round : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Vector<4, double>, _Constant int)">;
+  def vsubph256_round : X86Builtin<"_Vector<16, _Float16>(_Vector<16, _Float16>, _Vector<16, _Float16>, _Constant int)">;
+  def vsubps256_round : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, float>, _Constant int)">;
+}
+
+let Features = "avxvnniint16|avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def vpdpwsud128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>, _Vector<4, int>)">;
+}
+
+let Features = "avxvnniint16|avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def vpdpwsud256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>, _Vector<8, int>)">;
+}
+
+let Features = "avxvnniint16|avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def vpdpwsuds128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>, _Vector<4, int>)">;
+}
+
+let Features = "avxvnniint16|avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def vpdpwsuds256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>, _Vector<8, int>)">;
+}
+
+let Features = "avxvnniint16|avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def vpdpwusd128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>, _Vector<4, int>)">;
+}
+
+let Features = "avxvnniint16|avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def vpdpwusd256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>, _Vector<8, int>)">;
+}
+
+let Features = "avxvnniint16|avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def vpdpwusds128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>, _Vector<4, int>)">;
+}
+
+let Features = "avxvnniint16|avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def vpdpwusds256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>, _Vector<8, int>)">;
+}
+
+let Features = "avxvnniint16|avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def vpdpwuud128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>, _Vector<4, int>)">;
+}
+
+let Features = "avxvnniint16|avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def vpdpwuud256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>, _Vector<8, int>)">;
+}
+
+let Features = "avxvnniint16|avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def vpdpwuuds128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>, _Vector<4, int>)">;
+}
+
+let Features = "avxvnniint16|avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def vpdpwuuds256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>, _Vector<8, int>)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vcvttsd2sis32 : X86Builtin<"int(_Vector<2, double>, _Constant int)">;
+  def vcvttsd2usis32 : X86Builtin<"unsigned int(_Vector<2, double>, _Constant int)">;
+  def vcvttss2sis32 : X86Builtin<"int(_Vector<4, float>, _Constant int)">;
+  def vcvttss2usis32 : X86Builtin<"unsigned int(_Vector<4, float>, _Constant int)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def vcvttpd2dqs128_mask : X86Builtin<"_Vector<4, int>(_Vector<2, double>, _Vector<4, int>, unsigned char)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def vcvttpd2dqs256_round_mask : X86Builtin<"_Vector<4, int>(_Vector<4, double>, _Vector<4, int>, unsigned char, _Constant int)">;
+}
+
+let Features = "avx10.2-512", Attributes = [NoThrow, RequiredVectorWidth<512>] in {
+  def vcvttpd2dqs512_round_mask : X86Builtin<"_Vector<8, int>(_Vector<8, double>, _Vector<8, int>, unsigned char, _Constant int)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def vcvttpd2udqs128_mask : X86Builtin<"_Vector<4, int>(_Vector<2, double>, _Vector<4, int>, unsigned char)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def vcvttpd2udqs256_round_mask : X86Builtin<"_Vector<4, int>(_Vector<4, double>, _Vector<4, int>, unsigned char, _Constant int)">;
+}
+
+let Features = "avx10.2-512", Attributes = [NoThrow, RequiredVectorWidth<512>] in {
+  def vcvttpd2udqs512_round_mask : X86Builtin<"_Vector<8, int>(_Vector<8, double>, _Vector<8, int>, unsigned char, _Constant int)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def vcvttpd2qqs128_mask : X86Builtin<"_Vector<2, long long int>(_Vector<2, double>, _Vector<2, long long int>, unsigned char)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def vcvttpd2qqs256_round_mask : X86Builtin<"_Vector<4, long long int>(_Vector<4, double>, _Vector<4, long long int>, unsigned char, _Constant int)">;
+}
+
+let Features = "avx10.2-512", Attributes = [NoThrow, RequiredVectorWidth<512>] in {
+  def vcvttpd2qqs512_round_mask : X86Builtin<"_Vector<8, long long int>(_Vector<8, double>, _Vector<8, long long int>, unsigned char, _Constant int)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def vcvttpd2uqqs128_mask : X86Builtin<"_Vector<2, long long int>(_Vector<2, double>, _Vector<2, long long int>, unsigned char)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def vcvttpd2uqqs256_round_mask : X86Builtin<"_Vector<4, long long int>(_Vector<4, double>, _Vector<4, long long int>, unsigned char, _Constant int)">;
+}
+
+let Features = "avx10.2-512", Attributes = [NoThrow, RequiredVectorWidth<512>] in {
+  def vcvttpd2uqqs512_round_mask : X86Builtin<"_Vector<8, long long int>(_Vector<8, double>, _Vector<8, long long int>, unsigned char, _Constant int)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def vcvttps2dqs128_mask : X86Builtin<"_Vector<4, int>(_Vector<4, float>, _Vector<4, int>, unsigned char)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def vcvttps2dqs256_round_mask : X86Builtin<"_Vector<8, int>(_Vector<8, float>, _Vector<8, int>, unsigned char, _Constant int)">;
+}
+
+let Features = "avx10.2-512", Attributes = [NoThrow, RequiredVectorWidth<512>] in {
+  def vcvttps2dqs512_round_mask : X86Builtin<"_Vector<16, int>(_Vector<16, float>, _Vector<16, int>, unsigned short, _Constant int)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def vcvttps2udqs128_mask : X86Builtin<"_Vector<4, int>(_Vector<4, float>, _Vector<4, int>, unsigned char)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def vcvttps2udqs256_round_mask : X86Builtin<"_Vector<8, int>(_Vector<8, float>, _Vector<8, int>, unsigned char, _Constant int)">;
+}
+
+let Features = "avx10.2-512", Attributes = [NoThrow, RequiredVectorWidth<512>] in {
+  def vcvttps2udqs512_round_mask : X86Builtin<"_Vector<16, int>(_Vector<16, float>, _Vector<16, int>, unsigned short, _Constant int)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def vcvttps2qqs128_mask : X86Builtin<"_Vector<2, long long int>(_Vector<4, float>, _Vector<2, long long int>, unsigned char)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def vcvttps2qqs256_round_mask : X86Builtin<"_Vector<4, long long int>(_Vector<4, float>, _Vector<4, long long int>, unsigned char, _Constant int)">;
+}
+
+let Features = "avx10.2-512", Attributes = [NoThrow, RequiredVectorWidth<512>] in {
+  def vcvttps2qqs512_round_mask : X86Builtin<"_Vector<8, long long int>(_Vector<8, float>, _Vector<8, long long int>, unsigned char, _Constant int)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def vcvttps2uqqs128_mask : X86Builtin<"_Vector<2, long long int>(_Vector<4, float>, _Vector<2, long long int>, unsigned char)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def vcvttps2uqqs256_round_mask : X86Builtin<"_Vector<4, long long int>(_Vector<4, float>, _Vector<4, long long int>, unsigned char, _Constant int)">;
+}
+
+let Features = "avx10.2-512", Attributes = [NoThrow, RequiredVectorWidth<512>] in {
+  def vcvttps2uqqs512_round_mask : X86Builtin<"_Vector<8, long long int>(_Vector<8, float>, _Vector<8, long long int>, unsigned char, _Constant int)">;
+}
+
+let Features = "avxneconvert", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def vbcstnebf162ps128 : X86Builtin<"_Vector<4, float>(__bf16 const *)">;
+}
+
+let Features = "avxneconvert", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def vbcstnebf162ps256 : X86Builtin<"_Vector<8, float>(__bf16 const *)">;
+}
+
+let Features = "avxneconvert", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def vbcstnesh2ps128 : X86Builtin<"_Vector<4, float>(_Float16 const *)">;
+}
+
+let Features = "avxneconvert", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def vbcstnesh2ps256 : X86Builtin<"_Vector<8, float>(_Float16 const *)">;
+}
+
+let Features = "avxneconvert", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def vcvtneebf162ps128 : X86Builtin<"_Vector<4, float>(_Vector<8, __bf16 const *>)">;
+}
+
+let Features = "avxneconvert", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def vcvtneebf162ps256 : X86Builtin<"_Vector<8, float>(_Vector<16, __bf16 const *>)">;
+}
+
+let Features = "avxneconvert", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def vcvtneeph2ps128 : X86Builtin<"_Vector<4, float>(_Vector<8, _Float16 const *>)">;
+}
+
+let Features = "avxneconvert", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def vcvtneeph2ps256 : X86Builtin<"_Vector<8, float>(_Vector<16, _Float16 const *>)">;
+}
+
+let Features = "avxneconvert", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def vcvtneobf162ps128 : X86Builtin<"_Vector<4, float>(_Vector<8, __bf16 const *>)">;
+}
+
+let Features = "avxneconvert", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def vcvtneobf162ps256 : X86Builtin<"_Vector<8, float>(_Vector<16, __bf16 const *>)">;
+}
+
+let Features = "avxneconvert", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def vcvtneoph2ps128 : X86Builtin<"_Vector<4, float>(_Vector<8, _Float16 const *>)">;
+}
+
+let Features = "avxneconvert", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def vcvtneoph2ps256 : X86Builtin<"_Vector<8, float>(_Vector<16, _Float16 const *>)">;
+}
+
+let Features = "avx512bf16,avx512vl|avxneconvert", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def vcvtneps2bf16128 : X86Builtin<"_Vector<8, __bf16>(_Vector<4, float>)">;
+}
+
+let Features = "avx512bf16,avx512vl|avxneconvert", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def vcvtneps2bf16256 : X86Builtin<"_Vector<8, __bf16>(_Vector<8, float>)">;
+}
+
+let Features = "sha512", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def vsha512msg1 : X86Builtin<"_Vector<4, unsigned long long int>(_Vector<4, unsigned long long int>, _Vector<2, unsigned long long int>)">;
+  def vsha512msg2 : X86Builtin<"_Vector<4, unsigned long long int>(_Vector<4, unsigned long long int>, _Vector<4, unsigned long long int>)">;
+  def vsha512rnds2 : X86Builtin<"_Vector<4, unsigned long long int>(_Vector<4, unsigned long long int>, _Vector<4, unsigned long long int>, _Vector<2, unsigned long long int>)">;
+}
+
+let Header = "intrin.h", Languages = "ALL_MS_LANGUAGES", Attributes = [NoThrow, RequireDeclaration] in {
+  def _InterlockedAnd64 : X86LibBuiltin<"int64_t(int64_t volatile *, int64_t)">;
+  def _InterlockedDecrement64 : X86LibBuiltin<"int64_t(int64_t volatile *)">;
+  def _InterlockedExchange64 : X86LibBuiltin<"int64_t(int64_t volatile *, int64_t)">;
+  def _InterlockedExchangeAdd64 : X86LibBuiltin<"int64_t(int64_t volatile *, int64_t)">;
+  def _InterlockedExchangeSub64 : X86LibBuiltin<"int64_t(int64_t volatile *, int64_t)">;
+  def _InterlockedIncrement64 : X86LibBuiltin<"int64_t(int64_t volatile *)">;
+  def _InterlockedOr64 : X86LibBuiltin<"int64_t(int64_t volatile *, int64_t)">;
+  def _InterlockedXor64 : X86LibBuiltin<"int64_t(int64_t volatile *, int64_t)">;
+}
+
+let Features = "sm3", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def vsm3msg1 : X86Builtin<"_Vector<4, unsigned int>(_Vector<4, unsigned int>, _Vector<4, unsigned int>, _Vector<4, unsigned int>)">;
+  def vsm3msg2 : X86Builtin<"_Vector<4, unsigned int>(_Vector<4, unsigned int>, _Vector<4, unsigned int>, _Vector<4, unsigned int>)">;
+  def vsm3rnds2 : X86Builtin<"_Vector<4, unsigned int>(_Vector<4, unsigned int>, _Vector<4, unsigned int>, _Vector<4, unsigned int>, _Constant unsigned int)">;
+}
+
+let Features = "sm4", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def vsm4key4128 : X86Builtin<"_Vector<4, unsigned int>(_Vector<4, unsigned int>, _Vector<4, unsigned int>)">;
+}
+
+let Features = "sm4", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def vsm4key4256 : X86Builtin<"_Vector<8, unsigned int>(_Vector<8, unsigned int>, _Vector<8, unsigned int>)">;
+}
+
+let Features = "sm4", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def vsm4rnds4128 : X86Builtin<"_Vector<4, unsigned int>(_Vector<4, unsigned int>, _Vector<4, unsigned int>)">;
+}
+
+let Features = "sm4", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def vsm4rnds4256 : X86Builtin<"_Vector<8, unsigned int>(_Vector<8, unsigned int>, _Vector<8, unsigned int>)">;
+}
+
+let Features = "avx10.2-512,sm4", Attributes = [NoThrow, RequiredVectorWidth<512>] in {
+  def vsm4key4512 : X86Builtin<"_Vector<16, unsigned int>(_Vector<16, unsigned int>, _Vector<16, unsigned int>)">;
+  def vsm4rnds4512 : X86Builtin<"_Vector<16, unsigned int>(_Vector<16, unsigned int>, _Vector<16, unsigned int>)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def vminmaxnepbf16128 : X86Builtin<"_Vector<8, __bf16>(_Vector<8, __bf16>, _Vector<8, __bf16>, _Constant int)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def vminmaxnepbf16256 : X86Builtin<"_Vector<16, __bf16>(_Vector<16, __bf16>, _Vector<16, __bf16>, _Constant int)">;
+}
+
+let Features = "avx10.2-512", Attributes = [NoThrow, RequiredVectorWidth<512>] in {
+  def vminmaxnepbf16512 : X86Builtin<"_Vector<32, __bf16>(_Vector<32, __bf16>, _Vector<32, __bf16>, _Constant int)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def vminmaxpd128_mask : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<2, double>, _Constant int, _Vector<2, double>, unsigned char)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def vminmaxpd256_round_mask : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Vector<4, double>, _Constant int, _Vector<4, double>, unsigned char, _Constant int)">;
+}
+
+let Features = "avx10.2-512", Attributes = [NoThrow, RequiredVectorWidth<512>] in {
+  def vminmaxpd512_round_mask : X86Builtin<"_Vector<8, double>(_Vector<8, double>, _Vector<8, double>, _Constant int, _Vector<8, double>, unsigned char, _Constant int)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def vminmaxph128_mask : X86Builtin<"_Vector<8, _Float16>(_Vector<8, _Float16>, _Vector<8, _Float16>, _Constant int, _Vector<8, _Float16>, unsigned char)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def vminmaxph256_round_mask : X86Builtin<"_Vector<16, _Float16>(_Vector<16, _Float16>, _Vector<16, _Float16>, _Constant int, _Vector<16, _Float16>, unsigned short, _Constant int)">;
+}
+
+let Features = "avx10.2-512", Attributes = [NoThrow, RequiredVectorWidth<512>] in {
+  def vminmaxph512_round_mask : X86Builtin<"_Vector<32, _Float16>(_Vector<32, _Float16>, _Vector<32, _Float16>, _Constant int, _Vector<32, _Float16>, unsigned int, _Constant int)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def vminmaxps128_mask : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Constant int, _Vector<4, float>, unsigned char)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def vminmaxps256_round_mask : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, float>, _Constant int, _Vector<8, float>, unsigned char, _Constant int)">;
+}
+
+let Features = "avx10.2-512", Attributes = [NoThrow, RequiredVectorWidth<512>] in {
+  def vminmaxps512_round_mask : X86Builtin<"_Vector<16, float>(_Vector<16, float>, _Vector<16, float>, _Constant int, _Vector<16, float>, unsigned short, _Constant int)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def vminmaxsd_round_mask : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<2, double>, _Constant int, _Vector<2, double>, unsigned char, _Constant int)">;
+  def vminmaxsh_round_mask : X86Builtin<"_Vector<8, _Float16>(_Vector<8, _Float16>, _Vector<8, _Float16>, _Constant int, _Vector<8, _Float16>, unsigned char, _Constant int)">;
+  def vminmaxss_round_mask : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Constant int, _Vector<4, float>, unsigned char, _Constant int)">;
+  def vcvtnebf162ibs128 : X86Builtin<"_Vector<8, unsigned short>(_Vector<8, __bf16>)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def vcvtnebf162ibs256 : X86Builtin<"_Vector<16, unsigned short>(_Vector<16, __bf16>)">;
+}
+
+let Features = "avx10.2-512", Attributes = [NoThrow, RequiredVectorWidth<512>] in {
+  def vcvtnebf162ibs512 : X86Builtin<"_Vector<32, unsigned short>(_Vector<32, __bf16>)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def vcvtnebf162iubs128 : X86Builtin<"_Vector<8, unsigned short>(_Vector<8, __bf16>)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def vcvtnebf162iubs256 : X86Builtin<"_Vector<16, unsigned short>(_Vector<16, __bf16>)">;
+}
+
+let Features = "avx10.2-512", Attributes = [NoThrow, RequiredVectorWidth<512>] in {
+  def vcvtnebf162iubs512 : X86Builtin<"_Vector<32, unsigned short>(_Vector<32, __bf16>)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def vcvtph2ibs128_mask : X86Builtin<"_Vector<8, unsigned short>(_Vector<8, _Float16>, _Vector<8, unsigned short>, unsigned char)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def vcvtph2ibs256_mask : X86Builtin<"_Vector<16, unsigned short>(_Vector<16, _Float16>, _Vector<16, unsigned short>, unsigned short, _Constant int)">;
+}
+
+let Features = "avx10.2-512", Attributes = [NoThrow, RequiredVectorWidth<512>] in {
+  def vcvtph2ibs512_mask : X86Builtin<"_Vector<32, unsigned short>(_Vector<32, _Float16>, _Vector<32, unsigned short>, unsigned int, _Constant int)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def vcvtph2iubs128_mask : X86Builtin<"_Vector<8, unsigned short>(_Vector<8, _Float16>, _Vector<8, unsigned short>, unsigned char)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def vcvtph2iubs256_mask : X86Builtin<"_Vector<16, unsigned short>(_Vector<16, _Float16>, _Vector<16, unsigned short>, unsigned short, _Constant int)">;
+}
+
+let Features = "avx10.2-512", Attributes = [NoThrow, RequiredVectorWidth<512>] in {
+  def vcvtph2iubs512_mask : X86Builtin<"_Vector<32, unsigned short>(_Vector<32, _Float16>, _Vector<32, unsigned short>, unsigned int, _Constant int)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def vcvtps2ibs128_mask : X86Builtin<"_Vector<4, unsigned int>(_Vector<4, float>, _Vector<4, unsigned int>, unsigned char)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def vcvtps2ibs256_mask : X86Builtin<"_Vector<8, unsigned int>(_Vector<8, float>, _Vector<8, unsigned int>, unsigned char, _Constant int)">;
+}
+
+let Features = "avx10.2-512", Attributes = [NoThrow, RequiredVectorWidth<512>] in {
+  def vcvtps2ibs512_mask : X86Builtin<"_Vector<16, unsigned int>(_Vector<16, float>, _Vector<16, unsigned int>, unsigned short, _Constant int)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def vcvtps2iubs128_mask : X86Builtin<"_Vector<4, unsigned int>(_Vector<4, float>, _Vector<4, unsigned int>, unsigned char)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def vcvtps2iubs256_mask : X86Builtin<"_Vector<8, unsigned int>(_Vector<8, float>, _Vector<8, unsigned int>, unsigned char, _Constant int)">;
+}
+
+let Features = "avx10.2-512", Attributes = [NoThrow, RequiredVectorWidth<512>] in {
+  def vcvtps2iubs512_mask : X86Builtin<"_Vector<16, unsigned int>(_Vector<16, float>, _Vector<16, unsigned int>, unsigned short, _Constant int)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def vcvttnebf162ibs128 : X86Builtin<"_Vector<8, unsigned short>(_Vector<8, __bf16>)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def vcvttnebf162ibs256 : X86Builtin<"_Vector<16, unsigned short>(_Vector<16, __bf16>)">;
+}
+
+let Features = "avx10.2-512", Attributes = [NoThrow, RequiredVectorWidth<512>] in {
+  def vcvttnebf162ibs512 : X86Builtin<"_Vector<32, unsigned short>(_Vector<32, __bf16>)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def vcvttnebf162iubs128 : X86Builtin<"_Vector<8, unsigned short>(_Vector<8, __bf16>)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def vcvttnebf162iubs256 : X86Builtin<"_Vector<16, unsigned short>(_Vector<16, __bf16>)">;
+}
+
+let Features = "avx10.2-512", Attributes = [NoThrow, RequiredVectorWidth<512>] in {
+  def vcvttnebf162iubs512 : X86Builtin<"_Vector<32, unsigned short>(_Vector<32, __bf16>)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def vcvttph2ibs128_mask : X86Builtin<"_Vector<8, unsigned short>(_Vector<8, _Float16>, _Vector<8, unsigned short>, unsigned char)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def vcvttph2ibs256_mask : X86Builtin<"_Vector<16, unsigned short>(_Vector<16, _Float16>, _Vector<16, unsigned short>, unsigned short, _Constant int)">;
+}
+
+let Features = "avx10.2-512", Attributes = [NoThrow, RequiredVectorWidth<512>] in {
+  def vcvttph2ibs512_mask : X86Builtin<"_Vector<32, unsigned short>(_Vector<32, _Float16>, _Vector<32, unsigned short>, unsigned int, _Constant int)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def vcvttph2iubs128_mask : X86Builtin<"_Vector<8, unsigned short>(_Vector<8, _Float16>, _Vector<8, unsigned short>, unsigned char)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def vcvttph2iubs256_mask : X86Builtin<"_Vector<16, unsigned short>(_Vector<16, _Float16>, _Vector<16, unsigned short>, unsigned short, _Constant int)">;
+}
+
+let Features = "avx10.2-512", Attributes = [NoThrow, RequiredVectorWidth<512>] in {
+  def vcvttph2iubs512_mask : X86Builtin<"_Vector<32, unsigned short>(_Vector<32, _Float16>, _Vector<32, unsigned short>, unsigned int, _Constant int)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def vcvttps2ibs128_mask : X86Builtin<"_Vector<4, unsigned int>(_Vector<4, float>, _Vector<4, unsigned int>, unsigned char)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def vcvttps2ibs256_mask : X86Builtin<"_Vector<8, unsigned int>(_Vector<8, float>, _Vector<8, unsigned int>, unsigned char, _Constant int)">;
+}
+
+let Features = "avx10.2-512", Attributes = [NoThrow, RequiredVectorWidth<512>] in {
+  def vcvttps2ibs512_mask : X86Builtin<"_Vector<16, unsigned int>(_Vector<16, float>, _Vector<16, unsigned int>, unsigned short, _Constant int)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def vcvttps2iubs128_mask : X86Builtin<"_Vector<4, unsigned int>(_Vector<4, float>, _Vector<4, unsigned int>, unsigned char)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def vcvttps2iubs256_mask : X86Builtin<"_Vector<8, unsigned int>(_Vector<8, float>, _Vector<8, unsigned int>, unsigned char, _Constant int)">;
+}
+
+let Features = "avx10.2-512", Attributes = [NoThrow, RequiredVectorWidth<512>] in {
+  def vcvttps2iubs512_mask : X86Builtin<"_Vector<16, unsigned int>(_Vector<16, float>, _Vector<16, unsigned int>, unsigned short, _Constant int)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vcvt2ps2phx128_mask : X86Builtin<"_Vector<8, _Float16>(_Vector<4, float>, _Vector<4, float>, _Vector<8, _Float16>, unsigned char)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vcvt2ps2phx256_mask : X86Builtin<"_Vector<16, _Float16>(_Vector<8, float>, _Vector<8, float>, _Vector<16, _Float16>, unsigned short, _Constant int)">;
+}
+
+let Features = "avx10.2-512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def vcvt2ps2phx512_mask : X86Builtin<"_Vector<32, _Float16>(_Vector<16, float>, _Vector<16, float>, _Vector<32, _Float16>, unsigned int, _Constant int)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def vcvtbiasph2bf8_128_mask : X86Builtin<"_Vector<16, char>(_Vector<16, char>, _Vector<8, _Float16>, _Vector<16, char>, unsigned char)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def vcvtbiasph2bf8_256_mask : X86Builtin<"_Vector<16, char>(_Vector<32, char>, _Vector<16, _Float16>, _Vector<16, char>, unsigned short)">;
+}
+
+let Features = "avx10.2-512", Attributes = [NoThrow, RequiredVectorWidth<512>] in {
+  def vcvtbiasph2bf8_512_mask : X86Builtin<"_Vector<32, char>(_Vector<64, char>, _Vector<32, _Float16>, _Vector<32, char>, unsigned int)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def vcvtbiasph2bf8s_128_mask : X86Builtin<"_Vector<16, char>(_Vector<16, char>, _Vector<8, _Float16>, _Vector<16, char>, unsigned char)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def vcvtbiasph2bf8s_256_mask : X86Builtin<"_Vector<16, char>(_Vector<32, char>, _Vector<16, _Float16>, _Vector<16, char>, unsigned short)">;
+}
+
+let Features = "avx10.2-512", Attributes = [NoThrow, RequiredVectorWidth<512>] in {
+  def vcvtbiasph2bf8s_512_mask : X86Builtin<"_Vector<32, char>(_Vector<64, char>, _Vector<32, _Float16>, _Vector<32, char>, unsigned int)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def vcvtbiasph2hf8_128_mask : X86Builtin<"_Vector<16, char>(_Vector<16, char>, _Vector<8, _Float16>, _Vector<16, char>, unsigned char)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def vcvtbiasph2hf8_256_mask : X86Builtin<"_Vector<16, char>(_Vector<32, char>, _Vector<16, _Float16>, _Vector<16, char>, unsigned short)">;
+}
+
+let Features = "avx10.2-512", Attributes = [NoThrow, RequiredVectorWidth<512>] in {
+  def vcvtbiasph2hf8_512_mask : X86Builtin<"_Vector<32, char>(_Vector<64, char>, _Vector<32, _Float16>, _Vector<32, char>, unsigned int)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def vcvtbiasph2hf8s_128_mask : X86Builtin<"_Vector<16, char>(_Vector<16, char>, _Vector<8, _Float16>, _Vector<16, char>, unsigned char)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def vcvtbiasph2hf8s_256_mask : X86Builtin<"_Vector<16, char>(_Vector<32, char>, _Vector<16, _Float16>, _Vector<16, char>, unsigned short)">;
+}
+
+let Features = "avx10.2-512", Attributes = [NoThrow, RequiredVectorWidth<512>] in {
+  def vcvtbiasph2hf8s_512_mask : X86Builtin<"_Vector<32, char>(_Vector<64, char>, _Vector<32, _Float16>, _Vector<32, char>, unsigned int)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def vcvtne2ph2bf8_128 : X86Builtin<"_Vector<16, char>(_Vector<8, _Float16>, _Vector<8, _Float16>)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def vcvtne2ph2bf8_256 : X86Builtin<"_Vector<32, char>(_Vector<16, _Float16>, _Vector<16, _Float16>)">;
+}
+
+let Features = "avx10.2-512", Attributes = [NoThrow, RequiredVectorWidth<512>] in {
+  def vcvtne2ph2bf8_512 : X86Builtin<"_Vector<64, char>(_Vector<32, _Float16>, _Vector<32, _Float16>)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def vcvtne2ph2bf8s_128 : X86Builtin<"_Vector<16, char>(_Vector<8, _Float16>, _Vector<8, _Float16>)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def vcvtne2ph2bf8s_256 : X86Builtin<"_Vector<32, char>(_Vector<16, _Float16>, _Vector<16, _Float16>)">;
+}
+
+let Features = "avx10.2-512", Attributes = [NoThrow, RequiredVectorWidth<512>] in {
+  def vcvtne2ph2bf8s_512 : X86Builtin<"_Vector<64, char>(_Vector<32, _Float16>, _Vector<32, _Float16>)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def vcvtne2ph2hf8_128 : X86Builtin<"_Vector<16, char>(_Vector<8, _Float16>, _Vector<8, _Float16>)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def vcvtne2ph2hf8_256 : X86Builtin<"_Vector<32, char>(_Vector<16, _Float16>, _Vector<16, _Float16>)">;
+}
+
+let Features = "avx10.2-512", Attributes = [NoThrow, RequiredVectorWidth<512>] in {
+  def vcvtne2ph2hf8_512 : X86Builtin<"_Vector<64, char>(_Vector<32, _Float16>, _Vector<32, _Float16>)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def vcvtne2ph2hf8s_128 : X86Builtin<"_Vector<16, char>(_Vector<8, _Float16>, _Vector<8, _Float16>)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def vcvtne2ph2hf8s_256 : X86Builtin<"_Vector<32, char>(_Vector<16, _Float16>, _Vector<16, _Float16>)">;
+}
+
+let Features = "avx10.2-512", Attributes = [NoThrow, RequiredVectorWidth<512>] in {
+  def vcvtne2ph2hf8s_512 : X86Builtin<"_Vector<64, char>(_Vector<32, _Float16>, _Vector<32, _Float16>)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def vcvthf8_2ph128_mask : X86Builtin<"_Vector<8, _Float16>(_Vector<16, char>, _Vector<8, _Float16>, unsigned char)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def vcvthf8_2ph256_mask : X86Builtin<"_Vector<16, _Float16>(_Vector<16, char>, _Vector<16, _Float16>, unsigned short)">;
+}
+
+let Features = "avx10.2-512", Attributes = [NoThrow, RequiredVectorWidth<512>] in {
+  def vcvthf8_2ph512_mask : X86Builtin<"_Vector<32, _Float16>(_Vector<32, char>, _Vector<32, _Float16>, unsigned int)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def vcvtneph2bf8_128_mask : X86Builtin<"_Vector<16, char>(_Vector<8, _Float16>, _Vector<16, char>, unsigned char)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def vcvtneph2bf8_256_mask : X86Builtin<"_Vector<16, char>(_Vector<16, _Float16>, _Vector<16, char>, unsigned short)">;
+}
+
+let Features = "avx10.2-512", Attributes = [NoThrow, RequiredVectorWidth<512>] in {
+  def vcvtneph2bf8_512_mask : X86Builtin<"_Vector<32, char>(_Vector<32, _Float16>, _Vector<32, char>, unsigned int)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def vcvtneph2bf8s_128_mask : X86Builtin<"_Vector<16, char>(_Vector<8, _Float16>, _Vector<16, char>, unsigned char)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def vcvtneph2bf8s_256_mask : X86Builtin<"_Vector<16, char>(_Vector<16, _Float16>, _Vector<16, char>, unsigned short)">;
+}
+
+let Features = "avx10.2-512", Attributes = [NoThrow, RequiredVectorWidth<512>] in {
+  def vcvtneph2bf8s_512_mask : X86Builtin<"_Vector<32, char>(_Vector<32, _Float16>, _Vector<32, char>, unsigned int)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def vcvtneph2hf8_128_mask : X86Builtin<"_Vector<16, char>(_Vector<8, _Float16>, _Vector<16, char>, unsigned char)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def vcvtneph2hf8_256_mask : X86Builtin<"_Vector<16, char>(_Vector<16, _Float16>, _Vector<16, char>, unsigned short)">;
+}
+
+let Features = "avx10.2-512", Attributes = [NoThrow, RequiredVectorWidth<512>] in {
+  def vcvtneph2hf8_512_mask : X86Builtin<"_Vector<32, char>(_Vector<32, _Float16>, _Vector<32, char>, unsigned int)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def vcvtneph2hf8s_128_mask : X86Builtin<"_Vector<16, char>(_Vector<8, _Float16>, _Vector<16, char>, unsigned char)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def vcvtneph2hf8s_256_mask : X86Builtin<"_Vector<16, char>(_Vector<16, _Float16>, _Vector<16, char>, unsigned short)">;
+}
+
+let Features = "avx10.2-512", Attributes = [NoThrow, RequiredVectorWidth<512>] in {
+  def vcvtneph2hf8s_512_mask : X86Builtin<"_Vector<32, char>(_Vector<32, _Float16>, _Vector<32, char>, unsigned int)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def loadsbf16128_mask : X86Builtin<"_Vector<8, __bf16>(_Vector<8, __bf16 const *>, _Vector<8, __bf16>, unsigned char)">;
+  def storesbf16128_mask : X86Builtin<"void(_Vector<8, __bf16 *>, _Vector<8, __bf16>, unsigned char)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vaddnepbf16128 : X86Builtin<"_Vector<8, __bf16>(_Vector<8, __bf16>, _Vector<8, __bf16>)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vaddnepbf16256 : X86Builtin<"_Vector<16, __bf16>(_Vector<16, __bf16>, _Vector<16, __bf16>)">;
+}
+
+let Features = "avx10.2-512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def vaddnepbf16512 : X86Builtin<"_Vector<32, __bf16>(_Vector<32, __bf16>, _Vector<32, __bf16>)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vdivnepbf16128 : X86Builtin<"_Vector<8, __bf16>(_Vector<8, __bf16>, _Vector<8, __bf16>)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vdivnepbf16256 : X86Builtin<"_Vector<16, __bf16>(_Vector<16, __bf16>, _Vector<16, __bf16>)">;
+}
+
+let Features = "avx10.2-512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def vdivnepbf16512 : X86Builtin<"_Vector<32, __bf16>(_Vector<32, __bf16>, _Vector<32, __bf16>)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vmaxpbf16128 : X86Builtin<"_Vector<8, __bf16>(_Vector<8, __bf16>, _Vector<8, __bf16>)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vmaxpbf16256 : X86Builtin<"_Vector<16, __bf16>(_Vector<16, __bf16>, _Vector<16, __bf16>)">;
+}
+
+let Features = "avx10.2-512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def vmaxpbf16512 : X86Builtin<"_Vector<32, __bf16>(_Vector<32, __bf16>, _Vector<32, __bf16>)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vminpbf16128 : X86Builtin<"_Vector<8, __bf16>(_Vector<8, __bf16>, _Vector<8, __bf16>)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vminpbf16256 : X86Builtin<"_Vector<16, __bf16>(_Vector<16, __bf16>, _Vector<16, __bf16>)">;
+}
+
+let Features = "avx10.2-512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def vminpbf16512 : X86Builtin<"_Vector<32, __bf16>(_Vector<32, __bf16>, _Vector<32, __bf16>)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vmulnepbf16128 : X86Builtin<"_Vector<8, __bf16>(_Vector<8, __bf16>, _Vector<8, __bf16>)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vmulnepbf16256 : X86Builtin<"_Vector<16, __bf16>(_Vector<16, __bf16>, _Vector<16, __bf16>)">;
+}
+
+let Features = "avx10.2-512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def vmulnepbf16512 : X86Builtin<"_Vector<32, __bf16>(_Vector<32, __bf16>, _Vector<32, __bf16>)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vsubnepbf16128 : X86Builtin<"_Vector<8, __bf16>(_Vector<8, __bf16>, _Vector<8, __bf16>)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vsubnepbf16256 : X86Builtin<"_Vector<16, __bf16>(_Vector<16, __bf16>, _Vector<16, __bf16>)">;
+}
+
+let Features = "avx10.2-512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def vsubnepbf16512 : X86Builtin<"_Vector<32, __bf16>(_Vector<32, __bf16>, _Vector<32, __bf16>)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vcomsbf16eq : X86Builtin<"int(_Vector<8, __bf16>, _Vector<8, __bf16>)">;
+  def vcomsbf16lt : X86Builtin<"int(_Vector<8, __bf16>, _Vector<8, __bf16>)">;
+  def vcomsbf16neq : X86Builtin<"int(_Vector<8, __bf16>, _Vector<8, __bf16>)">;
+  def vcomsbf16ge : X86Builtin<"int(_Vector<8, __bf16>, _Vector<8, __bf16>)">;
+  def vcomsbf16gt : X86Builtin<"int(_Vector<8, __bf16>, _Vector<8, __bf16>)">;
+  def vcomsbf16le : X86Builtin<"int(_Vector<8, __bf16>, _Vector<8, __bf16>)">;
+}
+
+let Features = "avx10.2-512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def vcmppbf16512_mask : X86Builtin<"unsigned int(_Vector<32, __bf16>, _Vector<32, __bf16>, _Constant int, unsigned int)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vcmppbf16256_mask : X86Builtin<"unsigned short(_Vector<16, __bf16>, _Vector<16, __bf16>, _Constant int, unsigned short)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vcmppbf16128_mask : X86Builtin<"unsigned char(_Vector<8, __bf16>, _Vector<8, __bf16>, _Constant int, unsigned char)">;
+  def vfpclasspbf16128_mask : X86Builtin<"unsigned char(_Vector<8, __bf16>, _Constant int, unsigned char)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vfpclasspbf16256_mask : X86Builtin<"unsigned short(_Vector<16, __bf16>, _Constant int, unsigned short)">;
+}
+
+let Features = "avx10.2-512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def vfpclasspbf16512_mask : X86Builtin<"unsigned int(_Vector<32, __bf16>, _Constant int, unsigned int)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vscalefpbf16128_mask : X86Builtin<"_Vector<8, __bf16>(_Vector<8, __bf16>, _Vector<8, __bf16>, _Vector<8, __bf16>, unsigned char)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vscalefpbf16256_mask : X86Builtin<"_Vector<16, __bf16>(_Vector<16, __bf16>, _Vector<16, __bf16>, _Vector<16, __bf16>, unsigned short)">;
+}
+
+let Features = "avx10.2-512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def vscalefpbf16512_mask : X86Builtin<"_Vector<32, __bf16>(_Vector<32, __bf16>, _Vector<32, __bf16>, _Vector<32, __bf16>, unsigned int)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vrcppbf16128_mask : X86Builtin<"_Vector<8, __bf16>(_Vector<8, __bf16>, _Vector<8, __bf16>, unsigned char)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vrcppbf16256_mask : X86Builtin<"_Vector<16, __bf16>(_Vector<16, __bf16>, _Vector<16, __bf16>, unsigned short)">;
+}
+
+let Features = "avx10.2-512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def vrcppbf16512_mask : X86Builtin<"_Vector<32, __bf16>(_Vector<32, __bf16>, _Vector<32, __bf16>, unsigned int)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vgetexppbf16128_mask : X86Builtin<"_Vector<8, __bf16>(_Vector<8, __bf16>, _Vector<8, __bf16>, unsigned char)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vgetexppbf16256_mask : X86Builtin<"_Vector<16, __bf16>(_Vector<16, __bf16>, _Vector<16, __bf16>, unsigned short)">;
+}
+
+let Features = "avx10.2-512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def vgetexppbf16512_mask : X86Builtin<"_Vector<32, __bf16>(_Vector<32, __bf16>, _Vector<32, __bf16>, unsigned int)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vrsqrtpbf16128_mask : X86Builtin<"_Vector<8, __bf16>(_Vector<8, __bf16>, _Vector<8, __bf16>, unsigned char)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vrsqrtpbf16256_mask : X86Builtin<"_Vector<16, __bf16>(_Vector<16, __bf16>, _Vector<16, __bf16>, unsigned short)">;
+}
+
+let Features = "avx10.2-512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def vrsqrtpbf16512_mask : X86Builtin<"_Vector<32, __bf16>(_Vector<32, __bf16>, _Vector<32, __bf16>, unsigned int)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vreducenepbf16128_mask : X86Builtin<"_Vector<8, __bf16>(_Vector<8, __bf16>, _Constant int, _Vector<8, __bf16>, unsigned char)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vreducenepbf16256_mask : X86Builtin<"_Vector<16, __bf16>(_Vector<16, __bf16>, _Constant int, _Vector<16, __bf16>, unsigned short)">;
+}
+
+let Features = "avx10.2-512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def vreducenepbf16512_mask : X86Builtin<"_Vector<32, __bf16>(_Vector<32, __bf16>, _Constant int, _Vector<32, __bf16>, unsigned int)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vrndscalenepbf16_128_mask : X86Builtin<"_Vector<8, __bf16>(_Vector<8, __bf16>, _Constant int, _Vector<8, __bf16>, unsigned char)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vrndscalenepbf16_256_mask : X86Builtin<"_Vector<16, __bf16>(_Vector<16, __bf16>, _Constant int, _Vector<16, __bf16>, unsigned short)">;
+}
+
+let Features = "avx10.2-512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def vrndscalenepbf16_mask : X86Builtin<"_Vector<32, __bf16>(_Vector<32, __bf16>, _Constant int, _Vector<32, __bf16>, unsigned int)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vgetmantpbf16128_mask : X86Builtin<"_Vector<8, __bf16>(_Vector<8, __bf16>, _Constant int, _Vector<8, __bf16>, unsigned char)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vgetmantpbf16256_mask : X86Builtin<"_Vector<16, __bf16>(_Vector<16, __bf16>, _Constant int, _Vector<16, __bf16>, unsigned short)">;
+}
+
+let Features = "avx10.2-512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def vgetmantpbf16512_mask : X86Builtin<"_Vector<32, __bf16>(_Vector<32, __bf16>, _Constant int, _Vector<32, __bf16>, unsigned int)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vsqrtnepbf16 : X86Builtin<"_Vector<8, __bf16>(_Vector<8, __bf16>)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vsqrtnepbf16256 : X86Builtin<"_Vector<16, __bf16>(_Vector<16, __bf16>)">;
+}
+
+let Features = "avx10.2-512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+  def vsqrtnepbf16512 : X86Builtin<"_Vector<32, __bf16>(_Vector<32, __bf16>)">;
+  def vfmaddnepbh512 : X86Builtin<"_Vector<32, __bf16>(_Vector<32, __bf16>, _Vector<32, __bf16>, _Vector<32, __bf16>)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vfmaddnepbh256 : X86Builtin<"_Vector<16, __bf16>(_Vector<16, __bf16>, _Vector<16, __bf16>, _Vector<16, __bf16>)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vfmaddnepbh128 : X86Builtin<"_Vector<8, __bf16>(_Vector<8, __bf16>, _Vector<8, __bf16>, _Vector<8, __bf16>)">;
+}
diff --git a/clang/include/clang/Basic/BuiltinsX86Base.td b/clang/include/clang/Basic/BuiltinsX86Base.td
new file mode 100644
index 0000000..aca39c2
--- /dev/null
+++ b/clang/include/clang/Basic/BuiltinsX86Base.td
@@ -0,0 +1,29 @@
+//===--- BuiltinsX86Base.td - X86 Builtin function classes ------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the X86-specific builtin function classes.
+//
+//===----------------------------------------------------------------------===//
+
+include "clang/Basic/BuiltinsBase.td"
+
+class X86Builtin<string prototype> : TargetBuiltin {
+  let Spellings = ["__builtin_ia32_" # NAME];
+  let Prototype = prototype;
+  let EnableOpenCLLong = 1;
+}
+
+class X86NoPrefixBuiltin<string prototype> : TargetBuiltin {
+  let Spellings = [NAME];
+  let Prototype = prototype;
+}
+
+class X86LibBuiltin<string prototype> : TargetLibBuiltin {
+  let Spellings = [NAME];
+  let Prototype = prototype;
+}
diff --git a/clang/include/clang/Basic/BuiltinsX86_64.def b/clang/include/clang/Basic/BuiltinsX86_64.def
deleted file mode 100644
index 57928a1..0000000
--- a/clang/include/clang/Basic/BuiltinsX86_64.def
+++ /dev/null
@@ -1,253 +0,0 @@
-//===--- BuiltinsX86_64.def - X86-64 Builtin function database --*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file defines the X86-64-specific builtin function database. Users of
-// this file must define the BUILTIN macro to make use of this information.
-//
-//===----------------------------------------------------------------------===//
-
-// The format of this database matches clang/Basic/Builtins.def.
-
-#if defined(BUILTIN) && !defined(TARGET_BUILTIN)
-#   define TARGET_BUILTIN(ID, TYPE, ATTRS, FEATURE) BUILTIN(ID, TYPE, ATTRS)
-#endif
-
-#if defined(BUILTIN) && !defined(TARGET_HEADER_BUILTIN)
-#  define TARGET_HEADER_BUILTIN(ID, TYPE, ATTRS, HEADER, LANG, FEATURE) BUILTIN(ID, TYPE, ATTRS)
-#endif
-
-TARGET_HEADER_BUILTIN(_BitScanForward64, "UcUNi*ULLi", "nh", INTRIN_H, ALL_MS_LANGUAGES, "")
-TARGET_HEADER_BUILTIN(_BitScanReverse64, "UcUNi*ULLi", "nh", INTRIN_H, ALL_MS_LANGUAGES, "")
-
-TARGET_HEADER_BUILTIN(__mulh,  "LLiLLiLLi",    "nch", INTRIN_H, ALL_MS_LANGUAGES, "")
-TARGET_HEADER_BUILTIN(__umulh, "ULLiULLiULLi", "nch", INTRIN_H, ALL_MS_LANGUAGES, "")
-TARGET_HEADER_BUILTIN(_mul128, "LLiLLiLLiLLi*",      "nch",   INTRIN_H, ALL_MS_LANGUAGES, "")
-TARGET_HEADER_BUILTIN(_umul128, "ULLiULLiULLiULLi*", "nch",   INTRIN_H, ALL_MS_LANGUAGES, "")
-
-TARGET_HEADER_BUILTIN(__faststorefence, "v", "nh", INTRIN_H, ALL_MS_LANGUAGES, "")
-TARGET_HEADER_BUILTIN(__shiftleft128, "ULLiULLiULLiUc", "nch", INTRIN_H, ALL_MS_LANGUAGES, "")
-TARGET_HEADER_BUILTIN(__shiftright128, "ULLiULLiULLiUc", "nch", INTRIN_H, ALL_MS_LANGUAGES, "")
-
-TARGET_HEADER_BUILTIN(_InterlockedCompareExchange128, "UcLLiD*LLiLLiLLi*", "nh", INTRIN_H, ALL_MS_LANGUAGES, "cx16")
-
-TARGET_BUILTIN(__builtin_ia32_readeflags_u64, "UOi", "n", "")
-TARGET_BUILTIN(__builtin_ia32_writeeflags_u64, "vUOi", "n", "")
-TARGET_BUILTIN(__builtin_ia32_cvtss2si64, "OiV4f", "ncV:128:", "sse")
-TARGET_BUILTIN(__builtin_ia32_cvttss2si64, "OiV4f", "ncV:128:", "sse")
-TARGET_BUILTIN(__builtin_ia32_cvtsd2si64, "OiV2d", "ncV:128:", "sse2")
-TARGET_BUILTIN(__builtin_ia32_cvttsd2si64, "OiV2d", "ncV:128:", "sse2")
-TARGET_BUILTIN(__builtin_ia32_movnti64, "vOi*Oi", "n", "sse2")
-TARGET_BUILTIN(__builtin_ia32_vec_set_v2di, "V2OiV2OiOiIi", "ncV:128:", "sse4.1")
-TARGET_BUILTIN(__builtin_ia32_crc32di, "UOiUOiUOi", "nc", "crc32")
-TARGET_BUILTIN(__builtin_ia32_vec_ext_v4di, "OiV4OiIi", "ncV:256:", "avx")
-TARGET_BUILTIN(__builtin_ia32_vec_set_v4di, "V4OiV4OiOiIi", "ncV:256:", "avx")
-TARGET_BUILTIN(__builtin_ia32_rdfsbase32, "Ui", "n", "fsgsbase")
-TARGET_BUILTIN(__builtin_ia32_rdfsbase64, "UOi", "n", "fsgsbase")
-TARGET_BUILTIN(__builtin_ia32_rdgsbase32, "Ui", "n", "fsgsbase")
-TARGET_BUILTIN(__builtin_ia32_rdgsbase64, "UOi", "n", "fsgsbase")
-TARGET_BUILTIN(__builtin_ia32_wrfsbase32, "vUi", "n", "fsgsbase")
-TARGET_BUILTIN(__builtin_ia32_wrfsbase64, "vUOi", "n", "fsgsbase")
-TARGET_BUILTIN(__builtin_ia32_wrgsbase32, "vUi", "n", "fsgsbase")
-TARGET_BUILTIN(__builtin_ia32_wrgsbase64, "vUOi", "n", "fsgsbase")
-TARGET_BUILTIN(__builtin_ia32_fxrstor64, "vv*", "n", "fxsr")
-TARGET_BUILTIN(__builtin_ia32_fxsave64, "vv*", "n", "fxsr")
-TARGET_BUILTIN(__builtin_ia32_xsave64, "vv*UOi", "n", "xsave")
-TARGET_BUILTIN(__builtin_ia32_xrstor64, "vv*UOi", "n", "xsave")
-TARGET_BUILTIN(__builtin_ia32_xsaveopt64, "vv*UOi", "n", "xsaveopt")
-TARGET_BUILTIN(__builtin_ia32_xrstors64, "vv*UOi", "n", "xsaves")
-TARGET_BUILTIN(__builtin_ia32_xsavec64, "vv*UOi", "n", "xsavec")
-TARGET_BUILTIN(__builtin_ia32_xsaves64, "vv*UOi", "n", "xsaves")
-TARGET_BUILTIN(__builtin_ia32_incsspq, "vUOi", "n", "shstk")
-TARGET_BUILTIN(__builtin_ia32_rdsspq, "UOiUOi", "n", "shstk")
-TARGET_BUILTIN(__builtin_ia32_wrssq, "vUOiv*", "n", "shstk")
-TARGET_BUILTIN(__builtin_ia32_wrussq, "vUOiv*", "n", "shstk")
-TARGET_BUILTIN(__builtin_ia32_addcarryx_u64, "UcUcUOiUOiUOi*", "nE", "")
-TARGET_BUILTIN(__builtin_ia32_subborrow_u64, "UcUcUOiUOiUOi*", "nE", "")
-TARGET_BUILTIN(__builtin_ia32_rdrand64_step, "UiUOi*", "n", "rdrnd")
-TARGET_BUILTIN(__builtin_ia32_rdseed64_step, "UiUOi*", "n", "rdseed")
-TARGET_BUILTIN(__builtin_ia32_lzcnt_u64, "UOiUOi", "ncE", "lzcnt")
-TARGET_BUILTIN(__builtin_ia32_bextr_u64, "UOiUOiUOi", "ncE", "bmi")
-TARGET_BUILTIN(__builtin_ia32_tzcnt_u64, "UOiUOi", "ncE", "")
-TARGET_BUILTIN(__builtin_ia32_bzhi_di, "UOiUOiUOi", "ncE", "bmi2")
-TARGET_BUILTIN(__builtin_ia32_pdep_di, "UOiUOiUOi", "ncE", "bmi2")
-TARGET_BUILTIN(__builtin_ia32_pext_di, "UOiUOiUOi", "ncE", "bmi2")
-TARGET_BUILTIN(__builtin_ia32_bextri_u64, "UOiUOiIUOi", "ncE", "tbm")
-TARGET_BUILTIN(__builtin_ia32_lwpins64, "UcUOiUiIUi", "n", "lwp")
-TARGET_BUILTIN(__builtin_ia32_lwpval64, "vUOiUiIUi", "n", "lwp")
-TARGET_BUILTIN(__builtin_ia32_vcvtsd2si64, "OiV2dIi", "ncV:128:", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_vcvtsd2usi64, "UOiV2dIi", "ncV:128:", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_vcvtss2si64, "OiV4fIi", "ncV:128:", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_vcvtss2usi64, "UOiV4fIi", "ncV:128:", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_vcvttsd2si64, "OiV2dIi", "ncV:128:", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_vcvttsd2usi64, "UOiV2dIi", "ncV:128:", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_vcvttss2si64, "OiV4fIi", "ncV:128:", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_vcvttss2usi64, "UOiV4fIi", "ncV:128:", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_cvtsi2sd64, "V2dV2dOiIi", "ncV:128:", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_cvtsi2ss64, "V4fV4fOiIi", "ncV:128:", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_cvtusi2sd64, "V2dV2dUOiIi", "ncV:128:", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_cvtusi2ss64, "V4fV4fUOiIi", "ncV:128:", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_vcvtsh2si64, "OiV8xIi", "ncV:128:", "avx512fp16")
-TARGET_BUILTIN(__builtin_ia32_vcvtsh2usi64, "UOiV8xIi", "ncV:128:", "avx512fp16")
-TARGET_BUILTIN(__builtin_ia32_vcvtusi642sh, "V8xV8xUOiIi", "ncV:128:", "avx512fp16")
-TARGET_BUILTIN(__builtin_ia32_vcvtsi642sh, "V8xV8xOiIi", "ncV:128:", "avx512fp16")
-TARGET_BUILTIN(__builtin_ia32_vcvttsh2si64, "OiV8xIi", "ncV:128:", "avx512fp16")
-TARGET_BUILTIN(__builtin_ia32_vcvttsh2usi64, "UOiV8xIi", "ncV:128:", "avx512fp16")
-TARGET_BUILTIN(__builtin_ia32_directstore_u64, "vULi*ULi", "n", "movdiri")
-
-// AVX10.2 SATCVT-DS
-TARGET_BUILTIN(__builtin_ia32_vcvttsd2sis64, "OiV2dIi", "ncV:128:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvttsd2usis64, "UOiV2dIi", "ncV:128:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvttss2sis64, "OiV4fIi", "ncV:128:", "avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vcvttss2usis64, "UOiV4fIi", "ncV:128:", "avx10.2-256")
-
-// UINTR
-TARGET_BUILTIN(__builtin_ia32_clui, "v", "n", "uintr")
-TARGET_BUILTIN(__builtin_ia32_stui, "v", "n", "uintr")
-TARGET_BUILTIN(__builtin_ia32_testui, "Uc", "n", "uintr")
-TARGET_BUILTIN(__builtin_ia32_senduipi, "vUWi", "n", "uintr")
-// USERMSR
-TARGET_BUILTIN(__builtin_ia32_urdmsr, "ULLiULLi", "n", "usermsr")
-TARGET_BUILTIN(__builtin_ia32_uwrmsr, "vULLiULLi", "n", "usermsr")
-
-// AMX internal builtin
-TARGET_BUILTIN(__builtin_ia32_tile_loadconfig_internal, "vvC*", "n", "amx-tile")
-TARGET_BUILTIN(__builtin_ia32_tileloadd64_internal, "V256iUsUsvC*z", "n", "amx-tile")
-TARGET_BUILTIN(__builtin_ia32_tileloaddrs64_internal, "V256iUsUsvC*z", "n", "amx-movrs")
-TARGET_BUILTIN(__builtin_ia32_tileloaddt164_internal, "V256iUsUsvC*z", "n", "amx-tile")
-TARGET_BUILTIN(__builtin_ia32_tileloaddrst164_internal, "V256iUsUsvC*z", "n", "amx-movrs")
-TARGET_BUILTIN(__builtin_ia32_tdpbssd_internal, "V256iUsUsUsV256iV256iV256i", "n", "amx-int8")
-TARGET_BUILTIN(__builtin_ia32_tdpbsud_internal, "V256iUsUsUsV256iV256iV256i", "n", "amx-int8")
-TARGET_BUILTIN(__builtin_ia32_tdpbusd_internal, "V256iUsUsUsV256iV256iV256i", "n", "amx-int8")
-TARGET_BUILTIN(__builtin_ia32_tdpbuud_internal, "V256iUsUsUsV256iV256iV256i", "n", "amx-int8")
-TARGET_BUILTIN(__builtin_ia32_tilestored64_internal, "vUsUsv*zV256i", "n", "amx-tile")
-TARGET_BUILTIN(__builtin_ia32_tilezero_internal, "V256iUsUs", "n", "amx-tile")
-TARGET_BUILTIN(__builtin_ia32_tdpbf16ps_internal, "V256iUsUsUsV256iV256iV256i", "n", "amx-bf16")
-TARGET_BUILTIN(__builtin_ia32_tdpfp16ps_internal, "V256iUsUsUsV256iV256iV256i", "n", "amx-fp16")
-TARGET_BUILTIN(__builtin_ia32_tcmmimfp16ps_internal, "V256iUsUsUsV256iV256iV256i", "n", "amx-complex")
-TARGET_BUILTIN(__builtin_ia32_tcmmrlfp16ps_internal, "V256iUsUsUsV256iV256iV256i", "n", "amx-complex")
-TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz0_internal, "vUsUsUsV256i*V256i*vC*z", "n", "amx-transpose")
-TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz0rs_internal, "vUsUsUsV256i*V256i*vC*z", "n", "amx-movrs,amx-transpose")
-TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz0t1_internal, "vUsUsUsV256i*V256i*vC*z", "n", "amx-transpose")
-TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz0rst1_internal, "vUsUsUsV256i*V256i*vC*z", "n", "amx-movrs,amx-transpose")
-TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz1_internal, "vUsUsUsV256i*V256i*vC*z", "n", "amx-transpose")
-TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz1rs_internal, "vUsUsUsV256i*V256i*vC*z", "n", "amx-movrs,amx-transpose")
-TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz1t1_internal, "vUsUsUsV256i*V256i*vC*z", "n", "amx-transpose")
-TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz1rst1_internal, "vUsUsUsV256i*V256i*vC*z", "n", "amx-movrs,amx-transpose")
-TARGET_BUILTIN(__builtin_ia32_ttransposed_internal, "V256iUsUsV256i", "n", "amx-transpose")
-TARGET_BUILTIN(__builtin_ia32_ttdpbf16ps_internal, "V256iUsUsUsV256iV256iV256i", "n", "amx-bf16,amx-transpose")
-TARGET_BUILTIN(__builtin_ia32_ttdpfp16ps_internal, "V256iUsUsUsV256iV256iV256i", "n", "amx-fp16,amx-transpose")
-TARGET_BUILTIN(__builtin_ia32_ttcmmimfp16ps_internal, "V256iUsUsUsV256iV256iV256i", "n", "amx-complex,amx-transpose")
-TARGET_BUILTIN(__builtin_ia32_ttcmmrlfp16ps_internal, "V256iUsUsUsV256iV256iV256i", "n", "amx-complex,amx-transpose")
-TARGET_BUILTIN(__builtin_ia32_tconjtcmmimfp16ps_internal, "V256iUsUsUsV256iV256iV256i", "n", "amx-complex,amx-transpose")
-TARGET_BUILTIN(__builtin_ia32_tconjtfp16_internal, "V256iUsUsV256i", "n", "amx-complex,amx-transpose")
-
-TARGET_BUILTIN(__builtin_ia32_tcvtrowd2ps_internal, "V16fUsUsV256iUi", "n", "amx-avx512,avx10.2-512")
-TARGET_BUILTIN(__builtin_ia32_tcvtrowps2pbf16h_internal, "V32yUsUsV256iUi", "n", "amx-avx512,avx10.2-512")
-TARGET_BUILTIN(__builtin_ia32_tcvtrowps2pbf16l_internal, "V32yUsUsV256iUi", "n", "amx-avx512,avx10.2-512")
-TARGET_BUILTIN(__builtin_ia32_tcvtrowps2phh_internal, "V32xUsUsV256iUi", "n", "amx-avx512,avx10.2-512")
-TARGET_BUILTIN(__builtin_ia32_tcvtrowps2phl_internal, "V32xUsUsV256iUi", "n", "amx-avx512,avx10.2-512")
-TARGET_BUILTIN(__builtin_ia32_tilemovrow_internal, "V16iUsUsV256iUi", "n", "amx-avx512,avx10.2-512")
-TARGET_BUILTIN(__builtin_ia32_tmmultf32ps_internal, "V256iUsUsUsV256iV256iV256i", "n", "amx-tf32")
-TARGET_BUILTIN(__builtin_ia32_ttmmultf32ps_internal, "V256iUsUsUsV256iV256iV256i", "n", "amx-tf32,amx-transpose")
-TARGET_BUILTIN(__builtin_ia32_tdpbf8ps_internal, "V256iUsUsUsV256iV256iV256i", "n", "amx-fp8")
-TARGET_BUILTIN(__builtin_ia32_tdpbhf8ps_internal, "V256iUsUsUsV256iV256iV256i", "n", "amx-fp8")
-TARGET_BUILTIN(__builtin_ia32_tdphbf8ps_internal, "V256iUsUsUsV256iV256iV256i", "n", "amx-fp8")
-TARGET_BUILTIN(__builtin_ia32_tdphf8ps_internal, "V256iUsUsUsV256iV256iV256i", "n", "amx-fp8")
-
-// AMX
-TARGET_BUILTIN(__builtin_ia32_tile_loadconfig, "vvC*", "n", "amx-tile")
-TARGET_BUILTIN(__builtin_ia32_tile_storeconfig, "vvC*", "n", "amx-tile")
-TARGET_BUILTIN(__builtin_ia32_tilerelease, "v", "n", "amx-tile")
-TARGET_BUILTIN(__builtin_ia32_tilezero, "vUc", "n", "amx-tile")
-TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz0rs, "vIUcvC*z", "n", "amx-movrs,amx-transpose")
-TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz0rst1, "vIUcvC*z", "n", "amx-movrs,amx-transpose")
-TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz1rs, "vIUcvC*z", "n", "amx-movrs,amx-transpose")
-TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz1rst1, "vIUcvC*z", "n", "amx-movrs,amx-transpose")
-
-TARGET_BUILTIN(__builtin_ia32_tileloaddrs64, "vIUcvC*z", "n", "amx-movrs")
-TARGET_BUILTIN(__builtin_ia32_tileloaddrst164, "vIUcvC*z", "n", "amx-movrs")
-
-TARGET_BUILTIN(__builtin_ia32_tileloadd64, "vIUcvC*z", "n", "amx-tile")
-TARGET_BUILTIN(__builtin_ia32_tileloaddt164, "vIUcvC*z", "n", "amx-tile")
-TARGET_BUILTIN(__builtin_ia32_tilestored64, "vIUcv*z", "n", "amx-tile")
-
-TARGET_BUILTIN(__builtin_ia32_tdpbssd, "vIUcIUcIUc", "n", "amx-int8")
-TARGET_BUILTIN(__builtin_ia32_tdpbsud, "vIUcIUcIUc", "n", "amx-int8")
-TARGET_BUILTIN(__builtin_ia32_tdpbusd, "vIUcIUcIUc", "n", "amx-int8")
-TARGET_BUILTIN(__builtin_ia32_tdpbuud, "vIUcIUcIUc", "n", "amx-int8")
-TARGET_BUILTIN(__builtin_ia32_tdpbf16ps, "vIUcIUcIUc", "n", "amx-bf16")
-TARGET_BUILTIN(__builtin_ia32_ptwrite64, "vUOi", "n", "ptwrite")
-
-TARGET_BUILTIN(__builtin_ia32_tcmmimfp16ps, "vIUcIUcIUc", "n", "amx-complex")
-TARGET_BUILTIN(__builtin_ia32_tcmmrlfp16ps, "vIUcIUcIUc", "n", "amx-complex")
-
-TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz0, "vIUcvC*z", "n", "amx-transpose")
-TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz0t1, "vIUcvC*z", "n","amx-transpose")
-TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz1, "vIUcvC*z", "n", "amx-transpose")
-TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz1t1, "vIUcvC*z", "n","amx-transpose")
-TARGET_BUILTIN(__builtin_ia32_ttransposed, "vIUcIUc", "n", "amx-transpose")
-TARGET_BUILTIN(__builtin_ia32_ttdpbf16ps, "vIUcIUcIUc", "n", "amx-bf16,amx-transpose")
-TARGET_BUILTIN(__builtin_ia32_ttdpfp16ps, "vIUcIUcIUc", "n", "amx-fp16,amx-transpose")
-TARGET_BUILTIN(__builtin_ia32_ttcmmimfp16ps, "vIUcIUcIUc", "n", "amx-complex,amx-transpose")
-TARGET_BUILTIN(__builtin_ia32_ttcmmrlfp16ps, "vIUcIUcIUc", "n", "amx-complex,amx-transpose")
-TARGET_BUILTIN(__builtin_ia32_tconjtcmmimfp16ps, "vIUcIUcIUc", "n", "amx-complex,amx-transpose")
-TARGET_BUILTIN(__builtin_ia32_tconjtfp16, "vIUcIUc", "n", "amx-complex,amx-transpose")
-
-TARGET_BUILTIN(__builtin_ia32_tcvtrowd2ps, "V16fIUcUi", "n", "amx-avx512,avx10.2-512")
-TARGET_BUILTIN(__builtin_ia32_tcvtrowps2pbf16h, "V32yIUcUi", "n", "amx-avx512,avx10.2-512")
-TARGET_BUILTIN(__builtin_ia32_tcvtrowps2pbf16l, "V32yIUcUi", "n", "amx-avx512,avx10.2-512")
-TARGET_BUILTIN(__builtin_ia32_tcvtrowps2phh, "V32xIUcUi", "n", "amx-avx512,avx10.2-512")
-TARGET_BUILTIN(__builtin_ia32_tcvtrowps2phl, "V32xIUcUi", "n", "amx-avx512,avx10.2-512")
-TARGET_BUILTIN(__builtin_ia32_tilemovrow, "V16iIUcUi", "n", "amx-avx512,avx10.2-512")
-
-// AMX_FP16 FP16
-TARGET_BUILTIN(__builtin_ia32_tdpfp16ps, "vIUcIUcIUc", "n", "amx-fp16")
-
-// AMX FP8
-TARGET_BUILTIN(__builtin_ia32_tdpbf8ps, "vIUcUIcUIc", "n", "amx-fp8")
-TARGET_BUILTIN(__builtin_ia32_tdpbhf8ps, "vIUcUIcUIc", "n", "amx-fp8")
-TARGET_BUILTIN(__builtin_ia32_tdphbf8ps, "vIUcUIcUIc", "n", "amx-fp8")
-TARGET_BUILTIN(__builtin_ia32_tdphf8ps, "vIUcUIcUIc", "n", "amx-fp8")
-
-// AMX TF32
-TARGET_BUILTIN(__builtin_ia32_tmmultf32ps, "vIUcIUcIUc", "n", "amx-tf32")
-TARGET_BUILTIN(__builtin_ia32_ttmmultf32ps, "vIUcIUcIUc", "n", "amx-tf32,amx-transpose")
-
-TARGET_BUILTIN(__builtin_ia32_prefetchi, "vvC*Ui", "nc", "prefetchi")
-TARGET_BUILTIN(__builtin_ia32_cmpccxadd32, "Siv*SiSiIi", "n", "cmpccxadd")
-TARGET_BUILTIN(__builtin_ia32_cmpccxadd64, "SLLiSLLi*SLLiSLLiIi", "n", "cmpccxadd")
-
-// RAO-INT
-TARGET_BUILTIN(__builtin_ia32_aadd64, "vv*SOi", "n", "raoint")
-TARGET_BUILTIN(__builtin_ia32_aand64, "vv*SOi", "n", "raoint")
-TARGET_BUILTIN(__builtin_ia32_aor64, "vv*SOi", "n", "raoint")
-TARGET_BUILTIN(__builtin_ia32_axor64, "vv*SOi", "n", "raoint")
-
-// MOVRS
-TARGET_BUILTIN(__builtin_ia32_movrsqi, "ScvC*", "n", "movrs")
-TARGET_BUILTIN(__builtin_ia32_movrshi, "SsvC*", "n", "movrs")
-TARGET_BUILTIN(__builtin_ia32_movrssi, "SivC*", "n", "movrs")
-TARGET_BUILTIN(__builtin_ia32_movrsdi, "SLLivC*", "n", "movrs")
-
-// MOVRS and AVX10.2
-TARGET_BUILTIN(__builtin_ia32_vmovrsb128, "V16cV16cC*", "nV:128:", "movrs,avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vmovrsb256, "V32cV32cC*", "nV:256:", "movrs,avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vmovrsb512, "V64cV64cC*", "nV:512:", "movrs,avx10.2-512")
-TARGET_BUILTIN(__builtin_ia32_vmovrsd128, "V4iV4iC*", "nV:128:", "movrs,avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vmovrsd256, "V8iV8iC*", "nV:256:", "movrs,avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vmovrsd512, "V16iV16iC*", "nV:512:", "movrs,avx10.2-512")
-TARGET_BUILTIN(__builtin_ia32_vmovrsq128, "V2OiV2OiC*", "nV:128:", "movrs,avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vmovrsq256, "V4OiV4OiC*", "nV:256:", "movrs,avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vmovrsq512, "V8OiV8OiC*", "nV:512:", "movrs,avx10.2-512")
-TARGET_BUILTIN(__builtin_ia32_vmovrsw128, "V8sV8sC*", "nV:128:", "movrs,avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vmovrsw256, "V16sV16sC*", "nV:256:", "movrs,avx10.2-256")
-TARGET_BUILTIN(__builtin_ia32_vmovrsw512, "V32sV32sC*", "nV:512:", "movrs,avx10.2-512")
-
-#undef BUILTIN
-#undef TARGET_BUILTIN
-#undef TARGET_HEADER_BUILTIN
diff --git a/clang/include/clang/Basic/BuiltinsX86_64.td b/clang/include/clang/Basic/BuiltinsX86_64.td
new file mode 100644
index 0000000..a6c6ef8
--- /dev/null
+++ b/clang/include/clang/Basic/BuiltinsX86_64.td
@@ -0,0 +1,485 @@
+//===--- BuiltinsX86_64.td - X86-64 Builtin function database ---*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the X86-64-specific builtin function database.
+//
+//===----------------------------------------------------------------------===//
+
+include "clang/Basic/BuiltinsX86Base.td"
+
+let Header = "intrin.h", Languages = "ALL_MS_LANGUAGES", Attributes = [NoThrow, RequireDeclaration] in {
+  def _BitScanForward64 : X86LibBuiltin<"unsigned char(msuint32_t *, unsigned long long int)">;
+  def _BitScanReverse64 : X86LibBuiltin<"unsigned char(msuint32_t *, unsigned long long int)">;
+}
+
+let Header = "intrin.h", Languages = "ALL_MS_LANGUAGES", Attributes = [NoThrow, Const, RequireDeclaration] in {
+  def __mulh : X86LibBuiltin<"long long int(long long int, long long int)">;
+  def __umulh : X86LibBuiltin<"unsigned long long int(unsigned long long int, unsigned long long int)">;
+  def _mul128 : X86LibBuiltin<"long long int(long long int, long long int, long long int *)">;
+  def _umul128 : X86LibBuiltin<"unsigned long long int(unsigned long long int, unsigned long long int, unsigned long long int *)">;
+}
+
+let Header = "intrin.h", Languages = "ALL_MS_LANGUAGES", Attributes = [NoThrow, RequireDeclaration] in {
+  def __faststorefence : X86LibBuiltin<"void()">;
+}
+
+let Header = "intrin.h", Languages = "ALL_MS_LANGUAGES", Attributes = [NoThrow, Const, RequireDeclaration] in {
+  def __shiftleft128 : X86LibBuiltin<"unsigned long long int(unsigned long long int, unsigned long long int, unsigned char)">;
+  def __shiftright128 : X86LibBuiltin<"unsigned long long int(unsigned long long int, unsigned long long int, unsigned char)">;
+}
+
+let Features = "cx16", Header = "intrin.h", Languages = "ALL_MS_LANGUAGES", Attributes = [NoThrow, RequireDeclaration] in {
+  def _InterlockedCompareExchange128 : X86LibBuiltin<"unsigned char(long long int volatile *, long long int, long long int, long long int *)">;
+}
+
+let Attributes = [NoThrow] in {
+  def readeflags_u64 : X86Builtin<"unsigned long long int()">;
+  def writeeflags_u64 : X86Builtin<"void(unsigned long long int)">;
+}
+
+let Features = "sse", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def cvtss2si64 : X86Builtin<"long long int(_Vector<4, float>)">;
+  def cvttss2si64 : X86Builtin<"long long int(_Vector<4, float>)">;
+}
+
+let Features = "sse2", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def cvtsd2si64 : X86Builtin<"long long int(_Vector<2, double>)">;
+  def cvttsd2si64 : X86Builtin<"long long int(_Vector<2, double>)">;
+}
+
+let Features = "sse2", Attributes = [NoThrow] in {
+  def movnti64 : X86Builtin<"void(long long int *, long long int)">;
+}
+
+let Features = "sse4.1", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vec_set_v2di : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>, long long int, _Constant int)">;
+}
+
+let Features = "crc32", Attributes = [NoThrow, Const] in {
+  def crc32di : X86Builtin<"unsigned long long int(unsigned long long int, unsigned long long int)">;
+}
+
+let Features = "avx", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+  def vec_ext_v4di : X86Builtin<"long long int(_Vector<4, long long int>, _Constant int)">;
+  def vec_set_v4di : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, long long int, _Constant int)">;
+}
+
+let Features = "fsgsbase", Attributes = [NoThrow] in {
+  def rdfsbase32 : X86Builtin<"unsigned int()">;
+  def rdfsbase64 : X86Builtin<"unsigned long long int()">;
+  def rdgsbase32 : X86Builtin<"unsigned int()">;
+  def rdgsbase64 : X86Builtin<"unsigned long long int()">;
+  def wrfsbase32 : X86Builtin<"void(unsigned int)">;
+  def wrfsbase64 : X86Builtin<"void(unsigned long long int)">;
+  def wrgsbase32 : X86Builtin<"void(unsigned int)">;
+  def wrgsbase64 : X86Builtin<"void(unsigned long long int)">;
+}
+
+let Features = "fxsr", Attributes = [NoThrow] in {
+  def fxrstor64 : X86Builtin<"void(void *)">;
+  def fxsave64 : X86Builtin<"void(void *)">;
+}
+
+let Features = "xsave", Attributes = [NoThrow] in {
+  def xsave64 : X86Builtin<"void(void *, unsigned long long int)">;
+  def xrstor64 : X86Builtin<"void(void *, unsigned long long int)">;
+}
+
+let Features = "xsaveopt", Attributes = [NoThrow] in {
+  def xsaveopt64 : X86Builtin<"void(void *, unsigned long long int)">;
+}
+
+let Features = "xsaves", Attributes = [NoThrow] in {
+  def xrstors64 : X86Builtin<"void(void *, unsigned long long int)">;
+}
+
+let Features = "xsavec", Attributes = [NoThrow] in {
+  def xsavec64 : X86Builtin<"void(void *, unsigned long long int)">;
+}
+
+let Features = "xsaves", Attributes = [NoThrow] in {
+  def xsaves64 : X86Builtin<"void(void *, unsigned long long int)">;
+}
+
+let Features = "shstk", Attributes = [NoThrow] in {
+  def incsspq : X86Builtin<"void(unsigned long long int)">;
+  def rdsspq : X86Builtin<"unsigned long long int(unsigned long long int)">;
+  def wrssq : X86Builtin<"void(unsigned long long int, void *)">;
+  def wrussq : X86Builtin<"void(unsigned long long int, void *)">;
+}
+
+let Attributes = [NoThrow, Constexpr] in {
+  def addcarryx_u64 : X86Builtin<"unsigned char(unsigned char, unsigned long long int, unsigned long long int, unsigned long long int *)">;
+  def subborrow_u64 : X86Builtin<"unsigned char(unsigned char, unsigned long long int, unsigned long long int, unsigned long long int *)">;
+}
+
+let Features = "rdrnd", Attributes = [NoThrow] in {
+  def rdrand64_step : X86Builtin<"unsigned int(unsigned long long int *)">;
+}
+
+let Features = "rdseed", Attributes = [NoThrow] in {
+  def rdseed64_step : X86Builtin<"unsigned int(unsigned long long int *)">;
+}
+
+let Features = "lzcnt", Attributes = [NoThrow, Const, Constexpr] in {
+  def lzcnt_u64 : X86Builtin<"unsigned long long int(unsigned long long int)">;
+}
+
+let Features = "bmi", Attributes = [NoThrow, Const, Constexpr] in {
+  def bextr_u64 : X86Builtin<"unsigned long long int(unsigned long long int, unsigned long long int)">;
+}
+
+let Attributes = [NoThrow, Const, Constexpr] in {
+  def tzcnt_u64 : X86Builtin<"unsigned long long int(unsigned long long int)">;
+}
+
+let Features = "bmi2", Attributes = [NoThrow, Const, Constexpr] in {
+  def bzhi_di : X86Builtin<"unsigned long long int(unsigned long long int, unsigned long long int)">;
+  def pdep_di : X86Builtin<"unsigned long long int(unsigned long long int, unsigned long long int)">;
+  def pext_di : X86Builtin<"unsigned long long int(unsigned long long int, unsigned long long int)">;
+}
+
+let Features = "tbm", Attributes = [NoThrow, Const, Constexpr] in {
+  def bextri_u64 : X86Builtin<"unsigned long long int(unsigned long long int, _Constant unsigned long long int)">;
+}
+
+let Features = "lwp", Attributes = [NoThrow] in {
+  def lwpins64 : X86Builtin<"unsigned char(unsigned long long int, unsigned int, _Constant unsigned int)">;
+  def lwpval64 : X86Builtin<"void(unsigned long long int, unsigned int, _Constant unsigned int)">;
+}
+
+let Features = "avx512f", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vcvtsd2si64 : X86Builtin<"long long int(_Vector<2, double>, _Constant int)">;
+  def vcvtsd2usi64 : X86Builtin<"unsigned long long int(_Vector<2, double>, _Constant int)">;
+  def vcvtss2si64 : X86Builtin<"long long int(_Vector<4, float>, _Constant int)">;
+  def vcvtss2usi64 : X86Builtin<"unsigned long long int(_Vector<4, float>, _Constant int)">;
+  def vcvttsd2si64 : X86Builtin<"long long int(_Vector<2, double>, _Constant int)">;
+  def vcvttsd2usi64 : X86Builtin<"unsigned long long int(_Vector<2, double>, _Constant int)">;
+  def vcvttss2si64 : X86Builtin<"long long int(_Vector<4, float>, _Constant int)">;
+  def vcvttss2usi64 : X86Builtin<"unsigned long long int(_Vector<4, float>, _Constant int)">;
+  def cvtsi2sd64 : X86Builtin<"_Vector<2, double>(_Vector<2, double>, long long int, _Constant int)">;
+  def cvtsi2ss64 : X86Builtin<"_Vector<4, float>(_Vector<4, float>, long long int, _Constant int)">;
+  def cvtusi2sd64 : X86Builtin<"_Vector<2, double>(_Vector<2, double>, unsigned long long int, _Constant int)">;
+  def cvtusi2ss64 : X86Builtin<"_Vector<4, float>(_Vector<4, float>, unsigned long long int, _Constant int)">;
+}
+
+let Features = "avx512fp16", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vcvtsh2si64 : X86Builtin<"long long int(_Vector<8, _Float16>, _Constant int)">;
+  def vcvtsh2usi64 : X86Builtin<"unsigned long long int(_Vector<8, _Float16>, _Constant int)">;
+  def vcvtusi642sh : X86Builtin<"_Vector<8, _Float16>(_Vector<8, _Float16>, unsigned long long int, _Constant int)">;
+  def vcvtsi642sh : X86Builtin<"_Vector<8, _Float16>(_Vector<8, _Float16>, long long int, _Constant int)">;
+  def vcvttsh2si64 : X86Builtin<"long long int(_Vector<8, _Float16>, _Constant int)">;
+  def vcvttsh2usi64 : X86Builtin<"unsigned long long int(_Vector<8, _Float16>, _Constant int)">;
+}
+
+let Features = "movdiri", Attributes = [NoThrow] in {
+  def directstore_u64 : X86Builtin<"void(unsigned long int *, unsigned long int)">;
+}
+
+let Features = "avx10.2-256", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+  def vcvttsd2sis64 : X86Builtin<"long long int(_Vector<2, double>, _Constant int)">;
+  def vcvttsd2usis64 : X86Builtin<"unsigned long long int(_Vector<2, double>, _Constant int)">;
+  def vcvttss2sis64 : X86Builtin<"long long int(_Vector<4, float>, _Constant int)">;
+  def vcvttss2usis64 : X86Builtin<"unsigned long long int(_Vector<4, float>, _Constant int)">;
+}
+
+let Features = "uintr", Attributes = [NoThrow] in {
+  def clui : X86Builtin<"void()">;
+  def stui : X86Builtin<"void()">;
+  def testui : X86Builtin<"unsigned char()">;
+  def senduipi : X86Builtin<"void(uint64_t)">;
+}
+
+let Features = "usermsr", Attributes = [NoThrow] in {
+  def urdmsr : X86Builtin<"unsigned long long int(unsigned long long int)">;
+  def uwrmsr : X86Builtin<"void(unsigned long long int, unsigned long long int)">;
+}
+
+let Features = "amx-tile", Attributes = [NoThrow] in {
+  def tile_loadconfig_internal : X86Builtin<"void(void const *)">;
+  def tileloadd64_internal : X86Builtin<"_Vector<256, int>(unsigned short, unsigned short, void const *, size_t)">;
+}
+
+let Features = "amx-movrs", Attributes = [NoThrow] in {
+  def tileloaddrs64_internal : X86Builtin<"_Vector<256, int>(unsigned short, unsigned short, void const *, size_t)">;
+}
+
+let Features = "amx-tile", Attributes = [NoThrow] in {
+  def tileloaddt164_internal : X86Builtin<"_Vector<256, int>(unsigned short, unsigned short, void const *, size_t)">;
+}
+
+let Features = "amx-movrs", Attributes = [NoThrow] in {
+  def tileloaddrst164_internal : X86Builtin<"_Vector<256, int>(unsigned short, unsigned short, void const *, size_t)">;
+}
+
+let Features = "amx-int8", Attributes = [NoThrow] in {
+  def tdpbssd_internal : X86Builtin<"_Vector<256, int>(unsigned short, unsigned short, unsigned short, _Vector<256, int>, _Vector<256, int>, _Vector<256, int>)">;
+  def tdpbsud_internal : X86Builtin<"_Vector<256, int>(unsigned short, unsigned short, unsigned short, _Vector<256, int>, _Vector<256, int>, _Vector<256, int>)">;
+  def tdpbusd_internal : X86Builtin<"_Vector<256, int>(unsigned short, unsigned short, unsigned short, _Vector<256, int>, _Vector<256, int>, _Vector<256, int>)">;
+  def tdpbuud_internal : X86Builtin<"_Vector<256, int>(unsigned short, unsigned short, unsigned short, _Vector<256, int>, _Vector<256, int>, _Vector<256, int>)">;
+}
+
+let Features = "amx-tile", Attributes = [NoThrow] in {
+  def tilestored64_internal : X86Builtin<"void(unsigned short, unsigned short, void *, size_t, _Vector<256, int>)">;
+  def tilezero_internal : X86Builtin<"_Vector<256, int>(unsigned short, unsigned short)">;
+}
+
+let Features = "amx-bf16", Attributes = [NoThrow] in {
+  def tdpbf16ps_internal : X86Builtin<"_Vector<256, int>(unsigned short, unsigned short, unsigned short, _Vector<256, int>, _Vector<256, int>, _Vector<256, int>)">;
+}
+
+let Features = "amx-fp16", Attributes = [NoThrow] in {
+  def tdpfp16ps_internal : X86Builtin<"_Vector<256, int>(unsigned short, unsigned short, unsigned short, _Vector<256, int>, _Vector<256, int>, _Vector<256, int>)">;
+}
+
+let Features = "amx-complex", Attributes = [NoThrow] in {
+  def tcmmimfp16ps_internal : X86Builtin<"_Vector<256, int>(unsigned short, unsigned short, unsigned short, _Vector<256, int>, _Vector<256, int>, _Vector<256, int>)">;
+  def tcmmrlfp16ps_internal : X86Builtin<"_Vector<256, int>(unsigned short, unsigned short, unsigned short, _Vector<256, int>, _Vector<256, int>, _Vector<256, int>)">;
+}
+
+let Features = "amx-transpose", Attributes = [NoThrow] in {
+  def t2rpntlvwz0_internal : X86Builtin<"void(unsigned short, unsigned short, unsigned short, _Vector<256, int *>, _Vector<256, int *>, void const *, size_t)">;
+}
+
+let Features = "amx-movrs,amx-transpose", Attributes = [NoThrow] in {
+  def t2rpntlvwz0rs_internal : X86Builtin<"void(unsigned short, unsigned short, unsigned short, _Vector<256, int *>, _Vector<256, int *>, void const *, size_t)">;
+}
+
+let Features = "amx-transpose", Attributes = [NoThrow] in {
+  def t2rpntlvwz0t1_internal : X86Builtin<"void(unsigned short, unsigned short, unsigned short, _Vector<256, int *>, _Vector<256, int *>, void const *, size_t)">;
+}
+
+let Features = "amx-movrs,amx-transpose", Attributes = [NoThrow] in {
+  def t2rpntlvwz0rst1_internal : X86Builtin<"void(unsigned short, unsigned short, unsigned short, _Vector<256, int *>, _Vector<256, int *>, void const *, size_t)">;
+}
+
+let Features = "amx-transpose", Attributes = [NoThrow] in {
+  def t2rpntlvwz1_internal : X86Builtin<"void(unsigned short, unsigned short, unsigned short, _Vector<256, int *>, _Vector<256, int *>, void const *, size_t)">;
+}
+
+let Features = "amx-movrs,amx-transpose", Attributes = [NoThrow] in {
+  def t2rpntlvwz1rs_internal : X86Builtin<"void(unsigned short, unsigned short, unsigned short, _Vector<256, int *>, _Vector<256, int *>, void const *, size_t)">;
+}
+
+let Features = "amx-transpose", Attributes = [NoThrow] in {
+  def t2rpntlvwz1t1_internal : X86Builtin<"void(unsigned short, unsigned short, unsigned short, _Vector<256, int *>, _Vector<256, int *>, void const *, size_t)">;
+}
+
+let Features = "amx-movrs,amx-transpose", Attributes = [NoThrow] in {
+  def t2rpntlvwz1rst1_internal : X86Builtin<"void(unsigned short, unsigned short, unsigned short, _Vector<256, int *>, _Vector<256, int *>, void const *, size_t)">;
+}
+
+let Features = "amx-transpose", Attributes = [NoThrow] in {
+  def ttransposed_internal : X86Builtin<"_Vector<256, int>(unsigned short, unsigned short, _Vector<256, int>)">;
+}
+
+let Features = "amx-bf16,amx-transpose", Attributes = [NoThrow] in {
+  def ttdpbf16ps_internal : X86Builtin<"_Vector<256, int>(unsigned short, unsigned short, unsigned short, _Vector<256, int>, _Vector<256, int>, _Vector<256, int>)">;
+}
+
+let Features = "amx-fp16,amx-transpose", Attributes = [NoThrow] in {
+  def ttdpfp16ps_internal : X86Builtin<"_Vector<256, int>(unsigned short, unsigned short, unsigned short, _Vector<256, int>, _Vector<256, int>, _Vector<256, int>)">;
+}
+
+let Features = "amx-complex,amx-transpose", Attributes = [NoThrow] in {
+  def ttcmmimfp16ps_internal : X86Builtin<"_Vector<256, int>(unsigned short, unsigned short, unsigned short, _Vector<256, int>, _Vector<256, int>, _Vector<256, int>)">;
+  def ttcmmrlfp16ps_internal : X86Builtin<"_Vector<256, int>(unsigned short, unsigned short, unsigned short, _Vector<256, int>, _Vector<256, int>, _Vector<256, int>)">;
+  def tconjtcmmimfp16ps_internal : X86Builtin<"_Vector<256, int>(unsigned short, unsigned short, unsigned short, _Vector<256, int>, _Vector<256, int>, _Vector<256, int>)">;
+  def tconjtfp16_internal : X86Builtin<"_Vector<256, int>(unsigned short, unsigned short, _Vector<256, int>)">;
+}
+
+let Features = "amx-avx512,avx10.2-512", Attributes = [NoThrow] in {
+  def tcvtrowd2ps_internal : X86Builtin<"_Vector<16, float>(unsigned short, unsigned short, _Vector<256, int>, unsigned int)">;
+  def tcvtrowps2pbf16h_internal : X86Builtin<"_Vector<32, __bf16>(unsigned short, unsigned short, _Vector<256, int>, unsigned int)">;
+  def tcvtrowps2pbf16l_internal : X86Builtin<"_Vector<32, __bf16>(unsigned short, unsigned short, _Vector<256, int>, unsigned int)">;
+  def tcvtrowps2phh_internal : X86Builtin<"_Vector<32, _Float16>(unsigned short, unsigned short, _Vector<256, int>, unsigned int)">;
+  def tcvtrowps2phl_internal : X86Builtin<"_Vector<32, _Float16>(unsigned short, unsigned short, _Vector<256, int>, unsigned int)">;
+  def tilemovrow_internal : X86Builtin<"_Vector<16, int>(unsigned short, unsigned short, _Vector<256, int>, unsigned int)">;
+}
+
+let Features = "amx-tf32", Attributes = [NoThrow] in {
+  def tmmultf32ps_internal : X86Builtin<"_Vector<256, int>(unsigned short, unsigned short, unsigned short, _Vector<256, int>, _Vector<256, int>, _Vector<256, int>)">;
+}
+
+let Features = "amx-tf32,amx-transpose", Attributes = [NoThrow] in {
+  def ttmmultf32ps_internal : X86Builtin<"_Vector<256, int>(unsigned short, unsigned short, unsigned short, _Vector<256, int>, _Vector<256, int>, _Vector<256, int>)">;
+}
+
+let Features = "amx-fp8", Attributes = [NoThrow] in {
+  def tdpbf8ps_internal : X86Builtin<"_Vector<256, int>(unsigned short, unsigned short, unsigned short, _Vector<256, int>, _Vector<256, int>, _Vector<256, int>)">;
+  def tdpbhf8ps_internal : X86Builtin<"_Vector<256, int>(unsigned short, unsigned short, unsigned short, _Vector<256, int>, _Vector<256, int>, _Vector<256, int>)">;
+  def tdphbf8ps_internal : X86Builtin<"_Vector<256, int>(unsigned short, unsigned short, unsigned short, _Vector<256, int>, _Vector<256, int>, _Vector<256, int>)">;
+  def tdphf8ps_internal : X86Builtin<"_Vector<256, int>(unsigned short, unsigned short, unsigned short, _Vector<256, int>, _Vector<256, int>, _Vector<256, int>)">;
+}
+
+let Features = "amx-tile", Attributes = [NoThrow] in {
+  def tile_loadconfig : X86Builtin<"void(void const *)">;
+  def tile_storeconfig : X86Builtin<"void(void const *)">;
+  def tilerelease : X86Builtin<"void()">;
+  def tilezero : X86Builtin<"void(unsigned char)">;
+}
+
+let Features = "amx-movrs,amx-transpose", Attributes = [NoThrow] in {
+  def t2rpntlvwz0rs : X86Builtin<"void(_Constant unsigned char, void const *, size_t)">;
+  def t2rpntlvwz0rst1 : X86Builtin<"void(_Constant unsigned char, void const *, size_t)">;
+  def t2rpntlvwz1rs : X86Builtin<"void(_Constant unsigned char, void const *, size_t)">;
+  def t2rpntlvwz1rst1 : X86Builtin<"void(_Constant unsigned char, void const *, size_t)">;
+}
+
+let Features = "amx-movrs", Attributes = [NoThrow] in {
+  def tileloaddrs64 : X86Builtin<"void(_Constant unsigned char, void const *, size_t)">;
+  def tileloaddrst164 : X86Builtin<"void(_Constant unsigned char, void const *, size_t)">;
+}
+
+let Features = "amx-tile", Attributes = [NoThrow] in {
+  def tileloadd64 : X86Builtin<"void(_Constant unsigned char, void const *, size_t)">;
+  def tileloaddt164 : X86Builtin<"void(_Constant unsigned char, void const *, size_t)">;
+  def tilestored64 : X86Builtin<"void(_Constant unsigned char, void *, size_t)">;
+}
+
+let Features = "amx-int8", Attributes = [NoThrow] in {
+  def tdpbssd : X86Builtin<"void(_Constant unsigned char, _Constant unsigned char, _Constant unsigned char)">;
+  def tdpbsud : X86Builtin<"void(_Constant unsigned char, _Constant unsigned char, _Constant unsigned char)">;
+  def tdpbusd : X86Builtin<"void(_Constant unsigned char, _Constant unsigned char, _Constant unsigned char)">;
+  def tdpbuud : X86Builtin<"void(_Constant unsigned char, _Constant unsigned char, _Constant unsigned char)">;
+}
+
+let Features = "amx-bf16", Attributes = [NoThrow] in {
+  def tdpbf16ps : X86Builtin<"void(_Constant unsigned char, _Constant unsigned char, _Constant unsigned char)">;
+}
+
+let Features = "ptwrite", Attributes = [NoThrow] in {
+  def ptwrite64 : X86Builtin<"void(unsigned long long int)">;
+}
+
+let Features = "amx-complex", Attributes = [NoThrow] in {
+  def tcmmimfp16ps : X86Builtin<"void(_Constant unsigned char, _Constant unsigned char, _Constant unsigned char)">;
+  def tcmmrlfp16ps : X86Builtin<"void(_Constant unsigned char, _Constant unsigned char, _Constant unsigned char)">;
+}
+
+let Features = "amx-transpose", Attributes = [NoThrow] in {
+  def t2rpntlvwz0 : X86Builtin<"void(_Constant unsigned char, void const *, size_t)">;
+  def t2rpntlvwz0t1 : X86Builtin<"void(_Constant unsigned char, void const *, size_t)">;
+  def t2rpntlvwz1 : X86Builtin<"void(_Constant unsigned char, void const *, size_t)">;
+  def t2rpntlvwz1t1 : X86Builtin<"void(_Constant unsigned char, void const *, size_t)">;
+  def ttransposed : X86Builtin<"void(_Constant unsigned char, _Constant unsigned char)">;
+}
+
+let Features = "amx-bf16,amx-transpose", Attributes = [NoThrow] in {
+  def ttdpbf16ps : X86Builtin<"void(_Constant unsigned char, _Constant unsigned char, _Constant unsigned char)">;
+}
+
+let Features = "amx-fp16,amx-transpose", Attributes = [NoThrow] in {
+  def ttdpfp16ps : X86Builtin<"void(_Constant unsigned char, _Constant unsigned char, _Constant unsigned char)">;
+}
+
+let Features = "amx-complex,amx-transpose", Attributes = [NoThrow] in {
+  def ttcmmimfp16ps : X86Builtin<"void(_Constant unsigned char, _Constant unsigned char, _Constant unsigned char)">;
+  def ttcmmrlfp16ps : X86Builtin<"void(_Constant unsigned char, _Constant unsigned char, _Constant unsigned char)">;
+  def tconjtcmmimfp16ps : X86Builtin<"void(_Constant unsigned char, _Constant unsigned char, _Constant unsigned char)">;
+  def tconjtfp16 : X86Builtin<"void(_Constant unsigned char, _Constant unsigned char)">;
+}
+
+let Features = "amx-avx512,avx10.2-512", Attributes = [NoThrow] in {
+  def tcvtrowd2ps : X86Builtin<"_Vector<16, float>(_Constant unsigned char, unsigned int)">;
+  def tcvtrowps2pbf16h : X86Builtin<"_Vector<32, __bf16>(_Constant unsigned char, unsigned int)">;
+  def tcvtrowps2pbf16l : X86Builtin<"_Vector<32, __bf16>(_Constant unsigned char, unsigned int)">;
+  def tcvtrowps2phh : X86Builtin<"_Vector<32, _Float16>(_Constant unsigned char, unsigned int)">;
+  def tcvtrowps2phl : X86Builtin<"_Vector<32, _Float16>(_Constant unsigned char, unsigned int)">;
+  def tilemovrow : X86Builtin<"_Vector<16, int>(_Constant unsigned char, unsigned int)">;
+}
+
+let Features = "amx-fp16", Attributes = [NoThrow] in {
+  def tdpfp16ps : X86Builtin<"void(_Constant unsigned char, _Constant unsigned char, _Constant unsigned char)">;
+}
+
+let Features = "amx-fp8", Attributes = [NoThrow] in {
+  def tdpbf8ps : X86Builtin<"void(_Constant unsigned char, unsigned _Constant char, unsigned _Constant char)">;
+  def tdpbhf8ps : X86Builtin<"void(_Constant unsigned char, unsigned _Constant char, unsigned _Constant char)">;
+  def tdphbf8ps : X86Builtin<"void(_Constant unsigned char, unsigned _Constant char, unsigned _Constant char)">;
+  def tdphf8ps : X86Builtin<"void(_Constant unsigned char, unsigned _Constant char, unsigned _Constant char)">;
+}
+
+let Features = "amx-tf32", Attributes = [NoThrow] in {
+  def tmmultf32ps : X86Builtin<"void(_Constant unsigned char, _Constant unsigned char, _Constant unsigned char)">;
+}
+
+let Features = "amx-tf32,amx-transpose", Attributes = [NoThrow] in {
+  def ttmmultf32ps : X86Builtin<"void(_Constant unsigned char, _Constant unsigned char, _Constant unsigned char)">;
+}
+
+let Features = "prefetchi", Attributes = [NoThrow, Const] in {
+  def prefetchi : X86Builtin<"void(void const *, unsigned int)">;
+}
+
+let Features = "cmpccxadd", Attributes = [NoThrow] in {
+  def cmpccxadd32 : X86Builtin<"signed int(void *, signed int, signed int, _Constant int)">;
+  def cmpccxadd64 : X86Builtin<"signed long long int(signed long long int *, signed long long int, signed long long int, _Constant int)">;
+}
+
+let Features = "raoint", Attributes = [NoThrow] in {
+  def aadd64 : X86Builtin<"void(void *, signed long long int)">;
+  def aand64 : X86Builtin<"void(void *, signed long long int)">;
+  def aor64 : X86Builtin<"void(void *, signed long long int)">;
+  def axor64 : X86Builtin<"void(void *, signed long long int)">;
+}
+
+let Features = "movrs", Attributes = [NoThrow] in {
+  def movrsqi : X86Builtin<"signed char(void const *)">;
+  def movrshi : X86Builtin<"signed short(void const *)">;
+  def movrssi : X86Builtin<"signed int(void const *)">;
+  def movrsdi : X86Builtin<"signed long long int(void const *)">;
+}
+
+let Features = "movrs,avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def vmovrsb128 : X86Builtin<"_Vector<16, char>(_Vector<16, char const *>)">;
+}
+
+let Features = "movrs,avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def vmovrsb256 : X86Builtin<"_Vector<32, char>(_Vector<32, char const *>)">;
+}
+
+let Features = "movrs,avx10.2-512", Attributes = [NoThrow, RequiredVectorWidth<512>] in {
+  def vmovrsb512 : X86Builtin<"_Vector<64, char>(_Vector<64, char const *>)">;
+}
+
+let Features = "movrs,avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def vmovrsd128 : X86Builtin<"_Vector<4, int>(_Vector<4, int const *>)">;
+}
+
+let Features = "movrs,avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def vmovrsd256 : X86Builtin<"_Vector<8, int>(_Vector<8, int const *>)">;
+}
+
+let Features = "movrs,avx10.2-512", Attributes = [NoThrow, RequiredVectorWidth<512>] in {
+  def vmovrsd512 : X86Builtin<"_Vector<16, int>(_Vector<16, int const *>)">;
+}
+
+let Features = "movrs,avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def vmovrsq128 : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int const *>)">;
+}
+
+let Features = "movrs,avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def vmovrsq256 : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int const *>)">;
+}
+
+let Features = "movrs,avx10.2-512", Attributes = [NoThrow, RequiredVectorWidth<512>] in {
+  def vmovrsq512 : X86Builtin<"_Vector<8, long long int>(_Vector<8, long long int const *>)">;
+}
+
+let Features = "movrs,avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+  def vmovrsw128 : X86Builtin<"_Vector<8, short>(_Vector<8, short const *>)">;
+}
+
+let Features = "movrs,avx10.2-256", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
+  def vmovrsw256 : X86Builtin<"_Vector<16, short>(_Vector<16, short const *>)">;
+}
+
+let Features = "movrs,avx10.2-512", Attributes = [NoThrow, RequiredVectorWidth<512>] in {
+  def vmovrsw512 : X86Builtin<"_Vector<32, short>(_Vector<32, short const *>)">;
+}
diff --git a/clang/include/clang/Basic/CMakeLists.txt b/clang/include/clang/Basic/CMakeLists.txt
index 76ac336..897a610 100644
--- a/clang/include/clang/Basic/CMakeLists.txt
+++ b/clang/include/clang/Basic/CMakeLists.txt
@@ -60,10 +60,18 @@ clang_tablegen(BuiltinsRISCV.inc -gen-clang-builtins
   SOURCE BuiltinsRISCV.td
   TARGET ClangBuiltinsRISCV)
 
+clang_tablegen(BuiltinsSPIRV.inc -gen-clang-builtins
+  SOURCE BuiltinsSPIRV.td
+  TARGET ClangBuiltinsSPIRV)
+
 clang_tablegen(BuiltinsX86.inc -gen-clang-builtins
   SOURCE BuiltinsX86.td
   TARGET ClangBuiltinsX86)
 
+clang_tablegen(BuiltinsX86_64.inc -gen-clang-builtins
+  SOURCE BuiltinsX86_64.td
+  TARGET ClangBuiltinsX86_64)
+
 # ARM NEON and MVE
 clang_tablegen(arm_neon.inc -gen-arm-neon-sema
   SOURCE arm_neon.td
diff --git a/clang/include/clang/Basic/CodeGenOptions.h b/clang/include/clang/Basic/CodeGenOptions.h
index 8097c9e..c555fb3 100644
--- a/clang/include/clang/Basic/CodeGenOptions.h
+++ b/clang/include/clang/Basic/CodeGenOptions.h
@@ -186,7 +186,7 @@ public:
   std::string ProfileExcludeFiles;
 
   /// The version string to put into coverage files.
-  char CoverageVersion[4];
+  char CoverageVersion[4] = {'0', '0', '0', '0'};
 
   /// Enable additional debugging information.
   std::string DebugPass;
diff --git a/clang/include/clang/Basic/DiagnosticDriverKinds.td b/clang/include/clang/Basic/DiagnosticDriverKinds.td
index 5155b23..42c39ac 100644
--- a/clang/include/clang/Basic/DiagnosticDriverKinds.td
+++ b/clang/include/clang/Basic/DiagnosticDriverKinds.td
@@ -67,7 +67,7 @@ def err_drv_no_cuda_libdevice : Error<
   "libdevice">;
 
 def err_drv_no_rocm_device_lib : Error<
-  "cannot find ROCm device library%select{| for %1|for ABI version %1}0; provide its path via "
+  "cannot find ROCm device library%select{| for %1| for ABI version %1}0; provide its path via "
   "'--rocm-path' or '--rocm-device-lib-path', or pass '-nogpulib' to build "
   "without ROCm device library">;
 def err_drv_no_hip_runtime : Error<
diff --git a/clang/include/clang/Basic/DiagnosticFrontendKinds.td b/clang/include/clang/Basic/DiagnosticFrontendKinds.td
index 1ed379c..f3593f5 100644
--- a/clang/include/clang/Basic/DiagnosticFrontendKinds.td
+++ b/clang/include/clang/Basic/DiagnosticFrontendKinds.td
@@ -291,6 +291,8 @@ def warn_function_always_inline_attribute_mismatch : Warning<
   "inlining may change runtime behaviour">, InGroup<AArch64SMEAttributes>;
 def err_function_always_inline_new_za : Error<
   "always_inline function %0 has new za state">;
+def err_function_always_inline_new_zt0
+    : Error<"always_inline function %0 has new zt0 state">;
 
 def warn_avx_calling_convention
     : Warning<"AVX vector %select{return|argument}0 of type %1 without '%2' "
diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td
index 330ae04..ab2d623 100644
--- a/clang/include/clang/Basic/DiagnosticSemaKinds.td
+++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td
@@ -5878,7 +5878,7 @@ def err_pack_expansion_without_parameter_packs : Error<
   "pack expansion does not contain any unexpanded parameter packs">;
 def err_pack_expansion_length_conflict : Error<
   "pack expansion contains parameter packs %0 and %1 that have different "
-  "lengths (%2 vs. %select{|at least }3%4))">;
+  "lengths (%2 vs. %select{|at least }3%4)">;
 def err_pack_expansion_length_conflict_multilevel : Error<
   "pack expansion contains parameter pack %0 that has a different "
   "length (%1 vs. %select{|at least }2%3) from outer parameter packs">;
@@ -9361,7 +9361,7 @@ def note_inequality_comparison_to_or_assign : Note<
   "use '|=' to turn this inequality comparison into an or-assignment">;
 
 def err_incomplete_type_used_in_type_trait_expr : Error<
-  "incomplete type %0 used in type trait expression">;
+  "incomplete type %0 used in type trait expression">, NoSFINAE;
 
 // C++20 constinit and require_constant_initialization attribute
 def warn_cxx20_compat_constinit : Warning<
@@ -10512,6 +10512,10 @@ def warn_second_parameter_to_va_arg_ownership_qualified : Warning<
 def warn_second_parameter_to_va_arg_never_compatible : Warning<
   "second argument to 'va_arg' is of promotable type %0; this va_arg has "
   "undefined behavior because arguments will be promoted to %1">, InGroup<Varargs>;
+def warn_second_parameter_to_va_arg_array : Warning<
+  "second argument to 'va_arg' is of array type %0; "
+  "this va_arg has undefined behavior because arguments "
+  "will never be compatible with array type">, InGroup<Varargs>;
 
 def warn_return_missing_expr : Warning<
   "non-void %select{function|method}1 %0 should return a value">, DefaultError,
@@ -12823,6 +12827,10 @@ def err_acc_loop_not_monotonic
             "('++', '--', or compound assignment)">;
 def err_acc_construct_one_clause_of
     : Error<"OpenACC '%0' construct must have at least one %1 clause">;
+def err_acc_update_as_body
+    : Error<"OpenACC 'update' construct may not appear in place of the "
+            "statement following a%select{n if statement| while statement| do "
+            "statement| switch statement| label statement}0">;
 
 // AMDGCN builtins diagnostics
 def err_amdgcn_global_load_lds_size_invalid_value : Error<"invalid size value">;
diff --git a/clang/include/clang/Basic/OpenACCClauses.def b/clang/include/clang/Basic/OpenACCClauses.def
index 1b619bc..d6fb015 100644
--- a/clang/include/clang/Basic/OpenACCClauses.def
+++ b/clang/include/clang/Basic/OpenACCClauses.def
@@ -38,6 +38,7 @@ VISIT_CLAUSE(Create)
 CLAUSE_ALIAS(PCreate, Create, true)
 CLAUSE_ALIAS(PresentOrCreate, Create, true)
 VISIT_CLAUSE(Default)
+VISIT_CLAUSE(DefaultAsync)
 VISIT_CLAUSE(Delete)
 VISIT_CLAUSE(Detach)
 VISIT_CLAUSE(DeviceNum)
diff --git a/clang/include/clang/Basic/StmtNodes.td b/clang/include/clang/Basic/StmtNodes.td
index 31280df..ce2c48b 100644
--- a/clang/include/clang/Basic/StmtNodes.td
+++ b/clang/include/clang/Basic/StmtNodes.td
@@ -315,6 +315,8 @@ def OpenACCHostDataConstruct : StmtNode<OpenACCAssociatedStmtConstruct>;
 def OpenACCWaitConstruct : StmtNode<OpenACCConstructStmt>;
 def OpenACCInitConstruct : StmtNode<OpenACCConstructStmt>;
 def OpenACCShutdownConstruct : StmtNode<OpenACCConstructStmt>;
+def OpenACCSetConstruct : StmtNode<OpenACCConstructStmt>;
+def OpenACCUpdateConstruct : StmtNode<OpenACCConstructStmt>;
 
 // OpenACC Additional Expressions.
 def OpenACCAsteriskSizeExpr : StmtNode<Expr>;
diff --git a/clang/include/clang/Basic/TargetBuiltins.h b/clang/include/clang/Basic/TargetBuiltins.h
index a14fd2c..4dc8b24 100644
--- a/clang/include/clang/Basic/TargetBuiltins.h
+++ b/clang/include/clang/Basic/TargetBuiltins.h
@@ -119,18 +119,26 @@ namespace clang {
   };
   }
 
+  /// SPIRV builtins
+  namespace SPIRV {
+  enum {
+    LastTIBuiltin = clang::Builtin::FirstTSBuiltin - 1,
+#define BUILTIN(ID, TYPE, ATTRS) BI##ID,
+#include "clang/Basic/BuiltinsSPIRV.inc"
+    LastTSBuiltin
+  };
+  } // namespace SPIRV
+
   /// X86 builtins
   namespace X86 {
   enum {
     LastTIBuiltin = clang::Builtin::FirstTSBuiltin - 1,
 #define BUILTIN(ID, TYPE, ATTRS) BI##ID,
-#include "clang/Basic/BuiltinsX86.def"
-#define BUILTIN(ID, TYPE, ATTRS) BI##ID,
 #include "clang/Basic/BuiltinsX86.inc"
     FirstX86_64Builtin,
     LastX86CommonBuiltin = FirstX86_64Builtin - 1,
 #define BUILTIN(ID, TYPE, ATTRS) BI##ID,
-#include "clang/Basic/BuiltinsX86_64.def"
+#include "clang/Basic/BuiltinsX86_64.inc"
     LastTSBuiltin
   };
   }
diff --git a/clang/include/clang/Basic/TargetInfo.h b/clang/include/clang/Basic/TargetInfo.h
index 82bd537..43c09cf 100644
--- a/clang/include/clang/Basic/TargetInfo.h
+++ b/clang/include/clang/Basic/TargetInfo.h
@@ -358,7 +358,14 @@ public:
     //    void *__saved_reg_area_end_pointer;
     //    void *__overflow_area_pointer;
     //} va_list;
-    HexagonBuiltinVaList
+    HexagonBuiltinVaList,
+
+    // typedef struct __va_list_tag {
+    //    int* __va_stk;
+    //    int* __va_reg;
+    //    int __va_ndx;
+    //} va_list;
+    XtensaABIBuiltinVaList
   };
 
 protected:
@@ -1524,7 +1531,7 @@ public:
 
   // Return the target-specific priority for features/cpus/vendors so
   // that they can be properly sorted for checking.
-  virtual unsigned getFMVPriority(ArrayRef<StringRef> Features) const {
+  virtual uint64_t getFMVPriority(ArrayRef<StringRef> Features) const {
     return 0;
   }
 
diff --git a/clang/include/clang/Basic/arm_immcheck_incl.td b/clang/include/clang/Basic/arm_immcheck_incl.td
index 9d7f74a..6892b82 100644
--- a/clang/include/clang/Basic/arm_immcheck_incl.td
+++ b/clang/include/clang/Basic/arm_immcheck_incl.td
@@ -2,7 +2,9 @@ class ImmCheckType<int val> {
   int Value = val;
 }
 
-// These must be kept in sync with the flags in include/clang/Basic/TargetBuiltins.h
+
+// For SVE, container_size refers to the width of a vector segment (128b).
+// For NEON, container_size refers to the vector width (64b or 128b).
 def ImmCheck0_31                : ImmCheckType<0>;  // 0..31 (used for e.g. predicate patterns)
 def ImmCheck1_16                : ImmCheckType<1>;  // 1..16
 def ImmCheckExtract             : ImmCheckType<2>;  // 0..(2048/sizeinbits(elt) - 1)
@@ -10,10 +12,10 @@ def ImmCheckShiftRight          : ImmCheckType<3>;  // 1..sizeinbits(elt)
 def ImmCheckShiftRightNarrow    : ImmCheckType<4>;  // 1..sizeinbits(elt)/2
 def ImmCheckShiftLeft           : ImmCheckType<5>;  // 0..(sizeinbits(elt) - 1)
 def ImmCheck0_7                 : ImmCheckType<6>;  // 0..7
-def ImmCheckLaneIndex           : ImmCheckType<7>;  // 0..(sizeinbits(vec)/(sizeinbits(elt)) - 1)
+def ImmCheckLaneIndex           : ImmCheckType<7>;  // 0..(container_size/(sizeinbits(elt)) - 1)
 def ImmCheckCvt                 : ImmCheckType<8>;  // 1..sizeinbits(elt) (same as ShiftRight)
-def ImmCheckLaneIndexCompRotate : ImmCheckType<9>;  // 0..(sizeinbits(vec)/(2*sizeinbits(elt)) - 1)
-def ImmCheckLaneIndexDot        : ImmCheckType<10>; // 0..(sizeinbits(vec)/(4*sizeinbits(elt)) - 1)
+def ImmCheckLaneIndexCompRotate : ImmCheckType<9>;  // 0..(container_size/(2*sizeinbits(elt)) - 1)
+def ImmCheckLaneIndexDot        : ImmCheckType<10>; // 0..(container_size/(4*sizeinbits(elt)) - 1)
 def ImmCheckComplexRot90_270    : ImmCheckType<11>; // [90,270]
 def ImmCheckComplexRotAll90     : ImmCheckType<12>; // [0, 90, 180,270]
 def ImmCheck0_13                : ImmCheckType<13>; // 0..13
diff --git a/clang/include/clang/Basic/arm_sme.td b/clang/include/clang/Basic/arm_sme.td
index 6b31dec..891ed98 100644
--- a/clang/include/clang/Basic/arm_sme.td
+++ b/clang/include/clang/Basic/arm_sme.td
@@ -716,6 +716,8 @@ let SMETargetGuard = "sme2" in {
   def SVZERO_ZT : Inst<"svzero_zt", "vi", "", MergeNone, "aarch64_sme_zero_zt", [IsOverloadNone, IsStreamingCompatible, IsOutZT0], [ImmCheck<0, ImmCheck0_0>]>;
 }
 
+def IN_STREAMING_MODE :  Inst<"__arm_in_streaming_mode", "sv", "Pc", MergeNone, "aarch64_sme_in_streaming_mode", [IsOverloadNone, IsStreamingCompatible], []>;
+
 //
 // lookup table expand four contiguous registers
 //
diff --git a/clang/include/clang/CIR/Dialect/Builder/CIRBaseBuilder.h b/clang/include/clang/CIR/Dialect/Builder/CIRBaseBuilder.h
index 0e41492..b4a961d 100644
--- a/clang/include/clang/CIR/Dialect/Builder/CIRBaseBuilder.h
+++ b/clang/include/clang/CIR/Dialect/Builder/CIRBaseBuilder.h
@@ -9,7 +9,11 @@
 #ifndef LLVM_CLANG_CIR_DIALECT_BUILDER_CIRBASEBUILDER_H
 #define LLVM_CLANG_CIR_DIALECT_BUILDER_CIRBASEBUILDER_H
 
+#include "clang/CIR/Dialect/IR/CIRAttrs.h"
+
 #include "mlir/IR/Builders.h"
+#include "mlir/IR/BuiltinTypes.h"
+#include "mlir/IR/Types.h"
 
 namespace cir {
 
@@ -26,6 +30,13 @@ public:
   cir::PointerType getVoidPtrTy() {
     return getPointerTo(cir::VoidType::get(getContext()));
   }
+
+  mlir::TypedAttr getConstPtrAttr(mlir::Type type, int64_t value) {
+    auto valueAttr = mlir::IntegerAttr::get(
+        mlir::IntegerType::get(type.getContext(), 64), value);
+    return cir::ConstPtrAttr::get(
+        getContext(), mlir::cast<cir::PointerType>(type), valueAttr);
+  }
 };
 
 } // namespace cir
diff --git a/clang/include/clang/CIR/Dialect/IR/CIRAttrs.h b/clang/include/clang/CIR/Dialect/IR/CIRAttrs.h
new file mode 100644
index 0000000..438fb7d
--- /dev/null
+++ b/clang/include/clang/CIR/Dialect/IR/CIRAttrs.h
@@ -0,0 +1,36 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the attributes in the CIR dialect.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CLANG_CIR_DIALECT_IR_CIRATTRS_H
+#define LLVM_CLANG_CIR_DIALECT_IR_CIRATTRS_H
+
+#include "clang/CIR/Dialect/IR/CIRTypes.h"
+
+#include "mlir/IR/Attributes.h"
+#include "mlir/IR/BuiltinAttributeInterfaces.h"
+
+#include "llvm/ADT/SmallVector.h"
+
+//===----------------------------------------------------------------------===//
+// CIR Dialect Attrs
+//===----------------------------------------------------------------------===//
+
+namespace clang {
+class FunctionDecl;
+class VarDecl;
+class RecordDecl;
+} // namespace clang
+
+#define GET_ATTRDEF_CLASSES
+#include "clang/CIR/Dialect/IR/CIROpsAttributes.h.inc"
+
+#endif // LLVM_CLANG_CIR_DIALECT_IR_CIRATTRS_H
diff --git a/clang/include/clang/CIR/Dialect/IR/CIRAttrs.td b/clang/include/clang/CIR/Dialect/IR/CIRAttrs.td
new file mode 100644
index 0000000..bd1665e
--- /dev/null
+++ b/clang/include/clang/CIR/Dialect/IR/CIRAttrs.td
@@ -0,0 +1,142 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the CIR dialect attributes.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CLANG_CIR_DIALECT_IR_CIRATTRS_TD
+#define LLVM_CLANG_CIR_DIALECT_IR_CIRATTRS_TD
+
+include "mlir/IR/BuiltinAttributeInterfaces.td"
+include "mlir/IR/EnumAttr.td"
+
+include "clang/CIR/Dialect/IR/CIRDialect.td"
+
+//===----------------------------------------------------------------------===//
+// CIR Attrs
+//===----------------------------------------------------------------------===//
+
+class CIR_Attr<string name, string attrMnemonic, list<Trait> traits = []>
+    : AttrDef<CIR_Dialect, name, traits> {
+  let mnemonic = attrMnemonic;
+}
+
+class CIRUnitAttr<string name, string attrMnemonic, list<Trait> traits = []>
+    : CIR_Attr<name, attrMnemonic, traits> {
+  let returnType = "bool";
+  let defaultValue = "false";
+  let valueType = NoneType;
+  let isOptional = 1;
+}
+
+//===----------------------------------------------------------------------===//
+// IntegerAttr
+//===----------------------------------------------------------------------===//
+
+def IntAttr : CIR_Attr<"Int", "int", [TypedAttrInterface]> {
+  let summary = "An attribute containing an integer value";
+  let description = [{
+    An integer attribute is a literal attribute that represents an integral
+    value of the specified integer type.
+  }];
+  let parameters = (ins AttributeSelfTypeParameter<"">:$type,
+                        "llvm::APInt":$value);
+  let builders = [
+    AttrBuilderWithInferredContext<(ins "mlir::Type":$type,
+                                        "const llvm::APInt &":$value), [{
+      return $_get(type.getContext(), type, value);
+    }]>,
+    AttrBuilderWithInferredContext<(ins "mlir::Type":$type,
+                                        "int64_t":$value), [{
+      IntType intType = mlir::cast<IntType>(type);
+      mlir::APInt apValue(intType.getWidth(), value, intType.isSigned());
+      return $_get(intType.getContext(), intType, apValue);
+    }]>,
+  ];
+  let extraClassDeclaration = [{
+    int64_t getSInt() const { return getValue().getSExtValue(); }
+    uint64_t getUInt() const { return getValue().getZExtValue(); }
+    bool isNullValue() const { return getValue() == 0; }
+    uint64_t getBitWidth() const {
+      return mlir::cast<IntType>(getType()).getWidth();
+    }
+  }];
+  let genVerifyDecl = 1;
+  let hasCustomAssemblyFormat = 1;
+}
+
+//===----------------------------------------------------------------------===//
+// FPAttr
+//===----------------------------------------------------------------------===//
+
+def FPAttr : CIR_Attr<"FP", "fp", [TypedAttrInterface]> {
+  let summary = "An attribute containing a floating-point value";
+  let description = [{
+    An fp attribute is a literal attribute that represents a floating-point
+    value of the specified floating-point type. Supporting only CIR FP types.
+  }];
+  let parameters = (ins
+    AttributeSelfTypeParameter<"", "::cir::CIRFPTypeInterface">:$type,
+    APFloatParameter<"">:$value
+  );
+  let builders = [
+    AttrBuilderWithInferredContext<(ins "mlir::Type":$type,
+                                        "const llvm::APFloat &":$value), [{
+      return $_get(type.getContext(), mlir::cast<CIRFPTypeInterface>(type),
+                   value);
+    }]>,
+    AttrBuilder<(ins "mlir::Type":$type,
+                     "const llvm::APFloat &":$value), [{
+      return $_get($_ctxt, mlir::cast<CIRFPTypeInterface>(type), value);
+    }]>,
+  ];
+  let extraClassDeclaration = [{
+    static FPAttr getZero(mlir::Type type);
+  }];
+  let genVerifyDecl = 1;
+
+  let assemblyFormat = [{
+    `<` custom<FloatLiteral>($value, ref($type)) `>`
+  }];
+}
+
+//===----------------------------------------------------------------------===//
+// ConstPtrAttr
+//===----------------------------------------------------------------------===//
+
+def ConstPtrAttr : CIR_Attr<"ConstPtr", "ptr", [TypedAttrInterface]> {
+  let summary = "Holds a constant pointer value";
+  let parameters = (ins
+    AttributeSelfTypeParameter<"", "::cir::PointerType">:$type,
+    "mlir::IntegerAttr":$value);
+  let description = [{
+    A pointer attribute is a literal attribute that represents an integral
+    value of a pointer type.
+  }];
+  let builders = [
+    AttrBuilderWithInferredContext<(ins "mlir::Type":$type,
+                                        "mlir::IntegerAttr":$value), [{
+      return $_get(type.getContext(), mlir::cast<cir::PointerType>(type),
+                   value);
+    }]>,
+    AttrBuilder<(ins "mlir::Type":$type,
+                     "mlir::IntegerAttr":$value), [{
+      return $_get($_ctxt, mlir::cast<cir::PointerType>(type), value);
+    }]>,
+  ];
+  let extraClassDeclaration = [{
+    bool isNullValue() const { return getValue().getInt() == 0; }
+  }];
+
+  let assemblyFormat = [{
+    `<` custom<ConstPtr>($value) `>`
+  }];
+}
+
+#endif // LLVM_CLANG_CIR_DIALECT_IR_CIRATTRS_TD
diff --git a/clang/include/clang/CIR/Dialect/IR/CIRDialect.h b/clang/include/clang/CIR/Dialect/IR/CIRDialect.h
index 0b71bda..683176b 100644
--- a/clang/include/clang/CIR/Dialect/IR/CIRDialect.h
+++ b/clang/include/clang/CIR/Dialect/IR/CIRDialect.h
@@ -26,6 +26,7 @@
 #include "mlir/Interfaces/MemorySlotInterfaces.h"
 #include "mlir/Interfaces/SideEffectInterfaces.h"
 
+#include "clang/CIR/Dialect/IR/CIRAttrs.h"
 #include "clang/CIR/Dialect/IR/CIROpsDialect.h.inc"
 
 // TableGen'erated files for MLIR dialects require that a macro be defined when
diff --git a/clang/include/clang/CIR/Dialect/IR/CIROps.td b/clang/include/clang/CIR/Dialect/IR/CIROps.td
index 0d6c65e..b15e041 100644
--- a/clang/include/clang/CIR/Dialect/IR/CIROps.td
+++ b/clang/include/clang/CIR/Dialect/IR/CIROps.td
@@ -16,6 +16,7 @@
 
 include "clang/CIR/Dialect/IR/CIRDialect.td"
 include "clang/CIR/Dialect/IR/CIRTypes.td"
+include "clang/CIR/Dialect/IR/CIRAttrs.td"
 
 include "mlir/IR/BuiltinAttributeInterfaces.td"
 include "mlir/IR/EnumAttr.td"
@@ -76,6 +77,45 @@ class CIR_Op<string mnemonic, list<Trait> traits = []> :
     Op<CIR_Dialect, mnemonic, traits>, LLVMLoweringInfo;
 
 //===----------------------------------------------------------------------===//
+// ConstantOp
+//===----------------------------------------------------------------------===//
+
+def ConstantOp : CIR_Op<"const",
+                        [ConstantLike, Pure, AllTypesMatch<["value", "res"]>]> {
+  let summary = "Defines a CIR constant";
+  let description = [{
+    The `cir.const` operation turns a literal into an SSA value. The data is
+    attached to the operation as an attribute.
+
+    ```mlir
+      %0 = cir.const 42 : i32
+      %1 = cir.const 4.2 : f32
+      %2 = cir.const nullptr : !cir.ptr<i32>
+    ```
+  }];
+
+  // The constant operation takes an attribute as the only input.
+  let arguments = (ins TypedAttrInterface:$value);
+
+  // The constant operation returns a single value of CIR_AnyType.
+  let results = (outs CIR_AnyType:$res);
+
+  let assemblyFormat = "attr-dict $value";
+
+  let hasVerifier = 1;
+
+  let extraClassDeclaration = [{
+    bool isNullPtr() {
+      if (const auto ptrAttr = mlir::dyn_cast<cir::ConstPtrAttr>(getValue()))
+        return ptrAttr.isNullValue();
+      return false;
+    }
+  }];
+
+  let hasFolder = 1;
+}
+
+//===----------------------------------------------------------------------===//
 // GlobalOp
 //===----------------------------------------------------------------------===//
 
@@ -92,9 +132,19 @@ def GlobalOp : CIR_Op<"global"> {
     described by the type of the variable.
   }];
 
-  let arguments = (ins SymbolNameAttr:$sym_name, TypeAttr:$sym_type);
+  let arguments = (ins SymbolNameAttr:$sym_name, TypeAttr:$sym_type,
+                       OptionalAttr<AnyAttr>:$initial_value);
+
+  let assemblyFormat = [{
+    $sym_name
+    custom<GlobalOpTypeAndInitialValue>($sym_type, $initial_value)
+    attr-dict
+  }];
 
-  let assemblyFormat = [{ $sym_name `:` $sym_type attr-dict }];
+  let extraClassDeclaration = [{
+    bool isDeclaration() { return !getInitialValue(); }
+    bool hasInitializer() { return !isDeclaration(); }
+  }];
 
   let skipDefaultBuilders = 1;
 
diff --git a/clang/include/clang/CIR/Dialect/IR/CIRTypes.td b/clang/include/clang/CIR/Dialect/IR/CIRTypes.td
index ef00b26..a32fb3c 100644
--- a/clang/include/clang/CIR/Dialect/IR/CIRTypes.td
+++ b/clang/include/clang/CIR/Dialect/IR/CIRTypes.td
@@ -220,8 +220,8 @@ def CIR_LongDouble : CIR_FloatType<"LongDouble", "long_double"> {
 
 // Constraints
 
-def CIR_AnyFloat: AnyTypeOf<[CIR_Single, CIR_Double, CIR_FP80, CIR_FP128, CIR_LongDouble,
-    CIR_FP16, CIR_BFloat16]>;
+def CIR_AnyFloat: AnyTypeOf<[CIR_Single, CIR_Double, CIR_FP80, CIR_FP128,
+                             CIR_LongDouble, CIR_FP16, CIR_BFloat16]>;
 def CIR_AnyIntOrFloat: AnyTypeOf<[CIR_AnyFloat, CIR_IntType]>;
 
 //===----------------------------------------------------------------------===//
@@ -350,4 +350,12 @@ def VoidPtr : Type<
       "cir::VoidType::get($_builder.getContext()))"> {
 }
 
+//===----------------------------------------------------------------------===//
+// Global type constraints
+//===----------------------------------------------------------------------===//
+
+def CIR_AnyType : AnyTypeOf<[
+  CIR_VoidType, CIR_IntType, CIR_AnyFloat, CIR_PointerType, CIR_FuncType
+]>;
+
 #endif // MLIR_CIR_DIALECT_CIR_TYPES
diff --git a/clang/include/clang/CIR/Dialect/IR/CMakeLists.txt b/clang/include/clang/CIR/Dialect/IR/CMakeLists.txt
index 28ae30d..1fdbc24 100644
--- a/clang/include/clang/CIR/Dialect/IR/CMakeLists.txt
+++ b/clang/include/clang/CIR/Dialect/IR/CMakeLists.txt
@@ -14,3 +14,6 @@ mlir_tablegen(CIROpsDialect.cpp.inc -gen-dialect-defs)
 add_public_tablegen_target(MLIRCIROpsIncGen)
 add_dependencies(mlir-headers MLIRCIROpsIncGen)
 
+mlir_tablegen(CIROpsAttributes.h.inc -gen-attrdef-decls)
+mlir_tablegen(CIROpsAttributes.cpp.inc -gen-attrdef-defs)
+add_public_tablegen_target(MLIRCIRAttrsEnumsGen)
diff --git a/clang/include/clang/CodeGen/BackendUtil.h b/clang/include/clang/CodeGen/BackendUtil.h
index fc8ed4f..7aa4f9d 100644
--- a/clang/include/clang/CodeGen/BackendUtil.h
+++ b/clang/include/clang/CodeGen/BackendUtil.h
@@ -14,46 +14,46 @@
 #include <memory>
 
 namespace llvm {
-  class BitcodeModule;
-  template <typename T> class Expected;
-  template <typename T> class IntrusiveRefCntPtr;
-  class Module;
-  class MemoryBufferRef;
-  namespace vfs {
-  class FileSystem;
-  } // namespace vfs
-}
+class BitcodeModule;
+template <typename T> class Expected;
+template <typename T> class IntrusiveRefCntPtr;
+class Module;
+class MemoryBufferRef;
+namespace vfs {
+class FileSystem;
+} // namespace vfs
+} // namespace llvm
 
 namespace clang {
-  class DiagnosticsEngine;
-  class HeaderSearchOptions;
-  class CodeGenOptions;
-  class TargetOptions;
-  class LangOptions;
-  class BackendConsumer;
-
-  enum BackendAction {
-    Backend_EmitAssembly,  ///< Emit native assembly files
-    Backend_EmitBC,        ///< Emit LLVM bitcode files
-    Backend_EmitLL,        ///< Emit human-readable LLVM assembly
-    Backend_EmitNothing,   ///< Don't emit anything (benchmarking mode)
-    Backend_EmitMCNull,    ///< Run CodeGen, but don't emit anything
-    Backend_EmitObj        ///< Emit native object files
-  };
-
-  void EmitBackendOutput(DiagnosticsEngine &Diags, const HeaderSearchOptions &,
-                         const CodeGenOptions &CGOpts,
-                         const TargetOptions &TOpts, const LangOptions &LOpts,
-                         StringRef TDesc, llvm::Module *M, BackendAction Action,
-                         llvm::IntrusiveRefCntPtr<llvm::vfs::FileSystem> VFS,
-                         std::unique_ptr<raw_pwrite_stream> OS,
-                         BackendConsumer *BC = nullptr);
-
-  void EmbedBitcode(llvm::Module *M, const CodeGenOptions &CGOpts,
-                    llvm::MemoryBufferRef Buf);
-
-  void EmbedObject(llvm::Module *M, const CodeGenOptions &CGOpts,
-                   DiagnosticsEngine &Diags);
-}
+class DiagnosticsEngine;
+class HeaderSearchOptions;
+class CodeGenOptions;
+class TargetOptions;
+class LangOptions;
+class BackendConsumer;
+
+enum BackendAction {
+  Backend_EmitAssembly, ///< Emit native assembly files
+  Backend_EmitBC,       ///< Emit LLVM bitcode files
+  Backend_EmitLL,       ///< Emit human-readable LLVM assembly
+  Backend_EmitNothing,  ///< Don't emit anything (benchmarking mode)
+  Backend_EmitMCNull,   ///< Run CodeGen, but don't emit anything
+  Backend_EmitObj       ///< Emit native object files
+};
+
+void EmitBackendOutput(DiagnosticsEngine &Diags, const HeaderSearchOptions &,
+                       const CodeGenOptions &CGOpts, const TargetOptions &TOpts,
+                       const LangOptions &LOpts, StringRef TDesc,
+                       llvm::Module *M, BackendAction Action,
+                       llvm::IntrusiveRefCntPtr<llvm::vfs::FileSystem> VFS,
+                       std::unique_ptr<raw_pwrite_stream> OS,
+                       BackendConsumer *BC = nullptr);
+
+void EmbedBitcode(llvm::Module *M, const CodeGenOptions &CGOpts,
+                  llvm::MemoryBufferRef Buf);
+
+void EmbedObject(llvm::Module *M, const CodeGenOptions &CGOpts,
+                 DiagnosticsEngine &Diags);
+} // namespace clang
 
 #endif
diff --git a/clang/include/clang/Driver/Action.h b/clang/include/clang/Driver/Action.h
index 04fa8b0..feeabae 100644
--- a/clang/include/clang/Driver/Action.h
+++ b/clang/include/clang/Driver/Action.h
@@ -94,6 +94,7 @@ public:
     OFK_Cuda = 0x02,
     OFK_OpenMP = 0x04,
     OFK_HIP = 0x08,
+    OFK_SYCL = 0x10,
   };
 
   static const char *getClassName(ActionClass AC);
diff --git a/clang/include/clang/Driver/Driver.h b/clang/include/clang/Driver/Driver.h
index c23d037..80bce57 100644
--- a/clang/include/clang/Driver/Driver.h
+++ b/clang/include/clang/Driver/Driver.h
@@ -741,6 +741,11 @@ private:
   /// \returns true if error occurred.
   bool loadDefaultConfigFiles(llvm::cl::ExpansionContext &ExpCtx);
 
+  /// Tries to load options from customization file.
+  ///
+  /// \returns true if error occurred.
+  bool loadZOSCustomizationFile(llvm::cl::ExpansionContext &);
+
   /// Read options from the specified file.
   ///
   /// \param [in] FileName File to read.
diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td
index d922709..5282343 100644
--- a/clang/include/clang/Driver/Options.td
+++ b/clang/include/clang/Driver/Options.td
@@ -182,7 +182,8 @@ def opencl_Group : OptionGroup<"<opencl group>">, Group<f_Group>,
                    DocName<"OpenCL options">;
 
 def sycl_Group : OptionGroup<"<SYCL group>">, Group<f_Group>,
-                 DocName<"SYCL options">;
+                 DocName<"SYCL options">,
+                 Visibility<[ClangOption, CLOption]>;
 
 def cuda_Group : OptionGroup<"<CUDA group>">, Group<f_Group>,
                    DocName<"CUDA options">,
@@ -1493,6 +1494,8 @@ def libomptarget_amdgcn_bc_path_EQ : Joined<["--"], "libomptarget-amdgcn-bc-path
   HelpText<"Path to libomptarget-amdgcn bitcode library">, Alias<libomptarget_amdgpu_bc_path_EQ>;
 def libomptarget_nvptx_bc_path_EQ : Joined<["--"], "libomptarget-nvptx-bc-path=">, Group<i_Group>,
   HelpText<"Path to libomptarget-nvptx bitcode library">;
+def libomptarget_spirv_bc_path_EQ : Joined<["--"], "libomptarget-spirv-bc-path=">, Group<i_Group>,
+  HelpText<"Path to libomptarget-spirv bitcode library">;
 def dD : Flag<["-"], "dD">, Group<d_Group>, Visibility<[ClangOption, CC1Option]>,
   HelpText<"Print macro definitions in -E mode in addition to normal output">;
 def dI : Flag<["-"], "dI">, Group<d_Group>, Visibility<[ClangOption, CC1Option]>,
@@ -1888,7 +1891,7 @@ defm pseudo_probe_for_profiling : BoolFOption<"pseudo-probe-for-profiling",
           " pseudo probes for sample profiling">>;
 def forder_file_instrumentation : Flag<["-"], "forder-file-instrumentation">,
     Group<f_Group>, Visibility<[ClangOption, CC1Option, CLOption]>,
-    HelpText<"Generate instrumented code to collect order file into default.profraw file (overridden by '=' form of option or LLVM_PROFILE_FILE env var)">;
+    HelpText<"Generate instrumented code to collect order file into default.profraw file (overridden by '=' form of option or LLVM_PROFILE_FILE env var). Deprecated, please use temporal profiling.">;
 def fprofile_list_EQ : Joined<["-"], "fprofile-list=">,
     Group<f_Group>, Visibility<[ClangOption, CC1Option, CLOption]>,
     HelpText<"Filename defining the list of functions/files to instrument. "
@@ -3506,8 +3509,6 @@ def fno_verbose_asm : Flag<["-"], "fno-verbose-asm">, Group<f_Group>,
   Visibility<[ClangOption, CC1Option]>,
   MarshallingInfoNegativeFlag<CodeGenOpts<"AsmVerbose">>;
 def fno_working_directory : Flag<["-"], "fno-working-directory">, Group<f_Group>;
-def fno_wrapv : Flag<["-"], "fno-wrapv">, Group<f_Group>,
-  Visibility<[ClangOption, FlangOption]>;
 def fobjc_arc : Flag<["-"], "fobjc-arc">, Group<f_Group>,
   Visibility<[ClangOption, CC1Option]>,
   HelpText<"Synthesize retain and release calls for Objective-C pointers">;
@@ -4278,8 +4279,10 @@ defm virtual_function_elimination : BoolFOption<"virtual-function-elimination",
   NegFlag<SetFalse>, BothFlags<[], [ClangOption, CLOption]>>;
 
 def fwrapv : Flag<["-"], "fwrapv">, Group<f_Group>,
-  Visibility<[ClangOption, CC1Option, FlangOption, FC1Option]>,
+  Visibility<[ClangOption, CLOption, CC1Option, FlangOption, FC1Option]>,
   HelpText<"Treat signed integer overflow as two's complement">;
+def fno_wrapv : Flag<["-"], "fno-wrapv">, Group<f_Group>,
+  Visibility<[ClangOption, CLOption, FlangOption]>;
 def fwritable_strings : Flag<["-"], "fwritable-strings">, Group<f_Group>,
   Visibility<[ClangOption, CC1Option]>,
   HelpText<"Store string literals as writable data">,
@@ -6837,16 +6840,21 @@ defm : FlangIgnoredDiagOpt<"frontend-loop-interchange">;
 defm : FlangIgnoredDiagOpt<"target-lifetime">;
 
 // C++ SYCL options
+let Group = sycl_Group in {
 def fsycl : Flag<["-"], "fsycl">,
-  Visibility<[ClangOption, CLOption]>,
-  Group<sycl_Group>, HelpText<"Enables SYCL kernels compilation for device">;
+  HelpText<"Enable SYCL C++ extensions">;
 def fno_sycl : Flag<["-"], "fno-sycl">,
-  Visibility<[ClangOption, CLOption]>,
-  Group<sycl_Group>, HelpText<"Disables SYCL kernels compilation for device">;
+  HelpText<"Disable SYCL C++ extensions">;
+def fsycl_device_only : Flag<["-"], "fsycl-device-only">,
+  Alias<offload_device_only>, HelpText<"Compile SYCL code for device only">;
+def fsycl_host_only : Flag<["-"], "fsycl-host-only">,
+  Alias<offload_host_only>, HelpText<"Compile SYCL code for host only. Has no "
+  "effect on non-SYCL compilations">;
 def sycl_link : Flag<["--"], "sycl-link">, Flags<[HelpHidden]>,
-  Visibility<[ClangOption, CLOption]>,
-  Group<sycl_Group>, HelpText<"Perform link through clang-sycl-linker via the target "
+  HelpText<"Perform link through clang-sycl-linker via the target "
   "offloading toolchain.">;
+} // let Group = sycl_Group
+
 // OS-specific options
 let Flags = [TargetSpecific] in {
 defm android_pad_segment : BooleanFFlag<"android-pad-segment">, Group<f_Group>;
@@ -6937,6 +6945,9 @@ defm unsigned : OptInFC1FFlag<"unsigned", "Enables UNSIGNED type">;
 def fno_automatic : Flag<["-"], "fno-automatic">, Group<f_Group>,
   HelpText<"Implies the SAVE attribute for non-automatic local objects in subprograms unless RECURSIVE">;
 
+def fsave_main_program : Flag<["-"], "fsave-main-program">, Group<f_Group>,
+  HelpText<"Place all variables from the main program in static memory (otherwise scalars may be placed on the stack)">;
+
 defm stack_arrays : BoolOptionWithoutMarshalling<"f", "stack-arrays",
   PosFlag<SetTrue, [], [ClangOption], "Attempt to allocate array temporaries on the stack, no matter their size">,
   NegFlag<SetFalse, [], [ClangOption], "Allocate array temporaries on the heap (default)">>;
diff --git a/clang/include/clang/Driver/SanitizerArgs.h b/clang/include/clang/Driver/SanitizerArgs.h
index 7410ad4..3b27509 100644
--- a/clang/include/clang/Driver/SanitizerArgs.h
+++ b/clang/include/clang/Driver/SanitizerArgs.h
@@ -99,6 +99,7 @@ public:
   }
   bool needsFuzzerInterceptors() const;
   bool needsUbsanRt() const;
+  bool needsUbsanCXXRt() const;
   bool requiresMinimalRuntime() const { return MinimalRuntime; }
   bool needsDfsanRt() const { return Sanitizers.has(SanitizerKind::DataFlow); }
   bool needsSafeStackRt() const { return SafeStackRuntime; }
diff --git a/clang/include/clang/Driver/ToolChain.h b/clang/include/clang/Driver/ToolChain.h
index 5347e29..701a1d2 100644
--- a/clang/include/clang/Driver/ToolChain.h
+++ b/clang/include/clang/Driver/ToolChain.h
@@ -762,6 +762,10 @@ public:
   virtual void AddHIPIncludeArgs(const llvm::opt::ArgList &DriverArgs,
                                  llvm::opt::ArgStringList &CC1Args) const;
 
+  /// Add arguments to use system-specific SYCL includes.
+  virtual void addSYCLIncludeArgs(const llvm::opt::ArgList &DriverArgs,
+                                  llvm::opt::ArgStringList &CC1Args) const;
+
   /// Add arguments to use MCU GCC toolchain includes.
   virtual void AddIAMCUIncludeArgs(const llvm::opt::ArgList &DriverArgs,
                                    llvm::opt::ArgStringList &CC1Args) const;
diff --git a/clang/include/clang/Format/Format.h b/clang/include/clang/Format/Format.h
index 6383934..8d41077 100644
--- a/clang/include/clang/Format/Format.h
+++ b/clang/include/clang/Format/Format.h
@@ -988,6 +988,10 @@ struct FormatStyle {
   /// \version 3.7
   bool AllowShortLoopsOnASingleLine;
 
+  /// If ``true``, ``namespace a { class b; }`` can be put on a single line.
+  /// \version 20
+  bool AllowShortNamespacesOnASingleLine;
+
   /// Different ways to break after the function definition return type.
   /// This option is **deprecated** and is retained for backwards compatibility.
   enum DefinitionReturnTypeBreakingStyle : int8_t {
@@ -3199,11 +3203,12 @@ struct FormatStyle {
   /// \version 19
   KeepEmptyLinesStyle KeepEmptyLines;
 
-  /// This option is deprecated. See ``AtEndOfFile`` of ``KeepEmptyLines``.
+  /// This option is **deprecated**. See ``AtEndOfFile`` of ``KeepEmptyLines``.
   /// \version 17
   // bool KeepEmptyLinesAtEOF;
 
-  /// This option is deprecated. See ``AtStartOfBlock`` of ``KeepEmptyLines``.
+  /// This option is **deprecated**. See ``AtStartOfBlock`` of
+  /// ``KeepEmptyLines``.
   /// \version 3.7
   // bool KeepEmptyLinesAtTheStartOfBlocks;
 
@@ -5038,8 +5043,8 @@ struct FormatStyle {
   /// \version 3.7
   unsigned TabWidth;
 
-  /// A vector of non-keyword identifiers that should be interpreted as
-  /// template names.
+  /// A vector of non-keyword identifiers that should be interpreted as template
+  /// names.
   ///
   /// A ``<`` after a template name is annotated as a template opener instead of
   /// a binary operator.
@@ -5099,6 +5104,15 @@ struct FormatStyle {
   /// \version 3.7
   UseTabStyle UseTab;
 
+  /// A vector of non-keyword identifiers that should be interpreted as variable
+  /// template names.
+  ///
+  /// A ``)`` after a variable template instantiation is **not** annotated as
+  /// the closing parenthesis of C-style cast operator.
+  ///
+  /// \version 20
+  std::vector<std::string> VariableTemplates;
+
   /// For Verilog, put each port on its own line in module instantiations.
   /// \code
   ///    true:
@@ -5130,6 +5144,39 @@ struct FormatStyle {
   /// \version 11
   std::vector<std::string> WhitespaceSensitiveMacros;
 
+  /// Different styles for wrapping namespace body with empty lines.
+  enum WrapNamespaceBodyWithEmptyLinesStyle : int8_t {
+    /// Remove all empty lines at the beginning and the end of namespace body.
+    /// \code
+    ///   namespace N1 {
+    ///   namespace N2
+    ///   function();
+    ///   }
+    ///   }
+    /// \endcode
+    WNBWELS_Never,
+    /// Always have at least one empty line at the beginning and the end of
+    /// namespace body except that the number of empty lines between consecutive
+    /// nested namespace definitions is not increased.
+    /// \code
+    ///   namespace N1 {
+    ///   namespace N2 {
+    ///
+    ///   function();
+    ///
+    ///   }
+    ///   }
+    /// \endcode
+    WNBWELS_Always,
+    /// Keep existing newlines at the beginning and the end of namespace body.
+    /// ``MaxEmptyLinesToKeep`` still applies.
+    WNBWELS_Leave
+  };
+
+  /// Wrap namespace body with empty lines.
+  /// \version 20
+  WrapNamespaceBodyWithEmptyLinesStyle WrapNamespaceBodyWithEmptyLines;
+
   bool operator==(const FormatStyle &R) const {
     return AccessModifierOffset == R.AccessModifierOffset &&
            AlignAfterOpenBracket == R.AlignAfterOpenBracket &&
@@ -5168,6 +5215,8 @@ struct FormatStyle {
                R.AllowShortIfStatementsOnASingleLine &&
            AllowShortLambdasOnASingleLine == R.AllowShortLambdasOnASingleLine &&
            AllowShortLoopsOnASingleLine == R.AllowShortLoopsOnASingleLine &&
+           AllowShortNamespacesOnASingleLine ==
+               R.AllowShortNamespacesOnASingleLine &&
            AlwaysBreakBeforeMultilineStrings ==
                R.AlwaysBreakBeforeMultilineStrings &&
            AttributeMacros == R.AttributeMacros &&
@@ -5308,10 +5357,11 @@ struct FormatStyle {
            TableGenBreakInsideDAGArg == R.TableGenBreakInsideDAGArg &&
            TabWidth == R.TabWidth && TemplateNames == R.TemplateNames &&
            TypeNames == R.TypeNames && TypenameMacros == R.TypenameMacros &&
-           UseTab == R.UseTab &&
+           UseTab == R.UseTab && VariableTemplates == R.VariableTemplates &&
            VerilogBreakBetweenInstancePorts ==
                R.VerilogBreakBetweenInstancePorts &&
-           WhitespaceSensitiveMacros == R.WhitespaceSensitiveMacros;
+           WhitespaceSensitiveMacros == R.WhitespaceSensitiveMacros &&
+           WrapNamespaceBodyWithEmptyLines == R.WrapNamespaceBodyWithEmptyLines;
   }
 
   std::optional<FormatStyle> GetLanguageStyle(LanguageKind Language) const;
diff --git a/clang/include/clang/Frontend/Utils.h b/clang/include/clang/Frontend/Utils.h
index 604e420..8ed1717 100644
--- a/clang/include/clang/Frontend/Utils.h
+++ b/clang/include/clang/Frontend/Utils.h
@@ -120,6 +120,7 @@ protected:
 private:
   void outputDependencyFile(DiagnosticsEngine &Diags);
 
+  llvm::IntrusiveRefCntPtr<llvm::vfs::FileSystem> FS;
   std::string OutputFile;
   std::vector<std::string> Targets;
   bool IncludeSystemHeaders;
diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h
index 5ee7ea4..a41f16f 100644
--- a/clang/include/clang/Sema/Sema.h
+++ b/clang/include/clang/Sema/Sema.h
@@ -173,6 +173,7 @@ class SemaOpenMP;
 class SemaPPC;
 class SemaPseudoObject;
 class SemaRISCV;
+class SemaSPIRV;
 class SemaSYCL;
 class SemaSwift;
 class SemaSystemZ;
@@ -1142,6 +1143,11 @@ public:
     return *RISCVPtr;
   }
 
+  SemaSPIRV &SPIRV() {
+    assert(SPIRVPtr);
+    return *SPIRVPtr;
+  }
+
   SemaSYCL &SYCL() {
     assert(SYCLPtr);
     return *SYCLPtr;
@@ -1219,6 +1225,7 @@ private:
   std::unique_ptr<SemaPPC> PPCPtr;
   std::unique_ptr<SemaPseudoObject> PseudoObjectPtr;
   std::unique_ptr<SemaRISCV> RISCVPtr;
+  std::unique_ptr<SemaSPIRV> SPIRVPtr;
   std::unique_ptr<SemaSYCL> SYCLPtr;
   std::unique_ptr<SemaSwift> SwiftPtr;
   std::unique_ptr<SemaSystemZ> SystemZPtr;
diff --git a/clang/include/clang/Sema/SemaOpenACC.h b/clang/include/clang/Sema/SemaOpenACC.h
index addc33b..0f86d46 100644
--- a/clang/include/clang/Sema/SemaOpenACC.h
+++ b/clang/include/clang/Sema/SemaOpenACC.h
@@ -297,6 +297,7 @@ public:
               ClauseKind == OpenACCClauseKind::NumWorkers ||
               ClauseKind == OpenACCClauseKind::Async ||
               ClauseKind == OpenACCClauseKind::DeviceNum ||
+              ClauseKind == OpenACCClauseKind::DefaultAsync ||
               ClauseKind == OpenACCClauseKind::Tile ||
               ClauseKind == OpenACCClauseKind::Worker ||
               ClauseKind == OpenACCClauseKind::Vector ||
@@ -349,6 +350,7 @@ public:
               ClauseKind == OpenACCClauseKind::NumWorkers ||
               ClauseKind == OpenACCClauseKind::Async ||
               ClauseKind == OpenACCClauseKind::DeviceNum ||
+              ClauseKind == OpenACCClauseKind::DefaultAsync ||
               ClauseKind == OpenACCClauseKind::Tile ||
               ClauseKind == OpenACCClauseKind::Gang ||
               ClauseKind == OpenACCClauseKind::Worker ||
@@ -407,6 +409,8 @@ public:
               ClauseKind == OpenACCClauseKind::Detach ||
               ClauseKind == OpenACCClauseKind::DevicePtr ||
               ClauseKind == OpenACCClauseKind::Reduction ||
+              (ClauseKind == OpenACCClauseKind::Self &&
+               DirKind == OpenACCDirectiveKind::Update) ||
               ClauseKind == OpenACCClauseKind::FirstPrivate) &&
              "Parsed clause kind does not have a var-list");
 
@@ -486,6 +490,7 @@ public:
               ClauseKind == OpenACCClauseKind::NumWorkers ||
               ClauseKind == OpenACCClauseKind::Async ||
               ClauseKind == OpenACCClauseKind::DeviceNum ||
+              ClauseKind == OpenACCClauseKind::DefaultAsync ||
               ClauseKind == OpenACCClauseKind::Tile ||
               ClauseKind == OpenACCClauseKind::Worker ||
               ClauseKind == OpenACCClauseKind::Vector ||
@@ -498,6 +503,7 @@ public:
               ClauseKind == OpenACCClauseKind::NumWorkers ||
               ClauseKind == OpenACCClauseKind::Async ||
               ClauseKind == OpenACCClauseKind::DeviceNum ||
+              ClauseKind == OpenACCClauseKind::DefaultAsync ||
               ClauseKind == OpenACCClauseKind::Tile ||
               ClauseKind == OpenACCClauseKind::Worker ||
               ClauseKind == OpenACCClauseKind::Vector ||
@@ -547,6 +553,8 @@ public:
               ClauseKind == OpenACCClauseKind::UseDevice ||
               ClauseKind == OpenACCClauseKind::Detach ||
               ClauseKind == OpenACCClauseKind::DevicePtr ||
+              (ClauseKind == OpenACCClauseKind::Self &&
+               DirKind == OpenACCDirectiveKind::Update) ||
               ClauseKind == OpenACCClauseKind::FirstPrivate) &&
              "Parsed clause kind does not have a var-list");
       assert((!IsReadOnly || ClauseKind == OpenACCClauseKind::CopyIn ||
@@ -586,6 +594,8 @@ public:
               ClauseKind == OpenACCClauseKind::UseDevice ||
               ClauseKind == OpenACCClauseKind::Detach ||
               ClauseKind == OpenACCClauseKind::DevicePtr ||
+              (ClauseKind == OpenACCClauseKind::Self &&
+               DirKind == OpenACCDirectiveKind::Update) ||
               ClauseKind == OpenACCClauseKind::FirstPrivate) &&
              "Parsed clause kind does not have a var-list");
       assert((!IsReadOnly || ClauseKind == OpenACCClauseKind::CopyIn ||
diff --git a/clang/include/clang/Sema/SemaSPIRV.h b/clang/include/clang/Sema/SemaSPIRV.h
new file mode 100644
index 0000000..b26b861
--- /dev/null
+++ b/clang/include/clang/Sema/SemaSPIRV.h
@@ -0,0 +1,28 @@
+//===----- SemaSPIRV.h ----- Semantic Analysis for SPIRV constructs--------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file declares semantic analysis for SPIRV constructs.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CLANG_SEMA_SEMASPIRV_H
+#define LLVM_CLANG_SEMA_SEMASPIRV_H
+
+#include "clang/AST/ASTFwd.h"
+#include "clang/Sema/SemaBase.h"
+
+namespace clang {
+class SemaSPIRV : public SemaBase {
+public:
+  SemaSPIRV(Sema &S);
+
+  bool CheckSPIRVBuiltinFunctionCall(unsigned BuiltinID, CallExpr *TheCall);
+};
+} // namespace clang
+
+#endif // LLVM_CLANG_SEMA_SEMASPIRV_H
diff --git a/clang/include/clang/Serialization/ASTBitCodes.h b/clang/include/clang/Serialization/ASTBitCodes.h
index dfd82af..aac1651 100644
--- a/clang/include/clang/Serialization/ASTBitCodes.h
+++ b/clang/include/clang/Serialization/ASTBitCodes.h
@@ -2024,6 +2024,8 @@ enum StmtCode {
   STMT_OPENACC_WAIT_CONSTRUCT,
   STMT_OPENACC_INIT_CONSTRUCT,
   STMT_OPENACC_SHUTDOWN_CONSTRUCT,
+  STMT_OPENACC_SET_CONSTRUCT,
+  STMT_OPENACC_UPDATE_CONSTRUCT,
 
   // HLSL Constructs
   EXPR_HLSL_OUT_ARG,
diff --git a/clang/include/clang/Serialization/ASTWriter.h b/clang/include/clang/Serialization/ASTWriter.h
index cb972f0..adb7cce 100644
--- a/clang/include/clang/Serialization/ASTWriter.h
+++ b/clang/include/clang/Serialization/ASTWriter.h
@@ -997,13 +997,15 @@ protected:
   virtual Module *getEmittingModule(ASTContext &Ctx) override;
 
   CXX20ModulesGenerator(Preprocessor &PP, InMemoryModuleCache &ModuleCache,
-                        StringRef OutputFile, bool GeneratingReducedBMI);
+                        StringRef OutputFile, bool GeneratingReducedBMI,
+                        bool AllowASTWithErrors);
 
 public:
   CXX20ModulesGenerator(Preprocessor &PP, InMemoryModuleCache &ModuleCache,
-                        StringRef OutputFile)
+                        StringRef OutputFile, bool AllowASTWithErrors = false)
       : CXX20ModulesGenerator(PP, ModuleCache, OutputFile,
-                              /*GeneratingReducedBMI=*/false) {}
+                              /*GeneratingReducedBMI=*/false,
+                              AllowASTWithErrors) {}
 
   void HandleTranslationUnit(ASTContext &Ctx) override;
 };
@@ -1013,9 +1015,10 @@ class ReducedBMIGenerator : public CXX20ModulesGenerator {
 
 public:
   ReducedBMIGenerator(Preprocessor &PP, InMemoryModuleCache &ModuleCache,
-                      StringRef OutputFile)
+                      StringRef OutputFile, bool AllowASTWithErrors = false)
       : CXX20ModulesGenerator(PP, ModuleCache, OutputFile,
-                              /*GeneratingReducedBMI=*/true) {}
+                              /*GeneratingReducedBMI=*/true,
+                              AllowASTWithErrors) {}
 };
 
 /// If we can elide the definition of \param D in reduced BMI.
diff --git a/clang/include/clang/StaticAnalyzer/Core/AnalyzerOptions.def b/clang/include/clang/StaticAnalyzer/Core/AnalyzerOptions.def
index d8a7c75..34bb7a8 100644
--- a/clang/include/clang/StaticAnalyzer/Core/AnalyzerOptions.def
+++ b/clang/include/clang/StaticAnalyzer/Core/AnalyzerOptions.def
@@ -10,9 +10,9 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_ADT_STRINGREF_H
+#ifndef LLVM_CLANG_STATICANALYZER_CORE_ANALYZEROPTIONS_H
 #error This .def file is expected to be included in translation units where \
-"llvm/ADT/StringRef.h" is already included!
+"clang/StaticAnalyzer/Core/AnalyzerOptions.h" is already included!
 #endif
 
 #ifdef ANALYZER_OPTION
@@ -193,6 +193,8 @@ ANALYZER_OPTION(
     "with \"crosscheck-with-z3-timeout-threshold\" of 300 ms, would nicely "
     "guarantee that no bug report equivalence class can take longer than "
     "1 second, effectively mitigating Z3 hangs during refutation. "
+    "If there were Z3 retries, only the minimum query time is considered "
+    "when accumulating query times within a report equivalence class. "
     "Set 0 for no timeout.", 0)
 
 ANALYZER_OPTION(
@@ -213,6 +215,15 @@ ANALYZER_OPTION(
     "400'000 should on average make Z3 queries run for up to 100ms on modern "
     "hardware. Set 0 for unlimited.", 0)
 
+ANALYZER_OPTION(
+    PositiveAnalyzerOption, Z3CrosscheckMaxAttemptsPerQuery,
+    "crosscheck-with-z3-max-attempts-per-query",
+    "Set how many times the oracle is allowed to run a Z3 query. "
+    "This must be a positive value. Set 1 to not allow any retry attempts. "
+    "Increasing the number of attempts is often more effective at reducing "
+    "the number of nondeterministic diagnostics than "
+    "\"crosscheck-with-z3-timeout-threshold\" in practice.", 3)
+
 ANALYZER_OPTION(bool, ShouldReportIssuesInMainSourceFile,
                 "report-in-main-source-file",
                 "Whether or not the diagnostic report should be always "
diff --git a/clang/include/clang/StaticAnalyzer/Core/AnalyzerOptions.h b/clang/include/clang/StaticAnalyzer/Core/AnalyzerOptions.h
index 2f4cd27..2c97030 100644
--- a/clang/include/clang/StaticAnalyzer/Core/AnalyzerOptions.h
+++ b/clang/include/clang/StaticAnalyzer/Core/AnalyzerOptions.h
@@ -124,6 +124,36 @@ enum UserModeKind {
 
 enum class CTUPhase1InliningKind { None, Small, All };
 
+class PositiveAnalyzerOption {
+public:
+  constexpr PositiveAnalyzerOption() = default;
+  constexpr PositiveAnalyzerOption(unsigned Value) : Value(Value) {
+    assert(Value > 0 && "only positive values are accepted");
+  }
+  constexpr PositiveAnalyzerOption(const PositiveAnalyzerOption &) = default;
+  constexpr PositiveAnalyzerOption &
+  operator=(const PositiveAnalyzerOption &Other) {
+    Value = Other.Value;
+    return *this;
+  }
+
+  static constexpr std::optional<PositiveAnalyzerOption> create(unsigned Val) {
+    if (Val == 0)
+      return std::nullopt;
+    return PositiveAnalyzerOption{Val};
+  }
+  static std::optional<PositiveAnalyzerOption> create(StringRef Str) {
+    unsigned Parsed = 0;
+    if (Str.getAsInteger(0, Parsed))
+      return std::nullopt;
+    return PositiveAnalyzerOption::create(Parsed);
+  }
+  constexpr operator unsigned() const { return Value; }
+
+private:
+  unsigned Value = 1;
+};
+
 /// Stores options for the analyzer from the command line.
 ///
 /// Some options are frontend flags (e.g.: -analyzer-output), but some are
diff --git a/clang/include/clang/StaticAnalyzer/Core/PathSensitive/CoreEngine.h b/clang/include/clang/StaticAnalyzer/Core/PathSensitive/CoreEngine.h
index a6d05a3..80b79fd 100644
--- a/clang/include/clang/StaticAnalyzer/Core/PathSensitive/CoreEngine.h
+++ b/clang/include/clang/StaticAnalyzer/Core/PathSensitive/CoreEngine.h
@@ -126,6 +126,14 @@ private:
   ExplodedNode *generateCallExitBeginNode(ExplodedNode *N,
                                           const ReturnStmt *RS);
 
+  /// Helper function called by `HandleBranch()`. If the currently handled
+  /// branch corresponds to a loop, this returns the number of already
+  /// completed iterations in that loop, otherwise the return value is
+  /// `std::nullopt`. Note that this counts _all_ earlier iterations, including
+  /// ones that were performed within an earlier iteration of an outer loop.
+  std::optional<unsigned> getCompletedIterationCount(const CFGBlock *B,
+                                                     ExplodedNode *Pred) const;
+
 public:
   /// Construct a CoreEngine object to analyze the provided CFG.
   CoreEngine(ExprEngine &exprengine,
diff --git a/clang/include/clang/StaticAnalyzer/Core/PathSensitive/ExprEngine.h b/clang/include/clang/StaticAnalyzer/Core/PathSensitive/ExprEngine.h
index 8c7493e..20c446e 100644
--- a/clang/include/clang/StaticAnalyzer/Core/PathSensitive/ExprEngine.h
+++ b/clang/include/clang/StaticAnalyzer/Core/PathSensitive/ExprEngine.h
@@ -321,14 +321,14 @@ public:
                                NodeBuilderWithSinks &nodeBuilder,
                                ExplodedNode *Pred);
 
-  /// ProcessBranch - Called by CoreEngine.  Used to generate successor
-  ///  nodes by processing the 'effects' of a branch condition.
-  void processBranch(const Stmt *Condition,
-                     NodeBuilderContext& BuilderCtx,
-                     ExplodedNode *Pred,
-                     ExplodedNodeSet &Dst,
-                     const CFGBlock *DstT,
-                     const CFGBlock *DstF);
+  /// ProcessBranch - Called by CoreEngine. Used to generate successor nodes by
+  /// processing the 'effects' of a branch condition. If the branch condition
+  /// is a loop condition, IterationsCompletedInLoop is the number of completed
+  /// iterations (otherwise it's std::nullopt).
+  void processBranch(const Stmt *Condition, NodeBuilderContext &BuilderCtx,
+                     ExplodedNode *Pred, ExplodedNodeSet &Dst,
+                     const CFGBlock *DstT, const CFGBlock *DstF,
+                     std::optional<unsigned> IterationsCompletedInLoop);
 
   /// Called by CoreEngine.
   /// Used to generate successor nodes for temporary destructors depending
@@ -588,6 +588,8 @@ public:
   void evalEagerlyAssumeBifurcation(ExplodedNodeSet &Dst, ExplodedNodeSet &Src,
                                     const Expr *Ex);
 
+  bool didEagerlyAssumeBifurcateAt(ProgramStateRef State, const Expr *Ex) const;
+
   static std::pair<const ProgramPointTag *, const ProgramPointTag *>
   getEagerlyAssumeBifurcationTags();
 
diff --git a/clang/include/clang/StaticAnalyzer/Core/PathSensitive/SymExpr.h b/clang/include/clang/StaticAnalyzer/Core/PathSensitive/SymExpr.h
index 862a30c..aca14cf 100644
--- a/clang/include/clang/StaticAnalyzer/Core/PathSensitive/SymExpr.h
+++ b/clang/include/clang/StaticAnalyzer/Core/PathSensitive/SymExpr.h
@@ -25,6 +25,8 @@ namespace ento {
 
 class MemRegion;
 
+using SymbolID = unsigned;
+
 /// Symbolic value. These values used to capture symbolic execution of
 /// the program.
 class SymExpr : public llvm::FoldingSetNode {
@@ -39,9 +41,19 @@ public:
 
 private:
   Kind K;
+  /// A unique identifier for this symbol.
+  ///
+  /// It is useful for SymbolData to easily differentiate multiple symbols, but
+  /// also for "ephemeral" symbols, such as binary operations, because this id
+  /// can be used for arranging constraints or equivalence classes instead of
+  /// unstable pointer values.
+  ///
+  /// Note, however, that it can't be used in Profile because SymbolManager
+  /// needs to compute Profile before allocating SymExpr.
+  const SymbolID Sym;
 
 protected:
-  SymExpr(Kind k) : K(k) {}
+  SymExpr(Kind k, SymbolID Sym) : K(k), Sym(Sym) {}
 
   static bool isValidTypeForSymbol(QualType T) {
     // FIXME: Depending on whether we choose to deprecate structural symbols,
@@ -56,6 +68,14 @@ public:
 
   Kind getKind() const { return K; }
 
+  /// Get a unique identifier for this symbol.
+  /// The ID is unique across all SymExprs in a SymbolManager.
+  /// They reflect the allocation order of these SymExprs,
+  /// and are likely stable across runs.
+  /// Used as a key in SymbolRef containers and as part of identity
+  /// for SymbolData, e.g. SymbolConjured with ID = 7 is "conj_$7".
+  SymbolID getSymbolID() const { return Sym; }
+
   virtual void dump() const;
 
   virtual void dumpToStream(raw_ostream &os) const {}
@@ -112,19 +132,14 @@ inline raw_ostream &operator<<(raw_ostream &os,
 
 using SymbolRef = const SymExpr *;
 using SymbolRefSmallVectorTy = SmallVector<SymbolRef, 2>;
-using SymbolID = unsigned;
 
 /// A symbol representing data which can be stored in a memory location
 /// (region).
 class SymbolData : public SymExpr {
-  const SymbolID Sym;
-
   void anchor() override;
 
 protected:
-  SymbolData(Kind k, SymbolID sym) : SymExpr(k), Sym(sym) {
-    assert(classof(this));
-  }
+  SymbolData(Kind k, SymbolID sym) : SymExpr(k, sym) { assert(classof(this)); }
 
 public:
   ~SymbolData() override = default;
@@ -132,8 +147,6 @@ public:
   /// Get a string representation of the kind of the region.
   virtual StringRef getKindStr() const = 0;
 
-  SymbolID getSymbolID() const { return Sym; }
-
   unsigned computeComplexity() const override {
     return 1;
   };
diff --git a/clang/include/clang/StaticAnalyzer/Core/PathSensitive/SymbolManager.h b/clang/include/clang/StaticAnalyzer/Core/PathSensitive/SymbolManager.h
index 73732d5..c530dff 100644
--- a/clang/include/clang/StaticAnalyzer/Core/PathSensitive/SymbolManager.h
+++ b/clang/include/clang/StaticAnalyzer/Core/PathSensitive/SymbolManager.h
@@ -25,6 +25,7 @@
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/DenseSet.h"
 #include "llvm/ADT/FoldingSet.h"
+#include "llvm/ADT/ImmutableSet.h"
 #include "llvm/ADT/iterator_range.h"
 #include "llvm/Support/Allocator.h"
 #include <cassert>
@@ -43,15 +44,16 @@ class StoreManager;
 class SymbolRegionValue : public SymbolData {
   const TypedValueRegion *R;
 
-public:
+  friend class SymExprAllocator;
   SymbolRegionValue(SymbolID sym, const TypedValueRegion *r)
       : SymbolData(SymbolRegionValueKind, sym), R(r) {
     assert(r);
     assert(isValidTypeForSymbol(r->getValueType()));
   }
 
+public:
   LLVM_ATTRIBUTE_RETURNS_NONNULL
-  const TypedValueRegion* getRegion() const { return R; }
+  const TypedValueRegion *getRegion() const { return R; }
 
   static void Profile(llvm::FoldingSetNodeID& profile, const TypedValueRegion* R) {
     profile.AddInteger((unsigned) SymbolRegionValueKind);
@@ -84,7 +86,7 @@ class SymbolConjured : public SymbolData {
   const LocationContext *LCtx;
   const void *SymbolTag;
 
-public:
+  friend class SymExprAllocator;
   SymbolConjured(SymbolID sym, const Stmt *s, const LocationContext *lctx,
                  QualType t, unsigned count, const void *symbolTag)
       : SymbolData(SymbolConjuredKind, sym), S(s), T(t), Count(count),
@@ -98,6 +100,7 @@ public:
     assert(isValidTypeForSymbol(t));
   }
 
+public:
   /// It might return null.
   const Stmt *getStmt() const { return S; }
   unsigned getCount() const { return Count; }
@@ -137,7 +140,7 @@ class SymbolDerived : public SymbolData {
   SymbolRef parentSymbol;
   const TypedValueRegion *R;
 
-public:
+  friend class SymExprAllocator;
   SymbolDerived(SymbolID sym, SymbolRef parent, const TypedValueRegion *r)
       : SymbolData(SymbolDerivedKind, sym), parentSymbol(parent), R(r) {
     assert(parent);
@@ -145,6 +148,7 @@ public:
     assert(isValidTypeForSymbol(r->getValueType()));
   }
 
+public:
   LLVM_ATTRIBUTE_RETURNS_NONNULL
   SymbolRef getParentSymbol() const { return parentSymbol; }
   LLVM_ATTRIBUTE_RETURNS_NONNULL
@@ -180,12 +184,13 @@ public:
 class SymbolExtent : public SymbolData {
   const SubRegion *R;
 
-public:
+  friend class SymExprAllocator;
   SymbolExtent(SymbolID sym, const SubRegion *r)
       : SymbolData(SymbolExtentKind, sym), R(r) {
     assert(r);
   }
 
+public:
   LLVM_ATTRIBUTE_RETURNS_NONNULL
   const SubRegion *getRegion() const { return R; }
 
@@ -222,7 +227,7 @@ class SymbolMetadata : public SymbolData {
   unsigned Count;
   const void *Tag;
 
-public:
+  friend class SymExprAllocator;
   SymbolMetadata(SymbolID sym, const MemRegion* r, const Stmt *s, QualType t,
                  const LocationContext *LCtx, unsigned count, const void *tag)
       : SymbolData(SymbolMetadataKind, sym), R(r), S(s), T(t), LCtx(LCtx),
@@ -234,6 +239,7 @@ public:
       assert(tag);
     }
 
+  public:
     LLVM_ATTRIBUTE_RETURNS_NONNULL
     const MemRegion *getRegion() const { return R; }
 
@@ -286,15 +292,16 @@ class SymbolCast : public SymExpr {
   /// The type of the result.
   QualType ToTy;
 
-public:
-  SymbolCast(const SymExpr *In, QualType From, QualType To)
-      : SymExpr(SymbolCastKind), Operand(In), FromTy(From), ToTy(To) {
+  friend class SymExprAllocator;
+  SymbolCast(SymbolID Sym, const SymExpr *In, QualType From, QualType To)
+      : SymExpr(SymbolCastKind, Sym), Operand(In), FromTy(From), ToTy(To) {
     assert(In);
     assert(isValidTypeForSymbol(From));
     // FIXME: GenericTaintChecker creates symbols of void type.
     // Otherwise, 'To' should also be a valid type.
   }
 
+public:
   unsigned computeComplexity() const override {
     if (Complexity == 0)
       Complexity = 1 + Operand->computeComplexity();
@@ -332,9 +339,10 @@ class UnarySymExpr : public SymExpr {
   UnaryOperator::Opcode Op;
   QualType T;
 
-public:
-  UnarySymExpr(const SymExpr *In, UnaryOperator::Opcode Op, QualType T)
-      : SymExpr(UnarySymExprKind), Operand(In), Op(Op), T(T) {
+  friend class SymExprAllocator;
+  UnarySymExpr(SymbolID Sym, const SymExpr *In, UnaryOperator::Opcode Op,
+               QualType T)
+      : SymExpr(UnarySymExprKind, Sym), Operand(In), Op(Op), T(T) {
     // Note, some unary operators are modeled as a binary operator. E.g. ++x is
     // modeled as x + 1.
     assert((Op == UO_Minus || Op == UO_Not) && "non-supported unary expression");
@@ -345,6 +353,7 @@ public:
     assert(!Loc::isLocType(T) && "unary symbol should be nonloc");
   }
 
+public:
   unsigned computeComplexity() const override {
     if (Complexity == 0)
       Complexity = 1 + Operand->computeComplexity();
@@ -381,8 +390,8 @@ class BinarySymExpr : public SymExpr {
   QualType T;
 
 protected:
-  BinarySymExpr(Kind k, BinaryOperator::Opcode op, QualType t)
-      : SymExpr(k), Op(op), T(t) {
+  BinarySymExpr(SymbolID Sym, Kind k, BinaryOperator::Opcode op, QualType t)
+      : SymExpr(k, Sym), Op(op), T(t) {
     assert(classof(this));
     // Binary expressions are results of arithmetic. Pointer arithmetic is not
     // handled by binary expressions, but it is instead handled by applying
@@ -425,14 +434,15 @@ class BinarySymExprImpl : public BinarySymExpr {
   LHSTYPE LHS;
   RHSTYPE RHS;
 
-public:
-  BinarySymExprImpl(LHSTYPE lhs, BinaryOperator::Opcode op, RHSTYPE rhs,
-                    QualType t)
-      : BinarySymExpr(ClassKind, op, t), LHS(lhs), RHS(rhs) {
+  friend class SymExprAllocator;
+  BinarySymExprImpl(SymbolID Sym, LHSTYPE lhs, BinaryOperator::Opcode op,
+                    RHSTYPE rhs, QualType t)
+      : BinarySymExpr(Sym, ClassKind, op, t), LHS(lhs), RHS(rhs) {
     assert(getPointer(lhs));
     assert(getPointer(rhs));
   }
 
+public:
   void dumpToStream(raw_ostream &os) const override {
     dumpToStreamImpl(os, LHS);
     dumpToStreamImpl(os, getOpcode());
@@ -478,6 +488,21 @@ using IntSymExpr = BinarySymExprImpl<APSIntPtr, const SymExpr *,
 using SymSymExpr = BinarySymExprImpl<const SymExpr *, const SymExpr *,
                                      SymExpr::Kind::SymSymExprKind>;
 
+class SymExprAllocator {
+  SymbolID NextSymbolID = 0;
+  llvm::BumpPtrAllocator &Alloc;
+
+public:
+  explicit SymExprAllocator(llvm::BumpPtrAllocator &Alloc) : Alloc(Alloc) {}
+
+  template <class SymT, typename... ArgsT> SymT *make(ArgsT &&...Args) {
+    return new (Alloc) SymT(nextID(), std::forward<ArgsT>(Args)...);
+  }
+
+private:
+  SymbolID nextID() { return NextSymbolID++; }
+};
+
 class SymbolManager {
   using DataSetTy = llvm::FoldingSet<SymExpr>;
   using SymbolDependTy =
@@ -489,15 +514,14 @@ class SymbolManager {
   /// alive as long as the key is live.
   SymbolDependTy SymbolDependencies;
 
-  unsigned SymbolCounter = 0;
-  llvm::BumpPtrAllocator& BPAlloc;
+  SymExprAllocator Alloc;
   BasicValueFactory &BV;
   ASTContext &Ctx;
 
 public:
   SymbolManager(ASTContext &ctx, BasicValueFactory &bv,
-                llvm::BumpPtrAllocator& bpalloc)
-      : SymbolDependencies(16), BPAlloc(bpalloc), BV(bv), Ctx(ctx) {}
+                llvm::BumpPtrAllocator &bpalloc)
+      : SymbolDependencies(16), Alloc(bpalloc), BV(bv), Ctx(ctx) {}
 
   static bool canSymbolicate(QualType T);
 
@@ -687,4 +711,36 @@ public:
 
 } // namespace clang
 
+// Override the default definition that would use pointer values of SymbolRefs
+// to order them, which is unstable due to ASLR.
+// Use the SymbolID instead which reflect the order in which the symbols were
+// allocated. This is usually stable across runs leading to the stability of
+// ConstraintMap and other containers using SymbolRef as keys.
+template <>
+struct llvm::ImutContainerInfo<clang::ento::SymbolRef>
+    : public ImutProfileInfo<clang::ento::SymbolRef> {
+  using value_type = clang::ento::SymbolRef;
+  using value_type_ref = clang::ento::SymbolRef;
+  using key_type = value_type;
+  using key_type_ref = value_type_ref;
+  using data_type = bool;
+  using data_type_ref = bool;
+
+  static key_type_ref KeyOfValue(value_type_ref D) { return D; }
+  static data_type_ref DataOfValue(value_type_ref) { return true; }
+
+  static bool isEqual(clang::ento::SymbolRef LHS, clang::ento::SymbolRef RHS) {
+    return LHS->getSymbolID() == RHS->getSymbolID();
+  }
+
+  static bool isLess(clang::ento::SymbolRef LHS, clang::ento::SymbolRef RHS) {
+    return LHS->getSymbolID() < RHS->getSymbolID();
+  }
+
+  // This might seem redundant, but it is required because of the way
+  // ImmutableSet is implemented through AVLTree:
+  // same as ImmutableMap, but with a non-informative "data".
+  static bool isDataEqual(data_type_ref, data_type_ref) { return true; }
+};
+
 #endif // LLVM_CLANG_STATICANALYZER_CORE_PATHSENSITIVE_SYMBOLMANAGER_H
diff --git a/clang/include/clang/Tooling/DependencyScanning/DependencyScanningTool.h b/clang/include/clang/Tooling/DependencyScanning/DependencyScanningTool.h
index 012237e..ddb078d 100644
--- a/clang/include/clang/Tooling/DependencyScanning/DependencyScanningTool.h
+++ b/clang/include/clang/Tooling/DependencyScanning/DependencyScanningTool.h
@@ -15,6 +15,7 @@
 #include "clang/Tooling/JSONCompilationDatabase.h"
 #include "llvm/ADT/DenseSet.h"
 #include "llvm/ADT/MapVector.h"
+#include <functional>
 #include <optional>
 #include <string>
 #include <vector>
@@ -25,7 +26,7 @@ namespace dependencies {
 
 /// A callback to lookup module outputs for "-fmodule-file=", "-o" etc.
 using LookupModuleOutputCallback =
-    llvm::function_ref<std::string(const ModuleID &, ModuleOutputKind)>;
+    std::function<std::string(const ModuleID &, ModuleOutputKind)>;
 
 /// Graph of modular dependencies.
 using ModuleDepsGraph = std::vector<ModuleDeps>;
diff --git a/clang/include/module.modulemap b/clang/include/module.modulemap
index 5bb9f6b..f00dede 100644
--- a/clang/include/module.modulemap
+++ b/clang/include/module.modulemap
@@ -62,8 +62,6 @@ module Clang_Basic {
   textual header "clang/Basic/BuiltinsVE.def"
   textual header "clang/Basic/BuiltinsVEVL.gen.def"
   textual header "clang/Basic/BuiltinsWebAssembly.def"
-  textual header "clang/Basic/BuiltinsX86.def"
-  textual header "clang/Basic/BuiltinsX86_64.def"
   textual header "clang/Basic/BuiltinsXCore.def"
   textual header "clang/Basic/CFProtectionOptions.def"
   textual header "clang/Basic/CodeGenOptions.def"
diff --git a/clang/lib/AST/ASTContext.cpp b/clang/lib/AST/ASTContext.cpp
index 6ec927e..b10513f 100644
--- a/clang/lib/AST/ASTContext.cpp
+++ b/clang/lib/AST/ASTContext.cpp
@@ -6376,7 +6376,7 @@ ASTContext::getAutoType(QualType DeducedType, AutoTypeKeyword Keyword,
 }
 
 QualType ASTContext::getUnconstrainedType(QualType T) const {
-  QualType CanonT = T.getCanonicalType();
+  QualType CanonT = T.getNonPackExpansionType().getCanonicalType();
 
   // Remove a type-constraint from a top-level auto or decltype(auto).
   if (auto *AT = CanonT->getAs<AutoType>()) {
@@ -9751,6 +9751,43 @@ static TypedefDecl *CreateHexagonBuiltinVaListDecl(const ASTContext *Context) {
   return Context->buildImplicitTypedef(VaListTagArrayType, "__builtin_va_list");
 }
 
+static TypedefDecl *
+CreateXtensaABIBuiltinVaListDecl(const ASTContext *Context) {
+  // typedef struct __va_list_tag {
+  RecordDecl *VaListTagDecl = Context->buildImplicitRecord("__va_list_tag");
+
+  VaListTagDecl->startDefinition();
+
+  // int* __va_stk;
+  // int* __va_reg;
+  // int __va_ndx;
+  constexpr size_t NumFields = 3;
+  QualType FieldTypes[NumFields] = {Context->getPointerType(Context->IntTy),
+                                    Context->getPointerType(Context->IntTy),
+                                    Context->IntTy};
+  const char *FieldNames[NumFields] = {"__va_stk", "__va_reg", "__va_ndx"};
+
+  // Create fields
+  for (unsigned i = 0; i < NumFields; ++i) {
+    FieldDecl *Field = FieldDecl::Create(
+        *Context, VaListTagDecl, SourceLocation(), SourceLocation(),
+        &Context->Idents.get(FieldNames[i]), FieldTypes[i], /*TInfo=*/nullptr,
+        /*BitWidth=*/nullptr,
+        /*Mutable=*/false, ICIS_NoInit);
+    Field->setAccess(AS_public);
+    VaListTagDecl->addDecl(Field);
+  }
+  VaListTagDecl->completeDefinition();
+  Context->VaListTagDecl = VaListTagDecl;
+  QualType VaListTagType = Context->getRecordType(VaListTagDecl);
+
+  // } __va_list_tag;
+  TypedefDecl *VaListTagTypedefDecl =
+      Context->buildImplicitTypedef(VaListTagType, "__builtin_va_list");
+
+  return VaListTagTypedefDecl;
+}
+
 static TypedefDecl *CreateVaListDecl(const ASTContext *Context,
                                      TargetInfo::BuiltinVaListKind Kind) {
   switch (Kind) {
@@ -9772,6 +9809,8 @@ static TypedefDecl *CreateVaListDecl(const ASTContext *Context,
     return CreateSystemZBuiltinVaListDecl(Context);
   case TargetInfo::HexagonBuiltinVaList:
     return CreateHexagonBuiltinVaListDecl(Context);
+  case TargetInfo::XtensaABIBuiltinVaList:
+    return CreateXtensaABIBuiltinVaListDecl(Context);
   }
 
   llvm_unreachable("Unhandled __builtin_va_list type kind");
@@ -14318,7 +14357,7 @@ QualType ASTContext::getCorrespondingSignedFixedPointType(QualType Ty) const {
 // corresponding backend features (which may contain duplicates).
 static std::vector<std::string> getFMVBackendFeaturesFor(
     const llvm::SmallVectorImpl<StringRef> &FMVFeatStrings) {
-  std::vector<std::string> BackendFeats{{"+fmv"}};
+  std::vector<std::string> BackendFeats;
   llvm::AArch64::ExtensionSet FeatureBits;
   for (StringRef F : FMVFeatStrings)
     if (auto FMVExt = llvm::AArch64::parseFMVExtension(F))
diff --git a/clang/lib/AST/ByteCode/Compiler.cpp b/clang/lib/AST/ByteCode/Compiler.cpp
index 68c75b0..036f960 100644
--- a/clang/lib/AST/ByteCode/Compiler.cpp
+++ b/clang/lib/AST/ByteCode/Compiler.cpp
@@ -3427,6 +3427,38 @@ bool Compiler<Emitter>::VisitBlockExpr(const BlockExpr *E) {
 }
 
 template <class Emitter>
+bool Compiler<Emitter>::VisitCXXTypeidExpr(const CXXTypeidExpr *E) {
+  const Type *TypeInfoType = E->getType().getTypePtr();
+
+  if (!E->isPotentiallyEvaluated()) {
+    if (DiscardResult)
+      return true;
+
+    if (E->isTypeOperand())
+      return this->emitGetTypeid(
+          E->getTypeOperand(Ctx.getASTContext()).getTypePtr(), TypeInfoType, E);
+    return this->emitGetTypeid(E->getExprOperand()->getType().getTypePtr(),
+                               TypeInfoType, E);
+  }
+
+  // Otherwise, we need to evaluate the expression operand.
+  assert(E->getExprOperand());
+  assert(E->getExprOperand()->isLValue());
+
+  if (!Ctx.getLangOpts().CPlusPlus20 && !this->emitDiagTypeid(E))
+    return false;
+
+  if (!this->visit(E->getExprOperand()))
+    return false;
+
+  if (!this->emitGetTypeidPtr(TypeInfoType, E))
+    return false;
+  if (DiscardResult)
+    return this->emitPopPtr(E);
+  return true;
+}
+
+template <class Emitter>
 bool Compiler<Emitter>::VisitExpressionTraitExpr(const ExpressionTraitExpr *E) {
   assert(Ctx.getLangOpts().CPlusPlus);
   return this->emitConstBool(E->getValue(), E);
diff --git a/clang/lib/AST/ByteCode/Compiler.h b/clang/lib/AST/ByteCode/Compiler.h
index 2a94f5e..71765b1 100644
--- a/clang/lib/AST/ByteCode/Compiler.h
+++ b/clang/lib/AST/ByteCode/Compiler.h
@@ -205,6 +205,7 @@ public:
   bool VisitCXXNewExpr(const CXXNewExpr *E);
   bool VisitCXXDeleteExpr(const CXXDeleteExpr *E);
   bool VisitBlockExpr(const BlockExpr *E);
+  bool VisitCXXTypeidExpr(const CXXTypeidExpr *E);
 
   // Statements.
   bool visitCompoundStmt(const CompoundStmt *S);
diff --git a/clang/lib/AST/ByteCode/Interp.cpp b/clang/lib/AST/ByteCode/Interp.cpp
index 7c77520..cb0ce886 100644
--- a/clang/lib/AST/ByteCode/Interp.cpp
+++ b/clang/lib/AST/ByteCode/Interp.cpp
@@ -1154,6 +1154,53 @@ bool CheckLiteralType(InterpState &S, CodePtr OpPC, const Type *T) {
   return false;
 }
 
+static bool getField(InterpState &S, CodePtr OpPC, const Pointer &Ptr,
+                     uint32_t Off) {
+  if (S.getLangOpts().CPlusPlus && S.inConstantContext() &&
+      !CheckNull(S, OpPC, Ptr, CSK_Field))
+    return false;
+
+  if (!CheckExtern(S, OpPC, Ptr))
+    return false;
+  if (!CheckRange(S, OpPC, Ptr, CSK_Field))
+    return false;
+  if (!CheckArray(S, OpPC, Ptr))
+    return false;
+  if (!CheckSubobject(S, OpPC, Ptr, CSK_Field))
+    return false;
+
+  if (Ptr.isIntegralPointer()) {
+    S.Stk.push<Pointer>(Ptr.asIntPointer().atOffset(S.getASTContext(), Off));
+    return true;
+  }
+
+  if (!Ptr.isBlockPointer()) {
+    // FIXME: The only time we (seem to) get here is when trying to access a
+    // field of a typeid pointer. In that case, we're supposed to diagnose e.g.
+    // `typeid(int).name`, but we currently diagnose `&typeid(int)`.
+    S.FFDiag(S.Current->getSource(OpPC),
+             diag::note_constexpr_access_unreadable_object)
+        << AK_Read << Ptr.toDiagnosticString(S.getASTContext());
+    return false;
+  }
+
+  if (Off > Ptr.block()->getSize())
+    return false;
+
+  S.Stk.push<Pointer>(Ptr.atField(Off));
+  return true;
+}
+
+bool GetPtrField(InterpState &S, CodePtr OpPC, uint32_t Off) {
+  const auto &Ptr = S.Stk.peek<Pointer>();
+  return getField(S, OpPC, Ptr, Off);
+}
+
+bool GetPtrFieldPop(InterpState &S, CodePtr OpPC, uint32_t Off) {
+  const auto &Ptr = S.Stk.pop<Pointer>();
+  return getField(S, OpPC, Ptr, Off);
+}
+
 static bool checkConstructor(InterpState &S, CodePtr OpPC, const Function *Func,
                              const Pointer &ThisPtr) {
   assert(Func->isConstructor());
@@ -1595,6 +1642,41 @@ bool CheckBitCast(InterpState &S, CodePtr OpPC, bool HasIndeterminateBits,
   return false;
 }
 
+bool GetTypeid(InterpState &S, CodePtr OpPC, const Type *TypePtr,
+               const Type *TypeInfoType) {
+  S.Stk.push<Pointer>(TypePtr, TypeInfoType);
+  return true;
+}
+
+bool GetTypeidPtr(InterpState &S, CodePtr OpPC, const Type *TypeInfoType) {
+  const auto &P = S.Stk.pop<Pointer>();
+
+  if (!P.isBlockPointer())
+    return false;
+
+  if (P.isDummy()) {
+    QualType StarThisType =
+        S.getASTContext().getLValueReferenceType(P.getType());
+    S.FFDiag(S.Current->getSource(OpPC),
+             diag::note_constexpr_polymorphic_unknown_dynamic_type)
+        << AK_TypeId
+        << P.toAPValue(S.getASTContext())
+               .getAsString(S.getASTContext(), StarThisType);
+    return false;
+  }
+
+  S.Stk.push<Pointer>(P.getType().getTypePtr(), TypeInfoType);
+  return true;
+}
+
+bool DiagTypeid(InterpState &S, CodePtr OpPC) {
+  const auto *E = cast<CXXTypeidExpr>(S.Current->getExpr(OpPC));
+  S.CCEDiag(E, diag::note_constexpr_typeid_polymorphic)
+      << E->getExprOperand()->getType()
+      << E->getExprOperand()->getSourceRange();
+  return false;
+}
+
 // https://github.com/llvm/llvm-project/issues/102513
 #if defined(_MSC_VER) && !defined(__clang__) && !defined(NDEBUG)
 #pragma optimize("", off)
diff --git a/clang/lib/AST/ByteCode/Interp.h b/clang/lib/AST/ByteCode/Interp.h
index 8461d1e..d2aec69 100644
--- a/clang/lib/AST/ByteCode/Interp.h
+++ b/clang/lib/AST/ByteCode/Interp.h
@@ -1526,61 +1526,8 @@ inline bool GetPtrGlobal(InterpState &S, CodePtr OpPC, uint32_t I) {
 
 /// 1) Peeks a Pointer
 /// 2) Pushes Pointer.atField(Off) on the stack
-inline bool GetPtrField(InterpState &S, CodePtr OpPC, uint32_t Off) {
-  const Pointer &Ptr = S.Stk.peek<Pointer>();
-
-  if (S.getLangOpts().CPlusPlus && S.inConstantContext() &&
-      !CheckNull(S, OpPC, Ptr, CSK_Field))
-    return false;
-
-  if (!CheckExtern(S, OpPC, Ptr))
-    return false;
-  if (!CheckRange(S, OpPC, Ptr, CSK_Field))
-    return false;
-  if (!CheckArray(S, OpPC, Ptr))
-    return false;
-  if (!CheckSubobject(S, OpPC, Ptr, CSK_Field))
-    return false;
-
-  if (Ptr.isBlockPointer() && Off > Ptr.block()->getSize())
-    return false;
-
-  if (Ptr.isIntegralPointer()) {
-    S.Stk.push<Pointer>(Ptr.asIntPointer().atOffset(S.getASTContext(), Off));
-    return true;
-  }
-
-  S.Stk.push<Pointer>(Ptr.atField(Off));
-  return true;
-}
-
-inline bool GetPtrFieldPop(InterpState &S, CodePtr OpPC, uint32_t Off) {
-  const Pointer &Ptr = S.Stk.pop<Pointer>();
-
-  if (S.getLangOpts().CPlusPlus && S.inConstantContext() &&
-      !CheckNull(S, OpPC, Ptr, CSK_Field))
-    return false;
-
-  if (!CheckExtern(S, OpPC, Ptr))
-    return false;
-  if (!CheckRange(S, OpPC, Ptr, CSK_Field))
-    return false;
-  if (!CheckArray(S, OpPC, Ptr))
-    return false;
-  if (!CheckSubobject(S, OpPC, Ptr, CSK_Field))
-    return false;
-
-  if (Ptr.isBlockPointer() && Off > Ptr.block()->getSize())
-    return false;
-
-  if (Ptr.isIntegralPointer()) {
-    S.Stk.push<Pointer>(Ptr.asIntPointer().atOffset(S.getASTContext(), Off));
-    return true;
-  }
-
-  S.Stk.push<Pointer>(Ptr.atField(Off));
-  return true;
-}
+bool GetPtrField(InterpState &S, CodePtr OpPC, uint32_t Off);
+bool GetPtrFieldPop(InterpState &S, CodePtr OpPC, uint32_t Off);
 
 inline bool GetPtrThisField(InterpState &S, CodePtr OpPC, uint32_t Off) {
   if (S.checkingPotentialConstantExpression())
@@ -3087,6 +3034,12 @@ inline bool BitCast(InterpState &S, CodePtr OpPC) {
   return true;
 }
 
+/// Typeid support.
+bool GetTypeid(InterpState &S, CodePtr OpPC, const Type *TypePtr,
+               const Type *TypeInfoType);
+bool GetTypeidPtr(InterpState &S, CodePtr OpPC, const Type *TypeInfoType);
+bool DiagTypeid(InterpState &S, CodePtr OpPC);
+
 //===----------------------------------------------------------------------===//
 // Read opcode arguments
 //===----------------------------------------------------------------------===//
diff --git a/clang/lib/AST/ByteCode/InterpBuiltin.cpp b/clang/lib/AST/ByteCode/InterpBuiltin.cpp
index 2ae91fe..0d52083 100644
--- a/clang/lib/AST/ByteCode/InterpBuiltin.cpp
+++ b/clang/lib/AST/ByteCode/InterpBuiltin.cpp
@@ -17,6 +17,7 @@
 #include "clang/Basic/Builtins.h"
 #include "clang/Basic/TargetBuiltins.h"
 #include "clang/Basic/TargetInfo.h"
+#include "llvm/ADT/StringExtras.h"
 #include "llvm/Support/SipHash.h"
 
 namespace clang {
@@ -154,7 +155,7 @@ static void diagnoseNonConstexprBuiltin(InterpState &S, CodePtr OpPC,
   if (S.getLangOpts().CPlusPlus11)
     S.CCEDiag(Loc, diag::note_constexpr_invalid_function)
         << /*isConstexpr=*/0 << /*isConstructor=*/0
-        << ("'" + S.getASTContext().BuiltinInfo.getName(ID) + "'").str();
+        << S.getASTContext().BuiltinInfo.getQuotedName(ID);
   else
     S.CCEDiag(Loc, diag::note_invalid_subexpr_in_const_expr);
 }
@@ -1543,9 +1544,10 @@ static bool interp__builtin_constant_p(InterpState &S, CodePtr OpPC,
     if (Res.isInvalid()) {
       C.cleanup();
       Stk.clear();
+      return returnInt(false);
     }
 
-    if (!Res.isInvalid() && !Res.empty()) {
+    if (!Res.empty()) {
       const APValue &LV = Res.toAPValue();
       if (LV.isLValue()) {
         APValue::LValueBase Base = LV.getLValueBase();
@@ -1837,6 +1839,7 @@ static bool interp__builtin_memcpy(InterpState &S, CodePtr OpPC,
   assert(Call->getNumArgs() == 3);
   unsigned ID = Func->getBuiltinID();
   Pointer DestPtr = getParam<Pointer>(Frame, 0);
+  const ASTContext &ASTCtx = S.getASTContext();
   const Pointer &SrcPtr = getParam<Pointer>(Frame, 1);
   const APSInt &Size =
       peekToAPSInt(S.Stk, *S.getContext().classify(Call->getArg(2)));
@@ -1857,34 +1860,63 @@ static bool interp__builtin_memcpy(InterpState &S, CodePtr OpPC,
     Pointer DiagPtr = (SrcPtr.isZero() ? SrcPtr : DestPtr);
     S.FFDiag(S.Current->getSource(OpPC), diag::note_constexpr_memcpy_null)
         << /*IsMove=*/Move << /*IsWchar=*/false << !SrcPtr.isZero()
-        << DiagPtr.toDiagnosticString(S.getASTContext());
+        << DiagPtr.toDiagnosticString(ASTCtx);
     return false;
   }
 
-  QualType ElemType;
-  if (DestPtr.getFieldDesc()->isArray())
-    ElemType = DestPtr.getFieldDesc()->getElemQualType();
-  else
-    ElemType = DestPtr.getType();
+  // Can't read from dummy pointers.
+  if (DestPtr.isDummy() || SrcPtr.isDummy())
+    return false;
 
-  unsigned ElemSize =
-      S.getASTContext().getTypeSizeInChars(ElemType).getQuantity();
-  if (Size.urem(ElemSize) != 0) {
+  QualType DestElemType;
+  size_t RemainingDestElems;
+  if (DestPtr.getFieldDesc()->isArray()) {
+    DestElemType = DestPtr.getFieldDesc()->getElemQualType();
+    RemainingDestElems = DestPtr.isUnknownSizeArray()
+                             ? 0
+                             : (DestPtr.getNumElems() - DestPtr.getIndex());
+  } else {
+    DestElemType = DestPtr.getType();
+    RemainingDestElems = 1;
+  }
+  unsigned DestElemSize = ASTCtx.getTypeSizeInChars(DestElemType).getQuantity();
+
+  if (Size.urem(DestElemSize) != 0) {
     S.FFDiag(S.Current->getSource(OpPC),
              diag::note_constexpr_memcpy_unsupported)
-        << Move << /*IsWchar=*/false << 0 << ElemType << Size << ElemSize;
+        << Move << /*IsWchar=*/false << 0 << DestElemType << Size
+        << DestElemSize;
     return false;
   }
 
   QualType SrcElemType;
-  if (SrcPtr.getFieldDesc()->isArray())
+  size_t RemainingSrcElems;
+  if (SrcPtr.getFieldDesc()->isArray()) {
     SrcElemType = SrcPtr.getFieldDesc()->getElemQualType();
-  else
+    RemainingSrcElems = SrcPtr.isUnknownSizeArray()
+                            ? 0
+                            : (SrcPtr.getNumElems() - SrcPtr.getIndex());
+  } else {
     SrcElemType = SrcPtr.getType();
+    RemainingSrcElems = 1;
+  }
+  unsigned SrcElemSize = ASTCtx.getTypeSizeInChars(SrcElemType).getQuantity();
 
-  if (!S.getASTContext().hasSameUnqualifiedType(ElemType, SrcElemType)) {
+  if (!ASTCtx.hasSameUnqualifiedType(DestElemType, SrcElemType)) {
     S.FFDiag(S.Current->getSource(OpPC), diag::note_constexpr_memcpy_type_pun)
-        << Move << SrcElemType << ElemType;
+        << Move << SrcElemType << DestElemType;
+    return false;
+  }
+
+  // Check if we have enough elements to read from and write to/
+  size_t RemainingDestBytes = RemainingDestElems * DestElemSize;
+  size_t RemainingSrcBytes = RemainingSrcElems * SrcElemSize;
+  if (Size.ugt(RemainingDestBytes) || Size.ugt(RemainingSrcBytes)) {
+    APInt N = Size.udiv(DestElemSize);
+    S.FFDiag(S.Current->getSource(OpPC),
+             diag::note_constexpr_memcpy_unsupported)
+        << Move << /*IsWChar*/ false << (Size.ugt(RemainingSrcBytes) ? 1 : 2)
+        << DestElemType << toString(N, 10, /*Signed=*/false);
     return false;
   }
 
@@ -1902,10 +1934,7 @@ static bool interp__builtin_memcpy(InterpState &S, CodePtr OpPC,
     }
   }
 
-  // As a last resort, reject dummy pointers.
-  if (DestPtr.isDummy() || SrcPtr.isDummy())
-    return false;
-  assert(Size.getZExtValue() % ElemSize == 0);
+  assert(Size.getZExtValue() % DestElemSize == 0);
   if (!DoMemcpy(S, OpPC, SrcPtr, DestPtr, Bytes(Size.getZExtValue()).toBits()))
     return false;
 
@@ -1948,7 +1977,7 @@ static bool interp__builtin_memcmp(InterpState &S, CodePtr OpPC,
                   !isOneByteCharacterType(PtrB.getType()))) {
     S.FFDiag(S.Current->getSource(OpPC),
              diag::note_constexpr_memcmp_unsupported)
-        << ("'" + ASTCtx.BuiltinInfo.getName(ID) + "'").str() << PtrA.getType()
+        << ASTCtx.BuiltinInfo.getQuotedName(ID) << PtrA.getType()
         << PtrB.getType();
     return false;
   }
diff --git a/clang/lib/AST/ByteCode/InterpBuiltinBitCast.cpp b/clang/lib/AST/ByteCode/InterpBuiltinBitCast.cpp
index 0fc94e1..57c1fab 100644
--- a/clang/lib/AST/ByteCode/InterpBuiltinBitCast.cpp
+++ b/clang/lib/AST/ByteCode/InterpBuiltinBitCast.cpp
@@ -110,7 +110,7 @@ static bool enumerateData(const Pointer &P, const Context &Ctx, Bits Offset,
   if (FieldDesc->isCompositeArray()) {
     QualType ElemType = FieldDesc->getElemQualType();
     Bits ElemSize = Bits(Ctx.getASTContext().getTypeSize(ElemType));
-    for (unsigned I = 0; I != FieldDesc->getNumElems(); ++I) {
+    for (unsigned I = P.getIndex(); I != FieldDesc->getNumElems(); ++I) {
       enumerateData(P.atIndex(I).narrow(), Ctx, Offset, BitsToRead, F);
       Offset += ElemSize;
       if (Offset >= BitsToRead)
diff --git a/clang/lib/AST/ByteCode/Opcodes.td b/clang/lib/AST/ByteCode/Opcodes.td
index 123c21f..4b0c902 100644
--- a/clang/lib/AST/ByteCode/Opcodes.td
+++ b/clang/lib/AST/ByteCode/Opcodes.td
@@ -850,3 +850,7 @@ def BitCastPrim : Opcode {
 }
 
 def BitCast : Opcode;
+
+def GetTypeid : Opcode { let Args = [ArgTypePtr, ArgTypePtr]; }
+def GetTypeidPtr : Opcode { let Args = [ArgTypePtr]; }
+def DiagTypeid : Opcode;
diff --git a/clang/lib/AST/ByteCode/Pointer.cpp b/clang/lib/AST/ByteCode/Pointer.cpp
index 5448485..ec4756f 100644
--- a/clang/lib/AST/ByteCode/Pointer.cpp
+++ b/clang/lib/AST/ByteCode/Pointer.cpp
@@ -96,6 +96,8 @@ void Pointer::operator=(const Pointer &P) {
     PointeeStorage.Int = P.PointeeStorage.Int;
   } else if (P.isFunctionPointer()) {
     PointeeStorage.Fn = P.PointeeStorage.Fn;
+  } else if (P.isTypeidPointer()) {
+    PointeeStorage.Typeid = P.PointeeStorage.Typeid;
   } else {
     assert(false && "Unhandled storage kind");
   }
@@ -132,6 +134,8 @@ void Pointer::operator=(Pointer &&P) {
     PointeeStorage.Int = P.PointeeStorage.Int;
   } else if (P.isFunctionPointer()) {
     PointeeStorage.Fn = P.PointeeStorage.Fn;
+  } else if (P.isTypeidPointer()) {
+    PointeeStorage.Typeid = P.PointeeStorage.Typeid;
   } else {
     assert(false && "Unhandled storage kind");
   }
@@ -151,6 +155,14 @@ APValue Pointer::toAPValue(const ASTContext &ASTCtx) const {
   if (isFunctionPointer())
     return asFunctionPointer().toAPValue(ASTCtx);
 
+  if (isTypeidPointer()) {
+    TypeInfoLValue TypeInfo(PointeeStorage.Typeid.TypePtr);
+    return APValue(
+        APValue::LValueBase::getTypeInfo(
+            TypeInfo, QualType(PointeeStorage.Typeid.TypeInfoType, 0)),
+        CharUnits::Zero(), APValue::NoLValuePath{});
+  }
+
   // Build the lvalue base from the block.
   const Descriptor *Desc = getDeclDesc();
   APValue::LValueBase Base;
@@ -304,6 +316,9 @@ void Pointer::print(llvm::raw_ostream &OS) const {
   case Storage::Fn:
     OS << "(Fn) { " << asFunctionPointer().getFunction() << " + " << Offset
        << " }";
+    break;
+  case Storage::Typeid:
+    OS << "(Typeid)";
   }
 }
 
@@ -450,6 +465,8 @@ bool Pointer::hasSameBase(const Pointer &A, const Pointer &B) {
     return true;
   if (A.isFunctionPointer() && B.isFunctionPointer())
     return true;
+  if (A.isTypeidPointer() && B.isTypeidPointer())
+    return true;
 
   if (A.isIntegralPointer() || B.isIntegralPointer())
     return A.getSource() == B.getSource();
@@ -476,10 +493,10 @@ bool Pointer::pointsToLiteral() const {
   if (isZero() || !isBlockPointer())
     return false;
 
-  const Expr *E = block()->getDescriptor()->asExpr();
   if (block()->isDynamic())
     return false;
 
+  const Expr *E = block()->getDescriptor()->asExpr();
   return E && !isa<MaterializeTemporaryExpr, StringLiteral>(E);
 }
 
diff --git a/clang/lib/AST/ByteCode/Pointer.h b/clang/lib/AST/ByteCode/Pointer.h
index 0d467c2..ef03c12 100644
--- a/clang/lib/AST/ByteCode/Pointer.h
+++ b/clang/lib/AST/ByteCode/Pointer.h
@@ -49,7 +49,12 @@ struct IntPointer {
   IntPointer baseCast(const ASTContext &ASTCtx, unsigned BaseOffset) const;
 };
 
-enum class Storage { Block, Int, Fn };
+struct TypeidPointer {
+  const Type *TypePtr;
+  const Type *TypeInfoType;
+};
+
+enum class Storage { Block, Int, Fn, Typeid };
 
 /// A pointer to a memory block, live or dead.
 ///
@@ -107,6 +112,11 @@ public:
       : Offset(Offset), StorageKind(Storage::Fn) {
     PointeeStorage.Fn = FunctionPointer(F);
   }
+  Pointer(const Type *TypePtr, const Type *TypeInfoType, uint64_t Offset = 0)
+      : Offset(Offset), StorageKind(Storage::Typeid) {
+    PointeeStorage.Typeid.TypePtr = TypePtr;
+    PointeeStorage.Typeid.TypeInfoType = TypeInfoType;
+  }
   Pointer(Block *Pointee, unsigned Base, uint64_t Offset);
   ~Pointer();
 
@@ -263,6 +273,8 @@ public:
       return asBlockPointer().Pointee == nullptr;
     if (isFunctionPointer())
       return asFunctionPointer().isZero();
+    if (isTypeidPointer())
+      return false;
     assert(isIntegralPointer());
     return asIntPointer().Value == 0 && Offset == 0;
   }
@@ -284,7 +296,7 @@ public:
   const Descriptor *getDeclDesc() const {
     if (isIntegralPointer())
       return asIntPointer().Desc;
-    if (isFunctionPointer())
+    if (isFunctionPointer() || isTypeidPointer())
       return nullptr;
 
     assert(isBlockPointer());
@@ -337,6 +349,9 @@ public:
 
   /// Returns the type of the innermost field.
   QualType getType() const {
+    if (isTypeidPointer())
+      return QualType(PointeeStorage.Typeid.TypeInfoType, 0);
+
     if (inPrimitiveArray() && Offset != asBlockPointer().Base) {
       // Unfortunately, complex and vector types are not array types in clang,
       // but they are for us.
@@ -437,7 +452,7 @@ public:
   }
   /// Pointer points directly to a block.
   bool isRoot() const {
-    if (isZero() || isIntegralPointer())
+    if (isZero() || !isBlockPointer())
       return true;
     return (asBlockPointer().Base ==
                 asBlockPointer().Pointee->getDescriptor()->getMetadataSize() ||
@@ -467,6 +482,7 @@ public:
   bool isBlockPointer() const { return StorageKind == Storage::Block; }
   bool isIntegralPointer() const { return StorageKind == Storage::Int; }
   bool isFunctionPointer() const { return StorageKind == Storage::Fn; }
+  bool isTypeidPointer() const { return StorageKind == Storage::Typeid; }
 
   /// Returns the record descriptor of a class.
   const Record *getRecord() const { return getFieldDesc()->ElemRecord; }
@@ -605,7 +621,7 @@ public:
 
   /// Checks if the index is one past end.
   bool isOnePastEnd() const {
-    if (isIntegralPointer() || isFunctionPointer())
+    if (!isBlockPointer())
       return false;
 
     if (!asBlockPointer().Pointee)
@@ -746,6 +762,7 @@ private:
     BlockPointer BS;
     IntPointer Int;
     FunctionPointer Fn;
+    TypeidPointer Typeid;
   } PointeeStorage;
   Storage StorageKind = Storage::Int;
 };
diff --git a/clang/lib/AST/Expr.cpp b/clang/lib/AST/Expr.cpp
index 8c8ccdb..ba66d36 100644
--- a/clang/lib/AST/Expr.cpp
+++ b/clang/lib/AST/Expr.cpp
@@ -1618,9 +1618,9 @@ QualType CallExpr::getCallReturnType(const ASTContext &Ctx) const {
 std::pair<const NamedDecl *, const Attr *>
 CallExpr::getUnusedResultAttr(const ASTContext &Ctx) const {
   // If the callee is marked nodiscard, return that attribute
-  const Decl *D = getCalleeDecl();
-  if (const auto *A = D->getAttr<WarnUnusedResultAttr>())
-    return {nullptr, A};
+  if (const Decl *D = getCalleeDecl())
+    if (const auto *A = D->getAttr<WarnUnusedResultAttr>())
+      return {nullptr, A};
 
   // If the return type is a struct, union, or enum that is marked nodiscard,
   // then return the return type attribute.
diff --git a/clang/lib/AST/ExprCXX.cpp b/clang/lib/AST/ExprCXX.cpp
index fc09d24..5bf5d6a 100644
--- a/clang/lib/AST/ExprCXX.cpp
+++ b/clang/lib/AST/ExprCXX.cpp
@@ -1722,7 +1722,7 @@ PackIndexingExpr *PackIndexingExpr::Create(
   if (Index && FullySubstituted && !SubstitutedExprs.empty())
     Type = SubstitutedExprs[*Index]->getType();
   else
-    Type = Context.DependentTy;
+    Type = PackIdExpr->getType();
 
   void *Storage =
       Context.Allocate(totalSizeToAlloc<Expr *>(SubstitutedExprs.size()));
diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp
index dd75dca..e220f69 100644
--- a/clang/lib/AST/ExprConstant.cpp
+++ b/clang/lib/AST/ExprConstant.cpp
@@ -9858,7 +9858,7 @@ bool PointerExprEvaluator::VisitBuiltinCallExpr(const CallExpr *E,
     if (Info.getLangOpts().CPlusPlus11)
       Info.CCEDiag(E, diag::note_constexpr_invalid_function)
           << /*isConstexpr*/ 0 << /*isConstructor*/ 0
-          << ("'" + Info.Ctx.BuiltinInfo.getName(BuiltinOp) + "'").str();
+          << Info.Ctx.BuiltinInfo.getQuotedName(BuiltinOp);
     else
       Info.CCEDiag(E, diag::note_invalid_subexpr_in_const_expr);
     [[fallthrough]];
@@ -9903,8 +9903,7 @@ bool PointerExprEvaluator::VisitBuiltinCallExpr(const CallExpr *E,
     // FIXME: We can compare the bytes in the correct order.
     if (IsRawByte && !isOneByteCharacterType(CharTy)) {
       Info.FFDiag(E, diag::note_constexpr_memchr_unsupported)
-          << ("'" + Info.Ctx.BuiltinInfo.getName(BuiltinOp) + "'").str()
-          << CharTy;
+          << Info.Ctx.BuiltinInfo.getQuotedName(BuiltinOp) << CharTy;
       return false;
     }
     // Figure out what value we're actually looking for (after converting to
@@ -9966,7 +9965,7 @@ bool PointerExprEvaluator::VisitBuiltinCallExpr(const CallExpr *E,
     if (Info.getLangOpts().CPlusPlus11)
       Info.CCEDiag(E, diag::note_constexpr_invalid_function)
           << /*isConstexpr*/ 0 << /*isConstructor*/ 0
-          << ("'" + Info.Ctx.BuiltinInfo.getName(BuiltinOp) + "'").str();
+          << Info.Ctx.BuiltinInfo.getQuotedName(BuiltinOp);
     else
       Info.CCEDiag(E, diag::note_invalid_subexpr_in_const_expr);
     [[fallthrough]];
@@ -13241,7 +13240,7 @@ bool IntExprEvaluator::VisitBuiltinCallExpr(const CallExpr *E,
     if (Info.getLangOpts().CPlusPlus11)
       Info.CCEDiag(E, diag::note_constexpr_invalid_function)
           << /*isConstexpr*/ 0 << /*isConstructor*/ 0
-          << ("'" + Info.Ctx.BuiltinInfo.getName(BuiltinOp) + "'").str();
+          << Info.Ctx.BuiltinInfo.getQuotedName(BuiltinOp);
     else
       Info.CCEDiag(E, diag::note_invalid_subexpr_in_const_expr);
     [[fallthrough]];
@@ -13266,7 +13265,7 @@ bool IntExprEvaluator::VisitBuiltinCallExpr(const CallExpr *E,
     if (Info.getLangOpts().CPlusPlus11)
       Info.CCEDiag(E, diag::note_constexpr_invalid_function)
           << /*isConstexpr*/ 0 << /*isConstructor*/ 0
-          << ("'" + Info.Ctx.BuiltinInfo.getName(BuiltinOp) + "'").str();
+          << Info.Ctx.BuiltinInfo.getQuotedName(BuiltinOp);
     else
       Info.CCEDiag(E, diag::note_invalid_subexpr_in_const_expr);
     [[fallthrough]];
@@ -13321,8 +13320,8 @@ bool IntExprEvaluator::VisitBuiltinCallExpr(const CallExpr *E,
         !(isOneByteCharacterType(CharTy1) && isOneByteCharacterType(CharTy2))) {
       // FIXME: Consider using our bit_cast implementation to support this.
       Info.FFDiag(E, diag::note_constexpr_memcmp_unsupported)
-          << ("'" + Info.Ctx.BuiltinInfo.getName(BuiltinOp) + "'").str()
-          << CharTy1 << CharTy2;
+          << Info.Ctx.BuiltinInfo.getQuotedName(BuiltinOp) << CharTy1
+          << CharTy2;
       return false;
     }
 
diff --git a/clang/lib/AST/OpenACCClause.cpp b/clang/lib/AST/OpenACCClause.cpp
index d69fab5..da63b47 100644
--- a/clang/lib/AST/OpenACCClause.cpp
+++ b/clang/lib/AST/OpenACCClause.cpp
@@ -20,7 +20,7 @@ using namespace clang;
 bool OpenACCClauseWithParams::classof(const OpenACCClause *C) {
   return OpenACCDeviceTypeClause::classof(C) ||
          OpenACCClauseWithCondition::classof(C) ||
-         OpenACCClauseWithExprs::classof(C);
+         OpenACCClauseWithExprs::classof(C) || OpenACCSelfClause::classof(C);
 }
 bool OpenACCClauseWithExprs::classof(const OpenACCClause *C) {
   return OpenACCWaitClause::classof(C) || OpenACCNumGangsClause::classof(C) ||
@@ -41,12 +41,13 @@ bool OpenACCClauseWithVarList::classof(const OpenACCClause *C) {
          OpenACCReductionClause::classof(C) || OpenACCCreateClause::classof(C);
 }
 bool OpenACCClauseWithCondition::classof(const OpenACCClause *C) {
-  return OpenACCIfClause::classof(C) || OpenACCSelfClause::classof(C);
+  return OpenACCIfClause::classof(C);
 }
 bool OpenACCClauseWithSingleIntExpr::classof(const OpenACCClause *C) {
   return OpenACCNumWorkersClause::classof(C) ||
          OpenACCVectorLengthClause::classof(C) ||
          OpenACCDeviceNumClause::classof(C) ||
+         OpenACCDefaultAsyncClause::classof(C) ||
          OpenACCVectorClause::classof(C) || OpenACCWorkerClause::classof(C) ||
          OpenACCCollapseClause::classof(C) || OpenACCAsyncClause::classof(C);
 }
@@ -86,19 +87,43 @@ OpenACCSelfClause *OpenACCSelfClause::Create(const ASTContext &C,
                                              SourceLocation LParenLoc,
                                              Expr *ConditionExpr,
                                              SourceLocation EndLoc) {
-  void *Mem = C.Allocate(sizeof(OpenACCIfClause), alignof(OpenACCIfClause));
+  void *Mem = C.Allocate(OpenACCSelfClause::totalSizeToAlloc<Expr *>(1));
   return new (Mem)
       OpenACCSelfClause(BeginLoc, LParenLoc, ConditionExpr, EndLoc);
 }
 
+OpenACCSelfClause *OpenACCSelfClause::Create(const ASTContext &C,
+                                             SourceLocation BeginLoc,
+                                             SourceLocation LParenLoc,
+                                             ArrayRef<Expr *> VarList,
+                                             SourceLocation EndLoc) {
+  void *Mem =
+      C.Allocate(OpenACCSelfClause::totalSizeToAlloc<Expr *>(VarList.size()));
+  return new (Mem) OpenACCSelfClause(BeginLoc, LParenLoc, VarList, EndLoc);
+}
+
+OpenACCSelfClause::OpenACCSelfClause(SourceLocation BeginLoc,
+                                     SourceLocation LParenLoc,
+                                     llvm::ArrayRef<Expr *> VarList,
+                                     SourceLocation EndLoc)
+    : OpenACCClauseWithParams(OpenACCClauseKind::Self, BeginLoc, LParenLoc,
+                              EndLoc),
+      HasConditionExpr(std::nullopt), NumExprs(VarList.size()) {
+  std::uninitialized_copy(VarList.begin(), VarList.end(),
+                          getTrailingObjects<Expr *>());
+}
+
 OpenACCSelfClause::OpenACCSelfClause(SourceLocation BeginLoc,
                                      SourceLocation LParenLoc,
                                      Expr *ConditionExpr, SourceLocation EndLoc)
-    : OpenACCClauseWithCondition(OpenACCClauseKind::Self, BeginLoc, LParenLoc,
-                                 ConditionExpr, EndLoc) {
+    : OpenACCClauseWithParams(OpenACCClauseKind::Self, BeginLoc, LParenLoc,
+                              EndLoc),
+      HasConditionExpr(ConditionExpr != nullptr), NumExprs(1) {
   assert((!ConditionExpr || ConditionExpr->isInstantiationDependent() ||
           ConditionExpr->getType()->isScalarType()) &&
          "Condition expression type not scalar/dependent");
+  std::uninitialized_copy(&ConditionExpr, &ConditionExpr + 1,
+                          getTrailingObjects<Expr *>());
 }
 
 OpenACCClause::child_range OpenACCClause::children() {
@@ -239,6 +264,27 @@ OpenACCDeviceNumClause *OpenACCDeviceNumClause::Create(const ASTContext &C,
   return new (Mem) OpenACCDeviceNumClause(BeginLoc, LParenLoc, IntExpr, EndLoc);
 }
 
+OpenACCDefaultAsyncClause::OpenACCDefaultAsyncClause(SourceLocation BeginLoc,
+                                                     SourceLocation LParenLoc,
+                                                     Expr *IntExpr,
+                                                     SourceLocation EndLoc)
+    : OpenACCClauseWithSingleIntExpr(OpenACCClauseKind::DefaultAsync, BeginLoc,
+                                     LParenLoc, IntExpr, EndLoc) {
+  assert((IntExpr->isInstantiationDependent() ||
+          IntExpr->getType()->isIntegerType()) &&
+         "default_async expression type not scalar/dependent");
+}
+
+OpenACCDefaultAsyncClause *
+OpenACCDefaultAsyncClause::Create(const ASTContext &C, SourceLocation BeginLoc,
+                                  SourceLocation LParenLoc, Expr *IntExpr,
+                                  SourceLocation EndLoc) {
+  void *Mem = C.Allocate(sizeof(OpenACCDefaultAsyncClause),
+                         alignof(OpenACCDefaultAsyncClause));
+  return new (Mem)
+      OpenACCDefaultAsyncClause(BeginLoc, LParenLoc, IntExpr, EndLoc);
+}
+
 OpenACCWaitClause *OpenACCWaitClause::Create(
     const ASTContext &C, SourceLocation BeginLoc, SourceLocation LParenLoc,
     Expr *DevNumExpr, SourceLocation QueuesLoc, ArrayRef<Expr *> QueueIdExprs,
@@ -533,9 +579,17 @@ void OpenACCClausePrinter::VisitIfClause(const OpenACCIfClause &C) {
 
 void OpenACCClausePrinter::VisitSelfClause(const OpenACCSelfClause &C) {
   OS << "self";
-  if (const Expr *CondExpr = C.getConditionExpr()) {
+
+  if (C.isConditionExprClause()) {
+    if (const Expr *CondExpr = C.getConditionExpr()) {
+      OS << "(";
+      printExpr(CondExpr);
+      OS << ")";
+    }
+  } else {
     OS << "(";
-    printExpr(CondExpr);
+    llvm::interleaveComma(C.getVarList(), OS,
+                          [&](const Expr *E) { printExpr(E); });
     OS << ")";
   }
 }
@@ -575,6 +629,13 @@ void OpenACCClausePrinter::VisitDeviceNumClause(
   OS << ")";
 }
 
+void OpenACCClausePrinter::VisitDefaultAsyncClause(
+    const OpenACCDefaultAsyncClause &C) {
+  OS << "default_async(";
+  printExpr(C.getIntExpr());
+  OS << ")";
+}
+
 void OpenACCClausePrinter::VisitAsyncClause(const OpenACCAsyncClause &C) {
   OS << "async";
   if (C.hasIntExpr()) {
diff --git a/clang/lib/AST/ParentMap.cpp b/clang/lib/AST/ParentMap.cpp
index fd749b0..e62e71b 100644
--- a/clang/lib/AST/ParentMap.cpp
+++ b/clang/lib/AST/ParentMap.cpp
@@ -33,17 +33,19 @@ static void BuildParentMap(MapTy& M, Stmt* S,
   switch (S->getStmtClass()) {
   case Stmt::PseudoObjectExprClass: {
     PseudoObjectExpr *POE = cast<PseudoObjectExpr>(S);
-
-    if (OVMode == OV_Opaque && M[POE->getSyntacticForm()])
-      break;
-
-    // If we are rebuilding the map, clear out any existing state.
-    if (M[POE->getSyntacticForm()])
+    Expr *SF = POE->getSyntacticForm();
+
+    auto [Iter, Inserted] = M.try_emplace(SF, S);
+    if (!Inserted) {
+      // Nothing more to do in opaque mode if we are updating an existing map.
+      if (OVMode == OV_Opaque)
+        break;
+      // Update the entry in transparent mode, and clear existing state.
+      Iter->second = S;
       for (Stmt *SubStmt : S->children())
-        M[SubStmt] = nullptr;
-
-    M[POE->getSyntacticForm()] = S;
-    BuildParentMap(M, POE->getSyntacticForm(), OV_Transparent);
+        M.erase(SubStmt);
+    }
+    BuildParentMap(M, SF, OV_Transparent);
 
     for (PseudoObjectExpr::semantics_iterator I = POE->semantics_begin(),
                                               E = POE->semantics_end();
@@ -78,10 +80,15 @@ static void BuildParentMap(MapTy& M, Stmt* S,
     // The right thing to do is to give the OpaqueValueExpr its syntactic
     // parent, then not reassign that when traversing the semantic expressions.
     OpaqueValueExpr *OVE = cast<OpaqueValueExpr>(S);
-    if (OVMode == OV_Transparent || !M[OVE->getSourceExpr()]) {
-      M[OVE->getSourceExpr()] = S;
-      BuildParentMap(M, OVE->getSourceExpr(), OV_Transparent);
+    Expr *SrcExpr = OVE->getSourceExpr();
+    auto [Iter, Inserted] = M.try_emplace(SrcExpr, S);
+    // Force update in transparent mode.
+    if (!Inserted && OVMode == OV_Transparent) {
+      Iter->second = S;
+      Inserted = true;
     }
+    if (Inserted)
+      BuildParentMap(M, SrcExpr, OV_Transparent);
     break;
   }
   case Stmt::CapturedStmtClass:
diff --git a/clang/lib/AST/StmtOpenACC.cpp b/clang/lib/AST/StmtOpenACC.cpp
index e6d76ea..2b0ac71 100644
--- a/clang/lib/AST/StmtOpenACC.cpp
+++ b/clang/lib/AST/StmtOpenACC.cpp
@@ -265,3 +265,43 @@ OpenACCShutdownConstruct *OpenACCShutdownConstruct::Create(
       new (Mem) OpenACCShutdownConstruct(Start, DirectiveLoc, End, Clauses);
   return Inst;
 }
+
+OpenACCSetConstruct *OpenACCSetConstruct::CreateEmpty(const ASTContext &C,
+                                                      unsigned NumClauses) {
+  void *Mem = C.Allocate(
+      OpenACCSetConstruct::totalSizeToAlloc<const OpenACCClause *>(NumClauses));
+  auto *Inst = new (Mem) OpenACCSetConstruct(NumClauses);
+  return Inst;
+}
+
+OpenACCSetConstruct *
+OpenACCSetConstruct::Create(const ASTContext &C, SourceLocation Start,
+                            SourceLocation DirectiveLoc, SourceLocation End,
+                            ArrayRef<const OpenACCClause *> Clauses) {
+  void *Mem =
+      C.Allocate(OpenACCSetConstruct::totalSizeToAlloc<const OpenACCClause *>(
+          Clauses.size()));
+  auto *Inst = new (Mem) OpenACCSetConstruct(Start, DirectiveLoc, End, Clauses);
+  return Inst;
+}
+
+OpenACCUpdateConstruct *
+OpenACCUpdateConstruct::CreateEmpty(const ASTContext &C, unsigned NumClauses) {
+  void *Mem = C.Allocate(
+      OpenACCUpdateConstruct::totalSizeToAlloc<const OpenACCClause *>(
+          NumClauses));
+  auto *Inst = new (Mem) OpenACCUpdateConstruct(NumClauses);
+  return Inst;
+}
+
+OpenACCUpdateConstruct *
+OpenACCUpdateConstruct::Create(const ASTContext &C, SourceLocation Start,
+                               SourceLocation DirectiveLoc, SourceLocation End,
+                               ArrayRef<const OpenACCClause *> Clauses) {
+  void *Mem = C.Allocate(
+      OpenACCUpdateConstruct::totalSizeToAlloc<const OpenACCClause *>(
+          Clauses.size()));
+  auto *Inst =
+      new (Mem) OpenACCUpdateConstruct(Start, DirectiveLoc, End, Clauses);
+  return Inst;
+}
diff --git a/clang/lib/AST/StmtPrinter.cpp b/clang/lib/AST/StmtPrinter.cpp
index c5d19f7..52bcb51 100644
--- a/clang/lib/AST/StmtPrinter.cpp
+++ b/clang/lib/AST/StmtPrinter.cpp
@@ -1204,6 +1204,12 @@ void StmtPrinter::VisitOpenACCInitConstruct(OpenACCInitConstruct *S) {
 void StmtPrinter::VisitOpenACCShutdownConstruct(OpenACCShutdownConstruct *S) {
   PrintOpenACCConstruct(S);
 }
+void StmtPrinter::VisitOpenACCSetConstruct(OpenACCSetConstruct *S) {
+  PrintOpenACCConstruct(S);
+}
+void StmtPrinter::VisitOpenACCUpdateConstruct(OpenACCUpdateConstruct *S) {
+  PrintOpenACCConstruct(S);
+}
 
 void StmtPrinter::VisitOpenACCWaitConstruct(OpenACCWaitConstruct *S) {
   Indent() << "#pragma acc wait";
diff --git a/clang/lib/AST/StmtProfile.cpp b/clang/lib/AST/StmtProfile.cpp
index 27313f9..cd91a79 100644
--- a/clang/lib/AST/StmtProfile.cpp
+++ b/clang/lib/AST/StmtProfile.cpp
@@ -2555,8 +2555,13 @@ void OpenACCClauseProfiler::VisitCreateClause(
 }
 
 void OpenACCClauseProfiler::VisitSelfClause(const OpenACCSelfClause &Clause) {
-  if (Clause.hasConditionExpr())
-    Profiler.VisitStmt(Clause.getConditionExpr());
+  if (Clause.isConditionExprClause()) {
+    if (Clause.hasConditionExpr())
+      Profiler.VisitStmt(Clause.getConditionExpr());
+  } else {
+    for (auto *E : Clause.getVarList())
+      Profiler.VisitStmt(E);
+  }
 }
 
 void OpenACCClauseProfiler::VisitFinalizeClause(
@@ -2650,6 +2655,11 @@ void OpenACCClauseProfiler::VisitDeviceNumClause(
   Profiler.VisitStmt(Clause.getIntExpr());
 }
 
+void OpenACCClauseProfiler::VisitDefaultAsyncClause(
+    const OpenACCDefaultAsyncClause &Clause) {
+  Profiler.VisitStmt(Clause.getIntExpr());
+}
+
 void OpenACCClauseProfiler::VisitWorkerClause(
     const OpenACCWorkerClause &Clause) {
   if (Clause.hasIntExpr())
@@ -2769,6 +2779,19 @@ void StmtProfiler::VisitOpenACCShutdownConstruct(
   P.VisitOpenACCClauseList(S->clauses());
 }
 
+void StmtProfiler::VisitOpenACCSetConstruct(const OpenACCSetConstruct *S) {
+  VisitStmt(S);
+  OpenACCClauseProfiler P{*this};
+  P.VisitOpenACCClauseList(S->clauses());
+}
+
+void StmtProfiler::VisitOpenACCUpdateConstruct(
+    const OpenACCUpdateConstruct *S) {
+  VisitStmt(S);
+  OpenACCClauseProfiler P{*this};
+  P.VisitOpenACCClauseList(S->clauses());
+}
+
 void StmtProfiler::VisitHLSLOutArgExpr(const HLSLOutArgExpr *S) {
   VisitStmt(S);
 }
diff --git a/clang/lib/AST/TextNodeDumper.cpp b/clang/lib/AST/TextNodeDumper.cpp
index 018147e..eedd8fa 100644
--- a/clang/lib/AST/TextNodeDumper.cpp
+++ b/clang/lib/AST/TextNodeDumper.cpp
@@ -414,6 +414,7 @@ void TextNodeDumper::Visit(const OpenACCClause *C) {
     case OpenACCClauseKind::Detach:
     case OpenACCClauseKind::Delete:
     case OpenACCClauseKind::DeviceNum:
+    case OpenACCClauseKind::DefaultAsync:
     case OpenACCClauseKind::DevicePtr:
     case OpenACCClauseKind::Finalize:
     case OpenACCClauseKind::FirstPrivate:
@@ -2930,7 +2931,6 @@ void TextNodeDumper::VisitOpenACCConstructStmt(const OpenACCConstructStmt *S) {
   OS << " " << S->getDirectiveKind();
 }
 void TextNodeDumper::VisitOpenACCLoopConstruct(const OpenACCLoopConstruct *S) {
-
   if (S->isOrphanedLoopConstruct())
     OS << " <orphan>";
   else
@@ -2939,37 +2939,44 @@ void TextNodeDumper::VisitOpenACCLoopConstruct(const OpenACCLoopConstruct *S) {
 
 void TextNodeDumper::VisitOpenACCCombinedConstruct(
     const OpenACCCombinedConstruct *S) {
-  OS << " " << S->getDirectiveKind();
+  VisitOpenACCConstructStmt(S);
 }
 
 void TextNodeDumper::VisitOpenACCDataConstruct(const OpenACCDataConstruct *S) {
-  OS << " " << S->getDirectiveKind();
+  VisitOpenACCConstructStmt(S);
 }
 
 void TextNodeDumper::VisitOpenACCEnterDataConstruct(
     const OpenACCEnterDataConstruct *S) {
-  OS << " " << S->getDirectiveKind();
+  VisitOpenACCConstructStmt(S);
 }
 
 void TextNodeDumper::VisitOpenACCExitDataConstruct(
     const OpenACCExitDataConstruct *S) {
-  OS << " " << S->getDirectiveKind();
+  VisitOpenACCConstructStmt(S);
 }
 
 void TextNodeDumper::VisitOpenACCHostDataConstruct(
     const OpenACCHostDataConstruct *S) {
-  OS << " " << S->getDirectiveKind();
+  VisitOpenACCConstructStmt(S);
 }
 
 void TextNodeDumper::VisitOpenACCWaitConstruct(const OpenACCWaitConstruct *S) {
-  OS << " " << S->getDirectiveKind();
+  VisitOpenACCConstructStmt(S);
 }
 void TextNodeDumper::VisitOpenACCInitConstruct(const OpenACCInitConstruct *S) {
-  OS << " " << S->getDirectiveKind();
+  VisitOpenACCConstructStmt(S);
 }
 void TextNodeDumper::VisitOpenACCShutdownConstruct(
     const OpenACCShutdownConstruct *S) {
-  OS << " " << S->getDirectiveKind();
+  VisitOpenACCConstructStmt(S);
+}
+void TextNodeDumper::VisitOpenACCSetConstruct(const OpenACCSetConstruct *S) {
+  VisitOpenACCConstructStmt(S);
+}
+void TextNodeDumper::VisitOpenACCUpdateConstruct(
+    const OpenACCUpdateConstruct *S) {
+  VisitOpenACCConstructStmt(S);
 }
 
 void TextNodeDumper::VisitEmbedExpr(const EmbedExpr *S) {
diff --git a/clang/lib/ASTMatchers/ASTMatchersInternal.cpp b/clang/lib/ASTMatchers/ASTMatchersInternal.cpp
index bf9dc5f..9c7943a 100644
--- a/clang/lib/ASTMatchers/ASTMatchersInternal.cpp
+++ b/clang/lib/ASTMatchers/ASTMatchersInternal.cpp
@@ -924,6 +924,8 @@ const internal::VariadicDynCastAllOfMatcher<Stmt, CXXRewrittenBinaryOperator>
 const internal::VariadicDynCastAllOfMatcher<Stmt, CXXFoldExpr> cxxFoldExpr;
 const internal::VariadicDynCastAllOfMatcher<Stmt, Expr> expr;
 const internal::VariadicDynCastAllOfMatcher<Stmt, DeclRefExpr> declRefExpr;
+const internal::VariadicDynCastAllOfMatcher<Stmt, DependentScopeDeclRefExpr>
+    dependentScopeDeclRefExpr;
 const internal::VariadicDynCastAllOfMatcher<Stmt, ObjCIvarRefExpr> objcIvarRefExpr;
 const internal::VariadicDynCastAllOfMatcher<Stmt, BlockExpr> blockExpr;
 const internal::VariadicDynCastAllOfMatcher<Stmt, IfStmt> ifStmt;
@@ -1106,6 +1108,9 @@ const AstTypeMatcher<SubstTemplateTypeParmType> substTemplateTypeParmType;
 const AstTypeMatcher<TemplateTypeParmType> templateTypeParmType;
 const AstTypeMatcher<InjectedClassNameType> injectedClassNameType;
 const AstTypeMatcher<DecayedType> decayedType;
+const AstTypeMatcher<DependentNameType> dependentNameType;
+const AstTypeMatcher<DependentTemplateSpecializationType>
+    dependentTemplateSpecializationType;
 AST_TYPELOC_TRAVERSE_MATCHER_DEF(hasElementType,
                                  AST_POLYMORPHIC_SUPPORTED_TYPES(ArrayType,
                                                                  ComplexType));
diff --git a/clang/lib/ASTMatchers/Dynamic/Registry.cpp b/clang/lib/ASTMatchers/Dynamic/Registry.cpp
index 837633f..336d3a1 100644
--- a/clang/lib/ASTMatchers/Dynamic/Registry.cpp
+++ b/clang/lib/ASTMatchers/Dynamic/Registry.cpp
@@ -222,6 +222,9 @@ RegistryMaps::RegistryMaps() {
   REGISTER_MATCHER(decompositionDecl);
   REGISTER_MATCHER(declCountIs);
   REGISTER_MATCHER(declRefExpr);
+  REGISTER_MATCHER(dependentNameType);
+  REGISTER_MATCHER(dependentScopeDeclRefExpr);
+  REGISTER_MATCHER(dependentTemplateSpecializationType);
   REGISTER_MATCHER(declStmt);
   REGISTER_MATCHER(declaratorDecl);
   REGISTER_MATCHER(decltypeType);
@@ -311,6 +314,7 @@ RegistryMaps::RegistryMaps() {
   REGISTER_MATCHER(hasDeducedType);
   REGISTER_MATCHER(hasDefaultArgument);
   REGISTER_MATCHER(hasDefinition);
+  REGISTER_MATCHER(hasDependentName);
   REGISTER_MATCHER(hasDescendant);
   REGISTER_MATCHER(hasDestinationType);
   REGISTER_MATCHER(hasDirectBase);
diff --git a/clang/lib/Analysis/FlowSensitive/Models/UncheckedOptionalAccessModel.cpp b/clang/lib/Analysis/FlowSensitive/Models/UncheckedOptionalAccessModel.cpp
index da5dda0..e1394e2 100644
--- a/clang/lib/Analysis/FlowSensitive/Models/UncheckedOptionalAccessModel.cpp
+++ b/clang/lib/Analysis/FlowSensitive/Models/UncheckedOptionalAccessModel.cpp
@@ -25,8 +25,10 @@
 #include "clang/Analysis/FlowSensitive/DataflowEnvironment.h"
 #include "clang/Analysis/FlowSensitive/Formula.h"
 #include "clang/Analysis/FlowSensitive/RecordOps.h"
+#include "clang/Analysis/FlowSensitive/SmartPointerAccessorCaching.h"
 #include "clang/Analysis/FlowSensitive/StorageLocation.h"
 #include "clang/Analysis/FlowSensitive/Value.h"
+#include "clang/Basic/OperatorKinds.h"
 #include "clang/Basic/SourceLocation.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/Support/Casting.h"
@@ -555,24 +557,25 @@ void handleConstMemberCall(const CallExpr *CE,
                            LatticeTransferState &State) {
   // If the const method returns an optional or reference to an optional.
   if (RecordLoc != nullptr && isSupportedOptionalType(CE->getType())) {
-    StorageLocation *Loc =
+    const FunctionDecl *DirectCallee = CE->getDirectCallee();
+    if (DirectCallee == nullptr)
+      return;
+    StorageLocation &Loc =
         State.Lattice.getOrCreateConstMethodReturnStorageLocation(
-            *RecordLoc, CE, State.Env, [&](StorageLocation &Loc) {
+            *RecordLoc, DirectCallee, State.Env, [&](StorageLocation &Loc) {
               setHasValue(cast<RecordStorageLocation>(Loc),
                           State.Env.makeAtomicBoolValue(), State.Env);
             });
-    if (Loc == nullptr)
-      return;
     if (CE->isGLValue()) {
       // If the call to the const method returns a reference to an optional,
       // link the call expression to the cached StorageLocation.
-      State.Env.setStorageLocation(*CE, *Loc);
+      State.Env.setStorageLocation(*CE, Loc);
     } else {
       // If the call to the const method returns an optional by value, we
       // need to use CopyRecord to link the optional to the result object
       // of the call expression.
       auto &ResultLoc = State.Env.getResultObjectLocation(*CE);
-      copyRecord(*cast<RecordStorageLocation>(Loc), ResultLoc, State.Env);
+      copyRecord(cast<RecordStorageLocation>(Loc), ResultLoc, State.Env);
     }
     return;
   }
@@ -1031,6 +1034,48 @@ auto buildTransferMatchSwitch() {
             transferOptionalAndValueCmp(Cmp, Cmp->getArg(1), State.Env);
           })
 
+      // Smart-pointer-like operator* and operator-> calls that may look like
+      // const accessors (below) but need special handling to allow mixing
+      // the accessor calls.
+      .CaseOfCFGStmt<CXXOperatorCallExpr>(
+          isSmartPointerLikeOperatorStar(),
+          [](const CXXOperatorCallExpr *E,
+             const MatchFinder::MatchResult &Result,
+             LatticeTransferState &State) {
+            transferSmartPointerLikeCachedDeref(
+                E,
+                dyn_cast_or_null<RecordStorageLocation>(
+                    getLocBehindPossiblePointer(*E->getArg(0), State.Env)),
+                State, [](StorageLocation &Loc) {});
+          })
+      .CaseOfCFGStmt<CXXOperatorCallExpr>(
+          isSmartPointerLikeOperatorArrow(),
+          [](const CXXOperatorCallExpr *E,
+             const MatchFinder::MatchResult &Result,
+             LatticeTransferState &State) {
+            transferSmartPointerLikeCachedGet(
+                E,
+                dyn_cast_or_null<RecordStorageLocation>(
+                    getLocBehindPossiblePointer(*E->getArg(0), State.Env)),
+                State, [](StorageLocation &Loc) {});
+          })
+      .CaseOfCFGStmt<CXXMemberCallExpr>(
+          isSmartPointerLikeValueMethodCall(),
+          [](const CXXMemberCallExpr *E, const MatchFinder::MatchResult &Result,
+             LatticeTransferState &State) {
+            transferSmartPointerLikeCachedDeref(
+                E, getImplicitObjectLocation(*E, State.Env), State,
+                [](StorageLocation &Loc) {});
+          })
+      .CaseOfCFGStmt<CXXMemberCallExpr>(
+          isSmartPointerLikeGetMethodCall(),
+          [](const CXXMemberCallExpr *E, const MatchFinder::MatchResult &Result,
+             LatticeTransferState &State) {
+            transferSmartPointerLikeCachedGet(
+                E, getImplicitObjectLocation(*E, State.Env), State,
+                [](StorageLocation &Loc) {});
+          })
+
       // const accessor calls
       .CaseOfCFGStmt<CXXMemberCallExpr>(isZeroParamConstMemberCall(),
                                         transferValue_ConstMemberCall)
diff --git a/clang/lib/Analysis/FlowSensitive/SmartPointerAccessorCaching.cpp b/clang/lib/Analysis/FlowSensitive/SmartPointerAccessorCaching.cpp
index a0c81aa..c58bd30 100644
--- a/clang/lib/Analysis/FlowSensitive/SmartPointerAccessorCaching.cpp
+++ b/clang/lib/Analysis/FlowSensitive/SmartPointerAccessorCaching.cpp
@@ -132,6 +132,7 @@ ast_matchers::StatementMatcher isSmartPointerLikeOperatorArrow() {
       callee(cxxMethodDecl(parameterCountIs(0), returns(pointerType()),
                            ofClass(smartPointerClassWithGetOrValue()))));
 }
+
 ast_matchers::StatementMatcher isSmartPointerLikeValueMethodCall() {
   return cxxMemberCallExpr(callee(
       cxxMethodDecl(parameterCountIs(0), returns(referenceType()),
@@ -144,4 +145,24 @@ ast_matchers::StatementMatcher isSmartPointerLikeGetMethodCall() {
                     ofClass(smartPointerClassWithGet()))));
 }
 
+const FunctionDecl *
+getCanonicalSmartPointerLikeOperatorCallee(const CallExpr *CE) {
+  const FunctionDecl *CanonicalCallee = nullptr;
+  const CXXMethodDecl *Callee =
+      cast_or_null<CXXMethodDecl>(CE->getDirectCallee());
+  if (Callee == nullptr)
+    return nullptr;
+  const CXXRecordDecl *RD = Callee->getParent();
+  if (RD == nullptr)
+    return nullptr;
+  for (const auto *MD : RD->methods()) {
+    if (MD->getOverloadedOperator() == OO_Star && MD->isConst() &&
+        MD->getNumParams() == 0 && MD->getReturnType()->isReferenceType()) {
+      CanonicalCallee = MD;
+      break;
+    }
+  }
+  return CanonicalCallee;
+}
+
 } // namespace clang::dataflow
diff --git a/clang/lib/Basic/Builtins.cpp b/clang/lib/Basic/Builtins.cpp
index 8dd1888..5881837 100644
--- a/clang/lib/Basic/Builtins.cpp
+++ b/clang/lib/Basic/Builtins.cpp
@@ -163,6 +163,10 @@ void Builtin::Context::initializeBuiltins(IdentifierTable &Table,
   }
 }
 
+std::string Builtin::Context::getQuotedName(unsigned ID) const {
+  return (llvm::Twine("'") + getName(ID) + "'").str();
+}
+
 unsigned Builtin::Context::getRequiredVectorWidth(unsigned ID) const {
   const char *WidthPos = ::strchr(getRecord(ID).Attributes, 'V');
   if (!WidthPos)
diff --git a/clang/lib/Basic/CMakeLists.txt b/clang/lib/Basic/CMakeLists.txt
index e11e1ac..331dfbb 100644
--- a/clang/lib/Basic/CMakeLists.txt
+++ b/clang/lib/Basic/CMakeLists.txt
@@ -120,6 +120,7 @@ add_clang_library(clangBasic
   Targets/WebAssembly.cpp
   Targets/X86.cpp
   Targets/XCore.cpp
+  Targets/Xtensa.cpp
   TokenKinds.cpp
   TypeTraits.cpp
   Version.cpp
diff --git a/clang/lib/Basic/CodeGenOptions.cpp b/clang/lib/Basic/CodeGenOptions.cpp
index 79d7153..95e65ba 100644
--- a/clang/lib/Basic/CodeGenOptions.cpp
+++ b/clang/lib/Basic/CodeGenOptions.cpp
@@ -17,7 +17,6 @@ CodeGenOptions::CodeGenOptions() {
 #include "clang/Basic/CodeGenOptions.def"
 
   RelocationModel = llvm::Reloc::PIC_;
-  memcpy(CoverageVersion, "408*", 4);
 }
 
 void CodeGenOptions::resetNonModularOptions(StringRef ModuleFormat) {
@@ -54,7 +53,6 @@ void CodeGenOptions::resetNonModularOptions(StringRef ModuleFormat) {
   }
 
   RelocationModel = llvm::Reloc::PIC_;
-  memcpy(CoverageVersion, "408*", 4);
 }
 
 }  // end namespace clang
diff --git a/clang/lib/Basic/Targets.cpp b/clang/lib/Basic/Targets.cpp
index 706a391..be5dedb 100644
--- a/clang/lib/Basic/Targets.cpp
+++ b/clang/lib/Basic/Targets.cpp
@@ -40,6 +40,7 @@
 #include "Targets/WebAssembly.h"
 #include "Targets/X86.h"
 #include "Targets/XCore.h"
+#include "Targets/Xtensa.h"
 #include "clang/Basic/Diagnostic.h"
 #include "clang/Basic/DiagnosticFrontend.h"
 #include "llvm/ADT/StringExtras.h"
@@ -297,6 +298,14 @@ std::unique_ptr<TargetInfo> AllocateTarget(const llvm::Triple &Triple,
     case llvm::Triple::NaCl:
       return std::make_unique<NaClTargetInfo<NaClMips32TargetInfo>>(Triple,
                                                                     Opts);
+    case llvm::Triple::Win32:
+      switch (Triple.getEnvironment()) {
+      case llvm::Triple::GNU:
+        return std::make_unique<MinGWMipsTargetInfo>(Triple, Opts);
+      case llvm::Triple::MSVC:
+      default: // Assume MSVC for unknown environments
+        return std::make_unique<MicrosoftMipsTargetInfo>(Triple, Opts);
+      }
     default:
       return std::make_unique<MipsTargetInfo>(Triple, Opts);
     }
@@ -743,6 +752,9 @@ std::unique_ptr<TargetInfo> AllocateTarget(const llvm::Triple &Triple,
     default:
         return std::make_unique<LoongArch64TargetInfo>(Triple, Opts);
     }
+
+  case llvm::Triple::xtensa:
+    return std::make_unique<XtensaTargetInfo>(Triple, Opts);
   }
 }
 } // namespace targets
diff --git a/clang/lib/Basic/Targets/AArch64.cpp b/clang/lib/Basic/Targets/AArch64.cpp
index 53e102b..2b4b954d 100644
--- a/clang/lib/Basic/Targets/AArch64.cpp
+++ b/clang/lib/Basic/Targets/AArch64.cpp
@@ -714,7 +714,7 @@ AArch64TargetInfo::getVScaleRange(const LangOptions &LangOpts) const {
   return std::nullopt;
 }
 
-unsigned AArch64TargetInfo::getFMVPriority(ArrayRef<StringRef> Features) const {
+uint64_t AArch64TargetInfo::getFMVPriority(ArrayRef<StringRef> Features) const {
   return llvm::AArch64::getFMVPriority(Features);
 }
 
diff --git a/clang/lib/Basic/Targets/AArch64.h b/clang/lib/Basic/Targets/AArch64.h
index 68a8b1e..4e927c0 100644
--- a/clang/lib/Basic/Targets/AArch64.h
+++ b/clang/lib/Basic/Targets/AArch64.h
@@ -137,7 +137,7 @@ public:
   void fillValidCPUList(SmallVectorImpl<StringRef> &Values) const override;
   bool setCPU(const std::string &Name) override;
 
-  unsigned getFMVPriority(ArrayRef<StringRef> Features) const override;
+  uint64_t getFMVPriority(ArrayRef<StringRef> Features) const override;
 
   bool useFP16ConversionIntrinsics() const override {
     return false;
diff --git a/clang/lib/Basic/Targets/Mips.cpp b/clang/lib/Basic/Targets/Mips.cpp
index 174bc9d..d56995e 100644
--- a/clang/lib/Basic/Targets/Mips.cpp
+++ b/clang/lib/Basic/Targets/Mips.cpp
@@ -304,3 +304,62 @@ bool MipsTargetInfo::validateTarget(DiagnosticsEngine &Diags) const {
 
   return true;
 }
+
+WindowsMipsTargetInfo::WindowsMipsTargetInfo(const llvm::Triple &Triple,
+                                             const TargetOptions &Opts)
+    : WindowsTargetInfo<MipsTargetInfo>(Triple, Opts), Triple(Triple) {}
+
+void WindowsMipsTargetInfo::getVisualStudioDefines(
+    const LangOptions &Opts, MacroBuilder &Builder) const {
+  Builder.defineMacro("_M_MRX000", "4000");
+}
+
+TargetInfo::BuiltinVaListKind
+WindowsMipsTargetInfo::getBuiltinVaListKind() const {
+  return TargetInfo::CharPtrBuiltinVaList;
+}
+
+TargetInfo::CallingConvCheckResult
+WindowsMipsTargetInfo::checkCallingConvention(CallingConv CC) const {
+  switch (CC) {
+  case CC_X86StdCall:
+  case CC_X86ThisCall:
+  case CC_X86FastCall:
+  case CC_X86VectorCall:
+    return CCCR_Ignore;
+  case CC_C:
+  case CC_OpenCLKernel:
+  case CC_PreserveMost:
+  case CC_PreserveAll:
+  case CC_Swift:
+  case CC_SwiftAsync:
+    return CCCR_OK;
+  default:
+    return CCCR_Warning;
+  }
+}
+
+// Windows MIPS, MS (C++) ABI
+MicrosoftMipsTargetInfo::MicrosoftMipsTargetInfo(const llvm::Triple &Triple,
+                                                 const TargetOptions &Opts)
+    : WindowsMipsTargetInfo(Triple, Opts) {
+  TheCXXABI.set(TargetCXXABI::Microsoft);
+}
+
+void MicrosoftMipsTargetInfo::getTargetDefines(const LangOptions &Opts,
+                                               MacroBuilder &Builder) const {
+  WindowsMipsTargetInfo::getTargetDefines(Opts, Builder);
+  WindowsMipsTargetInfo::getVisualStudioDefines(Opts, Builder);
+}
+
+MinGWMipsTargetInfo::MinGWMipsTargetInfo(const llvm::Triple &Triple,
+                                         const TargetOptions &Opts)
+    : WindowsMipsTargetInfo(Triple, Opts) {
+  TheCXXABI.set(TargetCXXABI::GenericMIPS);
+}
+
+void MinGWMipsTargetInfo::getTargetDefines(const LangOptions &Opts,
+                                           MacroBuilder &Builder) const {
+  WindowsMipsTargetInfo::getTargetDefines(Opts, Builder);
+  Builder.defineMacro("_MIPS_");
+}
diff --git a/clang/lib/Basic/Targets/Mips.h b/clang/lib/Basic/Targets/Mips.h
index 8acaf56..7ddcd57 100644
--- a/clang/lib/Basic/Targets/Mips.h
+++ b/clang/lib/Basic/Targets/Mips.h
@@ -13,6 +13,7 @@
 #ifndef LLVM_CLANG_LIB_BASIC_TARGETS_MIPS_H
 #define LLVM_CLANG_LIB_BASIC_TARGETS_MIPS_H
 
+#include "OSTargets.h"
 #include "clang/Basic/TargetInfo.h"
 #include "clang/Basic/TargetOptions.h"
 #include "llvm/Support/Compiler.h"
@@ -450,6 +451,42 @@ public:
     return std::make_pair(32, 32);
   }
 };
+
+class LLVM_LIBRARY_VISIBILITY WindowsMipsTargetInfo
+    : public WindowsTargetInfo<MipsTargetInfo> {
+  const llvm::Triple Triple;
+
+public:
+  WindowsMipsTargetInfo(const llvm::Triple &Triple, const TargetOptions &Opts);
+
+  void getVisualStudioDefines(const LangOptions &Opts,
+                              MacroBuilder &Builder) const;
+
+  BuiltinVaListKind getBuiltinVaListKind() const override;
+
+  CallingConvCheckResult checkCallingConvention(CallingConv CC) const override;
+};
+
+// Windows MIPS, MS (C++) ABI
+class LLVM_LIBRARY_VISIBILITY MicrosoftMipsTargetInfo
+    : public WindowsMipsTargetInfo {
+public:
+  MicrosoftMipsTargetInfo(const llvm::Triple &Triple,
+                          const TargetOptions &Opts);
+
+  void getTargetDefines(const LangOptions &Opts,
+                        MacroBuilder &Builder) const override;
+};
+
+// MIPS MinGW target
+class LLVM_LIBRARY_VISIBILITY MinGWMipsTargetInfo
+    : public WindowsMipsTargetInfo {
+public:
+  MinGWMipsTargetInfo(const llvm::Triple &Triple, const TargetOptions &Opts);
+
+  void getTargetDefines(const LangOptions &Opts,
+                        MacroBuilder &Builder) const override;
+};
 } // namespace targets
 } // namespace clang
 
diff --git a/clang/lib/Basic/Targets/OSTargets.cpp b/clang/lib/Basic/Targets/OSTargets.cpp
index 88c0541..6f98353 100644
--- a/clang/lib/Basic/Targets/OSTargets.cpp
+++ b/clang/lib/Basic/Targets/OSTargets.cpp
@@ -114,9 +114,6 @@ void getDarwinDefines(MacroBuilder &Builder, const LangOptions &Opts,
     assert(OsVersion.getMinor().value_or(0) < 100 &&
            OsVersion.getSubminor().value_or(0) < 100 && "Invalid version!");
     Builder.defineMacro("__ENVIRONMENT_OS_VERSION_MIN_REQUIRED__", Str);
-
-    // Tell users about the kernel if there is one.
-    Builder.defineMacro("__MACH__");
   }
 
   PlatformMinVersion = OsVersion;
diff --git a/clang/lib/Basic/Targets/RISCV.cpp b/clang/lib/Basic/Targets/RISCV.cpp
index a541dfed..db23b0c 100644
--- a/clang/lib/Basic/Targets/RISCV.cpp
+++ b/clang/lib/Basic/Targets/RISCV.cpp
@@ -489,7 +489,7 @@ ParsedTargetAttr RISCVTargetInfo::parseTargetAttr(StringRef Features) const {
   return Ret;
 }
 
-unsigned RISCVTargetInfo::getFMVPriority(ArrayRef<StringRef> Features) const {
+uint64_t RISCVTargetInfo::getFMVPriority(ArrayRef<StringRef> Features) const {
   // Priority is explicitly specified on RISC-V unlike on other targets, where
   // it is derived by all the features of a specific version. Therefore if a
   // feature contains the priority string, then return it immediately.
@@ -501,7 +501,7 @@ unsigned RISCVTargetInfo::getFMVPriority(ArrayRef<StringRef> Features) const {
       Feature = RHS;
     else
       continue;
-    unsigned Priority;
+    uint64_t Priority;
     if (!Feature.getAsInteger(0, Priority))
       return Priority;
   }
diff --git a/clang/lib/Basic/Targets/RISCV.h b/clang/lib/Basic/Targets/RISCV.h
index 68f10e7..bb3f3a5 100644
--- a/clang/lib/Basic/Targets/RISCV.h
+++ b/clang/lib/Basic/Targets/RISCV.h
@@ -122,7 +122,7 @@ public:
   void fillValidTuneCPUList(SmallVectorImpl<StringRef> &Values) const override;
   bool supportsTargetAttributeTune() const override { return true; }
   ParsedTargetAttr parseTargetAttr(StringRef Str) const override;
-  unsigned getFMVPriority(ArrayRef<StringRef> Features) const override;
+  uint64_t getFMVPriority(ArrayRef<StringRef> Features) const override;
 
   std::pair<unsigned, unsigned> hardwareInterferenceSizes() const override {
     return std::make_pair(32, 32);
diff --git a/clang/lib/Basic/Targets/SPIR.cpp b/clang/lib/Basic/Targets/SPIR.cpp
index 0403039..f242fed 100644
--- a/clang/lib/Basic/Targets/SPIR.cpp
+++ b/clang/lib/Basic/Targets/SPIR.cpp
@@ -13,11 +13,24 @@
 #include "SPIR.h"
 #include "AMDGPU.h"
 #include "Targets.h"
+#include "clang/Basic/MacroBuilder.h"
+#include "clang/Basic/TargetBuiltins.h"
 #include "llvm/TargetParser/TargetParser.h"
 
 using namespace clang;
 using namespace clang::targets;
 
+static constexpr Builtin::Info BuiltinInfo[] = {
+#define BUILTIN(ID, TYPE, ATTRS)                                               \
+  {#ID, TYPE, ATTRS, nullptr, HeaderDesc::NO_HEADER, ALL_LANGUAGES},
+#include "clang/Basic/BuiltinsSPIRV.inc"
+};
+
+ArrayRef<Builtin::Info> SPIRVTargetInfo::getTargetBuiltins() const {
+  return llvm::ArrayRef(BuiltinInfo,
+                        clang::SPIRV::LastTSBuiltin - Builtin::FirstTSBuiltin);
+}
+
 void SPIRTargetInfo::getTargetDefines(const LangOptions &Opts,
                                       MacroBuilder &Builder) const {
   DefineStd(Builder, "SPIR", Opts);
diff --git a/clang/lib/Basic/Targets/SPIR.h b/clang/lib/Basic/Targets/SPIR.h
index 85e4bd9..5a328b9c 100644
--- a/clang/lib/Basic/Targets/SPIR.h
+++ b/clang/lib/Basic/Targets/SPIR.h
@@ -313,7 +313,7 @@ public:
     resetDataLayout("e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-"
                     "v256:256-v512:512-v1024:1024-n8:16:32:64-G1");
   }
-
+  ArrayRef<Builtin::Info> getTargetBuiltins() const override;
   void getTargetDefines(const LangOptions &Opts,
                         MacroBuilder &Builder) const override;
 };
diff --git a/clang/lib/Basic/Targets/X86.cpp b/clang/lib/Basic/Targets/X86.cpp
index 1b16888..40ad8fd 100644
--- a/clang/lib/Basic/Targets/X86.cpp
+++ b/clang/lib/Basic/Targets/X86.cpp
@@ -30,14 +30,6 @@ static constexpr Builtin::Info BuiltinInfoX86[] = {
   {#ID, TYPE, ATTRS, FEATURE, HeaderDesc::NO_HEADER, ALL_LANGUAGES},
 #define TARGET_HEADER_BUILTIN(ID, TYPE, ATTRS, HEADER, LANGS, FEATURE)         \
   {#ID, TYPE, ATTRS, FEATURE, HeaderDesc::HEADER, LANGS},
-#include "clang/Basic/BuiltinsX86.def"
-
-#define BUILTIN(ID, TYPE, ATTRS)                                               \
-  {#ID, TYPE, ATTRS, nullptr, HeaderDesc::NO_HEADER, ALL_LANGUAGES},
-#define TARGET_BUILTIN(ID, TYPE, ATTRS, FEATURE)                               \
-  {#ID, TYPE, ATTRS, FEATURE, HeaderDesc::NO_HEADER, ALL_LANGUAGES},
-#define TARGET_HEADER_BUILTIN(ID, TYPE, ATTRS, HEADER, LANGS, FEATURE)         \
-  {#ID, TYPE, ATTRS, FEATURE, HeaderDesc::HEADER, LANGS},
 #include "clang/Basic/BuiltinsX86.inc"
 
 #define BUILTIN(ID, TYPE, ATTRS)                                               \
@@ -46,7 +38,7 @@ static constexpr Builtin::Info BuiltinInfoX86[] = {
   {#ID, TYPE, ATTRS, FEATURE, HeaderDesc::NO_HEADER, ALL_LANGUAGES},
 #define TARGET_HEADER_BUILTIN(ID, TYPE, ATTRS, HEADER, LANGS, FEATURE)         \
   {#ID, TYPE, ATTRS, FEATURE, HeaderDesc::HEADER, LANGS},
-#include "clang/Basic/BuiltinsX86_64.def"
+#include "clang/Basic/BuiltinsX86_64.inc"
 };
 
 static const char *const GCCRegNames[] = {
@@ -1365,8 +1357,8 @@ static llvm::X86::ProcessorFeatures getFeature(StringRef Name) {
   // correct, so it asserts if the value is out of range.
 }
 
-unsigned X86TargetInfo::getFMVPriority(ArrayRef<StringRef> Features) const {
-  auto getPriority = [](StringRef Feature) -> unsigned {
+uint64_t X86TargetInfo::getFMVPriority(ArrayRef<StringRef> Features) const {
+  auto getPriority = [](StringRef Feature) -> uint64_t {
     // Valid CPUs have a 'key feature' that compares just better than its key
     // feature.
     using namespace llvm::X86;
@@ -1380,7 +1372,7 @@ unsigned X86TargetInfo::getFMVPriority(ArrayRef<StringRef> Features) const {
     return getFeaturePriority(getFeature(Feature)) << 1;
   };
 
-  unsigned Priority = 0;
+  uint64_t Priority = 0;
   for (StringRef Feature : Features)
     if (!Feature.empty())
       Priority = std::max(Priority, getPriority(Feature));
diff --git a/clang/lib/Basic/Targets/X86.h b/clang/lib/Basic/Targets/X86.h
index 553c452..35aceb1 100644
--- a/clang/lib/Basic/Targets/X86.h
+++ b/clang/lib/Basic/Targets/X86.h
@@ -384,7 +384,7 @@ public:
     return CPU != llvm::X86::CK_None;
   }
 
-  unsigned getFMVPriority(ArrayRef<StringRef> Features) const override;
+  uint64_t getFMVPriority(ArrayRef<StringRef> Features) const override;
 
   bool setFPMath(StringRef Name) override;
 
diff --git a/clang/lib/Basic/Targets/Xtensa.cpp b/clang/lib/Basic/Targets/Xtensa.cpp
new file mode 100644
index 0000000..f3216f4
--- /dev/null
+++ b/clang/lib/Basic/Targets/Xtensa.cpp
@@ -0,0 +1,35 @@
+//===--- Xtensa.cpp - Implement Xtensa target feature support -------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements Xtensa TargetInfo objects.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Xtensa.h"
+#include "clang/Basic/Builtins.h"
+#include "clang/Basic/MacroBuilder.h"
+#include "clang/Basic/TargetBuiltins.h"
+
+using namespace clang;
+using namespace clang::targets;
+
+void XtensaTargetInfo::getTargetDefines(const LangOptions &Opts,
+                                        MacroBuilder &Builder) const {
+  Builder.defineMacro("__xtensa__");
+  Builder.defineMacro("__XTENSA__");
+  if (BigEndian)
+    Builder.defineMacro("__XTENSA_EB__");
+  else
+    Builder.defineMacro("__XTENSA_EL__");
+  Builder.defineMacro("__XCHAL_HAVE_BE", BigEndian ? "1" : "0");
+  Builder.defineMacro("__XCHAL_HAVE_ABS");  // core arch
+  Builder.defineMacro("__XCHAL_HAVE_ADDX"); // core arch
+  Builder.defineMacro("__XCHAL_HAVE_L32R"); // core arch
+}
diff --git a/clang/lib/Basic/Targets/Xtensa.h b/clang/lib/Basic/Targets/Xtensa.h
new file mode 100644
index 0000000..a440ba8
--- /dev/null
+++ b/clang/lib/Basic/Targets/Xtensa.h
@@ -0,0 +1,111 @@
+//===--- Xtensa.h - Declare Xtensa target feature support -------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares Xtensa TargetInfo objects.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CLANG_LIB_BASIC_TARGETS_XTENSA_H
+#define LLVM_CLANG_LIB_BASIC_TARGETS_XTENSA_H
+
+#include "clang/Basic/TargetInfo.h"
+#include "clang/Basic/TargetOptions.h"
+#include "llvm/ADT/StringSwitch.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/TargetParser/Triple.h"
+
+#include "clang/Basic/Builtins.h"
+#include "clang/Basic/MacroBuilder.h"
+#include "clang/Basic/TargetBuiltins.h"
+
+namespace clang {
+namespace targets {
+
+class LLVM_LIBRARY_VISIBILITY XtensaTargetInfo : public TargetInfo {
+  static const Builtin::Info BuiltinInfo[];
+
+protected:
+  std::string CPU;
+
+public:
+  XtensaTargetInfo(const llvm::Triple &Triple, const TargetOptions &)
+      : TargetInfo(Triple) {
+    // no big-endianess support yet
+    BigEndian = false;
+    NoAsmVariants = true;
+    LongLongAlign = 64;
+    SuitableAlign = 32;
+    DoubleAlign = LongDoubleAlign = 64;
+    SizeType = UnsignedInt;
+    PtrDiffType = SignedInt;
+    IntPtrType = SignedInt;
+    WCharType = SignedInt;
+    WIntType = UnsignedInt;
+    UseZeroLengthBitfieldAlignment = true;
+    MaxAtomicPromoteWidth = MaxAtomicInlineWidth = 32;
+    resetDataLayout("e-m:e-p:32:32-i8:8:32-i16:16:32-i64:64-n32");
+  }
+
+  void getTargetDefines(const LangOptions &Opts,
+                        MacroBuilder &Builder) const override;
+
+  ArrayRef<Builtin::Info> getTargetBuiltins() const override {
+    return std::nullopt;
+  }
+
+  BuiltinVaListKind getBuiltinVaListKind() const override {
+    return TargetInfo::XtensaABIBuiltinVaList;
+  }
+
+  std::string_view getClobbers() const override { return ""; }
+
+  ArrayRef<const char *> getGCCRegNames() const override {
+    static const char *const GCCRegNames[] = {
+        // General register name
+        "a0", "sp", "a1", "a2", "a3", "a4", "a5", "a6", "a7", "a8", "a9", "a10",
+        "a11", "a12", "a13", "a14", "a15",
+        // Special register name
+        "sar"};
+    return llvm::ArrayRef(GCCRegNames);
+  }
+
+  ArrayRef<TargetInfo::GCCRegAlias> getGCCRegAliases() const override {
+    return std::nullopt;
+  }
+
+  bool validateAsmConstraint(const char *&Name,
+                             TargetInfo::ConstraintInfo &Info) const override {
+    switch (*Name) {
+    default:
+      return false;
+    case 'a':
+      Info.setAllowsRegister();
+      return true;
+    }
+    return false;
+  }
+
+  int getEHDataRegisterNumber(unsigned RegNo) const override {
+    return (RegNo < 2) ? RegNo : -1;
+  }
+
+  bool isValidCPUName(StringRef Name) const override {
+    return llvm::StringSwitch<bool>(Name).Case("generic", true).Default(false);
+  }
+
+  bool setCPU(const std::string &Name) override {
+    CPU = Name;
+    return isValidCPUName(Name);
+  }
+};
+
+} // namespace targets
+} // namespace clang
+#endif // LLVM_CLANG_LIB_BASIC_TARGETS_XTENSA_H
diff --git a/clang/lib/CIR/CodeGen/CIRGenModule.cpp b/clang/lib/CIR/CodeGen/CIRGenModule.cpp
index 416d532..2615ae3 100644
--- a/clang/lib/CIR/CodeGen/CIRGenModule.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenModule.cpp
@@ -115,6 +115,48 @@ void CIRGenModule::emitGlobalVarDefinition(const clang::VarDecl *vd,
   if (clang::IdentifierInfo *identifier = vd->getIdentifier()) {
     auto varOp = builder.create<cir::GlobalOp>(getLoc(vd->getSourceRange()),
                                                identifier->getName(), type);
+    // TODO(CIR): This code for processing initial values is a placeholder
+    // until class ConstantEmitter is upstreamed and the code for processing
+    // constant expressions is filled out.  Only the most basic handling of
+    // certain constant expressions is implemented for now.
+    const VarDecl *initDecl;
+    const Expr *initExpr = vd->getAnyInitializer(initDecl);
+    if (initExpr) {
+      mlir::Attribute initializer;
+      if (APValue *value = initDecl->evaluateValue()) {
+        switch (value->getKind()) {
+        case APValue::Int: {
+          initializer = builder.getAttr<cir::IntAttr>(type, value->getInt());
+          break;
+        }
+        case APValue::Float: {
+          initializer = builder.getAttr<cir::FPAttr>(type, value->getFloat());
+          break;
+        }
+        case APValue::LValue: {
+          if (value->getLValueBase()) {
+            errorNYI(initExpr->getSourceRange(),
+                     "non-null pointer initialization");
+          } else {
+            if (auto ptrType = mlir::dyn_cast<cir::PointerType>(type)) {
+              initializer = builder.getConstPtrAttr(
+                  ptrType, value->getLValueOffset().getQuantity());
+            } else {
+              llvm_unreachable(
+                  "non-pointer variable initialized with a pointer");
+            }
+          }
+          break;
+        }
+        default:
+          errorNYI(initExpr->getSourceRange(), "unsupported initializer kind");
+          break;
+        }
+      } else {
+        errorNYI(initExpr->getSourceRange(), "non-constant initializer");
+      }
+      varOp.setInitialValueAttr(initializer);
+    }
     theModule.push_back(varOp);
   } else {
     errorNYI(vd->getSourceRange().getBegin(),
diff --git a/clang/lib/CIR/Dialect/IR/CIRAttrs.cpp b/clang/lib/CIR/Dialect/IR/CIRAttrs.cpp
index 7d42da1..8e8f7d5 100644
--- a/clang/lib/CIR/Dialect/IR/CIRAttrs.cpp
+++ b/clang/lib/CIR/Dialect/IR/CIRAttrs.cpp
@@ -12,6 +12,24 @@
 
 #include "clang/CIR/Dialect/IR/CIRDialect.h"
 
+#include "mlir/IR/DialectImplementation.h"
+#include "llvm/ADT/TypeSwitch.h"
+
+static void printFloatLiteral(mlir::AsmPrinter &p, llvm::APFloat value,
+                              mlir::Type ty);
+static mlir::ParseResult
+parseFloatLiteral(mlir::AsmParser &parser,
+                  mlir::FailureOr<llvm::APFloat> &value,
+                  cir::CIRFPTypeInterface fpType);
+
+static mlir::ParseResult parseConstPtr(mlir::AsmParser &parser,
+                                       mlir::IntegerAttr &value);
+
+static void printConstPtr(mlir::AsmPrinter &p, mlir::IntegerAttr value);
+
+#define GET_ATTRDEF_CLASSES
+#include "clang/CIR/Dialect/IR/CIROpsAttributes.cpp.inc"
+
 using namespace mlir;
 using namespace cir;
 
@@ -21,12 +39,155 @@ using namespace cir;
 
 Attribute CIRDialect::parseAttribute(DialectAsmParser &parser,
                                      Type type) const {
-  // No attributes yet to parse
-  return Attribute{};
+  llvm::SMLoc typeLoc = parser.getCurrentLocation();
+  llvm::StringRef mnemonic;
+  Attribute genAttr;
+  OptionalParseResult parseResult =
+      generatedAttributeParser(parser, &mnemonic, type, genAttr);
+  if (parseResult.has_value())
+    return genAttr;
+  parser.emitError(typeLoc, "unknown attribute in CIR dialect");
+  return Attribute();
 }
 
 void CIRDialect::printAttribute(Attribute attr, DialectAsmPrinter &os) const {
-  // No attributes yet to print
+  if (failed(generatedAttributePrinter(attr, os)))
+    llvm_unreachable("unexpected CIR type kind");
+}
+
+//===----------------------------------------------------------------------===//
+// ConstPtrAttr definitions
+//===----------------------------------------------------------------------===//
+
+// TODO(CIR): Consider encoding the null value differently and use conditional
+// assembly format instead of custom parsing/printing.
+static ParseResult parseConstPtr(AsmParser &parser, mlir::IntegerAttr &value) {
+
+  if (parser.parseOptionalKeyword("null").succeeded()) {
+    value = mlir::IntegerAttr::get(
+        mlir::IntegerType::get(parser.getContext(), 64), 0);
+    return success();
+  }
+
+  return parser.parseAttribute(value);
+}
+
+static void printConstPtr(AsmPrinter &p, mlir::IntegerAttr value) {
+  if (!value.getInt())
+    p << "null";
+  else
+    p << value;
+}
+
+//===----------------------------------------------------------------------===//
+// IntAttr definitions
+//===----------------------------------------------------------------------===//
+
+Attribute IntAttr::parse(AsmParser &parser, Type odsType) {
+  mlir::APInt apValue;
+
+  if (!mlir::isa<IntType>(odsType))
+    return {};
+  auto type = mlir::cast<IntType>(odsType);
+
+  // Consume the '<' symbol.
+  if (parser.parseLess())
+    return {};
+
+  // Fetch arbitrary precision integer value.
+  if (type.isSigned()) {
+    int64_t value = 0;
+    if (parser.parseInteger(value)) {
+      parser.emitError(parser.getCurrentLocation(), "expected integer value");
+    } else {
+      apValue = mlir::APInt(type.getWidth(), value, type.isSigned(),
+                            /*implicitTrunc=*/true);
+      if (apValue.getSExtValue() != value)
+        parser.emitError(parser.getCurrentLocation(),
+                         "integer value too large for the given type");
+    }
+  } else {
+    uint64_t value = 0;
+    if (parser.parseInteger(value)) {
+      parser.emitError(parser.getCurrentLocation(), "expected integer value");
+    } else {
+      apValue = mlir::APInt(type.getWidth(), value, type.isSigned(),
+                            /*implicitTrunc=*/true);
+      if (apValue.getZExtValue() != value)
+        parser.emitError(parser.getCurrentLocation(),
+                         "integer value too large for the given type");
+    }
+  }
+
+  // Consume the '>' symbol.
+  if (parser.parseGreater())
+    return {};
+
+  return IntAttr::get(type, apValue);
+}
+
+void IntAttr::print(AsmPrinter &printer) const {
+  auto type = mlir::cast<IntType>(getType());
+  printer << '<';
+  if (type.isSigned())
+    printer << getSInt();
+  else
+    printer << getUInt();
+  printer << '>';
+}
+
+LogicalResult IntAttr::verify(function_ref<InFlightDiagnostic()> emitError,
+                              Type type, APInt value) {
+  if (!mlir::isa<IntType>(type)) {
+    emitError() << "expected 'simple.int' type";
+    return failure();
+  }
+
+  auto intType = mlir::cast<IntType>(type);
+  if (value.getBitWidth() != intType.getWidth()) {
+    emitError() << "type and value bitwidth mismatch: " << intType.getWidth()
+                << " != " << value.getBitWidth();
+    return failure();
+  }
+
+  return success();
+}
+
+//===----------------------------------------------------------------------===//
+// FPAttr definitions
+//===----------------------------------------------------------------------===//
+
+static void printFloatLiteral(AsmPrinter &p, APFloat value, Type ty) {
+  p << value;
+}
+
+static ParseResult parseFloatLiteral(AsmParser &parser,
+                                     FailureOr<APFloat> &value,
+                                     CIRFPTypeInterface fpType) {
+
+  APFloat parsedValue(0.0);
+  if (parser.parseFloat(fpType.getFloatSemantics(), parsedValue))
+    return failure();
+
+  value.emplace(parsedValue);
+  return success();
+}
+
+FPAttr FPAttr::getZero(Type type) {
+  return get(type,
+             APFloat::getZero(
+                 mlir::cast<CIRFPTypeInterface>(type).getFloatSemantics()));
+}
+
+LogicalResult FPAttr::verify(function_ref<InFlightDiagnostic()> emitError,
+                             CIRFPTypeInterface fpType, APFloat value) {
+  if (APFloat::SemanticsToEnum(fpType.getFloatSemantics()) !=
+      APFloat::SemanticsToEnum(value.getSemantics())) {
+    emitError() << "floating-point semantics mismatch";
+    return failure();
+  }
+
+  return success();
 }
 
 //===----------------------------------------------------------------------===//
@@ -34,5 +195,8 @@ void CIRDialect::printAttribute(Attribute attr, DialectAsmPrinter &os) const {
 //===----------------------------------------------------------------------===//
 
 void CIRDialect::registerAttributes() {
-  // No attributes yet to register
+  addAttributes<
+#define GET_ATTRDEF_LIST
+#include "clang/CIR/Dialect/IR/CIROpsAttributes.cpp.inc"
+      >();
 }
diff --git a/clang/lib/CIR/Dialect/IR/CIRDialect.cpp b/clang/lib/CIR/Dialect/IR/CIRDialect.cpp
index dbdca1f..f98d8b6 100644
--- a/clang/lib/CIR/Dialect/IR/CIRDialect.cpp
+++ b/clang/lib/CIR/Dialect/IR/CIRDialect.cpp
@@ -12,6 +12,8 @@
 
 #include "clang/CIR/Dialect/IR/CIRDialect.h"
 
+#include "clang/CIR/Dialect/IR/CIRTypes.h"
+
 #include "mlir/Support/LogicalResult.h"
 
 #include "clang/CIR/Dialect/IR/CIROpsDialect.cpp.inc"
@@ -33,12 +35,72 @@ void cir::CIRDialect::initialize() {
 }
 
 //===----------------------------------------------------------------------===//
+// ConstantOp
+//===----------------------------------------------------------------------===//
+
+static LogicalResult checkConstantTypes(mlir::Operation *op, mlir::Type opType,
+                                        mlir::Attribute attrType) {
+  if (isa<cir::ConstPtrAttr>(attrType)) {
+    if (!mlir::isa<cir::PointerType>(opType))
+      return op->emitOpError(
+          "pointer constant initializing a non-pointer type");
+    return success();
+  }
+
+  if (mlir::isa<cir::IntAttr, cir::FPAttr>(attrType)) {
+    auto at = cast<TypedAttr>(attrType);
+    if (at.getType() != opType) {
+      return op->emitOpError("result type (")
+             << opType << ") does not match value type (" << at.getType()
+             << ")";
+    }
+    return success();
+  }
+
+  assert(isa<TypedAttr>(attrType) && "What else could we be looking at here?");
+  return op->emitOpError("global with type ")
+         << cast<TypedAttr>(attrType).getType() << " not yet supported";
+}
+
+LogicalResult cir::ConstantOp::verify() {
+  // ODS already generates checks to make sure the result type is valid. We just
+  // need to additionally check that the value's attribute type is consistent
+  // with the result type.
+  return checkConstantTypes(getOperation(), getType(), getValue());
+}
+
+OpFoldResult cir::ConstantOp::fold(FoldAdaptor /*adaptor*/) {
+  return getValue();
+}
+
+//===----------------------------------------------------------------------===//
 // GlobalOp
 //===----------------------------------------------------------------------===//
 
-// TODO(CIR): The properties of global variables that require verification
-// haven't been implemented yet.
-mlir::LogicalResult cir::GlobalOp::verify() { return success(); }
+static ParseResult parseConstantValue(OpAsmParser &parser,
+                                      mlir::Attribute &valueAttr) {
+  NamedAttrList attr;
+  return parser.parseAttribute(valueAttr, "value", attr);
+}
+
+static void printConstant(OpAsmPrinter &p, Attribute value) {
+  p.printAttribute(value);
+}
+
+mlir::LogicalResult cir::GlobalOp::verify() {
+  // Verify that the initial value, if present, is either a unit attribute or
+  // an attribute CIR supports.
+  if (getInitialValue().has_value()) {
+    if (checkConstantTypes(getOperation(), getSymType(), *getInitialValue())
+            .failed())
+      return failure();
+  }
+
+  // TODO(CIR): Many other checks for properties that haven't been upstreamed
+  // yet.
+
+  return success();
+}
 
 void cir::GlobalOp::build(OpBuilder &odsBuilder, OperationState &odsState,
                           llvm::StringRef sym_name, mlir::Type sym_type) {
@@ -48,6 +110,45 @@ void cir::GlobalOp::build(OpBuilder &odsBuilder, OperationState &odsState,
                         mlir::TypeAttr::get(sym_type));
 }
 
+static void printGlobalOpTypeAndInitialValue(OpAsmPrinter &p, cir::GlobalOp op,
+                                             TypeAttr type,
+                                             Attribute initAttr) {
+  if (!op.isDeclaration()) {
+    p << "= ";
+    // This also prints the type...
+    if (initAttr)
+      printConstant(p, initAttr);
+  } else {
+    p << ": " << type;
+  }
+}
+
+static ParseResult
+parseGlobalOpTypeAndInitialValue(OpAsmParser &parser, TypeAttr &typeAttr,
+                                 Attribute &initialValueAttr) {
+  mlir::Type opTy;
+  if (parser.parseOptionalEqual().failed()) {
+    // Absence of equal means a declaration, so we need to parse the type.
+    //  cir.global @a : !cir.int<s, 32>
+    if (parser.parseColonType(opTy))
+      return failure();
+  } else {
+    // Parse constant with initializer, examples:
+    //  cir.global @y = #cir.fp<1.250000e+00> : !cir.double
+    //  cir.global @rgb = #cir.const_array<[...] : !cir.array<i8 x 3>>
+    if (parseConstantValue(parser, initialValueAttr).failed())
+      return failure();
+
+    assert(mlir::isa<mlir::TypedAttr>(initialValueAttr) &&
+           "Non-typed attrs shouldn't appear here.");
+    auto typedAttr = mlir::cast<mlir::TypedAttr>(initialValueAttr);
+    opTy = typedAttr.getType();
+  }
+
+  typeAttr = TypeAttr::get(opTy);
+  return success();
+}
+
 //===----------------------------------------------------------------------===//
 // FuncOp
 //===----------------------------------------------------------------------===//
diff --git a/clang/lib/CIR/Dialect/IR/CMakeLists.txt b/clang/lib/CIR/Dialect/IR/CMakeLists.txt
index df60f69..baf8bff 100644
--- a/clang/lib/CIR/Dialect/IR/CMakeLists.txt
+++ b/clang/lib/CIR/Dialect/IR/CMakeLists.txt
@@ -5,6 +5,7 @@ add_clang_library(MLIRCIR
 
   DEPENDS
   MLIRCIROpsIncGen
+  MLIRCIRAttrsEnumsGen
 
   LINK_LIBS PUBLIC
   MLIRIR
diff --git a/clang/lib/CIR/Interfaces/CMakeLists.txt b/clang/lib/CIR/Interfaces/CMakeLists.txt
index fcd8b69..b826bf6 100644
--- a/clang/lib/CIR/Interfaces/CMakeLists.txt
+++ b/clang/lib/CIR/Interfaces/CMakeLists.txt
@@ -5,6 +5,7 @@ add_clang_library(MLIRCIRInterfaces
   ${MLIR_MAIN_INCLUDE_DIR}/mlir/Interfaces
 
   DEPENDS
+  MLIRCIRAttrsEnumsGen
   MLIRCIRFPTypeInterfaceIncGen
 
   LINK_LIBS
diff --git a/clang/lib/CodeGen/BackendConsumer.h b/clang/lib/CodeGen/BackendConsumer.h
index a023d29..d932a78 100644
--- a/clang/lib/CodeGen/BackendConsumer.h
+++ b/clang/lib/CodeGen/BackendConsumer.h
@@ -29,17 +29,16 @@ class BackendConsumer : public ASTConsumer {
 
   virtual void anchor();
   DiagnosticsEngine &Diags;
-  BackendAction Action;
   const HeaderSearchOptions &HeaderSearchOpts;
   const CodeGenOptions &CodeGenOpts;
   const TargetOptions &TargetOpts;
   const LangOptions &LangOpts;
   std::unique_ptr<raw_pwrite_stream> AsmOutStream;
-  ASTContext *Context;
+  ASTContext *Context = nullptr;
   IntrusiveRefCntPtr<llvm::vfs::FileSystem> FS;
 
   llvm::Timer LLVMIRGeneration;
-  unsigned LLVMIRGenerationRefCount;
+  unsigned LLVMIRGenerationRefCount = 0;
 
   /// True if we've finished generating IR. This prevents us from generating
   /// additional LLVM IR after emitting output in HandleTranslationUnit. This
@@ -48,6 +47,8 @@ class BackendConsumer : public ASTConsumer {
 
   bool TimerIsEnabled = false;
 
+  BackendAction Action;
+
   std::unique_ptr<CodeGenerator> Gen;
 
   SmallVector<LinkModule, 4> LinkModules;
@@ -69,29 +70,12 @@ class BackendConsumer : public ASTConsumer {
   llvm::Module *CurLinkModule = nullptr;
 
 public:
-  BackendConsumer(BackendAction Action, DiagnosticsEngine &Diags,
-                  IntrusiveRefCntPtr<llvm::vfs::FileSystem> VFS,
-                  const HeaderSearchOptions &HeaderSearchOpts,
-                  const PreprocessorOptions &PPOpts,
-                  const CodeGenOptions &CodeGenOpts,
-                  const TargetOptions &TargetOpts, const LangOptions &LangOpts,
-                  const std::string &InFile,
-                  SmallVector<LinkModule, 4> LinkModules,
-                  std::unique_ptr<raw_pwrite_stream> OS, llvm::LLVMContext &C,
-                  CoverageSourceInfo *CoverageInfo = nullptr);
-
-  // This constructor is used in installing an empty BackendConsumer
-  // to use the clang diagnostic handler for IR input files. It avoids
-  // initializing the OS field.
-  BackendConsumer(BackendAction Action, DiagnosticsEngine &Diags,
+  BackendConsumer(const CompilerInstance &CI, BackendAction Action,
                   IntrusiveRefCntPtr<llvm::vfs::FileSystem> VFS,
-                  const HeaderSearchOptions &HeaderSearchOpts,
-                  const PreprocessorOptions &PPOpts,
-                  const CodeGenOptions &CodeGenOpts,
-                  const TargetOptions &TargetOpts, const LangOptions &LangOpts,
-                  llvm::Module *Module, SmallVector<LinkModule, 4> LinkModules,
-                  llvm::LLVMContext &C,
-                  CoverageSourceInfo *CoverageInfo = nullptr);
+                  llvm::LLVMContext &C, SmallVector<LinkModule, 4> LinkModules,
+                  StringRef InFile, std::unique_ptr<raw_pwrite_stream> OS,
+                  CoverageSourceInfo *CoverageInfo,
+                  llvm::Module *CurLinkModule = nullptr);
 
   llvm::Module *getModule() const;
   std::unique_ptr<llvm::Module> takeModule();
diff --git a/clang/lib/CodeGen/BackendUtil.cpp b/clang/lib/CodeGen/BackendUtil.cpp
index 04358cd..2dbab78 100644
--- a/clang/lib/CodeGen/BackendUtil.cpp
+++ b/clang/lib/CodeGen/BackendUtil.cpp
@@ -736,10 +736,8 @@ static void addSanitizers(const Triple &TargetTriple,
       MPM.addPass(createModuleToFunctionPassAdaptor(ThreadSanitizerPass()));
     }
 
-    if (LangOpts.Sanitize.has(SanitizerKind::Type)) {
-      MPM.addPass(ModuleTypeSanitizerPass());
-      MPM.addPass(createModuleToFunctionPassAdaptor(TypeSanitizerPass()));
-    }
+    if (LangOpts.Sanitize.has(SanitizerKind::Type))
+      MPM.addPass(TypeSanitizerPass());
 
     if (LangOpts.Sanitize.has(SanitizerKind::NumericalStability))
       MPM.addPass(NumericalStabilitySanitizerPass());
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index 4d4b742..573be93 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -835,6 +835,38 @@ static Value *emitFrexpBuiltin(CodeGenFunction &CGF, const CallExpr *E,
   return CGF.Builder.CreateExtractValue(Call, 0);
 }
 
+static void emitSincosBuiltin(CodeGenFunction &CGF, const CallExpr *E,
+                              llvm::Intrinsic::ID IntrinsicID) {
+  llvm::Value *Val = CGF.EmitScalarExpr(E->getArg(0));
+  llvm::Value *Dest0 = CGF.EmitScalarExpr(E->getArg(1));
+  llvm::Value *Dest1 = CGF.EmitScalarExpr(E->getArg(2));
+
+  llvm::Function *F = CGF.CGM.getIntrinsic(IntrinsicID, {Val->getType()});
+  llvm::Value *Call = CGF.Builder.CreateCall(F, Val);
+
+  llvm::Value *SinResult = CGF.Builder.CreateExtractValue(Call, 0);
+  llvm::Value *CosResult = CGF.Builder.CreateExtractValue(Call, 1);
+
+  QualType DestPtrType = E->getArg(1)->getType()->getPointeeType();
+  LValue SinLV = CGF.MakeNaturalAlignAddrLValue(Dest0, DestPtrType);
+  LValue CosLV = CGF.MakeNaturalAlignAddrLValue(Dest1, DestPtrType);
+
+  llvm::StoreInst *StoreSin =
+      CGF.Builder.CreateStore(SinResult, SinLV.getAddress());
+  llvm::StoreInst *StoreCos =
+      CGF.Builder.CreateStore(CosResult, CosLV.getAddress());
+
+  // Mark the two stores as non-aliasing with each other. The order of stores
+  // emitted by this builtin is arbitrary, enforcing a particular order will
+  // prevent optimizations later on.
+  llvm::MDBuilder MDHelper(CGF.getLLVMContext());
+  MDNode *Domain = MDHelper.createAnonymousAliasScopeDomain();
+  MDNode *AliasScope = MDHelper.createAnonymousAliasScope(Domain);
+  MDNode *AliasScopeList = MDNode::get(Call->getContext(), AliasScope);
+  StoreSin->setMetadata(LLVMContext::MD_alias_scope, AliasScopeList);
+  StoreCos->setMetadata(LLVMContext::MD_noalias, AliasScopeList);
+}
+
 /// EmitFAbs - Emit a call to @llvm.fabs().
 static Value *EmitFAbs(CodeGenFunction &CGF, Value *V) {
   Function *F = CGF.CGM.getIntrinsic(Intrinsic::fabs, V->getType());
@@ -3232,6 +3264,14 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID,
       return RValue::get(emitUnaryMaybeConstrainedFPBuiltin(
           *this, E, Intrinsic::sinh, Intrinsic::experimental_constrained_sinh));
 
+    case Builtin::BI__builtin_sincos:
+    case Builtin::BI__builtin_sincosf:
+    case Builtin::BI__builtin_sincosf16:
+    case Builtin::BI__builtin_sincosl:
+    case Builtin::BI__builtin_sincosf128:
+      emitSincosBuiltin(*this, E, Intrinsic::sincos);
+      return RValue::get(nullptr);
+
     case Builtin::BIsqrt:
     case Builtin::BIsqrtf:
     case Builtin::BIsqrtl:
@@ -6757,6 +6797,8 @@ static Value *EmitTargetArchBuiltinExpr(CodeGenFunction *CGF,
   case llvm::Triple::riscv32:
   case llvm::Triple::riscv64:
     return CGF->EmitRISCVBuiltinExpr(BuiltinID, E, ReturnValue);
+  case llvm::Triple::spirv:
+    return CGF->EmitSPIRVBuiltinExpr(BuiltinID, E);
   case llvm::Triple::spirv64:
     if (CGF->getTarget().getTriple().getOS() != llvm::Triple::OSType::AMDHSA)
       return nullptr;
@@ -11285,6 +11327,19 @@ Value *CodeGenFunction::EmitAArch64SMEBuiltinExpr(unsigned BuiltinID,
   if (Builtin->LLVMIntrinsic == 0)
     return nullptr;
 
+  if (BuiltinID == SME::BI__builtin_sme___arm_in_streaming_mode) {
+    // If we already know the streaming mode, don't bother with the intrinsic
+    // and emit a constant instead
+    const auto *FD = cast<FunctionDecl>(CurFuncDecl);
+    if (const auto *FPT = FD->getType()->getAs<FunctionProtoType>()) {
+      unsigned SMEAttrs = FPT->getAArch64SMEAttributes();
+      if (!(SMEAttrs & FunctionType::SME_PStateSMCompatibleMask)) {
+        bool IsStreaming = SMEAttrs & FunctionType::SME_PStateSMEnabledMask;
+        return ConstantInt::getBool(Builder.getContext(), IsStreaming);
+      }
+    }
+  }
+
   // Predicates must match the main datatype.
   for (unsigned i = 0, e = Ops.size(); i != e; ++i)
     if (auto PredTy = dyn_cast<llvm::VectorType>(Ops[i]->getType()))
@@ -19157,8 +19212,9 @@ Value *CodeGenFunction::EmitHLSLBuiltinExpr(unsigned BuiltinID,
     // TODO: Map to an hlsl_device address space.
     llvm::Type *RetTy = llvm::PointerType::getUnqual(getLLVMContext());
 
-    return Builder.CreateIntrinsic(RetTy, Intrinsic::dx_resource_getpointer,
-                                   ArrayRef<Value *>{HandleOp, IndexOp});
+    return Builder.CreateIntrinsic(
+        RetTy, CGM.getHLSLRuntime().getCreateResourceGetPointerIntrinsic(),
+        ArrayRef<Value *>{HandleOp, IndexOp});
   }
   case Builtin::BI__builtin_hlsl_all: {
     Value *Op0 = EmitScalarExpr(E->getArg(0));
@@ -20440,6 +20496,26 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
   }
 }
 
+Value *CodeGenFunction::EmitSPIRVBuiltinExpr(unsigned BuiltinID,
+                                             const CallExpr *E) {
+  switch (BuiltinID) {
+  case SPIRV::BI__builtin_spirv_distance: {
+    Value *X = EmitScalarExpr(E->getArg(0));
+    Value *Y = EmitScalarExpr(E->getArg(1));
+    assert(E->getArg(0)->getType()->hasFloatingRepresentation() &&
+           E->getArg(1)->getType()->hasFloatingRepresentation() &&
+           "Distance operands must have a float representation");
+    assert(E->getArg(0)->getType()->isVectorType() &&
+           E->getArg(1)->getType()->isVectorType() &&
+           "Distance operands must be a vector");
+    return Builder.CreateIntrinsic(
+        /*ReturnType=*/X->getType()->getScalarType(), Intrinsic::spv_distance,
+        ArrayRef<Value *>{X, Y}, nullptr, "spv.distance");
+  }
+  }
+  return nullptr;
+}
+
 /// Handle a SystemZ function in which the final argument is a pointer
 /// to an int that receives the post-instruction CC value.  At the LLVM level
 /// this is represented as a function that returns a {result, cc} pair.
diff --git a/clang/lib/CodeGen/CGCall.cpp b/clang/lib/CodeGen/CGCall.cpp
index 50b9dfb..7b0ef4b 100644
--- a/clang/lib/CodeGen/CGCall.cpp
+++ b/clang/lib/CodeGen/CGCall.cpp
@@ -3235,22 +3235,6 @@ void CodeGenFunction::EmitFunctionProlog(const CGFunctionInfo &FI,
 
       llvm::StructType *STy =
           dyn_cast<llvm::StructType>(ArgI.getCoerceToType());
-      if (ArgI.isDirect() && !ArgI.getCanBeFlattened() && STy &&
-          STy->getNumElements() > 1) {
-        [[maybe_unused]] llvm::TypeSize StructSize =
-            CGM.getDataLayout().getTypeAllocSize(STy);
-        [[maybe_unused]] llvm::TypeSize PtrElementSize =
-            CGM.getDataLayout().getTypeAllocSize(ConvertTypeForMem(Ty));
-        if (STy->containsHomogeneousScalableVectorTypes()) {
-          assert(StructSize == PtrElementSize &&
-                 "Only allow non-fractional movement of structure with"
-                 "homogeneous scalable vector type");
-
-          ArgVals.push_back(ParamValue::forDirect(AI));
-          break;
-        }
-      }
-
       Address Alloca = CreateMemTemp(Ty, getContext().getDeclAlign(Arg),
                                      Arg->getName());
 
@@ -4887,7 +4871,7 @@ llvm::CallInst *CodeGenFunction::EmitRuntimeCall(llvm::FunctionCallee callee,
   call->setCallingConv(getRuntimeCC());
 
   if (CGM.shouldEmitConvergenceTokens() && call->isConvergent())
-    return addControlledConvergenceToken(call);
+    return cast<llvm::CallInst>(addConvergenceControlToken(call));
   return call;
 }
 
@@ -5414,21 +5398,6 @@ RValue CodeGenFunction::EmitCall(const CGFunctionInfo &CallInfo,
 
       llvm::StructType *STy =
           dyn_cast<llvm::StructType>(ArgInfo.getCoerceToType());
-      if (STy && ArgInfo.isDirect() && !ArgInfo.getCanBeFlattened()) {
-        llvm::Type *SrcTy = ConvertTypeForMem(I->Ty);
-        [[maybe_unused]] llvm::TypeSize SrcTypeSize =
-            CGM.getDataLayout().getTypeAllocSize(SrcTy);
-        [[maybe_unused]] llvm::TypeSize DstTypeSize =
-            CGM.getDataLayout().getTypeAllocSize(STy);
-        if (STy->containsHomogeneousScalableVectorTypes()) {
-          assert(SrcTypeSize == DstTypeSize &&
-                 "Only allow non-fractional movement of structure with "
-                 "homogeneous scalable vector type");
-
-          IRCallArgs[FirstIRArg] = I->getKnownRValue().getScalarVal();
-          break;
-        }
-      }
 
       // FIXME: Avoid the conversion through memory if possible.
       Address Src = Address::invalid();
@@ -5818,7 +5787,7 @@ RValue CodeGenFunction::EmitCall(const CGFunctionInfo &CallInfo,
     CI->setName("call");
 
   if (CGM.shouldEmitConvergenceTokens() && CI->isConvergent())
-    CI = addControlledConvergenceToken(CI);
+    CI = addConvergenceControlToken(CI);
 
   // Update largest vector width from the return type.
   LargestVectorWidth =
@@ -6121,6 +6090,8 @@ RValue CodeGenFunction::EmitVAArg(VAArgExpr *VE, Address &VAListAddr,
   VAListAddr = VE->isMicrosoftABI() ? EmitMSVAListRef(VE->getSubExpr())
                                     : EmitVAListRef(VE->getSubExpr());
   QualType Ty = VE->getType();
+  if (Ty->isVariablyModifiedType())
+    EmitVariablyModifiedType(Ty);
   if (VE->isMicrosoftABI())
     return CGM.getABIInfo().EmitMSVAArg(*this, VAListAddr, Ty, Slot);
   return CGM.getABIInfo().EmitVAArg(*this, VAListAddr, Ty, Slot);
diff --git a/clang/lib/CodeGen/CGDebugInfo.cpp b/clang/lib/CodeGen/CGDebugInfo.cpp
index f29ddec..560d4ce 100644
--- a/clang/lib/CodeGen/CGDebugInfo.cpp
+++ b/clang/lib/CodeGen/CGDebugInfo.cpp
@@ -3492,6 +3492,11 @@ llvm::DIType *CGDebugInfo::CreateType(const PipeType *Ty, llvm::DIFile *U) {
   return getOrCreateType(Ty->getElementType(), U);
 }
 
+llvm::DIType *CGDebugInfo::CreateType(const HLSLAttributedResourceType *Ty,
+                                      llvm::DIFile *U) {
+  return getOrCreateType(Ty->getWrappedType(), U);
+}
+
 llvm::DIType *CGDebugInfo::CreateEnumType(const EnumType *Ty) {
   const EnumDecl *ED = Ty->getDecl();
 
@@ -3834,12 +3839,13 @@ llvm::DIType *CGDebugInfo::CreateTypeNode(QualType Ty, llvm::DIFile *Unit) {
 
   case Type::TemplateSpecialization:
     return CreateType(cast<TemplateSpecializationType>(Ty), Unit);
+  case Type::HLSLAttributedResource:
+    return CreateType(cast<HLSLAttributedResourceType>(Ty), Unit);
 
   case Type::CountAttributed:
   case Type::Auto:
   case Type::Attributed:
   case Type::BTFTagAttributed:
-  case Type::HLSLAttributedResource:
   case Type::Adjusted:
   case Type::Decayed:
   case Type::DeducedTemplateSpecialization:
diff --git a/clang/lib/CodeGen/CGDebugInfo.h b/clang/lib/CodeGen/CGDebugInfo.h
index 3fd0237..38f73ec 100644
--- a/clang/lib/CodeGen/CGDebugInfo.h
+++ b/clang/lib/CodeGen/CGDebugInfo.h
@@ -196,6 +196,8 @@ class CGDebugInfo {
   llvm::DIType *CreateType(const PointerType *Ty, llvm::DIFile *F);
   llvm::DIType *CreateType(const BlockPointerType *Ty, llvm::DIFile *F);
   llvm::DIType *CreateType(const FunctionType *Ty, llvm::DIFile *F);
+  llvm::DIType *CreateType(const HLSLAttributedResourceType *Ty,
+                           llvm::DIFile *F);
   /// Get structure or union type.
   llvm::DIType *CreateType(const RecordType *Tyg);
 
diff --git a/clang/lib/CodeGen/CGExprScalar.cpp b/clang/lib/CodeGen/CGExprScalar.cpp
index 4159cee..0f27bd0 100644
--- a/clang/lib/CodeGen/CGExprScalar.cpp
+++ b/clang/lib/CodeGen/CGExprScalar.cpp
@@ -5455,11 +5455,6 @@ Value *ScalarExprEmitter::VisitChooseExpr(ChooseExpr *E) {
 }
 
 Value *ScalarExprEmitter::VisitVAArgExpr(VAArgExpr *VE) {
-  QualType Ty = VE->getType();
-
-  if (Ty->isVariablyModifiedType())
-    CGF.EmitVariablyModifiedType(Ty);
-
   Address ArgValue = Address::invalid();
   RValue ArgPtr = CGF.EmitVAArg(VE, ArgValue);
 
diff --git a/clang/lib/CodeGen/CGHLSLRuntime.cpp b/clang/lib/CodeGen/CGHLSLRuntime.cpp
index c354e58..5679bd7 100644
--- a/clang/lib/CodeGen/CGHLSLRuntime.cpp
+++ b/clang/lib/CodeGen/CGHLSLRuntime.cpp
@@ -395,7 +395,7 @@ llvm::Value *CGHLSLRuntime::emitInputSemantic(IRBuilder<> &B,
     return buildVectorInput(B, GroupThreadIDIntrinsic, Ty);
   }
   if (D.hasAttr<HLSLSV_GroupIDAttr>()) {
-    llvm::Function *GroupIDIntrinsic = CGM.getIntrinsic(Intrinsic::dx_group_id);
+    llvm::Function *GroupIDIntrinsic = CGM.getIntrinsic(getGroupIdIntrinsic());
     return buildVectorInput(B, GroupIDIntrinsic, Ty);
   }
   assert(false && "Unhandled parameter attribute");
diff --git a/clang/lib/CodeGen/CGHLSLRuntime.h b/clang/lib/CodeGen/CGHLSLRuntime.h
index edb87f9..46e472f 100644
--- a/clang/lib/CodeGen/CGHLSLRuntime.h
+++ b/clang/lib/CodeGen/CGHLSLRuntime.h
@@ -87,6 +87,7 @@ public:
   GENERATE_HLSL_INTRINSIC_FUNCTION(Radians, radians)
   GENERATE_HLSL_INTRINSIC_FUNCTION(ThreadId, thread_id)
   GENERATE_HLSL_INTRINSIC_FUNCTION(GroupThreadId, thread_id_in_group)
+  GENERATE_HLSL_INTRINSIC_FUNCTION(GroupId, group_id)
   GENERATE_HLSL_INTRINSIC_FUNCTION(FDot, fdot)
   GENERATE_HLSL_INTRINSIC_FUNCTION(SDot, sdot)
   GENERATE_HLSL_INTRINSIC_FUNCTION(UDot, udot)
@@ -103,6 +104,8 @@ public:
   GENERATE_HLSL_INTRINSIC_FUNCTION(SClamp, sclamp)
   GENERATE_HLSL_INTRINSIC_FUNCTION(UClamp, uclamp)
 
+  GENERATE_HLSL_INTRINSIC_FUNCTION(CreateResourceGetPointer,
+                                   resource_getpointer)
   GENERATE_HLSL_INTRINSIC_FUNCTION(CreateHandleFromBinding,
                                    resource_handlefrombinding)
   GENERATE_HLSL_INTRINSIC_FUNCTION(BufferUpdateCounter, resource_updatecounter)
diff --git a/clang/lib/CodeGen/CGStmt.cpp b/clang/lib/CodeGen/CGStmt.cpp
index be9605f..e9a8500 100644
--- a/clang/lib/CodeGen/CGStmt.cpp
+++ b/clang/lib/CodeGen/CGStmt.cpp
@@ -480,6 +480,12 @@ void CodeGenFunction::EmitStmt(const Stmt *S, ArrayRef<const Attr *> Attrs) {
   case Stmt::OpenACCShutdownConstructClass:
     EmitOpenACCShutdownConstruct(cast<OpenACCShutdownConstruct>(*S));
     break;
+  case Stmt::OpenACCSetConstructClass:
+    EmitOpenACCSetConstruct(cast<OpenACCSetConstruct>(*S));
+    break;
+  case Stmt::OpenACCUpdateConstructClass:
+    EmitOpenACCUpdateConstruct(cast<OpenACCUpdateConstruct>(*S));
+    break;
   }
 }
 
@@ -1026,8 +1032,8 @@ void CodeGenFunction::EmitWhileStmt(const WhileStmt &S,
   EmitBlock(LoopHeader.getBlock());
 
   if (CGM.shouldEmitConvergenceTokens())
-    ConvergenceTokenStack.push_back(emitConvergenceLoopToken(
-        LoopHeader.getBlock(), ConvergenceTokenStack.back()));
+    ConvergenceTokenStack.push_back(
+        emitConvergenceLoopToken(LoopHeader.getBlock()));
 
   // Create an exit block for when the condition fails, which will
   // also become the break target.
@@ -1154,8 +1160,7 @@ void CodeGenFunction::EmitDoStmt(const DoStmt &S,
     EmitBlockWithFallThrough(LoopBody, &S);
 
   if (CGM.shouldEmitConvergenceTokens())
-    ConvergenceTokenStack.push_back(
-        emitConvergenceLoopToken(LoopBody, ConvergenceTokenStack.back()));
+    ConvergenceTokenStack.push_back(emitConvergenceLoopToken(LoopBody));
 
   {
     RunCleanupsScope BodyScope(*this);
@@ -1233,8 +1238,7 @@ void CodeGenFunction::EmitForStmt(const ForStmt &S,
   EmitBlock(CondBlock);
 
   if (CGM.shouldEmitConvergenceTokens())
-    ConvergenceTokenStack.push_back(
-        emitConvergenceLoopToken(CondBlock, ConvergenceTokenStack.back()));
+    ConvergenceTokenStack.push_back(emitConvergenceLoopToken(CondBlock));
 
   const SourceRange &R = S.getSourceRange();
   LoopStack.push(CondBlock, CGM.getContext(), CGM.getCodeGenOpts(), ForAttrs,
@@ -1371,8 +1375,7 @@ CodeGenFunction::EmitCXXForRangeStmt(const CXXForRangeStmt &S,
   EmitBlock(CondBlock);
 
   if (CGM.shouldEmitConvergenceTokens())
-    ConvergenceTokenStack.push_back(
-        emitConvergenceLoopToken(CondBlock, ConvergenceTokenStack.back()));
+    ConvergenceTokenStack.push_back(emitConvergenceLoopToken(CondBlock));
 
   const SourceRange &R = S.getSourceRange();
   LoopStack.push(CondBlock, CGM.getContext(), CGM.getCodeGenOpts(), ForAttrs,
@@ -3248,35 +3251,32 @@ CodeGenFunction::GenerateCapturedStmtFunction(const CapturedStmt &S) {
   return F;
 }
 
-namespace {
 // Returns the first convergence entry/loop/anchor instruction found in |BB|.
 // std::nullptr otherwise.
-llvm::IntrinsicInst *getConvergenceToken(llvm::BasicBlock *BB) {
+static llvm::ConvergenceControlInst *getConvergenceToken(llvm::BasicBlock *BB) {
   for (auto &I : *BB) {
-    auto *II = dyn_cast<llvm::IntrinsicInst>(&I);
-    if (II && llvm::isConvergenceControlIntrinsic(II->getIntrinsicID()))
-      return II;
+    if (auto *CI = dyn_cast<llvm::ConvergenceControlInst>(&I))
+      return CI;
   }
   return nullptr;
 }
 
-} // namespace
-
 llvm::CallBase *
-CodeGenFunction::addConvergenceControlToken(llvm::CallBase *Input,
-                                            llvm::Value *ParentToken) {
+CodeGenFunction::addConvergenceControlToken(llvm::CallBase *Input) {
+  llvm::ConvergenceControlInst *ParentToken = ConvergenceTokenStack.back();
+  assert(ParentToken);
+
   llvm::Value *bundleArgs[] = {ParentToken};
   llvm::OperandBundleDef OB("convergencectrl", bundleArgs);
-  auto Output = llvm::CallBase::addOperandBundle(
+  auto *Output = llvm::CallBase::addOperandBundle(
       Input, llvm::LLVMContext::OB_convergencectrl, OB, Input->getIterator());
   Input->replaceAllUsesWith(Output);
   Input->eraseFromParent();
   return Output;
 }
 
-llvm::IntrinsicInst *
-CodeGenFunction::emitConvergenceLoopToken(llvm::BasicBlock *BB,
-                                          llvm::Value *ParentToken) {
+llvm::ConvergenceControlInst *
+CodeGenFunction::emitConvergenceLoopToken(llvm::BasicBlock *BB) {
   CGBuilderTy::InsertPoint IP = Builder.saveIP();
   if (BB->empty())
     Builder.SetInsertPoint(BB);
@@ -3287,14 +3287,14 @@ CodeGenFunction::emitConvergenceLoopToken(llvm::BasicBlock *BB,
       llvm::Intrinsic::experimental_convergence_loop, {}, {});
   Builder.restoreIP(IP);
 
-  llvm::CallBase *I = addConvergenceControlToken(CB, ParentToken);
-  return cast<llvm::IntrinsicInst>(I);
+  CB = addConvergenceControlToken(CB);
+  return cast<llvm::ConvergenceControlInst>(CB);
 }
 
-llvm::IntrinsicInst *
+llvm::ConvergenceControlInst *
 CodeGenFunction::getOrEmitConvergenceEntryToken(llvm::Function *F) {
   llvm::BasicBlock *BB = &F->getEntryBlock();
-  llvm::IntrinsicInst *Token = getConvergenceToken(BB);
+  llvm::ConvergenceControlInst *Token = getConvergenceToken(BB);
   if (Token)
     return Token;
 
@@ -3309,5 +3309,5 @@ CodeGenFunction::getOrEmitConvergenceEntryToken(llvm::Function *F) {
   assert(isa<llvm::IntrinsicInst>(I));
   Builder.restoreIP(IP);
 
-  return cast<llvm::IntrinsicInst>(I);
+  return cast<llvm::ConvergenceControlInst>(I);
 }
diff --git a/clang/lib/CodeGen/CodeGenAction.cpp b/clang/lib/CodeGen/CodeGenAction.cpp
index cc927f4..f63cb9b 100644
--- a/clang/lib/CodeGen/CodeGenAction.cpp
+++ b/clang/lib/CodeGen/CodeGenAction.cpp
@@ -106,46 +106,19 @@ static void reportOptRecordError(Error E, DiagnosticsEngine &Diags,
 }
 
 BackendConsumer::BackendConsumer(
-    BackendAction Action, DiagnosticsEngine &Diags,
-    IntrusiveRefCntPtr<llvm::vfs::FileSystem> VFS,
-    const HeaderSearchOptions &HeaderSearchOpts,
-    const PreprocessorOptions &PPOpts, const CodeGenOptions &CodeGenOpts,
-    const TargetOptions &TargetOpts, const LangOptions &LangOpts,
-    const std::string &InFile, SmallVector<LinkModule, 4> LinkModules,
-    std::unique_ptr<raw_pwrite_stream> OS, LLVMContext &C,
-    CoverageSourceInfo *CoverageInfo)
-    : Diags(Diags), Action(Action), HeaderSearchOpts(HeaderSearchOpts),
-      CodeGenOpts(CodeGenOpts), TargetOpts(TargetOpts), LangOpts(LangOpts),
-      AsmOutStream(std::move(OS)), Context(nullptr), FS(VFS),
-      LLVMIRGeneration("irgen", "LLVM IR Generation Time"),
-      LLVMIRGenerationRefCount(0),
-      Gen(CreateLLVMCodeGen(Diags, InFile, std::move(VFS), HeaderSearchOpts,
-                            PPOpts, CodeGenOpts, C, CoverageInfo)),
-      LinkModules(std::move(LinkModules)) {
-  TimerIsEnabled = CodeGenOpts.TimePasses;
-  llvm::TimePassesIsEnabled = CodeGenOpts.TimePasses;
-  llvm::TimePassesPerRun = CodeGenOpts.TimePassesPerRun;
-}
-
-// This constructor is used in installing an empty BackendConsumer
-// to use the clang diagnostic handler for IR input files. It avoids
-// initializing the OS field.
-BackendConsumer::BackendConsumer(
-    BackendAction Action, DiagnosticsEngine &Diags,
-    IntrusiveRefCntPtr<llvm::vfs::FileSystem> VFS,
-    const HeaderSearchOptions &HeaderSearchOpts,
-    const PreprocessorOptions &PPOpts, const CodeGenOptions &CodeGenOpts,
-    const TargetOptions &TargetOpts, const LangOptions &LangOpts,
-    llvm::Module *Module, SmallVector<LinkModule, 4> LinkModules,
-    LLVMContext &C, CoverageSourceInfo *CoverageInfo)
-    : Diags(Diags), Action(Action), HeaderSearchOpts(HeaderSearchOpts),
-      CodeGenOpts(CodeGenOpts), TargetOpts(TargetOpts), LangOpts(LangOpts),
-      Context(nullptr), FS(VFS),
-      LLVMIRGeneration("irgen", "LLVM IR Generation Time"),
-      LLVMIRGenerationRefCount(0),
-      Gen(CreateLLVMCodeGen(Diags, "", std::move(VFS), HeaderSearchOpts, PPOpts,
-                            CodeGenOpts, C, CoverageInfo)),
-      LinkModules(std::move(LinkModules)), CurLinkModule(Module) {
+    const CompilerInstance &CI, BackendAction Action,
+    IntrusiveRefCntPtr<llvm::vfs::FileSystem> VFS, LLVMContext &C,
+    SmallVector<LinkModule, 4> LinkModules, StringRef InFile,
+    std::unique_ptr<raw_pwrite_stream> OS, CoverageSourceInfo *CoverageInfo,
+    llvm::Module *CurLinkModule)
+    : Diags(CI.getDiagnostics()), HeaderSearchOpts(CI.getHeaderSearchOpts()),
+      CodeGenOpts(CI.getCodeGenOpts()), TargetOpts(CI.getTargetOpts()),
+      LangOpts(CI.getLangOpts()), AsmOutStream(std::move(OS)), FS(VFS),
+      LLVMIRGeneration("irgen", "LLVM IR Generation Time"), Action(Action),
+      Gen(CreateLLVMCodeGen(Diags, InFile, std::move(VFS),
+                            CI.getHeaderSearchOpts(), CI.getPreprocessorOpts(),
+                            CI.getCodeGenOpts(), C, CoverageInfo)),
+      LinkModules(std::move(LinkModules)), CurLinkModule(CurLinkModule) {
   TimerIsEnabled = CodeGenOpts.TimePasses;
   llvm::TimePassesIsEnabled = CodeGenOpts.TimePasses;
   llvm::TimePassesPerRun = CodeGenOpts.TimePassesPerRun;
@@ -1011,10 +984,8 @@ CodeGenAction::CreateASTConsumer(CompilerInstance &CI, StringRef InFile) {
         CI.getPreprocessor());
 
   std::unique_ptr<BackendConsumer> Result(new BackendConsumer(
-      BA, CI.getDiagnostics(), &CI.getVirtualFileSystem(),
-      CI.getHeaderSearchOpts(), CI.getPreprocessorOpts(), CI.getCodeGenOpts(),
-      CI.getTargetOpts(), CI.getLangOpts(), std::string(InFile),
-      std::move(LinkModules), std::move(OS), *VMContext, CoverageInfo));
+      CI, BA, &CI.getVirtualFileSystem(), *VMContext, std::move(LinkModules),
+      InFile, std::move(OS), CoverageInfo));
   BEConsumer = Result.get();
 
   // Enable generating macro debug info only when debug info is not disabled and
@@ -1182,11 +1153,9 @@ void CodeGenAction::ExecuteAction() {
 
   // Set clang diagnostic handler. To do this we need to create a fake
   // BackendConsumer.
-  BackendConsumer Result(BA, CI.getDiagnostics(), &CI.getVirtualFileSystem(),
-                         CI.getHeaderSearchOpts(), CI.getPreprocessorOpts(),
-                         CI.getCodeGenOpts(), CI.getTargetOpts(),
-                         CI.getLangOpts(), TheModule.get(),
-                         std::move(LinkModules), *VMContext, nullptr);
+  BackendConsumer Result(CI, BA, &CI.getVirtualFileSystem(), *VMContext,
+                         std::move(LinkModules), "", nullptr, nullptr,
+                         TheModule.get());
 
   // Link in each pending link module.
   if (!CodeGenOpts.LinkBitcodePostopt && Result.LinkInModules(&*TheModule))
diff --git a/clang/lib/CodeGen/CodeGenFunction.h b/clang/lib/CodeGen/CodeGenFunction.h
index 6145c6a..cdeff1e 100644
--- a/clang/lib/CodeGen/CodeGenFunction.h
+++ b/clang/lib/CodeGen/CodeGenFunction.h
@@ -315,7 +315,7 @@ public:
   SmallVector<const BinaryOperator *, 16> MCDCLogOpStack;
 
   /// Stack to track the controlled convergence tokens.
-  SmallVector<llvm::IntrinsicInst *, 4> ConvergenceTokenStack;
+  SmallVector<llvm::ConvergenceControlInst *, 4> ConvergenceTokenStack;
 
   /// Number of nested loop to be consumed by the last surrounding
   /// loop-associated directive.
@@ -4157,6 +4157,16 @@ public:
     // but in the future we will implement some sort of IR.
   }
 
+  void EmitOpenACCSetConstruct(const OpenACCSetConstruct &S) {
+    // TODO OpenACC: Implement this.  It is currently implemented as a 'no-op',
+    // but in the future we will implement some sort of IR.
+  }
+
+  void EmitOpenACCUpdateConstruct(const OpenACCUpdateConstruct &S) {
+    // TODO OpenACC: Implement this.  It is currently implemented as a 'no-op',
+    // but in the future we will implement some sort of IR.
+  }
+
   //===--------------------------------------------------------------------===//
   //                         LValue Expression Emission
   //===--------------------------------------------------------------------===//
@@ -4780,6 +4790,7 @@ public:
   llvm::Value *EmitAMDGPUBuiltinExpr(unsigned BuiltinID, const CallExpr *E);
   llvm::Value *EmitHLSLBuiltinExpr(unsigned BuiltinID, const CallExpr *E,
                                    ReturnValueSlot ReturnValue);
+  llvm::Value *EmitSPIRVBuiltinExpr(unsigned BuiltinID, const CallExpr *E);
   llvm::Value *EmitScalarOrConstFoldImmArg(unsigned ICEArguments, unsigned Idx,
                                            const CallExpr *E);
   llvm::Value *EmitSystemZBuiltinExpr(unsigned BuiltinID, const CallExpr *E);
@@ -5258,29 +5269,20 @@ public:
   llvm::Value *emitBoolVecConversion(llvm::Value *SrcVec,
                                      unsigned NumElementsDst,
                                      const llvm::Twine &Name = "");
-  // Adds a convergence_ctrl token to |Input| and emits the required parent
-  // convergence instructions.
-  template <typename CallType>
-  CallType *addControlledConvergenceToken(CallType *Input) {
-    return cast<CallType>(
-        addConvergenceControlToken(Input, ConvergenceTokenStack.back()));
-  }
 
 private:
   // Emits a convergence_loop instruction for the given |BB|, with |ParentToken|
   // as it's parent convergence instr.
-  llvm::IntrinsicInst *emitConvergenceLoopToken(llvm::BasicBlock *BB,
-                                                llvm::Value *ParentToken);
+  llvm::ConvergenceControlInst *emitConvergenceLoopToken(llvm::BasicBlock *BB);
+
   // Adds a convergence_ctrl token with |ParentToken| as parent convergence
   // instr to the call |Input|.
-  llvm::CallBase *addConvergenceControlToken(llvm::CallBase *Input,
-                                             llvm::Value *ParentToken);
+  llvm::CallBase *addConvergenceControlToken(llvm::CallBase *Input);
+
   // Find the convergence_entry instruction |F|, or emits ones if none exists.
   // Returns the convergence instruction.
-  llvm::IntrinsicInst *getOrEmitConvergenceEntryToken(llvm::Function *F);
-  // Find the convergence_loop instruction for the loop defined by |LI|, or
-  // emits one if none exists. Returns the convergence instruction.
-  llvm::IntrinsicInst *getOrEmitConvergenceLoopToken(const LoopInfo *LI);
+  llvm::ConvergenceControlInst *
+  getOrEmitConvergenceEntryToken(llvm::Function *F);
 
 private:
   llvm::MDNode *getRangeForLoadFromType(QualType Ty);
diff --git a/clang/lib/CodeGen/CodeGenModule.cpp b/clang/lib/CodeGen/CodeGenModule.cpp
index c49f763..7db1ed7 100644
--- a/clang/lib/CodeGen/CodeGenModule.cpp
+++ b/clang/lib/CodeGen/CodeGenModule.cpp
@@ -2748,7 +2748,21 @@ bool CodeGenModule::GetCPUAndFeaturesAttributes(GlobalDecl GD,
     Attrs.addAttribute("target-features", llvm::join(Features, ","));
     AddedAttr = true;
   }
-
+  if (getTarget().getTriple().isAArch64()) {
+    llvm::SmallVector<StringRef, 8> Feats;
+    if (TV)
+      TV->getFeatures(Feats);
+    else if (TC)
+      TC->getFeatures(Feats, GD.getMultiVersionIndex());
+    if (!Feats.empty()) {
+      llvm::sort(Feats);
+      std::string FMVFeatures;
+      for (StringRef F : Feats)
+        FMVFeatures.append(",+" + F.str());
+      Attrs.addAttribute("fmv-features", FMVFeatures.substr(1));
+      AddedAttr = true;
+    }
+  }
   return AddedAttr;
 }
 
@@ -4227,7 +4241,7 @@ void CodeGenModule::EmitGlobalDefinition(GlobalDecl GD, llvm::GlobalValue *GV) {
 static void ReplaceUsesOfNonProtoTypeWithRealFunction(llvm::GlobalValue *Old,
                                                       llvm::Function *NewFn);
 
-static unsigned getFMVPriority(const TargetInfo &TI,
+static uint64_t getFMVPriority(const TargetInfo &TI,
                                const CodeGenFunction::FMVResolverOption &RO) {
   llvm::SmallVector<StringRef, 8> Features{RO.Features};
   if (RO.Architecture)
diff --git a/clang/lib/CodeGen/CoverageMappingGen.cpp b/clang/lib/CodeGen/CoverageMappingGen.cpp
index e385ca8..ce39b72 100644
--- a/clang/lib/CodeGen/CoverageMappingGen.cpp
+++ b/clang/lib/CodeGen/CoverageMappingGen.cpp
@@ -938,19 +938,37 @@ struct CounterCoverageMappingBuilder
   }
 
   struct BranchCounterPair {
-    Counter Executed;
-    Counter Skipped;
+    Counter Executed; ///< The Counter previously assigned.
+    Counter Skipped;  ///< An expression (Parent-Executed), or equivalent to it.
   };
 
+  /// Retrieve or assign the pair of Counter(s).
+  ///
+  /// This returns BranchCounterPair {Executed, Skipped}.
+  /// Executed is the Counter associated with S assigned by an earlier
+  /// CounterMapping pass.
+  /// Skipped may be an expression (Executed - ParentCnt) or newly
+  /// assigned Counter in EnableSingleByteCoverage, as subtract
+  /// expressions are not available in this mode.
+  ///
+  /// \param S Key to the CounterMap
+  /// \param ParentCnt The Counter representing how many times S is evaluated.
+  /// \param SkipCntForOld (To be removed later) Optional fake Counter
+  ///                      to override Skipped for adjustment of
+  ///                      expressions in the old behavior of
+  ///                      EnableSingleByteCoverage that is unaware of
+  ///                      Branch coverage.
   BranchCounterPair
   getBranchCounterPair(const Stmt *S, Counter ParentCnt,
                        std::optional<Counter> SkipCntForOld = std::nullopt) {
     auto &TheMap = CounterMap[S];
     auto ExecCnt = Counter::getCounter(TheMap.Executed);
 
-    // The old behavior of SingleByte shouldn't emit Branches.
+    // The old behavior of SingleByte is unaware of Branches.
+    // Will be pruned after the migration of SingleByte.
     if (llvm::EnableSingleByteCoverage) {
-      assert(SkipCntForOld);
+      assert(SkipCntForOld &&
+             "SingleByte must provide SkipCntForOld as a fake Skipped count.");
       return {ExecCnt, *SkipCntForOld};
     }
 
diff --git a/clang/lib/CodeGen/SanitizerMetadata.cpp b/clang/lib/CodeGen/SanitizerMetadata.cpp
index 61fdf33..b7b212b 100644
--- a/clang/lib/CodeGen/SanitizerMetadata.cpp
+++ b/clang/lib/CodeGen/SanitizerMetadata.cpp
@@ -145,7 +145,9 @@ void SanitizerMetadata::reportGlobal(llvm::GlobalVariable *GV, const VarDecl &D,
     for (auto *Attr : D.specific_attrs<NoSanitizeAttr>())
       NoSanitizeMask |= Attr->getMask();
 
-    if (D.hasExternalStorage())
+    // External definitions and incomplete types get handled at the place they
+    // are defined.
+    if (D.hasExternalStorage() || D.getType()->isIncompleteType())
       NoSanitizeMask |= SanitizerKind::Type;
 
     return NoSanitizeMask;
diff --git a/clang/lib/CodeGen/Targets/AArch64.cpp b/clang/lib/CodeGen/Targets/AArch64.cpp
index ad7f405..7db67ec 100644
--- a/clang/lib/CodeGen/Targets/AArch64.cpp
+++ b/clang/lib/CodeGen/Targets/AArch64.cpp
@@ -662,7 +662,7 @@ bool AArch64ABIInfo::isZeroLengthBitfieldPermittedInHomogeneousAggregate()
 
 bool AArch64ABIInfo::passAsAggregateType(QualType Ty) const {
   if (Kind == AArch64ABIKind::AAPCS && Ty->isSVESizelessBuiltinType()) {
-    const auto *BT = Ty->getAs<BuiltinType>();
+    const auto *BT = Ty->castAs<BuiltinType>();
     return !BT->isSVECount() &&
            getContext().getBuiltinVectorTypeInfo(BT).NumVectors > 1;
   }
@@ -1169,8 +1169,9 @@ void AArch64TargetCodeGenInfo::checkFunctionABI(
 enum class ArmSMEInlinability : uint8_t {
   Ok = 0,
   ErrorCalleeRequiresNewZA = 1 << 0,
-  WarnIncompatibleStreamingModes = 1 << 1,
-  ErrorIncompatibleStreamingModes = 1 << 2,
+  ErrorCalleeRequiresNewZT0 = 1 << 1,
+  WarnIncompatibleStreamingModes = 1 << 2,
+  ErrorIncompatibleStreamingModes = 1 << 3,
 
   IncompatibleStreamingModes =
       WarnIncompatibleStreamingModes | ErrorIncompatibleStreamingModes,
@@ -1198,9 +1199,12 @@ static ArmSMEInlinability GetArmSMEInlinability(const FunctionDecl *Caller,
     else
       Inlinability |= ArmSMEInlinability::WarnIncompatibleStreamingModes;
   }
-  if (auto *NewAttr = Callee->getAttr<ArmNewAttr>())
+  if (auto *NewAttr = Callee->getAttr<ArmNewAttr>()) {
     if (NewAttr->isNewZA())
       Inlinability |= ArmSMEInlinability::ErrorCalleeRequiresNewZA;
+    if (NewAttr->isNewZT0())
+      Inlinability |= ArmSMEInlinability::ErrorCalleeRequiresNewZT0;
+  }
 
   return Inlinability;
 }
@@ -1227,6 +1231,11 @@ void AArch64TargetCodeGenInfo::checkFunctionCallABIStreaming(
       ArmSMEInlinability::ErrorCalleeRequiresNewZA)
     CGM.getDiags().Report(CallLoc, diag::err_function_always_inline_new_za)
         << Callee->getDeclName();
+
+  if ((Inlinability & ArmSMEInlinability::ErrorCalleeRequiresNewZT0) ==
+      ArmSMEInlinability::ErrorCalleeRequiresNewZT0)
+    CGM.getDiags().Report(CallLoc, diag::err_function_always_inline_new_zt0)
+        << Callee->getDeclName();
 }
 
 // If the target does not have floating-point registers, but we are using a
diff --git a/clang/lib/CodeGen/Targets/AMDGPU.cpp b/clang/lib/CodeGen/Targets/AMDGPU.cpp
index 56ad050..fa07e68 100644
--- a/clang/lib/CodeGen/Targets/AMDGPU.cpp
+++ b/clang/lib/CodeGen/Targets/AMDGPU.cpp
@@ -537,7 +537,11 @@ AMDGPUTargetCodeGenInfo::getLLVMSyncScopeID(const LangOptions &LangOpts,
     break;
   }
 
-  if (Ordering != llvm::AtomicOrdering::SequentiallyConsistent) {
+  // OpenCL assumes by default that atomic scopes are per-address space for
+  // non-sequentially consistent operations.
+  if (Scope >= SyncScope::OpenCLWorkGroup &&
+      Scope <= SyncScope::OpenCLSubGroup &&
+      Ordering != llvm::AtomicOrdering::SequentiallyConsistent) {
     if (!Name.empty())
       Name = Twine(Twine(Name) + Twine("-")).str();
 
diff --git a/clang/lib/CodeGen/Targets/NVPTX.cpp b/clang/lib/CodeGen/Targets/NVPTX.cpp
index 0431d2c..b82e4dd 100644
--- a/clang/lib/CodeGen/Targets/NVPTX.cpp
+++ b/clang/lib/CodeGen/Targets/NVPTX.cpp
@@ -9,6 +9,7 @@
 #include "ABIInfoImpl.h"
 #include "TargetInfo.h"
 #include "llvm/ADT/STLExtras.h"
+#include "llvm/IR/CallingConv.h"
 #include "llvm/IR/IntrinsicsNVPTX.h"
 
 using namespace clang;
@@ -79,13 +80,11 @@ public:
   // Adds a NamedMDNode with GV, Name, and Operand as operands, and adds the
   // resulting MDNode to the nvvm.annotations MDNode.
   static void addNVVMMetadata(llvm::GlobalValue *GV, StringRef Name,
-                              int Operand,
-                              const SmallVectorImpl<int> &GridConstantArgs);
+                              int Operand);
 
-  static void addNVVMMetadata(llvm::GlobalValue *GV, StringRef Name,
-                              int Operand) {
-    addNVVMMetadata(GV, Name, Operand, SmallVector<int, 1>(0));
-  }
+  static void
+  addGridConstantNVVMMetadata(llvm::GlobalValue *GV,
+                              const SmallVectorImpl<int> &GridConstantArgs);
 
 private:
   static void emitBuiltinSurfTexDeviceCopy(CodeGenFunction &CGF, LValue Dst,
@@ -259,7 +258,7 @@ void NVPTXTargetCodeGenInfo::setTargetAttributes(
     if (FD->hasAttr<OpenCLKernelAttr>()) {
       // OpenCL __kernel functions get kernel metadata
       // Create !{<func-ref>, metadata !"kernel", i32 1} node
-      addNVVMMetadata(F, "kernel", 1);
+      F->setCallingConv(llvm::CallingConv::PTX_Kernel);
       // And kernel functions are not subject to inlining
       F->addFnAttr(llvm::Attribute::NoInline);
     }
@@ -277,7 +276,8 @@ void NVPTXTargetCodeGenInfo::setTargetAttributes(
           // For some reason arg indices are 1-based in NVVM
           GCI.push_back(IV.index() + 1);
       // Create !{<func-ref>, metadata !"kernel", i32 1} node
-      addNVVMMetadata(F, "kernel", 1, GCI);
+      F->setCallingConv(llvm::CallingConv::PTX_Kernel);
+      addGridConstantNVVMMetadata(F, GCI);
     }
     if (CUDALaunchBoundsAttr *Attr = FD->getAttr<CUDALaunchBoundsAttr>())
       M.handleCUDALaunchBoundsAttr(F, Attr);
@@ -285,13 +285,12 @@ void NVPTXTargetCodeGenInfo::setTargetAttributes(
 
   // Attach kernel metadata directly if compiling for NVPTX.
   if (FD->hasAttr<NVPTXKernelAttr>()) {
-    addNVVMMetadata(F, "kernel", 1);
+    F->setCallingConv(llvm::CallingConv::PTX_Kernel);
   }
 }
 
-void NVPTXTargetCodeGenInfo::addNVVMMetadata(
-    llvm::GlobalValue *GV, StringRef Name, int Operand,
-    const SmallVectorImpl<int> &GridConstantArgs) {
+void NVPTXTargetCodeGenInfo::addNVVMMetadata(llvm::GlobalValue *GV,
+                                             StringRef Name, int Operand) {
   llvm::Module *M = GV->getParent();
   llvm::LLVMContext &Ctx = M->getContext();
 
@@ -302,6 +301,21 @@ void NVPTXTargetCodeGenInfo::addNVVMMetadata(
       llvm::ConstantAsMetadata::get(GV), llvm::MDString::get(Ctx, Name),
       llvm::ConstantAsMetadata::get(
           llvm::ConstantInt::get(llvm::Type::getInt32Ty(Ctx), Operand))};
+
+  // Append metadata to nvvm.annotations
+  MD->addOperand(llvm::MDNode::get(Ctx, MDVals));
+}
+
+void NVPTXTargetCodeGenInfo::addGridConstantNVVMMetadata(
+    llvm::GlobalValue *GV, const SmallVectorImpl<int> &GridConstantArgs) {
+
+  llvm::Module *M = GV->getParent();
+  llvm::LLVMContext &Ctx = M->getContext();
+
+  // Get "nvvm.annotations" metadata node
+  llvm::NamedMDNode *MD = M->getOrInsertNamedMetadata("nvvm.annotations");
+
+  SmallVector<llvm::Metadata *, 5> MDVals = {llvm::ConstantAsMetadata::get(GV)};
   if (!GridConstantArgs.empty()) {
     SmallVector<llvm::Metadata *, 10> GCM;
     for (int I : GridConstantArgs)
@@ -310,6 +324,7 @@ void NVPTXTargetCodeGenInfo::addNVVMMetadata(
     MDVals.append({llvm::MDString::get(Ctx, "grid_constant"),
                    llvm::MDNode::get(Ctx, GCM)});
   }
+
   // Append metadata to nvvm.annotations
   MD->addOperand(llvm::MDNode::get(Ctx, MDVals));
 }
diff --git a/clang/lib/CodeGen/Targets/RISCV.cpp b/clang/lib/CodeGen/Targets/RISCV.cpp
index b04e436..873e696 100644
--- a/clang/lib/CodeGen/Targets/RISCV.cpp
+++ b/clang/lib/CodeGen/Targets/RISCV.cpp
@@ -495,13 +495,7 @@ ABIArgInfo RISCVABIInfo::classifyArgumentType(QualType Ty, bool IsFixed,
         return getNaturalAlignIndirect(Ty, /*ByVal=*/false);
     }
 
-    ABIArgInfo Info = ABIArgInfo::getDirect();
-
-    // If it is tuple type, it can't be flattened.
-    if (llvm::StructType *STy = dyn_cast<llvm::StructType>(CGT.ConvertType(Ty)))
-      Info.setCanBeFlattened(!STy->containsHomogeneousScalableVectorTypes());
-
-    return Info;
+    return ABIArgInfo::getDirect();
   }
 
   if (const VectorType *VT = Ty->getAs<VectorType>())
diff --git a/clang/lib/CodeGen/Targets/SPIR.cpp b/clang/lib/CodeGen/Targets/SPIR.cpp
index a48fe9d..5c75e98 100644
--- a/clang/lib/CodeGen/Targets/SPIR.cpp
+++ b/clang/lib/CodeGen/Targets/SPIR.cpp
@@ -64,6 +64,8 @@ public:
   void setCUDAKernelCallingConvention(const FunctionType *&FT) const override;
   LangAS getGlobalVarAddressSpace(CodeGenModule &CGM,
                                   const VarDecl *D) const override;
+  void setTargetAttributes(const Decl *D, llvm::GlobalValue *GV,
+                           CodeGen::CodeGenModule &M) const override;
   llvm::SyncScope::ID getLLVMSyncScopeID(const LangOptions &LangOpts,
                                          SyncScope Scope,
                                          llvm::AtomicOrdering Ordering,
@@ -245,6 +247,41 @@ SPIRVTargetCodeGenInfo::getGlobalVarAddressSpace(CodeGenModule &CGM,
   return DefaultGlobalAS;
 }
 
+void SPIRVTargetCodeGenInfo::setTargetAttributes(
+    const Decl *D, llvm::GlobalValue *GV, CodeGen::CodeGenModule &M) const {
+  if (!M.getLangOpts().HIP ||
+      M.getTarget().getTriple().getVendor() != llvm::Triple::AMD)
+    return;
+  if (GV->isDeclaration())
+    return;
+
+  auto F = dyn_cast<llvm::Function>(GV);
+  if (!F)
+    return;
+
+  auto FD = dyn_cast_or_null<FunctionDecl>(D);
+  if (!FD)
+    return;
+  if (!FD->hasAttr<CUDAGlobalAttr>())
+    return;
+
+  unsigned N = M.getLangOpts().GPUMaxThreadsPerBlock;
+  if (auto FlatWGS = FD->getAttr<AMDGPUFlatWorkGroupSizeAttr>())
+    N = FlatWGS->getMax()->EvaluateKnownConstInt(M.getContext()).getExtValue();
+
+  // We encode the maximum flat WG size in the first component of the 3D
+  // max_work_group_size attribute, which will get reverse translated into the
+  // original AMDGPU attribute when targeting AMDGPU.
+  auto Int32Ty = llvm::IntegerType::getInt32Ty(M.getLLVMContext());
+  llvm::Metadata *AttrMDArgs[] = {
+      llvm::ConstantAsMetadata::get(llvm::ConstantInt::get(Int32Ty, N)),
+      llvm::ConstantAsMetadata::get(llvm::ConstantInt::get(Int32Ty, 1)),
+      llvm::ConstantAsMetadata::get(llvm::ConstantInt::get(Int32Ty, 1))};
+
+  F->setMetadata("max_work_group_size",
+                 llvm::MDNode::get(M.getLLVMContext(), AttrMDArgs));
+}
+
 llvm::SyncScope::ID
 SPIRVTargetCodeGenInfo::getLLVMSyncScopeID(const LangOptions &, SyncScope Scope,
                                            llvm::AtomicOrdering,
diff --git a/clang/lib/Driver/Action.cpp b/clang/lib/Driver/Action.cpp
index 849bf60..23dbceb 100644
--- a/clang/lib/Driver/Action.cpp
+++ b/clang/lib/Driver/Action.cpp
@@ -111,6 +111,8 @@ std::string Action::getOffloadingKindPrefix() const {
     return "device-openmp";
   case OFK_HIP:
     return "device-hip";
+  case OFK_SYCL:
+    return "device-sycl";
 
     // TODO: Add other programming models here.
   }
@@ -128,6 +130,8 @@ std::string Action::getOffloadingKindPrefix() const {
     Res += "-hip";
   if (ActiveOffloadKindMask & OFK_OpenMP)
     Res += "-openmp";
+  if (ActiveOffloadKindMask & OFK_SYCL)
+    Res += "-sycl";
 
   // TODO: Add other programming models here.
 
@@ -164,6 +168,8 @@ StringRef Action::GetOffloadKindName(OffloadKind Kind) {
     return "openmp";
   case OFK_HIP:
     return "hip";
+  case OFK_SYCL:
+    return "sycl";
 
     // TODO: Add other programming models here.
   }
@@ -320,7 +326,7 @@ void OffloadAction::DeviceDependences::add(Action &A, const ToolChain &TC,
   DeviceBoundArchs.push_back(BoundArch);
 
   // Add each active offloading kind from a mask.
-  for (OffloadKind OKind : {OFK_OpenMP, OFK_Cuda, OFK_HIP})
+  for (OffloadKind OKind : {OFK_OpenMP, OFK_Cuda, OFK_HIP, OFK_SYCL})
     if (OKind & OffloadKindMask)
       DeviceOffloadKinds.push_back(OKind);
 }
diff --git a/clang/lib/Driver/CMakeLists.txt b/clang/lib/Driver/CMakeLists.txt
index 4fd10bf..5bdb661 100644
--- a/clang/lib/Driver/CMakeLists.txt
+++ b/clang/lib/Driver/CMakeLists.txt
@@ -77,6 +77,8 @@ add_clang_library(clangDriver
   ToolChains/RISCVToolchain.cpp
   ToolChains/Solaris.cpp
   ToolChains/SPIRV.cpp
+  ToolChains/SPIRVOpenMP.cpp
+  ToolChains/SYCL.cpp
   ToolChains/TCE.cpp
   ToolChains/UEFI.cpp
   ToolChains/VEToolchain.cpp
diff --git a/clang/lib/Driver/Compilation.cpp b/clang/lib/Driver/Compilation.cpp
index 4d40805..a39952e 100644
--- a/clang/lib/Driver/Compilation.cpp
+++ b/clang/lib/Driver/Compilation.cpp
@@ -214,10 +214,11 @@ static bool ActionFailed(const Action *A,
   if (FailingCommands.empty())
     return false;
 
-  // CUDA/HIP can have the same input source code compiled multiple times so do
-  // not compiled again if there are already failures. It is OK to abort the
-  // CUDA pipeline on errors.
-  if (A->isOffloading(Action::OFK_Cuda) || A->isOffloading(Action::OFK_HIP))
+  // CUDA/HIP/SYCL can have the same input source code compiled multiple times
+  // so do not compile again if there are already failures. It is OK to abort
+  // the CUDA/HIP/SYCL pipeline on errors.
+  if (A->isOffloading(Action::OFK_Cuda) || A->isOffloading(Action::OFK_HIP) ||
+      A->isOffloading(Action::OFK_SYCL))
     return true;
 
   for (const auto &CI : FailingCommands)
diff --git a/clang/lib/Driver/Driver.cpp b/clang/lib/Driver/Driver.cpp
index dc84c1b..528b7d1 100644
--- a/clang/lib/Driver/Driver.cpp
+++ b/clang/lib/Driver/Driver.cpp
@@ -43,6 +43,8 @@
 #include "ToolChains/PS4CPU.h"
 #include "ToolChains/RISCVToolchain.h"
 #include "ToolChains/SPIRV.h"
+#include "ToolChains/SPIRVOpenMP.h"
+#include "ToolChains/SYCL.h"
 #include "ToolChains/Solaris.h"
 #include "ToolChains/TCE.h"
 #include "ToolChains/UEFI.h"
@@ -780,6 +782,35 @@ Driver::OpenMPRuntimeKind Driver::getOpenMPRuntime(const ArgList &Args) const {
   return RT;
 }
 
+static llvm::Triple getSYCLDeviceTriple(StringRef TargetArch) {
+  SmallVector<StringRef, 5> SYCLAlias = {"spir", "spir64", "spirv", "spirv32",
+                                         "spirv64"};
+  if (llvm::is_contained(SYCLAlias, TargetArch)) {
+    llvm::Triple TargetTriple;
+    TargetTriple.setArchName(TargetArch);
+    TargetTriple.setVendor(llvm::Triple::UnknownVendor);
+    TargetTriple.setOS(llvm::Triple::UnknownOS);
+    return TargetTriple;
+  }
+  return llvm::Triple(TargetArch);
+}
+
+static bool addSYCLDefaultTriple(Compilation &C,
+                                 SmallVectorImpl<llvm::Triple> &SYCLTriples) {
+  // Check current set of triples to see if the default has already been set.
+  for (const auto &SYCLTriple : SYCLTriples) {
+    if (SYCLTriple.getSubArch() == llvm::Triple::NoSubArch &&
+        SYCLTriple.isSPIROrSPIRV())
+      return false;
+  }
+  // Add the default triple as it was not found.
+  llvm::Triple DefaultTriple = getSYCLDeviceTriple(
+      C.getDefaultToolChain().getTriple().isArch32Bit() ? "spirv32"
+                                                        : "spirv64");
+  SYCLTriples.insert(SYCLTriples.begin(), DefaultTriple);
+  return true;
+}
+
 void Driver::CreateOffloadingDeviceToolChains(Compilation &C,
                                               InputList &Inputs) {
 
@@ -841,7 +872,6 @@ void Driver::CreateOffloadingDeviceToolChains(Compilation &C,
       return;
     auto *HIPTC = &getOffloadingDeviceToolChain(C.getInputArgs(), *HIPTriple,
                                                 *HostTC, OFK);
-    assert(HIPTC && "Could not create offloading device tool chain.");
     C.addOffloadDeviceToolChain(HIPTC, OFK);
   }
 
@@ -890,9 +920,9 @@ void Driver::CreateOffloadingDeviceToolChains(Compilation &C,
                                                       HostTC->getTriple());
 
       // Attempt to deduce the offloading triple from the set of architectures.
-      // We can only correctly deduce NVPTX / AMDGPU triples currently. We need
-      // to temporarily create these toolchains so that we can access tools for
-      // inferring architectures.
+      // We can only correctly deduce NVPTX / AMDGPU triples currently.
+      // We need to temporarily create these toolchains so that we can access
+      // tools for inferring architectures.
       llvm::DenseSet<StringRef> Archs;
       if (NVPTXTriple) {
         auto TempTC = std::make_unique<toolchains::CudaToolChain>(
@@ -962,7 +992,7 @@ void Driver::CreateOffloadingDeviceToolChains(Compilation &C,
         const ToolChain *TC;
         // Device toolchains have to be selected differently. They pair host
         // and device in their implementation.
-        if (TT.isNVPTX() || TT.isAMDGCN()) {
+        if (TT.isNVPTX() || TT.isAMDGCN() || TT.isSPIRV()) {
           const ToolChain *HostTC =
               C.getSingleOffloadToolChain<Action::OFK_Host>();
           assert(HostTC && "Host toolchain should be always defined.");
@@ -975,6 +1005,9 @@ void Driver::CreateOffloadingDeviceToolChains(Compilation &C,
             else if (TT.isAMDGCN())
               DeviceTC = std::make_unique<toolchains::AMDGPUOpenMPToolChain>(
                   *this, TT, *HostTC, C.getInputArgs());
+            else if (TT.isSPIRV())
+              DeviceTC = std::make_unique<toolchains::SPIRVOpenMPToolChain>(
+                  *this, TT, *HostTC, C.getInputArgs());
             else
               assert(DeviceTC && "Device toolchain not defined.");
           }
@@ -993,11 +1026,71 @@ void Driver::CreateOffloadingDeviceToolChains(Compilation &C,
     return;
   }
 
+  // We need to generate a SYCL toolchain if the user specified -fsycl.
+  bool IsSYCL = C.getInputArgs().hasFlag(options::OPT_fsycl,
+                                         options::OPT_fno_sycl, false);
+
+  auto argSYCLIncompatible = [&](OptSpecifier OptId) {
+    if (!IsSYCL)
+      return;
+    if (Arg *IncompatArg = C.getInputArgs().getLastArg(OptId))
+      Diag(clang::diag::err_drv_argument_not_allowed_with)
+          << IncompatArg->getSpelling() << "-fsycl";
+  };
+  // -static-libstdc++ is not compatible with -fsycl.
+  argSYCLIncompatible(options::OPT_static_libstdcxx);
+  // -ffreestanding cannot be used with -fsycl
+  argSYCLIncompatible(options::OPT_ffreestanding);
+
+  llvm::SmallVector<llvm::Triple, 4> UniqueSYCLTriplesVec;
+
+  if (IsSYCL) {
+    addSYCLDefaultTriple(C, UniqueSYCLTriplesVec);
+
+    // We'll need to use the SYCL and host triples as the key into
+    // getOffloadingDeviceToolChain, because the device toolchains we're
+    // going to create will depend on both.
+    const ToolChain *HostTC = C.getSingleOffloadToolChain<Action::OFK_Host>();
+    for (const auto &TargetTriple : UniqueSYCLTriplesVec) {
+      auto SYCLTC = &getOffloadingDeviceToolChain(
+          C.getInputArgs(), TargetTriple, *HostTC, Action::OFK_SYCL);
+      C.addOffloadDeviceToolChain(SYCLTC, Action::OFK_SYCL);
+    }
+  }
+
   //
   // TODO: Add support for other offloading programming models here.
   //
 }
 
+bool Driver::loadZOSCustomizationFile(llvm::cl::ExpansionContext &ExpCtx) {
+  if (IsCLMode() || IsDXCMode() || IsFlangMode())
+    return false;
+
+  SmallString<128> CustomizationFile;
+  StringRef PathLIBEnv = StringRef(getenv("CLANG_CONFIG_PATH")).trim();
+  // If the env var is a directory then append "/clang.cfg" and treat
+  // that as the config file.  Otherwise treat the env var as the
+  // config file.
+  if (!PathLIBEnv.empty()) {
+    llvm::sys::path::append(CustomizationFile, PathLIBEnv);
+    if (llvm::sys::fs::is_directory(PathLIBEnv))
+      llvm::sys::path::append(CustomizationFile, "/clang.cfg");
+    if (llvm::sys::fs::is_regular_file(CustomizationFile))
+      return readConfigFile(CustomizationFile, ExpCtx);
+    Diag(diag::err_drv_config_file_not_found) << CustomizationFile;
+    return true;
+  }
+
+  SmallString<128> BaseDir(llvm::sys::path::parent_path(Dir));
+  llvm::sys::path::append(CustomizationFile, BaseDir + "/etc/clang.cfg");
+  if (llvm::sys::fs::is_regular_file(CustomizationFile))
+    return readConfigFile(CustomizationFile, ExpCtx);
+
+  // If no customization file, just return
+  return false;
+}
+
 static void appendOneArg(InputArgList &Args, const Arg *Opt) {
   // The args for config files or /clang: flags belong to different InputArgList
   // objects than Args. This copies an Arg from one of those other InputArgLists
@@ -1219,11 +1312,18 @@ bool Driver::loadDefaultConfigFiles(llvm::cl::ExpansionContext &ExpCtx) {
   }
 
   // Otherwise, use the real triple as used by the driver.
+  llvm::Triple RealTriple =
+      computeTargetTriple(*this, TargetTriple, *CLOptions);
   if (Triple.str().empty()) {
-    Triple = computeTargetTriple(*this, TargetTriple, *CLOptions);
+    Triple = RealTriple;
     assert(!Triple.str().empty());
   }
 
+  // On z/OS, start by loading the customization file before loading
+  // the usual default config file(s).
+  if (RealTriple.isOSzOS() && loadZOSCustomizationFile(ExpCtx))
+    return true;
+
   // Search for config files in the following order:
   // 1. <triple>-<mode>.cfg using real driver mode
   //    (e.g. i386-pc-linux-gnu-clang++.cfg).
@@ -4230,6 +4330,7 @@ void Driver::BuildActions(Compilation &C, DerivedArgList &Args,
 
   bool UseNewOffloadingDriver =
       C.isOffloadingHostKind(Action::OFK_OpenMP) ||
+      C.isOffloadingHostKind(Action::OFK_SYCL) ||
       Args.hasFlag(options::OPT_foffload_via_llvm,
                    options::OPT_fno_offload_via_llvm, false) ||
       Args.hasFlag(options::OPT_offload_new_driver,
@@ -4647,6 +4748,8 @@ Driver::getOffloadArchs(Compilation &C, const llvm::opt::DerivedArgList &Args,
       Archs.insert(OffloadArchToString(OffloadArch::HIPDefault));
     else if (Kind == Action::OFK_OpenMP)
       Archs.insert(StringRef());
+    else if (Kind == Action::OFK_SYCL)
+      Archs.insert(StringRef());
   } else {
     Args.ClaimAllArgs(options::OPT_offload_arch_EQ);
     Args.ClaimAllArgs(options::OPT_no_offload_arch_EQ);
@@ -4671,7 +4774,7 @@ Action *Driver::BuildOffloadingActions(Compilation &C,
   OffloadAction::DeviceDependences DDeps;
 
   const Action::OffloadKind OffloadKinds[] = {
-      Action::OFK_OpenMP, Action::OFK_Cuda, Action::OFK_HIP};
+      Action::OFK_OpenMP, Action::OFK_Cuda, Action::OFK_HIP, Action::OFK_SYCL};
 
   for (Action::OffloadKind Kind : OffloadKinds) {
     SmallVector<const ToolChain *, 2> ToolChains;
@@ -4708,6 +4811,15 @@ Action *Driver::BuildOffloadingActions(Compilation &C,
     if (DeviceActions.empty())
       return HostAction;
 
+    // FIXME: Do not collapse the host side for Darwin targets with SYCL offload
+    // compilations. The toolchain is not properly initialized for the target.
+    if (isa<CompileJobAction>(HostAction) && Kind == Action::OFK_SYCL &&
+        HostAction->getType() != types::TY_Nothing &&
+        C.getSingleOffloadToolChain<Action::OFK_Host>()
+            ->getTriple()
+            .isOSDarwin())
+      HostAction->setCannotBeCollapsedWithNextDependentAction();
+
     auto PL = types::getCompilationPhases(*this, Args, InputType);
 
     for (phases::ID Phase : PL) {
@@ -4716,6 +4828,11 @@ Action *Driver::BuildOffloadingActions(Compilation &C,
         break;
       }
 
+      // Assemble actions are not used for the SYCL device side.  Both compile
+      // and backend actions are used to generate IR and textual IR if needed.
+      if (Kind == Action::OFK_SYCL && Phase == phases::Assemble)
+        continue;
+
       auto TCAndArch = TCAndArchs.begin();
       for (Action *&A : DeviceActions) {
         if (A->getType() == types::TY_Nothing)
@@ -4954,6 +5071,7 @@ Action *Driver::ConstructPhaseAction(
       return C.MakeAction<BackendJobAction>(Input, Output);
     }
     if (Args.hasArg(options::OPT_emit_llvm) ||
+        TargetDeviceOffloadKind == Action::OFK_SYCL ||
         (((Input->getOffloadingToolChain() &&
            Input->getOffloadingToolChain()->getTriple().isAMDGPU()) ||
           TargetDeviceOffloadKind == Action::OFK_HIP) &&
@@ -6603,6 +6721,8 @@ const ToolChain &Driver::getToolChain(const ArgList &Args,
           TC = std::make_unique<toolchains::BareMetal>(*this, Target, Args);
         else if (Target.isOSBinFormatELF())
           TC = std::make_unique<toolchains::Generic_ELF>(*this, Target, Args);
+        else if (Target.isAppleMachO())
+          TC = std::make_unique<toolchains::AppleMachO>(*this, Target, Args);
         else if (Target.isOSBinFormatMachO())
           TC = std::make_unique<toolchains::MachO>(*this, Target, Args);
         else
@@ -6640,11 +6760,16 @@ const ToolChain &Driver::getOffloadingDeviceToolChain(
                                                            HostTC, Args);
       break;
     }
+    case Action::OFK_SYCL:
+      if (Target.isSPIROrSPIRV())
+        TC = std::make_unique<toolchains::SYCLToolChain>(*this, Target, HostTC,
+                                                         Args);
+      break;
     default:
       break;
     }
   }
-
+  assert(TC && "Could not create offloading device tool chain.");
   return *TC;
 }
 
diff --git a/clang/lib/Driver/SanitizerArgs.cpp b/clang/lib/Driver/SanitizerArgs.cpp
index 7726e46..98116e2 100644
--- a/clang/lib/Driver/SanitizerArgs.cpp
+++ b/clang/lib/Driver/SanitizerArgs.cpp
@@ -338,6 +338,14 @@ bool SanitizerArgs::needsUbsanRt() const {
          CoverageFeatures;
 }
 
+bool SanitizerArgs::needsUbsanCXXRt() const {
+  // Link UBSAN C++ runtime very selectively, as it's needed in only very
+  // specific cases, but forces the program to depend on C++ ABI. UBSAN C++
+  // runtime is not included with other sanitizers.
+  return static_cast<bool>(Sanitizers.Mask & NeedsUbsanCxxRt &
+                           ~TrapSanitizers.Mask);
+}
+
 bool SanitizerArgs::needsCfiRt() const {
   return !(Sanitizers.Mask & SanitizerKind::CFI & ~TrapSanitizers.Mask) &&
          CfiCrossDso && !ImplicitCfiRuntime;
diff --git a/clang/lib/Driver/ToolChain.cpp b/clang/lib/Driver/ToolChain.cpp
index 9f174fb..2b4df64 100644
--- a/clang/lib/Driver/ToolChain.cpp
+++ b/clang/lib/Driver/ToolChain.cpp
@@ -1485,6 +1485,9 @@ void ToolChain::AddCudaIncludeArgs(const ArgList &DriverArgs,
 void ToolChain::AddHIPIncludeArgs(const ArgList &DriverArgs,
                                   ArgStringList &CC1Args) const {}
 
+void ToolChain::addSYCLIncludeArgs(const ArgList &DriverArgs,
+                                   ArgStringList &CC1Args) const {}
+
 llvm::SmallVector<ToolChain::BitCodeLibraryInfo, 12>
 ToolChain::getDeviceLibs(const ArgList &DriverArgs) const {
   return {};
diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp
index a020e00..c4b5374 100644
--- a/clang/lib/Driver/ToolChains/Clang.cpp
+++ b/clang/lib/Driver/ToolChains/Clang.cpp
@@ -24,6 +24,7 @@
 #include "Hexagon.h"
 #include "MSP430.h"
 #include "PS4CPU.h"
+#include "SYCL.h"
 #include "clang/Basic/CLWarnings.h"
 #include "clang/Basic/CharInfo.h"
 #include "clang/Basic/CodeGenOptions.h"
@@ -122,6 +123,13 @@ forAllAssociatedToolChains(Compilation &C, const JobAction &JA,
   } else if (JA.isDeviceOffloading(Action::OFK_OpenMP))
     Work(*C.getSingleOffloadToolChain<Action::OFK_Host>());
 
+  if (JA.isHostOffloading(Action::OFK_SYCL)) {
+    auto TCs = C.getOffloadToolChains<Action::OFK_SYCL>();
+    for (auto II = TCs.first, IE = TCs.second; II != IE; ++II)
+      Work(*II->second);
+  } else if (JA.isDeviceOffloading(Action::OFK_SYCL))
+    Work(*C.getSingleOffloadToolChain<Action::OFK_Host>());
+
   //
   // TODO: Add support for other offloading programming models here.
   //
@@ -1070,14 +1078,16 @@ void Clang::AddPreprocessingOptions(Compilation &C, const JobAction &JA,
   Args.AddLastArg(CmdArgs, options::OPT_MP);
   Args.AddLastArg(CmdArgs, options::OPT_MV);
 
-  // Add offload include arguments specific for CUDA/HIP.  This must happen
+  // Add offload include arguments specific for CUDA/HIP/SYCL. This must happen
   // before we -I or -include anything else, because we must pick up the
-  // CUDA/HIP headers from the particular CUDA/ROCm installation, rather than
-  // from e.g. /usr/local/include.
+  // CUDA/HIP/SYCL headers from the particular CUDA/ROCm/SYCL installation,
+  // rather than from e.g. /usr/local/include.
   if (JA.isOffloading(Action::OFK_Cuda))
     getToolChain().AddCudaIncludeArgs(Args, CmdArgs);
   if (JA.isOffloading(Action::OFK_HIP))
     getToolChain().AddHIPIncludeArgs(Args, CmdArgs);
+  if (JA.isOffloading(Action::OFK_SYCL))
+    getToolChain().addSYCLIncludeArgs(Args, CmdArgs);
 
   // If we are offloading to a target via OpenMP we need to include the
   // openmp_wrappers folder which contains alternative system headers.
@@ -5037,17 +5047,21 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
   // second input. Module precompilation accepts a list of header files to
   // include as part of the module. API extraction accepts a list of header
   // files whose API information is emitted in the output. All other jobs are
-  // expected to have exactly one input.
+  // expected to have exactly one input. SYCL compilation only expects a
+  // single input.
   bool IsCuda = JA.isOffloading(Action::OFK_Cuda);
   bool IsCudaDevice = JA.isDeviceOffloading(Action::OFK_Cuda);
   bool IsHIP = JA.isOffloading(Action::OFK_HIP);
   bool IsHIPDevice = JA.isDeviceOffloading(Action::OFK_HIP);
+  bool IsSYCL = JA.isOffloading(Action::OFK_SYCL);
+  bool IsSYCLDevice = JA.isDeviceOffloading(Action::OFK_SYCL);
   bool IsOpenMPDevice = JA.isDeviceOffloading(Action::OFK_OpenMP);
   bool IsExtractAPI = isa<ExtractAPIJobAction>(JA);
   bool IsDeviceOffloadAction = !(JA.isDeviceOffloading(Action::OFK_None) ||
                                  JA.isDeviceOffloading(Action::OFK_Host));
   bool IsHostOffloadingAction =
       JA.isHostOffloading(Action::OFK_OpenMP) ||
+      JA.isHostOffloading(Action::OFK_SYCL) ||
       (JA.isHostOffloading(C.getActiveOffloadKinds()) &&
        Args.hasFlag(options::OPT_offload_new_driver,
                     options::OPT_no_offload_new_driver, false));
@@ -5095,12 +5109,13 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
   const llvm::Triple *AuxTriple =
       (IsCuda || IsHIP) ? TC.getAuxTriple() : nullptr;
   bool IsWindowsMSVC = RawTriple.isWindowsMSVCEnvironment();
+  bool IsUEFI = RawTriple.isUEFI();
   bool IsIAMCU = RawTriple.isOSIAMCU();
 
-  // Adjust IsWindowsXYZ for CUDA/HIP compilations.  Even when compiling in
+  // Adjust IsWindowsXYZ for CUDA/HIP/SYCL compilations.  Even when compiling in
   // device mode (i.e., getToolchain().getTriple() is NVPTX/AMDGCN, not
   // Windows), we need to pass Windows-specific flags to cc1.
-  if (IsCuda || IsHIP)
+  if (IsCuda || IsHIP || IsSYCL)
     IsWindowsMSVC |= AuxTriple && AuxTriple->isWindowsMSVCEnvironment();
 
   // C++ is not supported for IAMCU.
@@ -5184,11 +5199,33 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
   if (const Arg *PF = Args.getLastArg(options::OPT_mprintf_kind_EQ))
     PF->claim();
 
-  if (Args.hasFlag(options::OPT_fsycl, options::OPT_fno_sycl, false)) {
-    CmdArgs.push_back("-fsycl-is-device");
+  if (IsSYCL) {
+    if (IsSYCLDevice) {
+      // Host triple is needed when doing SYCL device compilations.
+      llvm::Triple AuxT = C.getDefaultToolChain().getTriple();
+      std::string NormalizedTriple = AuxT.normalize();
+      CmdArgs.push_back("-aux-triple");
+      CmdArgs.push_back(Args.MakeArgString(NormalizedTriple));
 
-    if (Arg *A = Args.getLastArg(options::OPT_sycl_std_EQ)) {
-      A->render(Args, CmdArgs);
+      // We want to compile sycl kernels.
+      CmdArgs.push_back("-fsycl-is-device");
+
+      // Set O2 optimization level by default
+      if (!Args.getLastArg(options::OPT_O_Group))
+        CmdArgs.push_back("-O2");
+    } else {
+      // Add any options that are needed specific to SYCL offload while
+      // performing the host side compilation.
+
+      // Let the front-end host compilation flow know about SYCL offload
+      // compilation.
+      CmdArgs.push_back("-fsycl-is-host");
+    }
+
+    // Set options for both host and device.
+    Arg *SYCLStdArg = Args.getLastArg(options::OPT_sycl_std_EQ);
+    if (SYCLStdArg) {
+      SYCLStdArg->render(Args, CmdArgs);
     } else {
       // Ensure the default version in SYCL mode is 2020.
       CmdArgs.push_back("-sycl-std=2020");
@@ -6135,7 +6172,7 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
   // Prepare `-aux-target-cpu` and `-aux-target-feature` unless
   // `--gpu-use-aux-triple-only` is specified.
   if (!Args.getLastArg(options::OPT_gpu_use_aux_triple_only) &&
-      (IsCudaDevice || IsHIPDevice)) {
+      (IsCudaDevice || IsHIPDevice || IsSYCLDevice)) {
     const ArgList &HostArgs =
         C.getArgsForToolChain(nullptr, StringRef(), Action::OFK_None);
     std::string HostCPU =
@@ -7216,7 +7253,7 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
 
   // -fms-extensions=0 is default.
   if (Args.hasFlag(options::OPT_fms_extensions, options::OPT_fno_ms_extensions,
-                   IsWindowsMSVC))
+                   IsWindowsMSVC || IsUEFI))
     CmdArgs.push_back("-fms-extensions");
 
   // -fms-compatibility=0 is default.
@@ -8010,15 +8047,19 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
     }
   }
 
-  if (Args.hasArg(options::OPT_forder_file_instrumentation)) {
-     CmdArgs.push_back("-forder-file-instrumentation");
-     // Enable order file instrumentation when ThinLTO is not on. When ThinLTO is
-     // on, we need to pass these flags as linker flags and that will be handled
-     // outside of the compiler.
-     if (!IsUsingLTO) {
-       CmdArgs.push_back("-mllvm");
-       CmdArgs.push_back("-enable-order-file-instrumentation");
-     }
+  if (const Arg *A =
+          Args.getLastArg(options::OPT_forder_file_instrumentation)) {
+    D.Diag(diag::warn_drv_deprecated_arg)
+        << A->getAsString(Args) << /*hasReplacement=*/true
+        << "-mllvm -pgo-temporal-instrumentation";
+    CmdArgs.push_back("-forder-file-instrumentation");
+    // Enable order file instrumentation when ThinLTO is not on. When ThinLTO is
+    // on, we need to pass these flags as linker flags and that will be handled
+    // outside of the compiler.
+    if (!IsUsingLTO) {
+      CmdArgs.push_back("-mllvm");
+      CmdArgs.push_back("-enable-order-file-instrumentation");
+    }
   }
 
   if (Arg *A = Args.getLastArg(options::OPT_fforce_enable_int128,
@@ -9241,6 +9282,10 @@ void LinkerWrapper::ConstructJob(Compilation &C, const JobAction &JA,
   if (const Arg *A = Args.getLastArg(options::OPT_Rpass_analysis_EQ))
     CmdArgs.push_back(Args.MakeArgString(
         Twine("--offload-opt=-pass-remarks-analysis=") + A->getValue()));
+
+  if (Args.getLastArg(options::OPT_ftime_report))
+    CmdArgs.push_back("--device-compiler=-ftime-report");
+
   if (Args.getLastArg(options::OPT_save_temps_EQ))
     CmdArgs.push_back("--save-temps");
 
diff --git a/clang/lib/Driver/ToolChains/CommonArgs.cpp b/clang/lib/Driver/ToolChains/CommonArgs.cpp
index e33fa44..f896789 100644
--- a/clang/lib/Driver/ToolChains/CommonArgs.cpp
+++ b/clang/lib/Driver/ToolChains/CommonArgs.cpp
@@ -708,6 +708,11 @@ std::string tools::getCPUName(const Driver &D, const ArgList &Args,
   case llvm::Triple::loongarch32:
   case llvm::Triple::loongarch64:
     return loongarch::getLoongArchTargetCPU(Args, T);
+
+  case llvm::Triple::xtensa:
+    if (const Arg *A = Args.getLastArg(options::OPT_mcpu_EQ))
+      return A->getValue();
+    return "";
   }
 }
 
@@ -1204,6 +1209,10 @@ void tools::addLTOOptions(const ToolChain &ToolChain, const ArgList &Args,
   if (ImplicitMapSyms)
     CmdArgs.push_back(
         Args.MakeArgString(Twine(PluginOptPrefix) + "-implicit-mapsyms"));
+
+  if (Args.hasArg(options::OPT_ftime_report))
+    CmdArgs.push_back(
+        Args.MakeArgString(Twine(PluginOptPrefix) + "-time-passes"));
 }
 
 void tools::addOpenMPRuntimeLibraryPath(const ToolChain &TC,
@@ -1454,6 +1463,7 @@ collectSanitizerRuntimes(const ToolChain &TC, const ArgList &Args,
                          SmallVectorImpl<StringRef> &NonWholeStaticRuntimes,
                          SmallVectorImpl<StringRef> &HelperStaticRuntimes,
                          SmallVectorImpl<StringRef> &RequiredSymbols) {
+  assert(!TC.getTriple().isOSDarwin() && "it's not used by Darwin");
   const SanitizerArgs &SanArgs = TC.getSanitizerArgs(Args);
   // Collect shared runtimes.
   if (SanArgs.needsSharedRt()) {
@@ -1574,7 +1584,7 @@ collectSanitizerRuntimes(const ToolChain &TC, const ArgList &Args,
       StaticRuntimes.push_back("cfi_diag");
   }
   if (SanArgs.linkCXXRuntimes() && !SanArgs.requiresMinimalRuntime() &&
-      ((!SanArgs.needsSharedRt() && SanArgs.needsUbsanRt()) ||
+      ((!SanArgs.needsSharedRt() && SanArgs.needsUbsanCXXRt()) ||
        SanArgs.needsCfiDiagRt())) {
     StaticRuntimes.push_back("ubsan_standalone_cxx");
   }
@@ -2833,12 +2843,14 @@ void tools::addOpenMPDeviceRTL(const Driver &D,
     LibraryPaths.emplace_back(LibPath);
 
   OptSpecifier LibomptargetBCPathOpt =
-      Triple.isAMDGCN() ? options::OPT_libomptarget_amdgpu_bc_path_EQ
-                        : options::OPT_libomptarget_nvptx_bc_path_EQ;
-
-  StringRef ArchPrefix = Triple.isAMDGCN() ? "amdgpu" : "nvptx";
-  std::string LibOmpTargetName =
-      ("libomptarget-" + ArchPrefix + "-" + BitcodeSuffix + ".bc").str();
+      Triple.isAMDGCN()  ? options::OPT_libomptarget_amdgpu_bc_path_EQ
+      : Triple.isNVPTX() ? options::OPT_libomptarget_nvptx_bc_path_EQ
+                         : options::OPT_libomptarget_spirv_bc_path_EQ;
+
+  StringRef ArchPrefix = Triple.isAMDGCN()  ? "amdgpu"
+                         : Triple.isNVPTX() ? "nvptx"
+                                            : "spirv64";
+  std::string LibOmpTargetName = ("libomptarget-" + ArchPrefix + ".bc").str();
 
   // First check whether user specifies bc library
   if (const Arg *A = DriverArgs.getLastArg(LibomptargetBCPathOpt)) {
diff --git a/clang/lib/Driver/ToolChains/Cuda.cpp b/clang/lib/Driver/ToolChains/Cuda.cpp
index 1027948..214f1e5 100644
--- a/clang/lib/Driver/ToolChains/Cuda.cpp
+++ b/clang/lib/Driver/ToolChains/Cuda.cpp
@@ -851,7 +851,6 @@ void CudaToolChain::addClangTargetOptions(
   HostTC.addClangTargetOptions(DriverArgs, CC1Args, DeviceOffloadingKind);
 
   StringRef GpuArch = DriverArgs.getLastArgValue(options::OPT_march_EQ);
-  assert(!GpuArch.empty() && "Must have an explicit GPU arch.");
   assert((DeviceOffloadingKind == Action::OFK_OpenMP ||
           DeviceOffloadingKind == Action::OFK_Cuda) &&
          "Only OpenMP or CUDA offloading kinds are supported for NVIDIA GPUs.");
diff --git a/clang/lib/Driver/ToolChains/Darwin.cpp b/clang/lib/Driver/ToolChains/Darwin.cpp
index 4105d38..e5dffb1 100644
--- a/clang/lib/Driver/ToolChains/Darwin.cpp
+++ b/clang/lib/Driver/ToolChains/Darwin.cpp
@@ -966,10 +966,14 @@ MachO::MachO(const Driver &D, const llvm::Triple &Triple, const ArgList &Args)
   getProgramPaths().push_back(getDriver().Dir);
 }
 
+AppleMachO::AppleMachO(const Driver &D, const llvm::Triple &Triple,
+                       const ArgList &Args)
+    : MachO(D, Triple, Args), CudaInstallation(D, Triple, Args),
+      RocmInstallation(D, Triple, Args), SYCLInstallation(D, Triple, Args) {}
+
 /// Darwin - Darwin tool chain for i386 and x86_64.
 Darwin::Darwin(const Driver &D, const llvm::Triple &Triple, const ArgList &Args)
-    : MachO(D, Triple, Args), TargetInitialized(false),
-      CudaInstallation(D, Triple, Args), RocmInstallation(D, Triple, Args) {}
+    : AppleMachO(D, Triple, Args), TargetInitialized(false) {}
 
 types::ID MachO::LookupTypeForExtension(StringRef Ext) const {
   types::ID Ty = ToolChain::LookupTypeForExtension(Ext);
@@ -1018,16 +1022,21 @@ bool Darwin::hasBlocksRuntime() const {
   }
 }
 
-void Darwin::AddCudaIncludeArgs(const ArgList &DriverArgs,
-                                ArgStringList &CC1Args) const {
+void AppleMachO::AddCudaIncludeArgs(const ArgList &DriverArgs,
+                                    ArgStringList &CC1Args) const {
   CudaInstallation->AddCudaIncludeArgs(DriverArgs, CC1Args);
 }
 
-void Darwin::AddHIPIncludeArgs(const ArgList &DriverArgs,
-                               ArgStringList &CC1Args) const {
+void AppleMachO::AddHIPIncludeArgs(const ArgList &DriverArgs,
+                                   ArgStringList &CC1Args) const {
   RocmInstallation->AddHIPIncludeArgs(DriverArgs, CC1Args);
 }
 
+void AppleMachO::addSYCLIncludeArgs(const ArgList &DriverArgs,
+                                    ArgStringList &CC1Args) const {
+  SYCLInstallation->addSYCLIncludeArgs(DriverArgs, CC1Args);
+}
+
 // This is just a MachO name translation routine and there's no
 // way to join this into ARMTargetParser without breaking all
 // other assumptions. Maybe MachO should consider standardising
@@ -1119,6 +1128,8 @@ VersionTuple MachO::getLinkerVersion(const llvm::opt::ArgList &Args) const {
 
 Darwin::~Darwin() {}
 
+AppleMachO::~AppleMachO() {}
+
 MachO::~MachO() {}
 
 std::string Darwin::ComputeEffectiveClangTriple(const ArgList &Args,
@@ -2482,7 +2493,7 @@ static void AppendPlatformPrefix(SmallString<128> &Path,
 // Returns the effective sysroot from either -isysroot or --sysroot, plus the
 // platform prefix (if any).
 llvm::SmallString<128>
-DarwinClang::GetEffectiveSysroot(const llvm::opt::ArgList &DriverArgs) const {
+AppleMachO::GetEffectiveSysroot(const llvm::opt::ArgList &DriverArgs) const {
   llvm::SmallString<128> Path("/");
   if (DriverArgs.hasArg(options::OPT_isysroot))
     Path = DriverArgs.getLastArgValue(options::OPT_isysroot);
@@ -2495,8 +2506,9 @@ DarwinClang::GetEffectiveSysroot(const llvm::opt::ArgList &DriverArgs) const {
   return Path;
 }
 
-void DarwinClang::AddClangSystemIncludeArgs(const llvm::opt::ArgList &DriverArgs,
-                                            llvm::opt::ArgStringList &CC1Args) const {
+void AppleMachO::AddClangSystemIncludeArgs(
+    const llvm::opt::ArgList &DriverArgs,
+    llvm::opt::ArgStringList &CC1Args) const {
   const Driver &D = getDriver();
 
   llvm::SmallString<128> Sysroot = GetEffectiveSysroot(DriverArgs);
@@ -2574,7 +2586,7 @@ bool DarwinClang::AddGnuCPlusPlusIncludePaths(const llvm::opt::ArgList &DriverAr
   return getVFS().exists(Base);
 }
 
-void DarwinClang::AddClangCXXStdlibIncludeArgs(
+void AppleMachO::AddClangCXXStdlibIncludeArgs(
     const llvm::opt::ArgList &DriverArgs,
     llvm::opt::ArgStringList &CC1Args) const {
   // The implementation from a base class will pass through the -stdlib to
@@ -2631,55 +2643,60 @@ void DarwinClang::AddClangCXXStdlibIncludeArgs(
   }
 
   case ToolChain::CST_Libstdcxx:
-    llvm::SmallString<128> UsrIncludeCxx = Sysroot;
-    llvm::sys::path::append(UsrIncludeCxx, "usr", "include", "c++");
-
-    llvm::Triple::ArchType arch = getTriple().getArch();
-    bool IsBaseFound = true;
-    switch (arch) {
-    default: break;
-
-    case llvm::Triple::x86:
-    case llvm::Triple::x86_64:
-      IsBaseFound = AddGnuCPlusPlusIncludePaths(DriverArgs, CC1Args, UsrIncludeCxx,
-                                                "4.2.1",
-                                                "i686-apple-darwin10",
-                                                arch == llvm::Triple::x86_64 ? "x86_64" : "");
-      IsBaseFound |= AddGnuCPlusPlusIncludePaths(DriverArgs, CC1Args, UsrIncludeCxx,
-                                                "4.0.0", "i686-apple-darwin8",
-                                                 "");
-      break;
+    AddGnuCPlusPlusIncludePaths(DriverArgs, CC1Args);
+    break;
+  }
+}
 
-    case llvm::Triple::arm:
-    case llvm::Triple::thumb:
-      IsBaseFound = AddGnuCPlusPlusIncludePaths(DriverArgs, CC1Args, UsrIncludeCxx,
-                                                "4.2.1",
-                                                "arm-apple-darwin10",
-                                                "v7");
-      IsBaseFound |= AddGnuCPlusPlusIncludePaths(DriverArgs, CC1Args, UsrIncludeCxx,
-                                                "4.2.1",
-                                                "arm-apple-darwin10",
-                                                 "v6");
-      break;
+void AppleMachO::AddGnuCPlusPlusIncludePaths(
+    const llvm::opt::ArgList &DriverArgs,
+    llvm::opt::ArgStringList &CC1Args) const {}
 
-    case llvm::Triple::aarch64:
-      IsBaseFound = AddGnuCPlusPlusIncludePaths(DriverArgs, CC1Args, UsrIncludeCxx,
-                                                "4.2.1",
-                                                "arm64-apple-darwin10",
-                                                "");
-      break;
-    }
+void DarwinClang::AddGnuCPlusPlusIncludePaths(
+    const llvm::opt::ArgList &DriverArgs,
+    llvm::opt::ArgStringList &CC1Args) const {
+  llvm::SmallString<128> UsrIncludeCxx = GetEffectiveSysroot(DriverArgs);
+  llvm::sys::path::append(UsrIncludeCxx, "usr", "include", "c++");
 
-    if (!IsBaseFound) {
-      getDriver().Diag(diag::warn_drv_libstdcxx_not_found);
-    }
+  llvm::Triple::ArchType arch = getTriple().getArch();
+  bool IsBaseFound = true;
+  switch (arch) {
+  default:
+    break;
+
+  case llvm::Triple::x86:
+  case llvm::Triple::x86_64:
+    IsBaseFound = AddGnuCPlusPlusIncludePaths(
+        DriverArgs, CC1Args, UsrIncludeCxx, "4.2.1", "i686-apple-darwin10",
+        arch == llvm::Triple::x86_64 ? "x86_64" : "");
+    IsBaseFound |= AddGnuCPlusPlusIncludePaths(
+        DriverArgs, CC1Args, UsrIncludeCxx, "4.0.0", "i686-apple-darwin8", "");
+    break;
 
+  case llvm::Triple::arm:
+  case llvm::Triple::thumb:
+    IsBaseFound =
+        AddGnuCPlusPlusIncludePaths(DriverArgs, CC1Args, UsrIncludeCxx, "4.2.1",
+                                    "arm-apple-darwin10", "v7");
+    IsBaseFound |=
+        AddGnuCPlusPlusIncludePaths(DriverArgs, CC1Args, UsrIncludeCxx, "4.2.1",
+                                    "arm-apple-darwin10", "v6");
+    break;
+
+  case llvm::Triple::aarch64:
+    IsBaseFound =
+        AddGnuCPlusPlusIncludePaths(DriverArgs, CC1Args, UsrIncludeCxx, "4.2.1",
+                                    "arm64-apple-darwin10", "");
     break;
   }
+
+  if (!IsBaseFound) {
+    getDriver().Diag(diag::warn_drv_libstdcxx_not_found);
+  }
 }
 
-void DarwinClang::AddCXXStdlibLibArgs(const ArgList &Args,
-                                      ArgStringList &CmdArgs) const {
+void AppleMachO::AddCXXStdlibLibArgs(const ArgList &Args,
+                                     ArgStringList &CmdArgs) const {
   CXXStdlibType Type = GetCXXStdlibType(Args);
 
   switch (Type) {
@@ -3615,7 +3632,7 @@ SanitizerMask Darwin::getSupportedSanitizers() const {
   return Res;
 }
 
-void Darwin::printVerboseInfo(raw_ostream &OS) const {
+void AppleMachO::printVerboseInfo(raw_ostream &OS) const {
   CudaInstallation->print(OS);
   RocmInstallation->print(OS);
 }
diff --git a/clang/lib/Driver/ToolChains/Darwin.h b/clang/lib/Driver/ToolChains/Darwin.h
index 2e55b496..c44780c 100644
--- a/clang/lib/Driver/ToolChains/Darwin.h
+++ b/clang/lib/Driver/ToolChains/Darwin.h
@@ -12,6 +12,7 @@
 #include "Cuda.h"
 #include "LazyDetector.h"
 #include "ROCm.h"
+#include "SYCL.h"
 #include "clang/Basic/DarwinSDKInfo.h"
 #include "clang/Basic/LangOptions.h"
 #include "clang/Driver/Tool.h"
@@ -290,8 +291,52 @@ public:
   /// }
 };
 
+/// Apple specific MachO extensions
+class LLVM_LIBRARY_VISIBILITY AppleMachO : public MachO {
+public:
+  AppleMachO(const Driver &D, const llvm::Triple &Triple,
+             const llvm::opt::ArgList &Args);
+  ~AppleMachO() override;
+
+  /// }
+  /// @name Apple Specific ToolChain Implementation
+  /// {
+  void
+  AddClangSystemIncludeArgs(const llvm::opt::ArgList &DriverArgs,
+                            llvm::opt::ArgStringList &CC1Args) const override;
+
+  void AddCudaIncludeArgs(const llvm::opt::ArgList &DriverArgs,
+                          llvm::opt::ArgStringList &CC1Args) const override;
+  void AddHIPIncludeArgs(const llvm::opt::ArgList &DriverArgs,
+                         llvm::opt::ArgStringList &CC1Args) const override;
+  void addSYCLIncludeArgs(const llvm::opt::ArgList &DriverArgs,
+                          llvm::opt::ArgStringList &CC1Args) const override;
+
+  void AddClangCXXStdlibIncludeArgs(
+      const llvm::opt::ArgList &DriverArgs,
+      llvm::opt::ArgStringList &CC1Args) const override;
+  void AddCXXStdlibLibArgs(const llvm::opt::ArgList &Args,
+                           llvm::opt::ArgStringList &CmdArgs) const override;
+
+  void printVerboseInfo(raw_ostream &OS) const override;
+  /// }
+
+  LazyDetector<CudaInstallationDetector> CudaInstallation;
+  LazyDetector<RocmInstallationDetector> RocmInstallation;
+  LazyDetector<SYCLInstallationDetector> SYCLInstallation;
+
+protected:
+  llvm::SmallString<128>
+  GetEffectiveSysroot(const llvm::opt::ArgList &DriverArgs) const;
+
+private:
+  virtual void
+  AddGnuCPlusPlusIncludePaths(const llvm::opt::ArgList &DriverArgs,
+                              llvm::opt::ArgStringList &CC1Args) const;
+};
+
 /// Darwin - The base Darwin tool chain.
-class LLVM_LIBRARY_VISIBILITY Darwin : public MachO {
+class LLVM_LIBRARY_VISIBILITY Darwin : public AppleMachO {
 public:
   /// Whether the information on the target has been initialized.
   //
@@ -329,9 +374,6 @@ public:
   /// The target variant triple that was specified (if any).
   mutable std::optional<llvm::Triple> TargetVariantTriple;
 
-  LazyDetector<CudaInstallationDetector> CudaInstallation;
-  LazyDetector<RocmInstallationDetector> RocmInstallation;
-
 private:
   void AddDeploymentTarget(llvm::opt::DerivedArgList &Args) const;
 
@@ -343,7 +385,7 @@ public:
   std::string ComputeEffectiveClangTriple(const llvm::opt::ArgList &Args,
                                           types::ID InputType) const override;
 
-  /// @name Apple Specific Toolchain Implementation
+  /// @name Darwin Specific Toolchain Implementation
   /// {
 
   void addMinVersionArgs(const llvm::opt::ArgList &Args,
@@ -559,11 +601,6 @@ public:
   ObjCRuntime getDefaultObjCRuntime(bool isNonFragile) const override;
   bool hasBlocksRuntime() const override;
 
-  void AddCudaIncludeArgs(const llvm::opt::ArgList &DriverArgs,
-                          llvm::opt::ArgStringList &CC1Args) const override;
-  void AddHIPIncludeArgs(const llvm::opt::ArgList &DriverArgs,
-                         llvm::opt::ArgStringList &CC1Args) const override;
-
   bool UseObjCMixedDispatch() const override {
     // This is only used with the non-fragile ABI and non-legacy dispatch.
 
@@ -594,8 +631,6 @@ public:
   bool SupportsEmbeddedBitcode() const override;
 
   SanitizerMask getSupportedSanitizers() const override;
-
-  void printVerboseInfo(raw_ostream &OS) const override;
 };
 
 /// DarwinClang - The Darwin toolchain used by Clang.
@@ -613,16 +648,6 @@ public:
                              llvm::opt::ArgStringList &CmdArgs,
                              bool ForceLinkBuiltinRT = false) const override;
 
-  void AddClangCXXStdlibIncludeArgs(
-      const llvm::opt::ArgList &DriverArgs,
-      llvm::opt::ArgStringList &CC1Args) const override;
-
-  void AddClangSystemIncludeArgs(const llvm::opt::ArgList &DriverArgs,
-                                 llvm::opt::ArgStringList &CC1Args) const override;
-
-  void AddCXXStdlibLibArgs(const llvm::opt::ArgList &Args,
-                           llvm::opt::ArgStringList &CmdArgs) const override;
-
   void AddCCKextLibArgs(const llvm::opt::ArgList &Args,
                         llvm::opt::ArgStringList &CmdArgs) const override;
 
@@ -647,15 +672,16 @@ private:
                                StringRef Sanitizer,
                                bool shared = true) const;
 
+  void
+  AddGnuCPlusPlusIncludePaths(const llvm::opt::ArgList &DriverArgs,
+                              llvm::opt::ArgStringList &CC1Args) const override;
+
   bool AddGnuCPlusPlusIncludePaths(const llvm::opt::ArgList &DriverArgs,
                                    llvm::opt::ArgStringList &CC1Args,
                                    llvm::SmallString<128> Base,
                                    llvm::StringRef Version,
                                    llvm::StringRef ArchDir,
                                    llvm::StringRef BitDir) const;
-
-  llvm::SmallString<128>
-  GetEffectiveSysroot(const llvm::opt::ArgList &DriverArgs) const;
 };
 
 } // end namespace toolchains
diff --git a/clang/lib/Driver/ToolChains/Flang.cpp b/clang/lib/Driver/ToolChains/Flang.cpp
index 7034e5b..75b10e8 100644
--- a/clang/lib/Driver/ToolChains/Flang.cpp
+++ b/clang/lib/Driver/ToolChains/Flang.cpp
@@ -57,7 +57,8 @@ void Flang::addFortranDialectOptions(const ArgList &Args,
                             options::OPT_fno_automatic,
                             options::OPT_fhermetic_module_files,
                             options::OPT_frealloc_lhs,
-                            options::OPT_fno_realloc_lhs});
+                            options::OPT_fno_realloc_lhs,
+                            options::OPT_fsave_main_program});
 }
 
 void Flang::addPreprocessingOptions(const ArgList &Args,
diff --git a/clang/lib/Driver/ToolChains/Gnu.cpp b/clang/lib/Driver/ToolChains/Gnu.cpp
index 8397f11..e5db1b2 100644
--- a/clang/lib/Driver/ToolChains/Gnu.cpp
+++ b/clang/lib/Driver/ToolChains/Gnu.cpp
@@ -3058,7 +3058,8 @@ bool Generic_GCC::GCCInstallationDetector::ScanGentooGccConfig(
 Generic_GCC::Generic_GCC(const Driver &D, const llvm::Triple &Triple,
                          const ArgList &Args)
     : ToolChain(D, Triple, Args), GCCInstallation(D),
-      CudaInstallation(D, Triple, Args), RocmInstallation(D, Triple, Args) {
+      CudaInstallation(D, Triple, Args), RocmInstallation(D, Triple, Args),
+      SYCLInstallation(D, Triple, Args) {
   getProgramPaths().push_back(getDriver().Dir);
 }
 
@@ -3274,6 +3275,11 @@ void Generic_GCC::AddClangCXXStdlibIncludeArgs(const ArgList &DriverArgs,
   }
 }
 
+void Generic_GCC::addSYCLIncludeArgs(const ArgList &DriverArgs,
+                                     ArgStringList &CC1Args) const {
+  SYCLInstallation->addSYCLIncludeArgs(DriverArgs, CC1Args);
+}
+
 void
 Generic_GCC::addLibCxxIncludePaths(const llvm::opt::ArgList &DriverArgs,
                                    llvm::opt::ArgStringList &CC1Args) const {
diff --git a/clang/lib/Driver/ToolChains/Gnu.h b/clang/lib/Driver/ToolChains/Gnu.h
index 0b664a1..3b8df71 100644
--- a/clang/lib/Driver/ToolChains/Gnu.h
+++ b/clang/lib/Driver/ToolChains/Gnu.h
@@ -12,6 +12,7 @@
 #include "Cuda.h"
 #include "LazyDetector.h"
 #include "ROCm.h"
+#include "SYCL.h"
 #include "clang/Driver/Tool.h"
 #include "clang/Driver/ToolChain.h"
 #include <set>
@@ -288,6 +289,7 @@ protected:
   GCCInstallationDetector GCCInstallation;
   LazyDetector<CudaInstallationDetector> CudaInstallation;
   LazyDetector<RocmInstallationDetector> RocmInstallation;
+  LazyDetector<SYCLInstallationDetector> SYCLInstallation;
 
 public:
   Generic_GCC(const Driver &D, const llvm::Triple &Triple,
@@ -336,6 +338,9 @@ protected:
       const llvm::opt::ArgList &DriverArgs,
       llvm::opt::ArgStringList &CC1Args) const override;
 
+  void addSYCLIncludeArgs(const llvm::opt::ArgList &DriverArgs,
+                          llvm::opt::ArgStringList &CC1Args) const override;
+
   virtual void
   addLibCxxIncludePaths(const llvm::opt::ArgList &DriverArgs,
                         llvm::opt::ArgStringList &CC1Args) const;
diff --git a/clang/lib/Driver/ToolChains/Linux.cpp b/clang/lib/Driver/ToolChains/Linux.cpp
index c91b55b..1c56355 100644
--- a/clang/lib/Driver/ToolChains/Linux.cpp
+++ b/clang/lib/Driver/ToolChains/Linux.cpp
@@ -777,6 +777,11 @@ void Linux::AddIAMCUIncludeArgs(const ArgList &DriverArgs,
   }
 }
 
+void Linux::addSYCLIncludeArgs(const ArgList &DriverArgs,
+                               ArgStringList &CC1Args) const {
+  SYCLInstallation->addSYCLIncludeArgs(DriverArgs, CC1Args);
+}
+
 bool Linux::isPIEDefault(const llvm::opt::ArgList &Args) const {
   return CLANG_DEFAULT_PIE_ON_LINUX || getTriple().isAndroid() ||
          getTriple().isMusl() || getSanitizerArgs(Args).requiresPIE();
diff --git a/clang/lib/Driver/ToolChains/Linux.h b/clang/lib/Driver/ToolChains/Linux.h
index 2d9e674..2eb2d05 100644
--- a/clang/lib/Driver/ToolChains/Linux.h
+++ b/clang/lib/Driver/ToolChains/Linux.h
@@ -41,6 +41,8 @@ public:
                             llvm::opt::ArgStringList &CmdArgs) const override;
   void AddIAMCUIncludeArgs(const llvm::opt::ArgList &DriverArgs,
                            llvm::opt::ArgStringList &CC1Args) const override;
+  void addSYCLIncludeArgs(const llvm::opt::ArgList &DriverArgs,
+                          llvm::opt::ArgStringList &CC1Args) const override;
   RuntimeLibType GetDefaultRuntimeLibType() const override;
   unsigned GetDefaultDwarfVersion() const override;
   CXXStdlibType GetDefaultCXXStdlibType() const override;
diff --git a/clang/lib/Driver/ToolChains/MSVC.cpp b/clang/lib/Driver/ToolChains/MSVC.cpp
index 752c2e2..bae41fc 100644
--- a/clang/lib/Driver/ToolChains/MSVC.cpp
+++ b/clang/lib/Driver/ToolChains/MSVC.cpp
@@ -430,7 +430,7 @@ void visualstudio::Linker::ConstructJob(Compilation &C, const JobAction &JA,
 MSVCToolChain::MSVCToolChain(const Driver &D, const llvm::Triple &Triple,
                              const ArgList &Args)
     : ToolChain(D, Triple, Args), CudaInstallation(D, Triple, Args),
-      RocmInstallation(D, Triple, Args) {
+      RocmInstallation(D, Triple, Args), SYCLInstallation(D, Triple, Args) {
   getProgramPaths().push_back(getDriver().Dir);
 
   std::optional<llvm::StringRef> VCToolsDir, VCToolsVersion;
@@ -509,6 +509,11 @@ void MSVCToolChain::AddHIPIncludeArgs(const ArgList &DriverArgs,
   RocmInstallation->AddHIPIncludeArgs(DriverArgs, CC1Args);
 }
 
+void MSVCToolChain::addSYCLIncludeArgs(const ArgList &DriverArgs,
+                                       ArgStringList &CC1Args) const {
+  SYCLInstallation->addSYCLIncludeArgs(DriverArgs, CC1Args);
+}
+
 void MSVCToolChain::AddHIPRuntimeLibArgs(const ArgList &Args,
                                          ArgStringList &CmdArgs) const {
   CmdArgs.append({Args.MakeArgString(StringRef("-libpath:") +
diff --git a/clang/lib/Driver/ToolChains/MSVC.h b/clang/lib/Driver/ToolChains/MSVC.h
index 3950a8e..b35390c 100644
--- a/clang/lib/Driver/ToolChains/MSVC.h
+++ b/clang/lib/Driver/ToolChains/MSVC.h
@@ -12,6 +12,7 @@
 #include "AMDGPU.h"
 #include "Cuda.h"
 #include "LazyDetector.h"
+#include "SYCL.h"
 #include "clang/Driver/Compilation.h"
 #include "clang/Driver/Tool.h"
 #include "clang/Driver/ToolChain.h"
@@ -100,6 +101,9 @@ public:
   void AddHIPRuntimeLibArgs(const llvm::opt::ArgList &Args,
                             llvm::opt::ArgStringList &CmdArgs) const override;
 
+  void addSYCLIncludeArgs(const llvm::opt::ArgList &DriverArgs,
+                          llvm::opt::ArgStringList &CC1Args) const override;
+
   bool getWindowsSDKLibraryPath(
       const llvm::opt::ArgList &Args, std::string &path) const;
   bool getUniversalCRTLibraryPath(const llvm::opt::ArgList &Args,
@@ -138,6 +142,7 @@ private:
   llvm::ToolsetLayout VSLayout = llvm::ToolsetLayout::OlderVS;
   LazyDetector<CudaInstallationDetector> CudaInstallation;
   LazyDetector<RocmInstallationDetector> RocmInstallation;
+  LazyDetector<SYCLInstallationDetector> SYCLInstallation;
 };
 
 } // end namespace toolchains
diff --git a/clang/lib/Driver/ToolChains/MinGW.cpp b/clang/lib/Driver/ToolChains/MinGW.cpp
index 963de81..9f0c616 100644
--- a/clang/lib/Driver/ToolChains/MinGW.cpp
+++ b/clang/lib/Driver/ToolChains/MinGW.cpp
@@ -138,6 +138,9 @@ void tools::MinGW::Linker::ConstructJob(Compilation &C, const JobAction &JA,
     else
       CmdArgs.push_back("arm64pe");
     break;
+  case llvm::Triple::mipsel:
+    CmdArgs.push_back("mipspe");
+    break;
   default:
     D.Diag(diag::err_target_unknown_triple) << TC.getEffectiveTriple().str();
   }
diff --git a/clang/lib/Driver/ToolChains/SPIRV.h b/clang/lib/Driver/ToolChains/SPIRV.h
index d59a8c7..415f639 100644
--- a/clang/lib/Driver/ToolChains/SPIRV.h
+++ b/clang/lib/Driver/ToolChains/SPIRV.h
@@ -52,7 +52,7 @@ public:
 
 namespace toolchains {
 
-class LLVM_LIBRARY_VISIBILITY SPIRVToolChain final : public ToolChain {
+class LLVM_LIBRARY_VISIBILITY SPIRVToolChain : public ToolChain {
   mutable std::unique_ptr<Tool> Translator;
 
 public:
diff --git a/clang/lib/Driver/ToolChains/SPIRVOpenMP.cpp b/clang/lib/Driver/ToolChains/SPIRVOpenMP.cpp
new file mode 100644
index 0000000..1f27245
--- /dev/null
+++ b/clang/lib/Driver/ToolChains/SPIRVOpenMP.cpp
@@ -0,0 +1,34 @@
+//==- SPIRVOpenMP.cpp - SPIR-V OpenMP Tool Implementations --------*- C++ -*==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//==------------------------------------------------------------------------==//
+#include "SPIRVOpenMP.h"
+#include "CommonArgs.h"
+
+using namespace clang::driver;
+using namespace clang::driver::toolchains;
+using namespace clang::driver::tools;
+using namespace llvm::opt;
+
+namespace clang::driver::toolchains {
+SPIRVOpenMPToolChain::SPIRVOpenMPToolChain(const Driver &D,
+                                           const llvm::Triple &Triple,
+                                           const ToolChain &HostToolchain,
+                                           const ArgList &Args)
+    : SPIRVToolChain(D, Triple, Args), HostTC(HostToolchain) {}
+
+void SPIRVOpenMPToolChain::addClangTargetOptions(
+    const llvm::opt::ArgList &DriverArgs, llvm::opt::ArgStringList &CC1Args,
+    Action::OffloadKind DeviceOffloadingKind) const {
+
+  if (DeviceOffloadingKind != Action::OFK_OpenMP)
+    return;
+
+  if (DriverArgs.hasArg(options::OPT_nogpulib))
+    return;
+  addOpenMPDeviceRTL(getDriver(), DriverArgs, CC1Args, "", getTriple(), HostTC);
+}
+} // namespace clang::driver::toolchains
diff --git a/clang/lib/Driver/ToolChains/SPIRVOpenMP.h b/clang/lib/Driver/ToolChains/SPIRVOpenMP.h
new file mode 100644
index 0000000..64404e2
--- /dev/null
+++ b/clang/lib/Driver/ToolChains/SPIRVOpenMP.h
@@ -0,0 +1,29 @@
+//===--- SPIRVOpenMP.h - SPIR-V OpenMP Tool Implementations ------*- C++-*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CLANG_LIB_DRIVER_TOOLCHAINS_SPIRV_OPENMP_H
+#define LLVM_CLANG_LIB_DRIVER_TOOLCHAINS_SPIRV_OPENMP_H
+
+#include "SPIRV.h"
+#include "clang/Driver/Tool.h"
+#include "clang/Driver/ToolChain.h"
+
+namespace clang::driver::toolchains {
+class LLVM_LIBRARY_VISIBILITY SPIRVOpenMPToolChain : public SPIRVToolChain {
+public:
+  SPIRVOpenMPToolChain(const Driver &D, const llvm::Triple &Triple,
+                       const ToolChain &HostTC, const llvm::opt::ArgList &Args);
+
+  void addClangTargetOptions(
+      const llvm::opt::ArgList &DriverArgs, llvm::opt::ArgStringList &CC1Args,
+      Action::OffloadKind DeviceOffloadingKind) const override;
+
+  const ToolChain &HostTC;
+};
+} // namespace clang::driver::toolchains
+#endif
diff --git a/clang/lib/Driver/ToolChains/SYCL.cpp b/clang/lib/Driver/ToolChains/SYCL.cpp
new file mode 100644
index 0000000..a2b07ef
--- /dev/null
+++ b/clang/lib/Driver/ToolChains/SYCL.cpp
@@ -0,0 +1,154 @@
+//===--- SYCL.cpp - SYCL Tool and ToolChain Implementations -----*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+#include "SYCL.h"
+#include "CommonArgs.h"
+#include "llvm/Support/Path.h"
+
+using namespace clang::driver;
+using namespace clang::driver::toolchains;
+using namespace clang::driver::tools;
+using namespace clang;
+using namespace llvm::opt;
+
+SYCLInstallationDetector::SYCLInstallationDetector(
+    const Driver &D, const llvm::Triple &HostTriple,
+    const llvm::opt::ArgList &Args) {}
+
+void SYCLInstallationDetector::addSYCLIncludeArgs(
+    const ArgList &DriverArgs, ArgStringList &CC1Args) const {
+  if (DriverArgs.hasArg(clang::driver::options::OPT_nobuiltininc))
+    return;
+
+  // Add the SYCL header search locations in the specified order.
+  // FIXME: Add the header file locations once the SYCL library and headers
+  //        are properly established within the build.
+}
+
+// Unsupported options for SYCL device compilation.
+static ArrayRef<options::ID> getUnsupportedOpts() {
+  static constexpr options::ID UnsupportedOpts[] = {
+      options::OPT_fsanitize_EQ,      // -fsanitize
+      options::OPT_fcf_protection_EQ, // -fcf-protection
+      options::OPT_fprofile_generate,
+      options::OPT_fprofile_generate_EQ,
+      options::OPT_fno_profile_generate, // -f[no-]profile-generate
+      options::OPT_ftest_coverage,
+      options::OPT_fno_test_coverage, // -f[no-]test-coverage
+      options::OPT_fcoverage_mapping,
+      options::OPT_fno_coverage_mapping, // -f[no-]coverage-mapping
+      options::OPT_coverage,             // --coverage
+      options::OPT_fprofile_instr_generate,
+      options::OPT_fprofile_instr_generate_EQ,
+      options::OPT_fno_profile_instr_generate, // -f[no-]profile-instr-generate
+      options::OPT_fprofile_arcs,
+      options::OPT_fno_profile_arcs, // -f[no-]profile-arcs
+      options::OPT_fcreate_profile,  // -fcreate-profile
+      options::OPT_fprofile_instr_use,
+      options::OPT_fprofile_instr_use_EQ,       // -fprofile-instr-use
+      options::OPT_forder_file_instrumentation, // -forder-file-instrumentation
+      options::OPT_fcs_profile_generate,        // -fcs-profile-generate
+      options::OPT_fcs_profile_generate_EQ,
+  };
+  return UnsupportedOpts;
+}
+
+SYCLToolChain::SYCLToolChain(const Driver &D, const llvm::Triple &Triple,
+                             const ToolChain &HostTC, const ArgList &Args)
+    : ToolChain(D, Triple, Args), HostTC(HostTC),
+      SYCLInstallation(D, Triple, Args) {
+  // Lookup binaries into the driver directory, this is used to discover any
+  // dependent SYCL offload compilation tools.
+  getProgramPaths().push_back(getDriver().Dir);
+
+  // Diagnose unsupported options only once.
+  for (OptSpecifier Opt : getUnsupportedOpts()) {
+    if (const Arg *A = Args.getLastArg(Opt)) {
+      D.Diag(clang::diag::warn_drv_unsupported_option_for_target)
+          << A->getAsString(Args) << getTriple().str();
+    }
+  }
+}
+
+void SYCLToolChain::addClangTargetOptions(
+    const llvm::opt::ArgList &DriverArgs, llvm::opt::ArgStringList &CC1Args,
+    Action::OffloadKind DeviceOffloadingKind) const {
+  HostTC.addClangTargetOptions(DriverArgs, CC1Args, DeviceOffloadingKind);
+}
+
+llvm::opt::DerivedArgList *
+SYCLToolChain::TranslateArgs(const llvm::opt::DerivedArgList &Args,
+                             StringRef BoundArch,
+                             Action::OffloadKind DeviceOffloadKind) const {
+  DerivedArgList *DAL =
+      HostTC.TranslateArgs(Args, BoundArch, DeviceOffloadKind);
+
+  bool IsNewDAL = false;
+  if (!DAL) {
+    DAL = new DerivedArgList(Args.getBaseArgs());
+    IsNewDAL = true;
+  }
+
+  for (Arg *A : Args) {
+    // Filter out any options we do not want to pass along to the device
+    // compilation.
+    auto Opt(A->getOption());
+    bool Unsupported = false;
+    for (OptSpecifier UnsupportedOpt : getUnsupportedOpts()) {
+      if (Opt.matches(UnsupportedOpt)) {
+        if (Opt.getID() == options::OPT_fsanitize_EQ &&
+            A->getValues().size() == 1) {
+          std::string SanitizeVal = A->getValue();
+          if (SanitizeVal == "address") {
+            if (IsNewDAL)
+              DAL->append(A);
+            continue;
+          }
+        }
+        if (!IsNewDAL)
+          DAL->eraseArg(Opt.getID());
+        Unsupported = true;
+      }
+    }
+    if (Unsupported)
+      continue;
+    if (IsNewDAL)
+      DAL->append(A);
+  }
+
+  const OptTable &Opts = getDriver().getOpts();
+  if (!BoundArch.empty()) {
+    DAL->eraseArg(options::OPT_march_EQ);
+    DAL->AddJoinedArg(nullptr, Opts.getOption(options::OPT_march_EQ),
+                      BoundArch);
+  }
+  return DAL;
+}
+
+void SYCLToolChain::addClangWarningOptions(ArgStringList &CC1Args) const {
+  HostTC.addClangWarningOptions(CC1Args);
+}
+
+ToolChain::CXXStdlibType
+SYCLToolChain::GetCXXStdlibType(const ArgList &Args) const {
+  return HostTC.GetCXXStdlibType(Args);
+}
+
+void SYCLToolChain::addSYCLIncludeArgs(const ArgList &DriverArgs,
+                                       ArgStringList &CC1Args) const {
+  SYCLInstallation.addSYCLIncludeArgs(DriverArgs, CC1Args);
+}
+
+void SYCLToolChain::AddClangSystemIncludeArgs(const ArgList &DriverArgs,
+                                              ArgStringList &CC1Args) const {
+  HostTC.AddClangSystemIncludeArgs(DriverArgs, CC1Args);
+}
+
+void SYCLToolChain::AddClangCXXStdlibIncludeArgs(const ArgList &Args,
+                                                 ArgStringList &CC1Args) const {
+  HostTC.AddClangCXXStdlibIncludeArgs(Args, CC1Args);
+}
diff --git a/clang/lib/Driver/ToolChains/SYCL.h b/clang/lib/Driver/ToolChains/SYCL.h
new file mode 100644
index 0000000..2a8b4ec
--- /dev/null
+++ b/clang/lib/Driver/ToolChains/SYCL.h
@@ -0,0 +1,77 @@
+//===--- SYCL.h - SYCL ToolChain Implementations ----------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CLANG_LIB_DRIVER_TOOLCHAINS_SYCL_H
+#define LLVM_CLANG_LIB_DRIVER_TOOLCHAINS_SYCL_H
+
+#include "clang/Driver/Tool.h"
+#include "clang/Driver/ToolChain.h"
+
+namespace clang {
+namespace driver {
+
+class SYCLInstallationDetector {
+public:
+  SYCLInstallationDetector(const Driver &D, const llvm::Triple &HostTriple,
+                           const llvm::opt::ArgList &Args);
+
+  void addSYCLIncludeArgs(const llvm::opt::ArgList &DriverArgs,
+                          llvm::opt::ArgStringList &CC1Args) const;
+};
+
+namespace toolchains {
+
+class LLVM_LIBRARY_VISIBILITY SYCLToolChain : public ToolChain {
+public:
+  SYCLToolChain(const Driver &D, const llvm::Triple &Triple,
+                const ToolChain &HostTC, const llvm::opt::ArgList &Args);
+
+  const llvm::Triple *getAuxTriple() const override {
+    return &HostTC.getTriple();
+  }
+
+  llvm::opt::DerivedArgList *
+  TranslateArgs(const llvm::opt::DerivedArgList &Args, StringRef BoundArch,
+                Action::OffloadKind DeviceOffloadKind) const override;
+  void
+  addClangTargetOptions(const llvm::opt::ArgList &DriverArgs,
+                        llvm::opt::ArgStringList &CC1Args,
+                        Action::OffloadKind DeviceOffloadKind) const override;
+
+  bool useIntegratedAs() const override { return true; }
+  bool isPICDefault() const override { return false; }
+  llvm::codegenoptions::DebugInfoFormat getDefaultDebugFormat() const override {
+    return this->HostTC.getDefaultDebugFormat();
+  }
+  bool isPIEDefault(const llvm::opt::ArgList &Args) const override {
+    return false;
+  }
+  bool isPICDefaultForced() const override { return false; }
+
+  void addClangWarningOptions(llvm::opt::ArgStringList &CC1Args) const override;
+  CXXStdlibType GetCXXStdlibType(const llvm::opt::ArgList &Args) const override;
+  void addSYCLIncludeArgs(const llvm::opt::ArgList &DriverArgs,
+                          llvm::opt::ArgStringList &CC1Args) const override;
+  void
+  AddClangSystemIncludeArgs(const llvm::opt::ArgList &DriverArgs,
+                            llvm::opt::ArgStringList &CC1Args) const override;
+  void AddClangCXXStdlibIncludeArgs(
+      const llvm::opt::ArgList &Args,
+      llvm::opt::ArgStringList &CC1Args) const override;
+
+private:
+  const ToolChain &HostTC;
+  SYCLInstallationDetector SYCLInstallation;
+};
+
+} // end namespace toolchains
+
+} // end namespace driver
+} // end namespace clang
+
+#endif // LLVM_CLANG_LIB_DRIVER_TOOLCHAINS_SYCL_H
diff --git a/clang/lib/Format/AffectedRangeManager.cpp b/clang/lib/Format/AffectedRangeManager.cpp
index bf124d7..67108f3 100644
--- a/clang/lib/Format/AffectedRangeManager.cpp
+++ b/clang/lib/Format/AffectedRangeManager.cpp
@@ -21,8 +21,8 @@ namespace format {
 
 bool AffectedRangeManager::computeAffectedLines(
     SmallVectorImpl<AnnotatedLine *> &Lines) {
-  SmallVectorImpl<AnnotatedLine *>::iterator I = Lines.begin();
-  SmallVectorImpl<AnnotatedLine *>::iterator E = Lines.end();
+  ArrayRef<AnnotatedLine *>::iterator I = Lines.begin();
+  ArrayRef<AnnotatedLine *>::iterator E = Lines.end();
   bool SomeLineAffected = false;
   const AnnotatedLine *PreviousLine = nullptr;
   while (I != E) {
@@ -34,7 +34,7 @@ bool AffectedRangeManager::computeAffectedLines(
     // if any token within the directive is affected.
     if (Line->InPPDirective) {
       FormatToken *Last = Line->Last;
-      SmallVectorImpl<AnnotatedLine *>::iterator PPEnd = I + 1;
+      const auto *PPEnd = I + 1;
       while (PPEnd != E && !(*PPEnd)->First->HasUnescapedNewline) {
         Last = (*PPEnd)->Last;
         ++PPEnd;
@@ -89,8 +89,8 @@ bool AffectedRangeManager::affectsLeadingEmptyLines(const FormatToken &Tok) {
 }
 
 void AffectedRangeManager::markAllAsAffected(
-    SmallVectorImpl<AnnotatedLine *>::iterator I,
-    SmallVectorImpl<AnnotatedLine *>::iterator E) {
+    ArrayRef<AnnotatedLine *>::iterator I,
+    ArrayRef<AnnotatedLine *>::iterator E) {
   while (I != E) {
     (*I)->Affected = true;
     markAllAsAffected((*I)->Children.begin(), (*I)->Children.end());
diff --git a/clang/lib/Format/AffectedRangeManager.h b/clang/lib/Format/AffectedRangeManager.h
index add16bd..eef056f 100644
--- a/clang/lib/Format/AffectedRangeManager.h
+++ b/clang/lib/Format/AffectedRangeManager.h
@@ -47,8 +47,8 @@ private:
   bool affectsLeadingEmptyLines(const FormatToken &Tok);
 
   // Marks all lines between I and E as well as all their children as affected.
-  void markAllAsAffected(SmallVectorImpl<AnnotatedLine *>::iterator I,
-                         SmallVectorImpl<AnnotatedLine *>::iterator E);
+  void markAllAsAffected(ArrayRef<AnnotatedLine *>::iterator I,
+                         ArrayRef<AnnotatedLine *>::iterator E);
 
   // Determines whether 'Line' is affected by the SourceRanges given as input.
   // Returns \c true if line or one if its children is affected.
diff --git a/clang/lib/Format/Format.cpp b/clang/lib/Format/Format.cpp
index dcaac4b..fc60c5e 100644
--- a/clang/lib/Format/Format.cpp
+++ b/clang/lib/Format/Format.cpp
@@ -839,6 +839,18 @@ template <> struct ScalarEnumerationTraits<FormatStyle::UseTabStyle> {
   }
 };
 
+template <>
+struct ScalarEnumerationTraits<
+    FormatStyle::WrapNamespaceBodyWithEmptyLinesStyle> {
+  static void
+  enumeration(IO &IO,
+              FormatStyle::WrapNamespaceBodyWithEmptyLinesStyle &Value) {
+    IO.enumCase(Value, "Never", FormatStyle::WNBWELS_Never);
+    IO.enumCase(Value, "Always", FormatStyle::WNBWELS_Always);
+    IO.enumCase(Value, "Leave", FormatStyle::WNBWELS_Leave);
+  }
+};
+
 template <> struct MappingTraits<FormatStyle> {
   static void mapping(IO &IO, FormatStyle &Style) {
     // When reading, read the language first, we need it for getPredefinedStyle.
@@ -975,6 +987,8 @@ template <> struct MappingTraits<FormatStyle> {
                    Style.AllowShortLambdasOnASingleLine);
     IO.mapOptional("AllowShortLoopsOnASingleLine",
                    Style.AllowShortLoopsOnASingleLine);
+    IO.mapOptional("AllowShortNamespacesOnASingleLine",
+                   Style.AllowShortNamespacesOnASingleLine);
     IO.mapOptional("AlwaysBreakAfterDefinitionReturnType",
                    Style.AlwaysBreakAfterDefinitionReturnType);
     IO.mapOptional("AlwaysBreakBeforeMultilineStrings",
@@ -1164,10 +1178,13 @@ template <> struct MappingTraits<FormatStyle> {
     IO.mapOptional("TypeNames", Style.TypeNames);
     IO.mapOptional("TypenameMacros", Style.TypenameMacros);
     IO.mapOptional("UseTab", Style.UseTab);
+    IO.mapOptional("VariableTemplates", Style.VariableTemplates);
     IO.mapOptional("VerilogBreakBetweenInstancePorts",
                    Style.VerilogBreakBetweenInstancePorts);
     IO.mapOptional("WhitespaceSensitiveMacros",
                    Style.WhitespaceSensitiveMacros);
+    IO.mapOptional("WrapNamespaceBodyWithEmptyLines",
+                   Style.WrapNamespaceBodyWithEmptyLines);
 
     // If AlwaysBreakAfterDefinitionReturnType was specified but
     // BreakAfterReturnType was not, initialize the latter from the former for
@@ -1480,6 +1497,7 @@ FormatStyle getLLVMStyle(FormatStyle::LanguageKind Language) {
   LLVMStyle.AllowShortIfStatementsOnASingleLine = FormatStyle::SIS_Never;
   LLVMStyle.AllowShortLambdasOnASingleLine = FormatStyle::SLS_All;
   LLVMStyle.AllowShortLoopsOnASingleLine = false;
+  LLVMStyle.AllowShortNamespacesOnASingleLine = false;
   LLVMStyle.AlwaysBreakAfterDefinitionReturnType = FormatStyle::DRTBS_None;
   LLVMStyle.AlwaysBreakBeforeMultilineStrings = false;
   LLVMStyle.AttributeMacros.push_back("__capability");
@@ -1635,6 +1653,7 @@ FormatStyle getLLVMStyle(FormatStyle::LanguageKind Language) {
   LLVMStyle.WhitespaceSensitiveMacros.push_back("NS_SWIFT_NAME");
   LLVMStyle.WhitespaceSensitiveMacros.push_back("PP_STRINGIZE");
   LLVMStyle.WhitespaceSensitiveMacros.push_back("STRINGIZE");
+  LLVMStyle.WrapNamespaceBodyWithEmptyLines = FormatStyle::WNBWELS_Leave;
 
   LLVMStyle.PenaltyBreakAssignment = prec::Assignment;
   LLVMStyle.PenaltyBreakBeforeFirstCallParameter = 19;
@@ -3066,8 +3085,8 @@ static bool affectsRange(ArrayRef<tooling::Range> Ranges, unsigned Start,
 // its current line.
 // If `Cursor` is not on any #include, `Index` will be UINT_MAX.
 static std::pair<unsigned, unsigned>
-FindCursorIndex(const SmallVectorImpl<IncludeDirective> &Includes,
-                const SmallVectorImpl<unsigned> &Indices, unsigned Cursor) {
+FindCursorIndex(const ArrayRef<IncludeDirective> &Includes,
+                const ArrayRef<unsigned> &Indices, unsigned Cursor) {
   unsigned CursorIndex = UINT_MAX;
   unsigned OffsetToEOL = 0;
   for (int i = 0, e = Includes.size(); i != e; ++i) {
@@ -3116,7 +3135,7 @@ std::string replaceCRLF(const std::string &Code) {
 // provided and put on a deleted #include, it will be moved to the remaining
 // #include in the duplicate #includes.
 static void sortCppIncludes(const FormatStyle &Style,
-                            const SmallVectorImpl<IncludeDirective> &Includes,
+                            const ArrayRef<IncludeDirective> &Includes,
                             ArrayRef<tooling::Range> Ranges, StringRef FileName,
                             StringRef Code, tooling::Replacements &Replaces,
                             unsigned *Cursor) {
@@ -3246,8 +3265,15 @@ tooling::Replacements sortCppIncludes(const FormatStyle &Style, StringRef Code,
   SmallVector<StringRef, 2> RawStringMatches;
   std::string RawStringTermination = ")\"";
 
-  for (;;) {
-    auto Pos = Code.find('\n', SearchFrom);
+  for (const auto Size = Code.size(); SearchFrom < Size;) {
+    size_t Pos = SearchFrom;
+    if (Code[SearchFrom] != '\n') {
+      do { // Search for the first newline while skipping line splices.
+        ++Pos;
+        Pos = Code.find('\n', Pos);
+      } while (Pos != StringRef::npos && Code[Pos - 1] == '\\');
+    }
+
     StringRef Line =
         Code.substr(Prev, (Pos != StringRef::npos ? Pos : Code.size()) - Prev);
 
@@ -3352,7 +3378,7 @@ static unsigned findJavaImportGroup(const FormatStyle &Style,
 // import group, a newline is inserted, and within each import group, a
 // lexicographic sort based on ASCII value is performed.
 static void sortJavaImports(const FormatStyle &Style,
-                            const SmallVectorImpl<JavaImportDirective> &Imports,
+                            const ArrayRef<JavaImportDirective> &Imports,
                             ArrayRef<tooling::Range> Ranges, StringRef FileName,
                             StringRef Code, tooling::Replacements &Replaces) {
   unsigned ImportsBeginOffset = Imports.front().Offset;
diff --git a/clang/lib/Format/FormatToken.h b/clang/lib/Format/FormatToken.h
index f6bb860..d97b652 100644
--- a/clang/lib/Format/FormatToken.h
+++ b/clang/lib/Format/FormatToken.h
@@ -25,6 +25,7 @@ namespace clang {
 namespace format {
 
 #define LIST_TOKEN_TYPES                                                       \
+  TYPE(AfterPPDirective)                                                       \
   TYPE(ArrayInitializerLSquare)                                                \
   TYPE(ArraySubscriptLSquare)                                                  \
   TYPE(AttributeColon)                                                         \
@@ -44,6 +45,7 @@ namespace format {
   TYPE(CastRParen)                                                             \
   TYPE(ClassLBrace)                                                            \
   TYPE(ClassRBrace)                                                            \
+  TYPE(CompoundRequirementLBrace)                                              \
   /* ternary ?: expression */                                                  \
   TYPE(ConditionalExpr)                                                        \
   /* the condition in an if statement */                                       \
@@ -186,6 +188,7 @@ namespace format {
   TYPE(UnionLBrace)                                                            \
   TYPE(UnionRBrace)                                                            \
   TYPE(UntouchableMacroFunc)                                                   \
+  TYPE(VariableTemplate)                                                       \
   /* Like in 'assign x = 0, y = 1;' . */                                       \
   TYPE(VerilogAssignComma)                                                     \
   /* like in begin : block */                                                  \
diff --git a/clang/lib/Format/FormatTokenLexer.cpp b/clang/lib/Format/FormatTokenLexer.cpp
index 7a264bd..a1d7eea 100644
--- a/clang/lib/Format/FormatTokenLexer.cpp
+++ b/clang/lib/Format/FormatTokenLexer.cpp
@@ -76,6 +76,8 @@ FormatTokenLexer::FormatTokenLexer(
     TemplateNames.insert(&IdentTable.get(TemplateName));
   for (const auto &TypeName : Style.TypeNames)
     TypeNames.insert(&IdentTable.get(TypeName));
+  for (const auto &VariableTemplate : Style.VariableTemplates)
+    VariableTemplates.insert(&IdentTable.get(VariableTemplate));
 }
 
 ArrayRef<FormatToken *> FormatTokenLexer::lex() {
@@ -562,8 +564,7 @@ bool FormatTokenLexer::tryMergeTokens(ArrayRef<tok::TokenKind> Kinds,
   if (Tokens.size() < Kinds.size())
     return false;
 
-  SmallVectorImpl<FormatToken *>::const_iterator First =
-      Tokens.end() - Kinds.size();
+  const auto *First = Tokens.end() - Kinds.size();
   for (unsigned i = 0; i < Kinds.size(); ++i)
     if (First[i]->isNot(Kinds[i]))
       return false;
@@ -575,7 +576,7 @@ bool FormatTokenLexer::tryMergeTokens(size_t Count, TokenType NewType) {
   if (Tokens.size() < Count)
     return false;
 
-  SmallVectorImpl<FormatToken *>::const_iterator First = Tokens.end() - Count;
+  const auto *First = Tokens.end() - Count;
   unsigned AddLength = 0;
   for (size_t i = 1; i < Count; ++i) {
     // If there is whitespace separating the token and the previous one,
@@ -1382,6 +1383,8 @@ FormatToken *FormatTokenLexer::getNextToken() {
         FormatTok->setFinalizedType(TT_TemplateName);
       else if (TypeNames.contains(Identifier))
         FormatTok->setFinalizedType(TT_TypeName);
+      else if (VariableTemplates.contains(Identifier))
+        FormatTok->setFinalizedType(TT_VariableTemplate);
     }
   }
 
diff --git a/clang/lib/Format/FormatTokenLexer.h b/clang/lib/Format/FormatTokenLexer.h
index 71389d2..61474a3 100644
--- a/clang/lib/Format/FormatTokenLexer.h
+++ b/clang/lib/Format/FormatTokenLexer.h
@@ -129,7 +129,8 @@ private:
 
   llvm::SmallMapVector<IdentifierInfo *, TokenType, 8> Macros;
 
-  llvm::SmallPtrSet<IdentifierInfo *, 8> TemplateNames, TypeNames;
+  llvm::SmallPtrSet<IdentifierInfo *, 8> TemplateNames, TypeNames,
+      VariableTemplates;
 
   bool FormattingDisabled;
 
diff --git a/clang/lib/Format/MatchFilePath.cpp b/clang/lib/Format/MatchFilePath.cpp
index 062b334d..1f1e4bf 100644
--- a/clang/lib/Format/MatchFilePath.cpp
+++ b/clang/lib/Format/MatchFilePath.cpp
@@ -25,9 +25,11 @@ bool matchFilePath(StringRef Pattern, StringRef FilePath) {
   assert(!Pattern.empty());
   assert(!FilePath.empty());
 
+  const auto FilePathBack = FilePath.back();
+
   // No match if `Pattern` ends with a non-meta character not equal to the last
   // character of `FilePath`.
-  if (const auto C = Pattern.back(); !strchr("?*]", C) && C != FilePath.back())
+  if (const auto C = Pattern.back(); !strchr("?*]", C) && C != FilePathBack)
     return false;
 
   constexpr auto Separator = '/';
@@ -49,25 +51,37 @@ bool matchFilePath(StringRef Pattern, StringRef FilePath) {
         return false;
       break;
     case '*': {
-      while (++I < EOP && Pattern[I] == '*') { // Skip consecutive stars.
+      bool Globstar = I == 0 || Pattern[I - 1] == Separator;
+      int StarCount = 1;
+      for (; ++I < EOP && Pattern[I] == '*'; ++StarCount) {
+        // Skip consecutive stars.
       }
+      if (StarCount != 2)
+        Globstar = false;
       const auto K = FilePath.find(Separator, J); // Index of next `Separator`.
       const bool NoMoreSeparatorsInFilePath = K == StringRef::npos;
       if (I == EOP) // `Pattern` ends with a star.
-        return NoMoreSeparatorsInFilePath;
-      // `Pattern` ends with a lone backslash.
-      if (Pattern[I] == '\\' && ++I == EOP)
-        return false;
+        return Globstar || NoMoreSeparatorsInFilePath;
+      if (Pattern[I] != Separator) {
+        // `Pattern` ends with a lone backslash.
+        if (Pattern[I] == '\\' && ++I == EOP)
+          return false;
+        Globstar = false;
+      }
       // The star is followed by a (possibly escaped) `Separator`.
       if (Pattern[I] == Separator) {
-        if (NoMoreSeparatorsInFilePath)
-          return false;
-        J = K; // Skip to next `Separator` in `FilePath`.
-        break;
+        if (!Globstar) {
+          if (NoMoreSeparatorsInFilePath)
+            return false;
+          J = K; // Skip to next `Separator` in `FilePath`.
+          break;
+        }
+        if (++I == EOP)
+          return FilePathBack == Separator;
       }
       // Recurse.
-      for (auto Pat = Pattern.substr(I); J < End && FilePath[J] != Separator;
-           ++J) {
+      for (auto Pat = Pattern.substr(I);
+           J < End && (Globstar || FilePath[J] != Separator); ++J) {
         if (matchFilePath(Pat, FilePath.substr(J)))
           return true;
       }
diff --git a/clang/lib/Format/TokenAnnotator.cpp b/clang/lib/Format/TokenAnnotator.cpp
index f2cfa7f..bf5ee28 100644
--- a/clang/lib/Format/TokenAnnotator.cpp
+++ b/clang/lib/Format/TokenAnnotator.cpp
@@ -137,12 +137,12 @@ public:
 private:
   ScopeType getScopeType(const FormatToken &Token) const {
     switch (Token.getType()) {
-    case TT_LambdaLBrace:
-      return ST_ChildBlock;
     case TT_ClassLBrace:
     case TT_StructLBrace:
     case TT_UnionLBrace:
       return ST_Class;
+    case TT_CompoundRequirementLBrace:
+      return ST_CompoundRequirement;
     default:
       return ST_Other;
     }
@@ -1580,7 +1580,10 @@ private:
         return false;
       break;
     case tok::l_brace:
-      if (Style.Language == FormatStyle::LK_TextProto) {
+      if (IsCpp) {
+        if (Tok->is(TT_RequiresExpressionLBrace))
+          Line.Type = LT_RequiresExpression;
+      } else if (Style.Language == FormatStyle::LK_TextProto) {
         FormatToken *Previous = Tok->getPreviousNonComment();
         if (Previous && Previous->isNot(TT_DictLiteral))
           Previous->setType(TT_SelectorName);
@@ -2022,8 +2025,11 @@ public:
       if (!consumeToken())
         return LT_Invalid;
     }
-    if (Line.Type == LT_AccessModifier)
-      return LT_AccessModifier;
+    if (const auto Type = Line.Type; Type == LT_AccessModifier ||
+                                     Type == LT_RequiresExpression ||
+                                     Type == LT_SimpleRequirement) {
+      return Type;
+    }
     if (KeywordVirtualFound)
       return LT_VirtualFunctionDecl;
     if (ImportStatement)
@@ -2076,7 +2082,7 @@ private:
             TT_RecordLBrace, TT_StructLBrace, TT_UnionLBrace, TT_RequiresClause,
             TT_RequiresClauseInARequiresExpression, TT_RequiresExpression,
             TT_RequiresExpressionLParen, TT_RequiresExpressionLBrace,
-            TT_BracedListLBrace)) {
+            TT_CompoundRequirementLBrace, TT_BracedListLBrace)) {
       CurrentToken->setType(TT_Unknown);
     }
     CurrentToken->Role.reset();
@@ -2792,6 +2798,16 @@ private:
       return true;
     }
 
+    auto IsNonVariableTemplate = [](const FormatToken &Tok) {
+      if (Tok.isNot(TT_TemplateCloser))
+        return false;
+      const auto *Less = Tok.MatchingParen;
+      if (!Less)
+        return false;
+      const auto *BeforeLess = Less->getPreviousNonComment();
+      return BeforeLess && BeforeLess->isNot(TT_VariableTemplate);
+    };
+
     // Heuristically try to determine whether the parentheses contain a type.
     auto IsQualifiedPointerOrReference = [](const FormatToken *T,
                                             const LangOptions &LangOpts) {
@@ -2825,10 +2841,11 @@ private:
       }
       return T && T->is(TT_PointerOrReference);
     };
-    bool ParensAreType =
-        BeforeRParen->isOneOf(TT_TemplateCloser, TT_TypeDeclarationParen) ||
-        BeforeRParen->isTypeName(LangOpts) ||
-        IsQualifiedPointerOrReference(BeforeRParen, LangOpts);
+
+    bool ParensAreType = IsNonVariableTemplate(*BeforeRParen) ||
+                         BeforeRParen->is(TT_TypeDeclarationParen) ||
+                         BeforeRParen->isTypeName(LangOpts) ||
+                         IsQualifiedPointerOrReference(BeforeRParen, LangOpts);
     bool ParensCouldEndDecl =
         AfterRParen->isOneOf(tok::equal, tok::semi, tok::l_brace, tok::greater);
     if (ParensAreType && !ParensCouldEndDecl)
@@ -3089,6 +3106,11 @@ private:
       }
     }
 
+    if (Line.Type == LT_SimpleRequirement ||
+        (!Scopes.empty() && Scopes.back() == ST_CompoundRequirement)) {
+      return TT_BinaryOperator;
+    }
+
     return TT_PointerOrReference;
   }
 
@@ -3371,13 +3393,13 @@ private:
   /// Parse unary operator expressions and surround them with fake
   /// parentheses if appropriate.
   void parseUnaryOperator() {
-    llvm::SmallVector<FormatToken *, 2> Tokens;
+    SmallVector<FormatToken *, 2> Tokens;
     while (Current && Current->is(TT_UnaryOperator)) {
       Tokens.push_back(Current);
       next();
     }
     parse(PrecedenceArrowAndPeriod);
-    for (FormatToken *Token : llvm::reverse(Tokens)) {
+    for (FormatToken *Token : reverse(Tokens)) {
       // The actual precedence doesn't matter.
       addFakeParenthesis(Token, prec::Unknown);
     }
@@ -3555,7 +3577,7 @@ private:
 void TokenAnnotator::setCommentLineLevels(
     SmallVectorImpl<AnnotatedLine *> &Lines) const {
   const AnnotatedLine *NextNonCommentLine = nullptr;
-  for (AnnotatedLine *Line : llvm::reverse(Lines)) {
+  for (AnnotatedLine *Line : reverse(Lines)) {
     assert(Line->First);
 
     // If the comment is currently aligned with the line immediately following
@@ -3676,9 +3698,16 @@ void TokenAnnotator::annotate(AnnotatedLine &Line) {
   Line.Type = Parser.parseLine();
 
   if (!Line.Children.empty()) {
-    ScopeStack.push_back(ST_ChildBlock);
-    for (auto &Child : Line.Children)
+    ScopeStack.push_back(ST_Other);
+    const bool InRequiresExpression = Line.Type == LT_RequiresExpression;
+    for (auto &Child : Line.Children) {
+      if (InRequiresExpression &&
+          !Child->First->isOneOf(tok::kw_typename, tok::kw_requires,
+                                 TT_CompoundRequirementLBrace)) {
+        Child->Type = LT_SimpleRequirement;
+      }
       annotate(*Child);
+    }
     // ScopeStack can become empty if Child has an unmatched `}`.
     if (!ScopeStack.empty())
       ScopeStack.pop_back();
@@ -4930,6 +4959,10 @@ bool TokenAnnotator::spaceRequiredBefore(const AnnotatedLine &Line,
         Right.is(TT_ModulePartitionColon)) {
       return true;
     }
+
+    if (Right.is(TT_AfterPPDirective))
+      return true;
+
     // No space between import foo:bar but keep a space between import :bar;
     if (Left.is(tok::identifier) && Right.is(TT_ModulePartitionColon))
       return false;
diff --git a/clang/lib/Format/TokenAnnotator.h b/clang/lib/Format/TokenAnnotator.h
index 9117ca3..16e920e 100644
--- a/clang/lib/Format/TokenAnnotator.h
+++ b/clang/lib/Format/TokenAnnotator.h
@@ -33,14 +33,16 @@ enum LineType {
   LT_VirtualFunctionDecl,
   LT_ArrayOfStructInitializer,
   LT_CommentAbovePPDirective,
+  LT_RequiresExpression,
+  LT_SimpleRequirement,
 };
 
 enum ScopeType {
-  // Contained in child block.
-  ST_ChildBlock,
   // Contained in class declaration/definition.
   ST_Class,
-  // Contained within other scope block (function, loop, if/else, etc).
+  // Contained in compound requirement.
+  ST_CompoundRequirement,
+  // Contained in other blocks (function, lambda, loop, if/else, child, etc).
   ST_Other,
 };
 
diff --git a/clang/lib/Format/UnwrappedLineFormatter.cpp b/clang/lib/Format/UnwrappedLineFormatter.cpp
index 1804c14..ec65fea 100644
--- a/clang/lib/Format/UnwrappedLineFormatter.cpp
+++ b/clang/lib/Format/UnwrappedLineFormatter.cpp
@@ -183,9 +183,9 @@ private:
   unsigned Indent = 0;
 };
 
-const FormatToken *getMatchingNamespaceToken(
-    const AnnotatedLine *Line,
-    const SmallVectorImpl<AnnotatedLine *> &AnnotatedLines) {
+const FormatToken *
+getMatchingNamespaceToken(const AnnotatedLine *Line,
+                          const ArrayRef<AnnotatedLine *> &AnnotatedLines) {
   if (!Line->startsWith(tok::r_brace))
     return nullptr;
   size_t StartLineIndex = Line->MatchingOpeningBlockLineIndex;
@@ -200,9 +200,9 @@ StringRef getNamespaceTokenText(const AnnotatedLine *Line) {
   return NamespaceToken ? NamespaceToken->TokenText : StringRef();
 }
 
-StringRef getMatchingNamespaceTokenText(
-    const AnnotatedLine *Line,
-    const SmallVectorImpl<AnnotatedLine *> &AnnotatedLines) {
+StringRef
+getMatchingNamespaceTokenText(const AnnotatedLine *Line,
+                              const ArrayRef<AnnotatedLine *> &AnnotatedLines) {
   const FormatToken *NamespaceToken =
       getMatchingNamespaceToken(Line, AnnotatedLines);
   return NamespaceToken ? NamespaceToken->TokenText : StringRef();
@@ -241,8 +241,8 @@ private:
   /// Calculates how many lines can be merged into 1 starting at \p I.
   unsigned
   tryFitMultipleLinesInOne(LevelIndentTracker &IndentTracker,
-                           SmallVectorImpl<AnnotatedLine *>::const_iterator I,
-                           SmallVectorImpl<AnnotatedLine *>::const_iterator E) {
+                           ArrayRef<AnnotatedLine *>::const_iterator I,
+                           ArrayRef<AnnotatedLine *>::const_iterator E) {
     const unsigned Indent = IndentTracker.getIndent();
 
     // Can't join the last line with anything.
@@ -361,9 +361,18 @@ private:
     const auto *FirstNonComment = TheLine->getFirstNonComment();
     if (!FirstNonComment)
       return 0;
+
     // FIXME: There are probably cases where we should use FirstNonComment
     // instead of TheLine->First.
 
+    if (Style.AllowShortNamespacesOnASingleLine &&
+        TheLine->First->is(tok::kw_namespace) &&
+        TheLine->Last->is(tok::l_brace)) {
+      const auto result = tryMergeNamespace(I, E, Limit);
+      if (result > 0)
+        return result;
+    }
+
     if (Style.CompactNamespaces) {
       if (const auto *NSToken = TheLine->First->getNamespaceToken()) {
         int J = 1;
@@ -373,7 +382,7 @@ private:
              ClosingLineIndex == I[J]->MatchingClosingBlockLineIndex &&
              I[J]->Last->TotalLength < Limit;
              ++J, --ClosingLineIndex) {
-          Limit -= I[J]->Last->TotalLength;
+          Limit -= I[J]->Last->TotalLength + 1;
 
           // Reduce indent level for bodies of namespaces which were compacted,
           // but only if their content was indented in the first place.
@@ -420,6 +429,7 @@ private:
         TheLine->First != LastNonComment) {
       return MergeShortFunctions ? tryMergeSimpleBlock(I, E, Limit) : 0;
     }
+
     // Try to merge a control statement block with left brace unwrapped.
     if (TheLine->Last->is(tok::l_brace) && FirstNonComment != TheLine->Last &&
         FirstNonComment->isOneOf(tok::kw_if, tok::kw_while, tok::kw_for,
@@ -525,7 +535,7 @@ private:
       // Try to merge records.
       if (TheLine->Last->is(TT_EnumLBrace)) {
         ShouldMerge = Style.AllowShortEnumsOnASingleLine;
-      } else if (TheLine->Last->is(TT_RequiresExpressionLBrace)) {
+      } else if (TheLine->Last->is(TT_CompoundRequirementLBrace)) {
         ShouldMerge = Style.AllowShortCompoundRequirementOnASingleLine;
       } else if (TheLine->Last->isOneOf(TT_ClassLBrace, TT_StructLBrace)) {
         // NOTE: We use AfterClass (whereas AfterStruct exists) for both classes
@@ -604,8 +614,8 @@ private:
   }
 
   unsigned
-  tryMergeSimplePPDirective(SmallVectorImpl<AnnotatedLine *>::const_iterator I,
-                            SmallVectorImpl<AnnotatedLine *>::const_iterator E,
+  tryMergeSimplePPDirective(ArrayRef<AnnotatedLine *>::const_iterator I,
+                            ArrayRef<AnnotatedLine *>::const_iterator E,
                             unsigned Limit) {
     if (Limit == 0)
       return 0;
@@ -616,9 +626,76 @@ private:
     return 1;
   }
 
-  unsigned tryMergeSimpleControlStatement(
-      SmallVectorImpl<AnnotatedLine *>::const_iterator I,
-      SmallVectorImpl<AnnotatedLine *>::const_iterator E, unsigned Limit) {
+  unsigned tryMergeNamespace(ArrayRef<AnnotatedLine *>::const_iterator I,
+                             ArrayRef<AnnotatedLine *>::const_iterator E,
+                             unsigned Limit) {
+    if (Limit == 0)
+      return 0;
+
+    assert(I[1]);
+    const auto &L1 = *I[1];
+    if (L1.InPPDirective != (*I)->InPPDirective ||
+        (L1.InPPDirective && L1.First->HasUnescapedNewline)) {
+      return 0;
+    }
+
+    if (std::distance(I, E) <= 2)
+      return 0;
+
+    assert(I[2]);
+    const auto &L2 = *I[2];
+    if (L2.Type == LT_Invalid)
+      return 0;
+
+    Limit = limitConsideringMacros(I + 1, E, Limit);
+
+    if (!nextTwoLinesFitInto(I, Limit))
+      return 0;
+
+    // Check if it's a namespace inside a namespace, and call recursively if so.
+    // '3' is the sizes of the whitespace and closing brace for " _inner_ }".
+    if (L1.First->is(tok::kw_namespace)) {
+      if (L1.Last->is(tok::comment) || !Style.CompactNamespaces)
+        return 0;
+
+      assert(Limit >= L1.Last->TotalLength + 3);
+      const auto InnerLimit = Limit - L1.Last->TotalLength - 3;
+      const auto MergedLines = tryMergeNamespace(I + 1, E, InnerLimit);
+      if (MergedLines == 0)
+        return 0;
+      const auto N = MergedLines + 2;
+      // Check if there is even a line after the inner result.
+      if (std::distance(I, E) <= N)
+        return 0;
+      // Check that the line after the inner result starts with a closing brace
+      // which we are permitted to merge into one line.
+      if (I[N]->First->is(tok::r_brace) && !I[N]->First->MustBreakBefore &&
+          I[MergedLines + 1]->Last->isNot(tok::comment) &&
+          nextNLinesFitInto(I, I + N + 1, Limit)) {
+        return N;
+      }
+      return 0;
+    }
+
+    // There's no inner namespace, so we are considering to merge at most one
+    // line.
+
+    // The line which is in the namespace should end with semicolon.
+    if (L1.Last->isNot(tok::semi))
+      return 0;
+
+    // Last, check that the third line starts with a closing brace.
+    if (L2.First->isNot(tok::r_brace) || L2.First->MustBreakBefore)
+      return 0;
+
+    // If so, merge all three lines.
+    return 2;
+  }
+
+  unsigned
+  tryMergeSimpleControlStatement(ArrayRef<AnnotatedLine *>::const_iterator I,
+                                 ArrayRef<AnnotatedLine *>::const_iterator E,
+                                 unsigned Limit) {
     if (Limit == 0)
       return 0;
     if (Style.BraceWrapping.AfterControlStatement ==
@@ -658,10 +735,9 @@ private:
     return 1;
   }
 
-  unsigned
-  tryMergeShortCaseLabels(SmallVectorImpl<AnnotatedLine *>::const_iterator I,
-                          SmallVectorImpl<AnnotatedLine *>::const_iterator E,
-                          unsigned Limit) {
+  unsigned tryMergeShortCaseLabels(ArrayRef<AnnotatedLine *>::const_iterator I,
+                                   ArrayRef<AnnotatedLine *>::const_iterator E,
+                                   unsigned Limit) {
     if (Limit == 0 || I + 1 == E ||
         I[1]->First->isOneOf(tok::kw_case, tok::kw_default)) {
       return 0;
@@ -692,7 +768,7 @@ private:
       if (Line->First->is(tok::comment)) {
         if (Level != Line->Level)
           return 0;
-        SmallVectorImpl<AnnotatedLine *>::const_iterator J = I + 2 + NumStmts;
+        const auto *J = I + 2 + NumStmts;
         for (; J != E; ++J) {
           Line = *J;
           if (Line->InPPDirective != InPPDirective)
@@ -713,10 +789,9 @@ private:
     return NumStmts;
   }
 
-  unsigned
-  tryMergeSimpleBlock(SmallVectorImpl<AnnotatedLine *>::const_iterator I,
-                      SmallVectorImpl<AnnotatedLine *>::const_iterator E,
-                      unsigned Limit) {
+  unsigned tryMergeSimpleBlock(ArrayRef<AnnotatedLine *>::const_iterator I,
+                               ArrayRef<AnnotatedLine *>::const_iterator E,
+                               unsigned Limit) {
     // Don't merge with a preprocessor directive.
     if (I[1]->Type == LT_PreprocessorDirective)
       return 0;
@@ -898,10 +973,9 @@ private:
 
   /// Returns the modified column limit for \p I if it is inside a macro and
   /// needs a trailing '\'.
-  unsigned
-  limitConsideringMacros(SmallVectorImpl<AnnotatedLine *>::const_iterator I,
-                         SmallVectorImpl<AnnotatedLine *>::const_iterator E,
-                         unsigned Limit) {
+  unsigned limitConsideringMacros(ArrayRef<AnnotatedLine *>::const_iterator I,
+                                  ArrayRef<AnnotatedLine *>::const_iterator E,
+                                  unsigned Limit) {
     if (I[0]->InPPDirective && I + 1 != E &&
         !I[1]->First->HasUnescapedNewline && I[1]->First->isNot(tok::eof)) {
       return Limit < 2 ? 0 : Limit - 2;
@@ -909,13 +983,28 @@ private:
     return Limit;
   }
 
-  bool nextTwoLinesFitInto(SmallVectorImpl<AnnotatedLine *>::const_iterator I,
+  bool nextTwoLinesFitInto(ArrayRef<AnnotatedLine *>::const_iterator I,
                            unsigned Limit) {
     if (I[1]->First->MustBreakBefore || I[2]->First->MustBreakBefore)
       return false;
     return 1 + I[1]->Last->TotalLength + 1 + I[2]->Last->TotalLength <= Limit;
   }
 
+  bool nextNLinesFitInto(ArrayRef<AnnotatedLine *>::const_iterator I,
+                         ArrayRef<AnnotatedLine *>::const_iterator E,
+                         unsigned Limit) {
+    unsigned JoinedLength = 0;
+    for (const auto *J = I + 1; J != E; ++J) {
+      if ((*J)->First->MustBreakBefore)
+        return false;
+
+      JoinedLength += 1 + (*J)->Last->TotalLength;
+      if (JoinedLength > Limit)
+        return false;
+    }
+    return true;
+  }
+
   bool containsMustBreak(const AnnotatedLine *Line) {
     assert(Line->First);
     // Ignore the first token, because in this situation, it applies more to the
@@ -943,9 +1032,9 @@ private:
 
   const FormatStyle &Style;
   const AdditionalKeywords &Keywords;
-  const SmallVectorImpl<AnnotatedLine *>::const_iterator End;
+  const ArrayRef<AnnotatedLine *>::const_iterator End;
 
-  SmallVectorImpl<AnnotatedLine *>::const_iterator Next;
+  ArrayRef<AnnotatedLine *>::const_iterator Next;
   const SmallVectorImpl<AnnotatedLine *> &AnnotatedLines;
 };
 
@@ -1493,6 +1582,23 @@ static auto computeNewlines(const AnnotatedLine &Line,
     Newlines = 1;
   }
 
+  if (Style.WrapNamespaceBodyWithEmptyLines != FormatStyle::WNBWELS_Leave) {
+    // Modify empty lines after TT_NamespaceLBrace.
+    if (PreviousLine && PreviousLine->endsWith(TT_NamespaceLBrace)) {
+      if (Style.WrapNamespaceBodyWithEmptyLines == FormatStyle::WNBWELS_Never)
+        Newlines = 1;
+      else if (!Line.startsWithNamespace())
+        Newlines = std::max(Newlines, 2u);
+    }
+    // Modify empty lines before TT_NamespaceRBrace.
+    if (Line.startsWith(TT_NamespaceRBrace)) {
+      if (Style.WrapNamespaceBodyWithEmptyLines == FormatStyle::WNBWELS_Never)
+        Newlines = 1;
+      else if (!PreviousLine->startsWith(TT_NamespaceRBrace))
+        Newlines = std::max(Newlines, 2u);
+    }
+  }
+
   // Insert or remove empty line before access specifiers.
   if (PreviousLine && RootToken.isAccessSpecifier()) {
     switch (Style.EmptyLineBeforeAccessModifier) {
diff --git a/clang/lib/Format/UnwrappedLineParser.cpp b/clang/lib/Format/UnwrappedLineParser.cpp
index 654148a..3177172 100644
--- a/clang/lib/Format/UnwrappedLineParser.cpp
+++ b/clang/lib/Format/UnwrappedLineParser.cpp
@@ -51,9 +51,7 @@ void printLine(llvm::raw_ostream &OS, const UnwrappedLine &Line,
        << "T=" << (unsigned)I->Tok->getType()
        << ", OC=" << I->Tok->OriginalColumn << ", \"" << I->Tok->TokenText
        << "\"] ";
-    for (SmallVectorImpl<UnwrappedLine>::const_iterator
-             CI = I->Children.begin(),
-             CE = I->Children.end();
+    for (const auto *CI = I->Children.begin(), *CE = I->Children.end();
          CI != CE; ++CI) {
       OS << "\n";
       printLine(OS, *CI, (Prefix + "  ").str());
@@ -394,7 +392,7 @@ bool UnwrappedLineParser::parseLevel(const FormatToken *OpeningBrace,
       break;
     case tok::l_brace:
       if (InRequiresExpression) {
-        FormatTok->setFinalizedType(TT_RequiresExpressionLBrace);
+        FormatTok->setFinalizedType(TT_CompoundRequirementLBrace);
       } else if (FormatTok->Previous &&
                  FormatTok->Previous->ClosesRequiresClause) {
         // We need the 'default' case here to correctly parse a function
@@ -1032,6 +1030,12 @@ void UnwrappedLineParser::parsePPDirective() {
   case tok::pp_pragma:
     parsePPPragma();
     break;
+  case tok::pp_error:
+  case tok::pp_warning:
+    nextToken();
+    if (!eof() && Style.isCpp())
+      FormatTok->setFinalizedType(TT_AfterPPDirective);
+    [[fallthrough]];
   default:
     parsePPUnknown();
     break;
@@ -1211,9 +1215,8 @@ void UnwrappedLineParser::parsePPPragma() {
 }
 
 void UnwrappedLineParser::parsePPUnknown() {
-  do {
+  while (!eof())
     nextToken();
-  } while (!eof());
   if (Style.IndentPPDirectives != FormatStyle::PPDIS_None)
     Line->Level += PPBranchLevel + 1;
   addUnwrappedLine();
@@ -1702,7 +1705,8 @@ void UnwrappedLineParser::parseStructuralElement(
   }
 
   for (const bool InRequiresExpression =
-           OpeningBrace && OpeningBrace->is(TT_RequiresExpressionLBrace);
+           OpeningBrace && OpeningBrace->isOneOf(TT_RequiresExpressionLBrace,
+                                                 TT_CompoundRequirementLBrace);
        !eof();) {
     if (IsCpp && FormatTok->isCppAlternativeOperatorKeyword()) {
       if (auto *Next = Tokens->peekNextToken(/*SkipComment=*/true);
@@ -2041,7 +2045,9 @@ void UnwrappedLineParser::parseStructuralElement(
                 ? FormatTok->NewlinesBefore > 0
                 : CommentsBeforeNextToken.front()->NewlinesBefore > 0;
 
-        if (FollowedByNewline && (Text.size() >= 5 || FunctionLike) &&
+        if (FollowedByNewline &&
+            (Text.size() >= 5 ||
+             (FunctionLike && FormatTok->isNot(tok::l_paren))) &&
             tokenCanStartNewLine(*FormatTok) && Text == Text.upper()) {
           if (PreviousToken->isNot(TT_UntouchableMacroFunc))
             PreviousToken->setFinalizedType(TT_FunctionLikeOrFreestandingMacro);
@@ -4788,8 +4794,7 @@ void UnwrappedLineParser::nextToken(int LevelDifference) {
 }
 
 void UnwrappedLineParser::distributeComments(
-    const SmallVectorImpl<FormatToken *> &Comments,
-    const FormatToken *NextTok) {
+    const ArrayRef<FormatToken *> &Comments, const FormatToken *NextTok) {
   // Whether or not a line comment token continues a line is controlled by
   // the method continuesLineCommentSection, with the following caveat:
   //
@@ -5011,7 +5016,7 @@ void UnwrappedLineParser::readToken(int LevelDifference) {
 namespace {
 template <typename Iterator>
 void pushTokens(Iterator Begin, Iterator End,
-                llvm::SmallVectorImpl<FormatToken *> &Into) {
+                SmallVectorImpl<FormatToken *> &Into) {
   for (auto I = Begin; I != End; ++I) {
     Into.push_back(I->Tok);
     for (const auto &Child : I->Children)
diff --git a/clang/lib/Format/UnwrappedLineParser.h b/clang/lib/Format/UnwrappedLineParser.h
index b7daf8d..8160d5e 100644
--- a/clang/lib/Format/UnwrappedLineParser.h
+++ b/clang/lib/Format/UnwrappedLineParser.h
@@ -228,7 +228,7 @@ private:
   // NextTok specifies the next token. A null pointer NextTok is supported, and
   // signifies either the absence of a next token, or that the next token
   // shouldn't be taken into account for the analysis.
-  void distributeComments(const SmallVectorImpl<FormatToken *> &Comments,
+  void distributeComments(const ArrayRef<FormatToken *> &Comments,
                           const FormatToken *NextTok);
 
   // Adds the comment preceding the next token to unwrapped lines.
diff --git a/clang/lib/Frontend/CompilerInvocation.cpp b/clang/lib/Frontend/CompilerInvocation.cpp
index 348c56c..d711df0 100644
--- a/clang/lib/Frontend/CompilerInvocation.cpp
+++ b/clang/lib/Frontend/CompilerInvocation.cpp
@@ -1260,6 +1260,23 @@ static void initOption(AnalyzerOptions::ConfigTable &Config,
       << Name << "an unsigned";
 }
 
+static void initOption(AnalyzerOptions::ConfigTable &Config,
+                       DiagnosticsEngine *Diags,
+                       PositiveAnalyzerOption &OptionField, StringRef Name,
+                       unsigned DefaultVal) {
+  auto Parsed = PositiveAnalyzerOption::create(
+      getStringOption(Config, Name, std::to_string(DefaultVal)));
+  if (Parsed.has_value()) {
+    OptionField = Parsed.value();
+    return;
+  }
+  if (Diags && !Parsed.has_value())
+    Diags->Report(diag::err_analyzer_config_invalid_input)
+        << Name << "a positive";
+
+  OptionField = DefaultVal;
+}
+
 static void parseAnalyzerConfigs(AnalyzerOptions &AnOpts,
                                  DiagnosticsEngine *Diags) {
   // TODO: There's no need to store the entire configtable, it'd be plenty
@@ -1691,7 +1708,7 @@ void CompilerInvocationBase::GenerateCodeGenArgs(const CodeGenOptions &Opts,
     }
   }
 
-  if (memcmp(Opts.CoverageVersion, "408*", 4) != 0)
+  if (memcmp(Opts.CoverageVersion, "0000", 4))
     GenerateArg(Consumer, OPT_coverage_version_EQ,
                 StringRef(Opts.CoverageVersion, 4));
 
@@ -2007,7 +2024,6 @@ bool CompilerInvocation::ParseCodeGenArgs(CodeGenOptions &Opts, ArgList &Args,
   } else if (Args.hasArg(OPT_fmemory_profile))
     Opts.MemoryProfileOutput = MemProfileBasename;
 
-  memcpy(Opts.CoverageVersion, "408*", 4);
   if (Opts.CoverageNotesFile.size() || Opts.CoverageDataFile.size()) {
     if (Args.hasArg(OPT_coverage_version_EQ)) {
       StringRef CoverageVersion = Args.getLastArgValue(OPT_coverage_version_EQ);
@@ -4263,6 +4279,7 @@ bool CompilerInvocation::ParseLangArgs(LangOptions &Opts, ArgList &Args,
 
       if (TT.getArch() == llvm::Triple::UnknownArch ||
           !(TT.getArch() == llvm::Triple::aarch64 || TT.isPPC() ||
+            TT.getArch() == llvm::Triple::spirv64 ||
             TT.getArch() == llvm::Triple::systemz ||
             TT.getArch() == llvm::Triple::loongarch64 ||
             TT.getArch() == llvm::Triple::nvptx ||
diff --git a/clang/lib/Frontend/DependencyFile.cpp b/clang/lib/Frontend/DependencyFile.cpp
index 528eae2..8a36d83 100644
--- a/clang/lib/Frontend/DependencyFile.cpp
+++ b/clang/lib/Frontend/DependencyFile.cpp
@@ -10,11 +10,11 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "clang/Frontend/Utils.h"
 #include "clang/Basic/FileManager.h"
 #include "clang/Basic/SourceManager.h"
 #include "clang/Frontend/DependencyOutputOptions.h"
 #include "clang/Frontend/FrontendDiagnostic.h"
+#include "clang/Frontend/Utils.h"
 #include "clang/Lex/DirectoryLookup.h"
 #include "clang/Lex/ModuleMap.h"
 #include "clang/Lex/PPCallbacks.h"
@@ -23,8 +23,10 @@
 #include "llvm/ADT/StringSet.h"
 #include "llvm/Support/FileSystem.h"
 #include "llvm/Support/Path.h"
+#include "llvm/Support/VirtualFileSystem.h"
 #include "llvm/Support/raw_ostream.h"
 #include <optional>
+#include <system_error>
 
 using namespace clang;
 
@@ -236,6 +238,7 @@ void DependencyFileGenerator::attachToPreprocessor(Preprocessor &PP) {
     PP.SetSuppressIncludeNotFoundError(true);
 
   DependencyCollector::attachToPreprocessor(PP);
+  FS = PP.getFileManager().getVirtualFileSystemPtr();
 }
 
 bool DependencyFileGenerator::sawDependency(StringRef Filename, bool FromModule,
@@ -312,11 +315,22 @@ void DependencyFileGenerator::finishedMainFile(DiagnosticsEngine &Diags) {
 /// https://msdn.microsoft.com/en-us/library/dd9y37ha.aspx for NMake info,
 /// https://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx
 /// for Windows file-naming info.
-static void PrintFilename(raw_ostream &OS, StringRef Filename,
+static void printFilename(raw_ostream &OS, llvm::vfs::FileSystem *FS,
+                          StringRef Filename,
                           DependencyOutputFormat OutputFormat) {
   // Convert filename to platform native path
   llvm::SmallString<256> NativePath;
   llvm::sys::path::native(Filename.str(), NativePath);
+  // Resolve absolute path. Make and Ninja canonicalize paths
+  // without checking for symbolic links in the path, for performance concerns.
+  // If there is something like `/bin/../lib64` -> `/usr/lib64`
+  // (where `/bin` links to `/usr/bin`), Make will see them as `/lib64`.
+  if (FS != nullptr && llvm::sys::path::is_absolute(NativePath)) {
+    llvm::SmallString<256> NativePathTmp = NativePath;
+    std::error_code EC = FS->getRealPath(NativePathTmp, NativePath);
+    if (EC)
+      NativePath = NativePathTmp;
+  }
 
   if (OutputFormat == DependencyOutputFormat::NMake) {
     // Add quotes if needed. These are the characters listed as "special" to
@@ -400,7 +414,7 @@ void DependencyFileGenerator::outputDependencyFile(llvm::raw_ostream &OS) {
       Columns = 2;
     }
     OS << ' ';
-    PrintFilename(OS, File, OutputFormat);
+    printFilename(OS, FS.get(), File, OutputFormat);
     Columns += N + 1;
   }
   OS << '\n';
@@ -411,7 +425,7 @@ void DependencyFileGenerator::outputDependencyFile(llvm::raw_ostream &OS) {
     for (auto I = Files.begin(), E = Files.end(); I != E; ++I) {
       if (Index++ == InputFileIndex)
         continue;
-      PrintFilename(OS, *I, OutputFormat);
+      printFilename(OS, FS.get(), *I, OutputFormat);
       OS << ":\n";
     }
   }
diff --git a/clang/lib/Frontend/FrontendActions.cpp b/clang/lib/Frontend/FrontendActions.cpp
index e943f14..30dfa54 100644
--- a/clang/lib/Frontend/FrontendActions.cpp
+++ b/clang/lib/Frontend/FrontendActions.cpp
@@ -279,12 +279,14 @@ GenerateModuleInterfaceAction::CreateASTConsumer(CompilerInstance &CI,
       !CI.getFrontendOpts().ModuleOutputPath.empty()) {
     Consumers.push_back(std::make_unique<ReducedBMIGenerator>(
         CI.getPreprocessor(), CI.getModuleCache(),
-        CI.getFrontendOpts().ModuleOutputPath));
+        CI.getFrontendOpts().ModuleOutputPath,
+        +CI.getFrontendOpts().AllowPCMWithCompilerErrors));
   }
 
   Consumers.push_back(std::make_unique<CXX20ModulesGenerator>(
       CI.getPreprocessor(), CI.getModuleCache(),
-      CI.getFrontendOpts().OutputFile));
+      CI.getFrontendOpts().OutputFile,
+      +CI.getFrontendOpts().AllowPCMWithCompilerErrors));
 
   return std::make_unique<MultiplexConsumer>(std::move(Consumers));
 }
diff --git a/clang/lib/Frontend/InitPreprocessor.cpp b/clang/lib/Frontend/InitPreprocessor.cpp
index 29723b5..8eba766 100644
--- a/clang/lib/Frontend/InitPreprocessor.cpp
+++ b/clang/lib/Frontend/InitPreprocessor.cpp
@@ -1507,6 +1507,11 @@ static void InitializePredefinedMacros(const TargetInfo &TI,
   // ELF targets define __ELF__
   if (TI.getTriple().isOSBinFormatELF())
     Builder.defineMacro("__ELF__");
+  else if (TI.getTriple().isAppleMachO())
+    // Apple MachO targets define __MACH__ even when not using DarwinTargetInfo.
+    // Hurd will also define this in some circumstances, but that's done in
+    // HurdTargetInfo. Windows targets don't define this.
+    Builder.defineMacro("__MACH__");
 
   // Target OS macro definitions.
   if (PPOpts.DefineTargetOSMacros) {
diff --git a/clang/lib/Lex/InitHeaderSearch.cpp b/clang/lib/Lex/InitHeaderSearch.cpp
index 67c9d92..bb2a213 100644
--- a/clang/lib/Lex/InitHeaderSearch.cpp
+++ b/clang/lib/Lex/InitHeaderSearch.cpp
@@ -313,7 +313,7 @@ bool InitHeaderSearch::ShouldAddDefaultIncludePaths(
     break;
 
   case llvm::Triple::UnknownOS:
-    if (triple.isWasm())
+    if (triple.isWasm() || triple.isAppleMachO())
       return false;
     break;
 
diff --git a/clang/lib/Parse/ParseOpenACC.cpp b/clang/lib/Parse/ParseOpenACC.cpp
index ede4090..c79ba97 100644
--- a/clang/lib/Parse/ParseOpenACC.cpp
+++ b/clang/lib/Parse/ParseOpenACC.cpp
@@ -1003,7 +1003,9 @@ Parser::OpenACCClauseParseResult Parser::ParseOpenACCClauseParams(
       // the 'update' clause, so we have to handle it here.  U se an assert to
       // make sure we get the right differentiator.
       assert(DirKind == OpenACCDirectiveKind::Update);
-      [[fallthrough]];
+      ParsedClause.setVarListDetails(ParseOpenACCVarList(ClauseKind),
+                                     /*IsReadOnly=*/false, /*IsZero=*/false);
+      break;
     case OpenACCClauseKind::Device:
     case OpenACCClauseKind::DeviceResident:
     case OpenACCClauseKind::Host:
@@ -1082,13 +1084,7 @@ Parser::OpenACCClauseParseResult Parser::ParseOpenACCClauseParams(
         return OpenACCCanContinue();
       }
 
-      // TODO OpenACC: as we implement the 'rest' of the above, this 'if' should
-      // be removed leaving just the 'setIntExprDetails'.
-      if (ClauseKind == OpenACCClauseKind::NumWorkers ||
-          ClauseKind == OpenACCClauseKind::DeviceNum ||
-          ClauseKind == OpenACCClauseKind::VectorLength)
-        ParsedClause.setIntExprDetails(IntExpr.get());
-
+      ParsedClause.setIntExprDetails(IntExpr.get());
       break;
     }
     case OpenACCClauseKind::DType:
diff --git a/clang/lib/Parse/Parser.cpp b/clang/lib/Parse/Parser.cpp
index 8ba6a5dc..0710542 100644
--- a/clang/lib/Parse/Parser.cpp
+++ b/clang/lib/Parse/Parser.cpp
@@ -2654,10 +2654,10 @@ Decl *Parser::ParseModuleImport(SourceLocation AtLoc,
       SeenError = false;
     break;
   }
-  if (SeenError) {
-    ExpectAndConsumeSemi(diag::err_module_expected_semi);
+  ExpectAndConsumeSemi(diag::err_module_expected_semi);
+
+  if (SeenError)
     return nullptr;
-  }
 
   DeclResult Import;
   if (HeaderUnit)
@@ -2666,7 +2666,6 @@ Decl *Parser::ParseModuleImport(SourceLocation AtLoc,
   else if (!Path.empty())
     Import = Actions.ActOnModuleImport(StartLoc, ExportLoc, ImportLoc, Path,
                                        IsPartition);
-  ExpectAndConsumeSemi(diag::err_module_expected_semi);
   if (Import.isInvalid())
     return nullptr;
 
diff --git a/clang/lib/Sema/CMakeLists.txt b/clang/lib/Sema/CMakeLists.txt
index 719c3a9..3241cb5 100644
--- a/clang/lib/Sema/CMakeLists.txt
+++ b/clang/lib/Sema/CMakeLists.txt
@@ -79,6 +79,7 @@ add_clang_library(clangSema
   SemaStmt.cpp
   SemaStmtAsm.cpp
   SemaStmtAttr.cpp
+  SemaSPIRV.cpp
   SemaSYCL.cpp
   SemaSwift.cpp
   SemaSystemZ.cpp
diff --git a/clang/lib/Sema/Sema.cpp b/clang/lib/Sema/Sema.cpp
index d651751..abb46d3 100644
--- a/clang/lib/Sema/Sema.cpp
+++ b/clang/lib/Sema/Sema.cpp
@@ -61,6 +61,7 @@
 #include "clang/Sema/SemaPPC.h"
 #include "clang/Sema/SemaPseudoObject.h"
 #include "clang/Sema/SemaRISCV.h"
+#include "clang/Sema/SemaSPIRV.h"
 #include "clang/Sema/SemaSYCL.h"
 #include "clang/Sema/SemaSwift.h"
 #include "clang/Sema/SemaSystemZ.h"
@@ -239,6 +240,7 @@ Sema::Sema(Preprocessor &pp, ASTContext &ctxt, ASTConsumer &consumer,
       PPCPtr(std::make_unique<SemaPPC>(*this)),
       PseudoObjectPtr(std::make_unique<SemaPseudoObject>(*this)),
       RISCVPtr(std::make_unique<SemaRISCV>(*this)),
+      SPIRVPtr(std::make_unique<SemaSPIRV>(*this)),
       SYCLPtr(std::make_unique<SemaSYCL>(*this)),
       SwiftPtr(std::make_unique<SemaSwift>(*this)),
       SystemZPtr(std::make_unique<SemaSystemZ>(*this)),
diff --git a/clang/lib/Sema/SemaARM.cpp b/clang/lib/Sema/SemaARM.cpp
index 3e93b38..411baa0 100644
--- a/clang/lib/Sema/SemaARM.cpp
+++ b/clang/lib/Sema/SemaARM.cpp
@@ -372,7 +372,7 @@ enum ArmSMEState : unsigned {
 
 bool SemaARM::CheckImmediateArg(CallExpr *TheCall, unsigned CheckTy,
                                 unsigned ArgIdx, unsigned EltBitWidth,
-                                unsigned VecBitWidth) {
+                                unsigned ContainerBitWidth) {
   // Function that checks whether the operand (ArgIdx) is an immediate
   // that is one of a given set of values.
   auto CheckImmediateInSet = [&](std::initializer_list<int64_t> Set,
@@ -445,17 +445,17 @@ bool SemaARM::CheckImmediateArg(CallExpr *TheCall, unsigned CheckTy,
     break;
   case ImmCheckType::ImmCheckLaneIndex:
     if (SemaRef.BuiltinConstantArgRange(TheCall, ArgIdx, 0,
-                                        (VecBitWidth / EltBitWidth) - 1))
+                                        (ContainerBitWidth / EltBitWidth) - 1))
       return true;
     break;
   case ImmCheckType::ImmCheckLaneIndexCompRotate:
-    if (SemaRef.BuiltinConstantArgRange(TheCall, ArgIdx, 0,
-                                        (VecBitWidth / (2 * EltBitWidth)) - 1))
+    if (SemaRef.BuiltinConstantArgRange(
+            TheCall, ArgIdx, 0, (ContainerBitWidth / (2 * EltBitWidth)) - 1))
       return true;
     break;
   case ImmCheckType::ImmCheckLaneIndexDot:
-    if (SemaRef.BuiltinConstantArgRange(TheCall, ArgIdx, 0,
-                                        (VecBitWidth / (4 * EltBitWidth)) - 1))
+    if (SemaRef.BuiltinConstantArgRange(
+            TheCall, ArgIdx, 0, (ContainerBitWidth / (4 * EltBitWidth)) - 1))
       return true;
     break;
   case ImmCheckType::ImmCheckComplexRot90_270:
@@ -515,13 +515,13 @@ bool SemaARM::PerformNeonImmChecks(
   bool HasError = false;
 
   for (const auto &I : ImmChecks) {
-    auto [ArgIdx, CheckTy, ElementSizeInBits, VecSizeInBits] = I;
+    auto [ArgIdx, CheckTy, ElementBitWidth, VecBitWidth] = I;
 
     if (OverloadType >= 0)
-      ElementSizeInBits = NeonTypeFlags(OverloadType).getEltSizeInBits();
+      ElementBitWidth = NeonTypeFlags(OverloadType).getEltSizeInBits();
 
-    HasError |= CheckImmediateArg(TheCall, CheckTy, ArgIdx, ElementSizeInBits,
-                                  VecSizeInBits);
+    HasError |= CheckImmediateArg(TheCall, CheckTy, ArgIdx, ElementBitWidth,
+                                  VecBitWidth);
   }
 
   return HasError;
@@ -532,9 +532,9 @@ bool SemaARM::PerformSVEImmChecks(
   bool HasError = false;
 
   for (const auto &I : ImmChecks) {
-    auto [ArgIdx, CheckTy, ElementSizeInBits] = I;
+    auto [ArgIdx, CheckTy, ElementBitWidth] = I;
     HasError |=
-        CheckImmediateArg(TheCall, CheckTy, ArgIdx, ElementSizeInBits, 128);
+        CheckImmediateArg(TheCall, CheckTy, ArgIdx, ElementBitWidth, 128);
   }
 
   return HasError;
diff --git a/clang/lib/Sema/SemaAttr.cpp b/clang/lib/Sema/SemaAttr.cpp
index 44485e7..42aa68d 100644
--- a/clang/lib/Sema/SemaAttr.cpp
+++ b/clang/lib/Sema/SemaAttr.cpp
@@ -307,8 +307,8 @@ void Sema::inferLifetimeCaptureByAttribute(FunctionDecl *FD) {
       Annotate(MD);
     return;
   }
-  static const llvm::StringSet<> CapturingMethods{"insert", "push",
-                                                  "push_front", "push_back"};
+  static const llvm::StringSet<> CapturingMethods{
+      "insert", "insert_or_assign", "push", "push_front", "push_back"};
   if (!CapturingMethods.contains(MD->getName()))
     return;
   Annotate(MD);
diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp
index ce846ae..28dcfaa 100644
--- a/clang/lib/Sema/SemaChecking.cpp
+++ b/clang/lib/Sema/SemaChecking.cpp
@@ -70,6 +70,7 @@
 #include "clang/Sema/SemaOpenCL.h"
 #include "clang/Sema/SemaPPC.h"
 #include "clang/Sema/SemaRISCV.h"
+#include "clang/Sema/SemaSPIRV.h"
 #include "clang/Sema/SemaSystemZ.h"
 #include "clang/Sema/SemaWasm.h"
 #include "clang/Sema/SemaX86.h"
@@ -1934,6 +1935,8 @@ bool Sema::CheckTSBuiltinFunctionCall(const TargetInfo &TI, unsigned BuiltinID,
   case llvm::Triple::mips64:
   case llvm::Triple::mips64el:
     return MIPS().CheckMipsBuiltinFunctionCall(TI, BuiltinID, TheCall);
+  case llvm::Triple::spirv:
+    return SPIRV().CheckSPIRVBuiltinFunctionCall(BuiltinID, TheCall);
   case llvm::Triple::systemz:
     return SystemZ().CheckSystemZBuiltinFunctionCall(BuiltinID, TheCall);
   case llvm::Triple::x86:
diff --git a/clang/lib/Sema/SemaExceptionSpec.cpp b/clang/lib/Sema/SemaExceptionSpec.cpp
index ac36663..ac5d51a1 100644
--- a/clang/lib/Sema/SemaExceptionSpec.cpp
+++ b/clang/lib/Sema/SemaExceptionSpec.cpp
@@ -1401,6 +1401,8 @@ CanThrowResult Sema::canThrow(const Stmt *S) {
   case Stmt::OpenACCWaitConstructClass:
   case Stmt::OpenACCInitConstructClass:
   case Stmt::OpenACCShutdownConstructClass:
+  case Stmt::OpenACCSetConstructClass:
+  case Stmt::OpenACCUpdateConstructClass:
     // These expressions can never throw.
     return CT_Cannot;
 
diff --git a/clang/lib/Sema/SemaExpr.cpp b/clang/lib/Sema/SemaExpr.cpp
index 24f7d27..ae40895 100644
--- a/clang/lib/Sema/SemaExpr.cpp
+++ b/clang/lib/Sema/SemaExpr.cpp
@@ -3352,6 +3352,7 @@ ExprResult Sema::BuildDeclarationNameExpr(
   case Decl::VarTemplateSpecialization:
   case Decl::VarTemplatePartialSpecialization:
   case Decl::Decomposition:
+  case Decl::Binding:
   case Decl::OMPCapturedExpr:
     // In C, "extern void blah;" is valid and is an r-value.
     if (!getLangOpts().CPlusPlus && !type.hasQualifiers() &&
@@ -3371,20 +3372,13 @@ ExprResult Sema::BuildDeclarationNameExpr(
     // potentially-evaluated contexts? Since the variable isn't actually
     // captured in an unevaluated context, it seems that the answer is no.
     if (!isUnevaluatedContext()) {
-      QualType CapturedType = getCapturedDeclRefType(cast<VarDecl>(VD), Loc);
+      QualType CapturedType = getCapturedDeclRefType(cast<ValueDecl>(VD), Loc);
       if (!CapturedType.isNull())
         type = CapturedType;
     }
-
     break;
   }
 
-  case Decl::Binding:
-    // These are always lvalues.
-    valueKind = VK_LValue;
-    type = type.getNonReferenceType();
-    break;
-
   case Decl::Function: {
     if (unsigned BID = cast<FunctionDecl>(VD)->getBuiltinID()) {
       if (!Context.BuiltinInfo.isDirectlyAddressable(BID)) {
@@ -13297,11 +13291,24 @@ static NonConstCaptureKind isReferenceToNonConstCapture(Sema &S, Expr *E) {
   if (!DRE) return NCCK_None;
   if (!DRE->refersToEnclosingVariableOrCapture()) return NCCK_None;
 
-  // The declaration must be a variable which is not declared 'const'.
-  VarDecl *var = dyn_cast<VarDecl>(DRE->getDecl());
-  if (!var) return NCCK_None;
-  if (var->getType().isConstQualified()) return NCCK_None;
-  assert(var->hasLocalStorage() && "capture added 'const' to non-local?");
+  ValueDecl *Value = dyn_cast<ValueDecl>(DRE->getDecl());
+
+  // The declaration must be a value which is not declared 'const'.
+  if (!Value || Value->getType().isConstQualified())
+    return NCCK_None;
+
+  BindingDecl *Binding = dyn_cast<BindingDecl>(Value);
+  if (Binding) {
+    assert(S.getLangOpts().CPlusPlus && "BindingDecl outside of C++?");
+    assert(!isa<BlockDecl>(Binding->getDeclContext()));
+    return NCCK_Lambda;
+  }
+
+  VarDecl *Var = dyn_cast<VarDecl>(Value);
+  if (!Var)
+    return NCCK_None;
+
+  assert(Var->hasLocalStorage() && "capture added 'const' to non-local?");
 
   // Decide whether the first capture was for a block or a lambda.
   DeclContext *DC = S.CurContext, *Prev = nullptr;
@@ -13310,16 +13317,16 @@ static NonConstCaptureKind isReferenceToNonConstCapture(Sema &S, Expr *E) {
     // For init-capture, it is possible that the variable belongs to the
     // template pattern of the current context.
     if (auto *FD = dyn_cast<FunctionDecl>(DC))
-      if (var->isInitCapture() &&
-          FD->getTemplateInstantiationPattern() == var->getDeclContext())
+      if (Var->isInitCapture() &&
+          FD->getTemplateInstantiationPattern() == Var->getDeclContext())
         break;
-    if (DC == var->getDeclContext())
+    if (DC == Var->getDeclContext())
       break;
     Prev = DC;
     DC = DC->getParent();
   }
   // Unless we have an init-capture, we've gone one step too far.
-  if (!var->isInitCapture())
+  if (!Var->isInitCapture())
     DC = Prev;
   return (isa<BlockDecl>(DC) ? NCCK_Block : NCCK_Lambda);
 }
@@ -16585,6 +16592,13 @@ ExprResult Sema::BuildVAArgExpr(SourceLocation BuiltinLoc,
         << TInfo->getTypeLoc().getSourceRange();
     }
 
+    if (TInfo->getType()->isArrayType()) {
+      DiagRuntimeBehavior(TInfo->getTypeLoc().getBeginLoc(), E,
+                          PDiag(diag::warn_second_parameter_to_va_arg_array)
+                              << TInfo->getType()
+                              << TInfo->getTypeLoc().getSourceRange());
+    }
+
     // Check for va_arg where arguments of the given type will be promoted
     // (i.e. this va_arg is guaranteed to have undefined behavior).
     QualType PromoteType;
@@ -19247,6 +19261,8 @@ bool Sema::NeedToCaptureVariable(ValueDecl *Var, SourceLocation Loc) {
 }
 
 QualType Sema::getCapturedDeclRefType(ValueDecl *Var, SourceLocation Loc) {
+  assert(Var && "Null value cannot be captured");
+
   QualType CaptureType;
   QualType DeclRefType;
 
diff --git a/clang/lib/Sema/SemaInit.cpp b/clang/lib/Sema/SemaInit.cpp
index 5909457..0dd5f46 100644
--- a/clang/lib/Sema/SemaInit.cpp
+++ b/clang/lib/Sema/SemaInit.cpp
@@ -2030,13 +2030,8 @@ canInitializeArrayWithEmbedDataString(ArrayRef<Expr *> ExprList,
 
   if (InitType->isArrayType()) {
     const ArrayType *InitArrayType = InitType->getAsArrayTypeUnsafe();
-    QualType InitElementTy = InitArrayType->getElementType();
-    QualType EmbedExprElementTy = EE->getDataStringLiteral()->getType();
-    const bool TypesMatch =
-        Context.typesAreCompatible(InitElementTy, EmbedExprElementTy) ||
-        (InitElementTy->isCharType() && EmbedExprElementTy->isCharType());
-    if (TypesMatch)
-      return true;
+    StringLiteral *SL = EE->getDataStringLiteral();
+    return IsStringInit(SL, InitArrayType, Context) == SIF_None;
   }
   return false;
 }
diff --git a/clang/lib/Sema/SemaOpenACC.cpp b/clang/lib/Sema/SemaOpenACC.cpp
index 42bbdf1..51a95f9 100644
--- a/clang/lib/Sema/SemaOpenACC.cpp
+++ b/clang/lib/Sema/SemaOpenACC.cpp
@@ -463,6 +463,14 @@ bool doesClauseApplyToDirective(OpenACCDirectiveKind DirectiveKind,
       return false;
     }
   }
+  case OpenACCClauseKind::DefaultAsync: {
+    switch (DirectiveKind) {
+    case OpenACCDirectiveKind::Set:
+      return true;
+    default:
+      return false;
+    }
+  }
   }
 
   default:
@@ -490,12 +498,9 @@ bool checkAlreadyHasClauseOfKind(
 bool checkValidAfterDeviceType(
     SemaOpenACC &S, const OpenACCDeviceTypeClause &DeviceTypeClause,
     const SemaOpenACC::OpenACCParsedClause &NewClause) {
-  // This is only a requirement on compute, combined, data and loop constructs
-  // so far, so this is fine otherwise.
-  if (!isOpenACCComputeDirectiveKind(NewClause.getDirectiveKind()) &&
-      !isOpenACCCombinedDirectiveKind(NewClause.getDirectiveKind()) &&
-      NewClause.getDirectiveKind() != OpenACCDirectiveKind::Loop &&
-      NewClause.getDirectiveKind() != OpenACCDirectiveKind::Data)
+  // This is implemented for everything but 'routine', so treat as 'fine' for
+  // that.
+  if (NewClause.getDirectiveKind() == OpenACCDirectiveKind::Routine)
     return false;
 
   // OpenACC3.3: Section 2.4: Clauses that precede any device_type clause are
@@ -570,6 +575,21 @@ bool checkValidAfterDeviceType(
     default:
       break;
     }
+  } else if (NewClause.getDirectiveKind() == OpenACCDirectiveKind::Set ||
+             NewClause.getDirectiveKind() == OpenACCDirectiveKind::Init ||
+             NewClause.getDirectiveKind() == OpenACCDirectiveKind::Shutdown) {
+    // There are no restrictions on 'set', 'init', or 'shutdown'.
+    return false;
+  } else if (NewClause.getDirectiveKind() == OpenACCDirectiveKind::Update) {
+    // OpenACC3.3 section 2.14.4: Only the async and wait clauses may follow a
+    // device_type clause.
+    switch (NewClause.getClauseKind()) {
+    case OpenACCClauseKind::Async:
+    case OpenACCClauseKind::Wait:
+      return false;
+    default:
+      break;
+    }
   }
   S.Diag(NewClause.getBeginLoc(), diag::err_acc_clause_after_device_type)
       << NewClause.getClauseKind() << DeviceTypeClause.getClauseKind()
@@ -587,7 +607,8 @@ bool isDirectiveKindImplemented(OpenACCDirectiveKind DK) {
          isOpenACCCombinedDirectiveKind(DK) || isOpenACCDataDirectiveKind(DK) ||
          DK == OpenACCDirectiveKind::Loop || DK == OpenACCDirectiveKind::Wait ||
          DK == OpenACCDirectiveKind::Init ||
-         DK == OpenACCDirectiveKind::Shutdown;
+         DK == OpenACCDirectiveKind::Shutdown ||
+         DK == OpenACCDirectiveKind::Set;
 }
 
 class SemaOpenACCClauseVisitor {
@@ -700,18 +721,11 @@ OpenACCClause *SemaOpenACCClauseVisitor::VisitTileClause(
 
 OpenACCClause *SemaOpenACCClauseVisitor::VisitIfClause(
     SemaOpenACC::OpenACCParsedClause &Clause) {
-  // Restrictions only properly implemented on 'compute'/'combined'/'data'
-  // constructs, and 'compute'/'combined'/'data' constructs are the only
-  // constructs that can do anything with this yet, so skip/treat as
-  // unimplemented in this case.
-  if (!isDirectiveKindImplemented(Clause.getDirectiveKind()))
-    return isNotImplemented();
-
   // There is no prose in the standard that says duplicates aren't allowed,
   // but this diagnostic is present in other compilers, as well as makes
-  // sense. Prose DOES exist for 'data' and 'host_data', 'enter data' and 'exit
-  // data' both don't, but other implmementations do this.  OpenACC issue 519
-  // filed for the latter two.
+  // sense. Prose DOES exist for 'data' and 'host_data', 'set', 'enter data' and
+  // 'exit data' both don't, but other implmementations do this.  OpenACC issue
+  // 519 filed for the latter two. Prose also exists for 'update'.
   // GCC allows this on init/shutdown, presumably for good reason, so we do too.
   if (Clause.getDirectiveKind() != OpenACCDirectiveKind::Init &&
       Clause.getDirectiveKind() != OpenACCDirectiveKind::Shutdown &&
@@ -722,14 +736,14 @@ OpenACCClause *SemaOpenACCClauseVisitor::VisitIfClause(
   // isn't really much to do here.
 
   // If the 'if' clause is true, it makes the 'self' clause have no effect,
-  // diagnose that here.
-  // TODO OpenACC: When we add these two to other constructs, we might not
-  // want to warn on this (for example, 'update').
-  const auto *Itr =
-      llvm::find_if(ExistingClauses, llvm::IsaPred<OpenACCSelfClause>);
-  if (Itr != ExistingClauses.end()) {
-    SemaRef.Diag(Clause.getBeginLoc(), diag::warn_acc_if_self_conflict);
-    SemaRef.Diag((*Itr)->getBeginLoc(), diag::note_acc_previous_clause_here);
+  // diagnose that here.  This only applies on compute/combined constructs.
+  if (Clause.getDirectiveKind() != OpenACCDirectiveKind::Update) {
+    const auto *Itr =
+        llvm::find_if(ExistingClauses, llvm::IsaPred<OpenACCSelfClause>);
+    if (Itr != ExistingClauses.end()) {
+      SemaRef.Diag(Clause.getBeginLoc(), diag::warn_acc_if_self_conflict);
+      SemaRef.Diag((*Itr)->getBeginLoc(), diag::note_acc_previous_clause_here);
+    }
   }
 
   return OpenACCIfClause::Create(Ctx, Clause.getBeginLoc(),
@@ -739,16 +753,6 @@ OpenACCClause *SemaOpenACCClauseVisitor::VisitIfClause(
 
 OpenACCClause *SemaOpenACCClauseVisitor::VisitSelfClause(
     SemaOpenACC::OpenACCParsedClause &Clause) {
-  // Restrictions only properly implemented on 'compute' constructs, and
-  // 'compute' constructs are the only construct that can do anything with
-  // this yet, so skip/treat as unimplemented in this case.
-  if (!isDirectiveKindImplemented(Clause.getDirectiveKind()))
-    return isNotImplemented();
-
-  // TODO OpenACC: When we implement this for 'update', this takes a
-  // 'var-list' instead of a condition expression, so semantics/handling has
-  // to happen differently here.
-
   // There is no prose in the standard that says duplicates aren't allowed,
   // but this diagnostic is present in other compilers, as well as makes
   // sense.
@@ -756,9 +760,12 @@ OpenACCClause *SemaOpenACCClauseVisitor::VisitSelfClause(
     return nullptr;
 
   // If the 'if' clause is true, it makes the 'self' clause have no effect,
-  // diagnose that here.
-  // TODO OpenACC: When we add these two to other constructs, we might not
-  // want to warn on this (for example, 'update').
+  // diagnose that here.  This only applies on compute/combined constructs.
+  if (Clause.getDirectiveKind() == OpenACCDirectiveKind::Update)
+    return OpenACCSelfClause::Create(Ctx, Clause.getBeginLoc(),
+                                     Clause.getLParenLoc(), Clause.getVarList(),
+                                     Clause.getEndLoc());
+
   const auto *Itr =
       llvm::find_if(ExistingClauses, llvm::IsaPred<OpenACCIfClause>);
   if (Itr != ExistingClauses.end()) {
@@ -935,13 +942,6 @@ OpenACCClause *SemaOpenACCClauseVisitor::VisitVectorLengthClause(
 
 OpenACCClause *SemaOpenACCClauseVisitor::VisitAsyncClause(
     SemaOpenACC::OpenACCParsedClause &Clause) {
-  // Restrictions only properly implemented on 'compute'/'combined'/'data'
-  // constructs, and 'compute'/'combined'/'data' constructs are the only
-  // construct that can do anything with this yet, so skip/treat as
-  // unimplemented in this case.
-  if (!isDirectiveKindImplemented(Clause.getDirectiveKind()))
-    return isNotImplemented();
-
   // There is no prose in the standard that says duplicates aren't allowed,
   // but this diagnostic is present in other compilers, as well as makes
   // sense.
@@ -963,6 +963,12 @@ OpenACCClause *SemaOpenACCClauseVisitor::VisitDeviceNumClause(
   if (!isDirectiveKindImplemented(Clause.getDirectiveKind()))
     return isNotImplemented();
 
+  // OpenACC 3.3 2.14.3: Two instances of the same clause may not appear on the
+  // same directive.
+  if (Clause.getDirectiveKind() == OpenACCDirectiveKind::Set &&
+      checkAlreadyHasClauseOfKind(SemaRef, ExistingClauses, Clause))
+    return nullptr;
+
   assert(Clause.getNumIntExprs() == 1 &&
          "Invalid number of expressions for device_num");
   return OpenACCDeviceNumClause::Create(
@@ -970,6 +976,20 @@ OpenACCClause *SemaOpenACCClauseVisitor::VisitDeviceNumClause(
       Clause.getEndLoc());
 }
 
+OpenACCClause *SemaOpenACCClauseVisitor::VisitDefaultAsyncClause(
+    SemaOpenACC::OpenACCParsedClause &Clause) {
+  // OpenACC 3.3 2.14.3: Two instances of the same clause may not appear on the
+  // same directive.
+  if (checkAlreadyHasClauseOfKind(SemaRef, ExistingClauses, Clause))
+    return nullptr;
+
+  assert(Clause.getNumIntExprs() == 1 &&
+         "Invalid number of expressions for default_async");
+  return OpenACCDefaultAsyncClause::Create(
+      Ctx, Clause.getBeginLoc(), Clause.getLParenLoc(), Clause.getIntExprs()[0],
+      Clause.getEndLoc());
+}
+
 OpenACCClause *SemaOpenACCClauseVisitor::VisitPrivateClause(
     SemaOpenACC::OpenACCParsedClause &Clause) {
   // ActOnVar ensured that everything is a valid variable reference, so there
@@ -1156,13 +1176,6 @@ OpenACCClause *SemaOpenACCClauseVisitor::VisitDevicePtrClause(
 
 OpenACCClause *SemaOpenACCClauseVisitor::VisitWaitClause(
     SemaOpenACC::OpenACCParsedClause &Clause) {
-  // Restrictions only properly implemented on 'compute'/'combined'/'data'
-  // constructs, and 'compute'/'combined'/'data' constructs are the only
-  // construct that can do anything with this yet, so skip/treat as
-  // unimplemented in this case.
-  if (!isDirectiveKindImplemented(Clause.getDirectiveKind()))
-    return isNotImplemented();
-
   return OpenACCWaitClause::Create(
       Ctx, Clause.getBeginLoc(), Clause.getLParenLoc(), Clause.getDevNumExpr(),
       Clause.getQueuesLoc(), Clause.getQueueIdExprs(), Clause.getEndLoc());
@@ -1170,13 +1183,16 @@ OpenACCClause *SemaOpenACCClauseVisitor::VisitWaitClause(
 
 OpenACCClause *SemaOpenACCClauseVisitor::VisitDeviceTypeClause(
     SemaOpenACC::OpenACCParsedClause &Clause) {
-  // Restrictions only properly implemented on 'compute', 'combined', 'data' and
-  // 'loop' constructs, and 'compute'/'combined'/'data'/'loop' constructs are
-  // the only construct that can do anything with this yet, so skip/treat as
-  // unimplemented in this case.
-  if (!isDirectiveKindImplemented(Clause.getDirectiveKind()))
+  // Restrictions implemented properly on everything except 'routine'.
+  if (Clause.getDirectiveKind() == OpenACCDirectiveKind::Routine)
     return isNotImplemented();
 
+  // OpenACC 3.3 2.14.3: Two instances of the same clause may not appear on the
+  // same directive.
+  if (Clause.getDirectiveKind() == OpenACCDirectiveKind::Set &&
+      checkAlreadyHasClauseOfKind(SemaRef, ExistingClauses, Clause))
+    return nullptr;
+
   // TODO OpenACC: Once we get enough of the CodeGen implemented that we have
   // a source for the list of valid architectures, we need to warn on unknown
   // identifiers here.
@@ -1709,8 +1725,6 @@ OpenACCClause *SemaOpenACCClauseVisitor::VisitFinalizeClause(
 
 OpenACCClause *SemaOpenACCClauseVisitor::VisitIfPresentClause(
     SemaOpenACC::OpenACCParsedClause &Clause) {
-  if (!isDirectiveKindImplemented(Clause.getDirectiveKind()))
-    return isNotImplemented();
   // There isn't anything to do here, this is only valid on one construct, and
   // has no associated rules.
   return OpenACCIfPresentClause::Create(Ctx, Clause.getBeginLoc(),
@@ -1900,6 +1914,8 @@ bool PreserveLoopRAIIDepthInAssociatedStmtRAII(OpenACCDirectiveKind DK) {
   case OpenACCDirectiveKind::Wait:
   case OpenACCDirectiveKind::Init:
   case OpenACCDirectiveKind::Shutdown:
+  case OpenACCDirectiveKind::Set:
+  case OpenACCDirectiveKind::Update:
     llvm_unreachable("Doesn't have an associated stmt");
   default:
   case OpenACCDirectiveKind::Invalid:
@@ -2328,6 +2344,8 @@ void SemaOpenACC::ActOnConstruct(OpenACCDirectiveKind K,
   case OpenACCDirectiveKind::HostData:
   case OpenACCDirectiveKind::Init:
   case OpenACCDirectiveKind::Shutdown:
+  case OpenACCDirectiveKind::Set:
+  case OpenACCDirectiveKind::Update:
     // Nothing to do here, there is no real legalization that needs to happen
     // here as these constructs do not take any arguments.
     break;
@@ -3661,6 +3679,24 @@ bool SemaOpenACC::ActOnStartStmtDirective(
     return Diag(StartLoc, diag::err_acc_construct_one_clause_of)
            << K << GetListOfClauses({OpenACCClauseKind::UseDevice});
 
+  // OpenACC3.3 2.14.3: At least one default_async, device_num, or device_type
+  // clause must appear.
+  if (K == OpenACCDirectiveKind::Set &&
+      llvm::find_if(
+          Clauses,
+          llvm::IsaPred<OpenACCDefaultAsyncClause, OpenACCDeviceNumClause,
+                        OpenACCDeviceTypeClause, OpenACCIfClause>) ==
+          Clauses.end())
+    return Diag(StartLoc, diag::err_acc_construct_one_clause_of)
+           << K
+           << GetListOfClauses({OpenACCClauseKind::DefaultAsync,
+                                OpenACCClauseKind::DeviceNum,
+                                OpenACCClauseKind::DeviceType,
+                                OpenACCClauseKind::If});
+
+  // TODO: OpenACC: 'Update' construct needs to have one of 'self', 'host', or
+  // 'device'.  Implement here.
+
   return diagnoseConstructAppertainment(*this, K, StartLoc, /*IsStmt=*/true);
 }
 
@@ -3724,6 +3760,14 @@ StmtResult SemaOpenACC::ActOnEndStmtDirective(
     return OpenACCShutdownConstruct::Create(getASTContext(), StartLoc, DirLoc,
                                             EndLoc, Clauses);
   }
+  case OpenACCDirectiveKind::Set: {
+    return OpenACCSetConstruct::Create(getASTContext(), StartLoc, DirLoc,
+                                       EndLoc, Clauses);
+  }
+  case OpenACCDirectiveKind::Update: {
+    return OpenACCUpdateConstruct::Create(getASTContext(), StartLoc, DirLoc,
+                                          EndLoc, Clauses);
+  }
   }
   llvm_unreachable("Unhandled case in directive handling?");
 }
@@ -3739,6 +3783,7 @@ StmtResult SemaOpenACC::ActOnAssociatedStmt(
   case OpenACCDirectiveKind::Wait:
   case OpenACCDirectiveKind::Init:
   case OpenACCDirectiveKind::Shutdown:
+  case OpenACCDirectiveKind::Set:
     llvm_unreachable(
         "these don't have associated statements, so shouldn't get here");
   case OpenACCDirectiveKind::Parallel:
diff --git a/clang/lib/Sema/SemaOverload.cpp b/clang/lib/Sema/SemaOverload.cpp
index fff49b7..7589701 100644
--- a/clang/lib/Sema/SemaOverload.cpp
+++ b/clang/lib/Sema/SemaOverload.cpp
@@ -6977,11 +6977,26 @@ void Sema::AddOverloadCandidate(
     /// have linkage. So that all entities of the same should share one
     /// linkage. But in clang, different entities of the same could have
     /// different linkage.
-    NamedDecl *ND = Function;
-    if (auto *SpecInfo = Function->getTemplateSpecializationInfo())
+    const NamedDecl *ND = Function;
+    bool IsImplicitlyInstantiated = false;
+    if (auto *SpecInfo = Function->getTemplateSpecializationInfo()) {
       ND = SpecInfo->getTemplate();
-
-    if (ND->getFormalLinkage() == Linkage::Internal) {
+      IsImplicitlyInstantiated = SpecInfo->getTemplateSpecializationKind() ==
+                                 TSK_ImplicitInstantiation;
+    }
+
+    /// Don't remove inline functions with internal linkage from the overload
+    /// set if they are declared in a GMF, in violation of C++ [basic.link]p17.
+    /// However:
+    /// - Inline functions with internal linkage are a common pattern in
+    ///   headers to avoid ODR issues.
+    /// - The global module is meant to be a transition mechanism for C and C++
+    ///   headers, and the current rules as written work against that goal.
+    const bool IsInlineFunctionInGMF =
+        Function->isFromGlobalModule() &&
+        (IsImplicitlyInstantiated || Function->isInlined());
+
+    if (ND->getFormalLinkage() == Linkage::Internal && !IsInlineFunctionInGMF) {
       Candidate.Viable = false;
       Candidate.FailureKind = ovl_fail_module_mismatched;
       return;
diff --git a/clang/lib/Sema/SemaSPIRV.cpp b/clang/lib/Sema/SemaSPIRV.cpp
new file mode 100644
index 0000000..d2de648
--- /dev/null
+++ b/clang/lib/Sema/SemaSPIRV.cpp
@@ -0,0 +1,57 @@
+//===- SemaSPIRV.cpp - Semantic Analysis for SPIRV constructs--------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+// This implements Semantic Analysis for SPIRV constructs.
+//===----------------------------------------------------------------------===//
+
+#include "clang/Sema/SemaSPIRV.h"
+#include "clang/Basic/TargetBuiltins.h"
+#include "clang/Sema/Sema.h"
+
+namespace clang {
+
+SemaSPIRV::SemaSPIRV(Sema &S) : SemaBase(S) {}
+
+bool SemaSPIRV::CheckSPIRVBuiltinFunctionCall(unsigned BuiltinID,
+                                              CallExpr *TheCall) {
+  switch (BuiltinID) {
+  case SPIRV::BI__builtin_spirv_distance: {
+    if (SemaRef.checkArgCount(TheCall, 2))
+      return true;
+
+    ExprResult A = TheCall->getArg(0);
+    QualType ArgTyA = A.get()->getType();
+    auto *VTyA = ArgTyA->getAs<VectorType>();
+    if (VTyA == nullptr) {
+      SemaRef.Diag(A.get()->getBeginLoc(),
+                   diag::err_typecheck_convert_incompatible)
+          << ArgTyA
+          << SemaRef.Context.getVectorType(ArgTyA, 2, VectorKind::Generic) << 1
+          << 0 << 0;
+      return true;
+    }
+
+    ExprResult B = TheCall->getArg(1);
+    QualType ArgTyB = B.get()->getType();
+    auto *VTyB = ArgTyB->getAs<VectorType>();
+    if (VTyB == nullptr) {
+      SemaRef.Diag(A.get()->getBeginLoc(),
+                   diag::err_typecheck_convert_incompatible)
+          << ArgTyB
+          << SemaRef.Context.getVectorType(ArgTyB, 2, VectorKind::Generic) << 1
+          << 0 << 0;
+      return true;
+    }
+
+    QualType RetTy = VTyA->getElementType();
+    TheCall->setType(RetTy);
+    break;
+  }
+  }
+  return false;
+}
+} // namespace clang
diff --git a/clang/lib/Sema/SemaStmt.cpp b/clang/lib/Sema/SemaStmt.cpp
index d9149f7..25a07d0 100644
--- a/clang/lib/Sema/SemaStmt.cpp
+++ b/clang/lib/Sema/SemaStmt.cpp
@@ -625,6 +625,15 @@ Sema::ActOnLabelStmt(SourceLocation IdentLoc, LabelDecl *TheDecl,
   if (getCurScope()->isInOpenACCComputeConstructScope())
     setFunctionHasBranchProtectedScope();
 
+  // OpenACC3.3 2.14.4:
+  // The update directive is executable.  It must not appear in place of the
+  // statement following an 'if', 'while', 'do', 'switch', or 'label' in C or
+  // C++.
+  if (isa<OpenACCUpdateConstruct>(SubStmt)) {
+    Diag(SubStmt->getBeginLoc(), diag::err_acc_update_as_body) << /*Label*/ 4;
+    SubStmt = new (Context) NullStmt(SubStmt->getBeginLoc());
+  }
+
   // Otherwise, things are good.  Fill in the declaration and return it.
   LabelStmt *LS = new (Context) LabelStmt(IdentLoc, TheDecl, SubStmt);
   TheDecl->setStmt(LS);
@@ -1019,6 +1028,15 @@ StmtResult Sema::ActOnIfStmt(SourceLocation IfLoc,
       Diags.Report(IfLoc, diag::warn_consteval_if_always_true) << Immediate;
   }
 
+  // OpenACC3.3 2.14.4:
+  // The update directive is executable.  It must not appear in place of the
+  // statement following an 'if', 'while', 'do', 'switch', or 'label' in C or
+  // C++.
+  if (isa<OpenACCUpdateConstruct>(thenStmt)) {
+    Diag(thenStmt->getBeginLoc(), diag::err_acc_update_as_body) << /*if*/ 0;
+    thenStmt = new (Context) NullStmt(thenStmt->getBeginLoc());
+  }
+
   return BuildIfStmt(IfLoc, StatementKind, LParenLoc, InitStmt, Cond, RParenLoc,
                      thenStmt, ElseLoc, elseStmt);
 }
@@ -1297,6 +1315,16 @@ Sema::ActOnFinishSwitchStmt(SourceLocation SwitchLoc, Stmt *Switch,
   getCurFunction()->SwitchStack.pop_back();
 
   if (!BodyStmt) return StmtError();
+
+  // OpenACC3.3 2.14.4:
+  // The update directive is executable.  It must not appear in place of the
+  // statement following an 'if', 'while', 'do', 'switch', or 'label' in C or
+  // C++.
+  if (isa<OpenACCUpdateConstruct>(BodyStmt)) {
+    Diag(BodyStmt->getBeginLoc(), diag::err_acc_update_as_body) << /*switch*/ 3;
+    BodyStmt = new (Context) NullStmt(BodyStmt->getBeginLoc());
+  }
+
   SS->setBody(BodyStmt, SwitchLoc);
 
   Expr *CondExpr = SS->getCond();
@@ -1774,6 +1802,15 @@ StmtResult Sema::ActOnWhileStmt(SourceLocation WhileLoc,
       !Diags.isIgnored(diag::warn_comma_operator, CondVal.second->getExprLoc()))
     CommaVisitor(*this).Visit(CondVal.second);
 
+  // OpenACC3.3 2.14.4:
+  // The update directive is executable.  It must not appear in place of the
+  // statement following an 'if', 'while', 'do', 'switch', or 'label' in C or
+  // C++.
+  if (isa<OpenACCUpdateConstruct>(Body)) {
+    Diag(Body->getBeginLoc(), diag::err_acc_update_as_body) << /*while*/ 1;
+    Body = new (Context) NullStmt(Body->getBeginLoc());
+  }
+
   if (isa<NullStmt>(Body))
     getCurCompoundScope().setHasEmptyLoopBodies();
 
@@ -1803,6 +1840,15 @@ Sema::ActOnDoStmt(SourceLocation DoLoc, Stmt *Body,
       !Diags.isIgnored(diag::warn_comma_operator, Cond->getExprLoc()))
     CommaVisitor(*this).Visit(Cond);
 
+  // OpenACC3.3 2.14.4:
+  // The update directive is executable.  It must not appear in place of the
+  // statement following an 'if', 'while', 'do', 'switch', or 'label' in C or
+  // C++.
+  if (isa<OpenACCUpdateConstruct>(Body)) {
+    Diag(Body->getBeginLoc(), diag::err_acc_update_as_body) << /*do*/ 2;
+    Body = new (Context) NullStmt(Body->getBeginLoc());
+  }
+
   return new (Context) DoStmt(Body, Cond, DoLoc, WhileLoc, CondRParen);
 }
 
diff --git a/clang/lib/Sema/SemaTemplate.cpp b/clang/lib/Sema/SemaTemplate.cpp
index 5e7a3c8..ce672b0 100644
--- a/clang/lib/Sema/SemaTemplate.cpp
+++ b/clang/lib/Sema/SemaTemplate.cpp
@@ -1228,7 +1228,7 @@ bool Sema::AttachTypeConstraint(AutoTypeLoc TL,
                                 NonTypeTemplateParmDecl *NewConstrainedParm,
                                 NonTypeTemplateParmDecl *OrigConstrainedParm,
                                 SourceLocation EllipsisLoc) {
-  if (NewConstrainedParm->getType() != TL.getType() ||
+  if (NewConstrainedParm->getType().getNonPackExpansionType() != TL.getType() ||
       TL.getAutoKeyword() != AutoTypeKeyword::Auto) {
     Diag(NewConstrainedParm->getTypeSourceInfo()->getTypeLoc().getBeginLoc(),
          diag::err_unsupported_placeholder_constraint)
@@ -1530,9 +1530,19 @@ NamedDecl *Sema::ActOnNonTypeTemplateParameter(Scope *S, Declarator &D,
   Param->setAccess(AS_public);
 
   if (AutoTypeLoc TL = TInfo->getTypeLoc().getContainedAutoTypeLoc())
-    if (TL.isConstrained())
-      if (AttachTypeConstraint(TL, Param, Param, D.getEllipsisLoc()))
+    if (TL.isConstrained()) {
+      if (D.getEllipsisLoc().isInvalid() &&
+          T->containsUnexpandedParameterPack()) {
+        assert(TL.getConceptReference()->getTemplateArgsAsWritten());
+        for (auto &Loc :
+             TL.getConceptReference()->getTemplateArgsAsWritten()->arguments())
+          Invalid |= DiagnoseUnexpandedParameterPack(
+              Loc, UnexpandedParameterPackContext::UPPC_TypeConstraint);
+      }
+      if (!Invalid &&
+          AttachTypeConstraint(TL, Param, Param, D.getEllipsisLoc()))
         Invalid = true;
+    }
 
   if (Invalid)
     Param->setInvalidDecl();
@@ -4547,6 +4557,9 @@ Sema::CheckConceptTemplateId(const CXXScopeSpec &SS,
                              const TemplateArgumentListInfo *TemplateArgs) {
   assert(NamedConcept && "A concept template id without a template?");
 
+  if (NamedConcept->isInvalidDecl())
+    return ExprError();
+
   llvm::SmallVector<TemplateArgument, 4> SugaredConverted, CanonicalConverted;
   if (CheckTemplateArgumentList(
           NamedConcept, ConceptNameInfo.getLoc(),
diff --git a/clang/lib/Sema/SemaTemplateDeduction.cpp b/clang/lib/Sema/SemaTemplateDeduction.cpp
index fad20b3..1c1f6e3 100644
--- a/clang/lib/Sema/SemaTemplateDeduction.cpp
+++ b/clang/lib/Sema/SemaTemplateDeduction.cpp
@@ -857,7 +857,10 @@ private:
       if (auto *NTTP = dyn_cast<NonTypeTemplateParmDecl>(
               TemplateParams->getParam(Index))) {
         if (!NTTP->isExpandedParameterPack())
-          if (auto *Expansion = dyn_cast<PackExpansionType>(NTTP->getType()))
+          // FIXME: CWG2982 suggests a type-constraint forms a non-deduced
+          // context, however it is not yet resolved.
+          if (auto *Expansion = dyn_cast<PackExpansionType>(
+                  S.Context.getUnconstrainedType(NTTP->getType())))
             ExtraDeductions.push_back(Expansion->getPattern());
       }
       // FIXME: Also collect the unexpanded packs in any type and template
diff --git a/clang/lib/Sema/TreeTransform.h b/clang/lib/Sema/TreeTransform.h
index 5d43d98..d00ad5a 100644
--- a/clang/lib/Sema/TreeTransform.h
+++ b/clang/lib/Sema/TreeTransform.h
@@ -4169,6 +4169,24 @@ public:
         SourceLocation{}, {}, SourceLocation{}, EndLoc, Clauses, {});
   }
 
+  StmtResult RebuildOpenACCSetConstruct(SourceLocation BeginLoc,
+                                        SourceLocation DirLoc,
+                                        SourceLocation EndLoc,
+                                        ArrayRef<OpenACCClause *> Clauses) {
+    return getSema().OpenACC().ActOnEndStmtDirective(
+        OpenACCDirectiveKind::Set, BeginLoc, DirLoc, SourceLocation{},
+        SourceLocation{}, {}, SourceLocation{}, EndLoc, Clauses, {});
+  }
+
+  StmtResult RebuildOpenACCUpdateConstruct(SourceLocation BeginLoc,
+                                           SourceLocation DirLoc,
+                                           SourceLocation EndLoc,
+                                           ArrayRef<OpenACCClause *> Clauses) {
+    return getSema().OpenACC().ActOnEndStmtDirective(
+        OpenACCDirectiveKind::Update, BeginLoc, DirLoc, SourceLocation{},
+        SourceLocation{}, {}, SourceLocation{}, EndLoc, Clauses, {});
+  }
+
   StmtResult RebuildOpenACCWaitConstruct(
       SourceLocation BeginLoc, SourceLocation DirLoc, SourceLocation LParenLoc,
       Expr *DevNumExpr, SourceLocation QueuesLoc, ArrayRef<Expr *> QueueIdExprs,
@@ -11629,22 +11647,48 @@ template <typename Derived>
 void OpenACCClauseTransform<Derived>::VisitSelfClause(
     const OpenACCSelfClause &C) {
 
-  if (C.hasConditionExpr()) {
-    Expr *Cond = const_cast<Expr *>(C.getConditionExpr());
-    Sema::ConditionResult Res =
-        Self.TransformCondition(Cond->getExprLoc(), /*Var=*/nullptr, Cond,
-                                Sema::ConditionKind::Boolean);
+  // If this is an 'update' 'self' clause, this is actually a var list instead.
+  if (ParsedClause.getDirectiveKind() == OpenACCDirectiveKind::Update) {
+    llvm::SmallVector<Expr *> InstantiatedVarList;
+    for (Expr *CurVar : C.getVarList()) {
+      ExprResult Res = Self.TransformExpr(CurVar);
 
-    if (Res.isInvalid() || !Res.get().second)
-      return;
+      if (!Res.isUsable())
+        continue;
 
-    ParsedClause.setConditionDetails(Res.get().second);
-  }
+      Res = Self.getSema().OpenACC().ActOnVar(ParsedClause.getClauseKind(),
+                                              Res.get());
 
-  NewClause = OpenACCSelfClause::Create(
-      Self.getSema().getASTContext(), ParsedClause.getBeginLoc(),
-      ParsedClause.getLParenLoc(), ParsedClause.getConditionExpr(),
-      ParsedClause.getEndLoc());
+      if (Res.isUsable())
+        InstantiatedVarList.push_back(Res.get());
+    }
+
+    ParsedClause.setVarListDetails(InstantiatedVarList,
+                                   /*IsReadOnly=*/false, /*IsZero=*/false);
+
+    NewClause = OpenACCSelfClause::Create(
+        Self.getSema().getASTContext(), ParsedClause.getBeginLoc(),
+        ParsedClause.getLParenLoc(), ParsedClause.getVarList(),
+        ParsedClause.getEndLoc());
+  } else {
+
+    if (C.hasConditionExpr()) {
+      Expr *Cond = const_cast<Expr *>(C.getConditionExpr());
+      Sema::ConditionResult Res =
+          Self.TransformCondition(Cond->getExprLoc(), /*Var=*/nullptr, Cond,
+                                  Sema::ConditionKind::Boolean);
+
+      if (Res.isInvalid() || !Res.get().second)
+        return;
+
+      ParsedClause.setConditionDetails(Res.get().second);
+    }
+
+    NewClause = OpenACCSelfClause::Create(
+        Self.getSema().getASTContext(), ParsedClause.getBeginLoc(),
+        ParsedClause.getLParenLoc(), ParsedClause.getConditionExpr(),
+        ParsedClause.getEndLoc());
+  }
 }
 
 template <typename Derived>
@@ -11901,6 +11945,29 @@ void OpenACCClauseTransform<Derived>::VisitDeviceNumClause (
 }
 
 template <typename Derived>
+void OpenACCClauseTransform<Derived>::VisitDefaultAsyncClause(
+    const OpenACCDefaultAsyncClause &C) {
+  Expr *IntExpr = const_cast<Expr *>(C.getIntExpr());
+  assert(IntExpr && "default_async clause constructed with invalid int expr");
+
+  ExprResult Res = Self.TransformExpr(IntExpr);
+  if (!Res.isUsable())
+    return;
+
+  Res = Self.getSema().OpenACC().ActOnIntExpr(OpenACCDirectiveKind::Invalid,
+                                              C.getClauseKind(),
+                                              C.getBeginLoc(), Res.get());
+  if (!Res.isUsable())
+    return;
+
+  ParsedClause.setIntExprDetails(Res.get());
+  NewClause = OpenACCDefaultAsyncClause::Create(
+      Self.getSema().getASTContext(), ParsedClause.getBeginLoc(),
+      ParsedClause.getLParenLoc(), ParsedClause.getIntExprs()[0],
+      ParsedClause.getEndLoc());
+}
+
+template <typename Derived>
 void OpenACCClauseTransform<Derived>::VisitVectorLengthClause(
     const OpenACCVectorLengthClause &C) {
   Expr *IntExpr = const_cast<Expr *>(C.getIntExpr());
@@ -12422,6 +12489,39 @@ StmtResult TreeTransform<Derived>::TransformOpenACCShutdownConstruct(
       C->getBeginLoc(), C->getDirectiveLoc(), C->getEndLoc(),
       TransformedClauses);
 }
+template <typename Derived>
+StmtResult
+TreeTransform<Derived>::TransformOpenACCSetConstruct(OpenACCSetConstruct *C) {
+  getSema().OpenACC().ActOnConstruct(C->getDirectiveKind(), C->getBeginLoc());
+
+  llvm::SmallVector<OpenACCClause *> TransformedClauses =
+      getDerived().TransformOpenACCClauseList(C->getDirectiveKind(),
+                                              C->clauses());
+  if (getSema().OpenACC().ActOnStartStmtDirective(
+          C->getDirectiveKind(), C->getBeginLoc(), TransformedClauses))
+    return StmtError();
+
+  return getDerived().RebuildOpenACCSetConstruct(
+      C->getBeginLoc(), C->getDirectiveLoc(), C->getEndLoc(),
+      TransformedClauses);
+}
+
+template <typename Derived>
+StmtResult TreeTransform<Derived>::TransformOpenACCUpdateConstruct(
+    OpenACCUpdateConstruct *C) {
+  getSema().OpenACC().ActOnConstruct(C->getDirectiveKind(), C->getBeginLoc());
+
+  llvm::SmallVector<OpenACCClause *> TransformedClauses =
+      getDerived().TransformOpenACCClauseList(C->getDirectiveKind(),
+                                              C->clauses());
+  if (getSema().OpenACC().ActOnStartStmtDirective(
+          C->getDirectiveKind(), C->getBeginLoc(), TransformedClauses))
+    return StmtError();
+
+  return getDerived().RebuildOpenACCUpdateConstruct(
+      C->getBeginLoc(), C->getDirectiveLoc(), C->getEndLoc(),
+      TransformedClauses);
+}
 
 template <typename Derived>
 StmtResult
diff --git a/clang/lib/Serialization/ASTReader.cpp b/clang/lib/Serialization/ASTReader.cpp
index fccd79b..0368990 100644
--- a/clang/lib/Serialization/ASTReader.cpp
+++ b/clang/lib/Serialization/ASTReader.cpp
@@ -12387,9 +12387,18 @@ OpenACCClause *ASTRecordReader::readOpenACCClause() {
   }
   case OpenACCClauseKind::Self: {
     SourceLocation LParenLoc = readSourceLocation();
-    Expr *CondExpr = readBool() ? readSubExpr() : nullptr;
-    return OpenACCSelfClause::Create(getContext(), BeginLoc, LParenLoc,
-                                     CondExpr, EndLoc);
+    bool isConditionExprClause = readBool();
+    if (isConditionExprClause) {
+      Expr *CondExpr = readBool() ? readSubExpr() : nullptr;
+      return OpenACCSelfClause::Create(getContext(), BeginLoc, LParenLoc,
+                                       CondExpr, EndLoc);
+    }
+    unsigned NumVars = readInt();
+    llvm::SmallVector<Expr *> VarList;
+    for (unsigned I = 0; I < NumVars; ++I)
+      VarList.push_back(readSubExpr());
+    return OpenACCSelfClause::Create(getContext(), BeginLoc, LParenLoc, VarList,
+                                     EndLoc);
   }
   case OpenACCClauseKind::NumGangs: {
     SourceLocation LParenLoc = readSourceLocation();
@@ -12412,6 +12421,12 @@ OpenACCClause *ASTRecordReader::readOpenACCClause() {
     return OpenACCDeviceNumClause::Create(getContext(), BeginLoc, LParenLoc,
                                            IntExpr, EndLoc);
   }
+  case OpenACCClauseKind::DefaultAsync: {
+    SourceLocation LParenLoc = readSourceLocation();
+    Expr *IntExpr = readSubExpr();
+    return OpenACCDefaultAsyncClause::Create(getContext(), BeginLoc, LParenLoc,
+                                             IntExpr, EndLoc);
+  }
   case OpenACCClauseKind::VectorLength: {
     SourceLocation LParenLoc = readSourceLocation();
     Expr *IntExpr = readSubExpr();
@@ -12601,7 +12616,6 @@ OpenACCClause *ASTRecordReader::readOpenACCClause() {
   case OpenACCClauseKind::Host:
   case OpenACCClauseKind::Link:
   case OpenACCClauseKind::Bind:
-  case OpenACCClauseKind::DefaultAsync:
   case OpenACCClauseKind::Invalid:
     llvm_unreachable("Clause serialization not yet implemented");
   }
diff --git a/clang/lib/Serialization/ASTReaderDecl.cpp b/clang/lib/Serialization/ASTReaderDecl.cpp
index 719bc0d..8c60e85 100644
--- a/clang/lib/Serialization/ASTReaderDecl.cpp
+++ b/clang/lib/Serialization/ASTReaderDecl.cpp
@@ -2663,7 +2663,8 @@ void ASTDeclReader::VisitTemplateTypeParmDecl(TemplateTypeParmDecl *D) {
 
   D->setDeclaredWithTypename(Record.readInt());
 
-  if (D->hasTypeConstraint()) {
+  bool TypeConstraintInitialized = D->hasTypeConstraint() && Record.readBool();
+  if (TypeConstraintInitialized) {
     ConceptReference *CR = nullptr;
     if (Record.readBool())
       CR = Record.readConceptReference();
diff --git a/clang/lib/Serialization/ASTReaderStmt.cpp b/clang/lib/Serialization/ASTReaderStmt.cpp
index 9e8cf19..4766f34 100644
--- a/clang/lib/Serialization/ASTReaderStmt.cpp
+++ b/clang/lib/Serialization/ASTReaderStmt.cpp
@@ -2875,6 +2875,16 @@ void ASTStmtReader::VisitOpenACCShutdownConstruct(OpenACCShutdownConstruct *S) {
   VisitOpenACCConstructStmt(S);
 }
 
+void ASTStmtReader::VisitOpenACCSetConstruct(OpenACCSetConstruct *S) {
+  VisitStmt(S);
+  VisitOpenACCConstructStmt(S);
+}
+
+void ASTStmtReader::VisitOpenACCUpdateConstruct(OpenACCUpdateConstruct *S) {
+  VisitStmt(S);
+  VisitOpenACCConstructStmt(S);
+}
+
 void ASTStmtReader::VisitOpenACCHostDataConstruct(OpenACCHostDataConstruct *S) {
   VisitStmt(S);
   VisitOpenACCAssociatedStmtConstruct(S);
@@ -4407,6 +4417,16 @@ Stmt *ASTReader::ReadStmtFromStream(ModuleFile &F) {
       S = OpenACCShutdownConstruct::CreateEmpty(Context, NumClauses);
       break;
     }
+    case STMT_OPENACC_SET_CONSTRUCT: {
+      unsigned NumClauses = Record[ASTStmtReader::NumStmtFields];
+      S = OpenACCSetConstruct::CreateEmpty(Context, NumClauses);
+      break;
+    }
+    case STMT_OPENACC_UPDATE_CONSTRUCT: {
+      unsigned NumClauses = Record[ASTStmtReader::NumStmtFields];
+      S = OpenACCUpdateConstruct::CreateEmpty(Context, NumClauses);
+      break;
+    }
     case EXPR_REQUIRES: {
       unsigned numLocalParameters = Record[ASTStmtReader::NumExprFields];
       unsigned numRequirement = Record[ASTStmtReader::NumExprFields + 1];
diff --git a/clang/lib/Serialization/ASTWriter.cpp b/clang/lib/Serialization/ASTWriter.cpp
index 4a60279..8d9396e 100644
--- a/clang/lib/Serialization/ASTWriter.cpp
+++ b/clang/lib/Serialization/ASTWriter.cpp
@@ -7230,6 +7230,10 @@ void ASTWriter::CompletedImplicitDefinition(const FunctionDecl *D) {
   if (!D->isFromASTFile())
     return; // Declaration not imported from PCH.
 
+  // The function definition may not have a body due to parsing errors.
+  if (!D->doesThisDeclarationHaveABody())
+    return;
+
   // Implicit function decl from a PCH was defined.
   DeclUpdates[D].push_back(DeclUpdate(UPD_CXX_ADDED_FUNCTION_DEFINITION));
 }
@@ -7249,6 +7253,10 @@ void ASTWriter::FunctionDefinitionInstantiated(const FunctionDecl *D) {
   if (!D->isFromASTFile())
     return;
 
+  // The function definition may not have a body due to parsing errors.
+  if (!D->doesThisDeclarationHaveABody())
+    return;
+
   DeclUpdates[D].push_back(DeclUpdate(UPD_CXX_ADDED_FUNCTION_DEFINITION));
 }
 
@@ -8313,9 +8321,16 @@ void ASTRecordWriter::writeOpenACCClause(const OpenACCClause *C) {
   case OpenACCClauseKind::Self: {
     const auto *SC = cast<OpenACCSelfClause>(C);
     writeSourceLocation(SC->getLParenLoc());
-    writeBool(SC->hasConditionExpr());
-    if (SC->hasConditionExpr())
-      AddStmt(const_cast<Expr*>(SC->getConditionExpr()));
+    writeBool(SC->isConditionExprClause());
+    if (SC->isConditionExprClause()) {
+      writeBool(SC->hasConditionExpr());
+      if (SC->hasConditionExpr())
+        AddStmt(const_cast<Expr *>(SC->getConditionExpr()));
+    } else {
+      writeUInt32(SC->getVarList().size());
+      for (Expr *E : SC->getVarList())
+        AddStmt(E);
+    }
     return;
   }
   case OpenACCClauseKind::NumGangs: {
@@ -8332,6 +8347,12 @@ void ASTRecordWriter::writeOpenACCClause(const OpenACCClause *C) {
     AddStmt(const_cast<Expr*>(DNC->getIntExpr()));
     return;
   }
+  case OpenACCClauseKind::DefaultAsync: {
+    const auto *DAC = cast<OpenACCDefaultAsyncClause>(C);
+    writeSourceLocation(DAC->getLParenLoc());
+    AddStmt(const_cast<Expr *>(DAC->getIntExpr()));
+    return;
+  }
   case OpenACCClauseKind::NumWorkers: {
     const auto *NWC = cast<OpenACCNumWorkersClause>(C);
     writeSourceLocation(NWC->getLParenLoc());
@@ -8528,7 +8549,6 @@ void ASTRecordWriter::writeOpenACCClause(const OpenACCClause *C) {
   case OpenACCClauseKind::Host:
   case OpenACCClauseKind::Link:
   case OpenACCClauseKind::Bind:
-  case OpenACCClauseKind::DefaultAsync:
   case OpenACCClauseKind::Invalid:
     llvm_unreachable("Clause serialization not yet implemented");
   }
diff --git a/clang/lib/Serialization/ASTWriterDecl.cpp b/clang/lib/Serialization/ASTWriterDecl.cpp
index 75c1d9a..f8ed155 100644
--- a/clang/lib/Serialization/ASTWriterDecl.cpp
+++ b/clang/lib/Serialization/ASTWriterDecl.cpp
@@ -1951,7 +1951,8 @@ void ASTDeclWriter::VisitTemplateTypeParmDecl(TemplateTypeParmDecl *D) {
   Record.push_back(D->wasDeclaredWithTypename());
 
   const TypeConstraint *TC = D->getTypeConstraint();
-  assert((bool)TC == D->hasTypeConstraint());
+  if (D->hasTypeConstraint())
+    Record.push_back(/*TypeConstraintInitialized=*/TC != nullptr);
   if (TC) {
     auto *CR = TC->getConceptReference();
     Record.push_back(CR != nullptr);
@@ -1969,7 +1970,7 @@ void ASTDeclWriter::VisitTemplateTypeParmDecl(TemplateTypeParmDecl *D) {
   if (OwnsDefaultArg)
     Record.AddTemplateArgumentLoc(D->getDefaultArgument());
 
-  if (!TC && !OwnsDefaultArg &&
+  if (!D->hasTypeConstraint() && !OwnsDefaultArg &&
       D->getDeclContext() == D->getLexicalDeclContext() &&
       !D->isInvalidDecl() && !D->hasAttrs() &&
       !D->isTopLevelDeclInObjCContainer() && !D->isImplicit() &&
diff --git a/clang/lib/Serialization/ASTWriterStmt.cpp b/clang/lib/Serialization/ASTWriterStmt.cpp
index 1d42b43..7eedf7d 100644
--- a/clang/lib/Serialization/ASTWriterStmt.cpp
+++ b/clang/lib/Serialization/ASTWriterStmt.cpp
@@ -2957,6 +2957,18 @@ void ASTStmtWriter::VisitOpenACCShutdownConstruct(OpenACCShutdownConstruct *S) {
   Code = serialization::STMT_OPENACC_SHUTDOWN_CONSTRUCT;
 }
 
+void ASTStmtWriter::VisitOpenACCSetConstruct(OpenACCSetConstruct *S) {
+  VisitStmt(S);
+  VisitOpenACCConstructStmt(S);
+  Code = serialization::STMT_OPENACC_SET_CONSTRUCT;
+}
+
+void ASTStmtWriter::VisitOpenACCUpdateConstruct(OpenACCUpdateConstruct *S) {
+  VisitStmt(S);
+  VisitOpenACCConstructStmt(S);
+  Code = serialization::STMT_OPENACC_UPDATE_CONSTRUCT;
+}
+
 void ASTStmtWriter::VisitOpenACCHostDataConstruct(OpenACCHostDataConstruct *S) {
   VisitStmt(S);
   VisitOpenACCAssociatedStmtConstruct(S);
diff --git a/clang/lib/Serialization/GeneratePCH.cpp b/clang/lib/Serialization/GeneratePCH.cpp
index 7a8a951..a3189bb 100644
--- a/clang/lib/Serialization/GeneratePCH.cpp
+++ b/clang/lib/Serialization/GeneratePCH.cpp
@@ -102,12 +102,13 @@ void PCHGenerator::anchor() {}
 CXX20ModulesGenerator::CXX20ModulesGenerator(Preprocessor &PP,
                                              InMemoryModuleCache &ModuleCache,
                                              StringRef OutputFile,
-                                             bool GeneratingReducedBMI)
+                                             bool GeneratingReducedBMI,
+                                             bool AllowASTWithErrors)
     : PCHGenerator(
           PP, ModuleCache, OutputFile, llvm::StringRef(),
           std::make_shared<PCHBuffer>(),
           /*Extensions=*/ArrayRef<std::shared_ptr<ModuleFileExtension>>(),
-          /*AllowASTWithErrors*/ false, /*IncludeTimestamps=*/false,
+          AllowASTWithErrors, /*IncludeTimestamps=*/false,
           /*BuildingImplicitModule=*/false, /*ShouldCacheASTInMemory=*/false,
           GeneratingReducedBMI) {}
 
diff --git a/clang/lib/StaticAnalyzer/Core/CallEvent.cpp b/clang/lib/StaticAnalyzer/Core/CallEvent.cpp
index 0fdef74..bb4a39f 100644
--- a/clang/lib/StaticAnalyzer/Core/CallEvent.cpp
+++ b/clang/lib/StaticAnalyzer/Core/CallEvent.cpp
@@ -435,27 +435,27 @@ static SVal processArgument(SVal Value, const Expr *ArgumentExpr,
 /// runtime definition don't match in terms of argument and parameter count.
 static SVal castArgToParamTypeIfNeeded(const CallEvent &Call, unsigned ArgIdx,
                                        SVal ArgVal, SValBuilder &SVB) {
-  const FunctionDecl *RTDecl =
-      Call.getRuntimeDefinition().getDecl()->getAsFunction();
   const auto *CallExprDecl = dyn_cast_or_null<FunctionDecl>(Call.getDecl());
-
-  if (!RTDecl || !CallExprDecl)
+  if (!CallExprDecl)
     return ArgVal;
 
+  const FunctionDecl *Definition = CallExprDecl;
+  Definition->hasBody(Definition);
+
   // The function decl of the Call (in the AST) will not have any parameter
   // declarations, if it was 'only' declared without a prototype. However, the
   // engine will find the appropriate runtime definition - basically a
   // redeclaration, which has a function body (and a function prototype).
-  if (CallExprDecl->hasPrototype() || !RTDecl->hasPrototype())
+  if (CallExprDecl->hasPrototype() || !Definition->hasPrototype())
     return ArgVal;
 
   // Only do this cast if the number arguments at the callsite matches with
   // the parameters at the runtime definition.
-  if (Call.getNumArgs() != RTDecl->getNumParams())
+  if (Call.getNumArgs() != Definition->getNumParams())
     return UnknownVal();
 
   const Expr *ArgExpr = Call.getArgExpr(ArgIdx);
-  const ParmVarDecl *Param = RTDecl->getParamDecl(ArgIdx);
+  const ParmVarDecl *Param = Definition->getParamDecl(ArgIdx);
   return SVB.evalCast(ArgVal, Param->getType(), ArgExpr->getType());
 }
 
diff --git a/clang/lib/StaticAnalyzer/Core/CoreEngine.cpp b/clang/lib/StaticAnalyzer/Core/CoreEngine.cpp
index 67b7d30..775a22e1 100644
--- a/clang/lib/StaticAnalyzer/Core/CoreEngine.cpp
+++ b/clang/lib/StaticAnalyzer/Core/CoreEngine.cpp
@@ -444,7 +444,8 @@ void CoreEngine::HandleBranch(const Stmt *Cond, const Stmt *Term,
   NodeBuilderContext Ctx(*this, B, Pred);
   ExplodedNodeSet Dst;
   ExprEng.processBranch(Cond, Ctx, Pred, Dst, *(B->succ_begin()),
-                       *(B->succ_begin() + 1));
+                        *(B->succ_begin() + 1),
+                        getCompletedIterationCount(B, Pred));
   // Enqueue the new frontier onto the worklist.
   enqueue(Dst);
 }
@@ -591,6 +592,30 @@ ExplodedNode *CoreEngine::generateCallExitBeginNode(ExplodedNode *N,
   return isNew ? Node : nullptr;
 }
 
+std::optional<unsigned>
+CoreEngine::getCompletedIterationCount(const CFGBlock *B,
+                                       ExplodedNode *Pred) const {
+  const LocationContext *LC = Pred->getLocationContext();
+  BlockCounter Counter = WList->getBlockCounter();
+  unsigned BlockCount =
+      Counter.getNumVisited(LC->getStackFrame(), B->getBlockID());
+
+  const Stmt *Term = B->getTerminatorStmt();
+  if (isa<ForStmt, WhileStmt, CXXForRangeStmt>(Term)) {
+    assert(BlockCount >= 1 &&
+           "Block count of currently analyzed block must be >= 1");
+    return BlockCount - 1;
+  }
+  if (isa<DoStmt>(Term)) {
+    // In a do-while loop one iteration happens before the first evaluation of
+    // the loop condition, so we don't subtract one.
+    return BlockCount;
+  }
+  // ObjCForCollectionStmt is skipped intentionally because the current
+  // application of the iteration counts is not relevant for it.
+  return std::nullopt;
+}
+
 void CoreEngine::enqueue(ExplodedNodeSet &Set) {
   for (const auto I : Set)
     WList->enqueue(I);
diff --git a/clang/lib/StaticAnalyzer/Core/ExprEngine.cpp b/clang/lib/StaticAnalyzer/Core/ExprEngine.cpp
index db385e8..ff8bdce 100644
--- a/clang/lib/StaticAnalyzer/Core/ExprEngine.cpp
+++ b/clang/lib/StaticAnalyzer/Core/ExprEngine.cpp
@@ -1832,6 +1832,8 @@ void ExprEngine::Visit(const Stmt *S, ExplodedNode *Pred,
     case Stmt::OpenACCWaitConstructClass:
     case Stmt::OpenACCInitConstructClass:
     case Stmt::OpenACCShutdownConstructClass:
+    case Stmt::OpenACCSetConstructClass:
+    case Stmt::OpenACCUpdateConstructClass:
     case Stmt::OMPUnrollDirectiveClass:
     case Stmt::OMPMetaDirectiveClass:
     case Stmt::HLSLOutArgExprClass: {
@@ -2760,12 +2762,10 @@ assumeCondition(const Stmt *Condition, ExplodedNode *N) {
   return State->assume(V);
 }
 
-void ExprEngine::processBranch(const Stmt *Condition,
-                               NodeBuilderContext& BldCtx,
-                               ExplodedNode *Pred,
-                               ExplodedNodeSet &Dst,
-                               const CFGBlock *DstT,
-                               const CFGBlock *DstF) {
+void ExprEngine::processBranch(
+    const Stmt *Condition, NodeBuilderContext &BldCtx, ExplodedNode *Pred,
+    ExplodedNodeSet &Dst, const CFGBlock *DstT, const CFGBlock *DstF,
+    std::optional<unsigned> IterationsCompletedInLoop) {
   assert((!Condition || !isa<CXXBindTemporaryExpr>(Condition)) &&
          "CXXBindTemporaryExprs are handled by processBindTemporary.");
   const LocationContext *LCtx = Pred->getLocationContext();
@@ -2808,8 +2808,35 @@ void ExprEngine::processBranch(const Stmt *Condition,
     if (StTrue && StFalse)
       assert(!isa<ObjCForCollectionStmt>(Condition));
 
-    if (StTrue)
-      Builder.generateNode(StTrue, true, PredN);
+    if (StTrue) {
+      // If we are processing a loop condition where two iterations have
+      // already been completed and the false branch is also feasible, then
+      // don't assume a third iteration because it is a redundant execution
+      // path (unlikely to be different from earlier loop exits) and can cause
+      // false positives if e.g. the loop iterates over a two-element structure
+      // with an opaque condition.
+      //
+      // The iteration count "2" is hardcoded because it's the natural limit:
+      // * the fact that the programmer wrote a loop (and not just an `if`)
+      //   implies that they thought that the loop body might be executed twice;
+      // * however, there are situations where the programmer knows that there
+      //   are at most two iterations but writes a loop that appears to be
+      //   generic, because there is no special syntax for "loop with at most
+      //   two iterations". (This pattern is common in FFMPEG and appears in
+      //   many other projects as well.)
+      bool CompletedTwoIterations = IterationsCompletedInLoop.value_or(0) >= 2;
+      bool FalseAlsoFeasible =
+          StFalse ||
+          didEagerlyAssumeBifurcateAt(PrevState, dyn_cast<Expr>(Condition));
+      bool SkipTrueBranch = CompletedTwoIterations && FalseAlsoFeasible;
+
+      // FIXME: This "don't assume third iteration" heuristic partially
+      // conflicts with the widen-loop analysis option (which is off by
+      // default). If we intend to support and stabilize the loop widening,
+      // we must ensure that it 'plays nicely' with this logic.
+      if (!SkipTrueBranch || AMgr.options.ShouldWidenLoops)
+        Builder.generateNode(StTrue, true, PredN);
+    }
 
     if (StFalse)
       Builder.generateNode(StFalse, false, PredN);
@@ -3731,6 +3758,12 @@ ExprEngine::getEagerlyAssumeBifurcationTags() {
   return std::make_pair(&TrueTag, &FalseTag);
 }
 
+/// If the last EagerlyAssume attempt was successful (i.e. the true and false
+/// cases were both feasible), this state trait stores the expression where it
+/// happened; otherwise this holds nullptr.
+REGISTER_TRAIT_WITH_PROGRAMSTATE(LastEagerlyAssumeExprIfSuccessful,
+                                 const Expr *)
+
 void ExprEngine::evalEagerlyAssumeBifurcation(ExplodedNodeSet &Dst,
                                               ExplodedNodeSet &Src,
                                               const Expr *Ex) {
@@ -3746,6 +3779,7 @@ void ExprEngine::evalEagerlyAssumeBifurcation(ExplodedNodeSet &Dst,
     }
 
     ProgramStateRef State = Pred->getState();
+    State = State->set<LastEagerlyAssumeExprIfSuccessful>(nullptr);
     SVal V = State->getSVal(Ex, Pred->getLocationContext());
     std::optional<nonloc::SymbolVal> SEV = V.getAs<nonloc::SymbolVal>();
     if (SEV && SEV->isExpression()) {
@@ -3753,6 +3787,11 @@ void ExprEngine::evalEagerlyAssumeBifurcation(ExplodedNodeSet &Dst,
 
       auto [StateTrue, StateFalse] = State->assume(*SEV);
 
+      if (StateTrue && StateFalse) {
+        StateTrue = StateTrue->set<LastEagerlyAssumeExprIfSuccessful>(Ex);
+        StateFalse = StateFalse->set<LastEagerlyAssumeExprIfSuccessful>(Ex);
+      }
+
       // First assume that the condition is true.
       if (StateTrue) {
         SVal Val = svalBuilder.makeIntVal(1U, Ex->getType());
@@ -3770,6 +3809,11 @@ void ExprEngine::evalEagerlyAssumeBifurcation(ExplodedNodeSet &Dst,
   }
 }
 
+bool ExprEngine::didEagerlyAssumeBifurcateAt(ProgramStateRef State,
+                                             const Expr *Ex) const {
+  return Ex && State->get<LastEagerlyAssumeExprIfSuccessful>() == Ex;
+}
+
 void ExprEngine::VisitGCCAsmStmt(const GCCAsmStmt *A, ExplodedNode *Pred,
                                  ExplodedNodeSet &Dst) {
   StmtNodeBuilder Bldr(Pred, Dst, *currBldrCtx);
diff --git a/clang/lib/StaticAnalyzer/Core/LoopUnrolling.cpp b/clang/lib/StaticAnalyzer/Core/LoopUnrolling.cpp
index 96f5d7c..01d87b0 100644
--- a/clang/lib/StaticAnalyzer/Core/LoopUnrolling.cpp
+++ b/clang/lib/StaticAnalyzer/Core/LoopUnrolling.cpp
@@ -283,10 +283,10 @@ static bool shouldCompletelyUnroll(const Stmt *LoopStmt, ASTContext &ASTCtx,
   llvm::APInt InitNum =
       Matches[0].getNodeAs<IntegerLiteral>("initNum")->getValue();
   auto CondOp = Matches[0].getNodeAs<BinaryOperator>("conditionOperator");
-  if (InitNum.getBitWidth() != BoundNum.getBitWidth()) {
-    InitNum = InitNum.zext(BoundNum.getBitWidth());
-    BoundNum = BoundNum.zext(InitNum.getBitWidth());
-  }
+  unsigned MaxWidth = std::max(InitNum.getBitWidth(), BoundNum.getBitWidth());
+
+  InitNum = InitNum.zext(MaxWidth);
+  BoundNum = BoundNum.zext(MaxWidth);
 
   if (CondOp->getOpcode() == BO_GE || CondOp->getOpcode() == BO_LE)
     maxStep = (BoundNum - InitNum + 1).abs().getZExtValue();
diff --git a/clang/lib/StaticAnalyzer/Core/SymbolManager.cpp b/clang/lib/StaticAnalyzer/Core/SymbolManager.cpp
index f21e5c3..738b6a1 100644
--- a/clang/lib/StaticAnalyzer/Core/SymbolManager.cpp
+++ b/clang/lib/StaticAnalyzer/Core/SymbolManager.cpp
@@ -170,9 +170,8 @@ SymbolManager::getRegionValueSymbol(const TypedValueRegion* R) {
   void *InsertPos;
   SymExpr *SD = DataSet.FindNodeOrInsertPos(profile, InsertPos);
   if (!SD) {
-    SD = new (BPAlloc) SymbolRegionValue(SymbolCounter, R);
+    SD = Alloc.make<SymbolRegionValue>(R);
     DataSet.InsertNode(SD, InsertPos);
-    ++SymbolCounter;
   }
 
   return cast<SymbolRegionValue>(SD);
@@ -188,9 +187,8 @@ const SymbolConjured* SymbolManager::conjureSymbol(const Stmt *E,
   void *InsertPos;
   SymExpr *SD = DataSet.FindNodeOrInsertPos(profile, InsertPos);
   if (!SD) {
-    SD = new (BPAlloc) SymbolConjured(SymbolCounter, E, LCtx, T, Count, SymbolTag);
+    SD = Alloc.make<SymbolConjured>(E, LCtx, T, Count, SymbolTag);
     DataSet.InsertNode(SD, InsertPos);
-    ++SymbolCounter;
   }
 
   return cast<SymbolConjured>(SD);
@@ -204,9 +202,8 @@ SymbolManager::getDerivedSymbol(SymbolRef parentSymbol,
   void *InsertPos;
   SymExpr *SD = DataSet.FindNodeOrInsertPos(profile, InsertPos);
   if (!SD) {
-    SD = new (BPAlloc) SymbolDerived(SymbolCounter, parentSymbol, R);
+    SD = Alloc.make<SymbolDerived>(parentSymbol, R);
     DataSet.InsertNode(SD, InsertPos);
-    ++SymbolCounter;
   }
 
   return cast<SymbolDerived>(SD);
@@ -219,9 +216,8 @@ SymbolManager::getExtentSymbol(const SubRegion *R) {
   void *InsertPos;
   SymExpr *SD = DataSet.FindNodeOrInsertPos(profile, InsertPos);
   if (!SD) {
-    SD = new (BPAlloc) SymbolExtent(SymbolCounter, R);
+    SD = Alloc.make<SymbolExtent>(R);
     DataSet.InsertNode(SD, InsertPos);
-    ++SymbolCounter;
   }
 
   return cast<SymbolExtent>(SD);
@@ -236,9 +232,8 @@ SymbolManager::getMetadataSymbol(const MemRegion* R, const Stmt *S, QualType T,
   void *InsertPos;
   SymExpr *SD = DataSet.FindNodeOrInsertPos(profile, InsertPos);
   if (!SD) {
-    SD = new (BPAlloc) SymbolMetadata(SymbolCounter, R, S, T, LCtx, Count, SymbolTag);
+    SD = Alloc.make<SymbolMetadata>(R, S, T, LCtx, Count, SymbolTag);
     DataSet.InsertNode(SD, InsertPos);
-    ++SymbolCounter;
   }
 
   return cast<SymbolMetadata>(SD);
@@ -252,7 +247,7 @@ SymbolManager::getCastSymbol(const SymExpr *Op,
   void *InsertPos;
   SymExpr *data = DataSet.FindNodeOrInsertPos(ID, InsertPos);
   if (!data) {
-    data = new (BPAlloc) SymbolCast(Op, From, To);
+    data = Alloc.make<SymbolCast>(Op, From, To);
     DataSet.InsertNode(data, InsertPos);
   }
 
@@ -268,7 +263,7 @@ const SymIntExpr *SymbolManager::getSymIntExpr(const SymExpr *lhs,
   SymExpr *data = DataSet.FindNodeOrInsertPos(ID, InsertPos);
 
   if (!data) {
-    data = new (BPAlloc) SymIntExpr(lhs, op, v, t);
+    data = Alloc.make<SymIntExpr>(lhs, op, v, t);
     DataSet.InsertNode(data, InsertPos);
   }
 
@@ -284,7 +279,7 @@ const IntSymExpr *SymbolManager::getIntSymExpr(APSIntPtr lhs,
   SymExpr *data = DataSet.FindNodeOrInsertPos(ID, InsertPos);
 
   if (!data) {
-    data = new (BPAlloc) IntSymExpr(lhs, op, rhs, t);
+    data = Alloc.make<IntSymExpr>(lhs, op, rhs, t);
     DataSet.InsertNode(data, InsertPos);
   }
 
@@ -301,7 +296,7 @@ const SymSymExpr *SymbolManager::getSymSymExpr(const SymExpr *lhs,
   SymExpr *data = DataSet.FindNodeOrInsertPos(ID, InsertPos);
 
   if (!data) {
-    data = new (BPAlloc) SymSymExpr(lhs, op, rhs, t);
+    data = Alloc.make<SymSymExpr>(lhs, op, rhs, t);
     DataSet.InsertNode(data, InsertPos);
   }
 
@@ -316,7 +311,7 @@ const UnarySymExpr *SymbolManager::getUnarySymExpr(const SymExpr *Operand,
   void *InsertPos;
   SymExpr *data = DataSet.FindNodeOrInsertPos(ID, InsertPos);
   if (!data) {
-    data = new (BPAlloc) UnarySymExpr(Operand, Opc, T);
+    data = Alloc.make<UnarySymExpr>(Operand, Opc, T);
     DataSet.InsertNode(data, InsertPos);
   }
 
diff --git a/clang/lib/StaticAnalyzer/Core/Z3CrosscheckVisitor.cpp b/clang/lib/StaticAnalyzer/Core/Z3CrosscheckVisitor.cpp
index 739db95..c4dd016 100644
--- a/clang/lib/StaticAnalyzer/Core/Z3CrosscheckVisitor.cpp
+++ b/clang/lib/StaticAnalyzer/Core/Z3CrosscheckVisitor.cpp
@@ -21,6 +21,10 @@
 
 #define DEBUG_TYPE "Z3CrosscheckOracle"
 
+// Queries attempted at most `Z3CrosscheckMaxAttemptsPerQuery` number of times.
+// Multiple `check()` calls might be called on the same query if previous
+// attempts of the same query resulted in UNSAT for any reason. Each query is
+// only counted once for these statistics, the retries are not accounted for.
 STATISTIC(NumZ3QueriesDone, "Number of Z3 queries done");
 STATISTIC(NumTimesZ3TimedOut, "Number of times Z3 query timed out");
 STATISTIC(NumTimesZ3ExhaustedRLimit,
@@ -77,16 +81,32 @@ void Z3CrosscheckVisitor::finalizeVisitor(BugReporterContext &BRC,
     RefutationSolver->addConstraint(SMTConstraints);
   }
 
-  // And check for satisfiability
-  llvm::TimeRecord Start = llvm::TimeRecord::getCurrentTime(/*Start=*/true);
-  std::optional<bool> IsSAT = RefutationSolver->check();
-  llvm::TimeRecord Diff = llvm::TimeRecord::getCurrentTime(/*Start=*/false);
-  Diff -= Start;
-  Result = Z3Result{
-      IsSAT,
-      static_cast<unsigned>(Diff.getWallTime() * 1000),
-      RefutationSolver->getStatistics()->getUnsigned("rlimit count"),
+  auto GetUsedRLimit = [](const llvm::SMTSolverRef &Solver) {
+    return Solver->getStatistics()->getUnsigned("rlimit count");
+  };
+
+  auto AttemptOnce = [&](const llvm::SMTSolverRef &Solver) -> Z3Result {
+    constexpr auto getCurrentTime = llvm::TimeRecord::getCurrentTime;
+    unsigned InitialRLimit = GetUsedRLimit(Solver);
+    double Start = getCurrentTime(/*Start=*/true).getWallTime();
+    std::optional<bool> IsSAT = Solver->check();
+    double End = getCurrentTime(/*Start=*/false).getWallTime();
+    return {
+        IsSAT,
+        static_cast<unsigned>((End - Start) * 1000),
+        GetUsedRLimit(Solver) - InitialRLimit,
+    };
   };
+
+  // And check for satisfiability
+  unsigned MinQueryTimeAcrossAttempts = std::numeric_limits<unsigned>::max();
+  for (unsigned I = 0; I < Opts.Z3CrosscheckMaxAttemptsPerQuery; ++I) {
+    Result = AttemptOnce(RefutationSolver);
+    Result.Z3QueryTimeMilliseconds =
+        std::min(MinQueryTimeAcrossAttempts, Result.Z3QueryTimeMilliseconds);
+    if (Result.IsSAT.has_value())
+      return;
+  }
 }
 
 void Z3CrosscheckVisitor::addConstraints(
diff --git a/clang/test/AST/ByteCode/builtin-constant-p.cpp b/clang/test/AST/ByteCode/builtin-constant-p.cpp
index 0d222d1..62899b6 100644
--- a/clang/test/AST/ByteCode/builtin-constant-p.cpp
+++ b/clang/test/AST/ByteCode/builtin-constant-p.cpp
@@ -12,3 +12,9 @@ static_assert(__builtin_constant_p(I + 10.0), "");
 static_assert(__builtin_constant_p(nullptr), "");
 static_assert(__builtin_constant_p(&I), ""); // both-error {{failed due to requirement}}
 static_assert(__builtin_constant_p((void)I), ""); // both-error {{failed due to requirement}}
+
+extern int z;
+constexpr int foo(int &a) {
+  return __builtin_constant_p(a);
+}
+static_assert(!foo(z));
diff --git a/clang/test/AST/ByteCode/builtin-functions.cpp b/clang/test/AST/ByteCode/builtin-functions.cpp
index c1fd1bc..7237640 100644
--- a/clang/test/AST/ByteCode/builtin-functions.cpp
+++ b/clang/test/AST/ByteCode/builtin-functions.cpp
@@ -1244,6 +1244,34 @@ namespace BuiltinMemcpy {
   }
   static_assert(cpyptr());
 
+#ifndef __AVR__
+  constexpr int test_memmove(int a, int b, int n) {
+    int arr[4] = {1, 2, 3, 4};
+    __builtin_memmove(arr + a, arr + b, n); // both-note {{destination is not a contiguous array of at least 3 elements of type 'int'}}
+    return result(arr);
+  }
+  static_assert(test_memmove(2, 0, 12) == 4234); // both-error {{constant}} \
+                                                 // both-note {{in call}}
+#endif
+
+  struct Trivial { char k; short s; constexpr bool ok() { return k == 3 && s == 4; } };
+  constexpr bool test_trivial() {
+    Trivial arr[3] = {{1, 2}, {3, 4}, {5, 6}};
+    __builtin_memcpy(arr, arr+1, sizeof(Trivial));
+    __builtin_memmove(arr+1, arr, 2 * sizeof(Trivial));
+
+    return arr[0].ok() && arr[1].ok() && arr[2].ok();
+  }
+  static_assert(test_trivial());
+
+  // Check that an incomplete array is rejected.
+  constexpr int test_incomplete_array_type() { // both-error {{never produces a constant}}
+    extern int arr[];
+    __builtin_memmove(arr, arr, 4 * sizeof(arr[0]));
+    // both-note@-1 2{{'memmove' not supported: source is not a contiguous array of at least 4 elements of type 'int'}}
+    return arr[0] * 1000 + arr[1] * 100 + arr[2] * 10 + arr[3];
+  }
+  static_assert(test_incomplete_array_type() == 1234); // both-error {{constant}} both-note {{in call}}
 }
 
 namespace Memcmp {
diff --git a/clang/test/AST/ByteCode/cxx2a.cpp b/clang/test/AST/ByteCode/cxx2a.cpp
index eaae978..f600688 100644
--- a/clang/test/AST/ByteCode/cxx2a.cpp
+++ b/clang/test/AST/ByteCode/cxx2a.cpp
@@ -110,3 +110,63 @@ namespace DtorOrder {
   }
   static_assert(check_abnormal_termination());
 }
+
+namespace std {
+  struct type_info;
+}
+
+namespace TypeId {
+  struct A {
+    const std::type_info &ti = typeid(*this);
+  };
+  struct A2 : A {};
+  static_assert(&A().ti == &typeid(A));
+  static_assert(&typeid((A2())) == &typeid(A2));
+  extern A2 extern_a2;
+  static_assert(&typeid(extern_a2) == &typeid(A2));
+
+  constexpr A2 a2;
+  constexpr const A &a1 = a2;
+  static_assert(&typeid(a1) == &typeid(A));
+
+  struct B {
+    virtual void f();
+    const std::type_info &ti1 = typeid(*this);
+  };
+  struct B2 : B {
+    const std::type_info &ti2 = typeid(*this);
+  };
+  static_assert(&B2().ti1 == &typeid(B));
+  static_assert(&B2().ti2 == &typeid(B2));
+  extern B2 extern_b2;
+  static_assert(&typeid(extern_b2) == &typeid(B2)); // both-error {{constant expression}} \
+                                                    // both-note{{typeid applied to object 'extern_b2' whose dynamic type is not constant}}
+
+
+  constexpr B2 b2;
+  constexpr const B &b1 = b2;
+  static_assert(&typeid(b1) == &typeid(B2));
+
+  constexpr bool side_effects() {
+    // Not polymorphic nor a glvalue.
+    bool OK = true;
+    (void)typeid(OK = false, A2()); // both-warning {{has no effect}}
+    if (!OK) return false;
+
+    // Not polymorphic.
+    A2 a2;
+    (void)typeid(OK = false, a2); // both-warning {{has no effect}}
+    if (!OK) return false;
+
+    // Not a glvalue.
+    (void)typeid(OK = false, B2()); // both-warning {{has no effect}}
+    if (!OK) return false;
+
+    // Polymorphic glvalue: operand evaluated.
+    OK = false;
+    B2 b2;
+    (void)typeid(OK = true, b2); // both-warning {{will be evaluated}}
+    return OK;
+  }
+  static_assert(side_effects());
+}
diff --git a/clang/test/AST/ast-print-openacc-set-construct.cpp b/clang/test/AST/ast-print-openacc-set-construct.cpp
new file mode 100644
index 0000000..a801ae1
--- /dev/null
+++ b/clang/test/AST/ast-print-openacc-set-construct.cpp
@@ -0,0 +1,24 @@
+// RUN: %clang_cc1 -fopenacc -ast-print %s -o - | FileCheck %s
+
+unsigned Int;
+
+void uses() {
+// CHECK: #pragma acc set default_async(Int) if(Int == 5) device_type(I) device_num(Int)
+#pragma acc set default_async(Int) if (Int == 5) device_type(I) device_num(Int)
+// CHECK: #pragma acc set default_async(Int) device_type(I) device_num(Int)
+#pragma acc set default_async(Int) device_type(I) device_num(Int)
+// CHECK: #pragma acc set default_async(Int) if(Int == 5) device_num(Int)
+#pragma acc set default_async(Int) if (Int == 5) device_num(Int)
+// CHECK: #pragma acc set default_async(Int) if(Int == 5) device_type(I)
+#pragma acc set default_async(Int) if (Int == 5) device_type(I)
+// CHECK: #pragma acc set if(Int == 5) device_type(I) device_num(Int)
+#pragma acc set if (Int == 5) device_type(I) device_num(Int)
+// CHECK: #pragma acc set default_async(Int)
+#pragma acc set default_async(Int)
+// CHECK: #pragma acc set if(Int == 5)
+#pragma acc set if (Int == 5)
+// CHECK: #pragma acc set device_type(I)
+#pragma acc set device_type(I)
+// CHECK: #pragma acc set device_num(Int)
+#pragma acc set device_num(Int)
+}
diff --git a/clang/test/AST/ast-print-openacc-update-construct.cpp b/clang/test/AST/ast-print-openacc-update-construct.cpp
new file mode 100644
index 0000000..a7f5b2a
--- /dev/null
+++ b/clang/test/AST/ast-print-openacc-update-construct.cpp
@@ -0,0 +1,41 @@
+// RUN: %clang_cc1 -fopenacc -ast-print %s -o - | FileCheck %s
+void uses(bool cond) {
+  int I;
+  int *iPtr;
+  int array[5];
+  // CHECK: #pragma acc update
+#pragma acc update
+
+// CHECK: #pragma acc update if_present
+#pragma acc update if_present
+// CHECK: #pragma acc update if(cond)
+#pragma acc update if(cond)
+
+// CHECK: #pragma acc update async
+#pragma acc update async
+// CHECK: #pragma acc update async(*iPtr)
+#pragma acc update async(*iPtr)
+// CHECK: #pragma acc update async(I)
+#pragma acc update async(I)
+
+// CHECK: #pragma acc update wait(*iPtr, I) async
+#pragma acc update wait(*iPtr, I) async
+
+// CHECK: #pragma acc update wait(queues: *iPtr, I) async(*iPtr)
+#pragma acc update wait(queues:*iPtr, I) async(*iPtr)
+
+// CHECK: #pragma acc update wait(devnum: I : *iPtr, I) async(I)
+#pragma acc update wait(devnum:I:*iPtr, I) async(I)
+
+// CHECK: #pragma acc update wait(devnum: I : queues: *iPtr, I) if(I == array[I]) async(I)
+#pragma acc update wait(devnum:I:queues:*iPtr, I) if(I == array[I]) async(I)
+
+// CHECK: #pragma acc update device_type(I) dtype(H)
+#pragma acc update device_type(I) dtype(H)
+
+// CHECK: #pragma acc update device_type(J) dtype(K)
+#pragma acc update device_type(J) dtype(K)
+
+// CHECK: #pragma acc update self(I, iPtr, array, array[1], array[1:2])
+#pragma acc update self(I, iPtr, array, array[1], array[1:2])
+}
diff --git a/clang/test/Analysis/PR121201.cpp b/clang/test/Analysis/PR121201.cpp
new file mode 100644
index 0000000..acd2492
--- /dev/null
+++ b/clang/test/Analysis/PR121201.cpp
@@ -0,0 +1,67 @@
+// RUN: %clang_analyze_cc1 -analyzer-checker=core -verify %s \
+// RUN:    -analyzer-config unroll-loops=true
+
+// expected-no-diagnostics
+
+template <bool, typename T, typename> using conditional_t = T;
+class basic_format_arg;
+template <typename> struct formatter;
+
+template <typename Context> struct value {
+  template <typename T> value(T) {
+    using value_type = T;
+    (void)format_custom_arg<value_type,
+                      typename Context::template formatter_type<value_type>>;
+  }
+
+  template <typename, typename Formatter> static void format_custom_arg() {
+    Context ctx;
+    auto f = Formatter();
+    f.format(0, ctx);
+  }
+};
+
+struct context {
+  template <typename T> using formatter_type = formatter<T>;
+};
+
+enum { max_packed_args };
+
+template <typename Context, long>
+using arg_t = conditional_t<max_packed_args, value<Context>, basic_format_arg>;
+
+template <int NUM_ARGS> struct format_arg_store {
+  arg_t<context, NUM_ARGS> args;
+};
+
+template <typename... T, long NUM_ARGS = sizeof...(T)>
+auto make_format_args(T... args) -> format_arg_store<NUM_ARGS> {
+  return {args...};
+}
+
+template <typename F> void write_padded(F write) { write(0); }
+
+template <typename... T> void format(T... args) { make_format_args(args...); }
+
+template <int> struct bitset {
+  bitset(long);
+};
+
+template <long N> struct formatter<bitset<N>> {
+  struct writer {
+    bitset<N> bs;
+
+    template <typename OutputIt> void operator()(OutputIt) {
+      for (auto pos = N; pos > 0; --pos) // no-crash
+        ;
+    }
+  };
+
+  template <typename FormatContext> void format(bitset<N> bs, FormatContext) {
+    write_padded(writer{bs});
+  }
+};
+
+bitset<6> TestBody_bs(2);
+
+void TestBody() { format(TestBody_bs); }
diff --git a/clang/test/Analysis/analyzer-config.c b/clang/test/Analysis/analyzer-config.c
index 8ce6184..d5eb790 100644
--- a/clang/test/Analysis/analyzer-config.c
+++ b/clang/test/Analysis/analyzer-config.c
@@ -41,6 +41,7 @@
 // CHECK-NEXT: cplusplus.SmartPtrModeling:ModelSmartPtrDereference = false
 // CHECK-NEXT: crosscheck-with-z3 = false
 // CHECK-NEXT: crosscheck-with-z3-eqclass-timeout-threshold = 0
+// CHECK-NEXT: crosscheck-with-z3-max-attempts-per-query = 3
 // CHECK-NEXT: crosscheck-with-z3-rlimit-threshold = 0
 // CHECK-NEXT: crosscheck-with-z3-timeout-threshold = 15000
 // CHECK-NEXT: ctu-dir = ""
diff --git a/clang/test/Analysis/dump_egraph.cpp b/clang/test/Analysis/dump_egraph.cpp
index d1229b26..1345969 100644
--- a/clang/test/Analysis/dump_egraph.cpp
+++ b/clang/test/Analysis/dump_egraph.cpp
@@ -21,7 +21,7 @@ void foo() {
 
 // CHECK: \"location_context\": \"#0 Call\", \"calling\": \"T::T\", \"location\": \{ \"line\": 15, \"column\": 5, \"file\": \"{{.*}}dump_egraph.cpp\" \}, \"items\": [\l&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;\{ \"init_id\": {{[0-9]+}}, \"kind\": \"construct into member variable\", \"argument_index\": null, \"pretty\": \"s\", \"value\": \"&t.s\"
 
-// CHECK: \"cluster\": \"t\", \"pointer\": \"{{0x[0-9a-f]+}}\", \"items\": [\l&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;\{ \"kind\": \"Default\", \"offset\": 0, \"value\": \"conj_$2\{int, LC5, no stmt, #1\}\"
+// CHECK: \"cluster\": \"t\", \"pointer\": \"{{0x[0-9a-f]+}}\", \"items\": [\l&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;\{ \"kind\": \"Default\", \"offset\": 0, \"value\": \"conj_$3\{int, LC5, no stmt, #1\}\"
 
 // CHECK: \"dynamic_types\": [\l&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;\{ \"region\": \"HeapSymRegion\{conj_$1\{S *, LC1, S{{[0-9]+}}, #1\}\}\", \"dyn_type\": \"S\", \"sub_classable\": false \}\l
 
diff --git a/clang/test/Analysis/embed.c b/clang/test/Analysis/embed.c
index 32f6c13..db8c270 100644
--- a/clang/test/Analysis/embed.c
+++ b/clang/test/Analysis/embed.c
@@ -8,5 +8,5 @@ int main() {
         #embed "embed.c"
     };
     clang_analyzer_dump_ptr(SelfBytes); // expected-warning {{&Element{SelfBytes,0 S64b,unsigned char}}}
-    clang_analyzer_dump(SelfBytes[0]); // expected-warning {{Unknown}} FIXME: This should be the `/` character.
+    clang_analyzer_dump(SelfBytes[0]); // expected-warning {{47 U8b}}
 }
diff --git a/clang/test/Analysis/expr-inspection-printState-diseq-info.c b/clang/test/Analysis/expr-inspection-printState-diseq-info.c
index c5c3178..515fcbb 100644
--- a/clang/test/Analysis/expr-inspection-printState-diseq-info.c
+++ b/clang/test/Analysis/expr-inspection-printState-diseq-info.c
@@ -18,17 +18,17 @@ void test_disequality_info(int e0, int b0, int b1, int c0) {
  // CHECK-NEXT:     {
  // CHECK-NEXT:       "class": [ "(reg_$0<int e0>) - 2" ],
  // CHECK-NEXT:       "disequal_to": [
- // CHECK-NEXT:         [ "reg_$2<int b1>" ]]
+ // CHECK-NEXT:         [ "reg_$7<int b1>" ]]
  // CHECK-NEXT:     },
  // CHECK-NEXT:     {
- // CHECK-NEXT:       "class": [ "reg_$2<int b1>" ],
+ // CHECK-NEXT:       "class": [ "reg_$15<int c0>" ],
  // CHECK-NEXT:       "disequal_to": [
- // CHECK-NEXT:         [ "(reg_$0<int e0>) - 2" ],
- // CHECK-NEXT:         [ "reg_$3<int c0>" ]]
+ // CHECK-NEXT:         [ "reg_$7<int b1>" ]]
  // CHECK-NEXT:     },
  // CHECK-NEXT:     {
- // CHECK-NEXT:       "class": [ "reg_$3<int c0>" ],
+ // CHECK-NEXT:       "class": [ "reg_$7<int b1>" ],
  // CHECK-NEXT:       "disequal_to": [
- // CHECK-NEXT:         [ "reg_$2<int b1>" ]]
+ // CHECK-NEXT:         [ "(reg_$0<int e0>) - 2" ],
+ // CHECK-NEXT:         [ "reg_$15<int c0>" ]]
  // CHECK-NEXT:     }
  // CHECK-NEXT:   ],
diff --git a/clang/test/Analysis/expr-inspection-printState-eq-classes.c b/clang/test/Analysis/expr-inspection-printState-eq-classes.c
index 38e23d6..19cc137 100644
--- a/clang/test/Analysis/expr-inspection-printState-eq-classes.c
+++ b/clang/test/Analysis/expr-inspection-printState-eq-classes.c
@@ -16,6 +16,6 @@ void test_equivalence_classes(int a, int b, int c, int d) {
 }
 
 // CHECK:      "equivalence_classes": [
-// CHECK-NEXT:     [ "(reg_$0<int a>) != (reg_$2<int c>)" ],
-// CHECK-NEXT:     [ "reg_$0<int a>", "reg_$2<int c>", "reg_$3<int d>" ]
+// CHECK-NEXT:     [ "(reg_$0<int a>) != (reg_$5<int c>)" ],
+// CHECK-NEXT:     [ "reg_$0<int a>", "reg_$20<int d>", "reg_$5<int c>" ]
 // CHECK-NEXT: ],
diff --git a/clang/test/Analysis/loop-assumptions.c b/clang/test/Analysis/loop-assumptions.c
new file mode 100644
index 0000000..eb0ffdc
--- /dev/null
+++ b/clang/test/Analysis/loop-assumptions.c
@@ -0,0 +1,219 @@
+// RUN: %clang_analyze_cc1 -analyzer-checker=debug.ExprInspection \
+// RUN:     -verify=expected,eagerlyassume %s
+// RUN: %clang_analyze_cc1 -analyzer-checker=debug.ExprInspection \
+// RUN:     -analyzer-config eagerly-assume=false \
+// RUN:     -verify=expected,noeagerlyassume %s
+
+// These tests validate the logic within `ExprEngine::processBranch` which
+// ensures that in loops with opaque conditions we don't assume execution paths
+// if the code does not imply that they are possible.
+
+void clang_analyzer_numTimesReached(void);
+void clang_analyzer_warnIfReached(void);
+void clang_analyzer_dump(int);
+
+void clearCondition(void) {
+  // If the analyzer can definitely determine the value of the loop condition,
+  // then this corrective logic doesn't activate and the engine executes
+  // `-analyzer-max-loop` iterations (by default, 4).
+  for (int i = 0; i < 10; i++)
+    clang_analyzer_numTimesReached(); // expected-warning {{4}}
+
+  clang_analyzer_warnIfReached(); // unreachable
+}
+
+void opaqueCondition(int arg) {
+  // If the loop condition is opaque, don't assume more than two iterations,
+  // because the presence of a loop does not imply that the programmer thought
+  // that more than two iterations are possible. (It _does_ imply that two
+  // iterations may be possible at least in some cases, because otherwise an
+  // `if` would've been enough.)
+  for (int i = 0; i < arg; i++)
+    clang_analyzer_numTimesReached(); // expected-warning {{2}}
+
+  clang_analyzer_warnIfReached(); // expected-warning {{REACHABLE}}
+}
+
+int check(void);
+
+void opaqueConditionCall(int arg) {
+  // Same situation as `opaqueCondition()` but with a `while ()` loop. This
+  // is also an example for a situation where the programmer cannot easily
+  // insert an assertion to guide the analyzer and rule out more than two
+  // iterations (so the analyzer needs to proactively avoid those unjustified
+  // branches).
+  while (check())
+    clang_analyzer_numTimesReached(); // expected-warning {{2}}
+
+  clang_analyzer_warnIfReached(); // expected-warning {{REACHABLE}}
+}
+
+void opaqueConditionDoWhile(int arg) {
+  // Same situation as `opaqueCondition()` but with a `do {} while ()` loop.
+  // This is tested separately because this loop type is a special case in the
+  // iteration count calculation.
+  int i = 0;
+  do {
+    clang_analyzer_numTimesReached(); // expected-warning {{2}}
+  } while (i++ < arg);
+
+  clang_analyzer_warnIfReached(); // expected-warning {{REACHABLE}}
+}
+
+void dontRememberOldBifurcation(int arg) {
+  // In this (slightly contrived) test case the analyzer performs an assumption
+  // at the first iteration of the loop, but does not make any new assumptions
+  // in the subsequent iterations, so the analyzer should continue evaluating
+  // the loop.
+  // Previously this was mishandled in `eagerly-assume` mode (which is enabled
+  // by default), because the code remembered that there was a bifurcation on
+  // the first iteration of the loop and didn't realize that this is obsolete.
+
+  // NOTE: The variable `i` is introduced to ensure that the iterations of the
+  // loop change the state -- otherwise the analyzer stops iterating because it
+  // returns to the same `ExplodedNode`.
+  int i = 0;
+  while (arg > 3) {
+    clang_analyzer_numTimesReached(); // expected-warning {{4}}
+    i++;
+  }
+
+  clang_analyzer_warnIfReached(); // expected-warning {{REACHABLE}}
+}
+
+void dontAssumeFourthIterartion(int arg) {
+  if (arg == 2)
+    return;
+
+  // In this function the analyzer cannot leave the loop after exactly two
+  // iterations (because it knows that `arg != 2` at that point), so it
+  // performs a third iteration, but it does not assume that a fourth iteration
+  // is also possible.
+  for (int i = 0; i < arg; i++)
+    clang_analyzer_numTimesReached(); // expected-warning {{3}}
+
+  clang_analyzer_warnIfReached(); // expected-warning {{REACHABLE}}
+}
+
+#define TRUE 1
+void shortCircuitInLoopCondition(int arg) {
+  // When the loop condition expression contains short-circuiting operators, it
+  // performs "inner" bifurcations for those operators and only considers the
+  // last (rightmost) operand as the branch condition that is associated with
+  // the loop itself (as its loop condition).
+  // This means that assumptions taken in the left-hand side of a short-circuiting
+  // operator are not recognized as "opaque" loop condition, so the loop in
+  // this test case is allowed to finish four iterations.
+  // FIXME: This corner case is responsible for at least one out-of-bounds
+  // false positive on the ffmpeg codebase. Eventually we should properly
+  // recognize the full syntactical loop condition expression as "the loop
+  // condition", but this will be complicated to implement.
+  for (int i = 0; i < arg && TRUE; i++) {
+    clang_analyzer_numTimesReached(); // expected-warning {{4}}
+  }
+  clang_analyzer_warnIfReached(); // expected-warning {{REACHABLE}}
+}
+
+void shortCircuitInLoopConditionRHS(int arg) {
+  // Unlike `shortCircuitInLoopCondition()`, this case is handled properly
+  // because the analyzer thinks that the right hand side of the `&&` is the
+  // loop condition.
+  for (int i = 0; TRUE && i < arg; i++) {
+    clang_analyzer_numTimesReached(); // expected-warning {{2}}
+  }
+  clang_analyzer_warnIfReached(); // expected-warning {{REACHABLE}}
+}
+
+void eagerlyAssumeInSubexpression(int arg) {
+  // The `EagerlyAssume` logic is another complication that can "split the
+  // state" within the loop condition, but before the `processBranch()` call
+  // which is (in theory) responsible for evaluating the loop condition.
+  // The current implementation partially compensates this by noticing the
+  // cases where the loop condition is targeted by `EagerlyAssume`, but does
+  // not handle the (fortunately rare) case when `EagerlyAssume` hits a
+  // sub-expression of the loop condition (as in this contrived test case).
+  // FIXME: I don't know a real-world example for this inconsistency, but it
+  // would be good to eliminate it eventually.
+  for (int i = 0; (i >= arg) - 1; i++) {
+    clang_analyzer_numTimesReached(); // eagerlyassume-warning {{4}} noeagerlyassume-warning {{2}}
+  }
+  clang_analyzer_warnIfReached(); // expected-warning {{REACHABLE}}
+}
+
+void calledTwice(int arg, int isFirstCall) {
+  // This function is called twice (with two different unknown 'arg' values) to
+  // check the iteration count handling in this situation.
+  for (int i = 0; i < arg; i++) {
+    if (isFirstCall) {
+      clang_analyzer_numTimesReached(); // expected-warning {{2}}
+    } else {
+      clang_analyzer_numTimesReached(); // expected-warning {{2}}
+    }
+  }
+}
+
+void caller(int arg, int arg2) {
+  // Entry point for `calledTwice()`.
+  calledTwice(arg, 1);
+  calledTwice(arg2, 0);
+}
+
+void innerLoopClearCondition(void) {
+  // A "control group" test case for the behavior of an inner loop. Notice that
+  // although the (default) value of `-analyzer-max-loop` is 4, we only see 3 iterations
+  // of the inner loop, because `-analyzer-max-loop` limits the number of
+  // evaluations of _the loop condition of the inner loop_ and in addition to
+  // the 3 evaluations before the 3 iterations, there is also a step where it
+  // evaluates to false (in the first iteration of the outer loop).
+  for (int outer = 0; outer < 2; outer++) {
+    int limit = 0;
+    if (outer)
+      limit = 10;
+    clang_analyzer_dump(limit); // expected-warning {{0}} expected-warning {{10}}
+    for (int i = 0; i < limit; i++) {
+      clang_analyzer_numTimesReached(); // expected-warning {{3}}
+    }
+  }
+}
+
+void innerLoopOpaqueCondition(int arg) {
+  // In this test case the engine doesn't assume a second iteration within the
+  // inner loop (in the second iteration of the outer loop, when the limit is
+  // opaque) because `CoreEngine::getCompletedIterationCount()` is based on the
+  // `BlockCount` values queried from the `BlockCounter` which count _all_
+  // evaluations of a given `CFGBlock` (in our case, the loop condition) and
+  // not just the evaluations within the current iteration of the outer loop.
+  // FIXME: This inaccurate iteration count could in theory cause some false
+  // negatives, although I think this would be unusual in practice, as the
+  // small default value of `-analyzer-max-loop` means that this is only
+  // relevant if the analyzer can deduce that the inner loop performs 0 or 1
+  // iterations within the first iteration of the outer loop (and then the
+  // condition of the inner loop is opaque within the second iteration of the
+  // outer loop).
+  for (int outer = 0; outer < 2; outer++) {
+    int limit = 0;
+    if (outer)
+      limit = arg;
+    clang_analyzer_dump(limit); // expected-warning {{0}} expected-warning {{reg_$}}
+    for (int i = 0; i < limit; i++) {
+      clang_analyzer_numTimesReached(); // expected-warning {{1}}
+    }
+  }
+}
+
+void onlyLoopConditions(int arg) {
+  // This "don't assume third iteration" logic only examines the conditions of
+  // loop statements and does not affect the analysis of code that implements
+  // similar behavior with different language features like if + break, goto,
+  // recursive functions, ...
+  int i = 0;
+  while (1) {
+    clang_analyzer_numTimesReached(); // expected-warning {{4}}
+
+    // This is not a loop condition.
+    if (i++ > arg)
+      break;
+  }
+
+  clang_analyzer_warnIfReached(); // expected-warning {{REACHABLE}}
+}
diff --git a/clang/test/Analysis/loop-unrolling.cpp b/clang/test/Analysis/loop-unrolling.cpp
index 66a828a..bf05a77 100644
--- a/clang/test/Analysis/loop-unrolling.cpp
+++ b/clang/test/Analysis/loop-unrolling.cpp
@@ -63,7 +63,7 @@ int simple_no_unroll1() {
   int a[9];
   int k = 42;
   for (int i = 0; i < 9; i++) {
-    clang_analyzer_numTimesReached(); // expected-warning {{4}}
+    clang_analyzer_numTimesReached(); // expected-warning {{2}}
     a[i] = 42;
     foo(i);
   }
@@ -76,7 +76,7 @@ int simple_no_unroll2() {
   int k = 42;
   int i;
   for (i = 0; i < 9; i++) {
-    clang_analyzer_numTimesReached(); // expected-warning {{4}}
+    clang_analyzer_numTimesReached(); // expected-warning {{2}}
     a[i] = 42;
     i += getNum();
   }
@@ -309,9 +309,9 @@ int nested_inner_unrolled() {
   int k = 42;
   int j = 0;
   for (int i = 0; i < getNum(); i++) {
-    clang_analyzer_numTimesReached(); // expected-warning {{4}}
+    clang_analyzer_numTimesReached(); // expected-warning {{2}}
     for (j = 0; j < 8; ++j) {
-      clang_analyzer_numTimesReached(); // expected-warning {{32}}
+      clang_analyzer_numTimesReached(); // expected-warning {{16}}
       a[j] = 22;
     }
     a[i] = 42;
@@ -346,11 +346,7 @@ int simple_known_bound_loop() {
 
 int simple_unknown_bound_loop() {
   for (int i = 2; i < getNum(); i++) {
-#ifdef DFS
-    clang_analyzer_numTimesReached(); // expected-warning {{16}}
-#else
     clang_analyzer_numTimesReached(); // expected-warning {{8}}
-#endif
   }
   return 0;
 }
@@ -368,11 +364,7 @@ int nested_inlined_unroll1() {
 int nested_inlined_no_unroll1() {
   int k;
   for (int i = 0; i < 9; i++) {
-#ifdef DFS
-    clang_analyzer_numTimesReached(); // expected-warning {{18}}
-#else
-    clang_analyzer_numTimesReached(); // expected-warning {{14}}
-#endif
+    clang_analyzer_numTimesReached(); // expected-warning {{10}}
     k = simple_unknown_bound_loop();  // reevaluation without inlining, splits the state as well
   }
   int a = 22 / k; // no-warning
@@ -475,9 +467,13 @@ int num_steps_over_limit2() {
 
 int num_steps_on_limit3() {
   for (int i = 0; i < getNum(); i++) {
-    clang_analyzer_numTimesReached(); // expected-warning {{4}}
+    clang_analyzer_numTimesReached(); // expected-warning {{2}}
     for (int j = 0; j < 32; j++) {
-      clang_analyzer_numTimesReached(); // expected-warning {{128}}
+      // Here the loop unrollig logic calculates with four potential iterations
+      // in the outer loop where it cannot determine the iteration count in
+      // advance; but after two loops the analyzer conservatively assumes that
+      // the (still opaque) loop condition is false.
+      clang_analyzer_numTimesReached(); // expected-warning {{64}}
     }
   }
   return 0;
@@ -493,6 +489,15 @@ int num_steps_over_limit3() {
   return 0;
 }
 
+int num_steps_on_limit4() {
+  for (int i = 0; i < 4; i++) {
+    clang_analyzer_numTimesReached(); // expected-warning {{4}}
+    for (int j = 0; j < 32; j++) {
+      clang_analyzer_numTimesReached(); // expected-warning {{128}}
+    }
+  }
+  return 0;
+}
 
 void pr34943() {
   for (int i = 0; i < 6L; ++i) {
diff --git a/clang/test/Analysis/misc-ps-region-store.m b/clang/test/Analysis/misc-ps-region-store.m
index 668b5ff..a882e7e 100644
--- a/clang/test/Analysis/misc-ps-region-store.m
+++ b/clang/test/Analysis/misc-ps-region-store.m
@@ -910,13 +910,13 @@ void pr6302(id x, Class y) {
 
 //===----------------------------------------------------------------------===//
 // Specially handle global variables that are declared constant.  In the
-// example below, this forces the loop to take exactly 2 iterations.
+// example below, this forces the loop to take exactly 1 iteration.
 //===----------------------------------------------------------------------===//
 
-const int pr6288_L_N = 2;
+const int pr6288_L_N = 1;
 void pr6288_(void) {
-  int x[2];
-  int *px[2];
+  int x[1];
+  int *px[1];
   int i;
   for (i = 0; i < pr6288_L_N; i++)
     px[i] = &x[i];
@@ -924,8 +924,8 @@ void pr6288_(void) {
 }
 
 void pr6288_pos(int z) {
-  int x[2];
-  int *px[2];
+  int x[1];
+  int *px[1];
   int i;
   for (i = 0; i < z; i++)
     px[i] = &x[i]; // expected-warning{{Access out-of-bound array element (buffer overflow)}}
@@ -933,15 +933,28 @@ void pr6288_pos(int z) {
 }
 
 void pr6288_b(void) {
-  const int L_N = 2;
-  int x[2];
-  int *px[2];
+  const int L_N = 1;
+  int x[1];
+  int *px[1];
   int i;
   for (i = 0; i < L_N; i++)
     px[i] = &x[i];
   *(px[0]) = 0; // no-warning
 }
 
+void pr6288_no_third_iter(int z) {
+  int x[2];
+  int *px[2];
+  int i;
+  // If the loop condition is opaque, we assume that there may be two
+  // iterations (becasuse otherwise the loop could be replaced by an if); but
+  // we do not assume that there may be a third iteration. Therefore,
+  // unlike 'pr6288_pos', this testcase does not produce an out-of-bounds error.
+  for (i = 0; i < z; i++)
+    px[i] = &x[i];
+  *(px[0]) = 0; // expected-warning{{Dereference of undefined pointer value}}
+}
+
 // A bug in RemoveDeadBindings was causing instance variable bindings to get
 // prematurely pruned from the state.
 @interface Rdar7817800 {
diff --git a/clang/test/Analysis/ptr-arith.cpp b/clang/test/Analysis/ptr-arith.cpp
index a1264a1..ec1c75c 100644
--- a/clang/test/Analysis/ptr-arith.cpp
+++ b/clang/test/Analysis/ptr-arith.cpp
@@ -139,10 +139,10 @@ struct parse_t {
 int parse(parse_t *p) {
   unsigned copy = p->bits2;
   clang_analyzer_dump(copy);
-  // expected-warning@-1 {{reg_$1<unsigned int Element{SymRegion{reg_$0<parse_t * p>},0 S64b,struct Bug_55934::parse_t}.bits2>}}
+  // expected-warning@-1 {{reg_$2<unsigned int Element{SymRegion{reg_$0<parse_t * p>},0 S64b,struct Bug_55934::parse_t}.bits2>}}
   header *bits = (header *)&copy;
   clang_analyzer_dump(bits->b);
-  // expected-warning@-1 {{derived_$2{reg_$1<unsigned int Element{SymRegion{reg_$0<parse_t * p>},0 S64b,struct Bug_55934::parse_t}.bits2>,Element{copy,0 S64b,struct Bug_55934::header}.b}}}
+  // expected-warning@-1 {{derived_$4{reg_$2<unsigned int Element{SymRegion{reg_$0<parse_t * p>},0 S64b,struct Bug_55934::parse_t}.bits2>,Element{copy,0 S64b,struct Bug_55934::header}.b}}}
   return bits->b; // no-warning
 }
 } // namespace Bug_55934
diff --git a/clang/test/Analysis/symbol-simplification-disequality-info.cpp b/clang/test/Analysis/symbol-simplification-disequality-info.cpp
index 69238b5..33b8f15 100644
--- a/clang/test/Analysis/symbol-simplification-disequality-info.cpp
+++ b/clang/test/Analysis/symbol-simplification-disequality-info.cpp
@@ -14,14 +14,14 @@ void test(int a, int b, int c, int d) {
   clang_analyzer_printState();
   // CHECK:       "disequality_info": [
   // CHECK-NEXT:    {
-  // CHECK-NEXT:      "class": [ "((reg_$0<int a>) + (reg_$1<int b>)) + (reg_$2<int c>)" ],
+  // CHECK-NEXT:      "class": [ "((reg_$0<int a>) + (reg_$2<int b>)) + (reg_$5<int c>)" ],
   // CHECK-NEXT:      "disequal_to": [
-  // CHECK-NEXT:        [ "reg_$3<int d>" ]]
+  // CHECK-NEXT:        [ "reg_$8<int d>" ]]
   // CHECK-NEXT:    },
   // CHECK-NEXT:    {
-  // CHECK-NEXT:      "class": [ "reg_$3<int d>" ],
+  // CHECK-NEXT:      "class": [ "reg_$8<int d>" ],
   // CHECK-NEXT:      "disequal_to": [
-  // CHECK-NEXT:        [ "((reg_$0<int a>) + (reg_$1<int b>)) + (reg_$2<int c>)" ]]
+  // CHECK-NEXT:        [ "((reg_$0<int a>) + (reg_$2<int b>)) + (reg_$5<int c>)" ]]
   // CHECK-NEXT:    }
   // CHECK-NEXT:  ],
 
@@ -32,14 +32,14 @@ void test(int a, int b, int c, int d) {
   clang_analyzer_printState();
   // CHECK:      "disequality_info": [
   // CHECK-NEXT:   {
-  // CHECK-NEXT:     "class": [ "(reg_$0<int a>) + (reg_$2<int c>)" ],
+  // CHECK-NEXT:     "class": [ "(reg_$0<int a>) + (reg_$5<int c>)" ],
   // CHECK-NEXT:     "disequal_to": [
-  // CHECK-NEXT:       [ "reg_$3<int d>" ]]
+  // CHECK-NEXT:       [ "reg_$8<int d>" ]]
   // CHECK-NEXT:   },
   // CHECK-NEXT:   {
-  // CHECK-NEXT:     "class": [ "reg_$3<int d>" ],
+  // CHECK-NEXT:     "class": [ "reg_$8<int d>" ],
   // CHECK-NEXT:     "disequal_to": [
-  // CHECK-NEXT:        [ "(reg_$0<int a>) + (reg_$2<int c>)" ]]
+  // CHECK-NEXT:        [ "(reg_$0<int a>) + (reg_$5<int c>)" ]]
   // CHECK-NEXT:    }
   // CHECK-NEXT:  ],
 
@@ -50,10 +50,10 @@ void test(int a, int b, int c, int d) {
   // CHECK-NEXT:    {
   // CHECK-NEXT:      "class": [ "reg_$0<int a>" ],
   // CHECK-NEXT:      "disequal_to": [
-  // CHECK-NEXT:        [ "reg_$3<int d>" ]]
+  // CHECK-NEXT:        [ "reg_$8<int d>" ]]
   // CHECK-NEXT:    },
   // CHECK-NEXT:    {
-  // CHECK-NEXT:      "class": [ "reg_$3<int d>" ],
+  // CHECK-NEXT:      "class": [ "reg_$8<int d>" ],
   // CHECK-NEXT:      "disequal_to": [
   // CHECK-NEXT:        [ "reg_$0<int a>" ]]
   // CHECK-NEXT:    }
diff --git a/clang/test/Analysis/symbol-simplification-fixpoint-one-iteration.cpp b/clang/test/Analysis/symbol-simplification-fixpoint-one-iteration.cpp
index 73922d4..42e9847 100644
--- a/clang/test/Analysis/symbol-simplification-fixpoint-one-iteration.cpp
+++ b/clang/test/Analysis/symbol-simplification-fixpoint-one-iteration.cpp
@@ -13,10 +13,10 @@ void test(int a, int b, int c) {
     return;
   clang_analyzer_printState();
   // CHECK:      "constraints": [
-  // CHECK-NEXT:   { "symbol": "((reg_$0<int a>) + (reg_$1<int b>)) != (reg_$2<int c>)", "range": "{ [0, 0] }" }
+  // CHECK-NEXT:   { "symbol": "((reg_$0<int a>) + (reg_$2<int b>)) != (reg_$5<int c>)", "range": "{ [0, 0] }" }
   // CHECK-NEXT: ],
   // CHECK-NEXT: "equivalence_classes": [
-  // CHECK-NEXT:   [ "(reg_$0<int a>) + (reg_$1<int b>)", "reg_$2<int c>" ]
+  // CHECK-NEXT:   [ "(reg_$0<int a>) + (reg_$2<int b>)", "reg_$5<int c>" ]
   // CHECK-NEXT: ],
   // CHECK-NEXT: "disequality_info": null,
 
@@ -25,12 +25,12 @@ void test(int a, int b, int c) {
     return;
   clang_analyzer_printState();
   // CHECK:        "constraints": [
-  // CHECK-NEXT:     { "symbol": "(reg_$0<int a>) != (reg_$2<int c>)", "range": "{ [0, 0] }" },
-  // CHECK-NEXT:     { "symbol": "reg_$1<int b>", "range": "{ [0, 0] }" }
+  // CHECK-NEXT:     { "symbol": "(reg_$0<int a>) != (reg_$5<int c>)", "range": "{ [0, 0] }" },
+  // CHECK-NEXT:     { "symbol": "reg_$2<int b>", "range": "{ [0, 0] }" }
   // CHECK-NEXT:   ],
   // CHECK-NEXT:   "equivalence_classes": [
-  // CHECK-NEXT:     [ "(reg_$0<int a>) != (reg_$2<int c>)" ],
-  // CHECK-NEXT:     [ "reg_$0<int a>", "reg_$2<int c>" ]
+  // CHECK-NEXT:     [ "(reg_$0<int a>) != (reg_$5<int c>)" ],
+  // CHECK-NEXT:     [ "reg_$0<int a>", "reg_$5<int c>" ]
   // CHECK-NEXT:   ],
   // CHECK-NEXT: "disequality_info": null,
 
diff --git a/clang/test/Analysis/symbol-simplification-fixpoint-two-iterations.cpp b/clang/test/Analysis/symbol-simplification-fixpoint-two-iterations.cpp
index 679ed3f..cffb5a70 100644
--- a/clang/test/Analysis/symbol-simplification-fixpoint-two-iterations.cpp
+++ b/clang/test/Analysis/symbol-simplification-fixpoint-two-iterations.cpp
@@ -15,11 +15,11 @@ void test(int a, int b, int c, int d) {
     return;
   clang_analyzer_printState();
   // CHECK:      "constraints": [
-  // CHECK-NEXT:   { "symbol": "(((reg_$0<int a>) + (reg_$1<int b>)) + (reg_$2<int c>)) != (reg_$3<int d>)", "range": "{ [0, 0] }" },
-  // CHECK-NEXT:   { "symbol": "(reg_$2<int c>) + (reg_$1<int b>)", "range": "{ [0, 0] }" }
+  // CHECK-NEXT:   { "symbol": "(((reg_$0<int a>) + (reg_$2<int b>)) + (reg_$5<int c>)) != (reg_$8<int d>)", "range": "{ [0, 0] }" },
+  // CHECK-NEXT:   { "symbol": "(reg_$5<int c>) + (reg_$2<int b>)", "range": "{ [0, 0] }" }
   // CHECK-NEXT: ],
   // CHECK-NEXT: "equivalence_classes": [
-  // CHECK-NEXT:   [ "((reg_$0<int a>) + (reg_$1<int b>)) + (reg_$2<int c>)", "reg_$3<int d>" ]
+  // CHECK-NEXT:   [ "((reg_$0<int a>) + (reg_$2<int b>)) + (reg_$5<int c>)", "reg_$8<int d>" ]
   // CHECK-NEXT: ],
   // CHECK-NEXT: "disequality_info": null,
 
@@ -28,14 +28,14 @@ void test(int a, int b, int c, int d) {
     return;
   clang_analyzer_printState();
   // CHECK:       "constraints": [
-  // CHECK-NEXT:    { "symbol": "(reg_$0<int a>) != (reg_$3<int d>)", "range": "{ [0, 0] }" },
-  // CHECK-NEXT:    { "symbol": "reg_$1<int b>", "range": "{ [0, 0] }" },
-  // CHECK-NEXT:    { "symbol": "reg_$2<int c>", "range": "{ [0, 0] }" }
+  // CHECK-NEXT:    { "symbol": "(reg_$0<int a>) != (reg_$8<int d>)", "range": "{ [0, 0] }" },
+  // CHECK-NEXT:    { "symbol": "reg_$2<int b>", "range": "{ [0, 0] }" },
+  // CHECK-NEXT:    { "symbol": "reg_$5<int c>", "range": "{ [0, 0] }" }
   // CHECK-NEXT:  ],
   // CHECK-NEXT:  "equivalence_classes": [
-  // CHECK-NEXT:    [ "(reg_$0<int a>) != (reg_$3<int d>)" ],
-  // CHECK-NEXT:    [ "reg_$0<int a>", "reg_$3<int d>" ],
-  // CHECK-NEXT:    [ "reg_$2<int c>" ]
+  // CHECK-NEXT:    [ "(reg_$0<int a>) != (reg_$8<int d>)" ],
+  // CHECK-NEXT:    [ "reg_$0<int a>", "reg_$8<int d>" ],
+  // CHECK-NEXT:    [ "reg_$5<int c>" ]
   // CHECK-NEXT:  ],
   // CHECK-NEXT:  "disequality_info": null,
 
diff --git a/clang/test/Analysis/unary-sym-expr.c b/clang/test/Analysis/unary-sym-expr.c
index 92e11b2..64a01a9 100644
--- a/clang/test/Analysis/unary-sym-expr.c
+++ b/clang/test/Analysis/unary-sym-expr.c
@@ -11,9 +11,9 @@ int test(int x, int y) {
   clang_analyzer_dump(-x);       // expected-warning{{-reg_$0<int x>}}
   clang_analyzer_dump(~x);       // expected-warning{{~reg_$0<int x>}}
   int z = x + y;
-  clang_analyzer_dump(-z);       // expected-warning{{-((reg_$0<int x>) + (reg_$1<int y>))}}
-  clang_analyzer_dump(-(x + y)); // expected-warning{{-((reg_$0<int x>) + (reg_$1<int y>))}}
-  clang_analyzer_dump(-x + y);   // expected-warning{{(-reg_$0<int x>) + (reg_$1<int y>)}}
+  clang_analyzer_dump(-z);       // expected-warning{{-((reg_$0<int x>) + (reg_$3<int y>))}}
+  clang_analyzer_dump(-(x + y)); // expected-warning{{-((reg_$0<int x>) + (reg_$3<int y>))}}
+  clang_analyzer_dump(-x + y);   // expected-warning{{(-reg_$0<int x>) + (reg_$3<int y>)}}
 
   if (-x == 0) {
     clang_analyzer_eval(-x == 0); // expected-warning{{TRUE}}
diff --git a/clang/test/Analysis/z3-crosscheck-max-attempts.cpp b/clang/test/Analysis/z3-crosscheck-max-attempts.cpp
new file mode 100644
index 0000000..7951d26
--- /dev/null
+++ b/clang/test/Analysis/z3-crosscheck-max-attempts.cpp
@@ -0,0 +1,42 @@
+// Check the default config.
+// RUN: %clang_analyze_cc1 -analyzer-checker=debug.ConfigDumper 2>&1 \
+// RUN: | FileCheck %s --match-full-lines
+// CHECK: crosscheck-with-z3-max-attempts-per-query = 3
+
+// RUN: rm -rf %t && mkdir %t
+// RUN: %host_cxx -shared -fPIC                           \
+// RUN:   %S/z3/Inputs/MockZ3_solver_check.cpp            \
+// RUN:   -o %t/MockZ3_solver_check.so
+
+// DEFINE: %{mocked_clang} =                              \
+// DEFINE: LD_PRELOAD="%t/MockZ3_solver_check.so"         \
+// DEFINE: %clang_cc1 %s -analyze -setup-static-analyzer  \
+// DEFINE:   -analyzer-config crosscheck-with-z3=true     \
+// DEFINE:   -analyzer-checker=core
+
+// DEFINE: %{attempts} = -analyzer-config crosscheck-with-z3-max-attempts-per-query
+
+// RUN: not %clang_analyze_cc1 %{attempts}=0 2>&1 | FileCheck %s --check-prefix=VERIFY-INVALID
+// VERIFY-INVALID: invalid input for analyzer-config option 'crosscheck-with-z3-max-attempts-per-query', that expects a positive value
+
+// RUN: Z3_SOLVER_RESULTS="UNDEF"             %{mocked_clang} %{attempts}=1 -verify=refuted
+// RUN: Z3_SOLVER_RESULTS="UNSAT"             %{mocked_clang} %{attempts}=1 -verify=refuted
+// RUN: Z3_SOLVER_RESULTS="SAT"               %{mocked_clang} %{attempts}=1 -verify=accepted
+
+// RUN: Z3_SOLVER_RESULTS="UNDEF,UNDEF"       %{mocked_clang} %{attempts}=2 -verify=refuted
+// RUN: Z3_SOLVER_RESULTS="UNDEF,UNSAT"       %{mocked_clang} %{attempts}=2 -verify=refuted
+// RUN: Z3_SOLVER_RESULTS="UNDEF,SAT"         %{mocked_clang} %{attempts}=2 -verify=accepted
+
+// RUN: Z3_SOLVER_RESULTS="UNDEF,UNDEF,UNDEF" %{mocked_clang} %{attempts}=3 -verify=refuted
+// RUN: Z3_SOLVER_RESULTS="UNDEF,UNDEF,UNSAT" %{mocked_clang} %{attempts}=3 -verify=refuted
+// RUN: Z3_SOLVER_RESULTS="UNDEF,UNDEF,SAT"   %{mocked_clang} %{attempts}=3 -verify=accepted
+
+
+// REQUIRES: z3, asserts, shell, system-linux
+
+// refuted-no-diagnostics
+
+int div_by_zero_test(int b) {
+  if (b) {}
+  return 100 / b; // accepted-warning {{Division by zero}}
+}
diff --git a/clang/test/Analysis/z3/D83660.c b/clang/test/Analysis/z3/D83660.c
index fd46333..b42d934 100644
--- a/clang/test/Analysis/z3/D83660.c
+++ b/clang/test/Analysis/z3/D83660.c
@@ -1,21 +1,18 @@
 // RUN: rm -rf %t && mkdir %t
-// RUN: %host_cxx -shared -fPIC %S/Inputs/MockZ3_solver_check.c -o %t/MockZ3_solver_check.so
+// RUN: %host_cxx -shared -fPIC \
+// RUN:   %S/Inputs/MockZ3_solver_check.cpp \
+// RUN:   -o %t/MockZ3_solver_check.so
 //
-// RUN: LD_PRELOAD="%t/MockZ3_solver_check.so"                                       \
-// RUN: %clang_cc1 -analyze -analyzer-constraints=z3 -setup-static-analyzer          \
-// RUN:   -analyzer-checker=core,debug.ExprInspection %s -verify 2>&1 | FileCheck %s
+// RUN: Z3_SOLVER_RESULTS="SAT,SAT,SAT,SAT,UNDEF" \
+// RUN: LD_PRELOAD="%t/MockZ3_solver_check.so" \
+// RUN: %clang_cc1 -analyze -analyzer-constraints=z3 -setup-static-analyzer \
+// RUN:   -analyzer-checker=core %s -verify
 //
 // REQUIRES: z3, asserts, shell, system-linux
 //
 // Works only with the z3 constraint manager.
 // expected-no-diagnostics
 
-// CHECK:      Z3_solver_check returns the real value: TRUE
-// CHECK-NEXT: Z3_solver_check returns the real value: TRUE
-// CHECK-NEXT: Z3_solver_check returns the real value: TRUE
-// CHECK-NEXT: Z3_solver_check returns the real value: TRUE
-// CHECK-NEXT: Z3_solver_check returns a mocked value: UNDEF
-
 void D83660(int b) {
   if (b) {
   }
diff --git a/clang/test/Analysis/z3/Inputs/MockZ3_solver_check.c b/clang/test/Analysis/z3/Inputs/MockZ3_solver_check.c
deleted file mode 100644
index 9c63a64..0000000
--- a/clang/test/Analysis/z3/Inputs/MockZ3_solver_check.c
+++ /dev/null
@@ -1,28 +0,0 @@
-#include <dlfcn.h>
-#include <stdio.h>
-
-#include <z3.h>
-
-// Mock implementation: return UNDEF for the 5th invocation, otherwise it just
-// returns the result of the real invocation.
-Z3_lbool Z3_API Z3_solver_check(Z3_context c, Z3_solver s) {
-  static Z3_lbool (*OriginalFN)(Z3_context, Z3_solver);
-  if (!OriginalFN) {
-    OriginalFN = (Z3_lbool(*)(Z3_context, Z3_solver))dlsym(RTLD_NEXT,
-                                                           "Z3_solver_check");
-  }
-
-  // Invoke the actual solver.
-  Z3_lbool Result = OriginalFN(c, s);
-
-  // Mock the 5th invocation to return UNDEF.
-  static unsigned int Counter = 0;
-  if (++Counter == 5) {
-    fprintf(stderr, "Z3_solver_check returns a mocked value: UNDEF\n");
-    return Z3_L_UNDEF;
-  }
-  fprintf(stderr, "Z3_solver_check returns the real value: %s\n",
-          (Result == Z3_L_UNDEF ? "UNDEF"
-                                : ((Result == Z3_L_TRUE ? "TRUE" : "FALSE"))));
-  return Result;
-}
diff --git a/clang/test/Analysis/z3/Inputs/MockZ3_solver_check.cpp b/clang/test/Analysis/z3/Inputs/MockZ3_solver_check.cpp
new file mode 100644
index 0000000..54cd86d
--- /dev/null
+++ b/clang/test/Analysis/z3/Inputs/MockZ3_solver_check.cpp
@@ -0,0 +1,62 @@
+#include <cassert>
+#include <dlfcn.h>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+
+#include <z3.h>
+
+static char *Z3ResultsBegin;
+static char *Z3ResultsCursor;
+
+static __attribute__((constructor)) void init() {
+  const char *Env = getenv("Z3_SOLVER_RESULTS");
+  if (!Env) {
+    fprintf(stderr, "Z3_SOLVER_RESULTS envvar must be defined; abort\n");
+    abort();
+  }
+  Z3ResultsBegin = strdup(Env);
+  Z3ResultsCursor = Z3ResultsBegin;
+  if (!Z3ResultsBegin) {
+    fprintf(stderr, "strdup failed; abort\n");
+    abort();
+  }
+}
+
+static __attribute__((destructor)) void finit() {
+  if (strlen(Z3ResultsCursor) > 0) {
+    fprintf(stderr, "Z3_SOLVER_RESULTS should have been completely consumed "
+                    "by the end of the test; abort\n");
+    abort();
+  }
+  free(Z3ResultsBegin);
+}
+
+static bool consume_token(char **pointer_to_cursor, const char *token) {
+  assert(pointer_to_cursor);
+  int len = strlen(token);
+  if (*pointer_to_cursor && strncmp(*pointer_to_cursor, token, len) == 0) {
+    *pointer_to_cursor += len;
+    return true;
+  }
+  return false;
+}
+
+Z3_lbool Z3_API Z3_solver_check(Z3_context c, Z3_solver s) {
+  consume_token(&Z3ResultsCursor, ",");
+
+  if (consume_token(&Z3ResultsCursor, "UNDEF")) {
+    printf("Z3_solver_check returns UNDEF\n");
+    return Z3_L_UNDEF;
+  }
+  if (consume_token(&Z3ResultsCursor, "SAT")) {
+    printf("Z3_solver_check returns SAT\n");
+    return Z3_L_TRUE;
+  }
+  if (consume_token(&Z3ResultsCursor, "UNSAT")) {
+    printf("Z3_solver_check returns UNSAT\n");
+    return Z3_L_FALSE;
+  }
+  fprintf(stderr, "Z3_SOLVER_RESULTS was exhausted; abort\n");
+  abort();
+}
diff --git a/clang/test/CIR/global-var-simple.cpp b/clang/test/CIR/global-var-simple.cpp
index bbd4526..ffcc3ef 100644
--- a/clang/test/CIR/global-var-simple.cpp
+++ b/clang/test/CIR/global-var-simple.cpp
@@ -13,11 +13,11 @@ unsigned char uc;
 short ss;
 // CHECK: cir.global @ss : !cir.int<s, 16>
 
-unsigned short us;
-// CHECK: cir.global @us : !cir.int<u, 16>
+unsigned short us = 100;
+// CHECK: cir.global @us = #cir.int<100> : !cir.int<u, 16>
 
-int si;
-// CHECK: cir.global @si : !cir.int<s, 32>
+int si = 42;
+// CHECK: cir.global @si = #cir.int<42> : !cir.int<s, 32>
 
 unsigned ui;
 // CHECK: cir.global @ui : !cir.int<u, 32>
@@ -31,8 +31,8 @@ unsigned long ul;
 long long sll;
 // CHECK: cir.global @sll : !cir.int<s, 64>
 
-unsigned long long ull;
-// CHECK: cir.global @ull : !cir.int<u, 64>
+unsigned long long ull = 123456;
+// CHECK: cir.global @ull = #cir.int<123456> : !cir.int<u, 64>
 
 __int128 s128;
 // CHECK: cir.global @s128 : !cir.int<s, 128>
@@ -67,8 +67,8 @@ __bf16 bf16;
 float f;
 // CHECK: cir.global @f : !cir.float
 
-double d;
-// CHECK: cir.global @d : !cir.double
+double d = 1.25;
+// CHECK: cir.global @d = #cir.fp<1.250000e+00> : !cir.double
 
 long double ld;
 // CHECK: cir.global @ld : !cir.long_double<!cir.f80>
@@ -79,8 +79,8 @@ __float128 f128;
 void *vp;
 // CHECK: cir.global @vp : !cir.ptr<!cir.void>
 
-int *ip;
-// CHECK: cir.global @ip : !cir.ptr<!cir.int<s, 32>>
+int *ip = 0;
+// CHECK: cir.global @ip = #cir.ptr<null> : !cir.ptr<!cir.int<s, 32>>
 
 double *dp;
 // CHECK: cir.global @dp : !cir.ptr<!cir.double>
@@ -91,8 +91,8 @@ char **cpp;
 void (*fp)();
 // CHECK: cir.global @fp : !cir.ptr<!cir.func<!cir.void ()>>
 
-int (*fpii)(int);
-// CHECK: cir.global @fpii : !cir.ptr<!cir.func<!cir.int<s, 32> (!cir.int<s, 32>)>>
+int (*fpii)(int) = 0;
+// CHECK: cir.global @fpii = #cir.ptr<null> : !cir.ptr<!cir.func<!cir.int<s, 32> (!cir.int<s, 32>)>>
 
 void (*fpvar)(int, ...);
 // CHECK: cir.global @fpvar : !cir.ptr<!cir.func<!cir.void (!cir.int<s, 32>, ...)>>
diff --git a/clang/test/CXX/basic/basic.link/p3.cpp b/clang/test/CXX/basic/basic.link/p3.cpp
index 23f39d1..0120226 100644
--- a/clang/test/CXX/basic/basic.link/p3.cpp
+++ b/clang/test/CXX/basic/basic.link/p3.cpp
@@ -15,7 +15,8 @@ export module m; // #1
 
 // Import errors are fatal, so we test them in isolation.
 #if IMPORT_ERROR == 1
-import x = {}; // expected-error {{module 'x' not found}}
+import x = {}; // expected-error {{expected ';' after module name}}
+               // expected-error@-1 {{module 'x' not found}}
 
 #elif IMPORT_ERROR == 2
 struct X;
diff --git a/clang/test/CXX/drs/cwg0xx.cpp b/clang/test/CXX/drs/cwg0xx.cpp
index 993b6f2..15f4694 100644
--- a/clang/test/CXX/drs/cwg0xx.cpp
+++ b/clang/test/CXX/drs/cwg0xx.cpp
@@ -1,9 +1,10 @@
-// RUN: %clang_cc1 -std=c++98 %s -verify=expected,cxx98,cxx98-14 -fexceptions -fcxx-exceptions -pedantic-errors -Wno-bind-to-temporary-copy
+// RUN: %clang_cc1 -std=c++98 %s -verify=expected,cxx98,cxx98-14 -fexceptions -fcxx-exceptions -pedantic-errors
 // RUN: %clang_cc1 -std=c++11 %s -verify=expected,since-cxx11,cxx98-14,cxx11-14 -fexceptions -fcxx-exceptions -pedantic-errors -triple %itanium_abi_triple
 // RUN: %clang_cc1 -std=c++14 %s -verify=expected,since-cxx11,cxx98-14,cxx11-14 -fexceptions -fcxx-exceptions -pedantic-errors -triple %itanium_abi_triple
 // RUN: %clang_cc1 -std=c++17 %s -verify=expected,since-cxx11,since-cxx17 -fexceptions -fcxx-exceptions -pedantic-errors -triple %itanium_abi_triple
 // RUN: %clang_cc1 -std=c++20 %s -verify=expected,since-cxx11,since-cxx17 -fexceptions -fcxx-exceptions -pedantic-errors -triple %itanium_abi_triple
 // RUN: %clang_cc1 -std=c++23 %s -verify=expected,since-cxx11,since-cxx17 -fexceptions -fcxx-exceptions -pedantic-errors -triple %itanium_abi_triple
+// RUN: %clang_cc1 -std=c++2c %s -verify=expected,since-cxx11,since-cxx17 -fexceptions -fcxx-exceptions -pedantic-errors -triple %itanium_abi_triple
 
 #if __cplusplus == 199711L
 #define static_assert(...) __extension__ _Static_assert(__VA_ARGS__)
@@ -53,16 +54,16 @@ namespace cwg1 { // cwg1: no
     // FIXME: This should be rejected, due to the ambiguous default argument.
     i();
   }
-}
+} // namespace cwg1
 
-namespace cwg3 { // cwg3: yes
+namespace cwg3 { // cwg3: 2.7
   template<typename T> struct A {};
   template<typename T> void f(T) { A<T> a; } // #cwg3-f-T
   template void f(int);
   template<> struct A<int> {};
   // expected-error@-1 {{explicit specialization of 'cwg3::A<int>' after instantiation}}
   //   expected-note@#cwg3-f-T {{implicit instantiation first required here}}
-}
+} // namespace cwg3
 
 namespace cwg4 { // cwg4: 2.8
   extern "C" {
@@ -73,7 +74,7 @@ namespace cwg4 { // cwg4: 2.8
     // expected-error@-1 {{conflicting types for 'cwg4_g'}}
     //   expected-note@#cwg4-g-int {{previous definition is here}}
   }
-}
+} // namespace cwg4
 
 namespace cwg5 { // cwg5: 3.1
   struct A {} a;
@@ -87,7 +88,7 @@ namespace cwg5 { // cwg5: 3.1
   struct D : C {};
   struct E { operator D&(); } e;
   const C c = e;
-}
+} // namespace cwg5
 
 namespace cwg7 { // cwg7: 3.4
   class A { public: ~A(); };
@@ -117,7 +118,7 @@ namespace cwg7 { // cwg7: 3.4
     };
     S5::S5() {}
   }
-}
+} // namespace cwg7
 
 namespace cwg8 { // cwg8: dup 45
   class A {
@@ -129,7 +130,7 @@ namespace cwg8 { // cwg8: dup 45
     T<U, k, &A::f> *g();
   };
   A::T<A::U, A::k, &A::f> *A::g() { return 0; }
-}
+} // namespace cwg8
 
 namespace cwg9 { // cwg9: 2.8
   struct B {
@@ -145,7 +146,7 @@ namespace cwg9 { // cwg9: 2.8
   //   expected-note@#cwg9-N {{constrained by protected inheritance here}}
   //   expected-note@#cwg9-m {{member is declared here}}
   int R2() { return n.m; }
-}
+} // namespace cwg9
 
 namespace cwg10 { // cwg10: dup 45
   class A {
@@ -153,9 +154,9 @@ namespace cwg10 { // cwg10: dup 45
       A::B *p;
     };
   };
-}
+} // namespace cwg10
 
-namespace cwg11 { // cwg11: yes
+namespace cwg11 { // cwg11: 2.7
   template<typename T> struct A : T {
     using typename T::U;
     U u;
@@ -167,7 +168,7 @@ namespace cwg11 { // cwg11: yes
   };
   struct X { typedef int U; };
   A<X> ax;
-}
+} // namespace cwg11
 
 namespace cwg12 { // cwg12: sup 239
   enum E { e };
@@ -179,7 +180,7 @@ namespace cwg12 { // cwg12: sup 239
     int &b = f(e);
     int &c = f(1);
   }
-}
+} // namespace cwg12
 
 namespace cwg13 { // cwg13: no
   extern "C" void f(int);
@@ -194,7 +195,7 @@ namespace cwg13 { // cwg13: no
   A<char> a2(g);
   int a3 = h(f); // FIXME: We should reject this.
   int a4 = h(g);
-}
+} // namespace cwg13
 
 namespace cwg14 { // cwg14: 3.4
   namespace X { extern "C" int cwg14_f(); }
@@ -218,14 +219,14 @@ namespace cwg14 { // cwg14: 3.4
   // expected-error@-1 {{reference to 'U' is ambiguous}}
   //   expected-note@#cwg14-X-U {{candidate found by name lookup is 'cwg14::X::U'}}
   //   expected-note@#cwg14-Y-U {{candidate found by name lookup is 'cwg14::Y::U'}}
-}
+} // namespace cwg14
 
-namespace cwg15 { // cwg15: yes
+namespace cwg15 { // cwg15: 2.7
   template<typename T> void f(int); // #cwg15-f-decl-first
   template<typename T> void f(int = 0);
   // expected-error@-1 {{default arguments cannot be added to a function template that has already been declared}}
   //   expected-note@#cwg15-f-decl-first {{previous template declaration is here}}
-}
+} // namespace cwg15
 
 namespace cwg16 { // cwg16: 2.8
   class A { // #cwg16-A
@@ -247,9 +248,9 @@ namespace cwg16 { // cwg16: 2.8
       //   expected-note@#cwg16-B {{implicitly declared private here}}
     }
   };
-}
+} // namespace cwg16
 
-namespace cwg17 { // cwg17: yes
+namespace cwg17 { // cwg17: 2.7
   class A {
     int n;
     int f();
@@ -260,7 +261,7 @@ namespace cwg17 { // cwg17: yes
   struct A::C : A {
     int g() { return n; }
   };
-}
+} // namespace cwg17
 
 // cwg18: sup 577
 
@@ -278,7 +279,7 @@ namespace cwg19 { // cwg19: 3.1
     //   expected-note@#cwg19-n {{member is declared here}}
     int get2() { return ((A&)c).n; } // ok, A is an accessible base of B from here
   };
-}
+} // namespace cwg19
 
 namespace cwg20 { // cwg20: 2.8
   class X {
@@ -291,7 +292,7 @@ namespace cwg20 { // cwg20: 2.8
   X x = f();
   // expected-error@-1 {{calling a private constructor of class 'cwg20::X'}}
   //   expected-note@#cwg20-X-ctor {{declared private here}}
-}
+} // namespace cwg20
 
 namespace cwg21 { // cwg21: 3.4
   template<typename T> struct A;
@@ -301,27 +302,27 @@ namespace cwg21 { // cwg21: 3.4
     template<typename T = int> friend struct B;
     // expected-error@-1 {{default template argument not permitted on a friend template}}
   };
-}
+} // namespace cwg21
 
 namespace cwg22 { // cwg22: sup 481
   template<typename cwg22_T = cwg22_T> struct X;
   // expected-error@-1 {{unknown type name 'cwg22_T'}}
   typedef int T;
   template<typename T = T> struct Y;
-}
+} // namespace cwg22
 
-namespace cwg23 { // cwg23: yes
+namespace cwg23 { // cwg23: 2.7
   template<typename T> void f(T, T); // #cwg23-f-T-T
   template<typename T> void f(T, int); // #cwg23-f-T-int
   void g() { f(0, 0); }
   // expected-error@-1 {{call to 'f' is ambiguous}}
   //   expected-note@#cwg23-f-T-T {{candidate function [with T = int]}}
   //   expected-note@#cwg23-f-T-int {{candidate function [with T = int]}}
-}
+} // namespace cwg23
 
 // cwg24: na
 
-namespace cwg25 { // cwg25: yes
+namespace cwg25 { // cwg25: 4
   struct A {
     void f() throw(int);
     // since-cxx17-error@-1 {{ISO C++17 does not allow dynamic exception specifications}}
@@ -354,9 +355,9 @@ namespace cwg25 { // cwg25: yes
     // since-cxx17-error@-2 {{different exception specifications}}
     j = &A::f;
   }
-}
+} // namespace cwg25
 
-namespace cwg26 { // cwg26: yes
+namespace cwg26 { // cwg26: 2.7
   struct A { A(A, const A & = A()); };
   // expected-error@-1 {{copy constructor must pass its first argument by reference}}
   struct B {
@@ -374,12 +375,12 @@ namespace cwg26 { // cwg26: yes
     // expected-error@-1 {{recursive evaluation of default argument}}
     //   expected-note@-2 {{default argument used here}}
   };
-}
+} // namespace cwg26
 
-namespace cwg27 { // cwg27: yes
+namespace cwg27 { // cwg27: 2.7
   enum E { e } n;
   E &m = true ? n : n;
-}
+} // namespace cwg27
 
 // cwg28: na lib
 
@@ -440,7 +441,7 @@ namespace cwg29 { // cwg29: 3.4
     // expected-error@-1 {{declaration of 'cwg29_f8' has a different language linkage}}
     //   expected-note@#cwg29-f8 {{previous declaration is here}}
   }
-}
+} // namespace cwg29
 
 namespace cwg30 { // cwg30: sup 468 c++11
   struct A {
@@ -453,7 +454,7 @@ namespace cwg30 { // cwg30: sup 468 c++11
   // cxx98-error@-1 {{'template' keyword outside of a template}}
   int z = p->template f<0>();
   // cxx98-error@-1 {{'template' keyword outside of a template}}
-}
+} // namespace cwg30
 
 namespace cwg31 { // cwg31: 2.8
   class X {
@@ -465,7 +466,7 @@ namespace cwg31 { // cwg31: 2.8
   X *p = new X;
   // expected-error@-1 {{'operator delete' is a private member of 'cwg31::X'}}
   //   expected-note@#cwg31-delete {{declared private here}}
-}
+} // namespace cwg31
 
 // cwg32: na
 
@@ -510,7 +511,7 @@ namespace cwg33 { // cwg33: 9
     int m = Q() + X().f<int>; // ok
     int n = Q() + (&(X().f<int>)); // ok
   }
-}
+} // namespace cwg33
 
 // cwg34: na
 // cwg35: dup 178
@@ -618,15 +619,15 @@ namespace example4 {
     //   expected-note@#cwg36-E-k-first {{previous using declaration}}
   };
 }
-}
+} // namespace cwg36
 
 // cwg37: sup 475
 
-namespace cwg38 { // cwg38: yes
+namespace cwg38 { // cwg38: 2.7
   template<typename T> struct X {};
   template<typename T> X<T> operator+(X<T> a, X<T> b) { return a; }
   template X<int> operator+<int>(X<int>, X<int>);
-}
+} // namespace cwg38
 
 namespace cwg39 { // cwg39: no
   namespace example1 {
@@ -708,25 +709,25 @@ namespace cwg39 { // cwg39: no
     // expected-error@#cwg39-sizeof {{unknown type name}}
 #if __cplusplus >= 201103L
     decltype(D::n) n;
-    /* expected-error@-1
+    /* since-cxx11-error@-1
     {{non-static member 'n' found in multiple base-class subobjects of type 'A':
     struct cwg39::PR5916::D -> B -> A
     struct cwg39::PR5916::D -> C -> A}} */
-    //   expected-note@#cwg39-A-n {{member found by ambiguous name lookup}}
+    //   since-cxx11-note@#cwg39-A-n {{member found by ambiguous name lookup}}
 #endif
   }
-}
+} // namespace cwg39
 
 // cwg40: na
 
-namespace cwg41 { // cwg41: yes
+namespace cwg41 { // cwg41: 2.7
   struct S f(S);
-}
+} // namespace cwg41
 
-namespace cwg42 { // cwg42: yes
+namespace cwg42 { // cwg42: 2.7
   struct A { static const int k = 0; };
   struct B : A { static const int k = A::k; };
-}
+} // namespace cwg42
 
 // cwg43: na
 
@@ -735,21 +736,21 @@ namespace cwg44 { // cwg44: sup 727
     template<int> void f();
     template<> void f<0>();
   };
-}
+} // namespace cwg44
 
-namespace cwg45 { // cwg45: yes
+namespace cwg45 { // cwg45: 2.7
   class A {
     class B {};
     class C : B {};
     C c;
   };
-}
+} // namespace cwg45
 
-namespace cwg46 { // cwg46: yes
+namespace cwg46 { // cwg46: 2.7
   template<typename> struct A { template<typename> struct B {}; };
   template template struct A<int>::B<int>;
   // expected-error@-1 {{expected unqualified-id}}
-}
+} // namespace cwg46
 
 namespace cwg47 { // cwg47: sup 329
   template<typename T> struct A {
@@ -763,9 +764,9 @@ namespace cwg47 { // cwg47: sup 329
 
   void f();
   void g() { f(); }
-}
+} // namespace cwg47
 
-namespace cwg48 { // cwg48: yes
+namespace cwg48 { // cwg48: 2.7
   namespace {
     struct S {
       static const int m = 0;
@@ -779,7 +780,7 @@ namespace cwg48 { // cwg48: yes
   const int &b = S::n;
   const int S::o;
   const int &c = S::o;
-}
+} // namespace cwg48
 
 namespace cwg49 { // cwg49: 2.8
   template<int*> struct A {}; // #cwg49-A
@@ -805,9 +806,9 @@ namespace cwg49 { // cwg49: 2.8
   // since-cxx17-error@#cwg49-c {{non-type template argument is not a constant expression}}
   //   since-cxx17-note@#cwg49-c {{read of non-constexpr variable 'q' is not allowed in a constant expression}}
   //   since-cxx17-note@#cwg49-q {{declared here}}
-}
+} // namespace cwg49
 
-namespace cwg50 { // cwg50: yes
+namespace cwg50 { // cwg50: 2.7
   struct X; // #cwg50-X
   extern X *p;
   X *q = (X*)p;
@@ -817,7 +818,7 @@ namespace cwg50 { // cwg50: yes
   X *u = dynamic_cast<X*>(p);
   // expected-error@-1 {{'cwg50::X' is an incomplete type}}
   //   expected-note@#cwg50-X {{forward declaration of 'cwg50::X'}}
-}
+} // namespace cwg50
 
 namespace cwg51 { // cwg51: 2.8
   struct A {};
@@ -827,7 +828,7 @@ namespace cwg51 { // cwg51: 2.8
     operator B&();
   } s;
   A &a = s;
-}
+} // namespace cwg51
 
 namespace cwg52 { // cwg52: 2.8
   struct A { int n; }; // #cwg52-A
@@ -839,12 +840,12 @@ namespace cwg52 { // cwg52: 2.8
   //   expected-note@#cwg52-A {{member is declared here}}
   // expected-error@#cwg52-k {{cannot cast 'struct B' to its private base class 'cwg52::A'}}
   //   expected-note@#cwg52-B {{declared private here}}
-}
+} // namespace cwg52
 
-namespace cwg53 { // cwg53: yes
+namespace cwg53 { // cwg53: 2.7
   int n = 0;
   enum E { e } x = static_cast<E>(n);
-}
+} // namespace cwg53
 
 namespace cwg54 { // cwg54: 2.8
   struct A { int a; } a;
@@ -898,14 +899,14 @@ namespace cwg54 { // cwg54: 2.8
   // expected-error@-1 {{cannot cast 'cwg54::V *' to 'B *' via virtual base 'cwg54::V'}}
   int B::*cmbv = (int B::*)(&V::v);
   // expected-error@-1 {{conversion from pointer to member of class 'cwg54::V' to pointer to member of class 'B' via virtual base 'cwg54::V' is not allowed}}
-}
+} // namespace cwg54
 
-namespace cwg55 { // cwg55: yes
+namespace cwg55 { // cwg55: 2.7
   enum E { e = 5 };
   static_assert(e + 1 == 6, "");
-}
+} // namespace cwg55
 
-namespace cwg56 { // cwg56: yes
+namespace cwg56 { // cwg56: 2.7
   struct A {
     typedef int T; // #cwg56-typedef-int-T-first
     typedef int T;
@@ -919,7 +920,7 @@ namespace cwg56 { // cwg56: yes
     // expected-error@-1 {{redefinition of 'X'}}
     //   expected-note@#cwg56-typedef-X-X-first {{previous definition is here}}
   };
-}
+} // namespace cwg56
 
 namespace cwg58 { // cwg58: 3.1
   // FIXME: Ideally, we should have a CodeGen test for this.
@@ -930,9 +931,9 @@ namespace cwg58 { // cwg58: 3.1
   static_assert(X{E1_1, E2_m1}.e1 == 1, "");
   static_assert(X{E1_1, E2_m1}.e2 == -1, "");
 #endif
-}
+} // namespace cwg58
 
-namespace cwg59 { // cwg59: yes
+namespace cwg59 { // cwg59: 2.7
 #pragma clang diagnostic push
 #pragma clang diagnostic ignored "-Wdeprecated-volatile"
   template<typename T> struct convert_to { operator T() const; };
@@ -987,14 +988,14 @@ namespace cwg59 { // cwg59: yes
   int n4 = convert_to<const volatile int>();
   int n5 = convert_to<const volatile int&>();
 #pragma clang diagnostic pop
-}
+} // namespace cwg59
 
-namespace cwg60 { // cwg60: yes
+namespace cwg60 { // cwg60: 2.7
   void f(int &);
   int &f(...);
   const int k = 0;
   int &n = f(k);
-}
+} // namespace cwg60
 
 namespace cwg61 { // cwg61: 3.4
   struct X {
@@ -1011,7 +1012,7 @@ namespace cwg61 { // cwg61: 3.4
   // expected-error@-1 {{cannot create a non-constant pointer to member function}}
   void (*r)() = y.f;
   // expected-error@-1 {{cannot create a non-constant pointer to member function}}
-}
+} // namespace cwg61
 
 namespace cwg62 { // cwg62: 2.9
   struct A {
@@ -1064,21 +1065,21 @@ namespace cwg62 { // cwg62: 2.9
     X<int NoLinkage::*> d;
     // cxx98-error@-1 {{template argument uses local type }}
   }
-}
+} // namespace cwg62
 
-namespace cwg63 { // cwg63: yes
+namespace cwg63 { // cwg63: 2.7
   template<typename T> struct S { typename T::error e; };
   extern S<int> *p;
   void *q = p;
-}
+} // namespace cwg63
 
-namespace cwg64 { // cwg64: yes
+namespace cwg64 { // cwg64: 2.7
   template<class T> void f(T);
   template<class T> void f(T*);
   template<> void f(int*);
   template<> void f<int>(int*);
   template<> void f(int);
-}
+} // namespace cwg64
 
 // cwg65: na
 
@@ -1099,7 +1100,7 @@ namespace cwg66 { // cwg66: no
   int c = f(1, 2);
   // expected-error@-1 {{no matching function for call to 'f'}}
   //   expected-note@#cwg66-f-first {{candidate function not viable: requires single argument 'n', but 2 arguments were provided}}
-}
+} // namespace cwg66
 
 // cwg67: na
 
@@ -1120,7 +1121,7 @@ namespace cwg68 { // cwg68: 2.8
     friend typename ::cwg68::X<double>;
     // cxx98-error@-1 {{unelaborated friend declaration is a C++11 extension; specify 'struct' to befriend 'typename ::cwg68::X<double>'}}
   };
-}
+} // namespace cwg68
 
 namespace cwg69 { // cwg69: 9
   template<typename T> static void f() {} // #cwg69-f
@@ -1133,59 +1134,59 @@ namespace cwg69 { // cwg69: 9
   Q<&f<int> > q;
   // cxx98-error@-1 {{non-type template argument referring to function 'f<int>' with internal linkage is a C++11 extension}}
   //   cxx98-note@#cwg69-f {{non-type template argument refers to function here}}
-}
+} // namespace cwg69
 
-namespace cwg70 { // cwg70: yes
+namespace cwg70 { // cwg70: 2.7
   template<int> struct A {};
   template<int I, int J> int f(int (&)[I + J], A<I>, A<J>);
   int arr[7];
   int k = f(arr, A<3>(), A<4>());
-}
+} // namespace cwg70
 
 // cwg71: na
 // cwg72: dup 69
 
-#if __cplusplus >= 201103L
 namespace cwg73 { // cwg73: sup 1652
+#if __cplusplus >= 201103L
   int a, b;
   static_assert(&a + 1 != &b, "");
-  // expected-error@-1 {{static assertion expression is not an integral constant expression}}
-  //   expected-note@-2 {{comparison against pointer '&a + 1' that points past the end of a complete object has unspecified value}}
-}
+  // since-cxx11-error@-1 {{static assertion expression is not an integral constant expression}}
+  //   since-cxx11-note@-2 {{comparison against pointer '&a + 1' that points past the end of a complete object has unspecified value}}
 #endif
+} // namespace cwg73
 
-namespace cwg74 { // cwg74: yes
+namespace cwg74 { // cwg74: 2.7
   enum E { k = 5 };
   int (*p)[k] = new int[k][k];
-}
+} // namespace cwg74
 
-namespace cwg75 { // cwg75: yes
+namespace cwg75 { // cwg75: 2.7
   struct S {
     static int n = 0;
     // expected-error@-1 {{non-const static data member must be initialized out of line}}
   };
-}
+} // namespace cwg75
 
-namespace cwg76 { // cwg76: yes
+namespace cwg76 { // cwg76: 2.7
   const volatile int n = 1;
   static_assert(n, "");
   // expected-error@-1 {{static assertion expression is not an integral constant expression}}
   //   expected-note@-2 {{read of volatile-qualified type 'const volatile int' is not allowed in a constant expression}}
-}
+} // namespace cwg76
 
-namespace cwg77 { // cwg77: yes
+namespace cwg77 { // cwg77: 2.7
   struct A {
     struct B {};
     friend struct B;
   };
-}
+} // namespace cwg77
 
 namespace cwg78 { // cwg78: sup ????
   // Under CWG78, this is valid, because 'k' has static storage duration, so is
   // zero-initialized.
   const int k;
   // expected-error@-1 {{default initialization of an object of const type 'const int'}}
-}
+} // namespace cwg78
 
 // cwg79: na
 
@@ -1207,18 +1208,18 @@ namespace cwg80 { // cwg80: 2.9
     int D;
     // expected-error@-1 {{member 'D' has the same name as its class}}
   };
-}
+} // namespace cwg80
 
 // cwg81: na
 // cwg82: dup 48
 
-namespace cwg83 { // cwg83: yes
+namespace cwg83 { // cwg83: 2.7
   int &f(const char*);
   char &f(char *);
   int &k = f("foo");
-}
+} // namespace cwg83
 
-namespace cwg84 { // cwg84: yes
+namespace cwg84 { // cwg84: 2.7
   struct B;
   struct A { operator B() const; };
   struct C {};
@@ -1234,7 +1235,7 @@ namespace cwg84 { // cwg84: yes
   // cxx98-14-error@-1 {{no viable constructor copying variable of type 'B'}}
   //   cxx98-14-note@#cwg84-copy-ctor {{candidate constructor not viable: expects an lvalue for 1st argument}}
   //   cxx98-14-note@#cwg84-ctor-from-C {{candidate constructor not viable: no known conversion from 'B' to 'C' for 1st argument}}
-}
+} // namespace cwg84
 
 namespace cwg85 { // cwg85: 3.4
   struct A {
@@ -1254,14 +1255,14 @@ namespace cwg85 { // cwg85: 3.4
     enum E1 : int;
     enum E1 : int { e1 }; // #cwg85-E1-def
     enum E1 : int;
-    // expected-error@-1 {{class member cannot be redeclared}}
-    //   expected-note@#cwg85-E1-def {{previous declaration is here}}
+    // since-cxx11-error@-1 {{class member cannot be redeclared}}
+    //   since-cxx11-note@#cwg85-E1-def {{previous declaration is here}}
 
     enum class E2;
     enum class E2 { e2 }; // #cwg85-E2-def
     enum class E2;
-    // expected-error@-1 {{class member cannot be redeclared}}
-    //   expected-note@#cwg85-E2-def {{previous declaration is here}}
+    // since-cxx11-error@-1 {{class member cannot be redeclared}}
+    //   since-cxx11-note@#cwg85-E2-def {{previous declaration is here}}
 #endif
   };
 
@@ -1272,7 +1273,7 @@ namespace cwg85 { // cwg85: 3.4
     // expected-error@-1 {{class member cannot be redeclared}}
     //   expected-note@#cwg85-C-B-def {{previous declaration is here}}
   };
-}
+} // namespace cwg85
 
 // cwg86: dup 446
 
@@ -1283,7 +1284,7 @@ namespace cwg87 { // cwg87: no
   X<void() throw()> x;
   // This is valid under cwg87 but not under cwg1975.
   X<void(void() throw())> y;
-}
+} // namespace cwg87
 
 namespace cwg88 { // cwg88: 2.8
   template<typename T> struct S {
@@ -1294,11 +1295,11 @@ namespace cwg88 { // cwg88: 2.8
   // expected-error@-1 {{static data member 'a' already has an initializer}}
   //   expected-note@#cwg88-a {{previous initialization is here}}
   template<> const int S<int>::b = 4;
-}
+} // namespace cwg88
 
 // cwg89: na
 
-namespace cwg90 { // cwg90: yes
+namespace cwg90 { // cwg90: 2.7
   struct A {
     template<typename T> friend void cwg90_f(T);
   };
@@ -1330,12 +1331,12 @@ namespace cwg90 { // cwg90: yes
     cwg90_g(F());
     // expected-error@-1 {{use of undeclared identifier 'cwg90_g'}}
   }
-}
+} // namespace cwg90
 
-namespace cwg91 { // cwg91: yes
+namespace cwg91 { // cwg91: 2.7
   union U { friend int f(U); };
   int k = f(U());
-}
+} // namespace cwg91
 
 namespace cwg92 { // cwg92: 4 c++17
   void f() throw(int, float);
@@ -1378,14 +1379,14 @@ namespace cwg92 { // cwg92: 4 c++17
   // since-cxx17-error@-1 {{ISO C++17 does not allow dynamic exception specifications}}
   //   since-cxx17-note@-2 {{use 'noexcept(false)' instead}}
   Y<&h> yp; // ok
-}
+} // namespace cwg92
 
 // cwg93: na
 
-namespace cwg94 { // cwg94: yes
+namespace cwg94 { // cwg94: 2.7
   struct A { static const int n = 5; };
   int arr[A::n];
-}
+} // namespace cwg94
 
 namespace cwg95 { // cwg95: 3.3
   struct A;
@@ -1402,7 +1403,7 @@ namespace cwg95 { // cwg95: 3.3
   struct B { void f() { N::C::f(); } };
   // expected-error@-1 {{'f' is a private member of 'cwg95::N::C'}}
   //   expected-note@#cwg95-C-f {{implicitly declared private here}}
-}
+} // namespace cwg95
 
 namespace cwg96 { // cwg96: sup P1787
   struct A {
@@ -1423,16 +1424,16 @@ namespace cwg96 { // cwg96: sup P1787
     A::template S<int> s;
     B<A::template S> b;
   }
-}
+} // namespace cwg96
 
-namespace cwg97 { // cwg97: yes
+namespace cwg97 { // cwg97: 2.7
   struct A {
     static const int a = false;
     static const int b = !a;
   };
-}
+} // namespace cwg97
 
-namespace cwg98 { // cwg98: yes
+namespace cwg98 { // cwg98: 2.7
   void test(int n) {
     switch (n) {
       try { // #cwg98-try
@@ -1458,11 +1459,11 @@ namespace cwg98 { // cwg98: yes
         //   expected-note@#cwg98-catch {{jump bypasses initialization of catch block}}
     }
   }
-}
+} // namespace cwg98
 
 namespace cwg99 { // cwg99: sup 214
   template<typename T> void f(T&);
   template<typename T> int &f(const T&);
   const int n = 0;
   int &r = f(n);
-}
+} // namespace cwg99
diff --git a/clang/test/CXX/drs/cwg10xx.cpp b/clang/test/CXX/drs/cwg10xx.cpp
index 58d5529..c5b96c4 100644
--- a/clang/test/CXX/drs/cwg10xx.cpp
+++ b/clang/test/CXX/drs/cwg10xx.cpp
@@ -4,6 +4,7 @@
 // RUN: %clang_cc1 -std=c++17 %s -verify=expected,since-cxx14 -fexceptions -fcxx-exceptions -pedantic-errors
 // RUN: %clang_cc1 -std=c++20 %s -verify=expected,since-cxx14 -fexceptions -fcxx-exceptions -pedantic-errors
 // RUN: %clang_cc1 -std=c++23 %s -verify=expected,since-cxx14 -fexceptions -fcxx-exceptions -pedantic-errors
+// RUN: %clang_cc1 -std=c++2c %s -verify=expected,since-cxx14 -fexceptions -fcxx-exceptions -pedantic-errors
 
 namespace std {
   __extension__ typedef __SIZE_TYPE__ size_t;
@@ -12,7 +13,7 @@ namespace std {
     const T *p; size_t n;
     initializer_list(const T *p, size_t n);
   };
-}
+} // namespace std
 
 namespace cwg1004 { // cwg1004: 5
   template<typename> struct A {};
@@ -43,7 +44,7 @@ namespace cwg1004 { // cwg1004: 5
   // expected-error@-1 {{is a constructor name}}
   //   expected-note@#cwg1004-t {{in instantiation of default argument}}
   Third<A<int> > t; // #cwg1004-t
-}
+} // namespace cwg1004
 
 namespace cwg1042 { // cwg1042: 3.5
 #if __cplusplus >= 201402L
@@ -58,7 +59,7 @@ namespace cwg1042 { // cwg1042: 3.5
   // list in this mode.
   using foo [[]] = int;
 #endif
-}
+} // namespace cwg1042
 
 namespace cwg1048 { // cwg1048: 3.6
   struct A {};
@@ -76,7 +77,7 @@ namespace cwg1048 { // cwg1048: 3.6
     }
   } (0);
 #endif
-}
+} // namespace cwg1048
 
 namespace cwg1054 { // cwg1054: no
   // FIXME: Test is incomplete.
@@ -89,7 +90,7 @@ namespace cwg1054 { // cwg1054: no
     a;
     // expected-warning@-1 {{expression result unused; assign into a variable to force a volatile load}}
   }
-}
+} // namespace cwg1054
 
 namespace cwg1070 { // cwg1070: 3.5
 #if __cplusplus >= 201103L
@@ -108,4 +109,4 @@ namespace cwg1070 { // cwg1070: 3.5
   };
   C c = {};
 #endif
-}
+} // namespace cwg1070
diff --git a/clang/test/CXX/drs/cwg118.cpp b/clang/test/CXX/drs/cwg118.cpp
index 04e19ce..c7685fd 100644
--- a/clang/test/CXX/drs/cwg118.cpp
+++ b/clang/test/CXX/drs/cwg118.cpp
@@ -1,7 +1,11 @@
 // RUN: %clang_cc1 -triple x86_64-linux -std=c++98 %s -pedantic-errors -emit-llvm -o - | FileCheck %s --implicit-check-not " call "
 // RUN: %clang_cc1 -triple x86_64-linux -std=c++11 %s -pedantic-errors -emit-llvm -o - | FileCheck %s --implicit-check-not " call "
 // RUN: %clang_cc1 -triple x86_64-linux -std=c++14 %s -pedantic-errors -emit-llvm -o - | FileCheck %s --implicit-check-not " call "
-// RUN: %clang_cc1 -triple x86_64-linux -std=c++1z %s -pedantic-errors -emit-llvm -o - | FileCheck %s --implicit-check-not " call "
+// RUN: %clang_cc1 -triple x86_64-linux -std=c++17 %s -pedantic-errors -emit-llvm -o - | FileCheck %s --implicit-check-not " call "
+// RUN: %clang_cc1 -triple x86_64-linux -std=c++20 %s -pedantic-errors -emit-llvm -o - | FileCheck %s --implicit-check-not " call "
+// RUN: %clang_cc1 -triple x86_64-linux -std=c++23 %s -pedantic-errors -emit-llvm -o - | FileCheck %s --implicit-check-not " call "
+// RUN: %clang_cc1 -triple x86_64-linux -std=c++2c %s -pedantic-errors -emit-llvm -o - | FileCheck %s --implicit-check-not " call "
+
 
 // cwg118: yes
 
diff --git a/clang/test/CXX/drs/cwg11xx.cpp b/clang/test/CXX/drs/cwg11xx.cpp
index 8d18704..03612b6 100644
--- a/clang/test/CXX/drs/cwg11xx.cpp
+++ b/clang/test/CXX/drs/cwg11xx.cpp
@@ -2,7 +2,9 @@
 // RUN: %clang_cc1 -std=c++11 %s -verify=expected -fexceptions -fcxx-exceptions -pedantic-errors
 // RUN: %clang_cc1 -std=c++14 %s -verify=expected -fexceptions -fcxx-exceptions -pedantic-errors
 // RUN: %clang_cc1 -std=c++17 %s -verify=expected -fexceptions -fcxx-exceptions -pedantic-errors
-// RUN: %clang_cc1 -std=c++2a %s -verify=expected -fexceptions -fcxx-exceptions -pedantic-errors
+// RUN: %clang_cc1 -std=c++20 %s -verify=expected -fexceptions -fcxx-exceptions -pedantic-errors
+// RUN: %clang_cc1 -std=c++23 %s -verify=expected -fexceptions -fcxx-exceptions -pedantic-errors
+// RUN: %clang_cc1 -std=c++2c %s -verify=expected -fexceptions -fcxx-exceptions -pedantic-errors
 
 namespace cwg1110 { // cwg1110: 3.1
 #if __cplusplus >= 201103L
@@ -19,7 +21,7 @@ decltype(return_T<B<int>>())* b;
 #endif
 } // namespace cwg1110
 
-namespace cwg1111 { // cwg1111: 3.2
+namespace cwg1111 { // cwg1111: partial
 namespace example1 {
 template <typename> struct set; // #cwg1111-struct-set
 
@@ -57,6 +59,48 @@ void baz() {
   a.operator A();
 }
 } // namespace example2
+
+namespace example3 {
+struct A {
+  operator int();
+} a;
+void foo() {
+  typedef int T;
+  a.operator T(); // T is found using unqualified lookup
+                  // after qualified lookup in A fails.
+}
+} // namespace example3
+
+namespace example4 {
+struct A {
+  typedef int T; // #cwg1111-A-T
+  operator T();
+};
+struct B : A {
+  operator T();
+} b;
+void foo() {
+  b.A::operator T(); // FIXME: qualified lookup should find T in A.
+  // expected-error@-1 {{unknown type name 'T'}}
+  //   expected-note@#cwg1111-A-T {{'A::T' declared here}}
+}
+} // namespace example4
+
+namespace example5 {
+template <class T1> struct A {
+  operator T1();
+};
+template <class T2> struct B : A<T2> {
+  operator T2();
+  void foo() {
+    // In both cases, during instantiation, qualified lookup for T2 wouldn't be able
+    // to find anything, so T2 has to be found by unqualified lookup.
+    // After that, 'operator T2()' is found in A<T2> by qualfied lookup.
+    T2 a = A<T2>::operator T2();
+    T2 b = ((A<T2> *)this)->operator T2();
+  }
+};
+} // namespace example5
 } // namespace cwg1111
 
 namespace cwg1113 { // cwg1113: partial
@@ -84,6 +128,6 @@ namespace cwg1113 { // cwg1113: partial
     extern "C" void f();
   }
   void g() { f(); }
-}
+} // namespace cwg1113
 
 // cwg1150: na
diff --git a/clang/test/CXX/drs/cwg12xx.cpp b/clang/test/CXX/drs/cwg12xx.cpp
index cdfbc6d..344adb6 100644
--- a/clang/test/CXX/drs/cwg12xx.cpp
+++ b/clang/test/CXX/drs/cwg12xx.cpp
@@ -4,6 +4,7 @@
 // RUN: %clang_cc1 -std=c++17 %s -verify=expected,since-cxx17,since-cxx14,since-cxx11 -fexceptions -fcxx-exceptions -pedantic-errors
 // RUN: %clang_cc1 -std=c++20 %s -verify=expected,since-cxx17,since-cxx14,since-cxx11 -fexceptions -fcxx-exceptions -pedantic-errors
 // RUN: %clang_cc1 -std=c++23 %s -verify=expected,since-cxx17,since-cxx14,since-cxx11,since-cxx23 -fexceptions -fcxx-exceptions -pedantic-errors
+// RUN: %clang_cc1 -std=c++2c %s -verify=expected,since-cxx17,since-cxx14,since-cxx11,since-cxx23 -fexceptions -fcxx-exceptions -pedantic-errors
 
 // cwg1200: na
 
@@ -29,10 +30,10 @@ namespace cwg1213 { // cwg1213: 7
   using U = decltype(V4Int()[0]);
   using U = decltype(EV4Int()[0]);
 #endif
-}
+} // namespace cwg1213
 
+namespace cwg1223 { // cwg1223: 17
 #if __cplusplus >= 201103L
-namespace cwg1223 { // cwg1223: 17 drafting 2023-05-12
 struct M;
 template <typename T>
 struct V;
@@ -82,12 +83,11 @@ void g() {
     sizeof(auto () -> C[1]);
     // since-cxx11-error@-1 {{function cannot return array type 'C[1]' (aka 'cwg1223::BB[1]')}}
 }
-
-}
 #endif
+} // namespace cwg1223
 
-#if __cplusplus >= 201103L
 namespace cwg1227 { // cwg1227: 3.0
+#if __cplusplus >= 201103L
 template <class T> struct A { using X = typename T::X; };
 // since-cxx11-error@-1 {{type 'int' cannot be used prior to '::' because it has no members}}
 //   since-cxx11-note@#cwg1227-g {{in instantiation of template class 'cwg1227::A<int>' requested here}}
@@ -101,8 +101,8 @@ void h() {
   f<int>(0); // OK, substituting return type causes deduction to fail
   g<int>(0); // #cwg1227-g-int
 }
-}
 #endif
+} // namespace cwg1227
 
 namespace cwg1250 { // cwg1250: 3.9
 struct Incomplete;
@@ -114,7 +114,7 @@ struct Base {
 struct Derived : Base {
   virtual Incomplete *meow();
 };
-}
+} // namespace cwg1250
 
 namespace cwg1265 { // cwg1265: 5
 #if __cplusplus >= 201103L
@@ -134,7 +134,7 @@ namespace cwg1265 { // cwg1265: 5
   auto k(), l();
   // since-cxx14-error@-1 {{function with deduced return type must be the only declaration in its group}}
 #endif
-}
+} // namespace cwg1265
 
 // cwg1291: na
 
@@ -161,5 +161,4 @@ namespace cwg1295 { // cwg1295: 4
   using T = decltype(true ? other : x.bitfield);
   using T = unsigned;
 #endif
-}
-
+} // namespace cwg1295
diff --git a/clang/test/CXX/drs/cwg13xx.cpp b/clang/test/CXX/drs/cwg13xx.cpp
index 980fcb4..9c72fef 100644
--- a/clang/test/CXX/drs/cwg13xx.cpp
+++ b/clang/test/CXX/drs/cwg13xx.cpp
@@ -14,10 +14,10 @@ namespace std {
     size_t n;
     initializer_list(const T*, size_t);
   };
-}
+} // namespace std
 
-#if __cplusplus >= 201103L
 namespace cwg1305 { // cwg1305: 3.0
+#if __cplusplus >= 201103L
 struct Incomplete; // #cwg1305-Incomplete
 struct Complete {};
 
@@ -25,8 +25,8 @@ int incomplete = alignof(Incomplete(&)[]);
 // since-cxx11-error@-1 {{invalid application of 'alignof' to an incomplete type 'Incomplete'}}
 //   since-cxx11-note@#cwg1305-Incomplete {{forward declaration of 'cwg1305::Incomplete'}}
 int complete = alignof(Complete(&)[]);
-}
 #endif
+} // namespace cwg1305
 
 namespace cwg1307 { // cwg1307: 14
 #if __cplusplus >= 201103L
@@ -150,7 +150,7 @@ namespace cwg1310 { // cwg1310: 5
   }
   template void wt_test<W<int> >(); // #cwg1310-W-int
   template void wt_test_good<W<int> >();
-}
+} // namespace cwg1310
 
 namespace cwg1315 { // cwg1315: partial
   template <int I, int J> struct A {};
@@ -182,7 +182,7 @@ namespace cwg1315 { // cwg1315: partial
   template <typename T, int I> struct C;
   template <typename T> struct C<T, T::value>;
   // expected-error@-1 {{type of specialized non-type template argument depends on a template parameter of the partial specialization}}
-}
+} // namespace cwg1315
 
 namespace cwg1330 { // cwg1330: 4 c++11
   // exception-specifications are parsed in a context where the class is complete.
@@ -302,7 +302,7 @@ namespace cwg1330 { // cwg1330: 4 c++11
 
   struct E : C<int> {}; // #cwg1330-C-int
   E e; // #cwg1330-e
-}
+} // namespace cwg1330
 
 // cwg1334: sup 1719
 
@@ -333,7 +333,7 @@ struct S {
   int z : 1 || new int { 0 };
 };
 #endif
-}
+} // namespace cwg1341
 
 namespace cwg1346 { // cwg1346: 3.5
   auto a(1);
@@ -376,7 +376,7 @@ namespace cwg1346 { // cwg1346: 3.5
     // since-cxx11-error@-2 {{cannot deduce type for lambda capture 'e' from parenthesized initializer list}}
   }
 #endif
-}
+} // namespace cwg1346
 
 namespace cwg1347 { // cwg1347: 3.1
   auto x = 5, *y = &x;
@@ -390,7 +390,7 @@ namespace cwg1347 { // cwg1347: 3.1
   auto (*fp)(int) -> int, i = 0;
   // since-cxx11-error@-1 {{declaration with trailing return type must be the only declaration in its group}}
 #endif
-}
+} // namespace cwg1347
 
 namespace cwg1350 { // cwg1350: 3.5
 #if __cplusplus >= 201103L
@@ -520,7 +520,7 @@ namespace cwg1358 { // cwg1358: 3.1
     //   cxx11-20-note@#cwg1358-NonLit {{'NonLit' is not literal because it is not an aggregate and has no constexpr constructors other than copy or move constructors}}
   };
 #endif
-}
+} // namespace cwg1358
 
 namespace cwg1359 { // cwg1359: 3.5
 #if __cplusplus >= 201103L
@@ -549,7 +549,7 @@ namespace cwg1359 { // cwg1359: 3.5
   //   cxx11-17-note@#cwg1359-Y {{candidate constructor (the implicit copy constructor) not viable: requires 1 argument, but 0 were provided}}
   //   cxx11-17-note@#cwg1359-Y {{candidate constructor (the implicit move constructor) not viable: requires 1 argument, but 0 were provided}}
 #endif
-}
+} // namespace cwg1359
 
 namespace cwg1388 { // cwg1388: 4
   template<typename A, typename ...T> void f(T..., A); // #cwg1388-f
@@ -622,7 +622,7 @@ namespace cwg1388 { // cwg1388: 4
     // expected-error@-1 {{no matching function for call to 'f_pair_4'}}
     //   expected-note@#cwg1388-f-4 {{candidate template ignored: deduced packs of different lengths for parameter 'T' (<int, long> vs. <int, long, const char *>)}}
   }
-}
+} // namespace cwg1388
 
 namespace cwg1391 { // cwg1391: partial
   struct A {}; struct B : A {};
@@ -713,14 +713,14 @@ namespace cwg1391 { // cwg1391: partial
     int test_c1 = c(0); // ok
     int test_c2 = c<int>(0); // FIXME: apparently ambiguous
   }
-}
+} // namespace cwg1391
 
 namespace cwg1394 { // cwg1394: 15
 #if __cplusplus >= 201103L
 struct Incomplete;
 Incomplete f(Incomplete) = delete; // well-formed
 #endif
-}
+} // namespace cwg1394
 
 namespace cwg1395 { // cwg1395: 16
 #if __cplusplus >= 201103L
@@ -731,7 +731,7 @@ namespace cwg1395 { // cwg1395: 16
     f(&i);
   }
 #endif
-}
+} // namespace cwg1395
 
 namespace cwg1397 { // cwg1397: 3.2
 #if __cplusplus >= 201103L
@@ -757,4 +757,4 @@ namespace cwg1399 { // cwg1399: dup 1388
     // expected-error@-1 {{no matching function for call to 'f'}}
     //   expected-note@#cwg1399-f {{candidate template ignored: deduced packs of different lengths for parameter 'T' (<> vs. <int, int>)}}
   }
-}
+} // namespace cwg1399
diff --git a/clang/test/CXX/drs/cwg14xx.cpp b/clang/test/CXX/drs/cwg14xx.cpp
index cb2f34b..51bc961 100644
--- a/clang/test/CXX/drs/cwg14xx.cpp
+++ b/clang/test/CXX/drs/cwg14xx.cpp
@@ -40,7 +40,7 @@ namespace cwg1413 { // cwg1413: 12
       //   expected-note@#cwg1413-var2 {{'var2' declared here}}
     }
   };
-}
+} // namespace cwg1413
 
 namespace cwg1423 { // cwg1423: 11
 #if __cplusplus >= 201103L
@@ -53,7 +53,7 @@ namespace cwg1423 { // cwg1423: 11
   bool b4{nullptr};
   // since-cxx11-warning@-1 {{implicit conversion of nullptr constant to 'bool'}}
 #endif
-}
+} // namespace 1423
 
 // cwg1425: na abi
 
@@ -76,15 +76,15 @@ namespace cwg1432 { // cwg1432: 16
 
   template struct common_type<int, double>;
 #endif
-}
+} // namespace cwg1432
 
-namespace cwg1443 { // cwg1443: yes
+namespace cwg1443 { // cwg1443: 2.7
 struct A {
   int i;
   A() { void foo(int=i); }
   // expected-error@-1 {{default argument references 'this'}}
 };
-}
+} // namespace cwg1443
 
 namespace cwg1458 { // cwg1458: 3.1
 #if __cplusplus >= 201103L
@@ -93,7 +93,7 @@ struct A;
 void f() {
   constexpr A* a = nullptr;
   constexpr int p = &*a;
-  // expected-error@-1 {{cannot initialize a variable of type 'const int' with an rvalue of type 'A *'}}
+  // since-cxx11-error@-1 {{cannot initialize a variable of type 'const int' with an rvalue of type 'A *'}}
   constexpr A *p2 = &*a;
 }
 
@@ -108,27 +108,27 @@ namespace cwg1460 { // cwg1460: 3.5
   namespace DRExample {
     union A {
       union {};
-      // expected-error@-1 {{declaration does not declare anything}}
+      // since-cxx11-error@-1 {{declaration does not declare anything}}
       union {};
-      // expected-error@-1 {{declaration does not declare anything}}
+      // since-cxx11-error@-1 {{declaration does not declare anything}}
       constexpr A() {}
     };
     constexpr A a = A();
 
     union B {
       union {};
-      // expected-error@-1 {{declaration does not declare anything}}
+      // since-cxx11-error@-1 {{declaration does not declare anything}}
       union {};
-      // expected-error@-1 {{declaration does not declare anything}}
+      // since-cxx11-error@-1 {{declaration does not declare anything}}
       constexpr B() = default;
     };
     constexpr B b = B();
 
     union C {
       union {};
-      // expected-error@-1 {{declaration does not declare anything}}
+      // since-cxx11-error@-1 {{declaration does not declare anything}}
       union {};
-      // expected-error@-1 {{declaration does not declare anything}}
+      // since-cxx11-error@-1 {{declaration does not declare anything}}
     };
     constexpr C c = C();
 #if __cplusplus >= 201403L
@@ -141,7 +141,7 @@ namespace cwg1460 { // cwg1460: 3.5
   union B { int n; }; // #cwg1460-B
   union C { int n = 0; };
   struct D { union {}; };
-  // expected-error@-1 {{declaration does not declare anything}}
+  // since-cxx11-error@-1 {{declaration does not declare anything}}
   struct E { union { int n; }; }; // #cwg1460-E
   struct F { union { int n = 0; }; };
 
@@ -173,7 +173,7 @@ namespace cwg1460 { // cwg1460: 3.5
     // cxx11-17-error@-1 {{defaulted definition of default constructor cannot be marked constexpr}}
     union C { int n = 0; constexpr C() = default; };
     struct D { union {}; constexpr D() = default; };
-    // expected-error@-1 {{declaration does not declare anything}}
+    // since-cxx11-error@-1 {{declaration does not declare anything}}
     struct E { union { int n; }; constexpr E() = default; };
     // cxx11-17-error@-1 {{defaulted definition of default constructor cannot be marked constexpr}}
     struct F { union { int n = 0; }; constexpr F() = default; };
@@ -222,8 +222,8 @@ namespace cwg1460 { // cwg1460: 3.5
   union G {
     int a = 0; // #cwg1460-G-a
     int b = 0;
-    // expected-error@-1 {{initializing multiple members of union}}
-    //   expected-note@#cwg1460-G-a {{previous initialization is here}}
+    // since-cxx11-error@-1 {{initializing multiple members of union}}
+    //   since-cxx11-note@#cwg1460-G-a {{previous initialization is here}}
   };
   union H {
     union {
@@ -231,16 +231,16 @@ namespace cwg1460 { // cwg1460: 3.5
     };
     union {
       int b = 0;
-      // expected-error@-1 {{initializing multiple members of union}}
-      //   expected-note@#cwg1460-H-a {{previous initialization is here}}
+      // since-cxx11-error@-1 {{initializing multiple members of union}}
+      //   since-cxx11-note@#cwg1460-H-a {{previous initialization is here}}
     };
   };
   struct I {
     union {
       int a = 0; // #cwg1460-I-a
       int b = 0;
-      // expected-error@-1 {{initializing multiple members of union}}
-      //   expected-note@#cwg1460-I-a {{previous initialization is here}}
+      // since-cxx11-error@-1 {{initializing multiple members of union}}
+      //   since-cxx11-note@#cwg1460-I-a {{previous initialization is here}}
     };
   };
   struct J {
@@ -264,23 +264,23 @@ namespace cwg1460 { // cwg1460: 3.5
     };
     static_assert(B().a == 1, "");
     static_assert(B().b == 2, "");
-    // expected-error@-1 {{static assertion expression is not an integral constant expression}}
-    //   expected-note@-2 {{read of member 'b' of union with active member 'a' is not allowed in a constant expression}}
+    // since-cxx11-error@-1 {{static assertion expression is not an integral constant expression}}
+    //   since-cxx11-note@-2 {{read of member 'b' of union with active member 'a' is not allowed in a constant expression}}
     static_assert(B('x').a == 0, "");
-    // expected-error@-1 {{static assertion expression is not an integral constant expression}}
-    //   expected-note@-2 {{read of member 'a' of union with active member 'b' is not allowed in a constant expression}}
+    // since-cxx11-error@-1 {{static assertion expression is not an integral constant expression}}
+    //   since-cxx11-note@-2 {{read of member 'a' of union with active member 'b' is not allowed in a constant expression}}
     static_assert(B('x').b == 4, "");
     static_assert(B(123).b == 2, "");
-    // expected-error@-1 {{static assertion expression is not an integral constant expression}}
-    //   expected-note@-2 {{read of member 'b' of union with active member 'c' is not allowed in a constant expression}}
+    // since-cxx11-error@-1 {{static assertion expression is not an integral constant expression}}
+    //   since-cxx11-note@-2 {{read of member 'b' of union with active member 'c' is not allowed in a constant expression}}
     static_assert(B(123).c == 3, "");
     static_assert(B("").a == 1, "");
-    // expected-error@-1 {{static assertion expression is not an integral constant expression}}
-    //   expected-note@-2 {{read of member 'a' of union with active member 'b' is not allowed in a constant expression}}
+    // since-cxx11-error@-1 {{static assertion expression is not an integral constant expression}}
+    //   since-cxx11-note@-2 {{read of member 'a' of union with active member 'b' is not allowed in a constant expression}}
     static_assert(B("").b == 2, "");
     static_assert(B("").c == 3, "");
-    // expected-error@-1 {{static assertion expression is not an integral constant expression}}
-    //   expected-note@-2 {{read of member 'c' of union with active member 'b' is not allowed in a constant expression}}
+    // since-cxx11-error@-1 {{static assertion expression is not an integral constant expression}}
+    //   since-cxx11-note@-2 {{read of member 'c' of union with active member 'b' is not allowed in a constant expression}}
 
     struct C {
       union { int a, b = 2, c; };
@@ -294,54 +294,54 @@ namespace cwg1460 { // cwg1460: 3.5
 
     static_assert(C().a == 1, "");
     static_assert(C().b == 2, "");
-    // expected-error@-1 {{static assertion expression is not an integral constant expression}}
-    //   expected-note@-2 {{read of member 'b' of union with active member 'a' is not allowed in a constant expression}}
+    // since-cxx11-error@-1 {{static assertion expression is not an integral constant expression}}
+    //   since-cxx11-note@-2 {{read of member 'b' of union with active member 'a' is not allowed in a constant expression}}
     static_assert(C().d == 4, "");
-    // expected-error@-1 {{static assertion expression is not an integral constant expression}}
-    //   expected-note@-2 {{read of member 'd' of union with active member 'e' is not allowed in a constant expression}}
+    // since-cxx11-error@-1 {{static assertion expression is not an integral constant expression}}
+    //   since-cxx11-note@-2 {{read of member 'd' of union with active member 'e' is not allowed in a constant expression}}
     static_assert(C().e == 5, "");
 
     static_assert(C('x').b == 2, "");
-    // expected-error@-1 {{static assertion expression is not an integral constant expression}}
-    //   expected-note@-2 {{read of member 'b' of union with active member 'c' is not allowed in a constant expression}}
+    // since-cxx11-error@-1 {{static assertion expression is not an integral constant expression}}
+    //   since-cxx11-note@-2 {{read of member 'b' of union with active member 'c' is not allowed in a constant expression}}
     static_assert(C('x').c == 3, "");
     static_assert(C('x').d == 4, "");
-    // expected-error@-1 {{static assertion expression is not an integral constant expression}}
-    //   expected-note@-2 {{read of member 'd' of union with active member 'e' is not allowed in a constant expression}}
+    // since-cxx11-error@-1 {{static assertion expression is not an integral constant expression}}
+    //   since-cxx11-note@-2 {{read of member 'd' of union with active member 'e' is not allowed in a constant expression}}
     static_assert(C('x').e == 5, "");
 
     static_assert(C(1).b == 2, "");
     static_assert(C(1).c == 3, "");
-    // expected-error@-1 {{static assertion expression is not an integral constant expression}}
-    //   expected-note@-2 {{read of member 'c' of union with active member 'b' is not allowed in a constant expression}}
+    // since-cxx11-error@-1 {{static assertion expression is not an integral constant expression}}
+    //   since-cxx11-note@-2 {{read of member 'c' of union with active member 'b' is not allowed in a constant expression}}
     static_assert(C(1).d == 4, "");
     static_assert(C(1).e == 5, "");
-    // expected-error@-1 {{static assertion expression is not an integral constant expression}}
-    //   expected-note@-2 {{read of member 'e' of union with active member 'd' is not allowed in a constant expression}}
+    // since-cxx11-error@-1 {{static assertion expression is not an integral constant expression}}
+    //   since-cxx11-note@-2 {{read of member 'e' of union with active member 'd' is not allowed in a constant expression}}
 
     static_assert(C(1.f).b == 2, "");
     static_assert(C(1.f).c == 3, "");
-    // expected-error@-1 {{static assertion expression is not an integral constant expression}}
-    //   expected-note@-2 {{read of member 'c' of union with active member 'b' is not allowed in a constant expression}}
+    // since-cxx11-error@-1 {{static assertion expression is not an integral constant expression}}
+    //   since-cxx11-note@-2 {{read of member 'c' of union with active member 'b' is not allowed in a constant expression}}
     static_assert(C(1.f).e == 5, "");
-    // expected-error@-1 {{static assertion expression is not an integral constant expression}}
-    //   expected-note@-2 {{read of member 'e' of union with active member 'f' is not allowed in a constant expression}}
+    // since-cxx11-error@-1 {{static assertion expression is not an integral constant expression}}
+    //   since-cxx11-note@-2 {{read of member 'e' of union with active member 'f' is not allowed in a constant expression}}
     static_assert(C(1.f).f == 6, "");
 
     static_assert(C("").a == 1, "");
-    // expected-error@-1 {{static assertion expression is not an integral constant expression}}
-    //   expected-note@-2 {{read of member 'a' of union with active member 'b' is not allowed in a constant expression}}
+    // since-cxx11-error@-1 {{static assertion expression is not an integral constant expression}}
+    //   since-cxx11-note@-2 {{read of member 'a' of union with active member 'b' is not allowed in a constant expression}}
     static_assert(C("").b == 2, "");
     static_assert(C("").c == 3, "");
-    // expected-error@-1 {{static assertion expression is not an integral constant expression}}
-    //   expected-note@-2 {{read of member 'c' of union with active member 'b' is not allowed in a constant expression}}
+    // since-cxx11-error@-1 {{static assertion expression is not an integral constant expression}}
+    //   since-cxx11-note@-2 {{read of member 'c' of union with active member 'b' is not allowed in a constant expression}}
     static_assert(C("").d == 4, "");
-    // expected-error@-1 {{static assertion expression is not an integral constant expression}}
-    //   expected-note@-2 {{read of member 'd' of union with active member 'e' is not allowed in a constant expression}}
+    // since-cxx11-error@-1 {{static assertion expression is not an integral constant expression}}
+    //   since-cxx11-note@-2 {{read of member 'd' of union with active member 'e' is not allowed in a constant expression}}
     static_assert(C("").e == 5, "");
     static_assert(C("").f == 6, "");
-    // expected-error@-1 {{static assertion expression is not an integral constant expression}}
-    //   expected-note@-2 {{read of member 'f' of union with active member 'e' is not allowed in a constant expression}}
+    // since-cxx11-error@-1 {{static assertion expression is not an integral constant expression}}
+    //   since-cxx11-note@-2 {{read of member 'f' of union with active member 'e' is not allowed in a constant expression}}
 
     struct D;
     extern const D d;
@@ -357,7 +357,7 @@ namespace cwg1460 { // cwg1460: 3.5
     static_assert(d.a == 0, "");
   }
 #endif
-}
+} // namespace cwg1460
 
 #if __cplusplus >= 201103L
 namespace std {
@@ -388,7 +388,7 @@ namespace std {
     const _E* begin() const {return __begin_;}
     const _E* end()   const {return __begin_ + __size_;}
   };
-} // std
+} // namespace std
 #endif
 
 namespace cwg1467 {  // cwg1467: 3.7 c++11
@@ -601,7 +601,7 @@ namespace cwg1467 {  // cwg1467: 3.7 c++11
   }
   } // namespace StringLiterals
 #endif
-} // cwg1467
+} // namespace cwg1467
 
 namespace cwg1477 { // cwg1477: 2.7
 namespace N {
@@ -630,7 +630,7 @@ namespace cwg1479 { // cwg1479: 3.1
   int operator""_a(const char*, std::size_t = 0);
   // since-cxx11-error@-1 {{literal operator cannot have a default argument}}
 #endif
-}
+} // namespace cwg1479
 
 namespace cwg1482 { // cwg1482: 3.0
                    // NB: sup 2516, test reused there
@@ -675,7 +675,7 @@ namespace cwg1490 {  // cwg1490: 3.7 c++11
   std::initializer_list<char>{"abc"};
   // since-cxx11-error@-1 {{expected unqualified-id}}}
 #endif
-} // cwg1490
+} // namespace cwg1490
 
 namespace cwg1495 { // cwg1495: 4
 #if __cplusplus >= 201103L
@@ -717,7 +717,7 @@ namespace cwg1495 { // cwg1495: 4
   //   since-cxx14-note@#cwg1495-c {{template is declared here}}
 #endif
 #endif
-}
+} // namespace cwg1495
 
 namespace cwg1496 { // cwg1496: no
 #if __cplusplus >= 201103L
@@ -728,4 +728,4 @@ struct A {
 // default constructor which is not deleted.
 static_assert(__is_trivial(A), "");
 #endif
-}
+} // namespace cwg1496
diff --git a/clang/test/CXX/drs/cwg158.cpp b/clang/test/CXX/drs/cwg158.cpp
index 2a74438..1f5c319 100644
--- a/clang/test/CXX/drs/cwg158.cpp
+++ b/clang/test/CXX/drs/cwg158.cpp
@@ -1,7 +1,10 @@
 // RUN: %clang_cc1 -triple x86_64-linux -std=c++98 %s -O3 -disable-llvm-passes -pedantic-errors -emit-llvm -o - | FileCheck --check-prefixes=CHECK %s
 // RUN: %clang_cc1 -triple x86_64-linux -std=c++11 %s -O3 -disable-llvm-passes -pedantic-errors -emit-llvm -o - | FileCheck --check-prefixes=CHECK %s
 // RUN: %clang_cc1 -triple x86_64-linux -std=c++14 %s -O3 -disable-llvm-passes -pedantic-errors -emit-llvm -o - | FileCheck --check-prefixes=CHECK %s
-// RUN: %clang_cc1 -triple x86_64-linux -std=c++1z %s -O3 -disable-llvm-passes -pedantic-errors -emit-llvm -o - | FileCheck --check-prefixes=CHECK %s
+// RUN: %clang_cc1 -triple x86_64-linux -std=c++17 %s -O3 -disable-llvm-passes -pedantic-errors -emit-llvm -o - | FileCheck --check-prefixes=CHECK %s
+// RUN: %clang_cc1 -triple x86_64-linux -std=c++20 %s -O3 -disable-llvm-passes -pedantic-errors -emit-llvm -o - | FileCheck --check-prefixes=CHECK %s
+// RUN: %clang_cc1 -triple x86_64-linux -std=c++23 %s -O3 -disable-llvm-passes -pedantic-errors -emit-llvm -o - | FileCheck --check-prefixes=CHECK %s
+// RUN: %clang_cc1 -triple x86_64-linux -std=c++2c %s -O3 -disable-llvm-passes -pedantic-errors -emit-llvm -o - | FileCheck --check-prefixes=CHECK %s
 // RUN: %clang_cc1 -triple x86_64-linux -std=c++1z %s -O3 -pointer-tbaa -disable-llvm-passes -pedantic-errors -emit-llvm -o - | FileCheck --check-prefixes=CHECK,POINTER-TBAA %s
 
 // cwg158: yes
diff --git a/clang/test/CXX/drs/cwg15xx.cpp b/clang/test/CXX/drs/cwg15xx.cpp
index 961c250..30ec639 100644
--- a/clang/test/CXX/drs/cwg15xx.cpp
+++ b/clang/test/CXX/drs/cwg15xx.cpp
@@ -170,7 +170,7 @@ namespace cwg1512 { // cwg1512: 4
     //   since-cxx11-note@#cwg1512-Wrap {{second operand was implicitly converted to type 'int *'}}
   }
 #endif
-}
+} // namespace cwg1512
 
 namespace cwg1514 { // cwg1514: 11
 #if __cplusplus >= 201103L
@@ -184,7 +184,7 @@ namespace cwg1514 { // cwg1514: 11
 
   // The behavior in other contexts is superseded by CWG1966.
 #endif
-}
+} // namespace cwg1514
 
 namespace cwg1518 { // cwg1518: 4
 #if __cplusplus >= 201103L
@@ -321,13 +321,13 @@ namespace std_example {
   }
 }
 #endif // __cplusplus >= 201103L
-}
+} // namespace cwg1518
 
 namespace cwg1550 { // cwg1550: 3.4
   int f(bool b, int n) {
     return (b ? (throw 0) : n) + (b ? n : (throw 0));
   }
-}
+} // namespace cwg1550
 
 namespace cwg1558 { // cwg1558: 12
 #if __cplusplus >= 201103L
@@ -344,7 +344,7 @@ namespace cwg1558 { // cwg1558: 12
     //   since-cxx11-note@#cwg1558-f {{candidate template ignored: substitution failure [with T = int]: type 'int' cannot be used prior to '::' because it has no members}}
   }
 #endif
-}
+} // namespace cwg1558
 
 namespace cwg1560 { // cwg1560: 3.5
   void f(bool b, int n) {
@@ -353,9 +353,9 @@ namespace cwg1560 { // cwg1560: 3.5
   class X { X(const X&); };
   const X &get();
   const X &x = true ? get() : throw 0;
-}
+} // namespace cwg1560
 
-namespace cwg1563 { // cwg1563: yes
+namespace cwg1563 { // cwg1563: 3.1
 #if __cplusplus >= 201103L
   double bar(double) { return 0.0; }
   float bar(float) { return 0.0f; }
@@ -363,7 +363,7 @@ namespace cwg1563 { // cwg1563: yes
   using fun = double(double);
   fun &foo{bar}; // ok
 #endif
-}
+} // namespace cwg1563
 
 namespace cwg1567 { // cwg1567: 3.3
 #if __cplusplus >= 201103L
@@ -402,7 +402,7 @@ B b5{A{0}};
 //   since-cxx11-note@#cwg1567-B {{candidate constructor (the implicit move constructor) not viable: no known conversion from 'A' to 'B' for 1st argument}}
 //   since-cxx11-note@#cwg1567-B-double {{candidate constructor not viable: no known conversion from 'A' to 'double' for 1st argument}}
 #endif
-}
+} // namespace cwg1567
 
 namespace cwg1573 { // cwg1573: 3.9
 #if __cplusplus >= 201103L
@@ -445,7 +445,7 @@ namespace cwg1573 { // cwg1573: 3.9
   // since-cxx11-error@-1 {{call to deleted constructor of 'J'}}
   //   since-cxx11-note@#cwg1573-I {{'I' has been explicitly marked deleted here}}
 #endif
-}
+} // namespace cwg1573
 
 #if __cplusplus >= 201103L
 namespace std {
@@ -485,7 +485,7 @@ namespace std {
   };
   typedef basic_string<char> string;
 
-} // std
+} // namespace std
 #endif
 
 namespace cwg1579 { // cwg1579: 3.9
@@ -558,7 +558,7 @@ auto CWG1579_lambda_invalid = []() -> GenericMoveOnly<char> {
   //   since-cxx11-note@#cwg1579-deleted-U {{'GenericMoveOnly<float>' has been explicitly marked deleted here}}
 };
 #endif
-} // end namespace cwg1579
+} // namespace cwg1579
 
 namespace cwg1584 { // cwg1584: 7 drafting 2015-05
 // Deducing function types from cv-qualified types
@@ -633,7 +633,7 @@ namespace cwg1589 {   // cwg1589: 3.7 c++11
     //   since-cxx11-note@#cwg1589-f2-ilist-int {{candidate function}}
   }
 #endif
-} // cwg1589
+} // namespace cwg1589
 
 namespace cwg1591 {  //cwg1591. Deducing array bound and element type from initializer list
 #if __cplusplus >= 201103L
@@ -718,4 +718,4 @@ namespace cwg1591 {  //cwg1591. Deducing array bound and element type from initi
     short *ps = i(Arr<int>{1, 2});  // OK #5
   }
 #endif
-} // cwg1591
+} // namespace cwg1591
diff --git a/clang/test/CXX/drs/cwg16xx.cpp b/clang/test/CXX/drs/cwg16xx.cpp
index 95e241f..bd2c484 100644
--- a/clang/test/CXX/drs/cwg16xx.cpp
+++ b/clang/test/CXX/drs/cwg16xx.cpp
@@ -51,7 +51,7 @@ namespace cwg1611 { // cwg1611: dup 1658
   struct B : virtual A { virtual void f() = 0; };
   struct C : B { C() : A(0) {} void f(); };
   C c;
-}
+} // namespace cwg1611
 
 namespace cwg1631 {  // cwg1631: 3.7
 #if __cplusplus >= 201103L
@@ -81,7 +81,7 @@ namespace cwg1631 {  // cwg1631: 3.7
     }
   }
 #endif
-}
+} // namespace cwg1631
 
 namespace cwg1638 { // cwg1638: 3.1
 #if __cplusplus >= 201103L
@@ -122,7 +122,7 @@ namespace cwg1638 { // cwg1638: 3.1
     // since-cxx11-note@-3 {{remove 'enum class' to befriend an enum}}
   };
 #endif
-}
+} // namespace cwg1638
 
 namespace cwg1645 { // cwg1645: 3.9
 #if __cplusplus >= 201103L
@@ -149,14 +149,14 @@ namespace cwg1645 { // cwg1645: 3.9
   //   since-cxx11-note@#cwg1645-int-int-int {{candidate inherited constructor has been explicitly deleted}}
   //   since-cxx11-note@#cwg1645-using {{constructor from base class 'A' inherited here}}
 #endif
-}
+} // namespace cwg1645
 
 namespace cwg1652 { // cwg1652: 3.6
   int a, b;
   static_assert(&a + 1 == &b, "");
   // expected-error@-1 {{static assertion expression is not an integral constant expression}}
   //   expected-note@-2 {{comparison against pointer '&a + 1' that points past the end of a complete object has unspecified value}}
-}
+} // namespace cwg1652
 
 namespace cwg1653 { // cwg1653: 4 c++17
   void f(bool b) {
@@ -173,7 +173,7 @@ namespace cwg1653 { // cwg1653: 4 c++17
     b += 1; // ok
     b -= 1; // ok
   }
-}
+} // namespace cwg1653
 
 namespace cwg1658 { // cwg1658: 5
   namespace DefCtor {
@@ -324,7 +324,7 @@ namespace cwg1658 { // cwg1658: 5
   }
 
   // assignment case is superseded by cwg2180
-}
+} // namespace cwg1658
 
 namespace cwg1672 { // cwg1672: 7
   struct Empty {};
@@ -349,7 +349,7 @@ namespace cwg1672 { // cwg1672: 7
   static_assert(!__is_standard_layout(Y<G>), "");
   static_assert(!__is_standard_layout(Y<H>), "");
   static_assert(!__is_standard_layout(Y<X>), "");
-}
+} // namespace cwg1672
 
 namespace cwg1684 { // cwg1684: 3.6
 #if __cplusplus >= 201103L
@@ -363,7 +363,7 @@ namespace cwg1684 { // cwg1684: 3.6
   // cxx11-20-error@-1 {{constexpr function's 1st parameter type 'NonLiteral' is not a literal type}}
   //   cxx11-20-note@#cwg1684-struct {{'NonLiteral' is not literal because it is not an aggregate and has no constexpr constructors other than copy or move constructors}}
 #endif
-}
+} // namespace cwg1684
 
 namespace cwg1687 { // cwg1687: 7
   template<typename T> struct To {
@@ -386,7 +386,7 @@ namespace cwg1687 { // cwg1687: 7
   // since-cxx20-error@-1 {{invalid operands to binary expression ('To<E1>' and 'To<E2>')}}
   //   since-cxx20-note@#cwg1687-op-T {{operand was implicitly converted to type 'cwg1687::E}}
 #endif
-}
+} // namespace cwg1687
 
 namespace cwg1690 { // cwg1690: 9
   // See also the various tests in "CXX/basic/basic.lookup/basic.lookup.argdep".
@@ -401,7 +401,7 @@ namespace cwg1690 { // cwg1690: 9
     f(s); // ok
   }
 #endif
-}
+} // namespace cwg1690
 
 namespace cwg1691 { // cwg1691: 9
 #if __cplusplus >= 201103L
@@ -421,7 +421,7 @@ namespace cwg1691 { // cwg1691: 9
     //   since-cxx11-note@#cwg1691-g {{'N::g' declared here}}
   }
 #endif
-}
+} // namespace cwg1691
 
 namespace cwg1692 { // cwg1692: 9
   namespace N {
@@ -436,7 +436,7 @@ namespace cwg1692 { // cwg1692: 9
     N::A::B::C c;
     f(c); // ok
   }
-}
+} // namespace cwg1692
 
 namespace cwg1696 { // cwg1696: 7
   namespace std_examples {
@@ -554,4 +554,4 @@ namespace cwg1696 { // cwg1696: 7
     //   since-cxx11-note@#cwg1696-il-5 {{nitializing field 'il' with default member initializer}}
   };
 #endif
-}
+} // namespace cwg1696
diff --git a/clang/test/CXX/drs/cwg1748.cpp b/clang/test/CXX/drs/cwg1748.cpp
index f216963..a0fe737 100644
--- a/clang/test/CXX/drs/cwg1748.cpp
+++ b/clang/test/CXX/drs/cwg1748.cpp
@@ -1,7 +1,10 @@
 // RUN: %clang_cc1 -std=c++98 %s -triple x86_64-linux-gnu -emit-llvm -o - -fexceptions -fcxx-exceptions -pedantic-errors | FileCheck %s
 // RUN: %clang_cc1 -std=c++11 %s -triple x86_64-linux-gnu -emit-llvm -o - -fexceptions -fcxx-exceptions -pedantic-errors | FileCheck %s
 // RUN: %clang_cc1 -std=c++14 %s -triple x86_64-linux-gnu -emit-llvm -o - -fexceptions -fcxx-exceptions -pedantic-errors | FileCheck %s
-// RUN: %clang_cc1 -std=c++1z %s -triple x86_64-linux-gnu -emit-llvm -o - -fexceptions -fcxx-exceptions -pedantic-errors | FileCheck %s
+// RUN: %clang_cc1 -std=c++17 %s -triple x86_64-linux-gnu -emit-llvm -o - -fexceptions -fcxx-exceptions -pedantic-errors | FileCheck %s
+// RUN: %clang_cc1 -std=c++20 %s -triple x86_64-linux-gnu -emit-llvm -o - -fexceptions -fcxx-exceptions -pedantic-errors | FileCheck %s
+// RUN: %clang_cc1 -std=c++23 %s -triple x86_64-linux-gnu -emit-llvm -o - -fexceptions -fcxx-exceptions -pedantic-errors | FileCheck %s
+// RUN: %clang_cc1 -std=c++2c %s -triple x86_64-linux-gnu -emit-llvm -o - -fexceptions -fcxx-exceptions -pedantic-errors | FileCheck %s
 
 // cwg1748: 3.7
 
diff --git a/clang/test/CXX/drs/cwg177x.cpp b/clang/test/CXX/drs/cwg177x.cpp
index cc62bda..a17fd22 100644
--- a/clang/test/CXX/drs/cwg177x.cpp
+++ b/clang/test/CXX/drs/cwg177x.cpp
@@ -1,7 +1,10 @@
 // RUN: %clang_cc1 -std=c++98 %s -fexceptions -fcxx-exceptions -pedantic-errors -ast-dump | FileCheck %s
 // RUN: %clang_cc1 -std=c++11 %s -fexceptions -fcxx-exceptions -pedantic-errors -ast-dump | FileCheck %s --check-prefixes=CHECK,CXX11
 // RUN: %clang_cc1 -std=c++14 %s -fexceptions -fcxx-exceptions -pedantic-errors -ast-dump | FileCheck %s --check-prefixes=CHECK,CXX11,CXX14
-// RUN: %clang_cc1 -std=c++1z %s -fexceptions -fcxx-exceptions -pedantic-errors -ast-dump | FileCheck %s --check-prefixes=CHECK,CXX11,CXX14
+// RUN: %clang_cc1 -std=c++17 %s -fexceptions -fcxx-exceptions -pedantic-errors -ast-dump | FileCheck %s --check-prefixes=CHECK,CXX11,CXX14
+// RUN: %clang_cc1 -std=c++20 %s -fexceptions -fcxx-exceptions -pedantic-errors -ast-dump | FileCheck %s --check-prefixes=CHECK,CXX11,CXX14
+// RUN: %clang_cc1 -std=c++23 %s -fexceptions -fcxx-exceptions -pedantic-errors -ast-dump | FileCheck %s --check-prefixes=CHECK,CXX11,CXX14
+// RUN: %clang_cc1 -std=c++2c %s -fexceptions -fcxx-exceptions -pedantic-errors -ast-dump | FileCheck %s --check-prefixes=CHECK,CXX11,CXX14
 // RUN: %clang_cc1 -std=c++1z %s -fexceptions -fcxx-exceptions -pedantic-errors -triple i386-windows-pc -ast-dump | FileCheck %s --check-prefixes=CHECK,CXX11,CXX14
 
 namespace cwg1772 { // cwg1772: 14
@@ -28,7 +31,7 @@ namespace cwg1772 { // cwg1772: 14
   // CXX11-NEXT: StringLiteral{{.+}} 'const char[11]' lvalue "operator()"
   }
 #endif // __cplusplus >= 201103L
-}
+} // namespace cwg1772
 
 namespace cwg1779 { // cwg1779: 14
   // __func__ in a function template, member function template, or generic
@@ -76,4 +79,4 @@ namespace cwg1779 { // cwg1779: 14
     };
   }
 #endif // __cplusplus >= 201402L
-}
+} // namespace cwg1779
diff --git a/clang/test/CXX/drs/cwg17xx.cpp b/clang/test/CXX/drs/cwg17xx.cpp
index fb53a56..8c4f916 100644
--- a/clang/test/CXX/drs/cwg17xx.cpp
+++ b/clang/test/CXX/drs/cwg17xx.cpp
@@ -44,7 +44,7 @@ namespace cwg1715 { // cwg1715: 3.9
   //   since-cxx11-note@#cwg1715-E {{candidate constructor (the implicit copy constructor) not viable: requires 1 argument, but 2 were provided}}
   //   since-cxx11-note@#cwg1715-E {{candidate constructor (the implicit move constructor) not viable: requires 1 argument, but 2 were provided}}
 #endif
-}
+} // namespace cwg1715
 
 namespace cwg1719 { // cwg1719: 19
 #if __cplusplus >= 201103L
@@ -96,7 +96,7 @@ struct A {
 // operator, or move assignment operator.
 static_assert(__is_trivially_copyable(A), "");
 #endif
-}
+} // namespace cwg1734
 
 namespace cwg1736 { // cwg1736: 3.9
 #if __cplusplus >= 201103L
@@ -115,7 +115,7 @@ struct S {
 struct Q { typedef int type; } q;
 S s(q); // #cwg1736-s
 #endif
-}
+} // namespace cwg1736
 
 namespace cwg1738 { // cwg1738: sup P0136R1
 #if __cplusplus >= 201103L
@@ -132,7 +132,7 @@ struct B : A {
 template B::B(int, double);
 // since-cxx11-error@-1 {{explicit instantiation of 'B' does not refer to a function template, variable template, member function, member class, or static data member}}
 #endif
-}
+} // namespace cwg1738
 
 // cwg1748 is in cwg1748.cpp
 
@@ -165,7 +165,7 @@ namespace cwg1753 { // cwg1753: 11
     n.~decltype(n)(); // OK
   #endif
   }
-}
+} // namespace cwg1753
 
 namespace cwg1756 { // cwg1756: 3.7
 #if __cplusplus >= 201103L
@@ -176,7 +176,7 @@ namespace cwg1756 { // cwg1756: 3.7
   struct X { operator int(); } x;
   int b{x};
 #endif
-}
+} // namespace cwg1756
 
 namespace cwg1758 { // cwg1758: 3.7
 #if __cplusplus >= 201103L
@@ -195,7 +195,7 @@ namespace cwg1758 { // cwg1758: 3.7
   } b;
   A a{b};
 #endif
-}
+} // namespace cwg1758
 
 namespace cwg1762 { // cwg1762: 14
 #if __cplusplus >= 201103L
@@ -204,7 +204,7 @@ namespace cwg1762 { // cwg1762: 14
   // since-cxx11-error@-1 {{invalid suffix on literal; C++11 requires a space between literal and identifier}}
   // since-cxx11-warning@-2 {{user-defined literal suffixes not starting with '_' are reserved; no literal will invoke this operator}}
 #endif
-}
+} // namespace cwg1762
 
 // cwg1772 is in cwg177x.cpp
 
@@ -221,12 +221,12 @@ namespace cwg1778 { // cwg1778: 9
   static_assert(!noexcept(C()), "");
   static_assert(noexcept(D()), "");
 #endif
-}
+} // namespace cwg1778
 
 // cwg1779 is in cwg177x.cpp
 
-namespace cwg1794 { // cwg1794: yes
-                   // NB: dup 1710
+namespace cwg1794 { // cwg1794: 2.7
+                    // NB: dup 1710
 #if __cplusplus >= 201103L
 template <template <typename> class Template> struct Internal {
   template <typename Arg> using Bind = Template<Arg>;
diff --git a/clang/test/CXX/drs/cwg1807.cpp b/clang/test/CXX/drs/cwg1807.cpp
index 59edacc..a2c4968 100644
--- a/clang/test/CXX/drs/cwg1807.cpp
+++ b/clang/test/CXX/drs/cwg1807.cpp
@@ -15,7 +15,7 @@ struct S {
 void f() {
   S s[3];
 }
-}
+} // namespace cwg1807
 
 // CHECK-LABEL:      define dso_local void @cwg1807::f()
 // CHECK:              invoke void @cwg1807::S::S(){{.+}}
diff --git a/clang/test/CXX/drs/cwg18xx.cpp b/clang/test/CXX/drs/cwg18xx.cpp
index 0fd2cd6..626473f 100644
--- a/clang/test/CXX/drs/cwg18xx.cpp
+++ b/clang/test/CXX/drs/cwg18xx.cpp
@@ -1,10 +1,10 @@
-// RUN: %clang_cc1 -std=c++98 -triple x86_64-unknown-unknown %s -verify=expected,cxx98-14,cxx98 -fexceptions -Wno-deprecated-builtins -fcxx-exceptions -pedantic-errors
-// RUN: %clang_cc1 -std=c++11 -triple x86_64-unknown-unknown %s -verify=expected,cxx98-14,cxx11-17,since-cxx11 -fexceptions -Wno-deprecated-builtins -fcxx-exceptions -pedantic-errors
-// RUN: %clang_cc1 -std=c++14 -triple x86_64-unknown-unknown %s -verify=expected,since-cxx14,cxx98-14,cxx11-17,since-cxx11,since-cxx14 -fexceptions -Wno-deprecated-builtins -fcxx-exceptions -pedantic-errors
-// RUN: %clang_cc1 -std=c++17 -triple x86_64-unknown-unknown %s -verify=expected,since-cxx14,since-cxx17,cxx11-17,since-cxx11,since-cxx14,cxx17 -fexceptions -Wno-deprecated-builtins -fcxx-exceptions -pedantic-errors
-// RUN: %clang_cc1 -std=c++20 -triple x86_64-unknown-unknown %s -verify=expected,since-cxx14,since-cxx17,since-cxx20,since-cxx11,since-cxx14 -fexceptions -Wno-deprecated-builtins -fcxx-exceptions -pedantic-errors
-// RUN: %clang_cc1 -std=c++23 -triple x86_64-unknown-unknown %s -verify=expected,since-cxx14,since-cxx17,since-cxx20,since-cxx23,since-cxx11,since-cxx14 -fexceptions -Wno-deprecated-builtins -fcxx-exceptions -pedantic-errors
-// RUN: %clang_cc1 -std=c++2c -triple x86_64-unknown-unknown %s -verify=expected,since-cxx14,since-cxx17,since-cxx20,since-cxx23,since-cxx11,since-cxx14 -fexceptions -Wno-deprecated-builtins -fcxx-exceptions -pedantic-errors
+// RUN: %clang_cc1 -std=c++98 -triple x86_64-unknown-unknown %s -verify=expected,cxx98-14,cxx98 -fexceptions -fcxx-exceptions -pedantic-errors
+// RUN: %clang_cc1 -std=c++11 -triple x86_64-unknown-unknown %s -verify=expected,cxx11-20,cxx98-14,cxx11-17,since-cxx11 -fexceptions -fcxx-exceptions -pedantic-errors
+// RUN: %clang_cc1 -std=c++14 -triple x86_64-unknown-unknown %s -verify=expected,cxx11-20,since-cxx14,cxx98-14,cxx11-17,since-cxx11,since-cxx14 -fexceptions -fcxx-exceptions -pedantic-errors
+// RUN: %clang_cc1 -std=c++17 -triple x86_64-unknown-unknown %s -verify=expected,cxx11-20,since-cxx14,since-cxx17,cxx11-17,since-cxx11,since-cxx14,cxx17 -fexceptions -fcxx-exceptions -pedantic-errors
+// RUN: %clang_cc1 -std=c++20 -triple x86_64-unknown-unknown %s -verify=expected,cxx11-20,since-cxx14,since-cxx17,since-cxx20,since-cxx11,since-cxx14 -fexceptions -fcxx-exceptions -pedantic-errors
+// RUN: %clang_cc1 -std=c++23 -triple x86_64-unknown-unknown %s -verify=expected,since-cxx14,since-cxx17,since-cxx20,since-cxx23,since-cxx11,since-cxx14 -fexceptions -fcxx-exceptions -pedantic-errors
+// RUN: %clang_cc1 -std=c++2c -triple x86_64-unknown-unknown %s -verify=expected,since-cxx14,since-cxx17,since-cxx20,since-cxx23,since-cxx11,since-cxx14 -fexceptions -fcxx-exceptions -pedantic-errors
 
 #if __cplusplus == 199711L
 #define static_assert(...) __extension__ _Static_assert(__VA_ARGS__)
@@ -26,7 +26,7 @@ S<i> V; // #cwg1801-S-i
 // cxx98-14-error@-1 {{non-type template argument does not refer to any declaration}}
 //   cxx98-14-note@#cwg1801-S {{template parameter is declared here}}
 // cxx17-error@#cwg1801-S-i {{non-type template argument refers to subobject '.i'}}
-}
+} // namespace cwg1801
 
 namespace cwg1802 { // cwg1802: 3.1
 #if __cplusplus >= 201103L
@@ -198,13 +198,13 @@ namespace cwg1813 { // cwg1813: 7
   static_assert(!__is_standard_layout(U), "");
 }
 
-namespace cwg1814 { // cwg1814: yes
+namespace cwg1814 { // cwg1814: 3.1
 #if __cplusplus >= 201103L
   void test() {
     auto lam = [](int x = 42) { return x; };
   }
 #endif
-}
+} // namespace cwg1814
 
 namespace cwg1815 { // cwg1815: 20
 #if __cplusplus >= 201402L
@@ -229,7 +229,7 @@ namespace cwg1815 { // cwg1815: 20
   static_assert(f() == 0);
 #endif
 #endif
-}
+} // namespace cwg1815
 
 // cwg1818 is in cwg1818.cpp
 
@@ -274,8 +274,8 @@ void d2() {
 #if __cplusplus >= 201103L
 auto e = [] {
   typedef int cwg1820::A;
-  // expected-error@-1 {{definition or redeclaration of 'A' not allowed inside a function}}
-  // expected-error@-2 {{typedef declarator cannot be qualified}}
+  // since-cxx11-error@-1 {{definition or redeclaration of 'A' not allowed inside a function}}
+  // since-cxx11-error@-2 {{typedef declarator cannot be qualified}}
 };
 #endif
 } // namespace cwg1820
@@ -296,14 +296,14 @@ struct A {
 };
 } // namespace cwg1821
 
-namespace cwg1822 { // cwg1822: yes
+namespace cwg1822 { // cwg1822: 3.1
 #if __cplusplus >= 201103L
   double a;
   auto x = [] (int a) {
     static_assert(__is_same(decltype(a), int), "should be resolved to lambda parameter");
   };
 #endif
-}
+} // namespace cwg1822
 
 namespace cwg1824 { // cwg1824: 2.7
 template<typename T>
@@ -325,9 +325,9 @@ enum E { // #cwg1832-E
 
 #if __cplusplus >= 201103L
 enum E2: decltype(static_cast<E2>(0), 0) {};
-// expected-error@-1 {{unknown type name 'E2'}}
+// since-cxx11-error@-1 {{unknown type name 'E2'}}
 enum class E3: decltype(static_cast<E3>(0), 0) {};
-// expected-error@-1 {{unknown type name 'E3'}}
+// since-cxx11-error@-1 {{unknown type name 'E3'}}
 #endif
 } // namespace cwg1832
 
@@ -373,7 +373,7 @@ namespace cwg1837 { // cwg1837: 3.3
     };
   };
 #endif
-}
+} // namespace cwg1837
 
 namespace cwg1862 { // cwg1862: no
 template<class T>
@@ -488,17 +488,14 @@ namespace cwg1872 { // cwg1872: 9
   // cxx11-17-error@-1 {{constexpr variable 'y2' must be initialized by a constant expression}}
   //   cxx11-17-note@-2 {{cannot evaluate call to virtual function in a constant expression in C++ standards before C++20}}
 #if __cplusplus >= 202002L
-  static_assert(y == 0);
+  static_assert(y2 == 0);
 #endif
   constexpr int z = A<Z>().f();
-  // since-cxx11-error@-1 {{constexpr variable 'z' must be initialized by a constant expression}}a
-#if __cplusplus < 202302L
-  //   since-cxx11-note@-3 {{non-literal type 'A<Z>' cannot be used in a constant expression}}
-#else
-  //   since-cxx23-note@-5 {{cannot construct object of type 'A<cwg1872::Z>' with virtual base class in a constant expression}}
-#endif
+  // since-cxx11-error@-1 {{constexpr variable 'z' must be initialized by a constant expression}}
+  //   cxx11-20-note@-2 {{non-literal type 'A<Z>' cannot be used in a constant expression}}
+  //   since-cxx23-note@-3 {{cannot construct object of type 'A<cwg1872::Z>' with virtual base class in a constant expression}}
 #endif
-}
+} // namespace cwg1872
 
 namespace cwg1878 { // cwg1878: 18
 #if __cplusplus >= 201402L
@@ -533,7 +530,7 @@ struct S {
 #endif
 };
 #endif
-}
+} // namespace cwg1878
 
 namespace cwg1881 { // cwg1881: 7
   struct A { int a : 4; };
@@ -545,7 +542,7 @@ namespace cwg1881 { // cwg1881: 7
   struct D : C { int : 0; };
   static_assert(__is_standard_layout(C), "");
   static_assert(!__is_standard_layout(D), "");
-}
+} // namespace cwg1881
 
 // cwg1884 is in cwg1884.cpp
 
@@ -585,9 +582,9 @@ void cwg1891() { // cwg1891: 4
   typedef decltype(a) A;
   typedef decltype(b) B;
 
-  static_assert(!__has_trivial_constructor(A), "");
+  static_assert(!__is_trivially_constructible(A), "");
   // since-cxx20-error@-1 {{failed}}
-  static_assert(!__has_trivial_constructor(B), "");
+  static_assert(!__is_trivially_constructible(B), "");
 
   // C++20 allows default construction for non-capturing lambdas (P0624R2).
   A x;
@@ -613,7 +610,7 @@ void cwg1891() { // cwg1891: 4
   // since-cxx11-error-re@-1 {{{{object of type '\(lambda at .+\)' cannot be assigned because its copy assignment operator is implicitly deleted}}}}
   //   since-cxx11-note@#cwg1891-b {{lambda expression begins here}}
 #endif
-}
+} // void cwg1891()
 
 namespace cwg1894 { // cwg1894: 3.8
                    // NB: reusing part of cwg407 test
@@ -641,7 +638,7 @@ namespace H {
   using namespace A;
   struct S s;
 }
-}
+} // namespace cwg1894
 
 namespace cwg1898 { // cwg1898: 2.7
 void e(int) {} // #cwg1898-e
diff --git a/clang/test/CXX/drs/cwg19xx.cpp b/clang/test/CXX/drs/cwg19xx.cpp
index 2fe4690..55a7c7cb 100644
--- a/clang/test/CXX/drs/cwg19xx.cpp
+++ b/clang/test/CXX/drs/cwg19xx.cpp
@@ -6,7 +6,9 @@
 // RUN: %clang_cc1 -std=c++23 %s -verify=expected,since-cxx14,since-cxx11 -fexceptions -fcxx-exceptions -pedantic-errors
 // RUN: %clang_cc1 -std=c++2c %s -verify=expected,since-cxx14,since-cxx11 -fexceptions -fcxx-exceptions -pedantic-errors
 
-namespace std { struct type_info; }
+namespace std {
+struct type_info;
+} // namespace std
 
 namespace cwg1900 { // cwg1900: 2.7
 // See the test for CWG1477 for detailed analysis
@@ -44,7 +46,7 @@ namespace cwg1902 { // cwg1902: 3.7
   //   since-cxx11-note@#cwg1902-B-A {{candidate constructor}}
   //   since-cxx11-note@#cwg1902-B-copy-ctor {{candidate constructor has been explicitly deleted}}
 #endif
-}
+} // namespace cwg1902
 
 namespace cwg1903 { // cwg1903: 2.7
   namespace A {
@@ -69,7 +71,7 @@ namespace cwg1903 { // cwg1903: 2.7
     using A::d;
     struct a *p;
   }
-}
+} // namespace cwg1903
 
 namespace cwg1909 { // cwg1909: 3.7
   struct A {
@@ -90,7 +92,7 @@ namespace cwg1909 { // cwg1909: 3.7
     // cxx98-error@-1 {{alias declarations are a C++11 extension}}
     // expected-error@-2 {{member 'D' has the same name as its class}}
   };
-}
+} // namespace cwg1909
 
 namespace cwg1918 { // cwg1918: no
 template<typename T> struct A {
@@ -125,7 +127,7 @@ static union {
   int not_empty;
 };
 #endif
-}
+} // namespace cwg1918
 
 namespace cwg1941 { // cwg1941: 3.9
 #if __cplusplus >= 201402L
@@ -155,7 +157,7 @@ struct iter {
 derived d1(it, end);
 derived d2(42, 9);
 #endif
-}
+} // namespace cwg1941
 
 namespace cwg1945 { // cwg1945: no
 template<typename T> struct A {
@@ -180,7 +182,7 @@ unsigned b = 0b'01;
 unsigned x = 0x'01;
 // since-cxx14-error@-1 {{invalid suffix 'x'01' on integer constant}}
 #endif
-}
+} // namespace cwg1947
 
 #if __cplusplus >= 201103L
 // cwg1948: 3.5
@@ -230,7 +232,7 @@ namespace cwg1959 { // cwg1959: 3.9
   // where the base class is reference-related to the argument type.
   c q(static_cast<c&&>(q));
 #endif
-}
+} // namespace cwg1959
 
 namespace cwg1960 { // cwg1960: no
 struct A {
@@ -251,7 +253,7 @@ struct C : B {
 using A::f;
 using A::g;
 };
-}
+} // namespace cwg1960
 
 namespace cwg1966 { // cwg1966: 11
 #if __cplusplus >= 201103L
@@ -280,7 +282,7 @@ namespace cwg1966 { // cwg1966: 11
     // since-cxx11-error@-3 {{anonymous bit-field cannot have a default member initializer}}
   };
 #endif
-}
+} // namespace cwg1966
 
 namespace cwg1968 { // cwg1968: no
 #if __cplusplus >= 201103L
@@ -291,7 +293,7 @@ namespace cwg1968 { // cwg1968: no
   constexpr const std::type_info *f() { return &typeid(int); }
   static_assert(f() == f(), "");
 #endif
-}
+} // namespace cwg1968
 
 namespace cwg1991 { // cwg1991: 3.9
 #if __cplusplus >= 201103L
@@ -309,6 +311,6 @@ namespace cwg1991 { // cwg1991: 3.9
   // of ambiguity.
   B b(0, 0); // ok, calls B constructor
 #endif
-}
+} // namespace cwg1991
 
 // cwg1994: dup 529
diff --git a/clang/test/CXX/drs/cwg1xx.cpp b/clang/test/CXX/drs/cwg1xx.cpp
index 6aec8b6..98eb86c 100644
--- a/clang/test/CXX/drs/cwg1xx.cpp
+++ b/clang/test/CXX/drs/cwg1xx.cpp
@@ -4,6 +4,7 @@
 // RUN: %clang_cc1 -std=c++17 -triple x86_64-unknown-unknown %s -verify=expected,since-cxx11,since-cxx17,cxx98-17 -fexceptions -fcxx-exceptions -pedantic-errors
 // RUN: %clang_cc1 -std=c++20 -triple x86_64-unknown-unknown %s -verify=expected,since-cxx11,since-cxx17 -fexceptions -fcxx-exceptions -pedantic-errors
 // RUN: %clang_cc1 -std=c++23 -triple x86_64-unknown-unknown %s -verify=expected,since-cxx11,since-cxx17 -fexceptions -fcxx-exceptions -pedantic-errors
+// RUN: %clang_cc1 -std=c++2c -triple x86_64-unknown-unknown %s -verify=expected,since-cxx11,since-cxx17 -fexceptions -fcxx-exceptions -pedantic-errors
 
 #if __cplusplus == 199711L
 #define static_assert(...) __extension__ _Static_assert(__VA_ARGS__)
@@ -16,7 +17,7 @@
 #define __enable_constant_folding
 #endif
 
-namespace cwg100 { // cwg100: yes
+namespace cwg100 { // cwg100: 2.7
   template<const char (*)[4]> struct A {}; // #cwg100-A
   template<const char (&)[4]> struct B {}; // #cwg100-B
   template<const char *> struct C {}; // #cwg100-C
@@ -37,7 +38,7 @@ namespace cwg100 { // cwg100: yes
   // cxx98-14-error@#cwg100-d {{non-type template argument does not refer to any declaration}}
   //   cxx98-14-note@#cwg100-D {{template parameter is declared here}}
   // since-cxx17-error@#cwg100-d {{reference to subobject of string literal is not allowed in a template argument}}
-}
+} // namespace cwg100
 
 namespace cwg101 { // cwg101: 3.5
   extern "C" void cwg101_f();
@@ -50,9 +51,9 @@ namespace cwg101 { // cwg101: 3.5
   using X::size_t;
   extern "C" void cwg101_f();
   typedef unsigned size_t;
-}
+} // namespace cwg101
 
-namespace cwg102 { // cwg102: yes
+namespace cwg102 { // cwg102: 2.7
   namespace A {
     template<typename T> T f(T a, T b) { return a + b; }
     // expected-error@-1 {{call to function 'operator+' that is neither visible in the template definition nor found by argument-dependent lookup}}
@@ -64,7 +65,7 @@ namespace cwg102 { // cwg102: yes
   }
   B::S operator+(B::S, B::S); // #cwg102-operator-plus
   template B::S A::f(B::S, B::S); // #cwg102-instantiation
-}
+} // namespace cwg102
 
 // cwg103: na
 // cwg104: na lib
@@ -84,12 +85,12 @@ namespace cwg106 { // cwg106: sup 540
   // expected-warning@-1 {{'const' qualifier on reference type 'r2' (aka 'const int &') has no effect}}
   typedef const r2 &r2;
   // expected-warning@-1 {{'const' qualifier on reference type 'r2' (aka 'const int &') has no effect}}
-}
+} // namespace cwg106
 
-namespace cwg107 { // cwg107: yes
+namespace cwg107 { // cwg107: 2.7
   struct S {};
   extern "C" S operator+(S, S) { return S(); }
-}
+} // namespace cwg107
 
 namespace cwg108 { // cwg108: 2.9
   template<typename T> struct A {
@@ -100,9 +101,9 @@ namespace cwg108 { // cwg108: 2.9
     // expected-error@-1 {{unknown type name 'X'}}
   };
   template<> struct A<int>::B { int X; };
-}
+} // namespace cwg108
 
-namespace cwg109 { // cwg109: yes
+namespace cwg109 { // cwg109: 2.8
   struct A { template<typename T> void f(T); };
   template<typename T> struct B : T {
     using T::template f;
@@ -117,7 +118,7 @@ namespace cwg109 { // cwg109: yes
     void g() { this->f<int>(123); }
     // expected-error@-1 {{use 'template' keyword to treat 'f' as a dependent template name}}
   };
-}
+} // namespace cwg109
 
 namespace cwg110 { // cwg110: 2.8
 template <typename T>
@@ -141,9 +142,9 @@ namespace cwg111 { // cwg111: dup 535
   // expected-error@-1 {{no matching constructor for initialization of 'B'}}
   //   expected-note@#cwg111-B {{candidate constructor (the implicit copy constructor) not viable: 1st argument ('const B') would lose const qualifier}}
   //   expected-note@#cwg111-B {{candidate constructor not viable: requires 0 arguments, but 1 was provided}}
-}
+} // namespace cwg111
 
-namespace cwg112 { // cwg112: yes
+namespace cwg112 { // cwg112: 3.1
   struct T { int n; };
   typedef T Arr[1];
 
@@ -161,9 +162,9 @@ namespace cwg112 { // cwg112: yes
   // cxx98-error@-1 {{non-type template argument referring to object 'a3' with internal linkage is a C++11 extension}}
   //   cxx98-note@#cwg112-a3 {{non-type template argument refers to object here}}
   X<a4> x4;
-}
+} // namespace cwg112
 
-namespace cwg113 { // cwg113: yes
+namespace cwg113 { // cwg113: 2.7
   extern void (*p)();
   void f() {
     no_such_function();
@@ -172,9 +173,9 @@ namespace cwg113 { // cwg113: yes
   }
   void g();
   void (*p)() = &g;
-}
+} // namespace cwg113
 
-namespace cwg114 { // cwg114: yes
+namespace cwg114 { // cwg114: 2.7
   struct A {
     virtual void f(int) = 0; // #cwg114-A-f
   };
@@ -184,7 +185,7 @@ namespace cwg114 { // cwg114: yes
   } b;
   // expected-error@-1 {{variable type 'struct B' is an abstract class}}
   //   expected-note@#cwg114-A-f {{unimplemented pure virtual method 'f' in 'B'}}
-}
+} // namespace cwg114
 
 namespace cwg115 { // cwg115: 3.0
   template<typename T> int f(T); // #cwg115-f
@@ -278,24 +279,24 @@ namespace cwg115 { // cwg115: 3.0
   // Special case kicks in only if a template argument list is specified.
   template<typename T=int> void with_default(); // #cwg115-with-default
   int k10 = f(&with_default);
-  // expected-error@-1 {{no matching function for call to 'f'}}
-  //   expected-note@#cwg115-f {{candidate template ignored: couldn't infer template argument 'T'}}
+  // since-cxx11-error@-1 {{no matching function for call to 'f'}}
+  //   since-cxx11-note@#cwg115-f {{candidate template ignored: couldn't infer template argument 'T'}}
   int k11 = f(&with_default<>);
   void k() {
     (void)&with_default;
-    // expected-error@-1 {{address of overloaded function 'with_default' cannot be cast to type 'void'}}
-    //   expected-note@#cwg115-with-default {{candidate function template}}
+    // since-cxx11-error@-1 {{address of overloaded function 'with_default' cannot be cast to type 'void'}}
+    //   since-cxx11-note@#cwg115-with-default {{candidate function template}}
     (void)&with_default<>;
     &with_default;
-    // expected-error@-1 {{reference to overloaded function could not be resolved; did you mean to call it?}}
-    //   expected-note@#cwg115-with-default {{possible target for call}}
+    // since-cxx11-error@-1 {{reference to overloaded function could not be resolved; did you mean to call it?}}
+    //   since-cxx11-note@#cwg115-with-default {{possible target for call}}
     &with_default<>;
-    // expected-warning@-1 {{expression result unused}}
+    // since-cxx11-warning@-1 {{expression result unused}}
   }
 #endif
-}
+} // namespace cwg115
 
-namespace cwg116 { // cwg116: yes
+namespace cwg116 { // cwg116: 2.7
   template<int> struct A {};
   template<int N> void f(A<N>) {} // #cwg116-f-N
   template<int M> void f(A<M>) {}
@@ -305,14 +306,14 @@ namespace cwg116 { // cwg116: yes
   template<typename U> void f(A<sizeof(U)>) {}
   // expected-error@-1 {{redefinition of 'f'}}
   //   expected-note@#cwg116-f-T {{previous definition is here}}
-}
+} // namespace cwg116
 
 // cwg117: na
 // cwg118 is in cwg118.cpp
 // cwg119: na
 // cwg120: na
 
-namespace cwg121 { // cwg121: yes
+namespace cwg121 { // cwg121: 2.7
   struct X {
     template<typename T> struct Y {};
   };
@@ -323,17 +324,17 @@ namespace cwg121 { // cwg121: yes
     // cxx98-17-error@-2 {{missing 'typename' prior to dependent type name T::Y; implicit 'typename' is a C++20 extension}}
   };
   Z<X> z;
-}
+} // namespace cwg121
 
-namespace cwg122 { // cwg122: yes
+namespace cwg122 { // cwg122: 2.7
   template<typename T> void f();
   void g() { f<int>(); }
-}
+} // namespace cwg122
 
 // cwg123: na
 // cwg124 is in cwg124.cpp
 
-// cwg125: yes
+// cwg125: 2.7
 struct cwg125_A { struct cwg125_B {}; }; // #cwg125_B
 cwg125_A::cwg125_B cwg125_C();
 namespace cwg125_B { cwg125_A cwg125_C(); }
@@ -348,7 +349,7 @@ namespace cwg125 {
     // since-cxx11-error@#cwg125_C {{'cwg125_B' is missing exception specification 'noexcept'}}
     //   since-cxx11-note@#cwg125_B {{previous declaration is here}}
   };
-}
+} // namespace cwg125
 
 namespace cwg126 { // cwg126: partial
   // FIXME: We do not yet generate correct code for this change:
@@ -459,7 +460,7 @@ namespace cwg126 { // cwg126: partial
   void f() throw(int);
   // since-cxx17-error@-1 {{ISO C++17 does not allow dynamic exception specifications}}
   //   since-cxx17-note@-2 {{use 'noexcept(false)' instead}}
-}
+} // namespace cwg126
 
 namespace cwg127 { // cwg127: 2.9
   __extension__ typedef __decltype(sizeof(0)) size_t;
@@ -475,12 +476,12 @@ namespace cwg127 { // cwg127: 2.9
   };
   A<void> *p = new A<void>; // #cwg127-p
   A<int> *q = new ("") A<int>; // #cwg127-q
-}
+} // namespace cwg127
 
-namespace cwg128 { // cwg128: yes
+namespace cwg128 { // cwg128: 2.7
   enum E1 { e1 } x = e1;
   enum E2 { e2 } y = static_cast<E2>(x), z = static_cast<E2>(e1);
-}
+} // namespace cwg128
 
 // cwg129: dup 616
 // cwg130: na
@@ -489,7 +490,7 @@ namespace cwg131 { // cwg131: sup P1949
   const char *a_with_\u0e8c = "\u0e8c";
   const char *b_with_\u0e8d = "\u0e8d";
   const char *c_with_\u0e8e = "\u0e8e";
-}
+} // namespace cwg131
 
 namespace cwg132 { // cwg132: no
   void f() {
@@ -497,18 +498,18 @@ namespace cwg132 { // cwg132: no
     extern struct S {} y; // FIXME: This is invalid.
   }
   static enum { E } e;
-}
+} // namespace cwg132
 
 // cwg133: dup 87
 // cwg134: na
 
-namespace cwg135 { // cwg135: yes
+namespace cwg135 { // cwg135: 2.7
   struct A {
     A f(A a) { return a; }
     friend A g(A a) { return a; }
     static A h(A a) { return a; }
   };
-}
+} // namespace cwg135
 
 namespace cwg136 { // cwg136: 3.4
   void f(int, int, int = 0); // #cwg136-f
@@ -556,9 +557,9 @@ namespace cwg136 { // cwg136: 3.4
     // expected-error@-1 {{friend declaration specifying a default argument must be the only declaration}}
     //   expected-note@#cwg136-B-f {{previous declaration is here}}
   };
-}
+} // namespace cwg136
 
-namespace cwg137 { // cwg137: yes
+namespace cwg137 { // cwg137: 2.7
   extern void *p;
   extern const void *cp;
   extern volatile void *vp;
@@ -580,7 +581,7 @@ namespace cwg137 { // cwg137: yes
   const volatile int *cvqc = static_cast<const volatile int*>(cp);
   const volatile int *cvqv = static_cast<const volatile int*>(vp);
   const volatile int *cvqcv = static_cast<const volatile int*>(cvp);
-}
+} // namespace cwg137
 
 namespace cwg138 { // cwg138: partial
 namespace example1 {
@@ -656,7 +657,7 @@ struct Data {
 } // namespace example3
 } // namespace cwg138
 
-namespace cwg139 { // cwg139: yes
+namespace cwg139 { // cwg139: 2.7
   namespace example1 {
     typedef int f; // #cwg139-typedef-f
     struct A {
@@ -676,16 +677,16 @@ namespace cwg139 { // cwg139: yes
       };
     }
   }
-}
+} // namespace cwg139
 
-namespace cwg140 { // cwg140: yes
+namespace cwg140 { // cwg140: 2.7
   void f(int *const) {} // #cwg140-f-first
   void f(int[3]) {}
   // expected-error@-1 {{redefinition of 'f'}}
   //   expected-note@#cwg140-f-first {{previous definition is here}}
   void g(const int);
   void g(int n) { n = 2; }
-}
+} // namespace cwg140
 
 namespace cwg141 { // cwg141: 3.1
   template<typename T> void f();
@@ -727,7 +728,7 @@ namespace cwg141 { // cwg141: 3.1
     template<typename T> void S();
   };
   void i() { C<X>().i(); } // ok!!
-}
+} // namespace cwg141
 
 namespace cwg142 { // cwg142: 2.8
   class B { // #cwg142-B
@@ -780,9 +781,9 @@ namespace cwg142 { // cwg142: 2.8
     cwg142::B *bp2 = (cwg142::B*)this;
     bp2->mi = 3;
   }
-}
+} // namespace cwg142
 
-namespace cwg143 { // cwg143: yes
+namespace cwg143 { // cwg143: 2.7
   namespace A { struct X; }
   namespace B { void f(A::X); }
   namespace A {
@@ -792,9 +793,9 @@ namespace cwg143 { // cwg143: yes
     f(x);
     // expected-error@-1 {{use of undeclared identifier 'f'}}
   }
-}
+} // namespace cwg143
 
-namespace cwg145 { // cwg145: yes
+namespace cwg145 { // cwg145: 2.7
   void f(bool b) {
     ++b;
     // cxx98-14-warning@-1 {{incrementing expression of type bool is deprecated and incompatible with C++17}}
@@ -803,9 +804,9 @@ namespace cwg145 { // cwg145: yes
     // cxx98-14-warning@-1 {{incrementing expression of type bool is deprecated and incompatible with C++17}}
     // since-cxx17-error@-2 {{ISO C++17 does not allow incrementing expression of type bool}}
   }
-}
+} // namespace cwg145
 
-namespace cwg147 { // cwg147: yes
+namespace cwg147 { // cwg147: 2.7
   namespace example1 {
     template<typename> struct A {
       template<typename T> A(T);
@@ -831,13 +832,13 @@ namespace cwg147 { // cwg147: yes
     template<> A<int>::A<int>(A<int>::a);
     // expected-error@-1 {{qualified reference to 'A' is a constructor name rather than a template name in this context}}
   }
-}
+} // namespace cwg147
 
-namespace cwg148 { // cwg148: yes
+namespace cwg148 { // cwg148: 2.7
   struct A { int A::*p; };
   static_assert(__is_pod(int(A::*)), "");
   static_assert(__is_pod(A), "");
-}
+} // namespace cwg148
 
 // cwg149: na
 
@@ -879,15 +880,15 @@ namespace cwg150 { // cwg150: 19
       b + p;
     }
   } // namespace n1
-}
+} // namespace cwg150
 
 namespace cwg151 { // cwg151: 3.1
   struct X {};
   typedef int X::*p;
   static_assert(__enable_constant_folding(p() == 0), "");
-}
+} // namespace cwg151
 
-namespace cwg152 { // cwg152: yes
+namespace cwg152 { // cwg152: 2.7
   struct A {
     A(); // #cwg152-A-ctor
     explicit A(const A&); // #cwg152-A-explicit-ctor
@@ -904,24 +905,25 @@ namespace cwg152 { // cwg152: yes
   //   expected-note@#cwg152-A-explicit-ctor {{explicit constructor is not a candidate}}
   //   expected-note@#cwg152-A-ctor {{candidate constructor not viable: requires 0 arguments, but 1 was provided}}
   A a4(f());
-}
+} // namespace cwg152
 
 // cwg153: na
 
-namespace cwg154 { // cwg154: yes
+namespace cwg154 { // cwg154: 2.7
   union { int a; };
   // expected-error@-1 {{nonymous unions at namespace or global scope must be declared 'static'}}
   namespace {
     union { int b; };
   }
   static union { int c; };
-}
+} // namespace cwg154
 
 namespace cwg155 { // cwg155: dup 632
   struct S { int n; } s = { { 1 } };
   // expected-warning@-1 {{braces around scalar initializer}}
-}
+} // namespace cwg155
 
+// cwg156: sup 1111
 // cwg158 is in cwg158.cpp
 
 namespace cwg159 { // cwg159: 3.5
@@ -930,7 +932,7 @@ namespace cwg159 { // cwg159: 3.5
   void cwg159::f() {}
   // expected-warning@-1 {{extra qualification on member 'f'}}
   void cwg159::X::f() {}
-}
+} // namespace cwg159
 
 // cwg160: na
 
@@ -969,7 +971,7 @@ namespace cwg161 { // cwg161: 3.1
       D::sf();
     }
   };
-}
+} // namespace cwg161
 
 namespace cwg162 { // cwg162: 19
   struct A {
@@ -986,11 +988,11 @@ namespace cwg162 { // cwg162: 19
   int &c = (&A::f)(0);
   char &d = (&A::f)('0');
   // expected-error@-1 {{non-const lvalue reference to type 'char' cannot bind to a value of unrelated type 'int'}}
-}
+} // namespace cwg162
 
 // cwg163: na
 
-namespace cwg164 { // cwg164: yes
+namespace cwg164 { // cwg164: 2.7
   void f(int);
   template <class T> int g(T t) { return f(t); }
 
@@ -998,7 +1000,7 @@ namespace cwg164 { // cwg164: yes
   int f(E);
 
   int k = g(e);
-}
+} // namespace cwg164
 
 namespace cwg165 { // cwg165: no
   namespace N {
@@ -1009,7 +1011,7 @@ namespace cwg165 { // cwg165: no
   struct N::B {};
   // FIXME: cwg165 says this is ill-formed, but the argument in cwg1477 says it's ok
   void N::g() {}
-}
+} // namespace cwg165
 
 namespace cwg166 { // cwg166: 2.9
   namespace A { class X; }
@@ -1041,7 +1043,7 @@ namespace cwg166 { // cwg166: 2.9
   int i(A::X x) { return x.n; }
   // expected-error@-1 {{'n' is a private member of 'cwg166::A::X'}}
   //   expected-note@#cwg166-X-n {{implicitly declared private here}}
-}
+} // namespace cwg166
 
 // cwg167: sup 1012
 
@@ -1053,9 +1055,9 @@ namespace cwg168 { // cwg168: no
   };
   p a = &S::f; // FIXME: this should fail.
   q b = &S::f;
-}
+} // namespace cwg168
 
-namespace cwg169 { // cwg169: yes
+namespace cwg169 { // cwg169: 3.4
   template<typename> struct A { int n; };
   struct B {
     template<typename> struct C;
@@ -1072,7 +1074,7 @@ namespace cwg169 { // cwg169: yes
     using B::n<int>;
     // expected-error@-1 {{using declaration cannot refer to a template specialization}}
   };
-}
+} // namespace cwg169
 
 namespace { // cwg171: 3.4
   int cwg171a;
@@ -1083,9 +1085,9 @@ namespace cwg171 {
   extern "C" void cwg171b();
   // expected-error@-1 {{declaration of 'cwg171b' with C language linkage conflicts with declaration in global scope}}
   //   expected-note@#cwg171b-int {{declared in global scope here}}
-}
+} // namespace cwg171
 
-namespace cwg172 { // cwg172: yes
+namespace cwg172 { // cwg172: 2.7
   enum { zero };
   static_assert(-1 < zero, "");
 
@@ -1117,13 +1119,13 @@ namespace cwg172 { // cwg172: yes
   // cxx98-error@-1 {{'long long' is a C++11 extension}}
   static_assert(sizeof(f) == sizeof(unsigned long), "");
   static_assert(-f > 0, "");
-}
+} // namespace cwg172
 
-namespace cwg173 { // cwg173: yes
+namespace cwg173 { // cwg173: 2.7
   static_assert('0' + 1 == '1' && '0' + 2 == '2' && '0' + 3 == '3' &&
                 '0' + 4 == '4' && '0' + 5 == '5' && '0' + 6 == '6' &&
                 '0' + 7 == '7' && '0' + 8 == '8' && '0' + 9 == '9', "");
-}
+} // namespace cwg173
 
 // cwg174: sup 1012
 
@@ -1137,7 +1139,7 @@ namespace cwg175 { // cwg175: 2.8
     //   expected-note@#cwg175-A {{member is declared here}}
     cwg175::A b;
   };
-}
+} // namespace cwg175
 
 namespace cwg176 { // cwg176: 3.1
   template<typename T> class Y;
@@ -1177,9 +1179,9 @@ namespace cwg176 { // cwg176: 3.1
     // since-cxx17-error@#cwg176-p4 {{use of class template 'cwg176::X' requires template arguments; argument deduction not allowed in non-static class member}}
     //  since-cxx17-note@#cwg176-X {{template is declared here}}
   };
-}
+} // namespace cwg176
 
-namespace cwg177 { // cwg177: yes
+namespace cwg177 { // cwg177: 2.7
   struct B {};
   struct A {
     A(A &); // #cwg177-A-copy-ctor
@@ -1198,9 +1200,9 @@ namespace cwg177 { // cwg177: yes
   C c = e;
   // expected-error@-1 {{no viable constructor copying variable of type 'D'}}
   //   expected-note@#cwg177-C-copy-ctor {{candidate constructor not viable: expects an lvalue for 1st argument}}
-}
+} // namespace cwg177
 
-namespace cwg178 { // cwg178: yes
+namespace cwg178 { // cwg178: 3.1
   static_assert(int() == 0, "");
 #if __cplusplus >= 201103L
   static_assert(int{} == 0, "");
@@ -1211,13 +1213,13 @@ namespace cwg178 { // cwg178: yes
   struct U : S { constexpr U() : S() {} };
   static_assert(U().b == 0, "");
 #endif
-}
+} // namespace cwg178
 
-namespace cwg179 { // cwg179: yes
+namespace cwg179 { // cwg179: 2.7
   void f();
   int n = &f - &f;
   // expected-error@-1 {{arithmetic on pointers to the function type 'void ()'}}
-}
+} // namespace cwg179
 
 namespace cwg180 { // cwg180: 2.8
   template<typename T> struct X : T, T::some_base {
@@ -1227,9 +1229,9 @@ namespace cwg180 { // cwg180: 2.8
       enum T::some_enum e;
     }
   };
-}
+} // namespace cwg180
 
-namespace cwg181 { // cwg181: yes
+namespace cwg181 { // cwg181: 2.7
   namespace X {
     template <template X<class T> > struct A { };
     // expected-error@-1 +{{}}
@@ -1241,7 +1243,7 @@ namespace cwg181 { // cwg181: yes
     template <template <class T> class X> struct A { };
     template <template <class T> class X> void f(A<X>) { }
   }
-}
+} // namespace cwg181
 
 namespace cwg182 { // cwg182: 14
   template <class T> struct C {
@@ -1264,7 +1266,7 @@ namespace cwg182 { // cwg182: 14
     C<B> cb;
     cb.f();
   }
-}
+} // namespace cwg182
 
 namespace cwg183 { // cwg183: sup 382
   template<typename T> struct A {};
@@ -1275,9 +1277,9 @@ namespace cwg183 { // cwg183: sup 382
     typename B<int>::X x;
     // cxx98-error@-1 {{'typename' occurs outside of a template}}
   };
-}
+} // namespace cwg183
 
-namespace cwg184 { // cwg184: yes
+namespace cwg184 { // cwg184: 2.7
   template<typename T = float> struct B {};
 
   template<template<typename TT = float> class T> struct A {
@@ -1298,7 +1300,7 @@ namespace cwg184 { // cwg184: yes
   }
 
   void h() { A<B>().g(); }
-}
+} // namespace cwg184
 
 // cwg185 is in cwg185.cpp
 
@@ -1307,18 +1309,35 @@ namespace cwg187 { // cwg187: sup 481
   template<int X = Z, int Z = X> struct A;
   typedef A<> T;
   typedef A<1, 1> T;
-}
+} // namespace cwg187
 
-namespace cwg188 { // cwg188: yes
+namespace cwg188 { // cwg188: 2.7
   char c[10];
   static_assert(sizeof(0, c) == 10, "");
-}
+} // namespace cwg188
+
+namespace cwg190 { // cwg190: 19
+struct A {
+  int a;
+  static double x;
+  int b;
+  void y();
+  int c;
+};
 
-// cwg190 FIXME: add codegen test for tbaa
-//              or implement C++20 std::is_layout_compatible and test it this way
+struct B {
+  int a;
+  void y();
+  int b;
+  static double x;
+  int c;
+};
+
+static_assert(__is_layout_compatible(A, B), "");
+} // namespace cwg190
 
 int cwg191_j;
-namespace cwg191 { // cwg191: yes
+namespace cwg191 { // cwg191: 2.7
   namespace example1 {
     struct outer {
       static int i;
@@ -1345,11 +1364,19 @@ namespace cwg191 { // cwg191: yes
       }
     };
   }
-}
+} // namespace cwg191
+
+namespace cwg192 { // cwg192: 2.7
+struct S {
+  void f(I i) { }
+  // expected-error@-1 {{unknown type name 'I'}}
+  typedef int I;
+};
+} // namespace cwg192
 
 // cwg193 is in cwg193.cpp
 
-namespace cwg194 { // cwg194: yes
+namespace cwg194 { // cwg194: 2.7
   struct A {
     A();
     void A();
@@ -1363,17 +1390,17 @@ namespace cwg194 { // cwg194: yes
   struct C {
     inline explicit C(int) {}
   };
-}
+} // namespace cwg194
 
-namespace cwg195 { // cwg195: yes
+namespace cwg195 { // cwg195: 2.7
   void f();
   int *p = (int*)&f;
   // cxx98-error@-1 {{cast between pointer-to-function and pointer-to-object is an extension}}
   void (*q)() = (void(*)())&p;
   // cxx98-error@-1 {{cast between pointer-to-function and pointer-to-object is an extension}}
-}
+} // namespace cwg195
 
-namespace cwg197 { // cwg197: yes
+namespace cwg197 { // cwg197: 2.7
   char &f(char);
 
   template <class T> void g(T t) {
@@ -1395,9 +1422,9 @@ namespace cwg197 { // cwg197: yes
     g(2);
     g(e); // #cwg197-g-e-call
   }
-}
+} // namespace cwg197
 
-namespace cwg198 { // cwg198: yes
+namespace cwg198 { // cwg198: 2.9
   struct A {
     int n;
     struct B {
@@ -1420,6 +1447,6 @@ namespace cwg198 { // cwg198: yes
     // cxx98-error@-1 {{invalid use of non-static data member 'n'}}
     int f() { return n; }
   };
-}
+} // namespace cwg198
 
 // cwg199 is in cwg199.cpp
diff --git a/clang/test/CXX/drs/cwg20xx.cpp b/clang/test/CXX/drs/cwg20xx.cpp
index b2dc5a9..141a101 100644
--- a/clang/test/CXX/drs/cwg20xx.cpp
+++ b/clang/test/CXX/drs/cwg20xx.cpp
@@ -22,7 +22,7 @@ int a = b2[0];
 int b = __builtin_addressof(b2)->foo;
 // cxx98-error@-1 {{no member named 'foo' in 'cwg2007::B<cwg2007::A<void> >'}}
 // since-cxx11-error@-2 {{no member named 'foo' in 'cwg2007::B<cwg2007::A<void>>'}}
-}
+} // namespace cwg2007
 
 // cwg2009: na
 
@@ -88,7 +88,7 @@ namespace cwg2026 { // cwg2026: 11
     //   since-cxx20-note@-3 {{read of object outside its lifetime is not allowed in a constant expression}}
 #endif
   }
-}
+} // namespace cwg2026
 
 namespace cwg2049 { // cwg2049: 18
 #if __cplusplus >= 202302L
@@ -97,9 +97,9 @@ X<> a;
 X<nullptr> b;
 static_assert(__is_same(decltype(a), decltype(b)));
 #endif
-}
+} // namespace cwg2049
 
-namespace cwg2061 { // cwg2061: yes
+namespace cwg2061 { // cwg2061: 2.7
 #if __cplusplus >= 201103L
   namespace A {
     inline namespace b {
@@ -125,7 +125,7 @@ namespace cwg2061 { // cwg2061: yes
     A::C::S<int> s;
   }
 #endif // C++11
-}
+} // namespace cwg2061
 
 namespace cwg2076 { // cwg2076: 13
 #if __cplusplus >= 201103L
@@ -172,14 +172,14 @@ namespace cwg2076 { // cwg2076: 13
     //   since-cxx11-note@#cwg2076-bar {{cannot convert initializer list}}
   }
 #endif
-}
+} // namespace cwg2076
 
 namespace cwg2082 { // cwg2082: 11
   void test1(int x, int = sizeof(x)); // ok
 #if __cplusplus >= 201103L
   void test2(int x, int = decltype(x){}); // ok
 #endif
-}
+} // namespace cwg2082
 
 namespace cwg2083 { // cwg2083: partial
 #if __cplusplus >= 201103L
@@ -399,7 +399,7 @@ namespace cwg2083 { // cwg2083: partial
     }
   }
 #endif
-}
+} // namespace cwg2083
 
 namespace cwg2091 { // cwg2091: 10
 template<int &> struct X;
@@ -448,6 +448,6 @@ namespace cwg2094 { // cwg2094: 5
 
   static_assert(__is_trivially_assignable(A, const A&), "");
   static_assert(__is_trivially_assignable(B, const B&), "");
-}
+} // namespace cwg2094
 
 // cwg2096: dup 2598
diff --git a/clang/test/CXX/drs/cwg21xx.cpp b/clang/test/CXX/drs/cwg21xx.cpp
index 2800228..42a7c4d 100644
--- a/clang/test/CXX/drs/cwg21xx.cpp
+++ b/clang/test/CXX/drs/cwg21xx.cpp
@@ -1,10 +1,10 @@
-// RUN: %clang_cc1 -std=c++98 -triple x86_64-unknown-unknown %s -verify=expected,cxx98-14,cxx98 -fexceptions -Wno-deprecated-builtins -fcxx-exceptions -pedantic-errors
-// RUN: %clang_cc1 -std=c++11 -triple x86_64-unknown-unknown %s -verify=expected,cxx98-14,since-cxx11 -fexceptions -Wno-deprecated-builtins -fcxx-exceptions -pedantic-errors
-// RUN: %clang_cc1 -std=c++14 -triple x86_64-unknown-unknown %s -verify=expected,cxx98-14,since-cxx11 -fexceptions -Wno-deprecated-builtins -fcxx-exceptions -pedantic-errors
-// RUN: %clang_cc1 -std=c++17 -triple x86_64-unknown-unknown %s -verify=expected,since-cxx11 -fexceptions -Wno-deprecated-builtins -fcxx-exceptions -pedantic-errors
-// RUN: %clang_cc1 -std=c++20 -triple x86_64-unknown-unknown %s -verify=expected,since-cxx11 -fexceptions -Wno-deprecated-builtins -fcxx-exceptions -pedantic-errors
-// RUN: %clang_cc1 -std=c++23 -triple x86_64-unknown-unknown %s -verify=expected,since-cxx11 -fexceptions -Wno-deprecated-builtins -fcxx-exceptions -pedantic-errors
-// RUN: %clang_cc1 -std=c++2c -triple x86_64-unknown-unknown %s -verify=expected,since-cxx11 -fexceptions -Wno-deprecated-builtins -fcxx-exceptions -pedantic-errors
+// RUN: %clang_cc1 -std=c++98 -triple x86_64-unknown-unknown %s -verify=expected,cxx98-14,cxx98 -fexceptions -fcxx-exceptions -pedantic-errors
+// RUN: %clang_cc1 -std=c++11 -triple x86_64-unknown-unknown %s -verify=expected,cxx98-14,since-cxx11 -fexceptions -fcxx-exceptions -pedantic-errors
+// RUN: %clang_cc1 -std=c++14 -triple x86_64-unknown-unknown %s -verify=expected,cxx98-14,since-cxx11 -fexceptions -fcxx-exceptions -pedantic-errors
+// RUN: %clang_cc1 -std=c++17 -triple x86_64-unknown-unknown %s -verify=expected,since-cxx11 -fexceptions -fcxx-exceptions -pedantic-errors
+// RUN: %clang_cc1 -std=c++20 -triple x86_64-unknown-unknown %s -verify=expected,since-cxx11 -fexceptions -fcxx-exceptions -pedantic-errors
+// RUN: %clang_cc1 -std=c++23 -triple x86_64-unknown-unknown %s -verify=expected,since-cxx11 -fexceptions -fcxx-exceptions -pedantic-errors
+// RUN: %clang_cc1 -std=c++2c -triple x86_64-unknown-unknown %s -verify=expected,since-cxx11 -fexceptions -fcxx-exceptions -pedantic-errors
 
 #if __cplusplus == 199711L
 #define static_assert(...) __extension__ _Static_assert(__VA_ARGS__)
@@ -64,9 +64,9 @@ namespace cwg2100 { // cwg2100: 12
   template<int N> struct Y<N> {
     static const int declared_later = 0;
   };
-}
+} // namespace cwg2100
 
-namespace cwg2103 { // cwg2103: yes
+namespace cwg2103 { // cwg2103: 2.7
   void f() {
     int a;
     int &r = a; // #cwg2103-r
@@ -79,7 +79,7 @@ namespace cwg2103 { // cwg2103: yes
       }
     };
   }
-}
+} // namespace cwg2103
 
 namespace cwg2120 { // cwg2120: 7
   struct A {};
@@ -90,7 +90,7 @@ namespace cwg2120 { // cwg2120: 7
   static_assert(__is_standard_layout(B), "");
   static_assert(__is_standard_layout(D), "");
   static_assert(!__is_standard_layout(E), "");
-}
+} // namespace cwg2120
 
 namespace cwg2126 { // cwg2126: 12
 #if __cplusplus >= 201103L
@@ -142,7 +142,7 @@ namespace cwg2126 { // cwg2126: 12
   //   since-cxx11-note@#cwg21260-j {{temporary created here}}
   static_assert(k.a.n == 1, "");
 #endif
-}
+} // namespace cwg2126
 
 namespace cwg2137 { // cwg2137: 20
 #if __cplusplus >= 201103L
@@ -177,7 +177,7 @@ namespace cwg2137 { // cwg2137: 20
 
   int z = g({ d });
 #endif
-}
+} // namespace cwg2137
 
 namespace cwg2140 { // cwg2140: 9
 #if __cplusplus >= 201103L
@@ -187,7 +187,7 @@ namespace cwg2140 { // cwg2140: 9
   }
   static_assert(!test({123}), "u.b should be valid even when b is inactive");
 #endif
-}
+} // namespace cwg2140
 
 namespace cwg2141 { // cwg2141: 17
 struct A{};
@@ -220,7 +220,7 @@ void foo() {
   // expected-error@-1 {{'E' cannot be defined in a type specifier}}
 
 }
-}
+} // namespace cwg2141
 
 // cwg2149 is in cwg2149.cpp
 
@@ -232,7 +232,7 @@ namespace cwg2157 { // cwg2157: 11
     // since-cxx11-error@-1 {{ISO C++ only allows ':' in member enumeration declaration to introduce a fixed underlying type, not an anonymous bit-field}}
   };
 #endif
-}
+} // namespace cwg2157
 
 // cwg2165: na
 
@@ -249,7 +249,7 @@ namespace cwg2170 { // cwg2170: 9
     };
   }
 #endif
-}
+} // namespace cwg2170
 
 namespace cwg2171 { // cwg2171: 15
 #if __cplusplus >= 201103L
@@ -259,13 +259,13 @@ struct NonConstCopy {
   NonConstCopy &operator=(NonConstCopy &) = default;
 };
 
-static_assert(__has_trivial_copy(NonConstCopy), "");
+static_assert(__is_trivially_copyable(NonConstCopy), "");
 static_assert(__is_trivially_constructible(NonConstCopy, NonConstCopy &), "");
 static_assert(!__is_trivially_constructible(NonConstCopy, NonConstCopy), "");
 static_assert(!__is_trivially_constructible(NonConstCopy, const NonConstCopy &), "");
 static_assert(!__is_trivially_constructible(NonConstCopy, NonConstCopy &&), "");
 
-static_assert(__has_trivial_assign(NonConstCopy), "");
+static_assert(__is_trivially_assignable(NonConstCopy, NonConstCopy &), "");
 static_assert(__is_trivially_assignable(NonConstCopy &, NonConstCopy &), "");
 static_assert(!__is_trivially_assignable(NonConstCopy &, const NonConstCopy &), "");
 static_assert(!__is_trivially_assignable(NonConstCopy &, NonConstCopy), "");
@@ -287,7 +287,7 @@ static_assert(!noexcept(typeid(*static_cast<D*>(nullptr))), "");
 #endif
 } // namespace cwg2191
 
-namespace cwg2180 { // cwg2180: yes
+namespace cwg2180 { // cwg2180: 3.0
   class A {
     A &operator=(const A &); // #cwg2180-A-copy
     A &operator=(A &&); // #cwg2180-A-move
@@ -315,7 +315,7 @@ namespace cwg2180 { // cwg2180: yes
   //   cxx98-note@#cwg2180-A-move {{implicitly declared private here}}
   // since-cxx11-error@#cwg2180-B-move {{defaulting this move assignment operator would delete it after its first declaration}}
   //   since-cxx11-note@#cwg2180-B {{move assignment operator of 'B' is implicitly deleted because base class 'A' has an inaccessible move assignment operator}}
-}
+} // namespace cwg2180
 
 namespace cwg2199 { // cwg2199: 3.8
                    // NB: reusing part of cwg407 test
@@ -343,4 +343,4 @@ namespace H {
   using namespace A;
   struct S s;
 }
-}
+} // namespace cwg2199
diff --git a/clang/test/CXX/drs/cwg22xx.cpp b/clang/test/CXX/drs/cwg22xx.cpp
index 0614d4f..d93070e 100644
--- a/clang/test/CXX/drs/cwg22xx.cpp
+++ b/clang/test/CXX/drs/cwg22xx.cpp
@@ -7,8 +7,8 @@
 // RUN: %clang_cc1 -std=c++2c -triple x86_64-unknown-unknown %s -verify=expected,since-cxx11 -fexceptions -fcxx-exceptions -pedantic-errors
 
 
-#if __cplusplus >= 201103L
 namespace cwg2211 { // cwg2211: 8
+#if __cplusplus >= 201103L
 void f() {
   int a;
   auto f = [a](int a) { (void)a; };
@@ -16,10 +16,10 @@ void f() {
   //   since-cxx11-note@-2 {{variable 'a' is explicitly captured here}}
   auto g = [=](int a) { (void)a; };
 }
-}
 #endif
+} // namespace cwg2211
 
-namespace cwg2213 { // cwg2213: yes
+namespace cwg2213 { // cwg2213: 2.7
 template <typename T, typename U>
 struct A;
 
@@ -41,7 +41,7 @@ struct AnonBitfieldQualifiers {
   volatile unsigned i2 : 1;
   const volatile unsigned i3 : 1;
 };
-}
+} // namespace cwg2229
 
 namespace cwg2233 { // cwg2233: 11
 #if __cplusplus >= 201103L
@@ -153,7 +153,7 @@ D d1(c);
 const D &d2{c}; // FIXME ill-formed
 const D &d3(c); // FIXME ill-formed
 #endif
-}
+} // namespace cwg2267
 
 namespace cwg2273 { // cwg2273: 3.3
 #if __cplusplus >= 201103L
@@ -170,7 +170,7 @@ B b;
 //   since-cxx11-note@#cwg2273-B {{default constructor of 'B' is implicitly deleted because base class 'A' has a deleted default constructor}}
 //   since-cxx11-note@#cwg2273-A {{'A' has been explicitly marked deleted here}}
 #endif
-}
+} // namespace cwg2273
 
 namespace cwg2277 { // cwg2277: partial
 #if __cplusplus >= 201103L
@@ -194,7 +194,7 @@ void g() {
   //   since-cxx11-note@#cwg2277-B-f {{candidate function}}
 }
 #endif
-}
+} // namespace cwg2277
 
 namespace cwg2292 { // cwg2292: 9
 #if __cplusplus >= 201103L
@@ -203,4 +203,4 @@ namespace cwg2292 { // cwg2292: 9
     p->template id<int>::~id<int>();
   }
 #endif
-}
+} // namespace cwg2292
diff --git a/clang/test/CXX/drs/cwg2335.cpp b/clang/test/CXX/drs/cwg2335.cpp
index 8b00a9d..805c272 100644
--- a/clang/test/CXX/drs/cwg2335.cpp
+++ b/clang/test/CXX/drs/cwg2335.cpp
@@ -1,14 +1,12 @@
-// RUN: %clang_cc1 -std=c++98 %s -verify=expected -fexceptions -fcxx-exceptions -pedantic-errors
-// RUN: %clang_cc1 -std=c++11 %s -verify=expected -fexceptions -fcxx-exceptions -pedantic-errors
+// RUN: %clang_cc1 -std=c++98 %s -verify=expected,cxx98-11 -fexceptions -fcxx-exceptions -pedantic-errors
+// RUN: %clang_cc1 -std=c++11 %s -verify=expected,cxx98-11 -fexceptions -fcxx-exceptions -pedantic-errors
 // RUN: %clang_cc1 -std=c++14 %s -verify=expected,since-cxx14 -fexceptions -fcxx-exceptions -pedantic-errors
 // RUN: %clang_cc1 -std=c++17 %s -verify=expected,since-cxx14 -fexceptions -fcxx-exceptions -pedantic-errors
 // RUN: %clang_cc1 -std=c++20 %s -verify=expected,since-cxx14 -fexceptions -fcxx-exceptions -pedantic-errors
 // RUN: %clang_cc1 -std=c++23 %s -verify=expected,since-cxx14 -fexceptions -fcxx-exceptions -pedantic-errors
 // RUN: %clang_cc1 -std=c++2c %s -verify=expected,since-cxx14 -fexceptions -fcxx-exceptions -pedantic-errors
 
-#if __cplusplus <= 201103L
-// expected-no-diagnostics
-#endif
+// cxx98-11-no-diagnostics
 
 namespace cwg2335 { // cwg2335: no drafting 2018-06
 // FIXME: current consensus is that the examples are well-formed.
diff --git a/clang/test/CXX/drs/cwg2353.cpp b/clang/test/CXX/drs/cwg2353.cpp
new file mode 100644
index 0000000..31dd5bd
--- /dev/null
+++ b/clang/test/CXX/drs/cwg2353.cpp
@@ -0,0 +1,41 @@
+// RUN: %clang_cc1 -std=c++98 -triple x86_64-unknown-unknown %s -verify=expected -fexceptions -fcxx-exceptions -pedantic-errors -ast-dump | FileCheck %s
+// RUN: %clang_cc1 -std=c++11 -triple x86_64-unknown-unknown %s -verify=expected -fexceptions -fcxx-exceptions -pedantic-errors -ast-dump | FileCheck %s
+// RUN: %clang_cc1 -std=c++14 -triple x86_64-unknown-unknown %s -verify=expected -fexceptions -fcxx-exceptions -pedantic-errors -ast-dump | FileCheck %s
+// RUN: %clang_cc1 -std=c++17 -triple x86_64-unknown-unknown %s -verify=expected -fexceptions -fcxx-exceptions -pedantic-errors -ast-dump | FileCheck %s
+// RUN: %clang_cc1 -std=c++20 -triple x86_64-unknown-unknown %s -verify=expected -fexceptions -fcxx-exceptions -pedantic-errors -ast-dump | FileCheck %s
+// RUN: %clang_cc1 -std=c++23 -triple x86_64-unknown-unknown %s -verify=expected -fexceptions -fcxx-exceptions -pedantic-errors -ast-dump | FileCheck %s
+// RUN: %clang_cc1 -std=c++2c -triple x86_64-unknown-unknown %s -verify=expected -fexceptions -fcxx-exceptions -pedantic-errors -ast-dump | FileCheck %s
+
+// expected-no-diagnostics
+
+namespace cwg2353 { // cwg2353: 9
+  struct X {
+    static const int n = 0;
+  };
+
+  // CHECK: FunctionDecl {{.*}} use
+  int use(X x) {
+    // CHECK: MemberExpr {{.*}} .n
+    // CHECK-NOT: non_odr_use
+    // CHECK: DeclRefExpr {{.*}} 'x'
+    // CHECK-NOT: non_odr_use
+    return *&x.n;
+  }
+#pragma clang __debug dump use
+
+  // CHECK: FunctionDecl {{.*}} not_use
+  int not_use(X x) {
+    // CHECK: MemberExpr {{.*}} .n {{.*}} non_odr_use_constant
+    // CHECK: DeclRefExpr {{.*}} 'x'
+    return x.n;
+  }
+#pragma clang __debug dump not_use
+
+  // CHECK: FunctionDecl {{.*}} not_use_2
+  int not_use_2(X *x) {
+    // CHECK: MemberExpr {{.*}} ->n {{.*}} non_odr_use_constant
+    // CHECK: DeclRefExpr {{.*}} 'x'
+    return x->n;
+  }
+#pragma clang __debug dump not_use_2
+} // namespace cwg2353
diff --git a/clang/test/CXX/drs/cwg23xx.cpp b/clang/test/CXX/drs/cwg23xx.cpp
index 7f57d23..d144cf9 100644
--- a/clang/test/CXX/drs/cwg23xx.cpp
+++ b/clang/test/CXX/drs/cwg23xx.cpp
@@ -1,10 +1,10 @@
-// RUN: %clang_cc1 -std=c++98 %s -verify=expected,cxx98 -fexceptions -fcxx-exceptions -pedantic-errors 2>&1 | FileCheck %s
-// RUN: %clang_cc1 -std=c++11 %s -verify=expected,cxx11-14,since-cxx11 -fexceptions -fcxx-exceptions -pedantic-errors 2>&1 | FileCheck %s
-// RUN: %clang_cc1 -std=c++14 %s -verify=expected,cxx11-14,since-cxx11,since-cxx14 -fexceptions -fcxx-exceptions -pedantic-errors 2>&1 | FileCheck %s
-// RUN: %clang_cc1 -std=c++17 %s -verify=expected,since-cxx11,since-cxx14,since-cxx17 -fexceptions -fcxx-exceptions -pedantic-errors 2>&1 | FileCheck %s
-// RUN: %clang_cc1 -std=c++20 %s -verify=expected,since-cxx11,since-cxx14,since-cxx17,since-cxx20 -fexceptions -fcxx-exceptions -pedantic-errors 2>&1 | FileCheck %s
-// RUN: %clang_cc1 -std=c++23 %s -verify=expected,since-cxx11,since-cxx14,since-cxx17,since-cxx20 -fexceptions -fcxx-exceptions -pedantic-errors 2>&1 | FileCheck %s
-// RUN: %clang_cc1 -std=c++2c %s -verify=expected,since-cxx11,since-cxx14,since-cxx17,since-cxx20 -fexceptions -fcxx-exceptions -pedantic-errors 2>&1 | FileCheck %s
+// RUN: %clang_cc1 -std=c++98 %s -verify=expected,cxx98 -fexceptions -fcxx-exceptions -pedantic-errors
+// RUN: %clang_cc1 -std=c++11 %s -verify=expected,cxx11-14,since-cxx11 -fexceptions -fcxx-exceptions -pedantic-errors
+// RUN: %clang_cc1 -std=c++14 %s -verify=expected,cxx11-14,since-cxx11,since-cxx14 -fexceptions -fcxx-exceptions -pedantic-errors
+// RUN: %clang_cc1 -std=c++17 %s -verify=expected,since-cxx11,since-cxx14,since-cxx17 -fexceptions -fcxx-exceptions -pedantic-errors
+// RUN: %clang_cc1 -std=c++20 %s -verify=expected,since-cxx11,since-cxx14,since-cxx17,since-cxx20 -fexceptions -fcxx-exceptions -pedantic-errors
+// RUN: %clang_cc1 -std=c++23 %s -verify=expected,since-cxx11,since-cxx14,since-cxx17,since-cxx20 -fexceptions -fcxx-exceptions -pedantic-errors
+// RUN: %clang_cc1 -std=c++2c %s -verify=expected,since-cxx11,since-cxx14,since-cxx17,since-cxx20 -fexceptions -fcxx-exceptions -pedantic-errors
 
 namespace std {
   __extension__ typedef __SIZE_TYPE__ size_t;
@@ -16,8 +16,8 @@ namespace std {
   };
 }
 
-#if __cplusplus >= 201103L
 namespace cwg2303 { // cwg2303: 12
+#if __cplusplus >= 201103L
 template <typename... T>
 struct A;
 template <>
@@ -54,8 +54,8 @@ void g() {
     struct cwg2303::F -> B -> A<int, int>
     struct cwg2303::F -> E -> A<int, int>}} */
 }
-} // namespace cwg2303
 #endif
+} // namespace cwg2303
 
 namespace cwg2304 { // cwg2304: 2.8
 template<typename T> void foo(T, int);
@@ -191,10 +191,10 @@ auto j = std::initializer_list<InitListCtor>{ i };
 // since-cxx17-error@-1 {{conversion function from 'std::initializer_list<InitListCtor>' to 'const cwg2311::InitListCtor' invokes a deleted function}}
 //   since-cxx17-note@#cwg2311-InitListCtor {{'InitListCtor' has been explicitly marked deleted here}}
 #endif
-}
+} // namespace cwg2311
 
-#if __cplusplus >= 201103L
 namespace cwg2338 { // cwg2338: 12
+#if __cplusplus >= 201103L
 namespace B {
 enum E : bool { Zero, One };
 static_assert((int)(E)2 == 1, "");
@@ -203,15 +203,15 @@ namespace D {
 enum class E : bool { Zero, One };
 static_assert((int)(E)2 == 1, "");
 } // namespace D
-} // namespace cwg2338
 #endif
+} // namespace cwg2338
 
 namespace cwg2346 { // cwg2346: 11
   void test() {
     const int i2 = 0;
     extern void h2b(int x = i2 + 0); // ok, not odr-use
   }
-}
+} // namespace cwg2346
 
 namespace cwg2351 { // cwg2351: 20
 #if __cplusplus >= 201103L
@@ -248,7 +248,7 @@ namespace cwg2351 { // cwg2351: 20
   //   cxx98-note@-2 {{to match this '('}}
   // cxx98-error@-3 {{expected expression}}
 #endif
-}
+} // namespace cwg2351
 
 namespace cwg2352 { // cwg2352: 10
   int **p;
@@ -284,39 +284,7 @@ namespace cwg2352 { // cwg2352: 10
 #if __cplusplus >= 201103L
   static_assert(&p == &check_f, "");
 #endif
-}
-
-namespace cwg2353 { // cwg2353: 9
-  struct X {
-    static const int n = 0;
-  };
-
-  // CHECK: FunctionDecl {{.*}} use
-  int use(X x) {
-    // CHECK: MemberExpr {{.*}} .n
-    // CHECK-NOT: non_odr_use
-    // CHECK: DeclRefExpr {{.*}} 'x'
-    // CHECK-NOT: non_odr_use
-    return *&x.n;
-  }
-#pragma clang __debug dump use
-
-  // CHECK: FunctionDecl {{.*}} not_use
-  int not_use(X x) {
-    // CHECK: MemberExpr {{.*}} .n {{.*}} non_odr_use_constant
-    // CHECK: DeclRefExpr {{.*}} 'x'
-    return x.n;
-  }
-#pragma clang __debug dump not_use
-
-  // CHECK: FunctionDecl {{.*}} not_use_2
-  int not_use_2(X *x) {
-    // CHECK: MemberExpr {{.*}} ->n {{.*}} non_odr_use_constant
-    // CHECK: DeclRefExpr {{.*}} 'x'
-    return x->n;
-  }
-#pragma clang __debug dump not_use_2
-}
+} // namespace cwg2352
 
 namespace cwg2354 { // cwg2354: 15
 #if __cplusplus >= 201103L
@@ -350,22 +318,22 @@ B b2 = static_cast<B&&>(b1);      // calls #3: #1, #2, and #4 are not viable
 struct C { operator B&&(); };
 B b3 = C();                       // calls #3
 #endif
-}
+} // namespace cwg2356
 
-#if __cplusplus >= 201402L
 namespace cwg2358 { // cwg2358: 16
+#if __cplusplus >= 201402L
   void f2() {
     int i = 1;
     void g1(int = [xxx=1] { return xxx; }());  // OK
     void g2(int = [xxx=i] { return xxx; }());
     // since-cxx14-error@-1 {{default argument references local variable 'i' of enclosing function}}
   }
-}
 #endif
+} // namespace cwg2358
 
 // CWG2363 was closed as NAD, but its resolution does affirm that
 // a friend declaration cannot have an opaque-enumm-specifier.
-namespace cwg2363 { // cwg2363: yes
+namespace cwg2363 { // cwg2363: 19
 #if __cplusplus >= 201103L
 enum class E0;
 enum E1 : int;
@@ -373,26 +341,26 @@ enum E1 : int;
 struct A {
   friend enum class E0;
   // since-cxx11-error@-1 {{reference to enumeration must use 'enum' not 'enum class'}}
-  // expected-error@-2 {{elaborated enum specifier cannot be declared as a friend}}
-  // expected-note@-3 {{remove 'enum class' to befriend an enum}}
+  // since-cxx11-error@-2 {{elaborated enum specifier cannot be declared as a friend}}
+  //   since-cxx11-note@-3 {{remove 'enum class' to befriend an enum}}
 
   friend enum E0;
-  // expected-error@-1 {{elaborated enum specifier cannot be declared as a friend}}
-  // expected-note@-2 {{remove 'enum' to befriend an enum}}
+  // since-cxx11-error@-1 {{elaborated enum specifier cannot be declared as a friend}}
+  //   since-cxx11-note@-2 {{remove 'enum' to befriend an enum}}
 
   friend enum class E1;
   // since-cxx11-error@-1 {{reference to enumeration must use 'enum' not 'enum class'}}
-  // expected-error@-2 {{elaborated enum specifier cannot be declared as a friend}}
-  // expected-note@-3 {{remove 'enum class' to befriend an enum}}
+  // since-cxx11-error@-2 {{elaborated enum specifier cannot be declared as a friend}}
+  //   since-cxx11-note@-3 {{remove 'enum class' to befriend an enum}}
 
   friend enum E1;
-  // expected-error@-1 {{elaborated enum specifier cannot be declared as a friend}}
-  // expected-note@-2 {{remove 'enum' to befriend an enum}}
+  // since-cxx11-error@-1 {{elaborated enum specifier cannot be declared as a friend}}
+  //   since-cxx11-note@-2 {{remove 'enum' to befriend an enum}}
 
   friend enum class E2;
   // since-cxx11-error@-1 {{reference to enumeration must use 'enum' not 'enum class'}}
-  // expected-error@-2 {{elaborated enum specifier cannot be declared as a friend}}
-  // expected-note@-3 {{remove 'enum class' to befriend an enum}}
+  // since-cxx11-error@-2 {{elaborated enum specifier cannot be declared as a friend}}
+  //   since-cxx11-note@-3 {{remove 'enum class' to befriend an enum}}
 };
 #endif
 } // namespace cwg2363
@@ -411,11 +379,11 @@ class C {
 };
 } // namespace cwg2370
 
-#if __cplusplus >= 201702L
+namespace cwg2386 { // cwg2386: 9
 // Otherwise, if the qualified-id std::tuple_size<E> names a complete class
 // type **with a member value**, the expression std::tuple_size<E>::value shall
 // be a well-formed integral constant expression
-namespace cwg2386 { // cwg2386: 9
+#if __cplusplus >= 201702L
 struct Bad1 { int a, b; };
 struct Bad2 { int a, b; };
 } // namespace cwg2386
@@ -430,8 +398,8 @@ namespace cwg2386 {
 void no_value() { auto [x, y] = Bad1(); }
 void wrong_value() { auto [x, y] = Bad2(); }
 // since-cxx17-error@-1 {{type 'Bad2' decomposes into 42 elements, but only 2 names were provided}}
-} // namespace cwg2386
 #endif
+} // namespace cwg2386
 
 // cwg2385: na
 
@@ -451,7 +419,7 @@ namespace cwg2387 { // cwg2387: 9
   extern template int d<int>;
   extern template const int d<const int>;
 #endif
-}
+} // namespace cwg2387
 
 namespace cwg2390 { // cwg2390: 14
 // Test that macro expansion of the builtin argument works.
@@ -499,7 +467,7 @@ const A a;
 struct B { const A a; };
 B b;
 
-}
+} // namespace cwg2394
 
 namespace cwg2396 { // cwg2396: no
   struct A {
@@ -515,16 +483,15 @@ namespace cwg2396 { // cwg2396: no
   // void f(A a) { a.operator B B::*(); }
   // void g(A a) { a.operator decltype(B()) B::*(); }
   // void g2(A a) { a.operator B decltype(B())::*(); }
-}
+} // namespace cwg2396
 
-#if __cplusplus >= 201103L
 namespace cwg2397 { // cwg2397: 17
+#if __cplusplus >= 201103L
   void foo() {
     int a[5];
 
     auto (&b)[5] = a;
     auto (*c)[5] = &a;
   }
-} // namespace cwg2397
-
 #endif
+} // namespace cwg2397
diff --git a/clang/test/CXX/drs/cwg24xx.cpp b/clang/test/CXX/drs/cwg24xx.cpp
index 79e9d03..9c9a3f1 100644
--- a/clang/test/CXX/drs/cwg24xx.cpp
+++ b/clang/test/CXX/drs/cwg24xx.cpp
@@ -2,9 +2,9 @@
 // RUN: %clang_cc1 -std=c++11 -pedantic-errors %s -verify=expected,cxx98-14
 // RUN: %clang_cc1 -std=c++14 -pedantic-errors %s -verify=expected,cxx98-14
 // RUN: %clang_cc1 -std=c++17 -pedantic-errors %s -verify=expected,since-cxx17
-// RUN: %clang_cc1 -std=c++20 -pedantic-errors %s -verify=expected,since-cxx17
-// RUN: %clang_cc1 -std=c++23 -pedantic-errors %s -verify=expected,since-cxx17
-// RUN: %clang_cc1 -std=c++2c -pedantic-errors %s -verify=expected,since-cxx17
+// RUN: %clang_cc1 -std=c++20 -pedantic-errors %s -verify=expected,since-cxx20,since-cxx17
+// RUN: %clang_cc1 -std=c++23 -pedantic-errors %s -verify=expected,since-cxx20,since-cxx17
+// RUN: %clang_cc1 -std=c++2c -pedantic-errors %s -verify=expected,since-cxx20,since-cxx17
 
 namespace cwg2406 { // cwg2406: 5
 #if __cplusplus >= 201703L
@@ -39,7 +39,7 @@ void fallthrough(int n) {
   }
 }
 #endif
-}
+} // namespace cwg2406
 
 namespace cwg2428 { // cwg2428: 19
 #if __cplusplus >= 202002L
@@ -48,45 +48,45 @@ concept C [[deprecated]] = true; // #cwg2428-C
 
 template <typename>
 [[deprecated]] concept C2 = true;
-// expected-error@-1 {{expected unqualified-id}}
+// since-cxx20-error@-1 {{expected unqualified-id}}
 
 template <typename T>
 concept C3 = C<T>;
-// expected-warning@-1 {{'C' is deprecated}}
-//   expected-note@#cwg2428-C {{'C' has been explicitly marked deprecated here}}
+// since-cxx20-warning@-1 {{'C' is deprecated}}
+//   since-cxx20-note@#cwg2428-C {{'C' has been explicitly marked deprecated here}}
 
 template <typename T, C U>
-// expected-warning@-1 {{'C' is deprecated}}
-//   expected-note@#cwg2428-C {{'C' has been explicitly marked deprecated here}}
+// since-cxx20-warning@-1 {{'C' is deprecated}}
+//   since-cxx20-note@#cwg2428-C {{'C' has been explicitly marked deprecated here}}
 requires C<T>
-// expected-warning@-1 {{'C' is deprecated}}
-//   expected-note@#cwg2428-C {{'C' has been explicitly marked deprecated here}}
+// since-cxx20-warning@-1 {{'C' is deprecated}}
+//   since-cxx20-note@#cwg2428-C {{'C' has been explicitly marked deprecated here}}
 void f() {
   bool b = C<int>;
-  // expected-warning@-1 {{'C' is deprecated}}
-  //   expected-note@#cwg2428-C {{'C' has been explicitly marked deprecated here}}
+  // since-cxx20-warning@-1 {{'C' is deprecated}}
+  //   since-cxx20-note@#cwg2428-C {{'C' has been explicitly marked deprecated here}}
 };
 
 void g(C auto a) {};
-// expected-warning@-1 {{'C' is deprecated}}
-//   expected-note@#cwg2428-C {{'C' has been explicitly marked deprecated here}}
+// since-cxx20-warning@-1 {{'C' is deprecated}}
+//   since-cxx20-note@#cwg2428-C {{'C' has been explicitly marked deprecated here}}
 
 template <typename T>
 auto h() -> C auto {
-// expected-warning@-1 {{'C' is deprecated}}
-//   expected-note@#cwg2428-C {{'C' has been explicitly marked deprecated here}}
+// since-cxx20-warning@-1 {{'C' is deprecated}}
+//   since-cxx20-note@#cwg2428-C {{'C' has been explicitly marked deprecated here}}
   C auto foo = T();
-  // expected-warning@-1 {{'C' is deprecated}}
-  //   expected-note@#cwg2428-C {{'C' has been explicitly marked deprecated here}}
+  // since-cxx20-warning@-1 {{'C' is deprecated}}
+  //   since-cxx20-note@#cwg2428-C {{'C' has been explicitly marked deprecated here}}
   C auto *bar = T();
-  // expected-warning@-1 {{'C' is deprecated}}
-  //   expected-note@#cwg2428-C {{'C' has been explicitly marked deprecated here}}
+  // since-cxx20-warning@-1 {{'C' is deprecated}}
+  //   since-cxx20-note@#cwg2428-C {{'C' has been explicitly marked deprecated here}}
   C auto &baz = T();
-  // expected-warning@-1 {{'C' is deprecated}}
-  //   expected-note@#cwg2428-C {{'C' has been explicitly marked deprecated here}}
+  // since-cxx20-warning@-1 {{'C' is deprecated}}
+  //   since-cxx20-note@#cwg2428-C {{'C' has been explicitly marked deprecated here}}
   C auto &&quux = T();
-  // expected-warning@-1 {{'C' is deprecated}}
-  //   expected-note@#cwg2428-C {{'C' has been explicitly marked deprecated here}}
+  // since-cxx20-warning@-1 {{'C' is deprecated}}
+  //   since-cxx20-note@#cwg2428-C {{'C' has been explicitly marked deprecated here}}
   return foo;
 }
 #endif
@@ -110,7 +110,7 @@ f<{.a= 0}>();
 }
 
 #endif
-}
+} // namespace cwg2450
 
 namespace cwg2459 { // cwg2459: 18
 #if __cplusplus >= 202302L
@@ -120,7 +120,7 @@ struct A {
 template<A> struct X {};
 X<1> x;
 #endif
-}
+} // namespace cwg2459
 
 namespace cwg2445 { // cwg2445: 19
 #if __cplusplus >= 202002L
@@ -181,7 +181,7 @@ namespace cwg2445 { // cwg2445: 19
     return d + 1;
   }
 #endif
-}
+} // namespace cwg2445
 
 namespace cwg2486 { // cwg2486: 4 c++17
 struct C {
diff --git a/clang/test/CXX/drs/cwg2504.cpp b/clang/test/CXX/drs/cwg2504.cpp
index fa775df..535ef9b 100644
--- a/clang/test/CXX/drs/cwg2504.cpp
+++ b/clang/test/CXX/drs/cwg2504.cpp
@@ -21,7 +21,7 @@ struct B : A {
 struct C : B {};
 void foo() { C c; } // bar is not invoked, because the V subobject is not initialized as part of B
 #endif
-}
+} // namespace cwg2504
 
 // FIXME: As specified in the comment above (which comes from an example in the Standard),
 //        we are not supposed to unconditionally call `bar()` and call a constructor
diff --git a/clang/test/CXX/drs/cwg25xx.cpp b/clang/test/CXX/drs/cwg25xx.cpp
index 87a7280..d9a7d2b 100644
--- a/clang/test/CXX/drs/cwg25xx.cpp
+++ b/clang/test/CXX/drs/cwg25xx.cpp
@@ -71,7 +71,7 @@ int test_specialization() {
 }
 #endif
 
-}
+} // namespace cwg2518
 
 namespace cwg2521 { // cwg2521: 17
 #if __cplusplus >= 201103L
@@ -86,8 +86,8 @@ operator""  _div();
 // since-cxx11-warning@-1 {{identifier '_div' preceded by whitespace in a literal operator declaration is deprecated}}
 
 using ::cwg2521::operator"" _\u03C0___;
+// since-cxx11-warning@-1 {{identifier '_π___' preceded by whitespace in a literal operator declaration is deprecated}}
 using ::cwg2521::operator""_div;
-// since-cxx11-warning@-2 {{identifier '_π___' preceded by whitespace in a literal operator declaration is deprecated}}
 
 long double operator"" _RESERVED(long double);
 // since-cxx11-warning@-1 {{identifier '_RESERVED' preceded by whitespace in a literal operator declaration is deprecated}}
@@ -120,8 +120,8 @@ struct S2 {
 #endif
 } // namespace cwg2547
 
-#if __cplusplus >= 202302L
 namespace cwg2553 { // cwg2553: 18 review 2023-07-14
+#if __cplusplus >= 202302L
 struct B {
   virtual void f(this B&); 
   // since-cxx23-error@-1 {{an explicit object parameter cannot appear in a virtual function}}
@@ -134,12 +134,11 @@ struct D : B {
   // since-cxx23-error@-1 {{an explicit object parameter cannot appear in a virtual function}}
   //   since-cxx23-note@#cwg2553-g {{overridden virtual function is here}}
 };
-
-}
 #endif
+} // namespace cwg2553
 
-#if __cplusplus >= 202302L
 namespace cwg2554 { // cwg2554: 18 review 2021-12-10
+#if __cplusplus >= 202302L
 struct B {
   virtual void f(); // #cwg2554-g
 };
@@ -161,12 +160,11 @@ struct D3 : B {
   // since-cxx23-error@-1 {{an explicit object parameter cannot appear in a virtual function}}
   //   since-cxx23-note@#cwg2554-g {{overridden virtual function is here}}
 };
-
-}
 #endif
+} // namespace cwg2554
 
-#if __cplusplus >= 202302L
 namespace cwg2561 { // cwg2561: no
+#if __cplusplus >= 202302L
 struct C {
     constexpr C(auto) { }
 };
@@ -178,10 +176,8 @@ void foo() {
     static_assert(fp(1) == 1);
     static_assert((&decltype(b)::operator())(1) == 1);
 }
-
-}
 #endif
-
+} // namespace cwg2561
 
 namespace cwg2565 { // cwg2565: 16 open 2023-06-07
 #if __cplusplus >= 202002L
@@ -210,7 +206,7 @@ namespace cwg2565 { // cwg2565: 16 open 2023-06-07
   struct TwoParamsStruct{};
 
   using TPSU = TwoParamsStruct<void, void>;
-  // since-cxx20-error@-1 {{constraints not satisfied for class template 'TwoParamsStruct'}}
+  // since-cxx20-error@-1 {{constraints not satisfied for class template 'TwoParamsStruct' [with T = void, U = void]}}
   //   since-cxx20-note@#cwg2565-TPSREQ {{because 'TwoParams<void, void>' evaluated to false}}
   //   since-cxx20-note@#cwg2565-TPC {{because 'b' would be invalid: argument may not have 'void' type}}
 
@@ -222,15 +218,15 @@ namespace cwg2565 { // cwg2565: 16 open 2023-06-07
   struct VariadicStruct{};
 
   using VSU = VariadicStruct<void, int, char, double>;
-  // since-cxx20-error@-1 {{constraints not satisfied for class template 'VariadicStruct'}}
+  // since-cxx20-error@-1 {{constraints not satisfied for class template 'VariadicStruct' [with T = void, U = <int, char, double>]}}
   //   since-cxx20-note@#cwg2565-VSREQ {{because 'Variadic<void, int, char, double>' evaluated to false}}
   //   since-cxx20-note@#cwg2565-VC {{because 'b' would be invalid: argument may not have 'void' type}}
 
   template<typename T>
   concept ErrorRequires = requires (ErrorRequires auto x) {
-  // since-cxx20-error@-1 {{a concept definition cannot refer to itself}} \
-  // since-cxx20-error@-1 {{'auto' not allowed in requires expression parameter}} \
-  // since-cxx20-note@-1 {{declared here}}
+  // since-cxx20-error@-1 {{a concept definition cannot refer to itself}}
+  //   since-cxx20-note@-2 {{declared here}}
+  // since-cxx20-error@-3 {{'auto' not allowed in requires expression parameter}}
     x;
   };
   static_assert(ErrorRequires<int>);
@@ -238,20 +234,20 @@ namespace cwg2565 { // cwg2565: 16 open 2023-06-07
   //   since-cxx20-note@-2 {{because substituted constraint expression is ill-formed: constraint depends on a previously diagnosed expression}}
 
   template<typename T>
-  concept NestedErrorInRequires = requires (T x) { //
-    // since-cxx20-note@-1 {{declared here}}
+  concept NestedErrorInRequires = requires (T x) { // #cwg2565-NEIR
     requires requires (NestedErrorInRequires auto y) {
-    // since-cxx20-error@-1 {{a concept definition cannot refer to itself}} \
-    // since-cxx20-error@-1 {{'auto' not allowed in requires expression parameter}}
+    // since-cxx20-error@-1 {{a concept definition cannot refer to itself}}
+    //   since-cxx20-note@#cwg2565-NEIR {{declared here}}
+    // since-cxx20-error@-3 {{'auto' not allowed in requires expression parameter}}
       y;
     };
   };
   static_assert(NestedErrorInRequires<int>);
-  // expected-error@-1 {{static assertion failed}}
-  //   expected-note@-2 {{because substituted constraint expression is ill-formed: constraint depends on a previously diagnosed expression}}
+  // since-cxx20-error@-1 {{static assertion failed}}
+  //   since-cxx20-note@-2 {{because substituted constraint expression is ill-formed: constraint depends on a previously diagnosed expression}}
 
 #endif
-}
+} // namespace cwg2565
 
 namespace cwg2583 { // cwg2583: 19
 #if __cplusplus >= 201103L
@@ -290,10 +286,10 @@ struct X {
   // e.g., "if an explicit object parameter is used it must be of type reference to 'X'"
   X& operator=(this int, const X&) = default;
   // since-cxx23-warning@-1 {{explicitly defaulted copy assignment operator is implicitly deleted}}
-  // since-cxx23-note@-2 {{function is implicitly deleted because its declared type does not match the type of an implicit copy assignment operator}}
+  //   since-cxx23-note@-2 {{function is implicitly deleted because its declared type does not match the type of an implicit copy assignment operator}}
   X& operator=(this X, const X&) = default;
   // since-cxx23-warning@-1 {{explicitly defaulted copy assignment operator is implicitly deleted}}
-  // since-cxx23-note@-2 {{function is implicitly deleted because its declared type does not match the type of an implicit copy assignment operator}}
+  //   since-cxx23-note@-2 {{function is implicitly deleted because its declared type does not match the type of an implicit copy assignment operator}}
 };
 struct Y {
   void operator=(this int, const Y&); // This is copy constructor, suppresses implicit declaration
@@ -372,8 +368,5 @@ union U {
 };
 static_assert(!__is_literal(U), "");
 #endif
-
-
-
 #endif
-}
+} // namespace cwg2598
diff --git a/clang/test/CXX/drs/cwg26xx.cpp b/clang/test/CXX/drs/cwg26xx.cpp
index 63a954c..efc49b0 100644
--- a/clang/test/CXX/drs/cwg26xx.cpp
+++ b/clang/test/CXX/drs/cwg26xx.cpp
@@ -33,7 +33,7 @@ namespace std {
   static_assert(sizeof(int16_t) == 2 && sizeof(int32_t) == 4 && sizeof(int64_t) == 8, "Some tests rely on these sizes");
 
   template<typename T> T declval();
-}
+} // namespace std
 
 namespace cwg2621 { // cwg2621: sup 2877
 #if __cplusplus >= 202002L
@@ -49,7 +49,7 @@ int E; // ignored by type-only lookup
 using enum E;
 }
 #endif
-}
+} // namespace cwg2621
 
 namespace cwg2627 { // cwg2627: 20
 #if __cplusplus >= 202002L
@@ -155,7 +155,7 @@ void f() {
   //   FIXME-since-cxx20-note@#cwg2628-ctor {{marked deleted here}} 
 }
 #endif
-}
+} // namespace cwg2628
 
 // cwg2630 is in cwg2630.cpp
 
@@ -175,7 +175,7 @@ namespace cwg2631 { // cwg2631: 16
     return k();
   }
 #endif
-}
+} // namespace cwg2631
 
 namespace cwg2635 { // cwg2635: 16
 #if __cplusplus >= 202002L
@@ -205,7 +205,7 @@ void TemplUse() {
   // since-cxx20-error@-1 {{decomposition declaration cannot be declared with constrained 'auto'}}
 }
 #endif
-}
+} // namespace cwg2635
 
 // cwg2636: na
 
@@ -230,7 +230,7 @@ int y = cwg2640_a\N{LOTUS});
 // expected-error@-1 {{character <U+1FAB7> not allowed in an identifier}}
 // expected-error@-2 {{use of undeclared identifier 'cwg2640_a🪷'}}
 // expected-error@-3 {{extraneous ')' before ';'}}
-}
+} // namespace cwg2640
 
 // cwg2642: na
 
@@ -243,10 +243,10 @@ auto z = [a = 42](int a) {
      return 1;
 };
 #endif
-}
+} // namespace cwg2644
 
-#if __cplusplus >= 202302L
 namespace cwg2650 { // cwg2650: 17
+#if __cplusplus >= 202302L
 template <class T, T> struct S {};
 template <class T> int f(S<T, T{}>*); // #cwg2650-f
 class X {
@@ -255,17 +255,17 @@ class X {
 int i0 = f<X>(0);
 // since-cxx23-error@-1 {{no matching function for call to 'f'}}
 //   since-cxx23-note@#cwg2650-f {{type 'X' of non-type template parameter is not a structural type}}
-}
 #endif
+} // namespace cwg2650
 
-#if __cplusplus >= 202302L
 namespace cwg2653 { // cwg2653: 18
+#if __cplusplus >= 202302L
   struct Test { void f(this const auto& = Test{}); };
   // since-cxx23-error@-1 {{the explicit object parameter cannot have a default argument}}
   auto L = [](this const auto& = Test{}){};
   // since-cxx23-error@-1 {{the explicit object parameter cannot have a default argument}}
-}
 #endif
+} // namespace cwg2653
 
 namespace cwg2654 { // cwg2654: 16
 void f() {
@@ -275,7 +275,7 @@ void f() {
     brachiosaur -= neck;                // OK
     brachiosaur |= neck;                // OK
 }
-}
+} // namespace cwg2654
 
 namespace cwg2681 { // cwg2681: 17
 #if __cplusplus >= 202002L
@@ -308,7 +308,7 @@ J j = { "ghi" };
 //   since-cxx20-note@#cwg2681-J {{candidate function template not viable: requires 0 arguments, but 1 was provided}}
 //   since-cxx20-note@#cwg2681-J {{implicit deduction guide declared as 'template <size_t N> J() -> J<N>'}}
 #endif
-}
+} // namespace cwg2681
 
 namespace cwg2672 { // cwg2672: 18
 #if __cplusplus >= 202002L
@@ -333,10 +333,10 @@ void m() {
   bar(0);
 }
 #endif
-}
+} // namespace cwg2672
 
-#if __cplusplus >= 202302L
 namespace cwg2687 { // cwg2687: 18
+#if __cplusplus >= 202302L
 struct S{
     void f(int);
     static void g(int);
@@ -349,9 +349,8 @@ void test() {
     (&S::g)(1);
     (&S::h)(S(), 1);
 }
-}
 #endif
-
+} // namespace cwg2687
 
 namespace cwg2692 { // cwg2692: 19
 #if __cplusplus >= 202302L
@@ -365,16 +364,13 @@ namespace cwg2692 { // cwg2692: 19
 
   void A::g() {
     (&A::f)(A());
-    // expected-error@-1 {{call to 'f' is ambiguous}}
-    // expected-note@#cwg2692-1 {{candidate}}
-    // expected-note@#cwg2692-2 {{candidate}}
-
-
-
+    // since-cxx23-error@-1 {{call to 'f' is ambiguous}}
+    //   since-cxx23-note@#cwg2692-1 {{candidate function}}
+    //   since-cxx23-note@#cwg2692-2 {{candidate function}}
     (&A::f)();
-    // expected-error@-1 {{no matching function for call to 'f'}}
-    // expected-note@#cwg2692-1 {{candidate function not viable: requires 1 argument, but 0 were provided}}
-    // expected-note@#cwg2692-2 {{candidate function not viable: requires 1 argument, but 0 were provided}}
+    // since-cxx23-error@-1 {{no matching function for call to 'f'}}
+    //   since-cxx23-note@#cwg2692-1 {{candidate function not viable: requires 1 argument, but 0 were provided}}
+    //   since-cxx23-note@#cwg2692-2 {{candidate function not viable: requires 1 argument, but 0 were provided}}
   }
 #endif
-}
+} // namespace cwg2692
diff --git a/clang/test/CXX/drs/cwg273.cpp b/clang/test/CXX/drs/cwg273.cpp
new file mode 100644
index 0000000..532f1a1
--- /dev/null
+++ b/clang/test/CXX/drs/cwg273.cpp
@@ -0,0 +1,24 @@
+// RUN: %clang_cc1 -std=c++98 %s -verify=expected -fexceptions -fcxx-exceptions -pedantic-errors
+// RUN: %clang_cc1 -std=c++11 %s -verify=expected -fexceptions -fcxx-exceptions -pedantic-errors
+// RUN: %clang_cc1 -std=c++14 %s -verify=expected -fexceptions -fcxx-exceptions -pedantic-errors
+// RUN: %clang_cc1 -std=c++17 %s -verify=expected -fexceptions -fcxx-exceptions -pedantic-errors
+// RUN: %clang_cc1 -std=c++20 %s -verify=expected -fexceptions -fcxx-exceptions -pedantic-errors
+// RUN: %clang_cc1 -std=c++23 %s -verify=expected -fexceptions -fcxx-exceptions -pedantic-errors
+// RUN: %clang_cc1 -std=c++2c %s -verify=expected -fexceptions -fcxx-exceptions -pedantic-errors
+
+// expected-no-diagnostics
+
+#include <stdarg.h>
+#include <stddef.h>
+namespace cwg273 { // cwg273: 2.7
+  struct A {
+    int n;
+  };
+  void operator&(A);
+  void f(A a, ...) {
+    offsetof(A, n);
+    va_list val;
+    va_start(val, a);
+    va_end(val);
+  }
+} // namespace cwg273
diff --git a/clang/test/CXX/drs/cwg2771.cpp b/clang/test/CXX/drs/cwg2771.cpp
index 474660a..2dd446a 100644
--- a/clang/test/CXX/drs/cwg2771.cpp
+++ b/clang/test/CXX/drs/cwg2771.cpp
@@ -1,4 +1,10 @@
-// RUN: %clang_cc1 -std=c++23 %s -ast-dump | FileCheck --check-prefixes=CXX23 %s
+// RUN: %clang_cc1 -std=c++98 %s -ast-dump | FileCheck %s
+// RUN: %clang_cc1 -std=c++11 %s -ast-dump | FileCheck %s
+// RUN: %clang_cc1 -std=c++14 %s -ast-dump | FileCheck %s
+// RUN: %clang_cc1 -std=c++17 %s -ast-dump | FileCheck %s
+// RUN: %clang_cc1 -std=c++20 %s -ast-dump | FileCheck %s
+// RUN: %clang_cc1 -std=c++23 %s -ast-dump | FileCheck %s
+// RUN: %clang_cc1 -std=c++2c %s -ast-dump | FileCheck %s
 
 namespace cwg2771 { // cwg2771: 18
 
@@ -8,12 +14,12 @@ struct A{
       int* r = &a;
     }
 };
-// CXX23: CXXMethodDecl{{.+}}cwg2771
-// CXX23-NEXT: CompoundStmt
-// CXX23-NEXT: DeclStmt
-// CXX23-NEXT: VarDecl
-// CXX23-NEXT: UnaryOperator
-// CXX23-NEXT: MemberExpr
-// CXX23-NEXT: CXXThisExpr{{.+}}'cwg2771::A *'
+// CHECK: CXXMethodDecl{{.+}}cwg2771
+// CHECK-NEXT: CompoundStmt
+// CHECK-NEXT: DeclStmt
+// CHECK-NEXT: VarDecl
+// CHECK-NEXT: UnaryOperator
+// CHECK-NEXT: MemberExpr
+// CHECK-NEXT: CXXThisExpr{{.+}}'cwg2771::A *'
 
 } // namespace cwg2771
diff --git a/clang/test/CXX/drs/cwg27xx.cpp b/clang/test/CXX/drs/cwg27xx.cpp
index fb5c8b1..a87d26d 100644
--- a/clang/test/CXX/drs/cwg27xx.cpp
+++ b/clang/test/CXX/drs/cwg27xx.cpp
@@ -204,7 +204,7 @@ void test() {
     //   since-cxx23-note@#cwg2789-g2 {{candidate function}}
 }
 #endif
-}
+} // namespace cwg2789
 
 namespace cwg2798 { // cwg2798: 17
 #if __cplusplus > 202302L
diff --git a/clang/test/CXX/drs/cwg28xx.cpp b/clang/test/CXX/drs/cwg28xx.cpp
index ff625a4..caa9b0f 100644
--- a/clang/test/CXX/drs/cwg28xx.cpp
+++ b/clang/test/CXX/drs/cwg28xx.cpp
@@ -1,10 +1,10 @@
 // RUN: %clang_cc1 -std=c++98 -pedantic-errors -verify=expected,cxx98 %s
-// RUN: %clang_cc1 -std=c++11 -pedantic-errors -verify=expected %s
-// RUN: %clang_cc1 -std=c++14 -pedantic-errors -verify=expected %s
-// RUN: %clang_cc1 -std=c++17 -pedantic-errors -verify=expected %s
-// RUN: %clang_cc1 -std=c++20 -pedantic-errors -verify=expected,since-cxx20 %s
-// RUN: %clang_cc1 -std=c++23 -pedantic-errors -verify=expected,since-cxx20,since-cxx23 %s
-// RUN: %clang_cc1 -std=c++2c -pedantic-errors -verify=expected,since-cxx20,since-cxx23,since-cxx26 %s
+// RUN: %clang_cc1 -std=c++11 -pedantic-errors -verify=expected,since-cxx11,cxx11-23 %s
+// RUN: %clang_cc1 -std=c++14 -pedantic-errors -verify=expected,since-cxx11,cxx11-23 %s
+// RUN: %clang_cc1 -std=c++17 -pedantic-errors -verify=expected,since-cxx11,cxx11-23 %s
+// RUN: %clang_cc1 -std=c++20 -pedantic-errors -verify=expected,since-cxx11,cxx11-23,since-cxx20 %s
+// RUN: %clang_cc1 -std=c++23 -pedantic-errors -verify=expected,since-cxx11,cxx11-23,since-cxx20,since-cxx23 %s
+// RUN: %clang_cc1 -std=c++2c -pedantic-errors -verify=expected,since-cxx11,since-cxx20,since-cxx23,since-cxx26 %s
 
 
 int main() {} // required for cwg2811
@@ -14,14 +14,14 @@ namespace cwg2811 { // cwg2811: 3.5
 void f() {
   (void)[&] {
     using T = decltype(main);
-    // expected-error@-1 {{referring to 'main' within an expression is a Clang extension}}
+    // since-cxx11-error@-1 {{referring to 'main' within an expression is a Clang extension}}
   };
   using T2 = decltype(main);
-  // expected-error@-1 {{referring to 'main' within an expression is a Clang extension}}
+  // since-cxx11-error@-1 {{referring to 'main' within an expression is a Clang extension}}
 }
 
 using T = decltype(main);
-// expected-error@-1 {{referring to 'main' within an expression is a Clang extension}}
+// since-cxx11-error@-1 {{referring to 'main' within an expression is a Clang extension}}
 
 int main();
 
@@ -47,14 +47,19 @@ void f() {
 #endif
 } // namespace cwg2813
 
-namespace cwg2819 { // cwg2819: 19 tentatively ready 2023-12-01
-
-#if __cpp_constexpr >= 202306L
+namespace cwg2819 { // cwg2819: 19 c++26
+#if __cplusplus >= 201103L
+  // CWG 2024-04-19: This issue is not a DR.
   constexpr void* p = nullptr;
-  constexpr int* q = static_cast<int*>(p);
-  static_assert(q == nullptr);
+  constexpr int* q = static_cast<int*>(p); // #cwg2819-q
+  // cxx11-23-error@-1 {{constexpr variable 'q' must be initialized by a constant expression}}
+  //   cxx11-23-note@-2 {{cast from 'void *' is not allowed in a constant expression}}
+  static_assert(q == nullptr, "");
+  // cxx11-23-error@-1 {{static assertion expression is not an integral constant expression}}
+  //   cxx11-23-note@-2 {{initializer of 'q' is not a constant expression}}
+  //   cxx11-23-note@#cwg2819-q {{declared here}}
 #endif
-}
+} // namespace cwg2819
 
 namespace cwg2847 { // cwg2847: 19 review 2024-03-01
 
@@ -145,7 +150,7 @@ struct A {
   // FIXME: The index of the pack-index-specifier is printed as a memory address in the diagnostic.
   template<typename U>
   friend struct Ts...[0]::C;
-  // expected-warning-re@-1 {{dependent nested name specifier 'Ts...[{{.*}}]::' for friend template declaration is not supported; ignoring this friend declaration}}
+  // since-cxx26-warning@-1 {{dependent nested name specifier 'Ts...[0]::' for friend template declaration is not supported; ignoring this friend declaration}}
 };
 
 #endif
@@ -169,9 +174,7 @@ void g() {
 } // namespace cwg2877
 
 namespace cwg2881 { // cwg2881: 19
-
 #if __cplusplus >= 202302L
-
 template <typename T> struct A : T {};
 template <typename T> struct B : T {};
 template <typename T> struct C : virtual T { C(T t) : T(t) {} };
@@ -183,12 +186,12 @@ struct O1 : A<Ts>, B<Ts> {
   using B<Ts>::operator();
 };
 
-template <typename Ts> struct O2 : protected Ts { // expected-note {{declared protected here}}
+template <typename Ts> struct O2 : protected Ts { // #cwg2881-O2
   using Ts::operator();
   O2(Ts ts) : Ts(ts) {}
 };
 
-template <typename Ts> struct O3 : private Ts { // expected-note {{declared private here}}
+template <typename Ts> struct O3 : private Ts { // #cwg2881-O3
   using Ts::operator();
   O3(Ts ts) : Ts(ts) {}
 };
@@ -212,7 +215,7 @@ struct O5 : private C<Ts>, D<Ts> {
 
 // This is only invalid if we call T's call operator.
 template <typename T, typename U>
-struct O6 : private T, U { // expected-note {{declared private here}}
+struct O6 : private T, U { // #cwg2881-O6
   using T::operator();
   using U::operator();
   O6(T t, U u) : T(t), U(u) {}
@@ -222,14 +225,26 @@ void f() {
   int x;
   auto L1 = [=](this auto&& self) { (void) &x; };
   auto L2 = [&](this auto&& self) { (void) &x; };
-  O1<decltype(L1)>{L1, L1}(); // expected-error {{inaccessible due to ambiguity}}
-  O1<decltype(L2)>{L2, L2}(); // expected-error {{inaccessible due to ambiguity}}
-  O2{L1}(); // expected-error {{must derive publicly from the lambda}}
-  O3{L1}(); // expected-error {{must derive publicly from the lambda}}
+  O1<decltype(L1)>{L1, L1}();
+  /* since-cxx23-error-re@-1 {{inaccessible due to ambiguity:
+    struct cwg2881::O1<class (lambda at {{.+}})> -> A<(lambda at {{.+}})> -> class (lambda at {{.+}})
+    struct cwg2881::O1<class (lambda at {{.+}})> -> B<(lambda at {{.+}})> -> class (lambda at {{.+}})}}*/
+  O1<decltype(L2)>{L2, L2}();
+  /* since-cxx23-error-re@-1 {{inaccessible due to ambiguity:
+    struct cwg2881::O1<class (lambda at {{.+}})> -> A<(lambda at {{.+}})> -> class (lambda at {{.+}})
+    struct cwg2881::O1<class (lambda at {{.+}})> -> B<(lambda at {{.+}})> -> class (lambda at {{.+}})}}*/
+  O2{L1}();
+  // since-cxx23-error-re@-1 {{invalid explicit object parameter type 'cwg2881::O2<(lambda at {{.+}})>' in lambda with capture; the type must derive publicly from the lambda}}
+  //   since-cxx23-note@#cwg2881-O2 {{declared protected here}}
+  O3{L1}();
+  // since-cxx23-error-re@-1 {{invalid explicit object parameter type 'cwg2881::O3<(lambda at {{.+}})>' in lambda with capture; the type must derive publicly from the lambda}}
+  //   since-cxx23-note@#cwg2881-O3 {{declared private here}}
   O4{L1}();
   O5{L1}();
   O6 o{L1, L2};
-  o.decltype(L1)::operator()(); // expected-error {{must derive publicly from the lambda}}
+  o.decltype(L1)::operator()();
+  // since-cxx23-error-re@-1 {{invalid explicit object parameter type 'cwg2881::O6<(lambda at {{.+}}), (lambda at {{.+}})>' in lambda with capture; the type must derive publicly from the lambda}}
+  //   since-cxx23-note@#cwg2881-O6 {{declared private here}}
   o.decltype(L1)::operator()(); // No error here because we've already diagnosed this method.
   o.decltype(L2)::operator()();
 }
@@ -238,12 +253,14 @@ void f2() {
   int x = 0;
   auto lambda = [x] (this auto self) { return x; };
   using Lambda = decltype(lambda);
-  struct D : private Lambda { // expected-note {{declared private here}}
+  struct D : private Lambda { // #cwg2881-D
     D(Lambda l) : Lambda(l) {}
     using Lambda::operator();
     friend Lambda;
   } d(lambda);
-  d(); // expected-error {{must derive publicly from the lambda}}
+  d();
+  // since-cxx23-error@-1 {{invalid explicit object parameter type 'D' in lambda with capture; the type must derive publicly from the lambda}}
+  //   since-cxx23-note@#cwg2881-D {{declared private here}}
 }
 
 template <typename L>
@@ -258,18 +275,20 @@ struct Indirect : T {
 };
 
 template<typename T>
-struct Ambiguous : Indirect<T>, T { // expected-warning {{is inaccessible due to ambiguity}}
+struct Ambiguous : Indirect<T>, T {
+/* since-cxx23-warning-re@-1 {{direct base '(lambda at {{.+}})' is inaccessible due to ambiguity:
+    struct cwg2881::Ambiguous<class (lambda at {{.+}})> -> Indirect<(lambda at {{.+}})> -> class (lambda at {{.+}})
+    struct cwg2881::Ambiguous<class (lambda at {{.+}})> -> class (lambda at {{.+}})}}*/
+//   since-cxx23-note-re@#cwg2881-f4 {{in instantiation of template class 'cwg2881::Ambiguous<(lambda at {{.+}})>' requested here}}
+//   since-cxx34-note-re@#cwg2881-f4-call {{while substituting deduced template arguments into function template 'f4' [with L = (lambda at {{.+}})]}}
   using Indirect<T>::operator();
 };
 
 template <typename L>
-constexpr auto f3(L l) -> decltype(Private<L>{l}()) { return l(); }
-// expected-note@-1 {{must derive publicly from the lambda}}
+constexpr auto f3(L l) -> decltype(Private<L>{l}()) { return l(); } // #cwg2881-f3
 
 template <typename L>
-constexpr auto f4(L l) -> decltype(Ambiguous<L>{{l}, l}()) { return l(); }
-// expected-note@-1 {{is inaccessible due to ambiguity}}
-// expected-note@-2 {{in instantiation of template class}}
+constexpr auto f4(L l) -> decltype(Ambiguous<L>{{l}, l}()) { return l(); } // #cwg2881-f4
 
 template<typename T>
 concept is_callable = requires(T t) { { t() }; };
@@ -277,15 +296,19 @@ concept is_callable = requires(T t) { { t() }; };
 void g() {
   int x = 0;
   auto lambda = [x](this auto self) {};
-  f3(lambda); // expected-error {{no matching function for call to 'f3'}}
-  f4(lambda); // expected-error {{no matching function for call to 'f4'}}
-  // expected-note@-1 {{while substituting deduced template arguments into function template 'f4'}}
+  f3(lambda);
+  // since-cxx23-error@-1 {{no matching function for call to 'f3'}}
+  //   since-cxx23-note-re@#cwg2881-f3 {{candidate template ignored: substitution failure [with L = (lambda at {{.+}})]: invalid explicit object parameter type 'cwg2881::Private<(lambda at {{.+}})>' in lambda with capture; the type must derive publicly from the lambda}}
+  f4(lambda); // #cwg2881-f4-call
+  // expected-error@-1 {{no matching function for call to 'f4'}}
+  //   expected-note-re@-2 {{while substituting deduced template arguments into function template 'f4' [with L = (lambda at {{.+}})]}}
+  /*   expected-note-re@#cwg2881-f4 {{candidate template ignored: substitution failure [with L = (lambda at {{.+}})]: lambda '(lambda at {{.+}})' is inaccessible due to ambiguity:
+    struct cwg2881::Ambiguous<class (lambda at {{.+}})> -> Indirect<(lambda at {{.+}})> -> class (lambda at {{.+}})
+    struct cwg2881::Ambiguous<class (lambda at {{.+}})> -> class (lambda at {{.+}})}}*/
   static_assert(!is_callable<Private<decltype(lambda)>>);
   static_assert(!is_callable<Ambiguous<decltype(lambda)>>);
 }
-
 #endif
-
 } // namespace cwg2881
 
 namespace cwg2882 { // cwg2882: 2.7
diff --git a/clang/test/CXX/drs/cwg29xx.cpp b/clang/test/CXX/drs/cwg29xx.cpp
index 9629bdd..aeb62b8 100644
--- a/clang/test/CXX/drs/cwg29xx.cpp
+++ b/clang/test/CXX/drs/cwg29xx.cpp
@@ -1,12 +1,14 @@
 // RUN: %clang_cc1 -std=c++98 -pedantic-errors -verify=expected,cxx98 %s
-// RUN: %clang_cc1 -std=c++11 -pedantic-errors -verify=expected %s
-// RUN: %clang_cc1 -std=c++14 -pedantic-errors -verify=expected %s
-// RUN: %clang_cc1 -std=c++17 -pedantic-errors -verify=expected %s
-// RUN: %clang_cc1 -std=c++20 -pedantic-errors -verify=expected %s
-// RUN: %clang_cc1 -std=c++23 -pedantic-errors -verify=expected %s
-// RUN: %clang_cc1 -std=c++2c -pedantic-errors -verify=expected %s
+// RUN: %clang_cc1 -std=c++11 -pedantic-errors -verify=expected,since-cxx11 %s
+// RUN: %clang_cc1 -std=c++14 -pedantic-errors -verify=expected,since-cxx11 %s
+// RUN: %clang_cc1 -std=c++17 -pedantic-errors -verify=expected,since-cxx11 %s
+// RUN: %clang_cc1 -std=c++20 -pedantic-errors -verify=expected,since-cxx11,since-cxx20 %s
+// RUN: %clang_cc1 -std=c++23 -pedantic-errors -verify=expected,since-cxx11,since-cxx20,since-cxx23 %s
+// RUN: %clang_cc1 -std=c++2c -pedantic-errors -verify=expected,since-cxx11,since-cxx20,since-cxx23,since-cxx26 %s
 
-namespace cwg2913 { // cwg2913: 20 tentatively ready 2024-08-16
+// cxx98-no-diagnostics
+
+namespace cwg2913 { // cwg2913: 20
 
 #if __cplusplus >= 202002L
 
@@ -20,60 +22,66 @@ template<typename T>
 R(T) -> R<T> requires true;
 
 template<typename T>
-R(T, T) requires true -> R<T>; // expected-error {{expected function body after function declarator}}
+R(T, T) requires true -> R<T>;
+// since-cxx20-error@-1 {{expected function body after function declarator}}
 
 #endif
 
 } // namespace cwg2913
 
-namespace cwg2915 { // cwg2915: 20 tentatively ready 2024-08-16
+namespace cwg2915 { // cwg2915: 20
 #if __cplusplus >= 202302L
 struct A {
-  void f(this void); // expected-error {{explicit object parameter cannot have 'void' type}}
+  void f(this void);
+  // since-cxx23-error@-1 {{explicit object parameter cannot have 'void' type}}
 };
 #endif
-}
+} // namespace cwg2915
 
 namespace cwg2917 { // cwg2917: 20 review 2024-07-30
+#if __cplusplus >= 201103L
 template <typename>
 class Foo;
 
-template<class ...> // cxx98-error {{variadic templates are a C++11 extension}}
+template<class ...>
 struct C {
   struct Nested { };
 };
 
 struct S {
   template <typename>
-  friend class Foo, int; // expected-error {{a friend declaration that befriends a template must contain exactly one type-specifier}}
+  friend class Foo, int;
+  // since-cxx11-error@-1 {{a friend declaration that befriends a template must contain exactly one type-specifier}}
 
-  template <typename ...Ts> // cxx98-error {{variadic templates are a C++11 extension}}
-  friend class C<Ts>::Nested...; // expected-error {{friend declaration expands pack 'Ts' that is declared it its own template parameter list}}
+  template <typename ...Ts>
+  friend class C<Ts>::Nested...;
+  // since-cxx11-error@-1 {{friend declaration expands pack 'Ts' that is declared it its own template parameter list}}
 };
+#endif
 } // namespace cwg2917
 
-#if __cplusplus >= 202400L
-
+#if __cplusplus > 202302L
 namespace std {
   using size_t = decltype(sizeof(0));
-};
+} // namespace std
 void *operator new(std::size_t, void *p) { return p; }
-void* operator new[] (std::size_t, void* p) {return p;}
-
+void* operator new[] (std::size_t, void* p) {return p; }
+#endif
 
-namespace cwg2922 { // cwg2922: 20 tentatively ready 2024-07-10
+namespace cwg2922 { // cwg2922: 20
+#if __cplusplus > 202302L
 union U { int a, b; };
 constexpr U nondeterministic(bool i) {
   if(i) {
     U u;
-    new (&u) int();
-    // expected-note@-1 {{placement new would change type of storage from 'U' to 'int'}}
+    new (&u) int(); // #cwg2922-placement-new
     return u;
   }
   return {};
 }
 constexpr U _ = nondeterministic(true);
-// expected-error@-1 {{constexpr variable '_' must be initialized by a constant expression}} \
-// expected-note@-1 {{in call to 'nondeterministic(true)'}}
-}
+// since-cxx26-error@-1 {{constexpr variable '_' must be initialized by a constant expression}}
+//   since-cxx26-note@#cwg2922-placement-new {{placement new would change type of storage from 'U' to 'int'}}
+//   since-cxx26-note@-3 {{in call to 'nondeterministic(true)'}}
 #endif
+} // namespace cwg2922
diff --git a/clang/test/CXX/drs/cwg2xx.cpp b/clang/test/CXX/drs/cwg2xx.cpp
index ec37b42..0d644ba 100644
--- a/clang/test/CXX/drs/cwg2xx.cpp
+++ b/clang/test/CXX/drs/cwg2xx.cpp
@@ -2,8 +2,9 @@
 // RUN: %clang_cc1 -std=c++11 %s -verify=expected,since-cxx11,cxx98-11,cxx98-14,cxx98-17 -fexceptions -fcxx-exceptions -pedantic-errors
 // RUN: %clang_cc1 -std=c++14 %s -verify=expected,since-cxx11,since-cxx14,cxx98-14,cxx98-17 -fexceptions -fcxx-exceptions -pedantic-errors
 // RUN: %clang_cc1 -std=c++17 %s -verify=expected,since-cxx11,since-cxx14,since-cxx17,cxx98-17 -fexceptions -fcxx-exceptions -pedantic-errors
-// RUN: %clang_cc1 -std=c++20 %s -verify=expected,since-cxx11,since-cxx14,since-cxx17 -fexceptions -fcxx-exceptions -pedantic-errors
-// RUN: %clang_cc1 -std=c++23 %s -verify=expected,since-cxx11,since-cxx14,since-cxx17 -fexceptions -fcxx-exceptions -pedantic-errors
+// RUN: %clang_cc1 -std=c++20 %s -verify=expected,since-cxx11,since-cxx14,since-cxx17,since-cxx20 -fexceptions -fcxx-exceptions -pedantic-errors
+// RUN: %clang_cc1 -std=c++23 %s -verify=expected,since-cxx11,since-cxx14,since-cxx17,since-cxx20 -fexceptions -fcxx-exceptions -pedantic-errors
+// RUN: %clang_cc1 -std=c++2c %s -verify=expected,since-cxx11,since-cxx14,since-cxx17,since-cxx20 -fexceptions -fcxx-exceptions -pedantic-errors
 
 // FIXME: diagnostic above is emitted only on Windows platforms
 // PR13819 -- __SIZE_TYPE__ is incompatible.
@@ -29,7 +30,7 @@ namespace cwg200 { // cwg200: dup 214
   void g() {
     f<int>(1);
   }
-}
+} // namespace cwg200
 
 // cwg201 is in cwg201.cpp
 
@@ -39,11 +40,146 @@ namespace cwg202 { // cwg202: 3.1
     static_assert(__enable_constant_folding(g == &f<int>), "");
   };
   template struct X<f>;
+} // namespace cwg202
+
+namespace cwg203 { // cwg203: 3.0
+namespace ex1 {
+struct B {
+  int i;
+};
+struct D1 : B {};
+struct D2 : B {};
+
+int(D1::*pmD1) = &D2::i;
+} // namespace ex1
+
+#if __cplusplus >= 202002L
+namespace ex2 {
+struct A {
+  int i;
+  virtual void f() = 0; // #cwg203-ex2-A-f
+};
+
+struct B : A {
+  int j;
+  constexpr B() : j(5) {}
+  virtual void f();
+};
+
+struct C : B {
+  constexpr C() { j = 10; }
+};
+
+template <class T>
+constexpr int DefaultValue(int(T::*m)) {
+  return T().*m;
+  // since-cxx20-error@-1 {{allocating an object of abstract class type 'cwg203::ex2::A'}}
+  //   since-cxx20-note@#cwg203-ex2-a {{in instantiation of function template specialization 'cwg203::ex2::DefaultValue<cwg203::ex2::A>' requested here}}
+  //   since-cxx20-note@#cwg203-ex2-A-f {{unimplemented pure virtual method 'f' in 'A'}}
+} // #cwg203-ex2-DefaultValue
+
+int a = DefaultValue(&B::i); // #cwg203-ex2-a
+static_assert(DefaultValue(&C::j) == 5, "");
+} // namespace ex2
+#endif
+
+namespace ex3 {
+class Base {
+public:
+  int func() const;
+};
+
+class Derived : public Base {};
+
+template <class T> class Templ { // #cwg203-ex3-Templ
+public:
+  template <class S> Templ(S (T::*ptmf)() const); // #cwg203-ex3-Templ-ctor
+};
+
+void foo() { Templ<Derived> x(&Derived::func); }
+// expected-error@-1 {{no matching constructor for initialization of 'Templ<Derived>'}}
+//   expected-note@#cwg203-ex3-Templ {{candidate constructor (the implicit copy constructor) not viable: no known conversion from 'int (cwg203::ex3::Base::*)() const' to 'const Templ<Derived>' for 1st argument}}
+//   since-cxx11-note@#cwg203-ex3-Templ {{candidate constructor (the implicit move constructor) not viable: no known conversion from 'int (cwg203::ex3::Base::*)() const' to 'Templ<Derived>' for 1st argument}}
+//   expected-note@#cwg203-ex3-Templ-ctor {{candidate template ignored: could not match 'cwg203::ex3::Derived' against 'cwg203::ex3::Base'}}
+} // namespace ex3
+
+namespace ex4 {
+struct Very_base {
+  int a;
+};
+struct Base1 : Very_base {};
+struct Base2 : Very_base {};
+struct Derived : Base1, Base2 {
+};
+
+int f() {
+  Derived d;
+  // FIXME: in the diagnostic below, Very_base is fully qualified, but Derived is not
+  int Derived::*a_ptr = &Derived::Base1::a;
+  /* expected-error@-1
+  {{ambiguous conversion from pointer to member of base class 'cwg203::ex4::Very_base' to pointer to member of derived class 'Derived':
+    struct cwg203::ex4::Derived -> Base1 -> Very_base
+    struct cwg203::ex4::Derived -> Base2 -> Very_base}}*/
+}
+} // namespace ex4
+
+namespace ex5 {
+struct Base {
+  int a;
+};
+struct Derived : Base {
+  int b;
+};
+
+template <typename Class, typename Member_type, Member_type Base::*ptr>
+Member_type get(Class &c) {
+  return c.*ptr;
+}
+
+void call(int (*f)(Derived &)); // #cwg203-ex5-call
+
+int f() {
+  // ill-formed, contrary to Core issue filing:
+  // `&Derived::b` yields `int Derived::*`, which can't initialize NTTP of type `int Base::*`,
+  // because (implicit) pointer-to-member conversion doesn't upcast.
+  call(&get<Derived, int, &Derived::b>);
+  // expected-error@-1 {{no matching function for call to 'call'}}
+  //   expected-note@#cwg203-ex5-call {{candidate function not viable: no overload of 'get' matching 'int (*)(Derived &)' for 1st argument}}
+
+  // well-formed, contrary to Core issue filing:
+  // `&Derived::a` yields `int Base::*`,
+  // which can initialize NTTP of type `int Base::*`.
+  call(&get<Derived, int, &Derived::a>);
+
+  call(&get<Base, int, &Derived::a>);
+  // expected-error@-1 {{no matching function for call to 'call'}}
+  //   expected-note@#cwg203-ex5-call {{candidate function not viable: no overload of 'get' matching 'int (*)(Derived &)' for 1st argument}}
+}
+} // namespace ex5
+
+namespace ex6 {
+struct Base {
+  int a;
+};
+struct Derived : private Base { // #cwg203-ex6-Derived
+public:
+  using Base::a; // make `a` accessible
+};
+
+int f() {
+  Derived d;
+  int b = d.a;
+  // FIXME: in the diagnostic below, Base is fully qualified, but Derived is not
+  int Derived::*ptr = &Derived::a;
+  // expected-error@-1 {{cannot cast private base class 'cwg203::ex6::Base' to 'Derived'}}
+  //   expected-note@#cwg203-ex6-Derived {{declared private here}}
 }
+} // namespace ex6
+} // namespace cwg203
 
 // cwg204: sup 820
 
-namespace cwg206 { // cwg206: yes
+namespace cwg206 { // cwg206: 2.7
   struct S; // #cwg206-S
   template<typename T> struct Q { S s; };
   // expected-error@-1 {{field has incomplete type 'S'}}
@@ -51,9 +187,9 @@ namespace cwg206 { // cwg206: yes
   template<typename T> void f() { S s; }
   // expected-error@-1 {{variable has incomplete type 'S'}}
   //   expected-note@#cwg206-S {{forward declaration of 'cwg206::S'}}
-}
+} // namespace cwg206
 
-namespace cwg207 { // cwg207: yes
+namespace cwg207 { // cwg207: 2.7
   class A {
   protected:
     static void f() {}
@@ -66,7 +202,7 @@ namespace cwg207 { // cwg207: yes
       f();
     }
   };
-}
+} // namespace cwg207
 
 // cwg208 FIXME: write codegen test
 
@@ -79,11 +215,11 @@ namespace cwg209 { // cwg209: 3.2
     // expected-error@-1 {{friend function 'f' is a private member of 'cwg209::A'}}
     //   expected-note@#cwg209-A-f {{implicitly declared private here}}
   };
-}
+} // namespace cwg209
 
 // cwg210 is in cwg210.cpp
 
-namespace cwg211 { // cwg211: yes
+namespace cwg211 { // cwg211: 2.7
   struct A {
     A() try {
       throw 0;
@@ -92,9 +228,9 @@ namespace cwg211 { // cwg211: yes
       // expected-error@-1 {{return in the catch of a function try block of a constructor is illegal}}
     }
   };
-}
+} // namespace cwg211
 
-namespace cwg213 { // cwg213: yes
+namespace cwg213 { // cwg213: 2.7
   template <class T> struct A : T {
     void h(T t) {
       char &r1 = f(t);
@@ -111,9 +247,9 @@ namespace cwg213 { // cwg213: yes
   char &f(B);
 
   template void A<B>::h(B); // #cwg213-instantiation
-}
+} // namespace cwg213
 
-namespace cwg214 { // cwg214: yes
+namespace cwg214 { // cwg214: 2.7
   template<typename T, typename U> T checked_cast(U from) { U::error; }
   template<typename T, typename U> T checked_cast(U *from);
   class C {};
@@ -124,7 +260,7 @@ namespace cwg214 { // cwg214: yes
   void g() {
     f<int>(1);
   }
-}
+} // namespace cwg214
 
 namespace cwg215 { // cwg215: 2.9
   template<typename T> class X {
@@ -134,7 +270,7 @@ namespace cwg215 { // cwg215: 2.9
   struct Y {
     void foo() { (void)+X<Y>().n; }
   };
-}
+} // namespace cwg215
 
 namespace cwg216 { // cwg216: no
   // FIXME: Should reject this: 'f' has linkage but its type does not,
@@ -150,17 +286,17 @@ namespace cwg216 { // cwg216: no
     void f(E);
   };
   void g(S s, S::E e) { s.f(e); }
-}
+} // namespace cwg216
 
-namespace cwg217 { // cwg217: yes
+namespace cwg217 { // cwg217: 2.7
   template<typename T> struct S {
     void f(int);
   };
   template<typename T> void S<T>::f(int = 0) {}
   // expected-error@-1 {{default arguments cannot be added to an out-of-line definition of a member of a class template}}
-}
+} // namespace cwg217
 
-namespace cwg218 { // cwg218: yes
+namespace cwg218 { // cwg218: 2.7
                   // NB: also dup 405
   namespace A {
     struct S {};
@@ -222,7 +358,7 @@ namespace cwg218 { // cwg218: yes
     template<typename A, typename B> struct C {};
   }
   void testG(G::C<G::X::A, G::Y::B> gc) { f(gc); }
-}
+} // namespace cwg218
 
 // cwg219: na
 // cwg220: na
@@ -259,7 +395,7 @@ namespace cwg221 { // cwg221: 3.6
     //   expected-note@#cwg221-S {{candidate function (the implicit copy assignment operator) not viable: no known conversion from 'float' to 'const A' for 1st argument}}
     a += f;
   }
-}
+} // namespace cwg221
 
 namespace cwg222 { // cwg222: dup 637
   void f(int a, int b, int c, int *x) {
@@ -279,7 +415,7 @@ namespace cwg222 { // cwg222: dup 637
     a = (b = ++a);
 #pragma clang diagnostic pop
   }
-}
+} // namespace cwg222
 
 // cwg223: na
 
@@ -351,7 +487,7 @@ namespace cwg224 { // cwg224: 16
       return i;
     }
   }
-}
+} // namespace cwg224
 
 // cwg225: yes
 template<typename T> void cwg225_f(T t) { cwg225_g(t); }
@@ -409,30 +545,32 @@ namespace cwg226 { // cwg226: no
   template<typename, typename X, typename=void, typename Y> int foo(X, Y);
   // cxx98-error@-1 {{default template arguments for a function template are a C++11 extension}}
   int x = foo(0, 0);
-}
+} // namespace cwg226
 
-void cwg227(bool b) { // cwg227: yes
+namespace cwg227 { // cwg227: 2.7
+void f(bool b) {
   if (b)
     int n;
   else
     int n;
 }
+} // namespace cwg227
 
-namespace cwg228 { // cwg228: yes
+namespace cwg228 { // cwg228: 2.7
   template <class T> struct X {
     void f();
   };
   template <class T> struct Y {
     void g(X<T> x) { x.template X<T>::f(); }
   };
-}
+} // namespace cwg228
 
 namespace cwg229 { // cwg229: 2.9
   template<typename T> void f();
   template<typename T> void f<T*>() {}
   // expected-error@-1 {{function template partial specialization is not allowed}}
   template<> void f<int>() {}
-}
+} // namespace cwg229
 
 namespace cwg230 { // cwg230: 3.0
   struct S {
@@ -441,9 +579,9 @@ namespace cwg230 { // cwg230: 3.0
     //   expected-note@#cwg230-f {{'f' declared here}}
     virtual void f() = 0; // #cwg230-f
   };
-}
+} // namespace cwg230
 
-namespace cwg231 { // cwg231: yes
+namespace cwg231 { // cwg231: 2.7
   namespace outer {
     namespace inner {
       int i; // #cwg231-i
@@ -453,7 +591,7 @@ namespace cwg231 { // cwg231: yes
     // expected-error@-1 {{use of undeclared identifier 'i'; did you mean 'inner::i'?}}
     //   expected-note@#cwg231-i {{'inner::i' declared here}}
   }
-}
+} // namespace cwg231
 
 // cwg234: na
 // cwg235: na
@@ -462,15 +600,15 @@ namespace cwg236 { // cwg236: 3.2
   void *p = int();
   // cxx98-warning@-1 {{expression which evaluates to zero treated as a null pointer constant of type 'void *'}}
   // since-cxx11-error@-2 {{cannot initialize a variable of type 'void *' with an rvalue of type 'int'}}
-}
+} // namespace cwg236
 
 namespace cwg237 { // cwg237: dup 470
   template<typename T> struct A { void f() { T::error; } };
   template<typename T> struct B : A<T> {};
   template struct B<int>; // ok
-}
+} // namespace cwg237
 
-namespace cwg239 { // cwg239: yes
+namespace cwg239 { // cwg239: 2.7
   namespace NS {
     class T {};
     void f(T);
@@ -484,11 +622,11 @@ namespace cwg239 { // cwg239: yes
     extern int &g(NS::T, float);
     int &s = g(parm, 1);
   }
-}
+} // namespace cwg239
 
 // cwg240: dup 616
 
-namespace cwg241 { // cwg241: yes
+namespace cwg241 { // cwg241: 9
   namespace A {
     struct B {};
     template <int X> void f(); // #cwg241-A-f
@@ -523,9 +661,9 @@ namespace cwg241 { // cwg241: yes
     //   expected-note@#cwg241-A-f {{candidate function template not viable: requires 0 arguments, but 1 was provided}}
     g<3>(b);
   }
-}
+} // namespace cwg241
 
-namespace cwg243 { // cwg243: yes
+namespace cwg243 { // cwg243: 2.8
   struct B;
   struct A {
     A(B); // #cwg243-A
@@ -539,7 +677,7 @@ namespace cwg243 { // cwg243: yes
   // expected-error@-1 {{conversion from 'struct B' to 'A' is ambiguous}}
   //   expected-note@#cwg243-A {{candidate constructor}}
   //   expected-note@#cwg243-B {{candidate function has been explicitly deleted}}
-}
+} // namespace cwg243
 
 namespace cwg244 { // cwg244: 11
                   // NB: this test is reused by cwg399
@@ -633,16 +771,16 @@ namespace cwg244 { // cwg244: 11
     }
     template void g(N::S<int>::Inner *);
   }
-}
+} // namespace cwg244
 
-namespace cwg245 { // cwg245: yes
+namespace cwg245 { // cwg245: 2.8
   struct S {
     enum E {}; // #cwg245-E
     class E *p;
     // expected-error@-1 {{use of 'E' with tag type that does not match previous declaration}}
     //   expected-note@#cwg245-E {{previous use is here}}
   };
-}
+} // namespace cwg245
 
 namespace cwg246 { // cwg246: 3.2
   struct S {
@@ -655,9 +793,9 @@ X: ;
       //   expected-note@#cwg246-try {{jump bypasses initialization of try block}}
     }
   };
-}
+} // namespace cwg246
 
-namespace cwg247 { // cwg247: yes
+namespace cwg247 { // cwg247: 2.7
   struct A {};
   struct B : A {
     void f();
@@ -681,18 +819,18 @@ namespace cwg247 { // cwg247: yes
     void f(int);
   };
   void (F::*i)() = &F::f;
-}
+} // namespace cwg247
 
 namespace cwg248 { // cwg248: sup P1949
   int \u040d\u040e = 0;
-}
+} // namespace cwg248
 
-namespace cwg249 { // cwg249: yes
+namespace cwg249 { // cwg249: 2.7
   template<typename T> struct X { void f(); };
   template<typename T> void X<T>::f() {}
-}
+} // namespace cwg249
 
-namespace cwg250 { // cwg250: yes
+namespace cwg250 { // cwg250: 2.7
   typedef void (*FPtr)(double x[]);
 
   template<int I> void f(double x[]);
@@ -701,7 +839,7 @@ namespace cwg250 { // cwg250: yes
   template<int I = 3> void g(double x[]);
   // cxx98-error@-1 {{default template arguments for a function template are a C++11 extension}}
   FPtr gp = &g<>;
-}
+} // namespace cwg250
 
 namespace cwg252 { // cwg252: 3.1
   struct A {
@@ -752,7 +890,7 @@ namespace cwg252 { // cwg252: 3.1
     virtual ~G();
   };
   G::~G() {}
-}
+} // namespace cwg252
 
 namespace cwg254 { // cwg254: 2.9
   template<typename T> struct A {
@@ -767,12 +905,12 @@ namespace cwg254 { // cwg254: 2.9
   struct C { typedef struct {} type; }; // #cwg254-C
   A<B>::type n;
   A<C>::type n; // #cwg254-instantiation
-}
+} // namespace cwg254
 
-namespace cwg255 { // cwg255: yes
+namespace cwg255 { // cwg255: 2.7
 struct S {
-  void operator delete(void *){};
-  void operator delete(void *, int){};
+  void operator delete(void *){}
+  void operator delete(void *, int){}
 };
 void f(S *p) { delete p; }
 } // namespace cwg255
@@ -794,7 +932,7 @@ namespace cwg257 { // cwg257: 3.4
     //   expected-note@#cwg257-A {{'cwg257::A' declared here}}
     void f();
   };
-}
+} // namespace cwg257
 
 namespace cwg258 { // cwg258: 2.8
   struct A {
@@ -832,7 +970,7 @@ namespace cwg258 { // cwg258: 2.8
   } f;
   // expected-error@-1 {{variable type 'struct F' is an abstract class}}
   //   expected-note@#cwg258-E-f {{unimplemented pure virtual method 'f' in 'F'}}
-}
+} // namespace cwg258
 
 namespace cwg259 { // cwg259: 4
   template<typename T> struct A {};
@@ -867,7 +1005,7 @@ namespace cwg259 { // cwg259: 4
   template struct B<float>;
   // expected-warning@-1 {{explicit instantiation of 'B<float>' that occurs after an explicit specialization has no effect}}
   //   expected-note@#cwg259-B-float {{previous template specialization is here}}
-}
+} // namespace cwg259
 
 // FIXME: When cwg260 is resolved, also add tests for CWG507.
 
@@ -904,14 +1042,14 @@ namespace cwg261 { // cwg261: no
   // expected-warning@-1 {{'operator delete' was marked unused but was used}}
 
 #pragma clang diagnostic pop
-}
+} // namespace cwg261
 
-namespace cwg262 { // cwg262: yes
+namespace cwg262 { // cwg262: 2.7
   int f(int = 0, ...);
   int k = f();
   int l = f(0);
   int m = f(0, 0);
-}
+} // namespace cwg262
 
 namespace cwg263 { // cwg263: 3.3
   struct X {};
@@ -931,14 +1069,14 @@ namespace cwg263 { // cwg263: 3.3
     Y::~Y();
     // expected-error@-1 {{extra qualification on member '~Y'}}
   };
-}
+} // namespace cwg263
 
 // cwg265: dup 353
 // cwg266: na
 // cwg269: na
 // cwg270: na
 
-namespace cwg272 { // cwg272: yes
+namespace cwg272 { // cwg272: 2.7
   struct X {
     void f() {
       this->~X();
@@ -947,23 +1085,9 @@ namespace cwg272 { // cwg272: yes
       // expected-error@-1 {{invalid argument type 'X' to unary expression}}
     }
   };
-}
-
-#include <stdarg.h>
-#include <stddef.h>
-namespace cwg273 { // cwg273: yes
-  struct A {
-    int n;
-  };
-  void operator&(A);
-  void f(A a, ...) {
-    offsetof(A, n);
-    va_list val;
-    va_start(val, a);
-    va_end(val);
-  }
-}
+} // namespace cwg272
 
+// cwg273 is in cwg273.cpp
 // cwg274: na
 
 namespace cwg275 { // cwg275: no
@@ -1022,7 +1146,7 @@ namespace cwg275 { // cwg275: no
   // expected-error@-1 {{partial ordering for explicit instantiation of 'f' is ambiguous}}
   //   expected-note@#cwg275-f {{explicit instantiation candidate function 'cwg275::f<short>' template here [with T = short]}}
   //   expected-note@#cwg275-N-f {{explicit instantiation candidate function 'cwg275::N::f<short>' template here [with T = short]}}
-}
+} // namespace cwg275
 
 // cwg276: na
 
@@ -1030,7 +1154,7 @@ namespace cwg277 { // cwg277: 3.1
   typedef int *intp;
   int *p = intp();
   static_assert(__enable_constant_folding(!intp()), "");
-}
+} // namespace cwg277
 
 // cwg279 is in cwg279.cpp
 
@@ -1085,7 +1209,7 @@ namespace cwg280 { // cwg280: 2.9
     //   expected-note@#cwg280-B-f3 {{conversion candidate of type 'void (*)(int, int, int)'}}
     //   expected-note@#cwg280-C-f3 {{conversion candidate of type 'void (*)(int, int, int)'}}
   }
-}
+} // namespace cwg280
 
 namespace cwg281 { // cwg281: no
   void a();
@@ -1102,9 +1226,9 @@ namespace cwg281 { // cwg281: no
     friend inline void e() {}
     friend inline void f() {}
   };
-}
+} // namespace cwg281
 
-namespace cwg283 { // cwg283: yes
+namespace cwg283 { // cwg283: 2.7
   template<typename T> // #cwg283-template
   struct S {
     friend class T;
@@ -1114,7 +1238,7 @@ namespace cwg283 { // cwg283: yes
     // expected-error@-1 {{declaration of 'T' shadows template parameter}}
     //   expected-note@#cwg283-template {{template parameter is declared here}}
   };
-}
+} // namespace cwg283
 
 namespace cwg284 { // cwg284: no
   namespace A {
@@ -1152,16 +1276,16 @@ namespace cwg284 { // cwg284: no
   struct D::X {}; // FIXME: ill-formed
   enum D::Y e2; // ok per cwg417
   class D::Z z2; // ok per cwg417
-}
+} // namespace cwg284
 
-namespace cwg285 { // cwg285: yes
+namespace cwg285 { // cwg285: 2.7
   template<typename T> void f(T, int); // #cwg285-f-T-int
   template<typename T> void f(int, T); // #cwg285-f-int-T
   template<> void f<int>(int, int) {}
   // expected-error@-1 {{function template specialization 'f' ambiguously refers to more than one function template; explicitly specify additional template arguments to identify a particular function template}}
   //   expected-note@#cwg285-f-int-T {{function template 'cwg285::f<int>' matches specialization [with T = int]}}
   //   expected-note@#cwg285-f-T-int {{function template 'cwg285::f<int>' matches specialization [with T = int]}}
-}
+} // namespace cwg285
 
 namespace cwg286 { // cwg286: 2.8
   template<class T> struct A {
@@ -1177,11 +1301,11 @@ namespace cwg286 { // cwg286: 2.8
   A<short>::C::B<int*> absip;
   // expected-error@-1 {{'B' is a private member of 'cwg286::A<short>::C'}}
   //   expected-note@#cwg286-B {{implicitly declared private here}}
-}
+} // namespace cwg286
 
 // cwg288: na
 
-namespace cwg289 { // cwg289: yes
+namespace cwg289 { // cwg289: 2.7
   struct A; // #cwg289-A
   struct B : A {};
   // expected-error@-1 {{base class has incomplete type}}
@@ -1191,7 +1315,7 @@ namespace cwg289 { // cwg289: yes
   // expected-error@-1 {{type 'int' cannot be used prior to '::' because it has no members}}
   //   expected-note@#cwg289-C-int {{in instantiation of template class 'cwg289::C<int>' requested here}}
   struct D : C<int> {}; // #cwg289-C-int
-}
+} // namespace cwg289
 
 // cwg290: na
 // cwg291: dup 391
@@ -1223,7 +1347,7 @@ namespace cwg294 { // cwg294: no
     // since-cxx17-error@-1 {{ISO C++17 does not allow dynamic exception specifications}}
     //   since-cxx17-note@-2 {{use 'noexcept(false)' instead}}
   }
-}
+} // namespace cwg294
 
 namespace cwg295 { // cwg295: 3.7
   typedef int f();
@@ -1242,14 +1366,14 @@ namespace cwg295 { // cwg295: 3.7
   typedef int (*V)();
   typedef volatile U *V;
   // expected-warning@-1 {{'volatile' qualifier on function type 'U' (aka 'int ()') has no effect}}
-}
+} // namespace cwg295
 
-namespace cwg296 { // cwg296: yes
+namespace cwg296 { // cwg296: 2.7
   struct A {
     static operator int() { return 0; }
     // expected-error@-1 {{conversion function must be a non-static member function}}
   };
-}
+} // namespace cwg296
 
 namespace cwg298 { // cwg298: 3.1
   struct A {
@@ -1288,7 +1412,7 @@ namespace cwg298 { // cwg298: 3.1
   };
   typedef const F G;
   G::~F() {} // ok
-}
+} // namespace cwg298
 
 namespace cwg299 { // cwg299: 2.8 c++11
   struct S {
@@ -1308,4 +1432,4 @@ namespace cwg299 { // cwg299: 2.8 c++11
   // since-cxx14-error-re@#cwg299-q {{{{conversion from 'T' to 'unsigned (long long|long|int)' is ambiguous}}}}
   //  since-cxx14-note@#cwg299-int {{candidate function}}
   //  since-cxx14-note@#cwg299-ushort {{candidate function}}
-}
+} // namespace cwg299
diff --git a/clang/test/CXX/drs/cwg3xx.cpp b/clang/test/CXX/drs/cwg3xx.cpp
index 26b0f97..b5e07a6 100644
--- a/clang/test/CXX/drs/cwg3xx.cpp
+++ b/clang/test/CXX/drs/cwg3xx.cpp
@@ -1,9 +1,10 @@
-// RUN: %clang_cc1 -std=c++23 -verify=expected,cxx20-23,cxx23,since-cxx11,since-cxx17,since-cxx23 -triple %itanium_abi_triple %s -fexceptions -fcxx-exceptions -pedantic-errors
-// RUN: %clang_cc1 -std=c++20 -verify=expected,cxx98-20,cxx20-23,since-cxx11,since-cxx17 -triple %itanium_abi_triple %s -fexceptions -fcxx-exceptions -pedantic-errors
-// RUN: %clang_cc1 -std=c++17 -verify=expected,cxx98-17,cxx98-20,since-cxx11,since-cxx17 -triple %itanium_abi_triple %s -fexceptions -fcxx-exceptions -pedantic-errors
-// RUN: %clang_cc1 -std=c++14 -verify=expected,cxx98-14,cxx98-17,cxx98-20,cxx11-14,since-cxx11 -triple %itanium_abi_triple %s -fexceptions -fcxx-exceptions -pedantic-errors
-// RUN: %clang_cc1 -std=c++11 -verify=expected,cxx98-14,cxx98-17,cxx98-20,cxx11-14,since-cxx11 -triple %itanium_abi_triple %s -fexceptions -fcxx-exceptions -pedantic-errors
 // RUN: %clang_cc1 -std=c++98 -verify=expected,cxx98-14,cxx98-17,cxx98-20,cxx98 -triple %itanium_abi_triple %s -fexceptions -fcxx-exceptions -pedantic-errors
+// RUN: %clang_cc1 -std=c++11 -verify=expected,cxx98-14,cxx98-17,cxx98-20,cxx11-14,since-cxx11 -triple %itanium_abi_triple %s -fexceptions -fcxx-exceptions -pedantic-errors
+// RUN: %clang_cc1 -std=c++14 -verify=expected,cxx98-14,cxx98-17,cxx98-20,cxx11-14,since-cxx11 -triple %itanium_abi_triple %s -fexceptions -fcxx-exceptions -pedantic-errors
+// RUN: %clang_cc1 -std=c++17 -verify=expected,cxx98-17,cxx98-20,since-cxx11,since-cxx17 -triple %itanium_abi_triple %s -fexceptions -fcxx-exceptions -pedantic-errors
+// RUN: %clang_cc1 -std=c++20 -verify=expected,cxx98-20,cxx20-23,since-cxx11,since-cxx17 -triple %itanium_abi_triple %s -fexceptions -fcxx-exceptions -pedantic-errors
+// RUN: %clang_cc1 -std=c++23 -verify=expected,cxx20-23,cxx23,since-cxx11,since-cxx17,since-cxx23 -triple %itanium_abi_triple %s -fexceptions -fcxx-exceptions -pedantic-errors
+// RUN: %clang_cc1 -std=c++2c -verify=expected,cxx20-23,cxx23,since-cxx11,since-cxx17,since-cxx23 -triple %itanium_abi_triple %s -fexceptions -fcxx-exceptions -pedantic-errors
 
 #if __cplusplus == 199711L
 #define static_assert(...) __extension__ _Static_assert(__VA_ARGS__)
@@ -16,11 +17,11 @@
 #define __enable_constant_folding
 #endif
 
-namespace cwg300 { // cwg300: yes
+namespace cwg300 { // cwg300: 2.7
   template<typename R, typename A> void f(R (&)(A)) {}
   int g(int);
   void h() { f(g); }
-}
+} // namespace cwg300
 
 namespace cwg301 { // cwg301: 3.5
   // see also cwg38
@@ -75,7 +76,7 @@ namespace cwg301 { // cwg301: 3.5
   // expected-error@-1 {{expected identifier}}
   // expected-error@-2 {{declaration of anonymous class must be a definition}}
   // expected-error@-3 {{declaration does not declare anything}}
-}
+} // namespace cwg301
 
 namespace cwg302 { // cwg302: 3.0
   struct A { A(); ~A(); };
@@ -108,7 +109,7 @@ namespace cwg302 { // cwg302: 3.0
     const int n = 0;
   } d = D();
 #endif
-}
+} // namespace cwg302
 
 // cwg303: na
 
@@ -123,7 +124,7 @@ namespace cwg304 { // cwg304: 2.9
   int m = S().b; // #cwg304-m
   // since-cxx11-error@-1 {{call to implicitly-deleted default constructor of 'S'}}
   //   since-cxx11-note@#cwg304-S {{default constructor of 'S' is implicitly deleted because field 'b' of reference type 'int &' would not be initialized}}
-}
+} // namespace cwg304
 
 namespace cwg305 { // cwg305: no
   struct A {
@@ -169,9 +170,9 @@ namespace cwg305 { // cwg305: no
   };
   void k(Z *z) {
     z->~T1<int>();
-    // expected-error@-1 {{no member named 'T1' in 'cwg305::Z'}}
+    // since-cxx11-error@-1 {{no member named 'T1' in 'cwg305::Z'}}
     z->~T2<int>();
-    // expected-error@-1 {{no member named '~int' in 'cwg305::Z'}}
+    // since-cxx11-error@-1 {{no member named '~int' in 'cwg305::Z'}}
     z->~T2<Z>();
   }
 
@@ -181,9 +182,9 @@ namespace cwg305 { // cwg305: no
   }
   template<typename A> using R = Q::R<int>;
   void qr(Q::R<int> x) { x.~R<char>(); }
-  // expected-error@-1 {{no member named '~R' in 'cwg305::Q::R<int>'}}
+  // since-cxx11-error@-1 {{no member named '~R' in 'cwg305::Q::R<int>'}}
 #endif
-}
+} // namespace cwg305
 
 namespace cwg306 { // cwg306: dup 39
   struct A { struct B {}; };
@@ -199,7 +200,7 @@ namespace cwg306 { // cwg306: dup 39
   // expected-error@-1 {{member 'X' found in multiple base classes of different types}}
   //   expected-note@#cwg306-X {{member type 'cwg306::X' found}}
   //   expected-note@#cwg306-typedef-X {{member type 'const cwg306::X' found}}
-}
+} // namespace cwg306
 
 // cwg307: na
 
@@ -229,7 +230,7 @@ namespace cwg308 { // cwg308: 3.7
       // get here instead
     }
   }
-}
+} // namespace cwg308
 
 // cwg309: dup 485
 
@@ -247,7 +248,7 @@ namespace cwg311 { // cwg311: 3.0
   // expected-warning@-2 {{extra qualification on member 'X'}}
   // expected-error@-3 {{a type specifier is required for all declarations}}
   // expected-error@-4 {{expected ';' after top level declarator}}
-}
+} // namespace cwg311
 
 // cwg312: dup 616
 
@@ -256,7 +257,7 @@ namespace cwg313 { // cwg313: dup 299 c++11
   // FIXME: should this be available in c++98 mode?
   int *p = new int[A()];
   // cxx98-error@-1 {{implicit conversion from array size expression of type 'A' to integral type 'int' is a C++11 extension}}
-}
+} // namespace cwg313
 
 namespace cwg314 { // cwg314: no
                   // NB: dup 1710
@@ -292,12 +293,12 @@ namespace cwg317 { // cwg317: 3.5
   inline int h();
   // expected-error@-1 {{inline declaration of 'h' follows non-inline definition}}
   //   expected-note@#cwg317-h {{previous definition is here}}
-}
+} // namespace cwg317
 
 namespace cwg318 { // cwg318: sup 1310
   struct A {};
   struct A::A a;
-}
+} // namespace cwg318
 
 namespace cwg319 { // cwg319: no
   // FIXME: dup cwg389
@@ -334,9 +335,9 @@ namespace cwg319 { // cwg319: no
     extern C c; // ok
     X<C> xc;
   }
-}
+} // namespace cwg319
 
-namespace cwg320 { // cwg320: yes
+namespace cwg320 { // cwg320: 3.1
 #if __cplusplus >= 201103L
   struct X {
     constexpr X() {}
@@ -347,7 +348,7 @@ namespace cwg320 { // cwg320: yes
   constexpr unsigned g(X x) { return x.copies; }
   static_assert(f(X()).copies == g(X()) + 1, "expected one extra copy for return value");
 #endif
-}
+} // namespace cwg320
 
 namespace cwg321 { // cwg321: dup 557
   namespace N {
@@ -367,7 +368,7 @@ namespace cwg321 { // cwg321: dup 557
   }
   N::I<int> i, j;
   bool x = i == j;
-}
+} // namespace cwg321
 
 namespace cwg322 { // cwg322: 2.8
   struct A {
@@ -375,7 +376,7 @@ namespace cwg322 { // cwg322: 2.8
   } a;
   int &r = static_cast<int&>(a);
   int &s = a;
-}
+} // namespace cwg322
 
 // cwg323: sup 820
 
@@ -403,12 +404,12 @@ namespace cwg324 { // cwg324: 3.6
   // expected-error@-1 {{address of bit-field requested}}
   int *i = &++s.n;
   // expected-error@-1 {{address of bit-field requested}}
-}
+} // namespace cwg324
 
 namespace cwg326 { // cwg326: 3.1
   struct S {};
   static_assert(__is_trivially_constructible(S, const S&), "");
-}
+} // namespace cwg326
 
 namespace cwg327 { // cwg327: dup 538
   struct A;
@@ -416,9 +417,9 @@ namespace cwg327 { // cwg327: dup 538
 
   class B;
   struct B {};
-}
+} // namespace cwg327
 
-namespace cwg328 { // cwg328: yes
+namespace cwg328 { // cwg328: 2.7
   struct A; // #cwg328-A
   struct B { A a; };
   // expected-error@-1 {{field has incomplete type 'A'}}
@@ -429,7 +430,7 @@ namespace cwg328 { // cwg328: yes
   A *p = new A[0];
   // expected-error@-1 {{allocation of incomplete type 'A'}}
   //   expected-note@#cwg328-A {{forward declaration of 'cwg328::A'}}
-}
+} // namespace cwg328
 
 namespace cwg329 { // cwg329: 3.5
   struct B {};
@@ -449,7 +450,7 @@ namespace cwg329 { // cwg329: 3.5
   void test() {
     h(a); // #cwg329-h-call
   }
-}
+} // namespace cwg329
 
 namespace cwg330 { // cwg330: 7
   // Conversions between P and Q will be allowed by P0388.
@@ -543,7 +544,7 @@ namespace cwg330 { // cwg330: 7
       (void) reinterpret_cast<B4*>(a);
     }
   }
-}
+} // namespace cwg330
 
 namespace cwg331 { // cwg331: 11
   struct A {
@@ -555,7 +556,7 @@ namespace cwg331 { // cwg331: 11
   const A b(a);
   // expected-error@-1 {{no matching constructor for initialization of 'const A'}}
   //   expected-note@#cwg331-A-ctor {{candidate constructor not viable: 1st argument ('const A') would lose const qualifier}}
-}
+} // namespace cwg331
 
 namespace cwg332 { // cwg332: dup 577
   void f(volatile void);
@@ -566,16 +567,16 @@ namespace cwg332 { // cwg332: dup 577
   void h(int n, volatile void);
   // expected-error@-1 {{'void' must be the first and only parameter if specified}}
   // cxx20-23-warning@-2 {{volatile-qualified parameter type 'volatile void' is deprecated}}
-}
+} // namespace cwg332
 
-namespace cwg333 { // cwg333: yes
+namespace cwg333 { // cwg333: 2.7
   int n = 0;
   int f(int(n));
   int g((int(n)));
   int h = f(g);
-}
+} // namespace cwg333
 
-namespace cwg334 { // cwg334: yes
+namespace cwg334 { // cwg334: 2.7
   template<typename T> void f() {
     T x;
     f((x, 123));
@@ -585,11 +586,11 @@ namespace cwg334 { // cwg334: yes
     friend void f(S);
   };
   template void f<S>();
-}
+} // namespace cwg334
 
 // cwg335: sup 820
 
-namespace cwg336 { // cwg336: yes
+namespace cwg336 { // cwg336: 2.7
   namespace Pre {
     template<class T1> class A {
       template<class T2> class B {
@@ -619,9 +620,9 @@ namespace cwg336 { // cwg336: yes
     template<class Y> template<> void A<Y>::B<double>::mf2() {}
     // expected-error@-1 {{nested name specifier 'A<Y>::B<double>::' for declaration does not refer into a class, class template or class template partial specialization}}
   }
-}
+} // namespace cwg336
 
-namespace cwg337 { // cwg337: yes
+namespace cwg337 { // cwg337: 2.7
   template<typename T> void f(T (*)[1]);
   template<typename T> int &f(...);
 
@@ -635,7 +636,7 @@ namespace cwg337 { // cwg337: yes
   int &s = f<B>(0);
   // expected-error@-1 {{non-const lvalue reference to type 'int' cannot bind to a temporary of type 'void'}}
   struct B { virtual ~B() = 0; };
-}
+} // namespace cwg337
 
 // cwg338: dup 1884
 
@@ -670,14 +671,14 @@ namespace cwg339 { // cwg339: 2.8
   static_assert(conv_int<char>::value, "");
   bool b = conv_int2<char>(A<1>());
   A<1> c = make_A<char>();
-}
+} // namespace cwg339
 
-namespace cwg340 { // cwg340: yes
+namespace cwg340 { // cwg340: 2.7
   struct A { A(int); };
   struct B { B(A, A, int); };
   int x, y;
   B b(A(x), A(y), 3);
-}
+} // namespace cwg340
 
 namespace cwg341 { // cwg341: sup 1708
   namespace A {
@@ -711,7 +712,7 @@ namespace cwg341 {
   namespace B { extern "C" void cwg341_e(); }
   // expected-error@-1 {{redefinition of 'cwg341_e' as different kind of symbol}}
   //   expected-note@#cwg341_e {{previous definition is here}}
-}
+} // namespace cwg341
 
 // cwg342: na
 
@@ -726,14 +727,14 @@ namespace cwg343 { // cwg343: no
     C() : A<T>::B<T>() {}
     // expected-error@-1 {{use 'template' keyword to treat 'B' as a dependent template name}}
   };
-}
+} // namespace cwg343
 
 namespace cwg344 { // cwg344: dup 1435
   struct A { inline virtual ~A(); };
   struct B { friend A::~A(); };
-}
+} // namespace cwg344
 
-namespace cwg345 { // cwg345: yes
+namespace cwg345 { // cwg345: 2.7
   struct A {
     struct X {};
     int X; // #cwg345-int-X
@@ -749,11 +750,11 @@ namespace cwg345 { // cwg345: yes
     f(b);
     f(a); // #cwg345-f-a
   }
-}
+} // namespace cwg345
 
 // cwg346: na
 
-namespace cwg347 { // cwg347: yes
+namespace cwg347 { // cwg347: 2.7
   struct base {
     struct nested;
     static int n;
@@ -773,7 +774,7 @@ namespace cwg347 { // cwg347: yes
   void derived::g() {}
   // expected-error@-1 {{out-of-line definition of 'g' does not match any declaration in 'cwg347::derived'}}
   //   expected-note@#cwg347-derived {{defined here}}
-}
+} // namespace cwg347
 
 // cwg348: na
 
@@ -802,7 +803,7 @@ namespace cwg349 { // cwg349: no
   // FIXME: This is invalid.
   B b;
   const int *const *const *p2 = b;
-}
+} // namespace cwg349
 
 // cwg351: na
 
@@ -944,11 +945,11 @@ namespace cwg352 { // cwg352: 2.8
       f(a1, a2);
     }
   }
-}
+} // namespace cwg352
 
 // cwg353 needs an IRGen test.
 
-namespace cwg354 { // cwg354: yes c++11
+namespace cwg354 { // cwg354: 3.1 c++11
   // FIXME: Should we allow this in C++98 too?
   struct S {};
 
@@ -1004,16 +1005,18 @@ namespace cwg354 { // cwg354: yes c++11
   // cxx11-14-error@#cwg354-m3 {{null non-type template argument of type 'int *' does not match template parameter of type 'int S::*'}}
   //   cxx11-14-note@#cwg354-ptr_mem {{template parameter is declared here}}
   // since-cxx17-error@#cwg354-m3 {{value of type 'int *' is not implicitly convertible to 'int S::*'}}
-}
+} // namespace cwg354
 
-struct cwg355_S; // cwg355: yes
+struct cwg355_S; // cwg355: 2.7
 struct ::cwg355_S {};
 // expected-warning@-1 {{extra qualification on member 'cwg355_S'}}
-namespace cwg355 { struct ::cwg355_S s; }
+namespace cwg355 {
+struct ::cwg355_S s;
+} // namespace cwg355
 
 // cwg356: na
 
-namespace cwg357 { // cwg357: yes
+namespace cwg357 { // cwg357: 2.7
   template<typename T> struct A { // #cwg357-A
     void f() const; // #cwg357-f
   };
@@ -1028,17 +1031,17 @@ namespace cwg357 { // cwg357: yes
   template<typename T> void B::f() const {}
   // expected-error@-1 {{out-of-line definition of 'f' does not match any declaration in 'cwg357::B'}}
   //   expected-note@#cwg357-B {{defined here}}
-}
+} // namespace cwg357
 
-namespace cwg358 { // cwg358: yes
+namespace cwg358 { // cwg358: 2.7
   extern "C" void cwg358_f();
   namespace N {
     int var;
     extern "C" void cwg358_f() { var = 10; }
   }
-}
+} // namespace cwg358
 
-namespace cwg359 { // cwg359: yes
+namespace cwg359 { // cwg359: 3.3
   // Note, the example in the DR is wrong; it doesn't contain an anonymous
   // union.
   struct E {
@@ -1065,9 +1068,9 @@ namespace cwg359 { // cwg359: yes
       };
     };
   };
-}
+} // namespace cwg359
 
-namespace cwg360 { // cwg360: yes
+namespace cwg360 { // cwg360: 2.8
 struct A {
   int foo();
   int bar();
@@ -1099,7 +1102,7 @@ int main() {
 // cwg362: na
 // cwg363: na
 
-namespace cwg364 { // cwg364: yes
+namespace cwg364 { // cwg364: 2.7
   struct S {
     static void f(int);
     void f(char);
@@ -1110,13 +1113,14 @@ namespace cwg364 { // cwg364: yes
     // expected-error@-1 {{call to non-static member function without an object argument}}
     S::f(0);
   }
-}
+} // namespace cwg364
 
-// cwg366: yes
+namespace cwg366 { // cwg366: 2.7
 #if "foo" // expected-error {{invalid token at start of a preprocessor expression}} 
 #endif
+} // namespace cwg366
 
-namespace cwg367 { // cwg367: yes
+namespace cwg367 { // cwg367: 2.7
   static_assert(__enable_constant_folding(true ? throw 0 : 4), "");
   // expected-error@-1 {{expression is not an integral constant expression}}
   static_assert(__enable_constant_folding(true ? 4 : throw 0), "");
@@ -1124,8 +1128,7 @@ namespace cwg367 { // cwg367: yes
   // expected-error@-1 {{expression is not an integral constant expression}}
   //   expected-note@-2 {{read of uninitialized object is not allowed in a constant expression}}
   static_assert(__enable_constant_folding(true ? 4 : *new int), "");
-
-}
+} // namespace cwg367
 
 namespace cwg368 { // cwg368: 3.6
   template<typename T, T> struct S {}; // #cwg368-S
@@ -1142,7 +1145,7 @@ namespace cwg368 { // cwg368: 3.6
   // cxx20-23-error@#cwg368-g-call {{call to 'g' is ambiguous}}
   //   cxx20-23-note@#cwg368-g {{candidate function [with T = cwg368::X]}}
   //   cxx20-23-note@#cwg368-g-2 {{candidate function [with T = cwg368::X]}}
-}
+} // namespace cwg368
 
 // cwg370: na
 
@@ -1248,7 +1251,7 @@ namespace cwg372 { // cwg372: no
     // expected-error@-1 {{'B' is a protected member of 'cwg372::badwolf::A'}}
     //   expected-note@#cwg372-B {{declared protected here}}
   }
-}
+} // namespace cwg372
 
 namespace cwg373 { // cwg373: 5
   namespace X { int cwg373; }
@@ -1270,7 +1273,7 @@ namespace cwg373 { // cwg373: 5
   using namespace A::B;
   // expected-error@-1 {{expected namespace name}}
   //   expected-note@#cwg373-A {{'A' declared here}}
-}
+} // namespace cwg373
 
 namespace cwg374 { // cwg374: 7
                   // NB 2.9 c++11
@@ -1281,12 +1284,12 @@ namespace cwg374 { // cwg374: 7
   template<> void N::f<char>() {}
   template<> void N::A<char>::f() {}
   template<> struct N::A<int> {};
-}
+} // namespace cwg374
 
 // cwg375: dup 345
 // cwg376: na
 
-namespace cwg377 { // cwg377: yes
+namespace cwg377 { // cwg377: 2.7
   enum E {
   // expected-error@-1 {{enumeration values exceed range of largest integer}}
     a = -__LONG_LONG_MAX__ - 1,
@@ -1295,12 +1298,12 @@ namespace cwg377 { // cwg377: yes
     // cxx98-error@-1 {{'long long' is a C++11 extension}}
     // cxx98-error@-2 {{'long long' is a C++11 extension}}
   };
-}
+} // namespace cwg377
 
 // cwg378: dup 276
 // cwg379: na
 
-namespace cwg381 { // cwg381: yes
+namespace cwg381 { // cwg381: 2.7
   struct A {
     int a;
   };
@@ -1318,9 +1321,9 @@ namespace cwg381 { // cwg381: yes
     F f;
     f.A::a = 1;
   }
-}
+} // namespace cwg381
 
-namespace cwg382 { // cwg382: yes c++11
+namespace cwg382 { // cwg382: 2.7 c++11
   // FIXME: Should we allow this in C++98 mode?
   struct A { typedef int T; };
   typename A::T t;
@@ -1329,17 +1332,17 @@ namespace cwg382 { // cwg382: yes c++11
   // cxx98-error@-1 {{'typename' occurs outside of a template}}
   typename A b;
   // expected-error@-1 {{expected a qualified name after 'typename'}}
-}
+} // namespace cwg382
 
-namespace cwg383 { // cwg383: yes
+namespace cwg383 { // cwg383: 2.7
   struct A { A &operator=(const A&); };
   struct B { ~B(); };
   union C { C &operator=(const C&); };
   union D { ~D(); };
   static_assert(!__is_pod(A) && !__is_pod(B) && !__is_pod(C) && !__is_pod(D), "");
-}
+} // namespace cwg383
 
-namespace cwg384 { // cwg384: yes
+namespace cwg384 { // cwg384: 2.7
   namespace N1 {
     template<typename T> struct Base {};
     template<typename T> struct X {
@@ -1359,7 +1362,7 @@ namespace cwg384 { // cwg384: yes
     N1::X<N2::Z> v;
     v.f(0);
   }
-}
+} // namespace cwg384
 
 namespace cwg385 { // cwg385: 2.8
   struct A { protected: void f(); };
@@ -1374,7 +1377,7 @@ namespace cwg385 { // cwg385: 2.8
   // expected-error@-1 {{'n' is a protected member of 'cwg385::D'}}
   //   expected-note@#cwg385-E {{constrained by protected inheritance here}}
   //   expected-note@#cwg385-n {{member is declared here}}
-}
+} // namespace cwg385
 
 namespace cwg386 { // cwg386: no
 namespace example1 {
@@ -1482,7 +1485,7 @@ namespace cwg387 { // cwg387: 2.8
       // expected-error@-1 {{use of undeclared identifier 'gcd'}}
     }
   }
-}
+} // namespace cwg387
 
 // FIXME: cwg388 needs libc++abi test
 
@@ -1606,7 +1609,7 @@ namespace cwg389 { // cwg389: no
     // FIXME: This is ill-formed.
     extern WithoutLinkage1 withoutLinkageLocal;
   }
-}
+} // namespace cwg389
 
 namespace cwg390 { // cwg390: 3.3
   template<typename T>
@@ -1625,7 +1628,7 @@ namespace cwg390 { // cwg390: 3.3
   struct B : A<int> { // #cwg390-A-int
     void f() {}
   } b;
-}
+} // namespace cwg390
 
 namespace cwg391 { // cwg391: 2.8 c++11
   // FIXME: Should this apply to C++98 too?
@@ -1648,7 +1651,7 @@ namespace cwg391 { // cwg391: 2.8 c++11
   };
   C<int> fc();
   const C<int> &c = fc();
-}
+} // namespace cwg391
 
 // cwg392 is in cwg392.cpp
 
@@ -1718,9 +1721,9 @@ namespace cwg395 { // cwg395: 3.0
     template<class T, class U> operator ptr_mem_fun_t<T, U>() const { return 0; };
   } null2;
   int (S::*q)() = null2;
-}
+} // namespace cwg395
 
-namespace cwg396 { // cwg396: yes
+namespace cwg396 { // cwg396: 3.0
   void f() {
     auto int a();
     // since-cxx11-error@-1 {{'auto' storage class specifier is not permitted in C++11, and will not be supported in future releases}}
@@ -1731,11 +1734,11 @@ namespace cwg396 { // cwg396: yes
     // expected-error@-2 {{redefinition of 'i'}}
     //   expected-note@#cwg396-i {{previous definition is here}}
   }
-}
+} // namespace cwg396
 
 // cwg397: sup 1823
 
-namespace cwg398 { // cwg398: yes
+namespace cwg398 { // cwg398: 2.7
   namespace example1 {
     struct S {
       static int const I = 42;
@@ -1778,7 +1781,7 @@ namespace cwg398 { // cwg398: yes
       //   expected-note@#cwg398-h {{candidate template ignored: substitution failure [with T = D]: 'TT' following the 'template' keyword does not refer to a template}}
     }
   }
-}
+} // namespace cwg398
 
 namespace cwg399 { // cwg399: 11
                   // NB: reuse cwg244 test
@@ -1872,4 +1875,4 @@ namespace cwg399 { // cwg399: 11
     }
     template void g(N::S<int>::Inner *);
   }
-}
+} // namespace cwg399
diff --git a/clang/test/CXX/drs/cwg492.cpp b/clang/test/CXX/drs/cwg492.cpp
index 7fc46b0..caa27cf 100644
--- a/clang/test/CXX/drs/cwg492.cpp
+++ b/clang/test/CXX/drs/cwg492.cpp
@@ -16,7 +16,7 @@ namespace std {
 struct type_info {
   const char* name() const NOTHROW;
 }; 
-}
+} // namespace std
 
 namespace cwg492 { // cwg492: 2.7
 
diff --git a/clang/test/CXX/drs/cwg4xx.cpp b/clang/test/CXX/drs/cwg4xx.cpp
index 8250658..bcaf7db 100644
--- a/clang/test/CXX/drs/cwg4xx.cpp
+++ b/clang/test/CXX/drs/cwg4xx.cpp
@@ -16,7 +16,7 @@ __extension__ typedef __SIZE_TYPE__ size_t;
 
 namespace std { struct type_info; }
 
-namespace cwg400 { // cwg400: yes
+namespace cwg400 { // cwg400: 2.7
   struct A { int a; struct a {}; }; // #cwg400-A
   struct B { int a; struct a {}; }; // #cwg400-B
   struct C : A, B { using A::a; struct a b; };
@@ -36,7 +36,7 @@ namespace cwg400 { // cwg400: yes
   // expected-error@-1 {{member 'a' found in multiple base classes of different types}}
   //   expected-note@#cwg400-A {{member type 'cwg400::A::a' found by ambiguous name lookup}}
   //   expected-note@#cwg400-B {{member type 'cwg400::B::a' found by ambiguous name lookup}}
-}
+} // namespace cwg400
 
 namespace cwg401 { // cwg401: 2.8
   template<class T, class U = typename T::type> class A : public T {}; // #cwg401-A
@@ -87,9 +87,9 @@ namespace cwg401 { // cwg401: 2.8
   void g(B b) { f(b); } // #cwg402-f-b
   // since-cxx11-error@-1 {{no matching function for call to 'f'}}
   //   since-cxx11-note@#cwg402-f {{candidate template ignored: substitution failure [with T = B, U = typename B::type]: 'type' is a protected member of 'cwg401::B'}}
-}
+} // namespace cwg401
 
-namespace cwg403 { // cwg403: yes
+namespace cwg403 { // cwg403: 2.7
   namespace A {
     struct S {};
     int f(void*);
@@ -101,13 +101,13 @@ namespace cwg403 { // cwg403: yes
                 // referring to an elaborated-type-specifier naming a
                 // injected-class-name, which is about as far from a
                 // template-id as we can make it.
-}
+} // namespace cwg403
 
 // cwg404: na
 // (NB: also sup 594)
 
-namespace cwg405 { // cwg405: yes
-                  // NB: also dup 218
+namespace cwg405 { // cwg405: 2.7
+                   // NB: also dup 218
   namespace A {
     struct S {};
     void f(S);
@@ -148,7 +148,7 @@ namespace cwg405 { // cwg405: yes
   }
   void testE(E::S es) { f(es); }
   // expected-error@-1 {{use of undeclared identifier 'f'}}
-}
+} // namespace cwg405
 
 namespace cwg406 { // cwg406: 2.9
   typedef struct {
@@ -159,7 +159,7 @@ namespace cwg406 { // cwg406: 2.9
     static int n;
     // expected-error@-1 {{static data member 'n' not allowed in anonymous union}}
   } B;
-}
+} // namespace cwg406
 
 namespace cwg407 { // cwg407: 3.8
                   // NB: reused by cwg1894 and cwg2199
@@ -219,7 +219,7 @@ namespace cwg407 { // cwg407: 3.8
       struct S s;
     }
   }
-}
+} // namespace cwg407
 
 namespace cwg408 { // cwg408: 3.4
   template<int N> void g() { static_assert(N != 1, ""); }
@@ -248,9 +248,9 @@ namespace cwg408 { // cwg408: 3.4
   }
   template<> int R<int>::arr[2];
   template void R<int>::f();
-}
+} // namespace cwg408
 
-namespace cwg409 { // cwg409: yes
+namespace cwg409 { // cwg409: 2.7
   template<typename T> struct A {
     typedef int B;
     B b1;
@@ -259,7 +259,7 @@ namespace cwg409 { // cwg409: yes
     A<T*>::B b4;
     // cxx98-17-error@-1 {{missing 'typename' prior to dependent type name A<T *>::B; implicit 'typename' is a C++20 extension}}
   };
-}
+} // namespace cwg409
 
 namespace cwg410 { // cwg410: no
   template<class T> void f(T);
@@ -286,11 +286,11 @@ namespace cwg410 { // cwg410: no
   void g(int) { M::A::z(); }
   // expected-error@-1 {{'z' is a private member of 'cwg410::M::A'}}
   //   expected-note@#cwg410-z {{declared private here}}
-}
+} // namespace cwg410
 
 // cwg412 is in cwg412.cpp
 
-namespace cwg413 { // cwg413: yes
+namespace cwg413 { // cwg413: 2.7
   struct S {
     int a;
     int : 17;
@@ -309,7 +309,7 @@ namespace cwg413 { // cwg413: yes
   T t2 = { 1, 2 };
   // expected-error@-1 {{initializer for aggregate with no elements requires explicit braces}}
   //   expected-note@#cwg413-T {{'cwg413::T' declared here}}
-}
+} // namespace cwg413
 
 namespace cwg414 { // cwg414: dup 305
   struct X {};
@@ -318,21 +318,21 @@ namespace cwg414 { // cwg414: dup 305
     struct X {};
     x.~X();
   }
-}
+} // namespace cwg414
 
-namespace cwg415 { // cwg415: yes
+namespace cwg415 { // cwg415: 2.7
   template<typename T> void f(T, ...) { T::error; }
   void f(int, int);
   void g() { f(0, 0); } // ok
-}
+} // namespace cwg415
 
-namespace cwg416 { // cwg416: yes
+namespace cwg416 { // cwg416: 2.7
   extern struct A a;
   int &operator+(const A&, const A&);
   int &k = a + a;
   struct A { float &operator+(A&); };
   float &f = a + a;
-}
+} // namespace cwg416
 
 namespace cwg417 { // cwg417: no
   struct A;
@@ -367,7 +367,7 @@ namespace cwg417 { // cwg417: no
     struct cwg417::H {};
     // expected-error@-1 {{cannot define or redeclare 'H' here because namespace 'M' does not enclose namespace 'cwg417'}}
   }
-}
+} // namespace cwg417
 
 namespace cwg418 { // cwg418: no
 namespace example1 {
@@ -464,17 +464,17 @@ namespace cwg420 { // cwg420: 9
     q->A::template id<int>::~id<int>();
   }
 #endif
-}
+} // namespace cwg420
 
-namespace cwg421 { // cwg421: yes
+namespace cwg421 { // cwg421: 2.7
   struct X { X(); int n; int &r; };
   int *p = &X().n;
   // cxx98-error@-1 {{taking the address of a temporary object of type 'int'}}
   // since-cxx11-error@-2 {{cannot take the address of an rvalue of type 'int'}}
   int *q = &X().r;
-}
+} // namespace cwg421
 
-namespace cwg422 { // cwg422: yes
+namespace cwg422 { // cwg422: 2.7
   template<typename T, typename U> void f() {
     typedef T type; // #cwg422-typedef-T
     typedef U type;
@@ -484,14 +484,14 @@ namespace cwg422 { // cwg422: yes
   }
   template void f<int, int>();
   template void f<int, char>(); // #cwg422-f-int-char
-}
+} // namespace cwg422
 
-namespace cwg423 { // cwg423: yes
+namespace cwg423 { // cwg423: 2.7
   template<typename T> struct X { operator T&(); };
   void f(X<int> x) { x += 1; }
-}
+} // namespace cwg423
 
-namespace cwg424 { // cwg424: yes
+namespace cwg424 { // cwg424: 2.7
   struct A {
     typedef int N; // #cwg424-N
     typedef int N;
@@ -519,13 +519,13 @@ namespace cwg424 { // cwg424: yes
     // expected-error@-1 {{redefinition of 'M'}}
     //   expected-note@#cwg424-M {{previous definition is here}}
   };
-}
+} // namespace cwg424
 
-namespace cwg425 { // cwg425: yes
+namespace cwg425 { // cwg425: 2.7
   struct A { template<typename T> operator T() const; } a;
   float f = 1.0f * a;
   // expected-error@-1 {{use of overloaded operator '*' is ambiguous (with operand types 'float' and 'struct A')}}
-  //   expected-note@-2 +{{built-in candidate}}
+  //   expected-note@-2 +{{built-in candidate operator*}}
 
   template<typename T> struct is_float;
   template<> struct is_float<float> { typedef void type; };
@@ -535,9 +535,9 @@ namespace cwg425 { // cwg425: yes
     // cxx98-error@-1 {{default template arguments for a function template are a C++11 extension}}
   } b;
   float g = 1.0f * b; // ok
-}
+} // namespace cwg425
 
-namespace cwg427 { // cwg427: yes
+namespace cwg427 { // cwg427: 2.7
   struct B {};
   struct D : public B {
     D(B &) = delete; // #cwg427-D
@@ -551,33 +551,33 @@ namespace cwg427 { // cwg427: yes
   const D &d4(b);
   // expected-error@-1 {{conversion function from 'B' to 'const D' invokes a deleted function}}
   //   expected-note@#cwg427-D {{'D' has been explicitly marked deleted here}}
-}
+} // namespace cwg427
 
-namespace cwg428 { // cwg428: yes
+namespace cwg428 { // cwg428: 2.7
   template<typename T> T make();
   extern struct X x; // #cwg428-X
   void f() {
     throw void();
-    // expected-error@-1 {{cannot throw}}
+    // expected-error@-1 {{cannot throw object of incomplete type 'void'}}
     throw make<void*>();
     throw make<const volatile void*>();
     throw x;
-    // expected-error@-1 {{cannot throw}}
+    // expected-error@-1 {{cannot throw object of incomplete type 'struct X'}}
     //   expected-note@#cwg428-X {{forward declaration of 'cwg428::X'}}
     throw make<X&>();
-    // expected-error@-1 {{cannot throw}}
+    // expected-error@-1 {{cannot throw object of incomplete type 'cwg428::X'}}
     //   expected-note@#cwg428-X {{forward declaration of 'cwg428::X'}}
     throw make<X*>();
-    // expected-error@-1 {{cannot throw}}
+    // expected-error@-1 {{cannot throw pointer to object of incomplete type 'cwg428::X'}}
     //   expected-note@#cwg428-X {{forward declaration of 'cwg428::X'}}
     throw make<const volatile X&>();
-    // expected-error@-1 {{cannot throw}}
+    // expected-error@-1 {{cannot throw object of incomplete type 'cwg428::X'}}
     //   expected-note@#cwg428-X {{forward declaration of 'cwg428::X'}}
     throw make<const volatile X*>();
-    // expected-error@-1 {{cannot throw}}
+    // expected-error@-1 {{cannot throw pointer to object of incomplete type 'const volatile cwg428::X'}}
     //   expected-note@#cwg428-X {{forward declaration of 'cwg428::X'}}
   }
-}
+} // namespace cwg428
 
 namespace cwg429 { // cwg429: 2.8 c++11
   // FIXME: This rule is obviously intended to apply to C++98 as well.
@@ -592,18 +592,18 @@ namespace cwg429 { // cwg429: 2.8 c++11
     static void operator delete(void*);
     static void operator delete(void*, size_t);
   } *b = new (0) B; // ok, second delete is not a non-placement deallocation function
-}
+} // namespace cwg429
 
-namespace cwg430 { // cwg430: yes c++11
+namespace cwg430 { // cwg430: 2.7 c++11
   // resolved by n2239
   // FIXME: This should apply in C++98 too.
   void f(int n) {
     int a[] = { n++, n++, n++ };
     // cxx98-warning@-1 {{multiple unsequenced modifications to 'n'}}
   }
-}
+} // namespace cwg430
 
-namespace cwg431 { // cwg431: yes
+namespace cwg431 { // cwg431: 2.8
   struct A {
     template<typename T> T *get();
     template<typename T> struct B {
@@ -633,7 +633,7 @@ namespace cwg431 { // cwg431: yes
     // expected-error@-1 {{use 'template' keyword to treat 'get' as a dependent template name}}
     c->template get<int>();
   }
-}
+} // namespace cwg431
 
 namespace cwg432 { // cwg432: 3.0
   template<typename T> struct A {};
@@ -646,9 +646,9 @@ namespace cwg432 { // cwg432: 3.0
   // since-cxx11-error@-1 {{use of class template 'D' requires template arguments}}
   //   since-cxx11-note@-2 {{template is declared here}}
 #endif
-}
+} // namespace cwg432
 
-namespace cwg433 { // cwg433: yes
+namespace cwg433 { // cwg433: 2.7
   template<class T> struct S {
     void f(union U*);
   };
@@ -656,7 +656,7 @@ namespace cwg433 { // cwg433: yes
   template<class T> void S<T>::f(union U*) {}
 
   S<int> s;
-}
+} // namespace cwg433
 
 namespace cwg434 { // cwg434: sup 2352
   void f() {
@@ -674,16 +674,16 @@ namespace cwg434 { // cwg434: sup 2352
   const int * const &rcpci = pi;
   static_assert(&rcpci == &pi, "");
 #endif
-}
+} // namespace cwg434
 
 // cwg435: na
 
-namespace cwg436 { // cwg436: yes
+namespace cwg436 { // cwg436: 2.7
   enum E { f }; // #cwg436-f
   void f();
   // expected-error@-1 {{redefinition of 'f' as different kind of symbol}}
   //   expected-note@#cwg436-f {{previous definition is here}}
-}
+} // namespace cwg436
 
 namespace cwg437 { // cwg437: sup 1308
   // This is superseded by 1308, which is in turn superseded by 1330,
@@ -702,7 +702,7 @@ namespace cwg437 { // cwg437: sup 1308
     //   since-cxx17-note@-2 {{use 'noexcept(false)' instead}}
     struct U {};
   };
-}
+} // namespace cwg437
 
 // cwg438 is in cwg438.cpp
 // cwg439 is in cwg439.cpp
@@ -710,7 +710,7 @@ namespace cwg437 { // cwg437: sup 1308
 // cwg442: sup 348
 // cwg443: na
 
-namespace cwg444 { // cwg444: yes
+namespace cwg444 { // cwg444: 2.7
   struct D;
   struct B { // #cwg444-B
     D &operator=(D &) = delete; // #cwg444-deleted
@@ -728,7 +728,7 @@ namespace cwg444 { // cwg444: yes
     //   since-cxx11-note@#cwg444-B {{candidate function (the implicit move assignment operator) not viable: expects an rvalue for 1st argument}}
     //   since-cxx11-note@#cwg444-D {{candidate function (the implicit move assignment operator) not viable: expects an rvalue for 1st argument}}
   }
-}
+} // namespace cwg444
 
 namespace cwg445 { // cwg445: 3.2
   class A { void f(); }; // #cwg445-f
@@ -737,7 +737,7 @@ namespace cwg445 { // cwg445: 3.2
     // expected-error@-1 {{friend function 'f' is a private member of 'cwg445::A'}}
     //   expected-note@#cwg445-f {{implicitly declared private here}}
   };
-}
+} // namespace cwg445
 
 namespace cwg446 { // cwg446: 2.8
   struct C;
@@ -771,9 +771,9 @@ namespace cwg446 { // cwg446: 2.8
     // cxx98-14-error@-1 {{call to deleted constructor of 'A'}}
     //   cxx98-14-note@#cwg446-deleted {{'A' has been explicitly marked deleted here}}
   }
-}
+} // namespace cwg446
 
-namespace cwg447 { // cwg447: yes
+namespace cwg447 { // cwg447: 2.8
   struct A { int n; int a[4]; };
   template<int> struct U {
     typedef int type;
@@ -799,7 +799,7 @@ namespace cwg447 { // cwg447: yes
     // expected-error@-1 {{expected ';' after expression}}
     // expected-error@-2 {{use of undeclared identifier 'd'}}
   }
-}
+} // namespace cwg447
 
 namespace cwg448 { // cwg448: 2.8
   template<typename T = int> void f(int); // #cwg448-f-int
@@ -817,11 +817,11 @@ namespace cwg448 { // cwg448: 2.8
   namespace HideFromADL { struct X {}; }
   template void g(int); // ok
   template void g(HideFromADL::X); // #cwg448-g
-}
+} // namespace cwg448
 
 // cwg449: na
 
-namespace cwg450 { // cwg450: yes
+namespace cwg450 { // cwg450: 3.2
   typedef int A[3];
   void f1(const A &);
   void f2(A &); // #cwg450-f2
@@ -836,13 +836,13 @@ namespace cwg450 { // cwg450: yes
   void h() {
     f1(A{});
     f2(A{});
-    // expected-error@-1 {{no matching function for call to 'f2'}}}
-    //   expected-note@#cwg450-f2 {{candidate function not viable: expects an lvalue for 1st argument}}
+    // since-cxx11-error@-1 {{no matching function for call to 'f2'}}}
+    //   since-cxx11-note@#cwg450-f2 {{candidate function not viable: expects an lvalue for 1st argument}}
   }
 #endif
-}
+} // namespace cwg450
 
-namespace cwg451 { // cwg451: yes
+namespace cwg451 { // cwg451: 2.7
   const int a = 1 / 0;
   // expected-warning@-1 {{division by zero is undefined}}
   const int b = 1 / 0; // #cwg451-b
@@ -851,20 +851,20 @@ namespace cwg451 { // cwg451: yes
   // expected-error@-1 {{expression is not an integral constant expression}}
   //   expected-note@-2 {{initializer of 'b' is not a constant expression}}
   //   expected-note@#cwg451-b {{declared here}}
-}
+} // namespace cwg451
 
-namespace cwg452 { // cwg452: yes
+namespace cwg452 { // cwg452: 2.7
   struct A {
     int a, b, c;
     A *p;
     int f();
     A() : a(f()), b(this->f() + a), c(this->a), p(this) {}
   };
-}
+} // namespace cwg452
 
 // cwg454 FIXME write a codegen test
 
-namespace cwg456 { // cwg456: yes
+namespace cwg456 { // cwg456: 3.4
   // sup 903 c++11
   const int null = 0;
   void *p = null;
@@ -875,9 +875,9 @@ namespace cwg456 { // cwg456: yes
   void *q = f;
   // cxx98-warning@-1 {{initialization of pointer of type 'void *' to null from a constant boolean}}
   // since-cxx11-error@-2 {{cannot initialize a variable of type 'void *' with an lvalue of type 'const bool'}}
-}
+} // namespace cwg456
 
-namespace cwg457 { // cwg457: yes
+namespace cwg457 { // cwg457: 2.7
   const int a = 1;
   const volatile int b = 1;
   static_assert(a, "");
@@ -891,7 +891,7 @@ namespace cwg457 { // cwg457: yes
     // expected-error@-1 {{expression is not an integral constant expression}}
     //   expected-note@-2 {{read of volatile-qualified type 'const volatile int' is not allowed in a constant expression}}
   };
-}
+} // namespace cwg457
 
 namespace cwg458 { // cwg458: 11
   struct A {
@@ -932,9 +932,9 @@ namespace cwg458 { // cwg458: 11
     // expected-error@-1 {{'T' does not refer to a value}}
     //   expected-note@#cwg458-h-T {{declared here}}
   }
-}
+} // namespace cwg458
 
-namespace cwg460 { // cwg460: yes
+namespace cwg460 { // cwg460: 2.7
   namespace X { namespace Q { int n; } }
   namespace Y {
     using X;
@@ -946,7 +946,7 @@ namespace cwg460 { // cwg460: yes
     // expected-error@-1 {{using declaration cannot refer to a namespace}}
     // expected-note@-2 {{did you mean 'using namespace'?}}
   }
-}
+} // namespace cwg460
 
 // cwg461: na
 // cwg462 is in cwg462.cpp
@@ -983,9 +983,9 @@ void g(int a, CI b, VI c) {
   c.CI::~CI();
   c.VI::~VI();
 }
-}
+} // namespace cwg466
 
-namespace cwg467 { // cwg467: yes
+namespace cwg467 { // cwg467: 2.7
   int stuff();
 
   int f() {
@@ -1005,9 +1005,9 @@ namespace cwg467 { // cwg467: yes
   later:
     return k;
   }
-}
+} // namespace cwg467
 
-namespace cwg468 { // cwg468: yes c++11
+namespace cwg468 { // cwg468: 2.7 c++11
   // FIXME: Should we allow this in C++98 too?
   template<typename> struct A {
     template<typename> struct B {
@@ -1017,7 +1017,7 @@ namespace cwg468 { // cwg468: yes c++11
   int k = cwg468::template A<int>::template B<char>::C;
   // cxx98-error@-1 {{'template' keyword outside of a template}}
   // cxx98-error@-2 {{'template' keyword outside of a template}}
-}
+} // namespace cwg468
 
 namespace cwg469 { // cwg469: no
   template<typename T> struct X; // #cwg469-X
@@ -1025,9 +1025,9 @@ namespace cwg469 { // cwg469: no
   X<int&> x;
   // expected-error@-1 {{implicit instantiation of undefined template 'cwg469::X<int &>'}}
   //   expected-note@#cwg469-X {{template is declared here}}
-}
+} // namespace cwg469
 
-namespace cwg470 { // cwg470: yes
+namespace cwg470 { // cwg470: 2.7
   template<typename T> struct A {
     struct B {};
   };
@@ -1042,7 +1042,7 @@ namespace cwg470 { // cwg470: yes
   // ok, instantiating C<char> doesn't instantiate base class members.
   template struct A<char>;
   template struct C<char>;
-}
+} // namespace cwg470
 
 namespace cwg471 { // cwg471: 2.8
   struct A { int n; };
@@ -1060,7 +1060,7 @@ namespace cwg471 { // cwg471: 2.8
   struct H : B, G { int f() { return n; } };
   // expected-error@-1 {{'n' is a private member of 'cwg471::G'}}
   //   expected-note@#cwg471-G-using {{declared private here}}
-}
+} // namespace cwg471
 
 namespace cwg472 { // cwg472: no drafting 2011-04
 struct B {
@@ -1077,7 +1077,7 @@ struct D : public I {
     bp->i = 5;
   }
 };
-}
+} // namespace cwg472
 
 namespace cwg474 { // cwg474: 3.4
   namespace N {
@@ -1094,7 +1094,7 @@ namespace cwg474 { // cwg474: 3.4
     // expected-error@-1 {{functions that differ only in their return type cannot be overloaded}}
     //   expected-note@#cwg474-g {{previous declaration is here}}
   }
-}
+} // namespace cwg474
 
 // cwg475 FIXME write a libc++abi test
 
@@ -1113,15 +1113,15 @@ namespace cwg477 { // cwg477: 3.5
   // expected-error@-1 {{can only be specified inside the class definition}}
   virtual void A::f() {}
   // expected-error@-1 {{can only be specified inside the class definition}}
-}
+} // namespace cwg477
 
-namespace cwg478 { // cwg478: yes
+namespace cwg478 { // cwg478: 2.7
   struct A { virtual void f() = 0; }; // #cwg478-f
   void f(A *a);
   void f(A a[10]);
   // expected-error@-1 {{array of abstract class type 'A'}}
   //   expected-note@#cwg478-f {{unimplemented pure virtual method 'f' in 'A'}}
-}
+} // namespace cwg478
 
 namespace cwg479 { // cwg479: 2.8
   struct S {
@@ -1162,9 +1162,9 @@ namespace cwg479 { // cwg479: 2.8
       //   expected-note@#cwg479-S-dtor {{declared private here}}
     }
   }
-}
+} // namespace cwg479
 
-namespace cwg480 { // cwg480: yes
+namespace cwg480 { // cwg480: 2.7
   struct A { int n; };
   struct B : A {};
   struct C : virtual B {};
@@ -1187,7 +1187,7 @@ namespace cwg480 { // cwg480: yes
   A &j = i;
   D &k = static_cast<D&>(j);
   // expected-error@-1 {{cannot cast 'A' to 'D &' via virtual base 'cwg480::B'}}
-}
+} // namespace cwg480
 
 namespace cwg481 { // cwg481: 2.8
   template<class T, T U> class A { T *x; };
@@ -1232,7 +1232,7 @@ namespace cwg481 { // cwg481: 2.8
   template<N X, typename N, template<N Y> class T> struct I;
   template<char*> struct J;
   I<123, char*, J> *j;
-}
+} // namespace cwg481
 
 namespace cwg482 { // cwg482: 3.5
   extern int a;
@@ -1274,12 +1274,12 @@ namespace cwg482 { // cwg482: 3.5
 #if __cplusplus >= 201103L
     enum class C;
     enum class A::C {};
-    // expected-error@-1 {{extra qualification on member 'C'}}
+    // since-cxx11-error@-1 {{extra qualification on member 'C'}}
 #endif
   };
-}
+} // namespace cwg482
 
-namespace cwg483 { // cwg483: yes
+namespace cwg483 { // cwg483: 2.7
   namespace climits {
     static_assert(__SCHAR_MAX__ >= 127, "");
     static_assert(__SHRT_MAX__ >= 32767, "");
@@ -1294,9 +1294,9 @@ namespace cwg483 { // cwg483: yes
     static_assert(__WCHAR_WIDTH__ >= 8, "");
     static_assert(__WINT_WIDTH__ >= 16, "");
   }
-}
+} // namespace cwg483
 
-namespace cwg484 { // cwg484: yes
+namespace cwg484 { // cwg484: 2.8
   struct A {
     A();
     void f();
@@ -1332,9 +1332,9 @@ namespace cwg484 { // cwg484: yes
     S();
     // expected-error@-1 {{a type specifier is required for all declarations}}
   } S;
-}
+} // namespace cwg484
 
-namespace cwg485 { // cwg485: yes
+namespace cwg485 { // cwg485: 2.7
   namespace N {
     struct S {};
     int operator+(S, S);
@@ -1345,9 +1345,9 @@ namespace cwg485 { // cwg485: yes
   N::S s;
   int a = operator+(s, s);
   int b = f<int>(s);
-}
+} // namespace cwg485
 
-namespace cwg486 { // cwg486: yes
+namespace cwg486 { // cwg486: 2.7
   template<typename T> T f(T *); // #cwg486-f
   int &f(...);
 
@@ -1364,18 +1364,18 @@ namespace cwg486 { // cwg486: yes
     // expected-error@-1 {{no matching function for call to 'f'}}
     //   expected-note@#cwg486-f {{candidate template ignored: substitution failure [with T = int[10]]: function cannot return array type 'int[10]'}}
   }
-}
+} // namespace cwg486
 
-namespace cwg487 { // cwg487: yes
+namespace cwg487 { // cwg487: 2.7
   enum E { e };
   int operator+(int, E); // #cwg487-operator-plus
   static_assert(4 + e, "");
   // expected-error@-1 {{expression is not an integral constant expression}}
   //   since-cxx11-note@-2 {{non-constexpr function 'operator+' cannot be used in a constant expression}}
   //   since-cxx11-note@#cwg487-operator-plus {{declared here}}
-}
+} // namespace cwg487
 
-namespace cwg488 { // cwg488: yes c++11
+namespace cwg488 { // cwg488: 2.9 c++11
   template <typename T> void f(T);
   void f(int);
   void g() {
@@ -1387,7 +1387,7 @@ namespace cwg488 { // cwg488: yes c++11
     f(e);
     // cxx98-error@-1 {{template argument uses local type 'E'}}
   }
-}
+} // namespace cwg488
 
 // cwg489: na
 
@@ -1433,13 +1433,13 @@ namespace cwg490 { // cwg490: 2.8
     // the class of the conversion function first.
     friend A::operator X<T>();
   };
-}
+} // namespace cwg490
 
 namespace cwg491 { // cwg491: dup 413
   struct A {} a, b[3] = { a, {} };
   A c[2] = { a, {}, b[1] };
   // expected-error@-1 {{excess elements in array initializer}}
-}
+} // namespace cwg491
 
 // cwg492 is in cwg492.cpp
 
@@ -1451,7 +1451,7 @@ namespace cwg493 { // cwg493: dup 976
     if (X()) {
     }
   }
-}
+} // namespace cwg493
 
 namespace cwg494 { // cwg494: dup 372
   class A {
@@ -1464,7 +1464,7 @@ namespace cwg494 { // cwg494: dup 372
       A::B y;
     };
   };
-}
+} // namespace cwg494
 
 namespace cwg495 { // cwg495: 3.5
   template<typename T>
@@ -1482,7 +1482,7 @@ namespace cwg495 { // cwg495: 3.5
   };
   S2<int> s2;
   long n2 = s2;
-}
+} // namespace cwg495
 
 namespace cwg496 { // cwg496: sup 2094
   struct A { int n; };
@@ -1494,7 +1494,7 @@ namespace cwg496 { // cwg496: sup 2094
   static_assert(__is_trivially_constructible(B, const B&), "");
   static_assert(__is_trivially_assignable(A, const A&), "");
   static_assert(__is_trivially_assignable(B, const B&), "");
-}
+} // namespace cwg496
 
 namespace cwg497 { // cwg497: sup 253
   void before() {
@@ -1517,9 +1517,9 @@ namespace cwg497 { // cwg497: sup 253
     cs.*pm = 88;
     // expected-error@-1 {{read-only variable is not assignable}}
   }
-}
+} // namespace cwg497
 
-namespace cwg499 { // cwg499: yes
+namespace cwg499 { // cwg499: 2.7
   extern char str[];
   void f() { throw str; }
-}
+} // namespace cwg499
diff --git a/clang/test/CXX/drs/cwg571.cpp b/clang/test/CXX/drs/cwg571.cpp
index 9f0f455f..09d12e1 100644
--- a/clang/test/CXX/drs/cwg571.cpp
+++ b/clang/test/CXX/drs/cwg571.cpp
@@ -12,7 +12,7 @@ namespace cwg571 { // cwg571: 2.7
   const ir r = n;
   // expected-warning@-1 {{'const' qualifier on reference type 'ir' (aka 'int &') has no effect}}
   ir r2 = n;
-}
+} // namespace cwg571
 
 // Entities have external linkage by default.
 
diff --git a/clang/test/CXX/drs/cwgr593.cpp b/clang/test/CXX/drs/cwg593.cpp
index d747f4e..d747f4e 100644
--- a/clang/test/CXX/drs/cwgr593.cpp
+++ b/clang/test/CXX/drs/cwg593.cpp
diff --git a/clang/test/CXX/drs/cwg5xx.cpp b/clang/test/CXX/drs/cwg5xx.cpp
index 9219a4c..1fdfe57 100644
--- a/clang/test/CXX/drs/cwg5xx.cpp
+++ b/clang/test/CXX/drs/cwg5xx.cpp
@@ -17,16 +17,16 @@
 __extension__ typedef __SIZE_TYPE__ size_t;
 void *operator new(size_t); // #cwg5xx-global-operator-new
 // cxx98-error@-1 {{'operator new' is missing exception specification 'throw(std::bad_alloc)'}}
-#if __cplusplus > 201402L
+#if __cplusplus >= 201703L
 namespace std {
   enum class align_val_t : size_t {};
-}
+} // namespace std
 void *operator new(size_t, std::align_val_t); // #cwg5xx-global-operator-new-aligned
 #endif
 
 namespace std {
   struct type_info;
-}
+} // namespace std
 
 namespace cwg500 { // cwg500: dup 372
   class D;
@@ -38,9 +38,9 @@ namespace cwg500 { // cwg500: dup 372
   class A::B {};
   class A::C : public A::B {};
   class D : public A::B {};
-}
+} // namespace cwg500
 
-namespace cwg501 { // cwg501: yes
+namespace cwg501 { // cwg501: 2.7
   struct A {
     friend void f() {}
     void g() {
@@ -48,9 +48,9 @@ namespace cwg501 { // cwg501: yes
       // expected-error@-1 {{use of undeclared identifier 'f'}}
     }
   };
-}
+} // namespace cwg501
 
-namespace cwg502 { // cwg502: yes
+namespace cwg502 { // cwg502: 2.7
   struct Q {};
   template<typename T> struct A {
     enum E { e = 1 };
@@ -63,9 +63,9 @@ namespace cwg502 { // cwg502: yes
   int f(A<int>::E);
   template<int N> int f(Q (&)[N]);
   template struct A<int>;
-}
+} // namespace cwg502
 
-namespace cwg505 { // cwg505: yes
+namespace cwg505 { // cwg505: 2.7
   const char *exts = "\e\(\{\[\%";
   // expected-error@-1 {{use of non-standard escape character '\e'}}
   // expected-error@-2 {{use of non-standard escape character '\('}}
@@ -74,15 +74,15 @@ namespace cwg505 { // cwg505: yes
   // expected-error@-5 {{use of non-standard escape character '\%'}}
   const char *unknown = "\Q";
   // expected-error@-1 {{unknown escape sequence '\Q'}}
-}
+} // namespace cwg505
 
-namespace cwg506 { // cwg506: yes
+namespace cwg506 { // cwg506: 2.7
   struct NonPod { ~NonPod(); };
   void f(...);
   void g(NonPod np) { f(np); }
   // cxx98-error@-1 {{cannot pass object of non-POD type 'NonPod' through variadic function; call will abort at runtime}}
   // since-cxx11-error@-2 {{cannot pass object of non-trivial type 'NonPod' through variadic function; call will abort at runtime}}
-}
+} // namespace cwg506
 
 // FIXME: Add tests here once CWG260 is resolved.
 // cwg507: dup 260
@@ -91,7 +91,7 @@ namespace cwg506 { // cwg506: yes
 // cwg509: na
 // cwg510: na
 
-namespace cwg512 { // cwg512: yes
+namespace cwg512 { // cwg512: 3.0
   struct A { // #cwg512-A
     A(int); // #cwg512-A-ctor
   };
@@ -99,14 +99,14 @@ namespace cwg512 { // cwg512: yes
   // cxx98-error@-1 {{union member 'a' has a non-trivial default constructor}}
   //   cxx98-note@#cwg512-A {{because type 'cwg512::A' has no default constructor}}
   //   cxx98-note@#cwg512-A-ctor {{implicit default constructor suppressed by user-declared constructor}}
-}
+} // namespace cwg512
 
 // cwg513: na
 
-namespace cwg514 { // cwg514: yes
+namespace cwg514 { // cwg514: 2.7
   namespace A { extern int x, y; }
   int A::x = y;
-}
+} // namespace cwg514
 
 namespace cwg515 { // cwg515: sup 1017
   // FIXME: cwg1017 reverses the wording of cwg515, but the current draft has
@@ -121,7 +121,7 @@ namespace cwg515 { // cwg515: sup 1017
   struct A { int a; };
   struct B { void f() { int k = sizeof(A::a); } };
   // cxx98-error@-1 {{invalid use of non-static data member 'a'}}
-}
+} // namespace cwg515
 
 // cwg516: na
 
@@ -144,12 +144,12 @@ namespace cwg517 { // cwg517: no
   // FIXME: These are both ill-formed.
   template<typename T> struct S<T&> {};
   template<typename T> int v<T&> = 0;
-}
+} // namespace cwg517
 
-namespace cwg518 { // cwg518: yes c++11
+namespace cwg518 { // cwg518: 2.7 c++11
   enum E { e, };
   // cxx98-error@-1 {{commas at the end of enumerator lists are a C++11 extension}}
-}
+} // namespace cwg518
 
 // cwg519 is in cwg519.cpp
 // cwg520: na
@@ -158,7 +158,7 @@ namespace cwg518 { // cwg518: yes c++11
 // FIXME: The wording here is broken. It's not reasonable to expect a
 // diagnostic here. Once the relevant DR gets a number, mark this as a dup.
 
-namespace cwg522 { // cwg522: yes
+namespace cwg522 { // cwg522: 2.7
   struct S {};
   template<typename T> void b1(volatile T &);
   template<typename T> void b2(volatile T * const *);
@@ -188,9 +188,9 @@ namespace cwg522 { // cwg522: yes
     b3(d);
     b3(cd);
   }
-}
+} // namespace cwg522
 
-namespace cwg524 { // cwg524: yes
+namespace cwg524 { // cwg524: 2.7
   template<typename T> void f(T a, T b) { operator+(a, b); }
   // expected-error@-1 {{call to function 'operator+' that is neither visible in the template definition nor found by argument-dependent lookup}}
   //   expected-note@#cwg524-f-N-S {{in instantiation of function template specialization 'cwg524::f<cwg524::N::S>' requested here}}
@@ -203,9 +203,9 @@ namespace cwg524 { // cwg524: yes
   namespace N { struct S {}; }
   void operator+(N::S, N::S); // #cwg524-operator-plus
   template void f(N::S, N::S); // #cwg524-f-N-S
-}
+} // namespace cwg524
 
-namespace cwg525 { // cwg525: yes
+namespace cwg525 { // cwg525: 2.7
   namespace before {
     // Note, the example was correct prior to the change; instantiation is
     // required for cases like this:
@@ -222,9 +222,9 @@ namespace cwg525 { // cwg525: yes
       delete ppp; // #cwg525-ppp
     }
   }
-}
+} // namespace cwg525
 
-namespace cwg526 { // cwg526: yes
+namespace cwg526 { // cwg526: 2.7
   template<int> struct S {};
   template<int N> void f1(S<N> s);
   template<int N> void f2(S<(N)> s); // #cwg526-f2
@@ -258,7 +258,7 @@ namespace cwg526 { // cwg526: yes
     X<+N>::type v3;
     // cxx98-17-error@-1 {{missing 'typename' prior to dependent type name X<+N>::type; implicit 'typename' is a C++20 extension}}
   };
-}
+} // namespace cwg526
 
 namespace cwg527 { // cwg527: na
   // This DR is meaningless. It removes a required diagnostic from the case
@@ -273,7 +273,7 @@ namespace cwg527 { // cwg527: na
   namespace { struct F { static P f; }; }
 
   int ax = a.x, bx = b.x, cx = c.x, dx = d.x, ex = E::e->x, fx = F::f->x;
-}
+} // namespace cwg527
 
 namespace cwg528 { // cwg528: 2.7
 
@@ -287,14 +287,14 @@ void f() {
 
 } // namespace cwg528
 
-namespace cwg530 { // cwg530: yes
+namespace cwg530 { // cwg530: 2.7
   template<int*> struct S { enum { N = 1 }; };
   template<void(*)()> struct T { enum { N = 1 }; };
   int n;
   void f();
   int a[S<&n>::N];
   int b[T<&f>::N];
-}
+} // namespace cwg530
 
 namespace cwg531 { // cwg531: partial
   namespace good {
@@ -412,7 +412,7 @@ namespace cwg531 { // cwg531: partial
     // cxx98-17-error@-1 {{template parameter list matching the non-templated nested type 'cwg531::nested::A<int>::B<char>' should be empty ('template<>')}}
 #endif
   }
-}
+} // namespace cwg531
 
 // PR8130
 namespace cwg532 { // cwg532: 3.5
@@ -428,7 +428,7 @@ namespace cwg532 { // cwg532: 3.5
     B<A> b;
     int &ir = b * a;
   }
-}
+} // namespace cwg532
 
 // cwg533: na
 
@@ -437,9 +437,9 @@ namespace cwg534 { // cwg534: 2.9
   template<typename T> void operator+(S, T);
   template<typename T> void operator+<T*>(S, T*) {}
   // expected-error@-1 {{function template partial specialization is not allowed}}
-}
+} // namespace cwg534
 
-namespace cwg535 { // cwg535: yes
+namespace cwg535 { // cwg535: 3.1
   class X { private: X(const X&); };
   struct A {
     X x;
@@ -468,14 +468,14 @@ namespace cwg535 { // cwg535: yes
   // ok, copy is elided
   constexpr C x = c();
 #endif
-}
+} // namespace cwg535
 
 // cwg536: na
 // cwg537: na
 // cwg538: na
 
-// cwg539: yes
-const cwg539(
+namespace cwg539 { // cwg539: 3.4
+const f(
 // expected-error@-1 {{a type specifier is required for all declarations}}
     const a) {
     // expected-error@-1 {{unknown type name 'a'}}
@@ -509,11 +509,11 @@ const cwg539(
   int arr[3];
   // FIXME: The extra braces here are to avoid the parser getting too
   // badly confused when recovering here. We should fix this recovery.
-  { for (const n
+  { for (const n // #cwg539-for
   // since-cxx11-error@-1 {{unknown type name 'n'}}
-  //   since-cxx11-note@-2 {{}}
          : arr) ; {} }
          // since-cxx11-error@-1 +{{}}
+         //   since-cxx11-note@#cwg539-for {{}}
   (void) [](const) {};
   // since-cxx11-error@-1 {{a type specifier is required for all declarations}}
   (void) [](const n) {};
@@ -526,8 +526,9 @@ const cwg539(
   // since-cxx11-error@-1 {{expected a type}}
 #endif
 }
+} // namespace cwg539
 
-namespace cwg540 { // cwg540: yes
+namespace cwg540 { // cwg540: 2.7
   typedef int &a;
   typedef const a &a;
   // expected-warning@-1 {{'const' qualifier on reference type 'a' (aka 'int &') has no effect}}
@@ -539,9 +540,9 @@ namespace cwg540 { // cwg540: yes
   // expected-error@#cwg540-typedef-b-c {{typedef redefinition with different types ('const int &' vs 'int &')}}
   //   expected-note@#cwg540-typedef-a-c {{previous definition is here}}
   // expected-warning@#cwg540-typedef-b-c {{'const' qualifier on reference type 'b' (aka 'const int &') has no effect}}
-}
+} // namespace cwg540
 
-namespace cwg541 { // cwg541: yes
+namespace cwg541 { // cwg541: 2.7
   template<int> struct X { typedef int type; };
   template<typename T> struct S {
     int f(T);
@@ -569,9 +570,9 @@ namespace cwg541 { // cwg541: yes
       typename X<sizeof(h(0))>::type b;
     }
   };
-}
+} // namespace cwg541
 
-namespace cwg542 { // cwg542: yes
+namespace cwg542 { // cwg542: 3.5
 #if __cplusplus >= 201103L
   // In C++20 A and B are no longer aggregates and thus the constructor is
   // called, which fails.
@@ -591,7 +592,7 @@ namespace cwg542 { // cwg542: yes
   // since-cxx20-error@-1 {{calling a private constructor of class 'cwg542::B'}}
   //   since-cxx20-note@#cwg542-B-ctor {{declared private here}}
 #endif
-}
+} // namespace cwg542
 
 namespace cwg543 { // cwg543: 3.0
   // In C++98+CWG543, this is valid because value-initialization doesn't call a
@@ -607,22 +608,22 @@ namespace cwg543 { // cwg543: 3.0
   A a = A();
   // since-cxx11-error@-1 {{call to implicitly-deleted default constructor of 'A'}}
   //   since-cxx11-note@#cwg543-A-n {{default constructor of 'A' is implicitly deleted because field 'n' of const-qualified type 'const int' would not be initialized}}
-}
+} // namespace cwg543
 
-namespace cwg544 { // cwg544: yes
+namespace cwg544 { // cwg544: 2.7
   int *n;
 
   template<class T> struct A { int n; };
   template<class T> struct B : A<T> { int get(); };
   template<> int B<int>::get() { return n; }
   int k = B<int>().get();
-}
+} // namespace cwg544
 
-namespace cwg546 { // cwg546: yes
+namespace cwg546 { // cwg546: 2.7
   template<typename T> struct A { void f(); };
   template struct A<int>;
   template<typename T> void A<T>::f() { T::error; }
-}
+} // namespace cwg546
 
 namespace cwg547 { // cwg547: 3.2
   template<typename T> struct X;
@@ -631,18 +632,18 @@ namespace cwg547 { // cwg547: 3.2
 
   struct S { void f() const; };
   X<void() const> x = f(&S::f);
-}
+} // namespace cwg547
 
 namespace cwg548 { // cwg548: dup 482
   template<typename T> struct S {};
   template<typename T> void f() {}
   template struct cwg548::S<int>;
   template void cwg548::f<int>();
-}
+} // namespace cwg548
 
 // cwg550: dup 393
 
-namespace cwg551 { // cwg551: yes c++11
+namespace cwg551 { // cwg551: 2.7 c++11
   // FIXME: This obviously should apply in C++98 mode too.
   template<typename T> void f() {}
   template inline void f<int>();
@@ -657,13 +658,13 @@ namespace cwg551 { // cwg551: yes c++11
   };
   template inline void X<int>::f();
   // since-cxx11-error@-1 {{explicit instantiation cannot be 'inline'}}
-}
+} // namespace cwg551
 
-namespace cwg552 { // cwg552: yes
+namespace cwg552 { // cwg552: 2.7
   template<typename T, typename T::U> struct X {};
   struct Y { typedef int U; };
   X<Y, 0> x;
-}
+} // namespace cwg552
 
 // cwg553: 2.7
 struct cwg553_class {
@@ -683,7 +684,7 @@ namespace cwg553 {
     friend void *operator new(size_t, namespace_scope);
     // expected-error@-1 {{'operator new' cannot be declared inside a namespace}}
   };
-}
+} // namespace cwg553
 
 // cwg554: na
 
@@ -753,7 +754,7 @@ namespace cwg557 { // cwg557: 3.1
     f(p);
     g(q);
   }
-}
+} // namespace cwg557
 
 namespace cwg558 { // cwg558: 2.9
   wchar_t a = L'\uD7FF';
@@ -766,9 +767,11 @@ namespace cwg558 { // cwg558: 2.9
   wchar_t f = L'\xDFFF';
   wchar_t g = L'\uE000';
   wchar_t h = L'\xE000';
-}
+} // namespace cwg558
 
-template<typename> struct cwg559 { typedef int T; cwg559::T u; }; // cwg559: yes
+namespace cwg559 { // cwg559: 2.7
+template<typename> struct S { typedef int T; S::T u; };
+} // namespace cwg559
 
 namespace cwg560 { // cwg560: 16
 
@@ -784,7 +787,7 @@ Outer<T>::Inner* Outer<T>::Inner::self() { return this; }
 
 } // namespace cwg560
 
-namespace cwg561 { // cwg561: yes
+namespace cwg561 { // cwg561: 2.7
   template<typename T> void f(int);
   template<typename T> void g(T t) {
     f<T>(t);
@@ -796,19 +799,19 @@ namespace cwg561 { // cwg561: yes
   void h(S s) {
     g(s);
   }
-}
+} // namespace cwg561
 
 // cwg562: na
 // cwg563 is in cwg563.cpp
 
-namespace cwg564 { // cwg564: yes
+namespace cwg564 { // cwg564: 2.7
   extern "C++" void f(int);
   void f(int); // ok
   extern "C++" { extern int n; }
   int n; // ok
-}
+} // namespace cwg564
 
-namespace cwg565 { // cwg565: yes
+namespace cwg565 { // cwg565: 2.7
   namespace N {
     template<typename T> int f(T); // #cwg565-f
   }
@@ -822,13 +825,13 @@ namespace cwg565 { // cwg565: yes
   // expected-error@-1 {{declaration conflicts with target of using declaration already in scope}}
   //   expected-note@#cwg565-f {{target of using declaration}}
   //   expected-note@#cwg565-using {{using declaration}}
-}
+} // namespace cwg565
 
-namespace cwg566 { // cwg566: yes
+namespace cwg566 { // cwg566: 3.1
 #if __cplusplus >= 201103L
   static_assert(int(-3.99) == -3, "");
 #endif
-}
+} // namespace cwg566
 
 // cwg567: na
 
@@ -871,14 +874,14 @@ namespace cwg568 { // cwg568: 3.0 c++11
     trivial t; // #cwg568-t
   x: ;
   }
-}
+} // namespace cwg568
 
-namespace cwg569 { // cwg569: yes c++11
+namespace cwg569 { // cwg569: 2.7 c++11
   // FIXME: This is a DR issue against C++98, so should probably apply there
   // too.
   ;;;;;
   // cxx98-error@-1 {{C++11 extension}}
-}
+} // namespace cwg569
 
 namespace cwg570 { // cwg570: dup 633
   int n;
@@ -886,14 +889,14 @@ namespace cwg570 { // cwg570: dup 633
   int &r = n;
   // expected-error@-1 {{redefinition of 'r'}}
   //   expected-note@#cwg570-r {{previous definition is here}}
-}
+} // namespace cwg570
 
 // cwg571 is in cwg571.cpp
 
-namespace cwg572 { // cwg572: yes
+namespace cwg572 { // cwg572: 2.7
   enum E { a = 1, b = 2 };
   static_assert(a + b == 3, "");
-}
+} // namespace cwg572
 
 namespace cwg573 { // cwg573: no
   void *a;
@@ -910,7 +913,7 @@ namespace cwg573 { // cwg573: no
   // FIXME: This is ill-formed.
   template<void*> struct S;
   template<int*> struct T;
-}
+} // namespace cwg573
 
 namespace cwg574 { // cwg574: 3.0
   struct A {
@@ -965,9 +968,9 @@ namespace cwg574 { // cwg574: 3.0
     //   since-cxx11-note@#cwg574-D-copy-assign {{candidate function}}
 #endif
   };
-}
+} // namespace cwg574
 
-namespace cwg575 { // cwg575: yes
+namespace cwg575 { // cwg575: 2.7
   template<typename T, typename U = typename T::type> void a(T); void a(...);
   // cxx98-error@-1 {{default template arguments for a function template are a C++11 extension}}
   template<typename T, typename T::type U = 0> void b(T); void b(...);
@@ -993,7 +996,7 @@ namespace cwg575 { // cwg575: yes
   template<typename T> T &h(T *);
   template<typename T> T *h(T *);
   void *p = h((void*)0);
-}
+} // namespace cwg575
 
 namespace cwg576 { // cwg576: 3.5
   typedef void f() {}
@@ -1001,7 +1004,7 @@ namespace cwg576 { // cwg576: 3.5
   void f(typedef int n);
   // expected-error@-1 {{invalid storage class specifier in function declarator}}
   void f(char c) { typedef int n; }
-}
+} // namespace cwg576
 
 namespace cwg577 { // cwg577: 3.5
   typedef void V;
@@ -1036,7 +1039,7 @@ namespace cwg577 { // cwg577: 3.5
     // expected-error@-1 {{no matching function for call to 'j'}}
     //   expected-note@#cwg577-j {{candidate template ignored: substitution failure [with T = const void]: argument may not have 'void' type}}
   }
-}
+} // namespace cwg577
 
 namespace cwg580 { // cwg580: partial
   class C;
@@ -1086,7 +1089,7 @@ namespace cwg580 { // cwg580: partial
   //   expected-note@#cwg580-C-ctor {{implicitly declared private here}}
   // expected-error@#cwg580-c {{variable of type 'C' has private destructor}}
   //   expected-note@#cwg580-C-dtor {{implicitly declared private here}}
-}
+} // namespace cwg580
 
 // cwg582: na
 
@@ -1101,7 +1104,7 @@ namespace cwg583 { // cwg583: 4
   // expected-error@-1 {{ordered comparison between pointer and zero ('int *' and 'int')}}
   bool b4 = p >= 0;
   // expected-error@-1 {{ordered comparison between pointer and zero ('int *' and 'int')}}
-}
+} // namespace cwg583
 
 // cwg584: na
 
@@ -1127,7 +1130,7 @@ namespace cwg585 { // cwg585: 3.0
     template<typename U> friend T<U>;
     // expected-error@-1 {{friend type templates must use an elaborated type}}
   };
-}
+} // namespace cwg585
 
 // cwg586: na
 
@@ -1138,9 +1141,9 @@ namespace cwg587 { // cwg587: 3.2
   struct S {};
   template void f(bool, const int, int);
   template void f(bool, const S, S);
-}
+} // namespace cwg587
 
-namespace cwg588 { // cwg588: yes
+namespace cwg588 { // cwg588: 2.7
   struct A { int n; }; // #cwg588-A
   template<typename T> int f() {
     struct S : A, T { int f() { return n; } } s;
@@ -1153,9 +1156,9 @@ namespace cwg588 { // cwg588: yes
   }
   struct B { int n; }; // #cwg588-B
   int k = f<B>(); // #cwg588-k
-}
+} // namespace cwg588
 
-namespace cwg589 { // cwg589: yes
+namespace cwg589 { // cwg589: 2.7
   struct B { };
   struct D : B { };
   D f();
@@ -1165,9 +1168,9 @@ namespace cwg589 { // cwg589: yes
   // expected-error@-1 {{taking the address of a temporary object of type 'const B'}}
   const B *q = &(a ? D() : b);
   // expected-error@-1 {{taking the address of a temporary object of type 'const B'}}
-}
+} // namespace cwg589
 
-namespace cwg590 { // cwg590: yes
+namespace cwg590 { // cwg590: 2.7
   template<typename T> struct A {
     struct B {
       struct C {
@@ -1176,7 +1179,7 @@ namespace cwg590 { // cwg590: yes
     };
   };
   template<typename T> typename A<T>::B::C A<T>::B::C::f(A<T>::B::C) {}
-}
+} // namespace cwg590
 
 namespace cwg591 { // cwg591: 20
   template<typename T> struct A {
@@ -1245,7 +1248,7 @@ namespace cwg591 { // cwg591: 20
     M m;
     // expected-error@-1 {{field has incomplete type 'M' (aka 'void'}}
   };
-}
+} // namespace cwg591
 
 // cwg592: na
 // cwg593 is in cwg593.cpp
@@ -1260,11 +1263,11 @@ namespace cwg595 { // cwg595: dup 1330
   struct S {
     X<S> xs;
   };
-}
+} // namespace cwg595
 
 // cwg597: na
 
-namespace cwg598 { // cwg598: yes
+namespace cwg598 { // cwg598: 2.7
   namespace N {
     void f(int);
     void f(char);
@@ -1284,7 +1287,7 @@ namespace cwg598 { // cwg598: yes
   int &s = h(N::f);
   // expected-error@-1 {{use of undeclared identifier 'h'}}
   int &t = h(N::i);
-}
+} // namespace cwg598
 
 namespace cwg599 { // cwg599: partial
   typedef int Fn();
@@ -1311,4 +1314,4 @@ namespace cwg599 { // cwg599: partial
     //   expected-note@#cwg599-U {{conversion to pointer type 'void *'}}
     delete v;
   }
-}
+} // namespace cwg599
diff --git a/clang/test/CXX/drs/cwg6xx.cpp b/clang/test/CXX/drs/cwg6xx.cpp
index 1c56dd3..fb6acde 100644
--- a/clang/test/CXX/drs/cwg6xx.cpp
+++ b/clang/test/CXX/drs/cwg6xx.cpp
@@ -1,9 +1,10 @@
-// RUN: %clang_cc1 -std=c++98 %s -verify=expected,cxx98-17,cxx98-14,cxx98 -fexceptions -fcxx-exceptions -pedantic-errors -fno-spell-checking
-// RUN: %clang_cc1 -std=c++11 %s -verify=expected,cxx11-20,cxx98-17,cxx11-17,cxx98-14,since-cxx11,cxx11 -fexceptions -fcxx-exceptions -pedantic-errors -fno-spell-checking
-// RUN: %clang_cc1 -std=c++14 %s -verify=expected,cxx11-20,cxx98-17,cxx11-17,cxx98-14,since-cxx11 -fexceptions -fcxx-exceptions -pedantic-errors -fno-spell-checking
-// RUN: %clang_cc1 -std=c++17 %s -verify=expected,cxx11-20,cxx98-17,cxx11-17,since-cxx11 -fexceptions -fcxx-exceptions -pedantic-errors -fno-spell-checking
-// RUN: %clang_cc1 -std=c++20 %s -verify=expected,cxx11-20,since-cxx11 -fexceptions -fcxx-exceptions -pedantic-errors -fno-spell-checking
-// RUN: %clang_cc1 -std=c++23 %s -verify=expected,since-cxx11 -fexceptions -fcxx-exceptions -pedantic-errors -fno-spell-checking
+// RUN: %clang_cc1 -std=c++98 %s -verify=expected,cxx98-17,cxx98-14,cxx98 -fexceptions -fcxx-exceptions -pedantic-errors
+// RUN: %clang_cc1 -std=c++11 %s -verify=expected,cxx11-20,cxx98-17,cxx11-17,cxx98-14,since-cxx11,cxx11 -fexceptions -fcxx-exceptions -pedantic-errors
+// RUN: %clang_cc1 -std=c++14 %s -verify=expected,cxx11-20,cxx98-17,cxx11-17,cxx98-14,since-cxx11 -fexceptions -fcxx-exceptions -pedantic-errors
+// RUN: %clang_cc1 -std=c++17 %s -verify=expected,cxx11-20,cxx98-17,cxx11-17,since-cxx11 -fexceptions -fcxx-exceptions -pedantic-errors
+// RUN: %clang_cc1 -std=c++20 %s -verify=expected,cxx11-20,since-cxx11 -fexceptions -fcxx-exceptions -pedantic-errors
+// RUN: %clang_cc1 -std=c++23 %s -verify=expected,since-cxx11 -fexceptions -fcxx-exceptions -pedantic-errors
+// RUN: %clang_cc1 -std=c++2c %s -verify=expected,since-cxx11 -fexceptions -fcxx-exceptions -pedantic-errors
 
 #if __cplusplus == 199711L
 #define static_assert(...) __extension__ _Static_assert(__VA_ARGS__)
@@ -32,7 +33,7 @@ namespace std {
   __extension__ typedef __SIZE_TYPE__ size_t;
 } // namespace std
 
-namespace cwg601 { // cwg601: yes
+namespace cwg601 { // cwg601: 2.7
 #if __cplusplus >= 201103L
 #define MAX __LLONG_MAX__
 #else
@@ -60,9 +61,9 @@ static_assert(0x8000000000000000 < -1, "0x8000000000000000 should be unsigned");
 #endif
 
 #undef MAX
-}
+} // namespace cwg601
 
-namespace cwg602 { // cwg602: yes
+namespace cwg602 { // cwg602: 2.7
   template<class T> struct A {
     template<class U> friend struct A;
   };
@@ -75,14 +76,14 @@ namespace cwg602 { // cwg602: yes
     typename C::type ct; // ok, befriended
   };
   B<int> b;
-}
+} // namespace cwg602
 
-namespace cwg603 { // cwg603: yes
+namespace cwg603 { // cwg603: 3.1
   template<unsigned char> struct S {};
   typedef S<'\001'> S1;
   typedef S<(1ul << __CHAR_BIT__) + 1> S1;
-  // since-cxx11-error@-1 {{cannot be narrowed}}
-}
+  // since-cxx11-error@-1 {{non-type template argument evaluates to 257, which cannot be narrowed to type 'unsigned char'}}
+} // namespace cwg603
 
 // cwg604: na
 // cwg605 is in cwg605.cpp
@@ -107,9 +108,9 @@ namespace cwg606 { // cwg606: 3.0
     h(test); // ok, an rvalue reference can bind to a function lvalue
   }
 #endif
-}
+} // namespace cwg606
 
-namespace cwg607 { // cwg607: yes
+namespace cwg607 { // cwg607: 2.7
 namespace example1 {
 struct Y {};
 
@@ -137,23 +138,25 @@ N::D::D() : typedef_B(0) {}
 } // namespace example2
 } // namespace cwg607
 
-namespace cwg608 { // cwg608: yes
+namespace cwg608 { // cwg608: 2.7
   struct A { virtual void f(); };
   struct B : A {};
   struct C : A { void f(); };
   struct D : B, C {};
-}
+} // namespace cwg608
 
-static_assert(-0u == 0u, ""); // cwg610: yes
+namespace cwg610 { // cwg610: 2.7
+static_assert(-0u == 0u, "");
+} // namespace cwg610
 
-namespace cwg611 { // cwg611: yes
+namespace cwg611 { // cwg611: 2.7
   int k;
   struct S { int &r; } s = { k ? k : k };
-}
+} // namespace cwg611
 
 // cwg612: na
 
-namespace cwg613 { // cwg613: yes c++11
+namespace cwg613 { // cwg613: 3.1 c++11
   // see also n2253
   struct A { int n; static void f(); };
   int f(int);
@@ -188,15 +191,17 @@ namespace cwg613 { // cwg613: yes c++11
     // cxx98-error@-1 {{invalid use of member 'n' in static member function}}
     // since-cxx11-error@-2 {{invalid use of non-static data member 'n'}}
   }
-}
+} // namespace cwg613
 
-static_assert((-1) / 2 == 0, ""); // cwg614: yes
+namespace cwg614 { // cwg614: 2.7
+static_assert((-1) / 2 == 0, "");
 static_assert((-1) % 2 == -1, "");
+} // namespace cwg614
 
-namespace cwg615 { // cwg615: yes
+namespace cwg615 { // cwg615: 2.7
   int f();
   static int n = f();
-}
+} // namespace cwg615
 
 namespace cwg616 { // cwg616: 4
 #if __cplusplus >= 201103L
@@ -214,15 +219,15 @@ namespace cwg616 { // cwg616: 4
   using U = decltype(static_cast<S&&>(s).n);
   using U = int;
 #endif
-}
+} // namespace cwg616
 
-namespace cwg618 { // cwg618: yes
+namespace cwg618 { // cwg618: 2.7
 #if (unsigned)-1 > 0
 #error wrong
 #endif
-}
+} // namespace cwg618
 
-namespace cwg619 { // cwg619: yes
+namespace cwg619 { // cwg619: 3.4
   extern int x[10];
   struct S { static int x[10]; };
 
@@ -239,24 +244,24 @@ namespace cwg619 { // cwg619: yes
     sizeof(x);
     // expected-error@-1 {{invalid application of 'sizeof' to an incomplete type 'int[]'}}
   }
-}
+} // namespace cwg619
 
 // cwg620: dup 568
 
-namespace cwg621 { // cwg621: yes
+namespace cwg621 { // cwg621: 2.7
   template<typename T> T f();
   template<> int f() {} // #cwg621-f
   template<> int f<int>() {}
   // expected-error@-1 {{redefinition of 'f<int>'}}
   //   expected-note@#cwg621-f {{previous definition is here}}
-}
+} // namespace cwg621
 
 // cwg623: na
 // FIXME: Add documentation saying we allow invalid pointer values.
 
 // cwg624 needs a libc++abi test.
 
-namespace cwg625 { // cwg625: yes
+namespace cwg625 { // cwg625: 2.9
   template<typename T> struct A {};
   A<auto> x = A<int>();
   // cxx98-error@-1 {{'auto' type specifier is a C++11 extension}}
@@ -265,16 +270,16 @@ namespace cwg625 { // cwg625: yes
   void (*p)(auto) = f;
   // cxx98-error@-1 {{'auto' type specifier is a C++11 extension}}
   // expected-error@-2 {{'auto' not allowed in function prototype}}
-}
+} // namespace cwg625
 
-namespace cwg626 { // cwg626: yes
+namespace cwg626 { // cwg626: 2.7
 #define STR(x) #x
   char c[2] = STR(c); // ok, type matches
   wchar_t w[2] = STR(w);
   // expected-error@-1 {{initializing wide char array with non-wide string literal}}
-}
+} // namespace cwg626
 
-namespace cwg627 { // cwg627: yes
+namespace cwg627 { // cwg627: 2.7
   void f() {
     // FIXME: emitted diagnostic have a room for improvement
     true a = 0;
@@ -282,7 +287,7 @@ namespace cwg627 { // cwg627: yes
     // expected-error@-2 {{use of undeclared identifier 'a'}}
     // expected-warning@-3 {{expression result unused}}
   }
-}
+} // namespace cwg627
 
 // cwg628: na
 
@@ -297,9 +302,9 @@ namespace cwg629 { // cwg629: 2.9
     // since-cxx11-error@-1 {{redefinition of 'T'}}
     //   since-cxx11-note@#cwg629-T {{previous definition is here}}
   }
-}
+} // namespace cwg629
 
-namespace cwg630 { // cwg630: yes
+namespace cwg630 { // cwg630: 2.7
 const bool MB_EQ_WC =
     ' ' == L' ' && '\t' == L'\t' && '\v' == L'\v' && '\r' == L'\r' &&
     '\n' == L'\n' && //
@@ -331,19 +336,19 @@ static_assert(!MB_EQ_WC, "__STDC_MB_MIGHT_NEQ_WC__ but all basic source characte
 #else
 static_assert(MB_EQ_WC, "!__STDC_MB_MIGHT_NEQ_WC__ but some character differs");
 #endif
-}
+} // namespace cwg630
 
 // cwg631: na
 
-namespace cwg632 { // cwg632: yes
+namespace cwg632 { // cwg632: 2.7
   struct S { int n; } s = {{5}};
   // expected-warning@-1 {{braces around scalar initializer}}
-}
+} // namespace cwg632
 
 // cwg633: na
 // see also n2993
 
-namespace cwg634 { // cwg634: yes
+namespace cwg634 { // cwg634: 2.7
   struct S { S(); S(const S&); virtual void f(); ~S(); };
   int f(...);
   char f(int);
@@ -353,9 +358,9 @@ namespace cwg634 { // cwg634: yes
   int k = f(S());
   // cxx98-error@-1 {{cannot pass object of non-POD type 'S' through variadic function; call will abort at runtime}}
   // since-cxx11-error@-2 {{cannot pass object of non-trivial type 'S' through variadic function; call will abort at runtime}}
-}
+} // namespace cwg634
 
-namespace cwg635 { // cwg635: yes
+namespace cwg635 { // cwg635: 2.7
   template<typename T> struct A { A(); ~A(); };
   template<typename T> A<T>::A<T>() {}
   // expected-error@-1 {{out-of-line constructor for 'A' cannot have template arguments}}
@@ -380,15 +385,15 @@ namespace cwg635 { // cwg635: yes
   // expected-error@#cwg635-D-T {{out-of-line constructor for 'D' cannot have template arguments}}
   // expected-error@#cwg635-D-T {{redefinition of 'D<T>'}}
   //   expected-note@#cwg635-D {{previous definition is here}}
-}
+} // namespace cwg635
 
-namespace cwg637 { // cwg637: yes
+namespace cwg637 { // cwg637: 3.0
   void f(int i) {
     i = ++i + 1;
     i = i++ + 1;
     // cxx98-14-warning@-1 {{multiple unsequenced modifications to 'i'}}
   }
-}
+} // namespace cwg637
 
 namespace cwg638 { // cwg638: no
   template<typename T> struct A {
@@ -423,16 +428,16 @@ namespace cwg638 { // cwg638: no
       void h() { X::type e; } // FIXME: private
     };
   };
-}
+} // namespace cwg638
 
 namespace cwg639 { // cwg639: 3.3
   void f(int i) {
     void((i = 0) + (i = 0));
     // expected-warning@-1 {{multiple unsequenced modifications to 'i'}}
   }
-}
+} // namespace cwg639
 
-namespace cwg641 { // cwg641: yes
+namespace cwg641 { // cwg641: 2.7
   namespace std_example {
     struct abc;
 
@@ -480,9 +485,9 @@ namespace cwg641 { // cwg641: yes
     (void)A();
     (void)ca;
   }
-}
+} // namespace cwg641
 
-namespace cwg642 { // cwg642: yes
+namespace cwg642 { // cwg642: 2.7
   void f() {
     const int i = 2;
     {
@@ -496,10 +501,10 @@ namespace cwg642 { // cwg642: yes
     struct s *p = new struct s;
     p->a = s;
   }
-}
+} // namespace cwg642
 
-#if __cplusplus >= 201103L
 namespace cwg643 { // cwg643: 3.2
+#if __cplusplus >= 201103L
   struct A {
     int x;
     auto f() -> decltype(this->x);
@@ -513,11 +518,11 @@ namespace cwg643 { // cwg643: 3.2
     // since-cxx11-error@-1 {{use of undeclared identifier 'y'}}
     int y;
   };
-}
 #endif
+} // namespace cwg643
 
-#if __cplusplus >= 201103L
 namespace cwg644 { // cwg644: partial
+#if __cplusplus >= 201103L
   struct A {
     A() = default;
     int x, y;
@@ -540,15 +545,15 @@ namespace cwg644 { // cwg644: partial
     constexpr E() = default;
   };
   static_assert(!__is_literal_type(E<C>), "");
-}
 #endif
+} // namespace cwg644
 
 // cwg645 increases permission to optimize; it's not clear that it's possible to
 // test for this.
 // cwg645: na
 
-#if __cplusplus >= 201103L
 namespace cwg646 { // cwg646: sup 981
+#if __cplusplus >= 201103L
   struct A {
     constexpr A(const A&) = default; // ok
   };
@@ -558,11 +563,11 @@ namespace cwg646 { // cwg646: sup 981
     B(B&);
   };
   constexpr B b = {}; // ok
-}
 #endif
+} // namespace cwg646
 
-#if __cplusplus >= 201103L
 namespace cwg647 { // cwg647: 3.1
+#if __cplusplus >= 201103L
   // This is partially superseded by cwg1358.
   struct A {
     constexpr virtual void f() const;
@@ -619,20 +624,20 @@ namespace cwg647 { // cwg647: 3.1
         : n(get()),
           d(D(0) + f) {} // #cwg647-float-d
   };
-}
 #endif
+} // namespace cwg647
 
+namespace cwg648 { // cwg648: 2.7
 #if __cplusplus >= 201103L
-namespace cwg648 { // cwg648: yes
   int f();
   constexpr int a = (true ? 1 : f());
   constexpr int b = false && f();
   constexpr int c = true || f();
-}
 #endif
+} // namespace cwg648
 
-#if __cplusplus >= 201103L
 namespace cwg649 { // cwg649: 3.5
+#if __cplusplus >= 201103L
 // Maximum alignment is 8192 bytes for Windows, and 4 GB for Linux
 alignas(0x200000000) int n;
 // since-cxx11-error-re@-1 {{{{requested alignment must be (8192|4294967296) bytes or smaller}}}}
@@ -645,13 +650,13 @@ struct Y {
   struct alignas(256) Z {};
   // This part is superseded by cwg2130 and eventually by aligned allocation support.
   auto *p = new Z;
-}
 #endif
+} // namespace cwg649
 
 // cwg650 is in cwg650.cpp
 
+namespace cwg651 { // cwg651: 2.7
 #if __cplusplus >= 201103L
-namespace cwg651 { // cwg651: yes
   struct X {
     virtual X &f();
   };
@@ -660,20 +665,20 @@ namespace cwg651 { // cwg651: yes
   };
   using T = decltype(((X&&)Y()).f());
   using T = X &;
-}
 #endif
+} // namespace cwg651
 
+namespace cwg652 { // cwg652: 3.1
 #if __cplusplus >= 201103L
-namespace cwg652 { // cwg652: yes
   constexpr int n = 1.2 * 3.4;
   static_assert(n == 4, "");
-}
 #endif
+} // namespace cwg652
 
 // cwg653 is in cwg653.cpp
 
-#if __cplusplus >= 201103L
 namespace cwg654 { // cwg654: sup 1423
+#if __cplusplus >= 201103L
   void f() {
     if (nullptr) {}
     // since-cxx11-warning@-1 {{implicit conversion of nullptr constant to 'bool'}}
@@ -696,10 +701,10 @@ namespace cwg654 { // cwg654: sup 1423
     void(true ? nullptr : 0);
     void(true ? 0 : nullptr);
   }
-}
 #endif
+} // namespace cwg654
 
-namespace cwg655 { // cwg655: yes
+namespace cwg655 { // cwg655: 3.0
   struct A { A(int); }; // #cwg655-A
   struct B : A {
     A a; // #cwg655-a
@@ -715,9 +720,9 @@ namespace cwg655 { // cwg655: yes
     //   expected-note@#cwg655-a {{member is declared here}}
     //   expected-note@#cwg655-A {{'cwg655::A' declared here}}
   };
-}
+} // namespace cwg655
 
-namespace cwg656 { // cwg656: yes
+namespace cwg656 { // cwg656: 2.8
   struct A { A(const A&) = delete; };
   // cxx98-error@-1 {{deleted function definitions are a C++11 extension}}
   struct B : A {};
@@ -756,7 +761,7 @@ namespace cwg656 { // cwg656: yes
     //   expected-note@#cwg656-Y {{candidate constructor (the implicit default constructor) not viable: requires 0 arguments, but 1 was provided}}
     accept<const D&>(c);
   }
-}
+} // namespace cwg656
 
 namespace cwg657 { // cwg657: partial
   struct Abs { virtual void x() = 0; }; // #cwg657-Abs
@@ -794,12 +799,12 @@ namespace cwg657 { // cwg657: partial
 
   // FIXME: We should reject this too.
   Cnvt2<Abs>::type err;
-}
+} // namespace cwg657
 
 // cwg658 is in cwg658.cpp
 
-#if __cplusplus >= 201103L
 namespace cwg659 { // cwg659: 3.0
+#if __cplusplus >= 201103L
   static_assert(alignof(char) == alignof(char&), "");
   static_assert(alignof(int) == alignof(int&), "");
   int n = alignof(int(&)());
@@ -808,11 +813,11 @@ namespace cwg659 { // cwg659: 3.0
   int m = alignof(A&);
   // since-cxx11-error@-1 {{invalid application of 'alignof' to an incomplete type 'A'}}
   //   since-cxx11-note@#cwg659-A {{forward declaration of 'cwg659::A'}}
-}
 #endif
+} // namespace cwg659
 
-#if __cplusplus >= 201103L
 namespace cwg660 { // cwg660: 3.0
+#if __cplusplus >= 201103L
   enum : int { a };
   enum class { b };
   // since-cxx11-error@-1 {{scoped enumeration requires a name}}
@@ -824,12 +829,12 @@ namespace cwg660 { // cwg660: 3.0
     // since-cxx11-error@-1 {{scoped enumeration requires a name}}
   };
   auto y = X::a;
-}
 #endif
+} // namespace cwg660
 
 // cwg661 is in cwg661.cpp
 
-namespace cwg662 { // cwg662: yes
+namespace cwg662 { // cwg662: 2.7
   template <typename T> void f(T t) {
     T &tr = t;
     T *tp = &t;
@@ -840,22 +845,22 @@ namespace cwg662 { // cwg662: yes
 #endif
   }
   void g(int n) { f<int&>(n); } // #cwg662-f-call
-}
+} // namespace cwg662
 
 namespace cwg663 { // cwg663: sup P1949
   int ЍЎ = 123;
-}
+} // namespace cwg663
 
+namespace cwg664 { // cwg664: 2.7
 #if __cplusplus >= 201103L
-namespace cwg664 { // cwg664: yes
   struct A { A(const A&) = delete; };
   A &&f(A &&a, int n) {
     if (n)
       return f(static_cast<A&&>(a), n - 1);
     return static_cast<A&&>(a);
   }
-}
 #endif
+} // namespace cwg664
 
 namespace cwg665 { // cwg665: 2.8
   struct A { virtual ~A(); };
@@ -882,7 +887,7 @@ namespace cwg665 { // cwg665: 2.8
     //   expected-note@#cwg665-VC {{declared private here}}
     (void)dynamic_cast<A*>(vd);
   }
-}
+} // namespace cwg665
 
 namespace cwg666 { // cwg666: 2.8
   struct P { friend P operator*(P, P); P(int); } p(0);
@@ -901,11 +906,11 @@ namespace cwg666 { // cwg666: 2.8
   struct Y { typedef int type; };
   int a = f<X>();
   int b = f<Y>(); // #cwg666-f-Y
-}
+} // namespace cwg666
 
 // Triviality is entirely different in C++98.
-#if __cplusplus >= 201103L
 namespace cwg667 { // cwg667: 8
+#if __cplusplus >= 201103L
   struct A {
     A() = default; // #cwg667-A-ctor
     // since-cxx11-warning@-1 {{explicitly defaulted default constructor is implicitly deleted}}
@@ -926,13 +931,13 @@ namespace cwg667 { // cwg667: 8
   struct F { F &operator=(F&&) = delete; };
   struct G : F {};
   static_assert(!__is_trivially_assignable(G, G&&), "");
-}
 #endif
+} // namespace cwg667
 
 // cwg668 needs an libc++abi test
 
+namespace cwg669 { // cwg669: 3.1
 #if __cplusplus >= 201103L
-namespace cwg669 { // cwg669: yes
   void f() {
     int n;
     using T = decltype(n);
@@ -957,8 +962,8 @@ namespace cwg669 { // cwg669: yes
       }
     };
   }
-}
 #endif
+} // namespace cwg669
 
 namespace cwg671 { // cwg671: 2.9
   enum class E { e };
@@ -967,11 +972,11 @@ namespace cwg671 { // cwg671: 2.9
   int n = static_cast<int>(E::e);
   // cxx98-error@-1 {{use of enumeration in a nested name specifier is a C++11 extension}}
   int m = static_cast<int>(e);
-}
+} // namespace cwg671
 
 // cwg672 is in cwg672.cpp
 
-namespace cwg673 { // cwg673: yes
+namespace cwg673 { // cwg673: 2.7
   template<typename> struct X { static const int n = 0; };
 
   class A {
@@ -979,15 +984,15 @@ namespace cwg673 { // cwg673: yes
     class C *f();
     void f(class D *);
     enum { e = X<struct E>::n };
-    void g() { extern struct F *p; }
+    void g() { extern struct FF *p; }
   };
   B *b;
   C *c;
   D *d;
   E *e;
-  F *f;
-  // expected-error@-1 {{unknown type name 'F'}}
-}
+  FF *ff;
+  // expected-error@-1 {{unknown type name 'FF'}}
+} // namespace cwg673
 
 namespace cwg674 { // cwg674: 8
   template<typename T> int f(T);
@@ -1053,7 +1058,7 @@ namespace cwg674 { // cwg674: 8
   template int Y::f<>(int);
   template int Y::g<>(int); // #cwg674-Y-g-int
   template int Y::h<>(int);
-}
+} // namespace cwg674
 
 namespace cwg675 { // cwg675: dup 739
   template<typename T> struct A { T n : 1; };
@@ -1065,7 +1070,7 @@ namespace cwg675 { // cwg675: dup 739
   static_assert(A<long long>{1}.n < 0, "");
   // since-cxx11-warning@-1 {{implicit truncation from 'int' to a one-bit wide bit-field changes value from 1 to -1}}
 #endif
-}
+} // namespace cwg675
 
 // cwg676: na
 
@@ -1091,23 +1096,23 @@ namespace cwg677 { // cwg677: no
   B::~B() {}
   // expected-error@-1 {{attempt to use a deleted function}}
   //   expected-note@#cwg677-B-delete {{'operator delete' has been explicitly marked deleted here}}
-}
+} // namespace cwg677
 
 // cwg678 FIXME: check that the modules ODR check catches this
 
-namespace cwg679 { // cwg679: yes
+namespace cwg679 { // cwg679: 2.7
   struct X {};
   template<int> void operator+(X, X);
   template<> void operator+<0>(X, X) {} // #cwg679-def
   template<> void operator+<0>(X, X) {}
   // expected-error@-1 {{redefinition of 'operator+<0>'}}
   //   expected-note@#cwg679-def {{previous definition is here}}
-}
+} // namespace cwg679
 
 // cwg680: na
 
-#if __cplusplus >= 201103L
 namespace cwg681 { // cwg681: partial
+#if __cplusplus >= 201103L
   auto *a() -> int;
   // since-cxx11-error@-1 {{function with trailing return type must specify return type 'auto', not 'auto *'}}
   auto (*b)() -> int;
@@ -1126,11 +1131,11 @@ namespace cwg681 { // cwg681: partial
 
   auto f() -> int (*)();
   auto g() -> auto (*)() -> int;
-}
 #endif
+} // namespace cwg681
 
+namespace cwg683 { // cwg683: 3.3
 #if __cplusplus >= 201103L
-namespace cwg683 { // cwg683: yes
   struct A {
     A() = default;
     A(const A&) = default;
@@ -1144,22 +1149,22 @@ namespace cwg683 { // cwg683: yes
   static_assert(__is_trivially_constructible(B, const B&), "");
   static_assert(__is_trivially_constructible(B, B&), "");
   static_assert(__is_trivial(B), "");
-}
 #endif
+} // namespace cwg683
 
-#if __cplusplus >= 201103L
 namespace cwg684 { // cwg684: sup 1454
+#if __cplusplus >= 201103L
   void f() {
     int a;  // #cwg684-a
     constexpr int *p = &a;
-    // expected-error@-1 {{constexpr variable 'p' must be initialized by a constant expression}}
-    //   expected-note@-2 {{pointer to 'a' is not a constant expression}}
-    //   expected-note@#cwg684-a {{here}}
+    // since-cxx11-error@-1 {{constexpr variable 'p' must be initialized by a constant expression}}
+    //   since-cxx11-note@-2 {{pointer to 'a' is not a constant expression}}
+    //   since-cxx11-note@#cwg684-a {{here}}
   }
-}
 #endif
+} // namespace cwg684
 
-namespace cwg685 { // cwg685: yes
+namespace cwg685 { // cwg685: 10
   enum E : long { e };
   // cxx98-error@-1 {{enumeration types with a fixed underlying type are a C++11 extension}}
   void f(int);
@@ -1187,7 +1192,7 @@ namespace cwg685 { // cwg685: yes
   int k(short);
   void k(int);
   int x = k(g);
-}
+} // namespace cwg685
 
 namespace cwg686 { // cwg686: 3.0
   void f() {
@@ -1244,18 +1249,18 @@ namespace cwg686 { // cwg686: 3.0
     //   expected-note@-2 {{forward declaration of 'P'}}
     catch (struct P {} *) {}
     // expected-error@-1 {{'P' cannot be defined in a type specifier}}
-#if __cplusplus < 201703L
+#if __cplusplus <= 201402L
     void g() throw(struct Q);
-    // cxx98-17-error@-1 {{incomplete type 'struct Q' is not allowed in exception specification}}
-    //   cxx98-17-note@-2 {{forward declaration of 'Q'}}
+    // cxx98-14-error@-1 {{incomplete type 'struct Q' is not allowed in exception specification}}
+    //   cxx98-14-note@-2 {{forward declaration of 'Q'}}
     void h() throw(struct Q {});
-    // cxx98-17-error@-1 {{'Q' cannot be defined in a type specifier}}
+    // cxx98-14-error@-1 {{'Q' cannot be defined in a type specifier}}
 #endif
   }
   template<struct R *> struct X;
   template<struct R {} *> struct Y;
   // expected-error@-1 {{'cwg686::R' cannot be defined in a type specifier}}
-}
+} // namespace cwg686
 
 namespace cwg687 { // cwg687 (9 c++20, but the issue is still considered open)
   template<typename T> void f(T a) {
@@ -1267,7 +1272,7 @@ namespace cwg687 { // cwg687 (9 c++20, but the issue is still considered open)
     template g<int>(a);
     // expected-error@-1 {{expected '<' after 'template'}}
   }
-}
+} // namespace cwg687
 
 namespace cwg692 { // cwg692: 16
   // Also see cwg1395.
@@ -1358,7 +1363,7 @@ namespace cwg692 { // cwg692: 16
     template<class T> void f(T){}
     template void f(int*);
   }
-}
+} // namespace cwg692
 
 namespace cwg696 { // cwg696: 3.1
   void f(const int*);
@@ -1384,4 +1389,4 @@ namespace cwg696 { // cwg696: 3.1
     //   since-cxx11-note@-7 {{default capture by reference}}
 #endif
   }
-}
+} // namespace cwg696
diff --git a/clang/test/CXX/drs/cwg722.cpp b/clang/test/CXX/drs/cwg722.cpp
index 6e7d456..1d95e14 100644
--- a/clang/test/CXX/drs/cwg722.cpp
+++ b/clang/test/CXX/drs/cwg722.cpp
@@ -12,7 +12,7 @@
 #if __cplusplus >= 201103L
 namespace std {
   using nullptr_t = decltype(nullptr);
-}
+} // namespace std
 
 void f(std::nullptr_t, ...);
 std::nullptr_t g();
diff --git a/clang/test/CXX/drs/cwg7xx.cpp b/clang/test/CXX/drs/cwg7xx.cpp
index 507eb8f..a8ab2e2 100644
--- a/clang/test/CXX/drs/cwg7xx.cpp
+++ b/clang/test/CXX/drs/cwg7xx.cpp
@@ -2,14 +2,16 @@
 // RUN: %clang_cc1 -triple %itanium_abi_triple -std=c++11 %s -verify=expected,cxx98-14,cxx98-11,since-cxx11 -fexceptions -fcxx-exceptions -pedantic-errors
 // RUN: %clang_cc1 -triple %itanium_abi_triple -std=c++14 %s -verify=expected,cxx98-14,since-cxx14,since-cxx11,cxx14 -fexceptions -fcxx-exceptions -pedantic-errors
 // RUN: %clang_cc1 -triple %itanium_abi_triple -std=c++17 %s -verify=expected,since-cxx14,since-cxx11 -fexceptions -fcxx-exceptions -pedantic-errors
-// RUN: %clang_cc1 -triple %itanium_abi_triple -std=c++2a %s -verify=expected,since-cxx14,since-cxx11 -fexceptions -fcxx-exceptions -pedantic-errors
+// RUN: %clang_cc1 -triple %itanium_abi_triple -std=c++20 %s -verify=expected,since-cxx14,since-cxx11 -fexceptions -fcxx-exceptions -pedantic-errors
+// RUN: %clang_cc1 -triple %itanium_abi_triple -std=c++23 %s -verify=expected,since-cxx14,since-cxx11 -fexceptions -fcxx-exceptions -pedantic-errors
+// RUN: %clang_cc1 -triple %itanium_abi_triple -std=c++2c %s -verify=expected,since-cxx14,since-cxx11 -fexceptions -fcxx-exceptions -pedantic-errors
 
 #if __cplusplus == 199711L
 #define static_assert(...) __extension__ _Static_assert(__VA_ARGS__)
 // cxx98-error@-1 {{variadic macros are a C99 feature}}
 #endif
 
-namespace cwg705 { // cwg705: yes
+namespace cwg705 { // cwg705: 2.7
   namespace N {
     struct S {};
     void f(S); // #cwg705-f
@@ -22,7 +24,7 @@ namespace cwg705 { // cwg705: yes
     // expected-error@-1 {{use of undeclared identifier 'f'}}
     //   expected-note@#cwg705-f {{'N::f' declared here}}
   }
-}
+} // namespace cwg705
 
 namespace cwg712 { // cwg712: partial
   void use(int);
@@ -74,7 +76,7 @@ namespace cwg712 { // cwg712: partial
     };
   }
 #endif
-}
+} // namespace cwg712
 
 namespace cwg713 { // cwg713: 3.0
 template<typename T>
@@ -322,7 +324,7 @@ namespace cwg727 { // cwg727: partial
     //   expected-note@#cwg727-S2-T {{previous}}
   };
   Collision<int, int> c; // #cwg727-Collision-int-int
-}
+} // namespace cwg727
 
 namespace cwg777 { // cwg777: 3.7
 #if __cplusplus >= 201103L
@@ -336,4 +338,16 @@ void g(int i = 0, T ...args, T ...args2) {}
 template <typename... T>
 void h(int i = 0, T ...args, int j = 1) {}
 #endif
-}
+} // namespace cwg777
+
+namespace cwg794 { // cwg794: 2.7
+struct B {};
+struct D : B {};
+struct X {
+  D d;
+};
+struct Y : X {};
+B Y::*pm = &X::d;
+// expected-error@-1 {{cannot initialize a variable of type 'B Y::*' with an rvalue of type 'D cwg794::X::*': different classes ('Y' vs 'cwg794::X')}}
+// FIXME: why diagnostic says just `Y` and not `cwg794::Y`, like `cwg794::X`?
+} // namespace cwg794
diff --git a/clang/test/CXX/drs/cwg8xx.cpp b/clang/test/CXX/drs/cwg8xx.cpp
index 38bff3ad..ecb9113 100644
--- a/clang/test/CXX/drs/cwg8xx.cpp
+++ b/clang/test/CXX/drs/cwg8xx.cpp
@@ -13,7 +13,7 @@ export template <class T> struct B {};
 export template<typename T> void f() {}
 // cxx98-17-warning@-1 {{exported templates are unsupported}}
 // since-cxx20-error@-2 {{export declaration can only be used within a module purview}}
-}
+} // namespace cwg820
 
 namespace cwg873 { // cwg873: 3.0
 #if __cplusplus >= 201103L
diff --git a/clang/test/CXX/drs/cwg9xx.cpp b/clang/test/CXX/drs/cwg9xx.cpp
index d4f54bc..96e4674 100644
--- a/clang/test/CXX/drs/cwg9xx.cpp
+++ b/clang/test/CXX/drs/cwg9xx.cpp
@@ -4,6 +4,7 @@
 // RUN: %clang_cc1 -std=c++17 %s -verify=expected,since-cxx11 -fexceptions -fcxx-exceptions -pedantic-errors
 // RUN: %clang_cc1 -std=c++20 %s -verify=expected,since-cxx11 -fexceptions -fcxx-exceptions -pedantic-errors
 // RUN: %clang_cc1 -std=c++23 %s -verify=expected,since-cxx11 -fexceptions -fcxx-exceptions -pedantic-errors
+// RUN: %clang_cc1 -std=c++2c %s -verify=expected,since-cxx11 -fexceptions -fcxx-exceptions -pedantic-errors
 
 namespace std {
   __extension__ typedef __SIZE_TYPE__ size_t;
@@ -12,7 +13,7 @@ namespace std {
     const T *p; size_t n;
     initializer_list(const T *p, size_t n);
   };
-}
+} // namespace std
 
 namespace cwg930 { // cwg930: 2.7
 #if __cplusplus >= 201103L
@@ -48,7 +49,7 @@ namespace cwg948 { // cwg948: 3.7
      while (constexpr A i = 0) { }
   }
 #endif
-}
+} // namespace cwg948
 
 namespace cwg952 { // cwg952: 2.8
 namespace example1 {
@@ -143,15 +144,15 @@ class B : A {
 
 } // namespace cwg960
 
-namespace cwg974 { // cwg974: yes
+namespace cwg974 { // cwg974: 3.3
 #if __cplusplus >= 201103L
   void test() {
     auto lam = [](int x = 42) { return x; };
   }
 #endif
-}
+} // namespace cwg974
 
-namespace cwg977 { // cwg977: yes
+namespace cwg977 { // cwg977: 2.7
 enum E { e = E() }; // #cwg977-E
 #if !defined(_WIN32) || defined(__MINGW32__)
 // expected-error@#cwg977-E {{invalid use of incomplete type 'E'}}
@@ -197,4 +198,4 @@ namespace cwg990 { // cwg990: 3.5
   };
   D d{};
 #endif
-}
+} // namespace cwg990
diff --git a/clang/test/CodeGen/AArch64/fmv-dependencies.c b/clang/test/CodeGen/AArch64/fmv-dependencies.c
index 3a524b8..097b85e 100644
--- a/clang/test/CodeGen/AArch64/fmv-dependencies.c
+++ b/clang/test/CodeGen/AArch64/fmv-dependencies.c
@@ -42,7 +42,7 @@ __attribute__((target_version("flagm"))) int fmv(void) { return 0; }
 // CHECK: define dso_local i32 @fmv._Mflagm2() #[[flagm2:[0-9]+]] {
 __attribute__((target_version("flagm2"))) int fmv(void) { return 0; }
 
-// CHECK: define dso_local i32 @fmv._Mfp() #[[default:[0-9]+]] {
+// CHECK: define dso_local i32 @fmv._Mfp() #[[fp:[0-9]+]] {
 __attribute__((target_version("fp"))) int fmv(void) { return 0; }
 
 // CHECK: define dso_local i32 @fmv._Mfp16() #[[fp16:[0-9]+]] {
@@ -99,7 +99,7 @@ __attribute__((target_version("sha2"))) int fmv(void) { return 0; }
 // CHECK: define dso_local i32 @fmv._Msha3() #[[sha3:[0-9]+]] {
 __attribute__((target_version("sha3"))) int fmv(void) { return 0; }
 
-// CHECK: define dso_local i32 @fmv._Msimd() #[[default]] {
+// CHECK: define dso_local i32 @fmv._Msimd() #[[simd:[0-9]+]] {
 __attribute__((target_version("simd"))) int fmv(void) { return 0; }
 
 // CHECK: define dso_local i32 @fmv._Msm4() #[[sm4:[0-9]+]] {
@@ -150,48 +150,49 @@ int caller() {
   return fmv();
 }
 
-// CHECK: attributes #[[aes]] = { {{.*}} "target-features"="+aes,+fmv,+fp-armv8,+neon,+outline-atomics,+v8a"
-// CHECK: attributes #[[bf16]] = { {{.*}} "target-features"="+bf16,+fmv,+fp-armv8,+neon,+outline-atomics,+v8a"
-// CHECK: attributes #[[bti]] = { {{.*}} "target-features"="+bti,+fmv,+fp-armv8,+neon,+outline-atomics,+v8a"
-// CHECK: attributes #[[crc]] = { {{.*}} "target-features"="+crc,+fmv,+fp-armv8,+neon,+outline-atomics,+v8a"
-// CHECK: attributes #[[dit]] = { {{.*}} "target-features"="+dit,+fmv,+fp-armv8,+neon,+outline-atomics,+v8a"
-// CHECK: attributes #[[dotprod]] = { {{.*}} "target-features"="+dotprod,+fmv,+fp-armv8,+neon,+outline-atomics,+v8a"
-// CHECK: attributes #[[dpb]] = { {{.*}} "target-features"="+ccpp,+fmv,+fp-armv8,+neon,+outline-atomics,+v8a"
-// CHECK: attributes #[[dpb2]] = { {{.*}} "target-features"="+ccdp,+ccpp,+fmv,+fp-armv8,+neon,+outline-atomics,+v8a"
-// CHECK: attributes #[[f32mm]] = { {{.*}} "target-features"="+f32mm,+fmv,+fp-armv8,+fullfp16,+neon,+outline-atomics,+sve,+v8a"
-// CHECK: attributes #[[f64mm]] = { {{.*}} "target-features"="+f64mm,+fmv,+fp-armv8,+fullfp16,+neon,+outline-atomics,+sve,+v8a"
-// CHECK: attributes #[[fcma]] = { {{.*}} "target-features"="+complxnum,+fmv,+fp-armv8,+neon,+outline-atomics,+v8a"
-// CHECK: attributes #[[flagm]] = { {{.*}} "target-features"="+flagm,+fmv,+fp-armv8,+neon,+outline-atomics,+v8a"
-// CHECK: attributes #[[flagm2]] = { {{.*}} "target-features"="+altnzcv,+flagm,+fmv,+fp-armv8,+neon,+outline-atomics,+v8a"
-// CHECK: attributes #[[default]] = { {{.*}} "target-features"="+fmv,+fp-armv8,+neon,+outline-atomics,+v8a"
-// CHECK: attributes #[[fp16]] = { {{.*}} "target-features"="+fmv,+fp-armv8,+fullfp16,+neon,+outline-atomics,+v8a"
-// CHECK: attributes #[[fp16fml]] = { {{.*}} "target-features"="+fmv,+fp-armv8,+fp16fml,+fullfp16,+neon,+outline-atomics,+v8a"
-// CHECK: attributes #[[frintts]] = { {{.*}} "target-features"="+fmv,+fp-armv8,+fptoint,+neon,+outline-atomics,+v8a"
-// CHECK: attributes #[[i8mm]] = { {{.*}} "target-features"="+fmv,+fp-armv8,+i8mm,+neon,+outline-atomics,+v8a"
-// CHECK: attributes #[[jscvt]] = { {{.*}} "target-features"="+fmv,+fp-armv8,+jsconv,+neon,+outline-atomics,+v8a"
-// CHECK: attributes #[[ls64]] = { {{.*}} "target-features"="+fmv,+fp-armv8,+ls64,+neon,+outline-atomics,+v8a"
-// CHECK: attributes #[[lse]] = { {{.*}} "target-features"="+fmv,+fp-armv8,+lse,+neon,+outline-atomics,+v8a"
-// CHECK: attributes #[[memtag]] = { {{.*}} "target-features"="+fmv,+fp-armv8,+mte,+neon,+outline-atomics,+v8a"
-// CHECK: attributes #[[mops]] = { {{.*}} "target-features"="+fmv,+fp-armv8,+mops,+neon,+outline-atomics,+v8a"
-// CHECK: attributes #[[predres]] = { {{.*}} "target-features"="+fmv,+fp-armv8,+neon,+outline-atomics,+predres,+v8a"
-// CHECK: attributes #[[rcpc]] = { {{.*}} "target-features"="+fmv,+fp-armv8,+neon,+outline-atomics,+rcpc,+v8a"
-// CHECK: attributes #[[rcpc2]] = { {{.*}} "target-features"="+fmv,+fp-armv8,+neon,+outline-atomics,+rcpc,+rcpc-immo,+v8a"
-// CHECK: attributes #[[rcpc3]] = { {{.*}} "target-features"="+fmv,+fp-armv8,+neon,+outline-atomics,+rcpc,+rcpc-immo,+rcpc3,+v8a"
-// CHECK: attributes #[[rdm]] = { {{.*}} "target-features"="+fmv,+fp-armv8,+neon,+outline-atomics,+rdm,+v8a"
-// CHECK: attributes #[[rng]] = { {{.*}} "target-features"="+fmv,+fp-armv8,+neon,+outline-atomics,+rand,+v8a"
-// CHECK: attributes #[[sb]] = { {{.*}} "target-features"="+fmv,+fp-armv8,+neon,+outline-atomics,+sb,+v8a"
-// CHECK: attributes #[[sha2]] = { {{.*}} "target-features"="+fmv,+fp-armv8,+neon,+outline-atomics,+sha2,+v8a"
-// CHECK: attributes #[[sha3]] = { {{.*}} "target-features"="+fmv,+fp-armv8,+neon,+outline-atomics,+sha2,+sha3,+v8a"
-// CHECK: attributes #[[sm4]] = { {{.*}} "target-features"="+fmv,+fp-armv8,+neon,+outline-atomics,+sm4,+v8a"
-// CHECK: attributes #[[sme]] = { {{.*}} "target-features"="+bf16,+fmv,+fp-armv8,+fullfp16,+neon,+outline-atomics,+sme,+v8a"
-// CHECK: attributes #[[sme_f64f64]] = { {{.*}} "target-features"="+bf16,+fmv,+fp-armv8,+fullfp16,+neon,+outline-atomics,+sme,+sme-f64f64,+v8a"
-// CHECK: attributes #[[sme_i16i64]] = { {{.*}} "target-features"="+bf16,+fmv,+fp-armv8,+fullfp16,+neon,+outline-atomics,+sme,+sme-i16i64,+v8a"
-// CHECK: attributes #[[sme2]] = { {{.*}} "target-features"="+bf16,+fmv,+fp-armv8,+fullfp16,+neon,+outline-atomics,+sme,+sme2,+v8a"
-// CHECK: attributes #[[ssbs]] = { {{.*}} "target-features"="+fmv,+fp-armv8,+neon,+outline-atomics,+ssbs,+v8a"
-// CHECK: attributes #[[sve]] = { {{.*}} "target-features"="+fmv,+fp-armv8,+fullfp16,+neon,+outline-atomics,+sve,+v8a"
-// CHECK: attributes #[[sve2]] = { {{.*}} "target-features"="+fmv,+fp-armv8,+fullfp16,+neon,+outline-atomics,+sve,+sve2,+v8a"
-// CHECK: attributes #[[sve2_aes]] = { {{.*}} "target-features"="+aes,+fmv,+fp-armv8,+fullfp16,+neon,+outline-atomics,+sve,+sve-aes,+sve2,+sve2-aes,+v8a"
-// CHECK: attributes #[[sve2_bitperm]] = { {{.*}} "target-features"="+fmv,+fp-armv8,+fullfp16,+neon,+outline-atomics,+sve,+sve2,+sve2-bitperm,+v8a"
-// CHECK: attributes #[[sve2_sha3]] = { {{.*}} "target-features"="+fmv,+fp-armv8,+fullfp16,+neon,+outline-atomics,+sha2,+sha3,+sve,+sve2,+sve2-sha3,+v8a"
-// CHECK: attributes #[[sve2_sm4]] = { {{.*}} "target-features"="+fmv,+fp-armv8,+fullfp16,+neon,+outline-atomics,+sm4,+sve,+sve2,+sve2-sm4,+v8a"
-// CHECK: attributes #[[wfxt]] = { {{.*}} "target-features"="+fmv,+fp-armv8,+neon,+outline-atomics,+v8a,+wfxt"
+// CHECK: attributes #[[aes]] = { {{.*}} "target-features"="+aes,+fp-armv8,+neon,+outline-atomics,+v8a"
+// CHECK: attributes #[[bf16]] = { {{.*}} "target-features"="+bf16,+fp-armv8,+neon,+outline-atomics,+v8a"
+// CHECK: attributes #[[bti]] = { {{.*}} "target-features"="+bti,+fp-armv8,+neon,+outline-atomics,+v8a"
+// CHECK: attributes #[[crc]] = { {{.*}} "target-features"="+crc,+fp-armv8,+neon,+outline-atomics,+v8a"
+// CHECK: attributes #[[dit]] = { {{.*}} "target-features"="+dit,+fp-armv8,+neon,+outline-atomics,+v8a"
+// CHECK: attributes #[[dotprod]] = { {{.*}} "target-features"="+dotprod,+fp-armv8,+neon,+outline-atomics,+v8a"
+// CHECK: attributes #[[dpb]] = { {{.*}} "target-features"="+ccpp,+fp-armv8,+neon,+outline-atomics,+v8a"
+// CHECK: attributes #[[dpb2]] = { {{.*}} "target-features"="+ccdp,+ccpp,+fp-armv8,+neon,+outline-atomics,+v8a"
+// CHECK: attributes #[[f32mm]] = { {{.*}} "target-features"="+f32mm,+fp-armv8,+fullfp16,+neon,+outline-atomics,+sve,+v8a"
+// CHECK: attributes #[[f64mm]] = { {{.*}} "target-features"="+f64mm,+fp-armv8,+fullfp16,+neon,+outline-atomics,+sve,+v8a"
+// CHECK: attributes #[[fcma]] = { {{.*}} "target-features"="+complxnum,+fp-armv8,+neon,+outline-atomics,+v8a"
+// CHECK: attributes #[[flagm]] = { {{.*}} "target-features"="+flagm,+fp-armv8,+neon,+outline-atomics,+v8a"
+// CHECK: attributes #[[flagm2]] = { {{.*}} "target-features"="+altnzcv,+flagm,+fp-armv8,+neon,+outline-atomics,+v8a"
+// CHECK: attributes #[[fp]] = { {{.*}} "target-features"="+fp-armv8,+neon,+outline-atomics,+v8a"
+// CHECK: attributes #[[fp16]] = { {{.*}} "target-features"="+fp-armv8,+fullfp16,+neon,+outline-atomics,+v8a"
+// CHECK: attributes #[[fp16fml]] = { {{.*}} "target-features"="+fp-armv8,+fp16fml,+fullfp16,+neon,+outline-atomics,+v8a"
+// CHECK: attributes #[[frintts]] = { {{.*}} "target-features"="+fp-armv8,+fptoint,+neon,+outline-atomics,+v8a"
+// CHECK: attributes #[[i8mm]] = { {{.*}} "target-features"="+fp-armv8,+i8mm,+neon,+outline-atomics,+v8a"
+// CHECK: attributes #[[jscvt]] = { {{.*}} "target-features"="+fp-armv8,+jsconv,+neon,+outline-atomics,+v8a"
+// CHECK: attributes #[[ls64]] = { {{.*}} "target-features"="+fp-armv8,+ls64,+neon,+outline-atomics,+v8a"
+// CHECK: attributes #[[lse]] = { {{.*}} "target-features"="+fp-armv8,+lse,+neon,+outline-atomics,+v8a"
+// CHECK: attributes #[[memtag]] = { {{.*}} "target-features"="+fp-armv8,+mte,+neon,+outline-atomics,+v8a"
+// CHECK: attributes #[[mops]] = { {{.*}} "target-features"="+fp-armv8,+mops,+neon,+outline-atomics,+v8a"
+// CHECK: attributes #[[predres]] = { {{.*}} "target-features"="+fp-armv8,+neon,+outline-atomics,+predres,+v8a"
+// CHECK: attributes #[[rcpc]] = { {{.*}} "target-features"="+fp-armv8,+neon,+outline-atomics,+rcpc,+v8a"
+// CHECK: attributes #[[rcpc2]] = { {{.*}} "target-features"="+fp-armv8,+neon,+outline-atomics,+rcpc,+rcpc-immo,+v8a"
+// CHECK: attributes #[[rcpc3]] = { {{.*}} "target-features"="+fp-armv8,+neon,+outline-atomics,+rcpc,+rcpc-immo,+rcpc3,+v8a"
+// CHECK: attributes #[[rdm]] = { {{.*}} "target-features"="+fp-armv8,+neon,+outline-atomics,+rdm,+v8a"
+// CHECK: attributes #[[rng]] = { {{.*}} "target-features"="+fp-armv8,+neon,+outline-atomics,+rand,+v8a"
+// CHECK: attributes #[[sb]] = { {{.*}} "target-features"="+fp-armv8,+neon,+outline-atomics,+sb,+v8a"
+// CHECK: attributes #[[sha2]] = { {{.*}} "target-features"="+fp-armv8,+neon,+outline-atomics,+sha2,+v8a"
+// CHECK: attributes #[[sha3]] = { {{.*}} "target-features"="+fp-armv8,+neon,+outline-atomics,+sha2,+sha3,+v8a"
+// CHECK: attributes #[[simd]] = { {{.*}} "target-features"="+fp-armv8,+neon,+outline-atomics,+v8a"
+// CHECK: attributes #[[sm4]] = { {{.*}} "target-features"="+fp-armv8,+neon,+outline-atomics,+sm4,+v8a"
+// CHECK: attributes #[[sme]] = { {{.*}} "target-features"="+bf16,+fp-armv8,+fullfp16,+neon,+outline-atomics,+sme,+v8a"
+// CHECK: attributes #[[sme_f64f64]] = { {{.*}} "target-features"="+bf16,+fp-armv8,+fullfp16,+neon,+outline-atomics,+sme,+sme-f64f64,+v8a"
+// CHECK: attributes #[[sme_i16i64]] = { {{.*}} "target-features"="+bf16,+fp-armv8,+fullfp16,+neon,+outline-atomics,+sme,+sme-i16i64,+v8a"
+// CHECK: attributes #[[sme2]] = { {{.*}} "target-features"="+bf16,+fp-armv8,+fullfp16,+neon,+outline-atomics,+sme,+sme2,+v8a"
+// CHECK: attributes #[[ssbs]] = { {{.*}} "target-features"="+fp-armv8,+neon,+outline-atomics,+ssbs,+v8a"
+// CHECK: attributes #[[sve]] = { {{.*}} "target-features"="+fp-armv8,+fullfp16,+neon,+outline-atomics,+sve,+v8a"
+// CHECK: attributes #[[sve2]] = { {{.*}} "target-features"="+fp-armv8,+fullfp16,+neon,+outline-atomics,+sve,+sve2,+v8a"
+// CHECK: attributes #[[sve2_aes]] = { {{.*}} "target-features"="+aes,+fp-armv8,+fullfp16,+neon,+outline-atomics,+sve,+sve-aes,+sve2,+sve2-aes,+v8a"
+// CHECK: attributes #[[sve2_bitperm]] = { {{.*}} "target-features"="+fp-armv8,+fullfp16,+neon,+outline-atomics,+sve,+sve2,+sve2-bitperm,+v8a"
+// CHECK: attributes #[[sve2_sha3]] = { {{.*}} "target-features"="+fp-armv8,+fullfp16,+neon,+outline-atomics,+sha2,+sha3,+sve,+sve2,+sve2-sha3,+v8a"
+// CHECK: attributes #[[sve2_sm4]] = { {{.*}} "target-features"="+fp-armv8,+fullfp16,+neon,+outline-atomics,+sm4,+sve,+sve2,+sve2-sm4,+v8a"
+// CHECK: attributes #[[wfxt]] = { {{.*}} "target-features"="+fp-armv8,+neon,+outline-atomics,+v8a,+wfxt"
diff --git a/clang/test/CodeGen/AArch64/fmv-features.c b/clang/test/CodeGen/AArch64/fmv-features.c
new file mode 100644
index 0000000..f78bf4b
--- /dev/null
+++ b/clang/test/CodeGen/AArch64/fmv-features.c
@@ -0,0 +1,200 @@
+// Test all of the AArch64 fmv-features metadata without any dependency expansion.
+// It is used to propagate the attribute string information from C/C++ source to LLVM IR.
+
+// RUN: %clang --target=aarch64-linux-gnu --rtlib=compiler-rt -emit-llvm -S -o - %s | FileCheck %s
+
+// CHECK: define dso_local i32 @fmv._Maes() #[[aes:[0-9]+]] {
+// CHECK: define dso_local i32 @fmv._Mbf16() #[[bf16:[0-9]+]] {
+__attribute__((target_clones("aes", "bf16"))) int fmv(void) { return 0; }
+
+// CHECK: define dso_local i32 @fmv._Mbti() #[[bti:[0-9]+]] {
+__attribute__((target_version("bti"))) int fmv(void) { return 0; }
+
+// CHECK: define dso_local i32 @fmv._Mcrc() #[[crc:[0-9]+]] {
+__attribute__((target_version("crc"))) int fmv(void) { return 0; }
+
+// CHECK: define dso_local i32 @fmv._Mdit() #[[dit:[0-9]+]] {
+__attribute__((target_version("dit"))) int fmv(void) { return 0; }
+
+// CHECK: define dso_local i32 @fmv._Mdotprod() #[[dotprod:[0-9]+]] {
+__attribute__((target_version("dotprod"))) int fmv(void) { return 0; }
+
+// CHECK: define dso_local i32 @fmv._Mdpb() #[[dpb:[0-9]+]] {
+__attribute__((target_version("dpb"))) int fmv(void) { return 0; }
+
+// CHECK: define dso_local i32 @fmv._Mdpb2() #[[dpb2:[0-9]+]] {
+__attribute__((target_version("dpb2"))) int fmv(void) { return 0; }
+
+// CHECK: define dso_local i32 @fmv._Mf32mm() #[[f32mm:[0-9]+]] {
+__attribute__((target_version("f32mm"))) int fmv(void) { return 0; }
+
+// CHECK: define dso_local i32 @fmv._Mf64mm() #[[f64mm:[0-9]+]] {
+__attribute__((target_version("f64mm"))) int fmv(void) { return 0; }
+
+// CHECK: define dso_local i32 @fmv._Mfcma() #[[fcma:[0-9]+]] {
+__attribute__((target_version("fcma"))) int fmv(void) { return 0; }
+
+// CHECK: define dso_local i32 @fmv._Mflagm() #[[flagm:[0-9]+]] {
+__attribute__((target_version("flagm"))) int fmv(void) { return 0; }
+
+// CHECK: define dso_local i32 @fmv._Mflagm2() #[[flagm2:[0-9]+]] {
+__attribute__((target_version("flagm2"))) int fmv(void) { return 0; }
+
+// CHECK: define dso_local i32 @fmv._Mfp() #[[fp:[0-9]+]] {
+__attribute__((target_version("fp"))) int fmv(void) { return 0; }
+
+// CHECK: define dso_local i32 @fmv._Mfp16() #[[fp16:[0-9]+]] {
+__attribute__((target_version("fp16"))) int fmv(void) { return 0; }
+
+// CHECK: define dso_local i32 @fmv._Mfp16fml() #[[fp16fml:[0-9]+]] {
+__attribute__((target_version("fp16fml"))) int fmv(void) { return 0; }
+
+// CHECK: define dso_local i32 @fmv._Mfrintts() #[[frintts:[0-9]+]] {
+__attribute__((target_version("frintts"))) int fmv(void) { return 0; }
+
+// CHECK: define dso_local i32 @fmv._Mi8mm() #[[i8mm:[0-9]+]] {
+__attribute__((target_version("i8mm"))) int fmv(void) { return 0; }
+
+// CHECK: define dso_local i32 @fmv._Mjscvt() #[[jscvt:[0-9]+]] {
+__attribute__((target_version("jscvt"))) int fmv(void) { return 0; }
+
+// CHECK: define dso_local i32 @fmv._Mls64() #[[ls64:[0-9]+]] {
+__attribute__((target_version("ls64"))) int fmv(void) { return 0; }
+
+// CHECK: define dso_local i32 @fmv._Mlse() #[[lse:[0-9]+]] {
+__attribute__((target_version("lse"))) int fmv(void) { return 0; }
+
+// CHECK: define dso_local i32 @fmv._Mmemtag() #[[memtag:[0-9]+]] {
+__attribute__((target_version("memtag"))) int fmv(void) { return 0; }
+
+// CHECK: define dso_local i32 @fmv._Mmops() #[[mops:[0-9]+]] {
+__attribute__((target_version("mops"))) int fmv(void) { return 0; }
+
+// CHECK: define dso_local i32 @fmv._Mpredres() #[[predres:[0-9]+]] {
+__attribute__((target_version("predres"))) int fmv(void) { return 0; }
+
+// CHECK: define dso_local i32 @fmv._Mrcpc() #[[rcpc:[0-9]+]] {
+__attribute__((target_version("rcpc"))) int fmv(void) { return 0; }
+
+// CHECK: define dso_local i32 @fmv._Mrcpc2() #[[rcpc2:[0-9]+]] {
+__attribute__((target_version("rcpc2"))) int fmv(void) { return 0; }
+
+// CHECK: define dso_local i32 @fmv._Mrcpc3() #[[rcpc3:[0-9]+]] {
+__attribute__((target_version("rcpc3"))) int fmv(void) { return 0; }
+
+// CHECK: define dso_local i32 @fmv._Mrdm() #[[rdm:[0-9]+]] {
+__attribute__((target_version("rdm"))) int fmv(void) { return 0; }
+
+// CHECK: define dso_local i32 @fmv._Mrng() #[[rng:[0-9]+]] {
+__attribute__((target_version("rng"))) int fmv(void) { return 0; }
+
+// CHECK: define dso_local i32 @fmv._Msb() #[[sb:[0-9]+]] {
+__attribute__((target_version("sb"))) int fmv(void) { return 0; }
+
+// CHECK: define dso_local i32 @fmv._Msha2() #[[sha2:[0-9]+]] {
+__attribute__((target_version("sha2"))) int fmv(void) { return 0; }
+
+// CHECK: define dso_local i32 @fmv._Msha3() #[[sha3:[0-9]+]] {
+__attribute__((target_version("sha3"))) int fmv(void) { return 0; }
+
+// CHECK: define dso_local i32 @fmv._Msimd() #[[simd:[0-9]+]] {
+__attribute__((target_version("simd"))) int fmv(void) { return 0; }
+
+// CHECK: define dso_local i32 @fmv._Msm4() #[[sm4:[0-9]+]] {
+__attribute__((target_version("sm4"))) int fmv(void) { return 0; }
+
+// CHECK: define dso_local i32 @fmv._Msme() #[[sme:[0-9]+]] {
+__attribute__((target_version("sme"))) int fmv(void) { return 0; }
+
+// CHECK: define dso_local i32 @fmv._Msme-f64f64() #[[sme_f64f64:[0-9]+]] {
+__attribute__((target_version("sme-f64f64"))) int fmv(void) { return 0; }
+
+// CHECK: define dso_local i32 @fmv._Msme-i16i64() #[[sme_i16i64:[0-9]+]] {
+__attribute__((target_version("sme-i16i64"))) int fmv(void) { return 0; }
+
+// CHECK: define dso_local i32 @fmv._Msme2() #[[sme2:[0-9]+]] {
+__attribute__((target_version("sme2"))) int fmv(void) { return 0; }
+
+// CHECK: define dso_local i32 @fmv._Mssbs() #[[ssbs:[0-9]+]] {
+__attribute__((target_version("ssbs"))) int fmv(void) { return 0; }
+
+// CHECK: define dso_local i32 @fmv._Msve() #[[sve:[0-9]+]] {
+__attribute__((target_version("sve"))) int fmv(void) { return 0; }
+
+// CHECK: define dso_local i32 @fmv._Msve2() #[[sve2:[0-9]+]] {
+__attribute__((target_version("sve2"))) int fmv(void) { return 0; }
+
+// CHECK: define dso_local i32 @fmv._Msve2-aes() #[[sve2_aes:[0-9]+]] {
+__attribute__((target_version("sve2-aes"))) int fmv(void) { return 0; }
+
+// CHECK: define dso_local i32 @fmv._Msve2-bitperm() #[[sve2_bitperm:[0-9]+]] {
+__attribute__((target_version("sve2-bitperm"))) int fmv(void) { return 0; }
+
+// CHECK: define dso_local i32 @fmv._Msve2-sha3() #[[sve2_sha3:[0-9]+]] {
+__attribute__((target_version("sve2-sha3"))) int fmv(void) { return 0; }
+
+// CHECK: define dso_local i32 @fmv._Msve2-sm4() #[[sve2_sm4:[0-9]+]] {
+__attribute__((target_version("sve2-sm4"))) int fmv(void) { return 0; }
+
+// CHECK: define dso_local i32 @fmv._Mwfxt() #[[wfxt:[0-9]+]] {
+__attribute__((target_version("wfxt"))) int fmv(void) { return 0; }
+
+// CHECK: define dso_local i32 @fmv._MaesMbf16MbtiMcrc() #[[multiple_features:[0-9]+]] {
+__attribute__((target_version("aes+bf16+bti+crc"))) int fmv(void) { return 0; }
+
+// CHECK-NOT: define dso_local i32 @fmv._M{{.*}}
+__attribute__((target_version("non_existent_extension"))) int fmv(void);
+
+__attribute__((target_version("default"))) int fmv(void);
+
+int caller() {
+  return fmv();
+}
+
+// CHECK: attributes #[[aes]] = { {{.*}} "fmv-features"="+aes"
+// CHECK: attributes #[[bf16]] = { {{.*}} "fmv-features"="+bf16"
+// CHECK: attributes #[[bti]] = { {{.*}} "fmv-features"="+bti"
+// CHECK: attributes #[[crc]] = { {{.*}} "fmv-features"="+crc"
+// CHECK: attributes #[[dit]] = { {{.*}} "fmv-features"="+dit"
+// CHECK: attributes #[[dotprod]] = { {{.*}} "fmv-features"="+dotprod"
+// CHECK: attributes #[[dpb]] = { {{.*}} "fmv-features"="+dpb"
+// CHECK: attributes #[[dpb2]] = { {{.*}} "fmv-features"="+dpb2"
+// CHECK: attributes #[[f32mm]] = { {{.*}} "fmv-features"="+f32mm"
+// CHECK: attributes #[[f64mm]] = { {{.*}} "fmv-features"="+f64mm"
+// CHECK: attributes #[[fcma]] = { {{.*}} "fmv-features"="+fcma"
+// CHECK: attributes #[[flagm]] = { {{.*}} "fmv-features"="+flagm"
+// CHECK: attributes #[[flagm2]] = { {{.*}} "fmv-features"="+flagm2"
+// CHECK: attributes #[[fp]] = { {{.*}} "fmv-features"="+fp"
+// CHECK: attributes #[[fp16]] = { {{.*}} "fmv-features"="+fp16"
+// CHECK: attributes #[[fp16fml]] = { {{.*}} "fmv-features"="+fp16fml"
+// CHECK: attributes #[[frintts]] = { {{.*}} "fmv-features"="+frintts"
+// CHECK: attributes #[[i8mm]] = { {{.*}} "fmv-features"="+i8mm"
+// CHECK: attributes #[[jscvt]] = { {{.*}} "fmv-features"="+jscvt"
+// CHECK: attributes #[[ls64]] = { {{.*}} "fmv-features"="+ls64"
+// CHECK: attributes #[[lse]] = { {{.*}} "fmv-features"="+lse"
+// CHECK: attributes #[[memtag]] = { {{.*}} "fmv-features"="+memtag"
+// CHECK: attributes #[[mops]] = { {{.*}} "fmv-features"="+mops"
+// CHECK: attributes #[[predres]] = { {{.*}} "fmv-features"="+predres"
+// CHECK: attributes #[[rcpc]] = { {{.*}} "fmv-features"="+rcpc"
+// CHECK: attributes #[[rcpc2]] = { {{.*}} "fmv-features"="+rcpc2"
+// CHECK: attributes #[[rcpc3]] = { {{.*}} "fmv-features"="+rcpc3"
+// CHECK: attributes #[[rdm]] = { {{.*}} "fmv-features"="+rdm"
+// CHECK: attributes #[[rng]] = { {{.*}} "fmv-features"="+rng"
+// CHECK: attributes #[[sb]] = { {{.*}} "fmv-features"="+sb"
+// CHECK: attributes #[[sha2]] = { {{.*}} "fmv-features"="+sha2"
+// CHECK: attributes #[[sha3]] = { {{.*}} "fmv-features"="+sha3"
+// CHECK: attributes #[[simd]] = { {{.*}} "fmv-features"="+simd"
+// CHECK: attributes #[[sm4]] = { {{.*}} "fmv-features"="+sm4"
+// CHECK: attributes #[[sme]] = { {{.*}} "fmv-features"="+sme"
+// CHECK: attributes #[[sme_f64f64]] = { {{.*}} "fmv-features"="+sme-f64f64"
+// CHECK: attributes #[[sme_i16i64]] = { {{.*}} "fmv-features"="+sme-i16i64"
+// CHECK: attributes #[[sme2]] = { {{.*}} "fmv-features"="+sme2"
+// CHECK: attributes #[[ssbs]] = { {{.*}} "fmv-features"="+ssbs"
+// CHECK: attributes #[[sve]] = { {{.*}} "fmv-features"="+sve"
+// CHECK: attributes #[[sve2]] = { {{.*}} "fmv-features"="+sve2"
+// CHECK: attributes #[[sve2_aes]] = { {{.*}} "fmv-features"="+sve2-aes"
+// CHECK: attributes #[[sve2_bitperm]] = { {{.*}} "fmv-features"="+sve2-bitperm"
+// CHECK: attributes #[[sve2_sha3]] = { {{.*}} "fmv-features"="+sve2-sha3"
+// CHECK: attributes #[[sve2_sm4]] = { {{.*}} "fmv-features"="+sve2-sm4"
+// CHECK: attributes #[[wfxt]] = { {{.*}} "fmv-features"="+wfxt"
+// CHECK: attributes #[[multiple_features]] = { {{.*}} "fmv-features"="+aes,+bf16,+bti,+crc"
diff --git a/clang/test/CodeGen/AArch64/fmv-priority.c b/clang/test/CodeGen/AArch64/fmv-priority.c
new file mode 100644
index 0000000..080bb547
--- /dev/null
+++ b/clang/test/CodeGen/AArch64/fmv-priority.c
@@ -0,0 +1,55 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --include-generated-funcs --version 5
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -emit-llvm -o - %s | FileCheck %s
+
+// Priority biskmasks after feature dependency expansion:
+//
+// MSB                                                    LSB
+//
+// sme2 | ls64 | sme | bf16 |       |      | fp16 | simd | fp
+// -----+------+-----+------+-------+------+------+------+---
+// sme2 |      | sme | bf16 | rcpc2 | rcpc | fp16 | simd | fp
+//
+// Dependencies should not affect priorities, since a
+// feature can only depend on lower priority features:
+// https://github.com/ARM-software/acle/pull/376
+
+__attribute__((target_version("sme2+ls64"))) int fn(void);
+__attribute__((target_version("sme2+rcpc2"))) int fn(void);
+__attribute__((target_version("default"))) int fn(void) { return 0; }
+
+int call() { return fn(); }
+
+// CHECK-LABEL: define dso_local i32 @fn.default(
+// CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    ret i32 0
+//
+//
+// CHECK-LABEL: define dso_local i32 @call(
+// CHECK-SAME: ) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[CALL:%.*]] = call i32 @fn()
+// CHECK-NEXT:    ret i32 [[CALL]]
+//
+//
+// CHECK-LABEL: define weak_odr ptr @fn.resolver() comdat {
+// CHECK-NEXT:  [[RESOLVER_ENTRY:.*:]]
+// CHECK-NEXT:    call void @__init_cpu_features_resolver()
+// CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
+// CHECK-NEXT:    [[TMP1:%.*]] = and i64 [[TMP0]], 153126785511392000
+// CHECK-NEXT:    [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 153126785511392000
+// CHECK-NEXT:    [[TMP3:%.*]] = and i1 true, [[TMP2]]
+// CHECK-NEXT:    br i1 [[TMP3]], label %[[RESOLVER_RETURN:.*]], label %[[RESOLVER_ELSE:.*]]
+// CHECK:       [[RESOLVER_RETURN]]:
+// CHECK-NEXT:    ret ptr @fn._Mls64Msme2
+// CHECK:       [[RESOLVER_ELSE]]:
+// CHECK-NEXT:    [[TMP4:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
+// CHECK-NEXT:    [[TMP5:%.*]] = and i64 [[TMP4]], 144119586269233920
+// CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[TMP5]], 144119586269233920
+// CHECK-NEXT:    [[TMP7:%.*]] = and i1 true, [[TMP6]]
+// CHECK-NEXT:    br i1 [[TMP7]], label %[[RESOLVER_RETURN1:.*]], label %[[RESOLVER_ELSE2:.*]]
+// CHECK:       [[RESOLVER_RETURN1]]:
+// CHECK-NEXT:    ret ptr @fn._Mrcpc2Msme2
+// CHECK:       [[RESOLVER_ELSE2]]:
+// CHECK-NEXT:    ret ptr @fn.default
+//
diff --git a/clang/test/CodeGen/AArch64/fmv-streaming.c b/clang/test/CodeGen/AArch64/fmv-streaming.c
index dc0c35a..68ba3e5 100644
--- a/clang/test/CodeGen/AArch64/fmv-streaming.c
+++ b/clang/test/CodeGen/AArch64/fmv-streaming.c
@@ -53,10 +53,10 @@ __attribute__((target_version("default"))) void sc_callee(void) __arm_streaming_
 
 
 // CHECK-LABEL: define {{[^@]+}}@n_caller
-// CHECK-SAME: () #[[caller:[0-9]+]] {
+// CHECK-SAME: () #[[default]] {
 // CHECK:    call void @n_callee()
-// CHECK:    call void @s_callee() #[[callsite_streaming:[0-9]+]]
-// CHECK:    call void @sc_callee() #[[callsite_streaming_compatible:[0-9]+]]
+// CHECK:    call void @s_callee() #[[streaming:[0-9]+]]
+// CHECK:    call void @sc_callee() #[[streaming_compatible:[0-9]+]]
 //
 void n_caller(void) {
   n_callee();
@@ -66,10 +66,10 @@ void n_caller(void) {
 
 
 // CHECK-LABEL: define {{[^@]+}}@s_caller
-// CHECK-SAME: () #[[caller_streaming:[0-9]+]] {
+// CHECK-SAME: () #[[default_streaming]] {
 // CHECK:    call void @n_callee()
-// CHECK:    call void @s_callee() #[[callsite_streaming]]
-// CHECK:    call void @sc_callee() #[[callsite_streaming_compatible]]
+// CHECK:    call void @s_callee() #[[streaming]]
+// CHECK:    call void @sc_callee() #[[streaming_compatible]]
 //
 void s_caller(void) __arm_streaming {
   n_callee();
@@ -79,10 +79,10 @@ void s_caller(void) __arm_streaming {
 
 
 // CHECK-LABEL: define {{[^@]+}}@sc_caller
-// CHECK-SAME: () #[[caller_streaming_compatible:[0-9]+]] {
+// CHECK-SAME: () #[[default_streaming_compatible]] {
 // CHECK:    call void @n_callee()
-// CHECK:    call void @s_callee() #[[callsite_streaming]]
-// CHECK:    call void @sc_callee() #[[callsite_streaming_compatible]]
+// CHECK:    call void @s_callee() #[[streaming]]
+// CHECK:    call void @sc_callee() #[[streaming_compatible]]
 //
 void sc_caller(void) __arm_streaming_compatible {
   n_callee();
@@ -103,8 +103,5 @@ void sc_caller(void) __arm_streaming_compatible {
 // CHECK: attributes #[[simd_streaming_compatible]] = {{.*}} "aarch64_pstate_sm_compatible"
 // CHECK: attributes #[[locally_streaming_sme2_streaming_compatible]] = {{.*}} "aarch64_pstate_sm_body" "aarch64_pstate_sm_compatible"
 // CHECK: attributes #[[default_streaming_compatible]] = {{.*}} "aarch64_pstate_sm_compatible"
-// CHECK: attributes #[[caller]] = {{.*}}
-// CHECK: attributes #[[caller_streaming]] = {{.*}} "aarch64_pstate_sm_enabled"
-// CHECK: attributes #[[caller_streaming_compatible]] = {{.*}} "aarch64_pstate_sm_compatible"
-// CHECK: attributes #[[callsite_streaming]] = {{.*}} "aarch64_pstate_sm_enabled"
-// CHECK: attributes #[[callsite_streaming_compatible]] = {{.*}} "aarch64_pstate_sm_compatible"
+// CHECK: attributes #[[streaming]] = {{.*}} "aarch64_pstate_sm_enabled"
+// CHECK: attributes #[[streaming_compatible]] = {{.*}} "aarch64_pstate_sm_compatible"
diff --git a/clang/test/CodeGen/AArch64/fpm-helpers.c b/clang/test/CodeGen/AArch64/fpm-helpers.c
index 4bced01..6264b5c 100644
--- a/clang/test/CodeGen/AArch64/fpm-helpers.c
+++ b/clang/test/CodeGen/AArch64/fpm-helpers.c
@@ -35,7 +35,7 @@ extern "C" {
 //
 fpm_t test_init() { return __arm_fpm_init(); }
 
-// CHECK-LABEL: define dso_local noundef i64 @test_src1_1(
+// CHECK-LABEL: define dso_local noundef range(i64 0, -6) i64 @test_src1_1(
 // CHECK-SAME: ) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    ret i64 -8
@@ -44,7 +44,7 @@ fpm_t test_src1_1() {
   return __arm_set_fpm_src1_format(INIT_ONES, __ARM_FPM_E5M2);
 }
 
-// CHECK-LABEL: define dso_local noundef i64 @test_src1_2(
+// CHECK-LABEL: define dso_local noundef range(i64 0, -6) i64 @test_src1_2(
 // CHECK-SAME: ) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    ret i64 1
@@ -53,7 +53,7 @@ fpm_t test_src1_2() {
   return __arm_set_fpm_src1_format(INIT_ZERO, __ARM_FPM_E4M3);
 }
 
-// CHECK-LABEL: define dso_local noundef i64 @test_src2_1(
+// CHECK-LABEL: define dso_local noundef range(i64 0, -48) i64 @test_src2_1(
 // CHECK-SAME: ) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    ret i64 -57
@@ -62,7 +62,7 @@ fpm_t test_src2_1() {
   return __arm_set_fpm_src2_format(INIT_ONES, __ARM_FPM_E5M2);
 }
 
-// CHECK-LABEL: define dso_local noundef i64 @test_src2_2(
+// CHECK-LABEL: define dso_local noundef range(i64 0, -48) i64 @test_src2_2(
 // CHECK-SAME: ) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    ret i64 8
@@ -71,7 +71,7 @@ fpm_t test_src2_2() {
   return __arm_set_fpm_src2_format(INIT_ZERO, __ARM_FPM_E4M3);
 }
 
-// CHECK-LABEL: define dso_local noundef i64 @test_dst1_1(
+// CHECK-LABEL: define dso_local noundef range(i64 0, -384) i64 @test_dst1_1(
 // CHECK-SAME: ) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    ret i64 -449
@@ -80,7 +80,7 @@ fpm_t test_dst1_1() {
   return __arm_set_fpm_dst_format(INIT_ONES, __ARM_FPM_E5M2);
 }
 
-// CHECK-LABEL: define dso_local noundef i64 @test_dst2_2(
+// CHECK-LABEL: define dso_local noundef range(i64 0, -384) i64 @test_dst2_2(
 // CHECK-SAME: ) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    ret i64 64
@@ -139,21 +139,21 @@ fpm_t test_lscale() { return __arm_set_fpm_lscale(INIT_ZERO, 127); }
 //
 fpm_t test_lscale2() { return __arm_set_fpm_lscale2(INIT_ZERO, 63); }
 
-// CHECK-LABEL: define dso_local noundef range(i64 0, 4294967296) i64 @test_nscale_1(
+// CHECK-LABEL: define dso_local noundef range(i64 0, 4278190081) i64 @test_nscale_1(
 // CHECK-SAME: ) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    ret i64 2147483648
 //
 fpm_t test_nscale_1() { return __arm_set_fpm_nscale(INIT_ZERO, -128); }
 
-// CHECK-LABEL: define dso_local noundef range(i64 0, 4294967296) i64 @test_nscale_2(
+// CHECK-LABEL: define dso_local noundef range(i64 0, 4278190081) i64 @test_nscale_2(
 // CHECK-SAME: ) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    ret i64 2130706432
 //
 fpm_t test_nscale_2() { return __arm_set_fpm_nscale(INIT_ZERO, 127); }
 
-// CHECK-LABEL: define dso_local noundef range(i64 0, 4294967296) i64 @test_nscale_3(
+// CHECK-LABEL: define dso_local noundef range(i64 0, 4278190081) i64 @test_nscale_3(
 // CHECK-SAME: ) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    ret i64 4278190080
diff --git a/clang/test/CodeGen/AArch64/neon-vcmla.c b/clang/test/CodeGen/AArch64/neon-vcmla.c
index 0217152..d860411 100644
--- a/clang/test/CodeGen/AArch64/neon-vcmla.c
+++ b/clang/test/CodeGen/AArch64/neon-vcmla.c
@@ -1,444 +1,913 @@
-// RUN: %clang_cc1 -triple arm64-apple-ios -target-feature +neon \
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
+// RUN: %clang_cc1 -triple arm64 -target-feature +neon \
 // RUN:        -target-feature +v8.3a \
 // RUN:        -target-feature +fullfp16 \
-// RUN:        -disable-O0-optnone -emit-llvm -o - %s | opt -S -O1 | FileCheck %s
+// RUN:        -disable-O0-optnone -emit-llvm -o - %s | opt -S -passes="mem2reg,instsimplify" | FileCheck %s
 
 // REQUIRES: aarch64-registered-target
 
 #include <arm_neon.h>
 
-// CHECK-LABEL: @test_vcmla_f16(
-// CHECK: [[RES:%.*]] = tail call <4 x half> @llvm.aarch64.neon.vcmla.rot0.v4f16(<4 x half> %acc, <4 x half> %lhs, <4 x half> %rhs)
-// CHECK: ret <4 x half> [[RES]]
+// CHECK-LABEL: define dso_local <4 x half> @test_vcmla_f16(
+// CHECK-SAME: <4 x half> noundef [[ACC:%.*]], <4 x half> noundef [[LHS:%.*]], <4 x half> noundef [[RHS:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[VCMLA_F163_I:%.*]] = call <4 x half> @llvm.aarch64.neon.vcmla.rot0.v4f16(<4 x half> [[ACC]], <4 x half> [[LHS]], <4 x half> [[RHS]])
+// CHECK-NEXT:    ret <4 x half> [[VCMLA_F163_I]]
+//
 float16x4_t test_vcmla_f16(float16x4_t acc, float16x4_t lhs, float16x4_t rhs) {
   return vcmla_f16(acc, lhs, rhs);
 }
 
-// CHECK-LABEL: @test_vcmla_f32(
-// CHECK: [[RES:%.*]] = tail call <2 x float> @llvm.aarch64.neon.vcmla.rot0.v2f32(<2 x float> %acc, <2 x float> %lhs, <2 x float> %rhs)
-// CHECK: ret <2 x float> [[RES]]
+// CHECK-LABEL: define dso_local <2 x float> @test_vcmla_f32(
+// CHECK-SAME: <2 x float> noundef [[ACC:%.*]], <2 x float> noundef [[LHS:%.*]], <2 x float> noundef [[RHS:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[VCMLA_F323_I:%.*]] = call <2 x float> @llvm.aarch64.neon.vcmla.rot0.v2f32(<2 x float> [[ACC]], <2 x float> [[LHS]], <2 x float> [[RHS]])
+// CHECK-NEXT:    ret <2 x float> [[VCMLA_F323_I]]
+//
 float32x2_t test_vcmla_f32(float32x2_t acc, float32x2_t lhs, float32x2_t rhs) {
   return vcmla_f32(acc, lhs, rhs);
 }
 
-// CHECK-LABEL: @test_vcmlaq_f16(
-// CHECK: [[RES:%.*]] = tail call <8 x half> @llvm.aarch64.neon.vcmla.rot0.v8f16(<8 x half> %acc, <8 x half> %lhs, <8 x half> %rhs)
-// CHECK: ret <8 x half> [[RES]]
+// CHECK-LABEL: define dso_local <8 x half> @test_vcmlaq_f16(
+// CHECK-SAME: <8 x half> noundef [[ACC:%.*]], <8 x half> noundef [[LHS:%.*]], <8 x half> noundef [[RHS:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[VCMLAQ_F163_I:%.*]] = call <8 x half> @llvm.aarch64.neon.vcmla.rot0.v8f16(<8 x half> [[ACC]], <8 x half> [[LHS]], <8 x half> [[RHS]])
+// CHECK-NEXT:    ret <8 x half> [[VCMLAQ_F163_I]]
+//
 float16x8_t test_vcmlaq_f16(float16x8_t acc, float16x8_t lhs, float16x8_t rhs) {
   return vcmlaq_f16(acc, lhs, rhs);
 }
 
-// CHECK-LABEL: @test_vcmlaq_f32(
-// CHECK: [[RES:%.*]] = tail call <4 x float> @llvm.aarch64.neon.vcmla.rot0.v4f32(<4 x float> %acc, <4 x float> %lhs, <4 x float> %rhs)
-// CHECK: ret <4 x float> [[RES]]
+// CHECK-LABEL: define dso_local <4 x float> @test_vcmlaq_f32(
+// CHECK-SAME: <4 x float> noundef [[ACC:%.*]], <4 x float> noundef [[LHS:%.*]], <4 x float> noundef [[RHS:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[VCMLAQ_F323_I:%.*]] = call <4 x float> @llvm.aarch64.neon.vcmla.rot0.v4f32(<4 x float> [[ACC]], <4 x float> [[LHS]], <4 x float> [[RHS]])
+// CHECK-NEXT:    ret <4 x float> [[VCMLAQ_F323_I]]
+//
 float32x4_t test_vcmlaq_f32(float32x4_t acc, float32x4_t lhs, float32x4_t rhs) {
   return vcmlaq_f32(acc, lhs, rhs);
 }
 
-// CHECK-LABEL: @test_vcmlaq_f64(
-// CHECK: [[RES:%.*]] = tail call <2 x double> @llvm.aarch64.neon.vcmla.rot0.v2f64(<2 x double> %acc, <2 x double> %lhs, <2 x double> %rhs)
-// CHECK: ret <2 x double> [[RES]]
+// CHECK-LABEL: define dso_local <2 x double> @test_vcmlaq_f64(
+// CHECK-SAME: <2 x double> noundef [[ACC:%.*]], <2 x double> noundef [[LHS:%.*]], <2 x double> noundef [[RHS:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[VCMLAQ_F643_I:%.*]] = call <2 x double> @llvm.aarch64.neon.vcmla.rot0.v2f64(<2 x double> [[ACC]], <2 x double> [[LHS]], <2 x double> [[RHS]])
+// CHECK-NEXT:    ret <2 x double> [[VCMLAQ_F643_I]]
+//
 float64x2_t test_vcmlaq_f64(float64x2_t acc, float64x2_t lhs, float64x2_t rhs) {
   return vcmlaq_f64(acc, lhs, rhs);
 }
 
-// CHECK-LABEL: @test_vcmla_rot90_f16(
-// CHECK: [[RES:%.*]] = tail call <4 x half> @llvm.aarch64.neon.vcmla.rot90.v4f16(<4 x half> %acc, <4 x half> %lhs, <4 x half> %rhs)
-// CHECK: ret <4 x half> [[RES]]
+// CHECK-LABEL: define dso_local <4 x half> @test_vcmla_rot90_f16(
+// CHECK-SAME: <4 x half> noundef [[ACC:%.*]], <4 x half> noundef [[LHS:%.*]], <4 x half> noundef [[RHS:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[VCMLA_ROT90_F163_I:%.*]] = call <4 x half> @llvm.aarch64.neon.vcmla.rot90.v4f16(<4 x half> [[ACC]], <4 x half> [[LHS]], <4 x half> [[RHS]])
+// CHECK-NEXT:    ret <4 x half> [[VCMLA_ROT90_F163_I]]
+//
 float16x4_t test_vcmla_rot90_f16(float16x4_t acc, float16x4_t lhs, float16x4_t rhs) {
   return vcmla_rot90_f16(acc, lhs, rhs);
 }
 
-// CHECK-LABEL: @test_vcmla_rot90_f32(
-// CHECK: [[RES:%.*]] = tail call <2 x float> @llvm.aarch64.neon.vcmla.rot90.v2f32(<2 x float> %acc, <2 x float> %lhs, <2 x float> %rhs)
-// CHECK: ret <2 x float> [[RES]]
+// CHECK-LABEL: define dso_local <2 x float> @test_vcmla_rot90_f32(
+// CHECK-SAME: <2 x float> noundef [[ACC:%.*]], <2 x float> noundef [[LHS:%.*]], <2 x float> noundef [[RHS:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[VCMLA_ROT90_F323_I:%.*]] = call <2 x float> @llvm.aarch64.neon.vcmla.rot90.v2f32(<2 x float> [[ACC]], <2 x float> [[LHS]], <2 x float> [[RHS]])
+// CHECK-NEXT:    ret <2 x float> [[VCMLA_ROT90_F323_I]]
+//
 float32x2_t test_vcmla_rot90_f32(float32x2_t acc, float32x2_t lhs, float32x2_t rhs) {
   return vcmla_rot90_f32(acc, lhs, rhs);
 }
 
-// CHECK-LABEL: @test_vcmlaq_rot90_f16(
-// CHECK: [[RES:%.*]] = tail call <8 x half> @llvm.aarch64.neon.vcmla.rot90.v8f16(<8 x half> %acc, <8 x half> %lhs, <8 x half> %rhs)
-// CHECK: ret <8 x half> [[RES]]
+// CHECK-LABEL: define dso_local <8 x half> @test_vcmlaq_rot90_f16(
+// CHECK-SAME: <8 x half> noundef [[ACC:%.*]], <8 x half> noundef [[LHS:%.*]], <8 x half> noundef [[RHS:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[VCMLAQ_ROT90_F163_I:%.*]] = call <8 x half> @llvm.aarch64.neon.vcmla.rot90.v8f16(<8 x half> [[ACC]], <8 x half> [[LHS]], <8 x half> [[RHS]])
+// CHECK-NEXT:    ret <8 x half> [[VCMLAQ_ROT90_F163_I]]
+//
 float16x8_t test_vcmlaq_rot90_f16(float16x8_t acc, float16x8_t lhs, float16x8_t rhs) {
   return vcmlaq_rot90_f16(acc, lhs, rhs);
 }
 
-// CHECK-LABEL: @test_vcmlaq_rot90_f32(
-// CHECK: [[RES:%.*]] = tail call <4 x float> @llvm.aarch64.neon.vcmla.rot90.v4f32(<4 x float> %acc, <4 x float> %lhs, <4 x float> %rhs)
-// CHECK: ret <4 x float> [[RES]]
+// CHECK-LABEL: define dso_local <4 x float> @test_vcmlaq_rot90_f32(
+// CHECK-SAME: <4 x float> noundef [[ACC:%.*]], <4 x float> noundef [[LHS:%.*]], <4 x float> noundef [[RHS:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[VCMLAQ_ROT90_F323_I:%.*]] = call <4 x float> @llvm.aarch64.neon.vcmla.rot90.v4f32(<4 x float> [[ACC]], <4 x float> [[LHS]], <4 x float> [[RHS]])
+// CHECK-NEXT:    ret <4 x float> [[VCMLAQ_ROT90_F323_I]]
+//
 float32x4_t test_vcmlaq_rot90_f32(float32x4_t acc, float32x4_t lhs, float32x4_t rhs) {
   return vcmlaq_rot90_f32(acc, lhs, rhs);
 }
 
-// CHECK-LABEL: @test_vcmlaq_rot90_f64(
-// CHECK: [[RES:%.*]] = tail call <2 x double> @llvm.aarch64.neon.vcmla.rot90.v2f64(<2 x double> %acc, <2 x double> %lhs, <2 x double> %rhs)
-// CHECK: ret <2 x double> [[RES]]
+// CHECK-LABEL: define dso_local <2 x double> @test_vcmlaq_rot90_f64(
+// CHECK-SAME: <2 x double> noundef [[ACC:%.*]], <2 x double> noundef [[LHS:%.*]], <2 x double> noundef [[RHS:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[VCMLAQ_ROT90_F643_I:%.*]] = call <2 x double> @llvm.aarch64.neon.vcmla.rot90.v2f64(<2 x double> [[ACC]], <2 x double> [[LHS]], <2 x double> [[RHS]])
+// CHECK-NEXT:    ret <2 x double> [[VCMLAQ_ROT90_F643_I]]
+//
 float64x2_t test_vcmlaq_rot90_f64(float64x2_t acc, float64x2_t lhs, float64x2_t rhs) {
   return vcmlaq_rot90_f64(acc, lhs, rhs);
 }
 
-// CHECK-LABEL: @test_vcmla_rot180_f16(
-// CHECK: [[RES:%.*]] = tail call <4 x half> @llvm.aarch64.neon.vcmla.rot180.v4f16(<4 x half> %acc, <4 x half> %lhs, <4 x half> %rhs)
-// CHECK: ret <4 x half> [[RES]]
+// CHECK-LABEL: define dso_local <4 x half> @test_vcmla_rot180_f16(
+// CHECK-SAME: <4 x half> noundef [[ACC:%.*]], <4 x half> noundef [[LHS:%.*]], <4 x half> noundef [[RHS:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[VCMLA_ROT180_F163_I:%.*]] = call <4 x half> @llvm.aarch64.neon.vcmla.rot180.v4f16(<4 x half> [[ACC]], <4 x half> [[LHS]], <4 x half> [[RHS]])
+// CHECK-NEXT:    ret <4 x half> [[VCMLA_ROT180_F163_I]]
+//
 float16x4_t test_vcmla_rot180_f16(float16x4_t acc, float16x4_t lhs, float16x4_t rhs) {
   return vcmla_rot180_f16(acc, lhs, rhs);
 }
 
-// CHECK-LABEL: @test_vcmla_rot180_f32(
-// CHECK: [[RES:%.*]] = tail call <2 x float> @llvm.aarch64.neon.vcmla.rot180.v2f32(<2 x float> %acc, <2 x float> %lhs, <2 x float> %rhs)
-// CHECK: ret <2 x float> [[RES]]
+// CHECK-LABEL: define dso_local <2 x float> @test_vcmla_rot180_f32(
+// CHECK-SAME: <2 x float> noundef [[ACC:%.*]], <2 x float> noundef [[LHS:%.*]], <2 x float> noundef [[RHS:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[VCMLA_ROT180_F323_I:%.*]] = call <2 x float> @llvm.aarch64.neon.vcmla.rot180.v2f32(<2 x float> [[ACC]], <2 x float> [[LHS]], <2 x float> [[RHS]])
+// CHECK-NEXT:    ret <2 x float> [[VCMLA_ROT180_F323_I]]
+//
 float32x2_t test_vcmla_rot180_f32(float32x2_t acc, float32x2_t lhs, float32x2_t rhs) {
   return vcmla_rot180_f32(acc, lhs, rhs);
 }
 
-// CHECK-LABEL: @test_vcmlaq_rot180_f16(
-// CHECK: [[RES:%.*]] = tail call <8 x half> @llvm.aarch64.neon.vcmla.rot180.v8f16(<8 x half> %acc, <8 x half> %lhs, <8 x half> %rhs)
-// CHECK: ret <8 x half> [[RES]]
+// CHECK-LABEL: define dso_local <8 x half> @test_vcmlaq_rot180_f16(
+// CHECK-SAME: <8 x half> noundef [[ACC:%.*]], <8 x half> noundef [[LHS:%.*]], <8 x half> noundef [[RHS:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[VCMLAQ_ROT180_F163_I:%.*]] = call <8 x half> @llvm.aarch64.neon.vcmla.rot180.v8f16(<8 x half> [[ACC]], <8 x half> [[LHS]], <8 x half> [[RHS]])
+// CHECK-NEXT:    ret <8 x half> [[VCMLAQ_ROT180_F163_I]]
+//
 float16x8_t test_vcmlaq_rot180_f16(float16x8_t acc, float16x8_t lhs, float16x8_t rhs) {
   return vcmlaq_rot180_f16(acc, lhs, rhs);
 }
 
-// CHECK-LABEL: @test_vcmlaq_rot180_f32(
-// CHECK: [[RES:%.*]] = tail call <4 x float> @llvm.aarch64.neon.vcmla.rot180.v4f32(<4 x float> %acc, <4 x float> %lhs, <4 x float> %rhs)
-// CHECK: ret <4 x float> [[RES]]
+// CHECK-LABEL: define dso_local <4 x float> @test_vcmlaq_rot180_f32(
+// CHECK-SAME: <4 x float> noundef [[ACC:%.*]], <4 x float> noundef [[LHS:%.*]], <4 x float> noundef [[RHS:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[VCMLAQ_ROT180_F323_I:%.*]] = call <4 x float> @llvm.aarch64.neon.vcmla.rot180.v4f32(<4 x float> [[ACC]], <4 x float> [[LHS]], <4 x float> [[RHS]])
+// CHECK-NEXT:    ret <4 x float> [[VCMLAQ_ROT180_F323_I]]
+//
 float32x4_t test_vcmlaq_rot180_f32(float32x4_t acc, float32x4_t lhs, float32x4_t rhs) {
   return vcmlaq_rot180_f32(acc, lhs, rhs);
 }
 
-// CHECK-LABEL: @test_vcmlaq_rot180_f64(
-// CHECK: [[RES:%.*]] = tail call <2 x double> @llvm.aarch64.neon.vcmla.rot180.v2f64(<2 x double> %acc, <2 x double> %lhs, <2 x double> %rhs)
-// CHECK: ret <2 x double> [[RES]]
+// CHECK-LABEL: define dso_local <2 x double> @test_vcmlaq_rot180_f64(
+// CHECK-SAME: <2 x double> noundef [[ACC:%.*]], <2 x double> noundef [[LHS:%.*]], <2 x double> noundef [[RHS:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[VCMLAQ_ROT180_F643_I:%.*]] = call <2 x double> @llvm.aarch64.neon.vcmla.rot180.v2f64(<2 x double> [[ACC]], <2 x double> [[LHS]], <2 x double> [[RHS]])
+// CHECK-NEXT:    ret <2 x double> [[VCMLAQ_ROT180_F643_I]]
+//
 float64x2_t test_vcmlaq_rot180_f64(float64x2_t acc, float64x2_t lhs, float64x2_t rhs) {
   return vcmlaq_rot180_f64(acc, lhs, rhs);
 }
 
-// CHECK-LABEL: @test_vcmla_rot270_f16(
-// CHECK: [[RES:%.*]] = tail call <4 x half> @llvm.aarch64.neon.vcmla.rot270.v4f16(<4 x half> %acc, <4 x half> %lhs, <4 x half> %rhs)
-// CHECK: ret <4 x half> [[RES]]
+// CHECK-LABEL: define dso_local <4 x half> @test_vcmla_rot270_f16(
+// CHECK-SAME: <4 x half> noundef [[ACC:%.*]], <4 x half> noundef [[LHS:%.*]], <4 x half> noundef [[RHS:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[VCMLA_ROT270_F163_I:%.*]] = call <4 x half> @llvm.aarch64.neon.vcmla.rot270.v4f16(<4 x half> [[ACC]], <4 x half> [[LHS]], <4 x half> [[RHS]])
+// CHECK-NEXT:    ret <4 x half> [[VCMLA_ROT270_F163_I]]
+//
 float16x4_t test_vcmla_rot270_f16(float16x4_t acc, float16x4_t lhs, float16x4_t rhs) {
   return vcmla_rot270_f16(acc, lhs, rhs);
 }
 
-// CHECK-LABEL: @test_vcmla_rot270_f32(
-// CHECK: [[RES:%.*]] = tail call <2 x float> @llvm.aarch64.neon.vcmla.rot270.v2f32(<2 x float> %acc, <2 x float> %lhs, <2 x float> %rhs)
-// CHECK: ret <2 x float> [[RES]]
+// CHECK-LABEL: define dso_local <2 x float> @test_vcmla_rot270_f32(
+// CHECK-SAME: <2 x float> noundef [[ACC:%.*]], <2 x float> noundef [[LHS:%.*]], <2 x float> noundef [[RHS:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[VCMLA_ROT270_F323_I:%.*]] = call <2 x float> @llvm.aarch64.neon.vcmla.rot270.v2f32(<2 x float> [[ACC]], <2 x float> [[LHS]], <2 x float> [[RHS]])
+// CHECK-NEXT:    ret <2 x float> [[VCMLA_ROT270_F323_I]]
+//
 float32x2_t test_vcmla_rot270_f32(float32x2_t acc, float32x2_t lhs, float32x2_t rhs) {
   return vcmla_rot270_f32(acc, lhs, rhs);
 }
 
-// CHECK-LABEL: @test_vcmlaq_rot270_f16(
-// CHECK: [[RES:%.*]] = tail call <8 x half> @llvm.aarch64.neon.vcmla.rot270.v8f16(<8 x half> %acc, <8 x half> %lhs, <8 x half> %rhs)
-// CHECK: ret <8 x half> [[RES]]
+// CHECK-LABEL: define dso_local <8 x half> @test_vcmlaq_rot270_f16(
+// CHECK-SAME: <8 x half> noundef [[ACC:%.*]], <8 x half> noundef [[LHS:%.*]], <8 x half> noundef [[RHS:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[VCMLAQ_ROT270_F163_I:%.*]] = call <8 x half> @llvm.aarch64.neon.vcmla.rot270.v8f16(<8 x half> [[ACC]], <8 x half> [[LHS]], <8 x half> [[RHS]])
+// CHECK-NEXT:    ret <8 x half> [[VCMLAQ_ROT270_F163_I]]
+//
 float16x8_t test_vcmlaq_rot270_f16(float16x8_t acc, float16x8_t lhs, float16x8_t rhs) {
   return vcmlaq_rot270_f16(acc, lhs, rhs);
 }
 
-// CHECK-LABEL: @test_vcmlaq_rot270_f32(
-// CHECK: [[RES:%.*]] = tail call <4 x float> @llvm.aarch64.neon.vcmla.rot270.v4f32(<4 x float> %acc, <4 x float> %lhs, <4 x float> %rhs)
-// CHECK: ret <4 x float> [[RES]]
+// CHECK-LABEL: define dso_local <4 x float> @test_vcmlaq_rot270_f32(
+// CHECK-SAME: <4 x float> noundef [[ACC:%.*]], <4 x float> noundef [[LHS:%.*]], <4 x float> noundef [[RHS:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[VCMLAQ_ROT270_F323_I:%.*]] = call <4 x float> @llvm.aarch64.neon.vcmla.rot270.v4f32(<4 x float> [[ACC]], <4 x float> [[LHS]], <4 x float> [[RHS]])
+// CHECK-NEXT:    ret <4 x float> [[VCMLAQ_ROT270_F323_I]]
+//
 float32x4_t test_vcmlaq_rot270_f32(float32x4_t acc, float32x4_t lhs, float32x4_t rhs) {
   return vcmlaq_rot270_f32(acc, lhs, rhs);
 }
 
-// CHECK-LABEL: @test_vcmlaq_rot270_f64(
-// CHECK: [[RES:%.*]] = tail call <2 x double> @llvm.aarch64.neon.vcmla.rot270.v2f64(<2 x double> %acc, <2 x double> %lhs, <2 x double> %rhs)
-// CHECK: ret <2 x double> [[RES]]
+// CHECK-LABEL: define dso_local <2 x double> @test_vcmlaq_rot270_f64(
+// CHECK-SAME: <2 x double> noundef [[ACC:%.*]], <2 x double> noundef [[LHS:%.*]], <2 x double> noundef [[RHS:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[VCMLAQ_ROT270_F643_I:%.*]] = call <2 x double> @llvm.aarch64.neon.vcmla.rot270.v2f64(<2 x double> [[ACC]], <2 x double> [[LHS]], <2 x double> [[RHS]])
+// CHECK-NEXT:    ret <2 x double> [[VCMLAQ_ROT270_F643_I]]
+//
 float64x2_t test_vcmlaq_rot270_f64(float64x2_t acc, float64x2_t lhs, float64x2_t rhs) {
   return vcmlaq_rot270_f64(acc, lhs, rhs);
 }
 
-// CHECK-LABEL: @test_vcmla_lane_f16(
-// CHECK: [[DUP:%.*]] = shufflevector <4 x half> %rhs, <4 x half> poison, <4 x i32> <i32 2, i32 3, i32 2, i32 3>
-// CHECK: [[RES:%.*]] = tail call <4 x half> @llvm.aarch64.neon.vcmla.rot0.v4f16(<4 x half> %acc, <4 x half> %lhs, <4 x half> [[DUP]])
-// CHECK: ret <4 x half> [[RES]]
+// CHECK-LABEL: define dso_local <4 x half> @test_vcmla_lane_f16(
+// CHECK-SAME: <4 x half> noundef [[ACC:%.*]], <4 x half> noundef [[LHS:%.*]], <4 x half> noundef [[RHS:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[__REINT_150:%.*]] = alloca <4 x half>, align 8
+// CHECK-NEXT:    [[__REINT1_150:%.*]] = alloca <2 x i32>, align 8
+// CHECK-NEXT:    store <4 x half> [[RHS]], ptr [[__REINT_150]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = load <2 x i32>, ptr [[__REINT_150]], align 8
+// CHECK-NEXT:    [[VGET_LANE:%.*]] = extractelement <2 x i32> [[TMP0]], i32 1
+// CHECK-NEXT:    [[VECINIT:%.*]] = insertelement <2 x i32> poison, i32 [[VGET_LANE]], i32 0
+// CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i32>, ptr [[__REINT_150]], align 8
+// CHECK-NEXT:    [[VGET_LANE3:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
+// CHECK-NEXT:    [[VECINIT5:%.*]] = insertelement <2 x i32> [[VECINIT]], i32 [[VGET_LANE3]], i32 1
+// CHECK-NEXT:    store <2 x i32> [[VECINIT5]], ptr [[__REINT1_150]], align 8
+// CHECK-NEXT:    [[TMP2:%.*]] = load <4 x half>, ptr [[__REINT1_150]], align 8
+// CHECK-NEXT:    [[VCMLA_F163_I:%.*]] = call <4 x half> @llvm.aarch64.neon.vcmla.rot0.v4f16(<4 x half> [[ACC]], <4 x half> [[LHS]], <4 x half> [[TMP2]])
+// CHECK-NEXT:    ret <4 x half> [[VCMLA_F163_I]]
+//
 float16x4_t test_vcmla_lane_f16(float16x4_t acc, float16x4_t lhs, float16x4_t rhs) {
   return vcmla_lane_f16(acc, lhs, rhs, 1);
 }
 
 // ACLE says this exists, but it won't map to a single instruction if lane > 1.
-// CHECK-LABEL: @test_vcmla_laneq_f16(
-// CHECK: [[CPLX:%.*]] = bitcast <8 x half> %rhs to <4 x i32>
-// CHECK: [[DUP:%.*]] = shufflevector <4 x i32> [[CPLX]], <4 x i32> poison, <2 x i32> <i32 3, i32 3>
-// CHECK: [[DUP_FLT:%.*]] = bitcast <2 x i32> [[DUP]] to <4 x half>
-// CHECK: [[RES:%.*]] = tail call <4 x half> @llvm.aarch64.neon.vcmla.rot0.v4f16(<4 x half> %acc, <4 x half> %lhs, <4 x half> [[DUP_FLT]])
-// CHECK: ret <4 x half> [[RES]]
+// CHECK-LABEL: define dso_local <4 x half> @test_vcmla_laneq_f16(
+// CHECK-SAME: <4 x half> noundef [[ACC:%.*]], <4 x half> noundef [[LHS:%.*]], <8 x half> noundef [[RHS:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[__REINT_154:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__REINT1_154:%.*]] = alloca <2 x i32>, align 8
+// CHECK-NEXT:    store <8 x half> [[RHS]], ptr [[__REINT_154]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load <4 x i32>, ptr [[__REINT_154]], align 16
+// CHECK-NEXT:    [[VGETQ_LANE:%.*]] = extractelement <4 x i32> [[TMP0]], i32 3
+// CHECK-NEXT:    [[VECINIT:%.*]] = insertelement <2 x i32> poison, i32 [[VGETQ_LANE]], i32 0
+// CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr [[__REINT_154]], align 16
+// CHECK-NEXT:    [[VGETQ_LANE3:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3
+// CHECK-NEXT:    [[VECINIT5:%.*]] = insertelement <2 x i32> [[VECINIT]], i32 [[VGETQ_LANE3]], i32 1
+// CHECK-NEXT:    store <2 x i32> [[VECINIT5]], ptr [[__REINT1_154]], align 8
+// CHECK-NEXT:    [[TMP2:%.*]] = load <4 x half>, ptr [[__REINT1_154]], align 8
+// CHECK-NEXT:    [[VCMLA_F163_I:%.*]] = call <4 x half> @llvm.aarch64.neon.vcmla.rot0.v4f16(<4 x half> [[ACC]], <4 x half> [[LHS]], <4 x half> [[TMP2]])
+// CHECK-NEXT:    ret <4 x half> [[VCMLA_F163_I]]
+//
 float16x4_t test_vcmla_laneq_f16(float16x4_t acc, float16x4_t lhs, float16x8_t rhs) {
   return vcmla_laneq_f16(acc, lhs, rhs, 3);
 }
 
-// CHECK-LABEL: @test_vcmlaq_lane_f16(
-// CHECK: [[DUP:%.*]] = shufflevector <4 x half> %rhs, <4 x half> poison, <8 x i32> <i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3> 
-// CHECK: [[RES:%.*]] = tail call <8 x half> @llvm.aarch64.neon.vcmla.rot0.v8f16(<8 x half> %acc, <8 x half> %lhs, <8 x half> [[DUP]])
-// CHECK: ret <8 x half> [[RES]]
+// CHECK-LABEL: define dso_local <8 x half> @test_vcmlaq_lane_f16(
+// CHECK-SAME: <8 x half> noundef [[ACC:%.*]], <8 x half> noundef [[LHS:%.*]], <4 x half> noundef [[RHS:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[__REINT_152:%.*]] = alloca <4 x half>, align 8
+// CHECK-NEXT:    [[__REINT1_152:%.*]] = alloca <4 x i32>, align 16
+// CHECK-NEXT:    store <4 x half> [[RHS]], ptr [[__REINT_152]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = load <2 x i32>, ptr [[__REINT_152]], align 8
+// CHECK-NEXT:    [[VGET_LANE:%.*]] = extractelement <2 x i32> [[TMP0]], i32 1
+// CHECK-NEXT:    [[VECINIT:%.*]] = insertelement <4 x i32> poison, i32 [[VGET_LANE]], i32 0
+// CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i32>, ptr [[__REINT_152]], align 8
+// CHECK-NEXT:    [[VGET_LANE3:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
+// CHECK-NEXT:    [[VECINIT5:%.*]] = insertelement <4 x i32> [[VECINIT]], i32 [[VGET_LANE3]], i32 1
+// CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i32>, ptr [[__REINT_152]], align 8
+// CHECK-NEXT:    [[VGET_LANE8:%.*]] = extractelement <2 x i32> [[TMP2]], i32 1
+// CHECK-NEXT:    [[VECINIT10:%.*]] = insertelement <4 x i32> [[VECINIT5]], i32 [[VGET_LANE8]], i32 2
+// CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i32>, ptr [[__REINT_152]], align 8
+// CHECK-NEXT:    [[VGET_LANE13:%.*]] = extractelement <2 x i32> [[TMP3]], i32 1
+// CHECK-NEXT:    [[VECINIT15:%.*]] = insertelement <4 x i32> [[VECINIT10]], i32 [[VGET_LANE13]], i32 3
+// CHECK-NEXT:    store <4 x i32> [[VECINIT15]], ptr [[__REINT1_152]], align 16
+// CHECK-NEXT:    [[TMP4:%.*]] = load <8 x half>, ptr [[__REINT1_152]], align 16
+// CHECK-NEXT:    [[VCMLAQ_F163_I:%.*]] = call <8 x half> @llvm.aarch64.neon.vcmla.rot0.v8f16(<8 x half> [[ACC]], <8 x half> [[LHS]], <8 x half> [[TMP4]])
+// CHECK-NEXT:    ret <8 x half> [[VCMLAQ_F163_I]]
+//
 float16x8_t test_vcmlaq_lane_f16(float16x8_t acc, float16x8_t lhs, float16x4_t rhs) {
   return vcmlaq_lane_f16(acc, lhs, rhs, 1);
 }
 
-// CHECK-LABEL: @test_vcmlaq_laneq_f16(
-// CHECK: [[CPLX:%.*]] = bitcast <8 x half> %rhs to <4 x i32>
-// CHECK: [[DUP:%.*]] = shufflevector <4 x i32> [[CPLX]], <4 x i32> poison, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-// CHECK: [[DUP_FLT:%.*]] = bitcast <4 x i32> [[DUP]] to <8 x half>
-// CHECK: [[RES:%.*]] = tail call <8 x half> @llvm.aarch64.neon.vcmla.rot0.v8f16(<8 x half> %acc, <8 x half> %lhs, <8 x half> [[DUP_FLT]])
-// CHECK: ret <8 x half> [[RES]]
+// CHECK-LABEL: define dso_local <8 x half> @test_vcmlaq_laneq_f16(
+// CHECK-SAME: <8 x half> noundef [[ACC:%.*]], <8 x half> noundef [[LHS:%.*]], <8 x half> noundef [[RHS:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[__REINT_156:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__REINT1_156:%.*]] = alloca <4 x i32>, align 16
+// CHECK-NEXT:    store <8 x half> [[RHS]], ptr [[__REINT_156]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load <4 x i32>, ptr [[__REINT_156]], align 16
+// CHECK-NEXT:    [[VGETQ_LANE:%.*]] = extractelement <4 x i32> [[TMP0]], i32 3
+// CHECK-NEXT:    [[VECINIT:%.*]] = insertelement <4 x i32> poison, i32 [[VGETQ_LANE]], i32 0
+// CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr [[__REINT_156]], align 16
+// CHECK-NEXT:    [[VGETQ_LANE3:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3
+// CHECK-NEXT:    [[VECINIT5:%.*]] = insertelement <4 x i32> [[VECINIT]], i32 [[VGETQ_LANE3]], i32 1
+// CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr [[__REINT_156]], align 16
+// CHECK-NEXT:    [[VGETQ_LANE8:%.*]] = extractelement <4 x i32> [[TMP2]], i32 3
+// CHECK-NEXT:    [[VECINIT10:%.*]] = insertelement <4 x i32> [[VECINIT5]], i32 [[VGETQ_LANE8]], i32 2
+// CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, ptr [[__REINT_156]], align 16
+// CHECK-NEXT:    [[VGETQ_LANE13:%.*]] = extractelement <4 x i32> [[TMP3]], i32 3
+// CHECK-NEXT:    [[VECINIT15:%.*]] = insertelement <4 x i32> [[VECINIT10]], i32 [[VGETQ_LANE13]], i32 3
+// CHECK-NEXT:    store <4 x i32> [[VECINIT15]], ptr [[__REINT1_156]], align 16
+// CHECK-NEXT:    [[TMP4:%.*]] = load <8 x half>, ptr [[__REINT1_156]], align 16
+// CHECK-NEXT:    [[VCMLAQ_F163_I:%.*]] = call <8 x half> @llvm.aarch64.neon.vcmla.rot0.v8f16(<8 x half> [[ACC]], <8 x half> [[LHS]], <8 x half> [[TMP4]])
+// CHECK-NEXT:    ret <8 x half> [[VCMLAQ_F163_I]]
+//
 float16x8_t test_vcmlaq_laneq_f16(float16x8_t acc, float16x8_t lhs, float16x8_t rhs) {
   return vcmlaq_laneq_f16(acc, lhs, rhs, 3);
 }
 
-// CHECK-LABEL: @test_vcmla_lane_f32(
-// CHECK: [[RES:%.*]] = tail call <2 x float> @llvm.aarch64.neon.vcmla.rot0.v2f32(<2 x float> %acc, <2 x float> %lhs, <2 x float> %rhs)
-// CHECK: ret <2 x float> [[RES]]
+// CHECK-LABEL: define dso_local <2 x float> @test_vcmla_lane_f32(
+// CHECK-SAME: <2 x float> noundef [[ACC:%.*]], <2 x float> noundef [[LHS:%.*]], <2 x float> noundef [[RHS:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[__REINT_182:%.*]] = alloca <2 x float>, align 8
+// CHECK-NEXT:    [[__REINT1_182:%.*]] = alloca <1 x i64>, align 8
+// CHECK-NEXT:    store <2 x float> [[RHS]], ptr [[__REINT_182]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = load <1 x i64>, ptr [[__REINT_182]], align 8
+// CHECK-NEXT:    [[VGET_LANE:%.*]] = extractelement <1 x i64> [[TMP0]], i32 0
+// CHECK-NEXT:    [[VECINIT:%.*]] = insertelement <1 x i64> poison, i64 [[VGET_LANE]], i32 0
+// CHECK-NEXT:    store <1 x i64> [[VECINIT]], ptr [[__REINT1_182]], align 8
+// CHECK-NEXT:    [[TMP1:%.*]] = load <2 x float>, ptr [[__REINT1_182]], align 8
+// CHECK-NEXT:    [[VCMLA_F323_I:%.*]] = call <2 x float> @llvm.aarch64.neon.vcmla.rot0.v2f32(<2 x float> [[ACC]], <2 x float> [[LHS]], <2 x float> [[TMP1]])
+// CHECK-NEXT:    ret <2 x float> [[VCMLA_F323_I]]
+//
 float32x2_t test_vcmla_lane_f32(float32x2_t acc, float32x2_t lhs, float32x2_t rhs) {
   return vcmla_lane_f32(acc, lhs, rhs, 0);
 }
 
 // ACLE says this exists, but it won't map to a single instruction if lane > 1.
-// CHECK-LABEL: @test_vcmla_laneq_f32(
-// CHECK: [[CPLX:%.*]] = bitcast <4 x float> %rhs to <2 x i64>
-// CHECK: [[DUP:%.*]] = shufflevector <2 x i64> [[CPLX]], <2 x i64> poison, <1 x i32> <i32 1>
-// CHECK: [[DUP_FLT:%.*]] = bitcast <1 x i64> [[DUP]] to <2 x float>
-// CHECK: [[RES:%.*]] = tail call <2 x float> @llvm.aarch64.neon.vcmla.rot0.v2f32(<2 x float> %acc, <2 x float> %lhs, <2 x float> [[DUP_FLT]])
-// CHECK: ret <2 x float> [[RES]]
+// CHECK-LABEL: define dso_local <2 x float> @test_vcmla_laneq_f32(
+// CHECK-SAME: <2 x float> noundef [[ACC:%.*]], <2 x float> noundef [[LHS:%.*]], <4 x float> noundef [[RHS:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[__REINT_186:%.*]] = alloca <4 x float>, align 16
+// CHECK-NEXT:    [[__REINT1_186:%.*]] = alloca <1 x i64>, align 8
+// CHECK-NEXT:    store <4 x float> [[RHS]], ptr [[__REINT_186]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load <2 x i64>, ptr [[__REINT_186]], align 16
+// CHECK-NEXT:    [[VGETQ_LANE:%.*]] = extractelement <2 x i64> [[TMP0]], i32 1
+// CHECK-NEXT:    [[VECINIT:%.*]] = insertelement <1 x i64> poison, i64 [[VGETQ_LANE]], i32 0
+// CHECK-NEXT:    store <1 x i64> [[VECINIT]], ptr [[__REINT1_186]], align 8
+// CHECK-NEXT:    [[TMP1:%.*]] = load <2 x float>, ptr [[__REINT1_186]], align 8
+// CHECK-NEXT:    [[VCMLA_F323_I:%.*]] = call <2 x float> @llvm.aarch64.neon.vcmla.rot0.v2f32(<2 x float> [[ACC]], <2 x float> [[LHS]], <2 x float> [[TMP1]])
+// CHECK-NEXT:    ret <2 x float> [[VCMLA_F323_I]]
+//
 float32x2_t test_vcmla_laneq_f32(float32x2_t acc, float32x2_t lhs, float32x4_t rhs) {
   return vcmla_laneq_f32(acc, lhs, rhs, 1);
 }
 
-// CHECK-LABEL: @test_vcmlaq_lane_f32(
-// CHECK: [[CPLX:%.*]] = bitcast <2 x float> %rhs to i64
-// CHECK: [[CPLX_VEC:%.*]] = insertelement <2 x i64> poison, i64 [[CPLX]], i64 0
-// CHECK: [[CPLX2:%.*]] = bitcast <2 x i64> [[CPLX_VEC]] to <4 x float>
-// CHECK: [[DUP:%.*]] = shufflevector <4 x float> [[CPLX2]], <4 x float> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
-// CHECK: [[RES:%.*]] = tail call <4 x float> @llvm.aarch64.neon.vcmla.rot0.v4f32(<4 x float> %acc, <4 x float> %lhs, <4 x float> [[DUP]])
-// CHECK: ret <4 x float> [[RES]]
+// CHECK-LABEL: define dso_local <4 x float> @test_vcmlaq_lane_f32(
+// CHECK-SAME: <4 x float> noundef [[ACC:%.*]], <4 x float> noundef [[LHS:%.*]], <2 x float> noundef [[RHS:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[__REINT_184:%.*]] = alloca <2 x float>, align 8
+// CHECK-NEXT:    [[__REINT1_184:%.*]] = alloca <2 x i64>, align 16
+// CHECK-NEXT:    store <2 x float> [[RHS]], ptr [[__REINT_184]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = load <1 x i64>, ptr [[__REINT_184]], align 8
+// CHECK-NEXT:    [[VGET_LANE:%.*]] = extractelement <1 x i64> [[TMP0]], i32 0
+// CHECK-NEXT:    [[VECINIT:%.*]] = insertelement <2 x i64> poison, i64 [[VGET_LANE]], i32 0
+// CHECK-NEXT:    [[TMP1:%.*]] = load <1 x i64>, ptr [[__REINT_184]], align 8
+// CHECK-NEXT:    [[VGET_LANE3:%.*]] = extractelement <1 x i64> [[TMP1]], i32 0
+// CHECK-NEXT:    [[VECINIT5:%.*]] = insertelement <2 x i64> [[VECINIT]], i64 [[VGET_LANE3]], i32 1
+// CHECK-NEXT:    store <2 x i64> [[VECINIT5]], ptr [[__REINT1_184]], align 16
+// CHECK-NEXT:    [[TMP2:%.*]] = load <4 x float>, ptr [[__REINT1_184]], align 16
+// CHECK-NEXT:    [[VCMLAQ_F323_I:%.*]] = call <4 x float> @llvm.aarch64.neon.vcmla.rot0.v4f32(<4 x float> [[ACC]], <4 x float> [[LHS]], <4 x float> [[TMP2]])
+// CHECK-NEXT:    ret <4 x float> [[VCMLAQ_F323_I]]
+//
 float32x4_t test_vcmlaq_lane_f32(float32x4_t acc, float32x4_t lhs, float32x2_t rhs) {
   return vcmlaq_lane_f32(acc, lhs, rhs, 0);
 }
 
-// CHECK-LABEL: @test_vcmlaq_laneq_f32(
-// CHECK: [[DUP:%.*]] = shufflevector <4 x float> %rhs, <4 x float> poison, <4 x i32> <i32 2, i32 3, i32 2, i32 3>
-// CHECK: [[RES:%.*]] = tail call <4 x float> @llvm.aarch64.neon.vcmla.rot0.v4f32(<4 x float> %acc, <4 x float> %lhs, <4 x float> [[DUP]])
-// CHECK: ret <4 x float> [[RES]]
+// CHECK-LABEL: define dso_local <4 x float> @test_vcmlaq_laneq_f32(
+// CHECK-SAME: <4 x float> noundef [[ACC:%.*]], <4 x float> noundef [[LHS:%.*]], <4 x float> noundef [[RHS:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[__REINT_188:%.*]] = alloca <4 x float>, align 16
+// CHECK-NEXT:    [[__REINT1_188:%.*]] = alloca <2 x i64>, align 16
+// CHECK-NEXT:    store <4 x float> [[RHS]], ptr [[__REINT_188]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load <2 x i64>, ptr [[__REINT_188]], align 16
+// CHECK-NEXT:    [[VGETQ_LANE:%.*]] = extractelement <2 x i64> [[TMP0]], i32 1
+// CHECK-NEXT:    [[VECINIT:%.*]] = insertelement <2 x i64> poison, i64 [[VGETQ_LANE]], i32 0
+// CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr [[__REINT_188]], align 16
+// CHECK-NEXT:    [[VGETQ_LANE3:%.*]] = extractelement <2 x i64> [[TMP1]], i32 1
+// CHECK-NEXT:    [[VECINIT5:%.*]] = insertelement <2 x i64> [[VECINIT]], i64 [[VGETQ_LANE3]], i32 1
+// CHECK-NEXT:    store <2 x i64> [[VECINIT5]], ptr [[__REINT1_188]], align 16
+// CHECK-NEXT:    [[TMP2:%.*]] = load <4 x float>, ptr [[__REINT1_188]], align 16
+// CHECK-NEXT:    [[VCMLAQ_F323_I:%.*]] = call <4 x float> @llvm.aarch64.neon.vcmla.rot0.v4f32(<4 x float> [[ACC]], <4 x float> [[LHS]], <4 x float> [[TMP2]])
+// CHECK-NEXT:    ret <4 x float> [[VCMLAQ_F323_I]]
+//
 float32x4_t test_vcmlaq_laneq_f32(float32x4_t acc, float32x4_t lhs, float32x4_t rhs) {
   return vcmlaq_laneq_f32(acc, lhs, rhs, 1);
 }
 
-// CHECK-LABEL: @test_vcmla_rot90_lane_f16(
-// CHECK: [[DUP:%.*]] = shufflevector <4 x half> %rhs, <4 x half> poison, <4 x i32> <i32 2, i32 3, i32 2, i32 3>
-// CHECK: [[RES:%.*]] = tail call <4 x half> @llvm.aarch64.neon.vcmla.rot90.v4f16(<4 x half> %acc, <4 x half> %lhs, <4 x half> [[DUP]])
-// CHECK: ret <4 x half> [[RES]]
+// CHECK-LABEL: define dso_local <4 x half> @test_vcmla_rot90_lane_f16(
+// CHECK-SAME: <4 x half> noundef [[ACC:%.*]], <4 x half> noundef [[LHS:%.*]], <4 x half> noundef [[RHS:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[__REINT_174:%.*]] = alloca <4 x half>, align 8
+// CHECK-NEXT:    [[__REINT1_174:%.*]] = alloca <2 x i32>, align 8
+// CHECK-NEXT:    store <4 x half> [[RHS]], ptr [[__REINT_174]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = load <2 x i32>, ptr [[__REINT_174]], align 8
+// CHECK-NEXT:    [[VGET_LANE:%.*]] = extractelement <2 x i32> [[TMP0]], i32 1
+// CHECK-NEXT:    [[VECINIT:%.*]] = insertelement <2 x i32> poison, i32 [[VGET_LANE]], i32 0
+// CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i32>, ptr [[__REINT_174]], align 8
+// CHECK-NEXT:    [[VGET_LANE3:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
+// CHECK-NEXT:    [[VECINIT5:%.*]] = insertelement <2 x i32> [[VECINIT]], i32 [[VGET_LANE3]], i32 1
+// CHECK-NEXT:    store <2 x i32> [[VECINIT5]], ptr [[__REINT1_174]], align 8
+// CHECK-NEXT:    [[TMP2:%.*]] = load <4 x half>, ptr [[__REINT1_174]], align 8
+// CHECK-NEXT:    [[VCMLA_ROT90_F163_I:%.*]] = call <4 x half> @llvm.aarch64.neon.vcmla.rot90.v4f16(<4 x half> [[ACC]], <4 x half> [[LHS]], <4 x half> [[TMP2]])
+// CHECK-NEXT:    ret <4 x half> [[VCMLA_ROT90_F163_I]]
+//
 float16x4_t test_vcmla_rot90_lane_f16(float16x4_t acc, float16x4_t lhs, float16x4_t rhs) {
   return vcmla_rot90_lane_f16(acc, lhs, rhs, 1);
 }
 
 // ACLE says this exists, but it won't map to a single instruction if lane > 1.
-// CHECK-LABEL: @test_vcmla_rot90_laneq_f16(
-// CHECK: [[CPLX:%.*]] = bitcast <8 x half> %rhs to <4 x i32>
-// CHECK: [[DUP:%.*]] = shufflevector <4 x i32> [[CPLX]], <4 x i32> poison, <2 x i32> <i32 3, i32 3>
-// CHECK: [[DUP_FLT:%.*]] = bitcast <2 x i32> [[DUP]] to <4 x half>
-// CHECK: [[RES:%.*]] = tail call <4 x half> @llvm.aarch64.neon.vcmla.rot90.v4f16(<4 x half> %acc, <4 x half> %lhs, <4 x half> [[DUP_FLT]])
-// CHECK: ret <4 x half> [[RES]]
+// CHECK-LABEL: define dso_local <4 x half> @test_vcmla_rot90_laneq_f16(
+// CHECK-SAME: <4 x half> noundef [[ACC:%.*]], <4 x half> noundef [[LHS:%.*]], <8 x half> noundef [[RHS:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[__REINT_178:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__REINT1_178:%.*]] = alloca <2 x i32>, align 8
+// CHECK-NEXT:    store <8 x half> [[RHS]], ptr [[__REINT_178]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load <4 x i32>, ptr [[__REINT_178]], align 16
+// CHECK-NEXT:    [[VGETQ_LANE:%.*]] = extractelement <4 x i32> [[TMP0]], i32 3
+// CHECK-NEXT:    [[VECINIT:%.*]] = insertelement <2 x i32> poison, i32 [[VGETQ_LANE]], i32 0
+// CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr [[__REINT_178]], align 16
+// CHECK-NEXT:    [[VGETQ_LANE3:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3
+// CHECK-NEXT:    [[VECINIT5:%.*]] = insertelement <2 x i32> [[VECINIT]], i32 [[VGETQ_LANE3]], i32 1
+// CHECK-NEXT:    store <2 x i32> [[VECINIT5]], ptr [[__REINT1_178]], align 8
+// CHECK-NEXT:    [[TMP2:%.*]] = load <4 x half>, ptr [[__REINT1_178]], align 8
+// CHECK-NEXT:    [[VCMLA_ROT90_F163_I:%.*]] = call <4 x half> @llvm.aarch64.neon.vcmla.rot90.v4f16(<4 x half> [[ACC]], <4 x half> [[LHS]], <4 x half> [[TMP2]])
+// CHECK-NEXT:    ret <4 x half> [[VCMLA_ROT90_F163_I]]
+//
 float16x4_t test_vcmla_rot90_laneq_f16(float16x4_t acc, float16x4_t lhs, float16x8_t rhs) {
   return vcmla_rot90_laneq_f16(acc, lhs, rhs, 3);
 }
 
-// CHECK-LABEL: @test_vcmlaq_rot90_lane_f16(
-// CHECK: [[DUP:%.*]] = shufflevector <4 x half> %rhs, <4 x half> poison, <8 x i32> <i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
-// CHECK: [[RES:%.*]] = tail call <8 x half> @llvm.aarch64.neon.vcmla.rot90.v8f16(<8 x half> %acc, <8 x half> %lhs, <8 x half> [[DUP]])
-// CHECK: ret <8 x half> [[RES]]
+// CHECK-LABEL: define dso_local <8 x half> @test_vcmlaq_rot90_lane_f16(
+// CHECK-SAME: <8 x half> noundef [[ACC:%.*]], <8 x half> noundef [[LHS:%.*]], <4 x half> noundef [[RHS:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[__REINT_176:%.*]] = alloca <4 x half>, align 8
+// CHECK-NEXT:    [[__REINT1_176:%.*]] = alloca <4 x i32>, align 16
+// CHECK-NEXT:    store <4 x half> [[RHS]], ptr [[__REINT_176]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = load <2 x i32>, ptr [[__REINT_176]], align 8
+// CHECK-NEXT:    [[VGET_LANE:%.*]] = extractelement <2 x i32> [[TMP0]], i32 1
+// CHECK-NEXT:    [[VECINIT:%.*]] = insertelement <4 x i32> poison, i32 [[VGET_LANE]], i32 0
+// CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i32>, ptr [[__REINT_176]], align 8
+// CHECK-NEXT:    [[VGET_LANE3:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
+// CHECK-NEXT:    [[VECINIT5:%.*]] = insertelement <4 x i32> [[VECINIT]], i32 [[VGET_LANE3]], i32 1
+// CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i32>, ptr [[__REINT_176]], align 8
+// CHECK-NEXT:    [[VGET_LANE8:%.*]] = extractelement <2 x i32> [[TMP2]], i32 1
+// CHECK-NEXT:    [[VECINIT10:%.*]] = insertelement <4 x i32> [[VECINIT5]], i32 [[VGET_LANE8]], i32 2
+// CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i32>, ptr [[__REINT_176]], align 8
+// CHECK-NEXT:    [[VGET_LANE13:%.*]] = extractelement <2 x i32> [[TMP3]], i32 1
+// CHECK-NEXT:    [[VECINIT15:%.*]] = insertelement <4 x i32> [[VECINIT10]], i32 [[VGET_LANE13]], i32 3
+// CHECK-NEXT:    store <4 x i32> [[VECINIT15]], ptr [[__REINT1_176]], align 16
+// CHECK-NEXT:    [[TMP4:%.*]] = load <8 x half>, ptr [[__REINT1_176]], align 16
+// CHECK-NEXT:    [[VCMLAQ_ROT90_F163_I:%.*]] = call <8 x half> @llvm.aarch64.neon.vcmla.rot90.v8f16(<8 x half> [[ACC]], <8 x half> [[LHS]], <8 x half> [[TMP4]])
+// CHECK-NEXT:    ret <8 x half> [[VCMLAQ_ROT90_F163_I]]
+//
 float16x8_t test_vcmlaq_rot90_lane_f16(float16x8_t acc, float16x8_t lhs, float16x4_t rhs) {
   return vcmlaq_rot90_lane_f16(acc, lhs, rhs, 1);
 }
 
-// CHECK-LABEL: @test_vcmlaq_rot90_laneq_f16(
-// CHECK: [[CPLX:%.*]] = bitcast <8 x half> %rhs to <4 x i32>
-// CHECK: [[DUP:%.*]] = shufflevector <4 x i32> [[CPLX]], <4 x i32> poison, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-// CHECK: [[DUP_FLT:%.*]] = bitcast <4 x i32> [[DUP]] to <8 x half>
-// CHECK: [[RES:%.*]] = tail call <8 x half> @llvm.aarch64.neon.vcmla.rot90.v8f16(<8 x half> %acc, <8 x half> %lhs, <8 x half> [[DUP_FLT]])
-// CHECK: ret <8 x half> [[RES]]
+// CHECK-LABEL: define dso_local <8 x half> @test_vcmlaq_rot90_laneq_f16(
+// CHECK-SAME: <8 x half> noundef [[ACC:%.*]], <8 x half> noundef [[LHS:%.*]], <8 x half> noundef [[RHS:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[__REINT_180:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__REINT1_180:%.*]] = alloca <4 x i32>, align 16
+// CHECK-NEXT:    store <8 x half> [[RHS]], ptr [[__REINT_180]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load <4 x i32>, ptr [[__REINT_180]], align 16
+// CHECK-NEXT:    [[VGETQ_LANE:%.*]] = extractelement <4 x i32> [[TMP0]], i32 3
+// CHECK-NEXT:    [[VECINIT:%.*]] = insertelement <4 x i32> poison, i32 [[VGETQ_LANE]], i32 0
+// CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr [[__REINT_180]], align 16
+// CHECK-NEXT:    [[VGETQ_LANE3:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3
+// CHECK-NEXT:    [[VECINIT5:%.*]] = insertelement <4 x i32> [[VECINIT]], i32 [[VGETQ_LANE3]], i32 1
+// CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr [[__REINT_180]], align 16
+// CHECK-NEXT:    [[VGETQ_LANE8:%.*]] = extractelement <4 x i32> [[TMP2]], i32 3
+// CHECK-NEXT:    [[VECINIT10:%.*]] = insertelement <4 x i32> [[VECINIT5]], i32 [[VGETQ_LANE8]], i32 2
+// CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, ptr [[__REINT_180]], align 16
+// CHECK-NEXT:    [[VGETQ_LANE13:%.*]] = extractelement <4 x i32> [[TMP3]], i32 3
+// CHECK-NEXT:    [[VECINIT15:%.*]] = insertelement <4 x i32> [[VECINIT10]], i32 [[VGETQ_LANE13]], i32 3
+// CHECK-NEXT:    store <4 x i32> [[VECINIT15]], ptr [[__REINT1_180]], align 16
+// CHECK-NEXT:    [[TMP4:%.*]] = load <8 x half>, ptr [[__REINT1_180]], align 16
+// CHECK-NEXT:    [[VCMLAQ_ROT90_F163_I:%.*]] = call <8 x half> @llvm.aarch64.neon.vcmla.rot90.v8f16(<8 x half> [[ACC]], <8 x half> [[LHS]], <8 x half> [[TMP4]])
+// CHECK-NEXT:    ret <8 x half> [[VCMLAQ_ROT90_F163_I]]
+//
 float16x8_t test_vcmlaq_rot90_laneq_f16(float16x8_t acc, float16x8_t lhs, float16x8_t rhs) {
   return vcmlaq_rot90_laneq_f16(acc, lhs, rhs, 3);
 }
 
-// CHECK-LABEL: @test_vcmla_rot90_lane_f32(
-// CHECK: [[RES:%.*]] = tail call <2 x float> @llvm.aarch64.neon.vcmla.rot90.v2f32(<2 x float> %acc, <2 x float> %lhs, <2 x float> %rhs)
-// CHECK: ret <2 x float> [[RES]]
+// CHECK-LABEL: define dso_local <2 x float> @test_vcmla_rot90_lane_f32(
+// CHECK-SAME: <2 x float> noundef [[ACC:%.*]], <2 x float> noundef [[LHS:%.*]], <2 x float> noundef [[RHS:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[__REINT_206:%.*]] = alloca <2 x float>, align 8
+// CHECK-NEXT:    [[__REINT1_206:%.*]] = alloca <1 x i64>, align 8
+// CHECK-NEXT:    store <2 x float> [[RHS]], ptr [[__REINT_206]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = load <1 x i64>, ptr [[__REINT_206]], align 8
+// CHECK-NEXT:    [[VGET_LANE:%.*]] = extractelement <1 x i64> [[TMP0]], i32 0
+// CHECK-NEXT:    [[VECINIT:%.*]] = insertelement <1 x i64> poison, i64 [[VGET_LANE]], i32 0
+// CHECK-NEXT:    store <1 x i64> [[VECINIT]], ptr [[__REINT1_206]], align 8
+// CHECK-NEXT:    [[TMP1:%.*]] = load <2 x float>, ptr [[__REINT1_206]], align 8
+// CHECK-NEXT:    [[VCMLA_ROT90_F323_I:%.*]] = call <2 x float> @llvm.aarch64.neon.vcmla.rot90.v2f32(<2 x float> [[ACC]], <2 x float> [[LHS]], <2 x float> [[TMP1]])
+// CHECK-NEXT:    ret <2 x float> [[VCMLA_ROT90_F323_I]]
+//
 float32x2_t test_vcmla_rot90_lane_f32(float32x2_t acc, float32x2_t lhs, float32x2_t rhs) {
   return vcmla_rot90_lane_f32(acc, lhs, rhs, 0);
 }
 
 // ACLE says this exists, but it won't map to a single instruction if lane > 1.
-// CHECK-LABEL: @test_vcmla_rot90_laneq_f32(
-// CHECK: [[CPLX:%.*]] = bitcast <4 x float> %rhs to <2 x i64>
-// CHECK: [[DUP:%.*]] = shufflevector <2 x i64> [[CPLX]], <2 x i64> poison, <1 x i32> <i32 1>
-// CHECK: [[DUP_FLT:%.*]] = bitcast <1 x i64> [[DUP]] to <2 x float>
-// CHECK: [[RES:%.*]] = tail call <2 x float> @llvm.aarch64.neon.vcmla.rot90.v2f32(<2 x float> %acc, <2 x float> %lhs, <2 x float> [[DUP_FLT]])
-// CHECK: ret <2 x float> [[RES]]
+// CHECK-LABEL: define dso_local <2 x float> @test_vcmla_rot90_laneq_f32(
+// CHECK-SAME: <2 x float> noundef [[ACC:%.*]], <2 x float> noundef [[LHS:%.*]], <4 x float> noundef [[RHS:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[__REINT_210:%.*]] = alloca <4 x float>, align 16
+// CHECK-NEXT:    [[__REINT1_210:%.*]] = alloca <1 x i64>, align 8
+// CHECK-NEXT:    store <4 x float> [[RHS]], ptr [[__REINT_210]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load <2 x i64>, ptr [[__REINT_210]], align 16
+// CHECK-NEXT:    [[VGETQ_LANE:%.*]] = extractelement <2 x i64> [[TMP0]], i32 1
+// CHECK-NEXT:    [[VECINIT:%.*]] = insertelement <1 x i64> poison, i64 [[VGETQ_LANE]], i32 0
+// CHECK-NEXT:    store <1 x i64> [[VECINIT]], ptr [[__REINT1_210]], align 8
+// CHECK-NEXT:    [[TMP1:%.*]] = load <2 x float>, ptr [[__REINT1_210]], align 8
+// CHECK-NEXT:    [[VCMLA_ROT90_F323_I:%.*]] = call <2 x float> @llvm.aarch64.neon.vcmla.rot90.v2f32(<2 x float> [[ACC]], <2 x float> [[LHS]], <2 x float> [[TMP1]])
+// CHECK-NEXT:    ret <2 x float> [[VCMLA_ROT90_F323_I]]
+//
 float32x2_t test_vcmla_rot90_laneq_f32(float32x2_t acc, float32x2_t lhs, float32x4_t rhs) {
   return vcmla_rot90_laneq_f32(acc, lhs, rhs, 1);
 }
 
-// CHECK-LABEL: @test_vcmlaq_rot90_lane_f32(
-// CHECK: [[CPLX:%.*]] = bitcast <2 x float> %rhs to i64
-// CHECK: [[CPLX_VEC:%.*]] = insertelement <2 x i64> poison, i64 [[CPLX]], i64 0
-// CHECK: [[CPLX2:%.*]] = bitcast <2 x i64> [[CPLX_VEC]] to <4 x float>
-// CHECK: [[DUP:%.*]] = shufflevector <4 x float> [[CPLX2]], <4 x float> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
-// CHECK: [[RES:%.*]] = tail call <4 x float> @llvm.aarch64.neon.vcmla.rot90.v4f32(<4 x float> %acc, <4 x float> %lhs, <4 x float> [[DUP]])
-// CHECK: ret <4 x float> [[RES]]
+// CHECK-LABEL: define dso_local <4 x float> @test_vcmlaq_rot90_lane_f32(
+// CHECK-SAME: <4 x float> noundef [[ACC:%.*]], <4 x float> noundef [[LHS:%.*]], <2 x float> noundef [[RHS:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[__REINT_208:%.*]] = alloca <2 x float>, align 8
+// CHECK-NEXT:    [[__REINT1_208:%.*]] = alloca <2 x i64>, align 16
+// CHECK-NEXT:    store <2 x float> [[RHS]], ptr [[__REINT_208]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = load <1 x i64>, ptr [[__REINT_208]], align 8
+// CHECK-NEXT:    [[VGET_LANE:%.*]] = extractelement <1 x i64> [[TMP0]], i32 0
+// CHECK-NEXT:    [[VECINIT:%.*]] = insertelement <2 x i64> poison, i64 [[VGET_LANE]], i32 0
+// CHECK-NEXT:    [[TMP1:%.*]] = load <1 x i64>, ptr [[__REINT_208]], align 8
+// CHECK-NEXT:    [[VGET_LANE3:%.*]] = extractelement <1 x i64> [[TMP1]], i32 0
+// CHECK-NEXT:    [[VECINIT5:%.*]] = insertelement <2 x i64> [[VECINIT]], i64 [[VGET_LANE3]], i32 1
+// CHECK-NEXT:    store <2 x i64> [[VECINIT5]], ptr [[__REINT1_208]], align 16
+// CHECK-NEXT:    [[TMP2:%.*]] = load <4 x float>, ptr [[__REINT1_208]], align 16
+// CHECK-NEXT:    [[VCMLAQ_ROT90_F323_I:%.*]] = call <4 x float> @llvm.aarch64.neon.vcmla.rot90.v4f32(<4 x float> [[ACC]], <4 x float> [[LHS]], <4 x float> [[TMP2]])
+// CHECK-NEXT:    ret <4 x float> [[VCMLAQ_ROT90_F323_I]]
+//
 float32x4_t test_vcmlaq_rot90_lane_f32(float32x4_t acc, float32x4_t lhs, float32x2_t rhs) {
   return vcmlaq_rot90_lane_f32(acc, lhs, rhs, 0);
 }
 
-// CHECK-LABEL: @test_vcmlaq_rot90_laneq_f32(
-// CHECK: [[DUP:%.*]] = shufflevector <4 x float> %rhs, <4 x float> poison, <4 x i32> <i32 2, i32 3, i32 2, i32 3>
-// CHECK: [[RES:%.*]] = tail call <4 x float> @llvm.aarch64.neon.vcmla.rot90.v4f32(<4 x float> %acc, <4 x float> %lhs, <4 x float> [[DUP]])
-// CHECK: ret <4 x float> [[RES]]
+// CHECK-LABEL: define dso_local <4 x float> @test_vcmlaq_rot90_laneq_f32(
+// CHECK-SAME: <4 x float> noundef [[ACC:%.*]], <4 x float> noundef [[LHS:%.*]], <4 x float> noundef [[RHS:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[__REINT_212:%.*]] = alloca <4 x float>, align 16
+// CHECK-NEXT:    [[__REINT1_212:%.*]] = alloca <2 x i64>, align 16
+// CHECK-NEXT:    store <4 x float> [[RHS]], ptr [[__REINT_212]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load <2 x i64>, ptr [[__REINT_212]], align 16
+// CHECK-NEXT:    [[VGETQ_LANE:%.*]] = extractelement <2 x i64> [[TMP0]], i32 1
+// CHECK-NEXT:    [[VECINIT:%.*]] = insertelement <2 x i64> poison, i64 [[VGETQ_LANE]], i32 0
+// CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr [[__REINT_212]], align 16
+// CHECK-NEXT:    [[VGETQ_LANE3:%.*]] = extractelement <2 x i64> [[TMP1]], i32 1
+// CHECK-NEXT:    [[VECINIT5:%.*]] = insertelement <2 x i64> [[VECINIT]], i64 [[VGETQ_LANE3]], i32 1
+// CHECK-NEXT:    store <2 x i64> [[VECINIT5]], ptr [[__REINT1_212]], align 16
+// CHECK-NEXT:    [[TMP2:%.*]] = load <4 x float>, ptr [[__REINT1_212]], align 16
+// CHECK-NEXT:    [[VCMLAQ_ROT90_F323_I:%.*]] = call <4 x float> @llvm.aarch64.neon.vcmla.rot90.v4f32(<4 x float> [[ACC]], <4 x float> [[LHS]], <4 x float> [[TMP2]])
+// CHECK-NEXT:    ret <4 x float> [[VCMLAQ_ROT90_F323_I]]
+//
 float32x4_t test_vcmlaq_rot90_laneq_f32(float32x4_t acc, float32x4_t lhs, float32x4_t rhs) {
   return vcmlaq_rot90_laneq_f32(acc, lhs, rhs, 1);
 }
 
-// CHECK-LABEL: @test_vcmla_rot180_lane_f16(
-// CHECK: [[DUP:%.*]] = shufflevector <4 x half> %rhs, <4 x half> poison, <4 x i32> <i32 2, i32 3, i32 2, i32 3>
-// CHECK: [[RES:%.*]] = tail call <4 x half> @llvm.aarch64.neon.vcmla.rot180.v4f16(<4 x half> %acc, <4 x half> %lhs, <4 x half> [[DUP]])
-// CHECK: ret <4 x half> [[RES]]
+// CHECK-LABEL: define dso_local <4 x half> @test_vcmla_rot180_lane_f16(
+// CHECK-SAME: <4 x half> noundef [[ACC:%.*]], <4 x half> noundef [[LHS:%.*]], <4 x half> noundef [[RHS:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[__REINT_158:%.*]] = alloca <4 x half>, align 8
+// CHECK-NEXT:    [[__REINT1_158:%.*]] = alloca <2 x i32>, align 8
+// CHECK-NEXT:    store <4 x half> [[RHS]], ptr [[__REINT_158]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = load <2 x i32>, ptr [[__REINT_158]], align 8
+// CHECK-NEXT:    [[VGET_LANE:%.*]] = extractelement <2 x i32> [[TMP0]], i32 1
+// CHECK-NEXT:    [[VECINIT:%.*]] = insertelement <2 x i32> poison, i32 [[VGET_LANE]], i32 0
+// CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i32>, ptr [[__REINT_158]], align 8
+// CHECK-NEXT:    [[VGET_LANE3:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
+// CHECK-NEXT:    [[VECINIT5:%.*]] = insertelement <2 x i32> [[VECINIT]], i32 [[VGET_LANE3]], i32 1
+// CHECK-NEXT:    store <2 x i32> [[VECINIT5]], ptr [[__REINT1_158]], align 8
+// CHECK-NEXT:    [[TMP2:%.*]] = load <4 x half>, ptr [[__REINT1_158]], align 8
+// CHECK-NEXT:    [[VCMLA_ROT180_F163_I:%.*]] = call <4 x half> @llvm.aarch64.neon.vcmla.rot180.v4f16(<4 x half> [[ACC]], <4 x half> [[LHS]], <4 x half> [[TMP2]])
+// CHECK-NEXT:    ret <4 x half> [[VCMLA_ROT180_F163_I]]
+//
 float16x4_t test_vcmla_rot180_lane_f16(float16x4_t acc, float16x4_t lhs, float16x4_t rhs) {
   return vcmla_rot180_lane_f16(acc, lhs, rhs, 1);
 }
 
 // ACLE says this exists, but it won't map to a single instruction if lane > 1.
-// CHECK-LABEL: @test_vcmla_rot180_laneq_f16(
-// CHECK: [[CPLX:%.*]] = bitcast <8 x half> %rhs to <4 x i32>
-// CHECK: [[DUP:%.*]] = shufflevector <4 x i32> [[CPLX]], <4 x i32> poison, <2 x i32> <i32 3, i32 3>
-// CHECK: [[DUP_FLT:%.*]] = bitcast <2 x i32> [[DUP]] to <4 x half>
-// CHECK: [[RES:%.*]] = tail call <4 x half> @llvm.aarch64.neon.vcmla.rot180.v4f16(<4 x half> %acc, <4 x half> %lhs, <4 x half> [[DUP_FLT]])
-// CHECK: ret <4 x half> [[RES]]
+// CHECK-LABEL: define dso_local <4 x half> @test_vcmla_rot180_laneq_f16(
+// CHECK-SAME: <4 x half> noundef [[ACC:%.*]], <4 x half> noundef [[LHS:%.*]], <8 x half> noundef [[RHS:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[__REINT_162:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__REINT1_162:%.*]] = alloca <2 x i32>, align 8
+// CHECK-NEXT:    store <8 x half> [[RHS]], ptr [[__REINT_162]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load <4 x i32>, ptr [[__REINT_162]], align 16
+// CHECK-NEXT:    [[VGETQ_LANE:%.*]] = extractelement <4 x i32> [[TMP0]], i32 3
+// CHECK-NEXT:    [[VECINIT:%.*]] = insertelement <2 x i32> poison, i32 [[VGETQ_LANE]], i32 0
+// CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr [[__REINT_162]], align 16
+// CHECK-NEXT:    [[VGETQ_LANE3:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3
+// CHECK-NEXT:    [[VECINIT5:%.*]] = insertelement <2 x i32> [[VECINIT]], i32 [[VGETQ_LANE3]], i32 1
+// CHECK-NEXT:    store <2 x i32> [[VECINIT5]], ptr [[__REINT1_162]], align 8
+// CHECK-NEXT:    [[TMP2:%.*]] = load <4 x half>, ptr [[__REINT1_162]], align 8
+// CHECK-NEXT:    [[VCMLA_ROT180_F163_I:%.*]] = call <4 x half> @llvm.aarch64.neon.vcmla.rot180.v4f16(<4 x half> [[ACC]], <4 x half> [[LHS]], <4 x half> [[TMP2]])
+// CHECK-NEXT:    ret <4 x half> [[VCMLA_ROT180_F163_I]]
+//
 float16x4_t test_vcmla_rot180_laneq_f16(float16x4_t acc, float16x4_t lhs, float16x8_t rhs) {
   return vcmla_rot180_laneq_f16(acc, lhs, rhs, 3);
 }
 
-// CHECK-LABEL: @test_vcmlaq_rot180_lane_f16(
-// CHECK: [[DUP:%.*]] = shufflevector <4 x half> %rhs, <4 x half> poison, <8 x i32> <i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
-// CHECK: [[RES:%.*]] = tail call <8 x half> @llvm.aarch64.neon.vcmla.rot180.v8f16(<8 x half> %acc, <8 x half> %lhs, <8 x half> [[DUP]])
-// CHECK: ret <8 x half> [[RES]]
+// CHECK-LABEL: define dso_local <8 x half> @test_vcmlaq_rot180_lane_f16(
+// CHECK-SAME: <8 x half> noundef [[ACC:%.*]], <8 x half> noundef [[LHS:%.*]], <4 x half> noundef [[RHS:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[__REINT_160:%.*]] = alloca <4 x half>, align 8
+// CHECK-NEXT:    [[__REINT1_160:%.*]] = alloca <4 x i32>, align 16
+// CHECK-NEXT:    store <4 x half> [[RHS]], ptr [[__REINT_160]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = load <2 x i32>, ptr [[__REINT_160]], align 8
+// CHECK-NEXT:    [[VGET_LANE:%.*]] = extractelement <2 x i32> [[TMP0]], i32 1
+// CHECK-NEXT:    [[VECINIT:%.*]] = insertelement <4 x i32> poison, i32 [[VGET_LANE]], i32 0
+// CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i32>, ptr [[__REINT_160]], align 8
+// CHECK-NEXT:    [[VGET_LANE3:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
+// CHECK-NEXT:    [[VECINIT5:%.*]] = insertelement <4 x i32> [[VECINIT]], i32 [[VGET_LANE3]], i32 1
+// CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i32>, ptr [[__REINT_160]], align 8
+// CHECK-NEXT:    [[VGET_LANE8:%.*]] = extractelement <2 x i32> [[TMP2]], i32 1
+// CHECK-NEXT:    [[VECINIT10:%.*]] = insertelement <4 x i32> [[VECINIT5]], i32 [[VGET_LANE8]], i32 2
+// CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i32>, ptr [[__REINT_160]], align 8
+// CHECK-NEXT:    [[VGET_LANE13:%.*]] = extractelement <2 x i32> [[TMP3]], i32 1
+// CHECK-NEXT:    [[VECINIT15:%.*]] = insertelement <4 x i32> [[VECINIT10]], i32 [[VGET_LANE13]], i32 3
+// CHECK-NEXT:    store <4 x i32> [[VECINIT15]], ptr [[__REINT1_160]], align 16
+// CHECK-NEXT:    [[TMP4:%.*]] = load <8 x half>, ptr [[__REINT1_160]], align 16
+// CHECK-NEXT:    [[VCMLAQ_ROT180_F163_I:%.*]] = call <8 x half> @llvm.aarch64.neon.vcmla.rot180.v8f16(<8 x half> [[ACC]], <8 x half> [[LHS]], <8 x half> [[TMP4]])
+// CHECK-NEXT:    ret <8 x half> [[VCMLAQ_ROT180_F163_I]]
+//
 float16x8_t test_vcmlaq_rot180_lane_f16(float16x8_t acc, float16x8_t lhs, float16x4_t rhs) {
   return vcmlaq_rot180_lane_f16(acc, lhs, rhs, 1);
 }
 
-// CHECK-LABEL: @test_vcmlaq_rot180_laneq_f16(
-// CHECK: [[CPLX:%.*]] = bitcast <8 x half> %rhs to <4 x i32>
-// CHECK: [[DUP:%.*]] = shufflevector <4 x i32> [[CPLX]], <4 x i32> poison, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-// CHECK: [[DUP_FLT:%.*]] = bitcast <4 x i32> [[DUP]] to <8 x half>
-// CHECK: [[RES:%.*]] = tail call <8 x half> @llvm.aarch64.neon.vcmla.rot180.v8f16(<8 x half> %acc, <8 x half> %lhs, <8 x half> [[DUP_FLT]])
-// CHECK: ret <8 x half> [[RES]]
+// CHECK-LABEL: define dso_local <8 x half> @test_vcmlaq_rot180_laneq_f16(
+// CHECK-SAME: <8 x half> noundef [[ACC:%.*]], <8 x half> noundef [[LHS:%.*]], <8 x half> noundef [[RHS:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[__REINT_164:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__REINT1_164:%.*]] = alloca <4 x i32>, align 16
+// CHECK-NEXT:    store <8 x half> [[RHS]], ptr [[__REINT_164]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load <4 x i32>, ptr [[__REINT_164]], align 16
+// CHECK-NEXT:    [[VGETQ_LANE:%.*]] = extractelement <4 x i32> [[TMP0]], i32 3
+// CHECK-NEXT:    [[VECINIT:%.*]] = insertelement <4 x i32> poison, i32 [[VGETQ_LANE]], i32 0
+// CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr [[__REINT_164]], align 16
+// CHECK-NEXT:    [[VGETQ_LANE3:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3
+// CHECK-NEXT:    [[VECINIT5:%.*]] = insertelement <4 x i32> [[VECINIT]], i32 [[VGETQ_LANE3]], i32 1
+// CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr [[__REINT_164]], align 16
+// CHECK-NEXT:    [[VGETQ_LANE8:%.*]] = extractelement <4 x i32> [[TMP2]], i32 3
+// CHECK-NEXT:    [[VECINIT10:%.*]] = insertelement <4 x i32> [[VECINIT5]], i32 [[VGETQ_LANE8]], i32 2
+// CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, ptr [[__REINT_164]], align 16
+// CHECK-NEXT:    [[VGETQ_LANE13:%.*]] = extractelement <4 x i32> [[TMP3]], i32 3
+// CHECK-NEXT:    [[VECINIT15:%.*]] = insertelement <4 x i32> [[VECINIT10]], i32 [[VGETQ_LANE13]], i32 3
+// CHECK-NEXT:    store <4 x i32> [[VECINIT15]], ptr [[__REINT1_164]], align 16
+// CHECK-NEXT:    [[TMP4:%.*]] = load <8 x half>, ptr [[__REINT1_164]], align 16
+// CHECK-NEXT:    [[VCMLAQ_ROT180_F163_I:%.*]] = call <8 x half> @llvm.aarch64.neon.vcmla.rot180.v8f16(<8 x half> [[ACC]], <8 x half> [[LHS]], <8 x half> [[TMP4]])
+// CHECK-NEXT:    ret <8 x half> [[VCMLAQ_ROT180_F163_I]]
+//
 float16x8_t test_vcmlaq_rot180_laneq_f16(float16x8_t acc, float16x8_t lhs, float16x8_t rhs) {
   return vcmlaq_rot180_laneq_f16(acc, lhs, rhs, 3);
 }
 
-// CHECK-LABEL: @test_vcmla_rot180_lane_f32(
-// CHECK: [[RES:%.*]] = tail call <2 x float> @llvm.aarch64.neon.vcmla.rot180.v2f32(<2 x float> %acc, <2 x float> %lhs, <2 x float> %rhs)
-// CHECK: ret <2 x float> [[RES]]
+// CHECK-LABEL: define dso_local <2 x float> @test_vcmla_rot180_lane_f32(
+// CHECK-SAME: <2 x float> noundef [[ACC:%.*]], <2 x float> noundef [[LHS:%.*]], <2 x float> noundef [[RHS:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[__REINT_190:%.*]] = alloca <2 x float>, align 8
+// CHECK-NEXT:    [[__REINT1_190:%.*]] = alloca <1 x i64>, align 8
+// CHECK-NEXT:    store <2 x float> [[RHS]], ptr [[__REINT_190]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = load <1 x i64>, ptr [[__REINT_190]], align 8
+// CHECK-NEXT:    [[VGET_LANE:%.*]] = extractelement <1 x i64> [[TMP0]], i32 0
+// CHECK-NEXT:    [[VECINIT:%.*]] = insertelement <1 x i64> poison, i64 [[VGET_LANE]], i32 0
+// CHECK-NEXT:    store <1 x i64> [[VECINIT]], ptr [[__REINT1_190]], align 8
+// CHECK-NEXT:    [[TMP1:%.*]] = load <2 x float>, ptr [[__REINT1_190]], align 8
+// CHECK-NEXT:    [[VCMLA_ROT180_F323_I:%.*]] = call <2 x float> @llvm.aarch64.neon.vcmla.rot180.v2f32(<2 x float> [[ACC]], <2 x float> [[LHS]], <2 x float> [[TMP1]])
+// CHECK-NEXT:    ret <2 x float> [[VCMLA_ROT180_F323_I]]
+//
 float32x2_t test_vcmla_rot180_lane_f32(float32x2_t acc, float32x2_t lhs, float32x2_t rhs) {
   return vcmla_rot180_lane_f32(acc, lhs, rhs, 0);
 }
 
 // ACLE says this exists, but it won't map to a single instruction if lane > 1.
-// CHECK-LABEL: @test_vcmla_rot180_laneq_f32(
-// CHECK: [[CPLX:%.*]] = bitcast <4 x float> %rhs to <2 x i64>
-// CHECK: [[DUP:%.*]] = shufflevector <2 x i64> [[CPLX]], <2 x i64> poison, <1 x i32> <i32 1>
-// CHECK: [[DUP_FLT:%.*]] = bitcast <1 x i64> [[DUP]] to <2 x float>
-// CHECK: [[RES:%.*]] = tail call <2 x float> @llvm.aarch64.neon.vcmla.rot180.v2f32(<2 x float> %acc, <2 x float> %lhs, <2 x float> [[DUP_FLT]])
-// CHECK: ret <2 x float> [[RES]]
+// CHECK-LABEL: define dso_local <2 x float> @test_vcmla_rot180_laneq_f32(
+// CHECK-SAME: <2 x float> noundef [[ACC:%.*]], <2 x float> noundef [[LHS:%.*]], <4 x float> noundef [[RHS:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[__REINT_194:%.*]] = alloca <4 x float>, align 16
+// CHECK-NEXT:    [[__REINT1_194:%.*]] = alloca <1 x i64>, align 8
+// CHECK-NEXT:    store <4 x float> [[RHS]], ptr [[__REINT_194]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load <2 x i64>, ptr [[__REINT_194]], align 16
+// CHECK-NEXT:    [[VGETQ_LANE:%.*]] = extractelement <2 x i64> [[TMP0]], i32 1
+// CHECK-NEXT:    [[VECINIT:%.*]] = insertelement <1 x i64> poison, i64 [[VGETQ_LANE]], i32 0
+// CHECK-NEXT:    store <1 x i64> [[VECINIT]], ptr [[__REINT1_194]], align 8
+// CHECK-NEXT:    [[TMP1:%.*]] = load <2 x float>, ptr [[__REINT1_194]], align 8
+// CHECK-NEXT:    [[VCMLA_ROT180_F323_I:%.*]] = call <2 x float> @llvm.aarch64.neon.vcmla.rot180.v2f32(<2 x float> [[ACC]], <2 x float> [[LHS]], <2 x float> [[TMP1]])
+// CHECK-NEXT:    ret <2 x float> [[VCMLA_ROT180_F323_I]]
+//
 float32x2_t test_vcmla_rot180_laneq_f32(float32x2_t acc, float32x2_t lhs, float32x4_t rhs) {
   return vcmla_rot180_laneq_f32(acc, lhs, rhs, 1);
 }
 
-// CHECK-LABEL: @test_vcmlaq_rot180_lane_f32(
-// CHECK: [[CPLX:%.*]] = bitcast <2 x float> %rhs to i64
-// CHECK: [[CPLX_VEC:%.*]] = insertelement <2 x i64> poison, i64 [[CPLX]], i64 0
-// CHECK: [[CPLX2:%.*]] = bitcast <2 x i64> [[CPLX_VEC]] to <4 x float>
-// CHECK: [[DUP:%.*]] = shufflevector <4 x float> [[CPLX2]], <4 x float> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
-// CHECK: [[RES:%.*]] = tail call <4 x float> @llvm.aarch64.neon.vcmla.rot180.v4f32(<4 x float> %acc, <4 x float> %lhs, <4 x float> [[DUP]])
-// CHECK: ret <4 x float> [[RES]]
+// CHECK-LABEL: define dso_local <4 x float> @test_vcmlaq_rot180_lane_f32(
+// CHECK-SAME: <4 x float> noundef [[ACC:%.*]], <4 x float> noundef [[LHS:%.*]], <2 x float> noundef [[RHS:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[__REINT_192:%.*]] = alloca <2 x float>, align 8
+// CHECK-NEXT:    [[__REINT1_192:%.*]] = alloca <2 x i64>, align 16
+// CHECK-NEXT:    store <2 x float> [[RHS]], ptr [[__REINT_192]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = load <1 x i64>, ptr [[__REINT_192]], align 8
+// CHECK-NEXT:    [[VGET_LANE:%.*]] = extractelement <1 x i64> [[TMP0]], i32 0
+// CHECK-NEXT:    [[VECINIT:%.*]] = insertelement <2 x i64> poison, i64 [[VGET_LANE]], i32 0
+// CHECK-NEXT:    [[TMP1:%.*]] = load <1 x i64>, ptr [[__REINT_192]], align 8
+// CHECK-NEXT:    [[VGET_LANE3:%.*]] = extractelement <1 x i64> [[TMP1]], i32 0
+// CHECK-NEXT:    [[VECINIT5:%.*]] = insertelement <2 x i64> [[VECINIT]], i64 [[VGET_LANE3]], i32 1
+// CHECK-NEXT:    store <2 x i64> [[VECINIT5]], ptr [[__REINT1_192]], align 16
+// CHECK-NEXT:    [[TMP2:%.*]] = load <4 x float>, ptr [[__REINT1_192]], align 16
+// CHECK-NEXT:    [[VCMLAQ_ROT180_F323_I:%.*]] = call <4 x float> @llvm.aarch64.neon.vcmla.rot180.v4f32(<4 x float> [[ACC]], <4 x float> [[LHS]], <4 x float> [[TMP2]])
+// CHECK-NEXT:    ret <4 x float> [[VCMLAQ_ROT180_F323_I]]
+//
 float32x4_t test_vcmlaq_rot180_lane_f32(float32x4_t acc, float32x4_t lhs, float32x2_t rhs) {
   return vcmlaq_rot180_lane_f32(acc, lhs, rhs, 0);
 }
 
-// CHECK-LABEL: @test_vcmlaq_rot180_laneq_f32(
-// CHECK: [[DUP:%.*]] = shufflevector <4 x float> %rhs, <4 x float> poison, <4 x i32> <i32 2, i32 3, i32 2, i32 3>
-// CHECK: [[RES:%.*]] = tail call <4 x float> @llvm.aarch64.neon.vcmla.rot180.v4f32(<4 x float> %acc, <4 x float> %lhs, <4 x float> [[DUP]])
-// CHECK: ret <4 x float> [[RES]]
+// CHECK-LABEL: define dso_local <4 x float> @test_vcmlaq_rot180_laneq_f32(
+// CHECK-SAME: <4 x float> noundef [[ACC:%.*]], <4 x float> noundef [[LHS:%.*]], <4 x float> noundef [[RHS:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[__REINT_196:%.*]] = alloca <4 x float>, align 16
+// CHECK-NEXT:    [[__REINT1_196:%.*]] = alloca <2 x i64>, align 16
+// CHECK-NEXT:    store <4 x float> [[RHS]], ptr [[__REINT_196]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load <2 x i64>, ptr [[__REINT_196]], align 16
+// CHECK-NEXT:    [[VGETQ_LANE:%.*]] = extractelement <2 x i64> [[TMP0]], i32 1
+// CHECK-NEXT:    [[VECINIT:%.*]] = insertelement <2 x i64> poison, i64 [[VGETQ_LANE]], i32 0
+// CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr [[__REINT_196]], align 16
+// CHECK-NEXT:    [[VGETQ_LANE3:%.*]] = extractelement <2 x i64> [[TMP1]], i32 1
+// CHECK-NEXT:    [[VECINIT5:%.*]] = insertelement <2 x i64> [[VECINIT]], i64 [[VGETQ_LANE3]], i32 1
+// CHECK-NEXT:    store <2 x i64> [[VECINIT5]], ptr [[__REINT1_196]], align 16
+// CHECK-NEXT:    [[TMP2:%.*]] = load <4 x float>, ptr [[__REINT1_196]], align 16
+// CHECK-NEXT:    [[VCMLAQ_ROT180_F323_I:%.*]] = call <4 x float> @llvm.aarch64.neon.vcmla.rot180.v4f32(<4 x float> [[ACC]], <4 x float> [[LHS]], <4 x float> [[TMP2]])
+// CHECK-NEXT:    ret <4 x float> [[VCMLAQ_ROT180_F323_I]]
+//
 float32x4_t test_vcmlaq_rot180_laneq_f32(float32x4_t acc, float32x4_t lhs, float32x4_t rhs) {
   return vcmlaq_rot180_laneq_f32(acc, lhs, rhs, 1);
 }
 
-// CHECK-LABEL: @test_vcmla_rot270_lane_f16(
-// CHECK: [[DUP:%.*]] = shufflevector <4 x half> %rhs, <4 x half> poison, <4 x i32> <i32 2, i32 3, i32 2, i32 3>
-// CHECK: [[RES:%.*]] = tail call <4 x half> @llvm.aarch64.neon.vcmla.rot270.v4f16(<4 x half> %acc, <4 x half> %lhs, <4 x half> [[DUP]])
-// CHECK: ret <4 x half> [[RES]]
+// CHECK-LABEL: define dso_local <4 x half> @test_vcmla_rot270_lane_f16(
+// CHECK-SAME: <4 x half> noundef [[ACC:%.*]], <4 x half> noundef [[LHS:%.*]], <4 x half> noundef [[RHS:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[__REINT_166:%.*]] = alloca <4 x half>, align 8
+// CHECK-NEXT:    [[__REINT1_166:%.*]] = alloca <2 x i32>, align 8
+// CHECK-NEXT:    store <4 x half> [[RHS]], ptr [[__REINT_166]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = load <2 x i32>, ptr [[__REINT_166]], align 8
+// CHECK-NEXT:    [[VGET_LANE:%.*]] = extractelement <2 x i32> [[TMP0]], i32 1
+// CHECK-NEXT:    [[VECINIT:%.*]] = insertelement <2 x i32> poison, i32 [[VGET_LANE]], i32 0
+// CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i32>, ptr [[__REINT_166]], align 8
+// CHECK-NEXT:    [[VGET_LANE3:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
+// CHECK-NEXT:    [[VECINIT5:%.*]] = insertelement <2 x i32> [[VECINIT]], i32 [[VGET_LANE3]], i32 1
+// CHECK-NEXT:    store <2 x i32> [[VECINIT5]], ptr [[__REINT1_166]], align 8
+// CHECK-NEXT:    [[TMP2:%.*]] = load <4 x half>, ptr [[__REINT1_166]], align 8
+// CHECK-NEXT:    [[VCMLA_ROT270_F163_I:%.*]] = call <4 x half> @llvm.aarch64.neon.vcmla.rot270.v4f16(<4 x half> [[ACC]], <4 x half> [[LHS]], <4 x half> [[TMP2]])
+// CHECK-NEXT:    ret <4 x half> [[VCMLA_ROT270_F163_I]]
+//
 float16x4_t test_vcmla_rot270_lane_f16(float16x4_t acc, float16x4_t lhs, float16x4_t rhs) {
   return vcmla_rot270_lane_f16(acc, lhs, rhs, 1);
 }
 
 // ACLE says this exists, but it won't map to a single instruction if lane > 1.
-// CHECK-LABEL: @test_vcmla_rot270_laneq_f16(
-// CHECK: [[CPLX:%.*]] = bitcast <8 x half> %rhs to <4 x i32>
-// CHECK: [[DUP:%.*]] = shufflevector <4 x i32> [[CPLX]], <4 x i32> poison, <2 x i32> <i32 3, i32 3>
-// CHECK: [[DUP_FLT:%.*]] = bitcast <2 x i32> [[DUP]] to <4 x half>
-// CHECK: [[RES:%.*]] = tail call <4 x half> @llvm.aarch64.neon.vcmla.rot270.v4f16(<4 x half> %acc, <4 x half> %lhs, <4 x half> [[DUP_FLT]])
-// CHECK: ret <4 x half> [[RES]]
+// CHECK-LABEL: define dso_local <4 x half> @test_vcmla_rot270_laneq_f16(
+// CHECK-SAME: <4 x half> noundef [[ACC:%.*]], <4 x half> noundef [[LHS:%.*]], <8 x half> noundef [[RHS:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[__REINT_170:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__REINT1_170:%.*]] = alloca <2 x i32>, align 8
+// CHECK-NEXT:    store <8 x half> [[RHS]], ptr [[__REINT_170]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load <4 x i32>, ptr [[__REINT_170]], align 16
+// CHECK-NEXT:    [[VGETQ_LANE:%.*]] = extractelement <4 x i32> [[TMP0]], i32 3
+// CHECK-NEXT:    [[VECINIT:%.*]] = insertelement <2 x i32> poison, i32 [[VGETQ_LANE]], i32 0
+// CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr [[__REINT_170]], align 16
+// CHECK-NEXT:    [[VGETQ_LANE3:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3
+// CHECK-NEXT:    [[VECINIT5:%.*]] = insertelement <2 x i32> [[VECINIT]], i32 [[VGETQ_LANE3]], i32 1
+// CHECK-NEXT:    store <2 x i32> [[VECINIT5]], ptr [[__REINT1_170]], align 8
+// CHECK-NEXT:    [[TMP2:%.*]] = load <4 x half>, ptr [[__REINT1_170]], align 8
+// CHECK-NEXT:    [[VCMLA_ROT270_F163_I:%.*]] = call <4 x half> @llvm.aarch64.neon.vcmla.rot270.v4f16(<4 x half> [[ACC]], <4 x half> [[LHS]], <4 x half> [[TMP2]])
+// CHECK-NEXT:    ret <4 x half> [[VCMLA_ROT270_F163_I]]
+//
 float16x4_t test_vcmla_rot270_laneq_f16(float16x4_t acc, float16x4_t lhs, float16x8_t rhs) {
   return vcmla_rot270_laneq_f16(acc, lhs, rhs, 3);
 }
 
-// CHECK-LABEL: @test_vcmlaq_rot270_lane_f16(
-// CHECK: [[DUP:%.*]] = shufflevector <4 x half> %rhs, <4 x half> poison, <8 x i32> <i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
-// CHECK: [[RES:%.*]] = tail call <8 x half> @llvm.aarch64.neon.vcmla.rot270.v8f16(<8 x half> %acc, <8 x half> %lhs, <8 x half> [[DUP]])
-// CHECK: ret <8 x half> [[RES]]
+// CHECK-LABEL: define dso_local <8 x half> @test_vcmlaq_rot270_lane_f16(
+// CHECK-SAME: <8 x half> noundef [[ACC:%.*]], <8 x half> noundef [[LHS:%.*]], <4 x half> noundef [[RHS:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[__REINT_168:%.*]] = alloca <4 x half>, align 8
+// CHECK-NEXT:    [[__REINT1_168:%.*]] = alloca <4 x i32>, align 16
+// CHECK-NEXT:    store <4 x half> [[RHS]], ptr [[__REINT_168]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = load <2 x i32>, ptr [[__REINT_168]], align 8
+// CHECK-NEXT:    [[VGET_LANE:%.*]] = extractelement <2 x i32> [[TMP0]], i32 1
+// CHECK-NEXT:    [[VECINIT:%.*]] = insertelement <4 x i32> poison, i32 [[VGET_LANE]], i32 0
+// CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i32>, ptr [[__REINT_168]], align 8
+// CHECK-NEXT:    [[VGET_LANE3:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
+// CHECK-NEXT:    [[VECINIT5:%.*]] = insertelement <4 x i32> [[VECINIT]], i32 [[VGET_LANE3]], i32 1
+// CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i32>, ptr [[__REINT_168]], align 8
+// CHECK-NEXT:    [[VGET_LANE8:%.*]] = extractelement <2 x i32> [[TMP2]], i32 1
+// CHECK-NEXT:    [[VECINIT10:%.*]] = insertelement <4 x i32> [[VECINIT5]], i32 [[VGET_LANE8]], i32 2
+// CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i32>, ptr [[__REINT_168]], align 8
+// CHECK-NEXT:    [[VGET_LANE13:%.*]] = extractelement <2 x i32> [[TMP3]], i32 1
+// CHECK-NEXT:    [[VECINIT15:%.*]] = insertelement <4 x i32> [[VECINIT10]], i32 [[VGET_LANE13]], i32 3
+// CHECK-NEXT:    store <4 x i32> [[VECINIT15]], ptr [[__REINT1_168]], align 16
+// CHECK-NEXT:    [[TMP4:%.*]] = load <8 x half>, ptr [[__REINT1_168]], align 16
+// CHECK-NEXT:    [[VCMLAQ_ROT270_F163_I:%.*]] = call <8 x half> @llvm.aarch64.neon.vcmla.rot270.v8f16(<8 x half> [[ACC]], <8 x half> [[LHS]], <8 x half> [[TMP4]])
+// CHECK-NEXT:    ret <8 x half> [[VCMLAQ_ROT270_F163_I]]
+//
 float16x8_t test_vcmlaq_rot270_lane_f16(float16x8_t acc, float16x8_t lhs, float16x4_t rhs) {
   return vcmlaq_rot270_lane_f16(acc, lhs, rhs, 1);
 }
 
-// CHECK-LABEL: @test_vcmlaq_rot270_laneq_f16(
-// CHECK: [[CPLX:%.*]] = bitcast <8 x half> %rhs to <4 x i32>
-// CHECK: [[DUP:%.*]] = shufflevector <4 x i32> [[CPLX]], <4 x i32> poison, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-// CHECK: [[DUP_FLT:%.*]] = bitcast <4 x i32> [[DUP]] to <8 x half>
-// CHECK: [[RES:%.*]] = tail call <8 x half> @llvm.aarch64.neon.vcmla.rot270.v8f16(<8 x half> %acc, <8 x half> %lhs, <8 x half> [[DUP_FLT]])
-// CHECK: ret <8 x half> [[RES]]
+// CHECK-LABEL: define dso_local <8 x half> @test_vcmlaq_rot270_laneq_f16(
+// CHECK-SAME: <8 x half> noundef [[ACC:%.*]], <8 x half> noundef [[LHS:%.*]], <8 x half> noundef [[RHS:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[__REINT_172:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__REINT1_172:%.*]] = alloca <4 x i32>, align 16
+// CHECK-NEXT:    store <8 x half> [[RHS]], ptr [[__REINT_172]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load <4 x i32>, ptr [[__REINT_172]], align 16
+// CHECK-NEXT:    [[VGETQ_LANE:%.*]] = extractelement <4 x i32> [[TMP0]], i32 3
+// CHECK-NEXT:    [[VECINIT:%.*]] = insertelement <4 x i32> poison, i32 [[VGETQ_LANE]], i32 0
+// CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr [[__REINT_172]], align 16
+// CHECK-NEXT:    [[VGETQ_LANE3:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3
+// CHECK-NEXT:    [[VECINIT5:%.*]] = insertelement <4 x i32> [[VECINIT]], i32 [[VGETQ_LANE3]], i32 1
+// CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr [[__REINT_172]], align 16
+// CHECK-NEXT:    [[VGETQ_LANE8:%.*]] = extractelement <4 x i32> [[TMP2]], i32 3
+// CHECK-NEXT:    [[VECINIT10:%.*]] = insertelement <4 x i32> [[VECINIT5]], i32 [[VGETQ_LANE8]], i32 2
+// CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, ptr [[__REINT_172]], align 16
+// CHECK-NEXT:    [[VGETQ_LANE13:%.*]] = extractelement <4 x i32> [[TMP3]], i32 3
+// CHECK-NEXT:    [[VECINIT15:%.*]] = insertelement <4 x i32> [[VECINIT10]], i32 [[VGETQ_LANE13]], i32 3
+// CHECK-NEXT:    store <4 x i32> [[VECINIT15]], ptr [[__REINT1_172]], align 16
+// CHECK-NEXT:    [[TMP4:%.*]] = load <8 x half>, ptr [[__REINT1_172]], align 16
+// CHECK-NEXT:    [[VCMLAQ_ROT270_F163_I:%.*]] = call <8 x half> @llvm.aarch64.neon.vcmla.rot270.v8f16(<8 x half> [[ACC]], <8 x half> [[LHS]], <8 x half> [[TMP4]])
+// CHECK-NEXT:    ret <8 x half> [[VCMLAQ_ROT270_F163_I]]
+//
 float16x8_t test_vcmlaq_rot270_laneq_f16(float16x8_t acc, float16x8_t lhs, float16x8_t rhs) {
   return vcmlaq_rot270_laneq_f16(acc, lhs, rhs, 3);
 }
 
-// CHECK-LABEL: @test_vcmla_rot270_lane_f32(
-// CHECK: [[RES:%.*]] = tail call <2 x float> @llvm.aarch64.neon.vcmla.rot270.v2f32(<2 x float> %acc, <2 x float> %lhs, <2 x float> %rhs)
-// CHECK: ret <2 x float> [[RES]]
+// CHECK-LABEL: define dso_local <2 x float> @test_vcmla_rot270_lane_f32(
+// CHECK-SAME: <2 x float> noundef [[ACC:%.*]], <2 x float> noundef [[LHS:%.*]], <2 x float> noundef [[RHS:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[__REINT_198:%.*]] = alloca <2 x float>, align 8
+// CHECK-NEXT:    [[__REINT1_198:%.*]] = alloca <1 x i64>, align 8
+// CHECK-NEXT:    store <2 x float> [[RHS]], ptr [[__REINT_198]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = load <1 x i64>, ptr [[__REINT_198]], align 8
+// CHECK-NEXT:    [[VGET_LANE:%.*]] = extractelement <1 x i64> [[TMP0]], i32 0
+// CHECK-NEXT:    [[VECINIT:%.*]] = insertelement <1 x i64> poison, i64 [[VGET_LANE]], i32 0
+// CHECK-NEXT:    store <1 x i64> [[VECINIT]], ptr [[__REINT1_198]], align 8
+// CHECK-NEXT:    [[TMP1:%.*]] = load <2 x float>, ptr [[__REINT1_198]], align 8
+// CHECK-NEXT:    [[VCMLA_ROT270_F323_I:%.*]] = call <2 x float> @llvm.aarch64.neon.vcmla.rot270.v2f32(<2 x float> [[ACC]], <2 x float> [[LHS]], <2 x float> [[TMP1]])
+// CHECK-NEXT:    ret <2 x float> [[VCMLA_ROT270_F323_I]]
+//
 float32x2_t test_vcmla_rot270_lane_f32(float32x2_t acc, float32x2_t lhs, float32x2_t rhs) {
   return vcmla_rot270_lane_f32(acc, lhs, rhs, 0);
 }
 
 // ACLE says this exists, but it won't map to a single instruction if lane > 1.
-// CHECK-LABEL: @test_vcmla_rot270_laneq_f32(
-// CHECK: [[CPLX:%.*]] = bitcast <4 x float> %rhs to <2 x i64>
-// CHECK: [[DUP:%.*]] = shufflevector <2 x i64> [[CPLX]], <2 x i64> poison, <1 x i32> <i32 1>
-// CHECK: [[DUP_FLT:%.*]] = bitcast <1 x i64> [[DUP]] to <2 x float>
-// CHECK: [[RES:%.*]] = tail call <2 x float> @llvm.aarch64.neon.vcmla.rot270.v2f32(<2 x float> %acc, <2 x float> %lhs, <2 x float> [[DUP_FLT]])
-// CHECK: ret <2 x float> [[RES]]
+// CHECK-LABEL: define dso_local <2 x float> @test_vcmla_rot270_laneq_f32(
+// CHECK-SAME: <2 x float> noundef [[ACC:%.*]], <2 x float> noundef [[LHS:%.*]], <4 x float> noundef [[RHS:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[__REINT_202:%.*]] = alloca <4 x float>, align 16
+// CHECK-NEXT:    [[__REINT1_202:%.*]] = alloca <1 x i64>, align 8
+// CHECK-NEXT:    store <4 x float> [[RHS]], ptr [[__REINT_202]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load <2 x i64>, ptr [[__REINT_202]], align 16
+// CHECK-NEXT:    [[VGETQ_LANE:%.*]] = extractelement <2 x i64> [[TMP0]], i32 1
+// CHECK-NEXT:    [[VECINIT:%.*]] = insertelement <1 x i64> poison, i64 [[VGETQ_LANE]], i32 0
+// CHECK-NEXT:    store <1 x i64> [[VECINIT]], ptr [[__REINT1_202]], align 8
+// CHECK-NEXT:    [[TMP1:%.*]] = load <2 x float>, ptr [[__REINT1_202]], align 8
+// CHECK-NEXT:    [[VCMLA_ROT270_F323_I:%.*]] = call <2 x float> @llvm.aarch64.neon.vcmla.rot270.v2f32(<2 x float> [[ACC]], <2 x float> [[LHS]], <2 x float> [[TMP1]])
+// CHECK-NEXT:    ret <2 x float> [[VCMLA_ROT270_F323_I]]
+//
 float32x2_t test_vcmla_rot270_laneq_f32(float32x2_t acc, float32x2_t lhs, float32x4_t rhs) {
   return vcmla_rot270_laneq_f32(acc, lhs, rhs, 1);
 }
 
-// CHECK-LABEL: @test_vcmlaq_rot270_lane_f32(
-// CHECK: [[CPLX:%.*]] = bitcast <2 x float> %rhs to i64
-// CHECK: [[CPLX_VEC:%.*]] = insertelement <2 x i64> poison, i64 [[CPLX]], i64 0
-// CHECK: [[CPLX2:%.*]] = bitcast <2 x i64> [[DUP]] to <4 x float>
-// CHECK: [[DUP:%.*]] = shufflevector <4 x float> [[CPLX2]], <4 x float> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
-// CHECK: [[RES:%.*]] = tail call <4 x float> @llvm.aarch64.neon.vcmla.rot270.v4f32(<4 x float> %acc, <4 x float> %lhs, <4 x float> [[DUP]])
-// CHECK: ret <4 x float> [[RES]]
+// CHECK-LABEL: define dso_local <4 x float> @test_vcmlaq_rot270_lane_f32(
+// CHECK-SAME: <4 x float> noundef [[ACC:%.*]], <4 x float> noundef [[LHS:%.*]], <2 x float> noundef [[RHS:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[__REINT_200:%.*]] = alloca <2 x float>, align 8
+// CHECK-NEXT:    [[__REINT1_200:%.*]] = alloca <2 x i64>, align 16
+// CHECK-NEXT:    store <2 x float> [[RHS]], ptr [[__REINT_200]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = load <1 x i64>, ptr [[__REINT_200]], align 8
+// CHECK-NEXT:    [[VGET_LANE:%.*]] = extractelement <1 x i64> [[TMP0]], i32 0
+// CHECK-NEXT:    [[VECINIT:%.*]] = insertelement <2 x i64> poison, i64 [[VGET_LANE]], i32 0
+// CHECK-NEXT:    [[TMP1:%.*]] = load <1 x i64>, ptr [[__REINT_200]], align 8
+// CHECK-NEXT:    [[VGET_LANE3:%.*]] = extractelement <1 x i64> [[TMP1]], i32 0
+// CHECK-NEXT:    [[VECINIT5:%.*]] = insertelement <2 x i64> [[VECINIT]], i64 [[VGET_LANE3]], i32 1
+// CHECK-NEXT:    store <2 x i64> [[VECINIT5]], ptr [[__REINT1_200]], align 16
+// CHECK-NEXT:    [[TMP2:%.*]] = load <4 x float>, ptr [[__REINT1_200]], align 16
+// CHECK-NEXT:    [[VCMLAQ_ROT270_F323_I:%.*]] = call <4 x float> @llvm.aarch64.neon.vcmla.rot270.v4f32(<4 x float> [[ACC]], <4 x float> [[LHS]], <4 x float> [[TMP2]])
+// CHECK-NEXT:    ret <4 x float> [[VCMLAQ_ROT270_F323_I]]
+//
 float32x4_t test_vcmlaq_rot270_lane_f32(float32x4_t acc, float32x4_t lhs, float32x2_t rhs) {
   return vcmlaq_rot270_lane_f32(acc, lhs, rhs, 0);
 }
 
-// CHECK-LABEL: @test_vcmlaq_rot270_laneq_f32(
-// CHECK: [[DUP:%.*]] = shufflevector <4 x float> %rhs, <4 x float> poison, <4 x i32> <i32 2, i32 3, i32 2, i32 3>
-// CHECK: [[RES:%.*]] = tail call <4 x float> @llvm.aarch64.neon.vcmla.rot270.v4f32(<4 x float> %acc, <4 x float> %lhs, <4 x float> [[DUP]])
-// CHECK: ret <4 x float> [[RES]]
+// CHECK-LABEL: define dso_local <4 x float> @test_vcmlaq_rot270_laneq_f32(
+// CHECK-SAME: <4 x float> noundef [[ACC:%.*]], <4 x float> noundef [[LHS:%.*]], <4 x float> noundef [[RHS:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[__REINT_204:%.*]] = alloca <4 x float>, align 16
+// CHECK-NEXT:    [[__REINT1_204:%.*]] = alloca <2 x i64>, align 16
+// CHECK-NEXT:    store <4 x float> [[RHS]], ptr [[__REINT_204]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load <2 x i64>, ptr [[__REINT_204]], align 16
+// CHECK-NEXT:    [[VGETQ_LANE:%.*]] = extractelement <2 x i64> [[TMP0]], i32 1
+// CHECK-NEXT:    [[VECINIT:%.*]] = insertelement <2 x i64> poison, i64 [[VGETQ_LANE]], i32 0
+// CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr [[__REINT_204]], align 16
+// CHECK-NEXT:    [[VGETQ_LANE3:%.*]] = extractelement <2 x i64> [[TMP1]], i32 1
+// CHECK-NEXT:    [[VECINIT5:%.*]] = insertelement <2 x i64> [[VECINIT]], i64 [[VGETQ_LANE3]], i32 1
+// CHECK-NEXT:    store <2 x i64> [[VECINIT5]], ptr [[__REINT1_204]], align 16
+// CHECK-NEXT:    [[TMP2:%.*]] = load <4 x float>, ptr [[__REINT1_204]], align 16
+// CHECK-NEXT:    [[VCMLAQ_ROT270_F323_I:%.*]] = call <4 x float> @llvm.aarch64.neon.vcmla.rot270.v4f32(<4 x float> [[ACC]], <4 x float> [[LHS]], <4 x float> [[TMP2]])
+// CHECK-NEXT:    ret <4 x float> [[VCMLAQ_ROT270_F323_I]]
+//
 float32x4_t test_vcmlaq_rot270_laneq_f32(float32x4_t acc, float32x4_t lhs, float32x4_t rhs) {
   return vcmlaq_rot270_laneq_f32(acc, lhs, rhs, 1);
 }
diff --git a/clang/test/CodeGen/AArch64/sincos.c b/clang/test/CodeGen/AArch64/sincos.c
new file mode 100644
index 0000000..b77d98c
--- /dev/null
+++ b/clang/test/CodeGen/AArch64/sincos.c
@@ -0,0 +1,44 @@
+// RUN: %clang_cc1 -triple=aarch64-gnu-linux -emit-llvm -O1 %s -o - | FileCheck --check-prefix=NO-MATH-ERRNO %s
+// RUN: %clang_cc1 -triple=aarch64-gnu-linux -emit-llvm -fmath-errno %s -o - | FileCheck --check-prefix=MATH-ERRNO %s
+
+// NO-MATH-ERRNO-LABEL: @sincos_f32
+//      NO-MATH-ERRNO:    [[SINCOS:%.*]] = tail call { float, float } @llvm.sincos.f32(float {{.*}})
+// NO-MATH-ERRNO-NEXT:    [[SIN:%.*]] = extractvalue { float, float } [[SINCOS]], 0
+// NO-MATH-ERRNO-NEXT:    [[COS:%.*]] = extractvalue { float, float } [[SINCOS]], 1
+// NO-MATH-ERRNO-NEXT:    store float [[SIN]], ptr {{.*}}, align 4, !alias.scope [[SINCOS_ALIAS_SCOPE:![0-9]+]]
+// NO-MATH-ERRNO-NEXT:    store float [[COS]], ptr {{.*}}, align 4, !noalias [[SINCOS_ALIAS_SCOPE]]
+//
+// MATH-ERRNO-LABEL: @sincos_f32
+//      MATH-ERRNO:    call void @sincosf(
+//
+void sincos_f32(float x, float* fp0, float* fp1) {
+  __builtin_sincosf(x, fp0, fp1);
+}
+
+// NO-MATH-ERRNO-LABEL: @sincos_f64
+//      NO-MATH-ERRNO:    [[SINCOS:%.*]] = tail call { double, double } @llvm.sincos.f64(double {{.*}})
+// NO-MATH-ERRNO-NEXT:    [[SIN:%.*]] = extractvalue { double, double } [[SINCOS]], 0
+// NO-MATH-ERRNO-NEXT:    [[COS:%.*]] = extractvalue { double, double } [[SINCOS]], 1
+// NO-MATH-ERRNO-NEXT:    store double [[SIN]], ptr {{.*}}, align 8, !alias.scope [[SINCOS_ALIAS_SCOPE:![0-9]+]]
+// NO-MATH-ERRNO-NEXT:    store double [[COS]], ptr {{.*}}, align 8, !noalias [[SINCOS_ALIAS_SCOPE]]
+//
+// MATH-ERRNO-LABEL: @sincos_f64
+//      MATH-ERRNO:    call void @sincos(
+//
+void sincos_f64(double x, double* dp0, double* dp1) {
+  __builtin_sincos(x, dp0, dp1);
+}
+
+// NO-MATH-ERRNO-LABEL: @sincos_f128
+//      NO-MATH-ERRNO:    [[SINCOS:%.*]] = tail call { fp128, fp128 } @llvm.sincos.f128(fp128 {{.*}})
+// NO-MATH-ERRNO-NEXT:    [[SIN:%.*]] = extractvalue { fp128, fp128 } [[SINCOS]], 0
+// NO-MATH-ERRNO-NEXT:    [[COS:%.*]] = extractvalue { fp128, fp128 } [[SINCOS]], 1
+// NO-MATH-ERRNO-NEXT:    store fp128 [[SIN]], ptr {{.*}}, align 16, !alias.scope [[SINCOS_ALIAS_SCOPE:![0-9]+]]
+// NO-MATH-ERRNO-NEXT:    store fp128 [[COS]], ptr {{.*}}, align 16, !noalias [[SINCOS_ALIAS_SCOPE]]
+//
+// MATH-ERRNO-LABEL: @sincos_f128
+//      MATH-ERRNO:    call void @sincosl(
+//
+void sincos_f128(long double x, long double* ldp0, long double* ldp1) {
+  __builtin_sincosl(x, ldp0, ldp1);
+}
diff --git a/clang/test/CodeGen/AArch64/sme-inline-callees-streaming-attrs.c b/clang/test/CodeGen/AArch64/sme-inline-callees-streaming-attrs.c
index ce6f203..2071e66 100644
--- a/clang/test/CodeGen/AArch64/sme-inline-callees-streaming-attrs.c
+++ b/clang/test/CodeGen/AArch64/sme-inline-callees-streaming-attrs.c
@@ -1,5 +1,5 @@
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -emit-llvm -target-feature +sme %s -DUSE_FLATTEN -o - | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -emit-llvm -target-feature +sme %s -DUSE_ALWAYS_INLINE_STMT -o - | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -emit-llvm -target-feature +sme -target-feature +sme2 %s -DUSE_FLATTEN -o - | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -emit-llvm -target-feature +sme -target-feature +sme2 %s -DUSE_ALWAYS_INLINE_STMT -o - | FileCheck %s
 
 // REQUIRES: aarch64-registered-target
 
@@ -20,6 +20,7 @@ void fn_streaming_compatible(void) __arm_streaming_compatible { was_inlined(); }
 void fn_streaming(void) __arm_streaming { was_inlined(); }
 __arm_locally_streaming void fn_locally_streaming(void) { was_inlined(); }
 __arm_new("za") void fn_streaming_new_za(void) __arm_streaming { was_inlined(); }
+__arm_new("zt0") void fn_streaming_new_zt0(void) __arm_streaming { was_inlined(); }
 
 FN_ATTR
 void caller(void) {
@@ -28,6 +29,7 @@ void caller(void) {
     STMT_ATTR fn_streaming();
     STMT_ATTR fn_locally_streaming();
     STMT_ATTR fn_streaming_new_za();
+    STMT_ATTR fn_streaming_new_zt0();
 }
 // CHECK-LABEL: void @caller()
 //  CHECK-NEXT: entry:
@@ -36,6 +38,7 @@ void caller(void) {
 //  CHECK-NEXT:   call void @fn_streaming
 //  CHECK-NEXT:   call void @fn_locally_streaming
 //  CHECK-NEXT:   call void @fn_streaming_new_za
+//  CHECK-NEXT:   call void @fn_streaming_new_zt0
 
 FN_ATTR void caller_streaming_compatible(void) __arm_streaming_compatible {
     STMT_ATTR fn();
@@ -43,6 +46,7 @@ FN_ATTR void caller_streaming_compatible(void) __arm_streaming_compatible {
     STMT_ATTR fn_streaming();
     STMT_ATTR fn_locally_streaming();
     STMT_ATTR fn_streaming_new_za();
+    STMT_ATTR fn_streaming_new_zt0();
 }
 // CHECK-LABEL: void @caller_streaming_compatible()
 //  CHECK-NEXT: entry:
@@ -51,6 +55,7 @@ FN_ATTR void caller_streaming_compatible(void) __arm_streaming_compatible {
 //  CHECK-NEXT:   call void @fn_streaming
 //  CHECK-NEXT:   call void @fn_locally_streaming
 //  CHECK-NEXT:   call void @fn_streaming_new_za
+//  CHECK-NEXT:   call void @fn_streaming_new_zt0
 
 FN_ATTR void caller_streaming(void) __arm_streaming {
     STMT_ATTR fn();
@@ -58,6 +63,7 @@ FN_ATTR void caller_streaming(void) __arm_streaming {
     STMT_ATTR fn_streaming();
     STMT_ATTR fn_locally_streaming();
     STMT_ATTR fn_streaming_new_za();
+    STMT_ATTR fn_streaming_new_zt0();
 }
 // CHECK-LABEL: void @caller_streaming()
 //  CHECK-NEXT: entry:
@@ -66,6 +72,7 @@ FN_ATTR void caller_streaming(void) __arm_streaming {
 //  CHECK-NEXT:   call void @was_inlined
 //  CHECK-NEXT:   call void @was_inlined
 //  CHECK-NEXT:   call void @fn_streaming_new_za
+//  CHECK-NEXT:   call void @fn_streaming_new_zt0
 
 FN_ATTR __arm_locally_streaming
 void caller_locally_streaming(void) {
@@ -74,6 +81,7 @@ void caller_locally_streaming(void) {
     STMT_ATTR fn_streaming();
     STMT_ATTR fn_locally_streaming();
     STMT_ATTR fn_streaming_new_za();
+    STMT_ATTR fn_streaming_new_zt0();
 }
 // CHECK-LABEL: void @caller_locally_streaming()
 //  CHECK-NEXT: entry:
@@ -82,3 +90,4 @@ void caller_locally_streaming(void) {
 //  CHECK-NEXT:   call void @was_inlined
 //  CHECK-NEXT:   call void @was_inlined
 //  CHECK-NEXT:   call void @fn_streaming_new_za
+//  CHECK-NEXT:   call void @fn_streaming_new_zt0
diff --git a/clang/test/CodeGen/AArch64/sme-inline-streaming-attrs.c b/clang/test/CodeGen/AArch64/sme-inline-streaming-attrs.c
index 9c3d08a..68102c9 100644
--- a/clang/test/CodeGen/AArch64/sme-inline-streaming-attrs.c
+++ b/clang/test/CodeGen/AArch64/sme-inline-streaming-attrs.c
@@ -1,7 +1,7 @@
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -S -o /dev/null -target-feature +sme -verify -DTEST_NONE %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -S -o /dev/null -target-feature +sme -verify -DTEST_COMPATIBLE %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -S -o /dev/null -target-feature +sme -verify -DTEST_STREAMING %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -S -o /dev/null -target-feature +sme -verify -DTEST_LOCALLY %s
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -S -o /dev/null -target-feature +sme -target-feature +sme2 -verify -DTEST_NONE %s
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -S -o /dev/null -target-feature +sme -target-feature +sme2 -verify -DTEST_COMPATIBLE %s
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -S -o /dev/null -target-feature +sme -target-feature +sme2 -verify -DTEST_STREAMING %s
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -S -o /dev/null -target-feature +sme -target-feature +sme2 -verify -DTEST_LOCALLY %s
 
 // REQUIRES: aarch64-registered-target
 
@@ -10,6 +10,8 @@ __ai void inlined_fn(void) {}
 __ai void inlined_fn_streaming_compatible(void) __arm_streaming_compatible {}
 __ai void inlined_fn_streaming(void) __arm_streaming {}
 __ai __arm_locally_streaming void inlined_fn_local(void) {}
+__ai __arm_new("za") void inlined_fn_za(void) {}
+__ai __arm_new("zt0") void inlined_fn_zt0(void) {}
 
 #ifdef TEST_NONE
 void caller(void) {
@@ -17,6 +19,8 @@ void caller(void) {
     inlined_fn_streaming_compatible();
     inlined_fn_streaming(); // expected-error {{always_inline function 'inlined_fn_streaming' and its caller 'caller' have mismatching streaming attributes}}
     inlined_fn_local(); // expected-error {{always_inline function 'inlined_fn_local' and its caller 'caller' have mismatching streaming attributes}}
+    inlined_fn_za(); // expected-error {{always_inline function 'inlined_fn_za' has new za state}}
+    inlined_fn_zt0(); // expected-error {{always_inline function 'inlined_fn_zt0' has new zt0 state}}
 }
 #endif
 
diff --git a/clang/test/CodeGen/AArch64/sme-intrinsics/acle_sme_state_funs.c b/clang/test/CodeGen/AArch64/sme-intrinsics/acle_sme_state_funs.c
index 9ba1527..72f2d17 100644
--- a/clang/test/CodeGen/AArch64/sme-intrinsics/acle_sme_state_funs.c
+++ b/clang/test/CodeGen/AArch64/sme-intrinsics/acle_sme_state_funs.c
@@ -6,34 +6,53 @@
 
 #include <arm_sme.h>
 
-// CHECK-LABEL: @test_in_streaming_mode(
+// CHECK-LABEL: @test_in_streaming_mode_streaming_compatible(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = tail call aarch64_sme_preservemost_from_x2 { i64, i64 } @__arm_sme_state() #[[ATTR3:[0-9]+]]
-// CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { i64, i64 } [[TMP0]], 0
-// CHECK-NEXT:    [[AND_I:%.*]] = and i64 [[TMP1]], 1
-// CHECK-NEXT:    [[TOBOOL_I:%.*]] = icmp ne i64 [[AND_I]], 0
-// CHECK-NEXT:    ret i1 [[TOBOOL_I]]
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call i1 @llvm.aarch64.sme.in.streaming.mode()
+// CHECK-NEXT:    ret i1 [[TMP0]]
 //
-// CPP-CHECK-LABEL: @_Z22test_in_streaming_modev(
+// CPP-CHECK-LABEL: @_Z43test_in_streaming_mode_streaming_compatiblev(
 // CPP-CHECK-NEXT:  entry:
-// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call aarch64_sme_preservemost_from_x2 { i64, i64 } @__arm_sme_state() #[[ATTR3:[0-9]+]]
-// CPP-CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { i64, i64 } [[TMP0]], 0
-// CPP-CHECK-NEXT:    [[AND_I:%.*]] = and i64 [[TMP1]], 1
-// CPP-CHECK-NEXT:    [[TOBOOL_I:%.*]] = icmp ne i64 [[AND_I]], 0
-// CPP-CHECK-NEXT:    ret i1 [[TOBOOL_I]]
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call i1 @llvm.aarch64.sme.in.streaming.mode()
+// CPP-CHECK-NEXT:    ret i1 [[TMP0]]
+//
+bool test_in_streaming_mode_streaming_compatible(void) __arm_streaming_compatible {
+  return __arm_in_streaming_mode();
+}
+
+// CHECK-LABEL: @test_in_streaming_mode_streaming(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    ret i1 true
+//
+// CPP-CHECK-LABEL: @_Z32test_in_streaming_mode_streamingv(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    ret i1 true
+//
+bool test_in_streaming_mode_streaming(void) __arm_streaming {
+//
+  return __arm_in_streaming_mode();
+}
+
+// CHECK-LABEL: @test_in_streaming_mode_non_streaming(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    ret i1 false
+//
+// CPP-CHECK-LABEL: @_Z36test_in_streaming_mode_non_streamingv(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    ret i1 false
 //
-bool test_in_streaming_mode(void) __arm_streaming_compatible {
+bool test_in_streaming_mode_non_streaming(void) {
   return __arm_in_streaming_mode();
 }
 
 // CHECK-LABEL: @test_za_disable(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    tail call void @__arm_za_disable() #[[ATTR3]]
+// CHECK-NEXT:    tail call void @__arm_za_disable() #[[ATTR7:[0-9]+]]
 // CHECK-NEXT:    ret void
 //
 // CPP-CHECK-LABEL: @_Z15test_za_disablev(
 // CPP-CHECK-NEXT:  entry:
-// CPP-CHECK-NEXT:    tail call void @__arm_za_disable() #[[ATTR3]]
+// CPP-CHECK-NEXT:    tail call void @__arm_za_disable() #[[ATTR7:[0-9]+]]
 // CPP-CHECK-NEXT:    ret void
 //
 void test_za_disable(void) __arm_streaming_compatible {
@@ -42,14 +61,14 @@ void test_za_disable(void) __arm_streaming_compatible {
 
 // CHECK-LABEL: @test_has_sme(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = tail call aarch64_sme_preservemost_from_x2 { i64, i64 } @__arm_sme_state() #[[ATTR3]]
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call aarch64_sme_preservemost_from_x2 { i64, i64 } @__arm_sme_state() #[[ATTR7]]
 // CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { i64, i64 } [[TMP0]], 0
 // CHECK-NEXT:    [[TOBOOL_I:%.*]] = icmp slt i64 [[TMP1]], 0
 // CHECK-NEXT:    ret i1 [[TOBOOL_I]]
 //
 // CPP-CHECK-LABEL: @_Z12test_has_smev(
 // CPP-CHECK-NEXT:  entry:
-// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call aarch64_sme_preservemost_from_x2 { i64, i64 } @__arm_sme_state() #[[ATTR3]]
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call aarch64_sme_preservemost_from_x2 { i64, i64 } @__arm_sme_state() #[[ATTR7]]
 // CPP-CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { i64, i64 } [[TMP0]], 0
 // CPP-CHECK-NEXT:    [[TOBOOL_I:%.*]] = icmp slt i64 [[TMP1]], 0
 // CPP-CHECK-NEXT:    ret i1 [[TOBOOL_I]]
@@ -72,12 +91,12 @@ void test_svundef_za(void) __arm_streaming_compatible __arm_out("za") {
 
 // CHECK-LABEL: @test_sc_memcpy(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[CALL:%.*]] = tail call ptr @__arm_sc_memcpy(ptr noundef [[DEST:%.*]], ptr noundef [[SRC:%.*]], i64 noundef [[N:%.*]]) #[[ATTR3]]
+// CHECK-NEXT:    [[CALL:%.*]] = tail call ptr @__arm_sc_memcpy(ptr noundef [[DEST:%.*]], ptr noundef [[SRC:%.*]], i64 noundef [[N:%.*]]) #[[ATTR7]]
 // CHECK-NEXT:    ret ptr [[CALL]]
 //
 // CPP-CHECK-LABEL: @_Z14test_sc_memcpyPvPKvm(
 // CPP-CHECK-NEXT:  entry:
-// CPP-CHECK-NEXT:    [[CALL:%.*]] = tail call ptr @__arm_sc_memcpy(ptr noundef [[DEST:%.*]], ptr noundef [[SRC:%.*]], i64 noundef [[N:%.*]]) #[[ATTR3]]
+// CPP-CHECK-NEXT:    [[CALL:%.*]] = tail call ptr @__arm_sc_memcpy(ptr noundef [[DEST:%.*]], ptr noundef [[SRC:%.*]], i64 noundef [[N:%.*]]) #[[ATTR7]]
 // CPP-CHECK-NEXT:    ret ptr [[CALL]]
 //
 void *test_sc_memcpy(void *dest, const void *src, size_t n) __arm_streaming_compatible {
@@ -86,12 +105,12 @@ void *test_sc_memcpy(void *dest, const void *src, size_t n) __arm_streaming_comp
 
 // CHECK-LABEL: @test_sc_memmove(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[CALL:%.*]] = tail call ptr @__arm_sc_memmove(ptr noundef [[DEST:%.*]], ptr noundef [[SRC:%.*]], i64 noundef [[N:%.*]]) #[[ATTR3]]
+// CHECK-NEXT:    [[CALL:%.*]] = tail call ptr @__arm_sc_memmove(ptr noundef [[DEST:%.*]], ptr noundef [[SRC:%.*]], i64 noundef [[N:%.*]]) #[[ATTR7]]
 // CHECK-NEXT:    ret ptr [[CALL]]
 //
 // CPP-CHECK-LABEL: @_Z15test_sc_memmovePvPKvm(
 // CPP-CHECK-NEXT:  entry:
-// CPP-CHECK-NEXT:    [[CALL:%.*]] = tail call ptr @__arm_sc_memmove(ptr noundef [[DEST:%.*]], ptr noundef [[SRC:%.*]], i64 noundef [[N:%.*]]) #[[ATTR3]]
+// CPP-CHECK-NEXT:    [[CALL:%.*]] = tail call ptr @__arm_sc_memmove(ptr noundef [[DEST:%.*]], ptr noundef [[SRC:%.*]], i64 noundef [[N:%.*]]) #[[ATTR7]]
 // CPP-CHECK-NEXT:    ret ptr [[CALL]]
 //
 void *test_sc_memmove(void *dest, const void *src, size_t n) __arm_streaming_compatible {
@@ -100,12 +119,12 @@ void *test_sc_memmove(void *dest, const void *src, size_t n) __arm_streaming_com
 
 // CHECK-LABEL: @test_sc_memset(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[CALL:%.*]] = tail call ptr @__arm_sc_memset(ptr noundef [[S:%.*]], i32 noundef [[C:%.*]], i64 noundef [[N:%.*]]) #[[ATTR3]]
+// CHECK-NEXT:    [[CALL:%.*]] = tail call ptr @__arm_sc_memset(ptr noundef [[S:%.*]], i32 noundef [[C:%.*]], i64 noundef [[N:%.*]]) #[[ATTR7]]
 // CHECK-NEXT:    ret ptr [[CALL]]
 //
 // CPP-CHECK-LABEL: @_Z14test_sc_memsetPvim(
 // CPP-CHECK-NEXT:  entry:
-// CPP-CHECK-NEXT:    [[CALL:%.*]] = tail call ptr @__arm_sc_memset(ptr noundef [[S:%.*]], i32 noundef [[C:%.*]], i64 noundef [[N:%.*]]) #[[ATTR3]]
+// CPP-CHECK-NEXT:    [[CALL:%.*]] = tail call ptr @__arm_sc_memset(ptr noundef [[S:%.*]], i32 noundef [[C:%.*]], i64 noundef [[N:%.*]]) #[[ATTR7]]
 // CPP-CHECK-NEXT:    ret ptr [[CALL]]
 //
 void *test_sc_memset(void *s, int c, size_t n) __arm_streaming_compatible {
@@ -114,12 +133,12 @@ void *test_sc_memset(void *s, int c, size_t n) __arm_streaming_compatible {
 
 // CHECK-LABEL: @test_sc_memchr(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[CALL:%.*]] = tail call ptr @__arm_sc_memchr(ptr noundef [[S:%.*]], i32 noundef [[C:%.*]], i64 noundef [[N:%.*]]) #[[ATTR3]]
+// CHECK-NEXT:    [[CALL:%.*]] = tail call ptr @__arm_sc_memchr(ptr noundef [[S:%.*]], i32 noundef [[C:%.*]], i64 noundef [[N:%.*]]) #[[ATTR7]]
 // CHECK-NEXT:    ret ptr [[CALL]]
 //
 // CPP-CHECK-LABEL: @_Z14test_sc_memchrPvim(
 // CPP-CHECK-NEXT:  entry:
-// CPP-CHECK-NEXT:    [[CALL:%.*]] = tail call ptr @__arm_sc_memchr(ptr noundef [[S:%.*]], i32 noundef [[C:%.*]], i64 noundef [[N:%.*]]) #[[ATTR3]]
+// CPP-CHECK-NEXT:    [[CALL:%.*]] = tail call ptr @__arm_sc_memchr(ptr noundef [[S:%.*]], i32 noundef [[C:%.*]], i64 noundef [[N:%.*]]) #[[ATTR7]]
 // CPP-CHECK-NEXT:    ret ptr [[CALL]]
 //
 void *test_sc_memchr(void *s, int c, size_t n) __arm_streaming_compatible {
diff --git a/clang/test/CodeGen/X86/math-builtins.c b/clang/test/CodeGen/X86/math-builtins.c
index bf10743..d7bf7d5 100644
--- a/clang/test/CodeGen/X86/math-builtins.c
+++ b/clang/test/CodeGen/X86/math-builtins.c
@@ -38,6 +38,31 @@ void foo(double *d, float f, float *fp, long double *l, int *i, const char *c) {
 // NO__ERRNO-NEXT: [[FREXP_F128_0:%.+]] = extractvalue { fp128, i32 } [[FREXP_F128]], 0
 
 
+// NO__ERRNO: [[SINCOS_F64:%.+]] = call { double, double } @llvm.sincos.f64(double %{{.+}})
+// NO__ERRNO-NEXT: [[SINCOS_F64_0:%.+]] = extractvalue { double, double } [[SINCOS_F64]], 0
+// NO__ERRNO-NEXT: [[SINCOS_F64_1:%.+]] = extractvalue { double, double } [[SINCOS_F64]], 1
+// NO__ERRNO-NEXT: store double [[SINCOS_F64_0]], ptr %{{.+}}, align 8
+// NO__ERRNO-NEXT: store double [[SINCOS_F64_1]], ptr %{{.+}}, align 8
+
+// NO__ERRNO: [[SINCOS_F32:%.+]] = call { float, float } @llvm.sincos.f32(float %{{.+}})
+// NO__ERRNO-NEXT: [[SINCOS_F32_0:%.+]] = extractvalue { float, float } [[SINCOS_F32]], 0
+// NO__ERRNO-NEXT: [[SINCOS_F32_1:%.+]] = extractvalue { float, float } [[SINCOS_F32]], 1
+// NO__ERRNO-NEXT: store float [[SINCOS_F32_0]], ptr %{{.+}}, align 4
+// NO__ERRNO-NEXT: store float [[SINCOS_F32_1]], ptr %{{.+}}, align 4
+
+// NO__ERRNO: [[SINCOS_F80:%.+]] = call { x86_fp80, x86_fp80 } @llvm.sincos.f80(x86_fp80 %{{.+}})
+// NO__ERRNO-NEXT: [[SINCOS_F80_0:%.+]] = extractvalue { x86_fp80, x86_fp80 } [[SINCOS_F80]], 0
+// NO__ERRNO-NEXT: [[SINCOS_F80_1:%.+]] = extractvalue { x86_fp80, x86_fp80 } [[SINCOS_F80]], 1
+// NO__ERRNO-NEXT: store x86_fp80 [[SINCOS_F80_0]], ptr %{{.+}}, align 16
+// NO__ERRNO-NEXT: store x86_fp80 [[SINCOS_F80_1]], ptr %{{.+}}, align 16
+
+// NO__ERRNO: [[SINCOS_F128:%.+]] = call { fp128, fp128 } @llvm.sincos.f128(fp128 %{{.+}})
+// NO__ERRNO-NEXT: [[SINCOS_F128_0:%.+]] = extractvalue { fp128, fp128 } [[SINCOS_F128]], 0
+// NO__ERRNO-NEXT: [[SINCOS_F128_1:%.+]] = extractvalue { fp128, fp128 } [[SINCOS_F128]], 1
+// NO__ERRNO-NEXT: store fp128 [[SINCOS_F128_0]], ptr %{{.+}}, align 16
+// NO__ERRNO-NEXT: store fp128 [[SINCOS_F128_1]], ptr %{{.+}}, align 16
+
+
 // HAS_ERRNO: declare double @fmod(double noundef, double noundef) [[NOT_READNONE:#[0-9]+]]
 // HAS_ERRNO: declare float @fmodf(float noundef, float noundef) [[NOT_READNONE]]
 // HAS_ERRNO: declare x86_fp80 @fmodl(x86_fp80 noundef, x86_fp80 noundef) [[NOT_READNONE]]
@@ -665,6 +690,16 @@ __builtin_sinh(f);       __builtin_sinhf(f);      __builtin_sinhl(f); __builtin_
 // HAS_ERRNO: declare x86_fp80 @sinhl(x86_fp80 noundef) [[NOT_READNONE]]
 // HAS_ERRNO: declare fp128 @sinhf128(fp128 noundef) [[NOT_READNONE]]
 
+__builtin_sincos(f,d,d); __builtin_sincosf(f,fp,fp); __builtin_sincosl(f,l,l); __builtin_sincosf128(f,l,l);
+// NO__ERRNO: declare { double, double } @llvm.sincos.f64(double) [[READNONE_INTRINSIC]]
+// NO__ERRNO: declare { float, float } @llvm.sincos.f32(float) [[READNONE_INTRINSIC]]
+// NO__ERRNO: declare { x86_fp80, x86_fp80 } @llvm.sincos.f80(x86_fp80) [[READNONE_INTRINSIC]]
+// NO__ERRNO: declare { fp128, fp128 } @llvm.sincos.f128(fp128) [[READNONE_INTRINSIC]]
+// HAS_ERRNO: declare void @sincos(double noundef, ptr noundef, ptr noundef) [[NOT_READNONE]]
+// HAS_ERRNO: declare void @sincosf(float noundef, ptr noundef, ptr noundef) [[NOT_READNONE]]
+// HAS_ERRNO: declare void @sincosl(x86_fp80 noundef, ptr noundef, ptr noundef) [[NOT_READNONE]]
+// HAS_ERRNO: declare void @sincosf128(fp128 noundef, ptr noundef, ptr noundef) [[NOT_READNONE]]
+
 __builtin_sqrt(f);       __builtin_sqrtf(f);      __builtin_sqrtl(f); __builtin_sqrtf128(f);
 
 // NO__ERRNO: declare double @llvm.sqrt.f64(double) [[READNONE_INTRINSIC]]
@@ -733,4 +768,3 @@ __builtin_trunc(f);      __builtin_truncf(f);     __builtin_truncl(f); __builtin
 
 // HAS_ERRNO_GNU: attributes [[READNONE_INTRINSIC]] = { {{.*}}memory(none){{.*}} }
 // HAS_ERRNO_WIN: attributes [[READNONE_INTRINSIC]] = { {{.*}}memory(none){{.*}} }
-
diff --git a/clang/test/CodeGen/attr-target-clones-aarch64.c b/clang/test/CodeGen/attr-target-clones-aarch64.c
index 6b7acbb..b7e3a32 100644
--- a/clang/test/CodeGen/attr-target-clones-aarch64.c
+++ b/clang/test/CodeGen/attr-target-clones-aarch64.c
@@ -64,20 +64,20 @@ inline int __attribute__((target_clones("fp16", "sve2-bitperm+fcma", "default"))
 // CHECK-NEXT:  resolver_entry:
 // CHECK-NEXT:    call void @__init_cpu_features_resolver()
 // CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
-// CHECK-NEXT:    [[TMP1:%.*]] = and i64 [[TMP0]], 33664
-// CHECK-NEXT:    [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 33664
+// CHECK-NEXT:    [[TMP1:%.*]] = and i64 [[TMP0]], 69793284352
+// CHECK-NEXT:    [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 69793284352
 // CHECK-NEXT:    [[TMP3:%.*]] = and i1 true, [[TMP2]]
 // CHECK-NEXT:    br i1 [[TMP3]], label [[RESOLVER_RETURN:%.*]], label [[RESOLVER_ELSE:%.*]]
 // CHECK:       resolver_return:
-// CHECK-NEXT:    ret ptr @ftc._MaesMlse
+// CHECK-NEXT:    ret ptr @ftc._Msve2
 // CHECK:       resolver_else:
 // CHECK-NEXT:    [[TMP4:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
-// CHECK-NEXT:    [[TMP5:%.*]] = and i64 [[TMP4]], 69793284352
-// CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[TMP5]], 69793284352
+// CHECK-NEXT:    [[TMP5:%.*]] = and i64 [[TMP4]], 33664
+// CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[TMP5]], 33664
 // CHECK-NEXT:    [[TMP7:%.*]] = and i1 true, [[TMP6]]
 // CHECK-NEXT:    br i1 [[TMP7]], label [[RESOLVER_RETURN1:%.*]], label [[RESOLVER_ELSE2:%.*]]
 // CHECK:       resolver_return1:
-// CHECK-NEXT:    ret ptr @ftc._Msve2
+// CHECK-NEXT:    ret ptr @ftc._MaesMlse
 // CHECK:       resolver_else2:
 // CHECK-NEXT:    ret ptr @ftc.default
 //
@@ -252,56 +252,56 @@ inline int __attribute__((target_clones("fp16", "sve2-bitperm+fcma", "default"))
 //
 // CHECK: Function Attrs: noinline nounwind optnone
 // CHECK-LABEL: define {{[^@]+}}@ftc.default
-// CHECK-SAME: () #[[ATTR9:[0-9]+]] {
+// CHECK-SAME: () #[[ATTR8]] {
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    ret i32 0
 //
 //
 // CHECK: Function Attrs: noinline nounwind optnone
 // CHECK-LABEL: define {{[^@]+}}@ftc_def.default
-// CHECK-SAME: () #[[ATTR9]] {
+// CHECK-SAME: () #[[ATTR8]] {
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    ret i32 1
 //
 //
 // CHECK: Function Attrs: noinline nounwind optnone
 // CHECK-LABEL: define {{[^@]+}}@ftc_dup1.default
-// CHECK-SAME: () #[[ATTR9]] {
+// CHECK-SAME: () #[[ATTR8]] {
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    ret i32 2
 //
 //
 // CHECK: Function Attrs: noinline nounwind optnone
 // CHECK-LABEL: define {{[^@]+}}@ftc_dup2.default
-// CHECK-SAME: () #[[ATTR9]] {
+// CHECK-SAME: () #[[ATTR8]] {
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    ret i32 3
 //
 //
 // CHECK: Function Attrs: noinline nounwind optnone
 // CHECK-LABEL: define {{[^@]+}}@ftc_dup3.default
-// CHECK-SAME: () #[[ATTR9]] {
+// CHECK-SAME: () #[[ATTR8]] {
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    ret i32 4
 //
 //
 // CHECK: Function Attrs: noinline nounwind optnone
 // CHECK-LABEL: define {{[^@]+}}@ftc_inline2._Mfp16
-// CHECK-SAME: () #[[ATTR10:[0-9]+]] {
+// CHECK-SAME: () #[[ATTR9:[0-9]+]] {
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    ret i32 2
 //
 //
 // CHECK: Function Attrs: noinline nounwind optnone
 // CHECK-LABEL: define {{[^@]+}}@ftc_inline2._MfcmaMsve2-bitperm
-// CHECK-SAME: () #[[ATTR11:[0-9]+]] {
+// CHECK-SAME: () #[[ATTR10:[0-9]+]] {
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    ret i32 2
 //
 //
 // CHECK: Function Attrs: noinline nounwind optnone
 // CHECK-LABEL: define {{[^@]+}}@ftc_inline2.default
-// CHECK-SAME: () #[[ATTR9]] {
+// CHECK-SAME: () #[[ATTR8]] {
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    ret i32 2
 //
@@ -330,28 +330,28 @@ inline int __attribute__((target_clones("fp16", "sve2-bitperm+fcma", "default"))
 //
 // CHECK: Function Attrs: noinline nounwind optnone
 // CHECK-LABEL: define {{[^@]+}}@ftc_inline1._MrngMsimd
-// CHECK-SAME: () #[[ATTR12:[0-9]+]] {
+// CHECK-SAME: () #[[ATTR11:[0-9]+]] {
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    ret i32 1
 //
 //
 // CHECK: Function Attrs: noinline nounwind optnone
 // CHECK-LABEL: define {{[^@]+}}@ftc_inline1._MpredresMrcpc
-// CHECK-SAME: () #[[ATTR13:[0-9]+]] {
+// CHECK-SAME: () #[[ATTR12:[0-9]+]] {
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    ret i32 1
 //
 //
 // CHECK: Function Attrs: noinline nounwind optnone
 // CHECK-LABEL: define {{[^@]+}}@ftc_inline1._Msve2-aesMwfxt
-// CHECK-SAME: () #[[ATTR14:[0-9]+]] {
+// CHECK-SAME: () #[[ATTR13:[0-9]+]] {
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    ret i32 1
 //
 //
 // CHECK: Function Attrs: noinline nounwind optnone
 // CHECK-LABEL: define {{[^@]+}}@ftc_inline1.default
-// CHECK-SAME: () #[[ATTR9]] {
+// CHECK-SAME: () #[[ATTR8]] {
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    ret i32 1
 //
@@ -395,14 +395,14 @@ inline int __attribute__((target_clones("fp16", "sve2-bitperm+fcma", "default"))
 //
 // CHECK: Function Attrs: noinline nounwind optnone
 // CHECK-LABEL: define {{[^@]+}}@ftc_inline3._MsbMsve
-// CHECK-SAME: () #[[ATTR15:[0-9]+]] {
+// CHECK-SAME: () #[[ATTR14:[0-9]+]] {
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    ret i32 3
 //
 //
 // CHECK: Function Attrs: noinline nounwind optnone
 // CHECK-LABEL: define {{[^@]+}}@ftc_inline3.default
-// CHECK-SAME: () #[[ATTR9]] {
+// CHECK-SAME: () #[[ATTR8]] {
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    ret i32 3
 //
@@ -411,20 +411,20 @@ inline int __attribute__((target_clones("fp16", "sve2-bitperm+fcma", "default"))
 // CHECK-NEXT:  resolver_entry:
 // CHECK-NEXT:    call void @__init_cpu_features_resolver()
 // CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
-// CHECK-NEXT:    [[TMP1:%.*]] = and i64 [[TMP0]], 70369817985280
-// CHECK-NEXT:    [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 70369817985280
+// CHECK-NEXT:    [[TMP1:%.*]] = and i64 [[TMP0]], 1125899906842624
+// CHECK-NEXT:    [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 1125899906842624
 // CHECK-NEXT:    [[TMP3:%.*]] = and i1 true, [[TMP2]]
 // CHECK-NEXT:    br i1 [[TMP3]], label [[RESOLVER_RETURN:%.*]], label [[RESOLVER_ELSE:%.*]]
 // CHECK:       resolver_return:
-// CHECK-NEXT:    ret ptr @ftc_inline3._MsbMsve
+// CHECK-NEXT:    ret ptr @ftc_inline3._Mbti
 // CHECK:       resolver_else:
 // CHECK-NEXT:    [[TMP4:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
-// CHECK-NEXT:    [[TMP5:%.*]] = and i64 [[TMP4]], 1125899906842624
-// CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[TMP5]], 1125899906842624
+// CHECK-NEXT:    [[TMP5:%.*]] = and i64 [[TMP4]], 70369817985280
+// CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[TMP5]], 70369817985280
 // CHECK-NEXT:    [[TMP7:%.*]] = and i1 true, [[TMP6]]
 // CHECK-NEXT:    br i1 [[TMP7]], label [[RESOLVER_RETURN1:%.*]], label [[RESOLVER_ELSE2:%.*]]
 // CHECK:       resolver_return1:
-// CHECK-NEXT:    ret ptr @ftc_inline3._Mbti
+// CHECK-NEXT:    ret ptr @ftc_inline3._MsbMsve
 // CHECK:       resolver_else2:
 // CHECK-NEXT:    ret ptr @ftc_inline3.default
 //
@@ -521,20 +521,20 @@ inline int __attribute__((target_clones("fp16", "sve2-bitperm+fcma", "default"))
 // CHECK-MTE-BTI-NEXT:  resolver_entry:
 // CHECK-MTE-BTI-NEXT:    call void @__init_cpu_features_resolver()
 // CHECK-MTE-BTI-NEXT:    [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
-// CHECK-MTE-BTI-NEXT:    [[TMP1:%.*]] = and i64 [[TMP0]], 33664
-// CHECK-MTE-BTI-NEXT:    [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 33664
+// CHECK-MTE-BTI-NEXT:    [[TMP1:%.*]] = and i64 [[TMP0]], 69793284352
+// CHECK-MTE-BTI-NEXT:    [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 69793284352
 // CHECK-MTE-BTI-NEXT:    [[TMP3:%.*]] = and i1 true, [[TMP2]]
 // CHECK-MTE-BTI-NEXT:    br i1 [[TMP3]], label [[RESOLVER_RETURN:%.*]], label [[RESOLVER_ELSE:%.*]]
 // CHECK-MTE-BTI:       resolver_return:
-// CHECK-MTE-BTI-NEXT:    ret ptr @ftc._MaesMlse
+// CHECK-MTE-BTI-NEXT:    ret ptr @ftc._Msve2
 // CHECK-MTE-BTI:       resolver_else:
 // CHECK-MTE-BTI-NEXT:    [[TMP4:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
-// CHECK-MTE-BTI-NEXT:    [[TMP5:%.*]] = and i64 [[TMP4]], 69793284352
-// CHECK-MTE-BTI-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[TMP5]], 69793284352
+// CHECK-MTE-BTI-NEXT:    [[TMP5:%.*]] = and i64 [[TMP4]], 33664
+// CHECK-MTE-BTI-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[TMP5]], 33664
 // CHECK-MTE-BTI-NEXT:    [[TMP7:%.*]] = and i1 true, [[TMP6]]
 // CHECK-MTE-BTI-NEXT:    br i1 [[TMP7]], label [[RESOLVER_RETURN1:%.*]], label [[RESOLVER_ELSE2:%.*]]
 // CHECK-MTE-BTI:       resolver_return1:
-// CHECK-MTE-BTI-NEXT:    ret ptr @ftc._Msve2
+// CHECK-MTE-BTI-NEXT:    ret ptr @ftc._MaesMlse
 // CHECK-MTE-BTI:       resolver_else2:
 // CHECK-MTE-BTI-NEXT:    ret ptr @ftc.default
 //
@@ -548,7 +548,7 @@ inline int __attribute__((target_clones("fp16", "sve2-bitperm+fcma", "default"))
 //
 // CHECK-MTE-BTI: Function Attrs: noinline nounwind optnone
 // CHECK-MTE-BTI-LABEL: define {{[^@]+}}@ftc_def._MmemtagMsha2
-// CHECK-MTE-BTI-SAME: () #[[ATTR2]] {
+// CHECK-MTE-BTI-SAME: () #[[ATTR3:[0-9]+]] {
 // CHECK-MTE-BTI-NEXT:  entry:
 // CHECK-MTE-BTI-NEXT:    ret i32 1
 //
@@ -598,14 +598,14 @@ inline int __attribute__((target_clones("fp16", "sve2-bitperm+fcma", "default"))
 //
 // CHECK-MTE-BTI: Function Attrs: noinline nounwind optnone
 // CHECK-MTE-BTI-LABEL: define {{[^@]+}}@ftc_dup2._Mfp
-// CHECK-MTE-BTI-SAME: () #[[ATTR3:[0-9]+]] {
+// CHECK-MTE-BTI-SAME: () #[[ATTR4:[0-9]+]] {
 // CHECK-MTE-BTI-NEXT:  entry:
 // CHECK-MTE-BTI-NEXT:    ret i32 3
 //
 //
 // CHECK-MTE-BTI: Function Attrs: noinline nounwind optnone
 // CHECK-MTE-BTI-LABEL: define {{[^@]+}}@ftc_dup2._McrcMdotprod
-// CHECK-MTE-BTI-SAME: () #[[ATTR4:[0-9]+]] {
+// CHECK-MTE-BTI-SAME: () #[[ATTR5:[0-9]+]] {
 // CHECK-MTE-BTI-NEXT:  entry:
 // CHECK-MTE-BTI-NEXT:    ret i32 3
 //
@@ -634,14 +634,14 @@ inline int __attribute__((target_clones("fp16", "sve2-bitperm+fcma", "default"))
 //
 // CHECK-MTE-BTI: Function Attrs: noinline nounwind optnone
 // CHECK-MTE-BTI-LABEL: define {{[^@]+}}@ftc_dup3._Mmemtag
-// CHECK-MTE-BTI-SAME: () #[[ATTR5:[0-9]+]] {
+// CHECK-MTE-BTI-SAME: () #[[ATTR6:[0-9]+]] {
 // CHECK-MTE-BTI-NEXT:  entry:
 // CHECK-MTE-BTI-NEXT:    ret i32 4
 //
 //
 // CHECK-MTE-BTI: Function Attrs: noinline nounwind optnone
 // CHECK-MTE-BTI-LABEL: define {{[^@]+}}@ftc_dup3._Mbti
-// CHECK-MTE-BTI-SAME: () #[[ATTR5]] {
+// CHECK-MTE-BTI-SAME: () #[[ATTR7:[0-9]+]] {
 // CHECK-MTE-BTI-NEXT:  entry:
 // CHECK-MTE-BTI-NEXT:    ret i32 4
 //
@@ -670,7 +670,7 @@ inline int __attribute__((target_clones("fp16", "sve2-bitperm+fcma", "default"))
 //
 // CHECK-MTE-BTI: Function Attrs: noinline nounwind optnone
 // CHECK-MTE-BTI-LABEL: define {{[^@]+}}@foo
-// CHECK-MTE-BTI-SAME: () #[[ATTR6:[0-9]+]] {
+// CHECK-MTE-BTI-SAME: () #[[ATTR8:[0-9]+]] {
 // CHECK-MTE-BTI-NEXT:  entry:
 // CHECK-MTE-BTI-NEXT:    [[CALL:%.*]] = call i32 @ftc()
 // CHECK-MTE-BTI-NEXT:    [[CALL1:%.*]] = call i32 @ftc_def()
@@ -686,14 +686,14 @@ inline int __attribute__((target_clones("fp16", "sve2-bitperm+fcma", "default"))
 //
 // CHECK-MTE-BTI: Function Attrs: noinline nounwind optnone
 // CHECK-MTE-BTI-LABEL: define {{[^@]+}}@ftc_direct
-// CHECK-MTE-BTI-SAME: () #[[ATTR6]] {
+// CHECK-MTE-BTI-SAME: () #[[ATTR8]] {
 // CHECK-MTE-BTI-NEXT:  entry:
 // CHECK-MTE-BTI-NEXT:    ret i32 4
 //
 //
 // CHECK-MTE-BTI: Function Attrs: noinline nounwind optnone
 // CHECK-MTE-BTI-LABEL: define {{[^@]+}}@main
-// CHECK-MTE-BTI-SAME: () #[[ATTR6]] {
+// CHECK-MTE-BTI-SAME: () #[[ATTR8]] {
 // CHECK-MTE-BTI-NEXT:  entry:
 // CHECK-MTE-BTI-NEXT:    [[RETVAL:%.*]] = alloca i32, align 4
 // CHECK-MTE-BTI-NEXT:    store i32 0, ptr [[RETVAL]], align 4
@@ -709,56 +709,56 @@ inline int __attribute__((target_clones("fp16", "sve2-bitperm+fcma", "default"))
 //
 // CHECK-MTE-BTI: Function Attrs: noinline nounwind optnone
 // CHECK-MTE-BTI-LABEL: define {{[^@]+}}@ftc.default
-// CHECK-MTE-BTI-SAME: () #[[ATTR5]] {
+// CHECK-MTE-BTI-SAME: () #[[ATTR8]] {
 // CHECK-MTE-BTI-NEXT:  entry:
 // CHECK-MTE-BTI-NEXT:    ret i32 0
 //
 //
 // CHECK-MTE-BTI: Function Attrs: noinline nounwind optnone
 // CHECK-MTE-BTI-LABEL: define {{[^@]+}}@ftc_def.default
-// CHECK-MTE-BTI-SAME: () #[[ATTR5]] {
+// CHECK-MTE-BTI-SAME: () #[[ATTR8]] {
 // CHECK-MTE-BTI-NEXT:  entry:
 // CHECK-MTE-BTI-NEXT:    ret i32 1
 //
 //
 // CHECK-MTE-BTI: Function Attrs: noinline nounwind optnone
 // CHECK-MTE-BTI-LABEL: define {{[^@]+}}@ftc_dup1.default
-// CHECK-MTE-BTI-SAME: () #[[ATTR5]] {
+// CHECK-MTE-BTI-SAME: () #[[ATTR8]] {
 // CHECK-MTE-BTI-NEXT:  entry:
 // CHECK-MTE-BTI-NEXT:    ret i32 2
 //
 //
 // CHECK-MTE-BTI: Function Attrs: noinline nounwind optnone
 // CHECK-MTE-BTI-LABEL: define {{[^@]+}}@ftc_dup2.default
-// CHECK-MTE-BTI-SAME: () #[[ATTR5]] {
+// CHECK-MTE-BTI-SAME: () #[[ATTR8]] {
 // CHECK-MTE-BTI-NEXT:  entry:
 // CHECK-MTE-BTI-NEXT:    ret i32 3
 //
 //
 // CHECK-MTE-BTI: Function Attrs: noinline nounwind optnone
 // CHECK-MTE-BTI-LABEL: define {{[^@]+}}@ftc_dup3.default
-// CHECK-MTE-BTI-SAME: () #[[ATTR5]] {
+// CHECK-MTE-BTI-SAME: () #[[ATTR8]] {
 // CHECK-MTE-BTI-NEXT:  entry:
 // CHECK-MTE-BTI-NEXT:    ret i32 4
 //
 //
 // CHECK-MTE-BTI: Function Attrs: noinline nounwind optnone
 // CHECK-MTE-BTI-LABEL: define {{[^@]+}}@ftc_inline2._Mfp16
-// CHECK-MTE-BTI-SAME: () #[[ATTR7:[0-9]+]] {
+// CHECK-MTE-BTI-SAME: () #[[ATTR9:[0-9]+]] {
 // CHECK-MTE-BTI-NEXT:  entry:
 // CHECK-MTE-BTI-NEXT:    ret i32 2
 //
 //
 // CHECK-MTE-BTI: Function Attrs: noinline nounwind optnone
 // CHECK-MTE-BTI-LABEL: define {{[^@]+}}@ftc_inline2._MfcmaMsve2-bitperm
-// CHECK-MTE-BTI-SAME: () #[[ATTR8:[0-9]+]] {
+// CHECK-MTE-BTI-SAME: () #[[ATTR10:[0-9]+]] {
 // CHECK-MTE-BTI-NEXT:  entry:
 // CHECK-MTE-BTI-NEXT:    ret i32 2
 //
 //
 // CHECK-MTE-BTI: Function Attrs: noinline nounwind optnone
 // CHECK-MTE-BTI-LABEL: define {{[^@]+}}@ftc_inline2.default
-// CHECK-MTE-BTI-SAME: () #[[ATTR5]] {
+// CHECK-MTE-BTI-SAME: () #[[ATTR8]] {
 // CHECK-MTE-BTI-NEXT:  entry:
 // CHECK-MTE-BTI-NEXT:    ret i32 2
 //
@@ -787,28 +787,28 @@ inline int __attribute__((target_clones("fp16", "sve2-bitperm+fcma", "default"))
 //
 // CHECK-MTE-BTI: Function Attrs: noinline nounwind optnone
 // CHECK-MTE-BTI-LABEL: define {{[^@]+}}@ftc_inline1._MrngMsimd
-// CHECK-MTE-BTI-SAME: () #[[ATTR9:[0-9]+]] {
+// CHECK-MTE-BTI-SAME: () #[[ATTR11:[0-9]+]] {
 // CHECK-MTE-BTI-NEXT:  entry:
 // CHECK-MTE-BTI-NEXT:    ret i32 1
 //
 //
 // CHECK-MTE-BTI: Function Attrs: noinline nounwind optnone
 // CHECK-MTE-BTI-LABEL: define {{[^@]+}}@ftc_inline1._MpredresMrcpc
-// CHECK-MTE-BTI-SAME: () #[[ATTR10:[0-9]+]] {
+// CHECK-MTE-BTI-SAME: () #[[ATTR12:[0-9]+]] {
 // CHECK-MTE-BTI-NEXT:  entry:
 // CHECK-MTE-BTI-NEXT:    ret i32 1
 //
 //
 // CHECK-MTE-BTI: Function Attrs: noinline nounwind optnone
 // CHECK-MTE-BTI-LABEL: define {{[^@]+}}@ftc_inline1._Msve2-aesMwfxt
-// CHECK-MTE-BTI-SAME: () #[[ATTR11:[0-9]+]] {
+// CHECK-MTE-BTI-SAME: () #[[ATTR13:[0-9]+]] {
 // CHECK-MTE-BTI-NEXT:  entry:
 // CHECK-MTE-BTI-NEXT:    ret i32 1
 //
 //
 // CHECK-MTE-BTI: Function Attrs: noinline nounwind optnone
 // CHECK-MTE-BTI-LABEL: define {{[^@]+}}@ftc_inline1.default
-// CHECK-MTE-BTI-SAME: () #[[ATTR5]] {
+// CHECK-MTE-BTI-SAME: () #[[ATTR8]] {
 // CHECK-MTE-BTI-NEXT:  entry:
 // CHECK-MTE-BTI-NEXT:    ret i32 1
 //
@@ -845,21 +845,21 @@ inline int __attribute__((target_clones("fp16", "sve2-bitperm+fcma", "default"))
 //
 // CHECK-MTE-BTI: Function Attrs: noinline nounwind optnone
 // CHECK-MTE-BTI-LABEL: define {{[^@]+}}@ftc_inline3._Mbti
-// CHECK-MTE-BTI-SAME: () #[[ATTR5]] {
+// CHECK-MTE-BTI-SAME: () #[[ATTR7]] {
 // CHECK-MTE-BTI-NEXT:  entry:
 // CHECK-MTE-BTI-NEXT:    ret i32 3
 //
 //
 // CHECK-MTE-BTI: Function Attrs: noinline nounwind optnone
 // CHECK-MTE-BTI-LABEL: define {{[^@]+}}@ftc_inline3._MsbMsve
-// CHECK-MTE-BTI-SAME: () #[[ATTR12:[0-9]+]] {
+// CHECK-MTE-BTI-SAME: () #[[ATTR14:[0-9]+]] {
 // CHECK-MTE-BTI-NEXT:  entry:
 // CHECK-MTE-BTI-NEXT:    ret i32 3
 //
 //
 // CHECK-MTE-BTI: Function Attrs: noinline nounwind optnone
 // CHECK-MTE-BTI-LABEL: define {{[^@]+}}@ftc_inline3.default
-// CHECK-MTE-BTI-SAME: () #[[ATTR5]] {
+// CHECK-MTE-BTI-SAME: () #[[ATTR8]] {
 // CHECK-MTE-BTI-NEXT:  entry:
 // CHECK-MTE-BTI-NEXT:    ret i32 3
 //
@@ -868,20 +868,20 @@ inline int __attribute__((target_clones("fp16", "sve2-bitperm+fcma", "default"))
 // CHECK-MTE-BTI-NEXT:  resolver_entry:
 // CHECK-MTE-BTI-NEXT:    call void @__init_cpu_features_resolver()
 // CHECK-MTE-BTI-NEXT:    [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
-// CHECK-MTE-BTI-NEXT:    [[TMP1:%.*]] = and i64 [[TMP0]], 70369817985280
-// CHECK-MTE-BTI-NEXT:    [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 70369817985280
+// CHECK-MTE-BTI-NEXT:    [[TMP1:%.*]] = and i64 [[TMP0]], 1125899906842624
+// CHECK-MTE-BTI-NEXT:    [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 1125899906842624
 // CHECK-MTE-BTI-NEXT:    [[TMP3:%.*]] = and i1 true, [[TMP2]]
 // CHECK-MTE-BTI-NEXT:    br i1 [[TMP3]], label [[RESOLVER_RETURN:%.*]], label [[RESOLVER_ELSE:%.*]]
 // CHECK-MTE-BTI:       resolver_return:
-// CHECK-MTE-BTI-NEXT:    ret ptr @ftc_inline3._MsbMsve
+// CHECK-MTE-BTI-NEXT:    ret ptr @ftc_inline3._Mbti
 // CHECK-MTE-BTI:       resolver_else:
 // CHECK-MTE-BTI-NEXT:    [[TMP4:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
-// CHECK-MTE-BTI-NEXT:    [[TMP5:%.*]] = and i64 [[TMP4]], 1125899906842624
-// CHECK-MTE-BTI-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[TMP5]], 1125899906842624
+// CHECK-MTE-BTI-NEXT:    [[TMP5:%.*]] = and i64 [[TMP4]], 70369817985280
+// CHECK-MTE-BTI-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[TMP5]], 70369817985280
 // CHECK-MTE-BTI-NEXT:    [[TMP7:%.*]] = and i1 true, [[TMP6]]
 // CHECK-MTE-BTI-NEXT:    br i1 [[TMP7]], label [[RESOLVER_RETURN1:%.*]], label [[RESOLVER_ELSE2:%.*]]
 // CHECK-MTE-BTI:       resolver_return1:
-// CHECK-MTE-BTI-NEXT:    ret ptr @ftc_inline3._Mbti
+// CHECK-MTE-BTI-NEXT:    ret ptr @ftc_inline3._MsbMsve
 // CHECK-MTE-BTI:       resolver_else2:
 // CHECK-MTE-BTI-NEXT:    ret ptr @ftc_inline3.default
 //
diff --git a/clang/test/CodeGen/attr-target-version.c b/clang/test/CodeGen/attr-target-version.c
index 951401c..336d8b0 100644
--- a/clang/test/CodeGen/attr-target-version.c
+++ b/clang/test/CodeGen/attr-target-version.c
@@ -141,6 +141,7 @@ int caller(void) { return used_def_without_default_decl() + used_decl_without_de
 // CHECK: @fmv_two = weak_odr ifunc i32 (), ptr @fmv_two.resolver
 // CHECK: @fmv_e = weak_odr ifunc i32 (), ptr @fmv_e.resolver
 // CHECK: @fmv_d = internal ifunc i32 (), ptr @fmv_d.resolver
+// CHECK: @fmv_default = weak_odr ifunc i32 (), ptr @fmv_default.resolver
 // CHECK: @fmv_c = weak_odr ifunc void (), ptr @fmv_c.resolver
 // CHECK: @fmv_inline = weak_odr ifunc i32 (), ptr @fmv_inline.resolver
 // CHECK: @reca = weak_odr ifunc void (), ptr @reca.resolver
@@ -271,7 +272,7 @@ int caller(void) { return used_def_without_default_decl() + used_decl_without_de
 //
 // CHECK: Function Attrs: noinline nounwind optnone
 // CHECK-LABEL: define {{[^@]+}}@foo
-// CHECK-SAME: () #[[ATTR15:[0-9]+]] {
+// CHECK-SAME: () #[[ATTR9]] {
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[CALL:%.*]] = call i32 @fmv()
 // CHECK-NEXT:    [[CALL1:%.*]] = call i32 @fmv_one()
@@ -297,7 +298,7 @@ int caller(void) { return used_def_without_default_decl() + used_decl_without_de
 //
 // CHECK: Function Attrs: noinline nounwind optnone
 // CHECK-LABEL: define {{[^@]+}}@fmv_c._Mssbs
-// CHECK-SAME: () #[[ATTR17:[0-9]+]] {
+// CHECK-SAME: () #[[ATTR15:[0-9]+]] {
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    ret void
 //
@@ -311,7 +312,7 @@ int caller(void) { return used_def_without_default_decl() + used_decl_without_de
 //
 // CHECK: Function Attrs: noinline nounwind optnone
 // CHECK-LABEL: define {{[^@]+}}@goo
-// CHECK-SAME: () #[[ATTR15]] {
+// CHECK-SAME: () #[[ATTR9]] {
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[CALL:%.*]] = call i32 @fmv_inline()
 // CHECK-NEXT:    [[CALL1:%.*]] = call i32 @fmv_e()
@@ -323,7 +324,7 @@ int caller(void) { return used_def_without_default_decl() + used_decl_without_de
 //
 // CHECK: Function Attrs: noinline nounwind optnone
 // CHECK-LABEL: define {{[^@]+}}@recur
-// CHECK-SAME: () #[[ATTR15]] {
+// CHECK-SAME: () #[[ATTR9]] {
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    call void @reca()
 // CHECK-NEXT:    ret void
@@ -331,7 +332,7 @@ int caller(void) { return used_def_without_default_decl() + used_decl_without_de
 //
 // CHECK: Function Attrs: noinline nounwind optnone
 // CHECK-LABEL: define {{[^@]+}}@hoo
-// CHECK-SAME: () #[[ATTR15]] {
+// CHECK-SAME: () #[[ATTR9]] {
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[FP1:%.*]] = alloca ptr, align 8
 // CHECK-NEXT:    [[FP2:%.*]] = alloca ptr, align 8
@@ -348,28 +349,28 @@ int caller(void) { return used_def_without_default_decl() + used_decl_without_de
 //
 // CHECK: Function Attrs: noinline nounwind optnone
 // CHECK-LABEL: define {{[^@]+}}@unused_with_forward_default_decl._Mmops
-// CHECK-SAME: () #[[ATTR19:[0-9]+]] {
+// CHECK-SAME: () #[[ATTR17:[0-9]+]] {
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    ret i32 0
 //
 //
 // CHECK: Function Attrs: noinline nounwind optnone
 // CHECK-LABEL: define {{[^@]+}}@unused_with_implicit_extern_forward_default_decl._Mdotprod
-// CHECK-SAME: () #[[ATTR20:[0-9]+]] {
+// CHECK-SAME: () #[[ATTR18:[0-9]+]] {
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    ret i32 0
 //
 //
 // CHECK: Function Attrs: noinline nounwind optnone
 // CHECK-LABEL: define {{[^@]+}}@unused_with_default_decl._Maes
-// CHECK-SAME: () #[[ATTR5]] {
+// CHECK-SAME: () #[[ATTR19:[0-9]+]] {
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    ret i32 0
 //
 //
 // CHECK: Function Attrs: noinline nounwind optnone
 // CHECK-LABEL: define {{[^@]+}}@unused_with_default_def._Msve
-// CHECK-SAME: () #[[ATTR21:[0-9]+]] {
+// CHECK-SAME: () #[[ATTR20:[0-9]+]] {
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    ret i32 0
 //
@@ -383,7 +384,7 @@ int caller(void) { return used_def_without_default_decl() + used_decl_without_de
 //
 // CHECK: Function Attrs: noinline nounwind optnone
 // CHECK-LABEL: define {{[^@]+}}@unused_with_implicit_default_def._Mfp16
-// CHECK-SAME: () #[[ATTR22:[0-9]+]] {
+// CHECK-SAME: () #[[ATTR21:[0-9]+]] {
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    ret i32 0
 //
@@ -397,49 +398,49 @@ int caller(void) { return used_def_without_default_decl() + used_decl_without_de
 //
 // CHECK: Function Attrs: noinline nounwind optnone
 // CHECK-LABEL: define {{[^@]+}}@unused_with_implicit_forward_default_def.default
-// CHECK-SAME: () #[[ATTR15]] {
+// CHECK-SAME: () #[[ATTR9]] {
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    ret i32 0
 //
 //
 // CHECK: Function Attrs: noinline nounwind optnone
 // CHECK-LABEL: define {{[^@]+}}@unused_with_implicit_forward_default_def._Mlse
-// CHECK-SAME: () #[[ATTR23:[0-9]+]] {
+// CHECK-SAME: () #[[ATTR22:[0-9]+]] {
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    ret i32 1
 //
 //
 // CHECK: Function Attrs: noinline nounwind optnone
 // CHECK-LABEL: define {{[^@]+}}@unused_without_default._Mrdm
-// CHECK-SAME: () #[[ATTR24:[0-9]+]] {
+// CHECK-SAME: () #[[ATTR23:[0-9]+]] {
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    ret i32 0
 //
 //
 // CHECK: Function Attrs: noinline nounwind optnone
 // CHECK-LABEL: define {{[^@]+}}@default_def_with_version_decls.default
-// CHECK-SAME: () #[[ATTR15]] {
+// CHECK-SAME: () #[[ATTR9]] {
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    ret i32 0
 //
 //
 // CHECK: Function Attrs: noinline nounwind optnone
 // CHECK-LABEL: define {{[^@]+}}@used_def_without_default_decl._Mjscvt
-// CHECK-SAME: () #[[ATTR26:[0-9]+]] {
+// CHECK-SAME: () #[[ATTR25:[0-9]+]] {
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    ret i32 1
 //
 //
 // CHECK: Function Attrs: noinline nounwind optnone
 // CHECK-LABEL: define {{[^@]+}}@used_def_without_default_decl._Mrdm
-// CHECK-SAME: () #[[ATTR24]] {
+// CHECK-SAME: () #[[ATTR26:[0-9]+]] {
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    ret i32 2
 //
 //
 // CHECK: Function Attrs: noinline nounwind optnone
 // CHECK-LABEL: define {{[^@]+}}@caller
-// CHECK-SAME: () #[[ATTR15]] {
+// CHECK-SAME: () #[[ATTR9]] {
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[CALL:%.*]] = call i32 @used_def_without_default_decl()
 // CHECK-NEXT:    [[CALL1:%.*]] = call i32 @used_decl_without_default_decl()
@@ -462,12 +463,12 @@ int caller(void) { return used_def_without_default_decl() + used_decl_without_de
 // CHECK-NEXT:  resolver_entry:
 // CHECK-NEXT:    call void @__init_cpu_features_resolver()
 // CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
-// CHECK-NEXT:    [[TMP1:%.*]] = and i64 [[TMP0]], 66315
-// CHECK-NEXT:    [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 66315
+// CHECK-NEXT:    [[TMP1:%.*]] = and i64 [[TMP0]], 144119586256651008
+// CHECK-NEXT:    [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 144119586256651008
 // CHECK-NEXT:    [[TMP3:%.*]] = and i1 true, [[TMP2]]
 // CHECK-NEXT:    br i1 [[TMP3]], label [[RESOLVER_RETURN:%.*]], label [[RESOLVER_ELSE:%.*]]
 // CHECK:       resolver_return:
-// CHECK-NEXT:    ret ptr @fmv._MflagmMfp16fmlMrng
+// CHECK-NEXT:    ret ptr @fmv._Msme2
 // CHECK:       resolver_else:
 // CHECK-NEXT:    [[TMP4:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
 // CHECK-NEXT:    [[TMP5:%.*]] = and i64 [[TMP4]], 72061992218723078
@@ -478,60 +479,60 @@ int caller(void) { return used_def_without_default_decl() + used_decl_without_de
 // CHECK-NEXT:    ret ptr @fmv._Mflagm2Msme-i16i64
 // CHECK:       resolver_else2:
 // CHECK-NEXT:    [[TMP8:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
-// CHECK-NEXT:    [[TMP9:%.*]] = and i64 [[TMP8]], 9007199254741776
-// CHECK-NEXT:    [[TMP10:%.*]] = icmp eq i64 [[TMP9]], 9007199254741776
+// CHECK-NEXT:    [[TMP9:%.*]] = and i64 [[TMP8]], 9007199254742016
+// CHECK-NEXT:    [[TMP10:%.*]] = icmp eq i64 [[TMP9]], 9007199254742016
 // CHECK-NEXT:    [[TMP11:%.*]] = and i1 true, [[TMP10]]
 // CHECK-NEXT:    br i1 [[TMP11]], label [[RESOLVER_RETURN3:%.*]], label [[RESOLVER_ELSE4:%.*]]
 // CHECK:       resolver_return3:
-// CHECK-NEXT:    ret ptr @fmv._MdotprodMls64
+// CHECK-NEXT:    ret ptr @fmv._McrcMls64
 // CHECK:       resolver_else4:
 // CHECK-NEXT:    [[TMP12:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
-// CHECK-NEXT:    [[TMP13:%.*]] = and i64 [[TMP12]], 9007199254742016
-// CHECK-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP13]], 9007199254742016
+// CHECK-NEXT:    [[TMP13:%.*]] = and i64 [[TMP12]], 9007199254741776
+// CHECK-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP13]], 9007199254741776
 // CHECK-NEXT:    [[TMP15:%.*]] = and i1 true, [[TMP14]]
 // CHECK-NEXT:    br i1 [[TMP15]], label [[RESOLVER_RETURN5:%.*]], label [[RESOLVER_ELSE6:%.*]]
 // CHECK:       resolver_return5:
-// CHECK-NEXT:    ret ptr @fmv._McrcMls64
+// CHECK-NEXT:    ret ptr @fmv._MdotprodMls64
 // CHECK:       resolver_else6:
 // CHECK-NEXT:    [[TMP16:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
-// CHECK-NEXT:    [[TMP17:%.*]] = and i64 [[TMP16]], 17592186110728
-// CHECK-NEXT:    [[TMP18:%.*]] = icmp eq i64 [[TMP17]], 17592186110728
+// CHECK-NEXT:    [[TMP17:%.*]] = and i64 [[TMP16]], 1125899906842624
+// CHECK-NEXT:    [[TMP18:%.*]] = icmp eq i64 [[TMP17]], 1125899906842624
 // CHECK-NEXT:    [[TMP19:%.*]] = and i1 true, [[TMP18]]
 // CHECK-NEXT:    br i1 [[TMP19]], label [[RESOLVER_RETURN7:%.*]], label [[RESOLVER_ELSE8:%.*]]
 // CHECK:       resolver_return7:
-// CHECK-NEXT:    ret ptr @fmv._Mfp16fmlMmemtag
+// CHECK-NEXT:    ret ptr @fmv._Mbti
 // CHECK:       resolver_else8:
 // CHECK-NEXT:    [[TMP20:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
-// CHECK-NEXT:    [[TMP21:%.*]] = and i64 [[TMP20]], 33536
-// CHECK-NEXT:    [[TMP22:%.*]] = icmp eq i64 [[TMP21]], 33536
+// CHECK-NEXT:    [[TMP21:%.*]] = and i64 [[TMP20]], 17592186110728
+// CHECK-NEXT:    [[TMP22:%.*]] = icmp eq i64 [[TMP21]], 17592186110728
 // CHECK-NEXT:    [[TMP23:%.*]] = and i1 true, [[TMP22]]
 // CHECK-NEXT:    br i1 [[TMP23]], label [[RESOLVER_RETURN9:%.*]], label [[RESOLVER_ELSE10:%.*]]
 // CHECK:       resolver_return9:
-// CHECK-NEXT:    ret ptr @fmv._MaesMfp
+// CHECK-NEXT:    ret ptr @fmv._Mfp16fmlMmemtag
 // CHECK:       resolver_else10:
 // CHECK-NEXT:    [[TMP24:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
-// CHECK-NEXT:    [[TMP25:%.*]] = and i64 [[TMP24]], 4992
-// CHECK-NEXT:    [[TMP26:%.*]] = icmp eq i64 [[TMP25]], 4992
+// CHECK-NEXT:    [[TMP25:%.*]] = and i64 [[TMP24]], 66315
+// CHECK-NEXT:    [[TMP26:%.*]] = icmp eq i64 [[TMP25]], 66315
 // CHECK-NEXT:    [[TMP27:%.*]] = and i1 true, [[TMP26]]
 // CHECK-NEXT:    br i1 [[TMP27]], label [[RESOLVER_RETURN11:%.*]], label [[RESOLVER_ELSE12:%.*]]
 // CHECK:       resolver_return11:
-// CHECK-NEXT:    ret ptr @fmv._MlseMsha2
+// CHECK-NEXT:    ret ptr @fmv._MflagmMfp16fmlMrng
 // CHECK:       resolver_else12:
 // CHECK-NEXT:    [[TMP28:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
-// CHECK-NEXT:    [[TMP29:%.*]] = and i64 [[TMP28]], 144119586256651008
-// CHECK-NEXT:    [[TMP30:%.*]] = icmp eq i64 [[TMP29]], 144119586256651008
+// CHECK-NEXT:    [[TMP29:%.*]] = and i64 [[TMP28]], 33536
+// CHECK-NEXT:    [[TMP30:%.*]] = icmp eq i64 [[TMP29]], 33536
 // CHECK-NEXT:    [[TMP31:%.*]] = and i1 true, [[TMP30]]
 // CHECK-NEXT:    br i1 [[TMP31]], label [[RESOLVER_RETURN13:%.*]], label [[RESOLVER_ELSE14:%.*]]
 // CHECK:       resolver_return13:
-// CHECK-NEXT:    ret ptr @fmv._Msme2
+// CHECK-NEXT:    ret ptr @fmv._MaesMfp
 // CHECK:       resolver_else14:
 // CHECK-NEXT:    [[TMP32:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
-// CHECK-NEXT:    [[TMP33:%.*]] = and i64 [[TMP32]], 1125899906842624
-// CHECK-NEXT:    [[TMP34:%.*]] = icmp eq i64 [[TMP33]], 1125899906842624
+// CHECK-NEXT:    [[TMP33:%.*]] = and i64 [[TMP32]], 4992
+// CHECK-NEXT:    [[TMP34:%.*]] = icmp eq i64 [[TMP33]], 4992
 // CHECK-NEXT:    [[TMP35:%.*]] = and i1 true, [[TMP34]]
 // CHECK-NEXT:    br i1 [[TMP35]], label [[RESOLVER_RETURN15:%.*]], label [[RESOLVER_ELSE16:%.*]]
 // CHECK:       resolver_return15:
-// CHECK-NEXT:    ret ptr @fmv._Mbti
+// CHECK-NEXT:    ret ptr @fmv._MlseMsha2
 // CHECK:       resolver_else16:
 // CHECK-NEXT:    ret ptr @fmv.default
 //
@@ -630,6 +631,11 @@ int caller(void) { return used_def_without_default_decl() + used_decl_without_de
 // CHECK-NEXT:    ret ptr @fmv_d.default
 //
 //
+// CHECK-LABEL: define {{[^@]+}}@fmv_default.resolver() comdat {
+// CHECK-NEXT:  resolver_entry:
+// CHECK-NEXT:    ret ptr @fmv_default.default
+//
+//
 // CHECK-LABEL: define {{[^@]+}}@fmv_c.resolver() comdat {
 // CHECK-NEXT:  resolver_entry:
 // CHECK-NEXT:    call void @__init_cpu_features_resolver()
@@ -767,60 +773,60 @@ int caller(void) { return used_def_without_default_decl() + used_decl_without_de
 // CHECK-NEXT:  resolver_entry:
 // CHECK-NEXT:    call void @__init_cpu_features_resolver()
 // CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
-// CHECK-NEXT:    [[TMP1:%.*]] = and i64 [[TMP0]], 4398182892352
-// CHECK-NEXT:    [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 4398182892352
+// CHECK-NEXT:    [[TMP1:%.*]] = and i64 [[TMP0]], 864708720653762560
+// CHECK-NEXT:    [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 864708720653762560
 // CHECK-NEXT:    [[TMP3:%.*]] = and i1 true, [[TMP2]]
 // CHECK-NEXT:    br i1 [[TMP3]], label [[RESOLVER_RETURN:%.*]], label [[RESOLVER_ELSE:%.*]]
 // CHECK:       resolver_return:
-// CHECK-NEXT:    ret ptr @fmv_inline._MfcmaMfp16MrdmMsme
+// CHECK-NEXT:    ret ptr @fmv_inline._MmemtagMmopsMrcpc3
 // CHECK:       resolver_else:
 // CHECK-NEXT:    [[TMP4:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
-// CHECK-NEXT:    [[TMP5:%.*]] = and i64 [[TMP4]], 864708720653762560
-// CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[TMP5]], 864708720653762560
+// CHECK-NEXT:    [[TMP5:%.*]] = and i64 [[TMP4]], 19861002584864
+// CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[TMP5]], 19861002584864
 // CHECK-NEXT:    [[TMP7:%.*]] = and i1 true, [[TMP6]]
 // CHECK-NEXT:    br i1 [[TMP7]], label [[RESOLVER_RETURN1:%.*]], label [[RESOLVER_ELSE2:%.*]]
 // CHECK:       resolver_return1:
-// CHECK-NEXT:    ret ptr @fmv_inline._MmemtagMmopsMrcpc3
+// CHECK-NEXT:    ret ptr @fmv_inline._MmemtagMsve2-sm4
 // CHECK:       resolver_else2:
 // CHECK-NEXT:    [[TMP8:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
-// CHECK-NEXT:    [[TMP9:%.*]] = and i64 [[TMP8]], 894427038464
-// CHECK-NEXT:    [[TMP10:%.*]] = icmp eq i64 [[TMP9]], 894427038464
+// CHECK-NEXT:    [[TMP9:%.*]] = and i64 [[TMP8]], 4398182892352
+// CHECK-NEXT:    [[TMP10:%.*]] = icmp eq i64 [[TMP9]], 4398182892352
 // CHECK-NEXT:    [[TMP11:%.*]] = and i1 true, [[TMP10]]
 // CHECK-NEXT:    br i1 [[TMP11]], label [[RESOLVER_RETURN3:%.*]], label [[RESOLVER_ELSE4:%.*]]
 // CHECK:       resolver_return3:
-// CHECK-NEXT:    ret ptr @fmv_inline._Msve2Msve2-aesMsve2-bitperm
+// CHECK-NEXT:    ret ptr @fmv_inline._MfcmaMfp16MrdmMsme
 // CHECK:       resolver_else4:
 // CHECK-NEXT:    [[TMP12:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
-// CHECK-NEXT:    [[TMP13:%.*]] = and i64 [[TMP12]], 35433583360
-// CHECK-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP13]], 35433583360
+// CHECK-NEXT:    [[TMP13:%.*]] = and i64 [[TMP12]], 1444182864640
+// CHECK-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP13]], 1444182864640
 // CHECK-NEXT:    [[TMP15:%.*]] = and i1 true, [[TMP14]]
 // CHECK-NEXT:    br i1 [[TMP15]], label [[RESOLVER_RETURN5:%.*]], label [[RESOLVER_ELSE6:%.*]]
 // CHECK:       resolver_return5:
-// CHECK-NEXT:    ret ptr @fmv_inline._MaesMf64mmMsha2
+// CHECK-NEXT:    ret ptr @fmv_inline._Msve2-aesMsve2-sha3
 // CHECK:       resolver_else6:
 // CHECK-NEXT:    [[TMP16:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
-// CHECK-NEXT:    [[TMP17:%.*]] = and i64 [[TMP16]], 18320798464
-// CHECK-NEXT:    [[TMP18:%.*]] = icmp eq i64 [[TMP17]], 18320798464
+// CHECK-NEXT:    [[TMP17:%.*]] = and i64 [[TMP16]], 894427038464
+// CHECK-NEXT:    [[TMP18:%.*]] = icmp eq i64 [[TMP17]], 894427038464
 // CHECK-NEXT:    [[TMP19:%.*]] = and i1 true, [[TMP18]]
 // CHECK-NEXT:    br i1 [[TMP19]], label [[RESOLVER_RETURN7:%.*]], label [[RESOLVER_ELSE8:%.*]]
 // CHECK:       resolver_return7:
-// CHECK-NEXT:    ret ptr @fmv_inline._Mf32mmMi8mmMsha3
+// CHECK-NEXT:    ret ptr @fmv_inline._Msve2Msve2-aesMsve2-bitperm
 // CHECK:       resolver_else8:
 // CHECK-NEXT:    [[TMP20:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
-// CHECK-NEXT:    [[TMP21:%.*]] = and i64 [[TMP20]], 19861002584864
-// CHECK-NEXT:    [[TMP22:%.*]] = icmp eq i64 [[TMP21]], 19861002584864
+// CHECK-NEXT:    [[TMP21:%.*]] = and i64 [[TMP20]], 35433583360
+// CHECK-NEXT:    [[TMP22:%.*]] = icmp eq i64 [[TMP21]], 35433583360
 // CHECK-NEXT:    [[TMP23:%.*]] = and i1 true, [[TMP22]]
 // CHECK-NEXT:    br i1 [[TMP23]], label [[RESOLVER_RETURN9:%.*]], label [[RESOLVER_ELSE10:%.*]]
 // CHECK:       resolver_return9:
-// CHECK-NEXT:    ret ptr @fmv_inline._MmemtagMsve2-sm4
+// CHECK-NEXT:    ret ptr @fmv_inline._MaesMf64mmMsha2
 // CHECK:       resolver_else10:
 // CHECK-NEXT:    [[TMP24:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
-// CHECK-NEXT:    [[TMP25:%.*]] = and i64 [[TMP24]], 1444182864640
-// CHECK-NEXT:    [[TMP26:%.*]] = icmp eq i64 [[TMP25]], 1444182864640
+// CHECK-NEXT:    [[TMP25:%.*]] = and i64 [[TMP24]], 18320798464
+// CHECK-NEXT:    [[TMP26:%.*]] = icmp eq i64 [[TMP25]], 18320798464
 // CHECK-NEXT:    [[TMP27:%.*]] = and i1 true, [[TMP26]]
 // CHECK-NEXT:    br i1 [[TMP27]], label [[RESOLVER_RETURN11:%.*]], label [[RESOLVER_ELSE12:%.*]]
 // CHECK:       resolver_return11:
-// CHECK-NEXT:    ret ptr @fmv_inline._Msve2-aesMsve2-sha3
+// CHECK-NEXT:    ret ptr @fmv_inline._Mf32mmMi8mmMsha3
 // CHECK:       resolver_else12:
 // CHECK-NEXT:    [[TMP28:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
 // CHECK-NEXT:    [[TMP29:%.*]] = and i64 [[TMP28]], 1208025856
@@ -984,7 +990,7 @@ int caller(void) { return used_def_without_default_decl() + used_decl_without_de
 //
 // CHECK: Function Attrs: noinline nounwind optnone
 // CHECK-LABEL: define {{[^@]+}}@func
-// CHECK-SAME: () #[[ATTR15]] {
+// CHECK-SAME: () #[[ATTR9]] {
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    ret void
 //
@@ -1008,21 +1014,21 @@ int caller(void) { return used_def_without_default_decl() + used_decl_without_de
 //
 // CHECK-NOFMV: Function Attrs: noinline nounwind optnone
 // CHECK-NOFMV-LABEL: define {{[^@]+}}@fmv
-// CHECK-NOFMV-SAME: () #[[ATTR1:[0-9]+]] {
+// CHECK-NOFMV-SAME: () #[[ATTR0]] {
 // CHECK-NOFMV-NEXT:  entry:
 // CHECK-NOFMV-NEXT:    ret i32 0
 //
 //
 // CHECK-NOFMV: Function Attrs: noinline nounwind optnone
 // CHECK-NOFMV-LABEL: define {{[^@]+}}@fmv_one
-// CHECK-NOFMV-SAME: () #[[ATTR1]] {
+// CHECK-NOFMV-SAME: () #[[ATTR0]] {
 // CHECK-NOFMV-NEXT:  entry:
 // CHECK-NOFMV-NEXT:    ret i32 0
 //
 //
 // CHECK-NOFMV: Function Attrs: noinline nounwind optnone
 // CHECK-NOFMV-LABEL: define {{[^@]+}}@fmv_two
-// CHECK-NOFMV-SAME: () #[[ATTR1]] {
+// CHECK-NOFMV-SAME: () #[[ATTR0]] {
 // CHECK-NOFMV-NEXT:  entry:
 // CHECK-NOFMV-NEXT:    ret i32 0
 //
@@ -1048,21 +1054,21 @@ int caller(void) { return used_def_without_default_decl() + used_decl_without_de
 //
 // CHECK-NOFMV: Function Attrs: noinline nounwind optnone
 // CHECK-NOFMV-LABEL: define {{[^@]+}}@fmv_d
-// CHECK-NOFMV-SAME: () #[[ATTR1]] {
+// CHECK-NOFMV-SAME: () #[[ATTR0]] {
 // CHECK-NOFMV-NEXT:  entry:
 // CHECK-NOFMV-NEXT:    ret i32 1
 //
 //
 // CHECK-NOFMV: Function Attrs: noinline nounwind optnone
 // CHECK-NOFMV-LABEL: define {{[^@]+}}@fmv_c
-// CHECK-NOFMV-SAME: () #[[ATTR1]] {
+// CHECK-NOFMV-SAME: () #[[ATTR0]] {
 // CHECK-NOFMV-NEXT:  entry:
 // CHECK-NOFMV-NEXT:    ret void
 //
 //
 // CHECK-NOFMV: Function Attrs: noinline nounwind optnone
 // CHECK-NOFMV-LABEL: define {{[^@]+}}@fmv_default
-// CHECK-NOFMV-SAME: () #[[ATTR1]] {
+// CHECK-NOFMV-SAME: () #[[ATTR0]] {
 // CHECK-NOFMV-NEXT:  entry:
 // CHECK-NOFMV-NEXT:    ret i32 111
 //
@@ -1115,7 +1121,7 @@ int caller(void) { return used_def_without_default_decl() + used_decl_without_de
 //
 // CHECK-NOFMV: Function Attrs: noinline nounwind optnone
 // CHECK-NOFMV-LABEL: define {{[^@]+}}@main
-// CHECK-NOFMV-SAME: () #[[ATTR1]] {
+// CHECK-NOFMV-SAME: () #[[ATTR0]] {
 // CHECK-NOFMV-NEXT:  entry:
 // CHECK-NOFMV-NEXT:    [[RETVAL:%.*]] = alloca i32, align 4
 // CHECK-NOFMV-NEXT:    store i32 0, ptr [[RETVAL]], align 4
@@ -1126,7 +1132,7 @@ int caller(void) { return used_def_without_default_decl() + used_decl_without_de
 //
 // CHECK-NOFMV: Function Attrs: noinline nounwind optnone
 // CHECK-NOFMV-LABEL: define {{[^@]+}}@unused_with_default_def
-// CHECK-NOFMV-SAME: () #[[ATTR1]] {
+// CHECK-NOFMV-SAME: () #[[ATTR0]] {
 // CHECK-NOFMV-NEXT:  entry:
 // CHECK-NOFMV-NEXT:    ret i32 1
 //
diff --git a/clang/test/CodeGen/builtin-memfns.c b/clang/test/CodeGen/builtin-memfns.c
index 23c3c60..581eb85e 100644
--- a/clang/test/CodeGen/builtin-memfns.c
+++ b/clang/test/CodeGen/builtin-memfns.c
@@ -1,4 +1,5 @@
 // RUN: %clang_cc1 -triple i386-pc-linux-gnu -emit-llvm < %s| FileCheck %s
+// RUN: %clang_cc1 -triple i386-pc-linux-gnu -emit-llvm -fexperimental-new-constant-interpreter < %s| FileCheck %s
 
 typedef __WCHAR_TYPE__ wchar_t;
 typedef __SIZE_TYPE__ size_t;
diff --git a/clang/test/CodeGen/code-coverage.c b/clang/test/CodeGen/code-coverage.c
index 4e3364d..5fa6236 100644
--- a/clang/test/CodeGen/code-coverage.c
+++ b/clang/test/CodeGen/code-coverage.c
@@ -3,18 +3,14 @@
 /// 4.7 enables cfg_checksum.
 /// 4.8 (default, compatible with gcov 7) emits the exit block the second.
 // RUN: rm -rf %t && mkdir %t && cd %t
-// RUN: %clang_cc1 -triple x86_64-unknown-unknown -emit-llvm -disable-red-zone -coverage-data-file=/dev/null -coverage-version='304*' %s -o - | \
-// RUN:   FileCheck --check-prefixes=CHECK,CHECK-CTOR-INIT,304 %s
-// RUN: %clang_cc1 -triple x86_64-unknown-unknown -emit-llvm -disable-red-zone -coverage-data-file=/dev/null -coverage-version='407*' %s -o - | \
-// RUN:   FileCheck --check-prefixes=CHECK,CHECK-CTOR-INIT,407 %s
+// RUN: %clang_cc1 -triple x86_64-unknown-unknown -emit-llvm -disable-red-zone -coverage-data-file=/dev/null -coverage-version='B21*' %s -o - | \
+// RUN:   FileCheck --check-prefixes=CHECK,CHECK-CTOR-INIT,1210 %s
 // RUN: %clang_cc1 -triple x86_64-unknown-unknown -emit-llvm -disable-red-zone -coverage-data-file=/dev/null %s -o - | \
-// RUN:   FileCheck --check-prefixes=CHECK,CHECK-CTOR-INIT,408 %s
-// RUN: %clang_cc1 -triple powerpc64-ibm-aix -emit-llvm -disable-red-zone -coverage-data-file=/dev/null -coverage-version='304*' %s -o - | \
-// RUN:   FileCheck --check-prefixes=CHECK,CHECK-RT-INIT,304 %s
-// RUN: %clang_cc1 -triple powerpc64-ibm-aix -emit-llvm -disable-red-zone -coverage-data-file=/dev/null -coverage-version='407*' %s -o - | \
-// RUN:   FileCheck --check-prefixes=CHECK,CHECK-RT-INIT,407 %s
+// RUN:   FileCheck --check-prefixes=CHECK,CHECK-CTOR-INIT,1110 %s
+// RUN: %clang_cc1 -triple powerpc64-ibm-aix -emit-llvm -disable-red-zone -coverage-data-file=/dev/null -coverage-version='B21*' %s -o - | \
+// RUN:   FileCheck --check-prefixes=CHECK,CHECK-RT-INIT,1210 %s
 // RUN: %clang_cc1 -triple powerpc64-ibm-aix -emit-llvm -disable-red-zone -coverage-data-file=/dev/null %s -o - | \
-// RUN:   FileCheck --check-prefixes=CHECK,CHECK-RT-INIT,408 %s
+// RUN:   FileCheck --check-prefixes=CHECK,CHECK-RT-INIT,1110 %s
 
 // RUN: %clang_cc1 -emit-llvm -disable-red-zone -coverage-notes-file=aaa.gcno -coverage-data-file=bbb.gcda -debug-info-kind=limited -dwarf-version=4 %s -o - | FileCheck %s --check-prefix GCOV_FILE_INFO
 
@@ -48,12 +44,10 @@ int test2(int b) {
 // CHECK-SAME: [%emit_function_args_ty { i32 0, i32 {{[-0-9]+}}, i32 {{[-0-9]+}} }, %emit_function_args_ty { i32 1, i32 {{[-0-9]+}}, i32 {{[-0-9]+}} }]
 
 // CHECK: @__llvm_internal_gcov_emit_file_info = internal unnamed_addr constant [1 x %file_info]
-/// 0x3330342a '3' '0' '4' '*'
-// 304-SAME: i32 858797098
-/// 0x3430372a '4' '0' '7' '*'
-// 407-SAME: i32 875575082
-/// 0x3430382a '4' '0' '8' '*'
-// 408-SAME: i32 875575338
+/// 0x4231312a 'B' '1' '1' '*'
+// 1110-SAME: i32 1110520106
+/// 0x4232312a 'B' '2' '1' '*'
+// 1210-SAME: i32 1110585642
 
 // Check for gcov initialization function pointers.
 // CHECK-RT-INIT: @__llvm_covinit_functions = private constant { ptr, ptr } { ptr @__llvm_gcov_writeout, ptr @__llvm_gcov_reset }, section "__llvm_covinit"
diff --git a/clang/test/CodeGen/nvptx_attributes.c b/clang/test/CodeGen/nvptx_attributes.c
index 7dbd9f1..8b9f3a2 100644
--- a/clang/test/CodeGen/nvptx_attributes.c
+++ b/clang/test/CodeGen/nvptx_attributes.c
@@ -10,8 +10,14 @@
 // CHECK-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[RET_ADDR]], align 8
 // CHECK-NEXT:    store i32 1, ptr [[TMP0]], align 4
 // CHECK-NEXT:    ret void
+//
 __attribute__((nvptx_kernel)) void foo(int *ret) {
   *ret = 1;
 }
 
-// CHECK: !0 = !{ptr @foo, !"kernel", i32 1}
+//.
+// CHECK: attributes #[[ATTR0]] = { convergent noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx32,+sm_61" }
+//.
+// CHECK: [[META0:![0-9]+]] = !{i32 1, !"wchar_size", i32 4}
+// CHECK: [[META1:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"}
+//.
diff --git a/clang/test/CodeGen/sanitize-type-globals.cpp b/clang/test/CodeGen/sanitize-type-globals.cpp
index 7cb8de8..1154ab4 100644
--- a/clang/test/CodeGen/sanitize-type-globals.cpp
+++ b/clang/test/CodeGen/sanitize-type-globals.cpp
@@ -3,7 +3,10 @@
 
 //.
 // CHECK: @x = global %struct.CompleteS zeroinitializer, align 8
+// CHECK: @xExtern = external global %struct.CompleteS, align 8
 // CHECK: @y = external global %struct.S, align 1
+// CHECK: @d = global %class.b zeroinitializer, align 1
+// CHECK: @_ZN1b1eE = external global %class.a, align 1
 // CHECK: @__tysan_shadow_memory_address = external global i64
 // CHECK: @__tysan_app_memory_mask = external global i64
 // CHECK: @__tysan_v1_Simple_20C_2b_2b_20TBAA = linkonce_odr constant { i64, i64, [16 x i8] } { i64 2, i64 0, [16 x i8] c"Simple C++ TBAA\00" }, comdat
@@ -12,8 +15,9 @@
 // CHECK: @__tysan_v1_any_20pointer = linkonce_odr constant { i64, i64, ptr, i64, [12 x i8] } { i64 2, i64 1, ptr @__tysan_v1_omnipotent_20char, i64 0, [12 x i8] c"any pointer\00" }, comdat
 // CHECK: @__tysan_v1_p1_20int = linkonce_odr constant { i64, i64, ptr, i64, [7 x i8] } { i64 2, i64 1, ptr @__tysan_v1_any_20pointer, i64 0, [7 x i8] c"p1 int\00" }, comdat
 // CHECK: @__tysan_v1___ZTS9CompleteS = linkonce_odr constant { i64, i64, ptr, i64, ptr, i64, [15 x i8] } { i64 2, i64 2, ptr @__tysan_v1_int, i64 0, ptr @__tysan_v1_p1_20int, i64 8, [15 x i8] c"_ZTS9CompleteS\00" }, comdat
-// CHECK: @llvm.used = appending global [7 x ptr] [ptr @tysan.module_ctor, ptr @__tysan_v1_Simple_20C_2b_2b_20TBAA, ptr @__tysan_v1_omnipotent_20char, ptr @__tysan_v1_int, ptr @__tysan_v1_any_20pointer, ptr @__tysan_v1_p1_20int, ptr @__tysan_v1___ZTS9CompleteS], section "llvm.metadata"
-// CHECK: @llvm.global_ctors = appending global [1 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 0, ptr @tysan.module_ctor, ptr null }]
+// CHECK: @__tysan_v1___ZTS1b = linkonce_odr constant { i64, i64, [7 x i8] } { i64 2, i64 0, [7 x i8] c"_ZTS1b\00" }, comdat
+// CHECK: @llvm.used = appending global [8 x ptr] [ptr @tysan.module_ctor, ptr @__tysan_v1_Simple_20C_2b_2b_20TBAA, ptr @__tysan_v1_omnipotent_20char, ptr @__tysan_v1_int, ptr @__tysan_v1_any_20pointer, ptr @__tysan_v1_p1_20int, ptr @__tysan_v1___ZTS9CompleteS, ptr @__tysan_v1___ZTS1b], section "llvm.metadata"
+// CHECK: @llvm.global_ctors = appending global [2 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 65535, ptr @_GLOBAL__sub_I_sanitize_type_globals.cpp, ptr null }, { i32, ptr, ptr } { i32 0, ptr @tysan.module_ctor, ptr null }]
 //.
 struct CompleteS {
   int x;
@@ -22,13 +26,18 @@ struct CompleteS {
 
 void f(CompleteS *);
 CompleteS x;
+extern CompleteS xExtern;
 // CHECK-LABEL: define dso_local void @_Z1gv(
 // CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
 // CHECK:  [[ENTRY:.*:]]
 // CHECK:    call void @_Z1fP9CompleteS(ptr noundef @x)
+// CHECK:    call void @_Z1fP9CompleteS(ptr noundef @xExtern)
 // CHECK:    ret void
 //
-void g() { f(&x); }
+void g() {
+  f(&x);
+  f(&xExtern);
+}
 
 typedef struct S IncompleteS;
 void f(IncompleteS *);
@@ -40,11 +49,21 @@ extern IncompleteS y;
 // CHECK:    ret void
 //
 void h() { f(&y); }
+
+class a;
+class b {
+public:
+  using c = a;
+  static c e;
+  b(int, c & = e);
+} d = 0;
+
 //.
 // CHECK: attributes #[[ATTR0]] = { mustprogress noinline nounwind optnone sanitize_type "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+cx8,+mmx,+sse,+sse2,+x87" }
 // CHECK: attributes #[[ATTR1:[0-9]+]] = { "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+cx8,+mmx,+sse,+sse2,+x87" }
-// CHECK: attributes #[[ATTR2:[0-9]+]] = { nounwind "target-features"="+cx8,+mmx,+sse,+sse2,+x87" }
-// CHECK: attributes #[[ATTR3:[0-9]+]] = { nounwind }
+// CHECK: attributes #[[ATTR2:[0-9]+]] = { noinline nounwind sanitize_type "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+cx8,+mmx,+sse,+sse2,+x87" }
+// CHECK: attributes #[[ATTR3:[0-9]+]] = { nounwind "target-features"="+cx8,+mmx,+sse,+sse2,+x87" }
+// CHECK: attributes #[[ATTR4:[0-9]+]] = { nounwind }
 //.
 // CHECK: [[META0:![0-9]+]] = !{ptr @x, [[META1:![0-9]+]]}
 // CHECK: [[META1]] = !{!"_ZTS9CompleteS", [[META2:![0-9]+]], i64 0, [[META5:![0-9]+]], i64 8}
@@ -53,6 +72,8 @@ void h() { f(&y); }
 // CHECK: [[META4]] = !{!"Simple C++ TBAA"}
 // CHECK: [[META5]] = !{!"p1 int", [[META6:![0-9]+]], i64 0}
 // CHECK: [[META6]] = !{!"any pointer", [[META3]], i64 0}
-// CHECK: [[META7:![0-9]+]] = !{i32 1, !"wchar_size", i32 4}
-// CHECK: [[META8:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"}
+// CHECK: [[META7:![0-9]+]] = !{ptr @d, [[META8:![0-9]+]]}
+// CHECK: [[META8]] = !{!"_ZTS1b"}
+// CHECK: [[META9:![0-9]+]] = !{i32 1, !"wchar_size", i32 4}
+// CHECK: [[META10:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"}
 //.
diff --git a/clang/test/CodeGen/scoped-atomic-ops.c b/clang/test/CodeGen/scoped-atomic-ops.c
index cf98812..545a6c9 100644
--- a/clang/test/CodeGen/scoped-atomic-ops.c
+++ b/clang/test/CodeGen/scoped-atomic-ops.c
@@ -1,15 +1,17 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
 // RUN: %clang_cc1 %s -emit-llvm -o - -triple=amdgcn-amd-amdhsa -ffreestanding \
 // RUN:   -fvisibility=hidden | FileCheck --check-prefix=AMDGCN %s
+// RUN: %clang_cc1 %s -emit-llvm -o - -triple=amdgcn-amd-amdhsa -ffreestanding \
+// RUN:   -cl-std=CL2.0 -fvisibility=hidden | FileCheck --check-prefix=AMDGCN %s
 // RUN: %clang_cc1 %s -emit-llvm -o - -triple=spirv64-unknown-unknown -ffreestanding \
 // RUN:   -fvisibility=hidden | FileCheck --check-prefix=SPIRV %s
 
 // AMDGCN-LABEL: define hidden i32 @fi1a(
-// AMDGCN:    [[TMP0:%.*]] = load atomic i32, ptr [[PTR0:.+]] syncscope("one-as") monotonic, align 4
-// AMDGCN:    [[TMP1:%.*]] = load atomic i32, ptr [[PTR1:.+]] syncscope("agent-one-as") monotonic, align 4
-// AMDGCN:    [[TMP2:%.*]] = load atomic i32, ptr [[PTR2:.+]] syncscope("workgroup-one-as") monotonic, align 4
-// AMDGCN:    [[TMP3:%.*]] = load atomic i32, ptr [[PTR3:.+]] syncscope("wavefront-one-as") monotonic, align 4
-// AMDGCN:    [[TMP4:%.*]] = load atomic i32, ptr [[PTR4:.+]] syncscope("singlethread-one-as") monotonic, align 4
+// AMDGCN:    [[TMP0:%.*]] = load atomic i32, ptr [[PTR0:.+]] monotonic, align 4
+// AMDGCN:    [[TMP1:%.*]] = load atomic i32, ptr [[PTR1:.+]] syncscope("agent") monotonic, align 4
+// AMDGCN:    [[TMP2:%.*]] = load atomic i32, ptr [[PTR2:.+]] syncscope("workgroup") monotonic, align 4
+// AMDGCN:    [[TMP3:%.*]] = load atomic i32, ptr [[PTR3:.+]] syncscope("wavefront") monotonic, align 4
+// AMDGCN:    [[TMP4:%.*]] = load atomic i32, ptr [[PTR4:.+]] syncscope("singlethread") monotonic, align 4
 // SPIRV: define hidden spir_func i32 @fi1a(
 // SPIRV:    [[TMP0:%.*]] = load atomic i32, ptr [[PTR0:.+]] monotonic, align 4
 // SPIRV:    [[TMP1:%.*]] = load atomic i32, ptr [[PTR1:.+]] syncscope("device") monotonic, align 4
@@ -27,11 +29,11 @@ int fi1a(int *i) {
 }
 
 // AMDGCN-LABEL: define hidden i32 @fi1b(
-// AMDGCN:    [[TMP0:%.*]] = load atomic i32, ptr [[PTR0:%.+]] syncscope("one-as") monotonic, align 4
-// AMDGCN:    [[TMP1:%.*]] = load atomic i32, ptr [[PTR1:%.+]] syncscope("agent-one-as") monotonic, align 4
-// AMDGCN:    [[TMP2:%.*]] = load atomic i32, ptr [[PTR2:%.+]] syncscope("workgroup-one-as") monotonic, align 4
-// AMDGCN:    [[TMP3:%.*]] = load atomic i32, ptr [[PTR3:%.+]] syncscope("wavefront-one-as") monotonic, align 4
-// AMDGCN:    [[TMP4:%.*]] = load atomic i32, ptr [[PTR4:%.+]] syncscope("singlethread-one-as") monotonic, align 4
+// AMDGCN:    [[TMP0:%.*]] = load atomic i32, ptr [[PTR0:%.+]] monotonic, align 4
+// AMDGCN:    [[TMP1:%.*]] = load atomic i32, ptr [[PTR1:%.+]] syncscope("agent") monotonic, align 4
+// AMDGCN:    [[TMP2:%.*]] = load atomic i32, ptr [[PTR2:%.+]] syncscope("workgroup") monotonic, align 4
+// AMDGCN:    [[TMP3:%.*]] = load atomic i32, ptr [[PTR3:%.+]] syncscope("wavefront") monotonic, align 4
+// AMDGCN:    [[TMP4:%.*]] = load atomic i32, ptr [[PTR4:%.+]] syncscope("singlethread") monotonic, align 4
 // SPIRV-LABEL: define hidden spir_func i32 @fi1b(
 // SPIRV:    [[TMP0:%.*]] = load atomic i32, ptr [[PTR0:%.+]] monotonic, align 4
 // SPIRV:    [[TMP1:%.*]] = load atomic i32, ptr [[PTR1:%.+]] syncscope("device") monotonic, align 4
@@ -48,11 +50,11 @@ int fi1b(int *i) {
 }
 
 // AMDGCN-LABEL: define hidden void @fi2a(
-// AMDGCN:    store atomic i32 [[TMP0:%.+]], ptr [[PTR0:%.+]] syncscope("one-as") monotonic, align 4
-// AMDGCN:    store atomic i32 [[TMP1:%.+]], ptr [[PTR1:%.+]] syncscope("agent-one-as") monotonic, align 4
-// AMDGCN:    store atomic i32 [[TMP2:%.+]], ptr [[PTR2:%.+]] syncscope("workgroup-one-as") monotonic, align 4
-// AMDGCN:    store atomic i32 [[TMP3:%.+]], ptr [[PTR3:%.+]] syncscope("wavefront-one-as") monotonic, align 4
-// AMDGCN:    store atomic i32 [[TMP4:%.+]], ptr [[PTR4:%.+]] syncscope("singlethread-one-as") monotonic, align 4
+// AMDGCN:    store atomic i32 [[TMP0:%.+]], ptr [[PTR0:%.+]] monotonic, align 4
+// AMDGCN:    store atomic i32 [[TMP1:%.+]], ptr [[PTR1:%.+]] syncscope("agent") monotonic, align 4
+// AMDGCN:    store atomic i32 [[TMP2:%.+]], ptr [[PTR2:%.+]] syncscope("workgroup") monotonic, align 4
+// AMDGCN:    store atomic i32 [[TMP3:%.+]], ptr [[PTR3:%.+]] syncscope("wavefront") monotonic, align 4
+// AMDGCN:    store atomic i32 [[TMP4:%.+]], ptr [[PTR4:%.+]] syncscope("singlethread") monotonic, align 4
 // SPIRV-LABEL: define hidden spir_func void @fi2a(
 // SPIRV:    store atomic i32 [[TMP0:%.+]], ptr [[PTR0:%.+]] monotonic, align 4
 // SPIRV:    store atomic i32 [[TMP1:%.+]], ptr [[PTR1:%.+]] syncscope("device") monotonic, align 4
@@ -69,11 +71,11 @@ void fi2a(int *i) {
 }
 
 // AMDGCN-LABEL: define hidden void @fi2b(
-// AMDGCN:    store atomic i32 [[TMP0:%.+]], ptr [[PTR0:%.+]] syncscope("one-as") monotonic, align 4
-// AMDGCN:    store atomic i32 [[TMP1:%.+]], ptr [[PTR1:%.+]] syncscope("agent-one-as") monotonic, align 4
-// AMDGCN:    store atomic i32 [[TMP2:%.+]], ptr [[PTR2:%.+]] syncscope("workgroup-one-as") monotonic, align 4
-// AMDGCN:    store atomic i32 [[TMP3:%.+]], ptr [[PTR3:%.+]] syncscope("wavefront-one-as") monotonic, align 4
-// AMDGCN:    store atomic i32 [[TMP4:%.+]], ptr [[PTR4:%.+]] syncscope("singlethread-one-as") monotonic, align 4
+// AMDGCN:    store atomic i32 [[TMP0:%.+]], ptr [[PTR0:%.+]] monotonic, align 4
+// AMDGCN:    store atomic i32 [[TMP1:%.+]], ptr [[PTR1:%.+]] syncscope("agent") monotonic, align 4
+// AMDGCN:    store atomic i32 [[TMP2:%.+]], ptr [[PTR2:%.+]] syncscope("workgroup") monotonic, align 4
+// AMDGCN:    store atomic i32 [[TMP3:%.+]], ptr [[PTR3:%.+]] syncscope("wavefront") monotonic, align 4
+// AMDGCN:    store atomic i32 [[TMP4:%.+]], ptr [[PTR4:%.+]] syncscope("singlethread") monotonic, align 4
 // SPIRV-LABEL: define hidden spir_func void @fi2b(
 // SPIRV:    store atomic i32 [[TMP0:%.+]], ptr [[PTR0:%.+]] monotonic, align 4
 // SPIRV:    store atomic i32 [[TMP1:%.+]], ptr [[PTR1:%.+]] syncscope("device") monotonic, align 4
@@ -89,14 +91,14 @@ void fi2b(int *i) {
 }
 
 // AMDGCN-LABEL: define hidden void @fi3a(
-// AMDGCN:    [[TMP0:%.*]] = atomicrmw add ptr [[PTR0:%.+]], i32 [[VAL0:.+]] syncscope("one-as") monotonic, align 4
-// AMDGCN:    [[TMP1:%.*]] = atomicrmw sub ptr [[PTR1:%.+]], i32 [[VAL1:.+]] syncscope("one-as") monotonic, align 4
-// AMDGCN:    [[TMP2:%.*]] = atomicrmw and ptr [[PTR2:%.+]], i32 [[VAL2:.+]] syncscope("one-as") monotonic, align 4
-// AMDGCN:    [[TMP3:%.*]] = atomicrmw or ptr [[PTR3:%.+]], i32 [[VAL3:.+]] syncscope("one-as") monotonic, align 4
-// AMDGCN:    [[TMP4:%.*]] = atomicrmw xor ptr [[PTR4:%.+]], i32 [[VAL4:.+]] syncscope("one-as") monotonic, align 4
-// AMDGCN:    [[TMP5:%.*]] = atomicrmw nand ptr [[PTR5:%.+]], i32 [[VAL5:.+]] syncscope("one-as") monotonic, align 4
-// AMDGCN:    [[TMP6:%.*]] = atomicrmw min ptr [[PTR6:%.+]], i32 [[VAL6:.+]] syncscope("one-as") monotonic, align 4
-// AMDGCN:    [[TMP7:%.*]] = atomicrmw max ptr [[PTR7:%.+]], i32 [[VAL7:.+]] syncscope("one-as") monotonic, align 4
+// AMDGCN:    [[TMP0:%.*]] = atomicrmw add ptr [[PTR0:%.+]], i32 [[VAL0:.+]] monotonic, align 4
+// AMDGCN:    [[TMP1:%.*]] = atomicrmw sub ptr [[PTR1:%.+]], i32 [[VAL1:.+]] monotonic, align 4
+// AMDGCN:    [[TMP2:%.*]] = atomicrmw and ptr [[PTR2:%.+]], i32 [[VAL2:.+]] monotonic, align 4
+// AMDGCN:    [[TMP3:%.*]] = atomicrmw or ptr [[PTR3:%.+]], i32 [[VAL3:.+]] monotonic, align 4
+// AMDGCN:    [[TMP4:%.*]] = atomicrmw xor ptr [[PTR4:%.+]], i32 [[VAL4:.+]] monotonic, align 4
+// AMDGCN:    [[TMP5:%.*]] = atomicrmw nand ptr [[PTR5:%.+]], i32 [[VAL5:.+]] monotonic, align 4
+// AMDGCN:    [[TMP6:%.*]] = atomicrmw min ptr [[PTR6:%.+]], i32 [[VAL6:.+]] monotonic, align 4
+// AMDGCN:    [[TMP7:%.*]] = atomicrmw max ptr [[PTR7:%.+]], i32 [[VAL7:.+]] monotonic, align 4
 // SPIRV-LABEL: define hidden spir_func void @fi3a(
 // SPIRV:    [[TMP0:%.*]] = atomicrmw add ptr [[PTR0:%.+]], i32 [[VAL0:.+]] monotonic, align 4
 // SPIRV:    [[TMP1:%.*]] = atomicrmw sub ptr [[PTR1:%.+]], i32 [[VAL1:.+]] monotonic, align 4
@@ -118,14 +120,14 @@ void fi3a(int *a, int *b, int *c, int *d, int *e, int *f, int *g, int *h) {
 }
 
 // AMDGCN-LABEL: define hidden void @fi3b(
-// AMDGCN:    [[TMP0:%.*]] = atomicrmw add ptr [[PTR0:%.+]], i32 [[VAL0:.+]] syncscope("agent-one-as") monotonic, align 4
-// AMDGCN:    [[TMP1:%.*]] = atomicrmw sub ptr [[PTR1:%.+]], i32 [[VAL1:.+]] syncscope("agent-one-as") monotonic, align 4
-// AMDGCN:    [[TMP2:%.*]] = atomicrmw and ptr [[PTR2:%.+]], i32 [[VAL2:.+]] syncscope("agent-one-as") monotonic, align 4
-// AMDGCN:    [[TMP3:%.*]] = atomicrmw or ptr [[PTR3:%.+]], i32 [[VAL3:.+]] syncscope("agent-one-as") monotonic, align 4
-// AMDGCN:    [[TMP4:%.*]] = atomicrmw xor ptr [[PTR4:%.+]], i32 [[VAL4:.+]] syncscope("agent-one-as") monotonic, align 4
-// AMDGCN:    [[TMP5:%.*]] = atomicrmw nand ptr [[PTR5:%.+]], i32 [[VAL5:.+]] syncscope("agent-one-as") monotonic, align 4
-// AMDGCN:    [[TMP6:%.*]] = atomicrmw min ptr [[PTR6:%.+]], i32 [[VAL6:.+]] syncscope("agent-one-as") monotonic, align 4
-// AMDGCN:    [[TMP7:%.*]] = atomicrmw max ptr [[PTR7:%.+]], i32 [[VAL7:.+]] syncscope("agent-one-as") monotonic, align 4
+// AMDGCN:    [[TMP0:%.*]] = atomicrmw add ptr [[PTR0:%.+]], i32 [[VAL0:.+]] syncscope("agent") monotonic, align 4
+// AMDGCN:    [[TMP1:%.*]] = atomicrmw sub ptr [[PTR1:%.+]], i32 [[VAL1:.+]] syncscope("agent") monotonic, align 4
+// AMDGCN:    [[TMP2:%.*]] = atomicrmw and ptr [[PTR2:%.+]], i32 [[VAL2:.+]] syncscope("agent") monotonic, align 4
+// AMDGCN:    [[TMP3:%.*]] = atomicrmw or ptr [[PTR3:%.+]], i32 [[VAL3:.+]] syncscope("agent") monotonic, align 4
+// AMDGCN:    [[TMP4:%.*]] = atomicrmw xor ptr [[PTR4:%.+]], i32 [[VAL4:.+]] syncscope("agent") monotonic, align 4
+// AMDGCN:    [[TMP5:%.*]] = atomicrmw nand ptr [[PTR5:%.+]], i32 [[VAL5:.+]] syncscope("agent") monotonic, align 4
+// AMDGCN:    [[TMP6:%.*]] = atomicrmw min ptr [[PTR6:%.+]], i32 [[VAL6:.+]] syncscope("agent") monotonic, align 4
+// AMDGCN:    [[TMP7:%.*]] = atomicrmw max ptr [[PTR7:%.+]], i32 [[VAL7:.+]] syncscope("agent") monotonic, align 4
 // SPIRV-LABEL: define hidden spir_func void @fi3b(
 // SPIRV:    [[TMP0:%.*]] = atomicrmw add ptr [[PTR0:%.+]], i32 [[VAL0:.+]] syncscope("device") monotonic, align 4
 // SPIRV:    [[TMP1:%.*]] = atomicrmw sub ptr [[PTR1:%.+]], i32 [[VAL1:.+]] syncscope("device") monotonic, align 4
@@ -147,14 +149,14 @@ void fi3b(int *a, int *b, int *c, int *d, int *e, int *f, int *g, int *h) {
 }
 
 // AMDGCN-LABEL: define hidden void @fi3c(
-// AMDGCN:    [[TMP0:%.*]] = atomicrmw add ptr [[PTR0:%.+]], i32 [[VAL0:.+]] syncscope("workgroup-one-as") monotonic, align 4
-// AMDGCN:    [[TMP1:%.*]] = atomicrmw sub ptr [[PTR1:%.+]], i32 [[VAL1:.+]] syncscope("workgroup-one-as") monotonic, align 4
-// AMDGCN:    [[TMP2:%.*]] = atomicrmw and ptr [[PTR2:%.+]], i32 [[VAL2:.+]] syncscope("workgroup-one-as") monotonic, align 4
-// AMDGCN:    [[TMP3:%.*]] = atomicrmw or ptr [[PTR3:%.+]], i32 [[VAL3:.+]] syncscope("workgroup-one-as") monotonic, align 4
-// AMDGCN:    [[TMP4:%.*]] = atomicrmw xor ptr [[PTR4:%.+]], i32 [[VAL4:.+]] syncscope("workgroup-one-as") monotonic, align 4
-// AMDGCN:    [[TMP5:%.*]] = atomicrmw nand ptr [[PTR5:%.+]], i32 [[VAL5:.+]] syncscope("workgroup-one-as") monotonic, align 4
-// AMDGCN:    [[TMP6:%.*]] = atomicrmw min ptr [[PTR6:%.+]], i32 [[VAL6:.+]] syncscope("workgroup-one-as") monotonic, align 4
-// AMDGCN:    [[TMP7:%.*]] = atomicrmw max ptr [[PTR7:%.+]], i32 [[VAL7:.+]] syncscope("workgroup-one-as") monotonic, align 4
+// AMDGCN:    [[TMP0:%.*]] = atomicrmw add ptr [[PTR0:%.+]], i32 [[VAL0:.+]] syncscope("workgroup") monotonic, align 4
+// AMDGCN:    [[TMP1:%.*]] = atomicrmw sub ptr [[PTR1:%.+]], i32 [[VAL1:.+]] syncscope("workgroup") monotonic, align 4
+// AMDGCN:    [[TMP2:%.*]] = atomicrmw and ptr [[PTR2:%.+]], i32 [[VAL2:.+]] syncscope("workgroup") monotonic, align 4
+// AMDGCN:    [[TMP3:%.*]] = atomicrmw or ptr [[PTR3:%.+]], i32 [[VAL3:.+]] syncscope("workgroup") monotonic, align 4
+// AMDGCN:    [[TMP4:%.*]] = atomicrmw xor ptr [[PTR4:%.+]], i32 [[VAL4:.+]] syncscope("workgroup") monotonic, align 4
+// AMDGCN:    [[TMP5:%.*]] = atomicrmw nand ptr [[PTR5:%.+]], i32 [[VAL5:.+]] syncscope("workgroup") monotonic, align 4
+// AMDGCN:    [[TMP6:%.*]] = atomicrmw min ptr [[PTR6:%.+]], i32 [[VAL6:.+]] syncscope("workgroup") monotonic, align 4
+// AMDGCN:    [[TMP7:%.*]] = atomicrmw max ptr [[PTR7:%.+]], i32 [[VAL7:.+]] syncscope("workgroup") monotonic, align 4
 // SPIRV-LABEL: define hidden spir_func void @fi3c(
 // SPIRV:    [[TMP0:%.*]] = atomicrmw add ptr [[PTR0:%.+]], i32 [[VAL0:.+]] syncscope("workgroup") monotonic, align 4
 // SPIRV:    [[TMP1:%.*]] = atomicrmw sub ptr [[PTR1:%.+]], i32 [[VAL1:.+]] syncscope("workgroup") monotonic, align 4
@@ -176,14 +178,14 @@ void fi3c(int *a, int *b, int *c, int *d, int *e, int *f, int *g, int *h) {
 }
 
 // AMDGCN-LABEL: define hidden void @fi3d(
-// AMDGCN:    [[TMP0:%.*]] = atomicrmw add ptr [[PTR0:%.+]], i32 [[VAL0:.+]] syncscope("wavefront-one-as") monotonic, align 4
-// AMDGCN:    [[TMP1:%.*]] = atomicrmw sub ptr [[PTR1:%.+]], i32 [[VAL1:.+]] syncscope("wavefront-one-as") monotonic, align 4
-// AMDGCN:    [[TMP2:%.*]] = atomicrmw and ptr [[PTR2:%.+]], i32 [[VAL2:.+]] syncscope("wavefront-one-as") monotonic, align 4
-// AMDGCN:    [[TMP3:%.*]] = atomicrmw or ptr [[PTR3:%.+]], i32 [[VAL3:.+]] syncscope("wavefront-one-as") monotonic, align 4
-// AMDGCN:    [[TMP4:%.*]] = atomicrmw xor ptr [[PTR4:%.+]], i32 [[VAL4:.+]] syncscope("wavefront-one-as") monotonic, align 4
-// AMDGCN:    [[TMP5:%.*]] = atomicrmw nand ptr [[PTR5:%.+]], i32 [[VAL5:.+]] syncscope("wavefront-one-as") monotonic, align 4
-// AMDGCN:    [[TMP6:%.*]] = atomicrmw min ptr [[PTR6:%.+]], i32 [[VAL6:.+]] syncscope("wavefront-one-as") monotonic, align 4
-// AMDGCN:    [[TMP7:%.*]] = atomicrmw max ptr [[PTR7:%.+]], i32 [[VAL7:.+]] syncscope("wavefront-one-as") monotonic, align 4
+// AMDGCN:    [[TMP0:%.*]] = atomicrmw add ptr [[PTR0:%.+]], i32 [[VAL0:.+]] syncscope("wavefront") monotonic, align 4
+// AMDGCN:    [[TMP1:%.*]] = atomicrmw sub ptr [[PTR1:%.+]], i32 [[VAL1:.+]] syncscope("wavefront") monotonic, align 4
+// AMDGCN:    [[TMP2:%.*]] = atomicrmw and ptr [[PTR2:%.+]], i32 [[VAL2:.+]] syncscope("wavefront") monotonic, align 4
+// AMDGCN:    [[TMP3:%.*]] = atomicrmw or ptr [[PTR3:%.+]], i32 [[VAL3:.+]] syncscope("wavefront") monotonic, align 4
+// AMDGCN:    [[TMP4:%.*]] = atomicrmw xor ptr [[PTR4:%.+]], i32 [[VAL4:.+]] syncscope("wavefront") monotonic, align 4
+// AMDGCN:    [[TMP5:%.*]] = atomicrmw nand ptr [[PTR5:%.+]], i32 [[VAL5:.+]] syncscope("wavefront") monotonic, align 4
+// AMDGCN:    [[TMP6:%.*]] = atomicrmw min ptr [[PTR6:%.+]], i32 [[VAL6:.+]] syncscope("wavefront") monotonic, align 4
+// AMDGCN:    [[TMP7:%.*]] = atomicrmw max ptr [[PTR7:%.+]], i32 [[VAL7:.+]] syncscope("wavefront") monotonic, align 4
 // SPIRV-LABEL: define hidden spir_func void @fi3d(
 // SPIRV:    [[TMP0:%.*]] = atomicrmw add ptr [[PTR0:%.+]], i32 [[VAL0:.+]] syncscope("subgroup") monotonic, align 4
 // SPIRV:    [[TMP1:%.*]] = atomicrmw sub ptr [[PTR1:%.+]], i32 [[VAL1:.+]] syncscope("subgroup") monotonic, align 4
@@ -205,14 +207,14 @@ void fi3d(int *a, int *b, int *c, int *d, int *e, int *f, int *g, int *h) {
 }
 
 // AMDGCN-LABEL: define hidden void @fi3e(
-// AMDGCN:    [[TMP0:%.*]] = atomicrmw add ptr [[PTR0:%.+]], i32 [[VAL0:.+]] syncscope("singlethread-one-as") monotonic, align 4
-// AMDGCN:    [[TMP1:%.*]] = atomicrmw sub ptr [[PTR1:%.+]], i32 [[VAL1:.+]] syncscope("singlethread-one-as") monotonic, align 4
-// AMDGCN:    [[TMP2:%.*]] = atomicrmw and ptr [[PTR2:%.+]], i32 [[VAL2:.+]] syncscope("singlethread-one-as") monotonic, align 4
-// AMDGCN:    [[TMP3:%.*]] = atomicrmw or ptr [[PTR3:%.+]], i32 [[VAL3:.+]] syncscope("singlethread-one-as") monotonic, align 4
-// AMDGCN:    [[TMP4:%.*]] = atomicrmw xor ptr [[PTR4:%.+]], i32 [[VAL4:.+]] syncscope("singlethread-one-as") monotonic, align 4
-// AMDGCN:    [[TMP5:%.*]] = atomicrmw nand ptr [[PTR5:%.+]], i32 [[VAL5:.+]] syncscope("singlethread-one-as") monotonic, align 4
-// AMDGCN:    [[TMP6:%.*]] = atomicrmw min ptr [[PTR6:%.+]], i32 [[VAL6:.+]] syncscope("singlethread-one-as") monotonic, align 4
-// AMDGCN:    [[TMP7:%.*]] = atomicrmw max ptr [[PTR7:%.+]], i32 [[VAL7:.+]] syncscope("singlethread-one-as") monotonic, align 4
+// AMDGCN:    [[TMP0:%.*]] = atomicrmw add ptr [[PTR0:%.+]], i32 [[VAL0:.+]] syncscope("singlethread") monotonic, align 4
+// AMDGCN:    [[TMP1:%.*]] = atomicrmw sub ptr [[PTR1:%.+]], i32 [[VAL1:.+]] syncscope("singlethread") monotonic, align 4
+// AMDGCN:    [[TMP2:%.*]] = atomicrmw and ptr [[PTR2:%.+]], i32 [[VAL2:.+]] syncscope("singlethread") monotonic, align 4
+// AMDGCN:    [[TMP3:%.*]] = atomicrmw or ptr [[PTR3:%.+]], i32 [[VAL3:.+]] syncscope("singlethread") monotonic, align 4
+// AMDGCN:    [[TMP4:%.*]] = atomicrmw xor ptr [[PTR4:%.+]], i32 [[VAL4:.+]] syncscope("singlethread") monotonic, align 4
+// AMDGCN:    [[TMP5:%.*]] = atomicrmw nand ptr [[PTR5:%.+]], i32 [[VAL5:.+]] syncscope("singlethread") monotonic, align 4
+// AMDGCN:    [[TMP6:%.*]] = atomicrmw min ptr [[PTR6:%.+]], i32 [[VAL6:.+]] syncscope("singlethread") monotonic, align 4
+// AMDGCN:    [[TMP7:%.*]] = atomicrmw max ptr [[PTR7:%.+]], i32 [[VAL7:.+]] syncscope("singlethread") monotonic, align 4
 // SPIRV-LABEL: define hidden spir_func void @fi3e(
 // SPIRV:    [[TMP0:%.*]] = atomicrmw add ptr [[PTR0:%.+]], i32 [[VAL0:.+]] syncscope("singlethread") monotonic, align 4
 // SPIRV:    [[TMP1:%.*]] = atomicrmw sub ptr [[PTR1:%.+]], i32 [[VAL1:.+]] syncscope("singlethread") monotonic, align 4
@@ -234,7 +236,7 @@ void fi3e(int *a, int *b, int *c, int *d, int *e, int *f, int *g, int *h) {
 }
 
 // AMDGCN-LABEL: define hidden zeroext i1 @fi4a(
-// AMDGCN-DAG:    [[TMP0:%.*]] = cmpxchg ptr [[PTR0:%.+]], i32 [[VAL0:.+]], i32 [[VAL1:.+]] syncscope("one-as") acquire acquire, align 4
+// AMDGCN-DAG:    [[TMP0:%.*]] = cmpxchg ptr [[PTR0:%.+]], i32 [[VAL0:.+]], i32 [[VAL1:.+]] acquire acquire, align 4
 // SPIRV-LABEL: define hidden spir_func zeroext i1 @fi4a(
 // SPIRV-DAG:    [[TMP0:%.*]] = cmpxchg ptr [[PTR0:%.+]], i32 [[VAL0:.+]], i32 [[VAL1:.+]] acquire acquire, align 4
 _Bool fi4a(int *i) {
@@ -246,7 +248,7 @@ _Bool fi4a(int *i) {
 }
 
 // AMDGCN-LABEL: define hidden zeroext i1 @fi4b(
-// AMDGCN-DAG:    [[TMP0:%.*]] = cmpxchg ptr [[PTR0:%.+]], i32 [[VAL0:.+]], i32 [[VAL1:.+]] syncscope("agent-one-as") acquire acquire, align 4
+// AMDGCN-DAG:    [[TMP0:%.*]] = cmpxchg ptr [[PTR0:%.+]], i32 [[VAL0:.+]], i32 [[VAL1:.+]] syncscope("agent") acquire acquire, align 4
 // SPIRV-LABEL: define hidden spir_func zeroext i1 @fi4b(
 // SPIRV-DAG:    [[TMP0:%.*]] = cmpxchg ptr [[PTR0:%.+]], i32 [[VAL0:.+]], i32 [[VAL1:.+]] syncscope("device") acquire acquire, align 4
 _Bool fi4b(int *i) {
@@ -258,7 +260,7 @@ _Bool fi4b(int *i) {
 }
 
 // AMDGCN-LABEL: define hidden zeroext i1 @fi4c(
-// AMDGCN:    [[TMP0:%.*]] = cmpxchg ptr [[PTR0:%.+]], i32 [[VAL0:.+]], i32 [[VAL1:.+]] syncscope("workgroup-one-as") acquire acquire, align 4
+// AMDGCN:    [[TMP0:%.*]] = cmpxchg ptr [[PTR0:%.+]], i32 [[VAL0:.+]], i32 [[VAL1:.+]] syncscope("workgroup") acquire acquire, align 4
 // SPIRV-LABEL: define hidden spir_func zeroext i1 @fi4c(
 // SPIRV:    [[TMP0:%.*]] = cmpxchg ptr [[PTR0:%.+]], i32 [[VAL0:.+]], i32 [[VAL1:.+]] syncscope("workgroup") acquire acquire, align 4
 _Bool fi4c(int *i) {
@@ -270,7 +272,7 @@ _Bool fi4c(int *i) {
 }
 
 // AMDGCN-LABEL: define hidden zeroext i1 @fi4d(
-// AMDGCN:    [[TMP0:%.*]] = cmpxchg ptr [[PTR0:%.+]], i32 [[VAL0:.+]], i32 [[VAL1:.+]] syncscope("wavefront-one-as") acquire acquire, align 4
+// AMDGCN:    [[TMP0:%.*]] = cmpxchg ptr [[PTR0:%.+]], i32 [[VAL0:.+]], i32 [[VAL1:.+]] syncscope("wavefront") acquire acquire, align 4
 // SPIRV-LABEL: define hidden spir_func zeroext i1 @fi4d(
 // SPIRV:    [[TMP0:%.*]] = cmpxchg ptr [[PTR0:%.+]], i32 [[VAL0:.+]], i32 [[VAL1:.+]] syncscope("subgroup") acquire acquire, align 4
 _Bool fi4d(int *i) {
@@ -282,7 +284,7 @@ _Bool fi4d(int *i) {
 }
 
 // AMDGCN-LABEL: define hidden zeroext i1 @fi4e(
-// AMDGCN:    [[TMP0:%.*]] = cmpxchg ptr [[PTR0:%.+]], i32 [[VAL0:.+]], i32 [[VAL1:.+]] syncscope("singlethread-one-as") acquire acquire, align 4
+// AMDGCN:    [[TMP0:%.*]] = cmpxchg ptr [[PTR0:%.+]], i32 [[VAL0:.+]], i32 [[VAL1:.+]] syncscope("singlethread") acquire acquire, align 4
 // SPIRV-LABEL: define hidden spir_func zeroext i1 @fi4e(
 // SPIRV:    [[TMP0:%.*]] = cmpxchg ptr [[PTR0:%.+]], i32 [[VAL0:.+]], i32 [[VAL1:.+]] syncscope("singlethread") acquire acquire, align 4
 _Bool fi4e(int *i) {
@@ -294,7 +296,7 @@ _Bool fi4e(int *i) {
 }
 
 // AMDGCN-LABEL: define hidden zeroext i1 @fi5a(
-// AMDGCN:    [[TMP0:%.*]] = cmpxchg weak ptr [[PTR0:%.+]], i32 [[VAL0:.+]], i32 [[VAL1:.+]] syncscope("one-as") acquire acquire, align 4
+// AMDGCN:    [[TMP0:%.*]] = cmpxchg weak ptr [[PTR0:%.+]], i32 [[VAL0:.+]], i32 [[VAL1:.+]] acquire acquire, align 4
 // SPIRV-LABEL: define hidden spir_func zeroext i1 @fi5a(
 // SPIRV:    [[TMP0:%.*]] = cmpxchg weak ptr [[PTR0:%.+]], i32 [[VAL0:.+]], i32 [[VAL1:.+]] acquire acquire, align 4
 _Bool fi5a(int *i) {
@@ -305,7 +307,7 @@ _Bool fi5a(int *i) {
 }
 
 // AMDGCN-LABEL: define hidden zeroext i1 @fi5b(
-// AMDGCN:    [[TMP0:%.*]] = cmpxchg weak ptr [[PTR0:%.+]], i32 [[VAL0:.+]], i32 [[VAL1:.+]] syncscope("agent-one-as") acquire acquire, align 4
+// AMDGCN:    [[TMP0:%.*]] = cmpxchg weak ptr [[PTR0:%.+]], i32 [[VAL0:.+]], i32 [[VAL1:.+]] syncscope("agent") acquire acquire, align 4
 // SPIRV-LABEL: define hidden spir_func zeroext i1 @fi5b(
 // SPIRV:    [[TMP0:%.*]] = cmpxchg weak ptr [[PTR0:%.+]], i32 [[VAL0:.+]], i32 [[VAL1:.+]] syncscope("device") acquire acquire, align 4
 _Bool fi5b(int *i) {
@@ -316,7 +318,7 @@ _Bool fi5b(int *i) {
 }
 
 // AMDGCN-LABEL: define hidden zeroext i1 @fi5c(
-// AMDGCN:    [[TMP0:%.*]] = cmpxchg weak ptr [[PTR0:%.+]], i32 [[VAL0:.+]], i32 [[VAL1:.+]] syncscope("workgroup-one-as") acquire acquire, align 4
+// AMDGCN:    [[TMP0:%.*]] = cmpxchg weak ptr [[PTR0:%.+]], i32 [[VAL0:.+]], i32 [[VAL1:.+]] syncscope("workgroup") acquire acquire, align 4
 // SPIRV-LABEL: define hidden spir_func zeroext i1 @fi5c(
 // SPIRV:    [[TMP0:%.*]] = cmpxchg weak ptr [[PTR0:%.+]], i32 [[VAL0:.+]], i32 [[VAL1:.+]] syncscope("workgroup") acquire acquire, align 4
 _Bool fi5c(int *i) {
@@ -326,7 +328,7 @@ _Bool fi5c(int *i) {
 }
 
 // AMDGCN-LABEL: define hidden zeroext i1 @fi5d(
-// AMDGCN:    [[TMP0:%.*]] = cmpxchg weak ptr [[PTR0:%.+]], i32 [[VAL0:.+]], i32 [[VAL1:.+]] syncscope("wavefront-one-as") acquire acquire, align 4
+// AMDGCN:    [[TMP0:%.*]] = cmpxchg weak ptr [[PTR0:%.+]], i32 [[VAL0:.+]], i32 [[VAL1:.+]] syncscope("wavefront") acquire acquire, align 4
 // SPIRV-LABEL: define hidden spir_func zeroext i1 @fi5d(
 // SPIRV:    [[TMP0:%.*]] = cmpxchg weak ptr [[PTR0:%.+]], i32 [[VAL0:.+]], i32 [[VAL1:.+]] syncscope("subgroup") acquire acquire, align 4
 _Bool fi5d(int *i) {
@@ -336,7 +338,7 @@ _Bool fi5d(int *i) {
 }
 
 // AMDGCN-LABEL: define hidden zeroext i1 @fi5e(
-// AMDGCN:    [[TMP0:%.*]] = cmpxchg weak ptr [[PTR0:%.+]], i32 [[VAL0:.+]], i32 [[VAL1:.+]] syncscope("singlethread-one-as") acquire acquire, align 4
+// AMDGCN:    [[TMP0:%.*]] = cmpxchg weak ptr [[PTR0:%.+]], i32 [[VAL0:.+]], i32 [[VAL1:.+]] syncscope("singlethread") acquire acquire, align 4
 // SPIRV-LABEL: define hidden spir_func zeroext i1 @fi5e(
 // SPIRV:    [[TMP0:%.*]] = cmpxchg weak ptr [[PTR0:%.+]], i32 [[VAL0:.+]], i32 [[VAL1:.+]] syncscope("singlethread") acquire acquire, align 4
 _Bool fi5e(int *i) {
@@ -346,7 +348,7 @@ _Bool fi5e(int *i) {
 }
 
 // AMDGCN-LABEL: define hidden i32 @fi6a(
-// AMDGCN:    [[TMP0:%.*]] = atomicrmw xchg ptr [[PTR0:%.+]], i32 [[VAL0:.+]] syncscope("one-as") monotonic, align 4
+// AMDGCN:    [[TMP0:%.*]] = atomicrmw xchg ptr [[PTR0:%.+]], i32 [[VAL0:.+]] monotonic, align 4
 // SPIRV-LABEL: define hidden spir_func i32 @fi6a(
 // SPIRV:    [[TMP0:%.*]] = atomicrmw xchg ptr [[PTR0:%.+]], i32 [[VAL0:.+]] monotonic, align 4
 int fi6a(int *c, int *d) {
@@ -356,7 +358,7 @@ int fi6a(int *c, int *d) {
 }
 
 // AMDGCN-LABEL: define hidden i32 @fi6b(
-// AMDGCN:    [[TMP0:%.*]] = atomicrmw xchg ptr [[PTR0:%.+]], i32 [[VAL0:.+]] syncscope("agent-one-as") monotonic, align 4
+// AMDGCN:    [[TMP0:%.*]] = atomicrmw xchg ptr [[PTR0:%.+]], i32 [[VAL0:.+]] syncscope("agent") monotonic, align 4
 // SPIRV-LABEL: define hidden spir_func i32 @fi6b(
 // SPIRV:    [[TMP0:%.*]] = atomicrmw xchg ptr [[PTR0:%.+]], i32 [[VAL0:.+]] syncscope("device") monotonic, align 4
 int fi6b(int *c, int *d) {
@@ -366,7 +368,7 @@ int fi6b(int *c, int *d) {
 }
 
 // AMDGCN-LABEL: define hidden i32 @fi6c(
-// AMDGCN:    [[TMP0:%.*]] = atomicrmw xchg ptr [[PTR0:%.+]], i32 [[VAL0:.+]] syncscope("workgroup-one-as") monotonic, align 4
+// AMDGCN:    [[TMP0:%.*]] = atomicrmw xchg ptr [[PTR0:%.+]], i32 [[VAL0:.+]] syncscope("workgroup") monotonic, align 4
 // SPIRV-LABEL: define hidden spir_func i32 @fi6c(
 // SPIRV:    [[TMP0:%.*]] = atomicrmw xchg ptr [[PTR0:%.+]], i32 [[VAL0:.+]] syncscope("workgroup") monotonic, align 4
 int fi6c(int *c, int *d) {
@@ -376,7 +378,7 @@ int fi6c(int *c, int *d) {
 }
 
 // AMDGCN-LABEL: define hidden i32 @fi6d(
-// AMDGCN:    [[TMP0:%.*]] = atomicrmw xchg ptr [[PTR0:%.+]], i32 [[VAL0:.+]] syncscope("wavefront-one-as") monotonic, align 4
+// AMDGCN:    [[TMP0:%.*]] = atomicrmw xchg ptr [[PTR0:%.+]], i32 [[VAL0:.+]] syncscope("wavefront") monotonic, align 4
 // SPIRV-LABEL: define hidden spir_func i32 @fi6d(
 // SPIRV:    [[TMP0:%.*]] = atomicrmw xchg ptr [[PTR0:%.+]], i32 [[VAL0:.+]] syncscope("subgroup") monotonic, align 4
 int fi6d(int *c, int *d) {
@@ -386,7 +388,7 @@ int fi6d(int *c, int *d) {
 }
 
 // AMDGCN-LABEL: define hidden i32 @fi6e(
-// AMDGCN:    [[TMP0:%.*]] = atomicrmw xchg ptr [[PTR0:%.+]], i32 [[VAL0:.+]] syncscope("singlethread-one-as") monotonic, align 4
+// AMDGCN:    [[TMP0:%.*]] = atomicrmw xchg ptr [[PTR0:%.+]], i32 [[VAL0:.+]] syncscope("singlethread") monotonic, align 4
 // SPIRV-LABEL: define hidden spir_func i32 @fi6e(
 // SPIRV:    [[TMP0:%.*]] = atomicrmw xchg ptr [[PTR0:%.+]], i32 [[VAL0:.+]] syncscope("singlethread") monotonic, align 4
 int fi6e(int *c, int *d) {
@@ -396,7 +398,7 @@ int fi6e(int *c, int *d) {
 }
 
 // AMDGCN-LABEL: define hidden zeroext i1 @fi7a(
-// AMDGCN:    [[TMP0:%.*]] = atomicrmw xchg ptr [[PTR0:%.+]], i8 [[VAL0:.+]] syncscope("one-as") monotonic, align 1
+// AMDGCN:    [[TMP0:%.*]] = atomicrmw xchg ptr [[PTR0:%.+]], i8 [[VAL0:.+]] monotonic, align 1
 // SPIRV-LABEL: define hidden spir_func zeroext i1 @fi7a(
 // SPIRV:    [[TMP0:%.*]] = atomicrmw xchg ptr [[PTR0:%.+]], i8 [[VAL0:.+]] monotonic, align 1
 _Bool fi7a(_Bool *c) {
@@ -405,7 +407,7 @@ _Bool fi7a(_Bool *c) {
 }
 
 // AMDGCN-LABEL: define hidden zeroext i1 @fi7b(
-// AMDGCN:    [[TMP0:%.*]] = atomicrmw xchg ptr [[PTR0:%.+]], i8 [[VAL0:.+]] syncscope("agent-one-as") monotonic, align 1
+// AMDGCN:    [[TMP0:%.*]] = atomicrmw xchg ptr [[PTR0:%.+]], i8 [[VAL0:.+]] syncscope("agent") monotonic, align 1
 // SPIRV-LABEL: define hidden spir_func zeroext i1 @fi7b(
 // SPIRV:    [[TMP0:%.*]] = atomicrmw xchg ptr [[PTR0:%.+]], i8 [[VAL0:.+]] syncscope("device") monotonic, align 1
 _Bool fi7b(_Bool *c) {
@@ -414,7 +416,7 @@ _Bool fi7b(_Bool *c) {
 }
 
 // AMDGCN-LABEL: define hidden zeroext i1 @fi7c(
-// AMDGCN:    [[TMP0:%.*]] = atomicrmw xchg ptr [[PTR0:%.+]], i8 [[VAL0:.+]] syncscope("workgroup-one-as") monotonic, align 1
+// AMDGCN:    [[TMP0:%.*]] = atomicrmw xchg ptr [[PTR0:%.+]], i8 [[VAL0:.+]] syncscope("workgroup") monotonic, align 1
 // SPIRV-LABEL: define hidden spir_func zeroext i1 @fi7c(
 // SPIRV:    [[TMP0:%.*]] = atomicrmw xchg ptr [[PTR0:%.+]], i8 [[VAL0:.+]] syncscope("workgroup") monotonic, align 1
 _Bool fi7c(_Bool *c) {
@@ -423,7 +425,7 @@ _Bool fi7c(_Bool *c) {
 }
 
 // AMDGCN-LABEL: define hidden zeroext i1 @fi7d(
-// AMDGCN:    [[TMP0:%.*]] = atomicrmw xchg ptr [[PTR0:%.+]], i8 [[VAL0:.+]] syncscope("wavefront-one-as") monotonic, align 1
+// AMDGCN:    [[TMP0:%.*]] = atomicrmw xchg ptr [[PTR0:%.+]], i8 [[VAL0:.+]] syncscope("wavefront") monotonic, align 1
 // SPIRV-LABEL: define hidden spir_func zeroext i1 @fi7d(
 // SPIRV:    [[TMP0:%.*]] = atomicrmw xchg ptr [[PTR0:%.+]], i8 [[VAL0:.+]] syncscope("subgroup") monotonic, align 1
 _Bool fi7d(_Bool *c) {
@@ -432,7 +434,7 @@ _Bool fi7d(_Bool *c) {
 }
 
 // AMDGCN-LABEL: define hidden zeroext i1 @fi7e(
-// AMDGCN:    [[TMP0:%.*]] = atomicrmw xchg ptr [[PTR0:%.+]], i8 [[VAL0:.+]] syncscope("singlethread-one-as") monotonic, align 1
+// AMDGCN:    [[TMP0:%.*]] = atomicrmw xchg ptr [[PTR0:%.+]], i8 [[VAL0:.+]] syncscope("singlethread") monotonic, align 1
 // SPIRV-LABEL: define hidden spir_func zeroext i1 @fi7e(
 // SPIRV:    [[TMP0:%.*]] = atomicrmw xchg ptr [[PTR0:%.+]], i8 [[VAL0:.+]] syncscope("singlethread") monotonic, align 1
 _Bool fi7e(_Bool *c) {
diff --git a/clang/test/CodeGen/scoped-fence-ops.c b/clang/test/CodeGen/scoped-fence-ops.c
index 376cb11..d83ae05 100644
--- a/clang/test/CodeGen/scoped-fence-ops.c
+++ b/clang/test/CodeGen/scoped-fence-ops.c
@@ -1,6 +1,8 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
 // RUN: %clang_cc1 %s -emit-llvm -o - -triple=amdgcn-amd-amdhsa -ffreestanding \
 // RUN:   -fvisibility=hidden | FileCheck --check-prefix=AMDGCN %s
+// RUN: %clang_cc1 %s -emit-llvm -o - -triple=amdgcn-amd-amdhsa -ffreestanding \
+// RUN:   -cl-std=CL2.0 -fvisibility=hidden | FileCheck --check-prefix=AMDGCN %s
 // RUN: %clang_cc1 %s -emit-llvm -o - -triple=spirv64-unknown-unknown -ffreestanding \
 // RUN:   -fvisibility=hidden | FileCheck --check-prefix=SPIRV %s
 // RUN: %clang_cc1 %s -emit-llvm -o - -triple=x86_64-unknown-linux-gnu -ffreestanding \
@@ -9,7 +11,7 @@
 // AMDGCN-LABEL: define hidden void @fe1a(
 // AMDGCN-SAME: ) #[[ATTR0:[0-9]+]] {
 // AMDGCN-NEXT:  [[ENTRY:.*:]]
-// AMDGCN-NEXT:    fence syncscope("workgroup-one-as") release
+// AMDGCN-NEXT:    fence syncscope("workgroup") release
 // AMDGCN-NEXT:    ret void
 //
 // SPIRV-LABEL: define hidden spir_func void @fe1a(
@@ -45,13 +47,13 @@ void fe1a() {
 // AMDGCN:       [[ATOMIC_SCOPE_CONTINUE]]:
 // AMDGCN-NEXT:    ret void
 // AMDGCN:       [[ACQUIRE]]:
-// AMDGCN-NEXT:    fence syncscope("workgroup-one-as") acquire
+// AMDGCN-NEXT:    fence syncscope("workgroup") acquire
 // AMDGCN-NEXT:    br label %[[ATOMIC_SCOPE_CONTINUE]]
 // AMDGCN:       [[RELEASE]]:
-// AMDGCN-NEXT:    fence syncscope("workgroup-one-as") release
+// AMDGCN-NEXT:    fence syncscope("workgroup") release
 // AMDGCN-NEXT:    br label %[[ATOMIC_SCOPE_CONTINUE]]
 // AMDGCN:       [[ACQREL]]:
-// AMDGCN-NEXT:    fence syncscope("workgroup-one-as") acq_rel
+// AMDGCN-NEXT:    fence syncscope("workgroup") acq_rel
 // AMDGCN-NEXT:    br label %[[ATOMIC_SCOPE_CONTINUE]]
 // AMDGCN:       [[SEQCST]]:
 // AMDGCN-NEXT:    fence syncscope("workgroup") seq_cst
@@ -134,19 +136,19 @@ void fe1b(int ord) {
 // AMDGCN:       [[ATOMIC_SCOPE_CONTINUE]]:
 // AMDGCN-NEXT:    ret void
 // AMDGCN:       [[DEVICE_SCOPE]]:
-// AMDGCN-NEXT:    fence syncscope("agent-one-as") release
+// AMDGCN-NEXT:    fence syncscope("agent") release
 // AMDGCN-NEXT:    br label %[[ATOMIC_SCOPE_CONTINUE]]
 // AMDGCN:       [[SYSTEM_SCOPE]]:
-// AMDGCN-NEXT:    fence syncscope("one-as") release
+// AMDGCN-NEXT:    fence release
 // AMDGCN-NEXT:    br label %[[ATOMIC_SCOPE_CONTINUE]]
 // AMDGCN:       [[WORKGROUP_SCOPE]]:
-// AMDGCN-NEXT:    fence syncscope("workgroup-one-as") release
+// AMDGCN-NEXT:    fence syncscope("workgroup") release
 // AMDGCN-NEXT:    br label %[[ATOMIC_SCOPE_CONTINUE]]
 // AMDGCN:       [[WAVEFRONT_SCOPE]]:
-// AMDGCN-NEXT:    fence syncscope("wavefront-one-as") release
+// AMDGCN-NEXT:    fence syncscope("wavefront") release
 // AMDGCN-NEXT:    br label %[[ATOMIC_SCOPE_CONTINUE]]
 // AMDGCN:       [[SINGLE_SCOPE]]:
-// AMDGCN-NEXT:    fence syncscope("singlethread-one-as") release
+// AMDGCN-NEXT:    fence syncscope("singlethread") release
 // AMDGCN-NEXT:    br label %[[ATOMIC_SCOPE_CONTINUE]]
 //
 // SPIRV-LABEL: define hidden spir_func void @fe1c(
@@ -237,7 +239,7 @@ void fe2a() {
 // AMDGCN-LABEL: define hidden void @fe2b(
 // AMDGCN-SAME: ) #[[ATTR0]] {
 // AMDGCN-NEXT:  [[ENTRY:.*:]]
-// AMDGCN-NEXT:    fence syncscope("one-as") release
+// AMDGCN-NEXT:    fence release
 // AMDGCN-NEXT:    ret void
 //
 // SPIRV-LABEL: define hidden spir_func void @fe2b(
diff --git a/clang/test/CodeGen/tbaa-pointers.c b/clang/test/CodeGen/tbaa-pointers.c
index 6aee2ff..4aae255 100644
--- a/clang/test/CodeGen/tbaa-pointers.c
+++ b/clang/test/CodeGen/tbaa-pointers.c
@@ -193,11 +193,10 @@ typedef struct {
 void unamed_struct_typedef(TypedefS *ptr) {
 // COMMON-LABEL: define void @unamed_struct_typedef(
 // COMMON-SAME: ptr noundef [[PTRA:%.+]])
-// COMMON:   [[PTR_ADDR:%.+]]  = alloca ptr, align 8
+// COMMON:        [[PTR_ADDR:%.+]]  = alloca ptr, align 8
 // DISABLE-NEXT:  store ptr [[PTRA]], ptr [[PTR_ADDR]], align 8, !tbaa [[ANYPTR]]
-// DISABLE-NEXT:  [[L0:%.+]] = load ptr, ptr  [[PTR_ADDR]], align 8, !tbaa  [[ANYPTR]]
-// DEFAULT-NEXT:  store ptr [[PTRA]], ptr [[PTR_ADDR]], align 8, !tbaa [[P1TYPEDEF:!.+]]
-// DEFAULT-NEXT:  [[L0:%.+]] = load ptr, ptr [[PTR_ADDR]], align 8, !tbaa  [[P1TYPEDEF]]
+// DEFAULT-NEXT:  store ptr [[PTRA]], ptr [[PTR_ADDR]], align 8, !tbaa [[ANYPTR:!.+]]
+// COMMON-NEXT:   [[L0:%.+]] = load ptr, ptr [[PTR_ADDR]], align 8, !tbaa  [[ANYPTR]]
 // COMMON-NEXT:   [[GEP:%.+]]  = getelementptr inbounds nuw %struct.TypedefS, ptr [[L0]], i32 0, i32 0
 // COMMON-NEXT:   store i32 0, ptr [[GEP]], align 4
 // COMMON-NEXT:   ret void
@@ -205,6 +204,24 @@ void unamed_struct_typedef(TypedefS *ptr) {
   ptr->i1 = 0;
 }
 
+int void_ptrs(void **ptr) {
+// COMMON-LABEL: define i32 @void_ptrs(
+// COMMON-SAME: ptr noundef [[PTRA:%.+]])
+// COMMON:        [[PTR_ADDR:%.+]]  = alloca ptr, align 8
+// DISABLE-NEXT:  store ptr [[PTRA]], ptr [[PTR_ADDR]], align 8, !tbaa [[ANYPTR]]
+// DISABLE-NEXT:  [[L0:%.+]] = load ptr, ptr  [[PTR_ADDR]], align 8, !tbaa  [[ANYPTR]]
+// DISABLE-NEXT:  [[L1:%.+]] = load ptr, ptr [[L0]], align 8, !tbaa  [[ANYPTR]]
+// DEFAULT-NEXT:  store ptr [[PTRA]], ptr [[PTR_ADDR]], align 8, !tbaa [[P2VOID:!.+]]
+// DEFAULT-NEXT:  [[L0:%.+]] = load ptr, ptr  [[PTR_ADDR]], align 8, !tbaa  [[P2VOID]]
+// DEFAULT-NEXT:  [[L1:%.+]] = load ptr, ptr [[L0]], align 8, !tbaa  [[P1VOID:!.+]]
+// COMMON-NEXT:   [[BOOL:%.+]] = icmp ne ptr [[L1]], null
+// COMMON-NEXT:   [[BOOL_EXT:%.+]] = zext i1 [[BOOL]] to i64
+// COMMON-NEXT:   [[COND:%.+]] = select i1 [[BOOL]], i32 0, i32 1
+// COMMON-NEXT:   ret i32 [[COND]]
+
+  return *ptr ? 0 : 1;
+}
+
 // DEFAULT: [[P2INT_0]] = !{[[P2INT:!.+]], [[P2INT]], i64 0}
 // DEFAULT: [[P2INT]] = !{!"p2 int", [[ANY_POINTER:!.+]], i64 0}
 // DISABLE: [[ANYPTR]] = !{[[ANY_POINTER:!.+]], [[ANY_POINTER]], i64 0}
@@ -236,4 +253,8 @@ void unamed_struct_typedef(TypedefS *ptr) {
 // DISABLE: [[S2_TY]]  = !{!"S2", [[ANY_POINTER]], i64 0}
 // COMMON:  [[INT_TAG]] = !{[[INT_TY:!.+]], [[INT_TY]], i64 0}
 // COMMON:  [[INT_TY]] = !{!"int", [[CHAR]], i64 0}
-// DEFAULT: [[P1TYPEDEF]] = !{[[ANY_POINTER]],  [[ANY_POINTER]], i64 0}
+// DEFAULT: [[ANYPTR]] = !{[[ANY_POINTER]],  [[ANY_POINTER]], i64 0}
+// DEFAULT: [[P2VOID]] = !{[[P2VOID_TY:!.+]], [[P2VOID_TY]], i64 0}
+// DEFAULT: [[P2VOID_TY]] = !{!"p2 void", [[ANY_POINTER]], i64 0}
+// DEFAULT: [[P1VOID]] = !{[[P1VOID_TY:!.+]], [[P1VOID_TY]], i64 0}
+// DEFAULT: [[P1VOID_TY]] = !{!"p1 void", [[ANY_POINTER]], i64 0}
diff --git a/clang/test/CodeGen/xcore-abi.c b/clang/test/CodeGen/xcore-abi.c
index bb8d2fe..40e2f41 100644
--- a/clang/test/CodeGen/xcore-abi.c
+++ b/clang/test/CodeGen/xcore-abi.c
@@ -76,7 +76,8 @@ void testva (int n, ...) {
   // CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[V5]], ptr align 4 [[P]], i32 20, i1 false)
   // CHECK: call void @f(ptr noundef [[V5]])
 
-  int* v6 = va_arg (ap, int[4]);  // an unusual aggregate type
+  // an unusual aggregate type
+  int* v6 = va_arg (ap, int[4]);  // expected-warning{{second argument to 'va_arg' is of array type 'int[4]'}}
   f(v6);
   // CHECK: [[I:%[a-z0-9]+]] = load ptr, ptr [[AP]]
   // CHECK: [[P:%[a-z0-9]+]] = load ptr, ptr [[I]]
diff --git a/clang/test/CodeGenCUDA/amdgpu-atomic-ops.cu b/clang/test/CodeGenCUDA/amdgpu-atomic-ops.cu
index 0e5fe8f..47fa396 100644
--- a/clang/test/CodeGenCUDA/amdgpu-atomic-ops.cu
+++ b/clang/test/CodeGenCUDA/amdgpu-atomic-ops.cu
@@ -26,10 +26,10 @@ __global__ void ffp1(float *p) {
   // SAFEIR: atomicrmw fsub ptr {{.*}} monotonic, align 4{{$}}
   // SAFEIR: atomicrmw fmax ptr {{.*}} monotonic, align 4{{$}}
   // SAFEIR: atomicrmw fmin ptr {{.*}} monotonic, align 4{{$}}
-  // SAFEIR: atomicrmw fadd ptr {{.*}} syncscope("agent-one-as") monotonic, align 4, !noalias.addrspace ![[$NO_PRIVATE:[0-9]+]]{{$}}
-  // SAFEIR: atomicrmw fsub ptr {{.*}} syncscope("workgroup-one-as") monotonic, align 4, !noalias.addrspace ![[$NO_PRIVATE]]{{$}}
-  // SAFEIR: atomicrmw fmax ptr {{.*}} syncscope("agent-one-as") monotonic, align 4, !noalias.addrspace ![[$NO_PRIVATE]]{{$}}
-  // SAFEIR: atomicrmw fmin ptr {{.*}} syncscope("workgroup-one-as") monotonic, align 4, !noalias.addrspace ![[$NO_PRIVATE]]{{$}}
+  // SAFEIR: atomicrmw fadd ptr {{.*}} syncscope("agent") monotonic, align 4, !noalias.addrspace ![[$NO_PRIVATE:[0-9]+]]{{$}}
+  // SAFEIR: atomicrmw fsub ptr {{.*}} syncscope("workgroup") monotonic, align 4, !noalias.addrspace ![[$NO_PRIVATE]]{{$}}
+  // SAFEIR: atomicrmw fmax ptr {{.*}} syncscope("agent") monotonic, align 4, !noalias.addrspace ![[$NO_PRIVATE]]{{$}}
+  // SAFEIR: atomicrmw fmin ptr {{.*}} syncscope("workgroup") monotonic, align 4, !noalias.addrspace ![[$NO_PRIVATE]]{{$}}
 
   // UNSAFEIR: atomicrmw fadd ptr {{.*}} monotonic, align 4, !amdgpu.no.fine.grained.memory !{{[0-9]+}}, !amdgpu.ignore.denormal.mode !{{[0-9]+$}}
   // UNSAFEIR: atomicrmw fsub ptr {{.*}} monotonic, align 4, !amdgpu.no.fine.grained.memory !{{[0-9]+$}}
@@ -37,8 +37,8 @@ __global__ void ffp1(float *p) {
   // UNSAFEIR: atomicrmw fmin ptr {{.*}} monotonic, align 4, !amdgpu.no.fine.grained.memory !{{[0-9]+$}}
   // UNSAFEIR: atomicrmw fadd ptr {{.*}} monotonic, align 4, !noalias.addrspace ![[$NO_PRIVATE:[0-9]+]], !amdgpu.no.fine.grained.memory !{{[0-9]+}}, !amdgpu.ignore.denormal.mode !{{[0-9]+$}}
   // UNSAFEIR: atomicrmw fsub ptr {{.*}} monotonic, align 4, !noalias.addrspace ![[$NO_PRIVATE]], !amdgpu.no.fine.grained.memory !{{[0-9]+$}}
-  // UNSAFEIR: atomicrmw fmax ptr {{.*}} syncscope("agent-one-as") monotonic, align 4, !noalias.addrspace ![[$NO_PRIVATE]], !amdgpu.no.fine.grained.memory !{{[0-9]+$}}
-  // UNSAFEIR: atomicrmw fmin ptr {{.*}} syncscope("workgroup-one-as") monotonic, align 4, !noalias.addrspace ![[$NO_PRIVATE]], !amdgpu.no.fine.grained.memory !{{[0-9]+$}}
+  // UNSAFEIR: atomicrmw fmax ptr {{.*}} syncscope("agent") monotonic, align 4, !noalias.addrspace ![[$NO_PRIVATE]], !amdgpu.no.fine.grained.memory !{{[0-9]+$}}
+  // UNSAFEIR: atomicrmw fmin ptr {{.*}} syncscope("workgroup") monotonic, align 4, !noalias.addrspace ![[$NO_PRIVATE]], !amdgpu.no.fine.grained.memory !{{[0-9]+$}}
 
   // SAFE: _Z4ffp1Pf
   // SAFE: global_atomic_cmpswap
@@ -73,19 +73,19 @@ __global__ void ffp2(double *p) {
   // SAFEIR: atomicrmw fsub ptr {{.*}} monotonic, align 8{{$}}
   // SAFEIR: atomicrmw fmax ptr {{.*}} monotonic, align 8{{$}}
   // SAFEIR: atomicrmw fmin ptr {{.*}} monotonic, align 8{{$}}
-  // SAFEIR: atomicrmw fadd ptr {{.*}} syncscope("agent-one-as") monotonic, align 8, !noalias.addrspace ![[$NO_PRIVATE]]{{$}}
-  // SAFEIR: atomicrmw fsub ptr {{.*}} syncscope("workgroup-one-as") monotonic, align 8, !noalias.addrspace ![[$NO_PRIVATE]]{{$}}
-  // SAFEIR: atomicrmw fmax ptr {{.*}} syncscope("agent-one-as") monotonic, align 8, !noalias.addrspace ![[$NO_PRIVATE]]{{$}}
-  // SAFEIR: atomicrmw fmin ptr {{.*}} syncscope("workgroup-one-as") monotonic, align 8, !noalias.addrspace ![[$NO_PRIVATE]]{{$}}
+  // SAFEIR: atomicrmw fadd ptr {{.*}} syncscope("agent") monotonic, align 8, !noalias.addrspace ![[$NO_PRIVATE]]{{$}}
+  // SAFEIR: atomicrmw fsub ptr {{.*}} syncscope("workgroup") monotonic, align 8, !noalias.addrspace ![[$NO_PRIVATE]]{{$}}
+  // SAFEIR: atomicrmw fmax ptr {{.*}} syncscope("agent") monotonic, align 8, !noalias.addrspace ![[$NO_PRIVATE]]{{$}}
+  // SAFEIR: atomicrmw fmin ptr {{.*}} syncscope("workgroup") monotonic, align 8, !noalias.addrspace ![[$NO_PRIVATE]]{{$}}
 
   // UNSAFEIR: atomicrmw fadd ptr {{.*}} monotonic, align 8, !amdgpu.no.fine.grained.memory !{{[0-9]+$}}
   // UNSAFEIR: atomicrmw fsub ptr {{.*}} monotonic, align 8, !amdgpu.no.fine.grained.memory !{{[0-9]+$}}
   // UNSAFEIR: atomicrmw fmax ptr {{.*}} monotonic, align 8, !amdgpu.no.fine.grained.memory !{{[0-9]+$}}
   // UNSAFEIR: atomicrmw fmin ptr {{.*}} monotonic, align 8, !amdgpu.no.fine.grained.memory !{{[0-9]+$}}
-  // UNSAFEIR: atomicrmw fadd ptr {{.*}} syncscope("agent-one-as") monotonic, align 8, !noalias.addrspace ![[$NO_PRIVATE]], !amdgpu.no.fine.grained.memory !{{[0-9]+$}}
-  // UNSAFEIR: atomicrmw fsub ptr {{.*}} syncscope("workgroup-one-as") monotonic, align 8, !noalias.addrspace ![[$NO_PRIVATE]], !amdgpu.no.fine.grained.memory !{{[0-9]+$}}
-  // UNSAFEIR: atomicrmw fmax ptr {{.*}} syncscope("agent-one-as") monotonic, align 8, !noalias.addrspace ![[$NO_PRIVATE]], !amdgpu.no.fine.grained.memory !{{[0-9]+$}}
-  // UNSAFEIR: atomicrmw fmin ptr {{.*}} syncscope("workgroup-one-as") monotonic, align 8, !noalias.addrspace ![[$NO_PRIVATE]], !amdgpu.no.fine.grained.memory !{{[0-9]+$}}
+  // UNSAFEIR: atomicrmw fadd ptr {{.*}} syncscope("agent") monotonic, align 8, !noalias.addrspace ![[$NO_PRIVATE]], !amdgpu.no.fine.grained.memory !{{[0-9]+$}}
+  // UNSAFEIR: atomicrmw fsub ptr {{.*}} syncscope("workgroup") monotonic, align 8, !noalias.addrspace ![[$NO_PRIVATE]], !amdgpu.no.fine.grained.memory !{{[0-9]+$}}
+  // UNSAFEIR: atomicrmw fmax ptr {{.*}} syncscope("agent") monotonic, align 8, !noalias.addrspace ![[$NO_PRIVATE]], !amdgpu.no.fine.grained.memory !{{[0-9]+$}}
+  // UNSAFEIR: atomicrmw fmin ptr {{.*}} syncscope("workgroup") monotonic, align 8, !noalias.addrspace ![[$NO_PRIVATE]], !amdgpu.no.fine.grained.memory !{{[0-9]+$}}
 
   // SAFE-LABEL: @_Z4ffp2Pd
   // SAFE: global_atomic_cmpswap_b64
@@ -119,19 +119,19 @@ __global__ void ffp3(long double *p) {
   // SAFEIR: atomicrmw fsub ptr {{.*}} monotonic, align 8{{$}}
   // SAFEIR: atomicrmw fmax ptr {{.*}} monotonic, align 8{{$}}
   // SAFEIR: atomicrmw fmin ptr {{.*}} monotonic, align 8{{$}}
-  // SAFEIR: atomicrmw fadd ptr {{.*}} syncscope("agent-one-as") monotonic, align 8, !noalias.addrspace ![[$NO_PRIVATE]]{{$}}
-  // SAFEIR: atomicrmw fsub ptr {{.*}} syncscope("workgroup-one-as") monotonic, align 8, !noalias.addrspace ![[$NO_PRIVATE]]{{$}}
-  // SAFEIR: atomicrmw fmax ptr {{.*}} syncscope("agent-one-as") monotonic, align 8, !noalias.addrspace ![[$NO_PRIVATE]]{{$}}
-  // SAFEIR: atomicrmw fmin ptr {{.*}} syncscope("workgroup-one-as") monotonic, align 8, !noalias.addrspace ![[$NO_PRIVATE]]{{$}}
+  // SAFEIR: atomicrmw fadd ptr {{.*}} syncscope("agent") monotonic, align 8, !noalias.addrspace ![[$NO_PRIVATE]]{{$}}
+  // SAFEIR: atomicrmw fsub ptr {{.*}} syncscope("workgroup") monotonic, align 8, !noalias.addrspace ![[$NO_PRIVATE]]{{$}}
+  // SAFEIR: atomicrmw fmax ptr {{.*}} syncscope("agent") monotonic, align 8, !noalias.addrspace ![[$NO_PRIVATE]]{{$}}
+  // SAFEIR: atomicrmw fmin ptr {{.*}} syncscope("workgroup") monotonic, align 8, !noalias.addrspace ![[$NO_PRIVATE]]{{$}}
 
   // UNSAFEIR: atomicrmw fadd ptr {{.*}} monotonic, align 8, !amdgpu.no.fine.grained.memory !{{[0-9]+$}}
   // UNSAFEIR: atomicrmw fsub ptr {{.*}} monotonic, align 8, !amdgpu.no.fine.grained.memory !{{[0-9]+$}}
   // UNSAFEIR: atomicrmw fmax ptr {{.*}} monotonic, align 8, !amdgpu.no.fine.grained.memory !{{[0-9]+$}}
   // UNSAFEIR: atomicrmw fmin ptr {{.*}} monotonic, align 8, !amdgpu.no.fine.grained.memory !{{[0-9]+$}}
-  // UNSAFEIR: atomicrmw fadd ptr {{.*}} syncscope("agent-one-as") monotonic, align 8, !noalias.addrspace ![[$NO_PRIVATE]], !amdgpu.no.fine.grained.memory !{{[0-9]+$}}
-  // UNSAFEIR: atomicrmw fsub ptr {{.*}} syncscope("workgroup-one-as") monotonic, align 8, !noalias.addrspace ![[$NO_PRIVATE]], !amdgpu.no.fine.grained.memory !{{[0-9]+$}}
-  // UNSAFEIR: atomicrmw fmax ptr {{.*}} syncscope("agent-one-as") monotonic, align 8, !noalias.addrspace ![[$NO_PRIVATE]], !amdgpu.no.fine.grained.memory !{{[0-9]+$}}
-  // UNSAFEIR: atomicrmw fmin ptr {{.*}} syncscope("workgroup-one-as") monotonic, align 8, !noalias.addrspace ![[$NO_PRIVATE]], !amdgpu.no.fine.grained.memory !{{[0-9]+$}}
+  // UNSAFEIR: atomicrmw fadd ptr {{.*}} syncscope("agent") monotonic, align 8, !noalias.addrspace ![[$NO_PRIVATE]], !amdgpu.no.fine.grained.memory !{{[0-9]+$}}
+  // UNSAFEIR: atomicrmw fsub ptr {{.*}} syncscope("workgroup") monotonic, align 8, !noalias.addrspace ![[$NO_PRIVATE]], !amdgpu.no.fine.grained.memory !{{[0-9]+$}}
+  // UNSAFEIR: atomicrmw fmax ptr {{.*}} syncscope("agent") monotonic, align 8, !noalias.addrspace ![[$NO_PRIVATE]], !amdgpu.no.fine.grained.memory !{{[0-9]+$}}
+  // UNSAFEIR: atomicrmw fmin ptr {{.*}} syncscope("workgroup") monotonic, align 8, !noalias.addrspace ![[$NO_PRIVATE]], !amdgpu.no.fine.grained.memory !{{[0-9]+$}}
 
   // SAFE-LABEL: @_Z4ffp3Pe
   // SAFE: global_atomic_cmpswap_b64
@@ -185,10 +185,10 @@ __global__ void ffp6(_Float16 *p) {
   // SAFEIR: atomicrmw fsub ptr {{.*}} monotonic, align 2{{$}}
   // SAFEIR: atomicrmw fmax ptr {{.*}} monotonic, align 2{{$}}
   // SAFEIR: atomicrmw fmin ptr {{.*}} monotonic, align 2{{$}}
-  // SAFEIR: atomicrmw fadd ptr {{.*}} syncscope("agent-one-as") monotonic, align 2, !noalias.addrspace ![[$NO_PRIVATE]]{{$}}
-  // SAFEIR: atomicrmw fsub ptr {{.*}} syncscope("workgroup-one-as") monotonic, align 2, !noalias.addrspace ![[$NO_PRIVATE]]{{$}}
-  // SAFEIR: atomicrmw fmax ptr {{.*}} syncscope("agent-one-as") monotonic, align 2, !noalias.addrspace ![[$NO_PRIVATE]]{{$}}
-  // SAFEIR: atomicrmw fmin ptr {{.*}} syncscope("workgroup-one-as") monotonic, align 2, !noalias.addrspace ![[$NO_PRIVATE]]{{$}}
+  // SAFEIR: atomicrmw fadd ptr {{.*}} syncscope("agent") monotonic, align 2, !noalias.addrspace ![[$NO_PRIVATE]]{{$}}
+  // SAFEIR: atomicrmw fsub ptr {{.*}} syncscope("workgroup") monotonic, align 2, !noalias.addrspace ![[$NO_PRIVATE]]{{$}}
+  // SAFEIR: atomicrmw fmax ptr {{.*}} syncscope("agent") monotonic, align 2, !noalias.addrspace ![[$NO_PRIVATE]]{{$}}
+  // SAFEIR: atomicrmw fmin ptr {{.*}} syncscope("workgroup") monotonic, align 2, !noalias.addrspace ![[$NO_PRIVATE]]{{$}}
 
   // UNSAFEIR: atomicrmw fadd ptr {{.*}} monotonic, align 2, !amdgpu.no.fine.grained.memory !{{[0-9]+$}}
   // UNSAFEIR: atomicrmw fsub ptr {{.*}} monotonic, align 2, !amdgpu.no.fine.grained.memory !{{[0-9]+$}}
@@ -196,8 +196,8 @@ __global__ void ffp6(_Float16 *p) {
   // UNSAFEIR: atomicrmw fmin ptr {{.*}} monotonic, align 2, !amdgpu.no.fine.grained.memory !{{[0-9]+$}}
   // UNSAFEIR: atomicrmw fadd ptr {{.*}} monotonic, align 2, !noalias.addrspace ![[$NO_PRIVATE]], !amdgpu.no.fine.grained.memory !{{[0-9]+$}}
   // UNSAFEIR: atomicrmw fsub ptr {{.*}} monotonic, align 2, !noalias.addrspace ![[$NO_PRIVATE]], !amdgpu.no.fine.grained.memory !{{[0-9]+$}}
-  // UNSAFEIR: atomicrmw fmax ptr {{.*}} syncscope("agent-one-as") monotonic, align 2, !noalias.addrspace ![[$NO_PRIVATE]], !amdgpu.no.fine.grained.memory !{{[0-9]+$}}
-  // UNSAFEIR: atomicrmw fmin ptr {{.*}} syncscope("workgroup-one-as") monotonic, align 2, !noalias.addrspace ![[$NO_PRIVATE]], !amdgpu.no.fine.grained.memory !{{[0-9]+$}}
+  // UNSAFEIR: atomicrmw fmax ptr {{.*}} syncscope("agent") monotonic, align 2, !noalias.addrspace ![[$NO_PRIVATE]], !amdgpu.no.fine.grained.memory !{{[0-9]+$}}
+  // UNSAFEIR: atomicrmw fmin ptr {{.*}} syncscope("workgroup") monotonic, align 2, !noalias.addrspace ![[$NO_PRIVATE]], !amdgpu.no.fine.grained.memory !{{[0-9]+$}}
 
   // SAFE: _Z4ffp6PDF16
   // SAFE: global_atomic_cmpswap
@@ -228,8 +228,8 @@ __global__ void ffp6(_Float16 *p) {
 // CHECK-LABEL: @_Z12test_cmpxchgPiii
 // CHECK: cmpxchg ptr %{{.+}}, i32 %{{.+}}, i32 %{{.+}} acquire acquire, align 4{{$}}
 // CHECK: cmpxchg weak ptr %{{.+}}, i32 %{{.+}}, i32 %{{.+}} acquire acquire, align 4{{$}}
-// CHECK: cmpxchg ptr %{{.+}}, i32 %{{.+}}, i32 %{{.+}} syncscope("workgroup-one-as") monotonic monotonic, align 4, !noalias.addrspace ![[$NO_PRIVATE]]{{$}}
-// CHECK: cmpxchg weak ptr %{{.+}}, i32 %{{.+}}, i32 %{{.+}} syncscope("workgroup-one-as") monotonic monotonic, align 4, !noalias.addrspace ![[$NO_PRIVATE]]{{$}}
+// CHECK: cmpxchg ptr %{{.+}}, i32 %{{.+}}, i32 %{{.+}} syncscope("workgroup") monotonic monotonic, align 4, !noalias.addrspace ![[$NO_PRIVATE]]{{$}}
+// CHECK: cmpxchg weak ptr %{{.+}}, i32 %{{.+}}, i32 %{{.+}} syncscope("workgroup") monotonic monotonic, align 4, !noalias.addrspace ![[$NO_PRIVATE]]{{$}}
 __device__ int test_cmpxchg(int *ptr, int cmp, int desired) {
   bool flag = __atomic_compare_exchange(ptr, &cmp, &desired, 0, memory_order_acquire, memory_order_acquire);
   flag = __atomic_compare_exchange_n(ptr, &cmp, desired, 1, memory_order_acquire, memory_order_acquire);
diff --git a/clang/test/CodeGenCUDA/amdgpu-kernel-arg-pointer-type.cu b/clang/test/CodeGenCUDA/amdgpu-kernel-arg-pointer-type.cu
index fab87aa..19730e39 100644
--- a/clang/test/CodeGenCUDA/amdgpu-kernel-arg-pointer-type.cu
+++ b/clang/test/CodeGenCUDA/amdgpu-kernel-arg-pointer-type.cu
@@ -33,7 +33,7 @@
 // CHECK-NEXT:    ret void
 //
 // CHECK-SPIRV-LABEL: define spir_kernel void @_Z7kernel1Pi(
-// CHECK-SPIRV-SAME: ptr addrspace(1) noundef [[X_COERCE:%.*]]) addrspace(4) #[[ATTR0:[0-9]+]] {
+// CHECK-SPIRV-SAME: ptr addrspace(1) noundef [[X_COERCE:%.*]]) addrspace(4) #[[ATTR0:[0-9]+]] !max_work_group_size [[META5:![0-9]+]] {
 // CHECK-SPIRV-NEXT:  [[ENTRY:.*:]]
 // CHECK-SPIRV-NEXT:    [[X:%.*]] = alloca ptr addrspace(4), align 8
 // CHECK-SPIRV-NEXT:    [[X_ADDR:%.*]] = alloca ptr addrspace(4), align 8
@@ -58,7 +58,7 @@
 // OPT-NEXT:    ret void
 //
 // OPT-SPIRV-LABEL: define spir_kernel void @_Z7kernel1Pi(
-// OPT-SPIRV-SAME: ptr addrspace(1) noundef [[X_COERCE:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR0:[0-9]+]] {
+// OPT-SPIRV-SAME: ptr addrspace(1) noundef [[X_COERCE:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR0:[0-9]+]] !max_work_group_size [[META5:![0-9]+]] {
 // OPT-SPIRV-NEXT:  [[ENTRY:.*:]]
 // OPT-SPIRV-NEXT:    [[TMP0:%.*]] = ptrtoint ptr addrspace(1) [[X_COERCE]] to i64
 // OPT-SPIRV-NEXT:    [[TMP1:%.*]] = inttoptr i64 [[TMP0]] to ptr addrspace(4)
@@ -102,7 +102,7 @@ __global__ void kernel1(int *x) {
 // CHECK-NEXT:    ret void
 //
 // CHECK-SPIRV-LABEL: define spir_kernel void @_Z7kernel2Ri(
-// CHECK-SPIRV-SAME: ptr addrspace(1) noundef align 4 dereferenceable(4) [[X_COERCE:%.*]]) addrspace(4) #[[ATTR0]] {
+// CHECK-SPIRV-SAME: ptr addrspace(1) noundef align 4 dereferenceable(4) [[X_COERCE:%.*]]) addrspace(4) #[[ATTR0]] !max_work_group_size [[META5]] {
 // CHECK-SPIRV-NEXT:  [[ENTRY:.*:]]
 // CHECK-SPIRV-NEXT:    [[X:%.*]] = alloca ptr addrspace(4), align 8
 // CHECK-SPIRV-NEXT:    [[X_ADDR:%.*]] = alloca ptr addrspace(4), align 8
@@ -126,7 +126,7 @@ __global__ void kernel1(int *x) {
 // OPT-NEXT:    ret void
 //
 // OPT-SPIRV-LABEL: define spir_kernel void @_Z7kernel2Ri(
-// OPT-SPIRV-SAME: ptr addrspace(1) noundef align 4 dereferenceable(4) [[X_COERCE:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR0]] {
+// OPT-SPIRV-SAME: ptr addrspace(1) noundef align 4 dereferenceable(4) [[X_COERCE:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR0]] !max_work_group_size [[META5]] {
 // OPT-SPIRV-NEXT:  [[ENTRY:.*:]]
 // OPT-SPIRV-NEXT:    [[TMP0:%.*]] = ptrtoint ptr addrspace(1) [[X_COERCE]] to i64
 // OPT-SPIRV-NEXT:    [[TMP1:%.*]] = inttoptr i64 [[TMP0]] to ptr addrspace(4)
@@ -171,7 +171,7 @@ __global__ void kernel2(int &x) {
 // CHECK-NEXT:    ret void
 //
 // CHECK-SPIRV-LABEL: define spir_kernel void @_Z7kernel3PU3AS2iPU3AS1i(
-// CHECK-SPIRV-SAME: ptr addrspace(2) noundef [[X:%.*]], ptr addrspace(1) noundef [[Y:%.*]]) addrspace(4) #[[ATTR0]] {
+// CHECK-SPIRV-SAME: ptr addrspace(2) noundef [[X:%.*]], ptr addrspace(1) noundef [[Y:%.*]]) addrspace(4) #[[ATTR0]] !max_work_group_size [[META5]] {
 // CHECK-SPIRV-NEXT:  [[ENTRY:.*:]]
 // CHECK-SPIRV-NEXT:    [[X_ADDR:%.*]] = alloca ptr addrspace(2), align 8
 // CHECK-SPIRV-NEXT:    [[Y_ADDR:%.*]] = alloca ptr addrspace(1), align 8
@@ -195,7 +195,7 @@ __global__ void kernel2(int &x) {
 // OPT-NEXT:    ret void
 //
 // OPT-SPIRV-LABEL: define spir_kernel void @_Z7kernel3PU3AS2iPU3AS1i(
-// OPT-SPIRV-SAME: ptr addrspace(2) nocapture noundef readonly [[X:%.*]], ptr addrspace(1) nocapture noundef writeonly initializes((0, 4)) [[Y:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR1:[0-9]+]] {
+// OPT-SPIRV-SAME: ptr addrspace(2) nocapture noundef readonly [[X:%.*]], ptr addrspace(1) nocapture noundef writeonly initializes((0, 4)) [[Y:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR1:[0-9]+]] !max_work_group_size [[META5]] {
 // OPT-SPIRV-NEXT:  [[ENTRY:.*:]]
 // OPT-SPIRV-NEXT:    [[TMP0:%.*]] = load i32, ptr addrspace(2) [[X]], align 4
 // OPT-SPIRV-NEXT:    store i32 [[TMP0]], ptr addrspace(1) [[Y]], align 4
@@ -302,7 +302,7 @@ struct S {
 // CHECK-NEXT:    ret void
 //
 // CHECK-SPIRV-LABEL: define spir_kernel void @_Z7kernel41S(
-// CHECK-SPIRV-SAME: [[STRUCT_S:%.*]] [[S_COERCE:%.*]]) addrspace(4) #[[ATTR0]] {
+// CHECK-SPIRV-SAME: [[STRUCT_S:%.*]] [[S_COERCE:%.*]]) addrspace(4) #[[ATTR0]] !max_work_group_size [[META5]] {
 // CHECK-SPIRV-NEXT:  [[ENTRY:.*:]]
 // CHECK-SPIRV-NEXT:    [[S:%.*]] = alloca [[STRUCT_S]], align 8
 // CHECK-SPIRV-NEXT:    [[S1:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(4)
@@ -343,7 +343,7 @@ struct S {
 // OPT-NEXT:    ret void
 //
 // OPT-SPIRV-LABEL: define spir_kernel void @_Z7kernel41S(
-// OPT-SPIRV-SAME: [[STRUCT_S:%.*]] [[S_COERCE:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR0]] {
+// OPT-SPIRV-SAME: [[STRUCT_S:%.*]] [[S_COERCE:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR0]] !max_work_group_size [[META5]] {
 // OPT-SPIRV-NEXT:  [[ENTRY:.*:]]
 // OPT-SPIRV-NEXT:    [[TMP0:%.*]] = extractvalue [[STRUCT_S]] [[S_COERCE]], 0
 // OPT-SPIRV-NEXT:    [[TMP1:%.*]] = extractvalue [[STRUCT_S]] [[S_COERCE]], 1
@@ -406,7 +406,7 @@ __global__ void kernel4(struct S s) {
 // CHECK-NEXT:    ret void
 //
 // CHECK-SPIRV-LABEL: define spir_kernel void @_Z7kernel5P1S(
-// CHECK-SPIRV-SAME: ptr addrspace(1) noundef [[S_COERCE:%.*]]) addrspace(4) #[[ATTR0]] {
+// CHECK-SPIRV-SAME: ptr addrspace(1) noundef [[S_COERCE:%.*]]) addrspace(4) #[[ATTR0]] !max_work_group_size [[META5]] {
 // CHECK-SPIRV-NEXT:  [[ENTRY:.*:]]
 // CHECK-SPIRV-NEXT:    [[S:%.*]] = alloca ptr addrspace(4), align 8
 // CHECK-SPIRV-NEXT:    [[S_ADDR:%.*]] = alloca ptr addrspace(4), align 8
@@ -432,7 +432,7 @@ __global__ void kernel4(struct S s) {
 // CHECK-SPIRV-NEXT:    ret void
 //
 // OPT-LABEL: define dso_local amdgpu_kernel void @_Z7kernel5P1S(
-// OPT-SAME: ptr addrspace(1) nocapture noundef readonly [[S_COERCE:%.*]]) local_unnamed_addr #[[ATTR3:[0-9]+]] {
+// OPT-SAME: ptr addrspace(1) nocapture noundef readonly [[S_COERCE:%.*]]) local_unnamed_addr #[[ATTR2]] {
 // OPT-NEXT:  [[ENTRY:.*:]]
 // OPT-NEXT:    [[TMP0:%.*]] = load ptr, ptr addrspace(1) [[S_COERCE]], align 8
 // OPT-NEXT:    [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4
@@ -446,7 +446,7 @@ __global__ void kernel4(struct S s) {
 // OPT-NEXT:    ret void
 //
 // OPT-SPIRV-LABEL: define spir_kernel void @_Z7kernel5P1S(
-// OPT-SPIRV-SAME: ptr addrspace(1) noundef [[S_COERCE:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR0]] {
+// OPT-SPIRV-SAME: ptr addrspace(1) noundef [[S_COERCE:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR0]] !max_work_group_size [[META5]] {
 // OPT-SPIRV-NEXT:  [[ENTRY:.*:]]
 // OPT-SPIRV-NEXT:    [[TMP0:%.*]] = ptrtoint ptr addrspace(1) [[S_COERCE]] to i64
 // OPT-SPIRV-NEXT:    [[TMP1:%.*]] = inttoptr i64 [[TMP0]] to ptr addrspace(4)
@@ -511,7 +511,7 @@ struct T {
 // CHECK-NEXT:    ret void
 //
 // CHECK-SPIRV-LABEL: define spir_kernel void @_Z7kernel61T(
-// CHECK-SPIRV-SAME: [[STRUCT_T:%.*]] [[T_COERCE:%.*]]) addrspace(4) #[[ATTR0]] {
+// CHECK-SPIRV-SAME: [[STRUCT_T:%.*]] [[T_COERCE:%.*]]) addrspace(4) #[[ATTR0]] !max_work_group_size [[META5]] {
 // CHECK-SPIRV-NEXT:  [[ENTRY:.*:]]
 // CHECK-SPIRV-NEXT:    [[T:%.*]] = alloca [[STRUCT_T]], align 8
 // CHECK-SPIRV-NEXT:    [[T1:%.*]] = addrspacecast ptr [[T]] to ptr addrspace(4)
@@ -551,7 +551,7 @@ struct T {
 // OPT-NEXT:    ret void
 //
 // OPT-SPIRV-LABEL: define spir_kernel void @_Z7kernel61T(
-// OPT-SPIRV-SAME: [[STRUCT_T:%.*]] [[T_COERCE:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR0]] {
+// OPT-SPIRV-SAME: [[STRUCT_T:%.*]] [[T_COERCE:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR0]] !max_work_group_size [[META5]] {
 // OPT-SPIRV-NEXT:  [[ENTRY:.*:]]
 // OPT-SPIRV-NEXT:    [[TMP0:%.*]] = extractvalue [[STRUCT_T]] [[T_COERCE]], 0
 // OPT-SPIRV-NEXT:    [[DOTFCA_0_EXTRACT:%.*]] = extractvalue [2 x ptr addrspace(4)] [[TMP0]], 0
@@ -606,7 +606,7 @@ __global__ void kernel6(struct T t) {
 // CHECK-NEXT:    ret void
 //
 // CHECK-SPIRV-LABEL: define spir_kernel void @_Z7kernel7Pi(
-// CHECK-SPIRV-SAME: ptr addrspace(1) noalias noundef [[X_COERCE:%.*]]) addrspace(4) #[[ATTR0]] {
+// CHECK-SPIRV-SAME: ptr addrspace(1) noalias noundef [[X_COERCE:%.*]]) addrspace(4) #[[ATTR0]] !max_work_group_size [[META5]] {
 // CHECK-SPIRV-NEXT:  [[ENTRY:.*:]]
 // CHECK-SPIRV-NEXT:    [[X:%.*]] = alloca ptr addrspace(4), align 8
 // CHECK-SPIRV-NEXT:    [[X_ADDR:%.*]] = alloca ptr addrspace(4), align 8
@@ -631,7 +631,7 @@ __global__ void kernel6(struct T t) {
 // OPT-NEXT:    ret void
 //
 // OPT-SPIRV-LABEL: define spir_kernel void @_Z7kernel7Pi(
-// OPT-SPIRV-SAME: ptr addrspace(1) noalias noundef [[X_COERCE:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR0]] {
+// OPT-SPIRV-SAME: ptr addrspace(1) noalias noundef [[X_COERCE:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR0]] !max_work_group_size [[META5]] {
 // OPT-SPIRV-NEXT:  [[ENTRY:.*:]]
 // OPT-SPIRV-NEXT:    [[TMP0:%.*]] = ptrtoint ptr addrspace(1) [[X_COERCE]] to i64
 // OPT-SPIRV-NEXT:    [[TMP1:%.*]] = inttoptr i64 [[TMP0]] to ptr addrspace(4)
@@ -677,7 +677,7 @@ struct SS {
 // CHECK-NEXT:    ret void
 //
 // CHECK-SPIRV-LABEL: define spir_kernel void @_Z7kernel82SS(
-// CHECK-SPIRV-SAME: [[STRUCT_SS:%.*]] [[A_COERCE:%.*]]) addrspace(4) #[[ATTR0]] {
+// CHECK-SPIRV-SAME: [[STRUCT_SS:%.*]] [[A_COERCE:%.*]]) addrspace(4) #[[ATTR0]] !max_work_group_size [[META5]] {
 // CHECK-SPIRV-NEXT:  [[ENTRY:.*:]]
 // CHECK-SPIRV-NEXT:    [[A:%.*]] = alloca [[STRUCT_SS]], align 8
 // CHECK-SPIRV-NEXT:    [[A1:%.*]] = addrspacecast ptr [[A]] to ptr addrspace(4)
@@ -700,7 +700,7 @@ struct SS {
 // OPT-NEXT:    ret void
 //
 // OPT-SPIRV-LABEL: define spir_kernel void @_Z7kernel82SS(
-// OPT-SPIRV-SAME: [[STRUCT_SS:%.*]] [[A_COERCE:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR0]] {
+// OPT-SPIRV-SAME: [[STRUCT_SS:%.*]] [[A_COERCE:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR0]] !max_work_group_size [[META5]] {
 // OPT-SPIRV-NEXT:  [[ENTRY:.*:]]
 // OPT-SPIRV-NEXT:    [[TMP0:%.*]] = extractvalue [[STRUCT_SS]] [[A_COERCE]], 0
 // OPT-SPIRV-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(4) [[TMP0]], align 4
@@ -727,5 +727,9 @@ __global__ void kernel8(struct SS a) {
   *a.x += 3.f;
 }
 //.
+// CHECK-SPIRV: [[META5]] = !{i32 1024, i32 1, i32 1}
+//.
 // OPT: [[META4]] = !{}
 //.
+// OPT-SPIRV: [[META5]] = !{i32 1024, i32 1, i32 1}
+//.
diff --git a/clang/test/CodeGenCUDA/amdgpu-kernel-attrs.cu b/clang/test/CodeGenCUDA/amdgpu-kernel-attrs.cu
index 11a133f..253ac08 100644
--- a/clang/test/CodeGenCUDA/amdgpu-kernel-attrs.cu
+++ b/clang/test/CodeGenCUDA/amdgpu-kernel-attrs.cu
@@ -4,6 +4,9 @@
 // RUN: %clang_cc1 -triple amdgcn-amd-amdhsa --gpu-max-threads-per-block=1024 \
 // RUN:     -fcuda-is-device -emit-llvm -o - -x hip %s \
 // RUN:     | FileCheck -check-prefixes=CHECK,MAX1024 %s
+// RUN: %clang_cc1 -triple spirv64-amd-amdhsa --gpu-max-threads-per-block=1024 \
+// RUN:     -fcuda-is-device -emit-llvm -o - -x hip %s \
+// RUN:     | FileCheck -check-prefixes=CHECK-SPIRV,MAX1024-SPIRV %s
 // RUN: %clang_cc1 -triple nvptx \
 // RUN:     -fcuda-is-device -emit-llvm -o - %s | FileCheck %s \
 // RUN:     -check-prefix=NAMD
@@ -21,12 +24,14 @@
 
 __global__ void flat_work_group_size_default() {
 // CHECK: define{{.*}} amdgpu_kernel void @_Z28flat_work_group_size_defaultv() [[FLAT_WORK_GROUP_SIZE_DEFAULT:#[0-9]+]]
+// CHECK-SPIRV: define{{.*}} spir_kernel void @_Z28flat_work_group_size_defaultv(){{.*}} !max_work_group_size [[MAX_WORK_GROUP_SIZE_DEFAULT:![0-9]+]]
 // NOUB: define{{.*}} void @_Z28flat_work_group_size_defaultv() [[NOUB:#[0-9]+]]
 }
 
 __attribute__((amdgpu_flat_work_group_size(32, 64))) // expected-no-diagnostics
 __global__ void flat_work_group_size_32_64() {
 // CHECK: define{{.*}} amdgpu_kernel void @_Z26flat_work_group_size_32_64v() [[FLAT_WORK_GROUP_SIZE_32_64:#[0-9]+]]
+// CHECK-SPIRV: define{{.*}} spir_kernel void @_Z26flat_work_group_size_32_64v(){{.*}} !max_work_group_size [[MAX_WORK_GROUP_SIZE_64:![0-9]+]]
 }
 __attribute__((amdgpu_waves_per_eu(2))) // expected-no-diagnostics
 __global__ void waves_per_eu_2() {
@@ -82,7 +87,9 @@ template __global__ void template_32_4_a_max_num_work_groups<2>();
 
 // DEFAULT-DAG: attributes [[FLAT_WORK_GROUP_SIZE_DEFAULT]] = {{.*}}"amdgpu-flat-work-group-size"="1,1024"{{.*}}"uniform-work-group-size"="true"
 // MAX1024-DAG: attributes [[FLAT_WORK_GROUP_SIZE_DEFAULT]] = {{.*}}"amdgpu-flat-work-group-size"="1,1024"
+// MAX1024-SPIRV-DAG: [[MAX_WORK_GROUP_SIZE_DEFAULT]] = !{i32 1024, i32 1, i32 1}
 // CHECK-DAG: attributes [[FLAT_WORK_GROUP_SIZE_32_64]] = {{.*}}"amdgpu-flat-work-group-size"="32,64"
+// CHECK-SPIRV-DAG: [[MAX_WORK_GROUP_SIZE_64]] = !{i32 64, i32 1, i32 1}
 // CHECK-DAG: attributes [[WAVES_PER_EU_2]] = {{.*}}"amdgpu-waves-per-eu"="2"
 // CHECK-DAG: attributes [[NUM_SGPR_32]] = {{.*}}"amdgpu-num-sgpr"="32"
 // CHECK-DAG: attributes [[NUM_VGPR_64]] = {{.*}}"amdgpu-num-vgpr"="64"
diff --git a/clang/test/CodeGenCUDA/atomic-ops.cu b/clang/test/CodeGenCUDA/atomic-ops.cu
index 1accd17..d8489b4 100644
--- a/clang/test/CodeGenCUDA/atomic-ops.cu
+++ b/clang/test/CodeGenCUDA/atomic-ops.cu
@@ -2,18 +2,18 @@
 #include "Inputs/cuda.h"
 
 // CHECK-LABEL: @_Z24atomic32_op_singlethreadPiii
-// CHECK: cmpxchg ptr {{%[0-9]+}}, i32 {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("singlethread-one-as") monotonic monotonic, align 4, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK:[0-9]+]]{{$}}
-// CHECK: cmpxchg weak ptr {{%[0-9]+}}, i32 {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("singlethread-one-as") monotonic monotonic, align 4, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}}
-// CHECK: atomicrmw xchg ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("singlethread-one-as") monotonic, align 4, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}}
-// CHECK: atomicrmw add ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("singlethread-one-as") monotonic, align 4, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}}
-// CHECK: atomicrmw sub ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("singlethread-one-as") monotonic, align 4, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}}
-// CHECK: atomicrmw and ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("singlethread-one-as") monotonic, align 4, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}}
-// CHECK: atomicrmw or ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("singlethread-one-as") monotonic, align 4, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}}
-// CHECK: atomicrmw xor ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("singlethread-one-as") monotonic, align 4, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}}
-// CHECK: atomicrmw min ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("singlethread-one-as") monotonic, align 4, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}}
-// CHECK: atomicrmw max ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("singlethread-one-as") monotonic, align 4, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}}
-// CHECK: load atomic i32, ptr {{%[0-9]+}} syncscope("singlethread-one-as") monotonic, align 4{{$}}
-// CHECK: store atomic i32 %{{.*}}, ptr %{{.*}} syncscope("singlethread-one-as") monotonic, align 4{{$}}
+// CHECK: cmpxchg ptr {{%[0-9]+}}, i32 {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("singlethread") monotonic monotonic, align 4, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK:[0-9]+]]{{$}}
+// CHECK: cmpxchg weak ptr {{%[0-9]+}}, i32 {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("singlethread") monotonic monotonic, align 4, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}}
+// CHECK: atomicrmw xchg ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("singlethread") monotonic, align 4, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}}
+// CHECK: atomicrmw add ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("singlethread") monotonic, align 4, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}}
+// CHECK: atomicrmw sub ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("singlethread") monotonic, align 4, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}}
+// CHECK: atomicrmw and ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("singlethread") monotonic, align 4, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}}
+// CHECK: atomicrmw or ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("singlethread") monotonic, align 4, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}}
+// CHECK: atomicrmw xor ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("singlethread") monotonic, align 4, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}}
+// CHECK: atomicrmw min ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("singlethread") monotonic, align 4, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}}
+// CHECK: atomicrmw max ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("singlethread") monotonic, align 4, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}}
+// CHECK: load atomic i32, ptr {{%[0-9]+}} syncscope("singlethread") monotonic, align 4{{$}}
+// CHECK: store atomic i32 %{{.*}}, ptr %{{.*}} syncscope("singlethread") monotonic, align 4{{$}}
 __device__ int atomic32_op_singlethread(int *ptr, int val, int desired) {
   bool flag = __hip_atomic_compare_exchange_strong(ptr, &val, desired, __ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SINGLETHREAD);
   flag = __hip_atomic_compare_exchange_weak(ptr, &val, desired, __ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SINGLETHREAD);
@@ -31,8 +31,8 @@ __device__ int atomic32_op_singlethread(int *ptr, int val, int desired) {
 }
 
 // CHECK-LABEL: @_Z25atomicu32_op_singlethreadPjjj
-// CHECK: atomicrmw umin ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("singlethread-one-as") monotonic, align 4, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}}
-// CHECK: atomicrmw umax ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("singlethread-one-as") monotonic, align 4, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}}
+// CHECK: atomicrmw umin ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("singlethread") monotonic, align 4, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}}
+// CHECK: atomicrmw umax ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("singlethread") monotonic, align 4, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}}
 __device__ unsigned int atomicu32_op_singlethread(unsigned int *ptr, unsigned int val, unsigned int desired) {
   val = __hip_atomic_fetch_min(ptr, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SINGLETHREAD);
   val = __hip_atomic_fetch_max(ptr, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SINGLETHREAD);
@@ -40,18 +40,18 @@ __device__ unsigned int atomicu32_op_singlethread(unsigned int *ptr, unsigned in
 }
 
 // CHECK-LABEL: @_Z21atomic32_op_wavefrontPiii
-// CHECK: cmpxchg ptr {{%[0-9]+}}, i32 {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("wavefront-one-as") monotonic monotonic, align 4, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}}
-// CHECK: cmpxchg weak ptr {{%[0-9]+}}, i32 {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("wavefront-one-as") monotonic monotonic, align 4, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}}
-// CHECK: atomicrmw xchg ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("wavefront-one-as") monotonic, align 4, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}}
-// CHECK: atomicrmw add ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("wavefront-one-as") monotonic, align 4, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}}
-// CHECK: atomicrmw sub ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("wavefront-one-as") monotonic, align 4, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}}
-// CHECK: atomicrmw and ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("wavefront-one-as") monotonic, align 4, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}}
-// CHECK: atomicrmw or ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("wavefront-one-as") monotonic, align 4, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}}
-// CHECK: atomicrmw xor ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("wavefront-one-as") monotonic, align 4, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}}
-// CHECK: atomicrmw min ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("wavefront-one-as") monotonic, align 4, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}}
-// CHECK: atomicrmw max ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("wavefront-one-as") monotonic, align 4, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}}
-// CHECK: load atomic i32, ptr {{%[0-9]+}} syncscope("wavefront-one-as") monotonic, align 4{{$}}
-// CHECK: store atomic i32 %{{.*}}, ptr %{{.*}} syncscope("wavefront-one-as") monotonic, align 4{{$}}
+// CHECK: cmpxchg ptr {{%[0-9]+}}, i32 {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("wavefront") monotonic monotonic, align 4, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}}
+// CHECK: cmpxchg weak ptr {{%[0-9]+}}, i32 {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("wavefront") monotonic monotonic, align 4, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}}
+// CHECK: atomicrmw xchg ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("wavefront") monotonic, align 4, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}}
+// CHECK: atomicrmw add ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("wavefront") monotonic, align 4, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}}
+// CHECK: atomicrmw sub ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("wavefront") monotonic, align 4, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}}
+// CHECK: atomicrmw and ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("wavefront") monotonic, align 4, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}}
+// CHECK: atomicrmw or ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("wavefront") monotonic, align 4, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}}
+// CHECK: atomicrmw xor ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("wavefront") monotonic, align 4, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}}
+// CHECK: atomicrmw min ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("wavefront") monotonic, align 4, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}}
+// CHECK: atomicrmw max ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("wavefront") monotonic, align 4, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}}
+// CHECK: load atomic i32, ptr {{%[0-9]+}} syncscope("wavefront") monotonic, align 4{{$}}
+// CHECK: store atomic i32 %{{.*}}, ptr %{{.*}} syncscope("wavefront") monotonic, align 4{{$}}
 __device__ int atomic32_op_wavefront(int *ptr, int val, int desired) {
   bool flag = __hip_atomic_compare_exchange_strong(ptr, &val, desired, __ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_WAVEFRONT);
   flag = __hip_atomic_compare_exchange_weak(ptr, &val, desired, __ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_WAVEFRONT);
@@ -69,8 +69,8 @@ __device__ int atomic32_op_wavefront(int *ptr, int val, int desired) {
 }
 
 // CHECK-LABEL: @_Z22atomicu32_op_wavefrontPjjj
-// CHECK: atomicrmw umin ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("wavefront-one-as") monotonic, align 4, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}}
-// CHECK: atomicrmw umax ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("wavefront-one-as") monotonic, align 4, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}}
+// CHECK: atomicrmw umin ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("wavefront") monotonic, align 4, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}}
+// CHECK: atomicrmw umax ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("wavefront") monotonic, align 4, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}}
 __device__ unsigned int atomicu32_op_wavefront(unsigned int *ptr, unsigned int val, unsigned int desired) {
   val = __hip_atomic_fetch_min(ptr, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_WAVEFRONT);
   val = __hip_atomic_fetch_max(ptr, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_WAVEFRONT);
@@ -78,17 +78,17 @@ __device__ unsigned int atomicu32_op_wavefront(unsigned int *ptr, unsigned int v
 }
 
 // CHECK-LABEL: @_Z21atomic32_op_workgroupPiii
-// CHECK: cmpxchg ptr {{%[0-9]+}}, i32 {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("workgroup-one-as") monotonic monotonic, align 4, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}}
-// CHECK: cmpxchg weak ptr {{%[0-9]+}}, i32 {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("workgroup-one-as") monotonic monotonic, align 4, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}}
-// CHECK: atomicrmw xchg ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("workgroup-one-as") monotonic, align 4, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}}
-// CHECK: atomicrmw add ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("workgroup-one-as") monotonic, align 4, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}}
-// CHECK: atomicrmw sub ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("workgroup-one-as") monotonic, align 4, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}}
-// CHECK: atomicrmw and ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("workgroup-one-as") monotonic, align 4, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}}
-// CHECK: atomicrmw or ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("workgroup-one-as") monotonic, align 4, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}}
-// CHECK: atomicrmw xor ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("workgroup-one-as") monotonic, align 4, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}}
-// CHECK: atomicrmw min ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("workgroup-one-as") monotonic, align 4, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}}
-// CHECK: atomicrmw max ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("workgroup-one-as") monotonic, align 4, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}}
-// CHECK: store atomic i32 %{{.*}}, ptr %{{.*}} syncscope("workgroup-one-as") monotonic, align 4{{$}}
+// CHECK: cmpxchg ptr {{%[0-9]+}}, i32 {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("workgroup") monotonic monotonic, align 4, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}}
+// CHECK: cmpxchg weak ptr {{%[0-9]+}}, i32 {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("workgroup") monotonic monotonic, align 4, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}}
+// CHECK: atomicrmw xchg ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("workgroup") monotonic, align 4, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}}
+// CHECK: atomicrmw add ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("workgroup") monotonic, align 4, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}}
+// CHECK: atomicrmw sub ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("workgroup") monotonic, align 4, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}}
+// CHECK: atomicrmw and ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("workgroup") monotonic, align 4, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}}
+// CHECK: atomicrmw or ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("workgroup") monotonic, align 4, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}}
+// CHECK: atomicrmw xor ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("workgroup") monotonic, align 4, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}}
+// CHECK: atomicrmw min ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("workgroup") monotonic, align 4, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}}
+// CHECK: atomicrmw max ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("workgroup") monotonic, align 4, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}}
+// CHECK: store atomic i32 %{{.*}}, ptr %{{.*}} syncscope("workgroup") monotonic, align 4{{$}}
 __device__ int atomic32_op_workgroup(int *ptr, int val, int desired) {
   bool flag = __hip_atomic_compare_exchange_strong(ptr, &val, desired, __ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_WORKGROUP);
   flag = __hip_atomic_compare_exchange_weak(ptr, &val, desired, __ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_WORKGROUP);
@@ -105,8 +105,8 @@ __device__ int atomic32_op_workgroup(int *ptr, int val, int desired) {
 }
 
 // CHECK-LABEL: @_Z22atomicu32_op_workgroupPjjj
-// CHECK: atomicrmw umin ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("workgroup-one-as") monotonic, align 4, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}}
-// CHECK: atomicrmw umax ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("workgroup-one-as") monotonic, align 4, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}}
+// CHECK: atomicrmw umin ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("workgroup") monotonic, align 4, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}}
+// CHECK: atomicrmw umax ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("workgroup") monotonic, align 4, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}}
 __device__ unsigned int atomicu32_op_workgroup(unsigned int *ptr, unsigned int val, unsigned int desired) {
   val = __hip_atomic_fetch_min(ptr, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_WORKGROUP);
   val = __hip_atomic_fetch_max(ptr, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_WORKGROUP);
@@ -114,17 +114,17 @@ __device__ unsigned int atomicu32_op_workgroup(unsigned int *ptr, unsigned int v
 }
 
 // CHECK-LABEL: @_Z17atomic32_op_agentPiii
-// CHECK: cmpxchg ptr {{%[0-9]+}}, i32 {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("agent-one-as") monotonic monotonic, align 4, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}}
-// CHECK: cmpxchg weak ptr {{%[0-9]+}}, i32 {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("agent-one-as") monotonic monotonic, align 4, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}}
-// CHECK: atomicrmw xchg ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("agent-one-as") monotonic, align 4, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}}
-// CHECK: atomicrmw add ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("agent-one-as") monotonic, align 4, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}}
-// CHECK: atomicrmw sub ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("agent-one-as") monotonic, align 4, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}}
-// CHECK: atomicrmw and ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("agent-one-as") monotonic, align 4, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}}
-// CHECK: atomicrmw or ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("agent-one-as") monotonic, align 4, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}}
-// CHECK: atomicrmw xor ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("agent-one-as") monotonic, align 4, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}}
-// CHECK: atomicrmw min ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("agent-one-as") monotonic, align 4, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}}
-// CHECK: atomicrmw max ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("agent-one-as") monotonic, align 4, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}}
-// CHECK: store atomic i32 %{{.*}}, ptr %{{.*}} syncscope("agent-one-as") monotonic, align 4{{$}}
+// CHECK: cmpxchg ptr {{%[0-9]+}}, i32 {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("agent") monotonic monotonic, align 4, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}}
+// CHECK: cmpxchg weak ptr {{%[0-9]+}}, i32 {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("agent") monotonic monotonic, align 4, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}}
+// CHECK: atomicrmw xchg ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("agent") monotonic, align 4, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}}
+// CHECK: atomicrmw add ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("agent") monotonic, align 4, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}}
+// CHECK: atomicrmw sub ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("agent") monotonic, align 4, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}}
+// CHECK: atomicrmw and ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("agent") monotonic, align 4, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}}
+// CHECK: atomicrmw or ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("agent") monotonic, align 4, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}}
+// CHECK: atomicrmw xor ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("agent") monotonic, align 4, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}}
+// CHECK: atomicrmw min ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("agent") monotonic, align 4, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}}
+// CHECK: atomicrmw max ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("agent") monotonic, align 4, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}}
+// CHECK: store atomic i32 %{{.*}}, ptr %{{.*}} syncscope("agent") monotonic, align 4{{$}}
 __device__ int atomic32_op_agent(int *ptr, int val, int desired) {
   bool flag = __hip_atomic_compare_exchange_strong(ptr, &val, desired, __ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
   flag = __hip_atomic_compare_exchange_weak(ptr, &val, desired, __ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
@@ -141,8 +141,8 @@ __device__ int atomic32_op_agent(int *ptr, int val, int desired) {
 }
 
 // CHECK-LABEL: @_Z18atomicu32_op_agentPjjj
-// CHECK: atomicrmw umin ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("agent-one-as") monotonic, align 4, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}}
-// CHECK: atomicrmw umax ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("agent-one-as") monotonic, align 4, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}}
+// CHECK: atomicrmw umin ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("agent") monotonic, align 4, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}}
+// CHECK: atomicrmw umax ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("agent") monotonic, align 4, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}}
 __device__ unsigned int atomicu32_op_agent(unsigned int *ptr, unsigned int val, unsigned int desired) {
   val = __hip_atomic_fetch_min(ptr, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
   val = __hip_atomic_fetch_max(ptr, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
@@ -150,18 +150,18 @@ __device__ unsigned int atomicu32_op_agent(unsigned int *ptr, unsigned int val,
 }
 
 // CHECK-LABEL: @_Z18atomic32_op_systemPiii
-// CHECK: cmpxchg ptr {{%[0-9]+}}, i32 {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("one-as") monotonic monotonic, align 4, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}}
-// CHECK: cmpxchg weak ptr {{%[0-9]+}}, i32 {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("one-as") monotonic monotonic, align 4, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}}
-// CHECK: atomicrmw xchg ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("one-as") monotonic, align 4, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}}
-// CHECK: atomicrmw add ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("one-as") monotonic, align 4, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}}
-// CHECK: atomicrmw sub ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("one-as") monotonic, align 4, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}}
-// CHECK: atomicrmw and ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("one-as") monotonic, align 4, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}}
-// CHECK: atomicrmw or ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("one-as") monotonic, align 4, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}}
-// CHECK: atomicrmw xor ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("one-as") monotonic, align 4, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}}
-// CHECK: atomicrmw min ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("one-as") monotonic, align 4, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}}
-// CHECK: atomicrmw max ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("one-as") monotonic, align 4, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}}
+// CHECK: cmpxchg ptr {{%[0-9]+}}, i32 {{%[0-9]+}}, i32 {{%[0-9]+}} monotonic monotonic, align 4, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}}
+// CHECK: cmpxchg weak ptr {{%[0-9]+}}, i32 {{%[0-9]+}}, i32 {{%[0-9]+}} monotonic monotonic, align 4, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}}
+// CHECK: atomicrmw xchg ptr {{%[0-9]+}}, i32 {{%[0-9]+}} monotonic, align 4, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}}
+// CHECK: atomicrmw add ptr {{%[0-9]+}}, i32 {{%[0-9]+}} monotonic, align 4, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}}
+// CHECK: atomicrmw sub ptr {{%[0-9]+}}, i32 {{%[0-9]+}} monotonic, align 4, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}}
+// CHECK: atomicrmw and ptr {{%[0-9]+}}, i32 {{%[0-9]+}} monotonic, align 4, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}}
+// CHECK: atomicrmw or ptr {{%[0-9]+}}, i32 {{%[0-9]+}} monotonic, align 4, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}}
+// CHECK: atomicrmw xor ptr {{%[0-9]+}}, i32 {{%[0-9]+}} monotonic, align 4, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}}
+// CHECK: atomicrmw min ptr {{%[0-9]+}}, i32 {{%[0-9]+}} monotonic, align 4, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}}
+// CHECK: atomicrmw max ptr {{%[0-9]+}}, i32 {{%[0-9]+}} monotonic, align 4, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}}
 // CHECK: load i32, ptr %{{.*}}, align 4{{$}}
-// CHECK: store atomic i32 %{{.*}}, ptr %{{.*}} syncscope("one-as") monotonic, align 4{{$}}
+// CHECK: store atomic i32 %{{.*}}, ptr %{{.*}} monotonic, align 4{{$}}
 __device__ int atomic32_op_system(int *ptr, int val, int desired) {
   bool flag = __hip_atomic_compare_exchange_strong(ptr, &val, desired, __ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
   flag = __hip_atomic_compare_exchange_weak(ptr, &val, desired, __ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
@@ -179,8 +179,8 @@ __device__ int atomic32_op_system(int *ptr, int val, int desired) {
 }
 
 // CHECK-LABEL: @_Z19atomicu32_op_systemPjjj
-// CHECK: atomicrmw umin ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("one-as") monotonic, align 4, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}}
-// CHECK: atomicrmw umax ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("one-as") monotonic, align 4, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}}
+// CHECK: atomicrmw umin ptr {{%[0-9]+}}, i32 {{%[0-9]+}} monotonic, align 4, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}}
+// CHECK: atomicrmw umax ptr {{%[0-9]+}}, i32 {{%[0-9]+}} monotonic, align 4, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}}
 __device__ unsigned int atomicu32_op_system(unsigned int *ptr, unsigned int val, unsigned int desired) {
   val = __hip_atomic_fetch_min(ptr, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
   val = __hip_atomic_fetch_max(ptr, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
@@ -188,17 +188,17 @@ __device__ unsigned int atomicu32_op_system(unsigned int *ptr, unsigned int val,
 }
 
 // CHECK-LABEL: @_Z24atomic64_op_singlethreadPxS_xx
-// CHECK: cmpxchg ptr {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("singlethread-one-as") monotonic monotonic, align 8, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}}
-// CHECK: cmpxchg weak ptr {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("singlethread-one-as") monotonic monotonic, align 8, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}}
-// CHECK: atomicrmw xchg ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("singlethread-one-as") monotonic, align 8, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}}
-// CHECK: atomicrmw add ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("singlethread-one-as") monotonic, align 8, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}}
-// CHECK: atomicrmw sub ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("singlethread-one-as") monotonic, align 8, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}}
-// CHECK: atomicrmw and ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("singlethread-one-as") monotonic, align 8, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}}
-// CHECK: atomicrmw or ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("singlethread-one-as") monotonic, align 8, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}}
-// CHECK: atomicrmw xor ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("singlethread-one-as") monotonic, align 8, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}}
-// CHECK: atomicrmw min ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("singlethread-one-as") monotonic, align 8, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}}
-// CHECK: atomicrmw max ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("singlethread-one-as") monotonic, align 8, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}}
-// CHECK: store atomic i64 %{{.*}}, ptr %{{.*}} syncscope("singlethread-one-as") monotonic, align 8{{$}}
+// CHECK: cmpxchg ptr {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("singlethread") monotonic monotonic, align 8, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}}
+// CHECK: cmpxchg weak ptr {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("singlethread") monotonic monotonic, align 8, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}}
+// CHECK: atomicrmw xchg ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("singlethread") monotonic, align 8, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}}
+// CHECK: atomicrmw add ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("singlethread") monotonic, align 8, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}}
+// CHECK: atomicrmw sub ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("singlethread") monotonic, align 8, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}}
+// CHECK: atomicrmw and ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("singlethread") monotonic, align 8, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}}
+// CHECK: atomicrmw or ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("singlethread") monotonic, align 8, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}}
+// CHECK: atomicrmw xor ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("singlethread") monotonic, align 8, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}}
+// CHECK: atomicrmw min ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("singlethread") monotonic, align 8, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}}
+// CHECK: atomicrmw max ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("singlethread") monotonic, align 8, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}}
+// CHECK: store atomic i64 %{{.*}}, ptr %{{.*}} syncscope("singlethread") monotonic, align 8{{$}}
 __device__ long long atomic64_op_singlethread(long long *ptr, long long *ptr2, long long val, long long desired) {
   bool flag = __hip_atomic_compare_exchange_strong(ptr, &val, desired, __ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SINGLETHREAD);
   flag = __hip_atomic_compare_exchange_weak(ptr, &val, desired, __ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SINGLETHREAD);
@@ -215,10 +215,10 @@ __device__ long long atomic64_op_singlethread(long long *ptr, long long *ptr2, l
 }
 
 // CHECK-LABEL: @_Z25atomicu64_op_singlethreadPyS_yy
-// CHECK: atomicrmw umin ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("singlethread-one-as") monotonic, align 8, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}}
-// CHECK: atomicrmw umax ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("singlethread-one-as") monotonic, align 8, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}}
-// CHECK: load atomic i64, ptr %{{.*}} syncscope("singlethread-one-as") monotonic, align 8{{$}}
-// CHECK: store atomic i64 %{{.*}}, ptr %{{.*}} syncscope("singlethread-one-as") monotonic, align 8{{$}}
+// CHECK: atomicrmw umin ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("singlethread") monotonic, align 8, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}}
+// CHECK: atomicrmw umax ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("singlethread") monotonic, align 8, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}}
+// CHECK: load atomic i64, ptr %{{.*}} syncscope("singlethread") monotonic, align 8{{$}}
+// CHECK: store atomic i64 %{{.*}}, ptr %{{.*}} syncscope("singlethread") monotonic, align 8{{$}}
 __device__ unsigned long long atomicu64_op_singlethread(unsigned long long *ptr, unsigned long long *ptr2, unsigned long long val, unsigned long long desired) {
   val = __hip_atomic_fetch_min(ptr, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SINGLETHREAD);
   val = __hip_atomic_fetch_max(ptr, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SINGLETHREAD);
@@ -228,18 +228,18 @@ __device__ unsigned long long atomicu64_op_singlethread(unsigned long long *ptr,
 }
 
 // CHECK-LABEL: @_Z21atomic64_op_wavefrontPxS_xx
-// CHECK: cmpxchg ptr {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("wavefront-one-as") monotonic monotonic, align 8, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}}
-// CHECK: cmpxchg weak ptr {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("wavefront-one-as") monotonic monotonic, align 8, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}}
-// CHECK: atomicrmw xchg ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("wavefront-one-as") monotonic, align 8, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}}
-// CHECK: atomicrmw add ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("wavefront-one-as") monotonic, align 8, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}}
-// CHECK: atomicrmw sub ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("wavefront-one-as") monotonic, align 8, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}}
-// CHECK: atomicrmw and ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("wavefront-one-as") monotonic, align 8, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}}
-// CHECK: atomicrmw or ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("wavefront-one-as") monotonic, align 8, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}}
-// CHECK: atomicrmw xor ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("wavefront-one-as") monotonic, align 8, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}}
-// CHECK: atomicrmw min ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("wavefront-one-as") monotonic, align 8, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}}
-// CHECK: atomicrmw max ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("wavefront-one-as") monotonic, align 8, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}}
-// CHECK: load atomic i64, ptr {{%[0-9]+}} syncscope("wavefront-one-as") monotonic, align 8{{$}}
-// CHECK: store atomic i64 %{{.*}}, ptr %{{.*}} syncscope("wavefront-one-as") monotonic, align 8{{$}}
+// CHECK: cmpxchg ptr {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("wavefront") monotonic monotonic, align 8, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}}
+// CHECK: cmpxchg weak ptr {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("wavefront") monotonic monotonic, align 8, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}}
+// CHECK: atomicrmw xchg ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("wavefront") monotonic, align 8, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}}
+// CHECK: atomicrmw add ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("wavefront") monotonic, align 8, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}}
+// CHECK: atomicrmw sub ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("wavefront") monotonic, align 8, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}}
+// CHECK: atomicrmw and ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("wavefront") monotonic, align 8, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}}
+// CHECK: atomicrmw or ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("wavefront") monotonic, align 8, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}}
+// CHECK: atomicrmw xor ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("wavefront") monotonic, align 8, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}}
+// CHECK: atomicrmw min ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("wavefront") monotonic, align 8, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}}
+// CHECK: atomicrmw max ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("wavefront") monotonic, align 8, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}}
+// CHECK: load atomic i64, ptr {{%[0-9]+}} syncscope("wavefront") monotonic, align 8{{$}}
+// CHECK: store atomic i64 %{{.*}}, ptr %{{.*}} syncscope("wavefront") monotonic, align 8{{$}}
 __device__ long long atomic64_op_wavefront(long long *ptr, long long *ptr2, long long val, long long desired) {
   bool flag = __hip_atomic_compare_exchange_strong(ptr, &val, desired, __ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_WAVEFRONT);
   flag = __hip_atomic_compare_exchange_weak(ptr, &val, desired, __ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_WAVEFRONT);
@@ -257,10 +257,10 @@ __device__ long long atomic64_op_wavefront(long long *ptr, long long *ptr2, long
 }
 
 // CHECK-LABEL: @_Z22atomicu64_op_wavefrontPyS_yy
-// CHECK: atomicrmw umin ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("wavefront-one-as") monotonic, align 8, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}}
-// CHECK: atomicrmw umax ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("wavefront-one-as") monotonic, align 8, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}}
-// CHECK: load atomic i64, ptr {{%[0-9]+}} syncscope("wavefront-one-as") monotonic, align 8{{$}}
-// CHECK: store atomic i64 %{{.*}}, ptr %{{.*}} syncscope("wavefront-one-as") monotonic, align 8{{$}}
+// CHECK: atomicrmw umin ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("wavefront") monotonic, align 8, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}}
+// CHECK: atomicrmw umax ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("wavefront") monotonic, align 8, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}}
+// CHECK: load atomic i64, ptr {{%[0-9]+}} syncscope("wavefront") monotonic, align 8{{$}}
+// CHECK: store atomic i64 %{{.*}}, ptr %{{.*}} syncscope("wavefront") monotonic, align 8{{$}}
 __device__ unsigned long long atomicu64_op_wavefront(unsigned long long *ptr, unsigned long long *ptr2, unsigned long long val, unsigned long long desired) {
   val = __hip_atomic_fetch_min(ptr, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_WAVEFRONT);
   val = __hip_atomic_fetch_max(ptr, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_WAVEFRONT);
@@ -270,17 +270,17 @@ __device__ unsigned long long atomicu64_op_wavefront(unsigned long long *ptr, un
 }
 
 // CHECK-LABEL: @_Z21atomic64_op_workgroupPxS_xx
-// CHECK: cmpxchg ptr {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("workgroup-one-as") monotonic monotonic, align 8, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}}
-// CHECK: cmpxchg weak ptr {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("workgroup-one-as") monotonic monotonic, align 8, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}}
-// CHECK: atomicrmw xchg ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("workgroup-one-as") monotonic, align 8, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}}
-// CHECK: atomicrmw add ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("workgroup-one-as") monotonic, align 8, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}}
-// CHECK: atomicrmw sub ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("workgroup-one-as") monotonic, align 8, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}}
-// CHECK: atomicrmw and ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("workgroup-one-as") monotonic, align 8, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}}
-// CHECK: atomicrmw or ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("workgroup-one-as") monotonic, align 8, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}}
-// CHECK: atomicrmw xor ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("workgroup-one-as") monotonic, align 8, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}}
-// CHECK: atomicrmw min ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("workgroup-one-as") monotonic, align 8, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}}
-// CHECK: atomicrmw max ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("workgroup-one-as") monotonic, align 8, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}}
-// CHECK: store atomic i64 %{{.*}}, ptr %{{.*}} syncscope("workgroup-one-as") monotonic, align 8{{$}}
+// CHECK: cmpxchg ptr {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("workgroup") monotonic monotonic, align 8, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}}
+// CHECK: cmpxchg weak ptr {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("workgroup") monotonic monotonic, align 8, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}}
+// CHECK: atomicrmw xchg ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("workgroup") monotonic, align 8, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}}
+// CHECK: atomicrmw add ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("workgroup") monotonic, align 8, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}}
+// CHECK: atomicrmw sub ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("workgroup") monotonic, align 8, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}}
+// CHECK: atomicrmw and ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("workgroup") monotonic, align 8, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}}
+// CHECK: atomicrmw or ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("workgroup") monotonic, align 8, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}}
+// CHECK: atomicrmw xor ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("workgroup") monotonic, align 8, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}}
+// CHECK: atomicrmw min ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("workgroup") monotonic, align 8, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}}
+// CHECK: atomicrmw max ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("workgroup") monotonic, align 8, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}}
+// CHECK: store atomic i64 %{{.*}}, ptr %{{.*}} syncscope("workgroup") monotonic, align 8{{$}}
 __device__ long long atomic64_op_workgroup(long long *ptr, long long *ptr2, long long val, long long desired) {
   bool flag = __hip_atomic_compare_exchange_strong(ptr, &val, desired, __ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_WORKGROUP);
   flag = __hip_atomic_compare_exchange_weak(ptr, &val, desired, __ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_WORKGROUP);
@@ -297,9 +297,9 @@ __device__ long long atomic64_op_workgroup(long long *ptr, long long *ptr2, long
 }
 
 // CHECK-LABEL: @_Z22atomicu64_op_workgroupPyS_yy
-// CHECK: atomicrmw umin ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("workgroup-one-as") monotonic, align 8, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}}
-// CHECK: atomicrmw umax ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("workgroup-one-as") monotonic, align 8, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}}
-// CHECK: store atomic i64 %{{.*}}, ptr %{{.*}} syncscope("workgroup-one-as") monotonic, align 8{{$}}
+// CHECK: atomicrmw umin ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("workgroup") monotonic, align 8, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}}
+// CHECK: atomicrmw umax ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("workgroup") monotonic, align 8, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}}
+// CHECK: store atomic i64 %{{.*}}, ptr %{{.*}} syncscope("workgroup") monotonic, align 8{{$}}
 __device__ unsigned long long atomicu64_op_workgroup(unsigned long long *ptr, unsigned long long *ptr2, unsigned long long val, unsigned long long desired) {
   val = __hip_atomic_fetch_min(ptr, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_WORKGROUP);
   val = __hip_atomic_fetch_max(ptr, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_WORKGROUP);
@@ -308,17 +308,17 @@ __device__ unsigned long long atomicu64_op_workgroup(unsigned long long *ptr, un
 }
 
 // CHECK-LABEL: @_Z17atomic64_op_agentPxS_xx
-// CHECK: cmpxchg ptr {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("agent-one-as") monotonic monotonic, align 8, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}}
-// CHECK: cmpxchg weak ptr {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("agent-one-as") monotonic monotonic, align 8, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}}
-// CHECK: atomicrmw xchg ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("agent-one-as") monotonic, align 8, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}}
-// CHECK: atomicrmw add ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("agent-one-as") monotonic, align 8, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}}
-// CHECK: atomicrmw sub ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("agent-one-as") monotonic, align 8, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}}
-// CHECK: atomicrmw and ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("agent-one-as") monotonic, align 8, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}}
-// CHECK: atomicrmw or ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("agent-one-as") monotonic, align 8, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}}
-// CHECK: atomicrmw xor ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("agent-one-as") monotonic, align 8, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}}
-// CHECK: atomicrmw min ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("agent-one-as") monotonic, align 8, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}}
-// CHECK: atomicrmw max ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("agent-one-as") monotonic, align 8, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}}
-// CHECK: store atomic i64 %{{.*}}, ptr %{{.*}} syncscope("agent-one-as") monotonic, align 8{{$}}
+// CHECK: cmpxchg ptr {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("agent") monotonic monotonic, align 8, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}}
+// CHECK: cmpxchg weak ptr {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("agent") monotonic monotonic, align 8, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}}
+// CHECK: atomicrmw xchg ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("agent") monotonic, align 8, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}}
+// CHECK: atomicrmw add ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("agent") monotonic, align 8, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}}
+// CHECK: atomicrmw sub ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("agent") monotonic, align 8, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}}
+// CHECK: atomicrmw and ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("agent") monotonic, align 8, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}}
+// CHECK: atomicrmw or ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("agent") monotonic, align 8, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}}
+// CHECK: atomicrmw xor ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("agent") monotonic, align 8, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}}
+// CHECK: atomicrmw min ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("agent") monotonic, align 8, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}}
+// CHECK: atomicrmw max ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("agent") monotonic, align 8, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}}
+// CHECK: store atomic i64 %{{.*}}, ptr %{{.*}} syncscope("agent") monotonic, align 8{{$}}
 __device__ long long atomic64_op_agent(long long *ptr, long long *ptr2, long long val, long long desired) {
   bool flag = __hip_atomic_compare_exchange_strong(ptr, &val, desired, __ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
   flag = __hip_atomic_compare_exchange_weak(ptr, &val, desired, __ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
@@ -335,9 +335,9 @@ __device__ long long atomic64_op_agent(long long *ptr, long long *ptr2, long lon
 }
 
 // CHECK-LABEL: @_Z18atomicu64_op_agentPyS_yy
-// CHECK: atomicrmw umin ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("agent-one-as") monotonic, align 8, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}}
-// CHECK: atomicrmw umax ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("agent-one-as") monotonic, align 8, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}}
-// CHECK: store atomic i64 %{{.*}}, ptr %{{.*}} syncscope("agent-one-as") monotonic, align 8{{$}}
+// CHECK: atomicrmw umin ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("agent") monotonic, align 8, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}}
+// CHECK: atomicrmw umax ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("agent") monotonic, align 8, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}}
+// CHECK: store atomic i64 %{{.*}}, ptr %{{.*}} syncscope("agent") monotonic, align 8{{$}}
 __device__ unsigned long long atomicu64_op_agent(unsigned long long *ptr, unsigned long long *ptr2, unsigned long long val, unsigned long long desired) {
   val = __hip_atomic_fetch_min(ptr, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
   val = __hip_atomic_fetch_max(ptr, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
@@ -346,18 +346,18 @@ __device__ unsigned long long atomicu64_op_agent(unsigned long long *ptr, unsign
 }
 
 // CHECK-LABEL: @_Z18atomic64_op_systemPxS_xx
-// CHECK: cmpxchg ptr {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("one-as") monotonic monotonic, align 8, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}}
-// CHECK: cmpxchg weak ptr {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("one-as") monotonic monotonic, align 8, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}}
-// CHECK: atomicrmw xchg ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("one-as") monotonic, align 8, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}}
-// CHECK: atomicrmw add ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("one-as") monotonic, align 8, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}}
-// CHECK: atomicrmw sub ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("one-as") monotonic, align 8, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}}
-// CHECK: atomicrmw and ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("one-as") monotonic, align 8, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}}
-// CHECK: atomicrmw or ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("one-as") monotonic, align 8, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}}
-// CHECK: atomicrmw xor ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("one-as") monotonic, align 8, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}}
-// CHECK: atomicrmw min ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("one-as") monotonic, align 8, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}}
-// CHECK: atomicrmw max ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("one-as") monotonic, align 8, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}}
+// CHECK: cmpxchg ptr {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}} monotonic monotonic, align 8, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}}
+// CHECK: cmpxchg weak ptr {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}} monotonic monotonic, align 8, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}}
+// CHECK: atomicrmw xchg ptr {{%[0-9]+}}, i64 {{%[0-9]+}} monotonic, align 8, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}}
+// CHECK: atomicrmw add ptr {{%[0-9]+}}, i64 {{%[0-9]+}} monotonic, align 8, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}}
+// CHECK: atomicrmw sub ptr {{%[0-9]+}}, i64 {{%[0-9]+}} monotonic, align 8, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}}
+// CHECK: atomicrmw and ptr {{%[0-9]+}}, i64 {{%[0-9]+}} monotonic, align 8, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}}
+// CHECK: atomicrmw or ptr {{%[0-9]+}}, i64 {{%[0-9]+}} monotonic, align 8, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}}
+// CHECK: atomicrmw xor ptr {{%[0-9]+}}, i64 {{%[0-9]+}} monotonic, align 8, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}}
+// CHECK: atomicrmw min ptr {{%[0-9]+}}, i64 {{%[0-9]+}} monotonic, align 8, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}}
+// CHECK: atomicrmw max ptr {{%[0-9]+}}, i64 {{%[0-9]+}} monotonic, align 8, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}}
 // CHECK: load i64, ptr %{{.*}}, align 8
-// CHECK: store atomic i64 %{{.*}}, ptr %{{.*}} syncscope("one-as") monotonic, align 8{{$}}
+// CHECK: store atomic i64 %{{.*}}, ptr %{{.*}} monotonic, align 8{{$}}
 __device__ long long atomic64_op_system(long long *ptr, long long *ptr2, long long val, long long desired) {
   bool flag = __hip_atomic_compare_exchange_strong(ptr, &val, desired, __ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
   flag = __hip_atomic_compare_exchange_weak(ptr, &val, desired, __ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
@@ -375,10 +375,10 @@ __device__ long long atomic64_op_system(long long *ptr, long long *ptr2, long lo
 }
 
 // CHECK-LABEL: @_Z19atomicu64_op_systemPyS_yy
-// CHECK: atomicrmw umin ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("one-as") monotonic, align 8, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}}
-// CHECK: atomicrmw umax ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("one-as") monotonic, align 8, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}}
+// CHECK: atomicrmw umin ptr {{%[0-9]+}}, i64 {{%[0-9]+}} monotonic, align 8, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}}
+// CHECK: atomicrmw umax ptr {{%[0-9]+}}, i64 {{%[0-9]+}} monotonic, align 8, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}}
 // CHECK: load i64, ptr %{{.*}}, align 8
-// CHECK: store atomic i64 %{{.*}}, ptr %{{.*}} syncscope("one-as") monotonic, align 8{{$}}
+// CHECK: store atomic i64 %{{.*}}, ptr %{{.*}} monotonic, align 8{{$}}
 __device__ unsigned long long atomicu64_op_system(unsigned long long *ptr, unsigned long long *ptr2, unsigned long long val, unsigned long long desired) {
   val = __hip_atomic_fetch_min(ptr, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
   val = __hip_atomic_fetch_max(ptr, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
diff --git a/clang/test/CodeGenCUDA/device-fun-linkage.cu b/clang/test/CodeGenCUDA/device-fun-linkage.cu
index 54899e0..bdac62d 100644
--- a/clang/test/CodeGenCUDA/device-fun-linkage.cu
+++ b/clang/test/CodeGenCUDA/device-fun-linkage.cu
@@ -17,8 +17,8 @@ template __device__ void func<int>();
 // RDC:       define weak_odr void @_Z4funcIiEvv()
 
 template __global__ void kernel<int>();
-// NORDC:     define void @_Z6kernelIiEvv()
-// RDC:       define weak_odr void @_Z6kernelIiEvv()
+// NORDC:     define ptx_kernel void @_Z6kernelIiEvv()
+// RDC:       define weak_odr ptx_kernel void @_Z6kernelIiEvv()
 
 // Ensure that unused static device function is eliminated
 static __device__ void static_func() {}
@@ -28,5 +28,5 @@ static __device__ void static_func() {}
 // Ensure that kernel function has external or weak_odr
 // linkage regardless static specifier
 static __global__ void static_kernel() {}
-// NORDC:     define void @_ZL13static_kernelv()
-// RDC:       define weak_odr void @_ZL13static_kernelv[[FILEID:.*]]()
+// NORDC:     define ptx_kernel void @_ZL13static_kernelv()
+// RDC:       define weak_odr ptx_kernel void @_ZL13static_kernelv[[FILEID:.*]]()
diff --git a/clang/test/CodeGenCUDA/grid-constant.cu b/clang/test/CodeGenCUDA/grid-constant.cu
index 8d4be9c..e7000ca 100644
--- a/clang/test/CodeGenCUDA/grid-constant.cu
+++ b/clang/test/CodeGenCUDA/grid-constant.cu
@@ -21,11 +21,11 @@ void foo() {
 }
 //.
 //.
-// CHECK: [[META0:![0-9]+]] = !{ptr @_Z6kernel1Sii, !"kernel", i32 1, !"grid_constant", [[META1:![0-9]+]]}
+// CHECK: [[META0:![0-9]+]] = !{ptr @_Z6kernel1Sii, !"grid_constant", [[META1:![0-9]+]]}
 // CHECK: [[META1]] = !{i32 1, i32 3}
-// CHECK: [[META2:![0-9]+]] = !{ptr @_Z13tkernel_constIK1SEvT_, !"kernel", i32 1, !"grid_constant", [[META3:![0-9]+]]}
+// CHECK: [[META2:![0-9]+]] = !{ptr @_Z13tkernel_constIK1SEvT_, !"grid_constant", [[META3:![0-9]+]]}
 // CHECK: [[META3]] = !{i32 1}
-// CHECK: [[META4:![0-9]+]] = !{ptr @_Z13tkernel_constI1SEvT_, !"kernel", i32 1, !"grid_constant", [[META3]]}
-// CHECK: [[META5:![0-9]+]] = !{ptr @_Z7tkernelIK1SEviT_, !"kernel", i32 1, !"grid_constant", [[META6:![0-9]+]]}
+// CHECK: [[META4:![0-9]+]] = !{ptr @_Z13tkernel_constI1SEvT_, !"grid_constant", [[META3]]}
+// CHECK: [[META5:![0-9]+]] = !{ptr @_Z7tkernelIK1SEviT_, !"grid_constant", [[META6:![0-9]+]]}
 // CHECK: [[META6]] = !{i32 2}
 //.
diff --git a/clang/test/CodeGenCUDA/offload_via_llvm.cu b/clang/test/CodeGenCUDA/offload_via_llvm.cu
index 434eba9..62942d8 100644
--- a/clang/test/CodeGenCUDA/offload_via_llvm.cu
+++ b/clang/test/CodeGenCUDA/offload_via_llvm.cu
@@ -7,7 +7,7 @@
 #define __OFFLOAD_VIA_LLVM__ 1
 #include "Inputs/cuda.h"
 
-// HST-LABEL: define dso_local void @_Z18__device_stub__fooisPvS_(
+// HST-LABEL: define dso_local ptx_kernel void @_Z18__device_stub__fooisPvS_(
 // HST-SAME: i32 noundef [[TMP0:%.*]], i16 noundef signext [[TMP1:%.*]], ptr noundef [[TMP2:%.*]], ptr noundef [[TMP3:%.*]]) #[[ATTR0:[0-9]+]] {
 // HST-NEXT:  [[ENTRY:.*:]]
 // HST-NEXT:    [[DOTADDR:%.*]] = alloca i32, align 4
@@ -50,7 +50,7 @@
 // HST:       [[SETUP_END]]:
 // HST-NEXT:    ret void
 //
-// DEV-LABEL: define dso_local void @_Z3fooisPvS_(
+// DEV-LABEL: define dso_local ptx_kernel void @_Z3fooisPvS_(
 // DEV-SAME: i32 noundef [[TMP0:%.*]], i16 noundef signext [[TMP1:%.*]], ptr noundef [[TMP2:%.*]], ptr noundef [[TMP3:%.*]]) #[[ATTR0:[0-9]+]] {
 // DEV-NEXT:  [[ENTRY:.*:]]
 // DEV-NEXT:    [[DOTADDR:%.*]] = alloca i32, align 4
diff --git a/clang/test/CodeGenCUDA/ptx-kernels.cu b/clang/test/CodeGenCUDA/ptx-kernels.cu
index b7172b7..a7d5e11 100644
--- a/clang/test/CodeGenCUDA/ptx-kernels.cu
+++ b/clang/test/CodeGenCUDA/ptx-kernels.cu
@@ -10,7 +10,7 @@
 extern "C"
 __device__ void device_function() {}
 
-// CHECK-LABEL: define{{.*}} void @global_function
+// CHECK-LABEL: define{{.*}} ptx_kernel void @global_function
 extern "C"
 __global__ void global_function() {
   // CHECK: call void @device_function
@@ -19,7 +19,7 @@ __global__ void global_function() {
 
 // Make sure host-instantiated kernels are preserved on device side.
 template <typename T> __global__ void templated_kernel(T param) {}
-// CHECK-DAG: define{{.*}} void @_Z16templated_kernelIiEvT_(
+// CHECK-DAG: define{{.*}} ptx_kernel void @_Z16templated_kernelIiEvT_(
 
 namespace {
 __global__ void anonymous_ns_kernel() {}
@@ -30,6 +30,3 @@ void host_function() {
   templated_kernel<<<0, 0>>>(0);
   anonymous_ns_kernel<<<0,0>>>();
 }
-
-// CHECK: !{{[0-9]+}} = !{ptr @global_function, !"kernel", i32 1}
-// CHECK: !{{[0-9]+}} = !{ptr @_Z16templated_kernelIiEvT_, !"kernel", i32 1}
diff --git a/clang/test/CodeGenCUDA/usual-deallocators.cu b/clang/test/CodeGenCUDA/usual-deallocators.cu
index b85a706..64560ef 100644
--- a/clang/test/CodeGenCUDA/usual-deallocators.cu
+++ b/clang/test/CodeGenCUDA/usual-deallocators.cu
@@ -109,7 +109,7 @@ __host__ __device__ void tests_hd(void *t) {
 }
 
 // Make sure that we've generated the kernel used by A::~A.
-// DEVICE-LABEL: define void @_Z1fIiEvT_
+// DEVICE-LABEL: define ptx_kernel void @_Z1fIiEvT_
 
 // Make sure we've picked deallocator for the correct side of compilation.
 
@@ -147,5 +147,3 @@ __host__ __device__ void tests_hd(void *t) {
 // COMMON-LABEL: define  linkonce_odr void @_ZN8H1H2D1D2dlEPv(ptr noundef %0)
 // DEVICE: call void @dev_fn()
 // HOST: call void @host_fn()
-
-// DEVICE: !0 = !{ptr @_Z1fIiEvT_, !"kernel", i32 1}
diff --git a/clang/test/CodeGenCXX/matrix-vector-bit-int.cpp b/clang/test/CodeGenCXX/matrix-vector-bit-int.cpp
index 040615f..ffbce9f 100644
--- a/clang/test/CodeGenCXX/matrix-vector-bit-int.cpp
+++ b/clang/test/CodeGenCXX/matrix-vector-bit-int.cpp
@@ -1,3 +1,4 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
 // RUN: %clang_cc1 -fenable-matrix %s -emit-llvm -triple x86_64-unknown-linux -disable-llvm-passes -o - -std=c++11 | FileCheck %s
 
 using i8x3 = _BitInt(8) __attribute__((ext_vector_type(3)));
@@ -7,92 +8,104 @@ using i32x3x3 = _BitInt(32) __attribute__((matrix_type(3, 3)));
 using i512x3 = _BitInt(512) __attribute__((ext_vector_type(3)));
 using i512x3x3 = _BitInt(512) __attribute__((matrix_type(3, 3)));
 
-// CHECK-LABEL: define dso_local i32 @_Z2v1Dv3_DB8_(i32 %a.coerce)
+// CHECK-LABEL: define dso_local i32 @_Z2v1Dv3_DB8_(
+// CHECK-SAME: i32 [[A_COERCE:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca <3 x i8>, align 4
+// CHECK-NEXT:    [[A:%.*]] = alloca <3 x i8>, align 4
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <3 x i8>, align 4
+// CHECK-NEXT:    store i32 [[A_COERCE]], ptr [[A]], align 4
+// CHECK-NEXT:    [[LOADVEC4:%.*]] = load <4 x i8>, ptr [[A]], align 4
+// CHECK-NEXT:    [[A1:%.*]] = shufflevector <4 x i8> [[LOADVEC4]], <4 x i8> poison, <3 x i32> <i32 0, i32 1, i32 2>
+// CHECK-NEXT:    [[EXTRACTVEC:%.*]] = shufflevector <3 x i8> [[A1]], <3 x i8> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
+// CHECK-NEXT:    store <4 x i8> [[EXTRACTVEC]], ptr [[A_ADDR]], align 4
+// CHECK-NEXT:    [[LOADVEC42:%.*]] = load <4 x i8>, ptr [[A_ADDR]], align 4
+// CHECK-NEXT:    [[EXTRACTVEC3:%.*]] = shufflevector <4 x i8> [[LOADVEC42]], <4 x i8> poison, <3 x i32> <i32 0, i32 1, i32 2>
+// CHECK-NEXT:    [[LOADVEC44:%.*]] = load <4 x i8>, ptr [[A_ADDR]], align 4
+// CHECK-NEXT:    [[EXTRACTVEC5:%.*]] = shufflevector <4 x i8> [[LOADVEC44]], <4 x i8> poison, <3 x i32> <i32 0, i32 1, i32 2>
+// CHECK-NEXT:    [[ADD:%.*]] = add <3 x i8> [[EXTRACTVEC3]], [[EXTRACTVEC5]]
+// CHECK-NEXT:    store <3 x i8> [[ADD]], ptr [[RETVAL]], align 4
+// CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[RETVAL]], align 4
+// CHECK-NEXT:    ret i32 [[TMP0]]
+//
 i8x3 v1(i8x3 a) {
-  // CHECK-NEXT: entry:
-  // CHECK-NEXT:   %retval = alloca <3 x i8>, align 4
-  // CHECK-NEXT:   %a = alloca <3 x i8>, align 4
-  // CHECK-NEXT:   %a.addr = alloca <3 x i8>, align 4
-  // CHECK-NEXT:   store i32 %a.coerce, ptr %a, align 4
-  // CHECK-NEXT:   %loadVec4 = load <4 x i8>, ptr %a, align 4
-  // CHECK-NEXT:   %a1 = shufflevector <4 x i8> %loadVec4, <4 x i8> poison, <3 x i32> <i32 0, i32 1, i32 2>
-  // CHECK-NEXT:   %extractVec = shufflevector <3 x i8> %a1, <3 x i8> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
-  // CHECK-NEXT:   store <4 x i8> %extractVec, ptr %a.addr, align 4
-  // CHECK-NEXT:   %loadVec42 = load <4 x i8>, ptr %a.addr, align 4
-  // CHECK-NEXT:   %extractVec3 = shufflevector <4 x i8> %loadVec42, <4 x i8> poison, <3 x i32> <i32 0, i32 1, i32 2>
-  // CHECK-NEXT:   %loadVec44 = load <4 x i8>, ptr %a.addr, align 4
-  // CHECK-NEXT:   %extractVec5 = shufflevector <4 x i8> %loadVec44, <4 x i8> poison, <3 x i32> <i32 0, i32 1, i32 2>
-  // CHECK-NEXT:   %add = add <3 x i8> %extractVec3, %extractVec5
-  // CHECK-NEXT:   store <3 x i8> %add, ptr %retval, align 4
-  // CHECK-NEXT:   %0 = load i32, ptr %retval, align 4
-  // CHECK-NEXT:   ret i32 %0
   return a + a;
 }
 
-// CHECK-LABEL: define dso_local noundef <3 x i32> @_Z2v2Dv3_DB32_(<3 x i32> noundef %a)
+// CHECK-LABEL: define dso_local noundef <3 x i32> @_Z2v2Dv3_DB32_(
+// CHECK-SAME: <3 x i32> noundef [[A:%.*]]) #[[ATTR1:[0-9]+]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <3 x i32>, align 16
+// CHECK-NEXT:    [[EXTRACTVEC:%.*]] = shufflevector <3 x i32> [[A]], <3 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
+// CHECK-NEXT:    store <4 x i32> [[EXTRACTVEC]], ptr [[A_ADDR]], align 16
+// CHECK-NEXT:    [[LOADVEC4:%.*]] = load <4 x i32>, ptr [[A_ADDR]], align 16
+// CHECK-NEXT:    [[EXTRACTVEC1:%.*]] = shufflevector <4 x i32> [[LOADVEC4]], <4 x i32> poison, <3 x i32> <i32 0, i32 1, i32 2>
+// CHECK-NEXT:    [[LOADVEC42:%.*]] = load <4 x i32>, ptr [[A_ADDR]], align 16
+// CHECK-NEXT:    [[EXTRACTVEC3:%.*]] = shufflevector <4 x i32> [[LOADVEC42]], <4 x i32> poison, <3 x i32> <i32 0, i32 1, i32 2>
+// CHECK-NEXT:    [[ADD:%.*]] = add <3 x i32> [[EXTRACTVEC1]], [[EXTRACTVEC3]]
+// CHECK-NEXT:    ret <3 x i32> [[ADD]]
+//
 i32x3 v2(i32x3 a) {
-  // CHECK-NEXT: entry:
-  // CHECK-NEXT:   %a.addr = alloca <3 x i32>, align 16
-  // CHECK-NEXT:   %extractVec = shufflevector <3 x i32> %a, <3 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
-  // CHECK-NEXT:   store <4 x i32> %extractVec, ptr %a.addr, align 16
-  // CHECK-NEXT:   %loadVec4 = load <4 x i32>, ptr %a.addr, align 16
-  // CHECK-NEXT:   %extractVec1 = shufflevector <4 x i32> %loadVec4, <4 x i32> poison, <3 x i32> <i32 0, i32 1, i32 2>
-  // CHECK-NEXT:   %loadVec42 = load <4 x i32>, ptr %a.addr, align 16
-  // CHECK-NEXT:   %extractVec3 = shufflevector <4 x i32> %loadVec42, <4 x i32> poison, <3 x i32> <i32 0, i32 1, i32 2>
-  // CHECK-NEXT:   %add = add <3 x i32> %extractVec1, %extractVec3
-  // CHECK-NEXT:   ret <3 x i32> %add
   return a + a;
 }
 
-// CHECK-LABEL: define dso_local noundef <3 x i512> @_Z2v3Dv3_DB512_(ptr noundef byval(<3 x i512>) align 256 %0)
+// CHECK-LABEL: define dso_local noundef <3 x i512> @_Z2v3Dv3_DB512_(
+// CHECK-SAME: ptr noundef byval(<3 x i512>) align 256 [[TMP0:%.*]]) #[[ATTR2:[0-9]+]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <3 x i512>, align 256
+// CHECK-NEXT:    [[LOADVEC4:%.*]] = load <4 x i512>, ptr [[TMP0]], align 256
+// CHECK-NEXT:    [[A:%.*]] = shufflevector <4 x i512> [[LOADVEC4]], <4 x i512> poison, <3 x i32> <i32 0, i32 1, i32 2>
+// CHECK-NEXT:    [[EXTRACTVEC:%.*]] = shufflevector <3 x i512> [[A]], <3 x i512> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
+// CHECK-NEXT:    store <4 x i512> [[EXTRACTVEC]], ptr [[A_ADDR]], align 256
+// CHECK-NEXT:    [[LOADVEC41:%.*]] = load <4 x i512>, ptr [[A_ADDR]], align 256
+// CHECK-NEXT:    [[EXTRACTVEC2:%.*]] = shufflevector <4 x i512> [[LOADVEC41]], <4 x i512> poison, <3 x i32> <i32 0, i32 1, i32 2>
+// CHECK-NEXT:    [[LOADVEC43:%.*]] = load <4 x i512>, ptr [[A_ADDR]], align 256
+// CHECK-NEXT:    [[EXTRACTVEC4:%.*]] = shufflevector <4 x i512> [[LOADVEC43]], <4 x i512> poison, <3 x i32> <i32 0, i32 1, i32 2>
+// CHECK-NEXT:    [[ADD:%.*]] = add <3 x i512> [[EXTRACTVEC2]], [[EXTRACTVEC4]]
+// CHECK-NEXT:    ret <3 x i512> [[ADD]]
+//
 i512x3 v3(i512x3 a) {
-  // CHECK-NEXT: entry:
-  // CHECK-NEXT:   %a.addr = alloca <3 x i512>, align 256
-  // CHECK-NEXT:   %loadVec4 = load <4 x i512>, ptr %0, align 256
-  // CHECK-NEXT:   %a = shufflevector <4 x i512> %loadVec4, <4 x i512> poison, <3 x i32> <i32 0, i32 1, i32 2>
-  // CHECK-NEXT:   %extractVec = shufflevector <3 x i512> %a, <3 x i512> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
-  // CHECK-NEXT:   store <4 x i512> %extractVec, ptr %a.addr, align 256
-  // CHECK-NEXT:   %loadVec41 = load <4 x i512>, ptr %a.addr, align 256
-  // CHECK-NEXT:   %extractVec2 = shufflevector <4 x i512> %loadVec41, <4 x i512> poison, <3 x i32> <i32 0, i32 1, i32 2>
-  // CHECK-NEXT:   %loadVec43 = load <4 x i512>, ptr %a.addr, align 256
-  // CHECK-NEXT:   %extractVec4 = shufflevector <4 x i512> %loadVec43, <4 x i512> poison, <3 x i32> <i32 0, i32 1, i32 2>
-  // CHECK-NEXT:   %add = add <3 x i512> %extractVec2, %extractVec4
-  // CHECK-NEXT:   ret <3 x i512> %add
   return a + a;
 }
 
-// CHECK-LABEL: define dso_local noundef <9 x i8> @_Z2m1u11matrix_typeILm3ELm3EDB8_E(<9 x i8> noundef %a)
+// CHECK-LABEL: define dso_local noundef <9 x i8> @_Z2m1u11matrix_typeILm3ELm3EDB8_E(
+// CHECK-SAME: <9 x i8> noundef [[A:%.*]]) #[[ATTR3:[0-9]+]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca [9 x i8], align 1
+// CHECK-NEXT:    store <9 x i8> [[A]], ptr [[A_ADDR]], align 1
+// CHECK-NEXT:    [[TMP0:%.*]] = load <9 x i8>, ptr [[A_ADDR]], align 1
+// CHECK-NEXT:    [[TMP1:%.*]] = load <9 x i8>, ptr [[A_ADDR]], align 1
+// CHECK-NEXT:    [[TMP2:%.*]] = add <9 x i8> [[TMP0]], [[TMP1]]
+// CHECK-NEXT:    ret <9 x i8> [[TMP2]]
+//
 i8x3x3 m1(i8x3x3 a) {
-  // CHECK-NEXT: entry:
-  // CHECK-NEXT:   %a.addr = alloca [9 x i8], align 1
-  // CHECK-NEXT:   store <9 x i8> %a, ptr %a.addr, align 1
-  // CHECK-NEXT:   %0 = load <9 x i8>, ptr %a.addr, align 1
-  // CHECK-NEXT:   %1 = load <9 x i8>, ptr %a.addr, align 1
-  // CHECK-NEXT:   %2 = add <9 x i8> %0, %1
-  // CHECK-NEXT:   ret <9 x i8> %2
   return a + a;
 }
 
-// CHECK-LABEL: define dso_local noundef <9 x i32> @_Z2m2u11matrix_typeILm3ELm3EDB32_E(<9 x i32> noundef %a)
+// CHECK-LABEL: define dso_local noundef <9 x i32> @_Z2m2u11matrix_typeILm3ELm3EDB32_E(
+// CHECK-SAME: <9 x i32> noundef [[A:%.*]]) #[[ATTR4:[0-9]+]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca [9 x i32], align 4
+// CHECK-NEXT:    store <9 x i32> [[A]], ptr [[A_ADDR]], align 4
+// CHECK-NEXT:    [[TMP0:%.*]] = load <9 x i32>, ptr [[A_ADDR]], align 4
+// CHECK-NEXT:    [[TMP1:%.*]] = load <9 x i32>, ptr [[A_ADDR]], align 4
+// CHECK-NEXT:    [[TMP2:%.*]] = add <9 x i32> [[TMP0]], [[TMP1]]
+// CHECK-NEXT:    ret <9 x i32> [[TMP2]]
+//
 i32x3x3 m2(i32x3x3 a) {
-  // CHECK-NEXT: entry:
-  // CHECK-NEXT:   %a.addr = alloca [9 x i32], align 4
-  // CHECK-NEXT:   store <9 x i32> %a, ptr %a.addr, align 4
-  // CHECK-NEXT:   %0 = load <9 x i32>, ptr %a.addr, align 4
-  // CHECK-NEXT:   %1 = load <9 x i32>, ptr %a.addr, align 4
-  // CHECK-NEXT:   %2 = add <9 x i32> %0, %1
-  // CHECK-NEXT:   ret <9 x i32> %2
   return a + a;
 }
 
-// CHECK-LABEL: define dso_local noundef <9 x i512> @_Z2m3u11matrix_typeILm3ELm3EDB512_E(<9 x i512> noundef %a)
+// CHECK-LABEL: define dso_local noundef <9 x i512> @_Z2m3u11matrix_typeILm3ELm3EDB512_E(
+// CHECK-SAME: <9 x i512> noundef [[A:%.*]]) #[[ATTR5:[0-9]+]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca [9 x i512], align 8
+// CHECK-NEXT:    store <9 x i512> [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = load <9 x i512>, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP1:%.*]] = load <9 x i512>, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP2:%.*]] = add <9 x i512> [[TMP0]], [[TMP1]]
+// CHECK-NEXT:    ret <9 x i512> [[TMP2]]
+//
 i512x3x3 m3(i512x3x3 a) {
-  // CHECK-NEXT: entry:
-  // CHECK-NEXT:   %a.addr = alloca [9 x i512], align 8
-  // CHECK-NEXT:   store <9 x i512> %a, ptr %a.addr, align 8
-  // CHECK-NEXT:   %0 = load <9 x i512>, ptr %a.addr, align 8
-  // CHECK-NEXT:   %1 = load <9 x i512>, ptr %a.addr, align 8
-  // CHECK-NEXT:   %2 = add <9 x i512> %0, %1
-  // CHECK-NEXT:   ret <9 x i512> %2
   return a + a;
 }
diff --git a/clang/test/CodeGenHLSL/builtins/RWBuffer-subscript.hlsl b/clang/test/CodeGenHLSL/builtins/RWBuffer-subscript.hlsl
index 4428b77..2ad5b82 100644
--- a/clang/test/CodeGenHLSL/builtins/RWBuffer-subscript.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/RWBuffer-subscript.hlsl
@@ -1,4 +1,5 @@
-// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-compute -emit-llvm -o - -O0 %s | FileCheck %s
+// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-compute -emit-llvm -o - -O0 %s | FileCheck %s --check-prefixes=DXC,CHECK
+// RUN: %clang_cc1 -triple spirv1.6-pc-vulkan1.3-compute -emit-llvm -o - -O0 %s | FileCheck %s --check-prefixes=SPIRV,CHECK
 
 RWBuffer<int> In;
 RWBuffer<int> Out;
@@ -7,15 +8,19 @@ RWBuffer<int> Out;
 void main(unsigned GI : SV_GroupIndex) {
   // CHECK: define void @main()
 
-  // CHECK: %[[INPTR:.*]] = call noundef nonnull align 4 dereferenceable(4) ptr @llvm.dx.resource.getpointer.p0.tdx.TypedBuffer_i32_1_0_1t(target("dx.TypedBuffer", i32, 1, 0, 1) %{{.*}}, i32 %{{.*}})
+  // DXC: %[[INPTR:.*]] = call noundef nonnull align 4 dereferenceable(4) ptr @llvm.dx.resource.getpointer.p0.tdx.TypedBuffer_i32_1_0_1t(target("dx.TypedBuffer", i32, 1, 0, 1) %{{.*}}, i32 %{{.*}})
+  // SPIRV: %[[INPTR:.*]] = call noundef nonnull align 4 dereferenceable(4) ptr @llvm.spv.resource.getpointer.p0.tspirv.Image_i32_5_2_0_0_2_0t(target("spirv.Image", i32, 5, 2, 0, 0, 2, 0) %{{.*}}, i32 %{{.*}})
   // CHECK: %[[LOAD:.*]] = load i32, ptr %[[INPTR]]
-  // CHECK: %[[OUTPTR:.*]] = call noundef nonnull align 4 dereferenceable(4) ptr @llvm.dx.resource.getpointer.p0.tdx.TypedBuffer_i32_1_0_1t(target("dx.TypedBuffer", i32, 1, 0, 1) %{{.*}}, i32 %{{.*}})
+  // DXC: %[[OUTPTR:.*]] = call noundef nonnull align 4 dereferenceable(4) ptr @llvm.dx.resource.getpointer.p0.tdx.TypedBuffer_i32_1_0_1t(target("dx.TypedBuffer", i32, 1, 0, 1) %{{.*}}, i32 %{{.*}})
+  // SPIRV: %[[OUTPTR:.*]] = call noundef nonnull align 4 dereferenceable(4) ptr @llvm.spv.resource.getpointer.p0.tspirv.Image_i32_5_2_0_0_2_0t(target("spirv.Image", i32, 5, 2, 0, 0, 2, 0) %{{.*}}, i32 %{{.*}})
   // CHECK: store i32 %[[LOAD]], ptr %[[OUTPTR]]
   Out[GI] = In[GI];
 
-  // CHECK: %[[INPTR:.*]] = call ptr @llvm.dx.resource.getpointer.p0.tdx.TypedBuffer_i32_1_0_1t(target("dx.TypedBuffer", i32, 1, 0, 1) %{{.*}}, i32 %{{.*}})
+  // DXC: %[[INPTR:.*]] = call ptr @llvm.dx.resource.getpointer.p0.tdx.TypedBuffer_i32_1_0_1t(target("dx.TypedBuffer", i32, 1, 0, 1) %{{.*}}, i32 %{{.*}})
+  // SPIRV: %[[INPTR:.*]] = call ptr @llvm.spv.resource.getpointer.p0.tspirv.Image_i32_5_2_0_0_2_0t(target("spirv.Image", i32, 5, 2, 0, 0, 2, 0) %{{.*}}, i32 %{{.*}})
   // CHECK: %[[LOAD:.*]] = load i32, ptr %[[INPTR]]
-  // CHECK: %[[OUTPTR:.*]] = call noundef nonnull align 4 dereferenceable(4) ptr @llvm.dx.resource.getpointer.p0.tdx.TypedBuffer_i32_1_0_1t(target("dx.TypedBuffer", i32, 1, 0, 1) %{{.*}}, i32 %{{.*}})
+  // DXC: %[[OUTPTR:.*]] = call noundef nonnull align 4 dereferenceable(4) ptr @llvm.dx.resource.getpointer.p0.tdx.TypedBuffer_i32_1_0_1t(target("dx.TypedBuffer", i32, 1, 0, 1) %{{.*}}, i32 %{{.*}})
+  // SPIRV: %[[OUTPTR:.*]] = call noundef nonnull align 4 dereferenceable(4) ptr @llvm.spv.resource.getpointer.p0.tspirv.Image_i32_5_2_0_0_2_0t(target("spirv.Image", i32, 5, 2, 0, 0, 2, 0) %{{.*}}, i32 %{{.*}})
   // CHECK: store i32 %[[LOAD]], ptr %[[OUTPTR]]
   Out[GI] = In.Load(GI);
 }
diff --git a/clang/test/CodeGenHLSL/debug/rwbuffer_debug_info.hlsl b/clang/test/CodeGenHLSL/debug/rwbuffer_debug_info.hlsl
new file mode 100644
index 0000000..db0388e
--- /dev/null
+++ b/clang/test/CodeGenHLSL/debug/rwbuffer_debug_info.hlsl
@@ -0,0 +1,14 @@
+// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.6-compute -x hlsl -emit-llvm -disable-llvm-passes -o - -hlsl-entry main %s -debug-info-kind=standalone -dwarf-version=4  | FileCheck %s
+
+
+// CHECK: [[DWTag:![0-9]+]] = distinct !DICompositeType(tag: DW_TAG_class_type, name: "RWBuffer<float>", 
+// CHECK: [[RWBuffer:![0-9]+]] = distinct !DISubprogram(name: "RWBuffer",
+// CHECK-SAME: scope: [[DWTag]]
+// CHECK: [[FirstThis:![0-9]+]] = !DILocalVariable(name: "this", arg: 1, scope: [[RWBuffer]], type: [[thisType:![0-9]+]]
+// CHECK: [[thisType]] = !DIDerivedType(tag: DW_TAG_reference_type, baseType: [[DWTag]], size: 32)
+RWBuffer<float> Out : register(u7, space4);
+
+[numthreads(8,1,1)]
+void main(int GI : SV_GroupIndex) {
+  Out[GI] = 0;
+}
diff --git a/clang/test/CodeGenHLSL/semantics/SV_GroupID.hlsl b/clang/test/CodeGenHLSL/semantics/SV_GroupID.hlsl
index 5e09f0f..3aa054a 100644
--- a/clang/test/CodeGenHLSL/semantics/SV_GroupID.hlsl
+++ b/clang/test/CodeGenHLSL/semantics/SV_GroupID.hlsl
@@ -1,32 +1,36 @@
-// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.3-library -x hlsl -emit-llvm -finclude-default-header -disable-llvm-passes -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.3-library -x hlsl -emit-llvm -finclude-default-header -disable-llvm-passes -o - %s | FileCheck %s --check-prefixes=CHECK,CHECK-DXIL -DTARGET=dx
+// RUN: %clang_cc1 -triple spirv-linux-vulkan-library -x hlsl -emit-llvm -finclude-default-header -disable-llvm-passes -o - %s | FileCheck %s --check-prefixes=CHECK,CHECK-SPIRV -DTARGET=spv
 
-// Make sure SV_GroupID translated into dx.group.id.
+// Make sure SV_GroupID translated into dx.group.id for directx target and spv.group.id for spirv target.
 
-// CHECK:  define void @foo()
-// CHECK:  %[[#ID:]] = call i32 @llvm.dx.group.id(i32 0)
-// CHECK:  call void @{{.*}}foo{{.*}}(i32 %[[#ID]])
+// CHECK:       define void @foo()
+// CHECK:       %[[#ID:]] = call i32 @llvm.[[TARGET]].group.id(i32 0)
+// CHECK-DXIL:       call void @{{.*}}foo{{.*}}(i32 %[[#ID]])
+// CHECK-SPIRV:      call spir_func void @{{.*}}foo{{.*}}(i32 %[[#ID]])
 [shader("compute")]
 [numthreads(8,8,1)]
 void foo(uint Idx : SV_GroupID) {}
 
-// CHECK:  define void @bar()
-// CHECK:  %[[#ID_X:]] = call i32 @llvm.dx.group.id(i32 0)
-// CHECK:  %[[#ID_X_:]] = insertelement <2 x i32> poison, i32 %[[#ID_X]], i64 0
-// CHECK:  %[[#ID_Y:]] = call i32 @llvm.dx.group.id(i32 1)
-// CHECK:  %[[#ID_XY:]] = insertelement <2 x i32> %[[#ID_X_]], i32 %[[#ID_Y]], i64 1
-// CHECK:  call void @{{.*}}bar{{.*}}(<2 x i32> %[[#ID_XY]])
+// CHECK:       define void @bar()
+// CHECK:       %[[#ID_X:]] = call i32 @llvm.[[TARGET]].group.id(i32 0)
+// CHECK:       %[[#ID_X_:]] = insertelement <2 x i32> poison, i32 %[[#ID_X]], i64 0
+// CHECK:       %[[#ID_Y:]] = call i32 @llvm.[[TARGET]].group.id(i32 1)
+// CHECK:       %[[#ID_XY:]] = insertelement <2 x i32> %[[#ID_X_]], i32 %[[#ID_Y]], i64 1
+// CHECK-DXIL:  call void @{{.*}}bar{{.*}}(<2 x i32> %[[#ID_XY]])
+// CHECK-SPIRV:  call spir_func void @{{.*}}bar{{.*}}(<2 x i32> %[[#ID_XY]])
 [shader("compute")]
 [numthreads(8,8,1)]
 void bar(uint2 Idx : SV_GroupID) {}
 
 // CHECK:  define void @test()
-// CHECK:  %[[#ID_X:]] = call i32 @llvm.dx.group.id(i32 0)
+// CHECK:  %[[#ID_X:]] = call i32 @llvm.[[TARGET]].group.id(i32 0)
 // CHECK:  %[[#ID_X_:]] = insertelement <3 x i32> poison, i32 %[[#ID_X]], i64 0
-// CHECK:  %[[#ID_Y:]] = call i32 @llvm.dx.group.id(i32 1)
+// CHECK:  %[[#ID_Y:]] = call i32 @llvm.[[TARGET]].group.id(i32 1)
 // CHECK:  %[[#ID_XY:]] = insertelement <3 x i32> %[[#ID_X_]], i32 %[[#ID_Y]], i64 1
-// CHECK:  %[[#ID_Z:]] = call i32 @llvm.dx.group.id(i32 2)
+// CHECK:  %[[#ID_Z:]] = call i32 @llvm.[[TARGET]].group.id(i32 2)
 // CHECK:  %[[#ID_XYZ:]] = insertelement <3 x i32> %[[#ID_XY]], i32 %[[#ID_Z]], i64 2
-// CHECK:  call void @{{.*}}test{{.*}}(<3 x i32> %[[#ID_XYZ]])
+// CHECK-DXIL:   call void @{{.*}}test{{.*}}(<3 x i32> %[[#ID_XYZ]])
+// CHECK-SPIRV:  call spir_func void @{{.*}}test{{.*}}(<3 x i32> %[[#ID_XYZ]])
 [shader("compute")]
 [numthreads(8,8,1)]
 void test(uint3 Idx : SV_GroupID) {}
diff --git a/clang/test/CodeGenOpenCL/ptx-calls.cl b/clang/test/CodeGenOpenCL/ptx-calls.cl
index 0081152..ae18717 100644
--- a/clang/test/CodeGenOpenCL/ptx-calls.cl
+++ b/clang/test/CodeGenOpenCL/ptx-calls.cl
@@ -7,7 +7,5 @@ void device_function() {
 __kernel void kernel_function() {
   device_function();
 }
-// CHECK-LABEL: define{{.*}} spir_kernel void @kernel_function()
+// CHECK-LABEL: define{{.*}} ptx_kernel void @kernel_function()
 // CHECK: call void @device_function()
-// CHECK: !{{[0-9]+}} = !{ptr @kernel_function, !"kernel", i32 1}
-
diff --git a/clang/test/CodeGenOpenCL/ptx-kernels.cl b/clang/test/CodeGenOpenCL/ptx-kernels.cl
index 210e568..eac0df4 100644
--- a/clang/test/CodeGenOpenCL/ptx-kernels.cl
+++ b/clang/test/CodeGenOpenCL/ptx-kernels.cl
@@ -6,6 +6,4 @@ void device_function() {
 
 __kernel void kernel_function() {
 }
-// CHECK-LABEL: define{{.*}} spir_kernel void @kernel_function()
-
-// CHECK: !{{[0-9]+}} = !{ptr @kernel_function, !"kernel", i32 1}
+// CHECK-LABEL: define{{.*}} ptx_kernel void @kernel_function()
diff --git a/clang/test/CodeGenOpenCL/reflect.cl b/clang/test/CodeGenOpenCL/reflect.cl
index 9ae4a5f..f5b618f 100644
--- a/clang/test/CodeGenOpenCL/reflect.cl
+++ b/clang/test/CodeGenOpenCL/reflect.cl
@@ -12,8 +12,8 @@ bool device_function() {
   return __nvvm_reflect("__CUDA_ARCH") >= 700;
 }
 
-// CHECK-LABEL: define dso_local spir_kernel void @kernel_function(
-// CHECK-SAME: ptr addrspace(1) noundef align 4 [[I:%.*]]) #[[ATTR2:[0-9]+]] !kernel_arg_addr_space !4 !kernel_arg_access_qual !5 !kernel_arg_type !6 !kernel_arg_base_type !6 !kernel_arg_type_qual !7 {
+// CHECK-LABEL: define dso_local ptx_kernel void @kernel_function(
+// CHECK-SAME: ptr addrspace(1) noundef align 4 [[I:%.*]]) #[[ATTR2:[0-9]+]] !kernel_arg_addr_space [[META3:![0-9]+]] !kernel_arg_access_qual [[META4:![0-9]+]] !kernel_arg_type [[META5:![0-9]+]] !kernel_arg_base_type [[META5]] !kernel_arg_type_qual [[META6:![0-9]+]] {
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[I_ADDR:%.*]] = alloca ptr addrspace(1), align 4
 // CHECK-NEXT:    store ptr addrspace(1) [[I]], ptr [[I_ADDR]], align 4
@@ -26,3 +26,9 @@ bool device_function() {
 __kernel void kernel_function(__global int *i) {
   *i = device_function();
 }
+//.
+// CHECK: [[META3]] = !{i32 1}
+// CHECK: [[META4]] = !{!"none"}
+// CHECK: [[META5]] = !{!"int*"}
+// CHECK: [[META6]] = !{!""}
+//.
diff --git a/clang/test/CodeGenSPIRV/Builtins/distance.c b/clang/test/CodeGenSPIRV/Builtins/distance.c
new file mode 100644
index 0000000..76c684b
--- /dev/null
+++ b/clang/test/CodeGenSPIRV/Builtins/distance.c
@@ -0,0 +1,31 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
+
+// RUN: %clang_cc1 -O1 -triple spirv-pc-vulkan-compute %s -emit-llvm -o - | FileCheck %s
+
+typedef float float2 __attribute__((ext_vector_type(2)));
+typedef float float3 __attribute__((ext_vector_type(3)));
+typedef float float4 __attribute__((ext_vector_type(4)));
+
+// CHECK-LABEL: define spir_func float @test_distance_float2(
+// CHECK-SAME: <2 x float> noundef [[X:%.*]], <2 x float> noundef [[Y:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[SPV_DISTANCE:%.*]] = tail call float @llvm.spv.distance.v2f32(<2 x float> [[X]], <2 x float> [[Y]])
+// CHECK-NEXT:    ret float [[SPV_DISTANCE]]
+//
+float test_distance_float2(float2 X, float2 Y) { return __builtin_spirv_distance(X, Y); }
+
+// CHECK-LABEL: define spir_func float @test_distance_float3(
+// CHECK-SAME: <3 x float> noundef [[X:%.*]], <3 x float> noundef [[Y:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[SPV_DISTANCE:%.*]] = tail call float @llvm.spv.distance.v3f32(<3 x float> [[X]], <3 x float> [[Y]])
+// CHECK-NEXT:    ret float [[SPV_DISTANCE]]
+//
+float test_distance_float3(float3 X, float3 Y) { return __builtin_spirv_distance(X, Y); }
+
+// CHECK-LABEL: define spir_func float @test_distance_float4(
+// CHECK-SAME: <4 x float> noundef [[X:%.*]], <4 x float> noundef [[Y:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[SPV_DISTANCE:%.*]] = tail call float @llvm.spv.distance.v4f32(<4 x float> [[X]], <4 x float> [[Y]])
+// CHECK-NEXT:    ret float [[SPV_DISTANCE]]
+//
+float test_distance_float4(float4 X, float4 Y) { return __builtin_spirv_distance(X, Y); }
diff --git a/clang/test/CoverageMapping/single-byte-counters.cpp b/clang/test/CoverageMapping/single-byte-counters.cpp
index 4c0987e..f09e130 100644
--- a/clang/test/CoverageMapping/single-byte-counters.cpp
+++ b/clang/test/CoverageMapping/single-byte-counters.cpp
@@ -1,20 +1,22 @@
 // RUN: %clang_cc1 -mllvm -emptyline-comment-coverage=false -mllvm -enable-single-byte-coverage=true -fprofile-instrument=clang -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -main-file-name single-byte-counters.cpp %s | FileCheck %s
 
 // CHECK: testIf
-int testIf(int x) { // CHECK-NEXT: File 0, [[@LINE]]:19 -> [[@LINE+7]]:2 = [[C00:#0]]
+int testIf(int x) { // CHECK-NEXT: File 0, [[@LINE]]:19 -> [[@LINE+8]]:2 = [[C00:#0]]
   int result = 0;
   if (x == 0)       // CHECK-NEXT: File 0, [[@LINE]]:7 -> [[@LINE]]:13 = [[C00]]
-                    // CHECK-NEXT: Gap,File 0, [[@LINE-1]]:14 -> [[@LINE+1]]:5 = [[C0T:#1]]
+
+                    // CHECK-NEXT: Gap,File 0, [[@LINE-2]]:14 -> [[@LINE+1]]:5 = [[C0T:#1]]
     result = -1;    // CHECK-NEXT: File 0, [[@LINE]]:5 -> [[@LINE]]:16 = [[C0T]]
 
   return result;    // CHECK-NEXT: File 0, [[@LINE]]:3 -> [[@LINE]]:16 = [[C0E:#2]]
 }
 
 // CHECK-NEXT: testIfElse
-int testIfElse(int x) { // CHECK-NEXT: File 0, [[@LINE]]:23 -> [[@LINE+8]]:2 = [[C10:#0]]
+int testIfElse(int x) { // CHECK-NEXT: File 0, [[@LINE]]:23 -> [[@LINE+9]]:2 = [[C10:#0]]
   int result = 0;
   if (x < 0)            // CHECK-NEXT: File 0, [[@LINE]]:7 -> [[@LINE]]:12 = [[C10]]
-                        // CHECK-NEXT: Gap,File 0, [[@LINE-1]]:13 -> [[@LINE+1]]:5 = [[C1T:#1]]
+
+                        // CHECK-NEXT: Gap,File 0, [[@LINE-2]]:13 -> [[@LINE+1]]:5 = [[C1T:#1]]
     result = 0;         // CHECK-NEXT: File 0, [[@LINE]]:5 -> [[@LINE]]:15 = [[C1T]]
   else                  // CHECK-NEXT: Gap,File 0, [[@LINE-1]]:16 -> [[@LINE+1]]:5 = [[C1F:#2]]
     result = x * x;     // CHECK-NEXT: File 0, [[@LINE]]:5 -> [[@LINE]]:19 = [[C1F]]
@@ -22,10 +24,11 @@ int testIfElse(int x) { // CHECK-NEXT: File 0, [[@LINE]]:23 -> [[@LINE+8]]:2 = [
 }
 
 // CHECK-NEXT: testIfElseReturn
-int testIfElseReturn(int x) { // CHECK-NEXT: File 0, [[@LINE]]:29 -> [[@LINE+9]]:2 = [[C20:#0]]
+int testIfElseReturn(int x) { // CHECK-NEXT: File 0, [[@LINE]]:29 -> [[@LINE+10]]:2 = [[C20:#0]]
   int result = 0;
   if (x > 0)                  // CHECK-NEXT: File 0, [[@LINE]]:7 -> [[@LINE]]:12 = [[C20]]
-                              // CHECK-NEXT: Gap,File 0, [[@LINE-1]]:13 -> [[@LINE+1]]:5 = [[C2T:#1]]
+
+                              // CHECK-NEXT: Gap,File 0, [[@LINE-2]]:13 -> [[@LINE+1]]:5 = [[C2T:#1]]
     result = x * x;           // CHECK-NEXT: File 0, [[@LINE]]:5 -> [[@LINE]]:19 = [[C2T]]
   else                        // CHECK-NEXT: Gap,File 0, [[@LINE-1]]:20 -> [[@LINE+1]]:5 = [[C2F:#2]]
     return 0;                 // CHECK-NEXT: File 0, [[@LINE]]:5 -> [[@LINE]]:13 = [[C2F]]
@@ -34,10 +37,11 @@ int testIfElseReturn(int x) { // CHECK-NEXT: File 0, [[@LINE]]:29 -> [[@LINE+9]]
 }
 
 // CHECK-NEXT: testIfBothReturn
-int testIfBothReturn(int x) { // CHECK-NEXT: File 0, [[@LINE]]:29 -> [[@LINE+9]]:2 = [[C20:#0]]
+int testIfBothReturn(int x) { // CHECK-NEXT: File 0, [[@LINE]]:29 -> [[@LINE+10]]:2 = [[C20:#0]]
   int result = 0;
   if (x > 0)                  // CHECK-NEXT: File 0, [[@LINE]]:7 -> [[@LINE]]:12 = [[C20]]
-                              // CHECK-NEXT: Gap,File 0, [[@LINE-1]]:13 -> [[@LINE+1]]:5 = [[C2T:#1]]
+
+                              // CHECK-NEXT: Gap,File 0, [[@LINE-2]]:13 -> [[@LINE+1]]:5 = [[C2T:#1]]
     return 42;                // CHECK-NEXT: File 0, [[@LINE]]:5 -> [[@LINE]]:14 = [[C2T]]
   else                        // CHECK-NEXT: Gap,File 0, [[@LINE-1]]:15 -> [[@LINE+1]]:5 = [[C2F:#2]]
     return 0;                 // CHECK-NEXT: File 0, [[@LINE]]:5 -> [[@LINE]]:13 = [[C2F]]
@@ -46,19 +50,22 @@ int testIfBothReturn(int x) { // CHECK-NEXT: File 0, [[@LINE]]:29 -> [[@LINE+9]]
 }
 
 // CHECK-NEXT: testSwitch
-int testSwitch(int x) { // CHECK-NEXT: File 0, [[@LINE]]:23 -> [[@LINE+17]]:2 = [[C30:#0]]
+int testSwitch(int x) { // CHECK-NEXT: File 0, [[@LINE]]:23 -> [[@LINE+20]]:2 = [[C30:#0]]
   int result;
   switch (x) {
-                        // CHECK-NEXT: Gap,File 0, [[@LINE-1]]:14 -> [[@LINE+10]]:15 = 0
-  case 1:               // CHECK-NEXT: File 0, [[@LINE]]:3 -> [[@LINE+2]]:10 = [[C31:#2]]
+                        // CHECK-NEXT: Gap,File 0, [[@LINE-1]]:14 -> [[@LINE+13]]:15 = 0
+  case 1:               // CHECK-NEXT: File 0, [[@LINE]]:3 -> [[@LINE+3]]:10 = [[C31:#2]]
+
     result = 1;
     break;
                         // CHECK-NEXT: Gap,File 0, [[@LINE-1]]:11 -> [[@LINE+1]]:3 = 0
-  case 2:               // CHECK-NEXT: File 0, [[@LINE]]:3 -> [[@LINE+2]]:10 = [[C32:#3]]
+  case 2:               // CHECK-NEXT: File 0, [[@LINE]]:3 -> [[@LINE+3]]:10 = [[C32:#3]]
+
     result = 2;
     break;
                         // CHECK-NEXT: Gap,File 0, [[@LINE-1]]:11 -> [[@LINE+1]]:3 = 0
-  default:              // CHECK-NEXT: File 0, [[@LINE]]:3 -> [[@LINE+1]]:15 = [[C3D:#4]]
+  default:              // CHECK-NEXT: File 0, [[@LINE]]:3 -> [[@LINE+2]]:15 = [[C3D:#4]]
+
     result = 0;
   }
                         // CHECK-NEXT: Gap,File 0, [[@LINE-1]]:4 -> [[@LINE+1]]:3 = [[C3E:#1]]
@@ -66,12 +73,13 @@ int testSwitch(int x) { // CHECK-NEXT: File 0, [[@LINE]]:23 -> [[@LINE+17]]:2 =
 }
 
 // CHECK-NEXT: testWhile
-int testWhile() {       // CHECK-NEXT: File 0, [[@LINE]]:17 -> [[@LINE+11]]:2 = [[C40:#0]]
+int testWhile() {       // CHECK-NEXT: File 0, [[@LINE]]:17 -> [[@LINE+12]]:2 = [[C40:#0]]
   int i = 0;
   int sum = 0;
   while (i < 10) {      // CHECK-NEXT: File 0, [[@LINE]]:10 -> [[@LINE]]:16 = [[C4C:#1]]
-                        // CHECK-NEXT: Gap,File 0, [[@LINE-1]]:17 -> [[@LINE-1]]:18 = [[C4T:#2]]
-                        // CHECK-NEXT: File 0, [[@LINE-2]]:18 -> [[@LINE+3]]:4 = [[C4T]]
+
+                        // CHECK-NEXT: Gap,File 0, [[@LINE-2]]:17 -> [[@LINE-2]]:18 = [[C4T:#2]]
+                        // CHECK-NEXT: File 0, [[@LINE-3]]:18 -> [[@LINE+3]]:4 = [[C4T]]
     sum += i;
     i++;
   }
@@ -80,19 +88,22 @@ int testWhile() {       // CHECK-NEXT: File 0, [[@LINE]]:17 -> [[@LINE+11]]:2 =
 }
 
 // CHECK-NEXT: testContinueBreak
-int testContinueBreak() { // CHECK-NEXT: File 0, [[@LINE]]:25 -> [[@LINE+20]]:2 = #0
+int testContinueBreak() { // CHECK-NEXT: File 0, [[@LINE]]:25 -> [[@LINE+23]]:2 = #0
   int i = 0;
   int sum = 0;
   while (i < 10) {   // CHECK-NEXT: File 0, [[@LINE]]:10 -> [[@LINE]]:16 = #1
-                     // CHECK-NEXT: Gap,File 0, [[@LINE-1]]:17 -> [[@LINE-1]]:18 = [[C5B:#2]]
-                     // CHECK-NEXT: File 0, [[@LINE-2]]:18 -> [[@LINE+12]]:4 = [[C5B]]
+
+                     // CHECK-NEXT: Gap,File 0, [[@LINE-2]]:17 -> [[@LINE-2]]:18 = [[C5B:#2]]
+                     // CHECK-NEXT: File 0, [[@LINE-3]]:18 -> [[@LINE+14]]:4 = [[C5B]]
     if (i == 4)      // CHECK-NEXT: File 0, [[@LINE]]:9 -> [[@LINE]]:15 = [[C5B]]
-                     // CHECK-NEXT: Gap,File 0, [[@LINE-1]]:16 -> [[@LINE+1]]:7 = [[C5T:#4]]
+
+                     // CHECK-NEXT: Gap,File 0, [[@LINE-2]]:16 -> [[@LINE+1]]:7 = [[C5T:#4]]
       continue;      // CHECK-NEXT: File 0, [[@LINE]]:7 -> [[@LINE]]:15 = [[C5T]]
                      // CHECK-NEXT: Gap,File 0, [[@LINE-1]]:16 -> [[@LINE+2]]:5 = [[C5F:#5]]
-                     // CHECK-NEXT: File 0, [[@LINE+1]]:5 -> [[@LINE+7]]:4 = [[C5F]]
+                     // CHECK-NEXT: File 0, [[@LINE+1]]:5 -> [[@LINE+8]]:4 = [[C5F]]
     if (i == 5)      // CHECK-NEXT: File 0, [[@LINE]]:9 -> [[@LINE]]:15 = [[C5F]]
-                     // CHECK-NEXT: Gap,File 0, [[@LINE-1]]:16 -> [[@LINE+1]]:7 = [[C5T1:#6]]
+
+                     // CHECK-NEXT: Gap,File 0, [[@LINE-2]]:16 -> [[@LINE+1]]:7 = [[C5T1:#6]]
       break;         // CHECK-NEXT: File 0, [[@LINE]]:7 -> [[@LINE]]:12 = [[C5T1]]
                      // CHECK-NEXT: Gap,File 0, [[@LINE-1]]:13 -> [[@LINE+1]]:5 = [[C5F1:#7]]
     sum += i;        // CHECK-NEXT: File 0, [[@LINE]]:5 -> [[@LINE+2]]:4 = [[C5F1]]
@@ -103,10 +114,11 @@ int testContinueBreak() { // CHECK-NEXT: File 0, [[@LINE]]:25 -> [[@LINE+20]]:2
 }
 
 // CHECK-NEXT: testFor
-int testFor() { // CHECK-NEXT: File 0, [[@LINE]]:15 -> [[@LINE+12]]:2 = [[C60:#0]]
+int testFor() { // CHECK-NEXT: File 0, [[@LINE]]:15 -> [[@LINE+13]]:2 = [[C60:#0]]
   int i;
   int sum = 0;
-                // CHECK-NEXT: File 0, [[@LINE+2]]:19 -> [[@LINE+2]]:25 = [[C61:#1]]
+                // CHECK-NEXT: File 0, [[@LINE+3]]:19 -> [[@LINE+3]]:25 = [[C61:#1]]
+
                 // CHECK-NEXT: File 0, [[@LINE+1]]:27 -> [[@LINE+1]]:30 = [[C6C:#2]]
   for (int i = 0; i < 10; i++) {
                 // CHECK-NEXT: Gap,File 0, [[@LINE-1]]:31 -> [[@LINE-1]]:32 = [[C6B:#3]]
@@ -144,10 +156,11 @@ int testDo() {          // CHECK-NEXT: File 0, [[@LINE]]:14 -> [[@LINE+9]]:2 = [
 }
 
 // CHECK-NEXT: testConditional
-int testConditional(int x) {    // CHECK-NEXT: File 0, [[@LINE]]:28 -> [[@LINE+6]]:2 = [[C90:#0]]
+int testConditional(int x) {    // CHECK-NEXT: File 0, [[@LINE]]:28 -> [[@LINE+7]]:2 = [[C90:#0]]
  int result = (x > 0) ? 1 : -1; // CHECK-NEXT: File 0, [[@LINE]]:15 -> [[@LINE]]:22 = [[C90]]
-                                // CHECK-NEXT: Gap,File 0, [[@LINE-1]]:24 -> [[@LINE-1]]:25 = [[C9T:#2]]
-                                // CHECK-NEXT: File 0, [[@LINE-2]]:25 -> [[@LINE-2]]:26 = [[C9T]]
-                                // CHECK-NEXT: File 0, [[@LINE-3]]:29 -> [[@LINE-3]]:31 = [[C9F:#3]]
+
+                                // CHECK-NEXT: Gap,File 0, [[@LINE-2]]:24 -> [[@LINE-2]]:25 = [[C9T:#2]]
+                                // CHECK-NEXT: File 0, [[@LINE-3]]:25 -> [[@LINE-3]]:26 = [[C9T]]
+                                // CHECK-NEXT: File 0, [[@LINE-4]]:29 -> [[@LINE-4]]:31 = [[C9F:#3]]
  return result;                 // CHECK-NEXT: File 0, [[@LINE]]:2 -> [[@LINE]]:15 = [[C9E:#1]]
 }
diff --git a/clang/test/Driver/Inputs/libomptarget/libomptarget-nvptx-sm_52.bc b/clang/test/Driver/Inputs/MacOSX15.1.sdk/embedded/usr/include/.keep
index e69de29..e69de29 100644
--- a/clang/test/Driver/Inputs/libomptarget/libomptarget-nvptx-sm_52.bc
+++ b/clang/test/Driver/Inputs/MacOSX15.1.sdk/embedded/usr/include/.keep
diff --git a/clang/test/Driver/Inputs/libomptarget/subdir/libomptarget-nvptx-sm_52.bc b/clang/test/Driver/Inputs/MacOSX15.1.sdk/embedded/usr/local/include/.keep
index e69de29..e69de29 100644
--- a/clang/test/Driver/Inputs/libomptarget/subdir/libomptarget-nvptx-sm_52.bc
+++ b/clang/test/Driver/Inputs/MacOSX15.1.sdk/embedded/usr/local/include/.keep
diff --git a/clang/test/Driver/Inputs/MacOSX15.1.sdk/usr/include/c++/v1/.keep b/clang/test/Driver/Inputs/MacOSX15.1.sdk/usr/include/c++/v1/.keep
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/clang/test/Driver/Inputs/MacOSX15.1.sdk/usr/include/c++/v1/.keep
diff --git a/clang/test/Driver/Inputs/MacOSX15.1.sdk/usr/local/include/.keep b/clang/test/Driver/Inputs/MacOSX15.1.sdk/usr/local/include/.keep
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/clang/test/Driver/Inputs/MacOSX15.1.sdk/usr/local/include/.keep
diff --git a/clang/test/Driver/Inputs/config-zos/clang.cfg b/clang/test/Driver/Inputs/config-zos/clang.cfg
new file mode 100644
index 0000000..43a5dbf
--- /dev/null
+++ b/clang/test/Driver/Inputs/config-zos/clang.cfg
@@ -0,0 +1 @@
+-DABC=123
diff --git a/clang/test/Driver/Inputs/config-zos/def.cfg b/clang/test/Driver/Inputs/config-zos/def.cfg
new file mode 100644
index 0000000..156f9c8
--- /dev/null
+++ b/clang/test/Driver/Inputs/config-zos/def.cfg
@@ -0,0 +1 @@
+-DDEF=456
diff --git a/clang/test/Driver/Inputs/config-zos/tst/def.cfg b/clang/test/Driver/Inputs/config-zos/tst/def.cfg
new file mode 100644
index 0000000..156f9c8
--- /dev/null
+++ b/clang/test/Driver/Inputs/config-zos/tst/def.cfg
@@ -0,0 +1 @@
+-DDEF=456
diff --git a/clang/test/Driver/Inputs/libomptarget/libomptarget-nvptx.bc b/clang/test/Driver/Inputs/libomptarget/libomptarget-nvptx.bc
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/clang/test/Driver/Inputs/libomptarget/libomptarget-nvptx.bc
diff --git a/clang/test/Driver/Inputs/libomptarget/subdir/libomptarget-nvptx.bc b/clang/test/Driver/Inputs/libomptarget/subdir/libomptarget-nvptx.bc
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/clang/test/Driver/Inputs/libomptarget/subdir/libomptarget-nvptx.bc
diff --git a/clang/test/Driver/Inputs/spirv-openmp/lib/libomptarget-spirv64.bc b/clang/test/Driver/Inputs/spirv-openmp/lib/libomptarget-spirv64.bc
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/clang/test/Driver/Inputs/spirv-openmp/lib/libomptarget-spirv64.bc
diff --git a/clang/test/Driver/amdgpu-openmp-toolchain.c b/clang/test/Driver/amdgpu-openmp-toolchain.c
index f596708..1c2ee26 100644
--- a/clang/test/Driver/amdgpu-openmp-toolchain.c
+++ b/clang/test/Driver/amdgpu-openmp-toolchain.c
@@ -81,3 +81,7 @@
 // RUN:   %clang -### --target=x86_64-unknown-linux-gnu -emit-llvm -S -fopenmp --offload-arch=gfx803 \
 // RUN:     -stdlib=libc++ -nogpulib %s 2>&1 | FileCheck %s --check-prefix=LIBCXX
 // LIBCXX-NOT: include/amdgcn-amd-amdhsa/c++/v1
+
+// RUN: %clang -### -target x86_64-pc-linux-gnu -nogpulib  -fopenmp --offload-arch=gfx90a \
+// RUN:   -ftime-report %s 2>&1 | FileCheck %s --check-prefix=CHECK-TIME-REPORT
+// CHECK-TIME-REPORT: clang-linker-wrapper{{.*}}"--device-compiler=-ftime-report"
diff --git a/clang/test/Driver/cl-options.c b/clang/test/Driver/cl-options.c
index c975727..29a0fcb 100644
--- a/clang/test/Driver/cl-options.c
+++ b/clang/test/Driver/cl-options.c
@@ -739,6 +739,8 @@
 // RUN:     -fimplicit-modules \
 // RUN:     -fno-implicit-modules \
 // RUN:     -ftrivial-auto-var-init=zero \
+// RUN:     -fwrapv \
+// RUN:     -fno-wrapv \
 // RUN:     --version \
 // RUN:     -Werror /Zs -- %s 2>&1
 
diff --git a/clang/test/Driver/clang_f_opts.c b/clang/test/Driver/clang_f_opts.c
index ddbf1fd..2b72068 100644
--- a/clang/test/Driver/clang_f_opts.c
+++ b/clang/test/Driver/clang_f_opts.c
@@ -364,6 +364,7 @@
 // RUN: -fno-devirtualize-speculatively                                       \
 // RUN: -fslp-vectorize-aggressive                                            \
 // RUN: -fno-slp-vectorize-aggressive                                         \
+// RUN: -forder-file-instrumentation                                          \
 // RUN: %s 2>&1 | FileCheck --check-prefix=CHECK-WARNING %s
 // CHECK-WARNING-DAG: optimization flag '-finline-limit=1000' is not supported
 // CHECK-WARNING-DAG: optimization flag '-finline-limit' is not supported
@@ -423,6 +424,7 @@
 // CHECK-WARNING-DAG: optimization flag '-fno-devirtualize-speculatively' is not supported
 // CHECK-WARNING-DAG: the flag '-fslp-vectorize-aggressive' has been deprecated and will be ignored
 // CHECK-WARNING-DAG: the flag '-fno-slp-vectorize-aggressive' has been deprecated and will be ignored
+// CHECK-WARNING-DAG: argument '-forder-file-instrumentation' is deprecated, use '-mllvm -pgo-temporal-instrumentation' instead
 
 // Test that we mute the warning on these
 // RUN: %clang -### -finline-limit=1000 -Wno-invalid-command-line-argument              \
diff --git a/clang/test/Driver/config-zos.c b/clang/test/Driver/config-zos.c
new file mode 100644
index 0000000..8de02ec
--- /dev/null
+++ b/clang/test/Driver/config-zos.c
@@ -0,0 +1,17 @@
+// REQUIRES: shell
+// REQUIRES: systemz-registered-target
+
+// RUN: unset CLANG_NO_DEFAULT_CONFIG
+// RUN: rm -rf %t && mkdir %t
+
+// RUN: mkdir -p %t/testbin
+// RUN: mkdir -p %t/etc
+// RUN: ln -s %clang %t/testbin/clang
+// RUN: echo "-DXYZ=789" >%t/etc/clang.cfg
+// RUN: %t/testbin/clang --target=s390x-ibm-zos -c -### -no-canonical-prefixes %s 2>&1 | FileCheck -DDIR=%t %s 
+// RUN: %t/testbin/clang --target=s390x-ibm-zos -c -### -no-canonical-prefixes --no-default-config %s 2>&1 | FileCheck -check-prefix=NOCONFIG %s 
+//
+// CHECK: Configuration file: [[DIR]]/etc/clang.cfg
+// CHECK: "-D" "XYZ=789"
+// NOCONFIG-NOT: Configuration file: {{.*}}/etc/clang.cfg
+// NOCONFIG-NOT: "-D" "XYZ=789"
diff --git a/clang/test/Driver/config-zos1.c b/clang/test/Driver/config-zos1.c
new file mode 100644
index 0000000..5b1012d
--- /dev/null
+++ b/clang/test/Driver/config-zos1.c
@@ -0,0 +1,23 @@
+// REQUIRES: shell
+// REQUIRES: systemz-registered-target
+
+// RUN: unset CLANG_NO_DEFAULT_CONFIG
+
+// RUN: export CLANG_CONFIG_PATH=%S/Inputs/config-zos
+// RUN: %clang --target=s390x-ibm-zos -c -### %s 2>&1 | FileCheck %s 
+// CHECK: Configuration file: {{.*}}/Inputs/config-zos/clang.cfg
+// CHECK: "-D" "ABC=123"
+
+// RUN: export CLANG_CONFIG_PATH=%S/Inputs/config-zos/def.cfg
+// RUN: %clang --target=s390x-ibm-zos -c -### %s 2>&1 | FileCheck %s -check-prefix=CHECK-DEF
+// CHECK-DEF: Configuration file: {{.*}}/Inputs/config-zos/def.cfg
+// CHECK-DEF: "-D" "DEF=456"
+
+// RUN: export CLANG_CONFIG_PATH=%S/Inputs/config-zos/Garbage
+// RUN: not %clang --target=s390x-ibm-zos -c -### %s 2>&1 | FileCheck %s  -check-prefix=CHECK-ERR
+// CHECK-ERR:  error: configuration file '{{.*}}/Inputs/config-zos/Garbage' cannot be found
+
+// The directory exists but no clang.cfg in it
+// RUN: export CLANG_CONFIG_PATH=%S/Inputs/config-zos/tst
+// RUN: not %clang --target=s390x-ibm-zos -c -### %s 2>&1 | FileCheck %s  -check-prefix=CHECK-ERRDIR
+// CHECK-ERRDIR:  error: configuration file '{{.*}}/Inputs/config-zos/tst/clang.cfg' cannot be found
diff --git a/clang/test/Driver/darwin-embedded-search-paths-libcxx.c b/clang/test/Driver/darwin-embedded-search-paths-libcxx.c
new file mode 100644
index 0000000..0f9a846
--- /dev/null
+++ b/clang/test/Driver/darwin-embedded-search-paths-libcxx.c
@@ -0,0 +1,45 @@
+// REQUIRES: default-cxx-stdlib=libc++
+// UNSUPPORTED: system-windows
+//   Windows is unsupported because we use the Unix path separator `/` in the test.
+
+// Unlike the Darwin driver, the MachO driver doesn't add any framework search paths,
+// only the normal header ones.
+// RUN: %clang -x c -target arm64-apple-none-macho -isysroot %S/Inputs/MacOSX15.1.sdk -### -c %s 2>&1 \
+// RUN: | FileCheck --check-prefixes=CC1,NO-CXX,ULI,CI,UI,NO-FW -DSDKROOT=%S/Inputs/MacOSX15.1.sdk %s
+
+// Unlike the Darwin driver, the MachO driver doesn't default to libc++, but when
+// CLANG_DEFAULT_CXX_STDLIB is libc++ then the MachO driver should find the search path.
+// RUN: %clang -x c++ -target arm64-apple-none-macho -isysroot %S/Inputs/MacOSX15.1.sdk -### -c %s 2>&1 \
+// RUN: | FileCheck --check-prefixes=CC1,CXX,ULI,CI,UI,NO-FW -DSDKROOT=%S/Inputs/MacOSX15.1.sdk %s
+
+// If the user requests libc++, the MachO driver should still find the search path.
+// RUN: %clang -x c++ -stdlib=libc++ -target arm64-apple-none-macho -isysroot %S/Inputs/MacOSX15.1.sdk -### -c %s 2>&1 \
+// RUN: | FileCheck --check-prefixes=CC1,CXX,ULI,CI,UI,NO-FW -DSDKROOT=%S/Inputs/MacOSX15.1.sdk %s
+
+// Verify that embedded uses can swap in alternate usr/include and usr/local/include directories.
+// usr/local/include is specified in the driver as -internal-isystem, however, the driver generated
+// paths come before the paths in the driver arguments. In order to keep usr/local/include in the
+// same position, -isystem has to be used instead of -Xclang -internal-isystem. There isn't an
+// -externc-isystem, but it's ok to use -Xclang -internal-externc-isystem since the driver doesn't
+// use that if -nostdlibinc or -nostdinc is passed.
+// RUN: %clang -x c++ -stdlib=libc++ -target arm64-apple-none-macho -isysroot %S/Inputs/MacOSX15.1.sdk \
+// RUN:        -nostdlibinc -isystem %S/Inputs/MacOSX15.1.sdk/embedded/usr/local/include \
+// RUN:        -Xclang -internal-externc-isystem -Xclang %S/Inputs/MacOSX15.1.sdk/embedded/usr/include \
+// RUN:        -### -c %s 2>&1 | FileCheck --check-prefixes=CC1,NO-CXX,EULI,CI,EUI,NO-FW -DSDKROOT=%S/Inputs/MacOSX15.1.sdk %s
+
+
+// The ordering of these flags doesn't matter, and so this test is a little
+// fragile. i.e. all of the -internal-isystem paths will be searched before the
+// -internal-externc-isystem ones, and their order on the command line doesn't
+// matter. The line order here is just the current order that the driver writes
+// the cc1 arguments.
+
+// CC1: "-cc1"
+// NO-CXX-NOT: "-internal-isystem" "{{.*}}/include/c++/v1"
+// CXX-SAME: "-internal-isystem" "{{.*}}/include/c++/v1"
+// ULI-SAME: "-internal-isystem" "[[SDKROOT]]/usr/local/include"
+// EULI-SAME: "-isystem" "[[SDKROOT]]/embedded/usr/local/include"
+// CI-SAME: "-internal-isystem" "{{.*}}/clang/{{[[:digit:].]*}}/include"
+// UI-SAME: "-internal-externc-isystem" "[[SDKROOT]]/usr/include"
+// EUI-SAME: "-internal-externc-isystem" "[[SDKROOT]]/embedded/usr/include"
+// NO-FW-NOT: "-internal-iframework"
diff --git a/clang/test/Driver/darwin-embedded-search-paths.c b/clang/test/Driver/darwin-embedded-search-paths.c
new file mode 100644
index 0000000..bd651b7
--- /dev/null
+++ b/clang/test/Driver/darwin-embedded-search-paths.c
@@ -0,0 +1,46 @@
+// REQUIRES: !(default-cxx-stdlib=libc++)
+// UNSUPPORTED: system-windows
+//   Windows is unsupported because we use the Unix path separator `/` in the test.
+
+// Unlike the Darwin driver, the MachO driver doesn't add any framework search paths,
+// only the normal header ones.
+// RUN: %clang -x c -target arm64-apple-none-macho -isysroot %S/Inputs/MacOSX15.1.sdk -### -c %s 2>&1 \
+// RUN: | FileCheck --check-prefixes=CC1,NO-CXX,ULI,CI,UI,NO-FW -DSDKROOT=%S/Inputs/MacOSX15.1.sdk %s
+
+// Unlike the Darwin driver, the MachO driver doesn't default to libc++, and unless
+// CLANG_DEFAULT_CXX_STDLIB is libc++ it won't add any search paths.
+// RUN: %clang -x c++ -target arm64-apple-none-macho -isysroot %S/Inputs/MacOSX15.1.sdk -### -c %s 2>&1 \
+// RUN: | FileCheck --check-prefixes=CC1,NO-CXX,ULI,CI,UI,NO-FW -DSDKROOT=%S/Inputs/MacOSX15.1.sdk %s
+
+// However, if the user requests libc++, the MachO driver should find the search path.
+// RUN: %clang -x c++ -stdlib=libc++ -target arm64-apple-none-macho -isysroot %S/Inputs/MacOSX15.1.sdk -### -c %s 2>&1 \
+// RUN: | FileCheck --check-prefixes=CC1,CXX,ULI,CI,UI,NO-FW -DSDKROOT=%S/Inputs/MacOSX15.1.sdk %s
+
+// Verify that embedded uses can swap in alternate usr/include and usr/local/include directories.
+// usr/local/include is specified in the driver as -internal-isystem, however, the driver generated
+// paths come before the paths in the driver arguments. In order to keep usr/local/include in the
+// same position, -isystem has to be used instead of -Xclang -internal-isystem. There isn't an
+// -externc-isystem, but it's ok to use -Xclang -internal-externc-isystem since the driver doesn't
+// use that if -nostdlibinc or -nostdinc is passed.
+// RUN: %clang -x c++ -stdlib=libc++ -target arm64-apple-none-macho -isysroot %S/Inputs/MacOSX15.1.sdk \
+// RUN:        -nostdlibinc -isystem %S/Inputs/MacOSX15.1.sdk/embedded/usr/local/include \
+// RUN:        -Xclang -internal-externc-isystem -Xclang %S/Inputs/MacOSX15.1.sdk/embedded/usr/include \
+// RUN:        -### -c %s 2>&1 | FileCheck --check-prefixes=CC1,NO-CXX,EULI,CI,EUI,NO-FW -DSDKROOT=%S/Inputs/MacOSX15.1.sdk %s
+
+
+// The ordering of these flags doesn't matter, and so this test is a little
+// fragile. i.e. all of the -internal-isystem paths will be searched before the
+// -internal-externc-isystem ones, and their order on the command line doesn't
+// matter. The line order here is just the current order that the driver writes
+// the cc1 arguments.
+
+// CC1: "-cc1"
+// CC1: "-resource-dir" "[[RESOURCE_DIR:[^"]*]]"
+// NO-CXX-NOT: "-internal-isystem" "{{.*}}/include/c++/v1"
+// CXX-SAME: "-internal-isystem" "{{.*}}/include/c++/v1"
+// ULI-SAME: "-internal-isystem" "[[SDKROOT]]/usr/local/include"
+// EULI-SAME: "-isystem" "[[SDKROOT]]/embedded/usr/local/include"
+// CI-SAME: "-internal-isystem" "[[RESOURCE_DIR]]/include"
+// UI-SAME: "-internal-externc-isystem" "[[SDKROOT]]/usr/include"
+// EUI-SAME: "-internal-externc-isystem" "[[SDKROOT]]/embedded/usr/include"
+// NO-FW-NOT: "-internal-iframework"
diff --git a/clang/test/Driver/hip-device-libs.hip b/clang/test/Driver/hip-device-libs.hip
index 6f1d315..317fd79 100644
--- a/clang/test/Driver/hip-device-libs.hip
+++ b/clang/test/Driver/hip-device-libs.hip
@@ -253,5 +253,5 @@
 // NOABI4-NOT: error:
 // NOABI4-NOT: "-mlink-builtin-bitcode" "{{.*}}oclc_abi_version_400.bc"
 // NOABI4-NOT: "-mlink-builtin-bitcode" "{{.*}}oclc_abi_version_500.bc"
-// NOABI5: error: cannot find ROCm device libraryfor ABI version 5; provide its path via '--rocm-path' or '--rocm-device-lib-path', or pass '-nogpulib' to build without ROCm device library
-// NOABI6: error: cannot find ROCm device libraryfor ABI version 6; provide its path via '--rocm-path' or '--rocm-device-lib-path', or pass '-nogpulib' to build without ROCm device library
+// NOABI5: error: cannot find ROCm device library for ABI version 5; provide its path via '--rocm-path' or '--rocm-device-lib-path', or pass '-nogpulib' to build without ROCm device library
+// NOABI6: error: cannot find ROCm device library for ABI version 6; provide its path via '--rocm-path' or '--rocm-device-lib-path', or pass '-nogpulib' to build without ROCm device library
diff --git a/clang/test/Driver/lto.c b/clang/test/Driver/lto.c
index 5be9501..a85f953 100644
--- a/clang/test/Driver/lto.c
+++ b/clang/test/Driver/lto.c
@@ -114,3 +114,9 @@
 //
 // CHECK-GISEL:         "-plugin-opt=-global-isel=1"
 // CHECK-DISABLE-GISEL: "-plugin-opt=-global-isel=0"
+
+// -flto passes -time-passes when -ftime-report is passed
+// RUN: %clang --target=x86_64-unknown-linux-gnu -### %s -flto -ftime-report 2> %t
+// RUN: FileCheck --check-prefix=CHECK-TIME-REPORT < %t %s
+
+// CHECK-TIME-REPORT: "-plugin-opt=-time-passes"
diff --git a/clang/test/Driver/mingw.cpp b/clang/test/Driver/mingw.cpp
index 9790c86..66da0c9 100644
--- a/clang/test/Driver/mingw.cpp
+++ b/clang/test/Driver/mingw.cpp
@@ -85,6 +85,10 @@
 // RUN:   | FileCheck %s --check-prefix CHECK_MINGW_EC_LINK
 // CHECK_MINGW_EC_LINK: "-m" "arm64ecpe"
 
+// RUN: %clang --target=mipsel-windows-gnu -### -o /dev/null %s 2>&1 \
+// RUN:   | FileCheck %s --check-prefix CHECK_MINGW_MIPSPE
+// CHECK_MINGW_MIPSPE: "-m" "mipspe"
+
 // RUN: %clang --target=i686-windows-gnu -fms-hotpatch -### -- %s 2>&1 \
 // RUN:    | FileCheck %s --check-prefix=FUNCTIONPADMIN
 // FUNCTIONPADMIN: "--functionpadmin"
diff --git a/clang/test/Driver/openmp-offload-gpu.c b/clang/test/Driver/openmp-offload-gpu.c
index f6e2245..74bd2a6 100644
--- a/clang/test/Driver/openmp-offload-gpu.c
+++ b/clang/test/Driver/openmp-offload-gpu.c
@@ -90,8 +90,8 @@
 // RUN:   %s 2>&1 | FileCheck -check-prefix=CHK-ENV-BCLIB %s
 
 // CHK-BCLIB: clang{{.*}}-triple{{.*}}nvptx64-nvidia-cuda{{.*}}-mlink-builtin-bitcode{{.*}}libomptarget-nvptx-test.bc
-// CHK-BCLIB-DIR: clang{{.*}}-triple{{.*}}nvptx64-nvidia-cuda{{.*}}-mlink-builtin-bitcode{{.*}}libomptarget{{/|\\\\}}libomptarget-nvptx-sm_52.bc
-// CHK-ENV-BCLIB: clang{{.*}}-triple{{.*}}nvptx64-nvidia-cuda{{.*}}-mlink-builtin-bitcode{{.*}}subdir{{/|\\\\}}libomptarget-nvptx-sm_52.bc
+// CHK-BCLIB-DIR: clang{{.*}}-triple{{.*}}nvptx64-nvidia-cuda{{.*}}-mlink-builtin-bitcode{{.*}}libomptarget{{/|\\\\}}libomptarget-nvptx.bc
+// CHK-ENV-BCLIB: clang{{.*}}-triple{{.*}}nvptx64-nvidia-cuda{{.*}}-mlink-builtin-bitcode{{.*}}subdir{{/|\\\\}}libomptarget-nvptx.bc
 // CHK-BCLIB-NOT: {{error:|warning:}}
 
 /// ###########################################################################
diff --git a/clang/test/Driver/print-supported-extensions-riscv.c b/clang/test/Driver/print-supported-extensions-riscv.c
index 8344c1a..a8d9fcd 100644
--- a/clang/test/Driver/print-supported-extensions-riscv.c
+++ b/clang/test/Driver/print-supported-extensions-riscv.c
@@ -185,10 +185,15 @@
 // CHECK-NEXT:     zalasr               0.1       'Zalasr' (Load-Acquire and Store-Release Instructions)
 // CHECK-NEXT:     zvbc32e              0.7       'Zvbc32e' (Vector Carryless Multiplication with 32-bits elements)
 // CHECK-NEXT:     zvkgs                0.7       'Zvkgs' (Vector-Scalar GCM instructions for Cryptography)
+// CHECK-NEXT:     sdext                1.0       'Sdext' (External debugger)
+// CHECK-NEXT:     sdtrig               1.0       'Sdtrig' (Debugger triggers)
 // CHECK-NEXT:     smctr                1.0       'Smctr' (Control Transfer Records Machine Level)
 // CHECK-NEXT:     ssctr                1.0       'Ssctr' (Control Transfer Records Supervisor Level)
 // CHECK-NEXT:     svukte               0.3       'Svukte' (Address-Independent Latency of User-Mode Faults to Supervisor Addresses)
 // CHECK-NEXT:     xqcia                0.2       'Xqcia' (Qualcomm uC Arithmetic Extension)
+// CHECK-NEXT:     xqciac               0.2       'Xqciac' (Qualcomm uC Load-Store Address Calculation Extension)
+// CHECK-NEXT:     xqcicli              0.2       'Xqcicli' (Qualcomm uC Conditional Load Immediate Extension)
+// CHECK-NEXT:     xqcicm               0.2       'Xqcicm' (Qualcomm uC Conditional Move Extension)
 // CHECK-NEXT:     xqcics               0.2       'Xqcics' (Qualcomm uC Conditional Select Extension)
 // CHECK-NEXT:     xqcicsr              0.2       'Xqcicsr' (Qualcomm uC CSR Extension)
 // CHECK-NEXT:     xqcilsm              0.2       'Xqcilsm' (Qualcomm uC Load Store Multiple Extension)
diff --git a/clang/test/Driver/riscv-cpus.c b/clang/test/Driver/riscv-cpus.c
index 1b099456..e97b694 100644
--- a/clang/test/Driver/riscv-cpus.c
+++ b/clang/test/Driver/riscv-cpus.c
@@ -433,6 +433,19 @@
 // MCPU-SIFIVE-P470-SAME: "-target-feature" "+zvkt"
 // MCPU-SIFIVE-P470-SAME: "-target-abi" "lp64d"
 
+// RUN: %clang -target riscv64 -### -c %s 2>&1 -mcpu=sifive-p550 | FileCheck -check-prefix=MCPU-SIFIVE-P550 %s
+// MCPU-SIFIVE-P550: "-nostdsysteminc" "-target-cpu" "sifive-p550"
+// MCPU-SIFIVE-P550-SAME: "-target-feature" "+m"
+// MCPU-SIFIVE-P550-SAME: "-target-feature" "+a"
+// MCPU-SIFIVE-P550-SAME: "-target-feature" "+f"
+// MCPU-SIFIVE-P550-SAME: "-target-feature" "+d"
+// MCPU-SIFIVE-P550-SAME: "-target-feature" "+c"
+// MCPU-SIFIVE-P550-SAME: "-target-feature" "+zicsr"
+// MCPU-SIFIVE-P550-SAME: "-target-feature" "+zifencei"
+// MCPU-SIFIVE-P550-SAME: "-target-feature" "+zba"
+// MCPU-SIFIVE-P550-SAME: "-target-feature" "+zbb"
+// MCPU-SIFIVE-P550-SAME: "-target-abi" "lp64d"
+
 // RUN: %clang -target riscv64 -### -c %s 2>&1 -mcpu=sifive-p670 | FileCheck -check-prefix=MCPU-SIFIVE-P670 %s
 // MCPU-SIFIVE-P670: "-target-cpu" "sifive-p670"
 // MCPU-SIFIVE-P670-SAME: "-target-feature" "+m"
diff --git a/clang/test/Driver/sanitizer-ld.c b/clang/test/Driver/sanitizer-ld.c
index 8347f9c..7107834 100644
--- a/clang/test/Driver/sanitizer-ld.c
+++ b/clang/test/Driver/sanitizer-ld.c
@@ -1,14 +1,24 @@
 // Test sanitizers ld flags.
 
+// Match all sanitizer related libclang_rt, we are not interested in
+// libclang_rt.builtins, libclang_rt.osx, libclang_rt.ios, libclang_rt.watchos
+// etc.
+//
+// If we need to add sanitizer with name starting with excluded laters, e.g.
+// `bsan`, we can extend expression like this: `([^iow]|b[^u])`.
+//
+// DEFINE: %{filecheck} = FileCheck %s --implicit-check-not="libclang_rt.{{([^biow])}}"
+
 // RUN: %clang -### %s 2>&1 \
 // RUN:     --target=i386-unknown-linux -fuse-ld=ld -fsanitize=address \
 // RUN:     -resource-dir=%S/Inputs/resource_dir \
 // RUN:     --sysroot=%S/Inputs/basic_linux_tree \
-// RUN:   | FileCheck --check-prefix=CHECK-ASAN-LINUX %s
+// RUN:   | %{filecheck} --check-prefix=CHECK-ASAN-LINUX
 //
 // CHECK-ASAN-LINUX: "{{(.*[^-.0-9A-Z_a-z])?}}ld{{(.exe)?}}"
 // CHECK-ASAN-LINUX-NOT: "-lc"
-// CHECK-ASAN-LINUX: libclang_rt.asan.a"
+// CHECK-ASAN-LINUX: "--whole-archive" "{{.*}}libclang_rt.asan_static.a" "--no-whole-archive"
+// CHECK-ASAN-LINUX: "--whole-archive" "{{.*}}libclang_rt.asan.a" "--no-whole-archive"
 // CHECK-ASAN-LINUX-NOT: "--export-dynamic"
 // CHECK-ASAN-LINUX: "--dynamic-list={{.*}}libclang_rt.asan.a.syms"
 // CHECK-ASAN-LINUX-NOT: "--export-dynamic"
@@ -21,62 +31,60 @@
 // RUN:     --target=x86_64-unknown-linux -fuse-ld=ld \
 // RUN:     -resource-dir=%S/Inputs/resource_dir \
 // RUN:     --sysroot=%S/Inputs/basic_linux_tree \
-// RUN:   | FileCheck --check-prefix=CHECK-ASAN-NO-LINK-RUNTIME-LINUX %s
+// RUN:   | %{filecheck} --check-prefix=CHECK-ASAN-NO-LINK-RUNTIME-LINUX
 //
-// CHECK-ASAN-NO-LINK-RUNTIME-LINUX-NOT: libclang_rt.asan_static-x86_64
-// CHECK-ASAN-NO-LINK-RUNTIME-LINUX-NOT: libclang_rt.asan-x86_64
+// CHECK-ASAN-NO-LINK-RUNTIME-LINUX: "{{(.*[^-.0-9A-Z_a-z])?}}ld{{(.exe)?}}"
 
 // RUN: %clang -fsanitize=address -fno-sanitize-link-runtime -### %s 2>&1 \
 // RUN:     --target=arm64e-apple-macosx -fuse-ld=ld \
 // RUN:     -resource-dir=%S/Inputs/resource_dir \
 // RUN:     --sysroot=%S/Inputs/basic_linux_tree \
-// RUN:   | FileCheck --check-prefix=CHECK-ASAN-NO-LINK-RUNTIME-DARWIN %s
+// RUN:   | %{filecheck} --check-prefix=CHECK-ASAN-NO-LINK-RUNTIME-DARWIN
 //
-// CHECK-ASAN-NO-LINK-RUNTIME-DARWIN-NOT: libclang_rt.asan_static
-// CHECK-ASAN-NO-LINK-RUNTIME-DARWIN-NOT: libclang_rt.asan
+// CHECK-ASAN-NO-LINK-RUNTIME-DARWIN: "{{.*}}ld{{(.exe)?}}"
 
 // RUN: %clang -fsanitize=address -### %s 2>&1 \
 // RUN:     --target=x86_64-unknown-linux -fuse-ld=ld \
 // RUN:     -resource-dir=%S/Inputs/resource_dir \
 // RUN:     --sysroot=%S/Inputs/basic_linux_tree \
-// RUN:   | FileCheck --check-prefix=CHECK-ASAN-EXECUTABLE-LINUX %s
+// RUN:   | %{filecheck} --check-prefix=CHECK-ASAN-EXECUTABLE-LINUX
 //
-// CHECK-ASAN-EXECUTABLE-LINUX: libclang_rt.asan_static
-// CHECK-ASAN-EXECUTABLE-LINUX: libclang_rt.asan
+// CHECK-ASAN-EXECUTABLE-LINUX: "--whole-archive" "{{.*}}libclang_rt.asan_static.a" "--no-whole-archive"
+// CHECK-ASAN-EXECUTABLE-LINUX: "--whole-archive" "{{.*}}libclang_rt.asan.a" "--no-whole-archive"
+// CHECK-ASAN-EXECUTABLE-LINUX: "--dynamic-list={{.*}}libclang_rt.asan.a.syms"
 
 // RUN: %clang -fsanitize=address -shared -### %s 2>&1  \
 // RUN:     --target=x86_64-unknown-linux -fuse-ld=ld \
 // RUN:     -resource-dir=%S/Inputs/resource_dir \
 // RUN:     --sysroot=%S/Inputs/basic_linux_tree \
-// RUN:   | FileCheck --check-prefix=CHECK-ASAN-SHARED-LINUX %s
+// RUN:   | %{filecheck} --check-prefix=CHECK-ASAN-SHARED-LINUX
 //
 // CHECK-ASAN-SHARED-LINUX: libclang_rt.asan_static
-// CHECK-ASAN-SHARED-LINUX-NOT: libclang_rt.asan
 
 // RUN: %clang -### %s 2>&1 \
 // RUN:     --target=i386-unknown-linux -fuse-ld=ld -fsanitize=address -shared-libsan \
 // RUN:     -resource-dir=%S/Inputs/resource_dir \
 // RUN:     --sysroot=%S/Inputs/basic_linux_tree \
-// RUN:   | FileCheck --check-prefix=CHECK-SHARED-ASAN-LINUX %s
+// RUN:   | %{filecheck} --check-prefix=CHECK-SHARED-ASAN-LINUX
 
 // RUN: %clang -### %s 2>&1 \
 // RUN:     --target=i386-unknown-linux -fuse-ld=ld -fsanitize=address -shared-libasan \
 // RUN:     -resource-dir=%S/Inputs/resource_dir \
 // RUN:     --sysroot=%S/Inputs/basic_linux_tree \
-// RUN:   | FileCheck --check-prefix=CHECK-SHARED-ASAN-LINUX %s
+// RUN:   | %{filecheck} --check-prefix=CHECK-SHARED-ASAN-LINUX
 
 // RUN: %clang -### %s 2>&1 \
 // RUN:     --target=i386-unknown-linux -fuse-ld=ld -fsanitize=address \
 // RUN:     -shared-libsan -static-libsan -shared-libasan             \
 // RUN:     -resource-dir=%S/Inputs/resource_dir \
 // RUN:     --sysroot=%S/Inputs/basic_linux_tree \
-// RUN:   | FileCheck --check-prefix=CHECK-SHARED-ASAN-LINUX %s
+// RUN:   | %{filecheck} --check-prefix=CHECK-SHARED-ASAN-LINUX
 //
 // CHECK-SHARED-ASAN-LINUX: "{{(.*[^-.0-9A-Z_a-z])?}}ld{{(.exe)?}}"
 // CHECK-SHARED-ASAN-LINUX-NOT: "-lc"
-// CHECK-SHARED-ASAN-LINUX-NOT: libclang_rt.asan.a"
 // CHECK-SHARED-ASAN-LINUX: libclang_rt.asan.so"
 // CHECK-SHARED-ASAN-LINUX: "--whole-archive" "{{.*}}libclang_rt.asan-preinit.a" "--no-whole-archive"
+// CHECK-SHARED-ASAN-LINUX: "--whole-archive" "{{.*}}libclang_rt.asan_static.a" "--no-whole-archive"
 // CHECK-SHARED-ASAN-LINUX-NOT: "-lpthread"
 // CHECK-SHARED-ASAN-LINUX-NOT: "-lrt"
 // CHECK-SHARED-ASAN-LINUX-NOT: "-ldl"
@@ -88,13 +96,12 @@
 // RUN:     --target=i386-unknown-linux -fuse-ld=ld -fsanitize=address -shared-libsan \
 // RUN:     -resource-dir=%S/Inputs/resource_dir \
 // RUN:     --sysroot=%S/Inputs/basic_linux_tree \
-// RUN:   | FileCheck --check-prefix=CHECK-DSO-SHARED-ASAN-LINUX %s
+// RUN:   | %{filecheck} --check-prefix=CHECK-DSO-SHARED-ASAN-LINUX
 //
 // CHECK-DSO-SHARED-ASAN-LINUX: "{{(.*[^-.0-9A-Z_a-z])?}}ld{{(.exe)?}}"
 // CHECK-DSO-SHARED-ASAN-LINUX-NOT: "-lc"
-// CHECK-DSO-SHARED-ASAN-LINUX-NOT: libclang_rt.asan.a"
-// CHECK-DSO-SHARED-ASAN-LINUX-NOT: "libclang_rt.asan-preinit.a"
 // CHECK-DSO-SHARED-ASAN-LINUX: libclang_rt.asan.so"
+// CHECK-DSO-SHARED-ASAN-LINUX: "--whole-archive" "{{.*}}libclang_rt.asan_static.a" "--no-whole-archive"
 // CHECK-DSO-SHARED-ASAN-LINUX-NOT: "-lpthread"
 // CHECK-DSO-SHARED-ASAN-LINUX-NOT: "-lrt"
 // CHECK-DSO-SHARED-ASAN-LINUX-NOT: "-ldl"
@@ -106,13 +113,12 @@
 // RUN:     --target=i386-unknown-freebsd -fuse-ld=ld -fsanitize=address \
 // RUN:     -resource-dir=%S/Inputs/resource_dir \
 // RUN:     --sysroot=%S/Inputs/basic_freebsd_tree \
-// RUN:   | FileCheck --check-prefix=CHECK-ASAN-FREEBSD %s
+// RUN:   | %{filecheck} --check-prefix=CHECK-ASAN-FREEBSD
 //
 // CHECK-ASAN-FREEBSD: "{{(.*[^-.0-9A-Z_a-z])?}}ld{{(.exe)?}}"
 // CHECK-ASAN-FREEBSD-NOT: "-lc"
-// CHECK-ASAN-FREEBSD-NOT: libclang_rt.asan_cxx
+// CHECK-ASAN-FREEBSD: freebsd{{/|\\+}}libclang_rt.asan_static.a"
 // CHECK-ASAN-FREEBSD: freebsd{{/|\\+}}libclang_rt.asan.a"
-// CHECK-ASAN-FREEBSD-NOT: libclang_rt.asan_cxx
 // CHECK-ASAN-FREEBSD-NOT: "--dynamic-list"
 // CHECK-ASAN-FREEBSD: "--export-dynamic"
 // CHECK-ASAN-FREEBSD: "-lpthread"
@@ -122,23 +128,25 @@
 // RUN:     --target=i386-unknown-freebsd -fuse-ld=ld -fsanitize=address \
 // RUN:     -resource-dir=%S/Inputs/resource_dir \
 // RUN:     --sysroot=%S/Inputs/basic_freebsd_tree \
-// RUN:   | FileCheck --check-prefix=CHECK-ASAN-FREEBSD-LDL %s
+// RUN:   | %{filecheck} --check-prefix=CHECK-ASAN-FREEBSD-LDL
 //
 // CHECK-ASAN-FREEBSD-LDL: "{{(.*[^-.0-9A-Z_a-z])?}}ld{{(.exe)?}}"
 // CHECK-ASAN-FREEBSD-LDL-NOT: "-ldl"
+// CHECK-ASAN-FREEBSD-LDL: "--whole-archive" "{{.*}}libclang_rt.asan_static.a" "--no-whole-archive"
+// CHECK-ASAN-FREEBSD-LDL: "--whole-archive" "{{.*}}libclang_rt.asan.a" "--no-whole-archive"
 
 // RUN: %clangxx -### %s 2>&1 \
 // RUN:     --target=i386-unknown-linux -fuse-ld=ld -stdlib=platform -fsanitize=address \
 // RUN:     -resource-dir=%S/Inputs/empty_resource_dir \
 // RUN:     --sysroot=%S/Inputs/basic_linux_tree \
-// RUN:   | FileCheck --check-prefix=CHECK-ASAN-LINUX-CXX %s
+// RUN:   | %{filecheck} --check-prefix=CHECK-ASAN-LINUX-CXX
 
 // RUN: %clangxx -### %s 2>&1 \
 // RUN:     --target=i386-unknown-linux -fuse-ld=ld -stdlib=platform -fsanitize=address \
 // RUN:     -resource-dir=%S/Inputs/empty_resource_dir \
 // RUN:     --sysroot=%S/Inputs/basic_linux_tree \
 // RUN:     -fsanitize-link-c++-runtime \
-// RUN:   | FileCheck --check-prefix=CHECK-ASAN-LINUX-CXX %s
+// RUN:   | %{filecheck} --check-prefix=CHECK-ASAN-LINUX-CXX
 
 // CHECK-ASAN-LINUX-CXX: "{{(.*[^-.0-9A-Z_a-z])?}}ld{{(.exe)?}}"
 // CHECK-ASAN-LINUX-CXX-SAME: "--whole-archive" "{{.*}}libclang_rt.asan.a" "--no-whole-archive"
@@ -157,11 +165,10 @@
 // RUN:     -resource-dir=%S/Inputs/empty_resource_dir \
 // RUN:     --sysroot=%S/Inputs/basic_linux_tree \
 // RUN:     -fno-sanitize-link-c++-runtime \
-// RUN:   | FileCheck --check-prefix=CHECK-ASAN-LINUX-CNOCXX %s
+// RUN:   | %{filecheck} --check-prefix=CHECK-ASAN-LINUX-CNOCXX
 
 // CHECK-ASAN-LINUX-CNOCXX: "{{(.*[^-.0-9A-Z_a-z])?}}ld{{(.exe)?}}"
 // CHECK-ASAN-LINUX-CNOCXX-SAME: "--whole-archive" "{{.*}}libclang_rt.asan.a" "--no-whole-archive"
-// CHECK-ASAN-LINUX-CNOCXX-NOT: libclang_rt.asan_cxx
 // CHECK-ASAN-LINUX-CNOCXX-SAME: "--export-dynamic"
 // CHECK-ASAN-LINUX-CNOCXX-NOT: stdc++
 // CHECK-ASAN-LINUX-CNOCXX-SAME: "-lpthread"
@@ -175,11 +182,10 @@
 // RUN:     -resource-dir=%S/Inputs/empty_resource_dir \
 // RUN:     --sysroot=%S/Inputs/basic_linux_tree \
 // RUN:     -fno-sanitize-link-c++-runtime \
-// RUN:   | FileCheck --check-prefix=CHECK-ASAN-LINUX-NOCXX %s
+// RUN:   | %{filecheck} --check-prefix=CHECK-ASAN-LINUX-NOCXX
 
 // CHECK-ASAN-LINUX-NOCXX: "{{(.*[^-.0-9A-Z_a-z])?}}ld{{(.exe)?}}"
 // CHECK-ASAN-LINUX-NOCXX-SAME: "--whole-archive" "{{.*}}libclang_rt.asan.a" "--no-whole-archive"
-// CHECK-ASAN-LINUX-NOCXX-NOT: libclang_rt.asan_cxx
 // CHECK-ASAN-LINUX-NOCXX-SAME: "--export-dynamic"
 // CHECK-ASAN-LINUX-NOCXX-SAME: "-lstdc++"
 // CHECK-ASAN-LINUX-NOCXX-SAME: "-lpthread"
@@ -193,11 +199,11 @@
 // RUN:     -resource-dir=%S/Inputs/empty_resource_dir \
 // RUN:     --sysroot=%S/Inputs/basic_linux_tree \
 // RUN:     -nostdlib++ \
-// RUN:   | FileCheck --check-prefix=CHECK-ASAN-LINUX-NOSTDCXX %s
+// RUN:   | %{filecheck} --check-prefix=CHECK-ASAN-LINUX-NOSTDCXX
 
 // CHECK-ASAN-LINUX-NOSTDCXX: "{{(.*[^-.0-9A-Z_a-z])?}}ld{{(.exe)?}}"
 // CHECK-ASAN-LINUX-NOSTDCXX-SAME: "--whole-archive" "{{.*}}libclang_rt.asan.a" "--no-whole-archive"
-// CHECK-ASAN-LINUX-NOSTDCXX-SAME: libclang_rt.asan_cxx
+// CHECK-ASAN-LINUX-NOSTDCXX-SAME: "--whole-archive" "{{.*}}libclang_rt.asan_cxx.a" "--no-whole-archive"
 // CHECK-ASAN-LINUX-NOSTDCXX-SAME: "--export-dynamic"
 // CHECK-ASAN-LINUX-NOSTDCXX-SAME: "-lpthread"
 // CHECK-ASAN-LINUX-NOSTDCXX-SAME: "-lrt"
@@ -209,36 +215,39 @@
 // RUN:     --target=i386-unknown-linux -fuse-ld=ld -stdlib=platform \
 // RUN:     -resource-dir=%S/Inputs/resource_dir \
 // RUN:     --sysroot=%S/Inputs/basic_linux_tree -lstdc++ -static 2>&1 \
-// RUN:   | FileCheck --check-prefix=CHECK-ASAN-LINUX-CXX-STATIC %s
+// RUN:   | %{filecheck} --check-prefix=CHECK-ASAN-LINUX-CXX-STATIC
 //
 // CHECK-ASAN-LINUX-CXX-STATIC: "{{(.*[^-.0-9A-Z_a-z])?}}ld{{(.exe)?}}"
 // CHECK-ASAN-LINUX-CXX-STATIC-NOT: stdc++
 // CHECK-ASAN-LINUX-CXX-STATIC: "--whole-archive" "{{.*}}libclang_rt.asan.a" "--no-whole-archive"
+// CHECK-ASAN-LINUX-CXX-STATIC: "--dynamic-list={{.*}}libclang_rt.asan.a.syms"
 // CHECK-ASAN-LINUX-CXX-STATIC: stdc++
 
 // RUN: %clang -### %s 2>&1 \
 // RUN:     --target=arm-linux-gnueabi -fuse-ld=ld -fsanitize=address \
 // RUN:     --sysroot=%S/Inputs/basic_android_tree/sysroot \
-// RUN:   | FileCheck --check-prefix=CHECK-ASAN-ARM %s
+// RUN:   | %{filecheck} --check-prefix=CHECK-ASAN-ARM
 //
 // CHECK-ASAN-ARM: "{{(.*[^.0-9A-Z_a-z])?}}ld{{(.exe)?}}"
 // CHECK-ASAN-ARM-NOT: "-lc"
+// CHECK-ASAN-ARM: libclang_rt.asan_static.a"
 // CHECK-ASAN-ARM: libclang_rt.asan.a"
 //
 // RUN: %clang -### %s 2>&1 \
 // RUN:     --target=armv7l-linux-gnueabi -fuse-ld=ld -fsanitize=address \
 // RUN:     --sysroot=%S/Inputs/basic_android_tree/sysroot \
-// RUN:   | FileCheck --check-prefix=CHECK-ASAN-ARMv7 %s
+// RUN:   | %{filecheck} --check-prefix=CHECK-ASAN-ARMv7
 //
 // CHECK-ASAN-ARMv7: "{{(.*[^.0-9A-Z_a-z])?}}ld{{(.exe)?}}"
 // CHECK-ASAN-ARMv7-NOT: "-lc"
+// CHECK-ASAN-ARMv7: libclang_rt.asan_static.a"
 // CHECK-ASAN-ARMv7: libclang_rt.asan.a"
 
 // RUN: %clang -### %s 2>&1 \
 // RUN:     --target=arm-linux-androideabi -fuse-ld=ld -fsanitize=address \
 // RUN:     --sysroot=%S/Inputs/basic_android_tree/sysroot \
 // RUN:     -resource-dir=%S/Inputs/resource_dir \
-// RUN:   | FileCheck --check-prefix=CHECK-ASAN-ANDROID %s
+// RUN:   | %{filecheck} --check-prefix=CHECK-ASAN-ANDROID
 //
 // CHECK-ASAN-ANDROID: "{{(.*[^.0-9A-Z_a-z])?}}ld.lld{{(.exe)?}}"
 // CHECK-ASAN-ANDROID: "-pie"
@@ -246,6 +255,7 @@
 // CHECK-ASAN-ANDROID-NOT: "-lpthread"
 // CHECK-ASAN-ANDROID-NOT: "-lresolv"
 // CHECK-ASAN-ANDROID: libclang_rt.asan.so"
+// CHECK-ASAN-ANDROID: libclang_rt.asan_static.a"
 // CHECK-ASAN-ANDROID-NOT: "-lpthread"
 // CHECK-ASAN-ANDROID-NOT: "-lresolv"
 
@@ -254,16 +264,17 @@
 // RUN:     --sysroot=%S/Inputs/basic_android_tree/sysroot \
 // RUN:     -resource-dir=%S/Inputs/resource_dir \
 // RUN:     -static-libsan \
-// RUN:   | FileCheck --check-prefix=CHECK-ASAN-ANDROID-STATICLIBASAN %s
+// RUN:   | %{filecheck} --check-prefix=CHECK-ASAN-ANDROID-STATICLIBASAN
 //
 // RUN: %clang -### %s 2>&1 \
 // RUN:     --target=arm-linux-androideabi -fuse-ld=ld -fsanitize=address \
 // RUN:     --sysroot=%S/Inputs/basic_android_tree/sysroot \
 // RUN:     -resource-dir=%S/Inputs/resource_dir \
 // RUN:     -static-libasan \
-// RUN:   | FileCheck --check-prefix=CHECK-ASAN-ANDROID-STATICLIBASAN %s
+// RUN:   | %{filecheck} --check-prefix=CHECK-ASAN-ANDROID-STATICLIBASAN
 //
 // CHECK-ASAN-ANDROID-STATICLIBASAN: "{{(.*[^.0-9A-Z_a-z])?}}ld.lld{{(.exe)?}}"
+// CHECK-ASAN-ANDROID-STATICLIBASAN: libclang_rt.asan_static.a"
 // CHECK-ASAN-ANDROID-STATICLIBASAN: libclang_rt.asan.a"
 // CHECK-ASAN-ANDROID-STATICLIBASAN-NOT: "-lpthread"
 // CHECK-ASAN-ANDROID-STATICLIBASAN-NOT: "-lrt"
@@ -273,7 +284,7 @@
 // RUN:     --target=arm-linux-androideabi -fuse-ld=ld -fsanitize=undefined \
 // RUN:     --sysroot=%S/Inputs/basic_android_tree/sysroot \
 // RUN:     -resource-dir=%S/Inputs/resource_dir \
-// RUN:   | FileCheck --check-prefix=CHECK-UBSAN-ANDROID %s
+// RUN:   | %{filecheck} --check-prefix=CHECK-UBSAN-ANDROID
 //
 // CHECK-UBSAN-ANDROID: "{{(.*[^.0-9A-Z_a-z])?}}ld.lld{{(.exe)?}}"
 // CHECK-UBSAN-ANDROID: "-pie"
@@ -289,7 +300,7 @@
 // RUN:     --sysroot=%S/Inputs/basic_android_tree/sysroot \
 // RUN:     -resource-dir=%S/Inputs/resource_dir \
 // RUN:     -static-libsan \
-// RUN:   | FileCheck --check-prefix=CHECK-UBSAN-ANDROID-STATICLIBASAN %s
+// RUN:   | %{filecheck} --check-prefix=CHECK-UBSAN-ANDROID-STATICLIBASAN
 //
 // CHECK-UBSAN-ANDROID-STATICLIBASAN: "{{(.*[^.0-9A-Z_a-z])?}}ld.lld{{(.exe)?}}"
 // CHECK-UBSAN-ANDROID-STATICLIBASAN: libclang_rt.ubsan_standalone.a"
@@ -302,7 +313,7 @@
 // RUN:     --target=i686-linux-android -fuse-ld=ld -fsanitize=address \
 // RUN:     --sysroot=%S/Inputs/basic_android_tree/sysroot \
 // RUN:     -resource-dir=%S/Inputs/resource_dir \
-// RUN:   | FileCheck --check-prefix=CHECK-ASAN-ANDROID-X86 %s
+// RUN:   | %{filecheck} --check-prefix=CHECK-ASAN-ANDROID-X86
 //
 // CHECK-ASAN-ANDROID-X86: "{{(.*[^.0-9A-Z_a-z])?}}ld.lld{{(.exe)?}}"
 // CHECK-ASAN-ANDROID-X86: "-pie"
@@ -310,6 +321,7 @@
 // CHECK-ASAN-ANDROID-X86-NOT: "-lpthread"
 // CHECK-ASAN-ANDROID-X86-NOT: "-lresolv"
 // CHECK-ASAN-ANDROID-X86: libclang_rt.asan.so"
+// CHECK-ASAN-ANDROID-X86: libclang_rt.asan_static.a"
 // CHECK-ASAN-ANDROID-X86-NOT: "-lpthread"
 // CHECK-ASAN-ANDROID-X86-NOT: "-lresolv"
 //
@@ -317,20 +329,23 @@
 // RUN:     --target=arm-linux-androideabi -fsanitize=address \
 // RUN:     --sysroot=%S/Inputs/basic_android_tree/sysroot \
 // RUN:     -shared-libsan \
-// RUN:   | FileCheck --check-prefix=CHECK-ASAN-ANDROID-SHARED-LIBASAN %s
+// RUN:   | %{filecheck} --check-prefix=CHECK-ASAN-ANDROID-SHARED-LIBASAN
 //
 // CHECK-ASAN-ANDROID-SHARED-LIBASAN-NOT: argument unused during compilation: '-shared-libsan'
+// CHECK-ASAN-ANDROID-SHARED-LIBASAN: libclang_rt.asan{{.*}}.so"
+// CHECK-ASAN-ANDROID-SHARED-LIBASAN: libclang_rt.asan_static{{.*}}.a"
 //
 // RUN: %clang -### %s 2>&1 \
 // RUN:     --target=arm-linux-androideabi -fuse-ld=ld -fsanitize=address \
 // RUN:     --sysroot=%S/Inputs/basic_android_tree/sysroot \
 // RUN:     -resource-dir=%S/Inputs/resource_dir \
 // RUN:     -shared \
-// RUN:   | FileCheck --check-prefix=CHECK-ASAN-ANDROID-SHARED %s
+// RUN:   | %{filecheck} --check-prefix=CHECK-ASAN-ANDROID-SHARED
 //
 // CHECK-ASAN-ANDROID-SHARED: "{{(.*[^.0-9A-Z_a-z])?}}ld.lld{{(.exe)?}}"
 // CHECK-ASAN-ANDROID-SHARED-NOT: "-lc"
 // CHECK-ASAN-ANDROID-SHARED: libclang_rt.asan.so"
+// CHECK-ASAN-ANDROID-SHARED: libclang_rt.asan_static.a"
 // CHECK-ASAN-ANDROID-SHARED-NOT: "-lpthread"
 // CHECK-ASAN-ANDROID-SHARED-NOT: "-lresolv"
 
@@ -340,7 +355,7 @@
 // RUN:     -fsanitize=type \
 // RUN:     -resource-dir=%S/Inputs/resource_dir \
 // RUN:     --sysroot=%S/Inputs/basic_linux_tree \
-// RUN:   | FileCheck --check-prefix=CHECK-TYSAN-LINUX-CXX %s
+// RUN:   | %{filecheck} --check-prefix=CHECK-TYSAN-LINUX-CXX
 //
 // CHECK-TYSAN-LINUX-CXX: "{{(.*[^-.0-9A-Z_a-z])?}}ld{{(.exe)?}}"
 // CHECK-TYSAN-LINUX-CXX-NOT: stdc++
@@ -352,7 +367,7 @@
 // RUN:     --target=x86_64-apple-darwin13.4.0 -fuse-ld=ld -stdlib=platform \
 // RUN:     -resource-dir=%S/Inputs/resource_dir \
 // RUN:     --sysroot=%S/Inputs/basic_linux_tree \
-// RUN:   | FileCheck --check-prefix=CHECK-TYSAN-DARWIN-CXX %s
+// RUN:   | %{filecheck} --check-prefix=CHECK-TYSAN-DARWIN-CXX
 // CHECK-TYSAN-DARWIN-CXX: "{{.*}}ld{{(.exe)?}}"
 // CHECK-TYSAN-DARWIN-CXX: libclang_rt.tysan_osx_dynamic.dylib
 // CHECK-TYSAN-DARWIN-CXX-NOT: -lc++abi
@@ -362,7 +377,7 @@
 // RUN:     -fsanitize=thread \
 // RUN:     -resource-dir=%S/Inputs/resource_dir \
 // RUN:     --sysroot=%S/Inputs/basic_linux_tree \
-// RUN:   | FileCheck --check-prefix=CHECK-TSAN-LINUX-CXX %s
+// RUN:   | %{filecheck} --check-prefix=CHECK-TSAN-LINUX-CXX
 //
 // CHECK-TSAN-LINUX-CXX: "{{(.*[^-.0-9A-Z_a-z])?}}ld{{(.exe)?}}"
 // CHECK-TSAN-LINUX-CXX-NOT: stdc++
@@ -381,24 +396,24 @@
 // RUN:     --target=x86_64-unknown-linux -fuse-ld=ld \
 // RUN:     -resource-dir=%S/Inputs/resource_dir \
 // RUN:     --sysroot=%S/Inputs/basic_linux_tree \
-// RUN:   | FileCheck --check-prefix=CHECK-TSAN-NO-LINK-RUNTIME-LINUX %s
+// RUN:   | %{filecheck} --check-prefix=CHECK-TSAN-NO-LINK-RUNTIME-LINUX
 //
-// CHECK-TSAN-NO-LINK-RUNTIME-LINUX-NOT: libclang_rt.tsan
+// CHECK-TSAN-NO-LINK-RUNTIME-LINUX: "{{(.*[^-.0-9A-Z_a-z])?}}ld{{(.exe)?}}"
 
 // RUN: not %clang -fsanitize=thread -fno-sanitize-link-runtime -### %s 2>&1 \
 // RUN:     --target=arm64e-apple-ios -fuse-ld=ld \
 // RUN:     -resource-dir=%S/Inputs/resource_dir \
 // RUN:     --sysroot=%S/Inputs/basic_linux_tree \
-// RUN:   | FileCheck --check-prefix=CHECK-TSAN-NO-LINK-RUNTIME-DARWIN %s
+// RUN:   | %{filecheck} --check-prefix=CHECK-TSAN-NO-LINK-RUNTIME-DARWIN
 //
-// CHECK-TSAN-NO-LINK-RUNTIME-DARWIN-NOT: libclang_rt.tsan
+// CHECK-TSAN-NO-LINK-RUNTIME-DARWIN: "{{(.*[^-.0-9A-Z_a-z])?}}ld{{(.exe)?}}"
 
 // RUN: %clangxx -### %s 2>&1 \
 // RUN:     --target=x86_64-unknown-linux -fuse-ld=ld -stdlib=platform -lstdc++ \
 // RUN:     -fsanitize=memory \
 // RUN:     -resource-dir=%S/Inputs/resource_dir \
 // RUN:     --sysroot=%S/Inputs/basic_linux_tree \
-// RUN:   | FileCheck --check-prefix=CHECK-MSAN-LINUX-CXX %s
+// RUN:   | %{filecheck} --check-prefix=CHECK-MSAN-LINUX-CXX
 //
 // CHECK-MSAN-LINUX-CXX: "{{(.*[^-.0-9A-Z_a-z])?}}ld{{(.exe)?}}"
 // CHECK-MSAN-LINUX-CXX-NOT: stdc++
@@ -417,35 +432,31 @@
 // RUN:     --target=x86_64-unknown-linux -fuse-ld=ld \
 // RUN:     -resource-dir=%S/Inputs/resource_dir \
 // RUN:     --sysroot=%S/Inputs/basic_linux_tree \
-// RUN:   | FileCheck --check-prefix=CHECK-MSAN-NO-LINK-RUNTIME-LINUX %s
+// RUN:   | %{filecheck} --check-prefix=CHECK-MSAN-NO-LINK-RUNTIME-LINUX
 //
-// CHECK-MSAN-NO-LINK-RUNTIME-LINUX-NOT: libclang_rt.msan
+// CHECK-MSAN-NO-LINK-RUNTIME-LINUX: "{{.*}}ld{{(.exe)?}}"
 
 // RUN: %clang -fsanitize=undefined -### %s 2>&1 \
 // RUN:     --target=x86_64-unknown-linux-gnux32 -fuse-ld=ld \
 // RUN:     -resource-dir=%S/Inputs/resource_dir \
 // RUN:     --sysroot=%S/Inputs/multilib_64bit_linux_tree \
-// RUN:   | FileCheck --check-prefix=CHECK-UBSAN-LINUX %s
+// RUN:   | %{filecheck} --check-prefix=CHECK-UBSAN-LINUX
 
 // RUN: %clang -fsanitize=float-divide-by-zero -### %s 2>&1 \
 // RUN:     --target=x86_64-unknown-linux-gnux32 -fuse-ld=ld \
 // RUN:     -resource-dir=%S/Inputs/resource_dir \
 // RUN:     --sysroot=%S/Inputs/multilib_64bit_linux_tree \
-// RUN:   | FileCheck --check-prefix=CHECK-UBSAN-LINUX %s
+// RUN:   | %{filecheck} --check-prefix=CHECK-UBSAN-LINUX
 
 // RUN: %clang -fsanitize=undefined -### %s 2>&1 \
 // RUN:     --target=x86_64-unknown-linux-gnux32 -fuse-ld=ld \
 // RUN:     -resource-dir=%S/Inputs/resource_dir \
 // RUN:     --sysroot=%S/Inputs/multilib_64bit_linux_tree \
 // RUN:     -static-libsan \
-// RUN:   | FileCheck --check-prefix=CHECK-UBSAN-LINUX %s
+// RUN:   | %{filecheck} --check-prefix=CHECK-UBSAN-LINUX
 
 // CHECK-UBSAN-LINUX: "{{.*}}ld{{(.exe)?}}"
-// CHECK-UBSAN-LINUX-NOT: libclang_rt.asan
-// CHECK-UBSAN-LINUX-NOT: libclang_rt.ubsan_standalone_cxx
 // CHECK-UBSAN-LINUX: "--whole-archive" "{{.*}}libclang_rt.ubsan_standalone.a" "--no-whole-archive"
-// CHECK-UBSAN-LINUX-NOT: libclang_rt.asan
-// CHECK-UBSAN-LINUX-NOT: libclang_rt.ubsan_standalone_cxx
 // CHECK-UBSAN-LINUX-NOT: "-lstdc++"
 // CHECK-UBSAN-LINUX: "-lpthread"
 // CHECK-UBSAN-LINUX: "-lresolv"
@@ -454,46 +465,46 @@
 // RUN:     --target=x86_64-unknown-linux -fuse-ld=ld \
 // RUN:     -resource-dir=%S/Inputs/resource_dir \
 // RUN:     --sysroot=%S/Inputs/basic_linux_tree \
-// RUN:   | FileCheck --check-prefix=CHECK-UBSAN-NO-LINK-RUNTIME-LINUX %s
+// RUN:   | %{filecheck} --check-prefix=CHECK-UBSAN-NO-LINK-RUNTIME-LINUX
 //
-// CHECK-UBSAN-NO-LINK-RUNTIME-LINUX-NOT: libclang_rt.undefined
+// CHECK-UBSAN-NO-LINK-RUNTIME-LINUX: "{{.*}}ld{{(.exe)?}}"
 
 // RUN: %clang -fsanitize=undefined -fno-sanitize-link-runtime -### %s 2>&1 \
 // RUN:     --target=x86_64-apple-darwin -fuse-ld=ld \
 // RUN:     -resource-dir=%S/Inputs/resource_dir \
 // RUN:     --sysroot=%S/Inputs/basic_linux_tree \
-// RUN:   | FileCheck --check-prefix=CHECK-UBSAN-NO-LINK-RUNTIME-DARWIN %s
+// RUN:   | %{filecheck} --check-prefix=CHECK-UBSAN-NO-LINK-RUNTIME-DARWIN
 //
-// CHECK-UBSAN-NO-LINK-RUNTIME-DARWIN-NOT: libclang_rt.ubsan
+// CHECK-UBSAN-NO-LINK-RUNTIME-DARWIN: "{{.*}}ld{{(.exe)?}}"
 
 // RUN: %clang -fsanitize=fuzzer -fno-sanitize-link-runtime -### %s 2>&1 \
 // RUN:     --target=arm64e-apple-watchos -fuse-ld=ld \
 // RUN:     -resource-dir=%S/Inputs/resource_dir \
 // RUN:     --sysroot=%S/Inputs/basic_linux_tree \
-// RUN:   | FileCheck --check-prefix=CHECK-FUZZER-NO-LINK-RUNTIME-DARWIN %s
+// RUN:   | %{filecheck} --check-prefix=CHECK-FUZZER-NO-LINK-RUNTIME-DARWIN
 //
-// CHECK-FUZZER-NO-LINK-RUNTIME-DARWIN-NOT: libclang_rt.fuzzer
+// CHECK-FUZZER-NO-LINK-RUNTIME-DARWIN: "{{.*}}ld{{(.exe)?}}"
 
 // RUN: %clang -fsanitize=undefined -### %s 2>&1 \
 // RUN:     --target=i386-unknown-linux -fuse-ld=ld \
 // RUN:     -resource-dir=%S/Inputs/resource_dir \
 // RUN:     --sysroot=%S/Inputs/basic_linux_tree \
 // RUN:     -shared-libsan \
-// RUN:   | FileCheck --check-prefix=CHECK-UBSAN-LINUX-SHAREDLIBASAN %s
+// RUN:   | %{filecheck} --check-prefix=CHECK-UBSAN-LINUX-SHAREDLIBASAN
 
 // RUN: %clang -fsanitize=undefined -### %s 2>&1 \
 // RUN:     --target=i386-unknown-linux -fuse-ld=ld \
 // RUN:     -resource-dir=%S/Inputs/resource_dir \
 // RUN:     --sysroot=%S/Inputs/basic_linux_tree \
 // RUN:     -static-libsan -shared-libsan \
-// RUN:   | FileCheck --check-prefix=CHECK-UBSAN-LINUX-SHAREDLIBASAN %s
+// RUN:   | %{filecheck} --check-prefix=CHECK-UBSAN-LINUX-SHAREDLIBASAN
 
 // RUN: %clang -fsanitize=undefined -### %s 2>&1 \
 // RUN:     --target=i386-unknown-linux -fuse-ld=ld \
 // RUN:     -resource-dir=%S/Inputs/resource_dir \
 // RUN:     --sysroot=%S/Inputs/basic_linux_tree \
 // RUN:     -shared -shared-libsan \
-// RUN:   | FileCheck --check-prefix=CHECK-UBSAN-LINUX-SHAREDLIBASAN %s
+// RUN:   | %{filecheck} --check-prefix=CHECK-UBSAN-LINUX-SHAREDLIBASAN
 
 // CHECK-UBSAN-LINUX-SHAREDLIBASAN: "{{.*}}ld{{(.exe)?}}"
 // CHECK-UBSAN-LINUX-SHAREDLIBASAN: "{{.*}}libclang_rt.ubsan_standalone.so{{.*}}"
@@ -502,7 +513,7 @@
 // RUN:     --target=i386-unknown-linux \
 // RUN:     -resource-dir=%S/Inputs/resource_dir \
 // RUN:     --sysroot=%S/Inputs/basic_linux_tree \
-// RUN:   | FileCheck --check-prefix=CHECK-UBSAN-LINUX-LINK-CXX %s
+// RUN:   | %{filecheck} --check-prefix=CHECK-UBSAN-LINUX-LINK-CXX
 // CHECK-UBSAN-LINUX-LINK-CXX-NOT: "-lstdc++"
 // CHECK-UBSAN-LINUX-LINK-CXX: "--whole-archive" "{{.*}}libclang_rt.ubsan_standalone_cxx.a" "--no-whole-archive"
 // CHECK-UBSAN-LINUX-LINK-CXX-NOT: "-lstdc++"
@@ -511,15 +522,11 @@
 // RUN:     --target=i386-unknown-linux -fuse-ld=ld -stdlib=platform \
 // RUN:     -resource-dir=%S/Inputs/resource_dir \
 // RUN:     --sysroot=%S/Inputs/basic_linux_tree \
-// RUN:   | FileCheck --check-prefix=CHECK-UBSAN-LINUX-CXX %s
+// RUN:   | %{filecheck} --check-prefix=CHECK-UBSAN-LINUX-CXX
 // CHECK-UBSAN-LINUX-CXX: "{{.*}}ld{{(.exe)?}}"
-// CHECK-UBSAN-LINUX-CXX-NOT: libclang_rt.asan
 // CHECK-UBSAN-LINUX-CXX: "--whole-archive" "{{.*}}libclang_rt.ubsan_standalone.a" "--no-whole-archive"
-// CHECK-UBSAN-LINUX-CXX-NOT: libclang_rt.asan
 // CHECK-UBSAN-LINUX-CXX: "--whole-archive" "{{.*}}libclang_rt.ubsan_standalone_cxx.a" "--no-whole-archive"
-// CHECK-UBSAN-LINUX-CXX-NOT: libclang_rt.asan
 // CHECK-UBSAN-LINUX-CXX: "-lstdc++"
-// CHECK-UBSAN-LINUX-CXX-NOT: libclang_rt.asan
 // CHECK-UBSAN-LINUX-CXX: "-lpthread"
 // CHECK-UBSAN-LINUX-CXX: "-lresolv"
 
@@ -527,7 +534,7 @@
 // RUN:     --target=i386-unknown-linux -fuse-ld=ld \
 // RUN:     -resource-dir=%S/Inputs/resource_dir \
 // RUN:     --sysroot=%S/Inputs/basic_linux_tree \
-// RUN:   | FileCheck --check-prefix=CHECK-UBSAN-MINIMAL-LINUX %s
+// RUN:   | %{filecheck} --check-prefix=CHECK-UBSAN-MINIMAL-LINUX
 // CHECK-UBSAN-MINIMAL-LINUX: "{{.*}}ld{{(.exe)?}}"
 // CHECK-UBSAN-MINIMAL-LINUX: "--whole-archive" "{{.*}}libclang_rt.ubsan_minimal.a" "--no-whole-archive"
 // CHECK-UBSAN-MINIMAL-LINUX: "-lpthread"
@@ -536,36 +543,37 @@
 // RUN: %clang -fsanitize=undefined -fsanitize-minimal-runtime -### %s 2>&1 \
 // RUN:     --target=x86_64-apple-darwin -fuse-ld=ld \
 // RUN:     --sysroot=%S/Inputs/basic_linux_tree \
-// RUN:   | FileCheck --check-prefix=CHECK-UBSAN-MINIMAL-DARWIN %s
+// RUN:   | %{filecheck} --check-prefix=CHECK-UBSAN-MINIMAL-DARWIN
 // CHECK-UBSAN-MINIMAL-DARWIN: "{{.*}}ld{{(.exe)?}}"
 // CHECK-UBSAN-MINIMAL-DARWIN: "{{.*}}libclang_rt.ubsan_minimal_osx_dynamic.dylib"
 
 // RUN: not %clang -fsanitize=undefined -### %s 2>&1 \
 // RUN:     --target=x86_64-apple-darwin -fuse-ld=ld -static-libsan \
 // RUN:     --sysroot=%S/Inputs/basic_linux_tree \
-// RUN:   | FileCheck --check-prefix=CHECK-UBSAN-STATIC-DARWIN %s
+// RUN:   | %{filecheck} --check-prefix=CHECK-UBSAN-STATIC-DARWIN
 // CHECK-UBSAN-STATIC-DARWIN: {{.*}}error: static UndefinedBehaviorSanitizer runtime is not supported on darwin
 
 // RUN: not %clang -fsanitize=address -### %s 2>&1 \
 // RUN:     --target=x86_64-apple-darwin -fuse-ld=ld -static-libsan \
 // RUN:     --sysroot=%S/Inputs/basic_linux_tree \
-// RUN:   | FileCheck --check-prefix=CHECK-ASAN-STATIC-DARWIN %s
+// RUN:   | %{filecheck} --check-prefix=CHECK-ASAN-STATIC-DARWIN
 // CHECK-ASAN-STATIC-DARWIN: {{.*}}error: static AddressSanitizer runtime is not supported on darwin
 
 // RUN: not %clang -fsanitize=thread -### %s 2>&1 \
 // RUN:     --target=x86_64-apple-darwin -fuse-ld=ld -static-libsan \
 // RUN:     --sysroot=%S/Inputs/basic_linux_tree \
-// RUN:   | FileCheck --check-prefix=CHECK-TSAN-STATIC-DARWIN %s
+// RUN:   | %{filecheck} --check-prefix=CHECK-TSAN-STATIC-DARWIN
 // CHECK-TSAN-STATIC-DARWIN: {{.*}}error: static ThreadSanitizer runtime is not supported on darwin
 
 // RUN: %clang -fsanitize=address,undefined -### %s 2>&1 \
 // RUN:     --target=i386-unknown-linux -fuse-ld=ld \
 // RUN:     -resource-dir=%S/Inputs/resource_dir \
 // RUN:     --sysroot=%S/Inputs/basic_linux_tree \
-// RUN:   | FileCheck --check-prefix=CHECK-ASAN-UBSAN-LINUX %s
+// RUN:   | %{filecheck} --check-prefix=CHECK-ASAN-UBSAN-LINUX
 // CHECK-ASAN-UBSAN-LINUX: "{{.*}}ld{{(.exe)?}}"
+// CHECK-ASAN-UBSAN-LINUX: "--whole-archive" "{{.*}}libclang_rt.asan_static.a" "--no-whole-archive"
 // CHECK-ASAN-UBSAN-LINUX: "--whole-archive" "{{.*}}libclang_rt.asan.a" "--no-whole-archive"
-// CHECK-ASAN-UBSAN-LINUX-NOT: libclang_rt.ubsan
+// CHECK-ASAN-UBSAN-LINUX: "--dynamic-list={{.*}}libclang_rt.asan.a.syms"
 // CHECK-ASAN-UBSAN-LINUX-NOT: "-lstdc++"
 // CHECK-ASAN-UBSAN-LINUX: "-lpthread"
 // CHECK-ASAN-UBSAN-LINUX: "-lresolv"
@@ -574,53 +582,73 @@
 // RUN:     --target=i386-unknown-linux -fuse-ld=ld -stdlib=platform \
 // RUN:     -resource-dir=%S/Inputs/resource_dir \
 // RUN:     --sysroot=%S/Inputs/basic_linux_tree \
-// RUN:   | FileCheck --check-prefix=CHECK-ASAN-UBSAN-LINUX-CXX %s
+// RUN:   | %{filecheck} --check-prefix=CHECK-ASAN-UBSAN-LINUX-CXX
 // CHECK-ASAN-UBSAN-LINUX-CXX: "{{.*}}ld{{(.exe)?}}"
+// CHECK-ASAN-UBSAN-LINUX-CXX: "--whole-archive" "{{.*}}libclang_rt.asan_static.a" "--no-whole-archive"
 // CHECK-ASAN-UBSAN-LINUX-CXX: "--whole-archive" "{{.*}}libclang_rt.asan.a" "--no-whole-archive"
+// CHECK-ASAN-UBSAN-LINUX-CXX: "--dynamic-list={{.*}}libclang_rt.asan.a.syms"
 // CHECK-ASAN-UBSAN-LINUX-CXX: "--whole-archive" "{{.*}}libclang_rt.asan_cxx.a" "--no-whole-archive"
-// CHECK-ASAN-UBSAN-LINUX-CXX-NOT: libclang_rt.ubsan
+// CHECK-ASAN-UBSAN-LINUX-CXX: "--whole-archive" "{{.*}}libclang_rt.ubsan_standalone_cxx.a" "--no-whole-archive"
 // CHECK-ASAN-UBSAN-LINUX-CXX: "-lstdc++"
 // CHECK-ASAN-UBSAN-LINUX-CXX: "-lpthread"
 // CHECK-ASAN-UBSAN-LINUX-CXX: "-lresolv"
 
+// RUN: %clangxx -fsanitize=address,undefined -fno-sanitize=vptr -### %s 2>&1 \
+// RUN:     --target=i386-unknown-linux -fuse-ld=ld -stdlib=platform \
+// RUN:     -resource-dir=%S/Inputs/resource_dir \
+// RUN:     --sysroot=%S/Inputs/basic_linux_tree \
+// RUN:   | %{filecheck} --check-prefix=CHECK-ASAN-UBSAN-NOVPTR-LINUX-CXX
+// CHECK-ASAN-UBSAN-NOVPTR-LINUX-CXX: "{{.*}}ld{{(.exe)?}}"
+// CHECK-ASAN-UBSAN-NOVPTR-LINUX-CXX-SAME: "--whole-archive" "{{.*}}libclang_rt.asan_static.a" "--no-whole-archive"
+// CHECK-ASAN-UBSAN-NOVPTR-LINUX-CXX-SAME: "--whole-archive" "{{.*}}libclang_rt.asan.a" "--no-whole-archive"
+// CHECK-ASAN-UBSAN-NOVPTR-LINUX-CXX-SAME: "--dynamic-list={{.*}}libclang_rt.asan.a.syms"
+// CHECK-ASAN-UBSAN-NOVPTR-LINUX-CXX-SAME: "--whole-archive" "{{.*}}libclang_rt.asan_cxx.a" "--no-whole-archive"
+// CHECK-ASAN-UBSAN-NOVPTR-LINUX-CXX-SAME: "-lstdc++"
+// CHECK-ASAN-UBSAN-NOVPTR-LINUX-CXX-SAME: "-lpthread"
+// CHECK-ASAN-UBSAN-NOVPTR-LINUX-CXX-SAME: "-lresolv"
+
 // RUN: %clangxx -fsanitize=memory,undefined -### %s 2>&1 \
 // RUN:     --target=x86_64-unknown-linux -fuse-ld=ld \
 // RUN:     -resource-dir=%S/Inputs/resource_dir \
 // RUN:     --sysroot=%S/Inputs/basic_linux_tree \
-// RUN:   | FileCheck --check-prefix=CHECK-MSAN-UBSAN-LINUX-CXX %s
+// RUN:   | %{filecheck} --check-prefix=CHECK-MSAN-UBSAN-LINUX-CXX
 // CHECK-MSAN-UBSAN-LINUX-CXX: "{{.*}}ld{{(.exe)?}}"
 // CHECK-MSAN-UBSAN-LINUX-CXX: "--whole-archive" "{{.*}}libclang_rt.msan.a" "--no-whole-archive"
-// CHECK-MSAN-UBSAN-LINUX-CXX-NOT: libclang_rt.ubsan
+// CHECK-MSAN-UBSAN-LINUX-CXX: "--dynamic-list={{.*}}libclang_rt.msan.a.syms"
+// CHECK-MSAN-UBSAN-LINUX-CXX: "--whole-archive" "{{.*}}libclang_rt.msan_cxx.a" "--no-whole-archive"
+// CHECK-MSAN-UBSAN-LINUX-CXX: "--dynamic-list={{.*}}libclang_rt.msan_cxx.a.syms"
+// CHECK-MSAN-UBSAN-LINUX-CXX: "--whole-archive" "{{.*}}libclang_rt.ubsan_standalone_cxx.a" "--no-whole-archive"
 
 // RUN: %clangxx -fsanitize=thread,undefined -### %s 2>&1 \
 // RUN:     --target=x86_64-unknown-linux -fuse-ld=ld \
 // RUN:     -resource-dir=%S/Inputs/resource_dir \
 // RUN:     --sysroot=%S/Inputs/basic_linux_tree \
-// RUN:   | FileCheck --check-prefix=CHECK-TSAN-UBSAN-LINUX-CXX %s
+// RUN:   | %{filecheck} --check-prefix=CHECK-TSAN-UBSAN-LINUX-CXX
 // CHECK-TSAN-UBSAN-LINUX-CXX: "{{.*}}ld{{(.exe)?}}"
 // CHECK-TSAN-UBSAN-LINUX-CXX: "--whole-archive" "{{.*}}libclang_rt.tsan.a" "--no-whole-archive"
-// CHECK-TSAN-UBSAN-LINUX-CXX-NOT: libclang_rt.ubsan
+// CHECK-TSAN-UBSAN-LINUX-CXX: "--dynamic-list={{.*}}libclang_rt.tsan.a.syms"
+// CHECK-TSAN-UBSAN-LINUX-CXX: "--whole-archive" "{{.*}}libclang_rt.tsan_cxx.a" "--no-whole-archive"
+// CHECK-TSAN-UBSAN-LINUX-CXX: "--dynamic-list={{.*}}libclang_rt.tsan_cxx.a.syms"
+// CHECK-TSAN-UBSAN-LINUX-CXX: "--whole-archive" "{{.*}}libclang_rt.ubsan_standalone_cxx.a" "--no-whole-archive"
 
 // RUN: %clang -fsanitize=undefined -### %s 2>&1 \
 // RUN:     --target=i386-unknown-linux -fuse-ld=ld \
 // RUN:     -resource-dir=%S/Inputs/resource_dir \
 // RUN:     --sysroot=%S/Inputs/basic_linux_tree \
 // RUN:     -shared \
-// RUN:   | FileCheck --check-prefix=CHECK-UBSAN-LINUX-SHARED %s
+// RUN:   | %{filecheck} --check-prefix=CHECK-UBSAN-LINUX-SHARED
 // CHECK-UBSAN-LINUX-SHARED: "{{.*}}ld{{(.exe)?}}"
 // CHECK-UBSAN-LINUX-SHARED-NOT: --export-dynamic
 // CHECK-UBSAN-LINUX-SHARED-NOT: --dynamic-list
-// CHECK-UBSAN-LINUX-SHARED-NOT: libclang_rt.ubsan
 
 // RUN: %clang -### %s 2>&1 \
 // RUN:     --target=x86_64-unknown-linux -fuse-ld=ld -fsanitize=leak \
 // RUN:     -resource-dir=%S/Inputs/resource_dir \
 // RUN:     --sysroot=%S/Inputs/basic_linux_tree \
-// RUN:   | FileCheck --check-prefix=CHECK-LSAN-LINUX %s
+// RUN:   | %{filecheck} --check-prefix=CHECK-LSAN-LINUX
 //
 // CHECK-LSAN-LINUX: "{{(.*[^-.0-9A-Z_a-z])?}}ld{{(.exe)?}}"
 // CHECK-LSAN-LINUX-NOT: "-lc"
-// CHECK-LSAN-LINUX-NOT: libclang_rt.ubsan
 // CHECK-LSAN-LINUX: libclang_rt.lsan.a"
 // CHECK-LSAN-LINUX: "-lpthread"
 // CHECK-LSAN-LINUX: "-ldl"
@@ -630,21 +658,20 @@
 // RUN:     --target=x86_64-unknown-linux -fuse-ld=ld \
 // RUN:     -resource-dir=%S/Inputs/resource_dir \
 // RUN:     --sysroot=%S/Inputs/basic_linux_tree \
-// RUN:   | FileCheck --check-prefix=CHECK-LSAN-NO-LINK-RUNTIME-LINUX %s
+// RUN:   | %{filecheck} --check-prefix=CHECK-LSAN-NO-LINK-RUNTIME-LINUX
 //
-// CHECK-LSAN-NO-LINK-RUNTIME-LINUX-NOT: libclang_rt.lsan
+// CHECK-LSAN-NO-LINK-RUNTIME-LINUX: "{{(.*[^-.0-9A-Z_a-z])?}}ld{{(.exe)?}}"
 
 // RUN: %clang -### %s 2>&1 \
 // RUN:  --target=x86_64-unknown-linux -fuse-ld=ld -fsanitize=leak -fsanitize-coverage=func \
 // RUN:  -resource-dir=%S/Inputs/resource_dir \
 // RUN:  --sysroot=%S/Inputs/basic_linux_tree \
-// RUN:   | FileCheck --check-prefix=CHECK-LSAN-COV-LINUX %s
+// RUN:   | %{filecheck} --check-prefix=CHECK-LSAN-COV-LINUX
 //
 // CHECK-LSAN-COV-LINUX: "{{(.*[^-.0-9A-Z_a-z])?}}ld{{(.exe)?}}"
 // CHECK-LSAN-COV-LINUX-NOT: "-lc"
-// CHECK-LSAN-COV-LINUX-NOT: libclang_rt.ubsan
+// CHECK-LSAN-COV-LINUX: libclang_rt.lsan.a
 // CHECK-LSAV-COV-LINUX: libclang_rt.lsan-x86_64.a"
-// CHECK-LSAN-COV-LINUX-NOT: libclang_rt.ubsan
 // CHECK-LSAN-COV-LINUX: "-lpthread"
 // CHECK-LSAN-COV-LINUX: "-ldl"
 // CHECK-LSAN-COV-LINUX: "-lresolv"
@@ -653,20 +680,21 @@
 // RUN:     --target=x86_64-unknown-linux -fuse-ld=ld \
 // RUN:     -resource-dir=%S/Inputs/resource_dir \
 // RUN:     --sysroot=%S/Inputs/basic_linux_tree \
-// RUN:   | FileCheck --check-prefix=CHECK-LSAN-ASAN-LINUX %s
+// RUN:   | %{filecheck} --check-prefix=CHECK-LSAN-ASAN-LINUX
 // CHECK-LSAN-ASAN-LINUX: "{{.*}}ld{{(.exe)?}}"
-// CHECK-LSAN-ASAN-LINUX-NOT: libclang_rt.lsan
+// CHECK-LSAN-ASAN-LINUX: libclang_rt.asan_static
 // CHECK-LSAN-ASAN-LINUX: libclang_rt.asan
-// CHECK-LSAN-ASAN-LINUX-NOT: libclang_rt.lsan
+// CHECK-LSAN-ASAN-LINUX: "--dynamic-list={{.*}}libclang_rt.asan.a.syms"
 
 // RUN: %clang -fsanitize=address -fsanitize-coverage=func -### %s 2>&1 \
 // RUN:     --target=x86_64-unknown-linux -fuse-ld=ld \
 // RUN:     -resource-dir=%S/Inputs/resource_dir \
 // RUN:     --sysroot=%S/Inputs/basic_linux_tree \
-// RUN:   | FileCheck --check-prefix=CHECK-ASAN-COV-LINUX %s
+// RUN:   | %{filecheck} --check-prefix=CHECK-ASAN-COV-LINUX
 // CHECK-ASAN-COV-LINUX: "{{.*}}ld{{(.exe)?}}"
-// CHECK-ASAN-COV-LINUX: "--whole-archive" "{{.*}}libclang_rt.asan.a" "--no-whole-archive"
-// CHECK-ASAN-COV-LINUX-NOT: libclang_rt.ubsan
+// CHECK-ASAN-COV-LINUX: libclang_rt.asan_static
+// CHECK-ASAN-COV-LINUX: libclang_rt.asan
+// CHECK-ASAN-COV-LINUX: "--dynamic-list={{.*}}libclang_rt.asan.a.syms"
 // CHECK-ASAN-COV-LINUX-NOT: "-lstdc++"
 // CHECK-ASAN-COV-LINUX: "-lpthread"
 // CHECK-ASAN-COV-LINUX: "-lresolv"
@@ -675,10 +703,10 @@
 // RUN:     --target=x86_64-unknown-linux -fuse-ld=ld \
 // RUN:     -resource-dir=%S/Inputs/resource_dir \
 // RUN:     --sysroot=%S/Inputs/basic_linux_tree \
-// RUN:   | FileCheck --check-prefix=CHECK-MSAN-COV-LINUX %s
+// RUN:   | %{filecheck} --check-prefix=CHECK-MSAN-COV-LINUX
 // CHECK-MSAN-COV-LINUX: "{{.*}}ld{{(.exe)?}}"
 // CHECK-MSAN-COV-LINUX: "--whole-archive" "{{.*}}libclang_rt.msan.a" "--no-whole-archive"
-// CHECK-MSAN-COV-LINUX-NOT: libclang_rt.ubsan
+// CHECK-MSAN-COV-LINUX: "--dynamic-list={{.*}}libclang_rt.msan.a.syms"
 // CHECK-MSAN-COV-LINUX-NOT: "-lstdc++"
 // CHECK-MSAN-COV-LINUX: "-lpthread"
 // CHECK-MSAN-COV-LINUX: "-lresolv"
@@ -687,10 +715,9 @@
 // RUN:     --target=x86_64-unknown-linux -fuse-ld=ld \
 // RUN:     -resource-dir=%S/Inputs/resource_dir \
 // RUN:     --sysroot=%S/Inputs/basic_linux_tree \
-// RUN:   | FileCheck --check-prefix=CHECK-DFSAN-COV-LINUX %s
+// RUN:   | %{filecheck} --check-prefix=CHECK-DFSAN-COV-LINUX
 // CHECK-DFSAN-COV-LINUX: "{{.*}}ld{{(.exe)?}}"
 // CHECK-DFSAN-COV-LINUX: "--whole-archive" "{{.*}}libclang_rt.dfsan.a" "--no-whole-archive"
-// CHECK-DFSAN-COV-LINUX-NOT: libclang_rt.ubsan
 // CHECK-DFSAN-COV-LINUX-NOT: "-lstdc++"
 // CHECK-DFSAN-COV-LINUX: "-lpthread"
 // CHECK-DFSAN-COV-LINUX: "-lresolv"
@@ -699,7 +726,7 @@
 // RUN:     --target=x86_64-unknown-linux -fuse-ld=ld \
 // RUN:     -resource-dir=%S/Inputs/resource_dir \
 // RUN:     --sysroot=%S/Inputs/basic_linux_tree \
-// RUN:   | FileCheck --check-prefix=CHECK-UBSAN-COV-LINUX %s
+// RUN:   | %{filecheck} --check-prefix=CHECK-UBSAN-COV-LINUX
 // CHECK-UBSAN-COV-LINUX: "{{.*}}ld{{(.exe)?}}"
 // CHECK-UBSAN-COV-LINUX: "--whole-archive" "{{.*}}libclang_rt.ubsan_standalone.a" "--no-whole-archive"
 // CHECK-UBSAN-COV-LINUX-NOT: "-lstdc++"
@@ -710,7 +737,7 @@
 // RUN:     --target=x86_64-unknown-linux -fuse-ld=ld \
 // RUN:     -resource-dir=%S/Inputs/resource_dir \
 // RUN:     --sysroot=%S/Inputs/basic_linux_tree \
-// RUN:   | FileCheck --check-prefix=CHECK-COV-LINUX %s
+// RUN:   | %{filecheck} --check-prefix=CHECK-COV-LINUX
 // CHECK-COV-LINUX: "{{.*}}ld{{(.exe)?}}"
 // CHECK-COV-LINUX: "--whole-archive" "{{.*}}libclang_rt.ubsan_standalone.a" "--no-whole-archive"
 // CHECK-COV-LINUX-NOT: "-lstdc++"
@@ -721,18 +748,17 @@
 // RUN:     --target=x86_64-unknown-linux -fuse-ld=ld -fsanitize=numerical \
 // RUN:     -resource-dir=%S/Inputs/resource_dir \
 // RUN:     --sysroot=%S/Inputs/basic_linux_tree \
-// RUN:   | FileCheck --check-prefix=CHECK-NSAN-LINUX %s
+// RUN:   | %{filecheck} --check-prefix=CHECK-NSAN-LINUX
 //
 // CHECK-NSAN-LINUX: "{{.*}}ld{{(.exe)?}}"
 // CHECK-NSAN-LINUX-NOT: "-lc"
-// CHECK-NSAN-LINUX-NOT: libclang_rt.ubsan
 // CHECK-NSAN-LINUX: libclang_rt.nsan.a"
 // CHECK-NSAN-LINUX: "-lpthread" "-lrt" "-lm" "-ldl" "-lresolv"
 
 // RUN: %clang -### %s 2>&1 --target=x86_64-unknown-linux -fuse-ld=ld -fsanitize=numerical -shared-libsan \
 // RUN:     -resource-dir=%S/Inputs/resource_dir \
 // RUN:     --sysroot=%S/Inputs/basic_linux_tree \
-// RUN:   | FileCheck --check-prefix=CHECK-NSAN-SHARED-LINUX %s
+// RUN:   | %{filecheck} --check-prefix=CHECK-NSAN-SHARED-LINUX
 
 // CHECK-NSAN-SHARED-LINUX: libclang_rt.nsan.so"
 // CHECK-NSAN-SHARED-LINUX-NOT: "-lpthread"
@@ -742,19 +768,17 @@
 // RUN: %clang -### %s 2>&1 --target=x86_64-unknown-linux -fsanitize=numerical,undefined \
 // RUN:     -resource-dir=%S/Inputs/resource_dir \
 // RUN:     --sysroot=%S/Inputs/basic_linux_tree \
-// RUN:   | FileCheck --check-prefix=CHECK-NSAN-UBSAN %s
+// RUN:   | %{filecheck} --check-prefix=CHECK-NSAN-UBSAN
 
 // CHECK-NSAN-UBSAN: "--whole-archive" "{{[^"]*}}libclang_rt.nsan.a" "--no-whole-archive"
-// CHECK-NSAN-UBSAN-NOT: libclang_rt.ubsan
 
 // CFI by itself does not link runtime libraries.
 // RUN: not %clang -fsanitize=cfi -### %s 2>&1 \
 // RUN:     --target=x86_64-unknown-linux -fuse-ld=ld -rtlib=platform \
 // RUN:     -resource-dir=%S/Inputs/resource_dir \
 // RUN:     --sysroot=%S/Inputs/basic_linux_tree \
-// RUN:   | FileCheck --check-prefix=CHECK-CFI-LINUX %s
+// RUN:   | %{filecheck} --check-prefix=CHECK-CFI-LINUX
 // CHECK-CFI-LINUX: "{{.*}}ld{{(.exe)?}}"
-// CHECK-CFI-LINUX-NOT: libclang_rt.
 
 // CFI with diagnostics links the UBSan runtime.
 // RUN: not %clang -fsanitize=cfi -fno-sanitize-trap=cfi -fsanitize-recover=cfi \
@@ -762,7 +786,7 @@
 // RUN:     --target=x86_64-unknown-linux -fuse-ld=ld \
 // RUN:     -resource-dir=%S/Inputs/resource_dir \
 // RUN:     --sysroot=%S/Inputs/basic_linux_tree \
-// RUN:   | FileCheck --check-prefix=CHECK-CFI-DIAG-LINUX %s
+// RUN:   | %{filecheck} --check-prefix=CHECK-CFI-DIAG-LINUX
 // CHECK-CFI-DIAG-LINUX: "{{.*}}ld{{(.exe)?}}"
 // CHECK-CFI-DIAG-LINUX: "--whole-archive" "{{[^"]*}}libclang_rt.ubsan_standalone.a" "--no-whole-archive"
 
@@ -771,7 +795,7 @@
 // RUN:     --target=x86_64-unknown-linux -fuse-ld=ld \
 // RUN:     -resource-dir=%S/Inputs/resource_dir \
 // RUN:     --sysroot=%S/Inputs/basic_linux_tree \
-// RUN:   | FileCheck --check-prefix=CHECK-CFI-CROSS-DSO-LINUX %s
+// RUN:   | %{filecheck} --check-prefix=CHECK-CFI-CROSS-DSO-LINUX
 // CHECK-CFI-CROSS-DSO-LINUX: "{{.*}}ld{{(.exe)?}}"
 // CHECK-CFI-CROSS-DSO-LINUX: "--whole-archive" "{{[^"]*}}libclang_rt.cfi.a" "--no-whole-archive"
 // CHECK-CFI-CROSS-DSO-LINUX: -export-dynamic
@@ -782,7 +806,7 @@
 // RUN:     --target=x86_64-unknown-linux -fuse-ld=ld \
 // RUN:     -resource-dir=%S/Inputs/resource_dir \
 // RUN:     --sysroot=%S/Inputs/basic_linux_tree \
-// RUN:   | FileCheck --check-prefix=CHECK-CFI-CROSS-DSO-DIAG-LINUX %s
+// RUN:   | %{filecheck} --check-prefix=CHECK-CFI-CROSS-DSO-DIAG-LINUX
 // CHECK-CFI-CROSS-DSO-DIAG-LINUX: "{{.*}}ld{{(.exe)?}}"
 // CHECK-CFI-CROSS-DSO-DIAG-LINUX: "--whole-archive" "{{[^"]*}}libclang_rt.cfi_diag.a" "--no-whole-archive"
 // CHECK-CFI-CROSS-DSO-DIAG-LINUX: -export-dynamic
@@ -792,9 +816,8 @@
 // RUN:     --target=aarch64-linux-android -fuse-ld=ld \
 // RUN:     -resource-dir=%S/Inputs/resource_dir \
 // RUN:     --sysroot=%S/Inputs/basic_android_tree \
-// RUN:   | FileCheck --check-prefix=CHECK-CFI-CROSS-DSO-ANDROID %s
+// RUN:   | %{filecheck} --check-prefix=CHECK-CFI-CROSS-DSO-ANDROID
 // CHECK-CFI-CROSS-DSO-ANDROID: "{{.*}}ld{{(.exe)?}}"
-// CHECK-CFI-CROSS-DSO-ANDROID-NOT: libclang_rt.cfi
 
 // Cross-DSO CFI with diagnostics on Android links just the UBSAN runtime.
 // RUN: not %clang -fsanitize=cfi -fsanitize-cfi-cross-dso -### %s 2>&1 \
@@ -802,7 +825,7 @@
 // RUN:     --target=aarch64-linux-android -fuse-ld=ld \
 // RUN:     -resource-dir=%S/Inputs/resource_dir \
 // RUN:     --sysroot=%S/Inputs/basic_android_tree \
-// RUN:   | FileCheck --check-prefix=CHECK-CFI-CROSS-DSO-DIAG-ANDROID %s
+// RUN:   | %{filecheck} --check-prefix=CHECK-CFI-CROSS-DSO-DIAG-ANDROID
 // CHECK-CFI-CROSS-DSO-DIAG-ANDROID: "{{.*}}ld{{(.exe)?}}"
 // CHECK-CFI-CROSS-DSO-DIAG-ANDROID: "{{[^"]*}}libclang_rt.ubsan_standalone.so"
 // CHECK-CFI-CROSS-DSO-DIAG-ANDROID: "--export-dynamic-symbol=__cfi_check"
@@ -812,7 +835,7 @@
 // RUN:     --target=x86_64-apple-darwin13.4.0 -fuse-ld=ld -stdlib=platform \
 // RUN:     -resource-dir=%S/Inputs/resource_dir \
 // RUN:     --sysroot=%S/Inputs/basic_linux_tree \
-// RUN:   | FileCheck --check-prefix=CHECK-ASAN-DARWIN106-CXX %s
+// RUN:   | %{filecheck} --check-prefix=CHECK-ASAN-DARWIN106-CXX
 // CHECK-ASAN-DARWIN106-CXX: "{{.*}}ld{{(.exe)?}}"
 // CHECK-ASAN-DARWIN106-CXX: libclang_rt.asan_osx_dynamic.dylib
 // CHECK-ASAN-DARWIN106-CXX-NOT: -lc++abi
@@ -822,7 +845,7 @@
 // RUN:     --target=x86_64-apple-darwin13.4.0 -fuse-ld=ld -stdlib=platform \
 // RUN:     -resource-dir=%S/Inputs/resource_dir \
 // RUN:     --sysroot=%S/Inputs/basic_linux_tree \
-// RUN:   | FileCheck --check-prefix=CHECK-LSAN-DARWIN106-CXX %s
+// RUN:   | %{filecheck} --check-prefix=CHECK-LSAN-DARWIN106-CXX
 // CHECK-LSAN-DARWIN106-CXX: "{{.*}}ld{{(.exe)?}}"
 // CHECK-LSAN-DARWIN106-CXX: libclang_rt.lsan_osx_dynamic.dylib
 // CHECK-LSAN-DARWIN106-CXX-NOT: -lc++abi
@@ -831,7 +854,7 @@
 // RUN:     --target=x86_64-unknown-linux -fuse-ld=ld -fsanitize=safe-stack \
 // RUN:     -resource-dir=%S/Inputs/resource_dir \
 // RUN:     --sysroot=%S/Inputs/basic_linux_tree \
-// RUN:   | FileCheck --check-prefix=CHECK-SAFESTACK-LINUX %s
+// RUN:   | %{filecheck} --check-prefix=CHECK-SAFESTACK-LINUX
 //
 // CHECK-SAFESTACK-LINUX: "{{(.*[^-.0-9A-Z_a-z])?}}ld{{(.exe)?}}"
 // CHECK-SAFESTACK-LINUX-NOT: "-lc"
@@ -844,59 +867,70 @@
 
 // RUN: %clang -fsanitize=shadow-call-stack -### %s 2>&1 \
 // RUN:     --target=x86_64-unknown-linux -fuse-ld=ld \
-// RUN:   | FileCheck --check-prefix=CHECK-SHADOWCALLSTACK-LINUX-X86-64 %s
+// RUN:   | %{filecheck} --check-prefix=CHECK-SHADOWCALLSTACK-LINUX-X86-64
 // CHECK-SHADOWCALLSTACK-LINUX-X86-64-NOT: error:
+// CHECK-SHADOWCALLSTACK-LINUX-X86-64: "{{(.*[^-.0-9A-Z_a-z])?}}ld{{(.exe)?}}"
 
 // RUN: not %clang -fsanitize=shadow-call-stack -### %s 2>&1 \
 // RUN:     --target=aarch64-unknown-linux -fuse-ld=ld \
-// RUN:   | FileCheck --check-prefix=CHECK-SHADOWCALLSTACK-LINUX-AARCH64 %s
+// RUN:   | %{filecheck} --check-prefix=CHECK-SHADOWCALLSTACK-LINUX-AARCH64
 // CHECK-SHADOWCALLSTACK-LINUX-AARCH64: '-fsanitize=shadow-call-stack' only allowed with '-ffixed-x18'
 
 // RUN: %clang -fsanitize=shadow-call-stack -### %s 2>&1 \
 // RUN:     --target=riscv32-unknown-elf -fuse-ld=ld \
-// RUN:   | FileCheck --check-prefix=CHECK-SHADOWCALLSTACK-ELF-RISCV32 %s
+// RUN:   | %{filecheck} --check-prefix=CHECK-SHADOWCALLSTACK-ELF-RISCV32
 // CHECK-SHADOWCALLSTACK-ELF-RISCV32-NOT: error:
+// CHECK-SHADOWCALLSTACK-ELF-RISCV32: "{{(.*[^-.0-9A-Z_a-z])?}}ld.lld{{(.exe)?}}"
 
 // RUN: %clang -fsanitize=shadow-call-stack -### %s 2>&1 \
 // RUN:     --target=riscv64-unknown-linux -fuse-ld=ld \
-// RUN:   | FileCheck --check-prefix=CHECK-SHADOWCALLSTACK-LINUX-RISCV64 %s
+// RUN:   | %{filecheck} --check-prefix=CHECK-SHADOWCALLSTACK-LINUX-RISCV64
 // CHECK-SHADOWCALLSTACK-LINUX-RISCV64-NOT: error:
+// CHECK-SHADOWCALLSTACK-LINUX-RISCV64: "{{(.*[^-.0-9A-Z_a-z])?}}ld{{(.exe)?}}"
 
 // RUN: %clang -target riscv64-linux-android -fsanitize=shadow-call-stack %s -### 2>&1 \
-// RUN:   | FileCheck %s --check-prefix=CHECK-SHADOWCALLSTACK-ANDROID-RISCV64
+// RUN:   | %{filecheck} --check-prefix=CHECK-SHADOWCALLSTACK-ANDROID-RISCV64
 // CHECK-SHADOWCALLSTACK-ANDROID-RISCV64-NOT: error:
+// CHECK-SHADOWCALLSTACK-ANDROID-RISCV64: "{{(.*[^-.0-9A-Z_a-z])?}}ld.lld{{(.exe)?}}"
 
 // RUN: %clang -fsanitize=shadow-call-stack -### %s 2>&1 \
 // RUN:     --target=riscv64-unknown-fuchsia -fuse-ld=ld \
-// RUN:   | FileCheck --check-prefix=CHECK-SHADOWCALLSTACK-FUCHSIA-RISCV64 %s
+// RUN:   | %{filecheck} --check-prefix=CHECK-SHADOWCALLSTACK-FUCHSIA-RISCV64
 // CHECK-SHADOWCALLSTACK-FUCHSIA-RISCV64-NOT: error:
+// CHECK-SHADOWCALLSTACK-FUCHSIA-RISCV64: "{{(.*[^-.0-9A-Z_a-z])?}}ld.lld{{(.exe)?}}"
 
 // RUN: %clang -fsanitize=shadow-call-stack -### %s 2>&1 \
 // RUN:     --target=aarch64-unknown-linux -fuse-ld=ld -ffixed-x18 \
-// RUN:   | FileCheck --check-prefix=CHECK-SHADOWCALLSTACK-LINUX-AARCH64-X18 %s
+// RUN:   | %{filecheck} --check-prefix=CHECK-SHADOWCALLSTACK-LINUX-AARCH64-X18
 // RUN: %clang -fsanitize=shadow-call-stack -### %s 2>&1 \
 // RUN:     --target=arm64-unknown-ios -fuse-ld=ld \
-// RUN:   | FileCheck --check-prefix=CHECK-SHADOWCALLSTACK-LINUX-AARCH64-X18 %s
+// RUN:   | %{filecheck} --check-prefix=CHECK-SHADOWCALLSTACK-LINUX-AARCH64-X18
+// CHECK-SHADOWCALLSTACK-LINUX-AARCH64-X18-NOT: error:
+// CHECK-SHADOWCALLSTACK-LINUX-AARCH64-X18: "{{(.*[^-.0-9A-Z_a-z])?}}ld{{(.exe)?}}"
+
 // RUN: %clang -fsanitize=shadow-call-stack -### %s 2>&1 \
 // RUN:     --target=aarch64-unknown-linux-android -fuse-ld=ld \
-// RUN:   | FileCheck --check-prefix=CHECK-SHADOWCALLSTACK-LINUX-AARCH64-X18 %s
-// CHECK-SHADOWCALLSTACK-LINUX-AARCH64-X18-NOT: error:
+// RUN:   | %{filecheck} --check-prefix=CHECK-SHADOWCALLSTACK-LINUX-AARCH64-X18-ANDROID
+// CHECK-SHADOWCALLSTACK-LINUX-AARCH64-X18-ANDROID-NOT: error:
+// CHECK-SHADOWCALLSTACK-LINUX-AARCH64-X18-ANDROID: "{{(.*[^-.0-9A-Z_a-z])?}}ld.lld{{(.exe)?}}"
 
 // RUN: not %clang -fsanitize=shadow-call-stack -### %s 2>&1 \
 // RUN:     --target=x86-unknown-linux -fuse-ld=ld \
-// RUN:   | FileCheck --check-prefix=CHECK-SHADOWCALLSTACK-LINUX-X86 %s
+// RUN:   | %{filecheck} --check-prefix=CHECK-SHADOWCALLSTACK-LINUX-X86
 // CHECK-SHADOWCALLSTACK-LINUX-X86: error: unsupported option '-fsanitize=shadow-call-stack' for target 'x86-unknown-linux'
 
 // RUN: %clang -fsanitize=shadow-call-stack -### %s 2>&1 \
 // RUN:     -fsanitize=safe-stack --target=x86_64-unknown-linux -fuse-ld=ld \
-// RUN:   | FileCheck --check-prefix=CHECK-SHADOWCALLSTACK-SAFESTACK %s
+// RUN:   | %{filecheck} --check-prefix=CHECK-SHADOWCALLSTACK-SAFESTACK
 // CHECK-SHADOWCALLSTACK-SAFESTACK-NOT: error:
+// CHECK-SHADOWCALLSTACK-SAFESTACK: "{{(.*[^-.0-9A-Z_a-z])?}}ld{{(.exe)?}}"
+// CHECK-SHADOWCALLSTACK-SAFESTACK: libclang_rt.safestack.a
 
 // RUN: not %clang -fsanitize=cfi -fsanitize-stats -### %s 2>&1 \
 // RUN:     --target=x86_64-unknown-linux -fuse-ld=ld \
 // RUN:     -resource-dir=%S/Inputs/resource_dir \
 // RUN:     --sysroot=%S/Inputs/basic_linux_tree \
-// RUN:   | FileCheck --check-prefix=CHECK-CFI-STATS-LINUX %s
+// RUN:   | %{filecheck} --check-prefix=CHECK-CFI-STATS-LINUX
 // CHECK-CFI-STATS-LINUX: "{{.*}}ld{{(.exe)?}}"
 // CHECK-CFI-STATS-LINUX: "--whole-archive" "{{[^"]*}}libclang_rt.stats_client.a" "--no-whole-archive"
 // CHECK-CFI-STATS-LINUX-NOT: "--whole-archive"
@@ -905,7 +939,7 @@
 // RUN: not %clang -fsanitize=cfi -fsanitize-stats -### %s 2>&1 \
 // RUN:     --target=x86_64-apple-darwin -fuse-ld=ld \
 // RUN:     --sysroot=%S/Inputs/basic_linux_tree \
-// RUN:   | FileCheck --check-prefix=CHECK-CFI-STATS-DARWIN %s
+// RUN:   | %{filecheck} --check-prefix=CHECK-CFI-STATS-DARWIN
 // CHECK-CFI-STATS-DARWIN: "{{.*}}ld{{(.exe)?}}"
 // CHECK-CFI-STATS-DARWIN: "{{[^"]*}}libclang_rt.stats_client_osx.a"
 // CHECK-CFI-STATS-DARWIN: "{{[^"]*}}libclang_rt.stats_osx_dynamic.dylib"
@@ -913,7 +947,7 @@
 // RUN: not %clang -fsanitize=cfi -fsanitize-stats -### %s 2>&1 \
 // RUN:     --target=x86_64-pc-windows \
 // RUN:     --sysroot=%S/Inputs/basic_linux_tree \
-// RUN:   | FileCheck --check-prefix=CHECK-CFI-STATS-WIN64 %s
+// RUN:   | %{filecheck} --check-prefix=CHECK-CFI-STATS-WIN64
 // CHECK-CFI-STATS-WIN64: "--dependent-lib=clang_rt.stats_client{{(-x86_64)?}}.lib"
 // CHECK-CFI-STATS-WIN64: "--dependent-lib=clang_rt.stats{{(-x86_64)?}}.lib"
 // CHECK-CFI-STATS-WIN64: "--linker-option=/include:__sanitizer_stats_register"
@@ -921,13 +955,13 @@
 // RUN: not %clang -fsanitize=cfi -fsanitize-stats -### %s 2>&1 \
 // RUN:     --target=i686-pc-windows \
 // RUN:     --sysroot=%S/Inputs/basic_linux_tree \
-// RUN:   | FileCheck --check-prefix=CHECK-CFI-STATS-WIN32 %s
+// RUN:   | %{filecheck} --check-prefix=CHECK-CFI-STATS-WIN32
 // RUN: not %clang -fsanitize=cfi -fsanitize-stats -### %s 2>&1 \
 // RUN:     --target=i686-pc-windows \
 // RUN:     -fno-rtlib-defaultlib \
 // RUN:     -frtlib-defaultlib \
 // RUN:     --sysroot=%S/Inputs/basic_linux_tree \
-// RUN:   | FileCheck --check-prefix=CHECK-CFI-STATS-WIN32 %s
+// RUN:   | %{filecheck} --check-prefix=CHECK-CFI-STATS-WIN32
 // CHECK-CFI-STATS-WIN32: "--dependent-lib=clang_rt.stats_client{{(-i386)?}}.lib"
 // CHECK-CFI-STATS-WIN32: "--dependent-lib=clang_rt.stats{{(-i386)?}}.lib"
 // CHECK-CFI-STATS-WIN32: "--linker-option=/include:___sanitizer_stats_register"
@@ -936,38 +970,35 @@
 // RUN:     --target=i686-pc-windows \
 // RUN:     -fno-rtlib-defaultlib \
 // RUN:     --sysroot=%S/Inputs/basic_linux_tree \
-// RUN:   | FileCheck --check-prefix=CHECK-CFI-STATS-WIN32-NODEF %s
+// RUN:   | %{filecheck} --check-prefix=CHECK-CFI-STATS-WIN32-NODEF
 // CHECK-CFI-STATS-WIN32-NODEF-NOT: "--dependent-lib=clang_rt.stats_client{{(-i386)?}}.lib"
 // CHECK-CFI-STATS-WIN32-NODEF-NOT: "--dependent-lib=clang_rt.stats{{(-i386)?}}.lib"
 
 // RUN: %clang -### %s 2>&1 \
 // RUN:     --target=arm-linux-androideabi -fuse-ld=ld -fsanitize=safe-stack \
 // RUN:     --sysroot=%S/Inputs/basic_android_tree \
-// RUN:   | FileCheck --check-prefix=CHECK-SAFESTACK-ANDROID-ARM %s
+// RUN:   | %{filecheck} --check-prefix=CHECK-SAFESTACK-ANDROID-ARM
 //
 // CHECK-SAFESTACK-ANDROID-ARM: "{{(.*[^-.0-9A-Z_a-z])?}}ld.lld{{(.exe)?}}"
-// CHECK-SAFESTACK-ANDROID-ARM-NOT: libclang_rt.safestack
 
 // RUN: %clang -### %s -shared 2>&1 \
 // RUN:     --target=arm-linux-androideabi -fuse-ld=ld -fsanitize=safe-stack \
 // RUN:     --sysroot=%S/Inputs/basic_android_tree \
-// RUN:   | FileCheck --check-prefix=CHECK-SAFESTACK-SHARED-ANDROID-ARM %s
+// RUN:   | %{filecheck} --check-prefix=CHECK-SAFESTACK-SHARED-ANDROID-ARM
 //
 // CHECK-SAFESTACK-SHARED-ANDROID-ARM: "{{(.*[^-.0-9A-Z_a-z])?}}ld.lld{{(.exe)?}}"
-// CHECK-SAFESTACK-SHARED-ANDROID-ARM-NOT: libclang_rt.safestack
 
 // RUN: %clang -### %s 2>&1 \
 // RUN:     --target=aarch64-linux-android -fuse-ld=ld -fsanitize=safe-stack \
 // RUN:     --sysroot=%S/Inputs/basic_android_tree \
-// RUN:   | FileCheck --check-prefix=CHECK-SAFESTACK-ANDROID-AARCH64 %s
+// RUN:   | %{filecheck} --check-prefix=CHECK-SAFESTACK-ANDROID-AARCH64
 //
 // CHECK-SAFESTACK-ANDROID-AARCH64: "{{(.*[^-.0-9A-Z_a-z])?}}ld.lld{{(.exe)?}}"
-// CHECK-SAFESTACK-ANDROID-AARCH64-NOT: libclang_rt.safestack
 
 // RUN: not %clang -fsanitize=undefined -### %s 2>&1 \
 // RUN:     --target=x86_64-scei-ps4 -fuse-ld=ld \
 // RUN:     -shared \
-// RUN:   | FileCheck --check-prefix=CHECK-UBSAN-PS4 %s
+// RUN:   | %{filecheck} --check-prefix=CHECK-UBSAN-PS4
 // CHECK-UBSAN-PS4: --dependent-lib=libSceDbgUBSanitizer_stub_weak.a
 // CHECK-UBSAN-PS4: "{{.*}}ld{{(.gold)?(.exe)?}}"
 // CHECK-UBSAN-PS4: -lSceDbgUBSanitizer_stub_weak
@@ -975,7 +1006,7 @@
 // RUN: not %clang -fsanitize=undefined -### %s 2>&1 \
 // RUN:     --target=x86_64-sie-ps5 -fuse-ld=ld \
 // RUN:     -shared \
-// RUN:   | FileCheck --check-prefix=CHECK-UBSAN-PS5 %s
+// RUN:   | %{filecheck} --check-prefix=CHECK-UBSAN-PS5
 // CHECK-UBSAN-PS5: --dependent-lib=libSceUBSanitizer_nosubmission_stub_weak.a
 // CHECK-UBSAN-PS5: "{{.*}}ld{{(.gold)?(.exe)?}}"
 // CHECK-UBSAN-PS5: -lSceUBSanitizer_nosubmission_stub_weak
@@ -983,7 +1014,7 @@
 // RUN: not %clang -fsanitize=address -### %s 2>&1 \
 // RUN:     --target=x86_64-scei-ps4 -fuse-ld=ld \
 // RUN:     -shared \
-// RUN:   | FileCheck --check-prefix=CHECK-ASAN-PS4 %s
+// RUN:   | %{filecheck} --check-prefix=CHECK-ASAN-PS4
 // CHECK-ASAN-PS4: --dependent-lib=libSceDbgAddressSanitizer_stub_weak.a
 // CHECK-ASAN-PS4: "{{.*}}ld{{(.gold)?(.exe)?}}"
 // CHECK-ASAN-PS4: -lSceDbgAddressSanitizer_stub_weak
@@ -991,7 +1022,7 @@
 // RUN: not %clang -fsanitize=address -### %s 2>&1 \
 // RUN:     --target=x86_64-sie-ps5 -fuse-ld=ld \
 // RUN:     -shared \
-// RUN:   | FileCheck --check-prefix=CHECK-ASAN-PS5 %s
+// RUN:   | %{filecheck} --check-prefix=CHECK-ASAN-PS5
 // CHECK-ASAN-PS5: --dependent-lib=libSceAddressSanitizer_nosubmission_stub_weak.a
 // CHECK-ASAN-PS5: "{{.*}}ld{{(.gold)?(.exe)?}}"
 // CHECK-ASAN-PS5: -lSceAddressSanitizer_nosubmission_stub_weak
@@ -999,7 +1030,7 @@
 // RUN: not %clang -fsanitize=address,undefined -### %s 2>&1 \
 // RUN:     --target=x86_64-scei-ps4 -fuse-ld=ld \
 // RUN:     -shared \
-// RUN:   | FileCheck --check-prefix=CHECK-AUBSAN-PS4 %s
+// RUN:   | %{filecheck} --check-prefix=CHECK-AUBSAN-PS4
 // CHECK-AUBSAN-PS4-NOT: --dependent-lib=libSceDbgUBSanitizer_stub_weak.a
 // CHECK-AUBSAN-PS4: --dependent-lib=libSceDbgAddressSanitizer_stub_weak.a
 // CHECK-AUBSAN-PS4-NOT: --dependent-lib=libSceDbgUBSanitizer_stub_weak.a
@@ -1009,7 +1040,7 @@
 // RUN: not %clang -fsanitize=address,undefined -### %s 2>&1 \
 // RUN:     --target=x86_64-sie-ps5 -fuse-ld=ld \
 // RUN:     -shared \
-// RUN:   | FileCheck --check-prefix=CHECK-AUBSAN-PS5 %s
+// RUN:   | %{filecheck} --check-prefix=CHECK-AUBSAN-PS5
 // CHECK-AUBSAN-PS5-NOT: --dependent-lib=libSceUBSanitizer_nosubmission_stub_weak.a
 // CHECK-AUBSAN-PS5: --dependent-lib=libSceAddressSanitizer_nosubmission_stub_weak.a
 // CHECK-AUBSAN-PS5-NOT: --dependent-lib=libSceUBSanitizer_nosubmission_stub_weak.a
@@ -1020,21 +1051,21 @@
 // RUN:     --target=x86_64-scei-ps4 -fuse-ld=ld \
 // RUN:     -shared \
 // RUN:     -nostdlib \
-// RUN:   | FileCheck --check-prefix=CHECK-NOLIB-PS4 %s
+// RUN:   | %{filecheck} --check-prefix=CHECK-NOLIB-PS4
 // CHECK-NOLIB-PS4-NOT: SceDbgAddressSanitizer_stub_weak
 
 // RUN: not %clang -fsanitize=address,undefined -### %s 2>&1 \
 // RUN:     --target=x86_64-sie-ps5 -fuse-ld=ld \
 // RUN:     -shared \
 // RUN:     -nostdlib \
-// RUN:   | FileCheck --check-prefix=CHECK-NOLIB-PS5 %s
+// RUN:   | %{filecheck} --check-prefix=CHECK-NOLIB-PS5
 // CHECK-NOLIB-PS5-NOT: SceAddressSanitizer_nosubmission_stub_weak
 
 // RUN: %clang -fsanitize=scudo -### %s 2>&1 \
 // RUN:     --target=i386-unknown-linux -fuse-ld=ld \
 // RUN:     -resource-dir=%S/Inputs/resource_dir \
 // RUN:     --sysroot=%S/Inputs/basic_linux_tree \
-// RUN:   | FileCheck --check-prefix=CHECK-SCUDO-LINUX %s
+// RUN:   | %{filecheck} --check-prefix=CHECK-SCUDO-LINUX
 // CHECK-SCUDO-LINUX: "{{.*}}ld{{(.exe)?}}"
 // CHECK-SCUDO-LINUX: "--whole-archive" "{{.*}}libclang_rt.scudo_standalone.a" "--no-whole-archive"
 // CHECK-SCUDO-LINUX-NOT: "-lstdc++"
@@ -1046,11 +1077,10 @@
 // RUN:     --target=i386-unknown-linux -fuse-ld=ld -fsanitize=scudo -shared-libsan \
 // RUN:     -resource-dir=%S/Inputs/resource_dir \
 // RUN:     --sysroot=%S/Inputs/basic_linux_tree \
-// RUN:   | FileCheck --check-prefix=CHECK-SCUDO-SHARED-LINUX %s
+// RUN:   | %{filecheck} --check-prefix=CHECK-SCUDO-SHARED-LINUX
 //
 // CHECK-SCUDO-SHARED-LINUX: "{{(.*[^-.0-9A-Z_a-z])?}}ld{{(.exe)?}}"
 // CHECK-SCUDO-SHARED-LINUX-NOT: "-lc"
-// CHECK-SCUDO-SHARED-LINUX-NOT: libclang_rt.scudo_standalone.a"
 // CHECK-SCUDO-SHARED-LINUX: libclang_rt.scudo_standalone.so"
 // CHECK-SCUDO-SHARED-LINUX-NOT: "-lpthread"
 // CHECK-SCUDO-SHARED-LINUX-NOT: "-lrt"
@@ -1062,7 +1092,7 @@
 // RUN: %clang -### %s 2>&1 \
 // RUN:     --target=arm-linux-androideabi -fuse-ld=ld -fsanitize=scudo \
 // RUN:     --sysroot=%S/Inputs/basic_android_tree/sysroot \
-// RUN:   | FileCheck --check-prefix=CHECK-SCUDO-ANDROID %s
+// RUN:   | %{filecheck} --check-prefix=CHECK-SCUDO-ANDROID
 //
 // CHECK-SCUDO-ANDROID: "{{(.*[^.0-9A-Z_a-z])?}}ld.lld{{(.exe)?}}"
 // CHECK-SCUDO-ANDROID-NOT: "-lc"
@@ -1077,7 +1107,7 @@
 // RUN:     --target=arm-linux-androideabi -fuse-ld=ld -fsanitize=scudo \
 // RUN:     --sysroot=%S/Inputs/basic_android_tree/sysroot \
 // RUN:     -static-libsan \
-// RUN:   | FileCheck --check-prefix=CHECK-SCUDO-ANDROID-STATIC %s
+// RUN:   | %{filecheck} --check-prefix=CHECK-SCUDO-ANDROID-STATIC
 // CHECK-SCUDO-ANDROID-STATIC: "{{(.*[^.0-9A-Z_a-z])?}}ld.lld{{(.exe)?}}"
 // CHECK-SCUDO-ANDROID-STATIC: "-pie"
 // CHECK-SCUDO-ANDROID-STATIC: "--whole-archive" "{{.*}}libclang_rt.scudo_standalone.a" "--no-whole-archive"
@@ -1090,7 +1120,7 @@
 // RUN:     --target=x86_64-unknown-linux -fuse-ld=ld -fsanitize=hwaddress \
 // RUN:     -resource-dir=%S/Inputs/resource_dir \
 // RUN:     --sysroot=%S/Inputs/basic_linux_tree \
-// RUN:   | FileCheck --check-prefix=CHECK-HWASAN-X86-64-LINUX %s
+// RUN:   | %{filecheck} --check-prefix=CHECK-HWASAN-X86-64-LINUX
 //
 // CHECK-HWASAN-X86-64-LINUX: "{{(.*[^-.0-9A-Z_a-z])?}}ld{{(.exe)?}}"
 // CHECK-HWASAN-X86-64-LINUX-NOT: "-lc"
@@ -1107,11 +1137,12 @@
 // RUN:     --target=x86_64-unknown-linux -fuse-ld=ld -fsanitize=hwaddress \
 // RUN:     -shared-libsan -resource-dir=%S/Inputs/resource_dir \
 // RUN:     --sysroot=%S/Inputs/basic_linux_tree \
-// RUN:   | FileCheck --check-prefix=CHECK-SHARED-HWASAN-X86-64-LINUX %s
+// RUN:   | %{filecheck} --check-prefix=CHECK-SHARED-HWASAN-X86-64-LINUX
 //
 // CHECK-SHARED-HWASAN-X86-64-LINUX: "{{(.*[^-.0-9A-Z_a-z])?}}ld{{(.exe)?}}"
 // CHECK-SHARED-HWASAN-X86-64-LINUX-NOT: "-lc"
 // CHECK-SHARED-HWASAN-X86-64-LINUX: libclang_rt.hwasan.so"
+// CHECK-SHARED-HWASAN-X86-64-LINUX: libclang_rt.hwasan-preinit.a"
 // CHECK-SHARED-HWASAN-X86-64-LINUX-NOT: "-lpthread"
 // CHECK-SHARED-HWASAN-X86-64-LINUX-NOT: "-lrt"
 // CHECK-SHARED-HWASAN-X86-64-LINUX-NOT: "-ldl"
@@ -1123,7 +1154,7 @@
 // RUN:     --target=x86_64-unknown-linux -fuse-ld=ld -fsanitize=hwaddress \
 // RUN:     -shared-libsan -resource-dir=%S/Inputs/resource_dir \
 // RUN:     --sysroot=%S/Inputs/basic_linux_tree \
-// RUN:   | FileCheck --check-prefix=CHECK-DSO-SHARED-HWASAN-X86-64-LINUX %s
+// RUN:   | %{filecheck} --check-prefix=CHECK-DSO-SHARED-HWASAN-X86-64-LINUX
 //
 // CHECK-DSO-SHARED-HWASAN-X86-64-LINUX: "{{(.*[^-.0-9A-Z_a-z])?}}ld{{(.exe)?}}"
 // CHECK-DSO-SHARED-HWASAN-X86-64-LINUX-NOT: "-lc"
@@ -1139,7 +1170,7 @@
 // RUN:     --target=aarch64-unknown-linux -fuse-ld=ld -fsanitize=hwaddress \
 // RUN:     -resource-dir=%S/Inputs/resource_dir \
 // RUN:     --sysroot=%S/Inputs/basic_linux_tree \
-// RUN:   | FileCheck --check-prefix=CHECK-HWASAN-AARCH64-LINUX %s
+// RUN:   | %{filecheck} --check-prefix=CHECK-HWASAN-AARCH64-LINUX
 //
 // CHECK-HWASAN-AARCH64-LINUX: "{{(.*[^-.0-9A-Z_a-z])?}}ld{{(.exe)?}}"
 // CHECK-HWASAN-AARCH64-LINUX-NOT: "-lc"
@@ -1157,11 +1188,12 @@
 // RUN:     -shared-libsan \
 // RUN:     -resource-dir=%S/Inputs/resource_dir \
 // RUN:     --sysroot=%S/Inputs/basic_linux_tree \
-// RUN:   | FileCheck --check-prefix=CHECK-SHARED-HWASAN-AARCH64-LINUX %s
+// RUN:   | %{filecheck} --check-prefix=CHECK-SHARED-HWASAN-AARCH64-LINUX
 //
 // CHECK-SHARED-HWASAN-AARCH64-LINUX: "{{(.*[^-.0-9A-Z_a-z])?}}ld{{(.exe)?}}"
 // CHECK-SHARED-HWASAN-AARCH64-LINUX-NOT: "-lc"
 // CHECK-SHARED-HWASAN-AARCH64-LINUX: libclang_rt.hwasan.so"
+// CHECK-SHARED-HWASAN-AARCH64-LINUX: libclang_rt.hwasan-preinit.a"
 // CHECK-SHARED-HWASAN-AARCH64-LINUX-NOT: "-lpthread"
 // CHECK-SHARED-HWASAN-AARCH64-LINUX-NOT: "-lrt"
 // CHECK-SHARED-HWASAN-AARCH64-LINUX-NOT: "-ldl"
@@ -1173,7 +1205,7 @@
 // RUN:     --target=aarch64-unknown-linux -fuse-ld=ld -fsanitize=hwaddress \
 // RUN:     -shared-libsan -resource-dir=%S/Inputs/resource_dir \
 // RUN:     --sysroot=%S/Inputs/basic_linux_tree \
-// RUN:   | FileCheck --check-prefix=CHECK-DSO-SHARED-HWASAN-AARCH64-LINUX %s
+// RUN:   | %{filecheck} --check-prefix=CHECK-DSO-SHARED-HWASAN-AARCH64-LINUX
 //
 // CHECK-DSO-SHARED-HWASAN-AARCH64-LINUX: "{{(.*[^-.0-9A-Z_a-z])?}}ld{{(.exe)?}}"
 // CHECK-DSO-SHARED-HWASAN-AARCH64-LINUX-NOT: "-lc"
diff --git a/clang/test/Driver/spirv-openmp-toolchain.c b/clang/test/Driver/spirv-openmp-toolchain.c
new file mode 100644
index 0000000..3a94d97
--- /dev/null
+++ b/clang/test/Driver/spirv-openmp-toolchain.c
@@ -0,0 +1,64 @@
+// RUN: %clang -### --target=x86_64-unknown-linux-gnu -fopenmp=libomp -fopenmp-targets=spirv64-intel \
+// RUN:        --libomptarget-spirv-bc-path=%t/ -nogpulib %s 2>&1 \
+// RUN: | FileCheck %s
+
+// verify the tools invocations
+// CHECK: "-cc1" "-triple" "x86_64-unknown-linux-gnu"{{.*}}"-emit-llvm-bc"{{.*}}"-x" "c"
+// CHECK: "-cc1" "-triple" "spirv64-intel" "-aux-triple" "x86_64-unknown-linux-gnu"
+// CHECK: llvm-spirv{{.*}}
+// CHECK: "-cc1" "-triple" "x86_64-unknown-linux-gnu"{{.*}}"-emit-obj"
+// CHECK: clang-linker-wrapper{{.*}} "-o" "a.out"
+
+// RUN: %clang -ccc-print-phases --target=x86_64-unknown-linux-gnu -fopenmp=libomp -fopenmp-targets=spirv64-intel %s 2>&1 \
+// RUN: | FileCheck --check-prefix=CHECK-PHASES %s
+
+// CHECK-PHASES: 0: input, "[[INPUT:.+]]", c, (host-openmp)
+// CHECK-PHASES: 1: preprocessor, {0}, cpp-output, (host-openmp)
+// CHECK-PHASES: 2: compiler, {1}, ir, (host-openmp)
+// CHECK-PHASES: 3: input, "[[INPUT]]", c, (device-openmp)
+// CHECK-PHASES: 4: preprocessor, {3}, cpp-output, (device-openmp)
+// CHECK-PHASES: 5: compiler, {4}, ir, (device-openmp)
+// CHECK-PHASES: 6: offload, "host-openmp (x86_64-unknown-linux-gnu)" {2}, "device-openmp (spirv64-intel)" {5}, ir
+// CHECK-PHASES: 7: backend, {6}, assembler, (device-openmp)
+// CHECK-PHASES: 8: assembler, {7}, object, (device-openmp)
+// CHECK-PHASES: 9: offload, "device-openmp (spirv64-intel)" {8}, object
+// CHECK-PHASES: 10: clang-offload-packager, {9}, image, (device-openmp)
+// CHECK-PHASES: 11: offload, "host-openmp (x86_64-unknown-linux-gnu)" {2}, "device-openmp (x86_64-unknown-linux-gnu)" {10}, ir
+// CHECK-PHASES: 12: backend, {11}, assembler, (host-openmp)
+// CHECK-PHASES: 13: assembler, {12}, object, (host-openmp)
+// CHECK-PHASES: 14: clang-linker-wrapper, {13}, image, (host-openmp)
+
+// RUN: %clang -### --target=x86_64-unknown-linux-gnu -ccc-print-bindings -fopenmp=libomp -fopenmp-targets=spirv64-intel -nogpulib %s 2>&1 | FileCheck %s --check-prefix=CHECK-BINDINGS
+// RUN: %clang -### --target=x86_64-unknown-linux-gnu -ccc-print-bindings -fopenmp=libomp -fopenmp-targets=spirv64-intel -nogpulib %s 2>&1 | FileCheck %s --check-prefix=CHECK-BINDINGS
+
+// CHECK-BINDINGS: "x86_64-unknown-linux-gnu" - "clang", inputs: ["[[INPUT:.+]]"], output: "[[HOST_BC:.+]]"
+// CHECK-BINDINGS: "spirv64-intel" - "clang", inputs: ["[[INPUT]]", "[[HOST_BC]]"], output: "[[DEVICE_TEMP_BC:.+]]"
+// CHECK-BINDINGS: "spirv64-intel" - "SPIR-V::Translator", inputs: ["[[DEVICE_TEMP_BC]]"], output: "[[DEVICE_SPV:.+]]"
+// CHECK-BINDINGS: "x86_64-unknown-linux-gnu" - "Offload::Packager", inputs: ["[[DEVICE_SPV]]"], output: "[[DEVICE_IMAGE:.+]]"
+// CHECK-BINDINGS: "x86_64-unknown-linux-gnu" - "clang", inputs: ["[[HOST_BC]]", "[[DEVICE_IMAGE]]"], output: "[[HOST_OBJ:.+]]"
+// CHECK-BINDINGS: "x86_64-unknown-linux-gnu" - "Offload::Linker", inputs: ["[[HOST_OBJ]]"], output: "a.out"
+
+// RUN: %clang -### --target=x86_64-unknown-linux-gnu -ccc-print-bindings -save-temps -fopenmp=libomp -fopenmp-targets=spirv64-intel -nogpulib %s 2>&1 | FileCheck %s --check-prefix=CHECK-BINDINGS-TEMPS
+// RUN: %clang -### --target=x86_64-unknown-linux-gnu -ccc-print-bindings -save-temps -fopenmp=libomp -fopenmp-targets=spirv64-intel %s 2>&1 | FileCheck %s --check-prefix=CHECK-BINDINGS-TEMPS
+// CHECK-BINDINGS-TEMPS: "x86_64-unknown-linux-gnu" - "clang", inputs: ["[[INPUT:.+]]"], output: "[[HOST_PP:.+]]"
+// CHECK-BINDINGS-TEMPS: "x86_64-unknown-linux-gnu" - "clang", inputs: ["[[HOST_PP]]"], output: "[[HOST_BC:.+]]"
+// CHECK-BINDINGS-TEMPS: "spirv64-intel" - "clang", inputs: ["[[INPUT]]"], output: "[[DEVICE_PP:.+]]"
+// CHECK-BINDINGS-TEMPS: "spirv64-intel" - "clang", inputs: ["[[DEVICE_PP]]", "[[HOST_BC]]"], output: "[[DEVICE_TEMP_BC:.+]]"
+// CHECK-BINDINGS-TEMPS: "spirv64-intel" - "SPIR-V::Translator", inputs: ["[[DEVICE_TEMP_BC]]"], output: "[[DEVICE_ASM:.+]]"
+// CHECK-BINDINGS-TEMPS: "spirv64-intel" - "SPIR-V::Translator", inputs: ["[[DEVICE_ASM]]"], output: "[[DEVICE_SPV:.+]]"
+// CHECK-BINDINGS-TEMPS: "x86_64-unknown-linux-gnu" - "Offload::Packager", inputs: ["[[DEVICE_SPV]]"], output: "[[DEVICE_IMAGE:.+]]"
+// CHECK-BINDINGS-TEMPS: "x86_64-unknown-linux-gnu" - "clang", inputs: ["[[HOST_BC]]", "[[DEVICE_IMAGE]]"], output: "[[HOST_ASM:.+]]"
+// CHECK-BINDINGS-TEMPS: "x86_64-unknown-linux-gnu" - "clang::as", inputs: ["[[HOST_ASM]]"], output: "[[HOST_OBJ:.+]]"
+// CHECK-BINDINGS-TEMPS: "x86_64-unknown-linux-gnu" - "Offload::Linker", inputs: ["[[HOST_OBJ]]"], output: "a.out"
+
+// RUN: %clang -### --target=x86_64-unknown-linux-gnu -emit-llvm -S -fopenmp=libomp -fopenmp-targets=spirv64-intel -nogpulib %s 2>&1 | FileCheck %s --check-prefix=CHECK-EMIT-LLVM-IR
+// CHECK-EMIT-LLVM-IR: "-cc1" "-triple" "spirv64-intel"{{.*}}"-emit-llvm-bc"
+
+// RUN: %clang -### --target=x86_64-unknown-linux-gnu -fopenmp=libomp -fopenmp-targets=spirv64-intel \
+// RUN: --sysroot=%S/Inputs/spirv-openmp/ %s 2>&1 | FileCheck --check-prefix=CHECK-GPULIB %s
+// CHECK-GPULIB: "-cc1" "-triple" "spirv64-intel"{{.*}}"-mlink-builtin-bitcode" "{{.*}}libomptarget-spirv64.bc"
+
+// RUN: not %clang -### --target=x86_64-unknown-linux-gnu -fopenmp=libomp --offload-arch=spirv64-intel \
+// RUN:        --libomptarget-spirv-bc-path=%t/ -nogpulib %s 2>&1 \
+// RUN: | FileCheck %s --check-prefix=CHECK-OFFLOAD-ARCH-ERROR
+// CHECK-OFFLOAD-ARCH-ERROR: error: failed to deduce triple for target architecture 'spirv64-intel'; specify the triple using '-fopenmp-targets' and '-Xopenmp-target' instead
diff --git a/clang/test/Driver/sycl-offload-jit.cpp b/clang/test/Driver/sycl-offload-jit.cpp
new file mode 100644
index 0000000..eb192e0
--- /dev/null
+++ b/clang/test/Driver/sycl-offload-jit.cpp
@@ -0,0 +1,50 @@
+/// Perform several driver tests for SYCL offloading for JIT
+
+/// Check the phases graph with -fsycl. Use of -fsycl enables offload
+// RUN: %clang -ccc-print-phases --target=x86_64-unknown-linux-gnu -fsycl %s 2>&1 \
+// RUN:   | FileCheck -check-prefixes=CHK-PHASES %s
+// RUN: %clang_cl -ccc-print-phases --target=x86_64-pc-windows-msvc -fsycl -- %s 2>&1 \
+// RUN:   | FileCheck -check-prefixes=CHK-PHASES %s
+// CHK-PHASES: 0: input, "[[INPUT:.+\.cpp]]", c++, (host-sycl)
+// CHK-PHASES-NEXT: 1: preprocessor, {0}, c++-cpp-output, (host-sycl)
+// CHK-PHASES-NEXT: 2: compiler, {1}, ir, (host-sycl)
+// CHK-PHASES-NEXT: 3: input, "[[INPUT]]", c++, (device-sycl)
+// CHK-PHASES-NEXT: 4: preprocessor, {3}, c++-cpp-output, (device-sycl)
+// CHK-PHASES-NEXT: 5: compiler, {4}, ir, (device-sycl)
+// CHK-PHASES-NEXT: 6: backend, {5}, ir, (device-sycl)
+// CHK-PHASES-NEXT: 7: offload, "device-sycl (spirv64-unknown-unknown)" {6}, ir
+// CHK-PHASES-NEXT: 8: clang-offload-packager, {7}, image, (device-sycl)
+// CHK-PHASES-NEXT: 9: offload, "host-sycl (x86_64{{.*}})" {2}, "device-sycl (x86_64{{.*}})" {8}, ir
+// CHK-PHASES-NEXT: 10: backend, {9}, assembler, (host-sycl)
+// CHK-PHASES-NEXT: 11: assembler, {10}, object, (host-sycl)
+// CHK-PHASES-NEXT: 12: clang-linker-wrapper, {11}, image, (host-sycl)
+
+/// Check expected default values for device compilation when using -fsycl as
+/// well as clang-offload-packager inputs.
+// RUN: %clang -### -fsycl -c --target=x86_64-unknown-linux-gnu %s 2>&1 \
+// RUN:   | FileCheck -check-prefix=CHK-DEVICE-TRIPLE %s
+// CHK-DEVICE-TRIPLE: "-cc1"{{.*}} "-triple" "spirv64-unknown-unknown"
+// CHK-DEVICE-TRIPLE-SAME: "-aux-triple" "x86_64-unknown-linux-gnu"
+// CHK-DEVICE-TRIPLE-SAME: "-fsycl-is-device"
+// CHK-DEVICE-TRIPLE-SAME: "-O2"
+// CHK-DEVICE-TRIPLE: clang-offload-packager{{.*}} "--image=file={{.*}}.bc,triple=spirv64-unknown-unknown,arch=,kind=sycl"
+
+/// Check -fsycl-is-device is passed when compiling for the device.
+/// Check -fsycl-is-host is passed when compiling for host.
+// RUN: %clang -### -fsycl -c %s 2>&1 \
+// RUN:   | FileCheck -check-prefixes=CHK-FSYCL-IS-DEVICE,CHK-FSYCL-IS-HOST %s
+// RUN: %clang -### -fsycl -fsycl-device-only %s 2>&1 \
+// RUN:   | FileCheck -check-prefix=CHK-FSYCL-IS-DEVICE %s
+// RUN: %clang_cl -### -fsycl -c -- %s 2>&1 \
+// RUN:   | FileCheck -check-prefixes=CHK-FSYCL-IS-DEVICE,CHK-FSYCL-IS-HOST %s
+// RUN: %clang -### -fsycl -fsycl-host-only %s 2>&1 \
+// RUN:   | FileCheck -check-prefix=CHK-FSYCL-IS-HOST %s
+// CHK-FSYCL-IS-DEVICE: "-cc1"{{.*}} "-fsycl-is-device" {{.*}} "-emit-llvm-bc"
+// CHK-FSYCL-IS-HOST: "-cc1"{{.*}} "-fsycl-is-host"
+
+/// Check for option incompatibility with -fsycl
+// RUN: not %clang -### -fsycl -ffreestanding %s 2>&1 \
+// RUN:   | FileCheck -check-prefix=CHK-INCOMPATIBILITY %s -DINCOMPATOPT=-ffreestanding
+// RUN: not %clang -### -fsycl --offload-new-driver -static-libstdc++ %s 2>&1 \
+// RUN:   | FileCheck -check-prefix=CHK-INCOMPATIBILITY %s -DINCOMPATOPT=-static-libstdc++
+// CHK-INCOMPATIBILITY: error: invalid argument '[[INCOMPATOPT]]' not allowed with '-fsycl'
diff --git a/clang/test/Driver/uefi-constructed-args.c b/clang/test/Driver/uefi-constructed-args.c
index e90857b..3cc5abe 100644
--- a/clang/test/Driver/uefi-constructed-args.c
+++ b/clang/test/Driver/uefi-constructed-args.c
@@ -4,6 +4,7 @@
 // CHECK-SAME: "-triple" "x86_64-unknown-uefi"
 // CHECK-SAME: "-mrelocation-model" "pic" "-pic-level" "2"
 // CHECK-SAME: "-mframe-pointer=all"
+// CHECK-SAME: "-fms-extensions"
 // CHECK-NEXT: "-nologo"
 // CHECK-SAME: "-subsystem:efi_application"
 // CHECK-SAME: "-entry:EfiMain"
diff --git a/clang/test/Frontend/dependency-gen-symlink.c b/clang/test/Frontend/dependency-gen-symlink.c
index 2fa339a..15664a4 100644
--- a/clang/test/Frontend/dependency-gen-symlink.c
+++ b/clang/test/Frontend/dependency-gen-symlink.c
@@ -15,7 +15,7 @@
 // CHECK: dependency-gen-symlink.c.o
 // CHECK: dependency-gen-symlink.c
 // CHECK: a/header.h
-// CHECK: b/header.h
+// CHECK-NOT: b/header.h
 // CHECK-NOT: with-header-guard.h
 #include "a/header.h"
 #include "b/header.h"
diff --git a/clang/test/Frontend/dependency-gen-windows-duplicates.c b/clang/test/Frontend/dependency-gen-windows-duplicates.c
index abd3513..0ecc232 100644
--- a/clang/test/Frontend/dependency-gen-windows-duplicates.c
+++ b/clang/test/Frontend/dependency-gen-windows-duplicates.c
@@ -9,7 +9,7 @@
 // RUN: %clang -MD -MF - %t.dir/test.c -fsyntax-only -I %t.dir/subdir | FileCheck %s
 // CHECK: test.o:
 // CHECK-NEXT: \test.c
-// CHECK-NEXT: \SubDir\X.h
+// CHECK-NEXT: \subdir\x.h
 // File x.h must appear only once (case insensitive check).
 // CHECK-NOT: {{\\|/}}{{x|X}}.{{h|H}}
 
diff --git a/clang/test/Headers/gpuintrin.c b/clang/test/Headers/gpuintrin.c
index 2e45f73..2813397 100644
--- a/clang/test/Headers/gpuintrin.c
+++ b/clang/test/Headers/gpuintrin.c
@@ -44,7 +44,7 @@
 // AMDGPU-NEXT:    call void @__gpu_exit() #[[ATTR8:[0-9]+]]
 // AMDGPU-NEXT:    unreachable
 //
-// NVPTX-LABEL: define protected void @foo(
+// NVPTX-LABEL: define protected ptx_kernel void @foo(
 // NVPTX-SAME: ) #[[ATTR0:[0-9]+]] {
 // NVPTX-NEXT:  [[ENTRY:.*:]]
 // NVPTX-NEXT:    [[CALL:%.*]] = call i32 @__gpu_num_blocks_x() #[[ATTR6:[0-9]+]]
diff --git a/clang/test/Misc/target-invalid-cpu-note/riscv.c b/clang/test/Misc/target-invalid-cpu-note/riscv.c
index fc8536d..fb54dcb5 100644
--- a/clang/test/Misc/target-invalid-cpu-note/riscv.c
+++ b/clang/test/Misc/target-invalid-cpu-note/riscv.c
@@ -29,6 +29,7 @@
 // RISCV64-SAME: {{^}}, rocket-rv64
 // RISCV64-SAME: {{^}}, sifive-p450
 // RISCV64-SAME: {{^}}, sifive-p470
+// RISCV64-SAME: {{^}}, sifive-p550
 // RISCV64-SAME: {{^}}, sifive-p670
 // RISCV64-SAME: {{^}}, sifive-s21
 // RISCV64-SAME: {{^}}, sifive-s51
@@ -77,6 +78,7 @@
 // TUNE-RISCV64-SAME: {{^}}, rocket-rv64
 // TUNE-RISCV64-SAME: {{^}}, sifive-p450
 // TUNE-RISCV64-SAME: {{^}}, sifive-p470
+// TUNE-RISCV64-SAME: {{^}}, sifive-p550
 // TUNE-RISCV64-SAME: {{^}}, sifive-p670
 // TUNE-RISCV64-SAME: {{^}}, sifive-s21
 // TUNE-RISCV64-SAME: {{^}}, sifive-s51
diff --git a/clang/test/Modules/expose-static-inline-from-gmf-1.cppm b/clang/test/Modules/expose-static-inline-from-gmf-1.cppm
new file mode 100644
index 0000000..4de9b58
--- /dev/null
+++ b/clang/test/Modules/expose-static-inline-from-gmf-1.cppm
@@ -0,0 +1,37 @@
+// RUN: rm -rf %t
+// RUN: mkdir -p %t
+// RUN: split-file %s %t
+//
+// RUN: %clang -std=c++20 %t/a.cppm --precompile -o %t/a.pcm \
+// RUN:   -DTEST_INLINE
+// RUN: %clang -std=c++20 %t/test.cc -fprebuilt-module-path=%t -fsyntax-only -Xclang -verify \
+// RUN:   -DTEST_INLINE
+//
+// RUN: %clang -std=c++20 %t/a.cppm --precompile -o %t/a.pcm
+// RUN: %clang -std=c++20 %t/test.cc -fprebuilt-module-path=%t -fsyntax-only -Xclang -verify
+
+//--- a.h
+#ifdef TEST_INLINE
+#define INLINE inline
+#else
+#define INLINE
+#endif
+static INLINE void func(long) {}
+template <typename T = long> void a() { func(T{}); }
+
+//--- a.cppm
+module;
+#include "a.h"
+export module a;
+export using ::a;
+
+//--- test.cc
+import a;
+auto m = (a(), 0);
+
+#ifdef TEST_INLINE
+// expected-no-diagnostics
+#else
+// expected-error@a.h:7 {{no matching function for call to 'func'}}
+// expected-note@test.cc:2 {{in instantiation of function template specialization 'a<long>' requested here}}
+#endif
diff --git a/clang/test/Modules/expose-static-inline-from-gmf-2.cppm b/clang/test/Modules/expose-static-inline-from-gmf-2.cppm
new file mode 100644
index 0000000..c89b613
--- /dev/null
+++ b/clang/test/Modules/expose-static-inline-from-gmf-2.cppm
@@ -0,0 +1,22 @@
+// RUN: rm -rf %t
+// RUN: mkdir -p %t
+// RUN: split-file %s %t
+//
+// RUN: %clang -std=c++20 %t/a.cppm --precompile -o %t/a.pcm
+// RUN: %clang -std=c++20 %t/test.cc -fprebuilt-module-path=%t -fsyntax-only -Xclang -verify
+
+//--- a.h
+template <typename G> static inline void func() {}
+template <typename T = long> void a() { func<T>(); }
+
+//--- a.cppm
+module;
+#include "a.h"
+export module a;
+export using ::a;
+
+//--- test.cc
+import a;
+auto m = (a(), 0);
+
+// expected-no-diagnostics
diff --git a/clang/test/Modules/expose-static-inline-from-gmf-3.cppm b/clang/test/Modules/expose-static-inline-from-gmf-3.cppm
new file mode 100644
index 0000000..dee7cdd
--- /dev/null
+++ b/clang/test/Modules/expose-static-inline-from-gmf-3.cppm
@@ -0,0 +1,24 @@
+// RUN: rm -rf %t
+// RUN: mkdir -p %t
+// RUN: split-file %s %t
+//
+// RUN: %clang -std=c++20 %t/a.cppm --precompile -o %t/a.pcm
+// RUN: %clang -std=c++20 %t/test.cc -fprebuilt-module-path=%t -fsyntax-only -Xclang -verify
+
+//--- a.h
+namespace ns {
+template <typename G> static void func() {}
+template <typename T = long> void a() { func<T>(); }
+}
+
+//--- a.cppm
+module;
+#include "a.h"
+export module a;
+export using ns::a;
+
+//--- test.cc
+import a;
+auto m = (a(), 0);
+
+// expected-no-diagnostics
diff --git a/clang/test/Modules/expose-static-inline-from-gmf-4.cppm b/clang/test/Modules/expose-static-inline-from-gmf-4.cppm
new file mode 100644
index 0000000..09c6b1f
--- /dev/null
+++ b/clang/test/Modules/expose-static-inline-from-gmf-4.cppm
@@ -0,0 +1,40 @@
+// RUN: rm -rf %t
+// RUN: mkdir -p %t
+// RUN: split-file %s %t
+//
+// RUN: %clang -std=c++20 %t/a.cppm --precompile -o %t/a.pcm \
+// RUN:   -DTEST_INLINE
+// RUN: %clang -std=c++20 %t/test.cc -fprebuilt-module-path=%t -fsyntax-only -Xclang -verify \
+// RUN:   -DTEST_INLINE
+//
+// RUN: %clang -std=c++20 %t/a.cppm --precompile -o %t/a.pcm
+// RUN: %clang -std=c++20 %t/test.cc -fprebuilt-module-path=%t -fsyntax-only -Xclang -verify
+
+//--- a.h
+#ifdef TEST_INLINE
+#define INLINE inline
+#else
+#define INLINE
+#endif
+namespace ns {
+template <typename G> static void func() {}
+template <> INLINE void func<long>() {}
+template <typename T = long> void a() { func<T>(); }
+}
+
+//--- a.cppm
+module;
+#include "a.h"
+export module a;
+export using ns::a;
+
+//--- test.cc
+import a;
+auto m = (a(), 0);
+
+#ifdef TEST_INLINE
+// expected-no-diagnostics
+#else
+// expected-error@a.h:9 {{no matching function for call to 'func'}}
+// expected-note@test.cc:2 {{in instantiation of function template specialization 'ns::a<long>' requested here}}
+#endif
diff --git a/clang/test/Modules/expose-static-inline-from-gmf-5.cppm b/clang/test/Modules/expose-static-inline-from-gmf-5.cppm
new file mode 100644
index 0000000..334af845
--- /dev/null
+++ b/clang/test/Modules/expose-static-inline-from-gmf-5.cppm
@@ -0,0 +1,26 @@
+// RUN: rm -rf %t
+// RUN: mkdir -p %t
+// RUN: split-file %s %t
+//
+// RUN: %clang -std=c++20 %t/a.cppm --precompile -o %t/a.pcm
+// RUN: %clang -std=c++20 %t/test.cc -fprebuilt-module-path=%t -fsyntax-only -Xclang -verify
+
+//--- a.h
+namespace ns {
+namespace {
+template <typename G> void func() {}
+}
+template <typename T = long> void a() { func<T>(); }
+}
+
+//--- a.cppm
+module;
+#include "a.h"
+export module a;
+export using ns::a;
+
+//--- test.cc
+import a;
+auto m = (a(), 0);
+
+// expected-no-diagnostics
diff --git a/clang/test/Modules/missing-body-in-import.cpp b/clang/test/Modules/missing-body-in-import.cpp
new file mode 100644
index 0000000..b52ebba
--- /dev/null
+++ b/clang/test/Modules/missing-body-in-import.cpp
@@ -0,0 +1,42 @@
+// RUN: rm -rf %t
+// RUN: split-file %s %t
+// RUN: cd %t
+
+// RUN: %clang_cc1 -std=c++23 mod1.cppm -emit-module-interface -o mod1.pcm -fallow-pcm-with-compiler-errors -verify
+// RUN: %clang_cc1 -std=c++23 mod2.cppm -emit-module-interface -o mod2.pcm -fmodule-file=mod1=mod1.pcm -verify -fallow-pcm-with-compiler-errors
+// RUN: %clang_cc1 -std=c++23 mod3.cppm -emit-module-interface -o mod3.pcm -fmodule-file=mod1=mod1.pcm -fmodule-file=mod2=mod2.pcm -verify -fallow-pcm-with-compiler-errors
+// RUN: %clang_cc1 -std=c++23 main.cpp -fmodule-file=mod1=mod1.pcm -fmodule-file=mod2=mod2.pcm -fmodule-file=mod3=mod3.pcm -verify -fallow-pcm-with-compiler-errors -ast-dump-all
+
+//--- mod1.cppm
+export module mod1;
+
+export template <unsigned N, unsigned M>
+class A {
+public:
+  constexpr A(const char[], const char[]) {
+    auto x = BrokenExpr; // expected-error {{use of undeclared identifier 'BrokenExpr'}}
+  }
+};
+
+export template<A<1,1> NTTP>
+struct B {};
+
+template < unsigned N, unsigned M >
+A(const char (&)[N], const char (&)[M]) -> A< 1, 1 >;
+
+//--- mod2.cppm
+export module mod2;
+import mod1;
+
+struct C: B <A{"a", "b"}> { // expected-error {{non-type template argument is not a constant expression}}
+  constexpr C(int a) { }
+};
+
+//--- mod3.cppm
+// expected-no-diagnostics
+export module mod3;
+export import mod2;
+
+//--- main.cpp
+// expected-no-diagnostics
+import mod3; // no crash
diff --git a/clang/test/Modules/pcm-with-errors.cpp b/clang/test/Modules/pcm-with-errors.cpp
new file mode 100644
index 0000000..1bbc386
--- /dev/null
+++ b/clang/test/Modules/pcm-with-errors.cpp
@@ -0,0 +1,26 @@
+// RUN: rm -rf %t
+// RUN: split-file %s %t
+// RUN: cd %t
+
+// RUN: %clang_cc1 -std=c++23 m.cppm -emit-module-interface -o m.pcm -fallow-pcm-with-compiler-errors -verify
+// RUN: %clang_cc1 -std=c++23 main.cpp -fmodule-file=m=m.pcm -verify -fallow-pcm-with-compiler-errors -verify
+
+// RUN: %clang_cc1 -std=c++23 m.cppm -fmodules-reduced-bmi -emit-module-interface -o m.pcm -fallow-pcm-with-compiler-errors -verify
+// RUN: %clang_cc1 -std=c++23 main.cpp -fmodule-file=m=m.pcm -verify -fallow-pcm-with-compiler-errors -verify
+
+//--- m.cppm
+export module m;
+
+export int f() {
+  return 0;
+}
+
+export struct Foo {
+  __Int bar; // expected-error {{unknown type name '__Int'}}
+};
+
+//--- main.cpp
+// expected-no-diagnostics
+import m; // ok
+
+static_assert(__is_same(decltype(f), int())); // ok
diff --git a/clang/test/Modules/pr121066.cpp b/clang/test/Modules/pr121066.cpp
new file mode 100644
index 0000000..e92a81c
--- /dev/null
+++ b/clang/test/Modules/pr121066.cpp
@@ -0,0 +1,4 @@
+// RUN: %clang_cc1 -std=c++20 -fsyntax-only %s -verify 
+
+import mod // expected-error {{expected ';' after module name}}
+           // expected-error@-1 {{module 'mod' not found}}
diff --git a/clang/test/OpenMP/declare_simd_aarch64.c b/clang/test/OpenMP/declare_simd_aarch64.c
index 21c83c2..e9538e7 100644
--- a/clang/test/OpenMP/declare_simd_aarch64.c
+++ b/clang/test/OpenMP/declare_simd_aarch64.c
@@ -1,8 +1,8 @@
 // REQUIRES: aarch64-registered-target
 // -fopemp and -fopenmp-simd behavior are expected to be the same.
 
-// RUN: %clang_cc1 -triple aarch64-linux-gnu -target-feature +neon -fopenmp -x c -emit-llvm %s -o - -femit-all-decls | FileCheck %s --check-prefix=AARCH64
-// RUN: %clang_cc1 -triple aarch64-linux-gnu -target-feature +neon -fopenmp-simd -x c -emit-llvm %s -o - -femit-all-decls | FileCheck %s --check-prefix=AARCH64
+// RUN: %clang_cc1 -triple aarch64-linux-gnu -target-feature +neon -fmath-errno -fopenmp -x c -emit-llvm %s -o - -femit-all-decls | FileCheck %s --check-prefix=AARCH64
+// RUN: %clang_cc1 -triple aarch64-linux-gnu -target-feature +neon -fmath-errno -fopenmp-simd -x c -emit-llvm %s -o - -femit-all-decls | FileCheck %s --check-prefix=AARCH64
 
 #pragma omp declare simd
 #pragma omp declare simd simdlen(2)
diff --git a/clang/test/OpenMP/irbuilder_simd_aligned.cpp b/clang/test/OpenMP/irbuilder_simd_aligned.cpp
index 1c3dc49..721fde6 100644
--- a/clang/test/OpenMP/irbuilder_simd_aligned.cpp
+++ b/clang/test/OpenMP/irbuilder_simd_aligned.cpp
@@ -70,8 +70,11 @@ void simple(float *a, float *b, int *c) {
 // CHECK-NEXT:    br label [[FOR_COND]], !llvm.loop [[LOOP3:![0-9]+]]
 // CHECK:       for.end:
 // CHECK-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    call void @llvm.assume(i1 true) [ "align"(ptr [[TMP4]], i64 128) ]
 // CHECK-NEXT:    [[TMP5:%.*]] = load ptr, ptr [[P]], align 8
+// CHECK-NEXT:    call void @llvm.assume(i1 true) [ "align"(ptr [[TMP5]], i64 64) ]
 // CHECK-NEXT:    [[ARRAYDECAY:%.*]] = getelementptr inbounds [32 x i32], ptr [[D]], i64 0, i64 0
+// CHECK-NEXT:    call void @llvm.assume(i1 true) [ "align"(ptr [[ARRAYDECAY]], i64 16) ]
 // CHECK-NEXT:    store i32 3, ptr [[I1]], align 4
 // CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds nuw [[STRUCT_ANON]], ptr [[AGG_CAPTURED]], i32 0, i32 0
 // CHECK-NEXT:    store ptr [[I1]], ptr [[TMP6]], align 8
@@ -82,9 +85,6 @@ void simple(float *a, float *b, int *c) {
 // CHECK-NEXT:    [[DOTCOUNT:%.*]] = load i32, ptr [[DOTCOUNT_ADDR]], align 4
 // CHECK-NEXT:    br label [[OMP_LOOP_PREHEADER:%.*]]
 // CHECK:       omp_loop.preheader:
-// CHECK-NEXT:    call void @llvm.assume(i1 true) [ "align"(ptr [[TMP4]], i64 128) ]
-// CHECK-NEXT:    call void @llvm.assume(i1 true) [ "align"(ptr [[TMP5]], i64 64) ]
-// CHECK-NEXT:    call void @llvm.assume(i1 true) [ "align"(ptr [[ARRAYDECAY]], i64 16) ]
 // CHECK-NEXT:    br label [[OMP_LOOP_HEADER:%.*]]
 // CHECK:       omp_loop.header:
 // CHECK-NEXT:    [[OMP_LOOP_IV:%.*]] = phi i32 [ 0, [[OMP_LOOP_PREHEADER]] ], [ [[OMP_LOOP_NEXT:%.*]], [[OMP_LOOP_INC:%.*]] ]
diff --git a/clang/test/ParserOpenACC/parse-clauses.c b/clang/test/ParserOpenACC/parse-clauses.c
index 8a404a5..73a0969 100644
--- a/clang/test/ParserOpenACC/parse-clauses.c
+++ b/clang/test/ParserOpenACC/parse-clauses.c
@@ -343,23 +343,16 @@ struct HasMembersArray {
 void SelfUpdate() {
   struct Members s;
 
-  // expected-error@+2{{expected '('}}
-  // expected-warning@+1{{OpenACC construct 'update' not yet implemented, pragma ignored}}
+  // expected-error@+1{{expected '('}}
 #pragma acc update self
   for(int i = 0; i < 5;++i) {}
 
-  // expected-error@+6{{use of undeclared identifier 'zero'}}
-  // expected-error@+5{{expected ','}}
-  // expected-error@+4{{expected expression}}
-  // expected-warning@+3{{OpenACC clause 'self' not yet implemented, clause ignored}}
-  // expected-warning@+2{{OpenACC clause 'if_present' not yet implemented, clause ignored}}
-  // expected-warning@+1{{OpenACC construct 'update' not yet implemented, pragma ignored}}
+  // expected-error@+3{{use of undeclared identifier 'zero'}}
+  // expected-error@+2{{expected ','}}
+  // expected-error@+1{{expected expression}}
 #pragma acc update self(zero : s.array[s.value : 5], s.value), if_present
   for(int i = 0; i < 5;++i) {}
 
-  // expected-warning@+3{{OpenACC clause 'self' not yet implemented, clause ignored}}
-  // expected-warning@+2{{OpenACC clause 'if_present' not yet implemented, clause ignored}}
-  // expected-warning@+1{{OpenACC construct 'update' not yet implemented, pragma ignored}}
 #pragma acc update self(s.array[s.value : 5], s.value), if_present
   for(int i = 0; i < 5;++i) {}
 }
@@ -821,28 +814,24 @@ void IntExprParsing() {
 #pragma acc init device_num(returns_int())
 
   // expected-error@+2{{expected '('}}
-  // expected-warning@+1{{OpenACC construct 'set' not yet implemented, pragma ignored}}
+  // expected-error@+1{{OpenACC 'set' construct must have at least one 'default_async', 'device_num', 'device_type' or 'if' clause}}
 #pragma acc set default_async
 
   // expected-error@+2{{expected expression}}
-  // expected-warning@+1{{OpenACC construct 'set' not yet implemented, pragma ignored}}
+  // expected-error@+1{{OpenACC 'set' construct must have at least one 'default_async', 'device_num', 'device_type' or 'if' clause}}
 #pragma acc set default_async()
 
   // expected-error@+2{{use of undeclared identifier 'invalid'}}
-  // expected-warning@+1{{OpenACC construct 'set' not yet implemented, pragma ignored}}
+  // expected-error@+1{{OpenACC 'set' construct must have at least one 'default_async', 'device_num', 'device_type' or 'if' clause}}
 #pragma acc set default_async(invalid)
 
   // expected-error@+3{{expected ')'}}
   // expected-note@+2{{to match this '('}}
-  // expected-warning@+1{{OpenACC construct 'set' not yet implemented, pragma ignored}}
+  // expected-error@+1{{OpenACC 'set' construct must have at least one 'default_async', 'device_num', 'device_type' or 'if' clause}}
 #pragma acc set default_async(5, 4)
 
-  // expected-warning@+2{{OpenACC clause 'default_async' not yet implemented, clause ignored}}
-  // expected-warning@+1{{OpenACC construct 'set' not yet implemented, pragma ignored}}
 #pragma acc set default_async(5)
 
-  // expected-warning@+2{{OpenACC clause 'default_async' not yet implemented, clause ignored}}
-  // expected-warning@+1{{OpenACC construct 'set' not yet implemented, pragma ignored}}
 #pragma acc set default_async(returns_int())
 
 
diff --git a/clang/test/ParserOpenACC/parse-constructs.c b/clang/test/ParserOpenACC/parse-constructs.c
index 878c38e..9948e33 100644
--- a/clang/test/ParserOpenACC/parse-constructs.c
+++ b/clang/test/ParserOpenACC/parse-constructs.c
@@ -148,11 +148,10 @@ void func() {
 #pragma acc shutdown clause list
   for(;;){}
   // expected-error@+2{{invalid OpenACC clause 'clause'}}
-  // expected-warning@+1{{OpenACC construct 'set' not yet implemented, pragma ignored}}
+  // expected-error@+1{{OpenACC 'set' construct must have at least one 'default_async', 'device_num', 'device_type' or 'if' clause}}
 #pragma acc set clause list
   for(;;){}
-  // expected-error@+2{{invalid OpenACC clause 'clause'}}
-  // expected-warning@+1{{OpenACC construct 'update' not yet implemented, pragma ignored}}
+  // expected-error@+1{{invalid OpenACC clause 'clause'}}
 #pragma acc update clause list
   for(;;){}
 }
diff --git a/clang/test/Preprocessor/init.c b/clang/test/Preprocessor/init.c
index 3b99204..5999b9c 100644
--- a/clang/test/Preprocessor/init.c
+++ b/clang/test/Preprocessor/init.c
@@ -2746,3 +2746,252 @@
 // RUN: %clang_cc1 -dM -triple=x86_64-uefi -E /dev/null | FileCheck -match-full-lines -check-prefix UEFI %s
 
 // UEFI: #define __UEFI__ 1
+
+// RUN: %clang_cc1 -E -dM -ffreestanding -fgnuc-version=4.2.1 -triple=xtensa  < /dev/null \
+// RUN:   | FileCheck -match-full-lines -check-prefix=XTENSA %s
+// XTENSA: #define _ILP32 1
+// XTENSA: #define __ATOMIC_ACQUIRE 2
+// XTENSA: #define __ATOMIC_ACQ_REL 4
+// XTENSA: #define __ATOMIC_CONSUME 1
+// XTENSA: #define __ATOMIC_RELAXED 0
+// XTENSA: #define __ATOMIC_RELEASE 3
+// XTENSA: #define __ATOMIC_SEQ_CST 5
+// XTENSA: #define __BIGGEST_ALIGNMENT__ 4
+// XTENSA: #define __BITINT_MAXWIDTH__ 128
+// XTENSA: #define __BOOL_WIDTH__ 1
+// XTENSA: #define __BYTE_ORDER__ __ORDER_LITTLE_ENDIAN__
+// XTENSA: #define __CHAR16_TYPE__ unsigned short
+// XTENSA: #define __CHAR32_TYPE__ unsigned int
+// XTENSA: #define __CHAR_BIT__ 8
+// XTENSA: #define __CLANG_ATOMIC_BOOL_LOCK_FREE 2
+// XTENSA: #define __CLANG_ATOMIC_CHAR16_T_LOCK_FREE 2
+// XTENSA: #define __CLANG_ATOMIC_CHAR32_T_LOCK_FREE 2
+// XTENSA: #define __CLANG_ATOMIC_CHAR_LOCK_FREE 2
+// XTENSA: #define __CLANG_ATOMIC_INT_LOCK_FREE 2
+// XTENSA: #define __CLANG_ATOMIC_LLONG_LOCK_FREE 1
+// XTENSA: #define __CLANG_ATOMIC_LONG_LOCK_FREE 2
+// XTENSA: #define __CLANG_ATOMIC_POINTER_LOCK_FREE 2
+// XTENSA: #define __CLANG_ATOMIC_SHORT_LOCK_FREE 2
+// XTENSA: #define __CLANG_ATOMIC_WCHAR_T_LOCK_FREE 2
+// XTENSA: #define __CONSTANT_CFSTRINGS__ 1
+// XTENSA: #define __DBL_DECIMAL_DIG__ 17
+// XTENSA: #define __DBL_DENORM_MIN__ 4.9406564584124654e-324
+// XTENSA: #define __DBL_DIG__ 15
+// XTENSA: #define __DBL_EPSILON__ 2.2204460492503131e-16
+// XTENSA: #define __DBL_HAS_DENORM__ 1
+// XTENSA: #define __DBL_HAS_INFINITY__ 1
+// XTENSA: #define __DBL_HAS_QUIET_NAN__ 1
+// XTENSA: #define __DBL_MANT_DIG__ 53
+// XTENSA: #define __DBL_MAX_10_EXP__ 308
+// XTENSA: #define __DBL_MAX_EXP__ 1024
+// XTENSA: #define __DBL_MAX__ 1.7976931348623157e+308
+// XTENSA: #define __DBL_MIN_10_EXP__ (-307)
+// XTENSA: #define __DBL_MIN_EXP__ (-1021)
+// XTENSA: #define __DBL_MIN__ 2.2250738585072014e-308
+// XTENSA: #define __DBL_NORM_MAX__ 1.7976931348623157e+308
+// XTENSA: #define __DECIMAL_DIG__ __LDBL_DECIMAL_DIG__
+// XTENSA: #define __ELF__ 1
+// XTENSA: #define __FINITE_MATH_ONLY__ 0
+// XTENSA: #define __FLT_DECIMAL_DIG__ 9
+// XTENSA: #define __FLT_DENORM_MIN__ 1.40129846e-45F
+// XTENSA: #define __FLT_DIG__ 6
+// XTENSA: #define __FLT_EPSILON__ 1.19209290e-7F
+// XTENSA: #define __FLT_HAS_DENORM__ 1
+// XTENSA: #define __FLT_HAS_INFINITY__ 1
+// XTENSA: #define __FLT_HAS_QUIET_NAN__ 1
+// XTENSA: #define __FLT_MANT_DIG__ 24
+// XTENSA: #define __FLT_MAX_10_EXP__ 38
+// XTENSA: #define __FLT_MAX_EXP__ 128
+// XTENSA: #define __FLT_MAX__ 3.40282347e+38F
+// XTENSA: #define __FLT_MIN_10_EXP__ (-37)
+// XTENSA: #define __FLT_MIN_EXP__ (-125)
+// XTENSA: #define __FLT_MIN__ 1.17549435e-38F
+// XTENSA: #define __FLT_NORM_MAX__ 3.40282347e+38F
+// XTENSA: #define __FLT_RADIX__ 2
+// XTENSA: #define __FPCLASS_NEGINF 0x0004
+// XTENSA: #define __FPCLASS_NEGNORMAL 0x0008
+// XTENSA: #define __FPCLASS_NEGSUBNORMAL 0x0010
+// XTENSA: #define __FPCLASS_NEGZERO 0x0020
+// XTENSA: #define __FPCLASS_POSINF 0x0200
+// XTENSA: #define __FPCLASS_POSNORMAL 0x0100
+// XTENSA: #define __FPCLASS_POSSUBNORMAL 0x0080
+// XTENSA: #define __FPCLASS_POSZERO 0x0040
+// XTENSA: #define __FPCLASS_QNAN 0x0002
+// XTENSA: #define __FPCLASS_SNAN 0x0001
+// XTENSA: #define __GCC_ATOMIC_BOOL_LOCK_FREE 2
+// XTENSA: #define __GCC_ATOMIC_CHAR16_T_LOCK_FREE 2
+// XTENSA: #define __GCC_ATOMIC_CHAR32_T_LOCK_FREE 2
+// XTENSA: #define __GCC_ATOMIC_CHAR_LOCK_FREE 2
+// XTENSA: #define __GCC_ATOMIC_INT_LOCK_FREE 2
+// XTENSA: #define __GCC_ATOMIC_LLONG_LOCK_FREE 1
+// XTENSA: #define __GCC_ATOMIC_LONG_LOCK_FREE 2
+// XTENSA: #define __GCC_ATOMIC_POINTER_LOCK_FREE 2
+// XTENSA: #define __GCC_ATOMIC_SHORT_LOCK_FREE 2
+// XTENSA: #define __GCC_ATOMIC_TEST_AND_SET_TRUEVAL 1
+// XTENSA: #define __GCC_ATOMIC_WCHAR_T_LOCK_FREE 2
+// XTENSA: #define __GCC_CONSTRUCTIVE_SIZE 64
+// XTENSA: #define __GCC_DESTRUCTIVE_SIZE 64
+// XTENSA: #define __GNUC_MINOR__ {{.*}}
+// XTENSA: #define __GNUC_PATCHLEVEL__ {{.*}}
+// XTENSA: #define __GNUC_STDC_INLINE__ 1
+// XTENSA: #define __GNUC__ {{.*}}
+// XTENSA: #define __GXX_ABI_VERSION {{.*}}
+// XTENSA: #define __ILP32__ 1
+// XTENSA: #define __INT16_C_SUFFIX__ 
+// XTENSA: #define __INT16_MAX__ 32767
+// XTENSA: #define __INT16_TYPE__ short
+// XTENSA: #define __INT32_C_SUFFIX__ 
+// XTENSA: #define __INT32_MAX__ 2147483647
+// XTENSA: #define __INT32_TYPE__ int
+// XTENSA: #define __INT64_C_SUFFIX__ LL
+// XTENSA: #define __INT64_MAX__ 9223372036854775807LL
+// XTENSA: #define __INT64_TYPE__ long long int
+// XTENSA: #define __INT8_C_SUFFIX__ 
+// XTENSA: #define __INT8_MAX__ 127
+// XTENSA: #define __INT8_TYPE__ signed char
+// XTENSA: #define __INTMAX_C_SUFFIX__ LL
+// XTENSA: #define __INTMAX_MAX__ 9223372036854775807LL
+// XTENSA: #define __INTMAX_TYPE__ long long int
+// XTENSA: #define __INTMAX_WIDTH__ 64
+// XTENSA: #define __INTPTR_MAX__ 2147483647
+// XTENSA: #define __INTPTR_TYPE__ int
+// XTENSA: #define __INTPTR_WIDTH__ 32
+// TODO: Xtensa GCC defines INT_FAST16 as int
+// XTENSA: #define __INT_FAST16_MAX__ 32767
+// XTENSA: #define __INT_FAST16_TYPE__ short
+// XTENSA: #define __INT_FAST16_WIDTH__ 16
+// XTENSA: #define __INT_FAST32_MAX__ 2147483647
+// XTENSA: #define __INT_FAST32_TYPE__ int
+// XTENSA: #define __INT_FAST32_WIDTH__ 32
+// XTENSA: #define __INT_FAST64_MAX__ 9223372036854775807LL
+// XTENSA: #define __INT_FAST64_TYPE__ long long int
+// XTENSA: #define __INT_FAST64_WIDTH__ 64
+// TODO: Xtensa GCC defines INT_FAST8 as int
+// XTENSA: #define __INT_FAST8_MAX__ 127
+// XTENSA: #define __INT_FAST8_TYPE__ signed char
+// XTENSA: #define __INT_FAST8_WIDTH__ 8
+// XTENSA: #define __INT_LEAST16_MAX__ 32767
+// XTENSA: #define __INT_LEAST16_TYPE__ short
+// XTENSA: #define __INT_LEAST16_WIDTH__ 16
+// XTENSA: #define __INT_LEAST32_MAX__ 2147483647
+// XTENSA: #define __INT_LEAST32_TYPE__ int
+// XTENSA: #define __INT_LEAST32_WIDTH__ 32
+// XTENSA: #define __INT_LEAST64_MAX__ 9223372036854775807LL
+// XTENSA: #define __INT_LEAST64_TYPE__ long long int
+// XTENSA: #define __INT_LEAST64_WIDTH__ 64
+// XTENSA: #define __INT_LEAST8_MAX__ 127
+// XTENSA: #define __INT_LEAST8_TYPE__ signed char
+// XTENSA: #define __INT_LEAST8_WIDTH__ 8
+// XTENSA: #define __INT_MAX__ 2147483647
+// XTENSA: #define __INT_WIDTH__ 32
+// XTENSA: #define __LDBL_DECIMAL_DIG__ 17
+// XTENSA: #define __LDBL_DENORM_MIN__ 4.9406564584124654e-324L
+// XTENSA: #define __LDBL_DIG__ 15
+// XTENSA: #define __LDBL_EPSILON__ 2.2204460492503131e-16L
+// XTENSA: #define __LDBL_HAS_DENORM__ 1
+// XTENSA: #define __LDBL_HAS_INFINITY__ 1
+// XTENSA: #define __LDBL_HAS_QUIET_NAN__ 1
+// XTENSA: #define __LDBL_MANT_DIG__ 53
+// XTENSA: #define __LDBL_MAX_10_EXP__ 308
+// XTENSA: #define __LDBL_MAX_EXP__ 1024
+// XTENSA: #define __LDBL_MAX__ 1.7976931348623157e+308L
+// XTENSA: #define __LDBL_MIN_10_EXP__ (-307)
+// XTENSA: #define __LDBL_MIN_EXP__ (-1021)
+// XTENSA: #define __LDBL_MIN__ 2.2250738585072014e-308L
+// XTENSA: #define __LDBL_NORM_MAX__ 1.7976931348623157e+308L
+// XTENSA: #define __LITTLE_ENDIAN__ 1
+// XTENSA: #define __LLONG_WIDTH__ 64
+// XTENSA: #define __LONG_LONG_MAX__ 9223372036854775807LL
+// XTENSA: #define __LONG_MAX__ 2147483647L
+// XTENSA: #define __LONG_WIDTH__ 32
+// XTENSA: #define __MEMORY_SCOPE_DEVICE 1
+// XTENSA: #define __MEMORY_SCOPE_SINGLE 4
+// XTENSA: #define __MEMORY_SCOPE_SYSTEM 0
+// XTENSA: #define __MEMORY_SCOPE_WRKGRP 2
+// XTENSA: #define __MEMORY_SCOPE_WVFRNT 3
+// XTENSA: #define __NO_INLINE__ 1
+// XTENSA: #define __NO_MATH_ERRNO__ 1
+// XTENSA: #define __OBJC_BOOL_IS_BOOL 0
+// XTENSA: #define __POINTER_WIDTH__ 32
+// XTENSA: #define __PRAGMA_REDEFINE_EXTNAME 1
+// XTENSA: #define __PTRDIFF_MAX__ 2147483647
+// XTENSA: #define __PTRDIFF_TYPE__ int
+// XTENSA: #define __PTRDIFF_WIDTH__ 32
+// XTENSA: #define __SCHAR_MAX__ 127
+// XTENSA: #define __SHRT_MAX__ 32767
+// XTENSA: #define __SHRT_WIDTH__ 16
+// XTENSA: #define __SIG_ATOMIC_MAX__ 2147483647
+// XTENSA: #define __SIG_ATOMIC_WIDTH__ 32
+// XTENSA: #define __SIZEOF_DOUBLE__ 8
+// XTENSA: #define __SIZEOF_FLOAT__ 4
+// XTENSA: #define __SIZEOF_INT__ 4
+// XTENSA: #define __SIZEOF_LONG_DOUBLE__ 8
+// XTENSA: #define __SIZEOF_LONG_LONG__ 8
+// XTENSA: #define __SIZEOF_LONG__ 4
+// XTENSA: #define __SIZEOF_POINTER__ 4
+// XTENSA: #define __SIZEOF_PTRDIFF_T__ 4
+// XTENSA: #define __SIZEOF_SHORT__ 2
+// XTENSA: #define __SIZEOF_SIZE_T__ 4
+// XTENSA: #define __SIZEOF_WCHAR_T__ 4
+// XTENSA: #define __SIZEOF_WINT_T__ 4
+// XTENSA: #define __SIZE_MAX__ 4294967295U
+// XTENSA: #define __SIZE_TYPE__ unsigned int
+// XTENSA: #define __SIZE_WIDTH__ 32
+// XTENSA: #define __STDC_EMBED_EMPTY__ 2
+// XTENSA: #define __STDC_EMBED_FOUND__ 1
+// XTENSA: #define __STDC_EMBED_NOT_FOUND__ 0
+// XTENSA: #define __STDC_HOSTED__ 0
+// XTENSA: #define __STDC_UTF_16__ 1
+// XTENSA: #define __STDC_UTF_32__ 1
+// XTENSA: #define __STDC_VERSION__ 201710L
+// XTENSA: #define __STDC__ 1
+// XTENSA: #define __UINT16_C_SUFFIX__ 
+// XTENSA: #define __UINT16_MAX__ 65535
+// XTENSA: #define __UINT16_TYPE__ unsigned short
+// XTENSA: #define __UINT32_C_SUFFIX__ U
+// XTENSA: #define __UINT32_MAX__ 4294967295U
+// XTENSA: #define __UINT32_TYPE__ unsigned int
+// XTENSA: #define __UINT64_C_SUFFIX__ ULL
+// XTENSA: #define __UINT64_MAX__ 18446744073709551615ULL
+// XTENSA: #define __UINT64_TYPE__ long long unsigned int
+// XTENSA: #define __UINT8_C_SUFFIX__ 
+// XTENSA: #define __UINT8_MAX__ 255
+// XTENSA: #define __UINT8_TYPE__ unsigned char
+// XTENSA: #define __UINTMAX_C_SUFFIX__ ULL
+// XTENSA: #define __UINTMAX_MAX__ 18446744073709551615ULL
+// XTENSA: #define __UINTMAX_TYPE__ long long unsigned int
+// XTENSA: #define __UINTMAX_WIDTH__ 64
+// XTENSA: #define __UINTPTR_MAX__ 4294967295U
+// XTENSA: #define __UINTPTR_TYPE__ unsigned int
+// XTENSA: #define __UINTPTR_WIDTH__ 32
+// XTENSA: #define __UINT_FAST16_MAX__ 65535
+// XTENSA: #define __UINT_FAST16_TYPE__ unsigned short
+// XTENSA: #define __UINT_FAST32_MAX__ 4294967295U
+// XTENSA: #define __UINT_FAST32_TYPE__ unsigned int
+// XTENSA: #define __UINT_FAST64_MAX__ 18446744073709551615ULL
+// XTENSA: #define __UINT_FAST64_TYPE__ long long unsigned int
+// XTENSA: #define __UINT_FAST8_MAX__ 255
+// XTENSA: #define __UINT_FAST8_TYPE__ unsigned char
+// XTENSA: #define __UINT_LEAST16_MAX__ 65535
+// XTENSA: #define __UINT_LEAST16_TYPE__ unsigned short
+// XTENSA: #define __UINT_LEAST32_MAX__ 4294967295U
+// XTENSA: #define __UINT_LEAST32_TYPE__ unsigned int
+// XTENSA: #define __UINT_LEAST64_MAX__ 18446744073709551615ULL
+// XTENSA: #define __UINT_LEAST64_TYPE__ long long unsigned int
+// XTENSA: #define __UINT_LEAST8_MAX__ 255
+// XTENSA: #define __UINT_LEAST8_TYPE__ unsigned char
+// XTENSA: #define __USER_LABEL_PREFIX__ 
+// XTENSA: #define __WCHAR_MAX__ 2147483647
+// XTENSA: #define __WCHAR_TYPE__ int
+// XTENSA: #define __WCHAR_WIDTH__ 32
+// XTENSA: #define __WINT_MAX__ 4294967295U
+// XTENSA: #define __WINT_TYPE__ unsigned int
+// XTENSA: #define __WINT_UNSIGNED__ 1
+// XTENSA: #define __WINT_WIDTH__ 32
+// XTENSA: #define __XCHAL_HAVE_ABS 1
+// XTENSA: #define __XCHAL_HAVE_ADDX 1
+// XTENSA: #define __XCHAL_HAVE_BE 0
+// XTENSA: #define __XCHAL_HAVE_L32R 1
+// XTENSA: #define __XTENSA_EL__ 1
+// XTENSA: #define __XTENSA__ 1
+// XTENSA: #define __xtensa__ 1
diff --git a/clang/test/Preprocessor/macho-embedded-predefines.c b/clang/test/Preprocessor/macho-embedded-predefines.c
index 74f2919..a7e5777 100644
--- a/clang/test/Preprocessor/macho-embedded-predefines.c
+++ b/clang/test/Preprocessor/macho-embedded-predefines.c
@@ -3,18 +3,18 @@
 // CHECK-7M: #define __APPLE_CC__
 // CHECK-7M: #define __APPLE__
 // CHECK-7M: #define __ARM_ARCH_7M__
-// CHECK-7M-NOT: #define __MACH__
+// CHECK-7M: #define __MACH__
 
 // RUN: %clang_cc1 -E -dM -triple thumbv7em-apple-unknown-macho -target-cpu cortex-m4 %s | FileCheck %s -check-prefix CHECK-7EM
 
 // CHECK-7EM: #define __APPLE_CC__
 // CHECK-7EM: #define __APPLE__
 // CHECK-7EM: #define __ARM_ARCH_7EM__
-// CHECK-7EM-NOT: #define __MACH__
+// CHECK-7EM: #define __MACH__
 
 // RUN: %clang_cc1 -E -dM -triple thumbv6m-apple-unknown-macho -target-cpu cortex-m0 %s | FileCheck %s -check-prefix CHECK-6M
 
 // CHECK-6M: #define __APPLE_CC__
 // CHECK-6M: #define __APPLE__
 // CHECK-6M: #define __ARM_ARCH_6M__
-// CHECK-6M-NOT: #define __MACH__
+// CHECK-6M: #define __MACH__
diff --git a/clang/test/Preprocessor/predefined-win-macros.c b/clang/test/Preprocessor/predefined-win-macros.c
index 8e539a2..86708e0 100644
--- a/clang/test/Preprocessor/predefined-win-macros.c
+++ b/clang/test/Preprocessor/predefined-win-macros.c
@@ -113,6 +113,13 @@
 // CHECK-ARM64EC-WIN: #define _WIN32 1
 // CHECK-ARM64EC-WIN: #define _WIN64 1
 
+// RUN: %clang_cc1 -triple mipsel-windows %s -E -dM -o - \
+// RUN:   | FileCheck -match-full-lines %s --check-prefix=CHECK-MIPSEL-WIN
+
+// CHECK-MIPSEL-WIN: #define _M_MRX000 4000
+// CHECK-MIPSEL-WIN: #define _WIN32 1
+// CHECK-MIPSEL-WIN-NOT: #define _MIPS_ 1
+
 // RUN: %clang_cc1 -triple i686-windows-gnu %s -E -dM -o - \
 // RUN:   | FileCheck -match-full-lines %s --check-prefix=CHECK-X86-MINGW
 
@@ -173,3 +180,12 @@
 // CHECK-ARM64EC-MINGW: #define __arm64ec__ 1
 // CHECK-ARM64EC-MINGW: #define __x86_64 1
 // CHECK-ARM64EC-MINGW: #define __x86_64__ 1
+
+// RUN: %clang_cc1 -triple mipsel-windows-gnu %s -E -dM -o - \
+// RUN:   | FileCheck -match-full-lines %s --check-prefix=CHECK-MIPSEL-MINGW
+
+// CHECK-MIPSEL-MINGW-NOT: #define _M_MRX000 4000
+// CHECK-MIPSEL-MINGW: #define _MIPS_ 1
+// CHECK-MIPSEL-MINGW: #define _WIN32 1
+// CHECK-MIPSEL-MINGW: #define __mips 32
+// CHECK-MIPSEL-MINGW: #define __mips__ 1
diff --git a/clang/test/Preprocessor/riscv-target-features.c b/clang/test/Preprocessor/riscv-target-features.c
index e376821..c219771 100644
--- a/clang/test/Preprocessor/riscv-target-features.c
+++ b/clang/test/Preprocessor/riscv-target-features.c
@@ -182,6 +182,8 @@
 
 // Experimental extensions
 
+// CHECK-NOT: __riscv_sdext{{.*$}}
+// CHECK-NOT: __riscv_sdtrig{{.*$}}
 // CHECK-NOT: __riscv_smctr{{.*$}}
 // CHECK-NOT: __riscv_smmpm{{.*$}}
 // CHECK-NOT: __riscv_smnpm{{.*$}}
@@ -1796,6 +1798,22 @@
 // CHECK-SUPM-EXT: __riscv_supm 1000000{{$}}
 
 // RUN: %clang --target=riscv32 -menable-experimental-extensions \
+// RUN:   -march=rv32i_sdext1p0 -E -dM %s \
+// RUN:   -o - | FileCheck --check-prefix=CHECK-SDEXT-EXT %s
+// RUN: %clang --target=riscv64 -menable-experimental-extensions \
+// RUN:   -march=rv64i_sdext1p0 -E -dM %s \
+// RUN:   -o - | FileCheck --check-prefix=CHECK-SDEXT-EXT %s
+// CHECK-SDEXT-EXT: __riscv_sdext 1000000{{$}}
+
+// RUN: %clang --target=riscv32 -menable-experimental-extensions \
+// RUN:   -march=rv32i_sdtrig1p0 -E -dM %s \
+// RUN:   -o - | FileCheck --check-prefix=CHECK-SDTRIG-EXT %s
+// RUN: %clang --target=riscv64 -menable-experimental-extensions \
+// RUN:   -march=rv64i_sdtrig1p0 -E -dM %s \
+// RUN:   -o - | FileCheck --check-prefix=CHECK-SDTRIG-EXT %s
+// CHECK-SDTRIG-EXT: __riscv_sdtrig 1000000{{$}}
+
+// RUN: %clang --target=riscv32 -menable-experimental-extensions \
 // RUN:   -march=rv32i_smctr1p0 -E -dM %s \
 // RUN:   -o - | FileCheck --check-prefix=CHECK-SMCTR-EXT %s
 // RUN: %clang --target=riscv64 -menable-experimental-extensions \
diff --git a/clang/test/Preprocessor/stdint.c b/clang/test/Preprocessor/stdint.c
index 7cb33ed..af1d698 100644
--- a/clang/test/Preprocessor/stdint.c
+++ b/clang/test/Preprocessor/stdint.c
@@ -1498,6 +1498,113 @@
 // XCORE:INTMAX_C_(0) 0LL
 // XCORE:UINTMAX_C_(0) 0ULL
 //
+// RUN: %clang_cc1 -E -ffreestanding -triple=xtensa %s | FileCheck -check-prefix XTENSA %s
+//
+// XTENSA:typedef long long int int64_t;
+// XTENSA:typedef long long unsigned int uint64_t;
+// XTENSA:typedef int64_t int_least64_t;
+// XTENSA:typedef uint64_t uint_least64_t;
+// XTENSA:typedef int64_t int_fast64_t;
+// XTENSA:typedef uint64_t uint_fast64_t;
+//
+// XTENSA:typedef int int32_t;
+// XTENSA:typedef unsigned int uint32_t;
+// XTENSA:typedef int32_t int_least32_t;
+// XTENSA:typedef uint32_t uint_least32_t;
+// XTENSA:typedef int32_t int_fast32_t;
+// XTENSA:typedef uint32_t uint_fast32_t;
+//
+// XTENSA:typedef short int16_t;
+// XTENSA:typedef unsigned short uint16_t;
+// XTENSA:typedef int16_t int_least16_t;
+// XTENSA:typedef uint16_t uint_least16_t;
+// XTENSA:typedef int16_t int_fast16_t;
+// XTENSA:typedef uint16_t uint_fast16_t;
+//
+// XTENSA:typedef signed char int8_t;
+// XTENSA:typedef unsigned char uint8_t;
+// XTENSA:typedef int8_t int_least8_t;
+// XTENSA:typedef uint8_t uint_least8_t;
+// XTENSA:typedef int8_t int_fast8_t;
+// XTENSA:typedef uint8_t uint_fast8_t;
+//
+// XTENSA:typedef int intptr_t;
+// XTENSA:typedef unsigned int uintptr_t;
+//
+// XTENSA:typedef long long int intmax_t;
+// XTENSA:typedef long long unsigned int uintmax_t;
+//
+// XTENSA:INT8_MAX_ 127
+// XTENSA:INT8_MIN_ (-127 -1)
+// XTENSA:UINT8_MAX_ 255
+// XTENSA:INT_LEAST8_MIN_ (-127 -1)
+// XTENSA:INT_LEAST8_MAX_ 127
+// XTENSA:UINT_LEAST8_MAX_ 255
+// XTENSA:INT_FAST8_MIN_ (-127 -1)
+// XTENSA:INT_FAST8_MAX_ 127
+// XTENSA:UINT_FAST8_MAX_ 255
+//
+// XTENSA:INT16_MAX_ 32767
+// XTENSA:INT16_MIN_ (-32767 -1)
+// XTENSA:UINT16_MAX_ 65535
+// XTENSA:INT_LEAST16_MIN_ (-32767 -1)
+// XTENSA:INT_LEAST16_MAX_ 32767
+// XTENSA:UINT_LEAST16_MAX_ 65535
+// XTENSA:INT_FAST16_MIN_ (-32767 -1)
+// XTENSA:INT_FAST16_MAX_ 32767
+// XTENSA:UINT_FAST16_MAX_ 65535
+//
+// XTENSA:INT32_MAX_ 2147483647
+// XTENSA:INT32_MIN_ (-2147483647 -1)
+// XTENSA:UINT32_MAX_ 4294967295U
+// XTENSA:INT_LEAST32_MIN_ (-2147483647 -1)
+// XTENSA:INT_LEAST32_MAX_ 2147483647
+// XTENSA:UINT_LEAST32_MAX_ 4294967295U
+// XTENSA:INT_FAST32_MIN_ (-2147483647 -1)
+// XTENSA:INT_FAST32_MAX_ 2147483647
+// XTENSA:UINT_FAST32_MAX_ 4294967295U
+//
+// XTENSA:INT64_MAX_ 9223372036854775807LL
+// XTENSA:INT64_MIN_ (-9223372036854775807LL -1)
+// XTENSA:UINT64_MAX_ 18446744073709551615ULL
+// XTENSA:INT_LEAST64_MIN_ (-9223372036854775807LL -1)
+// XTENSA:INT_LEAST64_MAX_ 9223372036854775807LL
+// XTENSA:UINT_LEAST64_MAX_ 18446744073709551615ULL
+// XTENSA:INT_FAST64_MIN_ (-9223372036854775807LL -1)
+// XTENSA:INT_FAST64_MAX_ 9223372036854775807LL
+// XTENSA:UINT_FAST64_MAX_ 18446744073709551615ULL
+//
+// XTENSA:INTPTR_MIN_ (-2147483647 -1)
+// XTENSA:INTPTR_MAX_ 2147483647
+// XTENSA:UINTPTR_MAX_ 4294967295U
+// XTENSA:PTRDIFF_MIN_ (-2147483647 -1)
+// XTENSA:PTRDIFF_MAX_ 2147483647
+// XTENSA:SIZE_MAX_ 4294967295U
+//
+// XTENSA:INTMAX_MIN_ (-9223372036854775807LL -1)
+// XTENSA:INTMAX_MAX_ 9223372036854775807LL
+// XTENSA:UINTMAX_MAX_ 18446744073709551615ULL
+//
+// XTENSA:SIG_ATOMIC_MIN_ (-2147483647 -1)
+// XTENSA:SIG_ATOMIC_MAX_ 2147483647
+// XTENSA:WINT_MIN_ 0U
+// XTENSA:WINT_MAX_ 4294967295U
+//
+// XTENSA:WCHAR_MAX_ 2147483647
+// XTENSA:WCHAR_MIN_ (-2147483647 -1)
+//
+// XTENSA:INT8_C_(0) 0
+// XTENSA:UINT8_C_(0) 0U
+// XTENSA:INT16_C_(0) 0
+// XTENSA:UINT16_C_(0) 0U
+// XTENSA:INT32_C_(0) 0
+// XTENSA:UINT32_C_(0) 0U
+// XTENSA:INT64_C_(0) 0LL
+// XTENSA:UINT64_C_(0) 0ULL
+//
+// XTENSA:INTMAX_C_(0) 0LL
+// XTENSA:UINTMAX_C_(0) 0ULL
+//
 //
 // stdint.h forms several macro definitions by pasting together identifiers
 // to form names (eg. int32_t is formed from int ## 32 ## _t). The following 
diff --git a/clang/test/Sema/varargs.c b/clang/test/Sema/varargs.c
index 2cb7270..bec41dd 100644
--- a/clang/test/Sema/varargs.c
+++ b/clang/test/Sema/varargs.c
@@ -75,6 +75,11 @@ void f9(__builtin_va_list args)
     (void)__builtin_va_arg(args, enum E); // Don't warn here in C
     (void)__builtin_va_arg(args, short); // expected-warning {{second argument to 'va_arg' is of promotable type 'short'}}
     (void)__builtin_va_arg(args, char); // expected-warning {{second argument to 'va_arg' is of promotable type 'char'}}
+    // Don't crash on some undefined behaviors.
+    int n;
+    (void)__builtin_va_arg(args, int[10]); // expected-warning{{second argument to 'va_arg' is of array type 'int[10]'}}
+    (void)__builtin_va_arg(args, int[++n]); // expected-warning{{second argument to 'va_arg' is of array type 'int[++n]'}}
+    (void)__builtin_va_arg(args, int[n][n]); // expected-warning{{second argument to 'va_arg' is of array type 'int[n][n]'}}
 }
 
 void f10(int a, ...) {
diff --git a/clang/test/SemaCXX/cxx20-decomposition.cpp b/clang/test/SemaCXX/cxx20-decomposition.cpp
index 430a158..ccc1af5 100644
--- a/clang/test/SemaCXX/cxx20-decomposition.cpp
+++ b/clang/test/SemaCXX/cxx20-decomposition.cpp
@@ -183,3 +183,26 @@ namespace ODRUseTests {
     }(0); }(0); // expected-note 2{{in instantiation}}
   }
 }
+
+
+namespace GH95081 {
+    void prevent_assignment_check() {
+        int arr[] = {1,2};
+        auto [e1, e2] = arr;
+
+        auto lambda = [e1] {
+            e1 = 42;  // expected-error {{cannot assign to a variable captured by copy in a non-mutable lambda}}
+        };
+    }
+
+    void f(int&) = delete;
+    void f(const int&);
+
+    int arr[1];
+    void foo() {
+        auto [x] = arr;
+        [x]() {
+            f(x); // deleted f(int&) used to be picked up erroneously
+        } ();
+    }
+}
diff --git a/clang/test/SemaCXX/cxx2c-fold-exprs.cpp b/clang/test/SemaCXX/cxx2c-fold-exprs.cpp
index 0674135..4806143 100644
--- a/clang/test/SemaCXX/cxx2c-fold-exprs.cpp
+++ b/clang/test/SemaCXX/cxx2c-fold-exprs.cpp
@@ -305,3 +305,82 @@ static_assert(__is_same_as(_Three_way_comparison_result_with_tuple_like<tuple<in
 static_assert(__is_same_as(_Three_way_comparison_result_with_tuple_like<tuple<int>, 0>::type, long));
 
 }
+
+namespace GH88866 {
+
+template <typename...Ts> struct index_by;
+
+template <typename T, typename Indices>
+concept InitFunc = true;
+
+namespace ExpandsBoth {
+
+template <typename Indices, InitFunc<Indices> auto... init>
+struct LazyLitMatrix; // expected-note {{here}}
+
+template <
+    typename...Indices,
+    InitFunc<index_by<Indices>> auto... init
+>
+struct LazyLitMatrix<index_by<Indices...>, init...> {
+};
+
+// FIXME: Explain why we didn't pick up the partial specialization - pack sizes don't match.
+template struct LazyLitMatrix<index_by<int, char>, 42>;
+// expected-error@-1 {{instantiation of undefined template}}
+template struct LazyLitMatrix<index_by<int, char>, 42, 43>;
+
+}
+
+namespace ExpandsRespectively {
+
+template <typename Indices, InitFunc<Indices> auto... init>
+struct LazyLitMatrix;
+
+template <
+    typename...Indices,
+    InitFunc<index_by<Indices...>> auto... init
+>
+struct LazyLitMatrix<index_by<Indices...>, init...> {
+};
+
+template struct LazyLitMatrix<index_by<int, char>, 42>;
+template struct LazyLitMatrix<index_by<int, char>, 42, 43>;
+
+}
+
+namespace TypeParameter {
+
+template <typename Indices, InitFunc<Indices>... init>
+struct LazyLitMatrix; // expected-note {{here}}
+
+template <
+    typename...Indices,
+    InitFunc<index_by<Indices>>... init
+>
+struct LazyLitMatrix<index_by<Indices...>, init...> {
+};
+
+// FIXME: Explain why we didn't pick up the partial specialization - pack sizes don't match.
+template struct LazyLitMatrix<index_by<int, char>, float>;
+// expected-error@-1 {{instantiation of undefined template}}
+template struct LazyLitMatrix<index_by<int, char>, unsigned, float>;
+
+}
+
+namespace Invalid {
+
+template <typename Indices, InitFunc<Indices>... init>
+struct LazyLitMatrix;
+
+template <
+    typename...Indices,
+    InitFunc<index_by<Indices>> init
+    // expected-error@-1 {{unexpanded parameter pack 'Indices'}}
+>
+struct LazyLitMatrix<index_by<Indices...>, init> {
+};
+
+}
+
+}
diff --git a/clang/test/SemaCXX/cxx2c-pack-indexing.cpp b/clang/test/SemaCXX/cxx2c-pack-indexing.cpp
index cb679a6..58b642d 100644
--- a/clang/test/SemaCXX/cxx2c-pack-indexing.cpp
+++ b/clang/test/SemaCXX/cxx2c-pack-indexing.cpp
@@ -305,3 +305,19 @@ template <class... Args> struct mdispatch_ {
 mdispatch_<int, int> d;
 
 } // namespace GH116105
+
+namespace GH121242 {
+    // Non-dependent type pack access
+    template <int...x>
+    int y = x...[0];
+
+    struct X {};
+
+    template <X...x>
+    X z = x...[0];
+
+    void foo() {
+        (void)y<0>;
+        (void)z<X{}>;
+    }
+} // namespace GH121242
diff --git a/clang/test/SemaCXX/type-traits.cpp b/clang/test/SemaCXX/type-traits.cpp
index 91ef778..1b9e2ba6 100644
--- a/clang/test/SemaCXX/type-traits.cpp
+++ b/clang/test/SemaCXX/type-traits.cpp
@@ -5031,3 +5031,18 @@ void remove_all_extents() {
   using SomeArray = int[1][2];
   static_assert(__is_same(remove_all_extents_t<const SomeArray>, const int));
 }
+
+namespace GH121278 {
+// https://cplusplus.github.io/LWG/lwg-active.html#3929
+#if __cplusplus >= 202002L
+template <typename B, typename D>
+concept C = __is_base_of(B, D);
+// expected-error@-1 {{incomplete type 'GH121278::S' used in type trait expression}}
+// expected-note@-2 {{while substituting template arguments into constraint expression here}}
+
+struct T;
+struct S;
+bool b = C<T, S>;
+// expected-note@-1 {{while checking the satisfaction of concept 'C<GH121278::T, GH121278::S>' requested here}}
+#endif
+}
diff --git a/clang/test/SemaCXX/warn-unused-result.cpp b/clang/test/SemaCXX/warn-unused-result.cpp
index 682c500..5105f34 100644
--- a/clang/test/SemaCXX/warn-unused-result.cpp
+++ b/clang/test/SemaCXX/warn-unused-result.cpp
@@ -355,3 +355,12 @@ void use2() {
   (void)G{"Hello"};
 }
 } // namespace nodiscard_specialization
+
+namespace GH117975 {
+// Test for a regression for ICE in CallExpr::getUnusedResultAttr
+int f() { return 0; }
+void id_print_name() {
+  (int) // expected-warning {{expression result unused}}
+    ((int(*)())f)();
+}
+} // namespace GH117975
diff --git a/clang/test/SemaOpenACC/combined-construct-auto_seq_independent-clauses.c b/clang/test/SemaOpenACC/combined-construct-auto_seq_independent-clauses.c
index f37c1d7..9a98f5e 100644
--- a/clang/test/SemaOpenACC/combined-construct-auto_seq_independent-clauses.c
+++ b/clang/test/SemaOpenACC/combined-construct-auto_seq_independent-clauses.c
@@ -137,7 +137,7 @@ void uses() {
   // expected-error@+1{{OpenACC 'device_num' clause is not valid on 'parallel loop' directive}}
 #pragma acc parallel loop auto device_num(1)
   for(unsigned i = 0; i < 5; ++i);
-  // expected-warning@+1{{OpenACC clause 'default_async' not yet implemented}}
+  // expected-error@+1{{OpenACC 'default_async' clause is not valid on 'parallel loop' directive}}
 #pragma acc parallel loop auto default_async(1)
   for(unsigned i = 0; i < 5; ++i);
 #pragma acc parallel loop auto device_type(*)
@@ -254,7 +254,7 @@ void uses() {
   // expected-error@+1{{OpenACC 'device_num' clause is not valid on 'parallel loop' directive}}
 #pragma acc parallel loop device_num(1) auto
   for(unsigned i = 0; i < 5; ++i);
-  // expected-warning@+1{{OpenACC clause 'default_async' not yet implemented}}
+  // expected-error@+1{{OpenACC 'default_async' clause is not valid on 'parallel loop' directive}}
 #pragma acc parallel loop default_async(1) auto
   for(unsigned i = 0; i < 5; ++i);
 #pragma acc parallel loop device_type(*) auto
@@ -372,7 +372,7 @@ void uses() {
   // expected-error@+1{{OpenACC 'device_num' clause is not valid on 'parallel loop' directive}}
 #pragma acc parallel loop independent device_num(1)
   for(unsigned i = 0; i < 5; ++i);
-  // expected-warning@+1{{OpenACC clause 'default_async' not yet implemented}}
+  // expected-error@+1{{OpenACC 'default_async' clause is not valid on 'parallel loop' directive}}
 #pragma acc parallel loop independent default_async(1)
   for(unsigned i = 0; i < 5; ++i);
 #pragma acc parallel loop independent device_type(*)
@@ -489,7 +489,7 @@ void uses() {
   // expected-error@+1{{OpenACC 'device_num' clause is not valid on 'parallel loop' directive}}
 #pragma acc parallel loop device_num(1) independent
   for(unsigned i = 0; i < 5; ++i);
-  // expected-warning@+1{{OpenACC clause 'default_async' not yet implemented}}
+  // expected-error@+1{{OpenACC 'default_async' clause is not valid on 'parallel loop' directive}}
 #pragma acc parallel loop default_async(1) independent
   for(unsigned i = 0; i < 5; ++i);
 #pragma acc parallel loop device_type(*) independent
@@ -615,7 +615,7 @@ void uses() {
   // expected-error@+1{{OpenACC 'device_num' clause is not valid on 'parallel loop' directive}}
 #pragma acc parallel loop seq device_num(1)
   for(unsigned i = 0; i < 5; ++i);
-  // expected-warning@+1{{OpenACC clause 'default_async' not yet implemented}}
+  // expected-error@+1{{OpenACC 'default_async' clause is not valid on 'parallel loop' directive}}
 #pragma acc parallel loop seq default_async(1)
   for(unsigned i = 0; i < 5; ++i);
 #pragma acc parallel loop seq device_type(*)
@@ -738,7 +738,7 @@ void uses() {
   // expected-error@+1{{OpenACC 'device_num' clause is not valid on 'parallel loop' directive}}
 #pragma acc parallel loop device_num(1) seq
   for(unsigned i = 0; i < 5; ++i);
-  // expected-warning@+1{{OpenACC clause 'default_async' not yet implemented}}
+  // expected-error@+1{{OpenACC 'default_async' clause is not valid on 'parallel loop' directive}}
 #pragma acc parallel loop default_async(1) seq
   for(unsigned i = 0; i < 5; ++i);
 #pragma acc parallel loop device_type(*) seq
diff --git a/clang/test/SemaOpenACC/combined-construct-device_type-clause.c b/clang/test/SemaOpenACC/combined-construct-device_type-clause.c
index 50612c2..8072ae7 100644
--- a/clang/test/SemaOpenACC/combined-construct-device_type-clause.c
+++ b/clang/test/SemaOpenACC/combined-construct-device_type-clause.c
@@ -193,8 +193,7 @@ void uses() {
   // expected-error@+1{{OpenACC 'device_num' clause is not valid on 'serial loop' directive}}
 #pragma acc serial loop device_type(*) device_num(1)
   for(int i = 0; i < 5; ++i);
-  // expected-error@+2{{OpenACC clause 'default_async' may not follow a 'device_type' clause in a 'serial loop' construct}}
-  // expected-note@+1{{previous clause is here}}
+  // expected-error@+1{{OpenACC 'default_async' clause is not valid on 'serial loop' directive}}
 #pragma acc serial loop device_type(*) default_async(1)
   for(int i = 0; i < 5; ++i);
 #pragma acc parallel loop device_type(*) async
diff --git a/clang/test/SemaOpenACC/combined-construct-self-ast.cpp b/clang/test/SemaOpenACC/combined-construct-self-ast.cpp
index 3a6ba3c..e504ea7 100644
--- a/clang/test/SemaOpenACC/combined-construct-self-ast.cpp
+++ b/clang/test/SemaOpenACC/combined-construct-self-ast.cpp
@@ -20,6 +20,7 @@ void TemplFunc() {
   for (unsigned i = 0; i < 5; ++i);
   // CHECK-NEXT: OpenACCCombinedConstruct{{.*}}serial loop
   // CHECK-NEXT: self clause
+  // CHECK-NEXT: <<<NULL>>
   // CHECK-NEXT: ForStmt
   // CHECK: NullStmt
 
@@ -65,6 +66,7 @@ void TemplFunc() {
   //
   // CHECK-NEXT: OpenACCCombinedConstruct{{.*}}serial loop
   // CHECK-NEXT: self clause
+  // CHECK-NEXT: <<<NULL>>
   // CHECK-NEXT: ForStmt
   // CHECK: NullStmt
 
diff --git a/clang/test/SemaOpenACC/compute-construct-clause-ast.cpp b/clang/test/SemaOpenACC/compute-construct-clause-ast.cpp
index 69f65f4..58c12b8 100644
--- a/clang/test/SemaOpenACC/compute-construct-clause-ast.cpp
+++ b/clang/test/SemaOpenACC/compute-construct-clause-ast.cpp
@@ -197,6 +197,7 @@ void TemplFunc() {
   while(true);
   // CHECK-NEXT: OpenACCComputeConstruct{{.*}}serial
   // CHECK-NEXT: self clause
+  // CHECK-NEXT: <<<NULL>>
   // CHECK-NEXT: WhileStmt
   // CHECK-NEXT: CXXBoolLiteralExpr
   // CHECK-NEXT: NullStmt
@@ -393,6 +394,7 @@ void TemplFunc() {
 
   // CHECK-NEXT: OpenACCComputeConstruct{{.*}}serial
   // CHECK-NEXT: self clause
+  // CHECK-NEXT: <<<NULL>>
   // CHECK-NEXT: WhileStmt
   // CHECK-NEXT: CXXBoolLiteralExpr
   // CHECK-NEXT: NullStmt
diff --git a/clang/test/SemaOpenACC/compute-construct-device_type-clause.c b/clang/test/SemaOpenACC/compute-construct-device_type-clause.c
index bbad68c..33f433c 100644
--- a/clang/test/SemaOpenACC/compute-construct-device_type-clause.c
+++ b/clang/test/SemaOpenACC/compute-construct-device_type-clause.c
@@ -198,8 +198,7 @@ void uses() {
   // expected-error@+1{{OpenACC 'device_num' clause is not valid on 'kernels' directive}}
 #pragma acc kernels device_type(*) device_num(1)
   while(1);
-  // expected-error@+2{{OpenACC clause 'default_async' may not follow a 'device_type' clause in a 'kernels' construct}}
-  // expected-note@+1{{previous clause is here}}
+  // expected-error@+1{{OpenACC 'default_async' clause is not valid on 'kernels' directive}}
 #pragma acc kernels device_type(*) default_async(1)
   while(1);
 #pragma acc kernels device_type(*) async
diff --git a/clang/test/SemaOpenACC/loop-construct-auto_seq_independent-clauses.c b/clang/test/SemaOpenACC/loop-construct-auto_seq_independent-clauses.c
index d75d6ab..d9a4c61 100644
--- a/clang/test/SemaOpenACC/loop-construct-auto_seq_independent-clauses.c
+++ b/clang/test/SemaOpenACC/loop-construct-auto_seq_independent-clauses.c
@@ -152,7 +152,7 @@ void uses() {
   // expected-error@+1{{OpenACC 'device_num' clause is not valid on 'loop' directive}}
 #pragma acc loop auto device_num(1)
   for(unsigned i = 0; i < 5; ++i);
-  // expected-warning@+1{{OpenACC clause 'default_async' not yet implemented}}
+  // expected-error@+1{{OpenACC 'default_async' clause is not valid on 'loop' directive}}
 #pragma acc loop auto default_async(1)
   for(unsigned i = 0; i < 5; ++i);
 #pragma acc loop auto device_type(*)
@@ -286,7 +286,7 @@ void uses() {
   // expected-error@+1{{OpenACC 'device_num' clause is not valid on 'loop' directive}}
 #pragma acc loop device_num(1) auto
   for(unsigned i = 0; i < 5; ++i);
-  // expected-warning@+1{{OpenACC clause 'default_async' not yet implemented}}
+  // expected-error@+1{{OpenACC 'default_async' clause is not valid on 'loop' directive}}
 #pragma acc loop default_async(1) auto
   for(unsigned i = 0; i < 5; ++i);
 #pragma acc loop device_type(*) auto
@@ -421,7 +421,7 @@ void uses() {
   // expected-error@+1{{OpenACC 'device_num' clause is not valid on 'loop' directive}}
 #pragma acc loop independent device_num(1)
   for(unsigned i = 0; i < 5; ++i);
-  // expected-warning@+1{{OpenACC clause 'default_async' not yet implemented}}
+  // expected-error@+1{{OpenACC 'default_async' clause is not valid on 'loop' directive}}
 #pragma acc loop independent default_async(1)
   for(unsigned i = 0; i < 5; ++i);
 #pragma acc loop independent device_type(*)
@@ -555,7 +555,7 @@ void uses() {
   // expected-error@+1{{OpenACC 'device_num' clause is not valid on 'loop' directive}}
 #pragma acc loop device_num(1) independent
   for(unsigned i = 0; i < 5; ++i);
-  // expected-warning@+1{{OpenACC clause 'default_async' not yet implemented}}
+  // expected-error@+1{{OpenACC 'default_async' clause is not valid on 'loop' directive}}
 #pragma acc loop default_async(1) independent
   for(unsigned i = 0; i < 5; ++i);
 #pragma acc loop device_type(*) independent
@@ -698,7 +698,7 @@ void uses() {
   // expected-error@+1{{OpenACC 'device_num' clause is not valid on 'loop' directive}}
 #pragma acc loop seq device_num(1)
   for(unsigned i = 0; i < 5; ++i);
-  // expected-warning@+1{{OpenACC clause 'default_async' not yet implemented}}
+  // expected-error@+1{{OpenACC 'default_async' clause is not valid on 'loop' directive}}
 #pragma acc loop seq default_async(1)
   for(unsigned i = 0; i < 5; ++i);
 #pragma acc loop seq device_type(*)
@@ -838,7 +838,7 @@ void uses() {
   // expected-error@+1{{OpenACC 'device_num' clause is not valid on 'loop' directive}}
 #pragma acc loop device_num(1) seq
   for(unsigned i = 0; i < 5; ++i);
-  // expected-warning@+1{{OpenACC clause 'default_async' not yet implemented}}
+  // expected-error@+1{{OpenACC 'default_async' clause is not valid on 'loop' directive}}
 #pragma acc loop default_async(1) seq
   for(unsigned i = 0; i < 5; ++i);
 #pragma acc loop device_type(*) seq
diff --git a/clang/test/SemaOpenACC/loop-construct-device_type-clause.c b/clang/test/SemaOpenACC/loop-construct-device_type-clause.c
index bf2a249..f16f17f 100644
--- a/clang/test/SemaOpenACC/loop-construct-device_type-clause.c
+++ b/clang/test/SemaOpenACC/loop-construct-device_type-clause.c
@@ -173,8 +173,7 @@ void uses() {
   // expected-error@+1{{OpenACC 'device_num' clause is not valid on 'loop' directive}}
 #pragma acc loop device_type(*) device_num(1)
   for(int i = 0; i < 5; ++i);
-  // expected-error@+2{{OpenACC clause 'default_async' may not follow a 'device_type' clause in a 'loop' construct}}
-  // expected-note@+1{{previous clause is here}}
+  // expected-error@+1{{OpenACC 'default_async' clause is not valid on 'loop' directive}}
 #pragma acc loop device_type(*) default_async(1)
   for(int i = 0; i < 5; ++i);
   // expected-error@+1{{OpenACC 'async' clause is not valid on 'loop' directive}}
diff --git a/clang/test/SemaOpenACC/set-construct-ast.cpp b/clang/test/SemaOpenACC/set-construct-ast.cpp
new file mode 100644
index 0000000..c5eed94
--- /dev/null
+++ b/clang/test/SemaOpenACC/set-construct-ast.cpp
@@ -0,0 +1,98 @@
+// RUN: %clang_cc1 %s -fopenacc -ast-dump | FileCheck %s
+
+// Test this with PCH.
+// RUN: %clang_cc1 %s -fopenacc -emit-pch -o %t %s
+// RUN: %clang_cc1 %s -fopenacc -include-pch %t -ast-dump-all | FileCheck %s
+
+#ifndef PCH_HELPER
+#define PCH_HELPER
+
+int some_int();
+long some_long();
+void NormalFunc() {
+  // CHECK-LABEL: NormalFunc
+  // CHECK-NEXT: CompoundStmt
+
+#pragma acc set default_async(some_int()) device_num(some_long()) device_type(DT) if (some_int() < some_long())
+  // CHECK-NEXT: OpenACCSetConstruct{{.*}}set
+  // CHECK-NEXT: default_async clause
+  // CHECK-NEXT: CallExpr{{.*}}'int'
+  // CHECK-NEXT: ImplicitCastExpr
+  // CHECK-NEXT: DeclRefExpr{{.*}}'some_int' 'int ()'
+  // CHECK-NEXT: device_num clause
+  // CHECK-NEXT: CallExpr{{.*}} 'long'
+  // CHECK-NEXT: ImplicitCastExpr
+  // CHECK-NEXT: DeclRefExpr{{.*}}'some_long' 'long ()'
+  // CHECK-NEXT: device_type(DT)
+  // CHECK-NEXT: if clause
+  // CHECK-NEXT: BinaryOperator{{.*}}'bool' '<'
+  // CHECK-NEXT: ImplicitCastExpr{{.*}}'long'
+  // CHECK-NEXT: CallExpr{{.*}}'int'
+  // CHECK-NEXT: ImplicitCastExpr
+  // CHECK-NEXT: DeclRefExpr{{.*}}'some_int' 'int ()'
+  // CHECK-NEXT: CallExpr{{.*}} 'long'
+  // CHECK-NEXT: ImplicitCastExpr
+  // CHECK-NEXT: DeclRefExpr{{.*}}'some_long' 'long ()'
+}
+
+template<typename T>
+void TemplFunc(T t) {
+  // CHECK-LABEL: FunctionTemplateDecl {{.*}}TemplFunc
+  // CHECK-NEXT: TemplateTypeParmDecl
+  // CHECK-NEXT: FunctionDecl{{.*}}TemplFunc
+  // CHECK-NEXT: ParmVarDecl{{.*}} t 'T'
+  // CHECK-NEXT: CompoundStmt
+
+#pragma acc set default_async(T::value) device_num(t) device_type(DT) if (T::value < t)
+  // CHECK-NEXT: OpenACCSetConstruct{{.*}}set
+  // CHECK-NEXT: default_async clause
+  // CHECK-NEXT: DependentScopeDeclRefExpr{{.*}} '<dependent type>'
+  // CHECK-NEXT: NestedNameSpecifier TypeSpec 'T'
+  // CHECK-NEXT: device_num clause
+  // CHECK-NEXT: DeclRefExpr{{.*}}'t' 'T'
+  // CHECK-NEXT: device_type(DT)
+  // CHECK-NEXT: if clause
+  // CHECK-NEXT: BinaryOperator{{.*}}'<dependent type>' '<'
+  // CHECK-NEXT: DependentScopeDeclRefExpr{{.*}} '<dependent type>'
+  // CHECK-NEXT: NestedNameSpecifier TypeSpec 'T'
+  // CHECK-NEXT: DeclRefExpr{{.*}}'t' 'T'
+
+  // Instantiation:
+  // CHECK-NEXT: FunctionDecl{{.*}} TemplFunc 'void (SomeStruct)' implicit_instantiation
+  // CHECK-NEXT: TemplateArgument type 'SomeStruct'
+  // CHECK-NEXT: RecordType{{.*}} 'SomeStruct'
+  // CHECK-NEXT: CXXRecord{{.*}} 'SomeStruct'
+  // CHECK-NEXT: ParmVarDecl{{.*}} t 'SomeStruct'
+  // CHECK-NEXT: CompoundStmt
+
+  // CHECK-NEXT: OpenACCSetConstruct{{.*}}set
+  // CHECK-NEXT: default_async clause
+  // CHECK-NEXT: ImplicitCastExpr {{.*}}'unsigned int'
+  // CHECK-NEXT: DeclRefExpr{{.*}}'value' 'const unsigned int'
+  // CHECK-NEXT: NestedNameSpecifier TypeSpec 'SomeStruct'
+  // CHECK-NEXT: device_num clause
+  // CHECK-NEXT: ImplicitCastExpr {{.*}}'unsigned int'
+  // CHECK-NEXT: CXXMemberCallExpr{{.*}}'unsigned int'
+  // CHECK-NEXT: MemberExpr{{.*}}.operator unsigned int
+  // CHECK-NEXT: DeclRefExpr{{.*}}'t' 'SomeStruct'
+  // CHECK-NEXT: device_type(DT)
+  // CHECK-NEXT: if clause
+  // CHECK-NEXT: BinaryOperator{{.*}}'bool' '<'
+  // CHECK-NEXT: ImplicitCastExpr {{.*}}'unsigned int'
+  // CHECK-NEXT: DeclRefExpr{{.*}}'value' 'const unsigned int'
+  // CHECK-NEXT: NestedNameSpecifier TypeSpec 'SomeStruct'
+  // CHECK-NEXT: ImplicitCastExpr {{.*}}'unsigned int'
+  // CHECK-NEXT: CXXMemberCallExpr{{.*}}'unsigned int'
+  // CHECK-NEXT: MemberExpr{{.*}}.operator unsigned int
+  // CHECk-NEXT: DeclRefExpr{{.*}}'t' 'SomeStruct'
+}
+
+struct SomeStruct{
+  static constexpr unsigned value = 5;
+  operator unsigned();
+};
+
+void use() {
+  TemplFunc(SomeStruct{});
+}
+#endif
diff --git a/clang/test/SemaOpenACC/set-construct.cpp b/clang/test/SemaOpenACC/set-construct.cpp
new file mode 100644
index 0000000..23816db
--- /dev/null
+++ b/clang/test/SemaOpenACC/set-construct.cpp
@@ -0,0 +1,69 @@
+// RUN: %clang_cc1 %s -fopenacc -verify
+
+struct NotConvertible{} NC;
+short getS();
+int getI();
+
+struct AmbiguousConvert{
+  operator int(); // #AMBIG_INT
+  operator short(); // #AMBIG_SHORT
+  operator float();
+} Ambiguous;
+
+struct ExplicitConvertOnly {
+  explicit operator int() const; // #EXPL_CONV
+} Explicit;
+
+void uses() {
+#pragma acc set default_async(getI())
+#pragma acc set device_num(getI())
+#pragma acc set device_type(getI)
+#pragma acc set device_type(getI) if (getI() < getS())
+
+  // expected-error@+1{{value of type 'struct NotConvertible' is not contextually convertible to 'bool'}}
+#pragma acc set if (NC) device_type(I)
+
+  // expected-error@+2{{OpenACC 'set' construct must have at least one 'default_async', 'device_num', 'device_type' or 'if' clause}}
+  // expected-error@+1{{OpenACC clause 'device_num' requires expression of integer type ('struct NotConvertible' invalid)}}
+#pragma acc set device_num(NC)
+  // expected-error@+4{{OpenACC 'set' construct must have at least one 'default_async', 'device_num', 'device_type' or 'if' clause}}
+  // expected-error@+3{{multiple conversions from expression type 'struct AmbiguousConvert' to an integral type}}
+  // expected-note@#AMBIG_INT{{conversion to integral type 'int'}}
+  // expected-note@#AMBIG_SHORT{{conversion to integral type 'short'}}
+#pragma acc set device_num(Ambiguous)
+  // expected-error@+2{{OpenACC integer expression requires explicit conversion from 'struct ExplicitConvertOnly' to 'int'}}
+  // expected-note@#EXPL_CONV{{conversion to integral type 'int'}}
+#pragma acc set device_num(Explicit)
+
+  // expected-error@+2{{OpenACC clause 'default_async' requires expression of integer type ('struct NotConvertible' invalid)}}
+  // expected-error@+1{{OpenACC 'set' construct must have at least one 'default_async', 'device_num', 'device_type' or 'if' clause}}
+#pragma acc set default_async(NC)
+  // expected-error@+4{{multiple conversions from expression type 'struct AmbiguousConvert' to an integral type}}
+  // expected-note@#AMBIG_INT{{conversion to integral type 'int'}}
+  // expected-note@#AMBIG_SHORT{{conversion to integral type 'short'}}
+  // expected-error@+1{{OpenACC 'set' construct must have at least one 'default_async', 'device_num', 'device_type' or 'if' clause}}
+#pragma acc set default_async(Ambiguous)
+  // expected-error@+2{{OpenACC integer expression requires explicit conversion from 'struct ExplicitConvertOnly' to 'int'}}
+  // expected-note@#EXPL_CONV{{conversion to integral type 'int'}}
+#pragma acc set default_async(Explicit)
+
+  // expected-error@+1{{OpenACC 'set' construct must have at least one 'default_async', 'device_num', 'device_type' or 'if' clause}}
+#pragma acc set
+
+#pragma acc set if (true)
+
+  // expected-error@+2{{'default_async' clause cannot appear more than once on a 'set' directive}}
+  // expected-note@+1{{previous clause is here}}
+#pragma acc set default_async(getI()) default_async(getI())
+
+  // expected-error@+2{{'device_num' clause cannot appear more than once on a 'set' directive}}
+  // expected-note@+1{{previous clause is here}}
+#pragma acc set device_num(getI()) device_num(getI())
+
+  // expected-error@+2{{'device_type' clause cannot appear more than once on a 'set' directive}}
+  // expected-note@+1{{previous clause is here}}
+#pragma acc set device_type(I) device_type(I)
+  // expected-error@+2{{'if' clause cannot appear more than once on a 'set' directive}}
+  // expected-note@+1{{previous clause is here}}
+#pragma acc set device_type(I) if(true) if (true)
+}
diff --git a/clang/test/SemaOpenACC/unimplemented-construct.c b/clang/test/SemaOpenACC/unimplemented-construct.c
index 42737eb..b22bfa5 100644
--- a/clang/test/SemaOpenACC/unimplemented-construct.c
+++ b/clang/test/SemaOpenACC/unimplemented-construct.c
@@ -4,8 +4,8 @@
 #pragma acc routine
 
 struct S {
-// expected-warning@+1{{OpenACC construct 'set' not yet implemented, pragma ignored}}
-#pragma acc set
+// expected-warning@+1{{OpenACC construct 'declare' not yet implemented, pragma ignored}}
+#pragma acc declare 
 int foo;
 };
 
diff --git a/clang/test/SemaOpenACC/update-construct-ast.cpp b/clang/test/SemaOpenACC/update-construct-ast.cpp
new file mode 100644
index 0000000..9048e88
--- /dev/null
+++ b/clang/test/SemaOpenACC/update-construct-ast.cpp
@@ -0,0 +1,267 @@
+// RUN: %clang_cc1 %s -fopenacc -ast-dump | FileCheck %s
+
+// Test this with PCH.
+// RUN: %clang_cc1 %s -fopenacc -emit-pch -o %t %s
+// RUN: %clang_cc1 %s -fopenacc -include-pch %t -ast-dump-all | FileCheck %s
+
+#ifndef PCH_HELPER
+#define PCH_HELPER
+
+int some_int();
+long some_long();
+
+int Global;
+short GlobalArray[5];
+
+
+void NormalFunc() {
+  // CHECK-LABEL: NormalFunc
+  // CHECK-NEXT: CompoundStmt
+
+#pragma acc update if_present if (some_int() < some_long())
+  // CHECK-NEXT: OpenACCUpdateConstruct{{.*}}update
+  // CHECK-NEXT: if_present clause
+  // CHECK-NEXT: if clause
+  // CHECK-NEXT: BinaryOperator{{.*}}'bool' '<'
+  // CHECK-NEXT: ImplicitCastExpr{{.*}}'long'
+  // CHECK-NEXT: CallExpr{{.*}}'int'
+  // CHECK-NEXT: ImplicitCastExpr
+  // CHECK-NEXT: DeclRefExpr{{.*}}'some_int' 'int ()'
+  // CHECK-NEXT: CallExpr{{.*}} 'long'
+  // CHECK-NEXT: ImplicitCastExpr
+  // CHECK-NEXT: DeclRefExpr{{.*}}'some_long' 'long ()'
+
+#pragma acc update wait async device_type(A) dtype(B)
+  // CHECK-NEXT: OpenACCUpdateConstruct{{.*}}update
+  // CHECK-NEXT: wait clause
+  // CHECK-NEXT: <<<NULL>>>
+  // CHECK-NEXT: async clause
+  // CHECK-NEXT: device_type(A)
+  // CHECK-NEXT: dtype(B)
+#pragma acc update wait(some_int(), some_long()) async(some_int())
+  // CHECK-NEXT: OpenACCUpdateConstruct{{.*}}update
+  // CHECK-NEXT: wait clause
+  // CHECK-NEXT: <<<NULL>>>
+  // CHECK-NEXT: CallExpr{{.*}}'int'
+  // CHECK-NEXT: ImplicitCastExpr
+  // CHECK-NEXT: DeclRefExpr{{.*}}'some_int' 'int ()'
+  // CHECK-NEXT: CallExpr{{.*}}'long'
+  // CHECK-NEXT: ImplicitCastExpr
+  // CHECK-NEXT: DeclRefExpr{{.*}}'some_long' 'long ()'
+  // CHECK-NEXT: async clause
+  // CHECK-NEXT: CallExpr{{.*}}'int'
+  // CHECK-NEXT: ImplicitCastExpr
+  // CHECK-NEXT: DeclRefExpr{{.*}}'some_int' 'int ()'
+#pragma acc update wait(queues:some_int(), some_long())
+  // CHECK-NEXT: OpenACCUpdateConstruct{{.*}}update
+  // CHECK-NEXT: wait clause
+  // CHECK-NEXT: <<<NULL>>>
+  // CHECK-NEXT: CallExpr{{.*}}'int'
+  // CHECK-NEXT: ImplicitCastExpr
+  // CHECK-NEXT: DeclRefExpr{{.*}}'some_int' 'int ()'
+  // CHECK-NEXT: CallExpr{{.*}}'long'
+  // CHECK-NEXT: ImplicitCastExpr
+  // CHECK-NEXT: DeclRefExpr{{.*}}'some_long' 'long ()'
+#pragma acc update wait(devnum: some_int() :some_int(), some_long())
+  // CHECK-NEXT: OpenACCUpdateConstruct{{.*}}update
+  // CHECK-NEXT: wait clause
+  // CHECK-NEXT: CallExpr{{.*}}'int'
+  // CHECK-NEXT: ImplicitCastExpr
+  // CHECK-NEXT: DeclRefExpr{{.*}}'some_int' 'int ()'
+  // CHECK-NEXT: CallExpr{{.*}}'int'
+  // CHECK-NEXT: ImplicitCastExpr
+  // CHECK-NEXT: DeclRefExpr{{.*}}'some_int' 'int ()'
+  // CHECK-NEXT: CallExpr{{.*}}'long'
+  // CHECK-NEXT: ImplicitCastExpr
+  // CHECK-NEXT: DeclRefExpr{{.*}}'some_long' 'long ()'
+
+#pragma acc update self(Global, GlobalArray, GlobalArray[0], GlobalArray[0:1])
+  // CHECK-NEXT: OpenACCUpdateConstruct{{.*}}update
+  // CHECK-NEXT: self clause
+  // CHECK-NEXT: DeclRefExpr{{.*}}'Global' 'int'
+  // CHECK-NEXT: DeclRefExpr{{.*}}'GlobalArray' 'short[5]'
+  // CHECK-NEXT: ArraySubscriptExpr{{.*}} 'short' lvalue
+  // CHECK-NEXT: ImplicitCastExpr
+  // CHECK-NEXT: DeclRefExpr{{.*}}'GlobalArray' 'short[5]'
+  // CHECK-NEXT: IntegerLiteral{{.*}} 0
+  // CHECK-NEXT: ArraySectionExpr
+  // CHECK-NEXT: ImplicitCastExpr
+  // CHECK-NEXT: DeclRefExpr{{.*}}'GlobalArray' 'short[5]'
+  // CHECK-NEXT: IntegerLiteral{{.*}} 0
+  // CHECK-NEXT: IntegerLiteral{{.*}} 1
+}
+
+template<typename T>
+void TemplFunc(T t) {
+  // CHECK-LABEL: FunctionTemplateDecl {{.*}}TemplFunc
+  // CHECK-NEXT: TemplateTypeParmDecl
+  // CHECK-NEXT: FunctionDecl{{.*}}TemplFunc
+  // CHECK-NEXT: ParmVarDecl{{.*}} t 'T'
+  // CHECK-NEXT: CompoundStmt
+
+#pragma acc update if_present if (T::value < t)
+  // CHECK-NEXT: OpenACCUpdateConstruct{{.*}}update
+  // CHECK-NEXT: if_present clause
+  // CHECK-NEXT: if clause
+  // CHECK-NEXT: BinaryOperator{{.*}}'<dependent type>' '<'
+  // CHECK-NEXT: DependentScopeDeclRefExpr{{.*}} '<dependent type>'
+  // CHECK-NEXT: NestedNameSpecifier TypeSpec 'T'
+  // CHECK-NEXT: DeclRefExpr{{.*}}'t' 'T'
+
+#pragma acc update wait async device_type(T) dtype(U)
+  // CHECK-NEXT: OpenACCUpdateConstruct{{.*}}update
+  // CHECK-NEXT: wait clause
+  // CHECK-NEXT: <<<NULL>>>
+  // CHECK-NEXT: async clause
+  // CHECK-NEXT: device_type(T)
+  // CHECK-NEXT: dtype(U)
+#pragma acc update wait(T::value, t) async(T::value)
+  // CHECK-NEXT: OpenACCUpdateConstruct{{.*}}update
+  // CHECK-NEXT: wait clause
+  // CHECK-NEXT: <<<NULL>>>
+  // CHECK-NEXT: DependentScopeDeclRefExpr{{.*}}'<dependent type>'
+  // CHECK-NEXT: NestedNameSpecifier TypeSpec 'T'
+  // CHECK-NEXT: DeclRefExpr{{.*}} 't' 'T'
+  // CHECK-NEXT: async clause
+  // CHECK-NEXT: DependentScopeDeclRefExpr{{.*}}'<dependent type>'
+  // CHECK-NEXT: NestedNameSpecifier TypeSpec 'T'
+#pragma acc update wait(queues:T::value, t) async(t)
+  // CHECK-NEXT: OpenACCUpdateConstruct{{.*}}update
+  // CHECK-NEXT: wait clause
+  // CHECK-NEXT: <<<NULL>>>
+  // CHECK-NEXT: DependentScopeDeclRefExpr{{.*}}'<dependent type>'
+  // CHECK-NEXT: NestedNameSpecifier TypeSpec 'T'
+  // CHECK-NEXT: DeclRefExpr{{.*}} 't' 'T'
+  // CHECK-NEXT: async clause
+  // CHECK-NEXT: DeclRefExpr{{.*}} 't' 'T'
+#pragma acc update wait(devnum: T::value:t, T::value)
+  // CHECK-NEXT: OpenACCUpdateConstruct{{.*}}update
+  // CHECK-NEXT: wait clause
+  // CHECK-NEXT: DependentScopeDeclRefExpr{{.*}}'<dependent type>'
+  // CHECK-NEXT: NestedNameSpecifier TypeSpec 'T'
+  // CHECK-NEXT: DeclRefExpr{{.*}} 't' 'T'
+  // CHECK-NEXT: DependentScopeDeclRefExpr{{.*}}'<dependent type>'
+  // CHECK-NEXT: NestedNameSpecifier TypeSpec 'T'
+
+  decltype(T::value) Local = 0, LocalArray[5] = {};
+  // CHECK-NEXT: DeclStmt 
+  // CHECK-NEXT: VarDecl
+  // CHECK-NEXT: IntegerLiteral
+  // CHECK-NEXT: VarDecl
+  // CHECK-NEXT: InitListExpr
+
+#pragma acc update self(Local, LocalArray, LocalArray[0], LocalArray[0:1])
+  // CHECK-NEXT: OpenACCUpdateConstruct{{.*}}update
+  // CHECK-NEXT: self clause
+  // CHECK-NEXT: DeclRefExpr{{.*}} 'Local' 'decltype(T::value)'
+  // CHECK-NEXT: DeclRefExpr{{.*}} 'LocalArray' 'decltype(T::value)[5]'
+  // CHECK-NEXT: ArraySubscriptExpr
+  // CHECK-NEXT: DeclRefExpr{{.*}} 'LocalArray' 'decltype(T::value)[5]'
+  // CHECK-NEXT: IntegerLiteral{{.*}}0
+  // CHECK-NEXT: ArraySectionExpr
+  // CHECK-NEXT: DeclRefExpr{{.*}} 'LocalArray' 'decltype(T::value)[5]'
+  // CHECK-NEXT: IntegerLiteral{{.*}}0
+  // CHECK-NEXT: IntegerLiteral{{.*}}1
+
+  // Instantiation:
+  // CHECK-NEXT: FunctionDecl{{.*}} TemplFunc 'void (SomeStruct)' implicit_instantiation
+  // CHECK-NEXT: TemplateArgument type 'SomeStruct'
+  // CHECK-NEXT: RecordType{{.*}} 'SomeStruct'
+  // CHECK-NEXT: CXXRecord{{.*}} 'SomeStruct'
+  // CHECK-NEXT: ParmVarDecl{{.*}} t 'SomeStruct'
+  // CHECK-NEXT: CompoundStmt
+
+  // CHECK-NEXT: OpenACCUpdateConstruct{{.*}}update
+  // CHECK-NEXT: if_present clause
+  // CHECK-NEXT: if clause
+  // CHECK-NEXT: BinaryOperator{{.*}}'bool' '<'
+  // CHECK-NEXT: ImplicitCastExpr {{.*}}'unsigned int'
+  // CHECK-NEXT: DeclRefExpr{{.*}}'value' 'const unsigned int'
+  // CHECK-NEXT: NestedNameSpecifier TypeSpec 'SomeStruct'
+  // CHECK-NEXT: ImplicitCastExpr {{.*}}'unsigned int'
+  // CHECK-NEXT: CXXMemberCallExpr{{.*}}'unsigned int'
+  // CHECK-NEXT: MemberExpr{{.*}}.operator unsigned int
+  // CHECK-NEXT: DeclRefExpr{{.*}}'t' 'SomeStruct'
+
+  // CHECK-NEXT: OpenACCUpdateConstruct{{.*}}update
+  // CHECK-NEXT: wait clause
+  // CHECK-NEXT: <<<NULL>>>
+  // CHECK-NEXT: async clause
+  // CHECK-NEXT: device_type(T)
+  // CHECK-NEXT: dtype(U)
+
+  // CHECK-NEXT: OpenACCUpdateConstruct{{.*}}update
+  // CHECK-NEXT: wait clause
+  // CHECK-NEXT: <<<NULL>>>
+  // CHECK-NEXT: ImplicitCastExpr{{.*}}'unsigned int'
+  // CHECK-NEXT: DeclRefExpr{{.*}}'value' 'const unsigned int'
+  // CHECK-NEXT: NestedNameSpecifier TypeSpec 'SomeStruct'
+  // CHECK-NEXT: ImplicitCastExpr{{.*}}'unsigned int'
+  // CHECK-NEXT: CXXMemberCallExpr{{.*}}'unsigned int'
+  // CHECK-NEXT: MemberExpr{{.*}}.operator unsigned int
+  // CHECK-NEXT: DeclRefExpr{{.*}}'t' 'SomeStruct'
+  // CHECK-NEXT: async clause
+  // CHECK-NEXT: ImplicitCastExpr{{.*}}'unsigned int'
+  // CHECK-NEXT: DeclRefExpr{{.*}}'value' 'const unsigned int'
+  // CHECK-NEXT: NestedNameSpecifier TypeSpec 'SomeStruct'
+
+  // CHECK-NEXT: OpenACCUpdateConstruct{{.*}}update
+  // CHECK-NEXT: wait clause
+  // CHECK-NEXT: <<<NULL>>>
+  // CHECK-NEXT: ImplicitCastExpr{{.*}}'unsigned int'
+  // CHECK-NEXT: DeclRefExpr{{.*}}'value' 'const unsigned int'
+  // CHECK-NEXT: NestedNameSpecifier TypeSpec 'SomeStruct'
+  // CHECK-NEXT: ImplicitCastExpr{{.*}}'unsigned int'
+  // CHECK-NEXT: CXXMemberCallExpr{{.*}}'unsigned int'
+  // CHECK-NEXT: MemberExpr{{.*}}.operator unsigned int
+  // CHECK-NEXT: DeclRefExpr{{.*}}'t' 'SomeStruct'
+  // CHECK-NEXT: async clause
+  // CHECK-NEXT: ImplicitCastExpr{{.*}}'unsigned int'
+  // CHECK-NEXT: CXXMemberCallExpr{{.*}}'unsigned int'
+  // CHECK-NEXT: MemberExpr{{.*}}.operator unsigned int
+  // CHECK-NEXT: DeclRefExpr{{.*}}'t' 'SomeStruct'
+
+  // CHECK-NEXT: OpenACCUpdateConstruct{{.*}}update
+  // CHECK-NEXT: wait clause
+  // CHECK-NEXT: ImplicitCastExpr{{.*}}'unsigned int'
+  // CHECK-NEXT: DeclRefExpr{{.*}}'value' 'const unsigned int'
+  // CHECK-NEXT: NestedNameSpecifier TypeSpec 'SomeStruct'
+  // CHECK-NEXT: ImplicitCastExpr{{.*}}'unsigned int'
+  // CHECK-NEXT: CXXMemberCallExpr{{.*}}'unsigned int'
+  // CHECK-NEXT: MemberExpr{{.*}}.operator unsigned int
+  // CHECK-NEXT: DeclRefExpr{{.*}}'t' 'SomeStruct'
+  // CHECK-NEXT: ImplicitCastExpr{{.*}}'unsigned int'
+  // CHECK-NEXT: DeclRefExpr{{.*}}'value' 'const unsigned int'
+  // CHECK-NEXT: NestedNameSpecifier TypeSpec 'SomeStruct'
+
+  // CHECK-NEXT: DeclStmt 
+  // CHECK-NEXT: VarDecl
+  // CHECK-NEXT: ImplicitCastExpr
+  // CHECK-NEXT: IntegerLiteral
+  // CHECK-NEXT: VarDecl
+  // CHECK-NEXT: InitListExpr
+  // CHECK-NEXT: array_filler
+
+  // CHECK-NEXT: OpenACCUpdateConstruct{{.*}}update
+  // CHECK-NEXT: self clause
+  // CHECK-NEXT: DeclRefExpr{{.*}} 'Local' 'decltype(SomeStruct::value)':'const unsigned int'
+  // CHECK-NEXT: DeclRefExpr{{.*}} 'LocalArray' 'decltype(SomeStruct::value)[5]'
+  // CHECK-NEXT: ArraySubscriptExpr
+  // CHECK-NEXT: ImplicitCastExpr
+  // CHECK-NEXT: DeclRefExpr{{.*}} 'LocalArray' 'decltype(SomeStruct::value)[5]'
+  // CHECK-NEXT: IntegerLiteral{{.*}}0
+  // CHECK-NEXT: ArraySectionExpr
+  // CHECK-NEXT: ImplicitCastExpr
+  // CHECK-NEXT: DeclRefExpr{{.*}} 'LocalArray' 'decltype(SomeStruct::value)[5]'
+  // CHECK-NEXT: IntegerLiteral{{.*}}0
+  // CHECK-NEXT: IntegerLiteral{{.*}}1
+}
+
+struct SomeStruct{
+  static constexpr unsigned value = 5;
+  operator unsigned();
+};
+void use() {
+  TemplFunc(SomeStruct{});
+}
+#endif
diff --git a/clang/test/SemaOpenACC/update-construct.cpp b/clang/test/SemaOpenACC/update-construct.cpp
new file mode 100644
index 0000000..2abd7a3
--- /dev/null
+++ b/clang/test/SemaOpenACC/update-construct.cpp
@@ -0,0 +1,167 @@
+// RUN: %clang_cc1 %s -fopenacc -verify
+
+struct NotConvertible{} NC;
+int getI();
+void uses() {
+  int Var;
+#pragma acc update async self(Var)
+#pragma acc update wait self(Var)
+#pragma acc update self(Var) device_type(I)
+#pragma acc update if(true) self(Var)
+#pragma acc update if_present self(Var)
+#pragma acc update self(Var)
+  // expected-warning@+1{{OpenACC clause 'host' not yet implemented}}
+#pragma acc update host(Var)
+  // expected-warning@+1{{OpenACC clause 'device' not yet implemented}}
+#pragma acc update device(Var)
+
+  // expected-error@+2{{OpenACC clause 'if' may not follow a 'device_type' clause in a 'update' construct}}
+  // expected-note@+1{{previous clause is here}}
+#pragma acc update self(Var) device_type(I) if(true)
+  // expected-error@+2{{OpenACC clause 'if_present' may not follow a 'device_type' clause in a 'update' construct}}
+  // expected-note@+1{{previous clause is here}}
+#pragma acc update self(Var) device_type(I) if_present
+  // expected-error@+2{{OpenACC clause 'self' may not follow a 'device_type' clause in a 'update' construct}}
+  // expected-note@+1{{previous clause is here}}
+#pragma acc update device_type(I) self(Var)
+  // expected-error@+2{{OpenACC clause 'host' may not follow a 'device_type' clause in a 'update' construct}}
+  // expected-note@+1{{previous clause is here}}
+#pragma acc update device_type(I) host(Var)
+  // expected-error@+2{{OpenACC clause 'device' may not follow a 'device_type' clause in a 'update' construct}}
+  // expected-note@+1{{previous clause is here}}
+#pragma acc update device_type(I) device(Var)
+  // These 2 are OK.
+#pragma acc update self(Var) device_type(I) async
+#pragma acc update self(Var) device_type(I) wait
+  // Unless otherwise specified, we assume 'device_type' can happen after itself.
+#pragma acc update self(Var) device_type(I) device_type(I)
+
+  // TODO: OpenACC: These should diagnose because there isn't at least 1 of
+  // 'self', 'host', or 'device'.
+#pragma acc update async
+#pragma acc update wait
+#pragma acc update device_type(I)
+#pragma acc update if(true)
+#pragma acc update if_present
+
+  // expected-error@+1{{value of type 'struct NotConvertible' is not contextually convertible to 'bool'}}
+#pragma acc update if (NC) device_type(I)
+
+  // expected-error@+2{{OpenACC 'if' clause cannot appear more than once on a 'update' directive}}
+  // expected-note@+1{{previous clause is here}}
+#pragma acc update if(true) if (false)
+
+  // TODO: OpenACC: There is restrictions on the contents of a 'varlist', so
+  // those should be checked here too.
+
+  // Cannot be the body of an 'if', 'while', 'do', 'switch', or
+  // 'label'.
+  // expected-error@+3{{OpenACC 'update' construct may not appear in place of the statement following an if statement}}
+  if (true)
+    // expected-warning@+1{{OpenACC clause 'device' not yet implemented}}
+#pragma acc update device(Var)
+
+  // expected-error@+3{{OpenACC 'update' construct may not appear in place of the statement following a while statement}}
+  while (true)
+    // expected-warning@+1{{OpenACC clause 'device' not yet implemented}}
+#pragma acc update device(Var)
+
+  // expected-error@+3{{OpenACC 'update' construct may not appear in place of the statement following a do statement}}
+  do
+    // expected-warning@+1{{OpenACC clause 'device' not yet implemented}}
+#pragma acc update device(Var)
+  while (true);
+
+  // expected-error@+3{{OpenACC 'update' construct may not appear in place of the statement following a switch statement}}
+  switch(Var)
+    // expected-warning@+1{{OpenACC clause 'device' not yet implemented}}
+#pragma acc update device(Var)
+
+  // expected-error@+3{{OpenACC 'update' construct may not appear in place of the statement following a label statement}}
+  LABEL:
+    // expected-warning@+1{{OpenACC clause 'device' not yet implemented}}
+#pragma acc update device(Var)
+
+  // For loops are OK.
+  for (;;)
+    // expected-warning@+1{{OpenACC clause 'device' not yet implemented}}
+#pragma acc update device(Var)
+
+  // Checking for 'async', which requires an 'int' expression.
+#pragma acc update async
+
+#pragma acc update async(getI())
+  // expected-error@+2{{expected ')'}}
+  // expected-note@+1{{to match this '('}}
+#pragma acc update async(getI(), getI())
+  // expected-error@+2{{OpenACC 'async' clause cannot appear more than once on a 'update' directive}}
+  // expected-note@+1{{previous clause is here}}
+#pragma acc update async(getI()) async(getI())
+  // expected-error@+1{{OpenACC clause 'async' requires expression of integer type ('struct NotConvertible' invalid)}}
+#pragma acc update async(NC)
+
+  // Checking for 'wait', which has a complicated set arguments.
+#pragma acc update wait
+#pragma acc update wait()
+#pragma acc update wait(getI(), getI())
+#pragma acc update wait(devnum: getI():  getI())
+#pragma acc update wait(devnum: getI(): queues: getI(), getI())
+  // expected-error@+1{{OpenACC clause 'wait' requires expression of integer type ('struct NotConvertible' invalid)}}
+#pragma acc update wait(devnum:NC : 5)
+  // expected-error@+1{{OpenACC clause 'wait' requires expression of integer type ('struct NotConvertible' invalid)}}
+#pragma acc update wait(devnum:5 : NC)
+
+    int arr[5];
+  // expected-error@+3{{OpenACC clause 'wait' requires expression of integer type ('int[5]' invalid)}}
+  // expected-error@+2{{OpenACC clause 'wait' requires expression of integer type ('int[5]' invalid)}}
+  // expected-error@+1{{OpenACC clause 'wait' requires expression of integer type ('struct NotConvertible' invalid)}}
+#pragma acc update wait(devnum:arr : queues: arr, NC, 5)
+}
+
+struct SomeS {
+  int Array[5];
+  int MemberOfComp;
+};
+
+template<typename I, typename T>
+void varlist_restrictions_templ() {
+  I iArray[5];
+  T Single;
+  T Array[5];
+
+  // Members of a subarray of struct or class type may not appear, but others
+  // are permitted to.
+#pragma acc update self(iArray[0:1])
+
+#pragma acc update self(Array[0:1])
+
+  // expected-error@+1{{OpenACC sub-array is not allowed here}}
+#pragma acc update self(Array[0:1].MemberOfComp)
+}
+
+void varlist_restrictions() {
+  varlist_restrictions_templ<int, SomeS>();// expected-note{{in instantiation of}}
+  int iArray[5];
+  SomeS Single;
+  SomeS Array[5];
+
+  int LocalInt;
+  int *LocalPtr;
+
+#pragma acc update self(LocalInt, LocalPtr, Single)
+
+#pragma acc update self(Single.MemberOfComp)
+
+#pragma acc update self(Single.Array[0:1])
+
+
+  // Members of a subarray of struct or class type may not appear, but others
+  // are permitted to.
+#pragma acc update self(iArray[0:1])
+
+#pragma acc update self(Array[0:1])
+
+  // expected-error@+1{{OpenACC sub-array is not allowed here}}
+#pragma acc update self(Array[0:1].MemberOfComp)
+}
+
diff --git a/clang/test/SemaSPIRV/BuiltIns/distance-errors.c b/clang/test/SemaSPIRV/BuiltIns/distance-errors.c
new file mode 100644
index 0000000..17ea25d
--- /dev/null
+++ b/clang/test/SemaSPIRV/BuiltIns/distance-errors.c
@@ -0,0 +1,23 @@
+// RUN: %clang_cc1 %s -triple spirv-pc-vulkan-compute -verify
+
+typedef float float2 __attribute__((ext_vector_type(2)));
+
+float test_no_second_arg(float2 p0) {
+  return __builtin_spirv_distance(p0);
+  // expected-error@-1 {{too few arguments to function call, expected 2, have 1}}
+}
+
+float test_too_many_arg(float2 p0) {
+  return __builtin_spirv_distance(p0, p0, p0);
+  // expected-error@-1 {{too many arguments to function call, expected 2, have 3}}
+}
+
+float test_double_scalar_inputs(double p0, double p1) {
+  return __builtin_spirv_distance(p0, p1);
+  //  expected-error@-1 {{passing 'double' to parameter of incompatible type '__attribute__((__vector_size__(2 * sizeof(double)))) double' (vector of 2 'double' values)}}
+}
+
+float test_int_scalar_inputs(int p0, int p1) {
+  return __builtin_spirv_distance(p0, p1);
+  //  expected-error@-1 {{passing 'int' to parameter of incompatible type '__attribute__((__vector_size__(2 * sizeof(int)))) int' (vector of 2 'int' values)}}
+}
diff --git a/clang/test/SemaTemplate/concepts.cpp b/clang/test/SemaTemplate/concepts.cpp
index 3124693..f335ca3b 100644
--- a/clang/test/SemaTemplate/concepts.cpp
+++ b/clang/test/SemaTemplate/concepts.cpp
@@ -1165,3 +1165,15 @@ concept C = invalid; // expected-error {{use of undeclared identifier 'invalid'}
 bool val2 = C<int>;
 
 } // namespace GH109780
+
+namespace GH121980 {
+
+template <class>
+concept has_member_difference_type; // expected-error {{expected '='}}
+
+template <has_member_difference_type> struct incrementable_traits; // expected-note {{declared here}}
+
+template <has_member_difference_type Tp>
+struct incrementable_traits<Tp>; // expected-error {{not more specialized than the primary}}
+
+}
diff --git a/clang/test/VFS/external-names.c b/clang/test/VFS/external-names.c
index 5b7c443..dd0b5eb 100644
--- a/clang/test/VFS/external-names.c
+++ b/clang/test/VFS/external-names.c
@@ -47,4 +47,4 @@
 
 // RUN: %clang_cc1 -D REINCLUDE -I %t -ivfsoverlay %t.yaml -Eonly %s -MTfoo -dependency-file %t.dep
 // RUN: cat %t.dep | FileCheck --check-prefix=CHECK-DEP %s
-// CHECK-DEP-NOT: Inputs
+// CHECK-DEP: Inputs{{..?}}external-names.h
diff --git a/clang/test/utils/update_cc_test_checks/Inputs/basic-cplusplus.cpp b/clang/test/utils/update_cc_test_checks/Inputs/basic-cplusplus.cpp
index 98be350..e332528 100644
--- a/clang/test/utils/update_cc_test_checks/Inputs/basic-cplusplus.cpp
+++ b/clang/test/utils/update_cc_test_checks/Inputs/basic-cplusplus.cpp
@@ -1,5 +1,9 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --include-generated-funcs --version 5
 // Basic C++ test for update_cc_test_checks
 // RUN: %clang_cc1 -triple=x86_64-unknown-linux-gnu -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple=x86_64-apple-macho -emit-llvm -o - %s | FileCheck %s --check-prefix=MACHO
+// RUN: %clang_cc1 -triple=x86_64-windows-msvc -emit-llvm -o - %s | FileCheck %s --check-prefix=MSVC
+// RUN: %clang_cc1 -triple=x86_64-windows-gnu -emit-llvm -o - %s | FileCheck %s --check-prefix=MINGW
 
 class Foo {
   int x;
@@ -13,6 +17,8 @@ public:
   inline int function_defined_out_of_line(int arg) const;
 };
 
+[[clang::noinline]] static int static_noinline_fn(int arg) { return arg; }
+
 Foo::Foo(int x) : x(x) {}
 Foo::~Foo() {}
 int Foo::function_defined_out_of_line(int arg) const { return x - arg; }
@@ -22,4 +28,5 @@ int main() {
   Foo f(1);
   f.function_defined_inline(2);
   f.function_defined_out_of_line(3);
+  return static_noinline_fn(0);
 }
diff --git a/clang/test/utils/update_cc_test_checks/Inputs/basic-cplusplus.cpp.expected b/clang/test/utils/update_cc_test_checks/Inputs/basic-cplusplus.cpp.expected
index c42dc07..96370b4 100644
--- a/clang/test/utils/update_cc_test_checks/Inputs/basic-cplusplus.cpp.expected
+++ b/clang/test/utils/update_cc_test_checks/Inputs/basic-cplusplus.cpp.expected
@@ -1,6 +1,9 @@
-// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --include-generated-funcs --version 5
 // Basic C++ test for update_cc_test_checks
 // RUN: %clang_cc1 -triple=x86_64-unknown-linux-gnu -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple=x86_64-apple-macho -emit-llvm -o - %s | FileCheck %s --check-prefix=MACHO
+// RUN: %clang_cc1 -triple=x86_64-windows-msvc -emit-llvm -o - %s | FileCheck %s --check-prefix=MSVC
+// RUN: %clang_cc1 -triple=x86_64-windows-gnu -emit-llvm -o - %s | FileCheck %s --check-prefix=MINGW
 
 class Foo {
   int x;
@@ -8,52 +11,109 @@ class Foo {
 public:
   explicit Foo(int x);
   ~Foo();
-// CHECK-LABEL: @_ZNK3Foo23function_defined_inlineEi(
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[THIS_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[ARG_ADDR:%.*]] = alloca i32, align 4
-// CHECK-NEXT:    store ptr [[THIS:%.*]], ptr [[THIS_ADDR]], align 8
-// CHECK-NEXT:    store i32 [[ARG:%.*]], ptr [[ARG_ADDR]], align 4
-// CHECK-NEXT:    [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
-// CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[ARG_ADDR]], align 4
-// CHECK-NEXT:    [[X:%.*]] = getelementptr inbounds nuw [[CLASS_FOO:%.*]], ptr [[THIS1]], i32 0, i32 0
-// CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 4
-// CHECK-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP0]], [[TMP1]]
-// CHECK-NEXT:    ret i32 [[ADD]]
-//
   inline int function_defined_inline(int arg) const {
     return arg + x;
   }
   inline int function_defined_out_of_line(int arg) const;
 };
 
-// CHECK-LABEL: @_ZN3FooC1Ei(
-// CHECK-NEXT:  entry:
+[[clang::noinline]] static int static_noinline_fn(int arg) { return arg; }
+
+Foo::Foo(int x) : x(x) {}
+Foo::~Foo() {}
+int Foo::function_defined_out_of_line(int arg) const { return x - arg; }
+
+// Call the inline methods to ensure the LLVM IR is generated:
+int main() {
+  Foo f(1);
+  f.function_defined_inline(2);
+  f.function_defined_out_of_line(3);
+  return static_noinline_fn(0);
+}
+// CHECK-LABEL: define dso_local void @_ZN3FooC2Ei(
+// CHECK-SAME: ptr noundef nonnull align 4 dereferenceable(4) [[THIS:%.*]], i32 noundef [[X:%.*]]) unnamed_addr #[[ATTR0:[0-9]+]] align 2 {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[THIS_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[X_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// CHECK-NEXT:    store i32 [[X]], ptr [[X_ADDR]], align 4
+// CHECK-NEXT:    [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
+// CHECK-NEXT:    [[X2:%.*]] = getelementptr inbounds nuw [[CLASS_FOO:%.*]], ptr [[THIS1]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[X_ADDR]], align 4
+// CHECK-NEXT:    store i32 [[TMP0]], ptr [[X2]], align 4
+// CHECK-NEXT:    ret void
+//
+//
+// CHECK-LABEL: define dso_local void @_ZN3FooC1Ei(
+// CHECK-SAME: ptr noundef nonnull align 4 dereferenceable(4) [[THIS:%.*]], i32 noundef [[X:%.*]]) unnamed_addr #[[ATTR0]] align 2 {
+// CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    [[THIS_ADDR:%.*]] = alloca ptr, align 8
 // CHECK-NEXT:    [[X_ADDR:%.*]] = alloca i32, align 4
-// CHECK-NEXT:    store ptr [[THIS:%.*]], ptr [[THIS_ADDR]], align 8
-// CHECK-NEXT:    store i32 [[X:%.*]], ptr [[X_ADDR]], align 4
+// CHECK-NEXT:    store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// CHECK-NEXT:    store i32 [[X]], ptr [[X_ADDR]], align 4
 // CHECK-NEXT:    [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
 // CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[X_ADDR]], align 4
 // CHECK-NEXT:    call void @_ZN3FooC2Ei(ptr noundef nonnull align 4 dereferenceable(4) [[THIS1]], i32 noundef [[TMP0]])
 // CHECK-NEXT:    ret void
 //
-Foo::Foo(int x) : x(x) {}
-// CHECK-LABEL: @_ZN3FooD1Ev(
-// CHECK-NEXT:  entry:
+//
+// CHECK-LABEL: define dso_local void @_ZN3FooD2Ev(
+// CHECK-SAME: ptr noundef nonnull align 4 dereferenceable(4) [[THIS:%.*]]) unnamed_addr #[[ATTR0]] align 2 {
+// CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    [[THIS_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    store ptr [[THIS:%.*]], ptr [[THIS_ADDR]], align 8
+// CHECK-NEXT:    store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// CHECK-NEXT:    [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
+// CHECK-NEXT:    ret void
+//
+//
+// CHECK-LABEL: define dso_local void @_ZN3FooD1Ev(
+// CHECK-SAME: ptr noundef nonnull align 4 dereferenceable(4) [[THIS:%.*]]) unnamed_addr #[[ATTR0]] align 2 {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[THIS_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
 // CHECK-NEXT:    [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
 // CHECK-NEXT:    call void @_ZN3FooD2Ev(ptr noundef nonnull align 4 dereferenceable(4) [[THIS1]]) #[[ATTR2:[0-9]+]]
 // CHECK-NEXT:    ret void
 //
-Foo::~Foo() {}
-// CHECK-LABEL: @_ZNK3Foo28function_defined_out_of_lineEi(
-// CHECK-NEXT:  entry:
+//
+// CHECK-LABEL: define dso_local noundef i32 @main(
+// CHECK-SAME: ) #[[ATTR1:[0-9]+]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[F:%.*]] = alloca [[CLASS_FOO:%.*]], align 4
+// CHECK-NEXT:    store i32 0, ptr [[RETVAL]], align 4
+// CHECK-NEXT:    call void @_ZN3FooC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[F]], i32 noundef 1)
+// CHECK-NEXT:    [[CALL:%.*]] = call noundef i32 @_ZNK3Foo23function_defined_inlineEi(ptr noundef nonnull align 4 dereferenceable(4) [[F]], i32 noundef 2)
+// CHECK-NEXT:    [[CALL1:%.*]] = call noundef i32 @_ZNK3Foo28function_defined_out_of_lineEi(ptr noundef nonnull align 4 dereferenceable(4) [[F]], i32 noundef 3)
+// CHECK-NEXT:    [[CALL2:%.*]] = call noundef i32 @_ZL18static_noinline_fni(i32 noundef 0)
+// CHECK-NEXT:    store i32 [[CALL2]], ptr [[RETVAL]], align 4
+// CHECK-NEXT:    call void @_ZN3FooD1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[F]]) #[[ATTR2]]
+// CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[RETVAL]], align 4
+// CHECK-NEXT:    ret i32 [[TMP0]]
+//
+//
+// CHECK-LABEL: define linkonce_odr noundef i32 @_ZNK3Foo23function_defined_inlineEi(
+// CHECK-SAME: ptr noundef nonnull align 4 dereferenceable(4) [[THIS:%.*]], i32 noundef [[ARG:%.*]]) #[[ATTR0]] comdat align 2 {
+// CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    [[THIS_ADDR:%.*]] = alloca ptr, align 8
 // CHECK-NEXT:    [[ARG_ADDR:%.*]] = alloca i32, align 4
-// CHECK-NEXT:    store ptr [[THIS:%.*]], ptr [[THIS_ADDR]], align 8
-// CHECK-NEXT:    store i32 [[ARG:%.*]], ptr [[ARG_ADDR]], align 4
+// CHECK-NEXT:    store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// CHECK-NEXT:    store i32 [[ARG]], ptr [[ARG_ADDR]], align 4
+// CHECK-NEXT:    [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[ARG_ADDR]], align 4
+// CHECK-NEXT:    [[X:%.*]] = getelementptr inbounds nuw [[CLASS_FOO:%.*]], ptr [[THIS1]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 4
+// CHECK-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP0]], [[TMP1]]
+// CHECK-NEXT:    ret i32 [[ADD]]
+//
+//
+// CHECK-LABEL: define linkonce_odr noundef i32 @_ZNK3Foo28function_defined_out_of_lineEi(
+// CHECK-SAME: ptr noundef nonnull align 4 dereferenceable(4) [[THIS:%.*]], i32 noundef [[ARG:%.*]]) #[[ATTR0]] comdat align 2 {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[THIS_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[ARG_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// CHECK-NEXT:    store i32 [[ARG]], ptr [[ARG_ADDR]], align 4
 // CHECK-NEXT:    [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
 // CHECK-NEXT:    [[X:%.*]] = getelementptr inbounds nuw [[CLASS_FOO:%.*]], ptr [[THIS1]], i32 0, i32 0
 // CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[X]], align 4
@@ -61,20 +121,230 @@ Foo::~Foo() {}
 // CHECK-NEXT:    [[SUB:%.*]] = sub nsw i32 [[TMP0]], [[TMP1]]
 // CHECK-NEXT:    ret i32 [[SUB]]
 //
-int Foo::function_defined_out_of_line(int arg) const { return x - arg; }
-
-// Call the inline methods to ensure the LLVM IR is generated:
-// CHECK-LABEL: @main(
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[F:%.*]] = alloca [[CLASS_FOO:%.*]], align 4
-// CHECK-NEXT:    call void @_ZN3FooC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[F]], i32 noundef 1)
-// CHECK-NEXT:    [[CALL:%.*]] = call noundef i32 @_ZNK3Foo23function_defined_inlineEi(ptr noundef nonnull align 4 dereferenceable(4) [[F]], i32 noundef 2)
-// CHECK-NEXT:    [[CALL1:%.*]] = call noundef i32 @_ZNK3Foo28function_defined_out_of_lineEi(ptr noundef nonnull align 4 dereferenceable(4) [[F]], i32 noundef 3)
-// CHECK-NEXT:    call void @_ZN3FooD1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[F]]) #[[ATTR2]]
-// CHECK-NEXT:    ret i32 0
 //
-int main() {
-  Foo f(1);
-  f.function_defined_inline(2);
-  f.function_defined_out_of_line(3);
-}
+// CHECK-LABEL: define internal noundef i32 @_ZL18static_noinline_fni(
+// CHECK-SAME: i32 noundef [[ARG:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[ARG_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    store i32 [[ARG]], ptr [[ARG_ADDR]], align 4
+// CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[ARG_ADDR]], align 4
+// CHECK-NEXT:    ret i32 [[TMP0]]
+//
+//
+// MACHO-LABEL: define void @_ZN3FooC2Ei(
+// MACHO-SAME: ptr noundef nonnull align 4 dereferenceable(4) [[THIS:%.*]], i32 noundef [[X:%.*]]) unnamed_addr #[[ATTR0:[0-9]+]] align 2 {
+// MACHO-NEXT:  [[ENTRY:.*:]]
+// MACHO-NEXT:    [[THIS_ADDR:%.*]] = alloca ptr, align 8
+// MACHO-NEXT:    [[X_ADDR:%.*]] = alloca i32, align 4
+// MACHO-NEXT:    store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// MACHO-NEXT:    store i32 [[X]], ptr [[X_ADDR]], align 4
+// MACHO-NEXT:    [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
+// MACHO-NEXT:    [[X2:%.*]] = getelementptr inbounds nuw [[CLASS_FOO:%.*]], ptr [[THIS1]], i32 0, i32 0
+// MACHO-NEXT:    [[TMP0:%.*]] = load i32, ptr [[X_ADDR]], align 4
+// MACHO-NEXT:    store i32 [[TMP0]], ptr [[X2]], align 4
+// MACHO-NEXT:    ret void
+//
+//
+// MACHO-LABEL: define void @_ZN3FooC1Ei(
+// MACHO-SAME: ptr noundef nonnull align 4 dereferenceable(4) [[THIS:%.*]], i32 noundef [[X:%.*]]) unnamed_addr #[[ATTR0]] align 2 {
+// MACHO-NEXT:  [[ENTRY:.*:]]
+// MACHO-NEXT:    [[THIS_ADDR:%.*]] = alloca ptr, align 8
+// MACHO-NEXT:    [[X_ADDR:%.*]] = alloca i32, align 4
+// MACHO-NEXT:    store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// MACHO-NEXT:    store i32 [[X]], ptr [[X_ADDR]], align 4
+// MACHO-NEXT:    [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
+// MACHO-NEXT:    [[TMP0:%.*]] = load i32, ptr [[X_ADDR]], align 4
+// MACHO-NEXT:    call void @_ZN3FooC2Ei(ptr noundef nonnull align 4 dereferenceable(4) [[THIS1]], i32 noundef [[TMP0]])
+// MACHO-NEXT:    ret void
+//
+//
+// MACHO-LABEL: define void @_ZN3FooD2Ev(
+// MACHO-SAME: ptr noundef nonnull align 4 dereferenceable(4) [[THIS:%.*]]) unnamed_addr #[[ATTR0]] align 2 {
+// MACHO-NEXT:  [[ENTRY:.*:]]
+// MACHO-NEXT:    [[THIS_ADDR:%.*]] = alloca ptr, align 8
+// MACHO-NEXT:    store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// MACHO-NEXT:    [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
+// MACHO-NEXT:    ret void
+//
+//
+// MACHO-LABEL: define void @_ZN3FooD1Ev(
+// MACHO-SAME: ptr noundef nonnull align 4 dereferenceable(4) [[THIS:%.*]]) unnamed_addr #[[ATTR0]] align 2 {
+// MACHO-NEXT:  [[ENTRY:.*:]]
+// MACHO-NEXT:    [[THIS_ADDR:%.*]] = alloca ptr, align 8
+// MACHO-NEXT:    store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// MACHO-NEXT:    [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
+// MACHO-NEXT:    call void @_ZN3FooD2Ev(ptr noundef nonnull align 4 dereferenceable(4) [[THIS1]]) #[[ATTR2:[0-9]+]]
+// MACHO-NEXT:    ret void
+//
+//
+// MACHO-LABEL: define noundef i32 @main(
+// MACHO-SAME: ) #[[ATTR1:[0-9]+]] {
+// MACHO-NEXT:  [[ENTRY:.*:]]
+// MACHO-NEXT:    [[RETVAL:%.*]] = alloca i32, align 4
+// MACHO-NEXT:    [[F:%.*]] = alloca [[CLASS_FOO:%.*]], align 4
+// MACHO-NEXT:    store i32 0, ptr [[RETVAL]], align 4
+// MACHO-NEXT:    call void @_ZN3FooC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[F]], i32 noundef 1)
+// MACHO-NEXT:    [[CALL:%.*]] = call noundef i32 @_ZNK3Foo23function_defined_inlineEi(ptr noundef nonnull align 4 dereferenceable(4) [[F]], i32 noundef 2)
+// MACHO-NEXT:    [[CALL1:%.*]] = call noundef i32 @_ZNK3Foo28function_defined_out_of_lineEi(ptr noundef nonnull align 4 dereferenceable(4) [[F]], i32 noundef 3)
+// MACHO-NEXT:    [[CALL2:%.*]] = call noundef i32 @_ZL18static_noinline_fni(i32 noundef 0)
+// MACHO-NEXT:    store i32 [[CALL2]], ptr [[RETVAL]], align 4
+// MACHO-NEXT:    call void @_ZN3FooD1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[F]]) #[[ATTR2]]
+// MACHO-NEXT:    [[TMP0:%.*]] = load i32, ptr [[RETVAL]], align 4
+// MACHO-NEXT:    ret i32 [[TMP0]]
+//
+//
+// MACHO-LABEL: define linkonce_odr noundef i32 @_ZNK3Foo23function_defined_inlineEi(
+// MACHO-SAME: ptr noundef nonnull align 4 dereferenceable(4) [[THIS:%.*]], i32 noundef [[ARG:%.*]]) #[[ATTR0]] align 2 {
+// MACHO-NEXT:  [[ENTRY:.*:]]
+// MACHO-NEXT:    [[THIS_ADDR:%.*]] = alloca ptr, align 8
+// MACHO-NEXT:    [[ARG_ADDR:%.*]] = alloca i32, align 4
+// MACHO-NEXT:    store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// MACHO-NEXT:    store i32 [[ARG]], ptr [[ARG_ADDR]], align 4
+// MACHO-NEXT:    [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
+// MACHO-NEXT:    [[TMP0:%.*]] = load i32, ptr [[ARG_ADDR]], align 4
+// MACHO-NEXT:    [[X:%.*]] = getelementptr inbounds nuw [[CLASS_FOO:%.*]], ptr [[THIS1]], i32 0, i32 0
+// MACHO-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 4
+// MACHO-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP0]], [[TMP1]]
+// MACHO-NEXT:    ret i32 [[ADD]]
+//
+//
+// MACHO-LABEL: define linkonce_odr noundef i32 @_ZNK3Foo28function_defined_out_of_lineEi(
+// MACHO-SAME: ptr noundef nonnull align 4 dereferenceable(4) [[THIS:%.*]], i32 noundef [[ARG:%.*]]) #[[ATTR0]] align 2 {
+// MACHO-NEXT:  [[ENTRY:.*:]]
+// MACHO-NEXT:    [[THIS_ADDR:%.*]] = alloca ptr, align 8
+// MACHO-NEXT:    [[ARG_ADDR:%.*]] = alloca i32, align 4
+// MACHO-NEXT:    store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// MACHO-NEXT:    store i32 [[ARG]], ptr [[ARG_ADDR]], align 4
+// MACHO-NEXT:    [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
+// MACHO-NEXT:    [[X:%.*]] = getelementptr inbounds nuw [[CLASS_FOO:%.*]], ptr [[THIS1]], i32 0, i32 0
+// MACHO-NEXT:    [[TMP0:%.*]] = load i32, ptr [[X]], align 4
+// MACHO-NEXT:    [[TMP1:%.*]] = load i32, ptr [[ARG_ADDR]], align 4
+// MACHO-NEXT:    [[SUB:%.*]] = sub nsw i32 [[TMP0]], [[TMP1]]
+// MACHO-NEXT:    ret i32 [[SUB]]
+//
+//
+// MACHO-LABEL: define internal noundef i32 @_ZL18static_noinline_fni(
+// MACHO-SAME: i32 noundef [[ARG:%.*]]) #[[ATTR0]] {
+// MACHO-NEXT:  [[ENTRY:.*:]]
+// MACHO-NEXT:    [[ARG_ADDR:%.*]] = alloca i32, align 4
+// MACHO-NEXT:    store i32 [[ARG]], ptr [[ARG_ADDR]], align 4
+// MACHO-NEXT:    [[TMP0:%.*]] = load i32, ptr [[ARG_ADDR]], align 4
+// MACHO-NEXT:    ret i32 [[TMP0]]
+//
+//
+// MSVC-LABEL: define dso_local noundef i32 @main(
+// MSVC-SAME: ) #[[ATTR1:[0-9]+]] {
+// MSVC-NEXT:  [[ENTRY:.*:]]
+// MSVC-NEXT:    [[RETVAL:%.*]] = alloca i32, align 4
+// MSVC-NEXT:    [[F:%.*]] = alloca [[CLASS_FOO:%.*]], align 4
+// MSVC-NEXT:    store i32 0, ptr [[RETVAL]], align 4
+// MSVC-NEXT:    [[CALL:%.*]] = call noundef ptr @"??0Foo@@QEAA@H@Z"(ptr noundef nonnull align 4 dereferenceable(4) [[F]], i32 noundef 1)
+// MSVC-NEXT:    [[CALL1:%.*]] = call noundef i32 @"?function_defined_inline@Foo@@QEBAHH@Z"(ptr noundef nonnull align 4 dereferenceable(4) [[F]], i32 noundef 2)
+// MSVC-NEXT:    [[CALL2:%.*]] = call noundef i32 @"?function_defined_out_of_line@Foo@@QEBAHH@Z"(ptr noundef nonnull align 4 dereferenceable(4) [[F]], i32 noundef 3)
+// MSVC-NEXT:    [[CALL3:%.*]] = call noundef i32 @"?static_noinline_fn@@YAHH@Z"(i32 noundef 0)
+// MSVC-NEXT:    store i32 [[CALL3]], ptr [[RETVAL]], align 4
+// MSVC-NEXT:    call void @"??1Foo@@QEAA@XZ"(ptr noundef nonnull align 4 dereferenceable(4) [[F]]) #[[ATTR2:[0-9]+]]
+// MSVC-NEXT:    [[TMP0:%.*]] = load i32, ptr [[RETVAL]], align 4
+// MSVC-NEXT:    ret i32 [[TMP0]]
+//
+//
+// MINGW-LABEL: define dso_local void @_ZN3FooC2Ei(
+// MINGW-SAME: ptr noundef nonnull align 4 dereferenceable(4) [[THIS:%.*]], i32 noundef [[X:%.*]]) unnamed_addr #[[ATTR0:[0-9]+]] align 2 {
+// MINGW-NEXT:  [[ENTRY:.*:]]
+// MINGW-NEXT:    [[THIS_ADDR:%.*]] = alloca ptr, align 8
+// MINGW-NEXT:    [[X_ADDR:%.*]] = alloca i32, align 4
+// MINGW-NEXT:    store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// MINGW-NEXT:    store i32 [[X]], ptr [[X_ADDR]], align 4
+// MINGW-NEXT:    [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
+// MINGW-NEXT:    [[X2:%.*]] = getelementptr inbounds nuw [[CLASS_FOO:%.*]], ptr [[THIS1]], i32 0, i32 0
+// MINGW-NEXT:    [[TMP0:%.*]] = load i32, ptr [[X_ADDR]], align 4
+// MINGW-NEXT:    store i32 [[TMP0]], ptr [[X2]], align 4
+// MINGW-NEXT:    ret void
+//
+//
+// MINGW-LABEL: define dso_local void @_ZN3FooC1Ei(
+// MINGW-SAME: ptr noundef nonnull align 4 dereferenceable(4) [[THIS:%.*]], i32 noundef [[X:%.*]]) unnamed_addr #[[ATTR0]] align 2 {
+// MINGW-NEXT:  [[ENTRY:.*:]]
+// MINGW-NEXT:    [[THIS_ADDR:%.*]] = alloca ptr, align 8
+// MINGW-NEXT:    [[X_ADDR:%.*]] = alloca i32, align 4
+// MINGW-NEXT:    store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// MINGW-NEXT:    store i32 [[X]], ptr [[X_ADDR]], align 4
+// MINGW-NEXT:    [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
+// MINGW-NEXT:    [[TMP0:%.*]] = load i32, ptr [[X_ADDR]], align 4
+// MINGW-NEXT:    call void @_ZN3FooC2Ei(ptr noundef nonnull align 4 dereferenceable(4) [[THIS1]], i32 noundef [[TMP0]])
+// MINGW-NEXT:    ret void
+//
+//
+// MINGW-LABEL: define dso_local void @_ZN3FooD2Ev(
+// MINGW-SAME: ptr noundef nonnull align 4 dereferenceable(4) [[THIS:%.*]]) unnamed_addr #[[ATTR0]] align 2 {
+// MINGW-NEXT:  [[ENTRY:.*:]]
+// MINGW-NEXT:    [[THIS_ADDR:%.*]] = alloca ptr, align 8
+// MINGW-NEXT:    store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// MINGW-NEXT:    [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
+// MINGW-NEXT:    ret void
+//
+//
+// MINGW-LABEL: define dso_local void @_ZN3FooD1Ev(
+// MINGW-SAME: ptr noundef nonnull align 4 dereferenceable(4) [[THIS:%.*]]) unnamed_addr #[[ATTR0]] align 2 {
+// MINGW-NEXT:  [[ENTRY:.*:]]
+// MINGW-NEXT:    [[THIS_ADDR:%.*]] = alloca ptr, align 8
+// MINGW-NEXT:    store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// MINGW-NEXT:    [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
+// MINGW-NEXT:    call void @_ZN3FooD2Ev(ptr noundef nonnull align 4 dereferenceable(4) [[THIS1]]) #[[ATTR2:[0-9]+]]
+// MINGW-NEXT:    ret void
+//
+//
+// MINGW-LABEL: define dso_local noundef i32 @main(
+// MINGW-SAME: ) #[[ATTR1:[0-9]+]] {
+// MINGW-NEXT:  [[ENTRY:.*:]]
+// MINGW-NEXT:    [[RETVAL:%.*]] = alloca i32, align 4
+// MINGW-NEXT:    [[F:%.*]] = alloca [[CLASS_FOO:%.*]], align 4
+// MINGW-NEXT:    store i32 0, ptr [[RETVAL]], align 4
+// MINGW-NEXT:    call void @_ZN3FooC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[F]], i32 noundef 1)
+// MINGW-NEXT:    [[CALL:%.*]] = call noundef i32 @_ZNK3Foo23function_defined_inlineEi(ptr noundef nonnull align 4 dereferenceable(4) [[F]], i32 noundef 2)
+// MINGW-NEXT:    [[CALL1:%.*]] = call noundef i32 @_ZNK3Foo28function_defined_out_of_lineEi(ptr noundef nonnull align 4 dereferenceable(4) [[F]], i32 noundef 3)
+// MINGW-NEXT:    [[CALL2:%.*]] = call noundef i32 @_ZL18static_noinline_fni(i32 noundef 0)
+// MINGW-NEXT:    store i32 [[CALL2]], ptr [[RETVAL]], align 4
+// MINGW-NEXT:    call void @_ZN3FooD1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[F]]) #[[ATTR2]]
+// MINGW-NEXT:    [[TMP0:%.*]] = load i32, ptr [[RETVAL]], align 4
+// MINGW-NEXT:    ret i32 [[TMP0]]
+//
+//
+// MINGW-LABEL: define linkonce_odr dso_local noundef i32 @_ZNK3Foo23function_defined_inlineEi(
+// MINGW-SAME: ptr noundef nonnull align 4 dereferenceable(4) [[THIS:%.*]], i32 noundef [[ARG:%.*]]) #[[ATTR0]] comdat align 2 {
+// MINGW-NEXT:  [[ENTRY:.*:]]
+// MINGW-NEXT:    [[THIS_ADDR:%.*]] = alloca ptr, align 8
+// MINGW-NEXT:    [[ARG_ADDR:%.*]] = alloca i32, align 4
+// MINGW-NEXT:    store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// MINGW-NEXT:    store i32 [[ARG]], ptr [[ARG_ADDR]], align 4
+// MINGW-NEXT:    [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
+// MINGW-NEXT:    [[TMP0:%.*]] = load i32, ptr [[ARG_ADDR]], align 4
+// MINGW-NEXT:    [[X:%.*]] = getelementptr inbounds nuw [[CLASS_FOO:%.*]], ptr [[THIS1]], i32 0, i32 0
+// MINGW-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 4
+// MINGW-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP0]], [[TMP1]]
+// MINGW-NEXT:    ret i32 [[ADD]]
+//
+//
+// MINGW-LABEL: define linkonce_odr dso_local noundef i32 @_ZNK3Foo28function_defined_out_of_lineEi(
+// MINGW-SAME: ptr noundef nonnull align 4 dereferenceable(4) [[THIS:%.*]], i32 noundef [[ARG:%.*]]) #[[ATTR0]] comdat align 2 {
+// MINGW-NEXT:  [[ENTRY:.*:]]
+// MINGW-NEXT:    [[THIS_ADDR:%.*]] = alloca ptr, align 8
+// MINGW-NEXT:    [[ARG_ADDR:%.*]] = alloca i32, align 4
+// MINGW-NEXT:    store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// MINGW-NEXT:    store i32 [[ARG]], ptr [[ARG_ADDR]], align 4
+// MINGW-NEXT:    [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
+// MINGW-NEXT:    [[X:%.*]] = getelementptr inbounds nuw [[CLASS_FOO:%.*]], ptr [[THIS1]], i32 0, i32 0
+// MINGW-NEXT:    [[TMP0:%.*]] = load i32, ptr [[X]], align 4
+// MINGW-NEXT:    [[TMP1:%.*]] = load i32, ptr [[ARG_ADDR]], align 4
+// MINGW-NEXT:    [[SUB:%.*]] = sub nsw i32 [[TMP0]], [[TMP1]]
+// MINGW-NEXT:    ret i32 [[SUB]]
+//
+//
+// MINGW-LABEL: define internal noundef i32 @_ZL18static_noinline_fni(
+// MINGW-SAME: i32 noundef [[ARG:%.*]]) #[[ATTR0]] {
+// MINGW-NEXT:  [[ENTRY:.*:]]
+// MINGW-NEXT:    [[ARG_ADDR:%.*]] = alloca i32, align 4
+// MINGW-NEXT:    store i32 [[ARG]], ptr [[ARG_ADDR]], align 4
+// MINGW-NEXT:    [[TMP0:%.*]] = load i32, ptr [[ARG_ADDR]], align 4
+// MINGW-NEXT:    ret i32 [[TMP0]]
+//
diff --git a/clang/test/utils/update_cc_test_checks/Inputs/c-symbol-mangling.c b/clang/test/utils/update_cc_test_checks/Inputs/c-symbol-mangling.c
new file mode 100644
index 0000000..018f992
--- /dev/null
+++ b/clang/test/utils/update_cc_test_checks/Inputs/c-symbol-mangling.c
@@ -0,0 +1,49 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+/// Check that we generate checks for functions even though the mangledName
+/// property in the AST dump JSON does not match the LLVM IR name.
+// RUN: %clang_cc1 -triple=x86_64-unknown-linux-gnu -emit-llvm -o - %s | FileCheck %s --check-prefix=ELF
+// RUN: %clang_cc1 -triple=x86_64-apple-macho -emit-llvm -o - %s | FileCheck %s --check-prefix=MACHO
+// RUN: %clang_cc1 -triple=x86_64-windows-msvc -emit-llvm -o - %s | FileCheck %s --check-prefix=MSVC
+// RUN: %clang_cc1 -triple=x86_64-windows-gnu -emit-llvm -o - %s | FileCheck %s --check-prefix=MINGW
+// RUN: %clang_cc1 -triple=i686-unknown-win32 -emit-llvm -o - %s | FileCheck %s --check-prefix=WIN32
+// RUN: %clang_cc1 -triple thumbv7s-apple-darwin -target-abi apcs-gnu -emit-llvm -o - %s | FileCheck %s --check-prefix=THUMB-DARWIN
+
+// UTC_ARGS: --disable
+// ELF: target datalayout = "e-m:e-
+// MACHO: target datalayout = "e-m:o-
+// MSVC: target datalayout = "e-m:w-
+// MINGW: target datalayout = "e-m:w-
+// WIN32: target datalayout = "e-m:x-
+// THUMB-DARWIN: target datalayout = "e-m:o-
+// UTC_ARGS: --enable
+
+#ifdef __arm__
+/// FIXME: UTC does not find this function, but can find all others.
+typedef __attribute__((neon_vector_type(8))) __INT8_TYPE__ int8x8_t;
+int8x8_t test_vaba_s8(int8x8_t a, int8x8_t b, int8x8_t c) {
+  return a + b + c;
+}
+#endif
+
+/// Check global variable mangling
+[[gnu::used]] static int i1 = 1;
+int i2 = 2;
+
+[[clang::noinline,gnu::used]] static int static_noinline_fn(int arg) { return arg; }
+
+[[gnu::visibility("hidden")]] int hidden_visibility(int arg) { return arg; }
+
+#ifdef __ELF__
+[[gnu::visibility("protected")]] int protected_visibility(int arg) { return arg; }
+#endif
+
+[[gnu::visibility("default")]] int default_visibility(int arg) { return arg; }
+
+int no_visibility(int arg) { return arg; }
+
+
+/// FIXME: the i386 @fastcall@12 is not being checked here
+#ifdef _WIN32
+int __fastcall fastcall(int arg, long arg2, long arg3) { return arg; }
+#endif
+
diff --git a/clang/test/utils/update_cc_test_checks/Inputs/c-symbol-mangling.c.expected b/clang/test/utils/update_cc_test_checks/Inputs/c-symbol-mangling.c.expected
new file mode 100644
index 0000000..5d514f9
--- /dev/null
+++ b/clang/test/utils/update_cc_test_checks/Inputs/c-symbol-mangling.c.expected
@@ -0,0 +1,246 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+/// Check that we generate checks for functions even though the mangledName
+/// property in the AST dump JSON does not match the LLVM IR name.
+// RUN: %clang_cc1 -triple=x86_64-unknown-linux-gnu -emit-llvm -o - %s | FileCheck %s --check-prefix=ELF
+// RUN: %clang_cc1 -triple=x86_64-apple-macho -emit-llvm -o - %s | FileCheck %s --check-prefix=MACHO
+// RUN: %clang_cc1 -triple=x86_64-windows-msvc -emit-llvm -o - %s | FileCheck %s --check-prefix=MSVC
+// RUN: %clang_cc1 -triple=x86_64-windows-gnu -emit-llvm -o - %s | FileCheck %s --check-prefix=MINGW
+// RUN: %clang_cc1 -triple=i686-unknown-win32 -emit-llvm -o - %s | FileCheck %s --check-prefix=WIN32
+// RUN: %clang_cc1 -triple thumbv7s-apple-darwin -target-abi apcs-gnu -emit-llvm -o - %s | FileCheck %s --check-prefix=THUMB-DARWIN
+
+// UTC_ARGS: --disable
+// ELF: target datalayout = "e-m:e-
+// MACHO: target datalayout = "e-m:o-
+// MSVC: target datalayout = "e-m:w-
+// MINGW: target datalayout = "e-m:w-
+// WIN32: target datalayout = "e-m:x-
+// THUMB-DARWIN: target datalayout = "e-m:o-
+// UTC_ARGS: --enable
+
+#ifdef __arm__
+/// FIXME: UTC does not find this function, but can find all others.
+typedef __attribute__((neon_vector_type(8))) __INT8_TYPE__ int8x8_t;
+int8x8_t test_vaba_s8(int8x8_t a, int8x8_t b, int8x8_t c) {
+  return a + b + c;
+}
+#endif
+
+/// Check global variable mangling
+[[gnu::used]] static int i1 = 1;
+int i2 = 2;
+
+// ELF-LABEL: @static_noinline_fn(
+// ELF-NEXT:  entry:
+// ELF-NEXT:    [[ARG_ADDR:%.*]] = alloca i32, align 4
+// ELF-NEXT:    store i32 [[ARG:%.*]], ptr [[ARG_ADDR]], align 4
+// ELF-NEXT:    [[TMP0:%.*]] = load i32, ptr [[ARG_ADDR]], align 4
+// ELF-NEXT:    ret i32 [[TMP0]]
+//
+// MACHO-LABEL: @static_noinline_fn(
+// MACHO-NEXT:  entry:
+// MACHO-NEXT:    [[ARG_ADDR:%.*]] = alloca i32, align 4
+// MACHO-NEXT:    store i32 [[ARG:%.*]], ptr [[ARG_ADDR]], align 4
+// MACHO-NEXT:    [[TMP0:%.*]] = load i32, ptr [[ARG_ADDR]], align 4
+// MACHO-NEXT:    ret i32 [[TMP0]]
+//
+// MSVC-LABEL: @static_noinline_fn(
+// MSVC-NEXT:  entry:
+// MSVC-NEXT:    [[ARG_ADDR:%.*]] = alloca i32, align 4
+// MSVC-NEXT:    store i32 [[ARG:%.*]], ptr [[ARG_ADDR]], align 4
+// MSVC-NEXT:    [[TMP0:%.*]] = load i32, ptr [[ARG_ADDR]], align 4
+// MSVC-NEXT:    ret i32 [[TMP0]]
+//
+// MINGW-LABEL: @static_noinline_fn(
+// MINGW-NEXT:  entry:
+// MINGW-NEXT:    [[ARG_ADDR:%.*]] = alloca i32, align 4
+// MINGW-NEXT:    store i32 [[ARG:%.*]], ptr [[ARG_ADDR]], align 4
+// MINGW-NEXT:    [[TMP0:%.*]] = load i32, ptr [[ARG_ADDR]], align 4
+// MINGW-NEXT:    ret i32 [[TMP0]]
+//
+// WIN32-LABEL: @static_noinline_fn(
+// WIN32-NEXT:  entry:
+// WIN32-NEXT:    [[ARG_ADDR:%.*]] = alloca i32, align 4
+// WIN32-NEXT:    store i32 [[ARG:%.*]], ptr [[ARG_ADDR]], align 4
+// WIN32-NEXT:    [[TMP0:%.*]] = load i32, ptr [[ARG_ADDR]], align 4
+// WIN32-NEXT:    ret i32 [[TMP0]]
+//
+// THUMB-DARWIN-LABEL: @static_noinline_fn(
+// THUMB-DARWIN-NEXT:  entry:
+// THUMB-DARWIN-NEXT:    [[ARG_ADDR:%.*]] = alloca i32, align 4
+// THUMB-DARWIN-NEXT:    store i32 [[ARG:%.*]], ptr [[ARG_ADDR]], align 4
+// THUMB-DARWIN-NEXT:    [[TMP0:%.*]] = load i32, ptr [[ARG_ADDR]], align 4
+// THUMB-DARWIN-NEXT:    ret i32 [[TMP0]]
+//
+[[clang::noinline,gnu::used]] static int static_noinline_fn(int arg) { return arg; }
+
+// ELF-LABEL: @hidden_visibility(
+// ELF-NEXT:  entry:
+// ELF-NEXT:    [[ARG_ADDR:%.*]] = alloca i32, align 4
+// ELF-NEXT:    store i32 [[ARG:%.*]], ptr [[ARG_ADDR]], align 4
+// ELF-NEXT:    [[TMP0:%.*]] = load i32, ptr [[ARG_ADDR]], align 4
+// ELF-NEXT:    ret i32 [[TMP0]]
+//
+// MACHO-LABEL: @hidden_visibility(
+// MACHO-NEXT:  entry:
+// MACHO-NEXT:    [[ARG_ADDR:%.*]] = alloca i32, align 4
+// MACHO-NEXT:    store i32 [[ARG:%.*]], ptr [[ARG_ADDR]], align 4
+// MACHO-NEXT:    [[TMP0:%.*]] = load i32, ptr [[ARG_ADDR]], align 4
+// MACHO-NEXT:    ret i32 [[TMP0]]
+//
+// MSVC-LABEL: @hidden_visibility(
+// MSVC-NEXT:  entry:
+// MSVC-NEXT:    [[ARG_ADDR:%.*]] = alloca i32, align 4
+// MSVC-NEXT:    store i32 [[ARG:%.*]], ptr [[ARG_ADDR]], align 4
+// MSVC-NEXT:    [[TMP0:%.*]] = load i32, ptr [[ARG_ADDR]], align 4
+// MSVC-NEXT:    ret i32 [[TMP0]]
+//
+// MINGW-LABEL: @hidden_visibility(
+// MINGW-NEXT:  entry:
+// MINGW-NEXT:    [[ARG_ADDR:%.*]] = alloca i32, align 4
+// MINGW-NEXT:    store i32 [[ARG:%.*]], ptr [[ARG_ADDR]], align 4
+// MINGW-NEXT:    [[TMP0:%.*]] = load i32, ptr [[ARG_ADDR]], align 4
+// MINGW-NEXT:    ret i32 [[TMP0]]
+//
+// WIN32-LABEL: @hidden_visibility(
+// WIN32-NEXT:  entry:
+// WIN32-NEXT:    [[ARG_ADDR:%.*]] = alloca i32, align 4
+// WIN32-NEXT:    store i32 [[ARG:%.*]], ptr [[ARG_ADDR]], align 4
+// WIN32-NEXT:    [[TMP0:%.*]] = load i32, ptr [[ARG_ADDR]], align 4
+// WIN32-NEXT:    ret i32 [[TMP0]]
+//
+// THUMB-DARWIN-LABEL: @hidden_visibility(
+// THUMB-DARWIN-NEXT:  entry:
+// THUMB-DARWIN-NEXT:    [[ARG_ADDR:%.*]] = alloca i32, align 4
+// THUMB-DARWIN-NEXT:    store i32 [[ARG:%.*]], ptr [[ARG_ADDR]], align 4
+// THUMB-DARWIN-NEXT:    [[TMP0:%.*]] = load i32, ptr [[ARG_ADDR]], align 4
+// THUMB-DARWIN-NEXT:    ret i32 [[TMP0]]
+//
+[[gnu::visibility("hidden")]] int hidden_visibility(int arg) { return arg; }
+
+#ifdef __ELF__
+// ELF-LABEL: @protected_visibility(
+// ELF-NEXT:  entry:
+// ELF-NEXT:    [[ARG_ADDR:%.*]] = alloca i32, align 4
+// ELF-NEXT:    store i32 [[ARG:%.*]], ptr [[ARG_ADDR]], align 4
+// ELF-NEXT:    [[TMP0:%.*]] = load i32, ptr [[ARG_ADDR]], align 4
+// ELF-NEXT:    ret i32 [[TMP0]]
+//
+[[gnu::visibility("protected")]] int protected_visibility(int arg) { return arg; }
+#endif
+
+// ELF-LABEL: @default_visibility(
+// ELF-NEXT:  entry:
+// ELF-NEXT:    [[ARG_ADDR:%.*]] = alloca i32, align 4
+// ELF-NEXT:    store i32 [[ARG:%.*]], ptr [[ARG_ADDR]], align 4
+// ELF-NEXT:    [[TMP0:%.*]] = load i32, ptr [[ARG_ADDR]], align 4
+// ELF-NEXT:    ret i32 [[TMP0]]
+//
+// MACHO-LABEL: @default_visibility(
+// MACHO-NEXT:  entry:
+// MACHO-NEXT:    [[ARG_ADDR:%.*]] = alloca i32, align 4
+// MACHO-NEXT:    store i32 [[ARG:%.*]], ptr [[ARG_ADDR]], align 4
+// MACHO-NEXT:    [[TMP0:%.*]] = load i32, ptr [[ARG_ADDR]], align 4
+// MACHO-NEXT:    ret i32 [[TMP0]]
+//
+// MSVC-LABEL: @default_visibility(
+// MSVC-NEXT:  entry:
+// MSVC-NEXT:    [[ARG_ADDR:%.*]] = alloca i32, align 4
+// MSVC-NEXT:    store i32 [[ARG:%.*]], ptr [[ARG_ADDR]], align 4
+// MSVC-NEXT:    [[TMP0:%.*]] = load i32, ptr [[ARG_ADDR]], align 4
+// MSVC-NEXT:    ret i32 [[TMP0]]
+//
+// MINGW-LABEL: @default_visibility(
+// MINGW-NEXT:  entry:
+// MINGW-NEXT:    [[ARG_ADDR:%.*]] = alloca i32, align 4
+// MINGW-NEXT:    store i32 [[ARG:%.*]], ptr [[ARG_ADDR]], align 4
+// MINGW-NEXT:    [[TMP0:%.*]] = load i32, ptr [[ARG_ADDR]], align 4
+// MINGW-NEXT:    ret i32 [[TMP0]]
+//
+// WIN32-LABEL: @default_visibility(
+// WIN32-NEXT:  entry:
+// WIN32-NEXT:    [[ARG_ADDR:%.*]] = alloca i32, align 4
+// WIN32-NEXT:    store i32 [[ARG:%.*]], ptr [[ARG_ADDR]], align 4
+// WIN32-NEXT:    [[TMP0:%.*]] = load i32, ptr [[ARG_ADDR]], align 4
+// WIN32-NEXT:    ret i32 [[TMP0]]
+//
+// THUMB-DARWIN-LABEL: @default_visibility(
+// THUMB-DARWIN-NEXT:  entry:
+// THUMB-DARWIN-NEXT:    [[ARG_ADDR:%.*]] = alloca i32, align 4
+// THUMB-DARWIN-NEXT:    store i32 [[ARG:%.*]], ptr [[ARG_ADDR]], align 4
+// THUMB-DARWIN-NEXT:    [[TMP0:%.*]] = load i32, ptr [[ARG_ADDR]], align 4
+// THUMB-DARWIN-NEXT:    ret i32 [[TMP0]]
+//
+[[gnu::visibility("default")]] int default_visibility(int arg) { return arg; }
+
+// ELF-LABEL: @no_visibility(
+// ELF-NEXT:  entry:
+// ELF-NEXT:    [[ARG_ADDR:%.*]] = alloca i32, align 4
+// ELF-NEXT:    store i32 [[ARG:%.*]], ptr [[ARG_ADDR]], align 4
+// ELF-NEXT:    [[TMP0:%.*]] = load i32, ptr [[ARG_ADDR]], align 4
+// ELF-NEXT:    ret i32 [[TMP0]]
+//
+// MACHO-LABEL: @no_visibility(
+// MACHO-NEXT:  entry:
+// MACHO-NEXT:    [[ARG_ADDR:%.*]] = alloca i32, align 4
+// MACHO-NEXT:    store i32 [[ARG:%.*]], ptr [[ARG_ADDR]], align 4
+// MACHO-NEXT:    [[TMP0:%.*]] = load i32, ptr [[ARG_ADDR]], align 4
+// MACHO-NEXT:    ret i32 [[TMP0]]
+//
+// MSVC-LABEL: @no_visibility(
+// MSVC-NEXT:  entry:
+// MSVC-NEXT:    [[ARG_ADDR:%.*]] = alloca i32, align 4
+// MSVC-NEXT:    store i32 [[ARG:%.*]], ptr [[ARG_ADDR]], align 4
+// MSVC-NEXT:    [[TMP0:%.*]] = load i32, ptr [[ARG_ADDR]], align 4
+// MSVC-NEXT:    ret i32 [[TMP0]]
+//
+// MINGW-LABEL: @no_visibility(
+// MINGW-NEXT:  entry:
+// MINGW-NEXT:    [[ARG_ADDR:%.*]] = alloca i32, align 4
+// MINGW-NEXT:    store i32 [[ARG:%.*]], ptr [[ARG_ADDR]], align 4
+// MINGW-NEXT:    [[TMP0:%.*]] = load i32, ptr [[ARG_ADDR]], align 4
+// MINGW-NEXT:    ret i32 [[TMP0]]
+//
+// WIN32-LABEL: @no_visibility(
+// WIN32-NEXT:  entry:
+// WIN32-NEXT:    [[ARG_ADDR:%.*]] = alloca i32, align 4
+// WIN32-NEXT:    store i32 [[ARG:%.*]], ptr [[ARG_ADDR]], align 4
+// WIN32-NEXT:    [[TMP0:%.*]] = load i32, ptr [[ARG_ADDR]], align 4
+// WIN32-NEXT:    ret i32 [[TMP0]]
+//
+// THUMB-DARWIN-LABEL: @no_visibility(
+// THUMB-DARWIN-NEXT:  entry:
+// THUMB-DARWIN-NEXT:    [[ARG_ADDR:%.*]] = alloca i32, align 4
+// THUMB-DARWIN-NEXT:    store i32 [[ARG:%.*]], ptr [[ARG_ADDR]], align 4
+// THUMB-DARWIN-NEXT:    [[TMP0:%.*]] = load i32, ptr [[ARG_ADDR]], align 4
+// THUMB-DARWIN-NEXT:    ret i32 [[TMP0]]
+//
+int no_visibility(int arg) { return arg; }
+
+
+/// FIXME: the i386 @fastcall@12 is not being checked here
+#ifdef _WIN32
+// MSVC-LABEL: @fastcall(
+// MSVC-NEXT:  entry:
+// MSVC-NEXT:    [[ARG3_ADDR:%.*]] = alloca i32, align 4
+// MSVC-NEXT:    [[ARG2_ADDR:%.*]] = alloca i32, align 4
+// MSVC-NEXT:    [[ARG_ADDR:%.*]] = alloca i32, align 4
+// MSVC-NEXT:    store i32 [[ARG3:%.*]], ptr [[ARG3_ADDR]], align 4
+// MSVC-NEXT:    store i32 [[ARG2:%.*]], ptr [[ARG2_ADDR]], align 4
+// MSVC-NEXT:    store i32 [[ARG:%.*]], ptr [[ARG_ADDR]], align 4
+// MSVC-NEXT:    [[TMP0:%.*]] = load i32, ptr [[ARG_ADDR]], align 4
+// MSVC-NEXT:    ret i32 [[TMP0]]
+//
+// MINGW-LABEL: @fastcall(
+// MINGW-NEXT:  entry:
+// MINGW-NEXT:    [[ARG_ADDR:%.*]] = alloca i32, align 4
+// MINGW-NEXT:    [[ARG2_ADDR:%.*]] = alloca i32, align 4
+// MINGW-NEXT:    [[ARG3_ADDR:%.*]] = alloca i32, align 4
+// MINGW-NEXT:    store i32 [[ARG:%.*]], ptr [[ARG_ADDR]], align 4
+// MINGW-NEXT:    store i32 [[ARG2:%.*]], ptr [[ARG2_ADDR]], align 4
+// MINGW-NEXT:    store i32 [[ARG3:%.*]], ptr [[ARG3_ADDR]], align 4
+// MINGW-NEXT:    [[TMP0:%.*]] = load i32, ptr [[ARG_ADDR]], align 4
+// MINGW-NEXT:    ret i32 [[TMP0]]
+//
+int __fastcall fastcall(int arg, long arg2, long arg3) { return arg; }
+#endif
+
diff --git a/clang/test/utils/update_cc_test_checks/c-symbol-mangling.test b/clang/test/utils/update_cc_test_checks/c-symbol-mangling.test
new file mode 100644
index 0000000..35cff933
--- /dev/null
+++ b/clang/test/utils/update_cc_test_checks/c-symbol-mangling.test
@@ -0,0 +1,8 @@
+## Test that we handle mangled C symbol names correctly in the update script
+
+# RUN: cp %S/Inputs/c-symbol-mangling.c %t-generated.c && %update_cc_test_checks %t-generated.c
+# RUN: diff -u %S/Inputs/c-symbol-mangling.c.expected %t-generated.c
+
+## Check that re-running update_cc_test_checks doesn't change the output
+# RUN: %update_cc_test_checks %t-generated.c
+# RUN: diff -u %S/Inputs/c-symbol-mangling.c.expected %t-generated.c
diff --git a/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp b/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp
index 4201f04..9fba63b 100644
--- a/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp
+++ b/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp
@@ -504,14 +504,14 @@ Expected<StringRef> clang(ArrayRef<StringRef> InputFiles, const ArgList &Args) {
         {"-Xlinker",
          Args.MakeArgString("--plugin-opt=" + StringRef(Arg->getValue()))});
 
-  if (!Triple.isNVPTX())
+  if (!Triple.isNVPTX() && !Triple.isSPIRV())
     CmdArgs.push_back("-Wl,--no-undefined");
 
   for (StringRef InputFile : InputFiles)
     CmdArgs.push_back(InputFile);
 
   // If this is CPU offloading we copy the input libraries.
-  if (!Triple.isAMDGPU() && !Triple.isNVPTX()) {
+  if (!Triple.isAMDGPU() && !Triple.isNVPTX() && !Triple.isSPIRV()) {
     CmdArgs.push_back("-Wl,-Bsymbolic");
     CmdArgs.push_back("-shared");
     ArgStringList LinkerArgs;
@@ -595,6 +595,7 @@ Expected<StringRef> linkDevice(ArrayRef<StringRef> InputFiles,
   case Triple::aarch64_be:
   case Triple::ppc64:
   case Triple::ppc64le:
+  case Triple::spirv64:
   case Triple::systemz:
   case Triple::loongarch64:
     return generic::clang(InputFiles, Args);
diff --git a/clang/tools/clang-scan-deps/ClangScanDeps.cpp b/clang/tools/clang-scan-deps/ClangScanDeps.cpp
index bd36181..709dc51 100644
--- a/clang/tools/clang-scan-deps/ClangScanDeps.cpp
+++ b/clang/tools/clang-scan-deps/ClangScanDeps.cpp
@@ -913,7 +913,7 @@ int clang_scan_deps_main(int argc, char **argv, const llvm::ToolContext &) {
       return llvm::nulls();
 
     std::error_code EC;
-    FileOS.emplace(OutputFileName, EC);
+    FileOS.emplace(OutputFileName, EC, llvm::sys::fs::OF_Text);
     if (EC) {
       llvm::errs() << "Failed to open output file '" << OutputFileName
                    << "': " << llvm::errorCodeToError(EC) << '\n';
@@ -1003,9 +1003,9 @@ int clang_scan_deps_main(int argc, char **argv, const llvm::ToolContext &) {
           auto OSIter = OSs.find(MakeformatOutputPath);
           if (OSIter == OSs.end()) {
             std::error_code EC;
-            OSIter =
-                OSs.try_emplace(MakeformatOutputPath, MakeformatOutputPath, EC)
-                    .first;
+            OSIter = OSs.try_emplace(MakeformatOutputPath, MakeformatOutputPath,
+                                     EC, llvm::sys::fs::OF_Text)
+                         .first;
             if (EC)
               llvm::errs() << "Failed to open P1689 make format output file \""
                            << MakeformatOutputPath << "\" for " << EC.message()
diff --git a/clang/tools/driver/driver.cpp b/clang/tools/driver/driver.cpp
index 12038de..ffd157e 100644
--- a/clang/tools/driver/driver.cpp
+++ b/clang/tools/driver/driver.cpp
@@ -355,10 +355,12 @@ int clang_main(int Argc, char **Argv, const llvm::ToolContext &ToolContext) {
   if (!SetBackdoorDriverOutputsFromEnvVars(TheDriver))
     return 1;
 
+  auto ExecuteCC1WithContext =
+      [&ToolContext](SmallVectorImpl<const char *> &ArgV) {
+        return ExecuteCC1Tool(ArgV, ToolContext);
+      };
   if (!UseNewCC1Process) {
-    TheDriver.CC1Main = [ToolContext](SmallVectorImpl<const char *> &ArgV) {
-      return ExecuteCC1Tool(ArgV, ToolContext);
-    };
+    TheDriver.CC1Main = ExecuteCC1WithContext;
     // Ensure the CC1Command actually catches cc1 crashes
     llvm::CrashRecoveryContext::Enable();
   }
diff --git a/clang/tools/include-mapping/cppreference_parser.py b/clang/tools/include-mapping/cppreference_parser.py
index 9101f3d..f7da2ba 100644
--- a/clang/tools/include-mapping/cppreference_parser.py
+++ b/clang/tools/include-mapping/cppreference_parser.py
@@ -139,7 +139,7 @@ def _ParseIndexPage(index_page_html):
 
 
 def _ReadSymbolPage(path, name, qual_name):
-    with open(path) as f:
+    with open(path, encoding="utf-8") as f:
         return _ParseSymbolPage(f.read(), name, qual_name)
 
 
@@ -156,7 +156,7 @@ def _GetSymbols(pool, root_dir, index_page_name, namespace, variants_to_accept):
     #      contains the defined header.
     #   2. Parse the symbol page to get the defined header.
     index_page_path = os.path.join(root_dir, index_page_name)
-    with open(index_page_path, "r") as f:
+    with open(index_page_path, "r", encoding="utf-8") as f:
         # Read each symbol page in parallel.
         results = []  # (symbol_name, promise of [header...])
         for symbol_name, symbol_page_path, variant in _ParseIndexPage(f.read()):
diff --git a/clang/tools/libclang/CIndex.cpp b/clang/tools/libclang/CIndex.cpp
index 7967c10..5e51fc4 100644
--- a/clang/tools/libclang/CIndex.cpp
+++ b/clang/tools/libclang/CIndex.cpp
@@ -2192,6 +2192,7 @@ public:
   void VisitOpenACCWaitConstruct(const OpenACCWaitConstruct *D);
   void VisitOpenACCInitConstruct(const OpenACCInitConstruct *D);
   void VisitOpenACCShutdownConstruct(const OpenACCShutdownConstruct *D);
+  void VisitOpenACCSetConstruct(const OpenACCSetConstruct *D);
   void VisitOMPExecutableDirective(const OMPExecutableDirective *D);
   void VisitOMPLoopBasedDirective(const OMPLoopBasedDirective *D);
   void VisitOMPLoopDirective(const OMPLoopDirective *D);
@@ -2838,8 +2839,13 @@ void OpenACCClauseEnqueue::VisitIfClause(const OpenACCIfClause &C) {
   Visitor.AddStmt(C.getConditionExpr());
 }
 void OpenACCClauseEnqueue::VisitSelfClause(const OpenACCSelfClause &C) {
-  if (C.hasConditionExpr())
-    Visitor.AddStmt(C.getConditionExpr());
+  if (C.isConditionExprClause()) {
+    if (C.hasConditionExpr())
+      Visitor.AddStmt(C.getConditionExpr());
+  } else {
+    for (Expr *Var : C.getVarList())
+      Visitor.AddStmt(Var);
+  }
 }
 void OpenACCClauseEnqueue::VisitNumWorkersClause(
     const OpenACCNumWorkersClause &C) {
@@ -2849,6 +2855,10 @@ void OpenACCClauseEnqueue::VisitDeviceNumClause(
     const OpenACCDeviceNumClause &C) {
   Visitor.AddStmt(C.getIntExpr());
 }
+void OpenACCClauseEnqueue::VisitDefaultAsyncClause(
+    const OpenACCDefaultAsyncClause &C) {
+  Visitor.AddStmt(C.getIntExpr());
+}
 void OpenACCClauseEnqueue::VisitVectorLengthClause(
     const OpenACCVectorLengthClause &C) {
   Visitor.AddStmt(C.getIntExpr());
@@ -3658,6 +3668,12 @@ void EnqueueVisitor::VisitOpenACCShutdownConstruct(
     EnqueueChildren(Clause);
 }
 
+void EnqueueVisitor::VisitOpenACCSetConstruct(const OpenACCSetConstruct *C) {
+  EnqueueChildren(C);
+  for (auto *Clause : C->clauses())
+    EnqueueChildren(Clause);
+}
+
 void EnqueueVisitor::VisitAnnotateAttr(const AnnotateAttr *A) {
   EnqueueChildren(A);
 }
@@ -6426,6 +6442,10 @@ CXString clang_getCursorKindSpelling(enum CXCursorKind Kind) {
     return cxstring::createRef("OpenACCInitConstruct");
   case CXCursor_OpenACCShutdownConstruct:
     return cxstring::createRef("OpenACCShutdownConstruct");
+  case CXCursor_OpenACCSetConstruct:
+    return cxstring::createRef("OpenACCSetConstruct");
+  case CXCursor_OpenACCUpdateConstruct:
+    return cxstring::createRef("OpenACCUpdateConstruct");
   }
 
   llvm_unreachable("Unhandled CXCursorKind");
diff --git a/clang/tools/libclang/CXCursor.cpp b/clang/tools/libclang/CXCursor.cpp
index f56e77b..ee276d8 100644
--- a/clang/tools/libclang/CXCursor.cpp
+++ b/clang/tools/libclang/CXCursor.cpp
@@ -909,6 +909,12 @@ CXCursor cxcursor::MakeCXCursor(const Stmt *S, const Decl *Parent,
   case Stmt::OpenACCShutdownConstructClass:
     K = CXCursor_OpenACCShutdownConstruct;
     break;
+  case Stmt::OpenACCSetConstructClass:
+    K = CXCursor_OpenACCSetConstruct;
+    break;
+  case Stmt::OpenACCUpdateConstructClass:
+    K = CXCursor_OpenACCUpdateConstruct;
+    break;
   case Stmt::OMPTargetParallelGenericLoopDirectiveClass:
     K = CXCursor_OMPTargetParallelGenericLoopDirective;
     break;
diff --git a/clang/unittests/AST/ASTImporterTest.cpp b/clang/unittests/AST/ASTImporterTest.cpp
index f3f314b..d197d30d 100644
--- a/clang/unittests/AST/ASTImporterTest.cpp
+++ b/clang/unittests/AST/ASTImporterTest.cpp
@@ -763,10 +763,6 @@ TEST_P(ImportType, ImportPackExpansion) {
                                    implicitCastExpr(has(declRefExpr()))))))));
 }
 
-const internal::VariadicDynCastAllOfMatcher<Type,
-                                            DependentTemplateSpecializationType>
-    dependentTemplateSpecializationType;
-
 TEST_P(ImportType, ImportDependentTemplateSpecialization) {
   MatchVerifier<Decl> Verifier;
   testImport("template<typename T>"
@@ -3172,9 +3168,6 @@ TEST_P(ImportDecl, ImportFieldOrder) {
              recordDecl(hasFieldOrder({"b", "a"})));
 }
 
-const internal::VariadicDynCastAllOfMatcher<Expr, DependentScopeDeclRefExpr>
-    dependentScopeDeclRefExpr;
-
 TEST_P(ImportExpr, DependentScopeDeclRefExpr) {
   MatchVerifier<Decl> Verifier;
   testImport("template <typename T> struct S { static T foo; };"
@@ -3199,9 +3192,6 @@ TEST_P(ImportExpr, DependentScopeDeclRefExpr) {
                  has(callExpr(has(dependentScopeDeclRefExpr())))))))));
 }
 
-const internal::VariadicDynCastAllOfMatcher<Type, DependentNameType>
-    dependentNameType;
-
 TEST_P(ImportExpr, DependentNameType) {
   MatchVerifier<Decl> Verifier;
   testImport("template <typename T> struct declToImport {"
diff --git a/clang/unittests/ASTMatchers/ASTMatchersNarrowingTest.cpp b/clang/unittests/ASTMatchers/ASTMatchersNarrowingTest.cpp
index 056b7c7..92ec79d 100644
--- a/clang/unittests/ASTMatchers/ASTMatchersNarrowingTest.cpp
+++ b/clang/unittests/ASTMatchers/ASTMatchersNarrowingTest.cpp
@@ -2235,6 +2235,39 @@ TEST_P(ASTMatchersTest, ArgumentCountIs_CXXConstructExpr) {
                  Constructor1Arg));
 }
 
+TEST_P(ASTMatchersTest, HasDependentName_DependentScopeDeclRefExpr) {
+  if (!GetParam().isCXX() || GetParam().hasDelayedTemplateParsing()) {
+    // FIXME: Fix this test to work with delayed template parsing.
+    return;
+  }
+
+  EXPECT_TRUE(matches("template <class T> class X : T { void f() { T::v; } };",
+                      dependentScopeDeclRefExpr(hasDependentName("v"))));
+
+  EXPECT_TRUE(matches("template <typename T> struct S { static T Foo; };"
+                      "template <typename T> void x() { (void)S<T>::Foo; }",
+                      dependentScopeDeclRefExpr(hasDependentName("Foo"))));
+
+  EXPECT_TRUE(matches("template <typename T> struct S { static T foo(); };"
+                      "template <typename T> void x() { S<T>::foo(); }",
+                      dependentScopeDeclRefExpr(hasDependentName("foo"))));
+}
+
+TEST_P(ASTMatchersTest, HasDependentName_DependentNameType) {
+  if (!GetParam().isCXX()) {
+    // FIXME: Fix this test to work with delayed template parsing.
+    return;
+  }
+
+  EXPECT_TRUE(matches(
+      R"(
+        template <typename T> struct declToImport {
+          typedef typename T::type dependent_name;
+        };
+      )",
+      dependentNameType(hasDependentName("type"))));
+}
+
 TEST(ASTMatchersTest, NamesMember_CXXDependentScopeMemberExpr) {
 
   // Member functions:
diff --git a/clang/unittests/ASTMatchers/ASTMatchersNodeTest.cpp b/clang/unittests/ASTMatchers/ASTMatchersNodeTest.cpp
index 9bc287e..680e218 100644
--- a/clang/unittests/ASTMatchers/ASTMatchersNodeTest.cpp
+++ b/clang/unittests/ASTMatchers/ASTMatchersNodeTest.cpp
@@ -556,6 +556,21 @@ TEST_P(ASTMatchersTest, DeclRefExpr) {
                          Reference));
 }
 
+TEST_P(ASTMatchersTest, DependentScopeDeclRefExpr) {
+  if (!GetParam().isCXX() || GetParam().hasDelayedTemplateParsing()) {
+    // FIXME: Fix this test to work with delayed template parsing.
+    return;
+  }
+
+  EXPECT_TRUE(matches("template <class T> class X : T { void f() { T::v; } };",
+                      dependentScopeDeclRefExpr()));
+
+  EXPECT_TRUE(
+      matches("template <typename T> struct S { static T Foo; };"
+              "template <typename T> void declToImport() { (void)S<T>::Foo; }",
+              dependentScopeDeclRefExpr()));
+}
+
 TEST_P(ASTMatchersTest, CXXMemberCallExpr) {
   if (!GetParam().isCXX()) {
     return;
@@ -629,10 +644,8 @@ TEST_P(ASTMatchersTest, MemberExpr_MatchesVariable) {
   EXPECT_TRUE(matches("template <class T>"
                       "class X : T { void f() { this->T::v; } };",
                       cxxDependentScopeMemberExpr()));
-  // FIXME: Add a matcher for DependentScopeDeclRefExpr.
-  EXPECT_TRUE(
-      notMatches("template <class T> class X : T { void f() { T::v; } };",
-                 cxxDependentScopeMemberExpr()));
+  EXPECT_TRUE(matches("template <class T> class X : T { void f() { T::v; } };",
+                      dependentScopeDeclRefExpr()));
   EXPECT_TRUE(matches("template <class T> void x() { T t; t.v; }",
                       cxxDependentScopeMemberExpr()));
 }
@@ -1899,6 +1912,35 @@ TEST_P(ASTMatchersTest, DeducedTemplateSpecializationType) {
               deducedTemplateSpecializationType()));
 }
 
+TEST_P(ASTMatchersTest, DependentNameType) {
+  if (!GetParam().isCXX()) {
+    return;
+  }
+
+  EXPECT_TRUE(matches(
+      R"(
+        template <typename T> struct declToImport {
+          typedef typename T::type dependent_name;
+        };
+      )",
+      dependentNameType()));
+}
+
+TEST_P(ASTMatchersTest, DependentTemplateSpecializationType) {
+  if (!GetParam().isCXX()) {
+    return;
+  }
+
+  EXPECT_TRUE(matches(
+      R"(
+        template<typename T> struct A;
+        template<typename T> struct declToImport {
+          typename A<T>::template B<T> a;
+        };
+      )",
+      dependentTemplateSpecializationType()));
+}
+
 TEST_P(ASTMatchersTest, RecordType) {
   EXPECT_TRUE(matches("struct S {}; struct S s;",
                       recordType(hasDeclaration(recordDecl(hasName("S"))))));
diff --git a/clang/unittests/Analysis/FlowSensitive/CachedConstAccessorsLatticeTest.cpp b/clang/unittests/Analysis/FlowSensitive/CachedConstAccessorsLatticeTest.cpp
index 6488833..d27f6a6 100644
--- a/clang/unittests/Analysis/FlowSensitive/CachedConstAccessorsLatticeTest.cpp
+++ b/clang/unittests/Analysis/FlowSensitive/CachedConstAccessorsLatticeTest.cpp
@@ -149,6 +149,35 @@ TEST_F(CachedConstAccessorsLatticeTest, SameLocBeforeClearOrDiffAfterClear) {
 }
 
 TEST_F(CachedConstAccessorsLatticeTest,
+       SameLocBeforeClearOrDiffAfterClearWithCallee) {
+  CommonTestInputs Inputs;
+  auto *CE = Inputs.CallRef;
+  RecordStorageLocation Loc(Inputs.SType, RecordStorageLocation::FieldToLoc(),
+                            {});
+
+  LatticeT Lattice;
+  auto NopInit = [](StorageLocation &) {};
+  const FunctionDecl *Callee = CE->getDirectCallee();
+  ASSERT_NE(Callee, nullptr);
+  StorageLocation &Loc1 = Lattice.getOrCreateConstMethodReturnStorageLocation(
+      Loc, Callee, Env, NopInit);
+  auto NotCalled = [](StorageLocation &) {
+    ASSERT_TRUE(false) << "Not reached";
+  };
+  StorageLocation &Loc2 = Lattice.getOrCreateConstMethodReturnStorageLocation(
+      Loc, Callee, Env, NotCalled);
+
+  EXPECT_EQ(&Loc1, &Loc2);
+
+  Lattice.clearConstMethodReturnStorageLocations(Loc);
+  StorageLocation &Loc3 = Lattice.getOrCreateConstMethodReturnStorageLocation(
+      Loc, Callee, Env, NopInit);
+
+  EXPECT_NE(&Loc3, &Loc1);
+  EXPECT_NE(&Loc3, &Loc2);
+}
+
+TEST_F(CachedConstAccessorsLatticeTest,
        SameStructValBeforeClearOrDiffAfterClear) {
   TestAST AST(R"cpp(
     struct S {
diff --git a/clang/unittests/Analysis/FlowSensitive/UncheckedOptionalAccessModelTest.cpp b/clang/unittests/Analysis/FlowSensitive/UncheckedOptionalAccessModelTest.cpp
index de16f6b..19c3ff4 100644
--- a/clang/unittests/Analysis/FlowSensitive/UncheckedOptionalAccessModelTest.cpp
+++ b/clang/unittests/Analysis/FlowSensitive/UncheckedOptionalAccessModelTest.cpp
@@ -3771,6 +3771,54 @@ TEST_P(UncheckedOptionalAccessTest, ConstPointerAccessorWithModInBetween) {
                        /*IgnoreSmartPointerDereference=*/false);
 }
 
+TEST_P(UncheckedOptionalAccessTest, SmartPointerAccessorMixed) {
+  ExpectDiagnosticsFor(R"cc(
+     #include "unchecked_optional_access_test.h"
+
+    struct A {
+      $ns::$optional<int> x;
+    };
+
+    namespace absl {
+    template<typename T>
+    class StatusOr {
+      public:
+      bool ok() const;
+
+      const T& operator*() const&;
+      T& operator*() &;
+
+      const T* operator->() const;
+      T* operator->();
+
+      const T& value() const;
+      T& value();
+    };
+    }
+
+    void target(absl::StatusOr<A> &mut, const absl::StatusOr<A> &imm) {
+      if (!mut.ok() || !imm.ok())
+        return;
+
+      if (mut->x.has_value()) {
+        mut->x.value();
+        ((*mut).x).value();
+        (mut.value().x).value();
+
+        // check flagged after modifying
+        mut = imm;
+        mut->x.value();  // [[unsafe]]
+      }
+      if (imm->x.has_value()) {
+        imm->x.value();
+        ((*imm).x).value();
+        (imm.value().x).value();
+      }
+    }
+  )cc",
+                       /*IgnoreSmartPointerDereference=*/false);
+}
+
 TEST_P(UncheckedOptionalAccessTest, ConstBoolAccessor) {
   ExpectDiagnosticsFor(R"cc(
     #include "unchecked_optional_access_test.h"
diff --git a/clang/unittests/Format/ConfigParseTest.cpp b/clang/unittests/Format/ConfigParseTest.cpp
index 7fc7492..1f0beaf 100644
--- a/clang/unittests/Format/ConfigParseTest.cpp
+++ b/clang/unittests/Format/ConfigParseTest.cpp
@@ -144,6 +144,9 @@ TEST(ConfigParseTest, GetsCorrectBasedOnStyle) {
   EXPECT_EQ(0, parseConfiguration(TEXT, &Style).value());                      \
   EXPECT_EQ(VALUE, Style.FIELD) << "Unexpected value after parsing!"
 
+#define CHECK_PARSE_LIST(FIELD)                                                \
+  CHECK_PARSE(#FIELD ": [foo]", FIELD, std::vector<std::string>{"foo"})
+
 #define CHECK_PARSE_NESTED_VALUE(TEXT, STRUCT, FIELD, VALUE)                   \
   EXPECT_NE(VALUE, Style.STRUCT.FIELD) << "Initial value already the same!";   \
   EXPECT_EQ(0, parseConfiguration(#STRUCT ":\n  " TEXT, &Style).value());      \
@@ -159,6 +162,7 @@ TEST(ConfigParseTest, ParsesConfigurationBools) {
   CHECK_PARSE_BOOL(AllowShortCompoundRequirementOnASingleLine);
   CHECK_PARSE_BOOL(AllowShortEnumsOnASingleLine);
   CHECK_PARSE_BOOL(AllowShortLoopsOnASingleLine);
+  CHECK_PARSE_BOOL(AllowShortNamespacesOnASingleLine);
   CHECK_PARSE_BOOL(BinPackArguments);
   CHECK_PARSE_BOOL(BreakAdjacentStringLiterals);
   CHECK_PARSE_BOOL(BreakAfterJavaFieldAnnotations);
@@ -864,6 +868,13 @@ TEST(ConfigParseTest, ParsesConfiguration) {
   CHECK_PARSE("SortUsingDeclarations: true", SortUsingDeclarations,
               FormatStyle::SUD_LexicographicNumeric);
 
+  CHECK_PARSE("WrapNamespaceBodyWithEmptyLines: Never",
+              WrapNamespaceBodyWithEmptyLines, FormatStyle::WNBWELS_Never);
+  CHECK_PARSE("WrapNamespaceBodyWithEmptyLines: Always",
+              WrapNamespaceBodyWithEmptyLines, FormatStyle::WNBWELS_Always);
+  CHECK_PARSE("WrapNamespaceBodyWithEmptyLines: Leave",
+              WrapNamespaceBodyWithEmptyLines, FormatStyle::WNBWELS_Leave);
+
   // FIXME: This is required because parsing a configuration simply overwrites
   // the first N elements of the list instead of resetting it.
   Style.ForEachMacros.clear();
@@ -898,11 +909,15 @@ TEST(ConfigParseTest, ParsesConfiguration) {
   CHECK_PARSE("StatementMacros: [QUNUSED, QT_REQUIRE_VERSION]", StatementMacros,
               std::vector<std::string>({"QUNUSED", "QT_REQUIRE_VERSION"}));
 
-  Style.NamespaceMacros.clear();
-  CHECK_PARSE("NamespaceMacros: [TESTSUITE]", NamespaceMacros,
-              std::vector<std::string>{"TESTSUITE"});
-  CHECK_PARSE("NamespaceMacros: [TESTSUITE, SUITE]", NamespaceMacros,
-              std::vector<std::string>({"TESTSUITE", "SUITE"}));
+  CHECK_PARSE_LIST(JavaImportGroups);
+  CHECK_PARSE_LIST(Macros);
+  CHECK_PARSE_LIST(NamespaceMacros);
+  CHECK_PARSE_LIST(ObjCPropertyAttributeOrder);
+  CHECK_PARSE_LIST(TableGenBreakingDAGArgOperators);
+  CHECK_PARSE_LIST(TemplateNames);
+  CHECK_PARSE_LIST(TypeNames);
+  CHECK_PARSE_LIST(TypenameMacros);
+  CHECK_PARSE_LIST(VariableTemplates);
 
   Style.WhitespaceSensitiveMacros.clear();
   CHECK_PARSE("WhitespaceSensitiveMacros: [STRINGIZE]",
diff --git a/clang/unittests/Format/FormatTest.cpp b/clang/unittests/Format/FormatTest.cpp
index 47465a1..4d48bca 100644
--- a/clang/unittests/Format/FormatTest.cpp
+++ b/clang/unittests/Format/FormatTest.cpp
@@ -4476,6 +4476,7 @@ TEST_F(FormatTest, FormatsCompactNamespaces) {
                "int k; } // namespace out",
                Style);
 
+  Style.ColumnLimit = 41;
   verifyFormat("namespace A { namespace B { namespace C {\n"
                "}}} // namespace A::B::C",
                "namespace A { namespace B {\n"
@@ -4504,6 +4505,12 @@ TEST_F(FormatTest, FormatsCompactNamespaces) {
                "} // namespace bbbbbb\n"
                "} // namespace aaaaaa",
                Style);
+
+  verifyFormat("namespace a { namespace b {\n"
+               "namespace c {\n"
+               "}}} // namespace a::b::c",
+               Style);
+
   Style.ColumnLimit = 80;
 
   // Extra semicolon after 'inner' closing brace prevents merging
@@ -5880,6 +5887,11 @@ TEST_F(FormatTest, MacrosWithoutTrailingSemicolon) {
   verifyFormat("SOME_WEIRD_LOG_MACRO << SomeThing;", "SOME_WEIRD_LOG_MACRO\n"
                                                      "<< SomeThing;");
 
+  verifyFormat("GGGG(ffff(xxxxxxxxxxxxxxxxxxxx)->yyyyyyyyyyyyyyyyyyyy)(foo);",
+               "GGGG(ffff(xxxxxxxxxxxxxxxxxxxx)->yyyyyyyyyyyyyyyyyyyy)\n"
+               "(foo);",
+               getLLVMStyleWithColumns(60));
+
   verifyFormat("VISIT_GL_CALL(GenBuffers, void, (GLsizei n, GLuint* buffers), "
                "(n, buffers))",
                getChromiumStyle(FormatStyle::LK_Cpp));
@@ -28314,6 +28326,242 @@ TEST_F(FormatTest, KeepFormFeed) {
                Style);
 }
 
+TEST_F(FormatTest, ShortNamespacesOption) {
+  auto Style = getLLVMStyle();
+  Style.AllowShortNamespacesOnASingleLine = true;
+  Style.CompactNamespaces = true;
+  Style.FixNamespaceComments = false;
+
+  // Basic functionality.
+  verifyFormat("namespace foo { class bar; }", Style);
+  verifyFormat("namespace foo::bar { class baz; }", Style);
+  verifyFormat("namespace { class bar; }", Style);
+  verifyFormat("namespace foo {\n"
+               "class bar;\n"
+               "class baz;\n"
+               "}",
+               Style);
+
+  // Trailing comments prevent merging.
+  verifyFormat("namespace foo { namespace baz {\n"
+               "class qux;\n"
+               "} // comment\n"
+               "}",
+               Style);
+
+  // Make sure code doesn't walk too far on unbalanced code.
+  verifyFormat("namespace foo {", Style);
+  verifyFormat("namespace foo {\n"
+               "class baz;",
+               Style);
+  verifyFormat("namespace foo {\n"
+               "namespace bar { class baz; }",
+               Style);
+
+  // Nested namespaces.
+  verifyFormat("namespace foo { namespace bar { class baz; } }", Style);
+
+  // Without CompactNamespaces, we won't merge consecutive namespace
+  // declarations.
+  Style.CompactNamespaces = false;
+  verifyFormat("namespace foo {\n"
+               "namespace bar { class baz; }\n"
+               "}",
+               Style);
+
+  verifyFormat("namespace foo {\n"
+               "namespace bar { class baz; }\n"
+               "namespace qux { class quux; }\n"
+               "}",
+               Style);
+
+  Style.CompactNamespaces = true;
+
+  // Varying inner content.
+  verifyFormat("namespace foo {\n"
+               "int f() { return 5; }\n"
+               "}",
+               Style);
+  verifyFormat("namespace foo { template <T> struct bar; }", Style);
+  verifyFormat("namespace foo { constexpr int num = 42; }", Style);
+
+  // Validate nested namespace wrapping scenarios around the ColumnLimit.
+  Style.ColumnLimit = 64;
+
+  // Validate just under the ColumnLimit.
+  verifyFormat(
+      "namespace foo { namespace bar { namespace baz { class qux; } } }",
+      Style);
+
+  // Validate just over the ColumnLimit.
+  verifyFormat("namespace foo { namespace baar { namespace baaz {\n"
+               "class quux;\n"
+               "}}}",
+               Style);
+
+  verifyFormat(
+      "namespace foo { namespace bar { namespace baz { namespace qux {\n"
+      "class quux;\n"
+      "}}}}",
+      Style);
+
+  // Validate that the ColumnLimit logic accounts for trailing content as well.
+  verifyFormat("namespace foo { namespace bar { class qux; } } // extra",
+               Style);
+
+  verifyFormat("namespace foo { namespace bar { namespace baz {\n"
+               "class qux;\n"
+               "}}} // extra",
+               Style);
+
+  // FIXME: Ideally AllowShortNamespacesOnASingleLine would disable the trailing
+  // namespace comment from 'FixNamespaceComments', as it's not really necessary
+  // in this scenario, but the two options work at very different layers of the
+  // formatter, so I'm not sure how to make them interact.
+  //
+  // As it stands, the trailing comment will be added and likely make the line
+  // too long to fit within the ColumnLimit, reducing the how likely the line
+  // will still fit on a single line. The recommendation for now is to use the
+  // concatenated namespace syntax instead. e.g. 'namespace foo::bar'
+  Style.FixNamespaceComments = true;
+  verifyFormat(
+      "namespace foo { namespace bar { namespace baz {\n"
+      "class qux;\n"
+      "}}} // namespace foo::bar::baz",
+      "namespace foo { namespace bar { namespace baz { class qux; } } }",
+      Style);
+}
+
+TEST_F(FormatTest, WrapNamespaceBodyWithEmptyLinesNever) {
+  auto Style = getLLVMStyle();
+  Style.FixNamespaceComments = false;
+  Style.MaxEmptyLinesToKeep = 2;
+  Style.WrapNamespaceBodyWithEmptyLines = FormatStyle::WNBWELS_Never;
+
+  // Empty namespace.
+  verifyFormat("namespace N {}", Style);
+
+  // Single namespace.
+  verifyFormat("namespace N {\n"
+               "int f1(int a) { return 2 * a; }\n"
+               "}",
+               "namespace N {\n"
+               "\n"
+               "\n"
+               "int f1(int a) { return 2 * a; }\n"
+               "\n"
+               "\n"
+               "}",
+               Style);
+
+  // Nested namespace.
+  verifyFormat("namespace N1 {\n"
+               "namespace N2 {\n"
+               "int a = 1;\n"
+               "}\n"
+               "}",
+               "namespace N1 {\n"
+               "\n"
+               "\n"
+               "namespace N2 {\n"
+               "\n"
+               "int a = 1;\n"
+               "\n"
+               "}\n"
+               "\n"
+               "\n"
+               "}",
+               Style);
+
+  Style.CompactNamespaces = true;
+
+  verifyFormat("namespace N1 { namespace N2 {\n"
+               "int a = 1;\n"
+               "}}",
+               "namespace N1 { namespace N2 {\n"
+               "\n"
+               "\n"
+               "int a = 1;\n"
+               "\n"
+               "\n"
+               "}}",
+               Style);
+}
+
+TEST_F(FormatTest, WrapNamespaceBodyWithEmptyLinesAlways) {
+  auto Style = getLLVMStyle();
+  Style.FixNamespaceComments = false;
+  Style.MaxEmptyLinesToKeep = 2;
+  Style.WrapNamespaceBodyWithEmptyLines = FormatStyle::WNBWELS_Always;
+
+  // Empty namespace.
+  verifyFormat("namespace N {}", Style);
+
+  // Single namespace.
+  verifyFormat("namespace N {\n"
+               "\n"
+               "int f1(int a) { return 2 * a; }\n"
+               "\n"
+               "}",
+               "namespace N {\n"
+               "int f1(int a) { return 2 * a; }\n"
+               "}",
+               Style);
+
+  // Nested namespace.
+  verifyFormat("namespace N1 {\n"
+               "namespace N2 {\n"
+               "\n"
+               "int a = 1;\n"
+               "\n"
+               "}\n"
+               "}",
+               "namespace N1 {\n"
+               "namespace N2 {\n"
+               "int a = 1;\n"
+               "}\n"
+               "}",
+               Style);
+
+  verifyFormat("namespace N1 {\n"
+               "\n"
+               "namespace N2 {\n"
+               "\n"
+               "\n"
+               "int a = 1;\n"
+               "\n"
+               "\n"
+               "}\n"
+               "\n"
+               "}",
+               "namespace N1 {\n"
+               "\n"
+               "namespace N2 {\n"
+               "\n"
+               "\n"
+               "\n"
+               "int a = 1;\n"
+               "\n"
+               "\n"
+               "\n"
+               "}\n"
+               "\n"
+               "}",
+               Style);
+
+  Style.CompactNamespaces = true;
+
+  verifyFormat("namespace N1 { namespace N2 {\n"
+               "\n"
+               "int a = 1;\n"
+               "\n"
+               "}}",
+               "namespace N1 { namespace N2 {\n"
+               "int a = 1;\n"
+               "}}",
+               Style);
+}
+
 } // namespace
 } // namespace test
 } // namespace format
diff --git a/clang/unittests/Format/MatchFilePathTest.cpp b/clang/unittests/Format/MatchFilePathTest.cpp
index 28f6656..346ea7c 100644
--- a/clang/unittests/Format/MatchFilePathTest.cpp
+++ b/clang/unittests/Format/MatchFilePathTest.cpp
@@ -164,6 +164,41 @@ TEST_F(MatchFilePathTest, Path) {
   EXPECT_FALSE(match("foo\\", R"(foo*\)"));
 }
 
+TEST_F(MatchFilePathTest, Globstar) {
+  EXPECT_TRUE(match("/", "**"));
+  EXPECT_TRUE(match("foo", "**"));
+  EXPECT_TRUE(match("/foo", "**"));
+  EXPECT_TRUE(match("foo/", "**"));
+  EXPECT_TRUE(match("foo/bar", "**"));
+
+  EXPECT_TRUE(match("/", "**/"));
+  EXPECT_TRUE(match("foo/", "**/"));
+  EXPECT_TRUE(match("/foo/", "**/"));
+  EXPECT_TRUE(match("foo/bar/", "**/"));
+
+  EXPECT_TRUE(match("/", "/**"));
+  EXPECT_TRUE(match("/foo", "/**"));
+  EXPECT_TRUE(match("/foo/", "/**"));
+  EXPECT_TRUE(match("/foo/bar", "/**"));
+
+  EXPECT_TRUE(match("foo", "**/foo"));
+  EXPECT_TRUE(match("/foo", "**/foo"));
+  EXPECT_TRUE(match("foo/bar", "**/bar"));
+  EXPECT_TRUE(match("/foo/bar", "**/foo/bar"));
+  EXPECT_TRUE(match("foo/bar/baz", "**/bar/baz"));
+
+  EXPECT_TRUE(match("abc/foo", "abc/**"));
+  EXPECT_TRUE(match("abc/foo/", "abc/**"));
+  EXPECT_TRUE(match("abc/foo/bar", "abc/**"));
+
+  EXPECT_TRUE(match("a/b", "a/**/b"));
+  EXPECT_TRUE(match("a/x/b", "a/**/b"));
+  EXPECT_TRUE(match("a/x/y/b", "a/**/b"));
+
+  EXPECT_FALSE(match("a/x/b", "a**/b"));
+  EXPECT_FALSE(match("a/x/b", "a/**b"));
+}
+
 } // namespace
 } // namespace format
 } // namespace clang
diff --git a/clang/unittests/Format/SortIncludesTest.cpp b/clang/unittests/Format/SortIncludesTest.cpp
index 3175382..cb3f8c7 100644
--- a/clang/unittests/Format/SortIncludesTest.cpp
+++ b/clang/unittests/Format/SortIncludesTest.cpp
@@ -984,6 +984,18 @@ TEST_F(SortIncludesTest, SortAndDeduplicateIncludes) {
                     "#include <c>\n"
                     "#include <b>"));
 
+  verifyFormat("/* COPYRIGHT *\\\n"
+               "\\* (C) 2024  */\n"
+               "\n"
+               "#include <a>\n"
+               "#include <b>",
+               sort("/* COPYRIGHT *\\\n"
+                    "\\* (C) 2024  */\n"
+                    "\n"
+                    "#include <b>\n"
+                    "#include <a>\n"
+                    "#include <b>"));
+
   Style.IncludeBlocks = tooling::IncludeStyle::IBS_Merge;
   verifyFormat("#include <a>\n"
                "#include <b>\n"
diff --git a/clang/unittests/Format/TokenAnnotatorTest.cpp b/clang/unittests/Format/TokenAnnotatorTest.cpp
index b2fb522..0383780 100644
--- a/clang/unittests/Format/TokenAnnotatorTest.cpp
+++ b/clang/unittests/Format/TokenAnnotatorTest.cpp
@@ -370,6 +370,21 @@ TEST_F(TokenAnnotatorTest, UnderstandsUsesOfStarAndAmp) {
   ASSERT_EQ(Tokens.size(), 18u) << Tokens;
   EXPECT_TOKEN(Tokens[8], tok::r_paren, TT_CastRParen);
   EXPECT_TOKEN(Tokens[11], tok::star, TT_BinaryOperator);
+
+  Tokens = annotate("template <typename T>\n"
+                    "concept C = requires(T a, T b) { a && b; };");
+  ASSERT_EQ(Tokens.size(), 24u) << Tokens;
+  EXPECT_TOKEN(Tokens[16], tok::l_brace, TT_RequiresExpressionLBrace);
+  EXPECT_TOKEN(Tokens[18], tok::ampamp, TT_BinaryOperator);
+
+  Tokens = annotate("template <typename T, typename V>\n"
+                    "concept CheckMultiplicableBy = requires(T a, V b) {\n"
+                    "  { a * b } -> std::same_as<T>;\n"
+                    "};");
+  ASSERT_EQ(Tokens.size(), 36u) << Tokens;
+  EXPECT_TOKEN(Tokens[19], tok::l_brace, TT_RequiresExpressionLBrace);
+  EXPECT_TOKEN(Tokens[20], tok::l_brace, TT_CompoundRequirementLBrace);
+  EXPECT_TOKEN(Tokens[22], tok::star, TT_BinaryOperator);
 }
 
 TEST_F(TokenAnnotatorTest, UnderstandsUsesOfPlusAndMinus) {
@@ -3615,6 +3630,19 @@ TEST_F(TokenAnnotatorTest, TemplateInstantiation) {
   EXPECT_TOKEN(Tokens[18], tok::greater, TT_TemplateCloser);
 }
 
+TEST_F(TokenAnnotatorTest, VariableTemplate) {
+  auto Style = getLLVMStyle();
+  Style.VariableTemplates.push_back("a");
+
+  auto Tokens = annotate("auto t3 = (a<int>) + b;", Style);
+  ASSERT_EQ(Tokens.size(), 13u) << Tokens;
+  EXPECT_TOKEN(Tokens[4], tok::identifier, TT_VariableTemplate);
+  EXPECT_TOKEN(Tokens[5], tok::less, TT_TemplateOpener);
+  EXPECT_TOKEN(Tokens[7], tok::greater, TT_TemplateCloser);
+  EXPECT_TOKEN(Tokens[8], tok::r_paren, TT_Unknown); // Not TT_CastRParen
+  EXPECT_TOKEN(Tokens[9], tok::plus, TT_BinaryOperator);
+}
+
 TEST_F(TokenAnnotatorTest, SwitchInMacroArgument) {
   auto Tokens = annotate("FOOBAR(switch);\n"
                          "void f() {}");
@@ -3622,6 +3650,13 @@ TEST_F(TokenAnnotatorTest, SwitchInMacroArgument) {
   EXPECT_TOKEN(Tokens[9], tok::l_brace, TT_FunctionLBrace);
 }
 
+TEST_F(TokenAnnotatorTest, AfterPPDirective) {
+  auto Tokens = annotate("#error -- My error message");
+
+  ASSERT_EQ(Tokens.size(), 7u) << Tokens;
+  EXPECT_TOKEN(Tokens[2], tok::minusminus, TT_AfterPPDirective);
+}
+
 } // namespace
 } // namespace format
 } // namespace clang
diff --git a/clang/utils/TableGen/ClangBuiltinsEmitter.cpp b/clang/utils/TableGen/ClangBuiltinsEmitter.cpp
index 6c3604a..94cc218 100644
--- a/clang/utils/TableGen/ClangBuiltinsEmitter.cpp
+++ b/clang/utils/TableGen/ClangBuiltinsEmitter.cpp
@@ -25,12 +25,14 @@ enum class BuiltinType {
   LibBuiltin,
   LangBuiltin,
   TargetBuiltin,
+  TargetLibBuiltin,
 };
 
 class PrototypeParser {
 public:
   PrototypeParser(StringRef Substitution, const Record *Builtin)
-      : Loc(Builtin->getFieldLoc("Prototype")), Substitution(Substitution) {
+      : Loc(Builtin->getFieldLoc("Prototype")), Substitution(Substitution),
+        EnableOpenCLLong(Builtin->getValueAsBit("EnableOpenCLLong")) {
     ParsePrototype(Builtin->getValueAsString("Prototype"));
   }
 
@@ -108,9 +110,15 @@ private:
     } else if (T.consume_back("&")) {
       ParseType(T);
       Type += "&";
+    } else if (EnableOpenCLLong && T.consume_front("long long")) {
+      Type += "O";
+      ParseType(T);
     } else if (T.consume_front("long")) {
       Type += "L";
       ParseType(T);
+    } else if (T.consume_front("signed")) {
+      Type += "S";
+      ParseType(T);
     } else if (T.consume_front("unsigned")) {
       Type += "U";
       ParseType(T);
@@ -155,6 +163,7 @@ private:
                                .Case("__fp16", "h")
                                .Case("__int128_t", "LLLi")
                                .Case("_Float16", "x")
+                               .Case("__bf16", "y")
                                .Case("bool", "b")
                                .Case("char", "c")
                                .Case("constant_CFString", "F")
@@ -194,6 +203,7 @@ public:
 private:
   SMLoc Loc;
   StringRef Substitution;
+  bool EnableOpenCLLong;
   std::string Type;
 };
 
@@ -262,6 +272,9 @@ void EmitBuiltinDef(raw_ostream &OS, StringRef Substitution,
   case BuiltinType::TargetBuiltin:
     OS << "TARGET_BUILTIN";
     break;
+  case BuiltinType::TargetLibBuiltin:
+    OS << "TARGET_HEADER_BUILTIN";
+    break;
   }
 
   OS << "(" << Spelling;
@@ -279,6 +292,12 @@ void EmitBuiltinDef(raw_ostream &OS, StringRef Substitution,
     OS << ", " << Builtin->getValueAsString("Languages");
     break;
   }
+  case BuiltinType::TargetLibBuiltin: {
+    OS << ", ";
+    HeaderNameParser{Builtin}.Print(OS);
+    OS << ", " << Builtin->getValueAsString("Languages");
+    [[fallthrough]];
+  }
   case BuiltinType::TargetBuiltin:
     OS << ", \"" << Builtin->getValueAsString("Features") << "\"";
     break;
@@ -331,6 +350,8 @@ void EmitBuiltin(raw_ostream &OS, const Record *Builtin) {
         BT = BuiltinType::AtomicBuiltin;
       } else if (Builtin->isSubClassOf("LangBuiltin")) {
         BT = BuiltinType::LangBuiltin;
+      } else if (Builtin->isSubClassOf("TargetLibBuiltin")) {
+        BT = BuiltinType::TargetLibBuiltin;
       } else if (Builtin->isSubClassOf("TargetBuiltin")) {
         BT = BuiltinType::TargetBuiltin;
       } else if (Builtin->isSubClassOf("LibBuiltin")) {
@@ -367,6 +388,10 @@ void clang::EmitClangBuiltins(const RecordKeeper &Records, raw_ostream &OS) {
 #if defined(BUILTIN) && !defined(TARGET_BUILTIN)
 #  define TARGET_BUILTIN(ID, TYPE, ATTRS, FEATURE) BUILTIN(ID, TYPE, ATTRS)
 #endif
+
+#if defined(BUILTIN) && !defined(TARGET_HEADER_BUILTIN)
+#  define TARGET_HEADER_BUILTIN(ID, TYPE, ATTRS, HEADER, LANG, FEATURE) BUILTIN(ID, TYPE, ATTRS)
+#endif
 )c++";
 
   // AtomicBuiltins are order dependent
@@ -390,5 +415,6 @@ void clang::EmitClangBuiltins(const RecordKeeper &Records, raw_ostream &OS) {
 #undef LIBBUILTIN
 #undef LANGBUILTIN
 #undef TARGET_BUILTIN
+#undef TARGET_HEADER_BUILTIN
 )c++";
 }
diff --git a/clang/utils/TableGen/SveEmitter.cpp b/clang/utils/TableGen/SveEmitter.cpp
index cf7e5a1..97b768d 100644
--- a/clang/utils/TableGen/SveEmitter.cpp
+++ b/clang/utils/TableGen/SveEmitter.cpp
@@ -1640,13 +1640,6 @@ void SVEEmitter::createSMEHeader(raw_ostream &OS) {
   OS << "  return x0 & (1ULL << 63);\n";
   OS << "}\n\n";
 
-  OS << "__ai bool __arm_in_streaming_mode(void) __arm_streaming_compatible "
-        "{\n";
-  OS << "  uint64_t x0, x1;\n";
-  OS << "  __builtin_arm_get_sme_state(&x0, &x1);\n";
-  OS << "  return x0 & 1;\n";
-  OS << "}\n\n";
-
   OS << "void *__arm_sc_memcpy(void *dest, const void *src, size_t n) __arm_streaming_compatible;\n";
   OS << "void *__arm_sc_memmove(void *dest, const void *src, size_t n) __arm_streaming_compatible;\n";
   OS << "void *__arm_sc_memset(void *s, int c, size_t n) __arm_streaming_compatible;\n";
diff --git a/clang/www/cxx_dr_status.html b/clang/www/cxx_dr_status.html
index 386c572..f2716f1 100755
--- a/clang/www/cxx_dr_status.html
+++ b/clang/www/cxx_dr_status.html
@@ -63,7 +63,7 @@
     <td><a href="https://cplusplus.github.io/CWG/issues/3.html">3</a></td>
     <td>NAD</td>
     <td>The template compilation model rules render some explicit specialization declarations not visible during instantiation</td>
-    <td class="full" align="center">Yes</td>
+    <td class="full" align="center">Clang 2.7</td>
   </tr>
   <tr id="4">
     <td><a href="https://cplusplus.github.io/CWG/issues/4.html">4</a></td>
@@ -111,7 +111,7 @@
     <td><a href="https://cplusplus.github.io/CWG/issues/11.html">11</a></td>
     <td>CD1</td>
     <td>How do the keywords typename/template interact with using-declarations?</td>
-    <td class="full" align="center">Yes</td>
+    <td class="full" align="center">Clang 2.7</td>
   </tr>
   <tr id="12">
     <td><a href="https://cplusplus.github.io/CWG/issues/12.html">12</a></td>
@@ -135,7 +135,7 @@
     <td><a href="https://cplusplus.github.io/CWG/issues/15.html">15</a></td>
     <td>dup</td>
     <td>Default arguments for parameters of function templates</td>
-    <td class="full" align="center">Yes</td>
+    <td class="full" align="center">Clang 2.7</td>
   </tr>
   <tr id="16">
     <td><a href="https://cplusplus.github.io/CWG/issues/16.html">16</a></td>
@@ -147,7 +147,7 @@
     <td><a href="https://cplusplus.github.io/CWG/issues/17.html">17</a></td>
     <td>NAD</td>
     <td>Footnote 99 should discuss the naming class when describing members that can be accessed from friends</td>
-    <td class="full" align="center">Yes</td>
+    <td class="full" align="center">Clang 2.7</td>
   </tr>
   <tr id="18">
     <td><a href="https://cplusplus.github.io/CWG/issues/18.html">18</a></td>
@@ -183,7 +183,7 @@
     <td><a href="https://cplusplus.github.io/CWG/issues/23.html">23</a></td>
     <td>NAD</td>
     <td>Some questions regarding partial ordering of function templates</td>
-    <td class="full" align="center">Yes</td>
+    <td class="full" align="center">Clang 2.7</td>
   </tr>
   <tr id="24">
     <td><a href="https://cplusplus.github.io/CWG/issues/24.html">24</a></td>
@@ -195,19 +195,19 @@
     <td><a href="https://cplusplus.github.io/CWG/issues/25.html">25</a></td>
     <td>TC1</td>
     <td>Exception specifications and pointers to members</td>
-    <td class="full" align="center">Yes</td>
+    <td class="full" align="center">Clang 4</td>
   </tr>
   <tr id="26">
     <td><a href="https://cplusplus.github.io/CWG/issues/26.html">26</a></td>
     <td>NAD</td>
     <td>Copy constructors and default arguments</td>
-    <td class="full" align="center">Yes</td>
+    <td class="full" align="center">Clang 2.7</td>
   </tr>
   <tr id="27">
     <td><a href="https://cplusplus.github.io/CWG/issues/27.html">27</a></td>
     <td>NAD</td>
     <td>Overload ambiguities for builtin ?: prototypes</td>
-    <td class="full" align="center">Yes</td>
+    <td class="full" align="center">Clang 2.7</td>
   </tr>
   <tr id="28">
     <td><a href="https://cplusplus.github.io/CWG/issues/28.html">28</a></td>
@@ -273,7 +273,7 @@
     <td><a href="https://cplusplus.github.io/CWG/issues/38.html">38</a></td>
     <td>TC1</td>
     <td>Explicit template arguments and operator functions</td>
-    <td class="full" align="center">Yes</td>
+    <td class="full" align="center">Clang 2.7</td>
   </tr>
   <tr id="39">
     <td><a href="https://cplusplus.github.io/CWG/issues/39.html">39</a></td>
@@ -291,13 +291,13 @@
     <td><a href="https://cplusplus.github.io/CWG/issues/41.html">41</a></td>
     <td>TC1</td>
     <td>Clarification of lookup of names after declarator-id</td>
-    <td class="full" align="center">Yes</td>
+    <td class="full" align="center">Clang 2.7</td>
   </tr>
   <tr id="42">
     <td><a href="https://cplusplus.github.io/CWG/issues/42.html">42</a></td>
     <td>NAD</td>
     <td>Redefining names from base classes</td>
-    <td class="full" align="center">Yes</td>
+    <td class="full" align="center">Clang 2.7</td>
   </tr>
   <tr id="43">
     <td><a href="https://cplusplus.github.io/CWG/issues/43.html">43</a></td>
@@ -315,13 +315,13 @@
     <td><a href="https://cplusplus.github.io/CWG/issues/45.html">45</a></td>
     <td>CD1</td>
     <td>Access to nested classes</td>
-    <td class="full" align="center">Yes</td>
+    <td class="full" align="center">Clang 2.7</td>
   </tr>
   <tr id="46">
     <td><a href="https://cplusplus.github.io/CWG/issues/46.html">46</a></td>
     <td>NAD</td>
     <td>Explicit instantiation of member templates</td>
-    <td class="full" align="center">Yes</td>
+    <td class="full" align="center">Clang 2.7</td>
   </tr>
   <tr id="47">
     <td><a href="https://cplusplus.github.io/CWG/issues/47.html">47</a></td>
@@ -333,7 +333,7 @@
     <td><a href="https://cplusplus.github.io/CWG/issues/48.html">48</a></td>
     <td>TC1</td>
     <td>Definitions of unused static members</td>
-    <td class="full" align="center">Yes</td>
+    <td class="full" align="center">Clang 2.7</td>
   </tr>
   <tr id="49">
     <td><a href="https://cplusplus.github.io/CWG/issues/49.html">49</a></td>
@@ -345,7 +345,7 @@
     <td><a href="https://cplusplus.github.io/CWG/issues/50.html">50</a></td>
     <td>NAD</td>
     <td>Converting pointer to incomplete type to same type</td>
-    <td class="full" align="center">Yes</td>
+    <td class="full" align="center">Clang 2.7</td>
   </tr>
   <tr id="51">
     <td><a href="https://cplusplus.github.io/CWG/issues/51.html">51</a></td>
@@ -363,7 +363,7 @@
     <td><a href="https://cplusplus.github.io/CWG/issues/53.html">53</a></td>
     <td>TC1</td>
     <td>Lvalue-to-rvalue conversion before certain static_casts</td>
-    <td class="full" align="center">Yes</td>
+    <td class="full" align="center">Clang 2.7</td>
   </tr>
   <tr id="54">
     <td><a href="https://cplusplus.github.io/CWG/issues/54.html">54</a></td>
@@ -375,13 +375,13 @@
     <td><a href="https://cplusplus.github.io/CWG/issues/55.html">55</a></td>
     <td>NAD</td>
     <td>Adding/subtracting pointer and enumeration value</td>
-    <td class="full" align="center">Yes</td>
+    <td class="full" align="center">Clang 2.7</td>
   </tr>
   <tr id="56">
     <td><a href="https://cplusplus.github.io/CWG/issues/56.html">56</a></td>
     <td>TC1</td>
     <td>Redeclaring typedefs within classes</td>
-    <td class="full" align="center">Yes</td>
+    <td class="full" align="center">Clang 2.7</td>
   </tr>
   <tr class="open" id="57">
     <td><a href="https://cplusplus.github.io/CWG/issues/57.html">57</a></td>
@@ -399,13 +399,13 @@
     <td><a href="https://cplusplus.github.io/CWG/issues/59.html">59</a></td>
     <td>TC1</td>
     <td>Clarification of overloading and UDC to reference type</td>
-    <td class="full" align="center">Yes</td>
+    <td class="full" align="center">Clang 2.7</td>
   </tr>
   <tr id="60">
     <td><a href="https://cplusplus.github.io/CWG/issues/60.html">60</a></td>
     <td>CD1</td>
     <td>Reference binding and valid conversion sequences</td>
-    <td class="full" align="center">Yes</td>
+    <td class="full" align="center">Clang 2.7</td>
   </tr>
   <tr id="61">
     <td><a href="https://cplusplus.github.io/CWG/issues/61.html">61</a></td>
@@ -423,13 +423,13 @@
     <td><a href="https://cplusplus.github.io/CWG/issues/63.html">63</a></td>
     <td>CD1</td>
     <td>Class instantiation from pointer conversion to void*, null and self</td>
-    <td class="full" align="center">Yes</td>
+    <td class="full" align="center">Clang 2.7</td>
   </tr>
   <tr id="64">
     <td><a href="https://cplusplus.github.io/CWG/issues/64.html">64</a></td>
     <td>TC1</td>
     <td>Partial ordering to disambiguate explicit specialization</td>
-    <td class="full" align="center">Yes</td>
+    <td class="full" align="center">Clang 2.7</td>
   </tr>
   <tr id="65">
     <td><a href="https://cplusplus.github.io/CWG/issues/65.html">65</a></td>
@@ -465,7 +465,7 @@
     <td><a href="https://cplusplus.github.io/CWG/issues/70.html">70</a></td>
     <td>CD1</td>
     <td>Is an array bound a nondeduced context?</td>
-    <td class="full" align="center">Yes</td>
+    <td class="full" align="center">Clang 2.7</td>
   </tr>
   <tr id="71">
     <td><a href="https://cplusplus.github.io/CWG/issues/71.html">71</a></td>
@@ -489,25 +489,25 @@
     <td><a href="https://cplusplus.github.io/CWG/issues/74.html">74</a></td>
     <td>TC1</td>
     <td>Enumeration value in direct-new-declarator</td>
-    <td class="full" align="center">Yes</td>
+    <td class="full" align="center">Clang 2.7</td>
   </tr>
   <tr id="75">
     <td><a href="https://cplusplus.github.io/CWG/issues/75.html">75</a></td>
     <td>TC1</td>
     <td>In-class initialized members must be const</td>
-    <td class="full" align="center">Yes</td>
+    <td class="full" align="center">Clang 2.7</td>
   </tr>
   <tr id="76">
     <td><a href="https://cplusplus.github.io/CWG/issues/76.html">76</a></td>
     <td>TC1</td>
     <td>Are const volatile variables considered "constant expressions"?</td>
-    <td class="full" align="center">Yes</td>
+    <td class="full" align="center">Clang 2.7</td>
   </tr>
   <tr id="77">
     <td><a href="https://cplusplus.github.io/CWG/issues/77.html">77</a></td>
     <td>CD1</td>
     <td>The definition of friend does not allow nested classes to be friends</td>
-    <td class="full" align="center">Yes</td>
+    <td class="full" align="center">Clang 2.7</td>
   </tr>
   <tr id="78">
     <td><a href="https://cplusplus.github.io/CWG/issues/78.html">78</a></td>
@@ -543,13 +543,13 @@
     <td><a href="https://cplusplus.github.io/CWG/issues/83.html">83</a></td>
     <td>TC1</td>
     <td>Overloading and deprecated conversion of string literal</td>
-    <td class="full" align="center">Yes</td>
+    <td class="full" align="center">Clang 2.7</td>
   </tr>
   <tr id="84">
     <td><a href="https://cplusplus.github.io/CWG/issues/84.html">84</a></td>
     <td>TC1</td>
     <td>Overloading and conversion loophole used by <TT>auto_ptr</TT></td>
-    <td class="full" align="center">Yes</td>
+    <td class="full" align="center">Clang 2.7</td>
   </tr>
   <tr id="85">
     <td><a href="https://cplusplus.github.io/CWG/issues/85.html">85</a></td>
@@ -585,13 +585,13 @@
     <td><a href="https://cplusplus.github.io/CWG/issues/90.html">90</a></td>
     <td>TC1</td>
     <td>Should the enclosing class be an "associated class" too?</td>
-    <td class="full" align="center">Yes</td>
+    <td class="full" align="center">Clang 2.7</td>
   </tr>
   <tr id="91">
     <td><a href="https://cplusplus.github.io/CWG/issues/91.html">91</a></td>
     <td>NAD</td>
     <td>A union's associated types should include the union itself</td>
-    <td class="full" align="center">Yes</td>
+    <td class="full" align="center">Clang 2.7</td>
   </tr>
   <tr id="92">
     <td><a href="https://cplusplus.github.io/CWG/issues/92.html">92</a></td>
@@ -609,7 +609,7 @@
     <td><a href="https://cplusplus.github.io/CWG/issues/94.html">94</a></td>
     <td>TC1</td>
     <td>Inconsistencies in the descriptions of constant expressions</td>
-    <td class="full" align="center">Yes</td>
+    <td class="full" align="center">Clang 2.7</td>
   </tr>
   <tr id="95">
     <td><a href="https://cplusplus.github.io/CWG/issues/95.html">95</a></td>
@@ -627,13 +627,13 @@
     <td><a href="https://cplusplus.github.io/CWG/issues/97.html">97</a></td>
     <td>NAD</td>
     <td>Use of bool constants in integral constant expressions</td>
-    <td class="full" align="center">Yes</td>
+    <td class="full" align="center">Clang 2.7</td>
   </tr>
   <tr id="98">
     <td><a href="https://cplusplus.github.io/CWG/issues/98.html">98</a></td>
     <td>TC1</td>
     <td>Branching into try block</td>
-    <td class="full" align="center">Yes</td>
+    <td class="full" align="center">Clang 2.7</td>
   </tr>
   <tr id="99">
     <td><a href="https://cplusplus.github.io/CWG/issues/99.html">99</a></td>
@@ -645,7 +645,7 @@
     <td><a href="https://cplusplus.github.io/CWG/issues/100.html">100</a></td>
     <td>TC1</td>
     <td>Clarify why string literals are not allowed as template arguments</td>
-    <td class="full" align="center">Yes</td>
+    <td class="full" align="center">Clang 2.7</td>
   </tr>
   <tr id="101">
     <td><a href="https://cplusplus.github.io/CWG/issues/101.html">101</a></td>
@@ -657,7 +657,7 @@
     <td><a href="https://cplusplus.github.io/CWG/issues/102.html">102</a></td>
     <td>NAD</td>
     <td>Operator lookup rules do not work well with parts of the library</td>
-    <td class="full" align="center">Yes</td>
+    <td class="full" align="center">Clang 2.7</td>
   </tr>
   <tr id="103">
     <td><a href="https://cplusplus.github.io/CWG/issues/103.html">103</a></td>
@@ -687,7 +687,7 @@
     <td><a href="https://cplusplus.github.io/CWG/issues/107.html">107</a></td>
     <td>NAD</td>
     <td>Linkage of operator functions</td>
-    <td class="full" align="center">Yes</td>
+    <td class="full" align="center">Clang 2.7</td>
   </tr>
   <tr id="108">
     <td><a href="https://cplusplus.github.io/CWG/issues/108.html">108</a></td>
@@ -699,7 +699,7 @@
     <td><a href="https://cplusplus.github.io/CWG/issues/109.html">109</a></td>
     <td>NAD</td>
     <td>Allowing <TT>::template</TT> in <I>using-declaration</I>s</td>
-    <td class="full" align="center">Yes</td>
+    <td class="full" align="center">Clang 2.8</td>
   </tr>
   <tr id="110">
     <td><a href="https://cplusplus.github.io/CWG/issues/110.html">110</a></td>
@@ -717,19 +717,19 @@
     <td><a href="https://cplusplus.github.io/CWG/issues/112.html">112</a></td>
     <td>CD1</td>
     <td>Array types and cv-qualifiers</td>
-    <td class="full" align="center">Yes</td>
+    <td class="full" align="center">Clang 3.1</td>
   </tr>
   <tr id="113">
     <td><a href="https://cplusplus.github.io/CWG/issues/113.html">113</a></td>
     <td>CD1</td>
     <td>Visibility of called function</td>
-    <td class="full" align="center">Yes</td>
+    <td class="full" align="center">Clang 2.7</td>
   </tr>
   <tr id="114">
     <td><a href="https://cplusplus.github.io/CWG/issues/114.html">114</a></td>
     <td>NAD</td>
     <td>Virtual overriding by template member function specializations</td>
-    <td class="full" align="center">Yes</td>
+    <td class="full" align="center">Clang 2.7</td>
   </tr>
   <tr id="115">
     <td><a href="https://cplusplus.github.io/CWG/issues/115.html">115</a></td>
@@ -741,7 +741,7 @@
     <td><a href="https://cplusplus.github.io/CWG/issues/116.html">116</a></td>
     <td>TC1</td>
     <td>Equivalent and functionally-equivalent function templates</td>
-    <td class="full" align="center">Yes</td>
+    <td class="full" align="center">Clang 2.7</td>
   </tr>
   <tr id="117">
     <td><a href="https://cplusplus.github.io/CWG/issues/117.html">117</a></td>
@@ -771,13 +771,13 @@
     <td><a href="https://cplusplus.github.io/CWG/issues/121.html">121</a></td>
     <td>TC1</td>
     <td>Dependent type names with non-dependent <I>nested-name-specifier</I>s</td>
-    <td class="full" align="center">Yes</td>
+    <td class="full" align="center">Clang 2.7</td>
   </tr>
   <tr id="122">
     <td><a href="https://cplusplus.github.io/CWG/issues/122.html">122</a></td>
     <td>CD1</td>
     <td><I>template-id</I>s as <I>unqualified-id</I>s</td>
-    <td class="full" align="center">Yes</td>
+    <td class="full" align="center">Clang 2.7</td>
   </tr>
   <tr id="123">
     <td><a href="https://cplusplus.github.io/CWG/issues/123.html">123</a></td>
@@ -795,7 +795,7 @@
     <td><a href="https://cplusplus.github.io/CWG/issues/125.html">125</a></td>
     <td>CD1</td>
     <td>Ambiguity in <TT>friend</TT> declaration syntax</td>
-    <td class="full" align="center">Yes</td>
+    <td class="full" align="center">Clang 2.7</td>
   </tr>
   <tr id="126">
     <td><a href="https://cplusplus.github.io/CWG/issues/126.html">126</a></td>
@@ -813,7 +813,7 @@
     <td><a href="https://cplusplus.github.io/CWG/issues/128.html">128</a></td>
     <td>TC1</td>
     <td>Casting between enum types</td>
-    <td class="full" align="center">Yes</td>
+    <td class="full" align="center">Clang 2.7</td>
   </tr>
   <tr id="129">
     <td><a href="https://cplusplus.github.io/CWG/issues/129.html">129</a></td>
@@ -855,7 +855,7 @@
     <td><a href="https://cplusplus.github.io/CWG/issues/135.html">135</a></td>
     <td>TC1</td>
     <td>Class type in in-class member function definitions</td>
-    <td class="full" align="center">Yes</td>
+    <td class="full" align="center">Clang 2.7</td>
   </tr>
   <tr id="136">
     <td><a href="https://cplusplus.github.io/CWG/issues/136.html">136</a></td>
@@ -867,7 +867,7 @@
     <td><a href="https://cplusplus.github.io/CWG/issues/137.html">137</a></td>
     <td>TC1</td>
     <td><TT>static_cast</TT> of <I>cv</I> <TT>void*</TT></td>
-    <td class="full" align="center">Yes</td>
+    <td class="full" align="center">Clang 2.7</td>
   </tr>
   <tr id="138">
     <td><a href="https://cplusplus.github.io/CWG/issues/138.html">138</a></td>
@@ -879,13 +879,13 @@
     <td><a href="https://cplusplus.github.io/CWG/issues/139.html">139</a></td>
     <td>CD1</td>
     <td>Error in <TT>friend</TT> lookup example</td>
-    <td class="full" align="center">Yes</td>
+    <td class="full" align="center">Clang 2.7</td>
   </tr>
   <tr id="140">
     <td><a href="https://cplusplus.github.io/CWG/issues/140.html">140</a></td>
     <td>CD1</td>
     <td>Agreement of parameter declarations</td>
-    <td class="full" align="center">Yes</td>
+    <td class="full" align="center">Clang 2.7</td>
   </tr>
   <tr id="141">
     <td><a href="https://cplusplus.github.io/CWG/issues/141.html">141</a></td>
@@ -903,7 +903,7 @@
     <td><a href="https://cplusplus.github.io/CWG/issues/143.html">143</a></td>
     <td>CD1</td>
     <td>Friends and Koenig lookup</td>
-    <td class="full" align="center">Yes</td>
+    <td class="full" align="center">Clang 2.7</td>
   </tr>
   <tr class="open" id="144">
     <td><a href="https://cplusplus.github.io/CWG/issues/144.html">144</a></td>
@@ -915,7 +915,7 @@
     <td><a href="https://cplusplus.github.io/CWG/issues/145.html">145</a></td>
     <td>TC1</td>
     <td>Deprecation of prefix <TT>++</TT></td>
-    <td class="full" align="center">Yes</td>
+    <td class="full" align="center">Clang 2.7</td>
   </tr>
   <tr class="open" id="146">
     <td><a href="https://cplusplus.github.io/CWG/issues/146.html">146</a></td>
@@ -927,13 +927,13 @@
     <td><a href="https://cplusplus.github.io/CWG/issues/147.html">147</a></td>
     <td>TC1</td>
     <td>Naming the constructor</td>
-    <td class="full" align="center">Yes</td>
+    <td class="full" align="center">Clang 2.7</td>
   </tr>
   <tr id="148">
     <td><a href="https://cplusplus.github.io/CWG/issues/148.html">148</a></td>
     <td>TC1</td>
     <td>POD classes and pointers to members</td>
-    <td class="full" align="center">Yes</td>
+    <td class="full" align="center">Clang 2.7</td>
   </tr>
   <tr id="149">
     <td><a href="https://cplusplus.github.io/CWG/issues/149.html">149</a></td>
@@ -957,7 +957,7 @@
     <td><a href="https://cplusplus.github.io/CWG/issues/152.html">152</a></td>
     <td>TC1</td>
     <td><TT>explicit</TT> copy constructors</td>
-    <td class="full" align="center">Yes</td>
+    <td class="full" align="center">Clang 2.7</td>
   </tr>
   <tr id="153">
     <td><a href="https://cplusplus.github.io/CWG/issues/153.html">153</a></td>
@@ -969,7 +969,7 @@
     <td><a href="https://cplusplus.github.io/CWG/issues/154.html">154</a></td>
     <td>NAD</td>
     <td>Anonymous unions in unnamed namespaces</td>
-    <td class="full" align="center">Yes</td>
+    <td class="full" align="center">Clang 2.7</td>
   </tr>
   <tr id="155">
     <td><a href="https://cplusplus.github.io/CWG/issues/155.html">155</a></td>
@@ -981,7 +981,7 @@
     <td><a href="https://cplusplus.github.io/CWG/issues/156.html">156</a></td>
     <td>NAD</td>
     <td>Name lookup for conversion functions</td>
-    <td class="unknown" align="center">Unknown</td>
+    <td class="partial-superseded" align="center">Superseded by <a href="#1111">1111</a></td>
   </tr>
   <tr class="open" id="157">
     <td><a href="https://cplusplus.github.io/CWG/issues/157.html">157</a></td>
@@ -1029,7 +1029,7 @@
     <td><a href="https://cplusplus.github.io/CWG/issues/164.html">164</a></td>
     <td>TC1</td>
     <td>Overlap between Koenig and normal lookup</td>
-    <td class="full" align="center">Yes</td>
+    <td class="full" align="center">Clang 2.7</td>
   </tr>
   <tr id="165">
     <td><a href="https://cplusplus.github.io/CWG/issues/165.html">165</a></td>
@@ -1059,7 +1059,7 @@
     <td><a href="https://cplusplus.github.io/CWG/issues/169.html">169</a></td>
     <td>NAD</td>
     <td><I>template-id</I>s in <I>using-declaration</I>s</td>
-    <td class="full" align="center">Yes</td>
+    <td class="full" align="center">Clang 3.4</td>
   </tr>
   <tr id="170">
     <td><a href="https://cplusplus.github.io/CWG/issues/170.html">170</a></td>
@@ -1077,13 +1077,13 @@
     <td><a href="https://cplusplus.github.io/CWG/issues/172.html">172</a></td>
     <td>CD1</td>
     <td>Unsigned int as underlying type of enum</td>
-    <td class="full" align="center">Yes</td>
+    <td class="full" align="center">Clang 2.7</td>
   </tr>
   <tr id="173">
     <td><a href="https://cplusplus.github.io/CWG/issues/173.html">173</a></td>
     <td>TC1</td>
     <td>Constraints on execution character set</td>
-    <td class="full" align="center">Yes</td>
+    <td class="full" align="center">Clang 2.7</td>
   </tr>
   <tr id="174">
     <td><a href="https://cplusplus.github.io/CWG/issues/174.html">174</a></td>
@@ -1107,19 +1107,19 @@
     <td><a href="https://cplusplus.github.io/CWG/issues/177.html">177</a></td>
     <td>CD1</td>
     <td>Lvalues vs rvalues in copy-initialization</td>
-    <td class="full" align="center">Yes</td>
+    <td class="full" align="center">Clang 2.7</td>
   </tr>
   <tr id="178">
     <td><a href="https://cplusplus.github.io/CWG/issues/178.html">178</a></td>
     <td>TC1</td>
     <td>More on value-initialization</td>
-    <td class="full" align="center">Yes</td>
+    <td class="full" align="center">Clang 3.1</td>
   </tr>
   <tr id="179">
     <td><a href="https://cplusplus.github.io/CWG/issues/179.html">179</a></td>
     <td>TC1</td>
     <td>Function pointers and subtraction</td>
-    <td class="full" align="center">Yes</td>
+    <td class="full" align="center">Clang 2.7</td>
   </tr>
   <tr id="180">
     <td><a href="https://cplusplus.github.io/CWG/issues/180.html">180</a></td>
@@ -1131,7 +1131,7 @@
     <td><a href="https://cplusplus.github.io/CWG/issues/181.html">181</a></td>
     <td>TC1</td>
     <td>Errors in template <I>template-parameter</I> example</td>
-    <td class="full" align="center">Yes</td>
+    <td class="full" align="center">Clang 2.7</td>
   </tr>
   <tr id="182">
     <td><a href="https://cplusplus.github.io/CWG/issues/182.html">182</a></td>
@@ -1149,7 +1149,7 @@
     <td><a href="https://cplusplus.github.io/CWG/issues/184.html">184</a></td>
     <td>CD1</td>
     <td>Default arguments in template <I>template-parameter</I>s</td>
-    <td class="full" align="center">Yes</td>
+    <td class="full" align="center">Clang 2.7</td>
   </tr>
   <tr id="185">
     <td><a href="https://cplusplus.github.io/CWG/issues/185.html">185</a></td>
@@ -1173,7 +1173,7 @@
     <td><a href="https://cplusplus.github.io/CWG/issues/188.html">188</a></td>
     <td>TC1</td>
     <td>Comma operator and rvalue conversion</td>
-    <td class="full" align="center">Yes</td>
+    <td class="full" align="center">Clang 2.7</td>
   </tr>
   <tr class="open" id="189">
     <td><a href="https://cplusplus.github.io/CWG/issues/189.html">189</a></td>
@@ -1185,19 +1185,19 @@
     <td><a href="https://cplusplus.github.io/CWG/issues/190.html">190</a></td>
     <td>TC1</td>
     <td>Layout-compatible POD-struct types</td>
-    <td class="unknown" align="center">Unknown</td>
+    <td class="full" align="center">Clang 19</td>
   </tr>
   <tr id="191">
     <td><a href="https://cplusplus.github.io/CWG/issues/191.html">191</a></td>
     <td>CD6</td>
     <td>Name lookup does not handle complex nesting</td>
-    <td class="full" align="center">Yes</td>
+    <td class="full" align="center">Clang 2.7</td>
   </tr>
   <tr id="192">
     <td><a href="https://cplusplus.github.io/CWG/issues/192.html">192</a></td>
     <td>NAD</td>
     <td>Name lookup in parameters</td>
-    <td class="unknown" align="center">Unknown</td>
+    <td class="full" align="center">Clang 2.7</td>
   </tr>
   <tr id="193">
     <td><a href="https://cplusplus.github.io/CWG/issues/193.html">193</a></td>
@@ -1209,13 +1209,13 @@
     <td><a href="https://cplusplus.github.io/CWG/issues/194.html">194</a></td>
     <td>TC1</td>
     <td>Identifying constructors</td>
-    <td class="full" align="center">Yes</td>
+    <td class="full" align="center">Clang 2.7</td>
   </tr>
   <tr id="195">
     <td><a href="https://cplusplus.github.io/CWG/issues/195.html">195</a></td>
     <td>CD1</td>
     <td>Converting between function and object pointers</td>
-    <td class="full" align="center">Yes</td>
+    <td class="full" align="center">Clang 2.7</td>
   </tr>
   <tr class="open" id="196">
     <td><a href="https://cplusplus.github.io/CWG/issues/196.html">196</a></td>
@@ -1227,13 +1227,13 @@
     <td><a href="https://cplusplus.github.io/CWG/issues/197.html">197</a></td>
     <td>CD1</td>
     <td>Issues with two-stage lookup of dependent names</td>
-    <td class="full" align="center">Yes</td>
+    <td class="full" align="center">Clang 2.7</td>
   </tr>
   <tr id="198">
     <td><a href="https://cplusplus.github.io/CWG/issues/198.html">198</a></td>
     <td>CD1</td>
     <td>Definition of "use" in local and nested classes</td>
-    <td class="full" align="center">Yes</td>
+    <td class="full" align="center">Clang 2.9</td>
   </tr>
   <tr id="199">
     <td><a href="https://cplusplus.github.io/CWG/issues/199.html">199</a></td>
@@ -1263,7 +1263,7 @@
     <td><a href="https://cplusplus.github.io/CWG/issues/203.html">203</a></td>
     <td>NAD</td>
     <td>Type of address-of-member expression</td>
-    <td class="unknown" align="center">Unknown</td>
+    <td class="full" align="center">Clang 3.0</td>
   </tr>
   <tr id="204">
     <td><a href="https://cplusplus.github.io/CWG/issues/204.html">204</a></td>
@@ -1281,13 +1281,13 @@
     <td><a href="https://cplusplus.github.io/CWG/issues/206.html">206</a></td>
     <td>TC1</td>
     <td>Semantic constraints on non-dependent names</td>
-    <td class="full" align="center">Yes</td>
+    <td class="full" align="center">Clang 2.7</td>
   </tr>
   <tr id="207">
     <td><a href="https://cplusplus.github.io/CWG/issues/207.html">207</a></td>
     <td>CD1</td>
     <td><I>using-declaration</I>s and protected access</td>
-    <td class="full" align="center">Yes</td>
+    <td class="full" align="center">Clang 2.7</td>
   </tr>
   <tr id="208">
     <td><a href="https://cplusplus.github.io/CWG/issues/208.html">208</a></td>
@@ -1312,7 +1312,7 @@ accessible?</td>
     <td><a href="https://cplusplus.github.io/CWG/issues/211.html">211</a></td>
     <td>NAD</td>
     <td>Constructors should not be allowed to return normally after an exception</td>
-    <td class="full" align="center">Yes</td>
+    <td class="full" align="center">Clang 2.7</td>
   </tr>
   <tr id="212">
     <td><a href="https://cplusplus.github.io/CWG/issues/212.html">212</a></td>
@@ -1324,13 +1324,13 @@ accessible?</td>
     <td><a href="https://cplusplus.github.io/CWG/issues/213.html">213</a></td>
     <td>TC1</td>
     <td>Lookup in dependent base classes</td>
-    <td class="full" align="center">Yes</td>
+    <td class="full" align="center">Clang 2.7</td>
   </tr>
   <tr id="214">
     <td><a href="https://cplusplus.github.io/CWG/issues/214.html">214</a></td>
     <td>CD1</td>
     <td>Partial ordering of function templates is underspecified</td>
-    <td class="full" align="center">Yes</td>
+    <td class="full" align="center">Clang 2.7</td>
   </tr>
   <tr id="215">
     <td><a href="https://cplusplus.github.io/CWG/issues/215.html">215</a></td>
@@ -1348,13 +1348,13 @@ accessible?</td>
     <td><a href="https://cplusplus.github.io/CWG/issues/217.html">217</a></td>
     <td>TC1</td>
     <td>Default arguments for non-template member functions of class templates</td>
-    <td class="full" align="center">Yes</td>
+    <td class="full" align="center">Clang 2.7</td>
   </tr>
   <tr id="218">
     <td><a href="https://cplusplus.github.io/CWG/issues/218.html">218</a></td>
     <td>CD1</td>
     <td>Specification of Koenig lookup</td>
-    <td class="full" align="center">Yes</td>
+    <td class="full" align="center">Clang 2.7</td>
   </tr>
   <tr id="219">
     <td><a href="https://cplusplus.github.io/CWG/issues/219.html">219</a></td>
@@ -1408,13 +1408,13 @@ accessible?</td>
     <td><a href="https://cplusplus.github.io/CWG/issues/227.html">227</a></td>
     <td>TC1</td>
     <td>How many scopes in an <TT>if</TT> statement?</td>
-    <td class="full" align="center">Yes</td>
+    <td class="full" align="center">Clang 2.7</td>
   </tr>
   <tr id="228">
     <td><a href="https://cplusplus.github.io/CWG/issues/228.html">228</a></td>
     <td>CD1</td>
     <td>Use of <TT>template</TT> keyword with non-member templates</td>
-    <td class="full" align="center">Yes</td>
+    <td class="full" align="center">Clang 2.7</td>
   </tr>
   <tr id="229">
     <td><a href="https://cplusplus.github.io/CWG/issues/229.html">229</a></td>
@@ -1432,7 +1432,7 @@ accessible?</td>
     <td><a href="https://cplusplus.github.io/CWG/issues/231.html">231</a></td>
     <td>NAD</td>
     <td>Visibility of names after <I>using-directive</I>s</td>
-    <td class="full" align="center">Yes</td>
+    <td class="full" align="center">Clang 2.7</td>
   </tr>
   <tr id="232">
     <td><a href="https://cplusplus.github.io/CWG/issues/232.html">232</a></td>
@@ -1442,7 +1442,7 @@ accessible?</td>
   </tr>
   <tr id="233">
     <td><a href="https://cplusplus.github.io/CWG/issues/233.html">233</a></td>
-    <td>DR</td>
+    <td>DRWP</td>
     <td>References vs pointers in UDC overload resolution</td>
     <td class="unknown" align="center">Unknown</td>
   </tr>
@@ -1480,7 +1480,7 @@ accessible?</td>
     <td><a href="https://cplusplus.github.io/CWG/issues/239.html">239</a></td>
     <td>CD1</td>
     <td>Footnote 116 and Koenig lookup</td>
-    <td class="full" align="center">Yes</td>
+    <td class="full" align="center">Clang 2.7</td>
   </tr>
   <tr id="240">
     <td><a href="https://cplusplus.github.io/CWG/issues/240.html">240</a></td>
@@ -1492,7 +1492,7 @@ accessible?</td>
     <td><a href="https://cplusplus.github.io/CWG/issues/241.html">241</a></td>
     <td>TC1</td>
     <td>Error in example in 14.8.1</td>
-    <td class="full" align="center">Yes</td>
+    <td class="full" align="center">Clang 9</td>
   </tr>
   <tr id="242">
     <td><a href="https://cplusplus.github.io/CWG/issues/242.html">242</a></td>
@@ -1504,7 +1504,7 @@ accessible?</td>
     <td><a href="https://cplusplus.github.io/CWG/issues/243.html">243</a></td>
     <td>NAD</td>
     <td>Weighting of conversion functions in direct-initialization</td>
-    <td class="full" align="center">Yes</td>
+    <td class="full" align="center">Clang 2.8</td>
   </tr>
   <tr id="244">
     <td><a href="https://cplusplus.github.io/CWG/issues/244.html">244</a></td>
@@ -1516,7 +1516,7 @@ accessible?</td>
     <td><a href="https://cplusplus.github.io/CWG/issues/245.html">245</a></td>
     <td>CD1</td>
     <td>Name lookup in <I>elaborated-type-specifier</I>s</td>
-    <td class="full" align="center">Yes</td>
+    <td class="full" align="center">Clang 2.8</td>
   </tr>
   <tr id="246">
     <td><a href="https://cplusplus.github.io/CWG/issues/246.html">246</a></td>
@@ -1528,7 +1528,7 @@ accessible?</td>
     <td><a href="https://cplusplus.github.io/CWG/issues/247.html">247</a></td>
     <td>NAD</td>
     <td>Pointer-to-member casts and function overload resolution</td>
-    <td class="full" align="center">Yes</td>
+    <td class="full" align="center">Clang 2.7</td>
   </tr>
   <tr id="248">
     <td><a href="https://cplusplus.github.io/CWG/issues/248.html">248</a></td>
@@ -1540,13 +1540,13 @@ accessible?</td>
     <td><a href="https://cplusplus.github.io/CWG/issues/249.html">249</a></td>
     <td>TC1</td>
     <td>What is a member function template?</td>
-    <td class="full" align="center">Yes</td>
+    <td class="full" align="center">Clang 2.7</td>
   </tr>
   <tr id="250">
     <td><a href="https://cplusplus.github.io/CWG/issues/250.html">250</a></td>
     <td>TC1</td>
     <td>Address of function template specialization with non-deduced template arguments</td>
-    <td class="full" align="center">Yes</td>
+    <td class="full" align="center">Clang 2.7</td>
   </tr>
   <tr class="open" id="251">
     <td><a href="https://cplusplus.github.io/CWG/issues/251.html">251</a></td>
@@ -1576,7 +1576,7 @@ accessible?</td>
     <td><a href="https://cplusplus.github.io/CWG/issues/255.html">255</a></td>
     <td>CD6</td>
     <td>Placement deallocation functions and lookup ambiguity</td>
-    <td class="full" align="center">Yes</td>
+    <td class="full" align="center">Clang 2.7</td>
   </tr>
   <tr id="256">
     <td><a href="https://cplusplus.github.io/CWG/issues/256.html">256</a></td>
@@ -1618,7 +1618,7 @@ accessible?</td>
     <td><a href="https://cplusplus.github.io/CWG/issues/262.html">262</a></td>
     <td>CD1</td>
     <td>Default arguments and ellipsis</td>
-    <td class="full" align="center">Yes</td>
+    <td class="full" align="center">Clang 2.7</td>
   </tr>
   <tr id="263">
     <td><a href="https://cplusplus.github.io/CWG/issues/263.html">263</a></td>
@@ -1679,13 +1679,13 @@ of class templates</td>
     <td><a href="https://cplusplus.github.io/CWG/issues/272.html">272</a></td>
     <td>CD1</td>
     <td>Explicit destructor invocation and <I>qualified-id</I>s</td>
-    <td class="full" align="center">Yes</td>
+    <td class="full" align="center">Clang 2.7</td>
   </tr>
   <tr id="273">
     <td><a href="https://cplusplus.github.io/CWG/issues/273.html">273</a></td>
     <td>CD1</td>
     <td>POD classes and <TT>operator&amp;()</TT></td>
-    <td class="full" align="center">Yes</td>
+    <td class="full" align="center">Clang 2.7</td>
   </tr>
   <tr id="274">
     <td><a href="https://cplusplus.github.io/CWG/issues/274.html">274</a></td>
@@ -1745,7 +1745,7 @@ of class templates</td>
     <td><a href="https://cplusplus.github.io/CWG/issues/283.html">283</a></td>
     <td>CD1</td>
     <td>Template <I>type-parameter</I>s are not syntactically <I>type-name</I>s</td>
-    <td class="full" align="center">Yes</td>
+    <td class="full" align="center">Clang 2.7</td>
   </tr>
   <tr id="284">
     <td><a href="https://cplusplus.github.io/CWG/issues/284.html">284</a></td>
@@ -1757,7 +1757,7 @@ of class templates</td>
     <td><a href="https://cplusplus.github.io/CWG/issues/285.html">285</a></td>
     <td>NAD</td>
     <td>Identifying a function template being specialized</td>
-    <td class="full" align="center">Yes</td>
+    <td class="full" align="center">Clang 2.7</td>
   </tr>
   <tr id="286">
     <td><a href="https://cplusplus.github.io/CWG/issues/286.html">286</a></td>
@@ -1781,7 +1781,7 @@ of class templates</td>
     <td><a href="https://cplusplus.github.io/CWG/issues/289.html">289</a></td>
     <td>CD1</td>
     <td>Incomplete list of contexts requiring a complete type</td>
-    <td class="full" align="center">Yes</td>
+    <td class="full" align="center">Clang 2.7</td>
   </tr>
   <tr id="290">
     <td><a href="https://cplusplus.github.io/CWG/issues/290.html">290</a></td>
@@ -1823,7 +1823,7 @@ of class templates</td>
     <td><a href="https://cplusplus.github.io/CWG/issues/296.html">296</a></td>
     <td>CD1</td>
     <td>Can conversion functions be static?</td>
-    <td class="full" align="center">Yes</td>
+    <td class="full" align="center">Clang 2.7</td>
   </tr>
   <tr id="297">
     <td><a href="https://cplusplus.github.io/CWG/issues/297.html">297</a></td>
@@ -1847,7 +1847,7 @@ of class templates</td>
     <td><a href="https://cplusplus.github.io/CWG/issues/300.html">300</a></td>
     <td>CD1</td>
     <td>References to functions in template argument deduction</td>
-    <td class="full" align="center">Yes</td>
+    <td class="full" align="center">Clang 2.7</td>
   </tr>
   <tr id="301">
     <td><a href="https://cplusplus.github.io/CWG/issues/301.html">301</a></td>
@@ -1967,7 +1967,7 @@ of class templates</td>
     <td><a href="https://cplusplus.github.io/CWG/issues/320.html">320</a></td>
     <td>CD1</td>
     <td>Question on copy constructor elision example</td>
-    <td class="full" align="center">Yes</td>
+    <td class="full" align="center">Clang 3.1</td>
   </tr>
   <tr id="321">
     <td><a href="https://cplusplus.github.io/CWG/issues/321.html">321</a></td>
@@ -2015,7 +2015,7 @@ of class templates</td>
     <td><a href="https://cplusplus.github.io/CWG/issues/328.html">328</a></td>
     <td>CD1</td>
     <td>Missing requirement that class member types be complete</td>
-    <td class="full" align="center">Yes</td>
+    <td class="full" align="center">Clang 2.7</td>
   </tr>
   <tr id="329">
     <td><a href="https://cplusplus.github.io/CWG/issues/329.html">329</a></td>
@@ -2045,13 +2045,13 @@ of class templates</td>
     <td><a href="https://cplusplus.github.io/CWG/issues/333.html">333</a></td>
     <td>NAD</td>
     <td>Ambiguous use of "declaration" in disambiguation section</td>
-    <td class="full" align="center">Yes</td>
+    <td class="full" align="center">Clang 2.7</td>
   </tr>
   <tr id="334">
     <td><a href="https://cplusplus.github.io/CWG/issues/334.html">334</a></td>
     <td>NAD</td>
     <td>Is a comma-expression dependent if its first operand is?</td>
-    <td class="full" align="center">Yes</td>
+    <td class="full" align="center">Clang 2.7</td>
   </tr>
   <tr id="335">
     <td><a href="https://cplusplus.github.io/CWG/issues/335.html">335</a></td>
@@ -2063,13 +2063,13 @@ of class templates</td>
     <td><a href="https://cplusplus.github.io/CWG/issues/336.html">336</a></td>
     <td>CD1</td>
     <td>Explicit specialization examples are still incorrect</td>
-    <td class="full" align="center">Yes</td>
+    <td class="full" align="center">Clang 2.7</td>
   </tr>
   <tr id="337">
     <td><a href="https://cplusplus.github.io/CWG/issues/337.html">337</a></td>
     <td>CD1</td>
     <td>Attempt to create array of abtract type should cause deduction to fail</td>
-    <td class="full" align="center">Yes</td>
+    <td class="full" align="center">Clang 2.7</td>
   </tr>
   <tr id="338">
     <td><a href="https://cplusplus.github.io/CWG/issues/338.html">338</a></td>
@@ -2087,7 +2087,7 @@ of class templates</td>
     <td><a href="https://cplusplus.github.io/CWG/issues/340.html">340</a></td>
     <td>NAD</td>
     <td>Unclear wording in disambiguation section</td>
-    <td class="full" align="center">Yes</td>
+    <td class="full" align="center">Clang 2.7</td>
   </tr>
   <tr id="341">
     <td><a href="https://cplusplus.github.io/CWG/issues/341.html">341</a></td>
@@ -2117,7 +2117,7 @@ of class templates</td>
     <td><a href="https://cplusplus.github.io/CWG/issues/345.html">345</a></td>
     <td>CD1</td>
     <td>Misleading comment on example in templates chapter</td>
-    <td class="full" align="center">Yes</td>
+    <td class="full" align="center">Clang 2.7</td>
   </tr>
   <tr id="346">
     <td><a href="https://cplusplus.github.io/CWG/issues/346.html">346</a></td>
@@ -2129,7 +2129,7 @@ of class templates</td>
     <td><a href="https://cplusplus.github.io/CWG/issues/347.html">347</a></td>
     <td>NAD</td>
     <td>Use of derived class name in defining base class nested class</td>
-    <td class="full" align="center">Yes</td>
+    <td class="full" align="center">Clang 2.7</td>
   </tr>
   <tr id="348">
     <td><a href="https://cplusplus.github.io/CWG/issues/348.html">348</a></td>
@@ -2171,13 +2171,13 @@ of class templates</td>
     <td><a href="https://cplusplus.github.io/CWG/issues/354.html">354</a></td>
     <td>CD1</td>
     <td>Null as nontype template argument</td>
-    <td class="full" align="center">Yes (C++11 onwards)</td>
+    <td class="full" align="center">Clang 3.1 (C++11 onwards)</td>
   </tr>
   <tr id="355">
     <td><a href="https://cplusplus.github.io/CWG/issues/355.html">355</a></td>
     <td>C++11</td>
     <td>Global-scope <TT>::</TT> in <I>nested-name-specifier</I></td>
-    <td class="full" align="center">Yes</td>
+    <td class="full" align="center">Clang 2.7</td>
   </tr>
   <tr id="356">
     <td><a href="https://cplusplus.github.io/CWG/issues/356.html">356</a></td>
@@ -2189,25 +2189,25 @@ of class templates</td>
     <td><a href="https://cplusplus.github.io/CWG/issues/357.html">357</a></td>
     <td>CD1</td>
     <td>Definition of signature should include name</td>
-    <td class="full" align="center">Yes</td>
+    <td class="full" align="center">Clang 2.7</td>
   </tr>
   <tr id="358">
     <td><a href="https://cplusplus.github.io/CWG/issues/358.html">358</a></td>
     <td>NAD</td>
     <td>Namespaces and extern "C"</td>
-    <td class="full" align="center">Yes</td>
+    <td class="full" align="center">Clang 2.7</td>
   </tr>
   <tr id="359">
     <td><a href="https://cplusplus.github.io/CWG/issues/359.html">359</a></td>
     <td>NAD</td>
     <td>Type definition in anonymous union</td>
-    <td class="full" align="center">Yes</td>
+    <td class="full" align="center">Clang 3.3</td>
   </tr>
   <tr id="360">
     <td><a href="https://cplusplus.github.io/CWG/issues/360.html">360</a></td>
     <td>CD6</td>
     <td>Using-declaration that reduces access</td>
-    <td class="full" align="center">Yes</td>
+    <td class="full" align="center">Clang 2.8</td>
   </tr>
   <tr class="open" id="361">
     <td><a href="https://cplusplus.github.io/CWG/issues/361.html">361</a></td>
@@ -2231,7 +2231,7 @@ of class templates</td>
     <td><a href="https://cplusplus.github.io/CWG/issues/364.html">364</a></td>
     <td>CD1</td>
     <td>Calling overloaded function with static in set, with no object</td>
-    <td class="full" align="center">Yes</td>
+    <td class="full" align="center">Clang 2.7</td>
   </tr>
   <tr class="open" id="365">
     <td><a href="https://cplusplus.github.io/CWG/issues/365.html">365</a></td>
@@ -2243,13 +2243,13 @@ of class templates</td>
     <td><a href="https://cplusplus.github.io/CWG/issues/366.html">366</a></td>
     <td>CD1</td>
     <td>String literal allowed in integral constant expression?</td>
-    <td class="full" align="center">Yes</td>
+    <td class="full" align="center">Clang 2.7</td>
   </tr>
   <tr id="367">
     <td><a href="https://cplusplus.github.io/CWG/issues/367.html">367</a></td>
     <td>CD1</td>
     <td><TT>throw</TT> operator allowed in constant expression?</td>
-    <td class="full" align="center">Yes</td>
+    <td class="full" align="center">Clang 2.7</td>
   </tr>
   <tr id="368">
     <td><a href="https://cplusplus.github.io/CWG/issues/368.html">368</a></td>
@@ -2309,7 +2309,7 @@ of class templates</td>
     <td><a href="https://cplusplus.github.io/CWG/issues/377.html">377</a></td>
     <td>CD1</td>
     <td>Enum whose enumerators will not fit in any integral type</td>
-    <td class="full" align="center">Yes</td>
+    <td class="full" align="center">Clang 2.7</td>
   </tr>
   <tr id="378">
     <td><a href="https://cplusplus.github.io/CWG/issues/378.html">378</a></td>
@@ -2333,25 +2333,25 @@ of class templates</td>
     <td><a href="https://cplusplus.github.io/CWG/issues/381.html">381</a></td>
     <td>CD1</td>
     <td>Incorrect example of base class member lookup</td>
-    <td class="full" align="center">Yes</td>
+    <td class="full" align="center">Clang 2.7</td>
   </tr>
   <tr id="382">
     <td><a href="https://cplusplus.github.io/CWG/issues/382.html">382</a></td>
     <td>CD1</td>
     <td>Allow <TT>typename</TT> outside of templates</td>
-    <td class="full" align="center">Yes (C++11 onwards)</td>
+    <td class="full" align="center">Clang 2.7 (C++11 onwards)</td>
   </tr>
   <tr id="383">
     <td><a href="https://cplusplus.github.io/CWG/issues/383.html">383</a></td>
     <td>CD1</td>
     <td>Is a class with a declared but not defined destructor a POD?</td>
-    <td class="full" align="center">Yes</td>
+    <td class="full" align="center">Clang 2.7</td>
   </tr>
   <tr id="384">
     <td><a href="https://cplusplus.github.io/CWG/issues/384.html">384</a></td>
     <td>NAD</td>
     <td>Argument-dependent lookup and operator functions</td>
-    <td class="full" align="center">Yes</td>
+    <td class="full" align="center">Clang 2.7</td>
   </tr>
   <tr id="385">
     <td><a href="https://cplusplus.github.io/CWG/issues/385.html">385</a></td>
@@ -2423,7 +2423,7 @@ of class templates</td>
     <td><a href="https://cplusplus.github.io/CWG/issues/396.html">396</a></td>
     <td>CD1</td>
     <td>Misleading note regarding use of <TT>auto</TT> for disambiguation</td>
-    <td class="full" align="center">Yes</td>
+    <td class="full" align="center">Clang 3.0</td>
   </tr>
   <tr id="397">
     <td><a href="https://cplusplus.github.io/CWG/issues/397.html">397</a></td>
@@ -2435,7 +2435,7 @@ of class templates</td>
     <td><a href="https://cplusplus.github.io/CWG/issues/398.html">398</a></td>
     <td>CD1</td>
     <td>Ambiguous wording on naming a type in deduction</td>
-    <td class="full" align="center">Yes</td>
+    <td class="full" align="center">Clang 2.7</td>
   </tr>
   <tr id="399">
     <td><a href="https://cplusplus.github.io/CWG/issues/399.html">399</a></td>
@@ -2447,7 +2447,7 @@ of class templates</td>
     <td><a href="https://cplusplus.github.io/CWG/issues/400.html">400</a></td>
     <td>CD1</td>
     <td>Using-declarations and the "struct hack"</td>
-    <td class="full" align="center">Yes</td>
+    <td class="full" align="center">Clang 2.7</td>
   </tr>
   <tr id="401">
     <td><a href="https://cplusplus.github.io/CWG/issues/401.html">401</a></td>
@@ -2465,7 +2465,7 @@ of class templates</td>
     <td><a href="https://cplusplus.github.io/CWG/issues/403.html">403</a></td>
     <td>CD1</td>
     <td>Reference to a type as a <I>template-id</I></td>
-    <td class="full" align="center">Yes</td>
+    <td class="full" align="center">Clang 2.7</td>
   </tr>
   <tr id="404">
     <td><a href="https://cplusplus.github.io/CWG/issues/404.html">404</a></td>
@@ -2477,7 +2477,7 @@ of class templates</td>
     <td><a href="https://cplusplus.github.io/CWG/issues/405.html">405</a></td>
     <td>CD6</td>
     <td>Unqualified function name lookup</td>
-    <td class="full" align="center">Yes</td>
+    <td class="full" align="center">Clang 2.7</td>
   </tr>
   <tr id="406">
     <td><a href="https://cplusplus.github.io/CWG/issues/406.html">406</a></td>
@@ -2501,7 +2501,7 @@ of class templates</td>
     <td><a href="https://cplusplus.github.io/CWG/issues/409.html">409</a></td>
     <td>CD1</td>
     <td>Obsolete paragraph missed by changes for issue 224</td>
-    <td class="full" align="center">Yes</td>
+    <td class="full" align="center">Clang 2.7</td>
   </tr>
   <tr id="410">
     <td><a href="https://cplusplus.github.io/CWG/issues/410.html">410</a></td>
@@ -2525,7 +2525,7 @@ of class templates</td>
     <td><a href="https://cplusplus.github.io/CWG/issues/413.html">413</a></td>
     <td>CD1</td>
     <td>Definition of "empty class"</td>
-    <td class="full" align="center">Yes</td>
+    <td class="full" align="center">Clang 2.7</td>
   </tr>
   <tr id="414">
     <td><a href="https://cplusplus.github.io/CWG/issues/414.html">414</a></td>
@@ -2537,13 +2537,13 @@ of class templates</td>
     <td><a href="https://cplusplus.github.io/CWG/issues/415.html">415</a></td>
     <td>CD1</td>
     <td>Template deduction does not cause instantiation</td>
-    <td class="full" align="center">Yes</td>
+    <td class="full" align="center">Clang 2.7</td>
   </tr>
   <tr id="416">
     <td><a href="https://cplusplus.github.io/CWG/issues/416.html">416</a></td>
     <td>CD1</td>
     <td>Class must be complete to allow operator lookup?</td>
-    <td class="full" align="center">Yes</td>
+    <td class="full" align="center">Clang 2.7</td>
   </tr>
   <tr id="417">
     <td><a href="https://cplusplus.github.io/CWG/issues/417.html">417</a></td>
@@ -2573,31 +2573,31 @@ of class templates</td>
     <td><a href="https://cplusplus.github.io/CWG/issues/421.html">421</a></td>
     <td>CD1</td>
     <td>Is rvalue.field an rvalue?</td>
-    <td class="full" align="center">Yes</td>
+    <td class="full" align="center">Clang 2.7</td>
   </tr>
   <tr id="422">
     <td><a href="https://cplusplus.github.io/CWG/issues/422.html">422</a></td>
     <td>NAD</td>
     <td>Is a typedef redeclaration allowed with a template type that might be the same?</td>
-    <td class="full" align="center">Yes</td>
+    <td class="full" align="center">Clang 2.7</td>
   </tr>
   <tr id="423">
     <td><a href="https://cplusplus.github.io/CWG/issues/423.html">423</a></td>
     <td>NAD</td>
     <td>Can a conversion be done on the left operand of a compound assignment?</td>
-    <td class="full" align="center">Yes</td>
+    <td class="full" align="center">Clang 2.7</td>
   </tr>
   <tr id="424">
     <td><a href="https://cplusplus.github.io/CWG/issues/424.html">424</a></td>
     <td>CD1</td>
     <td>Wording problem with issue 56 resolution on redeclaring typedefs in class scope</td>
-    <td class="full" align="center">Yes</td>
+    <td class="full" align="center">Clang 2.7</td>
   </tr>
   <tr id="425">
     <td><a href="https://cplusplus.github.io/CWG/issues/425.html">425</a></td>
     <td>CD1</td>
     <td>Set of candidates for overloaded built-in operator with float operand</td>
-    <td class="full" align="center">Yes</td>
+    <td class="full" align="center">Clang 2.7</td>
   </tr>
   <tr id="426">
     <td><a href="https://cplusplus.github.io/CWG/issues/426.html">426</a></td>
@@ -2609,13 +2609,13 @@ of class templates</td>
     <td><a href="https://cplusplus.github.io/CWG/issues/427.html">427</a></td>
     <td>CD1</td>
     <td><TT>static_cast</TT> ambiguity: conversion versus cast to derived</td>
-    <td class="full" align="center">Yes</td>
+    <td class="full" align="center">Clang 2.7</td>
   </tr>
   <tr id="428">
     <td><a href="https://cplusplus.github.io/CWG/issues/428.html">428</a></td>
     <td>CD1</td>
     <td>Mention of expression with reference type</td>
-    <td class="full" align="center">Yes</td>
+    <td class="full" align="center">Clang 2.7</td>
   </tr>
   <tr id="429">
     <td><a href="https://cplusplus.github.io/CWG/issues/429.html">429</a></td>
@@ -2627,13 +2627,13 @@ of class templates</td>
     <td><a href="https://cplusplus.github.io/CWG/issues/430.html">430</a></td>
     <td>CD1</td>
     <td>Ordering of expression evaluation in initializer list</td>
-    <td class="full" align="center">Yes (C++11 onwards)</td>
+    <td class="full" align="center">Clang 2.7 (C++11 onwards)</td>
   </tr>
   <tr id="431">
     <td><a href="https://cplusplus.github.io/CWG/issues/431.html">431</a></td>
     <td>C++11</td>
     <td>Defect in wording in 14.2</td>
-    <td class="full" align="center">Yes</td>
+    <td class="full" align="center">Clang 2.8</td>
   </tr>
   <tr id="432">
     <td><a href="https://cplusplus.github.io/CWG/issues/432.html">432</a></td>
@@ -2645,7 +2645,7 @@ of class templates</td>
     <td><a href="https://cplusplus.github.io/CWG/issues/433.html">433</a></td>
     <td>CD1</td>
     <td>Do elaborated type specifiers in templates inject into enclosing namespace scope?</td>
-    <td class="full" align="center">Yes</td>
+    <td class="full" align="center">Clang 2.7</td>
   </tr>
   <tr id="434">
     <td><a href="https://cplusplus.github.io/CWG/issues/434.html">434</a></td>
@@ -2663,7 +2663,7 @@ of class templates</td>
     <td><a href="https://cplusplus.github.io/CWG/issues/436.html">436</a></td>
     <td>CD1</td>
     <td>Problem in example in 9.6 paragraph 4</td>
-    <td class="full" align="center">Yes</td>
+    <td class="full" align="center">Clang 2.7</td>
   </tr>
   <tr id="437">
     <td><a href="https://cplusplus.github.io/CWG/issues/437.html">437</a></td>
@@ -2711,7 +2711,7 @@ of class templates</td>
     <td><a href="https://cplusplus.github.io/CWG/issues/444.html">444</a></td>
     <td>NAD</td>
     <td>Overriding and the generated copy assignment operator</td>
-    <td class="full" align="center">Yes</td>
+    <td class="full" align="center">Clang 2.7</td>
   </tr>
   <tr id="445">
     <td><a href="https://cplusplus.github.io/CWG/issues/445.html">445</a></td>
@@ -2729,7 +2729,7 @@ of class templates</td>
     <td><a href="https://cplusplus.github.io/CWG/issues/447.html">447</a></td>
     <td>CD1</td>
     <td>Is offsetof type-dependent?</td>
-    <td class="full" align="center">Yes</td>
+    <td class="full" align="center">Clang 2.8</td>
   </tr>
   <tr id="448">
     <td><a href="https://cplusplus.github.io/CWG/issues/448.html">448</a></td>
@@ -2747,19 +2747,19 @@ of class templates</td>
     <td><a href="https://cplusplus.github.io/CWG/issues/450.html">450</a></td>
     <td>CD1</td>
     <td>Binding a reference to const to a cv-qualified array rvalue</td>
-    <td class="full" align="center">Yes</td>
+    <td class="full" align="center">Clang 3.2</td>
   </tr>
   <tr id="451">
     <td><a href="https://cplusplus.github.io/CWG/issues/451.html">451</a></td>
     <td>CD1</td>
     <td>Expressions with invalid results and ill-formedness</td>
-    <td class="full" align="center">Yes</td>
+    <td class="full" align="center">Clang 2.7</td>
   </tr>
   <tr id="452">
     <td><a href="https://cplusplus.github.io/CWG/issues/452.html">452</a></td>
     <td>CD1</td>
     <td>Wording nit on description of <TT>this</TT></td>
-    <td class="full" align="center">Yes</td>
+    <td class="full" align="center">Clang 2.7</td>
   </tr>
   <tr id="453">
     <td><a href="https://cplusplus.github.io/CWG/issues/453.html">453</a></td>
@@ -2783,13 +2783,13 @@ of class templates</td>
     <td><a href="https://cplusplus.github.io/CWG/issues/456.html">456</a></td>
     <td>NAD</td>
     <td>Is initialized const int or const bool variable a null pointer constant?</td>
-    <td class="full" align="center">Yes</td>
+    <td class="full" align="center">Clang 3.4</td>
   </tr>
   <tr id="457">
     <td><a href="https://cplusplus.github.io/CWG/issues/457.html">457</a></td>
     <td>CD1</td>
     <td>Wording nit on use of const variables in constant expressions</td>
-    <td class="full" align="center">Yes</td>
+    <td class="full" align="center">Clang 2.7</td>
   </tr>
   <tr id="458">
     <td><a href="https://cplusplus.github.io/CWG/issues/458.html">458</a></td>
@@ -2807,7 +2807,7 @@ of class templates</td>
     <td><a href="https://cplusplus.github.io/CWG/issues/460.html">460</a></td>
     <td>CD1</td>
     <td>Can a <I>using-declaration</I> name a namespace?</td>
-    <td class="full" align="center">Yes</td>
+    <td class="full" align="center">Clang 2.7</td>
   </tr>
   <tr id="461">
     <td><a href="https://cplusplus.github.io/CWG/issues/461.html">461</a></td>
@@ -2849,13 +2849,13 @@ of class templates</td>
     <td><a href="https://cplusplus.github.io/CWG/issues/467.html">467</a></td>
     <td>NAD</td>
     <td>Jump past initialization of local static variable</td>
-    <td class="full" align="center">Yes</td>
+    <td class="full" align="center">Clang 2.7</td>
   </tr>
   <tr id="468">
     <td><a href="https://cplusplus.github.io/CWG/issues/468.html">468</a></td>
     <td>CD1</td>
     <td>Allow <TT>::template</TT> outside of templates</td>
-    <td class="full" align="center">Yes (C++11 onwards)</td>
+    <td class="full" align="center">Clang 2.7 (C++11 onwards)</td>
   </tr>
   <tr id="469">
     <td><a href="https://cplusplus.github.io/CWG/issues/469.html">469</a></td>
@@ -2867,7 +2867,7 @@ of class templates</td>
     <td><a href="https://cplusplus.github.io/CWG/issues/470.html">470</a></td>
     <td>CD1</td>
     <td>Instantiation of members of an explicitly-instantiated class template</td>
-    <td class="full" align="center">Yes</td>
+    <td class="full" align="center">Clang 2.7</td>
   </tr>
   <tr id="471">
     <td><a href="https://cplusplus.github.io/CWG/issues/471.html">471</a></td>
@@ -2919,7 +2919,7 @@ of class templates</td>
     <td><a href="https://cplusplus.github.io/CWG/issues/478.html">478</a></td>
     <td>NAD</td>
     <td>May a function parameter be an array of an abstract class type?</td>
-    <td class="full" align="center">Yes</td>
+    <td class="full" align="center">Clang 2.7</td>
   </tr>
   <tr id="479">
     <td><a href="https://cplusplus.github.io/CWG/issues/479.html">479</a></td>
@@ -2931,7 +2931,7 @@ of class templates</td>
     <td><a href="https://cplusplus.github.io/CWG/issues/480.html">480</a></td>
     <td>CD1</td>
     <td>Is a base of a virtual base also virtual?</td>
-    <td class="full" align="center">Yes</td>
+    <td class="full" align="center">Clang 2.7</td>
   </tr>
   <tr id="481">
     <td><a href="https://cplusplus.github.io/CWG/issues/481.html">481</a></td>
@@ -2949,37 +2949,37 @@ of class templates</td>
     <td><a href="https://cplusplus.github.io/CWG/issues/483.html">483</a></td>
     <td>CD3</td>
     <td>Normative requirements on integral ranges</td>
-    <td class="full" align="center">Yes</td>
+    <td class="full" align="center">Clang 2.7</td>
   </tr>
   <tr id="484">
     <td><a href="https://cplusplus.github.io/CWG/issues/484.html">484</a></td>
     <td>CD1</td>
     <td>Can a <I>base-specifier</I> name a cv-qualified class type?</td>
-    <td class="full" align="center">Yes</td>
+    <td class="full" align="center">Clang 2.8</td>
   </tr>
   <tr id="485">
     <td><a href="https://cplusplus.github.io/CWG/issues/485.html">485</a></td>
     <td>CD1</td>
     <td>What is a &#8220;name&#8221;?</td>
-    <td class="full" align="center">Yes</td>
+    <td class="full" align="center">Clang 2.7</td>
   </tr>
   <tr id="486">
     <td><a href="https://cplusplus.github.io/CWG/issues/486.html">486</a></td>
     <td>CD1</td>
     <td>Invalid return types and template argument deduction</td>
-    <td class="full" align="center">Yes</td>
+    <td class="full" align="center">Clang 2.7</td>
   </tr>
   <tr id="487">
     <td><a href="https://cplusplus.github.io/CWG/issues/487.html">487</a></td>
     <td>NAD</td>
     <td>Operator overloading in constant expressions</td>
-    <td class="full" align="center">Yes</td>
+    <td class="full" align="center">Clang 2.7</td>
   </tr>
   <tr id="488">
     <td><a href="https://cplusplus.github.io/CWG/issues/488.html">488</a></td>
     <td>CD1</td>
     <td>Local types, overload resolution, and template argument deduction</td>
-    <td class="full" align="center">Yes (C++11 onwards)</td>
+    <td class="full" align="center">Clang 2.9 (C++11 onwards)</td>
   </tr>
   <tr id="489">
     <td><a href="https://cplusplus.github.io/CWG/issues/489.html">489</a></td>
@@ -3045,7 +3045,7 @@ of class templates</td>
     <td><a href="https://cplusplus.github.io/CWG/issues/499.html">499</a></td>
     <td>CD2</td>
     <td>Throwing an array of unknown size</td>
-    <td class="full" align="center">Yes</td>
+    <td class="full" align="center">Clang 2.7</td>
   </tr>
   <tr id="500">
     <td><a href="https://cplusplus.github.io/CWG/issues/500.html">500</a></td>
@@ -3057,13 +3057,13 @@ of class templates</td>
     <td><a href="https://cplusplus.github.io/CWG/issues/501.html">501</a></td>
     <td>NAD</td>
     <td>Visibility of friend declarations within the befriending class</td>
-    <td class="full" align="center">Yes</td>
+    <td class="full" align="center">Clang 2.7</td>
   </tr>
   <tr id="502">
     <td><a href="https://cplusplus.github.io/CWG/issues/502.html">502</a></td>
     <td>C++11</td>
     <td>Dependency of nested enumerations and enumerators</td>
-    <td class="full" align="center">Yes</td>
+    <td class="full" align="center">Clang 2.7</td>
   </tr>
   <tr class="open" id="503">
     <td><a href="https://cplusplus.github.io/CWG/issues/503.html">503</a></td>
@@ -3081,13 +3081,13 @@ of class templates</td>
     <td><a href="https://cplusplus.github.io/CWG/issues/505.html">505</a></td>
     <td>CD1</td>
     <td>Conditionally-supported behavior for unknown character escapes</td>
-    <td class="full" align="center">Yes</td>
+    <td class="full" align="center">Clang 2.7</td>
   </tr>
   <tr id="506">
     <td><a href="https://cplusplus.github.io/CWG/issues/506.html">506</a></td>
     <td>CD1</td>
     <td>Conditionally-supported behavior for non-POD objects passed to ellipsis</td>
-    <td class="full" align="center">Yes</td>
+    <td class="full" align="center">Clang 2.7</td>
   </tr>
   <tr id="507">
     <td><a href="https://cplusplus.github.io/CWG/issues/507.html">507</a></td>
@@ -3123,7 +3123,7 @@ of class templates</td>
     <td><a href="https://cplusplus.github.io/CWG/issues/512.html">512</a></td>
     <td>NAD</td>
     <td>Union members with user-declared non-default constructors</td>
-    <td class="full" align="center">Yes</td>
+    <td class="full" align="center">Clang 3.0</td>
   </tr>
   <tr id="513">
     <td><a href="https://cplusplus.github.io/CWG/issues/513.html">513</a></td>
@@ -3135,7 +3135,7 @@ of class templates</td>
     <td><a href="https://cplusplus.github.io/CWG/issues/514.html">514</a></td>
     <td>CD1</td>
     <td>Is the initializer for a namespace member in the scope of the namespace?</td>
-    <td class="full" align="center">Yes</td>
+    <td class="full" align="center">Clang 2.7</td>
   </tr>
   <tr id="515">
     <td><a href="https://cplusplus.github.io/CWG/issues/515.html">515</a></td>
@@ -3159,7 +3159,7 @@ of class templates</td>
     <td><a href="https://cplusplus.github.io/CWG/issues/518.html">518</a></td>
     <td>CD1</td>
     <td>Trailing comma following <I>enumerator-list</I></td>
-    <td class="full" align="center">Yes (C++11 onwards)</td>
+    <td class="full" align="center">Clang 2.7 (C++11 onwards)</td>
   </tr>
   <tr id="519">
     <td><a href="https://cplusplus.github.io/CWG/issues/519.html">519</a></td>
@@ -3183,7 +3183,7 @@ of class templates</td>
     <td><a href="https://cplusplus.github.io/CWG/issues/522.html">522</a></td>
     <td>CD1</td>
     <td>Array-to-pointer decay in template argument deduction</td>
-    <td class="full" align="center">Yes</td>
+    <td class="full" align="center">Clang 2.7</td>
   </tr>
   <tr class="open" id="523">
     <td><a href="https://cplusplus.github.io/CWG/issues/523.html">523</a></td>
@@ -3195,19 +3195,19 @@ of class templates</td>
     <td><a href="https://cplusplus.github.io/CWG/issues/524.html">524</a></td>
     <td>CD1</td>
     <td>Can function-notation calls to operator functions be dependent?</td>
-    <td class="full" align="center">Yes</td>
+    <td class="full" align="center">Clang 2.7</td>
   </tr>
   <tr id="525">
     <td><a href="https://cplusplus.github.io/CWG/issues/525.html">525</a></td>
     <td>CD1</td>
     <td>Missing <TT>*</TT> in example</td>
-    <td class="full" align="center">Yes</td>
+    <td class="full" align="center">Clang 2.7</td>
   </tr>
   <tr id="526">
     <td><a href="https://cplusplus.github.io/CWG/issues/526.html">526</a></td>
     <td>CD1</td>
     <td>Confusing aspects in the specification of non-deduced contexts</td>
-    <td class="full" align="center">Yes</td>
+    <td class="full" align="center">Clang 2.7</td>
   </tr>
   <tr id="527">
     <td><a href="https://cplusplus.github.io/CWG/issues/527.html">527</a></td>
@@ -3231,7 +3231,7 @@ of class templates</td>
     <td><a href="https://cplusplus.github.io/CWG/issues/530.html">530</a></td>
     <td>CD1</td>
     <td>Nontype template arguments in constant expressions</td>
-    <td class="full" align="center">Yes</td>
+    <td class="full" align="center">Clang 2.7</td>
   </tr>
   <tr id="531">
     <td><a href="https://cplusplus.github.io/CWG/issues/531.html">531</a></td>
@@ -3261,7 +3261,7 @@ of class templates</td>
     <td><a href="https://cplusplus.github.io/CWG/issues/535.html">535</a></td>
     <td>CD3</td>
     <td>Copy construction without a copy constructor</td>
-    <td class="full" align="center">Yes</td>
+    <td class="full" align="center">Clang 3.1</td>
   </tr>
   <tr id="536">
     <td><a href="https://cplusplus.github.io/CWG/issues/536.html">536</a></td>
@@ -3287,25 +3287,25 @@ and <I>POD class</I></td>
     <td><a href="https://cplusplus.github.io/CWG/issues/539.html">539</a></td>
     <td>CD3</td>
     <td>Constraints on <I>type-specifier-seq</I></td>
-    <td class="full" align="center">Yes</td>
+    <td class="full" align="center">Clang 3.4</td>
   </tr>
   <tr id="540">
     <td><a href="https://cplusplus.github.io/CWG/issues/540.html">540</a></td>
     <td>CD1</td>
     <td>Propagation of cv-qualifiers in reference-to-reference collapse</td>
-    <td class="full" align="center">Yes</td>
+    <td class="full" align="center">Clang 2.7</td>
   </tr>
   <tr id="541">
     <td><a href="https://cplusplus.github.io/CWG/issues/541.html">541</a></td>
     <td>CD2</td>
     <td>Dependent function types</td>
-    <td class="full" align="center">Yes</td>
+    <td class="full" align="center">Clang 2.7</td>
   </tr>
   <tr id="542">
     <td><a href="https://cplusplus.github.io/CWG/issues/542.html">542</a></td>
     <td>CD2</td>
     <td>Value initialization of arrays of POD-structs</td>
-    <td class="full" align="center">Yes</td>
+    <td class="full" align="center">Clang 3.5</td>
   </tr>
   <tr id="543">
     <td><a href="https://cplusplus.github.io/CWG/issues/543.html">543</a></td>
@@ -3317,7 +3317,7 @@ and <I>POD class</I></td>
     <td><a href="https://cplusplus.github.io/CWG/issues/544.html">544</a></td>
     <td>NAD</td>
     <td>Base class lookup in explicit specialization</td>
-    <td class="full" align="center">Yes</td>
+    <td class="full" align="center">Clang 2.7</td>
   </tr>
   <tr class="open" id="545">
     <td><a href="https://cplusplus.github.io/CWG/issues/545.html">545</a></td>
@@ -3329,7 +3329,7 @@ and <I>POD class</I></td>
     <td><a href="https://cplusplus.github.io/CWG/issues/546.html">546</a></td>
     <td>C++11</td>
     <td>Explicit instantiation of class template members</td>
-    <td class="full" align="center">Yes</td>
+    <td class="full" align="center">Clang 2.7</td>
   </tr>
   <tr id="547">
     <td><a href="https://cplusplus.github.io/CWG/issues/547.html">547</a></td>
@@ -3359,13 +3359,13 @@ and <I>POD class</I></td>
     <td><a href="https://cplusplus.github.io/CWG/issues/551.html">551</a></td>
     <td>CD1</td>
     <td>When is <TT>inline</TT> permitted in an explicit instantiation?</td>
-    <td class="full" align="center">Yes (C++11 onwards)</td>
+    <td class="full" align="center">Clang 2.7 (C++11 onwards)</td>
   </tr>
   <tr id="552">
     <td><a href="https://cplusplus.github.io/CWG/issues/552.html">552</a></td>
     <td>NAD</td>
     <td>Use of <TT>typename</TT> in the type in a non-type <I>parameter-declaration</I></td>
-    <td class="full" align="center">Yes</td>
+    <td class="full" align="center">Clang 2.7</td>
   </tr>
   <tr id="553">
     <td><a href="https://cplusplus.github.io/CWG/issues/553.html">553</a></td>
@@ -3407,7 +3407,7 @@ and <I>POD class</I></td>
     <td><a href="https://cplusplus.github.io/CWG/issues/559.html">559</a></td>
     <td>CD1</td>
     <td>Editing error in issue 382 resolution</td>
-    <td class="full" align="center">Yes</td>
+    <td class="full" align="center">Clang 2.7</td>
   </tr>
   <tr id="560">
     <td><a href="https://cplusplus.github.io/CWG/issues/560.html">560</a></td>
@@ -3419,7 +3419,7 @@ and <I>POD class</I></td>
     <td><a href="https://cplusplus.github.io/CWG/issues/561.html">561</a></td>
     <td>CD2</td>
     <td>Internal linkage functions in dependent name lookup</td>
-    <td class="full" align="center">Yes</td>
+    <td class="full" align="center">Clang 2.7</td>
   </tr>
   <tr id="562">
     <td><a href="https://cplusplus.github.io/CWG/issues/562.html">562</a></td>
@@ -3437,19 +3437,19 @@ and <I>POD class</I></td>
     <td><a href="https://cplusplus.github.io/CWG/issues/564.html">564</a></td>
     <td>CD2</td>
     <td>Agreement of language linkage or <I>linkage-specification</I>s?</td>
-    <td class="full" align="center">Yes</td>
+    <td class="full" align="center">Clang 2.7</td>
   </tr>
   <tr id="565">
     <td><a href="https://cplusplus.github.io/CWG/issues/565.html">565</a></td>
     <td>CD3</td>
     <td>Conflict rules for <I>using-declaration</I>s naming function templates</td>
-    <td class="full" align="center">Yes</td>
+    <td class="full" align="center">Clang 2.7</td>
   </tr>
   <tr id="566">
     <td><a href="https://cplusplus.github.io/CWG/issues/566.html">566</a></td>
     <td>NAD</td>
     <td>Conversion of negative floating point values to integer type</td>
-    <td class="full" align="center">Yes</td>
+    <td class="full" align="center">Clang 3.1</td>
   </tr>
   <tr id="567">
     <td><a href="https://cplusplus.github.io/CWG/issues/567.html">567</a></td>
@@ -3467,7 +3467,7 @@ and <I>POD class</I></td>
     <td><a href="https://cplusplus.github.io/CWG/issues/569.html">569</a></td>
     <td>CD2</td>
     <td>Spurious semicolons at namespace scope should be allowed</td>
-    <td class="full" align="center">Yes (C++11 onwards)</td>
+    <td class="full" align="center">Clang 2.7 (C++11 onwards)</td>
   </tr>
   <tr id="570">
     <td><a href="https://cplusplus.github.io/CWG/issues/570.html">570</a></td>
@@ -3485,7 +3485,7 @@ and <I>POD class</I></td>
     <td><a href="https://cplusplus.github.io/CWG/issues/572.html">572</a></td>
     <td>C++11</td>
     <td>Standard conversions for non-built-in types</td>
-    <td class="full" align="center">Yes</td>
+    <td class="full" align="center">Clang 2.7</td>
   </tr>
   <tr id="573">
     <td><a href="https://cplusplus.github.io/CWG/issues/573.html">573</a></td>
@@ -3503,7 +3503,7 @@ and <I>POD class</I></td>
     <td><a href="https://cplusplus.github.io/CWG/issues/575.html">575</a></td>
     <td>C++11</td>
     <td>Criteria for deduction failure</td>
-    <td class="full" align="center">Yes</td>
+    <td class="full" align="center">Clang 2.7</td>
   </tr>
   <tr id="576">
     <td><a href="https://cplusplus.github.io/CWG/issues/576.html">576</a></td>
@@ -3581,19 +3581,19 @@ and <I>POD class</I></td>
     <td><a href="https://cplusplus.github.io/CWG/issues/588.html">588</a></td>
     <td>CD2</td>
     <td>Searching dependent bases of classes local to function templates</td>
-    <td class="full" align="center">Yes</td>
+    <td class="full" align="center">Clang 2.7</td>
   </tr>
   <tr id="589">
     <td><a href="https://cplusplus.github.io/CWG/issues/589.html">589</a></td>
     <td>CD2</td>
     <td>Direct binding of class and array rvalues in reference initialization</td>
-    <td class="full" align="center">Yes</td>
+    <td class="full" align="center">Clang 2.7</td>
   </tr>
   <tr id="590">
     <td><a href="https://cplusplus.github.io/CWG/issues/590.html">590</a></td>
     <td>C++11</td>
     <td>Nested classes and the &#8220;current instantiation&#8221;</td>
-    <td class="full" align="center">Yes</td>
+    <td class="full" align="center">Clang 2.7</td>
   </tr>
   <tr id="591">
     <td><a href="https://cplusplus.github.io/CWG/issues/591.html">591</a></td>
@@ -3641,7 +3641,7 @@ and <I>POD class</I></td>
     <td><a href="https://cplusplus.github.io/CWG/issues/598.html">598</a></td>
     <td>CD2</td>
     <td>Associated namespaces of overloaded functions and function templates</td>
-    <td class="full" align="center">Yes</td>
+    <td class="full" align="center">Clang 2.7</td>
   </tr>
   <tr id="599">
     <td><a href="https://cplusplus.github.io/CWG/issues/599.html">599</a></td>
@@ -3659,19 +3659,19 @@ and <I>POD class</I></td>
     <td><a href="https://cplusplus.github.io/CWG/issues/601.html">601</a></td>
     <td>CD2</td>
     <td>Type of literals in preprocessing expressions</td>
-    <td class="full" align="center">Yes</td>
+    <td class="full" align="center">Clang 2.7</td>
   </tr>
   <tr id="602">
     <td><a href="https://cplusplus.github.io/CWG/issues/602.html">602</a></td>
     <td>C++11</td>
     <td>When is the injected-class-name of a class template a template?</td>
-    <td class="full" align="center">Yes</td>
+    <td class="full" align="center">Clang 2.7</td>
   </tr>
   <tr id="603">
     <td><a href="https://cplusplus.github.io/CWG/issues/603.html">603</a></td>
     <td>CD1</td>
     <td>Type equivalence and unsigned overflow</td>
-    <td class="full" align="center">Yes</td>
+    <td class="full" align="center">Clang 3.1</td>
   </tr>
   <tr id="604">
     <td><a href="https://cplusplus.github.io/CWG/issues/604.html">604</a></td>
@@ -3695,13 +3695,13 @@ and <I>POD class</I></td>
     <td><a href="https://cplusplus.github.io/CWG/issues/607.html">607</a></td>
     <td>CD6</td>
     <td>Lookup of <I>mem-initializer-id</I>s</td>
-    <td class="full" align="center">Yes</td>
+    <td class="full" align="center">Clang 2.7</td>
   </tr>
   <tr id="608">
     <td><a href="https://cplusplus.github.io/CWG/issues/608.html">608</a></td>
     <td>CD2</td>
     <td>Determining the final overrider of a virtual function</td>
-    <td class="full" align="center">Yes</td>
+    <td class="full" align="center">Clang 2.7</td>
   </tr>
   <tr id="609">
     <td><a href="https://cplusplus.github.io/CWG/issues/609.html">609</a></td>
@@ -3713,13 +3713,13 @@ and <I>POD class</I></td>
     <td><a href="https://cplusplus.github.io/CWG/issues/610.html">610</a></td>
     <td>NAD</td>
     <td>Computing the negative of <TT>0U</TT></td>
-    <td class="full" align="center">Yes</td>
+    <td class="full" align="center">Clang 2.7</td>
   </tr>
   <tr id="611">
     <td><a href="https://cplusplus.github.io/CWG/issues/611.html">611</a></td>
     <td>CD2</td>
     <td>Zero-initializing references</td>
-    <td class="full" align="center">Yes</td>
+    <td class="full" align="center">Clang 2.7</td>
   </tr>
   <tr id="612">
     <td><a href="https://cplusplus.github.io/CWG/issues/612.html">612</a></td>
@@ -3731,19 +3731,19 @@ and <I>POD class</I></td>
     <td><a href="https://cplusplus.github.io/CWG/issues/613.html">613</a></td>
     <td>CD1</td>
     <td>Unevaluated uses of non-static class members</td>
-    <td class="full" align="center">Yes (C++11 onwards)</td>
+    <td class="full" align="center">Clang 3.1 (C++11 onwards)</td>
   </tr>
   <tr id="614">
     <td><a href="https://cplusplus.github.io/CWG/issues/614.html">614</a></td>
     <td>CD1</td>
     <td>Results of integer <TT>/</TT> and <TT>%</TT></td>
-    <td class="full" align="center">Yes</td>
+    <td class="full" align="center">Clang 2.7</td>
   </tr>
   <tr id="615">
     <td><a href="https://cplusplus.github.io/CWG/issues/615.html">615</a></td>
     <td>C++11</td>
     <td>Incorrect description of variables that can be initialized</td>
-    <td class="full" align="center">Yes</td>
+    <td class="full" align="center">Clang 2.7</td>
   </tr>
   <tr id="616">
     <td><a href="https://cplusplus.github.io/CWG/issues/616.html">616</a></td>
@@ -3761,13 +3761,13 @@ and <I>POD class</I></td>
     <td><a href="https://cplusplus.github.io/CWG/issues/618.html">618</a></td>
     <td>CD2</td>
     <td>Casts in preprocessor conditional expressions</td>
-    <td class="full" align="center">Yes</td>
+    <td class="full" align="center">Clang 2.7</td>
   </tr>
   <tr id="619">
     <td><a href="https://cplusplus.github.io/CWG/issues/619.html">619</a></td>
     <td>C++11</td>
     <td>Completeness of array types</td>
-    <td class="full" align="center">Yes</td>
+    <td class="full" align="center">Clang 3.4</td>
   </tr>
   <tr id="620">
     <td><a href="https://cplusplus.github.io/CWG/issues/620.html">620</a></td>
@@ -3779,7 +3779,7 @@ and <I>POD class</I></td>
     <td><a href="https://cplusplus.github.io/CWG/issues/621.html">621</a></td>
     <td>C++11</td>
     <td>Template argument deduction from function return types</td>
-    <td class="full" align="center">Yes</td>
+    <td class="full" align="center">Clang 2.7</td>
   </tr>
   <tr id="622">
     <td><a href="https://cplusplus.github.io/CWG/issues/622.html">622</a></td>
@@ -3803,19 +3803,19 @@ and <I>POD class</I></td>
     <td><a href="https://cplusplus.github.io/CWG/issues/625.html">625</a></td>
     <td>CD2</td>
     <td>Use of <TT>auto</TT> as a <I>template-argument</I></td>
-    <td class="full" align="center">Yes</td>
+    <td class="full" align="center">Clang 2.9</td>
   </tr>
   <tr id="626">
     <td><a href="https://cplusplus.github.io/CWG/issues/626.html">626</a></td>
     <td>CD2</td>
     <td>Preprocessor string literals</td>
-    <td class="full" align="center">Yes</td>
+    <td class="full" align="center">Clang 2.7</td>
   </tr>
   <tr id="627">
     <td><a href="https://cplusplus.github.io/CWG/issues/627.html">627</a></td>
     <td>NAD</td>
     <td>Values behaving as types</td>
-    <td class="full" align="center">Yes</td>
+    <td class="full" align="center">Clang 2.7</td>
   </tr>
   <tr id="628">
     <td><a href="https://cplusplus.github.io/CWG/issues/628.html">628</a></td>
@@ -3833,7 +3833,7 @@ and <I>POD class</I></td>
     <td><a href="https://cplusplus.github.io/CWG/issues/630.html">630</a></td>
     <td>CD2</td>
     <td>Equality of narrow and wide character values in the basic character set</td>
-    <td class="full" align="center">Yes</td>
+    <td class="full" align="center">Clang 2.7</td>
   </tr>
   <tr id="631">
     <td><a href="https://cplusplus.github.io/CWG/issues/631.html">631</a></td>
@@ -3845,7 +3845,7 @@ and <I>POD class</I></td>
     <td><a href="https://cplusplus.github.io/CWG/issues/632.html">632</a></td>
     <td>CD1</td>
     <td>Brace-enclosed initializer for scalar member of aggregate</td>
-    <td class="full" align="center">Yes</td>
+    <td class="full" align="center">Clang 2.7</td>
   </tr>
   <tr id="633">
     <td><a href="https://cplusplus.github.io/CWG/issues/633.html">633</a></td>
@@ -3857,13 +3857,13 @@ and <I>POD class</I></td>
     <td><a href="https://cplusplus.github.io/CWG/issues/634.html">634</a></td>
     <td>CD1</td>
     <td>Conditionally-supported behavior for non-POD objects passed to ellipsis redux</td>
-    <td class="full" align="center">Yes</td>
+    <td class="full" align="center">Clang 2.7</td>
   </tr>
   <tr id="635">
     <td><a href="https://cplusplus.github.io/CWG/issues/635.html">635</a></td>
     <td>NAD</td>
     <td>Names of constructors and destructors of templates</td>
-    <td class="full" align="center">Yes</td>
+    <td class="full" align="center">Clang 2.7</td>
   </tr>
   <tr id="636">
     <td><a href="https://cplusplus.github.io/CWG/issues/636.html">636</a></td>
@@ -3875,7 +3875,7 @@ and <I>POD class</I></td>
     <td><a href="https://cplusplus.github.io/CWG/issues/637.html">637</a></td>
     <td>CD1</td>
     <td>Sequencing rules and example disagree</td>
-    <td class="full" align="center">Yes</td>
+    <td class="full" align="center">Clang 3.0</td>
   </tr>
   <tr id="638">
     <td><a href="https://cplusplus.github.io/CWG/issues/638.html">638</a></td>
@@ -3899,13 +3899,13 @@ and <I>POD class</I></td>
     <td><a href="https://cplusplus.github.io/CWG/issues/641.html">641</a></td>
     <td>CD2</td>
     <td>Overload resolution and conversion-to-same-type operators</td>
-    <td class="full" align="center">Yes</td>
+    <td class="full" align="center">Clang 2.7</td>
   </tr>
   <tr id="642">
     <td><a href="https://cplusplus.github.io/CWG/issues/642.html">642</a></td>
     <td>CD2</td>
     <td>Definition and use of &#8220;block scope&#8221; and &#8220;local scope&#8221;</td>
-    <td class="full" align="center">Yes</td>
+    <td class="full" align="center">Clang 2.7</td>
   </tr>
   <tr id="643">
     <td><a href="https://cplusplus.github.io/CWG/issues/643.html">643</a></td>
@@ -3941,7 +3941,7 @@ and <I>POD class</I></td>
     <td><a href="https://cplusplus.github.io/CWG/issues/648.html">648</a></td>
     <td>CD1</td>
     <td>Constant expressions in constexpr initializers</td>
-    <td class="full" align="center">Yes</td>
+    <td class="full" align="center">Clang 2.7</td>
   </tr>
   <tr id="649">
     <td><a href="https://cplusplus.github.io/CWG/issues/649.html">649</a></td>
@@ -3959,13 +3959,13 @@ and <I>POD class</I></td>
     <td><a href="https://cplusplus.github.io/CWG/issues/651.html">651</a></td>
     <td>CD1</td>
     <td>Problems in <TT>decltype</TT> specification and examples</td>
-    <td class="full" align="center">Yes</td>
+    <td class="full" align="center">Clang 2.7</td>
   </tr>
   <tr id="652">
     <td><a href="https://cplusplus.github.io/CWG/issues/652.html">652</a></td>
     <td>CD2</td>
     <td>Compile-time evaluation of floating-point expressions</td>
-    <td class="full" align="center">Yes</td>
+    <td class="full" align="center">Clang 3.1</td>
   </tr>
   <tr id="653">
     <td><a href="https://cplusplus.github.io/CWG/issues/653.html">653</a></td>
@@ -3983,13 +3983,13 @@ and <I>POD class</I></td>
     <td><a href="https://cplusplus.github.io/CWG/issues/655.html">655</a></td>
     <td>C++11</td>
     <td>Initialization not specified for forwarding constructors</td>
-    <td class="full" align="center">Yes</td>
+    <td class="full" align="center">Clang 3.0</td>
   </tr>
   <tr id="656">
     <td><a href="https://cplusplus.github.io/CWG/issues/656.html">656</a></td>
     <td>CD2</td>
     <td>Direct binding to the result of a conversion operator</td>
-    <td class="full" align="center">Yes</td>
+    <td class="full" align="center">Clang 2.8</td>
   </tr>
   <tr id="657">
     <td><a href="https://cplusplus.github.io/CWG/issues/657.html">657</a></td>
@@ -4025,7 +4025,7 @@ and <I>POD class</I></td>
     <td><a href="https://cplusplus.github.io/CWG/issues/662.html">662</a></td>
     <td>NAD</td>
     <td>Forming a pointer to a reference type</td>
-    <td class="full" align="center">Yes</td>
+    <td class="full" align="center">Clang 2.7</td>
   </tr>
   <tr id="663">
     <td><a href="https://cplusplus.github.io/CWG/issues/663.html">663</a></td>
@@ -4037,7 +4037,7 @@ and <I>POD class</I></td>
     <td><a href="https://cplusplus.github.io/CWG/issues/664.html">664</a></td>
     <td>CD2</td>
     <td>Direct binding of references to non-class rvalue references</td>
-    <td class="full" align="center">Yes</td>
+    <td class="full" align="center">Clang 2.7</td>
   </tr>
   <tr id="665">
     <td><a href="https://cplusplus.github.io/CWG/issues/665.html">665</a></td>
@@ -4067,7 +4067,7 @@ and <I>POD class</I></td>
     <td><a href="https://cplusplus.github.io/CWG/issues/669.html">669</a></td>
     <td>NAD</td>
     <td>Confusing specification of the meaning of <TT>decltype</TT></td>
-    <td class="full" align="center">Yes</td>
+    <td class="full" align="center">Clang 3.1</td>
   </tr>
   <tr id="670">
     <td><a href="https://cplusplus.github.io/CWG/issues/670.html">670</a></td>
@@ -4091,7 +4091,7 @@ and <I>POD class</I></td>
     <td><a href="https://cplusplus.github.io/CWG/issues/673.html">673</a></td>
     <td>NAD</td>
     <td>Injection of names from <I>elaborated-type-specifier</I>s in <TT>friend</TT> declarations</td>
-    <td class="full" align="center">Yes</td>
+    <td class="full" align="center">Clang 2.7</td>
   </tr>
   <tr id="674">
     <td><a href="https://cplusplus.github.io/CWG/issues/674.html">674</a></td>
@@ -4127,7 +4127,7 @@ and <I>POD class</I></td>
     <td><a href="https://cplusplus.github.io/CWG/issues/679.html">679</a></td>
     <td>CD1</td>
     <td>Equivalence of <I>template-id</I>s and operator function templates</td>
-    <td class="full" align="center">Yes</td>
+    <td class="full" align="center">Clang 2.7</td>
   </tr>
   <tr id="680">
     <td><a href="https://cplusplus.github.io/CWG/issues/680.html">680</a></td>
@@ -4151,7 +4151,7 @@ and <I>POD class</I></td>
     <td><a href="https://cplusplus.github.io/CWG/issues/683.html">683</a></td>
     <td>CD1</td>
     <td>Requirements for trivial subobject special functions</td>
-    <td class="full" align="center">Yes</td>
+    <td class="full" align="center">Clang 3.3</td>
   </tr>
   <tr id="684">
     <td><a href="https://cplusplus.github.io/CWG/issues/684.html">684</a></td>
@@ -4163,7 +4163,7 @@ and <I>POD class</I></td>
     <td><a href="https://cplusplus.github.io/CWG/issues/685.html">685</a></td>
     <td>CD2</td>
     <td>Integral promotion of enumeration ignores fixed underlying type</td>
-    <td class="full" align="center">Yes</td>
+    <td class="full" align="center">Clang 10</td>
   </tr>
   <tr id="686">
     <td><a href="https://cplusplus.github.io/CWG/issues/686.html">686</a></td>
@@ -4283,7 +4283,7 @@ and <I>POD class</I></td>
     <td><a href="https://cplusplus.github.io/CWG/issues/705.html">705</a></td>
     <td>CD2</td>
     <td>Suppressing argument-dependent lookup via parentheses</td>
-    <td class="full" align="center">Yes</td>
+    <td class="full" align="center">Clang 2.7</td>
   </tr>
   <tr id="706">
     <td><a href="https://cplusplus.github.io/CWG/issues/706.html">706</a></td>
@@ -4775,7 +4775,7 @@ and <I>POD class</I></td>
     <td><a href="https://cplusplus.github.io/CWG/issues/794.html">794</a></td>
     <td>NAD</td>
     <td>Base-derived conversion in member type of pointer-to-member conversion</td>
-    <td class="unknown" align="center">Unknown</td>
+    <td class="full" align="center">Clang 2.7</td>
   </tr>
   <tr id="795">
     <td><a href="https://cplusplus.github.io/CWG/issues/795.html">795</a></td>
@@ -5663,7 +5663,7 @@ and <I>POD class</I></td>
     <td><a href="https://cplusplus.github.io/CWG/issues/974.html">974</a></td>
     <td>CD3</td>
     <td>Default arguments for lambdas</td>
-    <td class="full" align="center">Yes</td>
+    <td class="full" align="center">Clang 3.3</td>
   </tr>
   <tr id="975">
     <td><a href="https://cplusplus.github.io/CWG/issues/975.html">975</a></td>
@@ -5681,7 +5681,7 @@ and <I>POD class</I></td>
     <td><a href="https://cplusplus.github.io/CWG/issues/977.html">977</a></td>
     <td>CD3</td>
     <td>When is an enumeration type complete?</td>
-    <td class="full" align="center">Yes</td>
+    <td class="full" align="center">Clang 2.7</td>
   </tr>
   <tr id="978">
     <td><a href="https://cplusplus.github.io/CWG/issues/978.html">978</a></td>
@@ -6485,7 +6485,7 @@ and <I>POD class</I></td>
     <td><a href="https://cplusplus.github.io/CWG/issues/1111.html">1111</a></td>
     <td>C++11</td>
     <td>Remove dual-scope lookup of member template names</td>
-    <td class="full" align="center">Clang 3.2</td>
+    <td class="partial" align="center">Partial</td>
   </tr>
   <tr id="1112">
     <td><a href="https://cplusplus.github.io/CWG/issues/1112.html">1112</a></td>
@@ -7153,15 +7153,11 @@ and <I>POD class</I></td>
     <td>Unnecessary restriction on <TT>auto</TT> array types</td>
     <td class="unknown" align="center">Unknown</td>
   </tr>
-  <tr class="open" id="1223">
+  <tr id="1223">
     <td><a href="https://cplusplus.github.io/CWG/issues/1223.html">1223</a></td>
-    <td>drafting</td>
+    <td>DRWP</td>
     <td>Syntactic disambiguation and <I>trailing-return-type</I>s</td>
-    <td align="center">
-      <details>
-        <summary>Not resolved</summary>
-        Clang 17 implements 2023-05-12 resolution
-      </details></td>
+    <td class="full" align="center">Clang 17</td>
   </tr>
   <tr id="1224">
     <td><a href="https://cplusplus.github.io/CWG/issues/1224.html">1224</a></td>
@@ -8481,7 +8477,7 @@ and <I>POD class</I></td>
     <td><a href="https://cplusplus.github.io/CWG/issues/1443.html">1443</a></td>
     <td>NAD</td>
     <td>Default arguments and non-static data members</td>
-    <td class="full" align="center">Yes</td>
+    <td class="full" align="center">Clang 2.7</td>
   </tr>
   <tr class="open" id="1444">
     <td><a href="https://cplusplus.github.io/CWG/issues/1444.html">1444</a></td>
@@ -8945,11 +8941,11 @@ and <I>POD class</I></td>
     <td>Alias template specialization vs pack expansion</td>
     <td class="unknown" align="center">Unknown</td>
   </tr>
-  <tr class="open" id="1521">
+  <tr id="1521">
     <td><a href="https://cplusplus.github.io/CWG/issues/1521.html">1521</a></td>
-    <td>drafting</td>
+    <td>dup</td>
     <td><TT>T{</TT><I>expr</I><TT>}</TT> with reference types</td>
-    <td align="center">Not resolved</td>
+    <td class="unknown" align="center">Unknown</td>
   </tr>
   <tr id="1522">
     <td><a href="https://cplusplus.github.io/CWG/issues/1522.html">1522</a></td>
@@ -9201,7 +9197,7 @@ and <I>POD class</I></td>
     <td><a href="https://cplusplus.github.io/CWG/issues/1563.html">1563</a></td>
     <td>CD3</td>
     <td>List-initialization and overloaded function disambiguation</td>
-    <td class="full" align="center">Yes</td>
+    <td class="full" align="center">Clang 3.1</td>
   </tr>
   <tr id="1564">
     <td><a href="https://cplusplus.github.io/CWG/issues/1564.html">1564</a></td>
@@ -10591,7 +10587,7 @@ and <I>POD class</I></td>
     <td><a href="https://cplusplus.github.io/CWG/issues/1794.html">1794</a></td>
     <td>C++17</td>
     <td><TT>template</TT> keyword and alias templates</td>
-    <td class="full" align="center">Yes</td>
+    <td class="full" align="center">Clang 2.7</td>
   </tr>
   <tr id="1795">
     <td><a href="https://cplusplus.github.io/CWG/issues/1795.html">1795</a></td>
@@ -10711,7 +10707,7 @@ and <I>POD class</I></td>
     <td><a href="https://cplusplus.github.io/CWG/issues/1814.html">1814</a></td>
     <td>CD4</td>
     <td>Default arguments in <I>lambda-expression</I>s</td>
-    <td class="full" align="center">Yes</td>
+    <td class="full" align="center">Clang 3.1</td>
   </tr>
   <tr id="1815">
     <td><a href="https://cplusplus.github.io/CWG/issues/1815.html">1815</a></td>
@@ -10759,7 +10755,7 @@ and <I>POD class</I></td>
     <td><a href="https://cplusplus.github.io/CWG/issues/1822.html">1822</a></td>
     <td>CD6</td>
     <td>Lookup of parameter names in <I>lambda-expression</I>s</td>
-    <td class="full" align="center">Yes</td>
+    <td class="full" align="center">Clang 3.1</td>
   </tr>
   <tr id="1823">
     <td><a href="https://cplusplus.github.io/CWG/issues/1823.html">1823</a></td>
@@ -11545,11 +11541,11 @@ and <I>POD class</I></td>
     <td>Constant expressions and library undefined behavior</td>
     <td class="unknown" align="center">Unknown</td>
   </tr>
-  <tr class="open" id="1953">
+  <tr id="1953">
     <td><a href="https://cplusplus.github.io/CWG/issues/1953.html">1953</a></td>
-    <td>open</td>
+    <td>DR</td>
     <td>Data races and common initial sequence</td>
-    <td align="center">Not resolved</td>
+    <td class="unknown" align="center">Unknown</td>
   </tr>
   <tr id="1954">
     <td><a href="https://cplusplus.github.io/CWG/issues/1954.html">1954</a></td>
@@ -11619,7 +11615,7 @@ and <I>POD class</I></td>
   </tr>
   <tr class="open" id="1965">
     <td><a href="https://cplusplus.github.io/CWG/issues/1965.html">1965</a></td>
-    <td>drafting</td>
+    <td>open</td>
     <td>Explicit casts to reference types</td>
     <td align="center">Not resolved</td>
   </tr>
@@ -12197,7 +12193,7 @@ and <I>POD class</I></td>
     <td><a href="https://cplusplus.github.io/CWG/issues/2061.html">2061</a></td>
     <td>CD4</td>
     <td>Inline namespace after simplifications</td>
-    <td class="full" align="center">Yes</td>
+    <td class="full" align="center">Clang 2.7</td>
   </tr>
   <tr id="2062">
     <td><a href="https://cplusplus.github.io/CWG/issues/2062.html">2062</a></td>
@@ -12449,7 +12445,7 @@ and <I>POD class</I></td>
     <td><a href="https://cplusplus.github.io/CWG/issues/2103.html">2103</a></td>
     <td>CD5</td>
     <td>Lvalue-to-rvalue conversion is irrelevant in odr-use of a reference</td>
-    <td class="full" align="center">Yes</td>
+    <td class="full" align="center">Clang 2.7</td>
   </tr>
   <tr id="2104">
     <td><a href="https://cplusplus.github.io/CWG/issues/2104.html">2104</a></td>
@@ -12693,7 +12689,7 @@ and <I>POD class</I></td>
   </tr>
   <tr id="2144">
     <td><a href="https://cplusplus.github.io/CWG/issues/2144.html">2144</a></td>
-    <td>DR</td>
+    <td>DRWP</td>
     <td>Function/variable declaration ambiguity</td>
     <td class="unknown" align="center">Unknown</td>
   </tr>
@@ -12911,7 +12907,7 @@ and <I>POD class</I></td>
     <td><a href="https://cplusplus.github.io/CWG/issues/2180.html">2180</a></td>
     <td>CD4</td>
     <td>Virtual bases in destructors and defaulted assignment operators</td>
-    <td class="full" align="center">Yes</td>
+    <td class="full" align="center">Clang 3.0</td>
   </tr>
   <tr id="2181">
     <td><a href="https://cplusplus.github.io/CWG/issues/2181.html">2181</a></td>
@@ -13109,7 +13105,7 @@ and <I>POD class</I></td>
     <td><a href="https://cplusplus.github.io/CWG/issues/2213.html">2213</a></td>
     <td>CD6</td>
     <td>Forward declaration of partial specializations</td>
-    <td class="full" align="center">Yes</td>
+    <td class="full" align="center">Clang 2.7</td>
   </tr>
   <tr id="2214">
     <td><a href="https://cplusplus.github.io/CWG/issues/2214.html">2214</a></td>
@@ -13525,11 +13521,11 @@ and <I>POD class</I></td>
     <td>Consistency with mismatched aligned/non-over-aligned allocation/deallocation functions</td>
     <td class="unknown" align="center">Unknown</td>
   </tr>
-  <tr class="open" id="2283">
+  <tr id="2283">
     <td><a href="https://cplusplus.github.io/CWG/issues/2283.html">2283</a></td>
-    <td>drafting</td>
+    <td>DR</td>
     <td>Missing complete type requirements</td>
-    <td align="center">Not resolved</td>
+    <td class="unknown" align="center">Unknown</td>
   </tr>
   <tr class="open" id="2284">
     <td><a href="https://cplusplus.github.io/CWG/issues/2284.html">2284</a></td>
@@ -14013,7 +14009,7 @@ and <I>POD class</I></td>
     <td><a href="https://cplusplus.github.io/CWG/issues/2363.html">2363</a></td>
     <td>NAD</td>
     <td>Opaque enumeration friend declarations</td>
-    <td class="full" align="center">Yes</td>
+    <td class="full" align="center">Clang 19</td>
   </tr>
   <tr id="2364">
     <td><a href="https://cplusplus.github.io/CWG/issues/2364.html">2364</a></td>
@@ -15183,7 +15179,7 @@ and <I>POD class</I></td>
   </tr>
   <tr class="open" id="2557">
     <td><a href="https://cplusplus.github.io/CWG/issues/2557.html">2557</a></td>
-    <td>drafting</td>
+    <td>review</td>
     <td>Class member access referring to an unrelated class</td>
     <td align="center">Not resolved</td>
   </tr>
@@ -15207,7 +15203,7 @@ and <I>POD class</I></td>
   </tr>
   <tr id="2561">
     <td><a href="https://cplusplus.github.io/CWG/issues/2561.html">2561</a></td>
-    <td>DR</td>
+    <td>DRWP</td>
     <td>Conversion to function pointer for lambda with explicit object parameter</td>
     <td class="none" align="center">No</td>
   </tr>
@@ -15373,7 +15369,7 @@ and <I>POD class</I></td>
   </tr>
   <tr id="2588">
     <td><a href="https://cplusplus.github.io/CWG/issues/2588.html">2588</a></td>
-    <td>DR</td>
+    <td>DRWP</td>
     <td>friend declarations and module linkage</td>
     <td class="unknown" align="center">Unknown</td>
   </tr>
@@ -16213,7 +16209,7 @@ and <I>POD class</I></td>
   </tr>
   <tr id="2728">
     <td><a href="https://cplusplus.github.io/CWG/issues/2728.html">2728</a></td>
-    <td>DR</td>
+    <td>DRWP</td>
     <td>Evaluation of conversions in a <I>delete-expression</I></td>
     <td class="unknown" align="center">Unknown</td>
   </tr>
@@ -16734,11 +16730,11 @@ objects</td>
     <td>Alignment requirement of incomplete class type</td>
     <td class="unknown" align="center">Unknown</td>
   </tr>
-  <tr class="open" id="2815">
+  <tr id="2815">
     <td><a href="https://cplusplus.github.io/CWG/issues/2815.html">2815</a></td>
-    <td>tentatively ready</td>
+    <td>DR</td>
     <td>Overload resolution for references/pointers to <TT>noexcept</TT> functions</td>
-    <td align="center">Not resolved</td>
+    <td class="unknown" align="center">Unknown</td>
   </tr>
   <tr class="open" id="2816">
     <td><a href="https://cplusplus.github.io/CWG/issues/2816.html">2816</a></td>
@@ -16754,15 +16750,15 @@ objects</td>
   </tr>
   <tr id="2818">
     <td><a href="https://cplusplus.github.io/CWG/issues/2818.html">2818</a></td>
-    <td>DR</td>
+    <td>DRWP</td>
     <td>Use of predefined reserved identifiers</td>
     <td class="unknown" align="center">Unknown</td>
   </tr>
   <tr id="2819">
     <td><a href="https://cplusplus.github.io/CWG/issues/2819.html">2819</a></td>
-    <td>accepted</td>
+    <td>WP</td>
     <td>Cast from null pointer value in a constant expression</td>
-    <td class="full" align="center">Clang 19</td>
+    <td class="full" align="center">Clang 19 (C++26 onwards)</td>
   </tr>
   <tr id="2820">
     <td><a href="https://cplusplus.github.io/CWG/issues/2820.html">2820</a></td>
@@ -16862,7 +16858,7 @@ objects</td>
   </tr>
   <tr id="2836">
     <td><a href="https://cplusplus.github.io/CWG/issues/2836.html">2836</a></td>
-    <td>DR</td>
+    <td>DRWP</td>
     <td>Conversion rank of <TT>long double</TT> and extended floating-point types</td>
     <td class="unknown" align="center">Unknown</td>
   </tr>
@@ -16904,7 +16900,7 @@ objects</td>
   </tr>
   <tr class="open" id="2843">
     <td><a href="https://cplusplus.github.io/CWG/issues/2843.html">2843</a></td>
-    <td>review</td>
+    <td>drafting</td>
     <td>Undated reference to Unicode makes C++ a moving target</td>
     <td align="center">Not resolved</td>
   </tr>
@@ -16998,13 +16994,13 @@ objects</td>
   </tr>
   <tr id="2858">
     <td><a href="https://cplusplus.github.io/CWG/issues/2858.html">2858</a></td>
-    <td>accepted</td>
+    <td>WP</td>
     <td>Declarative <I>nested-name-specifier</I>s and <I>pack-index-specifier</I>s</td>
     <td class="full" align="center">Clang 19</td>
   </tr>
   <tr id="2859">
     <td><a href="https://cplusplus.github.io/CWG/issues/2859.html">2859</a></td>
-    <td>DR</td>
+    <td>DRWP</td>
     <td>Value-initialization with multiple default constructors</td>
     <td class="unknown" align="center">Unknown</td>
   </tr>
@@ -17016,7 +17012,7 @@ objects</td>
   </tr>
   <tr id="2861">
     <td><a href="https://cplusplus.github.io/CWG/issues/2861.html">2861</a></td>
-    <td>DR</td>
+    <td>DRWP</td>
     <td><TT>dynamic_cast</TT> on bad pointer value</td>
     <td class="unknown" align="center">Unknown</td>
   </tr>
@@ -17034,13 +17030,13 @@ objects</td>
   </tr>
   <tr id="2864">
     <td><a href="https://cplusplus.github.io/CWG/issues/2864.html">2864</a></td>
-    <td>DR</td>
+    <td>DRWP</td>
     <td>Narrowing floating-point conversions</td>
     <td class="unknown" align="center">Unknown</td>
   </tr>
   <tr id="2865">
     <td><a href="https://cplusplus.github.io/CWG/issues/2865.html">2865</a></td>
-    <td>DR</td>
+    <td>DRWP</td>
     <td>Regression on result of conditional operator</td>
     <td class="unknown" align="center">Unknown</td>
   </tr>
@@ -17052,7 +17048,7 @@ objects</td>
   </tr>
   <tr id="2867">
     <td><a href="https://cplusplus.github.io/CWG/issues/2867.html">2867</a></td>
-    <td>DR</td>
+    <td>DRWP</td>
     <td>Order of initialization for structured bindings</td>
     <td class="unknown" align="center">Unknown</td>
   </tr>
@@ -17064,25 +17060,25 @@ objects</td>
   </tr>
   <tr id="2869">
     <td><a href="https://cplusplus.github.io/CWG/issues/2869.html">2869</a></td>
-    <td>DR</td>
+    <td>DRWP</td>
     <td><TT>this</TT> in local classes</td>
     <td class="unknown" align="center">Unknown</td>
   </tr>
   <tr id="2870">
     <td><a href="https://cplusplus.github.io/CWG/issues/2870.html">2870</a></td>
-    <td>DR</td>
+    <td>DRWP</td>
     <td>Combining absent <I>encoding-prefix</I>es</td>
     <td class="unknown" align="center">Unknown</td>
   </tr>
   <tr id="2871">
     <td><a href="https://cplusplus.github.io/CWG/issues/2871.html">2871</a></td>
-    <td>DR</td>
+    <td>DRWP</td>
     <td>User-declared constructor templates inhibiting default constructors</td>
     <td class="unknown" align="center">Unknown</td>
   </tr>
   <tr id="2872">
     <td><a href="https://cplusplus.github.io/CWG/issues/2872.html">2872</a></td>
-    <td>DR</td>
+    <td>DRWP</td>
     <td>Linkage and unclear "can be referred to"</td>
     <td class="unknown" align="center">Unknown</td>
   </tr>
@@ -17094,7 +17090,7 @@ objects</td>
   </tr>
   <tr id="2874">
     <td><a href="https://cplusplus.github.io/CWG/issues/2874.html">2874</a></td>
-    <td>DR</td>
+    <td>DRWP</td>
     <td>Qualified declarations of partial specializations</td>
     <td class="unknown" align="center">Unknown</td>
   </tr>
@@ -17106,13 +17102,13 @@ objects</td>
   </tr>
   <tr id="2876">
     <td><a href="https://cplusplus.github.io/CWG/issues/2876.html">2876</a></td>
-    <td>accepted</td>
+    <td>WP</td>
     <td>Disambiguation of <TT>T x = delete("text")</TT></td>
     <td class="unknown" align="center">Unknown</td>
   </tr>
   <tr id="2877">
     <td><a href="https://cplusplus.github.io/CWG/issues/2877.html">2877</a></td>
-    <td>DR</td>
+    <td>DRWP</td>
     <td>Type-only lookup for <I>using-enum-declarator</I></td>
     <td class="full" align="center">Clang 19</td>
   </tr>
@@ -17122,33 +17118,33 @@ objects</td>
     <td>C-style casts to reference types</td>
     <td align="center">Not resolved</td>
   </tr>
-  <tr class="open" id="2879">
+  <tr id="2879">
     <td><a href="https://cplusplus.github.io/CWG/issues/2879.html">2879</a></td>
-    <td>review</td>
+    <td>DR</td>
     <td>Undesired outcomes with <TT>const_cast</TT></td>
-    <td align="center">Not resolved</td>
+    <td class="unknown" align="center">Unknown</td>
   </tr>
   <tr id="2880">
     <td><a href="https://cplusplus.github.io/CWG/issues/2880.html">2880</a></td>
-    <td>accepted</td>
+    <td>WP</td>
     <td>Accessibility check for destructor of incomplete class type</td>
     <td class="unknown" align="center">Unknown</td>
   </tr>
   <tr id="2881">
     <td><a href="https://cplusplus.github.io/CWG/issues/2881.html">2881</a></td>
-    <td>DR</td>
+    <td>DRWP</td>
     <td>Type restrictions for the explicit object parameter of a lambda</td>
     <td class="full" align="center">Clang 19</td>
   </tr>
   <tr id="2882">
     <td><a href="https://cplusplus.github.io/CWG/issues/2882.html">2882</a></td>
-    <td>DR</td>
+    <td>DRWP</td>
     <td>Unclear treatment of conversion to <TT>void</TT></td>
     <td class="full" align="center">Clang 2.7</td>
   </tr>
   <tr id="2883">
     <td><a href="https://cplusplus.github.io/CWG/issues/2883.html">2883</a></td>
-    <td>DR</td>
+    <td>DRWP</td>
     <td>Definition of "odr-usable" ignores lambda scopes</td>
     <td class="none" align="center">No</td>
   </tr>
@@ -17170,13 +17166,13 @@ objects</td>
   </tr>
   <tr id="2886">
     <td><a href="https://cplusplus.github.io/CWG/issues/2886.html">2886</a></td>
-    <td>DR</td>
+    <td>DRWP</td>
     <td>Temporaries and trivial potentially-throwing special member functions</td>
     <td class="full" align="center">Clang 9</td>
   </tr>
   <tr id="2887">
     <td><a href="https://cplusplus.github.io/CWG/issues/2887.html">2887</a></td>
-    <td>DR</td>
+    <td>DRWP</td>
     <td>Missing compatibility entries for xvalues</td>
     <td class="unknown" align="center">Unknown</td>
   </tr>
@@ -17192,21 +17188,21 @@ objects</td>
     <td>Requiring an accessible destructor for destroying operator delete</td>
     <td align="center">Not resolved</td>
   </tr>
-  <tr class="open" id="2890">
+  <tr id="2890">
     <td><a href="https://cplusplus.github.io/CWG/issues/2890.html">2890</a></td>
-    <td>review</td>
+    <td>DR</td>
     <td>Defining members of local classes</td>
-    <td align="center">Not resolved</td>
+    <td class="unknown" align="center">Unknown</td>
   </tr>
   <tr id="2891">
     <td><a href="https://cplusplus.github.io/CWG/issues/2891.html">2891</a></td>
-    <td>DR</td>
+    <td>DRWP</td>
     <td>Normative status of implementation limits</td>
     <td class="unknown" align="center">Unknown</td>
   </tr>
   <tr id="2892">
     <td><a href="https://cplusplus.github.io/CWG/issues/2892.html">2892</a></td>
-    <td>DR</td>
+    <td>DRWP</td>
     <td>Unclear usual arithmetic conversions</td>
     <td class="unknown" align="center">Unknown</td>
   </tr>
@@ -17216,15 +17212,15 @@ objects</td>
     <td>Instantiations in discarded <TT>if constexpr</TT> substatements</td>
     <td class="unknown" align="center">Unknown</td>
   </tr>
-  <tr class="open" id="2894">
+  <tr id="2894">
     <td><a href="https://cplusplus.github.io/CWG/issues/2894.html">2894</a></td>
-    <td>review</td>
+    <td>DR</td>
     <td>Functional casts create prvalues of reference type</td>
-    <td align="center">Not resolved</td>
+    <td class="unknown" align="center">Unknown</td>
   </tr>
   <tr id="2895">
     <td><a href="https://cplusplus.github.io/CWG/issues/2895.html">2895</a></td>
-    <td>DR</td>
+    <td>DRWP</td>
     <td>Initialization should ignore the destination type's cv-qualification</td>
     <td class="unknown" align="center">Unknown</td>
   </tr>
@@ -17246,11 +17242,11 @@ objects</td>
     <td>Clarify implicit conversion sequence from <I>cv</I> <TT>T</TT> to <TT>T</TT></td>
     <td align="center">Not resolved</td>
   </tr>
-  <tr class="open" id="2899">
+  <tr id="2899">
     <td><a href="https://cplusplus.github.io/CWG/issues/2899.html">2899</a></td>
-    <td>tentatively ready</td>
+    <td>DR</td>
     <td>Bad value representations should cause undefined behavior</td>
-    <td align="center">Not resolved</td>
+    <td class="unknown" align="center">Unknown</td>
   </tr>
   <tr class="open" id="2900">
     <td><a href="https://cplusplus.github.io/CWG/issues/2900.html">2900</a></td>
@@ -17258,11 +17254,11 @@ objects</td>
     <td>Deduction of non-type template arguments with placeholder types</td>
     <td align="center">Not resolved</td>
   </tr>
-  <tr class="open" id="2901">
+  <tr id="2901">
     <td><a href="https://cplusplus.github.io/CWG/issues/2901.html">2901</a></td>
-    <td>tentatively ready</td>
+    <td>DR</td>
     <td>Unclear semantics for near-match aliased access</td>
-    <td align="center">Not resolved</td>
+    <td class="unknown" align="center">Unknown</td>
   </tr>
   <tr class="open" id="2902">
     <td><a href="https://cplusplus.github.io/CWG/issues/2902.html">2902</a></td>
@@ -17272,7 +17268,7 @@ objects</td>
   </tr>
   <tr class="open" id="2903">
     <td><a href="https://cplusplus.github.io/CWG/issues/2903.html">2903</a></td>
-    <td>tentatively ready</td>
+    <td>drafting</td>
     <td>Can we omit the <TT>template</TT> disambiguator in <I>nested-name-specifier</I>s in type-only contexts?</td>
     <td align="center">Not resolved</td>
   </tr>
@@ -17282,47 +17278,47 @@ objects</td>
     <td>Introducing <I>template-name</I>s</td>
     <td align="center">Not resolved</td>
   </tr>
-  <tr class="open" id="2905">
+  <tr id="2905">
     <td><a href="https://cplusplus.github.io/CWG/issues/2905.html">2905</a></td>
-    <td>tentatively ready</td>
+    <td>DR</td>
     <td>Value-dependence of <I>noexcept-expression</I></td>
-    <td align="center">Not resolved</td>
+    <td class="unknown" align="center">Unknown</td>
   </tr>
-  <tr class="open" id="2906">
+  <tr id="2906">
     <td><a href="https://cplusplus.github.io/CWG/issues/2906.html">2906</a></td>
-    <td>tentatively ready</td>
+    <td>DR</td>
     <td>Lvalue-to-rvalue conversion of class types for conditional operator</td>
-    <td align="center">Not resolved</td>
+    <td class="unknown" align="center">Unknown</td>
   </tr>
-  <tr class="open" id="2907">
+  <tr id="2907">
     <td><a href="https://cplusplus.github.io/CWG/issues/2907.html">2907</a></td>
-    <td>tentatively ready</td>
+    <td>DR</td>
     <td>Constant lvalue-to-rvalue conversion on uninitialized <TT>std::nullptr_t</TT></td>
-    <td align="center">Not resolved</td>
+    <td class="unknown" align="center">Unknown</td>
   </tr>
-  <tr class="open" id="2908">
+  <tr id="2908">
     <td><a href="https://cplusplus.github.io/CWG/issues/2908.html">2908</a></td>
-    <td>tentatively ready</td>
+    <td>DR</td>
     <td>Counting physical source lines for <TT>__LINE__</TT></td>
-    <td align="center">Not resolved</td>
+    <td class="unknown" align="center">Unknown</td>
   </tr>
-  <tr class="open" id="2909">
+  <tr id="2909">
     <td><a href="https://cplusplus.github.io/CWG/issues/2909.html">2909</a></td>
-    <td>review</td>
+    <td>DR</td>
     <td>Subtle difference between constant-initialized and constexpr</td>
-    <td align="center">Not resolved</td>
+    <td class="unknown" align="center">Unknown</td>
   </tr>
-  <tr class="open" id="2910">
+  <tr id="2910">
     <td><a href="https://cplusplus.github.io/CWG/issues/2910.html">2910</a></td>
-    <td>tentatively ready</td>
+    <td>DR</td>
     <td>Effect of <I>requirement-parameter-list</I>s on odr-usability</td>
-    <td align="center">Not resolved</td>
+    <td class="unknown" align="center">Unknown</td>
   </tr>
-  <tr class="open" id="2911">
+  <tr id="2911">
     <td><a href="https://cplusplus.github.io/CWG/issues/2911.html">2911</a></td>
-    <td>tentatively ready</td>
+    <td>DR</td>
     <td>Unclear meaning of expressions "appearing within" subexpressions</td>
-    <td align="center">Not resolved</td>
+    <td class="unknown" align="center">Unknown</td>
   </tr>
   <tr class="open" id="2912">
     <td><a href="https://cplusplus.github.io/CWG/issues/2912.html">2912</a></td>
@@ -17330,15 +17326,11 @@ objects</td>
     <td>Too-large value for size in array new</td>
     <td align="center">Not resolved</td>
   </tr>
-  <tr class="open" id="2913">
+  <tr id="2913">
     <td><a href="https://cplusplus.github.io/CWG/issues/2913.html">2913</a></td>
-    <td>tentatively ready</td>
+    <td>DR</td>
     <td>Grammar for <I>deduction-guide</I> has <I>requires-clause</I> in the wrong position</td>
-    <td align="center">
-      <details>
-        <summary>Not resolved</summary>
-        Clang 20 implements 2024-08-16 resolution
-      </details></td>
+    <td class="unreleased" align="center">Clang 20</td>
   </tr>
   <tr class="open" id="2914">
     <td><a href="https://cplusplus.github.io/CWG/issues/2914.html">2914</a></td>
@@ -17346,15 +17338,11 @@ objects</td>
     <td>Unclear order of initialization of static and thread-local variables</td>
     <td align="center">Not resolved</td>
   </tr>
-  <tr class="open" id="2915">
+  <tr id="2915">
     <td><a href="https://cplusplus.github.io/CWG/issues/2915.html">2915</a></td>
-    <td>tentatively ready</td>
+    <td>DR</td>
     <td>Explicit object parameters of type <TT>void</TT></td>
-    <td align="center">
-      <details>
-        <summary>Not resolved</summary>
-        Clang 20 implements 2024-08-16 resolution
-      </details></td>
+    <td class="unreleased" align="center">Clang 20</td>
   </tr>
   <tr class="open" id="2916">
     <td><a href="https://cplusplus.github.io/CWG/issues/2916.html">2916</a></td>
@@ -17372,17 +17360,17 @@ objects</td>
         Clang 20 implements 2024-07-30 resolution
       </details></td>
   </tr>
-  <tr class="open" id="2918">
+  <tr id="2918">
     <td><a href="https://cplusplus.github.io/CWG/issues/2918.html">2918</a></td>
-    <td>tentatively ready</td>
+    <td>DR</td>
     <td>Consideration of constraints for address of overloaded function</td>
-    <td align="center">Not resolved</td>
+    <td class="unknown" align="center">Unknown</td>
   </tr>
-  <tr class="open" id="2919">
+  <tr id="2919">
     <td><a href="https://cplusplus.github.io/CWG/issues/2919.html">2919</a></td>
-    <td>tentatively ready</td>
+    <td>DR</td>
     <td>Conversion function candidates for initialization of const lvalue reference</td>
-    <td align="center">Not resolved</td>
+    <td class="unknown" align="center">Unknown</td>
   </tr>
   <tr class="open" id="2920">
     <td><a href="https://cplusplus.github.io/CWG/issues/2920.html">2920</a></td>
@@ -17390,33 +17378,29 @@ objects</td>
     <td>The <TT>template</TT> keyword for base classes</td>
     <td align="center">Not resolved</td>
   </tr>
-  <tr class="open" id="2921">
+  <tr id="2921">
     <td><a href="https://cplusplus.github.io/CWG/issues/2921.html">2921</a></td>
-    <td>tentatively ready</td>
+    <td>DR</td>
     <td>Exporting redeclarations of entities not attached to a named module</td>
-    <td align="center">Not resolved</td>
+    <td class="unknown" align="center">Unknown</td>
   </tr>
-  <tr class="open" id="2922">
+  <tr id="2922">
     <td><a href="https://cplusplus.github.io/CWG/issues/2922.html">2922</a></td>
-    <td>tentatively ready</td>
+    <td>DR</td>
     <td>constexpr placement-new is too permissive</td>
-    <td align="center">
-      <details>
-        <summary>Not resolved</summary>
-        Clang 20 implements 2024-07-10 resolution
-      </details></td>
+    <td class="unreleased" align="center">Clang 20</td>
   </tr>
   <tr class="open" id="2923">
     <td><a href="https://cplusplus.github.io/CWG/issues/2923.html">2923</a></td>
-    <td>tentatively ready</td>
+    <td>review</td>
     <td>Note about infinite loops and execution steps</td>
     <td align="center">Not resolved</td>
   </tr>
-  <tr class="open" id="2924">
+  <tr id="2924">
     <td><a href="https://cplusplus.github.io/CWG/issues/2924.html">2924</a></td>
-    <td>review</td>
+    <td>DR</td>
     <td>Undefined behavior during constant evaluation</td>
-    <td align="center">Not resolved</td>
+    <td class="unknown" align="center">Unknown</td>
   </tr>
   <tr id="2925">
     <td><a href="https://cplusplus.github.io/CWG/issues/2925.html">2925</a></td>
@@ -17426,15 +17410,15 @@ objects</td>
   </tr>
   <tr class="open" id="2926">
     <td><a href="https://cplusplus.github.io/CWG/issues/2926.html">2926</a></td>
-    <td>open</td>
+    <td>drafting</td>
     <td>Lookup context for dependent qualified names</td>
     <td align="center">Not resolved</td>
   </tr>
-  <tr class="open" id="2927">
+  <tr id="2927">
     <td><a href="https://cplusplus.github.io/CWG/issues/2927.html">2927</a></td>
-    <td>tentatively ready</td>
+    <td>DR</td>
     <td>Unclear status of translation unit with <TT>module</TT> keyword</td>
-    <td align="center">Not resolved</td>
+    <td class="unknown" align="center">Unknown</td>
   </tr>
   <tr class="open" id="2928">
     <td><a href="https://cplusplus.github.io/CWG/issues/2928.html">2928</a></td>
@@ -17444,21 +17428,21 @@ objects</td>
   </tr>
   <tr class="open" id="2929">
     <td><a href="https://cplusplus.github.io/CWG/issues/2929.html">2929</a></td>
-    <td>tentatively ready</td>
+    <td>review</td>
     <td>Lifetime of trivially-destructible static or thread-local objects</td>
     <td align="center">Not resolved</td>
   </tr>
-  <tr class="open" id="2930">
+  <tr id="2930">
     <td><a href="https://cplusplus.github.io/CWG/issues/2930.html">2930</a></td>
-    <td>tentatively ready</td>
+    <td>DR</td>
     <td>Unclear term "copy/move operation" in specification of copy elision</td>
-    <td align="center">Not resolved</td>
+    <td class="unknown" align="center">Unknown</td>
   </tr>
-  <tr class="open" id="2931">
+  <tr id="2931">
     <td><a href="https://cplusplus.github.io/CWG/issues/2931.html">2931</a></td>
-    <td>tentatively ready</td>
+    <td>DR</td>
     <td>Restrictions on operator functions that are explicit object member functions</td>
-    <td align="center">Not resolved</td>
+    <td class="unknown" align="center">Unknown</td>
   </tr>
   <tr class="open" id="2932">
     <td><a href="https://cplusplus.github.io/CWG/issues/2932.html">2932</a></td>
@@ -17466,11 +17450,11 @@ objects</td>
     <td>Value range of empty enumeration</td>
     <td align="center">Not resolved</td>
   </tr>
-  <tr class="open" id="2933">
+  <tr id="2933">
     <td><a href="https://cplusplus.github.io/CWG/issues/2933.html">2933</a></td>
-    <td>open</td>
+    <td>DR</td>
     <td>Dangling references</td>
-    <td align="center">Not resolved</td>
+    <td class="unknown" align="center">Unknown</td>
   </tr>
   <tr class="open" id="2934">
     <td><a href="https://cplusplus.github.io/CWG/issues/2934.html">2934</a></td>
@@ -17484,17 +17468,17 @@ objects</td>
     <td>Destroying the coroutine state when initial-await-resume-called is false</td>
     <td align="center">Not resolved</td>
   </tr>
-  <tr class="open" id="2936">
+  <tr id="2936">
     <td><a href="https://cplusplus.github.io/CWG/issues/2936.html">2936</a></td>
-    <td>open</td>
+    <td>DR</td>
     <td>Local classes of templated functions should be part of the current instantiation</td>
-    <td align="center">Not resolved</td>
+    <td class="unknown" align="center">Unknown</td>
   </tr>
-  <tr class="open" id="2937">
+  <tr id="2937">
     <td><a href="https://cplusplus.github.io/CWG/issues/2937.html">2937</a></td>
-    <td>open</td>
+    <td>DR</td>
     <td>Grammar for <I>preprocessing-file</I> has no normative effect</td>
-    <td align="center">Not resolved</td>
+    <td class="unknown" align="center">Unknown</td>
   </tr>
   <tr class="open" id="2938">
     <td><a href="https://cplusplus.github.io/CWG/issues/2938.html">2938</a></td>
@@ -17502,10 +17486,268 @@ objects</td>
     <td>Inheriting linkage from a previous declaration</td>
     <td align="center">Not resolved</td>
   </tr>
-  <tr class="open" id="2939">
+  <tr id="2939">
     <td><a href="https://cplusplus.github.io/CWG/issues/2939.html">2939</a></td>
-    <td>open</td>
+    <td>DR</td>
     <td>Do not allow <TT>reinterpret_cast</TT> from prvalue to rvalue reference</td>
+    <td class="unknown" align="center">Unknown</td>
+  </tr>
+  <tr class="open" id="2940">
+    <td><a href="https://cplusplus.github.io/CWG/issues/2940.html">2940</a></td>
+    <td>review</td>
+    <td>Definition of "object"</td>
+    <td align="center">Not resolved</td>
+  </tr>
+  <tr class="open" id="2941">
+    <td><a href="https://cplusplus.github.io/CWG/issues/2941.html">2941</a></td>
+    <td>open</td>
+    <td>Lifetime extension for function-style cast to reference type</td>
+    <td align="center">Not resolved</td>
+  </tr>
+  <tr class="open" id="2942">
+    <td><a href="https://cplusplus.github.io/CWG/issues/2942.html">2942</a></td>
+    <td>open</td>
+    <td>Packs in a function's parameter-type-list</td>
+    <td align="center">Not resolved</td>
+  </tr>
+  <tr class="open" id="2943">
+    <td><a href="https://cplusplus.github.io/CWG/issues/2943.html">2943</a></td>
+    <td>open</td>
+    <td>Discarding a void return value</td>
+    <td align="center">Not resolved</td>
+  </tr>
+  <tr id="2944">
+    <td><a href="https://cplusplus.github.io/CWG/issues/2944.html">2944</a></td>
+    <td>DR</td>
+    <td>Unsequenced <I>throw-expression</I>s</td>
+    <td class="unknown" align="center">Unknown</td>
+  </tr>
+  <tr class="open" id="2945">
+    <td><a href="https://cplusplus.github.io/CWG/issues/2945.html">2945</a></td>
+    <td>open</td>
+    <td>Redundant constraints on matching function template declarations</td>
+    <td align="center">Not resolved</td>
+  </tr>
+  <tr class="open" id="2946">
+    <td><a href="https://cplusplus.github.io/CWG/issues/2946.html">2946</a></td>
+    <td>open</td>
+    <td>Dependent call equivalence in non-ADL cases</td>
+    <td align="center">Not resolved</td>
+  </tr>
+  <tr class="open" id="2947">
+    <td><a href="https://cplusplus.github.io/CWG/issues/2947.html">2947</a></td>
+    <td>open</td>
+    <td>Limiting macro expansion in <I>pp-module</I></td>
+    <td align="center">Not resolved</td>
+  </tr>
+  <tr class="open" id="2948">
+    <td><a href="https://cplusplus.github.io/CWG/issues/2948.html">2948</a></td>
+    <td>open</td>
+    <td>Late ambiguity for partial template specialization</td>
+    <td align="center">Not resolved</td>
+  </tr>
+  <tr class="open" id="2949">
+    <td><a href="https://cplusplus.github.io/CWG/issues/2949.html">2949</a></td>
+    <td>open</td>
+    <td>Treatment of ellipsis during partial ordering</td>
+    <td align="center">Not resolved</td>
+  </tr>
+  <tr class="open" id="2950">
+    <td><a href="https://cplusplus.github.io/CWG/issues/2950.html">2950</a></td>
+    <td>open</td>
+    <td>Value preservation in enumeration vs. integer bit-fields</td>
+    <td align="center">Not resolved</td>
+  </tr>
+  <tr class="open" id="2951">
+    <td><a href="https://cplusplus.github.io/CWG/issues/2951.html">2951</a></td>
+    <td>open</td>
+    <td>Distinguishing a primary template</td>
+    <td align="center">Not resolved</td>
+  </tr>
+  <tr class="open" id="2952">
+    <td><a href="https://cplusplus.github.io/CWG/issues/2952.html">2952</a></td>
+    <td>open</td>
+    <td>Vacuous initialization for subobjects</td>
+    <td align="center">Not resolved</td>
+  </tr>
+  <tr class="open" id="2953">
+    <td><a href="https://cplusplus.github.io/CWG/issues/2953.html">2953</a></td>
+    <td>open</td>
+    <td>Value representation for non-trivially-copyable types</td>
+    <td align="center">Not resolved</td>
+  </tr>
+  <tr id="2954">
+    <td><a href="https://cplusplus.github.io/CWG/issues/2954.html">2954</a></td>
+    <td>NAD</td>
+    <td>Simultaneous modifications of an atomic object</td>
+    <td class="unknown" align="center">Unknown</td>
+  </tr>
+  <tr class="open" id="2955">
+    <td><a href="https://cplusplus.github.io/CWG/issues/2955.html">2955</a></td>
+    <td>open</td>
+    <td>Unify rules about conflicting unordered accesses</td>
+    <td align="center">Not resolved</td>
+  </tr>
+  <tr class="open" id="2956">
+    <td><a href="https://cplusplus.github.io/CWG/issues/2956.html">2956</a></td>
+    <td>open</td>
+    <td>Missing allowance for pseudo-destructors in qualified lookup</td>
+    <td align="center">Not resolved</td>
+  </tr>
+  <tr class="open" id="2957">
+    <td><a href="https://cplusplus.github.io/CWG/issues/2957.html">2957</a></td>
+    <td>open</td>
+    <td>Evaluating a reference member should constitute access</td>
+    <td align="center">Not resolved</td>
+  </tr>
+  <tr class="open" id="2958">
+    <td><a href="https://cplusplus.github.io/CWG/issues/2958.html">2958</a></td>
+    <td>open</td>
+    <td>Overload resolution involving lvalue transformation and qualification conversion</td>
+    <td align="center">Not resolved</td>
+  </tr>
+  <tr class="open" id="2959">
+    <td><a href="https://cplusplus.github.io/CWG/issues/2959.html">2959</a></td>
+    <td>open</td>
+    <td>Naming enumerators in class member access expressions</td>
+    <td align="center">Not resolved</td>
+  </tr>
+  <tr class="open" id="2960">
+    <td><a href="https://cplusplus.github.io/CWG/issues/2960.html">2960</a></td>
+    <td>open</td>
+    <td>Introduce discontiguous object lifetime</td>
+    <td align="center">Not resolved</td>
+  </tr>
+  <tr class="open" id="2961">
+    <td><a href="https://cplusplus.github.io/CWG/issues/2961.html">2961</a></td>
+    <td>open</td>
+    <td>Checking of ill-formed types in <I>constraint-expression</I>s</td>
+    <td align="center">Not resolved</td>
+  </tr>
+  <tr class="open" id="2962">
+    <td><a href="https://cplusplus.github.io/CWG/issues/2962.html">2962</a></td>
+    <td>open</td>
+    <td>Evaluation of destructor call for variable with constant destruction</td>
+    <td align="center">Not resolved</td>
+  </tr>
+  <tr class="open" id="2963">
+    <td><a href="https://cplusplus.github.io/CWG/issues/2963.html">2963</a></td>
+    <td>open</td>
+    <td>Paradoxical variable-or-function declaration</td>
+    <td align="center">Not resolved</td>
+  </tr>
+  <tr class="open" id="2964">
+    <td><a href="https://cplusplus.github.io/CWG/issues/2964.html">2964</a></td>
+    <td>open</td>
+    <td>Reading "invalid pointer values"</td>
+    <td align="center">Not resolved</td>
+  </tr>
+  <tr class="open" id="2965">
+    <td><a href="https://cplusplus.github.io/CWG/issues/2965.html">2965</a></td>
+    <td>open</td>
+    <td>Generic lambdas do not have a template parameter scope</td>
+    <td align="center">Not resolved</td>
+  </tr>
+  <tr class="open" id="2966">
+    <td><a href="https://cplusplus.github.io/CWG/issues/2966.html">2966</a></td>
+    <td>open</td>
+    <td>Alignment and value representation of <TT>std::nullptr_t</TT></td>
+    <td align="center">Not resolved</td>
+  </tr>
+  <tr class="open" id="2967">
+    <td><a href="https://cplusplus.github.io/CWG/issues/2967.html">2967</a></td>
+    <td>open</td>
+    <td>Explicit conversion functions</td>
+    <td align="center">Not resolved</td>
+  </tr>
+  <tr class="open" id="2968">
+    <td><a href="https://cplusplus.github.io/CWG/issues/2968.html">2968</a></td>
+    <td>open</td>
+    <td>Name lookup result for <I>typedef-name</I> vs. <I>class-name</I></td>
+    <td align="center">Not resolved</td>
+  </tr>
+  <tr class="open" id="2969">
+    <td><a href="https://cplusplus.github.io/CWG/issues/2969.html">2969</a></td>
+    <td>open</td>
+    <td>Scopes in the <I>function-try-block</I> of a constructor</td>
+    <td align="center">Not resolved</td>
+  </tr>
+  <tr class="open" id="2970">
+    <td><a href="https://cplusplus.github.io/CWG/issues/2970.html">2970</a></td>
+    <td>open</td>
+    <td>Races with <TT>volatile sig_atomic_t</TT> bit-fields</td>
+    <td align="center">Not resolved</td>
+  </tr>
+  <tr class="open" id="2971">
+    <td><a href="https://cplusplus.github.io/CWG/issues/2971.html">2971</a></td>
+    <td>open</td>
+    <td>Specializations for a class are not decl-reachable</td>
+    <td align="center">Not resolved</td>
+  </tr>
+  <tr class="open" id="2972">
+    <td><a href="https://cplusplus.github.io/CWG/issues/2972.html">2972</a></td>
+    <td>open</td>
+    <td>Declarative <I>nested-name-specifier</I> naming a partial specialization</td>
+    <td align="center">Not resolved</td>
+  </tr>
+  <tr class="open" id="2973">
+    <td><a href="https://cplusplus.github.io/CWG/issues/2973.html">2973</a></td>
+    <td>open</td>
+    <td>Does an <I>alias-declaration</I> introduce a name for linkage purposes?</td>
+    <td align="center">Not resolved</td>
+  </tr>
+  <tr class="open" id="2974">
+    <td><a href="https://cplusplus.github.io/CWG/issues/2974.html">2974</a></td>
+    <td>open</td>
+    <td>Non-deduced context for <I>qualified-id</I> naming a template</td>
+    <td align="center">Not resolved</td>
+  </tr>
+  <tr class="open" id="2975">
+    <td><a href="https://cplusplus.github.io/CWG/issues/2975.html">2975</a></td>
+    <td>open</td>
+    <td>Effect of concept <I>template-head</I> on parameter mappings</td>
+    <td align="center">Not resolved</td>
+  </tr>
+  <tr class="open" id="2976">
+    <td><a href="https://cplusplus.github.io/CWG/issues/2976.html">2976</a></td>
+    <td>open</td>
+    <td>Transferring control out of a function</td>
+    <td align="center">Not resolved</td>
+  </tr>
+  <tr class="open" id="2977">
+    <td><a href="https://cplusplus.github.io/CWG/issues/2977.html">2977</a></td>
+    <td>open</td>
+    <td>Initialization with string literals</td>
+    <td align="center">Not resolved</td>
+  </tr>
+  <tr class="open" id="2978">
+    <td><a href="https://cplusplus.github.io/CWG/issues/2978.html">2978</a></td>
+    <td>open</td>
+    <td>Deduction involving reference to similar types</td>
+    <td align="center">Not resolved</td>
+  </tr>
+  <tr class="open" id="2979">
+    <td><a href="https://cplusplus.github.io/CWG/issues/2979.html">2979</a></td>
+    <td>open</td>
+    <td>Duplicate declarations of enumerations in class scope</td>
+    <td align="center">Not resolved</td>
+  </tr>
+  <tr class="open" id="2980">
+    <td><a href="https://cplusplus.github.io/CWG/issues/2980.html">2980</a></td>
+    <td>open</td>
+    <td>Constraints on template template parameters</td>
+    <td align="center">Not resolved</td>
+  </tr>
+  <tr class="open" id="2981">
+    <td><a href="https://cplusplus.github.io/CWG/issues/2981.html">2981</a></td>
+    <td>open</td>
+    <td>Usual arithmetic conversions and result types</td>
+    <td align="center">Not resolved</td>
+  </tr>
+  <tr class="open" id="2982">
+    <td><a href="https://cplusplus.github.io/CWG/issues/2982.html">2982</a></td>
+    <td>open</td>
+    <td>Deduction in <I>type-constraint</I>s</td>
     <td align="center">Not resolved</td>
   </tr></table>
 
diff --git a/clang/www/make_cxx_dr_status b/clang/www/make_cxx_dr_status
index f9a35c6..05de961 100755
--- a/clang/www/make_cxx_dr_status
+++ b/clang/www/make_cxx_dr_status
@@ -83,8 +83,8 @@ else:
 
 status_map = collect_tests()
 drs = get_issues(issue_list_path)
-out_file = open(output, 'w')
-out_file.write('''\
+out_html = []
+out_html.append('''\
 <!DOCTYPE html>
 <!-- This file is auto-generated by make_cxx_dr_status. Do not modify. -->
 <html>
@@ -149,7 +149,7 @@ def availability(issue):
     unresolved_status = unresolved_status_match.group(1)
     proposed_resolution_match = re.search(r' (open|drafting|review|tentatively ready|ready) (\d{4}-\d{2}(?:-\d{2})?|P\d{4}R\d+)$', status)
     if proposed_resolution_match is None:
-      raise AvailabilityError('Issue {}: \'{}\' status should be followed by a paper number (P1234R5) or proposed resolution in YYYY-MM-DD format'.format(dr.issue, unresolved_status))
+      raise AvailabilityError('error: issue {}: \'{}\' status should be followed by a paper number (P1234R5) or proposed resolution in YYYY-MM-DD format'.format(dr.issue, unresolved_status))
     proposed_resolution = proposed_resolution_match.group(2)
     status = status[:-1-len(proposed_resolution)]
     status = status[:-1-len(unresolved_status)]
@@ -169,6 +169,12 @@ def availability(issue):
   elif status.endswith(' c++20'):
     status = status[:-6]
     avail_suffix = ' (C++20 onwards)'
+  elif status.endswith(' c++23'):
+    status = status[:-6]
+    avail_suffix = ' (C++23 onwards)'
+  elif status.endswith(' c++26'):
+    status = status[:-6]
+    avail_suffix = ' (C++26 onwards)'
   if status == 'unknown':
     avail = 'Unknown'
     avail_style = 'unknown'
@@ -230,7 +236,7 @@ def availability(issue):
     avail = 'Duplicate of <a href="#%s">%s</a>' % (dup, dup)
     _, avail_style, _, _ = availability(dup)
   else:
-    raise AvailabilityError('Unknown status %s for issue %s' % (status, dr.issue))
+    raise AvailabilityError('error: unknown status %s for issue %s' % (status, dr.issue))
   return (avail + avail_suffix, avail_style, unresolved_status, details)
 
 count = {}
@@ -260,7 +266,7 @@ for dr in drs:
     else:
       if unresolved_status != dr.status:
         availability_error_occurred = True
-        print("Issue %s is marked '%s', which differs from CWG index status '%s'" \
+        print("error: issue %s is marked '%s', which differs from CWG index status '%s'" \
              % (dr.issue, unresolved_status, dr.status))
         continue
   else:
@@ -274,7 +280,7 @@ for dr in drs:
 
     if unresolved_status:
       availability_error_occurred = True
-      print("Issue %s is marked '%s', even though it is resolved in CWG index" \
+      print("error: issue %s is marked '%s', even though it is resolved in CWG index" \
            % (dr.issue, unresolved_status))
       continue
 
@@ -290,7 +296,7 @@ for dr in drs:
         <summary>{avail}</summary>
         {details}
       </details>'''
-  out_file.write(f'''
+  out_html.append(f'''
   <tr{row_style} id="{dr.issue}">
     <td><a href="https://cplusplus.github.io/CWG/issues/{dr.issue}.html">{dr.issue}</a></td>
     <td>{dr.status}</td>
@@ -304,12 +310,14 @@ if availability_error_occurred:
 for status, num in sorted(count.items()):
   print("%s: %s" % (status, num), file=sys.stderr)
 
-out_file.write('''\
+out_html.append('''\
 </table>
 
 </div>
 </body>
 </html>
 ''')
-out_file.close()
 
+out_file = open(output, 'w')
+out_file.write(''.join(out_html))
+out_file.close()
diff --git a/compiler-rt/cmake/Modules/AddCompilerRT.cmake b/compiler-rt/cmake/Modules/AddCompilerRT.cmake
index 3a67623..c3e734f 100644
--- a/compiler-rt/cmake/Modules/AddCompilerRT.cmake
+++ b/compiler-rt/cmake/Modules/AddCompilerRT.cmake
@@ -389,7 +389,8 @@ function(add_compiler_rt_runtime name type)
         set_target_properties(${libname} PROPERTIES IMPORT_PREFIX "")
         set_target_properties(${libname} PROPERTIES IMPORT_SUFFIX ".lib")
       endif()
-      if (APPLE AND NOT CMAKE_LINKER MATCHES ".*lld.*")
+      find_program(CODESIGN codesign)
+      if (APPLE AND NOT CMAKE_LINKER MATCHES ".*lld.*" AND CODESIGN)
         # Apple's linker signs the resulting dylib with an ad-hoc code signature in
         # most situations, except:
         # 1. Versions of ld64 prior to ld64-609 in Xcode 12 predate this behavior.
@@ -404,7 +405,7 @@ function(add_compiler_rt_runtime name type)
         # argument and looking for `invalid argument "linker-signed"` in its output.
         # FIXME: Remove this once all supported toolchains support `-o linker-signed`.
         execute_process(
-          COMMAND sh -c "codesign -f -s - -o linker-signed this-does-not-exist 2>&1 | grep -q linker-signed"
+          COMMAND sh -c "${CODESIGN} -f -s - -o linker-signed this-does-not-exist 2>&1 | grep -q linker-signed"
           RESULT_VARIABLE CODESIGN_SUPPORTS_LINKER_SIGNED
         )
 
@@ -415,7 +416,7 @@ function(add_compiler_rt_runtime name type)
 
         add_custom_command(TARGET ${libname}
           POST_BUILD
-          COMMAND codesign --sign - ${EXTRA_CODESIGN_ARGUMENTS} $<TARGET_FILE:${libname}>
+          COMMAND ${CODESIGN} --sign - ${EXTRA_CODESIGN_ARGUMENTS} $<TARGET_FILE:${libname}>
           WORKING_DIRECTORY ${COMPILER_RT_OUTPUT_LIBRARY_DIR}
           COMMAND_EXPAND_LISTS
         )
diff --git a/compiler-rt/cmake/base-config-ix.cmake b/compiler-rt/cmake/base-config-ix.cmake
index 9717c21..d92bc0e 100644
--- a/compiler-rt/cmake/base-config-ix.cmake
+++ b/compiler-rt/cmake/base-config-ix.cmake
@@ -89,6 +89,11 @@ else()
   set(COMPILER_RT_TEST_COMPILER_ID GNU)
 endif()
 
+# AppleClang expects 'Clang' as compiler-rt test compiler ID.
+if ("${COMPILER_RT_TEST_COMPILER_ID}" STREQUAL "AppleClang")
+  set(COMPILER_RT_TEST_COMPILER_ID Clang)
+endif()
+
 if(NOT DEFINED COMPILER_RT_OS_DIR)
   if(ANDROID)
     # The CMAKE_SYSTEM_NAME for Android is Android, but the OS is Linux and the
diff --git a/compiler-rt/cmake/builtin-config-ix.cmake b/compiler-rt/cmake/builtin-config-ix.cmake
index 706a1ff7..b1bde47 100644
--- a/compiler-rt/cmake/builtin-config-ix.cmake
+++ b/compiler-rt/cmake/builtin-config-ix.cmake
@@ -43,9 +43,9 @@ asm(\"cas w0, w1, [x2]\");
 builtin_check_c_compiler_source(COMPILER_RT_HAS_AARCH64_SME
 "
 void foo(void)  __arm_streaming_compatible {
-  asm(\".arch armv9-a+sme2\");
-  asm(\"smstart\");
-  asm(\"ldr zt0, [sp]\");
+  asm(\".arch armv9-a+sme2\\n\"
+      \"smstart\\n\"
+      \"ldr zt0, [sp]\");
 }
 ")
 
diff --git a/compiler-rt/lib/asan/CMakeLists.txt b/compiler-rt/lib/asan/CMakeLists.txt
index 5ec995a..e2f39f2 100644
--- a/compiler-rt/lib/asan/CMakeLists.txt
+++ b/compiler-rt/lib/asan/CMakeLists.txt
@@ -260,7 +260,6 @@ else()
       STATIC
       ARCHS ${ASAN_SUPPORTED_ARCH}
       OBJECT_LIBS RTAsan_cxx
-                  RTUbsan_cxx
       CFLAGS ${ASAN_CFLAGS}
       DEFS ${ASAN_COMMON_DEFINITIONS}
       PARENT_TARGET asan)
diff --git a/compiler-rt/lib/asan/tests/CMakeLists.txt b/compiler-rt/lib/asan/tests/CMakeLists.txt
index 998e0ff..00dcbf6 100644
--- a/compiler-rt/lib/asan/tests/CMakeLists.txt
+++ b/compiler-rt/lib/asan/tests/CMakeLists.txt
@@ -275,8 +275,7 @@ if(COMPILER_RT_CAN_EXECUTE_TESTS AND NOT ANDROID)
         $<TARGET_OBJECTS:RTSanitizerCommonSymbolizer.${arch}>
         $<TARGET_OBJECTS:RTSanitizerCommonSymbolizerInternal.${arch}>
         $<TARGET_OBJECTS:RTLSanCommon.${arch}>
-        $<TARGET_OBJECTS:RTUbsan.${arch}>
-        $<TARGET_OBJECTS:RTUbsan_cxx.${arch}>)
+        $<TARGET_OBJECTS:RTUbsan.${arch}>)
     endif()
     add_library(${ASAN_TEST_RUNTIME} STATIC ${ASAN_TEST_RUNTIME_OBJECTS})
     set_target_properties(${ASAN_TEST_RUNTIME} PROPERTIES
@@ -303,7 +302,6 @@ if(ANDROID)
       $<TARGET_OBJECTS:RTSanitizerCommonSymbolizerInternal.${arch}>
       $<TARGET_OBJECTS:RTLSanCommon.${arch}>
       $<TARGET_OBJECTS:RTUbsan.${arch}>
-      $<TARGET_OBJECTS:RTUbsan_cxx.${arch}>
       ${COMPILER_RT_GTEST_SOURCE}
       ${ASAN_NOINST_TEST_SOURCES})
     set_target_compile_flags(AsanNoinstTest ${ASAN_UNITTEST_COMMON_CFLAGS})
diff --git a/compiler-rt/lib/builtins/CMakeLists.txt b/compiler-rt/lib/builtins/CMakeLists.txt
index 3a868c1..19316c5 100644
--- a/compiler-rt/lib/builtins/CMakeLists.txt
+++ b/compiler-rt/lib/builtins/CMakeLists.txt
@@ -310,6 +310,7 @@ set(x86_80_BIT_SOURCES
   mulxc3.c
   powixf2.c
   trunctfxf2.c
+  truncxfhf2.c
 )
 
 if (NOT MSVC)
@@ -575,7 +576,11 @@ set(aarch64_SOURCES
 set(COMPILER_RT_AARCH64_FMV_USES_GLOBAL_CONSTRUCTOR NOT(FUCHSIA OR APPLE))
 
 if (COMPILER_RT_HAS_AARCH64_SME)
-  if (NOT COMPILER_RT_DISABLE_AARCH64_FMV AND COMPILER_RT_HAS_FNO_BUILTIN_FLAG AND COMPILER_RT_AARCH64_FMV_USES_GLOBAL_CONSTRUCTOR)
+  if (APPLE)
+    list(APPEND aarch64_SOURCES aarch64/arm_apple_sme_abi.s)
+    set_source_files_properties(aarch64/arm_apple_sme_abi.s PROPERTIES COMPILE_FLAGS -march=armv8a+sme)
+    message(STATUS "AArch64 Apple SME ABI routines enabled")
+  elseif (NOT COMPILER_RT_DISABLE_AARCH64_FMV AND COMPILER_RT_HAS_FNO_BUILTIN_FLAG AND COMPILER_RT_AARCH64_FMV_USES_GLOBAL_CONSTRUCTOR)
     list(APPEND aarch64_SOURCES aarch64/sme-abi.S aarch64/sme-libc-mem-routines.S aarch64/sme-abi-assert.c aarch64/sme-libc-routines.c)
     message(STATUS "AArch64 SME ABI routines enabled")
     set_source_files_properties(aarch64/sme-libc-routines.c PROPERTIES COMPILE_FLAGS "-fno-builtin")
diff --git a/compiler-rt/lib/builtins/aarch64/arm_apple_sme_abi.s b/compiler-rt/lib/builtins/aarch64/arm_apple_sme_abi.s
new file mode 100644
index 0000000..f0ccaaf
--- /dev/null
+++ b/compiler-rt/lib/builtins/aarch64/arm_apple_sme_abi.s
@@ -0,0 +1,129 @@
+#include "../assembly.h"
+
+.arch armv8-a+sme2
+
+// For Apple platforms at the moment, we just call abort() directly
+// after stopping SM mode unconditionally.
+.p2align 2
+DEFINE_COMPILERRT_PRIVATE_FUNCTION(do_abort)
+.cfi_startproc
+	.variant_pcs	SYMBOL_NAME(do_abort)
+	stp	x29, x30, [sp, #-32]!
+  .cfi_def_cfa_offset 32
+  .cfi_offset w30, -24
+  .cfi_offset w29, -32
+	smstop sm
+	bl	SYMBOL_NAME(abort)
+.cfi_endproc
+END_COMPILERRT_FUNCTION(do_abort)
+
+DEFINE_COMPILERRT_FUNCTION(__arm_tpidr2_save)
+  // If TPIDR2_EL0 is null, the subroutine does nothing.
+  mrs x16, TPIDR2_EL0
+  cbz x16, 1f
+
+  // If any of the reserved bytes in the first 16 bytes of the TPIDR2 block are
+  // nonzero, the subroutine [..] aborts in some platform-defined manner.
+  ldrh  w14, [x16, #10]
+  cbnz  w14, 2f
+  ldr w14, [x16, #12]
+  cbnz  w14, 2f
+
+  // If za_save_buffer is NULL, the subroutine does nothing.
+  ldr x14, [x16]
+  cbz x14, 1f
+
+  // If num_za_save_slices is zero, the subroutine does nothing.
+  ldrh  w14, [x16, #8]
+  cbz x14, 1f
+
+  mov x15, xzr
+  ldr x16, [x16]
+0:
+  str za[w15,0], [x16]
+  addsvl x16, x16, #1
+  add x15, x15, #1
+  cmp x14, x15
+  b.ne  0b
+1:
+  ret
+2:
+  b  SYMBOL_NAME(do_abort)
+END_COMPILERRT_FUNCTION(__arm_tpidr2_save)
+
+.p2align 2
+DEFINE_COMPILERRT_FUNCTION(__arm_za_disable)
+.cfi_startproc
+  // Otherwise, the subroutine behaves as if it did the following:
+  // * Call __arm_tpidr2_save.
+  stp x29, x30, [sp, #-16]!
+  .cfi_def_cfa_offset 16
+  mov x29, sp
+  .cfi_def_cfa w29, 16
+  .cfi_offset w30, -8
+  .cfi_offset w29, -16
+  bl  SYMBOL_NAME(__arm_tpidr2_save)
+
+  // * Set TPIDR2_EL0 to null.
+  msr TPIDR2_EL0, xzr
+
+  // * Set PSTATE.ZA to 0.
+  smstop za
+
+  .cfi_def_cfa wsp, 16
+  ldp x29, x30, [sp], #16
+  .cfi_def_cfa_offset 0
+  .cfi_restore w30
+  .cfi_restore w29
+0:
+  ret
+.cfi_endproc
+END_COMPILERRT_FUNCTION(__arm_za_disable)
+
+.p2align 2
+DEFINE_COMPILERRT_FUNCTION(__arm_tpidr2_restore)
+.cfi_startproc
+  .variant_pcs	SYMBOL_NAME(__arm_tpidr2_restore)
+  // If TPIDR2_EL0 is nonnull, the subroutine aborts in some platform-specific
+  // manner.
+  mrs x14, TPIDR2_EL0
+  cbnz  x14, 2f
+
+  // If any of the reserved bytes in the first 16 bytes of BLK are nonzero,
+  // the subroutine [..] aborts in some platform-defined manner.
+  ldrh  w14, [x0, #10]
+  cbnz  w14, 2f
+  ldr w14, [x0, #12]
+  cbnz  w14, 2f
+
+  // If BLK.za_save_buffer is NULL, the subroutine does nothing.
+  ldr x16, [x0]
+  cbz x16, 1f
+
+  // If BLK.num_za_save_slices is zero, the subroutine does nothing.
+  ldrh  w14, [x0, #8]
+  cbz x14, 1f
+
+  mov x15, xzr
+0:
+  ldr za[w15,0], [x16]
+  addsvl x16, x16, #1
+  add x15, x15, #1
+  cmp x14, x15
+  b.ne  0b
+1:
+  ret
+2:
+  b  SYMBOL_NAME(do_abort)
+.cfi_endproc
+END_COMPILERRT_FUNCTION(__arm_tpidr2_restore)
+
+.p2align 2
+DEFINE_COMPILERRT_FUNCTION(__arm_sme_state)
+	.variant_pcs	SYMBOL_NAME(__arm_sme_state)
+  orr x0, x0, #0xC000000000000000
+  mrs x16, SVCR
+  bfxil x0, x16, #0, #2
+  mrs x1, TPIDR2_EL0
+  ret
+END_COMPILERRT_FUNCTION(__arm_sme_state)
diff --git a/compiler-rt/lib/builtins/truncxfhf2.c b/compiler-rt/lib/builtins/truncxfhf2.c
new file mode 100644
index 0000000..0f06398
--- /dev/null
+++ b/compiler-rt/lib/builtins/truncxfhf2.c
@@ -0,0 +1,15 @@
+//===-- lib/truncsfhf2.c - long double -> half conversion ---------*- C -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#define SRC_SINGLE
+#define DST_HALF
+#include "fp_trunc_impl.inc"
+
+COMPILER_RT_ABI dst_t __truncxfhf2(xf_float a) {
+  return __truncXfYf2__((float)a);
+}
diff --git a/compiler-rt/lib/gwp_asan/tests/harness.h b/compiler-rt/lib/gwp_asan/tests/harness.h
index c96f846..3fbcf99 100644
--- a/compiler-rt/lib/gwp_asan/tests/harness.h
+++ b/compiler-rt/lib/gwp_asan/tests/harness.h
@@ -12,7 +12,9 @@
 #include <stdarg.h>
 
 #if defined(__Fuchsia__)
+#ifndef ZXTEST_USE_STREAMABLE_MACROS
 #define ZXTEST_USE_STREAMABLE_MACROS
+#endif
 #include <zxtest/zxtest.h>
 namespace testing = zxtest;
 // zxtest defines a different ASSERT_DEATH, taking a lambda and an error message
diff --git a/compiler-rt/lib/hwasan/CMakeLists.txt b/compiler-rt/lib/hwasan/CMakeLists.txt
index afafa0c..2073941 100644
--- a/compiler-rt/lib/hwasan/CMakeLists.txt
+++ b/compiler-rt/lib/hwasan/CMakeLists.txt
@@ -188,7 +188,6 @@ function(add_hwasan_runtimes arch use_aliases)
     STATIC
     ARCHS ${arch}
     OBJECT_LIBS RTHwasan_cxx
-                RTUbsan_cxx
     CFLAGS ${hwasan_rtl_flags}
     PARENT_TARGET hwasan)
 
diff --git a/compiler-rt/lib/msan/CMakeLists.txt b/compiler-rt/lib/msan/CMakeLists.txt
index b9976b2..a0b9c61 100644
--- a/compiler-rt/lib/msan/CMakeLists.txt
+++ b/compiler-rt/lib/msan/CMakeLists.txt
@@ -66,7 +66,6 @@ foreach(arch ${MSAN_SUPPORTED_ARCH})
     STATIC
     ARCHS ${arch}
     SOURCES ${MSAN_RTL_CXX_SOURCES}
-            $<TARGET_OBJECTS:RTUbsan_cxx.${arch}>
     ADDITIONAL_HEADERS ${MSAN_RTL_HEADERS}
     CFLAGS ${MSAN_RTL_CFLAGS}
     PARENT_TARGET msan)
diff --git a/compiler-rt/lib/rtsan/rtsan_interceptors_posix.cpp b/compiler-rt/lib/rtsan/rtsan_interceptors_posix.cpp
index 4e51f46..6a5f4b9 100644
--- a/compiler-rt/lib/rtsan/rtsan_interceptors_posix.cpp
+++ b/compiler-rt/lib/rtsan/rtsan_interceptors_posix.cpp
@@ -292,11 +292,34 @@ INTERCEPTOR(int, fputs, const char *s, FILE *stream) {
   return REAL(fputs)(s, stream);
 }
 
+INTERCEPTOR(int, fflush, FILE *stream) {
+  __rtsan_notify_intercepted_call("fflush");
+  return REAL(fflush)(stream);
+}
+
+#if SANITIZER_APPLE
+INTERCEPTOR(int, fpurge, FILE *stream) {
+  __rtsan_notify_intercepted_call("fpurge");
+  return REAL(fpurge)(stream);
+}
+#endif
+
 INTERCEPTOR(FILE *, fdopen, int fd, const char *mode) {
   __rtsan_notify_intercepted_call("fdopen");
   return REAL(fdopen)(fd, mode);
 }
 
+#if SANITIZER_INTERCEPT_FOPENCOOKIE
+INTERCEPTOR(FILE *, fopencookie, void *cookie, const char *mode,
+            cookie_io_functions_t funcs) {
+  __rtsan_notify_intercepted_call("fopencookie");
+  return REAL(fopencookie)(cookie, mode, funcs);
+}
+#define RTSAN_MAYBE_INTERCEPT_FOPENCOOKIE INTERCEPT_FUNCTION(fopencookie)
+#else
+#define RTSAN_MAYBE_INTERCEPT_FOPENCOOKIE
+#endif
+
 #if SANITIZER_INTERCEPT_OPEN_MEMSTREAM
 INTERCEPTOR(FILE *, open_memstream, char **buf, size_t *size) {
   __rtsan_notify_intercepted_call("open_memstream");
@@ -314,6 +337,45 @@ INTERCEPTOR(FILE *, fmemopen, void *buf, size_t size, const char *mode) {
 #define RTSAN_MAYBE_INTERCEPT_FMEMOPEN
 #endif
 
+#if SANITIZER_INTERCEPT_SETVBUF
+INTERCEPTOR(void, setbuf, FILE *stream, char *buf) {
+  __rtsan_notify_intercepted_call("setbuf");
+  return REAL(setbuf)(stream, buf);
+}
+
+INTERCEPTOR(int, setvbuf, FILE *stream, char *buf, int mode, size_t size) {
+  __rtsan_notify_intercepted_call("setvbuf");
+  return REAL(setvbuf)(stream, buf, mode, size);
+}
+
+#if SANITIZER_LINUX
+INTERCEPTOR(void, setlinebuf, FILE *stream) {
+#else
+INTERCEPTOR(int, setlinebuf, FILE *stream) {
+#endif
+  __rtsan_notify_intercepted_call("setlinebuf");
+  return REAL(setlinebuf)(stream);
+}
+
+#if SANITIZER_LINUX
+INTERCEPTOR(void, setbuffer, FILE *stream, char *buf, size_t size) {
+#else
+INTERCEPTOR(void, setbuffer, FILE *stream, char *buf, int size) {
+#endif
+  __rtsan_notify_intercepted_call("setbuffer");
+  return REAL(setbuffer)(stream, buf, size);
+}
+#define RTSAN_MAYBE_INTERCEPT_SETBUF INTERCEPT_FUNCTION(setbuf)
+#define RTSAN_MAYBE_INTERCEPT_SETVBUF INTERCEPT_FUNCTION(setvbuf)
+#define RTSAN_MAYBE_INTERCEPT_SETLINEBUF INTERCEPT_FUNCTION(setlinebuf)
+#define RTSAN_MAYBE_INTERCEPT_SETBUFFER INTERCEPT_FUNCTION(setbuffer)
+#else
+#define RTSAN_MAYBE_INTERCEPT_SETBUF
+#define RTSAN_MAYBE_INTERCEPT_SETVBUF
+#define RTSAN_MAYBE_INTERCEPT_SETLINEBUF
+#define RTSAN_MAYBE_INTERCEPT_SETBUFFER
+#endif
+
 INTERCEPTOR(int, puts, const char *s) {
   __rtsan_notify_intercepted_call("puts");
   return REAL(puts)(s);
@@ -970,10 +1032,16 @@ void __rtsan::InitializeInterceptors() {
   RTSAN_MAYBE_INTERCEPT_CREAT64;
   INTERCEPT_FUNCTION(puts);
   INTERCEPT_FUNCTION(fputs);
+  INTERCEPT_FUNCTION(fflush);
   INTERCEPT_FUNCTION(fdopen);
   INTERCEPT_FUNCTION(freopen);
+  RTSAN_MAYBE_INTERCEPT_FOPENCOOKIE;
   RTSAN_MAYBE_INTERCEPT_OPEN_MEMSTREAM;
   RTSAN_MAYBE_INTERCEPT_FMEMOPEN;
+  RTSAN_MAYBE_INTERCEPT_SETBUF;
+  RTSAN_MAYBE_INTERCEPT_SETVBUF;
+  RTSAN_MAYBE_INTERCEPT_SETLINEBUF;
+  RTSAN_MAYBE_INTERCEPT_SETBUFFER;
   INTERCEPT_FUNCTION(lseek);
   RTSAN_MAYBE_INTERCEPT_LSEEK64;
   INTERCEPT_FUNCTION(dup);
diff --git a/compiler-rt/lib/rtsan/tests/rtsan_test_interceptors_posix.cpp b/compiler-rt/lib/rtsan/tests/rtsan_test_interceptors_posix.cpp
index b052dd8..5488d3c7 100644
--- a/compiler-rt/lib/rtsan/tests/rtsan_test_interceptors_posix.cpp
+++ b/compiler-rt/lib/rtsan/tests/rtsan_test_interceptors_posix.cpp
@@ -353,6 +353,31 @@ TEST_F(RtsanFileTest, FopenDiesWhenRealtime) {
   ExpectNonRealtimeSurvival(Func);
 }
 
+#if SANITIZER_INTERCEPT_FOPENCOOKIE
+TEST_F(RtsanFileTest, FopenCookieDieWhenRealtime) {
+  FILE *f = fopen(GetTemporaryFilePath(), "w");
+  EXPECT_THAT(f, Ne(nullptr));
+  struct fholder {
+    FILE *fp;
+    size_t read;
+  } fh = {f, 0};
+  auto CookieRead = [](void *cookie, char *buf, size_t size) {
+    fholder *p = reinterpret_cast<fholder *>(cookie);
+    p->read = fread(static_cast<void *>(buf), 1, size, p->fp);
+    EXPECT_NE(0u, p->read);
+  };
+  cookie_io_functions_t funcs = {(cookie_read_function_t *)&CookieRead, nullptr,
+                                 nullptr, nullptr};
+  auto Func = [&fh, &funcs]() {
+    FILE *f = fopencookie(&fh, "w", funcs);
+    EXPECT_THAT(f, Ne(nullptr));
+  };
+
+  ExpectRealtimeDeath(Func, "fopencookie");
+  ExpectNonRealtimeSurvival(Func);
+}
+#endif
+
 #if SANITIZER_INTERCEPT_OPEN_MEMSTREAM
 TEST_F(RtsanFileTest, OpenMemstreamDiesWhenRealtime) {
   char *buffer;
@@ -378,6 +403,56 @@ TEST_F(RtsanFileTest, FmemOpenDiesWhenRealtime) {
 }
 #endif
 
+#if SANITIZER_INTERCEPT_SETVBUF
+TEST_F(RtsanFileTest, SetbufDieWhenRealtime) {
+  char buffer[BUFSIZ];
+  FILE *f = fopen(GetTemporaryFilePath(), "w");
+  EXPECT_THAT(f, Ne(nullptr));
+
+  auto Func = [f, &buffer]() { setbuf(f, buffer); };
+
+  ExpectRealtimeDeath(Func, "setbuf");
+  ExpectNonRealtimeSurvival(Func);
+}
+
+TEST_F(RtsanFileTest, SetvbufDieWhenRealtime) {
+  char buffer[1024];
+  size_t size = sizeof(buffer);
+  FILE *f = fopen(GetTemporaryFilePath(), "w");
+  EXPECT_THAT(f, Ne(nullptr));
+
+  auto Func = [f, &buffer, size]() {
+    int r = setvbuf(f, buffer, _IOFBF, size);
+    EXPECT_THAT(r, Eq(0));
+  };
+
+  ExpectRealtimeDeath(Func, "setvbuf");
+  ExpectNonRealtimeSurvival(Func);
+}
+
+TEST_F(RtsanFileTest, SetlinebufDieWhenRealtime) {
+  FILE *f = fopen(GetTemporaryFilePath(), "w");
+  EXPECT_THAT(f, Ne(nullptr));
+
+  auto Func = [f]() { setlinebuf(f); };
+
+  ExpectRealtimeDeath(Func, "setlinebuf");
+  ExpectNonRealtimeSurvival(Func);
+}
+
+TEST_F(RtsanFileTest, SetbufferDieWhenRealtime) {
+  char buffer[1024];
+  size_t size = sizeof(buffer);
+  FILE *f = fopen(GetTemporaryFilePath(), "w");
+  EXPECT_THAT(f, Ne(nullptr));
+
+  auto Func = [f, &buffer, size]() { setbuffer(f, buffer, size); };
+
+  ExpectRealtimeDeath(Func, "setbuffer");
+  ExpectNonRealtimeSurvival(Func);
+}
+#endif
+
 class RtsanOpenedFileTest : public RtsanFileTest {
 protected:
   void SetUp() override {
@@ -579,6 +654,34 @@ TEST_F(RtsanOpenedFileTest, FputsDiesWhenRealtime) {
   ExpectNonRealtimeSurvival(Func);
 }
 
+TEST_F(RtsanFileTest, FflushDiesWhenRealtime) {
+  FILE *f = fopen(GetTemporaryFilePath(), "w");
+  EXPECT_THAT(f, Ne(nullptr));
+  int written = fwrite("abc", 1, 3, f);
+  EXPECT_THAT(written, Eq(3));
+  auto Func = [&f]() {
+    int res = fflush(f);
+    EXPECT_THAT(res, Eq(0));
+  };
+  ExpectRealtimeDeath(Func, "fflush");
+  ExpectNonRealtimeSurvival(Func);
+}
+
+#if SANITIZER_APPLE
+TEST_F(RtsanFileTest, FpurgeDiesWhenRealtime) {
+  FILE *f = fopen(GetTemporaryFilePath(), "w");
+  EXPECT_THAT(f, Ne(nullptr));
+  int written = fwrite("abc", 1, 3, f);
+  EXPECT_THAT(written, Eq(3));
+  auto Func = [&f]() {
+    int res = fpurge(f);
+    EXPECT_THAT(res, Eq(0));
+  };
+  ExpectRealtimeDeath(Func, "fpurge");
+  ExpectNonRealtimeSurvival(Func);
+}
+#endif
+
 TEST_F(RtsanOpenedFileTest, ReadDiesWhenRealtime) {
   auto Func = [this]() {
     char c{};
diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_common_interceptors.inc b/compiler-rt/lib/sanitizer_common/sanitizer_common_interceptors.inc
index 47436a6..24a8a2d 100644
--- a/compiler-rt/lib/sanitizer_common/sanitizer_common_interceptors.inc
+++ b/compiler-rt/lib/sanitizer_common/sanitizer_common_interceptors.inc
@@ -520,14 +520,14 @@ INTERCEPTOR(int, strncmp, const char *s1, const char *s2, usize size) {
   void *ctx;
   COMMON_INTERCEPTOR_ENTER(ctx, strncmp, s1, s2, size);
   unsigned char c1 = 0, c2 = 0;
-  uptr i;
+  usize i;
   for (i = 0; i < size; i++) {
     c1 = (unsigned char)s1[i];
     c2 = (unsigned char)s2[i];
     if (c1 != c2 || c1 == '\0') break;
   }
-  uptr i1 = i;
-  uptr i2 = i;
+  usize i1 = i;
+  usize i2 = i;
   if (common_flags()->strict_string_checks) {
     for (; i1 < size && s1[i1]; i1++) {}
     for (; i2 < size && s2[i2]; i2++) {}
@@ -583,14 +583,14 @@ INTERCEPTOR(int, strncasecmp, const char *s1, const char *s2, SIZE_T size) {
   void *ctx;
   COMMON_INTERCEPTOR_ENTER(ctx, strncasecmp, s1, s2, size);
   unsigned char c1 = 0, c2 = 0;
-  uptr i;
+  usize i;
   for (i = 0; i < size; i++) {
     c1 = (unsigned char)s1[i];
     c2 = (unsigned char)s2[i];
     if (CharCaseCmp(c1, c2) != 0 || c1 == '\0') break;
   }
-  uptr i1 = i;
-  uptr i2 = i;
+  usize i1 = i;
+  usize i2 = i;
   if (common_flags()->strict_string_checks) {
     for (; i1 < size && s1[i1]; i1++) {}
     for (; i2 < size && s2[i2]; i2++) {}
@@ -851,7 +851,7 @@ int MemcmpInterceptorCommon(void *ctx,
       unsigned char c1 = 0, c2 = 0;
       const unsigned char *s1 = (const unsigned char*)a1;
       const unsigned char *s2 = (const unsigned char*)a2;
-      uptr i;
+      usize i;
       for (i = 0; i < size; i++) {
         c1 = s1[i];
         c2 = s2[i];
diff --git a/compiler-rt/lib/tsan/rtl/CMakeLists.txt b/compiler-rt/lib/tsan/rtl/CMakeLists.txt
index f40e72d..d7d8470 100644
--- a/compiler-rt/lib/tsan/rtl/CMakeLists.txt
+++ b/compiler-rt/lib/tsan/rtl/CMakeLists.txt
@@ -259,7 +259,6 @@ else()
       STATIC
       ARCHS ${arch}
       SOURCES ${TSAN_CXX_SOURCES}
-              $<TARGET_OBJECTS:RTUbsan_cxx.${arch}>
       ADDITIONAL_HEADERS ${TSAN_HEADERS}
       CFLAGS ${TSAN_RTL_CFLAGS}
       PARENT_TARGET tsan)
diff --git a/compiler-rt/lib/tysan/CMakeLists.txt b/compiler-rt/lib/tysan/CMakeLists.txt
index 859b679..7d13ae3 100644
--- a/compiler-rt/lib/tysan/CMakeLists.txt
+++ b/compiler-rt/lib/tysan/CMakeLists.txt
@@ -3,11 +3,25 @@ include_directories(..)
 # Runtime library sources and build flags.
 set(TYSAN_SOURCES
   tysan.cpp
-  tysan_interceptors.cpp)
+  tysan_interceptors.cpp
+  )
+
+SET(TYSAN_HEADERS
+  tysan.h
+  tysan_flags.inc
+  tysan_platform.h
+  )
+
 set(TYSAN_COMMON_CFLAGS ${SANITIZER_COMMON_CFLAGS})
 append_rtti_flag(OFF TYSAN_COMMON_CFLAGS)
 # Prevent clang from generating libc calls.
 append_list_if(COMPILER_RT_HAS_FFREESTANDING_FLAG -ffreestanding TYSAN_COMMON_CFLAGS)
+set(TYSAN_DYNAMIC_CFLAGS ${TYSAN_COMMON_CFLAGS})
+
+set(TYSAN_COMMON_DEFINITIONS "")
+set(TYSAN_DYNAMIC_DEFINITIONS ${TYSAN_COMMON_DEFINITIONS} TYSAN_DYNAMIC=1)
+
+# Compile TYSan sources into an object library.
 
 add_compiler_rt_object_libraries(RTTysan_dynamic
   OS ${SANITIZER_COMMON_SUPPORTED_OS}
@@ -47,17 +61,18 @@ if(APPLE)
     DEFS ${TYSAN_COMMON_DEFINITIONS}
     PARENT_TARGET tysan)
 else()
+  set(TYSAN_CFLAGS ${TYSAN_COMMON_CFLAGS})
+  append_list_if(COMPILER_RT_HAS_FPIE_FLAG -fPIE TYSAN_CFLAGS)
+
   foreach(arch ${TYSAN_SUPPORTED_ARCH})
-    set(TYSAN_CFLAGS ${TYSAN_COMMON_CFLAGS})
-    append_list_if(COMPILER_RT_HAS_FPIE_FLAG -fPIE TYSAN_CFLAGS)
     add_compiler_rt_runtime(clang_rt.tysan
       STATIC
       ARCHS ${arch}
       SOURCES ${TYSAN_SOURCES}
-              $<TARGET_OBJECTS:RTInterception.${arch}>
-              $<TARGET_OBJECTS:RTSanitizerCommon.${arch}>
-              $<TARGET_OBJECTS:RTSanitizerCommonLibc.${arch}>
-              $<TARGET_OBJECTS:RTSanitizerCommonSymbolizer.${arch}>
+      OBJECT_LIBS RTInterception
+                  RTSanitizerCommon
+                  RTSanitizerCommonLibc
+                  RTSanitizerCommonSymbolizer
       CFLAGS ${TYSAN_CFLAGS}
       PARENT_TARGET tysan)
   endforeach()
diff --git a/compiler-rt/lib/tysan/tysan.cpp b/compiler-rt/lib/tysan/tysan.cpp
index 39d78e7..9c87b47 100644
--- a/compiler-rt/lib/tysan/tysan.cpp
+++ b/compiler-rt/lib/tysan/tysan.cpp
@@ -197,10 +197,14 @@ static void reportError(void *Addr, int Size, tysan_type_descriptor *TD,
     Printf("\n");
 
   if (pc) {
+    uptr top = 0;
+    uptr bottom = 0;
+    if (flags().print_stacktrace)
+      GetThreadStackTopAndBottom(false, &top, &bottom);
 
     bool request_fast = StackTrace::WillUseFastUnwind(true);
     BufferedStackTrace ST;
-    ST.Unwind(kStackTraceMax, pc, bp, 0, 0, 0, request_fast);
+    ST.Unwind(kStackTraceMax, pc, bp, 0, top, bottom, request_fast);
     ST.Print();
   } else {
     Printf("\n");
diff --git a/compiler-rt/lib/tysan/tysan_flags.inc b/compiler-rt/lib/tysan/tysan_flags.inc
index 98b6591..be65c8e 100644
--- a/compiler-rt/lib/tysan/tysan_flags.inc
+++ b/compiler-rt/lib/tysan/tysan_flags.inc
@@ -15,3 +15,6 @@
 
 // TYSAN_FLAG(Type, Name, DefaultValue, Description)
 // See COMMON_FLAG in sanitizer_flags.inc for more details.
+
+TYSAN_FLAG(bool, print_stacktrace, false,
+           "Include full stacktrace into an error report")
diff --git a/compiler-rt/lib/ubsan/CMakeLists.txt b/compiler-rt/lib/ubsan/CMakeLists.txt
index 5d45a53..a6c98c4 100644
--- a/compiler-rt/lib/ubsan/CMakeLists.txt
+++ b/compiler-rt/lib/ubsan/CMakeLists.txt
@@ -43,18 +43,15 @@ include_directories(..)
 set(UBSAN_CFLAGS ${SANITIZER_COMMON_CFLAGS})
 append_list_if(MSVC /Zl UBSAN_CFLAGS)
 append_rtti_flag(OFF UBSAN_CFLAGS)
-append_list_if(SANITIZER_CAN_USE_CXXABI -DUBSAN_CAN_USE_CXXABI UBSAN_CFLAGS)
 
 # Too many existing bugs, needs cleanup.
 append_list_if(COMPILER_RT_HAS_WNO_FORMAT -Wno-format UBSAN_CFLAGS)
 
 set(UBSAN_STANDALONE_CFLAGS ${SANITIZER_COMMON_CFLAGS})
 append_rtti_flag(OFF UBSAN_STANDALONE_CFLAGS)
-append_list_if(SANITIZER_CAN_USE_CXXABI -DUBSAN_CAN_USE_CXXABI UBSAN_STANDALONE_CFLAGS)
 
 set(UBSAN_CXXFLAGS ${SANITIZER_COMMON_CFLAGS})
 append_rtti_flag(ON UBSAN_CXXFLAGS)
-append_list_if(SANITIZER_CAN_USE_CXXABI -DUBSAN_CAN_USE_CXXABI UBSAN_CXXFLAGS)
 
 # Silence warnings in system headers with MSVC.
 if(NOT CLANG_CL)
diff --git a/compiler-rt/lib/ubsan/ubsan_handlers.cpp b/compiler-rt/lib/ubsan/ubsan_handlers.cpp
index ac7001c..63319f4 100644
--- a/compiler-rt/lib/ubsan/ubsan_handlers.cpp
+++ b/compiler-rt/lib/ubsan/ubsan_handlers.cpp
@@ -899,10 +899,7 @@ static void handleCFIBadIcall(CFICheckFailData *Data, ValueHandle Function,
 
 namespace __ubsan {
 
-#ifdef UBSAN_CAN_USE_CXXABI
-
 #ifdef _WIN32
-
 extern "C" void __ubsan_handle_cfi_bad_type_default(CFICheckFailData *Data,
                                                     ValueHandle Vtable,
                                                     bool ValidVtable,
@@ -911,20 +908,17 @@ extern "C" void __ubsan_handle_cfi_bad_type_default(CFICheckFailData *Data,
 }
 
 WIN_WEAK_ALIAS(__ubsan_handle_cfi_bad_type, __ubsan_handle_cfi_bad_type_default)
-#else
-SANITIZER_WEAK_ATTRIBUTE
-#endif
 void __ubsan_handle_cfi_bad_type(CFICheckFailData *Data, ValueHandle Vtable,
                                  bool ValidVtable, ReportOptions Opts);
-
 #else
+SANITIZER_WEAK_ATTRIBUTE
 void __ubsan_handle_cfi_bad_type(CFICheckFailData *Data, ValueHandle Vtable,
                                  bool ValidVtable, ReportOptions Opts) {
   Die();
 }
 #endif
 
-}  // namespace __ubsan
+} // namespace __ubsan
 
 void __ubsan::__ubsan_handle_cfi_check_fail(CFICheckFailData *Data,
                                             ValueHandle Value,
diff --git a/compiler-rt/lib/ubsan/ubsan_value.h b/compiler-rt/lib/ubsan/ubsan_value.h
index 430c9ea..ee523cf 100644
--- a/compiler-rt/lib/ubsan/ubsan_value.h
+++ b/compiler-rt/lib/ubsan/ubsan_value.h
@@ -150,9 +150,12 @@ public:
 
   unsigned getIntegerBitCount() const {
     DCHECK(isIntegerTy());
-    if (isSignedBitIntTy())
-      return *reinterpret_cast<const u32 *>(getBitIntBitCountPointer());
-    else
+    if (isSignedBitIntTy()) {
+      u32 BitCountValue;
+      internal_memcpy(&BitCountValue, getBitIntBitCountPointer(),
+                      sizeof(BitCountValue));
+      return BitCountValue;
+    } else
       return getIntegerBitWidth();
   }
 
diff --git a/compiler-rt/test/asan/TestCases/Linux/interface_symbols_linux.cpp b/compiler-rt/test/asan/TestCases/Linux/interface_symbols_linux.cpp
index 2d72949..60ef0e5 100644
--- a/compiler-rt/test/asan/TestCases/Linux/interface_symbols_linux.cpp
+++ b/compiler-rt/test/asan/TestCases/Linux/interface_symbols_linux.cpp
@@ -23,6 +23,8 @@
 // RUN:  | grep -v "__sanitizer_weak_hook"                                         \
 // RUN:  | grep -v "__sanitizer_override_function"                                 \
 // RUN:  | grep -v "__sanitizer_override_function_by_addr"                         \
+// RUN:  | grep -v "__ubsan_handle_dynamic_type_cache_miss"                        \
+// RUN:  | grep -v "__ubsan_handle_dynamic_type_cache_miss_abort"                  \
 // RUN:  | grep -v "__sanitizer_register_weak_function"                            \
 // RUN:  | sed -e "s/.*(//" -e "s/).*//" > %t.imports
 //
diff --git a/compiler-rt/test/builtins/Unit/truncxfhf2_test.c b/compiler-rt/test/builtins/Unit/truncxfhf2_test.c
new file mode 100644
index 0000000..b5e2a91
--- /dev/null
+++ b/compiler-rt/test/builtins/Unit/truncxfhf2_test.c
@@ -0,0 +1,74 @@
+// RUN: %clang_builtins %s %librt -o %t && %run %t
+// REQUIRES: librt_has_truncxfhf2
+
+#include <stdio.h>
+
+#include "fp_test.h"
+
+#if HAS_80_BIT_LONG_DOUBLE
+
+TYPE_FP16 __truncxfhf2(xf_float f);
+
+int test_truncxfhf2(uint16_t inputHi, uint64_t inputLo, uint16_t e) {
+  xf_float a = F80FromRep80(inputHi, inputLo);
+  TYPE_FP16 x = __truncxfhf2(a);
+  int ret = compareResultH(x, e);
+  if (ret) {
+    printf("error in test__truncxfhf2(%Lf) = %#.4x, "
+           "expected %#.4x\n",
+           a, toRep16(x), e);
+  }
+  return ret;
+}
+
+int main() {
+  // Small positive value
+  if (test_truncxfhf2(UINT16_C(0x3ffb), UINT64_C(0xccc0000000000000),
+                      UINT16_C(0x2e66)))
+    return 1;
+
+  // Small negative value
+  if (test_truncxfhf2(UINT16_C(0xbffb), UINT64_C(0xccc0000000000000),
+                      UINT16_C(0xae66)))
+    return 1;
+
+  // Zero
+  if (test_truncxfhf2(UINT16_C(0x0), UINT64_C(0x0), UINT16_C(0)))
+    return 1;
+
+  // Smallest positive non-zero value
+  if (test_truncxfhf2(UINT16_C(0x3fef), UINT64_C(0x8000000000000000),
+                      UINT16_C(0x0100)))
+    return 1;
+
+  // Smallest negative non-zero value
+  if (test_truncxfhf2(UINT16_C(0xbfef), UINT64_C(0x8000000000000000),
+                      UINT16_C(0x8100)))
+    return 1;
+
+  // Positive infinity
+  if (test_truncxfhf2(UINT16_C(0x7fff), UINT64_C(0x8000000000000000),
+                      UINT16_C(0x7c00)))
+    return 1;
+
+  // Negative infinity
+  if (test_truncxfhf2(UINT16_C(0xffff), UINT64_C(0x8000000000000000),
+                      UINT16_C(0xfc00)))
+    return 1;
+
+  // NaN
+  if (test_truncxfhf2(UINT16_C(0x7fff), UINT64_C(0xc000000000000000),
+                      UINT16_C(0x7e00)))
+    return 1;
+
+  return 0;
+}
+
+#else
+
+int main() {
+  printf("skipped\n");
+  return 0;
+}
+
+#endif
diff --git a/compiler-rt/test/fuzzer/noasan-strcmp.test b/compiler-rt/test/fuzzer/noasan-strcmp.test
index 0d82d6b..f73af35 100644
--- a/compiler-rt/test/fuzzer/noasan-strcmp.test
+++ b/compiler-rt/test/fuzzer/noasan-strcmp.test
@@ -1,4 +1,4 @@
-UNSUPPORTED: darwin, target={{.*(freebsd|windows).*}}
+UNSUPPORTED: darwin, target={{.*(freebsd|windows).*}}, target=aarch64{{.*}}
 
 RUN: %cpp_compiler -fno-sanitize=address %S/StrcmpTest.cpp -o %t-NoAsanStrcmpTest
 RUN: not %run %t-NoAsanStrcmpTest -seed=1 -runs=2000000   2>&1 | FileCheck %s
diff --git a/compiler-rt/test/profile/ContinuousSyncMode/basic.c b/compiler-rt/test/profile/ContinuousSyncMode/basic.c
index e8bd087..531877b 100644
--- a/compiler-rt/test/profile/ContinuousSyncMode/basic.c
+++ b/compiler-rt/test/profile/ContinuousSyncMode/basic.c
@@ -1,4 +1,4 @@
-// REQUIRES: target={{.*(darwin|aix).*}}
+// REQUIRES: continuous-mode
 
 // RUN: %clang_profgen_cont -fcoverage-mapping -o %t.exe %s
 // RUN: echo "garbage" > %t.profraw
diff --git a/compiler-rt/test/profile/ContinuousSyncMode/get-filename.c b/compiler-rt/test/profile/ContinuousSyncMode/get-filename.c
index 40a0cc5..e341dd4 100644
--- a/compiler-rt/test/profile/ContinuousSyncMode/get-filename.c
+++ b/compiler-rt/test/profile/ContinuousSyncMode/get-filename.c
@@ -1,4 +1,4 @@
-// REQUIRES: target={{.*(darwin|aix).*}}
+// REQUIRES: continuous-mode
 
 // RUN: %clang_pgogen_cont -o %t.exe %s
 // RUN: env LLVM_PROFILE_FILE="%c%t.profraw" %run %t.exe %t.profraw
diff --git a/compiler-rt/test/profile/ContinuousSyncMode/image-with-mcdc.c b/compiler-rt/test/profile/ContinuousSyncMode/image-with-mcdc.c
index d171bad..fa24e26 100644
--- a/compiler-rt/test/profile/ContinuousSyncMode/image-with-mcdc.c
+++ b/compiler-rt/test/profile/ContinuousSyncMode/image-with-mcdc.c
@@ -1,4 +1,4 @@
-// REQUIRES: target={{.*(darwin|aix).*}}
+// REQUIRES: continuous-mode
 
 // RUN: %clang_profgen_cont -fcoverage-mapping -fcoverage-mcdc -O3 -o %t.exe %s
 // RUN: env LLVM_PROFILE_FILE="%c%t.profraw" %run %t.exe 3 3
diff --git a/compiler-rt/test/profile/ContinuousSyncMode/multi-threaded.cpp b/compiler-rt/test/profile/ContinuousSyncMode/multi-threaded.cpp
index ff05a69..aa0a46e 100644
--- a/compiler-rt/test/profile/ContinuousSyncMode/multi-threaded.cpp
+++ b/compiler-rt/test/profile/ContinuousSyncMode/multi-threaded.cpp
@@ -1,4 +1,4 @@
-// REQUIRES: target={{.*(darwin|aix).*}}
+// REQUIRES: continuous-mode
 
 // RUN: rm -f %t.profraw
 // RUN: %clangxx_pgogen_cont -lpthread %s -o %t.exe -mllvm -disable-vp -fprofile-update=atomic
diff --git a/compiler-rt/test/profile/ContinuousSyncMode/online-merging.c b/compiler-rt/test/profile/ContinuousSyncMode/online-merging.c
index 5434648..c193141 100644
--- a/compiler-rt/test/profile/ContinuousSyncMode/online-merging.c
+++ b/compiler-rt/test/profile/ContinuousSyncMode/online-merging.c
@@ -1,4 +1,4 @@
-// REQUIRES: target={{.*(darwin|aix).*}}
+// REQUIRES: continuous-mode
 
 // Test the online merging mode (%m) along with continuous mode (%c).
 //
diff --git a/compiler-rt/test/profile/ContinuousSyncMode/pid-substitution.c b/compiler-rt/test/profile/ContinuousSyncMode/pid-substitution.c
index 309b685..8a00b28 100644
--- a/compiler-rt/test/profile/ContinuousSyncMode/pid-substitution.c
+++ b/compiler-rt/test/profile/ContinuousSyncMode/pid-substitution.c
@@ -1,4 +1,4 @@
-// REQUIRES: target={{.*(darwin|aix).*}}
+// REQUIRES: continuous-mode
 
 // RUN: rm -rf %t.dir && mkdir -p %t.dir
 // RUN: %clang_pgogen_cont -o %t.exe %s
diff --git a/compiler-rt/test/profile/ContinuousSyncMode/set-filename.c b/compiler-rt/test/profile/ContinuousSyncMode/set-filename.c
index 106e12e..abc7264 100644
--- a/compiler-rt/test/profile/ContinuousSyncMode/set-filename.c
+++ b/compiler-rt/test/profile/ContinuousSyncMode/set-filename.c
@@ -1,4 +1,4 @@
-// REQUIRES: target={{.*(darwin|aix).*}}
+// REQUIRES: continuous-mode
 
 // RUN: %clang_pgogen_cont -o %t.exe %s
 // RUN: env LLVM_PROFILE_FILE="%c%t.profraw" %run %t.exe %t.profraw %t.bad
diff --git a/compiler-rt/test/profile/Inputs/instrprof-gcov-exceptions.cpp.gcov b/compiler-rt/test/profile/Inputs/instrprof-gcov-exceptions.cpp.gcov
index aa20276..233fd14 100644
--- a/compiler-rt/test/profile/Inputs/instrprof-gcov-exceptions.cpp.gcov
+++ b/compiler-rt/test/profile/Inputs/instrprof-gcov-exceptions.cpp.gcov
@@ -2,7 +2,6 @@
 // CHECK-NEXT:        -:    0:Graph:instrprof-gcov-exceptions.gcno
 // CHECK-NEXT:        -:    0:Data:instrprof-gcov-exceptions.gcda
 // CHECK-NEXT:        -:    0:Runs:1
-// CHECK-NEXT:        -:    0:Programs:1
 // CHECK-NEXT:        -:    1:#include <string>
 // CHECK-NEXT:        -:    2:
 // CHECK-NEXT:        1:    3:void asd(std::string i) {
diff --git a/compiler-rt/test/profile/Inputs/instrprof-gcov-multiple-bbs-single-line.c.gcov b/compiler-rt/test/profile/Inputs/instrprof-gcov-multiple-bbs-single-line.c.gcov
index 9297073..a25632d 100644
--- a/compiler-rt/test/profile/Inputs/instrprof-gcov-multiple-bbs-single-line.c.gcov
+++ b/compiler-rt/test/profile/Inputs/instrprof-gcov-multiple-bbs-single-line.c.gcov
@@ -2,7 +2,6 @@
 // CHECK-NEXT:        -:    0:Graph:instrprof-gcov-multiple-bbs-single-line.gcno
 // CHECK-NEXT:        -:    0:Data:instrprof-gcov-multiple-bbs-single-line.gcda
 // CHECK-NEXT:        -:    0:Runs:1
-// CHECK-NEXT:        -:    0:Programs:1
 // CHECK-NEXT:function main called 1 returned 100% blocks executed 77%
 // CHECK-NEXT:        1:    1:int main(void)
 // CHECK-NEXT:        -:    2:{
diff --git a/compiler-rt/test/profile/Inputs/instrprof-gcov-one-line-function.c.gcov b/compiler-rt/test/profile/Inputs/instrprof-gcov-one-line-function.c.gcov
index 5a570a0..4dc6817 100644
--- a/compiler-rt/test/profile/Inputs/instrprof-gcov-one-line-function.c.gcov
+++ b/compiler-rt/test/profile/Inputs/instrprof-gcov-one-line-function.c.gcov
@@ -2,7 +2,6 @@
 // CHECK-NEXT:        -:    0:Graph:instrprof-gcov-one-line-function.gcno
 // CHECK-NEXT:        -:    0:Data:instrprof-gcov-one-line-function.gcda
 // CHECK-NEXT:        -:    0:Runs:1
-// CHECK-NEXT:        -:    0:Programs:1
 // CHECK-NEXT:        1:    1:void foo() { }
 // CHECK-NEXT:        -:    2:
 // CHECK-NEXT:        1:    3:void bar() { }
diff --git a/compiler-rt/test/profile/Inputs/instrprof-gcov-switch1.c.gcov b/compiler-rt/test/profile/Inputs/instrprof-gcov-switch1.c.gcov
index 741dff5..2b4d67f 100644
--- a/compiler-rt/test/profile/Inputs/instrprof-gcov-switch1.c.gcov
+++ b/compiler-rt/test/profile/Inputs/instrprof-gcov-switch1.c.gcov
@@ -2,7 +2,6 @@
 // CHECK-NEXT:        -:    0:Graph:instrprof-gcov-switch1.gcno
 // CHECK-NEXT:        -:    0:Data:instrprof-gcov-switch1.gcda
 // CHECK-NEXT:        -:    0:Runs:1
-// CHECK-NEXT:        -:    0:Programs:1
 // CHECK-NEXT:        1:    1:int main(void)
 // CHECK-NEXT:        -:    2:{
 // CHECK-NEXT:        1:    3:  int i = 22;
diff --git a/compiler-rt/test/profile/Inputs/instrprof-gcov-switch2.c.gcov b/compiler-rt/test/profile/Inputs/instrprof-gcov-switch2.c.gcov
index c931365..f9501e0 100644
--- a/compiler-rt/test/profile/Inputs/instrprof-gcov-switch2.c.gcov
+++ b/compiler-rt/test/profile/Inputs/instrprof-gcov-switch2.c.gcov
@@ -2,7 +2,6 @@
 // CHECK-NEXT:        -:    0:Graph:instrprof-gcov-switch2.gcno
 // CHECK-NEXT:        -:    0:Data:instrprof-gcov-switch2.gcda
 // CHECK-NEXT:        -:    0:Runs:1
-// CHECK-NEXT:        -:    0:Programs:1
 // CHECK-NEXT:        1:    1:int main(void)
 // CHECK-NEXT:        -:    2:{
 // CHECK-NEXT:        1:    3:  int i = 22;
diff --git a/compiler-rt/test/profile/Inputs/instrprof-shared-lib_in-loop.c.gcov b/compiler-rt/test/profile/Inputs/instrprof-shared-lib_in-loop.c.gcov
index 6935047..d75a222 100644
--- a/compiler-rt/test/profile/Inputs/instrprof-shared-lib_in-loop.c.gcov
+++ b/compiler-rt/test/profile/Inputs/instrprof-shared-lib_in-loop.c.gcov
@@ -2,7 +2,6 @@
 // CHECK-NEXT:        -:    0:Graph:instrprof-shared-lib.gcno
 // CHECK-NEXT:        -:    0:Data:instrprof-shared-lib.gcda
 // CHECK-NEXT:        -:    0:Runs:1
-// CHECK-NEXT:        -:    0:Programs:1
 // CHECK-NEXT:        -:    1:int g1 = 0;
 // CHECK-NEXT:        -:    2:int g2 = 1;
 // CHECK-NEXT:        -:    3:
diff --git a/compiler-rt/test/profile/Inputs/instrprof-shared-main.c.gcov b/compiler-rt/test/profile/Inputs/instrprof-shared-main.c.gcov
index a31a602..24facb5 100644
--- a/compiler-rt/test/profile/Inputs/instrprof-shared-main.c.gcov
+++ b/compiler-rt/test/profile/Inputs/instrprof-shared-main.c.gcov
@@ -2,7 +2,6 @@
 // CHECK-NEXT:        -:    0:Graph:instrprof-shared-main.gcno
 // CHECK-NEXT:        -:    0:Data:instrprof-shared-main.gcda
 // CHECK-NEXT:        -:    0:Runs:1
-// CHECK-NEXT:        -:    0:Programs:1
 // CHECK-NEXT:        -:    1:extern int g1, g2;
 // CHECK-NEXT:        -:    2:extern void foo(int n);
 // CHECK-NEXT:        -:    3:
diff --git a/compiler-rt/test/profile/gcov-__gcov_flush-terminate.c b/compiler-rt/test/profile/gcov-__gcov_flush-terminate.c
index ca13a08..96cf4296 100644
--- a/compiler-rt/test/profile/gcov-__gcov_flush-terminate.c
+++ b/compiler-rt/test/profile/gcov-__gcov_flush-terminate.c
@@ -8,7 +8,6 @@
 // RUN: llvm-cov gcov -t gcov-__gcov_flush-terminate.gcda | FileCheck %s
 
 // CHECK:             -:    0:Runs:1
-// CHECK-NEXT:        -:    0:Programs:1
 
 void __gcov_dump(void);
 void __gcov_reset(void);
diff --git a/compiler-rt/test/profile/lit.cfg.py b/compiler-rt/test/profile/lit.cfg.py
index 72a389e..fc2baf7 100644
--- a/compiler-rt/test/profile/lit.cfg.py
+++ b/compiler-rt/test/profile/lit.cfg.py
@@ -31,7 +31,7 @@ if (
 target_is_msvc = bool(re.match(r".*-windows-msvc$", config.target_triple))
 
 # Whether continous profile collection (%c) requires runtime counter relocation on this platform
-runtime_reloc = bool(config.host_os in ["AIX"])
+runtime_reloc = bool(config.host_os in ["AIX", "Linux"])
 
 if config.host_os in ["Linux"]:
     extra_link_flags = ["-ldl"]
@@ -210,3 +210,6 @@ if config.android:
 
 if config.have_curl:
     config.available_features.add("curl")
+
+if config.host_os in ("AIX", "Darwin", "Linux"):
+    config.available_features.add("continuous-mode")
diff --git a/compiler-rt/test/tysan/print_stacktrace.c b/compiler-rt/test/tysan/print_stacktrace.c
new file mode 100644
index 0000000..3ffb606
--- /dev/null
+++ b/compiler-rt/test/tysan/print_stacktrace.c
@@ -0,0 +1,22 @@
+// RUN: %clang_tysan -O0 %s -o %t && %run %t >%t.out 2>&1
+// RUN: FileCheck --check-prefixes=CHECK,CHECK-SHORT %s < %t.out
+
+// RUN: %env_tysan_opts=print_stacktrace=1 %run %t >%t.out 2>&1
+// RUN: FileCheck --check-prefixes=CHECK,CHECK-LONG %s < %t.out
+
+float *P;
+void zero_array() {
+  int i;
+  for (i = 0; i < 1; ++i)
+    P[i] = 0.0f;
+  // CHECK: ERROR: TypeSanitizer: type-aliasing-violation
+  // CHECK: WRITE of size 4 at {{.*}} with type float accesses an existing object of type p1 float
+  // CHECK: {{#0 0x.* in zero_array .*print_stacktrace.c:}}[[@LINE-3]]
+  // CHECK-SHORT-NOT: {{#1 0x.* in main .*print_stacktrace.c}}
+  // CHECK-LONG-NEXT: {{#1 0x.* in main .*print_stacktrace.c}}
+}
+
+int main() {
+  P = (float *)&P;
+  zero_array();
+}
diff --git a/flang/docs/Extensions.md b/flang/docs/Extensions.md
index 2d1c967..f25f0d1 100644
--- a/flang/docs/Extensions.md
+++ b/flang/docs/Extensions.md
@@ -160,7 +160,11 @@ end
 * `<>` as synonym for `.NE.` and `/=`
 * `$` and `@` as legal characters in names
 * Initialization in type declaration statements using `/values/`
-* Saved variables without explicit or default initializers are zero initialized.
+* Saved variables without explicit or default initializers are zero initialized,
+  except for scalar variables from the main program that are not explicitly
+  initialized or marked with an explicit SAVE attribute (these variables may be
+  placed on the stack by flang and not zero initialized). It is not advised to
+  rely on this extension in new code.
 * In a saved entity of a type with a default initializer, components without default
   values are zero initialized.
 * Kind specification with `*`, e.g. `REAL*4`
@@ -406,6 +410,7 @@ end
 * A character length specifier in a component or entity declaration
   is accepted before an array specification (`ch*3(2)`) as well
   as afterwards.
+* A zero field width is allowed for logical formatted output (`L0`).
 
 ### Extensions supported when enabled by options
 
diff --git a/flang/examples/FlangOmpReport/FlangOmpReportVisitor.cpp b/flang/examples/FlangOmpReport/FlangOmpReportVisitor.cpp
index 665b92b..c78dd7f 100644
--- a/flang/examples/FlangOmpReport/FlangOmpReportVisitor.cpp
+++ b/flang/examples/FlangOmpReport/FlangOmpReportVisitor.cpp
@@ -90,6 +90,10 @@ SourcePosition OpenMPCounterVisitor::getLocation(const OpenMPConstruct &c) {
             const CharBlock &source{c.source};
             return (parsing->allCooked().GetSourcePositionRange(source))->first;
           },
+          [&](const OpenMPUtilityConstruct &c) -> SourcePosition {
+            const CharBlock &source{c.source};
+            return (parsing->allCooked().GetSourcePositionRange(source))->first;
+          },
       },
       c.u);
 }
@@ -102,10 +106,16 @@ std::string OpenMPCounterVisitor::getName(const OmpWrapperType &w) {
   return getName(*std::get<const OpenMPDeclarativeConstruct *>(w));
 }
 std::string OpenMPCounterVisitor::getName(const OpenMPDeclarativeConstruct &c) {
-  return std::visit(
-      [&](const auto &o) -> std::string {
-        const CharBlock &source{std::get<Verbatim>(o.t).source};
-        return normalize_construct_name(source.ToString());
+  return std::visit( //
+      Fortran::common::visitors{
+          [&](const OpenMPUtilityConstruct &o) -> std::string {
+            const CharBlock &source{o.source};
+            return normalize_construct_name(source.ToString());
+          },
+          [&](const auto &o) -> std::string {
+            const CharBlock &source{std::get<Verbatim>(o.t).source};
+            return normalize_construct_name(source.ToString());
+          },
       },
       c.u);
 }
@@ -143,8 +153,8 @@ std::string OpenMPCounterVisitor::getName(const OpenMPConstruct &c) {
                 },
                 c.u);
           },
-          [&](const OpenMPErrorConstruct &c) -> std::string {
-            const CharBlock &source{std::get<0>(c.t).source};
+          [&](const OpenMPUtilityConstruct &c) -> std::string {
+            const CharBlock &source{c.source};
             return normalize_construct_name(source.ToString());
           },
           [&](const OpenMPSectionConstruct &c) -> std::string {
diff --git a/flang/include/flang/Common/format.h b/flang/include/flang/Common/format.h
index de69671..c5e9fb0 100644
--- a/flang/include/flang/Common/format.h
+++ b/flang/include/flang/Common/format.h
@@ -463,10 +463,13 @@ template <typename CHAR> void FormatValidator<CHAR>::check_r(bool allowed) {
 template <typename CHAR> bool FormatValidator<CHAR>::check_w() {
   if (token_.kind() == TokenKind::UnsignedInteger) {
     wValue_ = integerValue_;
-    if (wValue_ == 0 &&
-        (*argString_ == 'A' || *argString_ == 'L' ||
-            stmt_ == IoStmtKind::Read)) { // C1306, 13.7.2.1p6
-      ReportError("'%s' edit descriptor 'w' value must be positive");
+    if (wValue_ == 0) {
+      if (*argString_ == 'A' || stmt_ == IoStmtKind::Read) {
+        // C1306, 13.7.2.1p6
+        ReportError("'%s' edit descriptor 'w' value must be positive");
+      } else if (*argString_ == 'L') {
+        ReportWarning("'%s' edit descriptor 'w' value should be positive");
+      }
     }
     NextToken();
     return true;
diff --git a/flang/include/flang/Common/idioms.h b/flang/include/flang/Common/idioms.h
index 99f383e..06631bc 100644
--- a/flang/include/flang/Common/idioms.h
+++ b/flang/include/flang/Common/idioms.h
@@ -87,7 +87,10 @@ template <typename... LAMBDAS> visitors(LAMBDAS... x) -> visitors<LAMBDAS...>;
 // To disable, compile with '-DCHECK=(void)'
 #ifndef CHECK
 #define CHECK(x) ((x) || (DIE("CHECK(" #x ") failed"), false))
+#endif
+
 // Same as above, but with a custom error message.
+#ifndef CHECK_MSG
 #define CHECK_MSG(x, y) ((x) || (DIE("CHECK(" #x ") failed: " #y), false))
 #endif
 
diff --git a/flang/include/flang/Evaluate/call.h b/flang/include/flang/Evaluate/call.h
index 7531d8a..6327743 100644
--- a/flang/include/flang/Evaluate/call.h
+++ b/flang/include/flang/Evaluate/call.h
@@ -250,6 +250,7 @@ public:
 
   std::optional<Expr<SubscriptInteger>> LEN() const;
   int Rank() const;
+  static constexpr int Corank() { return 0; } // TODO
   bool IsElemental() const { return proc_.IsElemental(); }
   bool hasAlternateReturns() const { return hasAlternateReturns_; }
 
diff --git a/flang/include/flang/Evaluate/characteristics.h b/flang/include/flang/Evaluate/characteristics.h
index 11533a72..357fc3e 100644
--- a/flang/include/flang/Evaluate/characteristics.h
+++ b/flang/include/flang/Evaluate/characteristics.h
@@ -102,6 +102,10 @@ public:
     }
     if (auto type{x.GetType()}) {
       TypeAndShape result{*type, GetShape(context, x, invariantOnly)};
+      result.corank_ = GetCorank(x);
+      if (result.corank_ > 0) {
+        result.attrs_.set(Attr::Coarray);
+      }
       if (type->category() == TypeCategory::Character) {
         if (const auto *chExpr{UnwrapExpr<Expr<SomeCharacter>>(x)}) {
           if (auto length{chExpr->LEN()}) {
diff --git a/flang/include/flang/Evaluate/constant.h b/flang/include/flang/Evaluate/constant.h
index d9866a0..61a8144 100644
--- a/flang/include/flang/Evaluate/constant.h
+++ b/flang/include/flang/Evaluate/constant.h
@@ -65,6 +65,7 @@ public:
   ~ConstantBounds();
   const ConstantSubscripts &shape() const { return shape_; }
   int Rank() const { return GetRank(shape_); }
+  static constexpr int Corank() { return 0; }
   Constant<SubscriptInteger> SHAPE() const;
 
   // It is possible in this representation for a constant array to have
diff --git a/flang/include/flang/Evaluate/expression.h b/flang/include/flang/Evaluate/expression.h
index 9ea037a..04f4406 100644
--- a/flang/include/flang/Evaluate/expression.h
+++ b/flang/include/flang/Evaluate/expression.h
@@ -92,6 +92,7 @@ public:
 
   std::optional<DynamicType> GetType() const;
   int Rank() const;
+  int Corank() const;
   std::string AsFortran() const;
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
   LLVM_DUMP_METHOD void dump() const;
@@ -190,6 +191,7 @@ public:
       return rank;
     }
   }
+  static constexpr int Corank() { return 0; }
 
   bool operator==(const Operation &that) const {
     return operand_ == that.operand_;
@@ -395,6 +397,7 @@ struct ImpliedDoIndex {
   using Result = SubscriptInteger;
   bool operator==(const ImpliedDoIndex &) const;
   static constexpr int Rank() { return 0; }
+  static constexpr int Corank() { return 0; }
   parser::CharBlock name; // nested implied DOs must use distinct names
 };
 
@@ -441,6 +444,7 @@ public:
 
   bool operator==(const ArrayConstructorValues &) const;
   static constexpr int Rank() { return 1; }
+  static constexpr int Corank() { return 0; }
   template <typename A> common::NoLvalue<A> Push(A &&x) {
     values_.emplace_back(std::move(x));
   }
@@ -680,6 +684,7 @@ public:
   int Rank() const {
     return common::visit([](const auto &x) { return x.Rank(); }, u);
   }
+  static constexpr int Corank() { return 0; }
   llvm::raw_ostream &AsFortran(llvm::raw_ostream &o) const;
   common::MapTemplate<Relational, DirectlyComparableTypes> u;
 };
@@ -766,7 +771,8 @@ public:
   std::optional<Expr<SomeType>> Find(const Symbol &) const;
 
   StructureConstructor &Add(const semantics::Symbol &, Expr<SomeType> &&);
-  int Rank() const { return 0; }
+  static constexpr int Rank() { return 0; }
+  static constexpr int Corank() { return 0; }
   DynamicType GetType() const;
   llvm::raw_ostream &AsFortran(llvm::raw_ostream &) const;
 
@@ -820,7 +826,8 @@ using BOZLiteralConstant = typename LargestReal::Scalar::Word;
 // Null pointers without MOLD= arguments are typed by context.
 struct NullPointer {
   constexpr bool operator==(const NullPointer &) const { return true; }
-  constexpr int Rank() const { return 0; }
+  static constexpr int Rank() { return 0; }
+  static constexpr int Corank() { return 0; }
 };
 
 // Procedure pointer targets are treated as if they were typeless.
diff --git a/flang/include/flang/Evaluate/shape.h b/flang/include/flang/Evaluate/shape.h
index e33044c..e679a00 100644
--- a/flang/include/flang/Evaluate/shape.h
+++ b/flang/include/flang/Evaluate/shape.h
@@ -117,6 +117,14 @@ MaybeExtentExpr GetExtent(const Subscript &, const NamedEntity &, int dimension,
 MaybeExtentExpr GetExtent(FoldingContext &, const Subscript &,
     const NamedEntity &, int dimension, bool invariantOnly = true);
 
+// Similar analyses for coarrays
+MaybeExtentExpr GetLCOBOUND(
+    const Symbol &, int dimension, bool invariantOnly = true);
+MaybeExtentExpr GetUCOBOUND(
+    const Symbol &, int dimension, bool invariantOnly = true);
+Shape GetLCOBOUNDs(const Symbol &, bool invariantOnly = true);
+Shape GetUCOBOUNDs(const Symbol &, bool invariantOnly = true);
+
 // Compute an element count for a triplet or trip count for a DO.
 ExtentExpr CountTrips(
     ExtentExpr &&lower, ExtentExpr &&upper, ExtentExpr &&stride);
diff --git a/flang/include/flang/Evaluate/tools.h b/flang/include/flang/Evaluate/tools.h
index f586c59..ec5fc7a 100644
--- a/flang/include/flang/Evaluate/tools.h
+++ b/flang/include/flang/Evaluate/tools.h
@@ -102,22 +102,26 @@ template <typename A> bool IsAssumedRank(const A *x) {
   return x && IsAssumedRank(*x);
 }
 
-// Predicate: true when an expression is a coarray (corank > 0)
-bool IsCoarray(const ActualArgument &);
-bool IsCoarray(const Symbol &);
-template <typename A> bool IsCoarray(const A &) { return false; }
-template <typename A> bool IsCoarray(const Designator<A> &designator) {
-  if (const auto *symbol{std::get_if<SymbolRef>(&designator.u)}) {
-    return IsCoarray(**symbol);
-  }
-  return false;
+// Finds the corank of an entity, possibly packaged in various ways.
+// Unlike rank, only data references have corank > 0.
+int GetCorank(const ActualArgument &);
+static inline int GetCorank(const Symbol &symbol) { return symbol.Corank(); }
+template <typename A> int GetCorank(const A &) { return 0; }
+template <typename T> int GetCorank(const Designator<T> &designator) {
+  return designator.Corank();
 }
-template <typename T> bool IsCoarray(const Expr<T> &expr) {
-  return common::visit([](const auto &x) { return IsCoarray(x); }, expr.u);
+template <typename T> int GetCorank(const Expr<T> &expr) {
+  return common::visit([](const auto &x) { return GetCorank(x); }, expr.u);
 }
-template <typename A> bool IsCoarray(const std::optional<A> &x) {
-  return x && IsCoarray(*x);
+template <typename A> int GetCorank(const std::optional<A> &x) {
+  return x ? GetCorank(*x) : 0;
 }
+template <typename A> int GetCorank(const A *x) {
+  return x ? GetCorank(*x) : 0;
+}
+
+// Predicate: true when an expression is a coarray (corank > 0)
+template <typename A> bool IsCoarray(const A &x) { return GetCorank(x) > 0; }
 
 // Generalizing packagers: these take operations and expressions of more
 // specific types and wrap them in Expr<> containers of more abstract types.
diff --git a/flang/include/flang/Evaluate/variable.h b/flang/include/flang/Evaluate/variable.h
index 9565826..b454d37d 100644
--- a/flang/include/flang/Evaluate/variable.h
+++ b/flang/include/flang/Evaluate/variable.h
@@ -51,6 +51,7 @@ template <typename T> struct Variable;
 struct BaseObject {
   EVALUATE_UNION_CLASS_BOILERPLATE(BaseObject)
   int Rank() const;
+  int Corank() const;
   std::optional<Expr<SubscriptInteger>> LEN() const;
   llvm::raw_ostream &AsFortran(llvm::raw_ostream &) const;
   const Symbol *symbol() const {
@@ -84,6 +85,7 @@ public:
   SymbolRef &symbol() { return symbol_; }
 
   int Rank() const;
+  int Corank() const;
   const Symbol &GetFirstSymbol() const;
   const Symbol &GetLastSymbol() const { return symbol_; }
   std::optional<Expr<SubscriptInteger>> LEN() const;
@@ -116,6 +118,7 @@ public:
   Component *UnwrapComponent();
 
   int Rank() const;
+  int Corank() const;
   std::optional<Expr<SubscriptInteger>> LEN() const;
   bool operator==(const NamedEntity &) const;
   llvm::raw_ostream &AsFortran(llvm::raw_ostream &) const;
@@ -147,6 +150,7 @@ public:
   const Symbol &parameter() const { return parameter_; }
 
   static constexpr int Rank() { return 0; } // always scalar
+  static constexpr int Corank() { return 0; }
   bool operator==(const TypeParamInquiry &) const;
   llvm::raw_ostream &AsFortran(llvm::raw_ostream &) const;
 
@@ -224,6 +228,7 @@ public:
   }
 
   int Rank() const;
+  int Corank() const;
   const Symbol &GetFirstSymbol() const;
   const Symbol &GetLastSymbol() const;
   std::optional<Expr<SubscriptInteger>> LEN() const;
@@ -271,6 +276,7 @@ public:
   CoarrayRef &set_team(Expr<SomeInteger> &&, bool isTeamNumber = false);
 
   int Rank() const;
+  int Corank() const { return 0; }
   const Symbol &GetFirstSymbol() const;
   const Symbol &GetLastSymbol() const;
   NamedEntity GetBase() const;
@@ -294,6 +300,7 @@ private:
 struct DataRef {
   EVALUATE_UNION_CLASS_BOILERPLATE(DataRef)
   int Rank() const;
+  int Corank() const;
   const Symbol &GetFirstSymbol() const;
   const Symbol &GetLastSymbol() const;
   std::optional<Expr<SubscriptInteger>> LEN() const;
@@ -331,6 +338,7 @@ public:
   Parent &parent() { return parent_; }
 
   int Rank() const;
+  int Corank() const;
   template <typename A> const A *GetParentIf() const {
     return std::get_if<A>(&parent_);
   }
@@ -361,6 +369,7 @@ public:
   const DataRef &complex() const { return complex_; }
   Part part() const { return part_; }
   int Rank() const;
+  int Corank() const;
   const Symbol &GetFirstSymbol() const { return complex_.GetFirstSymbol(); }
   const Symbol &GetLastSymbol() const { return complex_.GetLastSymbol(); }
   bool operator==(const ComplexPart &) const;
@@ -396,6 +405,7 @@ public:
 
   std::optional<DynamicType> GetType() const;
   int Rank() const;
+  int Corank() const;
   BaseObject GetBaseObject() const;
   const Symbol *GetLastSymbol() const;
   std::optional<Expr<SubscriptInteger>> LEN() const;
@@ -421,6 +431,7 @@ public:
   int dimension() const { return dimension_; }
 
   static constexpr int Rank() { return 0; } // always scalar
+  static constexpr int Corank() { return 0; }
   bool operator==(const DescriptorInquiry &) const;
   llvm::raw_ostream &AsFortran(llvm::raw_ostream &) const;
 
diff --git a/flang/include/flang/Frontend/FrontendActions.h b/flang/include/flang/Frontend/FrontendActions.h
index 374fd76..4e3d3cb 100644
--- a/flang/include/flang/Frontend/FrontendActions.h
+++ b/flang/include/flang/Frontend/FrontendActions.h
@@ -19,6 +19,7 @@
 #include "flang/Semantics/semantics.h"
 
 #include "mlir/IR/BuiltinOps.h"
+#include "mlir/IR/OwningOpRef.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/IR/Module.h"
 #include <memory>
@@ -215,8 +216,8 @@ protected:
   CodeGenAction(BackendActionTy act) : action{act} {};
   /// @name MLIR
   /// {
-  std::unique_ptr<mlir::ModuleOp> mlirModule;
   std::unique_ptr<mlir::MLIRContext> mlirCtx;
+  mlir::OwningOpRef<mlir::ModuleOp> mlirModule;
   /// }
 
   /// @name LLVM IR
diff --git a/flang/include/flang/Lower/AbstractConverter.h b/flang/include/flang/Lower/AbstractConverter.h
index 8f026ac..607aff4 100644
--- a/flang/include/flang/Lower/AbstractConverter.h
+++ b/flang/include/flang/Lower/AbstractConverter.h
@@ -62,7 +62,7 @@ struct SymbolBox;
 namespace pft {
 struct Variable;
 struct FunctionLikeUnit;
-}
+} // namespace pft
 
 using SomeExpr = Fortran::evaluate::Expr<Fortran::evaluate::SomeType>;
 using SymbolRef = Fortran::common::Reference<const Fortran::semantics::Symbol>;
@@ -295,7 +295,7 @@ public:
   /// Get the OpBuilder
   virtual fir::FirOpBuilder &getFirOpBuilder() = 0;
   /// Get the ModuleOp
-  virtual mlir::ModuleOp &getModuleOp() = 0;
+  virtual mlir::ModuleOp getModuleOp() = 0;
   /// Get the MLIRContext
   virtual mlir::MLIRContext &getMLIRContext() = 0;
   /// Unique a symbol (add a containing scope specific prefix)
diff --git a/flang/include/flang/Lower/Bridge.h b/flang/include/flang/Lower/Bridge.h
index 8ea5ed5..6404a16 100644
--- a/flang/include/flang/Lower/Bridge.h
+++ b/flang/include/flang/Lower/Bridge.h
@@ -23,6 +23,7 @@
 #include "flang/Optimizer/Builder/FIRBuilder.h"
 #include "flang/Optimizer/Dialect/Support/KindMapping.h"
 #include "mlir/IR/BuiltinOps.h"
+#include "mlir/IR/OwningOpRef.h"
 #include <set>
 
 namespace llvm {
@@ -83,7 +84,8 @@ public:
   mlir::MLIRContext &getMLIRContext() { return context; }
 
   /// Get the ModuleOp. It can never be null, which is asserted in the ctor.
-  mlir::ModuleOp &getModule() { return *module.get(); }
+  mlir::ModuleOp getModule() { return *module; }
+  mlir::ModuleOp getModuleAndRelease() { return module.release(); }
 
   const Fortran::common::IntrinsicTypeDefaultKinds &getDefaultKinds() const {
     return defaultKinds;
@@ -166,7 +168,7 @@ private:
   const Fortran::evaluate::TargetCharacteristics &targetCharacteristics;
   const Fortran::parser::AllCookedSources *cooked;
   mlir::MLIRContext &context;
-  std::unique_ptr<mlir::ModuleOp> module;
+  mlir::OwningOpRef<mlir::ModuleOp> module;
   fir::KindMapping &kindMap;
   const Fortran::lower::LoweringOptions &loweringOptions;
   const std::vector<Fortran::lower::EnvironmentDefault> &envDefaults;
diff --git a/flang/include/flang/Lower/OpenACC.h b/flang/include/flang/Lower/OpenACC.h
index fbf61e7..0d7038a 100644
--- a/flang/include/flang/Lower/OpenACC.h
+++ b/flang/include/flang/Lower/OpenACC.h
@@ -19,7 +19,7 @@ namespace llvm {
 template <typename T, unsigned N>
 class SmallVector;
 class StringRef;
-}
+} // namespace llvm
 
 namespace mlir {
 class Location;
@@ -44,7 +44,7 @@ struct OpenACCRoutineConstruct;
 namespace semantics {
 class SemanticsContext;
 class Symbol;
-}
+} // namespace semantics
 
 namespace lower {
 
@@ -78,11 +78,11 @@ void genOpenACCDeclarativeConstruct(AbstractConverter &,
                                     AccRoutineInfoMappingList &);
 void genOpenACCRoutineConstruct(AbstractConverter &,
                                 Fortran::semantics::SemanticsContext &,
-                                mlir::ModuleOp &,
+                                mlir::ModuleOp,
                                 const parser::OpenACCRoutineConstruct &,
                                 AccRoutineInfoMappingList &);
 
-void finalizeOpenACCRoutineAttachment(mlir::ModuleOp &,
+void finalizeOpenACCRoutineAttachment(mlir::ModuleOp,
                                       AccRoutineInfoMappingList &);
 
 /// Get a acc.private.recipe op for the given type or create it if it does not
diff --git a/flang/include/flang/Optimizer/Transforms/CUFCommon.h b/flang/include/flang/Optimizer/Builder/CUFCommon.h
index df1b709..b99e330 100644
--- a/flang/include/flang/Optimizer/Transforms/CUFCommon.h
+++ b/flang/include/flang/Optimizer/Builder/CUFCommon.h
@@ -15,6 +15,10 @@
 
 static constexpr llvm::StringRef cudaDeviceModuleName = "cuda_device_mod";
 
+namespace fir {
+class FirOpBuilder;
+} // namespace fir
+
 namespace cuf {
 
 /// Retrieve or create the CUDA Fortran GPU module in the given \p mod.
@@ -24,6 +28,8 @@ mlir::gpu::GPUModuleOp getOrCreateGPUModule(mlir::ModuleOp mod,
 bool isInCUDADeviceContext(mlir::Operation *op);
 bool isRegisteredDeviceGlobal(fir::GlobalOp op);
 
+void genPointerSync(const mlir::Value box, fir::FirOpBuilder &builder);
+
 } // namespace cuf
 
 #endif // FORTRAN_OPTIMIZER_TRANSFORMS_CUFCOMMON_H_
diff --git a/flang/include/flang/Optimizer/Builder/FIRBuilder.h b/flang/include/flang/Optimizer/Builder/FIRBuilder.h
index 6ee4370..c5d86e7 100644
--- a/flang/include/flang/Optimizer/Builder/FIRBuilder.h
+++ b/flang/include/flang/Optimizer/Builder/FIRBuilder.h
@@ -769,6 +769,11 @@ mlir::Value genMaxWithZero(fir::FirOpBuilder &builder, mlir::Location loc,
 mlir::Value genCPtrOrCFunptrAddr(fir::FirOpBuilder &builder, mlir::Location loc,
                                  mlir::Value cPtr, mlir::Type ty);
 
+/// The type(C_DEVPTR) is defined as the derived type with only one
+/// component of C_PTR type. Get the C address from the C_PTR component.
+mlir::Value genCDevPtrAddr(fir::FirOpBuilder &builder, mlir::Location loc,
+                           mlir::Value cDevPtr, mlir::Type ty);
+
 /// Get the C address value.
 mlir::Value genCPtrOrCFunptrValue(fir::FirOpBuilder &builder,
                                   mlir::Location loc, mlir::Value cPtr);
diff --git a/flang/include/flang/Optimizer/Builder/IntrinsicCall.h b/flang/include/flang/Optimizer/Builder/IntrinsicCall.h
index 3d05165..18f84c7 100644
--- a/flang/include/flang/Optimizer/Builder/IntrinsicCall.h
+++ b/flang/include/flang/Optimizer/Builder/IntrinsicCall.h
@@ -214,6 +214,7 @@ struct IntrinsicLibrary {
                                            llvm::ArrayRef<fir::ExtendedValue>);
   fir::ExtendedValue genCAssociatedCPtr(mlir::Type,
                                         llvm::ArrayRef<fir::ExtendedValue>);
+  fir::ExtendedValue genCDevLoc(mlir::Type, llvm::ArrayRef<fir::ExtendedValue>);
   mlir::Value genErfcScaled(mlir::Type resultType,
                             llvm::ArrayRef<mlir::Value> args);
   void genCFPointer(llvm::ArrayRef<fir::ExtendedValue>);
diff --git a/flang/include/flang/Optimizer/Dialect/CUF/CUFOps.td b/flang/include/flang/Optimizer/Dialect/CUF/CUFOps.td
index d06587c..9a31ffa 100644
--- a/flang/include/flang/Optimizer/Dialect/CUF/CUFOps.td
+++ b/flang/include/flang/Optimizer/Dialect/CUF/CUFOps.td
@@ -140,6 +140,17 @@ def cuf_DeallocateOp : cuf_Op<"deallocate",
   let hasVerifier = 1;
 }
 
+def cuf_SyncDescriptorOp : cuf_Op<"sync_descriptor", []> {
+  let summary =
+      "Synchronize the host and device descriptor of a Fortran pointer";
+
+  let arguments = (ins SymbolRefAttr:$globalName);
+
+  let assemblyFormat = [{
+    $globalName attr-dict
+  }];
+}
+
 def cuf_DataTransferOp : cuf_Op<"data_transfer", []> {
   let summary = "Represent a data transfer between host and device memory";
 
diff --git a/flang/include/flang/Optimizer/Dialect/CanonicalizationPatterns.td b/flang/include/flang/Optimizer/Dialect/CanonicalizationPatterns.td
index 1dbde5c..2414de4 100644
--- a/flang/include/flang/Optimizer/Dialect/CanonicalizationPatterns.td
+++ b/flang/include/flang/Optimizer/Dialect/CanonicalizationPatterns.td
@@ -57,6 +57,9 @@ def StrictSmallerWidthPred : Constraint<CPred<
                        "$0.getType().getIntOrFloatBitWidth() < "
                        "$1.getType().getIntOrFloatBitWidth()">>;
 
+def PointerCompatiblePred
+    : Constraint<CPred<"fir::ConvertOp::isPointerCompatible($0.getType())">>;
+
 // floats or ints that undergo successive extensions or successive truncations.
 def ConvertConvertOptPattern
     : Pat<(fir_ConvertOp:$res (fir_ConvertOp:$irm $arg)),
@@ -112,4 +115,18 @@ def ForwardConstantConvertPattern
           (createConstantOp $res, $attr),
           [(IndexTypePred $res), (IntegerTypePred $cnt)]>;
 
+// Optimize redundant pointer conversions, e.g.:
+// %1 = fir.convert %0 :
+//     (!fir.heap<!fir.array<2xf32>>) -> !fir.ref<!fir.array<2xf32>>
+// %2 = fir.convert %1 :
+//     (!fir.ref<!fir.array<2xf32>>) -> !fir.heap<!fir.array<2xf32>>
+// Will be optimized into:
+// %2 = fir.convert %0 :
+//     (!fir.heap<!fir.array<2xf32>>) -> !fir.heap<!fir.array<2xf32>>
+// which is redundant due to RedundantConvertOptPattern.
+def ChainedPointerConvertsPattern
+    : Pat<(fir_ConvertOp:$res(fir_ConvertOp:$irm $arg)), (fir_ConvertOp $arg),
+          [(PointerCompatiblePred $arg), (PointerCompatiblePred $irm),
+           (PointerCompatiblePred $res)]>;
+
 #endif // FORTRAN_FIR_REWRITE_PATTERNS
diff --git a/flang/include/flang/Optimizer/HLFIR/Passes.td b/flang/include/flang/Optimizer/HLFIR/Passes.td
index ed49f50..644f1e3 100644
--- a/flang/include/flang/Optimizer/HLFIR/Passes.td
+++ b/flang/include/flang/Optimizer/HLFIR/Passes.td
@@ -49,4 +49,8 @@ def InlineElementals : Pass<"inline-elementals"> {
   let summary = "Inline chained hlfir.elemental operations";
 }
 
+def InlineHLFIRAssign : Pass<"inline-hlfir-assign"> {
+  let summary = "Inline hlfir.assign operations";
+}
+
 #endif //FORTRAN_DIALECT_HLFIR_PASSES
diff --git a/flang/include/flang/Parser/dump-parse-tree.h b/flang/include/flang/Parser/dump-parse-tree.h
index 7821d40..3331520 100644
--- a/flang/include/flang/Parser/dump-parse-tree.h
+++ b/flang/include/flang/Parser/dump-parse-tree.h
@@ -486,6 +486,7 @@ public:
   NODE(parser, OmpAffinityClause)
   NODE(OmpAffinityClause, Modifier)
   NODE(parser, OmpAlignment)
+  NODE(parser, OmpAlignClause)
   NODE(parser, OmpAlignedClause)
   NODE(OmpAlignedClause, Modifier)
   NODE(parser, OmpAtClause)
@@ -516,6 +517,8 @@ public:
 #include "llvm/Frontend/OpenMP/OMP.inc"
   NODE(parser, OmpClauseList)
   NODE(parser, OmpCriticalDirective)
+  NODE(parser, OmpErrorDirective)
+  NODE(parser, OmpNothingDirective)
   NODE(parser, OmpDeclareTargetSpecifier)
   NODE(parser, OmpDeclareTargetWithClause)
   NODE(parser, OmpDeclareTargetWithList)
@@ -662,7 +665,7 @@ public:
   NODE(parser, OmpAtomicDefaultMemOrderClause)
   NODE_ENUM(common, OmpAtomicDefaultMemOrderType)
   NODE(parser, OpenMPDepobjConstruct)
-  NODE(parser, OpenMPErrorConstruct)
+  NODE(parser, OpenMPUtilityConstruct)
   NODE(parser, OpenMPFlushConstruct)
   NODE(parser, OpenMPLoopConstruct)
   NODE(parser, OpenMPExecutableAllocate)
diff --git a/flang/include/flang/Parser/parse-tree.h b/flang/include/flang/Parser/parse-tree.h
index 2ef593b..941d70d 100644
--- a/flang/include/flang/Parser/parse-tree.h
+++ b/flang/include/flang/Parser/parse-tree.h
@@ -3760,6 +3760,11 @@ struct OmpAffinityClause {
   std::tuple<MODIFIERS(), OmpObjectList> t;
 };
 
+// Ref: 5.2: [174]
+struct OmpAlignClause {
+  WRAPPER_CLASS_BOILERPLATE(OmpAlignClause, ScalarIntExpr);
+};
+
 // Ref: [4.5:72-81], [5.0:110-119], [5.1:134-143], [5.2:169-170]
 //
 // aligned-clause ->
@@ -4182,6 +4187,30 @@ struct OmpClauseList {
 
 // --- Directives and constructs
 
+// Ref: [5.1:89-90], [5.2:216]
+//
+// nothing-directive ->
+//    NOTHING                                     // since 5.1
+struct OmpNothingDirective {
+  using EmptyTrait = std::true_type;
+  COPY_AND_ASSIGN_BOILERPLATE(OmpNothingDirective);
+  CharBlock source;
+};
+
+// Ref: OpenMP [5.2:216-218]
+// ERROR AT(compilation|execution) SEVERITY(fatal|warning) MESSAGE("msg-str)
+struct OmpErrorDirective {
+  TUPLE_CLASS_BOILERPLATE(OmpErrorDirective);
+  CharBlock source;
+  std::tuple<Verbatim, OmpClauseList> t;
+};
+
+struct OpenMPUtilityConstruct {
+  UNION_CLASS_BOILERPLATE(OpenMPUtilityConstruct);
+  CharBlock source;
+  std::variant<OmpErrorDirective, OmpNothingDirective> u;
+};
+
 // 2.7.2 SECTIONS
 // 2.11.2 PARALLEL SECTIONS
 struct OmpSectionsDirective {
@@ -4318,7 +4347,7 @@ struct OpenMPDeclarativeConstruct {
   std::variant<OpenMPDeclarativeAllocate, OpenMPDeclareMapperConstruct,
       OpenMPDeclareReductionConstruct, OpenMPDeclareSimdConstruct,
       OpenMPDeclareTargetConstruct, OpenMPThreadprivate,
-      OpenMPRequiresConstruct>
+      OpenMPRequiresConstruct, OpenMPUtilityConstruct>
       u;
 };
 
@@ -4506,14 +4535,6 @@ struct OpenMPDepobjConstruct {
   std::tuple<Verbatim, OmpObject, OmpClause> t;
 };
 
-// Ref: OpenMP [5.2:216-218]
-// ERROR AT(compilation|execution) SEVERITY(fatal|warning) MESSAGE("msg-str)
-struct OpenMPErrorConstruct {
-  TUPLE_CLASS_BOILERPLATE(OpenMPErrorConstruct);
-  CharBlock source;
-  std::tuple<Verbatim, OmpClauseList> t;
-};
-
 // 2.17.8 flush -> FLUSH [memory-order-clause] [(variable-name-list)]
 struct OpenMPFlushConstruct {
   TUPLE_CLASS_BOILERPLATE(OpenMPFlushConstruct);
@@ -4586,7 +4607,7 @@ struct OpenMPConstruct {
   UNION_CLASS_BOILERPLATE(OpenMPConstruct);
   std::variant<OpenMPStandaloneConstruct, OpenMPSectionsConstruct,
       OpenMPSectionConstruct, OpenMPLoopConstruct, OpenMPBlockConstruct,
-      OpenMPAtomicConstruct, OpenMPDeclarativeAllocate, OpenMPErrorConstruct,
+      OpenMPAtomicConstruct, OpenMPDeclarativeAllocate, OpenMPUtilityConstruct,
       OpenMPExecutableAllocate, OpenMPAllocatorsConstruct,
       OpenMPCriticalConstruct>
       u;
diff --git a/flang/include/flang/Runtime/CUDA/allocator.h b/flang/include/flang/Runtime/CUDA/allocator.h
index b6f0e7f..4fb4c94 100644
--- a/flang/include/flang/Runtime/CUDA/allocator.h
+++ b/flang/include/flang/Runtime/CUDA/allocator.h
@@ -20,16 +20,16 @@ extern "C" {
 void RTDECL(CUFRegisterAllocator)();
 }
 
-void *CUFAllocPinned(std::size_t, std::int64_t = kCudaNoStream);
+void *CUFAllocPinned(std::size_t);
 void CUFFreePinned(void *);
 
-void *CUFAllocDevice(std::size_t, std::int64_t);
+void *CUFAllocDevice(std::size_t);
 void CUFFreeDevice(void *);
 
-void *CUFAllocManaged(std::size_t, std::int64_t = kCudaNoStream);
+void *CUFAllocManaged(std::size_t);
 void CUFFreeManaged(void *);
 
-void *CUFAllocUnified(std::size_t, std::int64_t = kCudaNoStream);
+void *CUFAllocUnified(std::size_t);
 void CUFFreeUnified(void *);
 
 } // namespace Fortran::runtime::cuda
diff --git a/flang/include/flang/Runtime/CUDA/common.h b/flang/include/flang/Runtime/CUDA/common.h
index 9c95f72..474f8e6 100644
--- a/flang/include/flang/Runtime/CUDA/common.h
+++ b/flang/include/flang/Runtime/CUDA/common.h
@@ -23,9 +23,6 @@ static constexpr unsigned kHostToDevice = 0;
 static constexpr unsigned kDeviceToHost = 1;
 static constexpr unsigned kDeviceToDevice = 2;
 
-/// Value used for asyncId when no specific stream is specified.
-static constexpr std::int64_t kCudaNoStream = -1;
-
 #define CUDA_REPORT_IF_ERROR(expr) \
   [](cudaError_t err) { \
     if (err == cudaSuccess) \
diff --git a/flang/include/flang/Runtime/CUDA/descriptor.h b/flang/include/flang/Runtime/CUDA/descriptor.h
index 55878aaa..0ee7fec 100644
--- a/flang/include/flang/Runtime/CUDA/descriptor.h
+++ b/flang/include/flang/Runtime/CUDA/descriptor.h
@@ -33,6 +33,10 @@ void *RTDECL(CUFGetDeviceAddress)(
 void RTDECL(CUFDescriptorSync)(Descriptor *dst, const Descriptor *src,
     const char *sourceFile = nullptr, int sourceLine = 0);
 
+/// Get the device address of registered with the \p hostPtr and sync them.
+void RTDECL(CUFSyncGlobalDescriptor)(
+    void *hostPtr, const char *sourceFile = nullptr, int sourceLine = 0);
+
 } // extern "C"
 
 } // namespace Fortran::runtime::cuda
diff --git a/flang/include/flang/Runtime/allocatable.h b/flang/include/flang/Runtime/allocatable.h
index 121c31a..58061d9 100644
--- a/flang/include/flang/Runtime/allocatable.h
+++ b/flang/include/flang/Runtime/allocatable.h
@@ -94,9 +94,9 @@ int RTDECL(AllocatableCheckLengthParameter)(Descriptor &,
 // Successfully allocated memory is initialized if the allocatable has a
 // derived type, and is always initialized by AllocatableAllocateSource().
 // Performs all necessary coarray synchronization and validation actions.
-int RTDECL(AllocatableAllocate)(Descriptor &, std::int64_t asyncId = -1,
-    bool hasStat = false, const Descriptor *errMsg = nullptr,
-    const char *sourceFile = nullptr, int sourceLine = 0);
+int RTDECL(AllocatableAllocate)(Descriptor &, bool hasStat = false,
+    const Descriptor *errMsg = nullptr, const char *sourceFile = nullptr,
+    int sourceLine = 0);
 int RTDECL(AllocatableAllocateSource)(Descriptor &, const Descriptor &source,
     bool hasStat = false, const Descriptor *errMsg = nullptr,
     const char *sourceFile = nullptr, int sourceLine = 0);
diff --git a/flang/include/flang/Runtime/allocator-registry.h b/flang/include/flang/Runtime/allocator-registry.h
index 4c3295e..29302c5 100644
--- a/flang/include/flang/Runtime/allocator-registry.h
+++ b/flang/include/flang/Runtime/allocator-registry.h
@@ -11,7 +11,6 @@
 
 #include "flang/Common/api-attrs.h"
 #include "flang/Runtime/allocator-registry-consts.h"
-#include <cstdint>
 #include <cstdlib>
 #include <vector>
 
@@ -19,7 +18,7 @@
 
 namespace Fortran::runtime {
 
-using AllocFct = void *(*)(std::size_t, std::int64_t);
+using AllocFct = void *(*)(std::size_t);
 using FreeFct = void (*)(void *);
 
 typedef struct Allocator_t {
@@ -27,11 +26,10 @@ typedef struct Allocator_t {
   FreeFct free{nullptr};
 } Allocator_t;
 
-static RT_API_ATTRS void *MallocWrapper(
-    std::size_t size, [[maybe_unused]] std::int64_t) {
+#ifdef RT_DEVICE_COMPILATION
+static RT_API_ATTRS void *MallocWrapper(std::size_t size) {
   return std::malloc(size);
 }
-#ifdef RT_DEVICE_COMPILATION
 static RT_API_ATTRS void FreeWrapper(void *p) { return std::free(p); }
 #endif
 
@@ -41,7 +39,7 @@ struct AllocatorRegistry {
       : allocators{{&MallocWrapper, &FreeWrapper}} {}
 #else
   constexpr AllocatorRegistry() {
-    allocators[kDefaultAllocator] = {&MallocWrapper, &std::free};
+    allocators[kDefaultAllocator] = {&std::malloc, &std::free};
   };
 #endif
   RT_API_ATTRS void Register(int, Allocator_t);
diff --git a/flang/include/flang/Runtime/descriptor.h b/flang/include/flang/Runtime/descriptor.h
index 44e82c6..dd36fba 100644
--- a/flang/include/flang/Runtime/descriptor.h
+++ b/flang/include/flang/Runtime/descriptor.h
@@ -369,7 +369,7 @@ public:
   // before calling.  It (re)computes the byte strides after
   // allocation.  Does not allocate automatic components or
   // perform default component initialization.
-  RT_API_ATTRS int Allocate(std::int64_t asyncId = -1);
+  RT_API_ATTRS int Allocate();
   RT_API_ATTRS void SetByteStrides();
 
   // Deallocates storage; does not call FINAL subroutines or
diff --git a/flang/include/flang/Semantics/symbol.h b/flang/include/flang/Semantics/symbol.h
index 2f97efd..bc6abcc 100644
--- a/flang/include/flang/Semantics/symbol.h
+++ b/flang/include/flang/Semantics/symbol.h
@@ -861,23 +861,7 @@ public:
   bool operator!=(const Symbol &that) const { return !(*this == that); }
 
   int Rank() const { return RankImpl(); }
-
-  int Corank() const {
-    return common::visit(
-        common::visitors{
-            [](const SubprogramDetails &sd) {
-              return sd.isFunction() ? sd.result().Corank() : 0;
-            },
-            [](const GenericDetails &) {
-              return 0; /*TODO*/
-            },
-            [](const UseDetails &x) { return x.symbol().Corank(); },
-            [](const HostAssocDetails &x) { return x.symbol().Corank(); },
-            [](const ObjectEntityDetails &oed) { return oed.coshape().Rank(); },
-            [](const auto &) { return 0; },
-        },
-        details_);
-  }
+  int Corank() const { return CorankImpl(); }
 
   // If there is a parent component, return a pointer to its derived type spec.
   // The Scope * argument defaults to this->scope_ but should be overridden
@@ -955,6 +939,32 @@ private:
         },
         details_);
   }
+  inline int CorankImpl(int depth = startRecursionDepth) const {
+    if (depth-- == 0) {
+      return 0;
+    }
+    return common::visit(
+        common::visitors{
+            [&](const SubprogramDetails &sd) {
+              return sd.isFunction() ? sd.result().CorankImpl(depth) : 0;
+            },
+            [](const GenericDetails &) { return 0; },
+            [&](const ProcEntityDetails &ped) {
+              const Symbol *iface{ped.procInterface()};
+              return iface ? iface->CorankImpl(depth) : 0;
+            },
+            [&](const UseDetails &x) { return x.symbol().CorankImpl(depth); },
+            [&](const HostAssocDetails &x) {
+              return x.symbol().CorankImpl(depth);
+            },
+            [](const ObjectEntityDetails &oed) { return oed.coshape().Rank(); },
+            [](const AssocEntityDetails &aed) {
+              return aed.expr() ? aed.expr()->Corank() : 0;
+            },
+            [](const auto &) { return 0; },
+        },
+        details_);
+  }
   template <std::size_t> friend class Symbols;
   template <class, std::size_t> friend class std::array;
 };
diff --git a/flang/include/flang/Tools/CrossToolHelpers.h b/flang/include/flang/Tools/CrossToolHelpers.h
index c0091e1..0286f2a 100644
--- a/flang/include/flang/Tools/CrossToolHelpers.h
+++ b/flang/include/flang/Tools/CrossToolHelpers.h
@@ -174,7 +174,7 @@ struct OffloadModuleOpts {
 //  Shares assinging of the OpenMP OffloadModuleInterface and its assorted
 //  attributes accross Flang tools (bbc/flang)
 [[maybe_unused]] static void setOffloadModuleInterfaceAttributes(
-    mlir::ModuleOp &module, OffloadModuleOpts Opts) {
+    mlir::ModuleOp module, OffloadModuleOpts Opts) {
   // Should be registered by the OpenMPDialect
   if (auto offloadMod = llvm::dyn_cast<mlir::omp::OffloadModuleInterface>(
           module.getOperation())) {
@@ -198,7 +198,7 @@ struct OffloadModuleOpts {
 }
 
 [[maybe_unused]] static void setOpenMPVersionAttribute(
-    mlir::ModuleOp &module, int64_t version) {
+    mlir::ModuleOp module, int64_t version) {
   module.getOperation()->setAttr(
       mlir::StringAttr::get(module.getContext(), llvm::Twine{"omp.version"}),
       mlir::omp::VersionAttr::get(module.getContext(), version));
diff --git a/flang/lib/Common/Fortran.cpp b/flang/lib/Common/Fortran.cpp
index 367a51f..eec8341 100644
--- a/flang/lib/Common/Fortran.cpp
+++ b/flang/lib/Common/Fortran.cpp
@@ -136,7 +136,8 @@ bool AreCompatibleCUDADataAttrs(std::optional<CUDADataAttr> x,
       if (*x == CUDADataAttr::Device) {
         if ((y &&
                 (*y == CUDADataAttr::Managed || *y == CUDADataAttr::Unified ||
-                    *y == CUDADataAttr::Shared)) ||
+                    *y == CUDADataAttr::Shared ||
+                    *y == CUDADataAttr::Constant)) ||
             (!y && (isCudaUnified || isCudaManaged))) {
           if (y && *y == CUDADataAttr::Shared && warning) {
             *warning = "SHARED attribute ignored"s;
diff --git a/flang/lib/Evaluate/characteristics.cpp b/flang/lib/Evaluate/characteristics.cpp
index 324d6b8..3912d1c 100644
--- a/flang/lib/Evaluate/characteristics.cpp
+++ b/flang/lib/Evaluate/characteristics.cpp
@@ -227,15 +227,14 @@ void TypeAndShape::AcquireAttrs(const semantics::Symbol &symbol) {
   } else if (semantics::IsAssumedSizeArray(symbol)) {
     attrs_.set(Attr::AssumedSize);
   }
+  if (int n{GetCorank(symbol)}) {
+    corank_ = n;
+    attrs_.set(Attr::Coarray);
+  }
   if (const auto *object{
-          symbol.GetUltimate().detailsIf<semantics::ObjectEntityDetails>()}) {
-    corank_ = object->coshape().Rank();
-    if (object->IsAssumedRank()) {
-      attrs_.set(Attr::AssumedRank);
-    }
-    if (object->IsCoarray()) {
-      attrs_.set(Attr::Coarray);
-    }
+          symbol.GetUltimate().detailsIf<semantics::ObjectEntityDetails>()};
+      object && object->IsAssumedRank()) {
+    attrs_.set(Attr::AssumedRank);
   }
 }
 
diff --git a/flang/lib/Evaluate/expression.cpp b/flang/lib/Evaluate/expression.cpp
index 9514ac8..759fe5b 100644
--- a/flang/lib/Evaluate/expression.cpp
+++ b/flang/lib/Evaluate/expression.cpp
@@ -113,6 +113,18 @@ template <typename A> int ExpressionBase<A>::Rank() const {
       derived().u);
 }
 
+template <typename A> int ExpressionBase<A>::Corank() const {
+  return common::visit(
+      [](const auto &x) {
+        if constexpr (common::HasMember<decltype(x), TypelessExpression>) {
+          return 0;
+        } else {
+          return x.Corank();
+        }
+      },
+      derived().u);
+}
+
 DynamicType Parentheses<SomeDerived>::GetType() const {
   return left().GetType().value();
 }
diff --git a/flang/lib/Evaluate/fold-integer.cpp b/flang/lib/Evaluate/fold-integer.cpp
index 26ae33f..352dec4 100644
--- a/flang/lib/Evaluate/fold-integer.cpp
+++ b/flang/lib/Evaluate/fold-integer.cpp
@@ -71,6 +71,28 @@ static bool CheckDimArg(const std::optional<ActualArgument> &dimArg,
   return true;
 }
 
+static bool CheckCoDimArg(const std::optional<ActualArgument> &dimArg,
+    const Symbol &symbol, parser::ContextualMessages &messages,
+    std::optional<int> &dimVal) {
+  dimVal.reset();
+  if (int corank{symbol.Corank()}; corank > 0) {
+    if (auto dim64{ToInt64(dimArg)}) {
+      if (*dim64 < 1) {
+        messages.Say("DIM=%jd dimension must be positive"_err_en_US, *dim64);
+        return false;
+      } else if (*dim64 > corank) {
+        messages.Say(
+            "DIM=%jd dimension is out of range for corank-%d coarray"_err_en_US,
+            *dim64, corank);
+        return false;
+      } else {
+        dimVal = static_cast<int>(*dim64 - 1); // 1-based to 0-based
+      }
+    }
+  }
+  return true;
+}
+
 // Class to retrieve the constant bound of an expression which is an
 // array that devolves to a type of Constant<T>
 class GetConstantArrayBoundHelper {
@@ -264,6 +286,37 @@ Expr<Type<TypeCategory::Integer, KIND>> UBOUND(FoldingContext &context,
   return Expr<T>{std::move(funcRef)};
 }
 
+// LCOBOUND() & UCOBOUND()
+template <int KIND>
+Expr<Type<TypeCategory::Integer, KIND>> COBOUND(FoldingContext &context,
+    FunctionRef<Type<TypeCategory::Integer, KIND>> &&funcRef, bool isUCOBOUND) {
+  using T = Type<TypeCategory::Integer, KIND>;
+  ActualArguments &args{funcRef.arguments()};
+  if (const Symbol * coarray{UnwrapWholeSymbolOrComponentDataRef(args[0])}) {
+    std::optional<int> dim;
+    if (funcRef.Rank() == 0) {
+      // Optional DIM= argument is present: result is scalar.
+      if (!CheckCoDimArg(args[1], *coarray, context.messages(), dim)) {
+        return MakeInvalidIntrinsic<T>(std::move(funcRef));
+      } else if (!dim) {
+        // DIM= is present but not constant, or error
+        return Expr<T>{std::move(funcRef)};
+      }
+    }
+    if (dim) {
+      if (auto cb{isUCOBOUND ? GetUCOBOUND(*coarray, *dim)
+                             : GetLCOBOUND(*coarray, *dim)}) {
+        return Fold(context, ConvertToType<T>(std::move(*cb)));
+      }
+    } else if (auto cbs{
+                   AsExtentArrayExpr(isUCOBOUND ? GetUCOBOUNDs(*coarray)
+                                                : GetLCOBOUNDs(*coarray))}) {
+      return Fold(context, ConvertToType<T>(Expr<ExtentType>{std::move(*cbs)}));
+    }
+  }
+  return Expr<T>{std::move(funcRef)};
+}
+
 // COUNT()
 template <typename T, int MASK_KIND> class CountAccumulator {
   using MaskT = Type<TypeCategory::Logical, MASK_KIND>;
@@ -1105,6 +1158,8 @@ Expr<Type<TypeCategory::Integer, KIND>> FoldIntrinsicFunction(
     }
   } else if (name == "lbound") {
     return LBOUND(context, std::move(funcRef));
+  } else if (name == "lcobound") {
+    return COBOUND(context, std::move(funcRef), /*isUCOBOUND=*/false);
   } else if (name == "leadz" || name == "trailz" || name == "poppar" ||
       name == "popcnt") {
     if (auto *sn{UnwrapExpr<Expr<SomeKind<T::category>>>(args[0])}) {
@@ -1396,6 +1451,8 @@ Expr<Type<TypeCategory::Integer, KIND>> FoldIntrinsicFunction(
     }
   } else if (name == "ubound") {
     return UBOUND(context, std::move(funcRef));
+  } else if (name == "ucobound") {
+    return COBOUND(context, std::move(funcRef), /*isUCOBOUND=*/true);
   } else if (name == "__builtin_numeric_storage_size") {
     if (!context.moduleFileName()) {
       // Don't fold this reference until it appears in the module file
diff --git a/flang/lib/Evaluate/intrinsics.cpp b/flang/lib/Evaluate/intrinsics.cpp
index 28805ef..0dc8e12 100644
--- a/flang/lib/Evaluate/intrinsics.cpp
+++ b/flang/lib/Evaluate/intrinsics.cpp
@@ -958,7 +958,7 @@ static const IntrinsicInterface genericIntrinsicFunction[]{
         {{"coarray", AnyData, Rank::coarray}, RequiredDIM, OptionalTEAM},
         DefaultInt, Rank::scalar, IntrinsicClass::transformationalFunction},
     {"this_image", {{"coarray", AnyData, Rank::coarray}, OptionalTEAM},
-        DefaultInt, Rank::scalar, IntrinsicClass::transformationalFunction},
+        DefaultInt, Rank::vector, IntrinsicClass::transformationalFunction},
     {"this_image", {OptionalTEAM}, DefaultInt, Rank::scalar,
         IntrinsicClass::transformationalFunction},
     {"tiny",
@@ -2663,6 +2663,8 @@ private:
       ActualArguments &, FoldingContext &) const;
   std::optional<SpecificCall> HandleC_Loc(
       ActualArguments &, FoldingContext &) const;
+  std::optional<SpecificCall> HandleC_Devloc(
+      ActualArguments &, FoldingContext &) const;
   const std::string &ResolveAlias(const std::string &name) const {
     auto iter{aliases_.find(name)};
     return iter == aliases_.end() ? name : iter->second;
@@ -2690,7 +2692,8 @@ bool IntrinsicProcTable::Implementation::IsIntrinsicFunction(
     return true;
   }
   // special cases
-  return name == "__builtin_c_loc" || name == "null";
+  return name == "__builtin_c_loc" || name == "__builtin_c_devloc" ||
+      name == "null";
 }
 bool IntrinsicProcTable::Implementation::IsIntrinsicSubroutine(
     const std::string &name0) const {
@@ -3080,6 +3083,73 @@ std::optional<SpecificCall> IntrinsicProcTable::Implementation::HandleC_Loc(
   return std::nullopt;
 }
 
+// CUDA Fortran C_DEVLOC(x)
+std::optional<SpecificCall> IntrinsicProcTable::Implementation::HandleC_Devloc(
+    ActualArguments &arguments, FoldingContext &context) const {
+  static const char *const keywords[]{"cptr", nullptr};
+
+  if (CheckAndRearrangeArguments(arguments, context.messages(), keywords)) {
+    CHECK(arguments.size() == 1);
+    const auto *expr{arguments[0].value().UnwrapExpr()};
+    if (auto typeAndShape{characteristics::TypeAndShape::Characterize(
+            arguments[0], context)}) {
+      if (expr && !IsContiguous(*expr, context).value_or(true)) {
+        context.messages().Say(arguments[0]->sourceLocation(),
+            "C_DEVLOC() argument must be contiguous"_err_en_US);
+      }
+      if (auto constExtents{AsConstantExtents(context, typeAndShape->shape())};
+          constExtents && GetSize(*constExtents) == 0) {
+        context.messages().Say(arguments[0]->sourceLocation(),
+            "C_DEVLOC() argument may not be a zero-sized array"_err_en_US);
+      }
+      if (!(typeAndShape->type().category() != TypeCategory::Derived ||
+              typeAndShape->type().IsAssumedType() ||
+              (!typeAndShape->type().IsPolymorphic() &&
+                  CountNonConstantLenParameters(
+                      typeAndShape->type().GetDerivedTypeSpec()) == 0))) {
+        context.messages().Say(arguments[0]->sourceLocation(),
+            "C_DEVLOC() argument must have an intrinsic type, assumed type, or non-polymorphic derived type with no non-constant length parameter"_err_en_US);
+      } else if (typeAndShape->type().knownLength().value_or(1) == 0) {
+        context.messages().Say(arguments[0]->sourceLocation(),
+            "C_DEVLOC() argument may not be zero-length character"_err_en_US);
+      } else if (typeAndShape->type().category() != TypeCategory::Derived &&
+          !IsInteroperableIntrinsicType(typeAndShape->type()).value_or(true)) {
+        if (typeAndShape->type().category() == TypeCategory::Character &&
+            typeAndShape->type().kind() == 1) {
+          // Default character kind, but length is not known to be 1
+          if (context.languageFeatures().ShouldWarn(
+                  common::UsageWarning::CharacterInteroperability)) {
+            context.messages().Say(
+                common::UsageWarning::CharacterInteroperability,
+                arguments[0]->sourceLocation(),
+                "C_DEVLOC() argument has non-interoperable character length"_warn_en_US);
+          }
+        } else if (context.languageFeatures().ShouldWarn(
+                       common::UsageWarning::Interoperability)) {
+          context.messages().Say(common::UsageWarning::Interoperability,
+              arguments[0]->sourceLocation(),
+              "C_DEVLOC() argument has non-interoperable intrinsic type or kind"_warn_en_US);
+        }
+      }
+
+      characteristics::DummyDataObject ddo{std::move(*typeAndShape)};
+      ddo.intent = common::Intent::In;
+      return SpecificCall{
+          SpecificIntrinsic{"__builtin_c_devloc"s,
+              characteristics::Procedure{
+                  characteristics::FunctionResult{
+                      DynamicType{GetBuiltinDerivedType(
+                          builtinsScope_, "__builtin_c_devptr")}},
+                  characteristics::DummyArguments{
+                      characteristics::DummyArgument{"cptr"s, std::move(ddo)}},
+                  characteristics::Procedure::Attrs{
+                      characteristics::Procedure::Attr::Pure}}},
+          std::move(arguments)};
+    }
+  }
+  return std::nullopt;
+}
+
 static bool CheckForNonPositiveValues(FoldingContext &context,
     const ActualArgument &arg, const std::string &procName,
     const std::string &argName) {
@@ -3119,27 +3189,6 @@ static bool CheckForNonPositiveValues(FoldingContext &context,
   return ok;
 }
 
-static bool CheckDimAgainstCorank(SpecificCall &call, FoldingContext &context) {
-  bool ok{true};
-  if (const auto &coarrayArg{call.arguments[0]}) {
-    if (const auto &dimArg{call.arguments[1]}) {
-      if (const auto *symbol{
-              UnwrapWholeSymbolDataRef(coarrayArg->UnwrapExpr())}) {
-        const auto corank = symbol->Corank();
-        if (const auto dimNum{ToInt64(dimArg->UnwrapExpr())}) {
-          if (dimNum < 1 || dimNum > corank) {
-            ok = false;
-            context.messages().Say(dimArg->sourceLocation(),
-                "DIM=%jd dimension is out of range for coarray with corank %d"_err_en_US,
-                static_cast<std::intmax_t>(*dimNum), corank);
-          }
-        }
-      }
-    }
-  }
-  return ok;
-}
-
 static bool CheckAtomicDefineAndRef(FoldingContext &context,
     const std::optional<ActualArgument> &atomArg,
     const std::optional<ActualArgument> &valueArg,
@@ -3207,8 +3256,6 @@ static bool ApplySpecificChecks(SpecificCall &call, FoldingContext &context) {
     if (const auto &arg{call.arguments[0]}) {
       ok = CheckForNonPositiveValues(context, *arg, name, "image");
     }
-  } else if (name == "lcobound") {
-    return CheckDimAgainstCorank(call, context);
   } else if (name == "loc") {
     const auto &arg{call.arguments[0]};
     ok =
@@ -3218,8 +3265,6 @@ static bool ApplySpecificChecks(SpecificCall &call, FoldingContext &context) {
           arg ? arg->sourceLocation() : context.messages().at(),
           "Argument of LOC() must be an object or procedure"_err_en_US);
     }
-  } else if (name == "ucobound") {
-    return CheckDimAgainstCorank(call, context);
   }
   return ok;
 }
@@ -3270,6 +3315,8 @@ std::optional<SpecificCall> IntrinsicProcTable::Implementation::Probe(
   } else { // function
     if (call.name == "__builtin_c_loc") {
       return HandleC_Loc(arguments, context);
+    } else if (call.name == "__builtin_c_devloc") {
+      return HandleC_Devloc(arguments, context);
     } else if (call.name == "null") {
       return HandleNull(arguments, context);
     }
diff --git a/flang/lib/Evaluate/shape.cpp b/flang/lib/Evaluate/shape.cpp
index c62d0cb..c7b2156 100644
--- a/flang/lib/Evaluate/shape.cpp
+++ b/flang/lib/Evaluate/shape.cpp
@@ -723,6 +723,58 @@ Shape GetUBOUNDs(const NamedEntity &base, bool invariantOnly) {
   return GetUBOUNDs(nullptr, base, invariantOnly);
 }
 
+MaybeExtentExpr GetLCOBOUND(
+    const Symbol &symbol0, int dimension, bool invariantOnly) {
+  const Symbol &symbol{ResolveAssociations(symbol0)};
+  if (const auto *object{symbol.detailsIf<semantics::ObjectEntityDetails>()}) {
+    int corank{object->coshape().Rank()};
+    if (dimension < corank) {
+      const semantics::ShapeSpec &shapeSpec{object->coshape()[dimension]};
+      if (const auto &lcobound{shapeSpec.lbound().GetExplicit()}) {
+        if (!invariantOnly || IsScopeInvariantExpr(*lcobound)) {
+          return *lcobound;
+        }
+      }
+    }
+  }
+  return std::nullopt;
+}
+
+MaybeExtentExpr GetUCOBOUND(
+    const Symbol &symbol0, int dimension, bool invariantOnly) {
+  const Symbol &symbol{ResolveAssociations(symbol0)};
+  if (const auto *object{symbol.detailsIf<semantics::ObjectEntityDetails>()}) {
+    int corank{object->coshape().Rank()};
+    if (dimension < corank - 1) {
+      const semantics::ShapeSpec &shapeSpec{object->coshape()[dimension]};
+      if (const auto ucobound{shapeSpec.ubound().GetExplicit()}) {
+        if (!invariantOnly || IsScopeInvariantExpr(*ucobound)) {
+          return *ucobound;
+        }
+      }
+    }
+  }
+  return std::nullopt;
+}
+
+Shape GetLCOBOUNDs(const Symbol &symbol, bool invariantOnly) {
+  Shape result;
+  int corank{symbol.Corank()};
+  for (int dim{0}; dim < corank; ++dim) {
+    result.emplace_back(GetLCOBOUND(symbol, dim, invariantOnly));
+  }
+  return result;
+}
+
+Shape GetUCOBOUNDs(const Symbol &symbol, bool invariantOnly) {
+  Shape result;
+  int corank{symbol.Corank()};
+  for (int dim{0}; dim < corank; ++dim) {
+    result.emplace_back(GetUCOBOUND(symbol, dim, invariantOnly));
+  }
+  return result;
+}
+
 auto GetShapeHelper::operator()(const Symbol &symbol) const -> Result {
   return common::visit(
       common::visitors{
@@ -937,6 +989,10 @@ auto GetShapeHelper::operator()(const ProcedureRef &call) const -> Result {
       if (!call.arguments().empty()) {
         return (*this)(call.arguments()[0]);
       }
+    } else if (intrinsic->name == "lcobound" || intrinsic->name == "ucobound") {
+      if (call.arguments().size() == 3 && !call.arguments().at(1).has_value()) {
+        return Shape(1, ExtentExpr{GetCorank(call.arguments().at(0))});
+      }
     } else if (intrinsic->name == "matmul") {
       if (call.arguments().size() == 2) {
         if (auto ashape{(*this)(call.arguments()[0])}) {
@@ -1076,6 +1132,11 @@ auto GetShapeHelper::operator()(const ProcedureRef &call) const -> Result {
           }
         }
       }
+    } else if (intrinsic->name == "this_image") {
+      if (call.arguments().size() == 2) {
+        // THIS_IMAGE(coarray, no DIM, [TEAM])
+        return Shape(1, ExtentExpr{GetCorank(call.arguments().at(0))});
+      }
     } else if (intrinsic->name == "transpose") {
       if (call.arguments().size() >= 1) {
         if (auto shape{(*this)(call.arguments().at(0))}) {
diff --git a/flang/lib/Evaluate/tools.cpp b/flang/lib/Evaluate/tools.cpp
index 6299084..6bd623a 100644
--- a/flang/lib/Evaluate/tools.cpp
+++ b/flang/lib/Evaluate/tools.cpp
@@ -906,13 +906,9 @@ bool IsAssumedRank(const ActualArgument &arg) {
   }
 }
 
-bool IsCoarray(const ActualArgument &arg) {
+int GetCorank(const ActualArgument &arg) {
   const auto *expr{arg.UnwrapExpr()};
-  return expr && IsCoarray(*expr);
-}
-
-bool IsCoarray(const Symbol &symbol) {
-  return GetAssociationRoot(symbol).Corank() > 0;
+  return GetCorank(*expr);
 }
 
 bool IsProcedureDesignator(const Expr<SomeType> &expr) {
diff --git a/flang/lib/Evaluate/variable.cpp b/flang/lib/Evaluate/variable.cpp
index 707a206..841d0f7 100644
--- a/flang/lib/Evaluate/variable.cpp
+++ b/flang/lib/Evaluate/variable.cpp
@@ -465,6 +465,59 @@ template <typename T> int Designator<T>::Rank() const {
       u);
 }
 
+// Corank()
+int BaseObject::Corank() const {
+  return common::visit(common::visitors{
+                           [](SymbolRef symbol) { return symbol->Corank(); },
+                           [](const StaticDataObject::Pointer &) { return 0; },
+                       },
+      u);
+}
+
+int Component::Corank() const {
+  if (int corank{symbol_->Corank()}; corank > 0) {
+    return corank;
+  }
+  return base().Corank();
+}
+
+int NamedEntity::Corank() const {
+  return common::visit(common::visitors{
+                           [](const SymbolRef s) { return s->Corank(); },
+                           [](const Component &c) { return c.Corank(); },
+                       },
+      u_);
+}
+
+int ArrayRef::Corank() const { return base().Corank(); }
+
+int DataRef::Corank() const {
+  return common::visit(common::visitors{
+                           [](SymbolRef symbol) { return symbol->Corank(); },
+                           [](const auto &x) { return x.Corank(); },
+                       },
+      u);
+}
+
+int Substring::Corank() const {
+  return common::visit(
+      common::visitors{
+          [](const DataRef &dataRef) { return dataRef.Corank(); },
+          [](const StaticDataObject::Pointer &) { return 0; },
+      },
+      parent_);
+}
+
+int ComplexPart::Corank() const { return complex_.Corank(); }
+
+template <typename T> int Designator<T>::Corank() const {
+  return common::visit(common::visitors{
+                           [](SymbolRef symbol) { return symbol->Corank(); },
+                           [](const auto &x) { return x.Corank(); },
+                       },
+      u);
+}
+
 // GetBaseObject(), GetFirstSymbol(), GetLastSymbol(), &c.
 const Symbol &Component::GetFirstSymbol() const {
   return base_.value().GetFirstSymbol();
diff --git a/flang/lib/Frontend/CompilerInvocation.cpp b/flang/lib/Frontend/CompilerInvocation.cpp
index 79386c9..340efb1 100644
--- a/flang/lib/Frontend/CompilerInvocation.cpp
+++ b/flang/lib/Frontend/CompilerInvocation.cpp
@@ -766,6 +766,11 @@ static bool parseFrontendArgs(FrontendOptions &opts, llvm::opt::ArgList &args,
     opts.features.Enable(Fortran::common::LanguageFeature::DefaultSave);
   }
 
+  // -fsave-main-program
+  if (args.hasArg(clang::driver::options::OPT_fsave_main_program)) {
+    opts.features.Enable(Fortran::common::LanguageFeature::SaveMainProgram);
+  }
+
   if (args.hasArg(
           clang::driver::options::OPT_falternative_parameter_statement)) {
     opts.features.Enable(Fortran::common::LanguageFeature::OldStyleParameter);
diff --git a/flang/lib/Frontend/FrontendAction.cpp b/flang/lib/Frontend/FrontendAction.cpp
index 041182b..9a555bc 100644
--- a/flang/lib/Frontend/FrontendAction.cpp
+++ b/flang/lib/Frontend/FrontendAction.cpp
@@ -232,6 +232,19 @@ bool FrontendAction::reportFatalErrors(const char (&message)[N]) {
                                            instance->getAllCookedSources());
     return true;
   }
+  if (instance->getParsing().parseTree().has_value() &&
+      !instance->getParsing().consumedWholeFile()) {
+    // Parsing failed without error.
+    const unsigned diagID = instance->getDiagnostics().getCustomDiagID(
+        clang::DiagnosticsEngine::Error, message);
+    instance->getDiagnostics().Report(diagID) << getCurrentFileOrBufferName();
+    instance->getParsing().messages().Emit(llvm::errs(),
+                                           instance->getAllCookedSources());
+    instance->getParsing().EmitMessage(
+        llvm::errs(), instance->getParsing().finalRestingPlace(),
+        "parser FAIL (final position)", "error: ", llvm::raw_ostream::RED);
+    return true;
+  }
   return false;
 }
 
diff --git a/flang/lib/Frontend/FrontendActions.cpp b/flang/lib/Frontend/FrontendActions.cpp
index 77631f7..310cd65 100644
--- a/flang/lib/Frontend/FrontendActions.cpp
+++ b/flang/lib/Frontend/FrontendActions.cpp
@@ -149,7 +149,7 @@ bool PrescanAndSemaDebugAction::beginSourceFileAction() {
          (runSemanticChecks() || true) && (generateRtTypeTables() || true);
 }
 
-static void addDependentLibs(mlir::ModuleOp &mlirModule, CompilerInstance &ci) {
+static void addDependentLibs(mlir::ModuleOp mlirModule, CompilerInstance &ci) {
   const std::vector<std::string> &libs =
       ci.getInvocation().getCodeGenOpts().DependentLibs;
   if (libs.empty()) {
@@ -171,7 +171,7 @@ static void addDependentLibs(mlir::ModuleOp &mlirModule, CompilerInstance &ci) {
 // Add to MLIR code target specific items which are dependent on target
 // configuration specified by the user.
 // Clang equivalent function: AMDGPUTargetCodeGenInfo::emitTargetGlobals
-static void addAMDGPUSpecificMLIRItems(mlir::ModuleOp &mlirModule,
+static void addAMDGPUSpecificMLIRItems(mlir::ModuleOp mlirModule,
                                        CompilerInstance &ci) {
   const TargetOptions &targetOpts = ci.getInvocation().getTargetOpts();
   const llvm::Triple triple(targetOpts.triple);
@@ -269,7 +269,7 @@ bool CodeGenAction::beginSourceFileAction() {
       return false;
     }
 
-    mlirModule = std::make_unique<mlir::ModuleOp>(module.release());
+    mlirModule = std::move(module);
     const llvm::DataLayout &dl = targetMachine.createDataLayout();
     fir::support::setMLIRDataLayout(*mlirModule, dl);
     return true;
@@ -303,14 +303,11 @@ bool CodeGenAction::beginSourceFileAction() {
       ci.getInvocation().getFrontendOpts().features, targetMachine,
       ci.getInvocation().getTargetOpts(), ci.getInvocation().getCodeGenOpts());
 
-  // Fetch module from lb, so we can set
-  mlirModule = std::make_unique<mlir::ModuleOp>(lb.getModule());
-
   if (ci.getInvocation().getFrontendOpts().features.IsEnabled(
           Fortran::common::LanguageFeature::OpenMP)) {
-    setOffloadModuleInterfaceAttributes(*mlirModule,
+    setOffloadModuleInterfaceAttributes(lb.getModule(),
                                         ci.getInvocation().getLangOpts());
-    setOpenMPVersionAttribute(*mlirModule,
+    setOpenMPVersionAttribute(lb.getModule(),
                               ci.getInvocation().getLangOpts().OpenMPVersion);
   }
 
@@ -318,6 +315,9 @@ bool CodeGenAction::beginSourceFileAction() {
   Fortran::parser::Program &parseTree{*ci.getParsing().parseTree()};
   lb.lower(parseTree, ci.getSemanticsContext());
 
+  // Fetch module from lb, so we can set
+  mlirModule = lb.getModuleAndRelease();
+
   // Add target specific items like dependent libraries, target specific
   // constants etc.
   addDependentLibs(*mlirModule, ci);
@@ -566,9 +566,11 @@ void DebugMeasureParseTreeAction::executeAction() {
   // Parse. In case of failure, report and return.
   ci.getParsing().Parse(llvm::outs());
 
-  if (!ci.getParsing().messages().empty() &&
-      (ci.getInvocation().getWarnAsErr() ||
-       ci.getParsing().messages().AnyFatalError())) {
+  if ((ci.getParsing().parseTree().has_value() &&
+       !ci.getParsing().consumedWholeFile()) ||
+      (!ci.getParsing().messages().empty() &&
+       (ci.getInvocation().getWarnAsErr() ||
+        ci.getParsing().messages().AnyFatalError()))) {
     unsigned diagID = ci.getDiagnostics().getCustomDiagID(
         clang::DiagnosticsEngine::Error, "Could not parse %0");
     ci.getDiagnostics().Report(diagID) << getCurrentFileOrBufferName();
@@ -961,6 +963,9 @@ static void generateMachineCodeOrAssemblyImpl(clang::DiagnosticsEngine &diags,
 
   // Run the passes
   codeGenPasses.run(llvmModule);
+
+  // Cleanup
+  delete tlii;
 }
 
 void CodeGenAction::runOptimizationPipeline(llvm::raw_pwrite_stream &os) {
@@ -1043,6 +1048,9 @@ void CodeGenAction::runOptimizationPipeline(llvm::raw_pwrite_stream &os) {
 
   // Run the passes.
   mpm.run(*llvmModule, mam);
+
+  // Cleanup
+  delete tlii;
 }
 
 // This class handles optimization remark messages requested if
diff --git a/flang/lib/Lower/Allocatable.cpp b/flang/lib/Lower/Allocatable.cpp
index f143656..dc13554 100644
--- a/flang/lib/Lower/Allocatable.cpp
+++ b/flang/lib/Lower/Allocatable.cpp
@@ -22,12 +22,14 @@
 #include "flang/Lower/PFTBuilder.h"
 #include "flang/Lower/Runtime.h"
 #include "flang/Lower/StatementContext.h"
+#include "flang/Optimizer/Builder/CUFCommon.h"
 #include "flang/Optimizer/Builder/FIRBuilder.h"
 #include "flang/Optimizer/Builder/Runtime/RTBuilder.h"
 #include "flang/Optimizer/Builder/Todo.h"
 #include "flang/Optimizer/Dialect/CUF/CUFOps.h"
 #include "flang/Optimizer/Dialect/FIROps.h"
 #include "flang/Optimizer/Dialect/FIROpsSupport.h"
+#include "flang/Optimizer/HLFIR/HLFIROps.h"
 #include "flang/Optimizer/Support/FatalError.h"
 #include "flang/Optimizer/Support/InternalNames.h"
 #include "flang/Parser/parse-tree.h"
@@ -184,14 +186,9 @@ static mlir::Value genRuntimeAllocate(fir::FirOpBuilder &builder,
           ? fir::runtime::getRuntimeFunc<mkRTKey(PointerAllocate)>(loc, builder)
           : fir::runtime::getRuntimeFunc<mkRTKey(AllocatableAllocate)>(loc,
                                                                        builder);
-  llvm::SmallVector<mlir::Value> args{box.getAddr()};
-  if (!box.isPointer())
-    args.push_back(
-        builder.createIntegerConstant(loc, builder.getI64Type(), -1));
-  args.push_back(errorManager.hasStat);
-  args.push_back(errorManager.errMsgAddr);
-  args.push_back(errorManager.sourceFile);
-  args.push_back(errorManager.sourceLine);
+  llvm::SmallVector<mlir::Value> args{
+      box.getAddr(), errorManager.hasStat, errorManager.errMsgAddr,
+      errorManager.sourceFile, errorManager.sourceLine};
   llvm::SmallVector<mlir::Value> operands;
   for (auto [fst, snd] : llvm::zip(args, callee.getFunctionType().getInputs()))
     operands.emplace_back(builder.createConvert(loc, snd, fst));
@@ -457,6 +454,19 @@ private:
                                                    alloc.getSymbol());
   }
 
+  void setPinnedToFalse() {
+    if (!pinnedExpr)
+      return;
+    Fortran::lower::StatementContext stmtCtx;
+    mlir::Value pinned =
+        fir::getBase(converter.genExprAddr(loc, *pinnedExpr, stmtCtx));
+    mlir::Location loc = pinned.getLoc();
+    mlir::Value falseValue = builder.createBool(loc, false);
+    mlir::Value falseConv = builder.createConvert(
+        loc, fir::unwrapRefType(pinned.getType()), falseValue);
+    builder.create<fir::StoreOp>(loc, falseConv, pinned);
+  }
+
   void genSimpleAllocation(const Allocation &alloc,
                            const fir::MutableBoxValue &box) {
     bool isCudaSymbol = Fortran::semantics::HasCUDAAttr(alloc.getSymbol());
@@ -472,6 +482,7 @@ private:
       // can be validated.
       genInlinedAllocation(alloc, box);
       postAllocationAction(alloc);
+      setPinnedToFalse();
       return;
     }
 
@@ -485,11 +496,13 @@ private:
     genSetDeferredLengthParameters(alloc, box);
     genAllocateObjectBounds(alloc, box);
     mlir::Value stat;
-    if (!isCudaSymbol)
+    if (!isCudaSymbol) {
       stat = genRuntimeAllocate(builder, loc, box, errorManager);
-    else
+      setPinnedToFalse();
+    } else {
       stat =
           genCudaAllocate(builder, loc, box, errorManager, alloc.getSymbol());
+    }
     fir::factory::syncMutableBoxFromIRBox(builder, loc, box);
     postAllocationAction(alloc);
     errorManager.assignStat(builder, loc, stat);
@@ -619,13 +632,16 @@ private:
       genSetDeferredLengthParameters(alloc, box);
     genAllocateObjectBounds(alloc, box);
     mlir::Value stat;
-    if (Fortran::semantics::HasCUDAAttr(alloc.getSymbol()))
+    if (Fortran::semantics::HasCUDAAttr(alloc.getSymbol())) {
       stat =
           genCudaAllocate(builder, loc, box, errorManager, alloc.getSymbol());
-    else if (isSource)
-      stat = genRuntimeAllocateSource(builder, loc, box, exv, errorManager);
-    else
-      stat = genRuntimeAllocate(builder, loc, box, errorManager);
+    } else {
+      if (isSource)
+        stat = genRuntimeAllocateSource(builder, loc, box, exv, errorManager);
+      else
+        stat = genRuntimeAllocate(builder, loc, box, errorManager);
+      setPinnedToFalse();
+    }
     fir::factory::syncMutableBoxFromIRBox(builder, loc, box);
     postAllocationAction(alloc);
     errorManager.assignStat(builder, loc, stat);
@@ -1098,11 +1114,13 @@ void Fortran::lower::associateMutableBox(
   fir::FirOpBuilder &builder = converter.getFirOpBuilder();
   if (Fortran::evaluate::UnwrapExpr<Fortran::evaluate::NullPointer>(source)) {
     fir::factory::disassociateMutableBox(builder, loc, box);
+    cuf::genPointerSync(box.getAddr(), builder);
     return;
   }
   if (converter.getLoweringOptions().getLowerToHighLevelFIR()) {
     fir::ExtendedValue rhs = converter.genExprAddr(loc, source, stmtCtx);
     fir::factory::associateMutableBox(builder, loc, box, rhs, lbounds);
+    cuf::genPointerSync(box.getAddr(), builder);
     return;
   }
   // The right hand side is not be evaluated into a temp. Array sections can
diff --git a/flang/lib/Lower/Bridge.cpp b/flang/lib/Lower/Bridge.cpp
index 17b794d1..37f51d7 100644
--- a/flang/lib/Lower/Bridge.cpp
+++ b/flang/lib/Lower/Bridge.cpp
@@ -34,6 +34,7 @@
 #include "flang/Lower/StatementContext.h"
 #include "flang/Lower/Support/Utils.h"
 #include "flang/Optimizer/Builder/BoxValue.h"
+#include "flang/Optimizer/Builder/CUFCommon.h"
 #include "flang/Optimizer/Builder/Character.h"
 #include "flang/Optimizer/Builder/FIRBuilder.h"
 #include "flang/Optimizer/Builder/Runtime/Assign.h"
@@ -832,7 +833,11 @@ public:
           if_builder.end();
         },
         [&](const auto &) -> void {
-          if (skipDefaultInit)
+          // Always initialize allocatable component descriptor, even when the
+          // value is later copied from the host (e.g. firstprivate) because the
+          // assignment from the host to the copy will fail if the component
+          // descriptors are not initialized.
+          if (skipDefaultInit && !hlfir::mayHaveAllocatableComponent(hSymType))
             return;
           // Initialize local/private derived types with default
           // initialization (Fortran 2023 section 11.1.7.5 and OpenMP 5.2
@@ -1028,7 +1033,7 @@ public:
 
   fir::FirOpBuilder &getFirOpBuilder() override final { return *builder; }
 
-  mlir::ModuleOp &getModuleOp() override final { return bridge.getModule(); }
+  mlir::ModuleOp getModuleOp() override final { return bridge.getModule(); }
 
   mlir::MLIRContext &getMLIRContext() override final {
     return bridge.getMLIRContext();
@@ -3952,6 +3957,7 @@ private:
       } else {
         fir::MutableBoxValue box = genExprMutableBox(loc, *expr);
         fir::factory::disassociateMutableBox(*builder, loc, box);
+        cuf::genPointerSync(box.getAddr(), *builder);
       }
     }
   }
@@ -6137,10 +6143,7 @@ void Fortran::lower::LoweringBridge::lower(
 }
 
 void Fortran::lower::LoweringBridge::parseSourceFile(llvm::SourceMgr &srcMgr) {
-  mlir::OwningOpRef<mlir::ModuleOp> owningRef =
-      mlir::parseSourceFile<mlir::ModuleOp>(srcMgr, &context);
-  module.reset(new mlir::ModuleOp(owningRef.get().getOperation()));
-  owningRef.release();
+  module = mlir::parseSourceFile<mlir::ModuleOp>(srcMgr, &context);
 }
 
 Fortran::lower::LoweringBridge::LoweringBridge(
@@ -6207,19 +6210,18 @@ Fortran::lower::LoweringBridge::LoweringBridge(
   };
 
   // Create the module and attach the attributes.
-  module = std::make_unique<mlir::ModuleOp>(
+  module = mlir::OwningOpRef<mlir::ModuleOp>(
       mlir::ModuleOp::create(getPathLocation()));
-  assert(module.get() && "module was not created");
-  fir::setTargetTriple(*module.get(), triple);
-  fir::setKindMapping(*module.get(), kindMap);
-  fir::setTargetCPU(*module.get(), targetMachine.getTargetCPU());
-  fir::setTuneCPU(*module.get(), targetOpts.cpuToTuneFor);
-  fir::setTargetFeatures(*module.get(), targetMachine.getTargetFeatureString());
-  fir::support::setMLIRDataLayout(*module.get(),
-                                  targetMachine.createDataLayout());
-  fir::setIdent(*module.get(), Fortran::common::getFlangFullVersion());
+  assert(*module && "module was not created");
+  fir::setTargetTriple(*module, triple);
+  fir::setKindMapping(*module, kindMap);
+  fir::setTargetCPU(*module, targetMachine.getTargetCPU());
+  fir::setTuneCPU(*module, targetOpts.cpuToTuneFor);
+  fir::setTargetFeatures(*module, targetMachine.getTargetFeatureString());
+  fir::support::setMLIRDataLayout(*module, targetMachine.createDataLayout());
+  fir::setIdent(*module, Fortran::common::getFlangFullVersion());
   if (cgOpts.RecordCommandLine)
-    fir::setCommandline(*module.get(), *cgOpts.RecordCommandLine);
+    fir::setCommandline(*module, *cgOpts.RecordCommandLine);
 }
 
 void Fortran::lower::genCleanUpInRegionIfAny(
diff --git a/flang/lib/Lower/CMakeLists.txt b/flang/lib/Lower/CMakeLists.txt
index ba6622d..f57f0e7 100644
--- a/flang/lib/Lower/CMakeLists.txt
+++ b/flang/lib/Lower/CMakeLists.txt
@@ -29,6 +29,7 @@ add_flang_library(FortranLower
   OpenMP/DataSharingProcessor.cpp
   OpenMP/Decomposer.cpp
   OpenMP/OpenMP.cpp
+  OpenMP/PrivateReductionUtils.cpp
   OpenMP/ReductionProcessor.cpp
   OpenMP/Utils.cpp
   PFTBuilder.cpp
diff --git a/flang/lib/Lower/OpenACC.cpp b/flang/lib/Lower/OpenACC.cpp
index ed18ad8..8155c363 100644
--- a/flang/lib/Lower/OpenACC.cpp
+++ b/flang/lib/Lower/OpenACC.cpp
@@ -2670,11 +2670,13 @@ static void genACCDataOp(Fortran::lower::AbstractConverter &converter,
           asyncOnlyDeviceTypes);
       attachEntryOperands.append(dataClauseOperands.begin() + crtDataStart,
                                  dataClauseOperands.end());
-    } else if(const auto *defaultClause = 
-                  std::get_if<Fortran::parser::AccClause::Default>(&clause.u)) {
+    } else if (const auto *defaultClause =
+                   std::get_if<Fortran::parser::AccClause::Default>(
+                       &clause.u)) {
       if ((defaultClause->v).v == llvm::acc::DefaultValue::ACC_Default_none)
         hasDefaultNone = true;
-      else if ((defaultClause->v).v == llvm::acc::DefaultValue::ACC_Default_present)
+      else if ((defaultClause->v).v ==
+               llvm::acc::DefaultValue::ACC_Default_present)
         hasDefaultPresent = true;
     }
   }
@@ -3830,7 +3832,7 @@ genDeclareInFunction(Fortran::lower::AbstractConverter &converter,
 
 static void
 genDeclareInModule(Fortran::lower::AbstractConverter &converter,
-                   mlir::ModuleOp &moduleOp,
+                   mlir::ModuleOp moduleOp,
                    const Fortran::parser::AccClauseList &accClauseList) {
   mlir::OpBuilder modBuilder(moduleOp.getBodyRegion());
   for (const Fortran::parser::AccClause &clause : accClauseList.v) {
@@ -3981,7 +3983,7 @@ static void attachRoutineInfo(mlir::func::FuncOp func,
 
 void Fortran::lower::genOpenACCRoutineConstruct(
     Fortran::lower::AbstractConverter &converter,
-    Fortran::semantics::SemanticsContext &semanticsContext, mlir::ModuleOp &mod,
+    Fortran::semantics::SemanticsContext &semanticsContext, mlir::ModuleOp mod,
     const Fortran::parser::OpenACCRoutineConstruct &routineConstruct,
     Fortran::lower::AccRoutineInfoMappingList &accRoutineInfos) {
   fir::FirOpBuilder &builder = converter.getFirOpBuilder();
@@ -4139,7 +4141,7 @@ void Fortran::lower::genOpenACCRoutineConstruct(
 }
 
 void Fortran::lower::finalizeOpenACCRoutineAttachment(
-    mlir::ModuleOp &mod,
+    mlir::ModuleOp mod,
     Fortran::lower::AccRoutineInfoMappingList &accRoutineInfos) {
   for (auto &mapping : accRoutineInfos) {
     mlir::func::FuncOp funcOp =
diff --git a/flang/lib/Lower/OpenMP/ClauseProcessor.cpp b/flang/lib/Lower/OpenMP/ClauseProcessor.cpp
index 3c98311..fb8e007 100644
--- a/flang/lib/Lower/OpenMP/ClauseProcessor.cpp
+++ b/flang/lib/Lower/OpenMP/ClauseProcessor.cpp
@@ -153,10 +153,13 @@ genDependKindAttr(lower::AbstractConverter &converter,
     pbKind = mlir::omp::ClauseTaskDepend::taskdependinout;
     break;
   case omp::clause::DependenceType::Mutexinoutset:
+    pbKind = mlir::omp::ClauseTaskDepend::taskdependmutexinoutset;
+    break;
   case omp::clause::DependenceType::Inoutset:
+    pbKind = mlir::omp::ClauseTaskDepend::taskdependinoutset;
+    break;
   case omp::clause::DependenceType::Depobj:
-    TODO(currentLocation,
-         "INOUTSET, MUTEXINOUTSET and DEPOBJ dependence-types");
+    TODO(currentLocation, "DEPOBJ dependence-type");
     break;
   case omp::clause::DependenceType::Sink:
   case omp::clause::DependenceType::Source:
@@ -610,6 +613,8 @@ addAlignedClause(lower::AbstractConverter &converter,
   // Do not generate alignment assumption if alignment is less than or equal to
   // 0.
   if (alignment > 0) {
+    // alignment value must be power of 2
+    assert((alignment & (alignment - 1)) == 0 && "alignment is not power of 2");
     auto &objects = std::get<omp::ObjectList>(clause.t);
     if (!objects.empty())
       genObjectList(objects, converter, alignedVars);
diff --git a/flang/lib/Lower/OpenMP/DataSharingProcessor.cpp b/flang/lib/Lower/OpenMP/DataSharingProcessor.cpp
index cd31253..9dfdbd8 100644
--- a/flang/lib/Lower/OpenMP/DataSharingProcessor.cpp
+++ b/flang/lib/Lower/OpenMP/DataSharingProcessor.cpp
@@ -126,7 +126,8 @@ void DataSharingProcessor::cloneSymbol(const semantics::Symbol *sym) {
     assert(sb);
     mlir::Value addr = sb.getAddr();
     assert(addr);
-    return hlfir::mayHaveAllocatableComponent(addr.getType());
+    return !fir::isPointerType(addr.getType()) &&
+           hlfir::mayHaveAllocatableComponent(addr.getType());
   };
 
   if (needInitClone()) {
diff --git a/flang/lib/Lower/OpenMP/OpenMP.cpp b/flang/lib/Lower/OpenMP/OpenMP.cpp
index b07e89d..cd4b25a 100644
--- a/flang/lib/Lower/OpenMP/OpenMP.cpp
+++ b/flang/lib/Lower/OpenMP/OpenMP.cpp
@@ -923,13 +923,24 @@ static void genBodyOfTargetOp(
   while (!valuesDefinedAbove.empty()) {
     for (mlir::Value val : valuesDefinedAbove) {
       mlir::Operation *valOp = val.getDefiningOp();
-      if (mlir::isMemoryEffectFree(valOp)) {
+      assert(valOp != nullptr);
+
+      // NOTE: We skip BoxDimsOp's as the lesser of two evils is to map the
+      // indices separately, as the alternative is to eventually map the Box,
+      // which comes with a fairly large overhead comparatively. We could be
+      // more robust about this and check using a BackwardsSlice to see if we
+      // run the risk of mapping a box.
+      if (mlir::isMemoryEffectFree(valOp) &&
+          !mlir::isa<fir::BoxDimsOp>(valOp)) {
         mlir::Operation *clonedOp = valOp->clone();
         entryBlock->push_front(clonedOp);
-        val.replaceUsesWithIf(clonedOp->getResult(0),
-                              [entryBlock](mlir::OpOperand &use) {
-                                return use.getOwner()->getBlock() == entryBlock;
-                              });
+
+        auto replace = [entryBlock](mlir::OpOperand &use) {
+          return use.getOwner()->getBlock() == entryBlock;
+        };
+
+        valOp->getResults().replaceUsesWithIf(clonedOp->getResults(), replace);
+        valOp->replaceUsesWithIf(clonedOp, replace);
       } else {
         auto savedIP = firOpBuilder.getInsertionPoint();
         firOpBuilder.setInsertionPointAfter(valOp);
@@ -937,9 +948,36 @@ static void genBodyOfTargetOp(
             firOpBuilder.createTemporary(val.getLoc(), val.getType());
         firOpBuilder.createStoreWithConvert(copyVal.getLoc(), val, copyVal);
 
-        llvm::SmallVector<mlir::Value> bounds;
+        lower::AddrAndBoundsInfo info = lower::getDataOperandBaseAddr(
+            firOpBuilder, val, /*isOptional=*/false, val.getLoc());
+        llvm::SmallVector<mlir::Value> bounds =
+            Fortran::lower::genImplicitBoundsOps<mlir::omp::MapBoundsOp,
+                                                 mlir::omp::MapBoundsType>(
+                firOpBuilder, info,
+                hlfir::translateToExtendedValue(val.getLoc(), firOpBuilder,
+                                                hlfir::Entity{val})
+                    .first,
+                /*dataExvIsAssumedSize=*/false, val.getLoc());
+
         std::stringstream name;
         firOpBuilder.setInsertionPoint(targetOp);
+
+        llvm::omp::OpenMPOffloadMappingFlags mapFlag =
+            llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_IMPLICIT;
+        mlir::omp::VariableCaptureKind captureKind =
+            mlir::omp::VariableCaptureKind::ByRef;
+
+        mlir::Type eleType = copyVal.getType();
+        if (auto refType =
+                mlir::dyn_cast<fir::ReferenceType>(copyVal.getType()))
+          eleType = refType.getElementType();
+
+        if (fir::isa_trivial(eleType) || fir::isa_char(eleType)) {
+          captureKind = mlir::omp::VariableCaptureKind::ByCopy;
+        } else if (!fir::isa_builtin_cptr_type(eleType)) {
+          mapFlag |= llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_TO;
+        }
+
         mlir::Value mapOp = createMapInfoOp(
             firOpBuilder, copyVal.getLoc(), copyVal,
             /*varPtrPtr=*/mlir::Value{}, name.str(), bounds,
@@ -947,8 +985,8 @@ static void genBodyOfTargetOp(
             /*membersIndex=*/mlir::ArrayAttr{},
             static_cast<
                 std::underlying_type_t<llvm::omp::OpenMPOffloadMappingFlags>>(
-                llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_IMPLICIT),
-            mlir::omp::VariableCaptureKind::ByCopy, copyVal.getType());
+                mapFlag),
+            captureKind, copyVal.getType());
 
         // Get the index of the first non-map argument before modifying mapVars,
         // then append an element to mapVars and an associated entry block
@@ -2586,6 +2624,10 @@ static void genOMPDispatch(lower::AbstractConverter &converter,
 //===----------------------------------------------------------------------===//
 // OpenMPDeclarativeConstruct visitors
 //===----------------------------------------------------------------------===//
+static void genOMP(lower::AbstractConverter &converter, lower::SymMap &symTable,
+                   semantics::SemanticsContext &semaCtx,
+                   lower::pft::Evaluation &eval,
+                   const parser::OpenMPUtilityConstruct &);
 
 static void
 genOMP(lower::AbstractConverter &converter, lower::SymMap &symTable,
@@ -2907,8 +2949,8 @@ static void genOMP(lower::AbstractConverter &converter, lower::SymMap &symTable,
 static void genOMP(lower::AbstractConverter &converter, lower::SymMap &symTable,
                    semantics::SemanticsContext &semaCtx,
                    lower::pft::Evaluation &eval,
-                   const parser::OpenMPErrorConstruct &) {
-  TODO(converter.getCurrentLocation(), "OpenMPErrorConstruct");
+                   const parser::OpenMPUtilityConstruct &) {
+  TODO(converter.getCurrentLocation(), "OpenMPUtilityConstruct");
 }
 
 static void genOMP(lower::AbstractConverter &converter, lower::SymMap &symTable,
diff --git a/flang/lib/Lower/OpenMP/PrivateReductionUtils.cpp b/flang/lib/Lower/OpenMP/PrivateReductionUtils.cpp
new file mode 100644
index 0000000..83f0d4e
--- /dev/null
+++ b/flang/lib/Lower/OpenMP/PrivateReductionUtils.cpp
@@ -0,0 +1,236 @@
+//===-- PrivateReductionUtils.cpp -------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Coding style: https://mlir.llvm.org/getting_started/DeveloperGuide/
+//
+//===----------------------------------------------------------------------===//
+
+#include "PrivateReductionUtils.h"
+
+#include "flang/Optimizer/Builder/FIRBuilder.h"
+#include "flang/Optimizer/Builder/HLFIRTools.h"
+#include "flang/Optimizer/Builder/Todo.h"
+#include "flang/Optimizer/HLFIR/HLFIROps.h"
+#include "flang/Optimizer/Support/FatalError.h"
+#include "mlir/Dialect/OpenMP/OpenMPDialect.h"
+#include "mlir/IR/Location.h"
+
+static void createCleanupRegion(fir::FirOpBuilder &builder, mlir::Location loc,
+                                mlir::Type argType,
+                                mlir::Region &cleanupRegion) {
+  assert(cleanupRegion.empty());
+  mlir::Block *block = builder.createBlock(&cleanupRegion, cleanupRegion.end(),
+                                           {argType}, {loc});
+  builder.setInsertionPointToEnd(block);
+
+  auto typeError = [loc]() {
+    fir::emitFatalError(loc,
+                        "Attempt to create an omp cleanup region "
+                        "for a type that wasn't allocated",
+                        /*genCrashDiag=*/true);
+  };
+
+  mlir::Type valTy = fir::unwrapRefType(argType);
+  if (auto boxTy = mlir::dyn_cast_or_null<fir::BaseBoxType>(valTy)) {
+    if (!mlir::isa<fir::HeapType, fir::PointerType>(boxTy.getEleTy())) {
+      mlir::Type innerTy = fir::extractSequenceType(boxTy);
+      if (!mlir::isa<fir::SequenceType>(innerTy))
+        typeError();
+    }
+
+    mlir::Value arg = builder.loadIfRef(loc, block->getArgument(0));
+    assert(mlir::isa<fir::BaseBoxType>(arg.getType()));
+
+    // Deallocate box
+    // The FIR type system doesn't nesecarrily know that this is a mutable box
+    // if we allocated the thread local array on the heap to avoid looped stack
+    // allocations.
+    mlir::Value addr =
+        hlfir::genVariableRawAddress(loc, builder, hlfir::Entity{arg});
+    mlir::Value isAllocated = builder.genIsNotNullAddr(loc, addr);
+    fir::IfOp ifOp =
+        builder.create<fir::IfOp>(loc, isAllocated, /*withElseRegion=*/false);
+    builder.setInsertionPointToStart(&ifOp.getThenRegion().front());
+
+    mlir::Value cast = builder.createConvert(
+        loc, fir::HeapType::get(fir::dyn_cast_ptrEleTy(addr.getType())), addr);
+    builder.create<fir::FreeMemOp>(loc, cast);
+
+    builder.setInsertionPointAfter(ifOp);
+    builder.create<mlir::omp::YieldOp>(loc);
+    return;
+  }
+
+  typeError();
+}
+
+fir::ShapeShiftOp Fortran::lower::omp::getShapeShift(fir::FirOpBuilder &builder,
+                                                     mlir::Location loc,
+                                                     mlir::Value box) {
+  fir::SequenceType sequenceType = mlir::cast<fir::SequenceType>(
+      hlfir::getFortranElementOrSequenceType(box.getType()));
+  const unsigned rank = sequenceType.getDimension();
+  llvm::SmallVector<mlir::Value> lbAndExtents;
+  lbAndExtents.reserve(rank * 2);
+
+  mlir::Type idxTy = builder.getIndexType();
+  for (unsigned i = 0; i < rank; ++i) {
+    // TODO: ideally we want to hoist box reads out of the critical section.
+    // We could do this by having box dimensions in block arguments like
+    // OpenACC does
+    mlir::Value dim = builder.createIntegerConstant(loc, idxTy, i);
+    auto dimInfo =
+        builder.create<fir::BoxDimsOp>(loc, idxTy, idxTy, idxTy, box, dim);
+    lbAndExtents.push_back(dimInfo.getLowerBound());
+    lbAndExtents.push_back(dimInfo.getExtent());
+  }
+
+  auto shapeShiftTy = fir::ShapeShiftType::get(builder.getContext(), rank);
+  auto shapeShift =
+      builder.create<fir::ShapeShiftOp>(loc, shapeShiftTy, lbAndExtents);
+  return shapeShift;
+}
+
+void Fortran::lower::omp::populateByRefInitAndCleanupRegions(
+    fir::FirOpBuilder &builder, mlir::Location loc, mlir::Type argType,
+    mlir::Value scalarInitValue, mlir::Block *initBlock,
+    mlir::Value allocatedPrivVarArg, mlir::Value moldArg,
+    mlir::Region &cleanupRegion) {
+  mlir::Type ty = fir::unwrapRefType(argType);
+  builder.setInsertionPointToEnd(initBlock);
+  auto yield = [&](mlir::Value ret) {
+    builder.create<mlir::omp::YieldOp>(loc, ret);
+  };
+
+  if (fir::isa_trivial(ty)) {
+    builder.setInsertionPointToEnd(initBlock);
+
+    if (scalarInitValue)
+      builder.createStoreWithConvert(loc, scalarInitValue, allocatedPrivVarArg);
+    yield(allocatedPrivVarArg);
+    return;
+  }
+
+  // check if an allocatable box is unallocated. If so, initialize the boxAlloca
+  // to be unallocated e.g.
+  // %box_alloca = fir.alloca !fir.box<!fir.heap<...>>
+  // %addr = fir.box_addr %box
+  // if (%addr == 0) {
+  //   %nullbox = fir.embox %addr
+  //   fir.store %nullbox to %box_alloca
+  // } else {
+  //   // ...
+  //   fir.store %something to %box_alloca
+  // }
+  // omp.yield %box_alloca
+  moldArg = builder.loadIfRef(loc, moldArg);
+  auto handleNullAllocatable = [&](mlir::Value boxAlloca) -> fir::IfOp {
+    mlir::Value addr = builder.create<fir::BoxAddrOp>(loc, moldArg);
+    mlir::Value isNotAllocated = builder.genIsNullAddr(loc, addr);
+    fir::IfOp ifOp = builder.create<fir::IfOp>(loc, isNotAllocated,
+                                               /*withElseRegion=*/true);
+    builder.setInsertionPointToStart(&ifOp.getThenRegion().front());
+    // just embox the null address and return
+    mlir::Value nullBox = builder.create<fir::EmboxOp>(loc, ty, addr);
+    builder.create<fir::StoreOp>(loc, nullBox, boxAlloca);
+    return ifOp;
+  };
+
+  // all arrays are boxed
+  if (auto boxTy = mlir::dyn_cast_or_null<fir::BaseBoxType>(ty)) {
+    bool isAllocatableOrPointer =
+        mlir::isa<fir::HeapType, fir::PointerType>(boxTy.getEleTy());
+
+    builder.setInsertionPointToEnd(initBlock);
+    mlir::Value boxAlloca = allocatedPrivVarArg;
+    mlir::Type innerTy = fir::unwrapRefType(boxTy.getEleTy());
+    if (fir::isa_trivial(innerTy)) {
+      // boxed non-sequence value e.g. !fir.box<!fir.heap<i32>>
+      if (!isAllocatableOrPointer)
+        TODO(loc,
+             "Reduction/Privatization of non-allocatable trivial typed box");
+
+      fir::IfOp ifUnallocated = handleNullAllocatable(boxAlloca);
+
+      builder.setInsertionPointToStart(&ifUnallocated.getElseRegion().front());
+      mlir::Value valAlloc = builder.create<fir::AllocMemOp>(loc, innerTy);
+      if (scalarInitValue)
+        builder.createStoreWithConvert(loc, scalarInitValue, valAlloc);
+      mlir::Value box = builder.create<fir::EmboxOp>(loc, ty, valAlloc);
+      builder.create<fir::StoreOp>(loc, box, boxAlloca);
+
+      createCleanupRegion(builder, loc, argType, cleanupRegion);
+      builder.setInsertionPointAfter(ifUnallocated);
+      yield(boxAlloca);
+      return;
+    }
+    innerTy = fir::extractSequenceType(boxTy);
+    if (!mlir::isa<fir::SequenceType>(innerTy))
+      TODO(loc, "Unsupported boxed type for reduction/privatization");
+
+    fir::IfOp ifUnallocated{nullptr};
+    if (isAllocatableOrPointer) {
+      ifUnallocated = handleNullAllocatable(boxAlloca);
+      builder.setInsertionPointToStart(&ifUnallocated.getElseRegion().front());
+    }
+
+    // Create the private copy from the initial fir.box:
+    mlir::Value loadedBox = builder.loadIfRef(loc, moldArg);
+    hlfir::Entity source = hlfir::Entity{loadedBox};
+
+    // Allocating on the heap in case the whole reduction is nested inside of a
+    // loop
+    // TODO: compare performance here to using allocas - this could be made to
+    // work by inserting stacksave/stackrestore around the reduction in
+    // openmpirbuilder
+    auto [temp, needsDealloc] = createTempFromMold(loc, builder, source);
+    // if needsDealloc isn't statically false, add cleanup region. Always
+    // do this for allocatable boxes because they might have been re-allocated
+    // in the body of the loop/parallel region
+
+    std::optional<int64_t> cstNeedsDealloc =
+        fir::getIntIfConstant(needsDealloc);
+    assert(cstNeedsDealloc.has_value() &&
+           "createTempFromMold decides this statically");
+    if (cstNeedsDealloc.has_value() && *cstNeedsDealloc != false) {
+      mlir::OpBuilder::InsertionGuard guard(builder);
+      createCleanupRegion(builder, loc, argType, cleanupRegion);
+    } else {
+      assert(!isAllocatableOrPointer &&
+             "Pointer-like arrays must be heap allocated");
+    }
+
+    // Put the temporary inside of a box:
+    // hlfir::genVariableBox doesn't handle non-default lower bounds
+    mlir::Value box;
+    fir::ShapeShiftOp shapeShift = getShapeShift(builder, loc, loadedBox);
+    mlir::Type boxType = loadedBox.getType();
+    if (mlir::isa<fir::BaseBoxType>(temp.getType()))
+      // the box created by the declare form createTempFromMold is missing lower
+      // bounds info
+      box = builder.create<fir::ReboxOp>(loc, boxType, temp, shapeShift,
+                                         /*shift=*/mlir::Value{});
+    else
+      box = builder.create<fir::EmboxOp>(
+          loc, boxType, temp, shapeShift,
+          /*slice=*/mlir::Value{},
+          /*typeParams=*/llvm::ArrayRef<mlir::Value>{});
+
+    if (scalarInitValue)
+      builder.create<hlfir::AssignOp>(loc, scalarInitValue, box);
+    builder.create<fir::StoreOp>(loc, box, boxAlloca);
+    if (ifUnallocated)
+      builder.setInsertionPointAfter(ifUnallocated);
+    yield(boxAlloca);
+    return;
+  }
+
+  TODO(loc,
+       "creating reduction/privatization init region for unsupported type");
+  return;
+}
diff --git a/flang/lib/Lower/OpenMP/PrivateReductionUtils.h b/flang/lib/Lower/OpenMP/PrivateReductionUtils.h
new file mode 100644
index 0000000..b4abc40
--- /dev/null
+++ b/flang/lib/Lower/OpenMP/PrivateReductionUtils.h
@@ -0,0 +1,51 @@
+//===-- Lower/OpenMP/PrivateReductionUtils.h --------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Coding style: https://mlir.llvm.org/getting_started/DeveloperGuide/
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef FORTRAN_LOWER_OPENMP_PRIVATEREDUCTIONUTILS_H
+#define FORTRAN_LOWER_OPENMP_PRIVATEREDUCTIONUTILS_H
+
+#include "mlir/IR/Location.h"
+#include "mlir/IR/Value.h"
+
+namespace mlir {
+class Region;
+} // namespace mlir
+
+namespace fir {
+class FirOpBuilder;
+class ShapeShiftOp;
+} // namespace fir
+
+namespace Fortran {
+namespace lower {
+namespace omp {
+
+/// Generate init and cleanup regions suitable for reduction or privatizer
+/// declarations. `scalarInitValue` may be nullptr if there is no default
+/// initialization (for privatization).
+void populateByRefInitAndCleanupRegions(fir::FirOpBuilder &builder,
+                                        mlir::Location loc, mlir::Type argType,
+                                        mlir::Value scalarInitValue,
+                                        mlir::Block *initBlock,
+                                        mlir::Value allocatedPrivVarArg,
+                                        mlir::Value moldArg,
+                                        mlir::Region &cleanupRegion);
+
+/// Generate a fir::ShapeShift op describing the provided boxed array.
+fir::ShapeShiftOp getShapeShift(fir::FirOpBuilder &builder, mlir::Location loc,
+                                mlir::Value box);
+
+} // namespace omp
+} // namespace lower
+} // namespace Fortran
+
+#endif // FORTRAN_LOWER_OPENMP_PRIVATEREDUCTIONUTILS_H
diff --git a/flang/lib/Lower/OpenMP/ReductionProcessor.cpp b/flang/lib/Lower/OpenMP/ReductionProcessor.cpp
index 736de2ee..2cd2110 100644
--- a/flang/lib/Lower/OpenMP/ReductionProcessor.cpp
+++ b/flang/lib/Lower/OpenMP/ReductionProcessor.cpp
@@ -12,6 +12,7 @@
 
 #include "ReductionProcessor.h"
 
+#include "PrivateReductionUtils.h"
 #include "flang/Lower/AbstractConverter.h"
 #include "flang/Lower/ConvertType.h"
 #include "flang/Lower/SymbolMap.h"
@@ -294,33 +295,6 @@ mlir::Value ReductionProcessor::createScalarCombiner(
   return reductionOp;
 }
 
-/// Generate a fir::ShapeShift op describing the provided boxed array.
-static fir::ShapeShiftOp getShapeShift(fir::FirOpBuilder &builder,
-                                       mlir::Location loc, mlir::Value box) {
-  fir::SequenceType sequenceType = mlir::cast<fir::SequenceType>(
-      hlfir::getFortranElementOrSequenceType(box.getType()));
-  const unsigned rank = sequenceType.getDimension();
-  llvm::SmallVector<mlir::Value> lbAndExtents;
-  lbAndExtents.reserve(rank * 2);
-
-  mlir::Type idxTy = builder.getIndexType();
-  for (unsigned i = 0; i < rank; ++i) {
-    // TODO: ideally we want to hoist box reads out of the critical section.
-    // We could do this by having box dimensions in block arguments like
-    // OpenACC does
-    mlir::Value dim = builder.createIntegerConstant(loc, idxTy, i);
-    auto dimInfo =
-        builder.create<fir::BoxDimsOp>(loc, idxTy, idxTy, idxTy, box, dim);
-    lbAndExtents.push_back(dimInfo.getLowerBound());
-    lbAndExtents.push_back(dimInfo.getExtent());
-  }
-
-  auto shapeShiftTy = fir::ShapeShiftType::get(builder.getContext(), rank);
-  auto shapeShift =
-      builder.create<fir::ShapeShiftOp>(loc, shapeShiftTy, lbAndExtents);
-  return shapeShift;
-}
-
 /// Create reduction combiner region for reduction variables which are boxed
 /// arrays
 static void genBoxCombiner(fir::FirOpBuilder &builder, mlir::Location loc,
@@ -422,59 +396,6 @@ static void genCombiner(fir::FirOpBuilder &builder, mlir::Location loc,
   TODO(loc, "OpenMP genCombiner for unsupported reduction variable type");
 }
 
-static void
-createReductionCleanupRegion(fir::FirOpBuilder &builder, mlir::Location loc,
-                             mlir::omp::DeclareReductionOp &reductionDecl) {
-  mlir::Type redTy = reductionDecl.getType();
-
-  mlir::Region &cleanupRegion = reductionDecl.getCleanupRegion();
-  assert(cleanupRegion.empty());
-  mlir::Block *block =
-      builder.createBlock(&cleanupRegion, cleanupRegion.end(), {redTy}, {loc});
-  builder.setInsertionPointToEnd(block);
-
-  auto typeError = [loc]() {
-    fir::emitFatalError(loc,
-                        "Attempt to create an omp reduction cleanup region "
-                        "for a type that wasn't allocated",
-                        /*genCrashDiag=*/true);
-  };
-
-  mlir::Type valTy = fir::unwrapRefType(redTy);
-  if (auto boxTy = mlir::dyn_cast_or_null<fir::BaseBoxType>(valTy)) {
-    if (!mlir::isa<fir::HeapType, fir::PointerType>(boxTy.getEleTy())) {
-      mlir::Type innerTy = fir::extractSequenceType(boxTy);
-      if (!mlir::isa<fir::SequenceType>(innerTy))
-        typeError();
-    }
-
-    mlir::Value arg = block->getArgument(0);
-    arg = builder.loadIfRef(loc, arg);
-    assert(mlir::isa<fir::BaseBoxType>(arg.getType()));
-
-    // Deallocate box
-    // The FIR type system doesn't nesecarrily know that this is a mutable box
-    // if we allocated the thread local array on the heap to avoid looped stack
-    // allocations.
-    mlir::Value addr =
-        hlfir::genVariableRawAddress(loc, builder, hlfir::Entity{arg});
-    mlir::Value isAllocated = builder.genIsNotNullAddr(loc, addr);
-    fir::IfOp ifOp =
-        builder.create<fir::IfOp>(loc, isAllocated, /*withElseRegion=*/false);
-    builder.setInsertionPointToStart(&ifOp.getThenRegion().front());
-
-    mlir::Value cast = builder.createConvert(
-        loc, fir::HeapType::get(fir::dyn_cast_ptrEleTy(addr.getType())), addr);
-    builder.create<fir::FreeMemOp>(loc, cast);
-
-    builder.setInsertionPointAfter(ifOp);
-    builder.create<mlir::omp::YieldOp>(loc);
-    return;
-  }
-
-  typeError();
-}
-
 // like fir::unwrapSeqOrBoxedSeqType except it also works for non-sequence boxes
 static mlir::Type unwrapSeqOrBoxedType(mlir::Type ty) {
   if (auto seqTy = mlir::dyn_cast<fir::SequenceType>(ty))
@@ -517,154 +438,31 @@ static void createReductionAllocAndInitRegions(
   mlir::Value initValue = ReductionProcessor::getReductionInitValue(
       loc, unwrapSeqOrBoxedType(ty), redId, builder);
 
+  if (isByRef) {
+    populateByRefInitAndCleanupRegions(builder, loc, type, initValue, initBlock,
+                                       reductionDecl.getInitializerAllocArg(),
+                                       reductionDecl.getInitializerMoldArg(),
+                                       reductionDecl.getCleanupRegion());
+  }
+
   if (fir::isa_trivial(ty)) {
     if (isByRef) {
       // alloc region
-      {
-        builder.setInsertionPointToEnd(allocBlock);
-        mlir::Value alloca = builder.create<fir::AllocaOp>(loc, ty);
-        yield(alloca);
-      }
-
-      // init region
-      {
-        builder.setInsertionPointToEnd(initBlock);
-        // block arg is mapped to the alloca yielded from the alloc region
-        mlir::Value alloc = reductionDecl.getInitializerAllocArg();
-        builder.createStoreWithConvert(loc, initValue, alloc);
-        yield(alloc);
-      }
+      builder.setInsertionPointToEnd(allocBlock);
+      mlir::Value alloca = builder.create<fir::AllocaOp>(loc, ty);
+      yield(alloca);
       return;
     }
     // by val
     yield(initValue);
     return;
   }
+  assert(isByRef && "passing non-trivial types by val is unsupported");
 
-  // check if an allocatable box is unallocated. If so, initialize the boxAlloca
-  // to be unallocated e.g.
-  // %box_alloca = fir.alloca !fir.box<!fir.heap<...>>
-  // %addr = fir.box_addr %box
-  // if (%addr == 0) {
-  //   %nullbox = fir.embox %addr
-  //   fir.store %nullbox to %box_alloca
-  // } else {
-  //   // ...
-  //   fir.store %something to %box_alloca
-  // }
-  // omp.yield %box_alloca
-  mlir::Value moldArg =
-      builder.loadIfRef(loc, reductionDecl.getInitializerMoldArg());
-  auto handleNullAllocatable = [&](mlir::Value boxAlloca) -> fir::IfOp {
-    mlir::Value addr = builder.create<fir::BoxAddrOp>(loc, moldArg);
-    mlir::Value isNotAllocated = builder.genIsNullAddr(loc, addr);
-    fir::IfOp ifOp = builder.create<fir::IfOp>(loc, isNotAllocated,
-                                               /*withElseRegion=*/true);
-    builder.setInsertionPointToStart(&ifOp.getThenRegion().front());
-    // just embox the null address and return
-    mlir::Value nullBox = builder.create<fir::EmboxOp>(loc, ty, addr);
-    builder.create<fir::StoreOp>(loc, nullBox, boxAlloca);
-    return ifOp;
-  };
-
-  // all arrays are boxed
-  if (auto boxTy = mlir::dyn_cast_or_null<fir::BaseBoxType>(ty)) {
-    assert(isByRef && "passing boxes by value is unsupported");
-    bool isAllocatableOrPointer =
-        mlir::isa<fir::HeapType, fir::PointerType>(boxTy.getEleTy());
-
-    // alloc region
-    {
-      builder.setInsertionPointToEnd(allocBlock);
-      mlir::Value boxAlloca = builder.create<fir::AllocaOp>(loc, ty);
-      yield(boxAlloca);
-    }
-
-    // init region
-    builder.setInsertionPointToEnd(initBlock);
-    mlir::Value boxAlloca = reductionDecl.getInitializerAllocArg();
-    mlir::Type innerTy = fir::unwrapRefType(boxTy.getEleTy());
-    if (fir::isa_trivial(innerTy)) {
-      // boxed non-sequence value e.g. !fir.box<!fir.heap<i32>>
-      if (!isAllocatableOrPointer)
-        TODO(loc, "Reduction of non-allocatable trivial typed box");
-
-      fir::IfOp ifUnallocated = handleNullAllocatable(boxAlloca);
-
-      builder.setInsertionPointToStart(&ifUnallocated.getElseRegion().front());
-      mlir::Value valAlloc = builder.create<fir::AllocMemOp>(loc, innerTy);
-      builder.createStoreWithConvert(loc, initValue, valAlloc);
-      mlir::Value box = builder.create<fir::EmboxOp>(loc, ty, valAlloc);
-      builder.create<fir::StoreOp>(loc, box, boxAlloca);
-
-      auto insPt = builder.saveInsertionPoint();
-      createReductionCleanupRegion(builder, loc, reductionDecl);
-      builder.restoreInsertionPoint(insPt);
-      builder.setInsertionPointAfter(ifUnallocated);
-      yield(boxAlloca);
-      return;
-    }
-    innerTy = fir::extractSequenceType(boxTy);
-    if (!mlir::isa<fir::SequenceType>(innerTy))
-      TODO(loc, "Unsupported boxed type for reduction");
-
-    fir::IfOp ifUnallocated{nullptr};
-    if (isAllocatableOrPointer) {
-      ifUnallocated = handleNullAllocatable(boxAlloca);
-      builder.setInsertionPointToStart(&ifUnallocated.getElseRegion().front());
-    }
-
-    // Create the private copy from the initial fir.box:
-    mlir::Value loadedBox = builder.loadIfRef(loc, moldArg);
-    hlfir::Entity source = hlfir::Entity{loadedBox};
-
-    // Allocating on the heap in case the whole reduction is nested inside of a
-    // loop
-    // TODO: compare performance here to using allocas - this could be made to
-    // work by inserting stacksave/stackrestore around the reduction in
-    // openmpirbuilder
-    auto [temp, needsDealloc] = createTempFromMold(loc, builder, source);
-    // if needsDealloc isn't statically false, add cleanup region. Always
-    // do this for allocatable boxes because they might have been re-allocated
-    // in the body of the loop/parallel region
-
-    std::optional<int64_t> cstNeedsDealloc =
-        fir::getIntIfConstant(needsDealloc);
-    assert(cstNeedsDealloc.has_value() &&
-           "createTempFromMold decides this statically");
-    if (cstNeedsDealloc.has_value() && *cstNeedsDealloc != false) {
-      mlir::OpBuilder::InsertionGuard guard(builder);
-      createReductionCleanupRegion(builder, loc, reductionDecl);
-    } else {
-      assert(!isAllocatableOrPointer &&
-             "Pointer-like arrays must be heap allocated");
-    }
-
-    // Put the temporary inside of a box:
-    // hlfir::genVariableBox doesn't handle non-default lower bounds
-    mlir::Value box;
-    fir::ShapeShiftOp shapeShift = getShapeShift(builder, loc, loadedBox);
-    mlir::Type boxType = loadedBox.getType();
-    if (mlir::isa<fir::BaseBoxType>(temp.getType()))
-      // the box created by the declare form createTempFromMold is missing lower
-      // bounds info
-      box = builder.create<fir::ReboxOp>(loc, boxType, temp, shapeShift,
-                                         /*shift=*/mlir::Value{});
-    else
-      box = builder.create<fir::EmboxOp>(
-          loc, boxType, temp, shapeShift,
-          /*slice=*/mlir::Value{},
-          /*typeParams=*/llvm::ArrayRef<mlir::Value>{});
-
-    builder.create<hlfir::AssignOp>(loc, initValue, box);
-    builder.create<fir::StoreOp>(loc, box, boxAlloca);
-    if (ifUnallocated)
-      builder.setInsertionPointAfter(ifUnallocated);
-    yield(boxAlloca);
-    return;
-  }
-
-  TODO(loc, "createReductionInitRegion for unsupported type");
+  // alloc region
+  builder.setInsertionPointToEnd(allocBlock);
+  mlir::Value boxAlloca = builder.create<fir::AllocaOp>(loc, ty);
+  yield(boxAlloca);
 }
 
 mlir::omp::DeclareReductionOp ReductionProcessor::createDeclareReduction(
diff --git a/flang/lib/Optimizer/Analysis/AliasAnalysis.cpp b/flang/lib/Optimizer/Analysis/AliasAnalysis.cpp
index 611f212..e33d8fa 100644
--- a/flang/lib/Optimizer/Analysis/AliasAnalysis.cpp
+++ b/flang/lib/Optimizer/Analysis/AliasAnalysis.cpp
@@ -505,30 +505,17 @@ getAttrsFromVariable(fir::FortranVariableOpInterface var) {
 }
 
 template <typename OMPTypeOp, typename DeclTypeOp>
-static Value getPrivateArg(omp::BlockArgOpenMPOpInterface &argIface,
-                           OMPTypeOp &op, DeclTypeOp &declOp) {
-  Value privateArg;
+static bool isPrivateArg(omp::BlockArgOpenMPOpInterface &argIface,
+                         OMPTypeOp &op, DeclTypeOp &declOp) {
   if (!op.getPrivateSyms().has_value())
-    return privateArg;
+    return false;
   for (auto [opSym, blockArg] :
        llvm::zip_equal(*op.getPrivateSyms(), argIface.getPrivateBlockArgs())) {
     if (blockArg == declOp.getMemref()) {
-      omp::PrivateClauseOp privateOp =
-          SymbolTable::lookupNearestSymbolFrom<omp::PrivateClauseOp>(
-              op, cast<SymbolRefAttr>(opSym));
-      privateOp.walk([&](omp::YieldOp yieldOp) {
-        // TODO Extend alias analysis if omp.yield points to
-        // block argument value
-        if (!yieldOp.getResults()[0].getDefiningOp())
-          return;
-        llvm::TypeSwitch<Operation *>(yieldOp.getResults()[0].getDefiningOp())
-            .template Case<fir::DeclareOp, hlfir::DeclareOp>(
-                [&](auto declOp) { privateArg = declOp.getMemref(); });
-      });
-      return privateArg;
+      return true;
     }
   }
-  return privateArg;
+  return false;
 }
 
 AliasAnalysis::Source AliasAnalysis::getSource(mlir::Value v,
@@ -631,6 +618,7 @@ AliasAnalysis::Source AliasAnalysis::getSource(mlir::Value v,
           breakFromLoop = true;
         })
         .Case<hlfir::DeclareOp, fir::DeclareOp>([&](auto op) {
+          bool isPrivateItem = false;
           if (omp::BlockArgOpenMPOpInterface argIface =
                   dyn_cast<omp::BlockArgOpenMPOpInterface>(op->getParentOp())) {
             Value ompValArg;
@@ -644,19 +632,18 @@ AliasAnalysis::Source AliasAnalysis::getSource(mlir::Value v,
                       omp::MapInfoOp mapInfo =
                           llvm::cast<omp::MapInfoOp>(opArg.getDefiningOp());
                       ompValArg = mapInfo.getVarPtr();
-                      break;
+                      return;
                     }
                   }
                   // If given operation does not reflect mapping item,
                   // check private clause
-                  if (!ompValArg)
-                    ompValArg = getPrivateArg(argIface, targetOp, op);
+                  isPrivateItem = isPrivateArg(argIface, targetOp, op);
                 })
                 .template Case<omp::DistributeOp, omp::ParallelOp,
                                omp::SectionsOp, omp::SimdOp, omp::SingleOp,
                                omp::TaskloopOp, omp::TaskOp, omp::WsloopOp>(
                     [&](auto privateOp) {
-                      ompValArg = getPrivateArg(argIface, privateOp, op);
+                      isPrivateItem = isPrivateArg(argIface, privateOp, op);
                     });
             if (ompValArg) {
               v = ompValArg;
@@ -706,6 +693,11 @@ AliasAnalysis::Source AliasAnalysis::getSource(mlir::Value v,
           } else {
             instantiationPoint = op;
           }
+          if (isPrivateItem) {
+            type = SourceKind::Allocate;
+            breakFromLoop = true;
+            return;
+          }
           // TODO: Look for the fortran attributes present on the operation
           // Track further through the operand
           v = op.getMemref();
diff --git a/flang/lib/Optimizer/Builder/CMakeLists.txt b/flang/lib/Optimizer/Builder/CMakeLists.txt
index 05164d4..0960e85 100644
--- a/flang/lib/Optimizer/Builder/CMakeLists.txt
+++ b/flang/lib/Optimizer/Builder/CMakeLists.txt
@@ -5,6 +5,7 @@ add_flang_library(FIRBuilder
   BoxValue.cpp
   Character.cpp
   Complex.cpp
+  CUFCommon.cpp
   DoLoopHelper.cpp
   FIRBuilder.cpp
   HLFIRTools.cpp
@@ -48,6 +49,7 @@ add_flang_library(FIRBuilder
   FIRDialect
   FIRDialectSupport
   FIRSupport
+  FortranEvaluate
   HLFIRDialect
   ${dialect_libs}
   ${extension_libs}
diff --git a/flang/lib/Optimizer/Transforms/CUFCommon.cpp b/flang/lib/Optimizer/Builder/CUFCommon.cpp
index bbe3321..3984820 100644
--- a/flang/lib/Optimizer/Transforms/CUFCommon.cpp
+++ b/flang/lib/Optimizer/Builder/CUFCommon.cpp
@@ -6,8 +6,10 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "flang/Optimizer/Transforms/CUFCommon.h"
+#include "flang/Optimizer/Builder/CUFCommon.h"
+#include "flang/Optimizer/Builder/FIRBuilder.h"
 #include "flang/Optimizer/Dialect/CUF/CUFOps.h"
+#include "flang/Optimizer/HLFIR/HLFIROps.h"
 #include "mlir/Dialect/Func/IR/FuncOps.h"
 #include "mlir/Dialect/LLVMIR/NVVMDialect.h"
 
@@ -54,3 +56,18 @@ bool cuf::isRegisteredDeviceGlobal(fir::GlobalOp op) {
     return true;
   return false;
 }
+
+void cuf::genPointerSync(const mlir::Value box, fir::FirOpBuilder &builder) {
+  if (auto declareOp = box.getDefiningOp<hlfir::DeclareOp>()) {
+    if (auto addrOfOp = declareOp.getMemref().getDefiningOp<fir::AddrOfOp>()) {
+      auto mod = addrOfOp->getParentOfType<mlir::ModuleOp>();
+      if (auto globalOp =
+              mod.lookupSymbol<fir::GlobalOp>(addrOfOp.getSymbol())) {
+        if (cuf::isRegisteredDeviceGlobal(globalOp)) {
+          builder.create<cuf::SyncDescriptorOp>(box.getLoc(),
+                                                addrOfOp.getSymbol());
+        }
+      }
+    }
+  }
+}
diff --git a/flang/lib/Optimizer/Builder/FIRBuilder.cpp b/flang/lib/Optimizer/Builder/FIRBuilder.cpp
index 3a39c45..d01becf 100644
--- a/flang/lib/Optimizer/Builder/FIRBuilder.cpp
+++ b/flang/lib/Optimizer/Builder/FIRBuilder.cpp
@@ -1626,6 +1626,25 @@ mlir::Value fir::factory::genCPtrOrCFunptrAddr(fir::FirOpBuilder &builder,
                                            cPtr, addrFieldIndex);
 }
 
+mlir::Value fir::factory::genCDevPtrAddr(fir::FirOpBuilder &builder,
+                                         mlir::Location loc,
+                                         mlir::Value cDevPtr, mlir::Type ty) {
+  auto recTy = mlir::cast<fir::RecordType>(ty);
+  assert(recTy.getTypeList().size() == 1);
+  auto cptrFieldName = recTy.getTypeList()[0].first;
+  mlir::Type cptrFieldTy = recTy.getTypeList()[0].second;
+  auto fieldIndexType = fir::FieldType::get(ty.getContext());
+  mlir::Value cptrFieldIndex = builder.create<fir::FieldIndexOp>(
+      loc, fieldIndexType, cptrFieldName, recTy,
+      /*typeParams=*/mlir::ValueRange{});
+  auto cptrCoord = builder.create<fir::CoordinateOp>(
+      loc, builder.getRefType(cptrFieldTy), cDevPtr, cptrFieldIndex);
+  auto [addrFieldIndex, addrFieldTy] =
+      genCPtrOrCFunptrFieldIndex(builder, loc, cptrFieldTy);
+  return builder.create<fir::CoordinateOp>(loc, builder.getRefType(addrFieldTy),
+                                           cptrCoord, addrFieldIndex);
+}
+
 mlir::Value fir::factory::genCPtrOrCFunptrValue(fir::FirOpBuilder &builder,
                                                 mlir::Location loc,
                                                 mlir::Value cPtr) {
diff --git a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp
index 9a37779..cb0af39 100644
--- a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp
+++ b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp
@@ -167,6 +167,7 @@ static constexpr IntrinsicHandler handlers[]{
      &I::genCAssociatedCPtr,
      {{{"c_ptr_1", asAddr}, {"c_ptr_2", asAddr, handleDynamicOptional}}},
      /*isElemental=*/false},
+    {"c_devloc", &I::genCDevLoc, {{{"x", asBox}}}, /*isElemental=*/false},
     {"c_f_pointer",
      &I::genCFPointer,
      {{{"cptr", asValue},
@@ -2867,11 +2868,14 @@ static mlir::Value getAddrFromBox(fir::FirOpBuilder &builder,
 static fir::ExtendedValue
 genCLocOrCFunLoc(fir::FirOpBuilder &builder, mlir::Location loc,
                  mlir::Type resultType, llvm::ArrayRef<fir::ExtendedValue> args,
-                 bool isFunc = false) {
+                 bool isFunc = false, bool isDevLoc = false) {
   assert(args.size() == 1);
   mlir::Value res = builder.create<fir::AllocaOp>(loc, resultType);
-  mlir::Value resAddr =
-      fir::factory::genCPtrOrCFunptrAddr(builder, loc, res, resultType);
+  mlir::Value resAddr;
+  if (isDevLoc)
+    resAddr = fir::factory::genCDevPtrAddr(builder, loc, res, resultType);
+  else
+    resAddr = fir::factory::genCPtrOrCFunptrAddr(builder, loc, res, resultType);
   assert(fir::isa_box_type(fir::getBase(args[0]).getType()) &&
          "argument must have been lowered to box type");
   mlir::Value argAddr = getAddrFromBox(builder, loc, args[0], isFunc);
@@ -2928,6 +2932,14 @@ IntrinsicLibrary::genCAssociatedCPtr(mlir::Type resultType,
   return genCAssociated(builder, loc, resultType, args);
 }
 
+// C_DEVLOC
+fir::ExtendedValue
+IntrinsicLibrary::genCDevLoc(mlir::Type resultType,
+                             llvm::ArrayRef<fir::ExtendedValue> args) {
+  return genCLocOrCFunLoc(builder, loc, resultType, args, /*isFunc=*/false,
+                          /*isDevLoc=*/true);
+}
+
 // C_F_POINTER
 void IntrinsicLibrary::genCFPointer(llvm::ArrayRef<fir::ExtendedValue> args) {
   assert(args.size() == 3);
diff --git a/flang/lib/Optimizer/Builder/Runtime/Allocatable.cpp b/flang/lib/Optimizer/Builder/Runtime/Allocatable.cpp
index 28452d3..70a88ff 100644
--- a/flang/lib/Optimizer/Builder/Runtime/Allocatable.cpp
+++ b/flang/lib/Optimizer/Builder/Runtime/Allocatable.cpp
@@ -76,19 +76,16 @@ void fir::runtime::genAllocatableAllocate(fir::FirOpBuilder &builder,
   mlir::func::FuncOp func{
       fir::runtime::getRuntimeFunc<mkRTKey(AllocatableAllocate)>(loc, builder)};
   mlir::FunctionType fTy{func.getFunctionType()};
-  mlir::Value asyncId =
-      builder.createIntegerConstant(loc, builder.getI64Type(), -1);
   mlir::Value sourceFile{fir::factory::locationToFilename(builder, loc)};
   mlir::Value sourceLine{
-      fir::factory::locationToLineNo(builder, loc, fTy.getInput(5))};
+      fir::factory::locationToLineNo(builder, loc, fTy.getInput(4))};
   if (!hasStat)
     hasStat = builder.createBool(loc, false);
   if (!errMsg) {
     mlir::Type boxNoneTy = fir::BoxType::get(builder.getNoneType());
     errMsg = builder.create<fir::AbsentOp>(loc, boxNoneTy).getResult();
   }
-  llvm::SmallVector<mlir::Value> args{
-      fir::runtime::createArguments(builder, loc, fTy, desc, asyncId, hasStat,
-                                    errMsg, sourceFile, sourceLine)};
+  llvm::SmallVector<mlir::Value> args{fir::runtime::createArguments(
+      builder, loc, fTy, desc, hasStat, errMsg, sourceFile, sourceLine)};
   builder.create<fir::CallOp>(loc, func, args);
 }
diff --git a/flang/lib/Optimizer/CodeGen/BoxedProcedure.cpp b/flang/lib/Optimizer/CodeGen/BoxedProcedure.cpp
index 1bb91d2..104ae74 100644
--- a/flang/lib/Optimizer/CodeGen/BoxedProcedure.cpp
+++ b/flang/lib/Optimizer/CodeGen/BoxedProcedure.cpp
@@ -172,7 +172,6 @@ public:
     addConversion([&](TypeDescType ty) {
       return TypeDescType::get(convertType(ty.getOfTy()));
     });
-    addArgumentMaterialization(materializeProcedure);
     addSourceMaterialization(materializeProcedure);
     addTargetMaterialization(materializeProcedure);
   }
diff --git a/flang/lib/Optimizer/CodeGen/CodeGen.cpp b/flang/lib/Optimizer/CodeGen/CodeGen.cpp
index 926f83b9..5ba93fe 100644
--- a/flang/lib/Optimizer/CodeGen/CodeGen.cpp
+++ b/flang/lib/Optimizer/CodeGen/CodeGen.cpp
@@ -3087,7 +3087,7 @@ struct GlobalOpConversion : public fir::FIROpConversion<fir::GlobalOp> {
 private:
   static void addComdat(mlir::LLVM::GlobalOp &global,
                         mlir::ConversionPatternRewriter &rewriter,
-                        mlir::ModuleOp &module) {
+                        mlir::ModuleOp module) {
     const char *comdatName = "__llvm_comdat";
     mlir::LLVM::ComdatOp comdatOp =
         module.lookupSymbol<mlir::LLVM::ComdatOp>(comdatName);
@@ -3928,6 +3928,7 @@ public:
     mlir::arith::populateArithToLLVMConversionPatterns(typeConverter, pattern);
     mlir::cf::populateControlFlowToLLVMConversionPatterns(typeConverter,
                                                           pattern);
+    mlir::cf::populateAssertToLLVMConversionPattern(typeConverter, pattern);
     // Math operations that have not been converted yet must be converted
     // to Libm.
     if (!isAMDGCN)
diff --git a/flang/lib/Optimizer/Dialect/FIROps.cpp b/flang/lib/Optimizer/Dialect/FIROps.cpp
index cdcf9bd..fa83aa3 100644
--- a/flang/lib/Optimizer/Dialect/FIROps.cpp
+++ b/flang/lib/Optimizer/Dialect/FIROps.cpp
@@ -1313,7 +1313,8 @@ void fir::ConvertOp::getCanonicalizationPatterns(
   results.insert<ConvertConvertOptPattern, ConvertAscendingIndexOptPattern,
                  ConvertDescendingIndexOptPattern, RedundantConvertOptPattern,
                  CombineConvertOptPattern, CombineConvertTruncOptPattern,
-                 ForwardConstantConvertPattern>(context);
+                 ForwardConstantConvertPattern, ChainedPointerConvertsPattern>(
+      context);
 }
 
 mlir::OpFoldResult fir::ConvertOp::fold(FoldAdaptor adaptor) {
diff --git a/flang/lib/Optimizer/HLFIR/Transforms/CMakeLists.txt b/flang/lib/Optimizer/HLFIR/Transforms/CMakeLists.txt
index d18df2e..25a5322 100644
--- a/flang/lib/Optimizer/HLFIR/Transforms/CMakeLists.txt
+++ b/flang/lib/Optimizer/HLFIR/Transforms/CMakeLists.txt
@@ -4,6 +4,7 @@ add_flang_library(HLFIRTransforms
   BufferizeHLFIR.cpp
   ConvertToFIR.cpp
   InlineElementals.cpp
+  InlineHLFIRAssign.cpp
   LowerHLFIRIntrinsics.cpp
   LowerHLFIROrderedAssignments.cpp
   ScheduleOrderedAssignments.cpp
diff --git a/flang/lib/Optimizer/HLFIR/Transforms/InlineHLFIRAssign.cpp b/flang/lib/Optimizer/HLFIR/Transforms/InlineHLFIRAssign.cpp
new file mode 100644
index 0000000..249976d
--- /dev/null
+++ b/flang/lib/Optimizer/HLFIR/Transforms/InlineHLFIRAssign.cpp
@@ -0,0 +1,152 @@
+//===- InlineHLFIRAssign.cpp - Inline hlfir.assign ops --------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+// Transform hlfir.assign array operations into loop nests performing element
+// per element assignments. The inlining is done for trivial data types always,
+// though, we may add performance/code-size heuristics in future.
+//===----------------------------------------------------------------------===//
+
+#include "flang/Optimizer/Analysis/AliasAnalysis.h"
+#include "flang/Optimizer/Builder/FIRBuilder.h"
+#include "flang/Optimizer/Builder/HLFIRTools.h"
+#include "flang/Optimizer/HLFIR/HLFIROps.h"
+#include "flang/Optimizer/HLFIR/Passes.h"
+#include "flang/Optimizer/OpenMP/Passes.h"
+#include "mlir/IR/PatternMatch.h"
+#include "mlir/Pass/Pass.h"
+#include "mlir/Support/LLVM.h"
+#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
+
+namespace hlfir {
+#define GEN_PASS_DEF_INLINEHLFIRASSIGN
+#include "flang/Optimizer/HLFIR/Passes.h.inc"
+} // namespace hlfir
+
+#define DEBUG_TYPE "inline-hlfir-assign"
+
+namespace {
+/// Expand hlfir.assign of array RHS to array LHS into a loop nest
+/// of element-by-element assignments:
+///   hlfir.assign %4 to %5 : !fir.ref<!fir.array<3x3xf32>>,
+///                           !fir.ref<!fir.array<3x3xf32>>
+/// into:
+///   fir.do_loop %arg1 = %c1 to %c3 step %c1 unordered {
+///     fir.do_loop %arg2 = %c1 to %c3 step %c1 unordered {
+///       %6 = hlfir.designate %4 (%arg2, %arg1)  :
+///           (!fir.ref<!fir.array<3x3xf32>>, index, index) -> !fir.ref<f32>
+///       %7 = fir.load %6 : !fir.ref<f32>
+///       %8 = hlfir.designate %5 (%arg2, %arg1)  :
+///           (!fir.ref<!fir.array<3x3xf32>>, index, index) -> !fir.ref<f32>
+///       hlfir.assign %7 to %8 : f32, !fir.ref<f32>
+///     }
+///   }
+///
+/// The transformation is correct only when LHS and RHS do not alias.
+/// When RHS is an array expression, then there is no aliasing.
+/// This transformation does not support runtime checking for
+/// non-conforming LHS/RHS arrays' shapes currently.
+class InlineHLFIRAssignConversion
+    : public mlir::OpRewritePattern<hlfir::AssignOp> {
+public:
+  using mlir::OpRewritePattern<hlfir::AssignOp>::OpRewritePattern;
+
+  llvm::LogicalResult
+  matchAndRewrite(hlfir::AssignOp assign,
+                  mlir::PatternRewriter &rewriter) const override {
+    if (assign.isAllocatableAssignment())
+      return rewriter.notifyMatchFailure(assign,
+                                         "AssignOp may imply allocation");
+
+    hlfir::Entity rhs{assign.getRhs()};
+
+    if (!rhs.isArray())
+      return rewriter.notifyMatchFailure(assign,
+                                         "AssignOp's RHS is not an array");
+
+    mlir::Type rhsEleTy = rhs.getFortranElementType();
+    if (!fir::isa_trivial(rhsEleTy))
+      return rewriter.notifyMatchFailure(
+          assign, "AssignOp's RHS data type is not trivial");
+
+    hlfir::Entity lhs{assign.getLhs()};
+    if (!lhs.isArray())
+      return rewriter.notifyMatchFailure(assign,
+                                         "AssignOp's LHS is not an array");
+
+    mlir::Type lhsEleTy = lhs.getFortranElementType();
+    if (!fir::isa_trivial(lhsEleTy))
+      return rewriter.notifyMatchFailure(
+          assign, "AssignOp's LHS data type is not trivial");
+
+    if (lhsEleTy != rhsEleTy)
+      return rewriter.notifyMatchFailure(assign,
+                                         "RHS/LHS element types mismatch");
+
+    if (!mlir::isa<hlfir::ExprType>(rhs.getType())) {
+      // If RHS is not an hlfir.expr, then we should prove that
+      // LHS and RHS do not alias.
+      // TODO: if they may alias, we can insert hlfir.as_expr for RHS,
+      // and proceed with the inlining.
+      fir::AliasAnalysis aliasAnalysis;
+      mlir::AliasResult aliasRes = aliasAnalysis.alias(lhs, rhs);
+      // TODO: use areIdenticalOrDisjointSlices() from
+      // OptimizedBufferization.cpp to check if we can still do the expansion.
+      if (!aliasRes.isNo()) {
+        LLVM_DEBUG(llvm::dbgs() << "InlineHLFIRAssign:\n"
+                                << "\tLHS: " << lhs << "\n"
+                                << "\tRHS: " << rhs << "\n"
+                                << "\tALIAS: " << aliasRes << "\n");
+        return rewriter.notifyMatchFailure(assign, "RHS/LHS may alias");
+      }
+    }
+
+    mlir::Location loc = assign->getLoc();
+    fir::FirOpBuilder builder(rewriter, assign.getOperation());
+    builder.setInsertionPoint(assign);
+    rhs = hlfir::derefPointersAndAllocatables(loc, builder, rhs);
+    lhs = hlfir::derefPointersAndAllocatables(loc, builder, lhs);
+    mlir::Value shape = hlfir::genShape(loc, builder, lhs);
+    llvm::SmallVector<mlir::Value> extents =
+        hlfir::getIndexExtents(loc, builder, shape);
+    hlfir::LoopNest loopNest =
+        hlfir::genLoopNest(loc, builder, extents, /*isUnordered=*/true,
+                           flangomp::shouldUseWorkshareLowering(assign));
+    builder.setInsertionPointToStart(loopNest.body);
+    auto rhsArrayElement =
+        hlfir::getElementAt(loc, builder, rhs, loopNest.oneBasedIndices);
+    rhsArrayElement = hlfir::loadTrivialScalar(loc, builder, rhsArrayElement);
+    auto lhsArrayElement =
+        hlfir::getElementAt(loc, builder, lhs, loopNest.oneBasedIndices);
+    builder.create<hlfir::AssignOp>(loc, rhsArrayElement, lhsArrayElement);
+    rewriter.eraseOp(assign);
+    return mlir::success();
+  }
+};
+
+class InlineHLFIRAssignPass
+    : public hlfir::impl::InlineHLFIRAssignBase<InlineHLFIRAssignPass> {
+public:
+  void runOnOperation() override {
+    mlir::MLIRContext *context = &getContext();
+
+    mlir::GreedyRewriteConfig config;
+    // Prevent the pattern driver from merging blocks.
+    config.enableRegionSimplification =
+        mlir::GreedySimplifyRegionLevel::Disabled;
+
+    mlir::RewritePatternSet patterns(context);
+    patterns.insert<InlineHLFIRAssignConversion>(context);
+
+    if (mlir::failed(mlir::applyPatternsGreedily(
+            getOperation(), std::move(patterns), config))) {
+      mlir::emitError(getOperation()->getLoc(),
+                      "failure in hlfir.assign inlining");
+      signalPassFailure();
+    }
+  }
+};
+} // namespace
diff --git a/flang/lib/Optimizer/HLFIR/Transforms/OptimizedBufferization.cpp b/flang/lib/Optimizer/HLFIR/Transforms/OptimizedBufferization.cpp
index bf3cf86..0cfefc2 100644
--- a/flang/lib/Optimizer/HLFIR/Transforms/OptimizedBufferization.cpp
+++ b/flang/lib/Optimizer/HLFIR/Transforms/OptimizedBufferization.cpp
@@ -87,6 +87,13 @@ private:
   /// determines if the transformation can be applied to this elemental
   static std::optional<MatchInfo> findMatch(hlfir::ElementalOp elemental);
 
+  /// Returns the array indices for the given hlfir.designate.
+  /// It recognizes the computations used to transform the one-based indices
+  /// into the array's lb-based indices, and returns the one-based indices
+  /// in these cases.
+  static llvm::SmallVector<mlir::Value>
+  getDesignatorIndices(hlfir::DesignateOp designate);
+
 public:
   using mlir::OpRewritePattern<hlfir::ElementalOp>::OpRewritePattern;
 
@@ -430,6 +437,73 @@ bool ArraySectionAnalyzer::isLess(mlir::Value v1, mlir::Value v2) {
   return false;
 }
 
+llvm::SmallVector<mlir::Value>
+ElementalAssignBufferization::getDesignatorIndices(
+    hlfir::DesignateOp designate) {
+  mlir::Value memref = designate.getMemref();
+
+  // If the object is a box, then the indices may be adjusted
+  // according to the box's lower bound(s). Scan through
+  // the computations to try to find the one-based indices.
+  if (mlir::isa<fir::BaseBoxType>(memref.getType())) {
+    // Look for the following pattern:
+    //   %13 = fir.load %12 : !fir.ref<!fir.box<...>
+    //   %14:3 = fir.box_dims %13, %c0 : (!fir.box<...>, index) -> ...
+    //   %17 = arith.subi %14#0, %c1 : index
+    //   %18 = arith.addi %arg2, %17 : index
+    //   %19 = hlfir.designate %13 (%18)  : (!fir.box<...>, index) -> ...
+    //
+    // %arg2 is a one-based index.
+
+    auto isNormalizedLb = [memref](mlir::Value v, unsigned dim) {
+      // Return true, if v and dim are such that:
+      //   %14:3 = fir.box_dims %13, %dim : (!fir.box<...>, index) -> ...
+      //   %17 = arith.subi %14#0, %c1 : index
+      //   %19 = hlfir.designate %13 (...)  : (!fir.box<...>, index) -> ...
+      if (auto subOp =
+              mlir::dyn_cast_or_null<mlir::arith::SubIOp>(v.getDefiningOp())) {
+        auto cst = fir::getIntIfConstant(subOp.getRhs());
+        if (!cst || *cst != 1)
+          return false;
+        if (auto dimsOp = mlir::dyn_cast_or_null<fir::BoxDimsOp>(
+                subOp.getLhs().getDefiningOp())) {
+          if (memref != dimsOp.getVal() ||
+              dimsOp.getResult(0) != subOp.getLhs())
+            return false;
+          auto dimsOpDim = fir::getIntIfConstant(dimsOp.getDim());
+          return dimsOpDim && dimsOpDim == dim;
+        }
+      }
+      return false;
+    };
+
+    llvm::SmallVector<mlir::Value> newIndices;
+    for (auto index : llvm::enumerate(designate.getIndices())) {
+      if (auto addOp = mlir::dyn_cast_or_null<mlir::arith::AddIOp>(
+              index.value().getDefiningOp())) {
+        for (unsigned opNum = 0; opNum < 2; ++opNum)
+          if (isNormalizedLb(addOp->getOperand(opNum), index.index())) {
+            newIndices.push_back(addOp->getOperand((opNum + 1) % 2));
+            break;
+          }
+
+        // If new one-based index was not added, exit early.
+        if (newIndices.size() <= index.index())
+          break;
+      }
+    }
+
+    // If any of the indices is not adjusted to the array's lb,
+    // then return the original designator indices.
+    if (newIndices.size() != designate.getIndices().size())
+      return designate.getIndices();
+
+    return newIndices;
+  }
+
+  return designate.getIndices();
+}
+
 std::optional<ElementalAssignBufferization::MatchInfo>
 ElementalAssignBufferization::findMatch(hlfir::ElementalOp elemental) {
   mlir::Operation::user_range users = elemental->getUsers();
@@ -557,7 +631,7 @@ ElementalAssignBufferization::findMatch(hlfir::ElementalOp elemental) {
                                   << " at " << elemental.getLoc() << "\n");
           return std::nullopt;
         }
-        auto indices = designate.getIndices();
+        auto indices = getDesignatorIndices(designate);
         auto elementalIndices = elemental.getIndices();
         if (indices.size() == elementalIndices.size() &&
             std::equal(indices.begin(), indices.end(), elementalIndices.begin(),
@@ -698,108 +772,6 @@ llvm::LogicalResult BroadcastAssignBufferization::matchAndRewrite(
   return mlir::success();
 }
 
-/// Expand hlfir.assign of array RHS to array LHS into a loop nest
-/// of element-by-element assignments:
-///   hlfir.assign %4 to %5 : !fir.ref<!fir.array<3x3xf32>>,
-///                           !fir.ref<!fir.array<3x3xf32>>
-/// into:
-///   fir.do_loop %arg1 = %c1 to %c3 step %c1 unordered {
-///     fir.do_loop %arg2 = %c1 to %c3 step %c1 unordered {
-///       %6 = hlfir.designate %4 (%arg2, %arg1)  :
-///           (!fir.ref<!fir.array<3x3xf32>>, index, index) -> !fir.ref<f32>
-///       %7 = fir.load %6 : !fir.ref<f32>
-///       %8 = hlfir.designate %5 (%arg2, %arg1)  :
-///           (!fir.ref<!fir.array<3x3xf32>>, index, index) -> !fir.ref<f32>
-///       hlfir.assign %7 to %8 : f32, !fir.ref<f32>
-///     }
-///   }
-///
-/// The transformation is correct only when LHS and RHS do not alias.
-/// This transformation does not support runtime checking for
-/// non-conforming LHS/RHS arrays' shapes currently.
-class VariableAssignBufferization
-    : public mlir::OpRewritePattern<hlfir::AssignOp> {
-private:
-public:
-  using mlir::OpRewritePattern<hlfir::AssignOp>::OpRewritePattern;
-
-  llvm::LogicalResult
-  matchAndRewrite(hlfir::AssignOp assign,
-                  mlir::PatternRewriter &rewriter) const override;
-};
-
-llvm::LogicalResult VariableAssignBufferization::matchAndRewrite(
-    hlfir::AssignOp assign, mlir::PatternRewriter &rewriter) const {
-  if (assign.isAllocatableAssignment())
-    return rewriter.notifyMatchFailure(assign, "AssignOp may imply allocation");
-
-  hlfir::Entity rhs{assign.getRhs()};
-
-  // To avoid conflicts with ElementalAssignBufferization pattern, we avoid
-  // matching RHS when it is an `ExprType` defined by an `ElementalOp`; which is
-  // among the main criteria matched by ElementalAssignBufferization.
-  if (mlir::isa<hlfir::ExprType>(rhs.getType()) &&
-      mlir::isa<hlfir::ElementalOp>(rhs.getDefiningOp()))
-    return rewriter.notifyMatchFailure(
-        assign, "RHS is an ExprType defined by ElementalOp");
-
-  if (!rhs.isArray())
-    return rewriter.notifyMatchFailure(assign,
-                                       "AssignOp's RHS is not an array");
-
-  mlir::Type rhsEleTy = rhs.getFortranElementType();
-  if (!fir::isa_trivial(rhsEleTy))
-    return rewriter.notifyMatchFailure(
-        assign, "AssignOp's RHS data type is not trivial");
-
-  hlfir::Entity lhs{assign.getLhs()};
-  if (!lhs.isArray())
-    return rewriter.notifyMatchFailure(assign,
-                                       "AssignOp's LHS is not an array");
-
-  mlir::Type lhsEleTy = lhs.getFortranElementType();
-  if (!fir::isa_trivial(lhsEleTy))
-    return rewriter.notifyMatchFailure(
-        assign, "AssignOp's LHS data type is not trivial");
-
-  if (lhsEleTy != rhsEleTy)
-    return rewriter.notifyMatchFailure(assign,
-                                       "RHS/LHS element types mismatch");
-
-  fir::AliasAnalysis aliasAnalysis;
-  mlir::AliasResult aliasRes = aliasAnalysis.alias(lhs, rhs);
-  // TODO: use areIdenticalOrDisjointSlices() to check if
-  // we can still do the expansion.
-  if (!aliasRes.isNo()) {
-    LLVM_DEBUG(llvm::dbgs() << "VariableAssignBufferization:\n"
-                            << "\tLHS: " << lhs << "\n"
-                            << "\tRHS: " << rhs << "\n"
-                            << "\tALIAS: " << aliasRes << "\n");
-    return rewriter.notifyMatchFailure(assign, "RHS/LHS may alias");
-  }
-
-  mlir::Location loc = assign->getLoc();
-  fir::FirOpBuilder builder(rewriter, assign.getOperation());
-  builder.setInsertionPoint(assign);
-  rhs = hlfir::derefPointersAndAllocatables(loc, builder, rhs);
-  lhs = hlfir::derefPointersAndAllocatables(loc, builder, lhs);
-  mlir::Value shape = hlfir::genShape(loc, builder, lhs);
-  llvm::SmallVector<mlir::Value> extents =
-      hlfir::getIndexExtents(loc, builder, shape);
-  hlfir::LoopNest loopNest =
-      hlfir::genLoopNest(loc, builder, extents, /*isUnordered=*/true,
-                         flangomp::shouldUseWorkshareLowering(assign));
-  builder.setInsertionPointToStart(loopNest.body);
-  auto rhsArrayElement =
-      hlfir::getElementAt(loc, builder, rhs, loopNest.oneBasedIndices);
-  rhsArrayElement = hlfir::loadTrivialScalar(loc, builder, rhsArrayElement);
-  auto lhsArrayElement =
-      hlfir::getElementAt(loc, builder, lhs, loopNest.oneBasedIndices);
-  builder.create<hlfir::AssignOp>(loc, rhsArrayElement, lhsArrayElement);
-  rewriter.eraseOp(assign);
-  return mlir::success();
-}
-
 using GenBodyFn =
     std::function<mlir::Value(fir::FirOpBuilder &, mlir::Location, mlir::Value,
                               const llvm::SmallVectorImpl<mlir::Value> &)>;
@@ -1206,9 +1178,9 @@ public:
         loc, resultArr, builder.createBool(loc, false));
 
     // Check all the users - the destroy is no longer required, and any assign
-    // can use resultArr directly so that VariableAssignBufferization in this
-    // pass can optimize the results. Other operations are replaces with an
-    // AsExpr for the temporary resultArr.
+    // can use resultArr directly so that InlineHLFIRAssign pass
+    // can optimize the results. Other operations are replaced with an AsExpr
+    // for the temporary resultArr.
     llvm::SmallVector<hlfir::DestroyOp> destroys;
     llvm::SmallVector<hlfir::AssignOp> assigns;
     for (auto user : mloc->getUsers()) {
@@ -1356,7 +1328,6 @@ public:
     // This requires small code reordering in ElementalAssignBufferization.
     patterns.insert<ElementalAssignBufferization>(context);
     patterns.insert<BroadcastAssignBufferization>(context);
-    patterns.insert<VariableAssignBufferization>(context);
     patterns.insert<EvaluateIntoMemoryAssignBufferization>(context);
     patterns.insert<ReductionConversion<hlfir::CountOp>>(context);
     patterns.insert<ReductionConversion<hlfir::AnyOp>>(context);
diff --git a/flang/lib/Optimizer/OpenMP/CMakeLists.txt b/flang/lib/Optimizer/OpenMP/CMakeLists.txt
index 4f23b2b..026889c 100644
--- a/flang/lib/Optimizer/OpenMP/CMakeLists.txt
+++ b/flang/lib/Optimizer/OpenMP/CMakeLists.txt
@@ -22,6 +22,7 @@ add_flang_library(FlangOpenMPTransforms
   FIRDialectSupport
   FIRSupport
   FortranCommon
+  FortranEvaluate
   MLIRFuncDialect
   MLIROpenMPDialect
   HLFIRDialect
diff --git a/flang/lib/Optimizer/OpenMP/MapInfoFinalization.cpp b/flang/lib/Optimizer/OpenMP/MapInfoFinalization.cpp
index ad7b806..e823443 100644
--- a/flang/lib/Optimizer/OpenMP/MapInfoFinalization.cpp
+++ b/flang/lib/Optimizer/OpenMP/MapInfoFinalization.cpp
@@ -162,13 +162,19 @@ class MapInfoFinalizationPass
     mlir::Value baseAddrAddr = builder.create<fir::BoxOffsetOp>(
         loc, descriptor, fir::BoxFieldAttr::base_addr);
 
+    mlir::Type underlyingVarType =
+        llvm::cast<mlir::omp::PointerLikeType>(
+            fir::unwrapRefType(baseAddrAddr.getType()))
+            .getElementType();
+    if (auto seqType = llvm::dyn_cast<fir::SequenceType>(underlyingVarType))
+      if (seqType.hasDynamicExtents())
+        underlyingVarType = seqType.getEleTy();
+
     // Member of the descriptor pointing at the allocated data
     return builder.create<mlir::omp::MapInfoOp>(
         loc, baseAddrAddr.getType(), descriptor,
-        mlir::TypeAttr::get(llvm::cast<mlir::omp::PointerLikeType>(
-                                fir::unwrapRefType(baseAddrAddr.getType()))
-                                .getElementType()),
-        baseAddrAddr, /*members=*/mlir::SmallVector<mlir::Value>{},
+        mlir::TypeAttr::get(underlyingVarType), baseAddrAddr,
+        /*members=*/mlir::SmallVector<mlir::Value>{},
         /*membersIndex=*/mlir::ArrayAttr{}, bounds,
         builder.getIntegerAttr(builder.getIntegerType(64, false), mapType),
         builder.getAttr<mlir::omp::VariableCaptureKindAttr>(
diff --git a/flang/lib/Optimizer/Passes/Pipelines.cpp b/flang/lib/Optimizer/Passes/Pipelines.cpp
index 72803aa..e1d7376 100644
--- a/flang/lib/Optimizer/Passes/Pipelines.cpp
+++ b/flang/lib/Optimizer/Passes/Pipelines.cpp
@@ -234,10 +234,22 @@ void createHLFIRToFIRPassPipeline(mlir::PassManager &pm, bool enableOpenMP,
     pm.addPass(mlir::createCSEPass());
     addNestedPassToAllTopLevelOperations<PassConstructor>(
         pm, hlfir::createOptimizedBufferization);
+    addNestedPassToAllTopLevelOperations<PassConstructor>(
+        pm, hlfir::createInlineHLFIRAssign);
   }
   pm.addPass(hlfir::createLowerHLFIROrderedAssignments());
   pm.addPass(hlfir::createLowerHLFIRIntrinsics());
   pm.addPass(hlfir::createBufferizeHLFIR());
+  // Run hlfir.assign inlining again after BufferizeHLFIR,
+  // because the latter may introduce new hlfir.assign operations,
+  // e.g. for copying an array into a temporary due to
+  // hlfir.associate.
+  // TODO: we can remove the previous InlineHLFIRAssign, when
+  // FIR AliasAnalysis is good enough to say that a temporary
+  // array does not alias with any user object.
+  if (optLevel.isOptimizingForSpeed())
+    addNestedPassToAllTopLevelOperations<PassConstructor>(
+        pm, hlfir::createInlineHLFIRAssign);
   pm.addPass(hlfir::createConvertHLFIRtoFIR());
   if (enableOpenMP)
     pm.addPass(flangomp::createLowerWorkshare());
diff --git a/flang/lib/Optimizer/Transforms/AddDebugInfo.cpp b/flang/lib/Optimizer/Transforms/AddDebugInfo.cpp
index 3a437c7..a8e9d19 100644
--- a/flang/lib/Optimizer/Transforms/AddDebugInfo.cpp
+++ b/flang/lib/Optimizer/Transforms/AddDebugInfo.cpp
@@ -121,9 +121,9 @@ void AddDebugInfoPass::handleDeclareOp(fir::cg::XDeclareOp declOp,
   // constant attribute of [hl]fir.declare/fircg.ext_declare operation that has
   // a dummy_scope operand).
   unsigned argNo = 0;
-  if (fir::isDummyArgument(declOp.getMemref())) {
-    auto arg = llvm::cast<mlir::BlockArgument>(declOp.getMemref());
-    argNo = arg.getArgNumber() + 1;
+  if (declOp.getDummyScope()) {
+    if (auto arg = llvm::dyn_cast<mlir::BlockArgument>(declOp.getMemref()))
+      argNo = arg.getArgNumber() + 1;
   }
 
   auto tyAttr = typeGen.convertType(fir::unwrapRefType(declOp.getType()),
diff --git a/flang/lib/Optimizer/Transforms/CMakeLists.txt b/flang/lib/Optimizer/Transforms/CMakeLists.txt
index 9eafa4e..d20d3bc 100644
--- a/flang/lib/Optimizer/Transforms/CMakeLists.txt
+++ b/flang/lib/Optimizer/Transforms/CMakeLists.txt
@@ -9,7 +9,6 @@ add_flang_library(FIRTransforms
   CompilerGeneratedNames.cpp
   ConstantArgumentGlobalisation.cpp
   ControlFlowConverter.cpp
-  CUFCommon.cpp
   CUFAddConstructor.cpp
   CUFDeviceGlobal.cpp
   CUFOpConversion.cpp
diff --git a/flang/lib/Optimizer/Transforms/CUFAddConstructor.cpp b/flang/lib/Optimizer/Transforms/CUFAddConstructor.cpp
index 9591f48..97551595 100644
--- a/flang/lib/Optimizer/Transforms/CUFAddConstructor.cpp
+++ b/flang/lib/Optimizer/Transforms/CUFAddConstructor.cpp
@@ -7,6 +7,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "flang/Optimizer/Builder/BoxValue.h"
+#include "flang/Optimizer/Builder/CUFCommon.h"
 #include "flang/Optimizer/Builder/FIRBuilder.h"
 #include "flang/Optimizer/Builder/Runtime/RTBuilder.h"
 #include "flang/Optimizer/Builder/Todo.h"
@@ -19,7 +20,6 @@
 #include "flang/Optimizer/Dialect/FIROpsSupport.h"
 #include "flang/Optimizer/Dialect/FIRType.h"
 #include "flang/Optimizer/Support/DataLayout.h"
-#include "flang/Optimizer/Transforms/CUFCommon.h"
 #include "flang/Runtime/CUDA/registration.h"
 #include "flang/Runtime/entry-names.h"
 #include "mlir/Dialect/GPU/IR/GPUDialect.h"
diff --git a/flang/lib/Optimizer/Transforms/CUFDeviceGlobal.cpp b/flang/lib/Optimizer/Transforms/CUFDeviceGlobal.cpp
index 07cc1f3..2e6c272 100644
--- a/flang/lib/Optimizer/Transforms/CUFDeviceGlobal.cpp
+++ b/flang/lib/Optimizer/Transforms/CUFDeviceGlobal.cpp
@@ -7,12 +7,12 @@
 //===----------------------------------------------------------------------===//
 
 #include "flang/Common/Fortran.h"
+#include "flang/Optimizer/Builder/CUFCommon.h"
 #include "flang/Optimizer/Dialect/CUF/CUFOps.h"
 #include "flang/Optimizer/Dialect/FIRDialect.h"
 #include "flang/Optimizer/Dialect/FIROps.h"
 #include "flang/Optimizer/HLFIR/HLFIROps.h"
 #include "flang/Optimizer/Support/InternalNames.h"
-#include "flang/Optimizer/Transforms/CUFCommon.h"
 #include "flang/Runtime/CUDA/common.h"
 #include "flang/Runtime/allocatable.h"
 #include "mlir/Dialect/LLVMIR/NVVMDialect.h"
diff --git a/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp b/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp
index de5c515..8c525fc 100644
--- a/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp
+++ b/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp
@@ -8,6 +8,7 @@
 
 #include "flang/Optimizer/Transforms/CUFOpConversion.h"
 #include "flang/Common/Fortran.h"
+#include "flang/Optimizer/Builder/CUFCommon.h"
 #include "flang/Optimizer/Builder/Runtime/RTBuilder.h"
 #include "flang/Optimizer/CodeGen/TypeConverter.h"
 #include "flang/Optimizer/Dialect/CUF/CUFOps.h"
@@ -15,7 +16,6 @@
 #include "flang/Optimizer/Dialect/FIROps.h"
 #include "flang/Optimizer/HLFIR/HLFIROps.h"
 #include "flang/Optimizer/Support/DataLayout.h"
-#include "flang/Optimizer/Transforms/CUFCommon.h"
 #include "flang/Runtime/CUDA/allocatable.h"
 #include "flang/Runtime/CUDA/common.h"
 #include "flang/Runtime/CUDA/descriptor.h"
@@ -788,6 +788,38 @@ private:
   const mlir::SymbolTable &symTab;
 };
 
+struct CUFSyncDescriptorOpConversion
+    : public mlir::OpRewritePattern<cuf::SyncDescriptorOp> {
+  using OpRewritePattern::OpRewritePattern;
+
+  mlir::LogicalResult
+  matchAndRewrite(cuf::SyncDescriptorOp op,
+                  mlir::PatternRewriter &rewriter) const override {
+    auto mod = op->getParentOfType<mlir::ModuleOp>();
+    fir::FirOpBuilder builder(rewriter, mod);
+    mlir::Location loc = op.getLoc();
+
+    auto globalOp = mod.lookupSymbol<fir::GlobalOp>(op.getGlobalName());
+    if (!globalOp)
+      return mlir::failure();
+
+    auto hostAddr = builder.create<fir::AddrOfOp>(
+        loc, fir::ReferenceType::get(globalOp.getType()), op.getGlobalName());
+    mlir::func::FuncOp callee =
+        fir::runtime::getRuntimeFunc<mkRTKey(CUFSyncGlobalDescriptor)>(loc,
+                                                                       builder);
+    auto fTy = callee.getFunctionType();
+    mlir::Value sourceFile = fir::factory::locationToFilename(builder, loc);
+    mlir::Value sourceLine =
+        fir::factory::locationToLineNo(builder, loc, fTy.getInput(2));
+    llvm::SmallVector<mlir::Value> args{fir::runtime::createArguments(
+        builder, loc, fTy, hostAddr, sourceFile, sourceLine)};
+    builder.create<fir::CallOp>(loc, callee, args);
+    op.erase();
+    return mlir::success();
+  }
+};
+
 class CUFOpConversion : public fir::impl::CUFOpConversionBase<CUFOpConversion> {
 public:
   void runOnOperation() override {
@@ -848,7 +880,8 @@ void cuf::populateCUFToFIRConversionPatterns(
     const mlir::SymbolTable &symtab, mlir::RewritePatternSet &patterns) {
   patterns.insert<CUFAllocOpConversion>(patterns.getContext(), &dl, &converter);
   patterns.insert<CUFAllocateOpConversion, CUFDeallocateOpConversion,
-                  CUFFreeOpConversion>(patterns.getContext());
+                  CUFFreeOpConversion, CUFSyncDescriptorOpConversion>(
+      patterns.getContext());
   patterns.insert<CUFDataTransferOpConversion>(patterns.getContext(), symtab,
                                                &dl, &converter);
   patterns.insert<CUFLaunchOpConversion>(patterns.getContext(), symtab);
diff --git a/flang/lib/Optimizer/Transforms/DebugTypeGenerator.cpp b/flang/lib/Optimizer/Transforms/DebugTypeGenerator.cpp
index cc99698..8ae3d31 100644
--- a/flang/lib/Optimizer/Transforms/DebugTypeGenerator.cpp
+++ b/flang/lib/Optimizer/Transforms/DebugTypeGenerator.cpp
@@ -325,7 +325,7 @@ static bool canCacheThisType(mlir::LLVM::DICompositeTypeAttr comTy) {
 std::pair<std::uint64_t, unsigned short>
 DebugTypeGenerator::getFieldSizeAndAlign(mlir::Type fieldTy) {
   mlir::Type llvmTy;
-  if (auto boxTy = mlir::dyn_cast_or_null<fir::BaseBoxType>(fieldTy))
+  if (auto boxTy = mlir::dyn_cast_if_present<fir::BaseBoxType>(fieldTy))
     llvmTy = llvmTypeConverter.convertBoxTypeAsStruct(boxTy, getBoxRank(boxTy));
   else
     llvmTy = llvmTypeConverter.convertType(fieldTy);
@@ -371,7 +371,7 @@ mlir::LLVM::DITypeAttr DebugTypeGenerator::convertRecordType(
     std::optional<llvm::ArrayRef<int64_t>> lowerBounds =
         fir::getComponentLowerBoundsIfNonDefault(Ty, fieldName, module,
                                                  symbolTable);
-    auto seqTy = mlir::dyn_cast_or_null<fir::SequenceType>(fieldTy);
+    auto seqTy = mlir::dyn_cast_if_present<fir::SequenceType>(fieldTy);
 
     // For members of the derived types, the information about the shift in
     // lower bounds is not part of the declOp but has to be extracted from the
@@ -622,10 +622,10 @@ mlir::LLVM::DITypeAttr DebugTypeGenerator::convertPointerLikeType(
   // Arrays and character need different treatment because DWARF have special
   // constructs for them to get the location from the descriptor. Rest of
   // types are handled like pointer to underlying type.
-  if (auto seqTy = mlir::dyn_cast_or_null<fir::SequenceType>(elTy))
+  if (auto seqTy = mlir::dyn_cast_if_present<fir::SequenceType>(elTy))
     return convertBoxedSequenceType(seqTy, fileAttr, scope, declOp,
                                     genAllocated, genAssociated);
-  if (auto charTy = mlir::dyn_cast_or_null<fir::CharacterType>(elTy))
+  if (auto charTy = mlir::dyn_cast_if_present<fir::CharacterType>(elTy))
     return convertCharacterType(charTy, fileAttr, scope, declOp,
                                 /*hasDescriptor=*/true);
 
@@ -638,7 +638,7 @@ mlir::LLVM::DITypeAttr DebugTypeGenerator::convertPointerLikeType(
 
   return mlir::LLVM::DIDerivedTypeAttr::get(
       context, llvm::dwarf::DW_TAG_pointer_type,
-      mlir::StringAttr::get(context, ""), elTyAttr, ptrSize,
+      mlir::StringAttr::get(context, ""), elTyAttr, /*sizeInBits=*/ptrSize * 8,
       /*alignInBits=*/0, /*offset=*/0,
       /*optional<address space>=*/std::nullopt, /*extra data=*/nullptr);
 }
@@ -654,22 +654,22 @@ DebugTypeGenerator::convertType(mlir::Type Ty, mlir::LLVM::DIFileAttr fileAttr,
   } else if (mlir::isa<mlir::FloatType>(Ty)) {
     return genBasicType(context, mlir::StringAttr::get(context, "real"),
                         Ty.getIntOrFloatBitWidth(), llvm::dwarf::DW_ATE_float);
-  } else if (auto logTy = mlir::dyn_cast_or_null<fir::LogicalType>(Ty)) {
+  } else if (auto logTy = mlir::dyn_cast_if_present<fir::LogicalType>(Ty)) {
     return genBasicType(context,
                         mlir::StringAttr::get(context, logTy.getMnemonic()),
                         kindMapping.getLogicalBitsize(logTy.getFKind()),
                         llvm::dwarf::DW_ATE_boolean);
-  } else if (auto cplxTy = mlir::dyn_cast_or_null<mlir::ComplexType>(Ty)) {
+  } else if (auto cplxTy = mlir::dyn_cast_if_present<mlir::ComplexType>(Ty)) {
     auto floatTy = mlir::cast<mlir::FloatType>(cplxTy.getElementType());
     unsigned bitWidth = floatTy.getWidth();
     return genBasicType(context, mlir::StringAttr::get(context, "complex"),
                         bitWidth * 2, llvm::dwarf::DW_ATE_complex_float);
-  } else if (auto seqTy = mlir::dyn_cast_or_null<fir::SequenceType>(Ty)) {
+  } else if (auto seqTy = mlir::dyn_cast_if_present<fir::SequenceType>(Ty)) {
     return convertSequenceType(seqTy, fileAttr, scope, declOp);
-  } else if (auto charTy = mlir::dyn_cast_or_null<fir::CharacterType>(Ty)) {
+  } else if (auto charTy = mlir::dyn_cast_if_present<fir::CharacterType>(Ty)) {
     return convertCharacterType(charTy, fileAttr, scope, declOp,
                                 /*hasDescriptor=*/false);
-  } else if (auto recTy = mlir::dyn_cast_or_null<fir::RecordType>(Ty)) {
+  } else if (auto recTy = mlir::dyn_cast_if_present<fir::RecordType>(Ty)) {
     return convertRecordType(recTy, fileAttr, scope, declOp);
   } else if (auto tupleTy = mlir::dyn_cast_if_present<mlir::TupleType>(Ty)) {
     return convertTupleType(tupleTy, fileAttr, scope, declOp);
@@ -678,22 +678,22 @@ DebugTypeGenerator::convertType(mlir::Type Ty, mlir::LLVM::DIFileAttr fileAttr,
     return convertPointerLikeType(elTy, fileAttr, scope, declOp,
                                   /*genAllocated=*/false,
                                   /*genAssociated=*/false);
-  } else if (auto vecTy = mlir::dyn_cast_or_null<fir::VectorType>(Ty)) {
+  } else if (auto vecTy = mlir::dyn_cast_if_present<fir::VectorType>(Ty)) {
     return convertVectorType(vecTy, fileAttr, scope, declOp);
   } else if (mlir::isa<mlir::IndexType>(Ty)) {
     return genBasicType(context, mlir::StringAttr::get(context, "integer"),
                         llvmTypeConverter.getIndexTypeBitwidth(),
                         llvm::dwarf::DW_ATE_signed);
-  } else if (auto boxTy = mlir::dyn_cast_or_null<fir::BaseBoxType>(Ty)) {
+  } else if (auto boxTy = mlir::dyn_cast_if_present<fir::BaseBoxType>(Ty)) {
     auto elTy = boxTy.getEleTy();
-    if (auto seqTy = mlir::dyn_cast_or_null<fir::SequenceType>(elTy))
+    if (auto seqTy = mlir::dyn_cast_if_present<fir::SequenceType>(elTy))
       return convertBoxedSequenceType(seqTy, fileAttr, scope, declOp, false,
                                       false);
-    if (auto heapTy = mlir::dyn_cast_or_null<fir::HeapType>(elTy))
+    if (auto heapTy = mlir::dyn_cast_if_present<fir::HeapType>(elTy))
       return convertPointerLikeType(heapTy.getElementType(), fileAttr, scope,
                                     declOp, /*genAllocated=*/true,
                                     /*genAssociated=*/false);
-    if (auto ptrTy = mlir::dyn_cast_or_null<fir::PointerType>(elTy))
+    if (auto ptrTy = mlir::dyn_cast_if_present<fir::PointerType>(elTy))
       return convertPointerLikeType(ptrTy.getElementType(), fileAttr, scope,
                                     declOp, /*genAllocated=*/false,
                                     /*genAssociated=*/true);
diff --git a/flang/lib/Optimizer/Transforms/SimplifyIntrinsics.cpp b/flang/lib/Optimizer/Transforms/SimplifyIntrinsics.cpp
index d3567f4..fa6a7b2 100644
--- a/flang/lib/Optimizer/Transforms/SimplifyIntrinsics.cpp
+++ b/flang/lib/Optimizer/Transforms/SimplifyIntrinsics.cpp
@@ -24,6 +24,7 @@
 
 #include "flang/Common/Fortran.h"
 #include "flang/Optimizer/Builder/BoxValue.h"
+#include "flang/Optimizer/Builder/CUFCommon.h"
 #include "flang/Optimizer/Builder/FIRBuilder.h"
 #include "flang/Optimizer/Builder/LowLevelIntrinsics.h"
 #include "flang/Optimizer/Builder/Todo.h"
@@ -31,7 +32,6 @@
 #include "flang/Optimizer/Dialect/FIRType.h"
 #include "flang/Optimizer/Dialect/Support/FIRContext.h"
 #include "flang/Optimizer/HLFIR/HLFIRDialect.h"
-#include "flang/Optimizer/Transforms/CUFCommon.h"
 #include "flang/Optimizer/Transforms/Passes.h"
 #include "flang/Optimizer/Transforms/Utils.h"
 #include "flang/Runtime/entry-names.h"
diff --git a/flang/lib/Optimizer/Transforms/StackArrays.cpp b/flang/lib/Optimizer/Transforms/StackArrays.cpp
index bdcb819..2a9d339 100644
--- a/flang/lib/Optimizer/Transforms/StackArrays.cpp
+++ b/flang/lib/Optimizer/Transforms/StackArrays.cpp
@@ -330,6 +330,18 @@ std::optional<AllocationState> LatticePoint::get(mlir::Value val) const {
   return it->second;
 }
 
+static mlir::Value lookThroughDeclaresAndConverts(mlir::Value value) {
+  while (mlir::Operation *op = value.getDefiningOp()) {
+    if (auto declareOp = llvm::dyn_cast<fir::DeclareOp>(op))
+      value = declareOp.getMemref();
+    else if (auto convertOp = llvm::dyn_cast<fir::ConvertOp>(op))
+      value = convertOp->getOperand(0);
+    else
+      return value;
+  }
+  return value;
+}
+
 mlir::LogicalResult AllocationAnalysis::visitOperation(
     mlir::Operation *op, const LatticePoint &before, LatticePoint *after) {
   LLVM_DEBUG(llvm::dbgs() << "StackArrays: Visiting operation: " << *op
@@ -363,10 +375,10 @@ mlir::LogicalResult AllocationAnalysis::visitOperation(
     mlir::Value operand = op->getOperand(0);
 
     // Note: StackArrays is scheduled in the pass pipeline after lowering hlfir
-    // to fir. Therefore, we only need to handle `fir::DeclareOp`s.
-    if (auto declareOp =
-            llvm::dyn_cast_if_present<fir::DeclareOp>(operand.getDefiningOp()))
-      operand = declareOp.getMemref();
+    // to fir. Therefore, we only need to handle `fir::DeclareOp`s. Also look
+    // past converts in case the pointer was changed between different pointer
+    // types.
+    operand = lookThroughDeclaresAndConverts(operand);
 
     std::optional<AllocationState> operandState = before.get(operand);
     if (operandState && *operandState == AllocationState::Allocated) {
@@ -535,17 +547,12 @@ AllocMemConversion::matchAndRewrite(fir::AllocMemOp allocmem,
 
   // remove freemem operations
   llvm::SmallVector<mlir::Operation *> erases;
-  for (mlir::Operation *user : allocmem.getOperation()->getUsers()) {
-    if (auto declareOp = mlir::dyn_cast_if_present<fir::DeclareOp>(user)) {
-      for (mlir::Operation *user : declareOp->getUsers()) {
-        if (mlir::isa<fir::FreeMemOp>(user))
-          erases.push_back(user);
-      }
-    }
-
-    if (mlir::isa<fir::FreeMemOp>(user))
-      erases.push_back(user);
-  }
+  mlir::Operation *parent = allocmem->getParentOp();
+  // TODO: this shouldn't need to be re-calculated for every allocmem
+  parent->walk([&](fir::FreeMemOp freeOp) {
+    if (lookThroughDeclaresAndConverts(freeOp->getOperand(0)) == allocmem)
+      erases.push_back(freeOp);
+  });
 
   // now we are done iterating the users, it is safe to mutate them
   for (mlir::Operation *erase : erases)
diff --git a/flang/lib/Parser/openmp-parsers.cpp b/flang/lib/Parser/openmp-parsers.cpp
index 67385c0..894c458 100644
--- a/flang/lib/Parser/openmp-parsers.cpp
+++ b/flang/lib/Parser/openmp-parsers.cpp
@@ -567,6 +567,8 @@ TYPE_PARSER(construct<OmpBindClause>(
     "TEAMS" >> pure(OmpBindClause::Binding::Teams) ||
     "THREAD" >> pure(OmpBindClause::Binding::Thread)))
 
+TYPE_PARSER(construct<OmpAlignClause>(scalarIntExpr))
+
 TYPE_PARSER(construct<OmpAtClause>(
     "EXECUTION" >> pure(OmpAtClause::ActionTime::Execution) ||
     "COMPILATION" >> pure(OmpAtClause::ActionTime::Compilation)))
@@ -582,6 +584,8 @@ TYPE_PARSER(
     "ACQ_REL" >> construct<OmpClause>(construct<OmpClause::AcqRel>()) ||
     "AFFINITY" >> construct<OmpClause>(construct<OmpClause::Affinity>(
                       parenthesized(Parser<OmpAffinityClause>{}))) ||
+    "ALIGN" >> construct<OmpClause>(construct<OmpClause::Align>(
+                   parenthesized(Parser<OmpAlignClause>{}))) ||
     "ALIGNED" >> construct<OmpClause>(construct<OmpClause::Aligned>(
                      parenthesized(Parser<OmpAlignedClause>{}))) ||
     "ALLOCATE" >> construct<OmpClause>(construct<OmpClause::Allocate>(
@@ -737,9 +741,20 @@ TYPE_PARSER(
 TYPE_PARSER(sourced(construct<OmpClauseList>(
     many(maybe(","_tok) >> sourced(Parser<OmpClause>{})))))
 
-// 2.1 (variable | /common-block | array-sections)
+// 2.1 (variable | /common-block/ | array-sections)
 TYPE_PARSER(construct<OmpObjectList>(nonemptyList(Parser<OmpObject>{})))
 
+TYPE_PARSER(sourced(construct<OmpErrorDirective>(
+    verbatim("ERROR"_tok), Parser<OmpClauseList>{})))
+
+TYPE_PARSER(sourced(construct<OmpNothingDirective>("NOTHING" >> ok)))
+
+TYPE_PARSER(sourced(construct<OpenMPUtilityConstruct>(
+    sourced(construct<OpenMPUtilityConstruct>(
+        sourced(Parser<OmpErrorDirective>{}))) ||
+    sourced(construct<OpenMPUtilityConstruct>(
+        sourced(Parser<OmpNothingDirective>{}))))))
+
 // Omp directives enclosing do loop
 TYPE_PARSER(sourced(construct<OmpLoopDirective>(first(
     "DISTRIBUTE PARALLEL DO SIMD" >>
@@ -1027,9 +1042,6 @@ TYPE_PARSER(sourced(construct<OmpCriticalDirective>(verbatim("CRITICAL"_tok),
 TYPE_PARSER(construct<OpenMPCriticalConstruct>(
     Parser<OmpCriticalDirective>{}, block, Parser<OmpEndCriticalDirective>{}))
 
-TYPE_PARSER(sourced(construct<OpenMPErrorConstruct>(
-    verbatim("ERROR"_tok), Parser<OmpClauseList>{})))
-
 // 2.11.3 Executable Allocate directive
 TYPE_PARSER(
     sourced(construct<OpenMPExecutableAllocate>(verbatim("ALLOCATE"_tok),
@@ -1082,7 +1094,9 @@ TYPE_PARSER(startOmpLine >>
             construct<OpenMPDeclarativeConstruct>(
                 Parser<OpenMPRequiresConstruct>{}) ||
             construct<OpenMPDeclarativeConstruct>(
-                Parser<OpenMPThreadprivate>{})) /
+                Parser<OpenMPThreadprivate>{}) ||
+            construct<OpenMPDeclarativeConstruct>(
+                Parser<OpenMPUtilityConstruct>{})) /
             endOmpLine))
 
 // Block Construct
@@ -1127,7 +1141,7 @@ TYPE_CONTEXT_PARSER("OpenMP construct"_en_US,
                 // OpenMPStandaloneConstruct to resolve !$OMP ORDERED
                 construct<OpenMPConstruct>(Parser<OpenMPStandaloneConstruct>{}),
                 construct<OpenMPConstruct>(Parser<OpenMPAtomicConstruct>{}),
-                construct<OpenMPConstruct>(Parser<OpenMPErrorConstruct>{}),
+                construct<OpenMPConstruct>(Parser<OpenMPUtilityConstruct>{}),
                 construct<OpenMPConstruct>(Parser<OpenMPExecutableAllocate>{}),
                 construct<OpenMPConstruct>(Parser<OpenMPAllocatorsConstruct>{}),
                 construct<OpenMPConstruct>(Parser<OpenMPDeclarativeAllocate>{}),
diff --git a/flang/lib/Parser/prescan.cpp b/flang/lib/Parser/prescan.cpp
index 3cd32d7..703a0279 100644
--- a/flang/lib/Parser/prescan.cpp
+++ b/flang/lib/Parser/prescan.cpp
@@ -709,9 +709,22 @@ bool Prescanner::NextToken(TokenSequence &tokens) {
       QuotedCharacterLiteral(tokens, start);
     } else if (IsLetter(*at_) && !preventHollerith_ &&
         parenthesisNesting_ > 0) {
-      // Handles FORMAT(3I9HHOLLERITH) by skipping over the first I so that
-      // we don't misrecognize I9HOLLERITH as an identifier in the next case.
-      EmitCharAndAdvance(tokens, *at_);
+      const char *p{at_};
+      int digits{0};
+      for (;; ++digits) {
+        ++p;
+        if (InFixedFormSource()) {
+          p = SkipWhiteSpace(p);
+        }
+        if (!IsDecimalDigit(*p)) {
+          break;
+        }
+      }
+      if (digits > 0 && (*p == 'h' || *p == 'H')) {
+        // Handles FORMAT(3I9HHOLLERITH) by skipping over the first I so that
+        // we don't misrecognize I9HOLLERITH as an identifier in the next case.
+        EmitCharAndAdvance(tokens, *at_);
+      }
     }
     preventHollerith_ = false;
   } else if (*at_ == '.') {
@@ -1289,14 +1302,18 @@ const char *Prescanner::FreeFormContinuationLine(bool ampersand) {
     return nullptr;
   }
   p = SkipWhiteSpace(p);
-  if (InCompilerDirective()) {
-    if (*p++ != '!') {
-      return nullptr;
-    }
-    for (const char *s{directiveSentinel_}; *s != '\0'; ++p, ++s) {
-      if (*s != ToLowerCaseLetter(*p)) {
-        return nullptr;
+  if (*p == '!') {
+    ++p;
+    if (InCompilerDirective()) {
+      for (const char *s{directiveSentinel_}; *s != '\0'; ++p, ++s) {
+        if (*s != ToLowerCaseLetter(*p)) {
+          return nullptr;
+        }
       }
+    } else if (features_.IsEnabled(LanguageFeature::OpenMP) && *p == '$') {
+      ++p;
+    } else {
+      return nullptr;
     }
     p = SkipWhiteSpace(p);
     if (*p == '&') {
diff --git a/flang/lib/Parser/unparse.cpp b/flang/lib/Parser/unparse.cpp
index 0a6af74..5882047 100644
--- a/flang/lib/Parser/unparse.cpp
+++ b/flang/lib/Parser/unparse.cpp
@@ -2631,90 +2631,77 @@ public:
     }
   }
   void Unparse(const OpenMPDeclareReductionConstruct &x) {
+    BeginOpenMP();
+    Word("!$OMP DECLARE REDUCTION ");
     Put("(");
     Walk(std::get<OmpReductionIdentifier>(x.t)), Put(" : ");
     Walk(std::get<std::list<DeclarationTypeSpec>>(x.t), ","), Put(" : ");
     Walk(std::get<OmpReductionCombiner>(x.t));
     Put(")");
     Walk(std::get<std::optional<OmpReductionInitializerClause>>(x.t));
+    EndOpenMP();
   }
-  bool Pre(const OpenMPDeclarativeConstruct &x) {
+
+  void Unparse(const OpenMPDeclareMapperConstruct &z) {
     BeginOpenMP();
-    Word("!$OMP ");
-    return common::visit(
-        common::visitors{
-            [&](const OpenMPDeclarativeAllocate &z) {
-              Word("ALLOCATE (");
-              Walk(std::get<OmpObjectList>(z.t));
-              Put(")");
-              Walk(std::get<OmpClauseList>(z.t));
-              Put("\n");
-              EndOpenMP();
-              return false;
-            },
-            [&](const OpenMPDeclareMapperConstruct &z) {
-              Word("DECLARE MAPPER (");
-              const auto &spec{std::get<OmpDeclareMapperSpecifier>(z.t)};
-              if (auto mapname{std::get<std::optional<Name>>(spec.t)}) {
-                Walk(mapname);
-                Put(":");
-              }
-              Walk(std::get<TypeSpec>(spec.t));
-              Put("::");
-              Walk(std::get<Name>(spec.t));
-              Put(")");
+    Word("!$OMP DECLARE MAPPER (");
+    const auto &spec{std::get<OmpDeclareMapperSpecifier>(z.t)};
+    if (auto mapname{std::get<std::optional<Name>>(spec.t)}) {
+      Walk(mapname);
+      Put(":");
+    }
+    Walk(std::get<TypeSpec>(spec.t));
+    Put("::");
+    Walk(std::get<Name>(spec.t));
+    Put(")");
 
-              Walk(std::get<OmpClauseList>(z.t));
-              Put("\n");
-              return false;
-            },
-            [&](const OpenMPDeclareReductionConstruct &) {
-              Word("DECLARE REDUCTION ");
-              return true;
-            },
-            [&](const OpenMPDeclareSimdConstruct &y) {
-              Word("DECLARE SIMD ");
-              Walk("(", std::get<std::optional<Name>>(y.t), ")");
-              Walk(std::get<OmpClauseList>(y.t));
-              Put("\n");
-              EndOpenMP();
-              return false;
-            },
-            [&](const OpenMPDeclareTargetConstruct &) {
-              Word("DECLARE TARGET ");
-              return true;
-            },
-            [&](const OpenMPRequiresConstruct &y) {
-              Word("REQUIRES ");
-              Walk(std::get<OmpClauseList>(y.t));
-              Put("\n");
-              EndOpenMP();
-              return false;
-            },
-            [&](const OpenMPThreadprivate &) {
-              Word("THREADPRIVATE (");
-              return true;
-            },
-        },
-        x.u);
+    Walk(std::get<OmpClauseList>(z.t));
+    Put("\n");
+    EndOpenMP();
+  }
+  void Unparse(const OpenMPDeclareSimdConstruct &y) {
+    BeginOpenMP();
+    Word("!$OMP DECLARE SIMD ");
+    Walk("(", std::get<std::optional<Name>>(y.t), ")");
+    Walk(std::get<OmpClauseList>(y.t));
+    Put("\n");
+    EndOpenMP();
   }
-  void Post(const OpenMPDeclarativeConstruct &) {
+  void Unparse(const OpenMPDeclareTargetConstruct &x) {
+    BeginOpenMP();
+    Word("!$OMP DECLARE TARGET ");
+    Walk(std::get<parser::OmpDeclareTargetSpecifier>(x.t));
     Put("\n");
     EndOpenMP();
   }
-  void Post(const OpenMPThreadprivate &) {
+  void Unparse(const OpenMPRequiresConstruct &y) {
+    BeginOpenMP();
+    Word("!$OMP REQUIRES ");
+    Walk(std::get<OmpClauseList>(y.t));
+    Put("\n");
+    EndOpenMP();
+  }
+  void Unparse(const OpenMPThreadprivate &x) {
+    BeginOpenMP();
+    Word("!$OMP THREADPRIVATE (");
+    Walk(std::get<parser::OmpObjectList>(x.t));
     Put(")\n");
     EndOpenMP();
   }
+
   bool Pre(const OmpMessageClause &x) {
     Walk(x.v);
     return false;
   }
-  void Unparse(const OpenMPErrorConstruct &x) {
+  void Unparse(const OmpErrorDirective &x) {
     Word("!$OMP ERROR ");
     Walk(x.t);
     Put("\n");
   }
+  void Unparse(const OmpNothingDirective &x) {
+    Word("!$OMP NOTHING");
+    Put("\n");
+  }
   void Unparse(const OmpSectionsDirective &x) {
     switch (x.v) {
     case llvm::omp::Directive::OMPD_sections:
diff --git a/flang/lib/Semantics/assignment.cpp b/flang/lib/Semantics/assignment.cpp
index e69a73c..0b57197 100644
--- a/flang/lib/Semantics/assignment.cpp
+++ b/flang/lib/Semantics/assignment.cpp
@@ -66,8 +66,13 @@ void AssignmentContext::Analyze(const parser::AssignmentStmt &stmt) {
     const SomeExpr &rhs{assignment->rhs};
     auto lhsLoc{std::get<parser::Variable>(stmt.t).GetSource()};
     const Scope &scope{context_.FindScope(lhsLoc)};
-    if (auto whyNot{WhyNotDefinable(lhsLoc, scope,
-            DefinabilityFlags{DefinabilityFlag::VectorSubscriptIsOk}, lhs)}) {
+    DefinabilityFlags flags{DefinabilityFlag::VectorSubscriptIsOk};
+    bool isDefinedAssignment{
+        std::holds_alternative<evaluate::ProcedureRef>(assignment->u)};
+    if (isDefinedAssignment) {
+      flags.set(DefinabilityFlag::AllowEventLockOrNotifyType);
+    }
+    if (auto whyNot{WhyNotDefinable(lhsLoc, scope, flags, lhs)}) {
       if (whyNot->IsFatal()) {
         if (auto *msg{Say(lhsLoc,
                 "Left-hand side of assignment is not definable"_err_en_US)}) {
@@ -79,9 +84,7 @@ void AssignmentContext::Analyze(const parser::AssignmentStmt &stmt) {
       }
     }
     auto rhsLoc{std::get<parser::Expr>(stmt.t).source};
-    if (std::holds_alternative<evaluate::ProcedureRef>(assignment->u)) {
-      // it's a defined ASSIGNMENT(=)
-    } else {
+    if (!isDefinedAssignment) {
       CheckForPureContext(rhs, rhsLoc);
     }
     if (whereDepth_ > 0) {
diff --git a/flang/lib/Semantics/canonicalize-omp.cpp b/flang/lib/Semantics/canonicalize-omp.cpp
index 0481b3d..5164f1d 100644
--- a/flang/lib/Semantics/canonicalize-omp.cpp
+++ b/flang/lib/Semantics/canonicalize-omp.cpp
@@ -50,6 +50,43 @@ public:
 
   void Post(parser::ExecutionPart &body) { RewriteOmpAllocations(body); }
 
+  // Pre-visit all constructs that have both a specification part and
+  // an execution part, and store the connection between the two.
+  bool Pre(parser::BlockConstruct &x) {
+    auto *spec = &std::get<parser::BlockSpecificationPart>(x.t).v;
+    auto *block = &std::get<parser::Block>(x.t);
+    blockForSpec_.insert(std::make_pair(spec, block));
+    return true;
+  }
+  bool Pre(parser::MainProgram &x) {
+    auto *spec = &std::get<parser::SpecificationPart>(x.t);
+    auto *block = &std::get<parser::ExecutionPart>(x.t).v;
+    blockForSpec_.insert(std::make_pair(spec, block));
+    return true;
+  }
+  bool Pre(parser::FunctionSubprogram &x) {
+    auto *spec = &std::get<parser::SpecificationPart>(x.t);
+    auto *block = &std::get<parser::ExecutionPart>(x.t).v;
+    blockForSpec_.insert(std::make_pair(spec, block));
+    return true;
+  }
+  bool Pre(parser::SubroutineSubprogram &x) {
+    auto *spec = &std::get<parser::SpecificationPart>(x.t);
+    auto *block = &std::get<parser::ExecutionPart>(x.t).v;
+    blockForSpec_.insert(std::make_pair(spec, block));
+    return true;
+  }
+  bool Pre(parser::SeparateModuleSubprogram &x) {
+    auto *spec = &std::get<parser::SpecificationPart>(x.t);
+    auto *block = &std::get<parser::ExecutionPart>(x.t).v;
+    blockForSpec_.insert(std::make_pair(spec, block));
+    return true;
+  }
+
+  void Post(parser::SpecificationPart &spec) {
+    CanonicalizeUtilityConstructs(spec);
+  }
+
 private:
   template <typename T> T *GetConstructIf(parser::ExecutionPartConstruct &x) {
     if (auto *y{std::get_if<parser::ExecutableConstruct>(&x.u)}) {
@@ -155,6 +192,131 @@ private:
     }
   }
 
+  // Canonicalization of utility constructs.
+  //
+  // This addresses the issue of utility constructs that appear at the
+  // boundary between the specification and the execution parts, e.g.
+  //   subroutine foo
+  //     integer :: x     ! Specification
+  //     !$omp nothing
+  //     x = 1            ! Execution
+  //     ...
+  //   end
+  //
+  // Utility constructs (error and nothing) can appear in both the
+  // specification part and the execution part, except "error at(execution)",
+  // which cannot be present in the specification part (whereas any utility
+  // construct can be in the execution part).
+  // When a utility construct is at the boundary, it should preferably be
+  // parsed as an element of the execution part, but since the specification
+  // part is parsed first, the utility construct ends up belonging to the
+  // specification part.
+  //
+  // To allow the likes of the following code to compile, move all utility
+  // construct that are at the end of the specification part to the beginning
+  // of the execution part.
+  //
+  // subroutine foo
+  //   !$omp error at(execution)  ! Initially parsed as declarative construct.
+  //                              ! Move it to the execution part.
+  // end
+
+  void CanonicalizeUtilityConstructs(parser::SpecificationPart &spec) {
+    auto found = blockForSpec_.find(&spec);
+    if (found == blockForSpec_.end()) {
+      // There is no corresponding execution part, so there is nothing to do.
+      return;
+    }
+    parser::Block &block = *found->second;
+
+    // There are two places where an OpenMP declarative construct can
+    // show up in the tuple in specification part:
+    // (1) in std::list<OpenMPDeclarativeConstruct>, or
+    // (2) in std::list<DeclarationConstruct>.
+    // The case (1) is only possible is the list (2) is empty.
+
+    auto &omps =
+        std::get<std::list<parser::OpenMPDeclarativeConstruct>>(spec.t);
+    auto &decls = std::get<std::list<parser::DeclarationConstruct>>(spec.t);
+
+    if (!decls.empty()) {
+      MoveUtilityConstructsFromDecls(decls, block);
+    } else {
+      MoveUtilityConstructsFromOmps(omps, block);
+    }
+  }
+
+  void MoveUtilityConstructsFromDecls(
+      std::list<parser::DeclarationConstruct> &decls, parser::Block &block) {
+    // Find the trailing range of DeclarationConstructs that are OpenMP
+    // utility construct, that are to be moved to the execution part.
+    std::list<parser::DeclarationConstruct>::reverse_iterator rlast = [&]() {
+      for (auto rit = decls.rbegin(), rend = decls.rend(); rit != rend; ++rit) {
+        parser::DeclarationConstruct &dc = *rit;
+        if (!std::holds_alternative<parser::SpecificationConstruct>(dc.u)) {
+          return rit;
+        }
+        auto &sc = std::get<parser::SpecificationConstruct>(dc.u);
+        using OpenMPDeclarativeConstruct =
+            common::Indirection<parser::OpenMPDeclarativeConstruct>;
+        if (!std::holds_alternative<OpenMPDeclarativeConstruct>(sc.u)) {
+          return rit;
+        }
+        // Got OpenMPDeclarativeConstruct. If it's not a utility construct
+        // then stop.
+        auto &odc = std::get<OpenMPDeclarativeConstruct>(sc.u).value();
+        if (!std::holds_alternative<parser::OpenMPUtilityConstruct>(odc.u)) {
+          return rit;
+        }
+      }
+      return decls.rend();
+    }();
+
+    std::transform(decls.rbegin(), rlast, std::front_inserter(block),
+        [](parser::DeclarationConstruct &dc) {
+          auto &sc = std::get<parser::SpecificationConstruct>(dc.u);
+          using OpenMPDeclarativeConstruct =
+              common::Indirection<parser::OpenMPDeclarativeConstruct>;
+          auto &oc = std::get<OpenMPDeclarativeConstruct>(sc.u).value();
+          auto &ut = std::get<parser::OpenMPUtilityConstruct>(oc.u);
+
+          return parser::ExecutionPartConstruct(parser::ExecutableConstruct(
+              common::Indirection(parser::OpenMPConstruct(std::move(ut)))));
+        });
+
+    decls.erase(rlast.base(), decls.end());
+  }
+
+  void MoveUtilityConstructsFromOmps(
+      std::list<parser::OpenMPDeclarativeConstruct> &omps,
+      parser::Block &block) {
+    using OpenMPDeclarativeConstruct = parser::OpenMPDeclarativeConstruct;
+    // Find the trailing range of OpenMPDeclarativeConstruct that are OpenMP
+    // utility construct, that are to be moved to the execution part.
+    std::list<OpenMPDeclarativeConstruct>::reverse_iterator rlast = [&]() {
+      for (auto rit = omps.rbegin(), rend = omps.rend(); rit != rend; ++rit) {
+        OpenMPDeclarativeConstruct &dc = *rit;
+        if (!std::holds_alternative<parser::OpenMPUtilityConstruct>(dc.u)) {
+          return rit;
+        }
+      }
+      return omps.rend();
+    }();
+
+    std::transform(omps.rbegin(), rlast, std::front_inserter(block),
+        [](parser::OpenMPDeclarativeConstruct &dc) {
+          auto &ut = std::get<parser::OpenMPUtilityConstruct>(dc.u);
+          return parser::ExecutionPartConstruct(parser::ExecutableConstruct(
+              common::Indirection(parser::OpenMPConstruct(std::move(ut)))));
+        });
+
+    omps.erase(rlast.base(), omps.end());
+  }
+
+  // Mapping from the specification parts to the blocks that follow in the
+  // same construct. This is for converting utility constructs to executable
+  // constructs.
+  std::map<parser::SpecificationPart *, parser::Block *> blockForSpec_;
   parser::Messages &messages_;
 };
 
diff --git a/flang/lib/Semantics/check-allocate.cpp b/flang/lib/Semantics/check-allocate.cpp
index 1e54123..223bee6 100644
--- a/flang/lib/Semantics/check-allocate.cpp
+++ b/flang/lib/Semantics/check-allocate.cpp
@@ -616,9 +616,11 @@ bool AllocationCheckerHelper::RunChecks(SemanticsContext &context) {
   }
   if (allocateInfo_.gotPinned) {
     std::optional<common::CUDADataAttr> cudaAttr{GetCUDADataAttr(ultimate_)};
-    if (!cudaAttr || *cudaAttr != common::CUDADataAttr::Pinned) {
+    if ((!cudaAttr || *cudaAttr != common::CUDADataAttr::Pinned) &&
+        context.languageFeatures().ShouldWarn(
+            common::UsageWarning::CUDAUsage)) {
       context.Say(name_.source,
-          "Object in ALLOCATE must have PINNED attribute when PINNED option is specified"_err_en_US);
+          "Object in ALLOCATE should have PINNED attribute when PINNED option is specified"_warn_en_US);
     }
   }
   if (allocateInfo_.gotStream) {
diff --git a/flang/lib/Semantics/check-call.cpp b/flang/lib/Semantics/check-call.cpp
index 597c280..ba68a0f 100644
--- a/flang/lib/Semantics/check-call.cpp
+++ b/flang/lib/Semantics/check-call.cpp
@@ -690,7 +690,8 @@ static void CheckExplicitDataArg(const characteristics::DummyDataObject &dummy,
     }
   }
   if (actualLastObject && actualLastObject->IsCoarray() &&
-      IsAllocatable(*actualLastSymbol) && dummy.intent == common::Intent::Out &&
+      dummy.attrs.test(characteristics::DummyDataObject::Attr::Allocatable) &&
+      dummy.intent == common::Intent::Out &&
       !(intrinsic &&
           evaluate::AcceptsIntentOutAllocatableCoarray(
               intrinsic->name))) { // C846
@@ -703,12 +704,14 @@ static void CheckExplicitDataArg(const characteristics::DummyDataObject &dummy,
   // Problems with polymorphism are caught in the callee's definition.
   if (scope) {
     std::optional<parser::MessageFixedText> undefinableMessage;
-    if (dummy.intent == common::Intent::Out) {
-      undefinableMessage =
-          "Actual argument associated with INTENT(OUT) %s is not definable"_err_en_US;
-    } else if (dummy.intent == common::Intent::InOut) {
+    DefinabilityFlags flags{DefinabilityFlag::PolymorphicOkInPure};
+    if (dummy.intent == common::Intent::InOut) {
+      flags.set(DefinabilityFlag::AllowEventLockOrNotifyType);
       undefinableMessage =
           "Actual argument associated with INTENT(IN OUT) %s is not definable"_err_en_US;
+    } else if (dummy.intent == common::Intent::Out) {
+      undefinableMessage =
+          "Actual argument associated with INTENT(OUT) %s is not definable"_err_en_US;
     } else if (context.ShouldWarn(common::LanguageFeature::
                        UndefinableAsynchronousOrVolatileActual)) {
       if (dummy.attrs.test(
@@ -722,7 +725,6 @@ static void CheckExplicitDataArg(const characteristics::DummyDataObject &dummy,
       }
     }
     if (undefinableMessage) {
-      DefinabilityFlags flags{DefinabilityFlag::PolymorphicOkInPure};
       if (isElemental) { // 15.5.2.4(21)
         flags.set(DefinabilityFlag::VectorSubscriptIsOk);
       }
@@ -1622,8 +1624,8 @@ static void CheckImage_Index(evaluate::ActualArguments &arguments,
             evaluate::GetShape(arguments[1]->UnwrapExpr())}) {
       if (const auto *coarrayArgSymbol{UnwrapWholeSymbolOrComponentDataRef(
               arguments[0]->UnwrapExpr())}) {
-        const auto coarrayArgCorank = coarrayArgSymbol->Corank();
-        if (const auto subArrSize = evaluate::ToInt64(*subArrShape->front())) {
+        auto coarrayArgCorank{coarrayArgSymbol->Corank()};
+        if (auto subArrSize{evaluate::ToInt64(*subArrShape->front())}) {
           if (subArrSize != coarrayArgCorank) {
             messages.Say(arguments[1]->sourceLocation(),
                 "The size of 'SUB=' (%jd) for intrinsic 'image_index' must be equal to the corank of 'COARRAY=' (%d)"_err_en_US,
diff --git a/flang/lib/Semantics/check-cuda.cpp b/flang/lib/Semantics/check-cuda.cpp
index d497ac2..d8a5639 100644
--- a/flang/lib/Semantics/check-cuda.cpp
+++ b/flang/lib/Semantics/check-cuda.cpp
@@ -302,6 +302,14 @@ private:
             [&](const common::Indirection<parser::IfConstruct> &x) {
               Check(x.value());
             },
+            [&](const common::Indirection<parser::CaseConstruct> &x) {
+              const auto &caseList{
+                  std::get<std::list<parser::CaseConstruct::Case>>(
+                      x.value().t)};
+              for (const parser::CaseConstruct::Case &c : caseList) {
+                Check(std::get<parser::Block>(c.t));
+              }
+            },
             [&](const auto &x) {
               if (auto source{parser::GetSource(x)}) {
                 context_.Say(*source,
@@ -347,9 +355,24 @@ private:
           hostArray->name());
     }
   }
+  void ErrorInCUFKernel(parser::CharBlock source) {
+    if (IsCUFKernelDo) {
+      context_.Say(
+          source, "Statement may not appear in cuf kernel code"_err_en_US);
+    }
+  }
   void Check(const parser::ActionStmt &stmt, const parser::CharBlock &source) {
     common::visit(
         common::visitors{
+            [&](const common::Indirection<parser::CycleStmt> &) {
+              ErrorInCUFKernel(source);
+            },
+            [&](const common::Indirection<parser::ExitStmt> &) {
+              ErrorInCUFKernel(source);
+            },
+            [&](const common::Indirection<parser::GotoStmt> &) {
+              ErrorInCUFKernel(source);
+            },
             [&](const common::Indirection<parser::StopStmt> &) { return; },
             [&](const common::Indirection<parser::PrintStmt> &) {},
             [&](const common::Indirection<parser::WriteStmt> &x) {
diff --git a/flang/lib/Semantics/check-omp-structure.cpp b/flang/lib/Semantics/check-omp-structure.cpp
index 95b962f..6db43cf6 100644
--- a/flang/lib/Semantics/check-omp-structure.cpp
+++ b/flang/lib/Semantics/check-omp-structure.cpp
@@ -614,6 +614,14 @@ void OmpStructureChecker::Leave(const parser::OpenMPConstruct &) {
   deferredNonVariables_.clear();
 }
 
+void OmpStructureChecker::Enter(const parser::OpenMPDeclarativeConstruct &x) {
+  EnterDirectiveNest(DeclarativeNest);
+}
+
+void OmpStructureChecker::Leave(const parser::OpenMPDeclarativeConstruct &x) {
+  ExitDirectiveNest(DeclarativeNest);
+}
+
 void OmpStructureChecker::Enter(const parser::OpenMPLoopConstruct &x) {
   loopStack_.push_back(&x);
   const auto &beginLoopDir{std::get<parser::OmpBeginLoopDirective>(x.t)};
@@ -1481,11 +1489,24 @@ void OmpStructureChecker::Leave(const parser::OpenMPRequiresConstruct &) {
   dirContext_.pop_back();
 }
 
+void OmpStructureChecker::CheckAlignValue(const parser::OmpClause &clause) {
+  if (auto *align{std::get_if<parser::OmpClause::Align>(&clause.u)}) {
+    if (const auto &v{GetIntValue(align->v)}; !v || *v <= 0) {
+      context_.Say(clause.source,
+          "The alignment value should be a constant positive integer"_err_en_US);
+    }
+  }
+}
+
 void OmpStructureChecker::Enter(const parser::OpenMPDeclarativeAllocate &x) {
   isPredefinedAllocator = true;
   const auto &dir{std::get<parser::Verbatim>(x.t)};
   const auto &objectList{std::get<parser::OmpObjectList>(x.t)};
   PushContextAndClauseSets(dir.source, llvm::omp::Directive::OMPD_allocate);
+  const auto &clauseList{std::get<parser::OmpClauseList>(x.t)};
+  for (const auto &clause : clauseList.v) {
+    CheckAlignValue(clause);
+  }
   CheckIsVarPartOfAnotherVar(dir.source, objectList);
 }
 
@@ -1688,20 +1709,34 @@ void OmpStructureChecker::Leave(const parser::OpenMPDeclareTargetConstruct &x) {
   dirContext_.pop_back();
 }
 
-void OmpStructureChecker::Enter(const parser::OpenMPErrorConstruct &x) {
+void OmpStructureChecker::Enter(const parser::OmpErrorDirective &x) {
   const auto &dir{std::get<parser::Verbatim>(x.t)};
   PushContextAndClauseSets(dir.source, llvm::omp::Directive::OMPD_error);
 }
 
-void OmpStructureChecker::Leave(const parser::OpenMPErrorConstruct &x) {
+void OmpStructureChecker::Leave(const parser::OmpErrorDirective &x) {
   dirContext_.pop_back();
 }
 
+void OmpStructureChecker::Enter(const parser::OmpClause::At &x) {
+  CheckAllowedClause(llvm::omp::Clause::OMPC_at);
+  if (GetDirectiveNest(DeclarativeNest) > 0) {
+    if (x.v.v == parser::OmpAtClause::ActionTime::Execution) {
+      context_.Say(GetContext().clauseSource,
+          "The ERROR directive with AT(EXECUTION) cannot appear in the specification part"_err_en_US);
+    }
+  }
+}
+
 void OmpStructureChecker::Enter(const parser::OpenMPExecutableAllocate &x) {
   isPredefinedAllocator = true;
   const auto &dir{std::get<parser::Verbatim>(x.t)};
   const auto &objectList{std::get<std::optional<parser::OmpObjectList>>(x.t)};
   PushContextAndClauseSets(dir.source, llvm::omp::Directive::OMPD_allocate);
+  const auto &clauseList{std::get<parser::OmpClauseList>(x.t)};
+  for (const auto &clause : clauseList.v) {
+    CheckAlignValue(clause);
+  }
   if (objectList) {
     CheckIsVarPartOfAnotherVar(dir.source, *objectList);
   }
@@ -2856,7 +2891,6 @@ CHECK_SIMPLE_CLAUSE(Init, OMPC_init)
 CHECK_SIMPLE_CLAUSE(Use, OMPC_use)
 CHECK_SIMPLE_CLAUSE(Novariants, OMPC_novariants)
 CHECK_SIMPLE_CLAUSE(Nocontext, OMPC_nocontext)
-CHECK_SIMPLE_CLAUSE(At, OMPC_at)
 CHECK_SIMPLE_CLAUSE(Severity, OMPC_severity)
 CHECK_SIMPLE_CLAUSE(Message, OMPC_message)
 CHECK_SIMPLE_CLAUSE(Filter, OMPC_filter)
diff --git a/flang/lib/Semantics/check-omp-structure.h b/flang/lib/Semantics/check-omp-structure.h
index 346a7be..dc36095 100644
--- a/flang/lib/Semantics/check-omp-structure.h
+++ b/flang/lib/Semantics/check-omp-structure.h
@@ -73,6 +73,9 @@ public:
 
   void Enter(const parser::OpenMPConstruct &);
   void Leave(const parser::OpenMPConstruct &);
+  void Enter(const parser::OpenMPDeclarativeConstruct &);
+  void Leave(const parser::OpenMPDeclarativeConstruct &);
+
   void Enter(const parser::OpenMPLoopConstruct &);
   void Leave(const parser::OpenMPLoopConstruct &);
   void Enter(const parser::OmpEndLoopDirective &);
@@ -102,8 +105,8 @@ public:
   void Enter(const parser::OmpDeclareTargetWithList &);
   void Enter(const parser::OmpDeclareTargetWithClause &);
   void Leave(const parser::OmpDeclareTargetWithClause &);
-  void Enter(const parser::OpenMPErrorConstruct &);
-  void Leave(const parser::OpenMPErrorConstruct &);
+  void Enter(const parser::OmpErrorDirective &);
+  void Leave(const parser::OmpErrorDirective &);
   void Enter(const parser::OpenMPExecutableAllocate &);
   void Leave(const parser::OpenMPExecutableAllocate &);
   void Enter(const parser::OpenMPAllocatorsConstruct &);
@@ -261,6 +264,8 @@ private:
   void CheckAllowedRequiresClause(llvmOmpClause clause);
   bool deviceConstructFound_{false};
 
+  void CheckAlignValue(const parser::OmpClause &);
+
   void EnterDirectiveNest(const int index) { directiveNest_[index]++; }
   void ExitDirectiveNest(const int index) { directiveNest_[index]--; }
   int GetDirectiveNest(const int index) { return directiveNest_[index]; }
@@ -270,11 +275,12 @@ private:
       const parser::Variable &, const parser::Expr &);
   inline void ErrIfNonScalarAssignmentStmt(
       const parser::Variable &, const parser::Expr &);
-  enum directiveNestType {
+  enum directiveNestType : int {
     SIMDNest,
     TargetBlockOnlyTeams,
     TargetNest,
-    LastType
+    DeclarativeNest,
+    LastType = DeclarativeNest,
   };
   int directiveNest_[LastType + 1] = {0};
 
diff --git a/flang/lib/Semantics/definable.cpp b/flang/lib/Semantics/definable.cpp
index 88f9463e3..6d0155c 100644
--- a/flang/lib/Semantics/definable.cpp
+++ b/flang/lib/Semantics/definable.cpp
@@ -204,7 +204,8 @@ static std::optional<parser::Message> WhyNotDefinableLast(parser::CharBlock at,
     }
     return std::nullopt; // pointer assignment - skip following checks
   }
-  if (IsOrContainsEventOrLockComponent(ultimate)) {
+  if (!flags.test(DefinabilityFlag::AllowEventLockOrNotifyType) &&
+      IsOrContainsEventOrLockComponent(ultimate)) {
     return BlameSymbol(at,
         "'%s' is an entity with either an EVENT_TYPE or LOCK_TYPE"_en_US,
         original);
diff --git a/flang/lib/Semantics/definable.h b/flang/lib/Semantics/definable.h
index 709bbba..902702d 100644
--- a/flang/lib/Semantics/definable.h
+++ b/flang/lib/Semantics/definable.h
@@ -32,7 +32,8 @@ ENUM_CLASS(DefinabilityFlag,
     AcceptAllocatable, // treat allocatable as if it were a pointer
     SourcedAllocation, // ALLOCATE(a,SOURCE=)
     PolymorphicOkInPure, // don't check for polymorphic type in pure subprogram
-    DoNotNoteDefinition) // context does not imply definition
+    DoNotNoteDefinition, // context does not imply definition
+    AllowEventLockOrNotifyType)
 
 using DefinabilityFlags =
     common::EnumSet<DefinabilityFlag, DefinabilityFlag_enumSize>;
diff --git a/flang/lib/Semantics/expression.cpp b/flang/lib/Semantics/expression.cpp
index c2eb17c..3ec6f38 100644
--- a/flang/lib/Semantics/expression.cpp
+++ b/flang/lib/Semantics/expression.cpp
@@ -1506,9 +1506,9 @@ MaybeExpr ExpressionAnalyzer::Analyze(const parser::CoindexedNamedObject &x) {
     if (cosubsOk && !reversed.empty()) {
       int numCosubscripts{static_cast<int>(cosubscripts.size())};
       const Symbol &symbol{reversed.front()};
-      if (numCosubscripts != symbol.Corank()) {
+      if (numCosubscripts != GetCorank(symbol)) {
         Say("'%s' has corank %d, but coindexed reference has %d cosubscripts"_err_en_US,
-            symbol.name(), symbol.Corank(), numCosubscripts);
+            symbol.name(), GetCorank(symbol), numCosubscripts);
       }
     }
     for (const auto &imageSelSpec :
@@ -2536,6 +2536,15 @@ static bool CheckCompatibleArgument(bool isElemental,
             return false;
           },
           [&](const characteristics::DummyProcedure &dummy) {
+            if ((dummy.attrs.test(
+                     characteristics::DummyProcedure::Attr::Optional) ||
+                    dummy.attrs.test(
+                        characteristics::DummyProcedure::Attr::Pointer)) &&
+                IsBareNullPointer(expr)) {
+              // NULL() is compatible with any dummy pointer
+              // or optional dummy procedure.
+              return true;
+            }
             if (!expr || !IsProcedurePointerTarget(*expr)) {
               return false;
             }
diff --git a/flang/lib/Semantics/pointer-assignment.cpp b/flang/lib/Semantics/pointer-assignment.cpp
index 2450ce3..7f4548c 100644
--- a/flang/lib/Semantics/pointer-assignment.cpp
+++ b/flang/lib/Semantics/pointer-assignment.cpp
@@ -76,6 +76,7 @@ private:
       const Procedure * = nullptr,
       const evaluate::SpecificIntrinsic *specific = nullptr);
   bool LhsOkForUnlimitedPoly() const;
+  std::optional<MessageFormattedText> CheckRanks(const TypeAndShape &rhs) const;
   template <typename... A> parser::Message *Say(A &&...);
   template <typename FeatureOrUsageWarning, typename... A>
   parser::Message *Warn(FeatureOrUsageWarning, A &&...);
@@ -278,10 +279,19 @@ bool PointerAssignmentChecker::Check(const evaluate::FunctionRef<T> &f) {
   } else if (lhsType_) {
     const auto *frTypeAndShape{funcResult->GetTypeAndShape()};
     CHECK(frTypeAndShape);
-    if (!lhsType_->IsCompatibleWith(foldingContext_.messages(), *frTypeAndShape,
-            "pointer", "function result",
-            /*omitShapeConformanceCheck=*/isBoundsRemapping_ || isAssumedRank_,
-            evaluate::CheckConformanceFlags::BothDeferredShape)) {
+    if (frTypeAndShape->type().IsUnlimitedPolymorphic() &&
+        LhsOkForUnlimitedPoly()) {
+      // Special case exception to type checking (F'2023 C1017);
+      // still check rank compatibility.
+      if (auto msg{CheckRanks(*frTypeAndShape)}) {
+        Say(*msg);
+        return false;
+      }
+    } else if (!lhsType_->IsCompatibleWith(foldingContext_.messages(),
+                   *frTypeAndShape, "pointer", "function result",
+                   /*omitShapeConformanceCheck=*/isBoundsRemapping_ ||
+                       isAssumedRank_,
+                   evaluate::CheckConformanceFlags::BothDeferredShape)) {
       return false; // IsCompatibleWith() emitted message
     }
   }
@@ -324,27 +334,17 @@ bool PointerAssignmentChecker::Check(const evaluate::Designator<T> &d) {
         msg = "Pointer must be VOLATILE when target is a"
               " VOLATILE coarray"_err_en_US;
       }
+    } else if (auto m{CheckRanks(*rhsType)}) {
+      msg = std::move(*m);
     } else if (rhsType->type().IsUnlimitedPolymorphic()) {
       if (!LhsOkForUnlimitedPoly()) {
         msg = "Pointer type must be unlimited polymorphic or non-extensible"
               " derived type when target is unlimited polymorphic"_err_en_US;
       }
-    } else {
-      if (!lhsType_->type().IsTkLenCompatibleWith(rhsType->type())) {
-        msg = MessageFormattedText{
-            "Target type %s is not compatible with pointer type %s"_err_en_US,
-            rhsType->type().AsFortran(), lhsType_->type().AsFortran()};
-
-      } else if (!isBoundsRemapping_ &&
-          !lhsType_->attrs().test(TypeAndShape::Attr::AssumedRank)) {
-        int lhsRank{lhsType_->Rank()};
-        int rhsRank{rhsType->Rank()};
-        if (lhsRank != rhsRank) {
-          msg = MessageFormattedText{
-              "Pointer has rank %d but target has rank %d"_err_en_US, lhsRank,
-              rhsRank};
-        }
-      }
+    } else if (!lhsType_->type().IsTkLenCompatibleWith(rhsType->type())) {
+      msg = MessageFormattedText{
+          "Target type %s is not compatible with pointer type %s"_err_en_US,
+          rhsType->type().AsFortran(), lhsType_->type().AsFortran()};
     }
   }
   if (msg) {
@@ -434,6 +434,21 @@ bool PointerAssignmentChecker::LhsOkForUnlimitedPoly() const {
   }
 }
 
+std::optional<MessageFormattedText> PointerAssignmentChecker::CheckRanks(
+    const TypeAndShape &rhs) const {
+  if (!isBoundsRemapping_ &&
+      !lhsType_->attrs().test(TypeAndShape::Attr::AssumedRank)) {
+    int lhsRank{lhsType_->Rank()};
+    int rhsRank{rhs.Rank()};
+    if (lhsRank != rhsRank) {
+      return MessageFormattedText{
+          "Pointer has rank %d but target has rank %d"_err_en_US, lhsRank,
+          rhsRank};
+    }
+  }
+  return std::nullopt;
+}
+
 template <typename... A>
 parser::Message *PointerAssignmentChecker::Say(A &&...x) {
   auto *msg{foldingContext_.messages().Say(std::forward<A>(x)...)};
diff --git a/flang/lib/Semantics/resolve-names.cpp b/flang/lib/Semantics/resolve-names.cpp
index 122c0a2..724f1b2 100644
--- a/flang/lib/Semantics/resolve-names.cpp
+++ b/flang/lib/Semantics/resolve-names.cpp
@@ -3162,6 +3162,10 @@ ModuleVisitor::SymbolRename ModuleVisitor::AddUse(
 // Convert it to a UseError with this additional location.
 static bool ConvertToUseError(
     Symbol &symbol, const SourceName &location, const Scope &module) {
+  if (auto *ued{symbol.detailsIf<UseErrorDetails>()}) {
+    ued->add_occurrence(location, module);
+    return true;
+  }
   const auto *useDetails{symbol.detailsIf<UseDetails>()};
   if (!useDetails) {
     if (auto *genericDetails{symbol.detailsIf<GenericDetails>()}) {
@@ -3319,6 +3323,8 @@ void ModuleVisitor::DoAddUse(SourceName location, SourceName localName,
         combinedDerivedType = CreateLocalUseError();
       } else {
         ConvertToUseError(*localSymbol, location, *useModuleScope_);
+        localDerivedType = nullptr;
+        localGeneric = nullptr;
         combinedDerivedType = localSymbol;
       }
     }
diff --git a/flang/module/__fortran_builtins.f90 b/flang/module/__fortran_builtins.f90
index ef206df..4d134fa 100644
--- a/flang/module/__fortran_builtins.f90
+++ b/flang/module/__fortran_builtins.f90
@@ -22,6 +22,9 @@ module __fortran_builtins
   intrinsic :: __builtin_c_loc
   public :: __builtin_c_loc
 
+  intrinsic :: __builtin_c_devloc
+  public :: __builtin_c_devloc
+
   intrinsic :: __builtin_c_f_pointer
   public :: __builtin_c_f_pointer
 
@@ -40,15 +43,15 @@ module __fortran_builtins
   end type
 
   type, public :: __builtin_event_type
-    integer(kind=int64), private :: __count
+    integer(kind=int64), private :: __count = -1
   end type
 
   type, public :: __builtin_notify_type
-    integer(kind=int64), private :: __count
+    integer(kind=int64), private :: __count = -1
   end type
 
   type, public :: __builtin_lock_type
-    integer(kind=int64), private :: __count
+    integer(kind=int64), private :: __count = -1
   end type
 
   type, public :: __builtin_ieee_flag_type
@@ -88,7 +91,7 @@ module __fortran_builtins
       __builtin_ieee_round_type(_FORTRAN_RUNTIME_IEEE_OTHER)
 
   type, public :: __builtin_team_type
-    integer(kind=int64), private :: __id
+    integer(kind=int64), private :: __id = -1
   end type
 
   integer, parameter, public :: __builtin_atomic_int_kind = selected_int_kind(18)
@@ -144,6 +147,7 @@ module __fortran_builtins
 
   type :: __force_derived_type_instantiations
     type(__builtin_c_ptr) :: c_ptr
+    type(__builtin_c_devptr) :: c_devptr
     type(__builtin_c_funptr) :: c_funptr
     type(__builtin_event_type) :: event_type
     type(__builtin_lock_type) :: lock_type
diff --git a/flang/module/__fortran_type_info.f90 b/flang/module/__fortran_type_info.f90
index 5f2273d..b30a6bf 100644
--- a/flang/module/__fortran_type_info.f90
+++ b/flang/module/__fortran_type_info.f90
@@ -14,7 +14,7 @@
 module __fortran_type_info
 
   use, intrinsic :: __fortran_builtins, &
-    only: __builtin_c_ptr, __builtin_c_funptr
+    only: __builtin_c_ptr, __builtin_c_devptr, __builtin_c_funptr
   implicit none
 
   ! Set PRIVATE by default to explicitly only export what is meant
diff --git a/flang/runtime/CUDA/allocatable.cpp b/flang/runtime/CUDA/allocatable.cpp
index 3f6f8f3..9be54e8 100644
--- a/flang/runtime/CUDA/allocatable.cpp
+++ b/flang/runtime/CUDA/allocatable.cpp
@@ -52,7 +52,7 @@ int RTDEF(CUFAllocatableAllocate)(Descriptor &desc, int64_t stream,
   }
   // Perform the standard allocation.
   int stat{RTNAME(AllocatableAllocate)(
-      desc, stream, hasStat, errMsg, sourceFile, sourceLine)};
+      desc, hasStat, errMsg, sourceFile, sourceLine)};
   return stat;
 }
 
diff --git a/flang/runtime/CUDA/allocator.cpp b/flang/runtime/CUDA/allocator.cpp
index d848f18..85b3daf 100644
--- a/flang/runtime/CUDA/allocator.cpp
+++ b/flang/runtime/CUDA/allocator.cpp
@@ -33,7 +33,7 @@ void RTDEF(CUFRegisterAllocator)() {
 }
 }
 
-void *CUFAllocPinned(std::size_t sizeInBytes, std::int64_t) {
+void *CUFAllocPinned(std::size_t sizeInBytes) {
   void *p;
   CUDA_REPORT_IF_ERROR(cudaMallocHost((void **)&p, sizeInBytes));
   return p;
@@ -41,20 +41,15 @@ void *CUFAllocPinned(std::size_t sizeInBytes, std::int64_t) {
 
 void CUFFreePinned(void *p) { CUDA_REPORT_IF_ERROR(cudaFreeHost(p)); }
 
-void *CUFAllocDevice(std::size_t sizeInBytes, std::int64_t stream) {
+void *CUFAllocDevice(std::size_t sizeInBytes) {
   void *p;
-  if (stream >= 0) {
-    CUDA_REPORT_IF_ERROR(
-        cudaMallocAsync(&p, sizeInBytes, (cudaStream_t)stream));
-  } else {
-    CUDA_REPORT_IF_ERROR(cudaMalloc(&p, sizeInBytes));
-  }
+  CUDA_REPORT_IF_ERROR(cudaMalloc(&p, sizeInBytes));
   return p;
 }
 
 void CUFFreeDevice(void *p) { CUDA_REPORT_IF_ERROR(cudaFree(p)); }
 
-void *CUFAllocManaged(std::size_t sizeInBytes, std::int64_t) {
+void *CUFAllocManaged(std::size_t sizeInBytes) {
   void *p;
   CUDA_REPORT_IF_ERROR(
       cudaMallocManaged((void **)&p, sizeInBytes, cudaMemAttachGlobal));
@@ -63,7 +58,7 @@ void *CUFAllocManaged(std::size_t sizeInBytes, std::int64_t) {
 
 void CUFFreeManaged(void *p) { CUDA_REPORT_IF_ERROR(cudaFree(p)); }
 
-void *CUFAllocUnified(std::size_t sizeInBytes, std::int64_t) {
+void *CUFAllocUnified(std::size_t sizeInBytes) {
   // Call alloc managed for the time being.
   return CUFAllocManaged(sizeInBytes);
 }
diff --git a/flang/runtime/CUDA/descriptor.cpp b/flang/runtime/CUDA/descriptor.cpp
index 816b145..947eeb6 100644
--- a/flang/runtime/CUDA/descriptor.cpp
+++ b/flang/runtime/CUDA/descriptor.cpp
@@ -20,8 +20,7 @@ RT_EXT_API_GROUP_BEGIN
 
 Descriptor *RTDEF(CUFAllocDescriptor)(
     std::size_t sizeInBytes, const char *sourceFile, int sourceLine) {
-  return reinterpret_cast<Descriptor *>(
-      CUFAllocManaged(sizeInBytes, kCudaNoStream));
+  return reinterpret_cast<Descriptor *>(CUFAllocManaged(sizeInBytes));
 }
 
 void RTDEF(CUFFreeDescriptor)(
@@ -47,6 +46,13 @@ void RTDEF(CUFDescriptorSync)(Descriptor *dst, const Descriptor *src,
       (void *)dst, (const void *)src, count, cudaMemcpyHostToDevice));
 }
 
+void RTDEF(CUFSyncGlobalDescriptor)(
+    void *hostPtr, const char *sourceFile, int sourceLine) {
+  void *devAddr{RTNAME(CUFGetDeviceAddress)(hostPtr, sourceFile, sourceLine)};
+  RTNAME(CUFDescriptorSync)
+  ((Descriptor *)devAddr, (Descriptor *)hostPtr, sourceFile, sourceLine);
+}
+
 RT_EXT_API_GROUP_END
 }
 } // namespace Fortran::runtime::cuda
diff --git a/flang/runtime/allocatable.cpp b/flang/runtime/allocatable.cpp
index b65cec8..5e065f4 100644
--- a/flang/runtime/allocatable.cpp
+++ b/flang/runtime/allocatable.cpp
@@ -133,17 +133,15 @@ void RTDEF(AllocatableApplyMold)(
   }
 }
 
-int RTDEF(AllocatableAllocate)(Descriptor &descriptor, std::int64_t asyncId,
-    bool hasStat, const Descriptor *errMsg, const char *sourceFile,
-    int sourceLine) {
+int RTDEF(AllocatableAllocate)(Descriptor &descriptor, bool hasStat,
+    const Descriptor *errMsg, const char *sourceFile, int sourceLine) {
   Terminator terminator{sourceFile, sourceLine};
   if (!descriptor.IsAllocatable()) {
     return ReturnError(terminator, StatInvalidDescriptor, errMsg, hasStat);
   } else if (descriptor.IsAllocated()) {
     return ReturnError(terminator, StatBaseNotNull, errMsg, hasStat);
   } else {
-    int stat{
-        ReturnError(terminator, descriptor.Allocate(asyncId), errMsg, hasStat)};
+    int stat{ReturnError(terminator, descriptor.Allocate(), errMsg, hasStat)};
     if (stat == StatOk) {
       if (const DescriptorAddendum * addendum{descriptor.Addendum()}) {
         if (const auto *derived{addendum->derivedType()}) {
@@ -162,7 +160,7 @@ int RTDEF(AllocatableAllocateSource)(Descriptor &alloc,
     const Descriptor &source, bool hasStat, const Descriptor *errMsg,
     const char *sourceFile, int sourceLine) {
   int stat{RTNAME(AllocatableAllocate)(
-      alloc, /*asyncId=*/-1, hasStat, errMsg, sourceFile, sourceLine)};
+      alloc, hasStat, errMsg, sourceFile, sourceLine)};
   if (stat == StatOk) {
     Terminator terminator{sourceFile, sourceLine};
     DoFromSourceAssign(alloc, source, terminator);
diff --git a/flang/runtime/array-constructor.cpp b/flang/runtime/array-constructor.cpp
index 0d677d7..c695316 100644
--- a/flang/runtime/array-constructor.cpp
+++ b/flang/runtime/array-constructor.cpp
@@ -50,8 +50,8 @@ static RT_API_ATTRS void AllocateOrReallocateVectorIfNeeded(
           initialAllocationSize(fromElements, to.ElementBytes())};
       to.GetDimension(0).SetBounds(1, allocationSize);
       RTNAME(AllocatableAllocate)
-      (to, /*asyncId=*/-1, /*hasStat=*/false, /*errMsg=*/nullptr,
-          vector.sourceFile, vector.sourceLine);
+      (to, /*hasStat=*/false, /*errMsg=*/nullptr, vector.sourceFile,
+          vector.sourceLine);
       to.GetDimension(0).SetBounds(1, fromElements);
       vector.actualAllocationSize = allocationSize;
     } else {
@@ -59,8 +59,8 @@ static RT_API_ATTRS void AllocateOrReallocateVectorIfNeeded(
       // first value: there should be no reallocation.
       RUNTIME_CHECK(terminator, previousToElements >= fromElements);
       RTNAME(AllocatableAllocate)
-      (to, /*asyncId=*/-1, /*hasStat=*/false, /*errMsg=*/nullptr,
-          vector.sourceFile, vector.sourceLine);
+      (to, /*hasStat=*/false, /*errMsg=*/nullptr, vector.sourceFile,
+          vector.sourceLine);
       vector.actualAllocationSize = previousToElements;
     }
   } else {
diff --git a/flang/runtime/derived.cpp b/flang/runtime/derived.cpp
index 7c164ff..10813c6 100644
--- a/flang/runtime/derived.cpp
+++ b/flang/runtime/derived.cpp
@@ -129,6 +129,10 @@ RT_API_ATTRS int InitializeClone(const Descriptor &clone,
   std::size_t elements{orig.Elements()};
   int stat{StatOk};
 
+  // Skip pointers and unallocated variables.
+  if (orig.IsPointer() || !orig.IsAllocated()) {
+    return stat;
+  }
   // Initialize each data component.
   std::size_t components{componentDesc.Elements()};
   for (std::size_t i{0}; i < components; ++i) {
diff --git a/flang/runtime/descriptor.cpp b/flang/runtime/descriptor.cpp
index f43c96b..32f43e8 100644
--- a/flang/runtime/descriptor.cpp
+++ b/flang/runtime/descriptor.cpp
@@ -163,7 +163,7 @@ RT_API_ATTRS static inline int MapAllocIdx(const Descriptor &desc) {
 #endif
 }
 
-RT_API_ATTRS int Descriptor::Allocate(std::int64_t asyncId) {
+RT_API_ATTRS int Descriptor::Allocate() {
   std::size_t elementBytes{ElementBytes()};
   if (static_cast<std::int64_t>(elementBytes) < 0) {
     // F'2023 7.4.4.2 p5: "If the character length parameter value evaluates
@@ -175,7 +175,7 @@ RT_API_ATTRS int Descriptor::Allocate(std::int64_t asyncId) {
   // Zero size allocation is possible in Fortran and the resulting
   // descriptor must be allocated/associated. Since std::malloc(0)
   // result is implementation defined, always allocate at least one byte.
-  void *p{alloc(byteSize ? byteSize : 1, asyncId)};
+  void *p{alloc(byteSize ? byteSize : 1)};
   if (!p) {
     return CFI_ERROR_MEM_ALLOCATION;
   }
diff --git a/flang/runtime/edit-input.cpp b/flang/runtime/edit-input.cpp
index c714a85..317f0b6 100644
--- a/flang/runtime/edit-input.cpp
+++ b/flang/runtime/edit-input.cpp
@@ -130,7 +130,7 @@ static RT_API_ATTRS bool EditBOZInput(
     shift = shift - 8; // misaligned octal
   }
   while (digits > 0) {
-    char32_t ch{*io.NextInField(remaining, edit)};
+    char32_t ch{io.NextInField(remaining, edit).value_or(' ')};
     int digit{0};
     if (ch == ' ' || ch == '\t') {
       if (edit.modes.editingFlags & blankZero) {
diff --git a/flang/runtime/exceptions.cpp b/flang/runtime/exceptions.cpp
index 1ed0053..2fa2baa 100644
--- a/flang/runtime/exceptions.cpp
+++ b/flang/runtime/exceptions.cpp
@@ -84,7 +84,7 @@ uint32_t RTNAME(MapException)(uint32_t excepts) {
 // Check if the processor has the ability to control whether to halt or
 // continue execution when a given exception is raised.
 bool RTNAME(SupportHalting)([[maybe_unused]] uint32_t except) {
-#if (defined(__arm__) || defined(__aarch64__)) && !defined(_WIN32)
+#ifdef __USE_GNU
   except = RTNAME(MapException)(except);
   int currentSet = fegetexcept(), flipSet, ok;
   if (currentSet & except) {
diff --git a/flang/runtime/stop.cpp b/flang/runtime/stop.cpp
index f8457e1..a7be8a0 100644
--- a/flang/runtime/stop.cpp
+++ b/flang/runtime/stop.cpp
@@ -162,7 +162,7 @@ static void PrintBacktrace() {
   // TODO: Need to parse DWARF information to print function line numbers
   constexpr int MAX_CALL_STACK{999};
   void *buffer[MAX_CALL_STACK];
-  int nptrs{backtrace(buffer, MAX_CALL_STACK)};
+  int nptrs{(int)backtrace(buffer, MAX_CALL_STACK)};
 
   if (char **symbols{backtrace_symbols(buffer, nptrs)}) {
     for (int i = 0; i < nptrs; i++) {
diff --git a/flang/test/Analysis/AliasAnalysis/alias-analysis-omp-private-allocatable.mlir b/flang/test/Analysis/AliasAnalysis/alias-analysis-omp-private-allocatable.mlir
new file mode 100644
index 0000000..5116622
--- /dev/null
+++ b/flang/test/Analysis/AliasAnalysis/alias-analysis-omp-private-allocatable.mlir
@@ -0,0 +1,50 @@
+// Use --mlir-disable-threading so that the AA queries are serialized
+// as well as its diagnostic output.
+// RUN: fir-opt %s -pass-pipeline='builtin.module(func.func(test-fir-alias-analysis))' -split-input-file --mlir-disable-threading 2>&1 | FileCheck %s
+
+// Fortran code before simplification:
+// SUBROUTINE mysub(ns,ne)
+//      INTEGER  :: n
+//      REAL(KIND=8), DIMENSION(:), allocatable     :: ar1
+//      real(kind=8), dimension(20) :: ar2
+//      REAL(KIND=8), DIMENSION(20) :: d
+//
+//!$OMP parallel PRIVATE(ar1)
+//         d(1:1) = (/(DOT_PRODUCT(ar1(1:n), ar2(1:n)),n=1, 1)/)
+//!$OMP END parallel
+//   END SUBROUTINE
+
+// CHECK-LABEL: Testing : "testPrivateAllocatable"
+// CHECK: ar2#0 <-> ar1#0: NoAlias
+// CHECK: ar2#1 <-> ar1#0: NoAlias
+// CHECK: ar2#0 <-> ar1#1: NoAlias
+// CHECK: ar2#1 <-> ar1#1: NoAlias
+
+omp.private {type = private} @_QFmysubEar1_private_ref_box_heap_Uxf64 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf64>>>> alloc {
+^bb0(%arg0: !fir.ref<!fir.box<!fir.heap<!fir.array<?xf64>>>>):
+  %0 = fir.alloca !fir.box<!fir.heap<!fir.array<?xf64>>> {bindc_name = "ar1", pinned, uniq_name = "_QFmysubEar1"}
+  %5:2 = hlfir.declare %0 {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFmysubEar1"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf64>>>>) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf64>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xf64>>>>)
+  omp.yield(%5#0 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf64>>>>)
+} dealloc {
+^bb0(%arg0: !fir.ref<!fir.box<!fir.heap<!fir.array<?xf64>>>>):
+  omp.yield
+}
+func.func @testPrivateAllocatable(%arg0: !fir.ref<i32> {fir.bindc_name = "ns"}, %arg1: !fir.ref<i32> {fir.bindc_name = "ne"}) {
+  %0 = fir.dummy_scope : !fir.dscope
+  %1 = fir.alloca !fir.box<!fir.heap<!fir.array<?xf64>>> {bindc_name = "ar1", uniq_name = "_QFmysubEar1"}
+  %2 = fir.zero_bits !fir.heap<!fir.array<?xf64>>
+  %c0 = arith.constant 0 : index
+  %3 = fir.shape %c0 : (index) -> !fir.shape<1>
+  %4 = fir.embox %2(%3) : (!fir.heap<!fir.array<?xf64>>, !fir.shape<1>) -> !fir.box<!fir.heap<!fir.array<?xf64>>>
+  fir.store %4 to %1 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf64>>>>
+  %5:2 = hlfir.declare %1 {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFmysubEar1"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf64>>>>) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf64>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xf64>>>>)
+  %c20 = arith.constant 20 : index
+  %6 = fir.alloca !fir.array<20xf64> {bindc_name = "ar2", uniq_name = "_QFmysubEar2"}
+  %7 = fir.shape %c20 : (index) -> !fir.shape<1>
+  %8:2 = hlfir.declare %6(%7) {uniq_name = "_QFmysubEar2", test.ptr="ar2" } : (!fir.ref<!fir.array<20xf64>>, !fir.shape<1>) -> (!fir.ref<!fir.array<20xf64>>, !fir.ref<!fir.array<20xf64>>)
+  omp.parallel private(@_QFmysubEar1_private_ref_box_heap_Uxf64 %5#0 -> %arg2 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf64>>>>) {
+    %20:2 = hlfir.declare %arg2 {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFmysubEar1", test.ptr = "ar1"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf64>>>>) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf64>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xf64>>>>)
+    omp.terminator
+  }
+  return
+}
diff --git a/flang/test/Driver/fsave-main-program.f90 b/flang/test/Driver/fsave-main-program.f90
new file mode 100644
index 0000000..bffdfd9
--- /dev/null
+++ b/flang/test/Driver/fsave-main-program.f90
@@ -0,0 +1,5 @@
+! Check that the driver passes through -fsave-main-program:
+! RUN: %flang -### -S -fsave-main-program %s -o - 2>&1 | FileCheck %s
+! Check that the compiler accepts -fsave-main-program:
+! RUN: %flang_fc1 -emit-hlfir -fsave-main-program %s -o -
+! CHECK: "-fc1"{{.*}}"-fsave-main-program"
diff --git a/flang/test/Driver/mlir-pass-pipeline.f90 b/flang/test/Driver/mlir-pass-pipeline.f90
index b30affe..55e86da 100644
--- a/flang/test/Driver/mlir-pass-pipeline.f90
+++ b/flang/test/Driver/mlir-pass-pipeline.f90
@@ -36,15 +36,28 @@ end program
 ! O2-NEXT: Pipeline Collection : ['fir.global', 'func.func', 'omp.declare_reduction', 'omp.private']
 ! O2-NEXT: 'fir.global' Pipeline
 ! O2-NEXT:   OptimizedBufferization
+! O2-NEXT:   InlineHLFIRAssign
 ! O2-NEXT: 'func.func' Pipeline
 ! O2-NEXT:   OptimizedBufferization
+! O2-NEXT:   InlineHLFIRAssign
 ! O2-NEXT: 'omp.declare_reduction' Pipeline
 ! O2-NEXT:   OptimizedBufferization
+! O2-NEXT:   InlineHLFIRAssign
 ! O2-NEXT: 'omp.private' Pipeline
 ! O2-NEXT:   OptimizedBufferization
+! O2-NEXT:   InlineHLFIRAssign
 ! ALL: LowerHLFIROrderedAssignments
 ! ALL-NEXT: LowerHLFIRIntrinsics
 ! ALL-NEXT: BufferizeHLFIR
+! O2-NEXT: Pipeline Collection : ['fir.global', 'func.func', 'omp.declare_reduction', 'omp.private']
+! O2-NEXT:   'fir.global' Pipeline
+! O2-NEXT:     InlineHLFIRAssign
+! O2-NEXT:   'func.func' Pipeline
+! O2-NEXT:     InlineHLFIRAssign
+! O2-NEXT:   'omp.declare_reduction' Pipeline
+! O2-NEXT:     InlineHLFIRAssign
+! O2-NEXT:   'omp.private' Pipeline
+! O2-NEXT:     InlineHLFIRAssign
 ! ALL-NEXT: ConvertHLFIRtoFIR
 ! ALL-NEXT: CSE
 ! Ideally, we need an output with only the pass names, but
diff --git a/flang/test/Driver/parse-error.ll b/flang/test/Driver/parse-error.ll
index 3567c3e..2c36d8b 100644
--- a/flang/test/Driver/parse-error.ll
+++ b/flang/test/Driver/parse-error.ll
@@ -13,7 +13,7 @@
 ; RUN: not %flang_fc1 -fdebug-unparse-no-sema -x f95 %s 2>&1 | FileCheck %s --check-prefix=ERROR
 ; RUN: not %flang_fc1 -fsyntax-only %s -x f95 2>&1 | FileCheck %s --check-prefix=ERROR
 
-; ERROR: Could not parse {{.*}}parse-error.f95
+; ERROR: Could not scan {{.*}}parse-error.ll
 
 define void @foo() {
   ret void
diff --git a/flang/test/Fir/CUDA/cuda-sync-desc.mlir b/flang/test/Fir/CUDA/cuda-sync-desc.mlir
new file mode 100644
index 0000000..20b317f34
--- /dev/null
+++ b/flang/test/Fir/CUDA/cuda-sync-desc.mlir
@@ -0,0 +1,20 @@
+// RUN: fir-opt --cuf-convert %s | FileCheck %s
+
+module attributes {dlti.dl_spec = #dlti.dl_spec<i16 = dense<16> : vector<2xi64>, i8 = dense<8> : vector<2xi64>, i32 = dense<32> : vector<2xi64>, f80 = dense<128> : vector<2xi64>, i1 = dense<8> : vector<2xi64>, !llvm.ptr = dense<64> : vector<4xi64>, i64 = dense<64> : vector<2xi64>, i128 = dense<128> : vector<2xi64>, f128 = dense<128> : vector<2xi64>, !llvm.ptr<270> = dense<32> : vector<4xi64>, f64 = dense<64> : vector<2xi64>, f16 = dense<16> : vector<2xi64>, !llvm.ptr<271> = dense<32> : vector<4xi64>, !llvm.ptr<272> = dense<64> : vector<4xi64>, "dlti.endianness" = "little", "dlti.stack_alignment" = 128 : i64>, fir.defaultkind = "a1c4d8i4l4r4", fir.kindmap = "", llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", llvm.ident = "flang version 20.0.0 (git@github.com:clementval/llvm-project.git f37e52237791f58438790c77edeb8de08f692987)", llvm.target_triple = "x86_64-unknown-linux-gnu"} {
+  fir.global @_QMdevptrEdev_ptr {data_attr = #cuf.cuda<device>} : !fir.box<!fir.ptr<!fir.array<?xf32>>> {
+    %0 = fir.zero_bits !fir.ptr<!fir.array<?xf32>>
+    %c0 = arith.constant 0 : index
+    %1 = fir.shape %c0 : (index) -> !fir.shape<1>
+    %2 = fir.embox %0(%1) {allocator_idx = 2 : i32} : (!fir.ptr<!fir.array<?xf32>>, !fir.shape<1>) -> !fir.box<!fir.ptr<!fir.array<?xf32>>>
+    fir.has_value %2 : !fir.box<!fir.ptr<!fir.array<?xf32>>>
+  }
+  func.func @_QQmain() {
+    cuf.sync_descriptor @_QMdevptrEdev_ptr
+    return
+  }
+}
+
+// CHECK-LABEL: func.func @_QQmain()
+// CHECK: %[[HOST_ADDR:.*]] = fir.address_of(@_QMdevptrEdev_ptr) : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>
+// CHECK: %[[HOST_ADDR_PTR:.*]] = fir.convert %[[HOST_ADDR]] : (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>) -> !fir.llvm_ptr<i8>
+// CHECK: fir.call @_FortranACUFSyncGlobalDescriptor(%[[HOST_ADDR_PTR]], %{{.*}}, %{{.*}}) : (!fir.llvm_ptr<i8>, !fir.ref<i8>, i32)
diff --git a/flang/test/Fir/basic-program.fir b/flang/test/Fir/basic-program.fir
index d278800..29a0f66 100644
--- a/flang/test/Fir/basic-program.fir
+++ b/flang/test/Fir/basic-program.fir
@@ -37,15 +37,28 @@ func.func @_QQmain() {
 // PASSES-NEXT: Pipeline Collection : ['fir.global', 'func.func', 'omp.declare_reduction', 'omp.private']
 // PASSES-NEXT: 'fir.global' Pipeline
 // PASSES-NEXT:    OptimizedBufferization
+// PASSES-NEXT:    InlineHLFIRAssign
 // PASSES-NEXT: 'func.func' Pipeline
 // PASSES-NEXT:    OptimizedBufferization
+// PASSES-NEXT:    InlineHLFIRAssign
 // PASSES-NEXT: 'omp.declare_reduction' Pipeline
 // PASSES-NEXT:    OptimizedBufferization
+// PASSES-NEXT:    InlineHLFIRAssign
 // PASSES-NEXT: 'omp.private' Pipeline
 // PASSES-NEXT:    OptimizedBufferization
+// PASSES-NEXT:    InlineHLFIRAssign
 // PASSES-NEXT:   LowerHLFIROrderedAssignments
 // PASSES-NEXT:   LowerHLFIRIntrinsics
 // PASSES-NEXT:   BufferizeHLFIR
+// PASSES-NEXT: Pipeline Collection : ['fir.global', 'func.func', 'omp.declare_reduction', 'omp.private']
+// PASSES-NEXT:   'fir.global' Pipeline
+// PASSES-NEXT:     InlineHLFIRAssign
+// PASSES-NEXT:   'func.func' Pipeline
+// PASSES-NEXT:     InlineHLFIRAssign
+// PASSES-NEXT:   'omp.declare_reduction' Pipeline
+// PASSES-NEXT:     InlineHLFIRAssign
+// PASSES-NEXT:   'omp.private' Pipeline
+// PASSES-NEXT:     InlineHLFIRAssign
 // PASSES-NEXT:   ConvertHLFIRtoFIR
 // PASSES-NEXT:   LowerWorkshare
 // PASSES-NEXT:   CSE
diff --git a/flang/test/Fir/convert-fold.fir b/flang/test/Fir/convert-fold.fir
index ebb6c8d..fb30e63 100644
--- a/flang/test/Fir/convert-fold.fir
+++ b/flang/test/Fir/convert-fold.fir
@@ -35,3 +35,12 @@ func.func @ctest() -> index {
   // CHECK-NEXT: return %{{.*}} : index
   return %2 : index
 }
+
+// CHECK-LABEL:   func.func @ptrtest(
+// CHECK-SAME:                       %[[VAL_0:.*]]: !fir.heap<!fir.array<2xf32>>) -> !fir.heap<!fir.array<2xf32>> {
+func.func @ptrtest(%0 : !fir.heap<!fir.array<2xf32>>) -> !fir.heap<!fir.array<2xf32>> {
+  %1 = fir.convert %0 : (!fir.heap<!fir.array<2xf32>>) -> !fir.ref<!fir.array<2xf32>>
+  %2 = fir.convert %1 : (!fir.ref<!fir.array<2xf32>>) -> !fir.heap<!fir.array<2xf32>>
+// CHECK:           return %[[VAL_0]] : !fir.heap<!fir.array<2xf32>>
+  return %2 : !fir.heap<!fir.array<2xf32>>
+}
diff --git a/flang/test/HLFIR/elemental-codegen.fir b/flang/test/HLFIR/elemental-codegen.fir
index 3c33bf8..0d5f343 100644
--- a/flang/test/HLFIR/elemental-codegen.fir
+++ b/flang/test/HLFIR/elemental-codegen.fir
@@ -192,7 +192,7 @@ func.func @test_polymorphic(%arg0: !fir.class<!fir.type<_QMtypesTt>> {fir.bindc_
 // CHECK:           %[[VAL_35:.*]] = fir.absent !fir.box<none>
 // CHECK:           %[[VAL_36:.*]] = fir.convert %[[VAL_4]] : (!fir.ref<!fir.class<!fir.heap<!fir.array<?x?x!fir.type<_QMtypesTt>>>>>) -> !fir.ref<!fir.box<none>>
 // CHECK:           %[[VAL_37:.*]] = fir.convert %[[VAL_31]] : (!fir.ref<!fir.char<1,{{.*}}>>) -> !fir.ref<i8>
-// CHECK:           %[[VAL_38:.*]] = fir.call @_FortranAAllocatableAllocate(%[[VAL_36]], %{{.*}}, %[[VAL_34]], %[[VAL_35]], %[[VAL_37]], %[[VAL_33]]) : (!fir.ref<!fir.box<none>>, i64, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
+// CHECK:           %[[VAL_38:.*]] = fir.call @_FortranAAllocatableAllocate(%[[VAL_36]], %[[VAL_34]], %[[VAL_35]], %[[VAL_37]], %[[VAL_33]]) : (!fir.ref<!fir.box<none>>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
 // CHECK:           %[[VAL_39:.*]] = fir.load %[[VAL_13]]#0 : !fir.ref<!fir.class<!fir.heap<!fir.array<?x?x!fir.type<_QMtypesTt>>>>>
 // CHECK:           %[[VAL_40:.*]] = arith.constant 1 : index
 // CHECK:           fir.do_loop %[[VAL_41:.*]] = %[[VAL_40]] to %[[EX1]] step %[[VAL_40]] unordered {
@@ -276,7 +276,7 @@ func.func @test_polymorphic_expr(%arg0: !fir.class<!fir.type<_QMtypesTt>> {fir.b
 // CHECK:           %[[VAL_36:.*]] = fir.absent !fir.box<none>
 // CHECK:           %[[VAL_37:.*]] = fir.convert %[[VAL_5]] : (!fir.ref<!fir.class<!fir.heap<!fir.array<?x?x!fir.type<_QMtypesTt>>>>>) -> !fir.ref<!fir.box<none>>
 // CHECK:           %[[VAL_38:.*]] = fir.convert %[[VAL_32]] : (!fir.ref<!fir.char<1,{{.*}}>>) -> !fir.ref<i8>
-// CHECK:           %[[VAL_39:.*]] = fir.call @_FortranAAllocatableAllocate(%[[VAL_37]], %{{.*}}, %[[VAL_35]], %[[VAL_36]], %[[VAL_38]], %[[VAL_34]]) : (!fir.ref<!fir.box<none>>, i64, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
+// CHECK:           %[[VAL_39:.*]] = fir.call @_FortranAAllocatableAllocate(%[[VAL_37]], %[[VAL_35]], %[[VAL_36]], %[[VAL_38]], %[[VAL_34]]) : (!fir.ref<!fir.box<none>>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
 // CHECK:           %[[VAL_40:.*]] = fir.load %[[VAL_14]]#0 : !fir.ref<!fir.class<!fir.heap<!fir.array<?x?x!fir.type<_QMtypesTt>>>>>
 // CHECK:           %[[VAL_41:.*]] = arith.constant 1 : index
 // CHECK:           fir.do_loop %[[VAL_42:.*]] = %[[VAL_41]] to %[[VAL_3]] step %[[VAL_41]] unordered {
@@ -329,7 +329,7 @@ func.func @test_polymorphic_expr(%arg0: !fir.class<!fir.type<_QMtypesTt>> {fir.b
 // CHECK:           %[[VAL_85:.*]] = fir.absent !fir.box<none>
 // CHECK:           %[[VAL_86:.*]] = fir.convert %[[VAL_4]] : (!fir.ref<!fir.class<!fir.heap<!fir.array<?x?x!fir.type<_QMtypesTt>>>>>) -> !fir.ref<!fir.box<none>>
 // CHECK:           %[[VAL_87:.*]] = fir.convert %[[VAL_81]] : (!fir.ref<!fir.char<1,{{.*}}>>) -> !fir.ref<i8>
-// CHECK:           %[[VAL_88:.*]] = fir.call @_FortranAAllocatableAllocate(%[[VAL_86]], %{{.*}}, %[[VAL_84]], %[[VAL_85]], %[[VAL_87]], %[[VAL_83]]) : (!fir.ref<!fir.box<none>>, i64, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
+// CHECK:           %[[VAL_88:.*]] = fir.call @_FortranAAllocatableAllocate(%[[VAL_86]], %[[VAL_84]], %[[VAL_85]], %[[VAL_87]], %[[VAL_83]]) : (!fir.ref<!fir.box<none>>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
 // CHECK:           %[[VAL_89:.*]] = fir.load %[[VAL_63]]#0 : !fir.ref<!fir.class<!fir.heap<!fir.array<?x?x!fir.type<_QMtypesTt>>>>>
 // CHECK:           %[[VAL_90:.*]] = arith.constant 1 : index
 // CHECK:           fir.do_loop %[[VAL_91:.*]] = %[[VAL_90]] to %[[VAL_3]] step %[[VAL_90]] unordered {
diff --git a/flang/test/HLFIR/opt-variable-assign.fir b/flang/test/HLFIR/inline-hlfir-assign.fir
index 17124fa..f834e79 100644
--- a/flang/test/HLFIR/opt-variable-assign.fir
+++ b/flang/test/HLFIR/inline-hlfir-assign.fir
@@ -1,6 +1,5 @@
-// Test optimized bufferization for hlfir.assign of arrays
-// variables:
-// RUN: fir-opt --opt-bufferization %s | FileCheck %s
+// Test inlining of hlfir.assign of arrays:
+// RUN: fir-opt --inline-hlfir-assign %s | FileCheck %s
 
 // The two assigns come from the following source forms:
 //   y(:,:) = x(:,:)
@@ -302,3 +301,55 @@ func.func @_QPtest7(%arg0: !fir.ref<!fir.box<!fir.heap<!fir.array<?x?xf32>>>> {f
 // CHECK-NOT:       hlfir.assign
 // CHECK:           hlfir.assign %{{.*}} to %{{.*}} : f32, !fir.ref<f32>
 // CHECK-NOT:       hlfir.assign
+
+
+// Test that VAR = EXPR assignment is inlined:
+// subroutine test_expr_rhs(p1, p2)
+//   logical, pointer :: p1(:), p2(:)
+//   p1 = (p2)
+// end subroutine test_expr_rhs
+func.func @_QPtest_expr_rhs(%arg0: !fir.ref<!fir.box<!fir.ptr<!fir.array<?x!fir.logical<4>>>>> {fir.bindc_name = "p1"}, %arg1: !fir.ref<!fir.box<!fir.ptr<!fir.array<?x!fir.logical<4>>>>> {fir.bindc_name = "p2"}) {
+  %c1 = arith.constant 1 : index
+  %c0 = arith.constant 0 : index
+  %0 = fir.dummy_scope : !fir.dscope
+  %1:2 = hlfir.declare %arg0 dummy_scope %0 {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFtest_expr_rhsEp1"} : (!fir.ref<!fir.box<!fir.ptr<!fir.array<?x!fir.logical<4>>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.ptr<!fir.array<?x!fir.logical<4>>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.array<?x!fir.logical<4>>>>>)
+  %2:2 = hlfir.declare %arg1 dummy_scope %0 {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFtest_expr_rhsEp2"} : (!fir.ref<!fir.box<!fir.ptr<!fir.array<?x!fir.logical<4>>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.ptr<!fir.array<?x!fir.logical<4>>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.array<?x!fir.logical<4>>>>>)
+  %3 = fir.load %2#0 : !fir.ref<!fir.box<!fir.ptr<!fir.array<?x!fir.logical<4>>>>>
+  %4:3 = fir.box_dims %3, %c0 : (!fir.box<!fir.ptr<!fir.array<?x!fir.logical<4>>>>, index) -> (index, index, index)
+  %5 = fir.shape %4#1 : (index) -> !fir.shape<1>
+  %6 = hlfir.elemental %5 unordered : (!fir.shape<1>) -> !hlfir.expr<?x!fir.logical<4>> {
+  ^bb0(%arg2: index):
+    %8 = arith.subi %4#0, %c1 : index
+    %9 = arith.addi %arg2, %8 : index
+    %10 = hlfir.designate %3 (%9)  : (!fir.box<!fir.ptr<!fir.array<?x!fir.logical<4>>>>, index) -> !fir.ref<!fir.logical<4>>
+    %11 = fir.load %10 : !fir.ref<!fir.logical<4>>
+    %12 = hlfir.no_reassoc %11 : !fir.logical<4>
+    hlfir.yield_element %12 : !fir.logical<4>
+  }
+  %7 = fir.load %1#0 : !fir.ref<!fir.box<!fir.ptr<!fir.array<?x!fir.logical<4>>>>>
+  hlfir.assign %6 to %7 : !hlfir.expr<?x!fir.logical<4>>, !fir.box<!fir.ptr<!fir.array<?x!fir.logical<4>>>>
+  hlfir.destroy %6 : !hlfir.expr<?x!fir.logical<4>>
+  return
+}
+// CHECK-LABEL:   func.func @_QPtest_expr_rhs(
+// CHECK-SAME:                                %[[VAL_0:.*]]: !fir.ref<!fir.box<!fir.ptr<!fir.array<?x!fir.logical<4>>>>> {fir.bindc_name = "p1"},
+// CHECK-SAME:                                %[[VAL_1:.*]]: !fir.ref<!fir.box<!fir.ptr<!fir.array<?x!fir.logical<4>>>>> {fir.bindc_name = "p2"}) {
+// CHECK:           %[[VAL_2:.*]] = arith.constant 1 : index
+// CHECK:           %[[VAL_3:.*]] = arith.constant 0 : index
+// CHECK:           %[[VAL_4:.*]] = fir.dummy_scope : !fir.dscope
+// CHECK:           %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_4]] {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFtest_expr_rhsEp1"} : (!fir.ref<!fir.box<!fir.ptr<!fir.array<?x!fir.logical<4>>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.ptr<!fir.array<?x!fir.logical<4>>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.array<?x!fir.logical<4>>>>>)
+// CHECK:           %[[VAL_10:.*]] = hlfir.elemental %{{.*}} unordered : (!fir.shape<1>) -> !hlfir.expr<?x!fir.logical<4>> {
+// CHECK:           }
+// CHECK:           %[[VAL_17:.*]] = fir.load %[[VAL_5]]#0 : !fir.ref<!fir.box<!fir.ptr<!fir.array<?x!fir.logical<4>>>>>
+// CHECK:           %[[VAL_18:.*]]:3 = fir.box_dims %[[VAL_17]], %[[VAL_3]] : (!fir.box<!fir.ptr<!fir.array<?x!fir.logical<4>>>>, index) -> (index, index, index)
+// CHECK:           fir.do_loop %[[VAL_19:.*]] = %[[VAL_2]] to %[[VAL_18]]#1 step %[[VAL_2]] unordered {
+// CHECK:             %[[VAL_20:.*]] = hlfir.apply %[[VAL_10]], %[[VAL_19]] : (!hlfir.expr<?x!fir.logical<4>>, index) -> !fir.logical<4>
+// CHECK:             %[[VAL_21:.*]]:3 = fir.box_dims %[[VAL_17]], %[[VAL_3]] : (!fir.box<!fir.ptr<!fir.array<?x!fir.logical<4>>>>, index) -> (index, index, index)
+// CHECK:             %[[VAL_22:.*]] = arith.subi %[[VAL_21]]#0, %[[VAL_2]] : index
+// CHECK:             %[[VAL_23:.*]] = arith.addi %[[VAL_19]], %[[VAL_22]] : index
+// CHECK:             %[[VAL_24:.*]] = hlfir.designate %[[VAL_17]] (%[[VAL_23]])  : (!fir.box<!fir.ptr<!fir.array<?x!fir.logical<4>>>>, index) -> !fir.ref<!fir.logical<4>>
+// CHECK:             hlfir.assign %[[VAL_20]] to %[[VAL_24]] : !fir.logical<4>, !fir.ref<!fir.logical<4>>
+// CHECK:           }
+// CHECK:           hlfir.destroy %[[VAL_10]] : !hlfir.expr<?x!fir.logical<4>>
+// CHECK:           return
+// CHECK:         }
diff --git a/flang/test/HLFIR/maxloc-elemental.fir b/flang/test/HLFIR/maxloc-elemental.fir
index 497a58c..c9210a5 100644
--- a/flang/test/HLFIR/maxloc-elemental.fir
+++ b/flang/test/HLFIR/maxloc-elemental.fir
@@ -68,13 +68,7 @@ func.func @_QPtest(%arg0: !fir.box<!fir.array<?xi32>> {fir.bindc_name = "array"}
 // CHECK-NEXT:      }
 // CHECK-NEXT:      fir.result %[[V18]] : i32
 // CHECK-NEXT:    }
-// CHECK-NEXT:    %[[BD:.*]]:3 = fir.box_dims %[[V2]]#0, %c0 : (!fir.box<!fir.array<?xi32>>, index) -> (index, index, index)
-// CHECK-NEXT:    fir.do_loop %arg3 = %c1 to %[[BD]]#1 step %c1 unordered {
-// CHECK-NEXT:      %[[V13:.*]] = hlfir.designate %[[RES]] (%arg3)  : (!fir.ref<!fir.array<1xi32>>, index) -> !fir.ref<i32>
-// CHECK-NEXT:      %[[V14:.*]] = fir.load %[[V13]] : !fir.ref<i32>
-// CHECK-NEXT:      %[[V15:.*]] = hlfir.designate %[[V2]]#0 (%arg3)  : (!fir.box<!fir.array<?xi32>>, index) -> !fir.ref<i32>
-// CHECK-NEXT:      hlfir.assign %[[V14]] to %[[V15]] : i32, !fir.ref<i32>
-// CHECK-NEXT:    }
+// CHECK-NEXT:    hlfir.assign %[[RES]] to %[[V2]]#0 : !fir.ref<!fir.array<1xi32>>, !fir.box<!fir.array<?xi32>>
 // CHECK-NEXT:    return
 // CHECK-NEXT:  }
 
diff --git a/flang/test/HLFIR/minloc-elemental.fir b/flang/test/HLFIR/minloc-elemental.fir
index 5fa482a..9453a33 100644
--- a/flang/test/HLFIR/minloc-elemental.fir
+++ b/flang/test/HLFIR/minloc-elemental.fir
@@ -68,13 +68,7 @@ func.func @_QPtest(%arg0: !fir.box<!fir.array<?xi32>> {fir.bindc_name = "array"}
 // CHECK-NEXT:      }
 // CHECK-NEXT:      fir.result %[[V18]] : i32
 // CHECK-NEXT:    }
-// CHECK-NEXT:    %[[BD:.*]]:3 = fir.box_dims %[[V2]]#0, %c0 : (!fir.box<!fir.array<?xi32>>, index) -> (index, index, index)
-// CHECK-NEXT:    fir.do_loop %arg3 = %c1 to %[[BD]]#1 step %c1 unordered {
-// CHECK-NEXT:      %[[V13:.*]] = hlfir.designate %[[RES]] (%arg3)  : (!fir.ref<!fir.array<1xi32>>, index) -> !fir.ref<i32>
-// CHECK-NEXT:      %[[V14:.*]] = fir.load %[[V13]] : !fir.ref<i32>
-// CHECK-NEXT:      %[[V15:.*]] = hlfir.designate %[[V2]]#0 (%arg3)  : (!fir.box<!fir.array<?xi32>>, index) -> !fir.ref<i32>
-// CHECK-NEXT:      hlfir.assign %[[V14]] to %[[V15]] : i32, !fir.ref<i32>
-// CHECK-NEXT:    }
+// CHECK-NEXT:    hlfir.assign %[[RES]] to %[[V2]]#0 : !fir.ref<!fir.array<1xi32>>, !fir.box<!fir.array<?xi32>>
 // CHECK-NEXT:    return
 // CHECK-NEXT:  }
 
@@ -147,13 +141,7 @@ func.func @_QPtest_kind2(%arg0: !fir.box<!fir.array<?xi32>> {fir.bindc_name = "a
 // CHECK-NEXT:      }
 // CHECK-NEXT:      fir.result %[[V18]] : i32
 // CHECK-NEXT:    }
-// CHECK-NEXT:    %[[BD:.*]]:3 = fir.box_dims %[[V2]]#0, %c0 : (!fir.box<!fir.array<?xi16>>, index) -> (index, index, index)
-// CHECK-NEXT:    fir.do_loop %arg3 = %c1 to %[[BD]]#1 step %c1 unordered {
-// CHECK-NEXT:      %[[V13:.*]] = hlfir.designate %[[RES]] (%arg3)  : (!fir.ref<!fir.array<1xi16>>, index) -> !fir.ref<i16>
-// CHECK-NEXT:      %[[V14:.*]] = fir.load %[[V13]] : !fir.ref<i16>
-// CHECK-NEXT:      %[[V15:.*]] = hlfir.designate %[[V2]]#0 (%arg3)  : (!fir.box<!fir.array<?xi16>>, index) -> !fir.ref<i16>
-// CHECK-NEXT:      hlfir.assign %[[V14]] to %[[V15]] : i16, !fir.ref<i16>
-// CHECK-NEXT:    }
+// CHECK-NEXT:    hlfir.assign %[[RES]] to %[[V2]]#0 : !fir.ref<!fir.array<1xi16>>, !fir.box<!fir.array<?xi16>>
 // CHECK-NEXT:    return
 
 
diff --git a/flang/test/HLFIR/opt-bufferization-eval_in_mem.fir b/flang/test/HLFIR/opt-bufferization-eval_in_mem.fir
index 984c0bc..ce66907 100644
--- a/flang/test/HLFIR/opt-bufferization-eval_in_mem.fir
+++ b/flang/test/HLFIR/opt-bufferization-eval_in_mem.fir
@@ -48,7 +48,6 @@ func.func @_QPnegative_test_is_target(%arg0: !fir.ref<!fir.array<10xf32>> {fir.b
 }
 // CHECK-LABEL: func.func @_QPnegative_test_is_target(
 // CHECK-SAME:                                        %[[VAL_0:.*]]: !fir.ref<!fir.array<10xf32>> {fir.bindc_name = "x", fir.target}) {
-// CHECK:         %[[VAL_1:.*]] = arith.constant 1 : index
 // CHECK:         %[[VAL_2:.*]] = arith.constant false
 // CHECK:         %[[VAL_3:.*]] = arith.constant 10 : index
 // CHECK:         %[[VAL_4:.*]] = fir.alloca !fir.array<10xf32>
@@ -57,11 +56,7 @@ func.func @_QPnegative_test_is_target(%arg0: !fir.ref<!fir.array<10xf32>> {fir.b
 // CHECK:         %[[VAL_9:.*]] = fir.call @_QPfoo() fastmath<contract> : () -> !fir.array<10xf32>
 // CHECK:         fir.save_result %[[VAL_9]] to %[[VAL_8]]#1{{.*}}
 // CHECK:         %[[VAL_10:.*]] = hlfir.as_expr %[[VAL_8]]#0 move %[[VAL_2]] : (!fir.ref<!fir.array<10xf32>>, i1) -> !hlfir.expr<10xf32>
-// CHECK:         fir.do_loop %[[VAL_11:.*]] = %[[VAL_1]] to %[[VAL_3]] step %[[VAL_1]] unordered {
-// CHECK:           %[[VAL_12:.*]] = hlfir.apply %[[VAL_10]], %[[VAL_11]] : (!hlfir.expr<10xf32>, index) -> f32
-// CHECK:           %[[VAL_13:.*]] = hlfir.designate %[[VAL_7]]#0 (%[[VAL_11]])  : (!fir.ref<!fir.array<10xf32>>, index) -> !fir.ref<f32>
-// CHECK:           hlfir.assign %[[VAL_12]] to %[[VAL_13]] : f32, !fir.ref<f32>
-// CHECK:         }
+// CHECK:         hlfir.assign %[[VAL_10]] to %[[VAL_7]]#0 : !hlfir.expr<10xf32>, !fir.ref<!fir.array<10xf32>>
 // CHECK:         hlfir.destroy %[[VAL_10]] : !hlfir.expr<10xf32>
 // CHECK:         return
 // CHECK:       }
diff --git a/flang/test/HLFIR/opt-bufferization-same-ptr-elemental.fir b/flang/test/HLFIR/opt-bufferization-same-ptr-elemental.fir
new file mode 100644
index 0000000..ae91930
--- /dev/null
+++ b/flang/test/HLFIR/opt-bufferization-same-ptr-elemental.fir
@@ -0,0 +1,69 @@
+// RUN: fir-opt --opt-bufferization %s | FileCheck %s
+
+// Verify that the hlfir.assign of hlfir.elemental is optimized
+// into element-per-element assignment:
+// subroutine test1(p)
+//   real, pointer :: p(:)
+//   p = p + 1.0
+// end subroutine test1
+
+func.func @_QPtest1(%arg0: !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>> {fir.bindc_name = "p"}) {
+  %c1 = arith.constant 1 : index
+  %c0 = arith.constant 0 : index
+  %cst = arith.constant 1.000000e+00 : f32
+  %0 = fir.dummy_scope : !fir.dscope
+  %1:2 = hlfir.declare %arg0 dummy_scope %0 {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFtest1Ep"} : (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>)
+  %2 = fir.load %1#0 : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>
+  %3:3 = fir.box_dims %2, %c0 : (!fir.box<!fir.ptr<!fir.array<?xf32>>>, index) -> (index, index, index)
+  %4 = fir.shape %3#1 : (index) -> !fir.shape<1>
+  %5 = hlfir.elemental %4 unordered : (!fir.shape<1>) -> !hlfir.expr<?xf32> {
+  ^bb0(%arg1: index):
+    %6 = arith.subi %3#0, %c1 : index
+    %7 = arith.addi %arg1, %6 : index
+    %8 = hlfir.designate %2 (%7)  : (!fir.box<!fir.ptr<!fir.array<?xf32>>>, index) -> !fir.ref<f32>
+    %9 = fir.load %8 : !fir.ref<f32>
+    %10 = arith.addf %9, %cst fastmath<contract> : f32
+    hlfir.yield_element %10 : f32
+  }
+  hlfir.assign %5 to %2 : !hlfir.expr<?xf32>, !fir.box<!fir.ptr<!fir.array<?xf32>>>
+  hlfir.destroy %5 : !hlfir.expr<?xf32>
+  return
+}
+// CHECK-LABEL:   func.func @_QPtest1(
+// CHECK-NOT:         hlfir.assign
+// CHECK:             hlfir.assign %{{.*}} to %{{.*}} : f32, !fir.ref<f32>
+// CHECK-NOT:         hlfir.assign
+
+// subroutine test2(p)
+//   real, pointer :: p(:,:)
+//   p = p + 1.0
+// end subroutine test2
+func.func @_QPtest2(%arg0: !fir.ref<!fir.box<!fir.ptr<!fir.array<?x?xf32>>>> {fir.bindc_name = "p"}) {
+  %c1 = arith.constant 1 : index
+  %c0 = arith.constant 0 : index
+  %cst = arith.constant 1.000000e+00 : f32
+  %0 = fir.dummy_scope : !fir.dscope
+  %1:2 = hlfir.declare %arg0 dummy_scope %0 {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFtest2Ep"} : (!fir.ref<!fir.box<!fir.ptr<!fir.array<?x?xf32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.ptr<!fir.array<?x?xf32>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.array<?x?xf32>>>>)
+  %2 = fir.load %1#0 : !fir.ref<!fir.box<!fir.ptr<!fir.array<?x?xf32>>>>
+  %3:3 = fir.box_dims %2, %c0 : (!fir.box<!fir.ptr<!fir.array<?x?xf32>>>, index) -> (index, index, index)
+  %4:3 = fir.box_dims %2, %c1 : (!fir.box<!fir.ptr<!fir.array<?x?xf32>>>, index) -> (index, index, index)
+  %5 = fir.shape %3#1, %4#1 : (index, index) -> !fir.shape<2>
+  %6 = hlfir.elemental %5 unordered : (!fir.shape<2>) -> !hlfir.expr<?x?xf32> {
+  ^bb0(%arg1: index, %arg2: index):
+    %7 = arith.subi %3#0, %c1 : index
+    %8 = arith.addi %arg1, %7 : index
+    %9 = arith.subi %4#0, %c1 : index
+    %10 = arith.addi %arg2, %9 : index
+    %11 = hlfir.designate %2 (%8, %10)  : (!fir.box<!fir.ptr<!fir.array<?x?xf32>>>, index, index) -> !fir.ref<f32>
+    %12 = fir.load %11 : !fir.ref<f32>
+    %13 = arith.addf %12, %cst fastmath<contract> : f32
+    hlfir.yield_element %13 : f32
+  }
+  hlfir.assign %6 to %2 : !hlfir.expr<?x?xf32>, !fir.box<!fir.ptr<!fir.array<?x?xf32>>>
+  hlfir.destroy %6 : !hlfir.expr<?x?xf32>
+  return
+}
+// CHECK-LABEL:   func.func @_QPtest2(
+// CHECK-NOT:         hlfir.assign
+// CHECK:             hlfir.assign %{{.*}} to %{{.*}} : f32, !fir.ref<f32>
+// CHECK-NOT:         hlfir.assign
diff --git a/flang/test/HLFIR/opt-bufferization.fir b/flang/test/HLFIR/opt-bufferization.fir
index 87afb3c..faa8f4b 100644
--- a/flang/test/HLFIR/opt-bufferization.fir
+++ b/flang/test/HLFIR/opt-bufferization.fir
@@ -796,45 +796,3 @@ func.func @_QPddx(%arg0: !fir.box<!fir.array<?x?xf64>> {fir.bindc_name = "array"
 // CHECK:           %[[VAL_61:.*]] = fir.load %[[VAL_26]]#1 : !fir.ref<!fir.array<?x?xf64>>
 // CHECK:           return %[[VAL_61]] : !fir.array<?x?xf64>
 // CHECK:         }
-
-// `hlfir.expr` bufferization (when the expresion is not the result of
-// `hlfir.elemental`)
-func.func @_QPfoo() {
-  %c1 = arith.constant 1 : index
-  %0 = fir.alloca !fir.array<1xi32> {bindc_name = "iavs", uniq_name = "_QFfooEiavs"}
-  %1 = fir.shape %c1 : (index) -> !fir.shape<1>
-  %2:2 = hlfir.declare %0(%1) {uniq_name = "_QFfooEiavs"} : (!fir.ref<!fir.array<1xi32>>, !fir.shape<1>) -> (!fir.ref<!fir.array<1xi32>>, !fir.ref<!fir.array<1xi32>>)
-  %3 = fir.alloca i32 {bindc_name = "iv", uniq_name = "_QFfooEiv"}
-  %4:2 = hlfir.declare %3 {uniq_name = "_QFfooEiv"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
-  %c10_i32 = arith.constant 10 : i32
-  %6 = fir.convert %c10_i32 : (i32) -> index
-  %7 = fir.convert %c1 : (index) -> i32
-  %8:2 = fir.do_loop %arg0 = %c1 to %6 step %c1 iter_args(%arg1 = %7) -> (index, i32) {
-    fir.store %arg1 to %4#1 : !fir.ref<i32>
-    %9 = fir.allocmem !fir.array<1xi32> {bindc_name = ".tmp.arrayctor", uniq_name = ""}
-    %10 = fir.shape %c1 : (index) -> !fir.shape<1>
-    %11:2 = hlfir.declare %9(%10) {uniq_name = ".tmp.arrayctor"} : (!fir.heap<!fir.array<1xi32>>, !fir.shape<1>) -> (!fir.heap<!fir.array<1xi32>>, !fir.heap<!fir.array<1xi32>>)
-    %12 = fir.load %4#0 : !fir.ref<i32>
-    %13 = hlfir.designate %11#0 (%c1)  : (!fir.heap<!fir.array<1xi32>>, index) -> !fir.ref<i32>
-    hlfir.assign %12 to %13 : i32, !fir.ref<i32>
-    %true = arith.constant true
-    %14 = hlfir.as_expr %11#0 move %true : (!fir.heap<!fir.array<1xi32>>, i1) -> !hlfir.expr<1xi32>
-    hlfir.assign %14 to %2#0 : !hlfir.expr<1xi32>, !fir.ref<!fir.array<1xi32>>
-    hlfir.destroy %14 : !hlfir.expr<1xi32>
-    %15 = arith.addi %arg0, %c1 : index
-    %16 = fir.convert %c1 : (index) -> i32
-    %17 = fir.load %4#1 : !fir.ref<i32>
-    %18 = arith.addi %17, %16 : i32
-    fir.result %15, %18 : index, i32
-  }
-  fir.store %8#1 to %4#1 : !fir.ref<i32>
-  return
-}
-
-// CHECK-LABEL:   func.func @_QPfoo
-// CHECK:           %[[C1:.*]] = arith.constant 1 : index
-// CHECK:           fir.do_loop {{.*}} {
-// CHECK-NOT:         hlfir.assign %{{.*}} to %{{.*}}#0 : !hlfir.expr<1xi32>, !fir.ref<!fir.array<1xi32>>
-// CHECK:             fir.do_loop %{{.*}} = %[[C1]] to %[[C1]] step %[[C1]] unordered {
-// CHECK:             }
-// CHECK:           }
diff --git a/flang/test/Integration/debug-116525.f90 b/flang/test/Integration/debug-116525.f90
new file mode 100644
index 0000000..1916a34
--- /dev/null
+++ b/flang/test/Integration/debug-116525.f90
@@ -0,0 +1,12 @@
+! RUN: %flang_fc1 -fopenmp -emit-llvm -debug-info-kind=standalone %s -o -
+
+! Test that this does not cause build failure.
+function s(x)
+    character(len=2) :: x, s, ss
+
+    s = x
+
+    entry ss()
+
+end function s
+
diff --git a/flang/test/Integration/debug-local-var-2.f90 b/flang/test/Integration/debug-local-var-2.f90
index 5a675cb..fe4144a 100644
--- a/flang/test/Integration/debug-local-var-2.f90
+++ b/flang/test/Integration/debug-local-var-2.f90
@@ -107,4 +107,4 @@ contains
   end function
 end program
 
-LINEONLY-NOT: DILocalVariable
+! LINEONLY-NOT: DILocalVariable
diff --git a/flang/test/Lower/CUDA/cuda-allocatable.cuf b/flang/test/Lower/CUDA/cuda-allocatable.cuf
index 6479425..8b287f8 100644
--- a/flang/test/Lower/CUDA/cuda-allocatable.cuf
+++ b/flang/test/Lower/CUDA/cuda-allocatable.cuf
@@ -196,3 +196,30 @@ end subroutine
 ! CHECK: %[[BOX:.*]] = fir.load %[[A]]#1 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>
 ! CHECK: %[[BOXADDR:.*]] = fir.box_addr %[[BOX]] : (!fir.box<!fir.heap<!fir.array<?xf32>>>) -> !fir.heap<!fir.array<?xf32>>
 ! CHECK: fir.freemem %[[BOXADDR]] : !fir.heap<!fir.array<?xf32>>
+
+subroutine setpinned()
+  integer, allocatable :: i(:)
+  logical :: plog
+  allocate(i(10), pinned=plog)
+end
+
+! CHECK-LABEL: func.func @_QPsetpinned()  
+! CHECK: %[[PLOG:.*]] = fir.alloca !fir.logical<4> {bindc_name = "plog", uniq_name = "_QFsetpinnedEplog"}
+! CHECK: %[[PLOG_DECL:.*]]:2 = hlfir.declare %[[PLOG]] {uniq_name = "_QFsetpinnedEplog"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>)
+! CHECK: %[[FALSE:.*]] = arith.constant false
+! CHECK: %[[FLASE_CONV:.*]] = fir.convert %[[FALSE]] : (i1) -> !fir.logical<4>
+! CHECK: fir.store %[[FLASE_CONV]] to %[[PLOG_DECL]]#1 : !fir.ref<!fir.logical<4>>
+
+subroutine setpinnedpointer()
+  integer, pointer :: i(:)
+  logical :: plog
+  allocate(i(10), pinned=plog)
+end
+
+! CHECK-LABEL: func.func @_QPsetpinnedpointer()
+! CHECK: %[[PLOG:.*]] = fir.alloca !fir.logical<4> {bindc_name = "plog", uniq_name = "_QFsetpinnedpointerEplog"}
+! CHECK: %[[PLOG_DECL:.*]]:2 = hlfir.declare %[[PLOG]] {uniq_name = "_QFsetpinnedpointerEplog"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>)
+! CHECK: fir.call @_FortranAPointerAllocate
+! CHECK: %[[FALSE:.*]] = arith.constant false
+! CHECK: %[[FLASE_CONV:.*]] = fir.convert %[[FALSE]] : (i1) -> !fir.logical<4>
+! CHECK: fir.store %[[FLASE_CONV]] to %[[PLOG_DECL]]#1 : !fir.ref<!fir.logical<4>>
diff --git a/flang/test/Lower/CUDA/cuda-cdevloc.cuf b/flang/test/Lower/CUDA/cuda-cdevloc.cuf
new file mode 100644
index 0000000..a7149020
--- /dev/null
+++ b/flang/test/Lower/CUDA/cuda-cdevloc.cuf
@@ -0,0 +1,21 @@
+! RUN: bbc -emit-hlfir -fcuda %s -o - | FileCheck %s
+
+attributes(global) subroutine testcdevloc(a)
+  use __fortran_builtins, only: c_devloc => __builtin_c_devloc
+  integer, device :: a(10)
+  print*, c_devloc(a(1))
+end
+
+! CHECK-LABEL: func.func @_QPtestcdevloc(
+! CHECK-SAME: %[[A_ARG:.*]]: !fir.ref<!fir.array<10xi32>> {cuf.data_attr = #cuf.cuda<device>, fir.bindc_name = "a"}) attributes {cuf.proc_attr = #cuf.cuda_proc<global>}
+! CHECK: %[[A:.*]]:2 = hlfir.declare %[[A_ARG]](%{{.*}}) dummy_scope %{{.*}} {data_attr = #cuf.cuda<device>, uniq_name = "_QFtestcdevlocEa"} : (!fir.ref<!fir.array<10xi32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>)
+! CHECK: %[[A1:.*]] = hlfir.designate %[[A]]#0 (%c1{{.*}})  : (!fir.ref<!fir.array<10xi32>>, index) -> !fir.ref<i32>
+! CHECK: %[[BOX:.*]] = fir.embox %[[A1]] : (!fir.ref<i32>) -> !fir.box<i32>
+! CHECK: %[[CDEVPTR:.*]] = fir.alloca !fir.type<_QM__fortran_builtinsT__builtin_c_devptr{cptr:!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>}>
+! CHECK: %[[FIELD_CPTR:.*]] = fir.field_index cptr, !fir.type<_QM__fortran_builtinsT__builtin_c_devptr{cptr:!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>}>
+! CHECK: %[[COORD_CPTR:.*]] = fir.coordinate_of %[[CDEVPTR]], %[[FIELD_CPTR]] : (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_devptr{cptr:!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>}>>, !fir.field) -> !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>>
+! CHECK: %[[FIELD_ADDRESS:.*]] = fir.field_index __address, !fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>
+! CHECK: %[[COORD_ADDRESS:.*]] = fir.coordinate_of %[[COORD_CPTR]], %[[FIELD_ADDRESS]] : (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>>, !fir.field) -> !fir.ref<i64>
+! CHECK: %[[BOX_ADDR:.*]] = fir.box_addr %[[BOX]] : (!fir.box<i32>) -> !fir.ref<i32>
+! CHECK: %[[ADDRESS_A1:.*]] = fir.convert %[[BOX_ADDR]] : (!fir.ref<i32>) -> i64
+! CHECK: fir.store %[[ADDRESS_A1]] to %[[COORD_ADDRESS]] : !fir.ref<i64>
diff --git a/flang/test/Lower/CUDA/cuda-pointer-sync.cuf b/flang/test/Lower/CUDA/cuda-pointer-sync.cuf
new file mode 100644
index 0000000..4c64f4b
--- /dev/null
+++ b/flang/test/Lower/CUDA/cuda-pointer-sync.cuf
@@ -0,0 +1,21 @@
+! RUN: bbc -emit-hlfir -fcuda %s -o - | FileCheck %s
+
+module devptr
+  real, device, pointer, dimension(:) :: dev_ptr
+end module
+
+use devptr
+real, device, target, dimension(4) :: a_dev
+a_dev = 42.0
+dev_ptr => a_dev
+
+dev_ptr => null()
+
+nullify(dev_ptr)
+end
+
+! CHECK: fir.global @_QMdevptrEdev_ptr {data_attr = #cuf.cuda<device>} : !fir.box<!fir.ptr<!fir.array<?xf32>>>
+! CHECK-LABEL: func.func @_QQmain()
+! CHECK: fir.embox
+! CHECK: fir.store
+! CHECK-COUNT-3: cuf.sync_descriptor @_QMdevptrEdev_ptr
diff --git a/flang/test/Lower/OpenACC/acc-declare.f90 b/flang/test/Lower/OpenACC/acc-declare.f90
index 9fe51a8..0066e71 100644
--- a/flang/test/Lower/OpenACC/acc-declare.f90
+++ b/flang/test/Lower/OpenACC/acc-declare.f90
@@ -469,6 +469,6 @@ contains
 end module
 
 ! CHECK-LABEL: func.func @_QMacc_declare_post_action_statPinit()
-! CHECK: fir.call @_FortranAAllocatableAllocate({{.*}}) fastmath<contract> {acc.declare_action = #acc.declare_action<postAlloc = @_QMacc_declare_post_action_statEx_acc_declare_update_desc_post_alloc>} : (!fir.ref<!fir.box<none>>, i64, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
+! CHECK: fir.call @_FortranAAllocatableAllocate({{.*}}) fastmath<contract> {acc.declare_action = #acc.declare_action<postAlloc = @_QMacc_declare_post_action_statEx_acc_declare_update_desc_post_alloc>} : (!fir.ref<!fir.box<none>>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
 ! CHECK: fir.if
-! CHECK: fir.call @_FortranAAllocatableAllocate({{.*}}) fastmath<contract> {acc.declare_action = #acc.declare_action<postAlloc = @_QMacc_declare_post_action_statEy_acc_declare_update_desc_post_alloc>} : (!fir.ref<!fir.box<none>>, i64, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
+! CHECK: fir.call @_FortranAAllocatableAllocate({{.*}}) fastmath<contract> {acc.declare_action = #acc.declare_action<postAlloc = @_QMacc_declare_post_action_statEy_acc_declare_update_desc_post_alloc>} : (!fir.ref<!fir.box<none>>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
diff --git a/flang/test/Lower/OpenMP/Todo/depend-clause-depobj.f90 b/flang/test/Lower/OpenMP/Todo/depend-clause-depobj.f90
index 3bc730f..4e98d77 100644
--- a/flang/test/Lower/OpenMP/Todo/depend-clause-depobj.f90
+++ b/flang/test/Lower/OpenMP/Todo/depend-clause-depobj.f90
@@ -1,7 +1,7 @@
 !RUN: %not_todo_cmd bbc -emit-hlfir -fopenmp -fopenmp-version=52 -o - %s 2>&1 | FileCheck %s
 !RUN: %not_todo_cmd %flang_fc1 -emit-hlfir -fopenmp -fopenmp-version=52 -o - %s 2>&1 | FileCheck %s
 
-!CHECK: not yet implemented: INOUTSET, MUTEXINOUTSET and DEPOBJ dependence-types
+!CHECK: not yet implemented: DEPOBJ dependence-type
 
 subroutine f00(x)
   integer :: x
diff --git a/flang/test/Lower/OpenMP/Todo/depend-clause-inoutset.f90 b/flang/test/Lower/OpenMP/Todo/depend-clause-inoutset.f90
deleted file mode 100644
index 160893f..0000000
--- a/flang/test/Lower/OpenMP/Todo/depend-clause-inoutset.f90
+++ /dev/null
@@ -1,11 +0,0 @@
-!RUN: %not_todo_cmd bbc -emit-hlfir -fopenmp -fopenmp-version=52 -o - %s 2>&1 | FileCheck %s
-!RUN: %not_todo_cmd %flang_fc1 -emit-hlfir -fopenmp -fopenmp-version=52 -o - %s 2>&1 | FileCheck %s
-
-!CHECK: not yet implemented: INOUTSET, MUTEXINOUTSET and DEPOBJ dependence-types
-subroutine f00(x)
-  integer :: x
-  !$omp task depend(inoutset: x)
-  x = x + 1
-  !$omp end task
-end
-
diff --git a/flang/test/Lower/OpenMP/Todo/depend-clause-mutexinoutset.f90 b/flang/test/Lower/OpenMP/Todo/depend-clause-mutexinoutset.f90
deleted file mode 100644
index 17cc389..0000000
--- a/flang/test/Lower/OpenMP/Todo/depend-clause-mutexinoutset.f90
+++ /dev/null
@@ -1,11 +0,0 @@
-!RUN: %not_todo_cmd bbc -emit-hlfir -fopenmp -fopenmp-version=52 -o - %s 2>&1 | FileCheck %s
-!RUN: %not_todo_cmd %flang_fc1 -emit-hlfir -fopenmp -fopenmp-version=52 -o - %s 2>&1 | FileCheck %s
-
-!CHECK: not yet implemented: INOUTSET, MUTEXINOUTSET and DEPOBJ dependence-types
-subroutine f00(x)
-  integer :: x
-  !$omp task depend(mutexinoutset: x)
-  x = x + 1
-  !$omp end task
-end
-
diff --git a/flang/test/Lower/OpenMP/Todo/error.f90 b/flang/test/Lower/OpenMP/Todo/error.f90
index b97e2c2..6d3bd89 100644
--- a/flang/test/Lower/OpenMP/Todo/error.f90
+++ b/flang/test/Lower/OpenMP/Todo/error.f90
@@ -1,6 +1,6 @@
 ! RUN: %not_todo_cmd %flang_fc1 -emit-fir -fopenmp -fopenmp-version=51 -o - %s 2>&1 | FileCheck %s
 
-! CHECK: not yet implemented: OpenMPErrorConstruct
+! CHECK: not yet implemented: OpenMPUtilityConstruct
 program p
   integer, allocatable :: x
   !$omp error at(compilation) severity(warning) message("an error")
diff --git a/flang/test/Lower/OpenMP/Todo/omp-declarative-allocate-align.f90 b/flang/test/Lower/OpenMP/Todo/omp-declarative-allocate-align.f90
new file mode 100644
index 0000000..d0ed0cb
--- /dev/null
+++ b/flang/test/Lower/OpenMP/Todo/omp-declarative-allocate-align.f90
@@ -0,0 +1,10 @@
+! This test checks lowering of OpenMP allocate Directive with align clause.
+
+// RUN: not flang -fc1 -emit-fir -fopenmp %s 2>&1 | FileCheck %s
+
+program main
+  integer :: x
+
+  // CHECK: not yet implemented: OpenMPDeclarativeAllocate
+  !$omp allocate(x) align(32)
+end
diff --git a/flang/test/Lower/OpenMP/allocatable-array-bounds.f90 b/flang/test/Lower/OpenMP/allocatable-array-bounds.f90
index e162c5a..e66b6f1 100644
--- a/flang/test/Lower/OpenMP/allocatable-array-bounds.f90
+++ b/flang/test/Lower/OpenMP/allocatable-array-bounds.f90
@@ -23,7 +23,7 @@
 !HOST: %[[BOX_3:.*]]:3 = fir.box_dims %[[LOAD_3]], %[[CONSTANT_3]] : (!fir.box<!fir.heap<!fir.array<?xi32>>>, index) -> (index, index, index)
 !HOST: %[[BOUNDS_1:.*]] = omp.map.bounds lower_bound(%[[LB_1]] : index) upper_bound(%[[UB_1]] : index) extent(%[[BOX_3]]#1 : index) stride(%[[BOX_2]]#2 : index) start_idx(%[[BOX_1]]#0 : index) {stride_in_bytes = true}
 !HOST: %[[VAR_PTR_PTR:.*]] = fir.box_offset %[[DECLARE_1]]#1 base_addr : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>) -> !fir.llvm_ptr<!fir.ref<!fir.array<?xi32>>>
-!HOST: %[[MAP_INFO_MEMBER:.*]] = omp.map.info var_ptr(%[[DECLARE_1]]#1 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>, !fir.array<?xi32>) var_ptr_ptr(%[[VAR_PTR_PTR]] : !fir.llvm_ptr<!fir.ref<!fir.array<?xi32>>>) map_clauses(tofrom) capture(ByRef) bounds(%[[BOUNDS_1]]) -> !fir.llvm_ptr<!fir.ref<!fir.array<?xi32>>> {name = ""}    
+!HOST: %[[MAP_INFO_MEMBER:.*]] = omp.map.info var_ptr(%[[DECLARE_1]]#1 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>, i32) var_ptr_ptr(%[[VAR_PTR_PTR]] : !fir.llvm_ptr<!fir.ref<!fir.array<?xi32>>>) map_clauses(tofrom) capture(ByRef) bounds(%[[BOUNDS_1]]) -> !fir.llvm_ptr<!fir.ref<!fir.array<?xi32>>> {name = ""}
 !HOST: %[[MAP_INFO_1:.*]] = omp.map.info var_ptr(%[[DECLARE_1]]#1 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>, !fir.box<!fir.heap<!fir.array<?xi32>>>) map_clauses(to) capture(ByRef) members(%[[MAP_INFO_MEMBER]] : [0] : !fir.llvm_ptr<!fir.ref<!fir.array<?xi32>>>) -> !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>> {name = "sp_read(2:5)"}
 
 !HOST: %[[LOAD_3:.*]] = fir.load %[[DECLARE_2]]#0 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>
@@ -41,7 +41,7 @@
 !HOST: %[[BOX_5:.*]]:3 = fir.box_dims %[[LOAD_5]], %[[CONSTANT_5]] : (!fir.box<!fir.heap<!fir.array<?xi32>>>, index) -> (index, index, index)
 !HOST: %[[BOUNDS_2:.*]] = omp.map.bounds lower_bound(%[[LB_2]] : index) upper_bound(%[[UB_2]] : index) extent(%[[BOX_5]]#1 : index) stride(%[[BOX_4]]#2 : index) start_idx(%[[BOX_3]]#0 : index) {stride_in_bytes = true}
 !HOST: %[[VAR_PTR_PTR:.*]] = fir.box_offset %[[DECLARE_2]]#1 base_addr : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>) -> !fir.llvm_ptr<!fir.ref<!fir.array<?xi32>>>
-!HOST: %[[MAP_INFO_MEMBER:.*]] = omp.map.info var_ptr(%[[DECLARE_2]]#1 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>, !fir.array<?xi32>) var_ptr_ptr(%[[VAR_PTR_PTR]] : !fir.llvm_ptr<!fir.ref<!fir.array<?xi32>>>) map_clauses(tofrom) capture(ByRef) bounds(%[[BOUNDS_2]]) -> !fir.llvm_ptr<!fir.ref<!fir.array<?xi32>>> {name = ""}    
+!HOST: %[[MAP_INFO_MEMBER:.*]] = omp.map.info var_ptr(%[[DECLARE_2]]#1 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>, i32) var_ptr_ptr(%[[VAR_PTR_PTR]] : !fir.llvm_ptr<!fir.ref<!fir.array<?xi32>>>) map_clauses(tofrom) capture(ByRef) bounds(%[[BOUNDS_2]]) -> !fir.llvm_ptr<!fir.ref<!fir.array<?xi32>>> {name = ""}
 !HOST: %[[MAP_INFO_2:.*]] = omp.map.info var_ptr(%[[DECLARE_2]]#1 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>, !fir.box<!fir.heap<!fir.array<?xi32>>>) map_clauses(to) capture(ByRef) members(%[[MAP_INFO_MEMBER]] : [0] : !fir.llvm_ptr<!fir.ref<!fir.array<?xi32>>>) -> !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>> {name = "sp_write(2:5)"}
 
 subroutine read_write_section()
@@ -80,8 +80,9 @@ module assumed_allocatable_array_routines
 !HOST: %[[BOX_3:.*]]:3 = fir.box_dims %[[LOAD_3]], %[[CONSTANT_3]] : (!fir.box<!fir.heap<!fir.array<?xi32>>>, index) -> (index, index, index)
 !HOST: %[[BOUNDS:.*]] = omp.map.bounds lower_bound(%[[LB]] : index) upper_bound(%[[UB]] : index) extent(%[[BOX_3]]#1 : index) stride(%[[BOX_2]]#2 : index) start_idx(%[[BOX_1]]#0 : index) {stride_in_bytes = true}
 !HOST: %[[VAR_PTR_PTR:.*]] = fir.box_offset %[[DECLARE]]#1 base_addr : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>) -> !fir.llvm_ptr<!fir.ref<!fir.array<?xi32>>>
-!HOST: %[[MAP_INFO_MEMBER:.*]] = omp.map.info var_ptr(%[[DECLARE]]#1 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>, !fir.array<?xi32>) var_ptr_ptr(%[[VAR_PTR_PTR]] : !fir.llvm_ptr<!fir.ref<!fir.array<?xi32>>>) map_clauses(tofrom) capture(ByRef) bounds(%[[BOUNDS]]) -> !fir.llvm_ptr<!fir.ref<!fir.array<?xi32>>> {name = ""}    
+!HOST: %[[MAP_INFO_MEMBER:.*]] = omp.map.info var_ptr(%[[DECLARE]]#1 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>, i32) var_ptr_ptr(%[[VAR_PTR_PTR]] : !fir.llvm_ptr<!fir.ref<!fir.array<?xi32>>>) map_clauses(tofrom) capture(ByRef) bounds(%[[BOUNDS]]) -> !fir.llvm_ptr<!fir.ref<!fir.array<?xi32>>> {name = ""}
 !HOST: %[[MAP_INFO:.*]] = omp.map.info var_ptr(%[[DECLARE]]#1 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>, !fir.box<!fir.heap<!fir.array<?xi32>>>) map_clauses(to) capture(ByRef) members(%[[MAP_INFO_MEMBER]] : [0] : !fir.llvm_ptr<!fir.ref<!fir.array<?xi32>>>) -> !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>> {name = "arr_read_write(2:5)"}
+
 subroutine assumed_shape_array(arr_read_write)
     integer, allocatable, intent(inout) :: arr_read_write(:)
 
diff --git a/flang/test/Lower/OpenMP/array-bounds.f90 b/flang/test/Lower/OpenMP/array-bounds.f90
index 78fa815..479b688 100644
--- a/flang/test/Lower/OpenMP/array-bounds.f90
+++ b/flang/test/Lower/OpenMP/array-bounds.f90
@@ -51,7 +51,7 @@ module assumed_array_routines
 !HOST: %[[DIMS1:.*]]:3 = fir.box_dims %[[ARG0_DECL]]#1, %[[C0_1]] : (!fir.box<!fir.array<?xi32>>, index) -> (index, index, index)
 !HOST: %[[BOUNDS:.*]] = omp.map.bounds   lower_bound(%[[C3]] : index) upper_bound(%[[C4]] : index) extent(%[[DIMS1]]#1 : index) stride(%[[DIMS0]]#2 : index) start_idx(%[[C0]] : index) {stride_in_bytes = true}
 !HOST: %[[VAR_PTR_PTR:.*]] = fir.box_offset %0 base_addr : (!fir.ref<!fir.box<!fir.array<?xi32>>>) -> !fir.llvm_ptr<!fir.ref<!fir.array<?xi32>>>
-!HOST: %[[MAP_INFO_MEMBER:.*]] = omp.map.info var_ptr(%[[INTERMEDIATE_ALLOCA]] : !fir.ref<!fir.box<!fir.array<?xi32>>>, !fir.array<?xi32>) var_ptr_ptr(%[[VAR_PTR_PTR]] : !fir.llvm_ptr<!fir.ref<!fir.array<?xi32>>>) map_clauses(tofrom) capture(ByRef) bounds(%[[BOUNDS]]) -> !fir.llvm_ptr<!fir.ref<!fir.array<?xi32>>> {name = ""}
+!HOST: %[[MAP_INFO_MEMBER:.*]] = omp.map.info var_ptr(%[[INTERMEDIATE_ALLOCA]] : !fir.ref<!fir.box<!fir.array<?xi32>>>, i32) var_ptr_ptr(%[[VAR_PTR_PTR]] : !fir.llvm_ptr<!fir.ref<!fir.array<?xi32>>>) map_clauses(tofrom) capture(ByRef) bounds(%[[BOUNDS]]) -> !fir.llvm_ptr<!fir.ref<!fir.array<?xi32>>> {name = ""}
 !HOST: %[[MAP:.*]] = omp.map.info var_ptr(%[[INTERMEDIATE_ALLOCA]] : !fir.ref<!fir.box<!fir.array<?xi32>>>, !fir.box<!fir.array<?xi32>>) map_clauses(to) capture(ByRef) members(%[[MAP_INFO_MEMBER]] : [0] : !fir.llvm_ptr<!fir.ref<!fir.array<?xi32>>>) -> !fir.ref<!fir.array<?xi32>> {name = "arr_read_write(2:5)"}
 !HOST: omp.target   map_entries(%[[MAP]] -> %{{.*}}, {{.*}} -> {{.*}}, %[[MAP_INFO_MEMBER]] -> %{{.*}} : !fir.ref<!fir.array<?xi32>>, !fir.ref<i32>, !fir.llvm_ptr<!fir.ref<!fir.array<?xi32>>>) {
     subroutine assumed_shape_array(arr_read_write)
diff --git a/flang/test/Lower/OpenMP/derived-type-allocatable-map.f90 b/flang/test/Lower/OpenMP/derived-type-allocatable-map.f90
index 47bcf2a..28a2b9b 100644
--- a/flang/test/Lower/OpenMP/derived-type-allocatable-map.f90
+++ b/flang/test/Lower/OpenMP/derived-type-allocatable-map.f90
@@ -6,7 +6,7 @@
 !CHECK: %[[MEMBER_INDEX:.*]] = arith.constant 4 : index
 !CHECK: %[[MEMBER_COORD:.*]] = fir.coordinate_of %[[DECLARE]]#0, %[[MEMBER_INDEX]] : (!fir.ref<!fir.type<[[ONE_LAYER_TY]]>>, index) -> !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>
 !CHECK: %[[MEMBER_BASE_ADDR:.*]] = fir.box_offset %[[MEMBER_COORD]] base_addr : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>) -> !fir.llvm_ptr<!fir.ref<!fir.array<?xi32>>>
-!CHECK: %[[MAP_MEMBER_BASE_ADDR:.*]] = omp.map.info var_ptr(%[[MEMBER_COORD]] : !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>, !fir.array<?xi32>) var_ptr_ptr(%[[MEMBER_BASE_ADDR]] : !fir.llvm_ptr<!fir.ref<!fir.array<?xi32>>>) map_clauses(tofrom) capture(ByRef) bounds(%[[BOUNDS]]) -> !fir.llvm_ptr<!fir.ref<!fir.array<?xi32>>> {{.*}}
+!CHECK: %[[MAP_MEMBER_BASE_ADDR:.*]] = omp.map.info var_ptr(%[[MEMBER_COORD]] : !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>, i32) var_ptr_ptr(%[[MEMBER_BASE_ADDR]] : !fir.llvm_ptr<!fir.ref<!fir.array<?xi32>>>) map_clauses(tofrom) capture(ByRef) bounds(%[[BOUNDS]]) -> !fir.llvm_ptr<!fir.ref<!fir.array<?xi32>>> {{.*}}
 !CHECK: %[[MAP_MEMBER_DESCRIPTOR:.*]] = omp.map.info var_ptr(%[[MEMBER_COORD]] : !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>, !fir.box<!fir.heap<!fir.array<?xi32>>>) map_clauses(to) capture(ByRef) -> !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>> {{.*}}
 !CHECK: %[[MAP_PARENT:.*]] = omp.map.info var_ptr(%[[DECLARE]]#1 : !fir.ref<!fir.type<[[ONE_LAYER_TY]]>>, !fir.type<[[ONE_LAYER_TY]]>) map_clauses(tofrom) capture(ByRef) members(%[[MAP_MEMBER_DESCRIPTOR]], %[[MAP_MEMBER_BASE_ADDR]] : [4], [4, 0] : !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>, !fir.llvm_ptr<!fir.ref<!fir.array<?xi32>>>) -> !fir.ref<!fir.type<[[ONE_LAYER_TY]]>> {{{.*}} partial_map = true}
 !CHECK:   omp.target map_entries(%[[MAP_PARENT]] -> %[[ARG0:.*]], %[[MAP_MEMBER_DESCRIPTOR]] -> %[[ARG1:.*]], %[[MAP_MEMBER_BASE_ADDR]] -> %[[ARG2:.*]] : !fir.ref<!fir.type<[[ONE_LAYER_TY]]>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>, !fir.llvm_ptr<!fir.ref<!fir.array<?xi32>>>) {
@@ -37,7 +37,7 @@ end subroutine
 !CHECK: %[[MEMBER_INDEX:.*]] = arith.constant 4 : index
 !CHECK: %[[MEMBER_COORD:.*]] = fir.coordinate_of %[[LOAD_DTYPE]], %[[MEMBER_INDEX]] : (!fir.box<!fir.heap<!fir.type<[[REC_TY]]>>>, index) -> !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>
 !CHECK: %[[MEMBER_BASE_ADDR:.*]] = fir.box_offset %[[MEMBER_COORD]] base_addr : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>) -> !fir.llvm_ptr<!fir.ref<!fir.array<?xi32>>>
-!CHECK: %[[MAP_MEMBER_BASE_ADDR:.*]] = omp.map.info var_ptr(%[[MEMBER_COORD]] : !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>, !fir.array<?xi32>) var_ptr_ptr(%[[MEMBER_BASE_ADDR]] : !fir.llvm_ptr<!fir.ref<!fir.array<?xi32>>>) map_clauses(tofrom) capture(ByRef) bounds(%[[BOUNDS]]) -> !fir.llvm_ptr<!fir.ref<!fir.array<?xi32>>> {{.*}}
+!CHECK: %[[MAP_MEMBER_BASE_ADDR:.*]] = omp.map.info var_ptr(%[[MEMBER_COORD]] : !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>, i32) var_ptr_ptr(%[[MEMBER_BASE_ADDR]] : !fir.llvm_ptr<!fir.ref<!fir.array<?xi32>>>) map_clauses(tofrom) capture(ByRef) bounds(%[[BOUNDS]]) -> !fir.llvm_ptr<!fir.ref<!fir.array<?xi32>>> {{.*}}
 !CHECK: %[[MAP_MEMBER_DESC:.*]] = omp.map.info var_ptr(%[[MEMBER_COORD]] : !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>, !fir.box<!fir.heap<!fir.array<?xi32>>>) map_clauses(to) capture(ByRef) -> !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>> {{.*}}
 !CHECK: %[[LOAD_DTYPE:.*]] = fir.load %[[DECLARE]]#0 : !fir.ref<!fir.box<!fir.heap<!fir.type<[[REC_TY]]>>>>
 !CHECK: %[[MEMBER_COORD:.*]] = arith.constant 5 : index
@@ -78,7 +78,7 @@ end subroutine
 !CHECK: %[[NESTED_MEMBER_INDEX:.*]] = arith.constant 2 : index
 !CHECK: %[[NESTED_MEMBER_COORD:.*]] = fir.coordinate_of %[[NESTED_DTYPE_COORD]], %[[NESTED_MEMBER_INDEX]] : (!fir.ref<!fir.type<[[REC_TY2]]>>, index) -> !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>
 !CHECK: %[[NESTED_MEMBER_BASE_ADDR:.*]] = fir.box_offset %[[NESTED_MEMBER_COORD]] base_addr : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>) -> !fir.llvm_ptr<!fir.ref<!fir.array<?xi32>>>
-!CHECK: %[[MAP_NESTED_MEMBER_BASE_ADDR:.*]] = omp.map.info var_ptr(%[[NESTED_MEMBER_COORD]] : !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>, !fir.array<?xi32>) var_ptr_ptr(%[[NESTED_MEMBER_BASE_ADDR]] : !fir.llvm_ptr<!fir.ref<!fir.array<?xi32>>>) map_clauses(tofrom) capture(ByRef) bounds(%[[BOUNDS]]) -> !fir.llvm_ptr<!fir.ref<!fir.array<?xi32>>> {{.*}}
+!CHECK: %[[MAP_NESTED_MEMBER_BASE_ADDR:.*]] = omp.map.info var_ptr(%[[NESTED_MEMBER_COORD]] : !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>, i32) var_ptr_ptr(%[[NESTED_MEMBER_BASE_ADDR]] : !fir.llvm_ptr<!fir.ref<!fir.array<?xi32>>>) map_clauses(tofrom) capture(ByRef) bounds(%[[BOUNDS]]) -> !fir.llvm_ptr<!fir.ref<!fir.array<?xi32>>> {{.*}}
 !CHECK: %[[MAP_NESTED_MEMBER_COORD:.*]] = omp.map.info var_ptr(%[[NESTED_MEMBER_COORD]] : !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>, !fir.box<!fir.heap<!fir.array<?xi32>>>) map_clauses(to) capture(ByRef) -> !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>> {{.*}}
 !CHECK: %[[LOAD:.*]] = fir.load %[[DECLARE]]#0 : !fir.ref<!fir.box<!fir.heap<!fir.type<[[REC_TY]]>}>>>>
 !CHECK: %[[NESTED_DTYPE_INDEX:.*]] = arith.constant 6 : index
@@ -128,7 +128,7 @@ end subroutine
 !CHECK: %[[NESTED_MEMBER_INDEX:.*]] = arith.constant 2 : index
 !CHECK: %[[NESTED_MEMBER_COORD:.*]] = fir.coordinate_of %[[NESTED_DTYPE_COORD]], %[[NESTED_MEMBER_INDEX]] : (!fir.ref<!fir.type<[[REC_TY2]]>>, index) -> !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>
 !CHECK: %[[NESTED_MEMBER_BASE_ADDR:.*]] = fir.box_offset %[[NESTED_MEMBER_COORD]] base_addr : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>) -> !fir.llvm_ptr<!fir.ref<!fir.array<?xi32>>>
-!CHECK: %[[MAP_NESTED_MEMBER_BASE_ADDR:.*]] = omp.map.info var_ptr(%[[NESTED_MEMBER_COORD]] : !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>, !fir.array<?xi32>) var_ptr_ptr(%[[NESTED_MEMBER_BASE_ADDR]] : !fir.llvm_ptr<!fir.ref<!fir.array<?xi32>>>) map_clauses(tofrom) capture(ByRef) bounds(%[[BOUNDS]]) -> !fir.llvm_ptr<!fir.ref<!fir.array<?xi32>>> {{.*}}
+!CHECK: %[[MAP_NESTED_MEMBER_BASE_ADDR:.*]] = omp.map.info var_ptr(%[[NESTED_MEMBER_COORD]] : !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>, i32) var_ptr_ptr(%[[NESTED_MEMBER_BASE_ADDR]] : !fir.llvm_ptr<!fir.ref<!fir.array<?xi32>>>) map_clauses(tofrom) capture(ByRef) bounds(%[[BOUNDS]]) -> !fir.llvm_ptr<!fir.ref<!fir.array<?xi32>>> {{.*}}
 !CHECK: %[[MAP_NESTED_MEMBER_DESC:.*]] = omp.map.info var_ptr(%[[NESTED_MEMBER_COORD]] : !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>, !fir.box<!fir.heap<!fir.array<?xi32>>>) map_clauses(to) capture(ByRef) -> !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>> {{.*}}
 !CHECK: %[[MAP_PARENT:.*]] = omp.map.info var_ptr(%[[DECLARE]]#1 : !fir.ref<!fir.type<[[REC_TY]]>>, !fir.type<[[REC_TY]]>) map_clauses(tofrom) capture(ByRef) members(%[[MAP_NESTED_MEMBER_DESC]], %[[MAP_NESTED_MEMBER_BASE_ADDR]] : [6, 2], [6, 2, 0] : !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>, !fir.llvm_ptr<!fir.ref<!fir.array<?xi32>>>) -> !fir.ref<!fir.type<[[REC_TY]]>> {{.*}}
 !CHECK: omp.target map_entries(%[[MAP_PARENT]] -> %[[ARG0:.*]], %[[MAP_NESTED_MEMBER_DESC]] -> %[[ARG1:.*]], %[[MAP_NESTED_MEMBER_BASE_ADDR]] -> %[[ARG2:.*]] : !fir.ref<!fir.type<[[REC_TY]]>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>, !fir.llvm_ptr<!fir.ref<!fir.array<?xi32>>>) {
diff --git a/flang/test/Lower/OpenMP/derived-type-allocatable.f90 b/flang/test/Lower/OpenMP/derived-type-allocatable.f90
index d265954..2dc4e20 100644
--- a/flang/test/Lower/OpenMP/derived-type-allocatable.f90
+++ b/flang/test/Lower/OpenMP/derived-type-allocatable.f90
@@ -13,6 +13,10 @@ module m1
 
 contains
 
+!CHECK-LABEL: omp.private {type = private} @_QMm1Ftest_pointer
+!CHECK-NOT:   fir.call @_FortranAInitializeClone
+!CHECK:       omp.yield
+
 !CHECK-LABEL: omp.private {type = private} @_QMm1Ftest_nested
 !CHECK:       fir.call @_FortranAInitializeClone
 !CHECK-NEXT:  omp.yield
@@ -91,4 +95,11 @@ contains
     !$omp parallel private(d2)
     !$omp end parallel
   end subroutine
+
+  subroutine test_pointer()
+    type(x), pointer :: ptr
+
+    !$omp parallel private(ptr)
+    !$omp end parallel
+  end subroutine
 end module
diff --git a/flang/test/Lower/OpenMP/firstprivate-alloc-comp.f90 b/flang/test/Lower/OpenMP/firstprivate-alloc-comp.f90
new file mode 100644
index 0000000..2453fe2
--- /dev/null
+++ b/flang/test/Lower/OpenMP/firstprivate-alloc-comp.f90
@@ -0,0 +1,19 @@
+! Test delayed privatization for derived types with allocatable components.
+! RUN: %flang_fc1 -emit-hlfir -fopenmp -o - %s | FileCheck %s
+
+subroutine firstprivate_alloc_comp
+  type t1
+    integer, allocatable :: c(:)
+  end type
+  type(t1) :: x
+  !$omp parallel firstprivate(x)
+    print *, allocated(x%c)
+  !$omp end parallel
+end
+
+  call firstprivate_alloc_comp()
+end
+! CHECK-LABEL:   omp.private {type = firstprivate} @_QFfirstprivate_alloc_compEx_firstprivate_ref_rec__QFfirstprivate_alloc_compTt1 : !fir.ref<!fir.type<_QFfirstprivate_alloc_compTt1{c:!fir.box<!fir.heap<!fir.array<?xi32>>>}>> alloc {
+! CHECK:     fir.call @_FortranAInitialize(
+! CHECK:   } copy {
+! ...
diff --git a/flang/test/Lower/OpenMP/local-intrinsic-sized-array-map.f90 b/flang/test/Lower/OpenMP/local-intrinsic-sized-array-map.f90
new file mode 100644
index 0000000..ab2cdf3
--- /dev/null
+++ b/flang/test/Lower/OpenMP/local-intrinsic-sized-array-map.f90
@@ -0,0 +1,32 @@
+!RUN: %flang_fc1 -emit-hlfir -fopenmp %s -o - | FileCheck %s --check-prefixes="HLFIRDIALECT"
+
+!HLFIRDIALECT: func.func @_QPlocal_variable_intrinsic_size(%[[ARG0:.*]]: !fir.box<!fir.array<?xf32>> {fir.bindc_name = "a"}) {
+!HLFIRDIALECT:   %[[SZ_DATA:.*]] = fir.alloca index
+!HLFIRDIALECT:   %[[DECLARE:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope {{.*}} {uniq_name = "_QFlocal_variable_intrinsic_sizeEa"} : (!fir.box<!fir.array<?xf32>>, !fir.dscope) -> (!fir.box<!fir.array<?xf32>>, !fir.box<!fir.array<?xf32>>)
+!HLFIRDIALECT:   %[[DIMENSIONS:.*]]:3 = fir.box_dims %[[DECLARE]]#0, %{{.*}} : (!fir.box<!fir.array<?xf32>>, index) -> (index, index, index)
+!HLFIRDIALECT:   fir.store %[[DIMENSIONS]]#1 to %[[SZ_DATA]] : !fir.ref<index>
+!HLFIRDIALECT:   %[[SIZE_SEL:.*]] = arith.select {{.*}}, {{.*}}, {{.*}} : index
+!HLFIRDIALECT:   %[[B_ALLOCA:.*]] = fir.alloca !fir.array<?xf32>, %[[SIZE_SEL]] {bindc_name = "b", uniq_name = "_QFlocal_variable_intrinsic_sizeEb"}
+!HLFIRDIALECT:   %[[B_SHAPE:.*]] = fir.shape %[[SIZE_SEL]] : (index) -> !fir.shape<1>
+!HLFIRDIALECT:   %[[B_DECLARE:.*]]:2 = hlfir.declare %[[B_ALLOCA]](%[[B_SHAPE]]) {uniq_name = "_QFlocal_variable_intrinsic_sizeEb"} : (!fir.ref<!fir.array<?xf32>>, !fir.shape<1>) -> (!fir.box<!fir.array<?xf32>>, !fir.ref<!fir.array<?xf32>>)
+!HLFIRDIALECT:   %[[BOUNDS:.*]] = omp.map.bounds lower_bound({{.*}} : index) upper_bound({{.*}} : index) extent({{.*}} : index) stride({{.*}} : index) start_idx({{.*}} : index) {stride_in_bytes = true}
+!HLFIRDIALECT:   %[[MAP_DATA_B:.*]] = omp.map.info var_ptr(%[[B_DECLARE]]#1 : !fir.ref<!fir.array<?xf32>>, f32) map_clauses(tofrom) capture(ByRef) bounds(%[[BOUNDS]]) -> !fir.ref<!fir.array<?xf32>> {name = "b"}
+!HLFIRDIALECT:   %[[MAP_DATA_SZ:.*]] = omp.map.info var_ptr(%[[SZ_DATA]] : !fir.ref<index>, index) map_clauses(implicit, exit_release_or_enter_alloc) capture(ByCopy) -> !fir.ref<index> {name = ""}
+!HLFIRDIALECT:   omp.target map_entries(%[[MAP_DATA_B]] -> %[[ARG1:.*]], %[[MAP_DATA_SZ]] -> %[[ARG2:.*]] : !fir.ref<!fir.array<?xf32>>, !fir.ref<index>) {
+!HLFIRDIALECT:      %[[SZ_LD:.*]] = fir.load %[[ARG2]] : !fir.ref<index>
+!HLFIRDIALECT:      %[[SZ_CONV:.*]] = fir.convert %[[SZ_LD]] : (index) -> i64
+!HLFIRDIALECT:      %[[SZ_CONV2:.*]] = fir.convert %[[SZ_CONV]] : (i64) -> index
+!HLFIRDIALECT:      %[[SEL_SZ:.*]] = arith.cmpi sgt, %[[SZ_CONV2]], %{{.*}} : index
+!HLFIRDIALECT:      %[[SEL_SZ2:.*]] = arith.select %[[SEL_SZ]], %[[SZ_CONV2]], %{{.*}} : index
+!HLFIRDIALECT:      %[[SHAPE:.*]] = fir.shape %[[SEL_SZ2]] : (index) -> !fir.shape<1>
+!HLFIRDIALECT:      %{{.*}} = hlfir.declare %[[ARG1]](%[[SHAPE]]) {uniq_name = "_QFlocal_variable_intrinsic_sizeEb"} : (!fir.ref<!fir.array<?xf32>>, !fir.shape<1>) -> (!fir.box<!fir.array<?xf32>>, !fir.ref<!fir.array<?xf32>>)
+
+subroutine local_variable_intrinsic_size(a)
+    implicit none
+    real, dimension(:) :: a
+    real, dimension(size(a, 1)) :: b
+
+!$omp target map(tofrom: b)
+        b(5) = 5
+!$omp end target
+end subroutine
diff --git a/flang/test/Lower/OpenMP/task.f90 b/flang/test/Lower/OpenMP/task.f90
index 6e525a0..f5591bd 100644
--- a/flang/test/Lower/OpenMP/task.f90
+++ b/flang/test/Lower/OpenMP/task.f90
@@ -144,6 +144,18 @@ subroutine task_depend_multi_task()
   x = x + 12
   !CHECK: omp.terminator
   !$omp end task
+  !CHECK: omp.task depend(taskdependmutexinoutset -> %{{.+}} : !fir.ref<i32>)
+  !$omp task depend(mutexinoutset : x)
+  !CHECK: arith.subi
+  x = x - 12
+  !CHECK: omp.terminator
+  !$omp end task
+    !CHECK: omp.task depend(taskdependinoutset -> %{{.+}} : !fir.ref<i32>)
+  !$omp task depend(inoutset : x)
+  !CHECK: arith.subi
+  x = x - 12
+  !CHECK: omp.terminator
+  !$omp end task
 end subroutine task_depend_multi_task
 
 !===============================================================================
diff --git a/flang/test/Lower/allocatable-polymorphic.f90 b/flang/test/Lower/allocatable-polymorphic.f90
index fba0c12f..bbc5475 100644
--- a/flang/test/Lower/allocatable-polymorphic.f90
+++ b/flang/test/Lower/allocatable-polymorphic.f90
@@ -267,7 +267,7 @@ contains
 ! CHECK: %[[C0:.*]] = arith.constant 0 : i32
 ! CHECK: fir.call @_FortranAAllocatableInitDerivedForAllocate(%[[P_CAST]], %[[TYPE_DESC_P1_CAST]], %[[RANK]], %[[C0]]) {{.*}}: (!fir.ref<!fir.box<none>>, !fir.ref<none>, i32, i32) -> none
 ! CHECK: %[[P_CAST:.*]] = fir.convert %[[P_DECL]]#1 : (!fir.ref<!fir.class<!fir.heap<!fir.type<_QMpolyTp1{a:i32,b:i32}>>>>) -> !fir.ref<!fir.box<none>>
-! CHECK: %{{.*}} = fir.call @_FortranAAllocatableAllocate(%[[P_CAST]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) {{.*}}: (!fir.ref<!fir.box<none>>, i64, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
+! CHECK: %{{.*}} = fir.call @_FortranAAllocatableAllocate(%[[P_CAST]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) {{.*}}: (!fir.ref<!fir.box<none>>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
 
 ! CHECK: %[[TYPE_DESC_P1:.*]] = fir.type_desc !fir.type<_QMpolyTp1{a:i32,b:i32}>
 ! CHECK: %[[C1_CAST:.*]] = fir.convert %[[C1_DECL]]#1 : (!fir.ref<!fir.class<!fir.heap<!fir.type<_QMpolyTp1{a:i32,b:i32}>>>>) -> !fir.ref<!fir.box<none>>
@@ -276,7 +276,7 @@ contains
 ! CHECK: %[[C0:.*]] = arith.constant 0 : i32
 ! CHECK: fir.call @_FortranAAllocatableInitDerivedForAllocate(%[[C1_CAST]], %[[TYPE_DESC_P1_CAST]], %[[RANK]], %[[C0]]) {{.*}}: (!fir.ref<!fir.box<none>>, !fir.ref<none>, i32, i32) -> none
 ! CHECK: %[[C1_CAST:.*]] = fir.convert %[[C1_DECL]]#1 : (!fir.ref<!fir.class<!fir.heap<!fir.type<_QMpolyTp1{a:i32,b:i32}>>>>) -> !fir.ref<!fir.box<none>>
-! CHECK: %{{.*}} = fir.call @_FortranAAllocatableAllocate(%[[C1_CAST]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) {{.*}}: (!fir.ref<!fir.box<none>>, i64, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
+! CHECK: %{{.*}} = fir.call @_FortranAAllocatableAllocate(%[[C1_CAST]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) {{.*}}: (!fir.ref<!fir.box<none>>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
 
 ! CHECK: %[[TYPE_DESC_P2:.*]] = fir.type_desc !fir.type<_QMpolyTp2{p1:!fir.type<_QMpolyTp1{a:i32,b:i32}>,c:i32}>
 ! CHECK: %[[C2_CAST:.*]] = fir.convert %[[C2_DECL]]#1 : (!fir.ref<!fir.class<!fir.heap<!fir.type<_QMpolyTp1{a:i32,b:i32}>>>>) -> !fir.ref<!fir.box<none>>
@@ -285,7 +285,7 @@ contains
 ! CHECK: %[[C0:.*]] = arith.constant 0 : i32
 ! CHECK: fir.call @_FortranAAllocatableInitDerivedForAllocate(%[[C2_CAST]], %[[TYPE_DESC_P2_CAST]], %[[RANK]], %[[C0]]) {{.*}}: (!fir.ref<!fir.box<none>>, !fir.ref<none>, i32, i32) -> none
 ! CHECK: %[[C2_CAST:.*]] = fir.convert %[[C2_DECL]]#1 : (!fir.ref<!fir.class<!fir.heap<!fir.type<_QMpolyTp1{a:i32,b:i32}>>>>) -> !fir.ref<!fir.box<none>>
-! CHECK: %{{.*}} = fir.call @_FortranAAllocatableAllocate(%[[C2_CAST]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) {{.*}}: (!fir.ref<!fir.box<none>>, i64, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
+! CHECK: %{{.*}} = fir.call @_FortranAAllocatableAllocate(%[[C2_CAST]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) {{.*}}: (!fir.ref<!fir.box<none>>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
 
 ! CHECK: %[[TYPE_DESC_P1:.*]] = fir.type_desc !fir.type<_QMpolyTp1{a:i32,b:i32}>
 ! CHECK: %[[C3_CAST:.*]] = fir.convert %[[C3_DECL]]#1 : (!fir.ref<!fir.class<!fir.heap<!fir.array<?x!fir.type<_QMpolyTp1{a:i32,b:i32}>>>>>) -> !fir.ref<!fir.box<none>>
@@ -300,7 +300,7 @@ contains
 ! CHECK: %[[C10_I64:.*]] = fir.convert %[[C10]] : (i32) -> i64
 ! CHECK: %{{.*}} = fir.call @_FortranAAllocatableSetBounds(%[[C3_CAST]], %[[C0]], %[[C1_I64]], %[[C10_I64]]) {{.*}}: (!fir.ref<!fir.box<none>>, i32, i64, i64) -> none
 ! CHECK: %[[C3_CAST:.*]] = fir.convert %[[C3_DECL]]#1 : (!fir.ref<!fir.class<!fir.heap<!fir.array<?x!fir.type<_QMpolyTp1{a:i32,b:i32}>>>>>) -> !fir.ref<!fir.box<none>>
-! CHECK: %{{.*}} = fir.call @_FortranAAllocatableAllocate(%[[C3_CAST]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) {{.*}}: (!fir.ref<!fir.box<none>>, i64, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
+! CHECK: %{{.*}} = fir.call @_FortranAAllocatableAllocate(%[[C3_CAST]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) {{.*}}: (!fir.ref<!fir.box<none>>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
 
 ! CHECK: %[[TYPE_DESC_P2:.*]] = fir.type_desc !fir.type<_QMpolyTp2{p1:!fir.type<_QMpolyTp1{a:i32,b:i32}>,c:i32}>
 ! CHECK: %[[C4_CAST:.*]] = fir.convert %[[C4_DECL]]#1 : (!fir.ref<!fir.class<!fir.heap<!fir.array<?x!fir.type<_QMpolyTp1{a:i32,b:i32}>>>>>) -> !fir.ref<!fir.box<none>>
@@ -316,7 +316,7 @@ contains
 ! CHECK: %[[C20_I64:.*]] = fir.convert %[[C20]] : (i32) -> i64
 ! CHECK: %{{.*}} = fir.call @_FortranAAllocatableSetBounds(%[[C4_CAST]], %[[C0]], %[[C1_I64]], %[[C20_I64]]) {{.*}}: (!fir.ref<!fir.box<none>>, i32, i64, i64) -> none
 ! CHECK: %[[C4_CAST:.*]] = fir.convert %[[C4_DECL]]#1 : (!fir.ref<!fir.class<!fir.heap<!fir.array<?x!fir.type<_QMpolyTp1{a:i32,b:i32}>>>>>) -> !fir.ref<!fir.box<none>>
-! CHECK: %{{.*}} = fir.call @_FortranAAllocatableAllocate(%[[C4_CAST]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) {{.*}}: (!fir.ref<!fir.box<none>>, i64, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
+! CHECK: %{{.*}} = fir.call @_FortranAAllocatableAllocate(%[[C4_CAST]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) {{.*}}: (!fir.ref<!fir.box<none>>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
 
 ! CHECK: %[[C1_LOAD1:.*]] = fir.load %[[C1_DECL]]#0 : !fir.ref<!fir.class<!fir.heap<!fir.type<_QMpolyTp1{a:i32,b:i32}>>>>
 ! CHECK: fir.dispatch "proc1"(%[[C1_LOAD1]] : !fir.class<!fir.heap<!fir.type<_QMpolyTp1{a:i32,b:i32}>>>)
@@ -390,7 +390,7 @@ contains
 ! CHECK: %[[CORANK:.*]] = arith.constant 0 : i32
 ! CHECK: %{{.*}} = fir.call @_FortranAAllocatableInitIntrinsicForAllocate(%[[BOX_NONE]], %[[CAT]], %[[KIND]], %[[RANK]], %[[CORANK]]) {{.*}} : (!fir.ref<!fir.box<none>>, i32, i32, i32, i32) -> none
 ! CHECK: %[[BOX_NONE:.*]] = fir.convert %[[P_DECL]]#1 : (!fir.ref<!fir.class<!fir.heap<none>>>) -> !fir.ref<!fir.box<none>>
-! CHECK: %{{.*}} = fir.call @_FortranAAllocatableAllocate(%[[BOX_NONE]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) {{.*}} : (!fir.ref<!fir.box<none>>, i64, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
+! CHECK: %{{.*}} = fir.call @_FortranAAllocatableAllocate(%[[BOX_NONE]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) {{.*}} : (!fir.ref<!fir.box<none>>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
 
 ! CHECK: %[[BOX_NONE:.*]] = fir.convert %[[PTR_DECL]]#1 : (!fir.ref<!fir.class<!fir.ptr<none>>>) -> !fir.ref<!fir.box<none>>
 ! CHECK: %[[CAT:.*]] = arith.constant 2 : i32
@@ -573,7 +573,7 @@ contains
 ! CHECK: %[[CORANK:.*]] = arith.constant 0 : i32
 ! CHECK: %{{.*}} = fir.call @_FortranAAllocatableInitCharacterForAllocate(%[[A_NONE]], %[[LEN]], %[[KIND]], %[[RANK]], %[[CORANK]]) {{.*}} : (!fir.ref<!fir.box<none>>, i64, i32, i32, i32) -> none
 ! CHECK: %[[A_NONE:.*]] = fir.convert %[[A_DECL]]#1 : (!fir.ref<!fir.class<!fir.heap<none>>>) -> !fir.ref<!fir.box<none>>
-! CHECK: %{{.*}} = fir.call @_FortranAAllocatableAllocate(%[[A_NONE]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) {{.*}} : (!fir.ref<!fir.box<none>>, i64, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
+! CHECK: %{{.*}} = fir.call @_FortranAAllocatableAllocate(%[[A_NONE]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) {{.*}} : (!fir.ref<!fir.box<none>>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
 
 end module
 
@@ -592,17 +592,17 @@ end
 ! LLVM-LABEL: define void @_QMpolyPtest_allocatable()
 
 ! LLVM: %{{.*}} = call {} @_FortranAAllocatableInitDerivedForAllocate(ptr %{{.*}}, ptr @_QMpolyEXdtXp1, i32 0, i32 0)
-! LLVM: %{{.*}} = call i32 @_FortranAAllocatableAllocate(ptr %{{.*}}, i64 -1, i1 false, ptr null, ptr @_QQclX{{.*}}, i32 {{.*}})
+! LLVM: %{{.*}} = call i32 @_FortranAAllocatableAllocate(ptr %{{.*}}, i1 false, ptr null, ptr @_QQclX{{.*}}, i32 {{.*}})
 ! LLVM: %{{.*}} = call {} @_FortranAAllocatableInitDerivedForAllocate(ptr %{{.*}}, ptr @_QMpolyEXdtXp1, i32 0, i32 0)
-! LLVM: %{{.*}} = call i32 @_FortranAAllocatableAllocate(ptr %{{.*}}, i64 -1, i1 false, ptr null, ptr @_QQclX{{.*}}, i32 {{.*}})
+! LLVM: %{{.*}} = call i32 @_FortranAAllocatableAllocate(ptr %{{.*}}, i1 false, ptr null, ptr @_QQclX{{.*}}, i32 {{.*}})
 ! LLVM: %{{.*}} = call {} @_FortranAAllocatableInitDerivedForAllocate(ptr %{{.*}}, ptr @_QMpolyEXdtXp2, i32 0, i32 0)
-! LLVM: %{{.*}} = call i32 @_FortranAAllocatableAllocate(ptr %{{.*}}, i64 -1, i1 false, ptr null, ptr @_QQclX{{.*}}, i32 {{.*}})
+! LLVM: %{{.*}} = call i32 @_FortranAAllocatableAllocate(ptr %{{.*}}, i1 false, ptr null, ptr @_QQclX{{.*}}, i32 {{.*}})
 ! LLVM: %{{.*}} = call {} @_FortranAAllocatableInitDerivedForAllocate(ptr %{{.*}}, ptr @_QMpolyEXdtXp1, i32 1, i32 0)
 ! LLVM: %{{.*}} = call {} @_FortranAAllocatableSetBounds(ptr %{{.*}}, i32 0, i64 1, i64 10)
-! LLVM: %{{.*}} = call i32 @_FortranAAllocatableAllocate(ptr %{{.*}}, i64 -1, i1 false, ptr null, ptr @_QQclX{{.*}}, i32 {{.*}})
+! LLVM: %{{.*}} = call i32 @_FortranAAllocatableAllocate(ptr %{{.*}}, i1 false, ptr null, ptr @_QQclX{{.*}}, i32 {{.*}})
 ! LLVM: %{{.*}} = call {} @_FortranAAllocatableInitDerivedForAllocate(ptr %{{.*}}, ptr @_QMpolyEXdtXp2, i32 1, i32 0)
 ! LLVM: %{{.*}} = call {} @_FortranAAllocatableSetBounds(ptr %{{.*}}, i32 0, i64 1, i64 20)
-! LLVM: %{{.*}} = call i32 @_FortranAAllocatableAllocate(ptr %{{.*}}, i64 -1, i1 false, ptr null, ptr @_QQclX{{.*}}, i32 {{.*}})
+! LLVM: %{{.*}} = call i32 @_FortranAAllocatableAllocate(ptr %{{.*}}, i1 false, ptr null, ptr @_QQclX{{.*}}, i32 {{.*}})
 ! LLVM-COUNT-2:  call void %{{[0-9]*}}()
 
 ! LLVM: call void @llvm.memcpy.p0.p0.i32
@@ -683,5 +683,5 @@ end
 ! LLVM: store { ptr, i64, i32, i8, i8, i8, i8, ptr, [1 x i64] } { ptr null, i64 ptrtoint (ptr getelementptr (%_QMpolyTp1, ptr null, i32 1) to i64), i32 20240719, i8 0, i8 42, i8 2, i8 1, ptr @_QMpolyEXdtXp1, [1 x i64] zeroinitializer }, ptr %[[ALLOCA1:[0-9]*]]
 ! LLVM: call void @llvm.memcpy.p0.p0.i32(ptr %[[ALLOCA2:[0-9]+]], ptr %[[ALLOCA1]], i32 40, i1 false)
 ! LLVM: %{{.*}} = call {} @_FortranAAllocatableInitDerivedForAllocate(ptr %[[ALLOCA2]], ptr @_QMpolyEXdtXp1, i32 0, i32 0)
-! LLVM: %{{.*}} = call i32 @_FortranAAllocatableAllocate(ptr %[[ALLOCA2]], i64 -1, i1 false, ptr null, ptr @_QQclX{{.*}}, i32 {{.*}})
+! LLVM: %{{.*}} = call i32 @_FortranAAllocatableAllocate(ptr %[[ALLOCA2]], i1 false, ptr null, ptr @_QQclX{{.*}}, i32 {{.*}})
 ! LLVM: %{{.*}} = call i32 @_FortranAAllocatableDeallocatePolymorphic(ptr %[[ALLOCA2]], ptr {{.*}}, i1 false, ptr null, ptr @_QQclX{{.*}}, i32 {{.*}})
diff --git a/flang/test/Lower/allocatable-runtime.f90 b/flang/test/Lower/allocatable-runtime.f90
index effd0a3..3f1f8a8 100644
--- a/flang/test/Lower/allocatable-runtime.f90
+++ b/flang/test/Lower/allocatable-runtime.f90
@@ -31,7 +31,7 @@ subroutine foo()
   ! CHECK: fir.call @{{.*}}AllocatableSetBounds(%[[xBoxCast2]], %c0{{.*}}, %[[xlbCast]], %[[xubCast]]) {{.*}}: (!fir.ref<!fir.box<none>>, i32, i64, i64) -> none
   ! CHECK-DAG: %[[xBoxCast3:.*]] = fir.convert %[[xBoxAddr]] : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> !fir.ref<!fir.box<none>>
   ! CHECK-DAG: %[[sourceFile:.*]] = fir.convert %{{.*}} -> !fir.ref<i8>
-  ! CHECK: fir.call @{{.*}}AllocatableAllocate(%[[xBoxCast3]], %c-1{{.*}}, %false{{.*}}, %[[errMsg]], %[[sourceFile]], %{{.*}}) {{.*}}: (!fir.ref<!fir.box<none>>, i64, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
+  ! CHECK: fir.call @{{.*}}AllocatableAllocate(%[[xBoxCast3]], %false{{.*}}, %[[errMsg]], %[[sourceFile]], %{{.*}}) {{.*}}: (!fir.ref<!fir.box<none>>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
 
   ! Simply check that we are emitting the right numebr of set bound for y and z. Otherwise, this is just like x.
   ! CHECK: fir.convert %[[yBoxAddr]] : (!fir.ref<!fir.box<!fir.heap<!fir.array<?x?xf32>>>>) -> !fir.ref<!fir.box<none>>
@@ -180,4 +180,4 @@ end subroutine
 ! CHECK: %[[M_BOX_NONE:.*]] = fir.convert %[[EMBOX_M]] : (!fir.box<!fir.array<10xi32>>) -> !fir.box<none>
 ! CHECK: %{{.*}} = fir.call @_FortranAAllocatableApplyMold(%[[A_BOX_NONE]], %[[M_BOX_NONE]], %[[RANK]]) {{.*}} : (!fir.ref<!fir.box<none>>, !fir.box<none>, i32) -> none
 ! CHECK: %[[A_BOX_NONE:.*]] = fir.convert %[[A]] : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>) -> !fir.ref<!fir.box<none>>
-! CHECK: %{{.*}} = fir.call @_FortranAAllocatableAllocate(%[[A_BOX_NONE]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) {{.*}} : (!fir.ref<!fir.box<none>>, i64, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
+! CHECK: %{{.*}} = fir.call @_FortranAAllocatableAllocate(%[[A_BOX_NONE]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) {{.*}} : (!fir.ref<!fir.box<none>>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
diff --git a/flang/test/Lower/allocate-mold.f90 b/flang/test/Lower/allocate-mold.f90
index 831b260..0cc10fc 100644
--- a/flang/test/Lower/allocate-mold.f90
+++ b/flang/test/Lower/allocate-mold.f90
@@ -16,7 +16,7 @@ end subroutine
 ! CHECK: %[[A_REF_BOX_NONE1:.*]] = fir.convert %[[A]] : (!fir.ref<!fir.box<!fir.heap<i32>>>) -> !fir.ref<!fir.box<none>>
 ! CHECK: %{{.*}} = fir.call @_FortranAAllocatableApplyMold(%[[A_REF_BOX_NONE1]], %{{.*}}, %{{.*}}) {{.*}} : (!fir.ref<!fir.box<none>>, !fir.box<none>, i32) -> none
 ! CHECK: %[[A_REF_BOX_NONE2:.*]] = fir.convert %[[A]] : (!fir.ref<!fir.box<!fir.heap<i32>>>) -> !fir.ref<!fir.box<none>>
-! CHECK: %{{.*}} = fir.call @_FortranAAllocatableAllocate(%[[A_REF_BOX_NONE2]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) {{.*}} : (!fir.ref<!fir.box<none>>, i64, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
+! CHECK: %{{.*}} = fir.call @_FortranAAllocatableAllocate(%[[A_REF_BOX_NONE2]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) {{.*}} : (!fir.ref<!fir.box<none>>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
 
 subroutine array_scalar_mold_allocation()
   real, allocatable :: a(:)
@@ -40,4 +40,4 @@ end subroutine array_scalar_mold_allocation
 ! CHECK: %[[REF_BOX_A1:.*]] = fir.convert %1 : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> !fir.ref<!fir.box<none>>
 ! CHECK: %{{.*}} = fir.call @_FortranAAllocatableSetBounds(%[[REF_BOX_A1]], {{.*}},{{.*}}, {{.*}}) fastmath<contract> : (!fir.ref<!fir.box<none>>, i32, i64, i64) -> none
 ! CHECK: %[[REF_BOX_A2:.*]] = fir.convert %[[A]] : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> !fir.ref<!fir.box<none>>
-! CHECK: %{{.*}} = fir.call @_FortranAAllocatableAllocate(%[[REF_BOX_A2]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) {{.*}} : (!fir.ref<!fir.box<none>>, i64, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
+! CHECK: %{{.*}} = fir.call @_FortranAAllocatableAllocate(%[[REF_BOX_A2]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) {{.*}} : (!fir.ref<!fir.box<none>>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
diff --git a/flang/test/Lower/array-substring.f90 b/flang/test/Lower/array-substring.f90
index 0210103..7544fbb 100644
--- a/flang/test/Lower/array-substring.f90
+++ b/flang/test/Lower/array-substring.f90
@@ -24,9 +24,8 @@
 ! CHECK:         %[[VAL_16:.*]] = fir.array_coor %[[VAL_7]](%[[VAL_9]]) {{\[}}%[[VAL_10]]] %[[VAL_15]] : (!fir.ref<!fir.array<1x!fir.char<1,12>>>, !fir.shape<1>, !fir.slice<1>, index) -> !fir.ref<!fir.char<1,12>>
 ! CHECK:         %[[VAL_17:.*]] = fir.convert %[[VAL_16]] : (!fir.ref<!fir.char<1,12>>) -> !fir.ref<!fir.array<12x!fir.char<1>>>
 ! CHECK:         %[[VAL_18:.*]] = fir.coordinate_of %[[VAL_17]], %[[VAL_2]] : (!fir.ref<!fir.array<12x!fir.char<1>>>, index) -> !fir.ref<!fir.char<1>>
-! CHECK:         %[[VAL_19:.*]] = fir.convert %[[VAL_18]] : (!fir.ref<!fir.char<1>>) -> !fir.ref<!fir.char<1,?>>
 ! CHECK:         %[[VAL_20:.*]] = fir.array_coor %[[VAL_11]](%[[VAL_9]]) %[[VAL_15]] : (!fir.ref<!fir.array<1x!fir.char<1,8>>>, !fir.shape<1>, index) -> !fir.ref<!fir.char<1,8>>
-! CHECK:         %[[VAL_21:.*]] = fir.convert %[[VAL_19]] : (!fir.ref<!fir.char<1,?>>) -> !fir.ref<i8>
+! CHECK:         %[[VAL_21:.*]] = fir.convert %[[VAL_18]] : (!fir.ref<!fir.char<1>>) -> !fir.ref<i8>
 ! CHECK:         %[[VAL_22:.*]] = fir.convert %[[VAL_20]] : (!fir.ref<!fir.char<1,8>>) -> !fir.ref<i8>
 ! CHECK:         %[[VAL_23:.*]] = fir.convert %[[VAL_4]] : (index) -> i64
 ! CHECK:         %[[VAL_24:.*]] = fir.call @_FortranACharacterCompareScalar1(%[[VAL_21]], %[[VAL_22]], %[[VAL_23]], %[[VAL_23]]) {{.*}}: (!fir.ref<i8>, !fir.ref<i8>, i64, i64) -> i32
diff --git a/flang/test/Lower/fsave-main-program.f90 b/flang/test/Lower/fsave-main-program.f90
new file mode 100644
index 0000000..17fc1b0
--- /dev/null
+++ b/flang/test/Lower/fsave-main-program.f90
@@ -0,0 +1,10 @@
+! Test -fsave-main-program switch.
+! RUN: %flang_fc1 -emit-hlfir -o - %s | FileCheck --check-prefix=CHECK-DEFAULT %s
+! RUN: %flang_fc1 -fsave-main-program -emit-hlfir -o - %s | FileCheck --check-prefix=CHECK-SAVE %s
+program test
+integer :: i
+call foo(i)
+end
+
+!CHECK-DEFAULT-NOT: fir.global internal @_QFEi
+!CHECK-SAVE: fir.global internal @_QFEi
diff --git a/flang/test/Lower/polymorphic.f90 b/flang/test/Lower/polymorphic.f90
index 8c212ce..8c40c91 100644
--- a/flang/test/Lower/polymorphic.f90
+++ b/flang/test/Lower/polymorphic.f90
@@ -1154,11 +1154,11 @@ end program
 ! CHECK-LABEL: func.func @_QQmain() attributes {fir.bindc_name = "test"} {
 ! CHECK: %[[ADDR_O:.*]] = fir.address_of(@_QFEo) : !fir.ref<!fir.box<!fir.heap<!fir.type<_QMpolymorphic_testTouter{inner:!fir.type<_QMpolymorphic_testTp1{a:i32,b:i32}>}>>>>
 ! CHECK: %[[BOX_NONE:.*]] = fir.convert %[[ADDR_O]] : (!fir.ref<!fir.box<!fir.heap<!fir.type<_QMpolymorphic_testTouter{inner:!fir.type<_QMpolymorphic_testTp1{a:i32,b:i32}>}>>>>) -> !fir.ref<!fir.box<none>>
-! CHECK: %{{.*}} = fir.call @_FortranAAllocatableAllocate(%[[BOX_NONE]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) {{.*}} : (!fir.ref<!fir.box<none>>, i64, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
+! CHECK: %{{.*}} = fir.call @_FortranAAllocatableAllocate(%[[BOX_NONE]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) {{.*}} : (!fir.ref<!fir.box<none>>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
 ! CHECK: %[[O:.*]] = fir.load %[[ADDR_O]] : !fir.ref<!fir.box<!fir.heap<!fir.type<_QMpolymorphic_testTouter{inner:!fir.type<_QMpolymorphic_testTp1{a:i32,b:i32}>}>>>>
 ! CHECK: %[[FIELD_INNER:.*]] = fir.field_index inner, !fir.type<_QMpolymorphic_testTouter{inner:!fir.type<_QMpolymorphic_testTp1{a:i32,b:i32}>}>
 ! CHECK: %[[COORD_INNER:.*]] = fir.coordinate_of %[[O]], %[[FIELD_INNER]] : (!fir.box<!fir.heap<!fir.type<_QMpolymorphic_testTouter{inner:!fir.type<_QMpolymorphic_testTp1{a:i32,b:i32}>}>>>, !fir.field) -> !fir.ref<!fir.type<_QMpolymorphic_testTp1{a:i32,b:i32}>>
-! CHECK: %{{.*}} = fir.do_loop %{{.*}} = %{{.*}} to %{{.*}} step %{{.*}} unordered iter_args(%arg1 = %{{.*}}) -> (!fir.array<5x!fir.logical<4>>) {
+! CHECK: %{{.*}} = fir.do_loop %{{.*}} = %{{.*}} to %{{.*}} step %{{.*}} unordered iter_args(%arg1 = %9) -> (!fir.array<5x!fir.logical<4>>) {
 ! CHECK:   %[[EMBOXED:.*]] = fir.embox %[[COORD_INNER]] : (!fir.ref<!fir.type<_QMpolymorphic_testTp1{a:i32,b:i32}>>) -> !fir.class<!fir.type<_QMpolymorphic_testTp1{a:i32,b:i32}>>
-! CHECK:   %{{.*}} = fir.call @_QMpolymorphic_testPlt(%{{.*}}, %[[EMBOXED]]) {{.*}} : (!fir.ref<i32>, !fir.class<!fir.type<_QMpolymorphic_testTp1{a:i32,b:i32}>>) -> !fir.logical<4>
+! CHECK:   %{{.*}} = fir.call @_QMpolymorphic_testPlt(%17, %[[EMBOXED]]) {{.*}} : (!fir.ref<i32>, !fir.class<!fir.type<_QMpolymorphic_testTp1{a:i32,b:i32}>>) -> !fir.logical<4>
 ! CHECK:  }
diff --git a/flang/test/Lower/vector-subscript-io.f90 b/flang/test/Lower/vector-subscript-io.f90
index 372130f..9a041af 100644
--- a/flang/test/Lower/vector-subscript-io.f90
+++ b/flang/test/Lower/vector-subscript-io.f90
@@ -325,12 +325,11 @@ subroutine substring(x, y, i, j)
 ! CHECK:   %[[VAL_230:.*]] = arith.subi %[[VAL_216]], %[[VAL_210]] : index
 ! CHECK:   %[[VAL_231:.*]] = fir.convert %[[VAL_228]] : (!fir.ref<!fir.char<1,?>>) -> !fir.ref<!fir.array<?x!fir.char<1>>>
 ! CHECK:   %[[VAL_232:.*]] = fir.coordinate_of %[[VAL_231]], %[[VAL_230]] : (!fir.ref<!fir.array<?x!fir.char<1>>>, index) -> !fir.ref<!fir.char<1>>
-! CHECK:   %[[VAL_233:.*]] = fir.convert %[[VAL_232]] : (!fir.ref<!fir.char<1>>) -> !fir.ref<!fir.char<1,?>>
 ! CHECK:   %[[VAL_234:.*]] = arith.subi %[[VAL_219]], %[[VAL_216]] : index
 ! CHECK:   %[[VAL_235:.*]] = arith.addi %[[VAL_234]], %[[VAL_210]] : index
 ! CHECK:   %[[VAL_236:.*]] = arith.cmpi slt, %[[VAL_235]], %[[VAL_209]] : index
 ! CHECK:   %[[VAL_237:.*]] = arith.select %[[VAL_236]], %[[VAL_209]], %[[VAL_235]] : index
-! CHECK:   %[[VAL_238:.*]] = fir.convert %[[VAL_233]] : (!fir.ref<!fir.char<1,?>>) -> !fir.ref<i8>
+! CHECK:   %[[VAL_238:.*]] = fir.convert %[[VAL_232]] : (!fir.ref<!fir.char<1>>) -> !fir.ref<i8>
 ! CHECK:   %[[VAL_239:.*]] = fir.convert %[[VAL_237]] : (index) -> i64
 ! CHECK:   %[[VAL_240:.*]] = fir.call @_FortranAioInputAscii(%[[VAL_213]], %[[VAL_238]], %[[VAL_239]]) {{.*}}: (!fir.ref<i8>, !fir.ref<i8>, i64) -> i1
 ! CHECK:   %[[VAL_241:.*]] = arith.addi %[[VAL_221]], %[[VAL_210]] overflow<nsw> : index
diff --git a/flang/test/Parser/OpenMP/allocate-align-tree.f90 b/flang/test/Parser/OpenMP/allocate-align-tree.f90
new file mode 100644
index 0000000..8cb009d
--- /dev/null
+++ b/flang/test/Parser/OpenMP/allocate-align-tree.f90
@@ -0,0 +1,42 @@
+! REQUIRES: openmp_runtime
+
+! RUN: %flang_fc1 %openmp_flags -fopenmp-version=51 -fdebug-dump-parse-tree %s | FileCheck %s
+! RUN: %flang_fc1 %openmp_flags -fdebug-unparse -fopenmp-version=51 %s | FileCheck %s --check-prefix="UNPARSE"
+! Ensures associated declarative OMP allocations are nested in their
+! corresponding executable allocate directive
+
+program allocate_align_tree
+    use omp_lib
+    integer, allocatable :: j(:), xarray(:)
+    integer :: z, t
+    t = 2
+    z = 3
+!$omp allocate(j) align(16)
+!$omp allocate(xarray) align(32) allocator(omp_large_cap_mem_alloc)
+    allocate(j(z), xarray(t))
+end program allocate_align_tree
+
+!CHECK: | | DeclarationConstruct -> SpecificationConstruct -> TypeDeclarationStmt
+!CHECK-NEXT: | | | DeclarationTypeSpec -> IntrinsicTypeSpec -> IntegerTypeSpec ->
+!CHECK-NEXT: | | | AttrSpec -> Allocatable
+!CHECK-NEXT: | | | EntityDecl
+!CHECK-NEXT: | | | | Name = 'j'
+
+
+!CHECK: | | ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OpenMPExecutableAllocate
+!CHECK-NEXT: | | | Verbatim
+!CHECK-NEXT: | | | OmpObjectList -> OmpObject -> Designator -> DataRef -> Name = 'xarray'
+!CHECK-NEXT: | | | OmpClauseList -> OmpClause -> Align -> OmpAlignClause -> Scalar -> Integer -> Expr = '32_4'
+!CHECK-NEXT: | | | | LiteralConstant -> IntLiteralConstant = '32'
+!CHECK-NEXT: | | | OmpClause -> Allocator -> Scalar -> Integer -> Expr = '2_8'
+!CHECK-NEXT: | | | | Designator -> DataRef -> Name = 'omp_large_cap_mem_alloc'
+!CHECK-NEXT: | | | OpenMPDeclarativeAllocate
+!CHECK-NEXT: | | | | Verbatim
+!CHECK-NEXT: | | | | OmpObjectList -> OmpObject -> Designator -> DataRef -> Name = 'j'
+!CHECK-NEXT: | | | | OmpClauseList -> OmpClause -> Align -> OmpAlignClause -> Scalar -> Integer -> Expr = '16_4'
+!CHECK-NEXT: | | | | | LiteralConstant -> IntLiteralConstant = '16'
+!CHECK-NEXT: | | | AllocateStmt
+
+!UNPARSE: !$OMP ALLOCATE (j) ALIGN(16_4)
+!UNPARSE: !$OMP ALLOCATE (xarray) ALIGN(32_4) ALLOCATOR(2_8)
+!UNPARSE-NEXT: ALLOCATE(j(z), xarray(t))
diff --git a/flang/test/Parser/OpenMP/allocate-unparse.f90 b/flang/test/Parser/OpenMP/allocate-unparse.f90
index 81b3677..94bc2ad 100644
--- a/flang/test/Parser/OpenMP/allocate-unparse.f90
+++ b/flang/test/Parser/OpenMP/allocate-unparse.f90
@@ -5,7 +5,7 @@ program allocate_unparse
 use omp_lib
 
 real, dimension (:,:), allocatable :: darray
-integer :: a, b, m, n, t, x, y, z
+integer :: a, b, j, m, n, t, x, y, z
 
 ! 2.11.3 declarative allocate
 
@@ -25,6 +25,7 @@ integer :: a, b, m, n, t, x, y, z
 !$omp allocate(z) allocator(omp_default_mem_alloc)
 !$omp allocate(m) allocator(omp_default_mem_alloc)
 !$omp allocate(n)
+!$omp allocate(j) align(16)
     allocate ( darray(z, t) )
 
 end program allocate_unparse
@@ -41,4 +42,5 @@ end program allocate_unparse
 !CHECK:!$OMP ALLOCATE (z) ALLOCATOR(omp_default_mem_alloc)
 !CHECK:!$OMP ALLOCATE (m) ALLOCATOR(omp_default_mem_alloc)
 !CHECK:!$OMP ALLOCATE (n)
+!CHECK:!$OMP ALLOCATE (j) ALIGN(16)
 !CHECK:ALLOCATE(darray(z,t))
diff --git a/flang/test/Parser/OpenMP/compiler-directive-continuation.f90 b/flang/test/Parser/OpenMP/compiler-directive-continuation.f90
new file mode 100644
index 0000000..87e4a72
--- /dev/null
+++ b/flang/test/Parser/OpenMP/compiler-directive-continuation.f90
@@ -0,0 +1,44 @@
+! RUN: %flang_fc1 -fopenmp -E %s 2>&1 | FileCheck %s --check-prefix=CHECK-OMP
+! RUN: %flang_fc1 -E %s 2>&1 | FileCheck %s 
+
+
+! Test in mixed way, i.e., combination of Fortran free source form 
+! and free source form with conditional compilation sentinel.
+! CHECK-LABEL: subroutine mixed_form1()
+! CHECK-OMP: i = 1 +100+ 1000+ 10 + 1 +1000000000 + 1000000
+! CHECK: i = 1 + 10 + 10000 + 1000000
+subroutine mixed_form1()
+   i = 1 &
+  !$+100&
+  !$&+ 1000&
+   &+ 10 + 1&
+  !$& +100000&
+   &0000 + 1000000
+end subroutine	
+
+
+! Testing continuation lines in only Fortran Free form Source
+! CHECK-LABEL: subroutine mixed_form2()
+! CHECK-OMP: i = 1 +10 +100 + 1000 + 10000
+! CHECK: i = 1 +10 +100 + 1000 + 10000
+subroutine mixed_form2()
+   i = 1 &
+   +10 &
+   &+100
+   & + 1000 &
+   + 10000
+end subroutine
+
+
+! Testing continuation line in only free source form conditional compilation sentinel.
+! CHECK-LABEL: subroutine mixed_form3()
+! CHECK-OMP: i=0
+! CHECK-OMP: i = 1 +10 +100+1000
+subroutine mixed_form3()
+   !$ i=0
+   !$ i = 1 &
+   !$ & +10 &
+   !$&+100&
+   !$ +1000 
+end subroutine
+
diff --git a/flang/test/Parser/OpenMP/error-unparse.f90 b/flang/test/Parser/OpenMP/error-unparse.f90
index fce5d8c..2cb4e1a 100644
--- a/flang/test/Parser/OpenMP/error-unparse.f90
+++ b/flang/test/Parser/OpenMP/error-unparse.f90
@@ -1,23 +1,27 @@
-! RUN: %flang_fc1  -fopenmp-version=51 -fopenmp -fdebug-unparse-no-sema %s 2>&1 | FileCheck %s
-! RUN: %flang_fc1  -fopenmp-version=51 -fopenmp -fdebug-dump-parse-tree-no-sema %s 2>&1 | FileCheck %s --check-prefix="PARSE-TREE"
+! RUN: %flang_fc1  -fopenmp-version=51 -fopenmp -fdebug-unparse %s 2>&1 | FileCheck %s
+! RUN: %flang_fc1  -fopenmp-version=51 -fopenmp -fdebug-dump-parse-tree %s 2>&1 | FileCheck %s --check-prefix="PARSE-TREE"
 program main
   character(*), parameter :: message = "This is an error"
   !CHECK: !$OMP ERROR AT(COMPILATION) SEVERITY(WARNING) MESSAGE("some message here")
-  !PARSE-TREE: ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OpenMPErrorConstruct
+  !PARSE-TREE: ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OpenMPUtilityConstruct -> OmpErrorDirective
   !PARSE-TREE: OmpClauseList -> OmpClause -> At -> OmpAtClause -> ActionTime = Compilation
   !PARSE-TREE: OmpClause -> Severity -> OmpSeverityClause -> Severity = Warning
-  !PARSE-TREE:  OmpClause -> Message -> OmpMessageClause -> Expr -> LiteralConstant -> CharLiteralConstant
+  !PARSE-TREE:  OmpClause -> Message -> OmpMessageClause -> Expr = '"some message here"'
+  !PARSE-TREE:  LiteralConstant -> CharLiteralConstant
+  !PARSE-TREE:  string = 'some message here'
   !$omp error at(compilation) severity(warning) message("some message here")
-  !CHECK: !$OMP ERROR AT(COMPILATION) SEVERITY(FATAL) MESSAGE(message)
-  !PARSE-TREE: ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OpenMPErrorConstruct
+  !CHECK: !$OMP ERROR AT(COMPILATION) SEVERITY(FATAL) MESSAGE("This is an error")
+  !PARSE-TREE: ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OpenMPUtilityConstruct -> OmpErrorDirective
   !PARSE-TREE: OmpClauseList -> OmpClause -> At -> OmpAtClause -> ActionTime = Compilation
   !PARSE-TREE: OmpClause -> Severity -> OmpSeverityClause -> Severity = Fatal
-  !PARSE-TREE:  OmpClause -> Message -> OmpMessageClause -> Expr -> Designator -> DataRef -> Name = 'message'
+  !PARSE-TREE:  OmpClause -> Message -> OmpMessageClause -> Expr = '"This is an error"'
+  !PARSE-TREE:  Designator -> DataRef -> Name = 'message'
   !$omp error at(compilation) severity(fatal) message(message)
-  !CHECK: !$OMP ERROR AT(EXECUTION) SEVERITY(FATAL) MESSAGE(message)
-  !PARSE-TREE: ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OpenMPErrorConstruct
+  !CHECK: !$OMP ERROR AT(EXECUTION) SEVERITY(FATAL) MESSAGE("This is an error")
+  !PARSE-TREE: ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OpenMPUtilityConstruct -> OmpErrorDirective
   !PARSE-TREE: OmpClauseList -> OmpClause -> At -> OmpAtClause -> ActionTime = Execution
   !PARSE-TREE: OmpClause -> Severity -> OmpSeverityClause -> Severity = Fatal
-  !PARSE-TREE:  OmpClause -> Message -> OmpMessageClause -> Expr -> Designator ->  DataRef -> Name = 'message'
+  !PARSE-TREE:  OmpClause -> Message -> OmpMessageClause -> Expr = '"This is an error"'
+  !PARSE-TREE:  Designator ->  DataRef -> Name = 'message'
   !$omp error at(EXECUTION) severity(fatal) message(message)
 end program main
diff --git a/flang/test/Parser/OpenMP/nothing.f90 b/flang/test/Parser/OpenMP/nothing.f90
new file mode 100644
index 0000000..22558c4
--- /dev/null
+++ b/flang/test/Parser/OpenMP/nothing.f90
@@ -0,0 +1,113 @@
+!RUN: %flang_fc1 -fdebug-unparse -fopenmp -fopenmp-version=51 %s | FileCheck --ignore-case --check-prefix="UNPARSE" %s
+!RUN: %flang_fc1 -fdebug-dump-parse-tree -fopenmp -fopenmp-version=51 %s | FileCheck --check-prefix="PARSE-TREE" %s
+
+subroutine f00
+  !$omp nothing
+end
+
+!UNPARSE: SUBROUTINE f00
+!UNPARSE:  !$OMP NOTHING
+!UNPARSE: END SUBROUTINE
+
+!PARSE-TREE: ExecutionPart -> Block
+!PARSE-TREE: | | ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OpenMPUtilityConstruct -> OmpNothingDirective
+
+subroutine f01
+  block
+  import, none
+  integer :: x
+  !$omp nothing   ! "nothing" in the execution part
+  x = x+1
+  end block
+end
+
+!UNPARSE: SUBROUTINE f01
+!UNPARSE:  BLOCK
+!UNPARSE:   IMPORT, NONE
+!UNPARSE:   INTEGER x
+!UNPARSE:   !$OMP NOTHING
+!UNPARSE:    x=x+1_4
+!UNPARSE:  END BLOCK
+!UNPARSE: END SUBROUTINE
+
+!PARSE-TREE: BlockStmt ->
+!PARSE-TREE: BlockSpecificationPart -> SpecificationPart
+!PARSE-TREE: | ImportStmt
+!PARSE-TREE: | ImplicitPart ->
+!PARSE-TREE: | DeclarationConstruct -> SpecificationConstruct -> TypeDeclarationStmt
+!PARSE-TREE: | | DeclarationTypeSpec -> IntrinsicTypeSpec -> IntegerTypeSpec ->
+!PARSE-TREE: | | EntityDecl
+!PARSE-TREE: | | | Name = 'x'
+!PARSE-TREE: Block
+!PARSE-TREE: | ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OpenMPUtilityConstruct -> OmpNothingDirective
+!PARSE-TREE: | ExecutionPartConstruct -> ExecutableConstruct -> ActionStmt -> AssignmentStmt = 'x=x+1_4'
+!PARSE-TREE: | | Variable = 'x'
+!PARSE-TREE: | | | Designator -> DataRef -> Name = 'x'
+!PARSE-TREE: | | Expr = 'x+1_4'
+!PARSE-TREE: | | | Add
+!PARSE-TREE: | | | | Expr = 'x'
+!PARSE-TREE: | | | | | Designator -> DataRef -> Name = 'x'
+!PARSE-TREE: | | | | Expr = '1_4'
+!PARSE-TREE: | | | | | LiteralConstant -> IntLiteralConstant = '1'
+!PARSE-TREE: EndBlockStmt ->
+
+subroutine f02
+  integer :: x
+  !$omp nothing
+end
+
+!UNPARSE: SUBROUTINE f02
+!UNPARSE:  INTEGER x
+!UNPARSE:  !$OMP NOTHING
+!UNPARSE: END SUBROUTINE
+
+!PARSE-TREE: SpecificationPart
+!PARSE-TREE: | ImplicitPart ->
+!PARSE-TREE: | DeclarationConstruct -> SpecificationConstruct -> TypeDeclarationStmt
+!PARSE-TREE: | | DeclarationTypeSpec -> IntrinsicTypeSpec -> IntegerTypeSpec ->
+!PARSE-TREE: | | EntityDecl
+!PARSE-TREE: | | | Name = 'x'
+!PARSE-TREE: ExecutionPart -> Block
+!PARSE-TREE: | ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OpenMPUtilityConstruct -> OmpNothingDirective
+
+subroutine f03
+  block
+  !$omp nothing   ! "nothing" in the specification part
+  import, none
+  integer :: x
+  x = x+1
+  end block
+end
+
+!UNPARSE: SUBROUTINE f03
+!UNPARSE:  BLOCK
+!UNPARSE:   !$OMP NOTHING
+!UNPARSE:   IMPORT, NONE
+!UNPARSE:   INTEGER x
+!UNPARSE:    x=x+1_4
+!UNPARSE:  END BLOCK
+!UNPARSE: END SUBROUTINE
+
+!PARSE-TREE: ExecutionPart -> Block
+!PARSE-TREE: | ExecutionPartConstruct -> ExecutableConstruct -> BlockConstruct
+!PARSE-TREE: | | BlockStmt ->
+!PARSE-TREE: | | BlockSpecificationPart -> SpecificationPart
+!PARSE-TREE: | | | OpenMPDeclarativeConstruct -> OpenMPUtilityConstruct -> OmpNothingDirective
+!PARSE-TREE: | | | ImportStmt
+!PARSE-TREE: | | | ImplicitPart ->
+!PARSE-TREE: | | | DeclarationConstruct -> SpecificationConstruct -> TypeDeclarationStmt
+!PARSE-TREE: | | | | DeclarationTypeSpec -> IntrinsicTypeSpec -> IntegerTypeSpec ->
+!PARSE-TREE: | | | | EntityDecl
+!PARSE-TREE: | | | | | Name = 'x'
+!PARSE-TREE: | | Block
+!PARSE-TREE: | | | ExecutionPartConstruct -> ExecutableConstruct -> ActionStmt -> AssignmentStmt = 'x=x+1_4'
+!PARSE-TREE: | | | | Variable = 'x'
+!PARSE-TREE: | | | | | Designator -> DataRef -> Name = 'x'
+!PARSE-TREE: | | | | Expr = 'x+1_4'
+!PARSE-TREE: | | | | | Add
+!PARSE-TREE: | | | | | | Expr = 'x'
+!PARSE-TREE: | | | | | | | Designator -> DataRef -> Name = 'x'
+!PARSE-TREE: | | | | | | Expr = '1_4'
+!PARSE-TREE: | | | | | | | LiteralConstant -> IntLiteralConstant = '1'
+!PARSE-TREE: | | EndBlockStmt ->
+!PARSE-TREE: EndSubroutineStmt ->
+\ No newline at end of file
diff --git a/flang/test/Parser/at-process.f b/flang/test/Parser/at-process.f
index 41b9504..4f54c6b 100644
--- a/flang/test/Parser/at-process.f
+++ b/flang/test/Parser/at-process.f
@@ -1,4 +1,4 @@
-! RUN: %flang_fc1 -fsyntax-only %s 2>&1 | FileCheck %s
+! RUN: not %flang_fc1 -fsyntax-only %s 2>&1 | FileCheck %s
 
 ! Test ignoring @PROCESS directive in fixed source form
 
@@ -18,3 +18,5 @@ c@process
 
 !CHECK: Character in fixed-form label field must be a digit
 @precoss 
+
+!CHECK: at-process.f:14:1: error: parser FAIL (final position)
diff --git a/flang/test/Parser/unparseable.f90 b/flang/test/Parser/unparseable.f90
new file mode 100644
index 0000000..9e7a890
--- /dev/null
+++ b/flang/test/Parser/unparseable.f90
@@ -0,0 +1,5 @@
+! RUN: not %flang_fc1 -fsyntax-only %s 2>&1 | FileCheck %s
+! CHECK: unparseable.f90:5:1: error: parser FAIL (final position)
+module m
+end
+select type (barf)
diff --git a/flang/test/Preprocessing/bug129131.F b/flang/test/Preprocessing/bug129131.F
new file mode 100644
index 0000000..5b1a914
--- /dev/null
+++ b/flang/test/Preprocessing/bug129131.F
@@ -0,0 +1,5 @@
+! RUN: %flang -fc1 -fdebug-unparse %s 2>&1 | FileCheck %s
+! CHECK: PRINT *, 2_4
+#define a ,3
+      print *, mod(5 a)
+      end
diff --git a/flang/test/Semantics/OpenMP/allocate-align01.f90 b/flang/test/Semantics/OpenMP/allocate-align01.f90
new file mode 100644
index 0000000..ba0776c
--- /dev/null
+++ b/flang/test/Semantics/OpenMP/allocate-align01.f90
@@ -0,0 +1,20 @@
+! REQUIRES: openmp_runtime
+
+! RUN: %python %S/../test_errors.py %s %flang_fc1 %openmp_flags -fopenmp-version=51
+! OpenMP Version 5.2
+! The allocate clause's allocator modifier must be of type allocator_handle
+! and the align modifier must be constant, positive integer expression
+
+program allocate_align_tree
+    use omp_lib
+    integer, allocatable :: j(:), xarray(:)
+    integer :: z, t, xx
+    t = 2
+    z = 3
+    !ERROR: The alignment value should be a constant positive integer
+!$omp allocate(j) align(xx)
+    !ERROR: The alignment value should be a constant positive integer
+!$omp allocate(xarray) align(-32) allocator(omp_large_cap_mem_alloc)
+    allocate(j(z), xarray(t))
+end program allocate_align_tree
+
diff --git a/flang/test/Semantics/OpenMP/error.f90 b/flang/test/Semantics/OpenMP/error.f90
new file mode 100644
index 0000000..067417a
--- /dev/null
+++ b/flang/test/Semantics/OpenMP/error.f90
@@ -0,0 +1,8 @@
+!RUN: %python %S/../test_errors.py %s %flang -fopenmp -fopenmp-version=51
+
+subroutine f00(x)
+!ERROR: The ERROR directive with AT(EXECUTION) cannot appear in the specification part
+  !$omp error at(execution) message("Haaa!")
+  integer :: x
+end
+
diff --git a/flang/test/Semantics/assign16.f90 b/flang/test/Semantics/assign16.f90
new file mode 100644
index 0000000..2e65829f
--- /dev/null
+++ b/flang/test/Semantics/assign16.f90
@@ -0,0 +1,46 @@
+! RUN: %python %S/test_errors.py %s %flang_fc1
+! The RHS of a pointer assignment can be unlimited polymorphic
+! if the LHS is a sequence type.
+program main
+  type nonSeqType
+    integer j
+  end type
+  type seqType
+    sequence
+    integer j
+  end type
+  type(nonSeqType), target :: xNonSeq = nonSeqType(1)
+  type(nonSeqType), pointer :: pNonSeq
+  type(seqType), target :: xSeq = seqType(1), aSeq(1)
+  type(seqType), pointer :: pSeq, paSeq(:)
+  !ERROR: function result type 'CLASS(*)' is not compatible with pointer type 'nonseqtype'
+  pNonSeq => polyPtr(xNonSeq)
+  pSeq => polyPtr(xSeq) ! ok
+  !ERROR: Pointer has rank 1 but target has rank 0
+  paSeq => polyPtr(xSeq)
+  !ERROR: Pointer has rank 0 but target has rank 1
+  pSeq => polyPtrArr(aSeq)
+ contains
+  function polyPtr(target)
+    class(*), intent(in), target :: target
+    class(*), pointer :: polyPtr
+    polyPtr => target
+  end
+  function polyPtrArr(target)
+    class(*), intent(in), target :: target(:)
+    class(*), pointer :: polyPtrArr(:)
+    polyPtrArr => target
+  end
+  function err1(target)
+    class(*), intent(in), target :: target(:)
+    class(*), pointer :: err1
+    !ERROR: Pointer has rank 0 but target has rank 1
+    err1 => target
+  end
+  function err2(target)
+    class(*), intent(in), target :: target
+    class(*), pointer :: err2(:)
+    !ERROR: Pointer has rank 1 but target has rank 0
+    err2 => target
+  end
+end
diff --git a/flang/test/Semantics/bug121718.f90 b/flang/test/Semantics/bug121718.f90
new file mode 100644
index 0000000..e99391f
--- /dev/null
+++ b/flang/test/Semantics/bug121718.f90
@@ -0,0 +1,31 @@
+! RUN: %flang_fc1 2>&1 | FileCheck %s --allow-empty
+! CHECK-NOT: error
+! Regression test simplified from LLVM bug 121718.
+! Ensure no crash and no spurious error message.
+module m1
+  type foo
+    integer x
+  end type
+ contains
+  subroutine test
+    print *, foo(123)
+  end
+end
+module m2
+  interface foo
+    procedure f
+  end interface
+  type foo
+    real x
+  end type
+ contains
+  complex function f(x)
+    complex, intent(in) :: x
+    f = x
+  end
+end
+program main
+  use m1
+  use m2
+  call test
+end
diff --git a/flang/test/Semantics/call04.f90 b/flang/test/Semantics/call04.f90
index 6877f9c..9be579f 100644
--- a/flang/test/Semantics/call04.f90
+++ b/flang/test/Semantics/call04.f90
@@ -21,10 +21,14 @@ module m
   subroutine s01a(x)
     real, allocatable, intent(out) :: x(:)
   end subroutine
+  subroutine s01c(x)
+    real, intent(out) :: x(:)
+  end subroutine
   subroutine s01b ! C846 - can only be caught at a call via explicit interface
     !ERROR: ALLOCATABLE coarray 'coarray' may not be associated with INTENT(OUT) dummy argument 'x='
     !ERROR: ALLOCATABLE dummy argument 'x=' has corank 0 but actual argument has corank 1
     call s01a(coarray)
+    call s01c(coarray) ! ok, dummy is not allocatable
   end subroutine
 
   subroutine s02(x) ! C846
diff --git a/flang/test/Semantics/cuf07.cuf b/flang/test/Semantics/cuf07.cuf
index c48abb5..56b2164 100644
--- a/flang/test/Semantics/cuf07.cuf
+++ b/flang/test/Semantics/cuf07.cuf
@@ -28,7 +28,7 @@ module m
     integer, allocatable, device :: ia(:)
     logical :: plog
 
-    !ERROR: Object in ALLOCATE must have PINNED attribute when PINNED option is specified
+    !WARNING: Object in ALLOCATE should have PINNED attribute when PINNED option is specified
     allocate(ia(100), pinned = plog)
   end subroutine
 
diff --git a/flang/test/Semantics/cuf09.cuf b/flang/test/Semantics/cuf09.cuf
index 3307e2a..06c9070 100644
--- a/flang/test/Semantics/cuf09.cuf
+++ b/flang/test/Semantics/cuf09.cuf
@@ -54,6 +54,59 @@ module m
     print*,threadIdx%x
     stop ! ok
   end subroutine
+
+  attributes(global) subroutine cycletest()
+    integer :: i
+    do i = 1, 10
+      cycle ! ok
+    end do
+  end subroutine
+
+  attributes(global) subroutine gototest()
+    integer :: i
+    goto 10
+    10 print *, "X is negative!" 
+  end subroutine
+
+  attributes(global) subroutine exittest()
+    integer :: i
+    do i = 1, 10
+      if (i == 1) then
+        exit ! ok
+      end if
+    end do
+  end subroutine
+
+  attributes(global) subroutine selectcasetest()
+    integer :: i
+    select case(i)
+    case (1)
+      print*,'main'
+    case default
+      print*, 'default'
+    end select
+  end subroutine
+
+  subroutine host()
+    integer :: i
+    !$cuf kernel do
+    do i = 1, 10
+      !ERROR: Statement may not appear in cuf kernel code
+      cycle
+    end do
+
+    !$cuf kernel do
+    do i = 1, 10
+      if (i == 1) then
+        !ERROR: Statement may not appear in cuf kernel code
+        exit ! ok
+      end if
+
+      !ERROR: Statement may not appear in cuf kernel code
+      goto 10
+      10 print *, "X is negative!"
+    end do
+  end subroutine
 end
 
 program main
diff --git a/flang/test/Semantics/cuf10.cuf b/flang/test/Semantics/cuf10.cuf
index 047503b..24b596b 100644
--- a/flang/test/Semantics/cuf10.cuf
+++ b/flang/test/Semantics/cuf10.cuf
@@ -2,6 +2,7 @@
 module m
   real, device :: a(4,8)
   real, managed, allocatable :: b(:,:)
+  integer, constant :: x = 1
  contains
   attributes(global) subroutine kernel(a,b,c,n,m)
     integer, value :: n
@@ -23,4 +24,10 @@ module m
       call devsub(c,4) ! not checked in OpenACC construct
     end do
   end
+  attributes(global) subroutine sub1(x)
+    integer :: x
+  end
+  subroutine sub2()
+    call sub1<<<1,1>>>(x) ! actual constant to device dummy
+  end
 end
diff --git a/flang/test/Semantics/definable01.f90 b/flang/test/Semantics/definable01.f90
index ff71b41..d3b31ee 100644
--- a/flang/test/Semantics/definable01.f90
+++ b/flang/test/Semantics/definable01.f90
@@ -109,7 +109,29 @@ module m
   end
   pure subroutine test7(lp)
     type(list), pointer :: lp
-    !CHECK-NOT: error:
-    lp%next%next => null()
+    lp%next%next => null() ! ok
   end
 end module
+program main
+  use iso_fortran_env, only: lock_type
+  type(lock_type) lock
+  interface
+    subroutine inlock(lock)
+      import lock_type
+      type(lock_type), intent(in) :: lock
+    end
+    subroutine outlock(lock)
+      import lock_type
+      !CHECK: error: An INTENT(OUT) dummy argument may not be, or contain, EVENT_TYPE or LOCK_TYPE
+      type(lock_type), intent(out) :: lock
+    end
+    subroutine inoutlock(lock)
+      import lock_type
+      type(lock_type), intent(in out) :: lock
+    end
+  end interface
+  call inlock(lock) ! ok
+  call inoutlock(lock) ! ok
+  !CHECK: error: Actual argument associated with INTENT(OUT) dummy argument 'lock=' is not definable
+  call outlock(lock)
+end
diff --git a/flang/test/Semantics/get_team.f90 b/flang/test/Semantics/get_team.f90
index 7e48867..a5b49a8 100644
--- a/flang/test/Semantics/get_team.f90
+++ b/flang/test/Semantics/get_team.f90
@@ -10,6 +10,8 @@ program get_team_test
   type(team_type) :: result_team
   logical wrong_result_type, non_integer
 
+  result_team = team_type()
+
   !___ standard-conforming statement with no optional arguments present ___
   result_team = get_team()
 
diff --git a/flang/test/Semantics/io07.f90 b/flang/test/Semantics/io07.f90
index 64a32c9..a013849 100644
--- a/flang/test/Semantics/io07.f90
+++ b/flang/test/Semantics/io07.f90
@@ -68,10 +68,10 @@
 6001 format(((I0, B0)))
 
      !ERROR: 'A' edit descriptor 'w' value must be positive
-     !ERROR: 'L' edit descriptor 'w' value must be positive
+     !WARNING: 'L' edit descriptor 'w' value should be positive
 6101 format((A0), ((L0)))
 
-     !ERROR: 'L' edit descriptor 'w' value must be positive
+     !WARNING: 'L' edit descriptor 'w' value should be positive
 6102 format((3(((L 0 0 0)))))
 
 7001 format(17G8.1, 17G8.1e3)
diff --git a/flang/test/Semantics/io08.f90 b/flang/test/Semantics/io08.f90
index f6038b4..517984f 100644
--- a/flang/test/Semantics/io08.f90
+++ b/flang/test/Semantics/io08.f90
@@ -192,8 +192,7 @@
   !ERROR: 'A' edit descriptor 'w' value must be positive
   write(*,'(A0)')
 
-  !ERROR: 'L' edit descriptor 'w' value must be positive
-  write(*,'(L0)')
+  write(*,'(L0)') ! warning, not error
 
   !ERROR: Expected 'G' edit descriptor '.d' value
   write(*,'(G4)')
diff --git a/flang/test/Semantics/lcobound.f90 b/flang/test/Semantics/lcobound.f90
index ce2f001..f03f2cae 100644
--- a/flang/test/Semantics/lcobound.f90
+++ b/flang/test/Semantics/lcobound.f90
@@ -11,6 +11,9 @@ program lcobound_tests
   logical non_integer, logical_coarray[3,*]
   logical, parameter :: const_non_integer = .true.
   integer, allocatable :: lcobounds(:)
+  real bounded[2:3,4:5,*]
+
+  integer(kind=merge(kind(1),-1,all(lcobound(bounded)==[2,4,1]))) test_lcobound
 
   !___ standard-conforming statement with no optional arguments present ___
   lcobounds = lcobound(scalar_coarray)
@@ -50,28 +53,28 @@ program lcobound_tests
 
   !___ non-conforming statements ___
 
-  !ERROR: DIM=0 dimension is out of range for coarray with corank 1
+  !ERROR: DIM=0 dimension must be positive
   n = lcobound(scalar_coarray, dim=0)
 
-  !ERROR: DIM=0 dimension is out of range for coarray with corank 3
+  !ERROR: DIM=0 dimension must be positive
   n = lcobound(coarray_corank3, dim=0)
 
-  !ERROR: DIM=-1 dimension is out of range for coarray with corank 1
+  !ERROR: DIM=-1 dimension must be positive
   n = lcobound(scalar_coarray, dim=-1)
 
-  !ERROR: DIM=2 dimension is out of range for coarray with corank 1
+  !ERROR: DIM=2 dimension is out of range for corank-1 coarray
   n = lcobound(array_coarray, dim=2)
 
-  !ERROR: DIM=2 dimension is out of range for coarray with corank 1
+  !ERROR: DIM=2 dimension is out of range for corank-1 coarray
   n = lcobound(array_coarray, 2)
 
-  !ERROR: DIM=4 dimension is out of range for coarray with corank 3
+  !ERROR: DIM=4 dimension is out of range for corank-3 coarray
   n = lcobound(coarray_corank3, dim=4)
 
-  !ERROR: DIM=4 dimension is out of range for coarray with corank 3
+  !ERROR: DIM=4 dimension is out of range for corank-3 coarray
   n = lcobound(dim=4, coarray=coarray_corank3)
 
-  !ERROR: DIM=5 dimension is out of range for coarray with corank 3
+  !ERROR: DIM=5 dimension is out of range for corank-3 coarray
   n = lcobound(coarray_corank3, const_out_of_range_dim)
 
   !ERROR: No intrinsic or user-defined ASSIGNMENT(=) matches scalar INTEGER(4) and rank 1 array of INTEGER(4)
diff --git a/flang/test/Semantics/resolve94.f90 b/flang/test/Semantics/resolve94.f90
index e47ab4a..19c06ad 100644
--- a/flang/test/Semantics/resolve94.f90
+++ b/flang/test/Semantics/resolve94.f90
@@ -17,8 +17,15 @@ subroutine s1()
   intCoVar = 343
   ! OK
   rVar1 = rCoarray[1,2,3]
+  associate (x => rCoarray)
+    rVar1 = x[1,2,3] ! also ok
+  end associate
   !ERROR: 'rcoarray' has corank 3, but coindexed reference has 2 cosubscripts
   rVar1 = rCoarray[1,2]
+  associate (x => rCoarray)
+  !ERROR: 'x' has corank 3, but coindexed reference has 2 cosubscripts
+    rVar1 = x[1,2]
+  end associate
   !ERROR: Must have INTEGER type, but is REAL(4)
   rVar1 = rCoarray[1,2,3.4]
   !ERROR: Must have INTEGER type, but is REAL(4)
diff --git a/flang/test/Semantics/this_image01.f90 b/flang/test/Semantics/this_image01.f90
index 0e59aa3..fdcccda 100644
--- a/flang/test/Semantics/this_image01.f90
+++ b/flang/test/Semantics/this_image01.f90
@@ -8,6 +8,8 @@ subroutine test
   type(team_type) :: coteam[*]
   integer :: coscalar[*], coarray(3)[*]
   save :: coteam, coscalar, coarray
+  real coarray1[*], coarray2[2,*], coarray3[2,3,*]
+  integer indices(3)
 
   ! correct calls, should produce no errors
   team = get_team()
@@ -17,6 +19,10 @@ subroutine test
   print *, this_image(coarray, team)
   print *, this_image(coarray, 1)
   print *, this_image(coarray, 1, team)
+  print *, this_image(coarray(1))
+  print *, this_image(coarray(1), team)
+  print *, this_image(coarray(1), 1)
+  print *, this_image(coarray(1), 1, team)
   print *, this_image(coscalar)
   print *, this_image(coscalar, team)
   print *, this_image(coscalar, 1)
@@ -28,4 +34,14 @@ subroutine test
   print *, team_number()
   print *, team_number(team)
 
+  indices(1:1) = this_image(coarray1) ! ok
+  indices(1:2) = this_image(coarray2) ! ok
+  indices(1:3) = this_image(coarray3) ! ok
+  !ERROR: Dimension 1 of left-hand side has extent 2, but right-hand side has extent 1
+  indices(1:2) = this_image(coarray1)
+  !ERROR: Dimension 1 of left-hand side has extent 3, but right-hand side has extent 2
+  indices(1:3) = this_image(coarray2)
+  !ERROR: Dimension 1 of left-hand side has extent 1, but right-hand side has extent 3
+  indices(1:1) = this_image(coarray3)
+
 end subroutine
diff --git a/flang/test/Semantics/ucobound.f90 b/flang/test/Semantics/ucobound.f90
index f9da11a0..d84c80c 100644
--- a/flang/test/Semantics/ucobound.f90
+++ b/flang/test/Semantics/ucobound.f90
@@ -11,6 +11,9 @@ program ucobound_tests
   logical non_integer, logical_coarray[3,*]
   logical, parameter :: const_non_integer = .true.
   integer, allocatable :: ucobounds(:)
+  real bounded[2:3,4:5,*]
+
+  integer(kind=merge(kind(1),-1,ucobound(bounded,1)==3.and.ucobound(bounded,2)==5)) test_ucobound
 
   !___ standard-conforming statement with no optional arguments present ___
   ucobounds = ucobound(scalar_coarray)
@@ -50,28 +53,28 @@ program ucobound_tests
 
   !___ non-conforming statements ___
 
-  !ERROR: DIM=0 dimension is out of range for coarray with corank 1
+  !ERROR: DIM=0 dimension must be positive
   n = ucobound(scalar_coarray, dim=0)
 
-  !ERROR: DIM=0 dimension is out of range for coarray with corank 3
+  !ERROR: DIM=0 dimension must be positive
   n = ucobound(coarray_corank3, dim=0)
 
-  !ERROR: DIM=-1 dimension is out of range for coarray with corank 1
+  !ERROR: DIM=-1 dimension must be positive
   n = ucobound(scalar_coarray, dim=-1)
 
-  !ERROR: DIM=2 dimension is out of range for coarray with corank 1
+  !ERROR: DIM=2 dimension is out of range for corank-1 coarray
   n = ucobound(array_coarray, dim=2)
 
-  !ERROR: DIM=2 dimension is out of range for coarray with corank 1
+  !ERROR: DIM=2 dimension is out of range for corank-1 coarray
   n = ucobound(array_coarray, 2)
 
-  !ERROR: DIM=4 dimension is out of range for coarray with corank 3
+  !ERROR: DIM=4 dimension is out of range for corank-3 coarray
   n = ucobound(coarray_corank3, dim=4)
 
-  !ERROR: DIM=4 dimension is out of range for coarray with corank 3
+  !ERROR: DIM=4 dimension is out of range for corank-3 coarray
   n = ucobound(dim=4, coarray=coarray_corank3)
 
-  !ERROR: DIM=5 dimension is out of range for coarray with corank 3
+  !ERROR: DIM=5 dimension is out of range for corank-3 coarray
   n = ucobound(coarray_corank3, const_out_of_range_dim)
 
   !ERROR: No intrinsic or user-defined ASSIGNMENT(=) matches scalar INTEGER(4) and rank 1 array of INTEGER(4)
diff --git a/flang/test/Transforms/omp-map-info-finalization.fir b/flang/test/Transforms/omp-map-info-finalization.fir
index 74b8715..19e6dca 100644
--- a/flang/test/Transforms/omp-map-info-finalization.fir
+++ b/flang/test/Transforms/omp-map-info-finalization.fir
@@ -35,7 +35,7 @@ func.func @test_descriptor_expansion_pass(%arg0: !fir.box<!fir.array<?xi32>>) {
 // CHECK: %[[DESC_PARENT_MAP:.*]] = omp.map.info var_ptr(%[[DECLARE2]]#1 : !fir.ref<!fir.box<!fir.heap<i32>>>, !fir.box<!fir.heap<i32>>) map_clauses(to) capture(ByRef) members(%[[DESC_MEMBER_MAP]] : [0] : !fir.llvm_ptr<!fir.ref<i32>>) -> !fir.ref<!fir.box<!fir.heap<i32>>>
 // CHECK: fir.store %[[DECLARE1]]#1 to %[[ALLOCA]] : !fir.ref<!fir.box<!fir.array<?xi32>>>
 // CHECK: %[[BASE_ADDR_OFF_2:.*]] = fir.box_offset %[[ALLOCA]] base_addr : (!fir.ref<!fir.box<!fir.array<?xi32>>>) -> !fir.llvm_ptr<!fir.ref<!fir.array<?xi32>>>
-// CHECK: %[[DESC_MEMBER_MAP_2:.*]] = omp.map.info var_ptr(%[[ALLOCA]] : !fir.ref<!fir.box<!fir.array<?xi32>>>, !fir.array<?xi32>) var_ptr_ptr(%[[BASE_ADDR_OFF_2]] : !fir.llvm_ptr<!fir.ref<!fir.array<?xi32>>>) map_clauses(from) capture(ByRef) bounds(%[[BOUNDS]]) -> !fir.llvm_ptr<!fir.ref<!fir.array<?xi32>>> {name = ""}
+// CHECK: %[[DESC_MEMBER_MAP_2:.*]] = omp.map.info var_ptr(%[[ALLOCA]] : !fir.ref<!fir.box<!fir.array<?xi32>>>, i32) var_ptr_ptr(%[[BASE_ADDR_OFF_2]] : !fir.llvm_ptr<!fir.ref<!fir.array<?xi32>>>) map_clauses(from) capture(ByRef) bounds(%[[BOUNDS]]) -> !fir.llvm_ptr<!fir.ref<!fir.array<?xi32>>> {name = ""}
 // CHECK: %[[DESC_PARENT_MAP_2:.*]] = omp.map.info var_ptr(%[[ALLOCA]] : !fir.ref<!fir.box<!fir.array<?xi32>>>, !fir.box<!fir.array<?xi32>>) map_clauses(to) capture(ByRef) members(%[[DESC_MEMBER_MAP_2]] : [0] : !fir.llvm_ptr<!fir.ref<!fir.array<?xi32>>>) -> !fir.ref<!fir.array<?xi32>>
 // CHECK: omp.target map_entries(%[[DESC_PARENT_MAP]] -> %[[ARG1:.*]], %[[DESC_PARENT_MAP_2]] -> %[[ARG2:.*]], %[[DESC_MEMBER_MAP]] -> %[[ARG3:.*]], %[[DESC_MEMBER_MAP_2]] -> %[[ARG4:.*]] : {{.*}}) {
 
@@ -115,7 +115,7 @@ func.func @dtype_alloca_op_block_add(%arg0: !fir.ref<!fir.type<_QFtest_derived_t
 // CHECK:   %[[BOUNDS:.*]] = omp.map.bounds lower_bound(%{{.*}} : index) upper_bound(%{{.*}} : index) extent(%{{.*}} : index) stride(%{{.*}} : index) start_idx(%{{.*}} : index) {stride_in_bytes = true}
 // CHECK:   %[[MEMBER_COORD:.*]] = fir.coordinate_of %[[ALLOCA]]#0, %{{.*}} : (!fir.ref<[[REC_TY]]>>, index) -> !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>
 // CHECK:   %[[MEMBER_BASE_ADDR:.*]] = fir.box_offset %[[MEMBER_COORD:.*]] base_addr : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>) -> !fir.llvm_ptr<!fir.ref<!fir.array<?xi32>>>
-// CHECK:   %[[MAP_MEMBER_BASE_ADDR:.*]] = omp.map.info var_ptr(%[[MEMBER_COORD]] : !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>, !fir.array<?xi32>) var_ptr_ptr(%[[MEMBER_BASE_ADDR]] : !fir.llvm_ptr<!fir.ref<!fir.array<?xi32>>>) map_clauses(tofrom) capture(ByRef) bounds(%[[BOUNDS]]) -> !fir.llvm_ptr<!fir.ref<!fir.array<?xi32>>> {{.*}}
+// CHECK:   %[[MAP_MEMBER_BASE_ADDR:.*]] = omp.map.info var_ptr(%[[MEMBER_COORD]] : !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>, i32) var_ptr_ptr(%[[MEMBER_BASE_ADDR]] : !fir.llvm_ptr<!fir.ref<!fir.array<?xi32>>>) map_clauses(tofrom) capture(ByRef) bounds(%[[BOUNDS]]) -> !fir.llvm_ptr<!fir.ref<!fir.array<?xi32>>> {{.*}}
 // CHECK:   %[[MAP_MEMBER_DESCRIPTOR:.*]] = omp.map.info var_ptr(%[[MEMBER_COORD]] : !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>, !fir.box<!fir.heap<!fir.array<?xi32>>>) map_clauses(to) capture(ByRef) -> !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>> {name = "one_l%array_j"}
 // CHECK:   %[[MAP_MEMBER_PARENT:.*]] = omp.map.info var_ptr(%[[ALLOCA]]#0 : !fir.ref<[[REC_TY]]>>, [[REC_TY]]>) map_clauses(tofrom) capture(ByRef) members(%10, %9 : [4], [4, 0] : !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>, !fir.llvm_ptr<!fir.ref<!fir.array<?xi32>>>) -> !fir.ref<[[REC_TY]]>> {{.*}}
 // CHECK:    omp.target map_entries(%[[MAP_MEMBER_PARENT]] -> %[[ARG1:.*]], %[[MAP_MEMBER_DESCRIPTOR]] -> %[[ARG2:.*]], %[[MAP_MEMBER_BASE_ADDR]] -> %[[ARG3:.*]] : !fir.ref<[[REC_TY]]>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>, !fir.llvm_ptr<!fir.ref<!fir.array<?xi32>>>) {
@@ -157,7 +157,7 @@ func.func @alloca_dtype_map_op_block_add(%arg0 : !fir.ref<!fir.box<!fir.heap<!fi
 // CHECK:     %[[LOAD_ALLOCA:.*]] = fir.load %[[ALLOCA]]#0 : !fir.ref<!fir.box<!fir.heap<!fir.type<[[REC_TY]]>>>>
 // CHECK:     %[[ALLOCATABLE_MEMBER_COORD:.*]] = fir.coordinate_of %[[LOAD_ALLOCA]], %{{.*}} : (!fir.box<!fir.heap<!fir.type<[[REC_TY]]>>>, index) -> !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>
 // CHECK:     %[[ALLOCATABLE_MEMBER_BASE_ADDR:.*]] = fir.box_offset %[[ALLOCATABLE_MEMBER_COORD]] base_addr : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>) -> !fir.llvm_ptr<!fir.ref<!fir.array<?xi32>>>
-// CHECK:     %[[MAP_ALLOCA_MEMBER_BASE_ADDR:.*]] = omp.map.info var_ptr(%[[ALLOCATABLE_MEMBER_COORD]] : !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>, !fir.array<?xi32>) var_ptr_ptr(%[[ALLOCATABLE_MEMBER_BASE_ADDR]] : !fir.llvm_ptr<!fir.ref<!fir.array<?xi32>>>) map_clauses(tofrom) capture(ByRef) bounds(%[[BOUNDS]]) -> !fir.llvm_ptr<!fir.ref<!fir.array<?xi32>>> {{.*}}
+// CHECK:     %[[MAP_ALLOCA_MEMBER_BASE_ADDR:.*]] = omp.map.info var_ptr(%[[ALLOCATABLE_MEMBER_COORD]] : !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>, i32) var_ptr_ptr(%[[ALLOCATABLE_MEMBER_BASE_ADDR]] : !fir.llvm_ptr<!fir.ref<!fir.array<?xi32>>>) map_clauses(tofrom) capture(ByRef) bounds(%[[BOUNDS]]) -> !fir.llvm_ptr<!fir.ref<!fir.array<?xi32>>> {{.*}}
 // CHECK:     %[[MAP_ALLOCA_MEMBER_DESCRIPTOR:.*]] = omp.map.info var_ptr(%[[ALLOCATABLE_MEMBER_COORD]] : !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>, !fir.box<!fir.heap<!fir.array<?xi32>>>) map_clauses(to) capture(ByRef) -> !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>> {{.*}}
 // CHECK:     %[[LOAD_ALLOCA2:.*]] = fir.load %[[ALLOCA]]#0 : !fir.ref<!fir.box<!fir.heap<!fir.type<[[REC_TY]]>>>>
 // CHECK:     %[[REGULAR_MEMBER_COORD:.*]] = fir.coordinate_of %[[LOAD_ALLOCA2]], %{{.*}} : (!fir.box<!fir.heap<!fir.type<[[REC_TY]]>>>, index) -> !fir.ref<i32>
@@ -208,7 +208,7 @@ func.func @alloca_dtype_map_op_block_add(%arg0 : !fir.ref<!fir.box<!fir.heap<!fi
 // CHECK:   %[[INTERMEDIATE_DTYPE_NESTED_MEMBER:.*]] = fir.coordinate_of %[[ALLOCA_LOAD]], %{{.*}} : (!fir.box<!fir.heap<!fir.type<[[REC_TY]]>>>, index) -> !fir.ref<!fir.type<[[REC_TY2:_QFtest_alloca_nested_derived_type_map_operand_and_block_additionTmiddle_layer{i:f32,array_i:!fir.array<10xi32>,array_k:!fir.box<!fir.heap<!fir.array<\?xi32>>>,k:i32}]]>>
 // CHECK:   %[[NESTED_ALLOCA_MEMBER:.*]] = fir.coordinate_of %[[INTERMEDIATE_DTYPE_NESTED_MEMBER]], %{{.*}} : (!fir.ref<!fir.type<[[REC_TY2]]>>, index) -> !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>
 // CHECK:   %[[NESTED_ALLOCA_MEMBER_BASE_ADDR:.*]] = fir.box_offset %[[NESTED_ALLOCA_MEMBER]] base_addr : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>) -> !fir.llvm_ptr<!fir.ref<!fir.array<?xi32>>>
-// CHECK:   %[[MAP_NESTED_ALLOCA_MEMBER_BASE_ADDR:.*]] = omp.map.info var_ptr(%[[NESTED_ALLOCA_MEMBER]] : !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>, !fir.array<?xi32>) var_ptr_ptr(%[[NESTED_ALLOCA_MEMBER_BASE_ADDR]] : !fir.llvm_ptr<!fir.ref<!fir.array<?xi32>>>) map_clauses(tofrom) capture(ByRef) bounds(%[[BOUNDS]]) -> !fir.llvm_ptr<!fir.ref<!fir.array<?xi32>>> {{.*}}
+// CHECK:   %[[MAP_NESTED_ALLOCA_MEMBER_BASE_ADDR:.*]] = omp.map.info var_ptr(%[[NESTED_ALLOCA_MEMBER]] : !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>, i32) var_ptr_ptr(%[[NESTED_ALLOCA_MEMBER_BASE_ADDR]] : !fir.llvm_ptr<!fir.ref<!fir.array<?xi32>>>) map_clauses(tofrom) capture(ByRef) bounds(%[[BOUNDS]]) -> !fir.llvm_ptr<!fir.ref<!fir.array<?xi32>>> {{.*}}
 // CHECK:   %[[MAP_NESTED_ALLOCA_MEMBER:.*]] = omp.map.info var_ptr(%[[NESTED_ALLOCA_MEMBER]] : !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>, !fir.box<!fir.heap<!fir.array<?xi32>>>) map_clauses(to) capture(ByRef) -> !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>> {{.*}}
 // CHECK:   %[[ALLOCA_LOAD2:.*]] = fir.load %[[ALLOCA]]#0 : !fir.ref<!fir.box<!fir.heap<!fir.type<[[REC_TY]]>>>>
 // CHECK:   %[[INTERMEDIATE_DTYPE_NESTED_MEMBER2:.*]] = fir.coordinate_of %[[ALLOCA_LOAD2]], %{{.*}} : (!fir.box<!fir.heap<!fir.type<[[REC_TY]]>>>, index) -> !fir.ref<!fir.type<[[REC_TY2]]>>
@@ -252,7 +252,7 @@ func.func @alloca_dtype_map_op_block_add(%arg0 : !fir.ref<!fir.box<!fir.heap<!fi
 // CHECK:   %[[NESTED_DTYPE_COORD:.*]] = fir.coordinate_of %[[ALLOCA]]#0, %{{.*}} : (!fir.ref<!fir.type<[[REC_TY]]>>, index) -> !fir.ref<!fir.type<[[REC_TY2:_QFtest_nested_derived_type_alloca_map_operand_and_block_additionTmiddle_layer{i:f32,array_i:!fir.array<10xi32>,array_k:!fir.box<!fir.heap<!fir.array<\?xi32>>>,k:i32}]]>>
 // CHECK:   %[[ALLOCATABLE_MEMBER:.*]] = fir.coordinate_of %[[NESTED_DTYPE_COORD]], %{{.*}} : (!fir.ref<!fir.type<[[REC_TY2]]>>, index) -> !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>
 // CHECK:   %[[ALLOCATABLE_MEMBER_BASE_ADDR:.*]] = fir.box_offset %[[ALLOCATABLE_MEMBER]] base_addr : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>) -> !fir.llvm_ptr<!fir.ref<!fir.array<?xi32>>>
-// CHECK:   %[[MAP_ALLOCATABLE_MEMBER_BASE_ADDR:.*]] = omp.map.info var_ptr(%[[ALLOCATABLE_MEMBER]] : !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>, !fir.array<?xi32>) var_ptr_ptr(%[[ALLOCATABLE_MEMBER_BASE_ADDR]] : !fir.llvm_ptr<!fir.ref<!fir.array<?xi32>>>) map_clauses(tofrom) capture(ByRef) bounds(%[[BOUNDS]]) -> !fir.llvm_ptr<!fir.ref<!fir.array<?xi32>>> {{.*}}
+// CHECK:   %[[MAP_ALLOCATABLE_MEMBER_BASE_ADDR:.*]] = omp.map.info var_ptr(%[[ALLOCATABLE_MEMBER]] : !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>, i32) var_ptr_ptr(%[[ALLOCATABLE_MEMBER_BASE_ADDR]] : !fir.llvm_ptr<!fir.ref<!fir.array<?xi32>>>) map_clauses(tofrom) capture(ByRef) bounds(%[[BOUNDS]]) -> !fir.llvm_ptr<!fir.ref<!fir.array<?xi32>>> {{.*}}
 // CHECK:   %[[MAP_ALLOCATABLE_MEMBER_DESCRIPTOR:.*]] = omp.map.info var_ptr(%[[ALLOCATABLE_MEMBER]] : !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>, !fir.box<!fir.heap<!fir.array<?xi32>>>) map_clauses(to) capture(ByRef) -> !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>> {{.*}}
 // CHECK:   %[[MAP_PARENT:.*]] = omp.map.info var_ptr(%[[ALLOCA]]#0 : !fir.ref<!fir.type<[[REC_TY]]>>, !fir.type<[[REC_TY]]>) map_clauses(tofrom) capture(ByRef) members(%12, %11 : [6, 2], [6, 2, 0] : !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>, !fir.llvm_ptr<!fir.ref<!fir.array<?xi32>>>) -> !fir.ref<!fir.type<[[REC_TY]]>> {{.*}}
 // CHECK:   omp.target map_entries(%[[MAP_PARENT]] -> %[[ARG1:.*]], %[[MAP_ALLOCATABLE_MEMBER_DESCRIPTOR]] -> %[[ARG2:.*]], %[[MAP_ALLOCATABLE_MEMBER_BASE_ADDR]] -> %[[ARG3:.*]] :  !fir.ref<!fir.type<[[REC_TY]]>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>, !fir.llvm_ptr<!fir.ref<!fir.array<?xi32>>>) {
@@ -286,13 +286,13 @@ func.func @alloca_dtype_map_op_block_add(%arg0 : !fir.ref<!fir.box<!fir.heap<!fi
 // CHECK:    %[[DECLARE:.*]]:2 = hlfir.declare %[[ARG0]] {{.*}} : (!fir.ref<!fir.type<[[REC_TY]]>>) -> (!fir.ref<!fir.type<[[REC_TY]]>>, !fir.ref<!fir.type<[[REC_TY]]>>)
 // CHECK:    %[[DESC_1:.*]] = fir.coordinate_of %[[DECLARE]]#0, %{{.*}} : (!fir.ref<!fir.type<[[REC_TY]]>>, index) -> !fir.ref<!fir.box<!fir.heap<!fir.array<?x!fir.type<[[REC_TY2:_QFmaptype_nested_derived_type_member_idxTvertexes{test:i32,vertexx:!fir.box<!fir.heap<!fir.array<\?xi32>>>,vertexy:!fir.box<!fir.heap<!fir.array<\?xi32>>>}]]>>>>>
 // CHECK:    %[[BASE_ADDR_1:.*]] = fir.box_offset %[[DESC_1]] base_addr : (!fir.ref<!fir.box<!fir.heap<!fir.array<?x!fir.type<[[REC_TY2]]>>>>>) -> !fir.llvm_ptr<!fir.ref<!fir.array<?x!fir.type<[[REC_TY2]]>>>>
-// CHECK:    %[[BASE_ADDR_MAP_1:.*]] = omp.map.info var_ptr(%[[DESC_1]] : !fir.ref<!fir.box<!fir.heap<!fir.array<?x!fir.type<[[REC_TY2]]>>>>>, !fir.array<?x!fir.type<[[REC_TY2]]>>) var_ptr_ptr(%[[BASE_ADDR_1]] : !fir.llvm_ptr<!fir.ref<!fir.array<?x!fir.type<[[REC_TY2]]>>>>) map_clauses(exit_release_or_enter_alloc) capture(ByRef) bounds(%{{.*}}) -> !fir.llvm_ptr<!fir.ref<!fir.array<?x!fir.type<[[REC_TY2]]>>>> {{.*}}
+// CHECK:    %[[BASE_ADDR_MAP_1:.*]] = omp.map.info var_ptr(%[[DESC_1]] : !fir.ref<!fir.box<!fir.heap<!fir.array<?x!fir.type<[[REC_TY2]]>>>>>, !fir.type<[[REC_TY2]]>) var_ptr_ptr(%[[BASE_ADDR_1]] : !fir.llvm_ptr<!fir.ref<!fir.array<?x!fir.type<[[REC_TY2]]>>>>) map_clauses(exit_release_or_enter_alloc) capture(ByRef) bounds(%{{.*}}) -> !fir.llvm_ptr<!fir.ref<!fir.array<?x!fir.type<[[REC_TY2]]>>>> {{.*}}
 // CHECK:    %[[DESC_MAP_1:.*]] = omp.map.info var_ptr(%[[DESC_1]] : !fir.ref<!fir.box<!fir.heap<!fir.array<?x!fir.type<[[REC_TY2]]>>>>>, !fir.box<!fir.heap<!fir.array<?x!fir.type<[[REC_TY2]]>>>>) map_clauses(to) capture(ByRef) -> !fir.ref<!fir.box<!fir.heap<!fir.array<?x!fir.type<[[REC_TY2]]>>>>> {{.*}}
 // CHECK:    %[[DESC_LD_1:.*]] = fir.load %[[DESC_1]] : !fir.ref<!fir.box<!fir.heap<!fir.array<?x!fir.type<[[REC_TY2]]>>>>>
 // CHECK:    %[[MEMBER_ACCESS_1:.*]] = fir.coordinate_of %[[DESC_LD_1]], %{{.*}} : (!fir.box<!fir.heap<!fir.array<?x!fir.type<[[REC_TY2]]>>>>, index) -> !fir.ref<!fir.type<[[REC_TY2]]>>
 // CHECK:    %[[DESC_2:.*]] = fir.coordinate_of %[[MEMBER_ACCESS_1]], %{{.*}} : (!fir.ref<!fir.type<[[REC_TY2]]>>, index) -> !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>
 // CHECK:    %[[BASE_ADDR_2:.*]] = fir.box_offset %[[DESC_2]] base_addr : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>) -> !fir.llvm_ptr<!fir.ref<!fir.array<?xi32>>>
-// CHECK:    %[[BASE_ADDR_MAP_2:.*]] = omp.map.info var_ptr(%[[DESC_2]] : !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>, !fir.array<?xi32>) var_ptr_ptr(%[[BASE_ADDR_2]] : !fir.llvm_ptr<!fir.ref<!fir.array<?xi32>>>) map_clauses(tofrom) capture(ByRef) bounds(%{{.*}}) -> !fir.llvm_ptr<!fir.ref<!fir.array<?xi32>>> {{.*}}
+// CHECK:    %[[BASE_ADDR_MAP_2:.*]] = omp.map.info var_ptr(%[[DESC_2]] : !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>, i32) var_ptr_ptr(%[[BASE_ADDR_2]] : !fir.llvm_ptr<!fir.ref<!fir.array<?xi32>>>) map_clauses(tofrom) capture(ByRef) bounds(%{{.*}}) -> !fir.llvm_ptr<!fir.ref<!fir.array<?xi32>>> {{.*}}
 // CHECK:    %[[DESC_MAP_2:.*]] = omp.map.info var_ptr(%[[DESC_2]] : !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>, !fir.box<!fir.heap<!fir.array<?xi32>>>) map_clauses(to) capture(ByRef) -> !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>> {{.*}}
 // CHECK:    %[[TOP_PARENT_MAP:.*]] = omp.map.info var_ptr(%0#1 : !fir.ref<!fir.type<[[REC_TY]]>>, !fir.type<[[REC_TY]]>) map_clauses(exit_release_or_enter_alloc) capture(ByRef) members(%6, %5, %14, %13 : [1], [1, 0], [1, 0, 2], [1, 0, 2, 0] : !fir.ref<!fir.box<!fir.heap<!fir.array<?x!fir.type<[[REC_TY2]]>>>>>, !fir.llvm_ptr<!fir.ref<!fir.array<?x!fir.type<[[REC_TY2]]>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>, !fir.llvm_ptr<!fir.ref<!fir.array<?xi32>>>) -> !fir.ref<!fir.type<[[REC_TY]]>> {{{.*}} partial_map = true}
 // CHECK:    omp.target map_entries(%[[TOP_PARENT_MAP]] -> %{{.*}}, %[[DESC_MAP_1]] -> %{{.*}}, %[[BASE_ADDR_MAP_1]] -> %{{.*}}, %[[DESC_MAP_2]] -> %{{.*}}, %[[BASE_ADDR_MAP_2]] -> %{{.*}} : !fir.ref<!fir.type<[[REC_TY]]>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?x!fir.type<[[REC_TY2]]>>>>>, !fir.llvm_ptr<!fir.ref<!fir.array<?x!fir.type<[[REC_TY2]]>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>, !fir.llvm_ptr<!fir.ref<!fir.array<?xi32>>>) {
diff --git a/flang/test/Transforms/stack-arrays.fir b/flang/test/Transforms/stack-arrays.fir
index 66cd2a5..444136d 100644
--- a/flang/test/Transforms/stack-arrays.fir
+++ b/flang/test/Transforms/stack-arrays.fir
@@ -379,7 +379,8 @@ func.func @placement_loop_declare() {
     %3 = arith.addi %c1, %c2 : index
     // operand is now available
     %4 = fir.allocmem !fir.array<?xi32>, %3
-    %5 = fir.declare %4 {uniq_name = "temp"} : (!fir.heap<!fir.array<?xi32>>) -> !fir.heap<!fir.array<?xi32>>
+    %shape = fir.shape %3 : (index) -> !fir.shape<1>
+    %5 = fir.declare %4(%shape) {uniq_name = "temp"} : (!fir.heap<!fir.array<?xi32>>, !fir.shape<1>) -> !fir.heap<!fir.array<?xi32>>
     // ...
     fir.freemem %5 : !fir.heap<!fir.array<?xi32>>
     fir.result %3, %c1_i32 : index, i32
@@ -400,3 +401,20 @@ func.func @placement_loop_declare() {
 // CHECK-NEXT:   }
 // CHECK-NEXT:   return
 // CHECK-NEXT: }
+
+// Can we look through fir.convert and fir.declare?
+func.func @lookthrough() {
+  %0 = fir.allocmem !fir.array<42xi32>
+  %c42 = arith.constant 42 : index
+  %shape = fir.shape %c42 : (index) -> !fir.shape<1>
+  %1 = fir.declare %0(%shape) {uniq_name = "name"} : (!fir.heap<!fir.array<42xi32>>, !fir.shape<1>) -> !fir.heap<!fir.array<42xi32>>
+  %2 = fir.convert %1 : (!fir.heap<!fir.array<42xi32>>) -> !fir.ref<!fir.array<42xi32>>
+  // use the ref so the converts aren't folded
+  %3 = fir.load %2 : !fir.ref<!fir.array<42xi32>>
+  %4 = fir.convert %2 : (!fir.ref<!fir.array<42xi32>>) -> !fir.heap<!fir.array<42xi32>>
+  fir.freemem %4 : !fir.heap<!fir.array<42xi32>>
+  return
+}
+// CHECK: func.func @lookthrough() {
+// CHECK:     fir.alloca !fir.array<42xi32>
+// CHECK-NOT: fir.freemem
diff --git a/flang/unittests/Frontend/CodeGenActionTest.cpp b/flang/unittests/Frontend/CodeGenActionTest.cpp
index 5d75de0..e9ff095 100644
--- a/flang/unittests/Frontend/CodeGenActionTest.cpp
+++ b/flang/unittests/Frontend/CodeGenActionTest.cpp
@@ -72,8 +72,7 @@ public:
     mlirCtx->loadDialect<test::DummyDialect>();
 
     mlir::Location loc(mlir::UnknownLoc::get(mlirCtx.get()));
-    mlirModule =
-        std::make_unique<mlir::ModuleOp>(mlir::ModuleOp::create(loc, "mod"));
+    mlirModule = mlir::ModuleOp::create(loc, "mod");
 
     mlir::OpBuilder builder(mlirCtx.get());
     builder.setInsertionPointToStart(&mlirModule->getRegion().front());
diff --git a/flang/unittests/Optimizer/Builder/CharacterTest.cpp b/flang/unittests/Optimizer/Builder/CharacterTest.cpp
index c6defcd..6d912b8 100644
--- a/flang/unittests/Optimizer/Builder/CharacterTest.cpp
+++ b/flang/unittests/Optimizer/Builder/CharacterTest.cpp
@@ -26,19 +26,20 @@ public:
 
     // Set up a Module with a dummy function operation inside.
     // Set the insertion point in the function entry block.
-    mlir::ModuleOp mod = builder.create<mlir::ModuleOp>(loc);
-    mlir::func::FuncOp func = mlir::func::FuncOp::create(
+    moduleOp = builder.create<mlir::ModuleOp>(loc);
+    builder.setInsertionPointToStart(moduleOp->getBody());
+    mlir::func::FuncOp func = builder.create<mlir::func::FuncOp>(
         loc, "func1", builder.getFunctionType(std::nullopt, std::nullopt));
     auto *entryBlock = func.addEntryBlock();
-    mod.push_back(mod);
     builder.setInsertionPointToStart(entryBlock);
 
-    firBuilder = std::make_unique<fir::FirOpBuilder>(mod, *kindMap);
+    firBuilder = std::make_unique<fir::FirOpBuilder>(builder, *kindMap);
   }
 
   fir::FirOpBuilder &getBuilder() { return *firBuilder; }
 
   mlir::MLIRContext context;
+  mlir::OwningOpRef<mlir::ModuleOp> moduleOp;
   std::unique_ptr<fir::KindMapping> kindMap;
   std::unique_ptr<fir::FirOpBuilder> firBuilder;
 };
diff --git a/flang/unittests/Optimizer/Builder/ComplexTest.cpp b/flang/unittests/Optimizer/Builder/ComplexTest.cpp
index 6472a52..eefab118 100644
--- a/flang/unittests/Optimizer/Builder/ComplexTest.cpp
+++ b/flang/unittests/Optimizer/Builder/ComplexTest.cpp
@@ -22,15 +22,15 @@ public:
 
     // Set up a Module with a dummy function operation inside.
     // Set the insertion point in the function entry block.
-    mlir::ModuleOp mod = builder.create<mlir::ModuleOp>(loc);
-    mlir::func::FuncOp func = mlir::func::FuncOp::create(
+    moduleOp = builder.create<mlir::ModuleOp>(loc);
+    builder.setInsertionPointToStart(moduleOp->getBody());
+    mlir::func::FuncOp func = builder.create<mlir::func::FuncOp>(
         loc, "func1", builder.getFunctionType(std::nullopt, std::nullopt));
     auto *entryBlock = func.addEntryBlock();
-    mod.push_back(mod);
     builder.setInsertionPointToStart(entryBlock);
 
     kindMap = std::make_unique<fir::KindMapping>(&context);
-    firBuilder = std::make_unique<fir::FirOpBuilder>(mod, *kindMap);
+    firBuilder = std::make_unique<fir::FirOpBuilder>(builder, *kindMap);
     helper = std::make_unique<fir::factory::Complex>(*firBuilder, loc);
 
     // Init commonly used types
@@ -46,6 +46,7 @@ public:
   }
 
   mlir::MLIRContext context;
+  mlir::OwningOpRef<mlir::ModuleOp> moduleOp;
   std::unique_ptr<fir::KindMapping> kindMap;
   std::unique_ptr<fir::FirOpBuilder> firBuilder;
   std::unique_ptr<fir::factory::Complex> helper;
diff --git a/flang/unittests/Optimizer/Builder/FIRBuilderTest.cpp b/flang/unittests/Optimizer/Builder/FIRBuilderTest.cpp
index f63afe4..05407d9 100644
--- a/flang/unittests/Optimizer/Builder/FIRBuilderTest.cpp
+++ b/flang/unittests/Optimizer/Builder/FIRBuilderTest.cpp
@@ -26,19 +26,20 @@ public:
 
     // Set up a Module with a dummy function operation inside.
     // Set the insertion point in the function entry block.
-    mlir::ModuleOp mod = builder.create<mlir::ModuleOp>(loc);
-    mlir::func::FuncOp func = mlir::func::FuncOp::create(
+    moduleOp = builder.create<mlir::ModuleOp>(loc);
+    builder.setInsertionPointToStart(moduleOp->getBody());
+    mlir::func::FuncOp func = builder.create<mlir::func::FuncOp>(
         loc, "func1", builder.getFunctionType(std::nullopt, std::nullopt));
     auto *entryBlock = func.addEntryBlock();
-    mod.push_back(mod);
     builder.setInsertionPointToStart(entryBlock);
 
-    firBuilder = std::make_unique<fir::FirOpBuilder>(mod, kindMap);
+    firBuilder = std::make_unique<fir::FirOpBuilder>(builder, kindMap);
   }
 
   fir::FirOpBuilder &getBuilder() { return *firBuilder; }
 
   mlir::MLIRContext context;
+  mlir::OwningOpRef<mlir::ModuleOp> moduleOp;
   std::unique_ptr<fir::FirOpBuilder> firBuilder;
 };
 
diff --git a/flang/unittests/Optimizer/Builder/HLFIRToolsTest.cpp b/flang/unittests/Optimizer/Builder/HLFIRToolsTest.cpp
index 1858b27..640b7ec 100644
--- a/flang/unittests/Optimizer/Builder/HLFIRToolsTest.cpp
+++ b/flang/unittests/Optimizer/Builder/HLFIRToolsTest.cpp
@@ -25,14 +25,14 @@ public:
 
     // Set up a Module with a dummy function operation inside.
     // Set the insertion point in the function entry block.
-    mlir::ModuleOp mod = builder.create<mlir::ModuleOp>(loc);
-    mlir::func::FuncOp func = mlir::func::FuncOp::create(
+    moduleOp = builder.create<mlir::ModuleOp>(loc);
+    builder.setInsertionPointToStart(moduleOp->getBody());
+    mlir::func::FuncOp func = builder.create<mlir::func::FuncOp>(
         loc, "func1", builder.getFunctionType(std::nullopt, std::nullopt));
     auto *entryBlock = func.addEntryBlock();
-    mod.push_back(mod);
     builder.setInsertionPointToStart(entryBlock);
 
-    firBuilder = std::make_unique<fir::FirOpBuilder>(mod, kindMap);
+    firBuilder = std::make_unique<fir::FirOpBuilder>(builder, kindMap);
   }
 
   mlir::Value createDeclare(fir::ExtendedValue exv) {
@@ -52,6 +52,7 @@ public:
 
   int varCounter = 0;
   mlir::MLIRContext context;
+  mlir::OwningOpRef<mlir::ModuleOp> moduleOp;
   std::unique_ptr<fir::FirOpBuilder> firBuilder;
 };
 
diff --git a/flang/unittests/Optimizer/Builder/Runtime/RuntimeCallTestBase.h b/flang/unittests/Optimizer/Builder/Runtime/RuntimeCallTestBase.h
index d0ec977..40abf567 100644
--- a/flang/unittests/Optimizer/Builder/Runtime/RuntimeCallTestBase.h
+++ b/flang/unittests/Optimizer/Builder/Runtime/RuntimeCallTestBase.h
@@ -24,16 +24,16 @@ public:
 
     // Set up a Module with a dummy function operation inside.
     // Set the insertion point in the function entry block.
-    mlir::ModuleOp mod = builder.create<mlir::ModuleOp>(loc);
+    moduleOp = builder.create<mlir::ModuleOp>(loc);
+    builder.setInsertionPointToStart(moduleOp->getBody());
     mlir::func::FuncOp func =
-        mlir::func::FuncOp::create(loc, "runtime_unit_tests_func",
+        builder.create<mlir::func::FuncOp>(loc, "runtime_unit_tests_func",
             builder.getFunctionType(std::nullopt, std::nullopt));
     auto *entryBlock = func.addEntryBlock();
-    mod.push_back(mod);
     builder.setInsertionPointToStart(entryBlock);
 
     kindMap = std::make_unique<fir::KindMapping>(&context);
-    firBuilder = std::make_unique<fir::FirOpBuilder>(mod, *kindMap);
+    firBuilder = std::make_unique<fir::FirOpBuilder>(builder, *kindMap);
 
     i1Ty = firBuilder->getI1Type();
     i8Ty = firBuilder->getI8Type();
@@ -66,6 +66,7 @@ public:
   }
 
   mlir::MLIRContext context;
+  mlir::OwningOpRef<mlir::ModuleOp> moduleOp;
   std::unique_ptr<fir::KindMapping> kindMap;
   std::unique_ptr<fir::FirOpBuilder> firBuilder;
 
diff --git a/flang/unittests/Optimizer/FortranVariableTest.cpp b/flang/unittests/Optimizer/FortranVariableTest.cpp
index 87efb62..4ba9359 100644
--- a/flang/unittests/Optimizer/FortranVariableTest.cpp
+++ b/flang/unittests/Optimizer/FortranVariableTest.cpp
@@ -19,12 +19,12 @@ public:
 
     // Set up a Module with a dummy function operation inside.
     // Set the insertion point in the function entry block.
-    mlir::ModuleOp mod = builder->create<mlir::ModuleOp>(loc);
+    moduleOp = builder->create<mlir::ModuleOp>(loc);
+    builder->setInsertionPointToStart(moduleOp->getBody());
     mlir::func::FuncOp func =
-        mlir::func::FuncOp::create(loc, "fortran_variable_tests",
+        builder->create<mlir::func::FuncOp>(loc, "fortran_variable_tests",
             builder->getFunctionType(std::nullopt, std::nullopt));
     auto *entryBlock = func.addEntryBlock();
-    mod.push_back(mod);
     builder->setInsertionPointToStart(entryBlock);
   }
 
@@ -40,6 +40,7 @@ public:
   }
   mlir::MLIRContext context;
   std::unique_ptr<mlir::OpBuilder> builder;
+  mlir::OwningOpRef<mlir::ModuleOp> moduleOp;
 };
 
 TEST_F(FortranVariableTest, SimpleScalar) {
diff --git a/flang/unittests/Runtime/ArrayConstructor.cpp b/flang/unittests/Runtime/ArrayConstructor.cpp
index 62e3b78..53774a0 100644
--- a/flang/unittests/Runtime/ArrayConstructor.cpp
+++ b/flang/unittests/Runtime/ArrayConstructor.cpp
@@ -127,6 +127,9 @@ TEST(ArrayConstructor, Character) {
       0);
   result.Deallocate();
   cookieAllocator.deallocate(acVector, 1);
+  x->Deallocate();
+  y->Deallocate();
+  c->Deallocate();
 }
 
 TEST(ArrayConstructor, CharacterRuntimeCheck) {
diff --git a/flang/unittests/Runtime/CUDA/Allocatable.cpp b/flang/unittests/Runtime/CUDA/Allocatable.cpp
index 171ca98..0f7eb27 100644
--- a/flang/unittests/Runtime/CUDA/Allocatable.cpp
+++ b/flang/unittests/Runtime/CUDA/Allocatable.cpp
@@ -42,8 +42,7 @@ TEST(AllocatableCUFTest, SimpleDeviceAllocatable) {
   CUDA_REPORT_IF_ERROR(cudaMalloc(&device_desc, a->SizeInBytes()));
 
   RTNAME(AllocatableAllocate)
-  (*a, /*asyncId=*/-1, /*hasStat=*/false, /*errMsg=*/nullptr, __FILE__,
-      __LINE__);
+  (*a, /*hasStat=*/false, /*errMsg=*/nullptr, __FILE__, __LINE__);
   EXPECT_TRUE(a->IsAllocated());
   RTNAME(CUFDescriptorSync)(device_desc, a.get(), __FILE__, __LINE__);
   cudaDeviceSynchronize();
diff --git a/flang/unittests/Runtime/CUDA/AllocatorCUF.cpp b/flang/unittests/Runtime/CUDA/AllocatorCUF.cpp
index 2cc49b6..7cb2578 100644
--- a/flang/unittests/Runtime/CUDA/AllocatorCUF.cpp
+++ b/flang/unittests/Runtime/CUDA/AllocatorCUF.cpp
@@ -35,25 +35,7 @@ TEST(AllocatableCUFTest, SimpleDeviceAllocate) {
   EXPECT_FALSE(a->HasAddendum());
   RTNAME(AllocatableSetBounds)(*a, 0, 1, 10);
   RTNAME(AllocatableAllocate)
-  (*a, /*asyncId=*/-1, /*hasStat=*/false, /*errMsg=*/nullptr, __FILE__,
-      __LINE__);
-  EXPECT_TRUE(a->IsAllocated());
-  RTNAME(AllocatableDeallocate)
   (*a, /*hasStat=*/false, /*errMsg=*/nullptr, __FILE__, __LINE__);
-  EXPECT_FALSE(a->IsAllocated());
-}
-
-TEST(AllocatableCUFTest, SimpleStreamDeviceAllocate) {
-  using Fortran::common::TypeCategory;
-  RTNAME(CUFRegisterAllocator)();
-  // REAL(4), DEVICE, ALLOCATABLE :: a(:)
-  auto a{createAllocatable(TypeCategory::Real, 4)};
-  a->SetAllocIdx(kDeviceAllocatorPos);
-  EXPECT_EQ((int)kDeviceAllocatorPos, a->GetAllocIdx());
-  EXPECT_FALSE(a->HasAddendum());
-  RTNAME(AllocatableSetBounds)(*a, 0, 1, 10);
-  RTNAME(AllocatableAllocate)
-  (*a, 1, /*hasStat=*/false, /*errMsg=*/nullptr, __FILE__, __LINE__);
   EXPECT_TRUE(a->IsAllocated());
   RTNAME(AllocatableDeallocate)
   (*a, /*hasStat=*/false, /*errMsg=*/nullptr, __FILE__, __LINE__);
@@ -71,8 +53,7 @@ TEST(AllocatableCUFTest, SimplePinnedAllocate) {
   EXPECT_FALSE(a->HasAddendum());
   RTNAME(AllocatableSetBounds)(*a, 0, 1, 10);
   RTNAME(AllocatableAllocate)
-  (*a, /*asyncId=*/-1, /*hasStat=*/false, /*errMsg=*/nullptr, __FILE__,
-      __LINE__);
+  (*a, /*hasStat=*/false, /*errMsg=*/nullptr, __FILE__, __LINE__);
   EXPECT_TRUE(a->IsAllocated());
   RTNAME(AllocatableDeallocate)
   (*a, /*hasStat=*/false, /*errMsg=*/nullptr, __FILE__, __LINE__);
diff --git a/flang/unittests/Runtime/CUDA/Memory.cpp b/flang/unittests/Runtime/CUDA/Memory.cpp
index 2f40915..7c8b7aa 100644
--- a/flang/unittests/Runtime/CUDA/Memory.cpp
+++ b/flang/unittests/Runtime/CUDA/Memory.cpp
@@ -51,8 +51,7 @@ TEST(MemoryCUFTest, CUFDataTransferDescDesc) {
   EXPECT_EQ((int)kDeviceAllocatorPos, dev->GetAllocIdx());
   RTNAME(AllocatableSetBounds)(*dev, 0, 1, 10);
   RTNAME(AllocatableAllocate)
-  (*dev, /*asyncId=*/-1, /*hasStat=*/false, /*errMsg=*/nullptr, __FILE__,
-      __LINE__);
+  (*dev, /*hasStat=*/false, /*errMsg=*/nullptr, __FILE__, __LINE__);
   EXPECT_TRUE(dev->IsAllocated());
 
   // Create temp array to transfer to device.
diff --git a/flang/unittests/Runtime/CharacterTest.cpp b/flang/unittests/Runtime/CharacterTest.cpp
index e54fd8a..d462c91 100644
--- a/flang/unittests/Runtime/CharacterTest.cpp
+++ b/flang/unittests/Runtime/CharacterTest.cpp
@@ -259,6 +259,9 @@ void RunExtremumTests(const char *which,
           t.expect[i], t.expect[i] + std::strlen(t.expect[i])};
       EXPECT_EQ(expect, got) << "inputs: '" << t.x[i] << "','" << t.y[i] << "'";
     }
+
+    x->Deallocate();
+    y->Deallocate();
   }
 }
 
diff --git a/flang/unittests/Runtime/LogicalFormatTest.cpp b/flang/unittests/Runtime/LogicalFormatTest.cpp
index c4fbfc8..26c9374 100644
--- a/flang/unittests/Runtime/LogicalFormatTest.cpp
+++ b/flang/unittests/Runtime/LogicalFormatTest.cpp
@@ -23,7 +23,7 @@ TEST(IOApiTests, LogicalFormatTest) {
   char buffer[bufferSize];
 
   // Create format for all types and values to be written
-  const char *format{"(L,L3,I3,L2,L2,I3,L2,A3,L2,L,F4.1,L2)"};
+  const char *format{"(L0,L3,I3,L2,L2,I3,L2,A3,L2,L,F4.1,L2)"};
   auto cookie{IONAME(BeginInternalFormattedOutput)(
       buffer, bufferSize, format, std::strlen(format))};
 
diff --git a/libc/CMakeLists.txt b/libc/CMakeLists.txt
index 00a07ea..6f1c180 100644
--- a/libc/CMakeLists.txt
+++ b/libc/CMakeLists.txt
@@ -64,8 +64,6 @@ if(LIBC_BUILD_GPU_LOADER OR ((NOT LLVM_RUNTIMES_BUILD) AND LLVM_LIBC_GPU_BUILD))
   return()
 endif()
 
-add_subdirectory(hdrgen)
-
 option(LIBC_CMAKE_VERBOSE_LOGGING
   "Log details warnings and notifications during CMake configuration." OFF)
 
diff --git a/libc/cmake/modules/CheckCompilerFeatures.cmake b/libc/cmake/modules/CheckCompilerFeatures.cmake
index 862c7ec..a5ea66a 100644
--- a/libc/cmake/modules/CheckCompilerFeatures.cmake
+++ b/libc/cmake/modules/CheckCompilerFeatures.cmake
@@ -13,6 +13,8 @@ set(
     "float16_conversion"
     "float128"
     "fixed_point"
+    "cfloat16"
+    "cfloat128"
 )
 
 # Making sure ALL_COMPILER_FEATURES is sorted.
@@ -110,6 +112,10 @@ foreach(feature IN LISTS ALL_COMPILER_FEATURES)
       set(LIBC_TYPES_HAS_FLOAT128 TRUE)
     elseif(${feature} STREQUAL "fixed_point")
       set(LIBC_COMPILER_HAS_FIXED_POINT TRUE)
+    elseif(${feature} STREQUAL "cfloat16")
+      set(LIBC_TYPES_HAS_CFLOAT16 TRUE)
+    elseif(${feature} STREQUAL "cfloat128")
+      set(LIBC_TYPES_HAS_CFLOAT128 TRUE)
     elseif(${feature} STREQUAL "builtin_ceil_floor_rint_trunc")
       set(LIBC_COMPILER_HAS_BUILTIN_CEIL_FLOOR_RINT_TRUNC TRUE)
     elseif(${feature} STREQUAL "builtin_fmax_fmin")
diff --git a/libc/cmake/modules/LLVMLibCHeaderRules.cmake b/libc/cmake/modules/LLVMLibCHeaderRules.cmake
index 31a88f0..288e4da 100644
--- a/libc/cmake/modules/LLVMLibCHeaderRules.cmake
+++ b/libc/cmake/modules/LLVMLibCHeaderRules.cmake
@@ -75,7 +75,7 @@ function(add_gen_header target_name)
   cmake_parse_arguments(
     "ADD_GEN_HDR"
     "PUBLIC" # No optional arguments
-    "YAML_FILE;DEF_FILE;GEN_HDR" # Single value arguments
+    "YAML_FILE;GEN_HDR" # Single value arguments
     "DEPENDS"     # Multi value arguments
     ${ARGN}
   )
@@ -84,9 +84,6 @@ function(add_gen_header target_name)
     add_library(${fq_target_name} INTERFACE)
     return()
   endif()
-  if(NOT ADD_GEN_HDR_DEF_FILE)
-    message(FATAL_ERROR "`add_gen_hdr` rule requires DEF_FILE to be specified.")
-  endif()
   if(NOT ADD_GEN_HDR_GEN_HDR)
     message(FATAL_ERROR "`add_gen_hdr` rule requires GEN_HDR to be specified.")
   endif()
@@ -97,8 +94,8 @@ function(add_gen_header target_name)
   set(absolute_path ${CMAKE_CURRENT_SOURCE_DIR}/${ADD_GEN_HDR_GEN_HDR})
   file(RELATIVE_PATH relative_path ${LIBC_INCLUDE_SOURCE_DIR} ${absolute_path})
   set(out_file ${LIBC_INCLUDE_DIR}/${relative_path})
+  set(dep_file "${out_file}.d")
   set(yaml_file ${CMAKE_SOURCE_DIR}/${ADD_GEN_HDR_YAML_FILE})
-  set(def_file ${CMAKE_CURRENT_SOURCE_DIR}/${ADD_GEN_HDR_DEF_FILE})
 
   set(fq_data_files "")
   if(ADD_GEN_HDR_DATA_FILES)
@@ -108,17 +105,20 @@ function(add_gen_header target_name)
   endif()
 
   set(entry_points "${TARGET_ENTRYPOINT_NAME_LIST}")
-  list(TRANSFORM entry_points PREPEND "--e=")
+  list(TRANSFORM entry_points PREPEND "--entry-point=")
 
   add_custom_command(
     OUTPUT ${out_file}
-    COMMAND ${Python3_EXECUTABLE} ${LIBC_SOURCE_DIR}/hdrgen/yaml_to_classes.py
-            ${yaml_file}
-            --h_def_file ${def_file}
+    WORKING_DIRECTORY ${CMAKE_BINARY_DIR}
+    COMMAND ${Python3_EXECUTABLE} "${LIBC_SOURCE_DIR}/utils/hdrgen/main.py"
+            --output ${out_file}
+            --depfile ${dep_file}
+            --write-if-changed
             ${entry_points}
-            --output_dir ${out_file}
-    DEPENDS ${yaml_file} ${def_file} ${fq_data_files}
-    COMMENT "Generating header ${ADD_GEN_HDR_GEN_HDR} from ${yaml_file} and ${def_file}"
+            ${yaml_file}
+    DEPENDS ${yaml_file} ${fq_data_files}
+    DEPFILE ${dep_file}
+    COMMENT "Generating header ${ADD_GEN_HDR_GEN_HDR} from ${yaml_file}"
   )
   if(LIBC_TARGET_OS_IS_GPU)
     file(MAKE_DIRECTORY ${LIBC_INCLUDE_DIR}/llvm-libc-decls)
@@ -126,7 +126,7 @@ function(add_gen_header target_name)
     set(decl_out_file ${LIBC_INCLUDE_DIR}/llvm-libc-decls/${relative_path})
     add_custom_command(
       OUTPUT ${decl_out_file}
-      COMMAND ${Python3_EXECUTABLE} ${LIBC_SOURCE_DIR}/hdrgen/yaml_to_classes.py
+      COMMAND ${Python3_EXECUTABLE} "${LIBC_SOURCE_DIR}/utils/hdrgen/yaml_to_classes.py"
               ${yaml_file}
               --export-decls
               ${entry_points}
diff --git a/libc/cmake/modules/compiler_features/check_cfloat128.cpp b/libc/cmake/modules/compiler_features/check_cfloat128.cpp
new file mode 100644
index 0000000..a798ccb
--- /dev/null
+++ b/libc/cmake/modules/compiler_features/check_cfloat128.cpp
@@ -0,0 +1,5 @@
+#include "src/__support/macros/properties/complex_types.h"
+
+#ifndef LIBC_TYPES_HAS_CFLOAT128
+#error unsupported
+#endif
diff --git a/libc/cmake/modules/compiler_features/check_cfloat16.cpp b/libc/cmake/modules/compiler_features/check_cfloat16.cpp
new file mode 100644
index 0000000..31416ff
--- /dev/null
+++ b/libc/cmake/modules/compiler_features/check_cfloat16.cpp
@@ -0,0 +1,5 @@
+#include "src/__support/macros/properties/complex_types.h"
+
+#ifndef LIBC_TYPES_HAS_CFLOAT16
+#error unsupported
+#endif
diff --git a/libc/config/baremetal/config.json b/libc/config/baremetal/config.json
index dc4b051..08c581d 100644
--- a/libc/config/baremetal/config.json
+++ b/libc/config/baremetal/config.json
@@ -25,5 +25,10 @@
     "LIBC_CONF_QSORT_IMPL": {
       "value": "LIBC_QSORT_HEAP_SORT"
     }
+  },
+  "math": {
+    "LIBC_CONF_MATH_OPTIMIZATIONS": {
+      "value": "(LIBC_MATH_SKIP_ACCURATE_PASS | LIBC_MATH_SMALL_TABLES)"
+    }
   }
 }
diff --git a/libc/config/gpu/config.json b/libc/config/gpu/amdgpu/config.json
index d99f48e..d99f48e 100644
--- a/libc/config/gpu/config.json
+++ b/libc/config/gpu/amdgpu/config.json
diff --git a/libc/config/gpu/amdgpu/entrypoints.txt b/libc/config/gpu/amdgpu/entrypoints.txt
new file mode 100644
index 0000000..7a19828
--- /dev/null
+++ b/libc/config/gpu/amdgpu/entrypoints.txt
@@ -0,0 +1,604 @@
+set(TARGET_LIBC_ENTRYPOINTS
+    # assert.h entrypoints
+    libc.src.assert.__assert_fail
+
+    # ctype.h entrypoints
+    libc.src.ctype.isalnum
+    libc.src.ctype.isalnum_l
+    libc.src.ctype.isalpha
+    libc.src.ctype.isalpha_l
+    libc.src.ctype.isascii
+    libc.src.ctype.isblank
+    libc.src.ctype.isblank_l
+    libc.src.ctype.iscntrl
+    libc.src.ctype.iscntrl_l
+    libc.src.ctype.isdigit
+    libc.src.ctype.isdigit_l
+    libc.src.ctype.isgraph
+    libc.src.ctype.isgraph_l
+    libc.src.ctype.islower
+    libc.src.ctype.islower_l
+    libc.src.ctype.isprint
+    libc.src.ctype.isprint_l
+    libc.src.ctype.ispunct
+    libc.src.ctype.ispunct_l
+    libc.src.ctype.isspace
+    libc.src.ctype.isspace_l
+    libc.src.ctype.isupper
+    libc.src.ctype.isupper_l
+    libc.src.ctype.isxdigit
+    libc.src.ctype.isxdigit_l
+    libc.src.ctype.toascii
+    libc.src.ctype.tolower
+    libc.src.ctype.tolower_l
+    libc.src.ctype.toupper
+    libc.src.ctype.toupper_l
+
+    # string.h entrypoints
+    libc.src.string.memccpy
+    libc.src.string.memchr
+    libc.src.string.memcmp
+    libc.src.string.memcpy
+    libc.src.string.memmem
+    libc.src.string.memmove
+    libc.src.string.mempcpy
+    libc.src.string.memrchr
+    libc.src.string.memset
+    libc.src.string.stpcpy
+    libc.src.string.stpncpy
+    libc.src.string.strcasestr
+    libc.src.string.strcat
+    libc.src.string.strchr
+    libc.src.string.strchrnul
+    libc.src.string.strcmp
+    libc.src.string.strcoll
+    libc.src.string.strcoll_l
+    libc.src.string.strcpy
+    libc.src.string.strcspn
+    libc.src.string.strdup
+    libc.src.string.strerror
+    libc.src.string.strlcat
+    libc.src.string.strlcpy
+    libc.src.string.strlen
+    libc.src.string.strncat
+    libc.src.string.strncmp
+    libc.src.string.strncpy
+    libc.src.string.strndup
+    libc.src.string.strnlen
+    libc.src.string.strpbrk
+    libc.src.string.strrchr
+    libc.src.string.strsep
+    libc.src.string.strspn
+    libc.src.string.strstr
+    libc.src.string.strtok
+    libc.src.string.strtok_r
+    libc.src.string.strxfrm
+    libc.src.string.strxfrm_l
+
+    # strings.h entrypoints
+    libc.src.strings.bcmp
+    libc.src.strings.bcopy
+    libc.src.strings.bzero
+    libc.src.strings.index
+    libc.src.strings.rindex
+    libc.src.strings.strcasecmp
+    libc.src.strings.strncasecmp
+
+    # stdbit.h entrypoints
+    libc.src.stdbit.stdc_bit_ceil_uc
+    libc.src.stdbit.stdc_bit_ceil_ui
+    libc.src.stdbit.stdc_bit_ceil_ul
+    libc.src.stdbit.stdc_bit_ceil_ull
+    libc.src.stdbit.stdc_bit_ceil_us
+    libc.src.stdbit.stdc_bit_floor_uc
+    libc.src.stdbit.stdc_bit_floor_ui
+    libc.src.stdbit.stdc_bit_floor_ul
+    libc.src.stdbit.stdc_bit_floor_ull
+    libc.src.stdbit.stdc_bit_floor_us
+    libc.src.stdbit.stdc_bit_width_uc
+    libc.src.stdbit.stdc_bit_width_ui
+    libc.src.stdbit.stdc_bit_width_ul
+    libc.src.stdbit.stdc_bit_width_ull
+    libc.src.stdbit.stdc_bit_width_us
+    libc.src.stdbit.stdc_count_ones_uc
+    libc.src.stdbit.stdc_count_ones_ui
+    libc.src.stdbit.stdc_count_ones_ul
+    libc.src.stdbit.stdc_count_ones_ull
+    libc.src.stdbit.stdc_count_ones_us
+    libc.src.stdbit.stdc_count_zeros_uc
+    libc.src.stdbit.stdc_count_zeros_ui
+    libc.src.stdbit.stdc_count_zeros_ul
+    libc.src.stdbit.stdc_count_zeros_ull
+    libc.src.stdbit.stdc_count_zeros_us
+    libc.src.stdbit.stdc_first_leading_one_uc
+    libc.src.stdbit.stdc_first_leading_one_ui
+    libc.src.stdbit.stdc_first_leading_one_ul
+    libc.src.stdbit.stdc_first_leading_one_ull
+    libc.src.stdbit.stdc_first_leading_one_us
+    libc.src.stdbit.stdc_first_leading_zero_uc
+    libc.src.stdbit.stdc_first_leading_zero_ui
+    libc.src.stdbit.stdc_first_leading_zero_ul
+    libc.src.stdbit.stdc_first_leading_zero_ull
+    libc.src.stdbit.stdc_first_leading_zero_us
+    libc.src.stdbit.stdc_first_trailing_one_uc
+    libc.src.stdbit.stdc_first_trailing_one_ui
+    libc.src.stdbit.stdc_first_trailing_one_ul
+    libc.src.stdbit.stdc_first_trailing_one_ull
+    libc.src.stdbit.stdc_first_trailing_one_us
+    libc.src.stdbit.stdc_first_trailing_zero_uc
+    libc.src.stdbit.stdc_first_trailing_zero_ui
+    libc.src.stdbit.stdc_first_trailing_zero_ul
+    libc.src.stdbit.stdc_first_trailing_zero_ull
+    libc.src.stdbit.stdc_first_trailing_zero_us
+    libc.src.stdbit.stdc_has_single_bit_uc
+    libc.src.stdbit.stdc_has_single_bit_ui
+    libc.src.stdbit.stdc_has_single_bit_ul
+    libc.src.stdbit.stdc_has_single_bit_ull
+    libc.src.stdbit.stdc_has_single_bit_us
+    libc.src.stdbit.stdc_leading_ones_uc
+    libc.src.stdbit.stdc_leading_ones_ui
+    libc.src.stdbit.stdc_leading_ones_ul
+    libc.src.stdbit.stdc_leading_ones_ull
+    libc.src.stdbit.stdc_leading_ones_us
+    libc.src.stdbit.stdc_leading_zeros_uc
+    libc.src.stdbit.stdc_leading_zeros_ui
+    libc.src.stdbit.stdc_leading_zeros_ul
+    libc.src.stdbit.stdc_leading_zeros_ull
+    libc.src.stdbit.stdc_leading_zeros_us
+    libc.src.stdbit.stdc_trailing_ones_uc
+    libc.src.stdbit.stdc_trailing_ones_ui
+    libc.src.stdbit.stdc_trailing_ones_ul
+    libc.src.stdbit.stdc_trailing_ones_ull
+    libc.src.stdbit.stdc_trailing_ones_us
+    libc.src.stdbit.stdc_trailing_zeros_uc
+    libc.src.stdbit.stdc_trailing_zeros_ui
+    libc.src.stdbit.stdc_trailing_zeros_ul
+    libc.src.stdbit.stdc_trailing_zeros_ull
+    libc.src.stdbit.stdc_trailing_zeros_us
+
+    # stdlib.h entrypoints
+    libc.src.stdlib._Exit
+    libc.src.stdlib.abort
+    libc.src.stdlib.abs
+    libc.src.stdlib.atexit
+    libc.src.stdlib.atof
+    libc.src.stdlib.atoi
+    libc.src.stdlib.atol
+    libc.src.stdlib.atoll
+    libc.src.stdlib.bsearch
+    libc.src.stdlib.div
+    libc.src.stdlib.exit
+    libc.src.stdlib.labs
+    libc.src.stdlib.ldiv
+    libc.src.stdlib.llabs
+    libc.src.stdlib.lldiv
+    libc.src.stdlib.qsort
+    libc.src.stdlib.qsort_r
+    libc.src.stdlib.rand
+    libc.src.stdlib.srand
+    libc.src.stdlib.strtod
+    libc.src.stdlib.strtod_l
+    libc.src.stdlib.strtof
+    libc.src.stdlib.strtof_l
+    libc.src.stdlib.strtol
+    libc.src.stdlib.strtol_l
+    libc.src.stdlib.strtold
+    libc.src.stdlib.strtold_l
+    libc.src.stdlib.strtoll
+    libc.src.stdlib.strtoll_l
+    libc.src.stdlib.strtoul
+    libc.src.stdlib.strtoul_l
+    libc.src.stdlib.strtoull
+    libc.src.stdlib.strtoull_l
+    libc.src.stdlib.at_quick_exit
+    libc.src.stdlib.quick_exit
+    libc.src.stdlib.getenv
+    libc.src.stdlib.system
+
+    # TODO: Implement these correctly
+    libc.src.stdlib.aligned_alloc
+    libc.src.stdlib.calloc
+    libc.src.stdlib.free
+    libc.src.stdlib.malloc
+    libc.src.stdlib.realloc
+
+    # errno.h entrypoints
+    libc.src.errno.errno
+
+    # stdio.h entrypoints
+    libc.src.stdio.clearerr
+    libc.src.stdio.fclose
+    libc.src.stdio.printf
+    libc.src.stdio.vprintf
+    libc.src.stdio.fprintf
+    libc.src.stdio.vfprintf
+    libc.src.stdio.snprintf
+    libc.src.stdio.sprintf
+    libc.src.stdio.vsnprintf
+    libc.src.stdio.vsprintf
+    libc.src.stdio.asprintf
+    libc.src.stdio.vasprintf
+    libc.src.stdio.scanf
+    libc.src.stdio.vscanf
+    libc.src.stdio.fscanf
+    libc.src.stdio.vfscanf
+    libc.src.stdio.sscanf
+    libc.src.stdio.vsscanf
+    libc.src.stdio.feof
+    libc.src.stdio.ferror
+    libc.src.stdio.fflush
+    libc.src.stdio.fgetc
+    libc.src.stdio.fgets
+    libc.src.stdio.fopen
+    libc.src.stdio.fputc
+    libc.src.stdio.fputs
+    libc.src.stdio.fread
+    libc.src.stdio.fseek
+    libc.src.stdio.ftell
+    libc.src.stdio.fwrite
+    libc.src.stdio.getc
+    libc.src.stdio.getchar
+    libc.src.stdio.putc
+    libc.src.stdio.putchar
+    libc.src.stdio.puts
+    libc.src.stdio.remove
+    libc.src.stdio.rename
+    libc.src.stdio.stderr
+    libc.src.stdio.stdin
+    libc.src.stdio.stdout
+    libc.src.stdio.ungetc
+
+    # inttypes.h entrypoints
+    libc.src.inttypes.imaxabs
+    libc.src.inttypes.imaxdiv
+    libc.src.inttypes.strtoimax
+    libc.src.inttypes.strtoumax
+
+    # time.h entrypoints
+    libc.src.time.clock
+    libc.src.time.clock_gettime
+    libc.src.time.timespec_get
+    libc.src.time.nanosleep
+
+    # wchar.h entrypoints
+    libc.src.wchar.wctob
+
+    # locale.h entrypoints
+    libc.src.locale.localeconv
+    libc.src.locale.duplocale
+    libc.src.locale.freelocale
+    libc.src.locale.localeconv
+    libc.src.locale.newlocale
+    libc.src.locale.setlocale
+    libc.src.locale.uselocale
+)
+
+set(TARGET_LIBM_ENTRYPOINTS
+    # math.h entrypoints
+    libc.src.math.acos
+    libc.src.math.acosf
+    libc.src.math.acosh
+    libc.src.math.acoshf
+    libc.src.math.asin
+    libc.src.math.asinf
+    libc.src.math.asinh
+    libc.src.math.asinhf
+    libc.src.math.atan
+    libc.src.math.atan2
+    libc.src.math.atan2f
+    libc.src.math.atan2l
+    libc.src.math.atanf
+    libc.src.math.atanh
+    libc.src.math.atanhf
+    libc.src.math.canonicalize
+    libc.src.math.canonicalizef
+    libc.src.math.canonicalizel
+    libc.src.math.cbrt
+    libc.src.math.cbrtf
+    libc.src.math.ceil
+    libc.src.math.ceilf
+    libc.src.math.ceill
+    libc.src.math.copysign
+    libc.src.math.copysignf
+    libc.src.math.copysignl
+    libc.src.math.cos
+    libc.src.math.cosf
+    libc.src.math.cosh
+    libc.src.math.coshf
+    libc.src.math.cospif
+    libc.src.math.ddivl
+    libc.src.math.dfmal
+    libc.src.math.dmull
+    libc.src.math.dsqrtl
+    libc.src.math.erf
+    libc.src.math.erff
+    libc.src.math.exp
+    libc.src.math.exp10
+    libc.src.math.exp10f
+    libc.src.math.exp2
+    libc.src.math.exp2f
+    libc.src.math.exp2m1f
+    libc.src.math.expf
+    libc.src.math.expm1
+    libc.src.math.expm1f
+    libc.src.math.fabs
+    libc.src.math.fabsf
+    libc.src.math.fabsl
+    libc.src.math.fadd
+    libc.src.math.faddl
+    libc.src.math.fdim
+    libc.src.math.fdimf
+    libc.src.math.fdiml
+    libc.src.math.fdiv
+    libc.src.math.fdivl
+    libc.src.math.ffma
+    libc.src.math.ffmal
+    libc.src.math.floor
+    libc.src.math.floorf
+    libc.src.math.floorl
+    libc.src.math.fma
+    libc.src.math.fmaf
+    libc.src.math.fmax
+    libc.src.math.fmaxf
+    libc.src.math.fmaximum
+    libc.src.math.fmaximumf
+    libc.src.math.fmaximuml
+    libc.src.math.fmaximum_mag
+    libc.src.math.fmaximum_magf
+    libc.src.math.fmaximum_magl
+    libc.src.math.fmaximum_mag_num
+    libc.src.math.fmaximum_mag_numf
+    libc.src.math.fmaximum_mag_numl
+    libc.src.math.fmaximum_num
+    libc.src.math.fmaximum_numf
+    libc.src.math.fmaximum_numl
+    libc.src.math.fmaxl
+    libc.src.math.fmin
+    libc.src.math.fminf
+    libc.src.math.fminimum
+    libc.src.math.fminimumf
+    libc.src.math.fminimuml
+    libc.src.math.fminimum_mag
+    libc.src.math.fminimum_magf
+    libc.src.math.fminimum_magl
+    libc.src.math.fminimum_mag_num
+    libc.src.math.fminimum_mag_numf
+    libc.src.math.fminimum_mag_numl
+    libc.src.math.fminimum_num
+    libc.src.math.fminimum_numf
+    libc.src.math.fminimum_numl
+    libc.src.math.fminl
+    libc.src.math.fmod
+    libc.src.math.fmodf
+    libc.src.math.fmodl
+    libc.src.math.fmul
+    libc.src.math.fmull
+    libc.src.math.frexp
+    libc.src.math.frexpf
+    libc.src.math.frexpl
+    libc.src.math.fromfp
+    libc.src.math.fromfpf
+    libc.src.math.fromfpl
+    libc.src.math.fromfpx
+    libc.src.math.fromfpxf
+    libc.src.math.fromfpxl
+    libc.src.math.fsqrt
+    libc.src.math.fsqrtl
+    libc.src.math.fsub
+    libc.src.math.fsubl
+    libc.src.math.getpayload
+    libc.src.math.getpayloadf
+    libc.src.math.getpayloadl
+    libc.src.math.hypot
+    libc.src.math.hypotf
+    libc.src.math.ilogb
+    libc.src.math.ilogbf
+    libc.src.math.ilogbl
+    libc.src.math.isnan
+    libc.src.math.isnanf
+    libc.src.math.isnanl
+    libc.src.math.ldexp
+    libc.src.math.ldexpf
+    libc.src.math.ldexpl
+    libc.src.math.lgamma
+    libc.src.math.lgamma_r
+    libc.src.math.llogb
+    libc.src.math.llogbf
+    libc.src.math.llogbl
+    libc.src.math.llrint
+    libc.src.math.llrintf
+    libc.src.math.llrintl
+    libc.src.math.llround
+    libc.src.math.llroundf
+    libc.src.math.llroundl
+    libc.src.math.log
+    libc.src.math.log10
+    libc.src.math.log10f
+    libc.src.math.log1p
+    libc.src.math.log1pf
+    libc.src.math.log2
+    libc.src.math.log2f
+    libc.src.math.logb
+    libc.src.math.logbf
+    libc.src.math.logbl
+    libc.src.math.logf
+    libc.src.math.lrint
+    libc.src.math.lrintf
+    libc.src.math.lrintl
+    libc.src.math.lround
+    libc.src.math.lroundf
+    libc.src.math.lroundl
+    libc.src.math.modf
+    libc.src.math.modff
+    libc.src.math.modfl
+    libc.src.math.nan
+    libc.src.math.nanf
+    libc.src.math.nanl
+    libc.src.math.nearbyint
+    libc.src.math.nearbyintf
+    libc.src.math.nearbyintl
+    libc.src.math.nextafter
+    libc.src.math.nextafterf
+    libc.src.math.nextafterl
+    libc.src.math.nextdown
+    libc.src.math.nextdownf
+    libc.src.math.nextdownl
+    libc.src.math.nexttoward
+    libc.src.math.nexttowardf
+    libc.src.math.nexttowardl
+    libc.src.math.nextup
+    libc.src.math.nextupf
+    libc.src.math.nextupl
+    libc.src.math.pow
+    libc.src.math.powf
+    libc.src.math.powi
+    libc.src.math.powif
+    libc.src.math.remainder
+    libc.src.math.remainderf
+    libc.src.math.remainderl
+    libc.src.math.remquo
+    libc.src.math.remquof
+    libc.src.math.remquol
+    libc.src.math.rint
+    libc.src.math.rintf
+    libc.src.math.rintl
+    libc.src.math.roundeven
+    libc.src.math.roundevenf
+    libc.src.math.roundevenl
+    libc.src.math.round
+    libc.src.math.roundf
+    libc.src.math.roundl
+    libc.src.math.scalbln
+    libc.src.math.scalblnf
+    libc.src.math.scalblnl
+    libc.src.math.scalbn
+    libc.src.math.scalbnf
+    libc.src.math.scalbnl
+    libc.src.math.setpayload
+    libc.src.math.setpayloadf
+    libc.src.math.setpayloadl
+    libc.src.math.setpayloadsig
+    libc.src.math.setpayloadsigf
+    libc.src.math.setpayloadsigl
+    libc.src.math.sin
+    libc.src.math.sincos
+    libc.src.math.sincosf
+    libc.src.math.sinf
+    libc.src.math.sinh
+    libc.src.math.sinhf
+    libc.src.math.sinpif
+    libc.src.math.sqrt
+    libc.src.math.sqrtf
+    libc.src.math.sqrtl
+    libc.src.math.tan
+    libc.src.math.tanf
+    libc.src.math.tanh
+    libc.src.math.tanhf
+    libc.src.math.tgamma
+    libc.src.math.tgammaf
+    libc.src.math.totalorder
+    libc.src.math.totalorderf
+    libc.src.math.totalorderl
+    libc.src.math.totalordermag
+    libc.src.math.totalordermagf
+    libc.src.math.totalordermagl
+    libc.src.math.trunc
+    libc.src.math.truncf
+    libc.src.math.truncl
+    libc.src.math.ufromfp
+    libc.src.math.ufromfpf
+    libc.src.math.ufromfpl
+    libc.src.math.ufromfpx
+    libc.src.math.ufromfpxf
+    libc.src.math.ufromfpxl
+)
+
+if(LIBC_TYPES_HAS_FLOAT16)
+  list(APPEND TARGET_LIBM_ENTRYPOINTS
+    # math.h C23 _Float16 entrypoints
+    libc.src.math.canonicalizef16
+    libc.src.math.ceilf16
+    libc.src.math.copysignf16
+    libc.src.math.coshf16
+    libc.src.math.exp10f16
+    libc.src.math.exp10m1f16
+    libc.src.math.exp2f16
+    libc.src.math.expf16
+    libc.src.math.f16add
+    libc.src.math.f16addf
+    libc.src.math.f16addl
+    libc.src.math.f16div
+    libc.src.math.f16divf
+    libc.src.math.f16divl
+    libc.src.math.f16fma
+    libc.src.math.f16fmaf
+    libc.src.math.f16fmal
+    libc.src.math.f16mul
+    libc.src.math.f16mulf
+    libc.src.math.f16mull
+    libc.src.math.f16sqrt
+    libc.src.math.f16sqrtf
+    libc.src.math.f16sqrtl
+    libc.src.math.f16sub
+    libc.src.math.f16subf
+    libc.src.math.f16subl
+    libc.src.math.fabsf16
+    libc.src.math.fdimf16
+    libc.src.math.floorf16
+    libc.src.math.fmaxf16
+    libc.src.math.fmaximum_mag_numf16
+    libc.src.math.fmaximum_magf16
+    libc.src.math.fmaximum_numf16
+    libc.src.math.fmaximumf16
+    libc.src.math.fminf16
+    libc.src.math.fminimum_mag_numf16
+    libc.src.math.fminimum_magf16
+    libc.src.math.fminimum_numf16
+    libc.src.math.fminimumf16
+    libc.src.math.fmodf16
+    libc.src.math.frexpf16
+    libc.src.math.fromfpf16
+    libc.src.math.fromfpxf16
+    libc.src.math.getpayloadf16
+    libc.src.math.ilogbf16
+    libc.src.math.ldexpf16
+    libc.src.math.llogbf16
+    libc.src.math.llrintf16
+    libc.src.math.llroundf16
+    libc.src.math.log10f16
+    libc.src.math.log2f16
+    libc.src.math.logbf16
+    libc.src.math.logf16
+    libc.src.math.lrintf16
+    libc.src.math.lroundf16
+    libc.src.math.modff16
+    libc.src.math.nanf16
+    libc.src.math.nearbyintf16
+    libc.src.math.nextafterf16
+    libc.src.math.nextdownf16
+    libc.src.math.nexttowardf16
+    libc.src.math.nextupf16
+    libc.src.math.remainderf16
+    libc.src.math.remquof16
+    libc.src.math.rintf16
+    libc.src.math.roundevenf16
+    libc.src.math.roundf16
+    libc.src.math.scalblnf16
+    libc.src.math.scalbnf16
+    libc.src.math.setpayloadf16
+    libc.src.math.setpayloadsigf16
+    libc.src.math.sinhf16
+    libc.src.math.sqrtf16
+    libc.src.math.tanhf16
+    libc.src.math.totalorderf16
+    libc.src.math.totalordermagf16
+    libc.src.math.truncf16
+    libc.src.math.ufromfpf16
+    libc.src.math.ufromfpxf16
+  )
+endif()
+
+set(TARGET_LLVMLIBC_ENTRYPOINTS
+  ${TARGET_LIBC_ENTRYPOINTS}
+  ${TARGET_LIBM_ENTRYPOINTS}
+)
diff --git a/libc/config/gpu/headers.txt b/libc/config/gpu/amdgpu/headers.txt
index fa8ad7c..fa8ad7c 100644
--- a/libc/config/gpu/headers.txt
+++ b/libc/config/gpu/amdgpu/headers.txt
diff --git a/libc/config/gpu/nvptx/config.json b/libc/config/gpu/nvptx/config.json
new file mode 100644
index 0000000..d99f48e
--- /dev/null
+++ b/libc/config/gpu/nvptx/config.json
@@ -0,0 +1,37 @@
+{
+  "errno": {
+    "LIBC_CONF_ERRNO_MODE": {
+      "value": "LIBC_ERRNO_MODE_SHARED"
+    }
+  },
+  "printf": {
+    "LIBC_CONF_PRINTF_DISABLE_FLOAT": {
+      "value": true
+    },
+    "LIBC_CONF_PRINTF_DISABLE_INDEX_MODE": {
+      "value": true
+    },
+    "LIBC_CONF_PRINTF_DISABLE_WRITE_INT": {
+      "value": true
+    },
+    "LIBC_CONF_PRINTF_FLOAT_TO_STR_USE_MEGA_LONG_DOUBLE_TABLE": {
+      "value": false
+    },
+    "LIBC_CONF_PRINTF_DISABLE_STRERROR": {
+      "value": true
+    }
+  },
+  "scanf": {
+    "LIBC_CONF_SCANF_DISABLE_FLOAT": {
+      "value": true
+    },
+    "LIBC_CONF_SCANF_DISABLE_INDEX_MODE": {
+      "value": true
+    }
+  },
+  "math": {
+    "LIBC_CONF_MATH_OPTIMIZATIONS": {
+      "value": "(LIBC_MATH_SKIP_ACCURATE_PASS | LIBC_MATH_SMALL_TABLES | LIBC_MATH_NO_ERRNO | LIBC_MATH_NO_EXCEPT)"
+    }
+  }
+}
diff --git a/libc/config/gpu/entrypoints.txt b/libc/config/gpu/nvptx/entrypoints.txt
index b008e0e..059dc9b 100644
--- a/libc/config/gpu/entrypoints.txt
+++ b/libc/config/gpu/nvptx/entrypoints.txt
@@ -376,7 +376,7 @@ set(TARGET_LIBM_ENTRYPOINTS
     libc.src.math.frexp
     libc.src.math.frexpf
     libc.src.math.frexpl
-    # FIXME: Broken on NVPTX.
+    # FIXME: Broken.
     # libc.src.math.fromfp
     # libc.src.math.fromfpf
     # libc.src.math.fromfpl
@@ -506,7 +506,7 @@ set(TARGET_LIBM_ENTRYPOINTS
     libc.src.math.trunc
     libc.src.math.truncf
     libc.src.math.truncl
-    # FIXME: Broken on NVPTX.
+    # FIXME: Broken.
     # libc.src.math.ufromfp
     # libc.src.math.ufromfpf
     # libc.src.math.ufromfpl
diff --git a/libc/config/gpu/nvptx/headers.txt b/libc/config/gpu/nvptx/headers.txt
new file mode 100644
index 0000000..fa8ad7c
--- /dev/null
+++ b/libc/config/gpu/nvptx/headers.txt
@@ -0,0 +1,21 @@
+set(TARGET_PUBLIC_HEADERS
+    libc.include.assert
+    libc.include.ctype
+    libc.include.string
+    libc.include.strings
+    libc.include.signal
+    libc.include.float
+    libc.include.stdint
+    libc.include.inttypes
+    libc.include.limits
+    libc.include.math
+    libc.include.fenv
+    libc.include.time
+    libc.include.errno
+    libc.include.stdlib
+    libc.include.stdio
+    libc.include.wchar
+    libc.include.uchar
+    libc.include.features
+    libc.include.locale
+)
diff --git a/libc/config/linux/aarch64/entrypoints.txt b/libc/config/linux/aarch64/entrypoints.txt
index b949e4b..00f0c6a 100644
--- a/libc/config/linux/aarch64/entrypoints.txt
+++ b/libc/config/linux/aarch64/entrypoints.txt
@@ -619,14 +619,18 @@ set(TARGET_LIBM_ENTRYPOINTS
     libc.src.math.ufromfpxl
 )
 
-if(LIBC_TYPES_HAS_FLOAT16)
+if(LIBC_TYPES_HAS_CFLOAT16)
   list(APPEND TARGET_LIBM_ENTRYPOINTS
     # complex.h C23 _Complex _Float16 entrypoints
-    # libc.src.complex.crealf16
-    # libc.src.complex.cimagf16
-    # libc.src.complex.conjf16
-    # libc.src.complex.cprojf16
-    
+    libc.src.complex.crealf16
+    libc.src.complex.cimagf16
+    libc.src.complex.conjf16
+    libc.src.complex.cprojf16
+  )
+endif()
+
+if(LIBC_TYPES_HAS_FLOAT16)
+  list(APPEND TARGET_LIBM_ENTRYPOINTS
     # math.h C23 _Float16 entrypoints
     libc.src.math.canonicalizef16
     libc.src.math.ceilf16
@@ -726,14 +730,18 @@ if(LIBC_TYPES_HAS_FLOAT16)
   # endif()
 endif()
 
-if(LIBC_TYPES_HAS_FLOAT128)
+if(LIBC_TYPES_HAS_CFLOAT128)
   list(APPEND TARGET_LIBM_ENTRYPOINTS
     # complex.h C23 _Complex _Float128 entrypoints
     libc.src.complex.crealf128
     libc.src.complex.cimagf128
     libc.src.complex.conjf128
     libc.src.complex.cprojf128
+  )
+endif()
 
+if(LIBC_TYPES_HAS_FLOAT128)
+  list(APPEND TARGET_LIBM_ENTRYPOINTS
     # math.h C23 _Float128 entrypoints
     libc.src.math.canonicalizef128
     libc.src.math.ceilf128
diff --git a/libc/config/linux/riscv/entrypoints.txt b/libc/config/linux/riscv/entrypoints.txt
index 19980f7..49a8d61 100644
--- a/libc/config/linux/riscv/entrypoints.txt
+++ b/libc/config/linux/riscv/entrypoints.txt
@@ -620,14 +620,18 @@ set(TARGET_LIBM_ENTRYPOINTS
     libc.src.math.ufromfpxl
 )
 
-if(LIBC_TYPES_HAS_FLOAT128)
+if(LIBC_TYPES_HAS_CFLOAT128)
   list(APPEND TARGET_LIBM_ENTRYPOINTS
     # complex.h C23 _Complex _Float128 entrypoints
     libc.src.complex.crealf128
     libc.src.complex.cimagf128
     libc.src.complex.conjf128
     libc.src.complex.cprojf128
-    
+  )
+endif()
+
+if(LIBC_TYPES_HAS_FLOAT128)    
+  list(APPEND TARGET_LIBM_ENTRYPOINTS
     # math.h C23 _Float128 entrypoints
     libc.src.math.canonicalizef128
     libc.src.math.ceilf128
diff --git a/libc/config/linux/x86_64/entrypoints.txt b/libc/config/linux/x86_64/entrypoints.txt
index 08d8559..7e54960 100644
--- a/libc/config/linux/x86_64/entrypoints.txt
+++ b/libc/config/linux/x86_64/entrypoints.txt
@@ -624,14 +624,18 @@ set(TARGET_LIBM_ENTRYPOINTS
     libc.src.math.ufromfpxl
 )
 
-if(LIBC_TYPES_HAS_FLOAT16)
+if(LIBC_TYPES_HAS_CFLOAT16)
   list(APPEND TARGET_LIBM_ENTRYPOINTS
     # complex.h C23 _Complex _Float16 entrypoints
     libc.src.complex.crealf16
     libc.src.complex.cimagf16
     libc.src.complex.conjf16
     libc.src.complex.cprojf16
+  )
+endif()
 
+if(LIBC_TYPES_HAS_FLOAT16)
+  list(APPEND TARGET_LIBM_ENTRYPOINTS
     # math.h C23 _Float16 entrypoints
     libc.src.math.canonicalizef16
     libc.src.math.ceilf16
@@ -736,14 +740,18 @@ if(LIBC_TYPES_HAS_FLOAT16)
   endif()
 endif()
 
-if(LIBC_TYPES_HAS_FLOAT128)
+if(LIBC_TYPES_HAS_CFLOAT128)
   list(APPEND TARGET_LIBM_ENTRYPOINTS
     # complex.h C23 _Complex _Float128 entrypoints
-    # libc.src.complex.crealf128
-    # libc.src.complex.cimagf128
-    # libc.src.complex.conjf128
-    # libc.src.complex.cprojf128
-    
+    libc.src.complex.crealf128
+    libc.src.complex.cimagf128
+    libc.src.complex.conjf128
+    libc.src.complex.cprojf128
+  )
+endif()
+
+if(LIBC_TYPES_HAS_FLOAT128)
+  list(APPEND TARGET_LIBM_ENTRYPOINTS    
     # math.h C23 _Float128 entrypoints
     libc.src.math.canonicalizef128
     libc.src.math.ceilf128
diff --git a/libc/docs/CMakeLists.txt b/libc/docs/CMakeLists.txt
index be09423..5a3f827 100644
--- a/libc/docs/CMakeLists.txt
+++ b/libc/docs/CMakeLists.txt
@@ -1,9 +1,72 @@
-
 if (LLVM_ENABLE_SPHINX)
 include(AddSphinxTarget)
 if (SPHINX_FOUND)
   if (${SPHINX_OUTPUT_HTML})
-    add_sphinx_target(html libc)
+    # Similar to clang, we copy our static .rst files from libc/docs/ to the
+    # $build_dir/libc/docs/. That way, we can have a mix of both static
+    # (committed) .rst files, and dynamically generated .rst files. We don't
+    # want the dynamically generated .rst files to pollute the source tree.
+    add_custom_target(copy-libc-rst-docs
+      COMMAND "${CMAKE_COMMAND}" -E copy_directory
+              "${CMAKE_CURRENT_SOURCE_DIR}" "${CMAKE_CURRENT_BINARY_DIR}")
+
+    # For headers that are nested in directories, we need to
+    # `mkdir $build_dir/libc/docs/headers/$dir` since the above copy_directory
+    # command does not create such copies. Otherwise, the invocation of docgen
+    # below will fail since the output file would be placed in a directory that
+    # does not exist, leading to a `No such file or directory` error from the
+    # shell.
+    file(MAKE_DIRECTORY
+      "${CMAKE_CURRENT_BINARY_DIR}/headers/arpa/"
+      "${CMAKE_CURRENT_BINARY_DIR}/headers/sys/"
+    )
+
+    # Change sphinx to build from $build_dir/libc/docs/ rather than
+    # llvm-project/libc/docs/.
+    add_sphinx_target(html libc SOURCE_DIR "${CMAKE_CURRENT_BINARY_DIR}")
+    # Depend on the copy target.
+    add_dependencies(docs-libc-html copy-libc-rst-docs)
+
+    # Maintain a list of headers for which we dynamically generate html docs
+    # for via docgen. For more complex docs (such as per arch support, a la
+    # math.h), those should be omitted and exist statically in
+    # libc/docs/headers/.
+    list(APPEND docgen_list
+      arpa/inet
+      assert
+      ctype
+      errno
+      fenv
+      float
+      inttypes
+      locale
+      setjmp
+      signal
+      stdbit
+      stdio
+      stdlib
+      string
+      strings
+      sys/mman
+      threads
+      uchar
+      wchar
+      wctype
+    )
+
+    foreach(stem IN LISTS docgen_list)
+      # It is an error in cmake to have a target name that contains a "/", but
+      # docgen relies on the "/" to find headers nested under directories.
+      # Replace with underscore.
+      string(REPLACE "/" "_" stem_rst ${stem})
+
+      # docgen invocation.
+      add_custom_target(${stem_rst}
+        COMMAND ${CMAKE_CURRENT_SOURCE_DIR}/../utils/docgen/docgen.py ${stem}.h >
+                ${CMAKE_CURRENT_BINARY_DIR}/headers/${stem}.rst)
+      # depend on the docgen invocation.
+      add_dependencies(docs-libc-html ${stem_rst})
+    endforeach()
   endif()
 endif()
 endif()
diff --git a/libc/docs/dev/header_generation.rst b/libc/docs/dev/header_generation.rst
index 2c586cc..a946106 100644
--- a/libc/docs/dev/header_generation.rst
+++ b/libc/docs/dev/header_generation.rst
@@ -44,15 +44,15 @@ To add through the command line:
 
    .. code-block:: none
 
-     python3 libc/hdrgen/yaml_to_classes.py
-     libc/hdrgen/yaml/[yaml_file.yaml] --add_function "<return_type>" <function_name> "<function_arg1, function_arg2>" <standard> <guard> <attribute>
+     python3 libc/utils/hdrgen/yaml_to_classes.py
+     libc/include/[yaml_file.yaml] --add_function "<return_type>" <function_name> "<function_arg1, function_arg2>" <standard> <guard> <attribute>
 
    Example:
 
    .. code-block:: none
 
-      python3 libc/hdrgen/yaml_to_classes.py
-      libc/hdrgen/yaml/ctype.yaml --add_function "char" example_function
+      python3 libc/utils/hdrgen/yaml_to_classes.py
+      libc/include/ctype.yaml --add_function "char" example_function
       "int, void, const void" stdc example_float example_attribute
 
    Keep in mind only the return_type and arguments have quotes around them. If
@@ -62,7 +62,8 @@ To add through the command line:
    generated header file with the new addition in the hdrgen directory to
    examine.
 
-If you want to sort the functions alphabetically you can check out libc/hdrgen/yaml_functions_sorted.py.
+If you want to sort the functions alphabetically you can check out
+libc/utils/hdrgen/yaml_functions_sorted.py.
 
 
 Testing
@@ -75,10 +76,10 @@ ensures the process of YAML to classes to generate headers works properly. If
 there are any new additions on formatting headers, make sure the test is
 updated with the specific addition.
 
-Integration Test can be found in: ``libc/hdrgen/tests/test_integration.py``
+Integration Test can be found in: ``libc/utils/hdrgen/tests/test_integration.py``
 
 File to modify if adding something to formatting:
-``libc/hdrgen/tests/expected_output/test_header.h``
+``libc/utils/hdrgen/tests/expected_output/test_header.h``
 
 
 Common Errors
@@ -89,7 +90,7 @@ Common Errors
 
    .. code-block:: none
 
-      "/llvm-project/libc/hdrgen/yaml_to_classes.py", line 67, in yaml_to_classes function_data["return_type"]
+      "/llvm-project/libc/utils/hdrgen/yaml_to_classes.py", line 67, in yaml_to_classes function_data["return_type"]
 
    If you receive this error or any error pertaining to
    ``function_data[function_specific_component]`` while building the headers
@@ -117,7 +118,7 @@ Common Errors
    missing. Ensure the correct style and required files are present:
 
    | ``[header_name]``
-   | ``[../libc/hdrgen/yaml/[yaml_file.yaml]``
+   | ``[../libc/include/[yaml_file.yaml]``
    | ``[header_name.h.def]``
    | ``[header_name.h]``
    | ``DEPENDS``
@@ -147,13 +148,13 @@ Common Errors
 
    .. code-block:: none
 
-     File "/llvm-project/libc/hdrgen/header.py", line 60, in __str__ for
+     File "/llvm-project/libc/utils/hdrgen/header.py", line 60, in __str__ for
      function in self.functions: AttributeError: 'HeaderFile' object has no
      attribute 'functions'
 
    When running ``ninja libc`` in the build directory to generate headers you
    may receive the error above. Essentially this means that in
-   ``libc/hdrgen/header.py`` there is a missing attribute named functions.
+   ``libc/utils/hdrgen/header.py`` there is a missing attribute named functions.
    Make sure all function components are defined within this file and there are
    no missing functions to add these components.
 
@@ -184,12 +185,12 @@ Common Errors
    Sometimes the integration test will fail but that
    still means the process is working unless the comparison between the output
    and expected_output is not showing. If that is the case make sure in
-   ``libc/hdrgen/tests/test_integration.py`` there are no missing arguments
+   ``libc/utils/hdrgen/tests/test_integration.py`` there are no missing arguments
    that run through the script.
 
    If the integration tests are failing due to mismatching of lines or small
    errors in spacing that is nothing to worry about. If this is happening while
    you are making a new change to the formatting of the headers, then
    ensure the expected output file
-   ``libc/hdrgen/tests/expected_output/test_header.h`` has the changes you
+   ``libc/utils/hdrgen/tests/expected_output/test_header.h`` has the changes you
    are applying.
diff --git a/libc/docs/dev/source_tree_layout.rst b/libc/docs/dev/source_tree_layout.rst
index bd9d6ca..62c0434 100644
--- a/libc/docs/dev/source_tree_layout.rst
+++ b/libc/docs/dev/source_tree_layout.rst
@@ -15,7 +15,6 @@ directories::
         - examples
         - fuzzing
         - hdr
-        - hdrgen
         - include
         - lib
         - src
@@ -88,15 +87,6 @@ The ``lib`` directory
 This directory contains a ``CMakeLists.txt`` file listing the targets for the
 public libraries ``libc.a``, ``libm.a`` etc.
 
-The ``hdrgen`` directory
----------------------------
-
-This directory contains the sources and specifications for the types, macros
-and entrypoint functions. These definitions are organized in the ``yaml``
-subdirectory and match the organization of the ``*.h.def`` files. This folder
-also contains the python sources for headergen, which is what generates the
-headers.
-
 The ``src`` directory
 ---------------------
 
diff --git a/libc/docs/full_cross_build.rst b/libc/docs/full_cross_build.rst
index 5f57169..cd1ec89 100644
--- a/libc/docs/full_cross_build.rst
+++ b/libc/docs/full_cross_build.rst
@@ -8,7 +8,7 @@ Full Cross Build
    :depth: 1
    :local:
 
-.. note:: 
+.. note::
    Fullbuild requires running headergen, which is a python program that depends on
    pyyaml. The minimum versions are listed on the :ref:`header_generation`
    page, as well as additional information.
@@ -95,8 +95,8 @@ configure step.
 Bootstrap cross build
 =====================
 
-In this recipe, the clang compiler and the ``libc-hdrgen`` binary, both are
-built automatically before building the libc for the target.
+In this recipe, the clang compiler is built automatically before building
+the libc for the target.
 
 CMake configure step
 --------------------
@@ -151,8 +151,8 @@ built using any of the three recipes described above.
 Building for the GPU
 ====================
 
-To build for a GPU architecture, it should only be necessary to specify the 
-target triple as one of the supported GPU targets. Currently, this is either 
-``nvptx64-nvidia-cuda`` for NVIDIA GPUs or ``amdgcn-amd-amdhsa`` for AMD GPUs. 
-More detailed information is provided in the :ref:`GPU 
+To build for a GPU architecture, it should only be necessary to specify the
+target triple as one of the supported GPU targets. Currently, this is either
+``nvptx64-nvidia-cuda`` for NVIDIA GPUs or ``amdgcn-amd-amdhsa`` for AMD GPUs.
+More detailed information is provided in the :ref:`GPU
 documentation<libc_gpu_building>`.
diff --git a/libc/docs/headers/arpa/inet.rst b/libc/docs/headers/arpa/inet.rst
deleted file mode 100644
index c82ca54..0000000
--- a/libc/docs/headers/arpa/inet.rst
+++ /dev/null
@@ -1,50 +0,0 @@
-.. include:: ../../check.rst
-
-===========
-arpa/inet.h
-===========
-
-Functions
-=========
-
-.. list-table::
-  :widths: auto
-  :align: center
-  :header-rows: 1
-
-  * - Function
-    - Implemented
-    - C23 Standard Section
-    - POSIX.1-2024 Standard Section
-  * - htonl
-    - |check|
-    -
-    - 
-  * - htons
-    - |check|
-    -
-    - 
-  * - inet_addr
-    -
-    -
-    - 
-  * - inet_ntoa
-    -
-    -
-    - 
-  * - inet_ntop
-    -
-    -
-    - 
-  * - inet_pton
-    -
-    -
-    - 
-  * - ntohl
-    - |check|
-    -
-    - 
-  * - ntohs
-    - |check|
-    -
-    - 
diff --git a/libc/docs/headers/assert.rst b/libc/docs/headers/assert.rst
deleted file mode 100644
index 6821707..0000000
--- a/libc/docs/headers/assert.rst
+++ /dev/null
@@ -1,27 +0,0 @@
-.. include:: ../check.rst
-
-========
-assert.h
-========
-
-Macros
-======
-
-.. list-table::
-  :widths: auto
-  :align: center
-  :header-rows: 1
-
-  * - Macro
-    - Implemented
-    - C23 Standard Section
-    - POSIX Docs
-  * - __STDC_VERSION_ASSERT_H__
-    - |check|
-    - 7.2.1
-    -
-  * - assert
-    -
-    - 7.2.1
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/basedefs/assert.h.html>`__
-
diff --git a/libc/docs/headers/ctype.rst b/libc/docs/headers/ctype.rst
deleted file mode 100644
index 9b5b157..0000000
--- a/libc/docs/headers/ctype.rst
+++ /dev/null
@@ -1,130 +0,0 @@
-.. include:: ../check.rst
-
-=======
-ctype.h
-=======
-
-Functions
-=========
-
-.. list-table::
-  :widths: auto
-  :align: center
-  :header-rows: 1
-
-  * - Function
-    - Implemented
-    - C23 Standard Section
-    - POSIX Docs
-  * - isalnum
-    - |check|
-    - 7.4.1.1
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/functions/isalnum.html>`__
-  * - isalnum_l
-    - |check|
-    -
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/functions/isalnum_l.html>`__
-  * - isalpha
-    - |check|
-    - 7.4.1.2
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/functions/isalpha.html>`__
-  * - isalpha_l
-    - |check|
-    -
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/functions/isalpha_l.html>`__
-  * - isblank
-    - |check|
-    - 7.4.1.3
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/functions/isblank.html>`__
-  * - isblank_l
-    - |check|
-    -
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/functions/isblank_l.html>`__
-  * - iscntrl
-    - |check|
-    - 7.4.1.4
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/functions/iscntrl.html>`__
-  * - iscntrl_l
-    - |check|
-    -
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/functions/iscntrl_l.html>`__
-  * - isdigit
-    - |check|
-    - 7.4.1.5
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/functions/isdigit.html>`__
-  * - isdigit_l
-    - |check|
-    -
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/functions/isdigit_l.html>`__
-  * - isgraph
-    - |check|
-    - 7.4.1.6
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/functions/isgraph.html>`__
-  * - isgraph_l
-    - |check|
-    -
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/functions/isgraph_l.html>`__
-  * - islower
-    - |check|
-    - 7.4.1.7
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/functions/islower.html>`__
-  * - islower_l
-    - |check|
-    -
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/functions/islower_l.html>`__
-  * - isprint
-    - |check|
-    - 7.4.1.8
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/functions/isprint.html>`__
-  * - isprint_l
-    - |check|
-    -
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/functions/isprint_l.html>`__
-  * - ispunct
-    - |check|
-    - 7.4.1.9
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/functions/ispunct.html>`__
-  * - ispunct_l
-    - |check|
-    -
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/functions/ispunct_l.html>`__
-  * - isspace
-    - |check|
-    - 7.4.1.10
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/functions/isspace.html>`__
-  * - isspace_l
-    - |check|
-    -
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/functions/isspace_l.html>`__
-  * - isupper
-    - |check|
-    - 7.4.1.11
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/functions/isupper.html>`__
-  * - isupper_l
-    - |check|
-    -
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/functions/isupper_l.html>`__
-  * - isxdigit
-    - |check|
-    - 7.4.1.12
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/functions/isxdigit.html>`__
-  * - isxdigit_l
-    - |check|
-    -
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/functions/isxdigit_l.html>`__
-  * - tolower
-    - |check|
-    - 7.4.2.1
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/functions/tolower.html>`__
-  * - tolower_l
-    - |check|
-    -
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/functions/tolower_l.html>`__
-  * - toupper
-    - |check|
-    - 7.4.2.2
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/functions/toupper.html>`__
-  * - toupper_l
-    - |check|
-    -
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/functions/toupper_l.html>`__
diff --git a/libc/docs/headers/errno.rst b/libc/docs/headers/errno.rst
deleted file mode 100644
index b2b2e62..0000000
--- a/libc/docs/headers/errno.rst
+++ /dev/null
@@ -1,35 +0,0 @@
-.. include:: ../check.rst
-
-=======
-errno.h
-=======
-
-Macros
-======
-
-.. list-table::
-  :widths: auto
-  :align: center
-  :header-rows: 1
-
-  * - Macro
-    - Implemented
-    - C23 Standard Section
-    - POSIX Docs
-  * - EDOM
-    -
-    - 7.5
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/basedefs/errno.h.html>`__
-  * - EILSEQ
-    -
-    - 7.5
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/basedefs/errno.h.html>`__
-  * - ERANGE
-    -
-    - 7.5
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/basedefs/errno.h.html>`__
-  * - errno
-    -
-    - 7.5
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/basedefs/errno.h.html>`__
-
diff --git a/libc/docs/headers/fenv.rst b/libc/docs/headers/fenv.rst
deleted file mode 100644
index d0e3c5d..0000000
--- a/libc/docs/headers/fenv.rst
+++ /dev/null
@@ -1,175 +0,0 @@
-.. include:: ../check.rst
-
-======
-fenv.h
-======
-
-Macros
-======
-
-.. list-table::
-  :widths: auto
-  :align: center
-  :header-rows: 1
-
-  * - Macro
-    - Implemented
-    - C23 Standard Section
-    - POSIX Docs
-  * - FE_ALL_EXCEPT
-    - |check|
-    - 7.6.12
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/basedefs/fenv.h.html>`__
-  * - FE_DEC_DOWNWARD
-    -
-    - 7.6.14
-    -
-  * - FE_DEC_TONEAREST
-    -
-    - 7.6.14
-    -
-  * - FE_DEC_TONEARESTFROMZERO
-    -
-    - 7.6.14
-    -
-  * - FE_DEC_TOWARDZERO
-    -
-    - 7.6.14
-    -
-  * - FE_DEC_UPWARD
-    -
-    - 7.6.14
-    -
-  * - FE_DFL_ENV
-    - |check|
-    - 7.6.17
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/basedefs/fenv.h.html>`__
-  * - FE_DFL_MODE
-    -
-    - 7.6.11
-    -
-  * - FE_DIVBYZERO
-    - |check|
-    - 7.6.9
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/basedefs/fenv.h.html>`__
-  * - FE_DOWNWARD
-    - |check|
-    - 7.6.13
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/basedefs/fenv.h.html>`__
-  * - FE_INEXACT
-    - |check|
-    - 7.6.9
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/basedefs/fenv.h.html>`__
-  * - FE_INVALID
-    - |check|
-    - 7.6.9
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/basedefs/fenv.h.html>`__
-  * - FE_OVERFLOW
-    - |check|
-    - 7.6.9
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/basedefs/fenv.h.html>`__
-  * - FE_TONEAREST
-    - |check|
-    - 7.6.13
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/basedefs/fenv.h.html>`__
-  * - FE_TONEARESTFROMZERO
-    -
-    - 7.6.13
-    -
-  * - FE_TOWARDZERO
-    - |check|
-    - 7.6.13
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/basedefs/fenv.h.html>`__
-  * - FE_UNDERFLOW
-    - |check|
-    - 7.6.9
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/basedefs/fenv.h.html>`__
-  * - FE_UPWARD
-    - |check|
-    - 7.6.13
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/basedefs/fenv.h.html>`__
-  * - __STDC_VERSION_FENV_H__
-    -
-    - 7.6.5
-    -
-
-Functions
-=========
-
-.. list-table::
-  :widths: auto
-  :align: center
-  :header-rows: 1
-
-  * - Function
-    - Implemented
-    - C23 Standard Section
-    - POSIX Docs
-  * - fe_dec_getround
-    -
-    - 7.6.5.3
-    -
-  * - fe_dec_setround
-    -
-    - 7.6.5.6
-    -
-  * - feclearexcept
-    - |check|
-    - 7.6.4.1
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/functions/feclearexcept.html>`__
-  * - fegetenv
-    - |check|
-    - 7.6.6.1
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/functions/fegetenv.html>`__
-  * - fegetexceptflag
-    - |check|
-    - 7.6.4.2
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/functions/fegetexceptflag.html>`__
-  * - fegetmode
-    -
-    - 7.6.5.1
-    -
-  * - fegetround
-    - |check|
-    - 7.6.5.2
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/functions/fegetround.html>`__
-  * - feholdexcept
-    - |check|
-    - 7.6.6.2
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/functions/feholdexcept.html>`__
-  * - feraiseexcept
-    - |check|
-    - 7.6.4.3
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/functions/feraiseexcept.html>`__
-  * - fesetenv
-    - |check|
-    - 7.6.6.3
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/functions/fesetenv.html>`__
-  * - fesetexcept
-    - |check|
-    - 7.6.4.4
-    -
-  * - fesetexceptflag
-    - |check|
-    - 7.6.4.5
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/functions/fesetexceptflag.html>`__
-  * - fesetmode
-    -
-    - 7.6.5.4
-    -
-  * - fesetround
-    - |check|
-    - 7.6.5.5
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/functions/fesetround.html>`__
-  * - fetestexcept
-    - |check|
-    - 7.6.4.7
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/functions/fetestexcept.html>`__
-  * - fetestexceptflag
-    - |check|
-    - 7.6.4.6
-    -
-  * - feupdateenv
-    - |check|
-    - 7.6.6.4
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/functions/feupdateenv.html>`__
diff --git a/libc/docs/headers/float.rst b/libc/docs/headers/float.rst
deleted file mode 100644
index 8ef0f3a..0000000
--- a/libc/docs/headers/float.rst
+++ /dev/null
@@ -1,227 +0,0 @@
-.. include:: ../check.rst
-
-=======
-float.h
-=======
-
-Macros
-======
-
-.. list-table::
-  :widths: auto
-  :align: center
-  :header-rows: 1
-
-  * - Macro
-    - Implemented
-    - C23 Standard Section
-    - POSIX Docs
-  * - DBL_DECIMAL_DIG
-    - |check|
-    - 5.3.5.3.3
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/basedefs/float.h.html>`__
-  * - DBL_DIG
-    - |check|
-    - 5.3.5.3.3
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/basedefs/float.h.html>`__
-  * - DBL_EPSILON
-    - |check|
-    - 5.3.5.3.3
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/basedefs/float.h.html>`__
-  * - DBL_HAS_SUBNORM
-    - |check|
-    - 5.3.5.3.3
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/basedefs/float.h.html>`__
-  * - DBL_IS_IEC_60559
-    -
-    - 5.3.5.3.3
-    -
-  * - DBL_MANT_DIG
-    - |check|
-    - 5.3.5.3.3
-    -
-  * - DBL_MAX
-    - |check|
-    - 5.3.5.3.3
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/basedefs/float.h.html>`__
-  * - DBL_MAX_10_EXP
-    - |check|
-    - 5.3.5.3.3
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/basedefs/float.h.html>`__
-  * - DBL_MAX_EXP
-    - |check|
-    - 5.3.5.3.3
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/basedefs/float.h.html>`__
-  * - DBL_MIN
-    - |check|
-    - 5.3.5.3.3
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/basedefs/float.h.html>`__
-  * - DBL_MIN_10_EXP
-    - |check|
-    - 5.3.5.3.3
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/basedefs/float.h.html>`__
-  * - DBL_MIN_EXP
-    - |check|
-    - 5.3.5.3.3
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/basedefs/float.h.html>`__
-  * - DBL_NORM_MAX
-    -
-    - 5.3.5.3.3
-    -
-  * - DBL_SNAN
-    -
-    - 5.3.5.3.3
-    -
-  * - DBL_TRUE_MIN
-    - |check|
-    - 5.3.5.3.3
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/basedefs/float.h.html>`__
-  * - DECIMAL_DIG
-    - |check|
-    - 5.3.5.3.3
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/basedefs/float.h.html>`__
-  * - FLT_DECIMAL_DIG
-    - |check|
-    - 5.3.5.3.3
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/basedefs/float.h.html>`__
-  * - FLT_DIG
-    - |check|
-    - 5.3.5.3.3
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/basedefs/float.h.html>`__
-  * - FLT_EPSILON
-    - |check|
-    - 5.3.5.3.3
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/basedefs/float.h.html>`__
-  * - FLT_EVAL_METHOD
-    - |check|
-    - 5.3.5.3.3
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/basedefs/float.h.html>`__
-  * - FLT_HAS_SUBNORM
-    - |check|
-    - 5.3.5.3.3
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/basedefs/float.h.html>`__
-  * - FLT_IS_IEC_60559
-    -
-    - 5.3.5.3.3
-    -
-  * - FLT_MANT_DIG
-    - |check|
-    - 5.3.5.3.3
-    -
-  * - FLT_MAX
-    - |check|
-    - 5.3.5.3.3
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/basedefs/float.h.html>`__
-  * - FLT_MAX_10_EXP
-    - |check|
-    - 5.3.5.3.3
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/basedefs/float.h.html>`__
-  * - FLT_MAX_EXP
-    - |check|
-    - 5.3.5.3.3
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/basedefs/float.h.html>`__
-  * - FLT_MIN
-    - |check|
-    - 5.3.5.3.3
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/basedefs/float.h.html>`__
-  * - FLT_MIN_10_EXP
-    - |check|
-    - 5.3.5.3.3
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/basedefs/float.h.html>`__
-  * - FLT_MIN_EXP
-    - |check|
-    - 5.3.5.3.3
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/basedefs/float.h.html>`__
-  * - FLT_NORM_MAX
-    -
-    - 5.3.5.3.3
-    -
-  * - FLT_RADIX
-    - |check|
-    - 5.3.5.3.3
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/basedefs/float.h.html>`__
-  * - FLT_ROUNDS
-    - |check|
-    - 5.3.5.3.3
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/basedefs/float.h.html>`__
-  * - FLT_SNAN
-    -
-    - 5.3.5.3.3
-    -
-  * - FLT_TRUE_MIN
-    - |check|
-    - 5.3.5.3.3
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/basedefs/float.h.html>`__
-  * - INFINITY
-    -
-    - 5.3.5.3.3
-    -
-  * - LDBL_DECIMAL_DIG
-    - |check|
-    - 5.3.5.3.3
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/basedefs/float.h.html>`__
-  * - LDBL_DIG
-    - |check|
-    - 5.3.5.3.3
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/basedefs/float.h.html>`__
-  * - LDBL_EPSILON
-    - |check|
-    - 5.3.5.3.3
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/basedefs/float.h.html>`__
-  * - LDBL_HAS_SUBNORM
-    - |check|
-    - 5.3.5.3.3
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/basedefs/float.h.html>`__
-  * - LDBL_IS_IEC_60559
-    -
-    - 5.3.5.3.3
-    -
-  * - LDBL_MANT_DIG
-    - |check|
-    - 5.3.5.3.3
-    -
-  * - LDBL_MAX
-    - |check|
-    - 5.3.5.3.3
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/basedefs/float.h.html>`__
-  * - LDBL_MAX_10_EXP
-    - |check|
-    - 5.3.5.3.3
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/basedefs/float.h.html>`__
-  * - LDBL_MAX_EXP
-    - |check|
-    - 5.3.5.3.3
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/basedefs/float.h.html>`__
-  * - LDBL_MIN
-    - |check|
-    - 5.3.5.3.3
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/basedefs/float.h.html>`__
-  * - LDBL_MIN_10_EXP
-    - |check|
-    - 5.3.5.3.3
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/basedefs/float.h.html>`__
-  * - LDBL_MIN_EXP
-    - |check|
-    - 5.3.5.3.3
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/basedefs/float.h.html>`__
-  * - LDBL_NORM_MAX
-    -
-    - 5.3.5.3.3
-    -
-  * - LDBL_SNAN
-    -
-    - 5.3.5.3.3
-    -
-  * - LDBL_TRUE_MIN
-    - |check|
-    - 5.3.5.3.3
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/basedefs/float.h.html>`__
-  * - NAN
-    -
-    - 5.3.5.3.3
-    -
-  * - __STDC_VERSION_FLOAT_H__
-    -
-    - 7.7
-    -
-
diff --git a/libc/docs/headers/inttypes.rst b/libc/docs/headers/inttypes.rst
deleted file mode 100644
index 9269b40..0000000
--- a/libc/docs/headers/inttypes.rst
+++ /dev/null
@@ -1,42 +0,0 @@
-.. include:: ../check.rst
-
-==========
-inttypes.h
-==========
-
-Functions
-=========
-
-.. list-table::
-  :widths: auto
-  :align: center
-  :header-rows: 1
-
-  * - Function
-    - Implemented
-    - C23 Standard Section
-    - POSIX Docs
-  * - imaxabs
-    - |check|
-    - 7.8.2.1
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/functions/imaxabs.html>`__
-  * - imaxdiv
-    - |check|
-    - 7.8.2.2
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/functions/imaxdiv.html>`__
-  * - strtoimax
-    - |check|
-    - 7.8.2.3
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/functions/strtoimax.html>`__
-  * - strtoumax
-    - |check|
-    - 7.8.2.3
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/functions/strtoumax.html>`__
-  * - wcstoimax
-    -
-    - 7.8.2.4
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/functions/wcstoimax.html>`__
-  * - wcstoumax
-    -
-    - 7.8.2.4
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/functions/wcstoumax.html>`__
diff --git a/libc/docs/headers/locale.rst b/libc/docs/headers/locale.rst
deleted file mode 100644
index c97d1f6..0000000
--- a/libc/docs/headers/locale.rst
+++ /dev/null
@@ -1,83 +0,0 @@
-.. include:: ../check.rst
-
-========
-locale.h
-========
-
-Macros
-======
-
-.. list-table::
-  :widths: auto
-  :align: center
-  :header-rows: 1
-
-  * - Macro
-    - Implemented
-    - C23 Standard Section
-    - POSIX Docs
-  * - LC_ALL
-    - |check|
-    - 7.11
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/basedefs/locale.h.html>`__
-  * - LC_COLLATE
-    - |check|
-    - 7.11
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/basedefs/locale.h.html>`__
-  * - LC_CTYPE
-    - |check|
-    - 7.11
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/basedefs/locale.h.html>`__
-  * - LC_MONETARY
-    - |check|
-    - 7.11
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/basedefs/locale.h.html>`__
-  * - LC_NUMERIC
-    - |check|
-    - 7.11
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/basedefs/locale.h.html>`__
-  * - LC_TIME
-    - |check|
-    - 7.11
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/basedefs/locale.h.html>`__
-
-Functions
-=========
-
-.. list-table::
-  :widths: auto
-  :align: center
-  :header-rows: 1
-
-  * - Function
-    - Implemented
-    - C23 Standard Section
-    - POSIX Docs
-  * - duplocale
-    - |check|
-    -
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/functions/duplocale.html>`__
-  * - freelocale
-    - |check|
-    -
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/functions/freelocale.html>`__
-  * - getlocalename_l
-    -
-    -
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/functions/getlocalename_l.html>`__
-  * - localeconv
-    - |check|
-    - 7.11.2.1
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/functions/localeconv.html>`__
-  * - newlocale
-    - |check|
-    -
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/functions/newlocale.html>`__
-  * - setlocale
-    - |check|
-    - 7.11.1.1
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/functions/setlocale.html>`__
-  * - uselocale
-    - |check|
-    -
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/functions/uselocale.html>`__
diff --git a/libc/docs/headers/setjmp.rst b/libc/docs/headers/setjmp.rst
deleted file mode 100644
index b009113..0000000
--- a/libc/docs/headers/setjmp.rst
+++ /dev/null
@@ -1,43 +0,0 @@
-.. include:: ../check.rst
-
-========
-setjmp.h
-========
-
-Macros
-======
-
-.. list-table::
-  :widths: auto
-  :align: center
-  :header-rows: 1
-
-  * - Macro
-    - Implemented
-    - C23 Standard Section
-    - POSIX.1-2024 Standard Section
-  * - __STDC_VERSION_SETJMP_H__
-    -
-    - 7.13.2
-    -
-
-Functions
-=========
-
-.. list-table::
-  :widths: auto
-  :align: center
-  :header-rows: 1
-
-  * - Function
-    - Implemented
-    - C23 Standard Section
-    - POSIX.1-2024 Standard Section
-  * - longjmp
-    - |check|
-    - 7.13.2.1
-    -
-  * - setjmp
-    - |check|
-    - 7.13.1.1
-    -
diff --git a/libc/docs/headers/signal.rst b/libc/docs/headers/signal.rst
deleted file mode 100644
index 4f51f61..0000000
--- a/libc/docs/headers/signal.rst
+++ /dev/null
@@ -1,207 +0,0 @@
-.. include:: ../check.rst
-
-========
-signal.h
-========
-
-Macros
-======
-
-.. list-table::
-  :widths: auto
-  :align: center
-  :header-rows: 1
-
-  * - Macro
-    - Implemented
-    - C23 Standard Section
-    - POSIX Docs
-  * - SIGABRT
-    - |check|
-    - 7.14.3
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/basedefs/signal.h.html>`__
-  * - SIGALRM
-    - |check|
-    -
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/basedefs/signal.h.html>`__
-  * - SIGBUS
-    - |check|
-    -
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/basedefs/signal.h.html>`__
-  * - SIGCHLD
-    - |check|
-    -
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/basedefs/signal.h.html>`__
-  * - SIGCONT
-    - |check|
-    -
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/basedefs/signal.h.html>`__
-  * - SIGFPE
-    - |check|
-    - 7.14.3
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/basedefs/signal.h.html>`__
-  * - SIGHUP
-    - |check|
-    -
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/basedefs/signal.h.html>`__
-  * - SIGILL
-    - |check|
-    - 7.14.3
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/basedefs/signal.h.html>`__
-  * - SIGINT
-    - |check|
-    - 7.14.3
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/basedefs/signal.h.html>`__
-  * - SIGKILL
-    - |check|
-    -
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/basedefs/signal.h.html>`__
-  * - SIGPIPE
-    - |check|
-    -
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/basedefs/signal.h.html>`__
-  * - SIGPOLL
-    - |check|
-    -
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/basedefs/signal.h.html>`__
-  * - SIGPROF
-    - |check|
-    -
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/basedefs/signal.h.html>`__
-  * - SIGQUIT
-    - |check|
-    -
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/basedefs/signal.h.html>`__
-  * - SIGRTMAX
-    - |check|
-    -
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/basedefs/signal.h.html>`__
-  * - SIGRTMIN
-    - |check|
-    -
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/basedefs/signal.h.html>`__
-  * - SIGSEGV
-    - |check|
-    - 7.14.3
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/basedefs/signal.h.html>`__
-  * - SIGSTOP
-    - |check|
-    -
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/basedefs/signal.h.html>`__
-  * - SIGSYS
-    - |check|
-    -
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/basedefs/signal.h.html>`__
-  * - SIGTERM
-    - |check|
-    - 7.14.3
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/basedefs/signal.h.html>`__
-  * - SIGTRAP
-    - |check|
-    -
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/basedefs/signal.h.html>`__
-  * - SIGTSTP
-    - |check|
-    -
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/basedefs/signal.h.html>`__
-  * - SIGTTIN
-    - |check|
-    -
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/basedefs/signal.h.html>`__
-  * - SIGTTOU
-    - |check|
-    -
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/basedefs/signal.h.html>`__
-  * - SIGURG
-    - |check|
-    -
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/basedefs/signal.h.html>`__
-  * - SIGUSR1
-    - |check|
-    -
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/basedefs/signal.h.html>`__
-  * - SIGUSR2
-    - |check|
-    -
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/basedefs/signal.h.html>`__
-  * - SIGVTALRM
-    - |check|
-    -
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/basedefs/signal.h.html>`__
-  * - SIGXCPU
-    - |check|
-    -
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/basedefs/signal.h.html>`__
-  * - SIGXFSZ
-    - |check|
-    -
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/basedefs/signal.h.html>`__
-  * - SIG_DFL
-    - |check|
-    - 7.14.3
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/basedefs/signal.h.html>`__
-  * - SIG_ERR
-    - |check|
-    - 7.14.3
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/basedefs/signal.h.html>`__
-  * - SIG_HOLD
-    -
-    -
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/basedefs/signal.h.html>`__
-  * - SIG_IGN
-    - |check|
-    - 7.14.3
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/basedefs/signal.h.html>`__
-
-Functions
-=========
-
-.. list-table::
-  :widths: auto
-  :align: center
-  :header-rows: 1
-
-  * - Function
-    - Implemented
-    - C23 Standard Section
-    - POSIX Docs
-  * - kill
-    - |check|
-    -
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/functions/kill.html>`__
-  * - raise
-    - |check|
-    - 7.14.2.1
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/functions/raise.html>`__
-  * - sigaction
-    - |check|
-    -
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/functions/sigaction.html>`__
-  * - sigaddset
-    - |check|
-    -
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/functions/sigaddset.html>`__
-  * - sigaltstack
-    - |check|
-    -
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/functions/sigaltstack.html>`__
-  * - sigdelset
-    - |check|
-    -
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/functions/sigdelset.html>`__
-  * - sigemptyset
-    - |check|
-    -
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/functions/sigemptyset.html>`__
-  * - sigfillset
-    - |check|
-    -
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/functions/sigfillset.html>`__
-  * - signal
-    - |check|
-    - 7.14.1.1
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/functions/signal.html>`__
-  * - sigprocmask
-    - |check|
-    -
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/functions/sigprocmask.html>`__
diff --git a/libc/docs/headers/stdbit.rst b/libc/docs/headers/stdbit.rst
deleted file mode 100644
index 0484d95..0000000
--- a/libc/docs/headers/stdbit.rst
+++ /dev/null
@@ -1,383 +0,0 @@
-.. include:: ../check.rst
-
-========
-stdbit.h
-========
-
-Macros
-======
-
-.. list-table::
-  :widths: auto
-  :align: center
-  :header-rows: 1
-
-  * - Macro
-    - Implemented
-    - C23 Standard Section
-    - POSIX.1-2024 Standard Section
-  * - __STDC_ENDIAN_BIG__
-    - |check|
-    - 7.18.2.2
-    -
-  * - __STDC_ENDIAN_LITTLE__
-    - |check|
-    - 7.18.2.2
-    -
-  * - __STDC_ENDIAN_NATIVE__
-    - |check|
-    - 7.18.2.2
-    -
-  * - __STDC_VERSION_STDBIT_H__
-    - |check|
-    - 7.18.1.2
-    -
-  * - stdc_bit_ceil
-    - |check|
-    - 7.18.16.1
-    -
-  * - stdc_bit_floor
-    - |check|
-    - 7.18.15.1
-    -
-  * - stdc_bit_width
-    - |check|
-    - 7.18.14.1
-    -
-  * - stdc_count_ones
-    - |check|
-    - 7.18.12.1
-    -
-  * - stdc_count_zeros
-    - |check|
-    - 7.18.11.1
-    -
-  * - stdc_first_leading_one
-    - |check|
-    - 7.18.8.1
-    -
-  * - stdc_first_leading_zero
-    - |check|
-    - 7.18.7.1
-    -
-  * - stdc_first_trailing_one
-    - |check|
-    - 7.18.10.1
-    -
-  * - stdc_first_trailing_zero
-    - |check|
-    - 7.18.9.1
-    -
-  * - stdc_has_single_bit
-    - |check|
-    - 7.18.13.1
-    -
-  * - stdc_leading_ones
-    - |check|
-    - 7.18.4.1
-    -
-  * - stdc_leading_zeros
-    - |check|
-    - 7.18.3.1
-    -
-  * - stdc_trailing_ones
-    - |check|
-    - 7.18.6.1
-    -
-  * - stdc_trailing_zeros
-    - |check|
-    - 7.18.5.1
-    -
-
-Functions
-=========
-
-.. list-table::
-  :widths: auto
-  :align: center
-  :header-rows: 1
-
-  * - Function
-    - Implemented
-    - C23 Standard Section
-    - POSIX.1-2024 Standard Section
-  * - stdc_bit_ceil_uc
-    - |check|
-    - 7.18.16
-    -
-  * - stdc_bit_ceil_ui
-    - |check|
-    - 7.18.16
-    -
-  * - stdc_bit_ceil_ul
-    - |check|
-    - 7.18.16
-    -
-  * - stdc_bit_ceil_ull
-    - |check|
-    - 7.18.16
-    -
-  * - stdc_bit_ceil_us
-    - |check|
-    - 7.18.16
-    -
-  * - stdc_bit_floor_uc
-    - |check|
-    - 7.18.15
-    -
-  * - stdc_bit_floor_ui
-    - |check|
-    - 7.18.15
-    -
-  * - stdc_bit_floor_ul
-    - |check|
-    - 7.18.15
-    -
-  * - stdc_bit_floor_ull
-    - |check|
-    - 7.18.15
-    -
-  * - stdc_bit_floor_us
-    - |check|
-    - 7.18.15
-    -
-  * - stdc_bit_width_uc
-    - |check|
-    - 7.18.14
-    -
-  * - stdc_bit_width_ui
-    - |check|
-    - 7.18.14
-    -
-  * - stdc_bit_width_ul
-    - |check|
-    - 7.18.14
-    -
-  * - stdc_bit_width_ull
-    - |check|
-    - 7.18.14
-    -
-  * - stdc_bit_width_us
-    - |check|
-    - 7.18.14
-    -
-  * - stdc_count_ones_uc
-    - |check|
-    - 7.18.12
-    -
-  * - stdc_count_ones_ui
-    - |check|
-    - 7.18.12
-    -
-  * - stdc_count_ones_ul
-    - |check|
-    - 7.18.12
-    -
-  * - stdc_count_ones_ull
-    - |check|
-    - 7.18.12
-    -
-  * - stdc_count_ones_us
-    - |check|
-    - 7.18.12
-    -
-  * - stdc_count_zeros_uc
-    - |check|
-    - 7.18.11
-    -
-  * - stdc_count_zeros_ui
-    - |check|
-    - 7.18.11
-    -
-  * - stdc_count_zeros_ul
-    - |check|
-    - 7.18.11
-    -
-  * - stdc_count_zeros_ull
-    - |check|
-    - 7.18.11
-    -
-  * - stdc_count_zeros_us
-    - |check|
-    - 7.18.11
-    -
-  * - stdc_first_leading_one_uc
-    - |check|
-    - 7.18.8
-    -
-  * - stdc_first_leading_one_ui
-    - |check|
-    - 7.18.8
-    -
-  * - stdc_first_leading_one_ul
-    - |check|
-    - 7.18.8
-    -
-  * - stdc_first_leading_one_ull
-    - |check|
-    - 7.18.8
-    -
-  * - stdc_first_leading_one_us
-    - |check|
-    - 7.18.8
-    -
-  * - stdc_first_leading_zero_uc
-    - |check|
-    - 7.18.7
-    -
-  * - stdc_first_leading_zero_ui
-    - |check|
-    - 7.18.7
-    -
-  * - stdc_first_leading_zero_ul
-    - |check|
-    - 7.18.7
-    -
-  * - stdc_first_leading_zero_ull
-    - |check|
-    - 7.18.7
-    -
-  * - stdc_first_leading_zero_us
-    - |check|
-    - 7.18.7
-    -
-  * - stdc_first_trailing_one_uc
-    - |check|
-    - 7.18.10
-    -
-  * - stdc_first_trailing_one_ui
-    - |check|
-    - 7.18.10
-    -
-  * - stdc_first_trailing_one_ul
-    - |check|
-    - 7.18.10
-    -
-  * - stdc_first_trailing_one_ull
-    - |check|
-    - 7.18.10
-    -
-  * - stdc_first_trailing_one_us
-    - |check|
-    - 7.18.10
-    -
-  * - stdc_first_trailing_zero_uc
-    - |check|
-    - 7.18.9
-    -
-  * - stdc_first_trailing_zero_ui
-    - |check|
-    - 7.18.9
-    -
-  * - stdc_first_trailing_zero_ul
-    - |check|
-    - 7.18.9
-    -
-  * - stdc_first_trailing_zero_ull
-    - |check|
-    - 7.18.9
-    -
-  * - stdc_first_trailing_zero_us
-    - |check|
-    - 7.18.9
-    -
-  * - stdc_has_single_bit_uc
-    - |check|
-    - 7.18.13
-    -
-  * - stdc_has_single_bit_ui
-    - |check|
-    - 7.18.13
-    -
-  * - stdc_has_single_bit_ul
-    - |check|
-    - 7.18.13
-    -
-  * - stdc_has_single_bit_ull
-    - |check|
-    - 7.18.13
-    -
-  * - stdc_has_single_bit_us
-    - |check|
-    - 7.18.13
-    -
-  * - stdc_leading_ones_uc
-    - |check|
-    - 7.18.4
-    -
-  * - stdc_leading_ones_ui
-    - |check|
-    - 7.18.4
-    -
-  * - stdc_leading_ones_ul
-    - |check|
-    - 7.18.4
-    -
-  * - stdc_leading_ones_ull
-    - |check|
-    - 7.18.4
-    -
-  * - stdc_leading_ones_us
-    - |check|
-    - 7.18.4
-    -
-  * - stdc_leading_zeros_uc
-    - |check|
-    - 7.18.3
-    -
-  * - stdc_leading_zeros_ui
-    - |check|
-    - 7.18.3
-    -
-  * - stdc_leading_zeros_ul
-    - |check|
-    - 7.18.3
-    -
-  * - stdc_leading_zeros_ull
-    - |check|
-    - 7.18.3
-    -
-  * - stdc_leading_zeros_us
-    - |check|
-    - 7.18.3
-    -
-  * - stdc_trailing_ones_uc
-    - |check|
-    - 7.18.6
-    -
-  * - stdc_trailing_ones_ui
-    - |check|
-    - 7.18.6
-    -
-  * - stdc_trailing_ones_ul
-    - |check|
-    - 7.18.6
-    -
-  * - stdc_trailing_ones_ull
-    - |check|
-    - 7.18.6
-    -
-  * - stdc_trailing_ones_us
-    - |check|
-    - 7.18.6
-    -
-  * - stdc_trailing_zeros_uc
-    - |check|
-    - 7.18.5
-    -
-  * - stdc_trailing_zeros_ui
-    - |check|
-    - 7.18.5
-    -
-  * - stdc_trailing_zeros_ul
-    - |check|
-    - 7.18.5
-    -
-  * - stdc_trailing_zeros_ull
-    - |check|
-    - 7.18.5
-    -
-  * - stdc_trailing_zeros_us
-    - |check|
-    - 7.18.5
-    -
diff --git a/libc/docs/headers/stdio.rst b/libc/docs/headers/stdio.rst
deleted file mode 100644
index 1833eb5..0000000
--- a/libc/docs/headers/stdio.rst
+++ /dev/null
@@ -1,359 +0,0 @@
-.. include:: ../check.rst
-
-=======
-stdio.h
-=======
-
-Macros
-======
-
-.. list-table::
-  :widths: auto
-  :align: center
-  :header-rows: 1
-
-  * - Macro
-    - Implemented
-    - C23 Standard Section
-    - POSIX Docs
-  * - BUFSIZ
-    - |check|
-    - 7.23.1
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/basedefs/stdio.h.html>`__
-  * - EOF
-    - |check|
-    - 7.23.1
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/basedefs/stdio.h.html>`__
-  * - FILENAME_MAX
-    -
-    - 7.23.1
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/basedefs/stdio.h.html>`__
-  * - FOPEN_MAX
-    -
-    - 7.23.1
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/basedefs/stdio.h.html>`__
-  * - L_ctermid
-    -
-    -
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/basedefs/stdio.h.html>`__
-  * - L_tmpnam
-    -
-    - 7.23.1
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/basedefs/stdio.h.html>`__
-  * - SEEK_CUR
-    - |check|
-    - 7.23.1
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/basedefs/stdio.h.html>`__
-  * - SEEK_END
-    - |check|
-    - 7.23.1
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/basedefs/stdio.h.html>`__
-  * - SEEK_SET
-    - |check|
-    - 7.23.1
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/basedefs/stdio.h.html>`__
-  * - TMP_MAX
-    -
-    - 7.23.1
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/basedefs/stdio.h.html>`__
-  * - _IOFBF
-    - |check|
-    - 7.23.1
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/basedefs/stdio.h.html>`__
-  * - _IOLBF
-    - |check|
-    - 7.23.1
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/basedefs/stdio.h.html>`__
-  * - _IONBF
-    - |check|
-    - 7.23.1
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/basedefs/stdio.h.html>`__
-  * - _PRINTF_NAN_LEN_MAX
-    -
-    - 7.23.1
-    -
-  * - __STDC_VERSION_STDIO_H__
-    -
-    - 7.23.1
-    -
-  * - stderr
-    - |check|
-    - 7.23.1
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/basedefs/stdio.h.html>`__
-  * - stdin
-    - |check|
-    - 7.23.1
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/basedefs/stdio.h.html>`__
-  * - stdout
-    - |check|
-    - 7.23.1
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/basedefs/stdio.h.html>`__
-
-Functions
-=========
-
-.. list-table::
-  :widths: auto
-  :align: center
-  :header-rows: 1
-
-  * - Function
-    - Implemented
-    - C23 Standard Section
-    - POSIX Docs
-  * - clearerr
-    - |check|
-    - 7.23.10.1
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/functions/clearerr.html>`__
-  * - ctermid
-    -
-    -
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/functions/ctermid.html>`__
-  * - dprintf
-    -
-    -
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/functions/dprintf.html>`__
-  * - fclose
-    - |check|
-    - 7.23.5.1
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/functions/fclose.html>`__
-  * - fdopen
-    - |check|
-    -
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/functions/fdopen.html>`__
-  * - feof
-    - |check|
-    - 7.23.10.2
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/functions/feof.html>`__
-  * - ferror
-    - |check|
-    - 7.23.10.3
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/functions/ferror.html>`__
-  * - fflush
-    - |check|
-    - 7.23.5.2
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/functions/fflush.html>`__
-  * - fgetc
-    - |check|
-    - 7.23.7.1
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/functions/fgetc.html>`__
-  * - fgetpos
-    -
-    - 7.23.9.1
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/functions/fgetpos.html>`__
-  * - fgets
-    - |check|
-    - 7.23.7.2
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/functions/fgets.html>`__
-  * - fileno
-    - |check|
-    -
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/functions/fileno.html>`__
-  * - flockfile
-    - |check|
-    -
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/functions/flockfile.html>`__
-  * - fmemopen
-    -
-    -
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/functions/fmemopen.html>`__
-  * - fopen
-    - |check|
-    - 7.23.5.3
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/functions/fopen.html>`__
-  * - fprintf
-    - |check|
-    - 7.23.6.1
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/functions/fprintf.html>`__
-  * - fputc
-    - |check|
-    - 7.23.7.3
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/functions/fputc.html>`__
-  * - fputs
-    - |check|
-    - 7.23.7.4
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/functions/fputs.html>`__
-  * - fread
-    - |check|
-    - 7.23.8.1
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/functions/fread.html>`__
-  * - freopen
-    -
-    - 7.23.5.4
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/functions/freopen.html>`__
-  * - fscanf
-    - |check|
-    - 7.23.6.2
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/functions/fscanf.html>`__
-  * - fseek
-    - |check|
-    - 7.23.9.2
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/functions/fseek.html>`__
-  * - fseeko
-    - |check|
-    -
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/functions/fseeko.html>`__
-  * - fsetpos
-    -
-    - 7.23.9.3
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/functions/fsetpos.html>`__
-  * - ftell
-    - |check|
-    - 7.23.9.4
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/functions/ftell.html>`__
-  * - ftello
-    - |check|
-    -
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/functions/ftello.html>`__
-  * - ftrylockfile
-    -
-    -
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/functions/ftrylockfile.html>`__
-  * - funlockfile
-    - |check|
-    -
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/functions/funlockfile.html>`__
-  * - fwrite
-    - |check|
-    - 7.23.8.2
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/functions/fwrite.html>`__
-  * - getc
-    - |check|
-    - 7.23.7.5
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/functions/getc.html>`__
-  * - getchar
-    - |check|
-    - 7.23.7.6
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/functions/getchar.html>`__
-  * - getdelim
-    -
-    -
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/functions/getdelim.html>`__
-  * - getline
-    -
-    -
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/functions/getline.html>`__
-  * - open_memstream
-    -
-    -
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/functions/open_memstream.html>`__
-  * - pclose
-    -
-    -
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/functions/pclose.html>`__
-  * - perror
-    -
-    - 7.23.10.4
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/functions/perror.html>`__
-  * - popen
-    -
-    -
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/functions/popen.html>`__
-  * - printf
-    - |check|
-    - 7.23.6.3
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/functions/printf.html>`__
-  * - putc
-    - |check|
-    - 7.23.7.7
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/functions/putc.html>`__
-  * - putc_unlocked
-    -
-    -
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/functions/putc_unlocked.html>`__
-  * - putchar
-    - |check|
-    - 7.23.7.8
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/functions/putchar.html>`__
-  * - putchar_unlocked
-    -
-    -
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/functions/putchar_unlocked.html>`__
-  * - puts
-    - |check|
-    - 7.23.7.9
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/functions/puts.html>`__
-  * - remove
-    - |check|
-    - 7.23.4.1
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/functions/remove.html>`__
-  * - rename
-    - |check|
-    - 7.23.4.2
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/functions/rename.html>`__
-  * - renameat
-    -
-    -
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/functions/renameat.html>`__
-  * - rewind
-    -
-    - 7.23.9.5
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/functions/rewind.html>`__
-  * - scanf
-    - |check|
-    - 7.23.6.4
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/functions/scanf.html>`__
-  * - setbuf
-    - |check|
-    - 7.23.5.5
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/functions/setbuf.html>`__
-  * - setvbuf
-    - |check|
-    - 7.23.5.6
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/functions/setvbuf.html>`__
-  * - snprintf
-    - |check|
-    - 7.23.6.5
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/functions/snprintf.html>`__
-  * - sprintf
-    - |check|
-    - 7.23.6.6
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/functions/sprintf.html>`__
-  * - sscanf
-    - |check|
-    - 7.23.6.7
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/functions/sscanf.html>`__
-  * - tmpfile
-    -
-    - 7.23.4.3
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/functions/tmpfile.html>`__
-  * - tmpnam
-    -
-    - 7.23.4.4
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/functions/tmpnam.html>`__
-  * - ungetc
-    - |check|
-    - 7.23.7.10
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/functions/ungetc.html>`__
-  * - vdprintf
-    -
-    -
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/functions/vdprintf.html>`__
-  * - vfprintf
-    - |check|
-    - 7.23.6.8
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/functions/vfprintf.html>`__
-  * - vfscanf
-    - |check|
-    - 7.23.6.9
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/functions/vfscanf.html>`__
-  * - vprintf
-    - |check|
-    - 7.23.6.10
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/functions/vprintf.html>`__
-  * - vscanf
-    - |check|
-    - 7.23.6.11
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/functions/vscanf.html>`__
-  * - vsnprintf
-    - |check|
-    - 7.23.6.12
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/functions/vsnprintf.html>`__
-  * - vsprintf
-    - |check|
-    - 7.23.6.13
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/functions/vsprintf.html>`__
-  * - vsscanf
-    - |check|
-    - 7.23.6.14
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/functions/vsscanf.html>`__
diff --git a/libc/docs/headers/stdlib.rst b/libc/docs/headers/stdlib.rst
deleted file mode 100644
index 4151f29..0000000
--- a/libc/docs/headers/stdlib.rst
+++ /dev/null
@@ -1,255 +0,0 @@
-.. include:: ../check.rst
-
-========
-stdlib.h
-========
-
-Macros
-======
-
-.. list-table::
-  :widths: auto
-  :align: center
-  :header-rows: 1
-
-  * - Macro
-    - Implemented
-    - C23 Standard Section
-    - POSIX Docs
-  * - EXIT_FAILURE
-    - |check|
-    - 7.24
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/basedefs/stdlib.h.html>`__
-  * - EXIT_SUCCESS
-    - |check|
-    - 7.24
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/basedefs/stdlib.h.html>`__
-  * - MB_CUR_MAX
-    - |check|
-    - 7.24
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/basedefs/stdlib.h.html>`__
-  * - RAND_MAX
-    - |check|
-    - 7.24
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/basedefs/stdlib.h.html>`__
-  * - __STDC_VERSION_STDLIB_H__
-    -
-    - 7.24
-    -
-
-Functions
-=========
-
-.. list-table::
-  :widths: auto
-  :align: center
-  :header-rows: 1
-
-  * - Function
-    - Implemented
-    - C23 Standard Section
-    - POSIX Docs
-  * - _Exit
-    - |check|
-    - 7.24.4.5
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/functions/_Exit.html>`__
-  * - abort
-    - |check|
-    - 7.24.4.1
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/functions/abort.html>`__
-  * - abs
-    - |check|
-    - 7.24.6.1
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/functions/abs.html>`__
-  * - aligned_alloc
-    - |check|
-    - 7.24.3.1
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/functions/aligned_alloc.html>`__
-  * - at_quick_exit
-    - |check|
-    - 7.24.4.3
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/functions/at_quick_exit.html>`__
-  * - atexit
-    - |check|
-    - 7.24.4.2
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/functions/atexit.html>`__
-  * - atof
-    - |check|
-    - 7.24.1.1
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/functions/atof.html>`__
-  * - atoi
-    - |check|
-    - 7.24.1.2
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/functions/atoi.html>`__
-  * - atol
-    - |check|
-    - 7.24.1.2
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/functions/atol.html>`__
-  * - atoll
-    - |check|
-    - 7.24.1.2
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/functions/atoll.html>`__
-  * - bsearch
-    - |check|
-    - 7.24.5.1
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/functions/bsearch.html>`__
-  * - calloc
-    - |check|
-    - 7.24.3.2
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/functions/calloc.html>`__
-  * - div
-    - |check|
-    - 7.24.6.2
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/functions/div.html>`__
-  * - exit
-    - |check|
-    - 7.24.4.4
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/functions/exit.html>`__
-  * - free
-    - |check|
-    - 7.24.3.3
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/functions/free.html>`__
-  * - free_aligned_sized
-    -
-    - 7.24.3.5
-    -
-  * - free_sized
-    -
-    - 7.24.3.4
-    -
-  * - getenv
-    - |check|
-    - 7.24.4.6
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/functions/getenv.html>`__
-  * - labs
-    - |check|
-    - 7.24.6.1
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/functions/labs.html>`__
-  * - ldiv
-    - |check|
-    - 7.24.6.2
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/functions/ldiv.html>`__
-  * - llabs
-    - |check|
-    - 7.24.6.1
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/functions/llabs.html>`__
-  * - lldiv
-    - |check|
-    - 7.24.6.2
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/functions/lldiv.html>`__
-  * - malloc
-    - |check|
-    - 7.24.3.6
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/functions/malloc.html>`__
-  * - mblen
-    -
-    - 7.24.7.1
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/functions/mblen.html>`__
-  * - mbstowcs
-    -
-    - 7.24.8.1
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/functions/mbstowcs.html>`__
-  * - mbtowc
-    -
-    - 7.24.7.2
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/functions/mbtowc.html>`__
-  * - memalignment
-    -
-    - 7.24.9.1
-    -
-  * - qsort
-    - |check|
-    - 7.24.5.2
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/functions/qsort.html>`__
-  * - quick_exit
-    - |check|
-    - 7.24.4.7
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/functions/quick_exit.html>`__
-  * - rand
-    - |check|
-    - 7.24.2.1
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/functions/rand.html>`__
-  * - realloc
-    - |check|
-    - 7.24.3.7
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/functions/realloc.html>`__
-  * - srand
-    - |check|
-    - 7.24.2.2
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/functions/srand.html>`__
-  * - strfromd
-    - |check|
-    - 7.24.1.3
-    -
-  * - strfromd128
-    -
-    - 7.24.1.4
-    -
-  * - strfromd32
-    -
-    - 7.24.1.4
-    -
-  * - strfromd64
-    -
-    - 7.24.1.4
-    -
-  * - strfromf
-    - |check|
-    - 7.24.1.3
-    -
-  * - strfroml
-    - |check|
-    - 7.24.1.3
-    -
-  * - strtod
-    - |check|
-    - 7.24.1.5
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/functions/strtod.html>`__
-  * - strtod128
-    -
-    - 7.24.1.6
-    -
-  * - strtod32
-    -
-    - 7.24.1.6
-    -
-  * - strtod64
-    -
-    - 7.24.1.6
-    -
-  * - strtof
-    - |check|
-    - 7.24.1.5
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/functions/strtof.html>`__
-  * - strtol
-    - |check|
-    - 7.24.1.7
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/functions/strtol.html>`__
-  * - strtold
-    - |check|
-    - 7.24.1.5
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/functions/strtold.html>`__
-  * - strtoll
-    - |check|
-    - 7.24.1.7
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/functions/strtoll.html>`__
-  * - strtoul
-    - |check|
-    - 7.24.1.7
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/functions/strtoul.html>`__
-  * - strtoull
-    - |check|
-    - 7.24.1.7
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/functions/strtoull.html>`__
-  * - system
-    - |check|
-    - 7.24.4.8
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/functions/system.html>`__
-  * - wcstombs
-    -
-    - 7.24.8.2
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/functions/wcstombs.html>`__
-  * - wctomb
-    -
-    - 7.24.7.3
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/functions/wctomb.html>`__
diff --git a/libc/docs/headers/string.rst b/libc/docs/headers/string.rst
deleted file mode 100644
index 2665ed8..0000000
--- a/libc/docs/headers/string.rst
+++ /dev/null
@@ -1,163 +0,0 @@
-.. include:: ../check.rst
-
-========
-string.h
-========
-
-Macros
-======
-
-.. list-table::
-  :widths: auto
-  :align: center
-  :header-rows: 1
-
-  * - Macro
-    - Implemented
-    - C23 Standard Section
-    - POSIX Docs
-  * - __STDC_VERSION_STRING_H__
-    -
-    - 7.26.1
-    -
-
-Functions
-=========
-
-.. list-table::
-  :widths: auto
-  :align: center
-  :header-rows: 1
-
-  * - Function
-    - Implemented
-    - C23 Standard Section
-    - POSIX Docs
-  * - memccpy
-    - |check|
-    - 7.26.2.2
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/functions/memccpy.html>`__
-  * - memchr
-    - |check|
-    - 7.26.5.2
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/functions/memchr.html>`__
-  * - memcmp
-    - |check|
-    - 7.26.4.1
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/functions/memcmp.html>`__
-  * - memcpy
-    - |check|
-    - 7.26.2.1
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/functions/memcpy.html>`__
-  * - memmove
-    - |check|
-    - 7.26.2.3
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/functions/memmove.html>`__
-  * - mempcpy
-    - |check|
-    - TODO: glibc extension
-    -
-  * - memset
-    - |check|
-    - 7.26.6.1
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/functions/memset.html>`__
-  * - memset_explicit
-    - |check|
-    - 7.26.6.2
-    -
-  * - stpcpy
-    - |check|
-    -
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/functions/stpcpy.html>`__
-  * - stpncpy
-    - |check|
-    -
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/functions/stpncpy.html>`__
-  * - strcat
-    - |check|
-    - 7.26.3.1
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/functions/strcat.html>`__
-  * - strchr
-    - |check|
-    - 7.26.5.3
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/functions/strchr.html>`__
-  * - strcmp
-    - |check|
-    - 7.26.4.2
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/functions/strcmp.html>`__
-  * - strcoll
-    - |check|
-    - 7.26.4.3
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/functions/strcoll.html>`__
-  * - strcoll_l
-    - |check|
-    -
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/functions/strcoll_l.html>`__
-  * - strcpy
-    - |check|
-    - 7.26.2.4
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/functions/strcpy.html>`__
-  * - strcspn
-    - |check|
-    - 7.26.5.4
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/functions/strcspn.html>`__
-  * - strdup
-    - |check|
-    - 7.26.2.6
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/functions/strdup.html>`__
-  * - strerror
-    - |check|
-    - 7.26.6.3
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/functions/strerror.html>`__
-  * - strlen
-    - |check|
-    - 7.26.6.4
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/functions/strlen.html>`__
-  * - strncat
-    - |check|
-    - 7.26.3.2
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/functions/strncat.html>`__
-  * - strncmp
-    - |check|
-    - 7.26.4.4
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/functions/strncmp.html>`__
-  * - strncpy
-    - |check|
-    - 7.26.2.5
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/functions/strncpy.html>`__
-  * - strndup
-    - |check|
-    - 7.26.2.7
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/functions/strndup.html>`__
-  * - strpbrk
-    - |check|
-    - 7.26.5.5
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/functions/strpbrk.html>`__
-  * - strrchr
-    - |check|
-    - 7.26.5.6
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/functions/strrchr.html>`__
-  * - strspn
-    - |check|
-    - 7.26.5.7
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/functions/strspn.html>`__
-  * - strstr
-    - |check|
-    - 7.26.5.8
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/functions/strstr.html>`__
-  * - strtok
-    - |check|
-    - 7.26.5.9
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/functions/strtok.html>`__
-  * - strtok_r
-    - |check|
-    -
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/functions/strtok_r.html>`__
-  * - strxfrm
-    - |check|
-    - 7.26.4.5
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/functions/strxfrm.html>`__
-  * - strxfrm_l
-    - |check|
-    -
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/functions/strxfrm_l.html>`__
diff --git a/libc/docs/headers/strings.rst b/libc/docs/headers/strings.rst
deleted file mode 100644
index effd667..0000000
--- a/libc/docs/headers/strings.rst
+++ /dev/null
@@ -1,66 +0,0 @@
-.. include:: ../check.rst
-
-=========
-strings.h
-=========
-
-Functions
-=========
-
-.. list-table::
-  :widths: auto
-  :align: center
-  :header-rows: 1
-
-  * - Function
-    - Implemented
-    - C23 Standard Section
-    - POSIX Docs
-  * - bcmp
-    - |check|
-    -
-    - `removed in POSIX.1-2008 <https://pubs.opengroup.org/onlinepubs/007904875/functions/bcmp.html>`__
-  * - bcopy
-    - |check|
-    -
-    - `removed in POSIX.1-2008 <https://pubs.opengroup.org/onlinepubs/007904875/functions/bcopy.html>`__
-  * - bzero
-    - |check|
-    -
-    - `removed in POSIX.1-2008 <https://pubs.opengroup.org/onlinepubs/007904875/functions/bzero.html>`__
-  * - ffs
-    -
-    -
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/functions/ffs.html>`__
-  * - ffsl
-    -
-    -
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/functions/ffsl.html>`__
-  * - ffsll
-    -
-    -
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/functions/ffsll.html>`__
-  * - index
-    - |check|
-    -
-    - `removed in POSIX.1-2008 <https://pubs.opengroup.org/onlinepubs/007904875/functions/index.html>`__
-  * - rindex
-    - |check|
-    -
-    - `removed in POSIX.1-2008 <https://pubs.opengroup.org/onlinepubs/007904875/functions/rindex.html>`__
-  * - strcasecmp
-    - |check|
-    -
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/functions/strcasecmp.html>`__
-  * - strcasecmp_l
-    -
-    -
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/functions/strcasecmp_l.html>`__
-  * - strncasecmp
-    - |check|
-    -
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/functions/strncasecmp.html>`__
-  * - strncasecmp_l
-    -
-    -
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/functions/strncasecmp_l.html>`__
diff --git a/libc/docs/headers/sys/mman.rst b/libc/docs/headers/sys/mman.rst
deleted file mode 100644
index e340420..0000000
--- a/libc/docs/headers/sys/mman.rst
+++ /dev/null
@@ -1,179 +0,0 @@
-.. include:: ../../check.rst
-
-==========
-sys/mman.h
-==========
-
-Macros
-======
-
-.. list-table::
-  :widths: auto
-  :align: center
-  :header-rows: 1
-
-  * - Macro
-    - Implemented
-    - C23 Standard Section
-    - POSIX.1-2024 Standard Section
-  * - MAP_ANON
-    -
-    -
-    - 
-  * - MAP_ANONYMOUS
-    -
-    -
-    - 
-  * - MAP_FAILED
-    - |check|
-    -
-    - 
-  * - MAP_FIXED
-    -
-    -
-    - 
-  * - MAP_PRIVATE
-    -
-    -
-    - 
-  * - MAP_SHARED
-    -
-    -
-    - 
-  * - MCL_CURRENT
-    -
-    -
-    - 
-  * - MCL_FUTURE
-    -
-    -
-    - 
-  * - MS_ASYNC
-    -
-    -
-    - 
-  * - MS_INVALIDATE
-    -
-    -
-    - 
-  * - MS_SYNC
-    -
-    -
-    - 
-  * - POSIX_MADV_DONTNEED
-    - |check|
-    -
-    - 
-  * - POSIX_MADV_NORMAL
-    - |check|
-    -
-    - 
-  * - POSIX_MADV_RANDOM
-    - |check|
-    -
-    - 
-  * - POSIX_MADV_SEQUENTIAL
-    - |check|
-    -
-    - 
-  * - POSIX_MADV_WILLNEED
-    - |check|
-    -
-    - 
-  * - POSIX_TYPED_MEM_ALLOCATE
-    -
-    -
-    - 
-  * - POSIX_TYPED_MEM_ALLOCATE_CONTIG
-    -
-    -
-    - 
-  * - POSIX_TYPED_MEM_MAP_ALLOCATABLE
-    -
-    -
-    - 
-  * - PROT_EXEC
-    -
-    -
-    - 
-  * - PROT_NONE
-    -
-    -
-    - 
-  * - PROT_READ
-    -
-    -
-    - 
-  * - PROT_WRITE
-    -
-    -
-    - 
-
-Functions
-=========
-
-.. list-table::
-  :widths: auto
-  :align: center
-  :header-rows: 1
-
-  * - Function
-    - Implemented
-    - C23 Standard Section
-    - POSIX.1-2024 Standard Section
-  * - mlock
-    - |check|
-    -
-    - 
-  * - mlockall
-    - |check|
-    -
-    - 
-  * - mmap
-    - |check|
-    -
-    - 
-  * - mprotect
-    - |check|
-    -
-    - 
-  * - msync
-    - |check|
-    -
-    - 
-  * - munlock
-    - |check|
-    -
-    - 
-  * - munlockall
-    - |check|
-    -
-    - 
-  * - munmap
-    - |check|
-    -
-    - 
-  * - posix_madvise
-    - |check|
-    -
-    - 
-  * - posix_mem_offset
-    -
-    -
-    - 
-  * - posix_typed_mem_get_info
-    -
-    -
-    - 
-  * - posix_typed_mem_open
-    -
-    -
-    - 
-  * - shm_open
-    - |check|
-    -
-    - 
-  * - shm_unlink
-    - |check|
-    -
-    - 
diff --git a/libc/docs/headers/threads.rst b/libc/docs/headers/threads.rst
deleted file mode 100644
index c2837b8..0000000
--- a/libc/docs/headers/threads.rst
+++ /dev/null
@@ -1,147 +0,0 @@
-.. include:: ../check.rst
-
-=========
-threads.h
-=========
-
-Macros
-======
-
-.. list-table::
-  :widths: auto
-  :align: center
-  :header-rows: 1
-
-  * - Macro
-    - Implemented
-    - C23 Standard Section
-    - POSIX Docs
-  * - ONCE_FLAG_INIT
-    -
-    - 7.28.1
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/basedefs/threads.h.html>`__
-  * - TSS_DTOR_ITERATIONS
-    -
-    - 7.28.1
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/basedefs/threads.h.html>`__
-  * - __STDC_NO_THREADS__
-    -
-    - 7.28.1
-    -
-  * - thread_local
-    -
-    -
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/basedefs/threads.h.html>`__
-
-Functions
-=========
-
-.. list-table::
-  :widths: auto
-  :align: center
-  :header-rows: 1
-
-  * - Function
-    - Implemented
-    - C23 Standard Section
-    - POSIX Docs
-  * - call_once
-    - |check|
-    - 7.28.2.1
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/functions/call_once.html>`__
-  * - cnd_broadcast
-    - |check|
-    - 7.28.3.1
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/functions/cnd_broadcast.html>`__
-  * - cnd_destroy
-    - |check|
-    - 7.28.3.2
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/functions/cnd_destroy.html>`__
-  * - cnd_init
-    - |check|
-    - 7.28.3.3
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/functions/cnd_init.html>`__
-  * - cnd_signal
-    - |check|
-    - 7.28.3.4
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/functions/cnd_signal.html>`__
-  * - cnd_timedwait
-    -
-    - 7.28.3.5
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/functions/cnd_timedwait.html>`__
-  * - cnd_wait
-    - |check|
-    - 7.28.3.6
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/functions/cnd_wait.html>`__
-  * - mtx_destroy
-    - |check|
-    - 7.28.4.2
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/functions/mtx_destroy.html>`__
-  * - mtx_init
-    - |check|
-    - 7.28.4.3
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/functions/mtx_init.html>`__
-  * - mtx_lock
-    - |check|
-    - 7.28.4.4
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/functions/mtx_lock.html>`__
-  * - mtx_timedlock
-    -
-    - 7.28.4.5
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/functions/mtx_timedlock.html>`__
-  * - mtx_trylock
-    -
-    - 7.28.4.6
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/functions/mtx_trylock.html>`__
-  * - mtx_unlock
-    - |check|
-    - 7.28.4.7
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/functions/mtx_unlock.html>`__
-  * - thrd_create
-    - |check|
-    - 7.28.5.1
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/functions/thrd_create.html>`__
-  * - thrd_current
-    - |check|
-    - 7.28.5.2
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/functions/thrd_current.html>`__
-  * - thrd_detach
-    - |check|
-    - 7.28.5.3
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/functions/thrd_detach.html>`__
-  * - thrd_equal
-    - |check|
-    - 7.28.5.4
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/functions/thrd_equal.html>`__
-  * - thrd_exit
-    - |check|
-    - 7.28.5.5
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/functions/thrd_exit.html>`__
-  * - thrd_join
-    - |check|
-    - 7.28.5.6
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/functions/thrd_join.html>`__
-  * - thrd_sleep
-    -
-    - 7.28.5.7
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/functions/thrd_sleep.html>`__
-  * - thrd_yield
-    -
-    - 7.28.5.8
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/functions/thrd_yield.html>`__
-  * - tss_create
-    - |check|
-    - 7.28.6.1
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/functions/tss_create.html>`__
-  * - tss_delete
-    - |check|
-    - 7.28.6.2
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/functions/tss_delete.html>`__
-  * - tss_get
-    - |check|
-    - 7.28.6.3
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/functions/tss_get.html>`__
-  * - tss_set
-    - |check|
-    - 7.28.6.4
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/functions/tss_set.html>`__
diff --git a/libc/docs/headers/uchar.rst b/libc/docs/headers/uchar.rst
deleted file mode 100644
index abb684b..0000000
--- a/libc/docs/headers/uchar.rst
+++ /dev/null
@@ -1,59 +0,0 @@
-.. include:: ../check.rst
-
-=======
-uchar.h
-=======
-
-Macros
-======
-
-.. list-table::
-  :widths: auto
-  :align: center
-  :header-rows: 1
-
-  * - Macro
-    - Implemented
-    - C23 Standard Section
-    - POSIX Docs
-  * - __STDC_VERSION_UCHAR_H__
-    -
-    - 7.30.1
-    -
-
-Functions
-=========
-
-.. list-table::
-  :widths: auto
-  :align: center
-  :header-rows: 1
-
-  * - Function
-    - Implemented
-    - C23 Standard Section
-    - POSIX Docs
-  * - c16rtomb
-    -
-    - 7.30.2.5
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/functions/c16rtomb.html>`__
-  * - c32rtomb
-    -
-    - 7.30.2.7
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/functions/c32rtomb.html>`__
-  * - c8rtomb
-    -
-    - 7.30.2.3
-    -
-  * - mbrtoc16
-    -
-    - 7.30.2.4
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/functions/mbrtoc16.html>`__
-  * - mbrtoc32
-    -
-    - 7.30.2.6
-    - `POSIX.1-2024 <https://pubs.opengroup.org/onlinepubs/9799919799/functions/mbrtoc32.html>`__
-  * - mbrtoc8
-    -
-    - 7.30.2.2
-    -
diff --git a/libc/docs/headers/wchar.rst b/libc/docs/headers/wchar.rst
deleted file mode 100644
index 89a1e7b..0000000
--- a/libc/docs/headers/wchar.rst
+++ /dev/null
@@ -1,287 +0,0 @@
-.. include:: ../check.rst
-
-=======
-wchar.h
-=======
-
-Macros
-======
-
-.. list-table::
-  :widths: auto
-  :align: center
-  :header-rows: 1
-
-  * - Macro
-    - Implemented
-    - C23 Standard Section
-    - POSIX Docs
-  * - WEOF
-    - |check|
-    - 7.31.1
-    -
-  * - __STDC_VERSION_WCHAR_H__
-    -
-    - 7.31.1
-    -
-
-Functions
-=========
-
-.. list-table::
-  :widths: auto
-  :align: center
-  :header-rows: 1
-
-  * - Function
-    - Implemented
-    - C23 Standard Section
-    - POSIX Docs
-  * - btowc
-    - |check|
-    - 7.31.6.2.1
-    -
-  * - fgetwc
-    -
-    - 7.31.3.1
-    -
-  * - fgetws
-    -
-    - 7.31.3.2
-    -
-  * - fputwc
-    -
-    - 7.31.3.3
-    -
-  * - fputws
-    -
-    - 7.31.3.4
-    -
-  * - fwide
-    -
-    - 7.31.3.5
-    -
-  * - fwprintf
-    -
-    - 7.31.2.2
-    -
-  * - fwscanf
-    -
-    - 7.31.2.3
-    -
-  * - getwc
-    -
-    - 7.31.3.6
-    -
-  * - getwchar
-    -
-    - 7.31.3.7
-    -
-  * - mbrlen
-    -
-    - 7.31.6.4.2
-    -
-  * - mbrtowc
-    -
-    - 7.31.6.4.3
-    -
-  * - mbsinit
-    -
-    - 7.31.6.3.1
-    -
-  * - mbsrtowcs
-    -
-    - 7.31.6.5.2
-    -
-  * - putwc
-    -
-    - 7.31.3.8
-    -
-  * - putwchar
-    -
-    - 7.31.3.9
-    -
-  * - swprintf
-    -
-    - 7.31.2.4
-    -
-  * - swscanf
-    -
-    - 7.31.2.5
-    -
-  * - ungetwc
-    -
-    - 7.31.3.10
-    -
-  * - vfwprintf
-    -
-    - 7.31.2.6
-    -
-  * - vfwscanf
-    -
-    - 7.31.2.7
-    -
-  * - vswprintf
-    -
-    - 7.31.2.8
-    -
-  * - vswscanf
-    -
-    - 7.31.2.9
-    -
-  * - vwprintf
-    -
-    - 7.31.2.10
-    -
-  * - vwscanf
-    -
-    - 7.31.2.11
-    -
-  * - wcrtomb
-    -
-    - 7.31.6.4.4
-    -
-  * - wcscat
-    -
-    - 7.31.4.4.1
-    -
-  * - wcschr
-    -
-    - 7.31.4.6.2
-    -
-  * - wcscmp
-    -
-    - 7.31.4.5.2
-    -
-  * - wcscoll
-    -
-    - 7.31.4.5.3
-    -
-  * - wcscpy
-    -
-    - 7.31.4.3.1
-    -
-  * - wcscspn
-    -
-    - 7.31.4.6.3
-    -
-  * - wcsftime
-    -
-    - 7.31.5.1
-    -
-  * - wcslen
-    -
-    - 7.31.4.7.1
-    -
-  * - wcsncat
-    -
-    - 7.31.4.4.2
-    -
-  * - wcsncmp
-    -
-    - 7.31.4.5.4
-    -
-  * - wcsncpy
-    -
-    - 7.31.4.3.2
-    -
-  * - wcspbrk
-    -
-    - 7.31.4.6.4
-    -
-  * - wcsrchr
-    -
-    - 7.31.4.6.5
-    -
-  * - wcsrtombs
-    -
-    - 7.31.6.5.3
-    -
-  * - wcsspn
-    -
-    - 7.31.4.6.6
-    -
-  * - wcsstr
-    -
-    - 7.31.4.6.7
-    -
-  * - wcstod
-    -
-    - 7.31.4.2.2
-    -
-  * - wcstod128
-    -
-    - 7.31.4.2.3
-    -
-  * - wcstod32
-    -
-    - 7.31.4.2.3
-    -
-  * - wcstod64
-    -
-    - 7.31.4.2.3
-    -
-  * - wcstof
-    -
-    - 7.31.4.2.2
-    -
-  * - wcstok
-    -
-    - 7.31.4.6.8
-    -
-  * - wcstol
-    -
-    - 7.31.4.2.4
-    -
-  * - wcstold
-    -
-    - 7.31.4.2.2
-    -
-  * - wcstoll
-    -
-    - 7.31.4.2.4
-    -
-  * - wcstoul
-    -
-    - 7.31.4.2.4
-    -
-  * - wcstoull
-    -
-    - 7.31.4.2.4
-    -
-  * - wcsxfrm
-    -
-    - 7.31.4.5.5
-    -
-  * - wctob
-    - |check|
-    - 7.31.6.2.2
-    -
-  * - wmemchr
-    -
-    - 7.31.4.6.9
-    -
-  * - wmemcmp
-    -
-    - 7.31.4.5.6
-    -
-  * - wmemcpy
-    -
-    - 7.31.4.3.3
-    -
-  * - wmemmove
-    -
-    - 7.31.4.3.4
-    -
-  * - wmemset
-    -
-    - 7.31.4.7.2
-    -
-  * - wprintf
-    -
-    - 7.31.2.12
-    -
-  * - wscanf
-    -
-    - 7.31.2.13
-    -
diff --git a/libc/docs/headers/wctype.rst b/libc/docs/headers/wctype.rst
deleted file mode 100644
index 076db04..0000000
--- a/libc/docs/headers/wctype.rst
+++ /dev/null
@@ -1,86 +0,0 @@
-.. include:: ../check.rst
-
-========
-wctype.h
-========
-
-Functions
-=========
-
-.. list-table::
-  :widths: auto
-  :align: center
-  :header-rows: 1
-
-  * - Function
-    - Implemented
-    - C23 Standard Section
-    - POSIX Docs
-  * - iswalnum
-    -
-    - 7.32.2.1.1
-    -
-  * - iswalpha
-    -
-    - 7.32.2.1.2
-    -
-  * - iswblank
-    -
-    - 7.32.2.1.4
-    -
-  * - iswctype
-    -
-    - 7.32.2.2.1
-    -
-  * - iswdigit
-    -
-    - 7.32.2.1.5
-    -
-  * - iswgraph
-    -
-    - 7.32.2.1.6
-    -
-  * - iswlower
-    -
-    - 7.32.2.1.7
-    -
-  * - iswprint
-    -
-    - 7.32.2.1.8
-    -
-  * - iswpunct
-    -
-    - 7.32.2.1.9
-    -
-  * - iswspace
-    -
-    - 7.32.2.1.10
-    -
-  * - iswupper
-    -
-    - 7.32.2.1.11
-    -
-  * - iswxdigit
-    -
-    - 7.32.2.1.12
-    -
-  * - towctrans
-    -
-    - 7.32.3.2.1
-    -
-  * - towlower
-    -
-    - 7.32.3.1.1
-    -
-  * - towupper
-    -
-    - 7.32.3.1.2
-    -
-  * - wctrans
-    -
-    - 7.32.3.2.2
-    -
-  * - wctype
-    -
-    - 7.32.2.2.2
-    -
diff --git a/libc/fuzzing/stdlib/CMakeLists.txt b/libc/fuzzing/stdlib/CMakeLists.txt
index 9b3298c..3dbd640 100644
--- a/libc/fuzzing/stdlib/CMakeLists.txt
+++ b/libc/fuzzing/stdlib/CMakeLists.txt
@@ -1,9 +1,9 @@
 add_libc_fuzzer(
-  qsort_fuzz
+  quick_sort_fuzz
   SRCS
-    qsort_fuzz.cpp
+    quick_sort_fuzz.cpp
   DEPENDS
-    libc.src.stdlib.qsort
+    libc.src.stdlib.qsort_util
 )
 
 add_libc_fuzzer(
diff --git a/libc/fuzzing/stdlib/heap_sort_fuzz.cpp b/libc/fuzzing/stdlib/heap_sort_fuzz.cpp
index 876c5f9..6b00306 100644
--- a/libc/fuzzing/stdlib/heap_sort_fuzz.cpp
+++ b/libc/fuzzing/stdlib/heap_sort_fuzz.cpp
@@ -10,21 +10,10 @@
 ///
 //===----------------------------------------------------------------------===//
 
-#include "src/stdlib/heap_sort.h"
+#include "src/stdlib/qsort_util.h"
 #include <stdint.h>
 
-static int int_compare(const void *l, const void *r) {
-  int li = *reinterpret_cast<const int *>(l);
-  int ri = *reinterpret_cast<const int *>(r);
-  if (li == ri)
-    return 0;
-  if (li > ri)
-    return 1;
-  return -1;
-}
-
 extern "C" int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) {
-
   const size_t array_size = size / sizeof(int);
   if (array_size == 0)
     return 0;
@@ -34,14 +23,22 @@ extern "C" int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) {
   for (size_t i = 0; i < array_size; ++i)
     array[i] = data_as_int[i];
 
-  auto arr = LIBC_NAMESPACE::internal::Array(
-      reinterpret_cast<uint8_t *>(array), array_size, sizeof(int), int_compare);
+  const auto is_less = [](const void *a_ptr,
+                          const void *b_ptr) noexcept -> bool {
+    const int &a = *static_cast<const int *>(a_ptr);
+    const int &b = *static_cast<const int *>(b_ptr);
+
+    return a < b;
+  };
 
-  LIBC_NAMESPACE::internal::heap_sort(arr);
+  constexpr bool USE_QUICKSORT = false;
+  LIBC_NAMESPACE::internal::unstable_sort_impl<USE_QUICKSORT>(
+      array, array_size, sizeof(int), is_less);
 
-  for (size_t i = 0; i < array_size - 1; ++i)
+  for (size_t i = 0; i < array_size - 1; ++i) {
     if (array[i] > array[i + 1])
       __builtin_trap();
+  }
 
   delete[] array;
   return 0;
diff --git a/libc/fuzzing/stdlib/qsort_fuzz.cpp b/libc/fuzzing/stdlib/quick_sort_fuzz.cpp
index 5d5053c..6371e85 100644
--- a/libc/fuzzing/stdlib/qsort_fuzz.cpp
+++ b/libc/fuzzing/stdlib/quick_sort_fuzz.cpp
@@ -1,4 +1,4 @@
-//===-- qsort_fuzz.cpp ----------------------------------------------------===//
+//===-- quick_sort_fuzz.cpp------------------------------------------------===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -6,24 +6,13 @@
 //
 //===----------------------------------------------------------------------===//
 ///
-/// Fuzzing test for llvm-libc qsort implementation.
+/// Fuzzing test for llvm-libc quick_sort implementation.
 ///
 //===----------------------------------------------------------------------===//
 
-#include "src/stdlib/qsort.h"
+#include "src/stdlib/qsort_util.h"
 #include <stdint.h>
 
-static int int_compare(const void *l, const void *r) {
-  int li = *reinterpret_cast<const int *>(l);
-  int ri = *reinterpret_cast<const int *>(r);
-  if (li == ri)
-    return 0;
-  else if (li > ri)
-    return 1;
-  else
-    return -1;
-}
-
 extern "C" int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) {
   const size_t array_size = size / sizeof(int);
   if (array_size == 0)
@@ -34,7 +23,17 @@ extern "C" int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) {
   for (size_t i = 0; i < array_size; ++i)
     array[i] = data_as_int[i];
 
-  LIBC_NAMESPACE::qsort(array, array_size, sizeof(int), int_compare);
+  const auto is_less = [](const void *a_ptr,
+                          const void *b_ptr) noexcept -> bool {
+    const int &a = *static_cast<const int *>(a_ptr);
+    const int &b = *static_cast<const int *>(b_ptr);
+
+    return a < b;
+  };
+
+  constexpr bool USE_QUICKSORT = true;
+  LIBC_NAMESPACE::internal::unstable_sort_impl<USE_QUICKSORT>(
+      array, array_size, sizeof(int), is_less);
 
   for (size_t i = 0; i < array_size - 1; ++i) {
     if (array[i] > array[i + 1])
diff --git a/libc/hdr/types/CMakeLists.txt b/libc/hdr/types/CMakeLists.txt
index 5156b58..1674de1 100644
--- a/libc/hdr/types/CMakeLists.txt
+++ b/libc/hdr/types/CMakeLists.txt
@@ -86,6 +86,14 @@ add_proxy_header_library(
 )
 
 add_proxy_header_library(
+  struct_tm
+  HDRS
+    struct_tm.h
+  FULL_BUILD_DEPENDS
+    libc.include.llvm-libc-types.struct_tm
+)
+
+add_proxy_header_library(
   size_t
   HDRS
     size_t.h
diff --git a/libc/hdr/types/struct_tm.h b/libc/hdr/types/struct_tm.h
new file mode 100644
index 0000000..96c23e2
--- /dev/null
+++ b/libc/hdr/types/struct_tm.h
@@ -0,0 +1,21 @@
+//===-- Proxy for struct tm  ----------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+#ifndef LLVM_LIBC_HDR_TYPES_STRUCT_TM_H
+#define LLVM_LIBC_HDR_TYPES_STRUCT_TM_H
+
+#ifdef LIBC_FULL_BUILD
+
+#include "include/llvm-libc-types/struct_tm.h"
+
+#else
+
+#include <time.h>
+
+#endif // LIBC_FULL_BUILD
+
+#endif // LLVM_LIBC_HDR_TYPES_STRUCT_TM_H
diff --git a/libc/include/CMakeLists.txt b/libc/include/CMakeLists.txt
index 3a05c01..568bb05 100644
--- a/libc/include/CMakeLists.txt
+++ b/libc/include/CMakeLists.txt
@@ -19,11 +19,10 @@ add_header(
 
 # TODO: Can we simplify this macro expansion?
 # https://github.com/llvm/llvm-project/issues/117254
-macro(add_header_macro TARGET_NAME YAML_FILE DEF_FILE GEN_HDR DEPENDS)
+macro(add_header_macro TARGET_NAME YAML_FILE GEN_HDR DEPENDS)
   add_gen_header(
     ${TARGET_NAME}
     YAML_FILE ${YAML_FILE}
-    DEF_FILE ${DEF_FILE}
     GEN_HDR ${GEN_HDR}
     ${DEPENDS}
     ${ARGN}
@@ -32,8 +31,7 @@ endmacro()
 
 add_header_macro(
   ctype
-  ../libc/hdrgen/yaml/ctype.yaml
-  ctype.h.def
+  ../libc/include/ctype.yaml
   ctype.h
   DEPENDS
     .llvm_libc_common_h
@@ -42,8 +40,7 @@ add_header_macro(
 
 add_header_macro(
   dirent
-  ../libc/hdrgen/yaml/dirent.yaml
-  dirent.h.def
+  ../libc/include/dirent.yaml
   dirent.h
   DEPENDS
     .llvm_libc_common_h
@@ -54,8 +51,7 @@ add_header_macro(
 
 add_header_macro(
   fcntl
-  ../libc/hdrgen/yaml/fcntl.yaml
-  fcntl.h.def
+  ../libc/include/fcntl.yaml
   fcntl.h
   DEPENDS
     .llvm-libc-macros.fcntl_macros
@@ -70,8 +66,7 @@ add_header_macro(
 
 add_header_macro(
   dlfcn
-  ../libc/hdrgen/yaml/dlfcn.yaml
-  dlfcn.h.def
+  ../libc/include/dlfcn.yaml
   dlfcn.h
   DEPENDS
     .llvm-libc-macros.dlfcn_macros
@@ -80,8 +75,7 @@ add_header_macro(
 
 add_header_macro(
   features
-  ../libc/hdrgen/yaml/features.yaml
-  features.h.def
+  ../libc/include/features.yaml
   features.h
   DEPENDS
     .llvm_libc_common_h
@@ -90,8 +84,7 @@ add_header_macro(
 
 add_header_macro(
   fenv
-  ../libc/hdrgen/yaml/fenv.yaml
-  fenv.h.def
+  ../libc/include/fenv.yaml
   fenv.h
   DEPENDS
     .llvm_libc_common_h
@@ -102,8 +95,7 @@ add_header_macro(
 
 add_header_macro(
   inttypes
-  ../libc/hdrgen/yaml/inttypes.yaml
-  inttypes.h.def
+  ../libc/include/inttypes.yaml
   inttypes.h
   DEPENDS
     .llvm_libc_common_h
@@ -113,8 +105,7 @@ add_header_macro(
 
 add_header_macro(
   float
-  ../libc/hdrgen/yaml/float.yaml
-  float.h.def
+  ../libc/include/float.yaml
   float.h
   DEPENDS
     .llvm-libc-macros.float_macros
@@ -122,8 +113,7 @@ add_header_macro(
 
 add_header_macro(
   stdint
-  ../libc/hdrgen/yaml/stdint.yaml
-  stdint.h.def
+  ../libc/include/stdint.yaml
   stdint.h
   DEPENDS
     .llvm-libc-macros.stdint_macros
@@ -131,8 +121,7 @@ add_header_macro(
 
 add_header_macro(
   limits
-  ../libc/hdrgen/yaml/limits.yaml
-  limits.h.def
+  ../libc/include/limits.yaml
   limits.h
   DEPENDS
     .llvm-libc-macros.limits_macros
@@ -140,8 +129,7 @@ add_header_macro(
 
 add_header_macro(
   malloc
-  ../libc/hdrgen/yaml/malloc.yaml
-  malloc.h.def
+  ../libc/include/malloc.yaml
   malloc.h
   DEPENDS
     .llvm_libc_common_h
@@ -150,8 +138,7 @@ add_header_macro(
 
 add_header_macro(
   math
-  ../libc/hdrgen/yaml/math.yaml
-  math.h.def
+  ../libc/include/math.yaml
   math.h
   DEPENDS
     .llvm_libc_common_h
@@ -165,8 +152,7 @@ add_header_macro(
 
 add_header_macro(
   stdfix
-  ../libc/hdrgen/yaml/stdfix.yaml
-  stdfix.h.def
+  ../libc/include/stdfix.yaml
   stdfix.h
   DEPENDS
     .llvm-libc-macros.stdfix_macros
@@ -178,8 +164,7 @@ file(MAKE_DIRECTORY ${LIBC_INCLUDE_DIR}/arpa)
 
 add_header_macro(
   arpa_inet
-  ../libc/hdrgen/yaml/arpa/inet.yaml
-  arpa/inet.h.def
+  ../libc/include/arpa/inet.yaml
   arpa/inet.h
   DEPENDS
     .llvm_libc_common_h
@@ -187,8 +172,7 @@ add_header_macro(
 
 add_header_macro(
   assert
-  ../libc/hdrgen/yaml/assert.yaml
-  assert.h.def
+  ../libc/include/assert.yaml
   assert.h
   DEPENDS
     .llvm_libc_common_h
@@ -197,8 +181,7 @@ add_header_macro(
 
 add_header_macro(
   complex
-  ../libc/hdrgen/yaml/complex.yaml
-  complex.h.def
+  ../libc/include/complex.yaml
   complex.h
   DEPENDS
     .llvm_libc_common_h
@@ -207,8 +190,7 @@ add_header_macro(
 
 add_header_macro(
   setjmp
-  ../libc/hdrgen/yaml/setjmp.yaml
-  setjmp.h.def
+  ../libc/include/setjmp.yaml
   setjmp.h
   DEPENDS
     .llvm_libc_common_h
@@ -217,8 +199,7 @@ add_header_macro(
 
 add_header_macro(
   string
-  ../libc/hdrgen/yaml/string.yaml
-  string.h.def
+  ../libc/include/string.yaml
   string.h
   DEPENDS
     .llvm_libc_common_h
@@ -228,8 +209,7 @@ add_header_macro(
 
 add_header_macro(
   strings
-  ../libc/hdrgen/yaml/strings.yaml
-  strings.h.def
+  ../libc/include/strings.yaml
   strings.h
   DEPENDS
     .llvm_libc_common_h
@@ -238,8 +218,7 @@ add_header_macro(
 
 add_header_macro(
   search
-  ../libc/hdrgen/yaml/search.yaml
-  search.h.def
+  ../libc/include/search.yaml
   search.h
   DEPENDS
     .llvm_libc_common_h
@@ -252,8 +231,7 @@ add_header_macro(
 
 add_header_macro(
   time
-  ../libc/hdrgen/yaml/time.yaml
-  time.h.def
+  ../libc/include/time.yaml
   time.h
   DEPENDS
     .llvm_libc_common_h
@@ -268,8 +246,7 @@ add_header_macro(
 
 add_header_macro(
   threads
-  ../libc/hdrgen/yaml/threads.yaml
-  threads.h.def
+  ../libc/include/threads.yaml
   threads.h
   DEPENDS
     .llvm_libc_common_h
@@ -285,8 +262,7 @@ add_header_macro(
 
 add_header_macro(
   errno
-  ../libc/hdrgen/yaml/errno.yaml
-  errno.h.def
+  ../libc/include/errno.yaml
   errno.h
   DEPENDS
     .llvm-libc-macros.generic_error_number_macros
@@ -295,8 +271,7 @@ add_header_macro(
 
 add_header_macro(
   signal
-  ../libc/hdrgen/yaml/signal.yaml
-  signal.h.def
+  ../libc/include/signal.yaml
   signal.h
   DEPENDS
     .llvm-libc-macros.signal_macros
@@ -311,8 +286,7 @@ add_header_macro(
 
 add_header_macro(
   stdbit
-  ../libc/hdrgen/yaml/stdbit.yaml
-  stdbit.h.def
+  ../libc/include/stdbit.yaml
   stdbit.h
   DEPENDS
     .llvm_libc_common_h
@@ -321,8 +295,7 @@ add_header_macro(
 
 add_header_macro(
   stdckdint
-  ../libc/hdrgen/yaml/stdckdint.yaml
-  stdckdint.h.def
+  ../libc/include/stdckdint.yaml
   stdckdint.h
   DEPENDS
     .llvm_libc_common_h
@@ -331,8 +304,7 @@ add_header_macro(
 
 add_header_macro(
   stdio
-  ../libc/hdrgen/yaml/stdio.yaml
-  stdio.h.def
+  ../libc/include/stdio.yaml
   stdio.h
   DEPENDS
     .llvm-libc-macros.file_seek_macros
@@ -347,8 +319,7 @@ add_header_macro(
 
 add_header_macro(
   stdlib
-  ../libc/hdrgen/yaml/stdlib.yaml
-  stdlib.h.def
+  ../libc/include/stdlib.yaml
   stdlib.h
   DEPENDS
     .llvm_libc_common_h
@@ -366,8 +337,7 @@ add_header_macro(
 
 add_header_macro(
   unistd
-  ../libc/hdrgen/yaml/unistd.yaml
-  unistd.h.def
+  ../libc/include/unistd.yaml
   unistd.h
   DEPENDS
     .llvm_libc_common_h
@@ -385,8 +355,7 @@ add_header_macro(
 
 add_header_macro(
   pthread
-  ../libc/hdrgen/yaml/pthread.yaml
-  pthread.h.def
+  ../libc/include/pthread.yaml
   pthread.h
   DEPENDS
     .llvm-libc-macros.pthread_macros
@@ -409,8 +378,7 @@ add_header_macro(
 
 add_header_macro(
   sched
-  ../libc/hdrgen/yaml/sched.yaml
-  sched.h.def
+  ../libc/include/sched.yaml
   sched.h
   DEPENDS
     .llvm_libc_common_h
@@ -426,8 +394,7 @@ add_header_macro(
 
 add_header_macro(
   spawn
-  ../libc/hdrgen/yaml/spawn.yaml
-  spawn.h.def
+  ../libc/include/spawn.yaml
   spawn.h
   DEPENDS
     .llvm_libc_common_h
@@ -439,8 +406,7 @@ add_header_macro(
 
 add_header_macro(
   link
-  ../libc/hdrgen/yaml/link.yaml
-  link.h.def
+  ../libc/include/link.yaml
   link.h
   DEPENDS
     .llvm_libc_common_h
@@ -449,8 +415,7 @@ add_header_macro(
 
 add_header_macro(
   elf
-  ../libc/hdrgen/yaml/elf.yaml
-  elf.h.def
+  ../libc/include/elf.yaml
   elf.h
   DEPENDS
     .llvm-libc-macros.elf_macros
@@ -463,8 +428,7 @@ file(MAKE_DIRECTORY ${LIBC_INCLUDE_DIR}/sys)
 
 add_header_macro(
   sys_auxv
-  ../libc/hdrgen/yaml/sys/auxv.yaml
-  sys/auxv.h.def
+  ../libc/include/sys/auxv.yaml
   sys/auxv.h
   DEPENDS
     .llvm_libc_common_h
@@ -473,8 +437,7 @@ add_header_macro(
 
 add_header_macro(
   sys_epoll
-  ../libc/hdrgen/yaml/sys/epoll.yaml
-  sys/epoll.h.def
+  ../libc/include/sys/epoll.yaml
   sys/epoll.h
   DEPENDS
     .llvm_libc_common_h
@@ -486,8 +449,7 @@ add_header_macro(
 
 add_header_macro(
   sys_ioctl
-  ../libc/hdrgen/yaml/sys/ioctl.yaml
-  sys/ioctl.h.def
+  ../libc/include/sys/ioctl.yaml
   sys/ioctl.h
   DEPENDS
     .llvm_libc_common_h
@@ -496,8 +458,7 @@ add_header_macro(
 
 add_header_macro(
   sys_mman
-  ../libc/hdrgen/yaml/sys/mman.yaml
-  sys/mman.h.def
+  ../libc/include/sys/mman.yaml
   sys/mman.h
   DEPENDS
     .llvm_libc_common_h
@@ -509,8 +470,7 @@ add_header_macro(
 
 add_header_macro(
   sys_prctl
-  ../libc/hdrgen/yaml/sys/prctl.yaml
-  sys/prctl.h.def
+  ../libc/include/sys/prctl.yaml
   sys/prctl.h
   DEPENDS
     .llvm_libc_common_h
@@ -526,8 +486,7 @@ add_header(
 
 add_header_macro(
   sys_random
-  ../libc/hdrgen/yaml/sys/random.yaml
-  sys/random.h.def
+  ../libc/include/sys/random.yaml
   sys/random.h
   DEPENDS
     .llvm_libc_common_h
@@ -538,8 +497,7 @@ add_header_macro(
 
 add_header_macro(
   sys_resource
-  ../libc/hdrgen/yaml/sys/resource.yaml
-  sys/resource.h.def
+  ../libc/include/sys/resource.yaml
   sys/resource.h
   DEPENDS
     .llvm_libc_common_h
@@ -550,8 +508,7 @@ add_header_macro(
 
 add_header_macro(
   sys_stat
-  ../libc/hdrgen/yaml/sys/stat.yaml
-  sys/stat.h.def
+  ../libc/include/sys/stat.yaml
   sys/stat.h
   DEPENDS
     .llvm_libc_common_h
@@ -572,8 +529,7 @@ add_header_macro(
 
 add_header_macro(
   sys_select
-  ../libc/hdrgen/yaml/sys/select.yaml
-  sys/select.h.def
+  ../libc/include/sys/select.yaml
   sys/select.h
   DEPENDS
     .llvm_libc_common_h
@@ -588,8 +544,7 @@ add_header_macro(
 
 add_header_macro(
   sys_sendfile
-  ../libc/hdrgen/yaml/sys/sendfile.yaml
-  sys/sendfile.h.def
+  ../libc/include/sys/sendfile.yaml
   sys/sendfile.h
   DEPENDS
     .llvm_libc_common_h
@@ -600,8 +555,7 @@ add_header_macro(
 
 add_header_macro(
   sys_socket
-  ../libc/hdrgen/yaml/sys/socket.yaml
-  sys/socket.h.def
+  ../libc/include/sys/socket.yaml
   sys/socket.h
   DEPENDS
     .llvm_libc_common_h
@@ -616,8 +570,7 @@ add_header_macro(
 
 add_header_macro(
   sys_statvfs
-  ../libc/hdrgen/yaml/sys/statvfs.yaml
-  sys/statvfs.h.def
+  ../libc/include/sys/statvfs.yaml
   sys/statvfs.h
   DEPENDS
     .llvm_libc_common_h
@@ -626,16 +579,14 @@ add_header_macro(
 
 add_header_macro(
   sys_syscall
-  ../libc/hdrgen/yaml/sys/syscall.yaml
-  sys/syscall.h.def
+  ../libc/include/sys/syscall.yaml
   sys/syscall.h
   DEPENDS
 )
 
 add_header_macro(
   sys_time
-  ../libc/hdrgen/yaml/sys/time.yaml
-  sys/time.h.def
+  ../libc/include/sys/time.yaml
   sys/time.h
   DEPENDS
     .llvm_libc_common_h
@@ -645,8 +596,7 @@ add_header_macro(
 
 add_header_macro(
   sys_types
-  ../libc/hdrgen/yaml/sys/types.yaml
-  sys/types.h.def
+  ../libc/include/sys/types.yaml
   sys/types.h
   DEPENDS
     .llvm_libc_common_h
@@ -675,8 +625,7 @@ add_header_macro(
 
 add_header_macro(
   sys_utsname
-  ../libc/hdrgen/yaml/sys/utsname.yaml
-  sys/utsname.h.def
+  ../libc/include/sys/utsname.yaml
   sys/utsname.h
   DEPENDS
     .llvm_libc_common_h
@@ -685,8 +634,7 @@ add_header_macro(
 
 add_header_macro(
   sys_wait
-  ../libc/hdrgen/yaml/sys/wait.yaml
-  sys/wait.h.def
+  ../libc/include/sys/wait.yaml
   sys/wait.h
   DEPENDS
     .llvm_libc_common_h
@@ -698,8 +646,7 @@ add_header_macro(
 
 add_header_macro(
   termios
-  ../libc/hdrgen/yaml/termios.yaml
-  termios.h.def
+  ../libc/include/termios.yaml
   termios.h
   DEPENDS
     .llvm_libc_common_h
@@ -713,8 +660,7 @@ add_header_macro(
 
 add_header_macro(
   uchar
-  ../libc/hdrgen/yaml/uchar.yaml
-  uchar.h.def
+  ../libc/include/uchar.yaml
   uchar.h
   DEPENDS
     .llvm_libc_common_h
@@ -726,8 +672,7 @@ add_header_macro(
 
 add_header_macro(
   wchar
-  ../libc/hdrgen/yaml/wchar.yaml
-  wchar.h.def
+  ../libc/include/wchar.yaml
   wchar.h
   DEPENDS
     .llvm_libc_common_h
@@ -740,8 +685,7 @@ add_header_macro(
 
 add_header_macro(
   locale
-  ../libc/hdrgen/yaml/locale.yaml
-  locale.h.def
+  ../libc/include/locale.yaml
   locale.h
   DEPENDS
     .llvm_libc_common_h
diff --git a/libc/include/__llvm-libc-common.h b/libc/include/__llvm-libc-common.h
index d54ee7b..c63eb13 100644
--- a/libc/include/__llvm-libc-common.h
+++ b/libc/include/__llvm-libc-common.h
@@ -50,7 +50,24 @@
 #define __END_C_DECLS
 
 #undef __restrict
-#define __restrict restrict // C99 and above support the restrict keyword.
+#if __STDC_VERSION__ >= 199901L
+// C99 and above support the restrict keyword.
+#define __restrict restrict
+#elif !defined(__GNUC__)
+// GNU-compatible compilers accept the __ spelling in all modes.
+// Otherwise, omit the qualifier for pure C89 compatibility.
+#define __restrict
+#endif
+
+#undef _Noreturn
+#if __STDC_VERSION__ >= 201112L
+// In C11 and later, _Noreturn is a keyword.
+#elif defined(__GNUC__)
+// GNU-compatible compilers have an equivalent attribute.
+#define _Noreturn __attribute__((__noreturn__))
+#else
+#define _Noreturn
+#endif
 
 #undef __NOEXCEPT
 #ifdef __GNUC__
diff --git a/libc/hdrgen/yaml/arpa/inet.yaml b/libc/include/arpa/inet.yaml
index cb366e0..10cd56d 100644
--- a/libc/hdrgen/yaml/arpa/inet.yaml
+++ b/libc/include/arpa/inet.yaml
@@ -1,4 +1,5 @@
-header: arpa-inet.h
+header: arpa/inet.h
+header_template: inet.h.def
 macros: []
 types: []
 enums: []
diff --git a/libc/hdrgen/yaml/assert.yaml b/libc/include/assert.yaml
index f740554..1a3bded 100644
--- a/libc/hdrgen/yaml/assert.yaml
+++ b/libc/include/assert.yaml
@@ -1,4 +1,5 @@
 header: assert.h
+header_template: assert.h.def
 macros: []
 types: []
 enums: []
diff --git a/libc/hdrgen/yaml/complex.yaml b/libc/include/complex.yaml
index cd81de7..0531848 100644
--- a/libc/hdrgen/yaml/complex.yaml
+++ b/libc/include/complex.yaml
@@ -1,4 +1,5 @@
 header: complex.h
+header_template: complex.h.def
 macros: []
 types:
   - type_name: cfloat16
diff --git a/libc/hdrgen/yaml/ctype.yaml b/libc/include/ctype.yaml
index b4823c3..6238f1b 100644
--- a/libc/hdrgen/yaml/ctype.yaml
+++ b/libc/include/ctype.yaml
@@ -1,4 +1,5 @@
 header: ctype.h
+header_template: ctype.h.def
 macros: []
 types:
   - type_name: locale_t
diff --git a/libc/hdrgen/yaml/dirent.yaml b/libc/include/dirent.yaml
index cdccf6a..3fc522f 100644
--- a/libc/hdrgen/yaml/dirent.yaml
+++ b/libc/include/dirent.yaml
@@ -1,4 +1,5 @@
 header: dirent.h
+header_template: dirent.h.def
 macros: []
 types:
   - type_name: struct_dirent
diff --git a/libc/hdrgen/yaml/dlfcn.yaml b/libc/include/dlfcn.yaml
index 725ee705..9e8803c 100644
--- a/libc/hdrgen/yaml/dlfcn.yaml
+++ b/libc/include/dlfcn.yaml
@@ -1,4 +1,5 @@
 header: dlfcn.h
+header_template: dlfcn.h.def
 macros:
   - macro_name: RTLD_LAZY
     macro_value: null
diff --git a/libc/hdrgen/yaml/elf.yaml b/libc/include/elf.yaml
index 2e9db32..f78ae82 100644
--- a/libc/hdrgen/yaml/elf.yaml
+++ b/libc/include/elf.yaml
@@ -1,4 +1,5 @@
 header: elf.h
+header_template: elf.h.def
 standards:
   - Linux
 macros: []
diff --git a/libc/hdrgen/yaml/errno.yaml b/libc/include/errno.yaml
index a894063..188a9fa 100644
--- a/libc/hdrgen/yaml/errno.yaml
+++ b/libc/include/errno.yaml
@@ -1,4 +1,5 @@
 header: errno.h
+header_template: errno.h.def
 standards:
   - stdc
   - Linux
diff --git a/libc/hdrgen/yaml/fcntl.yaml b/libc/include/fcntl.yaml
index 71c0df3..78f9353 100644
--- a/libc/hdrgen/yaml/fcntl.yaml
+++ b/libc/include/fcntl.yaml
@@ -1,4 +1,5 @@
 header: fcntl.h
+header_template: fcntl.h.def
 macros: []
 types:
   - type_name: off_t
diff --git a/libc/hdrgen/yaml/features.yaml b/libc/include/features.yaml
index a18af22..726320a 100644
--- a/libc/hdrgen/yaml/features.yaml
+++ b/libc/include/features.yaml
@@ -1,4 +1,5 @@
 header: features.h
+header_template: features.h.def
 standards:
   - stdc
 macros: []
diff --git a/libc/hdrgen/yaml/fenv.yaml b/libc/include/fenv.yaml
index 1010efc..1ecaf63 100644
--- a/libc/hdrgen/yaml/fenv.yaml
+++ b/libc/include/fenv.yaml
@@ -1,4 +1,5 @@
 header: fenv.h
+header_template: fenv.h.def
 macros: []
 types:
   - type_name: fenv_t
diff --git a/libc/hdrgen/yaml/float.yaml b/libc/include/float.yaml
index 63639a6..21df651 100644
--- a/libc/hdrgen/yaml/float.yaml
+++ b/libc/include/float.yaml
@@ -1,4 +1,5 @@
 header: float.h
+header_template: float.h.def
 standards:
   - stdc
 macros: []
diff --git a/libc/hdrgen/yaml/inttypes.yaml b/libc/include/inttypes.yaml
index ad636cc..d5dec5b 100644
--- a/libc/hdrgen/yaml/inttypes.yaml
+++ b/libc/include/inttypes.yaml
@@ -1,4 +1,5 @@
 header: inttypes.h
+header_template: inttypes.h.def
 macros: []
 types:
   - type_name: imaxdiv_t
diff --git a/libc/hdrgen/yaml/limits.yaml b/libc/include/limits.yaml
index bf33ed2..b664041 100644
--- a/libc/hdrgen/yaml/limits.yaml
+++ b/libc/include/limits.yaml
@@ -1,4 +1,5 @@
 header: limits.h
+header_template: limits.h.def
 standards:
   - stdc
 macros: []
diff --git a/libc/hdrgen/yaml/link.yaml b/libc/include/link.yaml
index d1963a8..1cd609e 100644
--- a/libc/hdrgen/yaml/link.yaml
+++ b/libc/include/link.yaml
@@ -1,4 +1,5 @@
 header: link.h
+header_template: link.h.def
 standards:
   - Linux
 macros: []
diff --git a/libc/hdrgen/yaml/locale.yaml b/libc/include/locale.yaml
index 7da7966..9ff53c1 100644
--- a/libc/hdrgen/yaml/locale.yaml
+++ b/libc/include/locale.yaml
@@ -1,4 +1,5 @@
 header: locale.h
+header_template: locale.h.def
 functions:
   - name: localeconv
     standards:
diff --git a/libc/hdrgen/yaml/malloc.yaml b/libc/include/malloc.yaml
index 8db4f3a..ec73c90 100644
--- a/libc/hdrgen/yaml/malloc.yaml
+++ b/libc/include/malloc.yaml
@@ -1,4 +1,5 @@
 header: malloc.h
+header_template: malloc.h.def
 macros: []
 types: []
 enums: []
diff --git a/libc/hdrgen/yaml/math.yaml b/libc/include/math.yaml
index 3b8caec..831d045 100644
--- a/libc/hdrgen/yaml/math.yaml
+++ b/libc/include/math.yaml
@@ -1,4 +1,5 @@
 header: math.h
+header_template: math.h.def
 macros: []
 types:
   - type_name: float_t
diff --git a/libc/hdrgen/yaml/pthread.yaml b/libc/include/pthread.yaml
index b9068c3..4f386bd 100644
--- a/libc/hdrgen/yaml/pthread.yaml
+++ b/libc/include/pthread.yaml
@@ -1,4 +1,5 @@
 header: pthread.h
+header_template: pthread.h.def
 macros: []
 types:
   - type_name: pthread_t
diff --git a/libc/hdrgen/yaml/sched.yaml b/libc/include/sched.yaml
index 2d4876b..57871f5 100644
--- a/libc/hdrgen/yaml/sched.yaml
+++ b/libc/include/sched.yaml
@@ -1,4 +1,5 @@
 header: sched.h
+header_template: sched.h.def
 macros: []
 types:
   - type_name: struct_timespec
diff --git a/libc/hdrgen/yaml/search.yaml b/libc/include/search.yaml
index a0c73bc..b7ce06d 100644
--- a/libc/hdrgen/yaml/search.yaml
+++ b/libc/include/search.yaml
@@ -1,4 +1,5 @@
 header: search.h
+header_template: search.h.def
 macros: []
 types:
   - type_name: struct_hsearch_data
diff --git a/libc/hdrgen/yaml/setjmp.yaml b/libc/include/setjmp.yaml
index 68e3ff0..2c4f7fb 100644
--- a/libc/hdrgen/yaml/setjmp.yaml
+++ b/libc/include/setjmp.yaml
@@ -1,4 +1,5 @@
 header: setjmp.h
+header_template: setjmp.h.def
 macros: []
 types:
   - type_name: jmp_buf
diff --git a/libc/hdrgen/yaml/signal.yaml b/libc/include/signal.yaml
index c66abb1..576e7757 100644
--- a/libc/hdrgen/yaml/signal.yaml
+++ b/libc/include/signal.yaml
@@ -1,4 +1,5 @@
 header: signal.h
+header_template: signal.h.def
 macros: []
 types:
   - type_name: pid_t
diff --git a/libc/hdrgen/yaml/spawn.yaml b/libc/include/spawn.yaml
index be3f4e9..e725ab9 100644
--- a/libc/hdrgen/yaml/spawn.yaml
+++ b/libc/include/spawn.yaml
@@ -1,4 +1,5 @@
 header: spawn.h
+header_template: spawn.h.def
 macros: []
 types:
   - type_name: posix_spawn_file_actions_t
diff --git a/libc/hdrgen/yaml/stdbit.yaml b/libc/include/stdbit.yaml
index 25d2d32..e9bd6b3 100644
--- a/libc/hdrgen/yaml/stdbit.yaml
+++ b/libc/include/stdbit.yaml
@@ -1,4 +1,5 @@
 header: stdbit.h
+header_template: stdbit.h.def
 macros: []
 types: []
 enums: []
diff --git a/libc/hdrgen/yaml/stdckdint.yaml b/libc/include/stdckdint.yaml
index ea8fc47..e8b2e80 100644
--- a/libc/hdrgen/yaml/stdckdint.yaml
+++ b/libc/include/stdckdint.yaml
@@ -1,4 +1,5 @@
 header: stdckdint.h
+header_template: stdckdint.h.def
 standards:
   - stdc
 macros: []
diff --git a/libc/hdrgen/yaml/stdfix.yaml b/libc/include/stdfix.yaml
index 9787eab..7b3bdba 100644
--- a/libc/hdrgen/yaml/stdfix.yaml
+++ b/libc/include/stdfix.yaml
@@ -1,4 +1,5 @@
 header: stdfix.h
+header_template: stdfix.h.def
 macros: []
 types: 
   - type_name: stdfix-types
diff --git a/libc/hdrgen/yaml/stdint.yaml b/libc/include/stdint.yaml
index 8887f59..d583a10 100644
--- a/libc/hdrgen/yaml/stdint.yaml
+++ b/libc/include/stdint.yaml
@@ -1,4 +1,5 @@
 header: stdint.h
+header_template: stdint.h.def
 standards:
   - stdc
 macros: []
diff --git a/libc/hdrgen/yaml/stdio.yaml b/libc/include/stdio.yaml
index fd116bb..2619984 100644
--- a/libc/hdrgen/yaml/stdio.yaml
+++ b/libc/include/stdio.yaml
@@ -1,4 +1,5 @@
 header: stdio.h
+header_template: stdio.h.def
 macros:
   - macro_name: stdout
     macro_value: stdout
diff --git a/libc/hdrgen/yaml/stdlib.yaml b/libc/include/stdlib.yaml
index c6c95e4..4b68f27 100644
--- a/libc/hdrgen/yaml/stdlib.yaml
+++ b/libc/include/stdlib.yaml
@@ -1,4 +1,5 @@
 header: stdlib.h
+header_template: stdlib.h.def
 standards:
   - stdc
 macros: []
diff --git a/libc/hdrgen/yaml/string.yaml b/libc/include/string.yaml
index af1750e..deded30 100644
--- a/libc/hdrgen/yaml/string.yaml
+++ b/libc/include/string.yaml
@@ -1,4 +1,5 @@
 header: string.h
+header_template: string.h.def
 macros: []
 types:
   - type_name: size_t
diff --git a/libc/hdrgen/yaml/strings.yaml b/libc/include/strings.yaml
index ca91b62..e672dca 100644
--- a/libc/hdrgen/yaml/strings.yaml
+++ b/libc/include/strings.yaml
@@ -1,4 +1,5 @@
 header: strings.h
+header_template: strings.h.def
 macros: []
 types: []
 enums: []
diff --git a/libc/hdrgen/yaml/sys/auxv.yaml b/libc/include/sys/auxv.yaml
index 9d546b3..82ecee7 100644
--- a/libc/hdrgen/yaml/sys/auxv.yaml
+++ b/libc/include/sys/auxv.yaml
@@ -1,4 +1,5 @@
-header: sys-auxv.h
+header: sys/auxv.h
+header_template: auxv.h.def
 macros: []
 types: []
 enums: []
diff --git a/libc/hdrgen/yaml/sys/epoll.yaml b/libc/include/sys/epoll.yaml
index ee188c1..996eb78 100644
--- a/libc/hdrgen/yaml/sys/epoll.yaml
+++ b/libc/include/sys/epoll.yaml
@@ -1,4 +1,5 @@
-header: sys-epoll.h
+header: sys/epoll.h
+header_template: epoll.h.def
 macros: []
 types:
   - type_name: struct_epoll_event
diff --git a/libc/hdrgen/yaml/sys/ioctl.yaml b/libc/include/sys/ioctl.yaml
index ffe73a8..5f7b7f3 100644
--- a/libc/hdrgen/yaml/sys/ioctl.yaml
+++ b/libc/include/sys/ioctl.yaml
@@ -1,4 +1,5 @@
-header: sys-ioctl.h
+header: sys/ioctl.h
+header_template: ioctl.h.def
 standards: POSIX
 macros: []
 types: []
diff --git a/libc/hdrgen/yaml/sys/mman.yaml b/libc/include/sys/mman.yaml
index 962ca35..8c20755 100644
--- a/libc/hdrgen/yaml/sys/mman.yaml
+++ b/libc/include/sys/mman.yaml
@@ -1,4 +1,5 @@
-header: sys-mman.h
+header: sys/mman.h
+header_template: mman.h.def
 macros: []
 types:
   - type_name: mode_t
diff --git a/libc/hdrgen/yaml/sys/prctl.yaml b/libc/include/sys/prctl.yaml
index 82374be..53f5764 100644
--- a/libc/hdrgen/yaml/sys/prctl.yaml
+++ b/libc/include/sys/prctl.yaml
@@ -1,4 +1,5 @@
-header: sys-prctl.h
+header: sys/prctl.h
+header_template: prctl.h.def
 macros: []
 types: []
 enums: []
diff --git a/libc/hdrgen/yaml/sys/random.yaml b/libc/include/sys/random.yaml
index 228bb50..4efb2fb 100644
--- a/libc/hdrgen/yaml/sys/random.yaml
+++ b/libc/include/sys/random.yaml
@@ -1,4 +1,5 @@
-header: sys-random.h
+header: sys/random.h
+header_template: random.h.def
 macros: []
 types:
   - type_name: ssize_t
diff --git a/libc/hdrgen/yaml/sys/resource.yaml b/libc/include/sys/resource.yaml
index 85ea1ad..3652d6d 100644
--- a/libc/hdrgen/yaml/sys/resource.yaml
+++ b/libc/include/sys/resource.yaml
@@ -1,4 +1,5 @@
-header: sys-resource.h
+header: sys/resource.h
+header_template: resource.h.def
 macros: []
 types:
   - type_name: struct_rlimit
diff --git a/libc/hdrgen/yaml/sys/select.yaml b/libc/include/sys/select.yaml
index c680612..6066fd3 100644
--- a/libc/hdrgen/yaml/sys/select.yaml
+++ b/libc/include/sys/select.yaml
@@ -1,4 +1,5 @@
-header: sys-select.h
+header: sys/select.h
+header_template: select.h.def
 macros: []
 types:
   - type_name: struct_timeval
diff --git a/libc/hdrgen/yaml/sys/sendfile.yaml b/libc/include/sys/sendfile.yaml
index 7e45e40..259ab83 100644
--- a/libc/hdrgen/yaml/sys/sendfile.yaml
+++ b/libc/include/sys/sendfile.yaml
@@ -1,4 +1,5 @@
-header: sys-sendfile.h
+header: sys/sendfile.h
+header_template: sendfile.h.def
 macros: []
 types:
   - type_name: ssize_t
diff --git a/libc/hdrgen/yaml/sys/socket.yaml b/libc/include/sys/socket.yaml
index 47d835f..00d5de6 100644
--- a/libc/hdrgen/yaml/sys/socket.yaml
+++ b/libc/include/sys/socket.yaml
@@ -1,4 +1,5 @@
-header: sys-socket.h
+header: sys/socket.h
+header_template: socket.h.def
 macros: []
 types:
   - type_name: struct_sockaddr_un
diff --git a/libc/hdrgen/yaml/sys/stat.yaml b/libc/include/sys/stat.yaml
index ed500f8..7f01342 100644
--- a/libc/hdrgen/yaml/sys/stat.yaml
+++ b/libc/include/sys/stat.yaml
@@ -1,4 +1,5 @@
-header: sys-stat.h
+header: sys/stat.h
+header_template: stat.h.def
 macros: []
 types:
   - type_name: blkcnt_t
diff --git a/libc/hdrgen/yaml/sys/statvfs.yaml b/libc/include/sys/statvfs.yaml
index 22e0ef2..8c1d254 100644
--- a/libc/hdrgen/yaml/sys/statvfs.yaml
+++ b/libc/include/sys/statvfs.yaml
@@ -1,4 +1,5 @@
-header: sys-statvfs.h
+header: sys/statvfs.h
+header_template: statvfs.h.def
 macros: []
 types:
   - type_name: struct_statvfs
diff --git a/libc/hdrgen/yaml/sys/syscall.yaml b/libc/include/sys/syscall.yaml
index c0a64338..879d95c 100644
--- a/libc/hdrgen/yaml/sys/syscall.yaml
+++ b/libc/include/sys/syscall.yaml
@@ -1,4 +1,5 @@
-header: sys-syscall.h
+header: sys/syscall.h
+header_template: syscall.h.def
 standards: Linux
 macros: []
 types: []
diff --git a/libc/hdrgen/yaml/sys/time.yaml b/libc/include/sys/time.yaml
index eb3dd54..687c1f8 100644
--- a/libc/hdrgen/yaml/sys/time.yaml
+++ b/libc/include/sys/time.yaml
@@ -1,4 +1,5 @@
-header: sys-time.h
+header: sys/time.h
+header_template: time.h.def
 standards: Linux
 macros: []
 types: []
diff --git a/libc/hdrgen/yaml/sys/types.yaml b/libc/include/sys/types.yaml
index 15eaf10..6fa0b44 100644
--- a/libc/hdrgen/yaml/sys/types.yaml
+++ b/libc/include/sys/types.yaml
@@ -1,4 +1,5 @@
-header: sys-types.h
+header: sys/types.h
+header_template: types.h.def
 standards: POSIX
 macros: []
 types:
diff --git a/libc/hdrgen/yaml/sys/utsname.yaml b/libc/include/sys/utsname.yaml
index eecd55b..6c7cb71 100644
--- a/libc/hdrgen/yaml/sys/utsname.yaml
+++ b/libc/include/sys/utsname.yaml
@@ -1,4 +1,5 @@
-header: sys-utsname.h
+header: sys/utsname.h
+header_template: utsname.h.def
 macros: []
 types:
   - type_name: struct_utsname
diff --git a/libc/hdrgen/yaml/sys/wait.yaml b/libc/include/sys/wait.yaml
index 4f0c69b..6257e34 100644
--- a/libc/hdrgen/yaml/sys/wait.yaml
+++ b/libc/include/sys/wait.yaml
@@ -1,4 +1,5 @@
-header: sys-wait.h
+header: sys/wait.h
+header_template: wait.h.def
 macros: []
 types:
   - type_name: siginfo_t
diff --git a/libc/hdrgen/yaml/termios.yaml b/libc/include/termios.yaml
index e9c4cd3..8815097 100644
--- a/libc/hdrgen/yaml/termios.yaml
+++ b/libc/include/termios.yaml
@@ -1,4 +1,5 @@
 header: termios.h
+header_template: termios.h.def
 macros: []
 types:
   - type_name: tcflag_t
diff --git a/libc/hdrgen/yaml/threads.yaml b/libc/include/threads.yaml
index aadcaf5..7014822 100644
--- a/libc/hdrgen/yaml/threads.yaml
+++ b/libc/include/threads.yaml
@@ -1,4 +1,5 @@
 header: threads.h
+header_template: threads.h.def
 macros:
   - macro_name: ONCE_FLAG_INIT
     macro_value: '{0}'
diff --git a/libc/hdrgen/yaml/time.yaml b/libc/include/time.yaml
index 3f745e5..b71b9ab 100644
--- a/libc/hdrgen/yaml/time.yaml
+++ b/libc/include/time.yaml
@@ -1,4 +1,5 @@
 header: time.h
+header_template: time.h.def
 macros: []
 types:
   - type_name: struct_timeval
diff --git a/libc/hdrgen/yaml/uchar.yaml b/libc/include/uchar.yaml
index 18ca840..7139197 100644
--- a/libc/hdrgen/yaml/uchar.yaml
+++ b/libc/include/uchar.yaml
@@ -1,4 +1,5 @@
 header: uchar.h
+header_template: uchar.h.def
 standards:
   - stdc
 macros: []
diff --git a/libc/hdrgen/yaml/unistd.yaml b/libc/include/unistd.yaml
index c6441c0..fada365 100644
--- a/libc/hdrgen/yaml/unistd.yaml
+++ b/libc/include/unistd.yaml
@@ -1,4 +1,5 @@
 header: unistd.h
+header_template: unistd.h.def
 macros: []
 types:
   - type_name: uid_t
diff --git a/libc/hdrgen/yaml/wchar.yaml b/libc/include/wchar.yaml
index bc824b2..27a5926b 100644
--- a/libc/hdrgen/yaml/wchar.yaml
+++ b/libc/include/wchar.yaml
@@ -1,4 +1,5 @@
 header: wchar.h
+header_template: wchar.h.def
 macros: []
 types:
   - type_name: size_t
diff --git a/libc/shared/rpc_util.h b/libc/shared/rpc_util.h
index 9406de5..687814b7 100644
--- a/libc/shared/rpc_util.h
+++ b/libc/shared/rpc_util.h
@@ -152,10 +152,10 @@ public:
 
 /// Suspend the thread briefly to assist the thread scheduler during busy loops.
 RPC_ATTRS void sleep_briefly() {
-#if defined(__NVPTX__) && defined(RPC_TARGET_IS_GPU)
+#if __has_builtin(__nvvm_reflect)
   if (__nvvm_reflect("__CUDA_ARCH") >= 700)
     asm("nanosleep.u32 64;" ::: "memory");
-#elif defined(__AMDGPU__) && defined(RPC_TARGET_IS_GPU)
+#elif __has_builtin(__builtin_amdgcn_s_sleep)
   __builtin_amdgcn_s_sleep(2);
 #elif __has_builtin(__builtin_ia32_pause)
   __builtin_ia32_pause();
diff --git a/libc/src/__support/CMakeLists.txt b/libc/src/__support/CMakeLists.txt
index 4e90aad..5090dc2 100644
--- a/libc/src/__support/CMakeLists.txt
+++ b/libc/src/__support/CMakeLists.txt
@@ -267,7 +267,9 @@ add_header_library(
   HDRS
     fixedvector.h
   DEPENDS
+    .libc_assert
     libc.src.__support.CPP.array
+    libc.src.string.memory_utils.inline_memset
 )
 
 add_header_library(
diff --git a/libc/src/__support/File/file.cpp b/libc/src/__support/File/file.cpp
index 972249f..528542c 100644
--- a/libc/src/__support/File/file.cpp
+++ b/libc/src/__support/File/file.cpp
@@ -42,7 +42,7 @@ FileIOResult File::write_unlocked_nbf(const uint8_t *data, size_t len) {
   if (pos > 0) { // If the buffer is not empty
     // Flush the buffer
     const size_t write_size = pos;
-    auto write_result = platform_write(this, buf, write_size);
+    FileIOResult write_result = platform_write(this, buf, write_size);
     pos = 0; // Buffer is now empty so reset pos to the beginning.
     // If less bytes were written than expected, then an error occurred.
     if (write_result < write_size) {
@@ -52,7 +52,7 @@ FileIOResult File::write_unlocked_nbf(const uint8_t *data, size_t len) {
     }
   }
 
-  auto write_result = platform_write(this, data, len);
+  FileIOResult write_result = platform_write(this, data, len);
   if (write_result < len)
     err = true;
   return write_result;
@@ -99,7 +99,7 @@ FileIOResult File::write_unlocked_fbf(const uint8_t *data, size_t len) {
   // is full.
   const size_t write_size = pos;
 
-  auto buf_result = platform_write(this, buf, write_size);
+  FileIOResult buf_result = platform_write(this, buf, write_size);
   size_t bytes_written = buf_result.value;
 
   pos = 0; // Buffer is now empty so reset pos to the beginning.
@@ -121,7 +121,8 @@ FileIOResult File::write_unlocked_fbf(const uint8_t *data, size_t len) {
     pos = remainder.size();
   } else {
 
-    auto result = platform_write(this, remainder.data(), remainder.size());
+    FileIOResult result =
+        platform_write(this, remainder.data(), remainder.size());
     size_t bytes_written = buf_result.value;
 
     // If less bytes were written than expected, then an error occurred. Return
@@ -190,6 +191,17 @@ FileIOResult File::read_unlocked(void *data, size_t len) {
 
   prev_op = FileOp::READ;
 
+  if (bufmode == _IONBF) { // unbuffered.
+    return read_unlocked_nbf(static_cast<uint8_t *>(data), len);
+  } else if (bufmode == _IOFBF) { // fully buffered
+    return read_unlocked_fbf(static_cast<uint8_t *>(data), len);
+  } else /*if (bufmode == _IOLBF) */ { // line buffered
+    // There is no line buffered mode for read. Use fully buffered instead.
+    return read_unlocked_fbf(static_cast<uint8_t *>(data), len);
+  }
+}
+
+size_t File::copy_data_from_buf(uint8_t *data, size_t len) {
   cpp::span<uint8_t> bufref(static_cast<uint8_t *>(buf), bufsize);
   cpp::span<uint8_t> dataref(static_cast<uint8_t *>(data), len);
 
@@ -209,32 +221,42 @@ FileIOResult File::read_unlocked(void *data, size_t len) {
   for (size_t i = 0; i < available_data; ++i)
     dataref[i] = bufref[i + pos];
   read_limit = pos = 0; // Reset the pointers.
+
+  return available_data;
+}
+
+FileIOResult File::read_unlocked_fbf(uint8_t *data, size_t len) {
+  // Read data from the buffer first.
+  size_t available_data = copy_data_from_buf(data, len);
+  if (available_data == len)
+    return available_data;
+
   // Update the dataref to reflect that fact that we have already
   // copied |available_data| into |data|.
-  dataref = cpp::span<uint8_t>(dataref.data() + available_data,
-                               dataref.size() - available_data);
-
   size_t to_fetch = len - available_data;
+  cpp::span<uint8_t> dataref(static_cast<uint8_t *>(data) + available_data,
+                             to_fetch);
+
   if (to_fetch > bufsize) {
-    auto result = platform_read(this, dataref.data(), to_fetch);
+    FileIOResult result = platform_read(this, dataref.data(), to_fetch);
     size_t fetched_size = result.value;
     if (result.has_error() || fetched_size < to_fetch) {
       if (!result.has_error())
         eof = true;
       else
         err = true;
-      return {available_data + fetched_size, result.has_error()};
+      return {available_data + fetched_size, result.error};
     }
     return len;
   }
 
   // Fetch and buffer another buffer worth of data.
-  auto result = platform_read(this, buf, bufsize);
+  FileIOResult result = platform_read(this, buf, bufsize);
   size_t fetched_size = result.value;
   read_limit += fetched_size;
   size_t transfer_size = fetched_size >= to_fetch ? to_fetch : fetched_size;
   for (size_t i = 0; i < transfer_size; ++i)
-    dataref[i] = bufref[i];
+    dataref[i] = buf[i];
   pos += transfer_size;
   if (result.has_error() || fetched_size < to_fetch) {
     if (!result.has_error())
@@ -245,6 +267,26 @@ FileIOResult File::read_unlocked(void *data, size_t len) {
   return {transfer_size + available_data, result.error};
 }
 
+FileIOResult File::read_unlocked_nbf(uint8_t *data, size_t len) {
+  // Check whether there is a character in the ungetc buffer.
+  size_t available_data = copy_data_from_buf(data, len);
+  if (available_data == len)
+    return available_data;
+
+  // Directly copy the data into |data|.
+  cpp::span<uint8_t> dataref(static_cast<uint8_t *>(data) + available_data,
+                             len - available_data);
+  FileIOResult result = platform_read(this, dataref.data(), dataref.size());
+
+  if (result.has_error() || result < dataref.size()) {
+    if (!result.has_error())
+      eof = true;
+    else
+      err = true;
+  }
+  return {result + available_data, result.error};
+}
+
 int File::ungetc_unlocked(int c) {
   // There is no meaning to unget if:
   // 1. You are trying to push back EOF.
@@ -287,7 +329,7 @@ ErrorOr<int> File::seek(off_t offset, int whence) {
   FileLock lock(this);
   if (prev_op == FileOp::WRITE && pos > 0) {
 
-    auto buf_result = platform_write(this, buf, pos);
+    FileIOResult buf_result = platform_write(this, buf, pos);
     if (buf_result.has_error() || buf_result.value < pos) {
       err = true;
       return Error(buf_result.error);
@@ -325,7 +367,7 @@ ErrorOr<off_t> File::tell() {
 
 int File::flush_unlocked() {
   if (prev_op == FileOp::WRITE && pos > 0) {
-    auto buf_result = platform_write(this, buf, pos);
+    FileIOResult buf_result = platform_write(this, buf, pos);
     if (buf_result.has_error() || buf_result.value < pos) {
       err = true;
       return buf_result.error;
diff --git a/libc/src/__support/File/file.h b/libc/src/__support/File/file.h
index 42e1d11..5c97a9c 100644
--- a/libc/src/__support/File/file.h
+++ b/libc/src/__support/File/file.h
@@ -280,6 +280,10 @@ private:
   FileIOResult write_unlocked_fbf(const uint8_t *data, size_t len);
   FileIOResult write_unlocked_nbf(const uint8_t *data, size_t len);
 
+  FileIOResult read_unlocked_fbf(uint8_t *data, size_t len);
+  FileIOResult read_unlocked_nbf(uint8_t *data, size_t len);
+  size_t copy_data_from_buf(uint8_t *data, size_t len);
+
   constexpr void adjust_buf() {
     if (read_allowed() && (buf == nullptr || bufsize == 0)) {
       // We should allow atleast one ungetc operation.
diff --git a/libc/src/__support/GPU/CMakeLists.txt b/libc/src/__support/GPU/CMakeLists.txt
index 28fd9a1..9b359f6 100644
--- a/libc/src/__support/GPU/CMakeLists.txt
+++ b/libc/src/__support/GPU/CMakeLists.txt
@@ -1,16 +1,12 @@
-if(NOT EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${LIBC_TARGET_ARCHITECTURE})
+# These utilities are GPU only.
+if(NOT LIBC_TARGET_OS_IS_GPU)
   return()
 endif()
 
-add_subdirectory(${LIBC_TARGET_ARCHITECTURE})
-set(target_gpu_utils libc.src.__support.GPU.${LIBC_TARGET_ARCHITECTURE}.${LIBC_TARGET_ARCHITECTURE}_utils)
-
 add_header_library(
   utils
   HDRS
     utils.h
-  DEPENDS
-    ${target_gpu_utils}
 )
 
 add_object_library(
@@ -21,6 +17,6 @@ add_object_library(
     allocator.h
   DEPENDS
     libc.src.__support.common
-    libc.src.__support.GPU.utils
     libc.src.__support.RPC.rpc_client
+    .utils
 )
diff --git a/libc/src/__support/GPU/amdgpu/CMakeLists.txt b/libc/src/__support/GPU/amdgpu/CMakeLists.txt
deleted file mode 100644
index f2b98fc..0000000
--- a/libc/src/__support/GPU/amdgpu/CMakeLists.txt
+++ /dev/null
@@ -1,7 +0,0 @@
-add_header_library(
-  amdgpu_utils
-  HDRS
-    utils.h
-  DEPENDS
-    libc.src.__support.common
-)
diff --git a/libc/src/__support/GPU/amdgpu/utils.h b/libc/src/__support/GPU/amdgpu/utils.h
deleted file mode 100644
index 6ab9540..0000000
--- a/libc/src/__support/GPU/amdgpu/utils.h
+++ /dev/null
@@ -1,183 +0,0 @@
-//===-------------- AMDGPU implementation of GPU utils ----------*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_LIBC_SRC___SUPPORT_GPU_AMDGPU_IO_H
-#define LLVM_LIBC_SRC___SUPPORT_GPU_AMDGPU_IO_H
-
-#include "src/__support/common.h"
-#include "src/__support/macros/config.h"
-
-#include <stdint.h>
-
-namespace LIBC_NAMESPACE_DECL {
-namespace gpu {
-
-/// Type aliases to the address spaces used by the AMDGPU backend.
-template <typename T> using Private = [[clang::opencl_private]] T;
-template <typename T> using Constant = [[clang::opencl_constant]] T;
-template <typename T> using Local = [[clang::opencl_local]] T;
-template <typename T> using Global = [[clang::opencl_global]] T;
-
-/// Returns the number of workgroups in the 'x' dimension of the grid.
-LIBC_INLINE uint32_t get_num_blocks_x() {
-  return __builtin_amdgcn_grid_size_x() / __builtin_amdgcn_workgroup_size_x();
-}
-
-/// Returns the number of workgroups in the 'y' dimension of the grid.
-LIBC_INLINE uint32_t get_num_blocks_y() {
-  return __builtin_amdgcn_grid_size_y() / __builtin_amdgcn_workgroup_size_y();
-}
-
-/// Returns the number of workgroups in the 'z' dimension of the grid.
-LIBC_INLINE uint32_t get_num_blocks_z() {
-  return __builtin_amdgcn_grid_size_z() / __builtin_amdgcn_workgroup_size_z();
-}
-
-/// Returns the total number of workgruops in the grid.
-LIBC_INLINE uint64_t get_num_blocks() {
-  return get_num_blocks_x() * get_num_blocks_y() * get_num_blocks_z();
-}
-
-/// Returns the 'x' dimension of the current AMD workgroup's id.
-LIBC_INLINE uint32_t get_block_id_x() {
-  return __builtin_amdgcn_workgroup_id_x();
-}
-
-/// Returns the 'y' dimension of the current AMD workgroup's id.
-LIBC_INLINE uint32_t get_block_id_y() {
-  return __builtin_amdgcn_workgroup_id_y();
-}
-
-/// Returns the 'z' dimension of the current AMD workgroup's id.
-LIBC_INLINE uint32_t get_block_id_z() {
-  return __builtin_amdgcn_workgroup_id_z();
-}
-
-/// Returns the absolute id of the AMD workgroup.
-LIBC_INLINE uint64_t get_block_id() {
-  return get_block_id_x() + get_num_blocks_x() * get_block_id_y() +
-         get_num_blocks_x() * get_num_blocks_y() * get_block_id_z();
-}
-
-/// Returns the number of workitems in the 'x' dimension.
-LIBC_INLINE uint32_t get_num_threads_x() {
-  return __builtin_amdgcn_workgroup_size_x();
-}
-
-/// Returns the number of workitems in the 'y' dimension.
-LIBC_INLINE uint32_t get_num_threads_y() {
-  return __builtin_amdgcn_workgroup_size_y();
-}
-
-/// Returns the number of workitems in the 'z' dimension.
-LIBC_INLINE uint32_t get_num_threads_z() {
-  return __builtin_amdgcn_workgroup_size_z();
-}
-
-/// Returns the total number of workitems in the workgroup.
-LIBC_INLINE uint64_t get_num_threads() {
-  return get_num_threads_x() * get_num_threads_y() * get_num_threads_z();
-}
-
-/// Returns the 'x' dimension id of the workitem in the current AMD workgroup.
-LIBC_INLINE uint32_t get_thread_id_x() {
-  return __builtin_amdgcn_workitem_id_x();
-}
-
-/// Returns the 'y' dimension id of the workitem in the current AMD workgroup.
-LIBC_INLINE uint32_t get_thread_id_y() {
-  return __builtin_amdgcn_workitem_id_y();
-}
-
-/// Returns the 'z' dimension id of the workitem in the current AMD workgroup.
-LIBC_INLINE uint32_t get_thread_id_z() {
-  return __builtin_amdgcn_workitem_id_z();
-}
-
-/// Returns the absolute id of the thread in the current AMD workgroup.
-LIBC_INLINE uint64_t get_thread_id() {
-  return get_thread_id_x() + get_num_threads_x() * get_thread_id_y() +
-         get_num_threads_x() * get_num_threads_y() * get_thread_id_z();
-}
-
-/// Returns the size of an AMD wavefront, either 32 or 64 depending on hardware
-/// and compilation options.
-LIBC_INLINE uint32_t get_lane_size() {
-  return __builtin_amdgcn_wavefrontsize();
-}
-
-/// Returns the id of the thread inside of an AMD wavefront executing together.
-[[clang::convergent]] LIBC_INLINE uint32_t get_lane_id() {
-  return __builtin_amdgcn_mbcnt_hi(~0u, __builtin_amdgcn_mbcnt_lo(~0u, 0u));
-}
-
-/// Returns the bit-mask of active threads in the current wavefront.
-[[clang::convergent]] LIBC_INLINE uint64_t get_lane_mask() {
-  return __builtin_amdgcn_read_exec();
-}
-
-/// Copies the value from the first active thread in the wavefront to the rest.
-[[clang::convergent]] LIBC_INLINE uint32_t broadcast_value(uint64_t,
-                                                           uint32_t x) {
-  return __builtin_amdgcn_readfirstlane(x);
-}
-
-/// Returns a bitmask of threads in the current lane for which \p x is true.
-[[clang::convergent]] LIBC_INLINE uint64_t ballot(uint64_t lane_mask, bool x) {
-  // the lane_mask & gives the nvptx semantics when lane_mask is a subset of
-  // the active threads
-  return lane_mask & __builtin_amdgcn_ballot_w64(x);
-}
-
-/// Waits for all the threads in the block to converge and issues a fence.
-[[clang::convergent]] LIBC_INLINE void sync_threads() {
-  __builtin_amdgcn_s_barrier();
-  __builtin_amdgcn_fence(__ATOMIC_ACQUIRE, "workgroup");
-}
-
-/// Waits for all pending memory operations to complete in program order.
-[[clang::convergent]] LIBC_INLINE void memory_fence() {
-  __builtin_amdgcn_fence(__ATOMIC_ACQ_REL, "");
-}
-
-/// Wait for all threads in the wavefront to converge, this is a noop on AMDGPU.
-[[clang::convergent]] LIBC_INLINE void sync_lane(uint64_t) {
-  __builtin_amdgcn_wave_barrier();
-}
-
-/// Shuffles the the lanes inside the wavefront according to the given index.
-[[clang::convergent]] LIBC_INLINE uint32_t shuffle(uint64_t, uint32_t idx,
-                                                   uint32_t x) {
-  return __builtin_amdgcn_ds_bpermute(idx << 2, x);
-}
-
-/// Returns the current value of the GPU's processor clock.
-/// NOTE: The RDNA3 and RDNA2 architectures use a 20-bit cycle counter.
-LIBC_INLINE uint64_t processor_clock() { return __builtin_readcyclecounter(); }
-
-/// Returns a fixed-frequency timestamp. The actual frequency is dependent on
-/// the card and can only be queried via the driver.
-LIBC_INLINE uint64_t fixed_frequency_clock() {
-  return __builtin_readsteadycounter();
-}
-
-/// Terminates execution of the associated wavefront.
-[[noreturn]] LIBC_INLINE void end_program() { __builtin_amdgcn_endpgm(); }
-
-/// Returns a unique identifier for the process cluster the current wavefront is
-/// executing on. Here we use the identifier for the compute unit (CU) and
-/// shader engine.
-/// FIXME: Currently unimplemented on AMDGPU until we have a simpler interface
-/// than the one at
-/// https://github.com/ROCm/clr/blob/develop/hipamd/include/hip/amd_detail/amd_device_functions.h#L899
-LIBC_INLINE uint32_t get_cluster_id() { return 0; }
-
-} // namespace gpu
-} // namespace LIBC_NAMESPACE_DECL
-
-#endif
diff --git a/libc/src/__support/GPU/generic/CMakeLists.txt b/libc/src/__support/GPU/generic/CMakeLists.txt
deleted file mode 100644
index 68ba7d1..0000000
--- a/libc/src/__support/GPU/generic/CMakeLists.txt
+++ /dev/null
@@ -1,7 +0,0 @@
-add_header_library(
-  generic_utils
-  HDRS
-    utils.h
-  DEPENDS
-    libc.src.__support.common
-)
diff --git a/libc/src/__support/GPU/generic/utils.h b/libc/src/__support/GPU/generic/utils.h
deleted file mode 100644
index 9461ef0..0000000
--- a/libc/src/__support/GPU/generic/utils.h
+++ /dev/null
@@ -1,84 +0,0 @@
-//===-------------- Generic implementation of GPU utils ---------*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_LIBC_SRC___SUPPORT_GPU_GENERIC_UTILS_H
-#define LLVM_LIBC_SRC___SUPPORT_GPU_GENERIC_UTILS_H
-
-#include "src/__support/common.h"
-#include "src/__support/macros/config.h"
-
-#include <stdint.h>
-
-namespace LIBC_NAMESPACE_DECL {
-namespace gpu {
-
-template <typename T> using Private = T;
-template <typename T> using Constant = T;
-template <typename T> using Shared = T;
-template <typename T> using Global = T;
-
-LIBC_INLINE uint32_t get_num_blocks_x() { return 1; }
-
-LIBC_INLINE uint32_t get_num_blocks_y() { return 1; }
-
-LIBC_INLINE uint32_t get_num_blocks_z() { return 1; }
-
-LIBC_INLINE uint64_t get_num_blocks() { return 1; }
-
-LIBC_INLINE uint32_t get_block_id_x() { return 0; }
-
-LIBC_INLINE uint32_t get_block_id_y() { return 0; }
-
-LIBC_INLINE uint32_t get_block_id_z() { return 0; }
-
-LIBC_INLINE uint64_t get_block_id() { return 0; }
-
-LIBC_INLINE uint32_t get_num_threads_x() { return 1; }
-
-LIBC_INLINE uint32_t get_num_threads_y() { return 1; }
-
-LIBC_INLINE uint32_t get_num_threads_z() { return 1; }
-
-LIBC_INLINE uint64_t get_num_threads() { return 1; }
-
-LIBC_INLINE uint32_t get_thread_id_x() { return 0; }
-
-LIBC_INLINE uint32_t get_thread_id_y() { return 0; }
-
-LIBC_INLINE uint32_t get_thread_id_z() { return 0; }
-
-LIBC_INLINE uint64_t get_thread_id() { return 0; }
-
-LIBC_INLINE uint32_t get_lane_size() { return 1; }
-
-LIBC_INLINE uint32_t get_lane_id() { return 0; }
-
-LIBC_INLINE uint64_t get_lane_mask() { return 1; }
-
-LIBC_INLINE uint32_t broadcast_value(uint64_t, uint32_t x) { return x; }
-
-LIBC_INLINE uint64_t ballot(uint64_t, bool x) { return x; }
-
-LIBC_INLINE void sync_threads() {}
-
-LIBC_INLINE void sync_lane(uint64_t) {}
-
-LIBC_INLINE uint32_t shuffle(uint64_t, uint32_t, uint32_t x) { return x; }
-
-LIBC_INLINE uint64_t processor_clock() { return 0; }
-
-LIBC_INLINE uint64_t fixed_frequency_clock() { return 0; }
-
-[[noreturn]] LIBC_INLINE void end_program() { __builtin_unreachable(); }
-
-LIBC_INLINE uint32_t get_cluster_id() { return 0; }
-
-} // namespace gpu
-} // namespace LIBC_NAMESPACE_DECL
-
-#endif // LLVM_LIBC_SRC___SUPPORT_GPU_GENERIC_UTILS_H
diff --git a/libc/src/__support/GPU/nvptx/CMakeLists.txt b/libc/src/__support/GPU/nvptx/CMakeLists.txt
deleted file mode 100644
index 0d3f8c7..0000000
--- a/libc/src/__support/GPU/nvptx/CMakeLists.txt
+++ /dev/null
@@ -1,7 +0,0 @@
-add_header_library(
-  nvptx_utils
-  HDRS
-    utils.h
-  DEPENDS
-    libc.src.__support.common
-)
diff --git a/libc/src/__support/GPU/nvptx/utils.h b/libc/src/__support/GPU/nvptx/utils.h
deleted file mode 100644
index 1a43a83..0000000
--- a/libc/src/__support/GPU/nvptx/utils.h
+++ /dev/null
@@ -1,160 +0,0 @@
-//===-------------- NVPTX implementation of GPU utils -----------*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-id: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_LIBC_SRC___SUPPORT_GPU_NVPTX_IO_H
-#define LLVM_LIBC_SRC___SUPPORT_GPU_NVPTX_IO_H
-
-#include "src/__support/common.h"
-#include "src/__support/macros/config.h"
-
-#include <stdint.h>
-
-namespace LIBC_NAMESPACE_DECL {
-namespace gpu {
-
-/// Type aliases to the address spaces used by the NVPTX backend.
-template <typename T> using Private = [[clang::opencl_private]] T;
-template <typename T> using Constant = [[clang::opencl_constant]] T;
-template <typename T> using Local = [[clang::opencl_local]] T;
-template <typename T> using Global = [[clang::opencl_global]] T;
-
-/// Returns the number of CUDA blocks in the 'x' dimension.
-LIBC_INLINE uint32_t get_num_blocks_x() {
-  return __nvvm_read_ptx_sreg_nctaid_x();
-}
-
-/// Returns the number of CUDA blocks in the 'y' dimension.
-LIBC_INLINE uint32_t get_num_blocks_y() {
-  return __nvvm_read_ptx_sreg_nctaid_y();
-}
-
-/// Returns the number of CUDA blocks in the 'z' dimension.
-LIBC_INLINE uint32_t get_num_blocks_z() {
-  return __nvvm_read_ptx_sreg_nctaid_z();
-}
-
-/// Returns the total number of CUDA blocks.
-LIBC_INLINE uint64_t get_num_blocks() {
-  return get_num_blocks_x() * get_num_blocks_y() * get_num_blocks_z();
-}
-
-/// Returns the 'x' dimension of the current CUDA block's id.
-LIBC_INLINE uint32_t get_block_id_x() { return __nvvm_read_ptx_sreg_ctaid_x(); }
-
-/// Returns the 'y' dimension of the current CUDA block's id.
-LIBC_INLINE uint32_t get_block_id_y() { return __nvvm_read_ptx_sreg_ctaid_y(); }
-
-/// Returns the 'z' dimension of the current CUDA block's id.
-LIBC_INLINE uint32_t get_block_id_z() { return __nvvm_read_ptx_sreg_ctaid_z(); }
-
-/// Returns the absolute id of the CUDA block.
-LIBC_INLINE uint64_t get_block_id() {
-  return get_block_id_x() + get_num_blocks_x() * get_block_id_y() +
-         get_num_blocks_x() * get_num_blocks_y() * get_block_id_z();
-}
-
-/// Returns the number of CUDA threads in the 'x' dimension.
-LIBC_INLINE uint32_t get_num_threads_x() {
-  return __nvvm_read_ptx_sreg_ntid_x();
-}
-
-/// Returns the number of CUDA threads in the 'y' dimension.
-LIBC_INLINE uint32_t get_num_threads_y() {
-  return __nvvm_read_ptx_sreg_ntid_y();
-}
-
-/// Returns the number of CUDA threads in the 'z' dimension.
-LIBC_INLINE uint32_t get_num_threads_z() {
-  return __nvvm_read_ptx_sreg_ntid_z();
-}
-
-/// Returns the total number of threads in the block.
-LIBC_INLINE uint64_t get_num_threads() {
-  return get_num_threads_x() * get_num_threads_y() * get_num_threads_z();
-}
-
-/// Returns the 'x' dimension id of the thread in the current CUDA block.
-LIBC_INLINE uint32_t get_thread_id_x() { return __nvvm_read_ptx_sreg_tid_x(); }
-
-/// Returns the 'y' dimension id of the thread in the current CUDA block.
-LIBC_INLINE uint32_t get_thread_id_y() { return __nvvm_read_ptx_sreg_tid_y(); }
-
-/// Returns the 'z' dimension id of the thread in the current CUDA block.
-LIBC_INLINE uint32_t get_thread_id_z() { return __nvvm_read_ptx_sreg_tid_z(); }
-
-/// Returns the absolute id of the thread in the current CUDA block.
-LIBC_INLINE uint64_t get_thread_id() {
-  return get_thread_id_x() + get_num_threads_x() * get_thread_id_y() +
-         get_num_threads_x() * get_num_threads_y() * get_thread_id_z();
-}
-
-/// Returns the size of a CUDA warp, always 32 on NVIDIA hardware.
-LIBC_INLINE uint32_t get_lane_size() { return 32; }
-
-/// Returns the id of the thread inside of a CUDA warp executing together.
-[[clang::convergent]] LIBC_INLINE uint32_t get_lane_id() {
-  return __nvvm_read_ptx_sreg_laneid();
-}
-
-/// Returns the bit-mask of active threads in the current warp.
-[[clang::convergent]] LIBC_INLINE uint64_t get_lane_mask() {
-  return __nvvm_activemask();
-}
-
-/// Copies the value from the first active thread in the warp to the rest.
-[[clang::convergent]] LIBC_INLINE uint32_t broadcast_value(uint64_t lane_mask,
-                                                           uint32_t x) {
-  uint32_t mask = static_cast<uint32_t>(lane_mask);
-  uint32_t id = __builtin_ffs(mask) - 1;
-  return __nvvm_shfl_sync_idx_i32(mask, x, id, get_lane_size() - 1);
-}
-
-/// Returns a bitmask of threads in the current lane for which \p x is true.
-[[clang::convergent]] LIBC_INLINE uint64_t ballot(uint64_t lane_mask, bool x) {
-  uint32_t mask = static_cast<uint32_t>(lane_mask);
-  return __nvvm_vote_ballot_sync(mask, x);
-}
-
-/// Waits for all the threads in the block to converge and issues a fence.
-[[clang::convergent]] LIBC_INLINE void sync_threads() { __syncthreads(); }
-
-/// Waits for all pending memory operations to complete in program order.
-[[clang::convergent]] LIBC_INLINE void memory_fence() { __nvvm_membar_sys(); }
-
-/// Waits for all threads in the warp to reconverge for independent scheduling.
-[[clang::convergent]] LIBC_INLINE void sync_lane(uint64_t mask) {
-  __nvvm_bar_warp_sync(static_cast<uint32_t>(mask));
-}
-
-/// Shuffles the the lanes inside the warp according to the given index.
-[[clang::convergent]] LIBC_INLINE uint32_t shuffle(uint64_t lane_mask,
-                                                   uint32_t idx, uint32_t x) {
-  uint32_t mask = static_cast<uint32_t>(lane_mask);
-  uint32_t bitmask = (mask >> idx) & 1;
-  return -bitmask & __nvvm_shfl_sync_idx_i32(mask, x, idx, get_lane_size() - 1);
-}
-
-/// Returns the current value of the GPU's processor clock.
-LIBC_INLINE uint64_t processor_clock() { return __builtin_readcyclecounter(); }
-
-/// Returns a global fixed-frequency timer at nanosecond frequency.
-LIBC_INLINE uint64_t fixed_frequency_clock() {
-  return __builtin_readsteadycounter();
-}
-
-/// Terminates execution of the calling thread.
-[[noreturn]] LIBC_INLINE void end_program() { __nvvm_exit(); }
-
-/// Returns a unique identifier for the process cluster the current warp is
-/// executing on. Here we use the identifier for the symmetric multiprocessor.
-LIBC_INLINE uint32_t get_cluster_id() { return __nvvm_read_ptx_sreg_smid(); }
-
-} // namespace gpu
-} // namespace LIBC_NAMESPACE_DECL
-
-#endif
diff --git a/libc/src/__support/GPU/utils.h b/libc/src/__support/GPU/utils.h
index ae52e7a..e138c84 100644
--- a/libc/src/__support/GPU/utils.h
+++ b/libc/src/__support/GPU/utils.h
@@ -9,48 +9,108 @@
 #ifndef LLVM_LIBC_SRC___SUPPORT_GPU_UTILS_H
 #define LLVM_LIBC_SRC___SUPPORT_GPU_UTILS_H
 
+#include "src/__support/macros/attributes.h"
 #include "src/__support/macros/config.h"
 #include "src/__support/macros/properties/architectures.h"
 
-#if defined(LIBC_TARGET_ARCH_IS_AMDGPU)
-#include "amdgpu/utils.h"
-#elif defined(LIBC_TARGET_ARCH_IS_NVPTX)
-#include "nvptx/utils.h"
-#else
-#include "generic/utils.h"
+#if !__has_include(<gpuintrin.h>)
+#error "Unsupported compiler"
 #endif
 
+#include <gpuintrin.h>
+
 namespace LIBC_NAMESPACE_DECL {
 namespace gpu {
-/// Get the first active thread inside the lane.
-LIBC_INLINE uint64_t get_first_lane_id(uint64_t lane_mask) {
-  return __builtin_ffsll(lane_mask) - 1;
+
+template <typename T> using Private = __gpu_private T;
+template <typename T> using Constant = __gpu_constant T;
+template <typename T> using Local = __gpu_local T;
+template <typename T> using Global = __gpu_local T;
+
+LIBC_INLINE uint32_t get_num_blocks_x() { return __gpu_num_blocks(0); }
+
+LIBC_INLINE uint32_t get_num_blocks_y() { return __gpu_num_blocks(1); }
+
+LIBC_INLINE uint32_t get_num_blocks_z() { return __gpu_num_blocks(2); }
+
+LIBC_INLINE uint64_t get_num_blocks() {
+  return get_num_blocks_x() * get_num_blocks_y() * get_num_blocks_z();
+}
+
+LIBC_INLINE uint32_t get_block_id_x() { return __gpu_block_id(0); }
+
+LIBC_INLINE uint32_t get_block_id_y() { return __gpu_block_id(1); }
+
+LIBC_INLINE uint32_t get_block_id_z() { return __gpu_block_id(2); }
+
+LIBC_INLINE uint64_t get_block_id() {
+  return get_block_id_x() + get_num_blocks_x() * get_block_id_y() +
+         get_num_blocks_x() * get_num_blocks_y() * get_block_id_z();
+}
+
+LIBC_INLINE uint32_t get_num_threads_x() { return __gpu_num_threads(0); }
+
+LIBC_INLINE uint32_t get_num_threads_y() { return __gpu_num_threads(1); }
+
+LIBC_INLINE uint32_t get_num_threads_z() { return __gpu_num_threads(2); }
+
+LIBC_INLINE uint64_t get_num_threads() {
+  return get_num_threads_x() * get_num_threads_y() * get_num_threads_z();
+}
+
+LIBC_INLINE uint32_t get_thread_id_x() { return __gpu_thread_id(0); }
+
+LIBC_INLINE uint32_t get_thread_id_y() { return __gpu_thread_id(1); }
+
+LIBC_INLINE uint32_t get_thread_id_z() { return __gpu_thread_id(2); }
+
+LIBC_INLINE uint64_t get_thread_id() {
+  return get_thread_id_x() + get_num_threads_x() * get_thread_id_y() +
+         get_num_threads_x() * get_num_threads_y() * get_thread_id_z();
+}
+
+LIBC_INLINE uint32_t get_lane_size() { return __gpu_num_lanes(); }
+
+LIBC_INLINE uint32_t get_lane_id() { return __gpu_lane_id(); }
+
+LIBC_INLINE uint64_t get_lane_mask() { return __gpu_lane_mask(); }
+
+LIBC_INLINE uint32_t broadcast_value(uint64_t lane_mask, uint32_t x) {
+  return __gpu_read_first_lane_u32(lane_mask, x);
+}
+
+LIBC_INLINE uint64_t ballot(uint64_t lane_mask, bool x) {
+  return __gpu_ballot(lane_mask, x);
+}
+
+LIBC_INLINE void sync_threads() { __gpu_sync_threads(); }
+
+LIBC_INLINE void sync_lane(uint64_t lane_mask) { __gpu_sync_lane(lane_mask); }
+
+LIBC_INLINE uint32_t shuffle(uint64_t lane_mask, uint32_t idx, uint32_t x) {
+  return __gpu_shuffle_idx_u32(lane_mask, idx, x);
 }
 
-/// Conditional that is only true for a single thread in a lane.
+[[noreturn]] LIBC_INLINE void end_program() { __gpu_exit(); }
+
 LIBC_INLINE bool is_first_lane(uint64_t lane_mask) {
-  return gpu::get_lane_id() == get_first_lane_id(lane_mask);
+  return __gpu_is_first_in_lane(lane_mask);
 }
 
-/// Gets the sum of all lanes inside the warp or wavefront.
 LIBC_INLINE uint32_t reduce(uint64_t lane_mask, uint32_t x) {
-  for (uint32_t step = gpu::get_lane_size() / 2; step > 0; step /= 2) {
-    uint32_t index = step + gpu::get_lane_id();
-    x += gpu::shuffle(lane_mask, index, x);
-  }
-  return gpu::broadcast_value(lane_mask, x);
+  return __gpu_lane_sum_u32(lane_mask, x);
 }
 
-/// Gets the accumulator scan of the threads in the warp or wavefront.
 LIBC_INLINE uint32_t scan(uint64_t lane_mask, uint32_t x) {
-  for (uint32_t step = 1; step < gpu::get_lane_size(); step *= 2) {
-    uint32_t index = gpu::get_lane_id() - step;
-    uint32_t bitmask = gpu::get_lane_id() >= step;
-    x += -bitmask & gpu::shuffle(lane_mask, index, x);
-  }
-  return x;
+  return __gpu_lane_scan_u32(lane_mask, x);
+}
+
+LIBC_INLINE uint64_t fixed_frequency_clock() {
+  return __builtin_readsteadycounter();
 }
 
+LIBC_INLINE uint64_t processor_clock() { return __builtin_readcyclecounter(); }
+
 } // namespace gpu
 } // namespace LIBC_NAMESPACE_DECL
 
diff --git a/libc/src/__support/fixedvector.h b/libc/src/__support/fixedvector.h
index 7ac0c23..34601f8 100644
--- a/libc/src/__support/fixedvector.h
+++ b/libc/src/__support/fixedvector.h
@@ -10,9 +10,10 @@
 #define LLVM_LIBC_SRC___SUPPORT_FIXEDVECTOR_H
 
 #include "src/__support/CPP/array.h"
-
 #include "src/__support/CPP/iterator.h"
+#include "src/__support/libc_assert.h"
 #include "src/__support/macros/config.h"
+#include "src/string/memory_utils/inline_memset.h"
 
 namespace LIBC_NAMESPACE_DECL {
 
@@ -23,27 +24,32 @@ template <typename T, size_t CAPACITY> class FixedVector {
   size_t item_count = 0;
 
 public:
-  constexpr FixedVector() = default;
+  LIBC_INLINE constexpr FixedVector() = default;
 
   using iterator = typename cpp::array<T, CAPACITY>::iterator;
-  constexpr FixedVector(iterator begin, iterator end) : store{}, item_count{} {
+  LIBC_INLINE constexpr FixedVector(iterator begin, iterator end)
+      : store{}, item_count{} {
+    LIBC_ASSERT(begin + CAPACITY >= end);
     for (; begin != end; ++begin)
       push_back(*begin);
   }
 
   using const_iterator = typename cpp::array<T, CAPACITY>::const_iterator;
-  constexpr FixedVector(const_iterator begin, const_iterator end)
+  LIBC_INLINE constexpr FixedVector(const_iterator begin, const_iterator end)
       : store{}, item_count{} {
+    LIBC_ASSERT(begin + CAPACITY >= end);
     for (; begin != end; ++begin)
       push_back(*begin);
   }
 
-  constexpr FixedVector(size_t count, const T &value) : store{}, item_count{} {
+  LIBC_INLINE constexpr FixedVector(size_t count, const T &value)
+      : store{}, item_count{} {
+    LIBC_ASSERT(count <= CAPACITY);
     for (size_t i = 0; i < count; ++i)
       push_back(value);
   }
 
-  constexpr bool push_back(const T &obj) {
+  LIBC_INLINE constexpr bool push_back(const T &obj) {
     if (item_count == CAPACITY)
       return false;
     store[item_count] = obj;
@@ -51,27 +57,43 @@ public:
     return true;
   }
 
-  constexpr const T &back() const { return store[item_count - 1]; }
+  LIBC_INLINE constexpr const T &back() const {
+    LIBC_ASSERT(!empty());
+    return store[item_count - 1];
+  }
 
-  constexpr T &back() { return store[item_count - 1]; }
+  LIBC_INLINE constexpr T &back() {
+    LIBC_ASSERT(!empty());
+    return store[item_count - 1];
+  }
 
-  constexpr bool pop_back() {
+  LIBC_INLINE constexpr bool pop_back() {
     if (item_count == 0)
       return false;
+    inline_memset(&store[item_count - 1], 0, sizeof(T));
     --item_count;
     return true;
   }
 
-  constexpr T &operator[](size_t idx) { return store[idx]; }
+  LIBC_INLINE constexpr T &operator[](size_t idx) {
+    LIBC_ASSERT(idx < item_count);
+    return store[idx];
+  }
 
-  constexpr const T &operator[](size_t idx) const { return store[idx]; }
+  LIBC_INLINE constexpr const T &operator[](size_t idx) const {
+    LIBC_ASSERT(idx < item_count);
+    return store[idx];
+  }
 
-  constexpr bool empty() const { return item_count == 0; }
+  LIBC_INLINE constexpr bool empty() const { return item_count == 0; }
 
-  constexpr size_t size() const { return item_count; }
+  LIBC_INLINE constexpr size_t size() const { return item_count; }
 
   // Empties the store for all practical purposes.
-  constexpr void reset() { item_count = 0; }
+  LIBC_INLINE constexpr void reset() {
+    inline_memset(store.data(), 0, sizeof(T) * item_count);
+    item_count = 0;
+  }
 
   // This static method does not free up the resources held by |store|,
   // say by calling `free` or something similar. It just does the equivalent
@@ -81,7 +103,9 @@ public:
   // dynamically allocated storate. So, the `destroy` method like this
   // matches the `destroy` API of those other data structures so that users
   // can easily swap one data structure for the other.
-  static void destroy(FixedVector<T, CAPACITY> *store) { store->reset(); }
+  LIBC_INLINE static void destroy(FixedVector<T, CAPACITY> *store) {
+    store->reset();
+  }
 
   using reverse_iterator = typename cpp::array<T, CAPACITY>::reverse_iterator;
   LIBC_INLINE constexpr reverse_iterator rbegin() {
diff --git a/libc/src/__support/threads/thread.cpp b/libc/src/__support/threads/thread.cpp
index dad4f75..6f6b75b 100644
--- a/libc/src/__support/threads/thread.cpp
+++ b/libc/src/__support/threads/thread.cpp
@@ -117,7 +117,9 @@ public:
 
   int add_callback(AtExitCallback *callback, void *obj) {
     cpp::lock_guard lock(mtx);
-    return callback_list.push_back({callback, obj});
+    if (callback_list.push_back({callback, obj}))
+      return 0;
+    return -1;
   }
 
   void call() {
diff --git a/libc/src/compiler/generic/__stack_chk_fail.cpp b/libc/src/compiler/generic/__stack_chk_fail.cpp
index c76ec14..00e976a 100644
--- a/libc/src/compiler/generic/__stack_chk_fail.cpp
+++ b/libc/src/compiler/generic/__stack_chk_fail.cpp
@@ -9,9 +9,12 @@
 #include "src/compiler/__stack_chk_fail.h"
 #include "src/__support/OSUtil/io.h"
 #include "src/stdlib/abort.h"
+#include <stdint.h> // For uintptr_t
 
 extern "C" {
 
+uintptr_t __stack_chk_guard = static_cast<uintptr_t>(0xa9fff01234);
+
 void __stack_chk_fail(void) {
   LIBC_NAMESPACE::write_to_stderr("stack smashing detected\n");
   LIBC_NAMESPACE::abort();
diff --git a/libc/src/complex/cimagf128.h b/libc/src/complex/cimagf128.h
index ab8f9ac..aaf52cf 100644
--- a/libc/src/complex/cimagf128.h
+++ b/libc/src/complex/cimagf128.h
@@ -6,15 +6,12 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "src/__support/macros/properties/complex_types.h"
-#include "src/__support/macros/properties/types.h"
-
-#if defined(LIBC_TYPES_HAS_CFLOAT128)
-
 #ifndef LLVM_LIBC_SRC_COMPLEX_CIMAGF128_H
 #define LLVM_LIBC_SRC_COMPLEX_CIMAGF128_H
 
 #include "src/__support/macros/config.h"
+#include "src/__support/macros/properties/complex_types.h"
+#include "src/__support/macros/properties/types.h"
 
 namespace LIBC_NAMESPACE_DECL {
 
@@ -23,5 +20,3 @@ float128 cimagf128(cfloat128 x);
 } // namespace LIBC_NAMESPACE_DECL
 
 #endif // LLVM_LIBC_SRC_COMPLEX_CIMAGF128_H
-
-#endif // LIBC_TYPES_HAS_CFLOAT128
diff --git a/libc/src/complex/cimagf16.h b/libc/src/complex/cimagf16.h
index 5c5de2e..81ed4d2 100644
--- a/libc/src/complex/cimagf16.h
+++ b/libc/src/complex/cimagf16.h
@@ -6,15 +6,12 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "src/__support/macros/properties/complex_types.h"
-#include "src/__support/macros/properties/types.h"
-
-#if defined(LIBC_TYPES_HAS_CFLOAT16)
-
 #ifndef LLVM_LIBC_SRC_COMPLEX_CIMAGF16_H
 #define LLVM_LIBC_SRC_COMPLEX_CIMAGF16_H
 
 #include "src/__support/macros/config.h"
+#include "src/__support/macros/properties/complex_types.h"
+#include "src/__support/macros/properties/types.h"
 
 namespace LIBC_NAMESPACE_DECL {
 
@@ -23,5 +20,3 @@ float16 cimagf16(cfloat16 x);
 } // namespace LIBC_NAMESPACE_DECL
 
 #endif // LLVM_LIBC_SRC_COMPLEX_CIMAGF16_H
-
-#endif // LIBC_TYPES_HAS_CFLOAT16
diff --git a/libc/src/complex/conjf128.h b/libc/src/complex/conjf128.h
index c1ae0b0..cae01d3 100644
--- a/libc/src/complex/conjf128.h
+++ b/libc/src/complex/conjf128.h
@@ -6,14 +6,11 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "src/__support/macros/properties/complex_types.h"
-
-#if defined(LIBC_TYPES_HAS_CFLOAT128)
-
 #ifndef LLVM_LIBC_SRC_COMPLEX_CONJF128_H
 #define LLVM_LIBC_SRC_COMPLEX_CONJF128_H
 
 #include "src/__support/macros/config.h"
+#include "src/__support/macros/properties/complex_types.h"
 
 namespace LIBC_NAMESPACE_DECL {
 
@@ -22,5 +19,3 @@ cfloat128 conjf128(cfloat128 x);
 } // namespace LIBC_NAMESPACE_DECL
 
 #endif // LLVM_LIBC_SRC_COMPLEX_CONJF128_H
-
-#endif // LIBC_TYPES_HAS_CFLOAT128
diff --git a/libc/src/complex/conjf16.h b/libc/src/complex/conjf16.h
index 685ac8a..dde1221 100644
--- a/libc/src/complex/conjf16.h
+++ b/libc/src/complex/conjf16.h
@@ -6,14 +6,11 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "src/__support/macros/properties/complex_types.h"
-
-#if defined(LIBC_TYPES_HAS_CFLOAT16)
-
 #ifndef LLVM_LIBC_SRC_COMPLEX_CONJF16_H
 #define LLVM_LIBC_SRC_COMPLEX_CONJF16_H
 
 #include "src/__support/macros/config.h"
+#include "src/__support/macros/properties/complex_types.h"
 
 namespace LIBC_NAMESPACE_DECL {
 
@@ -22,5 +19,3 @@ cfloat16 conjf16(cfloat16 x);
 } // namespace LIBC_NAMESPACE_DECL
 
 #endif // LLVM_LIBC_SRC_COMPLEX_CONJF16_H
-
-#endif // LIBC_TYPES_HAS_CFLOAT16
diff --git a/libc/src/complex/cprojf128.h b/libc/src/complex/cprojf128.h
index 5f7fe99..71c1bbe 100644
--- a/libc/src/complex/cprojf128.h
+++ b/libc/src/complex/cprojf128.h
@@ -6,14 +6,11 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "src/__support/macros/properties/complex_types.h"
-
-#if defined(LIBC_TYPES_HAS_CFLOAT128)
-
 #ifndef LLVM_LIBC_SRC_COMPLEX_CPROJF128_H
 #define LLVM_LIBC_SRC_COMPLEX_CPROJF128_H
 
 #include "src/__support/macros/config.h"
+#include "src/__support/macros/properties/complex_types.h"
 
 namespace LIBC_NAMESPACE_DECL {
 
@@ -22,5 +19,3 @@ cfloat128 cprojf128(cfloat128 x);
 } // namespace LIBC_NAMESPACE_DECL
 
 #endif // LLVM_LIBC_SRC_COMPLEX_CPROJF128_H
-
-#endif // LIBC_TYPES_HAS_CFLOAT128
diff --git a/libc/src/complex/cprojf16.h b/libc/src/complex/cprojf16.h
index 8cce5f0..f12a46d 100644
--- a/libc/src/complex/cprojf16.h
+++ b/libc/src/complex/cprojf16.h
@@ -6,14 +6,11 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "src/__support/macros/properties/complex_types.h"
-
-#if defined(LIBC_TYPES_HAS_CFLOAT16)
-
 #ifndef LLVM_LIBC_SRC_COMPLEX_CPROJF16_H
 #define LLVM_LIBC_SRC_COMPLEX_CPROJF16_H
 
 #include "src/__support/macros/config.h"
+#include "src/__support/macros/properties/complex_types.h"
 
 namespace LIBC_NAMESPACE_DECL {
 
@@ -22,5 +19,3 @@ cfloat16 cprojf16(cfloat16 x);
 } // namespace LIBC_NAMESPACE_DECL
 
 #endif // LLVM_LIBC_SRC_COMPLEX_CPROJF16_H
-
-#endif // LIBC_TYPES_HAS_CFLOAT16
diff --git a/libc/src/complex/crealf128.h b/libc/src/complex/crealf128.h
index 4922ae7..b90c3e7 100644
--- a/libc/src/complex/crealf128.h
+++ b/libc/src/complex/crealf128.h
@@ -6,15 +6,12 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "src/__support/macros/properties/complex_types.h"
-#include "src/__support/macros/properties/types.h"
-
-#if defined(LIBC_TYPES_HAS_CFLOAT128)
-
 #ifndef LLVM_LIBC_SRC_COMPLEX_CREALF128_H
 #define LLVM_LIBC_SRC_COMPLEX_CREALF128_H
 
 #include "src/__support/macros/config.h"
+#include "src/__support/macros/properties/complex_types.h"
+#include "src/__support/macros/properties/types.h"
 
 namespace LIBC_NAMESPACE_DECL {
 
@@ -23,5 +20,3 @@ float128 crealf128(cfloat128 x);
 } // namespace LIBC_NAMESPACE_DECL
 
 #endif // LLVM_LIBC_SRC_COMPLEX_CREALF128_H
-
-#endif // LIBC_TYPES_HAS_CFLOAT128
diff --git a/libc/src/complex/crealf16.h b/libc/src/complex/crealf16.h
index e6098a2..09d6664 100644
--- a/libc/src/complex/crealf16.h
+++ b/libc/src/complex/crealf16.h
@@ -6,15 +6,12 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "src/__support/macros/properties/complex_types.h"
-#include "src/__support/macros/properties/types.h"
-
-#if defined(LIBC_TYPES_HAS_CFLOAT16)
-
 #ifndef LLVM_LIBC_SRC_COMPLEX_CREALF16_H
 #define LLVM_LIBC_SRC_COMPLEX_CREALF16_H
 
 #include "src/__support/macros/config.h"
+#include "src/__support/macros/properties/complex_types.h"
+#include "src/__support/macros/properties/types.h"
 
 namespace LIBC_NAMESPACE_DECL {
 
@@ -23,5 +20,3 @@ float16 crealf16(cfloat16 x);
 } // namespace LIBC_NAMESPACE_DECL
 
 #endif // LLVM_LIBC_SRC_COMPLEX_CREALF16_H
-
-#endif // LIBC_TYPES_HAS_CFLOAT16
diff --git a/libc/src/complex/generic/cimagf128.cpp b/libc/src/complex/generic/cimagf128.cpp
index c21bd7f..78dbb8e 100644
--- a/libc/src/complex/generic/cimagf128.cpp
+++ b/libc/src/complex/generic/cimagf128.cpp
@@ -7,8 +7,6 @@
 //===----------------------------------------------------------------------===//
 
 #include "src/complex/cimagf128.h"
-#if defined(LIBC_TYPES_HAS_CFLOAT128)
-
 #include "src/__support/CPP/bit.h"
 #include "src/__support/common.h"
 #include "src/__support/complex_type.h"
@@ -21,5 +19,3 @@ LLVM_LIBC_FUNCTION(float128, cimagf128, (cfloat128 x)) {
 }
 
 } // namespace LIBC_NAMESPACE_DECL
-
-#endif // LIBC_TYPES_HAS_CFLOAT128
diff --git a/libc/src/complex/generic/cimagf16.cpp b/libc/src/complex/generic/cimagf16.cpp
index 3616879..25d9b3d 100644
--- a/libc/src/complex/generic/cimagf16.cpp
+++ b/libc/src/complex/generic/cimagf16.cpp
@@ -7,8 +7,6 @@
 //===----------------------------------------------------------------------===//
 
 #include "src/complex/cimagf16.h"
-#if defined(LIBC_TYPES_HAS_CFLOAT16)
-
 #include "src/__support/CPP/bit.h"
 #include "src/__support/common.h"
 #include "src/__support/complex_type.h"
@@ -21,5 +19,3 @@ LLVM_LIBC_FUNCTION(float16, cimagf16, (cfloat16 x)) {
 }
 
 } // namespace LIBC_NAMESPACE_DECL
-
-#endif // LIBC_TYPES_HAS_CFLOAT16
diff --git a/libc/src/complex/generic/conjf128.cpp b/libc/src/complex/generic/conjf128.cpp
index c65b548..a63809a 100644
--- a/libc/src/complex/generic/conjf128.cpp
+++ b/libc/src/complex/generic/conjf128.cpp
@@ -7,8 +7,6 @@
 //===----------------------------------------------------------------------===//
 
 #include "src/complex/conjf128.h"
-#if defined(LIBC_TYPES_HAS_CFLOAT128)
-
 #include "src/__support/common.h"
 #include "src/__support/complex_type.h"
 
@@ -19,5 +17,3 @@ LLVM_LIBC_FUNCTION(cfloat128, conjf128, (cfloat128 x)) {
 }
 
 } // namespace LIBC_NAMESPACE_DECL
-
-#endif // LIBC_TYPES_HAS_CFLOAT128
diff --git a/libc/src/complex/generic/conjf16.cpp b/libc/src/complex/generic/conjf16.cpp
index dac11e2..cd1ab67 100644
--- a/libc/src/complex/generic/conjf16.cpp
+++ b/libc/src/complex/generic/conjf16.cpp
@@ -7,8 +7,6 @@
 //===----------------------------------------------------------------------===//
 
 #include "src/complex/conjf16.h"
-#if defined(LIBC_TYPES_HAS_CFLOAT16)
-
 #include "src/__support/common.h"
 #include "src/__support/complex_type.h"
 
@@ -19,5 +17,3 @@ LLVM_LIBC_FUNCTION(cfloat16, conjf16, (cfloat16 x)) {
 }
 
 } // namespace LIBC_NAMESPACE_DECL
-
-#endif // LIBC_TYPES_HAS_CFLOAT16
diff --git a/libc/src/complex/generic/cprojf128.cpp b/libc/src/complex/generic/cprojf128.cpp
index 97134b5..eb2cd08 100644
--- a/libc/src/complex/generic/cprojf128.cpp
+++ b/libc/src/complex/generic/cprojf128.cpp
@@ -7,8 +7,6 @@
 //===----------------------------------------------------------------------===//
 
 #include "src/complex/cprojf128.h"
-#if defined(LIBC_TYPES_HAS_CFLOAT128)
-
 #include "src/__support/common.h"
 #include "src/__support/complex_type.h"
 
@@ -19,5 +17,3 @@ LLVM_LIBC_FUNCTION(cfloat128, cprojf128, (cfloat128 x)) {
 }
 
 } // namespace LIBC_NAMESPACE_DECL
-
-#endif // LIBC_TYPES_HAS_CFLOAT128
diff --git a/libc/src/complex/generic/cprojf16.cpp b/libc/src/complex/generic/cprojf16.cpp
index bd0425f..8d2d64a4 100644
--- a/libc/src/complex/generic/cprojf16.cpp
+++ b/libc/src/complex/generic/cprojf16.cpp
@@ -7,8 +7,6 @@
 //===----------------------------------------------------------------------===//
 
 #include "src/complex/cprojf16.h"
-#if defined(LIBC_TYPES_HAS_CFLOAT16)
-
 #include "src/__support/common.h"
 #include "src/__support/complex_type.h"
 
@@ -19,5 +17,3 @@ LLVM_LIBC_FUNCTION(cfloat16, cprojf16, (cfloat16 x)) {
 }
 
 } // namespace LIBC_NAMESPACE_DECL
-
-#endif // LIBC_TYPES_HAS_CFLOAT16
diff --git a/libc/src/complex/generic/crealf128.cpp b/libc/src/complex/generic/crealf128.cpp
index e72a778..e755498 100644
--- a/libc/src/complex/generic/crealf128.cpp
+++ b/libc/src/complex/generic/crealf128.cpp
@@ -7,8 +7,6 @@
 //===----------------------------------------------------------------------===//
 
 #include "src/complex/crealf128.h"
-#if defined(LIBC_TYPES_HAS_CFLOAT128)
-
 #include "src/__support/CPP/bit.h"
 #include "src/__support/common.h"
 #include "src/__support/complex_type.h"
@@ -21,5 +19,3 @@ LLVM_LIBC_FUNCTION(float128, crealf128, (cfloat128 x)) {
 }
 
 } // namespace LIBC_NAMESPACE_DECL
-
-#endif // LIBC_TYPES_HAS_CFLOAT128
diff --git a/libc/src/complex/generic/crealf16.cpp b/libc/src/complex/generic/crealf16.cpp
index 3514207..c9e8626 100644
--- a/libc/src/complex/generic/crealf16.cpp
+++ b/libc/src/complex/generic/crealf16.cpp
@@ -7,8 +7,6 @@
 //===----------------------------------------------------------------------===//
 
 #include "src/complex/crealf16.h"
-#if defined(LIBC_TYPES_HAS_CFLOAT16)
-
 #include "src/__support/CPP/bit.h"
 #include "src/__support/common.h"
 #include "src/__support/complex_type.h"
@@ -21,5 +19,3 @@ LLVM_LIBC_FUNCTION(float16, crealf16, (cfloat16 x)) {
 }
 
 } // namespace LIBC_NAMESPACE_DECL
-
-#endif // LIBC_TYPES_HAS_CFLOAT16
diff --git a/libc/src/math/docs/add_math_function.md b/libc/src/math/docs/add_math_function.md
index f02d502..daaf1a3 100644
--- a/libc/src/math/docs/add_math_function.md
+++ b/libc/src/math/docs/add_math_function.md
@@ -18,7 +18,7 @@ together with its specifications:
 ```
 - Add function specs to the file:
 ```
-  libc/hdrgen/yaml/math.yaml
+  libc/include/math.yaml
 ```
 
 ## Implementation
diff --git a/libc/src/math/generic/CMakeLists.txt b/libc/src/math/generic/CMakeLists.txt
index b3d4612..382f5b3 100644
--- a/libc/src/math/generic/CMakeLists.txt
+++ b/libc/src/math/generic/CMakeLists.txt
@@ -358,7 +358,6 @@ add_header_library(
   HDRS
     sincosf16_utils.h
   DEPENDS
-    libc.src.__support.FPUtil.fp_bits
     libc.src.__support.FPUtil.polyeval
     libc.src.__support.FPUtil.nearest_integer
     libc.src.__support.common
@@ -1702,8 +1701,6 @@ add_header_library(
     libc.src.__support.FPUtil.fenv_impl
     libc.src.__support.FPUtil.fp_bits
     libc.src.__support.FPUtil.multiply_add
-    libc.src.__support.FPUtil.nearest_integer
-    libc.src.__support.FPUtil.polyeval
     libc.src.__support.FPUtil.rounding_mode
     libc.src.__support.macros.optimization
     libc.src.__support.common
diff --git a/libc/src/math/generic/exp10f_impl.h b/libc/src/math/generic/exp10f_impl.h
index d741318..975fd01 100644
--- a/libc/src/math/generic/exp10f_impl.h
+++ b/libc/src/math/generic/exp10f_impl.h
@@ -10,12 +10,9 @@
 #define LLVM_LIBC_SRC_MATH_GENERIC_EXP10F_IMPL_H
 
 #include "explogxf.h"
-#include "src/__support/FPUtil/BasicOperations.h"
 #include "src/__support/FPUtil/FEnvImpl.h"
 #include "src/__support/FPUtil/FPBits.h"
-#include "src/__support/FPUtil/PolyEval.h"
 #include "src/__support/FPUtil/multiply_add.h"
-#include "src/__support/FPUtil/nearest_integer.h"
 #include "src/__support/FPUtil/rounding_mode.h"
 #include "src/__support/common.h"
 #include "src/__support/macros/config.h"
diff --git a/libc/src/math/generic/range_reduction_double_common.h b/libc/src/math/generic/range_reduction_double_common.h
index bcab82f..06aeb49 100644
--- a/libc/src/math/generic/range_reduction_double_common.h
+++ b/libc/src/math/generic/range_reduction_double_common.h
@@ -9,7 +9,6 @@
 #ifndef LLVM_LIBC_SRC_MATH_GENERIC_RANGE_REDUCTION_DOUBLE_COMMON_H
 #define LLVM_LIBC_SRC_MATH_GENERIC_RANGE_REDUCTION_DOUBLE_COMMON_H
 
-#include "src/__support/FPUtil/FPBits.h"
 #include "src/__support/FPUtil/double_double.h"
 #include "src/__support/FPUtil/dyadic_float.h"
 #include "src/__support/FPUtil/multiply_add.h"
diff --git a/libc/src/math/generic/sincosf16_utils.h b/libc/src/math/generic/sincosf16_utils.h
index 5e5edd4..87b1dde 100644
--- a/libc/src/math/generic/sincosf16_utils.h
+++ b/libc/src/math/generic/sincosf16_utils.h
@@ -9,9 +9,7 @@
 #ifndef LLVM_LIBC_SRC_MATH_GENERIC_SINCOSF16_UTILS_H
 #define LLVM_LIBC_SRC_MATH_GENERIC_SINCOSF16_UTILS_H
 
-#include "src/__support/FPUtil/FPBits.h"
 #include "src/__support/FPUtil/PolyEval.h"
-#include "src/__support/FPUtil/cast.h"
 #include "src/__support/FPUtil/nearest_integer.h"
 #include "src/__support/common.h"
 #include "src/__support/macros/config.h"
diff --git a/libc/src/pthread/pthread_condattr_init.cpp b/libc/src/pthread/pthread_condattr_init.cpp
index 12005b8..b360804 100644
--- a/libc/src/pthread/pthread_condattr_init.cpp
+++ b/libc/src/pthread/pthread_condattr_init.cpp
@@ -11,8 +11,8 @@
 #include "src/__support/common.h"
 #include "src/__support/macros/config.h"
 
-#include <pthread.h> // pthread_condattr_t, PTHREAD_PROCESS_PRIVATE
-#include <time.h>    // CLOCK_REALTIME
+#include "hdr/time_macros.h" // CLOCK_REALTIME
+#include <pthread.h>         // pthread_condattr_t, PTHREAD_PROCESS_PRIVATE
 
 namespace LIBC_NAMESPACE_DECL {
 
diff --git a/libc/src/pthread/pthread_condattr_setclock.cpp b/libc/src/pthread/pthread_condattr_setclock.cpp
index 37fbd6b..5e825d5 100644
--- a/libc/src/pthread/pthread_condattr_setclock.cpp
+++ b/libc/src/pthread/pthread_condattr_setclock.cpp
@@ -12,9 +12,9 @@
 #include "src/__support/macros/config.h"
 #include "src/errno/libc_errno.h"
 
-#include <pthread.h>   // pthread_condattr_t
-#include <sys/types.h> // clockid_t
-#include <time.h>      // CLOCK_MONOTONIC, CLOCK_REALTIME
+#include "hdr/time_macros.h" // CLOCK_MONOTONIC, CLOCK_REALTIME
+#include <pthread.h>         // pthread_condattr_t
+#include <sys/types.h>       // clockid_t
 
 namespace LIBC_NAMESPACE_DECL {
 
diff --git a/libc/src/stdlib/exit_handler.h b/libc/src/stdlib/exit_handler.h
index 9720c54..e9d163d 100644
--- a/libc/src/stdlib/exit_handler.h
+++ b/libc/src/stdlib/exit_handler.h
@@ -48,7 +48,7 @@ LIBC_INLINE void stdc_at_exit_func(void *payload) {
 LIBC_INLINE void call_exit_callbacks(ExitCallbackList &callbacks) {
   handler_list_mtx.lock();
   while (!callbacks.empty()) {
-    AtExitUnit &unit = callbacks.back();
+    AtExitUnit unit = callbacks.back();
     callbacks.pop_back();
     handler_list_mtx.unlock();
     unit.callback(unit.payload);
diff --git a/libc/src/stdlib/heap_sort.h b/libc/src/stdlib/heap_sort.h
index ccb9ec5..b969977 100644
--- a/libc/src/stdlib/heap_sort.h
+++ b/libc/src/stdlib/heap_sort.h
@@ -18,11 +18,12 @@ namespace internal {
 // A simple in-place heapsort implementation.
 // Follow the implementation in https://en.wikipedia.org/wiki/Heapsort.
 
-LIBC_INLINE void heap_sort(const Array &array) {
-  size_t end = array.size();
+template <typename A, typename F>
+LIBC_INLINE void heap_sort(const A &array, const F &is_less) {
+  size_t end = array.len();
   size_t start = end / 2;
 
-  auto left_child = [](size_t i) -> size_t { return 2 * i + 1; };
+  const auto left_child = [](size_t i) -> size_t { return 2 * i + 1; };
 
   while (end > 1) {
     if (start > 0) {
@@ -40,12 +41,11 @@ LIBC_INLINE void heap_sort(const Array &array) {
     while (left_child(root) < end) {
       size_t child = left_child(root);
       // If there are two children, set child to the greater.
-      if (child + 1 < end &&
-          array.elem_compare(child, array.get(child + 1)) < 0)
+      if ((child + 1 < end) && is_less(array.get(child), array.get(child + 1)))
         ++child;
 
       // If the root is less than the greater child
-      if (array.elem_compare(root, array.get(child)) >= 0)
+      if (!is_less(array.get(root), array.get(child)))
         break;
 
       // Swap the root with the greater child and continue sifting down.
diff --git a/libc/src/stdlib/qsort.cpp b/libc/src/stdlib/qsort.cpp
index 65a63c2..0bf5fc7 100644
--- a/libc/src/stdlib/qsort.cpp
+++ b/libc/src/stdlib/qsort.cpp
@@ -18,14 +18,12 @@ namespace LIBC_NAMESPACE_DECL {
 LLVM_LIBC_FUNCTION(void, qsort,
                    (void *array, size_t array_size, size_t elem_size,
                     int (*compare)(const void *, const void *))) {
-  if (array == nullptr || array_size == 0 || elem_size == 0)
-    return;
-  internal::Comparator c(compare);
 
-  auto arr = internal::Array(reinterpret_cast<uint8_t *>(array), array_size,
-                             elem_size, c);
+  const auto is_less = [compare](const void *a, const void *b) -> bool {
+    return compare(a, b) < 0;
+  };
 
-  internal::sort(arr);
+  internal::unstable_sort(array, array_size, elem_size, is_less);
 }
 
 } // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/stdlib/qsort_data.h b/libc/src/stdlib/qsort_data.h
index c529d55..aa6d9bb 100644
--- a/libc/src/stdlib/qsort_data.h
+++ b/libc/src/stdlib/qsort_data.h
@@ -17,91 +17,122 @@
 namespace LIBC_NAMESPACE_DECL {
 namespace internal {
 
-using Compare = int(const void *, const void *);
-using CompareWithState = int(const void *, const void *, void *);
-
-enum class CompType { COMPARE, COMPARE_WITH_STATE };
-
-struct Comparator {
-  union {
-    Compare *comp_func;
-    CompareWithState *comp_func_r;
-  };
-  const CompType comp_type;
-
-  void *arg;
-
-  Comparator(Compare *func)
-      : comp_func(func), comp_type(CompType::COMPARE), arg(nullptr) {}
-
-  Comparator(CompareWithState *func, void *arg_val)
-      : comp_func_r(func), comp_type(CompType::COMPARE_WITH_STATE),
-        arg(arg_val) {}
-
-#if defined(__clang__)
-  // Recent upstream changes to -fsanitize=function find more instances of
-  // function type mismatches. One case is with the comparator passed to this
-  // class. Libraries will tend to pass comparators that take pointers to
-  // varying types while this comparator expects to accept const void pointers.
-  // Ideally those tools would pass a function that strictly accepts const
-  // void*s to avoid UB, or would use qsort_r to pass their own comparator.
-  [[clang::no_sanitize("function")]]
-#endif
-  int comp_vals(const void *a, const void *b) const {
-    if (comp_type == CompType::COMPARE) {
-      return comp_func(a, b);
-    } else {
-      return comp_func_r(a, b, arg);
+class ArrayGenericSize {
+  cpp::byte *array_base;
+  size_t array_len;
+  size_t elem_size;
+
+  LIBC_INLINE cpp::byte *get_internal(size_t i) const {
+    return array_base + (i * elem_size);
+  }
+
+public:
+  LIBC_INLINE ArrayGenericSize(void *a, size_t s, size_t e)
+      : array_base(reinterpret_cast<cpp::byte *>(a)), array_len(s),
+        elem_size(e) {}
+
+  static constexpr bool has_fixed_size() { return false; }
+
+  LIBC_INLINE void *get(size_t i) const { return get_internal(i); }
+
+  LIBC_INLINE void swap(size_t i, size_t j) const {
+    // It's possible to use 8 byte blocks with `uint64_t`, but that
+    // generates more machine code as the remainder loop gets
+    // unrolled, plus 4 byte operations are more likely to be
+    // efficient on a wider variety of hardware. On x86 LLVM tends
+    // to unroll the block loop again into 2 16 byte swaps per
+    // iteration which is another reason that 4 byte blocks yields
+    // good performance even for big types.
+    using block_t = uint32_t;
+    constexpr size_t BLOCK_SIZE = sizeof(block_t);
+
+    alignas(block_t) cpp::byte tmp_block[BLOCK_SIZE];
+
+    cpp::byte *elem_i = get_internal(i);
+    cpp::byte *elem_j = get_internal(j);
+
+    const size_t elem_size_rem = elem_size % BLOCK_SIZE;
+    const cpp::byte *elem_i_block_end = elem_i + (elem_size - elem_size_rem);
+
+    while (elem_i != elem_i_block_end) {
+      __builtin_memcpy(tmp_block, elem_i, BLOCK_SIZE);
+      __builtin_memcpy(elem_i, elem_j, BLOCK_SIZE);
+      __builtin_memcpy(elem_j, tmp_block, BLOCK_SIZE);
+
+      elem_i += BLOCK_SIZE;
+      elem_j += BLOCK_SIZE;
+    }
+
+    for (size_t n = 0; n < elem_size_rem; ++n) {
+      cpp::byte tmp = elem_i[n];
+      elem_i[n] = elem_j[n];
+      elem_j[n] = tmp;
     }
   }
+
+  LIBC_INLINE size_t len() const { return array_len; }
+
+  // Make an Array starting at index |i| and length |s|.
+  LIBC_INLINE ArrayGenericSize make_array(size_t i, size_t s) const {
+    return ArrayGenericSize(get_internal(i), s, elem_size);
+  }
+
+  // Reset this Array to point at a different interval of the same
+  // items starting at index |i|.
+  LIBC_INLINE void reset_bounds(size_t i, size_t s) {
+    array_base = get_internal(i);
+    array_len = s;
+  }
 };
 
-class Array {
-  uint8_t *array;
-  size_t array_size;
-  size_t elem_size;
-  Comparator compare;
+// Having a specialized Array type for sorting that knows at
+// compile-time what the size of the element is, allows for much more
+// efficient swapping and for cheaper offset calculations.
+template <size_t ELEM_SIZE> class ArrayFixedSize {
+  cpp::byte *array_base;
+  size_t array_len;
 
-public:
-  Array(uint8_t *a, size_t s, size_t e, Comparator c)
-      : array(a), array_size(s), elem_size(e), compare(c) {}
-
-  uint8_t *get(size_t i) const { return array + i * elem_size; }
-
-  void swap(size_t i, size_t j) const {
-    uint8_t *elem_i = get(i);
-    uint8_t *elem_j = get(j);
-    for (size_t b = 0; b < elem_size; ++b) {
-      uint8_t temp = elem_i[b];
-      elem_i[b] = elem_j[b];
-      elem_j[b] = temp;
-    }
+  LIBC_INLINE cpp::byte *get_internal(size_t i) const {
+    return array_base + (i * ELEM_SIZE);
   }
 
-  int elem_compare(size_t i, const uint8_t *other) const {
-    // An element must compare equal to itself so we don't need to consult the
-    // user provided comparator.
-    if (get(i) == other)
-      return 0;
-    return compare.comp_vals(get(i), other);
+public:
+  LIBC_INLINE ArrayFixedSize(void *a, size_t s)
+      : array_base(reinterpret_cast<cpp::byte *>(a)), array_len(s) {}
+
+  // Beware this function is used a heuristic for cheap to swap types, so
+  // instantiating `ArrayFixedSize` with `ELEM_SIZE > 100` is probably a bad
+  // idea perf wise.
+  static constexpr bool has_fixed_size() { return true; }
+
+  LIBC_INLINE void *get(size_t i) const { return get_internal(i); }
+
+  LIBC_INLINE void swap(size_t i, size_t j) const {
+    alignas(32) cpp::byte tmp[ELEM_SIZE];
+
+    cpp::byte *elem_i = get_internal(i);
+    cpp::byte *elem_j = get_internal(j);
+
+    __builtin_memcpy(tmp, elem_i, ELEM_SIZE);
+    __builtin_memmove(elem_i, elem_j, ELEM_SIZE);
+    __builtin_memcpy(elem_j, tmp, ELEM_SIZE);
   }
 
-  size_t size() const { return array_size; }
+  LIBC_INLINE size_t len() const { return array_len; }
 
-  // Make an Array starting at index |i| and size |s|.
-  LIBC_INLINE Array make_array(size_t i, size_t s) const {
-    return Array(get(i), s, elem_size, compare);
+  // Make an Array starting at index |i| and length |s|.
+  LIBC_INLINE ArrayFixedSize<ELEM_SIZE> make_array(size_t i, size_t s) const {
+    return ArrayFixedSize<ELEM_SIZE>(get_internal(i), s);
   }
 
-  // Reset this Array to point at a different interval of the same items.
-  LIBC_INLINE void reset_bounds(uint8_t *a, size_t s) {
-    array = a;
-    array_size = s;
+  // Reset this Array to point at a different interval of the same
+  // items starting at index |i|.
+  LIBC_INLINE void reset_bounds(size_t i, size_t s) {
+    array_base = get_internal(i);
+    array_len = s;
   }
 };
 
-using SortingRoutine = void(const Array &);
-
 } // namespace internal
 } // namespace LIBC_NAMESPACE_DECL
 
diff --git a/libc/src/stdlib/qsort_pivot.h b/libc/src/stdlib/qsort_pivot.h
new file mode 100644
index 0000000..b27e746
--- /dev/null
+++ b/libc/src/stdlib/qsort_pivot.h
@@ -0,0 +1,85 @@
+//===-- Implementation header for qsort utilities ---------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_STDLIB_QSORT_PIVOT_H
+#define LLVM_LIBC_SRC_STDLIB_QSORT_PIVOT_H
+
+#include <stddef.h>  // For size_t
+
+namespace LIBC_NAMESPACE_DECL {
+namespace internal {
+
+// Recursively select a pseudomedian if above this threshold.
+constexpr size_t PSEUDO_MEDIAN_REC_THRESHOLD = 64;
+
+// Selects a pivot from `array`. Algorithm taken from glidesort by Orson Peters.
+//
+// This chooses a pivot by sampling an adaptive amount of points, approximating
+// the quality of a median of sqrt(n) elements.
+template <typename A, typename F>
+size_t choose_pivot(const A &array, const F &is_less) {
+  const size_t len = array.len();
+
+  if (len < 8) {
+    return 0;
+  }
+
+  const size_t len_div_8 = len / 8;
+
+  const size_t a = 0;             // [0, floor(n/8))
+  const size_t b = len_div_8 * 4; // [4*floor(n/8), 5*floor(n/8))
+  const size_t c = len_div_8 * 7; // [7*floor(n/8), 8*floor(n/8))
+
+  if (len < PSEUDO_MEDIAN_REC_THRESHOLD)
+    return median3(array, a, b, c, is_less);
+  else
+    return median3_rec(array, a, b, c, len_div_8, is_less);
+}
+
+// Calculates an approximate median of 3 elements from sections a, b, c, or
+// recursively from an approximation of each, if they're large enough. By
+// dividing the size of each section by 8 when recursing we have logarithmic
+// recursion depth and overall sample from f(n) = 3*f(n/8) -> f(n) =
+// O(n^(log(3)/log(8))) ~= O(n^0.528) elements.
+template <typename A, typename F>
+size_t median3_rec(const A &array, size_t a, size_t b, size_t c, size_t n,
+                   const F &is_less) {
+  if (n * 8 >= PSEUDO_MEDIAN_REC_THRESHOLD) {
+    const size_t n8 = n / 8;
+    a = median3_rec(array, a, a + (n8 * 4), a + (n8 * 7), n8, is_less);
+    b = median3_rec(array, b, b + (n8 * 4), b + (n8 * 7), n8, is_less);
+    c = median3_rec(array, c, c + (n8 * 4), c + (n8 * 7), n8, is_less);
+  }
+  return median3(array, a, b, c, is_less);
+}
+
+/// Calculates the median of 3 elements.
+template <typename A, typename F>
+size_t median3(const A &array, size_t a, size_t b, size_t c, const F &is_less) {
+  const void *a_ptr = array.get(a);
+  const void *b_ptr = array.get(b);
+  const void *c_ptr = array.get(c);
+
+  const bool x = is_less(a_ptr, b_ptr);
+  const bool y = is_less(a_ptr, c_ptr);
+  if (x == y) {
+    // If x=y=0 then b, c <= a. In this case we want to return max(b, c).
+    // If x=y=1 then a < b, c. In this case we want to return min(b, c).
+    // By toggling the outcome of b < c using XOR x we get this behavior.
+    const bool z = is_less(b_ptr, c_ptr);
+    return z ^ x ? c : b;
+  } else {
+    // Either c <= a < b or b <= a < c, thus a is our median.
+    return a;
+  }
+}
+
+} // namespace internal
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LLVM_LIBC_SRC_STDLIB_QSORT_PIVOT_H
diff --git a/libc/src/stdlib/qsort_r.cpp b/libc/src/stdlib/qsort_r.cpp
index bf61a40..4e60998 100644
--- a/libc/src/stdlib/qsort_r.cpp
+++ b/libc/src/stdlib/qsort_r.cpp
@@ -19,13 +19,12 @@ LLVM_LIBC_FUNCTION(void, qsort_r,
                    (void *array, size_t array_size, size_t elem_size,
                     int (*compare)(const void *, const void *, void *),
                     void *arg)) {
-  if (array == nullptr || array_size == 0 || elem_size == 0)
-    return;
-  internal::Comparator c(compare, arg);
-  auto arr = internal::Array(reinterpret_cast<uint8_t *>(array), array_size,
-                             elem_size, c);
 
-  internal::sort(arr);
+  const auto is_less = [compare, arg](const void *a, const void *b) -> bool {
+    return compare(a, b, arg) < 0;
+  };
+
+  internal::unstable_sort(array, array_size, elem_size, is_less);
 }
 
 } // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/stdlib/qsort_util.h b/libc/src/stdlib/qsort_util.h
index d42adde..7882b82 100644
--- a/libc/src/stdlib/qsort_util.h
+++ b/libc/src/stdlib/qsort_util.h
@@ -27,11 +27,48 @@
 namespace LIBC_NAMESPACE_DECL {
 namespace internal {
 
-#if LIBC_QSORT_IMPL == LIBC_QSORT_QUICK_SORT
-constexpr auto sort = quick_sort;
-#elif LIBC_QSORT_IMPL == LIBC_QSORT_HEAP_SORT
-constexpr auto sort = heap_sort;
-#endif
+template <bool USE_QUICKSORT, typename F>
+LIBC_INLINE void unstable_sort_impl(void *array, size_t array_len,
+                                    size_t elem_size, const F &is_less) {
+  if (array == nullptr || array_len == 0 || elem_size == 0)
+    return;
+
+  if constexpr (USE_QUICKSORT) {
+    switch (elem_size) {
+    case 4: {
+      auto arr_fixed_size = internal::ArrayFixedSize<4>(array, array_len);
+      quick_sort(arr_fixed_size, is_less);
+      return;
+    }
+    case 8: {
+      auto arr_fixed_size = internal::ArrayFixedSize<8>(array, array_len);
+      quick_sort(arr_fixed_size, is_less);
+      return;
+    }
+    case 16: {
+      auto arr_fixed_size = internal::ArrayFixedSize<16>(array, array_len);
+      quick_sort(arr_fixed_size, is_less);
+      return;
+    }
+    default:
+      auto arr_generic_size =
+          internal::ArrayGenericSize(array, array_len, elem_size);
+      quick_sort(arr_generic_size, is_less);
+      return;
+    }
+  } else {
+    auto arr_generic_size =
+        internal::ArrayGenericSize(array, array_len, elem_size);
+    heap_sort(arr_generic_size, is_less);
+  }
+}
+
+template <typename F>
+LIBC_INLINE void unstable_sort(void *array, size_t array_len, size_t elem_size,
+                               const F &is_less) {
+#define USE_QUICK_SORT ((LIBC_QSORT_IMPL) == (LIBC_QSORT_QUICK_SORT))
+  unstable_sort_impl<USE_QUICK_SORT, F>(array, array_len, elem_size, is_less);
+}
 
 } // namespace internal
 } // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/stdlib/quick_sort.h b/libc/src/stdlib/quick_sort.h
index 82b90a7..9ab2830 100644
--- a/libc/src/stdlib/quick_sort.h
+++ b/libc/src/stdlib/quick_sort.h
@@ -9,84 +9,175 @@
 #ifndef LLVM_LIBC_SRC_STDLIB_QUICK_SORT_H
 #define LLVM_LIBC_SRC_STDLIB_QUICK_SORT_H
 
-#include "src/__support/macros/attributes.h"
+#include "src/__support/CPP/bit.h"
+#include "src/__support/CPP/cstddef.h"
 #include "src/__support/macros/config.h"
-#include "src/stdlib/qsort_data.h"
+#include "src/stdlib/qsort_pivot.h"
 
 #include <stdint.h>
 
 namespace LIBC_NAMESPACE_DECL {
 namespace internal {
 
-// A simple quicksort implementation using the Hoare partition scheme.
-LIBC_INLINE size_t partition(const Array &array) {
-  const size_t array_size = array.size();
-  size_t pivot_index = array_size / 2;
-  uint8_t *pivot = array.get(pivot_index);
-  size_t i = 0;
-  size_t j = array_size - 1;
+// Branchless Lomuto partition based on the implementation by Lukas
+// Bergdoll and Orson Peters
+// https://github.com/Voultapher/sort-research-rs/blob/main/writeup/lomcyc_partition/text.md.
+// Simplified to avoid having to stack allocate.
+template <typename A, typename F>
+LIBC_INLINE size_t partition_lomuto_branchless(const A &array,
+                                               const void *pivot,
+                                               const F &is_less) {
+  const size_t array_len = array.len();
+
+  size_t left = 0;
+  size_t right = 0;
+
+  while (right < array_len) {
+    const bool right_is_lt = is_less(array.get(right), pivot);
+    array.swap(left, right);
+    left += static_cast<size_t>(right_is_lt);
+    right += 1;
+  }
+
+  return left;
+}
+
+// Optimized for large types that are expensive to move. Not optimized
+// for integers. It's possible to use a cyclic permutation here for
+// large types as done in ipnsort but the advantages of this are limited
+// as `is_less` is a small wrapper around a call to a function pointer
+// and won't incur much binary-size overhead. The other reason to use
+// cyclic permutation is to have more efficient swapping, but we don't
+// know the element size so this isn't applicable here either.
+template <typename A, typename F>
+LIBC_INLINE size_t partition_hoare_branchy(const A &array, const void *pivot,
+                                           const F &is_less) {
+  const size_t array_len = array.len();
+
+  size_t left = 0;
+  size_t right = array_len;
 
   while (true) {
-    int compare_i, compare_j;
-
-    while ((compare_i = array.elem_compare(i, pivot)) < 0)
-      ++i;
-    while ((compare_j = array.elem_compare(j, pivot)) > 0)
-      --j;
-
-    // At some point i will crossover j so we will definitely break out of
-    // this while loop.
-    if (i >= j)
-      return j + 1;
-
-    array.swap(i, j);
-
-    // The pivot itself might have got swapped so we will update the pivot.
-    if (i == pivot_index) {
-      pivot = array.get(j);
-      pivot_index = j;
-    } else if (j == pivot_index) {
-      pivot = array.get(i);
-      pivot_index = i;
+    while (left < right && is_less(array.get(left), pivot))
+      ++left;
+
+    while (true) {
+      --right;
+      if (left >= right || is_less(array.get(right), pivot)) {
+        break;
+      }
     }
 
-    if (compare_i == 0 && compare_j == 0) {
-      // If we do not move the pointers, we will end up with an
-      // infinite loop as i and j will be stuck without advancing.
-      ++i;
-      --j;
-    }
+    if (left >= right)
+      break;
+
+    array.swap(left, right);
+    ++left;
+  }
+
+  return left;
+}
+
+template <typename A, typename F>
+LIBC_INLINE size_t partition(const A &array, size_t pivot_index,
+                             const F &is_less) {
+  // Place the pivot at the beginning of the array.
+  if (pivot_index != 0) {
+    array.swap(0, pivot_index);
   }
+
+  const A array_without_pivot = array.make_array(1, array.len() - 1);
+  const void *pivot = array.get(0);
+
+  size_t num_lt;
+  if constexpr (A::has_fixed_size()) {
+    // Branchless Lomuto avoid branch misprediction penalties, but
+    // it also swaps more often which is only faster if the swap is a fast
+    // constant operation.
+    num_lt = partition_lomuto_branchless(array_without_pivot, pivot, is_less);
+  } else {
+    num_lt = partition_hoare_branchy(array_without_pivot, pivot, is_less);
+  }
+
+  // Place the pivot between the two partitions.
+  array.swap(0, num_lt);
+
+  return num_lt;
 }
 
-LIBC_INLINE void quick_sort(Array array) {
+template <typename A, typename F>
+LIBC_INLINE void quick_sort_impl(A &array, const void *ancestor_pivot,
+                                 size_t limit, const F &is_less) {
   while (true) {
-    const size_t array_size = array.size();
-    if (array_size <= 1)
+    const size_t array_len = array.len();
+    if (array_len <= 1)
       return;
-    size_t split_index = partition(array);
-    if (array_size == 2)
-      // The partition operation sorts the two element array.
+
+    // If too many bad pivot choices were made, simply fall back to
+    // heapsort in order to guarantee `O(N x log(N))` worst-case.
+    if (limit == 0) {
+      heap_sort(array, is_less);
       return;
+    }
 
-    // Make Arrays describing the two sublists that still need sorting.
-    Array left = array.make_array(0, split_index);
-    Array right = array.make_array(split_index, array.size() - split_index);
-
-    // Recurse to sort the smaller of the two, and then loop round within this
-    // function to sort the larger. This way, recursive call depth is bounded
-    // by log2 of the total array size, because every recursive call is sorting
-    // a list at most half the length of the one in its caller.
-    if (left.size() < right.size()) {
-      quick_sort(left);
-      array.reset_bounds(right.get(0), right.size());
-    } else {
-      quick_sort(right);
-      array.reset_bounds(left.get(0), left.size());
+    limit -= 1;
+
+    const size_t pivot_index = choose_pivot(array, is_less);
+
+    // If the chosen pivot is equal to the predecessor, then it's the smallest
+    // element in the slice. Partition the slice into elements equal to and
+    // elements greater than the pivot. This case is usually hit when the slice
+    // contains many duplicate elements.
+    if (ancestor_pivot) {
+      if (!is_less(ancestor_pivot, array.get(pivot_index))) {
+        const size_t num_lt =
+            partition(array, pivot_index,
+                      [is_less](const void *a, const void *b) -> bool {
+                        return !is_less(b, a);
+                      });
+
+        // Continue sorting elements greater than the pivot. We know that
+        // `num_lt` cont
+        array.reset_bounds(num_lt + 1, array.len() - (num_lt + 1));
+        ancestor_pivot = nullptr;
+        continue;
+      }
     }
+
+    size_t split_index = partition(array, pivot_index, is_less);
+
+    if (array_len == 2)
+      // The partition operation sorts the two element array.
+      return;
+
+    // Split the array into `left`, `pivot`, and `right`.
+    A left = array.make_array(0, split_index);
+    const void *pivot = array.get(split_index);
+    const size_t right_start = split_index + 1;
+    A right = array.make_array(right_start, array.len() - right_start);
+
+    // Recurse into the left side. We have a fixed recursion limit,
+    // testing shows no real benefit for recursing into the shorter
+    // side.
+    quick_sort_impl(left, ancestor_pivot, limit, is_less);
+
+    // Continue with the right side.
+    array = right;
+    ancestor_pivot = pivot;
   }
 }
 
+constexpr size_t ilog2(size_t n) { return cpp::bit_width(n) - 1; }
+
+template <typename A, typename F>
+LIBC_INLINE void quick_sort(A &array, const F &is_less) {
+  const void *ancestor_pivot = nullptr;
+  // Limit the number of imbalanced partitions to `2 * floor(log2(len))`.
+  // The binary OR by one is used to eliminate the zero-check in the logarithm.
+  const size_t limit = 2 * ilog2((array.len() | 1));
+  quick_sort_impl(array, ancestor_pivot, limit, is_less);
+}
+
 } // namespace internal
 } // namespace LIBC_NAMESPACE_DECL
 
diff --git a/libc/src/time/CMakeLists.txt b/libc/src/time/CMakeLists.txt
index ae835dc..ef9bfe5 100644
--- a/libc/src/time/CMakeLists.txt
+++ b/libc/src/time/CMakeLists.txt
@@ -2,6 +2,17 @@ if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${LIBC_TARGET_OS})
   add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/${LIBC_TARGET_OS})
 endif()
 
+add_header_library(
+  time_constants
+  HDRS
+    time_constants.h
+  DEPENDS
+    libc.include.time
+    libc.src.__support.CPP.array
+    libc.src.__support.CPP.string_view
+    libc.hdr.types.time_t
+)
+
 add_object_library(
   time_utils
   SRCS
@@ -12,6 +23,10 @@ add_object_library(
     libc.include.time
     libc.src.__support.CPP.limits
     libc.src.errno.errno
+    .time_constants
+    libc.hdr.types.time_t
+    libc.hdr.types.size_t
+    libc.hdr.types.struct_tm
 )
 
 add_entrypoint_object(
@@ -22,7 +37,9 @@ add_entrypoint_object(
     asctime.h
   DEPENDS
     .time_utils
+    .time_constants
     libc.include.time
+    libc.hdr.types.struct_tm
 )
 
 add_entrypoint_object(
@@ -33,7 +50,9 @@ add_entrypoint_object(
     asctime_r.h
   DEPENDS
     .time_utils
+    .time_constants
     libc.include.time
+    libc.hdr.types.struct_tm
 )
 
 add_entrypoint_object(
@@ -44,6 +63,7 @@ add_entrypoint_object(
     ctime.h
   DEPENDS
     .time_utils
+    .time_constants
     libc.hdr.types.time_t
     libc.include.time
 )
@@ -56,6 +76,7 @@ add_entrypoint_object(
     ctime_r.h
   DEPENDS
     .time_utils
+    .time_constants
     libc.hdr.types.time_t
     libc.include.time
 )
@@ -68,6 +89,7 @@ add_entrypoint_object(
     difftime.h
   DEPENDS
     libc.include.time
+    libc.hdr.types.time_t
 )
 
 add_entrypoint_object(
@@ -79,6 +101,8 @@ add_entrypoint_object(
   DEPENDS
     .time_utils
     libc.include.time
+    libc.hdr.types.time_t
+    libc.hdr.types.struct_tm
 )
 
 add_entrypoint_object(
@@ -90,6 +114,8 @@ add_entrypoint_object(
   DEPENDS
     .time_utils
     libc.include.time
+    libc.hdr.types.time_t
+    libc.hdr.types.struct_tm
 )
 
 add_entrypoint_object(
@@ -100,8 +126,11 @@ add_entrypoint_object(
     mktime.h
   DEPENDS
     .time_utils
+    .time_constants
     libc.include.time
     libc.src.errno.errno
+    libc.hdr.types.time_t
+    libc.hdr.types.struct_tm
 )
 
 add_entrypoint_object(
@@ -115,6 +144,7 @@ add_entrypoint_object(
     libc.hdr.types.time_t
     libc.src.__support.time.clock_gettime
     libc.src.errno.errno
+    libc.hdr.types.struct_tm
 )
 
 add_entrypoint_object(
diff --git a/libc/src/time/asctime.cpp b/libc/src/time/asctime.cpp
index d6fbe73..2b00c41 100644
--- a/libc/src/time/asctime.cpp
+++ b/libc/src/time/asctime.cpp
@@ -9,15 +9,15 @@
 #include "src/time/asctime.h"
 #include "src/__support/common.h"
 #include "src/__support/macros/config.h"
+#include "src/time/time_constants.h"
 #include "src/time/time_utils.h"
 
 namespace LIBC_NAMESPACE_DECL {
 
-using LIBC_NAMESPACE::time_utils::TimeConstants;
-
 LLVM_LIBC_FUNCTION(char *, asctime, (const struct tm *timeptr)) {
-  static char buffer[TimeConstants::ASCTIME_BUFFER_SIZE];
-  return time_utils::asctime(timeptr, buffer, TimeConstants::ASCTIME_MAX_BYTES);
+  static char buffer[time_constants::ASCTIME_BUFFER_SIZE];
+  return time_utils::asctime(timeptr, buffer,
+                             time_constants::ASCTIME_MAX_BYTES);
 }
 
 } // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/time/asctime.h b/libc/src/time/asctime.h
index 623e6df..37325e7 100644
--- a/libc/src/time/asctime.h
+++ b/libc/src/time/asctime.h
@@ -9,8 +9,8 @@
 #ifndef LLVM_LIBC_SRC_TIME_ASCTIME_H
 #define LLVM_LIBC_SRC_TIME_ASCTIME_H
 
+#include "hdr/types/struct_tm.h"
 #include "src/__support/macros/config.h"
-#include <time.h>
 
 namespace LIBC_NAMESPACE_DECL {
 
diff --git a/libc/src/time/asctime_r.cpp b/libc/src/time/asctime_r.cpp
index caa22f1..bf53bfd 100644
--- a/libc/src/time/asctime_r.cpp
+++ b/libc/src/time/asctime_r.cpp
@@ -9,15 +9,15 @@
 #include "src/time/asctime_r.h"
 #include "src/__support/common.h"
 #include "src/__support/macros/config.h"
+#include "src/time/time_constants.h"
 #include "src/time/time_utils.h"
 
 namespace LIBC_NAMESPACE_DECL {
 
-using LIBC_NAMESPACE::time_utils::TimeConstants;
-
 LLVM_LIBC_FUNCTION(char *, asctime_r,
                    (const struct tm *timeptr, char *buffer)) {
-  return time_utils::asctime(timeptr, buffer, TimeConstants::ASCTIME_MAX_BYTES);
+  return time_utils::asctime(timeptr, buffer,
+                             time_constants::ASCTIME_MAX_BYTES);
 }
 
 } // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/time/asctime_r.h b/libc/src/time/asctime_r.h
index 328b7df..65a6b84 100644
--- a/libc/src/time/asctime_r.h
+++ b/libc/src/time/asctime_r.h
@@ -9,8 +9,8 @@
 #ifndef LLVM_LIBC_SRC_TIME_ASCTIME_R_H
 #define LLVM_LIBC_SRC_TIME_ASCTIME_R_H
 
+#include "hdr/types/struct_tm.h"
 #include "src/__support/macros/config.h"
-#include <time.h>
 
 namespace LIBC_NAMESPACE_DECL {
 
diff --git a/libc/src/time/ctime.cpp b/libc/src/time/ctime.cpp
index 8adae9b..ac0ffe5 100644
--- a/libc/src/time/ctime.cpp
+++ b/libc/src/time/ctime.cpp
@@ -6,23 +6,22 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "ctime.h"
+#include "src/time/ctime.h"
 #include "src/__support/CPP/limits.h"
 #include "src/__support/common.h"
 #include "src/__support/macros/config.h"
-#include "time_utils.h"
+#include "src/time/time_constants.h"
+#include "src/time/time_utils.h"
 
 namespace LIBC_NAMESPACE_DECL {
 
-using LIBC_NAMESPACE::time_utils::TimeConstants;
-
 LLVM_LIBC_FUNCTION(char *, ctime, (const time_t *t_ptr)) {
   if (t_ptr == nullptr || *t_ptr > cpp::numeric_limits<int32_t>::max()) {
     return nullptr;
   }
-  static char buffer[TimeConstants::ASCTIME_BUFFER_SIZE];
+  static char buffer[time_constants::ASCTIME_BUFFER_SIZE];
   return time_utils::asctime(time_utils::localtime(t_ptr), buffer,
-                             TimeConstants::ASCTIME_MAX_BYTES);
+                             time_constants::ASCTIME_MAX_BYTES);
 }
 
 } // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/time/ctime_r.cpp b/libc/src/time/ctime_r.cpp
index 63d93c4..7224f77 100644
--- a/libc/src/time/ctime_r.cpp
+++ b/libc/src/time/ctime_r.cpp
@@ -6,16 +6,15 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "ctime_r.h"
+#include "src/time/ctime_r.h"
 #include "src/__support/CPP/limits.h"
 #include "src/__support/common.h"
 #include "src/__support/macros/config.h"
-#include "time_utils.h"
+#include "src/time/time_constants.h"
+#include "src/time/time_utils.h"
 
 namespace LIBC_NAMESPACE_DECL {
 
-using LIBC_NAMESPACE::time_utils::TimeConstants;
-
 LLVM_LIBC_FUNCTION(char *, ctime_r, (const time_t *t_ptr, char *buffer)) {
   if (t_ptr == nullptr || buffer == nullptr ||
       *t_ptr > cpp::numeric_limits<int32_t>::max()) {
@@ -23,7 +22,7 @@ LLVM_LIBC_FUNCTION(char *, ctime_r, (const time_t *t_ptr, char *buffer)) {
   }
 
   return time_utils::asctime(time_utils::localtime(t_ptr), buffer,
-                             TimeConstants::ASCTIME_MAX_BYTES);
+                             time_constants::ASCTIME_MAX_BYTES);
 }
 
 } // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/time/difftime.h b/libc/src/time/difftime.h
index d5cd593..12de567 100644
--- a/libc/src/time/difftime.h
+++ b/libc/src/time/difftime.h
@@ -9,8 +9,8 @@
 #ifndef LLVM_LIBC_SRC_TIME_DIFFTIME_H
 #define LLVM_LIBC_SRC_TIME_DIFFTIME_H
 
+#include "hdr/types/time_t.h"
 #include "src/__support/macros/config.h"
-#include <time.h>
 
 namespace LIBC_NAMESPACE_DECL {
 
diff --git a/libc/src/time/gmtime.h b/libc/src/time/gmtime.h
index 3de3ceb..ac7f1be 100644
--- a/libc/src/time/gmtime.h
+++ b/libc/src/time/gmtime.h
@@ -9,8 +9,9 @@
 #ifndef LLVM_LIBC_SRC_TIME_GMTIME_H
 #define LLVM_LIBC_SRC_TIME_GMTIME_H
 
+#include "hdr/types/struct_tm.h"
+#include "hdr/types/time_t.h"
 #include "src/__support/macros/config.h"
-#include <time.h>
 
 namespace LIBC_NAMESPACE_DECL {
 
diff --git a/libc/src/time/gmtime_r.h b/libc/src/time/gmtime_r.h
index b4f387e..4c88b22 100644
--- a/libc/src/time/gmtime_r.h
+++ b/libc/src/time/gmtime_r.h
@@ -9,8 +9,9 @@
 #ifndef LLVM_LIBC_SRC_TIME_GMTIME_R_H
 #define LLVM_LIBC_SRC_TIME_GMTIME_R_H
 
+#include "hdr/types/struct_tm.h"
+#include "hdr/types/time_t.h"
 #include "src/__support/macros/config.h"
-#include <time.h>
 
 namespace LIBC_NAMESPACE_DECL {
 
diff --git a/libc/src/time/gpu/clock.cpp b/libc/src/time/gpu/clock.cpp
index add5b27..8609c5c 100644
--- a/libc/src/time/gpu/clock.cpp
+++ b/libc/src/time/gpu/clock.cpp
@@ -7,6 +7,8 @@
 //===----------------------------------------------------------------------===//
 
 #include "src/time/clock.h"
+
+#include "src/__support/common.h"
 #include "src/__support/macros/config.h"
 #include "src/__support/time/gpu/time_utils.h"
 
diff --git a/libc/src/time/gpu/nanosleep.cpp b/libc/src/time/gpu/nanosleep.cpp
index a92f660..d22d9d6 100644
--- a/libc/src/time/gpu/nanosleep.cpp
+++ b/libc/src/time/gpu/nanosleep.cpp
@@ -8,6 +8,7 @@
 
 #include "src/time/nanosleep.h"
 
+#include "src/__support/common.h"
 #include "src/__support/macros/config.h"
 #include "src/__support/time/gpu/time_utils.h"
 
diff --git a/libc/src/time/mktime.cpp b/libc/src/time/mktime.cpp
index 72cd2291..3874cad 100644
--- a/libc/src/time/mktime.cpp
+++ b/libc/src/time/mktime.cpp
@@ -9,15 +9,11 @@
 #include "src/time/mktime.h"
 #include "src/__support/common.h"
 #include "src/__support/macros/config.h"
+#include "src/time/time_constants.h"
 #include "src/time/time_utils.h"
 
 namespace LIBC_NAMESPACE_DECL {
 
-using LIBC_NAMESPACE::time_utils::TimeConstants;
-
-static constexpr int NON_LEAP_YEAR_DAYS_IN_MONTH[] = {31, 28, 31, 30, 31, 30,
-                                                      31, 31, 30, 31, 30, 31};
-
 // Returns number of years from (1, year).
 static constexpr int64_t get_num_of_leap_years_before(int64_t year) {
   return (year / 4) - (year / 100) + (year / 400);
@@ -31,12 +27,12 @@ static constexpr bool is_leap_year(const int64_t year) {
 LLVM_LIBC_FUNCTION(time_t, mktime, (struct tm * tm_out)) {
   // Unlike most C Library functions, mktime doesn't just die on bad input.
   // TODO(rtenneti); Handle leap seconds.
-  int64_t tm_year_from_base = tm_out->tm_year + TimeConstants::TIME_YEAR_BASE;
+  int64_t tm_year_from_base = tm_out->tm_year + time_constants::TIME_YEAR_BASE;
 
   // 32-bit end-of-the-world is 03:14:07 UTC on 19 January 2038.
   if (sizeof(time_t) == 4 &&
-      tm_year_from_base >= TimeConstants::END_OF32_BIT_EPOCH_YEAR) {
-    if (tm_year_from_base > TimeConstants::END_OF32_BIT_EPOCH_YEAR)
+      tm_year_from_base >= time_constants::END_OF32_BIT_EPOCH_YEAR) {
+    if (tm_year_from_base > time_constants::END_OF32_BIT_EPOCH_YEAR)
       return time_utils::out_of_range();
     if (tm_out->tm_mon > 0)
       return time_utils::out_of_range();
@@ -64,7 +60,7 @@ LLVM_LIBC_FUNCTION(time_t, mktime, (struct tm * tm_out)) {
 
   // Calculate number of months and years from tm_mon.
   int64_t month = tm_out->tm_mon;
-  if (month < 0 || month >= TimeConstants::MONTHS_PER_YEAR - 1) {
+  if (month < 0 || month >= time_constants::MONTHS_PER_YEAR - 1) {
     int64_t years = month / 12;
     month %= 12;
     if (month < 0) {
@@ -78,23 +74,23 @@ LLVM_LIBC_FUNCTION(time_t, mktime, (struct tm * tm_out)) {
   // Calculate total number of days based on the month and the day (tm_mday).
   int64_t total_days = tm_out->tm_mday - 1;
   for (int64_t i = 0; i < month; ++i)
-    total_days += NON_LEAP_YEAR_DAYS_IN_MONTH[i];
+    total_days += time_constants::NON_LEAP_YEAR_DAYS_IN_MONTH[i];
   // Add one day if it is a leap year and the month is after February.
   if (tm_year_is_leap && month > 1)
     total_days++;
 
   // Calculate total numbers of days based on the year.
-  total_days += (tm_year_from_base - TimeConstants::EPOCH_YEAR) *
-                TimeConstants::DAYS_PER_NON_LEAP_YEAR;
-  if (tm_year_from_base >= TimeConstants::EPOCH_YEAR) {
+  total_days += (tm_year_from_base - time_constants::EPOCH_YEAR) *
+                time_constants::DAYS_PER_NON_LEAP_YEAR;
+  if (tm_year_from_base >= time_constants::EPOCH_YEAR) {
     total_days += get_num_of_leap_years_before(tm_year_from_base - 1) -
-                  get_num_of_leap_years_before(TimeConstants::EPOCH_YEAR);
+                  get_num_of_leap_years_before(time_constants::EPOCH_YEAR);
   } else if (tm_year_from_base >= 1) {
-    total_days -= get_num_of_leap_years_before(TimeConstants::EPOCH_YEAR) -
+    total_days -= get_num_of_leap_years_before(time_constants::EPOCH_YEAR) -
                   get_num_of_leap_years_before(tm_year_from_base - 1);
   } else {
     // Calculate number of leap years until 0th year.
-    total_days -= get_num_of_leap_years_before(TimeConstants::EPOCH_YEAR) -
+    total_days -= get_num_of_leap_years_before(time_constants::EPOCH_YEAR) -
                   get_num_of_leap_years_before(0);
     if (tm_year_from_base <= 0) {
       total_days -= 1; // Subtract 1 for 0th year.
@@ -106,11 +102,12 @@ LLVM_LIBC_FUNCTION(time_t, mktime, (struct tm * tm_out)) {
     }
   }
 
-  // TODO(rtenneti): Need to handle timezone and update of tm_isdst.
+  // TODO: https://github.com/llvm/llvm-project/issues/121962
+  // Need to handle timezone and update of tm_isdst.
   int64_t seconds = tm_out->tm_sec +
-                    tm_out->tm_min * TimeConstants::SECONDS_PER_MIN +
-                    tm_out->tm_hour * TimeConstants::SECONDS_PER_HOUR +
-                    total_days * TimeConstants::SECONDS_PER_DAY;
+                    tm_out->tm_min * time_constants::SECONDS_PER_MIN +
+                    tm_out->tm_hour * time_constants::SECONDS_PER_HOUR +
+                    total_days * time_constants::SECONDS_PER_DAY;
 
   // Update the tm structure's year, month, day, etc. from seconds.
   if (time_utils::update_from_seconds(seconds, tm_out) < 0)
diff --git a/libc/src/time/mktime.h b/libc/src/time/mktime.h
index 2b4c679..985c629 100644
--- a/libc/src/time/mktime.h
+++ b/libc/src/time/mktime.h
@@ -9,8 +9,9 @@
 #ifndef LLVM_LIBC_SRC_TIME_MKTIME_H
 #define LLVM_LIBC_SRC_TIME_MKTIME_H
 
+#include "hdr/types/struct_tm.h"
+#include "hdr/types/time_t.h"
 #include "src/__support/macros/config.h"
-#include <time.h>
 
 namespace LIBC_NAMESPACE_DECL {
 
diff --git a/libc/src/time/time.cpp b/libc/src/time/time.cpp
index 4a0b614..860909a 100644
--- a/libc/src/time/time.cpp
+++ b/libc/src/time/time.cpp
@@ -6,12 +6,13 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "src/time/time_func.h"
+
 #include "hdr/time_macros.h"
 #include "src/__support/common.h"
 #include "src/__support/macros/config.h"
 #include "src/__support/time/clock_gettime.h"
 #include "src/errno/libc_errno.h"
-#include "src/time/time_func.h"
 
 namespace LIBC_NAMESPACE_DECL {
 // avoid inconsitent clang-format behavior
diff --git a/libc/src/time/time_constants.h b/libc/src/time/time_constants.h
new file mode 100644
index 0000000..3e25f74
--- /dev/null
+++ b/libc/src/time/time_constants.h
@@ -0,0 +1,100 @@
+//===-- Collection of constants for time functions --------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_TIME_TIME_CONSTANTS_H
+#define LLVM_LIBC_SRC_TIME_TIME_CONSTANTS_H
+
+#include "hdr/types/time_t.h"
+#include "src/__support/CPP/array.h"
+#include "src/__support/CPP/string_view.h"
+#include <stdint.h>
+
+namespace LIBC_NAMESPACE_DECL {
+namespace time_constants {
+
+enum Month : int {
+  JANUARY,
+  FEBRUARY,
+  MARCH,
+  APRIL,
+  MAY,
+  JUNE,
+  JULY,
+  AUGUST,
+  SEPTEMBER,
+  OCTOBER,
+  NOVEMBER,
+  DECEMBER
+};
+
+constexpr int SECONDS_PER_MIN = 60;
+constexpr int MINUTES_PER_HOUR = 60;
+constexpr int HOURS_PER_DAY = 24;
+constexpr int DAYS_PER_WEEK = 7;
+constexpr int MONTHS_PER_YEAR = 12;
+constexpr int DAYS_PER_NON_LEAP_YEAR = 365;
+constexpr int DAYS_PER_LEAP_YEAR = 366;
+
+constexpr int SECONDS_PER_HOUR = SECONDS_PER_MIN * MINUTES_PER_HOUR;
+constexpr int SECONDS_PER_DAY = SECONDS_PER_HOUR * HOURS_PER_DAY;
+constexpr int NUMBER_OF_SECONDS_IN_LEAP_YEAR =
+    DAYS_PER_LEAP_YEAR * SECONDS_PER_DAY;
+
+constexpr int TIME_YEAR_BASE = 1900;
+constexpr int EPOCH_YEAR = 1970;
+constexpr int EPOCH_WEEK_DAY = 4;
+
+// For asctime the behavior is undefined if struct tm's tm_wday or tm_mon are
+// not within the normal ranges as defined in <time.h>, or if struct tm's
+// tm_year exceeds {INT_MAX}-1990, or if the below asctime_internal algorithm
+// would attempt to generate more than 26 bytes of output (including the
+// terminating null).
+constexpr int ASCTIME_BUFFER_SIZE = 256;
+constexpr int ASCTIME_MAX_BYTES = 26;
+
+/* 2000-03-01 (mod 400 year, immediately after feb29 */
+constexpr int64_t SECONDS_UNTIL2000_MARCH_FIRST =
+    (946684800LL + SECONDS_PER_DAY * (31 + 29));
+constexpr int WEEK_DAY_OF2000_MARCH_FIRST = 3;
+
+constexpr int DAYS_PER400_YEARS =
+    (DAYS_PER_NON_LEAP_YEAR * 400) + (400 / 4) - 3;
+constexpr int DAYS_PER100_YEARS =
+    (DAYS_PER_NON_LEAP_YEAR * 100) + (100 / 4) - 1;
+constexpr int DAYS_PER4_YEARS = (DAYS_PER_NON_LEAP_YEAR * 4) + 1;
+
+// The latest time that can be represented in this form is 03:14:07 UTC on
+// Tuesday, 19 January 2038 (corresponding to 2,147,483,647 seconds since the
+// start of the epoch). This means that systems using a 32-bit time_t type are
+// susceptible to the Year 2038 problem.
+constexpr int END_OF32_BIT_EPOCH_YEAR = 2038;
+
+constexpr time_t OUT_OF_RANGE_RETURN_VALUE = -1;
+
+constexpr cpp::array<cpp::string_view, DAYS_PER_WEEK> WEEK_DAY_NAMES = {
+    "Sun", "Mon", "Tue", "Wed", "Thu", "Fri", "Sat"};
+
+constexpr cpp::array<cpp::string_view, DAYS_PER_WEEK> WEEK_DAY_FULL_NAMES = {
+    "Sunday",   "Monday", "Tuesday", "Wednesday",
+    "Thursday", "Friday", "Saturday"};
+
+constexpr cpp::array<cpp::string_view, MONTHS_PER_YEAR> MONTH_NAMES = {
+    "Jan", "Feb", "Mar", "Apr", "May", "Jun",
+    "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"};
+
+constexpr cpp::array<cpp::string_view, MONTHS_PER_YEAR> MONTH_FULL_NAMES = {
+    "January", "February", "March",     "April",   "May",      "June",
+    "July",    "August",   "September", "October", "November", "December"};
+
+constexpr int NON_LEAP_YEAR_DAYS_IN_MONTH[] = {31, 28, 31, 30, 31, 30,
+                                               31, 31, 30, 31, 30, 31};
+
+} // namespace time_constants
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LLVM_LIBC_SRC_TIME_TIME_CONSTANTS_H
diff --git a/libc/src/time/time_utils.cpp b/libc/src/time/time_utils.cpp
index 509cad8..abc93b8 100644
--- a/libc/src/time/time_utils.cpp
+++ b/libc/src/time/time_utils.cpp
@@ -10,12 +10,11 @@
 #include "src/__support/CPP/limits.h" // INT_MIN, INT_MAX
 #include "src/__support/common.h"
 #include "src/__support/macros/config.h"
+#include "src/time/time_constants.h"
 
 namespace LIBC_NAMESPACE_DECL {
 namespace time_utils {
 
-using LIBC_NAMESPACE::time_utils::TimeConstants;
-
 static int64_t computeRemainingYears(int64_t daysPerYears,
                                      int64_t quotientYears,
                                      int64_t *remainingDays) {
@@ -52,36 +51,36 @@ int64_t update_from_seconds(int64_t total_seconds, struct tm *tm) {
       (sizeof(time_t) == 4)
           ? INT_MIN
           : INT_MIN * static_cast<int64_t>(
-                          TimeConstants::NUMBER_OF_SECONDS_IN_LEAP_YEAR);
+                          time_constants::NUMBER_OF_SECONDS_IN_LEAP_YEAR);
   constexpr time_t time_max =
       (sizeof(time_t) == 4)
           ? INT_MAX
           : INT_MAX * static_cast<int64_t>(
-                          TimeConstants::NUMBER_OF_SECONDS_IN_LEAP_YEAR);
+                          time_constants::NUMBER_OF_SECONDS_IN_LEAP_YEAR);
 
   time_t ts = static_cast<time_t>(total_seconds);
   if (ts < time_min || ts > time_max)
     return time_utils::out_of_range();
 
   int64_t seconds =
-      total_seconds - TimeConstants::SECONDS_UNTIL2000_MARCH_FIRST;
-  int64_t days = seconds / TimeConstants::SECONDS_PER_DAY;
-  int64_t remainingSeconds = seconds % TimeConstants::SECONDS_PER_DAY;
+      total_seconds - time_constants::SECONDS_UNTIL2000_MARCH_FIRST;
+  int64_t days = seconds / time_constants::SECONDS_PER_DAY;
+  int64_t remainingSeconds = seconds % time_constants::SECONDS_PER_DAY;
   if (remainingSeconds < 0) {
-    remainingSeconds += TimeConstants::SECONDS_PER_DAY;
+    remainingSeconds += time_constants::SECONDS_PER_DAY;
     days--;
   }
 
-  int64_t wday = (TimeConstants::WEEK_DAY_OF2000_MARCH_FIRST + days) %
-                 TimeConstants::DAYS_PER_WEEK;
+  int64_t wday = (time_constants::WEEK_DAY_OF2000_MARCH_FIRST + days) %
+                 time_constants::DAYS_PER_WEEK;
   if (wday < 0)
-    wday += TimeConstants::DAYS_PER_WEEK;
+    wday += time_constants::DAYS_PER_WEEK;
 
   // Compute the number of 400 year cycles.
-  int64_t numOfFourHundredYearCycles = days / TimeConstants::DAYS_PER400_YEARS;
-  int64_t remainingDays = days % TimeConstants::DAYS_PER400_YEARS;
+  int64_t numOfFourHundredYearCycles = days / time_constants::DAYS_PER400_YEARS;
+  int64_t remainingDays = days % time_constants::DAYS_PER400_YEARS;
   if (remainingDays < 0) {
-    remainingDays += TimeConstants::DAYS_PER400_YEARS;
+    remainingDays += time_constants::DAYS_PER400_YEARS;
     numOfFourHundredYearCycles--;
   }
 
@@ -89,17 +88,17 @@ int64_t update_from_seconds(int64_t total_seconds, struct tm *tm) {
   // "four hundred year cycles" will be 4 hundred year cycles or less in 400
   // years.
   int64_t numOfHundredYearCycles = computeRemainingYears(
-      TimeConstants::DAYS_PER100_YEARS, 4, &remainingDays);
+      time_constants::DAYS_PER100_YEARS, 4, &remainingDays);
 
   // The remaining number of years after computing the number of
   // "hundred year cycles" will be 25 four year cycles or less in 100 years.
-  int64_t numOfFourYearCycles =
-      computeRemainingYears(TimeConstants::DAYS_PER4_YEARS, 25, &remainingDays);
+  int64_t numOfFourYearCycles = computeRemainingYears(
+      time_constants::DAYS_PER4_YEARS, 25, &remainingDays);
 
   // The remaining number of years after computing the number of
   // "four year cycles" will be 4 one year cycles or less in 4 years.
   int64_t remainingYears = computeRemainingYears(
-      TimeConstants::DAYS_PER_NON_LEAP_YEAR, 4, &remainingDays);
+      time_constants::DAYS_PER_NON_LEAP_YEAR, 4, &remainingDays);
 
   // Calculate number of years from year 2000.
   int64_t years = remainingYears + 4 * numOfFourYearCycles +
@@ -112,8 +111,8 @@ int64_t update_from_seconds(int64_t total_seconds, struct tm *tm) {
   // We add 31 and 28 for the number of days in January and February, since our
   // starting point was March 1st.
   int64_t yday = remainingDays + 31 + 28 + leapDay;
-  if (yday >= TimeConstants::DAYS_PER_NON_LEAP_YEAR + leapDay)
-    yday -= TimeConstants::DAYS_PER_NON_LEAP_YEAR + leapDay;
+  if (yday >= time_constants::DAYS_PER_NON_LEAP_YEAR + leapDay)
+    yday -= time_constants::DAYS_PER_NON_LEAP_YEAR + leapDay;
 
   int64_t months = 0;
   while (daysInMonth[months] <= remainingDays) {
@@ -121,8 +120,8 @@ int64_t update_from_seconds(int64_t total_seconds, struct tm *tm) {
     months++;
   }
 
-  if (months >= TimeConstants::MONTHS_PER_YEAR - 2) {
-    months -= TimeConstants::MONTHS_PER_YEAR;
+  if (months >= time_constants::MONTHS_PER_YEAR - 2) {
+    months -= time_constants::MONTHS_PER_YEAR;
     years++;
   }
 
@@ -131,19 +130,19 @@ int64_t update_from_seconds(int64_t total_seconds, struct tm *tm) {
 
   // All the data (years, month and remaining days) was calculated from
   // March, 2000. Thus adjust the data to be from January, 1900.
-  tm->tm_year = static_cast<int>(years + 2000 - TimeConstants::TIME_YEAR_BASE);
+  tm->tm_year = static_cast<int>(years + 2000 - time_constants::TIME_YEAR_BASE);
   tm->tm_mon = static_cast<int>(months + 2);
   tm->tm_mday = static_cast<int>(remainingDays + 1);
   tm->tm_wday = static_cast<int>(wday);
   tm->tm_yday = static_cast<int>(yday);
 
   tm->tm_hour =
-      static_cast<int>(remainingSeconds / TimeConstants::SECONDS_PER_HOUR);
+      static_cast<int>(remainingSeconds / time_constants::SECONDS_PER_HOUR);
   tm->tm_min =
-      static_cast<int>(remainingSeconds / TimeConstants::SECONDS_PER_MIN %
-                       TimeConstants::SECONDS_PER_MIN);
+      static_cast<int>(remainingSeconds / time_constants::SECONDS_PER_MIN %
+                       time_constants::SECONDS_PER_MIN);
   tm->tm_sec =
-      static_cast<int>(remainingSeconds % TimeConstants::SECONDS_PER_MIN);
+      static_cast<int>(remainingSeconds % time_constants::SECONDS_PER_MIN);
   // TODO(rtenneti): Need to handle timezone and update of tm_isdst.
   tm->tm_isdst = 0;
 
diff --git a/libc/src/time/time_utils.h b/libc/src/time/time_utils.h
index 552ea92..5e0a692 100644
--- a/libc/src/time/time_utils.h
+++ b/libc/src/time/time_utils.h
@@ -9,79 +9,19 @@
 #ifndef LLVM_LIBC_SRC_TIME_TIME_UTILS_H
 #define LLVM_LIBC_SRC_TIME_TIME_UTILS_H
 
-#include <stddef.h> // For size_t.
-
+#include "hdr/types/size_t.h"
+#include "hdr/types/struct_tm.h"
+#include "hdr/types/time_t.h"
 #include "src/__support/common.h"
 #include "src/__support/macros/config.h"
 #include "src/errno/libc_errno.h"
-#include "src/time/mktime.h"
+#include "time_constants.h"
 
 #include <stdint.h>
 
 namespace LIBC_NAMESPACE_DECL {
 namespace time_utils {
 
-enum Month : int {
-  JANUARY,
-  FEBRUARY,
-  MARCH,
-  APRIL,
-  MAY,
-  JUNE,
-  JULY,
-  AUGUST,
-  SEPTEMBER,
-  OCTOBER,
-  NOVEMBER,
-  DECEMBER
-};
-
-struct TimeConstants {
-  static constexpr int SECONDS_PER_MIN = 60;
-  static constexpr int MINUTES_PER_HOUR = 60;
-  static constexpr int HOURS_PER_DAY = 24;
-  static constexpr int DAYS_PER_WEEK = 7;
-  static constexpr int MONTHS_PER_YEAR = 12;
-  static constexpr int DAYS_PER_NON_LEAP_YEAR = 365;
-  static constexpr int DAYS_PER_LEAP_YEAR = 366;
-
-  static constexpr int SECONDS_PER_HOUR = SECONDS_PER_MIN * MINUTES_PER_HOUR;
-  static constexpr int SECONDS_PER_DAY = SECONDS_PER_HOUR * HOURS_PER_DAY;
-  static constexpr int NUMBER_OF_SECONDS_IN_LEAP_YEAR =
-      DAYS_PER_LEAP_YEAR * SECONDS_PER_DAY;
-
-  static constexpr int TIME_YEAR_BASE = 1900;
-  static constexpr int EPOCH_YEAR = 1970;
-  static constexpr int EPOCH_WEEK_DAY = 4;
-
-  // For asctime the behavior is undefined if struct tm's tm_wday or tm_mon are
-  // not within the normal ranges as defined in <time.h>, or if struct tm's
-  // tm_year exceeds {INT_MAX}-1990, or if the below asctime_internal algorithm
-  // would attempt to generate more than 26 bytes of output (including the
-  // terminating null).
-  static constexpr int ASCTIME_BUFFER_SIZE = 256;
-  static constexpr int ASCTIME_MAX_BYTES = 26;
-
-  /* 2000-03-01 (mod 400 year, immediately after feb29 */
-  static constexpr int64_t SECONDS_UNTIL2000_MARCH_FIRST =
-      (946684800LL + SECONDS_PER_DAY * (31 + 29));
-  static constexpr int WEEK_DAY_OF2000_MARCH_FIRST = 3;
-
-  static constexpr int DAYS_PER400_YEARS =
-      (DAYS_PER_NON_LEAP_YEAR * 400) + (400 / 4) - 3;
-  static constexpr int DAYS_PER100_YEARS =
-      (DAYS_PER_NON_LEAP_YEAR * 100) + (100 / 4) - 1;
-  static constexpr int DAYS_PER4_YEARS = (DAYS_PER_NON_LEAP_YEAR * 4) + 1;
-
-  // The latest time that can be represented in this form is 03:14:07 UTC on
-  // Tuesday, 19 January 2038 (corresponding to 2,147,483,647 seconds since the
-  // start of the epoch). This means that systems using a 32-bit time_t type are
-  // susceptible to the Year 2038 problem.
-  static constexpr int END_OF32_BIT_EPOCH_YEAR = 2038;
-
-  static constexpr time_t OUT_OF_RANGE_RETURN_VALUE = -1;
-};
-
 // Update the "tm" structure's year, month, etc. members from seconds.
 // "total_seconds" is the number of seconds since January 1st, 1970.
 extern int64_t update_from_seconds(int64_t total_seconds, struct tm *tm);
@@ -98,7 +38,7 @@ LIBC_INLINE time_t out_of_range() {
   // require it.
   libc_errno = EOVERFLOW;
 #endif
-  return TimeConstants::OUT_OF_RANGE_RETURN_VALUE;
+  return time_constants::OUT_OF_RANGE_RETURN_VALUE;
 }
 
 LIBC_INLINE void invalid_value() { libc_errno = EINVAL; }
@@ -110,32 +50,23 @@ LIBC_INLINE char *asctime(const struct tm *timeptr, char *buffer,
     return nullptr;
   }
   if (timeptr->tm_wday < 0 ||
-      timeptr->tm_wday > (TimeConstants::DAYS_PER_WEEK - 1)) {
+      timeptr->tm_wday > (time_constants::DAYS_PER_WEEK - 1)) {
     invalid_value();
     return nullptr;
   }
   if (timeptr->tm_mon < 0 ||
-      timeptr->tm_mon > (TimeConstants::MONTHS_PER_YEAR - 1)) {
+      timeptr->tm_mon > (time_constants::MONTHS_PER_YEAR - 1)) {
     invalid_value();
     return nullptr;
   }
 
-  // TODO(rtenneti): i18n the following strings.
-  static const char *week_days_name[TimeConstants::DAYS_PER_WEEK] = {
-      "Sun", "Mon", "Tue", "Wed", "Thu", "Fri", "Sat"};
-
-  static const char *months_name[TimeConstants::MONTHS_PER_YEAR] = {
-      "Jan", "Feb", "Mar", "Apr", "May", "Jun",
-      "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"};
-
-  // TODO(michaelr): look into removing this call to __builtin_snprintf that may
-  // be emitted as a call to snprintf. Alternatively, look into using our
-  // internal printf machinery.
+  // TODO(michaelr): move this to use the strftime machinery
   int written_size = __builtin_snprintf(
       buffer, bufferLength, "%.3s %.3s%3d %.2d:%.2d:%.2d %d\n",
-      week_days_name[timeptr->tm_wday], months_name[timeptr->tm_mon],
-      timeptr->tm_mday, timeptr->tm_hour, timeptr->tm_min, timeptr->tm_sec,
-      TimeConstants::TIME_YEAR_BASE + timeptr->tm_year);
+      time_constants::WEEK_DAY_NAMES[timeptr->tm_wday].data(),
+      time_constants::MONTH_NAMES[timeptr->tm_mon].data(), timeptr->tm_mday,
+      timeptr->tm_hour, timeptr->tm_min, timeptr->tm_sec,
+      time_constants::TIME_YEAR_BASE + timeptr->tm_year);
   if (written_size < 0)
     return nullptr;
   if (static_cast<size_t>(written_size) >= bufferLength) {
diff --git a/libc/src/unistd/linux/dup2.cpp b/libc/src/unistd/linux/dup2.cpp
index c7c7c1a..7ffc151 100644
--- a/libc/src/unistd/linux/dup2.cpp
+++ b/libc/src/unistd/linux/dup2.cpp
@@ -32,7 +32,6 @@ LLVM_LIBC_FUNCTION(int, dup2, (int oldfd, int newfd)) {
     int ret = LIBC_NAMESPACE::syscall_impl<int>(SYS_fcntl, oldfd, F_GETFD);
 #elif defined(SYS_fcntl64)
     // Same as fcntl but can handle large offsets
-    static_assert(sizeof(off_t) == 8);
     int ret = LIBC_NAMESPACE::syscall_impl<int>(SYS_fcntl64, oldfd, F_GETFD);
 #else
 #error "SYS_fcntl and SYS_fcntl64 syscalls not available."
diff --git a/libc/test/UnitTest/FPMatcher.h b/libc/test/UnitTest/FPMatcher.h
index 9f2bae3..b8e240b 100644
--- a/libc/test/UnitTest/FPMatcher.h
+++ b/libc/test/UnitTest/FPMatcher.h
@@ -131,11 +131,11 @@ public:
     else if constexpr (cpp::is_complex_type_same<T, _Complex long double>())
       return matchComplex<long double>();
 #ifdef LIBC_TYPES_HAS_CFLOAT16
-    else if constexpr (cpp::is_complex_type_same<T, cfloat16>)
+    else if constexpr (cpp::is_complex_type_same<T, cfloat16>())
       return matchComplex<float16>();
 #endif
 #ifdef LIBC_TYPES_HAS_CFLOAT128
-    else if constexpr (cpp::is_complex_type_same<T, cfloat128>)
+    else if constexpr (cpp::is_complex_type_same<T, cfloat128>())
       return matchComplex<float128>();
 #endif
   }
@@ -148,11 +148,11 @@ public:
     else if constexpr (cpp::is_complex_type_same<T, _Complex long double>())
       return explainErrorComplex<long double>();
 #ifdef LIBC_TYPES_HAS_CFLOAT16
-    else if constexpr (cpp::is_complex_type_same<T, cfloat16>)
+    else if constexpr (cpp::is_complex_type_same<T, cfloat16>())
       return explainErrorComplex<float16>();
 #endif
 #ifdef LIBC_TYPES_HAS_CFLOAT128
-    else if constexpr (cpp::is_complex_type_same<T, cfloat128>)
+    else if constexpr (cpp::is_complex_type_same<T, cfloat128>())
       return explainErrorComplex<float128>();
 #endif
   }
diff --git a/libc/test/src/complex/cimagf128_test.cpp b/libc/test/src/complex/cimagf128_test.cpp
index 50ddc0a..70ad0de 100644
--- a/libc/test/src/complex/cimagf128_test.cpp
+++ b/libc/test/src/complex/cimagf128_test.cpp
@@ -10,8 +10,4 @@
 
 #include "src/complex/cimagf128.h"
 
-#if defined(LIBC_TYPES_HAS_CFLOAT128)
-
 LIST_CIMAG_TESTS(cfloat128, float128, LIBC_NAMESPACE::cimagf128)
-
-#endif // LIBC_TYPES_HAS_CFLOAT128
diff --git a/libc/test/src/complex/cimagf16_test.cpp b/libc/test/src/complex/cimagf16_test.cpp
index 65a6978..3842381 100644
--- a/libc/test/src/complex/cimagf16_test.cpp
+++ b/libc/test/src/complex/cimagf16_test.cpp
@@ -10,8 +10,4 @@
 
 #include "src/complex/cimagf16.h"
 
-#if defined(LIBC_TYPES_HAS_CFLOAT16)
-
 LIST_CIMAG_TESTS(cfloat16, float16, LIBC_NAMESPACE::cimagf16)
-
-#endif // LIBC_TYPES_HAS_CFLOAT16
diff --git a/libc/test/src/complex/conjf128_test.cpp b/libc/test/src/complex/conjf128_test.cpp
index a1feb9f..4c2a72c 100644
--- a/libc/test/src/complex/conjf128_test.cpp
+++ b/libc/test/src/complex/conjf128_test.cpp
@@ -10,8 +10,4 @@
 
 #include "src/complex/conjf128.h"
 
-#if defined(LIBC_TYPES_HAS_CFLOAT128)
-
 LIST_CONJ_TESTS(cfloat128, float128, LIBC_NAMESPACE::conjf128)
-
-#endif // LIBC_TYPES_HAS_CFLOAT128
diff --git a/libc/test/src/complex/conjf16_test.cpp b/libc/test/src/complex/conjf16_test.cpp
index 0de9f44..374f9ec 100644
--- a/libc/test/src/complex/conjf16_test.cpp
+++ b/libc/test/src/complex/conjf16_test.cpp
@@ -10,8 +10,4 @@
 
 #include "src/complex/conjf16.h"
 
-#if defined(LIBC_TYPES_HAS_CFLOAT16)
-
 LIST_CONJ_TESTS(cfloat16, float16, LIBC_NAMESPACE::conjf16)
-
-#endif // LIBC_TYPES_HAS_CFLOAT16
diff --git a/libc/test/src/complex/cprojf128_test.cpp b/libc/test/src/complex/cprojf128_test.cpp
index 75708122..7b41eb5 100644
--- a/libc/test/src/complex/cprojf128_test.cpp
+++ b/libc/test/src/complex/cprojf128_test.cpp
@@ -10,8 +10,4 @@
 
 #include "src/complex/cprojf128.h"
 
-#if defined(LIBC_TYPES_HAS_CFLOAT128)
-
 LIST_CPROJ_TESTS(cfloat128, float128, LIBC_NAMESPACE::cprojf128)
-
-#endif // LIBC_TYPES_HAS_CFLOAT128
diff --git a/libc/test/src/complex/cprojf16_test.cpp b/libc/test/src/complex/cprojf16_test.cpp
index 628cec0..db9b7b9 100644
--- a/libc/test/src/complex/cprojf16_test.cpp
+++ b/libc/test/src/complex/cprojf16_test.cpp
@@ -10,8 +10,4 @@
 
 #include "src/complex/cprojf16.h"
 
-#if defined(LIBC_TYPES_HAS_CFLOAT16)
-
 LIST_CPROJ_TESTS(cfloat16, float16, LIBC_NAMESPACE::cprojf16)
-
-#endif // LIBC_TYPES_HAS_CFLOAT16
diff --git a/libc/test/src/complex/crealf128_test.cpp b/libc/test/src/complex/crealf128_test.cpp
index 7626eee..0d1c26d 100644
--- a/libc/test/src/complex/crealf128_test.cpp
+++ b/libc/test/src/complex/crealf128_test.cpp
@@ -10,8 +10,4 @@
 
 #include "src/complex/crealf128.h"
 
-#if defined(LIBC_TYPES_HAS_CFLOAT128)
-
 LIST_CREAL_TESTS(cfloat128, float128, LIBC_NAMESPACE::crealf128)
-
-#endif // LIBC_TYPES_HAS_CFLOAT128
diff --git a/libc/test/src/complex/crealf16_test.cpp b/libc/test/src/complex/crealf16_test.cpp
index 97346aa..b8560d7 100644
--- a/libc/test/src/complex/crealf16_test.cpp
+++ b/libc/test/src/complex/crealf16_test.cpp
@@ -10,8 +10,4 @@
 
 #include "src/complex/crealf16.h"
 
-#if defined(LIBC_TYPES_HAS_CFLOAT16)
-
 LIST_CREAL_TESTS(cfloat16, float16, LIBC_NAMESPACE::crealf16)
-
-#endif // LIBC_TYPES_HAS_CFLOAT16
diff --git a/libc/test/src/stdlib/CMakeLists.txt b/libc/test/src/stdlib/CMakeLists.txt
index 4ca2043..8cc0428 100644
--- a/libc/test/src/stdlib/CMakeLists.txt
+++ b/libc/test/src/stdlib/CMakeLists.txt
@@ -301,18 +301,6 @@ add_libc_test(
 )
 
 add_libc_test(
-  quick_sort_test
-  SUITE
-    libc-stdlib-tests
-  SRCS
-    quick_sort_test.cpp
-  HDRS
-    SortingTest.h
-  DEPENDS
-    libc.src.stdlib.qsort_util
-)
-
-add_libc_test(
   heap_sort_test
   SUITE
     libc-stdlib-tests
@@ -321,15 +309,15 @@ add_libc_test(
   HDRS
     SortingTest.h
   DEPENDS
-    libc.src.stdlib.qsort_util
+    libc.src.stdlib.qsort
 )
 
 add_libc_test(
-  qsort_test
+  quick_sort_test
   SUITE
     libc-stdlib-tests
   SRCS
-    qsort_test.cpp
+    quick_sort_test.cpp
   HDRS
     SortingTest.h
   DEPENDS
diff --git a/libc/test/src/stdlib/SortingTest.h b/libc/test/src/stdlib/SortingTest.h
index d34584e..681a420 100644
--- a/libc/test/src/stdlib/SortingTest.h
+++ b/libc/test/src/stdlib/SortingTest.h
@@ -7,19 +7,19 @@
 //===----------------------------------------------------------------------===//
 
 #include "src/__support/macros/config.h"
-#include "src/stdlib/qsort_data.h"
+#include "src/stdlib/qsort.h"
 #include "test/UnitTest/Test.h"
 
 class SortingTest : public LIBC_NAMESPACE::testing::Test {
 
-  using Array = LIBC_NAMESPACE::internal::Array;
-  using Comparator = LIBC_NAMESPACE::internal::Comparator;
-  using SortingRoutine = LIBC_NAMESPACE::internal::SortingRoutine;
+  using SortingRoutine = void (*)(void *array, size_t array_len,
+                                  size_t elem_size,
+                                  int (*compare)(const void *, const void *));
 
-public:
   static int int_compare(const void *l, const void *r) {
     int li = *reinterpret_cast<const int *>(l);
     int ri = *reinterpret_cast<const int *>(r);
+
     if (li == ri)
       return 0;
     else if (li > ri)
@@ -28,16 +28,19 @@ public:
       return -1;
   }
 
+  static void int_sort(SortingRoutine sort_func, int *array, size_t array_len) {
+    sort_func(reinterpret_cast<void *>(array), array_len, sizeof(int),
+              int_compare);
+  }
+
+public:
   void test_sorted_array(SortingRoutine sort_func) {
     int array[25] = {10,   23,   33,   35,   55,   70,    71,   100,  110,
                      123,  133,  135,  155,  170,  171,   1100, 1110, 1123,
                      1133, 1135, 1155, 1170, 1171, 11100, 12310};
-    constexpr size_t ARRAY_SIZE = sizeof(array) / sizeof(int);
-
-    auto arr = Array(reinterpret_cast<uint8_t *>(array), ARRAY_SIZE,
-                     sizeof(int), Comparator(int_compare));
+    constexpr size_t ARRAY_LEN = sizeof(array) / sizeof(int);
 
-    sort_func(arr);
+    int_sort(sort_func, array, ARRAY_LEN);
 
     ASSERT_LE(array[0], 10);
     ASSERT_LE(array[1], 23);
@@ -69,14 +72,11 @@ public:
   void test_reversed_sorted_array(SortingRoutine sort_func) {
     int array[] = {25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13,
                    12, 11, 10, 9,  8,  7,  6,  5,  4,  3,  2,  1};
-    constexpr size_t ARRAY_SIZE = sizeof(array) / sizeof(int);
+    constexpr size_t ARRAY_LEN = sizeof(array) / sizeof(int);
 
-    auto arr = Array(reinterpret_cast<uint8_t *>(array), ARRAY_SIZE,
-                     sizeof(int), Comparator(int_compare));
+    int_sort(sort_func, array, ARRAY_LEN);
 
-    sort_func(arr);
-
-    for (int i = 0; i < int(ARRAY_SIZE - 1); ++i)
+    for (int i = 0; i < int(ARRAY_LEN - 1); ++i)
       ASSERT_EQ(array[i], i + 1);
   }
 
@@ -84,14 +84,11 @@ public:
     int array[] = {100, 100, 100, 100, 100, 100, 100, 100, 100,
                    100, 100, 100, 100, 100, 100, 100, 100, 100,
                    100, 100, 100, 100, 100, 100, 100};
-    constexpr size_t ARRAY_SIZE = sizeof(array) / sizeof(int);
-
-    auto arr = Array(reinterpret_cast<uint8_t *>(array), ARRAY_SIZE,
-                     sizeof(int), Comparator(int_compare));
+    constexpr size_t ARRAY_LEN = sizeof(array) / sizeof(int);
 
-    sort_func(arr);
+    int_sort(sort_func, array, ARRAY_LEN);
 
-    for (size_t i = 0; i < ARRAY_SIZE; ++i)
+    for (size_t i = 0; i < ARRAY_LEN; ++i)
       ASSERT_EQ(array[i], 100);
   }
 
@@ -99,12 +96,9 @@ public:
     int array[25] = {10,  23,  8,    35,   55,   45,  40,  100, 110,
                      123, 90,  80,   70,   60,   171, 11,  1,   -1,
                      -5,  -10, 1155, 1170, 1171, 12,  -100};
-    constexpr size_t ARRAY_SIZE = sizeof(array) / sizeof(int);
-
-    auto arr = Array(reinterpret_cast<uint8_t *>(array), ARRAY_SIZE,
-                     sizeof(int), Comparator(int_compare));
+    constexpr size_t ARRAY_LEN = sizeof(array) / sizeof(int);
 
-    sort_func(arr);
+    int_sort(sort_func, array, ARRAY_LEN);
 
     ASSERT_EQ(array[0], -100);
     ASSERT_EQ(array[1], -10);
@@ -135,12 +129,9 @@ public:
 
   void test_unsorted_array_2(SortingRoutine sort_func) {
     int array[7] = {10, 40, 45, 55, 35, 23, 60};
-    constexpr size_t ARRAY_SIZE = sizeof(array) / sizeof(int);
+    constexpr size_t ARRAY_LEN = sizeof(array) / sizeof(int);
 
-    auto arr = Array(reinterpret_cast<uint8_t *>(array), ARRAY_SIZE,
-                     sizeof(int), Comparator(int_compare));
-
-    sort_func(arr);
+    int_sort(sort_func, array, ARRAY_LEN);
 
     ASSERT_EQ(array[0], 10);
     ASSERT_EQ(array[1], 23);
@@ -153,12 +144,9 @@ public:
 
   void test_unsorted_array_duplicated_1(SortingRoutine sort_func) {
     int array[6] = {10, 10, 20, 20, 5, 5};
-    constexpr size_t ARRAY_SIZE = sizeof(array) / sizeof(int);
-
-    auto arr = Array(reinterpret_cast<uint8_t *>(array), ARRAY_SIZE,
-                     sizeof(int), Comparator(int_compare));
+    constexpr size_t ARRAY_LEN = sizeof(array) / sizeof(int);
 
-    sort_func(arr);
+    int_sort(sort_func, array, ARRAY_LEN);
 
     ASSERT_EQ(array[0], 5);
     ASSERT_EQ(array[1], 5);
@@ -170,12 +158,9 @@ public:
 
   void test_unsorted_array_duplicated_2(SortingRoutine sort_func) {
     int array[10] = {20, 10, 10, 10, 10, 20, 21, 21, 21, 21};
-    constexpr size_t ARRAY_SIZE = sizeof(array) / sizeof(int);
-
-    auto arr = Array(reinterpret_cast<uint8_t *>(array), ARRAY_SIZE,
-                     sizeof(int), Comparator(int_compare));
+    constexpr size_t ARRAY_LEN = sizeof(array) / sizeof(int);
 
-    sort_func(arr);
+    int_sort(sort_func, array, ARRAY_LEN);
 
     ASSERT_EQ(array[0], 10);
     ASSERT_EQ(array[1], 10);
@@ -191,12 +176,9 @@ public:
 
   void test_unsorted_array_duplicated_3(SortingRoutine sort_func) {
     int array[10] = {20, 30, 30, 30, 30, 20, 21, 21, 21, 21};
-    constexpr size_t ARRAY_SIZE = sizeof(array) / sizeof(int);
+    constexpr size_t ARRAY_LEN = sizeof(array) / sizeof(int);
 
-    auto arr = Array(reinterpret_cast<uint8_t *>(array), ARRAY_SIZE,
-                     sizeof(int), Comparator(int_compare));
-
-    sort_func(arr);
+    int_sort(sort_func, array, ARRAY_LEN);
 
     ASSERT_EQ(array[0], 20);
     ASSERT_EQ(array[1], 20);
@@ -213,12 +195,9 @@ public:
   void test_unsorted_three_element_1(SortingRoutine sort_func) {
     int array[3] = {14999024, 0, 3};
 
-    constexpr size_t ARRAY_SIZE = sizeof(array) / sizeof(int);
-
-    auto arr = Array(reinterpret_cast<uint8_t *>(array), ARRAY_SIZE,
-                     sizeof(int), Comparator(int_compare));
+    constexpr size_t ARRAY_LEN = sizeof(array) / sizeof(int);
 
-    sort_func(arr);
+    int_sort(sort_func, array, ARRAY_LEN);
 
     ASSERT_EQ(array[0], 0);
     ASSERT_EQ(array[1], 3);
@@ -228,12 +207,9 @@ public:
   void test_unsorted_three_element_2(SortingRoutine sort_func) {
     int array[3] = {3, 14999024, 0};
 
-    constexpr size_t ARRAY_SIZE = sizeof(array) / sizeof(int);
-
-    auto arr = Array(reinterpret_cast<uint8_t *>(array), ARRAY_SIZE,
-                     sizeof(int), Comparator(int_compare));
+    constexpr size_t ARRAY_LEN = sizeof(array) / sizeof(int);
 
-    sort_func(arr);
+    int_sort(sort_func, array, ARRAY_LEN);
 
     ASSERT_EQ(array[0], 0);
     ASSERT_EQ(array[1], 3);
@@ -243,12 +219,9 @@ public:
   void test_unsorted_three_element_3(SortingRoutine sort_func) {
     int array[3] = {3, 0, 14999024};
 
-    constexpr size_t ARRAY_SIZE = sizeof(array) / sizeof(int);
+    constexpr size_t ARRAY_LEN = sizeof(array) / sizeof(int);
 
-    auto arr = Array(reinterpret_cast<uint8_t *>(array), ARRAY_SIZE,
-                     sizeof(int), Comparator(int_compare));
-
-    sort_func(arr);
+    int_sort(sort_func, array, ARRAY_LEN);
 
     ASSERT_EQ(array[0], 0);
     ASSERT_EQ(array[1], 3);
@@ -258,12 +231,9 @@ public:
   void test_same_three_element(SortingRoutine sort_func) {
     int array[3] = {12345, 12345, 12345};
 
-    constexpr size_t ARRAY_SIZE = sizeof(array) / sizeof(int);
-
-    auto arr = Array(reinterpret_cast<uint8_t *>(array), ARRAY_SIZE,
-                     sizeof(int), Comparator(int_compare));
+    constexpr size_t ARRAY_LEN = sizeof(array) / sizeof(int);
 
-    sort_func(arr);
+    int_sort(sort_func, array, ARRAY_LEN);
 
     ASSERT_EQ(array[0], 12345);
     ASSERT_EQ(array[1], 12345);
@@ -273,12 +243,9 @@ public:
   void test_unsorted_two_element_1(SortingRoutine sort_func) {
     int array[] = {14999024, 0};
 
-    constexpr size_t ARRAY_SIZE = sizeof(array) / sizeof(int);
-
-    auto arr = Array(reinterpret_cast<uint8_t *>(array), ARRAY_SIZE,
-                     sizeof(int), Comparator(int_compare));
+    constexpr size_t ARRAY_LEN = sizeof(array) / sizeof(int);
 
-    sort_func(arr);
+    int_sort(sort_func, array, ARRAY_LEN);
 
     ASSERT_EQ(array[0], 0);
     ASSERT_EQ(array[1], 14999024);
@@ -287,12 +254,9 @@ public:
   void test_unsorted_two_element_2(SortingRoutine sort_func) {
     int array[] = {0, 14999024};
 
-    constexpr size_t ARRAY_SIZE = sizeof(array) / sizeof(int);
+    constexpr size_t ARRAY_LEN = sizeof(array) / sizeof(int);
 
-    auto arr = Array(reinterpret_cast<uint8_t *>(array), ARRAY_SIZE,
-                     sizeof(int), Comparator(int_compare));
-
-    sort_func(arr);
+    int_sort(sort_func, array, ARRAY_LEN);
 
     ASSERT_EQ(array[0], 0);
     ASSERT_EQ(array[1], 14999024);
@@ -301,12 +265,9 @@ public:
   void test_same_two_element(SortingRoutine sort_func) {
     int array[] = {12345, 12345};
 
-    constexpr size_t ARRAY_SIZE = sizeof(array) / sizeof(int);
-
-    auto arr = Array(reinterpret_cast<uint8_t *>(array), ARRAY_SIZE,
-                     sizeof(int), Comparator(int_compare));
+    constexpr size_t ARRAY_LEN = sizeof(array) / sizeof(int);
 
-    sort_func(arr);
+    int_sort(sort_func, array, ARRAY_LEN);
 
     ASSERT_EQ(array[0], 12345);
     ASSERT_EQ(array[1], 12345);
@@ -315,15 +276,80 @@ public:
   void test_single_element(SortingRoutine sort_func) {
     int array[] = {12345};
 
-    constexpr size_t ARRAY_SIZE = sizeof(array) / sizeof(int);
-
-    auto arr = Array(reinterpret_cast<uint8_t *>(array), ARRAY_SIZE,
-                     sizeof(int), Comparator(int_compare));
+    constexpr size_t ARRAY_LEN = sizeof(array) / sizeof(int);
 
-    sort_func(arr);
+    int_sort(sort_func, array, ARRAY_LEN);
 
     ASSERT_EQ(array[0], 12345);
   }
+
+  void test_different_elem_size(SortingRoutine sort_func) {
+    // Random order of values [0,50) to avoid only testing pre-sorted handling.
+    // Long enough to reach interesting code.
+    constexpr uint8_t ARRAY_INITIAL_VALS[] = {
+        42, 13, 8,  4,  17, 28, 20, 32, 22, 29, 7,  2,  46, 37, 26, 49, 24,
+        38, 10, 18, 40, 36, 47, 15, 11, 48, 44, 33, 1,  5,  16, 35, 39, 41,
+        14, 23, 3,  9,  6,  27, 21, 25, 31, 45, 12, 43, 34, 30, 19, 0};
+
+    constexpr size_t ARRAY_LEN = sizeof(ARRAY_INITIAL_VALS);
+    constexpr size_t MAX_ELEM_SIZE = 150;
+    constexpr size_t BUF_SIZE = ARRAY_LEN * MAX_ELEM_SIZE;
+
+    static_assert(ARRAY_LEN < 256); // so we can encode the values.
+
+    // Minimum alignment to test implementation for bugs related to assuming
+    // incorrect association between alignment and element size. The buffer is
+    // 'static' as otherwise it will exhaust the stack on the GPU targets.
+    alignas(1) static uint8_t buf[BUF_SIZE];
+
+    // GCC still requires capturing the constant ARRAY_INITIAL_VALS in the
+    // lambda hence, let's use & to implicitly capture all needed variables
+    // automatically.
+    const auto fill_buf = [&](size_t elem_size) {
+      for (size_t i = 0; i < BUF_SIZE; ++i) {
+        buf[i] = 0;
+      }
+
+      for (size_t elem_i = 0, buf_i = 0; elem_i < ARRAY_LEN; ++elem_i) {
+        const uint8_t elem_val = ARRAY_INITIAL_VALS[elem_i];
+        for (size_t elem_byte_i = 0; elem_byte_i < elem_size; ++elem_byte_i) {
+          buf[buf_i] = elem_val;
+          buf_i += 1;
+        }
+      }
+    };
+
+    for (size_t elem_size = 0; elem_size <= MAX_ELEM_SIZE; ++elem_size) {
+      // Fill all bytes with data to ensure mistakes in elem swap are noticed.
+      fill_buf(elem_size);
+
+      sort_func(reinterpret_cast<void *>(buf), ARRAY_LEN, elem_size,
+                [](const void *a, const void *b) -> int {
+                  const uint8_t a_val = *reinterpret_cast<const uint8_t *>(a);
+                  const uint8_t b_val = *reinterpret_cast<const uint8_t *>(b);
+
+                  if (a_val < b_val) {
+                    return -1;
+                  } else if (a_val > b_val) {
+                    return 1;
+                  } else {
+                    return 0;
+                  }
+                });
+
+      for (size_t elem_i = 0, buf_i = 0; elem_i < ARRAY_LEN; ++elem_i) {
+        const uint8_t expected_elem_val = static_cast<uint8_t>(elem_i);
+
+        for (size_t elem_byte_i = 0; elem_byte_i < elem_size; ++elem_byte_i) {
+          const uint8_t buf_val = buf[buf_i];
+          // Check that every byte in the element has the expected value.
+          ASSERT_EQ(buf_val, expected_elem_val)
+              << "elem_size: " << elem_size << " buf_i: " << buf_i << '\n';
+          buf_i += 1;
+        }
+      }
+    }
+  }
 };
 
 #define LIST_SORTING_TESTS(Name, Func)                                         \
@@ -374,4 +400,7 @@ public:
   TEST_F(LlvmLibc##Name##Test, SingleElementArray) {                           \
     test_single_element(Func);                                                 \
   }                                                                            \
+  TEST_F(LlvmLibc##Name##Test, DifferentElemSizeArray) {                       \
+    test_different_elem_size(Func);                                            \
+  }                                                                            \
   static_assert(true)
diff --git a/libc/test/src/stdlib/heap_sort_test.cpp b/libc/test/src/stdlib/heap_sort_test.cpp
index d70e3dc..18d4244 100644
--- a/libc/test/src/stdlib/heap_sort_test.cpp
+++ b/libc/test/src/stdlib/heap_sort_test.cpp
@@ -7,10 +7,20 @@
 //===----------------------------------------------------------------------===//
 
 #include "SortingTest.h"
-#include "src/stdlib/heap_sort.h"
+#include "src/stdlib/qsort_util.h"
 
-void sort(const LIBC_NAMESPACE::internal::Array &array) {
-  LIBC_NAMESPACE::internal::heap_sort(array);
+void heap_sort(void *array, size_t array_size, size_t elem_size,
+               int (*compare)(const void *, const void *)) {
+
+  constexpr bool USE_QUICKSORT = false;
+
+  const auto is_less = [compare](const void *a,
+                                 const void *b) noexcept -> bool {
+    return compare(a, b) < 0;
+  };
+
+  LIBC_NAMESPACE::internal::unstable_sort_impl<USE_QUICKSORT>(
+      array, array_size, elem_size, is_less);
 }
 
-LIST_SORTING_TESTS(HeapSort, sort);
+LIST_SORTING_TESTS(HeapSort, heap_sort);
diff --git a/libc/test/src/stdlib/qsort_r_test.cpp b/libc/test/src/stdlib/qsort_r_test.cpp
index 6893fdc..f189236 100644
--- a/libc/test/src/stdlib/qsort_r_test.cpp
+++ b/libc/test/src/stdlib/qsort_r_test.cpp
@@ -62,9 +62,9 @@ TEST(LlvmLibcQsortRTest, SortedArray) {
   ASSERT_LE(array[23], 11100);
   ASSERT_LE(array[24], 12310);
 
-  // This is a sorted list, but there still have to have been at least N
+  // This is a sorted list, but there still have to have been at least N - 1
   // comparisons made.
-  ASSERT_GE(count, ARRAY_SIZE);
+  ASSERT_GE(count, ARRAY_SIZE - 1);
 }
 
 TEST(LlvmLibcQsortRTest, ReverseSortedArray) {
diff --git a/libc/test/src/stdlib/qsort_test.cpp b/libc/test/src/stdlib/qsort_test.cpp
deleted file mode 100644
index 1e921a8..0000000
--- a/libc/test/src/stdlib/qsort_test.cpp
+++ /dev/null
@@ -1,17 +0,0 @@
-//===-- Unittests for qsort -----------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "SortingTest.h"
-#include "src/stdlib/qsort.h"
-
-void sort(const LIBC_NAMESPACE::internal::Array &array) {
-  LIBC_NAMESPACE::qsort(reinterpret_cast<void *>(array.get(0)), array.size(),
-                        sizeof(int), SortingTest::int_compare);
-}
-
-LIST_SORTING_TESTS(Qsort, sort);
diff --git a/libc/test/src/stdlib/quick_sort_test.cpp b/libc/test/src/stdlib/quick_sort_test.cpp
index d6bf77e..2832c85 100644
--- a/libc/test/src/stdlib/quick_sort_test.cpp
+++ b/libc/test/src/stdlib/quick_sort_test.cpp
@@ -1,4 +1,4 @@
-//===-- Unittests for quick sort ------------------------------------------===//
+//===-- Unittests for qsort -----------------------------------------------===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -7,10 +7,19 @@
 //===----------------------------------------------------------------------===//
 
 #include "SortingTest.h"
-#include "src/stdlib/quick_sort.h"
+#include "src/stdlib/qsort_util.h"
 
-void sort(const LIBC_NAMESPACE::internal::Array &array) {
-  LIBC_NAMESPACE::internal::quick_sort(array);
+void quick_sort(void *array, size_t array_size, size_t elem_size,
+                int (*compare)(const void *, const void *)) {
+  constexpr bool USE_QUICKSORT = true;
+
+  const auto is_less = [compare](const void *a,
+                                 const void *b) noexcept -> bool {
+    return compare(a, b) < 0;
+  };
+
+  LIBC_NAMESPACE::internal::unstable_sort_impl<USE_QUICKSORT>(
+      array, array_size, elem_size, is_less);
 }
 
-LIST_SORTING_TESTS(QuickSort, sort);
+LIST_SORTING_TESTS(Qsort, quick_sort);
diff --git a/libc/test/src/time/CMakeLists.txt b/libc/test/src/time/CMakeLists.txt
index da3903f..12add22 100644
--- a/libc/test/src/time/CMakeLists.txt
+++ b/libc/test/src/time/CMakeLists.txt
@@ -13,6 +13,8 @@ add_libc_unittest(
     20
   DEPENDS
     libc.src.time.asctime
+    libc.hdr.types.struct_tm
+    libc.src.time.time_constants
 )
 
 add_libc_unittest(
@@ -28,6 +30,8 @@ add_libc_unittest(
     20
   DEPENDS
     libc.src.time.asctime_r
+    libc.hdr.types.struct_tm
+    libc.src.time.time_constants
 )
 
 add_libc_unittest(
@@ -45,7 +49,8 @@ add_libc_unittest(
     libc.include.time
     libc.hdr.types.time_t
     libc.src.time.ctime
-    libc.src.time.time_utils
+    libc.src.time.time_constants
+    libc.hdr.types.struct_tm
 )
 
 add_libc_unittest(
@@ -63,7 +68,8 @@ add_libc_unittest(
     libc.include.time
     libc.hdr.types.time_t
     libc.src.time.ctime_r
-    libc.src.time.time_utils
+    libc.src.time.time_constants
+    libc.hdr.types.struct_tm
 )
 
 add_libc_test(
@@ -74,6 +80,9 @@ add_libc_test(
     clock_gettime_test.cpp
   DEPENDS
     libc.src.time.clock_gettime
+    libc.hdr.types.time_t
+    libc.hdr.types.struct_timespec
+    libc.hdr.time_macros
 )
 
 add_libc_test(
@@ -94,6 +103,8 @@ add_libc_unittest(
     difftime_test.cpp
   DEPENDS
     libc.src.time.difftime
+    libc.src.time.time_constants
+    libc.src.__support.FPUtil.fp_bits
 )
 
 add_libc_unittest(
@@ -105,6 +116,7 @@ add_libc_unittest(
   DEPENDS
     libc.include.time
     libc.src.time.gettimeofday
+    libc.hdr.types.struct_timeval
 )
 
 add_libc_unittest(
@@ -118,6 +130,8 @@ add_libc_unittest(
   DEPENDS
     libc.src.time.gmtime
     libc.src.__support.CPP.limits
+    libc.hdr.types.struct_tm
+    libc.src.time.time_constants
 )
 
 add_libc_unittest(
@@ -130,6 +144,8 @@ add_libc_unittest(
     TmMatcher.h
   DEPENDS
     libc.src.time.gmtime_r
+    libc.hdr.types.struct_tm
+    libc.src.time.time_constants
 )
 
 add_libc_unittest(
@@ -146,6 +162,8 @@ add_libc_unittest(
   DEPENDS
     libc.src.time.mktime
     libc.src.__support.CPP.limits
+    libc.hdr.types.struct_tm
+    libc.src.time.time_constants
 )
 
 add_libc_test(
@@ -158,6 +176,7 @@ add_libc_test(
     libc.include.time
     libc.src.time.nanosleep
     libc.src.errno.errno
+    libc.hdr.types.struct_timespec
 )
 
 add_libc_unittest(
@@ -180,6 +199,7 @@ add_libc_test(
     timespec_get_test.cpp
   DEPENDS
     libc.src.time.timespec_get
+    libc.hdr.types.struct_timespec
 )
 
 add_libc_test(
@@ -192,4 +212,5 @@ add_libc_test(
     libc.include.time
     libc.src.time.clock
     libc.src.errno.errno
+    libc.hdr.types.clock_t
 )
diff --git a/libc/test/src/time/TmHelper.h b/libc/test/src/time/TmHelper.h
index 5ae2584..1582839 100644
--- a/libc/test/src/time/TmHelper.h
+++ b/libc/test/src/time/TmHelper.h
@@ -9,12 +9,9 @@
 #ifndef LLVM_LIBC_TEST_SRC_TIME_TMHELPER_H
 #define LLVM_LIBC_TEST_SRC_TIME_TMHELPER_H
 
-#include <time.h>
-
+#include "hdr/types/struct_tm.h"
 #include "src/__support/macros/config.h"
-#include "src/time/time_utils.h"
-
-using LIBC_NAMESPACE::time_utils::TimeConstants;
+#include "src/time/time_constants.h"
 
 namespace LIBC_NAMESPACE_DECL {
 namespace tmhelper {
@@ -30,7 +27,7 @@ static inline void initialize_tm_data(struct tm *tm_data, int year, int month,
                     .tm_mday = mday,
                     .tm_mon = month - 1, // tm_mon starts with 0 for Jan
                     // years since 1900
-                    .tm_year = year - TimeConstants::TIME_YEAR_BASE,
+                    .tm_year = year - time_constants::TIME_YEAR_BASE,
                     .tm_wday = wday,
                     .tm_yday = yday,
                     .tm_isdst = 0};
diff --git a/libc/test/src/time/TmMatcher.h b/libc/test/src/time/TmMatcher.h
index 630956b..d39ee39 100644
--- a/libc/test/src/time/TmMatcher.h
+++ b/libc/test/src/time/TmMatcher.h
@@ -9,8 +9,7 @@
 #ifndef LLVM_LIBC_TEST_SRC_TIME_TM_MATCHER_H
 #define LLVM_LIBC_TEST_SRC_TIME_TM_MATCHER_H
 
-#include <time.h>
-
+#include "hdr/types/struct_tm.h"
 #include "src/__support/macros/config.h"
 #include "test/UnitTest/Test.h"
 
diff --git a/libc/test/src/time/asctime_r_test.cpp b/libc/test/src/time/asctime_r_test.cpp
index f3aadbb..b595cfe 100644
--- a/libc/test/src/time/asctime_r_test.cpp
+++ b/libc/test/src/time/asctime_r_test.cpp
@@ -8,12 +8,10 @@
 
 #include "src/errno/libc_errno.h"
 #include "src/time/asctime_r.h"
-#include "src/time/time_utils.h"
+#include "src/time/time_constants.h"
 #include "test/UnitTest/Test.h"
 #include "test/src/time/TmHelper.h"
 
-using LIBC_NAMESPACE::time_utils::TimeConstants;
-
 static inline char *call_asctime_r(struct tm *tm_data, int year, int month,
                                    int mday, int hour, int min, int sec,
                                    int wday, int yday, char *buffer) {
@@ -30,7 +28,7 @@ TEST(LlvmLibcAsctimeR, Nullptr) {
   ASSERT_ERRNO_EQ(EINVAL);
   ASSERT_STREQ(nullptr, result);
 
-  char buffer[TimeConstants::ASCTIME_BUFFER_SIZE];
+  char buffer[LIBC_NAMESPACE::time_constants::ASCTIME_BUFFER_SIZE];
   result = LIBC_NAMESPACE::asctime_r(nullptr, buffer);
   ASSERT_ERRNO_EQ(EINVAL);
   ASSERT_STREQ(nullptr, result);
@@ -42,7 +40,7 @@ TEST(LlvmLibcAsctimeR, Nullptr) {
 }
 
 TEST(LlvmLibcAsctimeR, ValidDate) {
-  char buffer[TimeConstants::ASCTIME_BUFFER_SIZE];
+  char buffer[LIBC_NAMESPACE::time_constants::ASCTIME_BUFFER_SIZE];
   struct tm tm_data;
   char *result;
   // 1970-01-01 00:00:00. Test with a valid buffer size.
diff --git a/libc/test/src/time/clock_gettime_test.cpp b/libc/test/src/time/clock_gettime_test.cpp
index 43715c0..d3edcae 100644
--- a/libc/test/src/time/clock_gettime_test.cpp
+++ b/libc/test/src/time/clock_gettime_test.cpp
@@ -6,12 +6,13 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "hdr/time_macros.h"
+#include "hdr/types/struct_timespec.h"
+#include "hdr/types/time_t.h"
 #include "src/__support/macros/properties/architectures.h"
 #include "src/time/clock_gettime.h"
 #include "test/UnitTest/Test.h"
 
-#include <time.h>
-
 TEST(LlvmLibcClockGetTime, RealTime) {
   timespec tp;
   int result;
diff --git a/libc/test/src/time/clock_test.cpp b/libc/test/src/time/clock_test.cpp
index 05082aa..8d8d89d 100644
--- a/libc/test/src/time/clock_test.cpp
+++ b/libc/test/src/time/clock_test.cpp
@@ -6,11 +6,10 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "hdr/types/clock_t.h"
 #include "src/time/clock.h"
 #include "test/UnitTest/Test.h"
 
-#include <time.h>
-
 TEST(LlvmLibcClockTest, SmokeTest) {
   clock_t c1 = LIBC_NAMESPACE::clock();
   ASSERT_GT(c1, clock_t(0));
diff --git a/libc/test/src/time/ctime_r_test.cpp b/libc/test/src/time/ctime_r_test.cpp
index 9ce6f75..27011b7 100644
--- a/libc/test/src/time/ctime_r_test.cpp
+++ b/libc/test/src/time/ctime_r_test.cpp
@@ -8,18 +8,16 @@
 
 #include "src/errno/libc_errno.h"
 #include "src/time/ctime_r.h"
-#include "src/time/time_utils.h"
+#include "src/time/time_constants.h"
 #include "test/UnitTest/Test.h"
 #include "test/src/time/TmHelper.h"
 
-using LIBC_NAMESPACE::time_utils::TimeConstants;
-
 TEST(LlvmLibcCtimeR, Nullptr) {
   char *result;
   result = LIBC_NAMESPACE::ctime_r(nullptr, nullptr);
   ASSERT_STREQ(nullptr, result);
 
-  char buffer[TimeConstants::ASCTIME_BUFFER_SIZE];
+  char buffer[LIBC_NAMESPACE::time_constants::ASCTIME_BUFFER_SIZE];
   result = LIBC_NAMESPACE::ctime_r(nullptr, buffer);
   ASSERT_STREQ(nullptr, result);
 
@@ -29,7 +27,7 @@ TEST(LlvmLibcCtimeR, Nullptr) {
 }
 
 TEST(LlvmLibcCtimeR, ValidUnixTimestamp0) {
-  char buffer[TimeConstants::ASCTIME_BUFFER_SIZE];
+  char buffer[LIBC_NAMESPACE::time_constants::ASCTIME_BUFFER_SIZE];
   time_t t;
   char *result;
   // 1970-01-01 00:00:00. Test with a valid buffer size.
@@ -39,7 +37,7 @@ TEST(LlvmLibcCtimeR, ValidUnixTimestamp0) {
 }
 
 TEST(LlvmLibcCtime, ValidUnixTimestamp32Int) {
-  char buffer[TimeConstants::ASCTIME_BUFFER_SIZE];
+  char buffer[LIBC_NAMESPACE::time_constants::ASCTIME_BUFFER_SIZE];
   time_t t;
   char *result;
   // 2038-01-19 03:14:07. Test with a valid buffer size.
@@ -49,7 +47,7 @@ TEST(LlvmLibcCtime, ValidUnixTimestamp32Int) {
 }
 
 TEST(LlvmLibcCtimeR, InvalidArgument) {
-  char buffer[TimeConstants::ASCTIME_BUFFER_SIZE];
+  char buffer[LIBC_NAMESPACE::time_constants::ASCTIME_BUFFER_SIZE];
   time_t t;
   char *result;
   t = 2147483648;
diff --git a/libc/test/src/time/difftime_test.cpp b/libc/test/src/time/difftime_test.cpp
index 68ff463..4dab1ac 100644
--- a/libc/test/src/time/difftime_test.cpp
+++ b/libc/test/src/time/difftime_test.cpp
@@ -8,15 +8,12 @@
 
 #include "src/__support/FPUtil/FPBits.h"
 #include "src/time/difftime.h"
-#include "src/time/time_utils.h"
+#include "src/time/time_constants.h"
 #include "test/UnitTest/ErrnoSetterMatcher.h"
 #include "test/UnitTest/Test.h"
 
-using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Succeeds;
-using LIBC_NAMESPACE::time_utils::TimeConstants;
-
 TEST(LlvmLibcDifftime, SmokeTest) {
-  time_t t1_seconds = TimeConstants::SECONDS_PER_HOUR;
+  time_t t1_seconds = LIBC_NAMESPACE::time_constants::SECONDS_PER_HOUR;
   time_t t2_seconds = 0;
 
   LIBC_NAMESPACE::fputil::FPBits<long double> expected_fp =
diff --git a/libc/test/src/time/gettimeofday_test.cpp b/libc/test/src/time/gettimeofday_test.cpp
index ee934b7..8f9f136 100644
--- a/libc/test/src/time/gettimeofday_test.cpp
+++ b/libc/test/src/time/gettimeofday_test.cpp
@@ -6,8 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include <time.h>
-
+#include "hdr/types/struct_timeval.h"
 #include "src/time/gettimeofday.h"
 #include "test/UnitTest/Test.h"
 
diff --git a/libc/test/src/time/gmtime_r_test.cpp b/libc/test/src/time/gmtime_r_test.cpp
index 2276b48..9d466f4 100644
--- a/libc/test/src/time/gmtime_r_test.cpp
+++ b/libc/test/src/time/gmtime_r_test.cpp
@@ -7,12 +7,10 @@
 //===----------------------------------------------------------------------===//
 
 #include "src/time/gmtime_r.h"
-#include "src/time/time_utils.h"
+#include "src/time/time_constants.h"
 #include "test/UnitTest/Test.h"
 #include "test/src/time/TmMatcher.h"
 
-using LIBC_NAMESPACE::time_utils::TimeConstants;
-
 // gmtime and gmtime_r share the same code and thus didn't repeat all the tests
 // from gmtime. Added couple of validation tests.
 TEST(LlvmLibcGmTimeR, EndOf32BitEpochYear) {
@@ -22,16 +20,17 @@ TEST(LlvmLibcGmTimeR, EndOf32BitEpochYear) {
   struct tm tm_data;
   struct tm *tm_data_ptr;
   tm_data_ptr = LIBC_NAMESPACE::gmtime_r(&seconds, &tm_data);
-  EXPECT_TM_EQ((tm{7,  // sec
-                   14, // min
-                   3,  // hr
-                   19, // day
-                   0,  // tm_mon starts with 0 for Jan
-                   2038 - TimeConstants::TIME_YEAR_BASE, // year
-                   2,                                    // wday
-                   7,                                    // yday
-                   0}),
-               *tm_data_ptr);
+  EXPECT_TM_EQ(
+      (tm{7,  // sec
+          14, // min
+          3,  // hr
+          19, // day
+          0,  // tm_mon starts with 0 for Jan
+          2038 - LIBC_NAMESPACE::time_constants::TIME_YEAR_BASE, // year
+          2,                                                     // wday
+          7,                                                     // yday
+          0}),
+      *tm_data_ptr);
   EXPECT_TM_EQ(*tm_data_ptr, tm_data);
 }
 
@@ -43,15 +42,16 @@ TEST(LlvmLibcGmTimeR, Max64BitYear) {
   struct tm tm_data;
   struct tm *tm_data_ptr;
   tm_data_ptr = LIBC_NAMESPACE::gmtime_r(&seconds, &tm_data);
-  EXPECT_TM_EQ((tm{50, // sec
-                   50, // min
-                   12, // hr
-                   1,  // day
-                   0,  // tm_mon starts with 0 for Jan
-                   2147483647 - TimeConstants::TIME_YEAR_BASE, // year
-                   2,                                          // wday
-                   50,                                         // yday
-                   0}),
-               *tm_data_ptr);
+  EXPECT_TM_EQ(
+      (tm{50, // sec
+          50, // min
+          12, // hr
+          1,  // day
+          0,  // tm_mon starts with 0 for Jan
+          2147483647 - LIBC_NAMESPACE::time_constants::TIME_YEAR_BASE, // year
+          2,                                                           // wday
+          50,                                                          // yday
+          0}),
+      *tm_data_ptr);
   EXPECT_TM_EQ(*tm_data_ptr, tm_data);
 }
diff --git a/libc/test/src/time/gmtime_test.cpp b/libc/test/src/time/gmtime_test.cpp
index 433fbf6..6af5a18 100644
--- a/libc/test/src/time/gmtime_test.cpp
+++ b/libc/test/src/time/gmtime_test.cpp
@@ -6,32 +6,36 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "hdr/types/struct_tm.h"
 #include "src/__support/CPP/limits.h" // INT_MAX, INT_MIN
 #include "src/errno/libc_errno.h"
 #include "src/time/gmtime.h"
-#include "src/time/time_utils.h"
+#include "src/time/time_constants.h"
 #include "test/UnitTest/ErrnoSetterMatcher.h"
 #include "test/UnitTest/Test.h"
 #include "test/src/time/TmMatcher.h"
 
 using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Fails;
 using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Succeeds;
-using LIBC_NAMESPACE::time_utils::TimeConstants;
 
 TEST(LlvmLibcGmTime, OutOfRange) {
   if (sizeof(time_t) < sizeof(int64_t))
     return;
   time_t seconds =
-      1 + INT_MAX * static_cast<int64_t>(
-                        TimeConstants::NUMBER_OF_SECONDS_IN_LEAP_YEAR);
+      1 +
+      INT_MAX *
+          static_cast<int64_t>(
+              LIBC_NAMESPACE::time_constants::NUMBER_OF_SECONDS_IN_LEAP_YEAR);
   struct tm *tm_data = LIBC_NAMESPACE::gmtime(&seconds);
   EXPECT_TRUE(tm_data == nullptr);
   ASSERT_ERRNO_EQ(EOVERFLOW);
 
   LIBC_NAMESPACE::libc_errno = 0;
-  seconds = INT_MIN * static_cast<int64_t>(
-                          TimeConstants::NUMBER_OF_SECONDS_IN_LEAP_YEAR) -
-            1;
+  seconds =
+      INT_MIN *
+          static_cast<int64_t>(
+              LIBC_NAMESPACE::time_constants::NUMBER_OF_SECONDS_IN_LEAP_YEAR) -
+      1;
   tm_data = LIBC_NAMESPACE::gmtime(&seconds);
   EXPECT_TRUE(tm_data == nullptr);
   ASSERT_ERRNO_EQ(EOVERFLOW);
@@ -43,201 +47,215 @@ TEST(LlvmLibcGmTime, InvalidSeconds) {
   // -1 second from 1970-01-01 00:00:00 returns 1969-12-31 23:59:59.
   seconds = -1;
   tm_data = LIBC_NAMESPACE::gmtime(&seconds);
-  EXPECT_TM_EQ((tm{59,     // sec
-                   59,     // min
-                   23,     // hr
-                   31,     // day
-                   12 - 1, // tm_mon starts with 0 for Jan
-                   1969 - TimeConstants::TIME_YEAR_BASE, // year
-                   3,                                    // wday
-                   364,                                  // yday
-                   0}),
-               *tm_data);
+  EXPECT_TM_EQ(
+      (tm{59,     // sec
+          59,     // min
+          23,     // hr
+          31,     // day
+          12 - 1, // tm_mon starts with 0 for Jan
+          1969 - LIBC_NAMESPACE::time_constants::TIME_YEAR_BASE, // year
+          3,                                                     // wday
+          364,                                                   // yday
+          0}),
+      *tm_data);
   // 60 seconds from 1970-01-01 00:00:00 returns 1970-01-01 00:01:00.
   seconds = 60;
   tm_data = LIBC_NAMESPACE::gmtime(&seconds);
-  EXPECT_TM_EQ((tm{0, // sec
-                   1, // min
-                   0, // hr
-                   1, // day
-                   0, // tm_mon starts with 0 for Jan
-                   1970 - TimeConstants::TIME_YEAR_BASE, // year
-                   4,                                    // wday
-                   0,                                    // yday
-                   0}),
-               *tm_data);
+  EXPECT_TM_EQ(
+      (tm{0, // sec
+          1, // min
+          0, // hr
+          1, // day
+          0, // tm_mon starts with 0 for Jan
+          1970 - LIBC_NAMESPACE::time_constants::TIME_YEAR_BASE, // year
+          4,                                                     // wday
+          0,                                                     // yday
+          0}),
+      *tm_data);
 }
 
 TEST(LlvmLibcGmTime, InvalidMinutes) {
   time_t seconds = 0;
   struct tm *tm_data = nullptr;
   // -1 minute from 1970-01-01 00:00:00 returns 1969-12-31 23:59:00.
-  seconds = -TimeConstants::SECONDS_PER_MIN;
+  seconds = -LIBC_NAMESPACE::time_constants::SECONDS_PER_MIN;
   tm_data = LIBC_NAMESPACE::gmtime(&seconds);
-  EXPECT_TM_EQ((tm{0,  // sec
-                   59, // min
-                   23, // hr
-                   31, // day
-                   11, // tm_mon starts with 0 for Jan
-                   1969 - TimeConstants::TIME_YEAR_BASE, // year
-                   3,                                    // wday
-                   0,                                    // yday
-                   0}),
-               *tm_data);
+  EXPECT_TM_EQ(
+      (tm{0,  // sec
+          59, // min
+          23, // hr
+          31, // day
+          11, // tm_mon starts with 0 for Jan
+          1969 - LIBC_NAMESPACE::time_constants::TIME_YEAR_BASE, // year
+          3,                                                     // wday
+          0,                                                     // yday
+          0}),
+      *tm_data);
   // 60 minutes from 1970-01-01 00:00:00 returns 1970-01-01 01:00:00.
-  seconds = 60 * TimeConstants::SECONDS_PER_MIN;
+  seconds = 60 * LIBC_NAMESPACE::time_constants::SECONDS_PER_MIN;
   tm_data = LIBC_NAMESPACE::gmtime(&seconds);
-  EXPECT_TM_EQ((tm{0, // sec
-                   0, // min
-                   1, // hr
-                   1, // day
-                   0, // tm_mon starts with 0 for Jan
-                   1970 - TimeConstants::TIME_YEAR_BASE, // year
-                   4,                                    // wday
-                   0,                                    // yday
-                   0}),
-               *tm_data);
+  EXPECT_TM_EQ(
+      (tm{0, // sec
+          0, // min
+          1, // hr
+          1, // day
+          0, // tm_mon starts with 0 for Jan
+          1970 - LIBC_NAMESPACE::time_constants::TIME_YEAR_BASE, // year
+          4,                                                     // wday
+          0,                                                     // yday
+          0}),
+      *tm_data);
 }
 
 TEST(LlvmLibcGmTime, InvalidHours) {
   time_t seconds = 0;
   struct tm *tm_data = nullptr;
   // -1 hour from 1970-01-01 00:00:00 returns 1969-12-31 23:00:00.
-  seconds = -TimeConstants::SECONDS_PER_HOUR;
+  seconds = -LIBC_NAMESPACE::time_constants::SECONDS_PER_HOUR;
   tm_data = LIBC_NAMESPACE::gmtime(&seconds);
-  EXPECT_TM_EQ((tm{0,  // sec
-                   0,  // min
-                   23, // hr
-                   31, // day
-                   11, // tm_mon starts with 0 for Jan
-                   1969 - TimeConstants::TIME_YEAR_BASE, // year
-                   3,                                    // wday
-                   0,                                    // yday
-                   0}),
-               *tm_data);
+  EXPECT_TM_EQ(
+      (tm{0,  // sec
+          0,  // min
+          23, // hr
+          31, // day
+          11, // tm_mon starts with 0 for Jan
+          1969 - LIBC_NAMESPACE::time_constants::TIME_YEAR_BASE, // year
+          3,                                                     // wday
+          0,                                                     // yday
+          0}),
+      *tm_data);
   // 24 hours from 1970-01-01 00:00:00 returns 1970-01-02 00:00:00.
-  seconds = 24 * TimeConstants::SECONDS_PER_HOUR;
+  seconds = 24 * LIBC_NAMESPACE::time_constants::SECONDS_PER_HOUR;
   tm_data = LIBC_NAMESPACE::gmtime(&seconds);
-  EXPECT_TM_EQ((tm{0, // sec
-                   0, // min
-                   0, // hr
-                   2, // day
-                   0, // tm_mon starts with 0 for Jan
-                   1970 - TimeConstants::TIME_YEAR_BASE, // year
-                   5,                                    // wday
-                   0,                                    // yday
-                   0}),
-               *tm_data);
+  EXPECT_TM_EQ(
+      (tm{0, // sec
+          0, // min
+          0, // hr
+          2, // day
+          0, // tm_mon starts with 0 for Jan
+          1970 - LIBC_NAMESPACE::time_constants::TIME_YEAR_BASE, // year
+          5,                                                     // wday
+          0,                                                     // yday
+          0}),
+      *tm_data);
 }
 
 TEST(LlvmLibcGmTime, InvalidYear) {
   // -1 year from 1970-01-01 00:00:00 returns 1969-01-01 00:00:00.
-  time_t seconds =
-      -TimeConstants::DAYS_PER_NON_LEAP_YEAR * TimeConstants::SECONDS_PER_DAY;
+  time_t seconds = -LIBC_NAMESPACE::time_constants::DAYS_PER_NON_LEAP_YEAR *
+                   LIBC_NAMESPACE::time_constants::SECONDS_PER_DAY;
   struct tm *tm_data = LIBC_NAMESPACE::gmtime(&seconds);
-  EXPECT_TM_EQ((tm{0, // sec
-                   0, // min
-                   0, // hr
-                   1, // day
-                   0, // tm_mon starts with 0 for Jan
-                   1969 - TimeConstants::TIME_YEAR_BASE, // year
-                   3,                                    // wday
-                   0,                                    // yday
-                   0}),
-               *tm_data);
+  EXPECT_TM_EQ(
+      (tm{0, // sec
+          0, // min
+          0, // hr
+          1, // day
+          0, // tm_mon starts with 0 for Jan
+          1969 - LIBC_NAMESPACE::time_constants::TIME_YEAR_BASE, // year
+          3,                                                     // wday
+          0,                                                     // yday
+          0}),
+      *tm_data);
 }
 
 TEST(LlvmLibcGmTime, InvalidMonths) {
   time_t seconds = 0;
   struct tm *tm_data = nullptr;
   // -1 month from 1970-01-01 00:00:00 returns 1969-12-01 00:00:00.
-  seconds = -31 * TimeConstants::SECONDS_PER_DAY;
+  seconds = -31 * LIBC_NAMESPACE::time_constants::SECONDS_PER_DAY;
   tm_data = LIBC_NAMESPACE::gmtime(&seconds);
-  EXPECT_TM_EQ((tm{0,      // sec
-                   0,      // min
-                   0,      // hr
-                   1,      // day
-                   12 - 1, // tm_mon starts with 0 for Jan
-                   1969 - TimeConstants::TIME_YEAR_BASE, // year
-                   1,                                    // wday
-                   0,                                    // yday
-                   0}),
-               *tm_data);
+  EXPECT_TM_EQ(
+      (tm{0,      // sec
+          0,      // min
+          0,      // hr
+          1,      // day
+          12 - 1, // tm_mon starts with 0 for Jan
+          1969 - LIBC_NAMESPACE::time_constants::TIME_YEAR_BASE, // year
+          1,                                                     // wday
+          0,                                                     // yday
+          0}),
+      *tm_data);
   // 1970-13-01 00:00:00 returns 1971-01-01 00:00:00.
-  seconds =
-      TimeConstants::DAYS_PER_NON_LEAP_YEAR * TimeConstants::SECONDS_PER_DAY;
+  seconds = LIBC_NAMESPACE::time_constants::DAYS_PER_NON_LEAP_YEAR *
+            LIBC_NAMESPACE::time_constants::SECONDS_PER_DAY;
   tm_data = LIBC_NAMESPACE::gmtime(&seconds);
-  EXPECT_TM_EQ((tm{0, // sec
-                   0, // min
-                   0, // hr
-                   1, // day
-                   0, // tm_mon starts with 0 for Jan
-                   1971 - TimeConstants::TIME_YEAR_BASE, // year
-                   5,                                    // wday
-                   0,                                    // yday
-                   0}),
-               *tm_data);
+  EXPECT_TM_EQ(
+      (tm{0, // sec
+          0, // min
+          0, // hr
+          1, // day
+          0, // tm_mon starts with 0 for Jan
+          1971 - LIBC_NAMESPACE::time_constants::TIME_YEAR_BASE, // year
+          5,                                                     // wday
+          0,                                                     // yday
+          0}),
+      *tm_data);
 }
 
 TEST(LlvmLibcGmTime, InvalidDays) {
   time_t seconds = 0;
   struct tm *tm_data = nullptr;
   // -1 day from 1970-01-01 00:00:00 returns 1969-12-31 00:00:00.
-  seconds = -1 * TimeConstants::SECONDS_PER_DAY;
+  seconds = -1 * LIBC_NAMESPACE::time_constants::SECONDS_PER_DAY;
   tm_data = LIBC_NAMESPACE::gmtime(&seconds);
-  EXPECT_TM_EQ((tm{0,  // sec
-                   0,  // min
-                   0,  // hr
-                   31, // day
-                   11, // tm_mon starts with 0 for Jan
-                   1969 - TimeConstants::TIME_YEAR_BASE, // year
-                   3,                                    // wday
-                   0,                                    // yday
-                   0}),
-               *tm_data);
+  EXPECT_TM_EQ(
+      (tm{0,  // sec
+          0,  // min
+          0,  // hr
+          31, // day
+          11, // tm_mon starts with 0 for Jan
+          1969 - LIBC_NAMESPACE::time_constants::TIME_YEAR_BASE, // year
+          3,                                                     // wday
+          0,                                                     // yday
+          0}),
+      *tm_data);
 
   // 1970-01-32 00:00:00 returns 1970-02-01 00:00:00.
-  seconds = 31 * TimeConstants::SECONDS_PER_DAY;
+  seconds = 31 * LIBC_NAMESPACE::time_constants::SECONDS_PER_DAY;
   tm_data = LIBC_NAMESPACE::gmtime(&seconds);
-  EXPECT_TM_EQ((tm{0, // sec
-                   0, // min
-                   0, // hr
-                   1, // day
-                   0, // tm_mon starts with 0 for Jan
-                   1970 - TimeConstants::TIME_YEAR_BASE, // year
-                   0,                                    // wday
-                   0,                                    // yday
-                   0}),
-               *tm_data);
+  EXPECT_TM_EQ(
+      (tm{0, // sec
+          0, // min
+          0, // hr
+          1, // day
+          0, // tm_mon starts with 0 for Jan
+          1970 - LIBC_NAMESPACE::time_constants::TIME_YEAR_BASE, // year
+          0,                                                     // wday
+          0,                                                     // yday
+          0}),
+      *tm_data);
 
   // 1970-02-29 00:00:00 returns 1970-03-01 00:00:00.
-  seconds = 59 * TimeConstants::SECONDS_PER_DAY;
+  seconds = 59 * LIBC_NAMESPACE::time_constants::SECONDS_PER_DAY;
   tm_data = LIBC_NAMESPACE::gmtime(&seconds);
-  EXPECT_TM_EQ((tm{0, // sec
-                   0, // min
-                   0, // hr
-                   1, // day
-                   2, // tm_mon starts with 0 for Jan
-                   1970 - TimeConstants::TIME_YEAR_BASE, // year
-                   0,                                    // wday
-                   0,                                    // yday
-                   0}),
-               *tm_data);
+  EXPECT_TM_EQ(
+      (tm{0, // sec
+          0, // min
+          0, // hr
+          1, // day
+          2, // tm_mon starts with 0 for Jan
+          1970 - LIBC_NAMESPACE::time_constants::TIME_YEAR_BASE, // year
+          0,                                                     // wday
+          0,                                                     // yday
+          0}),
+      *tm_data);
 
   // 1972-02-30 00:00:00 returns 1972-03-01 00:00:00.
-  seconds = ((2 * TimeConstants::DAYS_PER_NON_LEAP_YEAR) + 60) *
-            TimeConstants::SECONDS_PER_DAY;
+  seconds =
+      ((2 * LIBC_NAMESPACE::time_constants::DAYS_PER_NON_LEAP_YEAR) + 60) *
+      LIBC_NAMESPACE::time_constants::SECONDS_PER_DAY;
   tm_data = LIBC_NAMESPACE::gmtime(&seconds);
-  EXPECT_TM_EQ((tm{0, // sec
-                   0, // min
-                   0, // hr
-                   1, // day
-                   2, // tm_mon starts with 0 for Jan
-                   1972 - TimeConstants::TIME_YEAR_BASE, // year
-                   3,                                    // wday
-                   0,                                    // yday
-                   0}),
-               *tm_data);
+  EXPECT_TM_EQ(
+      (tm{0, // sec
+          0, // min
+          0, // hr
+          1, // day
+          2, // tm_mon starts with 0 for Jan
+          1972 - LIBC_NAMESPACE::time_constants::TIME_YEAR_BASE, // year
+          3,                                                     // wday
+          0,                                                     // yday
+          0}),
+      *tm_data);
 }
 
 TEST(LlvmLibcGmTime, EndOf32BitEpochYear) {
@@ -245,16 +263,17 @@ TEST(LlvmLibcGmTime, EndOf32BitEpochYear) {
   // Test implementation can encode time for Tue 19 January 2038 03:14:07 UTC.
   time_t seconds = 0x7FFFFFFF;
   struct tm *tm_data = LIBC_NAMESPACE::gmtime(&seconds);
-  EXPECT_TM_EQ((tm{7,  // sec
-                   14, // min
-                   3,  // hr
-                   19, // day
-                   0,  // tm_mon starts with 0 for Jan
-                   2038 - TimeConstants::TIME_YEAR_BASE, // year
-                   2,                                    // wday
-                   7,                                    // yday
-                   0}),
-               *tm_data);
+  EXPECT_TM_EQ(
+      (tm{7,  // sec
+          14, // min
+          3,  // hr
+          19, // day
+          0,  // tm_mon starts with 0 for Jan
+          2038 - LIBC_NAMESPACE::time_constants::TIME_YEAR_BASE, // year
+          2,                                                     // wday
+          7,                                                     // yday
+          0}),
+      *tm_data);
 }
 
 TEST(LlvmLibcGmTime, Max64BitYear) {
@@ -263,28 +282,30 @@ TEST(LlvmLibcGmTime, Max64BitYear) {
   // Mon Jan 1 12:50:50 2170 (200 years from 1970),
   time_t seconds = 6311479850;
   struct tm *tm_data = LIBC_NAMESPACE::gmtime(&seconds);
-  EXPECT_TM_EQ((tm{50, // sec
-                   50, // min
-                   12, // hr
-                   1,  // day
-                   0,  // tm_mon starts with 0 for Jan
-                   2170 - TimeConstants::TIME_YEAR_BASE, // year
-                   1,                                    // wday
-                   50,                                   // yday
-                   0}),
-               *tm_data);
+  EXPECT_TM_EQ(
+      (tm{50, // sec
+          50, // min
+          12, // hr
+          1,  // day
+          0,  // tm_mon starts with 0 for Jan
+          2170 - LIBC_NAMESPACE::time_constants::TIME_YEAR_BASE, // year
+          1,                                                     // wday
+          50,                                                    // yday
+          0}),
+      *tm_data);
 
   // Test for Tue Jan 1 12:50:50 in 2,147,483,647th year.
   seconds = 67767976202043050;
   tm_data = LIBC_NAMESPACE::gmtime(&seconds);
-  EXPECT_TM_EQ((tm{50, // sec
-                   50, // min
-                   12, // hr
-                   1,  // day
-                   0,  // tm_mon starts with 0 for Jan
-                   2147483647 - TimeConstants::TIME_YEAR_BASE, // year
-                   2,                                          // wday
-                   50,                                         // yday
-                   0}),
-               *tm_data);
+  EXPECT_TM_EQ(
+      (tm{50, // sec
+          50, // min
+          12, // hr
+          1,  // day
+          0,  // tm_mon starts with 0 for Jan
+          2147483647 - LIBC_NAMESPACE::time_constants::TIME_YEAR_BASE, // year
+          2,                                                           // wday
+          50,                                                          // yday
+          0}),
+      *tm_data);
 }
diff --git a/libc/test/src/time/mktime_test.cpp b/libc/test/src/time/mktime_test.cpp
index 84e6c7e..fe1116f 100644
--- a/libc/test/src/time/mktime_test.cpp
+++ b/libc/test/src/time/mktime_test.cpp
@@ -8,7 +8,7 @@
 
 #include "src/__support/CPP/limits.h" // INT_MAX
 #include "src/time/mktime.h"
-#include "src/time/time_utils.h"
+#include "src/time/time_constants.h"
 #include "test/UnitTest/ErrnoSetterMatcher.h"
 #include "test/UnitTest/Test.h"
 #include "test/src/time/TmHelper.h"
@@ -16,29 +16,37 @@
 
 using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Fails;
 using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Succeeds;
-using LIBC_NAMESPACE::time_utils::Month;
+using LIBC_NAMESPACE::time_constants::Month;
 
 static inline constexpr int tm_year(int year) {
-  return year - TimeConstants::TIME_YEAR_BASE;
+  return year - LIBC_NAMESPACE::time_constants::TIME_YEAR_BASE;
 }
 
 TEST(LlvmLibcMkTime, FailureSetsErrno) {
-  struct tm tm_data {
-    .tm_sec = INT_MAX, .tm_min = INT_MAX, .tm_hour = INT_MAX,
-    .tm_mday = INT_MAX, .tm_mon = INT_MAX - 1, .tm_year = tm_year(INT_MAX),
-    .tm_wday = 0, .tm_yday = 0, .tm_isdst = 0
-  };
+  struct tm tm_data{.tm_sec = INT_MAX,
+                    .tm_min = INT_MAX,
+                    .tm_hour = INT_MAX,
+                    .tm_mday = INT_MAX,
+                    .tm_mon = INT_MAX - 1,
+                    .tm_year = tm_year(INT_MAX),
+                    .tm_wday = 0,
+                    .tm_yday = 0,
+                    .tm_isdst = 0};
   EXPECT_THAT(LIBC_NAMESPACE::mktime(&tm_data), Fails(EOVERFLOW));
 }
 
 TEST(LlvmLibcMkTime, InvalidSeconds) {
   {
     // -1 second from 1970-01-01 00:00:00 returns 1969-12-31 23:59:59.
-    struct tm tm_data {
-      .tm_sec = -1, .tm_min = 0, .tm_hour = 0, .tm_mday = 1,
-      .tm_mon = Month::JANUARY, .tm_year = tm_year(1970), .tm_wday = 0,
-      .tm_yday = 0, .tm_isdst = 0
-    };
+    struct tm tm_data{.tm_sec = -1,
+                      .tm_min = 0,
+                      .tm_hour = 0,
+                      .tm_mday = 1,
+                      .tm_mon = Month::JANUARY,
+                      .tm_year = tm_year(1970),
+                      .tm_wday = 0,
+                      .tm_yday = 0,
+                      .tm_isdst = 0};
     EXPECT_THAT(LIBC_NAMESPACE::mktime(&tm_data), Succeeds(-1));
     EXPECT_TM_EQ((tm{.tm_sec = 59,
                      .tm_min = 59,
@@ -54,11 +62,15 @@ TEST(LlvmLibcMkTime, InvalidSeconds) {
 
   {
     // 60 seconds from 1970-01-01 00:00:00 returns 1970-01-01 00:01:00.
-    struct tm tm_data {
-      .tm_sec = 60, .tm_min = 0, .tm_hour = 0, .tm_mday = 1,
-      .tm_mon = Month::JANUARY, .tm_year = tm_year(1970), .tm_wday = 0,
-      .tm_yday = 0, .tm_isdst = 0
-    };
+    struct tm tm_data{.tm_sec = 60,
+                      .tm_min = 0,
+                      .tm_hour = 0,
+                      .tm_mday = 1,
+                      .tm_mon = Month::JANUARY,
+                      .tm_year = tm_year(1970),
+                      .tm_wday = 0,
+                      .tm_yday = 0,
+                      .tm_isdst = 0};
     EXPECT_THAT(LIBC_NAMESPACE::mktime(&tm_data), Succeeds(60));
     EXPECT_TM_EQ((tm{.tm_sec = 0,
                      .tm_min = 1,
@@ -76,13 +88,17 @@ TEST(LlvmLibcMkTime, InvalidSeconds) {
 TEST(LlvmLibcMkTime, InvalidMinutes) {
   {
     // -1 minute from 1970-01-01 00:00:00 returns 1969-12-31 23:59:00.
-    struct tm tm_data {
-      .tm_sec = 0, .tm_min = -1, .tm_hour = 0, .tm_mday = 1,
-      .tm_mon = Month::JANUARY, .tm_year = tm_year(1970), .tm_wday = 0,
-      .tm_yday = 0, .tm_isdst = 0
-    };
+    struct tm tm_data{.tm_sec = 0,
+                      .tm_min = -1,
+                      .tm_hour = 0,
+                      .tm_mday = 1,
+                      .tm_mon = Month::JANUARY,
+                      .tm_year = tm_year(1970),
+                      .tm_wday = 0,
+                      .tm_yday = 0,
+                      .tm_isdst = 0};
     EXPECT_THAT(LIBC_NAMESPACE::mktime(&tm_data),
-                Succeeds(-TimeConstants::SECONDS_PER_MIN));
+                Succeeds(-LIBC_NAMESPACE::time_constants::SECONDS_PER_MIN));
     EXPECT_TM_EQ((tm{.tm_sec = 0,
                      .tm_min = 59,
                      .tm_hour = 23,
@@ -97,13 +113,17 @@ TEST(LlvmLibcMkTime, InvalidMinutes) {
 
   {
     // 60 minutes from 1970-01-01 00:00:00 returns 1970-01-01 01:00:00.
-    struct tm tm_data {
-      .tm_sec = 0, .tm_min = 60, .tm_hour = 0, .tm_mday = 1,
-      .tm_mon = Month::JANUARY, .tm_year = tm_year(1970), .tm_wday = 0,
-      .tm_yday = 0, .tm_isdst = 0
-    };
+    struct tm tm_data{.tm_sec = 0,
+                      .tm_min = 60,
+                      .tm_hour = 0,
+                      .tm_mday = 1,
+                      .tm_mon = Month::JANUARY,
+                      .tm_year = tm_year(1970),
+                      .tm_wday = 0,
+                      .tm_yday = 0,
+                      .tm_isdst = 0};
     EXPECT_THAT(LIBC_NAMESPACE::mktime(&tm_data),
-                Succeeds(60 * TimeConstants::SECONDS_PER_MIN));
+                Succeeds(60 * LIBC_NAMESPACE::time_constants::SECONDS_PER_MIN));
     EXPECT_TM_EQ((tm{.tm_sec = 0,
                      .tm_min = 0,
                      .tm_hour = 1,
@@ -120,13 +140,17 @@ TEST(LlvmLibcMkTime, InvalidMinutes) {
 TEST(LlvmLibcMkTime, InvalidHours) {
   {
     // -1 hour from 1970-01-01 00:00:00 returns 1969-12-31 23:00:00.
-    struct tm tm_data {
-      .tm_sec = 0, .tm_min = 0, .tm_hour = -1, .tm_mday = 1,
-      .tm_mon = Month::JANUARY, .tm_year = tm_year(1970), .tm_wday = 0,
-      .tm_yday = 0, .tm_isdst = 0
-    };
+    struct tm tm_data{.tm_sec = 0,
+                      .tm_min = 0,
+                      .tm_hour = -1,
+                      .tm_mday = 1,
+                      .tm_mon = Month::JANUARY,
+                      .tm_year = tm_year(1970),
+                      .tm_wday = 0,
+                      .tm_yday = 0,
+                      .tm_isdst = 0};
     EXPECT_THAT(LIBC_NAMESPACE::mktime(&tm_data),
-                Succeeds(-TimeConstants::SECONDS_PER_HOUR));
+                Succeeds(-LIBC_NAMESPACE::time_constants::SECONDS_PER_HOUR));
     EXPECT_TM_EQ((tm{.tm_sec = 0,
                      .tm_min = 0,
                      .tm_hour = 23,
@@ -141,13 +165,18 @@ TEST(LlvmLibcMkTime, InvalidHours) {
 
   {
     // 24 hours from 1970-01-01 00:00:00 returns 1970-01-02 00:00:00.
-    struct tm tm_data {
-      .tm_sec = 0, .tm_min = 0, .tm_hour = 24, .tm_mday = 1,
-      .tm_mon = Month::JANUARY, .tm_year = tm_year(1970), .tm_wday = 0,
-      .tm_yday = 0, .tm_isdst = 0
-    };
-    EXPECT_THAT(LIBC_NAMESPACE::mktime(&tm_data),
-                Succeeds(24 * TimeConstants::SECONDS_PER_HOUR));
+    struct tm tm_data{.tm_sec = 0,
+                      .tm_min = 0,
+                      .tm_hour = 24,
+                      .tm_mday = 1,
+                      .tm_mon = Month::JANUARY,
+                      .tm_year = tm_year(1970),
+                      .tm_wday = 0,
+                      .tm_yday = 0,
+                      .tm_isdst = 0};
+    EXPECT_THAT(
+        LIBC_NAMESPACE::mktime(&tm_data),
+        Succeeds(24 * LIBC_NAMESPACE::time_constants::SECONDS_PER_HOUR));
     EXPECT_TM_EQ((tm{.tm_sec = 0,
                      .tm_min = 0,
                      .tm_hour = 0,
@@ -163,14 +192,18 @@ TEST(LlvmLibcMkTime, InvalidHours) {
 
 TEST(LlvmLibcMkTime, InvalidYear) {
   // -1 year from 1970-01-01 00:00:00 returns 1969-01-01 00:00:00.
-  struct tm tm_data {
-    .tm_sec = 0, .tm_min = 0, .tm_hour = 0, .tm_mday = 1,
-    .tm_mon = Month::JANUARY, .tm_year = tm_year(1969), .tm_wday = 0,
-    .tm_yday = 0, .tm_isdst = 0
-  };
+  struct tm tm_data{.tm_sec = 0,
+                    .tm_min = 0,
+                    .tm_hour = 0,
+                    .tm_mday = 1,
+                    .tm_mon = Month::JANUARY,
+                    .tm_year = tm_year(1969),
+                    .tm_wday = 0,
+                    .tm_yday = 0,
+                    .tm_isdst = 0};
   EXPECT_THAT(LIBC_NAMESPACE::mktime(&tm_data),
-              Succeeds(-TimeConstants::DAYS_PER_NON_LEAP_YEAR *
-                       TimeConstants::SECONDS_PER_DAY));
+              Succeeds(-LIBC_NAMESPACE::time_constants::DAYS_PER_NON_LEAP_YEAR *
+                       LIBC_NAMESPACE::time_constants::SECONDS_PER_DAY));
   EXPECT_TM_EQ((tm{.tm_sec = 0,
                    .tm_min = 0,
                    .tm_hour = 0,
@@ -188,61 +221,85 @@ TEST(LlvmLibcMkTime, InvalidEndOf32BitEpochYear) {
     return;
   {
     // 2038-01-19 03:14:08 tests overflow of the second in 2038.
-    struct tm tm_data {
-      .tm_sec = 8, .tm_min = 14, .tm_hour = 3, .tm_mday = 19,
-      .tm_mon = Month::JANUARY, .tm_year = tm_year(2038), .tm_wday = 0,
-      .tm_yday = 0, .tm_isdst = 0
-    };
+    struct tm tm_data{.tm_sec = 8,
+                      .tm_min = 14,
+                      .tm_hour = 3,
+                      .tm_mday = 19,
+                      .tm_mon = Month::JANUARY,
+                      .tm_year = tm_year(2038),
+                      .tm_wday = 0,
+                      .tm_yday = 0,
+                      .tm_isdst = 0};
     EXPECT_THAT(LIBC_NAMESPACE::mktime(&tm_data), Fails(EOVERFLOW));
   }
 
   {
     // 2038-01-19 03:15:07 tests overflow of the minute in 2038.
-    struct tm tm_data {
-      .tm_sec = 7, .tm_min = 15, .tm_hour = 3, .tm_mday = 19,
-      .tm_mon = Month::JANUARY, .tm_year = tm_year(2038), .tm_wday = 0,
-      .tm_yday = 0, .tm_isdst = 0
-    };
+    struct tm tm_data{.tm_sec = 7,
+                      .tm_min = 15,
+                      .tm_hour = 3,
+                      .tm_mday = 19,
+                      .tm_mon = Month::JANUARY,
+                      .tm_year = tm_year(2038),
+                      .tm_wday = 0,
+                      .tm_yday = 0,
+                      .tm_isdst = 0};
     EXPECT_THAT(LIBC_NAMESPACE::mktime(&tm_data), Fails(EOVERFLOW));
   }
 
   {
     // 2038-01-19 04:14:07 tests overflow of the hour in 2038.
-    struct tm tm_data {
-      .tm_sec = 7, .tm_min = 14, .tm_hour = 4, .tm_mday = 19,
-      .tm_mon = Month::JANUARY, .tm_year = tm_year(2038), .tm_wday = 0,
-      .tm_yday = 0, .tm_isdst = 0
-    };
+    struct tm tm_data{.tm_sec = 7,
+                      .tm_min = 14,
+                      .tm_hour = 4,
+                      .tm_mday = 19,
+                      .tm_mon = Month::JANUARY,
+                      .tm_year = tm_year(2038),
+                      .tm_wday = 0,
+                      .tm_yday = 0,
+                      .tm_isdst = 0};
     EXPECT_THAT(LIBC_NAMESPACE::mktime(&tm_data), Fails(EOVERFLOW));
   }
 
   {
     // 2038-01-20 03:14:07 tests overflow of the day in 2038.
-    struct tm tm_data {
-      .tm_sec = 7, .tm_min = 14, .tm_hour = 3, .tm_mday = 20,
-      .tm_mon = Month::JANUARY, .tm_year = tm_year(2038), .tm_wday = 0,
-      .tm_yday = 0, .tm_isdst = 0
-    };
+    struct tm tm_data{.tm_sec = 7,
+                      .tm_min = 14,
+                      .tm_hour = 3,
+                      .tm_mday = 20,
+                      .tm_mon = Month::JANUARY,
+                      .tm_year = tm_year(2038),
+                      .tm_wday = 0,
+                      .tm_yday = 0,
+                      .tm_isdst = 0};
     EXPECT_THAT(LIBC_NAMESPACE::mktime(&tm_data), Fails(EOVERFLOW));
   }
 
   {
     // 2038-02-19 03:14:07 tests overflow of the month in 2038.
-    struct tm tm_data {
-      .tm_sec = 7, .tm_min = 14, .tm_hour = 3, .tm_mday = 19,
-      .tm_mon = Month::FEBRUARY, .tm_year = tm_year(2038), .tm_wday = 0,
-      .tm_yday = 0, .tm_isdst = 0
-    };
+    struct tm tm_data{.tm_sec = 7,
+                      .tm_min = 14,
+                      .tm_hour = 3,
+                      .tm_mday = 19,
+                      .tm_mon = Month::FEBRUARY,
+                      .tm_year = tm_year(2038),
+                      .tm_wday = 0,
+                      .tm_yday = 0,
+                      .tm_isdst = 0};
     EXPECT_THAT(LIBC_NAMESPACE::mktime(&tm_data), Fails(EOVERFLOW));
   }
 
   {
     // 2039-01-19 03:14:07 tests overflow of the year.
-    struct tm tm_data {
-      .tm_sec = 7, .tm_min = 14, .tm_hour = 3, .tm_mday = 19,
-      .tm_mon = Month::JANUARY, .tm_year = tm_year(2039), .tm_wday = 0,
-      .tm_yday = 0, .tm_isdst = 0
-    };
+    struct tm tm_data{.tm_sec = 7,
+                      .tm_min = 14,
+                      .tm_hour = 3,
+                      .tm_mday = 19,
+                      .tm_mon = Month::JANUARY,
+                      .tm_year = tm_year(2039),
+                      .tm_wday = 0,
+                      .tm_yday = 0,
+                      .tm_isdst = 0};
     EXPECT_THAT(LIBC_NAMESPACE::mktime(&tm_data), Fails(EOVERFLOW));
   }
 }
@@ -250,12 +307,18 @@ TEST(LlvmLibcMkTime, InvalidEndOf32BitEpochYear) {
 TEST(LlvmLibcMkTime, InvalidMonths) {
   {
     // -1 month from 1970-01-01 00:00:00 returns 1969-12-01 00:00:00.
-    struct tm tm_data {
-      .tm_sec = 0, .tm_min = 0, .tm_hour = 0, .tm_mday = 0, .tm_mon = -1,
-      .tm_year = tm_year(1970), .tm_wday = 0, .tm_yday = 0, .tm_isdst = 0
-    };
-    EXPECT_THAT(LIBC_NAMESPACE::mktime(&tm_data),
-                Succeeds(-32 * TimeConstants::SECONDS_PER_DAY));
+    struct tm tm_data{.tm_sec = 0,
+                      .tm_min = 0,
+                      .tm_hour = 0,
+                      .tm_mday = 0,
+                      .tm_mon = -1,
+                      .tm_year = tm_year(1970),
+                      .tm_wday = 0,
+                      .tm_yday = 0,
+                      .tm_isdst = 0};
+    EXPECT_THAT(
+        LIBC_NAMESPACE::mktime(&tm_data),
+        Succeeds(-32 * LIBC_NAMESPACE::time_constants::SECONDS_PER_DAY));
     EXPECT_TM_EQ((tm{.tm_sec = 0,
                      .tm_min = 0,
                      .tm_hour = 0,
@@ -270,13 +333,19 @@ TEST(LlvmLibcMkTime, InvalidMonths) {
 
   {
     // 1970-13-01 00:00:00 returns 1971-01-01 00:00:00.
-    struct tm tm_data {
-      .tm_sec = 0, .tm_min = 0, .tm_hour = 0, .tm_mday = 1, .tm_mon = 12,
-      .tm_year = tm_year(1970), .tm_wday = 0, .tm_yday = 0, .tm_isdst = 0
-    };
-    EXPECT_THAT(LIBC_NAMESPACE::mktime(&tm_data),
-                Succeeds(TimeConstants::DAYS_PER_NON_LEAP_YEAR *
-                         TimeConstants::SECONDS_PER_DAY));
+    struct tm tm_data{.tm_sec = 0,
+                      .tm_min = 0,
+                      .tm_hour = 0,
+                      .tm_mday = 1,
+                      .tm_mon = 12,
+                      .tm_year = tm_year(1970),
+                      .tm_wday = 0,
+                      .tm_yday = 0,
+                      .tm_isdst = 0};
+    EXPECT_THAT(
+        LIBC_NAMESPACE::mktime(&tm_data),
+        Succeeds(LIBC_NAMESPACE::time_constants::DAYS_PER_NON_LEAP_YEAR *
+                 LIBC_NAMESPACE::time_constants::SECONDS_PER_DAY));
     EXPECT_TM_EQ((tm{.tm_sec = 0,
                      .tm_min = 0,
                      .tm_hour = 0,
@@ -293,13 +362,17 @@ TEST(LlvmLibcMkTime, InvalidMonths) {
 TEST(LlvmLibcMkTime, InvalidDays) {
   {
     // -1 day from 1970-01-01 00:00:00 returns 1969-12-31 00:00:00.
-    struct tm tm_data {
-      .tm_sec = 0, .tm_min = 0, .tm_hour = 0, .tm_mday = (1 - 1),
-      .tm_mon = Month::JANUARY, .tm_year = tm_year(1970), .tm_wday = 0,
-      .tm_yday = 0, .tm_isdst = 0
-    };
+    struct tm tm_data{.tm_sec = 0,
+                      .tm_min = 0,
+                      .tm_hour = 0,
+                      .tm_mday = (1 - 1),
+                      .tm_mon = Month::JANUARY,
+                      .tm_year = tm_year(1970),
+                      .tm_wday = 0,
+                      .tm_yday = 0,
+                      .tm_isdst = 0};
     EXPECT_THAT(LIBC_NAMESPACE::mktime(&tm_data),
-                Succeeds(-1 * TimeConstants::SECONDS_PER_DAY));
+                Succeeds(-1 * LIBC_NAMESPACE::time_constants::SECONDS_PER_DAY));
     EXPECT_TM_EQ((tm{.tm_sec = 0,
                      .tm_min = 0,
                      .tm_hour = 0,
@@ -314,13 +387,17 @@ TEST(LlvmLibcMkTime, InvalidDays) {
 
   {
     // 1970-01-32 00:00:00 returns 1970-02-01 00:00:00.
-    struct tm tm_data {
-      .tm_sec = 0, .tm_min = 0, .tm_hour = 0, .tm_mday = 32,
-      .tm_mon = Month::JANUARY, .tm_year = tm_year(1970), .tm_wday = 0,
-      .tm_yday = 0, .tm_isdst = 0
-    };
+    struct tm tm_data{.tm_sec = 0,
+                      .tm_min = 0,
+                      .tm_hour = 0,
+                      .tm_mday = 32,
+                      .tm_mon = Month::JANUARY,
+                      .tm_year = tm_year(1970),
+                      .tm_wday = 0,
+                      .tm_yday = 0,
+                      .tm_isdst = 0};
     EXPECT_THAT(LIBC_NAMESPACE::mktime(&tm_data),
-                Succeeds(31 * TimeConstants::SECONDS_PER_DAY));
+                Succeeds(31 * LIBC_NAMESPACE::time_constants::SECONDS_PER_DAY));
     EXPECT_TM_EQ((tm{.tm_sec = 0,
                      .tm_min = 0,
                      .tm_hour = 0,
@@ -335,13 +412,17 @@ TEST(LlvmLibcMkTime, InvalidDays) {
 
   {
     // 1970-02-29 00:00:00 returns 1970-03-01 00:00:00.
-    struct tm tm_data {
-      .tm_sec = 0, .tm_min = 0, .tm_hour = 0, .tm_mday = 29,
-      .tm_mon = Month::FEBRUARY, .tm_year = tm_year(1970), .tm_wday = 0,
-      .tm_yday = 0, .tm_isdst = 0
-    };
+    struct tm tm_data{.tm_sec = 0,
+                      .tm_min = 0,
+                      .tm_hour = 0,
+                      .tm_mday = 29,
+                      .tm_mon = Month::FEBRUARY,
+                      .tm_year = tm_year(1970),
+                      .tm_wday = 0,
+                      .tm_yday = 0,
+                      .tm_isdst = 0};
     EXPECT_THAT(LIBC_NAMESPACE::mktime(&tm_data),
-                Succeeds(59 * TimeConstants::SECONDS_PER_DAY));
+                Succeeds(59 * LIBC_NAMESPACE::time_constants::SECONDS_PER_DAY));
     EXPECT_TM_EQ((tm{.tm_sec = 0,
                      .tm_min = 0,
                      .tm_hour = 0,
@@ -356,14 +437,20 @@ TEST(LlvmLibcMkTime, InvalidDays) {
 
   {
     // 1972-02-30 00:00:00 returns 1972-03-01 00:00:00.
-    struct tm tm_data {
-      .tm_sec = 0, .tm_min = 0, .tm_hour = 0, .tm_mday = 30,
-      .tm_mon = Month::FEBRUARY, .tm_year = tm_year(1972), .tm_wday = 0,
-      .tm_yday = 0, .tm_isdst = 0
-    };
-    EXPECT_THAT(LIBC_NAMESPACE::mktime(&tm_data),
-                Succeeds(((2 * TimeConstants::DAYS_PER_NON_LEAP_YEAR) + 60) *
-                         TimeConstants::SECONDS_PER_DAY));
+    struct tm tm_data{.tm_sec = 0,
+                      .tm_min = 0,
+                      .tm_hour = 0,
+                      .tm_mday = 30,
+                      .tm_mon = Month::FEBRUARY,
+                      .tm_year = tm_year(1972),
+                      .tm_wday = 0,
+                      .tm_yday = 0,
+                      .tm_isdst = 0};
+    EXPECT_THAT(
+        LIBC_NAMESPACE::mktime(&tm_data),
+        Succeeds(((2 * LIBC_NAMESPACE::time_constants::DAYS_PER_NON_LEAP_YEAR) +
+                  60) *
+                 LIBC_NAMESPACE::time_constants::SECONDS_PER_DAY));
     EXPECT_TM_EQ((tm{.tm_sec = 0,
                      .tm_min = 0,
                      .tm_hour = 0,
@@ -381,11 +468,15 @@ TEST(LlvmLibcMkTime, EndOf32BitEpochYear) {
   // Test for maximum value of a signed 32-bit integer.
   // Test implementation can encode time for Tue 19 January 2038 03:14:07 UTC.
   {
-    struct tm tm_data {
-      .tm_sec = 7, .tm_min = 14, .tm_hour = 3, .tm_mday = 19,
-      .tm_mon = Month::JANUARY, .tm_year = tm_year(2038), .tm_wday = 0,
-      .tm_yday = 0, .tm_isdst = 0
-    };
+    struct tm tm_data{.tm_sec = 7,
+                      .tm_min = 14,
+                      .tm_hour = 3,
+                      .tm_mday = 19,
+                      .tm_mon = Month::JANUARY,
+                      .tm_year = tm_year(2038),
+                      .tm_wday = 0,
+                      .tm_yday = 0,
+                      .tm_isdst = 0};
     EXPECT_THAT(LIBC_NAMESPACE::mktime(&tm_data), Succeeds(0x7FFFFFFF));
     EXPECT_TM_EQ((tm{.tm_sec = 7,
                      .tm_min = 14,
@@ -403,11 +494,15 @@ TEST(LlvmLibcMkTime, EndOf32BitEpochYear) {
   {
     // 2038-01-19 03:13:59 tests that even a large seconds field is
     // accepted if the minutes field is smaller.
-    struct tm tm_data {
-      .tm_sec = 59, .tm_min = 13, .tm_hour = 3, .tm_mday = 19,
-      .tm_mon = Month::JANUARY, .tm_year = tm_year(2038), .tm_wday = 0,
-      .tm_yday = 0, .tm_isdst = 0
-    };
+    struct tm tm_data{.tm_sec = 59,
+                      .tm_min = 13,
+                      .tm_hour = 3,
+                      .tm_mday = 19,
+                      .tm_mon = Month::JANUARY,
+                      .tm_year = tm_year(2038),
+                      .tm_wday = 0,
+                      .tm_yday = 0,
+                      .tm_isdst = 0};
     EXPECT_THAT(LIBC_NAMESPACE::mktime(&tm_data), Succeeds(0x7FFFFFFF - 8));
     EXPECT_TM_EQ((tm{.tm_sec = 59,
                      .tm_min = 13,
@@ -424,13 +519,18 @@ TEST(LlvmLibcMkTime, EndOf32BitEpochYear) {
   {
     // 2038-01-19 02:59:59 tests that large seconds and minutes are
     // accepted if the hours field is smaller.
-    struct tm tm_data {
-      .tm_sec = 59, .tm_min = 59, .tm_hour = 2, .tm_mday = 19,
-      .tm_mon = Month::JANUARY, .tm_year = tm_year(2038), .tm_wday = 0,
-      .tm_yday = 0, .tm_isdst = 0
-    };
+    struct tm tm_data{.tm_sec = 59,
+                      .tm_min = 59,
+                      .tm_hour = 2,
+                      .tm_mday = 19,
+                      .tm_mon = Month::JANUARY,
+                      .tm_year = tm_year(2038),
+                      .tm_wday = 0,
+                      .tm_yday = 0,
+                      .tm_isdst = 0};
     EXPECT_THAT(LIBC_NAMESPACE::mktime(&tm_data),
-                Succeeds(0x7FFFFFFF - 8 - 14 * TimeConstants::SECONDS_PER_MIN));
+                Succeeds(0x7FFFFFFF - 8 -
+                         14 * LIBC_NAMESPACE::time_constants::SECONDS_PER_MIN));
     EXPECT_TM_EQ((tm{.tm_sec = 59,
                      .tm_min = 59,
                      .tm_hour = 2,
@@ -446,14 +546,19 @@ TEST(LlvmLibcMkTime, EndOf32BitEpochYear) {
   {
     // 2038-01-18 23:59:59 tests that large seconds, minutes and hours
     // are accepted if the days field is smaller.
-    struct tm tm_data {
-      .tm_sec = 59, .tm_min = 59, .tm_hour = 23, .tm_mday = 18,
-      .tm_mon = Month::JANUARY, .tm_year = tm_year(2038), .tm_wday = 0,
-      .tm_yday = 0, .tm_isdst = 0
-    };
+    struct tm tm_data{.tm_sec = 59,
+                      .tm_min = 59,
+                      .tm_hour = 23,
+                      .tm_mday = 18,
+                      .tm_mon = Month::JANUARY,
+                      .tm_year = tm_year(2038),
+                      .tm_wday = 0,
+                      .tm_yday = 0,
+                      .tm_isdst = 0};
     EXPECT_THAT(LIBC_NAMESPACE::mktime(&tm_data),
-                Succeeds(0x7FFFFFFF - 8 - 14 * TimeConstants::SECONDS_PER_MIN -
-                         3 * TimeConstants::SECONDS_PER_HOUR));
+                Succeeds(0x7FFFFFFF - 8 -
+                         14 * LIBC_NAMESPACE::time_constants::SECONDS_PER_MIN -
+                         3 * LIBC_NAMESPACE::time_constants::SECONDS_PER_HOUR));
     EXPECT_TM_EQ((tm{.tm_sec = 59,
                      .tm_min = 59,
                      .tm_hour = 23,
@@ -469,15 +574,20 @@ TEST(LlvmLibcMkTime, EndOf32BitEpochYear) {
   {
     // 2038-01-18 23:59:59 tests that the final second of 2037 is
     // accepted.
-    struct tm tm_data {
-      .tm_sec = 59, .tm_min = 59, .tm_hour = 23, .tm_mday = 31,
-      .tm_mon = Month::DECEMBER, .tm_year = tm_year(2037), .tm_wday = 0,
-      .tm_yday = 0, .tm_isdst = 0
-    };
+    struct tm tm_data{.tm_sec = 59,
+                      .tm_min = 59,
+                      .tm_hour = 23,
+                      .tm_mday = 31,
+                      .tm_mon = Month::DECEMBER,
+                      .tm_year = tm_year(2037),
+                      .tm_wday = 0,
+                      .tm_yday = 0,
+                      .tm_isdst = 0};
     EXPECT_THAT(LIBC_NAMESPACE::mktime(&tm_data),
-                Succeeds(0x7FFFFFFF - 8 - 14 * TimeConstants::SECONDS_PER_MIN -
-                         3 * TimeConstants::SECONDS_PER_HOUR -
-                         18 * TimeConstants::SECONDS_PER_DAY));
+                Succeeds(0x7FFFFFFF - 8 -
+                         14 * LIBC_NAMESPACE::time_constants::SECONDS_PER_MIN -
+                         3 * LIBC_NAMESPACE::time_constants::SECONDS_PER_HOUR -
+                         18 * LIBC_NAMESPACE::time_constants::SECONDS_PER_DAY));
     EXPECT_TM_EQ((tm{.tm_sec = 59,
                      .tm_min = 59,
                      .tm_hour = 23,
@@ -496,11 +606,15 @@ TEST(LlvmLibcMkTime, Max64BitYear) {
     return;
   {
     // Mon Jan 1 12:50:50 2170 (200 years from 1970),
-    struct tm tm_data {
-      .tm_sec = 50, .tm_min = 50, .tm_hour = 12, .tm_mday = 1,
-      .tm_mon = Month::JANUARY, .tm_year = tm_year(2170), .tm_wday = 0,
-      .tm_yday = 0, .tm_isdst = 0
-    };
+    struct tm tm_data{.tm_sec = 50,
+                      .tm_min = 50,
+                      .tm_hour = 12,
+                      .tm_mday = 1,
+                      .tm_mon = Month::JANUARY,
+                      .tm_year = tm_year(2170),
+                      .tm_wday = 0,
+                      .tm_yday = 0,
+                      .tm_isdst = 0};
     EXPECT_THAT(LIBC_NAMESPACE::mktime(&tm_data), Succeeds(6311479850));
     EXPECT_TM_EQ((tm{.tm_sec = 50,
                      .tm_min = 50,
@@ -516,11 +630,15 @@ TEST(LlvmLibcMkTime, Max64BitYear) {
 
   {
     // Test for Tue Jan 1 12:50:50 in 2,147,483,647th year.
-    struct tm tm_data {
-      .tm_sec = 50, .tm_min = 50, .tm_hour = 12, .tm_mday = 1,
-      .tm_mon = Month::JANUARY, .tm_year = tm_year(2147483647), .tm_wday = 0,
-      .tm_yday = 0, .tm_isdst = 0
-    };
+    struct tm tm_data{.tm_sec = 50,
+                      .tm_min = 50,
+                      .tm_hour = 12,
+                      .tm_mday = 1,
+                      .tm_mon = Month::JANUARY,
+                      .tm_year = tm_year(2147483647),
+                      .tm_wday = 0,
+                      .tm_yday = 0,
+                      .tm_isdst = 0};
     EXPECT_THAT(LIBC_NAMESPACE::mktime(&tm_data), Succeeds(67767976202043050));
     EXPECT_TM_EQ((tm{.tm_sec = 50,
                      .tm_min = 50,
diff --git a/libc/test/src/time/nanosleep_test.cpp b/libc/test/src/time/nanosleep_test.cpp
index 2a6eea4..d4f98e2 100644
--- a/libc/test/src/time/nanosleep_test.cpp
+++ b/libc/test/src/time/nanosleep_test.cpp
@@ -6,8 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include <time.h>
-
+#include "hdr/types/struct_timespec.h"
 #include "src/errno/libc_errno.h"
 #include "src/time/nanosleep.h"
 #include "test/UnitTest/ErrnoSetterMatcher.h"
diff --git a/libc/utils/CMakeLists.txt b/libc/utils/CMakeLists.txt
index 11f2550..a33c13a 100644
--- a/libc/utils/CMakeLists.txt
+++ b/libc/utils/CMakeLists.txt
@@ -1,3 +1,5 @@
+add_subdirectory(hdrgen)
+
 if(LLVM_INCLUDE_TESTS)
   add_subdirectory(MPFRWrapper)
 endif()
diff --git a/libc/utils/docgen/arpa/inet.yaml b/libc/utils/docgen/arpa/inet.yaml
index 7f388cb..d725759 100644
--- a/libc/utils/docgen/arpa/inet.yaml
+++ b/libc/utils/docgen/arpa/inet.yaml
@@ -1,18 +1,18 @@
 functions:
   htonl:
-    posix-definition: ''
+    in-latest-posix: ''
   htons:
-    posix-definition: ''
+    in-latest-posix: ''
   inet_addr:
-    posix-definition: ''
+    in-latest-posix: ''
   inet_ntoa:
-    posix-definition: ''
+    in-latest-posix: ''
   inet_ntop:
-    posix-definition: ''
+    in-latest-posix: ''
   inet_pton:
-    posix-definition: ''
+    in-latest-posix: ''
   ntohl:
-    posix-definition: ''
+    in-latest-posix: ''
   ntohs:
-    posix-definition: ''
+    in-latest-posix: ''
 
diff --git a/libc/utils/docgen/strings.yaml b/libc/utils/docgen/strings.yaml
new file mode 100644
index 0000000..082b92c
--- /dev/null
+++ b/libc/utils/docgen/strings.yaml
@@ -0,0 +1,26 @@
+functions:
+  bcmp:
+    removed-in-posix-2008: ''
+  bcopy:
+    removed-in-posix-2008: ''
+  bzero:
+    removed-in-posix-2008: ''
+  ffs:
+    in-latest-posix: ''
+  ffsl:
+    in-latest-posix: ''
+  ffsll:
+    in-latest-posix: ''
+  index:
+    removed-in-posix-2008: ''
+  rindex:
+    removed-in-posix-2008: ''
+  strcasecmp:
+    in-latest-posix: ''
+  strcasecmp_l:
+    in-latest-posix: ''
+  strncasecmp:
+    in-latest-posix: ''
+  strncasecmp_l:
+    in-latest-posix: ''
+
diff --git a/libc/utils/docgen/sys/mman.yaml b/libc/utils/docgen/sys/mman.yaml
index dba26ca..94c6b9b 100644
--- a/libc/utils/docgen/sys/mman.yaml
+++ b/libc/utils/docgen/sys/mman.yaml
@@ -1,77 +1,77 @@
 functions:
   mlock:
-    posix-definition: ''
+    in-latest-posix: ''
   mlockall:
-    posix-definition: ''
+    in-latest-posix: ''
   mmap:
-    posix-definition: ''
+    in-latest-posix: ''
   mprotect:
-    posix-definition: ''
+    in-latest-posix: ''
   msync:
-    posix-definition: ''
+    in-latest-posix: ''
   munlock:
-    posix-definition: ''
+    in-latest-posix: ''
   munlockall:
-    posix-definition: ''
+    in-latest-posix: ''
   munmap:
-    posix-definition: ''
+    in-latest-posix: ''
   posix_madvise:
-    posix-definition: ''
+    in-latest-posix: ''
   posix_mem_offset:
-    posix-definition: ''
+    in-latest-posix: ''
   posix_typed_mem_get_info:
-    posix-definition: ''
+    in-latest-posix: ''
   posix_typed_mem_open:
-    posix-definition: ''
+    in-latest-posix: ''
   shm_open:
-    posix-definition: ''
+    in-latest-posix: ''
   shm_unlink:
-    posix-definition: ''
+    in-latest-posix: ''
 macros:
   MAP_ANON:
-    posix-definition: ''
+    in-latest-posix: ''
   MAP_ANONYMOUS:
-    posix-definition: ''
+    in-latest-posix: ''
   MAP_FAILED:
-    posix-definition: ''
+    in-latest-posix: ''
   MAP_FIXED:
-    posix-definition: ''
+    in-latest-posix: ''
   MAP_PRIVATE:
-    posix-definition: ''
+    in-latest-posix: ''
   MAP_SHARED:
-    posix-definition: ''
+    in-latest-posix: ''
   MCL_CURRENT:
-    posix-definition: ''
+    in-latest-posix: ''
   MCL_FUTURE:
-    posix-definition: ''
+    in-latest-posix: ''
   MS_ASYNC:
-    posix-definition: ''
+    in-latest-posix: ''
   MS_INVALIDATE:
-    posix-definition: ''
+    in-latest-posix: ''
   MS_SYNC:
-    posix-definition: ''
+    in-latest-posix: ''
   POSIX_MADV_DONTNEED:
-    posix-definition: ''
+    in-latest-posix: ''
   POSIX_MADV_NORMAL:
-    posix-definition: ''
+    in-latest-posix: ''
   POSIX_MADV_RANDOM:
-    posix-definition: ''
+    in-latest-posix: ''
   POSIX_MADV_SEQUENTIAL:
-    posix-definition: ''
+    in-latest-posix: ''
   POSIX_MADV_WILLNEED:
-    posix-definition: ''
+    in-latest-posix: ''
   POSIX_TYPED_MEM_ALLOCATE:
-    posix-definition: ''
+    in-latest-posix: ''
   POSIX_TYPED_MEM_ALLOCATE_CONTIG:
-    posix-definition: ''
+    in-latest-posix: ''
   POSIX_TYPED_MEM_MAP_ALLOCATABLE:
-    posix-definition: ''
+    in-latest-posix: ''
   PROT_EXEC:
-    posix-definition: ''
+    in-latest-posix: ''
   PROT_NONE:
-    posix-definition: ''
+    in-latest-posix: ''
   PROT_READ:
-    posix-definition: ''
+    in-latest-posix: ''
   PROT_WRITE:
-    posix-definition: ''
+    in-latest-posix: ''
 
diff --git a/libc/hdrgen/CMakeLists.txt b/libc/utils/hdrgen/CMakeLists.txt
index 8ebde4e3..c6827da 100644
--- a/libc/hdrgen/CMakeLists.txt
+++ b/libc/utils/hdrgen/CMakeLists.txt
@@ -1,12 +1,12 @@
 if(LLVM_LIBC_FULL_BUILD)
   enable_testing()
 
-  set(NEWHDGEN_TESTS_DIR ${CMAKE_CURRENT_SOURCE_DIR}/tests)
+  set(HDRGEN_TESTS_DIR ${CMAKE_CURRENT_SOURCE_DIR}/tests)
   set(TEST_OUTPUT_DIR ${CMAKE_BINARY_DIR}/hdrgen/output)
 
   add_test(
     NAME hdrgen_integration_test
-    COMMAND python3 ${NEWHDGEN_TESTS_DIR}/test_integration.py --output_dir ${TEST_OUTPUT_DIR}
+    COMMAND python3 ${HDRGEN_TESTS_DIR}/test_integration.py --output_dir ${TEST_OUTPUT_DIR}
   )
 
   add_custom_target(check-hdrgen
diff --git a/libc/utils/hdrgen/README.rst b/libc/utils/hdrgen/README.rst
new file mode 100644
index 0000000..6db2968
--- /dev/null
+++ b/libc/utils/hdrgen/README.rst
@@ -0,0 +1,4 @@
+This directory also contains the Python sources for hdrgen, which is
+what generates the headers public libc headers. The definitions for these
+headers are in the ``include`` directory. The ``.h.def`` files are the bases
+and the ``.yaml`` files are the contents.
diff --git a/libc/hdrgen/class_implementation/classes/enumeration.py b/libc/utils/hdrgen/enumeration.py
index b9848c0..b9848c0 100644
--- a/libc/hdrgen/class_implementation/classes/enumeration.py
+++ b/libc/utils/hdrgen/enumeration.py
diff --git a/libc/hdrgen/class_implementation/classes/function.py b/libc/utils/hdrgen/function.py
index d97df7f..d97df7f 100644
--- a/libc/hdrgen/class_implementation/classes/function.py
+++ b/libc/utils/hdrgen/function.py
diff --git a/libc/hdrgen/gpu_headers.py b/libc/utils/hdrgen/gpu_headers.py
index b26b3a8..8c4ff6e 100644
--- a/libc/hdrgen/gpu_headers.py
+++ b/libc/utils/hdrgen/gpu_headers.py
@@ -6,31 +6,9 @@
 #
 # ==-------------------------------------------------------------------------==#
 
+from header import HeaderFile
 
-class GpuHeaderFile:
-    def __init__(self, name):
-        self.name = name
-        self.macros = []
-        self.types = []
-        self.enumerations = []
-        self.objects = []
-        self.functions = []
-
-    def add_macro(self, macro):
-        self.macros.append(macro)
-
-    def add_type(self, type_):
-        self.types.append(type_)
-
-    def add_enumeration(self, enumeration):
-        self.enumerations.append(enumeration)
-
-    def add_object(self, object):
-        self.objects.append(object)
-
-    def add_function(self, function):
-        self.functions.append(function)
-
+class GpuHeaderFile(HeaderFile):
     def __str__(self):
         content = []
 
diff --git a/libc/hdrgen/header.py b/libc/utils/hdrgen/header.py
index df8ce61..9339acc 100644
--- a/libc/hdrgen/header.py
+++ b/libc/utils/hdrgen/header.py
@@ -9,6 +9,7 @@
 
 class HeaderFile:
     def __init__(self, name):
+        self.template_file = None
         self.name = name
         self.macros = []
         self.types = []
@@ -31,7 +32,7 @@ class HeaderFile:
     def add_function(self, function):
         self.functions.append(function)
 
-    def __str__(self):
+    def public_api(self):
         content = [""]
 
         for macro in self.macros:
diff --git a/libc/hdrgen/class_implementation/classes/macro.py b/libc/utils/hdrgen/macro.py
index 9a712f2..9a712f2 100644
--- a/libc/hdrgen/class_implementation/classes/macro.py
+++ b/libc/utils/hdrgen/macro.py
diff --git a/libc/utils/hdrgen/main.py b/libc/utils/hdrgen/main.py
new file mode 100755
index 0000000..5dd392a
--- /dev/null
+++ b/libc/utils/hdrgen/main.py
@@ -0,0 +1,91 @@
+#!/usr/bin/env python3
+#
+# ===- Generate headers for libc functions  ------------------*- python -*--==#
+#
+# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#
+# ==------------------------------------------------------------------------==#
+
+import argparse
+import sys
+from pathlib import Path
+
+from header import HeaderFile
+from yaml_to_classes import load_yaml_file, fill_public_api
+
+
+def main():
+    parser = argparse.ArgumentParser(description="Generate header files from YAML")
+    parser.add_argument(
+        "yaml_file",
+        help="Path to the YAML file containing header specification",
+        metavar="FILE",
+        type=Path,
+        nargs=1,
+    )
+    parser.add_argument(
+        "-o",
+        "--output",
+        help="Path to write generated header file",
+        type=Path,
+        required=True,
+    )
+    parser.add_argument(
+        "--depfile",
+        help="Path to write a depfile",
+        type=Path,
+    )
+    parser.add_argument(
+        "--write-if-changed",
+        help="Write the output file only if its contents have changed",
+        action="store_true",
+        default=False,
+    )
+    parser.add_argument(
+        "-e",
+        "--entry-point",
+        help="Entry point to include; may be given many times",
+        metavar="SYMBOL",
+        action="append",
+    )
+    args = parser.parse_args()
+
+    [yaml_file] = args.yaml_file
+    files_read = {yaml_file}
+
+    def write_depfile():
+        if not args.depfile:
+            return
+        deps = " ".join(str(f) for f in sorted(files_read))
+        args.depfile.parent.mkdir(parents=True, exist_ok=True)
+        with open(args.depfile, "w") as depfile:
+            depfile.write(f"{args.output}: {deps}\n")
+
+    header = load_yaml_file(yaml_file, HeaderFile, args.entry_point)
+
+    if not header.template_file:
+        print(f"{yaml_file}: Missing header_template", sys.stderr)
+        return 2
+
+    # The header_template path is relative to the containing YAML file.
+    template_path = yaml_file.parent / header.template_file
+
+    files_read.add(template_path)
+    with open(template_path) as template:
+        contents = fill_public_api(header.public_api(), template.read())
+
+    write_depfile()
+
+    if (
+        not args.write_if_changed
+        or not args.output.exists()
+        or args.output.read_text() != contents
+    ):
+        args.output.parent.mkdir(parents=True, exist_ok=True)
+        args.output.write_text(contents)
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/libc/hdrgen/class_implementation/classes/object.py b/libc/utils/hdrgen/object.py
index f521445..f521445 100644
--- a/libc/hdrgen/class_implementation/classes/object.py
+++ b/libc/utils/hdrgen/object.py
diff --git a/libc/hdrgen/tests/expected_output/test_header.h b/libc/utils/hdrgen/tests/expected_output/test_header.h
index a777976..a777976 100644
--- a/libc/hdrgen/tests/expected_output/test_header.h
+++ b/libc/utils/hdrgen/tests/expected_output/test_header.h
diff --git a/libc/hdrgen/tests/input/test_small.h.def b/libc/utils/hdrgen/tests/input/test_small.h.def
index 075be95..075be95 100644
--- a/libc/hdrgen/tests/input/test_small.h.def
+++ b/libc/utils/hdrgen/tests/input/test_small.h.def
diff --git a/libc/hdrgen/tests/input/test_small.yaml b/libc/utils/hdrgen/tests/input/test_small.yaml
index 772552f..1d4b299 100644
--- a/libc/hdrgen/tests/input/test_small.yaml
+++ b/libc/utils/hdrgen/tests/input/test_small.yaml
@@ -1,4 +1,5 @@
-header: test_header.h
+header: test_small.h
+header_template: test_small.h.def
 macros:
   - macro_name: MACRO_A
     macro_value: 1
@@ -62,5 +63,3 @@ functions:
       - type: float
     standards:
       - stdc
-
-
diff --git a/libc/hdrgen/tests/test_integration.py b/libc/utils/hdrgen/tests/test_integration.py
index 8ea6d8a..49cb08cd 100644
--- a/libc/hdrgen/tests/test_integration.py
+++ b/libc/utils/hdrgen/tests/test_integration.py
@@ -1,36 +1,27 @@
+import argparse
 import subprocess
+import sys
 import unittest
 from pathlib import Path
-import os
-import argparse
-import sys
 
 
 class TestHeaderGenIntegration(unittest.TestCase):
     def setUp(self):
-        self.output_dir = Path(
-            args.output_dir if args.output_dir else "libc/hdrgen/tests/output"
-        )
-
-        self.maxDiff = None
-
-        self.source_dir = Path(__file__).resolve().parent.parent.parent.parent
+        self.output_dir = TestHeaderGenIntegration.output_dir
+        self.source_dir = Path(__file__).parent
+        self.main_script = self.source_dir.parent / "main.py"
 
-    def run_script(self, yaml_file, h_def_file, output_dir, entry_points):
-        yaml_file = self.source_dir / yaml_file
-        h_def_file = self.source_dir / h_def_file
+    def run_script(self, yaml_file, output_file, entry_points):
         command = [
             "python3",
-            str(self.source_dir / "libc/hdrgen/yaml_to_classes.py"),
+            str(self.main_script),
             str(yaml_file),
-            "--h_def_file",
-            str(h_def_file),
-            "--output_dir",
-            str(output_dir),
+            "--output",
+            str(output_file),
         ]
 
         for entry_point in entry_points:
-            command.extend(["--e", entry_point])
+            command.extend(["--entry-point", entry_point])
 
         result = subprocess.run(
             command,
@@ -51,26 +42,23 @@ class TestHeaderGenIntegration(unittest.TestCase):
         self.assertEqual(gen_content, exp_content)
 
     def test_generate_header(self):
-        yaml_file = "libc/hdrgen/tests/input/test_small.yaml"
-        h_def_file = "libc/hdrgen/tests/input/test_small.h.def"
-        expected_output_file = (
-            self.source_dir / "libc/hdrgen/tests/expected_output/test_header.h"
-        )
+        yaml_file = self.source_dir / "input/test_small.yaml"
+        expected_output_file = self.source_dir / "expected_output/test_header.h"
         output_file = self.output_dir / "test_small.h"
         entry_points = {"func_b", "func_a", "func_c", "func_d", "func_e"}
 
-        if not self.output_dir.exists():
-            self.output_dir.mkdir(parents=True)
-
-        self.run_script(yaml_file, h_def_file, self.output_dir, entry_points)
+        self.run_script(yaml_file, output_file, entry_points)
 
         self.compare_files(output_file, expected_output_file)
 
 
-if __name__ == "__main__":
+def main():
     parser = argparse.ArgumentParser(description="TestHeaderGenIntegration arguments")
     parser.add_argument(
-        "--output_dir", type=str, help="Output directory for generated headers"
+        "--output_dir",
+        type=Path,
+        help="Output directory for generated headers",
+        required=True,
     )
     args, remaining_argv = parser.parse_known_args()
 
@@ -79,3 +67,7 @@ if __name__ == "__main__":
     sys.argv[1:] = remaining_argv
 
     unittest.main()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/libc/hdrgen/class_implementation/classes/type.py b/libc/utils/hdrgen/type.py
index 4ebf803..4ebf803 100644
--- a/libc/hdrgen/class_implementation/classes/type.py
+++ b/libc/utils/hdrgen/type.py
diff --git a/libc/hdrgen/yaml_functions_sorted.py b/libc/utils/hdrgen/yaml_functions_sorted.py
index b960ecaf..b960ecaf 100644
--- a/libc/hdrgen/yaml_functions_sorted.py
+++ b/libc/utils/hdrgen/yaml_functions_sorted.py
diff --git a/libc/hdrgen/yaml_to_classes.py b/libc/utils/hdrgen/yaml_to_classes.py
index 0e8ca2d..d64feaf 100644
--- a/libc/hdrgen/yaml_to_classes.py
+++ b/libc/utils/hdrgen/yaml_to_classes.py
@@ -11,13 +11,14 @@
 import yaml
 import argparse
 from pathlib import Path
-from header import HeaderFile
+
+from enumeration import Enumeration
+from function import Function
 from gpu_headers import GpuHeaderFile as GpuHeader
-from class_implementation.classes.macro import Macro
-from class_implementation.classes.type import Type
-from class_implementation.classes.function import Function
-from class_implementation.classes.enumeration import Enumeration
-from class_implementation.classes.object import Object
+from header import HeaderFile
+from macro import Macro
+from object import Object
+from type import Type
 
 
 def yaml_to_classes(yaml_data, header_class, entry_points=None):
@@ -34,6 +35,7 @@ def yaml_to_classes(yaml_data, header_class, entry_points=None):
     """
     header_name = yaml_data.get("header")
     header = header_class(header_name)
+    header.template_file = yaml_data.get("header_template")
 
     for macro_data in yaml_data.get("macros", []):
         header.add_macro(Macro(macro_data["macro_name"], macro_data["macro_value"]))
@@ -226,10 +228,6 @@ def main():
         help="Directory to output the generated header file",
     )
     parser.add_argument(
-        "--h_def_file",
-        help="Path to the .h.def template file (required if not using --export_decls)",
-    )
-    parser.add_argument(
         "--add_function",
         nargs=6,
         metavar=(
@@ -243,7 +241,10 @@ def main():
         help="Add a function to the YAML file",
     )
     parser.add_argument(
-        "--e", action="append", help="Entry point to include", dest="entry_points"
+        "--entry-point",
+        action="append",
+        help="Entry point to include",
+        dest="entry_points",
     )
     parser.add_argument(
         "--export-decls",
@@ -267,13 +268,7 @@ def main():
     else:
         output_file_path = Path(f"{Path(args.yaml_file).stem}.h")
 
-    if not args.export_decls and args.h_def_file:
-        with open(args.h_def_file, "r") as f:
-            h_def_content = f.read()
-        final_header_content = fill_public_api(header_str, h_def_content)
-        with open(output_file_path, "w") as f:
-            f.write(final_header_content)
-    else:
+    if args.export_decls:
         with open(output_file_path, "w") as f:
             f.write(header_str)
 
diff --git a/libclc/Maintainers.md b/libclc/Maintainers.md
new file mode 100644
index 0000000..ac869b6
--- /dev/null
+++ b/libclc/Maintainers.md
@@ -0,0 +1,17 @@
+# libclc Maintainers
+
+This file is a list of the
+[maintainers](https://llvm.org/docs/DeveloperPolicy.html#maintainers) for
+libclc.
+
+## Current Maintainers
+
+The following people are the active maintainers for the project. Please reach
+out to them for code reviews, questions about their area of expertise, or other
+assistance.
+
+Fraser Cormack \
+fraser@codeplay.com (email), [frasercrmck](https://github.com/frasercrmck) (GitHub)
+
+Tom Stellard \
+tstellar@redhat.com (email), [tstellar](https://github.com/tstellar) (GitHub)
diff --git a/libcxx/.clang-tidy b/libcxx/.clang-tidy
index f986e21..ebbfab0 100644
--- a/libcxx/.clang-tidy
+++ b/libcxx/.clang-tidy
@@ -5,6 +5,8 @@ Checks: >
   bugprone-stringview-nullptr,
   bugprone-use-after-move,
 
+  libcpp-*,
+
   llvm-include-order,
   llvm-namespace-comment,
 
diff --git a/libcxx/docs/Hardening.rst b/libcxx/docs/Hardening.rst
index 42aacfd..d399b94 100644
--- a/libcxx/docs/Hardening.rst
+++ b/libcxx/docs/Hardening.rst
@@ -311,7 +311,10 @@ ABI configuration.
 ABI options
 -----------
 
-Vendors can use the following ABI options to enable additional hardening checks:
+Vendors can use some ABI options at CMake configuration time (when building libc++
+itself) to enable additional hardening checks. This is done by passing these
+macros as ``-DLIBCXX_ABI_DEFINES="_LIBCPP_ABI_FOO;_LIBCPP_ABI_BAR;etc"`` at
+CMake configuration time. The available options are:
 
 - ``_LIBCPP_ABI_BOUNDED_ITERATORS`` -- changes the iterator type of select
   containers (see below) to a bounded iterator that keeps track of whether it's
@@ -341,7 +344,7 @@ Vendors can use the following ABI options to enable additional hardening checks:
 
   ABI impact: changes the iterator type of ``vector`` (except ``vector<bool>``).
 
-- ``_LIBCPP_ABI_BOUNDED_UNIQUE_PTR``` -- tracks the bounds of the array stored inside
+- ``_LIBCPP_ABI_BOUNDED_UNIQUE_PTR`` -- tracks the bounds of the array stored inside
   a ``std::unique_ptr<T[]>``, allowing it to trap when accessed out-of-bounds. This
   requires the ``std::unique_ptr`` to be created using an API like ``std::make_unique``
   or ``std::make_unique_for_overwrite``, otherwise the bounds information is not available
@@ -407,7 +410,7 @@ Hardened containers status
       - ✅
       - ❌
     * - ``forward_list``
-      - ❌
+      - ✅
       - ❌
     * - ``deque``
       - ✅
@@ -458,7 +461,7 @@ Hardened containers status
       - Partial
       - N/A
     * - ``bitset``
-      - ❌
+      - ✅
       - N/A
 
 Note: for ``vector`` and ``string``, the iterator does not check for
diff --git a/libcxx/docs/ReleaseNotes/20.rst b/libcxx/docs/ReleaseNotes/20.rst
index c8a07fb..ecfbaa5 100644
--- a/libcxx/docs/ReleaseNotes/20.rst
+++ b/libcxx/docs/ReleaseNotes/20.rst
@@ -73,6 +73,39 @@ Improvements and New Features
   optimized, resulting in a performance improvement of up to 2x for trivial element types (e.g., `std::vector<int>`),
   and up to 3.4x for non-trivial element types (e.g., `std::vector<std::vector<int>>`).
 
+- On Windows, ``<system_error>``'s ``std::system_category`` is now distinct from ``std::generic_category``. The behavior
+  on other operating systems is unchanged.
+
+  On Windows -- unlike on Unix systems -- the libc and system APIs use distinct error codes. The libc functions return
+  ``errno.h`` error codes via the ``errno`` global, while Win32 API functions return ``winerror.h`` error codes via
+  ``GetLastError()``.
+
+  The C++ standard's ``std::error_code`` and ``std::error_category`` functionality was designed to support multiple
+  error domains, precisely in order to handle situations such as this. However, libc++ formerly treated
+  ``generic_category()`` and ``system_category()`` as equivalent, even on Windows. It now implements the intended split,
+  where ``system_category`` represents native ``winerror.h`` error codes, and ``generic_category`` represents libc error
+  codes (and, equivalently, ``std::errc::*`` errors).
+
+  This change enables code like ``std::error_code(GetLastError(), std::system_category()) ==
+  std::errc::invalid_argument`` to function as desired: constructing an ``error_code`` with the Windows error number in
+  the "system" category, and then mapping it to a generic code with ``error_condition``, for comparison with the
+  ``std::errc`` constant.
+
+  This is an incompatible change: ``std::error_code(ENOSYS, std::system_category()) ==
+  std::errc::function_not_supported`` would formerly have returned true, but now returns false on Windows. Code
+  providing a number from the ``errno.h`` domain should be migrated to construct a ``generic_category`` error_code,
+  instead. (E.g., use ``std::error_code(ENOSYS, std::generic_category())``). The new behavior matches MSVC.
+
+- On Windows, the ``std::filesystem`` library now returns the Win32 ``system_category`` error codes, where it's feasible
+  to do so. This allows interrogation and reporting of the original error code, which is useful if multiple Windows
+  errors map to a single generic error (such as with ``std::errc::no_such_file_or_directory``).
+
+  This is also a slightly-incompatible API change: code inspecting the raw integer value from the returned error_code
+  expecting an integer from ``generic_category`` (e.g. ``err.value() == ENOTDIR``) will not work as desired. Instead,
+  such code should use the comparison operators which implicitly handle eror mappings, ``err ==
+  std::errc::not_a_directory``, or use ``err.default_error_condition()`` to map to an ``error_condition``, and then test
+  its ``value()`` and ``category()``.
+
 Deprecations and Removals
 -------------------------
 
diff --git a/libcxx/docs/TestingLibcxx.rst b/libcxx/docs/TestingLibcxx.rst
index cf092fa..e98b96b 100644
--- a/libcxx/docs/TestingLibcxx.rst
+++ b/libcxx/docs/TestingLibcxx.rst
@@ -459,6 +459,29 @@ we only want to make sure they don't rot. Do not rely on the results of benchmar
 run through ``check-cxx`` for anything, instead run the benchmarks manually using
 the instructions for running individual tests.
 
+If you want to compare the results of different benchmark runs, we recommend using the
+``libcxx-compare-benchmarks`` helper tool. First, configure CMake in a build directory
+and run the benchmark:
+
+.. code-block:: bash
+
+  $ cmake -S runtimes -B <build1> [...]
+  $ libcxx/utils/libcxx-lit <build1> libcxx/test/benchmarks/string.bench.cpp --param optimization=speed
+
+Then, do the same for the second configuration you want to test. Use a different build
+directory for that configuration:
+
+.. code-block:: bash
+
+  $ cmake -S runtimes -B <build2> [...]
+  $ libcxx/utils/libcxx-lit <build2> libcxx/test/benchmarks/string.bench.cpp --param optimization=speed
+
+Finally, use ``libcxx-compare-benchmarks`` to compare both:
+
+.. code-block:: bash
+
+  $ libcxx/utils/libcxx-compare-benchmarks <build1> <build2> libcxx/test/benchmarks/string.bench.cpp
+
 .. _`Google Benchmark`: https://github.com/google/benchmark
 
 .. _testing-hardening-assertions:
diff --git a/libcxx/include/__algorithm/comp_ref_type.h b/libcxx/include/__algorithm/comp_ref_type.h
index c367fbb..6a9d5ce 100644
--- a/libcxx/include/__algorithm/comp_ref_type.h
+++ b/libcxx/include/__algorithm/comp_ref_type.h
@@ -56,10 +56,10 @@ struct __debug_less {
 // Pass the comparator by lvalue reference. Or in the debug mode, using a debugging wrapper that stores a reference.
 #if _LIBCPP_HARDENING_MODE == _LIBCPP_HARDENING_MODE_DEBUG
 template <class _Comp>
-using __comp_ref_type = __debug_less<_Comp>;
+using __comp_ref_type _LIBCPP_NODEBUG = __debug_less<_Comp>;
 #else
 template <class _Comp>
-using __comp_ref_type = _Comp&;
+using __comp_ref_type _LIBCPP_NODEBUG = _Comp&;
 #endif
 
 _LIBCPP_END_NAMESPACE_STD
diff --git a/libcxx/include/__algorithm/copy.h b/libcxx/include/__algorithm/copy.h
index 4f30b20..962aa90 100644
--- a/libcxx/include/__algorithm/copy.h
+++ b/libcxx/include/__algorithm/copy.h
@@ -47,7 +47,7 @@ struct __copy_impl {
 
   template <class _InIter, class _OutIter>
   struct _CopySegment {
-    using _Traits = __segmented_iterator_traits<_InIter>;
+    using _Traits _LIBCPP_NODEBUG = __segmented_iterator_traits<_InIter>;
 
     _OutIter& __result_;
 
diff --git a/libcxx/include/__algorithm/iterator_operations.h b/libcxx/include/__algorithm/iterator_operations.h
index 6cdb0ae..e5c89c1 100644
--- a/libcxx/include/__algorithm/iterator_operations.h
+++ b/libcxx/include/__algorithm/iterator_operations.h
@@ -48,13 +48,13 @@ struct _RangeAlgPolicy {};
 template <>
 struct _IterOps<_RangeAlgPolicy> {
   template <class _Iter>
-  using __value_type = iter_value_t<_Iter>;
+  using __value_type _LIBCPP_NODEBUG = iter_value_t<_Iter>;
 
   template <class _Iter>
-  using __iterator_category = ranges::__iterator_concept<_Iter>;
+  using __iterator_category _LIBCPP_NODEBUG = ranges::__iterator_concept<_Iter>;
 
   template <class _Iter>
-  using __difference_type = iter_difference_t<_Iter>;
+  using __difference_type _LIBCPP_NODEBUG = iter_difference_t<_Iter>;
 
   static constexpr auto advance      = ranges::advance;
   static constexpr auto distance     = ranges::distance;
@@ -72,13 +72,13 @@ struct _ClassicAlgPolicy {};
 template <>
 struct _IterOps<_ClassicAlgPolicy> {
   template <class _Iter>
-  using __value_type = typename iterator_traits<_Iter>::value_type;
+  using __value_type _LIBCPP_NODEBUG = typename iterator_traits<_Iter>::value_type;
 
   template <class _Iter>
-  using __iterator_category = typename iterator_traits<_Iter>::iterator_category;
+  using __iterator_category _LIBCPP_NODEBUG = typename iterator_traits<_Iter>::iterator_category;
 
   template <class _Iter>
-  using __difference_type = typename iterator_traits<_Iter>::difference_type;
+  using __difference_type _LIBCPP_NODEBUG = typename iterator_traits<_Iter>::difference_type;
 
   // advance
   template <class _Iter, class _Distance>
@@ -94,10 +94,10 @@ struct _IterOps<_ClassicAlgPolicy> {
   }
 
   template <class _Iter>
-  using __deref_t = decltype(*std::declval<_Iter&>());
+  using __deref_t _LIBCPP_NODEBUG = decltype(*std::declval<_Iter&>());
 
   template <class _Iter>
-  using __move_t = decltype(std::move(*std::declval<_Iter&>()));
+  using __move_t _LIBCPP_NODEBUG = decltype(std::move(*std::declval<_Iter&>()));
 
   template <class _Iter>
   _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 static void __validate_iter_reference() {
@@ -217,7 +217,7 @@ private:
 };
 
 template <class _AlgPolicy, class _Iter>
-using __policy_iter_diff_t = typename _IterOps<_AlgPolicy>::template __difference_type<_Iter>;
+using __policy_iter_diff_t _LIBCPP_NODEBUG = typename _IterOps<_AlgPolicy>::template __difference_type<_Iter>;
 
 _LIBCPP_END_NAMESPACE_STD
 
diff --git a/libcxx/include/__algorithm/move.h b/libcxx/include/__algorithm/move.h
index 005099d..6f3b0eb 100644
--- a/libcxx/include/__algorithm/move.h
+++ b/libcxx/include/__algorithm/move.h
@@ -50,7 +50,7 @@ struct __move_impl {
 
   template <class _InIter, class _OutIter>
   struct _MoveSegment {
-    using _Traits = __segmented_iterator_traits<_InIter>;
+    using _Traits _LIBCPP_NODEBUG = __segmented_iterator_traits<_InIter>;
 
     _OutIter& __result_;
 
diff --git a/libcxx/include/__algorithm/ranges_iterator_concept.h b/libcxx/include/__algorithm/ranges_iterator_concept.h
index 2af891d..58790e9 100644
--- a/libcxx/include/__algorithm/ranges_iterator_concept.h
+++ b/libcxx/include/__algorithm/ranges_iterator_concept.h
@@ -44,7 +44,7 @@ consteval auto __get_iterator_concept() {
 }
 
 template <class _Iter>
-using __iterator_concept = decltype(__get_iterator_concept<_Iter>());
+using __iterator_concept _LIBCPP_NODEBUG = decltype(__get_iterator_concept<_Iter>());
 
 } // namespace ranges
 _LIBCPP_END_NAMESPACE_STD
diff --git a/libcxx/include/__algorithm/ranges_unique_copy.h b/libcxx/include/__algorithm/ranges_unique_copy.h
index 3b4a64e..ee7f0a0 100644
--- a/libcxx/include/__algorithm/ranges_unique_copy.h
+++ b/libcxx/include/__algorithm/ranges_unique_copy.h
@@ -60,7 +60,7 @@ struct __unique_copy {
   }
 
   template <class _InIter, class _OutIter>
-  using __algo_tag_t = decltype(__get_algo_tag<_InIter, _OutIter>());
+  using __algo_tag_t _LIBCPP_NODEBUG = decltype(__get_algo_tag<_InIter, _OutIter>());
 
   template <input_iterator _InIter,
             sentinel_for<_InIter> _Sent,
diff --git a/libcxx/include/__algorithm/simd_utils.h b/libcxx/include/__algorithm/simd_utils.h
index 4e3e4f2..4e03723 100644
--- a/libcxx/include/__algorithm/simd_utils.h
+++ b/libcxx/include/__algorithm/simd_utils.h
@@ -70,7 +70,7 @@ struct __get_as_integer_type_impl<8> {
 };
 
 template <class _Tp>
-using __get_as_integer_type_t = typename __get_as_integer_type_impl<sizeof(_Tp)>::type;
+using __get_as_integer_type_t _LIBCPP_NODEBUG = typename __get_as_integer_type_impl<sizeof(_Tp)>::type;
 
 // This isn't specialized for 64 byte vectors on purpose. They have the potential to significantly reduce performance
 // in mixed simd/non-simd workloads and don't provide any performance improvement for currently vectorized algorithms
@@ -90,7 +90,7 @@ inline constexpr size_t __native_vector_size = 1;
 #  endif
 
 template <class _ArithmeticT, size_t _Np>
-using __simd_vector __attribute__((__ext_vector_type__(_Np))) = _ArithmeticT;
+using __simd_vector __attribute__((__ext_vector_type__(_Np))) _LIBCPP_NODEBUG = _ArithmeticT;
 
 template <class _VecT>
 inline constexpr size_t __simd_vector_size_v = []<bool _False = false>() -> size_t {
@@ -106,7 +106,7 @@ _LIBCPP_HIDE_FROM_ABI _Tp __simd_vector_underlying_type_impl(__simd_vector<_Tp,
 }
 
 template <class _VecT>
-using __simd_vector_underlying_type_t = decltype(std::__simd_vector_underlying_type_impl(_VecT{}));
+using __simd_vector_underlying_type_t _LIBCPP_NODEBUG = decltype(std::__simd_vector_underlying_type_impl(_VecT{}));
 
 // This isn't inlined without always_inline when loading chars.
 template <class _VecT, class _Iter>
diff --git a/libcxx/include/__algorithm/sort.h b/libcxx/include/__algorithm/sort.h
index ed828b6..5c60b23 100644
--- a/libcxx/include/__algorithm/sort.h
+++ b/libcxx/include/__algorithm/sort.h
@@ -890,10 +890,10 @@ __sort_dispatch(_RandomAccessIterator __first, _RandomAccessIterator __last, _Co
 }
 
 template <class _Type, class... _Options>
-using __is_any_of = _Or<is_same<_Type, _Options>...>;
+using __is_any_of _LIBCPP_NODEBUG = _Or<is_same<_Type, _Options>...>;
 
 template <class _Type>
-using __sort_is_specialized_in_library = __is_any_of<
+using __sort_is_specialized_in_library _LIBCPP_NODEBUG = __is_any_of<
     _Type,
     char,
 #if _LIBCPP_HAS_WIDE_CHARACTERS
diff --git a/libcxx/include/__algorithm/three_way_comp_ref_type.h b/libcxx/include/__algorithm/three_way_comp_ref_type.h
index 5702a1f..f6f7645 100644
--- a/libcxx/include/__algorithm/three_way_comp_ref_type.h
+++ b/libcxx/include/__algorithm/three_way_comp_ref_type.h
@@ -61,10 +61,10 @@ struct __debug_three_way_comp {
 // Pass the comparator by lvalue reference. Or in the debug mode, using a debugging wrapper that stores a reference.
 #  if _LIBCPP_HARDENING_MODE == _LIBCPP_HARDENING_MODE_DEBUG
 template <class _Comp>
-using __three_way_comp_ref_type = __debug_three_way_comp<_Comp>;
+using __three_way_comp_ref_type _LIBCPP_NODEBUG = __debug_three_way_comp<_Comp>;
 #  else
 template <class _Comp>
-using __three_way_comp_ref_type = _Comp&;
+using __three_way_comp_ref_type _LIBCPP_NODEBUG = _Comp&;
 #  endif
 
 #endif // _LIBCPP_STD_VER >= 20
diff --git a/libcxx/include/__algorithm/unwrap_iter.h b/libcxx/include/__algorithm/unwrap_iter.h
index 8cc0d22..b66a682 100644
--- a/libcxx/include/__algorithm/unwrap_iter.h
+++ b/libcxx/include/__algorithm/unwrap_iter.h
@@ -46,7 +46,7 @@ struct __unwrap_iter_impl {
 // It's a contiguous iterator, so we can use a raw pointer instead
 template <class _Iter>
 struct __unwrap_iter_impl<_Iter, true> {
-  using _ToAddressT = decltype(std::__to_address(std::declval<_Iter>()));
+  using _ToAddressT _LIBCPP_NODEBUG = decltype(std::__to_address(std::declval<_Iter>()));
 
   static _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR _Iter __rewrap(_Iter __orig_iter, _ToAddressT __unwrapped_iter) {
     return __orig_iter + (__unwrapped_iter - std::__to_address(__orig_iter));
diff --git a/libcxx/include/__atomic/aliases.h b/libcxx/include/__atomic/aliases.h
index 37d11dd..4fcceba 100644
--- a/libcxx/include/__atomic/aliases.h
+++ b/libcxx/include/__atomic/aliases.h
@@ -84,19 +84,19 @@ using atomic_uintmax_t = atomic<uintmax_t>;
 // C++20 atomic_{signed,unsigned}_lock_free: prefer the contention type most highly, then the largest lock-free type
 #if _LIBCPP_STD_VER >= 20
 #  if ATOMIC_LLONG_LOCK_FREE == 2
-using __largest_lock_free_type = long long;
+using __largest_lock_free_type _LIBCPP_NODEBUG = long long;
 #  elif ATOMIC_INT_LOCK_FREE == 2
-using __largest_lock_free_type = int;
+using __largest_lock_free_type _LIBCPP_NODEBUG = int;
 #  elif ATOMIC_SHORT_LOCK_FREE == 2
-using __largest_lock_free_type = short;
+using __largest_lock_free_type _LIBCPP_NODEBUG = short;
 #  elif ATOMIC_CHAR_LOCK_FREE == 2
-using __largest_lock_free_type = char;
+using __largest_lock_free_type _LIBCPP_NODEBUG = char;
 #  else
 #    define _LIBCPP_NO_LOCK_FREE_TYPES // There are no lockfree types (this can happen on unusual platforms)
 #  endif
 
 #  ifndef _LIBCPP_NO_LOCK_FREE_TYPES
-using __contention_t_or_largest =
+using __contention_t_or_largest _LIBCPP_NODEBUG =
     __conditional_t<__libcpp_is_always_lock_free<__cxx_contention_t>::__value,
                     __cxx_contention_t,
                     __largest_lock_free_type>;
diff --git a/libcxx/include/__atomic/atomic.h b/libcxx/include/__atomic/atomic.h
index 8029b52..975a479 100644
--- a/libcxx/include/__atomic/atomic.h
+++ b/libcxx/include/__atomic/atomic.h
@@ -143,7 +143,7 @@ struct __atomic_base // false
 
 template <class _Tp>
 struct __atomic_base<_Tp, true> : public __atomic_base<_Tp, false> {
-  using __base = __atomic_base<_Tp, false>;
+  using __base _LIBCPP_NODEBUG = __atomic_base<_Tp, false>;
 
   _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 __atomic_base() _NOEXCEPT = default;
 
@@ -228,9 +228,9 @@ struct __atomic_waitable_traits<__atomic_base<_Tp, _IsIntegral> > {
 
 template <class _Tp>
 struct atomic : public __atomic_base<_Tp> {
-  using __base          = __atomic_base<_Tp>;
-  using value_type      = _Tp;
-  using difference_type = value_type;
+  using __base _LIBCPP_NODEBUG = __atomic_base<_Tp>;
+  using value_type             = _Tp;
+  using difference_type        = value_type;
 
 #if _LIBCPP_STD_VER >= 20
   _LIBCPP_HIDE_FROM_ABI atomic() = default;
@@ -257,9 +257,9 @@ struct atomic : public __atomic_base<_Tp> {
 
 template <class _Tp>
 struct atomic<_Tp*> : public __atomic_base<_Tp*> {
-  using __base          = __atomic_base<_Tp*>;
-  using value_type      = _Tp*;
-  using difference_type = ptrdiff_t;
+  using __base _LIBCPP_NODEBUG = __atomic_base<_Tp*>;
+  using value_type             = _Tp*;
+  using difference_type        = ptrdiff_t;
 
   _LIBCPP_HIDE_FROM_ABI atomic() _NOEXCEPT = default;
 
@@ -389,9 +389,9 @@ private:
   }
 
 public:
-  using __base          = __atomic_base<_Tp>;
-  using value_type      = _Tp;
-  using difference_type = value_type;
+  using __base _LIBCPP_NODEBUG = __atomic_base<_Tp>;
+  using value_type             = _Tp;
+  using difference_type        = value_type;
 
   _LIBCPP_HIDE_FROM_ABI constexpr atomic() noexcept = default;
   _LIBCPP_HIDE_FROM_ABI constexpr atomic(_Tp __d) noexcept : __base(__d) {}
diff --git a/libcxx/include/__atomic/atomic_ref.h b/libcxx/include/__atomic/atomic_ref.h
index eef1598..177ea64 100644
--- a/libcxx/include/__atomic/atomic_ref.h
+++ b/libcxx/include/__atomic/atomic_ref.h
@@ -221,7 +221,7 @@ public:
   _LIBCPP_HIDE_FROM_ABI void notify_all() const noexcept { std::__atomic_notify_all(*this); }
 
 protected:
-  using _Aligned_Tp [[__gnu__::__aligned__(required_alignment)]] = _Tp;
+  using _Aligned_Tp [[__gnu__::__aligned__(required_alignment), __gnu__::__nodebug__]] = _Tp;
   _Aligned_Tp* __ptr_;
 
   _LIBCPP_HIDE_FROM_ABI __atomic_ref_base(_Tp& __obj) : __ptr_(std::addressof(__obj)) {}
@@ -241,7 +241,7 @@ template <class _Tp>
 struct atomic_ref : public __atomic_ref_base<_Tp> {
   static_assert(is_trivially_copyable_v<_Tp>, "std::atomic_ref<T> requires that 'T' be a trivially copyable type");
 
-  using __base = __atomic_ref_base<_Tp>;
+  using __base _LIBCPP_NODEBUG = __atomic_ref_base<_Tp>;
 
   _LIBCPP_HIDE_FROM_ABI explicit atomic_ref(_Tp& __obj) : __base(__obj) {
     _LIBCPP_ASSERT_ARGUMENT_WITHIN_DOMAIN(
@@ -259,7 +259,7 @@ struct atomic_ref : public __atomic_ref_base<_Tp> {
 template <class _Tp>
   requires(std::integral<_Tp> && !std::same_as<bool, _Tp>)
 struct atomic_ref<_Tp> : public __atomic_ref_base<_Tp> {
-  using __base = __atomic_ref_base<_Tp>;
+  using __base _LIBCPP_NODEBUG = __atomic_ref_base<_Tp>;
 
   using difference_type = __base::value_type;
 
@@ -305,7 +305,7 @@ struct atomic_ref<_Tp> : public __atomic_ref_base<_Tp> {
 template <class _Tp>
   requires std::floating_point<_Tp>
 struct atomic_ref<_Tp> : public __atomic_ref_base<_Tp> {
-  using __base = __atomic_ref_base<_Tp>;
+  using __base _LIBCPP_NODEBUG = __atomic_ref_base<_Tp>;
 
   using difference_type = __base::value_type;
 
@@ -344,7 +344,7 @@ struct atomic_ref<_Tp> : public __atomic_ref_base<_Tp> {
 
 template <class _Tp>
 struct atomic_ref<_Tp*> : public __atomic_ref_base<_Tp*> {
-  using __base = __atomic_ref_base<_Tp*>;
+  using __base _LIBCPP_NODEBUG = __atomic_ref_base<_Tp*>;
 
   using difference_type = ptrdiff_t;
 
diff --git a/libcxx/include/__atomic/atomic_sync.h b/libcxx/include/__atomic/atomic_sync.h
index 153001e7..ab9bc59 100644
--- a/libcxx/include/__atomic/atomic_sync.h
+++ b/libcxx/include/__atomic/atomic_sync.h
@@ -81,7 +81,7 @@ struct __atomic_wait_backoff_impl {
   _Poll __poll_;
   memory_order __order_;
 
-  using __waitable_traits = __atomic_waitable_traits<__decay_t<_AtomicWaitable> >;
+  using __waitable_traits _LIBCPP_NODEBUG = __atomic_waitable_traits<__decay_t<_AtomicWaitable> >;
 
   _LIBCPP_AVAILABILITY_SYNC
   _LIBCPP_HIDE_FROM_ABI bool
diff --git a/libcxx/include/__atomic/contention_t.h b/libcxx/include/__atomic/contention_t.h
index 6f2a073..5b42a01 100644
--- a/libcxx/include/__atomic/contention_t.h
+++ b/libcxx/include/__atomic/contention_t.h
@@ -20,12 +20,12 @@
 _LIBCPP_BEGIN_NAMESPACE_STD
 
 #if defined(__linux__) || (defined(_AIX) && !defined(__64BIT__))
-using __cxx_contention_t = int32_t;
+using __cxx_contention_t _LIBCPP_NODEBUG = int32_t;
 #else
-using __cxx_contention_t = int64_t;
+using __cxx_contention_t _LIBCPP_NODEBUG = int64_t;
 #endif // __linux__ || (_AIX && !__64BIT__)
 
-using __cxx_atomic_contention_t = __cxx_atomic_impl<__cxx_contention_t>;
+using __cxx_atomic_contention_t _LIBCPP_NODEBUG = __cxx_atomic_impl<__cxx_contention_t>;
 
 _LIBCPP_END_NAMESPACE_STD
 
diff --git a/libcxx/include/__atomic/memory_order.h b/libcxx/include/__atomic/memory_order.h
index 294121d..44790fe 100644
--- a/libcxx/include/__atomic/memory_order.h
+++ b/libcxx/include/__atomic/memory_order.h
@@ -24,7 +24,7 @@ _LIBCPP_BEGIN_NAMESPACE_STD
 // to pin the underlying type in C++20.
 enum __legacy_memory_order { __mo_relaxed, __mo_consume, __mo_acquire, __mo_release, __mo_acq_rel, __mo_seq_cst };
 
-using __memory_order_underlying_t = underlying_type<__legacy_memory_order>::type;
+using __memory_order_underlying_t _LIBCPP_NODEBUG = underlying_type<__legacy_memory_order>::type;
 
 #if _LIBCPP_STD_VER >= 20
 
diff --git a/libcxx/include/__bit_reference b/libcxx/include/__bit_reference
index 9fa24c98..7e27090 100644
--- a/libcxx/include/__bit_reference
+++ b/libcxx/include/__bit_reference
@@ -43,8 +43,8 @@ struct __has_storage_type {
 
 template <class _Cp, bool = __has_storage_type<_Cp>::value>
 class __bit_reference {
-  using __storage_type    = typename _Cp::__storage_type;
-  using __storage_pointer = typename _Cp::__storage_pointer;
+  using __storage_type _LIBCPP_NODEBUG    = typename _Cp::__storage_type;
+  using __storage_pointer _LIBCPP_NODEBUG = typename _Cp::__storage_pointer;
 
   __storage_pointer __seg_;
   __storage_type __mask_;
@@ -55,7 +55,7 @@ class __bit_reference {
   friend class __bit_iterator<_Cp, false>;
 
 public:
-  using __container = typename _Cp::__self;
+  using __container _LIBCPP_NODEBUG = typename _Cp::__self;
 
   _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 __bit_reference(const __bit_reference&) = default;
 
@@ -135,8 +135,8 @@ inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void swap(bool& __x,
 
 template <class _Cp>
 class __bit_const_reference {
-  using __storage_type    = typename _Cp::__storage_type;
-  using __storage_pointer = typename _Cp::__const_storage_pointer;
+  using __storage_type _LIBCPP_NODEBUG    = typename _Cp::__storage_type;
+  using __storage_pointer _LIBCPP_NODEBUG = typename _Cp::__const_storage_pointer;
 
   __storage_pointer __seg_;
   __storage_type __mask_;
@@ -145,7 +145,7 @@ class __bit_const_reference {
   friend class __bit_iterator<_Cp, true>;
 
 public:
-  using __container = typename _Cp::__self;
+  using __container _LIBCPP_NODEBUG = typename _Cp::__self;
 
   _LIBCPP_HIDE_FROM_ABI __bit_const_reference(const __bit_const_reference&) = default;
   __bit_const_reference& operator=(const __bit_const_reference&)            = delete;
@@ -587,10 +587,10 @@ inline _LIBCPP_HIDE_FROM_ABI __bit_iterator<_Cr, false> swap_ranges(
 
 template <class _Cp>
 struct __bit_array {
-  using difference_type   = typename _Cp::difference_type;
-  using __storage_type    = typename _Cp::__storage_type;
-  using __storage_pointer = typename _Cp::__storage_pointer;
-  using iterator          = typename _Cp::iterator;
+  using difference_type _LIBCPP_NODEBUG   = typename _Cp::difference_type;
+  using __storage_type _LIBCPP_NODEBUG    = typename _Cp::__storage_type;
+  using __storage_pointer _LIBCPP_NODEBUG = typename _Cp::__storage_pointer;
+  using iterator _LIBCPP_NODEBUG          = typename _Cp::iterator;
 
   static const unsigned __bits_per_word = _Cp::__bits_per_word;
   static const unsigned _Np             = 4;
@@ -790,8 +790,8 @@ public:
   using iterator_category = random_access_iterator_tag;
 
 private:
-  using __storage_type = typename _Cp::__storage_type;
-  using __storage_pointer =
+  using __storage_type _LIBCPP_NODEBUG = typename _Cp::__storage_type;
+  using __storage_pointer _LIBCPP_NODEBUG =
       __conditional_t<_IsConst, typename _Cp::__const_storage_pointer, typename _Cp::__storage_pointer>;
 
   static const unsigned __bits_per_word = _Cp::__bits_per_word;
diff --git a/libcxx/include/__chrono/formatter.h b/libcxx/include/__chrono/formatter.h
index 3671e6a..1086dde 100644
--- a/libcxx/include/__chrono/formatter.h
+++ b/libcxx/include/__chrono/formatter.h
@@ -711,7 +711,7 @@ public:
 template <class _Duration, __fmt_char_type _CharT>
 struct _LIBCPP_TEMPLATE_VIS formatter<chrono::sys_time<_Duration>, _CharT> : public __formatter_chrono<_CharT> {
 public:
-  using _Base = __formatter_chrono<_CharT>;
+  using _Base _LIBCPP_NODEBUG = __formatter_chrono<_CharT>;
 
   template <class _ParseContext>
   _LIBCPP_HIDE_FROM_ABI constexpr typename _ParseContext::iterator parse(_ParseContext& __ctx) {
@@ -722,7 +722,7 @@ public:
 template <class _Duration, __fmt_char_type _CharT>
 struct _LIBCPP_TEMPLATE_VIS formatter<chrono::file_time<_Duration>, _CharT> : public __formatter_chrono<_CharT> {
 public:
-  using _Base = __formatter_chrono<_CharT>;
+  using _Base _LIBCPP_NODEBUG = __formatter_chrono<_CharT>;
 
   template <class _ParseContext>
   _LIBCPP_HIDE_FROM_ABI constexpr typename _ParseContext::iterator parse(_ParseContext& __ctx) {
@@ -733,7 +733,7 @@ public:
 template <class _Duration, __fmt_char_type _CharT>
 struct _LIBCPP_TEMPLATE_VIS formatter<chrono::local_time<_Duration>, _CharT> : public __formatter_chrono<_CharT> {
 public:
-  using _Base = __formatter_chrono<_CharT>;
+  using _Base _LIBCPP_NODEBUG = __formatter_chrono<_CharT>;
 
   template <class _ParseContext>
   _LIBCPP_HIDE_FROM_ABI constexpr typename _ParseContext::iterator parse(_ParseContext& __ctx) {
@@ -745,7 +745,7 @@ public:
 template <class _Rep, class _Period, __fmt_char_type _CharT>
 struct formatter<chrono::duration<_Rep, _Period>, _CharT> : public __formatter_chrono<_CharT> {
 public:
-  using _Base = __formatter_chrono<_CharT>;
+  using _Base _LIBCPP_NODEBUG = __formatter_chrono<_CharT>;
 
   template <class _ParseContext>
   _LIBCPP_HIDE_FROM_ABI constexpr typename _ParseContext::iterator parse(_ParseContext& __ctx) {
@@ -767,7 +767,7 @@ public:
 template <__fmt_char_type _CharT>
 struct _LIBCPP_TEMPLATE_VIS formatter<chrono::day, _CharT> : public __formatter_chrono<_CharT> {
 public:
-  using _Base = __formatter_chrono<_CharT>;
+  using _Base _LIBCPP_NODEBUG = __formatter_chrono<_CharT>;
 
   template <class _ParseContext>
   _LIBCPP_HIDE_FROM_ABI constexpr typename _ParseContext::iterator parse(_ParseContext& __ctx) {
@@ -778,7 +778,7 @@ public:
 template <__fmt_char_type _CharT>
 struct _LIBCPP_TEMPLATE_VIS formatter<chrono::month, _CharT> : public __formatter_chrono<_CharT> {
 public:
-  using _Base = __formatter_chrono<_CharT>;
+  using _Base _LIBCPP_NODEBUG = __formatter_chrono<_CharT>;
 
   template <class _ParseContext>
   _LIBCPP_HIDE_FROM_ABI constexpr typename _ParseContext::iterator parse(_ParseContext& __ctx) {
@@ -789,7 +789,7 @@ public:
 template <__fmt_char_type _CharT>
 struct _LIBCPP_TEMPLATE_VIS formatter<chrono::year, _CharT> : public __formatter_chrono<_CharT> {
 public:
-  using _Base = __formatter_chrono<_CharT>;
+  using _Base _LIBCPP_NODEBUG = __formatter_chrono<_CharT>;
 
   template <class _ParseContext>
   _LIBCPP_HIDE_FROM_ABI constexpr typename _ParseContext::iterator parse(_ParseContext& __ctx) {
@@ -800,7 +800,7 @@ public:
 template <__fmt_char_type _CharT>
 struct _LIBCPP_TEMPLATE_VIS formatter<chrono::weekday, _CharT> : public __formatter_chrono<_CharT> {
 public:
-  using _Base = __formatter_chrono<_CharT>;
+  using _Base _LIBCPP_NODEBUG = __formatter_chrono<_CharT>;
 
   template <class _ParseContext>
   _LIBCPP_HIDE_FROM_ABI constexpr typename _ParseContext::iterator parse(_ParseContext& __ctx) {
@@ -811,7 +811,7 @@ public:
 template <__fmt_char_type _CharT>
 struct _LIBCPP_TEMPLATE_VIS formatter<chrono::weekday_indexed, _CharT> : public __formatter_chrono<_CharT> {
 public:
-  using _Base = __formatter_chrono<_CharT>;
+  using _Base _LIBCPP_NODEBUG = __formatter_chrono<_CharT>;
 
   template <class _ParseContext>
   _LIBCPP_HIDE_FROM_ABI constexpr typename _ParseContext::iterator parse(_ParseContext& __ctx) {
@@ -822,7 +822,7 @@ public:
 template <__fmt_char_type _CharT>
 struct _LIBCPP_TEMPLATE_VIS formatter<chrono::weekday_last, _CharT> : public __formatter_chrono<_CharT> {
 public:
-  using _Base = __formatter_chrono<_CharT>;
+  using _Base _LIBCPP_NODEBUG = __formatter_chrono<_CharT>;
 
   template <class _ParseContext>
   _LIBCPP_HIDE_FROM_ABI constexpr typename _ParseContext::iterator parse(_ParseContext& __ctx) {
@@ -833,7 +833,7 @@ public:
 template <__fmt_char_type _CharT>
 struct _LIBCPP_TEMPLATE_VIS formatter<chrono::month_day, _CharT> : public __formatter_chrono<_CharT> {
 public:
-  using _Base = __formatter_chrono<_CharT>;
+  using _Base _LIBCPP_NODEBUG = __formatter_chrono<_CharT>;
 
   template <class _ParseContext>
   _LIBCPP_HIDE_FROM_ABI constexpr typename _ParseContext::iterator parse(_ParseContext& __ctx) {
@@ -844,7 +844,7 @@ public:
 template <__fmt_char_type _CharT>
 struct _LIBCPP_TEMPLATE_VIS formatter<chrono::month_day_last, _CharT> : public __formatter_chrono<_CharT> {
 public:
-  using _Base = __formatter_chrono<_CharT>;
+  using _Base _LIBCPP_NODEBUG = __formatter_chrono<_CharT>;
 
   template <class _ParseContext>
   _LIBCPP_HIDE_FROM_ABI constexpr typename _ParseContext::iterator parse(_ParseContext& __ctx) {
@@ -855,7 +855,7 @@ public:
 template <__fmt_char_type _CharT>
 struct _LIBCPP_TEMPLATE_VIS formatter<chrono::month_weekday, _CharT> : public __formatter_chrono<_CharT> {
 public:
-  using _Base = __formatter_chrono<_CharT>;
+  using _Base _LIBCPP_NODEBUG = __formatter_chrono<_CharT>;
 
   template <class _ParseContext>
   _LIBCPP_HIDE_FROM_ABI constexpr typename _ParseContext::iterator parse(_ParseContext& __ctx) {
@@ -866,7 +866,7 @@ public:
 template <__fmt_char_type _CharT>
 struct _LIBCPP_TEMPLATE_VIS formatter<chrono::month_weekday_last, _CharT> : public __formatter_chrono<_CharT> {
 public:
-  using _Base = __formatter_chrono<_CharT>;
+  using _Base _LIBCPP_NODEBUG = __formatter_chrono<_CharT>;
 
   template <class _ParseContext>
   _LIBCPP_HIDE_FROM_ABI constexpr typename _ParseContext::iterator parse(_ParseContext& __ctx) {
@@ -877,7 +877,7 @@ public:
 template <__fmt_char_type _CharT>
 struct _LIBCPP_TEMPLATE_VIS formatter<chrono::year_month, _CharT> : public __formatter_chrono<_CharT> {
 public:
-  using _Base = __formatter_chrono<_CharT>;
+  using _Base _LIBCPP_NODEBUG = __formatter_chrono<_CharT>;
 
   template <class _ParseContext>
   _LIBCPP_HIDE_FROM_ABI constexpr typename _ParseContext::iterator parse(_ParseContext& __ctx) {
@@ -888,7 +888,7 @@ public:
 template <__fmt_char_type _CharT>
 struct _LIBCPP_TEMPLATE_VIS formatter<chrono::year_month_day, _CharT> : public __formatter_chrono<_CharT> {
 public:
-  using _Base = __formatter_chrono<_CharT>;
+  using _Base _LIBCPP_NODEBUG = __formatter_chrono<_CharT>;
 
   template <class _ParseContext>
   _LIBCPP_HIDE_FROM_ABI constexpr typename _ParseContext::iterator parse(_ParseContext& __ctx) {
@@ -899,7 +899,7 @@ public:
 template <__fmt_char_type _CharT>
 struct _LIBCPP_TEMPLATE_VIS formatter<chrono::year_month_day_last, _CharT> : public __formatter_chrono<_CharT> {
 public:
-  using _Base = __formatter_chrono<_CharT>;
+  using _Base _LIBCPP_NODEBUG = __formatter_chrono<_CharT>;
 
   template <class _ParseContext>
   _LIBCPP_HIDE_FROM_ABI constexpr typename _ParseContext::iterator parse(_ParseContext& __ctx) {
@@ -910,7 +910,7 @@ public:
 template <__fmt_char_type _CharT>
 struct _LIBCPP_TEMPLATE_VIS formatter<chrono::year_month_weekday, _CharT> : public __formatter_chrono<_CharT> {
 public:
-  using _Base = __formatter_chrono<_CharT>;
+  using _Base _LIBCPP_NODEBUG = __formatter_chrono<_CharT>;
 
   template <class _ParseContext>
   _LIBCPP_HIDE_FROM_ABI constexpr typename _ParseContext::iterator parse(_ParseContext& __ctx) {
@@ -921,7 +921,7 @@ public:
 template <__fmt_char_type _CharT>
 struct _LIBCPP_TEMPLATE_VIS formatter<chrono::year_month_weekday_last, _CharT> : public __formatter_chrono<_CharT> {
 public:
-  using _Base = __formatter_chrono<_CharT>;
+  using _Base _LIBCPP_NODEBUG = __formatter_chrono<_CharT>;
 
   template <class _ParseContext>
   _LIBCPP_HIDE_FROM_ABI constexpr typename _ParseContext::iterator parse(_ParseContext& __ctx) {
@@ -932,7 +932,7 @@ public:
 template <class _Duration, __fmt_char_type _CharT>
 struct formatter<chrono::hh_mm_ss<_Duration>, _CharT> : public __formatter_chrono<_CharT> {
 public:
-  using _Base = __formatter_chrono<_CharT>;
+  using _Base _LIBCPP_NODEBUG = __formatter_chrono<_CharT>;
 
   template <class _ParseContext>
   _LIBCPP_HIDE_FROM_ABI constexpr typename _ParseContext::iterator parse(_ParseContext& __ctx) {
@@ -944,7 +944,7 @@ public:
 template <__fmt_char_type _CharT>
 struct formatter<chrono::sys_info, _CharT> : public __formatter_chrono<_CharT> {
 public:
-  using _Base = __formatter_chrono<_CharT>;
+  using _Base _LIBCPP_NODEBUG = __formatter_chrono<_CharT>;
 
   template <class _ParseContext>
   _LIBCPP_HIDE_FROM_ABI constexpr typename _ParseContext::iterator parse(_ParseContext& __ctx) {
@@ -955,7 +955,7 @@ public:
 template <__fmt_char_type _CharT>
 struct formatter<chrono::local_info, _CharT> : public __formatter_chrono<_CharT> {
 public:
-  using _Base = __formatter_chrono<_CharT>;
+  using _Base _LIBCPP_NODEBUG = __formatter_chrono<_CharT>;
 
   template <class _ParseContext>
   _LIBCPP_HIDE_FROM_ABI constexpr typename _ParseContext::iterator parse(_ParseContext& __ctx) {
@@ -968,7 +968,7 @@ public:
 template <class _Duration, class _TimeZonePtr, __fmt_char_type _CharT>
 struct formatter<chrono::zoned_time<_Duration, _TimeZonePtr>, _CharT> : public __formatter_chrono<_CharT> {
 public:
-  using _Base = __formatter_chrono<_CharT>;
+  using _Base _LIBCPP_NODEBUG = __formatter_chrono<_CharT>;
 
   template <class _ParseContext>
   _LIBCPP_HIDE_FROM_ABI constexpr typename _ParseContext::iterator parse(_ParseContext& __ctx) {
diff --git a/libcxx/include/__chrono/hh_mm_ss.h b/libcxx/include/__chrono/hh_mm_ss.h
index c460b11..6ea8a28 100644
--- a/libcxx/include/__chrono/hh_mm_ss.h
+++ b/libcxx/include/__chrono/hh_mm_ss.h
@@ -30,7 +30,7 @@ template <class _Duration>
 class hh_mm_ss {
 private:
   static_assert(__is_duration_v<_Duration>, "template parameter of hh_mm_ss must be a std::chrono::duration");
-  using __CommonType = common_type_t<_Duration, chrono::seconds>;
+  using __CommonType _LIBCPP_NODEBUG = common_type_t<_Duration, chrono::seconds>;
 
   _LIBCPP_HIDE_FROM_ABI static constexpr uint64_t __pow10(unsigned __exp) {
     uint64_t __ret = 1;
diff --git a/libcxx/include/__chrono/parser_std_format_spec.h b/libcxx/include/__chrono/parser_std_format_spec.h
index 3976864..4df8e60 100644
--- a/libcxx/include/__chrono/parser_std_format_spec.h
+++ b/libcxx/include/__chrono/parser_std_format_spec.h
@@ -140,7 +140,7 @@ _LIBCPP_HIDE_FROM_ABI constexpr void __validate_time_zone(__flags __flags) {
 
 template <class _CharT>
 class _LIBCPP_TEMPLATE_VIS __parser_chrono {
-  using _ConstIterator = typename basic_format_parse_context<_CharT>::const_iterator;
+  using _ConstIterator _LIBCPP_NODEBUG = typename basic_format_parse_context<_CharT>::const_iterator;
 
 public:
   template <class _ParseContext>
diff --git a/libcxx/include/__chrono/zoned_time.h b/libcxx/include/__chrono/zoned_time.h
index f57e65c..1deba10 100644
--- a/libcxx/include/__chrono/zoned_time.h
+++ b/libcxx/include/__chrono/zoned_time.h
@@ -66,7 +66,7 @@ class zoned_time {
   // Using these constraints in the code causes the compiler to give an
   // error that the constraint depends on itself. To avoid that issue use
   // the fact it is possible to create this object from a _TimeZonePtr.
-  using __traits = zoned_traits<_TimeZonePtr>;
+  using __traits _LIBCPP_NODEBUG = zoned_traits<_TimeZonePtr>;
 
 public:
   using duration = common_type_t<_Duration, seconds>;
@@ -186,7 +186,7 @@ template <class _Duration>
 zoned_time(sys_time<_Duration>) -> zoned_time<common_type_t<_Duration, seconds>>;
 
 template <class _TimeZonePtrOrName>
-using __time_zone_representation =
+using __time_zone_representation _LIBCPP_NODEBUG =
     conditional_t<is_convertible_v<_TimeZonePtrOrName, string_view>,
                   const time_zone*,
                   remove_cvref_t<_TimeZonePtrOrName>>;
diff --git a/libcxx/include/__compare/ordering.h b/libcxx/include/__compare/ordering.h
index 297218e..902ef53 100644
--- a/libcxx/include/__compare/ordering.h
+++ b/libcxx/include/__compare/ordering.h
@@ -120,7 +120,7 @@ inline constexpr partial_ordering partial_ordering::greater(_PartialOrdResult::_
 inline constexpr partial_ordering partial_ordering::unordered(_PartialOrdResult::__unordered);
 
 class weak_ordering {
-  using _ValueT = signed char;
+  using _ValueT _LIBCPP_NODEBUG = signed char;
 
   _LIBCPP_HIDE_FROM_ABI explicit constexpr weak_ordering(_OrdResult __v) noexcept : __value_(_ValueT(__v)) {}
 
@@ -190,7 +190,7 @@ inline constexpr weak_ordering weak_ordering::equivalent(_OrdResult::__equiv);
 inline constexpr weak_ordering weak_ordering::greater(_OrdResult::__greater);
 
 class strong_ordering {
-  using _ValueT = signed char;
+  using _ValueT _LIBCPP_NODEBUG = signed char;
 
   _LIBCPP_HIDE_FROM_ABI explicit constexpr strong_ordering(_OrdResult __v) noexcept : __value_(_ValueT(__v)) {}
 
diff --git a/libcxx/include/__compare/synth_three_way.h b/libcxx/include/__compare/synth_three_way.h
index e48ce49..63bf56d 100644
--- a/libcxx/include/__compare/synth_three_way.h
+++ b/libcxx/include/__compare/synth_three_way.h
@@ -43,7 +43,8 @@ _LIBCPP_HIDE_FROM_ABI inline constexpr auto __synth_three_way = []<class _Tp, cl
 };
 
 template <class _Tp, class _Up = _Tp>
-using __synth_three_way_result = decltype(std::__synth_three_way(std::declval<_Tp&>(), std::declval<_Up&>()));
+using __synth_three_way_result _LIBCPP_NODEBUG =
+    decltype(std::__synth_three_way(std::declval<_Tp&>(), std::declval<_Up&>()));
 
 #endif // _LIBCPP_STD_VER >= 20
 
diff --git a/libcxx/include/__exception/exception_ptr.h b/libcxx/include/__exception/exception_ptr.h
index 7df46a0..6257e6f 100644
--- a/libcxx/include/__exception/exception_ptr.h
+++ b/libcxx/include/__exception/exception_ptr.h
@@ -66,7 +66,7 @@ class _LIBCPP_EXPORTED_FROM_ABI exception_ptr {
 
 public:
   // exception_ptr is basically a COW string.
-  using __trivially_relocatable = exception_ptr;
+  using __trivially_relocatable _LIBCPP_NODEBUG = exception_ptr;
 
   _LIBCPP_HIDE_FROM_ABI exception_ptr() _NOEXCEPT : __ptr_() {}
   _LIBCPP_HIDE_FROM_ABI exception_ptr(nullptr_t) _NOEXCEPT : __ptr_() {}
diff --git a/libcxx/include/__expected/expected.h b/libcxx/include/__expected/expected.h
index 3d3f119..03bbd16 100644
--- a/libcxx/include/__expected/expected.h
+++ b/libcxx/include/__expected/expected.h
@@ -459,14 +459,14 @@ class expected : private __expected_base<_Tp, _Err> {
   template <class _Up, class _OtherErr>
   friend class expected;
 
-  using __base = __expected_base<_Tp, _Err>;
+  using __base _LIBCPP_NODEBUG = __expected_base<_Tp, _Err>;
 
 public:
   using value_type      = _Tp;
   using error_type      = _Err;
   using unexpected_type = unexpected<_Err>;
 
-  using __trivially_relocatable =
+  using __trivially_relocatable _LIBCPP_NODEBUG =
       __conditional_t<__libcpp_is_trivially_relocatable<_Tp>::value && __libcpp_is_trivially_relocatable<_Err>::value,
                       expected,
                       void>;
@@ -505,7 +505,7 @@ public:
 
 private:
   template <class _Up, class _OtherErr, class _UfQual, class _OtherErrQual>
-  using __can_convert = _And<
+  using __can_convert _LIBCPP_NODEBUG = _And<
       is_constructible<_Tp, _UfQual>,
       is_constructible<_Err, _OtherErrQual>,
       _If<_Not<is_same<remove_cv_t<_Tp>, bool>>::value,
@@ -1363,7 +1363,7 @@ class expected<_Tp, _Err> : private __expected_void_base<_Err> {
   friend class expected;
 
   template <class _Up, class _OtherErr, class _OtherErrQual>
-  using __can_convert =
+  using __can_convert _LIBCPP_NODEBUG =
       _And< is_void<_Up>,
             is_constructible<_Err, _OtherErrQual>,
             _Not<is_constructible<unexpected<_Err>, expected<_Up, _OtherErr>&>>,
@@ -1371,7 +1371,7 @@ class expected<_Tp, _Err> : private __expected_void_base<_Err> {
             _Not<is_constructible<unexpected<_Err>, const expected<_Up, _OtherErr>&>>,
             _Not<is_constructible<unexpected<_Err>, const expected<_Up, _OtherErr>>>>;
 
-  using __base = __expected_void_base<_Err>;
+  using __base _LIBCPP_NODEBUG = __expected_void_base<_Err>;
 
 public:
   using value_type      = _Tp;
diff --git a/libcxx/include/__expected/unexpected.h b/libcxx/include/__expected/unexpected.h
index cf110bc..6904889 100644
--- a/libcxx/include/__expected/unexpected.h
+++ b/libcxx/include/__expected/unexpected.h
@@ -48,12 +48,12 @@ template <class _Err>
 struct __is_std_unexpected<unexpected<_Err>> : true_type {};
 
 template <class _Tp>
-using __valid_std_unexpected = _BoolConstant< //
-    is_object_v<_Tp> &&                       //
-    !is_array_v<_Tp> &&                       //
-    !__is_std_unexpected<_Tp>::value &&       //
-    !is_const_v<_Tp> &&                       //
-    !is_volatile_v<_Tp>                       //
+using __valid_std_unexpected _LIBCPP_NODEBUG = _BoolConstant< //
+    is_object_v<_Tp> &&                                       //
+    !is_array_v<_Tp> &&                                       //
+    !__is_std_unexpected<_Tp>::value &&                       //
+    !is_const_v<_Tp> &&                                       //
+    !is_volatile_v<_Tp>                                       //
     >;
 
 template <class _Err>
diff --git a/libcxx/include/__filesystem/directory_entry.h b/libcxx/include/__filesystem/directory_entry.h
index 7d0c01b..11e07ac 100644
--- a/libcxx/include/__filesystem/directory_entry.h
+++ b/libcxx/include/__filesystem/directory_entry.h
@@ -22,7 +22,9 @@
 #include <__filesystem/perms.h>
 #include <__fwd/ostream.h>
 #include <__system_error/errc.h>
+#include <__system_error/error_category.h>
 #include <__system_error/error_code.h>
+#include <__system_error/error_condition.h>
 #include <__utility/move.h>
 #include <__utility/unreachable.h>
 #include <cstdint>
@@ -274,15 +276,7 @@ private:
   _LIBCPP_EXPORTED_FROM_ABI error_code __do_refresh() noexcept;
 
   _LIBCPP_HIDE_FROM_ABI static bool __is_dne_error(error_code const& __ec) {
-    if (!__ec)
-      return true;
-    switch (static_cast<errc>(__ec.value())) {
-    case errc::no_such_file_or_directory:
-    case errc::not_a_directory:
-      return true;
-    default:
-      return false;
-    }
+    return !__ec || __ec == errc::no_such_file_or_directory || __ec == errc::not_a_directory;
   }
 
   _LIBCPP_HIDE_FROM_ABI void
diff --git a/libcxx/include/__filesystem/path.h b/libcxx/include/__filesystem/path.h
index 509d1cc..0a751ba 100644
--- a/libcxx/include/__filesystem/path.h
+++ b/libcxx/include/__filesystem/path.h
@@ -51,30 +51,30 @@ template <class _Tp>
 struct __can_convert_char<const _Tp> : public __can_convert_char<_Tp> {};
 template <>
 struct __can_convert_char<char> {
-  static const bool value = true;
-  using __char_type       = char;
+  static const bool value           = true;
+  using __char_type _LIBCPP_NODEBUG = char;
 };
 template <>
 struct __can_convert_char<wchar_t> {
-  static const bool value = true;
-  using __char_type       = wchar_t;
+  static const bool value           = true;
+  using __char_type _LIBCPP_NODEBUG = wchar_t;
 };
 #  if _LIBCPP_HAS_CHAR8_T
 template <>
 struct __can_convert_char<char8_t> {
-  static const bool value = true;
-  using __char_type       = char8_t;
+  static const bool value           = true;
+  using __char_type _LIBCPP_NODEBUG = char8_t;
 };
 #  endif
 template <>
 struct __can_convert_char<char16_t> {
-  static const bool value = true;
-  using __char_type       = char16_t;
+  static const bool value           = true;
+  using __char_type _LIBCPP_NODEBUG = char16_t;
 };
 template <>
 struct __can_convert_char<char32_t> {
-  static const bool value = true;
-  using __char_type       = char32_t;
+  static const bool value           = true;
+  using __char_type _LIBCPP_NODEBUG = char32_t;
 };
 
 template <class _ECharT, __enable_if_t<__can_convert_char<_ECharT>::value, int> = 0>
@@ -95,7 +95,7 @@ typedef string __u8_string;
 struct _NullSentinel {};
 
 template <class _Tp>
-using _Void = void;
+using _Void _LIBCPP_NODEBUG = void;
 
 template <class _Tp, class = void>
 struct __is_pathable_string : public false_type {};
@@ -104,7 +104,7 @@ template <class _ECharT, class _Traits, class _Alloc>
 struct __is_pathable_string< basic_string<_ECharT, _Traits, _Alloc>,
                              _Void<typename __can_convert_char<_ECharT>::__char_type> >
     : public __can_convert_char<_ECharT> {
-  using _Str = basic_string<_ECharT, _Traits, _Alloc>;
+  using _Str _LIBCPP_NODEBUG = basic_string<_ECharT, _Traits, _Alloc>;
 
   _LIBCPP_HIDE_FROM_ABI static _ECharT const* __range_begin(_Str const& __s) { return __s.data(); }
 
@@ -117,7 +117,7 @@ template <class _ECharT, class _Traits>
 struct __is_pathable_string< basic_string_view<_ECharT, _Traits>,
                              _Void<typename __can_convert_char<_ECharT>::__char_type> >
     : public __can_convert_char<_ECharT> {
-  using _Str = basic_string_view<_ECharT, _Traits>;
+  using _Str _LIBCPP_NODEBUG = basic_string_view<_ECharT, _Traits>;
 
   _LIBCPP_HIDE_FROM_ABI static _ECharT const* __range_begin(_Str const& __s) { return __s.data(); }
 
@@ -157,7 +157,7 @@ struct __is_pathable_iter<
     true,
     _Void<typename __can_convert_char< typename iterator_traits<_Iter>::value_type>::__char_type> >
     : __can_convert_char<typename iterator_traits<_Iter>::value_type> {
-  using _ECharT = typename iterator_traits<_Iter>::value_type;
+  using _ECharT _LIBCPP_NODEBUG = typename iterator_traits<_Iter>::value_type;
 
   _LIBCPP_HIDE_FROM_ABI static _Iter __range_begin(_Iter __b) { return __b; }
 
@@ -380,13 +380,13 @@ struct _PathExport<char8_t> {
 
 class _LIBCPP_EXPORTED_FROM_ABI path {
   template <class _SourceOrIter, class _Tp = path&>
-  using _EnableIfPathable = __enable_if_t<__is_pathable<_SourceOrIter>::value, _Tp>;
+  using _EnableIfPathable _LIBCPP_NODEBUG = __enable_if_t<__is_pathable<_SourceOrIter>::value, _Tp>;
 
   template <class _Tp>
-  using _SourceChar = typename __is_pathable<_Tp>::__char_type;
+  using _SourceChar _LIBCPP_NODEBUG = typename __is_pathable<_Tp>::__char_type;
 
   template <class _Tp>
-  using _SourceCVT = _PathCVT<_SourceChar<_Tp> >;
+  using _SourceCVT _LIBCPP_NODEBUG = _PathCVT<_SourceChar<_Tp> >;
 
 public:
 #  if defined(_LIBCPP_WIN32API)
diff --git a/libcxx/include/__flat_map/flat_map.h b/libcxx/include/__flat_map/flat_map.h
index b66bc1c..9fe8425 100644
--- a/libcxx/include/__flat_map/flat_map.h
+++ b/libcxx/include/__flat_map/flat_map.h
@@ -90,7 +90,7 @@ class flat_map {
   static_assert(!is_same_v<_MappedContainer, std::vector<bool>>, "vector<bool> is not a sequence container");
 
   template <bool _Const>
-  using __iterator = __key_value_iterator<flat_map, _KeyContainer, _MappedContainer, _Const>;
+  using __iterator _LIBCPP_NODEBUG = __key_value_iterator<flat_map, _KeyContainer, _MappedContainer, _Const>;
 
 public:
   // types
diff --git a/libcxx/include/__flat_map/key_value_iterator.h b/libcxx/include/__flat_map/key_value_iterator.h
index 987ac67..06a23f3 100644
--- a/libcxx/include/__flat_map/key_value_iterator.h
+++ b/libcxx/include/__flat_map/key_value_iterator.h
@@ -41,9 +41,9 @@ _LIBCPP_BEGIN_NAMESPACE_STD
 template <class _Owner, class _KeyContainer, class _MappedContainer, bool _Const>
 struct __key_value_iterator {
 private:
-  using __key_iterator    = ranges::iterator_t<const _KeyContainer>;
-  using __mapped_iterator = ranges::iterator_t<__maybe_const<_Const, _MappedContainer>>;
-  using __reference       = _If<_Const, typename _Owner::const_reference, typename _Owner::reference>;
+  using __key_iterator _LIBCPP_NODEBUG    = ranges::iterator_t<const _KeyContainer>;
+  using __mapped_iterator _LIBCPP_NODEBUG = ranges::iterator_t<__maybe_const<_Const, _MappedContainer>>;
+  using __reference _LIBCPP_NODEBUG       = _If<_Const, typename _Owner::const_reference, typename _Owner::reference>;
 
   struct __arrow_proxy {
     __reference __ref_;
diff --git a/libcxx/include/__format/buffer.h b/libcxx/include/__format/buffer.h
index 618b8ef..9509f19 100644
--- a/libcxx/include/__format/buffer.h
+++ b/libcxx/include/__format/buffer.h
@@ -322,7 +322,7 @@ struct _LIBCPP_TEMPLATE_VIS __back_insert_iterator_container<back_insert_iterato
 template <class _Container>
 class _LIBCPP_TEMPLATE_VIS __writer_container {
 public:
-  using _CharT = typename _Container::value_type;
+  using _CharT _LIBCPP_NODEBUG = typename _Container::value_type;
 
   _LIBCPP_HIDE_FROM_ABI explicit __writer_container(back_insert_iterator<_Container> __out_it)
       : __container_{__out_it.__get_container()} {}
@@ -340,7 +340,7 @@ private:
 /// Selects the type of the writer used for the output iterator.
 template <class _OutIt, class _CharT>
 class _LIBCPP_TEMPLATE_VIS __writer_selector {
-  using _Container = typename __back_insert_iterator_container<_OutIt>::type;
+  using _Container _LIBCPP_NODEBUG = typename __back_insert_iterator_container<_OutIt>::type;
 
 public:
   using type =
@@ -355,7 +355,7 @@ public:
 template <class _OutIt, __fmt_char_type _CharT>
   requires(output_iterator<_OutIt, const _CharT&>)
 class _LIBCPP_TEMPLATE_VIS __format_buffer {
-  using _Storage =
+  using _Storage _LIBCPP_NODEBUG =
       conditional_t<__enable_direct_output<_OutIt, _CharT>, __direct_storage<_CharT>, __internal_storage<_CharT>>;
 
 public:
@@ -408,7 +408,7 @@ private:
 template <class _OutIt, __fmt_char_type _CharT, bool>
   requires(output_iterator<_OutIt, const _CharT&>)
 struct _LIBCPP_TEMPLATE_VIS __format_to_n_buffer_base {
-  using _Size = iter_difference_t<_OutIt>;
+  using _Size _LIBCPP_NODEBUG = iter_difference_t<_OutIt>;
 
 public:
   _LIBCPP_HIDE_FROM_ABI explicit __format_to_n_buffer_base(_OutIt __out_it, _Size __max_size)
@@ -438,7 +438,7 @@ protected:
 template <class _OutIt, __fmt_char_type _CharT>
   requires(output_iterator<_OutIt, const _CharT&>)
 class _LIBCPP_TEMPLATE_VIS __format_to_n_buffer_base<_OutIt, _CharT, true> {
-  using _Size = iter_difference_t<_OutIt>;
+  using _Size _LIBCPP_NODEBUG = iter_difference_t<_OutIt>;
 
 public:
   _LIBCPP_HIDE_FROM_ABI explicit __format_to_n_buffer_base(_OutIt __out_it, _Size __max_size)
@@ -489,8 +489,8 @@ template <class _OutIt, __fmt_char_type _CharT>
   requires(output_iterator<_OutIt, const _CharT&>)
 struct _LIBCPP_TEMPLATE_VIS __format_to_n_buffer final
     : public __format_to_n_buffer_base< _OutIt, _CharT, __enable_direct_output<_OutIt, _CharT>> {
-  using _Base = __format_to_n_buffer_base<_OutIt, _CharT, __enable_direct_output<_OutIt, _CharT>>;
-  using _Size = iter_difference_t<_OutIt>;
+  using _Base _LIBCPP_NODEBUG = __format_to_n_buffer_base<_OutIt, _CharT, __enable_direct_output<_OutIt, _CharT>>;
+  using _Size _LIBCPP_NODEBUG = iter_difference_t<_OutIt>;
 
 public:
   _LIBCPP_HIDE_FROM_ABI explicit __format_to_n_buffer(_OutIt __out_it, _Size __max_size)
@@ -523,7 +523,7 @@ public:
 // would lead to a circular include with formatter for vector<bool>.
 template <__fmt_char_type _CharT>
 class _LIBCPP_TEMPLATE_VIS __retarget_buffer {
-  using _Alloc = allocator<_CharT>;
+  using _Alloc _LIBCPP_NODEBUG = allocator<_CharT>;
 
 public:
   using value_type = _CharT;
diff --git a/libcxx/include/__format/concepts.h b/libcxx/include/__format/concepts.h
index 2c40e3e..28297c6 100644
--- a/libcxx/include/__format/concepts.h
+++ b/libcxx/include/__format/concepts.h
@@ -44,7 +44,7 @@ concept __fmt_char_type =
 // (Note testing for (w)format_context would be a valid choice, but requires
 // selecting the proper one depending on the type of _CharT.)
 template <class _CharT>
-using __fmt_iter_for = _CharT*;
+using __fmt_iter_for _LIBCPP_NODEBUG = _CharT*;
 
 template <class _Tp, class _Context, class _Formatter = typename _Context::template formatter_type<remove_const_t<_Tp>>>
 concept __formattable_with =
diff --git a/libcxx/include/__format/container_adaptor.h b/libcxx/include/__format/container_adaptor.h
index d3be2e1..48d42ee 100644
--- a/libcxx/include/__format/container_adaptor.h
+++ b/libcxx/include/__format/container_adaptor.h
@@ -37,8 +37,8 @@ _LIBCPP_BEGIN_NAMESPACE_STD
 template <class _Adaptor, class _CharT>
 struct _LIBCPP_TEMPLATE_VIS __formatter_container_adaptor {
 private:
-  using __maybe_const_container = __fmt_maybe_const<typename _Adaptor::container_type, _CharT>;
-  using __maybe_const_adaptor   = __maybe_const<is_const_v<__maybe_const_container>, _Adaptor>;
+  using __maybe_const_container _LIBCPP_NODEBUG = __fmt_maybe_const<typename _Adaptor::container_type, _CharT>;
+  using __maybe_const_adaptor _LIBCPP_NODEBUG   = __maybe_const<is_const_v<__maybe_const_container>, _Adaptor>;
   formatter<ranges::ref_view<__maybe_const_container>, _CharT> __underlying_;
 
 public:
diff --git a/libcxx/include/__format/format_arg.h b/libcxx/include/__format/format_arg.h
index a973ccd..1c530fd 100644
--- a/libcxx/include/__format/format_arg.h
+++ b/libcxx/include/__format/format_arg.h
@@ -208,7 +208,7 @@ _LIBCPP_HIDE_FROM_ABI _Rp __visit_format_arg(_Visitor&& __vis, basic_format_arg<
 /// separate arrays.
 template <class _Context>
 class __basic_format_arg_value {
-  using _CharT = typename _Context::char_type;
+  using _CharT _LIBCPP_NODEBUG = typename _Context::char_type;
 
 public:
   /// Contains the implementation for basic_format_arg::handle.
diff --git a/libcxx/include/__format/format_arg_store.h b/libcxx/include/__format/format_arg_store.h
index 8b2c95c..4c5ee9e 100644
--- a/libcxx/include/__format/format_arg_store.h
+++ b/libcxx/include/__format/format_arg_store.h
@@ -257,7 +257,7 @@ struct _LIBCPP_TEMPLATE_VIS __format_arg_store {
     }
   }
 
-  using _Storage =
+  using _Storage _LIBCPP_NODEBUG =
       conditional_t<__format::__use_packed_format_arg_store(sizeof...(_Args)),
                     __format::__packed_format_arg_store<_Context, sizeof...(_Args)>,
                     __format::__unpacked_format_arg_store<_Context, sizeof...(_Args)>>;
diff --git a/libcxx/include/__format/format_functions.h b/libcxx/include/__format/format_functions.h
index 3991363..b920be5 100644
--- a/libcxx/include/__format/format_functions.h
+++ b/libcxx/include/__format/format_functions.h
@@ -379,7 +379,7 @@ struct _LIBCPP_TEMPLATE_VIS basic_format_string {
 private:
   basic_string_view<_CharT> __str_;
 
-  using _Context = __format::__compile_time_basic_format_context<_CharT>;
+  using _Context _LIBCPP_NODEBUG = __format::__compile_time_basic_format_context<_CharT>;
 
   static constexpr array<__format::__arg_t, sizeof...(_Args)> __types_{
       __format::__determine_arg_t<_Context, remove_cvref_t<_Args>>()...};
diff --git a/libcxx/include/__format/formatter_floating_point.h b/libcxx/include/__format/formatter_floating_point.h
index e04fffb6..ac4be9b 100644
--- a/libcxx/include/__format/formatter_floating_point.h
+++ b/libcxx/include/__format/formatter_floating_point.h
@@ -141,7 +141,7 @@ struct __traits<double> {
 /// on the stack or the heap.
 template <floating_point _Fp>
 class _LIBCPP_TEMPLATE_VIS __float_buffer {
-  using _Traits = __traits<_Fp>;
+  using _Traits _LIBCPP_NODEBUG = __traits<_Fp>;
 
 public:
   // TODO FMT Improve this constructor to do a better estimate.
diff --git a/libcxx/include/__format/formatter_string.h b/libcxx/include/__format/formatter_string.h
index 826d642..30084e5 100644
--- a/libcxx/include/__format/formatter_string.h
+++ b/libcxx/include/__format/formatter_string.h
@@ -59,7 +59,7 @@ public:
 // Formatter const char*.
 template <__fmt_char_type _CharT>
 struct _LIBCPP_TEMPLATE_VIS formatter<const _CharT*, _CharT> : public __formatter_string<_CharT> {
-  using _Base = __formatter_string<_CharT>;
+  using _Base _LIBCPP_NODEBUG = __formatter_string<_CharT>;
 
   template <class _FormatContext>
   _LIBCPP_HIDE_FROM_ABI typename _FormatContext::iterator format(const _CharT* __str, _FormatContext& __ctx) const {
@@ -78,7 +78,7 @@ struct _LIBCPP_TEMPLATE_VIS formatter<const _CharT*, _CharT> : public __formatte
 // Formatter char*.
 template <__fmt_char_type _CharT>
 struct _LIBCPP_TEMPLATE_VIS formatter<_CharT*, _CharT> : public formatter<const _CharT*, _CharT> {
-  using _Base = formatter<const _CharT*, _CharT>;
+  using _Base _LIBCPP_NODEBUG = formatter<const _CharT*, _CharT>;
 
   template <class _FormatContext>
   _LIBCPP_HIDE_FROM_ABI typename _FormatContext::iterator format(_CharT* __str, _FormatContext& __ctx) const {
@@ -89,7 +89,7 @@ struct _LIBCPP_TEMPLATE_VIS formatter<_CharT*, _CharT> : public formatter<const
 // Formatter char[].
 template <__fmt_char_type _CharT, size_t _Size>
 struct _LIBCPP_TEMPLATE_VIS formatter<_CharT[_Size], _CharT> : public __formatter_string<_CharT> {
-  using _Base = __formatter_string<_CharT>;
+  using _Base _LIBCPP_NODEBUG = __formatter_string<_CharT>;
 
   template <class _FormatContext>
   _LIBCPP_HIDE_FROM_ABI typename _FormatContext::iterator
@@ -102,7 +102,7 @@ struct _LIBCPP_TEMPLATE_VIS formatter<_CharT[_Size], _CharT> : public __formatte
 template <__fmt_char_type _CharT, class _Traits, class _Allocator>
 struct _LIBCPP_TEMPLATE_VIS formatter<basic_string<_CharT, _Traits, _Allocator>, _CharT>
     : public __formatter_string<_CharT> {
-  using _Base = __formatter_string<_CharT>;
+  using _Base _LIBCPP_NODEBUG = __formatter_string<_CharT>;
 
   template <class _FormatContext>
   _LIBCPP_HIDE_FROM_ABI typename _FormatContext::iterator
@@ -115,7 +115,7 @@ struct _LIBCPP_TEMPLATE_VIS formatter<basic_string<_CharT, _Traits, _Allocator>,
 // Formatter std::string_view.
 template <__fmt_char_type _CharT, class _Traits>
 struct _LIBCPP_TEMPLATE_VIS formatter<basic_string_view<_CharT, _Traits>, _CharT> : public __formatter_string<_CharT> {
-  using _Base = __formatter_string<_CharT>;
+  using _Base _LIBCPP_NODEBUG = __formatter_string<_CharT>;
 
   template <class _FormatContext>
   _LIBCPP_HIDE_FROM_ABI typename _FormatContext::iterator
diff --git a/libcxx/include/__format/range_default_formatter.h b/libcxx/include/__format/range_default_formatter.h
index fb21b0f..bb4c520 100644
--- a/libcxx/include/__format/range_default_formatter.h
+++ b/libcxx/include/__format/range_default_formatter.h
@@ -40,7 +40,7 @@ concept __const_formattable_range =
     ranges::input_range<const _Rp> && formattable<ranges::range_reference_t<const _Rp>, _CharT>;
 
 template <class _Rp, class _CharT>
-using __fmt_maybe_const = conditional_t<__const_formattable_range<_Rp, _CharT>, const _Rp, _Rp>;
+using __fmt_maybe_const _LIBCPP_NODEBUG = conditional_t<__const_formattable_range<_Rp, _CharT>, const _Rp, _Rp>;
 
 _LIBCPP_DIAGNOSTIC_PUSH
 _LIBCPP_CLANG_DIAGNOSTIC_IGNORED("-Wshadow")
@@ -95,7 +95,7 @@ struct _LIBCPP_TEMPLATE_VIS __range_default_formatter;
 template <ranges::input_range _Rp, class _CharT>
 struct _LIBCPP_TEMPLATE_VIS __range_default_formatter<range_format::sequence, _Rp, _CharT> {
 private:
-  using __maybe_const_r = __fmt_maybe_const<_Rp, _CharT>;
+  using __maybe_const_r _LIBCPP_NODEBUG = __fmt_maybe_const<_Rp, _CharT>;
   range_formatter<remove_cvref_t<ranges::range_reference_t<__maybe_const_r>>, _CharT> __underlying_;
 
 public:
@@ -122,8 +122,8 @@ public:
 template <ranges::input_range _Rp, class _CharT>
 struct _LIBCPP_TEMPLATE_VIS __range_default_formatter<range_format::map, _Rp, _CharT> {
 private:
-  using __maybe_const_map = __fmt_maybe_const<_Rp, _CharT>;
-  using __element_type    = remove_cvref_t<ranges::range_reference_t<__maybe_const_map>>;
+  using __maybe_const_map _LIBCPP_NODEBUG = __fmt_maybe_const<_Rp, _CharT>;
+  using __element_type _LIBCPP_NODEBUG    = remove_cvref_t<ranges::range_reference_t<__maybe_const_map>>;
   range_formatter<__element_type, _CharT> __underlying_;
 
 public:
@@ -150,8 +150,8 @@ public:
 template <ranges::input_range _Rp, class _CharT>
 struct _LIBCPP_TEMPLATE_VIS __range_default_formatter<range_format::set, _Rp, _CharT> {
 private:
-  using __maybe_const_set = __fmt_maybe_const<_Rp, _CharT>;
-  using __element_type    = remove_cvref_t<ranges::range_reference_t<__maybe_const_set>>;
+  using __maybe_const_set _LIBCPP_NODEBUG = __fmt_maybe_const<_Rp, _CharT>;
+  using __element_type _LIBCPP_NODEBUG    = remove_cvref_t<ranges::range_reference_t<__maybe_const_set>>;
   range_formatter<__element_type, _CharT> __underlying_;
 
 public:
diff --git a/libcxx/include/__format/unicode.h b/libcxx/include/__format/unicode.h
index b4f22c7..46096fd 100644
--- a/libcxx/include/__format/unicode.h
+++ b/libcxx/include/__format/unicode.h
@@ -123,7 +123,7 @@ class __code_point_view;
 /// UTF-8 specialization.
 template <>
 class __code_point_view<char> {
-  using _Iterator = basic_string_view<char>::const_iterator;
+  using _Iterator _LIBCPP_NODEBUG = basic_string_view<char>::const_iterator;
 
 public:
   _LIBCPP_HIDE_FROM_ABI constexpr explicit __code_point_view(_Iterator __first, _Iterator __last)
@@ -249,7 +249,7 @@ _LIBCPP_HIDE_FROM_ABI constexpr bool __is_surrogate_pair_low(wchar_t __value) {
 /// - 4 UTF-32 (for example Linux)
 template <>
 class __code_point_view<wchar_t> {
-  using _Iterator = typename basic_string_view<wchar_t>::const_iterator;
+  using _Iterator _LIBCPP_NODEBUG = typename basic_string_view<wchar_t>::const_iterator;
 
 public:
   static_assert(sizeof(wchar_t) == 2 || sizeof(wchar_t) == 4, "sizeof(wchar_t) has a not implemented value");
@@ -300,8 +300,8 @@ private:
 // This implements the extended rules see
 // https://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries
 class __extended_grapheme_cluster_break {
-  using __EGC_property  = __extended_grapheme_custer_property_boundary::__property;
-  using __inCB_property = __indic_conjunct_break::__property;
+  using __EGC_property _LIBCPP_NODEBUG  = __extended_grapheme_custer_property_boundary::__property;
+  using __inCB_property _LIBCPP_NODEBUG = __indic_conjunct_break::__property;
 
 public:
   _LIBCPP_HIDE_FROM_ABI constexpr explicit __extended_grapheme_cluster_break(char32_t __first_code_point)
@@ -527,7 +527,7 @@ private:
 /// Therefore only this code point is extracted.
 template <class _CharT>
 class __extended_grapheme_cluster_view {
-  using _Iterator = typename basic_string_view<_CharT>::const_iterator;
+  using _Iterator _LIBCPP_NODEBUG = typename basic_string_view<_CharT>::const_iterator;
 
 public:
   _LIBCPP_HIDE_FROM_ABI constexpr explicit __extended_grapheme_cluster_view(_Iterator __first, _Iterator __last)
@@ -572,7 +572,7 @@ __extended_grapheme_cluster_view(_Iterator, _Iterator) -> __extended_grapheme_cl
 // This makes it easier to write code agnostic of the _LIBCPP_HAS_UNICODE define.
 template <class _CharT>
 class __code_point_view {
-  using _Iterator = typename basic_string_view<_CharT>::const_iterator;
+  using _Iterator _LIBCPP_NODEBUG = typename basic_string_view<_CharT>::const_iterator;
 
 public:
   _LIBCPP_HIDE_FROM_ABI constexpr explicit __code_point_view(_Iterator __first, _Iterator __last)
diff --git a/libcxx/include/__functional/binary_function.h b/libcxx/include/__functional/binary_function.h
index ddee3b1..bde8b03 100644
--- a/libcxx/include/__functional/binary_function.h
+++ b/libcxx/include/__functional/binary_function.h
@@ -42,11 +42,11 @@ struct __binary_function_keep_layout_base {
 _LIBCPP_DIAGNOSTIC_PUSH
 _LIBCPP_CLANG_DIAGNOSTIC_IGNORED("-Wdeprecated-declarations")
 template <class _Arg1, class _Arg2, class _Result>
-using __binary_function = binary_function<_Arg1, _Arg2, _Result>;
+using __binary_function _LIBCPP_NODEBUG = binary_function<_Arg1, _Arg2, _Result>;
 _LIBCPP_DIAGNOSTIC_POP
 #else
 template <class _Arg1, class _Arg2, class _Result>
-using __binary_function = __binary_function_keep_layout_base<_Arg1, _Arg2, _Result>;
+using __binary_function _LIBCPP_NODEBUG = __binary_function_keep_layout_base<_Arg1, _Arg2, _Result>;
 #endif
 
 _LIBCPP_END_NAMESPACE_STD
diff --git a/libcxx/include/__functional/bind.h b/libcxx/include/__functional/bind.h
index f82c151..e31ad29 100644
--- a/libcxx/include/__functional/bind.h
+++ b/libcxx/include/__functional/bind.h
@@ -198,7 +198,7 @@ __apply_functor(_Fp& __f, _BoundArgs& __bound_args, __tuple_indices<_Indx...>, _
 template <class _Fp, class... _BoundArgs>
 class __bind : public __weak_result_type<__decay_t<_Fp> > {
 protected:
-  using _Fd = __decay_t<_Fp>;
+  using _Fd _LIBCPP_NODEBUG = __decay_t<_Fp>;
   typedef tuple<__decay_t<_BoundArgs>...> _Td;
 
 private:
diff --git a/libcxx/include/__functional/boyer_moore_searcher.h b/libcxx/include/__functional/boyer_moore_searcher.h
index 52a58d5..1e49cc5 100644
--- a/libcxx/include/__functional/boyer_moore_searcher.h
+++ b/libcxx/include/__functional/boyer_moore_searcher.h
@@ -92,7 +92,7 @@ class _LIBCPP_TEMPLATE_VIS boyer_moore_searcher {
 private:
   using difference_type = typename std::iterator_traits<_RandomAccessIterator1>::difference_type;
   using value_type      = typename std::iterator_traits<_RandomAccessIterator1>::value_type;
-  using __skip_table_type =
+  using __skip_table_type _LIBCPP_NODEBUG =
       _BMSkipTable<value_type,
                    difference_type,
                    _Hash,
@@ -223,7 +223,7 @@ class _LIBCPP_TEMPLATE_VIS boyer_moore_horspool_searcher {
 private:
   using difference_type = typename iterator_traits<_RandomAccessIterator1>::difference_type;
   using value_type      = typename iterator_traits<_RandomAccessIterator1>::value_type;
-  using __skip_table_type =
+  using __skip_table_type _LIBCPP_NODEBUG =
       _BMSkipTable<value_type,
                    difference_type,
                    _Hash,
diff --git a/libcxx/include/__functional/function.h b/libcxx/include/__functional/function.h
index a421a3e..b483e8e 100644
--- a/libcxx/include/__functional/function.h
+++ b/libcxx/include/__functional/function.h
@@ -576,7 +576,7 @@ private:
 // Used to choose between perfect forwarding or pass-by-value. Pass-by-value is
 // faster for types that can be passed in registers.
 template <typename _Tp>
-using __fast_forward = __conditional_t<is_scalar<_Tp>::value, _Tp, _Tp&&>;
+using __fast_forward _LIBCPP_NODEBUG = __conditional_t<is_scalar<_Tp>::value, _Tp, _Tp&&>;
 
 // __policy_invoker calls an instance of __alloc_func held in __policy_storage.
 
@@ -847,7 +847,7 @@ class _LIBCPP_TEMPLATE_VIS function<_Rp(_ArgTypes...)>
   };
 
   template <class _Fp>
-  using _EnableIfLValueCallable = __enable_if_t<__callable<_Fp&>::value>;
+  using _EnableIfLValueCallable _LIBCPP_NODEBUG = __enable_if_t<__callable<_Fp&>::value>;
 
 public:
   typedef _Rp result_type;
diff --git a/libcxx/include/__functional/perfect_forward.h b/libcxx/include/__functional/perfect_forward.h
index 8fd68db..37c3d15 100644
--- a/libcxx/include/__functional/perfect_forward.h
+++ b/libcxx/include/__functional/perfect_forward.h
@@ -94,7 +94,7 @@ public:
 
 // __perfect_forward implements a perfect-forwarding call wrapper as explained in [func.require].
 template <class _Op, class... _Args>
-using __perfect_forward = __perfect_forward_impl<_Op, index_sequence_for<_Args...>, _Args...>;
+using __perfect_forward _LIBCPP_NODEBUG = __perfect_forward_impl<_Op, index_sequence_for<_Args...>, _Args...>;
 
 #endif // _LIBCPP_STD_VER >= 17
 
diff --git a/libcxx/include/__functional/unary_function.h b/libcxx/include/__functional/unary_function.h
index 69b1bc9..769ffc9 100644
--- a/libcxx/include/__functional/unary_function.h
+++ b/libcxx/include/__functional/unary_function.h
@@ -39,11 +39,11 @@ struct __unary_function_keep_layout_base {
 _LIBCPP_DIAGNOSTIC_PUSH
 _LIBCPP_CLANG_DIAGNOSTIC_IGNORED("-Wdeprecated-declarations")
 template <class _Arg, class _Result>
-using __unary_function = unary_function<_Arg, _Result>;
+using __unary_function _LIBCPP_NODEBUG = unary_function<_Arg, _Result>;
 _LIBCPP_DIAGNOSTIC_POP
 #else
 template <class _Arg, class _Result>
-using __unary_function = __unary_function_keep_layout_base<_Arg, _Result>;
+using __unary_function _LIBCPP_NODEBUG = __unary_function_keep_layout_base<_Arg, _Result>;
 #endif
 
 _LIBCPP_END_NAMESPACE_STD
diff --git a/libcxx/include/__fwd/memory.h b/libcxx/include/__fwd/memory.h
index b9e1518..56400099 100644
--- a/libcxx/include/__fwd/memory.h
+++ b/libcxx/include/__fwd/memory.h
@@ -20,6 +20,9 @@ _LIBCPP_BEGIN_NAMESPACE_STD
 template <class _Tp>
 class _LIBCPP_TEMPLATE_VIS allocator;
 
+template <class _Tp>
+class _LIBCPP_TEMPLATE_VIS shared_ptr;
+
 _LIBCPP_END_NAMESPACE_STD
 
 #endif // _LIBCPP___FWD_MEMORY_H
diff --git a/libcxx/include/__hash_table b/libcxx/include/__hash_table
index 9c821ea..7788f68 100644
--- a/libcxx/include/__hash_table
+++ b/libcxx/include/__hash_table
@@ -111,8 +111,8 @@ struct __hash_node_base {
 template <class _Tp, class _VoidPtr>
 struct __hash_node : public __hash_node_base< __rebind_pointer_t<_VoidPtr, __hash_node<_Tp, _VoidPtr> > > {
   typedef _Tp __node_value_type;
-  using _Base          = __hash_node_base<__rebind_pointer_t<_VoidPtr, __hash_node<_Tp, _VoidPtr> > >;
-  using __next_pointer = typename _Base::__next_pointer;
+  using _Base _LIBCPP_NODEBUG          = __hash_node_base<__rebind_pointer_t<_VoidPtr, __hash_node<_Tp, _VoidPtr> > >;
+  using __next_pointer _LIBCPP_NODEBUG = typename _Base::__next_pointer;
 
   size_t __hash_;
 
diff --git a/libcxx/include/__iterator/aliasing_iterator.h b/libcxx/include/__iterator/aliasing_iterator.h
index aeb5b4a..e011271 100644
--- a/libcxx/include/__iterator/aliasing_iterator.h
+++ b/libcxx/include/__iterator/aliasing_iterator.h
@@ -31,8 +31,8 @@ struct __aliasing_iterator_wrapper {
   class __iterator {
     _BaseIter __base_ = nullptr;
 
-    using __iter_traits     = iterator_traits<_BaseIter>;
-    using __base_value_type = typename __iter_traits::value_type;
+    using __iter_traits _LIBCPP_NODEBUG     = iterator_traits<_BaseIter>;
+    using __base_value_type _LIBCPP_NODEBUG = typename __iter_traits::value_type;
 
     static_assert(__has_random_access_iterator_category<_BaseIter>::value,
                   "The base iterator has to be a random access iterator!");
@@ -120,7 +120,7 @@ struct __aliasing_iterator_wrapper {
 
 // This is required to avoid ADL instantiations on _BaseT
 template <class _BaseT, class _Alias>
-using __aliasing_iterator = typename __aliasing_iterator_wrapper<_BaseT, _Alias>::__iterator;
+using __aliasing_iterator _LIBCPP_NODEBUG = typename __aliasing_iterator_wrapper<_BaseT, _Alias>::__iterator;
 
 _LIBCPP_END_NAMESPACE_STD
 
diff --git a/libcxx/include/__iterator/concepts.h b/libcxx/include/__iterator/concepts.h
index 1c22793..6e5ac1d 100644
--- a/libcxx/include/__iterator/concepts.h
+++ b/libcxx/include/__iterator/concepts.h
@@ -67,10 +67,10 @@ template <class _In>
 concept indirectly_readable = __indirectly_readable_impl<remove_cvref_t<_In>>;
 
 template <class _Tp>
-using __projected_iterator_t = typename _Tp::__projected_iterator;
+using __projected_iterator_t _LIBCPP_NODEBUG = typename _Tp::__projected_iterator;
 
 template <class _Tp>
-using __projected_projection_t = typename _Tp::__projected_projection;
+using __projected_projection_t _LIBCPP_NODEBUG = typename _Tp::__projected_projection;
 
 template <class _Tp>
 concept __specialization_of_projected = requires {
@@ -89,7 +89,7 @@ struct __indirect_value_t_impl<_Tp> {
 };
 
 template <indirectly_readable _Tp>
-using __indirect_value_t = typename __indirect_value_t_impl<_Tp>::type;
+using __indirect_value_t _LIBCPP_NODEBUG = typename __indirect_value_t_impl<_Tp>::type;
 
 template <indirectly_readable _Tp>
 using iter_common_reference_t = common_reference_t<iter_reference_t<_Tp>, __indirect_value_t<_Tp>>;
@@ -274,7 +274,7 @@ concept indirectly_copyable_storable =
 #endif // _LIBCPP_STD_VER >= 20
 
 template <class _Tp>
-using __has_random_access_iterator_category_or_concept
+using __has_random_access_iterator_category_or_concept _LIBCPP_NODEBUG
 #if _LIBCPP_STD_VER >= 20
     = integral_constant<bool, random_access_iterator<_Tp>>;
 #else  // _LIBCPP_STD_VER < 20
diff --git a/libcxx/include/__iterator/insert_iterator.h b/libcxx/include/__iterator/insert_iterator.h
index b331104..e0ee0ce 100644
--- a/libcxx/include/__iterator/insert_iterator.h
+++ b/libcxx/include/__iterator/insert_iterator.h
@@ -29,10 +29,10 @@ _LIBCPP_BEGIN_NAMESPACE_STD
 
 #if _LIBCPP_STD_VER >= 20
 template <class _Container>
-using __insert_iterator_iter_t = ranges::iterator_t<_Container>;
+using __insert_iterator_iter_t _LIBCPP_NODEBUG = ranges::iterator_t<_Container>;
 #else
 template <class _Container>
-using __insert_iterator_iter_t = typename _Container::iterator;
+using __insert_iterator_iter_t _LIBCPP_NODEBUG = typename _Container::iterator;
 #endif
 
 _LIBCPP_SUPPRESS_DEPRECATED_PUSH
diff --git a/libcxx/include/__iterator/iterator_traits.h b/libcxx/include/__iterator/iterator_traits.h
index eb6ba8b6..db68dd2 100644
--- a/libcxx/include/__iterator/iterator_traits.h
+++ b/libcxx/include/__iterator/iterator_traits.h
@@ -47,7 +47,7 @@ _LIBCPP_BEGIN_NAMESPACE_STD
 #if _LIBCPP_STD_VER >= 20
 
 template <class _Tp>
-using __with_reference = _Tp&;
+using __with_reference _LIBCPP_NODEBUG = _Tp&;
 
 template <class _Tp>
 concept __can_reference = requires { typename __with_reference<_Tp>; };
@@ -80,19 +80,20 @@ struct __iter_traits_cache {
   using type = _If< __is_primary_template<iterator_traits<_Iter> >::value, _Iter, iterator_traits<_Iter> >;
 };
 template <class _Iter>
-using _ITER_TRAITS = typename __iter_traits_cache<_Iter>::type;
+using _ITER_TRAITS _LIBCPP_NODEBUG = typename __iter_traits_cache<_Iter>::type;
 
 struct __iter_concept_concept_test {
   template <class _Iter>
-  using _Apply = typename _ITER_TRAITS<_Iter>::iterator_concept;
+  using _Apply _LIBCPP_NODEBUG = typename _ITER_TRAITS<_Iter>::iterator_concept;
 };
 struct __iter_concept_category_test {
   template <class _Iter>
-  using _Apply = typename _ITER_TRAITS<_Iter>::iterator_category;
+  using _Apply _LIBCPP_NODEBUG = typename _ITER_TRAITS<_Iter>::iterator_category;
 };
 struct __iter_concept_random_fallback {
   template <class _Iter>
-  using _Apply = __enable_if_t< __is_primary_template<iterator_traits<_Iter> >::value, random_access_iterator_tag >;
+  using _Apply _LIBCPP_NODEBUG =
+      __enable_if_t<__is_primary_template<iterator_traits<_Iter> >::value, random_access_iterator_tag>;
 };
 
 template <class _Iter, class _Tester>
@@ -106,7 +107,7 @@ struct __iter_concept_cache {
 };
 
 template <class _Iter>
-using _ITER_CONCEPT = typename __iter_concept_cache<_Iter>::type::template _Apply<_Iter>;
+using _ITER_CONCEPT _LIBCPP_NODEBUG = typename __iter_concept_cache<_Iter>::type::template _Apply<_Iter>;
 
 template <class _Tp>
 struct __has_iterator_typedefs {
@@ -364,7 +365,7 @@ struct __iterator_traits<_Ip> {
 
 template <class _Ip>
 struct iterator_traits : __iterator_traits<_Ip> {
-  using __primary_template = iterator_traits;
+  using __primary_template _LIBCPP_NODEBUG = iterator_traits;
 };
 
 #else  // _LIBCPP_STD_VER >= 20
@@ -397,7 +398,7 @@ struct __iterator_traits<_Iter, true>
 
 template <class _Iter>
 struct _LIBCPP_TEMPLATE_VIS iterator_traits : __iterator_traits<_Iter, __has_iterator_typedefs<_Iter>::value> {
-  using __primary_template = iterator_traits;
+  using __primary_template _LIBCPP_NODEBUG = iterator_traits;
 };
 #endif // _LIBCPP_STD_VER >= 20
 
@@ -430,16 +431,19 @@ template <class _Tp, class _Up>
 struct __has_iterator_concept_convertible_to<_Tp, _Up, false> : false_type {};
 
 template <class _Tp>
-using __has_input_iterator_category = __has_iterator_category_convertible_to<_Tp, input_iterator_tag>;
+using __has_input_iterator_category _LIBCPP_NODEBUG = __has_iterator_category_convertible_to<_Tp, input_iterator_tag>;
 
 template <class _Tp>
-using __has_forward_iterator_category = __has_iterator_category_convertible_to<_Tp, forward_iterator_tag>;
+using __has_forward_iterator_category _LIBCPP_NODEBUG =
+    __has_iterator_category_convertible_to<_Tp, forward_iterator_tag>;
 
 template <class _Tp>
-using __has_bidirectional_iterator_category = __has_iterator_category_convertible_to<_Tp, bidirectional_iterator_tag>;
+using __has_bidirectional_iterator_category _LIBCPP_NODEBUG =
+    __has_iterator_category_convertible_to<_Tp, bidirectional_iterator_tag>;
 
 template <class _Tp>
-using __has_random_access_iterator_category = __has_iterator_category_convertible_to<_Tp, random_access_iterator_tag>;
+using __has_random_access_iterator_category _LIBCPP_NODEBUG =
+    __has_iterator_category_convertible_to<_Tp, random_access_iterator_tag>;
 
 // __libcpp_is_contiguous_iterator determines if an iterator is known by
 // libc++ to be contiguous, either because it advertises itself as such
@@ -466,48 +470,49 @@ template <class _Iter>
 class __wrap_iter;
 
 template <class _Tp>
-using __has_exactly_input_iterator_category =
+using __has_exactly_input_iterator_category _LIBCPP_NODEBUG =
     integral_constant<bool,
                       __has_iterator_category_convertible_to<_Tp, input_iterator_tag>::value &&
                           !__has_iterator_category_convertible_to<_Tp, forward_iterator_tag>::value>;
 
 template <class _Tp>
-using __has_exactly_forward_iterator_category =
+using __has_exactly_forward_iterator_category _LIBCPP_NODEBUG =
     integral_constant<bool,
                       __has_iterator_category_convertible_to<_Tp, forward_iterator_tag>::value &&
                           !__has_iterator_category_convertible_to<_Tp, bidirectional_iterator_tag>::value>;
 
 template <class _Tp>
-using __has_exactly_bidirectional_iterator_category =
+using __has_exactly_bidirectional_iterator_category _LIBCPP_NODEBUG =
     integral_constant<bool,
                       __has_iterator_category_convertible_to<_Tp, bidirectional_iterator_tag>::value &&
                           !__has_iterator_category_convertible_to<_Tp, random_access_iterator_tag>::value>;
 
 template <class _InputIterator>
-using __iter_value_type = typename iterator_traits<_InputIterator>::value_type;
+using __iter_value_type _LIBCPP_NODEBUG = typename iterator_traits<_InputIterator>::value_type;
 
 template <class _InputIterator>
-using __iter_key_type = __remove_const_t<typename iterator_traits<_InputIterator>::value_type::first_type>;
+using __iter_key_type _LIBCPP_NODEBUG =
+    __remove_const_t<typename iterator_traits<_InputIterator>::value_type::first_type>;
 
 template <class _InputIterator>
-using __iter_mapped_type = typename iterator_traits<_InputIterator>::value_type::second_type;
+using __iter_mapped_type _LIBCPP_NODEBUG = typename iterator_traits<_InputIterator>::value_type::second_type;
 
 template <class _InputIterator>
-using __iter_to_alloc_type =
+using __iter_to_alloc_type _LIBCPP_NODEBUG =
     pair<const typename iterator_traits<_InputIterator>::value_type::first_type,
          typename iterator_traits<_InputIterator>::value_type::second_type>;
 
 template <class _Iter>
-using __iterator_category_type = typename iterator_traits<_Iter>::iterator_category;
+using __iterator_category_type _LIBCPP_NODEBUG = typename iterator_traits<_Iter>::iterator_category;
 
 template <class _Iter>
-using __iterator_pointer_type = typename iterator_traits<_Iter>::pointer;
+using __iterator_pointer_type _LIBCPP_NODEBUG = typename iterator_traits<_Iter>::pointer;
 
 template <class _Iter>
-using __iter_diff_t = typename iterator_traits<_Iter>::difference_type;
+using __iter_diff_t _LIBCPP_NODEBUG = typename iterator_traits<_Iter>::difference_type;
 
 template <class _Iter>
-using __iter_reference = typename iterator_traits<_Iter>::reference;
+using __iter_reference _LIBCPP_NODEBUG = typename iterator_traits<_Iter>::reference;
 
 #if _LIBCPP_STD_VER >= 20
 
diff --git a/libcxx/include/__iterator/projected.h b/libcxx/include/__iterator/projected.h
index 1c560ec..d12f016 100644
--- a/libcxx/include/__iterator/projected.h
+++ b/libcxx/include/__iterator/projected.h
@@ -26,9 +26,9 @@ _LIBCPP_BEGIN_NAMESPACE_STD
 template <class _It, class _Proj>
 struct __projected_impl {
   struct __type {
-    using __primary_template     = __type;
-    using __projected_iterator   = _It;
-    using __projected_projection = _Proj;
+    using __primary_template _LIBCPP_NODEBUG     = __type;
+    using __projected_iterator _LIBCPP_NODEBUG   = _It;
+    using __projected_projection _LIBCPP_NODEBUG = _Proj;
 
     using value_type = remove_cvref_t<indirect_result_t<_Proj&, _It>>;
     indirect_result_t<_Proj&, _It> operator*() const; // not defined
@@ -38,9 +38,9 @@ struct __projected_impl {
 template <weakly_incrementable _It, class _Proj>
 struct __projected_impl<_It, _Proj> {
   struct __type {
-    using __primary_template     = __type;
-    using __projected_iterator   = _It;
-    using __projected_projection = _Proj;
+    using __primary_template _LIBCPP_NODEBUG     = __type;
+    using __projected_iterator _LIBCPP_NODEBUG   = _It;
+    using __projected_projection _LIBCPP_NODEBUG = _Proj;
 
     using value_type      = remove_cvref_t<indirect_result_t<_Proj&, _It>>;
     using difference_type = iter_difference_t<_It>;
diff --git a/libcxx/include/__iterator/ranges_iterator_traits.h b/libcxx/include/__iterator/ranges_iterator_traits.h
index 859e708..9a31b65 100644
--- a/libcxx/include/__iterator/ranges_iterator_traits.h
+++ b/libcxx/include/__iterator/ranges_iterator_traits.h
@@ -24,13 +24,13 @@ _LIBCPP_BEGIN_NAMESPACE_STD
 #if _LIBCPP_STD_VER >= 23
 
 template <ranges::input_range _Range>
-using __range_key_type = __remove_const_t<typename ranges::range_value_t<_Range>::first_type>;
+using __range_key_type _LIBCPP_NODEBUG = __remove_const_t<typename ranges::range_value_t<_Range>::first_type>;
 
 template <ranges::input_range _Range>
-using __range_mapped_type = typename ranges::range_value_t<_Range>::second_type;
+using __range_mapped_type _LIBCPP_NODEBUG = typename ranges::range_value_t<_Range>::second_type;
 
 template <ranges::input_range _Range>
-using __range_to_alloc_type =
+using __range_to_alloc_type _LIBCPP_NODEBUG =
     pair<const typename ranges::range_value_t<_Range>::first_type, typename ranges::range_value_t<_Range>::second_type>;
 
 #endif
diff --git a/libcxx/include/__iterator/reverse_iterator.h b/libcxx/include/__iterator/reverse_iterator.h
index 5e88d86..5bd1f86 100644
--- a/libcxx/include/__iterator/reverse_iterator.h
+++ b/libcxx/include/__iterator/reverse_iterator.h
@@ -329,8 +329,8 @@ __reverse_range(_Range&& __range) {
 
 template <class _Iter, bool __b>
 struct __unwrap_iter_impl<reverse_iterator<reverse_iterator<_Iter> >, __b> {
-  using _UnwrappedIter  = decltype(__unwrap_iter_impl<_Iter>::__unwrap(std::declval<_Iter>()));
-  using _ReverseWrapper = reverse_iterator<reverse_iterator<_Iter> >;
+  using _UnwrappedIter _LIBCPP_NODEBUG  = decltype(__unwrap_iter_impl<_Iter>::__unwrap(std::declval<_Iter>()));
+  using _ReverseWrapper _LIBCPP_NODEBUG = reverse_iterator<reverse_iterator<_Iter> >;
 
   static _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR _ReverseWrapper
   __rewrap(_ReverseWrapper __orig_iter, _UnwrappedIter __unwrapped_iter) {
diff --git a/libcxx/include/__iterator/segmented_iterator.h b/libcxx/include/__iterator/segmented_iterator.h
index 8cb54a3..7a8e1ad 100644
--- a/libcxx/include/__iterator/segmented_iterator.h
+++ b/libcxx/include/__iterator/segmented_iterator.h
@@ -72,7 +72,7 @@ template <class _Tp>
 struct __has_specialization<_Tp, sizeof(_Tp) * 0> : true_type {};
 
 template <class _Iterator>
-using __is_segmented_iterator = __has_specialization<__segmented_iterator_traits<_Iterator> >;
+using __is_segmented_iterator _LIBCPP_NODEBUG = __has_specialization<__segmented_iterator_traits<_Iterator> >;
 
 _LIBCPP_END_NAMESPACE_STD
 
diff --git a/libcxx/include/__locale b/libcxx/include/__locale
index b675e01..94dc8a0 100644
--- a/libcxx/include/__locale
+++ b/libcxx/include/__locale
@@ -50,7 +50,7 @@ _LIBCPP_HIDE_FROM_ABI const _Facet& use_facet(const locale&);
 class _LIBCPP_EXPORTED_FROM_ABI locale {
 public:
   // locale is essentially a shared_ptr that doesn't support weak_ptrs and never got a move constructor.
-  using __trivially_relocatable = locale;
+  using __trivially_relocatable _LIBCPP_NODEBUG = locale;
 
   // types:
   class _LIBCPP_EXPORTED_FROM_ABI facet;
diff --git a/libcxx/include/__locale_dir/locale_base_api.h b/libcxx/include/__locale_dir/locale_base_api.h
index c8097be..9957429 100644
--- a/libcxx/include/__locale_dir/locale_base_api.h
+++ b/libcxx/include/__locale_dir/locale_base_api.h
@@ -135,7 +135,7 @@ namespace __locale {
 //
 // Locale management
 //
-using __locale_t = locale_t;
+using __locale_t _LIBCPP_NODEBUG = locale_t;
 
 inline _LIBCPP_HIDE_FROM_ABI __locale_t __newlocale(int __category_mask, const char* __name, __locale_t __loc) {
   return newlocale(__category_mask, __name, __loc);
diff --git a/libcxx/include/__mdspan/extents.h b/libcxx/include/__mdspan/extents.h
index edbc30a..65a6977 100644
--- a/libcxx/include/__mdspan/extents.h
+++ b/libcxx/include/__mdspan/extents.h
@@ -129,14 +129,14 @@ private:
   // Static values member
   static constexpr size_t __size_         = sizeof...(_Values);
   static constexpr size_t __size_dynamic_ = ((_Values == _DynTag) + ... + 0);
-  using _StaticValues                     = __static_array<_TStatic, _Values...>;
-  using _DynamicValues                    = __possibly_empty_array<_TDynamic, __size_dynamic_>;
+  using _StaticValues _LIBCPP_NODEBUG     = __static_array<_TStatic, _Values...>;
+  using _DynamicValues _LIBCPP_NODEBUG    = __possibly_empty_array<_TDynamic, __size_dynamic_>;
 
   // Dynamic values member
   _LIBCPP_NO_UNIQUE_ADDRESS _DynamicValues __dyn_vals_;
 
   // static mapping of indices to the position in the dynamic values array
-  using _DynamicIdxMap = __static_partial_sums<static_cast<size_t>(_Values == _DynTag)...>;
+  using _DynamicIdxMap _LIBCPP_NODEBUG = __static_partial_sums<static_cast<size_t>(_Values == _DynTag)...>;
 
   template <size_t... _Indices>
   _LIBCPP_HIDE_FROM_ABI static constexpr _DynamicValues __zeros(index_sequence<_Indices...>) noexcept {
@@ -292,7 +292,8 @@ private:
   static constexpr rank_type __rank_dynamic_ = ((_Extents == dynamic_extent) + ... + 0);
 
   // internal storage type using __maybe_static_array
-  using _Values = __mdspan_detail::__maybe_static_array<_IndexType, size_t, dynamic_extent, _Extents...>;
+  using _Values _LIBCPP_NODEBUG =
+      __mdspan_detail::__maybe_static_array<_IndexType, size_t, dynamic_extent, _Extents...>;
   [[no_unique_address]] _Values __vals_;
 
 public:
diff --git a/libcxx/include/__memory/allocation_guard.h b/libcxx/include/__memory/allocation_guard.h
index 66d6a50..66edcd9 100644
--- a/libcxx/include/__memory/allocation_guard.h
+++ b/libcxx/include/__memory/allocation_guard.h
@@ -45,8 +45,8 @@ _LIBCPP_BEGIN_NAMESPACE_STD
 // custom allocator.
 template <class _Alloc>
 struct __allocation_guard {
-  using _Pointer = typename allocator_traits<_Alloc>::pointer;
-  using _Size    = typename allocator_traits<_Alloc>::size_type;
+  using _Pointer _LIBCPP_NODEBUG = typename allocator_traits<_Alloc>::pointer;
+  using _Size _LIBCPP_NODEBUG    = typename allocator_traits<_Alloc>::size_type;
 
   template <class _AllocT> // we perform the allocator conversion inside the constructor
   _LIBCPP_HIDE_FROM_ABI explicit __allocation_guard(_AllocT __alloc, _Size __n)
diff --git a/libcxx/include/__memory/pointer_traits.h b/libcxx/include/__memory/pointer_traits.h
index e35cfb7..afe3d1b 100644
--- a/libcxx/include/__memory/pointer_traits.h
+++ b/libcxx/include/__memory/pointer_traits.h
@@ -176,10 +176,10 @@ public:
 
 #ifndef _LIBCPP_CXX03_LANG
 template <class _From, class _To>
-using __rebind_pointer_t = typename pointer_traits<_From>::template rebind<_To>;
+using __rebind_pointer_t _LIBCPP_NODEBUG = typename pointer_traits<_From>::template rebind<_To>;
 #else
 template <class _From, class _To>
-using __rebind_pointer_t = typename pointer_traits<_From>::template rebind<_To>::other;
+using __rebind_pointer_t _LIBCPP_NODEBUG = typename pointer_traits<_From>::template rebind<_To>::other;
 #endif
 
 // to_address
@@ -276,7 +276,7 @@ struct __pointer_of<_Tp> {
 };
 
 template <typename _Tp>
-using __pointer_of_t = typename __pointer_of<_Tp>::type;
+using __pointer_of_t _LIBCPP_NODEBUG = typename __pointer_of<_Tp>::type;
 
 template <class _Tp, class _Up>
 struct __pointer_of_or {
@@ -290,7 +290,7 @@ struct __pointer_of_or<_Tp, _Up> {
 };
 
 template <typename _Tp, typename _Up>
-using __pointer_of_or_t = typename __pointer_of_or<_Tp, _Up>::type;
+using __pointer_of_or_t _LIBCPP_NODEBUG = typename __pointer_of_or<_Tp, _Up>::type;
 
 template <class _Smart>
 concept __resettable_smart_pointer = requires(_Smart __s) { __s.reset(); };
diff --git a/libcxx/include/__memory/shared_ptr.h b/libcxx/include/__memory/shared_ptr.h
index 97e4031..06b1fc4 100644
--- a/libcxx/include/__memory/shared_ptr.h
+++ b/libcxx/include/__memory/shared_ptr.h
@@ -141,7 +141,7 @@ struct __for_overwrite_tag {};
 
 template <class _Tp, class _Alloc>
 struct __shared_ptr_emplace : __shared_weak_count {
-  using __value_type = __remove_cv_t<_Tp>;
+  using __value_type _LIBCPP_NODEBUG = __remove_cv_t<_Tp>;
 
   template <class... _Args,
             class _Allocator                                                                         = _Alloc,
@@ -293,7 +293,8 @@ struct __shared_ptr_deleter_ctor_reqs {
 };
 
 template <class _Dp>
-using __shared_ptr_nullptr_deleter_ctor_reqs = _And<is_move_constructible<_Dp>, __well_formed_deleter<_Dp, nullptr_t> >;
+using __shared_ptr_nullptr_deleter_ctor_reqs _LIBCPP_NODEBUG =
+    _And<is_move_constructible<_Dp>, __well_formed_deleter<_Dp, nullptr_t> >;
 
 #if defined(_LIBCPP_ABI_ENABLE_SHARED_PTR_TRIVIAL_ABI)
 #  define _LIBCPP_SHARED_PTR_TRIVIAL_ABI __attribute__((__trivial_abi__))
@@ -315,7 +316,7 @@ public:
 
   // A shared_ptr contains only two raw pointers which point to the heap and move constructing already doesn't require
   // any bookkeeping, so it's always trivially relocatable.
-  using __trivially_relocatable = shared_ptr;
+  using __trivially_relocatable _LIBCPP_NODEBUG = shared_ptr;
 
 private:
   element_type* __ptr_;
@@ -1210,7 +1211,7 @@ public:
 
   // A weak_ptr contains only two raw pointers which point to the heap and move constructing already doesn't require
   // any bookkeeping, so it's always trivially relocatable.
-  using __trivially_relocatable = weak_ptr;
+  using __trivially_relocatable _LIBCPP_NODEBUG = weak_ptr;
 
 private:
   element_type* __ptr_;
diff --git a/libcxx/include/__memory/unique_ptr.h b/libcxx/include/__memory/unique_ptr.h
index 2368f7b..29d391f 100644
--- a/libcxx/include/__memory/unique_ptr.h
+++ b/libcxx/include/__memory/unique_ptr.h
@@ -153,7 +153,7 @@ public:
   //
   // This unique_ptr implementation only contains a pointer to the unique object and a deleter, so there are no
   // references to itself. This means that the entire structure is trivially relocatable if its members are.
-  using __trivially_relocatable = __conditional_t<
+  using __trivially_relocatable _LIBCPP_NODEBUG = __conditional_t<
       __libcpp_is_trivially_relocatable<pointer>::value && __libcpp_is_trivially_relocatable<deleter_type>::value,
       unique_ptr,
       void>;
@@ -189,7 +189,7 @@ private:
                      (!is_reference<_Dp>::value && is_convertible<_UDel, _Dp>::value) >;
 
   template <class _UDel>
-  using _EnableIfDeleterAssignable = __enable_if_t< is_assignable<_Dp&, _UDel&&>::value >;
+  using _EnableIfDeleterAssignable _LIBCPP_NODEBUG = __enable_if_t< is_assignable<_Dp&, _UDel&&>::value >;
 
 public:
   template <bool _Dummy = true, class = _EnableIfDeleterDefaultConstructible<_Dummy> >
@@ -419,7 +419,7 @@ public:
   //
   // This unique_ptr implementation only contains a pointer to the unique object and a deleter, so there are no
   // references to itself. This means that the entire structure is trivially relocatable if its members are.
-  using __trivially_relocatable = __conditional_t<
+  using __trivially_relocatable _LIBCPP_NODEBUG = __conditional_t<
       __libcpp_is_trivially_relocatable<pointer>::value && __libcpp_is_trivially_relocatable<deleter_type>::value,
       unique_ptr,
       void>;
@@ -430,9 +430,9 @@ private:
 
   _LIBCPP_COMPRESSED_PAIR(pointer, __ptr_, deleter_type, __deleter_);
 #ifdef _LIBCPP_ABI_BOUNDED_UNIQUE_PTR
-  using _BoundsChecker = __unique_ptr_array_bounds_stored;
+  using _BoundsChecker _LIBCPP_NODEBUG = __unique_ptr_array_bounds_stored;
 #else
-  using _BoundsChecker = __unique_ptr_array_bounds_stateless;
+  using _BoundsChecker _LIBCPP_NODEBUG = __unique_ptr_array_bounds_stateless;
 #endif
   _LIBCPP_NO_UNIQUE_ADDRESS _BoundsChecker __checker_;
 
diff --git a/libcxx/include/__memory/unique_temporary_buffer.h b/libcxx/include/__memory/unique_temporary_buffer.h
index ca62923..dea7fa8 100644
--- a/libcxx/include/__memory/unique_temporary_buffer.h
+++ b/libcxx/include/__memory/unique_temporary_buffer.h
@@ -45,7 +45,7 @@ struct __temporary_buffer_deleter {
 };
 
 template <class _Tp>
-using __unique_temporary_buffer = unique_ptr<_Tp, __temporary_buffer_deleter<_Tp> >;
+using __unique_temporary_buffer _LIBCPP_NODEBUG = unique_ptr<_Tp, __temporary_buffer_deleter<_Tp> >;
 
 template <class _Tp>
 inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_NO_CFI _LIBCPP_CONSTEXPR_SINCE_CXX23 __unique_temporary_buffer<_Tp>
diff --git a/libcxx/include/__node_handle b/libcxx/include/__node_handle
index d0b35bf..8f32f2d 100644
--- a/libcxx/include/__node_handle
+++ b/libcxx/include/__node_handle
@@ -188,10 +188,10 @@ struct __map_node_handle_specifics {
 };
 
 template <class _NodeType, class _Alloc>
-using __set_node_handle = __basic_node_handle< _NodeType, _Alloc, __set_node_handle_specifics>;
+using __set_node_handle _LIBCPP_NODEBUG = __basic_node_handle< _NodeType, _Alloc, __set_node_handle_specifics>;
 
 template <class _NodeType, class _Alloc>
-using __map_node_handle = __basic_node_handle< _NodeType, _Alloc, __map_node_handle_specifics>;
+using __map_node_handle _LIBCPP_NODEBUG = __basic_node_handle< _NodeType, _Alloc, __map_node_handle_specifics>;
 
 template <class _Iterator, class _NodeType>
 struct _LIBCPP_TEMPLATE_VIS __insert_return_type {
diff --git a/libcxx/include/__ostream/basic_ostream.h b/libcxx/include/__ostream/basic_ostream.h
index 6d24171..9722647 100644
--- a/libcxx/include/__ostream/basic_ostream.h
+++ b/libcxx/include/__ostream/basic_ostream.h
@@ -14,7 +14,7 @@
 #if _LIBCPP_HAS_LOCALIZATION
 
 #  include <__exception/operations.h>
-#  include <__memory/shared_ptr.h>
+#  include <__fwd/memory.h>
 #  include <__memory/unique_ptr.h>
 #  include <__new/exceptions.h>
 #  include <__ostream/put_character_sequence.h>
@@ -88,6 +88,55 @@ public:
     return *this;
   }
 
+  template <class _Tp>
+  _LIBCPP_HIDE_FROM_ABI basic_ostream& __put_num(_Tp __value) {
+#  if _LIBCPP_HAS_EXCEPTIONS
+    try {
+#  endif // _LIBCPP_HAS_EXCEPTIONS
+      sentry __s(*this);
+      if (__s) {
+        using _Fp          = num_put<char_type, ostreambuf_iterator<char_type, traits_type> >;
+        const _Fp& __facet = std::use_facet<_Fp>(this->getloc());
+        if (__facet.put(*this, *this, this->fill(), __value).failed())
+          this->setstate(ios_base::badbit | ios_base::failbit);
+      }
+#  if _LIBCPP_HAS_EXCEPTIONS
+    } catch (...) {
+      this->__set_badbit_and_consider_rethrow();
+    }
+#  endif // _LIBCPP_HAS_EXCEPTIONS
+    return *this;
+  }
+
+  template <class _Tp>
+  _LIBCPP_HIDE_FROM_ABI basic_ostream& __put_num_integer_promote(_Tp __value) {
+#  if _LIBCPP_HAS_EXCEPTIONS
+    try {
+#  endif // _LIBCPP_HAS_EXCEPTIONS
+      sentry __s(*this);
+      if (__s) {
+        ios_base::fmtflags __flags = ios_base::flags() & ios_base::basefield;
+
+        using _Fp          = num_put<char_type, ostreambuf_iterator<char_type, traits_type> >;
+        const _Fp& __facet = std::use_facet<_Fp>(this->getloc());
+        if (__facet
+                .put(*this,
+                     *this,
+                     this->fill(),
+                     __flags == ios_base::oct || __flags == ios_base::hex
+                         ? static_cast<__copy_unsigned_t<_Tp, long> >(std::__to_unsigned_like(__value))
+                         : static_cast<__copy_unsigned_t<_Tp, long> >(__value))
+                .failed())
+          this->setstate(ios_base::badbit | ios_base::failbit);
+      }
+#  if _LIBCPP_HAS_EXCEPTIONS
+    } catch (...) {
+      this->__set_badbit_and_consider_rethrow();
+    }
+#  endif // _LIBCPP_HAS_EXCEPTIONS
+    return *this;
+  }
+
   basic_ostream& operator<<(bool __n);
   basic_ostream& operator<<(short __n);
   basic_ostream& operator<<(unsigned short __n);
@@ -225,276 +274,67 @@ basic_ostream<_CharT, _Traits>::operator<<(basic_streambuf<char_type, traits_typ
 
 template <class _CharT, class _Traits>
 basic_ostream<_CharT, _Traits>& basic_ostream<_CharT, _Traits>::operator<<(bool __n) {
-#  if _LIBCPP_HAS_EXCEPTIONS
-  try {
-#  endif // _LIBCPP_HAS_EXCEPTIONS
-    sentry __s(*this);
-    if (__s) {
-      typedef num_put<char_type, ostreambuf_iterator<char_type, traits_type> > _Fp;
-      const _Fp& __f = std::use_facet<_Fp>(this->getloc());
-      if (__f.put(*this, *this, this->fill(), __n).failed())
-        this->setstate(ios_base::badbit | ios_base::failbit);
-    }
-#  if _LIBCPP_HAS_EXCEPTIONS
-  } catch (...) {
-    this->__set_badbit_and_consider_rethrow();
-  }
-#  endif // _LIBCPP_HAS_EXCEPTIONS
-  return *this;
+  return __put_num(__n);
 }
 
 template <class _CharT, class _Traits>
 basic_ostream<_CharT, _Traits>& basic_ostream<_CharT, _Traits>::operator<<(short __n) {
-#  if _LIBCPP_HAS_EXCEPTIONS
-  try {
-#  endif // _LIBCPP_HAS_EXCEPTIONS
-    sentry __s(*this);
-    if (__s) {
-      ios_base::fmtflags __flags = ios_base::flags() & ios_base::basefield;
-      typedef num_put<char_type, ostreambuf_iterator<char_type, traits_type> > _Fp;
-      const _Fp& __f = std::use_facet<_Fp>(this->getloc());
-      if (__f.put(*this,
-                  *this,
-                  this->fill(),
-                  __flags == ios_base::oct || __flags == ios_base::hex
-                      ? static_cast<long>(static_cast<unsigned short>(__n))
-                      : static_cast<long>(__n))
-              .failed())
-        this->setstate(ios_base::badbit | ios_base::failbit);
-    }
-#  if _LIBCPP_HAS_EXCEPTIONS
-  } catch (...) {
-    this->__set_badbit_and_consider_rethrow();
-  }
-#  endif // _LIBCPP_HAS_EXCEPTIONS
-  return *this;
+  return __put_num_integer_promote(__n);
 }
 
 template <class _CharT, class _Traits>
 basic_ostream<_CharT, _Traits>& basic_ostream<_CharT, _Traits>::operator<<(unsigned short __n) {
-#  if _LIBCPP_HAS_EXCEPTIONS
-  try {
-#  endif // _LIBCPP_HAS_EXCEPTIONS
-    sentry __s(*this);
-    if (__s) {
-      typedef num_put<char_type, ostreambuf_iterator<char_type, traits_type> > _Fp;
-      const _Fp& __f = std::use_facet<_Fp>(this->getloc());
-      if (__f.put(*this, *this, this->fill(), static_cast<unsigned long>(__n)).failed())
-        this->setstate(ios_base::badbit | ios_base::failbit);
-    }
-#  if _LIBCPP_HAS_EXCEPTIONS
-  } catch (...) {
-    this->__set_badbit_and_consider_rethrow();
-  }
-#  endif // _LIBCPP_HAS_EXCEPTIONS
-  return *this;
+  return __put_num_integer_promote(__n);
 }
 
 template <class _CharT, class _Traits>
 basic_ostream<_CharT, _Traits>& basic_ostream<_CharT, _Traits>::operator<<(int __n) {
-#  if _LIBCPP_HAS_EXCEPTIONS
-  try {
-#  endif // _LIBCPP_HAS_EXCEPTIONS
-    sentry __s(*this);
-    if (__s) {
-      ios_base::fmtflags __flags = ios_base::flags() & ios_base::basefield;
-      typedef num_put<char_type, ostreambuf_iterator<char_type, traits_type> > _Fp;
-      const _Fp& __f = std::use_facet<_Fp>(this->getloc());
-      if (__f.put(*this,
-                  *this,
-                  this->fill(),
-                  __flags == ios_base::oct || __flags == ios_base::hex
-                      ? static_cast<long>(static_cast<unsigned int>(__n))
-                      : static_cast<long>(__n))
-              .failed())
-        this->setstate(ios_base::badbit | ios_base::failbit);
-    }
-#  if _LIBCPP_HAS_EXCEPTIONS
-  } catch (...) {
-    this->__set_badbit_and_consider_rethrow();
-  }
-#  endif // _LIBCPP_HAS_EXCEPTIONS
-  return *this;
+  return __put_num_integer_promote(__n);
 }
 
 template <class _CharT, class _Traits>
 basic_ostream<_CharT, _Traits>& basic_ostream<_CharT, _Traits>::operator<<(unsigned int __n) {
-#  if _LIBCPP_HAS_EXCEPTIONS
-  try {
-#  endif // _LIBCPP_HAS_EXCEPTIONS
-    sentry __s(*this);
-    if (__s) {
-      typedef num_put<char_type, ostreambuf_iterator<char_type, traits_type> > _Fp;
-      const _Fp& __f = std::use_facet<_Fp>(this->getloc());
-      if (__f.put(*this, *this, this->fill(), static_cast<unsigned long>(__n)).failed())
-        this->setstate(ios_base::badbit | ios_base::failbit);
-    }
-#  if _LIBCPP_HAS_EXCEPTIONS
-  } catch (...) {
-    this->__set_badbit_and_consider_rethrow();
-  }
-#  endif // _LIBCPP_HAS_EXCEPTIONS
-  return *this;
+  return __put_num_integer_promote(__n);
 }
 
 template <class _CharT, class _Traits>
 basic_ostream<_CharT, _Traits>& basic_ostream<_CharT, _Traits>::operator<<(long __n) {
-#  if _LIBCPP_HAS_EXCEPTIONS
-  try {
-#  endif // _LIBCPP_HAS_EXCEPTIONS
-    sentry __s(*this);
-    if (__s) {
-      typedef num_put<char_type, ostreambuf_iterator<char_type, traits_type> > _Fp;
-      const _Fp& __f = std::use_facet<_Fp>(this->getloc());
-      if (__f.put(*this, *this, this->fill(), __n).failed())
-        this->setstate(ios_base::badbit | ios_base::failbit);
-    }
-#  if _LIBCPP_HAS_EXCEPTIONS
-  } catch (...) {
-    this->__set_badbit_and_consider_rethrow();
-  }
-#  endif // _LIBCPP_HAS_EXCEPTIONS
-  return *this;
+  return __put_num(__n);
 }
 
 template <class _CharT, class _Traits>
 basic_ostream<_CharT, _Traits>& basic_ostream<_CharT, _Traits>::operator<<(unsigned long __n) {
-#  if _LIBCPP_HAS_EXCEPTIONS
-  try {
-#  endif // _LIBCPP_HAS_EXCEPTIONS
-    sentry __s(*this);
-    if (__s) {
-      typedef num_put<char_type, ostreambuf_iterator<char_type, traits_type> > _Fp;
-      const _Fp& __f = std::use_facet<_Fp>(this->getloc());
-      if (__f.put(*this, *this, this->fill(), __n).failed())
-        this->setstate(ios_base::badbit | ios_base::failbit);
-    }
-#  if _LIBCPP_HAS_EXCEPTIONS
-  } catch (...) {
-    this->__set_badbit_and_consider_rethrow();
-  }
-#  endif // _LIBCPP_HAS_EXCEPTIONS
-  return *this;
+  return __put_num(__n);
 }
 
 template <class _CharT, class _Traits>
 basic_ostream<_CharT, _Traits>& basic_ostream<_CharT, _Traits>::operator<<(long long __n) {
-#  if _LIBCPP_HAS_EXCEPTIONS
-  try {
-#  endif // _LIBCPP_HAS_EXCEPTIONS
-    sentry __s(*this);
-    if (__s) {
-      typedef num_put<char_type, ostreambuf_iterator<char_type, traits_type> > _Fp;
-      const _Fp& __f = std::use_facet<_Fp>(this->getloc());
-      if (__f.put(*this, *this, this->fill(), __n).failed())
-        this->setstate(ios_base::badbit | ios_base::failbit);
-    }
-#  if _LIBCPP_HAS_EXCEPTIONS
-  } catch (...) {
-    this->__set_badbit_and_consider_rethrow();
-  }
-#  endif // _LIBCPP_HAS_EXCEPTIONS
-  return *this;
+  return __put_num(__n);
 }
 
 template <class _CharT, class _Traits>
 basic_ostream<_CharT, _Traits>& basic_ostream<_CharT, _Traits>::operator<<(unsigned long long __n) {
-#  if _LIBCPP_HAS_EXCEPTIONS
-  try {
-#  endif // _LIBCPP_HAS_EXCEPTIONS
-    sentry __s(*this);
-    if (__s) {
-      typedef num_put<char_type, ostreambuf_iterator<char_type, traits_type> > _Fp;
-      const _Fp& __f = std::use_facet<_Fp>(this->getloc());
-      if (__f.put(*this, *this, this->fill(), __n).failed())
-        this->setstate(ios_base::badbit | ios_base::failbit);
-    }
-#  if _LIBCPP_HAS_EXCEPTIONS
-  } catch (...) {
-    this->__set_badbit_and_consider_rethrow();
-  }
-#  endif // _LIBCPP_HAS_EXCEPTIONS
-  return *this;
+  return __put_num(__n);
 }
 
 template <class _CharT, class _Traits>
 basic_ostream<_CharT, _Traits>& basic_ostream<_CharT, _Traits>::operator<<(float __n) {
-#  if _LIBCPP_HAS_EXCEPTIONS
-  try {
-#  endif // _LIBCPP_HAS_EXCEPTIONS
-    sentry __s(*this);
-    if (__s) {
-      typedef num_put<char_type, ostreambuf_iterator<char_type, traits_type> > _Fp;
-      const _Fp& __f = std::use_facet<_Fp>(this->getloc());
-      if (__f.put(*this, *this, this->fill(), static_cast<double>(__n)).failed())
-        this->setstate(ios_base::badbit | ios_base::failbit);
-    }
-#  if _LIBCPP_HAS_EXCEPTIONS
-  } catch (...) {
-    this->__set_badbit_and_consider_rethrow();
-  }
-#  endif // _LIBCPP_HAS_EXCEPTIONS
-  return *this;
+  return *this << static_cast<double>(__n);
 }
 
 template <class _CharT, class _Traits>
 basic_ostream<_CharT, _Traits>& basic_ostream<_CharT, _Traits>::operator<<(double __n) {
-#  if _LIBCPP_HAS_EXCEPTIONS
-  try {
-#  endif // _LIBCPP_HAS_EXCEPTIONS
-    sentry __s(*this);
-    if (__s) {
-      typedef num_put<char_type, ostreambuf_iterator<char_type, traits_type> > _Fp;
-      const _Fp& __f = std::use_facet<_Fp>(this->getloc());
-      if (__f.put(*this, *this, this->fill(), __n).failed())
-        this->setstate(ios_base::badbit | ios_base::failbit);
-    }
-#  if _LIBCPP_HAS_EXCEPTIONS
-  } catch (...) {
-    this->__set_badbit_and_consider_rethrow();
-  }
-#  endif // _LIBCPP_HAS_EXCEPTIONS
-  return *this;
+  return __put_num(__n);
 }
 
 template <class _CharT, class _Traits>
 basic_ostream<_CharT, _Traits>& basic_ostream<_CharT, _Traits>::operator<<(long double __n) {
-#  if _LIBCPP_HAS_EXCEPTIONS
-  try {
-#  endif // _LIBCPP_HAS_EXCEPTIONS
-    sentry __s(*this);
-    if (__s) {
-      typedef num_put<char_type, ostreambuf_iterator<char_type, traits_type> > _Fp;
-      const _Fp& __f = std::use_facet<_Fp>(this->getloc());
-      if (__f.put(*this, *this, this->fill(), __n).failed())
-        this->setstate(ios_base::badbit | ios_base::failbit);
-    }
-#  if _LIBCPP_HAS_EXCEPTIONS
-  } catch (...) {
-    this->__set_badbit_and_consider_rethrow();
-  }
-#  endif // _LIBCPP_HAS_EXCEPTIONS
-  return *this;
+  return __put_num(__n);
 }
 
 template <class _CharT, class _Traits>
 basic_ostream<_CharT, _Traits>& basic_ostream<_CharT, _Traits>::operator<<(const void* __n) {
-#  if _LIBCPP_HAS_EXCEPTIONS
-  try {
-#  endif // _LIBCPP_HAS_EXCEPTIONS
-    sentry __s(*this);
-    if (__s) {
-      typedef num_put<char_type, ostreambuf_iterator<char_type, traits_type> > _Fp;
-      const _Fp& __f = std::use_facet<_Fp>(this->getloc());
-      if (__f.put(*this, *this, this->fill(), __n).failed())
-        this->setstate(ios_base::badbit | ios_base::failbit);
-    }
-#  if _LIBCPP_HAS_EXCEPTIONS
-  } catch (...) {
-    this->__set_badbit_and_consider_rethrow();
-  }
-#  endif // _LIBCPP_HAS_EXCEPTIONS
-  return *this;
+  return __put_num(__n);
 }
 
 template <class _CharT, class _Traits>
diff --git a/libcxx/include/__pstl/backend_fwd.h b/libcxx/include/__pstl/backend_fwd.h
index 2132e8d..a7d53b6 100644
--- a/libcxx/include/__pstl/backend_fwd.h
+++ b/libcxx/include/__pstl/backend_fwd.h
@@ -53,11 +53,13 @@ struct __serial_backend_tag;
 struct __std_thread_backend_tag;
 
 #  if defined(_LIBCPP_PSTL_BACKEND_SERIAL)
-using __current_configuration = __backend_configuration<__serial_backend_tag, __default_backend_tag>;
+using __current_configuration _LIBCPP_NODEBUG = __backend_configuration<__serial_backend_tag, __default_backend_tag>;
 #  elif defined(_LIBCPP_PSTL_BACKEND_STD_THREAD)
-using __current_configuration = __backend_configuration<__std_thread_backend_tag, __default_backend_tag>;
+using __current_configuration _LIBCPP_NODEBUG =
+    __backend_configuration<__std_thread_backend_tag, __default_backend_tag>;
 #  elif defined(_LIBCPP_PSTL_BACKEND_LIBDISPATCH)
-using __current_configuration = __backend_configuration<__libdispatch_backend_tag, __default_backend_tag>;
+using __current_configuration _LIBCPP_NODEBUG =
+    __backend_configuration<__libdispatch_backend_tag, __default_backend_tag>;
 #  else
 
 // ...New vendors can add parallel backends here...
diff --git a/libcxx/include/__pstl/dispatch.h b/libcxx/include/__pstl/dispatch.h
index ea40fa7..8288423 100644
--- a/libcxx/include/__pstl/dispatch.h
+++ b/libcxx/include/__pstl/dispatch.h
@@ -58,7 +58,8 @@ struct __find_first_implemented<_Algorithm, __backend_configuration<_B1, _Bn...>
           __find_first_implemented<_Algorithm, __backend_configuration<_Bn...>, _ExecutionPolicy> > {};
 
 template <template <class, class> class _Algorithm, class _BackendConfiguration, class _ExecutionPolicy>
-using __dispatch = typename __find_first_implemented<_Algorithm, _BackendConfiguration, _ExecutionPolicy>::type;
+using __dispatch _LIBCPP_NODEBUG =
+    typename __find_first_implemented<_Algorithm, _BackendConfiguration, _ExecutionPolicy>::type;
 
 } // namespace __pstl
 _LIBCPP_END_NAMESPACE_STD
diff --git a/libcxx/include/__ranges/chunk_by_view.h b/libcxx/include/__ranges/chunk_by_view.h
index e4df589..71fee3a 100644
--- a/libcxx/include/__ranges/chunk_by_view.h
+++ b/libcxx/include/__ranges/chunk_by_view.h
@@ -59,7 +59,7 @@ class _LIBCPP_ABI_LLVM18_NO_UNIQUE_ADDRESS chunk_by_view : public view_interface
   _LIBCPP_NO_UNIQUE_ADDRESS __movable_box<_Pred> __pred_;
 
   // We cache the result of begin() to allow providing an amortized O(1).
-  using _Cache = __non_propagating_cache<iterator_t<_View>>;
+  using _Cache _LIBCPP_NODEBUG = __non_propagating_cache<iterator_t<_View>>;
   _Cache __cached_begin_;
 
   class __iterator;
diff --git a/libcxx/include/__ranges/drop_view.h b/libcxx/include/__ranges/drop_view.h
index 87f66f1..3f963d0 100644
--- a/libcxx/include/__ranges/drop_view.h
+++ b/libcxx/include/__ranges/drop_view.h
@@ -64,7 +64,7 @@ class drop_view : public view_interface<drop_view<_View>> {
   // Note: drop_view<input-range>::begin() is still trivially amortized O(1) because
   // one can't call begin() on it more than once.
   static constexpr bool _UseCache = forward_range<_View> && !(random_access_range<_View> && sized_range<_View>);
-  using _Cache                    = _If<_UseCache, __non_propagating_cache<iterator_t<_View>>, __empty_cache>;
+  using _Cache _LIBCPP_NODEBUG    = _If<_UseCache, __non_propagating_cache<iterator_t<_View>>, __empty_cache>;
   _LIBCPP_NO_UNIQUE_ADDRESS _Cache __cached_begin_ = _Cache();
   range_difference_t<_View> __count_               = 0;
   _View __base_                                    = _View();
@@ -204,7 +204,7 @@ struct __passthrough_type<subrange<_Iter, _Sent, _Kind>> {
 };
 
 template <class _Tp>
-using __passthrough_type_t = typename __passthrough_type<_Tp>::type;
+using __passthrough_type_t _LIBCPP_NODEBUG = typename __passthrough_type<_Tp>::type;
 
 struct __fn {
   // [range.drop.overview]: the `empty_view` case.
diff --git a/libcxx/include/__ranges/drop_while_view.h b/libcxx/include/__ranges/drop_while_view.h
index 6413ff5..bc7f019 100644
--- a/libcxx/include/__ranges/drop_while_view.h
+++ b/libcxx/include/__ranges/drop_while_view.h
@@ -90,7 +90,7 @@ private:
   _LIBCPP_NO_UNIQUE_ADDRESS __movable_box<_Pred> __pred_;
 
   static constexpr bool _UseCache = forward_range<_View>;
-  using _Cache                    = _If<_UseCache, __non_propagating_cache<iterator_t<_View>>, __empty_cache>;
+  using _Cache _LIBCPP_NODEBUG    = _If<_UseCache, __non_propagating_cache<iterator_t<_View>>, __empty_cache>;
   _LIBCPP_NO_UNIQUE_ADDRESS _Cache __cached_begin_ = _Cache();
 };
 
diff --git a/libcxx/include/__ranges/elements_view.h b/libcxx/include/__ranges/elements_view.h
index c99282f..5121298 100644
--- a/libcxx/include/__ranges/elements_view.h
+++ b/libcxx/include/__ranges/elements_view.h
@@ -171,7 +171,7 @@ class elements_view<_View, _Np>::__iterator
   template <bool>
   friend class __sentinel;
 
-  using _Base = __maybe_const<_Const, _View>;
+  using _Base _LIBCPP_NODEBUG = __maybe_const<_Const, _View>;
 
   iterator_t<_Base> __current_ = iterator_t<_Base>();
 
@@ -335,7 +335,7 @@ template <input_range _View, size_t _Np>
 template <bool _Const>
 class elements_view<_View, _Np>::__sentinel {
 private:
-  using _Base                                        = __maybe_const<_Const, _View>;
+  using _Base _LIBCPP_NODEBUG                        = __maybe_const<_Const, _View>;
   _LIBCPP_NO_UNIQUE_ADDRESS sentinel_t<_Base> __end_ = sentinel_t<_Base>();
 
   template <bool>
diff --git a/libcxx/include/__ranges/filter_view.h b/libcxx/include/__ranges/filter_view.h
index 22f67b2d..07980e7 100644
--- a/libcxx/include/__ranges/filter_view.h
+++ b/libcxx/include/__ranges/filter_view.h
@@ -61,7 +61,7 @@ class _LIBCPP_ABI_LLVM18_NO_UNIQUE_ADDRESS filter_view : public view_interface<f
   // We cache the result of begin() to allow providing an amortized O(1) begin() whenever
   // the underlying range is at least a forward_range.
   static constexpr bool _UseCache = forward_range<_View>;
-  using _Cache                    = _If<_UseCache, __non_propagating_cache<iterator_t<_View>>, __empty_cache>;
+  using _Cache _LIBCPP_NODEBUG    = _If<_UseCache, __non_propagating_cache<iterator_t<_View>>, __empty_cache>;
   _LIBCPP_NO_UNIQUE_ADDRESS _Cache __cached_begin_ = _Cache();
 
   class __iterator;
@@ -115,7 +115,7 @@ struct __filter_iterator_category {};
 
 template <forward_range _View>
 struct __filter_iterator_category<_View> {
-  using _Cat = typename iterator_traits<iterator_t<_View>>::iterator_category;
+  using _Cat _LIBCPP_NODEBUG = typename iterator_traits<iterator_t<_View>>::iterator_category;
   using iterator_category =
       _If<derived_from<_Cat, bidirectional_iterator_tag>,
           bidirectional_iterator_tag,
diff --git a/libcxx/include/__ranges/iota_view.h b/libcxx/include/__ranges/iota_view.h
index b2fa958..4b84585 100644
--- a/libcxx/include/__ranges/iota_view.h
+++ b/libcxx/include/__ranges/iota_view.h
@@ -68,7 +68,7 @@ struct __get_wider_signed {
 };
 
 template <class _Start>
-using _IotaDiffT =
+using _IotaDiffT _LIBCPP_NODEBUG =
     typename _If< (!integral<_Start> || sizeof(iter_difference_t<_Start>) > sizeof(_Start)),
                   type_identity<iter_difference_t<_Start>>,
                   __get_wider_signed<_Start> >::type;
diff --git a/libcxx/include/__ranges/join_view.h b/libcxx/include/__ranges/join_view.h
index 6aadd38..327b349 100644
--- a/libcxx/include/__ranges/join_view.h
+++ b/libcxx/include/__ranges/join_view.h
@@ -55,8 +55,8 @@ struct __join_view_iterator_category {};
 template <class _View>
   requires is_reference_v<range_reference_t<_View>> && forward_range<_View> && forward_range<range_reference_t<_View>>
 struct __join_view_iterator_category<_View> {
-  using _OuterC = typename iterator_traits<iterator_t<_View>>::iterator_category;
-  using _InnerC = typename iterator_traits<iterator_t<range_reference_t<_View>>>::iterator_category;
+  using _OuterC _LIBCPP_NODEBUG = typename iterator_traits<iterator_t<_View>>::iterator_category;
+  using _InnerC _LIBCPP_NODEBUG = typename iterator_traits<iterator_t<range_reference_t<_View>>>::iterator_category;
 
   using iterator_category =
       _If< derived_from<_OuterC, bidirectional_iterator_tag> && derived_from<_InnerC, bidirectional_iterator_tag> &&
@@ -71,7 +71,7 @@ template <input_range _View>
   requires view<_View> && input_range<range_reference_t<_View>>
 class join_view : public view_interface<join_view<_View>> {
 private:
-  using _InnerRange = range_reference_t<_View>;
+  using _InnerRange _LIBCPP_NODEBUG = range_reference_t<_View>;
 
   template <bool>
   struct __iterator;
@@ -85,11 +85,12 @@ private:
   _LIBCPP_NO_UNIQUE_ADDRESS _View __base_ = _View();
 
   static constexpr bool _UseOuterCache = !forward_range<_View>;
-  using _OuterCache                    = _If<_UseOuterCache, __non_propagating_cache<iterator_t<_View>>, __empty_cache>;
+  using _OuterCache _LIBCPP_NODEBUG    = _If<_UseOuterCache, __non_propagating_cache<iterator_t<_View>>, __empty_cache>;
   _LIBCPP_NO_UNIQUE_ADDRESS _OuterCache __outer_;
 
   static constexpr bool _UseInnerCache = !is_reference_v<_InnerRange>;
-  using _InnerCache = _If<_UseInnerCache, __non_propagating_cache<remove_cvref_t<_InnerRange>>, __empty_cache>;
+  using _InnerCache _LIBCPP_NODEBUG =
+      _If<_UseInnerCache, __non_propagating_cache<remove_cvref_t<_InnerRange>>, __empty_cache>;
   _LIBCPP_NO_UNIQUE_ADDRESS _InnerCache __inner_;
 
 public:
@@ -155,9 +156,9 @@ private:
   template <bool>
   friend struct __sentinel;
 
-  using _Parent            = __maybe_const<_Const, join_view>;
-  using _Base              = __maybe_const<_Const, _View>;
-  sentinel_t<_Base> __end_ = sentinel_t<_Base>();
+  using _Parent _LIBCPP_NODEBUG = __maybe_const<_Const, join_view>;
+  using _Base _LIBCPP_NODEBUG   = __maybe_const<_Const, _View>;
+  sentinel_t<_Base> __end_      = sentinel_t<_Base>();
 
 public:
   _LIBCPP_HIDE_FROM_ABI __sentinel() = default;
@@ -190,18 +191,18 @@ struct join_view<_View>::__iterator final : public __join_view_iterator_category
   static constexpr bool __is_join_view_iterator = true;
 
 private:
-  using _Parent     = __maybe_const<_Const, join_view<_View>>;
-  using _Base       = __maybe_const<_Const, _View>;
-  using _Outer      = iterator_t<_Base>;
-  using _Inner      = iterator_t<range_reference_t<_Base>>;
-  using _InnerRange = range_reference_t<_View>;
+  using _Parent _LIBCPP_NODEBUG     = __maybe_const<_Const, join_view<_View>>;
+  using _Base _LIBCPP_NODEBUG       = __maybe_const<_Const, _View>;
+  using _Outer _LIBCPP_NODEBUG      = iterator_t<_Base>;
+  using _Inner _LIBCPP_NODEBUG      = iterator_t<range_reference_t<_Base>>;
+  using _InnerRange _LIBCPP_NODEBUG = range_reference_t<_View>;
 
   static_assert(!_Const || forward_range<_Base>, "Const can only be true when Base models forward_range.");
 
   static constexpr bool __ref_is_glvalue = is_reference_v<range_reference_t<_Base>>;
 
   static constexpr bool _OuterPresent           = forward_range<_Base>;
-  using _OuterType                              = _If<_OuterPresent, _Outer, std::__empty>;
+  using _OuterType _LIBCPP_NODEBUG              = _If<_OuterPresent, _Outer, std::__empty>;
   _LIBCPP_NO_UNIQUE_ADDRESS _OuterType __outer_ = _OuterType();
 
   optional<_Inner> __inner_;
@@ -379,7 +380,7 @@ template <class _JoinViewIterator>
 struct __segmented_iterator_traits<_JoinViewIterator> {
   using __segment_iterator _LIBCPP_NODEBUG =
       __iterator_with_data<typename _JoinViewIterator::_Outer, typename _JoinViewIterator::_Parent*>;
-  using __local_iterator = typename _JoinViewIterator::_Inner;
+  using __local_iterator _LIBCPP_NODEBUG = typename _JoinViewIterator::_Inner;
 
   // TODO: Would it make sense to enable the optimization for other iterator types?
 
diff --git a/libcxx/include/__ranges/lazy_split_view.h b/libcxx/include/__ranges/lazy_split_view.h
index 0dcbc13..cca9191 100644
--- a/libcxx/include/__ranges/lazy_split_view.h
+++ b/libcxx/include/__ranges/lazy_split_view.h
@@ -72,7 +72,8 @@ class lazy_split_view : public view_interface<lazy_split_view<_View, _Pattern>>
   _LIBCPP_NO_UNIQUE_ADDRESS _View __base_       = _View();
   _LIBCPP_NO_UNIQUE_ADDRESS _Pattern __pattern_ = _Pattern();
 
-  using _MaybeCurrent = _If<!forward_range<_View>, __non_propagating_cache<iterator_t<_View>>, __empty_cache>;
+  using _MaybeCurrent _LIBCPP_NODEBUG =
+      _If<!forward_range<_View>, __non_propagating_cache<iterator_t<_View>>, __empty_cache>;
   _LIBCPP_NO_UNIQUE_ADDRESS _MaybeCurrent __current_ = _MaybeCurrent();
 
   template <bool>
@@ -146,11 +147,11 @@ private:
     friend struct __inner_iterator;
     friend __outer_iterator<true>;
 
-    using _Parent = __maybe_const<_Const, lazy_split_view>;
-    using _Base   = __maybe_const<_Const, _View>;
+    using _Parent _LIBCPP_NODEBUG = __maybe_const<_Const, lazy_split_view>;
+    using _Base _LIBCPP_NODEBUG   = __maybe_const<_Const, _View>;
 
     _Parent* __parent_                                 = nullptr;
-    using _MaybeCurrent                                = _If<forward_range<_View>, iterator_t<_Base>, __empty_cache>;
+    using _MaybeCurrent _LIBCPP_NODEBUG                = _If<forward_range<_View>, iterator_t<_Base>, __empty_cache>;
     _LIBCPP_NO_UNIQUE_ADDRESS _MaybeCurrent __current_ = _MaybeCurrent();
     bool __trailing_empty_                             = false;
 
@@ -283,7 +284,7 @@ private:
   template <bool _Const>
   struct __inner_iterator : __inner_iterator_category<__maybe_const<_Const, _View>> {
   private:
-    using _Base = __maybe_const<_Const, _View>;
+    using _Base _LIBCPP_NODEBUG = __maybe_const<_Const, _View>;
     // Workaround for a GCC issue.
     static constexpr bool _OuterConst = _Const;
     __outer_iterator<_Const> __i_     = __outer_iterator<_OuterConst>();
diff --git a/libcxx/include/__ranges/repeat_view.h b/libcxx/include/__ranges/repeat_view.h
index 93ceaf1..61a8b63 100644
--- a/libcxx/include/__ranges/repeat_view.h
+++ b/libcxx/include/__ranges/repeat_view.h
@@ -61,7 +61,7 @@ struct __repeat_view_iterator_difference<_Tp> {
 };
 
 template <class _Tp>
-using __repeat_view_iterator_difference_t = typename __repeat_view_iterator_difference<_Tp>::type;
+using __repeat_view_iterator_difference_t _LIBCPP_NODEBUG = typename __repeat_view_iterator_difference<_Tp>::type;
 
 namespace views::__drop {
 struct __fn;
@@ -139,7 +139,7 @@ template <move_constructible _Tp, semiregular _Bound>
 class repeat_view<_Tp, _Bound>::__iterator {
   friend class repeat_view;
 
-  using _IndexT = conditional_t<same_as<_Bound, unreachable_sentinel_t>, ptrdiff_t, _Bound>;
+  using _IndexT _LIBCPP_NODEBUG = conditional_t<same_as<_Bound, unreachable_sentinel_t>, ptrdiff_t, _Bound>;
 
   _LIBCPP_HIDE_FROM_ABI constexpr explicit __iterator(const _Tp* __value, _IndexT __bound_sentinel = _IndexT())
       : __value_(__value), __current_(__bound_sentinel) {}
diff --git a/libcxx/include/__ranges/reverse_view.h b/libcxx/include/__ranges/reverse_view.h
index 796f5be..80d54b9 100644
--- a/libcxx/include/__ranges/reverse_view.h
+++ b/libcxx/include/__ranges/reverse_view.h
@@ -47,7 +47,8 @@ class reverse_view : public view_interface<reverse_view<_View>> {
   // We cache begin() whenever ranges::next is not guaranteed O(1) to provide an
   // amortized O(1) begin() method.
   static constexpr bool _UseCache = !random_access_range<_View> && !common_range<_View>;
-  using _Cache = _If<_UseCache, __non_propagating_cache<reverse_iterator<iterator_t<_View>>>, __empty_cache>;
+  using _Cache _LIBCPP_NODEBUG =
+      _If<_UseCache, __non_propagating_cache<reverse_iterator<iterator_t<_View>>>, __empty_cache>;
   _LIBCPP_NO_UNIQUE_ADDRESS _Cache __cached_begin_ = _Cache();
   _LIBCPP_NO_UNIQUE_ADDRESS _View __base_          = _View();
 
diff --git a/libcxx/include/__ranges/split_view.h b/libcxx/include/__ranges/split_view.h
index 7527281..2ec908b 100644
--- a/libcxx/include/__ranges/split_view.h
+++ b/libcxx/include/__ranges/split_view.h
@@ -52,7 +52,7 @@ class split_view : public view_interface<split_view<_View, _Pattern>> {
 private:
   _LIBCPP_NO_UNIQUE_ADDRESS _View __base_       = _View();
   _LIBCPP_NO_UNIQUE_ADDRESS _Pattern __pattern_ = _Pattern();
-  using _Cache                                  = __non_propagating_cache<subrange<iterator_t<_View>>>;
+  using _Cache _LIBCPP_NODEBUG                  = __non_propagating_cache<subrange<iterator_t<_View>>>;
   _Cache __cached_begin_                        = _Cache();
 
   template <class, class>
diff --git a/libcxx/include/__ranges/subrange.h b/libcxx/include/__ranges/subrange.h
index a40eab3..2d006d3 100644
--- a/libcxx/include/__ranges/subrange.h
+++ b/libcxx/include/__ranges/subrange.h
@@ -82,7 +82,7 @@ private:
   struct _Empty {
     _LIBCPP_HIDE_FROM_ABI constexpr _Empty(auto) noexcept {}
   };
-  using _Size = conditional_t<_StoreSize, make_unsigned_t<iter_difference_t<_Iter>>, _Empty>;
+  using _Size _LIBCPP_NODEBUG = conditional_t<_StoreSize, make_unsigned_t<iter_difference_t<_Iter>>, _Empty>;
   _LIBCPP_NO_UNIQUE_ADDRESS _Iter __begin_ = _Iter();
   _LIBCPP_NO_UNIQUE_ADDRESS _Sent __end_   = _Sent();
   _LIBCPP_NO_UNIQUE_ADDRESS _Size __size_  = 0;
diff --git a/libcxx/include/__ranges/take_view.h b/libcxx/include/__ranges/take_view.h
index 39f99ce..5892c1e 100644
--- a/libcxx/include/__ranges/take_view.h
+++ b/libcxx/include/__ranges/take_view.h
@@ -161,9 +161,9 @@ public:
 template <view _View>
 template <bool _Const>
 class take_view<_View>::__sentinel {
-  using _Base = __maybe_const<_Const, _View>;
+  using _Base _LIBCPP_NODEBUG = __maybe_const<_Const, _View>;
   template <bool _OtherConst>
-  using _Iter                                        = counted_iterator<iterator_t<__maybe_const<_OtherConst, _View>>>;
+  using _Iter _LIBCPP_NODEBUG                        = counted_iterator<iterator_t<__maybe_const<_OtherConst, _View>>>;
   _LIBCPP_NO_UNIQUE_ADDRESS sentinel_t<_Base> __end_ = sentinel_t<_Base>();
 
   template <bool>
@@ -244,7 +244,7 @@ struct __passthrough_type<subrange<_Iter, _Sent, _Kind>> {
 };
 
 template <class _Tp>
-using __passthrough_type_t = typename __passthrough_type<_Tp>::type;
+using __passthrough_type_t _LIBCPP_NODEBUG = typename __passthrough_type<_Tp>::type;
 
 struct __fn {
   // [range.take.overview]: the `empty_view` case.
diff --git a/libcxx/include/__ranges/take_while_view.h b/libcxx/include/__ranges/take_while_view.h
index b7cb0ae..4977f13 100644
--- a/libcxx/include/__ranges/take_while_view.h
+++ b/libcxx/include/__ranges/take_while_view.h
@@ -103,7 +103,7 @@ template <view _View, class _Pred>
   requires input_range<_View> && is_object_v<_Pred> && indirect_unary_predicate<const _Pred, iterator_t<_View>>
 template <bool _Const>
 class take_while_view<_View, _Pred>::__sentinel {
-  using _Base = __maybe_const<_Const, _View>;
+  using _Base _LIBCPP_NODEBUG = __maybe_const<_Const, _View>;
 
   sentinel_t<_Base> __end_ = sentinel_t<_Base>();
   const _Pred* __pred_     = nullptr;
diff --git a/libcxx/include/__ranges/transform_view.h b/libcxx/include/__ranges/transform_view.h
index 0ce1bfe8..4ae21e9 100644
--- a/libcxx/include/__ranges/transform_view.h
+++ b/libcxx/include/__ranges/transform_view.h
@@ -159,7 +159,7 @@ struct __transform_view_iterator_category_base {};
 
 template <forward_range _View, class _Fn>
 struct __transform_view_iterator_category_base<_View, _Fn> {
-  using _Cat = typename iterator_traits<iterator_t<_View>>::iterator_category;
+  using _Cat _LIBCPP_NODEBUG = typename iterator_traits<iterator_t<_View>>::iterator_category;
 
   using iterator_category =
       conditional_t< is_reference_v<invoke_result_t<_Fn&, range_reference_t<_View>>>,
@@ -177,8 +177,8 @@ template <bool _Const>
 class transform_view<_View, _Fn>::__iterator
     : public __transform_view_iterator_category_base<_View, __maybe_const<_Const, _Fn>> {
 
-  using _Parent = __maybe_const<_Const, transform_view>;
-  using _Base   = __maybe_const<_Const, _View>;
+  using _Parent _LIBCPP_NODEBUG = __maybe_const<_Const, transform_view>;
+  using _Base _LIBCPP_NODEBUG   = __maybe_const<_Const, _View>;
 
   _Parent* __parent_ = nullptr;
 
@@ -338,8 +338,8 @@ template <input_range _View, copy_constructible _Fn>
   requires __transform_view_constraints<_View, _Fn>
 template <bool _Const>
 class transform_view<_View, _Fn>::__sentinel {
-  using _Parent = __maybe_const<_Const, transform_view>;
-  using _Base   = __maybe_const<_Const, _View>;
+  using _Parent _LIBCPP_NODEBUG = __maybe_const<_Const, transform_view>;
+  using _Base _LIBCPP_NODEBUG   = __maybe_const<_Const, _View>;
 
   sentinel_t<_Base> __end_ = sentinel_t<_Base>();
 
diff --git a/libcxx/include/__split_buffer b/libcxx/include/__split_buffer
index 9d7fd3e..a8f679c 100644
--- a/libcxx/include/__split_buffer
+++ b/libcxx/include/__split_buffer
@@ -51,24 +51,24 @@ _LIBCPP_BEGIN_NAMESPACE_STD
 template <class _Tp, class _Allocator = allocator<_Tp> >
 struct __split_buffer {
 public:
-  using value_type      = _Tp;
-  using allocator_type  = _Allocator;
-  using __alloc_rr      = __libcpp_remove_reference_t<allocator_type>;
-  using __alloc_traits  = allocator_traits<__alloc_rr>;
-  using reference       = value_type&;
-  using const_reference = const value_type&;
-  using size_type       = typename __alloc_traits::size_type;
-  using difference_type = typename __alloc_traits::difference_type;
-  using pointer         = typename __alloc_traits::pointer;
-  using const_pointer   = typename __alloc_traits::const_pointer;
-  using iterator        = pointer;
-  using const_iterator  = const_pointer;
+  using value_type                     = _Tp;
+  using allocator_type                 = _Allocator;
+  using __alloc_rr _LIBCPP_NODEBUG     = __libcpp_remove_reference_t<allocator_type>;
+  using __alloc_traits _LIBCPP_NODEBUG = allocator_traits<__alloc_rr>;
+  using reference                      = value_type&;
+  using const_reference                = const value_type&;
+  using size_type                      = typename __alloc_traits::size_type;
+  using difference_type                = typename __alloc_traits::difference_type;
+  using pointer                        = typename __alloc_traits::pointer;
+  using const_pointer                  = typename __alloc_traits::const_pointer;
+  using iterator                       = pointer;
+  using const_iterator                 = const_pointer;
 
   // A __split_buffer contains the following members which may be trivially relocatable:
   // - pointer: may be trivially relocatable, so it's checked
   // - allocator_type: may be trivially relocatable, so it's checked
   // __split_buffer doesn't have any self-references, so it's trivially relocatable if its members are.
-  using __trivially_relocatable = __conditional_t<
+  using __trivially_relocatable _LIBCPP_NODEBUG = __conditional_t<
       __libcpp_is_trivially_relocatable<pointer>::value && __libcpp_is_trivially_relocatable<allocator_type>::value,
       __split_buffer,
       void>;
diff --git a/libcxx/include/__stop_token/stop_state.h b/libcxx/include/__stop_token/stop_state.h
index 84dc208..cc1f1d8 100644
--- a/libcxx/include/__stop_token/stop_state.h
+++ b/libcxx/include/__stop_token/stop_state.h
@@ -27,7 +27,7 @@ _LIBCPP_BEGIN_NAMESPACE_STD
 #if _LIBCPP_STD_VER >= 20 && _LIBCPP_HAS_THREADS
 
 struct __stop_callback_base : __intrusive_node_base<__stop_callback_base> {
-  using __callback_fn_t = void(__stop_callback_base*) noexcept;
+  using __callback_fn_t _LIBCPP_NODEBUG = void(__stop_callback_base*) noexcept;
   _LIBCPP_HIDE_FROM_ABI explicit __stop_callback_base(__callback_fn_t* __callback_fn) : __callback_fn_(__callback_fn) {}
 
   _LIBCPP_HIDE_FROM_ABI void __invoke() noexcept { __callback_fn_(this); }
@@ -58,9 +58,9 @@ class __stop_state {
   // It is used by __intrusive_shared_ptr, but it is stored here for better layout
   atomic<uint32_t> __ref_count_ = 0;
 
-  using __state_t            = uint32_t;
-  using __callback_list_lock = __atomic_unique_lock<__state_t, __callback_list_locked_bit>;
-  using __callback_list      = __intrusive_list_view<__stop_callback_base>;
+  using __state_t _LIBCPP_NODEBUG            = uint32_t;
+  using __callback_list_lock _LIBCPP_NODEBUG = __atomic_unique_lock<__state_t, __callback_list_locked_bit>;
+  using __callback_list _LIBCPP_NODEBUG      = __intrusive_list_view<__stop_callback_base>;
 
   __callback_list __callback_list_;
   __thread_id __requesting_thread_;
diff --git a/libcxx/include/__system_error/system_error.h b/libcxx/include/__system_error/system_error.h
index 918effb..36ccf94 100644
--- a/libcxx/include/__system_error/system_error.h
+++ b/libcxx/include/__system_error/system_error.h
@@ -39,6 +39,10 @@ public:
   _LIBCPP_HIDE_FROM_ABI const error_code& code() const _NOEXCEPT { return __ec_; }
 };
 
+// __ev is expected to be an error in the generic_category domain (e.g. from
+// errno, or std::errc::*), not system_category (e.g. from windows syscalls).
+[[__noreturn__]] _LIBCPP_EXPORTED_FROM_ABI void __throw_system_error(int __ev, const char* __what_arg);
+
 [[__noreturn__]] _LIBCPP_HIDE_FROM_ABI inline void __throw_system_error(error_code __ec, const char* __what_arg) {
 #if _LIBCPP_HAS_EXCEPTIONS
   throw system_error(__ec, __what_arg);
diff --git a/libcxx/include/__thread/support/pthread.h b/libcxx/include/__thread/support/pthread.h
index 531f3e7..14e9207 100644
--- a/libcxx/include/__thread/support/pthread.h
+++ b/libcxx/include/__thread/support/pthread.h
@@ -39,7 +39,7 @@
 
 _LIBCPP_BEGIN_NAMESPACE_STD
 
-using __libcpp_timespec_t = ::timespec;
+using __libcpp_timespec_t _LIBCPP_NODEBUG = ::timespec;
 
 //
 // Mutex
diff --git a/libcxx/include/__tuple/make_tuple_types.h b/libcxx/include/__tuple/make_tuple_types.h
index 3d31239..ff95ca4 100644
--- a/libcxx/include/__tuple/make_tuple_types.h
+++ b/libcxx/include/__tuple/make_tuple_types.h
@@ -47,9 +47,9 @@ struct __make_tuple_types_flat<_Tuple<_Types...>, __tuple_indices<_Idx...>> {
 template <class _Vt, size_t _Np, size_t... _Idx>
 struct __make_tuple_types_flat<array<_Vt, _Np>, __tuple_indices<_Idx...>> {
   template <size_t>
-  using __value_type = _Vt;
+  using __value_type _LIBCPP_NODEBUG = _Vt;
   template <class _Tp>
-  using __apply_quals = __tuple_types<__copy_cvref_t<_Tp, __value_type<_Idx>>...>;
+  using __apply_quals _LIBCPP_NODEBUG = __tuple_types<__copy_cvref_t<_Tp, __value_type<_Idx>>...>;
 };
 
 template <class _Tp,
@@ -58,9 +58,9 @@ template <class _Tp,
           bool _SameSize = (_Ep == tuple_size<__libcpp_remove_reference_t<_Tp> >::value)>
 struct __make_tuple_types {
   static_assert(_Sp <= _Ep, "__make_tuple_types input error");
-  using _RawTp = __remove_cvref_t<_Tp>;
-  using _Maker = __make_tuple_types_flat<_RawTp, typename __make_tuple_indices<_Ep, _Sp>::type>;
-  using type   = typename _Maker::template __apply_quals<_Tp>;
+  using _RawTp _LIBCPP_NODEBUG = __remove_cvref_t<_Tp>;
+  using _Maker _LIBCPP_NODEBUG = __make_tuple_types_flat<_RawTp, typename __make_tuple_indices<_Ep, _Sp>::type>;
+  using type                   = typename _Maker::template __apply_quals<_Tp>;
 };
 
 template <class... _Types, size_t _Ep>
diff --git a/libcxx/include/__tuple/sfinae_helpers.h b/libcxx/include/__tuple/sfinae_helpers.h
index 9041d1d..4084e8b 100644
--- a/libcxx/include/__tuple/sfinae_helpers.h
+++ b/libcxx/include/__tuple/sfinae_helpers.h
@@ -41,7 +41,7 @@ struct __tuple_sfinae_base {
   static auto __do_test(...) -> false_type;
 
   template <class _FromArgs, class _ToArgs>
-  using __constructible = decltype(__do_test<is_constructible>(_ToArgs{}, _FromArgs{}));
+  using __constructible _LIBCPP_NODEBUG = decltype(__do_test<is_constructible>(_ToArgs{}, _FromArgs{}));
 };
 
 // __tuple_constructible
diff --git a/libcxx/include/__tuple/tuple_size.h b/libcxx/include/__tuple/tuple_size.h
index b970280..27d57eb 100644
--- a/libcxx/include/__tuple/tuple_size.h
+++ b/libcxx/include/__tuple/tuple_size.h
@@ -29,7 +29,7 @@ struct _LIBCPP_TEMPLATE_VIS tuple_size;
 
 #if !defined(_LIBCPP_CXX03_LANG)
 template <class _Tp, class...>
-using __enable_if_tuple_size_imp = _Tp;
+using __enable_if_tuple_size_imp _LIBCPP_NODEBUG = _Tp;
 
 template <class _Tp>
 struct _LIBCPP_TEMPLATE_VIS tuple_size<__enable_if_tuple_size_imp< const _Tp,
diff --git a/libcxx/include/__type_traits/add_lvalue_reference.h b/libcxx/include/__type_traits/add_lvalue_reference.h
index 157c8f9..b1ee6ed 100644
--- a/libcxx/include/__type_traits/add_lvalue_reference.h
+++ b/libcxx/include/__type_traits/add_lvalue_reference.h
@@ -21,7 +21,7 @@ _LIBCPP_BEGIN_NAMESPACE_STD
 #if __has_builtin(__add_lvalue_reference)
 
 template <class _Tp>
-using __add_lvalue_reference_t = __add_lvalue_reference(_Tp);
+using __add_lvalue_reference_t _LIBCPP_NODEBUG = __add_lvalue_reference(_Tp);
 
 #else
 
diff --git a/libcxx/include/__type_traits/add_pointer.h b/libcxx/include/__type_traits/add_pointer.h
index f66e1f9..b53d8ea 100644
--- a/libcxx/include/__type_traits/add_pointer.h
+++ b/libcxx/include/__type_traits/add_pointer.h
@@ -23,7 +23,7 @@ _LIBCPP_BEGIN_NAMESPACE_STD
 #if !defined(_LIBCPP_WORKAROUND_OBJCXX_COMPILER_INTRINSICS) && __has_builtin(__add_pointer)
 
 template <class _Tp>
-using __add_pointer_t = __add_pointer(_Tp);
+using __add_pointer_t _LIBCPP_NODEBUG = __add_pointer(_Tp);
 
 #else
 template <class _Tp, bool = __libcpp_is_referenceable<_Tp>::value || is_void<_Tp>::value>
diff --git a/libcxx/include/__type_traits/add_rvalue_reference.h b/libcxx/include/__type_traits/add_rvalue_reference.h
index 2050588..d844ccc 100644
--- a/libcxx/include/__type_traits/add_rvalue_reference.h
+++ b/libcxx/include/__type_traits/add_rvalue_reference.h
@@ -21,7 +21,7 @@ _LIBCPP_BEGIN_NAMESPACE_STD
 #if __has_builtin(__add_rvalue_reference)
 
 template <class _Tp>
-using __add_rvalue_reference_t = __add_rvalue_reference(_Tp);
+using __add_rvalue_reference_t _LIBCPP_NODEBUG = __add_rvalue_reference(_Tp);
 
 #else
 
diff --git a/libcxx/include/__type_traits/aligned_storage.h b/libcxx/include/__type_traits/aligned_storage.h
index 2e39afb..d987499 100644
--- a/libcxx/include/__type_traits/aligned_storage.h
+++ b/libcxx/include/__type_traits/aligned_storage.h
@@ -34,26 +34,23 @@ struct __struct_double4 {
   double __lx[4];
 };
 
-// clang-format off
-typedef __type_list<__align_type<unsigned char>,
-        __type_list<__align_type<unsigned short>,
-        __type_list<__align_type<unsigned int>,
-        __type_list<__align_type<unsigned long>,
-        __type_list<__align_type<unsigned long long>,
-        __type_list<__align_type<double>,
-        __type_list<__align_type<long double>,
-        __type_list<__align_type<__struct_double>,
-        __type_list<__align_type<__struct_double4>,
-        __type_list<__align_type<int*>,
-        __nat
-        > > > > > > > > > > __all_types;
-// clang-format on
+using __all_types _LIBCPP_NODEBUG =
+    __type_list<__align_type<unsigned char>,
+                __align_type<unsigned short>,
+                __align_type<unsigned int>,
+                __align_type<unsigned long>,
+                __align_type<unsigned long long>,
+                __align_type<double>,
+                __align_type<long double>,
+                __align_type<__struct_double>,
+                __align_type<__struct_double4>,
+                __align_type<int*> >;
 
 template <class _TL, size_t _Len>
 struct __find_max_align;
 
-template <class _Hp, size_t _Len>
-struct __find_max_align<__type_list<_Hp, __nat>, _Len> : public integral_constant<size_t, _Hp::value> {};
+template <class _Head, size_t _Len>
+struct __find_max_align<__type_list<_Head>, _Len> : public integral_constant<size_t, _Head::value> {};
 
 template <size_t _Len, size_t _A1, size_t _A2>
 struct __select_align {
@@ -65,9 +62,11 @@ public:
   static const size_t value = _Len < __max ? __min : __max;
 };
 
-template <class _Hp, class _Tp, size_t _Len>
-struct __find_max_align<__type_list<_Hp, _Tp>, _Len>
-    : public integral_constant<size_t, __select_align<_Len, _Hp::value, __find_max_align<_Tp, _Len>::value>::value> {};
+template <class _Head, class... _Tail, size_t _Len>
+struct __find_max_align<__type_list<_Head, _Tail...>, _Len>
+    : public integral_constant<
+          size_t,
+          __select_align<_Len, _Head::value, __find_max_align<__type_list<_Tail...>, _Len>::value>::value> {};
 
 template <size_t _Len, size_t _Align = __find_max_align<__all_types, _Len>::value>
 struct _LIBCPP_DEPRECATED_IN_CXX23 _LIBCPP_TEMPLATE_VIS aligned_storage {
diff --git a/libcxx/include/__type_traits/common_reference.h b/libcxx/include/__type_traits/common_reference.h
index c802902..d436949 100644
--- a/libcxx/include/__type_traits/common_reference.h
+++ b/libcxx/include/__type_traits/common_reference.h
@@ -30,7 +30,7 @@ _LIBCPP_BEGIN_NAMESPACE_STD
 #if _LIBCPP_STD_VER >= 20
 // Let COND_RES(X, Y) be:
 template <class _Xp, class _Yp>
-using __cond_res = decltype(false ? std::declval<_Xp (&)()>()() : std::declval<_Yp (&)()>()());
+using __cond_res _LIBCPP_NODEBUG = decltype(false ? std::declval<_Xp (&)()>()() : std::declval<_Yp (&)()>()());
 
 // Let `XREF(A)` denote a unary alias template `T` such that `T<U>` denotes the same type as `U`
 // with the addition of `A`'s cv and reference qualifiers, for a non-reference cv-unqualified type
@@ -39,7 +39,7 @@ using __cond_res = decltype(false ? std::declval<_Xp (&)()>()() : std::declval<_
 template <class _Tp>
 struct __xref {
   template <class _Up>
-  using __apply = __copy_cvref_t<_Tp, _Up>;
+  using __apply _LIBCPP_NODEBUG = __copy_cvref_t<_Tp, _Up>;
 };
 
 // Given types A and B, let X be remove_reference_t<A>, let Y be remove_reference_t<B>,
@@ -48,10 +48,10 @@ template <class _Ap, class _Bp, class _Xp = remove_reference_t<_Ap>, class _Yp =
 struct __common_ref;
 
 template <class _Xp, class _Yp>
-using __common_ref_t = typename __common_ref<_Xp, _Yp>::__type;
+using __common_ref_t _LIBCPP_NODEBUG = typename __common_ref<_Xp, _Yp>::__type;
 
 template <class _Xp, class _Yp>
-using __cv_cond_res = __cond_res<__copy_cv_t<_Xp, _Yp>&, __copy_cv_t<_Yp, _Xp>&>;
+using __cv_cond_res _LIBCPP_NODEBUG = __cond_res<__copy_cv_t<_Xp, _Yp>&, __copy_cv_t<_Yp, _Xp>&>;
 
 //    If A and B are both lvalue reference types, COMMON-REF(A, B) is
 //    COND-RES(COPYCV(X, Y)&, COPYCV(Y, X)&) if that type exists and is a reference type.
@@ -61,13 +61,13 @@ template <class _Ap, class _Bp, class _Xp, class _Yp>
     requires { typename __cv_cond_res<_Xp, _Yp>; } &&
     is_reference_v<__cv_cond_res<_Xp, _Yp>>
 struct __common_ref<_Ap&, _Bp&, _Xp, _Yp> {
-  using __type = __cv_cond_res<_Xp, _Yp>;
+  using __type _LIBCPP_NODEBUG = __cv_cond_res<_Xp, _Yp>;
 };
 // clang-format on
 
 //    Otherwise, let C be remove_reference_t<COMMON-REF(X&, Y&)>&&. ...
 template <class _Xp, class _Yp>
-using __common_ref_C = remove_reference_t<__common_ref_t<_Xp&, _Yp&>>&&;
+using __common_ref_C _LIBCPP_NODEBUG = remove_reference_t<__common_ref_t<_Xp&, _Yp&>>&&;
 
 //    .... If A and B are both rvalue reference types, C is well-formed, and
 //    is_convertible_v<A, C> && is_convertible_v<B, C> is true, then COMMON-REF(A, B) is C.
@@ -78,13 +78,13 @@ template <class _Ap, class _Bp, class _Xp, class _Yp>
     is_convertible_v<_Ap&&, __common_ref_C<_Xp, _Yp>> &&
     is_convertible_v<_Bp&&, __common_ref_C<_Xp, _Yp>>
 struct __common_ref<_Ap&&, _Bp&&, _Xp, _Yp> {
-  using __type = __common_ref_C<_Xp, _Yp>;
+  using __type _LIBCPP_NODEBUG = __common_ref_C<_Xp, _Yp>;
 };
 // clang-format on
 
 //    Otherwise, let D be COMMON-REF(const X&, Y&). ...
 template <class _Tp, class _Up>
-using __common_ref_D = __common_ref_t<const _Tp&, _Up&>;
+using __common_ref_D _LIBCPP_NODEBUG = __common_ref_t<const _Tp&, _Up&>;
 
 //    ... If A is an rvalue reference and B is an lvalue reference and D is well-formed and
 //    is_convertible_v<A, D> is true, then COMMON-REF(A, B) is D.
@@ -94,7 +94,7 @@ template <class _Ap, class _Bp, class _Xp, class _Yp>
     requires { typename __common_ref_D<_Xp, _Yp>; } &&
     is_convertible_v<_Ap&&, __common_ref_D<_Xp, _Yp>>
 struct __common_ref<_Ap&&, _Bp&, _Xp, _Yp> {
-  using __type = __common_ref_D<_Xp, _Yp>;
+  using __type _LIBCPP_NODEBUG = __common_ref_D<_Xp, _Yp>;
 };
 // clang-format on
 
@@ -150,7 +150,7 @@ template <class, class, template <class> class, template <class> class>
 struct basic_common_reference {};
 
 template <class _Tp, class _Up>
-using __basic_common_reference_t =
+using __basic_common_reference_t _LIBCPP_NODEBUG =
     typename basic_common_reference<remove_cvref_t<_Tp>,
                                     remove_cvref_t<_Up>,
                                     __xref<_Tp>::template __apply,
diff --git a/libcxx/include/__type_traits/common_type.h b/libcxx/include/__type_traits/common_type.h
index ee5596b..e4c6b49 100644
--- a/libcxx/include/__type_traits/common_type.h
+++ b/libcxx/include/__type_traits/common_type.h
@@ -31,7 +31,7 @@ template <class... _Args>
 struct common_type;
 
 template <class... _Args>
-using __common_type_t = typename common_type<_Args...>::type;
+using __common_type_t _LIBCPP_NODEBUG = typename common_type<_Args...>::type;
 
 template <class... _Args>
 struct common_type : __builtin_common_type<__common_type_t, __type_identity, __empty, _Args...> {};
@@ -40,7 +40,7 @@ struct common_type : __builtin_common_type<__common_type_t, __type_identity, __e
 #  if _LIBCPP_STD_VER >= 20
 // Let COND_RES(X, Y) be:
 template <class _Tp, class _Up>
-using __cond_type = decltype(false ? std::declval<_Tp>() : std::declval<_Up>());
+using __cond_type _LIBCPP_NODEBUG = decltype(false ? std::declval<_Tp>() : std::declval<_Up>());
 
 template <class _Tp, class _Up, class = void>
 struct __common_type3 {};
diff --git a/libcxx/include/__type_traits/conjunction.h b/libcxx/include/__type_traits/conjunction.h
index c299559..4001d6c 100644
--- a/libcxx/include/__type_traits/conjunction.h
+++ b/libcxx/include/__type_traits/conjunction.h
@@ -22,7 +22,7 @@
 _LIBCPP_BEGIN_NAMESPACE_STD
 
 template <class...>
-using __expand_to_true = true_type;
+using __expand_to_true _LIBCPP_NODEBUG = true_type;
 
 template <class... _Pred>
 __expand_to_true<__enable_if_t<_Pred::value>...> __and_helper(int);
diff --git a/libcxx/include/__type_traits/copy_cv.h b/libcxx/include/__type_traits/copy_cv.h
index d482cb4..8378fbd 100644
--- a/libcxx/include/__type_traits/copy_cv.h
+++ b/libcxx/include/__type_traits/copy_cv.h
@@ -22,29 +22,29 @@ _LIBCPP_BEGIN_NAMESPACE_STD
 template <class _From>
 struct __copy_cv {
   template <class _To>
-  using __apply = _To;
+  using __apply _LIBCPP_NODEBUG = _To;
 };
 
 template <class _From>
 struct __copy_cv<const _From> {
   template <class _To>
-  using __apply = const _To;
+  using __apply _LIBCPP_NODEBUG = const _To;
 };
 
 template <class _From>
 struct __copy_cv<volatile _From> {
   template <class _To>
-  using __apply = volatile _To;
+  using __apply _LIBCPP_NODEBUG = volatile _To;
 };
 
 template <class _From>
 struct __copy_cv<const volatile _From> {
   template <class _To>
-  using __apply = const volatile _To;
+  using __apply _LIBCPP_NODEBUG = const volatile _To;
 };
 
 template <class _From, class _To>
-using __copy_cv_t = typename __copy_cv<_From>::template __apply<_To>;
+using __copy_cv_t _LIBCPP_NODEBUG = typename __copy_cv<_From>::template __apply<_To>;
 
 _LIBCPP_END_NAMESPACE_STD
 
diff --git a/libcxx/include/__type_traits/copy_cvref.h b/libcxx/include/__type_traits/copy_cvref.h
index 8bbf8efd..511d4e0 100644
--- a/libcxx/include/__type_traits/copy_cvref.h
+++ b/libcxx/include/__type_traits/copy_cvref.h
@@ -36,7 +36,7 @@ struct __copy_cvref<_From&&, _To> {
 };
 
 template <class _From, class _To>
-using __copy_cvref_t = typename __copy_cvref<_From, _To>::type;
+using __copy_cvref_t _LIBCPP_NODEBUG = typename __copy_cvref<_From, _To>::type;
 
 _LIBCPP_END_NAMESPACE_STD
 
diff --git a/libcxx/include/__type_traits/disjunction.h b/libcxx/include/__type_traits/disjunction.h
index 2c89528..d579de9 100644
--- a/libcxx/include/__type_traits/disjunction.h
+++ b/libcxx/include/__type_traits/disjunction.h
@@ -31,7 +31,7 @@ struct _OrImpl<true> {
 template <>
 struct _OrImpl<false> {
   template <class _Res, class...>
-  using _Result = _Res;
+  using _Result _LIBCPP_NODEBUG = _Res;
 };
 
 // _Or always performs lazy evaluation of its arguments.
diff --git a/libcxx/include/__type_traits/invoke.h b/libcxx/include/__type_traits/invoke.h
index 71db32a..6f641b9 100644
--- a/libcxx/include/__type_traits/invoke.h
+++ b/libcxx/include/__type_traits/invoke.h
@@ -44,12 +44,12 @@ template <class _Fp,
           class _DecayFp = __decay_t<_Fp>,
           class _DecayA0 = __decay_t<_A0>,
           class _ClassT  = typename __member_pointer_class_type<_DecayFp>::type>
-using __enable_if_bullet1 =
+using __enable_if_bullet1 _LIBCPP_NODEBUG =
     __enable_if_t<is_member_function_pointer<_DecayFp>::value &&
                   (is_same<_ClassT, _DecayA0>::value || is_base_of<_ClassT, _DecayA0>::value)>;
 
 template <class _Fp, class _A0, class _DecayFp = __decay_t<_Fp>, class _DecayA0 = __decay_t<_A0> >
-using __enable_if_bullet2 =
+using __enable_if_bullet2 _LIBCPP_NODEBUG =
     __enable_if_t<is_member_function_pointer<_DecayFp>::value && __is_reference_wrapper<_DecayA0>::value>;
 
 template <class _Fp,
@@ -57,7 +57,7 @@ template <class _Fp,
           class _DecayFp = __decay_t<_Fp>,
           class _DecayA0 = __decay_t<_A0>,
           class _ClassT  = typename __member_pointer_class_type<_DecayFp>::type>
-using __enable_if_bullet3 =
+using __enable_if_bullet3 _LIBCPP_NODEBUG =
     __enable_if_t<is_member_function_pointer<_DecayFp>::value &&
                   !(is_same<_ClassT, _DecayA0>::value || is_base_of<_ClassT, _DecayA0>::value) &&
                   !__is_reference_wrapper<_DecayA0>::value>;
@@ -67,12 +67,12 @@ template <class _Fp,
           class _DecayFp = __decay_t<_Fp>,
           class _DecayA0 = __decay_t<_A0>,
           class _ClassT  = typename __member_pointer_class_type<_DecayFp>::type>
-using __enable_if_bullet4 =
+using __enable_if_bullet4 _LIBCPP_NODEBUG =
     __enable_if_t<is_member_object_pointer<_DecayFp>::value &&
                   (is_same<_ClassT, _DecayA0>::value || is_base_of<_ClassT, _DecayA0>::value)>;
 
 template <class _Fp, class _A0, class _DecayFp = __decay_t<_Fp>, class _DecayA0 = __decay_t<_A0> >
-using __enable_if_bullet5 =
+using __enable_if_bullet5 _LIBCPP_NODEBUG =
     __enable_if_t<is_member_object_pointer<_DecayFp>::value && __is_reference_wrapper<_DecayA0>::value>;
 
 template <class _Fp,
@@ -80,7 +80,7 @@ template <class _Fp,
           class _DecayFp = __decay_t<_Fp>,
           class _DecayA0 = __decay_t<_A0>,
           class _ClassT  = typename __member_pointer_class_type<_DecayFp>::type>
-using __enable_if_bullet6 =
+using __enable_if_bullet6 _LIBCPP_NODEBUG =
     __enable_if_t<is_member_object_pointer<_DecayFp>::value &&
                   !(is_same<_ClassT, _DecayA0>::value || is_base_of<_ClassT, _DecayA0>::value) &&
                   !__is_reference_wrapper<_DecayA0>::value>;
@@ -159,7 +159,7 @@ struct __invokable_r {
 
   // FIXME: Check that _Ret, _Fp, and _Args... are all complete types, cv void,
   // or incomplete array types as required by the standard.
-  using _Result = decltype(__try_call<_Fp, _Args...>(0));
+  using _Result _LIBCPP_NODEBUG = decltype(__try_call<_Fp, _Args...>(0));
 
   using type              = __conditional_t<_IsNotSame<_Result, __nat>::value,
                                             __conditional_t<is_void<_Ret>::value, true_type, __is_core_convertible<_Result, _Ret> >,
@@ -167,7 +167,7 @@ struct __invokable_r {
   static const bool value = type::value;
 };
 template <class _Fp, class... _Args>
-using __invokable = __invokable_r<void, _Fp, _Args...>;
+using __invokable _LIBCPP_NODEBUG = __invokable_r<void, _Fp, _Args...>;
 
 template <bool _IsInvokable, bool _IsCVVoid, class _Ret, class _Fp, class... _Args>
 struct __nothrow_invokable_r_imp {
@@ -199,11 +199,12 @@ struct __nothrow_invokable_r_imp<true, true, _Ret, _Fp, _Args...> {
 };
 
 template <class _Ret, class _Fp, class... _Args>
-using __nothrow_invokable_r =
+using __nothrow_invokable_r _LIBCPP_NODEBUG =
     __nothrow_invokable_r_imp<__invokable_r<_Ret, _Fp, _Args...>::value, is_void<_Ret>::value, _Ret, _Fp, _Args...>;
 
 template <class _Fp, class... _Args>
-using __nothrow_invokable = __nothrow_invokable_r_imp<__invokable<_Fp, _Args...>::value, true, void, _Fp, _Args...>;
+using __nothrow_invokable _LIBCPP_NODEBUG =
+    __nothrow_invokable_r_imp<__invokable<_Fp, _Args...>::value, true, void, _Fp, _Args...>;
 
 template <class _Fp, class... _Args>
 struct __invoke_of
diff --git a/libcxx/include/__type_traits/is_always_bitcastable.h b/libcxx/include/__type_traits/is_always_bitcastable.h
index 5bc650b..4c6c43c 100644
--- a/libcxx/include/__type_traits/is_always_bitcastable.h
+++ b/libcxx/include/__type_traits/is_always_bitcastable.h
@@ -31,8 +31,8 @@ _LIBCPP_BEGIN_NAMESPACE_STD
 // considered bit-castable.
 template <class _From, class _To>
 struct __is_always_bitcastable {
-  using _UnqualFrom = __remove_cv_t<_From>;
-  using _UnqualTo   = __remove_cv_t<_To>;
+  using _UnqualFrom _LIBCPP_NODEBUG = __remove_cv_t<_From>;
+  using _UnqualTo _LIBCPP_NODEBUG   = __remove_cv_t<_To>;
 
   // clang-format off
   static const bool value =
diff --git a/libcxx/include/__type_traits/is_char_like_type.h b/libcxx/include/__type_traits/is_char_like_type.h
index 2620584..913c082 100644
--- a/libcxx/include/__type_traits/is_char_like_type.h
+++ b/libcxx/include/__type_traits/is_char_like_type.h
@@ -21,7 +21,7 @@
 _LIBCPP_BEGIN_NAMESPACE_STD
 
 template <class _CharT>
-using _IsCharLikeType = _And<is_standard_layout<_CharT>, is_trivial<_CharT> >;
+using _IsCharLikeType _LIBCPP_NODEBUG = _And<is_standard_layout<_CharT>, is_trivial<_CharT> >;
 
 _LIBCPP_END_NAMESPACE_STD
 
diff --git a/libcxx/include/__type_traits/is_equality_comparable.h b/libcxx/include/__type_traits/is_equality_comparable.h
index 4397f74..3ee1839 100644
--- a/libcxx/include/__type_traits/is_equality_comparable.h
+++ b/libcxx/include/__type_traits/is_equality_comparable.h
@@ -80,7 +80,7 @@ struct __libcpp_is_trivially_equality_comparable_impl<_Tp*, _Up*>
 };
 
 template <class _Tp, class _Up>
-using __libcpp_is_trivially_equality_comparable =
+using __libcpp_is_trivially_equality_comparable _LIBCPP_NODEBUG =
     __libcpp_is_trivially_equality_comparable_impl<__remove_cv_t<_Tp>, __remove_cv_t<_Up> >;
 
 _LIBCPP_END_NAMESPACE_STD
diff --git a/libcxx/include/__type_traits/is_execution_policy.h b/libcxx/include/__type_traits/is_execution_policy.h
index 6884f17..a2d876d 100644
--- a/libcxx/include/__type_traits/is_execution_policy.h
+++ b/libcxx/include/__type_traits/is_execution_policy.h
@@ -50,7 +50,7 @@ __remove_parallel_policy(const _ExecutionPolicy& = _ExecutionPolicy{execution::_
 // Removes the "parallel" part of an execution policy.
 // For example, turns par_unseq into unseq, and par into seq.
 template <class _ExecutionPolicy>
-using __remove_parallel_policy_t = decltype(std::__remove_parallel_policy<_ExecutionPolicy>());
+using __remove_parallel_policy_t _LIBCPP_NODEBUG = decltype(std::__remove_parallel_policy<_ExecutionPolicy>());
 
 _LIBCPP_END_NAMESPACE_STD
 
diff --git a/libcxx/include/__type_traits/is_primary_template.h b/libcxx/include/__type_traits/is_primary_template.h
index f308dfadc..5fe6820 100644
--- a/libcxx/include/__type_traits/is_primary_template.h
+++ b/libcxx/include/__type_traits/is_primary_template.h
@@ -21,10 +21,11 @@
 _LIBCPP_BEGIN_NAMESPACE_STD
 
 template <class _Tp>
-using __test_for_primary_template = __enable_if_t<_IsSame<_Tp, typename _Tp::__primary_template>::value>;
+using __test_for_primary_template _LIBCPP_NODEBUG =
+    __enable_if_t<_IsSame<_Tp, typename _Tp::__primary_template>::value>;
 
 template <class _Tp>
-using __is_primary_template = _IsValidExpansion<__test_for_primary_template, _Tp>;
+using __is_primary_template _LIBCPP_NODEBUG = _IsValidExpansion<__test_for_primary_template, _Tp>;
 
 _LIBCPP_END_NAMESPACE_STD
 
diff --git a/libcxx/include/__type_traits/is_same.h b/libcxx/include/__type_traits/is_same.h
index 9561b7b..400f870 100644
--- a/libcxx/include/__type_traits/is_same.h
+++ b/libcxx/include/__type_traits/is_same.h
@@ -34,10 +34,10 @@ inline constexpr bool is_same_v = __is_same(_Tp, _Up);
 // (such as in a dependent return type).
 
 template <class _Tp, class _Up>
-using _IsSame = _BoolConstant<__is_same(_Tp, _Up)>;
+using _IsSame _LIBCPP_NODEBUG = _BoolConstant<__is_same(_Tp, _Up)>;
 
 template <class _Tp, class _Up>
-using _IsNotSame = _BoolConstant<!__is_same(_Tp, _Up)>;
+using _IsNotSame _LIBCPP_NODEBUG = _BoolConstant<!__is_same(_Tp, _Up)>;
 
 _LIBCPP_END_NAMESPACE_STD
 
diff --git a/libcxx/include/__type_traits/is_swappable.h b/libcxx/include/__type_traits/is_swappable.h
index 221f017..aa5eecd 100644
--- a/libcxx/include/__type_traits/is_swappable.h
+++ b/libcxx/include/__type_traits/is_swappable.h
@@ -41,10 +41,11 @@ inline const bool __is_nothrow_swappable_v = __is_nothrow_swappable_with_v<_Tp&,
 
 #ifndef _LIBCPP_CXX03_LANG
 template <class _Tp>
-using __swap_result_t = __enable_if_t<is_move_constructible<_Tp>::value && is_move_assignable<_Tp>::value>;
+using __swap_result_t _LIBCPP_NODEBUG =
+    __enable_if_t<is_move_constructible<_Tp>::value && is_move_assignable<_Tp>::value>;
 #else
 template <class>
-using __swap_result_t = void;
+using __swap_result_t _LIBCPP_NODEBUG = void;
 #endif
 
 template <class _Tp>
diff --git a/libcxx/include/__type_traits/make_32_64_or_128_bit.h b/libcxx/include/__type_traits/make_32_64_or_128_bit.h
index 70f84fc..7016209 100644
--- a/libcxx/include/__type_traits/make_32_64_or_128_bit.h
+++ b/libcxx/include/__type_traits/make_32_64_or_128_bit.h
@@ -31,7 +31,7 @@ template <class _Tp>
   requires(is_signed_v<_Tp> || is_unsigned_v<_Tp> || is_same_v<_Tp, char>)
 #endif
 // clang-format off
-using __make_32_64_or_128_bit_t =
+using __make_32_64_or_128_bit_t _LIBCPP_NODEBUG =
     __copy_unsigned_t<_Tp,
         __conditional_t<sizeof(_Tp) <= sizeof(int32_t),    int32_t,
         __conditional_t<sizeof(_Tp) <= sizeof(int64_t),    int64_t,
diff --git a/libcxx/include/__type_traits/make_const_lvalue_ref.h b/libcxx/include/__type_traits/make_const_lvalue_ref.h
index 469d4cb..f995533 100644
--- a/libcxx/include/__type_traits/make_const_lvalue_ref.h
+++ b/libcxx/include/__type_traits/make_const_lvalue_ref.h
@@ -19,7 +19,7 @@
 _LIBCPP_BEGIN_NAMESPACE_STD
 
 template <class _Tp>
-using __make_const_lvalue_ref = const __libcpp_remove_reference_t<_Tp>&;
+using __make_const_lvalue_ref _LIBCPP_NODEBUG = const __libcpp_remove_reference_t<_Tp>&;
 
 _LIBCPP_END_NAMESPACE_STD
 
diff --git a/libcxx/include/__type_traits/make_signed.h b/libcxx/include/__type_traits/make_signed.h
index 8070690..88513fe 100644
--- a/libcxx/include/__type_traits/make_signed.h
+++ b/libcxx/include/__type_traits/make_signed.h
@@ -26,24 +26,20 @@ _LIBCPP_BEGIN_NAMESPACE_STD
 #if __has_builtin(__make_signed)
 
 template <class _Tp>
-using __make_signed_t = __make_signed(_Tp);
+using __make_signed_t _LIBCPP_NODEBUG = __make_signed(_Tp);
 
 #else
-// clang-format off
-typedef __type_list<signed char,
-        __type_list<signed short,
-        __type_list<signed int,
-        __type_list<signed long,
-        __type_list<signed long long,
-#  if _LIBCPP_HAS_INT128
-        __type_list<__int128_t,
-#  endif
-        __nat
+using __signed_types =
+    __type_list<signed char,
+                signed short,
+                signed int,
+                signed long,
+                signed long long
 #  if _LIBCPP_HAS_INT128
-        >
+                ,
+                __int128_t
 #  endif
-        > > > > > __signed_types;
-// clang-format on
+                >;
 
 template <class _Tp, bool = is_integral<_Tp>::value || is_enum<_Tp>::value>
 struct __make_signed{};
diff --git a/libcxx/include/__type_traits/make_unsigned.h b/libcxx/include/__type_traits/make_unsigned.h
index 562f7ba..83ff8b7 100644
--- a/libcxx/include/__type_traits/make_unsigned.h
+++ b/libcxx/include/__type_traits/make_unsigned.h
@@ -28,24 +28,20 @@ _LIBCPP_BEGIN_NAMESPACE_STD
 #if __has_builtin(__make_unsigned)
 
 template <class _Tp>
-using __make_unsigned_t = __make_unsigned(_Tp);
+using __make_unsigned_t _LIBCPP_NODEBUG = __make_unsigned(_Tp);
 
 #else
-// clang-format off
-typedef __type_list<unsigned char,
-        __type_list<unsigned short,
-        __type_list<unsigned int,
-        __type_list<unsigned long,
-        __type_list<unsigned long long,
-#  if _LIBCPP_HAS_INT128
-        __type_list<__uint128_t,
-#  endif
-        __nat
+using __unsigned_types =
+    __type_list<unsigned char,
+                unsigned short,
+                unsigned int,
+                unsigned long,
+                unsigned long long
 #  if _LIBCPP_HAS_INT128
-        >
+                ,
+                __uint128_t
 #  endif
-        > > > > > __unsigned_types;
-// clang-format on
+                >;
 
 template <class _Tp, bool = is_integral<_Tp>::value || is_enum<_Tp>::value>
 struct __make_unsigned{};
@@ -92,7 +88,7 @@ _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR __make_unsigned_t<_Tp> __to_unsigned_lik
 }
 
 template <class _Tp, class _Up>
-using __copy_unsigned_t = __conditional_t<is_unsigned<_Tp>::value, __make_unsigned_t<_Up>, _Up>;
+using __copy_unsigned_t _LIBCPP_NODEBUG = __conditional_t<is_unsigned<_Tp>::value, __make_unsigned_t<_Up>, _Up>;
 
 _LIBCPP_END_NAMESPACE_STD
 
diff --git a/libcxx/include/__type_traits/maybe_const.h b/libcxx/include/__type_traits/maybe_const.h
index 25fba58..7ef742a 100644
--- a/libcxx/include/__type_traits/maybe_const.h
+++ b/libcxx/include/__type_traits/maybe_const.h
@@ -19,7 +19,7 @@
 _LIBCPP_BEGIN_NAMESPACE_STD
 
 template <bool _Const, class _Tp>
-using __maybe_const = __conditional_t<_Const, const _Tp, _Tp>;
+using __maybe_const _LIBCPP_NODEBUG = __conditional_t<_Const, const _Tp, _Tp>;
 
 _LIBCPP_END_NAMESPACE_STD
 
diff --git a/libcxx/include/__type_traits/remove_all_extents.h b/libcxx/include/__type_traits/remove_all_extents.h
index db7dab4..d46a322 100644
--- a/libcxx/include/__type_traits/remove_all_extents.h
+++ b/libcxx/include/__type_traits/remove_all_extents.h
@@ -25,7 +25,7 @@ struct remove_all_extents {
 };
 
 template <class _Tp>
-using __remove_all_extents_t = __remove_all_extents(_Tp);
+using __remove_all_extents_t _LIBCPP_NODEBUG = __remove_all_extents(_Tp);
 #else
 template <class _Tp>
 struct _LIBCPP_TEMPLATE_VIS remove_all_extents {
diff --git a/libcxx/include/__type_traits/remove_const.h b/libcxx/include/__type_traits/remove_const.h
index a3f0648..6250d9f 100644
--- a/libcxx/include/__type_traits/remove_const.h
+++ b/libcxx/include/__type_traits/remove_const.h
@@ -24,7 +24,7 @@ struct remove_const {
 };
 
 template <class _Tp>
-using __remove_const_t = __remove_const(_Tp);
+using __remove_const_t _LIBCPP_NODEBUG = __remove_const(_Tp);
 #else
 template <class _Tp>
 struct _LIBCPP_TEMPLATE_VIS remove_const {
diff --git a/libcxx/include/__type_traits/remove_const_ref.h b/libcxx/include/__type_traits/remove_const_ref.h
index d3b3349..e6583b3 100644
--- a/libcxx/include/__type_traits/remove_const_ref.h
+++ b/libcxx/include/__type_traits/remove_const_ref.h
@@ -20,7 +20,7 @@
 _LIBCPP_BEGIN_NAMESPACE_STD
 
 template <class _Tp>
-using __remove_const_ref_t = __remove_const_t<__libcpp_remove_reference_t<_Tp> >;
+using __remove_const_ref_t _LIBCPP_NODEBUG = __remove_const_t<__libcpp_remove_reference_t<_Tp> >;
 
 _LIBCPP_END_NAMESPACE_STD
 
diff --git a/libcxx/include/__type_traits/remove_cv.h b/libcxx/include/__type_traits/remove_cv.h
index 50e9f3e..16848e6 100644
--- a/libcxx/include/__type_traits/remove_cv.h
+++ b/libcxx/include/__type_traits/remove_cv.h
@@ -24,10 +24,10 @@ struct remove_cv {
 
 #if defined(_LIBCPP_COMPILER_GCC)
 template <class _Tp>
-using __remove_cv_t = typename remove_cv<_Tp>::type;
+using __remove_cv_t _LIBCPP_NODEBUG = typename remove_cv<_Tp>::type;
 #else
 template <class _Tp>
-using __remove_cv_t = __remove_cv(_Tp);
+using __remove_cv_t _LIBCPP_NODEBUG = __remove_cv(_Tp);
 #endif
 
 #if _LIBCPP_STD_VER >= 14
diff --git a/libcxx/include/__type_traits/remove_cvref.h b/libcxx/include/__type_traits/remove_cvref.h
index 55f894d..e3c6594 100644
--- a/libcxx/include/__type_traits/remove_cvref.h
+++ b/libcxx/include/__type_traits/remove_cvref.h
@@ -34,7 +34,7 @@ using __remove_cvref_t _LIBCPP_NODEBUG = __remove_cvref(_Tp);
 #endif // __has_builtin(__remove_cvref)
 
 template <class _Tp, class _Up>
-using __is_same_uncvref = _IsSame<__remove_cvref_t<_Tp>, __remove_cvref_t<_Up> >;
+using __is_same_uncvref _LIBCPP_NODEBUG = _IsSame<__remove_cvref_t<_Tp>, __remove_cvref_t<_Up> >;
 
 #if _LIBCPP_STD_VER >= 20
 template <class _Tp>
diff --git a/libcxx/include/__type_traits/remove_extent.h b/libcxx/include/__type_traits/remove_extent.h
index aceeb47..95a7971 100644
--- a/libcxx/include/__type_traits/remove_extent.h
+++ b/libcxx/include/__type_traits/remove_extent.h
@@ -25,7 +25,7 @@ struct remove_extent {
 };
 
 template <class _Tp>
-using __remove_extent_t = __remove_extent(_Tp);
+using __remove_extent_t _LIBCPP_NODEBUG = __remove_extent(_Tp);
 #else
 template <class _Tp>
 struct _LIBCPP_TEMPLATE_VIS remove_extent {
diff --git a/libcxx/include/__type_traits/remove_pointer.h b/libcxx/include/__type_traits/remove_pointer.h
index 6f98ed1..47cd1cd 100644
--- a/libcxx/include/__type_traits/remove_pointer.h
+++ b/libcxx/include/__type_traits/remove_pointer.h
@@ -25,10 +25,10 @@ struct remove_pointer {
 
 #  ifdef _LIBCPP_COMPILER_GCC
 template <class _Tp>
-using __remove_pointer_t = typename remove_pointer<_Tp>::type;
+using __remove_pointer_t _LIBCPP_NODEBUG = typename remove_pointer<_Tp>::type;
 #  else
 template <class _Tp>
-using __remove_pointer_t = __remove_pointer(_Tp);
+using __remove_pointer_t _LIBCPP_NODEBUG = __remove_pointer(_Tp);
 #  endif
 #else
 // clang-format off
diff --git a/libcxx/include/__type_traits/remove_reference.h b/libcxx/include/__type_traits/remove_reference.h
index ba67891..f688156 100644
--- a/libcxx/include/__type_traits/remove_reference.h
+++ b/libcxx/include/__type_traits/remove_reference.h
@@ -24,7 +24,7 @@ struct remove_reference {
 };
 
 template <class _Tp>
-using __libcpp_remove_reference_t = __remove_reference_t(_Tp);
+using __libcpp_remove_reference_t _LIBCPP_NODEBUG = __remove_reference_t(_Tp);
 #elif __has_builtin(__remove_reference)
 template <class _Tp>
 struct remove_reference {
diff --git a/libcxx/include/__type_traits/remove_volatile.h b/libcxx/include/__type_traits/remove_volatile.h
index 7600ae0..099945d 100644
--- a/libcxx/include/__type_traits/remove_volatile.h
+++ b/libcxx/include/__type_traits/remove_volatile.h
@@ -24,7 +24,7 @@ struct remove_volatile {
 };
 
 template <class _Tp>
-using __remove_volatile_t = __remove_volatile(_Tp);
+using __remove_volatile_t _LIBCPP_NODEBUG = __remove_volatile(_Tp);
 #else
 template <class _Tp>
 struct _LIBCPP_TEMPLATE_VIS remove_volatile {
diff --git a/libcxx/include/__type_traits/type_list.h b/libcxx/include/__type_traits/type_list.h
index b4898b3..34d78fc 100644
--- a/libcxx/include/__type_traits/type_list.h
+++ b/libcxx/include/__type_traits/type_list.h
@@ -11,6 +11,7 @@
 
 #include <__config>
 #include <__cstddef/size_t.h>
+#include <__type_traits/enable_if.h>
 
 #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
 #  pragma GCC system_header
@@ -18,23 +19,28 @@
 
 _LIBCPP_BEGIN_NAMESPACE_STD
 
-template <class _Hp, class _Tp>
-struct __type_list {
-  typedef _Hp _Head;
-  typedef _Tp _Tail;
+template <class... _Types>
+struct __type_list {};
+
+template <class>
+struct __type_list_head;
+
+template <class _Head, class... _Tail>
+struct __type_list_head<__type_list<_Head, _Tail...> > {
+  using type _LIBCPP_NODEBUG = _Head;
 };
 
-template <class _TypeList, size_t _Size, bool = _Size <= sizeof(typename _TypeList::_Head)>
+template <class _TypeList, size_t _Size, bool = _Size <= sizeof(typename __type_list_head<_TypeList>::type)>
 struct __find_first;
 
-template <class _Hp, class _Tp, size_t _Size>
-struct __find_first<__type_list<_Hp, _Tp>, _Size, true> {
-  using type _LIBCPP_NODEBUG = _Hp;
+template <class _Head, class... _Tail, size_t _Size>
+struct __find_first<__type_list<_Head, _Tail...>, _Size, true> {
+  using type _LIBCPP_NODEBUG = _Head;
 };
 
-template <class _Hp, class _Tp, size_t _Size>
-struct __find_first<__type_list<_Hp, _Tp>, _Size, false> {
-  using type _LIBCPP_NODEBUG = typename __find_first<_Tp, _Size>::type;
+template <class _Head, class... _Tail, size_t _Size>
+struct __find_first<__type_list<_Head, _Tail...>, _Size, false> {
+  using type _LIBCPP_NODEBUG = typename __find_first<__type_list<_Tail...>, _Size>::type;
 };
 
 _LIBCPP_END_NAMESPACE_STD
diff --git a/libcxx/include/__type_traits/unwrap_ref.h b/libcxx/include/__type_traits/unwrap_ref.h
index 74c4fde..11a069d 100644
--- a/libcxx/include/__type_traits/unwrap_ref.h
+++ b/libcxx/include/__type_traits/unwrap_ref.h
@@ -29,6 +29,9 @@ struct __unwrap_reference<reference_wrapper<_Tp> > {
   using type _LIBCPP_NODEBUG = _Tp&;
 };
 
+template <class _Tp>
+using __unwrap_ref_decay_t _LIBCPP_NODEBUG = typename __unwrap_reference<__decay_t<_Tp> >::type;
+
 #if _LIBCPP_STD_VER >= 20
 template <class _Tp>
 struct unwrap_reference : __unwrap_reference<_Tp> {};
@@ -40,19 +43,9 @@ template <class _Tp>
 struct unwrap_ref_decay : unwrap_reference<__decay_t<_Tp> > {};
 
 template <class _Tp>
-using unwrap_ref_decay_t = typename unwrap_ref_decay<_Tp>::type;
+using unwrap_ref_decay_t = __unwrap_ref_decay_t<_Tp>;
 #endif // _LIBCPP_STD_VER >= 20
 
-template <class _Tp>
-struct __unwrap_ref_decay
-#if _LIBCPP_STD_VER >= 20
-    : unwrap_ref_decay<_Tp>
-#else
-    : __unwrap_reference<__decay_t<_Tp> >
-#endif
-{
-};
-
 _LIBCPP_END_NAMESPACE_STD
 
 #endif // _LIBCPP___TYPE_TRAITS_UNWRAP_REF_H
diff --git a/libcxx/include/__type_traits/void_t.h b/libcxx/include/__type_traits/void_t.h
index 985bba0..8adadfa 100644
--- a/libcxx/include/__type_traits/void_t.h
+++ b/libcxx/include/__type_traits/void_t.h
@@ -23,7 +23,7 @@ using void_t = void;
 #endif
 
 template <class...>
-using __void_t = void;
+using __void_t _LIBCPP_NODEBUG = void;
 
 _LIBCPP_END_NAMESPACE_STD
 
diff --git a/libcxx/include/__utility/exception_guard.h b/libcxx/include/__utility/exception_guard.h
index 71e52fd..a6b4ec521 100644
--- a/libcxx/include/__utility/exception_guard.h
+++ b/libcxx/include/__utility/exception_guard.h
@@ -126,10 +126,10 @@ _LIBCPP_CTAD_SUPPORTED_FOR_TYPE(__exception_guard_noexceptions);
 
 #if !_LIBCPP_HAS_EXCEPTIONS
 template <class _Rollback>
-using __exception_guard = __exception_guard_noexceptions<_Rollback>;
+using __exception_guard _LIBCPP_NODEBUG = __exception_guard_noexceptions<_Rollback>;
 #else
 template <class _Rollback>
-using __exception_guard = __exception_guard_exceptions<_Rollback>;
+using __exception_guard _LIBCPP_NODEBUG = __exception_guard_exceptions<_Rollback>;
 #endif
 
 template <class _Rollback>
diff --git a/libcxx/include/__utility/forward_like.h b/libcxx/include/__utility/forward_like.h
index 67bdf6d..409f716 100644
--- a/libcxx/include/__utility/forward_like.h
+++ b/libcxx/include/__utility/forward_like.h
@@ -26,13 +26,13 @@ _LIBCPP_BEGIN_NAMESPACE_STD
 #if _LIBCPP_STD_VER >= 23
 
 template <class _Ap, class _Bp>
-using _CopyConst = _If<is_const_v<_Ap>, const _Bp, _Bp>;
+using _CopyConst _LIBCPP_NODEBUG = _If<is_const_v<_Ap>, const _Bp, _Bp>;
 
 template <class _Ap, class _Bp>
-using _OverrideRef = _If<is_rvalue_reference_v<_Ap>, remove_reference_t<_Bp>&&, _Bp&>;
+using _OverrideRef _LIBCPP_NODEBUG = _If<is_rvalue_reference_v<_Ap>, remove_reference_t<_Bp>&&, _Bp&>;
 
 template <class _Ap, class _Bp>
-using _ForwardLike = _OverrideRef<_Ap&&, _CopyConst<remove_reference_t<_Ap>, remove_reference_t<_Bp>>>;
+using _ForwardLike _LIBCPP_NODEBUG = _OverrideRef<_Ap&&, _CopyConst<remove_reference_t<_Ap>, remove_reference_t<_Bp>>>;
 
 template <class _Tp, class _Up>
 [[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr auto
diff --git a/libcxx/include/__utility/in_place.h b/libcxx/include/__utility/in_place.h
index edaa4e0..9b48446 100644
--- a/libcxx/include/__utility/in_place.h
+++ b/libcxx/include/__utility/in_place.h
@@ -47,7 +47,7 @@ template <class _Tp>
 struct __is_inplace_type_imp<in_place_type_t<_Tp>> : true_type {};
 
 template <class _Tp>
-using __is_inplace_type = __is_inplace_type_imp<__remove_cvref_t<_Tp>>;
+using __is_inplace_type _LIBCPP_NODEBUG = __is_inplace_type_imp<__remove_cvref_t<_Tp>>;
 
 template <class _Tp>
 struct __is_inplace_index_imp : false_type {};
@@ -55,7 +55,7 @@ template <size_t _Idx>
 struct __is_inplace_index_imp<in_place_index_t<_Idx>> : true_type {};
 
 template <class _Tp>
-using __is_inplace_index = __is_inplace_index_imp<__remove_cvref_t<_Tp>>;
+using __is_inplace_index _LIBCPP_NODEBUG = __is_inplace_index_imp<__remove_cvref_t<_Tp>>;
 
 #endif // _LIBCPP_STD_VER >= 17
 
diff --git a/libcxx/include/__utility/integer_sequence.h b/libcxx/include/__utility/integer_sequence.h
index 35eb606..2c1ff3c 100644
--- a/libcxx/include/__utility/integer_sequence.h
+++ b/libcxx/include/__utility/integer_sequence.h
@@ -25,19 +25,19 @@ struct __tuple_indices;
 template <class _IdxType, _IdxType... _Values>
 struct __integer_sequence {
   template <template <class _OIdxType, _OIdxType...> class _ToIndexSeq, class _ToIndexType>
-  using __convert = _ToIndexSeq<_ToIndexType, _Values...>;
+  using __convert _LIBCPP_NODEBUG = _ToIndexSeq<_ToIndexType, _Values...>;
 
   template <size_t _Sp>
-  using __to_tuple_indices = __tuple_indices<(_Values + _Sp)...>;
+  using __to_tuple_indices _LIBCPP_NODEBUG = __tuple_indices<(_Values + _Sp)...>;
 };
 
 #if __has_builtin(__make_integer_seq)
 template <size_t _Ep, size_t _Sp>
-using __make_indices_imp =
+using __make_indices_imp _LIBCPP_NODEBUG =
     typename __make_integer_seq<__integer_sequence, size_t, _Ep - _Sp>::template __to_tuple_indices<_Sp>;
 #elif __has_builtin(__integer_pack)
 template <size_t _Ep, size_t _Sp>
-using __make_indices_imp =
+using __make_indices_imp _LIBCPP_NODEBUG =
     typename __integer_sequence<size_t, __integer_pack(_Ep - _Sp)...>::template __to_tuple_indices<_Sp>;
 #else
 #  error "No known way to get an integer pack from the compiler"
diff --git a/libcxx/include/__utility/move.h b/libcxx/include/__utility/move.h
index 015986f..bc16697 100644
--- a/libcxx/include/__utility/move.h
+++ b/libcxx/include/__utility/move.h
@@ -33,7 +33,7 @@ move(_LIBCPP_LIFETIMEBOUND _Tp&& __t) _NOEXCEPT {
 }
 
 template <class _Tp>
-using __move_if_noexcept_result_t =
+using __move_if_noexcept_result_t _LIBCPP_NODEBUG =
     __conditional_t<!is_nothrow_move_constructible<_Tp>::value && is_copy_constructible<_Tp>::value, const _Tp&, _Tp&&>;
 
 template <class _Tp>
diff --git a/libcxx/include/__utility/pair.h b/libcxx/include/__utility/pair.h
index f9d0f4e..7689ab2 100644
--- a/libcxx/include/__utility/pair.h
+++ b/libcxx/include/__utility/pair.h
@@ -71,7 +71,7 @@ struct _LIBCPP_TEMPLATE_VIS pair
   _T1 first;
   _T2 second;
 
-  using __trivially_relocatable =
+  using __trivially_relocatable _LIBCPP_NODEBUG =
       __conditional_t<__libcpp_is_trivially_relocatable<_T1>::value && __libcpp_is_trivially_relocatable<_T2>::value,
                       pair,
                       void>;
@@ -532,11 +532,9 @@ swap(const pair<_T1, _T2>& __x, const pair<_T1, _T2>& __y) noexcept(noexcept(__x
 #endif
 
 template <class _T1, class _T2>
-inline _LIBCPP_HIDE_FROM_ABI
-_LIBCPP_CONSTEXPR_SINCE_CXX14 pair<typename __unwrap_ref_decay<_T1>::type, typename __unwrap_ref_decay<_T2>::type>
+inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 pair<__unwrap_ref_decay_t<_T1>, __unwrap_ref_decay_t<_T2> >
 make_pair(_T1&& __t1, _T2&& __t2) {
-  return pair<typename __unwrap_ref_decay<_T1>::type, typename __unwrap_ref_decay<_T2>::type>(
-      std::forward<_T1>(__t1), std::forward<_T2>(__t2));
+  return pair<__unwrap_ref_decay_t<_T1>, __unwrap_ref_decay_t<_T2> >(std::forward<_T1>(__t1), std::forward<_T2>(__t2));
 }
 
 template <class _T1, class _T2>
diff --git a/libcxx/include/__utility/swap.h b/libcxx/include/__utility/swap.h
index 666d6d5..b431154 100644
--- a/libcxx/include/__utility/swap.h
+++ b/libcxx/include/__utility/swap.h
@@ -31,10 +31,11 @@ _LIBCPP_BEGIN_NAMESPACE_STD
 
 #ifndef _LIBCPP_CXX03_LANG
 template <class _Tp>
-using __swap_result_t = __enable_if_t<is_move_constructible<_Tp>::value && is_move_assignable<_Tp>::value>;
+using __swap_result_t _LIBCPP_NODEBUG =
+    __enable_if_t<is_move_constructible<_Tp>::value && is_move_assignable<_Tp>::value>;
 #else
 template <class>
-using __swap_result_t = void;
+using __swap_result_t _LIBCPP_NODEBUG = void;
 #endif
 
 template <class _Tp>
diff --git a/libcxx/include/__vector/vector.h b/libcxx/include/__vector/vector.h
index 6ba7ba7..ddbf123 100644
--- a/libcxx/include/__vector/vector.h
+++ b/libcxx/include/__vector/vector.h
@@ -114,7 +114,7 @@ public:
   // - pointer: may be trivially relocatable, so it's checked
   // - allocator_type: may be trivially relocatable, so it's checked
   // vector doesn't contain any self-references, so it's trivially relocatable if its members are.
-  using __trivially_relocatable = __conditional_t<
+  using __trivially_relocatable _LIBCPP_NODEBUG = __conditional_t<
       __libcpp_is_trivially_relocatable<pointer>::value && __libcpp_is_trivially_relocatable<allocator_type>::value,
       vector,
       void>;
diff --git a/libcxx/include/__vector/vector_bool.h b/libcxx/include/__vector/vector_bool.h
index 525fc35..6c6605f 100644
--- a/libcxx/include/__vector/vector_bool.h
+++ b/libcxx/include/__vector/vector_bool.h
@@ -275,17 +275,33 @@ public:
   }
   _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 const_reverse_iterator crend() const _NOEXCEPT { return rend(); }
 
-  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 reference operator[](size_type __n) { return __make_ref(__n); }
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 reference operator[](size_type __n) {
+    _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(__n < size(), "vector<bool>::operator[] index out of bounds");
+    return __make_ref(__n);
+  }
   _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 const_reference operator[](size_type __n) const {
+    _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(__n < size(), "vector<bool>::operator[] index out of bounds");
     return __make_ref(__n);
   }
-  _LIBCPP_HIDE_FROM_ABI reference at(size_type __n);
-  _LIBCPP_HIDE_FROM_ABI const_reference at(size_type __n) const;
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 reference at(size_type __n);
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 const_reference at(size_type __n) const;
 
-  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 reference front() { return __make_ref(0); }
-  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 const_reference front() const { return __make_ref(0); }
-  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 reference back() { return __make_ref(__size_ - 1); }
-  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 const_reference back() const { return __make_ref(__size_ - 1); }
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 reference front() {
+    _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(!empty(), "vector<bool>::front() called on an empty vector");
+    return __make_ref(0);
+  }
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 const_reference front() const {
+    _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(!empty(), "vector<bool>::front() called on an empty vector");
+    return __make_ref(0);
+  }
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 reference back() {
+    _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(!empty(), "vector<bool>::back() called on an empty vector");
+    return __make_ref(__size_ - 1);
+  }
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 const_reference back() const {
+    _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(!empty(), "vector<bool>::back() called on an empty vector");
+    return __make_ref(__size_ - 1);
+  }
 
   _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void push_back(const value_type& __x);
 #if _LIBCPP_STD_VER >= 14
@@ -310,7 +326,10 @@ public:
   }
 #endif
 
-  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void pop_back() { --__size_; }
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void pop_back() {
+    _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(!empty(), "vector<bool>::pop_back called on an empty vector");
+    --__size_;
+  }
 
 #if _LIBCPP_STD_VER >= 14
   template <class... _Args>
@@ -442,7 +461,6 @@ private:
   template <class _InputIterator, class _Sentinel>
   _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void
   __construct_at_end(_InputIterator __first, _Sentinel __last, size_type __n);
-  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void __append(size_type __n, const_reference __x);
   _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 reference __make_ref(size_type __pos) _NOEXCEPT {
     return reference(__begin_ + __pos / __bits_per_word, __storage_type(1) << __pos % __bits_per_word);
   }
@@ -854,14 +872,15 @@ _LIBCPP_CONSTEXPR_SINCE_CXX20 void vector<bool, _Allocator>::shrink_to_fit() _NO
 }
 
 template <class _Allocator>
-typename vector<bool, _Allocator>::reference vector<bool, _Allocator>::at(size_type __n) {
+_LIBCPP_CONSTEXPR_SINCE_CXX20 typename vector<bool, _Allocator>::reference vector<bool, _Allocator>::at(size_type __n) {
   if (__n >= size())
     this->__throw_out_of_range();
   return (*this)[__n];
 }
 
 template <class _Allocator>
-typename vector<bool, _Allocator>::const_reference vector<bool, _Allocator>::at(size_type __n) const {
+_LIBCPP_CONSTEXPR_SINCE_CXX20 typename vector<bool, _Allocator>::const_reference
+vector<bool, _Allocator>::at(size_type __n) const {
   if (__n >= size())
     this->__throw_out_of_range();
   return (*this)[__n];
@@ -995,6 +1014,8 @@ vector<bool, _Allocator>::__insert_with_size(
 template <class _Allocator>
 inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 typename vector<bool, _Allocator>::iterator
 vector<bool, _Allocator>::erase(const_iterator __position) {
+  _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(
+      __position != end(), "vector<bool>::erase(iterator) called with a non-dereferenceable iterator");
   iterator __r = __const_iterator_cast(__position);
   std::copy(__position + 1, this->cend(), __r);
   --__size_;
@@ -1004,6 +1025,8 @@ vector<bool, _Allocator>::erase(const_iterator __position) {
 template <class _Allocator>
 _LIBCPP_CONSTEXPR_SINCE_CXX20 typename vector<bool, _Allocator>::iterator
 vector<bool, _Allocator>::erase(const_iterator __first, const_iterator __last) {
+  _LIBCPP_ASSERT_VALID_INPUT_RANGE(
+      __first <= __last, "vector<bool>::erase(iterator, iterator) called with an invalid range");
   iterator __r        = __const_iterator_cast(__first);
   difference_type __d = __last - __first;
   std::copy(__last, this->cend(), __r);
diff --git a/libcxx/include/any b/libcxx/include/any
index 934c4db..786e86b 100644
--- a/libcxx/include/any
+++ b/libcxx/include/any
@@ -149,11 +149,11 @@ _LIBCPP_HIDE_FROM_ABI add_pointer_t<_ValueType> any_cast(any*) _NOEXCEPT;
 
 namespace __any_imp {
 _LIBCPP_SUPPRESS_DEPRECATED_PUSH
-using _Buffer = aligned_storage_t<3 * sizeof(void*), alignof(void*)>;
+using _Buffer _LIBCPP_NODEBUG = aligned_storage_t<3 * sizeof(void*), alignof(void*)>;
 _LIBCPP_SUPPRESS_DEPRECATED_POP
 
 template <class _Tp>
-using _IsSmallObject =
+using _IsSmallObject _LIBCPP_NODEBUG =
     integral_constant<bool,
                       sizeof(_Tp) <= sizeof(_Buffer) && alignof(_Buffer) % alignof(_Tp) == 0 &&
                           is_nothrow_move_constructible<_Tp>::value >;
@@ -185,7 +185,7 @@ inline _LIBCPP_HIDE_FROM_ABI bool __compare_typeid(type_info const* __id, const
 }
 
 template <class _Tp>
-using _Handler = conditional_t< _IsSmallObject<_Tp>::value, _SmallHandler<_Tp>, _LargeHandler<_Tp>>;
+using _Handler _LIBCPP_NODEBUG = conditional_t< _IsSmallObject<_Tp>::value, _SmallHandler<_Tp>, _LargeHandler<_Tp>>;
 
 } // namespace __any_imp
 
@@ -278,8 +278,9 @@ public:
 #    endif
 
 private:
-  typedef __any_imp::_Action _Action;
-  using _HandleFuncPtr = void* (*)(_Action, any const*, any*, const type_info*, const void* __fallback_info);
+  using _Action _LIBCPP_NODEBUG = __any_imp::_Action;
+  using _HandleFuncPtr
+      _LIBCPP_NODEBUG = void* (*)(_Action, any const*, any*, const type_info*, const void* __fallback_info);
 
   union _Storage {
     _LIBCPP_HIDE_FROM_ABI constexpr _Storage() : __ptr(nullptr) {}
diff --git a/libcxx/include/array b/libcxx/include/array
index 516d965..1b9bcd6 100644
--- a/libcxx/include/array
+++ b/libcxx/include/array
@@ -173,15 +173,16 @@ _LIBCPP_BEGIN_NAMESPACE_STD
 
 template <class _Tp, size_t _Size>
 struct _LIBCPP_TEMPLATE_VIS array {
-  using __trivially_relocatable = __conditional_t<__libcpp_is_trivially_relocatable<_Tp>::value, array, void>;
+  using __trivially_relocatable _LIBCPP_NODEBUG =
+      __conditional_t<__libcpp_is_trivially_relocatable<_Tp>::value, array, void>;
 
   // types:
-  using __self          = array;
-  using value_type      = _Tp;
-  using reference       = value_type&;
-  using const_reference = const value_type&;
-  using pointer         = value_type*;
-  using const_pointer   = const value_type*;
+  using __self _LIBCPP_NODEBUG = array;
+  using value_type             = _Tp;
+  using reference              = value_type&;
+  using const_reference        = const value_type&;
+  using pointer                = value_type*;
+  using const_pointer          = const value_type*;
 #  if defined(_LIBCPP_ABI_BOUNDED_ITERATORS_IN_STD_ARRAY)
   using iterator       = __static_bounded_iter<pointer, _Size>;
   using const_iterator = __static_bounded_iter<const_pointer, _Size>;
@@ -299,12 +300,12 @@ struct _LIBCPP_TEMPLATE_VIS array {
 template <class _Tp>
 struct _LIBCPP_TEMPLATE_VIS array<_Tp, 0> {
   // types:
-  using __self          = array;
-  using value_type      = _Tp;
-  using reference       = value_type&;
-  using const_reference = const value_type&;
-  using pointer         = value_type*;
-  using const_pointer   = const value_type*;
+  using __self _LIBCPP_NODEBUG = array;
+  using value_type             = _Tp;
+  using reference              = value_type&;
+  using const_reference        = const value_type&;
+  using pointer                = value_type*;
+  using const_pointer          = const value_type*;
 #  if defined(_LIBCPP_ABI_BOUNDED_ITERATORS_IN_STD_ARRAY)
   using iterator       = __static_bounded_iter<pointer, 0>;
   using const_iterator = __static_bounded_iter<const_pointer, 0>;
@@ -320,7 +321,7 @@ struct _LIBCPP_TEMPLATE_VIS array<_Tp, 0> {
   using reverse_iterator       = std::reverse_iterator<iterator>;
   using const_reverse_iterator = std::reverse_iterator<const_iterator>;
 
-  using _EmptyType = __conditional_t<is_const<_Tp>::value, const __empty, __empty>;
+  using _EmptyType _LIBCPP_NODEBUG = __conditional_t<is_const<_Tp>::value, const __empty, __empty>;
 
   struct _ArrayInStructT {
     _Tp __data_[1];
diff --git a/libcxx/include/barrier b/libcxx/include/barrier
index 6861532b..91dfa97 100644
--- a/libcxx/include/barrier
+++ b/libcxx/include/barrier
@@ -95,7 +95,7 @@ It looks different from literature pseudocode for two main reasons:
 
 */
 
-using __barrier_phase_t = uint8_t;
+using __barrier_phase_t _LIBCPP_NODEBUG = uint8_t;
 
 class __barrier_algorithm_base;
 
diff --git a/libcxx/include/bitset b/libcxx/include/bitset
index 8b36182..919d2a0 100644
--- a/libcxx/include/bitset
+++ b/libcxx/include/bitset
@@ -133,6 +133,7 @@ template <size_t N> struct hash<std::bitset<N>>;
 #  include <__algorithm/fill.h>
 #  include <__algorithm/fill_n.h>
 #  include <__algorithm/find.h>
+#  include <__assert>
 #  include <__bit_reference>
 #  include <__config>
 #  include <__functional/hash.h>
@@ -683,13 +684,18 @@ public:
 
   // element access:
 #  ifdef _LIBCPP_ABI_BITSET_VECTOR_BOOL_CONST_SUBSCRIPT_RETURN_BOOL
-  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR bool operator[](size_t __p) const { return __base::__make_ref(__p); }
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR bool operator[](size_t __p) const {
+    _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(__p < _Size, "bitset::operator[] index out of bounds");
+    return __base::__make_ref(__p);
+  }
 #  else
   _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR const_reference operator[](size_t __p) const {
+    _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(__p < _Size, "bitset::operator[] index out of bounds");
     return __base::__make_ref(__p);
   }
 #  endif
   _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 reference operator[](size_t __p) {
+    _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(__p < _Size, "bitset::operator[] index out of bounds");
     return __base::__make_ref(__p);
   }
   _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 unsigned long to_ulong() const;
diff --git a/libcxx/include/ccomplex b/libcxx/include/ccomplex
index 10eb8a3..ee7e088 100644
--- a/libcxx/include/ccomplex
+++ b/libcxx/include/ccomplex
@@ -28,13 +28,14 @@
 
 #  if _LIBCPP_STD_VER >= 20
 
-using __standard_header_ccomplex _LIBCPP_DEPRECATED_("removed in C++20. Include <complex> instead.") = void;
-using __use_standard_header_ccomplex = __standard_header_ccomplex;
+using __standard_header_ccomplex
+    _LIBCPP_DEPRECATED_("removed in C++20. Include <complex> instead.") _LIBCPP_NODEBUG = void;
+using __use_standard_header_ccomplex _LIBCPP_NODEBUG                                    = __standard_header_ccomplex;
 
 #  elif _LIBCPP_STD_VER >= 17
 
-using __standard_header_ccomplex _LIBCPP_DEPRECATED_("Include <complex> instead.") = void;
-using __use_standard_header_ccomplex                                               = __standard_header_ccomplex;
+using __standard_header_ccomplex _LIBCPP_DEPRECATED_("Include <complex> instead.") _LIBCPP_NODEBUG = void;
+using __use_standard_header_ccomplex _LIBCPP_NODEBUG = __standard_header_ccomplex;
 
 #  endif
 #endif // __cplusplus < 201103L && defined(_LIBCPP_USE_FROZEN_CXX03_HEADERS)
diff --git a/libcxx/include/ciso646 b/libcxx/include/ciso646
index 5b95640..3416436 100644
--- a/libcxx/include/ciso646
+++ b/libcxx/include/ciso646
@@ -26,8 +26,9 @@
 
 #  if _LIBCPP_STD_VER >= 20
 
-using __standard_header_ciso646 _LIBCPP_DEPRECATED_("removed in C++20. Include <version> instead.") = void;
-using __use_standard_header_ciso646 = __standard_header_ciso646;
+using __standard_header_ciso646
+    _LIBCPP_DEPRECATED_("removed in C++20. Include <version> instead.") _LIBCPP_NODEBUG = void;
+using __use_standard_header_ciso646 _LIBCPP_NODEBUG                                     = __standard_header_ciso646;
 
 #  endif
 #endif // __cplusplus < 201103L && defined(_LIBCPP_USE_FROZEN_CXX03_HEADERS)
diff --git a/libcxx/include/complex b/libcxx/include/complex
index bc73f31..df18159 100644
--- a/libcxx/include/complex
+++ b/libcxx/include/complex
@@ -400,7 +400,7 @@ class _LIBCPP_TEMPLATE_VIS complex<long double>;
 struct __from_builtin_tag {};
 
 template <class _Tp>
-using __complex_t =
+using __complex_t _LIBCPP_NODEBUG =
     __conditional_t<is_same<_Tp, float>::value,
                     _Complex float,
                     __conditional_t<is_same<_Tp, double>::value, _Complex double, _Complex long double> >;
diff --git a/libcxx/include/cstdalign b/libcxx/include/cstdalign
index 6a277e4..7f8dd1e 100644
--- a/libcxx/include/cstdalign
+++ b/libcxx/include/cstdalign
@@ -45,13 +45,13 @@ Macros:
 
 #  if _LIBCPP_STD_VER >= 20
 
-using __standard_header_cstdalign _LIBCPP_DEPRECATED_("removed in C++20.") = void;
-using __use_standard_header_cstdalign                                      = __standard_header_cstdalign;
+using __standard_header_cstdalign _LIBCPP_DEPRECATED_("removed in C++20.") _LIBCPP_NODEBUG = void;
+using __use_standard_header_cstdalign _LIBCPP_NODEBUG = __standard_header_cstdalign;
 
 #  elif _LIBCPP_STD_VER >= 17
 
-using __standard_header_cstdalign _LIBCPP_DEPRECATED = void;
-using __use_standard_header_cstdalign                = __standard_header_cstdalign;
+using __standard_header_cstdalign _LIBCPP_DEPRECATED _LIBCPP_NODEBUG = void;
+using __use_standard_header_cstdalign _LIBCPP_NODEBUG                = __standard_header_cstdalign;
 
 #  endif
 #endif // __cplusplus < 201103L && defined(_LIBCPP_USE_FROZEN_CXX03_HEADERS)
diff --git a/libcxx/include/cstdbool b/libcxx/include/cstdbool
index a12954f..a432d5f 100644
--- a/libcxx/include/cstdbool
+++ b/libcxx/include/cstdbool
@@ -33,13 +33,13 @@ Macros:
 
 #  if _LIBCPP_STD_VER >= 20
 
-using __standard_header_cstdbool _LIBCPP_DEPRECATED_("removed in C++20.") = void;
-using __use_standard_header_cstdbool                                      = __standard_header_cstdbool;
+using __standard_header_cstdbool _LIBCPP_DEPRECATED_("removed in C++20.") _LIBCPP_NODEBUG = void;
+using __use_standard_header_cstdbool _LIBCPP_NODEBUG                                      = __standard_header_cstdbool;
 
 #  elif _LIBCPP_STD_VER >= 17
 
-using __standard_header_cstdbool _LIBCPP_DEPRECATED = void;
-using __use_standard_header_cstdbool                = __standard_header_cstdbool;
+using __standard_header_cstdbool _LIBCPP_DEPRECATED _LIBCPP_NODEBUG = void;
+using __use_standard_header_cstdbool _LIBCPP_NODEBUG                = __standard_header_cstdbool;
 
 #  endif
 #endif // __cplusplus < 201103L && defined(_LIBCPP_USE_FROZEN_CXX03_HEADERS)
diff --git a/libcxx/include/ctgmath b/libcxx/include/ctgmath
index 237f474..db0786f 100644
--- a/libcxx/include/ctgmath
+++ b/libcxx/include/ctgmath
@@ -30,13 +30,14 @@
 
 #  if _LIBCPP_STD_VER >= 20
 
-using __standard_header_ctgmath _LIBCPP_DEPRECATED_("removed in C++20. Include <cmath> and <complex> instead.") = void;
-using __use_standard_header_ctgmath = __standard_header_ctgmath;
+using __standard_header_ctgmath
+    _LIBCPP_DEPRECATED_("removed in C++20. Include <cmath> and <complex> instead.") _LIBCPP_NODEBUG = void;
+using __use_standard_header_ctgmath _LIBCPP_NODEBUG = __standard_header_ctgmath;
 
 #  elif _LIBCPP_STD_VER >= 17
 
-using __standard_header_ctgmath _LIBCPP_DEPRECATED_("Include <cmath> and <complex> instead.") = void;
-using __use_standard_header_ctgmath = __standard_header_ctgmath;
+using __standard_header_ctgmath _LIBCPP_DEPRECATED_("Include <cmath> and <complex> instead.") _LIBCPP_NODEBUG = void;
+using __use_standard_header_ctgmath _LIBCPP_NODEBUG = __standard_header_ctgmath;
 
 #  endif
 
diff --git a/libcxx/include/deque b/libcxx/include/deque
index 883332f..df3094c 100644
--- a/libcxx/include/deque
+++ b/libcxx/include/deque
@@ -456,12 +456,13 @@ template <class _ValueType, class _Pointer, class _Reference, class _MapPointer,
 struct __segmented_iterator_traits<
     __deque_iterator<_ValueType, _Pointer, _Reference, _MapPointer, _DiffType, _BlockSize> > {
 private:
-  using _Iterator = __deque_iterator<_ValueType, _Pointer, _Reference, _MapPointer, _DiffType, _BlockSize>;
+  using _Iterator _LIBCPP_NODEBUG =
+      __deque_iterator<_ValueType, _Pointer, _Reference, _MapPointer, _DiffType, _BlockSize>;
 
 public:
-  using __is_segmented_iterator = true_type;
-  using __segment_iterator      = _MapPointer;
-  using __local_iterator        = _Pointer;
+  using __is_segmented_iterator _LIBCPP_NODEBUG = true_type;
+  using __segment_iterator _LIBCPP_NODEBUG      = _MapPointer;
+  using __local_iterator _LIBCPP_NODEBUG        = _Pointer;
 
   static _LIBCPP_HIDE_FROM_ABI __segment_iterator __segment(_Iterator __iter) { return __iter.__m_iter_; }
   static _LIBCPP_HIDE_FROM_ABI __local_iterator __local(_Iterator __iter) { return __iter.__ptr_; }
@@ -491,8 +492,8 @@ public:
 
   using value_type = _Tp;
 
-  using allocator_type = _Allocator;
-  using __alloc_traits = allocator_traits<allocator_type>;
+  using allocator_type                 = _Allocator;
+  using __alloc_traits _LIBCPP_NODEBUG = allocator_traits<allocator_type>;
   static_assert(__check_valid_allocator<allocator_type>::value, "");
   static_assert(is_same<typename allocator_type::value_type, value_type>::value,
                 "Allocator::value_type must be same type as value_type");
@@ -503,13 +504,13 @@ public:
   using pointer       = typename __alloc_traits::pointer;
   using const_pointer = typename __alloc_traits::const_pointer;
 
-  using __pointer_allocator       = __rebind_alloc<__alloc_traits, pointer>;
-  using __const_pointer_allocator = __rebind_alloc<__alloc_traits, const_pointer>;
-  using __map                     = __split_buffer<pointer, __pointer_allocator>;
-  using __map_alloc_traits        = allocator_traits<__pointer_allocator>;
-  using __map_pointer             = typename __map_alloc_traits::pointer;
-  using __map_const_pointer       = typename allocator_traits<__const_pointer_allocator>::const_pointer;
-  using __map_const_iterator      = typename __map::const_iterator;
+  using __pointer_allocator _LIBCPP_NODEBUG       = __rebind_alloc<__alloc_traits, pointer>;
+  using __const_pointer_allocator _LIBCPP_NODEBUG = __rebind_alloc<__alloc_traits, const_pointer>;
+  using __map _LIBCPP_NODEBUG                     = __split_buffer<pointer, __pointer_allocator>;
+  using __map_alloc_traits _LIBCPP_NODEBUG        = allocator_traits<__pointer_allocator>;
+  using __map_pointer _LIBCPP_NODEBUG             = typename __map_alloc_traits::pointer;
+  using __map_const_pointer _LIBCPP_NODEBUG       = typename allocator_traits<__const_pointer_allocator>::const_pointer;
+  using __map_const_iterator _LIBCPP_NODEBUG      = typename __map::const_iterator;
 
   using reference       = value_type&;
   using const_reference = const value_type&;
@@ -525,7 +526,7 @@ public:
   // - size_type: is always trivially relocatable, since it is required to be an integral type
   // - allocator_type: may not be trivially relocatable, so it's checked
   // None of these are referencing the `deque` itself, so if all of them are trivially relocatable, `deque` is too.
-  using __trivially_relocatable = __conditional_t<
+  using __trivially_relocatable _LIBCPP_NODEBUG = __conditional_t<
       __libcpp_is_trivially_relocatable<__map>::value && __libcpp_is_trivially_relocatable<allocator_type>::value,
       deque,
       void>;
diff --git a/libcxx/include/experimental/__simd/scalar.h b/libcxx/include/experimental/__simd/scalar.h
index da318d2..20c8b02 100644
--- a/libcxx/include/experimental/__simd/scalar.h
+++ b/libcxx/include/experimental/__simd/scalar.h
@@ -49,8 +49,8 @@ struct __mask_storage<_Tp, simd_abi::__scalar> : __simd_storage<bool, simd_abi::
 
 template <class _Tp>
 struct __simd_operations<_Tp, simd_abi::__scalar> {
-  using _SimdStorage = __simd_storage<_Tp, simd_abi::__scalar>;
-  using _MaskStorage = __mask_storage<_Tp, simd_abi::__scalar>;
+  using _SimdStorage _LIBCPP_NODEBUG = __simd_storage<_Tp, simd_abi::__scalar>;
+  using _MaskStorage _LIBCPP_NODEBUG = __mask_storage<_Tp, simd_abi::__scalar>;
 
   static _LIBCPP_HIDE_FROM_ABI _SimdStorage __broadcast(_Tp __v) noexcept { return {__v}; }
 
@@ -86,7 +86,7 @@ struct __simd_operations<_Tp, simd_abi::__scalar> {
 
 template <class _Tp>
 struct __mask_operations<_Tp, simd_abi::__scalar> {
-  using _MaskStorage = __mask_storage<_Tp, simd_abi::__scalar>;
+  using _MaskStorage _LIBCPP_NODEBUG = __mask_storage<_Tp, simd_abi::__scalar>;
 
   static _LIBCPP_HIDE_FROM_ABI _MaskStorage __broadcast(bool __v) noexcept { return {__v}; }
 
diff --git a/libcxx/include/experimental/__simd/simd.h b/libcxx/include/experimental/__simd/simd.h
index fd919e7..2fd2b26 100644
--- a/libcxx/include/experimental/__simd/simd.h
+++ b/libcxx/include/experimental/__simd/simd.h
@@ -43,8 +43,8 @@ public:
 // TODO: implement simd class
 template <class _Tp, class _Abi>
 class simd : public __simd_int_operators<simd<_Tp, _Abi>, __simd_operations<_Tp, _Abi>, is_integral_v<_Tp>> {
-  using _Impl    = __simd_operations<_Tp, _Abi>;
-  using _Storage = typename _Impl::_SimdStorage;
+  using _Impl _LIBCPP_NODEBUG    = __simd_operations<_Tp, _Abi>;
+  using _Storage _LIBCPP_NODEBUG = typename _Impl::_SimdStorage;
 
   _Storage __s_;
 
diff --git a/libcxx/include/experimental/__simd/simd_mask.h b/libcxx/include/experimental/__simd/simd_mask.h
index 6b6f671..a117665 100644
--- a/libcxx/include/experimental/__simd/simd_mask.h
+++ b/libcxx/include/experimental/__simd/simd_mask.h
@@ -27,8 +27,8 @@ inline namespace parallelism_v2 {
 // TODO: implement simd_mask class
 template <class _Tp, class _Abi>
 class simd_mask {
-  using _Impl    = __mask_operations<_Tp, _Abi>;
-  using _Storage = typename _Impl::_MaskStorage;
+  using _Impl _LIBCPP_NODEBUG    = __mask_operations<_Tp, _Abi>;
+  using _Storage _LIBCPP_NODEBUG = typename _Impl::_MaskStorage;
 
   _Storage __s_;
 
diff --git a/libcxx/include/experimental/__simd/vec_ext.h b/libcxx/include/experimental/__simd/vec_ext.h
index abc7e95..2a4b8c7 100644
--- a/libcxx/include/experimental/__simd/vec_ext.h
+++ b/libcxx/include/experimental/__simd/vec_ext.h
@@ -55,8 +55,8 @@ struct __mask_storage<_Tp, simd_abi::__vec_ext<_Np>>
 
 template <class _Tp, int _Np>
 struct __simd_operations<_Tp, simd_abi::__vec_ext<_Np>> {
-  using _SimdStorage = __simd_storage<_Tp, simd_abi::__vec_ext<_Np>>;
-  using _MaskStorage = __mask_storage<_Tp, simd_abi::__vec_ext<_Np>>;
+  using _SimdStorage _LIBCPP_NODEBUG = __simd_storage<_Tp, simd_abi::__vec_ext<_Np>>;
+  using _MaskStorage _LIBCPP_NODEBUG = __mask_storage<_Tp, simd_abi::__vec_ext<_Np>>;
 
   static _LIBCPP_HIDE_FROM_ABI _SimdStorage __broadcast(_Tp __v) noexcept {
     _SimdStorage __result;
@@ -101,7 +101,7 @@ struct __simd_operations<_Tp, simd_abi::__vec_ext<_Np>> {
 
 template <class _Tp, int _Np>
 struct __mask_operations<_Tp, simd_abi::__vec_ext<_Np>> {
-  using _MaskStorage = __mask_storage<_Tp, simd_abi::__vec_ext<_Np>>;
+  using _MaskStorage _LIBCPP_NODEBUG = __mask_storage<_Tp, simd_abi::__vec_ext<_Np>>;
 
   static _LIBCPP_HIDE_FROM_ABI _MaskStorage __broadcast(bool __v) noexcept {
     _MaskStorage __result;
diff --git a/libcxx/include/forward_list b/libcxx/include/forward_list
index c1ab155..f3b9617 100644
--- a/libcxx/include/forward_list
+++ b/libcxx/include/forward_list
@@ -202,6 +202,7 @@ template <class T, class Allocator, class Predicate>
 #  include <__algorithm/lexicographical_compare.h>
 #  include <__algorithm/lexicographical_compare_three_way.h>
 #  include <__algorithm/min.h>
+#  include <__assert>
 #  include <__config>
 #  include <__cstddef/nullptr_t.h>
 #  include <__iterator/distance.h>
@@ -316,7 +317,8 @@ struct __forward_begin_node {
 };
 
 template <class _Tp, class _VoidPtr>
-using __begin_node_of = __forward_begin_node<__rebind_pointer_t<_VoidPtr, __forward_list_node<_Tp, _VoidPtr> > >;
+using __begin_node_of _LIBCPP_NODEBUG =
+    __forward_begin_node<__rebind_pointer_t<_VoidPtr, __forward_list_node<_Tp, _VoidPtr> > >;
 
 template <class _Tp, class _VoidPtr>
 struct __forward_list_node : public __begin_node_of<_Tp, _VoidPtr> {
@@ -766,8 +768,14 @@ public:
     return std::min<size_type>(__node_traits::max_size(this->__alloc_), numeric_limits<difference_type>::max());
   }
 
-  _LIBCPP_HIDE_FROM_ABI reference front() { return __base::__before_begin()->__next_->__get_value(); }
-  _LIBCPP_HIDE_FROM_ABI const_reference front() const { return __base::__before_begin()->__next_->__get_value(); }
+  _LIBCPP_HIDE_FROM_ABI reference front() {
+    _LIBCPP_ASSERT_NON_NULL(!empty(), "forward_list::front called on an empty list");
+    return __base::__before_begin()->__next_->__get_value();
+  }
+  _LIBCPP_HIDE_FROM_ABI const_reference front() const {
+    _LIBCPP_ASSERT_NON_NULL(!empty(), "forward_list::front called on an empty list");
+    return __base::__before_begin()->__next_->__get_value();
+  }
 
 #  ifndef _LIBCPP_CXX03_LANG
 #    if _LIBCPP_STD_VER >= 17
@@ -1085,6 +1093,7 @@ void forward_list<_Tp, _Alloc>::push_front(const value_type& __v) {
 
 template <class _Tp, class _Alloc>
 void forward_list<_Tp, _Alloc>::pop_front() {
+  _LIBCPP_ASSERT_NON_NULL(!empty(), "forward_list::pop_front called on an empty list");
   __node_pointer __p                = __base::__before_begin()->__next_;
   __base::__before_begin()->__next_ = __p->__next_;
   this->__delete_node(__p);
diff --git a/libcxx/include/ios b/libcxx/include/ios
index 7c2ee83..98a0882 100644
--- a/libcxx/include/ios
+++ b/libcxx/include/ios
@@ -629,9 +629,9 @@ private:
   basic_ostream<char_type, traits_type>* __tie_;
 
 #    if defined(_LIBCPP_ABI_IOS_ALLOW_ARBITRARY_FILL_VALUE)
-  using _FillType = _FillHelper<traits_type>;
+  using _FillType _LIBCPP_NODEBUG = _FillHelper<traits_type>;
 #    else
-  using _FillType = _SentinelValueFill<traits_type>;
+  using _FillType _LIBCPP_NODEBUG = _SentinelValueFill<traits_type>;
 #    endif
   mutable _FillType __fill_;
 };
diff --git a/libcxx/include/optional b/libcxx/include/optional
index 165e0f1..c325140 100644
--- a/libcxx/include/optional
+++ b/libcxx/include/optional
@@ -353,8 +353,8 @@ struct __optional_destruct_base<_Tp, true> {
 
 template <class _Tp, bool = is_reference<_Tp>::value>
 struct __optional_storage_base : __optional_destruct_base<_Tp> {
-  using __base     = __optional_destruct_base<_Tp>;
-  using value_type = _Tp;
+  using __base _LIBCPP_NODEBUG = __optional_destruct_base<_Tp>;
+  using value_type             = _Tp;
   using __base::__base;
 
   _LIBCPP_HIDE_FROM_ABI constexpr bool has_value() const noexcept { return this->__engaged_; }
@@ -396,8 +396,8 @@ struct __optional_storage_base : __optional_destruct_base<_Tp> {
 // to ensure we can make the change in an ABI-compatible manner.
 template <class _Tp>
 struct __optional_storage_base<_Tp, true> {
-  using value_type = _Tp;
-  using __raw_type = remove_reference_t<_Tp>;
+  using value_type                 = _Tp;
+  using __raw_type _LIBCPP_NODEBUG = remove_reference_t<_Tp>;
   __raw_type* __value_;
 
   template <class _Up>
@@ -555,11 +555,11 @@ struct __optional_move_assign_base<_Tp, false> : __optional_copy_assign_base<_Tp
 };
 
 template <class _Tp>
-using __optional_sfinae_ctor_base_t =
+using __optional_sfinae_ctor_base_t _LIBCPP_NODEBUG =
     __sfinae_ctor_base< is_copy_constructible<_Tp>::value, is_move_constructible<_Tp>::value >;
 
 template <class _Tp>
-using __optional_sfinae_assign_base_t =
+using __optional_sfinae_assign_base_t _LIBCPP_NODEBUG =
     __sfinae_assign_base< (is_copy_constructible<_Tp>::value && is_copy_assignable<_Tp>::value),
                           (is_move_constructible<_Tp>::value && is_move_assignable<_Tp>::value) >;
 
@@ -583,12 +583,13 @@ class _LIBCPP_DECLSPEC_EMPTY_BASES optional
     : private __optional_move_assign_base<_Tp>,
       private __optional_sfinae_ctor_base_t<_Tp>,
       private __optional_sfinae_assign_base_t<_Tp> {
-  using __base = __optional_move_assign_base<_Tp>;
+  using __base _LIBCPP_NODEBUG = __optional_move_assign_base<_Tp>;
 
 public:
   using value_type = _Tp;
 
-  using __trivially_relocatable = conditional_t<__libcpp_is_trivially_relocatable<_Tp>::value, optional, void>;
+  using __trivially_relocatable _LIBCPP_NODEBUG =
+      conditional_t<__libcpp_is_trivially_relocatable<_Tp>::value, optional, void>;
 
 private:
   // Disable the reference extension using this static assert.
@@ -613,7 +614,7 @@ private:
     }
   };
   template <class _Up>
-  using _CheckOptionalArgsCtor =
+  using _CheckOptionalArgsCtor _LIBCPP_NODEBUG =
       _If< _IsNotSame<__remove_cvref_t<_Up>, in_place_t>::value && _IsNotSame<__remove_cvref_t<_Up>, optional>::value &&
                (!is_same_v<remove_cv_t<_Tp>, bool> || !__is_std_optional<__remove_cvref_t<_Up>>::value),
            _CheckOptionalArgsConstructor,
@@ -621,7 +622,7 @@ private:
   template <class _QualUp>
   struct _CheckOptionalLikeConstructor {
     template <class _Up, class _Opt = optional<_Up>>
-    using __check_constructible_from_opt =
+    using __check_constructible_from_opt _LIBCPP_NODEBUG =
         _Or< is_constructible<_Tp, _Opt&>,
              is_constructible<_Tp, _Opt const&>,
              is_constructible<_Tp, _Opt&&>,
@@ -631,7 +632,7 @@ private:
              is_convertible<_Opt&&, _Tp>,
              is_convertible<_Opt const&&, _Tp> >;
     template <class _Up, class _Opt = optional<_Up>>
-    using __check_assignable_from_opt =
+    using __check_assignable_from_opt _LIBCPP_NODEBUG =
         _Or< is_assignable<_Tp&, _Opt&>,
              is_assignable<_Tp&, _Opt const&>,
              is_assignable<_Tp&, _Opt&&>,
@@ -655,12 +656,12 @@ private:
   };
 
   template <class _Up, class _QualUp>
-  using _CheckOptionalLikeCtor =
+  using _CheckOptionalLikeCtor _LIBCPP_NODEBUG =
       _If< _And< _IsNotSame<_Up, _Tp>, is_constructible<_Tp, _QualUp> >::value,
            _CheckOptionalLikeConstructor<_QualUp>,
            __check_tuple_constructor_fail >;
   template <class _Up, class _QualUp>
-  using _CheckOptionalLikeAssign =
+  using _CheckOptionalLikeAssign _LIBCPP_NODEBUG =
       _If< _And< _IsNotSame<_Up, _Tp>, is_constructible<_Tp, _QualUp>, is_assignable<_Tp&, _QualUp> >::value,
            _CheckOptionalLikeConstructor<_QualUp>,
            __check_tuple_constructor_fail >;
diff --git a/libcxx/include/ratio b/libcxx/include/ratio
index b35e2bd..2b5e34c 100644
--- a/libcxx/include/ratio
+++ b/libcxx/include/ratio
@@ -465,7 +465,7 @@ struct _LIBCPP_TEMPLATE_VIS ratio_greater_equal : _BoolConstant<!ratio_less<_R1,
 };
 
 template <class _R1, class _R2>
-using __ratio_gcd = ratio<__static_gcd<_R1::num, _R2::num>, __static_lcm<_R1::den, _R2::den> >;
+using __ratio_gcd _LIBCPP_NODEBUG = ratio<__static_gcd<_R1::num, _R2::num>, __static_lcm<_R1::den, _R2::den> >;
 
 #  if _LIBCPP_STD_VER >= 17
 template <class _R1, class _R2>
diff --git a/libcxx/include/regex b/libcxx/include/regex
index 15ec15a..5cad0bc 100644
--- a/libcxx/include/regex
+++ b/libcxx/include/regex
@@ -4229,7 +4229,8 @@ inline _LIBCPP_HIDE_FROM_ABI bool operator==(const sub_match<_BiIter>& __x, cons
 
 #  if _LIBCPP_STD_VER >= 20
 template <class _BiIter>
-using __sub_match_cat = compare_three_way_result_t<basic_string<typename iterator_traits<_BiIter>::value_type>>;
+using __sub_match_cat _LIBCPP_NODEBUG =
+    compare_three_way_result_t<basic_string<typename iterator_traits<_BiIter>::value_type>>;
 
 template <class _BiIter>
 _LIBCPP_HIDE_FROM_ABI auto operator<=>(const sub_match<_BiIter>& __x, const sub_match<_BiIter>& __y) {
diff --git a/libcxx/include/source_location b/libcxx/include/source_location
index bbbb86b..b4777ce 100644
--- a/libcxx/include/source_location
+++ b/libcxx/include/source_location
@@ -55,7 +55,7 @@ class source_location {
   // in constant evaluation, so we don't want to use `void*` as the argument
   // type unless the builtin returned that, anyhow, and the invalid cast is
   // unavoidable.
-  using __bsl_ty = decltype(__builtin_source_location());
+  using __bsl_ty _LIBCPP_NODEBUG = decltype(__builtin_source_location());
 
 public:
   // The defaulted __ptr argument is necessary so that the builtin is evaluated
diff --git a/libcxx/include/string b/libcxx/include/string
index 7808f56..39982d5 100644
--- a/libcxx/include/string
+++ b/libcxx/include/string
@@ -764,7 +764,7 @@ struct __padding<0> {};
 template <class _CharT, class _Traits, class _Allocator>
 class basic_string {
 private:
-  using __default_allocator_type = allocator<_CharT>;
+  using __default_allocator_type _LIBCPP_NODEBUG = allocator<_CharT>;
 
 public:
   typedef basic_string __self;
@@ -798,7 +798,7 @@ public:
   // Therefore it's crucial to ensure the destructor is called.
   using __trivially_relocatable = void;
 #  else
-  using __trivially_relocatable = __conditional_t<
+  using __trivially_relocatable _LIBCPP_NODEBUG = __conditional_t<
       __libcpp_is_trivially_relocatable<allocator_type>::value && __libcpp_is_trivially_relocatable<pointer>::value,
       basic_string,
       void>;
diff --git a/libcxx/include/tuple b/libcxx/include/tuple
index b247874..aca14ba 100644
--- a/libcxx/include/tuple
+++ b/libcxx/include/tuple
@@ -552,7 +552,8 @@ class _LIBCPP_TEMPLATE_VIS tuple {
   get(const tuple<_Up...>&&) _NOEXCEPT;
 
 public:
-  using __trivially_relocatable = __conditional_t<_And<__libcpp_is_trivially_relocatable<_Tp>...>::value, tuple, void>;
+  using __trivially_relocatable _LIBCPP_NODEBUG =
+      __conditional_t<_And<__libcpp_is_trivially_relocatable<_Tp>...>::value, tuple, void>;
 
   // [tuple.cnstr]
 
@@ -1125,9 +1126,9 @@ inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 tuple<_Tp&...> tie(_T
 }
 
 template <class... _Tp>
-inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 tuple<typename __unwrap_ref_decay<_Tp>::type...>
+inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 tuple<__unwrap_ref_decay_t<_Tp>...>
 make_tuple(_Tp&&... __t) {
-  return tuple<typename __unwrap_ref_decay<_Tp>::type...>(std::forward<_Tp>(__t)...);
+  return tuple<__unwrap_ref_decay_t<_Tp>...>(std::forward<_Tp>(__t)...);
 }
 
 template <class... _Tp>
diff --git a/libcxx/include/valarray b/libcxx/include/valarray
index d0b76ee0..abc7d39 100644
--- a/libcxx/include/valarray
+++ b/libcxx/include/valarray
@@ -821,9 +821,15 @@ public:
   _LIBCPP_HIDE_FROM_ABI valarray& operator=(const __val_expr<_ValExpr>& __v);
 
   // element access:
-  _LIBCPP_HIDE_FROM_ABI const value_type& operator[](size_t __i) const { return __begin_[__i]; }
+  _LIBCPP_HIDE_FROM_ABI const value_type& operator[](size_t __i) const {
+    _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(__i < size(), "valarray::operator[] index out of bounds");
+    return __begin_[__i];
+  }
 
-  _LIBCPP_HIDE_FROM_ABI value_type& operator[](size_t __i) { return __begin_[__i]; }
+  _LIBCPP_HIDE_FROM_ABI value_type& operator[](size_t __i) {
+    _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(__i < size(), "valarray::operator[] index out of bounds");
+    return __begin_[__i];
+  }
 
   // subset operations:
   _LIBCPP_HIDE_FROM_ABI __val_expr<__slice_expr<const valarray&> > operator[](slice __s) const;
diff --git a/libcxx/include/variant b/libcxx/include/variant
index 3fa1b4b..6c7be7f 100644
--- a/libcxx/include/variant
+++ b/libcxx/include/variant
@@ -357,7 +357,7 @@ _LIBCPP_HIDE_FROM_ABI constexpr auto __choose_index_type() {
 }
 
 template <size_t _NumAlts>
-using __variant_index_t = decltype(std::__choose_index_type<_NumAlts>());
+using __variant_index_t _LIBCPP_NODEBUG = decltype(std::__choose_index_type<_NumAlts>());
 
 template <class _IndexType>
 constexpr _IndexType __variant_npos = static_cast<_IndexType>(-1);
@@ -658,8 +658,8 @@ private:
 
 template <size_t _Index, class _Tp>
 struct _LIBCPP_TEMPLATE_VIS __alt {
-  using __value_type              = _Tp;
-  static constexpr size_t __index = _Index;
+  using __value_type _LIBCPP_NODEBUG = _Tp;
+  static constexpr size_t __index    = _Index;
 
   template <class... _Args>
   _LIBCPP_HIDE_FROM_ABI explicit constexpr __alt(in_place_t, _Args&&... __args)
@@ -713,7 +713,7 @@ _LIBCPP_VARIANT_UNION(_Trait::_Unavailable, _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTE
 template <_Trait _DestructibleTrait, class... _Types>
 class _LIBCPP_TEMPLATE_VIS __base {
 public:
-  using __index_t = __variant_index_t<sizeof...(_Types)>;
+  using __index_t _LIBCPP_NODEBUG = __variant_index_t<sizeof...(_Types)>;
 
   _LIBCPP_HIDE_FROM_ABI explicit constexpr __base(__valueless_t __tag) noexcept
       : __data(__tag), __index(__variant_npos<__index_t>) {}
@@ -753,8 +753,8 @@ class _LIBCPP_TEMPLATE_VIS __dtor;
       template <class... _Types>                                                                                       \
       class _LIBCPP_TEMPLATE_VIS __dtor<__traits<_Types...>, destructible_trait>                                       \
           : public __base<destructible_trait, _Types...> {                                                             \
-        using __base_type = __base<destructible_trait, _Types...>;                                                     \
-        using __index_t   = typename __base_type::__index_t;                                                           \
+        using __base_type _LIBCPP_NODEBUG = __base<destructible_trait, _Types...>;                                     \
+        using __index_t _LIBCPP_NODEBUG   = typename __base_type::__index_t;                                           \
                                                                                                                        \
       public:                                                                                                          \
         using __base_type::__base_type;                                                                                \
@@ -799,7 +799,7 @@ _LIBCPP_VARIANT_DESTRUCTOR(_Trait::_Unavailable,
 
 template <class _Traits>
 class _LIBCPP_TEMPLATE_VIS __ctor : public __dtor<_Traits> {
-  using __base_type = __dtor<_Traits>;
+  using __base_type _LIBCPP_NODEBUG = __dtor<_Traits>;
 
 public:
   using __base_type::__base_type;
@@ -831,7 +831,7 @@ class _LIBCPP_TEMPLATE_VIS __move_constructor;
       template <class... _Types>                                                                                       \
       class _LIBCPP_TEMPLATE_VIS __move_constructor<__traits<_Types...>, move_constructible_trait>                     \
           : public __ctor<__traits<_Types...>> {                                                                       \
-        using __base_type = __ctor<__traits<_Types...>>;                                                               \
+        using __base_type _LIBCPP_NODEBUG = __ctor<__traits<_Types...>>;                                               \
                                                                                                                        \
       public:                                                                                                          \
         using __base_type::__base_type;                                                                                \
@@ -869,7 +869,7 @@ class _LIBCPP_TEMPLATE_VIS __copy_constructor;
       template <class... _Types>                                                                                       \
       class _LIBCPP_TEMPLATE_VIS __copy_constructor<__traits<_Types...>, copy_constructible_trait>                     \
           : public __move_constructor<__traits<_Types...>> {                                                           \
-        using __base_type = __move_constructor<__traits<_Types...>>;                                                   \
+        using __base_type _LIBCPP_NODEBUG = __move_constructor<__traits<_Types...>>;                                   \
                                                                                                                        \
       public:                                                                                                          \
         using __base_type::__base_type;                                                                                \
@@ -899,7 +899,7 @@ _LIBCPP_VARIANT_COPY_CONSTRUCTOR(
 
 template <class _Traits>
 class _LIBCPP_TEMPLATE_VIS __assignment : public __copy_constructor<_Traits> {
-  using __base_type = __copy_constructor<_Traits>;
+  using __base_type _LIBCPP_NODEBUG = __copy_constructor<_Traits>;
 
 public:
   using __base_type::__base_type;
@@ -958,7 +958,7 @@ class _LIBCPP_TEMPLATE_VIS __move_assignment;
       template <class... _Types>                                                                                       \
       class _LIBCPP_TEMPLATE_VIS __move_assignment<__traits<_Types...>, move_assignable_trait>                         \
           : public __assignment<__traits<_Types...>> {                                                                 \
-        using __base_type = __assignment<__traits<_Types...>>;                                                         \
+        using __base_type _LIBCPP_NODEBUG = __assignment<__traits<_Types...>>;                                         \
                                                                                                                        \
       public:                                                                                                          \
         using __base_type::__base_type;                                                                                \
@@ -997,7 +997,7 @@ class _LIBCPP_TEMPLATE_VIS __copy_assignment;
       template <class... _Types>                                                                                       \
       class _LIBCPP_TEMPLATE_VIS __copy_assignment<__traits<_Types...>, copy_assignable_trait>                         \
           : public __move_assignment<__traits<_Types...>> {                                                            \
-        using __base_type = __move_assignment<__traits<_Types...>>;                                                    \
+        using __base_type _LIBCPP_NODEBUG = __move_assignment<__traits<_Types...>>;                                    \
                                                                                                                        \
       public:                                                                                                          \
         using __base_type::__base_type;                                                                                \
@@ -1030,7 +1030,7 @@ _LIBCPP_VARIANT_COPY_ASSIGNMENT(_Trait::_Unavailable,
 
 template <class... _Types>
 class _LIBCPP_TEMPLATE_VIS __impl : public __copy_assignment<__traits<_Types...>> {
-  using __base_type = __copy_assignment<__traits<_Types...>>;
+  using __base_type _LIBCPP_NODEBUG = __copy_assignment<__traits<_Types...>>;
 
 public:
   using __base_type::__base_type; // get in_place_index_t constructor & friends
@@ -1097,7 +1097,7 @@ private:
 
 struct __no_narrowing_check {
   template <class _Dest, class _Source>
-  using _Apply = __type_identity<_Dest>;
+  using _Apply _LIBCPP_NODEBUG = __type_identity<_Dest>;
 };
 
 struct __narrowing_check {
@@ -1138,7 +1138,7 @@ using _MakeOverloads _LIBCPP_NODEBUG =
     typename __make_overloads_imp< __make_indices_imp<sizeof...(_Types), 0> >::template _Apply<_Types...>;
 
 template <class _Tp, class... _Types>
-using __best_match_t = typename invoke_result_t<_MakeOverloads<_Types...>, _Tp, _Tp>::type;
+using __best_match_t _LIBCPP_NODEBUG = typename invoke_result_t<_MakeOverloads<_Types...>, _Tp, _Tp>::type;
 
 } // namespace __variant_detail
 
@@ -1170,10 +1170,10 @@ class _LIBCPP_TEMPLATE_VIS _LIBCPP_DECLSPEC_EMPTY_BASES variant
 
   static_assert(__all<!is_void_v<_Types>...>::value, "variant can not have a void type as an alternative.");
 
-  using __first_type = variant_alternative_t<0, variant>;
+  using __first_type _LIBCPP_NODEBUG = variant_alternative_t<0, variant>;
 
 public:
-  using __trivially_relocatable =
+  using __trivially_relocatable _LIBCPP_NODEBUG =
       conditional_t<_And<__libcpp_is_trivially_relocatable<_Types>...>::value, variant, void>;
 
   template <bool _Dummy                                                                               = true,
diff --git a/libcxx/src/experimental/tzdb.cpp b/libcxx/src/experimental/tzdb.cpp
index d22de21..638d45f 100644
--- a/libcxx/src/experimental/tzdb.cpp
+++ b/libcxx/src/experimental/tzdb.cpp
@@ -9,11 +9,14 @@
 // For information see https://libcxx.llvm.org/DesignDocs/TimeZone.html
 
 #include <algorithm>
+#include <cctype>
 #include <chrono>
 #include <filesystem>
 #include <fstream>
 #include <stdexcept>
 #include <string>
+#include <string_view>
+#include <vector>
 
 #include "include/tzdb/time_zone_private.h"
 #include "include/tzdb/types_private.h"
diff --git a/libcxx/src/filesystem/directory_iterator.cpp b/libcxx/src/filesystem/directory_iterator.cpp
index d7ed9a3..7e8e40d 100644
--- a/libcxx/src/filesystem/directory_iterator.cpp
+++ b/libcxx/src/filesystem/directory_iterator.cpp
@@ -47,9 +47,9 @@ public:
     }
     __stream_ = ::FindFirstFileW((root / "*").c_str(), &__data_);
     if (__stream_ == INVALID_HANDLE_VALUE) {
-      ec                                  = detail::make_windows_error(GetLastError());
+      ec                                  = detail::get_last_error();
       const bool ignore_permission_denied = bool(opts & directory_options::skip_permission_denied);
-      if (ignore_permission_denied && ec.value() == static_cast<int>(errc::permission_denied))
+      if (ignore_permission_denied && ec == errc::permission_denied)
         ec.clear();
       return;
     }
@@ -91,7 +91,7 @@ private:
   error_code close() noexcept {
     error_code ec;
     if (!::FindClose(__stream_))
-      ec = detail::make_windows_error(GetLastError());
+      ec = detail::get_last_error();
     __stream_ = INVALID_HANDLE_VALUE;
     return ec;
   }
@@ -118,7 +118,7 @@ public:
     if ((__stream_ = ::opendir(root.c_str())) == nullptr) {
       ec                      = detail::capture_errno();
       const bool allow_eacces = bool(opts & directory_options::skip_permission_denied);
-      if (allow_eacces && ec.value() == EACCES)
+      if (allow_eacces && ec == errc::permission_denied)
         ec.clear();
       return;
     }
@@ -307,7 +307,7 @@ bool recursive_directory_iterator::__try_recursion(error_code* ec) {
   }
   if (m_ec) {
     const bool allow_eacess = bool(__imp_->__options_ & directory_options::skip_permission_denied);
-    if (m_ec.value() == EACCES && allow_eacess) {
+    if (m_ec == errc::permission_denied && allow_eacess) {
       if (ec)
         ec->clear();
     } else {
diff --git a/libcxx/src/filesystem/error.h b/libcxx/src/filesystem/error.h
index 07ba7fc..c021391 100644
--- a/libcxx/src/filesystem/error.h
+++ b/libcxx/src/filesystem/error.h
@@ -32,80 +32,21 @@ _LIBCPP_BEGIN_NAMESPACE_FILESYSTEM
 
 namespace detail {
 
-#if defined(_LIBCPP_WIN32API)
-
-inline errc __win_err_to_errc(int err) {
-  constexpr struct {
-    DWORD win;
-    errc errc;
-  } win_error_mapping[] = {
-      {ERROR_ACCESS_DENIED, errc::permission_denied},
-      {ERROR_ALREADY_EXISTS, errc::file_exists},
-      {ERROR_BAD_NETPATH, errc::no_such_file_or_directory},
-      {ERROR_BAD_PATHNAME, errc::no_such_file_or_directory},
-      {ERROR_BAD_UNIT, errc::no_such_device},
-      {ERROR_BROKEN_PIPE, errc::broken_pipe},
-      {ERROR_BUFFER_OVERFLOW, errc::filename_too_long},
-      {ERROR_BUSY, errc::device_or_resource_busy},
-      {ERROR_BUSY_DRIVE, errc::device_or_resource_busy},
-      {ERROR_CANNOT_MAKE, errc::permission_denied},
-      {ERROR_CANTOPEN, errc::io_error},
-      {ERROR_CANTREAD, errc::io_error},
-      {ERROR_CANTWRITE, errc::io_error},
-      {ERROR_CURRENT_DIRECTORY, errc::permission_denied},
-      {ERROR_DEV_NOT_EXIST, errc::no_such_device},
-      {ERROR_DEVICE_IN_USE, errc::device_or_resource_busy},
-      {ERROR_DIR_NOT_EMPTY, errc::directory_not_empty},
-      {ERROR_DIRECTORY, errc::invalid_argument},
-      {ERROR_DISK_FULL, errc::no_space_on_device},
-      {ERROR_FILE_EXISTS, errc::file_exists},
-      {ERROR_FILE_NOT_FOUND, errc::no_such_file_or_directory},
-      {ERROR_HANDLE_DISK_FULL, errc::no_space_on_device},
-      {ERROR_INVALID_ACCESS, errc::permission_denied},
-      {ERROR_INVALID_DRIVE, errc::no_such_device},
-      {ERROR_INVALID_FUNCTION, errc::function_not_supported},
-      {ERROR_INVALID_HANDLE, errc::invalid_argument},
-      {ERROR_INVALID_NAME, errc::no_such_file_or_directory},
-      {ERROR_INVALID_PARAMETER, errc::invalid_argument},
-      {ERROR_LOCK_VIOLATION, errc::no_lock_available},
-      {ERROR_LOCKED, errc::no_lock_available},
-      {ERROR_NEGATIVE_SEEK, errc::invalid_argument},
-      {ERROR_NOACCESS, errc::permission_denied},
-      {ERROR_NOT_ENOUGH_MEMORY, errc::not_enough_memory},
-      {ERROR_NOT_READY, errc::resource_unavailable_try_again},
-      {ERROR_NOT_SAME_DEVICE, errc::cross_device_link},
-      {ERROR_NOT_SUPPORTED, errc::not_supported},
-      {ERROR_OPEN_FAILED, errc::io_error},
-      {ERROR_OPEN_FILES, errc::device_or_resource_busy},
-      {ERROR_OPERATION_ABORTED, errc::operation_canceled},
-      {ERROR_OUTOFMEMORY, errc::not_enough_memory},
-      {ERROR_PATH_NOT_FOUND, errc::no_such_file_or_directory},
-      {ERROR_READ_FAULT, errc::io_error},
-      {ERROR_REPARSE_TAG_INVALID, errc::invalid_argument},
-      {ERROR_RETRY, errc::resource_unavailable_try_again},
-      {ERROR_SEEK, errc::io_error},
-      {ERROR_SHARING_VIOLATION, errc::permission_denied},
-      {ERROR_TOO_MANY_OPEN_FILES, errc::too_many_files_open},
-      {ERROR_WRITE_FAULT, errc::io_error},
-      {ERROR_WRITE_PROTECT, errc::permission_denied},
-  };
-
-  for (const auto& pair : win_error_mapping)
-    if (pair.win == static_cast<DWORD>(err))
-      return pair.errc;
-  return errc::invalid_argument;
-}
-
-#endif // _LIBCPP_WIN32API
+// On windows, libc functions use errno, but system functions use GetLastError.
+// So, callers need to be careful which of these next functions they call!
 
 inline error_code capture_errno() {
   _LIBCPP_ASSERT_INTERNAL(errno != 0, "Expected errno to be non-zero");
   return error_code(errno, generic_category());
 }
 
+inline error_code get_last_error() {
 #if defined(_LIBCPP_WIN32API)
-inline error_code make_windows_error(int err) { return make_error_code(__win_err_to_errc(err)); }
+  return std::error_code(GetLastError(), std::system_category());
+#else
+  return capture_errno();
 #endif
+}
 
 template <class T>
 T error_value();
diff --git a/libcxx/src/filesystem/file_descriptor.h b/libcxx/src/filesystem/file_descriptor.h
index db66ad5..9c279c4 100644
--- a/libcxx/src/filesystem/file_descriptor.h
+++ b/libcxx/src/filesystem/file_descriptor.h
@@ -201,7 +201,7 @@ inline perms posix_get_perms(const StatT& st) noexcept { return static_cast<perm
 inline file_status create_file_status(error_code& m_ec, path const& p, const StatT& path_stat, error_code* ec) {
   if (ec)
     *ec = m_ec;
-  if (m_ec && (m_ec.value() == ENOENT || m_ec.value() == ENOTDIR)) {
+  if (m_ec && (m_ec == errc::no_such_file_or_directory || m_ec == errc::not_a_directory)) {
     return file_status(file_type::not_found);
   } else if (m_ec) {
     ErrorHandler<void> err("posix_stat", ec, &p);
@@ -236,7 +236,7 @@ inline file_status create_file_status(error_code& m_ec, path const& p, const Sta
 inline file_status posix_stat(path const& p, StatT& path_stat, error_code* ec) {
   error_code m_ec;
   if (detail::stat(p.c_str(), &path_stat) == -1)
-    m_ec = detail::capture_errno();
+    m_ec = detail::get_last_error();
   return create_file_status(m_ec, p, path_stat, ec);
 }
 
@@ -248,7 +248,7 @@ inline file_status posix_stat(path const& p, error_code* ec) {
 inline file_status posix_lstat(path const& p, StatT& path_stat, error_code* ec) {
   error_code m_ec;
   if (detail::lstat(p.c_str(), &path_stat) == -1)
-    m_ec = detail::capture_errno();
+    m_ec = detail::get_last_error();
   return create_file_status(m_ec, p, path_stat, ec);
 }
 
@@ -260,7 +260,7 @@ inline file_status posix_lstat(path const& p, error_code* ec) {
 // http://pubs.opengroup.org/onlinepubs/9699919799/functions/ftruncate.html
 inline bool posix_ftruncate(const FileDescriptor& fd, off_t to_size, error_code& ec) {
   if (detail::ftruncate(fd.fd, to_size) == -1) {
-    ec = capture_errno();
+    ec = get_last_error();
     return true;
   }
   ec.clear();
@@ -269,7 +269,7 @@ inline bool posix_ftruncate(const FileDescriptor& fd, off_t to_size, error_code&
 
 inline bool posix_fchmod(const FileDescriptor& fd, const StatT& st, error_code& ec) {
   if (detail::fchmod(fd.fd, st.st_mode) == -1) {
-    ec = capture_errno();
+    ec = get_last_error();
     return true;
   }
   ec.clear();
@@ -286,7 +286,7 @@ inline file_status FileDescriptor::refresh_status(error_code& ec) {
   m_stat   = {};
   error_code m_ec;
   if (detail::fstat(fd, &m_stat) == -1)
-    m_ec = capture_errno();
+    m_ec = get_last_error();
   m_status = create_file_status(m_ec, name, m_stat, &ec);
   return m_status;
 }
diff --git a/libcxx/src/filesystem/operations.cpp b/libcxx/src/filesystem/operations.cpp
index d771f20..23c1c28 100644
--- a/libcxx/src/filesystem/operations.cpp
+++ b/libcxx/src/filesystem/operations.cpp
@@ -15,6 +15,7 @@
 #include <filesystem>
 #include <iterator>
 #include <string_view>
+#include <system_error>
 #include <type_traits>
 #include <vector>
 
@@ -32,11 +33,24 @@
 #  include <dirent.h>
 #  include <sys/stat.h>
 #  include <sys/statvfs.h>
+#  include <sys/types.h>
 #  include <unistd.h>
 #endif
 #include <fcntl.h> /* values for fchmodat */
 #include <time.h>
 
+// since Linux 4.5 and FreeBSD 13, but the Linux libc wrapper is only provided by glibc >= 2.27 and musl
+#if defined(__linux__)
+#  if defined(_LIBCPP_GLIBC_PREREQ)
+#    if _LIBCPP_GLIBC_PREREQ(2, 27)
+#      define _LIBCPP_FILESYSTEM_USE_COPY_FILE_RANGE
+#    endif
+#  elif _LIBCPP_HAS_MUSL_LIBC
+#    define _LIBCPP_FILESYSTEM_USE_COPY_FILE_RANGE
+#  endif
+#elif defined(__FreeBSD__)
+#  define _LIBCPP_FILESYSTEM_USE_COPY_FILE_RANGE
+#endif
 #if __has_include(<sys/sendfile.h>)
 #  include <sys/sendfile.h>
 #  define _LIBCPP_FILESYSTEM_USE_SENDFILE
@@ -44,10 +58,18 @@
 #  include <copyfile.h>
 #  define _LIBCPP_FILESYSTEM_USE_COPYFILE
 #else
-#  include <fstream>
 #  define _LIBCPP_FILESYSTEM_USE_FSTREAM
 #endif
 
+// sendfile and copy_file_range need to fall back
+// to the fstream implementation for special files
+#if (defined(_LIBCPP_FILESYSTEM_USE_SENDFILE) || defined(_LIBCPP_FILESYSTEM_USE_COPY_FILE_RANGE) ||                    \
+     defined(_LIBCPP_FILESYSTEM_USE_FSTREAM)) &&                                                                       \
+    _LIBCPP_HAS_LOCALIZATION
+#  include <fstream>
+#  define _LIBCPP_FILESYSTEM_NEED_FSTREAM
+#endif
+
 #if defined(__ELF__) && defined(_LIBCPP_LINK_RT_LIB)
 #  pragma comment(lib, "rt")
 #endif
@@ -86,7 +108,7 @@ path __canonical(path const& orig_p, error_code* ec) {
 #if (defined(_POSIX_VERSION) && _POSIX_VERSION >= 200112) || defined(_LIBCPP_WIN32API)
   std::unique_ptr<path::value_type, decltype(&::free)> hold(detail::realpath(p.c_str(), nullptr), &::free);
   if (hold.get() == nullptr)
-    return err.report(capture_errno());
+    return err.report(detail::get_last_error());
   return {hold.get()};
 #else
 #  if defined(__MVS__) && !defined(PATH_MAX)
@@ -96,7 +118,7 @@ path __canonical(path const& orig_p, error_code* ec) {
 #  endif
   path::value_type* ret;
   if ((ret = detail::realpath(p.c_str(), buff)) == nullptr)
-    return err.report(capture_errno());
+    return err.report(detail::get_last_error());
   return {ret};
 #endif
 }
@@ -178,9 +200,89 @@ void __copy(const path& from, const path& to, copy_options options, error_code*
 namespace detail {
 namespace {
 
+#if defined(_LIBCPP_FILESYSTEM_NEED_FSTREAM)
+bool copy_file_impl_fstream(FileDescriptor& read_fd, FileDescriptor& write_fd, error_code& ec) {
+  ifstream in;
+  in.__open(read_fd.fd, ios::binary);
+  if (!in.is_open()) {
+    // This assumes that __open didn't reset the error code.
+    ec = capture_errno();
+    return false;
+  }
+  read_fd.fd = -1;
+  ofstream out;
+  out.__open(write_fd.fd, ios::binary);
+  if (!out.is_open()) {
+    ec = capture_errno();
+    return false;
+  }
+  write_fd.fd = -1;
+
+  if (in.good() && out.good()) {
+    using InIt  = istreambuf_iterator<char>;
+    using OutIt = ostreambuf_iterator<char>;
+    InIt bin(in);
+    InIt ein;
+    OutIt bout(out);
+    copy(bin, ein, bout);
+  }
+  if (out.fail() || in.fail()) {
+    ec = make_error_code(errc::io_error);
+    return false;
+  }
+
+  ec.clear();
+  return true;
+}
+#endif
+
+#if defined(_LIBCPP_FILESYSTEM_USE_COPY_FILE_RANGE)
+bool copy_file_impl_copy_file_range(FileDescriptor& read_fd, FileDescriptor& write_fd, error_code& ec) {
+  size_t count = read_fd.get_stat().st_size;
+  // a zero-length file is either empty, or not copyable by this syscall
+  // return early to avoid the syscall cost
+  if (count == 0) {
+    ec = {EINVAL, generic_category()};
+    return false;
+  }
+  // do not modify the fd positions as copy_file_impl_sendfile may be called after a partial copy
+#  if defined(__linux__)
+  loff_t off_in  = 0;
+  loff_t off_out = 0;
+#  else
+  off_t off_in  = 0;
+  off_t off_out = 0;
+#  endif
+
+  do {
+    ssize_t res;
+
+    if ((res = ::copy_file_range(read_fd.fd, &off_in, write_fd.fd, &off_out, count, 0)) == -1) {
+      ec = capture_errno();
+      return false;
+    }
+    count -= res;
+  } while (count > 0);
+
+  ec.clear();
+
+  return true;
+}
+#endif
+
 #if defined(_LIBCPP_FILESYSTEM_USE_SENDFILE)
-bool copy_file_impl(FileDescriptor& read_fd, FileDescriptor& write_fd, error_code& ec) {
+bool copy_file_impl_sendfile(FileDescriptor& read_fd, FileDescriptor& write_fd, error_code& ec) {
   size_t count = read_fd.get_stat().st_size;
+  // a zero-length file is either empty, or not copyable by this syscall
+  // return early to avoid the syscall cost
+  // however, we can't afford this luxury in the no-locale build,
+  // as we can't utilize the fstream impl to copy empty files
+#  if _LIBCPP_HAS_LOCALIZATION
+  if (count == 0) {
+    ec = {EINVAL, generic_category()};
+    return false;
+  }
+#  endif
   do {
     ssize_t res;
     if ((res = ::sendfile(write_fd.fd, read_fd.fd, nullptr, count)) == -1) {
@@ -194,6 +296,54 @@ bool copy_file_impl(FileDescriptor& read_fd, FileDescriptor& write_fd, error_cod
 
   return true;
 }
+#endif
+
+#if defined(_LIBCPP_FILESYSTEM_USE_COPY_FILE_RANGE) || defined(_LIBCPP_FILESYSTEM_USE_SENDFILE)
+// If we have copy_file_range or sendfile, try both in succession (if available).
+// If both fail, fall back to using fstream.
+bool copy_file_impl(FileDescriptor& read_fd, FileDescriptor& write_fd, error_code& ec) {
+#  if defined(_LIBCPP_FILESYSTEM_USE_COPY_FILE_RANGE)
+  if (copy_file_impl_copy_file_range(read_fd, write_fd, ec)) {
+    return true;
+  }
+  // EINVAL: src and dst are the same file (this is not cheaply
+  // detectable from userspace)
+  // EINVAL: copy_file_range is unsupported for this file type by the
+  // underlying filesystem
+  // ENOTSUP: undocumented, can arise with old kernels and NFS
+  // EOPNOTSUPP: filesystem does not implement copy_file_range
+  // ETXTBSY: src or dst is an active swapfile (nonsensical, but allowed
+  // with normal copying)
+  // EXDEV: src and dst are on different filesystems that do not support
+  // cross-fs copy_file_range
+  // ENOENT: undocumented, can arise with CIFS
+  // ENOSYS: unsupported by kernel or blocked by seccomp
+  if (ec.value() != EINVAL && ec.value() != ENOTSUP && ec.value() != EOPNOTSUPP && ec.value() != ETXTBSY &&
+      ec.value() != EXDEV && ec.value() != ENOENT && ec.value() != ENOSYS) {
+    return false;
+  }
+  ec.clear();
+#  endif
+
+#  if defined(_LIBCPP_FILESYSTEM_USE_SENDFILE)
+  if (copy_file_impl_sendfile(read_fd, write_fd, ec)) {
+    return true;
+  }
+  // EINVAL: unsupported file type
+  if (ec.value() != EINVAL) {
+    return false;
+  }
+  ec.clear();
+#  endif
+
+#  if defined(_LIBCPP_FILESYSTEM_NEED_FSTREAM)
+  return copy_file_impl_fstream(read_fd, write_fd, ec);
+#  else
+  // since iostreams are unavailable in the no-locale build, just fail after a failed sendfile
+  ec.assign(EINVAL, std::system_category());
+  return false;
+#  endif
+}
 #elif defined(_LIBCPP_FILESYSTEM_USE_COPYFILE)
 bool copy_file_impl(FileDescriptor& read_fd, FileDescriptor& write_fd, error_code& ec) {
   struct CopyFileState {
@@ -217,37 +367,7 @@ bool copy_file_impl(FileDescriptor& read_fd, FileDescriptor& write_fd, error_cod
 }
 #elif defined(_LIBCPP_FILESYSTEM_USE_FSTREAM)
 bool copy_file_impl(FileDescriptor& read_fd, FileDescriptor& write_fd, error_code& ec) {
-  ifstream in;
-  in.__open(read_fd.fd, ios::binary);
-  if (!in.is_open()) {
-    // This assumes that __open didn't reset the error code.
-    ec = capture_errno();
-    return false;
-  }
-  read_fd.fd = -1;
-  ofstream out;
-  out.__open(write_fd.fd, ios::binary);
-  if (!out.is_open()) {
-    ec = capture_errno();
-    return false;
-  }
-  write_fd.fd = -1;
-
-  if (in.good() && out.good()) {
-    using InIt  = istreambuf_iterator<char>;
-    using OutIt = ostreambuf_iterator<char>;
-    InIt bin(in);
-    InIt ein;
-    OutIt bout(out);
-    copy(bin, ein, bout);
-  }
-  if (out.fail() || in.fail()) {
-    ec = make_error_code(errc::io_error);
-    return false;
-  }
-
-  ec.clear();
-  return true;
+  return copy_file_impl_fstream(read_fd, write_fd, ec);
 }
 #else
 #  error "Unknown implementation for copy_file_impl"
@@ -393,9 +513,9 @@ bool __create_directory(const path& p, error_code* ec) {
   if (detail::mkdir(p.c_str(), static_cast<int>(perms::all)) == 0)
     return true;
 
-  if (errno != EEXIST)
-    return err.report(capture_errno());
-  error_code mec = capture_errno();
+  error_code mec = detail::get_last_error();
+  if (mec != errc::file_exists)
+    return err.report(mec);
   error_code ignored_ec;
   const file_status st = status(p, ignored_ec);
   if (!is_directory(st))
@@ -417,10 +537,10 @@ bool __create_directory(path const& p, path const& attributes, error_code* ec) {
   if (detail::mkdir(p.c_str(), attr_stat.st_mode) == 0)
     return true;
 
-  if (errno != EEXIST)
-    return err.report(capture_errno());
+  mec = detail::get_last_error();
+  if (mec != errc::file_exists)
+    return err.report(mec);
 
-  mec = capture_errno();
   error_code ignored_ec;
   st = status(p, ignored_ec);
   if (!is_directory(st))
@@ -431,19 +551,19 @@ bool __create_directory(path const& p, path const& attributes, error_code* ec) {
 void __create_directory_symlink(path const& from, path const& to, error_code* ec) {
   ErrorHandler<void> err("create_directory_symlink", ec, &from, &to);
   if (detail::symlink_dir(from.c_str(), to.c_str()) == -1)
-    return err.report(capture_errno());
+    return err.report(detail::get_last_error());
 }
 
 void __create_hard_link(const path& from, const path& to, error_code* ec) {
   ErrorHandler<void> err("create_hard_link", ec, &from, &to);
   if (detail::link(from.c_str(), to.c_str()) == -1)
-    return err.report(capture_errno());
+    return err.report(detail::get_last_error());
 }
 
 void __create_symlink(path const& from, path const& to, error_code* ec) {
   ErrorHandler<void> err("create_symlink", ec, &from, &to);
   if (detail::symlink_file(from.c_str(), to.c_str()) == -1)
-    return err.report(capture_errno());
+    return err.report(detail::get_last_error());
 }
 
 path __current_path(error_code* ec) {
@@ -486,7 +606,7 @@ path __current_path(error_code* ec) {
 
   unique_ptr<path::value_type, Deleter> hold(detail::getcwd(ptr, size), deleter);
   if (hold.get() == nullptr)
-    return err.report(capture_errno(), "call to getcwd failed");
+    return err.report(detail::get_last_error(), "call to getcwd failed");
 
   return {hold.get()};
 }
@@ -494,7 +614,7 @@ path __current_path(error_code* ec) {
 void __current_path(const path& p, error_code* ec) {
   ErrorHandler<void> err("current_path", ec, &p);
   if (detail::chdir(p.c_str()) == -1)
-    err.report(capture_errno());
+    err.report(detail::get_last_error());
 }
 
 bool __equivalent(const path& p1, const path& p2, error_code* ec) {
@@ -582,10 +702,10 @@ void __last_write_time(const path& p, file_time_type new_time, error_code* ec) {
     return err.report(errc::value_too_large);
   detail::WinHandle h(p.c_str(), FILE_WRITE_ATTRIBUTES, 0);
   if (!h)
-    return err.report(detail::make_windows_error(GetLastError()));
+    return err.report(detail::get_last_error());
   FILETIME last_write = timespec_to_filetime(ts);
   if (!SetFileTime(h, nullptr, nullptr, &last_write))
-    return err.report(detail::make_windows_error(GetLastError()));
+    return err.report(detail::get_last_error());
 #else
   error_code m_ec;
   array<TimeSpec, 2> tbuf;
@@ -643,7 +763,7 @@ void __permissions(const path& p, perms prms, perm_options opts, error_code* ec)
 #if defined(AT_SYMLINK_NOFOLLOW) && defined(AT_FDCWD)
   const int flags = set_sym_perms ? AT_SYMLINK_NOFOLLOW : 0;
   if (detail::fchmodat(AT_FDCWD, p.c_str(), real_perms, flags) == -1) {
-    return err.report(capture_errno());
+    return err.report(detail::get_last_error());
   }
 #else
   if (set_sym_perms)
@@ -671,14 +791,14 @@ path __read_symlink(const path& p, error_code* ec) {
 #else
   StatT sb;
   if (detail::lstat(p.c_str(), &sb) == -1) {
-    return err.report(capture_errno());
+    return err.report(detail::get_last_error());
   }
   const size_t size = sb.st_size + 1;
   auto buff         = unique_ptr<path::value_type[]>(new path::value_type[size]);
 #endif
   detail::SSizeT ret;
   if ((ret = detail::readlink(p.c_str(), buff.get(), size)) == -1)
-    return err.report(capture_errno());
+    return err.report(detail::get_last_error());
   // Note that `ret` returning `0` would work, resulting in a valid empty string being returned.
   if (static_cast<size_t>(ret) >= size)
     return err.report(errc::value_too_large);
@@ -689,8 +809,9 @@ path __read_symlink(const path& p, error_code* ec) {
 bool __remove(const path& p, error_code* ec) {
   ErrorHandler<bool> err("remove", ec, &p);
   if (detail::remove(p.c_str()) == -1) {
-    if (errno != ENOENT)
-      err.report(capture_errno());
+    error_code mec = detail::get_last_error();
+    if (mec != errc::no_such_file_or_directory)
+      err.report(mec);
     return false;
   }
   return true;
@@ -843,13 +964,13 @@ uintmax_t __remove_all(const path& p, error_code* ec) {
 void __rename(const path& from, const path& to, error_code* ec) {
   ErrorHandler<void> err("rename", ec, &from, &to);
   if (detail::rename(from.c_str(), to.c_str()) == -1)
-    err.report(capture_errno());
+    err.report(detail::get_last_error());
 }
 
 void __resize_file(const path& p, uintmax_t size, error_code* ec) {
   ErrorHandler<void> err("resize_file", ec, &p);
   if (detail::truncate(p.c_str(), static_cast< ::off_t>(size)) == -1)
-    return err.report(capture_errno());
+    return err.report(detail::get_last_error());
 }
 
 space_info __space(const path& p, error_code* ec) {
@@ -857,7 +978,7 @@ space_info __space(const path& p, error_code* ec) {
   space_info si;
   detail::StatVFS m_svfs = {};
   if (detail::statvfs(p.c_str(), &m_svfs) == -1) {
-    err.report(capture_errno());
+    err.report(detail::get_last_error());
     si.capacity = si.free = si.available = static_cast<uintmax_t>(-1);
     return si;
   }
@@ -884,7 +1005,7 @@ path __temp_directory_path(error_code* ec) {
   wchar_t buf[MAX_PATH];
   DWORD retval = GetTempPathW(MAX_PATH, buf);
   if (!retval)
-    return err.report(detail::make_windows_error(GetLastError()));
+    return err.report(detail::get_last_error());
   if (retval > MAX_PATH)
     return err.report(errc::filename_too_long);
   // GetTempPathW returns a path with a trailing slash, which we
diff --git a/libcxx/src/filesystem/posix_compat.h b/libcxx/src/filesystem/posix_compat.h
index b41c004..ddd99d8 100644
--- a/libcxx/src/filesystem/posix_compat.h
+++ b/libcxx/src/filesystem/posix_compat.h
@@ -11,9 +11,10 @@
 //
 // These generally behave like the proper posix functions, with these
 // exceptions:
-// On Windows, they take paths in wchar_t* form, instead of char* form.
-// The symlink() function is split into two frontends, symlink_file()
-// and symlink_dir().
+// - On Windows, they take paths in wchar_t* form, instead of char* form.
+// - The symlink() function is split into two frontends, symlink_file()
+//   and symlink_dir().
+// - Errors should be retrieved with get_last_error, not errno.
 //
 // These are provided within an anonymous namespace within the detail
 // namespace - callers need to include this header and call them as
@@ -122,11 +123,6 @@ namespace detail {
 
 #  define O_NONBLOCK 0
 
-inline int set_errno(int e = GetLastError()) {
-  errno = static_cast<int>(__win_err_to_errc(e));
-  return -1;
-}
-
 class WinHandle {
 public:
   WinHandle(const wchar_t* p, DWORD access, DWORD flags) {
@@ -153,7 +149,7 @@ private:
 inline int stat_handle(HANDLE h, StatT* buf) {
   FILE_BASIC_INFO basic;
   if (!GetFileInformationByHandleEx(h, FileBasicInfo, &basic, sizeof(basic)))
-    return set_errno();
+    return -1;
   memset(buf, 0, sizeof(*buf));
   buf->st_mtim = filetime_to_timespec(basic.LastWriteTime);
   buf->st_atim = filetime_to_timespec(basic.LastAccessTime);
@@ -168,18 +164,18 @@ inline int stat_handle(HANDLE h, StatT* buf) {
   if (basic.FileAttributes & FILE_ATTRIBUTE_REPARSE_POINT) {
     FILE_ATTRIBUTE_TAG_INFO tag;
     if (!GetFileInformationByHandleEx(h, FileAttributeTagInfo, &tag, sizeof(tag)))
-      return set_errno();
+      return -1;
     if (tag.ReparseTag == IO_REPARSE_TAG_SYMLINK)
       buf->st_mode = (buf->st_mode & ~_S_IFMT) | _S_IFLNK;
   }
   FILE_STANDARD_INFO standard;
   if (!GetFileInformationByHandleEx(h, FileStandardInfo, &standard, sizeof(standard)))
-    return set_errno();
+    return -1;
   buf->st_nlink = standard.NumberOfLinks;
   buf->st_size  = standard.EndOfFile.QuadPart;
   BY_HANDLE_FILE_INFORMATION info;
   if (!GetFileInformationByHandle(h, &info))
-    return set_errno();
+    return -1;
   buf->st_dev = info.dwVolumeSerialNumber;
   memcpy(&buf->st_ino.id[0], &info.nFileIndexHigh, 4);
   memcpy(&buf->st_ino.id[4], &info.nFileIndexLow, 4);
@@ -189,7 +185,7 @@ inline int stat_handle(HANDLE h, StatT* buf) {
 inline int stat_file(const wchar_t* path, StatT* buf, DWORD flags) {
   WinHandle h(path, FILE_READ_ATTRIBUTES, flags);
   if (!h)
-    return set_errno();
+    return -1;
   int ret = stat_handle(h, buf);
   return ret;
 }
@@ -206,7 +202,7 @@ inline int fstat(int fd, StatT* buf) {
 inline int mkdir(const wchar_t* path, int permissions) {
   (void)permissions;
   if (!CreateDirectoryW(path, nullptr))
-    return set_errno();
+    return -1;
   return 0;
 }
 
@@ -219,10 +215,10 @@ inline int symlink_file_dir(const wchar_t* oldname, const wchar_t* newname, bool
     return 0;
   int e = GetLastError();
   if (e != ERROR_INVALID_PARAMETER)
-    return set_errno(e);
+    return -1;
   if (CreateSymbolicLinkW(newname, oldname, flags))
     return 0;
-  return set_errno();
+  return -1;
 }
 
 inline int symlink_file(const wchar_t* oldname, const wchar_t* newname) {
@@ -236,17 +232,17 @@ inline int symlink_dir(const wchar_t* oldname, const wchar_t* newname) {
 inline int link(const wchar_t* oldname, const wchar_t* newname) {
   if (CreateHardLinkW(newname, oldname, nullptr))
     return 0;
-  return set_errno();
+  return -1;
 }
 
 inline int remove(const wchar_t* path) {
   detail::WinHandle h(path, DELETE, FILE_FLAG_OPEN_REPARSE_POINT);
   if (!h)
-    return set_errno();
+    return -1;
   FILE_DISPOSITION_INFO info;
   info.DeleteFile = TRUE;
   if (!SetFileInformationByHandle(h, FileDispositionInfo, &info, sizeof(info)))
-    return set_errno();
+    return -1;
   return 0;
 }
 
@@ -254,9 +250,9 @@ inline int truncate_handle(HANDLE h, off_t length) {
   LARGE_INTEGER size_param;
   size_param.QuadPart = length;
   if (!SetFilePointerEx(h, size_param, 0, FILE_BEGIN))
-    return set_errno();
+    return -1;
   if (!SetEndOfFile(h))
-    return set_errno();
+    return -1;
   return 0;
 }
 
@@ -268,19 +264,19 @@ inline int ftruncate(int fd, off_t length) {
 inline int truncate(const wchar_t* path, off_t length) {
   detail::WinHandle h(path, GENERIC_WRITE, 0);
   if (!h)
-    return set_errno();
+    return -1;
   return truncate_handle(h, length);
 }
 
 inline int rename(const wchar_t* from, const wchar_t* to) {
   if (!(MoveFileExW(from, to, MOVEFILE_COPY_ALLOWED | MOVEFILE_REPLACE_EXISTING | MOVEFILE_WRITE_THROUGH)))
-    return set_errno();
+    return -1;
   return 0;
 }
 
 inline int chdir(const wchar_t* path) {
   if (!SetCurrentDirectoryW(path))
-    return set_errno();
+    return -1;
   return 0;
 }
 
@@ -300,7 +296,7 @@ inline int statvfs(const wchar_t* p, StatVFS* buf) {
       break;
     path parent = dir.parent_path();
     if (parent == dir) {
-      errno = ENOENT;
+      SetLastError(ERROR_PATH_NOT_FOUND);
       return -1;
     }
     dir = parent;
@@ -308,7 +304,7 @@ inline int statvfs(const wchar_t* p, StatVFS* buf) {
   ULARGE_INTEGER free_bytes_available_to_caller, total_number_of_bytes, total_number_of_free_bytes;
   if (!GetDiskFreeSpaceExW(
           dir.c_str(), &free_bytes_available_to_caller, &total_number_of_bytes, &total_number_of_free_bytes))
-    return set_errno();
+    return -1;
   buf->f_frsize = 1;
   buf->f_blocks = total_number_of_bytes.QuadPart;
   buf->f_bfree  = total_number_of_free_bytes.QuadPart;
@@ -330,7 +326,6 @@ inline wchar_t* getcwd([[maybe_unused]] wchar_t* in_buf, [[maybe_unused]] size_t
     retval = GetCurrentDirectoryW(buff_size, buff.get());
   }
   if (!retval) {
-    set_errno();
     return nullptr;
   }
   return buff.release();
@@ -342,7 +337,6 @@ inline wchar_t* realpath(const wchar_t* path, [[maybe_unused]] wchar_t* resolved
 
   WinHandle h(path, FILE_READ_ATTRIBUTES, 0);
   if (!h) {
-    set_errno();
     return nullptr;
   }
   size_t buff_size = MAX_PATH + 10;
@@ -354,7 +348,6 @@ inline wchar_t* realpath(const wchar_t* path, [[maybe_unused]] wchar_t* resolved
     retval = GetFinalPathNameByHandleW(h, buff.get(), buff_size, FILE_NAME_NORMALIZED | VOLUME_NAME_DOS);
   }
   if (!retval) {
-    set_errno();
     return nullptr;
   }
   wchar_t* ptr = buff.get();
@@ -376,20 +369,20 @@ using ModeT = int;
 inline int fchmod_handle(HANDLE h, int perms) {
   FILE_BASIC_INFO basic;
   if (!GetFileInformationByHandleEx(h, FileBasicInfo, &basic, sizeof(basic)))
-    return set_errno();
+    return -1;
   DWORD orig_attributes = basic.FileAttributes;
   basic.FileAttributes &= ~FILE_ATTRIBUTE_READONLY;
   if ((perms & 0222) == 0)
     basic.FileAttributes |= FILE_ATTRIBUTE_READONLY;
   if (basic.FileAttributes != orig_attributes && !SetFileInformationByHandle(h, FileBasicInfo, &basic, sizeof(basic)))
-    return set_errno();
+    return -1;
   return 0;
 }
 
 inline int fchmodat(int /*fd*/, const wchar_t* path, int perms, int flag) {
   DWORD attributes = GetFileAttributesW(path);
   if (attributes == INVALID_FILE_ATTRIBUTES)
-    return set_errno();
+    return -1;
   if (attributes & FILE_ATTRIBUTE_REPARSE_POINT && !(flag & AT_SYMLINK_NOFOLLOW)) {
     // If the file is a symlink, and we are supposed to operate on the target
     // of the symlink, we need to open a handle to it, without the
@@ -397,7 +390,7 @@ inline int fchmodat(int /*fd*/, const wchar_t* path, int perms, int flag) {
     // symlink, and operate on it via the handle.
     detail::WinHandle h(path, FILE_READ_ATTRIBUTES | FILE_WRITE_ATTRIBUTES, 0);
     if (!h)
-      return set_errno();
+      return -1;
     return fchmod_handle(h, perms);
   } else {
     // For a non-symlink, or if operating on the symlink itself instead of
@@ -407,7 +400,7 @@ inline int fchmodat(int /*fd*/, const wchar_t* path, int perms, int flag) {
     if ((perms & 0222) == 0)
       attributes |= FILE_ATTRIBUTE_READONLY;
     if (attributes != orig_attributes && !SetFileAttributesW(path, attributes))
-      return set_errno();
+      return -1;
   }
   return 0;
 }
@@ -424,18 +417,18 @@ inline SSizeT readlink(const wchar_t* path, wchar_t* ret_buf, size_t bufsize) {
   uint8_t buf[MAXIMUM_REPARSE_DATA_BUFFER_SIZE];
   detail::WinHandle h(path, FILE_READ_ATTRIBUTES, FILE_FLAG_OPEN_REPARSE_POINT);
   if (!h)
-    return set_errno();
+    return -1;
   DWORD out;
   if (!DeviceIoControl(h, FSCTL_GET_REPARSE_POINT, nullptr, 0, buf, sizeof(buf), &out, 0))
-    return set_errno();
+    return -1;
   const auto* reparse    = reinterpret_cast<LIBCPP_REPARSE_DATA_BUFFER*>(buf);
   size_t path_buf_offset = offsetof(LIBCPP_REPARSE_DATA_BUFFER, SymbolicLinkReparseBuffer.PathBuffer[0]);
   if (out < path_buf_offset) {
-    errno = EINVAL;
+    SetLastError(ERROR_REPARSE_TAG_INVALID);
     return -1;
   }
   if (reparse->ReparseTag != IO_REPARSE_TAG_SYMLINK) {
-    errno = EINVAL;
+    SetLastError(ERROR_REPARSE_TAG_INVALID);
     return -1;
   }
   const auto& symlink = reparse->SymbolicLinkReparseBuffer;
@@ -449,11 +442,11 @@ inline SSizeT readlink(const wchar_t* path, wchar_t* ret_buf, size_t bufsize) {
   }
   // name_offset/length are expressed in bytes, not in wchar_t
   if (path_buf_offset + name_offset + name_length > out) {
-    errno = EINVAL;
+    SetLastError(ERROR_REPARSE_TAG_INVALID);
     return -1;
   }
   if (name_length / sizeof(wchar_t) > bufsize) {
-    errno = ENOMEM;
+    SetLastError(ERROR_NOT_ENOUGH_MEMORY);
     return -1;
   }
   memcpy(ret_buf, &symlink.PathBuffer[name_offset / sizeof(wchar_t)], name_length);
diff --git a/libcxx/src/include/overridable_function.h b/libcxx/src/include/overridable_function.h
index 6c70f62..7372e34 100644
--- a/libcxx/src/include/overridable_function.h
+++ b/libcxx/src/include/overridable_function.h
@@ -29,106 +29,81 @@
 // This is a low-level utility which does not work on all platforms, since it needs
 // to make assumptions about the object file format in use. Furthermore, it requires
 // the "base definition" of the function (the one we want to check whether it has been
-// overridden) to be annotated with the _LIBCPP_MAKE_OVERRIDABLE_FUNCTION_DETECTABLE macro.
+// overridden) to be defined using the _LIBCPP_OVERRIDABLE_FUNCTION macro.
 //
 // This currently works with Mach-O files (used on Darwin) and with ELF files (used on Linux
 // and others). On platforms where we know how to implement this detection, the macro
 // _LIBCPP_CAN_DETECT_OVERRIDDEN_FUNCTION is defined to 1, and it is defined to 0 on
-// other platforms. The _LIBCPP_MAKE_OVERRIDABLE_FUNCTION_DETECTABLE macro is defined to
-// nothing on unsupported platforms so that it can be used to decorate functions regardless
-// of whether detection is actually supported.
+// other platforms. The _LIBCPP_OVERRIDABLE_FUNCTION macro expands to regular function
+// definition on unsupported platforms so that it can be used to decorate functions
+// regardless of whether detection is actually supported.
 //
 // How does this work?
 // -------------------
 //
 // Let's say we want to check whether a weak function `f` has been overridden by the user.
-// The general mechanism works by placing `f`'s definition (in the libc++ built library)
-// inside a special section, which we do using the `__section__` attribute via the
-// _LIBCPP_MAKE_OVERRIDABLE_FUNCTION_DETECTABLE macro.
+// The general mechanism works by defining a symbol `f_impl__` and a weak alias `f` via the
+// _LIBCPP_OVERRIDABLE_FUNCTION macro.
 //
 // Then, when comes the time to check whether the function has been overridden, we take
-// the address of the function and we check whether it falls inside the special function
-// we created. This can be done by finding pointers to the start and the end of the section
-// (which is done differently for ELF and Mach-O), and then checking whether `f` falls
-// within those bounds. If it falls within those bounds, then `f` is still inside the
-// special section and so it is the version we defined in the libc++ built library, i.e.
-// it was not overridden. Otherwise, it was overridden by the user because it falls
-// outside of the section.
+// the address of the function `f` and we check whether it is different from `f_impl__`.
+// If so it means the function was overriden by the user.
 //
 // Important note
 // --------------
 //
-// This mechanism should never be used outside of the libc++ built library. In particular,
-// attempting to use this within the libc++ headers will not work at all because we don't
-// want to be defining special sections inside user's executables which use our headers.
+// This mechanism should never be used outside of the libc++ built library. Functions defined
+// with this macro must be defined at global scope.
 //
 
 #if defined(_LIBCPP_OBJECT_FORMAT_MACHO)
 
-#  define _LIBCPP_CAN_DETECT_OVERRIDDEN_FUNCTION 1
-#  define _LIBCPP_MAKE_OVERRIDABLE_FUNCTION_DETECTABLE                                                                 \
-    __attribute__((__section__("__TEXT,__lcxx_override,regular,pure_instructions")))
-
 _LIBCPP_BEGIN_NAMESPACE_STD
-template <class _Ret, class... _Args>
-_LIBCPP_HIDE_FROM_ABI bool __is_function_overridden(_Ret (*__fptr)(_Args...)) noexcept {
-  // Declare two dummy bytes and give them these special `__asm` values. These values are
-  // defined by the linker, which means that referring to `&__lcxx_override_start` will
-  // effectively refer to the address where the section starts (and same for the end).
-  extern char __lcxx_override_start __asm("section$start$__TEXT$__lcxx_override");
-  extern char __lcxx_override_end __asm("section$end$__TEXT$__lcxx_override");
-
-  // Now get a uintptr_t out of these locations, and out of the function pointer.
-  uintptr_t __start = reinterpret_cast<uintptr_t>(&__lcxx_override_start);
-  uintptr_t __end   = reinterpret_cast<uintptr_t>(&__lcxx_override_end);
-  uintptr_t __ptr   = reinterpret_cast<uintptr_t>(__fptr);
-
-#  if __has_feature(ptrauth_calls)
-  // We must pass a void* to ptrauth_strip since it only accepts a pointer type. Also, in particular,
-  // we must NOT pass a function pointer, otherwise we will strip the function pointer, and then attempt
-  // to authenticate and re-sign it when casting it to a uintptr_t again, which will fail because we just
-  // stripped the function pointer. See rdar://122927845.
-  __ptr = reinterpret_cast<uintptr_t>(ptrauth_strip(reinterpret_cast<void*>(__ptr), ptrauth_key_function_pointer));
-#  endif
-
-  // Finally, the function was overridden if it falls outside of the section's bounds.
-  return __ptr < __start || __ptr > __end;
-}
-_LIBCPP_END_NAMESPACE_STD
 
-// The NVPTX linker cannot create '__start/__stop' sections.
-#elif defined(_LIBCPP_OBJECT_FORMAT_ELF) && !defined(__NVPTX__)
+template <auto _Func>
+_LIBCPP_HIDE_FROM_ABI constexpr bool __is_function_overridden();
 
-#  define _LIBCPP_CAN_DETECT_OVERRIDDEN_FUNCTION 1
-#  define _LIBCPP_MAKE_OVERRIDABLE_FUNCTION_DETECTABLE __attribute__((__section__("__lcxx_override")))
+_LIBCPP_END_NAMESPACE_STD
 
-// This is very similar to what we do for Mach-O above. The ELF linker will implicitly define
-// variables with those names corresponding to the start and the end of the section.
-//
-// See https://stackoverflow.com/questions/16552710/how-do-you-get-the-start-and-end-addresses-of-a-custom-elf-section
-extern char __start___lcxx_override;
-extern char __stop___lcxx_override;
+#  define _LIBCPP_CAN_DETECT_OVERRIDDEN_FUNCTION 1
+#  define _LIBCPP_OVERRIDABLE_FUNCTION(symbol, type, name, arglist)                                                    \
+    static __attribute__((used)) type symbol##_impl__ arglist __asm__("_" _LIBCPP_TOSTRING(symbol));                   \
+    __asm__(".globl _" _LIBCPP_TOSTRING(symbol));                                                                      \
+    __asm__(".weak_definition _" _LIBCPP_TOSTRING(symbol));                                                            \
+    extern __typeof(symbol##_impl__) name __attribute__((weak_import));                                                \
+    _LIBCPP_BEGIN_NAMESPACE_STD                                                                                        \
+    template <>                                                                                                        \
+    inline bool __is_function_overridden<static_cast<type(*) arglist>(name)>() {                                       \
+      return static_cast<type(*) arglist>(name) != symbol##_impl__;                                                    \
+    }                                                                                                                  \
+    _LIBCPP_END_NAMESPACE_STD                                                                                          \
+    static type symbol##_impl__ arglist
+
+#elif defined(_LIBCPP_OBJECT_FORMAT_ELF)
 
 _LIBCPP_BEGIN_NAMESPACE_STD
-template <class _Ret, class... _Args>
-_LIBCPP_HIDE_FROM_ABI bool __is_function_overridden(_Ret (*__fptr)(_Args...)) noexcept {
-  uintptr_t __start = reinterpret_cast<uintptr_t>(&__start___lcxx_override);
-  uintptr_t __end   = reinterpret_cast<uintptr_t>(&__stop___lcxx_override);
-  uintptr_t __ptr   = reinterpret_cast<uintptr_t>(__fptr);
-
-#  if __has_feature(ptrauth_calls)
-  // We must pass a void* to ptrauth_strip since it only accepts a pointer type. See full explanation above.
-  __ptr = reinterpret_cast<uintptr_t>(ptrauth_strip(reinterpret_cast<void*>(__ptr), ptrauth_key_function_pointer));
-#  endif
-
-  return __ptr < __start || __ptr > __end;
-}
+
+template <auto _Func>
+_LIBCPP_HIDE_FROM_ABI constexpr bool __is_function_overridden();
+
 _LIBCPP_END_NAMESPACE_STD
 
+#  define _LIBCPP_CAN_DETECT_OVERRIDDEN_FUNCTION 1
+#  define _LIBCPP_OVERRIDABLE_FUNCTION(symbol, type, name, arglist)                                                    \
+    static type symbol##_impl__ arglist __asm__(_LIBCPP_TOSTRING(symbol##_impl__));                                    \
+    [[gnu::weak, gnu::alias(_LIBCPP_TOSTRING(symbol##_impl__))]] type name arglist;                                    \
+    _LIBCPP_BEGIN_NAMESPACE_STD                                                                                        \
+    template <>                                                                                                        \
+    inline bool __is_function_overridden<static_cast<type(*) arglist>(name)>() {                                       \
+      return static_cast<type(*) arglist>(name) != symbol##_impl__;                                                    \
+    }                                                                                                                  \
+    _LIBCPP_END_NAMESPACE_STD                                                                                          \
+    static type symbol##_impl__ arglist
+
 #else
 
 #  define _LIBCPP_CAN_DETECT_OVERRIDDEN_FUNCTION 0
-#  define _LIBCPP_MAKE_OVERRIDABLE_FUNCTION_DETECTABLE /* nothing */
+#  define _LIBCPP_OVERRIDABLE_FUNCTION(symbol, type, name, arglist) _LIBCPP_WEAK type name arglist
 
 #endif
 
diff --git a/libcxx/src/new.cpp b/libcxx/src/new.cpp
index e010fe4..b14b522 100644
--- a/libcxx/src/new.cpp
+++ b/libcxx/src/new.cpp
@@ -43,7 +43,7 @@ static void* operator_new_impl(std::size_t size) {
   return p;
 }
 
-_LIBCPP_MAKE_OVERRIDABLE_FUNCTION_DETECTABLE _LIBCPP_WEAK void* operator new(std::size_t size) _THROW_BAD_ALLOC {
+_LIBCPP_OVERRIDABLE_FUNCTION(_Znwm, void*, operator new, (std::size_t size)) _THROW_BAD_ALLOC {
   void* p = operator_new_impl(size);
   if (p == nullptr)
     __throw_bad_alloc_shim();
@@ -54,7 +54,7 @@ _LIBCPP_WEAK void* operator new(size_t size, const std::nothrow_t&) noexcept {
 #  if !_LIBCPP_HAS_EXCEPTIONS
 #    if _LIBCPP_CAN_DETECT_OVERRIDDEN_FUNCTION
   _LIBCPP_ASSERT_SHIM(
-      !std::__is_function_overridden(static_cast<void* (*)(std::size_t)>(&operator new)),
+      !std::__is_function_overridden<static_cast<void* (*)(std::size_t)>(&operator new)>(),
       "libc++ was configured with exceptions disabled and `operator new(size_t)` has been overridden, "
       "but `operator new(size_t, nothrow_t)` has not been overridden. This is problematic because "
       "`operator new(size_t, nothrow_t)` must call `operator new(size_t)`, which will terminate in case "
@@ -74,7 +74,7 @@ _LIBCPP_WEAK void* operator new(size_t size, const std::nothrow_t&) noexcept {
 #  endif
 }
 
-_LIBCPP_MAKE_OVERRIDABLE_FUNCTION_DETECTABLE _LIBCPP_WEAK void* operator new[](size_t size) _THROW_BAD_ALLOC {
+_LIBCPP_OVERRIDABLE_FUNCTION(_Znam, void*, operator new[], (size_t size)) _THROW_BAD_ALLOC {
   return ::operator new(size);
 }
 
@@ -82,7 +82,7 @@ _LIBCPP_WEAK void* operator new[](size_t size, const std::nothrow_t&) noexcept {
 #  if !_LIBCPP_HAS_EXCEPTIONS
 #    if _LIBCPP_CAN_DETECT_OVERRIDDEN_FUNCTION
   _LIBCPP_ASSERT_SHIM(
-      !std::__is_function_overridden(static_cast<void* (*)(std::size_t)>(&operator new[])),
+      !std::__is_function_overridden<static_cast<void* (*)(std::size_t)>(&operator new[])>(),
       "libc++ was configured with exceptions disabled and `operator new[](size_t)` has been overridden, "
       "but `operator new[](size_t, nothrow_t)` has not been overridden. This is problematic because "
       "`operator new[](size_t, nothrow_t)` must call `operator new[](size_t)`, which will terminate in case "
@@ -136,8 +136,8 @@ static void* operator_new_aligned_impl(std::size_t size, std::align_val_t alignm
   return p;
 }
 
-_LIBCPP_MAKE_OVERRIDABLE_FUNCTION_DETECTABLE _LIBCPP_WEAK void*
-operator new(std::size_t size, std::align_val_t alignment) _THROW_BAD_ALLOC {
+_LIBCPP_OVERRIDABLE_FUNCTION(_ZnwmSt11align_val_t, void*, operator new, (std::size_t size, std::align_val_t alignment))
+_THROW_BAD_ALLOC {
   void* p = operator_new_aligned_impl(size, alignment);
   if (p == nullptr)
     __throw_bad_alloc_shim();
@@ -148,7 +148,7 @@ _LIBCPP_WEAK void* operator new(size_t size, std::align_val_t alignment, const s
 #    if !_LIBCPP_HAS_EXCEPTIONS
 #      if _LIBCPP_CAN_DETECT_OVERRIDDEN_FUNCTION
   _LIBCPP_ASSERT_SHIM(
-      !std::__is_function_overridden(static_cast<void* (*)(std::size_t, std::align_val_t)>(&operator new)),
+      !std::__is_function_overridden<static_cast<void* (*)(std::size_t, std::align_val_t)>(&operator new)>(),
       "libc++ was configured with exceptions disabled and `operator new(size_t, align_val_t)` has been overridden, "
       "but `operator new(size_t, align_val_t, nothrow_t)` has not been overridden. This is problematic because "
       "`operator new(size_t, align_val_t, nothrow_t)` must call `operator new(size_t, align_val_t)`, which will "
@@ -168,16 +168,14 @@ _LIBCPP_WEAK void* operator new(size_t size, std::align_val_t alignment, const s
 #    endif
 }
 
-_LIBCPP_MAKE_OVERRIDABLE_FUNCTION_DETECTABLE _LIBCPP_WEAK void*
-operator new[](size_t size, std::align_val_t alignment) _THROW_BAD_ALLOC {
-  return ::operator new(size, alignment);
-}
+_LIBCPP_OVERRIDABLE_FUNCTION(_ZnamSt11align_val_t, void*, operator new[], (size_t size, std::align_val_t alignment))
+_THROW_BAD_ALLOC { return ::operator new(size, alignment); }
 
 _LIBCPP_WEAK void* operator new[](size_t size, std::align_val_t alignment, const std::nothrow_t&) noexcept {
 #    if !_LIBCPP_HAS_EXCEPTIONS
 #      if _LIBCPP_CAN_DETECT_OVERRIDDEN_FUNCTION
   _LIBCPP_ASSERT_SHIM(
-      !std::__is_function_overridden(static_cast<void* (*)(std::size_t, std::align_val_t)>(&operator new[])),
+      !std::__is_function_overridden<static_cast<void* (*)(std::size_t, std::align_val_t)>(&operator new[])>(),
       "libc++ was configured with exceptions disabled and `operator new[](size_t, align_val_t)` has been overridden, "
       "but `operator new[](size_t, align_val_t, nothrow_t)` has not been overridden. This is problematic because "
       "`operator new[](size_t, align_val_t, nothrow_t)` must call `operator new[](size_t, align_val_t)`, which will "
diff --git a/libcxx/src/print.cpp b/libcxx/src/print.cpp
index 37b1fc0..4937aaf 100644
--- a/libcxx/src/print.cpp
+++ b/libcxx/src/print.cpp
@@ -51,7 +51,7 @@ __write_to_windows_console([[maybe_unused]] FILE* __stream, [[maybe_unused]] wst
                     __view.size(),
                     nullptr,
                     nullptr) == 0) {
-    __throw_system_error(filesystem::detail::make_windows_error(GetLastError()), "failed to write formatted output");
+    __throw_system_error(filesystem::detail::get_last_error(), "failed to write formatted output");
   }
 }
 #  endif // _LIBCPP_HAS_WIDE_CHARACTERS
diff --git a/libcxx/src/system_error.cpp b/libcxx/src/system_error.cpp
index d555bca..d5ec730 100644
--- a/libcxx/src/system_error.cpp
+++ b/libcxx/src/system_error.cpp
@@ -14,6 +14,7 @@
 #include <cstdio>
 #include <cstdlib>
 #include <cstring>
+#include <optional>
 #include <string.h>
 #include <string>
 #include <system_error>
@@ -24,8 +25,123 @@
 #  include <android/api-level.h>
 #endif
 
+#if defined(_LIBCPP_WIN32API)
+#  include <windows.h>
+#  include <winerror.h>
+#endif
+
 _LIBCPP_BEGIN_NAMESPACE_STD
 
+#if defined(_LIBCPP_WIN32API)
+
+namespace {
+std::optional<errc> __win_err_to_errc(int err) {
+  switch (err) {
+  case ERROR_ACCESS_DENIED:
+    return errc::permission_denied;
+  case ERROR_ALREADY_EXISTS:
+    return errc::file_exists;
+  case ERROR_BAD_NETPATH:
+    return errc::no_such_file_or_directory;
+  case ERROR_BAD_PATHNAME:
+    return errc::no_such_file_or_directory;
+  case ERROR_BAD_UNIT:
+    return errc::no_such_device;
+  case ERROR_BROKEN_PIPE:
+    return errc::broken_pipe;
+  case ERROR_BUFFER_OVERFLOW:
+    return errc::filename_too_long;
+  case ERROR_BUSY:
+    return errc::device_or_resource_busy;
+  case ERROR_BUSY_DRIVE:
+    return errc::device_or_resource_busy;
+  case ERROR_CANNOT_MAKE:
+    return errc::permission_denied;
+  case ERROR_CANTOPEN:
+    return errc::io_error;
+  case ERROR_CANTREAD:
+    return errc::io_error;
+  case ERROR_CANTWRITE:
+    return errc::io_error;
+  case ERROR_CURRENT_DIRECTORY:
+    return errc::permission_denied;
+  case ERROR_DEV_NOT_EXIST:
+    return errc::no_such_device;
+  case ERROR_DEVICE_IN_USE:
+    return errc::device_or_resource_busy;
+  case ERROR_DIR_NOT_EMPTY:
+    return errc::directory_not_empty;
+  case ERROR_DIRECTORY:
+    return errc::invalid_argument;
+  case ERROR_DISK_FULL:
+    return errc::no_space_on_device;
+  case ERROR_FILE_EXISTS:
+    return errc::file_exists;
+  case ERROR_FILE_NOT_FOUND:
+    return errc::no_such_file_or_directory;
+  case ERROR_HANDLE_DISK_FULL:
+    return errc::no_space_on_device;
+  case ERROR_INVALID_ACCESS:
+    return errc::permission_denied;
+  case ERROR_INVALID_DRIVE:
+    return errc::no_such_device;
+  case ERROR_INVALID_FUNCTION:
+    return errc::function_not_supported;
+  case ERROR_INVALID_HANDLE:
+    return errc::invalid_argument;
+  case ERROR_INVALID_NAME:
+    return errc::no_such_file_or_directory;
+  case ERROR_INVALID_PARAMETER:
+    return errc::invalid_argument;
+  case ERROR_LOCK_VIOLATION:
+    return errc::no_lock_available;
+  case ERROR_LOCKED:
+    return errc::no_lock_available;
+  case ERROR_NEGATIVE_SEEK:
+    return errc::invalid_argument;
+  case ERROR_NOACCESS:
+    return errc::permission_denied;
+  case ERROR_NOT_ENOUGH_MEMORY:
+    return errc::not_enough_memory;
+  case ERROR_NOT_READY:
+    return errc::resource_unavailable_try_again;
+  case ERROR_NOT_SAME_DEVICE:
+    return errc::cross_device_link;
+  case ERROR_NOT_SUPPORTED:
+    return errc::not_supported;
+  case ERROR_OPEN_FAILED:
+    return errc::io_error;
+  case ERROR_OPEN_FILES:
+    return errc::device_or_resource_busy;
+  case ERROR_OPERATION_ABORTED:
+    return errc::operation_canceled;
+  case ERROR_OUTOFMEMORY:
+    return errc::not_enough_memory;
+  case ERROR_PATH_NOT_FOUND:
+    return errc::no_such_file_or_directory;
+  case ERROR_READ_FAULT:
+    return errc::io_error;
+  case ERROR_REPARSE_TAG_INVALID:
+    return errc::invalid_argument;
+  case ERROR_RETRY:
+    return errc::resource_unavailable_try_again;
+  case ERROR_SEEK:
+    return errc::io_error;
+  case ERROR_SHARING_VIOLATION:
+    return errc::permission_denied;
+  case ERROR_TOO_MANY_OPEN_FILES:
+    return errc::too_many_files_open;
+  case ERROR_WRITE_FAULT:
+    return errc::io_error;
+  case ERROR_WRITE_PROTECT:
+    return errc::permission_denied;
+  default:
+    return {};
+  }
+}
+} // namespace
+#endif
+
 namespace {
 #if _LIBCPP_HAS_THREADS
 
@@ -157,19 +273,52 @@ public:
 const char* __system_error_category::name() const noexcept { return "system"; }
 
 string __system_error_category::message(int ev) const {
-#ifdef _LIBCPP_ELAST
+#ifdef _LIBCPP_WIN32API
+  std::string result;
+  char* str               = nullptr;
+  unsigned long num_chars = ::FormatMessageA(
+      FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS,
+      nullptr,
+      ev,
+      0,
+      reinterpret_cast<char*>(&str),
+      0,
+      nullptr);
+  auto is_whitespace = [](char ch) { return ch == '\n' || ch == '\r' || ch == ' '; };
+  while (num_chars > 0 && is_whitespace(str[num_chars - 1]))
+    --num_chars;
+
+  if (num_chars)
+    result = std::string(str, num_chars);
+  else
+    result = "Unknown error";
+
+  LocalFree(str);
+  return result;
+#else
+#  ifdef _LIBCPP_ELAST
   if (ev > _LIBCPP_ELAST)
     return string("unspecified system_category error");
-#endif // _LIBCPP_ELAST
+#  endif // _LIBCPP_ELAST
   return __do_message::message(ev);
+#endif
 }
 
 error_condition __system_error_category::default_error_condition(int ev) const noexcept {
-#ifdef _LIBCPP_ELAST
+#ifdef _LIBCPP_WIN32API
+  // Remap windows error codes to generic error codes if possible.
+  if (ev == 0)
+    return error_condition(0, generic_category());
+  if (auto maybe_errc = __win_err_to_errc(ev))
+    return error_condition(static_cast<int>(*maybe_errc), generic_category());
+  return error_condition(ev, system_category());
+#else
+#  ifdef _LIBCPP_ELAST
   if (ev > _LIBCPP_ELAST)
     return error_condition(ev, system_category());
-#endif // _LIBCPP_ELAST
+#  endif // _LIBCPP_ELAST
   return error_condition(ev, generic_category());
+#endif
 }
 
 const error_category& system_category() noexcept {
@@ -213,7 +362,7 @@ system_error::~system_error() noexcept {}
 
 void __throw_system_error(int ev, const char* what_arg) {
 #if _LIBCPP_HAS_EXCEPTIONS
-  std::__throw_system_error(error_code(ev, system_category()), what_arg);
+  std::__throw_system_error(error_code(ev, generic_category()), what_arg);
 #else
   // The above could also handle the no-exception case, but for size, avoid referencing system_category() unnecessarily.
   _LIBCPP_VERBOSE_ABORT(
diff --git a/libcxx/test/benchmarks/numeric/gcd.bench.cpp b/libcxx/test/benchmarks/numeric/gcd.bench.cpp
index abbc7e9..ca5fed5 100644
--- a/libcxx/test/benchmarks/numeric/gcd.bench.cpp
+++ b/libcxx/test/benchmarks/numeric/gcd.bench.cpp
@@ -25,7 +25,7 @@ static std::array<T, 1000> generate(std::uniform_int_distribution<T> distributio
 
 static void bm_gcd_random(benchmark::State& state) {
   std::array data = generate<int>();
-  while (state.KeepRunningBatch(data.size()))
+  while (state.KeepRunningBatch(data.size() * data.size()))
     for (auto v0 : data)
       for (auto v1 : data)
         benchmark::DoNotOptimize(std::gcd(v0, v1));
diff --git a/libcxx/test/libcxx/clang_tidy.gen.py b/libcxx/test/libcxx/clang_tidy.gen.py
index 0db9c0d..06f277e 100644
--- a/libcxx/test/libcxx/clang_tidy.gen.py
+++ b/libcxx/test/libcxx/clang_tidy.gen.py
@@ -33,8 +33,7 @@ for header in public_headers:
 {lit_header_undeprecations.get(header, '')}
 
 // TODO: run clang-tidy with modules enabled once they are supported
-// RUN: %{{clang-tidy}} %s --warnings-as-errors=* -header-filter=.* --checks='-*,libcpp-*' --load=%{{test-tools-dir}}/clang_tidy_checks/libcxx-tidy.plugin -- %{{compile_flags}} -fno-modules
-// RUN: %{{clang-tidy}} %s --warnings-as-errors=* -header-filter=.* --config-file=%{{libcxx-dir}}/.clang-tidy -- -Wweak-vtables %{{compile_flags}} -fno-modules
+// RUN: %{{clang-tidy}} %s --warnings-as-errors=* -header-filter=.* --config-file=%{{libcxx-dir}}/.clang-tidy --load=%{{test-tools-dir}}/clang_tidy_checks/libcxx-tidy.plugin -- -Wweak-vtables %{{compile_flags}} -fno-modules
 
 #include <{header}>
 """)
diff --git a/libcxx/test/libcxx/containers/sequences/forwardlist/assert.pass.cpp b/libcxx/test/libcxx/containers/sequences/forwardlist/assert.pass.cpp
new file mode 100644
index 0000000..6d1748e
--- /dev/null
+++ b/libcxx/test/libcxx/containers/sequences/forwardlist/assert.pass.cpp
@@ -0,0 +1,47 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// <forward_list>
+
+// Test hardening assertions for std::forward_list.
+
+// REQUIRES: has-unix-headers
+// REQUIRES: libcpp-hardening-mode={{extensive|debug}}
+// UNSUPPORTED: c++03
+// XFAIL: libcpp-hardening-mode=debug && availability-verbose_abort-missing
+
+#include <forward_list>
+
+#include "check_assertion.h"
+
+int main(int, char**) {
+  { // Default-constructed list.
+    std::forward_list<int> c;
+    const auto& const_c = c;
+    TEST_LIBCPP_ASSERT_FAILURE(c.front(), "forward_list::front called on an empty list");
+    TEST_LIBCPP_ASSERT_FAILURE(const_c.front(), "forward_list::front called on an empty list");
+    TEST_LIBCPP_ASSERT_FAILURE(c.pop_front(), "forward_list::pop_front called on an empty list");
+  }
+
+  { // Non-empty list becomes empty.
+    std::forward_list<int> c;
+    const auto& const_c = c;
+    c.push_front(1);
+
+    // Check that there's no assertion on valid access.
+    (void)c.front();
+    (void)const_c.front();
+
+    c.pop_front();
+    TEST_LIBCPP_ASSERT_FAILURE(c.pop_front(), "forward_list::pop_front called on an empty list");
+    TEST_LIBCPP_ASSERT_FAILURE(c.front(), "forward_list::front called on an empty list");
+    TEST_LIBCPP_ASSERT_FAILURE(const_c.front(), "forward_list::front called on an empty list");
+  }
+
+  return 0;
+}
diff --git a/libcxx/test/libcxx/containers/sequences/vector.bool/assert.pass.cpp b/libcxx/test/libcxx/containers/sequences/vector.bool/assert.pass.cpp
new file mode 100644
index 0000000..41badad
--- /dev/null
+++ b/libcxx/test/libcxx/containers/sequences/vector.bool/assert.pass.cpp
@@ -0,0 +1,63 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// <vector>
+
+// Test hardening assertions for std::vector<bool>.
+
+// REQUIRES: has-unix-headers
+// UNSUPPORTED: libcpp-hardening-mode=none
+// UNSUPPORTED: c++03
+// XFAIL: libcpp-hardening-mode=debug && availability-verbose_abort-missing
+
+#include <vector>
+
+#include "check_assertion.h"
+#include "min_allocator.h"
+
+template <class Allocator>
+void test() {
+  std::vector<bool, Allocator> c;
+  TEST_LIBCPP_ASSERT_FAILURE(c.front(), "vector<bool>::front() called on an empty vector");
+  TEST_LIBCPP_ASSERT_FAILURE(c.back(), "vector<bool>::back() called on an empty vector");
+  TEST_LIBCPP_ASSERT_FAILURE(c[0], "vector<bool>::operator[] index out of bounds");
+  TEST_LIBCPP_ASSERT_FAILURE(c.pop_back(), "vector<bool>::pop_back called on an empty vector");
+
+  // Repeat the test with a const reference to test the const overloads.
+  {
+    const std::vector<bool, Allocator>& cc = c;
+    TEST_LIBCPP_ASSERT_FAILURE(cc.front(), "vector<bool>::front() called on an empty vector");
+    TEST_LIBCPP_ASSERT_FAILURE(cc.back(), "vector<bool>::back() called on an empty vector");
+    TEST_LIBCPP_ASSERT_FAILURE(cc[0], "vector<bool>::operator[] index out of bounds");
+  }
+
+  c.push_back(true);
+  c.push_back(false);
+  c.push_back(true);
+  TEST_LIBCPP_ASSERT_FAILURE(c[3], "vector<bool>::operator[] index out of bounds");
+  TEST_LIBCPP_ASSERT_FAILURE(c[100], "vector<bool>::operator[] index out of bounds");
+
+  // Repeat the test with a const reference to test the const overloads.
+  {
+    const std::vector<bool, Allocator>& cc = c;
+    TEST_LIBCPP_ASSERT_FAILURE(cc[3], "vector<bool>::operator[] index out of bounds");
+    TEST_LIBCPP_ASSERT_FAILURE(cc[100], "vector<bool>::operator[] index out of bounds");
+  }
+
+  TEST_LIBCPP_ASSERT_FAILURE(
+      c.erase(c.end()), "vector<bool>::erase(iterator) called with a non-dereferenceable iterator");
+  TEST_LIBCPP_ASSERT_FAILURE(
+      c.erase(c.begin() + 1, c.begin()), "vector<bool>::erase(iterator, iterator) called with an invalid range");
+}
+
+int main(int, char**) {
+  test<std::allocator<bool>>();
+  test<min_allocator<bool>>();
+
+  return 0;
+}
diff --git a/libcxx/test/libcxx/diagnostics/system_error_win_codes.pass.cpp b/libcxx/test/libcxx/diagnostics/system_error_win_codes.pass.cpp
new file mode 100644
index 0000000..799a5b5
--- /dev/null
+++ b/libcxx/test/libcxx/diagnostics/system_error_win_codes.pass.cpp
@@ -0,0 +1,25 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// REQUIRES: windows
+
+// Validate that system_error on windows accepts Windows' System Error Codes (as
+// used by win32 APIs and reported by GetLastError), and that they are properly
+// translated to generic conditions.
+
+#include <windows.h>
+#include <system_error>
+#include <cassert>
+
+#include "test_macros.h"
+
+int main(int, char**) {
+  LIBCPP_ASSERT(std::error_code(ERROR_ACCESS_DENIED, std::system_category()) == std::errc::permission_denied);
+  LIBCPP_ASSERT(std::error_code(ERROR_PATH_NOT_FOUND, std::system_category()) == std::errc::no_such_file_or_directory);
+  return 0;
+}
diff --git a/libcxx/test/libcxx/numerics/numarray/assert.pass.cpp b/libcxx/test/libcxx/numerics/numarray/assert.pass.cpp
new file mode 100644
index 0000000..2bdf523
--- /dev/null
+++ b/libcxx/test/libcxx/numerics/numarray/assert.pass.cpp
@@ -0,0 +1,42 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// <valarray>
+
+// Test hardening assertions for std::valarray.
+
+// REQUIRES: has-unix-headers
+// UNSUPPORTED: libcpp-hardening-mode=none
+// UNSUPPORTED: c++03
+// XFAIL: libcpp-hardening-mode=debug && availability-verbose_abort-missing
+
+#include <valarray>
+
+#include "check_assertion.h"
+
+int main(int, char**) {
+  { // Empty valarray
+    std::valarray<int> c;
+    const auto& const_c = c;
+    TEST_LIBCPP_ASSERT_FAILURE(c[0], "valarray::operator[] index out of bounds");
+    TEST_LIBCPP_ASSERT_FAILURE(const_c[0], "valarray::operator[] index out of bounds");
+    TEST_LIBCPP_ASSERT_FAILURE(c[42], "valarray::operator[] index out of bounds");
+    TEST_LIBCPP_ASSERT_FAILURE(const_c[42], "valarray::operator[] index out of bounds");
+  }
+
+  { // Non-empty valarray
+    std::valarray<int> c(4);
+    const auto& const_c = c;
+    (void)c[3]; // Check that there's no assertion on valid access.
+    TEST_LIBCPP_ASSERT_FAILURE(c[4], "valarray::operator[] index out of bounds");
+    (void)const_c[3]; // Check that there's no assertion on valid access.
+    TEST_LIBCPP_ASSERT_FAILURE(const_c[4], "valarray::operator[] index out of bounds");
+  }
+
+  return 0;
+}
diff --git a/libcxx/test/libcxx/utilities/template.bitset/assert.pass.cpp b/libcxx/test/libcxx/utilities/template.bitset/assert.pass.cpp
new file mode 100644
index 0000000..4019bdf
--- /dev/null
+++ b/libcxx/test/libcxx/utilities/template.bitset/assert.pass.cpp
@@ -0,0 +1,42 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// <bitset>
+
+// Test hardening assertions for std::bitset.
+
+// REQUIRES: has-unix-headers
+// UNSUPPORTED: libcpp-hardening-mode=none
+// UNSUPPORTED: c++03
+// XFAIL: libcpp-hardening-mode=debug && availability-verbose_abort-missing
+
+#include <bitset>
+
+#include "check_assertion.h"
+
+int main(int, char**) {
+  { // Empty bitset
+    std::bitset<0> c;
+    const auto& const_c = c;
+    TEST_LIBCPP_ASSERT_FAILURE(c[0], "bitset::operator[] index out of bounds");
+    TEST_LIBCPP_ASSERT_FAILURE(const_c[0], "bitset::operator[] index out of bounds");
+    TEST_LIBCPP_ASSERT_FAILURE(c[42], "bitset::operator[] index out of bounds");
+    TEST_LIBCPP_ASSERT_FAILURE(const_c[42], "bitset::operator[] index out of bounds");
+  }
+
+  { // Non-empty bitset
+    std::bitset<4> c(42);
+    const auto& const_c = c;
+    (void)c[3]; // Check that there's no assertion on valid access.
+    TEST_LIBCPP_ASSERT_FAILURE(c[4], "bitset::operator[] index out of bounds");
+    (void)const_c[3]; // Check that there's no assertion on valid access.
+    TEST_LIBCPP_ASSERT_FAILURE(const_c[4], "bitset::operator[] index out of bounds");
+  }
+
+  return 0;
+}
diff --git a/libcxx/test/std/atomics/atomics.ref/exchange.pass.cpp b/libcxx/test/std/atomics/atomics.ref/exchange.pass.cpp
index cd998d4..c2afa6b 100644
--- a/libcxx/test/std/atomics/atomics.ref/exchange.pass.cpp
+++ b/libcxx/test/std/atomics/atomics.ref/exchange.pass.cpp
@@ -17,24 +17,47 @@
 #include <type_traits>
 
 #include "atomic_helpers.h"
+#include "test_helper.h"
 #include "test_macros.h"
 
 template <typename T>
 struct TestExchange {
   void operator()() const {
-    T x(T(1));
-    std::atomic_ref<T> const a(x);
+    {
+      T x(T(1));
+      std::atomic_ref<T> const a(x);
+
+      {
+        std::same_as<T> decltype(auto) y = a.exchange(T(2));
+        assert(y == T(1));
+        ASSERT_NOEXCEPT(a.exchange(T(2)));
+      }
+
+      {
+        std::same_as<T> decltype(auto) y = a.exchange(T(3), std::memory_order_seq_cst);
+        assert(y == T(2));
+        ASSERT_NOEXCEPT(a.exchange(T(3), std::memory_order_seq_cst));
+      }
+    }
 
+    // memory_order::release
     {
-      std::same_as<T> decltype(auto) y = a.exchange(T(2));
-      assert(y == T(1));
-      ASSERT_NOEXCEPT(a.exchange(T(2)));
+      auto exchange = [](std::atomic_ref<T> const& x, T, T new_val) {
+        x.exchange(new_val, std::memory_order::release);
+      };
+      auto load = [](std::atomic_ref<T> const& x) { return x.load(std::memory_order::acquire); };
+      test_acquire_release<T>(exchange, load);
     }
 
+    // memory_order::seq_cst
     {
-      std::same_as<T> decltype(auto) y = a.exchange(T(3), std::memory_order_seq_cst);
-      assert(y == T(2));
-      ASSERT_NOEXCEPT(a.exchange(T(3), std::memory_order_seq_cst));
+      auto exchange_no_arg     = [](std::atomic_ref<T> const& x, T, T new_val) { x.exchange(new_val); };
+      auto exchange_with_order = [](std::atomic_ref<T> const& x, T, T new_val) {
+        x.exchange(new_val, std::memory_order::seq_cst);
+      };
+      auto load = [](std::atomic_ref<T> const& x) { return x.load(); };
+      test_seq_cst<T>(exchange_no_arg, load);
+      test_seq_cst<T>(exchange_with_order, load);
     }
   }
 };
diff --git a/libcxx/test/std/containers/sequences/vector.bool/at.pass.cpp b/libcxx/test/std/containers/sequences/vector.bool/at.pass.cpp
new file mode 100644
index 0000000..16832dd
--- /dev/null
+++ b/libcxx/test/std/containers/sequences/vector.bool/at.pass.cpp
@@ -0,0 +1,125 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// <vector>
+
+// reference at(size_type n); // constexpr since C++20
+
+#include <cassert>
+#include <memory>
+#include <vector>
+
+#include "min_allocator.h"
+#include "test_allocator.h"
+#include "test_macros.h"
+
+#ifndef TEST_HAS_NO_EXCEPTIONS
+#  include <stdexcept>
+#endif
+
+template <typename Allocator>
+TEST_CONSTEXPR_CXX20 void test() {
+  using C         = std::vector<bool, Allocator>;
+  using reference = typename C::reference;
+  bool a[]        = {1, 0, 1, 0, 1};
+  C v(a, a + sizeof(a) / sizeof(a[0]));
+  ASSERT_SAME_TYPE(reference, decltype(v.at(0)));
+  assert(v.at(0) == true);
+  assert(v.at(1) == false);
+  assert(v.at(2) == true);
+  assert(v.at(3) == false);
+  assert(v.at(4) == true);
+  v.at(1) = 1;
+  assert(v.at(1) == true);
+  v.at(3) = 1;
+  assert(v.at(3) == true);
+}
+
+template <typename Allocator>
+void test_exception() {
+#ifndef TEST_HAS_NO_EXCEPTIONS
+  {
+    bool a[] = {1, 0, 1, 1};
+    using C  = std::vector<bool, Allocator>;
+    C v(a, a + sizeof(a) / sizeof(a[0]));
+
+    try {
+      TEST_IGNORE_NODISCARD v.at(4);
+      assert(false);
+    } catch (std::out_of_range const&) {
+      // pass
+    } catch (...) {
+      assert(false);
+    }
+
+    try {
+      TEST_IGNORE_NODISCARD v.at(5);
+      assert(false);
+    } catch (std::out_of_range const&) {
+      // pass
+    } catch (...) {
+      assert(false);
+    }
+
+    try {
+      TEST_IGNORE_NODISCARD v.at(6);
+      assert(false);
+    } catch (std::out_of_range const&) {
+      // pass
+    } catch (...) {
+      assert(false);
+    }
+
+    try {
+      using size_type = typename C::size_type;
+      TEST_IGNORE_NODISCARD v.at(static_cast<size_type>(-1));
+      assert(false);
+    } catch (std::out_of_range const&) {
+      // pass
+    } catch (...) {
+      assert(false);
+    }
+  }
+
+  {
+    std::vector<bool, Allocator> v;
+    try {
+      TEST_IGNORE_NODISCARD v.at(0);
+      assert(false);
+    } catch (std::out_of_range const&) {
+      // pass
+    } catch (...) {
+      assert(false);
+    }
+  }
+#endif
+}
+
+TEST_CONSTEXPR_CXX20 bool tests() {
+  test<std::allocator<bool> >();
+  test<min_allocator<bool> >();
+  test<test_allocator<bool> >();
+  return true;
+}
+
+void test_exceptions() {
+  test_exception<std::allocator<bool> >();
+  test_exception<min_allocator<bool> >();
+  test_exception<test_allocator<bool> >();
+}
+
+int main(int, char**) {
+  tests();
+  test_exceptions();
+
+#if TEST_STD_VER >= 20
+  static_assert(tests());
+#endif
+
+  return 0;
+}
diff --git a/libcxx/test/std/containers/sequences/vector.bool/at_const.pass.cpp b/libcxx/test/std/containers/sequences/vector.bool/at_const.pass.cpp
new file mode 100644
index 0000000..5ed794d
--- /dev/null
+++ b/libcxx/test/std/containers/sequences/vector.bool/at_const.pass.cpp
@@ -0,0 +1,121 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// <vector>
+
+// const_reference at(size_type n) const; // constexpr since C++20
+
+#include <cassert>
+#include <memory>
+#include <vector>
+
+#include "min_allocator.h"
+#include "test_allocator.h"
+#include "test_macros.h"
+
+#ifndef TEST_HAS_NO_EXCEPTIONS
+#  include <stdexcept>
+#endif
+
+template <typename Allocator>
+TEST_CONSTEXPR_CXX20 void test() {
+  using C               = const std::vector<bool, Allocator>;
+  using const_reference = typename C::const_reference;
+  bool a[]              = {1, 0, 1, 0, 1};
+  C v(a, a + sizeof(a) / sizeof(a[0]));
+  ASSERT_SAME_TYPE(const_reference, decltype(v.at(0)));
+  assert(v.at(0) == true);
+  assert(v.at(1) == false);
+  assert(v.at(2) == true);
+  assert(v.at(3) == false);
+  assert(v.at(4) == true);
+}
+
+template <typename Allocator>
+void test_exception() {
+#ifndef TEST_HAS_NO_EXCEPTIONS
+  {
+    bool a[] = {1, 0, 1, 1};
+    using C  = const std::vector<bool, Allocator>;
+    C v(a, a + sizeof(a) / sizeof(a[0]));
+
+    try {
+      TEST_IGNORE_NODISCARD v.at(4);
+      assert(false);
+    } catch (std::out_of_range const&) {
+      // pass
+    } catch (...) {
+      assert(false);
+    }
+
+    try {
+      TEST_IGNORE_NODISCARD v.at(5);
+      assert(false);
+    } catch (std::out_of_range const&) {
+      // pass
+    } catch (...) {
+      assert(false);
+    }
+
+    try {
+      TEST_IGNORE_NODISCARD v.at(6);
+      assert(false);
+    } catch (std::out_of_range const&) {
+      // pass
+    } catch (...) {
+      assert(false);
+    }
+
+    try {
+      using size_type = typename C::size_type;
+      TEST_IGNORE_NODISCARD v.at(static_cast<size_type>(-1));
+      assert(false);
+    } catch (std::out_of_range const&) {
+      // pass
+    } catch (...) {
+      assert(false);
+    }
+  }
+
+  {
+    std::vector<bool, Allocator> v;
+    try {
+      TEST_IGNORE_NODISCARD v.at(0);
+      assert(false);
+    } catch (std::out_of_range const&) {
+      // pass
+    } catch (...) {
+      assert(false);
+    }
+  }
+#endif
+}
+
+TEST_CONSTEXPR_CXX20 bool tests() {
+  test<std::allocator<bool> >();
+  test<min_allocator<bool> >();
+  test<test_allocator<bool> >();
+  return true;
+}
+
+void test_exceptions() {
+  test_exception<std::allocator<bool> >();
+  test_exception<min_allocator<bool> >();
+  test_exception<test_allocator<bool> >();
+}
+
+int main(int, char**) {
+  tests();
+  test_exceptions();
+
+#if TEST_STD_VER >= 20
+  static_assert(tests());
+#endif
+
+  return 0;
+}
diff --git a/libcxx/test/std/diagnostics/syserr/syserr.compare/eq_error_code_error_code.pass.cpp b/libcxx/test/std/diagnostics/syserr/syserr.compare/eq_error_code_error_code.pass.cpp
index f1f4973..a8b565b 100644
--- a/libcxx/test/std/diagnostics/syserr/syserr.compare/eq_error_code_error_code.pass.cpp
+++ b/libcxx/test/std/diagnostics/syserr/syserr.compare/eq_error_code_error_code.pass.cpp
@@ -22,6 +22,10 @@
 
 #include "test_macros.h"
 
+#ifndef _WIN32
+#  define TEST_SYSTEM_CATEGORY_IS_GENERIC_CATEGORY
+#endif
+
 int main(int, char**) {
   std::error_code e_code1(5, std::generic_category());
   std::error_code e_code2(5, std::system_category());
@@ -45,7 +49,9 @@ int main(int, char**) {
   assert(e_code2 == e_code2);
   assert(e_code2 != e_code3);
   assert(e_code2 != e_code4);
+#ifdef TEST_SYSTEM_CATEGORY_IS_GENERIC_CATEGORY
   LIBCPP_ASSERT(e_code2 == e_condition1);
+#endif
   assert(e_code2 == e_condition2);
   LIBCPP_ASSERT(e_code2 != e_condition3);
   assert(e_code2 != e_condition4);
@@ -65,11 +71,15 @@ int main(int, char**) {
   assert(e_code4 == e_code4);
   LIBCPP_ASSERT(e_code4 != e_condition1);
   assert(e_code4 != e_condition2);
+#ifdef TEST_SYSTEM_CATEGORY_IS_GENERIC_CATEGORY
   LIBCPP_ASSERT(e_code4 == e_condition3);
+#endif
   assert(e_code4 == e_condition4);
 
   assert(e_condition1 == e_code1);
+#ifdef TEST_SYSTEM_CATEGORY_IS_GENERIC_CATEGORY
   LIBCPP_ASSERT(e_condition1 == e_code2);
+#endif
   assert(e_condition1 != e_code3);
   LIBCPP_ASSERT(e_condition1 != e_code4);
   assert(e_condition1 == e_condition1);
@@ -89,7 +99,9 @@ int main(int, char**) {
   assert(e_condition3 != e_code1);
   LIBCPP_ASSERT(e_condition3 != e_code2);
   assert(e_condition3 == e_code3);
+#ifdef TEST_SYSTEM_CATEGORY_IS_GENERIC_CATEGORY
   LIBCPP_ASSERT(e_condition3 == e_code4);
+#endif
   assert(e_condition3 != e_condition1);
   assert(e_condition3 != e_condition2);
   assert(e_condition3 == e_condition3);
diff --git a/libcxx/test/std/diagnostics/syserr/syserr.errcat/syserr.errcat.derived/message.pass.cpp b/libcxx/test/std/diagnostics/syserr/syserr.errcat/syserr.errcat.derived/message.pass.cpp
index 9f7eb42..f7f4313 100644
--- a/libcxx/test/std/diagnostics/syserr/syserr.errcat/syserr.errcat.derived/message.pass.cpp
+++ b/libcxx/test/std/diagnostics/syserr/syserr.errcat/syserr.errcat.derived/message.pass.cpp
@@ -29,8 +29,11 @@ int main(int, char**) {
   assert(!m1.empty());
   assert(!m2.empty());
   assert(!m3.empty());
+#ifndef _WIN32
+  // On windows, system_category is distinct.
   LIBCPP_ASSERT(m1 == m2);
-  assert(m1 != m3);
+#endif
+  assert(m2 != m3);
 
   return 0;
 }
diff --git a/libcxx/test/std/diagnostics/syserr/syserr.errcat/syserr.errcat.objects/system_category.pass.cpp b/libcxx/test/std/diagnostics/syserr/syserr.errcat/syserr.errcat.objects/system_category.pass.cpp
index 6ba33ba..255cbe7 100644
--- a/libcxx/test/std/diagnostics/syserr/syserr.errcat/syserr.errcat.objects/system_category.pass.cpp
+++ b/libcxx/test/std/diagnostics/syserr/syserr.errcat/syserr.errcat.objects/system_category.pass.cpp
@@ -33,7 +33,12 @@ int main(int, char**) {
   {
     const std::error_category& e_cat1 = std::system_category();
     std::error_condition e_cond       = e_cat1.default_error_condition(5);
+#ifdef _WIN32
+    // Windows' system error 5 is ERROR_ACCESS_DENIED, which maps to generic code permission_denied.
+    LIBCPP_ASSERT(e_cond.value() == static_cast<int>(std::errc::permission_denied));
+#else
     LIBCPP_ASSERT(e_cond.value() == 5);
+#endif
     LIBCPP_ASSERT(e_cond.category() == std::generic_category());
     assert(e_cat1.equivalent(5, e_cond));
 
diff --git a/libcxx/test/std/input.output/filesystems/class.directory_entry/directory_entry.obs/file_type_obs.pass.cpp b/libcxx/test/std/input.output/filesystems/class.directory_entry/directory_entry.obs/file_type_obs.pass.cpp
index 303a95a..071ee7f 100644
--- a/libcxx/test/std/input.output/filesystems/class.directory_entry/directory_entry.obs/file_type_obs.pass.cpp
+++ b/libcxx/test/std/input.output/filesystems/class.directory_entry/directory_entry.obs/file_type_obs.pass.cpp
@@ -172,8 +172,13 @@ static void test_with_ec_dne() {
     file_status st = status(p, status_ec);
     file_status sym_st = symlink_status(p, sym_status_ec);
     std::error_code ec = GetTestEC(2);
-    auto CheckEC = [&](std::error_code const& other_ec) {
-      bool res = ec == other_ec;
+    auto CheckEC                  = [&](std::error_code const& other_ec) {
+      // Note: we're comparing equality of the _canonicalized_ error_condition
+      // here (unlike in other tests where we expect exactly the same
+      // error_code). This is because directory_entry can construct its own
+      // generic_category error when a file doesn't exist, instead of passing
+      // through an underlying system_category error.
+      bool res = ec.default_error_condition() == other_ec.default_error_condition();
       ec = GetTestEC(2);
       return res;
     };
diff --git a/libcxx/test/std/input.output/filesystems/class.directory_entry/directory_entry.obs/status.pass.cpp b/libcxx/test/std/input.output/filesystems/class.directory_entry/directory_entry.obs/status.pass.cpp
index dd72232..dec04df 100644
--- a/libcxx/test/std/input.output/filesystems/class.directory_entry/directory_entry.obs/status.pass.cpp
+++ b/libcxx/test/std/input.output/filesystems/class.directory_entry/directory_entry.obs/status.pass.cpp
@@ -44,7 +44,7 @@ static void test_basic() {
     file_status es = e.status(eec);
     assert(ps.type() == es.type());
     assert(ps.permissions() == es.permissions());
-    assert(pec == eec);
+    assert(pec.default_error_condition() == eec.default_error_condition());
   }
   for (const auto& p : TestCases) {
     const directory_entry e(p);
diff --git a/libcxx/test/std/input.output/filesystems/class.directory_entry/directory_entry.obs/symlink_status.pass.cpp b/libcxx/test/std/input.output/filesystems/class.directory_entry/directory_entry.obs/symlink_status.pass.cpp
index 24e8069..77da936 100644
--- a/libcxx/test/std/input.output/filesystems/class.directory_entry/directory_entry.obs/symlink_status.pass.cpp
+++ b/libcxx/test/std/input.output/filesystems/class.directory_entry/directory_entry.obs/symlink_status.pass.cpp
@@ -44,7 +44,7 @@ static void test_signature() {
     file_status es = e.symlink_status(eec);
     assert(ps.type() == es.type());
     assert(ps.permissions() == es.permissions());
-    assert(pec == eec);
+    assert(pec.default_error_condition() == eec.default_error_condition());
   }
   for (const auto& p : TestCases) {
     const directory_entry e(p);
diff --git a/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.copy_file/copy_file_procfs.pass.cpp b/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.copy_file/copy_file_procfs.pass.cpp
new file mode 100644
index 0000000..29bc8e4
--- /dev/null
+++ b/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.copy_file/copy_file_procfs.pass.cpp
@@ -0,0 +1,53 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: c++03, c++11, c++14
+// REQUIRES: linux
+// UNSUPPORTED: no-filesystem
+// XFAIL: no-localization
+// UNSUPPORTED: availability-filesystem-missing
+
+// <filesystem>
+
+// bool copy_file(const path& from, const path& to);
+// bool copy_file(const path& from, const path& to, error_code& ec) noexcept;
+// bool copy_file(const path& from, const path& to, copy_options options);
+// bool copy_file(const path& from, const path& to, copy_options options,
+//           error_code& ec) noexcept;
+
+#include <cassert>
+#include <filesystem>
+#include <system_error>
+
+#include "test_macros.h"
+#include "filesystem_test_helper.h"
+
+namespace fs = std::filesystem;
+
+// Linux has various virtual filesystems such as /proc and /sys
+// where files may have no length (st_size == 0), but still contain data.
+// This is because the to-be-read data is usually generated ad-hoc by the reading syscall
+// These files can not be copied with kernel-side copies like copy_file_range or sendfile,
+// and must instead be copied via a traditional userspace read + write loop.
+int main(int, char** argv) {
+  const fs::path procfile{"/proc/self/comm"};
+  assert(file_size(procfile) == 0);
+
+  scoped_test_env env;
+  std::error_code ec = GetTestEC();
+
+  const fs::path dest = env.make_env_path("dest");
+
+  assert(copy_file(procfile, dest, ec));
+  assert(!ec);
+
+  // /proc/self/comm contains the filename of the executable, plus a null terminator
+  assert(file_size(dest) == fs::path(argv[0]).filename().string().size() + 1);
+
+  return 0;
+}
diff --git a/libcxx/test/std/utilities/template.bitset/bitset.members/op_and_eq.pass.cpp b/libcxx/test/std/utilities/template.bitset/bitset.members/op_and_eq.pass.cpp
index e8ab264..d87fd91 100644
--- a/libcxx/test/std/utilities/template.bitset/bitset.members/op_and_eq.pass.cpp
+++ b/libcxx/test/std/utilities/template.bitset/bitset.members/op_and_eq.pass.cpp
@@ -6,6 +6,8 @@
 //
 //===----------------------------------------------------------------------===//
 
+// ADDITIONAL_COMPILE_FLAGS(has-fconstexpr-steps): -fconstexpr-steps=2000000
+
 // bitset<N>& operator&=(const bitset<N>& rhs); // constexpr since C++23
 
 #include <bitset>
diff --git a/libcxx/test/support/filesystem_test_helper.h b/libcxx/test/support/filesystem_test_helper.h
index a63d645..2ad9efb 100644
--- a/libcxx/test/support/filesystem_test_helper.h
+++ b/libcxx/test/support/filesystem_test_helper.h
@@ -583,7 +583,11 @@ struct ExceptionChecker {
     assert(ErrorIsImp(Err.code(), {expected_err}));
     assert(Err.path1() == expected_path1);
     assert(Err.path2() == expected_path2);
+#ifndef _WIN32
+    // On Windows, the error strings are windows error code strings, and don't
+    // match textually with the strings generated for generic std::errc::*.
     LIBCPP_ONLY(check_libcxx_string(Err));
+#endif
   }
 
   void check_libcxx_string(fs::filesystem_error const& Err) {
diff --git a/libcxx/test/tools/clang_tidy_checks/CMakeLists.txt b/libcxx/test/tools/clang_tidy_checks/CMakeLists.txt
index 05c44e4..0f8f0e88 100644
--- a/libcxx/test/tools/clang_tidy_checks/CMakeLists.txt
+++ b/libcxx/test/tools/clang_tidy_checks/CMakeLists.txt
@@ -92,6 +92,7 @@ set(SOURCES
     header_exportable_declarations.cpp
     hide_from_abi.cpp
     internal_ftm_use.cpp
+    nodebug_on_aliases.cpp
     proper_version_checks.cpp
     qualify_declval.cpp
     robust_against_adl.cpp
diff --git a/libcxx/test/tools/clang_tidy_checks/libcpp_module.cpp b/libcxx/test/tools/clang_tidy_checks/libcpp_module.cpp
index 54beed5..bc7c8ce 100644
--- a/libcxx/test/tools/clang_tidy_checks/libcpp_module.cpp
+++ b/libcxx/test/tools/clang_tidy_checks/libcpp_module.cpp
@@ -13,6 +13,7 @@
 #include "header_exportable_declarations.hpp"
 #include "hide_from_abi.hpp"
 #include "internal_ftm_use.hpp"
+#include "nodebug_on_aliases.hpp"
 #include "proper_version_checks.hpp"
 #include "qualify_declval.hpp"
 #include "robust_against_adl.hpp"
@@ -26,6 +27,7 @@ public:
     check_factories.registerCheck<libcpp::header_exportable_declarations>("libcpp-header-exportable-declarations");
     check_factories.registerCheck<libcpp::hide_from_abi>("libcpp-hide-from-abi");
     check_factories.registerCheck<libcpp::internal_ftm_use>("libcpp-internal-ftms");
+    check_factories.registerCheck<libcpp::nodebug_on_aliases>("libcpp-nodebug-on-aliases");
     check_factories.registerCheck<libcpp::proper_version_checks>("libcpp-cpp-version-check");
     check_factories.registerCheck<libcpp::robust_against_adl_check>("libcpp-robust-against-adl");
     check_factories.registerCheck<libcpp::uglify_attributes>("libcpp-uglify-attributes");
diff --git a/libcxx/test/tools/clang_tidy_checks/nodebug_on_aliases.cpp b/libcxx/test/tools/clang_tidy_checks/nodebug_on_aliases.cpp
new file mode 100644
index 0000000..9b96269
--- /dev/null
+++ b/libcxx/test/tools/clang_tidy_checks/nodebug_on_aliases.cpp
@@ -0,0 +1,35 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "clang-tidy/ClangTidyCheck.h"
+
+#include "nodebug_on_aliases.hpp"
+#include "utilities.hpp"
+
+namespace libcpp {
+namespace {
+AST_MATCHER(clang::NamedDecl, isPretty) { return !is_ugly_name(Node.getName()); }
+} // namespace
+
+nodebug_on_aliases::nodebug_on_aliases(llvm::StringRef name, clang::tidy::ClangTidyContext* context)
+    : clang::tidy::ClangTidyCheck(name, context) {}
+
+void nodebug_on_aliases::registerMatchers(clang::ast_matchers::MatchFinder* finder) {
+  using namespace clang::ast_matchers;
+  finder->addMatcher(
+      typeAliasDecl(unless(anyOf(isPretty(), hasAttr(clang::attr::NoDebug), hasAncestor(functionDecl()))))
+          .bind("nodebug_on_internal_aliases"),
+      this);
+}
+
+void nodebug_on_aliases::check(const clang::ast_matchers::MatchFinder::MatchResult& result) {
+  if (const auto* alias = result.Nodes.getNodeAs<clang::TypeAliasDecl>("nodebug_on_internal_aliases")) {
+    diag(alias->getBeginLoc(), "Internal aliases should always be marked _LIBCPP_NODEBUG");
+  }
+}
+} // namespace libcpp
diff --git a/libcxx/test/tools/clang_tidy_checks/nodebug_on_aliases.hpp b/libcxx/test/tools/clang_tidy_checks/nodebug_on_aliases.hpp
new file mode 100644
index 0000000..1097e89
--- /dev/null
+++ b/libcxx/test/tools/clang_tidy_checks/nodebug_on_aliases.hpp
@@ -0,0 +1,18 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "clang-tidy/ClangTidyCheck.h"
+
+namespace libcpp {
+class nodebug_on_aliases : public clang::tidy::ClangTidyCheck {
+public:
+  nodebug_on_aliases(llvm::StringRef, clang::tidy::ClangTidyContext*);
+  void registerMatchers(clang::ast_matchers::MatchFinder*) override;
+  void check(const clang::ast_matchers::MatchFinder::MatchResult&) override;
+};
+} // namespace libcpp
diff --git a/libcxx/test/tools/clang_tidy_checks/uglify_attributes.cpp b/libcxx/test/tools/clang_tidy_checks/uglify_attributes.cpp
index 7812b23..24bacde 100644
--- a/libcxx/test/tools/clang_tidy_checks/uglify_attributes.cpp
+++ b/libcxx/test/tools/clang_tidy_checks/uglify_attributes.cpp
@@ -10,20 +10,11 @@
 #include "clang-tidy/ClangTidyModuleRegistry.h"
 
 #include "uglify_attributes.hpp"
+#include "utilities.hpp"
 
-#include <algorithm>
-#include <array>
-#include <span>
-#include <string_view>
+#include <optional>
 
 namespace {
-bool isUgly(std::string_view str) {
-  if (str.size() < 2)
-    return false;
-  if (str[0] == '_' && str[1] >= 'A' && str[1] <= 'Z')
-    return true;
-  return str.find("__") != std::string_view::npos;
-}
 
 // Starting with Clang 17 ToT C++23 support is provided by CPlusPlus23 instead
 // of C++23 support is provided by CPlusPlus2b. To allow a smooth transition for
@@ -77,17 +68,15 @@ AST_MATCHER(clang::Attr, isPretty) {
   if (Node.isKeywordAttribute() || !Node.getAttrName())
     return false;
   if (Node.isCXX11Attribute() && !Node.hasScope()) {
-    if (isUgly(Node.getAttrName()->getName()))
+    if (is_ugly_name(Node.getAttrName()->getName()))
       return false;
     return !llvm::is_contained(
         get_standard_attributes(Finder->getASTContext().getLangOpts()), Node.getAttrName()->getName());
   }
   if (Node.hasScope())
-    if (!isUgly(Node.getScopeName()->getName()))
+    if (!is_ugly_name(Node.getScopeName()->getName()))
       return true;
-  return !isUgly(Node.getAttrName()->getName());
-
-  return false;
+  return !is_ugly_name(Node.getAttrName()->getName());
 }
 
 std::optional<std::string> getUglyfiedCXX11Attr(const clang::Attr& attr) {
diff --git a/libcxx/test/tools/clang_tidy_checks/utilities.hpp b/libcxx/test/tools/clang_tidy_checks/utilities.hpp
new file mode 100644
index 0000000..b780efe
--- /dev/null
+++ b/libcxx/test/tools/clang_tidy_checks/utilities.hpp
@@ -0,0 +1,22 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LIBCXX_TEST_TOOLS_CLANG_TIDY_CHECKS_UTILITIES_HPP
+#define LIBCXX_TEST_TOOLS_CLANG_TIDY_CHECKS_UTILITIES_HPP
+
+#include <string_view>
+
+inline bool is_ugly_name(std::string_view str) {
+  if (str.size() < 2)
+    return false;
+  if (str[0] == '_' && str[1] >= 'A' && str[1] <= 'Z')
+    return true;
+  return str.find("__") != std::string_view::npos;
+}
+
+#endif // LIBCXX_TEST_TOOLS_CLANG_TIDY_CHECKS_UTILITIES_HPP
diff --git a/libcxx/utils/libcxx-benchmark-json b/libcxx/utils/libcxx-benchmark-json
new file mode 100755
index 0000000..7f743c3
--- /dev/null
+++ b/libcxx/utils/libcxx-benchmark-json
@@ -0,0 +1,57 @@
+#!/usr/bin/env bash
+
+set -e
+
+PROGNAME="$(basename "${0}")"
+MONOREPO_ROOT="$(realpath $(dirname "${PROGNAME}"))"
+function usage() {
+cat <<EOF
+Usage:
+${PROGNAME} [-h|--help] <build-directory> benchmarks...
+
+Print the path to the JSON files containing benchmark results for the given benchmarks.
+
+This requires those benchmarks to have already been run, i.e. this only resolves the path
+to the benchmark .json file within the build directory.
+
+<build-directory>  The path to the build directory.
+benchmarks...      Paths of the benchmarks to extract the results for. Those paths are relative to '<monorepo-root>'.
+
+Example
+=======
+$ cmake -S runtimes -B build/ -DLLVM_ENABLE_RUNTIMES="libcxx;libcxxabi"
+$ libcxx-lit build/ -sv libcxx/test/benchmarks/algorithms/for_each.bench.cpp
+$ less \$(${PROGNAME} build/ libcxx/test/benchmarks/algorithms/for_each.bench.cpp)
+EOF
+}
+
+if [[ "${1}" == "-h" || "${1}" == "--help" ]]; then
+    usage
+    exit 0
+fi
+
+if [[ $# -lt 1 ]]; then
+    usage
+    exit 1
+fi
+
+build_dir="${1}"
+shift
+
+for benchmark in ${@}; do
+    # Normalize the paths by turning all benchmarks paths into absolute ones and then making them
+    # relative to the root of the monorepo.
+    benchmark="$(realpath ${benchmark})"
+    relative=$(python -c "import os; import sys; print(os.path.relpath(sys.argv[1], sys.argv[2]))" "${benchmark}" "${MONOREPO_ROOT}")
+
+    # Extract components of the benchmark path
+    directory="$(dirname ${relative})"
+    file="$(basename ${relative})"
+
+    # Reconstruct the (slightly weird) path to the benchmark json file. This should be kept in sync
+    # whenever the test suite changes.
+    json="${build_dir}/${directory}/Output/${file}.dir/benchmark-result.json"
+    if [[ -f "${json}" ]]; then
+        echo "${json}"
+    fi
+done
diff --git a/libcxx/utils/libcxx-compare-benchmarks b/libcxx/utils/libcxx-compare-benchmarks
new file mode 100755
index 0000000..e04820f
--- /dev/null
+++ b/libcxx/utils/libcxx-compare-benchmarks
@@ -0,0 +1,62 @@
+#!/usr/bin/env bash
+
+set -e
+
+PROGNAME="$(basename "${0}")"
+MONOREPO_ROOT="$(realpath $(dirname "${PROGNAME}"))"
+function usage() {
+cat <<EOF
+Usage:
+${PROGNAME} [-h|--help] <baseline-build> <candidate-build> benchmarks...
+
+Compare the given benchmarks between the baseline and the candidate build directories.
+
+This requires those benchmarks to have already been generated in both build directories.
+
+<baseline-build>   The path to the build directory considered the baseline.
+<candidate-build>  The path to the build directory considered the candidate.
+benchmarks...      Paths of the benchmarks to compare. Those paths are relative to '<monorepo-root>'.
+
+Example
+=======
+$ libcxx-lit build1/ -sv libcxx/test/benchmarks/algorithms/for_each.bench.cpp
+$ libcxx-lit build2/ -sv libcxx/test/benchmarks/algorithms/for_each.bench.cpp
+$ ${PROGNAME} build1/ build2/ libcxx/test/benchmarks/algorithms/for_each.bench.cpp
+EOF
+}
+
+if [[ "${1}" == "-h" || "${1}" == "--help" ]]; then
+    usage
+    exit 0
+fi
+
+if [[ $# -lt 1 ]]; then
+    usage
+    exit 1
+fi
+
+baseline="${1}"
+candidate="${2}"
+shift; shift
+
+GBENCH="${MONOREPO_ROOT}/third-party/benchmark"
+
+python3 -m venv /tmp/libcxx-compare-benchmarks-venv
+source /tmp/libcxx-compare-benchmarks-venv/bin/activate
+pip3 install -r ${GBENCH}/tools/requirements.txt
+
+for benchmark in ${@}; do
+    base="$(${MONOREPO_ROOT}/libcxx/utils/libcxx-benchmark-json ${baseline} ${benchmark})"
+    cand="$(${MONOREPO_ROOT}/libcxx/utils/libcxx-benchmark-json ${candidate} ${benchmark})"
+
+    if [[ ! -e "${base}" ]]; then
+        echo "Benchmark ${benchmark} does not exist in the baseline"
+        continue
+    fi
+    if [[ ! -e "${cand}" ]]; then
+        echo "Benchmark ${benchmark} does not exist in the candidate"
+        continue
+    fi
+
+    "${GBENCH}/tools/compare.py" benchmarks "${base}" "${cand}"
+done
diff --git a/libcxxabi/src/stdlib_new_delete.cpp b/libcxxabi/src/stdlib_new_delete.cpp
index f386b28..73798e2 100644
--- a/libcxxabi/src/stdlib_new_delete.cpp
+++ b/libcxxabi/src/stdlib_new_delete.cpp
@@ -63,7 +63,7 @@ static void* operator_new_impl(std::size_t size) {
   return p;
 }
 
-_LIBCPP_MAKE_OVERRIDABLE_FUNCTION_DETECTABLE _LIBCPP_WEAK void* operator new(std::size_t size) _THROW_BAD_ALLOC {
+_LIBCPP_OVERRIDABLE_FUNCTION(_Znwm, void*, operator new, (std::size_t size)) _THROW_BAD_ALLOC {
   void* p = operator_new_impl(size);
   if (p == nullptr)
     __throw_bad_alloc_shim();
@@ -74,7 +74,7 @@ _LIBCPP_WEAK void* operator new(size_t size, const std::nothrow_t&) noexcept {
 #if !_LIBCPP_HAS_EXCEPTIONS
 #  if _LIBCPP_CAN_DETECT_OVERRIDDEN_FUNCTION
   _LIBCPP_ASSERT_SHIM(
-      !std::__is_function_overridden(static_cast<void* (*)(std::size_t)>(&operator new)),
+      !std::__is_function_overridden<static_cast<void* (*)(std::size_t)>(&operator new)>(),
       "libc++ was configured with exceptions disabled and `operator new(size_t)` has been overridden, "
       "but `operator new(size_t, nothrow_t)` has not been overridden. This is problematic because "
       "`operator new(size_t, nothrow_t)` must call `operator new(size_t)`, which will terminate in case "
@@ -94,7 +94,7 @@ _LIBCPP_WEAK void* operator new(size_t size, const std::nothrow_t&) noexcept {
 #endif
 }
 
-_LIBCPP_MAKE_OVERRIDABLE_FUNCTION_DETECTABLE _LIBCPP_WEAK void* operator new[](size_t size) _THROW_BAD_ALLOC {
+_LIBCPP_OVERRIDABLE_FUNCTION(_Znam, void*, operator new[], (size_t size)) _THROW_BAD_ALLOC {
   return ::operator new(size);
 }
 
@@ -102,7 +102,7 @@ _LIBCPP_WEAK void* operator new[](size_t size, const std::nothrow_t&) noexcept {
 #if !_LIBCPP_HAS_EXCEPTIONS
 #  if _LIBCPP_CAN_DETECT_OVERRIDDEN_FUNCTION
   _LIBCPP_ASSERT_SHIM(
-      !std::__is_function_overridden(static_cast<void* (*)(std::size_t)>(&operator new[])),
+      !std::__is_function_overridden<static_cast<void* (*)(std::size_t)>(&operator new[])>(),
       "libc++ was configured with exceptions disabled and `operator new[](size_t)` has been overridden, "
       "but `operator new[](size_t, nothrow_t)` has not been overridden. This is problematic because "
       "`operator new[](size_t, nothrow_t)` must call `operator new[](size_t)`, which will terminate in case "
@@ -156,8 +156,8 @@ static void* operator_new_aligned_impl(std::size_t size, std::align_val_t alignm
   return p;
 }
 
-_LIBCPP_MAKE_OVERRIDABLE_FUNCTION_DETECTABLE _LIBCPP_WEAK void*
-operator new(std::size_t size, std::align_val_t alignment) _THROW_BAD_ALLOC {
+_LIBCPP_OVERRIDABLE_FUNCTION(_ZnwmSt11align_val_t, void*, operator new, (std::size_t size, std::align_val_t alignment))
+_THROW_BAD_ALLOC {
   void* p = operator_new_aligned_impl(size, alignment);
   if (p == nullptr)
     __throw_bad_alloc_shim();
@@ -168,7 +168,7 @@ _LIBCPP_WEAK void* operator new(size_t size, std::align_val_t alignment, const s
 #  if !_LIBCPP_HAS_EXCEPTIONS
 #    if _LIBCPP_CAN_DETECT_OVERRIDDEN_FUNCTION
   _LIBCPP_ASSERT_SHIM(
-      !std::__is_function_overridden(static_cast<void* (*)(std::size_t, std::align_val_t)>(&operator new)),
+      !std::__is_function_overridden<static_cast<void* (*)(std::size_t, std::align_val_t)>(&operator new)>(),
       "libc++ was configured with exceptions disabled and `operator new(size_t, align_val_t)` has been overridden, "
       "but `operator new(size_t, align_val_t, nothrow_t)` has not been overridden. This is problematic because "
       "`operator new(size_t, align_val_t, nothrow_t)` must call `operator new(size_t, align_val_t)`, which will "
@@ -188,16 +188,14 @@ _LIBCPP_WEAK void* operator new(size_t size, std::align_val_t alignment, const s
 #  endif
 }
 
-_LIBCPP_MAKE_OVERRIDABLE_FUNCTION_DETECTABLE _LIBCPP_WEAK void*
-operator new[](size_t size, std::align_val_t alignment) _THROW_BAD_ALLOC {
-  return ::operator new(size, alignment);
-}
+_LIBCPP_OVERRIDABLE_FUNCTION(_ZnamSt11align_val_t, void*, operator new[], (size_t size, std::align_val_t alignment))
+_THROW_BAD_ALLOC { return ::operator new(size, alignment); }
 
 _LIBCPP_WEAK void* operator new[](size_t size, std::align_val_t alignment, const std::nothrow_t&) noexcept {
 #  if !_LIBCPP_HAS_EXCEPTIONS
 #    if _LIBCPP_CAN_DETECT_OVERRIDDEN_FUNCTION
   _LIBCPP_ASSERT_SHIM(
-      !std::__is_function_overridden(static_cast<void* (*)(std::size_t, std::align_val_t)>(&operator new[])),
+      !std::__is_function_overridden<static_cast<void* (*)(std::size_t, std::align_val_t)>(&operator new[])>(),
       "libc++ was configured with exceptions disabled and `operator new[](size_t, align_val_t)` has been overridden, "
       "but `operator new[](size_t, align_val_t, nothrow_t)` has not been overridden. This is problematic because "
       "`operator new[](size_t, align_val_t, nothrow_t)` must call `operator new[](size_t, align_val_t)`, which will "
diff --git a/lld/COFF/Driver.cpp b/lld/COFF/Driver.cpp
index a4d6b94..791382f 100644
--- a/lld/COFF/Driver.cpp
+++ b/lld/COFF/Driver.cpp
@@ -189,6 +189,71 @@ bool LinkerDriver::findUnderscoreMangle(StringRef sym) {
   return s && !isa<Undefined>(s);
 }
 
+static bool compatibleMachineType(COFFLinkerContext &ctx, MachineTypes mt) {
+  if (mt == IMAGE_FILE_MACHINE_UNKNOWN)
+    return true;
+  switch (ctx.config.machine) {
+  case ARM64:
+    return mt == ARM64 || mt == ARM64X;
+  case ARM64EC:
+    return isArm64EC(mt) || mt == AMD64;
+  case ARM64X:
+    return isAnyArm64(mt) || mt == AMD64;
+  case IMAGE_FILE_MACHINE_UNKNOWN:
+    return true;
+  default:
+    return ctx.config.machine == mt;
+  }
+}
+
+void LinkerDriver::addFile(InputFile *file) {
+  Log(ctx) << "Reading " << toString(file);
+  if (file->lazy) {
+    if (auto *f = dyn_cast<BitcodeFile>(file))
+      f->parseLazy();
+    else
+      cast<ObjFile>(file)->parseLazy();
+  } else {
+    file->parse();
+    if (auto *f = dyn_cast<ObjFile>(file)) {
+      ctx.objFileInstances.push_back(f);
+    } else if (auto *f = dyn_cast<BitcodeFile>(file)) {
+      if (ltoCompilationDone) {
+        Err(ctx) << "LTO object file " << toString(file)
+                 << " linked in after "
+                    "doing LTO compilation.";
+      }
+      ctx.bitcodeFileInstances.push_back(f);
+    } else if (auto *f = dyn_cast<ImportFile>(file)) {
+      ctx.importFileInstances.push_back(f);
+    }
+  }
+
+  MachineTypes mt = file->getMachineType();
+  // The ARM64EC target must be explicitly specified and cannot be inferred.
+  if (mt == ARM64EC &&
+      (ctx.config.machine == IMAGE_FILE_MACHINE_UNKNOWN ||
+       (ctx.config.machineInferred &&
+        (ctx.config.machine == ARM64 || ctx.config.machine == AMD64)))) {
+    Err(ctx) << toString(file)
+             << ": machine type arm64ec is ambiguous and cannot be "
+                "inferred, use /machine:arm64ec or /machine:arm64x";
+    return;
+  }
+  if (!compatibleMachineType(ctx, mt)) {
+    Err(ctx) << toString(file) << ": machine type " << machineToStr(mt)
+             << " conflicts with " << machineToStr(ctx.config.machine);
+    return;
+  }
+  if (ctx.config.machine == IMAGE_FILE_MACHINE_UNKNOWN &&
+      mt != IMAGE_FILE_MACHINE_UNKNOWN) {
+    ctx.config.machineInferred = true;
+    setMachine(mt);
+  }
+
+  parseDirectives(file);
+}
+
 MemoryBufferRef LinkerDriver::takeBuffer(std::unique_ptr<MemoryBuffer> mb) {
   MemoryBufferRef mbref = *mb;
   make<std::unique_ptr<MemoryBuffer>>(std::move(mb)); // take ownership
@@ -222,17 +287,17 @@ void LinkerDriver::addBuffer(std::unique_ptr<MemoryBuffer> mb,
         addArchiveBuffer(m, "<whole-archive>", filename, memberIndex++);
       return;
     }
-    ctx.symtab.addFile(make<ArchiveFile>(ctx, mbref));
+    addFile(make<ArchiveFile>(ctx, mbref));
     break;
   case file_magic::bitcode:
-    ctx.symtab.addFile(make<BitcodeFile>(ctx, mbref, "", 0, lazy));
+    addFile(make<BitcodeFile>(ctx, mbref, "", 0, lazy));
     break;
   case file_magic::coff_object:
   case file_magic::coff_import_library:
-    ctx.symtab.addFile(ObjFile::create(ctx, mbref, lazy));
+    addFile(ObjFile::create(ctx, mbref, lazy));
     break;
   case file_magic::pdb:
-    ctx.symtab.addFile(make<PDBInputFile>(ctx, mbref));
+    addFile(make<PDBInputFile>(ctx, mbref));
     break;
   case file_magic::coff_cl_gl_object:
     Err(ctx) << filename
@@ -240,7 +305,7 @@ void LinkerDriver::addBuffer(std::unique_ptr<MemoryBuffer> mb,
     break;
   case file_magic::pecoff_executable:
     if (ctx.config.mingw) {
-      ctx.symtab.addFile(make<DLLFile>(ctx.symtab, mbref));
+      addFile(make<DLLFile>(ctx.symtab, mbref));
       break;
     }
     if (filename.ends_with_insensitive(".dll")) {
@@ -306,7 +371,7 @@ void LinkerDriver::addArchiveBuffer(MemoryBufferRef mb, StringRef symName,
   if (magic == file_magic::coff_import_library) {
     InputFile *imp = make<ImportFile>(ctx, mb);
     imp->parentName = parentName;
-    ctx.symtab.addFile(imp);
+    addFile(imp);
     return;
   }
 
@@ -326,7 +391,7 @@ void LinkerDriver::addArchiveBuffer(MemoryBufferRef mb, StringRef symName,
   }
 
   obj->parentName = parentName;
-  ctx.symtab.addFile(obj);
+  addFile(obj);
   Log(ctx) << "Loaded " << obj << " for " << symName;
 }
 
@@ -1400,7 +1465,7 @@ void LinkerDriver::convertResources() {
   }
   ObjFile *f =
       ObjFile::create(ctx, convertResToCOFF(resources, resourceObjFiles));
-  ctx.symtab.addFile(f);
+  addFile(f);
   f->includeResourceChunks();
 }
 
@@ -2548,7 +2613,7 @@ void LinkerDriver::linkerMain(ArrayRef<const char *> argsArr) {
     symtab.addAbsolute(mangle("__guard_eh_cont_count"), 0);
     symtab.addAbsolute(mangle("__guard_eh_cont_table"), 0);
 
-    if (isArm64EC(ctx.config.machine)) {
+    if (symtab.isEC()) {
       symtab.addAbsolute("__arm64x_extra_rfe_table", 0);
       symtab.addAbsolute("__arm64x_extra_rfe_table_size", 0);
       symtab.addAbsolute("__arm64x_redirection_metadata", 0);
@@ -2702,6 +2767,7 @@ void LinkerDriver::linkerMain(ArrayRef<const char *> argsArr) {
   // Do LTO by compiling bitcode input files to a set of native COFF files then
   // link those files (unless -thinlto-index-only was given, in which case we
   // resolve symbols and write indices, but don't generate native code or link).
+  ltoCompilationDone = true;
   ctx.symtab.compileBitcodeFiles();
 
   if (Defined *d =
@@ -2812,10 +2878,10 @@ void LinkerDriver::linkerMain(ArrayRef<const char *> argsArr) {
   // Handle /call-graph-ordering-file and /call-graph-profile-sort (default on).
   if (config->callGraphProfileSort) {
     llvm::TimeTraceScope timeScope("Call graph");
-    if (auto *arg = args.getLastArg(OPT_call_graph_ordering_file)) {
+    if (auto *arg = args.getLastArg(OPT_call_graph_ordering_file))
       parseCallGraphFile(arg->getValue());
-    }
-    readCallGraphsFromObjectFiles(ctx);
+    else
+      readCallGraphsFromObjectFiles(ctx);
   }
 
   // Handle /print-symbol-order.
@@ -2824,6 +2890,7 @@ void LinkerDriver::linkerMain(ArrayRef<const char *> argsArr) {
 
   if (ctx.symtabEC)
     ctx.symtabEC->initializeECThunks();
+  ctx.forEachSymtab([](SymbolTable &symtab) { symtab.initializeLoadConfig(); });
 
   // Identify unreferenced COMDAT sections.
   if (config->doGC) {
diff --git a/lld/COFF/Driver.h b/lld/COFF/Driver.h
index c2b92f6..5132568 100644
--- a/lld/COFF/Driver.h
+++ b/lld/COFF/Driver.h
@@ -80,13 +80,10 @@ public:
 
   void linkerMain(llvm::ArrayRef<const char *> args);
 
-  void setMachine(llvm::COFF::MachineTypes machine);
+  void addFile(InputFile *file);
 
   void addClangLibSearchPaths(const std::string &argv0);
 
-  // Used by the resolver to parse .drectve section contents.
-  void parseDirectives(InputFile *file);
-
   // Used by ArchiveFile to enqueue members.
   void enqueueArchiveMember(const Archive::Child &c, const Archive::Symbol &sym,
                             StringRef parentName);
@@ -115,12 +112,13 @@ private:
   void detectWinSysRoot(const llvm::opt::InputArgList &args);
 
   // Adds various search paths based on the sysroot.  Must only be called once
-  // config->machine has been set.
+  // config.machine has been set.
   void addWinSysRootLibSearchPaths();
 
   // Symbol names are mangled by prepending "_" on x86.
   StringRef mangle(StringRef sym);
 
+  void setMachine(llvm::COFF::MachineTypes machine);
   llvm::Triple::ArchType getArch();
 
   uint64_t getDefaultImageBase();
@@ -144,6 +142,9 @@ private:
 
   void createImportLibrary(bool asLib);
 
+  // Used by the resolver to parse .drectve section contents.
+  void parseDirectives(InputFile *file);
+
   void parseModuleDefs(StringRef path);
 
   // Parse an /order file. If an option is given, the linker places COMDAT
@@ -279,6 +280,8 @@ private:
   // Create export thunks for exported and patchable Arm64EC function symbols.
   void createECExportThunks();
   void maybeCreateECExportThunk(StringRef name, Symbol *&sym);
+
+  bool ltoCompilationDone = false;
 };
 
 // Create enum with OPT_xxx values for each option in Options.td
diff --git a/lld/COFF/InputFiles.cpp b/lld/COFF/InputFiles.cpp
index ad21311..a94c984c 100644
--- a/lld/COFF/InputFiles.cpp
+++ b/lld/COFF/InputFiles.cpp
@@ -149,11 +149,19 @@ std::vector<MemoryBufferRef>
 lld::coff::getArchiveMembers(COFFLinkerContext &ctx, Archive *file) {
   std::vector<MemoryBufferRef> v;
   Error err = Error::success();
+
+  // Thin archives refer to .o files, so --reproduces needs the .o files too.
+  bool addToTar = file->isThin() && ctx.driver.tar;
+
   for (const Archive::Child &c : file->children(err)) {
     MemoryBufferRef mbref =
         CHECK(c.getMemoryBufferRef(),
               file->getFileName() +
                   ": could not get the buffer for a child of the archive");
+    if (addToTar) {
+      ctx.driver.tar->append(relativeToRoot(check(c.getFullName())),
+                             mbref.getBuffer());
+    }
     v.push_back(mbref);
   }
   if (err)
@@ -1412,5 +1420,5 @@ void DLLFile::makeImport(DLLFile::Symbol *s) {
   memcpy(p, s->dllName.data(), s->dllName.size());
   MemoryBufferRef mbref = MemoryBufferRef(StringRef(buf, size), s->dllName);
   ImportFile *impFile = make<ImportFile>(symtab.ctx, mbref);
-  symtab.addFile(impFile);
+  symtab.ctx.driver.addFile(impFile);
 }
diff --git a/lld/COFF/SymbolTable.cpp b/lld/COFF/SymbolTable.cpp
index b1d375b..ae88675 100644
--- a/lld/COFF/SymbolTable.cpp
+++ b/lld/COFF/SymbolTable.cpp
@@ -27,6 +27,7 @@
 #include <utility>
 
 using namespace llvm;
+using namespace llvm::support;
 
 namespace lld::coff {
 
@@ -36,71 +37,6 @@ StringRef ltrim1(StringRef s, const char *chars) {
   return s;
 }
 
-static bool compatibleMachineType(COFFLinkerContext &ctx, MachineTypes mt) {
-  if (mt == IMAGE_FILE_MACHINE_UNKNOWN)
-    return true;
-  switch (ctx.config.machine) {
-  case ARM64:
-    return mt == ARM64 || mt == ARM64X;
-  case ARM64EC:
-    return COFF::isArm64EC(mt) || mt == AMD64;
-  case ARM64X:
-    return COFF::isAnyArm64(mt) || mt == AMD64;
-  case IMAGE_FILE_MACHINE_UNKNOWN:
-    return true;
-  default:
-    return ctx.config.machine == mt;
-  }
-}
-
-void SymbolTable::addFile(InputFile *file) {
-  Log(ctx) << "Reading " << toString(file);
-  if (file->lazy) {
-    if (auto *f = dyn_cast<BitcodeFile>(file))
-      f->parseLazy();
-    else
-      cast<ObjFile>(file)->parseLazy();
-  } else {
-    file->parse();
-    if (auto *f = dyn_cast<ObjFile>(file)) {
-      ctx.objFileInstances.push_back(f);
-    } else if (auto *f = dyn_cast<BitcodeFile>(file)) {
-      if (ltoCompilationDone) {
-        Err(ctx) << "LTO object file " << toString(file)
-                 << " linked in after "
-                    "doing LTO compilation.";
-      }
-      ctx.bitcodeFileInstances.push_back(f);
-    } else if (auto *f = dyn_cast<ImportFile>(file)) {
-      ctx.importFileInstances.push_back(f);
-    }
-  }
-
-  MachineTypes mt = file->getMachineType();
-  // The ARM64EC target must be explicitly specified and cannot be inferred.
-  if (mt == ARM64EC &&
-      (ctx.config.machine == IMAGE_FILE_MACHINE_UNKNOWN ||
-       (ctx.config.machineInferred &&
-        (ctx.config.machine == ARM64 || ctx.config.machine == AMD64)))) {
-    Err(ctx) << toString(file)
-             << ": machine type arm64ec is ambiguous and cannot be "
-                "inferred, use /machine:arm64ec or /machine:arm64x";
-    return;
-  }
-  if (!compatibleMachineType(ctx, mt)) {
-    Err(ctx) << toString(file) << ": machine type " << machineToStr(mt)
-             << " conflicts with " << machineToStr(ctx.config.machine);
-    return;
-  }
-  if (ctx.config.machine == IMAGE_FILE_MACHINE_UNKNOWN &&
-      mt != IMAGE_FILE_MACHINE_UNKNOWN) {
-    ctx.config.machineInferred = true;
-    ctx.driver.setMachine(mt);
-  }
-
-  ctx.driver.parseDirectives(file);
-}
-
 static COFFSyncStream errorOrWarn(COFFLinkerContext &ctx) {
   return {ctx, ctx.config.forceUnresolved ? DiagLevel::Warn : DiagLevel::Err};
 }
@@ -117,7 +53,7 @@ static void forceLazy(Symbol *s) {
   case Symbol::Kind::LazyObjectKind: {
     InputFile *file = cast<LazyObject>(s)->file;
     file->lazy = false;
-    file->symtab.addFile(file);
+    file->symtab.ctx.driver.addFile(file);
     break;
   }
   case Symbol::Kind::LazyDLLSymbolKind: {
@@ -596,6 +532,61 @@ std::pair<Symbol *, bool> SymbolTable::insert(StringRef name, InputFile *file) {
   return result;
 }
 
+void SymbolTable::initializeLoadConfig() {
+  auto sym =
+      dyn_cast_or_null<DefinedRegular>(findUnderscore("_load_config_used"));
+  if (!sym) {
+    if (isEC()) {
+      Warn(ctx) << "EC version of '_load_config_used' is missing";
+      return;
+    }
+    if (ctx.hybridSymtab) {
+      Warn(ctx) << "native version of '_load_config_used' is missing for "
+                   "ARM64X target";
+      return;
+    }
+    if (ctx.config.guardCF != GuardCFLevel::Off)
+      Warn(ctx)
+          << "Control Flow Guard is enabled but '_load_config_used' is missing";
+    if (ctx.config.dependentLoadFlags)
+      Warn(ctx) << "_load_config_used not found, /dependentloadflag will have "
+                   "no effect";
+    return;
+  }
+
+  SectionChunk *sc = sym->getChunk();
+  if (!sc->hasData) {
+    Err(ctx) << "_load_config_used points to uninitialized data";
+    return;
+  }
+  uint64_t offsetInChunk = sym->getValue();
+  if (offsetInChunk + 4 > sc->getSize()) {
+    Err(ctx) << "_load_config_used section chunk is too small";
+    return;
+  }
+
+  ArrayRef<uint8_t> secContents = sc->getContents();
+  loadConfigSize =
+      *reinterpret_cast<const ulittle32_t *>(&secContents[offsetInChunk]);
+  if (offsetInChunk + loadConfigSize > sc->getSize()) {
+    Err(ctx) << "_load_config_used specifies a size larger than its containing "
+                "section chunk";
+    return;
+  }
+
+  uint32_t expectedAlign = ctx.config.is64() ? 8 : 4;
+  if (sc->getAlignment() < expectedAlign)
+    Warn(ctx) << "'_load_config_used' is misaligned (expected alignment to be "
+              << expectedAlign << " bytes, got " << sc->getAlignment()
+              << " instead)";
+  else if (!isAligned(Align(expectedAlign), offsetInChunk))
+    Warn(ctx) << "'_load_config_used' is misaligned (section offset is 0x"
+              << Twine::utohexstr(sym->getValue()) << " not aligned to "
+              << expectedAlign << " bytes)";
+
+  loadConfigSym = sym;
+}
+
 void SymbolTable::addEntryThunk(Symbol *from, Symbol *to) {
   entryThunks.push_back({from, to});
 }
@@ -729,7 +720,7 @@ void SymbolTable::addLazyObject(InputFile *f, StringRef n) {
     return;
   s->pendingArchiveLoad = true;
   f->lazy = false;
-  addFile(f);
+  ctx.driver.addFile(f);
 }
 
 void SymbolTable::addLazyDLLSymbol(DLLFile *f, DLLFile::Symbol *sym,
@@ -1007,7 +998,6 @@ Symbol *SymbolTable::addUndefined(StringRef name) {
 }
 
 void SymbolTable::compileBitcodeFiles() {
-  ltoCompilationDone = true;
   if (ctx.bitcodeFileInstances.empty())
     return;
 
diff --git a/lld/COFF/SymbolTable.h b/lld/COFF/SymbolTable.h
index b694893..5443815 100644
--- a/lld/COFF/SymbolTable.h
+++ b/lld/COFF/SymbolTable.h
@@ -51,8 +51,6 @@ public:
               llvm::COFF::MachineTypes machine = IMAGE_FILE_MACHINE_UNKNOWN)
       : ctx(c), machine(machine) {}
 
-  void addFile(InputFile *file);
-
   // Emit errors for symbols that cannot be resolved.
   void reportUnresolvable();
 
@@ -138,6 +136,10 @@ public:
       callback(pair.second);
   }
 
+  DefinedRegular *loadConfigSym = nullptr;
+  uint32_t loadConfigSize = 0;
+  void initializeLoadConfig();
+
 private:
   /// Given a name without "__imp_" prefix, returns a defined symbol
   /// with the "__imp_" prefix, if it exists.
@@ -151,7 +153,6 @@ private:
 
   llvm::DenseMap<llvm::CachedHashStringRef, Symbol *> symMap;
   std::unique_ptr<BitcodeCompiler> lto;
-  bool ltoCompilationDone = false;
   std::vector<std::pair<Symbol *, Symbol *>> entryThunks;
   llvm::DenseMap<Symbol *, Symbol *> exitThunks;
 };
diff --git a/lld/COFF/Writer.cpp b/lld/COFF/Writer.cpp
index 3c6112b..a5b2b62 100644
--- a/lld/COFF/Writer.cpp
+++ b/lld/COFF/Writer.cpp
@@ -282,7 +282,8 @@ private:
   uint32_t getSizeOfInitializedData();
 
   void prepareLoadConfig();
-  template <typename T> void prepareLoadConfig(T *loadConfig);
+  template <typename T>
+  void prepareLoadConfig(SymbolTable &symtab, T *loadConfig);
 
   std::unique_ptr<FileOutputBuffer> &buffer;
   std::map<PartialSectionKey, PartialSection *> partialSections;
@@ -574,7 +575,7 @@ bool Writer::createThunks(OutputSection *os, int margin) {
 
 // Create a code map for CHPE metadata.
 void Writer::createECCodeMap() {
-  if (!isArm64EC(ctx.config.machine))
+  if (!ctx.symtabEC)
     return;
 
   // Clear the map in case we were're recomputing the map after adding
@@ -610,7 +611,8 @@ void Writer::createECCodeMap() {
 
   closeRange();
 
-  Symbol *tableCountSym = ctx.symtab.findUnderscore("__hybrid_code_map_count");
+  Symbol *tableCountSym =
+      ctx.symtabEC->findUnderscore("__hybrid_code_map_count");
   cast<DefinedAbsolute>(tableCountSym)->setVA(codeMap.size());
 }
 
@@ -944,7 +946,7 @@ void Writer::appendECImportTables() {
 
   const uint32_t rdata = IMAGE_SCN_CNT_INITIALIZED_DATA | IMAGE_SCN_MEM_READ;
 
-  // IAT is always placed at the begining of .rdata section and its size
+  // IAT is always placed at the beginning of .rdata section and its size
   // is aligned to 4KB. Insert it here, after all merges all done.
   if (PartialSection *importAddresses = findPartialSection(".idata$5", rdata)) {
     if (!rdataSec->chunks.empty())
@@ -1228,8 +1230,7 @@ void Writer::createMiscChunks() {
   // Create /guard:cf tables if requested.
   createGuardCFTables();
 
-  if (isArm64EC(config->machine))
-    createECChunks();
+  createECChunks();
 
   if (config->autoImport)
     createRuntimePseudoRelocs();
@@ -1837,22 +1838,10 @@ template <typename PEHeaderTy> void Writer::writeHeader() {
     dir[DEBUG_DIRECTORY].RelativeVirtualAddress = debugDirectory->getRVA();
     dir[DEBUG_DIRECTORY].Size = debugDirectory->getSize();
   }
-  if (Symbol *sym = ctx.symtab.findUnderscore("_load_config_used")) {
-    if (auto *b = dyn_cast<DefinedRegular>(sym)) {
-      SectionChunk *sc = b->getChunk();
-      assert(b->getRVA() >= sc->getRVA());
-      uint64_t offsetInChunk = b->getRVA() - sc->getRVA();
-      if (!sc->hasData || offsetInChunk + 4 > sc->getSize())
-        Fatal(ctx) << "_load_config_used is malformed";
-
-      ArrayRef<uint8_t> secContents = sc->getContents();
-      uint32_t loadConfigSize =
-          *reinterpret_cast<const ulittle32_t *>(&secContents[offsetInChunk]);
-      if (offsetInChunk + loadConfigSize > sc->getSize())
-        Fatal(ctx) << "_load_config_used is too large";
-      dir[LOAD_CONFIG_TABLE].RelativeVirtualAddress = b->getRVA();
-      dir[LOAD_CONFIG_TABLE].Size = loadConfigSize;
-    }
+  if (ctx.symtab.loadConfigSym) {
+    dir[LOAD_CONFIG_TABLE].RelativeVirtualAddress =
+        ctx.symtab.loadConfigSym->getRVA();
+    dir[LOAD_CONFIG_TABLE].Size = ctx.symtab.loadConfigSize;
   }
   if (!delayIdata.empty()) {
     dir[DELAY_IMPORT_DESCRIPTOR].RelativeVirtualAddress =
@@ -2169,7 +2158,11 @@ void Writer::maybeAddRVATable(SymbolRVASet tableSymbols, StringRef tableSym,
 
 // Create CHPE metadata chunks.
 void Writer::createECChunks() {
-  for (Symbol *s : ctx.symtab.expSymbols) {
+  SymbolTable *symtab = ctx.symtabEC;
+  if (!symtab)
+    return;
+
+  for (Symbol *s : symtab->expSymbols) {
     auto sym = dyn_cast<Defined>(s);
     if (!sym || !sym->getChunk())
       continue;
@@ -2188,9 +2181,9 @@ void Writer::createECChunks() {
       // we should use the #foo$hp_target symbol as the redirection target.
       // First, try to look up the $hp_target symbol. If it can't be found,
       // assume it's a regular function and look for #foo instead.
-      Symbol *targetSym = ctx.symtab.find((targetName + "$hp_target").str());
+      Symbol *targetSym = symtab->find((targetName + "$hp_target").str());
       if (!targetSym)
-        targetSym = ctx.symtab.find(targetName);
+        targetSym = symtab->find(targetName);
       Defined *t = dyn_cast_or_null<Defined>(targetSym);
       if (t && isArm64EC(t->getChunk()->getMachine()))
         exportThunks.push_back({chunk, t});
@@ -2199,20 +2192,20 @@ void Writer::createECChunks() {
 
   auto codeMapChunk = make<ECCodeMapChunk>(codeMap);
   rdataSec->addChunk(codeMapChunk);
-  Symbol *codeMapSym = ctx.symtab.findUnderscore("__hybrid_code_map");
+  Symbol *codeMapSym = symtab->findUnderscore("__hybrid_code_map");
   replaceSymbol<DefinedSynthetic>(codeMapSym, codeMapSym->getName(),
                                   codeMapChunk);
 
   CHPECodeRangesChunk *ranges = make<CHPECodeRangesChunk>(exportThunks);
   rdataSec->addChunk(ranges);
   Symbol *rangesSym =
-      ctx.symtab.findUnderscore("__x64_code_ranges_to_entry_points");
+      symtab->findUnderscore("__x64_code_ranges_to_entry_points");
   replaceSymbol<DefinedSynthetic>(rangesSym, rangesSym->getName(), ranges);
 
   CHPERedirectionChunk *entryPoints = make<CHPERedirectionChunk>(exportThunks);
   a64xrmSec->addChunk(entryPoints);
   Symbol *entryPointsSym =
-      ctx.symtab.findUnderscore("__arm64x_redirection_metadata");
+      symtab->findUnderscore("__arm64x_redirection_metadata");
   replaceSymbol<DefinedSynthetic>(entryPointsSym, entryPointsSym->getName(),
                                   entryPoints);
 }
@@ -2305,7 +2298,8 @@ void Writer::setSectionPermissions() {
 
 // Set symbols used by ARM64EC metadata.
 void Writer::setECSymbols() {
-  if (!isArm64EC(ctx.config.machine))
+  SymbolTable *symtab = ctx.symtabEC;
+  if (!symtab)
     return;
 
   llvm::stable_sort(exportThunks, [](const std::pair<Chunk *, Defined *> &a,
@@ -2313,45 +2307,45 @@ void Writer::setECSymbols() {
     return a.first->getRVA() < b.first->getRVA();
   });
 
-  Symbol *rfeTableSym = ctx.symtab.findUnderscore("__arm64x_extra_rfe_table");
+  Symbol *rfeTableSym = symtab->findUnderscore("__arm64x_extra_rfe_table");
   replaceSymbol<DefinedSynthetic>(rfeTableSym, "__arm64x_extra_rfe_table",
                                   pdata.first);
 
   if (pdata.first) {
     Symbol *rfeSizeSym =
-        ctx.symtab.findUnderscore("__arm64x_extra_rfe_table_size");
+        symtab->findUnderscore("__arm64x_extra_rfe_table_size");
     cast<DefinedAbsolute>(rfeSizeSym)
         ->setVA(pdata.last->getRVA() + pdata.last->getSize() -
                 pdata.first->getRVA());
   }
 
   Symbol *rangesCountSym =
-      ctx.symtab.findUnderscore("__x64_code_ranges_to_entry_points_count");
+      symtab->findUnderscore("__x64_code_ranges_to_entry_points_count");
   cast<DefinedAbsolute>(rangesCountSym)->setVA(exportThunks.size());
 
   Symbol *entryPointCountSym =
-      ctx.symtab.findUnderscore("__arm64x_redirection_metadata_count");
+      symtab->findUnderscore("__arm64x_redirection_metadata_count");
   cast<DefinedAbsolute>(entryPointCountSym)->setVA(exportThunks.size());
 
-  Symbol *iatSym = ctx.symtab.findUnderscore("__hybrid_auxiliary_iat");
+  Symbol *iatSym = symtab->findUnderscore("__hybrid_auxiliary_iat");
   replaceSymbol<DefinedSynthetic>(iatSym, "__hybrid_auxiliary_iat",
                                   idata.auxIat.empty() ? nullptr
                                                        : idata.auxIat.front());
 
-  Symbol *iatCopySym = ctx.symtab.findUnderscore("__hybrid_auxiliary_iat_copy");
+  Symbol *iatCopySym = symtab->findUnderscore("__hybrid_auxiliary_iat_copy");
   replaceSymbol<DefinedSynthetic>(
       iatCopySym, "__hybrid_auxiliary_iat_copy",
       idata.auxIatCopy.empty() ? nullptr : idata.auxIatCopy.front());
 
   Symbol *delayIatSym =
-      ctx.symtab.findUnderscore("__hybrid_auxiliary_delayload_iat");
+      symtab->findUnderscore("__hybrid_auxiliary_delayload_iat");
   replaceSymbol<DefinedSynthetic>(
       delayIatSym, "__hybrid_auxiliary_delayload_iat",
       delayIdata.getAuxIat().empty() ? nullptr
                                      : delayIdata.getAuxIat().front());
 
   Symbol *delayIatCopySym =
-      ctx.symtab.findUnderscore("__hybrid_auxiliary_delayload_iat_copy");
+      symtab->findUnderscore("__hybrid_auxiliary_delayload_iat_copy");
   replaceSymbol<DefinedSynthetic>(
       delayIatCopySym, "__hybrid_auxiliary_delayload_iat_copy",
       delayIdata.getAuxIatCopy().empty() ? nullptr
@@ -2649,39 +2643,25 @@ void Writer::fixTlsAlignment() {
 }
 
 void Writer::prepareLoadConfig() {
-  Symbol *sym = ctx.symtab.findUnderscore("_load_config_used");
-  auto *b = cast_if_present<DefinedRegular>(sym);
-  if (!b) {
-    if (ctx.config.guardCF != GuardCFLevel::Off)
-      Warn(ctx)
-          << "Control Flow Guard is enabled but '_load_config_used' is missing";
-    if (ctx.config.dependentLoadFlags)
-      Warn(ctx) << "_load_config_used not found, /dependentloadflag will have "
-                   "no effect";
-    return;
-  }
+  ctx.forEachSymtab([&](SymbolTable &symtab) {
+    if (!symtab.loadConfigSym)
+      return;
 
-  OutputSection *sec = ctx.getOutputSection(b->getChunk());
-  uint8_t *buf = buffer->getBufferStart();
-  uint8_t *secBuf = buf + sec->getFileOff();
-  uint8_t *symBuf = secBuf + (b->getRVA() - sec->getRVA());
-  uint32_t expectedAlign = ctx.config.is64() ? 8 : 4;
-  if (b->getChunk()->getAlignment() < expectedAlign)
-    Warn(ctx) << "'_load_config_used' is misaligned (expected alignment to be "
-              << expectedAlign << " bytes, got "
-              << b->getChunk()->getAlignment() << " instead)";
-  else if (!isAligned(Align(expectedAlign), b->getRVA()))
-    Warn(ctx) << "'_load_config_used' is misaligned (RVA is 0x"
-              << Twine::utohexstr(b->getRVA()) << " not aligned to "
-              << expectedAlign << " bytes)";
-
-  if (ctx.config.is64())
-    prepareLoadConfig(reinterpret_cast<coff_load_configuration64 *>(symBuf));
-  else
-    prepareLoadConfig(reinterpret_cast<coff_load_configuration32 *>(symBuf));
+    OutputSection *sec = ctx.getOutputSection(symtab.loadConfigSym->getChunk());
+    uint8_t *secBuf = buffer->getBufferStart() + sec->getFileOff();
+    uint8_t *symBuf = secBuf + (symtab.loadConfigSym->getRVA() - sec->getRVA());
+
+    if (ctx.config.is64())
+      prepareLoadConfig(symtab,
+                        reinterpret_cast<coff_load_configuration64 *>(symBuf));
+    else
+      prepareLoadConfig(symtab,
+                        reinterpret_cast<coff_load_configuration32 *>(symBuf));
+  });
 }
 
-template <typename T> void Writer::prepareLoadConfig(T *loadConfig) {
+template <typename T>
+void Writer::prepareLoadConfig(SymbolTable &symtab, T *loadConfig) {
   size_t loadConfigSize = loadConfig->Size;
 
 #define RETURN_IF_NOT_CONTAINS(field)                                          \
@@ -2694,12 +2674,12 @@ template <typename T> void Writer::prepareLoadConfig(T *loadConfig) {
   if (loadConfigSize >= offsetof(T, field) + sizeof(T::field))
 
 #define CHECK_VA(field, sym)                                                   \
-  if (auto *s = dyn_cast<DefinedSynthetic>(ctx.symtab.findUnderscore(sym)))    \
+  if (auto *s = dyn_cast<DefinedSynthetic>(symtab.findUnderscore(sym)))        \
     if (loadConfig->field != ctx.config.imageBase + s->getRVA())               \
       Warn(ctx) << #field " not set correctly in '_load_config_used'";
 
 #define CHECK_ABSOLUTE(field, sym)                                             \
-  if (auto *s = dyn_cast<DefinedAbsolute>(ctx.symtab.findUnderscore(sym)))     \
+  if (auto *s = dyn_cast<DefinedAbsolute>(symtab.findUnderscore(sym)))         \
     if (loadConfig->field != s->getVA())                                       \
       Warn(ctx) << #field " not set correctly in '_load_config_used'";
 
@@ -2720,6 +2700,23 @@ template <typename T> void Writer::prepareLoadConfig(T *loadConfig) {
     }
   }
 
+  IF_CONTAINS(CHPEMetadataPointer) {
+    // On ARM64X, only the EC version of the load config contains
+    // CHPEMetadataPointer. Copy its value to the native load config.
+    if (ctx.hybridSymtab && !symtab.isEC() &&
+        ctx.hybridSymtab->loadConfigSize >=
+            offsetof(T, CHPEMetadataPointer) + sizeof(T::CHPEMetadataPointer)) {
+      OutputSection *sec =
+          ctx.getOutputSection(ctx.hybridSymtab->loadConfigSym->getChunk());
+      uint8_t *secBuf = buffer->getBufferStart() + sec->getFileOff();
+      auto hybridLoadConfig =
+          reinterpret_cast<const coff_load_configuration64 *>(
+              secBuf +
+              (ctx.hybridSymtab->loadConfigSym->getRVA() - sec->getRVA()));
+      loadConfig->CHPEMetadataPointer = hybridLoadConfig->CHPEMetadataPointer;
+    }
+  }
+
   if (ctx.config.guardCF == GuardCFLevel::Off)
     return;
   RETURN_IF_NOT_CONTAINS(GuardFlags)
diff --git a/lld/Common/BPSectionOrdererBase.cpp b/lld/Common/BPSectionOrdererBase.cpp
index 7c5874f..75be4f6 100644
--- a/lld/Common/BPSectionOrdererBase.cpp
+++ b/lld/Common/BPSectionOrdererBase.cpp
@@ -109,9 +109,6 @@ BPSectionBase::reorderSectionsByBalancedPartitioning(
 
   // Process input sections
   for (const auto &isec : inputSections) {
-    if (!isec->hasValidData())
-      continue;
-
     unsigned sectionIdx = sections.size();
     sectionToIdx.try_emplace(isec->getSection(), sectionIdx);
     sections.emplace_back(isec.get());
@@ -371,4 +368,4 @@ BPSectionBase::reorderSectionsByBalancedPartitioning(
   for (const auto *isec : orderedSections)
     sectionPriorities[isec] = --highestAvailablePriority;
   return sectionPriorities;
-}
-\ No newline at end of file
+}
diff --git a/lld/ELF/CallGraphSort.cpp b/lld/ELF/CallGraphSort.cpp
index 35c59d6..9caba5c 100644
--- a/lld/ELF/CallGraphSort.cpp
+++ b/lld/ELF/CallGraphSort.cpp
@@ -235,7 +235,7 @@ DenseMap<const InputSectionBase *, int> CallGraphSort::run() {
   });
 
   DenseMap<const InputSectionBase *, int> orderMap;
-  int curOrder = 1;
+  int curOrder = -clusters.size();
   for (int leader : sorted) {
     for (int i = leader;;) {
       orderMap[sections[i]] = curOrder++;
@@ -276,8 +276,8 @@ DenseMap<const InputSectionBase *, int> CallGraphSort::run() {
 // Sort sections by the profile data using the Cache-Directed Sort algorithm.
 // The placement is done by optimizing the locality by co-locating frequently
 // executed code sections together.
-DenseMap<const InputSectionBase *, int>
-elf::computeCacheDirectedSortOrder(Ctx &ctx) {
+static DenseMap<const InputSectionBase *, int>
+computeCacheDirectedSortOrder(Ctx &ctx) {
   SmallVector<uint64_t, 0> funcSizes;
   SmallVector<uint64_t, 0> funcCounts;
   SmallVector<codelayout::EdgeCount, 0> callCounts;
@@ -328,7 +328,7 @@ elf::computeCacheDirectedSortOrder(Ctx &ctx) {
 
   // Create the final order.
   DenseMap<const InputSectionBase *, int> orderMap;
-  int curOrder = 1;
+  int curOrder = -sortedSections.size();
   for (uint64_t secIdx : sortedSections)
     orderMap[sections[secIdx]] = curOrder++;
 
diff --git a/lld/ELF/CallGraphSort.h b/lld/ELF/CallGraphSort.h
index 5f9987c..89e6f617 100644
--- a/lld/ELF/CallGraphSort.h
+++ b/lld/ELF/CallGraphSort.h
@@ -16,9 +16,6 @@ struct Ctx;
 class InputSectionBase;
 
 llvm::DenseMap<const InputSectionBase *, int>
-computeCacheDirectedSortOrder(Ctx &);
-
-llvm::DenseMap<const InputSectionBase *, int>
 computeCallGraphProfileOrder(Ctx &);
 } // namespace lld::elf
 
diff --git a/lld/ELF/Driver.cpp b/lld/ELF/Driver.cpp
index f573a8d..13e8f8c 100644
--- a/lld/ELF/Driver.cpp
+++ b/lld/ELF/Driver.cpp
@@ -1746,13 +1746,8 @@ static void readConfigs(Ctx &ctx, opt::InputArgList &args) {
     if (args.hasArg(OPT_call_graph_ordering_file))
       ErrAlways(ctx) << "--symbol-ordering-file and --call-graph-order-file "
                         "may not be used together";
-    if (std::optional<MemoryBufferRef> buffer =
-            readFile(ctx, arg->getValue())) {
+    if (auto buffer = readFile(ctx, arg->getValue()))
       ctx.arg.symbolOrderingFile = getSymbolOrderingFile(ctx, *buffer);
-      // Also need to disable CallGraphProfileSort to prevent
-      // LLD order symbols with CGProfile
-      ctx.arg.callGraphProfileSort = CGProfileSortKind::None;
-    }
   }
 
   assert(ctx.arg.versionDefinitions.empty());
@@ -3215,11 +3210,12 @@ template <class ELFT> void LinkerDriver::link(opt::InputArgList &args) {
 
   // Read the callgraph now that we know what was gced or icfed
   if (ctx.arg.callGraphProfileSort != CGProfileSortKind::None) {
-    if (auto *arg = args.getLastArg(OPT_call_graph_ordering_file))
+    if (auto *arg = args.getLastArg(OPT_call_graph_ordering_file)) {
       if (std::optional<MemoryBufferRef> buffer =
               readFile(ctx, arg->getValue()))
         readCallGraph(ctx, *buffer);
-    readCallGraphsFromObjectFiles<ELFT>(ctx);
+    } else
+      readCallGraphsFromObjectFiles<ELFT>(ctx);
   }
 
   // Write the result to the file.
diff --git a/lld/ELF/InputSection.h b/lld/ELF/InputSection.h
index 60988df..98e7d5d 100644
--- a/lld/ELF/InputSection.h
+++ b/lld/ELF/InputSection.h
@@ -33,7 +33,7 @@ class SyntheticSection;
 template <class ELFT> class ObjFile;
 class OutputSection;
 
-// Returned by InputSectionBase::relsOrRelas. At most one member is empty.
+// Returned by InputSectionBase::relsOrRelas. At least two members are empty.
 template <class ELFT> struct RelsOrRelas {
   Relocs<typename ELFT::Rel> rels;
   Relocs<typename ELFT::Rela> relas;
diff --git a/lld/ELF/SymbolTable.cpp b/lld/ELF/SymbolTable.cpp
index d5b2200..9757005 100644
--- a/lld/ELF/SymbolTable.cpp
+++ b/lld/ELF/SymbolTable.cpp
@@ -318,7 +318,7 @@ void SymbolTable::scanVersionScript() {
     // script with `global: *` are used.
     //
     // '--retain-symbol-file' adds a "*" pattern to
-    // 'config->versionDefinitions[VER_NDX_LOCAL].nonLocalPatterns', see
+    // 'versionDefinitions[VER_NDX_LOCAL].nonLocalPatterns', see
     // 'readConfigs()' in 'Driver.cpp'. Note that it is not '.localPatterns',
     // and may seem counterintuitive, but still works as expected. Here we can
     // exploit that and skip analyzing the pattern added for this option.
diff --git a/lld/ELF/SyntheticSections.cpp b/lld/ELF/SyntheticSections.cpp
index baa7a08..10cbfe1 100644
--- a/lld/ELF/SyntheticSections.cpp
+++ b/lld/ELF/SyntheticSections.cpp
@@ -3798,9 +3798,8 @@ VersionTableSection::VersionTableSection(Ctx &ctx)
 }
 
 void VersionTableSection::finalizeContents() {
-  // At the moment of june 2016 GNU docs does not mention that sh_link field
-  // should be set, but Sun docs do. Also readelf relies on this field.
-  getParent()->link = getPartition(ctx).dynSymTab->getParent()->sectionIndex;
+  if (OutputSection *osec = getPartition(ctx).dynSymTab->getParent())
+    getParent()->link = osec->sectionIndex;
 }
 
 size_t VersionTableSection::getSize() const {
diff --git a/lld/ELF/Writer.cpp b/lld/ELF/Writer.cpp
index 3e92b76..fe4a0a1 100644
--- a/lld/ELF/Writer.cpp
+++ b/lld/ELF/Writer.cpp
@@ -1080,12 +1080,14 @@ static void maybeShuffle(Ctx &ctx,
   }
 }
 
-// Builds section order for handling --symbol-ordering-file.
+// Return section order within an InputSectionDescription.
+// If both --symbol-ordering-file and call graph profile are present, the order
+// file takes precedence, but the call graph profile is still used for symbols
+// that don't appear in the order file.
 static DenseMap<const InputSectionBase *, int> buildSectionOrder(Ctx &ctx) {
   DenseMap<const InputSectionBase *, int> sectionOrder;
-  // Use the rarely used option --call-graph-ordering-file to sort sections.
   if (!ctx.arg.callGraphProfile.empty())
-    return computeCallGraphProfileOrder(ctx);
+    sectionOrder = computeCallGraphProfileOrder(ctx);
 
   if (ctx.arg.symbolOrderingFile.empty())
     return sectionOrder;
@@ -1099,7 +1101,7 @@ static DenseMap<const InputSectionBase *, int> buildSectionOrder(Ctx &ctx) {
   // appear in the symbol ordering file have the lowest priority 0.
   // All explicitly mentioned symbols have negative (higher) priorities.
   DenseMap<CachedHashStringRef, SymbolOrderEntry> symbolOrder;
-  int priority = -ctx.arg.symbolOrderingFile.size();
+  int priority = -sectionOrder.size() - ctx.arg.symbolOrderingFile.size();
   for (StringRef s : ctx.arg.symbolOrderingFile)
     symbolOrder.insert({CachedHashStringRef(s), {priority++, false}});
 
@@ -1255,11 +1257,11 @@ static void sortSection(Ctx &ctx, OutputSection &osec,
   }
 }
 
-// If no layout was provided by linker script, we want to apply default
-// sorting for special input sections. This also handles --symbol-ordering-file.
+// Sort sections within each InputSectionDescription.
 template <class ELFT> void Writer<ELFT>::sortInputSections() {
-  // Build the order once since it is expensive.
+  // Assign negative priorities.
   DenseMap<const InputSectionBase *, int> order = buildSectionOrder(ctx);
+  // Assign non-negative priorities due to --shuffle-sections.
   maybeShuffle(ctx, order);
   for (SectionCommand *cmd : ctx.script->sectionCommands)
     if (auto *osd = dyn_cast<OutputDesc>(cmd))
diff --git a/lld/MachO/BPSectionOrderer.h b/lld/MachO/BPSectionOrderer.h
index 29b20c7..8ba911f 100644
--- a/lld/MachO/BPSectionOrderer.h
+++ b/lld/MachO/BPSectionOrderer.h
@@ -68,10 +68,6 @@ public:
 
   bool isCodeSection() const override { return macho::isCodeSection(isec); }
 
-  bool hasValidData() const override {
-    return isec && !isec->data.empty() && isec->data.data();
-  }
-
   SmallVector<std::unique_ptr<BPSymbol>> getSymbols() const override {
     SmallVector<std::unique_ptr<BPSymbol>> symbols;
     for (auto *sym : isec->symbols)
diff --git a/lld/MachO/ConcatOutputSection.h b/lld/MachO/ConcatOutputSection.h
index 9af661d..8131c48 100644
--- a/lld/MachO/ConcatOutputSection.h
+++ b/lld/MachO/ConcatOutputSection.h
@@ -25,8 +25,9 @@ class Defined;
 // in the final binary.
 class ConcatOutputSection : public OutputSection {
 public:
-  explicit ConcatOutputSection(StringRef name)
-      : OutputSection(ConcatKind, name) {}
+  explicit ConcatOutputSection(StringRef name,
+                               OutputSection::Kind kind = ConcatKind)
+      : OutputSection(kind, name) {}
 
   const ConcatInputSection *firstSection() const { return inputs.front(); }
   const ConcatInputSection *lastSection() const { return inputs.back(); }
@@ -46,7 +47,7 @@ public:
   void writeTo(uint8_t *buf) const override;
 
   static bool classof(const OutputSection *sec) {
-    return sec->kind() == ConcatKind;
+    return sec->kind() == ConcatKind || sec->kind() == TextKind;
   }
 
   static ConcatOutputSection *getOrCreateForInput(const InputSection *);
@@ -66,12 +67,18 @@ private:
 // support thunk insertion.
 class TextOutputSection : public ConcatOutputSection {
 public:
-  explicit TextOutputSection(StringRef name) : ConcatOutputSection(name) {}
+  explicit TextOutputSection(StringRef name)
+      : ConcatOutputSection(name, TextKind) {}
   void finalizeContents() override {}
   void finalize() override;
   bool needsThunks() const;
+  ArrayRef<ConcatInputSection *> getThunks() const { return thunks; }
   void writeTo(uint8_t *buf) const override;
 
+  static bool classof(const OutputSection *sec) {
+    return sec->kind() == TextKind;
+  }
+
 private:
   uint64_t estimateStubsInRangeVA(size_t callIdx) const;
 
diff --git a/lld/MachO/Driver.cpp b/lld/MachO/Driver.cpp
index 7550b0b..31630ba 100644
--- a/lld/MachO/Driver.cpp
+++ b/lld/MachO/Driver.cpp
@@ -1842,7 +1842,7 @@ bool link(ArrayRef<const char *> argsArr, llvm::raw_ostream &stdoutOS,
       args.hasArg(OPT_irpgo_profile_sort_eq))
     warn("--irpgo-profile-sort is deprecated. Please use "
          "--bp-startup-sort=function");
-  if (const Arg *arg = args.getLastArg(OPT_irpgo_profile_eq))
+  if (const Arg *arg = args.getLastArg(OPT_irpgo_profile))
     config->irpgoProfilePath = arg->getValue();
 
   if (const Arg *arg = args.getLastArg(OPT_irpgo_profile_sort)) {
diff --git a/lld/MachO/MapFile.cpp b/lld/MachO/MapFile.cpp
index 9c06216..12417df 100644
--- a/lld/MachO/MapFile.cpp
+++ b/lld/MachO/MapFile.cpp
@@ -161,6 +161,20 @@ static uint64_t getSymSizeForMap(Defined *sym) {
   return sym->size;
 }
 
+// Merges two vectors of input sections in order of their outSecOff values.
+// This approach creates a new (temporary) vector which is not ideal but the
+// ideal approach leads to a lot of code duplication.
+static std::vector<ConcatInputSection *>
+mergeOrderedInputs(ArrayRef<ConcatInputSection *> inputs1,
+                   ArrayRef<ConcatInputSection *> inputs2) {
+  std::vector<ConcatInputSection *> vec(inputs1.size() + inputs2.size());
+  std::merge(inputs1.begin(), inputs1.end(), inputs2.begin(), inputs2.end(),
+             vec.begin(), [](ConcatInputSection *a, ConcatInputSection *b) {
+               return a->outSecOff < b->outSecOff;
+             });
+  return vec;
+}
+
 void macho::writeMapFile() {
   if (config->mapFile.empty())
     return;
@@ -220,7 +234,11 @@ void macho::writeMapFile() {
   os << "# Address\tSize    \tFile  Name\n";
   for (const OutputSegment *seg : outputSegments) {
     for (const OutputSection *osec : seg->getSections()) {
-      if (auto *concatOsec = dyn_cast<ConcatOutputSection>(osec)) {
+      if (auto *textOsec = dyn_cast<TextOutputSection>(osec)) {
+        auto inputsAndThunks =
+            mergeOrderedInputs(textOsec->inputs, textOsec->getThunks());
+        printIsecArrSyms(inputsAndThunks);
+      } else if (auto *concatOsec = dyn_cast<ConcatOutputSection>(osec)) {
         printIsecArrSyms(concatOsec->inputs);
       } else if (osec == in.cStringSection || osec == in.objcMethnameSection) {
         const auto &liveCStrings = info.liveCStringsForSection.lookup(osec);
diff --git a/lld/MachO/Options.td b/lld/MachO/Options.td
index 1d7f1d8..4b1e9e4 100644
--- a/lld/MachO/Options.td
+++ b/lld/MachO/Options.td
@@ -126,8 +126,10 @@ def no_call_graph_profile_sort : Flag<["--"], "no-call-graph-profile-sort">,
 def print_symbol_order_eq: Joined<["--"], "print-symbol-order=">,
     HelpText<"Print a symbol order specified by --call-graph-profile-sort into the specified file">,
     Group<grp_lld>;
+def irpgo_profile: Separate<["--"], "irpgo-profile">, Group<grp_lld>;
 def irpgo_profile_eq: Joined<["--"], "irpgo-profile=">,
-    HelpText<"Read the IRPGO profile for use with -bp-startup-sort and other profile-guided optimizations">,
+    Alias<!cast<Separate>(irpgo_profile)>, MetaVarName<"<profile>">,
+    HelpText<"Read the IRPGO <profile> for use with -bp-startup-sort and other profile-guided optimizations">,
     Group<grp_lld>;
 def bp_startup_sort: Joined<["--"], "bp-startup-sort=">,
     MetaVarName<"[none,function]">,
diff --git a/lld/MachO/OutputSection.h b/lld/MachO/OutputSection.h
index 5297a03..9afd3a9 100644
--- a/lld/MachO/OutputSection.h
+++ b/lld/MachO/OutputSection.h
@@ -37,6 +37,7 @@ public:
   enum Kind {
     ConcatKind,
     SyntheticKind,
+    TextKind,
   };
 
   OutputSection(Kind kind, StringRef name) : name(name), sectionKind(kind) {}
diff --git a/lld/MachO/SyntheticSections.cpp b/lld/MachO/SyntheticSections.cpp
index 28fb804..417b7cf 100644
--- a/lld/MachO/SyntheticSections.cpp
+++ b/lld/MachO/SyntheticSections.cpp
@@ -2079,12 +2079,12 @@ void ObjCMethListSection::finalize() {
 void ObjCMethListSection::writeTo(uint8_t *bufStart) const {
   uint8_t *buf = bufStart;
   for (const ConcatInputSection *isec : inputs) {
-    assert(buf - bufStart == long(isec->outSecOff) &&
+    assert(buf - bufStart == std::ptrdiff_t(isec->outSecOff) &&
            "Writing at unexpected offset");
     uint32_t writtenSize = writeRelativeMethodList(isec, buf);
     buf += writtenSize;
   }
-  assert(buf - bufStart == sectionSize &&
+  assert(buf - bufStart == std::ptrdiff_t(sectionSize) &&
          "Written size does not match expected section size");
 }
 
diff --git a/lld/include/lld/Common/BPSectionOrdererBase.h b/lld/include/lld/Common/BPSectionOrdererBase.h
index 6f483af..e2cb41f 100644
--- a/lld/include/lld/Common/BPSectionOrdererBase.h
+++ b/lld/include/lld/Common/BPSectionOrdererBase.h
@@ -37,7 +37,6 @@ class BPSectionBase {
 public:
   virtual ~BPSectionBase() = default;
   virtual uint64_t getSize() const = 0;
-  virtual bool hasValidData() const = 0;
   virtual bool isCodeSection() const = 0;
   virtual llvm::SmallVector<std::unique_ptr<BPSymbol>> getSymbols() const = 0;
   virtual const void *getSection() const = 0;
diff --git a/lld/test/COFF/Inputs/loadconfig-arm64.s b/lld/test/COFF/Inputs/loadconfig-arm64.s
new file mode 100644
index 0000000..67d1a0a
--- /dev/null
+++ b/lld/test/COFF/Inputs/loadconfig-arm64.s
@@ -0,0 +1,15 @@
+        .section .rdata,"dr"
+        .globl _load_config_used
+        .p2align 3, 0
+_load_config_used:
+        .word 0x140
+        .fill 0x7c,1,0
+        .xword __guard_fids_table
+        .xword __guard_fids_count
+        .xword __guard_flags
+        .xword 0
+        .xword __guard_iat_table
+        .xword __guard_iat_count
+        .xword __guard_longjmp_table
+        .xword __guard_longjmp_count
+        .fill 0x80,1,0
diff --git a/lld/test/COFF/arm64ec-codemap.test b/lld/test/COFF/arm64ec-codemap.test
index 2d79538f..0502611 100644
--- a/lld/test/COFF/arm64ec-codemap.test
+++ b/lld/test/COFF/arm64ec-codemap.test
@@ -9,7 +9,7 @@ RUN: llvm-mc -filetype=obj -triple=arm64ec-windows data-sec2.s -o data-sec2.obj
 RUN: llvm-mc -filetype=obj -triple=arm64ec-windows empty-sec.s -o arm64ec-empty-sec.obj
 RUN: llvm-mc -filetype=obj -triple=x86_64-windows x86_64-func-sym.s -o x86_64-func-sym.obj
 RUN: llvm-mc -filetype=obj -triple=x86_64-windows empty-sec.s -o x86_64-empty-sec.obj
-RUN: llvm-mc -filetype=obj -triple=aarch64-windows %S/Inputs/loadconfig-arm64ec.s -o loadconfig-arm64.obj
+RUN: llvm-mc -filetype=obj -triple=aarch64-windows %S/Inputs/loadconfig-arm64.s -o loadconfig-arm64.obj
 RUN: llvm-mc -filetype=obj -triple=arm64ec-windows %S/Inputs/loadconfig-arm64ec.s -o loadconfig-arm64ec.obj
 
 Link ARM64EC DLL and verify that the code is arranged as expected.
diff --git a/lld/test/COFF/arm64ec-entry-thunk.s b/lld/test/COFF/arm64ec-entry-thunk.s
index bf5cb42..b31d315 100644
--- a/lld/test/COFF/arm64ec-entry-thunk.s
+++ b/lld/test/COFF/arm64ec-entry-thunk.s
@@ -27,7 +27,7 @@ thunk:
     .rva func
 
 // RUN: llvm-mc -filetype=obj -triple=arm64ec-windows %S/Inputs/loadconfig-arm64ec.s -o loadcfg.obj
-// RUN: llvm-mc -filetype=obj -triple=aarch64-windows %S/Inputs/loadconfig-arm64ec.s -o native-loadcfg.obj
+// RUN: llvm-mc -filetype=obj -triple=aarch64-windows %S/Inputs/loadconfig-arm64.s -o native-loadcfg.obj
 // RUN: llvm-mc -filetype=obj -triple=arm64ec-windows test-simple.s -o test-simple.obj
 // RUN: lld-link -machine:arm64ec -dll -noentry -out:out-simple.dll loadcfg.obj test-simple.obj
 // RUN: llvm-objdump -d out-simple.dll | FileCheck --check-prefix=DISASM %s
diff --git a/lld/test/COFF/arm64ec-lib.test b/lld/test/COFF/arm64ec-lib.test
index ea07d28..8698a5c 100644
--- a/lld/test/COFF/arm64ec-lib.test
+++ b/lld/test/COFF/arm64ec-lib.test
@@ -11,7 +11,7 @@ RUN: llvm-mc -filetype=obj -triple=arm64ec-windows ref-alias.s -o ref-alias.obj
 RUN: llvm-mc -filetype=obj -triple=arm64ec-windows ref-thunk.s -o ref-thunk.obj
 RUN: llvm-mc -filetype=obj -triple=arm64ec-windows func.s -o func.obj
 RUN: llvm-mc -filetype=obj -triple=x86_64-windows func-x86_64.s -o func-x86_64.obj
-RUN: llvm-mc -filetype=obj -triple=aarch64-windows %S/Inputs/loadconfig-arm64ec.s -o loadconfig-arm64.obj
+RUN: llvm-mc -filetype=obj -triple=aarch64-windows %S/Inputs/loadconfig-arm64.s -o loadconfig-arm64.obj
 RUN: llvm-mc -filetype=obj -triple=arm64ec-windows %S/Inputs/loadconfig-arm64ec.s -o loadconfig-arm64ec.obj
 
 RUN: llvm-lib -machine:arm64ec -out:sym-arm64ec.lib sym-arm64ec.obj nsym-aarch64.obj
diff --git a/lld/test/COFF/arm64ec-range-thunks.s b/lld/test/COFF/arm64ec-range-thunks.s
index 09d9b01..dcfa636 100644
--- a/lld/test/COFF/arm64ec-range-thunks.s
+++ b/lld/test/COFF/arm64ec-range-thunks.s
@@ -5,7 +5,7 @@
 # RUN: llvm-mc -filetype=obj -triple=aarch64-windows native-funcs.s -o funcs-aarch64.obj
 # RUN: llvm-mc -filetype=obj -triple=x86_64-windows space.s -o space-x86_64.obj
 # RUN: llvm-mc -filetype=obj -triple=aarch64-windows space.s -o space-aarch64.obj
-# RUN: llvm-mc -filetype=obj -triple=aarch64-windows %S/Inputs/loadconfig-arm64ec.s -o loadconfig-arm64.obj
+# RUN: llvm-mc -filetype=obj -triple=aarch64-windows %S/Inputs/loadconfig-arm64.s -o loadconfig-arm64.obj
 # RUN: llvm-mc -filetype=obj -triple=arm64ec-windows %S/Inputs/loadconfig-arm64ec.s -o loadconfig-arm64ec.obj
 
 
diff --git a/lld/test/COFF/arm64x-loadconfig.s b/lld/test/COFF/arm64x-loadconfig.s
index 0d4fe0e..d21f4bf 100644
--- a/lld/test/COFF/arm64x-loadconfig.s
+++ b/lld/test/COFF/arm64x-loadconfig.s
@@ -2,12 +2,27 @@
 // RUN: split-file %s %t.dir && cd %t.dir
 
 // RUN: llvm-mc -filetype=obj -triple=aarch64-windows test.s -o test.obj
+// RUN: llvm-mc -filetype=obj -triple=arm64ec-windows chpe.s -o chpe.obj
 // RUN: llvm-mc -filetype=obj -triple=aarch64-windows loadconfig.s -o loadconfig.obj
+// RUN: llvm-mc -filetype=obj -triple=arm64ec-windows loadconfig-ec.s -o loadconfig-ec.obj
 // RUN: llvm-mc -filetype=obj -triple=aarch64-windows loadconfig-short.s -o loadconfig-short.obj
+// RUN: llvm-mc -filetype=obj -triple=arm64ec-windows loadconfig-short.s -o loadconfig-short-arm64ec.obj
 
-// RUN: lld-link -machine:arm64x -out:out.dll -dll -noentry loadconfig.obj test.obj
+// RUN: lld-link -machine:arm64x -out:out-warn.dll -dll -noentry test.obj \
+// RUN:           2>&1 | FileCheck --check-prefixes=WARN-LOADCFG,WARN-EC-LOADCFG %s
+// WARN-LOADCFG:    lld-link: warning: native version of '_load_config_used' is missing for ARM64X target
+// WARN-EC-LOADCFG: lld-link: warning: EC version of '_load_config_used' is missing
 
-// RUN: llvm-readobj --coff-load-config out.dll | FileCheck -check-prefix=DYNRELOCS %s
+// RUN: lld-link -machine:arm64x -out:out-nonative.dll -dll -noentry loadconfig-ec.obj chpe.obj \
+// RUN:           2>&1 | FileCheck --check-prefixes=WARN-LOADCFG --implicit-check-not EC %s
+
+// RUN: lld-link -machine:arm64ec -out:out-ec.dll -dll -noentry chpe.obj \
+// RUN:           2>&1 | FileCheck --check-prefixes=WARN-EC-LOADCFG --implicit-check-not native %s
+
+// RUN: lld-link -machine:arm64x -out:out.dll -dll -noentry loadconfig.obj test.obj \
+// RUN:           2>&1 | FileCheck --check-prefixes=WARN-EC-LOADCFG --implicit-check-not native %s
+
+// RUN: llvm-readobj --coff-load-config out.dll | FileCheck --check-prefix=DYNRELOCS %s
 // DYNRELOCS:      DynamicValueRelocTableOffset: 0xC
 // DYNRELOCS-NEXT: DynamicValueRelocTableSection: 4
 // DYNRELOCS:      DynamicRelocations [
@@ -34,7 +49,7 @@
 // DYNRELOCS-NEXT:   ]
 // DYNRELOCS-NEXT: ]
 
-// RUN: llvm-readobj --headers out.dll | FileCheck -check-prefix=HEADERS %s
+// RUN: llvm-readobj --headers out.dll | FileCheck --check-prefix=HEADERS %s
 // HEADERS:      BaseRelocationTableRVA: 0x4000
 // HEADERS-NEXT: BaseRelocationTableSize: 0xC
 // HEADERS:      LoadConfigTableRVA: 0x1000
@@ -43,8 +58,73 @@
 // HEADERS-NEXT: VirtualSize: 0x38
 
 // RUN: lld-link -machine:arm64x -out:out-short.dll -dll -noentry loadconfig-short.obj 2>&1 | FileCheck --check-prefix=WARN-RELOC-SIZE %s
+// RUN: lld-link -machine:arm64x -out:out-short.dll -dll -noentry loadconfig-short-arm64ec.obj 2>&1 | FileCheck --check-prefix=WARN-RELOC-SIZE %s
 // WARN-RELOC-SIZE: lld-link: warning: '_load_config_used' structure too small to include dynamic relocations
 
+// Check that the CHPE metadata pointer is correctly copied from the EC load config to the native load config.
+
+// RUN: lld-link -machine:arm64x -out:out-hyb.dll -dll -noentry loadconfig.obj loadconfig-ec.obj chpe.obj test.obj
+
+// RUN: llvm-readobj --coff-load-config out-hyb.dll | FileCheck --check-prefix=LOADCFG %s
+// LOADCFG:      Format: COFF-ARM64X
+// LOADCFG-NEXT: Arch: aarch64
+// LOADCFG-NEXT: AddressSize: 64bit
+// LOADCFG-NEXT: LoadConfig [
+// LOADCFG-NEXT:   Size: 0x140
+// LOADCFG:      CHPEMetadata [
+// LOADCFG-NEXT:   Version: 0x2
+// LOADCFG:        RedirectionMetadata: 12288
+// LOADCFG:        AlternateEntryPoint: 0x0
+// LOADCFG-NEXT:   AuxiliaryIAT: 0x0
+// LOADCFG-NEXT:   GetX64InformationFunctionPointer: 0x0
+// LOADCFG-NEXT:   SetX64InformationFunctionPointer: 0x0
+// LOADCFG-NEXT:   ExtraRFETable: 0x0
+// LOADCFG-NEXT:   ExtraRFETableSize: 0x0
+// LOADCFG-NEXT:   __os_arm64x_dispatch_fptr: 0x0
+// LOADCFG-NEXT:   AuxiliaryIATCopy: 0x0
+// LOADCFG-NEXT:   AuxiliaryDelayloadIAT: 0x0
+// LOADCFG-NEXT:   AuxiliaryDelayloadIATCopy: 0x0
+// LOADCFG-NEXT:   HybridImageInfoBitfield: 0x0
+// LOADCFG:      ]
+// LOADCFG-NEXT: DynamicRelocations [
+// LOADCFG-NEXT:   Version: 0x1
+// LOADCFG-NEXT:   Arm64X [
+// LOADCFG-NEXT:     Entry [
+// LOADCFG-NEXT:       RVA: 0x7C
+// LOADCFG-NEXT:       Type: VALUE
+// LOADCFG-NEXT:       Size: 0x2
+// LOADCFG-NEXT:       Value: 0x8664
+// LOADCFG-NEXT:     ]
+// LOADCFG-NEXT:     Entry [
+// LOADCFG-NEXT:       RVA: 0x150
+// LOADCFG-NEXT:       Type: VALUE
+// LOADCFG-NEXT:       Size: 0x4
+// LOADCFG-NEXT:       Value: 0x0
+// LOADCFG-NEXT:     ]
+// LOADCFG-NEXT:     Entry [
+// LOADCFG-NEXT:       RVA: 0x154
+// LOADCFG-NEXT:       Type: VALUE
+// LOADCFG-NEXT:       Size: 0x4
+// LOADCFG-NEXT:       Value: 0x0
+// LOADCFG-NEXT:     ]
+// LOADCFG-NEXT:   ]
+// LOADCFG-NEXT: ]
+// LOADCFG-NEXT: HybridObject {
+// LOADCFG-NEXT:   Format: COFF-x86-64
+// LOADCFG-NEXT:   Arch: x86_64
+// LOADCFG-NEXT:   AddressSize: 64bit
+
+// RUN: llvm-readobj --coff-basereloc out-hyb.dll | FileCheck --check-prefix=BASERELOC %s
+// BASERELOC:      BaseReloc [
+// BASERELOC-NEXT:   Entry {
+// BASERELOC-NEXT:     Type: DIR64
+// BASERELOC-NEXT:     Address: 0x1208
+// BASERELOC-NEXT:   }
+// BASERELOC-NEXT:   Entry {
+// BASERELOC:          Type: DIR64
+// BASERELOC-NEXT:     Address: 0x2074
+// BASERELOC-NEXT:   }
+
 #--- test.s
         .data
 sym:
@@ -59,6 +139,16 @@ _load_config_used:
         .word 0x140
         .fill 0x13c,1,0
 
+#--- loadconfig-ec.s
+        .section .rdata,"dr"
+        .globl _load_config_used
+        .p2align 3, 0
+_load_config_used:
+        .word 0x140
+        .fill 0xc4,1,0
+        .xword __chpe_metadata
+        .fill 0x70,1,0
+
 #--- loadconfig-short.s
         .section .rdata,"dr"
         .globl _load_config_used
@@ -66,3 +156,38 @@ _load_config_used:
 _load_config_used:
         .word 0xe4
         .fill 0xe0,1,0
+
+#--- chpe.s
+        .data
+        .globl __chpe_metadata
+        .p2align 3, 0
+__chpe_metadata:
+        .word 2
+        .rva __hybrid_code_map
+        .word __hybrid_code_map_count
+        .rva __x64_code_ranges_to_entry_points
+        .rva __arm64x_redirection_metadata
+        .word 0 // __os_arm64x_dispatch_call_no_redirect
+        .word 0 // __os_arm64x_dispatch_ret
+        .word 0 // __os_arm64x_check_call
+        .word 0 // __os_arm64x_check_icall
+        .word 0 // __os_arm64x_check_icall_cfg
+        .rva __arm64x_native_entrypoint
+        .rva __hybrid_auxiliary_iat
+        .word __x64_code_ranges_to_entry_points_count
+        .word __arm64x_redirection_metadata_count
+        .word 0 // __os_arm64x_get_x64_information
+        .word 0 // __os_arm64x_set_x64_information
+        .rva __arm64x_extra_rfe_table
+        .word __arm64x_extra_rfe_table_size
+        .word 0 // __os_arm64x_dispatch_fptr
+        .rva __hybrid_auxiliary_iat_copy
+        .rva __hybrid_auxiliary_delayload_iat
+        .rva __hybrid_auxiliary_delayload_iat_copy
+        .word __hybrid_image_info_bitfield
+        .word 0 // __os_arm64x_helper3
+        .word 0 // __os_arm64x_helper4
+        .word 0 // __os_arm64x_helper5
+        .word 0 // __os_arm64x_helper6
+        .word 0 // __os_arm64x_helper7
+        .word 0 // __os_arm64x_helper8
diff --git a/lld/test/COFF/cgprofile-obj.s b/lld/test/COFF/cgprofile-obj.s
index b267850..c16aa2e 100644
--- a/lld/test/COFF/cgprofile-obj.s
+++ b/lld/test/COFF/cgprofile-obj.s
@@ -2,9 +2,12 @@
 
 # RUN: llvm-mc -filetype=obj -triple=x86_64-pc-win32 %s -o %t
 # RUN: lld-link /subsystem:console /entry:A %t /out:%t2 /debug:symtab
-# RUN: llvm-nm --numeric-sort %t2 | FileCheck %s
+# RUN: llvm-nm --numeric-sort %t2 | FileCheck %s --check-prefix=CG-OBJ
 # RUN: lld-link /call-graph-profile-sort:no /subsystem:console /entry:A %t /out:%t3 /debug:symtab
 # RUN: llvm-nm --numeric-sort %t3 | FileCheck %s --check-prefix=NO-CG
+# RUN: echo "D A 200" > %t.call_graph
+# RUN: lld-link /subsystem:console /entry:A %t /out:%t4 /debug:symtab /call-graph-ordering-file:%t.call_graph
+# RUN: llvm-nm --numeric-sort %t4 | FileCheck %s --check-prefix=CG-OBJ-OF
 
     .section    .text,"ax", one_only, D
 D:
@@ -33,13 +36,20 @@ Aa:
     .cg_profile B, C, 30
     .cg_profile C, D, 90
 
-# CHECK: 140001000 T A
-# CHECK: 140001001 T B
-# CHECK: 140001002 T C
-# CHECK: 140001003 t D
-
-
-# NO-CG: 140001000 t D
-# NO-CG: 140001001 T C
-# NO-CG: 140001002 T B
-# NO-CG: 140001003 T A
+# CG-OBJ:      140001000 T A
+# CG-OBJ-NEXT: 140001000 t Aa
+# CG-OBJ-NEXT: 140001001 T B
+# CG-OBJ-NEXT: 140001002 T C
+# CG-OBJ-NEXT: 140001003 t D
+
+# NO-CG:      140001000 t D
+# NO-CG-NEXT: 140001001 T C
+# NO-CG-NEXT: 140001002 T B
+# NO-CG-NEXT: 140001003 T A
+# NO-CG-NEXT: 140001003 t Aa
+
+# CG-OBJ-OF:      140001000 t D
+# CG-OBJ-OF-NEXT: 140001001 T A
+# CG-OBJ-OF-NEXT: 140001001 t Aa
+# CG-OBJ-OF-NEXT: 140001004 T C
+# CG-OBJ-OF-NEXT: 140001005 T B
diff --git a/lld/test/COFF/guard-warnings.s b/lld/test/COFF/guard-warnings.s
index 77448ee..0928715 100644
--- a/lld/test/COFF/guard-warnings.s
+++ b/lld/test/COFF/guard-warnings.s
@@ -38,7 +38,7 @@
 
 # RUN: llvm-mc -triple x86_64-windows-msvc %t/loadcfg-misaligned2.s -filetype=obj -o %t/loadcfg-misaligned2.obj
 # RUN: lld-link %t/main.obj %t/loadcfg-misaligned2.obj -guard:cf,longjmp,ehcont -out:%t-misaligned2.exe -entry:main %basename_t-exp.lib 2>&1 | FileCheck %s --check-prefix=WARN_ALIGN2
-# WARN_ALIGN2: warning: '_load_config_used' is misaligned (RVA is 0x{{[0-9A-F]*}}2 not aligned to 8 bytes)
+# WARN_ALIGN2: warning: '_load_config_used' is misaligned (section offset is 0x{{[0-9A-F]*}}2 not aligned to 8 bytes)
 
 # RUN: llvm-mc -triple x86_64-windows-msvc %t/loadcfg-full.s -filetype=obj -o %t/loadcfg-full.obj
 # RUN: lld-link %t/main.obj %t/loadcfg-full.obj -guard:cf,longjmp,ehcont -out:%t.exe -entry:main %basename_t-exp.lib 2>&1 | FileCheck %s --check-prefix=NOWARN --allow-empty
diff --git a/lld/test/COFF/linkrepro-thin-archives.s b/lld/test/COFF/linkrepro-thin-archives.s
new file mode 100644
index 0000000..6fde36b
--- /dev/null
+++ b/lld/test/COFF/linkrepro-thin-archives.s
@@ -0,0 +1,23 @@
+# REQUIRES: x86
+
+# RUN: rm -rf %t.dir; split-file %s %t.dir
+
+# RUN: llvm-mc -filetype=obj -triple=x86_64-pc-windows %t.dir/foo.s -o %t.dir/foo.obj
+# RUN: cd %t.dir
+# RUN: llvm-ar rcsT foo.lib foo.obj
+
+# RUN: lld-link foo.lib /out:/dev/null /reproduce:repro.tar \
+# RUN:     /subsystem:console /machine:x64
+# RUN: tar tf repro.tar | FileCheck -DPATH='repro/%:t.dir' %s
+
+# RUN: lld-link /wholearchive foo.lib /out:/dev/null /reproduce:repro2.tar \
+# RUN:     /subsystem:console /machine:x64
+# RUN: tar tf repro2.tar | FileCheck -DPATH='repro2/%:t.dir' %s
+
+# CHECK-DAG: [[PATH]]/foo.lib
+# CHECK-DAG: [[PATH]]/foo.obj
+
+#--- foo.s
+.globl mainCRTStartup
+mainCRTStartup:
+  nop
diff --git a/lld/test/COFF/loadcfg-short.test b/lld/test/COFF/loadcfg-short.test
new file mode 100644
index 0000000..dd4d438
--- /dev/null
+++ b/lld/test/COFF/loadcfg-short.test
@@ -0,0 +1,33 @@
+# RUN: yaml2obj %s -o %t.obj
+# RUN: not lld-link -out:%t.dll %t.obj -dll -noentry 2>&1 | FileCheck %s
+# CHECK: lld-link: error: _load_config_used section chunk is too small
+
+--- !COFF
+header:
+  Machine:         IMAGE_FILE_MACHINE_AMD64
+  Characteristics: []
+sections:
+  - Name:            .rdata
+    Characteristics: [ IMAGE_SCN_CNT_INITIALIZED_DATA, IMAGE_SCN_MEM_READ ]
+    Alignment:       16
+    SectionData:     '030000'
+symbols:
+  - Name:            .rdata
+    Value:           0
+    SectionNumber:   1
+    SimpleType:      IMAGE_SYM_TYPE_NULL
+    ComplexType:     IMAGE_SYM_DTYPE_NULL
+    StorageClass:    IMAGE_SYM_CLASS_STATIC
+    SectionDefinition:
+      Length:          112
+      NumberOfRelocations: 0
+      NumberOfLinenumbers: 0
+      CheckSum:        0
+      Number:          3
+  - Name:            _load_config_used
+    Value:           0
+    SectionNumber:   1
+    SimpleType:      IMAGE_SYM_TYPE_NULL
+    ComplexType:     IMAGE_SYM_DTYPE_NULL
+    StorageClass:    IMAGE_SYM_CLASS_EXTERNAL
+...
diff --git a/lld/test/COFF/loadcfg-size.test b/lld/test/COFF/loadcfg-size.test
new file mode 100644
index 0000000..871590f
--- /dev/null
+++ b/lld/test/COFF/loadcfg-size.test
@@ -0,0 +1,33 @@
+# RUN: yaml2obj %s -o %t.obj
+# RUN: not lld-link -out:%t.dll %t.obj -dll -noentry 2>&1 | FileCheck %s
+# CHECK: lld-link: error: _load_config_used specifies a size larger than its containing section chunk
+
+--- !COFF
+header:
+  Machine:         IMAGE_FILE_MACHINE_AMD64
+  Characteristics: []
+sections:
+  - Name:            .rdata
+    Characteristics: [ IMAGE_SCN_CNT_INITIALIZED_DATA, IMAGE_SCN_MEM_READ ]
+    Alignment:       16
+    SectionData:     '0c00000000000000'
+symbols:
+  - Name:            .rdata
+    Value:           0
+    SectionNumber:   1
+    SimpleType:      IMAGE_SYM_TYPE_NULL
+    ComplexType:     IMAGE_SYM_DTYPE_NULL
+    StorageClass:    IMAGE_SYM_CLASS_STATIC
+    SectionDefinition:
+      Length:          112
+      NumberOfRelocations: 0
+      NumberOfLinenumbers: 0
+      CheckSum:        0
+      Number:          3
+  - Name:            _load_config_used
+    Value:           0
+    SectionNumber:   1
+    SimpleType:      IMAGE_SYM_TYPE_NULL
+    ComplexType:     IMAGE_SYM_DTYPE_NULL
+    StorageClass:    IMAGE_SYM_CLASS_EXTERNAL
+...
diff --git a/lld/test/COFF/loadcfg-uninitialized.test b/lld/test/COFF/loadcfg-uninitialized.test
new file mode 100644
index 0000000..5f956bc
--- /dev/null
+++ b/lld/test/COFF/loadcfg-uninitialized.test
@@ -0,0 +1,33 @@
+# RUN: yaml2obj %s -o %t.obj
+# RUN: not lld-link -out:%t.dll %t.obj -dll -noentry 2>&1 | FileCheck %s
+# CHECK: lld-link: error: _load_config_used points to uninitialized data
+
+--- !COFF
+header:
+  Machine:         IMAGE_FILE_MACHINE_AMD64
+  Characteristics: []
+sections:
+  - Name:            .rdata
+    Characteristics: [ IMAGE_SCN_CNT_UNINITIALIZED_DATA, IMAGE_SCN_MEM_READ ]
+    Alignment:       16
+    VirtualSize:     0x140
+symbols:
+  - Name:            .rdata
+    Value:           0
+    SectionNumber:   1
+    SimpleType:      IMAGE_SYM_TYPE_NULL
+    ComplexType:     IMAGE_SYM_DTYPE_NULL
+    StorageClass:    IMAGE_SYM_CLASS_STATIC
+    SectionDefinition:
+      Length:          112
+      NumberOfRelocations: 0
+      NumberOfLinenumbers: 0
+      CheckSum:        0
+      Number:          3
+  - Name:            _load_config_used
+    Value:           0
+    SectionNumber:   1
+    SimpleType:      IMAGE_SYM_TYPE_NULL
+    ComplexType:     IMAGE_SYM_DTYPE_NULL
+    StorageClass:    IMAGE_SYM_CLASS_EXTERNAL
+...
diff --git a/lld/test/ELF/cgprofile-obj.s b/lld/test/ELF/cgprofile-obj.s
index 0848adc5..358dcd9 100644
--- a/lld/test/ELF/cgprofile-obj.s
+++ b/lld/test/ELF/cgprofile-obj.s
@@ -2,12 +2,15 @@
 
 # RUN: llvm-mc -filetype=obj -triple=x86_64-unknown-linux %s -o %t.o
 # RUN: ld.lld -e A %t.o -o %t
-# RUN: llvm-nm --no-sort %t | FileCheck %s
+# RUN: llvm-nm --no-sort %t | FileCheck %s --check-prefix=CG-OBJ
 # RUN: ld.lld --call-graph-profile-sort=none -e A %t.o -o %t
 # RUN: llvm-nm --no-sort %t | FileCheck %s --check-prefix=NO-CG
 ## --no-call-graph-profile-sort is an alias for --call-graph-profile-sort=none.
 # RUN: ld.lld --no-call-graph-profile-sort -e A %t.o -o %t1
 # RUN: cmp %t %t1
+# RUN: echo "D A 200" > %t.call_graph
+# RUN: ld.lld -e A %t.o -call-graph-ordering-file=%t.call_graph -o %t2
+# RUN: llvm-nm --no-sort %t2 | FileCheck %s --check-prefix=CG-OBJ-OF
 
     .section    .text.D,"ax",@progbits
 D:
@@ -36,12 +39,20 @@ Aa:
     .cg_profile B, C, 30
     .cg_profile C, D, 90
 
-# CHECK: 0000000000201123 t D
-# CHECK: 0000000000201122 T C
-# CHECK: 0000000000201121 T B
-# CHECK: 0000000000201120 T A
-
-# NO-CG: 0000000000201120 t D
-# NO-CG: 0000000000201121 T C
-# NO-CG: 0000000000201122 T B
-# NO-CG: 0000000000201123 T A
+# CG-OBJ:      0000000000201123 t D
+# CG-OBJ-NEXT: 0000000000201120 t Aa
+# CG-OBJ-NEXT: 0000000000201122 T C
+# CG-OBJ-NEXT: 0000000000201121 T B
+# CG-OBJ-NEXT: 0000000000201120 T A
+
+# NO-CG:      0000000000201120 t D
+# NO-CG-NEXT: 0000000000201123 t Aa
+# NO-CG-NEXT: 0000000000201121 T C
+# NO-CG-NEXT: 0000000000201122 T B
+# NO-CG-NEXT: 0000000000201123 T A
+
+# CG-OBJ-OF:      0000000000201120 t D
+# CG-OBJ-OF-NEXT: 0000000000201121 t Aa
+# CG-OBJ-OF-NEXT: 0000000000201124 T C
+# CG-OBJ-OF-NEXT: 0000000000201125 T B
+# CG-OBJ-OF-NEXT: 0000000000201121 T A
diff --git a/lld/test/ELF/cgprofile-orderfile.s b/lld/test/ELF/cgprofile-orderfile.s
new file mode 100644
index 0000000..584b2ec
--- /dev/null
+++ b/lld/test/ELF/cgprofile-orderfile.s
@@ -0,0 +1,41 @@
+# REQUIRES: x86
+
+# RUN: rm -rf %t && split-file %s %t && cd %t
+# RUN: llvm-mc -filetype=obj -triple=x86_64 a.s -o a.o
+
+# RUN: ld.lld -e A a.o --symbol-ordering-file=order --call-graph-profile-sort=hfsort -o out
+# RUN: llvm-nm --numeric-sort out | FileCheck %s
+# RUN: ld.lld -e A a.o --call-graph-profile-sort=hfsort -o out1
+# RUN: llvm-nm --numeric-sort out1 | FileCheck %s --check-prefix=ONLY-CG
+
+#--- order
+B
+A
+
+#--- a.s
+.section .text.D,"ax"; .globl D; D:
+  retq
+
+.section .text.C,"ax"; .globl C; C:
+  call D
+
+.section .text.B,"ax"; .globl B; B:
+  retq
+
+.section .text.A,"ax"; .globl A; A:
+  call B
+  call C
+
+.cg_profile A, B, 100
+.cg_profile A, C,  40
+.cg_profile C, D,  61
+
+# CHECK:      T B
+# CHECK-NEXT: T A
+# CHECK-NEXT: T C
+# CHECK-NEXT: T D
+
+# ONLY-CG:      T A
+# ONLY-CG-NEXT: T B
+# ONLY-CG-NEXT: T C
+# ONLY-CG-NEXT: T D
diff --git a/lld/test/ELF/linkerscript/discard-section-dynsym.s b/lld/test/ELF/linkerscript/discard-section-dynsym.s
index 7c7c9c2..f5d483d 100644
--- a/lld/test/ELF/linkerscript/discard-section-dynsym.s
+++ b/lld/test/ELF/linkerscript/discard-section-dynsym.s
@@ -1,24 +1,43 @@
 # REQUIRES: aarch64
 
 ## We allow discarding .dynsym, check we don't crash.
-# RUN: llvm-mc -filetype=obj -triple=aarch64 %s -o %t.o
+# RUN: rm -rf %t && split-file %s %t && cd %t
+# RUN: llvm-mc -filetype=obj -triple=aarch64 a.s -o a.o
+# RUN: llvm-mc -filetype=obj -triple=aarch64 c.s -o c.o
+# RUN: ld.lld -shared --version-script=c.ver c.o -o c.so
 
-# RUN: echo 'SECTIONS { /DISCARD/ : { *(.dynsym) } }' > %t.lds
-# RUN: ld.lld -shared -T %t.lds %t.o -o %t.so
-# RUN: llvm-readelf -r %t.so | FileCheck %s
+# RUN: echo 'SECTIONS { /DISCARD/ : { *(.dynsym) } }' > 1.lds
+# RUN: ld.lld -shared -T 1.lds a.o c.so -o out1.so
+# RUN: llvm-readelf -Sr out1.so | FileCheck %s --check-prefixes=CHECK,CHECK1
 
-# RUN: echo 'SECTIONS { /DISCARD/ : { *(.dynsym .dynstr) } }' > %t.lds
-# RUN: ld.lld -shared -T %t.lds %t.o -o %t.so
-# RUN: llvm-readelf -r %t.so | FileCheck %s
+# RUN: echo 'SECTIONS { /DISCARD/ : { *(.dynsym .dynstr) } }' > 2.lds
+# RUN: ld.lld -shared -T 2.lds a.o c.so -o out2.so
+# RUN: llvm-readelf -Sr out2.so | FileCheck %s --check-prefixes=CHECK,CHECK2
+
+# CHECK:       [Nr] Name              Type            Address          Off    Size   ES Flg Lk Inf Al
+# CHECK-NEXT:  [ 0]                   NULL            0000000000000000 000000 000000 00      0   0  0
+# CHECK-NEXT:  [ 1] .gnu.version      VERSYM          0000000000000000 {{.*}} 000006 02   A  0   0  2
+# CHECK1-NEXT: [ 2] .gnu.version_r    VERNEED         0000000000000008 {{.*}} 000020 00   A  5   1  4
+# CHECK2-NEXT: [ 2] .gnu.version_r    VERNEED         0000000000000008 {{.*}} 000020 00   A  0   1  4
+# CHECK1:      [ 5] .dynstr           STRTAB
 
 # CHECK:      contains 2 entries:
 # CHECK:      R_AARCH64_RELATIVE  [[#]]
 # CHECK-NEXT: R_AARCH64_GLOB_DAT  0{{$}}
 
+#--- a.s
   adrp x9, :got:var
   ldr  x9, [x9, :got_lo12:var]
+  bl __libc_start_main
 
 .data
 .align 8
 foo:
 .quad foo
+
+#--- c.s
+.globl __libc_start_main
+__libc_start_main:
+
+#--- c.ver
+GLIBC_2.34 { __libc_start_main; };
diff --git a/lld/test/MachO/arm64-thunks.s b/lld/test/MachO/arm64-thunks.s
index d887359..858a27d 100644
--- a/lld/test/MachO/arm64-thunks.s
+++ b/lld/test/MachO/arm64-thunks.s
@@ -8,14 +8,46 @@
 ## (4) early calls to a dylib stub use a thunk, and later calls the stub
 ##     directly
 ## (5) Thunks are created for all sections in the text segment with branches.
+## (6) Thunks are in the linker map file.
 ## Notes:
 ## 0x4000000 = 64 Mi = half the magnitude of the forward-branch range
 
 # RUN: rm -rf %t; mkdir %t
 # RUN: llvm-mc -filetype=obj -triple=arm64-apple-darwin %s -o %t/input.o
-# RUN: %lld -arch arm64 -dead_strip -lSystem -U _extern_sym -o %t/thunk %t/input.o
+# RUN: %lld -arch arm64 -dead_strip -lSystem -U _extern_sym -map %t/thunk.map -o %t/thunk %t/input.o
 # RUN: llvm-objdump --no-print-imm-hex -d --no-show-raw-insn %t/thunk | FileCheck %s
 
+## Check that the thunks appear in the map file and that everything is sorted by address
+# Because of the `.space` instructions, there will end up being a lot of dead symbols in the 
+# linker map (linker map will be ~2.7GB). So to avoid the test trying to (slowly) match regex
+# across all the ~2.7GB of the linker map - generate a version of the linker map without dead symbols.
+# RUN: awk '/# Dead Stripped Symbols:/ {exit} {print}' %t/thunk.map > %t/thunk_no_dead_syms.map
+
+# RUN: FileCheck %s --input-file %t/thunk_no_dead_syms.map --check-prefix=MAP
+ 
+# MAP:      0x{{[[:xdigit:]]+}} {{.*}} _b
+# MAP-NEXT: 0x{{[[:xdigit:]]+}} {{.*}} _c
+# MAP-NEXT: 0x{{[[:xdigit:]]+}} {{.*}} _d.thunk.0
+# MAP-NEXT: 0x{{[[:xdigit:]]+}} {{.*}} _e.thunk.0
+# MAP-NEXT: 0x{{[[:xdigit:]]+}} {{.*}} _f.thunk.0
+# MAP-NEXT: 0x{{[[:xdigit:]]+}} {{.*}} _g.thunk.0
+# MAP-NEXT: 0x{{[[:xdigit:]]+}} {{.*}} _h.thunk.0
+# MAP-NEXT: 0x{{[[:xdigit:]]+}} {{.*}} ___nan.thunk.0
+# MAP-NEXT: 0x{{[[:xdigit:]]+}} {{.*}} _d
+# MAP-NEXT: 0x{{[[:xdigit:]]+}} {{.*}} _e
+# MAP-NEXT: 0x{{[[:xdigit:]]+}} {{.*}} _f
+# MAP-NEXT: 0x{{[[:xdigit:]]+}} {{.*}} _g
+# MAP-NEXT: 0x{{[[:xdigit:]]+}} {{.*}} _a.thunk.0
+# MAP-NEXT: 0x{{[[:xdigit:]]+}} {{.*}} _b.thunk.0
+# MAP-NEXT: 0x{{[[:xdigit:]]+}} {{.*}} _h
+# MAP-NEXT: 0x{{[[:xdigit:]]+}} {{.*}} _main
+# MAP-NEXT: 0x{{[[:xdigit:]]+}} {{.*}} _c.thunk.0
+# MAP-NEXT: 0x{{[[:xdigit:]]+}} {{.*}} _d.thunk.1
+# MAP-NEXT: 0x{{[[:xdigit:]]+}} {{.*}} _e.thunk.1
+# MAP-NEXT: 0x{{[[:xdigit:]]+}} {{.*}} _f.thunk.1
+# MAP-NEXT: 0x{{[[:xdigit:]]+}} {{.*}} _z
+
+
 # CHECK: Disassembly of section __TEXT,__text:
 
 # CHECK: [[#%.13x, A_PAGE:]][[#%.3x, A_OFFSET:]] <_a>:
diff --git a/lld/test/MachO/bp-section-orderer-errs.s b/lld/test/MachO/bp-section-orderer-errs.s
index 8d19e01..abeb251 100644
--- a/lld/test/MachO/bp-section-orderer-errs.s
+++ b/lld/test/MachO/bp-section-orderer-errs.s
@@ -14,8 +14,9 @@
 # RUN: not %lld -o /dev/null --bp-compression-sort-startup-functions 2>&1 | FileCheck %s --check-prefix=STARTUP
 # STARTUP: --bp-compression-sort-startup-functions must be used with --bp-startup-sort=function
 
+# RUN: not %lld -o /dev/null --irpgo-profile %s --bp-startup-sort=function --call-graph-profile-sort 2>&1 | FileCheck %s --check-prefix=IRPGO-STARTUP
 # RUN: not %lld -o /dev/null --irpgo-profile=%s --bp-startup-sort=function --call-graph-profile-sort 2>&1 | FileCheck %s --check-prefix=IRPGO-STARTUP
 # IRPGO-STARTUP: --bp-startup-sort= is incompatible with --call-graph-profile-sort
 
 # RUN: not %lld -o /dev/null --bp-startup-sort=function 2>&1 | FileCheck %s --check-prefix=STARTUP-COMPRESSION
-# STARTUP-COMPRESSION: --bp-startup-sort=function must be used with --irpgo-profile
-\ No newline at end of file
+# STARTUP-COMPRESSION: --bp-startup-sort=function must be used with --irpgo-profile
diff --git a/lld/test/MachO/bp-section-orderer.s b/lld/test/MachO/bp-section-orderer.s
index e5d0e71..2eaff04 100644
--- a/lld/test/MachO/bp-section-orderer.s
+++ b/lld/test/MachO/bp-section-orderer.s
@@ -7,7 +7,7 @@
 # RUN: %no-fatal-warnings-lld -arch arm64 -lSystem -e _main -o %t/a.out %t/a.o --irpgo-profile-sort=%t/a.profdata --verbose-bp-section-orderer 2>&1 | FileCheck %s --check-prefix=STARTUP
 # RUN: %no-fatal-warnings-lld -arch arm64 -lSystem -e _main -o %t/a.out %t/a.o --irpgo-profile-sort=%t/a.profdata --verbose-bp-section-orderer --icf=all --compression-sort=none 2>&1 | FileCheck %s --check-prefix=STARTUP
 
-# RUN: %lld -arch arm64 -lSystem -e _main -o %t/a.out %t/a.o --irpgo-profile=%t/a.profdata --bp-startup-sort=function --verbose-bp-section-orderer 2>&1 | FileCheck %s --check-prefix=STARTUP
+# RUN: %lld -arch arm64 -lSystem -e _main -o %t/a.out %t/a.o --irpgo-profile %t/a.profdata --bp-startup-sort=function --verbose-bp-section-orderer 2>&1 | FileCheck %s --check-prefix=STARTUP
 # RUN: %lld -arch arm64 -lSystem -e _main -o %t/a.out %t/a.o --irpgo-profile=%t/a.profdata --bp-startup-sort=function --verbose-bp-section-orderer --icf=all --bp-compression-sort=none 2>&1 | FileCheck %s --check-prefix=STARTUP
 # STARTUP: Ordered 3 sections using balanced partitioning
 
diff --git a/lld/test/wasm/dylink-non-pie.s b/lld/test/wasm/dylink-non-pie.s
new file mode 100755
index 0000000..3157b8c
--- /dev/null
+++ b/lld/test/wasm/dylink-non-pie.s
@@ -0,0 +1,38 @@
+# RUN: llvm-mc -filetype=obj -triple=wasm32-unknown-unknown -o %t.lib.o %p/Inputs/ret32.s
+# RUN: wasm-ld -m wasm32 --experimental-pic -shared --no-entry %t.lib.o -o %t.lib.so
+# RUN: llvm-mc -filetype=obj -triple=wasm32-unknown-unknown -o %t.o %s
+# RUN: wasm-ld -m wasm32 -Bdynamic %t.o %t.lib.so -o %t.wasm
+# RUN: obj2yaml %t.wasm | FileCheck %s
+# RUN: llvm-objdump -d --no-show-raw-insn --no-leading-addr %t.wasm | FileCheck %s --check-prefixes DIS
+
+	.functype	ret32 (f32) -> (i32)
+	.globl	_start
+_start:
+	.functype	_start () -> ()
+	i32.const   f_p
+	drop
+	end_function
+
+	.section	.data.f_p,"",@
+f_p:
+	.int32	ret32
+	.size	f_p, 4
+
+# CHECK:      Sections:
+# CHECK-NEXT:   - Type:            CUSTOM
+# CHECK-NEXT:     Name:            dylink.0
+
+# non-pie executable doesn't import __memory_base
+# CHECK:   - Type:            IMPORT
+# CHECK-NOT: Field:           __memory_base
+
+# CHECK:   - Type:            EXPORT
+# CHECK:       - Name:            __wasm_apply_data_relocs
+# CHECK-NEXT:         Kind:            FUNCTION
+
+# DIS:    <__wasm_apply_data_relocs>:
+# DIS-EMPTY:
+# DIS-NEXT:    i32.const       1024
+# DIS-NEXT:    global.get      0
+# DIS-NEXT:    i32.store       0
+# DIS-NEXT:    end
diff --git a/lld/wasm/Config.h b/lld/wasm/Config.h
index 0c2ba3e..1fa6c42 100644
--- a/lld/wasm/Config.h
+++ b/lld/wasm/Config.h
@@ -126,17 +126,9 @@ struct Config {
   llvm::SmallVector<uint8_t, 0> buildIdVector;
 };
 
-struct ConfigWrapper {
-  Config c;
-  Config *operator->() { return &c; }
-};
-
-// The only instance of Configuration struct.
-extern ConfigWrapper config;
-
 // The Ctx object hold all other (non-configuration) global state.
 struct Ctx {
-  Config &arg;
+  Config arg;
 
   llvm::SmallVector<ObjFile *, 0> objectFiles;
   llvm::SmallVector<StubFile *, 0> stubFiles;
diff --git a/lld/wasm/Driver.cpp b/lld/wasm/Driver.cpp
index 0247195..c3a74dd 100644
--- a/lld/wasm/Driver.cpp
+++ b/lld/wasm/Driver.cpp
@@ -44,17 +44,16 @@ using namespace llvm::sys;
 using namespace llvm::wasm;
 
 namespace lld::wasm {
-ConfigWrapper config;
 Ctx ctx;
 
 void errorOrWarn(const llvm::Twine &msg) {
-  if (config->noinhibitExec)
+  if (ctx.arg.noinhibitExec)
     warn(msg);
   else
     error(msg);
 }
 
-Ctx::Ctx() : arg(config.c) {}
+Ctx::Ctx() {}
 
 void Ctx::reset() {
   arg.~Config();
@@ -268,7 +267,7 @@ opt::InputArgList WasmOptTable::parse(ArrayRef<const char *> argv) {
 static void readImportFile(StringRef filename) {
   if (std::optional<MemoryBufferRef> buf = readFile(filename))
     for (StringRef sym : args::getLines(*buf))
-      config->allowUndefinedSymbols.insert(sym);
+      ctx.arg.allowUndefinedSymbols.insert(sym);
 }
 
 // Returns slices of MB by parsing MB as an archive file.
@@ -345,7 +344,7 @@ void LinkerDriver::addFile(StringRef path) {
   case file_magic::bitcode:
   case file_magic::wasm_object: {
     auto obj = createObjectFile(mbref, "", 0, inLib);
-    if (config->isStatic && isa<SharedFile>(obj)) {
+    if (ctx.arg.isStatic && isa<SharedFile>(obj)) {
       error("attempted static link of dynamic object " + path);
       break;
     }
@@ -364,7 +363,7 @@ void LinkerDriver::addFile(StringRef path) {
 }
 
 static std::optional<std::string> findFromSearchPaths(StringRef path) {
-  for (StringRef dir : config->searchPaths)
+  for (StringRef dir : ctx.arg.searchPaths)
     if (std::optional<std::string> s = findFile(dir, path))
       return s;
   return std::nullopt;
@@ -373,8 +372,8 @@ static std::optional<std::string> findFromSearchPaths(StringRef path) {
 // This is for -l<basename>. We'll look for lib<basename>.a from
 // search paths.
 static std::optional<std::string> searchLibraryBaseName(StringRef name) {
-  for (StringRef dir : config->searchPaths) {
-    if (!config->isStatic)
+  for (StringRef dir : ctx.arg.searchPaths) {
+    if (!ctx.arg.isStatic)
       if (std::optional<std::string> s = findFile(dir, "lib" + name + ".so"))
         return s;
     if (std::optional<std::string> s = findFile(dir, "lib" + name + ".a"))
@@ -408,10 +407,10 @@ void LinkerDriver::createFiles(opt::InputArgList &args) {
       addFile(arg->getValue());
       break;
     case OPT_Bstatic:
-      config->isStatic = true;
+      ctx.arg.isStatic = true;
       break;
     case OPT_Bdynamic:
-      config->isStatic = false;
+      ctx.arg.isStatic = false;
       break;
     case OPT_whole_archive:
       inWholeArchive = true;
@@ -527,99 +526,98 @@ getBuildId(opt::InputArgList &args) {
 
 // Initializes Config members by the command line options.
 static void readConfigs(opt::InputArgList &args) {
-  config->allowMultipleDefinition =
+  ctx.arg.allowMultipleDefinition =
       hasZOption(args, "muldefs") ||
       args.hasFlag(OPT_allow_multiple_definition,
                    OPT_no_allow_multiple_definition, false);
-  config->bsymbolic = args.hasArg(OPT_Bsymbolic);
-  config->checkFeatures =
+  ctx.arg.bsymbolic = args.hasArg(OPT_Bsymbolic);
+  ctx.arg.checkFeatures =
       args.hasFlag(OPT_check_features, OPT_no_check_features, true);
-  config->compressRelocations = args.hasArg(OPT_compress_relocations);
-  config->demangle = args.hasFlag(OPT_demangle, OPT_no_demangle, true);
-  config->disableVerify = args.hasArg(OPT_disable_verify);
-  config->emitRelocs = args.hasArg(OPT_emit_relocs);
-  config->experimentalPic = args.hasArg(OPT_experimental_pic);
-  config->entry = getEntry(args);
-  config->exportAll = args.hasArg(OPT_export_all);
-  config->exportTable = args.hasArg(OPT_export_table);
-  config->growableTable = args.hasArg(OPT_growable_table);
-  config->noinhibitExec = args.hasArg(OPT_noinhibit_exec);
+  ctx.arg.compressRelocations = args.hasArg(OPT_compress_relocations);
+  ctx.arg.demangle = args.hasFlag(OPT_demangle, OPT_no_demangle, true);
+  ctx.arg.disableVerify = args.hasArg(OPT_disable_verify);
+  ctx.arg.emitRelocs = args.hasArg(OPT_emit_relocs);
+  ctx.arg.experimentalPic = args.hasArg(OPT_experimental_pic);
+  ctx.arg.entry = getEntry(args);
+  ctx.arg.exportAll = args.hasArg(OPT_export_all);
+  ctx.arg.exportTable = args.hasArg(OPT_export_table);
+  ctx.arg.growableTable = args.hasArg(OPT_growable_table);
+  ctx.arg.noinhibitExec = args.hasArg(OPT_noinhibit_exec);
 
   if (args.hasArg(OPT_import_memory_with_name)) {
-    config->memoryImport =
+    ctx.arg.memoryImport =
         args.getLastArgValue(OPT_import_memory_with_name).split(",");
   } else if (args.hasArg(OPT_import_memory)) {
-    config->memoryImport =
+    ctx.arg.memoryImport =
         std::pair<llvm::StringRef, llvm::StringRef>(defaultModule, memoryName);
   } else {
-    config->memoryImport =
+    ctx.arg.memoryImport =
         std::optional<std::pair<llvm::StringRef, llvm::StringRef>>();
   }
 
   if (args.hasArg(OPT_export_memory_with_name)) {
-    config->memoryExport =
-        args.getLastArgValue(OPT_export_memory_with_name);
+    ctx.arg.memoryExport = args.getLastArgValue(OPT_export_memory_with_name);
   } else if (args.hasArg(OPT_export_memory)) {
-    config->memoryExport = memoryName;
+    ctx.arg.memoryExport = memoryName;
   } else {
-    config->memoryExport = std::optional<llvm::StringRef>();
+    ctx.arg.memoryExport = std::optional<llvm::StringRef>();
   }
 
-  config->sharedMemory = args.hasArg(OPT_shared_memory);
-  config->soName = args.getLastArgValue(OPT_soname);
-  config->importTable = args.hasArg(OPT_import_table);
-  config->importUndefined = args.hasArg(OPT_import_undefined);
-  config->ltoo = args::getInteger(args, OPT_lto_O, 2);
-  if (config->ltoo > 3)
-    error("invalid optimization level for LTO: " + Twine(config->ltoo));
+  ctx.arg.sharedMemory = args.hasArg(OPT_shared_memory);
+  ctx.arg.soName = args.getLastArgValue(OPT_soname);
+  ctx.arg.importTable = args.hasArg(OPT_import_table);
+  ctx.arg.importUndefined = args.hasArg(OPT_import_undefined);
+  ctx.arg.ltoo = args::getInteger(args, OPT_lto_O, 2);
+  if (ctx.arg.ltoo > 3)
+    error("invalid optimization level for LTO: " + Twine(ctx.arg.ltoo));
   unsigned ltoCgo =
-      args::getInteger(args, OPT_lto_CGO, args::getCGOptLevel(config->ltoo));
+      args::getInteger(args, OPT_lto_CGO, args::getCGOptLevel(ctx.arg.ltoo));
   if (auto level = CodeGenOpt::getLevel(ltoCgo))
-    config->ltoCgo = *level;
+    ctx.arg.ltoCgo = *level;
   else
     error("invalid codegen optimization level for LTO: " + Twine(ltoCgo));
-  config->ltoPartitions = args::getInteger(args, OPT_lto_partitions, 1);
-  config->ltoObjPath = args.getLastArgValue(OPT_lto_obj_path_eq);
-  config->ltoDebugPassManager = args.hasArg(OPT_lto_debug_pass_manager);
-  config->mapFile = args.getLastArgValue(OPT_Map);
-  config->optimize = args::getInteger(args, OPT_O, 1);
-  config->outputFile = args.getLastArgValue(OPT_o);
-  config->relocatable = args.hasArg(OPT_relocatable);
-  config->gcSections =
-      args.hasFlag(OPT_gc_sections, OPT_no_gc_sections, !config->relocatable);
+  ctx.arg.ltoPartitions = args::getInteger(args, OPT_lto_partitions, 1);
+  ctx.arg.ltoObjPath = args.getLastArgValue(OPT_lto_obj_path_eq);
+  ctx.arg.ltoDebugPassManager = args.hasArg(OPT_lto_debug_pass_manager);
+  ctx.arg.mapFile = args.getLastArgValue(OPT_Map);
+  ctx.arg.optimize = args::getInteger(args, OPT_O, 1);
+  ctx.arg.outputFile = args.getLastArgValue(OPT_o);
+  ctx.arg.relocatable = args.hasArg(OPT_relocatable);
+  ctx.arg.gcSections =
+      args.hasFlag(OPT_gc_sections, OPT_no_gc_sections, !ctx.arg.relocatable);
   for (auto *arg : args.filtered(OPT_keep_section))
-    config->keepSections.insert(arg->getValue());
-  config->mergeDataSegments =
+    ctx.arg.keepSections.insert(arg->getValue());
+  ctx.arg.mergeDataSegments =
       args.hasFlag(OPT_merge_data_segments, OPT_no_merge_data_segments,
-                   !config->relocatable);
-  config->pie = args.hasFlag(OPT_pie, OPT_no_pie, false);
-  config->printGcSections =
+                   !ctx.arg.relocatable);
+  ctx.arg.pie = args.hasFlag(OPT_pie, OPT_no_pie, false);
+  ctx.arg.printGcSections =
       args.hasFlag(OPT_print_gc_sections, OPT_no_print_gc_sections, false);
-  config->saveTemps = args.hasArg(OPT_save_temps);
-  config->searchPaths = args::getStrings(args, OPT_library_path);
-  config->shared = args.hasArg(OPT_shared);
-  config->shlibSigCheck = !args.hasArg(OPT_no_shlib_sigcheck);
-  config->stripAll = args.hasArg(OPT_strip_all);
-  config->stripDebug = args.hasArg(OPT_strip_debug);
-  config->stackFirst = args.hasArg(OPT_stack_first);
-  config->trace = args.hasArg(OPT_trace);
-  config->thinLTOCacheDir = args.getLastArgValue(OPT_thinlto_cache_dir);
-  config->thinLTOCachePolicy = CHECK(
+  ctx.arg.saveTemps = args.hasArg(OPT_save_temps);
+  ctx.arg.searchPaths = args::getStrings(args, OPT_library_path);
+  ctx.arg.shared = args.hasArg(OPT_shared);
+  ctx.arg.shlibSigCheck = !args.hasArg(OPT_no_shlib_sigcheck);
+  ctx.arg.stripAll = args.hasArg(OPT_strip_all);
+  ctx.arg.stripDebug = args.hasArg(OPT_strip_debug);
+  ctx.arg.stackFirst = args.hasArg(OPT_stack_first);
+  ctx.arg.trace = args.hasArg(OPT_trace);
+  ctx.arg.thinLTOCacheDir = args.getLastArgValue(OPT_thinlto_cache_dir);
+  ctx.arg.thinLTOCachePolicy = CHECK(
       parseCachePruningPolicy(args.getLastArgValue(OPT_thinlto_cache_policy)),
       "--thinlto-cache-policy: invalid cache policy");
-  config->thinLTOEmitImportsFiles = args.hasArg(OPT_thinlto_emit_imports_files);
-  config->thinLTOEmitIndexFiles = args.hasArg(OPT_thinlto_emit_index_files) ||
+  ctx.arg.thinLTOEmitImportsFiles = args.hasArg(OPT_thinlto_emit_imports_files);
+  ctx.arg.thinLTOEmitIndexFiles = args.hasArg(OPT_thinlto_emit_index_files) ||
                                   args.hasArg(OPT_thinlto_index_only) ||
                                   args.hasArg(OPT_thinlto_index_only_eq);
-  config->thinLTOIndexOnly = args.hasArg(OPT_thinlto_index_only) ||
+  ctx.arg.thinLTOIndexOnly = args.hasArg(OPT_thinlto_index_only) ||
                              args.hasArg(OPT_thinlto_index_only_eq);
-  config->thinLTOIndexOnlyArg = args.getLastArgValue(OPT_thinlto_index_only_eq);
-  config->thinLTOObjectSuffixReplace =
+  ctx.arg.thinLTOIndexOnlyArg = args.getLastArgValue(OPT_thinlto_index_only_eq);
+  ctx.arg.thinLTOObjectSuffixReplace =
       getOldNewOptions(args, OPT_thinlto_object_suffix_replace_eq);
-  std::tie(config->thinLTOPrefixReplaceOld, config->thinLTOPrefixReplaceNew,
-           config->thinLTOPrefixReplaceNativeObject) =
+  std::tie(ctx.arg.thinLTOPrefixReplaceOld, ctx.arg.thinLTOPrefixReplaceNew,
+           ctx.arg.thinLTOPrefixReplaceNativeObject) =
       getOldNewOptionsExtra(args, OPT_thinlto_prefix_replace_eq);
-  if (config->thinLTOEmitIndexFiles && !config->thinLTOIndexOnly) {
+  if (ctx.arg.thinLTOEmitIndexFiles && !ctx.arg.thinLTOIndexOnly) {
     if (args.hasArg(OPT_thinlto_object_suffix_replace_eq))
       error("--thinlto-object-suffix-replace is not supported with "
             "--thinlto-emit-index-files");
@@ -627,45 +625,45 @@ static void readConfigs(opt::InputArgList &args) {
       error("--thinlto-prefix-replace is not supported with "
             "--thinlto-emit-index-files");
   }
-  if (!config->thinLTOPrefixReplaceNativeObject.empty() &&
-      config->thinLTOIndexOnlyArg.empty()) {
+  if (!ctx.arg.thinLTOPrefixReplaceNativeObject.empty() &&
+      ctx.arg.thinLTOIndexOnlyArg.empty()) {
     error("--thinlto-prefix-replace=old_dir;new_dir;obj_dir must be used with "
           "--thinlto-index-only=");
   }
-  config->unresolvedSymbols = getUnresolvedSymbolPolicy(args);
-  config->whyExtract = args.getLastArgValue(OPT_why_extract);
+  ctx.arg.unresolvedSymbols = getUnresolvedSymbolPolicy(args);
+  ctx.arg.whyExtract = args.getLastArgValue(OPT_why_extract);
   errorHandler().verbose = args.hasArg(OPT_verbose);
   LLVM_DEBUG(errorHandler().verbose = true);
 
-  config->tableBase = args::getInteger(args, OPT_table_base, 0);
-  config->globalBase = args::getInteger(args, OPT_global_base, 0);
-  config->initialHeap = args::getInteger(args, OPT_initial_heap, 0);
-  config->initialMemory = args::getInteger(args, OPT_initial_memory, 0);
-  config->maxMemory = args::getInteger(args, OPT_max_memory, 0);
-  config->noGrowableMemory = args.hasArg(OPT_no_growable_memory);
-  config->zStackSize =
+  ctx.arg.tableBase = args::getInteger(args, OPT_table_base, 0);
+  ctx.arg.globalBase = args::getInteger(args, OPT_global_base, 0);
+  ctx.arg.initialHeap = args::getInteger(args, OPT_initial_heap, 0);
+  ctx.arg.initialMemory = args::getInteger(args, OPT_initial_memory, 0);
+  ctx.arg.maxMemory = args::getInteger(args, OPT_max_memory, 0);
+  ctx.arg.noGrowableMemory = args.hasArg(OPT_no_growable_memory);
+  ctx.arg.zStackSize =
       args::getZOptionValue(args, OPT_z, "stack-size", WasmPageSize);
 
   // -Bdynamic by default if -pie or -shared is specified.
-  if (config->pie || config->shared)
-    config->isStatic = false;
+  if (ctx.arg.pie || ctx.arg.shared)
+    ctx.arg.isStatic = false;
 
-  if (config->maxMemory != 0 && config->noGrowableMemory) {
+  if (ctx.arg.maxMemory != 0 && ctx.arg.noGrowableMemory) {
     // Erroring out here is simpler than defining precedence rules.
     error("--max-memory is incompatible with --no-growable-memory");
   }
 
   // Default value of exportDynamic depends on `-shared`
-  config->exportDynamic =
-      args.hasFlag(OPT_export_dynamic, OPT_no_export_dynamic, config->shared);
+  ctx.arg.exportDynamic =
+      args.hasFlag(OPT_export_dynamic, OPT_no_export_dynamic, ctx.arg.shared);
 
   // Parse wasm32/64.
   if (auto *arg = args.getLastArg(OPT_m)) {
     StringRef s = arg->getValue();
     if (s == "wasm32")
-      config->is64 = false;
+      ctx.arg.is64 = false;
     else if (s == "wasm64")
-      config->is64 = true;
+      ctx.arg.is64 = true;
     else
       error("invalid target architecture: " + s);
   }
@@ -679,36 +677,36 @@ static void readConfigs(opt::InputArgList &args) {
       error(arg->getSpelling() + ": expected a positive integer, but got '" +
             arg->getValue() + "'");
     parallel::strategy = hardware_concurrency(threads);
-    config->thinLTOJobs = v;
+    ctx.arg.thinLTOJobs = v;
   }
   if (auto *arg = args.getLastArg(OPT_thinlto_jobs))
-    config->thinLTOJobs = arg->getValue();
+    ctx.arg.thinLTOJobs = arg->getValue();
 
   if (auto *arg = args.getLastArg(OPT_features)) {
-    config->features =
+    ctx.arg.features =
         std::optional<std::vector<std::string>>(std::vector<std::string>());
     for (StringRef s : arg->getValues())
-      config->features->push_back(std::string(s));
+      ctx.arg.features->push_back(std::string(s));
   }
 
   if (auto *arg = args.getLastArg(OPT_extra_features)) {
-    config->extraFeatures =
+    ctx.arg.extraFeatures =
         std::optional<std::vector<std::string>>(std::vector<std::string>());
     for (StringRef s : arg->getValues())
-      config->extraFeatures->push_back(std::string(s));
+      ctx.arg.extraFeatures->push_back(std::string(s));
   }
 
   // Legacy --allow-undefined flag which is equivalent to
   // --unresolve-symbols=ignore + --import-undefined
   if (args.hasArg(OPT_allow_undefined)) {
-    config->importUndefined = true;
-    config->unresolvedSymbols = UnresolvedPolicy::Ignore;
+    ctx.arg.importUndefined = true;
+    ctx.arg.unresolvedSymbols = UnresolvedPolicy::Ignore;
   }
 
   if (args.hasArg(OPT_print_map))
-    config->mapFile = "-";
+    ctx.arg.mapFile = "-";
 
-  std::tie(config->buildId, config->buildIdVector) = getBuildId(args);
+  std::tie(ctx.arg.buildId, ctx.arg.buildIdVector) = getBuildId(args);
 }
 
 // Some Config members do not directly correspond to any particular
@@ -716,86 +714,86 @@ static void readConfigs(opt::InputArgList &args) {
 // This function initialize such members. See Config.h for the details
 // of these values.
 static void setConfigs() {
-  ctx.isPic = config->pie || config->shared;
+  ctx.isPic = ctx.arg.pie || ctx.arg.shared;
 
   if (ctx.isPic) {
-    if (config->exportTable)
+    if (ctx.arg.exportTable)
       error("-shared/-pie is incompatible with --export-table");
-    config->importTable = true;
+    ctx.arg.importTable = true;
   } else {
     // Default table base.  Defaults to 1, reserving 0 for the NULL function
     // pointer.
-    if (!config->tableBase)
-      config->tableBase = 1;
+    if (!ctx.arg.tableBase)
+      ctx.arg.tableBase = 1;
     // The default offset for static/global data, for when --global-base is
     // not specified on the command line.  The precise value of 1024 is
     // somewhat arbitrary, and pre-dates wasm-ld (Its the value that
     // emscripten used prior to wasm-ld).
-    if (!config->globalBase && !config->relocatable && !config->stackFirst)
-      config->globalBase = 1024;
+    if (!ctx.arg.globalBase && !ctx.arg.relocatable && !ctx.arg.stackFirst)
+      ctx.arg.globalBase = 1024;
   }
 
-  if (config->relocatable) {
-    if (config->exportTable)
+  if (ctx.arg.relocatable) {
+    if (ctx.arg.exportTable)
       error("--relocatable is incompatible with --export-table");
-    if (config->growableTable)
+    if (ctx.arg.growableTable)
       error("--relocatable is incompatible with --growable-table");
     // Ignore any --import-table, as it's redundant.
-    config->importTable = true;
+    ctx.arg.importTable = true;
   }
 
-  if (config->shared) {
-    if (config->memoryExport.has_value()) {
+  if (ctx.arg.shared) {
+    if (ctx.arg.memoryExport.has_value()) {
       error("--export-memory is incompatible with --shared");
     }
-    if (!config->memoryImport.has_value()) {
-      config->memoryImport =
-          std::pair<llvm::StringRef, llvm::StringRef>(defaultModule, memoryName);
+    if (!ctx.arg.memoryImport.has_value()) {
+      ctx.arg.memoryImport = std::pair<llvm::StringRef, llvm::StringRef>(
+          defaultModule, memoryName);
     }
   }
 
   // If neither export-memory nor import-memory is specified, default to
   // exporting memory under its default name.
-  if (!config->memoryExport.has_value() && !config->memoryImport.has_value()) {
-    config->memoryExport = memoryName;
+  if (!ctx.arg.memoryExport.has_value() && !ctx.arg.memoryImport.has_value()) {
+    ctx.arg.memoryExport = memoryName;
   }
 }
 
 // Some command line options or some combinations of them are not allowed.
 // This function checks for such errors.
 static void checkOptions(opt::InputArgList &args) {
-  if (!config->stripDebug && !config->stripAll && config->compressRelocations)
+  if (!ctx.arg.stripDebug && !ctx.arg.stripAll && ctx.arg.compressRelocations)
     error("--compress-relocations is incompatible with output debug"
           " information. Please pass --strip-debug or --strip-all");
 
-  if (config->ltoPartitions == 0)
+  if (ctx.arg.ltoPartitions == 0)
     error("--lto-partitions: number of threads must be > 0");
-  if (!get_threadpool_strategy(config->thinLTOJobs))
-    error("--thinlto-jobs: invalid job count: " + config->thinLTOJobs);
+  if (!get_threadpool_strategy(ctx.arg.thinLTOJobs))
+    error("--thinlto-jobs: invalid job count: " + ctx.arg.thinLTOJobs);
 
-  if (config->pie && config->shared)
+  if (ctx.arg.pie && ctx.arg.shared)
     error("-shared and -pie may not be used together");
 
-  if (config->outputFile.empty() && !config->thinLTOIndexOnly)
+  if (ctx.arg.outputFile.empty() && !ctx.arg.thinLTOIndexOnly)
     error("no output file specified");
 
-  if (config->importTable && config->exportTable)
+  if (ctx.arg.importTable && ctx.arg.exportTable)
     error("--import-table and --export-table may not be used together");
 
-  if (config->relocatable) {
-    if (!config->entry.empty())
+  if (ctx.arg.relocatable) {
+    if (!ctx.arg.entry.empty())
       error("entry point specified for relocatable output file");
-    if (config->gcSections)
+    if (ctx.arg.gcSections)
       error("-r and --gc-sections may not be used together");
-    if (config->compressRelocations)
+    if (ctx.arg.compressRelocations)
       error("-r -and --compress-relocations may not be used together");
     if (args.hasArg(OPT_undefined))
       error("-r -and --undefined may not be used together");
-    if (config->pie)
+    if (ctx.arg.pie)
       error("-r and -pie may not be used together");
-    if (config->sharedMemory)
+    if (ctx.arg.sharedMemory)
       error("-r and --shared-memory may not be used together");
-    if (config->globalBase)
+    if (ctx.arg.globalBase)
       error("-r and --global-base may not by used together");
   }
 
@@ -804,31 +802,31 @@ static void checkOptions(opt::InputArgList &args) {
   // mode, to give anyone using them a heads-up that they will be changing.
   //
   // Also, warn about flags which request explicit exports.
-  if (!config->experimentalPic) {
+  if (!ctx.arg.experimentalPic) {
     // -shared will change meaning when Module Linking is implemented.
-    if (config->shared) {
+    if (ctx.arg.shared) {
       warn("creating shared libraries, with -shared, is not yet stable");
     }
 
     // -pie will change meaning when Module Linking is implemented.
-    if (config->pie) {
+    if (ctx.arg.pie) {
       warn("creating PIEs, with -pie, is not yet stable");
     }
 
-    if (config->unresolvedSymbols == UnresolvedPolicy::ImportDynamic) {
+    if (ctx.arg.unresolvedSymbols == UnresolvedPolicy::ImportDynamic) {
       warn("dynamic imports are not yet stable "
            "(--unresolved-symbols=import-dynamic)");
     }
   }
 
-  if (config->bsymbolic && !config->shared) {
+  if (ctx.arg.bsymbolic && !ctx.arg.shared) {
     warn("-Bsymbolic is only meaningful when combined with -shared");
   }
 
   if (ctx.isPic) {
-    if (config->globalBase)
+    if (ctx.arg.globalBase)
       error("--global-base may not be used with -shared/-pie");
-    if (config->tableBase)
+    if (ctx.arg.tableBase)
       error("--table-base may not be used with -shared/-pie");
   }
 }
@@ -851,7 +849,7 @@ static Symbol *handleUndefined(StringRef name, const char *option) {
 
   if (auto *lazySym = dyn_cast<LazySymbol>(sym)) {
     lazySym->extract();
-    if (!config->whyExtract.empty())
+    if (!ctx.arg.whyExtract.empty())
       ctx.whyExtractRecords.emplace_back(option, sym->getFile(), *sym);
   }
 
@@ -861,20 +859,20 @@ static Symbol *handleUndefined(StringRef name, const char *option) {
 static void handleLibcall(StringRef name) {
   Symbol *sym = symtab->find(name);
   if (sym && sym->isLazy() && isa<BitcodeFile>(sym->getFile())) {
-    if (!config->whyExtract.empty())
+    if (!ctx.arg.whyExtract.empty())
       ctx.whyExtractRecords.emplace_back("<libcall>", sym->getFile(), *sym);
     cast<LazySymbol>(sym)->extract();
   }
 }
 
 static void writeWhyExtract() {
-  if (config->whyExtract.empty())
+  if (ctx.arg.whyExtract.empty())
     return;
 
   std::error_code ec;
-  raw_fd_ostream os(config->whyExtract, ec, sys::fs::OF_None);
+  raw_fd_ostream os(ctx.arg.whyExtract, ec, sys::fs::OF_None);
   if (ec) {
-    error("cannot open --why-extract= file " + config->whyExtract + ": " +
+    error("cannot open --why-extract= file " + ctx.arg.whyExtract + ": " +
           ec.message());
     return;
   }
@@ -905,14 +903,14 @@ static UndefinedGlobal *
 createUndefinedGlobal(StringRef name, llvm::wasm::WasmGlobalType *type) {
   auto *sym = cast<UndefinedGlobal>(symtab->addUndefinedGlobal(
       name, std::nullopt, std::nullopt, WASM_SYMBOL_UNDEFINED, nullptr, type));
-  config->allowUndefinedSymbols.insert(sym->getName());
+  ctx.arg.allowUndefinedSymbols.insert(sym->getName());
   sym->isUsedInRegularObj = true;
   return sym;
 }
 
 static InputGlobal *createGlobal(StringRef name, bool isMutable) {
   llvm::wasm::WasmGlobal wasmGlobal;
-  bool is64 = config->is64.value_or(false);
+  bool is64 = ctx.arg.is64.value_or(false);
   wasmGlobal.Type = {uint8_t(is64 ? WASM_TYPE_I64 : WASM_TYPE_I32), isMutable};
   wasmGlobal.InitExpr = intConst(0, is64);
   wasmGlobal.SymbolName = name;
@@ -931,7 +929,7 @@ static GlobalSymbol *createOptionalGlobal(StringRef name, bool isMutable) {
 
 // Create ABI-defined synthetic symbols
 static void createSyntheticSymbols() {
-  if (config->relocatable)
+  if (ctx.arg.relocatable)
     return;
 
   static WasmSignature nullSignature = {{}, {}};
@@ -947,11 +945,11 @@ static void createSyntheticSymbols() {
       "__wasm_call_ctors", WASM_SYMBOL_VISIBILITY_HIDDEN,
       make<SyntheticFunction>(nullSignature, "__wasm_call_ctors"));
 
-  bool is64 = config->is64.value_or(false);
+  bool is64 = ctx.arg.is64.value_or(false);
 
   if (ctx.isPic) {
     WasmSym::stackPointer =
-        createUndefinedGlobal("__stack_pointer", config->is64.value_or(false)
+        createUndefinedGlobal("__stack_pointer", ctx.arg.is64.value_or(false)
                                                      ? &mutableGlobalTypeI64
                                                      : &mutableGlobalTypeI32);
     // For PIC code, we import two global variables (__memory_base and
@@ -970,7 +968,7 @@ static void createSyntheticSymbols() {
     WasmSym::stackPointer->markLive();
   }
 
-  if (config->sharedMemory) {
+  if (ctx.arg.sharedMemory) {
     WasmSym::tlsBase = createGlobalVariable("__tls_base", true);
     WasmSym::tlsSize = createGlobalVariable("__tls_size", false);
     WasmSym::tlsAlign = createGlobalVariable("__tls_align", false);
@@ -983,12 +981,12 @@ static void createSyntheticSymbols() {
 }
 
 static void createOptionalSymbols() {
-  if (config->relocatable)
+  if (ctx.arg.relocatable)
     return;
 
   WasmSym::dsoHandle = symtab->addOptionalDataSymbol("__dso_handle");
 
-  if (!config->shared)
+  if (!ctx.arg.shared)
     WasmSym::dataEnd = symtab->addOptionalDataSymbol("__data_end");
 
   if (!ctx.isPic) {
@@ -1010,7 +1008,7 @@ static void createOptionalSymbols() {
   //
   // __tls_size and __tls_align are not needed in this case since they are only
   // needed for __wasm_init_tls (which we do not create in this case).
-  if (!config->sharedMemory)
+  if (!ctx.arg.sharedMemory)
     WasmSym::tlsBase = createOptionalGlobal("__tls_base", false);
 }
 
@@ -1035,7 +1033,7 @@ static void processStubLibrariesPreLTO() {
             // extracted during processStubLibraries, which is too late since
             // LTO has already being performed at that point.
             if (needed->isLazy() && isa<BitcodeFile>(needed->getFile())) {
-              if (!config->whyExtract.empty())
+              if (!ctx.arg.whyExtract.empty())
                 ctx.whyExtractRecords.emplace_back(toString(stub_file),
                                                    needed->getFile(), *needed);
               cast<LazySymbol>(needed)->extract();
@@ -1079,7 +1077,7 @@ static bool addStubSymbolDeps(const StubFile *stub_file, Symbol *sym,
       if (auto *lazy = dyn_cast<LazySymbol>(needed)) {
         depsAdded = true;
         lazy->extract();
-        if (!config->whyExtract.empty())
+        if (!ctx.arg.whyExtract.empty())
           ctx.whyExtractRecords.emplace_back(toString(stub_file),
                                              sym->getFile(), *sym);
       }
diff --git a/lld/wasm/InputChunks.cpp b/lld/wasm/InputChunks.cpp
index 9383dca..ccdc92f 100644
--- a/lld/wasm/InputChunks.cpp
+++ b/lld/wasm/InputChunks.cpp
@@ -67,7 +67,7 @@ uint32_t InputChunk::getSize() const {
     return ms->builder.getSize();
 
   if (const auto *f = dyn_cast<InputFunction>(this)) {
-    if (config->compressRelocations && f->file) {
+    if (ctx.arg.compressRelocations && f->file) {
       return f->getCompressedSize();
     }
   }
@@ -84,7 +84,7 @@ uint32_t InputChunk::getInputSize() const {
 // Copy this input chunk to an mmap'ed output file and apply relocations.
 void InputChunk::writeTo(uint8_t *buf) const {
   if (const auto *f = dyn_cast<InputFunction>(this)) {
-    if (file && config->compressRelocations)
+    if (file && ctx.arg.compressRelocations)
       return f->writeCompressed(buf);
   } else if (const auto *ms = dyn_cast<SyntheticMergedChunk>(this)) {
     ms->builder.write(buf + outSecOff);
@@ -269,7 +269,7 @@ static unsigned getRelocWidth(const WasmRelocation &rel, uint64_t value) {
 // This function only computes the final output size.  It must be called
 // before getSize() is used to calculate of layout of the code section.
 void InputFunction::calculateSize() {
-  if (!file || !config->compressRelocations)
+  if (!file || !ctx.arg.compressRelocations)
     return;
 
   LLVM_DEBUG(dbgs() << "calculateSize: " << name << "\n");
@@ -365,7 +365,7 @@ bool InputChunk::generateRelocationCode(raw_ostream &os) const {
   LLVM_DEBUG(dbgs() << "generating runtime relocations: " << name
                     << " count=" << relocations.size() << "\n");
 
-  bool is64 = config->is64.value_or(false);
+  bool is64 = ctx.arg.is64.value_or(false);
   bool generated = false;
   unsigned opcode_ptr_const = is64 ? WASM_OPCODE_I64_CONST
                                    : WASM_OPCODE_I32_CONST;
diff --git a/lld/wasm/InputChunks.h b/lld/wasm/InputChunks.h
index d6769bc..f545449 100644
--- a/lld/wasm/InputChunks.h
+++ b/lld/wasm/InputChunks.h
@@ -112,7 +112,7 @@ protected:
   InputChunk(ObjFile *f, Kind k, StringRef name, uint32_t alignment = 0,
              uint32_t flags = 0)
       : name(name), file(f), alignment(alignment), flags(flags), sectionKind(k),
-        live(!config->gcSections), discarded(false) {}
+        live(!ctx.arg.gcSections), discarded(false) {}
   ArrayRef<uint8_t> data() const { return rawData; }
   uint64_t getTombstone() const;
 
@@ -156,7 +156,7 @@ class SyntheticMergedChunk;
 // be found by looking at the next one).
 struct SectionPiece {
   SectionPiece(size_t off, uint32_t hash, bool live)
-      : inputOff(off), live(live || !config->gcSections), hash(hash >> 1) {}
+      : inputOff(off), live(live || !ctx.arg.gcSections), hash(hash >> 1) {}
 
   uint32_t inputOff;
   uint32_t live : 1;
diff --git a/lld/wasm/InputElement.h b/lld/wasm/InputElement.h
index 10dc2a3..c2a24c8 100644
--- a/lld/wasm/InputElement.h
+++ b/lld/wasm/InputElement.h
@@ -24,7 +24,7 @@ namespace wasm {
 class InputElement {
 protected:
   InputElement(StringRef name, ObjFile *f)
-      : file(f), live(!config->gcSections), name(name) {}
+      : file(f), live(!ctx.arg.gcSections), name(name) {}
 
 public:
   StringRef getName() const { return name; }
@@ -65,7 +65,7 @@ public:
   const WasmInitExpr &getInitExpr() const { return initExpr; }
 
   void setPointerValue(uint64_t value) {
-    initExpr = intConst(value, config->is64.value_or(false));
+    initExpr = intConst(value, ctx.arg.is64.value_or(false));
   }
 
 private:
diff --git a/lld/wasm/InputFiles.cpp b/lld/wasm/InputFiles.cpp
index 221f02a..614cddd 100644
--- a/lld/wasm/InputFiles.cpp
+++ b/lld/wasm/InputFiles.cpp
@@ -47,7 +47,7 @@ std::string toString(const wasm::InputFile *file) {
 namespace wasm {
 
 std::string replaceThinLTOSuffix(StringRef path) {
-  auto [suffix, repl] = config->thinLTOObjectSuffixReplace;
+  auto [suffix, repl] = ctx.arg.thinLTOObjectSuffixReplace;
   if (path.consume_back(suffix))
     return (path + repl).str();
   return std::string(path);
@@ -55,10 +55,10 @@ std::string replaceThinLTOSuffix(StringRef path) {
 
 void InputFile::checkArch(Triple::ArchType arch) const {
   bool is64 = arch == Triple::wasm64;
-  if (is64 && !config->is64) {
+  if (is64 && !ctx.arg.is64) {
     fatal(toString(this) +
           ": must specify -mwasm64 to process wasm64 object files");
-  } else if (config->is64.value_or(false) != is64) {
+  } else if (ctx.arg.is64.value_or(false) != is64) {
     fatal(toString(this) +
           ": wasm32 object file can't be linked in wasm64 mode");
   }
@@ -169,7 +169,7 @@ uint64_t ObjFile::calcNewValue(const WasmRelocation &reloc, uint64_t tombstone,
     uint32_t index = getFunctionSymbol(reloc.Index)->getTableIndex();
     if (reloc.Type == R_WASM_TABLE_INDEX_REL_SLEB ||
         reloc.Type == R_WASM_TABLE_INDEX_REL_SLEB64)
-      index -= config->tableBase;
+      index -= ctx.arg.tableBase;
     return index;
   }
   case R_WASM_MEMORY_ADDR_LEB:
@@ -360,7 +360,7 @@ void ObjFile::addLegacyIndirectFunctionTableIfNeeded(
 }
 
 static bool shouldMerge(const WasmSection &sec) {
-  if (config->optimize == 0)
+  if (ctx.arg.optimize == 0)
     return false;
   // Sadly we don't have section attributes yet for custom sections, so we
   // currently go by the name alone.
@@ -383,7 +383,7 @@ static bool shouldMerge(const WasmSegment &seg) {
   // On a regular link we don't merge sections if -O0 (default is -O1). This
   // sometimes makes the linker significantly faster, although the output will
   // be bigger.
-  if (config->optimize == 0)
+  if (ctx.arg.optimize == 0)
     return false;
 
   // A mergeable section with size 0 is useless because they don't have
@@ -845,7 +845,7 @@ BitcodeFile::BitcodeFile(MemoryBufferRef m, StringRef archiveName,
   this->archiveName = std::string(archiveName);
 
   std::string path = mb.getBufferIdentifier().str();
-  if (config->thinLTOIndexOnly)
+  if (ctx.arg.thinLTOIndexOnly)
     path = replaceThinLTOSuffix(mb.getBufferIdentifier());
 
   // ThinLTO assumes that all MemoryBufferRefs given to it have a unique
diff --git a/lld/wasm/InputFiles.h b/lld/wasm/InputFiles.h
index 1b1de98..fd7fcb1 100644
--- a/lld/wasm/InputFiles.h
+++ b/lld/wasm/InputFiles.h
@@ -73,7 +73,7 @@ public:
 
 protected:
   InputFile(Kind k, MemoryBufferRef m)
-      : mb(m), fileKind(k), live(!config->gcSections) {}
+      : mb(m), fileKind(k), live(!ctx.arg.gcSections) {}
 
   void checkArch(llvm::Triple::ArchType arch) const;
 
diff --git a/lld/wasm/LTO.cpp b/lld/wasm/LTO.cpp
index d9fff74..b9bd48a 100644
--- a/lld/wasm/LTO.cpp
+++ b/lld/wasm/LTO.cpp
@@ -44,8 +44,8 @@ using namespace lld::wasm;
 using namespace lld;
 
 static std::string getThinLTOOutputFile(StringRef modulePath) {
-  return lto::getThinLTOOutputFile(modulePath, config->thinLTOPrefixReplaceOld,
-                                   config->thinLTOPrefixReplaceNew);
+  return lto::getThinLTOOutputFile(modulePath, ctx.arg.thinLTOPrefixReplaceOld,
+                                   ctx.arg.thinLTOPrefixReplaceNew);
 }
 
 static lto::Config createConfig() {
@@ -56,23 +56,23 @@ static lto::Config createConfig() {
   c.Options.FunctionSections = true;
   c.Options.DataSections = true;
 
-  c.DisableVerify = config->disableVerify;
+  c.DisableVerify = ctx.arg.disableVerify;
   c.DiagHandler = diagnosticHandler;
-  c.OptLevel = config->ltoo;
+  c.OptLevel = ctx.arg.ltoo;
   c.MAttrs = getMAttrs();
-  c.CGOptLevel = config->ltoCgo;
-  c.DebugPassManager = config->ltoDebugPassManager;
-  c.AlwaysEmitRegularLTOObj = !config->ltoObjPath.empty();
+  c.CGOptLevel = ctx.arg.ltoCgo;
+  c.DebugPassManager = ctx.arg.ltoDebugPassManager;
+  c.AlwaysEmitRegularLTOObj = !ctx.arg.ltoObjPath.empty();
 
-  if (config->relocatable)
+  if (ctx.arg.relocatable)
     c.RelocModel = std::nullopt;
   else if (ctx.isPic)
     c.RelocModel = Reloc::PIC_;
   else
     c.RelocModel = Reloc::Static;
 
-  if (config->saveTemps)
-    checkError(c.addSaveTemps(config->outputFile.str() + ".",
+  if (ctx.arg.saveTemps)
+    checkError(c.addSaveTemps(ctx.arg.outputFile.str() + ".",
                               /*UseInputModulePath*/ true));
   return c;
 }
@@ -81,27 +81,27 @@ namespace lld::wasm {
 
 BitcodeCompiler::BitcodeCompiler() {
   // Initialize indexFile.
-  if (!config->thinLTOIndexOnlyArg.empty())
-    indexFile = openFile(config->thinLTOIndexOnlyArg);
+  if (!ctx.arg.thinLTOIndexOnlyArg.empty())
+    indexFile = openFile(ctx.arg.thinLTOIndexOnlyArg);
 
   // Initialize ltoObj.
   lto::ThinBackend backend;
   auto onIndexWrite = [&](StringRef s) { thinIndices.erase(s); };
-  if (config->thinLTOIndexOnly) {
+  if (ctx.arg.thinLTOIndexOnly) {
     backend = lto::createWriteIndexesThinBackend(
-        llvm::hardware_concurrency(config->thinLTOJobs),
-        std::string(config->thinLTOPrefixReplaceOld),
-        std::string(config->thinLTOPrefixReplaceNew),
-        std::string(config->thinLTOPrefixReplaceNativeObject),
-        config->thinLTOEmitImportsFiles, indexFile.get(), onIndexWrite);
+        llvm::hardware_concurrency(ctx.arg.thinLTOJobs),
+        std::string(ctx.arg.thinLTOPrefixReplaceOld),
+        std::string(ctx.arg.thinLTOPrefixReplaceNew),
+        std::string(ctx.arg.thinLTOPrefixReplaceNativeObject),
+        ctx.arg.thinLTOEmitImportsFiles, indexFile.get(), onIndexWrite);
   } else {
     backend = lto::createInProcessThinBackend(
-        llvm::heavyweight_hardware_concurrency(config->thinLTOJobs),
-        onIndexWrite, config->thinLTOEmitIndexFiles,
-        config->thinLTOEmitImportsFiles);
+        llvm::heavyweight_hardware_concurrency(ctx.arg.thinLTOJobs),
+        onIndexWrite, ctx.arg.thinLTOEmitIndexFiles,
+        ctx.arg.thinLTOEmitImportsFiles);
   }
   ltoObj = std::make_unique<lto::LTO>(createConfig(), backend,
-                                      config->ltoPartitions);
+                                      ctx.arg.ltoPartitions);
 }
 
 BitcodeCompiler::~BitcodeCompiler() = default;
@@ -123,7 +123,7 @@ void BitcodeCompiler::add(BitcodeFile &f) {
   ArrayRef<Symbol *> syms = f.getSymbols();
   std::vector<lto::SymbolResolution> resols(syms.size());
 
-  if (config->thinLTOEmitIndexFiles) {
+  if (ctx.arg.thinLTOEmitIndexFiles) {
     thinIndices.insert(obj.getName());
   }
 
@@ -139,7 +139,7 @@ void BitcodeCompiler::add(BitcodeFile &f) {
     // Once IRObjectFile is fixed to report only one symbol this hack can
     // be removed.
     r.Prevailing = !objSym.isUndefined() && sym->getFile() == &f;
-    r.VisibleToRegularObj = config->relocatable || sym->isUsedInRegularObj ||
+    r.VisibleToRegularObj = ctx.arg.relocatable || sym->isUsedInRegularObj ||
                             sym->isNoStrip() ||
                             (r.Prevailing && sym->isExported());
     if (r.Prevailing)
@@ -175,7 +175,7 @@ static void thinLTOCreateEmptyIndexFiles() {
     ModuleSummaryIndex m(/*HaveGVs*/ false);
     m.setSkipModuleByDistributedBackend();
     writeIndexToFile(m, *os);
-    if (config->thinLTOEmitImportsFiles)
+    if (ctx.arg.thinLTOEmitImportsFiles)
       openFile(path + ".imports");
   }
 }
@@ -191,8 +191,8 @@ std::vector<StringRef> BitcodeCompiler::compile() {
   // to cache native object files for ThinLTO incremental builds. If a path was
   // specified, configure LTO to use it as the cache directory.
   FileCache cache;
-  if (!config->thinLTOCacheDir.empty())
-    cache = check(localCache("ThinLTO", "Thin", config->thinLTOCacheDir,
+  if (!ctx.arg.thinLTOCacheDir.empty())
+    cache = check(localCache("ThinLTO", "Thin", ctx.arg.thinLTOCacheDir,
                              [&](size_t task, const Twine &moduleName,
                                  std::unique_ptr<MemoryBuffer> mb) {
                                files[task] = std::move(mb);
@@ -210,16 +210,16 @@ std::vector<StringRef> BitcodeCompiler::compile() {
   for (StringRef s : thinIndices) {
     std::string path(s);
     openFile(path + ".thinlto.bc");
-    if (config->thinLTOEmitImportsFiles)
+    if (ctx.arg.thinLTOEmitImportsFiles)
       openFile(path + ".imports");
   }
 
-  if (config->thinLTOEmitIndexFiles)
+  if (ctx.arg.thinLTOEmitIndexFiles)
     thinLTOCreateEmptyIndexFiles();
 
-  if (config->thinLTOIndexOnly) {
-    if (!config->ltoObjPath.empty())
-      saveBuffer(buf[0].second, config->ltoObjPath);
+  if (ctx.arg.thinLTOIndexOnly) {
+    if (!ctx.arg.ltoObjPath.empty())
+      saveBuffer(buf[0].second, ctx.arg.ltoObjPath);
 
     // ThinLTO with index only option is required to generate only the index
     // files. After that, we exit from linker and ThinLTO backend runs in a
@@ -229,8 +229,8 @@ std::vector<StringRef> BitcodeCompiler::compile() {
     return {};
   }
 
-  if (!config->thinLTOCacheDir.empty())
-    pruneCache(config->thinLTOCacheDir, config->thinLTOCachePolicy, files);
+  if (!ctx.arg.thinLTOCacheDir.empty())
+    pruneCache(ctx.arg.thinLTOCacheDir, ctx.arg.thinLTOCachePolicy, files);
 
   std::vector<StringRef> ret;
   for (unsigned i = 0; i != maxTasks; ++i) {
@@ -239,7 +239,7 @@ std::vector<StringRef> BitcodeCompiler::compile() {
     if (objBuf.empty())
       continue;
     ret.emplace_back(objBuf.data(), objBuf.size());
-    if (!config->saveTemps)
+    if (!ctx.arg.saveTemps)
       continue;
 
     // If the input bitcode file is path/to/x.o and -o specifies a.out, the
@@ -248,7 +248,7 @@ std::vector<StringRef> BitcodeCompiler::compile() {
     StringRef ltoObjName;
     if (bitcodeFilePath == "ld-temp.o") {
       ltoObjName =
-          saver().save(Twine(config->outputFile) + ".lto" +
+          saver().save(Twine(ctx.arg.outputFile) + ".lto" +
                        (i == 0 ? Twine("") : Twine('.') + Twine(i)) + ".o");
     } else {
       StringRef directory = sys::path::parent_path(bitcodeFilePath);
@@ -258,7 +258,7 @@ std::vector<StringRef> BitcodeCompiler::compile() {
       StringRef baseName = bitcodeFilePath.ends_with(")")
                                ? sys::path::filename(bitcodeFilePath)
                                : sys::path::stem(bitcodeFilePath);
-      StringRef outputFileBaseName = sys::path::filename(config->outputFile);
+      StringRef outputFileBaseName = sys::path::filename(ctx.arg.outputFile);
       SmallString<256> path;
       sys::path::append(path, directory,
                         outputFileBaseName + ".lto." + baseName + ".o");
@@ -268,10 +268,10 @@ std::vector<StringRef> BitcodeCompiler::compile() {
     saveBuffer(objBuf, ltoObjName);
   }
 
-  if (!config->ltoObjPath.empty()) {
-    saveBuffer(buf[0].second, config->ltoObjPath);
+  if (!ctx.arg.ltoObjPath.empty()) {
+    saveBuffer(buf[0].second, ctx.arg.ltoObjPath);
     for (unsigned i = 1; i != maxTasks; ++i)
-      saveBuffer(buf[i].second, config->ltoObjPath + Twine(i));
+      saveBuffer(buf[i].second, ctx.arg.ltoObjPath + Twine(i));
   }
 
   for (std::unique_ptr<MemoryBuffer> &file : files)
diff --git a/lld/wasm/MapFile.cpp b/lld/wasm/MapFile.cpp
index c96b64c..d8487e4 100644
--- a/lld/wasm/MapFile.cpp
+++ b/lld/wasm/MapFile.cpp
@@ -103,14 +103,14 @@ getSymbolStrings(ArrayRef<Symbol *> syms) {
 }
 
 void lld::wasm::writeMapFile(ArrayRef<OutputSection *> outputSections) {
-  if (config->mapFile.empty())
+  if (ctx.arg.mapFile.empty())
     return;
 
   // Open a map file for writing.
   std::error_code ec;
-  raw_fd_ostream os(config->mapFile, ec, sys::fs::OF_None);
+  raw_fd_ostream os(ctx.arg.mapFile, ec, sys::fs::OF_None);
   if (ec) {
-    error("cannot open " + config->mapFile + ": " + ec.message());
+    error("cannot open " + ctx.arg.mapFile + ": " + ec.message());
     return;
   }
 
diff --git a/lld/wasm/MarkLive.cpp b/lld/wasm/MarkLive.cpp
index 1b99f03..13c7a3d 100644
--- a/lld/wasm/MarkLive.cpp
+++ b/lld/wasm/MarkLive.cpp
@@ -106,8 +106,8 @@ void MarkLive::enqueueRetainedSegments(const ObjFile *file) {
 
 void MarkLive::run() {
   // Add GC root symbols.
-  if (!config->entry.empty())
-    enqueue(symtab->find(config->entry));
+  if (!ctx.arg.entry.empty())
+    enqueue(symtab->find(ctx.arg.entry));
 
   // We need to preserve any no-strip or exported symbol
   for (Symbol *sym : symtab->symbols())
@@ -166,7 +166,7 @@ void MarkLive::mark() {
 }
 
 void markLive() {
-  if (!config->gcSections)
+  if (!ctx.arg.gcSections)
     return;
 
   LLVM_DEBUG(dbgs() << "markLive\n");
@@ -175,7 +175,7 @@ void markLive() {
   marker.run();
 
   // Report garbage-collected sections.
-  if (config->printGcSections) {
+  if (ctx.arg.printGcSections) {
     for (const ObjFile *obj : ctx.objectFiles) {
       for (InputChunk *c : obj->functions)
         if (!c->live)
@@ -207,7 +207,7 @@ void markLive() {
 
 bool MarkLive::isCallCtorsLive() {
   // In a reloctable link, we don't call `__wasm_call_ctors`.
-  if (config->relocatable)
+  if (ctx.arg.relocatable)
     return false;
 
   // In Emscripten-style PIC, we call `__wasm_call_ctors` which calls
diff --git a/lld/wasm/OutputSections.cpp b/lld/wasm/OutputSections.cpp
index e4f7582..95f7ecc 100644
--- a/lld/wasm/OutputSections.cpp
+++ b/lld/wasm/OutputSections.cpp
@@ -105,13 +105,13 @@ void DataSection::finalizeContents() {
   });
 #endif
 
-  assert((config->sharedMemory || !ctx.isPic || config->extendedConst ||
+  assert((ctx.arg.sharedMemory || !ctx.isPic || ctx.arg.extendedConst ||
           activeCount <= 1) &&
          "output segments should have been combined by now");
 
   writeUleb128(os, segmentCount, "data segment count");
   bodySize = dataSectionHeader.size();
-  bool is64 = config->is64.value_or(false);
+  bool is64 = ctx.arg.is64.value_or(false);
 
   for (OutputSegment *segment : segments) {
     if (!segment->requiredInBinary())
@@ -121,7 +121,7 @@ void DataSection::finalizeContents() {
     if (segment->initFlags & WASM_DATA_SEGMENT_HAS_MEMINDEX)
       writeUleb128(os, 0, "memory index");
     if ((segment->initFlags & WASM_DATA_SEGMENT_IS_PASSIVE) == 0) {
-      if (ctx.isPic && config->extendedConst) {
+      if (ctx.isPic && ctx.arg.extendedConst) {
         writeU8(os, WASM_OPCODE_GLOBAL_GET, "global get");
         writeUleb128(os, WasmSym::memoryBase->getGlobalIndex(),
                      "literal (global index)");
diff --git a/lld/wasm/Relocations.cpp b/lld/wasm/Relocations.cpp
index 45ad327..52888ad 100644
--- a/lld/wasm/Relocations.cpp
+++ b/lld/wasm/Relocations.cpp
@@ -22,13 +22,13 @@ static bool requiresGOTAccess(const Symbol *sym) {
   if (sym->isShared())
     return true;
   if (!ctx.isPic &&
-      config->unresolvedSymbols != UnresolvedPolicy::ImportDynamic)
+      ctx.arg.unresolvedSymbols != UnresolvedPolicy::ImportDynamic)
     return false;
   if (sym->isHidden() || sym->isLocal())
     return false;
   // With `-Bsymbolic` (or when building an executable) as don't need to use
   // the GOT for symbols that are defined within the current module.
-  if (sym->isDefined() && (!config->shared || config->bsymbolic))
+  if (sym->isDefined() && (!ctx.arg.shared || ctx.arg.bsymbolic))
     return false;
   return true;
 }
@@ -38,15 +38,15 @@ static bool allowUndefined(const Symbol* sym) {
   // link time.
   if (sym->isImported())
     return true;
-  if (isa<UndefinedFunction>(sym) && config->importUndefined)
+  if (isa<UndefinedFunction>(sym) && ctx.arg.importUndefined)
     return true;
 
-  return config->allowUndefinedSymbols.count(sym->getName()) != 0;
+  return ctx.arg.allowUndefinedSymbols.count(sym->getName()) != 0;
 }
 
 static void reportUndefined(ObjFile *file, Symbol *sym) {
   if (!allowUndefined(sym)) {
-    switch (config->unresolvedSymbols) {
+    switch (ctx.arg.unresolvedSymbols) {
     case UnresolvedPolicy::ReportError:
       error(toString(file) + ": undefined symbol: " + toString(*sym));
       break;
@@ -63,8 +63,8 @@ static void reportUndefined(ObjFile *file, Symbol *sym) {
 
     if (auto *f = dyn_cast<UndefinedFunction>(sym)) {
       if (!f->stubFunction &&
-          config->unresolvedSymbols != UnresolvedPolicy::ImportDynamic &&
-          !config->importUndefined) {
+          ctx.arg.unresolvedSymbols != UnresolvedPolicy::ImportDynamic &&
+          !ctx.arg.importUndefined) {
         f->stubFunction = symtab->createUndefinedStub(*f->getSignature());
         f->stubFunction->markLive();
         // Mark the function itself as a stub which prevents it from being
@@ -125,7 +125,7 @@ void scanRelocations(InputChunk *chunk) {
       // In single-threaded builds TLS is lowered away and TLS data can be
       // merged with normal data and allowing TLS relocation in non-TLS
       // segments.
-      if (config->sharedMemory) {
+      if (ctx.arg.sharedMemory) {
         if (!sym->isTLS()) {
           error(toString(file) + ": relocation " +
                 relocTypeToString(reloc.Type) +
@@ -144,9 +144,9 @@ void scanRelocations(InputChunk *chunk) {
       break;
     }
 
-    if (ctx.isPic ||
+    if (ctx.isPic || sym->isShared() ||
         (sym->isUndefined() &&
-         config->unresolvedSymbols == UnresolvedPolicy::ImportDynamic)) {
+         ctx.arg.unresolvedSymbols == UnresolvedPolicy::ImportDynamic)) {
       switch (reloc.Type) {
       case R_WASM_TABLE_INDEX_SLEB:
       case R_WASM_TABLE_INDEX_SLEB64:
@@ -173,7 +173,7 @@ void scanRelocations(InputChunk *chunk) {
       }
     }
 
-    if (!config->relocatable && sym->isUndefined()) {
+    if (!ctx.arg.relocatable && sym->isUndefined()) {
       switch (reloc.Type) {
       case R_WASM_TABLE_INDEX_REL_SLEB:
       case R_WASM_TABLE_INDEX_REL_SLEB64:
diff --git a/lld/wasm/SymbolTable.cpp b/lld/wasm/SymbolTable.cpp
index 4cbf44b..f573590 100644
--- a/lld/wasm/SymbolTable.cpp
+++ b/lld/wasm/SymbolTable.cpp
@@ -53,7 +53,7 @@ void SymbolTable::addFile(InputFile *file, StringRef symName) {
     return;
   }
 
-  if (config->trace)
+  if (ctx.arg.trace)
     message(toString(file));
 
   // LLVM bitcode file
@@ -125,7 +125,7 @@ std::pair<Symbol *, bool> SymbolTable::insertName(StringRef name) {
   sym->canInline = true;
   sym->traced = trace;
   sym->forceExport = false;
-  sym->referenced = !config->gcSections;
+  sym->referenced = !ctx.arg.gcSections;
   symVector.emplace_back(sym);
   return {sym, true};
 }
@@ -235,7 +235,7 @@ DefinedFunction *SymbolTable::addSyntheticFunction(StringRef name,
 DefinedData *SymbolTable::addOptionalDataSymbol(StringRef name,
                                                 uint64_t value) {
   Symbol *s = find(name);
-  if (!s && (config->exportAll || config->exportedSymbols.count(name) != 0))
+  if (!s && (ctx.arg.exportAll || ctx.arg.exportedSymbols.count(name) != 0))
     s = insertName(name).first;
   else if (!s || s->isDefined())
     return nullptr;
@@ -317,7 +317,7 @@ static bool shouldReplace(const Symbol *existing, InputFile *newFile,
   }
 
   // Neither symbol is week. They conflict.
-  if (config->allowMultipleDefinition)
+  if (ctx.arg.allowMultipleDefinition)
     return false;
 
   errorOrWarn("duplicate symbol: " + toString(*existing) + "\n>>> defined in " +
@@ -387,7 +387,7 @@ Symbol *SymbolTable::addSharedFunction(StringRef name, uint32_t flags,
     checkSig = ud->isCalledDirectly;
 
   if (checkSig && !signatureMatches(existingFunction, sig)) {
-    if (config->shlibSigCheck) {
+    if (ctx.arg.shlibSigCheck) {
       reportFunctionSignatureMismatch(name, existingFunction, sig, file);
     } else {
       // With --no-shlib-sigcheck we ignore the signature of the function as
@@ -637,7 +637,7 @@ Symbol *SymbolTable::addUndefinedFunction(StringRef name,
       lazy->signature = sig;
     } else {
       lazy->extract();
-      if (!config->whyExtract.empty())
+      if (!ctx.arg.whyExtract.empty())
         ctx.whyExtractRecords.emplace_back(toString(file), s->getFile(), *s);
     }
   } else {
@@ -652,7 +652,7 @@ Symbol *SymbolTable::addUndefinedFunction(StringRef name,
     if (isCalledDirectly && !signatureMatches(existingFunction, sig)) {
       if (existingFunction->isShared()) {
         // Special handling for when the existing function is a shared symbol
-        if (config->shlibSigCheck) {
+        if (ctx.arg.shlibSigCheck) {
           reportFunctionSignatureMismatch(name, existingFunction, sig, file);
         } else {
           existingFunction->signature = sig;
@@ -788,12 +788,12 @@ TableSymbol *SymbolTable::createUndefinedIndirectFunctionTable(StringRef name) {
   WasmTableType *type = make<WasmTableType>();
   type->ElemType = ValType::FUNCREF;
   type->Limits = limits;
-  uint32_t flags = config->exportTable ? 0 : WASM_SYMBOL_VISIBILITY_HIDDEN;
+  uint32_t flags = ctx.arg.exportTable ? 0 : WASM_SYMBOL_VISIBILITY_HIDDEN;
   flags |= WASM_SYMBOL_UNDEFINED;
   Symbol *sym =
       addUndefinedTable(name, name, defaultModule, flags, nullptr, type);
   sym->markLive();
-  sym->forceExport = config->exportTable;
+  sym->forceExport = ctx.arg.exportTable;
   return cast<TableSymbol>(sym);
 }
 
@@ -803,10 +803,10 @@ TableSymbol *SymbolTable::createDefinedIndirectFunctionTable(StringRef name) {
   WasmTableType type{ValType::FUNCREF, limits};
   WasmTable desc{invalidIndex, type, name};
   InputTable *table = make<InputTable>(desc, nullptr);
-  uint32_t flags = config->exportTable ? 0 : WASM_SYMBOL_VISIBILITY_HIDDEN;
+  uint32_t flags = ctx.arg.exportTable ? 0 : WASM_SYMBOL_VISIBILITY_HIDDEN;
   TableSymbol *sym = addSyntheticTable(name, flags, table);
   sym->markLive();
-  sym->forceExport = config->exportTable;
+  sym->forceExport = ctx.arg.exportTable;
   return sym;
 }
 
@@ -830,7 +830,7 @@ TableSymbol *SymbolTable::resolveIndirectFunctionTable(bool required) {
     }
   }
 
-  if (config->importTable) {
+  if (ctx.arg.importTable) {
     if (existing) {
       existing->importModule = defaultModule;
       existing->importName = functionTableName;
@@ -838,7 +838,7 @@ TableSymbol *SymbolTable::resolveIndirectFunctionTable(bool required) {
     }
     if (required)
       return createUndefinedIndirectFunctionTable(functionTableName);
-  } else if ((existing && existing->isLive()) || config->exportTable ||
+  } else if ((existing && existing->isLive()) || ctx.arg.exportTable ||
              required) {
     // A defined table is required.  Either because the user request an exported
     // table or because the table symbol is already live.  The existing table is
@@ -885,7 +885,7 @@ void SymbolTable::addLazy(StringRef name, InputFile *file) {
   LLVM_DEBUG(dbgs() << "replacing existing undefined\n");
   const InputFile *oldFile = s->getFile();
   LazySymbol(name, 0, file).extract();
-  if (!config->whyExtract.empty())
+  if (!ctx.arg.whyExtract.empty())
     ctx.whyExtractRecords.emplace_back(toString(oldFile), s->getFile(), *s);
 }
 
diff --git a/lld/wasm/Symbols.cpp b/lld/wasm/Symbols.cpp
index e62e7be..a687fd6 100644
--- a/lld/wasm/Symbols.cpp
+++ b/lld/wasm/Symbols.cpp
@@ -35,7 +35,7 @@ std::string maybeDemangleSymbol(StringRef name) {
   // `main` in the case where we need to pass it arguments.
   if (name == "__main_argc_argv")
     return "main";
-  if (wasm::config->demangle)
+  if (wasm::ctx.arg.demangle)
     return demangle(name);
   return name.str();
 }
@@ -235,10 +235,10 @@ bool Symbol::isExported() const {
   // Shared libraries must export all weakly defined symbols
   // in case they contain the version that will be chosen by
   // the dynamic linker.
-  if (config->shared && isLive() && isWeak() && !isHidden())
+  if (ctx.arg.shared && isLive() && isWeak() && !isHidden())
     return true;
 
-  if (config->exportAll || (config->exportDynamic && !isHidden()))
+  if (ctx.arg.exportAll || (ctx.arg.exportDynamic && !isHidden()))
     return true;
 
   return isExportedExplicit();
diff --git a/lld/wasm/Symbols.h b/lld/wasm/Symbols.h
index 80b6587..b409fff 100644
--- a/lld/wasm/Symbols.h
+++ b/lld/wasm/Symbols.h
@@ -139,7 +139,7 @@ public:
 
 protected:
   Symbol(StringRef name, Kind k, uint32_t flags, InputFile *f)
-      : name(name), file(f), symbolKind(k), referenced(!config->gcSections),
+      : name(name), file(f), symbolKind(k), referenced(!ctx.arg.gcSections),
         requiresGOT(false), isUsedInRegularObj(false), forceExport(false),
         forceImport(false), canInline(false), traced(false), isStub(false),
         flags(flags) {}
diff --git a/lld/wasm/SyntheticSections.cpp b/lld/wasm/SyntheticSections.cpp
index 6b32d12..715fba1 100644
--- a/lld/wasm/SyntheticSections.cpp
+++ b/lld/wasm/SyntheticSections.cpp
@@ -55,7 +55,7 @@ public:
 
 bool DylinkSection::isNeeded() const {
   return ctx.isPic ||
-         config->unresolvedSymbols == UnresolvedPolicy::ImportDynamic ||
+         ctx.arg.unresolvedSymbols == UnresolvedPolicy::ImportDynamic ||
          !ctx.sharedFiles.empty();
 }
 
@@ -162,7 +162,7 @@ void TypeSection::writeBody() {
 uint32_t ImportSection::getNumImports() const {
   assert(isSealed);
   uint32_t numImports = importedSymbols.size() + gotSymbols.size();
-  if (config->memoryImport.has_value())
+  if (ctx.arg.memoryImport.has_value())
     ++numImports;
   return numImports;
 }
@@ -232,20 +232,20 @@ void ImportSection::writeBody() {
 
   writeUleb128(os, getNumImports(), "import count");
 
-  bool is64 = config->is64.value_or(false);
+  bool is64 = ctx.arg.is64.value_or(false);
 
-  if (config->memoryImport) {
+  if (ctx.arg.memoryImport) {
     WasmImport import;
-    import.Module = config->memoryImport->first;
-    import.Field = config->memoryImport->second;
+    import.Module = ctx.arg.memoryImport->first;
+    import.Field = ctx.arg.memoryImport->second;
     import.Kind = WASM_EXTERNAL_MEMORY;
     import.Memory.Flags = 0;
     import.Memory.Minimum = out.memorySec->numMemoryPages;
-    if (out.memorySec->maxMemoryPages != 0 || config->sharedMemory) {
+    if (out.memorySec->maxMemoryPages != 0 || ctx.arg.sharedMemory) {
       import.Memory.Flags |= WASM_LIMITS_FLAG_HAS_MAX;
       import.Memory.Maximum = out.memorySec->maxMemoryPages;
     }
-    if (config->sharedMemory)
+    if (ctx.arg.sharedMemory)
       import.Memory.Flags |= WASM_LIMITS_FLAG_IS_SHARED;
     if (is64)
       import.Memory.Flags |= WASM_LIMITS_FLAG_IS_64;
@@ -351,14 +351,14 @@ void TableSection::assignIndexes() {
 void MemorySection::writeBody() {
   raw_ostream &os = bodyOutputStream;
 
-  bool hasMax = maxMemoryPages != 0 || config->sharedMemory;
+  bool hasMax = maxMemoryPages != 0 || ctx.arg.sharedMemory;
   writeUleb128(os, 1, "memory count");
   unsigned flags = 0;
   if (hasMax)
     flags |= WASM_LIMITS_FLAG_HAS_MAX;
-  if (config->sharedMemory)
+  if (ctx.arg.sharedMemory)
     flags |= WASM_LIMITS_FLAG_IS_SHARED;
-  if (config->is64.value_or(false))
+  if (ctx.arg.is64.value_or(false))
     flags |= WASM_LIMITS_FLAG_IS_64;
   writeUleb128(os, flags, "memory limits flags");
   writeUleb128(os, numMemoryPages, "initial pages");
@@ -415,8 +415,8 @@ void GlobalSection::addInternalGOTEntry(Symbol *sym) {
 }
 
 void GlobalSection::generateRelocationCode(raw_ostream &os, bool TLS) const {
-  assert(!config->extendedConst);
-  bool is64 = config->is64.value_or(false);
+  assert(!ctx.arg.extendedConst);
+  bool is64 = ctx.arg.is64.value_or(false);
   unsigned opcode_ptr_const = is64 ? WASM_OPCODE_I64_CONST
                                    : WASM_OPCODE_I32_CONST;
   unsigned opcode_ptr_add = is64 ? WASM_OPCODE_I64_ADD
@@ -466,7 +466,7 @@ void GlobalSection::writeBody() {
     writeGlobalType(os, g->getType());
     writeInitExpr(os, g->getInitExpr());
   }
-  bool is64 = config->is64.value_or(false);
+  bool is64 = ctx.arg.is64.value_or(false);
   uint8_t itype = is64 ? WASM_TYPE_I64 : WASM_TYPE_I32;
   for (const Symbol *sym : internalGotSymbols) {
     bool mutable_ = false;
@@ -474,11 +474,11 @@ void GlobalSection::writeBody() {
       // In the case of dynamic linking, unless we have 'extended-const'
       // available, these global must to be mutable since they get updated to
       // the correct runtime value during `__wasm_apply_global_relocs`.
-      if (!config->extendedConst && ctx.isPic && !sym->isTLS())
+      if (!ctx.arg.extendedConst && ctx.isPic && !sym->isTLS())
         mutable_ = true;
       // With multi-theadeding any TLS globals must be mutable since they get
       // set during `__wasm_apply_global_tls_relocs`
-      if (config->sharedMemory && sym->isTLS())
+      if (ctx.arg.sharedMemory && sym->isTLS())
         mutable_ = true;
     }
     WasmGlobalType type{itype, mutable_};
@@ -487,7 +487,7 @@ void GlobalSection::writeBody() {
     bool useExtendedConst = false;
     uint32_t globalIdx;
     int64_t offset;
-    if (config->extendedConst && ctx.isPic) {
+    if (ctx.arg.extendedConst && ctx.isPic) {
       if (auto *d = dyn_cast<DefinedData>(sym)) {
         if (!sym->isTLS()) {
           globalIdx = WasmSym::memoryBase->getGlobalIndex();
@@ -518,7 +518,7 @@ void GlobalSection::writeBody() {
         // In the sharedMemory case TLS globals are set during
         // `__wasm_apply_global_tls_relocs`, but in the non-shared case
         // we know the absolute value at link time.
-        initExpr = intConst(d->getVA(/*absolute=*/!config->sharedMemory), is64);
+        initExpr = intConst(d->getVA(/*absolute=*/!ctx.arg.sharedMemory), is64);
       else if (auto *f = dyn_cast<FunctionSymbol>(sym))
         initExpr = intConst(f->isStub ? 0 : f->getTableIndex(), is64);
       else {
@@ -566,7 +566,7 @@ void ElemSection::addEntry(FunctionSymbol *sym) {
   // They only exist so that the calls to missing functions can validate.
   if (sym->hasTableIndex() || sym->isStub)
     return;
-  sym->setTableIndex(config->tableBase + indirectFunctions.size());
+  sym->setTableIndex(ctx.arg.tableBase + indirectFunctions.size());
   indirectFunctions.emplace_back(sym);
 }
 
@@ -589,8 +589,8 @@ void ElemSection::writeBody() {
     initExpr.Inst.Opcode = WASM_OPCODE_GLOBAL_GET;
     initExpr.Inst.Value.Global = WasmSym::tableBase->getGlobalIndex();
   } else {
-    bool is64 = config->is64.value_or(false);
-    initExpr = intConst(config->tableBase, is64);
+    bool is64 = ctx.arg.is64.value_or(false);
+    initExpr = intConst(ctx.arg.tableBase, is64);
   }
   writeInitExpr(os, initExpr);
 
@@ -602,7 +602,7 @@ void ElemSection::writeBody() {
   }
 
   writeUleb128(os, indirectFunctions.size(), "elem count");
-  uint32_t tableIndex = config->tableBase;
+  uint32_t tableIndex = ctx.arg.tableBase;
   for (const FunctionSymbol *sym : indirectFunctions) {
     assert(sym->getTableIndex() == tableIndex);
     (void) tableIndex;
@@ -622,7 +622,7 @@ void DataCountSection::writeBody() {
 }
 
 bool DataCountSection::isNeeded() const {
-  return numSegments && config->sharedMemory;
+  return numSegments && ctx.arg.sharedMemory;
 }
 
 void LinkingSection::writeBody() {
@@ -786,9 +786,9 @@ unsigned NameSection::numNamedDataSegments() const {
 void NameSection::writeBody() {
   {
     SubSection sub(WASM_NAMES_MODULE);
-    StringRef moduleName = config->soName;
-    if (config->soName.empty())
-      moduleName = llvm::sys::path::filename(config->outputFile);
+    StringRef moduleName = ctx.arg.soName;
+    if (ctx.arg.soName.empty())
+      moduleName = llvm::sys::path::filename(ctx.arg.outputFile);
     writeStr(sub.os, moduleName, "module name");
     sub.writeTo(bodyOutputStream);
   }
@@ -917,14 +917,14 @@ void RelocSection::writeBody() {
 }
 
 static size_t getHashSize() {
-  switch (config->buildId) {
+  switch (ctx.arg.buildId) {
   case BuildIdKind::Fast:
   case BuildIdKind::Uuid:
     return 16;
   case BuildIdKind::Sha1:
     return 20;
   case BuildIdKind::Hexstring:
-    return config->buildIdVector.size();
+    return ctx.arg.buildIdVector.size();
   case BuildIdKind::None:
     return 0;
   }
diff --git a/lld/wasm/SyntheticSections.h b/lld/wasm/SyntheticSections.h
index 10183e9..068fbed 100644
--- a/lld/wasm/SyntheticSections.h
+++ b/lld/wasm/SyntheticSections.h
@@ -228,7 +228,7 @@ class MemorySection : public SyntheticSection {
 public:
   MemorySection() : SyntheticSection(llvm::wasm::WASM_SEC_MEMORY) {}
 
-  bool isNeeded() const override { return !config->memoryImport.has_value(); }
+  bool isNeeded() const override { return !ctx.arg.memoryImport.has_value(); }
   void writeBody() override;
 
   uint64_t numMemoryPages = 0;
@@ -286,7 +286,7 @@ public:
   // transform a `global.get` to an `i32.const`.
   void addInternalGOTEntry(Symbol *sym);
   bool needsRelocations() {
-    if (config->extendedConst)
+    if (ctx.arg.extendedConst)
       return false;
     return llvm::any_of(internalGotSymbols,
                         [=](Symbol *sym) { return !sym->isTLS(); });
@@ -354,7 +354,7 @@ public:
       : SyntheticSection(llvm::wasm::WASM_SEC_CUSTOM, "linking"),
         initFunctions(initFunctions), dataSegments(dataSegments) {}
   bool isNeeded() const override {
-    return config->relocatable || config->emitRelocs;
+    return ctx.arg.relocatable || ctx.arg.emitRelocs;
   }
   void writeBody() override;
   void addToSymtab(Symbol *sym);
@@ -373,7 +373,7 @@ public:
       : SyntheticSection(llvm::wasm::WASM_SEC_CUSTOM, "name"),
         segments(segments) {}
   bool isNeeded() const override {
-    if (config->stripAll && !config->keepSections.count(name))
+    if (ctx.arg.stripAll && !ctx.arg.keepSections.count(name))
       return false;
     return numNames() > 0;
   }
@@ -396,7 +396,7 @@ public:
   ProducersSection()
       : SyntheticSection(llvm::wasm::WASM_SEC_CUSTOM, "producers") {}
   bool isNeeded() const override {
-    if (config->stripAll && !config->keepSections.count(name))
+    if (ctx.arg.stripAll && !ctx.arg.keepSections.count(name))
       return false;
     return fieldCount() > 0;
   }
@@ -417,7 +417,7 @@ public:
   TargetFeaturesSection()
       : SyntheticSection(llvm::wasm::WASM_SEC_CUSTOM, "target_features") {}
   bool isNeeded() const override {
-    if (config->stripAll && !config->keepSections.count(name))
+    if (ctx.arg.stripAll && !ctx.arg.keepSections.count(name))
       return false;
     return features.size() > 0;
   }
@@ -443,7 +443,7 @@ public:
   BuildIdSection();
   void writeBody() override;
   bool isNeeded() const override {
-    return config->buildId != BuildIdKind::None;
+    return ctx.arg.buildId != BuildIdKind::None;
   }
   void writeBuildId(llvm::ArrayRef<uint8_t> buf);
   void writeTo(uint8_t *buf) override {
diff --git a/lld/wasm/Writer.cpp b/lld/wasm/Writer.cpp
index aeac1a5..76e38f5 100644
--- a/lld/wasm/Writer.cpp
+++ b/lld/wasm/Writer.cpp
@@ -132,7 +132,7 @@ private:
 
 void Writer::calculateCustomSections() {
   log("calculateCustomSections");
-  bool stripDebug = config->stripDebug || config->stripAll;
+  bool stripDebug = ctx.arg.stripDebug || ctx.arg.stripAll;
   for (ObjFile *file : ctx.objectFiles) {
     for (InputChunk *section : file->customSections) {
       // Exclude COMDAT sections that are not selected for inclusion
@@ -172,7 +172,7 @@ void Writer::createCustomSections() {
     LLVM_DEBUG(dbgs() << "createCustomSection: " << name << "\n");
 
     OutputSection *sec = make<CustomSection>(std::string(name), pair.second);
-    if (config->relocatable || config->emitRelocs) {
+    if (ctx.arg.relocatable || ctx.arg.emitRelocs) {
       auto *sym = make<OutputSectionSymbol>(sec);
       out.linkingSec->addToSymtab(sym);
       sec->sectionSym = sym;
@@ -282,8 +282,8 @@ static void makeUUID(unsigned version, llvm::ArrayRef<uint8_t> fileHash,
 void Writer::writeBuildId() {
   if (!out.buildIdSec->isNeeded())
     return;
-  if (config->buildId == BuildIdKind::Hexstring) {
-    out.buildIdSec->writeBuildId(config->buildIdVector);
+  if (ctx.arg.buildId == BuildIdKind::Hexstring) {
+    out.buildIdSec->writeBuildId(ctx.arg.buildIdVector);
     return;
   }
 
@@ -292,7 +292,7 @@ void Writer::writeBuildId() {
   std::vector<uint8_t> buildId(hashSize);
   llvm::ArrayRef<uint8_t> buf{buffer->getBufferStart(), size_t(fileSize)};
 
-  switch (config->buildId) {
+  switch (ctx.arg.buildId) {
   case BuildIdKind::Fast: {
     std::vector<uint8_t> fileHash(8);
     computeHash(fileHash, buf, [](uint8_t *dest, ArrayRef<uint8_t> arr) {
@@ -324,9 +324,9 @@ static void setGlobalPtr(DefinedGlobal *g, uint64_t memoryPtr) {
 // to each of the input data sections as well as the explicit stack region.
 // The default memory layout is as follows, from low to high.
 //
-//  - initialized data (starting at config->globalBase)
+//  - initialized data (starting at ctx.arg.globalBase)
 //  - BSS data (not currently implemented in llvm)
-//  - explicit stack (config->ZStackSize)
+//  - explicit stack (ctx.arg.ZStackSize)
 //  - heap start / unallocated
 //
 // The --stack-first option means that stack is placed before any static data.
@@ -337,33 +337,33 @@ void Writer::layoutMemory() {
   uint64_t memoryPtr = 0;
 
   auto placeStack = [&]() {
-    if (config->relocatable || ctx.isPic)
+    if (ctx.arg.relocatable || ctx.isPic)
       return;
     memoryPtr = alignTo(memoryPtr, stackAlignment);
     if (WasmSym::stackLow)
       WasmSym::stackLow->setVA(memoryPtr);
-    if (config->zStackSize != alignTo(config->zStackSize, stackAlignment))
+    if (ctx.arg.zStackSize != alignTo(ctx.arg.zStackSize, stackAlignment))
       error("stack size must be " + Twine(stackAlignment) + "-byte aligned");
-    log("mem: stack size  = " + Twine(config->zStackSize));
+    log("mem: stack size  = " + Twine(ctx.arg.zStackSize));
     log("mem: stack base  = " + Twine(memoryPtr));
-    memoryPtr += config->zStackSize;
+    memoryPtr += ctx.arg.zStackSize;
     setGlobalPtr(cast<DefinedGlobal>(WasmSym::stackPointer), memoryPtr);
     if (WasmSym::stackHigh)
       WasmSym::stackHigh->setVA(memoryPtr);
     log("mem: stack top   = " + Twine(memoryPtr));
   };
 
-  if (config->stackFirst) {
+  if (ctx.arg.stackFirst) {
     placeStack();
-    if (config->globalBase) {
-      if (config->globalBase < memoryPtr) {
+    if (ctx.arg.globalBase) {
+      if (ctx.arg.globalBase < memoryPtr) {
         error("--global-base cannot be less than stack size when --stack-first is used");
         return;
       }
-      memoryPtr = config->globalBase;
+      memoryPtr = ctx.arg.globalBase;
     }
   } else {
-    memoryPtr = config->globalBase;
+    memoryPtr = ctx.arg.globalBase;
   }
 
   log("mem: global base = " + Twine(memoryPtr));
@@ -385,7 +385,7 @@ void Writer::layoutMemory() {
     log(formatv("mem: {0,-15} offset={1,-8} size={2,-8} align={3}", seg->name,
                 memoryPtr, seg->size, seg->alignment));
 
-    if (!config->relocatable && seg->isTLS()) {
+    if (!ctx.arg.relocatable && seg->isTLS()) {
       if (WasmSym::tlsSize) {
         auto *tlsSize = cast<DefinedGlobal>(WasmSym::tlsSize);
         setGlobalPtr(tlsSize, seg->size);
@@ -394,7 +394,7 @@ void Writer::layoutMemory() {
         auto *tlsAlign = cast<DefinedGlobal>(WasmSym::tlsAlign);
         setGlobalPtr(tlsAlign, int64_t{1} << seg->alignment);
       }
-      if (!config->sharedMemory && WasmSym::tlsBase) {
+      if (!ctx.arg.sharedMemory && WasmSym::tlsBase) {
         auto *tlsBase = cast<DefinedGlobal>(WasmSym::tlsBase);
         setGlobalPtr(tlsBase, memoryPtr);
       }
@@ -404,7 +404,7 @@ void Writer::layoutMemory() {
   }
 
   // Make space for the memory initialization flag
-  if (config->sharedMemory && hasPassiveInitializedSegments()) {
+  if (ctx.arg.sharedMemory && hasPassiveInitializedSegments()) {
     memoryPtr = alignTo(memoryPtr, 4);
     WasmSym::initMemoryFlag = symtab->addSyntheticDataSymbol(
         "__wasm_init_memory_flag", WASM_SYMBOL_VISIBILITY_HIDDEN);
@@ -423,7 +423,7 @@ void Writer::layoutMemory() {
   if (ctx.isPic)
     out.dylinkSec->memSize = staticDataSize;
 
-  if (!config->stackFirst)
+  if (!ctx.arg.stackFirst)
     placeStack();
 
   if (WasmSym::heapBase) {
@@ -438,31 +438,31 @@ void Writer::layoutMemory() {
   }
 
   uint64_t maxMemorySetting = 1ULL << 32;
-  if (config->is64.value_or(false)) {
+  if (ctx.arg.is64.value_or(false)) {
     // TODO: Update once we decide on a reasonable limit here:
     // https://github.com/WebAssembly/memory64/issues/33
     maxMemorySetting = 1ULL << 34;
   }
 
-  if (config->initialHeap != 0) {
-    if (config->initialHeap != alignTo(config->initialHeap, WasmPageSize))
+  if (ctx.arg.initialHeap != 0) {
+    if (ctx.arg.initialHeap != alignTo(ctx.arg.initialHeap, WasmPageSize))
       error("initial heap must be " + Twine(WasmPageSize) + "-byte aligned");
     uint64_t maxInitialHeap = maxMemorySetting - memoryPtr;
-    if (config->initialHeap > maxInitialHeap)
+    if (ctx.arg.initialHeap > maxInitialHeap)
       error("initial heap too large, cannot be greater than " +
             Twine(maxInitialHeap));
-    memoryPtr += config->initialHeap;
+    memoryPtr += ctx.arg.initialHeap;
   }
 
-  if (config->initialMemory != 0) {
-    if (config->initialMemory != alignTo(config->initialMemory, WasmPageSize))
+  if (ctx.arg.initialMemory != 0) {
+    if (ctx.arg.initialMemory != alignTo(ctx.arg.initialMemory, WasmPageSize))
       error("initial memory must be " + Twine(WasmPageSize) + "-byte aligned");
-    if (memoryPtr > config->initialMemory)
+    if (memoryPtr > ctx.arg.initialMemory)
       error("initial memory too small, " + Twine(memoryPtr) + " bytes needed");
-    if (config->initialMemory > maxMemorySetting)
+    if (ctx.arg.initialMemory > maxMemorySetting)
       error("initial memory too large, cannot be greater than " +
             Twine(maxMemorySetting));
-    memoryPtr = config->initialMemory;
+    memoryPtr = ctx.arg.initialMemory;
   }
 
   memoryPtr = alignTo(memoryPtr, WasmPageSize);
@@ -479,23 +479,23 @@ void Writer::layoutMemory() {
   }
 
   uint64_t maxMemory = 0;
-  if (config->maxMemory != 0) {
-    if (config->maxMemory != alignTo(config->maxMemory, WasmPageSize))
+  if (ctx.arg.maxMemory != 0) {
+    if (ctx.arg.maxMemory != alignTo(ctx.arg.maxMemory, WasmPageSize))
       error("maximum memory must be " + Twine(WasmPageSize) + "-byte aligned");
-    if (memoryPtr > config->maxMemory)
+    if (memoryPtr > ctx.arg.maxMemory)
       error("maximum memory too small, " + Twine(memoryPtr) + " bytes needed");
-    if (config->maxMemory > maxMemorySetting)
+    if (ctx.arg.maxMemory > maxMemorySetting)
       error("maximum memory too large, cannot be greater than " +
             Twine(maxMemorySetting));
 
-    maxMemory = config->maxMemory;
-  } else if (config->noGrowableMemory) {
+    maxMemory = ctx.arg.maxMemory;
+  } else if (ctx.arg.noGrowableMemory) {
     maxMemory = memoryPtr;
   }
 
   // If no maxMemory config was supplied but we are building with
   // shared memory, we need to pick a sensible upper limit.
-  if (config->sharedMemory && maxMemory == 0) {
+  if (ctx.arg.sharedMemory && maxMemory == 0) {
     if (ctx.isPic)
       maxMemory = maxMemorySetting;
     else
@@ -552,7 +552,7 @@ void Writer::addSections() {
   createCustomSections();
 
   addSection(out.linkingSec);
-  if (config->emitRelocs || config->relocatable) {
+  if (ctx.arg.emitRelocs || ctx.arg.relocatable) {
     createRelocSections();
   }
 
@@ -583,18 +583,18 @@ void Writer::populateTargetFeatures() {
     allowed.insert("mutable-globals");
   }
 
-  if (config->extraFeatures.has_value()) {
-    auto &extraFeatures = *config->extraFeatures;
+  if (ctx.arg.extraFeatures.has_value()) {
+    auto &extraFeatures = *ctx.arg.extraFeatures;
     allowed.insert(extraFeatures.begin(), extraFeatures.end());
   }
 
   // Only infer used features if user did not specify features
-  bool inferFeatures = !config->features.has_value();
+  bool inferFeatures = !ctx.arg.features.has_value();
 
   if (!inferFeatures) {
-    auto &explicitFeatures = *config->features;
+    auto &explicitFeatures = *ctx.arg.features;
     allowed.insert(explicitFeatures.begin(), explicitFeatures.end());
-    if (!config->checkFeatures)
+    if (!ctx.arg.checkFeatures)
       goto done;
   }
 
@@ -626,10 +626,10 @@ void Writer::populateTargetFeatures() {
     for (const auto &key : used.keys())
       allowed.insert(std::string(key));
 
-  if (!config->checkFeatures)
+  if (!ctx.arg.checkFeatures)
     goto done;
 
-  if (config->sharedMemory) {
+  if (ctx.arg.sharedMemory) {
     if (disallowed.count("shared-mem"))
       error("--shared-memory is disallowed by " + disallowed["shared-mem"] +
             " because it was not compiled with 'atomics' or 'bulk-memory' "
@@ -679,19 +679,19 @@ done:
   // instruction, then we can also avoid including the segments.
   // Finally, if we are emitting relocations, they may refer to locations within
   // the bss segments, so these segments need to exist in the binary.
-  if (config->emitRelocs ||
-      (config->memoryImport.has_value() && !allowed.count("bulk-memory")))
+  if (ctx.arg.emitRelocs ||
+      (ctx.arg.memoryImport.has_value() && !allowed.count("bulk-memory")))
     ctx.emitBssSegments = true;
 
   if (allowed.count("extended-const"))
-    config->extendedConst = true;
+    ctx.arg.extendedConst = true;
 
   for (auto &feature : allowed)
     log("Allowed feature: " + feature);
 }
 
 void Writer::checkImportExportTargetFeatures() {
-  if (config->relocatable || !config->checkFeatures)
+  if (ctx.arg.relocatable || !ctx.arg.checkFeatures)
     return;
 
   if (out.targetFeaturesSec->features.count("mutable-globals") == 0) {
@@ -727,14 +727,14 @@ static bool shouldImport(Symbol *sym) {
   // When a symbol is weakly defined in a shared library we need to allow
   // it to be overridden by another module so need to both import
   // and export the symbol.
-  if (config->shared && sym->isWeak() && !sym->isUndefined() &&
+  if (ctx.arg.shared && sym->isWeak() && !sym->isUndefined() &&
       !sym->isHidden())
     return true;
   if (sym->isShared())
     return true;
   if (!sym->isUndefined())
     return false;
-  if (sym->isWeak() && !config->relocatable && !ctx.isPic)
+  if (sym->isWeak() && !ctx.arg.relocatable && !ctx.isPic)
     return false;
 
   // In PIC mode we only need to import functions when they are called directly.
@@ -745,10 +745,10 @@ static bool shouldImport(Symbol *sym) {
         return false;
   }
 
-  if (ctx.isPic || config->relocatable || config->importUndefined ||
-      config->unresolvedSymbols == UnresolvedPolicy::ImportDynamic)
+  if (ctx.isPic || ctx.arg.relocatable || ctx.arg.importUndefined ||
+      ctx.arg.unresolvedSymbols == UnresolvedPolicy::ImportDynamic)
     return true;
-  if (config->allowUndefinedSymbols.count(sym->getName()) != 0)
+  if (ctx.arg.allowUndefinedSymbols.count(sym->getName()) != 0)
     return true;
 
   return sym->isImported();
@@ -773,12 +773,12 @@ void Writer::calculateImports() {
 }
 
 void Writer::calculateExports() {
-  if (config->relocatable)
+  if (ctx.arg.relocatable)
     return;
 
-  if (!config->relocatable && config->memoryExport.has_value()) {
+  if (!ctx.arg.relocatable && ctx.arg.memoryExport.has_value()) {
     out.exportSec->exports.push_back(
-        WasmExport{*config->memoryExport, WASM_EXTERNAL_MEMORY, 0});
+        WasmExport{*ctx.arg.memoryExport, WASM_EXTERNAL_MEMORY, 0});
   }
 
   unsigned globalIndex =
@@ -827,7 +827,7 @@ void Writer::calculateExports() {
 }
 
 void Writer::populateSymtab() {
-  if (!config->relocatable && !config->emitRelocs)
+  if (!ctx.arg.relocatable && !ctx.arg.emitRelocs)
     return;
 
   for (Symbol *sym : symtab->symbols())
@@ -931,13 +931,13 @@ static void finalizeIndirectFunctionTable() {
     out.importSec->addImport(WasmSym::indirectFunctionTable);
   }
 
-  uint32_t tableSize = config->tableBase + out.elemSec->numEntries();
+  uint32_t tableSize = ctx.arg.tableBase + out.elemSec->numEntries();
   WasmLimits limits = {0, tableSize, 0};
-  if (WasmSym::indirectFunctionTable->isDefined() && !config->growableTable) {
+  if (WasmSym::indirectFunctionTable->isDefined() && !ctx.arg.growableTable) {
     limits.Flags |= WASM_LIMITS_FLAG_HAS_MAX;
     limits.Maximum = limits.Minimum;
   }
-  if (config->is64.value_or(false))
+  if (ctx.arg.is64.value_or(false))
     limits.Flags |= WASM_LIMITS_FLAG_IS_64;
   WasmSym::indirectFunctionTable->setLimits(limits);
 }
@@ -1001,7 +1001,7 @@ static StringRef getOutputDataSegmentName(const InputChunk &seg) {
   // symbols are be relative to single __tls_base.
   if (seg.isTLS())
     return ".tdata";
-  if (!config->mergeDataSegments)
+  if (!ctx.arg.mergeDataSegments)
     return seg.name;
   if (seg.name.starts_with(".text."))
     return ".text";
@@ -1017,9 +1017,9 @@ static StringRef getOutputDataSegmentName(const InputChunk &seg) {
 OutputSegment *Writer::createOutputSegment(StringRef name) {
   LLVM_DEBUG(dbgs() << "new segment: " << name << "\n");
   OutputSegment *s = make<OutputSegment>(name);
-  if (config->sharedMemory)
+  if (ctx.arg.sharedMemory)
     s->initFlags = WASM_DATA_SEGMENT_IS_PASSIVE;
-  if (!config->relocatable && name.starts_with(".bss"))
+  if (!ctx.arg.relocatable && name.starts_with(".bss"))
     s->isBss = true;
   segments.push_back(s);
   return s;
@@ -1035,7 +1035,7 @@ void Writer::createOutputSegments() {
       // When running in relocatable mode we can't merge segments that are part
       // of comdat groups since the ultimate linker needs to be able exclude or
       // include them individually.
-      if (config->relocatable && !segment->getComdatName().empty()) {
+      if (ctx.arg.relocatable && !segment->getComdatName().empty()) {
         s = createOutputSegment(name);
       } else {
         if (segmentMap.count(name) == 0)
@@ -1075,8 +1075,8 @@ void Writer::combineOutputSegments() {
   // combines all data segments into a single .data segment.
   // This restriction does not apply when the extended const extension is
   // available: https://github.com/WebAssembly/extended-const
-  assert(!config->extendedConst);
-  assert(ctx.isPic && !config->sharedMemory);
+  assert(!ctx.arg.extendedConst);
+  assert(ctx.isPic && !ctx.arg.sharedMemory);
   if (segments.size() <= 1)
     return;
   OutputSegment *combined = make<OutputSegment>(".data");
@@ -1117,7 +1117,7 @@ static void createFunction(DefinedFunction *func, StringRef bodyContent) {
 bool Writer::needsPassiveInitialization(const OutputSegment *segment) {
   // If bulk memory features is supported then we can perform bss initialization
   // (via memory.fill) during `__wasm_init_memory`.
-  if (config->memoryImport.has_value() && !segment->requiredInBinary())
+  if (ctx.arg.memoryImport.has_value() && !segment->requiredInBinary())
     return true;
   return segment->initFlags & WASM_DATA_SEGMENT_IS_PASSIVE;
 }
@@ -1129,7 +1129,7 @@ bool Writer::hasPassiveInitializedSegments() {
 }
 
 void Writer::createSyntheticInitFunctions() {
-  if (config->relocatable)
+  if (ctx.arg.relocatable)
     return;
 
   static WasmSignature nullSignature = {{}, {}};
@@ -1146,14 +1146,14 @@ void Writer::createSyntheticInitFunctions() {
         "__wasm_init_memory", WASM_SYMBOL_VISIBILITY_HIDDEN,
         make<SyntheticFunction>(nullSignature, "__wasm_init_memory"));
     WasmSym::initMemory->markLive();
-    if (config->sharedMemory) {
+    if (ctx.arg.sharedMemory) {
       // This global is assigned during  __wasm_init_memory in the shared memory
       // case.
       WasmSym::tlsBase->markLive();
     }
   }
 
-  if (config->sharedMemory) {
+  if (ctx.arg.sharedMemory) {
     if (out.globalSec->needsTLSRelocations()) {
       WasmSym::applyGlobalTLSRelocs = symtab->addSyntheticFunction(
           "__wasm_apply_global_tls_relocs", WASM_SYMBOL_VISIBILITY_HIDDEN,
@@ -1203,11 +1203,11 @@ void Writer::createInitMemoryFunction() {
   assert(WasmSym::initMemory);
   assert(hasPassiveInitializedSegments());
   uint64_t flagAddress;
-  if (config->sharedMemory) {
+  if (ctx.arg.sharedMemory) {
     assert(WasmSym::initMemoryFlag);
     flagAddress = WasmSym::initMemoryFlag->getVA();
   }
-  bool is64 = config->is64.value_or(false);
+  bool is64 = ctx.arg.is64.value_or(false);
   std::string bodyContent;
   {
     raw_string_ostream os(bodyContent);
@@ -1271,7 +1271,7 @@ void Writer::createInitMemoryFunction() {
       }
     };
 
-    if (config->sharedMemory) {
+    if (ctx.arg.sharedMemory) {
       // With PIC code we cache the flag address in local 0
       if (ctx.isPic) {
         writeUleb128(os, 1, "num local decls");
@@ -1334,7 +1334,7 @@ void Writer::createInitMemoryFunction() {
         // When we initialize the TLS segment we also set the `__tls_base`
         // global.  This allows the runtime to use this static copy of the
         // TLS data for the first/main thread.
-        if (config->sharedMemory && s->isTLS()) {
+        if (ctx.arg.sharedMemory && s->isTLS()) {
           if (ctx.isPic) {
             // Cache the result of the addionion in local 0
             writeU8(os, WASM_OPCODE_LOCAL_TEE, "local.tee");
@@ -1368,7 +1368,7 @@ void Writer::createInitMemoryFunction() {
       }
     }
 
-    if (config->sharedMemory) {
+    if (ctx.arg.sharedMemory) {
       // Set flag to 2 to mark end of initialization
       writeGetFlagAddress();
       writeI32Const(os, 2, "flag value");
@@ -1407,7 +1407,7 @@ void Writer::createInitMemoryFunction() {
       if (needsPassiveInitialization(s) && !s->isBss) {
         // The TLS region should not be dropped since its is needed
         // during the initialization of each thread (__wasm_init_tls).
-        if (config->sharedMemory && s->isTLS())
+        if (ctx.arg.sharedMemory && s->isTLS())
           continue;
         // data.drop instruction
         writeU8(os, WASM_OPCODE_MISC_PREFIX, "bulk-memory prefix");
@@ -1460,7 +1460,7 @@ void Writer::createApplyDataRelocationsFunction() {
     writeUleb128(os, 0, "num locals");
     bool generated = false;
     for (const OutputSegment *seg : segments)
-      if (!config->sharedMemory || !seg->isTLS())
+      if (!ctx.arg.sharedMemory || !seg->isTLS())
         for (const InputChunk *inSeg : seg->inputSegments)
           generated |= inSeg->generateRelocationCode(os);
 
@@ -1656,7 +1656,7 @@ void Writer::createInitTLSFunction() {
 // This is then used either when creating the output linking section or to
 // synthesize the "__wasm_call_ctors" function.
 void Writer::calculateInitFunctions() {
-  if (!config->relocatable && !WasmSym::callCtors->isLive())
+  if (!ctx.arg.relocatable && !WasmSym::callCtors->isLive())
     return;
 
   for (ObjFile *file : ctx.objectFiles) {
@@ -1708,7 +1708,7 @@ void Writer::run() {
   // For PIC code the table base is assigned dynamically by the loader.
   // For non-PIC, we start at 1 so that accessing table index 0 always traps.
   if (!ctx.isPic && WasmSym::definedTableBase)
-    WasmSym::definedTableBase->setVA(config->tableBase);
+    WasmSym::definedTableBase->setVA(ctx.arg.tableBase);
 
   log("-- createOutputSegments");
   createOutputSegments();
@@ -1717,7 +1717,7 @@ void Writer::run() {
   log("-- layoutMemory");
   layoutMemory();
 
-  if (!config->relocatable) {
+  if (!ctx.arg.relocatable) {
     // Create linker synthesized __start_SECNAME/__stop_SECNAME symbols
     // This has to be done after memory layout is performed.
     for (const OutputSegment *seg : segments) {
@@ -1725,7 +1725,7 @@ void Writer::run() {
     }
   }
 
-  for (auto &pair : config->exportedSymbols) {
+  for (auto &pair : ctx.arg.exportedSymbols) {
     Symbol *sym = symtab->find(pair.first());
     if (sym && sym->isDefined())
       sym->forceExport = true;
@@ -1733,12 +1733,12 @@ void Writer::run() {
 
   // Delay reporting errors about explicit exports until after
   // addStartStopSymbols which can create optional symbols.
-  for (auto &name : config->requiredExports) {
+  for (auto &name : ctx.arg.requiredExports) {
     Symbol *sym = symtab->find(name);
     if (!sym || !sym->isDefined()) {
-      if (config->unresolvedSymbols == UnresolvedPolicy::ReportError)
+      if (ctx.arg.unresolvedSymbols == UnresolvedPolicy::ReportError)
         error(Twine("symbol exported via --export not found: ") + name);
-      if (config->unresolvedSymbols == UnresolvedPolicy::Warn)
+      if (ctx.arg.unresolvedSymbols == UnresolvedPolicy::Warn)
         warn(Twine("symbol exported via --export not found: ") + name);
     }
   }
@@ -1750,7 +1750,7 @@ void Writer::run() {
   // `__memory_base` import.  Unless we support the extended const expression we
   // can't do addition inside the constant expression, so we much combine the
   // segments into a single one that can live at `__memory_base`.
-  if (ctx.isPic && !config->extendedConst && !config->sharedMemory) {
+  if (ctx.isPic && !ctx.arg.extendedConst && !ctx.arg.sharedMemory) {
     // In shared memory mode all data segments are passive and initialized
     // via __wasm_init_memory.
     log("-- combineOutputSegments");
@@ -1774,7 +1774,7 @@ void Writer::run() {
   log("-- calculateInitFunctions");
   calculateInitFunctions();
 
-  if (!config->relocatable) {
+  if (!ctx.arg.relocatable) {
     // Create linker synthesized functions
     if (WasmSym::applyGlobalRelocs)
       createApplyGlobalRelocationsFunction();
@@ -1793,7 +1793,7 @@ void Writer::run() {
     // If the input contains a call to `__wasm_call_ctors`, either in one of
     // the input objects or an explicit export from the command-line, we
     // assume ctors and dtors are taken care of already.
-    if (!config->relocatable && !ctx.isPic &&
+    if (!ctx.arg.relocatable && !ctx.isPic &&
         !WasmSym::callCtors->isUsedInRegularObj &&
         !WasmSym::callCtors->isExported()) {
       log("-- createCommandExportWrappers");
@@ -1861,14 +1861,14 @@ void Writer::run() {
 
 // Open a result file.
 void Writer::openFile() {
-  log("writing: " + config->outputFile);
+  log("writing: " + ctx.arg.outputFile);
 
   Expected<std::unique_ptr<FileOutputBuffer>> bufferOrErr =
-      FileOutputBuffer::create(config->outputFile, fileSize,
+      FileOutputBuffer::create(ctx.arg.outputFile, fileSize,
                                FileOutputBuffer::F_executable);
 
   if (!bufferOrErr)
-    error("failed to open " + config->outputFile + ": " +
+    error("failed to open " + ctx.arg.outputFile + ": " +
           toString(bufferOrErr.takeError()));
   else
     buffer = std::move(*bufferOrErr);
diff --git a/lldb/bindings/python/python-swigsafecast.swig b/lldb/bindings/python/python-swigsafecast.swig
index 7a4f7e8..429baad 100644
--- a/lldb/bindings/python/python-swigsafecast.swig
+++ b/lldb/bindings/python/python-swigsafecast.swig
@@ -23,6 +23,11 @@ PythonObject SWIGBridge::ToSWIGWrapper(lldb::ProcessSP process_sp) {
                       SWIGTYPE_p_lldb__SBProcess);
 }
 
+PythonObject SWIGBridge::ToSWIGWrapper(lldb::ModuleSP module_sp) {
+  return ToSWIGHelper(new lldb::SBModule(std::move(module_sp)),
+                      SWIGTYPE_p_lldb__SBModule);
+}
+
 PythonObject SWIGBridge::ToSWIGWrapper(lldb::ThreadPlanSP thread_plan_sp) {
   return ToSWIGHelper(new lldb::SBThreadPlan(std::move(thread_plan_sp)),
                       SWIGTYPE_p_lldb__SBThreadPlan);
diff --git a/lldb/cmake/modules/LLDBConfig.cmake b/lldb/cmake/modules/LLDBConfig.cmake
index ee4c263..9bb37f5 100644
--- a/lldb/cmake/modules/LLDBConfig.cmake
+++ b/lldb/cmake/modules/LLDBConfig.cmake
@@ -292,7 +292,7 @@ endif()
 
 # Figure out if lldb could use lldb-server.  If so, then we'll
 # ensure we build lldb-server when an lldb target is being built.
-if (CMAKE_SYSTEM_NAME MATCHES "Android|Darwin|FreeBSD|Linux|NetBSD|Windows")
+if (CMAKE_SYSTEM_NAME MATCHES "Android|Darwin|FreeBSD|Linux|NetBSD|OpenBSD|Windows")
   set(LLDB_CAN_USE_LLDB_SERVER ON)
 else()
   set(LLDB_CAN_USE_LLDB_SERVER OFF)
diff --git a/lldb/docs/resources/formatterbytecode.rst b/lldb/docs/resources/formatterbytecode.rst
index 20e1483..34fb0f7 100644
--- a/lldb/docs/resources/formatterbytecode.rst
+++ b/lldb/docs/resources/formatterbytecode.rst
@@ -75,6 +75,7 @@ These manipulate the control stack and program counter. Both `if` and `ifelse` e
  0x12      `ifelse`    `(UInt -> )` pop two blocks from the control stack, if
                        the top of the data stack is nonzero, execute the first,
                        otherwise the second.
+ 0x13      `return`    pop the entire control stack and return
 ========  ==========  ============================================================
 
 Literals for basic types
diff --git a/lldb/docs/use/map.rst b/lldb/docs/use/map.rst
index fe9c3f5..ed285b2 100644
--- a/lldb/docs/use/map.rst
+++ b/lldb/docs/use/map.rst
@@ -235,6 +235,38 @@ Do a source level single step in the currently selected thread
   (lldb) step
   (lldb) s
 
+Ignore a function when doing a source level single step in
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. code-block:: shell
+
+  (gdb) skip abc
+  Function abc will be skipped when stepping.
+
+.. code-block:: shell
+
+  (lldb) settings show target.process.thread.step-avoid-regexp
+  target.process.thread.step-avoid-regexp (regex) = ^std::
+  (lldb) settings set target.process.thread.step-avoid-regexp ^std::|^abc
+
+You can ignore a function once using:
+
+.. code-block:: shell
+
+  (lldb) thread step-in -r ^abc
+
+Or you can do the opposite, only step into functions matching a certain name:
+
+.. code-block:: shell
+
+  # Step in if abc is a substring of the function name.
+  (lldb) sif abc
+  # Which is equivalent to:
+  (lldb) thread step-in -t abc
+
+``thread step-in`` has more options which cover some of ``skip``'s other
+features. See ``help thread step-in`` for details.
+
 Do a source level single step over in the currently selected thread
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
diff --git a/lldb/examples/python/formatter_bytecode.py b/lldb/examples/python/formatter_bytecode.py
index ccd0c68..36a14be 100644
--- a/lldb/examples/python/formatter_bytecode.py
+++ b/lldb/examples/python/formatter_bytecode.py
@@ -35,6 +35,7 @@ define_opcode(6, "rot", "rot")
 define_opcode(0x10, "{", "begin")
 define_opcode(0x11, "if", "if")
 define_opcode(0x12, "ifelse", "ifelse")
+define_opcode(0x13, "return", "return")
 
 define_opcode(0x20, None, "lit_uint")
 define_opcode(0x21, None, "lit_int")
@@ -342,6 +343,9 @@ def interpret(bytecode: bytearray, control: list, data: list, tracing: bool = Fa
             else:
                 frame.append(control.pop())
                 control.pop()
+        elif b == op_return:
+            control.clear()
+            return data[-1]
 
         # Literals.
         elif b == op_lit_uint:
diff --git a/lldb/include/lldb/API/SBModule.h b/lldb/include/lldb/API/SBModule.h
index 7200a1e..8533206 100644
--- a/lldb/include/lldb/API/SBModule.h
+++ b/lldb/include/lldb/API/SBModule.h
@@ -301,9 +301,12 @@ private:
   friend class SBFrame;
   friend class SBSection;
   friend class SBSymbolContext;
+  friend class SBPlatform;
   friend class SBTarget;
   friend class SBType;
 
+  friend class lldb_private::python::SWIGBridge;
+
   explicit SBModule(const lldb::ModuleSP &module_sp);
 
   ModuleSP GetSP() const;
diff --git a/lldb/include/lldb/Core/Progress.h b/lldb/include/lldb/Core/Progress.h
index f6cea28..5876eae 100644
--- a/lldb/include/lldb/Core/Progress.h
+++ b/lldb/include/lldb/Core/Progress.h
@@ -59,6 +59,12 @@ namespace lldb_private {
 
 class Progress {
 public:
+  /// Enum to indicate the origin of a progress event, internal or external.
+  enum class Origin : uint8_t {
+    eInternal = 0,
+    eExternal = 1,
+  };
+
   /// Construct a progress object that will report information.
   ///
   /// The constructor will create a unique progress reporting object and
@@ -83,7 +89,8 @@ public:
   Progress(std::string title, std::string details = {},
            std::optional<uint64_t> total = std::nullopt,
            lldb_private::Debugger *debugger = nullptr,
-           Timeout<std::nano> minimum_report_time = std::nullopt);
+           Timeout<std::nano> minimum_report_time = std::nullopt,
+           Origin origin = Origin::eInternal);
 
   /// Destroy the progress object.
   ///
@@ -118,6 +125,9 @@ public:
     /// The optional debugger ID to report progress to. If this has no value
     /// then all debuggers will receive this event.
     std::optional<lldb::user_id_t> debugger_id;
+
+    /// The origin of the progress event, wheter it is internal or external.
+    Origin origin;
   };
 
 private:
diff --git a/lldb/include/lldb/Host/aix/HostInfoAIX.h b/lldb/include/lldb/Host/aix/HostInfoAIX.h
new file mode 100644
index 0000000..7796a15
--- /dev/null
+++ b/lldb/include/lldb/Host/aix/HostInfoAIX.h
@@ -0,0 +1,28 @@
+//===-- HostInfoAIX.h -----------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLDB_HOST_AIX_HOSTINFOAIX_H_
+#define LLDB_HOST_AIX_HOSTINFOAIX_H_
+
+#include "lldb/Host/posix/HostInfoPosix.h"
+#include "lldb/Utility/FileSpec.h"
+
+namespace lldb_private {
+
+class HostInfoAIX : public HostInfoPosix {
+  friend class HostInfoBase;
+
+public:
+  static void Initialize(SharedLibraryDirectoryHelper *helper = nullptr);
+  static void Terminate();
+
+  static FileSpec GetProgramFileSpec();
+};
+} // namespace lldb_private
+
+#endif // LLDB_HOST_AIX_HOSTINFOAIX_H_
diff --git a/lldb/include/lldb/lldb-enumerations.h b/lldb/include/lldb/lldb-enumerations.h
index 0094fcd..50d2233 100644
--- a/lldb/include/lldb/lldb-enumerations.h
+++ b/lldb/include/lldb/lldb-enumerations.h
@@ -1357,6 +1357,8 @@ enum DebuggerBroadcastBit {
   eBroadcastBitError = (1 << 2),
   eBroadcastSymbolChange = (1 << 3),
   eBroadcastBitProgressCategory = (1 << 4),
+  eBroadcastBitExternalProgress = (1 << 5),
+  eBroadcastBitExternalProgressCategory = (1 << 6),
 };
 
 /// Used for expressing severity in logs and diagnostics.
diff --git a/lldb/packages/Python/lldbsuite/test/tools/lldb-server/gdbremote_testcase.py b/lldb/packages/Python/lldbsuite/test/tools/lldb-server/gdbremote_testcase.py
index d6cb68f..cbe430c 100644
--- a/lldb/packages/Python/lldbsuite/test/tools/lldb-server/gdbremote_testcase.py
+++ b/lldb/packages/Python/lldbsuite/test/tools/lldb-server/gdbremote_testcase.py
@@ -1410,7 +1410,17 @@ class GdbRemoteTestCaseBase(Base, metaclass=GdbRemoteTestCaseFactory):
             p_response = context.get("p_response")
             self.assertIsNotNone(p_response)
             self.assertTrue(len(p_response) > 0)
-            self.assertFalse(p_response[0] == "E")
+
+            # on x86 Darwin, 4 GPR registers are often
+            # unavailable, this is expected and correct.
+            if (
+                self.getArchitecture() == "x86_64"
+                and self.platformIsDarwin()
+                and p_response[0] == "E"
+            ):
+                values[reg_index] = 0
+            else:
+                self.assertFalse(p_response[0] == "E")
 
             values[reg_index] = unpack_register_hex_unsigned(endian, p_response)
 
diff --git a/lldb/source/Core/Progress.cpp b/lldb/source/Core/Progress.cpp
index ed8dfb8..63f9804 100644
--- a/lldb/source/Core/Progress.cpp
+++ b/lldb/source/Core/Progress.cpp
@@ -28,12 +28,14 @@ static llvm::ManagedStatic<llvm::SignpostEmitter> g_progress_signposts;
 Progress::Progress(std::string title, std::string details,
                    std::optional<uint64_t> total,
                    lldb_private::Debugger *debugger,
-                   Timeout<std::nano> minimum_report_time)
+                   Timeout<std::nano> minimum_report_time,
+                   Progress::Origin origin)
     : m_total(total.value_or(Progress::kNonDeterministicTotal)),
       m_minimum_report_time(minimum_report_time),
       m_progress_data{title, ++g_id,
                       debugger ? std::optional<user_id_t>(debugger->GetID())
-                               : std::nullopt},
+                               : std::nullopt,
+                      origin},
       m_last_report_time_ns(
           std::chrono::nanoseconds(
               std::chrono::steady_clock::now().time_since_epoch())
@@ -106,9 +108,15 @@ void Progress::ReportProgress() {
   if (completed < m_prev_completed)
     return; // An overflow in the m_completed counter. Just ignore these events.
 
+  // Change the category bit if we're an internal or external progress.
+  uint32_t progress_category_bit =
+      m_progress_data.origin == Progress::Origin::eExternal
+          ? lldb::eBroadcastBitExternalProgress
+          : lldb::eBroadcastBitProgress;
+
   Debugger::ReportProgress(m_progress_data.progress_id, m_progress_data.title,
                            m_details, completed, m_total,
-                           m_progress_data.debugger_id);
+                           m_progress_data.debugger_id, progress_category_bit);
   m_prev_completed = completed;
 }
 
@@ -201,10 +209,13 @@ void ProgressManager::ReportProgress(
   // broadcasting to it since that bit doesn't need that information.
   const uint64_t completed =
       (type == EventType::Begin) ? 0 : Progress::kNonDeterministicTotal;
+  const uint32_t progress_category_bit =
+      progress_data.origin == Progress::Origin::eExternal
+          ? lldb::eBroadcastBitExternalProgressCategory
+          : lldb::eBroadcastBitProgressCategory;
   Debugger::ReportProgress(progress_data.progress_id, progress_data.title, "",
                            completed, Progress::kNonDeterministicTotal,
-                           progress_data.debugger_id,
-                           lldb::eBroadcastBitProgressCategory);
+                           progress_data.debugger_id, progress_category_bit);
 }
 
 void ProgressManager::Expire(llvm::StringRef key) {
diff --git a/lldb/source/DataFormatters/FormatterBytecode.cpp b/lldb/source/DataFormatters/FormatterBytecode.cpp
index e49c750..7f3dbe0 100644
--- a/lldb/source/DataFormatters/FormatterBytecode.cpp
+++ b/lldb/source/DataFormatters/FormatterBytecode.cpp
@@ -304,6 +304,9 @@ llvm::Error Interpret(std::vector<ControlStackElement> &control,
       control.pop_back();
       activate_block();
       continue;
+    case op_return:
+      control.clear();
+      return pc.takeError();
 
     // Literals.
     case op_lit_uint:
diff --git a/lldb/source/DataFormatters/FormatterBytecode.def b/lldb/source/DataFormatters/FormatterBytecode.def
index c664563..29e0bee 100644
--- a/lldb/source/DataFormatters/FormatterBytecode.def
+++ b/lldb/source/DataFormatters/FormatterBytecode.def
@@ -27,6 +27,7 @@ DEFINE_OPCODE(0x06, "rot",  rot)
 DEFINE_OPCODE(0x10, "{", begin)
 DEFINE_OPCODE(0x11, "if", if)
 DEFINE_OPCODE(0x12, "ifelse", ifelse)
+DEFINE_OPCODE(0x13, "return", return)
 
 DEFINE_OPCODE(0x20, nullptr, lit_uint)
 DEFINE_OPCODE(0x21, nullptr, lit_int)
diff --git a/lldb/source/Host/CMakeLists.txt b/lldb/source/Host/CMakeLists.txt
index c2e091e..e0cd856 100644
--- a/lldb/source/Host/CMakeLists.txt
+++ b/lldb/source/Host/CMakeLists.txt
@@ -133,6 +133,11 @@ else()
       openbsd/Host.cpp
       openbsd/HostInfoOpenBSD.cpp
       )
+
+  elseif (CMAKE_SYSTEM_NAME MATCHES "AIX")
+    add_host_subdirectory(aix
+      aix/HostInfoAIX.cpp
+      )    
   endif()
 endif()
 
diff --git a/lldb/source/Host/aix/HostInfoAIX.cpp b/lldb/source/Host/aix/HostInfoAIX.cpp
new file mode 100644
index 0000000..61b4746
--- /dev/null
+++ b/lldb/source/Host/aix/HostInfoAIX.cpp
@@ -0,0 +1,22 @@
+//===-- HostInfoAIX.cpp -------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "lldb/Host/aix/HostInfoAIX.h"
+
+using namespace lldb_private;
+
+void HostInfoAIX::Initialize(SharedLibraryDirectoryHelper *helper) {
+  HostInfoPosix::Initialize(helper);
+}
+
+void HostInfoAIX::Terminate() { HostInfoBase::Terminate(); }
+
+FileSpec HostInfoAIX::GetProgramFileSpec() {
+  static FileSpec g_program_filespec;
+  return g_program_filespec;
+}
diff --git a/lldb/source/Host/openbsd/Host.cpp b/lldb/source/Host/openbsd/Host.cpp
index a4dc391..24650ff9 100644
--- a/lldb/source/Host/openbsd/Host.cpp
+++ b/lldb/source/Host/openbsd/Host.cpp
@@ -41,18 +41,7 @@ namespace lldb_private {
 class ProcessLaunchInfo;
 }
 
-Environment Host::GetEnvironment() {
-  Environment env;
-  char *v;
-  char **var = environ;
-  for (; var != NULL && *var != NULL; ++var) {
-    v = strchr(*var, (int)'-');
-    if (v == NULL)
-      continue;
-    env.insert(v);
-  }
-  return env;
-}
+Environment Host::GetEnvironment() { return Environment(environ); }
 
 static bool
 GetOpenBSDProcessArgs(const ProcessInstanceInfoMatch *match_info_ptr,
@@ -127,7 +116,7 @@ static bool GetOpenBSDProcessUserAndGroup(ProcessInstanceInfo &process_info) {
         process_info.SetUserID(proc_kinfo.p_ruid);
         process_info.SetGroupID(proc_kinfo.p_rgid);
         process_info.SetEffectiveUserID(proc_kinfo.p_uid);
-	process_info.SetEffectiveGroupID(proc_kinfo.p_gid);
+        process_info.SetEffectiveGroupID(proc_kinfo.p_gid);
         return true;
       }
     }
diff --git a/lldb/source/Host/posix/ConnectionFileDescriptorPosix.cpp b/lldb/source/Host/posix/ConnectionFileDescriptorPosix.cpp
index 6bdc33f..0ed2016 100644
--- a/lldb/source/Host/posix/ConnectionFileDescriptorPosix.cpp
+++ b/lldb/source/Host/posix/ConnectionFileDescriptorPosix.cpp
@@ -119,8 +119,7 @@ bool ConnectionFileDescriptor::IsConnected() const {
 
 ConnectionStatus ConnectionFileDescriptor::Connect(llvm::StringRef path,
                                                    Status *error_ptr) {
-  return Connect(
-      path, [](llvm::StringRef) {}, error_ptr);
+  return Connect(path, [](llvm::StringRef) {}, error_ptr);
 }
 
 ConnectionStatus
@@ -731,9 +730,19 @@ ConnectionStatus ConnectionFileDescriptor::ConnectFile(
     struct termios options;
     ::tcgetattr(fd, &options);
 
-    // Set port speed to maximum
+    // Set port speed to the available maximum
+#ifdef B115200
     ::cfsetospeed(&options, B115200);
     ::cfsetispeed(&options, B115200);
+#elif B57600
+    ::cfsetospeed(&options, B57600);
+    ::cfsetispeed(&options, B57600);
+#elif B38400
+    ::cfsetospeed(&options, B38400);
+    ::cfsetispeed(&options, B38400);
+#else
+#error "Maximum Baud rate is Unknown"
+#endif
 
     // Raw input, disable echo and signals
     options.c_lflag &= ~(ICANON | ECHO | ECHOE | ISIG);
diff --git a/lldb/source/Host/posix/DomainSocket.cpp b/lldb/source/Host/posix/DomainSocket.cpp
index 9a0b385..be8fcdf 100644
--- a/lldb/source/Host/posix/DomainSocket.cpp
+++ b/lldb/source/Host/posix/DomainSocket.cpp
@@ -86,7 +86,8 @@ Status DomainSocket::Connect(llvm::StringRef name) {
   if (error.Fail())
     return error;
   if (llvm::sys::RetryAfterSignal(-1, ::connect, GetNativeSocket(),
-        (struct sockaddr *)&saddr_un, saddr_un_len) < 0)
+                                  (struct sockaddr *)&saddr_un,
+                                  saddr_un_len) < 0)
     SetLastError(error);
 
   return error;
@@ -181,7 +182,7 @@ std::vector<std::string> DomainSocket::GetListeningConnectionURI() const {
     return {};
 
   struct sockaddr_un addr;
-  bzero(&addr, sizeof(struct sockaddr_un));
+  memset(&addr, 0, sizeof(struct sockaddr_un));
   addr.sun_family = AF_UNIX;
   socklen_t addr_len = sizeof(struct sockaddr_un);
   if (::getsockname(m_socket, (struct sockaddr *)&addr, &addr_len) != 0)
diff --git a/lldb/source/Host/posix/FileSystemPosix.cpp b/lldb/source/Host/posix/FileSystemPosix.cpp
index 945e2af..4c326a2 100644
--- a/lldb/source/Host/posix/FileSystemPosix.cpp
+++ b/lldb/source/Host/posix/FileSystemPosix.cpp
@@ -9,16 +9,8 @@
 #include "lldb/Host/FileSystem.h"
 
 // C includes
-#include <dirent.h>
 #include <fcntl.h>
-#include <sys/mount.h>
-#include <sys/param.h>
-#include <sys/stat.h>
-#include <sys/types.h>
 #include <unistd.h>
-#if defined(__NetBSD__)
-#include <sys/statvfs.h>
-#endif
 
 // lldb Includes
 #include "lldb/Host/Host.h"
diff --git a/lldb/source/Host/posix/MainLoopPosix.cpp b/lldb/source/Host/posix/MainLoopPosix.cpp
index aecdeb9..ce7caa3 100644
--- a/lldb/source/Host/posix/MainLoopPosix.cpp
+++ b/lldb/source/Host/posix/MainLoopPosix.cpp
@@ -99,6 +99,7 @@ public:
   ~RunImpl() = default;
 
   Status Poll();
+
   void ProcessReadEvents();
 
 private:
@@ -159,6 +160,22 @@ MainLoopPosix::RunImpl::RunImpl(MainLoopPosix &loop) : loop(loop) {
   read_fds.reserve(loop.m_read_fds.size());
 }
 
+static int StartPoll(llvm::MutableArrayRef<struct pollfd> fds,
+                     std::optional<MainLoopPosix::TimePoint> point) {
+#if HAVE_PPOLL
+  return ppoll(fds.data(), fds.size(), ToTimeSpec(point),
+               /*sigmask=*/nullptr);
+#else
+  using namespace std::chrono;
+  int timeout = -1;
+  if (point) {
+    nanoseconds dur = std::max(*point - steady_clock::now(), nanoseconds(0));
+    timeout = ceil<milliseconds>(dur).count();
+  }
+  return poll(fds.data(), fds.size(), timeout);
+#endif
+}
+
 Status MainLoopPosix::RunImpl::Poll() {
   read_fds.clear();
 
@@ -169,11 +186,9 @@ Status MainLoopPosix::RunImpl::Poll() {
     pfd.revents = 0;
     read_fds.push_back(pfd);
   }
+  int ready = StartPoll(read_fds, loop.GetNextWakeupTime());
 
-  if (ppoll(read_fds.data(), read_fds.size(),
-            ToTimeSpec(loop.GetNextWakeupTime()),
-            /*sigmask=*/nullptr) == -1 &&
-      errno != EINTR)
+  if (ready == -1 && errno != EINTR)
     return Status(errno, eErrorTypePOSIX);
 
   return Status();
diff --git a/lldb/source/Initialization/CMakeLists.txt b/lldb/source/Initialization/CMakeLists.txt
index c1a1678..b6282e1 100644
--- a/lldb/source/Initialization/CMakeLists.txt
+++ b/lldb/source/Initialization/CMakeLists.txt
@@ -1,4 +1,4 @@
-if ( CMAKE_SYSTEM_NAME MATCHES "Linux|Android|FreeBSD|NetBSD" )
+if ( CMAKE_SYSTEM_NAME MATCHES "Linux|Android|FreeBSD|NetBSD|OpenBSD" )
   list(APPEND EXTRA_PLUGINS lldbPluginProcessPOSIX)
 endif()
 
diff --git a/lldb/source/Plugins/ABI/X86/ABISysV_x86_64.cpp b/lldb/source/Plugins/ABI/X86/ABISysV_x86_64.cpp
index ad13a44..54028b1 100644
--- a/lldb/source/Plugins/ABI/X86/ABISysV_x86_64.cpp
+++ b/lldb/source/Plugins/ABI/X86/ABISysV_x86_64.cpp
@@ -95,6 +95,7 @@ ABISysV_x86_64::CreateInstance(lldb::ProcessSP process_sp, const ArchSpec &arch)
     case llvm::Triple::OSType::Linux:
     case llvm::Triple::OSType::MacOSX:
     case llvm::Triple::OSType::NetBSD:
+    case llvm::Triple::OSType::OpenBSD:
     case llvm::Triple::OSType::Solaris:
     case llvm::Triple::OSType::UnknownOS:
       return ABISP(
diff --git a/lldb/source/Plugins/InstrumentationRuntime/ASanLibsanitizers/InstrumentationRuntimeASanLibsanitizers.cpp b/lldb/source/Plugins/InstrumentationRuntime/ASanLibsanitizers/InstrumentationRuntimeASanLibsanitizers.cpp
index cd91f4a..b1151fe 100644
--- a/lldb/source/Plugins/InstrumentationRuntime/ASanLibsanitizers/InstrumentationRuntimeASanLibsanitizers.cpp
+++ b/lldb/source/Plugins/InstrumentationRuntime/ASanLibsanitizers/InstrumentationRuntimeASanLibsanitizers.cpp
@@ -90,17 +90,9 @@ void InstrumentationRuntimeASanLibsanitizers::Activate() {
   if (!process_sp)
     return;
 
-  lldb::ModuleSP module_sp = GetRuntimeModuleSP();
-
   Breakpoint *breakpoint = ReportRetriever::SetupBreakpoint(
-      module_sp, process_sp, ConstString("sanitizers_address_on_report"));
-
-  if (!breakpoint) {
-    breakpoint = ReportRetriever::SetupBreakpoint(
-        module_sp, process_sp,
-        ConstString("_Z22raise_sanitizers_error23sanitizer_error_context"));
-  }
-
+      GetRuntimeModuleSP(), process_sp,
+      ConstString("sanitizers_address_on_report"));
   if (!breakpoint)
     return;
 
diff --git a/lldb/source/Plugins/InstrumentationRuntime/Utility/ReportRetriever.cpp b/lldb/source/Plugins/InstrumentationRuntime/Utility/ReportRetriever.cpp
index 04ce339..74e0fa7 100644
--- a/lldb/source/Plugins/InstrumentationRuntime/Utility/ReportRetriever.cpp
+++ b/lldb/source/Plugins/InstrumentationRuntime/Utility/ReportRetriever.cpp
@@ -219,7 +219,6 @@ bool ReportRetriever::NotifyBreakpointHit(ProcessSP process_sp,
   return true; // Return true to stop the target
 }
 
-// FIXME: Setup the breakpoint using a less fragile SPI. rdar://124399066
 Breakpoint *ReportRetriever::SetupBreakpoint(ModuleSP module_sp,
                                              ProcessSP process_sp,
                                              ConstString symbol_name) {
@@ -235,19 +234,13 @@ Breakpoint *ReportRetriever::SetupBreakpoint(ModuleSP module_sp,
   if (!symbol->ValueIsAddress() || !symbol->GetAddressRef().IsValid())
     return nullptr;
 
-  Target &target = process_sp->GetTarget();
-  addr_t symbol_address = symbol->GetAddressRef().GetOpcodeLoadAddress(&target);
-
-  if (symbol_address == LLDB_INVALID_ADDRESS)
-    return nullptr;
-
+  const Address &address = symbol->GetAddressRef();
   const bool internal = true;
   const bool hardware = false;
 
-  Breakpoint *breakpoint =
-      process_sp->GetTarget()
-          .CreateBreakpoint(symbol_address, internal, hardware)
-          .get();
+  Breakpoint *breakpoint = process_sp->GetTarget()
+                               .CreateBreakpoint(address, internal, hardware)
+                               .get();
 
   return breakpoint;
 }
diff --git a/lldb/source/Plugins/Language/ObjC/Cocoa.cpp b/lldb/source/Plugins/Language/ObjC/Cocoa.cpp
index bbe5d4c..1d79edb 100644
--- a/lldb/source/Plugins/Language/ObjC/Cocoa.cpp
+++ b/lldb/source/Plugins/Language/ObjC/Cocoa.cpp
@@ -31,7 +31,6 @@
 #include "llvm/ADT/APInt.h"
 #include "llvm/ADT/bit.h"
 
-
 using namespace lldb;
 using namespace lldb_private;
 using namespace lldb_private::formatters;
@@ -267,21 +266,21 @@ bool lldb_private::formatters::NSIndexSetSummaryProvider(
     if (class_name == "NSIndexSet" || class_name == "NSMutableIndexSet") {
       // Foundation version 2000 added a bitmask if the index set fit in 64 bits
       // and a Tagged Pointer version if the bitmask is small enough to fit in
-      // the tagged pointer payload.  
+      // the tagged pointer payload.
       // It also changed the layout (but not the size) of the set descriptor.
 
       // First check whether this is a tagged pointer.  The bitmask will be in
       // the payload of the tagged pointer.
       uint64_t payload;
-      if (runtime->GetFoundationVersion() >= 2000  
-          && descriptor->GetTaggedPointerInfo(nullptr, nullptr, &payload)) {
+      if (runtime->GetFoundationVersion() >= 2000 &&
+          descriptor->GetTaggedPointerInfo(nullptr, nullptr, &payload)) {
         count = llvm::popcount(payload);
         break;
       }
       // The first 32 bits describe the index set in all cases:
       Status error;
       uint32_t mode = process_sp->ReadUnsignedIntegerFromMemory(
-            valobj_addr + ptr_size, 4, 0, error);
+          valobj_addr + ptr_size, 4, 0, error);
       if (error.Fail())
         return false;
       // Now check if the index is held in a bitmask in the object:
@@ -292,7 +291,7 @@ bool lldb_private::formatters::NSIndexSetSummaryProvider(
         if ((mode & 2) == 2) {
           // The bitfield is a 64 bit uint at the beginning of the data var.
           uint64_t bitfield = process_sp->ReadUnsignedIntegerFromMemory(
-            valobj_addr + 2 * ptr_size, 8, 0, error);
+              valobj_addr + 2 * ptr_size, 8, 0, error);
           if (error.Fail())
             return false;
           count = llvm::popcount(bitfield);
@@ -309,7 +308,7 @@ bool lldb_private::formatters::NSIndexSetSummaryProvider(
           count = 0;
           break;
         }
-      
+
         if ((mode & 2) == 2)
           mode = 1; // this means the set only has one range
         else
@@ -1227,7 +1226,7 @@ bool lldb_private::formatters::ObjCSELSummaryProvider(
 time_t lldb_private::formatters::GetOSXEpoch() {
   static time_t epoch = 0;
   if (!epoch) {
-#ifndef _WIN32
+#if !defined(_WIN32) && !defined(_AIX)
     tzset();
     tm tm_epoch;
     tm_epoch.tm_sec = 0;
diff --git a/lldb/source/Plugins/ObjectContainer/BSD-Archive/ObjectContainerBSDArchive.cpp b/lldb/source/Plugins/ObjectContainer/BSD-Archive/ObjectContainerBSDArchive.cpp
index 7aa5b8d..b202898 100644
--- a/lldb/source/Plugins/ObjectContainer/BSD-Archive/ObjectContainerBSDArchive.cpp
+++ b/lldb/source/Plugins/ObjectContainer/BSD-Archive/ObjectContainerBSDArchive.cpp
@@ -8,7 +8,7 @@
 
 #include "ObjectContainerBSDArchive.h"
 
-#if defined(_WIN32) || defined(__ANDROID__)
+#if defined(_WIN32) || defined(__ANDROID__) || defined(_AIX)
 // Defines from ar, missing on Windows
 #define SARMAG 8
 #define ARFMAG "`\n"
@@ -81,10 +81,10 @@ size_t ObjectContainerBSDArchive::Archive::ParseObjects() {
 
   std::unique_ptr<llvm::MemoryBuffer> mem_buffer =
       llvm::MemoryBuffer::getMemBuffer(
-            llvm::StringRef((const char *)data.GetDataStart(),
-                            data.GetByteSize()),
-            llvm::StringRef(),
-            /*RequiresNullTerminator=*/false);
+          llvm::StringRef((const char *)data.GetDataStart(),
+                          data.GetByteSize()),
+          llvm::StringRef(),
+          /*RequiresNullTerminator=*/false);
 
   auto exp_ar = llvm::object::Archive::create(mem_buffer->getMemBufferRef());
   if (!exp_ar) {
@@ -95,7 +95,7 @@ size_t ObjectContainerBSDArchive::Archive::ParseObjects() {
 
   llvm::Error iter_err = llvm::Error::success();
   Object obj;
-  for (const auto &child: llvm_archive->children(iter_err)) {
+  for (const auto &child : llvm_archive->children(iter_err)) {
     obj.Clear();
     auto exp_name = child.getName();
     if (exp_name) {
@@ -111,7 +111,9 @@ size_t ObjectContainerBSDArchive::Archive::ParseObjects() {
       obj.modification_time =
           std::chrono::duration_cast<std::chrono::seconds>(
               std::chrono::time_point_cast<std::chrono::seconds>(
-                    exp_mtime.get()).time_since_epoch()).count();
+                  exp_mtime.get())
+                  .time_since_epoch())
+              .count();
     } else {
       LLDB_LOG_ERROR(l, exp_mtime.takeError(),
                      "failed to get archive object time: {0}");
@@ -331,21 +333,21 @@ ObjectContainer *ObjectContainerBSDArchive::CreateInstance(
 ArchiveType
 ObjectContainerBSDArchive::MagicBytesMatch(const DataExtractor &data) {
   uint32_t offset = 0;
-  const char *armag = (const char *)data.PeekData(offset,
-                                                  sizeof(ar_hdr) + SARMAG);
+  const char *armag =
+      (const char *)data.PeekData(offset, sizeof(ar_hdr) + SARMAG);
   if (armag == nullptr)
     return ArchiveType::Invalid;
   ArchiveType result = ArchiveType::Invalid;
   if (strncmp(armag, ArchiveMagic, SARMAG) == 0)
-      result = ArchiveType::Archive;
+    result = ArchiveType::Archive;
   else if (strncmp(armag, ThinArchiveMagic, SARMAG) == 0)
-      result = ArchiveType::ThinArchive;
+    result = ArchiveType::ThinArchive;
   else
-      return ArchiveType::Invalid;
+    return ArchiveType::Invalid;
 
   armag += offsetof(struct ar_hdr, ar_fmag) + SARMAG;
   if (strncmp(armag, ARFMAG, 2) == 0)
-      return result;
+    return result;
   return ArchiveType::Invalid;
 }
 
@@ -443,7 +445,8 @@ size_t ObjectContainerBSDArchive::GetModuleSpecifications(
     return 0;
 
   const size_t initial_count = specs.GetSize();
-  llvm::sys::TimePoint<> file_mod_time = FileSystem::Instance().GetModificationTime(file);
+  llvm::sys::TimePoint<> file_mod_time =
+      FileSystem::Instance().GetModificationTime(file);
   Archive::shared_ptr archive_sp(
       Archive::FindCachedArchive(file, ArchSpec(), file_mod_time, file_offset));
   bool set_archive_arch = false;
diff --git a/lldb/source/Plugins/ObjectFile/Minidump/MinidumpFileBuilder.cpp b/lldb/source/Plugins/ObjectFile/Minidump/MinidumpFileBuilder.cpp
index bcac5ed..c5013ea 100644
--- a/lldb/source/Plugins/ObjectFile/Minidump/MinidumpFileBuilder.cpp
+++ b/lldb/source/Plugins/ObjectFile/Minidump/MinidumpFileBuilder.cpp
@@ -58,8 +58,8 @@ Status MinidumpFileBuilder::AddHeaderAndCalculateDirectories() {
   // First set the offset on the file, and on the bytes saved
   m_saved_data_size = HEADER_SIZE;
   // We know we will have at least Misc, SystemInfo, Modules, and ThreadList
-  // (corresponding memory list for stacks) And an additional memory list for
-  // non-stacks.
+  // (corresponding memory list for stacks), an additional memory list for
+  // non-stacks, and a stream to mark this minidump was generated by LLDB.
   lldb_private::Target &target = m_process_sp->GetTarget();
   m_expected_directories = 6;
   // Check if OS is linux and reserve directory space for all linux specific
@@ -90,7 +90,10 @@ Status MinidumpFileBuilder::AddHeaderAndCalculateDirectories() {
         "sections. Written / Expected (%" PRIx64 " / %" PRIx64 ")",
         new_offset, m_saved_data_size);
 
-  return error;
+  if (error.Fail())
+    return error;
+
+  return AddLLDBGeneratedStream();
 }
 
 Status MinidumpFileBuilder::AddDirectory(StreamType type,
@@ -126,6 +129,12 @@ Status MinidumpFileBuilder::AddDirectory(StreamType type,
   return error;
 }
 
+Status MinidumpFileBuilder::AddLLDBGeneratedStream() {
+  Status error;
+  StreamType type = StreamType::LLDBGenerated;
+  return AddDirectory(type, 0);
+}
+
 Status MinidumpFileBuilder::AddSystemInfo() {
   Status error;
   const llvm::Triple &target_triple =
diff --git a/lldb/source/Plugins/ObjectFile/Minidump/MinidumpFileBuilder.h b/lldb/source/Plugins/ObjectFile/Minidump/MinidumpFileBuilder.h
index 58b2846..48293ee 100644
--- a/lldb/source/Plugins/ObjectFile/Minidump/MinidumpFileBuilder.h
+++ b/lldb/source/Plugins/ObjectFile/Minidump/MinidumpFileBuilder.h
@@ -120,6 +120,7 @@ public:
   void DeleteFile() noexcept;
 
 private:
+  lldb_private::Status AddLLDBGeneratedStream();
   // Add data to the end of the buffer, if the buffer exceeds the flush level,
   // trigger a flush.
   lldb_private::Status AddData(const void *data, uint64_t size);
diff --git a/lldb/source/Plugins/Platform/Android/PlatformAndroidRemoteGDBServer.cpp b/lldb/source/Plugins/Platform/Android/PlatformAndroidRemoteGDBServer.cpp
index d18b718d..0cf6480 100644
--- a/lldb/source/Plugins/Platform/Android/PlatformAndroidRemoteGDBServer.cpp
+++ b/lldb/source/Plugins/Platform/Android/PlatformAndroidRemoteGDBServer.cpp
@@ -64,7 +64,7 @@ static Status DeleteForwardPortWithAdb(uint16_t local_port,
 
 static Status FindUnusedPort(uint16_t &port) {
   Status error;
-  std::unique_ptr<TCPSocket> tcp_socket(new TCPSocket(true, false));
+  std::unique_ptr<TCPSocket> tcp_socket(new TCPSocket(true));
   if (error.Fail())
     return error;
 
diff --git a/lldb/source/Plugins/Process/CMakeLists.txt b/lldb/source/Plugins/Process/CMakeLists.txt
index a51d0f7..7f4f6fe 100644
--- a/lldb/source/Plugins/Process/CMakeLists.txt
+++ b/lldb/source/Plugins/Process/CMakeLists.txt
@@ -7,6 +7,8 @@ elseif (CMAKE_SYSTEM_NAME MATCHES "FreeBSD")
 elseif (CMAKE_SYSTEM_NAME MATCHES "NetBSD")
   add_subdirectory(NetBSD)
   add_subdirectory(POSIX)
+elseif (CMAKE_SYSTEM_NAME MATCHES "OpenBSD")
+  add_subdirectory(POSIX)
 elseif (CMAKE_SYSTEM_NAME MATCHES "Windows")
   add_subdirectory(Windows/Common)
 elseif (CMAKE_SYSTEM_NAME MATCHES "Darwin")
diff --git a/lldb/source/Plugins/Process/elf-core/ProcessElfCore.cpp b/lldb/source/Plugins/Process/elf-core/ProcessElfCore.cpp
index b3916cc..5f85f99 100644
--- a/lldb/source/Plugins/Process/elf-core/ProcessElfCore.cpp
+++ b/lldb/source/Plugins/Process/elf-core/ProcessElfCore.cpp
@@ -1031,6 +1031,8 @@ UUID ProcessElfCore::FindBuidIdInCoreMemory(lldb::addr_t address) {
 
   std::vector<uint8_t> ph_bytes;
   ph_bytes.resize(elf_header.e_phentsize);
+  lldb::addr_t base_addr = 0;
+  bool found_first_load_segment = false;
   for (unsigned int i = 0; i < elf_header.e_phnum; ++i) {
     byte_read = ReadMemory(ph_addr + i * elf_header.e_phentsize,
                            ph_bytes.data(), elf_header.e_phentsize, error);
@@ -1041,6 +1043,11 @@ UUID ProcessElfCore::FindBuidIdInCoreMemory(lldb::addr_t address) {
     offset = 0;
     elf::ELFProgramHeader program_header;
     program_header.Parse(program_header_data, &offset);
+    if (program_header.p_type == llvm::ELF::PT_LOAD &&
+        !found_first_load_segment) {
+      base_addr = program_header.p_vaddr;
+      found_first_load_segment = true;
+    }
     if (program_header.p_type != llvm::ELF::PT_NOTE)
       continue;
 
@@ -1049,7 +1056,7 @@ UUID ProcessElfCore::FindBuidIdInCoreMemory(lldb::addr_t address) {
 
     // We need to slide the address of the p_vaddr as these values don't get
     // relocated in memory.
-    const lldb::addr_t vaddr = program_header.p_vaddr + address;
+    const lldb::addr_t vaddr = program_header.p_vaddr + address - base_addr;
     byte_read =
         ReadMemory(vaddr, note_bytes.data(), program_header.p_memsz, error);
     if (byte_read != program_header.p_memsz)
diff --git a/lldb/source/Plugins/Process/minidump/MinidumpParser.cpp b/lldb/source/Plugins/Process/minidump/MinidumpParser.cpp
index afc095d..94c0a5f 100644
--- a/lldb/source/Plugins/Process/minidump/MinidumpParser.cpp
+++ b/lldb/source/Plugins/Process/minidump/MinidumpParser.cpp
@@ -49,6 +49,11 @@ llvm::ArrayRef<uint8_t> MinidumpParser::GetStream(StreamType stream_type) {
   return m_file->getRawStream(stream_type).value_or(llvm::ArrayRef<uint8_t>());
 }
 
+std::optional<llvm::ArrayRef<uint8_t>>
+MinidumpParser::GetRawStream(StreamType stream_type) {
+  return m_file->getRawStream(stream_type);
+}
+
 UUID MinidumpParser::GetModuleUUID(const minidump::Module *module) {
   auto cv_record =
       GetData().slice(module->CvRecord.RVA, module->CvRecord.DataSize);
@@ -651,6 +656,7 @@ MinidumpParser::GetStreamTypeAsString(StreamType stream_type) {
     ENUM_TO_CSTR(FacebookAbortReason);
     ENUM_TO_CSTR(FacebookThreadName);
     ENUM_TO_CSTR(FacebookLogcat);
+    ENUM_TO_CSTR(LLDBGenerated);
   }
   return "unknown stream type";
 }
diff --git a/lldb/source/Plugins/Process/minidump/MinidumpParser.h b/lldb/source/Plugins/Process/minidump/MinidumpParser.h
index f0b6e60..2c5e6f1 100644
--- a/lldb/source/Plugins/Process/minidump/MinidumpParser.h
+++ b/lldb/source/Plugins/Process/minidump/MinidumpParser.h
@@ -59,6 +59,7 @@ public:
   llvm::ArrayRef<uint8_t> GetData();
 
   llvm::ArrayRef<uint8_t> GetStream(StreamType stream_type);
+  std::optional<llvm::ArrayRef<uint8_t>> GetRawStream(StreamType stream_type);
 
   UUID GetModuleUUID(const minidump::Module *module);
 
diff --git a/lldb/source/Plugins/Process/minidump/ProcessMinidump.cpp b/lldb/source/Plugins/Process/minidump/ProcessMinidump.cpp
index 5b0df72..05b3bb9 100644
--- a/lldb/source/Plugins/Process/minidump/ProcessMinidump.cpp
+++ b/lldb/source/Plugins/Process/minidump/ProcessMinidump.cpp
@@ -354,6 +354,22 @@ DataExtractor ProcessMinidump::GetAuxvData() {
                        GetAddressByteSize(), GetAddressByteSize());
 }
 
+bool ProcessMinidump::IsLLDBMinidump() {
+  std::optional<llvm::ArrayRef<uint8_t>> lldb_generated_section =
+      m_minidump_parser->GetRawStream(StreamType::LLDBGenerated);
+  return lldb_generated_section.has_value();
+}
+
+DynamicLoader *ProcessMinidump::GetDynamicLoader() {
+  // This is a workaround for the dynamic loader not playing nice in issue
+  // #119598. The specific reason we use the dynamic loader is to get the TLS
+  // info sections, which we can assume are not being written to the minidump
+  // unless it's an LLDB generate minidump.
+  if (IsLLDBMinidump())
+    return PostMortemProcess::GetDynamicLoader();
+  return nullptr;
+}
+
 void ProcessMinidump::BuildMemoryRegions() {
   if (m_memory_regions)
     return;
diff --git a/lldb/source/Plugins/Process/minidump/ProcessMinidump.h b/lldb/source/Plugins/Process/minidump/ProcessMinidump.h
index 3d23567..ad8d0ed 100644
--- a/lldb/source/Plugins/Process/minidump/ProcessMinidump.h
+++ b/lldb/source/Plugins/Process/minidump/ProcessMinidump.h
@@ -53,6 +53,8 @@ public:
 
   Status DoLoadCore() override;
 
+  DynamicLoader *GetDynamicLoader() override;
+
   // Returns AUXV structure found in the core file
   lldb_private::DataExtractor GetAuxvData() override;
 
@@ -74,8 +76,8 @@ public:
 
   ArchSpec GetArchitecture();
 
-  Status GetMemoryRegions(
-      lldb_private::MemoryRegionInfos &region_list) override;
+  Status
+  GetMemoryRegions(lldb_private::MemoryRegionInfos &region_list) override;
 
   bool GetProcessInfo(ProcessInstanceInfo &info) override;
 
@@ -113,6 +115,7 @@ private:
   std::optional<MemoryRegionInfos> m_memory_regions;
 
   void BuildMemoryRegions();
+  bool IsLLDBMinidump();
 };
 
 } // namespace minidump
diff --git a/lldb/source/Plugins/ScriptInterpreter/Python/SWIGPythonBridge.h b/lldb/source/Plugins/ScriptInterpreter/Python/SWIGPythonBridge.h
index 518a478..0f0e4a5 100644
--- a/lldb/source/Plugins/ScriptInterpreter/Python/SWIGPythonBridge.h
+++ b/lldb/source/Plugins/ScriptInterpreter/Python/SWIGPythonBridge.h
@@ -84,6 +84,7 @@ public:
   static PythonObject ToSWIGWrapper(lldb::ValueObjectSP value_sp);
   static PythonObject ToSWIGWrapper(lldb::TargetSP target_sp);
   static PythonObject ToSWIGWrapper(lldb::ProcessSP process_sp);
+  static PythonObject ToSWIGWrapper(lldb::ModuleSP module_sp);
   static PythonObject ToSWIGWrapper(lldb::ThreadPlanSP thread_plan_sp);
   static PythonObject ToSWIGWrapper(lldb::BreakpointSP breakpoint_sp);
   static PythonObject ToSWIGWrapper(Status &&status);
diff --git a/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.cpp b/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.cpp
index 28e7cce..e2f76e8 100644
--- a/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.cpp
+++ b/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.cpp
@@ -1222,11 +1222,9 @@ DWARFASTParserClang::ParseSubroutine(const DWARFDIE &die,
   }
 
   if (die.HasChildren()) {
-    bool skip_artificial = true;
-    ParseChildParameters(containing_decl_ctx, die, skip_artificial, is_static,
-                         is_variadic, has_template_params,
-                         function_param_types, function_param_decls,
-                         type_quals);
+    ParseChildParameters(containing_decl_ctx, die, is_static, is_variadic,
+                         has_template_params, function_param_types,
+                         function_param_decls, type_quals);
   }
 
   bool ignore_containing_context = false;
@@ -2325,7 +2323,7 @@ DWARFASTParserClang::ConstructDemangledNameFromDWARF(const DWARFDIE &die) {
 
   clang::DeclContext *containing_decl_ctx =
       GetClangDeclContextContainingDIE(die, nullptr);
-  ParseChildParameters(containing_decl_ctx, die, true, is_static, is_variadic,
+  ParseChildParameters(containing_decl_ctx, die, is_static, is_variadic,
                        has_template_params, param_types, param_decls,
                        type_quals);
   sstr << "(";
@@ -3069,8 +3067,8 @@ bool DWARFASTParserClang::ParseChildMembers(
 
 size_t DWARFASTParserClang::ParseChildParameters(
     clang::DeclContext *containing_decl_ctx, const DWARFDIE &parent_die,
-    bool skip_artificial, bool &is_static, bool &is_variadic,
-    bool &has_template_params, std::vector<CompilerType> &function_param_types,
+    bool &is_static, bool &is_variadic, bool &has_template_params,
+    std::vector<CompilerType> &function_param_types,
     std::vector<clang::ParmVarDecl *> &function_param_decls,
     unsigned &type_quals) {
   if (!parent_die)
@@ -3125,7 +3123,7 @@ size_t DWARFASTParserClang::ParseChildParameters(
       }
 
       bool skip = false;
-      if (skip_artificial && is_artificial) {
+      if (is_artificial) {
         // In order to determine if a C++ member function is "const" we
         // have to look at the const-ness of "this"...
         if (arg_idx == 0 &&
diff --git a/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.h b/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.h
index 55f8e38..5b1c204 100644
--- a/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.h
+++ b/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.h
@@ -189,7 +189,7 @@ protected:
   size_t
   ParseChildParameters(clang::DeclContext *containing_decl_ctx,
                        const lldb_private::plugin::dwarf::DWARFDIE &parent_die,
-                       bool skip_artificial, bool &is_static, bool &is_variadic,
+                       bool &is_static, bool &is_variadic,
                        bool &has_template_params,
                        std::vector<lldb_private::CompilerType> &function_args,
                        std::vector<clang::ParmVarDecl *> &function_param_decls,
diff --git a/lldb/test/API/commands/register/register/register_command/TestRegisters.py b/lldb/test/API/commands/register/register/register_command/TestRegisters.py
index 0b80a09..99290e0 100644
--- a/lldb/test/API/commands/register/register/register_command/TestRegisters.py
+++ b/lldb/test/API/commands/register/register/register_command/TestRegisters.py
@@ -58,6 +58,13 @@ class RegisterCommandsTestCase(TestBase):
             # could not be read.  This is expected.
             error_str_matched = True
 
+        if self.getArchitecture() == "x86_64" and self.platformIsDarwin():
+            # debugserver on x86 will provide ds/es/ss/gsbase when the
+            # kernel provides them, but most of the time they will be
+            # unavailable.  So "register read -a" will report that
+            # 4 registers were unavailable, it is expected.
+            error_str_matched = True
+
         self.expect(
             "register read -a",
             MISSING_EXPECTED_REGISTERS,
diff --git a/lldb/unittests/Core/ProgressReportTest.cpp b/lldb/unittests/Core/ProgressReportTest.cpp
index 20324e9..0943d7b 100644
--- a/lldb/unittests/Core/ProgressReportTest.cpp
+++ b/lldb/unittests/Core/ProgressReportTest.cpp
@@ -425,3 +425,104 @@ TEST_F(ProgressReportTest, TestProgressManagerDisjointReports) {
 
   ASSERT_FALSE(listener_sp->GetEvent(event_sp, TIMEOUT));
 }
+
+TEST_F(ProgressReportTest, TestExternalReportCreation) {
+  ListenerSP listener_sp =
+      CreateListenerFor(lldb::eBroadcastBitExternalProgress);
+  EventSP event_sp;
+  const ProgressEventData *data;
+
+  // Scope this for RAII on the progress objects.
+  // Create progress reports and check that their respective events for having
+  // started and ended are broadcasted.
+  {
+    Progress progress1("Progress report 1", "Starting report 1",
+                       /*total=*/std::nullopt, /*debugger=*/nullptr,
+                       /*minimum_report_time=*/std::chrono::seconds(0),
+                       Progress::Origin::eExternal);
+    Progress progress2("Progress report 2", "Starting report 2",
+                       /*total=*/std::nullopt, /*debugger=*/nullptr,
+                       /*minimum_report_time=*/std::chrono::seconds(0),
+                       Progress::Origin::eExternal);
+    Progress progress3("Progress report 3", "Starting report 3",
+                       /*total=*/std::nullopt, /*debugger=*/nullptr,
+                       /*minimum_report_time=*/std::chrono::seconds(0),
+                       Progress::Origin::eExternal);
+  }
+
+  // Start popping events from the queue, they should have been recevied
+  // in this order:
+  // Starting progress: 1, 2, 3
+  // Ending progress: 3, 2, 1
+  ASSERT_TRUE(listener_sp->GetEvent(event_sp, TIMEOUT));
+  data = ProgressEventData::GetEventDataFromEvent(event_sp.get());
+
+  EXPECT_EQ(data->GetDetails(), "Starting report 1");
+  EXPECT_FALSE(data->IsFinite());
+  EXPECT_FALSE(data->GetCompleted());
+  EXPECT_EQ(data->GetTotal(), Progress::kNonDeterministicTotal);
+  EXPECT_EQ(data->GetMessage(), "Progress report 1: Starting report 1");
+
+  ASSERT_TRUE(listener_sp->GetEvent(event_sp, TIMEOUT));
+  data = ProgressEventData::GetEventDataFromEvent(event_sp.get());
+
+  EXPECT_EQ(data->GetDetails(), "Starting report 2");
+  EXPECT_FALSE(data->IsFinite());
+  EXPECT_FALSE(data->GetCompleted());
+  EXPECT_EQ(data->GetTotal(), Progress::kNonDeterministicTotal);
+  EXPECT_EQ(data->GetMessage(), "Progress report 2: Starting report 2");
+
+  ASSERT_TRUE(listener_sp->GetEvent(event_sp, TIMEOUT));
+  data = ProgressEventData::GetEventDataFromEvent(event_sp.get());
+
+  EXPECT_EQ(data->GetDetails(), "Starting report 3");
+  EXPECT_FALSE(data->IsFinite());
+  EXPECT_FALSE(data->GetCompleted());
+  EXPECT_EQ(data->GetTotal(), Progress::kNonDeterministicTotal);
+  EXPECT_EQ(data->GetMessage(), "Progress report 3: Starting report 3");
+
+  // Progress report objects should be destroyed at this point so
+  // get each report from the queue and check that they've been
+  // destroyed in reverse order.
+  ASSERT_TRUE(listener_sp->GetEvent(event_sp, TIMEOUT));
+  data = ProgressEventData::GetEventDataFromEvent(event_sp.get());
+
+  EXPECT_EQ(data->GetTitle(), "Progress report 3");
+  EXPECT_TRUE(data->GetCompleted());
+  EXPECT_FALSE(data->IsFinite());
+  EXPECT_EQ(data->GetMessage(), "Progress report 3: Starting report 3");
+
+  ASSERT_TRUE(listener_sp->GetEvent(event_sp, TIMEOUT));
+  data = ProgressEventData::GetEventDataFromEvent(event_sp.get());
+
+  EXPECT_EQ(data->GetTitle(), "Progress report 2");
+  EXPECT_TRUE(data->GetCompleted());
+  EXPECT_FALSE(data->IsFinite());
+  EXPECT_EQ(data->GetMessage(), "Progress report 2: Starting report 2");
+
+  ASSERT_TRUE(listener_sp->GetEvent(event_sp, TIMEOUT));
+  data = ProgressEventData::GetEventDataFromEvent(event_sp.get());
+
+  EXPECT_EQ(data->GetTitle(), "Progress report 1");
+  EXPECT_TRUE(data->GetCompleted());
+  EXPECT_FALSE(data->IsFinite());
+  EXPECT_EQ(data->GetMessage(), "Progress report 1: Starting report 1");
+}
+
+TEST_F(ProgressReportTest, TestExternalReportNotReceived) {
+  ListenerSP listener_sp = CreateListenerFor(lldb::eBroadcastBitProgress);
+  EventSP event_sp;
+
+  // Scope this for RAII on the progress objects.
+  // Create progress reports and check that their respective events for having
+  // started and ended are broadcasted.
+  {
+    Progress progress1("External Progress report 1",
+                       "Starting external report 1",
+                       /*total=*/std::nullopt, /*debugger=*/nullptr,
+                       /*minimum_report_time=*/std::chrono::seconds(0),
+                       Progress::Origin::eExternal);
+  }
+
+  ASSERT_FALSE(listener_sp->GetEvent(event_sp, TIMEOUT));
+}
diff --git a/lldb/unittests/DataFormatter/FormatterBytecodeTest.cpp b/lldb/unittests/DataFormatter/FormatterBytecodeTest.cpp
index 7307db6..5e980c3e19 100644
--- a/lldb/unittests/DataFormatter/FormatterBytecodeTest.cpp
+++ b/lldb/unittests/DataFormatter/FormatterBytecodeTest.cpp
@@ -98,6 +98,16 @@ TEST_F(FormatterBytecodeTest, ControlOps) {
     ASSERT_EQ(data.Pop<uint64_t>(), 42u);
   }
   {
+    DataStack data;
+    ASSERT_TRUE(Interpret({op_lit_uint, 1, op_begin, 3, op_lit_uint, 42,
+                           op_return, op_if, op_lit_uint, 23},
+                          data));
+    ASSERT_EQ(data.Pop<uint64_t>(), 42u);
+  }
+}
+
+TEST_F(FormatterBytecodeTest, ConversionOps) {
+  {
     DataStack data(lldb::ValueObjectSP{});
     ASSERT_TRUE(Interpret({op_is_null}, data));
     ASSERT_EQ(data.Pop<uint64_t>(), 1u);
diff --git a/llvm/Maintainers.md b/llvm/Maintainers.md
index 6d0fda1..2ccf30b 100644
--- a/llvm/Maintainers.md
+++ b/llvm/Maintainers.md
@@ -23,8 +23,10 @@ llvm@npopov.com, npopov@redhat.com (email), [nikic](https://github.com/nikic) (G
 
 #### AliasAnalysis
 
-Hal Finkel \
-hfinkel@anl.gov (email), [hfinkel](https://github.com/hfinkel) (GitHub)
+Nikita Popov \
+llvm@npopov.com, npopov@redhat.com (email), [nikic](https://github.com/nikic) (GitHub), nikic (Discourse) \
+Florian Hahn \
+flo@fhahn.com (email), [fhahn](https://github.com/fhahn) (GitHub)
 
 #### Attributor, OpenMPOpt
 
@@ -43,8 +45,12 @@ Matthew.Arsenault@amd.com, arsenm2@gmail.com (email), [arsenm](https://github.co
 
 #### Inlining
 
-Chandler Carruth \
-chandlerc@gmail.com, chandlerc@google.com (email), [chandlerc](https://github.com/chandlerc) (GitHub)
+Arthur Eubanks \
+aeubanks@google.com (email), [aeubanks](https://github.com/aeubanks) (GitHub) \
+Mircea Trofin (esp. ML inliner) \
+mtrofin@google.com (email), [mtrofin](https://github.com/mtrofin) (GitHub) \
+Kazu Hirata (esp. module inliner and inline order) \
+kazu@google.com (email), [kazutakahirata](https://github.com/kazutakahirata) (GitHub)
 
 #### InstCombine, InstSimplify, ValueTracking, ConstantFold
 
@@ -63,6 +69,11 @@ mail@justinbogner.com (email), [bogner](https://github.com/bogner) (GitHub)
 Diego Novillo \
 dnovillo@google.com (email), [dnovillo](https://github.com/dnovillo) (GitHub)
 
+#### New pass manager, CGSCC, LazyCallGraph
+
+Arthur Eubanks \
+aeubanks@google.com (email), [aeubanks](https://github.com/aeubanks) (GitHub)
+
 #### LoopStrengthReduce
 
 Quentin Colombet \
@@ -73,10 +84,15 @@ quentin.colombet@gmail.com (email), [qcolombet](https://github.com/qcolombet) (G
 Florian Hahn \
 flo@fhahn.com (email), [fhahn](https://github.com/fhahn) (GitHub)
 
+#### MemorySSA
+
+Alina Sbirlea \
+asbirlea@google.com (email), [alinas](https://github.com/alinas) (GitHub)
+
 #### SandboxVectorizer
 
 Vasileios Porpodas \
-vporpodas@google.com (email), [vporpo](https://github.com/vporpo) (GitHub)
+vporpodas@google.com (email), [vporpo](https://github.com/vporpo) (GitHub) \
 Jorge Gorbe Moya \
 jgorbe@google.com (email), [slackito](https://github.com/slackito) (GitHub)
 
@@ -233,11 +249,11 @@ anton@korobeynikov.info (email), [asl](https://github.com/asl) (GitHub)
 #### NVPTX backend
 
 Justin Holewinski \
-jholewinski@nvidia.com (email), [jholewinski](https://github.com/jholewinski) (GitHub)
+jholewinski@nvidia.com (email), [jholewinski](https://github.com/jholewinski) (GitHub) \
 Artem Belevich \
-tra@google.com (email), [Artem-B](https://github.com/Artem-B) (GitHub)
+tra@google.com (email), [Artem-B](https://github.com/Artem-B) (GitHub) \
 Alex MacLean \
-amaclean@nvidia.com (email), [AlexMaclean](https://github.com/AlexMaclean) (GitHub)
+amaclean@nvidia.com (email), [AlexMaclean](https://github.com/AlexMaclean) (GitHub) \
 Justin Fargnoli \
 jfargnoli@nvidia.com (email), [justinfargnoli](https://github.com/justinfargnoli) (GitHub)
 
@@ -249,7 +265,11 @@ czhengsz@cn.ibm.com (email), [chenzheng1030](https://github.com/chenzheng1030) (
 #### RISCV backend
 
 Alex Bradbury \
-asb@igalia.com (email), [asb](https://github.com/asb) (GitHub)
+asb@igalia.com (email), [asb](https://github.com/asb) (GitHub) \
+Craig Topper \
+craig.topper@sifive.com (email), [topperc](https://github.com/topperc) (GitHub) \
+Philip Reames \
+listmail@philipreames.com (email), [preames](https://github.com/preames) (GitHub)
 
 #### Sparc backend
 
@@ -312,16 +332,19 @@ peter@pcc.me.uk (email), [pcc](https://github.com/pcc) (GitHub)
 Chandler Carruth \
 chandlerc@gmail.com, chandlerc@google.com (email), [chandlerc](https://github.com/chandlerc) (GitHub)
 
-#### Debug info
+#### Debug info and DWARF
 
+Adrian Prantl \
+aprantl@apple.com (email), [adrian-prantl](https://github.com/adrian-prantl) (GitHub) \
+David Blaikie (especially type information) \
+dblaikie@gmail.com (email), [dwblaikie](https://github.com/dwblaike) (GitHub) \
+Jeremy Morse (especially variable information) \
+jeremy.morse@sony.com (email), [jmorse](https://github.com/jmorse) (GitHub) \
+Jonas Devlieghere (especially dsymutil/DWARFLinker) \
+jonas@devlieghere.com (email), [JDevlieghere](https://github.com/JDevlieghere) (GitHub) \
 Eric Christopher \
 echristo@gmail.com (email), [echristo](https://github.com/echristo) (GitHub)
 
-#### DWARF Parser
-
-Benjamin Kramer \
-benny.kra@gmail.com (email), [d0k](https://github.com/d0k) (GitHub)
-
 #### IR Linker and LTO
 
 Teresa Johnson \
@@ -335,7 +358,7 @@ lhames@gmail.com (email), [lhames](https://github.com/lhames) (GitHub)
 #### SandboxIR
 
 Vasileios Porpodas \
-vporpodas@google.com (email), [vporpo](https://github.com/vporpo) (GitHub)
+vporpodas@google.com (email), [vporpo](https://github.com/vporpo) (GitHub) \
 Jorge Gorbe Moya \
 jgorbe@google.com (email), [slackito](https://github.com/slackito) (GitHub)
 
@@ -410,7 +433,6 @@ gkistanova@gmail.com (email), [gkistanova](https://github.com/gkistanova) (GitHu
 ### Other subprojects
 
 Some subprojects maintain their own list of per-component maintainers.
-Others only have a lead maintainer listed here.
 
 [Bolt maintainers](https://github.com/llvm/llvm-project/blob/main/bolt/Maintainers.txt)
 
@@ -424,6 +446,8 @@ Others only have a lead maintainer listed here.
 
 [libc++ maintainers](https://github.com/llvm/llvm-project/blob/main/libcxx/Maintainers.md)
 
+[libclc maintainers](https://github.com/llvm/llvm-project/blob/main/libclc/Maintainers.md)
+
 [LLD maintainers](https://github.com/llvm/llvm-project/blob/main/lld/Maintainers.md)
 
 [LLDB maintainers](https://github.com/llvm/llvm-project/blob/main/lldb/Maintainers.rst)
@@ -432,11 +456,6 @@ Others only have a lead maintainer listed here.
 
 [Polly maintainers](https://github.com/llvm/llvm-project/blob/main/polly/Maintainers.md)
 
-#### libclc
-
-Tom Stellard \
-tstellar@redhat.com (email), [tstellar](https://github.com/tstellar) (GitHub)
-
 ## Inactive Maintainers
 
 The following people have graciously spent time performing maintainer
@@ -452,14 +471,16 @@ sabre@nondot.org (email), [lattner](https://github.com/lattner) (GitHub), clattn
 
 Paul C. Anagnostopoulos (paul@windfall.com, [Paul-C-Anagnostopoulos](https://github.com/Paul-C-Anagnostopoulos)) -- TableGen \
 Justin Bogner (mail@justinbogner.com, [bogner](https://github.com/bogner)) -- SelectionDAG \
-Chandler Carruth (chandlerc@gmail.com, chandlerc@google.com, [chandlerc](https://github.com/chandlerc)) -- ADT, Support \
+Chandler Carruth (chandlerc@gmail.com, chandlerc@google.com, [chandlerc](https://github.com/chandlerc)) -- ADT, Support, Inlining \
 Peter Collingbourne (peter@pcc.me.uk, [pcc](https://github.com/pcc)) -- LTO \
 Evan Cheng (evan.cheng@apple.com) -- Parts of code generator not covered by someone else \
 Jake Ehrlich (jakehehrlich@google.com, [jakehehrlich](https://github.com/jakehehrlich)) -- llvm-objcopy and ObjCopy library \
+Hal Finkel (hfinkel@anl.gov, [hfinkel](https://github.com/hfinkel) -- AliasAnalysis \
 Renato Golin (rengolin@systemcall.eu, [rengolin](https://github.com/rengolin)) -- ARM backend \
 Venkatraman Govindaraju (venkatra@cs.wisc.edu, [vegovin](https://github.com/vegovin) -- Sparc backend \
 James Grosbach (grosbach@apple.com) -- MC layer \
 Anton Korobeynikov (anton@korobeynikov.info, [asl](https://github.com/asl)) -- ARM EABI \
+Benjamin Kramer (benny.kra@gmail.com, [d0k](https://github.com/d0k)) -- DWARF Parser \
 David Majnemer (david.majnemer@gmail.com, [majnemer](https://github.com/majnemer)) -- InstCombine, ConstantFold \
 Chad Rosier (mcrosier@codeaurora.org) -- FastISel \
 Hans Wennborg (hans@chromium.org, [zmodem](https://github.com/zmodem)) -- Release management \
diff --git a/llvm/cmake/modules/AddLLVM.cmake b/llvm/cmake/modules/AddLLVM.cmake
index 54a54db..e046e37 100644
--- a/llvm/cmake/modules/AddLLVM.cmake
+++ b/llvm/cmake/modules/AddLLVM.cmake
@@ -932,7 +932,7 @@ endfunction()
 
 macro(add_llvm_library name)
   cmake_parse_arguments(ARG
-    "SHARED;BUILDTREE_ONLY;MODULE;INSTALL_WITH_TOOLCHAIN"
+    "SHARED;BUILDTREE_ONLY;MODULE;INSTALL_WITH_TOOLCHAIN;NO_EXPORT"
     ""
     ""
     ${ARGN})
@@ -967,7 +967,11 @@ macro(add_llvm_library name)
         set(umbrella)
       endif()
 
-      get_target_export_arg(${name} LLVM export_to_llvmexports ${umbrella})
+      if(ARG_NO_EXPORT)
+        set(export_to_llvmexports)
+      else()
+        get_target_export_arg(${name} LLVM export_to_llvmexports ${umbrella})
+      endif()
       install(TARGETS ${name}
               ${export_to_llvmexports}
               LIBRARY DESTINATION lib${LLVM_LIBDIR_SUFFIX} COMPONENT ${name}
@@ -980,7 +984,9 @@ macro(add_llvm_library name)
                                  COMPONENT ${name})
       endif()
     endif()
-    set_property(GLOBAL APPEND PROPERTY LLVM_EXPORTS ${name})
+    if(NOT ARG_NO_EXPORT)
+      set_property(GLOBAL APPEND PROPERTY LLVM_EXPORTS ${name})
+    endif()
   endif()
 
   get_subproject_title(subproject_title)
diff --git a/llvm/cmake/modules/Findzstd.cmake b/llvm/cmake/modules/Findzstd.cmake
index 86b6d48..f6ca5d1 100644
--- a/llvm/cmake/modules/Findzstd.cmake
+++ b/llvm/cmake/modules/Findzstd.cmake
@@ -10,7 +10,7 @@
 # zstd::libzstd_shared
 # zstd::libzstd_static
 
-if(MSVC)
+if(MSVC OR "${CMAKE_CXX_SIMULATE_ID}" STREQUAL "MSVC")
   set(zstd_STATIC_LIBRARY_SUFFIX "_static\\${CMAKE_STATIC_LIBRARY_SUFFIX}$")
 else()
   set(zstd_STATIC_LIBRARY_SUFFIX "\\${CMAKE_STATIC_LIBRARY_SUFFIX}$")
@@ -33,7 +33,7 @@ if(zstd_FOUND)
     set(zstd_STATIC_LIBRARY "${zstd_LIBRARY}")
   elseif (NOT TARGET zstd::libzstd_shared)
     add_library(zstd::libzstd_shared SHARED IMPORTED)
-    if(MSVC)
+    if(MSVC OR "${CMAKE_CXX_SIMULATE_ID}" STREQUAL "MSVC")
       include(GNUInstallDirs) # For CMAKE_INSTALL_LIBDIR and friends.
       # IMPORTED_LOCATION is the path to the DLL and IMPORTED_IMPLIB is the "library".
       get_filename_component(zstd_DIRNAME "${zstd_LIBRARY}" DIRECTORY)
diff --git a/llvm/docs/CommandGuide/llvm-exegesis.rst b/llvm/docs/CommandGuide/llvm-exegesis.rst
index c268153..8266d89 100644
--- a/llvm/docs/CommandGuide/llvm-exegesis.rst
+++ b/llvm/docs/CommandGuide/llvm-exegesis.rst
@@ -473,6 +473,14 @@ OPTIONS
    flag can be specified multiple times to measure multiple events. The maximum
    number of validation counters is platform dependent.
 
+.. option:: --benchmark-process-cpu=<cpu id>
+
+  This option specifies the number of the CPU that should be used to run the
+  benchmarking subprocess. When starting the subprocess,
+  :program:`llvm-exegesis` will set the affinity of the subprocess to only
+  include the specified CPU. This option only works in the subprocess execution
+  mode.
+
 EXIT STATUS
 -----------
 
diff --git a/llvm/docs/CommandGuide/llvm-objcopy.rst b/llvm/docs/CommandGuide/llvm-objcopy.rst
index e6af47c..be4876c 100644
--- a/llvm/docs/CommandGuide/llvm-objcopy.rst
+++ b/llvm/docs/CommandGuide/llvm-objcopy.rst
@@ -78,10 +78,47 @@ multiple file formats.
  Enable deterministic mode when copying archives, i.e. use 0 for archive member
  header UIDs, GIDs and timestamp fields. On by default.
 
+.. option:: --globalize-symbol <symbol>
+
+ Mark any defined symbols named ``<symbol>`` as global symbols in the output.
+ Can be specified multiple times to mark multiple symbols.
+
+.. option:: --globalize-symbols <filename>
+
+ Read a list of names from the file ``<filename>`` and mark defined symbols with
+ those names as global in the output. In the file, each line represents a single
+ symbol, with leading and trailing whitespace ignored, as is anything following
+ a '#'. Can be specified multiple times to read names from multiple files.
+
 .. option:: --help, -h
 
  Print a summary of command line options.
 
+.. option:: --keep-global-symbol <symbol>, -G
+
+ Mark all symbols local in the output, except for symbols with the name
+ ``<symbol>``. Can be specified multiple times to ignore multiple symbols.
+
+.. option:: --keep-global-symbols <filename>
+
+ Mark all symbols local in the output, except for symbols named in the file
+ ``<filename>``. In the file, each line represents a single symbol, with leading
+ and trailing whitespace ignored, as is anything following a '#'. Can be
+ specified multiple times to read names from multiple files.
+
+.. option:: --localize-symbol <symbol>, -L
+
+ Mark any defined non-common symbol named ``<symbol>`` as a local symbol in the
+ output. Can be specified multiple times to mark multiple symbols as local.
+
+.. option:: --localize-symbols <filename>
+
+ Read a list of names from the file ``<filename>`` and mark defined non-common
+ symbols with those names as local in the output. In the file, each line
+ represents a single symbol, with leading and trailing whitespace ignored, as is
+ anything following a '#'. Can be specified multiple times to read names from
+ multiple files.
+
 .. option:: --only-keep-debug
 
  Produce a debug file as the output that only preserves contents of sections
@@ -177,6 +214,19 @@ multiple file formats.
    flags.
  - `share` = add the `IMAGE_SCN_MEM_SHARED` and `IMAGE_SCN_MEM_READ` flags.
 
+.. option:: --skip-symbol <symbol>
+
+ Do not change the parameters of symbol ``<symbol>`` when executing other
+ options that can change the symbol's name, binding or visibility.
+
+.. option:: --skip-symbols <filename>
+
+ Do not change the parameters of symbols named in the file ``<filename>`` when
+ executing other options that can change the symbol's name, binding or
+ visibility. In the file, each line represents a single symbol, with leading
+ and trailing whitespace ignored, as is anything following a '#'.
+ Can be specified multiple times to read names from multiple files.
+
 .. option:: --strip-all-gnu
 
  Remove all symbols, debug sections and relocations from the output. This option
@@ -355,18 +405,6 @@ them.
  For binary outputs, fill the gaps between sections with ``<value>`` instead
  of zero. The value must be an unsigned 8-bit integer.
 
-.. option:: --globalize-symbol <symbol>
-
- Mark any defined symbols named ``<symbol>`` as global symbols in the output.
- Can be specified multiple times to mark multiple symbols.
-
-.. option:: --globalize-symbols <filename>
-
- Read a list of names from the file ``<filename>`` and mark defined symbols with
- those names as global in the output. In the file, each line represents a single
- symbol, with leading and trailing whitespace ignored, as is anything following
- a '#'. Can be specified multiple times to read names from multiple files.
-
 .. option:: --input-target <format>, -I
 
  Read the input as the specified format. See `SUPPORTED FORMATS`_ for a list of
@@ -377,18 +415,6 @@ them.
 
  Keep symbols of type `STT_FILE`, even if they would otherwise be stripped.
 
-.. option:: --keep-global-symbol <symbol>, -G
-
- Mark all symbols local in the output, except for symbols with the name
- ``<symbol>``. Can be specified multiple times to ignore multiple symbols.
-
-.. option:: --keep-global-symbols <filename>
-
- Mark all symbols local in the output, except for symbols named in the file
- ``<filename>``. In the file, each line represents a single symbol, with leading
- and trailing whitespace ignored, as is anything following a '#'. Can be
- specified multiple times to read names from multiple files.
-
 .. option:: --keep-section <section>
 
  When removing sections from the output, do not remove sections named
@@ -410,19 +436,6 @@ them.
 
  Mark all symbols with hidden or internal visibility local in the output.
 
-.. option:: --localize-symbol <symbol>, -L
-
- Mark any defined non-common symbol named ``<symbol>`` as a local symbol in the
- output. Can be specified multiple times to mark multiple symbols as local.
-
-.. option:: --localize-symbols <filename>
-
- Read a list of names from the file ``<filename>`` and mark defined non-common
- symbols with those names as local in the output. In the file, each line
- represents a single symbol, with leading and trailing whitespace ignored, as is
- anything following a '#'. Can be specified multiple times to read names from
- multiple files.
-
 .. option:: --new-symbol-visibility <visibility>
 
  Specify the visibility of the symbols automatically created when using binary
@@ -489,19 +502,6 @@ them.
  Read a list of symbols from <filename> and change their visibility to the
  specified value. Visibility values: default, internal, hidden, protected.
 
-.. option:: --skip-symbol <symbol>
-
- Do not change the parameters of symbol ``<symbol>`` when executing other
- options that can change the symbol's name, binding or visibility.
-
-.. option:: --skip-symbols <filename>
-
- Do not change the parameters of symbols named in the file ``<filename>`` when
- executing other options that can change the symbol's name, binding or
- visibility. In the file, each line represents a single symbol, with leading
- and trailing whitespace ignored, as is anything following a '#'.
- Can be specified multiple times to read names from multiple files.
-
 .. option:: --split-dwo <dwo-file>
 
  Equivalent to running :program:`llvm-objcopy` with :option:`--extract-dwo` and
diff --git a/llvm/docs/DirectX/DXILResources.rst b/llvm/docs/DirectX/DXILResources.rst
index 3971d37..857d29e 100644
--- a/llvm/docs/DirectX/DXILResources.rst
+++ b/llvm/docs/DirectX/DXILResources.rst
@@ -274,39 +274,87 @@ Examples:
                @llvm.dx.handle.fromHeap.tdx.RawBuffer_v4f32_1_0(
                    i32 2, i1 false)
 
-16-byte Loads, Samples, and Gathers
------------------------------------
-
-*relevant types: TypedBuffer, CBuffer, and Textures*
-
-TypedBuffer, CBuffer, and Texture loads, as well as samples and gathers, can
-return 1 to 4 elements from the given resource, to a maximum of 16 bytes of
-data. DXIL's modeling of this is influenced by DirectX and DXBC's history and
-it generally treats these operations as returning 4 32-bit values. For 16-bit
-elements the values are 16-bit values, and for 64-bit values the operations
-return 4 32-bit integers and emit further code to construct the double.
-
-In DXIL, these operations return `ResRet`_ and `CBufRet`_ values, are structs
-containing 4 elements of the same type, and in the case of `ResRet` a 5th
-element that is used by the `CheckAccessFullyMapped`_ operation.
-
-In LLVM IR the intrinsics will return the contained type of the resource
-instead. That is, ``llvm.dx.resource.load.typedbuffer`` from a
-``Buffer<float>`` would return a single float, from ``Buffer<float4>`` a vector
-of 4 floats, and from ``Buffer<double2>`` a vector of two doubles, etc. The
-operations are then expanded out to match DXIL's format during lowering.
-
-In cases where we need ``CheckAccessFullyMapped``, we have a second intrinsic
-that returns an anonymous struct with element-0 being the contained type, and
-element-1 being the ``i1`` result of a ``CheckAccessFullyMapped`` call. We
-don't have a separate call to ``CheckAccessFullyMapped`` at all, since that's
-the only operation that can possibly be done on this value. In practice this
-may mean we insert a DXIL operation for the check when this was missing in the
-HLSL source, but this actually matches DXC's behaviour in practice.
+Accessing Resources as Memory
+-----------------------------
+
+*relevant types: Buffers, CBuffer, and Textures*
+
+Loading and storing from resources is generally represented in LLVM using
+operations on memory that is only accessible via a handle object. Given a
+handle, `llvm.dx.resource.getpointer` gives a pointer that can be used to read
+and (depending on type) write to the resource.
+
+Accesses using `llvm.dx.resource.getpointer` are replaced with direct load and
+store operations in the `DXILResourceAccess` pass. These direct loads and
+stores are described later in this document.
+
+.. note:: Currently the pointers returned by `dx.resource.getpointer` are in
+          the default address space, but that will likely change in the future.
+
+.. list-table:: ``@llvm.dx.resource.getpointer``
+   :header-rows: 1
+
+   * - Argument
+     -
+     - Type
+     - Description
+   * - Return value
+     -
+     - Pointer
+     - A pointer to an object in the buffer
+   * - ``%buffer``
+     - 0
+     - ``target(dx.TypedBuffer, ...)``
+     - The buffer to access
+   * - ``%index``
+     - 1
+     - ``i32``
+     - Index into the buffer
+
+Examples:
+
+.. code-block:: llvm
+
+   %ptr = call ptr @llvm.dx.resource.getpointer.p0.tdx.TypedBuffer_v4f32_0_0_0t(
+       target("dx.TypedBuffer", <4 x float>, 0, 0, 0) %buffer, i32 %index)
+
+Loads, Samples, and Gathers
+---------------------------
+
+*relevant types: Buffers, CBuffers, and Textures*
+
+All load, sample, and gather operations in DXIL return a `ResRet`_ type, and
+CBuffer loads return a similar `CBufRet`_ type. These types are structs
+containing 4 elements of some basic type, and in the case of `ResRet` a 5th
+element that is used by the `CheckAccessFullyMapped`_ operation. Some of these
+operations, like `RawBufferLoad`_ include a mask and/or alignment that tell us
+some information about how to interpret those four values.
+
+In the LLVM IR representations of these operations we instead return scalars or
+vectors, but we keep the requirement that we only return up to 4 elements of a
+basic type. This avoids some unnecessary casting and structure manipulation in
+the intermediate format while also keeping lowering to DXIL straightforward.
+
+LLVM intrinsics that map to operations returning `ResRet` return an anonymous
+struct with element-0 being the scalar or vector type, and element-1 being the
+``i1`` result of a ``CheckAccessFullyMapped`` call. We don't have a separate
+call to ``CheckAccessFullyMapped`` at all, since that's the only operation that
+can possibly be done on this value. In practice this may mean we insert a DXIL
+operation for the check when this was missing in the HLSL source, but this
+actually matches DXC's behaviour in practice.
+
+For TypedBuffer and Texture, we map directly from the contained type of the
+resource to the return value of the intrinsic. Since these resources are
+constrained to contain only scalars and vectors of up to 4 elements, the
+lowering to DXIL ops is generally straightforward. The one exception we have
+here is that `double` types in the elements are special - these are allowed in
+the LLVM intrinsics, but are lowered to pairs of `i32` followed by
+``MakeDouble`` operations for DXIL.
 
 .. _ResRet: https://github.com/microsoft/DirectXShaderCompiler/blob/main/docs/DXIL.rst#resource-operation-return-types
 .. _CBufRet: https://github.com/microsoft/DirectXShaderCompiler/blob/main/docs/DXIL.rst#cbufferloadlegacy
 .. _CheckAccessFullyMapped: https://learn.microsoft.com/en-us/windows/win32/direct3dhlsl/checkaccessfullymapped
+.. _RawBufferLoad: https://github.com/microsoft/DirectXShaderCompiler/blob/main/docs/DXIL.rst#rawbufferload
 
 .. list-table:: ``@llvm.dx.resource.load.typedbuffer``
    :header-rows: 1
@@ -317,8 +365,8 @@ HLSL source, but this actually matches DXC's behaviour in practice.
      - Description
    * - Return value
      -
-     - The contained type of the buffer
-     - The data loaded from the buffer
+     - A structure of the contained type and the check bit
+     - The data loaded from the buffer and the check bit
    * - ``%buffer``
      - 0
      - ``target(dx.TypedBuffer, ...)``
@@ -332,23 +380,32 @@ Examples:
 
 .. code-block:: llvm
 
-   %ret = call <4 x float>
+   %ret = call {<4 x float>, i1}
        @llvm.dx.resource.load.typedbuffer.v4f32.tdx.TypedBuffer_v4f32_0_0_0t(
            target("dx.TypedBuffer", <4 x float>, 0, 0, 0) %buffer, i32 %index)
-   %ret = call float
+   %ret = call {float, i1}
        @llvm.dx.resource.load.typedbuffer.f32.tdx.TypedBuffer_f32_0_0_0t(
            target("dx.TypedBuffer", float, 0, 0, 0) %buffer, i32 %index)
-   %ret = call <4 x i32>
+   %ret = call {<4 x i32>, i1}
        @llvm.dx.resource.load.typedbuffer.v4i32.tdx.TypedBuffer_v4i32_0_0_0t(
            target("dx.TypedBuffer", <4 x i32>, 0, 0, 0) %buffer, i32 %index)
-   %ret = call <4 x half>
+   %ret = call {<4 x half>, i1}
        @llvm.dx.resource.load.typedbuffer.v4f16.tdx.TypedBuffer_v4f16_0_0_0t(
            target("dx.TypedBuffer", <4 x half>, 0, 0, 0) %buffer, i32 %index)
-   %ret = call <2 x double>
+   %ret = call {<2 x double>, i1}
        @llvm.dx.resource.load.typedbuffer.v2f64.tdx.TypedBuffer_v2f64_0_0t(
            target("dx.TypedBuffer", <2 x double>, 0, 0, 0) %buffer, i32 %index)
 
-.. list-table:: ``@llvm.dx.resource.loadchecked.typedbuffer``
+For RawBuffer, an HLSL load operation may return an arbitrarily sized result,
+but we still constrain the LLVM intrinsic to return only up to 4 elements of a
+basic type. This means that larger loads are represented as a series of loads,
+which matches DXIL. Unlike in the `RawBufferLoad`_ operation, we do not need
+arguments for the mask/type size and alignment, since we can calculate these
+from the return type of the load during lowering.
+
+.. _RawBufferLoad: https://github.com/microsoft/DirectXShaderCompiler/blob/main/docs/DXIL.rst#rawbufferload
+
+.. list-table:: ``@llvm.dx.resource.load.rawbuffer``
    :header-rows: 1
 
    * - Argument
@@ -357,22 +414,82 @@ Examples:
      - Description
    * - Return value
      -
-     - A structure of the contained type and the check bit
+     - A structure of a scalar or vector and the check bit
      - The data loaded from the buffer and the check bit
    * - ``%buffer``
      - 0
-     - ``target(dx.TypedBuffer, ...)``
+     - ``target(dx.RawBuffer, ...)``
      - The buffer to load from
    * - ``%index``
      - 1
      - ``i32``
      - Index into the buffer
+   * - ``%offset``
+     - 2
+     - ``i32``
+     - Offset into the structure at the given index
+
+Examples:
 
 .. code-block:: llvm
 
+   ; float
+   %ret = call {float, i1}
+       @llvm.dx.resource.load.rawbuffer.f32.tdx.RawBuffer_f32_0_0_0t(
+           target("dx.RawBuffer", float, 0, 0, 0) %buffer,
+           i32 %index,
+           i32 0)
+   %ret = call {float, i1}
+       @llvm.dx.resource.load.rawbuffer.f32.tdx.RawBuffer_i8_0_0_0t(
+           target("dx.RawBuffer", i8, 0, 0, 0) %buffer,
+           i32 %byte_offset,
+           i32 0)
+
+   ; float4
    %ret = call {<4 x float>, i1}
-       @llvm.dx.resource.loadchecked.typedbuffer.v4f32.tdx.TypedBuffer_v4f32_0_0_0t(
-           target("dx.TypedBuffer", <4 x float>, 0, 0, 0) %buffer, i32 %index)
+       @llvm.dx.resource.load.rawbuffer.v4f32.tdx.RawBuffer_v4f32_0_0_0t(
+           target("dx.RawBuffer", float, 0, 0, 0) %buffer,
+           i32 %index,
+           i32 0)
+   %ret = call {float, i1}
+       @llvm.dx.resource.load.rawbuffer.v4f32.tdx.RawBuffer_i8_0_0_0t(
+           target("dx.RawBuffer", i8, 0, 0, 0) %buffer,
+           i32 %byte_offset,
+           i32 0)
+
+   ; struct S0 { float4 f; int4 i; };
+   %ret = call {<4 x float>, i1}
+       @llvm.dx.resource.load.rawbuffer.v4f32.tdx.RawBuffer_sl_v4f32v4i32s_0_0t(
+           target("dx.RawBuffer", {<4 x float>, <4 x i32>}, 0, 0, 0) %buffer,
+           i32 %index,
+           i32 0)
+   %ret = call {<4 x i32>, i1}
+       @llvm.dx.resource.load.rawbuffer.v4i32.tdx.RawBuffer_sl_v4f32v4i32s_0_0t(
+           target("dx.RawBuffer", {<4 x float>, <4 x i32>}, 0, 0, 0) %buffer,
+           i32 %index,
+           i32 1)
+
+   ; struct Q { float4 f; int3 i; }
+   ; struct R { int z; S x; }
+   %ret = call {i32, i1}
+       @llvm.dx.resource.load.rawbuffer.i32(
+           target("dx.RawBuffer", {i32, {<4 x float>, <3 x i32>}}, 0, 0, 0)
+               %buffer, i32 %index, i32 0)
+   %ret = call {<4 x float>, i1}
+       @llvm.dx.resource.load.rawbuffer.i32(
+           target("dx.RawBuffer", {i32, {<4 x float>, <3 x i32>}}, 0, 0, 0)
+               %buffer, i32 %index, i32 4)
+   %ret = call {<3 x i32>, i1}
+       @llvm.dx.resource.load.rawbuffer.i32(
+           target("dx.RawBuffer", {i32, {<4 x float>, <3 x i32>}}, 0, 0, 0)
+               %buffer, i32 %index, i32 20)
+
+   ; byteaddressbuf.Load<int64_t4>
+   %ret = call {<4 x i64>, i1}
+       @llvm.dx.resource.load.rawbuffer.v4i64.tdx.RawBuffer_i8_0_0t(
+           target("dx.RawBuffer", i8, 0, 0, 0) %buffer,
+           i32 %byte_offset,
+           i32 0)
 
 Texture and Typed Buffer Stores
 -------------------------------
diff --git a/llvm/docs/GettingStarted.rst b/llvm/docs/GettingStarted.rst
index 7f4b62f..7f73b61e 100644
--- a/llvm/docs/GettingStarted.rst
+++ b/llvm/docs/GettingStarted.rst
@@ -233,12 +233,13 @@ Hardware
 
 LLVM is known to work on the following host platforms:
 
-================== ===================== =============
+================== ===================== ==============================
 OS                 Arch                  Compilers
-================== ===================== =============
+================== ===================== ==============================
 Linux              x86\ :sup:`1`         GCC, Clang
 Linux              amd64                 GCC, Clang
 Linux              ARM                   GCC, Clang
+Linux              AArch64               GCC, Clang
 Linux              Mips                  GCC, Clang
 Linux              PowerPC               GCC, Clang
 Linux              SystemZ               GCC, Clang
@@ -246,16 +247,19 @@ Solaris            V9 (Ultrasparc)       GCC
 DragonFlyBSD       amd64                 GCC, Clang
 FreeBSD            x86\ :sup:`1`         GCC, Clang
 FreeBSD            amd64                 GCC, Clang
+FreeBSD            AArch64               GCC, Clang
 NetBSD             x86\ :sup:`1`         GCC, Clang
 NetBSD             amd64                 GCC, Clang
 OpenBSD            x86\ :sup:`1`         GCC, Clang
 OpenBSD            amd64                 GCC, Clang
 macOS\ :sup:`2`    PowerPC               GCC
 macOS              x86                   GCC, Clang
+macOS              arm64                 Clang
 Cygwin/Win32       x86\ :sup:`1, 3`      GCC
 Windows            x86\ :sup:`1`         Visual Studio
-Windows x64        x86-64                Visual Studio
-================== ===================== =============
+Windows x64        x86-64                Visual Studio, Clang\ :sup:`4`
+Windows on Arm     ARM64                 Visual Studio, Clang\ :sup:`4`
+================== ===================== ==============================
 
 .. note::
 
@@ -263,6 +267,8 @@ Windows x64        x86-64                Visual Studio
   #. Code generation supported for 32-bit ABI only
   #. To use LLVM modules on Win32-based system, you may configure LLVM
      with ``-DBUILD_SHARED_LIBS=On``.
+  #. Visual Studio alone can compile LLVM. When using Clang, you
+     must also have Visual Studio installed.
 
 Note that Debug builds require a lot of time and disk space.  An LLVM-only build
 will need about 1-3 GB of space.  A full build of LLVM and Clang will need around
diff --git a/llvm/docs/ProgrammersManual.rst b/llvm/docs/ProgrammersManual.rst
index 98803dd..e2829eb 100644
--- a/llvm/docs/ProgrammersManual.rst
+++ b/llvm/docs/ProgrammersManual.rst
@@ -3358,15 +3358,15 @@ the ``PassManager.h`` system, and there is a more detailed introduction to it
 by Sean Parent in several of his talks and papers:
 
 #. `Inheritance Is The Base Class of Evil
-   <http://channel9.msdn.com/Events/GoingNative/2013/Inheritance-Is-The-Base-Class-of-Evil>`_
+   <https://learn.microsoft.com/en-us/shows/goingnative-2013/inheritance-base-class-of-evil>`_
    - The GoingNative 2013 talk describing this technique, and probably the best
    place to start.
 #. `Value Semantics and Concepts-based Polymorphism
    <http://www.youtube.com/watch?v=_BpMYeUFXv8>`_ - The C++Now! 2012 talk
    describing this technique in more detail.
 #. `Sean Parent's Papers and Presentations
-   <http://github.com/sean-parent/sean-parent.github.com/wiki/Papers-and-Presentations>`_
-   - A GitHub project full of links to slides, video, and sometimes code.
+   <https://sean-parent.stlab.cc/papers-and-presentations>`_
+   - Links to slides, videos, and sometimes code.
 
 When deciding between creating a type hierarchy (with either tagged or virtual
 dispatch) and using templates or concepts-based polymorphism, consider whether
diff --git a/llvm/docs/RISCVUsage.rst b/llvm/docs/RISCVUsage.rst
index f6a0dd4..0dc63f3 100644
--- a/llvm/docs/RISCVUsage.rst
+++ b/llvm/docs/RISCVUsage.rst
@@ -326,6 +326,9 @@ The primary goal of experimental support is to assist in the process of ratifica
 ``experimental-zvbc32e``, ``experimental-zvkgs``
   LLVM implements the `0.7 release specification <https://github.com/user-attachments/files/16450464/riscv-crypto-spec-vector-extra_v0.0.7.pdf>`__.
 
+``experimental-sdext``, ``experimental-sdtrig``
+  LLVM implements the `1.0-rc4 specification <https://github.com/riscv/riscv-debug-spec/releases/download/1.0.0-rc4/riscv-debug-specification.pdf>`__.
+
 ``experimental-smctr``, ``experimental-ssctr``
   LLVM implements the `1.0-rc3 specification <https://github.com/riscv/riscv-control-transfer-records/releases/tag/v1.0_rc3>`__.
 
@@ -429,6 +432,15 @@ The current vendor extensions supported are:
 ``experimental-Xqcia``
   LLVM implements `version 0.2 of the Qualcomm uC Arithmetic extension specification <https://github.com/quic/riscv-unified-db/releases/latest>`__ by Qualcomm.  All instructions are prefixed with `qc.` as described in the specification. These instructions are only available for riscv32.
 
+``experimental-Xqciac``
+  LLVM implements `version 0.2 of the Qualcomm uC Load-Store Address Calculation extension specification <https://github.com/quic/riscv-unified-db/releases/latest>`__ by Qualcomm.  All instructions are prefixed with `qc.` as described in the specification. These instructions are only available for riscv32.
+
+``experimental-Xqcicli``
+  LLVM implements `version 0.2 of the Qualcomm uC Conditional Load Immediate extension specification <https://github.com/quic/riscv-unified-db/releases/latest>`__ by Qualcomm.  All instructions are prefixed with `qc.` as described in the specification. These instructions are only available for riscv32.
+
+``experimental-Xqcicm``
+  LLVM implements `version 0.2 of the Qualcomm uC Conditional Move extension specification <https://github.com/quic/riscv-unified-db/releases/latest>`__ by Qualcomm.  All instructions are prefixed with `qc.` as described in the specification. These instructions are only available for riscv32.
+
 ``experimental-Xqcics``
   LLVM implements `version 0.2 of the Qualcomm uC Conditional Select extension specification <https://github.com/quic/riscv-unified-db/releases/latest>`__ by Qualcomm.  All instructions are prefixed with `qc.` as described in the specification. These instructions are only available for riscv32.
 
diff --git a/llvm/docs/ReleaseNotes.md b/llvm/docs/ReleaseNotes.md
index cc0e21d..3463dc8 100644
--- a/llvm/docs/ReleaseNotes.md
+++ b/llvm/docs/ReleaseNotes.md
@@ -131,6 +131,8 @@ Changes to the AArch64 Backend
 * Assembler/disassembler support has been added for Armv9.6-A (2024)
   architecture extensions.
 
+* Added support for the FUJITSU-MONAKA CPU.
+
 Changes to the AMDGPU Backend
 -----------------------------
 
@@ -196,6 +198,7 @@ Changes to the RISC-V Backend
 * `-mcpu=syntacore-scr7` was added.
 * `-mcpu=tt-ascalon-d8` was added.
 * `-mcpu=mips-p8700` was added.
+* `-mcpu=sifive-p550` was added.
 * The `Zacas` extension is no longer marked as experimental.
 * Added Smdbltrp, Ssdbltrp extensions to -march.
 * The `Smmpm`, `Smnpm`, `Ssnpm`, `Supm`, and `Sspm` pointer masking extensions
@@ -222,10 +225,17 @@ Changes to the RISC-V Backend
   extension.
 * Adds experimental assembler support for the Qualcomm uC 'Xqcia` (Arithmetic)
   extension.
+* Adds experimental assembler support for the Qualcomm uC 'Xqciac` (Load-Store Address Calculation)
+  extension.
 * Adds experimental assembler support for the Qualcomm uC 'Xqcics` (Conditonal Select)
   extension.
 * Adds experimental assembler support for the Qualcomm uC 'Xqcilsm` (Load Store Multiple)
   extension.
+* Adds experimental assembler support for the Qualcomm uC 'Xqcicli` (Conditional Load Immediate)
+  extension.
+* Adds experimental assembler support for the Qualcomm uC 'Xqcicm` (Conditonal Move)
+  extension.
+* Added ``Sdext`` and ``Sdtrig`` extensions.
 
 Changes to the WebAssembly Backend
 ----------------------------------
@@ -350,6 +360,12 @@ Changes to the Debug Info
 Changes to the LLVM tools
 ---------------------------------
 
+* llvm-objcopy now supports the following options for Mach-O:
+  `--globalize-symbol`, `--globalize-symbols`,
+  `--keep-global-symbol`, `--keep-global-symbols`,
+  `--localize-symbol`, `--localize-symbols`,
+  `--skip-symbol`, `--skip-symbols`.
+
 Changes to LLDB
 ---------------------------------
 
diff --git a/llvm/docs/TableGen/BackEnds.rst b/llvm/docs/TableGen/BackEnds.rst
index f73269e..94af2e4 100644
--- a/llvm/docs/TableGen/BackEnds.rst
+++ b/llvm/docs/TableGen/BackEnds.rst
@@ -1071,8 +1071,6 @@ function. This class provides three fields.
 
 * ``bit EarlyOut``. See the third example in `Generic Tables`_.
 
-* ``bit ReturnRange``. See the second example in `Generic Tables`_.
-
 Here is an example of a secondary key added to the ``CTable`` above. The
 generated function looks up entries based on the ``Name`` and ``Kind`` fields.
 
diff --git a/llvm/include/llvm/ADT/STLFunctionalExtras.h b/llvm/include/llvm/ADT/STLFunctionalExtras.h
index 3b9d409..a4d50dc 100644
--- a/llvm/include/llvm/ADT/STLFunctionalExtras.h
+++ b/llvm/include/llvm/ADT/STLFunctionalExtras.h
@@ -36,8 +36,8 @@ namespace llvm {
 /// a function_ref.
 template<typename Fn> class function_ref;
 
-template<typename Ret, typename ...Params>
-class function_ref<Ret(Params...)> {
+template <typename Ret, typename... Params>
+class LLVM_GSL_POINTER function_ref<Ret(Params...)> {
   Ret (*callback)(intptr_t callable, Params ...params) = nullptr;
   intptr_t callable;
 
diff --git a/llvm/include/llvm/Analysis/ValueTracking.h b/llvm/include/llvm/Analysis/ValueTracking.h
index 8aa024a..b4918c2 100644
--- a/llvm/include/llvm/Analysis/ValueTracking.h
+++ b/llvm/include/llvm/Analysis/ValueTracking.h
@@ -1102,6 +1102,13 @@ bool mustExecuteUBIfPoisonOnPathTo(Instruction *Root,
                                    Instruction *OnPathTo,
                                    DominatorTree *DT);
 
+/// Convert an integer comparison with a constant RHS into an equivalent
+/// form with the strictness flipped predicate. Return the new predicate and
+/// corresponding constant RHS if possible. Otherwise return std::nullopt.
+/// E.g., (icmp sgt X, 0) -> (icmp sle X, 1).
+std::optional<std::pair<CmpPredicate, Constant *>>
+getFlippedStrictnessPredicateAndConstant(CmpPredicate Pred, Constant *C);
+
 /// Specific patterns of select instructions we can match.
 enum SelectPatternFlavor {
   SPF_UNKNOWN = 0,
diff --git a/llvm/include/llvm/BinaryFormat/ELF.h b/llvm/include/llvm/BinaryFormat/ELF.h
index 8abacf1..1bc69f7 100644
--- a/llvm/include/llvm/BinaryFormat/ELF.h
+++ b/llvm/include/llvm/BinaryFormat/ELF.h
@@ -689,6 +689,9 @@ enum : unsigned {
 // ELF Relocation types for RISC-V
 enum {
 #include "ELFRelocs/RISCV.def"
+#define ELF_RISCV_NONSTANDARD_RELOC(_vendor, name, value) name = value,
+#include "ELFRelocs/RISCV_nonstandard.def"
+#undef ELF_RISCV_NONSTANDARD_RELOC
 };
 
 enum {
diff --git a/llvm/include/llvm/BinaryFormat/ELFRelocs/RISCV_nonstandard.def b/llvm/include/llvm/BinaryFormat/ELFRelocs/RISCV_nonstandard.def
new file mode 100644
index 0000000..7ae3d3f
--- /dev/null
+++ b/llvm/include/llvm/BinaryFormat/ELFRelocs/RISCV_nonstandard.def
@@ -0,0 +1,28 @@
+//===--- RISC-V Nonstandard Relocation List ---------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef ELF_RISCV_NONSTANDARD_RELOC
+#error "ELF_RISCV_NONSTANDARD_RELOC must be defined"
+#endif
+
+// ELF_RISCV_NONSTANDARD_RELOC(VENDOR, NAME, ID) defines information about
+// nonstandard relocation codes. This can be used when parsing relocations, or
+// when printing them, to provide better information.
+//
+// VENDOR should be the symbol name expected in the associated `R_RISCV_VENDOR`
+// relocation. NAME and ID work like `ELF_RELOC` but the mapping is not expected
+// to be 1:1.
+//
+// The mapping in RISCV.def is 1:1, and should be used when the only information
+// available is the relocation enum value.
+
+// Qualcomm Nonstandard Relocations
+ELF_RISCV_NONSTANDARD_RELOC(QUALCOMM, R_RISCV_QC_ABS20_U,    192)
+ELF_RISCV_NONSTANDARD_RELOC(QUALCOMM, R_RISCV_QC_E_BRANCH,   193)
+ELF_RISCV_NONSTANDARD_RELOC(QUALCOMM, R_RISCV_QC_E_32,       194)
+ELF_RISCV_NONSTANDARD_RELOC(QUALCOMM, R_RISCV_QC_E_JUMP_PLT, 195)
diff --git a/llvm/include/llvm/BinaryFormat/MinidumpConstants.def b/llvm/include/llvm/BinaryFormat/MinidumpConstants.def
index 5226da3..722a70f 100644
--- a/llvm/include/llvm/BinaryFormat/MinidumpConstants.def
+++ b/llvm/include/llvm/BinaryFormat/MinidumpConstants.def
@@ -85,6 +85,10 @@ HANDLE_MDMP_STREAM_TYPE(0xFACECCCC, FacebookAppStateLog)
 HANDLE_MDMP_STREAM_TYPE(0xFACEDEAD, FacebookAbortReason)
 HANDLE_MDMP_STREAM_TYPE(0xFACEE000, FacebookThreadName)
 
+// LLDB specific stream types
+// Ascii for 'LLDB'
+HANDLE_MDMP_STREAM_TYPE(0x4C4C4442, LLDBGenerated)
+
 HANDLE_MDMP_ARCH(0x0000, X86)       // PROCESSOR_ARCHITECTURE_INTEL
 HANDLE_MDMP_ARCH(0x0001, MIPS)      // PROCESSOR_ARCHITECTURE_MIPS
 HANDLE_MDMP_ARCH(0x0002, Alpha)     // PROCESSOR_ARCHITECTURE_ALPHA
diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
index ed4541f..c9f142d 100644
--- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -259,6 +259,33 @@ private:
     return AddrExtractCost + MemoryOpCost + PackingCost + ConditionalCost;
   }
 
+  /// Checks if the provided mask \p is a splat mask, i.e. it contains only -1
+  /// or same non -1 index value and this index value contained at least twice.
+  /// So, mask <0, -1,-1, -1> is not considered splat (it is just identity),
+  /// same for <-1, 0, -1, -1> (just a slide), while <2, -1, 2, -1> is a splat
+  /// with \p Index=2.
+  static bool isSplatMask(ArrayRef<int> Mask, unsigned NumSrcElts, int &Index) {
+    // Check that the broadcast index meets at least twice.
+    bool IsCompared = false;
+    if (int SplatIdx = PoisonMaskElem;
+        all_of(enumerate(Mask), [&](const auto &P) {
+          if (P.value() == PoisonMaskElem)
+            return P.index() != Mask.size() - 1 || IsCompared;
+          if (static_cast<unsigned>(P.value()) >= NumSrcElts * 2)
+            return false;
+          if (SplatIdx == PoisonMaskElem) {
+            SplatIdx = P.value();
+            return P.index() != Mask.size() - 1;
+          }
+          IsCompared = true;
+          return SplatIdx == P.value();
+        })) {
+      Index = SplatIdx;
+      return true;
+    }
+    return false;
+  }
+
 protected:
   explicit BasicTTIImplBase(const TargetMachine *TM, const DataLayout &DL)
       : BaseT(DL) {}
@@ -1014,17 +1041,20 @@ public:
       return Kind;
     int NumSrcElts = Ty->getElementCount().getKnownMinValue();
     switch (Kind) {
-    case TTI::SK_PermuteSingleSrc:
+    case TTI::SK_PermuteSingleSrc: {
       if (ShuffleVectorInst::isReverseMask(Mask, NumSrcElts))
         return TTI::SK_Reverse;
       if (ShuffleVectorInst::isZeroEltSplatMask(Mask, NumSrcElts))
         return TTI::SK_Broadcast;
+      if (isSplatMask(Mask, NumSrcElts, Index))
+        return TTI::SK_Broadcast;
       if (ShuffleVectorInst::isExtractSubvectorMask(Mask, NumSrcElts, Index) &&
           (Index + Mask.size()) <= (size_t)NumSrcElts) {
         SubTy = FixedVectorType::get(Ty->getElementType(), Mask.size());
         return TTI::SK_ExtractSubvector;
       }
       break;
+    }
     case TTI::SK_PermuteTwoSrc: {
       int NumSubElts;
       if (Mask.size() > 2 && ShuffleVectorInst::isInsertSubvectorMask(
diff --git a/llvm/include/llvm/CodeGen/ComplexDeinterleavingPass.h b/llvm/include/llvm/CodeGen/ComplexDeinterleavingPass.h
index 84a2673..4383249 100644
--- a/llvm/include/llvm/CodeGen/ComplexDeinterleavingPass.h
+++ b/llvm/include/llvm/CodeGen/ComplexDeinterleavingPass.h
@@ -35,6 +35,7 @@ public:
 enum class ComplexDeinterleavingOperation {
   CAdd,
   CMulPartial,
+  CDot,
   // The following 'operations' are used to represent internal states. Backends
   // are not expected to try and support these in any capacity.
   Deinterleave,
@@ -43,6 +44,7 @@ enum class ComplexDeinterleavingOperation {
   ReductionPHI,
   ReductionOperation,
   ReductionSelect,
+  ReductionSingle
 };
 
 enum class ComplexDeinterleavingRotation {
diff --git a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
index 871456d..94e36e4 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
@@ -864,6 +864,9 @@ public:
   /// Combine select to integer min/max.
   bool matchSelectIMinMax(const MachineOperand &MO, BuildFnTy &MatchInfo) const;
 
+  /// Tranform (neg (min/max x, (neg x))) into (max/min x, (neg x)).
+  bool matchSimplifyNegMinMax(MachineInstr &MI, BuildFnTy &MatchInfo) const;
+
   /// Combine selects.
   bool matchSelect(MachineInstr &MI, BuildFnTy &MatchInfo) const;
 
diff --git a/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h
index 2384b22..4e18f5c 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h
@@ -22,6 +22,7 @@
 
 #include "llvm/CodeGen/GlobalISel/CallLowering.h"
 #include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
+#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
 #include "llvm/CodeGen/RuntimeLibcallUtil.h"
 #include "llvm/CodeGen/TargetOpcodes.h"
 
@@ -297,6 +298,13 @@ public:
   MachineInstrBuilder createStackTemporary(TypeSize Bytes, Align Alignment,
                                            MachinePointerInfo &PtrInfo);
 
+  /// Create a store of \p Val to a stack temporary and return a load as the
+  /// same type as \p Res.
+  MachineInstrBuilder createStackStoreLoad(const DstOp &Res, const SrcOp &Val);
+
+  /// Given a store of a boolean vector, scalarize it.
+  LegalizeResult scalarizeVectorBooleanStore(GStore &MI);
+
   /// Get a pointer to vector element \p Index located in memory for a vector of
   /// type \p VecTy starting at a base address of \p VecPtr. If \p Index is out
   /// of bounds the returned pointer is unspecified, but will be within the
diff --git a/llvm/include/llvm/CodeGen/GlobalISel/MIPatternMatch.h b/llvm/include/llvm/CodeGen/GlobalISel/MIPatternMatch.h
index ea6ed32..78a92c8 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/MIPatternMatch.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/MIPatternMatch.h
@@ -338,7 +338,7 @@ template <> struct bind_helper<MachineInstr *> {
 };
 
 template <> struct bind_helper<LLT> {
-  static bool bind(const MachineRegisterInfo &MRI, LLT Ty, Register Reg) {
+  static bool bind(const MachineRegisterInfo &MRI, LLT &Ty, Register Reg) {
     Ty = MRI.getType(Reg);
     if (Ty.isValid())
       return true;
@@ -368,10 +368,40 @@ template <typename Class> struct bind_ty {
 
 inline bind_ty<Register> m_Reg(Register &R) { return R; }
 inline bind_ty<MachineInstr *> m_MInstr(MachineInstr *&MI) { return MI; }
-inline bind_ty<LLT> m_Type(LLT Ty) { return Ty; }
+inline bind_ty<LLT> m_Type(LLT &Ty) { return Ty; }
 inline bind_ty<CmpInst::Predicate> m_Pred(CmpInst::Predicate &P) { return P; }
 inline operand_type_match m_Pred() { return operand_type_match(); }
 
+template <typename BindTy> struct deferred_helper {
+  static bool match(const MachineRegisterInfo &MRI, BindTy &VR, BindTy &V) {
+    return VR == V;
+  }
+};
+
+template <> struct deferred_helper<LLT> {
+  static bool match(const MachineRegisterInfo &MRI, LLT VT, Register R) {
+    return VT == MRI.getType(R);
+  }
+};
+
+template <typename Class> struct deferred_ty {
+  Class &VR;
+
+  deferred_ty(Class &V) : VR(V) {}
+
+  template <typename ITy> bool match(const MachineRegisterInfo &MRI, ITy &&V) {
+    return deferred_helper<Class>::match(MRI, VR, V);
+  }
+};
+
+/// Similar to m_SpecificReg/Type, but the specific value to match originated
+/// from an earlier sub-pattern in the same mi_match expression. For example,
+/// we cannot match `(add X, X)` with `m_GAdd(m_Reg(X), m_SpecificReg(X))`
+/// because `X` is not initialized at the time it's passed to `m_SpecificReg`.
+/// Instead, we can use `m_GAdd(m_Reg(x), m_DeferredReg(X))`.
+inline deferred_ty<Register> m_DeferredReg(Register &R) { return R; }
+inline deferred_ty<LLT> m_DeferredType(LLT &Ty) { return Ty; }
+
 struct ImplicitDefMatch {
   bool match(const MachineRegisterInfo &MRI, Register Reg) {
     MachineInstr *TmpMI;
@@ -401,8 +431,13 @@ struct BinaryOp_match {
       if (TmpMI->getOpcode() == Opcode && TmpMI->getNumOperands() == 3) {
         return (L.match(MRI, TmpMI->getOperand(1).getReg()) &&
                 R.match(MRI, TmpMI->getOperand(2).getReg())) ||
-               (Commutable && (R.match(MRI, TmpMI->getOperand(1).getReg()) &&
-                               L.match(MRI, TmpMI->getOperand(2).getReg())));
+               // NOTE: When trying the alternative operand ordering
+               // with a commutative operation, it is imperative to always run
+               // the LHS sub-pattern  (i.e. `L`) before the RHS sub-pattern
+               // (i.e. `R`). Otherwsie, m_DeferredReg/Type will not work as
+               // expected.
+               (Commutable && (L.match(MRI, TmpMI->getOperand(2).getReg()) &&
+                               R.match(MRI, TmpMI->getOperand(1).getReg())));
       }
     }
     return false;
@@ -426,8 +461,13 @@ struct BinaryOpc_match {
           TmpMI->getNumOperands() == 3) {
         return (L.match(MRI, TmpMI->getOperand(1).getReg()) &&
                 R.match(MRI, TmpMI->getOperand(2).getReg())) ||
-               (Commutable && (R.match(MRI, TmpMI->getOperand(1).getReg()) &&
-                               L.match(MRI, TmpMI->getOperand(2).getReg())));
+               // NOTE: When trying the alternative operand ordering
+               // with a commutative operation, it is imperative to always run
+               // the LHS sub-pattern  (i.e. `L`) before the RHS sub-pattern
+               // (i.e. `R`). Otherwsie, m_DeferredReg/Type will not work as
+               // expected.
+               (Commutable && (L.match(MRI, TmpMI->getOperand(2).getReg()) &&
+                               R.match(MRI, TmpMI->getOperand(1).getReg())));
       }
     }
     return false;
@@ -538,15 +578,27 @@ m_GAShr(const LHS &L, const RHS &R) {
 }
 
 template <typename LHS, typename RHS>
-inline BinaryOp_match<LHS, RHS, TargetOpcode::G_SMAX, false>
+inline BinaryOp_match<LHS, RHS, TargetOpcode::G_SMAX, true>
 m_GSMax(const LHS &L, const RHS &R) {
-  return BinaryOp_match<LHS, RHS, TargetOpcode::G_SMAX, false>(L, R);
+  return BinaryOp_match<LHS, RHS, TargetOpcode::G_SMAX, true>(L, R);
 }
 
 template <typename LHS, typename RHS>
-inline BinaryOp_match<LHS, RHS, TargetOpcode::G_SMIN, false>
+inline BinaryOp_match<LHS, RHS, TargetOpcode::G_SMIN, true>
 m_GSMin(const LHS &L, const RHS &R) {
-  return BinaryOp_match<LHS, RHS, TargetOpcode::G_SMIN, false>(L, R);
+  return BinaryOp_match<LHS, RHS, TargetOpcode::G_SMIN, true>(L, R);
+}
+
+template <typename LHS, typename RHS>
+inline BinaryOp_match<LHS, RHS, TargetOpcode::G_UMAX, true>
+m_GUMax(const LHS &L, const RHS &R) {
+  return BinaryOp_match<LHS, RHS, TargetOpcode::G_UMAX, true>(L, R);
+}
+
+template <typename LHS, typename RHS>
+inline BinaryOp_match<LHS, RHS, TargetOpcode::G_UMIN, true>
+m_GUMin(const LHS &L, const RHS &R) {
+  return BinaryOp_match<LHS, RHS, TargetOpcode::G_UMIN, true>(L, R);
 }
 
 // Helper for unary instructions (G_[ZSA]EXT/G_TRUNC) etc
@@ -662,6 +714,10 @@ struct CompareOp_match {
     Register RHS = TmpMI->getOperand(3).getReg();
     if (L.match(MRI, LHS) && R.match(MRI, RHS))
       return true;
+    // NOTE: When trying the alternative operand ordering
+    // with a commutative operation, it is imperative to always run
+    // the LHS sub-pattern  (i.e. `L`) before the RHS sub-pattern
+    // (i.e. `R`). Otherwsie, m_DeferredReg/Type will not work as expected.
     if (Commutable && L.match(MRI, RHS) && R.match(MRI, LHS) &&
         P.match(MRI, CmpInst::getSwappedPredicate(TmpPred)))
       return true;
diff --git a/llvm/include/llvm/CodeGen/GlobalISel/Utils.h b/llvm/include/llvm/CodeGen/GlobalISel/Utils.h
index 3765363..a35ecae 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/Utils.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/Utils.h
@@ -171,6 +171,10 @@ void reportGISelWarning(MachineFunction &MF, const TargetPassConfig &TPC,
                         MachineOptimizationRemarkEmitter &MORE,
                         MachineOptimizationRemarkMissed &R);
 
+/// Returns the inverse opcode of \p MinMaxOpc, which is a generic min/max
+/// opcode like G_SMIN.
+unsigned getInverseGMinMaxOpcode(unsigned MinMaxOpc);
+
 /// If \p VReg is defined by a G_CONSTANT, return the corresponding value.
 std::optional<APInt> getIConstantVRegVal(Register VReg,
                                          const MachineRegisterInfo &MRI);
@@ -522,6 +526,13 @@ std::optional<APInt>
 isConstantOrConstantSplatVector(MachineInstr &MI,
                                 const MachineRegisterInfo &MRI);
 
+/// Determines if \p MI defines a float constant integer or a splat vector of
+/// float constant integers.
+/// \returns the float constant or std::nullopt.
+std::optional<APFloat>
+isConstantOrConstantSplatVectorFP(MachineInstr &MI,
+                                  const MachineRegisterInfo &MRI);
+
 /// Attempt to match a unary predicate against a scalar/splat constant or every
 /// element of a constant G_BUILD_VECTOR. If \p ConstVal is null, the source
 /// value was undef.
diff --git a/llvm/include/llvm/CodeGen/ISDOpcodes.h b/llvm/include/llvm/CodeGen/ISDOpcodes.h
index 69820ae..604dc94 100644
--- a/llvm/include/llvm/CodeGen/ISDOpcodes.h
+++ b/llvm/include/llvm/CodeGen/ISDOpcodes.h
@@ -1495,6 +1495,10 @@ inline bool isBitwiseLogicOp(unsigned Opcode) {
   return Opcode == ISD::AND || Opcode == ISD::OR || Opcode == ISD::XOR;
 }
 
+/// Given a \p MinMaxOpc of ISD::(U|S)MIN or ISD::(U|S)MAX, returns
+/// ISD::(U|S)MAX and ISD::(U|S)MIN, respectively.
+NodeType getInverseMinMaxOpcode(unsigned MinMaxOpc);
+
 /// Get underlying scalar opcode for VECREDUCE opcode.
 /// For example ISD::AND for ISD::VECREDUCE_AND.
 NodeType getVecReduceBaseOpcode(unsigned VecReduceOpcode);
diff --git a/llvm/include/llvm/CodeGen/LiveRegMatrix.h b/llvm/include/llvm/CodeGen/LiveRegMatrix.h
index 486392c..373f440 100644
--- a/llvm/include/llvm/CodeGen/LiveRegMatrix.h
+++ b/llvm/include/llvm/CodeGen/LiveRegMatrix.h
@@ -48,7 +48,7 @@ class LiveRegMatrix {
   unsigned UserTag = 0;
 
   // The matrix is represented as a LiveIntervalUnion per register unit.
-  LiveIntervalUnion::Allocator LIUAlloc;
+  std::unique_ptr<LiveIntervalUnion::Allocator> LIUAlloc;
   LiveIntervalUnion::Array Matrix;
 
   // Cached queries per register unit.
@@ -59,15 +59,12 @@ class LiveRegMatrix {
   unsigned RegMaskVirtReg = 0;
   BitVector RegMaskUsable;
 
-  LiveRegMatrix() = default;
+  LiveRegMatrix()
+      : LIUAlloc(std::make_unique<LiveIntervalUnion::Allocator>()) {};
   void releaseMemory();
 
 public:
-  LiveRegMatrix(LiveRegMatrix &&Other)
-      : TRI(Other.TRI), LIS(Other.LIS), VRM(Other.VRM), UserTag(Other.UserTag),
-        Matrix(std::move(Other.Matrix)), Queries(std::move(Other.Queries)),
-        RegMaskTag(Other.RegMaskTag), RegMaskVirtReg(Other.RegMaskVirtReg),
-        RegMaskUsable(std::move(Other.RegMaskUsable)) {}
+  LiveRegMatrix(LiveRegMatrix &&Other) = default;
 
   void init(MachineFunction &MF, LiveIntervals &LIS, VirtRegMap &VRM);
 
diff --git a/llvm/include/llvm/CodeGen/MachineMemOperand.h b/llvm/include/llvm/CodeGen/MachineMemOperand.h
index e2343ab..2caa3bd 100644
--- a/llvm/include/llvm/CodeGen/MachineMemOperand.h
+++ b/llvm/include/llvm/CodeGen/MachineMemOperand.h
@@ -152,8 +152,9 @@ public:
     MOTargetFlag1 = 1u << 6,
     MOTargetFlag2 = 1u << 7,
     MOTargetFlag3 = 1u << 8,
+    MOTargetFlag4 = 1u << 9,
 
-    LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ MOTargetFlag3)
+    LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ MOTargetFlag4)
   };
 
 private:
diff --git a/llvm/include/llvm/CodeGen/MachineRegisterInfo.h b/llvm/include/llvm/CodeGen/MachineRegisterInfo.h
index 5dc51aa..5ee3aef 100644
--- a/llvm/include/llvm/CodeGen/MachineRegisterInfo.h
+++ b/llvm/include/llvm/CodeGen/MachineRegisterInfo.h
@@ -1095,9 +1095,6 @@ public:
       return !operator==(x);
     }
 
-    /// atEnd - return true if this iterator is equal to reg_end() on the value.
-    bool atEnd() const { return Op == nullptr; }
-
     // Iterator traversal: forward iteration only
     defusechain_iterator &operator++() {          // Preincrement
       assert(Op && "Cannot increment end iterator!");
@@ -1203,9 +1200,6 @@ public:
       return !operator==(x);
     }
 
-    /// atEnd - return true if this iterator is equal to reg_end() on the value.
-    bool atEnd() const { return Op == nullptr; }
-
     // Iterator traversal: forward iteration only
     defusechain_instr_iterator &operator++() {          // Preincrement
       assert(Op && "Cannot increment end iterator!");
diff --git a/llvm/include/llvm/CodeGen/ReachingDefAnalysis.h b/llvm/include/llvm/CodeGen/ReachingDefAnalysis.h
index d6a1f06..0c1e707 100644
--- a/llvm/include/llvm/CodeGen/ReachingDefAnalysis.h
+++ b/llvm/include/llvm/CodeGen/ReachingDefAnalysis.h
@@ -176,26 +176,25 @@ public:
   void traverse();
 
   /// Provides the instruction id of the closest reaching def instruction of
-  /// PhysReg that reaches MI, relative to the begining of MI's basic block.
-  int getReachingDef(MachineInstr *MI, MCRegister PhysReg) const;
+  /// Reg that reaches MI, relative to the begining of MI's basic block.
+  int getReachingDef(MachineInstr *MI, MCRegister Reg) const;
 
-  /// Return whether A and B use the same def of PhysReg.
+  /// Return whether A and B use the same def of Reg.
   bool hasSameReachingDef(MachineInstr *A, MachineInstr *B,
-                          MCRegister PhysReg) const;
+                          MCRegister Reg) const;
 
   /// Return whether the reaching def for MI also is live out of its parent
   /// block.
-  bool isReachingDefLiveOut(MachineInstr *MI, MCRegister PhysReg) const;
+  bool isReachingDefLiveOut(MachineInstr *MI, MCRegister Reg) const;
 
-  /// Return the local MI that produces the live out value for PhysReg, or
+  /// Return the local MI that produces the live out value for Reg, or
   /// nullptr for a non-live out or non-local def.
   MachineInstr *getLocalLiveOutMIDef(MachineBasicBlock *MBB,
-                                     MCRegister PhysReg) const;
+                                     MCRegister Reg) const;
 
   /// If a single MachineInstr creates the reaching definition, then return it.
   /// Otherwise return null.
-  MachineInstr *getUniqueReachingMIDef(MachineInstr *MI,
-                                       MCRegister PhysReg) const;
+  MachineInstr *getUniqueReachingMIDef(MachineInstr *MI, MCRegister Reg) const;
 
   /// If a single MachineInstr creates the reaching definition, for MIs operand
   /// at Idx, then return it. Otherwise return null.
@@ -207,44 +206,43 @@ public:
 
   /// Provide whether the register has been defined in the same basic block as,
   /// and before, MI.
-  bool hasLocalDefBefore(MachineInstr *MI, MCRegister PhysReg) const;
+  bool hasLocalDefBefore(MachineInstr *MI, MCRegister Reg) const;
 
   /// Return whether the given register is used after MI, whether it's a local
   /// use or a live out.
-  bool isRegUsedAfter(MachineInstr *MI, MCRegister PhysReg) const;
+  bool isRegUsedAfter(MachineInstr *MI, MCRegister Reg) const;
 
   /// Return whether the given register is defined after MI.
-  bool isRegDefinedAfter(MachineInstr *MI, MCRegister PhysReg) const;
+  bool isRegDefinedAfter(MachineInstr *MI, MCRegister Reg) const;
 
   /// Provides the clearance - the number of instructions since the closest
-  /// reaching def instuction of PhysReg that reaches MI.
-  int getClearance(MachineInstr *MI, MCRegister PhysReg) const;
+  /// reaching def instuction of Reg that reaches MI.
+  int getClearance(MachineInstr *MI, MCRegister Reg) const;
 
   /// Provides the uses, in the same block as MI, of register that MI defines.
   /// This does not consider live-outs.
-  void getReachingLocalUses(MachineInstr *MI, MCRegister PhysReg,
+  void getReachingLocalUses(MachineInstr *MI, MCRegister Reg,
                             InstSet &Uses) const;
 
-  /// Search MBB for a definition of PhysReg and insert it into Defs. If no
+  /// Search MBB for a definition of Reg and insert it into Defs. If no
   /// definition is found, recursively search the predecessor blocks for them.
-  void getLiveOuts(MachineBasicBlock *MBB, MCRegister PhysReg, InstSet &Defs,
+  void getLiveOuts(MachineBasicBlock *MBB, MCRegister Reg, InstSet &Defs,
                    BlockSet &VisitedBBs) const;
-  void getLiveOuts(MachineBasicBlock *MBB, MCRegister PhysReg,
-                   InstSet &Defs) const;
+  void getLiveOuts(MachineBasicBlock *MBB, MCRegister Reg, InstSet &Defs) const;
 
   /// For the given block, collect the instructions that use the live-in
   /// value of the provided register. Return whether the value is still
   /// live on exit.
-  bool getLiveInUses(MachineBasicBlock *MBB, MCRegister PhysReg,
+  bool getLiveInUses(MachineBasicBlock *MBB, MCRegister Reg,
                      InstSet &Uses) const;
 
-  /// Collect the users of the value stored in PhysReg, which is defined
+  /// Collect the users of the value stored in Reg, which is defined
   /// by MI.
-  void getGlobalUses(MachineInstr *MI, MCRegister PhysReg, InstSet &Uses) const;
+  void getGlobalUses(MachineInstr *MI, MCRegister Reg, InstSet &Uses) const;
 
-  /// Collect all possible definitions of the value stored in PhysReg, which is
+  /// Collect all possible definitions of the value stored in Reg, which is
   /// used by MI.
-  void getGlobalReachingDefs(MachineInstr *MI, MCRegister PhysReg,
+  void getGlobalReachingDefs(MachineInstr *MI, MCRegister Reg,
                              InstSet &Defs) const;
 
   /// Return whether From can be moved forwards to just before To.
@@ -269,12 +267,12 @@ public:
 
   /// Return whether a MachineInstr could be inserted at MI and safely define
   /// the given register without affecting the program.
-  bool isSafeToDefRegAt(MachineInstr *MI, MCRegister PhysReg) const;
+  bool isSafeToDefRegAt(MachineInstr *MI, MCRegister Reg) const;
 
   /// Return whether a MachineInstr could be inserted at MI and safely define
   /// the given register without affecting the program, ignoring any effects
   /// on the provided instructions.
-  bool isSafeToDefRegAt(MachineInstr *MI, MCRegister PhysReg,
+  bool isSafeToDefRegAt(MachineInstr *MI, MCRegister Reg,
                         InstSet &Ignore) const;
 
 private:
@@ -309,9 +307,8 @@ private:
   MachineInstr *getInstFromId(MachineBasicBlock *MBB, int InstId) const;
 
   /// Provides the instruction of the closest reaching def instruction of
-  /// PhysReg that reaches MI, relative to the begining of MI's basic block.
-  MachineInstr *getReachingLocalMIDef(MachineInstr *MI,
-                                      MCRegister PhysReg) const;
+  /// Reg that reaches MI, relative to the begining of MI's basic block.
+  MachineInstr *getReachingLocalMIDef(MachineInstr *MI, MCRegister Reg) const;
 };
 
 } // namespace llvm
diff --git a/llvm/include/llvm/CodeGen/SDPatternMatch.h b/llvm/include/llvm/CodeGen/SDPatternMatch.h
index d21cc96..fc8ef71 100644
--- a/llvm/include/llvm/CodeGen/SDPatternMatch.h
+++ b/llvm/include/llvm/CodeGen/SDPatternMatch.h
@@ -547,6 +547,39 @@ struct BinaryOpc_match {
   }
 };
 
+/// Matching while capturing mask
+template <typename T0, typename T1, typename T2> struct SDShuffle_match {
+  T0 Op1;
+  T1 Op2;
+  T2 Mask;
+
+  SDShuffle_match(const T0 &Op1, const T1 &Op2, const T2 &Mask)
+      : Op1(Op1), Op2(Op2), Mask(Mask) {}
+
+  template <typename MatchContext>
+  bool match(const MatchContext &Ctx, SDValue N) {
+    if (auto *I = dyn_cast<ShuffleVectorSDNode>(N)) {
+      return Op1.match(Ctx, I->getOperand(0)) &&
+             Op2.match(Ctx, I->getOperand(1)) && Mask.match(I->getMask());
+    }
+    return false;
+  }
+};
+struct m_Mask {
+  ArrayRef<int> &MaskRef;
+  m_Mask(ArrayRef<int> &MaskRef) : MaskRef(MaskRef) {}
+  bool match(ArrayRef<int> Mask) {
+    MaskRef = Mask;
+    return true;
+  }
+};
+
+struct m_SpecificMask {
+  ArrayRef<int> MaskRef;
+  m_SpecificMask(ArrayRef<int> MaskRef) : MaskRef(MaskRef) {}
+  bool match(ArrayRef<int> Mask) { return MaskRef == Mask; }
+};
+
 template <typename LHS_P, typename RHS_P, typename Pred_t,
           bool Commutable = false, bool ExcludeChain = false>
 struct MaxMin_match {
@@ -797,6 +830,17 @@ inline BinaryOpc_match<LHS, RHS> m_FRem(const LHS &L, const RHS &R) {
   return BinaryOpc_match<LHS, RHS>(ISD::FREM, L, R);
 }
 
+template <typename V1_t, typename V2_t>
+inline BinaryOpc_match<V1_t, V2_t> m_Shuffle(const V1_t &v1, const V2_t &v2) {
+  return BinaryOpc_match<V1_t, V2_t>(ISD::VECTOR_SHUFFLE, v1, v2);
+}
+
+template <typename V1_t, typename V2_t, typename Mask_t>
+inline SDShuffle_match<V1_t, V2_t, Mask_t>
+m_Shuffle(const V1_t &v1, const V2_t &v2, const Mask_t &mask) {
+  return SDShuffle_match<V1_t, V2_t, Mask_t>(v1, v2, mask);
+}
+
 template <typename LHS, typename RHS>
 inline BinaryOpc_match<LHS, RHS> m_ExtractElt(const LHS &Vec, const RHS &Idx) {
   return BinaryOpc_match<LHS, RHS>(ISD::EXTRACT_VECTOR_ELT, Vec, Idx);
diff --git a/llvm/include/llvm/DebugInfo/GSYM/FunctionInfo.h b/llvm/include/llvm/DebugInfo/GSYM/FunctionInfo.h
index fd4ac31..18764225 100644
--- a/llvm/include/llvm/DebugInfo/GSYM/FunctionInfo.h
+++ b/llvm/include/llvm/DebugInfo/GSYM/FunctionInfo.h
@@ -187,13 +187,17 @@ struct FunctionInfo {
   ///
   /// \param Addr The address to lookup.
   ///
+  /// \param MergedFuncsData A pointer to an optional DataExtractor that, if
+  /// non-null, will be set to the raw data of the MergedFunctionInfo, if
+  /// present.
+  ///
   /// \returns An LookupResult or an error describing the issue that was
   /// encountered during decoding. An error should only be returned if the
   /// address is not contained in the FunctionInfo or if the data is corrupted.
-  static llvm::Expected<LookupResult> lookup(DataExtractor &Data,
-                                             const GsymReader &GR,
-                                             uint64_t FuncAddr,
-                                             uint64_t Addr);
+  static llvm::Expected<LookupResult>
+  lookup(DataExtractor &Data, const GsymReader &GR, uint64_t FuncAddr,
+         uint64_t Addr,
+         std::optional<DataExtractor> *MergedFuncsData = nullptr);
 
   uint64_t startAddress() const { return Range.start(); }
   uint64_t endAddress() const { return Range.end(); }
diff --git a/llvm/include/llvm/DebugInfo/GSYM/GsymReader.h b/llvm/include/llvm/DebugInfo/GSYM/GsymReader.h
index 3d53258..ee7929a 100644
--- a/llvm/include/llvm/DebugInfo/GSYM/GsymReader.h
+++ b/llvm/include/llvm/DebugInfo/GSYM/GsymReader.h
@@ -127,10 +127,29 @@ public:
   /// is much faster for lookups.
   ///
   /// \param Addr A virtual address from the orignal object file to lookup.
+  ///
+  /// \param MergedFuncsData A pointer to an optional DataExtractor that, if
+  /// non-null, will be set to the raw data of the MergedFunctionInfo, if
+  /// present.
+  ///
   /// \returns An expected LookupResult that contains only the information
   /// needed for the current address, or an error object that indicates reason
   /// for failing to lookup the address.
-  llvm::Expected<LookupResult> lookup(uint64_t Addr) const;
+  llvm::Expected<LookupResult>
+  lookup(uint64_t Addr,
+         std::optional<DataExtractor> *MergedFuncsData = nullptr) const;
+
+  /// Lookup all merged functions for a given address.
+  ///
+  /// This function performs a lookup for the specified address and then
+  /// retrieves additional LookupResults from any merged functions associated
+  /// with the primary LookupResult.
+  ///
+  /// \param Addr The address to lookup.
+  ///
+  /// \returns A vector of LookupResult objects, where the first element is the
+  /// primary result, followed by results for any merged functions
+  llvm::Expected<std::vector<LookupResult>> lookupAll(uint64_t Addr) const;
 
   /// Get a string from the string table.
   ///
diff --git a/llvm/include/llvm/DebugInfo/GSYM/MergedFunctionsInfo.h b/llvm/include/llvm/DebugInfo/GSYM/MergedFunctionsInfo.h
index b68f9b6..203fb13 100644
--- a/llvm/include/llvm/DebugInfo/GSYM/MergedFunctionsInfo.h
+++ b/llvm/include/llvm/DebugInfo/GSYM/MergedFunctionsInfo.h
@@ -31,6 +31,18 @@ struct MergedFunctionsInfo {
   /// \returns A boolean indicating if this FunctionInfo is valid.
   bool isValid() { return !MergedFunctions.empty(); }
 
+  /// Get a vector of DataExtractor objects for the functions in this
+  /// MergedFunctionsInfo object.
+  ///
+  /// \param Data The binary stream to read the data from. This object must have
+  /// the data for the MergedFunctionsInfo object starting at offset zero. The
+  /// data can contain more data than needed.
+  ///
+  /// \returns An llvm::Expected containing a vector of DataExtractor objects on
+  /// success, or an error object if parsing fails.
+  static llvm::Expected<std::vector<DataExtractor>>
+  getFuncsDataExtractors(DataExtractor &Data);
+
   /// Decode an MergedFunctionsInfo object from a binary data stream.
   ///
   /// \param Data The binary stream to read the data from. This object must have
diff --git a/llvm/include/llvm/ExecutionEngine/JITLink/loongarch.h b/llvm/include/llvm/ExecutionEngine/JITLink/loongarch.h
index 39a7db3..1db4b82 100644
--- a/llvm/include/llvm/ExecutionEngine/JITLink/loongarch.h
+++ b/llvm/include/llvm/ExecutionEngine/JITLink/loongarch.h
@@ -41,6 +41,50 @@ enum EdgeKind_loongarch : Edge::Kind {
   ///
   Pointer32,
 
+  /// A 16-bit PC-relative branch.
+  ///
+  /// Represents a PC-relative branch to a target within +/-128Kb. The target
+  /// must be 4-byte aligned.
+  ///
+  /// Fixup expression:
+  ///   Fixup <- (Target - Fixup + Addend) >> 2 : int16
+  ///
+  /// Notes:
+  ///   The '16' in the name refers to the number operand bits and follows the
+  /// naming convention used by the corresponding ELF relocations. Since the low
+  /// two bits must be zero (because of the 4-byte alignment of the target) the
+  /// operand is effectively a signed 18-bit number.
+  ///
+  /// Errors:
+  ///   - The result of the unshifted part of the fixup expression must be
+  ///     4-byte aligned otherwise an alignment error will be returned.
+  ///   - The result of the fixup expression must fit into an int16 otherwise an
+  ///     out-of-range error will be returned.
+  ///
+  Branch16PCRel,
+
+  /// A 21-bit PC-relative branch.
+  ///
+  /// Represents a PC-relative branch to a target within +/-4Mb. The Target must
+  /// be 4-byte aligned.
+  ///
+  /// Fixup expression:
+  ///   Fixup <- (Target - Fixup + Addend) >> 2 : int21
+  ///
+  /// Notes:
+  ///   The '21' in the name refers to the number operand bits and follows the
+  /// naming convention used by the corresponding ELF relocations. Since the low
+  /// two bits must be zero (because of the 4-byte alignment of the target) the
+  /// operand is effectively a signed 23-bit number.
+  ///
+  /// Errors:
+  ///   - The result of the unshifted part of the fixup expression must be
+  ///     4-byte aligned otherwise an alignment error will be returned.
+  ///   - The result of the fixup expression must fit into an int21 otherwise an
+  ///     out-of-range error will be returned.
+  ///
+  Branch21PCRel,
+
   /// A 26-bit PC-relative branch.
   ///
   /// Represents a PC-relative call or branch to a target within +/-128Mb. The
@@ -189,7 +233,7 @@ const char *getEdgeKindName(Edge::Kind K);
 
 // Returns extract bits Val[Hi:Lo].
 inline uint32_t extractBits(uint64_t Val, unsigned Hi, unsigned Lo) {
-  return Hi == 63 ? Val >> Lo : (Val & (((1UL << (Hi + 1)) - 1))) >> Lo;
+  return Hi == 63 ? Val >> Lo : (Val & ((((uint64_t)1 << (Hi + 1)) - 1))) >> Lo;
 }
 
 /// Apply fixup expression for edge to block content.
@@ -213,6 +257,37 @@ inline Error applyFixup(LinkGraph &G, Block &B, const Edge &E) {
     *(ulittle32_t *)FixupPtr = Value;
     break;
   }
+  case Branch16PCRel: {
+    int64_t Value = TargetAddress - FixupAddress + Addend;
+
+    if (!isInt<18>(Value))
+      return makeTargetOutOfRangeError(G, B, E);
+
+    if (!isShiftedInt<16, 2>(Value))
+      return makeAlignmentError(orc::ExecutorAddr(FixupAddress), Value, 4, E);
+
+    uint32_t RawInstr = *(little32_t *)FixupPtr;
+    uint32_t Imm = static_cast<uint32_t>(Value >> 2);
+    uint32_t Imm15_0 = extractBits(Imm, /*Hi=*/15, /*Lo=*/0) << 10;
+    *(little32_t *)FixupPtr = RawInstr | Imm15_0;
+    break;
+  }
+  case Branch21PCRel: {
+    int64_t Value = TargetAddress - FixupAddress + Addend;
+
+    if (!isInt<23>(Value))
+      return makeTargetOutOfRangeError(G, B, E);
+
+    if (!isShiftedInt<21, 2>(Value))
+      return makeAlignmentError(orc::ExecutorAddr(FixupAddress), Value, 4, E);
+
+    uint32_t RawInstr = *(little32_t *)FixupPtr;
+    uint32_t Imm = static_cast<uint32_t>(Value >> 2);
+    uint32_t Imm15_0 = extractBits(Imm, /*Hi=*/15, /*Lo=*/0) << 10;
+    uint32_t Imm20_16 = extractBits(Imm, /*Hi=*/20, /*Lo=*/16);
+    *(little32_t *)FixupPtr = RawInstr | Imm15_0 | Imm20_16;
+    break;
+  }
   case Branch26PCRel: {
     int64_t Value = TargetAddress - FixupAddress + Addend;
 
diff --git a/llvm/include/llvm/ExecutionEngine/Orc/Core.h b/llvm/include/llvm/ExecutionEngine/Orc/Core.h
index 2788932..db85336 100644
--- a/llvm/include/llvm/ExecutionEngine/Orc/Core.h
+++ b/llvm/include/llvm/ExecutionEngine/Orc/Core.h
@@ -1312,6 +1312,7 @@ public:
   MaterializationTask(std::unique_ptr<MaterializationUnit> MU,
                       std::unique_ptr<MaterializationResponsibility> MR)
       : MU(std::move(MU)), MR(std::move(MR)) {}
+  ~MaterializationTask() override;
   void printDescription(raw_ostream &OS) override;
   void run() override;
 
diff --git a/llvm/include/llvm/ExecutionEngine/Orc/ELFNixPlatform.h b/llvm/include/llvm/ExecutionEngine/Orc/ELFNixPlatform.h
index 3da5e90..f19cfce 100644
--- a/llvm/include/llvm/ExecutionEngine/Orc/ELFNixPlatform.h
+++ b/llvm/include/llvm/ExecutionEngine/Orc/ELFNixPlatform.h
@@ -156,6 +156,7 @@ private:
         RuntimeFunction *func1, RuntimeFunction *func2,
         const shared::WrapperFunctionCall::ArgDataBufferType &arg1,
         const shared::WrapperFunctionCall::ArgDataBufferType &arg2) {
+      std::lock_guard<std::mutex> Lock(Mutex);
       auto &argList = DeferredRTFnMap[std::make_pair(func1, func2)];
       argList.emplace_back(arg1, arg2);
     }
diff --git a/llvm/include/llvm/ExecutionEngine/Orc/MachOBuilder.h b/llvm/include/llvm/ExecutionEngine/Orc/MachOBuilder.h
index 6ffd286..8e29f21 100644
--- a/llvm/include/llvm/ExecutionEngine/Orc/MachOBuilder.h
+++ b/llvm/include/llvm/ExecutionEngine/Orc/MachOBuilder.h
@@ -460,8 +460,8 @@ private:
       return;
 
     StrTab.resize(Strings.size());
-    for (auto &KV : Strings)
-      StrTab[KV.second] = {KV.first, 0};
+    for (auto &[Str, Idx] : Strings)
+      StrTab[Idx] = {Str, 0};
     size_t Offset = 0;
     for (auto &Elem : StrTab) {
       Elem.Offset = Offset;
diff --git a/llvm/include/llvm/ExecutionEngine/Orc/TaskDispatch.h b/llvm/include/llvm/ExecutionEngine/Orc/TaskDispatch.h
index 8c65677..d793986 100644
--- a/llvm/include/llvm/ExecutionEngine/Orc/TaskDispatch.h
+++ b/llvm/include/llvm/ExecutionEngine/Orc/TaskDispatch.h
@@ -122,7 +122,7 @@ public:
   void shutdown() override;
 private:
   std::mutex DispatchMutex;
-  bool Running = true;
+  bool Shutdown = false;
   size_t Outstanding = 0;
   std::condition_variable OutstandingCV;
 
diff --git a/llvm/include/llvm/Frontend/OpenMP/OMP.td b/llvm/include/llvm/Frontend/OpenMP/OMP.td
index e36eb77..a4c1964 100644
--- a/llvm/include/llvm/Frontend/OpenMP/OMP.td
+++ b/llvm/include/llvm/Frontend/OpenMP/OMP.td
@@ -49,6 +49,7 @@ def OMPC_Affinity : Clause<"affinity"> {
 }
 def OMPC_Align : Clause<"align"> {
   let clangClass = "OMPAlignClause";
+  let flangClass = "OmpAlignClause";
 }
 def OMPC_Aligned : Clause<"aligned"> {
   let clangClass = "OMPAlignedClause";
diff --git a/llvm/include/llvm/IR/IRBuilder.h b/llvm/include/llvm/IR/IRBuilder.h
index 8cdfa27..0332a6c 100644
--- a/llvm/include/llvm/IR/IRBuilder.h
+++ b/llvm/include/llvm/IR/IRBuilder.h
@@ -87,6 +87,28 @@ public:
   }
 };
 
+/// This provides a helper for copying FMF from an instruction or setting
+/// specified flags.
+class FMFSource {
+  std::optional<FastMathFlags> FMF;
+
+public:
+  FMFSource() = default;
+  FMFSource(Instruction *Source) {
+    if (Source)
+      FMF = Source->getFastMathFlags();
+  }
+  FMFSource(FastMathFlags FMF) : FMF(FMF) {}
+  FastMathFlags get(FastMathFlags Default) const {
+    return FMF.value_or(Default);
+  }
+  /// Intersect the FMF from two instructions.
+  static FMFSource intersect(Value *A, Value *B) {
+    return FMFSource(cast<FPMathOperator>(A)->getFastMathFlags() &
+                     cast<FPMathOperator>(B)->getFastMathFlags());
+  }
+};
+
 /// Common base class shared among various IRBuilders.
 class IRBuilderBase {
   /// Pairs of (metadata kind, MDNode *) that should be added to all newly
@@ -958,29 +980,27 @@ public:
   /// Create a call to intrinsic \p ID with 1 operand which is mangled on its
   /// type.
   CallInst *CreateUnaryIntrinsic(Intrinsic::ID ID, Value *V,
-                                 Instruction *FMFSource = nullptr,
+                                 FMFSource FMFSource = {},
                                  const Twine &Name = "");
 
   /// Create a call to intrinsic \p ID with 2 operands which is mangled on the
   /// first type.
   Value *CreateBinaryIntrinsic(Intrinsic::ID ID, Value *LHS, Value *RHS,
-                               Instruction *FMFSource = nullptr,
+                               FMFSource FMFSource = {},
                                const Twine &Name = "");
 
   /// Create a call to intrinsic \p ID with \p Args, mangled using \p Types. If
   /// \p FMFSource is provided, copy fast-math-flags from that instruction to
   /// the intrinsic.
   CallInst *CreateIntrinsic(Intrinsic::ID ID, ArrayRef<Type *> Types,
-                            ArrayRef<Value *> Args,
-                            Instruction *FMFSource = nullptr,
+                            ArrayRef<Value *> Args, FMFSource FMFSource = {},
                             const Twine &Name = "");
 
   /// Create a call to intrinsic \p ID with \p RetTy and \p Args. If
   /// \p FMFSource is provided, copy fast-math-flags from that instruction to
   /// the intrinsic.
   CallInst *CreateIntrinsic(Type *RetTy, Intrinsic::ID ID,
-                            ArrayRef<Value *> Args,
-                            Instruction *FMFSource = nullptr,
+                            ArrayRef<Value *> Args, FMFSource FMFSource = {},
                             const Twine &Name = "");
 
   /// Create call to the minnum intrinsic.
@@ -1026,15 +1046,14 @@ public:
   }
 
   /// Create call to the copysign intrinsic.
-  Value *CreateCopySign(Value *LHS, Value *RHS,
-                        Instruction *FMFSource = nullptr,
+  Value *CreateCopySign(Value *LHS, Value *RHS, FMFSource FMFSource = {},
                         const Twine &Name = "") {
     return CreateBinaryIntrinsic(Intrinsic::copysign, LHS, RHS, FMFSource,
                                  Name);
   }
 
   /// Create call to the ldexp intrinsic.
-  Value *CreateLdexp(Value *Src, Value *Exp, Instruction *FMFSource = nullptr,
+  Value *CreateLdexp(Value *Src, Value *Exp, FMFSource FMFSource = {},
                      const Twine &Name = "") {
     assert(!IsFPConstrained && "TODO: Support strictfp");
     return CreateIntrinsic(Intrinsic::ldexp, {Src->getType(), Exp->getType()},
@@ -1555,144 +1574,113 @@ public:
 
   Value *CreateFAdd(Value *L, Value *R, const Twine &Name = "",
                     MDNode *FPMD = nullptr) {
-    if (IsFPConstrained)
-      return CreateConstrainedFPBinOp(Intrinsic::experimental_constrained_fadd,
-                                      L, R, nullptr, Name, FPMD);
-
-    if (Value *V = Folder.FoldBinOpFMF(Instruction::FAdd, L, R, FMF))
-      return V;
-    Instruction *I = setFPAttrs(BinaryOperator::CreateFAdd(L, R), FPMD, FMF);
-    return Insert(I, Name);
+    return CreateFAddFMF(L, R, {}, Name, FPMD);
   }
 
-  /// Copy fast-math-flags from an instruction rather than using the builder's
-  /// default FMF.
-  Value *CreateFAddFMF(Value *L, Value *R, Instruction *FMFSource,
-                       const Twine &Name = "") {
+  Value *CreateFAddFMF(Value *L, Value *R, FMFSource FMFSource,
+                       const Twine &Name = "", MDNode *FPMD = nullptr) {
     if (IsFPConstrained)
       return CreateConstrainedFPBinOp(Intrinsic::experimental_constrained_fadd,
-                                      L, R, FMFSource, Name);
+                                      L, R, FMFSource, Name, FPMD);
 
-    FastMathFlags FMF = FMFSource->getFastMathFlags();
-    if (Value *V = Folder.FoldBinOpFMF(Instruction::FAdd, L, R, FMF))
+    if (Value *V =
+            Folder.FoldBinOpFMF(Instruction::FAdd, L, R, FMFSource.get(FMF)))
       return V;
-    Instruction *I = setFPAttrs(BinaryOperator::CreateFAdd(L, R), nullptr, FMF);
+    Instruction *I =
+        setFPAttrs(BinaryOperator::CreateFAdd(L, R), FPMD, FMFSource.get(FMF));
     return Insert(I, Name);
   }
 
   Value *CreateFSub(Value *L, Value *R, const Twine &Name = "",
                     MDNode *FPMD = nullptr) {
-    if (IsFPConstrained)
-      return CreateConstrainedFPBinOp(Intrinsic::experimental_constrained_fsub,
-                                      L, R, nullptr, Name, FPMD);
-
-    if (Value *V = Folder.FoldBinOpFMF(Instruction::FSub, L, R, FMF))
-      return V;
-    Instruction *I = setFPAttrs(BinaryOperator::CreateFSub(L, R), FPMD, FMF);
-    return Insert(I, Name);
+    return CreateFSubFMF(L, R, {}, Name, FPMD);
   }
 
-  /// Copy fast-math-flags from an instruction rather than using the builder's
-  /// default FMF.
-  Value *CreateFSubFMF(Value *L, Value *R, Instruction *FMFSource,
-                       const Twine &Name = "") {
+  Value *CreateFSubFMF(Value *L, Value *R, FMFSource FMFSource,
+                       const Twine &Name = "", MDNode *FPMD = nullptr) {
     if (IsFPConstrained)
       return CreateConstrainedFPBinOp(Intrinsic::experimental_constrained_fsub,
-                                      L, R, FMFSource, Name);
+                                      L, R, FMFSource, Name, FPMD);
 
-    FastMathFlags FMF = FMFSource->getFastMathFlags();
-    if (Value *V = Folder.FoldBinOpFMF(Instruction::FSub, L, R, FMF))
+    if (Value *V =
+            Folder.FoldBinOpFMF(Instruction::FSub, L, R, FMFSource.get(FMF)))
       return V;
-    Instruction *I = setFPAttrs(BinaryOperator::CreateFSub(L, R), nullptr, FMF);
+    Instruction *I =
+        setFPAttrs(BinaryOperator::CreateFSub(L, R), FPMD, FMFSource.get(FMF));
     return Insert(I, Name);
   }
 
   Value *CreateFMul(Value *L, Value *R, const Twine &Name = "",
                     MDNode *FPMD = nullptr) {
-    if (IsFPConstrained)
-      return CreateConstrainedFPBinOp(Intrinsic::experimental_constrained_fmul,
-                                      L, R, nullptr, Name, FPMD);
-
-    if (Value *V = Folder.FoldBinOpFMF(Instruction::FMul, L, R, FMF))
-      return V;
-    Instruction *I = setFPAttrs(BinaryOperator::CreateFMul(L, R), FPMD, FMF);
-    return Insert(I, Name);
+    return CreateFMulFMF(L, R, {}, Name, FPMD);
   }
 
-  /// Copy fast-math-flags from an instruction rather than using the builder's
-  /// default FMF.
-  Value *CreateFMulFMF(Value *L, Value *R, Instruction *FMFSource,
-                       const Twine &Name = "") {
+  Value *CreateFMulFMF(Value *L, Value *R, FMFSource FMFSource,
+                       const Twine &Name = "", MDNode *FPMD = nullptr) {
     if (IsFPConstrained)
       return CreateConstrainedFPBinOp(Intrinsic::experimental_constrained_fmul,
-                                      L, R, FMFSource, Name);
+                                      L, R, FMFSource, Name, FPMD);
 
-    FastMathFlags FMF = FMFSource->getFastMathFlags();
-    if (Value *V = Folder.FoldBinOpFMF(Instruction::FMul, L, R, FMF))
+    if (Value *V =
+            Folder.FoldBinOpFMF(Instruction::FMul, L, R, FMFSource.get(FMF)))
       return V;
-    Instruction *I = setFPAttrs(BinaryOperator::CreateFMul(L, R), nullptr, FMF);
+    Instruction *I =
+        setFPAttrs(BinaryOperator::CreateFMul(L, R), FPMD, FMFSource.get(FMF));
     return Insert(I, Name);
   }
 
   Value *CreateFDiv(Value *L, Value *R, const Twine &Name = "",
                     MDNode *FPMD = nullptr) {
-    if (IsFPConstrained)
-      return CreateConstrainedFPBinOp(Intrinsic::experimental_constrained_fdiv,
-                                      L, R, nullptr, Name, FPMD);
-
-    if (Value *V = Folder.FoldBinOpFMF(Instruction::FDiv, L, R, FMF))
-      return V;
-    Instruction *I = setFPAttrs(BinaryOperator::CreateFDiv(L, R), FPMD, FMF);
-    return Insert(I, Name);
+    return CreateFDivFMF(L, R, {}, Name, FPMD);
   }
 
-  /// Copy fast-math-flags from an instruction rather than using the builder's
-  /// default FMF.
-  Value *CreateFDivFMF(Value *L, Value *R, Instruction *FMFSource,
-                       const Twine &Name = "") {
+  Value *CreateFDivFMF(Value *L, Value *R, FMFSource FMFSource,
+                       const Twine &Name = "", MDNode *FPMD = nullptr) {
     if (IsFPConstrained)
       return CreateConstrainedFPBinOp(Intrinsic::experimental_constrained_fdiv,
-                                      L, R, FMFSource, Name);
+                                      L, R, FMFSource, Name, FPMD);
 
-    FastMathFlags FMF = FMFSource->getFastMathFlags();
-    if (Value *V = Folder.FoldBinOpFMF(Instruction::FDiv, L, R, FMF))
+    if (Value *V =
+            Folder.FoldBinOpFMF(Instruction::FDiv, L, R, FMFSource.get(FMF)))
       return V;
-    Instruction *I = setFPAttrs(BinaryOperator::CreateFDiv(L, R), nullptr, FMF);
+    Instruction *I =
+        setFPAttrs(BinaryOperator::CreateFDiv(L, R), FPMD, FMFSource.get(FMF));
     return Insert(I, Name);
   }
 
   Value *CreateFRem(Value *L, Value *R, const Twine &Name = "",
                     MDNode *FPMD = nullptr) {
-    if (IsFPConstrained)
-      return CreateConstrainedFPBinOp(Intrinsic::experimental_constrained_frem,
-                                      L, R, nullptr, Name, FPMD);
-
-    if (Value *V = Folder.FoldBinOpFMF(Instruction::FRem, L, R, FMF)) return V;
-    Instruction *I = setFPAttrs(BinaryOperator::CreateFRem(L, R), FPMD, FMF);
-    return Insert(I, Name);
+    return CreateFRemFMF(L, R, {}, Name, FPMD);
   }
 
-  /// Copy fast-math-flags from an instruction rather than using the builder's
-  /// default FMF.
-  Value *CreateFRemFMF(Value *L, Value *R, Instruction *FMFSource,
-                       const Twine &Name = "") {
+  Value *CreateFRemFMF(Value *L, Value *R, FMFSource FMFSource,
+                       const Twine &Name = "", MDNode *FPMD = nullptr) {
     if (IsFPConstrained)
       return CreateConstrainedFPBinOp(Intrinsic::experimental_constrained_frem,
-                                      L, R, FMFSource, Name);
+                                      L, R, FMFSource, Name, FPMD);
 
-    FastMathFlags FMF = FMFSource->getFastMathFlags();
-    if (Value *V = Folder.FoldBinOpFMF(Instruction::FRem, L, R, FMF)) return V;
-    Instruction *I = setFPAttrs(BinaryOperator::CreateFRem(L, R), nullptr, FMF);
+    if (Value *V =
+            Folder.FoldBinOpFMF(Instruction::FRem, L, R, FMFSource.get(FMF)))
+      return V;
+    Instruction *I =
+        setFPAttrs(BinaryOperator::CreateFRem(L, R), FPMD, FMFSource.get(FMF));
     return Insert(I, Name);
   }
 
   Value *CreateBinOp(Instruction::BinaryOps Opc,
                      Value *LHS, Value *RHS, const Twine &Name = "",
                      MDNode *FPMathTag = nullptr) {
-    if (Value *V = Folder.FoldBinOp(Opc, LHS, RHS)) return V;
+    return CreateBinOpFMF(Opc, LHS, RHS, {}, Name, FPMathTag);
+  }
+
+  Value *CreateBinOpFMF(Instruction::BinaryOps Opc, Value *LHS, Value *RHS,
+                        FMFSource FMFSource, const Twine &Name = "",
+                        MDNode *FPMathTag = nullptr) {
+    if (Value *V = Folder.FoldBinOp(Opc, LHS, RHS))
+      return V;
     Instruction *BinOp = BinaryOperator::Create(Opc, LHS, RHS);
     if (isa<FPMathOperator>(BinOp))
-      setFPAttrs(BinOp, FPMathTag, FMF);
+      setFPAttrs(BinOp, FPMathTag, FMFSource.get(FMF));
     return Insert(BinOp, Name);
   }
 
@@ -1731,13 +1719,13 @@ public:
   }
 
   CallInst *CreateConstrainedFPBinOp(
-      Intrinsic::ID ID, Value *L, Value *R, Instruction *FMFSource = nullptr,
+      Intrinsic::ID ID, Value *L, Value *R, FMFSource FMFSource = {},
       const Twine &Name = "", MDNode *FPMathTag = nullptr,
       std::optional<RoundingMode> Rounding = std::nullopt,
       std::optional<fp::ExceptionBehavior> Except = std::nullopt);
 
   CallInst *CreateConstrainedFPUnroundedBinOp(
-      Intrinsic::ID ID, Value *L, Value *R, Instruction *FMFSource = nullptr,
+      Intrinsic::ID ID, Value *L, Value *R, FMFSource FMFSource = {},
       const Twine &Name = "", MDNode *FPMathTag = nullptr,
       std::optional<fp::ExceptionBehavior> Except = std::nullopt);
 
@@ -1752,21 +1740,17 @@ public:
 
   Value *CreateFNeg(Value *V, const Twine &Name = "",
                     MDNode *FPMathTag = nullptr) {
-    if (Value *Res = Folder.FoldUnOpFMF(Instruction::FNeg, V, FMF))
-      return Res;
-    return Insert(setFPAttrs(UnaryOperator::CreateFNeg(V), FPMathTag, FMF),
-                  Name);
+    return CreateFNegFMF(V, {}, Name, FPMathTag);
   }
 
-  /// Copy fast-math-flags from an instruction rather than using the builder's
-  /// default FMF.
-  Value *CreateFNegFMF(Value *V, Instruction *FMFSource,
-                       const Twine &Name = "") {
-   FastMathFlags FMF = FMFSource->getFastMathFlags();
-    if (Value *Res = Folder.FoldUnOpFMF(Instruction::FNeg, V, FMF))
+  Value *CreateFNegFMF(Value *V, FMFSource FMFSource, const Twine &Name = "",
+                       MDNode *FPMathTag = nullptr) {
+    if (Value *Res =
+            Folder.FoldUnOpFMF(Instruction::FNeg, V, FMFSource.get(FMF)))
       return Res;
-   return Insert(setFPAttrs(UnaryOperator::CreateFNeg(V), nullptr, FMF),
-                 Name);
+    return Insert(
+        setFPAttrs(UnaryOperator::CreateFNeg(V), FPMathTag, FMFSource.get(FMF)),
+        Name);
   }
 
   Value *CreateNot(Value *V, const Twine &Name = "") {
@@ -2127,19 +2111,31 @@ public:
 
   Value *CreateFPTrunc(Value *V, Type *DestTy, const Twine &Name = "",
                        MDNode *FPMathTag = nullptr) {
+    return CreateFPTruncFMF(V, DestTy, {}, Name, FPMathTag);
+  }
+
+  Value *CreateFPTruncFMF(Value *V, Type *DestTy, FMFSource FMFSource,
+                          const Twine &Name = "", MDNode *FPMathTag = nullptr) {
     if (IsFPConstrained)
       return CreateConstrainedFPCast(
-          Intrinsic::experimental_constrained_fptrunc, V, DestTy, nullptr, Name,
-          FPMathTag);
-    return CreateCast(Instruction::FPTrunc, V, DestTy, Name, FPMathTag);
+          Intrinsic::experimental_constrained_fptrunc, V, DestTy, FMFSource,
+          Name, FPMathTag);
+    return CreateCast(Instruction::FPTrunc, V, DestTy, Name, FPMathTag,
+                      FMFSource);
   }
 
   Value *CreateFPExt(Value *V, Type *DestTy, const Twine &Name = "",
                      MDNode *FPMathTag = nullptr) {
+    return CreateFPExtFMF(V, DestTy, {}, Name, FPMathTag);
+  }
+
+  Value *CreateFPExtFMF(Value *V, Type *DestTy, FMFSource FMFSource,
+                        const Twine &Name = "", MDNode *FPMathTag = nullptr) {
     if (IsFPConstrained)
       return CreateConstrainedFPCast(Intrinsic::experimental_constrained_fpext,
-                                     V, DestTy, nullptr, Name, FPMathTag);
-    return CreateCast(Instruction::FPExt, V, DestTy, Name, FPMathTag);
+                                     V, DestTy, FMFSource, Name, FPMathTag);
+    return CreateCast(Instruction::FPExt, V, DestTy, Name, FPMathTag,
+                      FMFSource);
   }
 
   Value *CreatePtrToInt(Value *V, Type *DestTy,
@@ -2187,14 +2183,15 @@ public:
   }
 
   Value *CreateCast(Instruction::CastOps Op, Value *V, Type *DestTy,
-                    const Twine &Name = "", MDNode *FPMathTag = nullptr) {
+                    const Twine &Name = "", MDNode *FPMathTag = nullptr,
+                    FMFSource FMFSource = {}) {
     if (V->getType() == DestTy)
       return V;
     if (Value *Folded = Folder.FoldCast(Op, V, DestTy))
       return Folded;
     Instruction *Cast = CastInst::Create(Op, V, DestTy);
     if (isa<FPMathOperator>(Cast))
-      setFPAttrs(Cast, FPMathTag, FMF);
+      setFPAttrs(Cast, FPMathTag, FMFSource.get(FMF));
     return Insert(Cast, Name);
   }
 
@@ -2255,9 +2252,8 @@ public:
   }
 
   CallInst *CreateConstrainedFPCast(
-      Intrinsic::ID ID, Value *V, Type *DestTy,
-      Instruction *FMFSource = nullptr, const Twine &Name = "",
-      MDNode *FPMathTag = nullptr,
+      Intrinsic::ID ID, Value *V, Type *DestTy, FMFSource FMFSource = {},
+      const Twine &Name = "", MDNode *FPMathTag = nullptr,
       std::optional<RoundingMode> Rounding = std::nullopt,
       std::optional<fp::ExceptionBehavior> Except = std::nullopt);
 
@@ -2392,7 +2388,16 @@ public:
   // Note that this differs from CreateFCmpS only if IsFPConstrained is true.
   Value *CreateFCmp(CmpInst::Predicate P, Value *LHS, Value *RHS,
                     const Twine &Name = "", MDNode *FPMathTag = nullptr) {
-    return CreateFCmpHelper(P, LHS, RHS, Name, FPMathTag, false);
+    return CreateFCmpHelper(P, LHS, RHS, Name, FPMathTag, {}, false);
+  }
+
+  // Create a quiet floating-point comparison (i.e. one that raises an FP
+  // exception only in the case where an input is a signaling NaN).
+  // Note that this differs from CreateFCmpS only if IsFPConstrained is true.
+  Value *CreateFCmpFMF(CmpInst::Predicate P, Value *LHS, Value *RHS,
+                       FMFSource FMFSource, const Twine &Name = "",
+                       MDNode *FPMathTag = nullptr) {
+    return CreateFCmpHelper(P, LHS, RHS, Name, FPMathTag, FMFSource, false);
   }
 
   Value *CreateCmp(CmpInst::Predicate Pred, Value *LHS, Value *RHS,
@@ -2407,14 +2412,14 @@ public:
   // Note that this differs from CreateFCmp only if IsFPConstrained is true.
   Value *CreateFCmpS(CmpInst::Predicate P, Value *LHS, Value *RHS,
                      const Twine &Name = "", MDNode *FPMathTag = nullptr) {
-    return CreateFCmpHelper(P, LHS, RHS, Name, FPMathTag, true);
+    return CreateFCmpHelper(P, LHS, RHS, Name, FPMathTag, {}, true);
   }
 
 private:
   // Helper routine to create either a signaling or a quiet FP comparison.
   Value *CreateFCmpHelper(CmpInst::Predicate P, Value *LHS, Value *RHS,
                           const Twine &Name, MDNode *FPMathTag,
-                          bool IsSignaling);
+                          FMFSource FMFSource, bool IsSignaling);
 
 public:
   CallInst *CreateConstrainedFPCmp(
@@ -2436,8 +2441,7 @@ public:
 
 private:
   CallInst *createCallHelper(Function *Callee, ArrayRef<Value *> Ops,
-                             const Twine &Name = "",
-                             Instruction *FMFSource = nullptr,
+                             const Twine &Name = "", FMFSource FMFSource = {},
                              ArrayRef<OperandBundleDef> OpBundles = {});
 
 public:
@@ -2483,6 +2487,9 @@ public:
 
   Value *CreateSelect(Value *C, Value *True, Value *False,
                       const Twine &Name = "", Instruction *MDFrom = nullptr);
+  Value *CreateSelectFMF(Value *C, Value *True, Value *False,
+                         FMFSource FMFSource, const Twine &Name = "",
+                         Instruction *MDFrom = nullptr);
 
   VAArgInst *CreateVAArg(Value *List, Type *Ty, const Twine &Name = "") {
     return Insert(new VAArgInst(List, Ty), Name);
diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td
index 53a6609..cc7a81e 100644
--- a/llvm/include/llvm/IR/IntrinsicsAArch64.td
+++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td
@@ -2974,6 +2974,7 @@ let TargetPrefix = "aarch64" in {
 
 
   def int_aarch64_sme_zero : DefaultAttrsIntrinsic<[], [llvm_i32_ty], [ImmArg<ArgIndex<0>>]>;
+  def int_aarch64_sme_in_streaming_mode : DefaultAttrsIntrinsic<[llvm_i1_ty], [], [IntrNoMem]>, ClangBuiltin<"__builtin_arm_in_streaming_mode">;
 
   class SME_OuterProduct_Intrinsic
       : DefaultAttrsIntrinsic<[],
diff --git a/llvm/include/llvm/IR/IntrinsicsDirectX.td b/llvm/include/llvm/IR/IntrinsicsDirectX.td
index d31d5af..3b1d1a8 100644
--- a/llvm/include/llvm/IR/IntrinsicsDirectX.td
+++ b/llvm/include/llvm/IR/IntrinsicsDirectX.td
@@ -31,19 +31,20 @@ def int_dx_resource_getpointer
     : DefaultAttrsIntrinsic<[llvm_anyptr_ty], [llvm_any_ty, llvm_i32_ty],
                             [IntrNoMem]>;
 def int_dx_resource_load_typedbuffer
-    : DefaultAttrsIntrinsic<[llvm_any_ty], [llvm_any_ty, llvm_i32_ty],
-                            [IntrReadMem]>;
-def int_dx_resource_loadchecked_typedbuffer
     : DefaultAttrsIntrinsic<[llvm_any_ty, llvm_i1_ty],
                             [llvm_any_ty, llvm_i32_ty], [IntrReadMem]>;
 def int_dx_resource_store_typedbuffer
     : DefaultAttrsIntrinsic<[], [llvm_any_ty, llvm_i32_ty, llvm_anyvector_ty],
                             [IntrWriteMem]>;
+def int_dx_resource_load_rawbuffer
+    : DefaultAttrsIntrinsic<[llvm_any_ty, llvm_i1_ty],
+                            [llvm_any_ty, llvm_i32_ty, llvm_i32_ty],
+                            [IntrReadMem]>;
 
 def int_dx_resource_updatecounter
     : DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_any_ty, llvm_i8_ty],
                             [IntrInaccessibleMemOrArgMemOnly]>;
-    
+
 // Cast between target extension handle types and dxil-style opaque handles
 def int_dx_resource_casthandle : Intrinsic<[llvm_any_ty], [llvm_any_ty]>;
 
@@ -105,7 +106,7 @@ def int_dx_wave_is_first_lane : DefaultAttrsIntrinsic<[llvm_i1_ty], [], [IntrCon
 def int_dx_wave_readlane : DefaultAttrsIntrinsic<[llvm_any_ty], [LLVMMatchType<0>, llvm_i32_ty], [IntrConvergent, IntrNoMem]>;
 def int_dx_sign : DefaultAttrsIntrinsic<[LLVMScalarOrSameVectorWidth<0, llvm_i32_ty>], [llvm_any_ty], [IntrNoMem]>;
 def int_dx_step : DefaultAttrsIntrinsic<[LLVMMatchType<0>], [llvm_anyfloat_ty, LLVMMatchType<0>], [IntrNoMem]>;
-def int_dx_splitdouble : DefaultAttrsIntrinsic<[llvm_anyint_ty, LLVMMatchType<0>], 
+def int_dx_splitdouble : DefaultAttrsIntrinsic<[llvm_anyint_ty, LLVMMatchType<0>],
     [LLVMScalarOrSameVectorWidth<0, llvm_double_ty>], [IntrNoMem]>;
 def int_dx_radians : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem]>;
 def int_dx_discard : DefaultAttrsIntrinsic<[], [llvm_i1_ty], []>;
diff --git a/llvm/include/llvm/IR/IntrinsicsSPIRV.td b/llvm/include/llvm/IR/IntrinsicsSPIRV.td
index bcff0f2..b4d2dce 100644
--- a/llvm/include/llvm/IR/IntrinsicsSPIRV.td
+++ b/llvm/include/llvm/IR/IntrinsicsSPIRV.td
@@ -59,11 +59,13 @@ let TargetPrefix = "spv" in {
 
   // The following intrinsic(s) are mirrored from IntrinsicsDirectX.td for HLSL support.
   def int_spv_thread_id : Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrNoMem, IntrWillReturn]>;
+  def int_spv_group_id : Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrNoMem, IntrWillReturn]>;
   def int_spv_thread_id_in_group : Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrNoMem, IntrWillReturn]>;
   def int_spv_all : DefaultAttrsIntrinsic<[llvm_i1_ty], [llvm_any_ty], [IntrNoMem]>;
   def int_spv_any : DefaultAttrsIntrinsic<[llvm_i1_ty], [llvm_any_ty], [IntrNoMem]>;
   def int_spv_cross : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>, LLVMMatchType<0>], [IntrNoMem]>;
   def int_spv_degrees : DefaultAttrsIntrinsic<[LLVMMatchType<0>], [llvm_anyfloat_ty], [IntrNoMem]>;
+  def int_spv_distance : DefaultAttrsIntrinsic<[LLVMVectorElementType<0>], [llvm_anyfloat_ty, LLVMMatchType<0>], [IntrNoMem]>;
   def int_spv_frac : DefaultAttrsIntrinsic<[LLVMMatchType<0>], [llvm_anyfloat_ty], [IntrNoMem]>;
   def int_spv_lerp : DefaultAttrsIntrinsic<[LLVMMatchType<0>], [llvm_anyfloat_ty, LLVMMatchType<0>,LLVMMatchType<0>],
     [IntrNoMem] >;
@@ -116,6 +118,10 @@ let TargetPrefix = "spv" in {
       : DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_any_ty, llvm_i8_ty],
                               [IntrInaccessibleMemOrArgMemOnly]>;
 
+  def int_spv_resource_getpointer
+      : DefaultAttrsIntrinsic<[llvm_anyptr_ty], [llvm_any_ty, llvm_i32_ty],
+                              [IntrNoMem]>;
+
   // Read a value from the image buffer. It does not translate directly to a
   // single OpImageRead because the result type is not necessarily a 4 element
   // vector.
diff --git a/llvm/include/llvm/IR/Metadata.h b/llvm/include/llvm/IR/Metadata.h
index 35580f3..df2384c 100644
--- a/llvm/include/llvm/IR/Metadata.h
+++ b/llvm/include/llvm/IR/Metadata.h
@@ -1464,6 +1464,8 @@ public:
   static MDNode *getMergedProfMetadata(MDNode *A, MDNode *B,
                                        const Instruction *AInstr,
                                        const Instruction *BInstr);
+  static MDNode *getMergedMemProfMetadata(MDNode *A, MDNode *B);
+  static MDNode *getMergedCallsiteMetadata(MDNode *A, MDNode *B);
 };
 
 /// Tuple of metadata.
diff --git a/llvm/include/llvm/IR/NVVMIntrinsicFlags.h b/llvm/include/llvm/IR/NVVMIntrinsicFlags.h
deleted file mode 100644
index dfb6e85..0000000
--- a/llvm/include/llvm/IR/NVVMIntrinsicFlags.h
+++ /dev/null
@@ -1,39 +0,0 @@
-//===--- NVVMIntrinsicFlags.h -----------------------------------*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-/// \file
-/// This file contains the definitions of the enumerations and flags
-/// associated with NVVM Intrinsics.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_IR_NVVMINTRINSICFLAGS_H
-#define LLVM_IR_NVVMINTRINSICFLAGS_H
-
-#include <stdint.h>
-
-namespace llvm {
-namespace nvvm {
-
-// Reduction Ops supported with TMA Copy from Shared
-// to Global Memory for the "cp.reduce.async.bulk.tensor.*"
-// family of PTX instructions.
-enum class TMAReductionOp : uint8_t {
-  ADD = 0,
-  MIN = 1,
-  MAX = 2,
-  INC = 3,
-  DEC = 4,
-  AND = 5,
-  OR = 6,
-  XOR = 7,
-};
-
-} // namespace nvvm
-} // namespace llvm
-#endif // LLVM_IR_NVVMINTRINSICFLAGS_H
diff --git a/llvm/include/llvm/IR/NVVMIntrinsicUtils.h b/llvm/include/llvm/IR/NVVMIntrinsicUtils.h
new file mode 100644
index 0000000..8ca073b
--- /dev/null
+++ b/llvm/include/llvm/IR/NVVMIntrinsicUtils.h
@@ -0,0 +1,176 @@
+//===--- NVVMIntrinsicUtils.h -----------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// This file contains the definitions of the enumerations and flags
+/// associated with NVVM Intrinsics, along with some helper functions.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_IR_NVVMINTRINSICUTILS_H
+#define LLVM_IR_NVVMINTRINSICUTILS_H
+
+#include <stdint.h>
+
+#include "llvm/ADT/APFloat.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/IntrinsicsNVPTX.h"
+
+namespace llvm {
+namespace nvvm {
+
+// Reduction Ops supported with TMA Copy from Shared
+// to Global Memory for the "cp.reduce.async.bulk.tensor.*"
+// family of PTX instructions.
+enum class TMAReductionOp : uint8_t {
+  ADD = 0,
+  MIN = 1,
+  MAX = 2,
+  INC = 3,
+  DEC = 4,
+  AND = 5,
+  OR = 6,
+  XOR = 7,
+};
+
+inline bool IntrinsicShouldFTZ(Intrinsic::ID IntrinsicID) {
+  switch (IntrinsicID) {
+  // Float to i32 / i64 conversion intrinsics:
+  case Intrinsic::nvvm_f2i_rm_ftz:
+  case Intrinsic::nvvm_f2i_rn_ftz:
+  case Intrinsic::nvvm_f2i_rp_ftz:
+  case Intrinsic::nvvm_f2i_rz_ftz:
+
+  case Intrinsic::nvvm_f2ui_rm_ftz:
+  case Intrinsic::nvvm_f2ui_rn_ftz:
+  case Intrinsic::nvvm_f2ui_rp_ftz:
+  case Intrinsic::nvvm_f2ui_rz_ftz:
+
+  case Intrinsic::nvvm_f2ll_rm_ftz:
+  case Intrinsic::nvvm_f2ll_rn_ftz:
+  case Intrinsic::nvvm_f2ll_rp_ftz:
+  case Intrinsic::nvvm_f2ll_rz_ftz:
+
+  case Intrinsic::nvvm_f2ull_rm_ftz:
+  case Intrinsic::nvvm_f2ull_rn_ftz:
+  case Intrinsic::nvvm_f2ull_rp_ftz:
+  case Intrinsic::nvvm_f2ull_rz_ftz:
+    return true;
+  }
+  return false;
+}
+
+inline bool IntrinsicConvertsToSignedInteger(Intrinsic::ID IntrinsicID) {
+  switch (IntrinsicID) {
+  // f2i
+  case Intrinsic::nvvm_f2i_rm:
+  case Intrinsic::nvvm_f2i_rm_ftz:
+  case Intrinsic::nvvm_f2i_rn:
+  case Intrinsic::nvvm_f2i_rn_ftz:
+  case Intrinsic::nvvm_f2i_rp:
+  case Intrinsic::nvvm_f2i_rp_ftz:
+  case Intrinsic::nvvm_f2i_rz:
+  case Intrinsic::nvvm_f2i_rz_ftz:
+  // d2i
+  case Intrinsic::nvvm_d2i_rm:
+  case Intrinsic::nvvm_d2i_rn:
+  case Intrinsic::nvvm_d2i_rp:
+  case Intrinsic::nvvm_d2i_rz:
+  // f2ll
+  case Intrinsic::nvvm_f2ll_rm:
+  case Intrinsic::nvvm_f2ll_rm_ftz:
+  case Intrinsic::nvvm_f2ll_rn:
+  case Intrinsic::nvvm_f2ll_rn_ftz:
+  case Intrinsic::nvvm_f2ll_rp:
+  case Intrinsic::nvvm_f2ll_rp_ftz:
+  case Intrinsic::nvvm_f2ll_rz:
+  case Intrinsic::nvvm_f2ll_rz_ftz:
+  // d2ll
+  case Intrinsic::nvvm_d2ll_rm:
+  case Intrinsic::nvvm_d2ll_rn:
+  case Intrinsic::nvvm_d2ll_rp:
+  case Intrinsic::nvvm_d2ll_rz:
+    return true;
+  }
+  return false;
+}
+
+inline APFloat::roundingMode
+IntrinsicGetRoundingMode(Intrinsic::ID IntrinsicID) {
+  switch (IntrinsicID) {
+  // RM:
+  case Intrinsic::nvvm_f2i_rm:
+  case Intrinsic::nvvm_f2ui_rm:
+  case Intrinsic::nvvm_f2i_rm_ftz:
+  case Intrinsic::nvvm_f2ui_rm_ftz:
+  case Intrinsic::nvvm_d2i_rm:
+  case Intrinsic::nvvm_d2ui_rm:
+
+  case Intrinsic::nvvm_f2ll_rm:
+  case Intrinsic::nvvm_f2ull_rm:
+  case Intrinsic::nvvm_f2ll_rm_ftz:
+  case Intrinsic::nvvm_f2ull_rm_ftz:
+  case Intrinsic::nvvm_d2ll_rm:
+  case Intrinsic::nvvm_d2ull_rm:
+    return APFloat::rmTowardNegative;
+
+  // RN:
+  case Intrinsic::nvvm_f2i_rn:
+  case Intrinsic::nvvm_f2ui_rn:
+  case Intrinsic::nvvm_f2i_rn_ftz:
+  case Intrinsic::nvvm_f2ui_rn_ftz:
+  case Intrinsic::nvvm_d2i_rn:
+  case Intrinsic::nvvm_d2ui_rn:
+
+  case Intrinsic::nvvm_f2ll_rn:
+  case Intrinsic::nvvm_f2ull_rn:
+  case Intrinsic::nvvm_f2ll_rn_ftz:
+  case Intrinsic::nvvm_f2ull_rn_ftz:
+  case Intrinsic::nvvm_d2ll_rn:
+  case Intrinsic::nvvm_d2ull_rn:
+    return APFloat::rmNearestTiesToEven;
+
+  // RP:
+  case Intrinsic::nvvm_f2i_rp:
+  case Intrinsic::nvvm_f2ui_rp:
+  case Intrinsic::nvvm_f2i_rp_ftz:
+  case Intrinsic::nvvm_f2ui_rp_ftz:
+  case Intrinsic::nvvm_d2i_rp:
+  case Intrinsic::nvvm_d2ui_rp:
+
+  case Intrinsic::nvvm_f2ll_rp:
+  case Intrinsic::nvvm_f2ull_rp:
+  case Intrinsic::nvvm_f2ll_rp_ftz:
+  case Intrinsic::nvvm_f2ull_rp_ftz:
+  case Intrinsic::nvvm_d2ll_rp:
+  case Intrinsic::nvvm_d2ull_rp:
+    return APFloat::rmTowardPositive;
+
+  // RZ:
+  case Intrinsic::nvvm_f2i_rz:
+  case Intrinsic::nvvm_f2ui_rz:
+  case Intrinsic::nvvm_f2i_rz_ftz:
+  case Intrinsic::nvvm_f2ui_rz_ftz:
+  case Intrinsic::nvvm_d2i_rz:
+  case Intrinsic::nvvm_d2ui_rz:
+
+  case Intrinsic::nvvm_f2ll_rz:
+  case Intrinsic::nvvm_f2ull_rz:
+  case Intrinsic::nvvm_f2ll_rz_ftz:
+  case Intrinsic::nvvm_f2ull_rz_ftz:
+  case Intrinsic::nvvm_d2ll_rz:
+  case Intrinsic::nvvm_d2ull_rz:
+    return APFloat::rmTowardZero;
+  }
+  llvm_unreachable("Invalid f2i/d2i rounding mode intrinsic");
+  return APFloat::roundingMode::Invalid;
+}
+
+} // namespace nvvm
+} // namespace llvm
+#endif // LLVM_IR_NVVMINTRINSICUTILS_H
diff --git a/llvm/include/llvm/IR/PassManager.h b/llvm/include/llvm/IR/PassManager.h
index 5dab9d0..b523004 100644
--- a/llvm/include/llvm/IR/PassManager.h
+++ b/llvm/include/llvm/IR/PassManager.h
@@ -28,9 +28,9 @@
 /// polymorphism as outlined in the "Value Semantics and Concept-based
 /// Polymorphism" talk (or its abbreviated sibling "Inheritance Is The Base
 /// Class of Evil") by Sean Parent:
-/// * http://github.com/sean-parent/sean-parent.github.com/wiki/Papers-and-Presentations
+/// * https://sean-parent.stlab.cc/papers-and-presentations
 /// * http://www.youtube.com/watch?v=_BpMYeUFXv8
-/// * http://channel9.msdn.com/Events/GoingNative/2013/Inheritance-Is-The-Base-Class-of-Evil
+/// * https://learn.microsoft.com/en-us/shows/goingnative-2013/inheritance-base-class-of-evil
 ///
 //===----------------------------------------------------------------------===//
 
diff --git a/llvm/include/llvm/IR/PatternMatch.h b/llvm/include/llvm/IR/PatternMatch.h
index cc0e8d5..cd9a360 100644
--- a/llvm/include/llvm/IR/PatternMatch.h
+++ b/llvm/include/llvm/IR/PatternMatch.h
@@ -1844,9 +1844,9 @@ struct m_ZeroMask {
 };
 
 struct m_SpecificMask {
-  ArrayRef<int> &MaskRef;
-  m_SpecificMask(ArrayRef<int> &MaskRef) : MaskRef(MaskRef) {}
-  bool match(ArrayRef<int> Mask) { return MaskRef == Mask; }
+  ArrayRef<int> Val;
+  m_SpecificMask(ArrayRef<int> Val) : Val(Val) {}
+  bool match(ArrayRef<int> Mask) { return Val == Mask; }
 };
 
 struct m_SplatOrPoisonMask {
@@ -2870,7 +2870,7 @@ template <typename Opnd_t> struct Signum_match {
       return false;
 
     unsigned ShiftWidth = TypeSize - 1;
-    Value *OpL = nullptr, *OpR = nullptr;
+    Value *Op;
 
     // This is the representation of signum we match:
     //
@@ -2882,11 +2882,11 @@ template <typename Opnd_t> struct Signum_match {
     //
     // for i1 values.
 
-    auto LHS = m_AShr(m_Value(OpL), m_SpecificInt(ShiftWidth));
-    auto RHS = m_LShr(m_Neg(m_Value(OpR)), m_SpecificInt(ShiftWidth));
-    auto Signum = m_Or(LHS, RHS);
+    auto LHS = m_AShr(m_Value(Op), m_SpecificInt(ShiftWidth));
+    auto RHS = m_LShr(m_Neg(m_Deferred(Op)), m_SpecificInt(ShiftWidth));
+    auto Signum = m_c_Or(LHS, RHS);
 
-    return Signum.match(V) && OpL == OpR && Val.match(OpL);
+    return Signum.match(V) && Val.match(Op);
   }
 };
 
diff --git a/llvm/include/llvm/MC/MCAsmInfo.h b/llvm/include/llvm/MC/MCAsmInfo.h
index cf31c36..f4de106 100644
--- a/llvm/include/llvm/MC/MCAsmInfo.h
+++ b/llvm/include/llvm/MC/MCAsmInfo.h
@@ -94,9 +94,10 @@ protected:
   /// constants into comdat sections.
   bool HasCOFFComdatConstants = false;
 
-  /// True if this is an XCOFF target that supports visibility attributes as
-  /// part of .global, .weak, .extern, and .comm. Default is false.
-  bool HasVisibilityOnlyWithLinkage = false;
+  bool IsAIX = false;
+
+  // True if using the HLASM dialect on z/OS.
+  bool IsHLASM = false;
 
   /// This is the maximum possible length of an instruction, which is needed to
   /// compute the size of an inline asm.  Defaults to 4.
@@ -110,14 +111,6 @@ protected:
   /// the current PC.  Defaults to false.
   bool DollarIsPC = false;
 
-  /// Allow '.' token, when not referencing an identifier or constant, to refer
-  /// to the current PC. Defaults to true.
-  bool DotIsPC = true;
-
-  /// Whether the '*' token refers to the current PC. This is used for the
-  /// HLASM dialect.
-  bool StarIsPC = false;
-
   /// This string, if specified, is used to separate instructions from each
   /// other when on the same line.  Defaults to ';'
   const char *SeparatorString;
@@ -126,10 +119,6 @@ protected:
   /// "#"
   StringRef CommentString;
 
-  /// This indicates whether the comment string is only accepted as a comment
-  /// at the beginning of statements. Defaults to false.
-  bool RestrictCommentStringToStartOfStatement = false;
-
   /// This indicates whether to allow additional "comment strings" to be lexed
   /// as a comment. Setting this attribute to true, will ensure that C-style
   /// line comments (// ..), C-style block comments (/* .. */), and "#" are
@@ -138,16 +127,9 @@ protected:
   /// Default is true.
   bool AllowAdditionalComments = true;
 
-  /// Should we emit the '\t' as the starting indentation marker for GNU inline
-  /// asm statements. Defaults to true.
-  bool EmitGNUAsmStartIndentationMarker = true;
-
   /// This is appended to emitted labels.  Defaults to ":"
   const char *LabelSuffix;
 
-  /// Emit labels in purely upper case. Defaults to false.
-  bool EmitLabelsInUpperCase = false;
-
   // Print the EH begin symbol with an assignment. Defaults to false.
   bool UseAssignmentForEHBegin = false;
 
@@ -209,13 +191,6 @@ protected:
   /// still be lexed as a comment.
   bool AllowAtAtStartOfIdentifier = false;
 
-  /// This is true if the assembler allows the "#" character at the start of
-  /// a string to be lexed as an AsmToken::Identifier.
-  /// If the AsmLexer determines that the string can be lexed as a possible
-  /// comment, setting this option will have no effect, and the string will
-  /// still be lexed as a comment.
-  bool AllowHashAtStartOfIdentifier = false;
-
   /// If this is true, symbol names with invalid characters will be printed in
   /// quotes.
   bool SupportsQuotedNames = true;
@@ -225,10 +200,6 @@ protected:
   /// instead.
   bool UseDataRegionDirectives = false;
 
-  /// True if .align is to be used for alignment. Only power-of-two
-  /// alignment is supported.
-  bool UseDotAlignForAlignment = false;
-
   /// True if the target supports LEB128 directives.
   bool HasLEB128Directives = true;
 
@@ -243,11 +214,6 @@ protected:
   /// "\t.zero\t"
   const char *ZeroDirective;
 
-  /// This should be set to true if the zero directive supports a value to emit
-  /// other than zero. If this is set to false, the Data*bitsDirective's will be
-  /// used to emit these bytes. Defaults to true.
-  bool ZeroDirectiveSupportsNonZeroValue = true;
-
   /// This directive allows emission of an ascii string with the standard C
   /// escape characters embedded into it.  If a target doesn't support this, it
   /// can be set to null. Defaults to "\t.ascii\t"
@@ -258,16 +224,6 @@ protected:
   /// doesn't support this, it can be set to null.  Defaults to "\t.asciz\t"
   const char *AscizDirective;
 
-  /// This directive accepts a comma-separated list of bytes for emission as a
-  /// string of bytes.  For targets that do not support this, it shall be set to
-  /// null.  Defaults to null.
-  const char *ByteListDirective = nullptr;
-
-  /// This directive allows emission of a zero-terminated ascii string without
-  /// the standard C escape characters embedded into it.  If a target doesn't
-  /// support this, it can be set to null. Defaults to null.
-  const char *PlainStringDirective = nullptr;
-
   /// Form used for character literals in the assembly syntax.  Useful for
   /// producing strings as byte lists.  If a target does not use or support
   /// this, it shall be set to ACLS_Unknown.  Defaults to ACLS_Unknown.
@@ -348,16 +304,6 @@ protected:
   /// argument and how it is interpreted.  Defaults to NoAlignment.
   LCOMM::LCOMMType LCOMMDirectiveAlignmentType = LCOMM::NoAlignment;
 
-  /// True if the target only has basename for .file directive. False if the
-  /// target also needs the directory along with the basename. Defaults to true.
-  bool HasBasenameOnlyForFileDirective = true;
-
-  /// True if the target represents string constants as mostly raw characters in
-  /// paired double quotation with paired double quotation marks as the escape
-  /// mechanism to represent a double quotation mark within the string. Defaults
-  /// to false.
-  bool HasPairedDoubleQuoteStringConstants = false;
-
   // True if the target allows .align directives on functions. This is true for
   // most targets, so defaults to true.
   bool HasFunctionAlignment = true;
@@ -370,10 +316,6 @@ protected:
   /// for ELF targets.  Defaults to true.
   bool HasSingleParameterDotFile = true;
 
-  /// True if the target has a four strings .file directive, strings separated
-  /// by comma. Defaults to false.
-  bool HasFourStringsDotFile = false;
-
   /// True if the target has a .ident directive, this is true for ELF targets.
   /// Defaults to false.
   bool HasIdentDirective = false;
@@ -440,18 +382,10 @@ protected:
   /// absolute difference.
   bool DwarfFDESymbolsUseAbsDiff = false;
 
-  /// True if the target supports generating the DWARF line table through using
-  /// the .loc/.file directives. Defaults to true.
-  bool UsesDwarfFileAndLocDirectives = true;
-
   /// True if DWARF `.file directory' directive syntax is used by
   /// default.
   bool EnableDwarfFileDirectoryDefault = true;
 
-  /// True if the target needs the DWARF section length in the header (if any)
-  /// of the DWARF section in the assembly file. Defaults to true.
-  bool DwarfSectionSizeRequired = true;
-
   /// True if dwarf register numbers are printed instead of symbolic register
   /// names in .cfi_* directives.  Defaults to false.
   bool DwarfRegNumForCFI = false;
@@ -507,9 +441,6 @@ protected:
   // If true, use Motorola-style integers in Assembly (ex. $0ac).
   bool UseMotorolaIntegers = false;
 
-  // If true, emit function descriptor symbol on AIX.
-  bool NeedsFunctionDescriptors = false;
-
 public:
   explicit MCAsmInfo();
   virtual ~MCAsmInfo();
@@ -590,12 +521,11 @@ public:
 
   // Accessors.
 
+  bool isAIX() const { return IsAIX; }
+  bool isHLASM() const { return IsHLASM; }
   bool isMachO() const { return HasSubsectionsViaSymbols; }
   bool hasCOFFAssociativeComdats() const { return HasCOFFAssociativeComdats; }
   bool hasCOFFComdatConstants() const { return HasCOFFComdatConstants; }
-  bool hasVisibilityOnlyWithLinkage() const {
-    return HasVisibilityOnlyWithLinkage;
-  }
 
   /// Returns the maximum possible encoded instruction size in bytes. If \p STI
   /// is null, this should be the maximum size for any subtarget.
@@ -605,23 +535,14 @@ public:
 
   unsigned getMinInstAlignment() const { return MinInstAlignment; }
   bool getDollarIsPC() const { return DollarIsPC; }
-  bool getDotIsPC() const { return DotIsPC; }
-  bool getStarIsPC() const { return StarIsPC; }
   const char *getSeparatorString() const { return SeparatorString; }
 
   unsigned getCommentColumn() const { return CommentColumn; }
   void setCommentColumn(unsigned Col) { CommentColumn = Col; }
 
   StringRef getCommentString() const { return CommentString; }
-  bool getRestrictCommentStringToStartOfStatement() const {
-    return RestrictCommentStringToStartOfStatement;
-  }
   bool shouldAllowAdditionalComments() const { return AllowAdditionalComments; }
-  bool getEmitGNUAsmStartIndentationMarker() const {
-    return EmitGNUAsmStartIndentationMarker;
-  }
   const char *getLabelSuffix() const { return LabelSuffix; }
-  bool shouldEmitLabelsInUpperCase() const { return EmitLabelsInUpperCase; }
 
   bool useAssignmentForEHBegin() const { return UseAssignmentForEHBegin; }
   bool needsLocalForSize() const { return NeedsLocalForSize; }
@@ -655,32 +576,20 @@ public:
   bool doesAllowDollarAtStartOfIdentifier() const {
     return AllowDollarAtStartOfIdentifier;
   }
-  bool doesAllowHashAtStartOfIdentifier() const {
-    return AllowHashAtStartOfIdentifier;
-  }
   bool supportsNameQuoting() const { return SupportsQuotedNames; }
 
   bool doesSupportDataRegionDirectives() const {
     return UseDataRegionDirectives;
   }
 
-  bool useDotAlignForAlignment() const {
-    return UseDotAlignForAlignment;
-  }
-
   bool hasLEB128Directives() const { return HasLEB128Directives; }
 
   bool useFullRegisterNames() const { return PPCUseFullRegisterNames; }
   void setFullRegisterNames(bool V) { PPCUseFullRegisterNames = V; }
 
   const char *getZeroDirective() const { return ZeroDirective; }
-  bool doesZeroDirectiveSupportNonZeroValue() const {
-    return ZeroDirectiveSupportsNonZeroValue;
-  }
   const char *getAsciiDirective() const { return AsciiDirective; }
   const char *getAscizDirective() const { return AscizDirective; }
-  const char *getByteListDirective() const { return ByteListDirective; }
-  const char *getPlainStringDirective() const { return PlainStringDirective; }
   AsmCharLiteralSyntax characterLiteralSyntax() const {
     return CharacterLiteralSyntax;
   }
@@ -700,16 +609,9 @@ public:
     return LCOMMDirectiveAlignmentType;
   }
 
-  bool hasBasenameOnlyForFileDirective() const {
-    return HasBasenameOnlyForFileDirective;
-  }
-  bool hasPairedDoubleQuoteStringConstants() const {
-    return HasPairedDoubleQuoteStringConstants;
-  }
   bool hasFunctionAlignment() const { return HasFunctionAlignment; }
   bool hasDotTypeDotSizeDirective() const { return HasDotTypeDotSizeDirective; }
   bool hasSingleParameterDotFile() const { return HasSingleParameterDotFile; }
-  bool hasFourStringsDotFile() const { return HasFourStringsDotFile; }
   bool hasIdentDirective() const { return HasIdentDirective; }
   bool hasNoDeadStrip() const { return HasNoDeadStrip; }
   const char *getWeakDirective() const { return WeakDirective; }
@@ -776,13 +678,7 @@ public:
     return SupportsExtendedDwarfLocDirective;
   }
 
-  bool usesDwarfFileAndLocDirectives() const {
-    return UsesDwarfFileAndLocDirectives;
-  }
-
-  bool needsDwarfSectionSizeInHeader() const {
-    return DwarfSectionSizeRequired;
-  }
+  bool usesDwarfFileAndLocDirectives() const { return !IsAIX; }
 
   bool enableDwarfFileDirectoryDefault() const {
     return EnableDwarfFileDirectoryDefault;
@@ -832,7 +728,6 @@ public:
   bool shouldUseLogicalShr() const { return UseLogicalShr; }
 
   bool hasMipsExpressions() const { return HasMipsExpressions; }
-  bool needsFunctionDescriptors() const { return NeedsFunctionDescriptors; }
   bool shouldUseMotorolaIntegers() const { return UseMotorolaIntegers; }
 };
 
diff --git a/llvm/include/llvm/MC/MCStreamer.h b/llvm/include/llvm/MC/MCStreamer.h
index 9115dcd..21da4da 100644
--- a/llvm/include/llvm/MC/MCStreamer.h
+++ b/llvm/include/llvm/MC/MCStreamer.h
@@ -1137,9 +1137,6 @@ public:
                                         const MCSymbol *LastLabel,
                                         const MCSymbol *Label,
                                         unsigned PointerSize) {}
-
-  /// Do finalization for the streamer at the end of a section.
-  virtual void doFinalizationAtSectionEnd(MCSection *Section) {}
 };
 
 /// Create a dummy machine code streamer, which does nothing. This is useful for
diff --git a/llvm/include/llvm/Option/OptTable.h b/llvm/include/llvm/Option/OptTable.h
index decb6cb..38a03fe 100644
--- a/llvm/include/llvm/Option/OptTable.h
+++ b/llvm/include/llvm/Option/OptTable.h
@@ -451,7 +451,7 @@ protected:
   LLVM_MAKE_OPT_ID_WITH_ID_PREFIX(OPT_, PREFIXES_OFFSET, PREFIXED_NAME_OFFSET, \
                                   ID, KIND, GROUP, ALIAS, ALIASARGS, FLAGS,    \
                                   VISIBILITY, PARAM, HELPTEXT,                 \
-                                  HELPTEXTSFORVARIANTS, METAVAR, VALUE)
+                                  HELPTEXTSFORVARIANTS, METAVAR, VALUES)
 
 #define LLVM_CONSTRUCT_OPT_INFO_WITH_ID_PREFIX(                                \
     ID_PREFIX, PREFIXES_OFFSET, PREFIXED_NAME_OFFSET, ID, KIND, GROUP, ALIAS,  \
diff --git a/llvm/include/llvm/Passes/CodeGenPassBuilder.h b/llvm/include/llvm/Passes/CodeGenPassBuilder.h
index d2e9e8185..a84164b 100644
--- a/llvm/include/llvm/Passes/CodeGenPassBuilder.h
+++ b/llvm/include/llvm/Passes/CodeGenPassBuilder.h
@@ -668,9 +668,6 @@ void CodeGenPassBuilder<Derived, TargetMachineT>::addIRPasses(
   if (getOptLevel() != CodeGenOptLevel::None && !Opt.DisableLSR) {
     addPass(createFunctionToLoopPassAdaptor(LoopStrengthReducePass(),
                                             /*UseMemorySSA=*/true));
-    // FIXME: use -stop-after so we could remove PrintLSR
-    if (Opt.PrintLSR)
-      addPass(PrintFunctionPass(dbgs(), "\n\n*** Code after LSR ***\n"));
   }
 
   if (getOptLevel() != CodeGenOptLevel::None) {
diff --git a/llvm/include/llvm/Passes/MachinePassRegistry.def b/llvm/include/llvm/Passes/MachinePassRegistry.def
index 5a4e79d..2976399 100644
--- a/llvm/include/llvm/Passes/MachinePassRegistry.def
+++ b/llvm/include/llvm/Passes/MachinePassRegistry.def
@@ -179,7 +179,7 @@ MACHINE_FUNCTION_PASS("verify<machine-trace-metrics>", MachineTraceMetricsVerifi
                                           PARAMS)
 #endif
 MACHINE_FUNCTION_PASS_WITH_PARAMS(
-    "regallocfast", "RegAllocFast",
+    "regallocfast", "RegAllocFastPass",
     [](RegAllocFastPassOptions Opts) { return RegAllocFastPass(Opts); },
     [PB = this](StringRef Params) {
       return parseRegAllocFastPassOptions(*PB, Params);
diff --git a/llvm/include/llvm/ProfileData/Coverage/CoverageMapping.h b/llvm/include/llvm/ProfileData/Coverage/CoverageMapping.h
index dad5f12..81307d7 100644
--- a/llvm/include/llvm/ProfileData/Coverage/CoverageMapping.h
+++ b/llvm/include/llvm/ProfileData/Coverage/CoverageMapping.h
@@ -36,6 +36,7 @@
 #include <iterator>
 #include <map>
 #include <memory>
+#include <optional>
 #include <sstream>
 #include <string>
 #include <system_error>
@@ -215,9 +216,12 @@ public:
   /// LHS.
   Counter subtract(Counter LHS, Counter RHS, bool Simplify = true);
 
+  /// K to V map. K will be Counter in most cases. V may be Counter or
+  /// Expression.
   using SubstMap = std::map<Counter, Counter>;
 
-  /// Return a counter for each term in the expression replaced by SubstMap.
+  /// \return A counter equivalent to \C, with each term in its
+  /// expression replaced with term from \p Map.
   Counter subst(Counter C, const SubstMap &Map);
 };
 
@@ -370,19 +374,16 @@ struct CountedRegion : public CounterMappingRegion {
   uint64_t FalseExecutionCount;
   bool TrueFolded;
   bool FalseFolded;
-  bool HasSingleByteCoverage;
 
-  CountedRegion(const CounterMappingRegion &R, uint64_t ExecutionCount,
-                bool HasSingleByteCoverage)
+  CountedRegion(const CounterMappingRegion &R, uint64_t ExecutionCount)
       : CounterMappingRegion(R), ExecutionCount(ExecutionCount),
-        FalseExecutionCount(0), TrueFolded(false), FalseFolded(true),
-        HasSingleByteCoverage(HasSingleByteCoverage) {}
+        FalseExecutionCount(0), TrueFolded(false), FalseFolded(true) {}
 
   CountedRegion(const CounterMappingRegion &R, uint64_t ExecutionCount,
-                uint64_t FalseExecutionCount, bool HasSingleByteCoverage)
+                uint64_t FalseExecutionCount)
       : CounterMappingRegion(R), ExecutionCount(ExecutionCount),
         FalseExecutionCount(FalseExecutionCount), TrueFolded(false),
-        FalseFolded(false), HasSingleByteCoverage(HasSingleByteCoverage) {}
+        FalseFolded(false) {}
 };
 
 /// MCDC Record grouping all information together.
@@ -448,7 +449,7 @@ struct MCDCRecord {
   };
 
   using TestVectors = llvm::SmallVector<std::pair<TestVector, CondState>>;
-  using BoolVector = llvm::SmallVector<bool>;
+  using BoolVector = std::array<BitVector, 2>;
   using TVRowPair = std::pair<unsigned, unsigned>;
   using TVPairMap = llvm::DenseMap<unsigned, TVRowPair>;
   using CondIDMap = llvm::DenseMap<unsigned, unsigned>;
@@ -457,26 +458,31 @@ struct MCDCRecord {
 private:
   CounterMappingRegion Region;
   TestVectors TV;
-  TVPairMap IndependencePairs;
+  std::optional<TVPairMap> IndependencePairs;
   BoolVector Folded;
   CondIDMap PosToID;
   LineColPairMap CondLoc;
 
 public:
   MCDCRecord(const CounterMappingRegion &Region, TestVectors &&TV,
-             TVPairMap &&IndependencePairs, BoolVector &&Folded,
-             CondIDMap &&PosToID, LineColPairMap &&CondLoc)
-      : Region(Region), TV(std::move(TV)),
-        IndependencePairs(std::move(IndependencePairs)),
-        Folded(std::move(Folded)), PosToID(std::move(PosToID)),
-        CondLoc(std::move(CondLoc)){};
+             BoolVector &&Folded, CondIDMap &&PosToID, LineColPairMap &&CondLoc)
+      : Region(Region), TV(std::move(TV)), Folded(std::move(Folded)),
+        PosToID(std::move(PosToID)), CondLoc(std::move(CondLoc)) {
+    findIndependencePairs();
+  }
+
+  // Compare executed test vectors against each other to find an independence
+  // pairs for each condition.  This processing takes the most time.
+  void findIndependencePairs();
 
   const CounterMappingRegion &getDecisionRegion() const { return Region; }
   unsigned getNumConditions() const {
     return Region.getDecisionParams().NumConditions;
   }
   unsigned getNumTestVectors() const { return TV.size(); }
-  bool isCondFolded(unsigned Condition) const { return Folded[Condition]; }
+  bool isCondFolded(unsigned Condition) const {
+    return Folded[false][Condition] || Folded[true][Condition];
+  }
 
   /// Return the evaluation of a condition (indicated by Condition) in an
   /// executed test vector (indicated by TestVectorIndex), which will be True,
@@ -500,10 +506,10 @@ public:
   /// TestVectors requires a translation from a ordinal position to actual
   /// condition ID. This is done via PosToID[].
   bool isConditionIndependencePairCovered(unsigned Condition) const {
+    assert(IndependencePairs);
     auto It = PosToID.find(Condition);
-    if (It != PosToID.end())
-      return IndependencePairs.contains(It->second);
-    llvm_unreachable("Condition ID without an Ordinal mapping");
+    assert(It != PosToID.end() && "Condition ID without an Ordinal mapping");
+    return IndependencePairs->contains(It->second);
   }
 
   /// Return the Independence Pair that covers the given condition. Because
@@ -513,7 +519,8 @@ public:
   /// via PosToID[].
   TVRowPair getConditionIndependencePair(unsigned Condition) {
     assert(isConditionIndependencePairCovered(Condition));
-    return IndependencePairs[PosToID[Condition]];
+    assert(IndependencePairs);
+    return (*IndependencePairs)[PosToID[Condition]];
   }
 
   float getPercentCovered() const {
@@ -725,10 +732,9 @@ struct FunctionRecord {
   }
 
   void pushRegion(CounterMappingRegion Region, uint64_t Count,
-                  uint64_t FalseCount, bool HasSingleByteCoverage) {
+                  uint64_t FalseCount) {
     if (Region.isBranch()) {
-      CountedBranchRegions.emplace_back(Region, Count, FalseCount,
-                                        HasSingleByteCoverage);
+      CountedBranchRegions.emplace_back(Region, Count, FalseCount);
       // If either counter is hard-coded to zero, then this region represents a
       // constant-folded branch.
       CountedBranchRegions.back().TrueFolded = Region.Count.isZero();
@@ -737,8 +743,7 @@ struct FunctionRecord {
     }
     if (CountedRegions.empty())
       ExecutionCount = Count;
-    CountedRegions.emplace_back(Region, Count, FalseCount,
-                                HasSingleByteCoverage);
+    CountedRegions.emplace_back(Region, Count, FalseCount);
   }
 };
 
@@ -901,14 +906,19 @@ class CoverageData {
   std::vector<CountedRegion> BranchRegions;
   std::vector<MCDCRecord> MCDCRecords;
 
+  bool SingleByteCoverage = false;
+
 public:
   CoverageData() = default;
 
-  CoverageData(StringRef Filename) : Filename(Filename) {}
+  CoverageData(bool Single, StringRef Filename)
+      : Filename(Filename), SingleByteCoverage(Single) {}
 
   /// Get the name of the file this data covers.
   StringRef getFilename() const { return Filename; }
 
+  bool getSingleByteCoverage() const { return SingleByteCoverage; }
+
   /// Get an iterator over the coverage segments for this object. The segments
   /// are guaranteed to be uniqued and sorted by location.
   std::vector<CoverageSegment>::const_iterator begin() const {
@@ -941,6 +951,8 @@ class CoverageMapping {
   DenseMap<size_t, SmallVector<unsigned, 0>> FilenameHash2RecordIndices;
   std::vector<std::pair<std::string, uint64_t>> FuncHashMismatches;
 
+  std::optional<bool> SingleByteCoverage;
+
   CoverageMapping() = default;
 
   // Load coverage records from readers.
diff --git a/llvm/include/llvm/Support/Recycler.h b/llvm/include/llvm/Support/Recycler.h
index bbd9ae3..693c655 100644
--- a/llvm/include/llvm/Support/Recycler.h
+++ b/llvm/include/llvm/Support/Recycler.h
@@ -60,6 +60,10 @@ public:
     // clear() before deleting the Recycler.
     assert(!FreeList && "Non-empty recycler deleted!");
   }
+  Recycler(const Recycler &) = delete;
+  Recycler(Recycler &&Other)
+      : FreeList(std::exchange(Other.FreeList, nullptr)) {}
+  Recycler() = default;
 
   /// clear - Release all the tracked allocations to the allocator. The
   /// recycler must be free of any tracked allocations before being
diff --git a/llvm/include/llvm/TableGen/Record.h b/llvm/include/llvm/TableGen/Record.h
index 81a9257..b15e9fc 100644
--- a/llvm/include/llvm/TableGen/Record.h
+++ b/llvm/include/llvm/TableGen/Record.h
@@ -448,7 +448,7 @@ public:
 };
 
 /// '?' - Represents an uninitialized value.
-class UnsetInit : public Init {
+class UnsetInit final : public Init {
   friend detail::RecordKeeperImpl;
 
   /// The record keeper that initialized this Init.
@@ -486,7 +486,7 @@ public:
 
 // Represent an argument.
 using ArgAuxType = std::variant<unsigned, const Init *>;
-class ArgumentInit : public Init, public FoldingSetNode {
+class ArgumentInit final : public Init, public FoldingSetNode {
 public:
   enum Kind {
     Positional,
@@ -638,7 +638,7 @@ public:
 };
 
 /// '7' - Represent an initialization by a literal integer value.
-class IntInit : public TypedInit {
+class IntInit final : public TypedInit {
   int64_t Value;
 
   explicit IntInit(RecordKeeper &RK, int64_t V)
@@ -669,7 +669,7 @@ public:
 };
 
 /// "anonymous_n" - Represent an anonymous record name
-class AnonymousNameInit : public TypedInit {
+class AnonymousNameInit final : public TypedInit {
   unsigned Value;
 
   explicit AnonymousNameInit(RecordKeeper &RK, unsigned V)
@@ -699,7 +699,7 @@ public:
 };
 
 /// "foo" - Represent an initialization by a string value.
-class StringInit : public TypedInit {
+class StringInit final : public TypedInit {
 public:
   enum StringFormat {
     SF_String, // Format as "text"
@@ -834,18 +834,12 @@ public:
            I->getKind() <= IK_LastOpInit;
   }
 
-  // Clone - Clone this operator, replacing arguments with the new list
-  virtual const OpInit *clone(ArrayRef<const Init *> Operands) const = 0;
-
-  virtual unsigned getNumOperands() const = 0;
-  virtual const Init *getOperand(unsigned i) const = 0;
-
-  const Init *getBit(unsigned Bit) const override;
+  const Init *getBit(unsigned Bit) const final;
 };
 
 /// !op (X) - Transform an init.
 ///
-class UnOpInit : public OpInit, public FoldingSetNode {
+class UnOpInit final : public OpInit, public FoldingSetNode {
 public:
   enum UnaryOp : uint8_t {
     TOLOWER,
@@ -881,20 +875,6 @@ public:
 
   void Profile(FoldingSetNodeID &ID) const;
 
-  // Clone - Clone this operator, replacing arguments with the new list
-  const OpInit *clone(ArrayRef<const Init *> Operands) const override {
-    assert(Operands.size() == 1 &&
-           "Wrong number of operands for unary operation");
-    return UnOpInit::get(getOpcode(), *Operands.begin(), getType());
-  }
-
-  unsigned getNumOperands() const override { return 1; }
-
-  const Init *getOperand(unsigned i) const override {
-    assert(i == 0 && "Invalid operand id for unary operator");
-    return getOperand();
-  }
-
   UnaryOp getOpcode() const { return (UnaryOp)Opc; }
   const Init *getOperand() const { return LHS; }
 
@@ -908,7 +888,7 @@ public:
 };
 
 /// !op (X, Y) - Combine two inits.
-class BinOpInit : public OpInit, public FoldingSetNode {
+class BinOpInit final : public OpInit, public FoldingSetNode {
 public:
   enum BinaryOp : uint8_t {
     ADD,
@@ -962,22 +942,6 @@ public:
 
   void Profile(FoldingSetNodeID &ID) const;
 
-  // Clone - Clone this operator, replacing arguments with the new list
-  const OpInit *clone(ArrayRef<const Init *> Operands) const override {
-    assert(Operands.size() == 2 &&
-           "Wrong number of operands for binary operation");
-    return BinOpInit::get(getOpcode(), Operands[0], Operands[1], getType());
-  }
-
-  unsigned getNumOperands() const override { return 2; }
-  const Init *getOperand(unsigned i) const override {
-    switch (i) {
-    default: llvm_unreachable("Invalid operand id for binary operator");
-    case 0: return getLHS();
-    case 1: return getRHS();
-    }
-  }
-
   BinaryOp getOpcode() const { return (BinaryOp)Opc; }
   const Init *getLHS() const { return LHS; }
   const Init *getRHS() const { return RHS; }
@@ -995,7 +959,7 @@ public:
 };
 
 /// !op (X, Y, Z) - Combine two inits.
-class TernOpInit : public OpInit, public FoldingSetNode {
+class TernOpInit final : public OpInit, public FoldingSetNode {
 public:
   enum TernaryOp : uint8_t {
     SUBST,
@@ -1030,24 +994,6 @@ public:
 
   void Profile(FoldingSetNodeID &ID) const;
 
-  // Clone - Clone this operator, replacing arguments with the new list
-  const OpInit *clone(ArrayRef<const Init *> Operands) const override {
-    assert(Operands.size() == 3 &&
-           "Wrong number of operands for ternary operation");
-    return TernOpInit::get(getOpcode(), Operands[0], Operands[1], Operands[2],
-                           getType());
-  }
-
-  unsigned getNumOperands() const override { return 3; }
-  const Init *getOperand(unsigned i) const override {
-    switch (i) {
-    default: llvm_unreachable("Invalid operand id for ternary operator");
-    case 0: return getLHS();
-    case 1: return getMHS();
-    case 2: return getRHS();
-    }
-  }
-
   TernaryOp getOpcode() const { return (TernaryOp)Opc; }
   const Init *getLHS() const { return LHS; }
   const Init *getMHS() const { return MHS; }
@@ -1144,7 +1090,7 @@ public:
 };
 
 /// !foldl (a, b, expr, start, lst) - Fold over a list.
-class FoldOpInit : public TypedInit, public FoldingSetNode {
+class FoldOpInit final : public TypedInit, public FoldingSetNode {
 private:
   const Init *Start, *List, *A, *B, *Expr;
 
@@ -1179,7 +1125,7 @@ public:
 };
 
 /// !isa<type>(expr) - Dynamically determine the type of an expression.
-class IsAOpInit : public TypedInit, public FoldingSetNode {
+class IsAOpInit final : public TypedInit, public FoldingSetNode {
 private:
   const RecTy *CheckType;
   const Init *Expr;
@@ -1213,7 +1159,7 @@ public:
 
 /// !exists<type>(expr) - Dynamically determine if a record of `type` named
 /// `expr` exists.
-class ExistsOpInit : public TypedInit, public FoldingSetNode {
+class ExistsOpInit final : public TypedInit, public FoldingSetNode {
 private:
   const RecTy *CheckType;
   const Init *Expr;
@@ -1246,7 +1192,7 @@ public:
 };
 
 /// 'Opcode' - Represent a reference to an entire variable object.
-class VarInit : public TypedInit {
+class VarInit final : public TypedInit {
   const Init *VarName;
 
   explicit VarInit(const Init *VN, const RecTy *T)
@@ -1320,7 +1266,7 @@ public:
 };
 
 /// AL - Represent a reference to a 'def' in the description
-class DefInit : public TypedInit {
+class DefInit final : public TypedInit {
   friend class Record;
 
   const Record *Def;
@@ -1409,7 +1355,7 @@ public:
 };
 
 /// X.Y - Represent a reference to a subfield of a variable
-class FieldInit : public TypedInit {
+class FieldInit final : public TypedInit {
   const Init *Rec;             // Record we are referring to
   const StringInit *FieldName; // Field we are accessing
 
diff --git a/llvm/include/llvm/Target/CGPassBuilderOption.h b/llvm/include/llvm/Target/CGPassBuilderOption.h
index 29bdb9c..d3d19c8 100644
--- a/llvm/include/llvm/Target/CGPassBuilderOption.h
+++ b/llvm/include/llvm/Target/CGPassBuilderOption.h
@@ -39,7 +39,6 @@ struct CGPassBuilderOption {
 
   bool DisableLSR = false;
   bool DisableCGP = false;
-  bool PrintLSR = false;
   bool DisableMergeICmps = false;
   bool DisablePartialLibcallInlining = false;
   bool DisableConstantHoisting = false;
diff --git a/llvm/include/llvm/Target/GlobalISel/Combine.td b/llvm/include/llvm/Target/GlobalISel/Combine.td
index 013c3a6..8641eab 100644
--- a/llvm/include/llvm/Target/GlobalISel/Combine.td
+++ b/llvm/include/llvm/Target/GlobalISel/Combine.td
@@ -1372,6 +1372,12 @@ def select_to_iminmax: GICombineRule<
          [{ return Helper.matchSelectIMinMax(${root}, ${info}); }]),
   (apply [{ Helper.applyBuildFnMO(${root}, ${info}); }])>;
 
+def simplify_neg_minmax : GICombineRule<
+  (defs root:$root, build_fn_matchinfo:$matchinfo),
+  (match (wip_match_opcode G_SUB):$root,
+         [{ return Helper.matchSimplifyNegMinMax(*${root}, ${matchinfo}); }]),
+  (apply [{ Helper.applyBuildFn(*${root}, ${matchinfo}); }])>;
+
 def match_selects : GICombineRule<
   (defs root:$root, build_fn_matchinfo:$matchinfo),
   (match (wip_match_opcode G_SELECT):$root,
@@ -2008,7 +2014,7 @@ def all_combines : GICombineGroup<[integer_reassoc_combines, trivial_combines,
     and_or_disjoint_mask, fma_combines, fold_binop_into_select,
     sub_add_reg, select_to_minmax,
     fsub_to_fneg, commute_constant_to_rhs, match_ands, match_ors,
-    combine_concat_vector,
+    simplify_neg_minmax, combine_concat_vector,
     sext_trunc, zext_trunc, prefer_sign_combines, shuffle_combines,
     combine_use_vector_truncate, merge_combines, overflow_combines]>;
 
diff --git a/llvm/include/llvm/TargetParser/AArch64FeatPriorities.inc b/llvm/include/llvm/TargetParser/AArch64FeatPriorities.inc
new file mode 100644
index 0000000..96af618
--- /dev/null
+++ b/llvm/include/llvm/TargetParser/AArch64FeatPriorities.inc
@@ -0,0 +1,66 @@
+//===- AArch64FeatPriorities.inc - AArch64 FMV Priorities enum --*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file enumerates the AArch64 FMV features sorted in ascending priority.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef AARCH64_FEAT_PRIORITIES_INC_H
+#define AARCH64_FEAT_PRIORITIES_INC_H
+
+// Function Multi Versioning feature priorities.
+enum FeatPriorities {
+  PRIOR_RNG,
+  PRIOR_FLAGM,
+  PRIOR_FLAGM2,
+  PRIOR_LSE,
+  PRIOR_FP,
+  PRIOR_SIMD,
+  PRIOR_DOTPROD,
+  PRIOR_SM4,
+  PRIOR_RDM,
+  PRIOR_CRC,
+  PRIOR_SHA2,
+  PRIOR_SHA3,
+  PRIOR_PMULL,
+  PRIOR_FP16,
+  PRIOR_FP16FML,
+  PRIOR_DIT,
+  PRIOR_DPB,
+  PRIOR_DPB2,
+  PRIOR_JSCVT,
+  PRIOR_FCMA,
+  PRIOR_RCPC,
+  PRIOR_RCPC2,
+  PRIOR_RCPC3,
+  PRIOR_FRINTTS,
+  PRIOR_I8MM,
+  PRIOR_BF16,
+  PRIOR_SVE,
+  PRIOR_SVE_F32MM,
+  PRIOR_SVE_F64MM,
+  PRIOR_SVE2,
+  PRIOR_SVE_PMULL128,
+  PRIOR_SVE_BITPERM,
+  PRIOR_SVE_SHA3,
+  PRIOR_SVE_SM4,
+  PRIOR_SME,
+  PRIOR_MEMTAG2,
+  PRIOR_SB,
+  PRIOR_PREDRES,
+  PRIOR_SSBS2,
+  PRIOR_BTI,
+  PRIOR_LS64_ACCDATA,
+  PRIOR_WFXT,
+  PRIOR_SME_F64,
+  PRIOR_SME_I64,
+  PRIOR_SME2,
+  PRIOR_MOPS
+};
+
+#endif
diff --git a/llvm/include/llvm/TargetParser/AArch64TargetParser.h b/llvm/include/llvm/TargetParser/AArch64TargetParser.h
index ac8006d6..63f06a3 100644
--- a/llvm/include/llvm/TargetParser/AArch64TargetParser.h
+++ b/llvm/include/llvm/TargetParser/AArch64TargetParser.h
@@ -36,6 +36,7 @@ struct ArchInfo;
 struct CpuInfo;
 
 #include "llvm/TargetParser/AArch64CPUFeatures.inc"
+#include "llvm/TargetParser/AArch64FeatPriorities.inc"
 
 static_assert(FEAT_MAX < 62,
               "Number of features in CPUFeatures are limited to 62 entries");
@@ -70,12 +71,12 @@ struct ExtensionInfo {
 
 struct FMVInfo {
   StringRef Name;                // The target_version/target_clones spelling.
-  CPUFeatures Bit;               // Index of the bit in the FMV feature bitset.
+  CPUFeatures FeatureBit;        // Index of the bit in the FMV feature bitset.
+  FeatPriorities PriorityBit;    // Index of the bit in the FMV priority bitset.
   std::optional<ArchExtKind> ID; // The architecture extension to enable.
-  unsigned Priority;             // FMV priority.
-  FMVInfo(StringRef Name, CPUFeatures Bit, std::optional<ArchExtKind> ID,
-          unsigned Priority)
-      : Name(Name), Bit(Bit), ID(ID), Priority(Priority) {};
+  FMVInfo(StringRef Name, CPUFeatures FeatureBit, FeatPriorities PriorityBit,
+          std::optional<ArchExtKind> ID)
+      : Name(Name), FeatureBit(FeatureBit), PriorityBit(PriorityBit), ID(ID) {};
 };
 
 const std::vector<FMVInfo> &getFMVInfo();
@@ -270,7 +271,7 @@ void fillValidCPUArchList(SmallVectorImpl<StringRef> &Values);
 bool isX18ReservedByDefault(const Triple &TT);
 
 // Return the priority for a given set of FMV features.
-unsigned getFMVPriority(ArrayRef<StringRef> Features);
+uint64_t getFMVPriority(ArrayRef<StringRef> Features);
 
 // For given feature names, return a bitmask corresponding to the entries of
 // AArch64::CPUFeatures. The values in CPUFeatures are not bitmasks themselves,
diff --git a/llvm/include/llvm/TargetParser/Triple.h b/llvm/include/llvm/TargetParser/Triple.h
index bd4051d..76914ab 100644
--- a/llvm/include/llvm/TargetParser/Triple.h
+++ b/llvm/include/llvm/TargetParser/Triple.h
@@ -564,6 +564,11 @@ public:
 
   bool isOSzOS() const { return getOS() == Triple::ZOS; }
 
+  /// Is this an Apple MachO triple.
+  bool isAppleMachO() const {
+    return (getVendor() == Triple::Apple) && isOSBinFormatMachO();
+  }
+
   /// Is this a "Darwin" OS (macOS, iOS, tvOS, watchOS, XROS, or DriverKit).
   bool isOSDarwin() const {
     return isMacOSX() || isiOS() || isWatchOS() || isDriverKit() || isXROS();
@@ -846,6 +851,9 @@ public:
            getArch() == Triple::spirv;
   }
 
+  // Tests whether the target is SPIR-V or SPIR.
+  bool isSPIROrSPIRV() const { return isSPIR() || isSPIRV(); }
+
   /// Tests whether the target is SPIR-V Logical
   bool isSPIRVLogical() const {
     return getArch() == Triple::spirv;
diff --git a/llvm/include/llvm/Transforms/IPO/Attributor.h b/llvm/include/llvm/Transforms/IPO/Attributor.h
index 8915969..28bce7b 100644
--- a/llvm/include/llvm/Transforms/IPO/Attributor.h
+++ b/llvm/include/llvm/Transforms/IPO/Attributor.h
@@ -1478,7 +1478,7 @@ struct AttributorConfig {
   /// The name of the pass running the attributor, used to emit remarks.
   const char *PassName = nullptr;
 
-  using IPOAmendableCBTy = function_ref<bool(const Function &F)>;
+  using IPOAmendableCBTy = std::function<bool(const Function &F)>;
   IPOAmendableCBTy IPOAmendableCB;
 };
 
@@ -3853,7 +3853,7 @@ struct AANoAlias
 
   /// See AbstractAttribute::isValidIRPositionForInit
   static bool isValidIRPositionForInit(Attributor &A, const IRPosition &IRP) {
-    if (!IRP.getAssociatedType()->isPtrOrPtrVectorTy())
+    if (!IRP.getAssociatedType()->isPointerTy())
       return false;
     return IRAttribute::isValidIRPositionForInit(A, IRP);
   }
@@ -4220,7 +4220,7 @@ struct AADereferenceable
 
   /// See AbstractAttribute::isValidIRPositionForInit
   static bool isValidIRPositionForInit(Attributor &A, const IRPosition &IRP) {
-    if (!IRP.getAssociatedType()->isPtrOrPtrVectorTy())
+    if (!IRP.getAssociatedType()->isPointerTy())
       return false;
     return IRAttribute::isValidIRPositionForInit(A, IRP);
   }
@@ -4364,7 +4364,7 @@ struct AANoCapture
 
   /// See AbstractAttribute::isValidIRPositionForInit
   static bool isValidIRPositionForInit(Attributor &A, const IRPosition &IRP) {
-    if (!IRP.getAssociatedType()->isPtrOrPtrVectorTy())
+    if (!IRP.getAssociatedType()->isPointerTy())
       return false;
     return IRAttribute::isValidIRPositionForInit(A, IRP);
   }
@@ -4635,8 +4635,7 @@ struct AAMemoryBehavior
 
   /// See AbstractAttribute::isValidIRPositionForInit
   static bool isValidIRPositionForInit(Attributor &A, const IRPosition &IRP) {
-    if (!IRP.isFunctionScope() &&
-        !IRP.getAssociatedType()->isPtrOrPtrVectorTy())
+    if (!IRP.isFunctionScope() && !IRP.getAssociatedType()->isPointerTy())
       return false;
     return IRAttribute::isValidIRPositionForInit(A, IRP);
   }
diff --git a/llvm/include/llvm/Transforms/InstCombine/InstCombiner.h b/llvm/include/llvm/Transforms/InstCombine/InstCombiner.h
index 7159205..fa6b60c 100644
--- a/llvm/include/llvm/Transforms/InstCombine/InstCombiner.h
+++ b/llvm/include/llvm/Transforms/InstCombine/InstCombiner.h
@@ -184,12 +184,6 @@ public:
     return ConstantExpr::getSub(C, ConstantInt::get(C->getType(), 1));
   }
 
-  std::optional<std::pair<
-      CmpPredicate,
-      Constant *>> static getFlippedStrictnessPredicateAndConstant(CmpPredicate
-                                                                       Pred,
-                                                                   Constant *C);
-
   static bool shouldAvoidAbsorbingNotIntoSelect(const SelectInst &SI) {
     // a ? b : false and a ? true : b are the canonical form of logical and/or.
     // This includes !a ? b : false and !a ? true : b. Absorbing the not into
diff --git a/llvm/include/llvm/Transforms/Instrumentation/TypeSanitizer.h b/llvm/include/llvm/Transforms/Instrumentation/TypeSanitizer.h
index a6cc56d..20f08b6 100644
--- a/llvm/include/llvm/Transforms/Instrumentation/TypeSanitizer.h
+++ b/llvm/include/llvm/Transforms/Instrumentation/TypeSanitizer.h
@@ -20,19 +20,11 @@ class Function;
 class FunctionPass;
 class Module;
 
-/// A function pass for tysan instrumentation.
 struct TypeSanitizerPass : public PassInfoMixin<TypeSanitizerPass> {
-  PreservedAnalyses run(Function &F, FunctionAnalysisManager &FAM);
-  static bool isRequired() { return true; }
-};
-
-/// A module pass for tysan instrumentation.
-///
-/// Create ctor and init functions.
-struct ModuleTypeSanitizerPass : public PassInfoMixin<ModuleTypeSanitizerPass> {
   PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM);
   static bool isRequired() { return true; }
 };
 
 } // namespace llvm
+
 #endif /* LLVM_TRANSFORMS_INSTRUMENTATION_TYPESANITIZER_H */
diff --git a/llvm/include/llvm/Transforms/Utils/FunctionImportUtils.h b/llvm/include/llvm/Transforms/Utils/FunctionImportUtils.h
index 749b7b2..18cd923 100644
--- a/llvm/include/llvm/Transforms/Utils/FunctionImportUtils.h
+++ b/llvm/include/llvm/Transforms/Utils/FunctionImportUtils.h
@@ -120,12 +120,12 @@ public:
 #endif
   }
 
-  bool run();
+  void run();
 };
 
 /// Perform in-place global value handling on the given Module for
 /// exported local functions renamed and promoted for ThinLTO.
-bool renameModuleForThinLTO(
+void renameModuleForThinLTO(
     Module &M, const ModuleSummaryIndex &Index,
     bool ClearDSOLocalOnDeclarations,
     SetVector<GlobalValue *> *GlobalsToImport = nullptr);
diff --git a/llvm/include/llvm/Transforms/Utils/Local.h b/llvm/include/llvm/Transforms/Utils/Local.h
index bbf29e6..40c4485 100644
--- a/llvm/include/llvm/Transforms/Utils/Local.h
+++ b/llvm/include/llvm/Transforms/Utils/Local.h
@@ -412,6 +412,11 @@ Instruction *removeUnwindEdge(BasicBlock *BB, DomTreeUpdater *DTU = nullptr);
 bool removeUnreachableBlocks(Function &F, DomTreeUpdater *DTU = nullptr,
                              MemorySSAUpdater *MSSAU = nullptr);
 
+/// DO NOT CALL EXTERNALLY.
+/// FIXME: https://github.com/llvm/llvm-project/issues/121495
+/// Once external callers of this function are removed, either inline into
+/// combineMetadataForCSE, or internalize and remove KnownIDs parameter.
+///
 /// Combine the metadata of two instructions so that K can replace J. Some
 /// metadata kinds can only be kept if K does not move, meaning it dominated
 /// J in the original IR.
diff --git a/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/Scheduler.h b/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/Scheduler.h
index 3959f84..9b68d47 100644
--- a/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/Scheduler.h
+++ b/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/Scheduler.h
@@ -79,6 +79,10 @@ public:
     for (auto *N : this->Nodes)
       N->setSchedBundle(*this);
   }
+  /// Copy CTOR (unimplemented).
+  SchedBundle(const SchedBundle &Other) = delete;
+  /// Copy Assignment (unimplemented).
+  SchedBundle &operator=(const SchedBundle &Other) = delete;
   ~SchedBundle() {
     for (auto *N : this->Nodes)
       N->clearSchedBundle();
diff --git a/llvm/include/module.modulemap b/llvm/include/module.modulemap
index 6beb0e0..46277e1 100644
--- a/llvm/include/module.modulemap
+++ b/llvm/include/module.modulemap
@@ -96,6 +96,7 @@ module LLVM_BinaryFormat {
     textual header "llvm/BinaryFormat/ELFRelocs/PowerPC64.def"
     textual header "llvm/BinaryFormat/ELFRelocs/PowerPC.def"
     textual header "llvm/BinaryFormat/ELFRelocs/RISCV.def"
+    textual header "llvm/BinaryFormat/ELFRelocs/RISCV_nonstandard.def"
     textual header "llvm/BinaryFormat/ELFRelocs/Sparc.def"
     textual header "llvm/BinaryFormat/ELFRelocs/SystemZ.def"
     textual header "llvm/BinaryFormat/ELFRelocs/VE.def"
diff --git a/llvm/lib/Analysis/ConstantFolding.cpp b/llvm/lib/Analysis/ConstantFolding.cpp
index 88533f2..031d675 100644
--- a/llvm/lib/Analysis/ConstantFolding.cpp
+++ b/llvm/lib/Analysis/ConstantFolding.cpp
@@ -45,8 +45,10 @@
 #include "llvm/IR/IntrinsicsAArch64.h"
 #include "llvm/IR/IntrinsicsAMDGPU.h"
 #include "llvm/IR/IntrinsicsARM.h"
+#include "llvm/IR/IntrinsicsNVPTX.h"
 #include "llvm/IR/IntrinsicsWebAssembly.h"
 #include "llvm/IR/IntrinsicsX86.h"
+#include "llvm/IR/NVVMIntrinsicUtils.h"
 #include "llvm/IR/Operator.h"
 #include "llvm/IR/Type.h"
 #include "llvm/IR/Value.h"
@@ -1687,6 +1689,58 @@ bool llvm::canConstantFoldCallTo(const CallBase *Call, const Function *F) {
   case Intrinsic::x86_avx512_cvttsd2usi64:
     return !Call->isStrictFP();
 
+  // NVVM float/double to int32/uint32 conversion intrinsics
+  case Intrinsic::nvvm_f2i_rm:
+  case Intrinsic::nvvm_f2i_rn:
+  case Intrinsic::nvvm_f2i_rp:
+  case Intrinsic::nvvm_f2i_rz:
+  case Intrinsic::nvvm_f2i_rm_ftz:
+  case Intrinsic::nvvm_f2i_rn_ftz:
+  case Intrinsic::nvvm_f2i_rp_ftz:
+  case Intrinsic::nvvm_f2i_rz_ftz:
+  case Intrinsic::nvvm_f2ui_rm:
+  case Intrinsic::nvvm_f2ui_rn:
+  case Intrinsic::nvvm_f2ui_rp:
+  case Intrinsic::nvvm_f2ui_rz:
+  case Intrinsic::nvvm_f2ui_rm_ftz:
+  case Intrinsic::nvvm_f2ui_rn_ftz:
+  case Intrinsic::nvvm_f2ui_rp_ftz:
+  case Intrinsic::nvvm_f2ui_rz_ftz:
+  case Intrinsic::nvvm_d2i_rm:
+  case Intrinsic::nvvm_d2i_rn:
+  case Intrinsic::nvvm_d2i_rp:
+  case Intrinsic::nvvm_d2i_rz:
+  case Intrinsic::nvvm_d2ui_rm:
+  case Intrinsic::nvvm_d2ui_rn:
+  case Intrinsic::nvvm_d2ui_rp:
+  case Intrinsic::nvvm_d2ui_rz:
+
+  // NVVM float/double to int64/uint64 conversion intrinsics
+  case Intrinsic::nvvm_f2ll_rm:
+  case Intrinsic::nvvm_f2ll_rn:
+  case Intrinsic::nvvm_f2ll_rp:
+  case Intrinsic::nvvm_f2ll_rz:
+  case Intrinsic::nvvm_f2ll_rm_ftz:
+  case Intrinsic::nvvm_f2ll_rn_ftz:
+  case Intrinsic::nvvm_f2ll_rp_ftz:
+  case Intrinsic::nvvm_f2ll_rz_ftz:
+  case Intrinsic::nvvm_f2ull_rm:
+  case Intrinsic::nvvm_f2ull_rn:
+  case Intrinsic::nvvm_f2ull_rp:
+  case Intrinsic::nvvm_f2ull_rz:
+  case Intrinsic::nvvm_f2ull_rm_ftz:
+  case Intrinsic::nvvm_f2ull_rn_ftz:
+  case Intrinsic::nvvm_f2ull_rp_ftz:
+  case Intrinsic::nvvm_f2ull_rz_ftz:
+  case Intrinsic::nvvm_d2ll_rm:
+  case Intrinsic::nvvm_d2ll_rn:
+  case Intrinsic::nvvm_d2ll_rp:
+  case Intrinsic::nvvm_d2ll_rz:
+  case Intrinsic::nvvm_d2ull_rm:
+  case Intrinsic::nvvm_d2ull_rn:
+  case Intrinsic::nvvm_d2ull_rp:
+  case Intrinsic::nvvm_d2ull_rz:
+
   // Sign operations are actually bitwise operations, they do not raise
   // exceptions even for SNANs.
   case Intrinsic::fabs:
@@ -1849,6 +1903,12 @@ inline bool llvm_fenv_testexcept() {
   return false;
 }
 
+static const APFloat FTZPreserveSign(const APFloat &V) {
+  if (V.isDenormal())
+    return APFloat::getZero(V.getSemantics(), V.isNegative());
+  return V;
+}
+
 Constant *ConstantFoldFP(double (*NativeFP)(double), const APFloat &V,
                          Type *Ty) {
   llvm_fenv_clearexcept();
@@ -2309,6 +2369,85 @@ static Constant *ConstantFoldScalarCall1(StringRef Name,
       return ConstantFP::get(Ty->getContext(), U);
     }
 
+    // NVVM float/double to signed/unsigned int32/int64 conversions:
+    switch (IntrinsicID) {
+    // f2i
+    case Intrinsic::nvvm_f2i_rm:
+    case Intrinsic::nvvm_f2i_rn:
+    case Intrinsic::nvvm_f2i_rp:
+    case Intrinsic::nvvm_f2i_rz:
+    case Intrinsic::nvvm_f2i_rm_ftz:
+    case Intrinsic::nvvm_f2i_rn_ftz:
+    case Intrinsic::nvvm_f2i_rp_ftz:
+    case Intrinsic::nvvm_f2i_rz_ftz:
+    // f2ui
+    case Intrinsic::nvvm_f2ui_rm:
+    case Intrinsic::nvvm_f2ui_rn:
+    case Intrinsic::nvvm_f2ui_rp:
+    case Intrinsic::nvvm_f2ui_rz:
+    case Intrinsic::nvvm_f2ui_rm_ftz:
+    case Intrinsic::nvvm_f2ui_rn_ftz:
+    case Intrinsic::nvvm_f2ui_rp_ftz:
+    case Intrinsic::nvvm_f2ui_rz_ftz:
+    // d2i
+    case Intrinsic::nvvm_d2i_rm:
+    case Intrinsic::nvvm_d2i_rn:
+    case Intrinsic::nvvm_d2i_rp:
+    case Intrinsic::nvvm_d2i_rz:
+    // d2ui
+    case Intrinsic::nvvm_d2ui_rm:
+    case Intrinsic::nvvm_d2ui_rn:
+    case Intrinsic::nvvm_d2ui_rp:
+    case Intrinsic::nvvm_d2ui_rz:
+    // f2ll
+    case Intrinsic::nvvm_f2ll_rm:
+    case Intrinsic::nvvm_f2ll_rn:
+    case Intrinsic::nvvm_f2ll_rp:
+    case Intrinsic::nvvm_f2ll_rz:
+    case Intrinsic::nvvm_f2ll_rm_ftz:
+    case Intrinsic::nvvm_f2ll_rn_ftz:
+    case Intrinsic::nvvm_f2ll_rp_ftz:
+    case Intrinsic::nvvm_f2ll_rz_ftz:
+    // f2ull
+    case Intrinsic::nvvm_f2ull_rm:
+    case Intrinsic::nvvm_f2ull_rn:
+    case Intrinsic::nvvm_f2ull_rp:
+    case Intrinsic::nvvm_f2ull_rz:
+    case Intrinsic::nvvm_f2ull_rm_ftz:
+    case Intrinsic::nvvm_f2ull_rn_ftz:
+    case Intrinsic::nvvm_f2ull_rp_ftz:
+    case Intrinsic::nvvm_f2ull_rz_ftz:
+    // d2ll
+    case Intrinsic::nvvm_d2ll_rm:
+    case Intrinsic::nvvm_d2ll_rn:
+    case Intrinsic::nvvm_d2ll_rp:
+    case Intrinsic::nvvm_d2ll_rz:
+    // d2ull
+    case Intrinsic::nvvm_d2ull_rm:
+    case Intrinsic::nvvm_d2ull_rn:
+    case Intrinsic::nvvm_d2ull_rp:
+    case Intrinsic::nvvm_d2ull_rz: {
+      // In float-to-integer conversion, NaN inputs are converted to 0.
+      if (U.isNaN())
+        return ConstantInt::get(Ty, 0);
+
+      APFloat::roundingMode RMode = nvvm::IntrinsicGetRoundingMode(IntrinsicID);
+      bool IsFTZ = nvvm::IntrinsicShouldFTZ(IntrinsicID);
+      bool IsSigned = nvvm::IntrinsicConvertsToSignedInteger(IntrinsicID);
+
+      APSInt ResInt(Ty->getIntegerBitWidth(), !IsSigned);
+      auto FloatToRound = IsFTZ ? FTZPreserveSign(U) : U;
+
+      bool IsExact = false;
+      APFloat::opStatus Status =
+          FloatToRound.convertToInteger(ResInt, RMode, &IsExact);
+
+      if (Status != APFloat::opInvalidOp)
+        return ConstantInt::get(Ty, ResInt);
+      return nullptr;
+    }
+    }
+
     /// We only fold functions with finite arguments. Folding NaN and inf is
     /// likely to be aborted with an exception anyway, and some host libms
     /// have known errors raising exceptions.
diff --git a/llvm/lib/Analysis/InstructionSimplify.cpp b/llvm/lib/Analysis/InstructionSimplify.cpp
index 8567a05..999386c 100644
--- a/llvm/lib/Analysis/InstructionSimplify.cpp
+++ b/llvm/lib/Analysis/InstructionSimplify.cpp
@@ -4275,25 +4275,27 @@ Value *llvm::simplifyFCmpInst(CmpPredicate Predicate, Value *LHS, Value *RHS,
   return ::simplifyFCmpInst(Predicate, LHS, RHS, FMF, Q, RecursionLimit);
 }
 
-static Value *simplifyWithOpReplaced(Value *V, Value *Op, Value *RepOp,
-                                     const SimplifyQuery &Q,
-                                     bool AllowRefinement,
-                                     SmallVectorImpl<Instruction *> *DropFlags,
-                                     unsigned MaxRecurse) {
+static Value *simplifyWithOpsReplaced(Value *V,
+                                      ArrayRef<std::pair<Value *, Value *>> Ops,
+                                      const SimplifyQuery &Q,
+                                      bool AllowRefinement,
+                                      SmallVectorImpl<Instruction *> *DropFlags,
+                                      unsigned MaxRecurse) {
   assert((AllowRefinement || !Q.CanUseUndef) &&
          "If AllowRefinement=false then CanUseUndef=false");
+  for (const auto &OpAndRepOp : Ops) {
+    // We cannot replace a constant, and shouldn't even try.
+    if (isa<Constant>(OpAndRepOp.first))
+      return nullptr;
 
-  // Trivial replacement.
-  if (V == Op)
-    return RepOp;
+    // Trivial replacement.
+    if (V == OpAndRepOp.first)
+      return OpAndRepOp.second;
+  }
 
   if (!MaxRecurse--)
     return nullptr;
 
-  // We cannot replace a constant, and shouldn't even try.
-  if (isa<Constant>(Op))
-    return nullptr;
-
   auto *I = dyn_cast<Instruction>(V);
   if (!I)
     return nullptr;
@@ -4303,11 +4305,6 @@ static Value *simplifyWithOpReplaced(Value *V, Value *Op, Value *RepOp,
   if (isa<PHINode>(I))
     return nullptr;
 
-  // For vector types, the simplification must hold per-lane, so forbid
-  // potentially cross-lane operations like shufflevector.
-  if (Op->getType()->isVectorTy() && !isNotCrossLaneOperation(I))
-    return nullptr;
-
   // Don't fold away llvm.is.constant checks based on assumptions.
   if (match(I, m_Intrinsic<Intrinsic::is_constant>()))
     return nullptr;
@@ -4316,12 +4313,20 @@ static Value *simplifyWithOpReplaced(Value *V, Value *Op, Value *RepOp,
   if (isa<FreezeInst>(I))
     return nullptr;
 
+  for (const auto &OpAndRepOp : Ops) {
+    // For vector types, the simplification must hold per-lane, so forbid
+    // potentially cross-lane operations like shufflevector.
+    if (OpAndRepOp.first->getType()->isVectorTy() &&
+        !isNotCrossLaneOperation(I))
+      return nullptr;
+  }
+
   // Replace Op with RepOp in instruction operands.
   SmallVector<Value *, 8> NewOps;
   bool AnyReplaced = false;
   for (Value *InstOp : I->operands()) {
-    if (Value *NewInstOp = simplifyWithOpReplaced(
-            InstOp, Op, RepOp, Q, AllowRefinement, DropFlags, MaxRecurse)) {
+    if (Value *NewInstOp = simplifyWithOpsReplaced(
+            InstOp, Ops, Q, AllowRefinement, DropFlags, MaxRecurse)) {
       NewOps.push_back(NewInstOp);
       AnyReplaced = InstOp != NewInstOp;
     } else {
@@ -4372,7 +4377,8 @@ static Value *simplifyWithOpReplaced(Value *V, Value *Op, Value *RepOp,
       // by assumption and this case never wraps, so nowrap flags can be
       // ignored.
       if ((Opcode == Instruction::Sub || Opcode == Instruction::Xor) &&
-          NewOps[0] == RepOp && NewOps[1] == RepOp)
+          NewOps[0] == NewOps[1] &&
+          any_of(Ops, [=](const auto &Rep) { return NewOps[0] == Rep.second; }))
         return Constant::getNullValue(I->getType());
 
       // If we are substituting an absorber constant into a binop and extra
@@ -4382,10 +4388,10 @@ static Value *simplifyWithOpReplaced(Value *V, Value *Op, Value *RepOp,
       // (Op == 0) ? 0 : (Op & -Op)            --> Op & -Op
       // (Op == 0) ? 0 : (Op * (binop Op, C))  --> Op * (binop Op, C)
       // (Op == -1) ? -1 : (Op | (binop C, Op) --> Op | (binop C, Op)
-      Constant *Absorber =
-          ConstantExpr::getBinOpAbsorber(Opcode, I->getType());
+      Constant *Absorber = ConstantExpr::getBinOpAbsorber(Opcode, I->getType());
       if ((NewOps[0] == Absorber || NewOps[1] == Absorber) &&
-          impliesPoison(BO, Op))
+          any_of(Ops,
+                 [=](const auto &Rep) { return impliesPoison(BO, Rep.first); }))
         return Absorber;
     }
 
@@ -4453,6 +4459,15 @@ static Value *simplifyWithOpReplaced(Value *V, Value *Op, Value *RepOp,
                                   /*AllowNonDeterministic=*/false);
 }
 
+static Value *simplifyWithOpReplaced(Value *V, Value *Op, Value *RepOp,
+                                     const SimplifyQuery &Q,
+                                     bool AllowRefinement,
+                                     SmallVectorImpl<Instruction *> *DropFlags,
+                                     unsigned MaxRecurse) {
+  return simplifyWithOpsReplaced(V, {{Op, RepOp}}, Q, AllowRefinement,
+                                 DropFlags, MaxRecurse);
+}
+
 Value *llvm::simplifyWithOpReplaced(Value *V, Value *Op, Value *RepOp,
                                     const SimplifyQuery &Q,
                                     bool AllowRefinement,
@@ -4595,17 +4610,24 @@ static Value *simplifySelectWithFakeICmpEq(Value *CmpLHS, Value *CmpRHS,
 
 /// Try to simplify a select instruction when its condition operand is an
 /// integer equality or floating-point equivalence comparison.
-static Value *simplifySelectWithEquivalence(Value *CmpLHS, Value *CmpRHS,
-                                            Value *TrueVal, Value *FalseVal,
-                                            const SimplifyQuery &Q,
-                                            unsigned MaxRecurse) {
-  if (simplifyWithOpReplaced(FalseVal, CmpLHS, CmpRHS, Q.getWithoutUndef(),
-                             /* AllowRefinement */ false,
-                             /* DropFlags */ nullptr, MaxRecurse) == TrueVal)
-    return FalseVal;
-  if (simplifyWithOpReplaced(TrueVal, CmpLHS, CmpRHS, Q,
-                             /* AllowRefinement */ true,
-                             /* DropFlags */ nullptr, MaxRecurse) == FalseVal)
+static Value *simplifySelectWithEquivalence(
+    ArrayRef<std::pair<Value *, Value *>> Replacements, Value *TrueVal,
+    Value *FalseVal, const SimplifyQuery &Q, unsigned MaxRecurse) {
+  Value *SimplifiedFalseVal =
+      simplifyWithOpsReplaced(FalseVal, Replacements, Q.getWithoutUndef(),
+                              /* AllowRefinement */ false,
+                              /* DropFlags */ nullptr, MaxRecurse);
+  if (!SimplifiedFalseVal)
+    SimplifiedFalseVal = FalseVal;
+
+  Value *SimplifiedTrueVal =
+      simplifyWithOpsReplaced(TrueVal, Replacements, Q,
+                              /* AllowRefinement */ true,
+                              /* DropFlags */ nullptr, MaxRecurse);
+  if (!SimplifiedTrueVal)
+    SimplifiedTrueVal = TrueVal;
+
+  if (SimplifiedFalseVal == SimplifiedTrueVal)
     return FalseVal;
 
   return nullptr;
@@ -4699,10 +4721,10 @@ static Value *simplifySelectWithICmpCond(Value *CondVal, Value *TrueVal,
   // the arms of the select. See if substituting this value into the arm and
   // simplifying the result yields the same value as the other arm.
   if (Pred == ICmpInst::ICMP_EQ) {
-    if (Value *V = simplifySelectWithEquivalence(CmpLHS, CmpRHS, TrueVal,
+    if (Value *V = simplifySelectWithEquivalence({{CmpLHS, CmpRHS}}, TrueVal,
                                                  FalseVal, Q, MaxRecurse))
       return V;
-    if (Value *V = simplifySelectWithEquivalence(CmpRHS, CmpLHS, TrueVal,
+    if (Value *V = simplifySelectWithEquivalence({{CmpRHS, CmpLHS}}, TrueVal,
                                                  FalseVal, Q, MaxRecurse))
       return V;
 
@@ -4712,11 +4734,8 @@ static Value *simplifySelectWithICmpCond(Value *CondVal, Value *TrueVal,
     if (match(CmpLHS, m_Or(m_Value(X), m_Value(Y))) &&
         match(CmpRHS, m_Zero())) {
       // (X | Y) == 0 implies X == 0 and Y == 0.
-      if (Value *V = simplifySelectWithEquivalence(X, CmpRHS, TrueVal, FalseVal,
-                                                   Q, MaxRecurse))
-        return V;
-      if (Value *V = simplifySelectWithEquivalence(Y, CmpRHS, TrueVal, FalseVal,
-                                                   Q, MaxRecurse))
+      if (Value *V = simplifySelectWithEquivalence(
+              {{X, CmpRHS}, {Y, CmpRHS}}, TrueVal, FalseVal, Q, MaxRecurse))
         return V;
     }
 
@@ -4724,11 +4743,8 @@ static Value *simplifySelectWithICmpCond(Value *CondVal, Value *TrueVal,
     if (match(CmpLHS, m_And(m_Value(X), m_Value(Y))) &&
         match(CmpRHS, m_AllOnes())) {
       // (X & Y) == -1 implies X == -1 and Y == -1.
-      if (Value *V = simplifySelectWithEquivalence(X, CmpRHS, TrueVal, FalseVal,
-                                                   Q, MaxRecurse))
-        return V;
-      if (Value *V = simplifySelectWithEquivalence(Y, CmpRHS, TrueVal, FalseVal,
-                                                   Q, MaxRecurse))
+      if (Value *V = simplifySelectWithEquivalence(
+              {{X, CmpRHS}, {Y, CmpRHS}}, TrueVal, FalseVal, Q, MaxRecurse))
         return V;
     }
   }
@@ -4757,11 +4773,11 @@ static Value *simplifySelectWithFCmp(Value *Cond, Value *T, Value *F,
   // This transforms is safe if at least one operand is known to not be zero.
   // Otherwise, the select can change the sign of a zero operand.
   if (IsEquiv) {
-    if (Value *V =
-            simplifySelectWithEquivalence(CmpLHS, CmpRHS, T, F, Q, MaxRecurse))
+    if (Value *V = simplifySelectWithEquivalence({{CmpLHS, CmpRHS}}, T, F, Q,
+                                                 MaxRecurse))
       return V;
-    if (Value *V =
-            simplifySelectWithEquivalence(CmpRHS, CmpLHS, T, F, Q, MaxRecurse))
+    if (Value *V = simplifySelectWithEquivalence({{CmpRHS, CmpLHS}}, T, F, Q,
+                                                 MaxRecurse))
       return V;
   }
 
diff --git a/llvm/lib/Analysis/Lint.cpp b/llvm/lib/Analysis/Lint.cpp
index 4689451..e9d96a0c 100644
--- a/llvm/lib/Analysis/Lint.cpp
+++ b/llvm/lib/Analysis/Lint.cpp
@@ -266,6 +266,30 @@ void Lint::visitCallBase(CallBase &I) {
           visitMemoryReference(I, Loc, DL->getABITypeAlign(Ty), Ty,
                                MemRef::Read | MemRef::Write);
         }
+
+        // Check that ABI attributes for the function and call-site match.
+        unsigned ArgNo = AI->getOperandNo();
+        Attribute::AttrKind ABIAttributes[] = {
+            Attribute::ZExt,         Attribute::SExt,     Attribute::InReg,
+            Attribute::ByVal,        Attribute::ByRef,    Attribute::InAlloca,
+            Attribute::Preallocated, Attribute::StructRet};
+        AttributeList CallAttrs = I.getAttributes();
+        for (Attribute::AttrKind Attr : ABIAttributes) {
+          Attribute CallAttr = CallAttrs.getParamAttr(ArgNo, Attr);
+          Attribute FnAttr = F->getParamAttribute(ArgNo, Attr);
+          Check(CallAttr.isValid() == FnAttr.isValid(),
+                Twine("Undefined behavior: ABI attribute ") +
+                    Attribute::getNameFromAttrKind(Attr) +
+                    " not present on both function and call-site",
+                &I);
+          if (CallAttr.isValid() && FnAttr.isValid()) {
+            Check(CallAttr == FnAttr,
+                  Twine("Undefined behavior: ABI attribute ") +
+                      Attribute::getNameFromAttrKind(Attr) +
+                      " does not have same argument for function and call-site",
+                  &I);
+          }
+        }
       }
     }
   }
diff --git a/llvm/lib/Analysis/Loads.cpp b/llvm/lib/Analysis/Loads.cpp
index 54b9521..bc03e40 100644
--- a/llvm/lib/Analysis/Loads.cpp
+++ b/llvm/lib/Analysis/Loads.cpp
@@ -25,10 +25,9 @@
 
 using namespace llvm;
 
-static bool isAligned(const Value *Base, const APInt &Offset, Align Alignment,
+static bool isAligned(const Value *Base, Align Alignment,
                       const DataLayout &DL) {
-  Align BA = Base->getPointerAlignment(DL);
-  return BA >= Alignment && Offset.isAligned(BA);
+  return Base->getPointerAlignment(DL) >= Alignment;
 }
 
 /// Test if V is always a pointer to allocated and suitably aligned memory for
@@ -118,8 +117,7 @@ static bool isDereferenceableAndAlignedPointer(
     // As we recursed through GEPs to get here, we've incrementally checked
     // that each step advanced by a multiple of the alignment. If our base is
     // properly aligned, then the original offset accessed must also be.
-    APInt Offset(DL.getTypeStoreSizeInBits(V->getType()), 0);
-    return isAligned(V, Offset, Alignment, DL);
+    return isAligned(V, Alignment, DL);
   }
 
   /// TODO refactor this function to be able to search independently for
@@ -154,8 +152,7 @@ static bool isDereferenceableAndAlignedPointer(
         // checked that each step advanced by a multiple of the alignment. If
         // our base is properly aligned, then the original offset accessed
         // must also be.
-        APInt Offset(DL.getTypeStoreSizeInBits(V->getType()), 0);
-        return isAligned(V, Offset, Alignment, DL);
+        return isAligned(V, Alignment, DL);
       }
     }
   }
diff --git a/llvm/lib/Analysis/MemoryProfileInfo.cpp b/llvm/lib/Analysis/MemoryProfileInfo.cpp
index 1c3f589..2f3c87a 100644
--- a/llvm/lib/Analysis/MemoryProfileInfo.cpp
+++ b/llvm/lib/Analysis/MemoryProfileInfo.cpp
@@ -347,3 +347,20 @@ template <> uint64_t CallStack<MDNode, MDNode::op_iterator>::back() const {
   return mdconst::dyn_extract<ConstantInt>(N->operands().back())
       ->getZExtValue();
 }
+
+MDNode *MDNode::getMergedMemProfMetadata(MDNode *A, MDNode *B) {
+  // TODO: Support more sophisticated merging, such as selecting the one with
+  // more bytes allocated, or implement support for carrying multiple allocation
+  // leaf contexts. For now, keep the first one.
+  if (A)
+    return A;
+  return B;
+}
+
+MDNode *MDNode::getMergedCallsiteMetadata(MDNode *A, MDNode *B) {
+  // TODO: Support more sophisticated merging, which will require support for
+  // carrying multiple contexts. For now, keep the first one.
+  if (A)
+    return A;
+  return B;
+}
diff --git a/llvm/lib/Analysis/ScalarEvolution.cpp b/llvm/lib/Analysis/ScalarEvolution.cpp
index 8ab5602..7e18f7c 100644
--- a/llvm/lib/Analysis/ScalarEvolution.cpp
+++ b/llvm/lib/Analysis/ScalarEvolution.cpp
@@ -226,7 +226,7 @@ static cl::opt<unsigned> RangeIterThreshold(
 
 static cl::opt<unsigned> MaxLoopGuardCollectionDepth(
     "scalar-evolution-max-loop-guard-collection-depth", cl::Hidden,
-    cl::desc("Maximum depth for recrusive loop guard collection"), cl::init(1));
+    cl::desc("Maximum depth for recursive loop guard collection"), cl::init(1));
 
 static cl::opt<bool>
 ClassifyExpressions("scalar-evolution-classify-expressions",
@@ -15765,6 +15765,7 @@ void ScalarEvolution::LoopGuards::collectFromBlock(
   // original header.
   // TODO: share this logic with isLoopEntryGuardedByCond.
   unsigned NumCollectedConditions = 0;
+  VisitedBlocks.insert(Block);
   std::pair<const BasicBlock *, const BasicBlock *> Pair(Pred, Block);
   for (; Pair.first;
        Pair = SE.getPredecessorWithUniqueSuccessorForBB(Pair.first)) {
diff --git a/llvm/lib/Analysis/ValueTracking.cpp b/llvm/lib/Analysis/ValueTracking.cpp
index 14d7c2d..0eb43dd 100644
--- a/llvm/lib/Analysis/ValueTracking.cpp
+++ b/llvm/lib/Analysis/ValueTracking.cpp
@@ -1065,6 +1065,64 @@ void llvm::adjustKnownBitsForSelectArm(KnownBits &Known, Value *Cond,
   Known = CondRes;
 }
 
+// Match a signed min+max clamp pattern like smax(smin(In, CHigh), CLow).
+// Returns the input and lower/upper bounds.
+static bool isSignedMinMaxClamp(const Value *Select, const Value *&In,
+                                const APInt *&CLow, const APInt *&CHigh) {
+  assert(isa<Operator>(Select) &&
+         cast<Operator>(Select)->getOpcode() == Instruction::Select &&
+         "Input should be a Select!");
+
+  const Value *LHS = nullptr, *RHS = nullptr;
+  SelectPatternFlavor SPF = matchSelectPattern(Select, LHS, RHS).Flavor;
+  if (SPF != SPF_SMAX && SPF != SPF_SMIN)
+    return false;
+
+  if (!match(RHS, m_APInt(CLow)))
+    return false;
+
+  const Value *LHS2 = nullptr, *RHS2 = nullptr;
+  SelectPatternFlavor SPF2 = matchSelectPattern(LHS, LHS2, RHS2).Flavor;
+  if (getInverseMinMaxFlavor(SPF) != SPF2)
+    return false;
+
+  if (!match(RHS2, m_APInt(CHigh)))
+    return false;
+
+  if (SPF == SPF_SMIN)
+    std::swap(CLow, CHigh);
+
+  In = LHS2;
+  return CLow->sle(*CHigh);
+}
+
+static bool isSignedMinMaxIntrinsicClamp(const IntrinsicInst *II,
+                                         const APInt *&CLow,
+                                         const APInt *&CHigh) {
+  assert((II->getIntrinsicID() == Intrinsic::smin ||
+          II->getIntrinsicID() == Intrinsic::smax) &&
+         "Must be smin/smax");
+
+  Intrinsic::ID InverseID = getInverseMinMaxIntrinsic(II->getIntrinsicID());
+  auto *InnerII = dyn_cast<IntrinsicInst>(II->getArgOperand(0));
+  if (!InnerII || InnerII->getIntrinsicID() != InverseID ||
+      !match(II->getArgOperand(1), m_APInt(CLow)) ||
+      !match(InnerII->getArgOperand(1), m_APInt(CHigh)))
+    return false;
+
+  if (II->getIntrinsicID() == Intrinsic::smin)
+    std::swap(CLow, CHigh);
+  return CLow->sle(*CHigh);
+}
+
+static void unionWithMinMaxIntrinsicClamp(const IntrinsicInst *II,
+                                          KnownBits &Known) {
+  const APInt *CLow, *CHigh;
+  if (isSignedMinMaxIntrinsicClamp(II, CLow, CHigh))
+    Known = Known.unionWith(
+        ConstantRange::getNonEmpty(*CLow, *CHigh + 1).toKnownBits());
+}
+
 static void computeKnownBitsFromOperator(const Operator *I,
                                          const APInt &DemandedElts,
                                          KnownBits &Known, unsigned Depth,
@@ -1804,11 +1862,13 @@ static void computeKnownBitsFromOperator(const Operator *I,
         computeKnownBits(I->getOperand(0), DemandedElts, Known, Depth + 1, Q);
         computeKnownBits(I->getOperand(1), DemandedElts, Known2, Depth + 1, Q);
         Known = KnownBits::smin(Known, Known2);
+        unionWithMinMaxIntrinsicClamp(II, Known);
         break;
       case Intrinsic::smax:
         computeKnownBits(I->getOperand(0), DemandedElts, Known, Depth + 1, Q);
         computeKnownBits(I->getOperand(1), DemandedElts, Known2, Depth + 1, Q);
         Known = KnownBits::smax(Known, Known2);
+        unionWithMinMaxIntrinsicClamp(II, Known);
         break;
       case Intrinsic::ptrmask: {
         computeKnownBits(I->getOperand(0), DemandedElts, Known, Depth + 1, Q);
@@ -3751,55 +3811,6 @@ static bool isKnownNonEqual(const Value *V1, const Value *V2,
   return false;
 }
 
-// Match a signed min+max clamp pattern like smax(smin(In, CHigh), CLow).
-// Returns the input and lower/upper bounds.
-static bool isSignedMinMaxClamp(const Value *Select, const Value *&In,
-                                const APInt *&CLow, const APInt *&CHigh) {
-  assert(isa<Operator>(Select) &&
-         cast<Operator>(Select)->getOpcode() == Instruction::Select &&
-         "Input should be a Select!");
-
-  const Value *LHS = nullptr, *RHS = nullptr;
-  SelectPatternFlavor SPF = matchSelectPattern(Select, LHS, RHS).Flavor;
-  if (SPF != SPF_SMAX && SPF != SPF_SMIN)
-    return false;
-
-  if (!match(RHS, m_APInt(CLow)))
-    return false;
-
-  const Value *LHS2 = nullptr, *RHS2 = nullptr;
-  SelectPatternFlavor SPF2 = matchSelectPattern(LHS, LHS2, RHS2).Flavor;
-  if (getInverseMinMaxFlavor(SPF) != SPF2)
-    return false;
-
-  if (!match(RHS2, m_APInt(CHigh)))
-    return false;
-
-  if (SPF == SPF_SMIN)
-    std::swap(CLow, CHigh);
-
-  In = LHS2;
-  return CLow->sle(*CHigh);
-}
-
-static bool isSignedMinMaxIntrinsicClamp(const IntrinsicInst *II,
-                                         const APInt *&CLow,
-                                         const APInt *&CHigh) {
-  assert((II->getIntrinsicID() == Intrinsic::smin ||
-          II->getIntrinsicID() == Intrinsic::smax) && "Must be smin/smax");
-
-  Intrinsic::ID InverseID = getInverseMinMaxIntrinsic(II->getIntrinsicID());
-  auto *InnerII = dyn_cast<IntrinsicInst>(II->getArgOperand(0));
-  if (!InnerII || InnerII->getIntrinsicID() != InverseID ||
-      !match(II->getArgOperand(1), m_APInt(CLow)) ||
-      !match(InnerII->getArgOperand(1), m_APInt(CHigh)))
-    return false;
-
-  if (II->getIntrinsicID() == Intrinsic::smin)
-    std::swap(CLow, CHigh);
-  return CLow->sle(*CHigh);
-}
-
 /// For vector constants, loop over the elements and find the constant with the
 /// minimum number of sign bits. Return 0 if the value is not a vector constant
 /// or if any element was not analyzed; otherwise, return the count for the
@@ -8630,6 +8641,80 @@ SelectPatternResult llvm::getSelectPattern(CmpInst::Predicate Pred,
   }
 }
 
+std::optional<std::pair<CmpPredicate, Constant *>>
+llvm::getFlippedStrictnessPredicateAndConstant(CmpPredicate Pred, Constant *C) {
+  assert(ICmpInst::isRelational(Pred) && ICmpInst::isIntPredicate(Pred) &&
+         "Only for relational integer predicates.");
+  if (isa<UndefValue>(C))
+    return std::nullopt;
+
+  Type *Type = C->getType();
+  bool IsSigned = ICmpInst::isSigned(Pred);
+
+  CmpInst::Predicate UnsignedPred = ICmpInst::getUnsignedPredicate(Pred);
+  bool WillIncrement =
+      UnsignedPred == ICmpInst::ICMP_ULE || UnsignedPred == ICmpInst::ICMP_UGT;
+
+  // Check if the constant operand can be safely incremented/decremented
+  // without overflowing/underflowing.
+  auto ConstantIsOk = [WillIncrement, IsSigned](ConstantInt *C) {
+    return WillIncrement ? !C->isMaxValue(IsSigned) : !C->isMinValue(IsSigned);
+  };
+
+  Constant *SafeReplacementConstant = nullptr;
+  if (auto *CI = dyn_cast<ConstantInt>(C)) {
+    // Bail out if the constant can't be safely incremented/decremented.
+    if (!ConstantIsOk(CI))
+      return std::nullopt;
+  } else if (auto *FVTy = dyn_cast<FixedVectorType>(Type)) {
+    unsigned NumElts = FVTy->getNumElements();
+    for (unsigned i = 0; i != NumElts; ++i) {
+      Constant *Elt = C->getAggregateElement(i);
+      if (!Elt)
+        return std::nullopt;
+
+      if (isa<UndefValue>(Elt))
+        continue;
+
+      // Bail out if we can't determine if this constant is min/max or if we
+      // know that this constant is min/max.
+      auto *CI = dyn_cast<ConstantInt>(Elt);
+      if (!CI || !ConstantIsOk(CI))
+        return std::nullopt;
+
+      if (!SafeReplacementConstant)
+        SafeReplacementConstant = CI;
+    }
+  } else if (isa<VectorType>(C->getType())) {
+    // Handle scalable splat
+    Value *SplatC = C->getSplatValue();
+    auto *CI = dyn_cast_or_null<ConstantInt>(SplatC);
+    // Bail out if the constant can't be safely incremented/decremented.
+    if (!CI || !ConstantIsOk(CI))
+      return std::nullopt;
+  } else {
+    // ConstantExpr?
+    return std::nullopt;
+  }
+
+  // It may not be safe to change a compare predicate in the presence of
+  // undefined elements, so replace those elements with the first safe constant
+  // that we found.
+  // TODO: in case of poison, it is safe; let's replace undefs only.
+  if (C->containsUndefOrPoisonElement()) {
+    assert(SafeReplacementConstant && "Replacement constant not set");
+    C = Constant::replaceUndefsWith(C, SafeReplacementConstant);
+  }
+
+  CmpInst::Predicate NewPred = CmpInst::getFlippedStrictnessPredicate(Pred);
+
+  // Increment or decrement the constant.
+  Constant *OneOrNegOne = ConstantInt::get(Type, WillIncrement ? 1 : -1, true);
+  Constant *NewC = ConstantExpr::getAdd(C, OneOrNegOne);
+
+  return std::make_pair(NewPred, NewC);
+}
+
 static SelectPatternResult matchSelectPattern(CmpInst::Predicate Pred,
                                               FastMathFlags FMF,
                                               Value *CmpLHS, Value *CmpRHS,
diff --git a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
index d34fe0e..3ba4590 100644
--- a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
@@ -503,13 +503,7 @@ bool AsmPrinter::doInitialization(Module &M) {
   // don't, this at least helps the user find where a global came from.
   if (MAI->hasSingleParameterDotFile()) {
     // .file "foo.c"
-
-    SmallString<128> FileName;
-    if (MAI->hasBasenameOnlyForFileDirective())
-      FileName = llvm::sys::path::filename(M.getSourceFileName());
-    else
-      FileName = M.getSourceFileName();
-    if (MAI->hasFourStringsDotFile()) {
+    if (MAI->isAIX()) {
       const char VerStr[] =
 #ifdef PACKAGE_VENDOR
           PACKAGE_VENDOR " "
@@ -520,9 +514,10 @@ bool AsmPrinter::doInitialization(Module &M) {
 #endif
           ;
       // TODO: Add timestamp and description.
-      OutStreamer->emitFileDirective(FileName, VerStr, "", "");
+      OutStreamer->emitFileDirective(M.getSourceFileName(), VerStr, "", "");
     } else {
-      OutStreamer->emitFileDirective(FileName);
+      OutStreamer->emitFileDirective(
+          llvm::sys::path::filename(M.getSourceFileName()));
     }
   }
 
@@ -967,11 +962,10 @@ void AsmPrinter::emitFunctionHeader() {
     MF->setSection(getObjFileLowering().SectionForGlobal(&F, TM));
   OutStreamer->switchSection(MF->getSection());
 
-  if (!MAI->hasVisibilityOnlyWithLinkage())
-    emitVisibility(CurrentFnSym, F.getVisibility());
-
-  if (MAI->needsFunctionDescriptors())
+  if (MAI->isAIX())
     emitLinkage(&F, CurrentFnDescSym);
+  else
+    emitVisibility(CurrentFnSym, F.getVisibility());
 
   emitLinkage(&F, CurrentFnSym);
   if (MAI->hasFunctionAlignment())
@@ -1031,7 +1025,7 @@ void AsmPrinter::emitFunctionHeader() {
   // to emit their specific function descriptor. Right now it is only used by
   // the AIX target. The PowerPC 64-bit V1 ELF target also uses function
   // descriptors and should be converted to use this hook as well.
-  if (MAI->needsFunctionDescriptors())
+  if (MAI->isAIX())
     emitFunctionDescriptor();
 
   // Emit the CurrentFnSym. This is a virtual function to allow targets to do
@@ -2234,9 +2228,6 @@ void AsmPrinter::emitGlobalAlias(const Module &M, const GlobalAlias &GA) {
   // point, all the extra label is emitted, we just have to emit linkage for
   // those labels.
   if (TM.getTargetTriple().isOSBinFormatXCOFF()) {
-    assert(MAI->hasVisibilityOnlyWithLinkage() &&
-           "Visibility should be handled with emitLinkage() on AIX.");
-
     // Linkage for alias of global variable has been emitted.
     if (isa<GlobalVariable>(GA.getAliaseeObject()))
       return;
@@ -2730,7 +2721,7 @@ void AsmPrinter::SetupMachineFunction(MachineFunction &MF) {
     HasNoSplitStack = true;
 
   // Get the function symbol.
-  if (!MAI->needsFunctionDescriptors()) {
+  if (!MAI->isAIX()) {
     CurrentFnSym = getSymbol(&MF.getFunction());
   } else {
     assert(TM.getTargetTriple().isOSAIX() &&
@@ -3923,21 +3914,22 @@ static void emitGlobalConstantImpl(const DataLayout &DL, const Constant *CV,
   if (isa<ConstantAggregateZero>(CV)) {
     StructType *structType;
     if (AliasList && (structType = llvm::dyn_cast<StructType>(CV->getType()))) {
-      // Handle cases of aliases to direct struct elements
-      const StructLayout *Layout = DL.getStructLayout(structType);
-      uint64_t SizeSoFar = 0;
-      for (unsigned int i = 0, n = structType->getNumElements(); i < n - 1;
-           ++i) {
-        uint64_t GapToNext = Layout->getElementOffset(i + 1) - SizeSoFar;
-        AP.OutStreamer->emitZeros(GapToNext);
-        SizeSoFar += GapToNext;
-        emitGlobalAliasInline(AP, Offset + SizeSoFar, AliasList);
+      unsigned numElements = {structType->getNumElements()};
+      if (numElements != 0) {
+        // Handle cases of aliases to direct struct elements
+        const StructLayout *Layout = DL.getStructLayout(structType);
+        uint64_t SizeSoFar = 0;
+        for (unsigned int i = 0; i < numElements - 1; ++i) {
+          uint64_t GapToNext = Layout->getElementOffset(i + 1) - SizeSoFar;
+          AP.OutStreamer->emitZeros(GapToNext);
+          SizeSoFar += GapToNext;
+          emitGlobalAliasInline(AP, Offset + SizeSoFar, AliasList);
+        }
+        AP.OutStreamer->emitZeros(Size - SizeSoFar);
+        return;
       }
-      AP.OutStreamer->emitZeros(Size - SizeSoFar);
-      return;
-    } else {
-      return AP.OutStreamer->emitZeros(Size);
     }
+    return AP.OutStreamer->emitZeros(Size);
   }
 
   if (isa<UndefValue>(CV))
diff --git a/llvm/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp b/llvm/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp
index ebae27e..59fc4cf 100644
--- a/llvm/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp
@@ -153,7 +153,7 @@ static void EmitInlineAsmStr(const char *AsmStr, const MachineInstr *MI,
     AsmPrinterVariant = MMI->getTarget().unqualifiedInlineAsmVariant();
 
   // FIXME: Should this happen for `asm inteldialect` as well?
-  if (!InputIsIntelDialect && MAI->getEmitGNUAsmStartIndentationMarker())
+  if (!InputIsIntelDialect && !MAI->isHLASM())
     OS << '\t';
 
   while (*LastEmitted) {
diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp b/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
index e1291e2..11de4b6 100644
--- a/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
@@ -3789,6 +3789,7 @@ void DwarfDebug::addDwarfTypeUnitType(DwarfCompileUnit &CU,
       // they depend on addresses, throwing them out and rebuilding them.
       setCurrentDWARF5AccelTable(DWARF5AccelTableKind::CU);
       CU.constructTypeDIE(RefDie, cast<DICompositeType>(CTy));
+      CU.updateAcceleratorTables(CTy->getScope(), CTy, RefDie);
       return;
     }
 
diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.h b/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.h
index 0225654..1632053 100644
--- a/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.h
+++ b/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.h
@@ -315,6 +315,11 @@ public:
   /// Get context owner's DIE.
   DIE *createTypeDIE(const DICompositeType *Ty);
 
+  /// If this is a named finished type then include it in the list of types for
+  /// the accelerator tables.
+  void updateAcceleratorTables(const DIScope *Context, const DIType *Ty,
+                               const DIE &TyDIE);
+
 protected:
   ~DwarfUnit();
 
@@ -357,11 +362,6 @@ private:
 
   virtual void finishNonUnitTypeDIE(DIE& D, const DICompositeType *CTy) = 0;
 
-  /// If this is a named finished type then include it in the list of types for
-  /// the accelerator tables.
-  void updateAcceleratorTables(const DIScope *Context, const DIType *Ty,
-                               const DIE &TyDIE);
-
   virtual bool isDwoUnit() const = 0;
   const MCSymbol *getCrossSectionRelativeBaseAddress() const override;
 
diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp
index 5c712e4..ba1b10e 100644
--- a/llvm/lib/CodeGen/CodeGenPrepare.cpp
+++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp
@@ -152,7 +152,7 @@ static cl::opt<bool>
 
 static cl::opt<bool>
     EnableAndCmpSinking("enable-andcmp-sinking", cl::Hidden, cl::init(true),
-                        cl::desc("Enable sinkinig and/cmp into branches."));
+                        cl::desc("Enable sinking and/cmp into branches."));
 
 static cl::opt<bool> DisableStoreExtract(
     "disable-cgp-store-extract", cl::Hidden, cl::init(false),
diff --git a/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp b/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp
index f3f7ea9..aec8df9 100644
--- a/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp
+++ b/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp
@@ -108,6 +108,13 @@ static bool isNeg(Value *V);
 static Value *getNegOperand(Value *V);
 
 namespace {
+template <typename T, typename IterT>
+std::optional<T> findCommonBetweenCollections(IterT A, IterT B) {
+  auto Common = llvm::find_if(A, [B](T I) { return llvm::is_contained(B, I); });
+  if (Common != A.end())
+    return std::make_optional(*Common);
+  return std::nullopt;
+}
 
 class ComplexDeinterleavingLegacyPass : public FunctionPass {
 public:
@@ -144,6 +151,7 @@ private:
   friend class ComplexDeinterleavingGraph;
   using NodePtr = std::shared_ptr<ComplexDeinterleavingCompositeNode>;
   using RawNodePtr = ComplexDeinterleavingCompositeNode *;
+  bool OperandsValid = true;
 
 public:
   ComplexDeinterleavingOperation Operation;
@@ -160,7 +168,11 @@ public:
   SmallVector<RawNodePtr> Operands;
   Value *ReplacementNode = nullptr;
 
-  void addOperand(NodePtr Node) { Operands.push_back(Node.get()); }
+  void addOperand(NodePtr Node) {
+    if (!Node || !Node.get())
+      OperandsValid = false;
+    Operands.push_back(Node.get());
+  }
 
   void dump() { dump(dbgs()); }
   void dump(raw_ostream &OS) {
@@ -194,6 +206,8 @@ public:
       PrintNodeRef(Op);
     }
   }
+
+  bool areOperandsValid() { return OperandsValid; }
 };
 
 class ComplexDeinterleavingGraph {
@@ -293,7 +307,7 @@ private:
 
   NodePtr submitCompositeNode(NodePtr Node) {
     CompositeNodes.push_back(Node);
-    if (Node->Real && Node->Imag)
+    if (Node->Real)
       CachedResult[{Node->Real, Node->Imag}] = Node;
     return Node;
   }
@@ -327,6 +341,8 @@ private:
   ///      i: ai - br
   NodePtr identifyAdd(Instruction *Real, Instruction *Imag);
   NodePtr identifySymmetricOperation(Instruction *Real, Instruction *Imag);
+  NodePtr identifyPartialReduction(Value *R, Value *I);
+  NodePtr identifyDotProduct(Value *Inst);
 
   NodePtr identifyNode(Value *R, Value *I);
 
@@ -396,6 +412,7 @@ private:
   /// * Deinterleave the final value outside of the loop and repurpose original
   /// reduction users
   void processReductionOperation(Value *OperationReplacement, RawNodePtr Node);
+  void processReductionSingle(Value *OperationReplacement, RawNodePtr Node);
 
 public:
   void dump() { dump(dbgs()); }
@@ -891,17 +908,163 @@ ComplexDeinterleavingGraph::identifySymmetricOperation(Instruction *Real,
 }
 
 ComplexDeinterleavingGraph::NodePtr
-ComplexDeinterleavingGraph::identifyNode(Value *R, Value *I) {
-  LLVM_DEBUG(dbgs() << "identifyNode on " << *R << " / " << *I << "\n");
-  assert(R->getType() == I->getType() &&
-         "Real and imaginary parts should not have different types");
+ComplexDeinterleavingGraph::identifyDotProduct(Value *V) {
+
+  if (!TL->isComplexDeinterleavingOperationSupported(
+          ComplexDeinterleavingOperation::CDot, V->getType())) {
+    LLVM_DEBUG(dbgs() << "Target doesn't support complex deinterleaving "
+                         "operation CDot with the type "
+                      << *V->getType() << "\n");
+    return nullptr;
+  }
+
+  auto *Inst = cast<Instruction>(V);
+  auto *RealUser = cast<Instruction>(*Inst->user_begin());
+
+  NodePtr CN =
+      prepareCompositeNode(ComplexDeinterleavingOperation::CDot, Inst, nullptr);
+
+  NodePtr ANode;
+
+  const Intrinsic::ID PartialReduceInt =
+      Intrinsic::experimental_vector_partial_reduce_add;
+
+  Value *AReal = nullptr;
+  Value *AImag = nullptr;
+  Value *BReal = nullptr;
+  Value *BImag = nullptr;
+  Value *Phi = nullptr;
+
+  auto UnwrapCast = [](Value *V) -> Value * {
+    if (auto *CI = dyn_cast<CastInst>(V))
+      return CI->getOperand(0);
+    return V;
+  };
+
+  auto PatternRot0 = m_Intrinsic<PartialReduceInt>(
+      m_Intrinsic<PartialReduceInt>(m_Value(Phi),
+                                    m_Mul(m_Value(BReal), m_Value(AReal))),
+      m_Neg(m_Mul(m_Value(BImag), m_Value(AImag))));
+
+  auto PatternRot270 = m_Intrinsic<PartialReduceInt>(
+      m_Intrinsic<PartialReduceInt>(
+          m_Value(Phi), m_Neg(m_Mul(m_Value(BReal), m_Value(AImag)))),
+      m_Mul(m_Value(BImag), m_Value(AReal)));
+
+  if (match(Inst, PatternRot0)) {
+    CN->Rotation = ComplexDeinterleavingRotation::Rotation_0;
+  } else if (match(Inst, PatternRot270)) {
+    CN->Rotation = ComplexDeinterleavingRotation::Rotation_270;
+  } else {
+    Value *A0, *A1;
+    // The rotations 90 and 180 share the same operation pattern, so inspect the
+    // order of the operands, identifying where the real and imaginary
+    // components of A go, to discern between the aforementioned rotations.
+    auto PatternRot90Rot180 = m_Intrinsic<PartialReduceInt>(
+        m_Intrinsic<PartialReduceInt>(m_Value(Phi),
+                                      m_Mul(m_Value(BReal), m_Value(A0))),
+        m_Mul(m_Value(BImag), m_Value(A1)));
+
+    if (!match(Inst, PatternRot90Rot180))
+      return nullptr;
+
+    A0 = UnwrapCast(A0);
+    A1 = UnwrapCast(A1);
+
+    // Test if A0 is real/A1 is imag
+    ANode = identifyNode(A0, A1);
+    if (!ANode) {
+      // Test if A0 is imag/A1 is real
+      ANode = identifyNode(A1, A0);
+      // Unable to identify operand components, thus unable to identify rotation
+      if (!ANode)
+        return nullptr;
+      CN->Rotation = ComplexDeinterleavingRotation::Rotation_90;
+      AReal = A1;
+      AImag = A0;
+    } else {
+      AReal = A0;
+      AImag = A1;
+      CN->Rotation = ComplexDeinterleavingRotation::Rotation_180;
+    }
+  }
+
+  AReal = UnwrapCast(AReal);
+  AImag = UnwrapCast(AImag);
+  BReal = UnwrapCast(BReal);
+  BImag = UnwrapCast(BImag);
+
+  VectorType *VTy = cast<VectorType>(V->getType());
+  Type *ExpectedOperandTy = VectorType::getSubdividedVectorType(VTy, 2);
+  if (AReal->getType() != ExpectedOperandTy)
+    return nullptr;
+  if (AImag->getType() != ExpectedOperandTy)
+    return nullptr;
+  if (BReal->getType() != ExpectedOperandTy)
+    return nullptr;
+  if (BImag->getType() != ExpectedOperandTy)
+    return nullptr;
+
+  if (Phi->getType() != VTy && RealUser->getType() != VTy)
+    return nullptr;
+
+  NodePtr Node = identifyNode(AReal, AImag);
+
+  // In the case that a node was identified to figure out the rotation, ensure
+  // that trying to identify a node with AReal and AImag post-unwrap results in
+  // the same node
+  if (ANode && Node != ANode) {
+    LLVM_DEBUG(
+        dbgs()
+        << "Identified node is different from previously identified node. "
+           "Unable to confidently generate a complex operation node\n");
+    return nullptr;
+  }
+
+  CN->addOperand(Node);
+  CN->addOperand(identifyNode(BReal, BImag));
+  CN->addOperand(identifyNode(Phi, RealUser));
+
+  return submitCompositeNode(CN);
+}
+
+ComplexDeinterleavingGraph::NodePtr
+ComplexDeinterleavingGraph::identifyPartialReduction(Value *R, Value *I) {
+  // Partial reductions don't support non-vector types, so check these first
+  if (!isa<VectorType>(R->getType()) || !isa<VectorType>(I->getType()))
+    return nullptr;
+
+  auto CommonUser =
+      findCommonBetweenCollections<Value *>(R->users(), I->users());
+  if (!CommonUser)
+    return nullptr;
+
+  auto *IInst = dyn_cast<IntrinsicInst>(*CommonUser);
+  if (!IInst || IInst->getIntrinsicID() !=
+                    Intrinsic::experimental_vector_partial_reduce_add)
+    return nullptr;
+
+  if (NodePtr CN = identifyDotProduct(IInst))
+    return CN;
+
+  return nullptr;
+}
 
+ComplexDeinterleavingGraph::NodePtr
+ComplexDeinterleavingGraph::identifyNode(Value *R, Value *I) {
   auto It = CachedResult.find({R, I});
   if (It != CachedResult.end()) {
     LLVM_DEBUG(dbgs() << " - Folding to existing node\n");
     return It->second;
   }
 
+  if (NodePtr CN = identifyPartialReduction(R, I))
+    return CN;
+
+  bool IsReduction = RealPHI == R && (!ImagPHI || ImagPHI == I);
+  if (!IsReduction && R->getType() != I->getType())
+    return nullptr;
+
   if (NodePtr CN = identifySplat(R, I))
     return CN;
 
@@ -1427,12 +1590,20 @@ bool ComplexDeinterleavingGraph::identifyNodes(Instruction *RootI) {
   if (It != RootToNode.end()) {
     auto RootNode = It->second;
     assert(RootNode->Operation ==
-           ComplexDeinterleavingOperation::ReductionOperation);
+               ComplexDeinterleavingOperation::ReductionOperation ||
+           RootNode->Operation ==
+               ComplexDeinterleavingOperation::ReductionSingle);
     // Find out which part, Real or Imag, comes later, and only if we come to
     // the latest part, add it to OrderedRoots.
     auto *R = cast<Instruction>(RootNode->Real);
-    auto *I = cast<Instruction>(RootNode->Imag);
-    auto *ReplacementAnchor = R->comesBefore(I) ? I : R;
+    auto *I = RootNode->Imag ? cast<Instruction>(RootNode->Imag) : nullptr;
+
+    Instruction *ReplacementAnchor;
+    if (I)
+      ReplacementAnchor = R->comesBefore(I) ? I : R;
+    else
+      ReplacementAnchor = R;
+
     if (ReplacementAnchor != RootI)
       return false;
     OrderedRoots.push_back(RootI);
@@ -1523,7 +1694,6 @@ void ComplexDeinterleavingGraph::identifyReductionNodes() {
     for (size_t j = i + 1; j < OperationInstruction.size(); ++j) {
       if (Processed[j])
         continue;
-
       auto *Real = OperationInstruction[i];
       auto *Imag = OperationInstruction[j];
       if (Real->getType() != Imag->getType())
@@ -1556,6 +1726,28 @@ void ComplexDeinterleavingGraph::identifyReductionNodes() {
         break;
       }
     }
+
+    auto *Real = OperationInstruction[i];
+    // We want to check that we have 2 operands, but the function attributes
+    // being counted as operands bloats this value.
+    if (Real->getNumOperands() < 2)
+      continue;
+
+    RealPHI = ReductionInfo[Real].first;
+    ImagPHI = nullptr;
+    PHIsFound = false;
+    auto Node = identifyNode(Real->getOperand(0), Real->getOperand(1));
+    if (Node && PHIsFound) {
+      LLVM_DEBUG(
+          dbgs() << "Identified single reduction starting from instruction: "
+                 << *Real << "/" << *ReductionInfo[Real].second << "\n");
+      Processed[i] = true;
+      auto RootNode = prepareCompositeNode(
+          ComplexDeinterleavingOperation::ReductionSingle, Real, nullptr);
+      RootNode->addOperand(Node);
+      RootToNode[Real] = RootNode;
+      submitCompositeNode(RootNode);
+    }
   }
 
   RealPHI = nullptr;
@@ -1563,6 +1755,24 @@ void ComplexDeinterleavingGraph::identifyReductionNodes() {
 }
 
 bool ComplexDeinterleavingGraph::checkNodes() {
+
+  bool FoundDeinterleaveNode = false;
+  for (NodePtr N : CompositeNodes) {
+    if (!N->areOperandsValid())
+      return false;
+    if (N->Operation == ComplexDeinterleavingOperation::Deinterleave)
+      FoundDeinterleaveNode = true;
+  }
+
+  // We need a deinterleave node in order to guarantee that we're working with
+  // complex numbers.
+  if (!FoundDeinterleaveNode) {
+    LLVM_DEBUG(
+        dbgs() << "Couldn't find a deinterleave node within the graph, cannot "
+                  "guarantee safety during graph transformation.\n");
+    return false;
+  }
+
   // Collect all instructions from roots to leaves
   SmallPtrSet<Instruction *, 16> AllInstructions;
   SmallVector<Instruction *, 8> Worklist;
@@ -1831,7 +2041,7 @@ ComplexDeinterleavingGraph::identifySplat(Value *R, Value *I) {
 ComplexDeinterleavingGraph::NodePtr
 ComplexDeinterleavingGraph::identifyPHINode(Instruction *Real,
                                             Instruction *Imag) {
-  if (Real != RealPHI || Imag != ImagPHI)
+  if (Real != RealPHI || (ImagPHI && Imag != ImagPHI))
     return nullptr;
 
   PHIsFound = true;
@@ -1926,6 +2136,16 @@ Value *ComplexDeinterleavingGraph::replaceNode(IRBuilderBase &Builder,
 
   Value *ReplacementNode;
   switch (Node->Operation) {
+  case ComplexDeinterleavingOperation::CDot: {
+    Value *Input0 = ReplaceOperandIfExist(Node, 0);
+    Value *Input1 = ReplaceOperandIfExist(Node, 1);
+    Value *Accumulator = ReplaceOperandIfExist(Node, 2);
+    assert(!Input1 || (Input0->getType() == Input1->getType() &&
+                       "Node inputs need to be of the same type"));
+    ReplacementNode = TL->createComplexDeinterleavingIR(
+        Builder, Node->Operation, Node->Rotation, Input0, Input1, Accumulator);
+    break;
+  }
   case ComplexDeinterleavingOperation::CAdd:
   case ComplexDeinterleavingOperation::CMulPartial:
   case ComplexDeinterleavingOperation::Symmetric: {
@@ -1969,13 +2189,18 @@ Value *ComplexDeinterleavingGraph::replaceNode(IRBuilderBase &Builder,
   case ComplexDeinterleavingOperation::ReductionPHI: {
     // If Operation is ReductionPHI, a new empty PHINode is created.
     // It is filled later when the ReductionOperation is processed.
+    auto *OldPHI = cast<PHINode>(Node->Real);
     auto *VTy = cast<VectorType>(Node->Real->getType());
     auto *NewVTy = VectorType::getDoubleElementsVectorType(VTy);
     auto *NewPHI = PHINode::Create(NewVTy, 0, "", BackEdge->getFirstNonPHIIt());
-    OldToNewPHI[dyn_cast<PHINode>(Node->Real)] = NewPHI;
+    OldToNewPHI[OldPHI] = NewPHI;
     ReplacementNode = NewPHI;
     break;
   }
+  case ComplexDeinterleavingOperation::ReductionSingle:
+    ReplacementNode = replaceNode(Builder, Node->Operands[0]);
+    processReductionSingle(ReplacementNode, Node);
+    break;
   case ComplexDeinterleavingOperation::ReductionOperation:
     ReplacementNode = replaceNode(Builder, Node->Operands[0]);
     processReductionOperation(ReplacementNode, Node);
@@ -2000,6 +2225,38 @@ Value *ComplexDeinterleavingGraph::replaceNode(IRBuilderBase &Builder,
   return ReplacementNode;
 }
 
+void ComplexDeinterleavingGraph::processReductionSingle(
+    Value *OperationReplacement, RawNodePtr Node) {
+  auto *Real = cast<Instruction>(Node->Real);
+  auto *OldPHI = ReductionInfo[Real].first;
+  auto *NewPHI = OldToNewPHI[OldPHI];
+  auto *VTy = cast<VectorType>(Real->getType());
+  auto *NewVTy = VectorType::getDoubleElementsVectorType(VTy);
+
+  Value *Init = OldPHI->getIncomingValueForBlock(Incoming);
+
+  IRBuilder<> Builder(Incoming->getTerminator());
+
+  Value *NewInit = nullptr;
+  if (auto *C = dyn_cast<Constant>(Init)) {
+    if (C->isZeroValue())
+      NewInit = Constant::getNullValue(NewVTy);
+  }
+
+  if (!NewInit)
+    NewInit = Builder.CreateIntrinsic(Intrinsic::vector_interleave2, NewVTy,
+                                      {Init, Constant::getNullValue(VTy)});
+
+  NewPHI->addIncoming(NewInit, Incoming);
+  NewPHI->addIncoming(OperationReplacement, BackEdge);
+
+  auto *FinalReduction = ReductionInfo[Real].second;
+  Builder.SetInsertPoint(&*FinalReduction->getParent()->getFirstInsertionPt());
+
+  auto *AddReduce = Builder.CreateAddReduce(OperationReplacement);
+  FinalReduction->replaceAllUsesWith(AddReduce);
+}
+
 void ComplexDeinterleavingGraph::processReductionOperation(
     Value *OperationReplacement, RawNodePtr Node) {
   auto *Real = cast<Instruction>(Node->Real);
@@ -2059,8 +2316,13 @@ void ComplexDeinterleavingGraph::replaceNodes() {
       auto *RootImag = cast<Instruction>(RootNode->Imag);
       ReductionInfo[RootReal].first->removeIncomingValue(BackEdge);
       ReductionInfo[RootImag].first->removeIncomingValue(BackEdge);
-      DeadInstrRoots.push_back(cast<Instruction>(RootReal));
-      DeadInstrRoots.push_back(cast<Instruction>(RootImag));
+      DeadInstrRoots.push_back(RootReal);
+      DeadInstrRoots.push_back(RootImag);
+    } else if (RootNode->Operation ==
+               ComplexDeinterleavingOperation::ReductionSingle) {
+      auto *RootInst = cast<Instruction>(RootNode->Real);
+      ReductionInfo[RootInst].first->removeIncomingValue(BackEdge);
+      DeadInstrRoots.push_back(ReductionInfo[RootInst].second);
     } else {
       assert(R && "Unable to find replacement for RootInstruction");
       DeadInstrRoots.push_back(RootInstruction);
diff --git a/llvm/lib/CodeGen/ExpandMemCmp.cpp b/llvm/lib/CodeGen/ExpandMemCmp.cpp
index f8ca7e3..74f93e1 100644
--- a/llvm/lib/CodeGen/ExpandMemCmp.cpp
+++ b/llvm/lib/CodeGen/ExpandMemCmp.cpp
@@ -669,17 +669,25 @@ Value *MemCmpExpansion::getMemCmpOneBlock() {
   if (CI->hasOneUser()) {
     auto *UI = cast<Instruction>(*CI->user_begin());
     CmpPredicate Pred = ICmpInst::Predicate::BAD_ICMP_PREDICATE;
-    uint64_t Shift;
     bool NeedsZExt = false;
     // This is a special case because instead of checking if the result is less
     // than zero:
     //    bool result = memcmp(a, b, NBYTES) < 0;
     // Compiler is clever enough to generate the following code:
     //    bool result = memcmp(a, b, NBYTES) >> 31;
-    if (match(UI, m_LShr(m_Value(), m_ConstantInt(Shift))) &&
-        Shift == (CI->getType()->getIntegerBitWidth() - 1)) {
+    if (match(UI,
+              m_LShr(m_Value(),
+                     m_SpecificInt(CI->getType()->getIntegerBitWidth() - 1)))) {
       Pred = ICmpInst::ICMP_SLT;
       NeedsZExt = true;
+    } else if (match(UI, m_SpecificICmp(ICmpInst::ICMP_SGT, m_Specific(CI),
+                                        m_AllOnes()))) {
+      // Adjust predicate as if it compared with 0.
+      Pred = ICmpInst::ICMP_SGE;
+    } else if (match(UI, m_SpecificICmp(ICmpInst::ICMP_SLT, m_Specific(CI),
+                                        m_One()))) {
+      // Adjust predicate as if it compared with 0.
+      Pred = ICmpInst::ICMP_SLE;
     } else {
       // In case of a successful match this call will set `Pred` variable
       match(UI, m_ICmp(Pred, m_Specific(CI), m_Zero()));
@@ -696,17 +704,9 @@ Value *MemCmpExpansion::getMemCmpOneBlock() {
     }
   }
 
-  // The result of memcmp is negative, zero, or positive, so produce that by
-  // subtracting 2 extended compare bits: sub (ugt, ult).
-  // If a target prefers to use selects to get -1/0/1, they should be able
-  // to transform this later. The inverse transform (going from selects to math)
-  // may not be possible in the DAG because the selects got converted into
-  // branches before we got there.
-  Value *CmpUGT = Builder.CreateICmpUGT(Loads.Lhs, Loads.Rhs);
-  Value *CmpULT = Builder.CreateICmpULT(Loads.Lhs, Loads.Rhs);
-  Value *ZextUGT = Builder.CreateZExt(CmpUGT, Builder.getInt32Ty());
-  Value *ZextULT = Builder.CreateZExt(CmpULT, Builder.getInt32Ty());
-  return Builder.CreateSub(ZextUGT, ZextULT);
+  // The result of memcmp is negative, zero, or positive.
+  return Builder.CreateIntrinsic(Builder.getInt32Ty(), Intrinsic::ucmp,
+                                 {Loads.Lhs, Loads.Rhs});
 }
 
 // This function expands the memcmp call into an inline expansion and returns
diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
index c20e9d0..4e3aaf5d 100644
--- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
@@ -6864,6 +6864,23 @@ bool CombinerHelper::tryFoldSelectOfConstants(GSelect *Select,
     };
     return true;
   }
+
+  // select Cond, 0, Pow2 --> (zext (!Cond)) << log2(Pow2)
+  if (FalseValue.isPowerOf2() && TrueValue.isZero()) {
+    MatchInfo = [=](MachineIRBuilder &B) {
+      B.setInstrAndDebugLoc(*Select);
+      Register Not = MRI.createGenericVirtualRegister(CondTy);
+      B.buildNot(Not, Cond);
+      Register Inner = MRI.createGenericVirtualRegister(TrueTy);
+      B.buildZExtOrTrunc(Inner, Not);
+      // The shift amount must be scalar.
+      LLT ShiftTy = TrueTy.isVector() ? TrueTy.getElementType() : TrueTy;
+      auto ShAmtC = B.buildConstant(ShiftTy, FalseValue.exactLogBase2());
+      B.buildShl(Dest, Inner, ShAmtC, Flags);
+    };
+    return true;
+  }
+
   // select Cond, -1, C --> or (sext Cond), C
   if (TrueValue.isAllOnes()) {
     MatchInfo = [=](MachineIRBuilder &B) {
@@ -7045,6 +7062,34 @@ bool CombinerHelper::matchSelectIMinMax(const MachineOperand &MO,
   }
 }
 
+// (neg (min/max x, (neg x))) --> (max/min x, (neg x))
+bool CombinerHelper::matchSimplifyNegMinMax(MachineInstr &MI,
+                                            BuildFnTy &MatchInfo) const {
+  assert(MI.getOpcode() == TargetOpcode::G_SUB);
+  Register DestReg = MI.getOperand(0).getReg();
+  LLT DestTy = MRI.getType(DestReg);
+
+  Register X;
+  Register Sub0;
+  auto NegPattern = m_all_of(m_Neg(m_DeferredReg(X)), m_Reg(Sub0));
+  if (mi_match(DestReg, MRI,
+               m_Neg(m_OneUse(m_any_of(m_GSMin(m_Reg(X), NegPattern),
+                                       m_GSMax(m_Reg(X), NegPattern),
+                                       m_GUMin(m_Reg(X), NegPattern),
+                                       m_GUMax(m_Reg(X), NegPattern)))))) {
+    MachineInstr *MinMaxMI = MRI.getVRegDef(MI.getOperand(2).getReg());
+    unsigned NewOpc = getInverseGMinMaxOpcode(MinMaxMI->getOpcode());
+    if (isLegal({NewOpc, {DestTy}})) {
+      MatchInfo = [=](MachineIRBuilder &B) {
+        B.buildInstr(NewOpc, {DestReg}, {X, Sub0});
+      };
+      return true;
+    }
+  }
+
+  return false;
+}
+
 bool CombinerHelper::matchSelect(MachineInstr &MI, BuildFnTy &MatchInfo) const {
   GSelect *Select = cast<GSelect>(&MI);
 
diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
index e2247f7..d0a6234 100644
--- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
@@ -22,6 +22,7 @@
 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
 #include "llvm/CodeGen/GlobalISel/Utils.h"
+#include "llvm/CodeGen/LowLevelTypeUtils.h"
 #include "llvm/CodeGen/MachineConstantPool.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
@@ -3022,8 +3023,19 @@ LegalizerHelper::widenScalar(MachineInstr &MI, unsigned TypeIdx, LLT WideTy) {
       return UnableToLegalize;
 
     LLT Ty = MRI.getType(MI.getOperand(0).getReg());
-    if (!Ty.isScalar())
-      return UnableToLegalize;
+    assert(!Ty.isPointerOrPointerVector() && "Can't widen type");
+    if (!Ty.isScalar()) {
+      // We need to widen the vector element type.
+      Observer.changingInstr(MI);
+      widenScalarSrc(MI, WideTy, 0, TargetOpcode::G_ANYEXT);
+      // We also need to adjust the MMO to turn this into a truncating store.
+      MachineMemOperand &MMO = **MI.memoperands_begin();
+      MachineFunction &MF = MIRBuilder.getMF();
+      auto *NewMMO = MF.getMachineMemOperand(&MMO, MMO.getPointerInfo(), Ty);
+      MI.setMemRefs(MF, {NewMMO});
+      Observer.changedInstr(MI);
+      return Legalized;
+    }
 
     Observer.changingInstr(MI);
 
@@ -4106,10 +4118,7 @@ LegalizerHelper::LegalizeResult LegalizerHelper::lowerStore(GStore &StoreMI) {
   unsigned StoreWidth = MemTy.getSizeInBits();
   unsigned StoreSizeInBits = 8 * MemTy.getSizeInBytes();
 
-  if (StoreWidth != StoreSizeInBits) {
-    if (SrcTy.isVector())
-      return UnableToLegalize;
-
+  if (StoreWidth != StoreSizeInBits && !SrcTy.isVector()) {
     // Promote to a byte-sized store with upper bits zero if not
     // storing an integral number of bytes.  For example, promote
     // TRUNCSTORE:i1 X -> TRUNCSTORE:i8 (and X, 1)
@@ -4131,9 +4140,8 @@ LegalizerHelper::LegalizeResult LegalizerHelper::lowerStore(GStore &StoreMI) {
   }
 
   if (MemTy.isVector()) {
-    // TODO: Handle vector trunc stores
     if (MemTy != SrcTy)
-      return UnableToLegalize;
+      return scalarizeVectorBooleanStore(StoreMI);
 
     // TODO: We can do better than scalarizing the vector and at least split it
     // in half.
@@ -4189,6 +4197,50 @@ LegalizerHelper::LegalizeResult LegalizerHelper::lowerStore(GStore &StoreMI) {
 }
 
 LegalizerHelper::LegalizeResult
+LegalizerHelper::scalarizeVectorBooleanStore(GStore &StoreMI) {
+  Register SrcReg = StoreMI.getValueReg();
+  Register PtrReg = StoreMI.getPointerReg();
+  LLT SrcTy = MRI.getType(SrcReg);
+  MachineMemOperand &MMO = **StoreMI.memoperands_begin();
+  LLT MemTy = MMO.getMemoryType();
+  LLT MemScalarTy = MemTy.getElementType();
+  MachineFunction &MF = MIRBuilder.getMF();
+
+  assert(SrcTy.isVector() && "Expect a vector store type");
+
+  if (!MemScalarTy.isByteSized()) {
+    // We need to build an integer scalar of the vector bit pattern.
+    // It's not legal for us to add padding when storing a vector.
+    unsigned NumBits = MemTy.getSizeInBits();
+    LLT IntTy = LLT::scalar(NumBits);
+    auto CurrVal = MIRBuilder.buildConstant(IntTy, 0);
+    LLT IdxTy = getLLTForMVT(TLI.getVectorIdxTy(MF.getDataLayout()));
+
+    for (unsigned I = 0, E = MemTy.getNumElements(); I < E; ++I) {
+      auto Elt = MIRBuilder.buildExtractVectorElement(
+          SrcTy.getElementType(), SrcReg, MIRBuilder.buildConstant(IdxTy, I));
+      auto Trunc = MIRBuilder.buildTrunc(MemScalarTy, Elt);
+      auto ZExt = MIRBuilder.buildZExt(IntTy, Trunc);
+      unsigned ShiftIntoIdx = MF.getDataLayout().isBigEndian()
+                                  ? (MemTy.getNumElements() - 1) - I
+                                  : I;
+      auto ShiftAmt = MIRBuilder.buildConstant(
+          IntTy, ShiftIntoIdx * MemScalarTy.getSizeInBits());
+      auto Shifted = MIRBuilder.buildShl(IntTy, ZExt, ShiftAmt);
+      CurrVal = MIRBuilder.buildOr(IntTy, CurrVal, Shifted);
+    }
+    auto PtrInfo = MMO.getPointerInfo();
+    auto *NewMMO = MF.getMachineMemOperand(&MMO, PtrInfo, IntTy);
+    MIRBuilder.buildStore(CurrVal, PtrReg, *NewMMO);
+    StoreMI.eraseFromParent();
+    return Legalized;
+  }
+
+  // TODO: implement simple scalarization.
+  return UnableToLegalize;
+}
+
+LegalizerHelper::LegalizeResult
 LegalizerHelper::bitcast(MachineInstr &MI, unsigned TypeIdx, LLT CastTy) {
   switch (MI.getOpcode()) {
   case TargetOpcode::G_LOAD: {
@@ -4653,6 +4705,20 @@ LegalizerHelper::createStackTemporary(TypeSize Bytes, Align Alignment,
   return MIRBuilder.buildFrameIndex(FramePtrTy, FrameIdx);
 }
 
+MachineInstrBuilder LegalizerHelper::createStackStoreLoad(const DstOp &Res,
+                                                          const SrcOp &Val) {
+  LLT SrcTy = Val.getLLTTy(MRI);
+  Align StackTypeAlign =
+      std::max(getStackTemporaryAlignment(SrcTy),
+               getStackTemporaryAlignment(Res.getLLTTy(MRI)));
+  MachinePointerInfo PtrInfo;
+  auto StackTemp =
+      createStackTemporary(SrcTy.getSizeInBytes(), StackTypeAlign, PtrInfo);
+
+  MIRBuilder.buildStore(Val, StackTemp, PtrInfo, StackTypeAlign);
+  return MIRBuilder.buildLoad(Res, StackTemp, PtrInfo, StackTypeAlign);
+}
+
 static Register clampVectorIndex(MachineIRBuilder &B, Register IdxReg,
                                  LLT VecTy) {
   LLT IdxTy = B.getMRI()->getType(IdxReg);
diff --git a/llvm/lib/CodeGen/GlobalISel/Utils.cpp b/llvm/lib/CodeGen/GlobalISel/Utils.cpp
index 8c1e41e..625d556 100644
--- a/llvm/lib/CodeGen/GlobalISel/Utils.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/Utils.cpp
@@ -276,6 +276,21 @@ void llvm::reportGISelFailure(MachineFunction &MF, const TargetPassConfig &TPC,
   reportGISelFailure(MF, TPC, MORE, R);
 }
 
+unsigned llvm::getInverseGMinMaxOpcode(unsigned MinMaxOpc) {
+  switch (MinMaxOpc) {
+  case TargetOpcode::G_SMIN:
+    return TargetOpcode::G_SMAX;
+  case TargetOpcode::G_SMAX:
+    return TargetOpcode::G_SMIN;
+  case TargetOpcode::G_UMIN:
+    return TargetOpcode::G_UMAX;
+  case TargetOpcode::G_UMAX:
+    return TargetOpcode::G_UMIN;
+  default:
+    llvm_unreachable("unrecognized opcode");
+  }
+}
+
 std::optional<APInt> llvm::getIConstantVRegVal(Register VReg,
                                                const MachineRegisterInfo &MRI) {
   std::optional<ValueAndVReg> ValAndVReg = getIConstantVRegValWithLookThrough(
@@ -1517,6 +1532,18 @@ llvm::isConstantOrConstantSplatVector(MachineInstr &MI,
   return APInt(ScalarSize, *MaybeCst, true);
 }
 
+std::optional<APFloat>
+llvm::isConstantOrConstantSplatVectorFP(MachineInstr &MI,
+                                        const MachineRegisterInfo &MRI) {
+  Register Def = MI.getOperand(0).getReg();
+  if (auto FpConst = getFConstantVRegValWithLookThrough(Def, MRI))
+    return FpConst->Value;
+  auto MaybeCstFP = getFConstantSplat(Def, MRI, /*allowUndef=*/false);
+  if (!MaybeCstFP)
+    return std::nullopt;
+  return MaybeCstFP->Value;
+}
+
 bool llvm::isNullOrNullSplat(const MachineInstr &MI,
                              const MachineRegisterInfo &MRI, bool AllowUndefs) {
   switch (MI.getOpcode()) {
diff --git a/llvm/lib/CodeGen/GlobalMergeFunctions.cpp b/llvm/lib/CodeGen/GlobalMergeFunctions.cpp
index 1187ad0..e920b1b 100644
--- a/llvm/lib/CodeGen/GlobalMergeFunctions.cpp
+++ b/llvm/lib/CodeGen/GlobalMergeFunctions.cpp
@@ -60,11 +60,17 @@ static bool canParameterizeCallOperand(const CallBase *CI, unsigned OpIdx) {
     if (Name.starts_with("__dtrace"))
       return false;
   }
-  if (isCalleeOperand(CI, OpIdx) &&
-      CI->getOperandBundle(LLVMContext::OB_ptrauth).has_value()) {
+  if (isCalleeOperand(CI, OpIdx)) {
     // The operand is the callee and it has already been signed. Ignore this
     // because we cannot add another ptrauth bundle to the call instruction.
-    return false;
+    if (CI->getOperandBundle(LLVMContext::OB_ptrauth).has_value())
+      return false;
+  } else {
+    // The target of the arc-attached call must be a constant and cannot be
+    // parameterized.
+    if (CI->isOperandBundleOfType(LLVMContext::OB_clang_arc_attachedcall,
+                                  OpIdx))
+      return false;
   }
   return true;
 }
diff --git a/llvm/lib/CodeGen/LiveRegMatrix.cpp b/llvm/lib/CodeGen/LiveRegMatrix.cpp
index 9744c47..3367171 100644
--- a/llvm/lib/CodeGen/LiveRegMatrix.cpp
+++ b/llvm/lib/CodeGen/LiveRegMatrix.cpp
@@ -66,7 +66,7 @@ void LiveRegMatrix::init(MachineFunction &MF, LiveIntervals &pLIS,
   unsigned NumRegUnits = TRI->getNumRegUnits();
   if (NumRegUnits != Matrix.size())
     Queries.reset(new LiveIntervalUnion::Query[NumRegUnits]);
-  Matrix.init(LIUAlloc, NumRegUnits);
+  Matrix.init(*LIUAlloc, NumRegUnits);
 
   // Make sure no stale queries get reused.
   invalidateVirtRegs();
diff --git a/llvm/lib/CodeGen/MIRSampleProfile.cpp b/llvm/lib/CodeGen/MIRSampleProfile.cpp
index 23db09b..9bba50e8 100644
--- a/llvm/lib/CodeGen/MIRSampleProfile.cpp
+++ b/llvm/lib/CodeGen/MIRSampleProfile.cpp
@@ -46,8 +46,9 @@ static cl::opt<bool> ShowFSBranchProb(
     cl::desc("Print setting flow sensitive branch probabilities"));
 static cl::opt<unsigned> FSProfileDebugProbDiffThreshold(
     "fs-profile-debug-prob-diff-threshold", cl::init(10),
-    cl::desc("Only show debug message if the branch probility is greater than "
-             "this value (in percentage)."));
+    cl::desc(
+        "Only show debug message if the branch probability is greater than "
+        "this value (in percentage)."));
 
 static cl::opt<unsigned> FSProfileDebugBWThreshold(
     "fs-profile-debug-bw-threshold", cl::init(10000),
diff --git a/llvm/lib/CodeGen/MachineBlockPlacement.cpp b/llvm/lib/CodeGen/MachineBlockPlacement.cpp
index 0f68313..05bc4cf 100644
--- a/llvm/lib/CodeGen/MachineBlockPlacement.cpp
+++ b/llvm/lib/CodeGen/MachineBlockPlacement.cpp
@@ -149,7 +149,7 @@ static cl::opt<unsigned> JumpInstCost("jump-inst-cost",
 static cl::opt<bool>
     TailDupPlacement("tail-dup-placement",
                      cl::desc("Perform tail duplication during placement. "
-                              "Creates more fallthrough opportunites in "
+                              "Creates more fallthrough opportunities in "
                               "outline branches."),
                      cl::init(true), cl::Hidden);
 
diff --git a/llvm/lib/CodeGen/MachineBranchProbabilityInfo.cpp b/llvm/lib/CodeGen/MachineBranchProbabilityInfo.cpp
index 56fffff..2e92dd8 100644
--- a/llvm/lib/CodeGen/MachineBranchProbabilityInfo.cpp
+++ b/llvm/lib/CodeGen/MachineBranchProbabilityInfo.cpp
@@ -29,7 +29,7 @@ namespace llvm {
 cl::opt<unsigned>
     StaticLikelyProb("static-likely-prob",
                      cl::desc("branch probability threshold in percentage"
-                              "to be considered very likely"),
+                              " to be considered very likely"),
                      cl::init(80), cl::Hidden);
 
 cl::opt<unsigned> ProfileLikelyProb(
diff --git a/llvm/lib/CodeGen/MachineOperand.cpp b/llvm/lib/CodeGen/MachineOperand.cpp
index 3a9bdde..5c9ca91 100644
--- a/llvm/lib/CodeGen/MachineOperand.cpp
+++ b/llvm/lib/CodeGen/MachineOperand.cpp
@@ -1170,6 +1170,9 @@ void MachineMemOperand::print(raw_ostream &OS, ModuleSlotTracker &MST,
     if (getFlags() & MachineMemOperand::MOTargetFlag3)
       OS << '"' << getTargetMMOFlagName(*TII, MachineMemOperand::MOTargetFlag3)
          << "\" ";
+    if (getFlags() & MachineMemOperand::MOTargetFlag4)
+      OS << '"' << getTargetMMOFlagName(*TII, MachineMemOperand::MOTargetFlag4)
+         << "\" ";
   } else {
     if (getFlags() & MachineMemOperand::MOTargetFlag1)
       OS << "\"MOTargetFlag1\" ";
@@ -1177,6 +1180,8 @@ void MachineMemOperand::print(raw_ostream &OS, ModuleSlotTracker &MST,
       OS << "\"MOTargetFlag2\" ";
     if (getFlags() & MachineMemOperand::MOTargetFlag3)
       OS << "\"MOTargetFlag3\" ";
+    if (getFlags() & MachineMemOperand::MOTargetFlag4)
+      OS << "\"MOTargetFlag4\" ";
   }
 
   assert((isLoad() || isStore()) &&
diff --git a/llvm/lib/CodeGen/MachineRegisterInfo.cpp b/llvm/lib/CodeGen/MachineRegisterInfo.cpp
index 6f636a1..394b99b 100644
--- a/llvm/lib/CodeGen/MachineRegisterInfo.cpp
+++ b/llvm/lib/CodeGen/MachineRegisterInfo.cpp
@@ -407,9 +407,11 @@ void MachineRegisterInfo::replaceRegWith(Register FromReg, Register ToReg) {
 MachineInstr *MachineRegisterInfo::getVRegDef(Register Reg) const {
   // Since we are in SSA form, we can use the first definition.
   def_instr_iterator I = def_instr_begin(Reg);
-  assert((I.atEnd() || std::next(I) == def_instr_end()) &&
-         "getVRegDef assumes a single definition or no definition");
-  return !I.atEnd() ? &*I : nullptr;
+  if (I == def_instr_end())
+    return nullptr;
+  assert(std::next(I) == def_instr_end() &&
+         "getVRegDef assumes at most one definition");
+  return &*I;
 }
 
 /// getUniqueVRegDef - Return the unique machine instr that defines the
diff --git a/llvm/lib/CodeGen/MachineTraceMetrics.cpp b/llvm/lib/CodeGen/MachineTraceMetrics.cpp
index 6576f97..021c1a0 100644
--- a/llvm/lib/CodeGen/MachineTraceMetrics.cpp
+++ b/llvm/lib/CodeGen/MachineTraceMetrics.cpp
@@ -683,11 +683,10 @@ struct DataDep {
   DataDep(const MachineRegisterInfo *MRI, unsigned VirtReg, unsigned UseOp)
     : UseOp(UseOp) {
     assert(Register::isVirtualRegister(VirtReg));
-    MachineRegisterInfo::def_iterator DefI = MRI->def_begin(VirtReg);
-    assert(!DefI.atEnd() && "Register has no defs");
-    DefMI = DefI->getParent();
-    DefOp = DefI.getOperandNo();
-    assert((++DefI).atEnd() && "Register has multiple defs");
+    MachineOperand *DefMO = MRI->getOneDef(VirtReg);
+    assert(DefMO && "Register does not have unique def");
+    DefMI = DefMO->getParent();
+    DefOp = DefMO->getOperandNo();
   }
 };
 
diff --git a/llvm/lib/CodeGen/PostRASchedulerList.cpp b/llvm/lib/CodeGen/PostRASchedulerList.cpp
index 2f7cfdd..badfd9a6 100644
--- a/llvm/lib/CodeGen/PostRASchedulerList.cpp
+++ b/llvm/lib/CodeGen/PostRASchedulerList.cpp
@@ -98,12 +98,6 @@ namespace {
     }
 
     bool runOnMachineFunction(MachineFunction &Fn) override;
-
-  private:
-    bool enablePostRAScheduler(
-        const TargetSubtargetInfo &ST, CodeGenOptLevel OptLevel,
-        TargetSubtargetInfo::AntiDepBreakMode &Mode,
-        TargetSubtargetInfo::RegClassVector &CriticalPathRCs) const;
   };
   char PostRAScheduler::ID = 0;
 
@@ -259,13 +253,8 @@ LLVM_DUMP_METHOD void SchedulePostRATDList::dumpSchedule() const {
 }
 #endif
 
-bool PostRAScheduler::enablePostRAScheduler(
-    const TargetSubtargetInfo &ST, CodeGenOptLevel OptLevel,
-    TargetSubtargetInfo::AntiDepBreakMode &Mode,
-    TargetSubtargetInfo::RegClassVector &CriticalPathRCs) const {
-  Mode = ST.getAntiDepBreakMode();
-  ST.getCriticalPathRCs(CriticalPathRCs);
-
+static bool enablePostRAScheduler(const TargetSubtargetInfo &ST,
+                                  CodeGenOptLevel OptLevel) {
   // Check for explicit enable/disable of post-ra scheduling.
   if (EnablePostRAScheduler.getPosition() > 0)
     return EnablePostRAScheduler;
@@ -278,24 +267,17 @@ bool PostRAScheduler::runOnMachineFunction(MachineFunction &Fn) {
   if (skipFunction(Fn.getFunction()))
     return false;
 
-  TII = Fn.getSubtarget().getInstrInfo();
-  MachineLoopInfo &MLI = getAnalysis<MachineLoopInfoWrapperPass>().getLI();
-  AliasAnalysis *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
+  const auto &Subtarget = Fn.getSubtarget();
   TargetPassConfig *PassConfig = &getAnalysis<TargetPassConfig>();
-
-  RegClassInfo.runOnMachineFunction(Fn);
-
-  TargetSubtargetInfo::AntiDepBreakMode AntiDepMode =
-    TargetSubtargetInfo::ANTIDEP_NONE;
-  SmallVector<const TargetRegisterClass*, 4> CriticalPathRCs;
-
   // Check that post-RA scheduling is enabled for this target.
-  // This may upgrade the AntiDepMode.
-  if (!enablePostRAScheduler(Fn.getSubtarget(), PassConfig->getOptLevel(),
-                             AntiDepMode, CriticalPathRCs))
+  if (!enablePostRAScheduler(Subtarget, PassConfig->getOptLevel()))
     return false;
 
-  // Check for antidep breaking override...
+  TII = Subtarget.getInstrInfo();
+  MachineLoopInfo &MLI = getAnalysis<MachineLoopInfoWrapperPass>().getLI();
+  AliasAnalysis *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
+  TargetSubtargetInfo::AntiDepBreakMode AntiDepMode =
+      Subtarget.getAntiDepBreakMode();
   if (EnableAntiDepBreaking.getPosition() > 0) {
     AntiDepMode = (EnableAntiDepBreaking == "all")
       ? TargetSubtargetInfo::ANTIDEP_ALL
@@ -303,6 +285,9 @@ bool PostRAScheduler::runOnMachineFunction(MachineFunction &Fn) {
          ? TargetSubtargetInfo::ANTIDEP_CRITICAL
          : TargetSubtargetInfo::ANTIDEP_NONE);
   }
+  SmallVector<const TargetRegisterClass *, 4> CriticalPathRCs;
+  Subtarget.getCriticalPathRCs(CriticalPathRCs);
+  RegClassInfo.runOnMachineFunction(Fn);
 
   LLVM_DEBUG(dbgs() << "PostRAScheduler\n");
 
diff --git a/llvm/lib/CodeGen/ReachingDefAnalysis.cpp b/llvm/lib/CodeGen/ReachingDefAnalysis.cpp
index 79b0fa6..3ab6315 100644
--- a/llvm/lib/CodeGen/ReachingDefAnalysis.cpp
+++ b/llvm/lib/CodeGen/ReachingDefAnalysis.cpp
@@ -30,22 +30,22 @@ static bool isValidRegUse(const MachineOperand &MO) {
   return isValidReg(MO) && MO.isUse();
 }
 
-static bool isValidRegUseOf(const MachineOperand &MO, MCRegister PhysReg,
+static bool isValidRegUseOf(const MachineOperand &MO, MCRegister Reg,
                             const TargetRegisterInfo *TRI) {
   if (!isValidRegUse(MO))
     return false;
-  return TRI->regsOverlap(MO.getReg(), PhysReg);
+  return TRI->regsOverlap(MO.getReg(), Reg);
 }
 
 static bool isValidRegDef(const MachineOperand &MO) {
   return isValidReg(MO) && MO.isDef();
 }
 
-static bool isValidRegDefOf(const MachineOperand &MO, MCRegister PhysReg,
+static bool isValidRegDefOf(const MachineOperand &MO, MCRegister Reg,
                             const TargetRegisterInfo *TRI) {
   if (!isValidRegDef(MO))
     return false;
-  return TRI->regsOverlap(MO.getReg(), PhysReg);
+  return TRI->regsOverlap(MO.getReg(), Reg);
 }
 
 void ReachingDefAnalysis::enterBasicBlock(MachineBasicBlock *MBB) {
@@ -261,7 +261,7 @@ void ReachingDefAnalysis::traverse() {
 }
 
 int ReachingDefAnalysis::getReachingDef(MachineInstr *MI,
-                                        MCRegister PhysReg) const {
+                                        MCRegister Reg) const {
   assert(InstIds.count(MI) && "Unexpected machine instuction.");
   int InstId = InstIds.lookup(MI);
   int DefRes = ReachingDefDefaultVal;
@@ -269,7 +269,7 @@ int ReachingDefAnalysis::getReachingDef(MachineInstr *MI,
   assert(MBBNumber < MBBReachingDefs.numBlockIDs() &&
          "Unexpected basic block number.");
   int LatestDef = ReachingDefDefaultVal;
-  for (MCRegUnit Unit : TRI->regunits(PhysReg)) {
+  for (MCRegUnit Unit : TRI->regunits(Reg)) {
     for (int Def : MBBReachingDefs.defs(MBBNumber, Unit)) {
       if (Def >= InstId)
         break;
@@ -280,22 +280,21 @@ int ReachingDefAnalysis::getReachingDef(MachineInstr *MI,
   return LatestDef;
 }
 
-MachineInstr *
-ReachingDefAnalysis::getReachingLocalMIDef(MachineInstr *MI,
-                                           MCRegister PhysReg) const {
-  return hasLocalDefBefore(MI, PhysReg)
-    ? getInstFromId(MI->getParent(), getReachingDef(MI, PhysReg))
-    : nullptr;
+MachineInstr *ReachingDefAnalysis::getReachingLocalMIDef(MachineInstr *MI,
+                                                         MCRegister Reg) const {
+  return hasLocalDefBefore(MI, Reg)
+             ? getInstFromId(MI->getParent(), getReachingDef(MI, Reg))
+             : nullptr;
 }
 
 bool ReachingDefAnalysis::hasSameReachingDef(MachineInstr *A, MachineInstr *B,
-                                             MCRegister PhysReg) const {
+                                             MCRegister Reg) const {
   MachineBasicBlock *ParentA = A->getParent();
   MachineBasicBlock *ParentB = B->getParent();
   if (ParentA != ParentB)
     return false;
 
-  return getReachingDef(A, PhysReg) == getReachingDef(B, PhysReg);
+  return getReachingDef(A, Reg) == getReachingDef(B, Reg);
 }
 
 MachineInstr *ReachingDefAnalysis::getInstFromId(MachineBasicBlock *MBB,
@@ -318,19 +317,18 @@ MachineInstr *ReachingDefAnalysis::getInstFromId(MachineBasicBlock *MBB,
   return nullptr;
 }
 
-int ReachingDefAnalysis::getClearance(MachineInstr *MI,
-                                      MCRegister PhysReg) const {
+int ReachingDefAnalysis::getClearance(MachineInstr *MI, MCRegister Reg) const {
   assert(InstIds.count(MI) && "Unexpected machine instuction.");
-  return InstIds.lookup(MI) - getReachingDef(MI, PhysReg);
+  return InstIds.lookup(MI) - getReachingDef(MI, Reg);
 }
 
 bool ReachingDefAnalysis::hasLocalDefBefore(MachineInstr *MI,
-                                            MCRegister PhysReg) const {
-  return getReachingDef(MI, PhysReg) >= 0;
+                                            MCRegister Reg) const {
+  return getReachingDef(MI, Reg) >= 0;
 }
 
 void ReachingDefAnalysis::getReachingLocalUses(MachineInstr *Def,
-                                               MCRegister PhysReg,
+                                               MCRegister Reg,
                                                InstSet &Uses) const {
   MachineBasicBlock *MBB = Def->getParent();
   MachineBasicBlock::iterator MI = MachineBasicBlock::iterator(Def);
@@ -340,11 +338,11 @@ void ReachingDefAnalysis::getReachingLocalUses(MachineInstr *Def,
 
     // If/when we find a new reaching def, we know that there's no more uses
     // of 'Def'.
-    if (getReachingLocalMIDef(&*MI, PhysReg) != Def)
+    if (getReachingLocalMIDef(&*MI, Reg) != Def)
       return;
 
     for (auto &MO : MI->operands()) {
-      if (!isValidRegUseOf(MO, PhysReg, TRI))
+      if (!isValidRegUseOf(MO, Reg, TRI))
         continue;
 
       Uses.insert(&*MI);
@@ -354,15 +352,14 @@ void ReachingDefAnalysis::getReachingLocalUses(MachineInstr *Def,
   }
 }
 
-bool ReachingDefAnalysis::getLiveInUses(MachineBasicBlock *MBB,
-                                        MCRegister PhysReg,
+bool ReachingDefAnalysis::getLiveInUses(MachineBasicBlock *MBB, MCRegister Reg,
                                         InstSet &Uses) const {
   for (MachineInstr &MI :
        instructionsWithoutDebug(MBB->instr_begin(), MBB->instr_end())) {
     for (auto &MO : MI.operands()) {
-      if (!isValidRegUseOf(MO, PhysReg, TRI))
+      if (!isValidRegUseOf(MO, Reg, TRI))
         continue;
-      if (getReachingDef(&MI, PhysReg) >= 0)
+      if (getReachingDef(&MI, Reg) >= 0)
         return false;
       Uses.insert(&MI);
     }
@@ -370,18 +367,18 @@ bool ReachingDefAnalysis::getLiveInUses(MachineBasicBlock *MBB,
   auto Last = MBB->getLastNonDebugInstr();
   if (Last == MBB->end())
     return true;
-  return isReachingDefLiveOut(&*Last, PhysReg);
+  return isReachingDefLiveOut(&*Last, Reg);
 }
 
-void ReachingDefAnalysis::getGlobalUses(MachineInstr *MI, MCRegister PhysReg,
+void ReachingDefAnalysis::getGlobalUses(MachineInstr *MI, MCRegister Reg,
                                         InstSet &Uses) const {
   MachineBasicBlock *MBB = MI->getParent();
 
   // Collect the uses that each def touches within the block.
-  getReachingLocalUses(MI, PhysReg, Uses);
+  getReachingLocalUses(MI, Reg, Uses);
 
   // Handle live-out values.
-  if (auto *LiveOut = getLocalLiveOutMIDef(MI->getParent(), PhysReg)) {
+  if (auto *LiveOut = getLocalLiveOutMIDef(MI->getParent(), Reg)) {
     if (LiveOut != MI)
       return;
 
@@ -389,9 +386,9 @@ void ReachingDefAnalysis::getGlobalUses(MachineInstr *MI, MCRegister PhysReg,
     SmallPtrSet<MachineBasicBlock*, 4>Visited;
     while (!ToVisit.empty()) {
       MachineBasicBlock *MBB = ToVisit.pop_back_val();
-      if (Visited.count(MBB) || !MBB->isLiveIn(PhysReg))
+      if (Visited.count(MBB) || !MBB->isLiveIn(Reg))
         continue;
-      if (getLiveInUses(MBB, PhysReg, Uses))
+      if (getLiveInUses(MBB, Reg, Uses))
         llvm::append_range(ToVisit, MBB->successors());
       Visited.insert(MBB);
     }
@@ -399,25 +396,25 @@ void ReachingDefAnalysis::getGlobalUses(MachineInstr *MI, MCRegister PhysReg,
 }
 
 void ReachingDefAnalysis::getGlobalReachingDefs(MachineInstr *MI,
-                                                MCRegister PhysReg,
+                                                MCRegister Reg,
                                                 InstSet &Defs) const {
-  if (auto *Def = getUniqueReachingMIDef(MI, PhysReg)) {
+  if (auto *Def = getUniqueReachingMIDef(MI, Reg)) {
     Defs.insert(Def);
     return;
   }
 
   for (auto *MBB : MI->getParent()->predecessors())
-    getLiveOuts(MBB, PhysReg, Defs);
+    getLiveOuts(MBB, Reg, Defs);
 }
 
-void ReachingDefAnalysis::getLiveOuts(MachineBasicBlock *MBB,
-                                      MCRegister PhysReg, InstSet &Defs) const {
+void ReachingDefAnalysis::getLiveOuts(MachineBasicBlock *MBB, MCRegister Reg,
+                                      InstSet &Defs) const {
   SmallPtrSet<MachineBasicBlock*, 2> VisitedBBs;
-  getLiveOuts(MBB, PhysReg, Defs, VisitedBBs);
+  getLiveOuts(MBB, Reg, Defs, VisitedBBs);
 }
 
-void ReachingDefAnalysis::getLiveOuts(MachineBasicBlock *MBB,
-                                      MCRegister PhysReg, InstSet &Defs,
+void ReachingDefAnalysis::getLiveOuts(MachineBasicBlock *MBB, MCRegister Reg,
+                                      InstSet &Defs,
                                       BlockSet &VisitedBBs) const {
   if (VisitedBBs.count(MBB))
     return;
@@ -425,28 +422,28 @@ void ReachingDefAnalysis::getLiveOuts(MachineBasicBlock *MBB,
   VisitedBBs.insert(MBB);
   LiveRegUnits LiveRegs(*TRI);
   LiveRegs.addLiveOuts(*MBB);
-  if (LiveRegs.available(PhysReg))
+  if (LiveRegs.available(Reg))
     return;
 
-  if (auto *Def = getLocalLiveOutMIDef(MBB, PhysReg))
+  if (auto *Def = getLocalLiveOutMIDef(MBB, Reg))
     Defs.insert(Def);
   else
     for (auto *Pred : MBB->predecessors())
-      getLiveOuts(Pred, PhysReg, Defs, VisitedBBs);
+      getLiveOuts(Pred, Reg, Defs, VisitedBBs);
 }
 
 MachineInstr *
 ReachingDefAnalysis::getUniqueReachingMIDef(MachineInstr *MI,
-                                            MCRegister PhysReg) const {
+                                            MCRegister Reg) const {
   // If there's a local def before MI, return it.
-  MachineInstr *LocalDef = getReachingLocalMIDef(MI, PhysReg);
+  MachineInstr *LocalDef = getReachingLocalMIDef(MI, Reg);
   if (LocalDef && InstIds.lookup(LocalDef) < InstIds.lookup(MI))
     return LocalDef;
 
   SmallPtrSet<MachineInstr*, 2> Incoming;
   MachineBasicBlock *Parent = MI->getParent();
   for (auto *Pred : Parent->predecessors())
-    getLiveOuts(Pred, PhysReg, Incoming);
+    getLiveOuts(Pred, Reg, Incoming);
 
   // Check that we have a single incoming value and that it does not
   // come from the same block as MI - since it would mean that the def
@@ -469,13 +466,13 @@ MachineInstr *ReachingDefAnalysis::getMIOperand(MachineInstr *MI,
 }
 
 bool ReachingDefAnalysis::isRegUsedAfter(MachineInstr *MI,
-                                         MCRegister PhysReg) const {
+                                         MCRegister Reg) const {
   MachineBasicBlock *MBB = MI->getParent();
   LiveRegUnits LiveRegs(*TRI);
   LiveRegs.addLiveOuts(*MBB);
 
   // Yes if the register is live out of the basic block.
-  if (!LiveRegs.available(PhysReg))
+  if (!LiveRegs.available(Reg))
     return true;
 
   // Walk backwards through the block to see if the register is live at some
@@ -483,62 +480,61 @@ bool ReachingDefAnalysis::isRegUsedAfter(MachineInstr *MI,
   for (MachineInstr &Last :
        instructionsWithoutDebug(MBB->instr_rbegin(), MBB->instr_rend())) {
     LiveRegs.stepBackward(Last);
-    if (!LiveRegs.available(PhysReg))
+    if (!LiveRegs.available(Reg))
       return InstIds.lookup(&Last) > InstIds.lookup(MI);
   }
   return false;
 }
 
 bool ReachingDefAnalysis::isRegDefinedAfter(MachineInstr *MI,
-                                            MCRegister PhysReg) const {
+                                            MCRegister Reg) const {
   MachineBasicBlock *MBB = MI->getParent();
   auto Last = MBB->getLastNonDebugInstr();
   if (Last != MBB->end() &&
-      getReachingDef(MI, PhysReg) != getReachingDef(&*Last, PhysReg))
+      getReachingDef(MI, Reg) != getReachingDef(&*Last, Reg))
     return true;
 
-  if (auto *Def = getLocalLiveOutMIDef(MBB, PhysReg))
-    return Def == getReachingLocalMIDef(MI, PhysReg);
+  if (auto *Def = getLocalLiveOutMIDef(MBB, Reg))
+    return Def == getReachingLocalMIDef(MI, Reg);
 
   return false;
 }
 
 bool ReachingDefAnalysis::isReachingDefLiveOut(MachineInstr *MI,
-                                               MCRegister PhysReg) const {
+                                               MCRegister Reg) const {
   MachineBasicBlock *MBB = MI->getParent();
   LiveRegUnits LiveRegs(*TRI);
   LiveRegs.addLiveOuts(*MBB);
-  if (LiveRegs.available(PhysReg))
+  if (LiveRegs.available(Reg))
     return false;
 
   auto Last = MBB->getLastNonDebugInstr();
-  int Def = getReachingDef(MI, PhysReg);
-  if (Last != MBB->end() && getReachingDef(&*Last, PhysReg) != Def)
+  int Def = getReachingDef(MI, Reg);
+  if (Last != MBB->end() && getReachingDef(&*Last, Reg) != Def)
     return false;
 
   // Finally check that the last instruction doesn't redefine the register.
   for (auto &MO : Last->operands())
-    if (isValidRegDefOf(MO, PhysReg, TRI))
+    if (isValidRegDefOf(MO, Reg, TRI))
       return false;
 
   return true;
 }
 
-MachineInstr *
-ReachingDefAnalysis::getLocalLiveOutMIDef(MachineBasicBlock *MBB,
-                                          MCRegister PhysReg) const {
+MachineInstr *ReachingDefAnalysis::getLocalLiveOutMIDef(MachineBasicBlock *MBB,
+                                                        MCRegister Reg) const {
   LiveRegUnits LiveRegs(*TRI);
   LiveRegs.addLiveOuts(*MBB);
-  if (LiveRegs.available(PhysReg))
+  if (LiveRegs.available(Reg))
     return nullptr;
 
   auto Last = MBB->getLastNonDebugInstr();
   if (Last == MBB->end())
     return nullptr;
 
-  int Def = getReachingDef(&*Last, PhysReg);
+  int Def = getReachingDef(&*Last, Reg);
   for (auto &MO : Last->operands())
-    if (isValidRegDefOf(MO, PhysReg, TRI))
+    if (isValidRegDefOf(MO, Reg, TRI))
       return &*Last;
 
   return Def < 0 ? nullptr : getInstFromId(MBB, Def);
@@ -650,7 +646,7 @@ ReachingDefAnalysis::isSafeToRemove(MachineInstr *MI, InstSet &Visited,
 void ReachingDefAnalysis::collectKilledOperands(MachineInstr *MI,
                                                 InstSet &Dead) const {
   Dead.insert(MI);
-  auto IsDead = [this, &Dead](MachineInstr *Def, MCRegister PhysReg) {
+  auto IsDead = [this, &Dead](MachineInstr *Def, MCRegister Reg) {
     if (mayHaveSideEffects(*Def))
       return false;
 
@@ -666,7 +662,7 @@ void ReachingDefAnalysis::collectKilledOperands(MachineInstr *MI,
       return false;
 
     SmallPtrSet<MachineInstr*, 4> Uses;
-    getGlobalUses(Def, PhysReg, Uses);
+    getGlobalUses(Def, Reg, Uses);
     return llvm::set_is_subset(Uses, Dead);
   };
 
@@ -680,18 +676,18 @@ void ReachingDefAnalysis::collectKilledOperands(MachineInstr *MI,
 }
 
 bool ReachingDefAnalysis::isSafeToDefRegAt(MachineInstr *MI,
-                                           MCRegister PhysReg) const {
+                                           MCRegister Reg) const {
   SmallPtrSet<MachineInstr*, 1> Ignore;
-  return isSafeToDefRegAt(MI, PhysReg, Ignore);
+  return isSafeToDefRegAt(MI, Reg, Ignore);
 }
 
-bool ReachingDefAnalysis::isSafeToDefRegAt(MachineInstr *MI, MCRegister PhysReg,
+bool ReachingDefAnalysis::isSafeToDefRegAt(MachineInstr *MI, MCRegister Reg,
                                            InstSet &Ignore) const {
   // Check for any uses of the register after MI.
-  if (isRegUsedAfter(MI, PhysReg)) {
-    if (auto *Def = getReachingLocalMIDef(MI, PhysReg)) {
+  if (isRegUsedAfter(MI, Reg)) {
+    if (auto *Def = getReachingLocalMIDef(MI, Reg)) {
       SmallPtrSet<MachineInstr*, 2> Uses;
-      getGlobalUses(Def, PhysReg, Uses);
+      getGlobalUses(Def, Reg, Uses);
       if (!llvm::set_is_subset(Uses, Ignore))
         return false;
     } else
@@ -700,13 +696,13 @@ bool ReachingDefAnalysis::isSafeToDefRegAt(MachineInstr *MI, MCRegister PhysReg,
 
   MachineBasicBlock *MBB = MI->getParent();
   // Check for any defs after MI.
-  if (isRegDefinedAfter(MI, PhysReg)) {
+  if (isRegDefinedAfter(MI, Reg)) {
     auto I = MachineBasicBlock::iterator(MI);
     for (auto E = MBB->end(); I != E; ++I) {
       if (Ignore.count(&*I))
         continue;
       for (auto &MO : I->operands())
-        if (isValidRegDefOf(MO, PhysReg, TRI))
+        if (isValidRegDefOf(MO, Reg, TRI))
           return false;
     }
   }
diff --git a/llvm/lib/CodeGen/RegAllocGreedy.cpp b/llvm/lib/CodeGen/RegAllocGreedy.cpp
index 4fa2bc7..b94992c 100644
--- a/llvm/lib/CodeGen/RegAllocGreedy.cpp
+++ b/llvm/lib/CodeGen/RegAllocGreedy.cpp
@@ -140,7 +140,7 @@ static cl::opt<bool> GreedyReverseLocalAssignment(
 static cl::opt<unsigned> SplitThresholdForRegWithHint(
     "split-threshold-for-reg-with-hint",
     cl::desc("The threshold for splitting a virtual register with a hint, in "
-             "percentate"),
+             "percentage"),
     cl::init(75), cl::Hidden);
 
 static RegisterRegAlloc greedyRegAlloc("greedy", "greedy register allocator",
@@ -376,6 +376,12 @@ unsigned DefaultPriorityAdvisor::getPriority(const LiveInterval &LI) const {
   return Prio;
 }
 
+unsigned DummyPriorityAdvisor::getPriority(const LiveInterval &LI) const {
+  // Prioritize by virtual register number, lowest first.
+  Register Reg = LI.reg();
+  return ~Reg.virtRegIndex();
+}
+
 const LiveInterval *RAGreedy::dequeue() { return dequeue(Queue); }
 
 const LiveInterval *RAGreedy::dequeue(PQueue &CurQueue) {
@@ -2029,6 +2035,9 @@ unsigned RAGreedy::tryLastChanceRecoloring(const LiveInterval &VirtReg,
     // available colors.
     Matrix->assign(VirtReg, PhysReg);
 
+    // VirtReg may be deleted during tryRecoloringCandidates, save a copy.
+    Register ThisVirtReg = VirtReg.reg();
+
     // Save the current recoloring state.
     // If we cannot recolor all the interferences, we will have to start again
     // at this point for the next physical register.
@@ -2040,8 +2049,16 @@ unsigned RAGreedy::tryLastChanceRecoloring(const LiveInterval &VirtReg,
         NewVRegs.push_back(NewVReg);
       // Do not mess up with the global assignment process.
       // I.e., VirtReg must be unassigned.
-      Matrix->unassign(VirtReg);
-      return PhysReg;
+      if (VRM->hasPhys(ThisVirtReg)) {
+        Matrix->unassign(VirtReg);
+        return PhysReg;
+      }
+
+      // It is possible VirtReg will be deleted during tryRecoloringCandidates.
+      LLVM_DEBUG(dbgs() << "tryRecoloringCandidates deleted a fixed register "
+                        << printReg(ThisVirtReg) << '\n');
+      FixedRegisters.erase(ThisVirtReg);
+      return 0;
     }
 
     LLVM_DEBUG(dbgs() << "Fail to assign: " << VirtReg << " to "
diff --git a/llvm/lib/CodeGen/RegAllocPriorityAdvisor.cpp b/llvm/lib/CodeGen/RegAllocPriorityAdvisor.cpp
index 0650aaf..4525b8f 100644
--- a/llvm/lib/CodeGen/RegAllocPriorityAdvisor.cpp
+++ b/llvm/lib/CodeGen/RegAllocPriorityAdvisor.cpp
@@ -30,7 +30,10 @@ static cl::opt<RegAllocPriorityAdvisorAnalysis::AdvisorMode> Mode(
         clEnumValN(RegAllocPriorityAdvisorAnalysis::AdvisorMode::Release,
                    "release", "precompiled"),
         clEnumValN(RegAllocPriorityAdvisorAnalysis::AdvisorMode::Development,
-                   "development", "for training")));
+                   "development", "for training"),
+        clEnumValN(
+            RegAllocPriorityAdvisorAnalysis::AdvisorMode::Dummy, "dummy",
+            "prioritize low virtual register numbers for test and debug")));
 
 char RegAllocPriorityAdvisorAnalysis::ID = 0;
 INITIALIZE_PASS(RegAllocPriorityAdvisorAnalysis, "regalloc-priority",
@@ -67,6 +70,31 @@ private:
   }
   const bool NotAsRequested;
 };
+
+class DummyPriorityAdvisorAnalysis final
+    : public RegAllocPriorityAdvisorAnalysis {
+public:
+  DummyPriorityAdvisorAnalysis()
+      : RegAllocPriorityAdvisorAnalysis(AdvisorMode::Dummy) {}
+
+  // support for isa<> and dyn_cast.
+  static bool classof(const RegAllocPriorityAdvisorAnalysis *R) {
+    return R->getAdvisorMode() == AdvisorMode::Dummy;
+  }
+
+private:
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<SlotIndexesWrapperPass>();
+    RegAllocPriorityAdvisorAnalysis::getAnalysisUsage(AU);
+  }
+
+  std::unique_ptr<RegAllocPriorityAdvisor>
+  getAdvisor(const MachineFunction &MF, const RAGreedy &RA) override {
+    return std::make_unique<DummyPriorityAdvisor>(
+        MF, RA, &getAnalysis<SlotIndexesWrapperPass>().getSI());
+  }
+};
+
 } // namespace
 
 template <> Pass *llvm::callDefaultCtor<RegAllocPriorityAdvisorAnalysis>() {
@@ -75,6 +103,9 @@ template <> Pass *llvm::callDefaultCtor<RegAllocPriorityAdvisorAnalysis>() {
   case RegAllocPriorityAdvisorAnalysis::AdvisorMode::Default:
     Ret = new DefaultPriorityAdvisorAnalysis(/*NotAsRequested*/ false);
     break;
+  case RegAllocPriorityAdvisorAnalysis::AdvisorMode::Dummy:
+    Ret = new DummyPriorityAdvisorAnalysis();
+    break;
   case RegAllocPriorityAdvisorAnalysis::AdvisorMode::Development:
 #if defined(LLVM_HAVE_TFLITE)
     Ret = createDevelopmentModePriorityAdvisor();
@@ -97,6 +128,8 @@ StringRef RegAllocPriorityAdvisorAnalysis::getPassName() const {
     return "Release mode Regalloc Priority Advisor";
   case AdvisorMode::Development:
     return "Development mode Regalloc Priority Advisor";
+  case AdvisorMode::Dummy:
+    return "Dummy Regalloc Priority Advisor";
   }
   llvm_unreachable("Unknown advisor kind");
 }
diff --git a/llvm/lib/CodeGen/RegAllocPriorityAdvisor.h b/llvm/lib/CodeGen/RegAllocPriorityAdvisor.h
index 1e9fa96..32e4598 100644
--- a/llvm/lib/CodeGen/RegAllocPriorityAdvisor.h
+++ b/llvm/lib/CodeGen/RegAllocPriorityAdvisor.h
@@ -56,9 +56,21 @@ private:
   unsigned getPriority(const LiveInterval &LI) const override;
 };
 
+/// Stupid priority advisor which just enqueues in virtual register number
+/// order, for debug purposes only.
+class DummyPriorityAdvisor : public RegAllocPriorityAdvisor {
+public:
+  DummyPriorityAdvisor(const MachineFunction &MF, const RAGreedy &RA,
+                       SlotIndexes *const Indexes)
+      : RegAllocPriorityAdvisor(MF, RA, Indexes) {}
+
+private:
+  unsigned getPriority(const LiveInterval &LI) const override;
+};
+
 class RegAllocPriorityAdvisorAnalysis : public ImmutablePass {
 public:
-  enum class AdvisorMode : int { Default, Release, Development };
+  enum class AdvisorMode : int { Default, Release, Development, Dummy };
 
   RegAllocPriorityAdvisorAnalysis(AdvisorMode Mode)
       : ImmutablePass(ID), Mode(Mode){};
diff --git a/llvm/lib/CodeGen/RegisterCoalescer.cpp b/llvm/lib/CodeGen/RegisterCoalescer.cpp
index 20ad644..8313927 100644
--- a/llvm/lib/CodeGen/RegisterCoalescer.cpp
+++ b/llvm/lib/CodeGen/RegisterCoalescer.cpp
@@ -113,7 +113,7 @@ static cl::opt<unsigned> LargeIntervalSizeThreshold(
 
 static cl::opt<unsigned> LargeIntervalFreqThreshold(
     "large-interval-freq-threshold", cl::Hidden,
-    cl::desc("For a large interval, if it is coalesed with other live "
+    cl::desc("For a large interval, if it is coalesced with other live "
              "intervals many times more than the threshold, stop its "
              "coalescing to control the compile time. "),
     cl::init(256));
@@ -1325,11 +1325,6 @@ bool RegisterCoalescer::reMaterializeTrivialDef(const CoalescerPair &CP,
   const MCInstrDesc &MCID = DefMI->getDesc();
   if (MCID.getNumDefs() != 1)
     return false;
-  // Only support subregister destinations when the def is read-undef.
-  MachineOperand &DstOperand = CopyMI->getOperand(0);
-  Register CopyDstReg = DstOperand.getReg();
-  if (DstOperand.getSubReg() && !DstOperand.isUndef())
-    return false;
 
   // If both SrcIdx and DstIdx are set, correct rematerialization would widen
   // the register substantially (beyond both source and dest size). This is bad
@@ -1339,6 +1334,32 @@ bool RegisterCoalescer::reMaterializeTrivialDef(const CoalescerPair &CP,
   if (SrcIdx && DstIdx)
     return false;
 
+  // Only support subregister destinations when the def is read-undef.
+  MachineOperand &DstOperand = CopyMI->getOperand(0);
+  Register CopyDstReg = DstOperand.getReg();
+  if (DstOperand.getSubReg() && !DstOperand.isUndef())
+    return false;
+
+  // In the physical register case, checking that the def is read-undef is not
+  // enough. We're widening the def and need to avoid clobbering other live
+  // values in the unused register pieces.
+  //
+  // TODO: Targets may support rewriting the rematerialized instruction to only
+  // touch relevant lanes, in which case we don't need any liveness check.
+  if (CopyDstReg.isPhysical() && CP.isPartial()) {
+    for (MCRegUnit Unit : TRI->regunits(DstReg)) {
+      // Ignore the register units we are writing anyway.
+      if (is_contained(TRI->regunits(CopyDstReg), Unit))
+        continue;
+
+      // Check if the other lanes we are defining are live at the
+      // rematerialization point.
+      LiveRange &LR = LIS->getRegUnit(Unit);
+      if (LR.liveAt(CopyIdx))
+        return false;
+    }
+  }
+
   const unsigned DefSubIdx = DefMI->getOperand(0).getSubReg();
   const TargetRegisterClass *DefRC = TII->getRegClass(MCID, 0, TRI, *MF);
   if (!DefMI->isImplicitDef()) {
@@ -1375,27 +1396,6 @@ bool RegisterCoalescer::reMaterializeTrivialDef(const CoalescerPair &CP,
   NewMI.setDebugLoc(DL);
 
   // In a situation like the following:
-  //
-  //    undef %2.subreg:reg = INST %1:reg         ; DefMI (rematerializable),
-  //                                              ; DefSubIdx = subreg
-  //    %3:reg = COPY %2                          ; SrcIdx = DstIdx = 0
-  //    .... = SOMEINSTR %3:reg
-  //
-  // there are no subranges for %3 so after rematerialization we need
-  // to explicitly create them. Undefined subranges are removed later on.
-  if (DstReg.isVirtual() && DefSubIdx && !CP.getSrcIdx() && !CP.getDstIdx() &&
-      MRI->shouldTrackSubRegLiveness(DstReg)) {
-    LiveInterval &DstInt = LIS->getInterval(DstReg);
-    if (!DstInt.hasSubRanges()) {
-      LaneBitmask FullMask = MRI->getMaxLaneMaskForVReg(DstReg);
-      LaneBitmask UsedLanes = TRI->getSubRegIndexLaneMask(DefSubIdx);
-      LaneBitmask UnusedLanes = FullMask & ~UsedLanes;
-      DstInt.createSubRangeFrom(LIS->getVNInfoAllocator(), UsedLanes, DstInt);
-      DstInt.createSubRangeFrom(LIS->getVNInfoAllocator(), UnusedLanes, DstInt);
-    }
-  }
-
-  // In a situation like the following:
   //     %0:subreg = instr              ; DefMI, subreg = DstIdx
   //     %1        = copy %0:subreg ; CopyMI, SrcIdx = 0
   // instead of widening %1 to the register class of %0 simply do:
@@ -1523,6 +1523,27 @@ bool RegisterCoalescer::reMaterializeTrivialDef(const CoalescerPair &CP,
     // sure that "undef" is not set.
     if (NewIdx == 0)
       NewMI.getOperand(0).setIsUndef(false);
+
+    // In a situation like the following:
+    //
+    //    undef %2.subreg:reg = INST %1:reg    ; DefMI (rematerializable),
+    //                                         ; Defines only some of lanes,
+    //                                         ; so DefSubIdx = NewIdx = subreg
+    //    %3:reg = COPY %2                     ; Copy full reg
+    //    .... = SOMEINSTR %3:reg              ; Use full reg
+    //
+    // there are no subranges for %3 so after rematerialization we need
+    // to explicitly create them. Undefined subranges are removed later on.
+    if (NewIdx && !DstInt.hasSubRanges() &&
+        MRI->shouldTrackSubRegLiveness(DstReg)) {
+      LaneBitmask FullMask = MRI->getMaxLaneMaskForVReg(DstReg);
+      LaneBitmask UsedLanes = TRI->getSubRegIndexLaneMask(NewIdx);
+      LaneBitmask UnusedLanes = FullMask & ~UsedLanes;
+      VNInfo::Allocator &Alloc = LIS->getVNInfoAllocator();
+      DstInt.createSubRangeFrom(Alloc, UsedLanes, DstInt);
+      DstInt.createSubRangeFrom(Alloc, UnusedLanes, DstInt);
+    }
+
     // Add dead subregister definitions if we are defining the whole register
     // but only part of it is live.
     // This could happen if the rematerialization instruction is rematerializing
diff --git a/llvm/lib/CodeGen/SelectOptimize.cpp b/llvm/lib/CodeGen/SelectOptimize.cpp
index 7b927e6..bfc49dd 100644
--- a/llvm/lib/CodeGen/SelectOptimize.cpp
+++ b/llvm/lib/CodeGen/SelectOptimize.cpp
@@ -1044,6 +1044,18 @@ bool SelectOptimizeImpl::isConvertToBranchProfitableBase(
     return true;
   }
 
+  // If latch has a select group with several elements, it is usually profitable
+  // to convert it to branches. We let `optimizeSelectsInnerLoops` decide if
+  // conversion is profitable for innermost loops.
+  auto *BB = SI.getI()->getParent();
+  auto *L = LI->getLoopFor(BB);
+  if (L && !L->isInnermost() && L->getLoopLatch() == BB &&
+      ASI.Selects.size() >= 3) {
+    OR << "Converted to branch because select group in the latch block is big.";
+    EmitAndPrintRemark(ORE, OR);
+    return true;
+  }
+
   ORmiss << "Not profitable to convert to branch (base heuristic).";
   EmitAndPrintRemark(ORE, ORmiss);
   return false;
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 6cbfef2..da3c834 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -141,7 +141,7 @@ static cl::opt<bool> EnableReduceLoadOpStoreWidth(
 static cl::opt<bool> ReduceLoadOpStoreWidthForceNarrowingProfitable(
     "combiner-reduce-load-op-store-width-force-narrowing-profitable",
     cl::Hidden, cl::init(false),
-    cl::desc("DAG combiner force override the narrowing profitable check when"
+    cl::desc("DAG combiner force override the narrowing profitable check when "
              "reducing the width of load/op/store sequences"));
 
 static cl::opt<bool> EnableShrinkLoadReplaceStoreWithStore(
@@ -3949,6 +3949,23 @@ SDValue DAGCombiner::visitSUB(SDNode *N) {
       if (SDValue Result = TLI.expandABS(N1.getNode(), DAG, true))
         return Result;
 
+    // Similar to the previous rule, but this time targeting an expanded abs.
+    // (sub 0, (max X, (sub 0, X))) --> (min X, (sub 0, X))
+    // as well as
+    // (sub 0, (min X, (sub 0, X))) --> (max X, (sub 0, X))
+    // Note that these two are applicable to both signed and unsigned min/max.
+    SDValue X;
+    SDValue S0;
+    auto NegPat = m_AllOf(m_Neg(m_Deferred(X)), m_Value(S0));
+    if (sd_match(N1, m_OneUse(m_AnyOf(m_SMax(m_Value(X), NegPat),
+                                      m_UMax(m_Value(X), NegPat),
+                                      m_SMin(m_Value(X), NegPat),
+                                      m_UMin(m_Value(X), NegPat))))) {
+      unsigned NewOpc = ISD::getInverseMinMaxOpcode(N1->getOpcode());
+      if (hasOperation(NewOpc, VT))
+        return DAG.getNode(NewOpc, DL, VT, X, S0);
+    }
+
     // Fold neg(splat(neg(x)) -> splat(x)
     if (VT.isVector()) {
       SDValue N1S = DAG.getSplatValue(N1, true);
@@ -20438,10 +20455,8 @@ SDValue DAGCombiner::TransformFPLoadStorePair(SDNode *N) {
       Value.hasOneUse()) {
     LoadSDNode *LD = cast<LoadSDNode>(Value);
     EVT VT = LD->getMemoryVT();
-    if (!VT.isFloatingPoint() ||
-        VT != ST->getMemoryVT() ||
-        LD->isNonTemporal() ||
-        ST->isNonTemporal() ||
+    if (!VT.isSimple() || !VT.isFloatingPoint() || VT != ST->getMemoryVT() ||
+        LD->isNonTemporal() || ST->isNonTemporal() ||
         LD->getPointerInfo().getAddrSpace() != 0 ||
         ST->getPointerInfo().getAddrSpace() != 0)
       return SDValue();
@@ -23088,8 +23103,11 @@ SDValue DAGCombiner::visitEXTRACT_VECTOR_ELT(SDNode *N) {
     if (ExtractIndex == BCTruncElt && BCSrc.getValueType().isScalarInteger())
       return DAG.getAnyExtOrTrunc(BCSrc, DL, ScalarVT);
 
+    // TODO: Add support for SCALAR_TO_VECTOR implicit truncation.
     if (LegalTypes && BCSrc.getValueType().isInteger() &&
-        BCSrc.getOpcode() == ISD::SCALAR_TO_VECTOR) {
+        BCSrc.getOpcode() == ISD::SCALAR_TO_VECTOR &&
+        BCSrc.getScalarValueSizeInBits() ==
+            BCSrc.getOperand(0).getScalarValueSizeInBits()) {
       // ext_elt (bitcast (scalar_to_vec i64 X to v2i64) to v4i32), TruncElt -->
       // trunc i64 X to i32
       SDValue X = BCSrc.getOperand(0);
@@ -24288,8 +24306,8 @@ static SDValue combineConcatVectorOfScalars(SDNode *N, SelectionDAG &DAG) {
   EVT SVT = EVT::getIntegerVT(*DAG.getContext(), OpVT.getSizeInBits());
 
   // Keep track of what we encounter.
-  bool AnyInteger = false;
-  bool AnyFP = false;
+  EVT AnyFPVT;
+
   for (const SDValue &Op : N->ops()) {
     if (ISD::BITCAST == Op.getOpcode() &&
         !Op.getOperand(0).getValueType().isVector())
@@ -24303,27 +24321,23 @@ static SDValue combineConcatVectorOfScalars(SDNode *N, SelectionDAG &DAG) {
     // If it's neither, bail out, it could be something weird like x86mmx.
     EVT LastOpVT = Ops.back().getValueType();
     if (LastOpVT.isFloatingPoint())
-      AnyFP = true;
-    else if (LastOpVT.isInteger())
-      AnyInteger = true;
-    else
+      AnyFPVT = LastOpVT;
+    else if (!LastOpVT.isInteger())
       return SDValue();
   }
 
   // If any of the operands is a floating point scalar bitcast to a vector,
   // use floating point types throughout, and bitcast everything.
   // Replace UNDEFs by another scalar UNDEF node, of the final desired type.
-  if (AnyFP) {
-    SVT = EVT::getFloatingPointVT(OpVT.getSizeInBits());
-    if (AnyInteger) {
-      for (SDValue &Op : Ops) {
-        if (Op.getValueType() == SVT)
-          continue;
-        if (Op.isUndef())
-          Op = DAG.getNode(ISD::UNDEF, DL, SVT);
-        else
-          Op = DAG.getBitcast(SVT, Op);
-      }
+  if (AnyFPVT != EVT()) {
+    SVT = AnyFPVT;
+    for (SDValue &Op : Ops) {
+      if (Op.getValueType() == SVT)
+        continue;
+      if (Op.isUndef())
+        Op = DAG.getNode(ISD::UNDEF, DL, SVT);
+      else
+        Op = DAG.getBitcast(SVT, Op);
     }
   }
 
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
index db21e70..89a00c5 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
@@ -402,6 +402,8 @@ SDValue VectorLegalizer::LegalizeOp(SDValue Op) {
   case ISD::FMAXNUM_IEEE:
   case ISD::FMINIMUM:
   case ISD::FMAXIMUM:
+  case ISD::FMINIMUMNUM:
+  case ISD::FMAXIMUMNUM:
   case ISD::FCOPYSIGN:
   case ISD::FSQRT:
   case ISD::FSIN:
@@ -1081,6 +1083,10 @@ void VectorLegalizer::Expand(SDNode *Node, SmallVectorImpl<SDValue> &Results) {
   case ISD::FMAXIMUM:
     Results.push_back(TLI.expandFMINIMUM_FMAXIMUM(Node, DAG));
     return;
+  case ISD::FMINIMUMNUM:
+  case ISD::FMAXIMUMNUM:
+    Results.push_back(TLI.expandFMINIMUMNUM_FMAXIMUMNUM(Node, DAG));
+    return;
   case ISD::SMIN:
   case ISD::SMAX:
   case ISD::UMIN:
@@ -1738,7 +1744,8 @@ void VectorLegalizer::ExpandUINT_TO_FLOAT(SDNode *Node,
   bool IsStrict = Node->isStrictFPOpcode();
   unsigned OpNo = IsStrict ? 1 : 0;
   SDValue Src = Node->getOperand(OpNo);
-  EVT VT = Src.getValueType();
+  EVT SrcVT = Src.getValueType();
+  EVT DstVT = Node->getValueType(0);
   SDLoc DL(Node);
 
   // Attempt to expand using TargetLowering.
@@ -1752,11 +1759,11 @@ void VectorLegalizer::ExpandUINT_TO_FLOAT(SDNode *Node,
   }
 
   // Make sure that the SINT_TO_FP and SRL instructions are available.
-  if (((!IsStrict && TLI.getOperationAction(ISD::SINT_TO_FP, VT) ==
+  if (((!IsStrict && TLI.getOperationAction(ISD::SINT_TO_FP, SrcVT) ==
                          TargetLowering::Expand) ||
-       (IsStrict && TLI.getOperationAction(ISD::STRICT_SINT_TO_FP, VT) ==
+       (IsStrict && TLI.getOperationAction(ISD::STRICT_SINT_TO_FP, SrcVT) ==
                         TargetLowering::Expand)) ||
-      TLI.getOperationAction(ISD::SRL, VT) == TargetLowering::Expand) {
+      TLI.getOperationAction(ISD::SRL, SrcVT) == TargetLowering::Expand) {
     if (IsStrict) {
       UnrollStrictFPOp(Node, Results);
       return;
@@ -1766,37 +1773,59 @@ void VectorLegalizer::ExpandUINT_TO_FLOAT(SDNode *Node,
     return;
   }
 
-  unsigned BW = VT.getScalarSizeInBits();
+  unsigned BW = SrcVT.getScalarSizeInBits();
   assert((BW == 64 || BW == 32) &&
          "Elements in vector-UINT_TO_FP must be 32 or 64 bits wide");
 
-  SDValue HalfWord = DAG.getConstant(BW / 2, DL, VT);
+  // If STRICT_/FMUL is not supported by the target (in case of f16) replace the
+  // UINT_TO_FP with a larger float and round to the smaller type
+  if ((!IsStrict && !TLI.isOperationLegalOrCustom(ISD::FMUL, DstVT)) ||
+      (IsStrict && !TLI.isOperationLegalOrCustom(ISD::STRICT_FMUL, DstVT))) {
+    EVT FPVT = BW == 32 ? MVT::f32 : MVT::f64;
+    SDValue UIToFP;
+    SDValue Result;
+    SDValue TargetZero = DAG.getIntPtrConstant(0, DL, /*isTarget=*/true);
+    EVT FloatVecVT = SrcVT.changeVectorElementType(FPVT);
+    if (IsStrict) {
+      UIToFP = DAG.getNode(ISD::STRICT_UINT_TO_FP, DL, {FloatVecVT, MVT::Other},
+                           {Node->getOperand(0), Src});
+      Result = DAG.getNode(ISD::STRICT_FP_ROUND, DL, {DstVT, MVT::Other},
+                           {Node->getOperand(0), UIToFP, TargetZero});
+      Results.push_back(Result);
+      Results.push_back(Result.getValue(1));
+    } else {
+      UIToFP = DAG.getNode(ISD::UINT_TO_FP, DL, FloatVecVT, Src);
+      Result = DAG.getNode(ISD::FP_ROUND, DL, DstVT, UIToFP, TargetZero);
+      Results.push_back(Result);
+    }
+
+    return;
+  }
+
+  SDValue HalfWord = DAG.getConstant(BW / 2, DL, SrcVT);
 
   // Constants to clear the upper part of the word.
   // Notice that we can also use SHL+SHR, but using a constant is slightly
   // faster on x86.
   uint64_t HWMask = (BW == 64) ? 0x00000000FFFFFFFF : 0x0000FFFF;
-  SDValue HalfWordMask = DAG.getConstant(HWMask, DL, VT);
+  SDValue HalfWordMask = DAG.getConstant(HWMask, DL, SrcVT);
 
   // Two to the power of half-word-size.
-  SDValue TWOHW =
-      DAG.getConstantFP(1ULL << (BW / 2), DL, Node->getValueType(0));
+  SDValue TWOHW = DAG.getConstantFP(1ULL << (BW / 2), DL, DstVT);
 
   // Clear upper part of LO, lower HI
-  SDValue HI = DAG.getNode(ISD::SRL, DL, VT, Src, HalfWord);
-  SDValue LO = DAG.getNode(ISD::AND, DL, VT, Src, HalfWordMask);
+  SDValue HI = DAG.getNode(ISD::SRL, DL, SrcVT, Src, HalfWord);
+  SDValue LO = DAG.getNode(ISD::AND, DL, SrcVT, Src, HalfWordMask);
 
   if (IsStrict) {
     // Convert hi and lo to floats
     // Convert the hi part back to the upper values
     // TODO: Can any fast-math-flags be set on these nodes?
-    SDValue fHI = DAG.getNode(ISD::STRICT_SINT_TO_FP, DL,
-                              {Node->getValueType(0), MVT::Other},
+    SDValue fHI = DAG.getNode(ISD::STRICT_SINT_TO_FP, DL, {DstVT, MVT::Other},
                               {Node->getOperand(0), HI});
-    fHI = DAG.getNode(ISD::STRICT_FMUL, DL, {Node->getValueType(0), MVT::Other},
+    fHI = DAG.getNode(ISD::STRICT_FMUL, DL, {DstVT, MVT::Other},
                       {fHI.getValue(1), fHI, TWOHW});
-    SDValue fLO = DAG.getNode(ISD::STRICT_SINT_TO_FP, DL,
-                              {Node->getValueType(0), MVT::Other},
+    SDValue fLO = DAG.getNode(ISD::STRICT_SINT_TO_FP, DL, {DstVT, MVT::Other},
                               {Node->getOperand(0), LO});
 
     SDValue TF = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, fHI.getValue(1),
@@ -1804,8 +1833,7 @@ void VectorLegalizer::ExpandUINT_TO_FLOAT(SDNode *Node,
 
     // Add the two halves
     SDValue Result =
-        DAG.getNode(ISD::STRICT_FADD, DL, {Node->getValueType(0), MVT::Other},
-                    {TF, fHI, fLO});
+        DAG.getNode(ISD::STRICT_FADD, DL, {DstVT, MVT::Other}, {TF, fHI, fLO});
 
     Results.push_back(Result);
     Results.push_back(Result.getValue(1));
@@ -1815,13 +1843,12 @@ void VectorLegalizer::ExpandUINT_TO_FLOAT(SDNode *Node,
   // Convert hi and lo to floats
   // Convert the hi part back to the upper values
   // TODO: Can any fast-math-flags be set on these nodes?
-  SDValue fHI = DAG.getNode(ISD::SINT_TO_FP, DL, Node->getValueType(0), HI);
-  fHI = DAG.getNode(ISD::FMUL, DL, Node->getValueType(0), fHI, TWOHW);
-  SDValue fLO = DAG.getNode(ISD::SINT_TO_FP, DL, Node->getValueType(0), LO);
+  SDValue fHI = DAG.getNode(ISD::SINT_TO_FP, DL, DstVT, HI);
+  fHI = DAG.getNode(ISD::FMUL, DL, DstVT, fHI, TWOHW);
+  SDValue fLO = DAG.getNode(ISD::SINT_TO_FP, DL, DstVT, LO);
 
   // Add the two halves
-  Results.push_back(
-      DAG.getNode(ISD::FADD, DL, Node->getValueType(0), fHI, fLO));
+  Results.push_back(DAG.getNode(ISD::FADD, DL, DstVT, fHI, fLO));
 }
 
 SDValue VectorLegalizer::ExpandFNEG(SDNode *Node) {
@@ -2246,11 +2273,13 @@ SDValue VectorLegalizer::UnrollVSETCC(SDNode *Node) {
                                   DAG.getVectorIdxConstant(i, dl));
     SDValue RHSElem = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, TmpEltVT, RHS,
                                   DAG.getVectorIdxConstant(i, dl));
+    // FIXME: We should use i1 setcc + boolext here, but it causes regressions.
     Ops[i] = DAG.getNode(ISD::SETCC, dl,
                          TLI.getSetCCResultType(DAG.getDataLayout(),
                                                 *DAG.getContext(), TmpEltVT),
                          LHSElem, RHSElem, CC);
-    Ops[i] = DAG.getSelect(dl, EltVT, Ops[i], DAG.getAllOnesConstant(dl, EltVT),
+    Ops[i] = DAG.getSelect(dl, EltVT, Ops[i],
+                           DAG.getBoolConstant(true, dl, EltVT, VT),
                            DAG.getConstant(0, dl, EltVT));
   }
   return DAG.getBuildVector(VT, dl, Ops);
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
index 107454a..780eba1 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -149,6 +149,8 @@ void DAGTypeLegalizer::ScalarizeVectorResult(SDNode *N, unsigned ResNo) {
   case ISD::FMAXNUM_IEEE:
   case ISD::FMINIMUM:
   case ISD::FMAXIMUM:
+  case ISD::FMINIMUMNUM:
+  case ISD::FMAXIMUMNUM:
   case ISD::FLDEXP:
   case ISD::ABDS:
   case ISD::ABDU:
diff --git a/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp b/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp
index 9e5867c..51ee3cc 100644
--- a/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp
@@ -125,9 +125,9 @@ static cl::opt<int> MaxReorderWindow(
   cl::desc("Number of instructions to allow ahead of the critical path "
            "in sched=list-ilp"));
 
-static cl::opt<unsigned> AvgIPC(
-  "sched-avg-ipc", cl::Hidden, cl::init(1),
-  cl::desc("Average inst/cycle whan no target itinerary exists."));
+static cl::opt<unsigned>
+    AvgIPC("sched-avg-ipc", cl::Hidden, cl::init(1),
+           cl::desc("Average inst/cycle when no target itinerary exists."));
 
 namespace {
 
diff --git a/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp b/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp
index 26fc75c..dff7243 100644
--- a/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp
@@ -43,9 +43,9 @@ STATISTIC(LoadsClustered, "Number of loads clustered together");
 // without a target itinerary. The choice of number here has more to do with
 // balancing scheduler heuristics than with the actual machine latency.
 static cl::opt<int> HighLatencyCycles(
-  "sched-high-latency-cycles", cl::Hidden, cl::init(10),
-  cl::desc("Roughly estimate the number of cycles that 'long latency'"
-           "instructions take for targets with no itinerary"));
+    "sched-high-latency-cycles", cl::Hidden, cl::init(10),
+    cl::desc("Roughly estimate the number of cycles that 'long latency' "
+             "instructions take for targets with no itinerary"));
 
 ScheduleDAGSDNodes::ScheduleDAGSDNodes(MachineFunction &mf)
     : ScheduleDAG(mf), InstrItins(mf.getSubtarget().getInstrItineraryData()) {}
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index 10e8ba9..0dfd030 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -430,6 +430,21 @@ bool ISD::matchBinaryPredicate(
   return true;
 }
 
+ISD::NodeType ISD::getInverseMinMaxOpcode(unsigned MinMaxOpc) {
+  switch (MinMaxOpc) {
+  default:
+    llvm_unreachable("unrecognized opcode");
+  case ISD::UMIN:
+    return ISD::UMAX;
+  case ISD::UMAX:
+    return ISD::UMIN;
+  case ISD::SMIN:
+    return ISD::SMAX;
+  case ISD::SMAX:
+    return ISD::SMIN;
+  }
+}
+
 ISD::NodeType ISD::getVecReduceBaseOpcode(unsigned VecReduceOpcode) {
   switch (VecReduceOpcode) {
   default:
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index e87d809..9f57884 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -8435,7 +8435,6 @@ bool TargetLowering::expandUINT_TO_FP(SDNode *Node, SDValue &Result,
     return false;
 
   SDLoc dl(SDValue(Node, 0));
-  EVT ShiftVT = getShiftAmountTy(SrcVT, DAG.getDataLayout());
 
   // Implementation of unsigned i64 to f64 following the algorithm in
   // __floatundidf in compiler_rt.  This implementation performs rounding
@@ -8448,7 +8447,7 @@ bool TargetLowering::expandUINT_TO_FP(SDNode *Node, SDValue &Result,
       llvm::bit_cast<double>(UINT64_C(0x4530000000100000)), dl, DstVT);
   SDValue TwoP84 = DAG.getConstant(UINT64_C(0x4530000000000000), dl, SrcVT);
   SDValue LoMask = DAG.getConstant(UINT64_C(0x00000000FFFFFFFF), dl, SrcVT);
-  SDValue HiShift = DAG.getConstant(32, dl, ShiftVT);
+  SDValue HiShift = DAG.getShiftAmountConstant(32, SrcVT, dl);
 
   SDValue Lo = DAG.getNode(ISD::AND, dl, SrcVT, Src, LoMask);
   SDValue Hi = DAG.getNode(ISD::SRL, dl, SrcVT, Src, HiShift);
diff --git a/llvm/lib/CodeGen/StackMapLivenessAnalysis.cpp b/llvm/lib/CodeGen/StackMapLivenessAnalysis.cpp
index 687acd9..8437422 100644
--- a/llvm/lib/CodeGen/StackMapLivenessAnalysis.cpp
+++ b/llvm/lib/CodeGen/StackMapLivenessAnalysis.cpp
@@ -106,8 +106,6 @@ bool StackMapLiveness::runOnMachineFunction(MachineFunction &MF) {
   if (!EnablePatchPointLiveness)
     return false;
 
-  LLVM_DEBUG(dbgs() << "********** COMPUTING STACKMAP LIVENESS: "
-                    << MF.getName() << " **********\n");
   TRI = MF.getSubtarget().getRegisterInfo();
   ++NumStackMapFuncVisited;
 
@@ -121,6 +119,8 @@ bool StackMapLiveness::runOnMachineFunction(MachineFunction &MF) {
 
 /// Performs the actual liveness calculation for the function.
 bool StackMapLiveness::calculateLiveness(MachineFunction &MF) {
+  LLVM_DEBUG(dbgs() << "********** COMPUTING STACKMAP LIVENESS: "
+                    << MF.getName() << " **********\n");
   bool HasChanged = false;
   // For all basic blocks in the function.
   for (auto &MBB : MF) {
diff --git a/llvm/lib/CodeGen/SwiftErrorValueTracking.cpp b/llvm/lib/CodeGen/SwiftErrorValueTracking.cpp
index 74a94d6..decffdc 100644
--- a/llvm/lib/CodeGen/SwiftErrorValueTracking.cpp
+++ b/llvm/lib/CodeGen/SwiftErrorValueTracking.cpp
@@ -259,7 +259,7 @@ void SwiftErrorValueTracking::propagateVRegs() {
   for (const auto &Use : VRegUpwardsUse) {
     const MachineBasicBlock *UseBB = Use.first.first;
     Register VReg = Use.second;
-    if (!MRI.def_begin(VReg).atEnd())
+    if (!MRI.def_empty(VReg))
       continue;
 
 #ifdef EXPENSIVE_CHECKS
diff --git a/llvm/lib/CodeGen/TargetPassConfig.cpp b/llvm/lib/CodeGen/TargetPassConfig.cpp
index d407e9f..5c05589 100644
--- a/llvm/lib/CodeGen/TargetPassConfig.cpp
+++ b/llvm/lib/CodeGen/TargetPassConfig.cpp
@@ -113,8 +113,6 @@ static cl::opt<bool> EnableImplicitNullChecks(
 static cl::opt<bool> DisableMergeICmps("disable-mergeicmps",
     cl::desc("Disable MergeICmps Pass"),
     cl::init(false), cl::Hidden);
-static cl::opt<bool> PrintLSR("print-lsr-output", cl::Hidden,
-    cl::desc("Print LLVM IR produced by the loop-reduce pass"));
 static cl::opt<bool>
     PrintISelInput("print-isel-input", cl::Hidden,
                    cl::desc("Print LLVM IR input to isel pass"));
@@ -503,7 +501,6 @@ CGPassBuilderOption llvm::getCGPassBuilderOption() {
   SET_BOOLEAN_OPTION(DisableCGP)
   SET_BOOLEAN_OPTION(DisablePartialLibcallInlining)
   SET_BOOLEAN_OPTION(DisableSelectOptimize)
-  SET_BOOLEAN_OPTION(PrintLSR)
   SET_BOOLEAN_OPTION(PrintISelInput)
   SET_BOOLEAN_OPTION(DebugifyAndStripAll)
   SET_BOOLEAN_OPTION(DebugifyCheckAndStripAll)
@@ -836,9 +833,6 @@ void TargetPassConfig::addIRPasses() {
       addPass(createLoopStrengthReducePass());
       if (EnableLoopTermFold)
         addPass(createLoopTermFoldPass());
-      if (PrintLSR)
-        addPass(createPrintFunctionPass(dbgs(),
-                                        "\n\n*** Code after LSR ***\n"));
     }
 
     // The MergeICmpsPass tries to create memcmp calls by grouping sequences of
diff --git a/llvm/lib/DebugInfo/GSYM/CallSiteInfo.cpp b/llvm/lib/DebugInfo/GSYM/CallSiteInfo.cpp
index 85b41e2..c112c0b 100644
--- a/llvm/lib/DebugInfo/GSYM/CallSiteInfo.cpp
+++ b/llvm/lib/DebugInfo/GSYM/CallSiteInfo.cpp
@@ -151,7 +151,7 @@ LLVM_YAML_IS_SEQUENCE_VECTOR(FunctionYAML)
 
 Error CallSiteInfoLoader::loadYAML(StringRef YAMLFile) {
   // Step 1: Read YAML file
-  auto BufferOrError = MemoryBuffer::getFile(YAMLFile);
+  auto BufferOrError = MemoryBuffer::getFile(YAMLFile, /*IsText=*/true);
   if (!BufferOrError)
     return errorCodeToError(BufferOrError.getError());
 
diff --git a/llvm/lib/DebugInfo/GSYM/FunctionInfo.cpp b/llvm/lib/DebugInfo/GSYM/FunctionInfo.cpp
index dd754c70..785a8da 100644
--- a/llvm/lib/DebugInfo/GSYM/FunctionInfo.cpp
+++ b/llvm/lib/DebugInfo/GSYM/FunctionInfo.cpp
@@ -235,10 +235,10 @@ llvm::Expected<uint64_t> FunctionInfo::encode(FileWriter &Out,
   return FuncInfoOffset;
 }
 
-llvm::Expected<LookupResult> FunctionInfo::lookup(DataExtractor &Data,
-                                                  const GsymReader &GR,
-                                                  uint64_t FuncAddr,
-                                                  uint64_t Addr) {
+llvm::Expected<LookupResult>
+FunctionInfo::lookup(DataExtractor &Data, const GsymReader &GR,
+                     uint64_t FuncAddr, uint64_t Addr,
+                     std::optional<DataExtractor> *MergedFuncsData) {
   LookupResult LR;
   LR.LookupAddr = Addr;
   uint64_t Offset = 0;
@@ -289,6 +289,12 @@ llvm::Expected<LookupResult> FunctionInfo::lookup(DataExtractor &Data,
           return ExpectedLE.takeError();
         break;
 
+      case InfoType::MergedFunctionsInfo:
+        // Store the merged functions data for later parsing, if needed.
+        if (MergedFuncsData)
+          *MergedFuncsData = InfoData;
+        break;
+
       case InfoType::InlineInfo:
         // We will parse the inline info after our line table, but only if
         // we have a line entry.
diff --git a/llvm/lib/DebugInfo/GSYM/GsymReader.cpp b/llvm/lib/DebugInfo/GSYM/GsymReader.cpp
index fa5476d..0a5bb7c 100644
--- a/llvm/lib/DebugInfo/GSYM/GsymReader.cpp
+++ b/llvm/lib/DebugInfo/GSYM/GsymReader.cpp
@@ -334,14 +334,52 @@ GsymReader::getFunctionInfoAtIndex(uint64_t Idx) const {
     return ExpectedData.takeError();
 }
 
-llvm::Expected<LookupResult> GsymReader::lookup(uint64_t Addr) const {
+llvm::Expected<LookupResult>
+GsymReader::lookup(uint64_t Addr,
+                   std::optional<DataExtractor> *MergedFunctionsData) const {
   uint64_t FuncStartAddr = 0;
   if (auto ExpectedData = getFunctionInfoDataForAddress(Addr, FuncStartAddr))
-    return FunctionInfo::lookup(*ExpectedData, *this, FuncStartAddr, Addr);
+    return FunctionInfo::lookup(*ExpectedData, *this, FuncStartAddr, Addr,
+                                MergedFunctionsData);
   else
     return ExpectedData.takeError();
 }
 
+llvm::Expected<std::vector<LookupResult>>
+GsymReader::lookupAll(uint64_t Addr) const {
+  std::vector<LookupResult> Results;
+  std::optional<DataExtractor> MergedFunctionsData;
+
+  // First perform a lookup to get the primary function info result.
+  auto MainResult = lookup(Addr, &MergedFunctionsData);
+  if (!MainResult)
+    return MainResult.takeError();
+
+  // Add the main result as the first entry.
+  Results.push_back(std::move(*MainResult));
+
+  // Now process any merged functions data that was found during the lookup.
+  if (MergedFunctionsData) {
+    // Get data extractors for each merged function.
+    auto ExpectedMergedFuncExtractors =
+        MergedFunctionsInfo::getFuncsDataExtractors(*MergedFunctionsData);
+    if (!ExpectedMergedFuncExtractors)
+      return ExpectedMergedFuncExtractors.takeError();
+
+    // Process each merged function data.
+    for (DataExtractor &MergedData : *ExpectedMergedFuncExtractors) {
+      if (auto FI = FunctionInfo::lookup(MergedData, *this,
+                                         MainResult->FuncRange.start(), Addr)) {
+        Results.push_back(std::move(*FI));
+      } else {
+        return FI.takeError();
+      }
+    }
+  }
+
+  return Results;
+}
+
 void GsymReader::dump(raw_ostream &OS) {
   const auto &Header = getHeader();
   // Dump the GSYM header.
diff --git a/llvm/lib/DebugInfo/GSYM/MergedFunctionsInfo.cpp b/llvm/lib/DebugInfo/GSYM/MergedFunctionsInfo.cpp
index 4efae22..d2c28f3 100644
--- a/llvm/lib/DebugInfo/GSYM/MergedFunctionsInfo.cpp
+++ b/llvm/lib/DebugInfo/GSYM/MergedFunctionsInfo.cpp
@@ -35,22 +35,59 @@ llvm::Error MergedFunctionsInfo::encode(FileWriter &Out) const {
 llvm::Expected<MergedFunctionsInfo>
 MergedFunctionsInfo::decode(DataExtractor &Data, uint64_t BaseAddr) {
   MergedFunctionsInfo MFI;
+  auto FuncExtractorsOrError = MFI.getFuncsDataExtractors(Data);
+
+  if (!FuncExtractorsOrError)
+    return FuncExtractorsOrError.takeError();
+
+  for (DataExtractor &FuncData : *FuncExtractorsOrError) {
+    llvm::Expected<FunctionInfo> FI = FunctionInfo::decode(FuncData, BaseAddr);
+    if (!FI)
+      return FI.takeError();
+    MFI.MergedFunctions.push_back(std::move(*FI));
+  }
+
+  return MFI;
+}
+
+llvm::Expected<std::vector<DataExtractor>>
+MergedFunctionsInfo::getFuncsDataExtractors(DataExtractor &Data) {
+  std::vector<DataExtractor> Results;
   uint64_t Offset = 0;
+
+  // Ensure there is enough data to read the function count.
+  if (!Data.isValidOffsetForDataOfSize(Offset, 4))
+    return createStringError(
+        std::errc::io_error,
+        "unable to read the function count at offset 0x%8.8" PRIx64, Offset);
+
   uint32_t Count = Data.getU32(&Offset);
 
   for (uint32_t i = 0; i < Count; ++i) {
+    // Ensure there is enough data to read the function size.
+    if (!Data.isValidOffsetForDataOfSize(Offset, 4))
+      return createStringError(
+          std::errc::io_error,
+          "unable to read size of function %u at offset 0x%8.8" PRIx64, i,
+          Offset);
+
     uint32_t FnSize = Data.getU32(&Offset);
-    DataExtractor FnData(Data.getData().substr(Offset, FnSize),
+
+    // Ensure there is enough data for the function content.
+    if (!Data.isValidOffsetForDataOfSize(Offset, FnSize))
+      return createStringError(
+          std::errc::io_error,
+          "function data is truncated for function %u at offset 0x%8.8" PRIx64
+          ", expected size %u",
+          i, Offset, FnSize);
+
+    // Extract the function data.
+    Results.emplace_back(Data.getData().substr(Offset, FnSize),
                          Data.isLittleEndian(), Data.getAddressSize());
-    llvm::Expected<FunctionInfo> FI =
-        FunctionInfo::decode(FnData, BaseAddr + Offset);
-    if (!FI)
-      return FI.takeError();
-    MFI.MergedFunctions.push_back(std::move(*FI));
+
     Offset += FnSize;
   }
-
-  return MFI;
+  return Results;
 }
 
 bool operator==(const MergedFunctionsInfo &LHS,
diff --git a/llvm/lib/ExecutionEngine/JITLink/ELF_loongarch.cpp b/llvm/lib/ExecutionEngine/JITLink/ELF_loongarch.cpp
index 56c32ae..a12e9f3 100644
--- a/llvm/lib/ExecutionEngine/JITLink/ELF_loongarch.cpp
+++ b/llvm/lib/ExecutionEngine/JITLink/ELF_loongarch.cpp
@@ -58,6 +58,10 @@ private:
       return Pointer32;
     case ELF::R_LARCH_32_PCREL:
       return Delta32;
+    case ELF::R_LARCH_B16:
+      return Branch16PCRel;
+    case ELF::R_LARCH_B21:
+      return Branch21PCRel;
     case ELF::R_LARCH_B26:
       return Branch26PCRel;
     case ELF::R_LARCH_PCALA_HI20:
diff --git a/llvm/lib/ExecutionEngine/JITLink/loongarch.cpp b/llvm/lib/ExecutionEngine/JITLink/loongarch.cpp
index 010c0ed..cdb3da0 100644
--- a/llvm/lib/ExecutionEngine/JITLink/loongarch.cpp
+++ b/llvm/lib/ExecutionEngine/JITLink/loongarch.cpp
@@ -44,6 +44,8 @@ const char *getEdgeKindName(Edge::Kind K) {
     KIND_NAME_CASE(Delta32)
     KIND_NAME_CASE(NegDelta32)
     KIND_NAME_CASE(Delta64)
+    KIND_NAME_CASE(Branch16PCRel)
+    KIND_NAME_CASE(Branch21PCRel)
     KIND_NAME_CASE(Branch26PCRel)
     KIND_NAME_CASE(Page20)
     KIND_NAME_CASE(PageOffset12)
diff --git a/llvm/lib/ExecutionEngine/Orc/Core.cpp b/llvm/lib/ExecutionEngine/Orc/Core.cpp
index 6a9ebb4..d47eb44 100644
--- a/llvm/lib/ExecutionEngine/Orc/Core.cpp
+++ b/llvm/lib/ExecutionEngine/Orc/Core.cpp
@@ -1576,12 +1576,22 @@ void Platform::lookupInitSymbolsAsync(
   }
 }
 
+MaterializationTask::~MaterializationTask() {
+  // If this task wasn't run then fail materialization.
+  if (MR)
+    MR->failMaterialization();
+}
+
 void MaterializationTask::printDescription(raw_ostream &OS) {
   OS << "Materialization task: " << MU->getName() << " in "
      << MR->getTargetJITDylib().getName();
 }
 
-void MaterializationTask::run() { MU->materialize(std::move(MR)); }
+void MaterializationTask::run() {
+  assert(MU && "MU should not be null");
+  assert(MR && "MR should not be null");
+  MU->materialize(std::move(MR));
+}
 
 void LookupTask::printDescription(raw_ostream &OS) { OS << "Lookup task"; }
 
@@ -1821,17 +1831,10 @@ ExecutionSession::lookup(const JITDylibSearchOrder &SearchOrder,
                          RegisterDependenciesFunction RegisterDependencies) {
 #if LLVM_ENABLE_THREADS
   // In the threaded case we use promises to return the results.
-  std::promise<SymbolMap> PromisedResult;
-  Error ResolutionError = Error::success();
+  std::promise<MSVCPExpected<SymbolMap>> PromisedResult;
 
   auto NotifyComplete = [&](Expected<SymbolMap> R) {
-    if (R)
-      PromisedResult.set_value(std::move(*R));
-    else {
-      ErrorAsOutParameter _(ResolutionError);
-      ResolutionError = R.takeError();
-      PromisedResult.set_value(SymbolMap());
-    }
+    PromisedResult.set_value(std::move(R));
   };
 
 #else
@@ -1848,18 +1851,11 @@ ExecutionSession::lookup(const JITDylibSearchOrder &SearchOrder,
 #endif
 
   // Perform the asynchronous lookup.
-  lookup(K, SearchOrder, std::move(Symbols), RequiredState, NotifyComplete,
-         RegisterDependencies);
+  lookup(K, SearchOrder, std::move(Symbols), RequiredState,
+         std::move(NotifyComplete), RegisterDependencies);
 
 #if LLVM_ENABLE_THREADS
-  auto ResultFuture = PromisedResult.get_future();
-  auto Result = ResultFuture.get();
-
-  if (ResolutionError)
-    return std::move(ResolutionError);
-
-  return std::move(Result);
-
+  return PromisedResult.get_future().get();
 #else
   if (ResolutionError)
     return std::move(ResolutionError);
diff --git a/llvm/lib/ExecutionEngine/Orc/Debugging/DebuggerSupportPlugin.cpp b/llvm/lib/ExecutionEngine/Orc/Debugging/DebuggerSupportPlugin.cpp
index c08e52e..0d9a912 100644
--- a/llvm/lib/ExecutionEngine/Orc/Debugging/DebuggerSupportPlugin.cpp
+++ b/llvm/lib/ExecutionEngine/Orc/Debugging/DebuggerSupportPlugin.cpp
@@ -148,7 +148,7 @@ public:
         DSec.BuilderSec->align = Log2_64(SR.getFirstBlock()->getAlignment());
         StringRef SectionData(SR.getFirstBlock()->getContent().data(),
                               SR.getFirstBlock()->getSize());
-        DebugSectionMap[SecName] =
+        DebugSectionMap[SecName.drop_front(2)] = // drop "__" prefix.
             MemoryBuffer::getMemBuffer(SectionData, G.getName(), false);
         if (SecName == "__debug_line")
           DebugLineSectionData = SectionData;
@@ -167,11 +167,10 @@ public:
           DebugLineSectionData, G.getEndianness() == llvm::endianness::little,
           G.getPointerSize());
       uint64_t Offset = 0;
-      DWARFDebugLine::LineTable LineTable;
+      DWARFDebugLine::Prologue P;
 
       // Try to parse line data. Consume error on failure.
-      if (auto Err = LineTable.parse(DebugLineData, &Offset, *DWARFCtx, nullptr,
-                                     consumeError)) {
+      if (auto Err = P.parse(DebugLineData, &Offset, consumeError, *DWARFCtx)) {
         handleAllErrors(std::move(Err), [&](ErrorInfoBase &EIB) {
           LLVM_DEBUG({
             dbgs() << "Cannot parse line table for \"" << G.getName() << "\": ";
@@ -180,15 +179,26 @@ public:
           });
         });
       } else {
-        if (!LineTable.Prologue.FileNames.empty())
-          FileName = *dwarf::toString(LineTable.Prologue.FileNames[0].Name);
+        for (auto &FN : P.FileNames)
+          if ((FileName = dwarf::toString(FN.Name))) {
+            LLVM_DEBUG({
+              dbgs() << "Using FileName = \"" << *FileName
+                     << "\" from DWARF line table\n";
+            });
+            break;
+          }
       }
     }
 
     // If no line table (or unable to use) then use graph name.
     // FIXME: There are probably other debug sections we should look in first.
-    if (!FileName)
-      FileName = StringRef(G.getName());
+    if (!FileName) {
+      LLVM_DEBUG({
+        dbgs() << "Could not find source name from DWARF line table. "
+                  "Using FileName = \"\"\n";
+      });
+      FileName = "";
+    }
 
     Builder.addSymbol("", MachO::N_SO, 0, 0, 0);
     Builder.addSymbol(*FileName, MachO::N_SO, 0, 0, 0);
diff --git a/llvm/lib/ExecutionEngine/Orc/MachOPlatform.cpp b/llvm/lib/ExecutionEngine/Orc/MachOPlatform.cpp
index 0e83497..9f324c7 100644
--- a/llvm/lib/ExecutionEngine/Orc/MachOPlatform.cpp
+++ b/llvm/lib/ExecutionEngine/Orc/MachOPlatform.cpp
@@ -937,6 +937,12 @@ Error MachOPlatform::MachOPlatformPlugin::bootstrapPipelineEnd(
     jitlink::LinkGraph &G) {
   std::lock_guard<std::mutex> Lock(MP.Bootstrap.load()->Mutex);
   assert(MP.Bootstrap && "DeferredAAs reset before bootstrap completed");
+
+  // Transfer any allocation actions to DeferredAAs.
+  std::move(G.allocActions().begin(), G.allocActions().end(),
+            std::back_inserter(MP.Bootstrap.load()->DeferredAAs));
+  G.allocActions().clear();
+
   --MP.Bootstrap.load()->ActiveGraphs;
   // Notify Bootstrap->CV while holding the mutex because the mutex is
   // also keeping Bootstrap->CV alive.
@@ -1397,10 +1403,6 @@ Error MachOPlatform::MachOPlatformPlugin::registerObjectPlatformSections(
                              SPSExecutorAddrRange, SPSExecutorAddrRange>>,
         SPSSequence<SPSTuple<SPSString, SPSExecutorAddrRange>>>;
 
-    shared::AllocActions &allocActions = LLVM_LIKELY(!InBootstrapPhase)
-                                             ? G.allocActions()
-                                             : MP.Bootstrap.load()->DeferredAAs;
-
     ExecutorAddr HeaderAddr;
     {
       std::lock_guard<std::mutex> Lock(MP.PlatformMutex);
@@ -1410,7 +1412,7 @@ Error MachOPlatform::MachOPlatformPlugin::registerObjectPlatformSections(
       assert(I->second && "Null header registered for JD");
       HeaderAddr = I->second;
     }
-    allocActions.push_back(
+    G.allocActions().push_back(
         {cantFail(
              WrapperFunctionCall::Create<SPSRegisterObjectPlatformSectionsArgs>(
                  MP.RegisterObjectPlatformSections.Addr, HeaderAddr, UnwindInfo,
diff --git a/llvm/lib/ExecutionEngine/Orc/ObjectLinkingLayer.cpp b/llvm/lib/ExecutionEngine/Orc/ObjectLinkingLayer.cpp
index 6688b09..9bc0aa8 100644
--- a/llvm/lib/ExecutionEngine/Orc/ObjectLinkingLayer.cpp
+++ b/llvm/lib/ExecutionEngine/Orc/ObjectLinkingLayer.cpp
@@ -16,8 +16,6 @@ namespace llvm::orc {
 
 char ObjectLinkingLayer::ID;
 
-using BaseObjectLayer = RTTIExtends<ObjectLinkingLayer, ObjectLayer>;
-
 void ObjectLinkingLayer::emit(std::unique_ptr<MaterializationResponsibility> R,
                               std::unique_ptr<MemoryBuffer> O) {
   assert(O && "Object must not be null");
diff --git a/llvm/lib/ExecutionEngine/Orc/TaskDispatch.cpp b/llvm/lib/ExecutionEngine/Orc/TaskDispatch.cpp
index fbe4b09..1af17e8 100644
--- a/llvm/lib/ExecutionEngine/Orc/TaskDispatch.cpp
+++ b/llvm/lib/ExecutionEngine/Orc/TaskDispatch.cpp
@@ -31,6 +31,10 @@ void DynamicThreadPoolTaskDispatcher::dispatch(std::unique_ptr<Task> T) {
   {
     std::lock_guard<std::mutex> Lock(DispatchMutex);
 
+    // Reject new tasks if they're dispatched after a call to shutdown.
+    if (Shutdown)
+      return;
+
     if (IsMaterializationTask) {
 
       // If this is a materialization task and there are too many running
@@ -54,6 +58,14 @@ void DynamicThreadPoolTaskDispatcher::dispatch(std::unique_ptr<Task> T) {
       // Run the task.
       T->run();
 
+      // Reset the task to free any resources. We need this to happen *before*
+      // we notify anyone (via Outstanding) that this thread is done to ensure
+      // that we don't proceed with JIT shutdown while still holding resources.
+      // (E.g. this was causing "Dangling SymbolStringPtr" assertions).
+      T.reset();
+
+      // Check the work queue state and either proceed with the next task or
+      // end this thread.
       std::lock_guard<std::mutex> Lock(DispatchMutex);
       if (!MaterializationTaskQueue.empty()) {
         // If there are any materialization tasks running then steal that work.
@@ -64,7 +76,6 @@ void DynamicThreadPoolTaskDispatcher::dispatch(std::unique_ptr<Task> T) {
           IsMaterializationTask = true;
         }
       } else {
-        // Otherwise decrement work counters.
         if (IsMaterializationTask)
           --NumMaterializationThreads;
         --Outstanding;
@@ -78,7 +89,7 @@ void DynamicThreadPoolTaskDispatcher::dispatch(std::unique_ptr<Task> T) {
 
 void DynamicThreadPoolTaskDispatcher::shutdown() {
   std::unique_lock<std::mutex> Lock(DispatchMutex);
-  Running = false;
+  Shutdown = true;
   OutstandingCV.wait(Lock, [this]() { return Outstanding == 0; });
 }
 #endif
diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
index 0d8dbbe..8dbf2aa 100644
--- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
+++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
@@ -5302,10 +5302,11 @@ void OpenMPIRBuilder::applySimd(CanonicalLoopInfo *CanonicalLoop,
   Loop *L = LI.getLoopFor(CanonicalLoop->getHeader());
   if (AlignedVars.size()) {
     InsertPointTy IP = Builder.saveIP();
-    Builder.SetInsertPoint(CanonicalLoop->getPreheader()->getTerminator());
     for (auto &AlignedItem : AlignedVars) {
       Value *AlignedPtr = AlignedItem.first;
       Value *Alignment = AlignedItem.second;
+      Instruction *loadInst = dyn_cast<Instruction>(AlignedPtr);
+      Builder.SetInsertPoint(loadInst->getNextNode());
       Builder.CreateAlignmentAssumption(F->getDataLayout(),
                                         AlignedPtr, Alignment);
     }
diff --git a/llvm/lib/IR/ConstantRange.cpp b/llvm/lib/IR/ConstantRange.cpp
index d81a292..3566435 100644
--- a/llvm/lib/IR/ConstantRange.cpp
+++ b/llvm/lib/IR/ConstantRange.cpp
@@ -1520,15 +1520,72 @@ ConstantRange ConstantRange::binaryNot() const {
   return ConstantRange(APInt::getAllOnes(getBitWidth())).sub(*this);
 }
 
+/// Estimate the 'bit-masked AND' operation's lower bound.
+///
+/// E.g., given two ranges as follows (single quotes are separators and
+/// have no meaning here),
+///
+///   LHS = [10'00101'1,  ; LLo
+///          10'10000'0]  ; LHi
+///   RHS = [10'11111'0,  ; RLo
+///          10'11111'1]  ; RHi
+///
+/// we know that the higher 2 bits of the result is always 10; and we also
+/// notice that RHS[1:6] are always 1, so the result[1:6] cannot be less than
+/// LHS[1:6] (i.e., 00101). Thus, the lower bound is 10'00101'0.
+///
+/// The algorithm is as follows,
+/// 1. we first calculate a mask to find the higher common bits by
+///       Mask = ~((LLo ^ LHi) | (RLo ^ RHi) | (LLo ^ RLo));
+///       Mask = clear all non-leading-ones bits in Mask;
+///    in the example, the Mask is set to 11'00000'0;
+/// 2. calculate a new mask by setting all common leading bits to 1 in RHS, and
+///    keeping the longest leading ones (i.e., 11'11111'0 in the example);
+/// 3. return (LLo & new mask) as the lower bound;
+/// 4. repeat the step 2 and 3 with LHS and RHS swapped, and update the lower
+///    bound with the larger one.
+static APInt estimateBitMaskedAndLowerBound(const ConstantRange &LHS,
+                                            const ConstantRange &RHS) {
+  auto BitWidth = LHS.getBitWidth();
+  // If either is full set or unsigned wrapped, then the range must contain '0'
+  // which leads the lower bound to 0.
+  if ((LHS.isFullSet() || RHS.isFullSet()) ||
+      (LHS.isWrappedSet() || RHS.isWrappedSet()))
+    return APInt::getZero(BitWidth);
+
+  auto LLo = LHS.getLower();
+  auto LHi = LHS.getUpper() - 1;
+  auto RLo = RHS.getLower();
+  auto RHi = RHS.getUpper() - 1;
+
+  // Calculate the mask for the higher common bits.
+  auto Mask = ~((LLo ^ LHi) | (RLo ^ RHi) | (LLo ^ RLo));
+  unsigned LeadingOnes = Mask.countLeadingOnes();
+  Mask.clearLowBits(BitWidth - LeadingOnes);
+
+  auto estimateBound = [BitWidth, &Mask](APInt ALo, const APInt &BLo,
+                                         const APInt &BHi) {
+    unsigned LeadingOnes = ((BLo & BHi) | Mask).countLeadingOnes();
+    unsigned StartBit = BitWidth - LeadingOnes;
+    ALo.clearLowBits(StartBit);
+    return ALo;
+  };
+
+  auto LowerBoundByLHS = estimateBound(LLo, RLo, RHi);
+  auto LowerBoundByRHS = estimateBound(RLo, LLo, LHi);
+
+  return APIntOps::umax(LowerBoundByLHS, LowerBoundByRHS);
+}
+
 ConstantRange ConstantRange::binaryAnd(const ConstantRange &Other) const {
   if (isEmptySet() || Other.isEmptySet())
     return getEmpty();
 
   ConstantRange KnownBitsRange =
       fromKnownBits(toKnownBits() & Other.toKnownBits(), false);
-  ConstantRange UMinUMaxRange =
-      getNonEmpty(APInt::getZero(getBitWidth()),
-                  APIntOps::umin(Other.getUnsignedMax(), getUnsignedMax()) + 1);
+  auto LowerBound = estimateBitMaskedAndLowerBound(*this, Other);
+  ConstantRange UMinUMaxRange = getNonEmpty(
+      LowerBound, APIntOps::umin(Other.getUnsignedMax(), getUnsignedMax()) + 1);
   return KnownBitsRange.intersectWith(UMinUMaxRange);
 }
 
@@ -1538,10 +1595,17 @@ ConstantRange ConstantRange::binaryOr(const ConstantRange &Other) const {
 
   ConstantRange KnownBitsRange =
       fromKnownBits(toKnownBits() | Other.toKnownBits(), false);
+
+  //      ~a & ~b    >= x
+  // <=>  ~(~a & ~b) <= ~x
+  // <=>  a | b      <= ~x
+  // <=>  a | b      <  ~x + 1 = -x
+  // thus, UpperBound(a | b) == -LowerBound(~a & ~b)
+  auto UpperBound =
+      -estimateBitMaskedAndLowerBound(binaryNot(), Other.binaryNot());
   // Upper wrapped range.
-  ConstantRange UMaxUMinRange =
-      getNonEmpty(APIntOps::umax(getUnsignedMin(), Other.getUnsignedMin()),
-                  APInt::getZero(getBitWidth()));
+  ConstantRange UMaxUMinRange = getNonEmpty(
+      APIntOps::umax(getUnsignedMin(), Other.getUnsignedMin()), UpperBound);
   return KnownBitsRange.intersectWith(UMaxUMinRange);
 }
 
diff --git a/llvm/lib/IR/ConstantsContext.h b/llvm/lib/IR/ConstantsContext.h
index aaaab0b..08bf3f9 100644
--- a/llvm/lib/IR/ConstantsContext.h
+++ b/llvm/lib/IR/ConstantsContext.h
@@ -491,8 +491,7 @@ public:
     default:
       if (Instruction::isCast(Opcode))
         return new CastConstantExpr(Opcode, Ops[0], Ty);
-      if ((Opcode >= Instruction::BinaryOpsBegin &&
-           Opcode < Instruction::BinaryOpsEnd))
+      if (Instruction::isBinaryOp(Opcode))
         return new BinaryConstantExpr(Opcode, Ops[0], Ops[1],
                                       SubclassOptionalData);
       llvm_unreachable("Invalid ConstantExpr!");
diff --git a/llvm/lib/IR/IRBuilder.cpp b/llvm/lib/IR/IRBuilder.cpp
index f340f7a..27b499e 100644
--- a/llvm/lib/IR/IRBuilder.cpp
+++ b/llvm/lib/IR/IRBuilder.cpp
@@ -78,11 +78,11 @@ void IRBuilderBase::SetInstDebugLocation(Instruction *I) const {
 
 CallInst *
 IRBuilderBase::createCallHelper(Function *Callee, ArrayRef<Value *> Ops,
-                                const Twine &Name, Instruction *FMFSource,
+                                const Twine &Name, FMFSource FMFSource,
                                 ArrayRef<OperandBundleDef> OpBundles) {
   CallInst *CI = CreateCall(Callee, Ops, OpBundles, Name);
-  if (FMFSource)
-    CI->copyFastMathFlags(FMFSource);
+  if (isa<FPMathOperator>(CI))
+    CI->setFastMathFlags(FMFSource.get(FMF));
   return CI;
 }
 
@@ -869,7 +869,7 @@ CallInst *IRBuilderBase::CreateGCGetPointerOffset(Value *DerivedPtr,
 }
 
 CallInst *IRBuilderBase::CreateUnaryIntrinsic(Intrinsic::ID ID, Value *V,
-                                              Instruction *FMFSource,
+                                              FMFSource FMFSource,
                                               const Twine &Name) {
   Module *M = BB->getModule();
   Function *Fn = Intrinsic::getOrInsertDeclaration(M, ID, {V->getType()});
@@ -877,12 +877,12 @@ CallInst *IRBuilderBase::CreateUnaryIntrinsic(Intrinsic::ID ID, Value *V,
 }
 
 Value *IRBuilderBase::CreateBinaryIntrinsic(Intrinsic::ID ID, Value *LHS,
-                                            Value *RHS, Instruction *FMFSource,
+                                            Value *RHS, FMFSource FMFSource,
                                             const Twine &Name) {
   Module *M = BB->getModule();
   Function *Fn = Intrinsic::getOrInsertDeclaration(M, ID, {LHS->getType()});
   if (Value *V = Folder.FoldBinaryIntrinsic(ID, LHS, RHS, Fn->getReturnType(),
-                                            FMFSource))
+                                            /*FMFSource=*/nullptr))
     return V;
   return createCallHelper(Fn, {LHS, RHS}, Name, FMFSource);
 }
@@ -890,7 +890,7 @@ Value *IRBuilderBase::CreateBinaryIntrinsic(Intrinsic::ID ID, Value *LHS,
 CallInst *IRBuilderBase::CreateIntrinsic(Intrinsic::ID ID,
                                          ArrayRef<Type *> Types,
                                          ArrayRef<Value *> Args,
-                                         Instruction *FMFSource,
+                                         FMFSource FMFSource,
                                          const Twine &Name) {
   Module *M = BB->getModule();
   Function *Fn = Intrinsic::getOrInsertDeclaration(M, ID, Types);
@@ -899,7 +899,7 @@ CallInst *IRBuilderBase::CreateIntrinsic(Intrinsic::ID ID,
 
 CallInst *IRBuilderBase::CreateIntrinsic(Type *RetTy, Intrinsic::ID ID,
                                          ArrayRef<Value *> Args,
-                                         Instruction *FMFSource,
+                                         FMFSource FMFSource,
                                          const Twine &Name) {
   Module *M = BB->getModule();
 
@@ -925,16 +925,13 @@ CallInst *IRBuilderBase::CreateIntrinsic(Type *RetTy, Intrinsic::ID ID,
 }
 
 CallInst *IRBuilderBase::CreateConstrainedFPBinOp(
-    Intrinsic::ID ID, Value *L, Value *R, Instruction *FMFSource,
-    const Twine &Name, MDNode *FPMathTag,
-    std::optional<RoundingMode> Rounding,
+    Intrinsic::ID ID, Value *L, Value *R, FMFSource FMFSource,
+    const Twine &Name, MDNode *FPMathTag, std::optional<RoundingMode> Rounding,
     std::optional<fp::ExceptionBehavior> Except) {
   Value *RoundingV = getConstrainedFPRounding(Rounding);
   Value *ExceptV = getConstrainedFPExcept(Except);
 
-  FastMathFlags UseFMF = FMF;
-  if (FMFSource)
-    UseFMF = FMFSource->getFastMathFlags();
+  FastMathFlags UseFMF = FMFSource.get(FMF);
 
   CallInst *C = CreateIntrinsic(ID, {L->getType()},
                                 {L, R, RoundingV, ExceptV}, nullptr, Name);
@@ -944,14 +941,12 @@ CallInst *IRBuilderBase::CreateConstrainedFPBinOp(
 }
 
 CallInst *IRBuilderBase::CreateConstrainedFPUnroundedBinOp(
-    Intrinsic::ID ID, Value *L, Value *R, Instruction *FMFSource,
+    Intrinsic::ID ID, Value *L, Value *R, FMFSource FMFSource,
     const Twine &Name, MDNode *FPMathTag,
     std::optional<fp::ExceptionBehavior> Except) {
   Value *ExceptV = getConstrainedFPExcept(Except);
 
-  FastMathFlags UseFMF = FMF;
-  if (FMFSource)
-    UseFMF = FMFSource->getFastMathFlags();
+  FastMathFlags UseFMF = FMFSource.get(FMF);
 
   CallInst *C =
       CreateIntrinsic(ID, {L->getType()}, {L, R, ExceptV}, nullptr, Name);
@@ -976,15 +971,12 @@ Value *IRBuilderBase::CreateNAryOp(unsigned Opc, ArrayRef<Value *> Ops,
 }
 
 CallInst *IRBuilderBase::CreateConstrainedFPCast(
-    Intrinsic::ID ID, Value *V, Type *DestTy,
-    Instruction *FMFSource, const Twine &Name, MDNode *FPMathTag,
-    std::optional<RoundingMode> Rounding,
+    Intrinsic::ID ID, Value *V, Type *DestTy, FMFSource FMFSource,
+    const Twine &Name, MDNode *FPMathTag, std::optional<RoundingMode> Rounding,
     std::optional<fp::ExceptionBehavior> Except) {
   Value *ExceptV = getConstrainedFPExcept(Except);
 
-  FastMathFlags UseFMF = FMF;
-  if (FMFSource)
-    UseFMF = FMFSource->getFastMathFlags();
+  FastMathFlags UseFMF = FMFSource.get(FMF);
 
   CallInst *C;
   if (Intrinsic::hasConstrainedFPRoundingModeOperand(ID)) {
@@ -1002,9 +994,10 @@ CallInst *IRBuilderBase::CreateConstrainedFPCast(
   return C;
 }
 
-Value *IRBuilderBase::CreateFCmpHelper(
-    CmpInst::Predicate P, Value *LHS, Value *RHS, const Twine &Name,
-    MDNode *FPMathTag, bool IsSignaling) {
+Value *IRBuilderBase::CreateFCmpHelper(CmpInst::Predicate P, Value *LHS,
+                                       Value *RHS, const Twine &Name,
+                                       MDNode *FPMathTag, FMFSource FMFSource,
+                                       bool IsSignaling) {
   if (IsFPConstrained) {
     auto ID = IsSignaling ? Intrinsic::experimental_constrained_fcmps
                           : Intrinsic::experimental_constrained_fcmp;
@@ -1013,7 +1006,9 @@ Value *IRBuilderBase::CreateFCmpHelper(
 
   if (auto *V = Folder.FoldCmp(P, LHS, RHS))
     return V;
-  return Insert(setFPAttrs(new FCmpInst(P, LHS, RHS), FPMathTag, FMF), Name);
+  return Insert(
+      setFPAttrs(new FCmpInst(P, LHS, RHS), FPMathTag, FMFSource.get(FMF)),
+      Name);
 }
 
 CallInst *IRBuilderBase::CreateConstrainedFPCmp(
@@ -1047,6 +1042,12 @@ CallInst *IRBuilderBase::CreateConstrainedFPCall(
 
 Value *IRBuilderBase::CreateSelect(Value *C, Value *True, Value *False,
                                    const Twine &Name, Instruction *MDFrom) {
+  return CreateSelectFMF(C, True, False, {}, Name, MDFrom);
+}
+
+Value *IRBuilderBase::CreateSelectFMF(Value *C, Value *True, Value *False,
+                                      FMFSource FMFSource, const Twine &Name,
+                                      Instruction *MDFrom) {
   if (auto *V = Folder.FoldSelect(C, True, False))
     return V;
 
@@ -1057,7 +1058,7 @@ Value *IRBuilderBase::CreateSelect(Value *C, Value *True, Value *False,
     Sel = addBranchMetadata(Sel, Prof, Unpred);
   }
   if (isa<FPMathOperator>(Sel))
-    setFPAttrs(Sel, nullptr /* MDNode* */, FMF);
+    setFPAttrs(Sel, /*MDNode=*/nullptr, FMFSource.get(FMF));
   return Insert(Sel, Name);
 }
 
diff --git a/llvm/lib/IR/SafepointIRVerifier.cpp b/llvm/lib/IR/SafepointIRVerifier.cpp
index ed99d05..d32852b 100644
--- a/llvm/lib/IR/SafepointIRVerifier.cpp
+++ b/llvm/lib/IR/SafepointIRVerifier.cpp
@@ -289,6 +289,7 @@ static void PrintValueSet(raw_ostream &OS, IteratorTy Begin, IteratorTy End) {
 
 using AvailableValueSet = DenseSet<const Value *>;
 
+namespace {
 /// State we compute and track per basic block.
 struct BasicBlockState {
   // Set of values available coming in, before the phi nodes
@@ -305,6 +306,7 @@ struct BasicBlockState {
   // contribute to AvailableOut.
   bool Cleared = false;
 };
+} // namespace
 
 /// A given derived pointer can have multiple base pointers through phi/selects.
 /// This type indicates when the base pointer is exclusively constant
diff --git a/llvm/lib/LTO/ThinLTOCodeGenerator.cpp b/llvm/lib/LTO/ThinLTOCodeGenerator.cpp
index 4522f4a..189f287 100644
--- a/llvm/lib/LTO/ThinLTOCodeGenerator.cpp
+++ b/llvm/lib/LTO/ThinLTOCodeGenerator.cpp
@@ -160,8 +160,7 @@ generateModuleMap(std::vector<std::unique_ptr<lto::InputFile>> &Modules) {
 
 static void promoteModule(Module &TheModule, const ModuleSummaryIndex &Index,
                           bool ClearDSOLocalOnDeclarations) {
-  if (renameModuleForThinLTO(TheModule, Index, ClearDSOLocalOnDeclarations))
-    report_fatal_error("renameModuleForThinLTO failed");
+  renameModuleForThinLTO(TheModule, Index, ClearDSOLocalOnDeclarations);
 }
 
 namespace {
diff --git a/llvm/lib/Linker/IRMover.cpp b/llvm/lib/Linker/IRMover.cpp
index a0c3f2c..be3535a 100644
--- a/llvm/lib/Linker/IRMover.cpp
+++ b/llvm/lib/Linker/IRMover.cpp
@@ -1562,10 +1562,6 @@ Error IRLinker::run() {
   bool EnableDLWarning = true;
   bool EnableTripleWarning = true;
   if (SrcTriple.isNVPTX() && DstTriple.isNVPTX()) {
-    std::string ModuleId = SrcM->getModuleIdentifier();
-    StringRef FileName = llvm::sys::path::filename(ModuleId);
-    bool SrcIsLibDevice =
-        FileName.starts_with("libdevice") && FileName.ends_with(".10.bc");
     bool SrcHasLibDeviceDL =
         (SrcM->getDataLayoutStr().empty() ||
          SrcM->getDataLayoutStr() == "e-i64:64-v16:16-v32:32-n16:32:64");
@@ -1576,8 +1572,8 @@ Error IRLinker::run() {
                                   SrcTriple.getOSName() == "gpulibs") ||
                                  (SrcTriple.getVendorName() == "unknown" &&
                                   SrcTriple.getOSName() == "unknown");
-    EnableTripleWarning = !(SrcIsLibDevice && SrcHasLibDeviceTriple);
-    EnableDLWarning = !(SrcIsLibDevice && SrcHasLibDeviceDL);
+    EnableTripleWarning = !SrcHasLibDeviceTriple;
+    EnableDLWarning = !(SrcHasLibDeviceTriple && SrcHasLibDeviceDL);
   }
 
   if (EnableDLWarning && (SrcM->getDataLayout() != DstM.getDataLayout())) {
diff --git a/llvm/lib/MC/MCAsmInfoXCOFF.cpp b/llvm/lib/MC/MCAsmInfoXCOFF.cpp
index b07e95e..6ef11ba 100644
--- a/llvm/lib/MC/MCAsmInfoXCOFF.cpp
+++ b/llvm/lib/MC/MCAsmInfoXCOFF.cpp
@@ -19,29 +19,17 @@ extern cl::opt<cl::boolOrDefault> UseLEB128Directives;
 void MCAsmInfoXCOFF::anchor() {}
 
 MCAsmInfoXCOFF::MCAsmInfoXCOFF() {
+  IsAIX = true;
   IsLittleEndian = false;
-  HasVisibilityOnlyWithLinkage = true;
-  HasBasenameOnlyForFileDirective = false;
-  HasFourStringsDotFile = true;
-
-  // For XCOFF, string constant consists of any number of characters enclosed in
-  // "" (double quotation marks).
-  HasPairedDoubleQuoteStringConstants = true;
 
   PrivateGlobalPrefix = "L..";
   PrivateLabelPrefix = "L..";
   SupportsQuotedNames = false;
-  UseDotAlignForAlignment = true;
-  UsesDwarfFileAndLocDirectives = false;
-  DwarfSectionSizeRequired = false;
   if (UseLEB128Directives == cl::BOU_UNSET)
     HasLEB128Directives = false;
   ZeroDirective = "\t.space\t";
-  ZeroDirectiveSupportsNonZeroValue = false;
   AsciiDirective = nullptr; // not supported
   AscizDirective = nullptr; // not supported
-  ByteListDirective = "\t.byte\t";
-  PlainStringDirective = "\t.string\t";
   CharacterLiteralSyntax = ACLS_SingleQuotePrefix;
 
   // Use .vbyte for data definition to avoid directives that apply an implicit
@@ -53,7 +41,6 @@ MCAsmInfoXCOFF::MCAsmInfoXCOFF() {
   LCOMMDirectiveAlignmentType = LCOMM::Log2Alignment;
   HasDotTypeDotSizeDirective = false;
   ParseInlineAsmUsingAsmParser = true;
-  NeedsFunctionDescriptors = true;
 
   ExceptionsType = ExceptionHandling::AIX;
 }
diff --git a/llvm/lib/MC/MCAsmStreamer.cpp b/llvm/lib/MC/MCAsmStreamer.cpp
index 78fed77..01fe11e 100644
--- a/llvm/lib/MC/MCAsmStreamer.cpp
+++ b/llvm/lib/MC/MCAsmStreamer.cpp
@@ -442,8 +442,6 @@ public:
   void emitDwarfAdvanceLineAddr(int64_t LineDelta, const MCSymbol *LastLabel,
                                 const MCSymbol *Label,
                                 unsigned PointerSize) override;
-
-  void doFinalizationAtSectionEnd(MCSection *Section) override;
 };
 
 } // end anonymous namespace.
@@ -1219,7 +1217,7 @@ static void PrintByteList(StringRef Data, raw_ostream &OS,
 void MCAsmStreamer::PrintQuotedString(StringRef Data, raw_ostream &OS) const {
   OS << '"';
 
-  if (MAI->hasPairedDoubleQuoteStringConstants()) {
+  if (MAI->isAIX()) {
     for (unsigned char C : Data) {
       if (C == '"')
         OS << "\"\"";
@@ -1273,6 +1271,25 @@ void MCAsmStreamer::emitBytes(StringRef Data) {
   if (Data.empty()) return;
 
   const auto emitAsString = [this](StringRef Data) {
+    if (MAI->isAIX()) {
+      if (isPrintableString(Data)) {
+        // For target with DoubleQuoteString constants, .string and .byte are
+        // used as replacement of .asciz and .ascii.
+        if (Data.back() == 0) {
+          OS << "\t.string\t";
+          Data = Data.substr(0, Data.size() - 1);
+        } else {
+          OS << "\t.byte\t";
+        }
+        PrintQuotedString(Data, OS);
+      } else {
+        OS << "\t.byte\t";
+        PrintByteList(Data, OS, MAI->characterLiteralSyntax());
+      }
+      EmitEOL();
+      return true;
+    }
+
     // If the data ends with 0 and the target supports .asciz, use it, otherwise
     // use .ascii or a byte-list directive
     if (MAI->getAscizDirective() && Data.back() == 0) {
@@ -1280,27 +1297,6 @@ void MCAsmStreamer::emitBytes(StringRef Data) {
       Data = Data.substr(0, Data.size() - 1);
     } else if (LLVM_LIKELY(MAI->getAsciiDirective())) {
       OS << MAI->getAsciiDirective();
-    } else if (MAI->hasPairedDoubleQuoteStringConstants() &&
-               isPrintableString(Data)) {
-      // For target with DoubleQuoteString constants, .string and .byte are used
-      // as replacement of .asciz and .ascii.
-      assert(MAI->getPlainStringDirective() &&
-             "hasPairedDoubleQuoteStringConstants target must support "
-             "PlainString Directive");
-      assert(MAI->getByteListDirective() &&
-             "hasPairedDoubleQuoteStringConstants target must support ByteList "
-             "Directive");
-      if (Data.back() == 0) {
-        OS << MAI->getPlainStringDirective();
-        Data = Data.substr(0, Data.size() - 1);
-      } else {
-        OS << MAI->getByteListDirective();
-      }
-    } else if (MAI->getByteListDirective()) {
-      OS << MAI->getByteListDirective();
-      PrintByteList(Data, OS, MAI->characterLiteralSyntax());
-      EmitEOL();
-      return true;
     } else {
       return false;
     }
@@ -1483,7 +1479,7 @@ void MCAsmStreamer::emitFill(const MCExpr &NumBytes, uint64_t FillValue,
     return;
 
   if (const char *ZeroDirective = MAI->getZeroDirective()) {
-    if (MAI->doesZeroDirectiveSupportNonZeroValue() || FillValue == 0) {
+    if (!MAI->isAIX() || FillValue == 0) {
       // FIXME: Emit location directives
       OS << ZeroDirective;
       NumBytes.print(OS, MAI);
@@ -1519,7 +1515,7 @@ void MCAsmStreamer::emitAlignmentDirective(uint64_t ByteAlignment,
                                            std::optional<int64_t> Value,
                                            unsigned ValueSize,
                                            unsigned MaxBytesToEmit) {
-  if (MAI->useDotAlignForAlignment()) {
+  if (MAI->isAIX()) {
     if (!isPowerOf2_64(ByteAlignment))
       report_fatal_error("Only power-of-two alignments are supported "
                          "with .align.");
@@ -1623,7 +1619,7 @@ void MCAsmStreamer::emitFileDirective(StringRef Filename,
                                       StringRef CompilerVersion,
                                       StringRef TimeStamp,
                                       StringRef Description) {
-  assert(MAI->hasFourStringsDotFile());
+  assert(MAI->isAIX());
   OS << "\t.file\t";
   PrintQuotedString(Filename, OS);
   bool useTimeStamp = !TimeStamp.empty();
@@ -1694,8 +1690,7 @@ Expected<unsigned> MCAsmStreamer::tryEmitDwarfFileDirective(
 
   // Return early if this file is already emitted before or if target doesn't
   // support .file directive.
-  if (NumFiles == Table.getMCDwarfFiles().size() ||
-      !MAI->usesDwarfFileAndLocDirectives())
+  if (NumFiles == Table.getMCDwarfFiles().size() || MAI->isAIX())
     return FileNo;
 
   SmallString<128> Str;
@@ -1724,7 +1719,7 @@ void MCAsmStreamer::emitDwarfFile0Directive(
                                       Source);
 
   // Target doesn't support .loc/.file directives, return early.
-  if (!MAI->usesDwarfFileAndLocDirectives())
+  if (MAI->isAIX())
     return;
 
   SmallString<128> Str;
@@ -1744,7 +1739,7 @@ void MCAsmStreamer::emitDwarfLocDirective(unsigned FileNo, unsigned Line,
                                           StringRef FileName) {
   // If target doesn't support .loc/.file directive, we need to record the lines
   // same way like we do in object mode.
-  if (!MAI->usesDwarfFileAndLocDirectives()) {
+  if (MAI->isAIX()) {
     // In case we see two .loc directives in a row, make sure the
     // first one gets a line entry.
     MCDwarfLineEntry::make(this, getCurrentSectionOnly());
@@ -2444,7 +2439,7 @@ void MCAsmStreamer::AddEncodingComment(const MCInst &Inst,
 
 void MCAsmStreamer::emitInstruction(const MCInst &Inst,
                                     const MCSubtargetInfo &STI) {
-  if (!MAI->usesDwarfFileAndLocDirectives() && CurFrag)
+  if (MAI->isAIX() && CurFrag)
     // Now that a machine instruction has been assembled into this section, make
     // a line entry for any .loc directive that has been seen.
     MCDwarfLineEntry::make(this, getCurrentSectionOnly());
@@ -2547,7 +2542,7 @@ void MCAsmStreamer::finishImpl() {
 
   // Now it is time to emit debug line sections if target doesn't support .loc
   // and .line directives.
-  if (!MAI->usesDwarfFileAndLocDirectives()) {
+  if (MAI->isAIX()) {
     MCDwarfLineTable::emit(this, getAssembler().getDWARFLinetableParams());
     return;
   }
@@ -2572,7 +2567,7 @@ void MCAsmStreamer::emitDwarfUnitLength(uint64_t Length, const Twine &Comment) {
   // the debug section headers. In such cases, any label we placed occurs
   // after the implied length field. We need to adjust the reference here
   // to account for the offset introduced by the inserted length field.
-  if (!MAI->needsDwarfSectionSizeInHeader())
+  if (MAI->isAIX())
     return;
   MCStreamer::emitDwarfUnitLength(Length, Comment);
 }
@@ -2585,7 +2580,7 @@ MCSymbol *MCAsmStreamer::emitDwarfUnitLength(const Twine &Prefix,
   // the debug section headers. In such cases, any label we placed occurs
   // after the implied length field. We need to adjust the reference here
   // to account for the offset introduced by the inserted length field.
-  if (!MAI->needsDwarfSectionSizeInHeader())
+  if (MAI->isAIX())
     return getContext().createTempSymbol(Prefix + "_end");
   return MCStreamer::emitDwarfUnitLength(Prefix, Comment);
 }
@@ -2598,7 +2593,7 @@ void MCAsmStreamer::emitDwarfLineStartLabel(MCSymbol *StartSym) {
   // after the implied length field. We need to adjust the reference here
   // to account for the offset introduced by the inserted length field.
   MCContext &Ctx = getContext();
-  if (!MAI->needsDwarfSectionSizeInHeader()) {
+  if (MAI->isAIX()) {
     MCSymbol *DebugLineSymTmp = Ctx.createTempSymbol("debug_line_");
     // Emit the symbol which does not contain the unit length field.
     emitLabel(DebugLineSymTmp);
@@ -2625,7 +2620,7 @@ void MCAsmStreamer::emitDwarfLineEndEntry(MCSection *Section,
   // we currently use the .text end label as any section end. This will not
   // impact the debugability as we will jump to the caller of the last function
   // in the section before we come into the .text end address.
-  assert(!MAI->usesDwarfFileAndLocDirectives() &&
+  assert(MAI->isAIX() &&
          ".loc should not be generated together with raw data!");
 
   MCContext &Ctx = getContext();
@@ -2648,7 +2643,7 @@ void MCAsmStreamer::emitDwarfAdvanceLineAddr(int64_t LineDelta,
                                              const MCSymbol *LastLabel,
                                              const MCSymbol *Label,
                                              unsigned PointerSize) {
-  assert(!MAI->usesDwarfFileAndLocDirectives() &&
+  assert(MAI->isAIX() &&
          ".loc/.file don't need raw data in debug line section!");
 
   // Set to new address.
@@ -2682,20 +2677,6 @@ void MCAsmStreamer::emitDwarfAdvanceLineAddr(int64_t LineDelta,
   emitIntValue(dwarf::DW_LNS_copy, 1);
 }
 
-void MCAsmStreamer::doFinalizationAtSectionEnd(MCSection *Section) {
-  // Emit section end. This is used to tell the debug line section where the end
-  // is for a text section if we don't use .loc to represent the debug line.
-  if (MAI->usesDwarfFileAndLocDirectives())
-    return;
-
-  switchSectionNoPrint(Section);
-
-  MCSymbol *Sym = getCurrentSectionOnly()->getEndSymbol(getContext());
-
-  if (!Sym->isInSection())
-    emitLabel(Sym);
-}
-
 MCStreamer *llvm::createAsmStreamer(MCContext &Context,
                                     std::unique_ptr<formatted_raw_ostream> OS,
                                     MCInstPrinter *IP,
diff --git a/llvm/lib/MC/MCParser/AsmLexer.cpp b/llvm/lib/MC/MCParser/AsmLexer.cpp
index 778ca34..32b6e86 100644
--- a/llvm/lib/MC/MCParser/AsmLexer.cpp
+++ b/llvm/lib/MC/MCParser/AsmLexer.cpp
@@ -707,7 +707,7 @@ size_t AsmLexer::peekTokens(MutableArrayRef<AsmToken> Buf,
 }
 
 bool AsmLexer::isAtStartOfComment(const char *Ptr) {
-  if (MAI.getRestrictCommentStringToStartOfStatement() && !IsAtStartOfStatement)
+  if (MAI.isHLASM() && !IsAtStartOfStatement)
     return false;
 
   StringRef CommentString = MAI.getCommentString();
@@ -836,7 +836,7 @@ AsmToken AsmLexer::LexToken() {
       return LexIdentifier();
     return AsmToken(AsmToken::At, StringRef(TokStart, 1));
   case '#':
-    if (MAI.doesAllowHashAtStartOfIdentifier())
+    if (MAI.isHLASM())
       return LexIdentifier();
     return AsmToken(AsmToken::Hash, StringRef(TokStart, 1));
   case '?':
diff --git a/llvm/lib/MC/MCParser/AsmParser.cpp b/llvm/lib/MC/MCParser/AsmParser.cpp
index 153c107..bf952df 100644
--- a/llvm/lib/MC/MCParser/AsmParser.cpp
+++ b/llvm/lib/MC/MCParser/AsmParser.cpp
@@ -1181,7 +1181,7 @@ bool AsmParser::parsePrimaryExpr(const MCExpr *&Res, SMLoc &EndLoc,
       if (getTok().is(AsmToken::Dollar) || getTok().is(AsmToken::Star)) {
         bool ShouldGenerateTempSymbol = false;
         if ((getTok().is(AsmToken::Dollar) && MAI.getDollarIsPC()) ||
-            (getTok().is(AsmToken::Star) && MAI.getStarIsPC()))
+            (getTok().is(AsmToken::Star) && MAI.isHLASM()))
           ShouldGenerateTempSymbol = true;
 
         if (!ShouldGenerateTempSymbol)
@@ -1248,8 +1248,8 @@ bool AsmParser::parsePrimaryExpr(const MCExpr *&Res, SMLoc &EndLoc,
 
     MCSymbol *Sym = getContext().getInlineAsmLabel(SymbolName);
     if (!Sym)
-      Sym = getContext().getOrCreateSymbol(
-          MAI.shouldEmitLabelsInUpperCase() ? SymbolName.upper() : SymbolName);
+      Sym = getContext().getOrCreateSymbol(MAI.isHLASM() ? SymbolName.upper()
+                                                         : SymbolName);
 
     // If this is an absolute variable reference, substitute it now to preserve
     // semantics in the face of reassignment.
@@ -1312,7 +1312,7 @@ bool AsmParser::parsePrimaryExpr(const MCExpr *&Res, SMLoc &EndLoc,
     return false;
   }
   case AsmToken::Dot: {
-    if (!MAI.getDotIsPC())
+    if (MAI.isHLASM())
       return TokError("cannot use . as current PC");
 
     // This is a '.' reference, which references the current PC.  Emit a
@@ -6322,9 +6322,7 @@ bool HLASMAsmParser::parseAsHLASMLabel(ParseStatementInfo &Info,
                  "Cannot have just a label for an HLASM inline asm statement");
 
   MCSymbol *Sym = getContext().getOrCreateSymbol(
-      getContext().getAsmInfo()->shouldEmitLabelsInUpperCase()
-          ? LabelVal.upper()
-          : LabelVal);
+      getContext().getAsmInfo()->isHLASM() ? LabelVal.upper() : LabelVal);
 
   getTargetParser().doBeforeLabelEmit(Sym, LabelLoc);
 
diff --git a/llvm/lib/ObjCopy/COFF/COFFObjcopy.cpp b/llvm/lib/ObjCopy/COFF/COFFObjcopy.cpp
index 782d5b2..cebcb82 100644
--- a/llvm/lib/ObjCopy/COFF/COFFObjcopy.cpp
+++ b/llvm/lib/ObjCopy/COFF/COFFObjcopy.cpp
@@ -183,10 +183,18 @@ static Error handleArgs(const CommonConfig &Config,
   });
 
   if (Config.OnlyKeepDebug) {
+    const data_directory *DebugDir =
+        Obj.DataDirectories.size() > DEBUG_DIRECTORY
+            ? &Obj.DataDirectories[DEBUG_DIRECTORY]
+            : nullptr;
     // For --only-keep-debug, we keep all other sections, but remove their
     // content. The VirtualSize field in the section header is kept intact.
-    Obj.truncateSections([](const Section &Sec) {
+    Obj.truncateSections([DebugDir](const Section &Sec) {
       return !isDebugSection(Sec) && Sec.Name != ".buildid" &&
+             !(DebugDir && DebugDir->Size > 0 &&
+               DebugDir->RelativeVirtualAddress >= Sec.Header.VirtualAddress &&
+               DebugDir->RelativeVirtualAddress <
+                   Sec.Header.VirtualAddress + Sec.Header.SizeOfRawData) &&
              ((Sec.Header.Characteristics &
                (IMAGE_SCN_CNT_CODE | IMAGE_SCN_CNT_INITIALIZED_DATA)) != 0);
     });
diff --git a/llvm/lib/ObjCopy/ConfigManager.cpp b/llvm/lib/ObjCopy/ConfigManager.cpp
index 78fc0c4..79bbb28 100644
--- a/llvm/lib/ObjCopy/ConfigManager.cpp
+++ b/llvm/lib/ObjCopy/ConfigManager.cpp
@@ -36,11 +36,9 @@ Expected<const COFFConfig &> ConfigManager::getCOFFConfig() const {
 
 Expected<const MachOConfig &> ConfigManager::getMachOConfig() const {
   if (!Common.SplitDWO.empty() || !Common.SymbolsPrefix.empty() ||
-      !Common.SymbolsPrefixRemove.empty() || !Common.SymbolsToSkip.empty() ||
+      !Common.SymbolsPrefixRemove.empty() ||
       !Common.AllocSectionsPrefix.empty() || !Common.KeepSection.empty() ||
-      !Common.SymbolsToGlobalize.empty() || !Common.SymbolsToKeep.empty() ||
-      !Common.SymbolsToLocalize.empty() ||
-      !Common.SymbolsToKeepGlobal.empty() || !Common.SectionsToRename.empty() ||
+      !Common.SymbolsToKeep.empty() || !Common.SectionsToRename.empty() ||
       !Common.UnneededSymbolsToRemove.empty() ||
       !Common.SetSectionAlignment.empty() || !Common.SetSectionFlags.empty() ||
       !Common.SetSectionType.empty() || Common.ExtractDWO ||
diff --git a/llvm/lib/ObjCopy/MachO/MachOLayoutBuilder.cpp b/llvm/lib/ObjCopy/MachO/MachOLayoutBuilder.cpp
index 93bc663..d4eb6a9b 100644
--- a/llvm/lib/ObjCopy/MachO/MachOLayoutBuilder.cpp
+++ b/llvm/lib/ObjCopy/MachO/MachOLayoutBuilder.cpp
@@ -116,6 +116,11 @@ uint64_t MachOLayoutBuilder::layoutSegments() {
   const bool IsObjectFile =
       O.Header.FileType == MachO::HeaderFileType::MH_OBJECT;
   uint64_t Offset = IsObjectFile ? (HeaderSize + O.Header.SizeOfCmds) : 0;
+  if (O.EncryptionInfoCommandIndex) {
+    // If we are emitting an encryptable binary, our load commands must have a
+    // separate (non-encrypted) page to themselves.
+    Offset = alignToPowerOf2(HeaderSize + O.Header.SizeOfCmds, PageSize);
+  }
   for (LoadCommand &LC : O.LoadCommands) {
     auto &MLC = LC.MachOLoadCommand;
     StringRef Segname;
diff --git a/llvm/lib/ObjCopy/MachO/MachOObjcopy.cpp b/llvm/lib/ObjCopy/MachO/MachOObjcopy.cpp
index 91500c2..a188425 100644
--- a/llvm/lib/ObjCopy/MachO/MachOObjcopy.cpp
+++ b/llvm/lib/ObjCopy/MachO/MachOObjcopy.cpp
@@ -93,19 +93,38 @@ static void markSymbols(const CommonConfig &, Object &Obj) {
 static void updateAndRemoveSymbols(const CommonConfig &Config,
                                    const MachOConfig &MachOConfig,
                                    Object &Obj) {
-  for (SymbolEntry &Sym : Obj.SymTable) {
-    // Weaken symbols first to match ELFObjcopy behavior.
-    bool IsExportedAndDefined =
-        (Sym.n_type & llvm::MachO::N_EXT) &&
-        (Sym.n_type & llvm::MachO::N_TYPE) != llvm::MachO::N_UNDF;
-    if (IsExportedAndDefined &&
+  Obj.SymTable.updateSymbols([&](SymbolEntry &Sym) {
+    if (Config.SymbolsToSkip.matches(Sym.Name))
+      return;
+
+    if (!Sym.isUndefinedSymbol() && Config.SymbolsToLocalize.matches(Sym.Name))
+      Sym.n_type &= ~MachO::N_EXT;
+
+    // Note: these two globalize flags have very similar names but different
+    // meanings:
+    //
+    // --globalize-symbol: promote a symbol to global
+    // --keep-global-symbol: all symbols except for these should be made local
+    //
+    // If --globalize-symbol is specified for a given symbol, it will be
+    // global in the output file even if it is not included via
+    // --keep-global-symbol. Because of that, make sure to check
+    // --globalize-symbol second.
+    if (!Sym.isUndefinedSymbol() && !Config.SymbolsToKeepGlobal.empty() &&
+        !Config.SymbolsToKeepGlobal.matches(Sym.Name))
+      Sym.n_type &= ~MachO::N_EXT;
+
+    if (!Sym.isUndefinedSymbol() && Config.SymbolsToGlobalize.matches(Sym.Name))
+      Sym.n_type |= MachO::N_EXT;
+
+    if (Sym.isExternalSymbol() && !Sym.isUndefinedSymbol() &&
         (Config.Weaken || Config.SymbolsToWeaken.matches(Sym.Name)))
-      Sym.n_desc |= llvm::MachO::N_WEAK_DEF;
+      Sym.n_desc |= MachO::N_WEAK_DEF;
 
     auto I = Config.SymbolsToRename.find(Sym.Name);
     if (I != Config.SymbolsToRename.end())
       Sym.Name = std::string(I->getValue());
-  }
+  });
 
   auto RemovePred = [&Config, &MachOConfig,
                      &Obj](const std::unique_ptr<SymbolEntry> &N) {
diff --git a/llvm/lib/ObjCopy/MachO/MachOObject.cpp b/llvm/lib/ObjCopy/MachO/MachOObject.cpp
index d593d67..e0819d8 100644
--- a/llvm/lib/ObjCopy/MachO/MachOObject.cpp
+++ b/llvm/lib/ObjCopy/MachO/MachOObject.cpp
@@ -33,6 +33,19 @@ SymbolEntry *SymbolTable::getSymbolByIndex(uint32_t Index) {
       static_cast<const SymbolTable *>(this)->getSymbolByIndex(Index));
 }
 
+void SymbolTable::updateSymbols(function_ref<void(SymbolEntry &)> Callable) {
+  for (auto &Sym : Symbols)
+    Callable(*Sym);
+
+  // Partition symbols: local < defined external < undefined external.
+  auto ExternalBegin = std::stable_partition(
+      std::begin(Symbols), std::end(Symbols),
+      [](const auto &Sym) { return Sym->isLocalSymbol(); });
+  std::stable_partition(ExternalBegin, std::end(Symbols), [](const auto &Sym) {
+    return !Sym->isUndefinedSymbol();
+  });
+}
+
 void SymbolTable::removeSymbols(
     function_ref<bool(const std::unique_ptr<SymbolEntry> &)> ToRemove) {
   llvm::erase_if(Symbols, ToRemove);
@@ -85,6 +98,10 @@ void Object::updateLoadCommandIndexes() {
     case MachO::LC_DYLD_EXPORTS_TRIE:
       ExportsTrieCommandIndex = Index;
       break;
+    case MachO::LC_ENCRYPTION_INFO:
+    case MachO::LC_ENCRYPTION_INFO_64:
+      EncryptionInfoCommandIndex = Index;
+      break;
     }
   }
 }
diff --git a/llvm/lib/ObjCopy/MachO/MachOObject.h b/llvm/lib/ObjCopy/MachO/MachOObject.h
index b3303fd..79eb0133 100644
--- a/llvm/lib/ObjCopy/MachO/MachOObject.h
+++ b/llvm/lib/ObjCopy/MachO/MachOObject.h
@@ -142,6 +142,7 @@ struct SymbolTable {
 
   const SymbolEntry *getSymbolByIndex(uint32_t Index) const;
   SymbolEntry *getSymbolByIndex(uint32_t Index);
+  void updateSymbols(function_ref<void(SymbolEntry &)> Callable);
   void removeSymbols(
       function_ref<bool(const std::unique_ptr<SymbolEntry> &)> ToRemove);
 };
@@ -340,6 +341,9 @@ struct Object {
   /// The index of the LC_SEGMENT or LC_SEGMENT_64 load command
   /// corresponding to the __TEXT segment.
   std::optional<size_t> TextSegmentCommandIndex;
+  /// The index of the LC_ENCRYPTION_INFO or LC_ENCRYPTION_INFO_64 load command
+  /// if present.
+  std::optional<size_t> EncryptionInfoCommandIndex;
 
   BumpPtrAllocator Alloc;
   StringSaver NewSectionsContents;
diff --git a/llvm/lib/ObjCopy/MachO/MachOReader.cpp b/llvm/lib/ObjCopy/MachO/MachOReader.cpp
index 2b344f3..ef0e026 100644
--- a/llvm/lib/ObjCopy/MachO/MachOReader.cpp
+++ b/llvm/lib/ObjCopy/MachO/MachOReader.cpp
@@ -184,6 +184,10 @@ Error MachOReader::readLoadCommands(Object &O) const {
     case MachO::LC_DYLD_CHAINED_FIXUPS:
       O.ChainedFixupsCommandIndex = O.LoadCommands.size();
       break;
+    case MachO::LC_ENCRYPTION_INFO:
+    case MachO::LC_ENCRYPTION_INFO_64:
+      O.EncryptionInfoCommandIndex = O.LoadCommands.size();
+      break;
     }
 #define HANDLE_LOAD_COMMAND(LCName, LCValue, LCStruct)                         \
   case MachO::LCName:                                                          \
diff --git a/llvm/lib/Object/COFFImportFile.cpp b/llvm/lib/Object/COFFImportFile.cpp
index ff3dcf9..595533f 100644
--- a/llvm/lib/Object/COFFImportFile.cpp
+++ b/llvm/lib/Object/COFFImportFile.cpp
@@ -133,6 +133,8 @@ static uint16_t getImgRelRelocation(MachineTypes Machine) {
     return IMAGE_REL_ARM64_ADDR32NB;
   case IMAGE_FILE_MACHINE_I386:
     return IMAGE_REL_I386_DIR32NB;
+  case IMAGE_FILE_MACHINE_R4000:
+    return IMAGE_REL_MIPS_REFWORDNB;
   }
 }
 
diff --git a/llvm/lib/Object/WindowsMachineFlag.cpp b/llvm/lib/Object/WindowsMachineFlag.cpp
index b9f8187..caf357e 100644
--- a/llvm/lib/Object/WindowsMachineFlag.cpp
+++ b/llvm/lib/Object/WindowsMachineFlag.cpp
@@ -21,6 +21,7 @@ using namespace llvm;
 
 // Returns /machine's value.
 COFF::MachineTypes llvm::getMachineType(StringRef S) {
+  // Flags must be a superset of Microsoft lib.exe /machine flags.
   return StringSwitch<COFF::MachineTypes>(S.lower())
       .Cases("x64", "amd64", COFF::IMAGE_FILE_MACHINE_AMD64)
       .Cases("x86", "i386", COFF::IMAGE_FILE_MACHINE_I386)
@@ -28,6 +29,7 @@ COFF::MachineTypes llvm::getMachineType(StringRef S) {
       .Case("arm64", COFF::IMAGE_FILE_MACHINE_ARM64)
       .Case("arm64ec", COFF::IMAGE_FILE_MACHINE_ARM64EC)
       .Case("arm64x", COFF::IMAGE_FILE_MACHINE_ARM64X)
+      .Case("mips", COFF::IMAGE_FILE_MACHINE_R4000)
       .Default(COFF::IMAGE_FILE_MACHINE_UNKNOWN);
 }
 
diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp
index a936f53..30b8d7c 100644
--- a/llvm/lib/Passes/PassBuilder.cpp
+++ b/llvm/lib/Passes/PassBuilder.cpp
@@ -492,6 +492,9 @@ PassBuilder::PassBuilder(TargetMachine *TM, PipelineTuningOptions PTO,
   PIC->addClassToPassName(decltype(CREATE_PASS)::name(), NAME);
 #define MACHINE_FUNCTION_PASS(NAME, CREATE_PASS)                               \
   PIC->addClassToPassName(decltype(CREATE_PASS)::name(), NAME);
+#define MACHINE_FUNCTION_PASS_WITH_PARAMS(NAME, CLASS, CREATE_PASS, PARSER,    \
+                                          PARAMS)                              \
+  PIC->addClassToPassName(CLASS, NAME);
 #include "llvm/Passes/MachinePassRegistry.def"
     });
   }
diff --git a/llvm/lib/Passes/PassBuilderPipelines.cpp b/llvm/lib/Passes/PassBuilderPipelines.cpp
index d737ea5..4ec0fb8 100644
--- a/llvm/lib/Passes/PassBuilderPipelines.cpp
+++ b/llvm/lib/Passes/PassBuilderPipelines.cpp
@@ -189,9 +189,9 @@ static cl::opt<bool> EnableGlobalAnalyses(
     "enable-global-analyses", cl::init(true), cl::Hidden,
     cl::desc("Enable inter-procedural analyses"));
 
-static cl::opt<bool>
-    RunPartialInlining("enable-partial-inlining", cl::init(false), cl::Hidden,
-                       cl::desc("Run Partial inlinining pass"));
+static cl::opt<bool> RunPartialInlining("enable-partial-inlining",
+                                        cl::init(false), cl::Hidden,
+                                        cl::desc("Run Partial inlining pass"));
 
 static cl::opt<bool> ExtraVectorizerPasses(
     "extra-vectorizer-passes", cl::init(false), cl::Hidden,
@@ -264,7 +264,7 @@ static cl::opt<bool>
 static cl::opt<bool> FlattenedProfileUsed(
     "flattened-profile-used", cl::init(false), cl::Hidden,
     cl::desc("Indicate the sample profile being used is flattened, i.e., "
-             "no inline hierachy exists in the profile"));
+             "no inline hierarchy exists in the profile"));
 
 static cl::opt<bool> EnableOrderFileInstrumentation(
     "enable-order-file-instrumentation", cl::init(false), cl::Hidden,
diff --git a/llvm/lib/Passes/PassRegistry.def b/llvm/lib/Passes/PassRegistry.def
index 9f0b092..13e192f 100644
--- a/llvm/lib/Passes/PassRegistry.def
+++ b/llvm/lib/Passes/PassRegistry.def
@@ -156,7 +156,7 @@ MODULE_PASS("strip-nonlinetable-debuginfo", StripNonLineTableDebugInfoPass())
 MODULE_PASS("trigger-crash-module", TriggerCrashModulePass())
 MODULE_PASS("trigger-verifier-error", TriggerVerifierErrorPass())
 MODULE_PASS("tsan-module", ModuleThreadSanitizerPass())
-MODULE_PASS("tysan-module", ModuleTypeSanitizerPass())
+MODULE_PASS("tysan", TypeSanitizerPass())
 MODULE_PASS("verify", VerifierPass())
 MODULE_PASS("view-callgraph", CallGraphViewerPass())
 MODULE_PASS("wholeprogramdevirt", WholeProgramDevirtPass())
@@ -481,7 +481,6 @@ FUNCTION_PASS("transform-warning", WarnMissedTransformationsPass())
 FUNCTION_PASS("trigger-crash-function", TriggerCrashFunctionPass())
 FUNCTION_PASS("trigger-verifier-error", TriggerVerifierErrorPass())
 FUNCTION_PASS("tsan", ThreadSanitizerPass())
-FUNCTION_PASS("tysan", TypeSanitizerPass())
 FUNCTION_PASS("typepromotion", TypePromotionPass(TM))
 FUNCTION_PASS("unify-loop-exits", UnifyLoopExitsPass())
 FUNCTION_PASS("vector-combine", VectorCombinePass())
diff --git a/llvm/lib/ProfileData/Coverage/CoverageMapping.cpp b/llvm/lib/ProfileData/Coverage/CoverageMapping.cpp
index 65ff22c..83fe5f0 100644
--- a/llvm/lib/ProfileData/Coverage/CoverageMapping.cpp
+++ b/llvm/lib/ProfileData/Coverage/CoverageMapping.cpp
@@ -246,6 +246,40 @@ Expected<int64_t> CounterMappingContext::evaluate(const Counter &C) const {
   return LastPoppedValue;
 }
 
+// Find an independence pair for each condition:
+// - The condition is true in one test and false in the other.
+// - The decision outcome is true one test and false in the other.
+// - All other conditions' values must be equal or marked as "don't care".
+void MCDCRecord::findIndependencePairs() {
+  if (IndependencePairs)
+    return;
+
+  IndependencePairs.emplace();
+
+  unsigned NumTVs = TV.size();
+  // Will be replaced to shorter expr.
+  unsigned TVTrueIdx = std::distance(
+      TV.begin(),
+      std::find_if(TV.begin(), TV.end(),
+                   [&](auto I) { return (I.second == MCDCRecord::MCDC_True); })
+
+  );
+  for (unsigned I = TVTrueIdx; I < NumTVs; ++I) {
+    const auto &[A, ACond] = TV[I];
+    assert(ACond == MCDCRecord::MCDC_True);
+    for (unsigned J = 0; J < TVTrueIdx; ++J) {
+      const auto &[B, BCond] = TV[J];
+      assert(BCond == MCDCRecord::MCDC_False);
+      // If the two vectors differ in exactly one condition, ignoring DontCare
+      // conditions, we have found an independence pair.
+      auto AB = A.getDifferences(B);
+      if (AB.count() == 1)
+        IndependencePairs->insert(
+            {AB.find_first(), std::make_pair(J + 1, I + 1)});
+    }
+  }
+}
+
 mcdc::TVIdxBuilder::TVIdxBuilder(const SmallVectorImpl<ConditionIDs> &NextIDs,
                                  int Offset)
     : Indices(NextIDs.size()) {
@@ -400,9 +434,6 @@ class MCDCRecordProcessor : NextIDsBuilder, mcdc::TVIdxBuilder {
   /// ExecutedTestVectorBitmap.
   MCDCRecord::TestVectors &ExecVectors;
 
-  /// Number of False items in ExecVectors
-  unsigned NumExecVectorsF;
-
 #ifndef NDEBUG
   DenseSet<unsigned> TVIdxs;
 #endif
@@ -417,8 +448,9 @@ public:
       : NextIDsBuilder(Branches), TVIdxBuilder(this->NextIDs), Bitmap(Bitmap),
         Region(Region), DecisionParams(Region.getDecisionParams()),
         Branches(Branches), NumConditions(DecisionParams.NumConditions),
-        Folded(NumConditions, false), IndependencePairs(NumConditions),
-        ExecVectors(ExecVectorsByCond[false]), IsVersion11(IsVersion11) {}
+        Folded{{BitVector(NumConditions), BitVector(NumConditions)}},
+        IndependencePairs(NumConditions), ExecVectors(ExecVectorsByCond[false]),
+        IsVersion11(IsVersion11) {}
 
 private:
   // Walk the binary decision diagram and try assigning both false and true to
@@ -471,34 +503,11 @@ private:
     // Fill ExecVectors order by False items and True items.
     // ExecVectors is the alias of ExecVectorsByCond[false], so
     // Append ExecVectorsByCond[true] on it.
-    NumExecVectorsF = ExecVectors.size();
     auto &ExecVectorsT = ExecVectorsByCond[true];
     ExecVectors.append(std::make_move_iterator(ExecVectorsT.begin()),
                        std::make_move_iterator(ExecVectorsT.end()));
   }
 
-  // Find an independence pair for each condition:
-  // - The condition is true in one test and false in the other.
-  // - The decision outcome is true one test and false in the other.
-  // - All other conditions' values must be equal or marked as "don't care".
-  void findIndependencePairs() {
-    unsigned NumTVs = ExecVectors.size();
-    for (unsigned I = NumExecVectorsF; I < NumTVs; ++I) {
-      const auto &[A, ACond] = ExecVectors[I];
-      assert(ACond == MCDCRecord::MCDC_True);
-      for (unsigned J = 0; J < NumExecVectorsF; ++J) {
-        const auto &[B, BCond] = ExecVectors[J];
-        assert(BCond == MCDCRecord::MCDC_False);
-        // If the two vectors differ in exactly one condition, ignoring DontCare
-        // conditions, we have found an independence pair.
-        auto AB = A.getDifferences(B);
-        if (AB.count() == 1)
-          IndependencePairs.insert(
-              {AB.find_first(), std::make_pair(J + 1, I + 1)});
-      }
-    }
-  }
-
 public:
   /// Process the MC/DC Record in order to produce a result for a boolean
   /// expression. This process includes tracking the conditions that comprise
@@ -510,7 +519,6 @@ public:
   /// location is also tracked, as well as whether it is constant folded (in
   /// which case it is excuded from the metric).
   MCDCRecord processMCDCRecord() {
-    unsigned I = 0;
     MCDCRecord::CondIDMap PosToID;
     MCDCRecord::LineColPairMap CondLoc;
 
@@ -524,23 +532,19 @@ public:
     //   visualize where the condition is.
     // - Record whether the condition is constant folded so that we exclude it
     //   from being measured.
-    for (const auto *B : Branches) {
+    for (auto [I, B] : enumerate(Branches)) {
       const auto &BranchParams = B->getBranchParams();
       PosToID[I] = BranchParams.ID;
       CondLoc[I] = B->startLoc();
-      Folded[I++] = (B->Count.isZero() || B->FalseCount.isZero());
+      Folded[false][I] = B->FalseCount.isZero();
+      Folded[true][I] = B->Count.isZero();
     }
 
     // Using Profile Bitmap from runtime, mark the executed test vectors.
     findExecutedTestVectors();
 
-    // Compare executed test vectors against each other to find an independence
-    // pairs for each condition.  This processing takes the most time.
-    findIndependencePairs();
-
     // Record Test vectors, executed vectors, and independence pairs.
-    return MCDCRecord(Region, std::move(ExecVectors),
-                      std::move(IndependencePairs), std::move(Folded),
+    return MCDCRecord(Region, std::move(ExecVectors), std::move(Folded),
                       std::move(PosToID), std::move(CondLoc));
   }
 };
@@ -833,7 +837,6 @@ Error CoverageMapping::loadFunctionRecord(
   else
     OrigFuncName = getFuncNameWithoutPrefix(OrigFuncName, Record.Filenames[0]);
 
-  bool SingleByteCoverage = ProfileReader.hasSingleByteCoverage();
   CounterMappingContext Ctx(Record.Expressions);
 
   std::vector<uint64_t> Counts;
@@ -899,10 +902,7 @@ Error CoverageMapping::loadFunctionRecord(
       consumeError(std::move(E));
       return Error::success();
     }
-    Function.pushRegion(
-        Region, (SingleByteCoverage && *ExecutionCount ? 1 : *ExecutionCount),
-        (SingleByteCoverage && *AltExecutionCount ? 1 : *AltExecutionCount),
-        SingleByteCoverage);
+    Function.pushRegion(Region, *ExecutionCount, *AltExecutionCount);
 
     // Record ExpansionRegion.
     if (Region.Kind == CounterMappingRegion::ExpansionRegion) {
@@ -964,6 +964,9 @@ Error CoverageMapping::loadFunctionRecord(
 Error CoverageMapping::loadFromReaders(
     ArrayRef<std::unique_ptr<CoverageMappingReader>> CoverageReaders,
     IndexedInstrProfReader &ProfileReader, CoverageMapping &Coverage) {
+  assert(!Coverage.SingleByteCoverage ||
+         *Coverage.SingleByteCoverage == ProfileReader.hasSingleByteCoverage());
+  Coverage.SingleByteCoverage = ProfileReader.hasSingleByteCoverage();
   for (const auto &CoverageReader : CoverageReaders) {
     for (auto RecordOrErr : *CoverageReader) {
       if (Error E = RecordOrErr.takeError())
@@ -1324,14 +1327,8 @@ class SegmentBuilder {
       // value for that area.
       // We add counts of the regions of the same kind as the active region
       // to handle the both situations.
-      if (I->Kind == Active->Kind) {
-        assert(I->HasSingleByteCoverage == Active->HasSingleByteCoverage &&
-               "Regions are generated in different coverage modes");
-        if (I->HasSingleByteCoverage)
-          Active->ExecutionCount = Active->ExecutionCount || I->ExecutionCount;
-        else
-          Active->ExecutionCount += I->ExecutionCount;
-      }
+      if (I->Kind == Active->Kind)
+        Active->ExecutionCount += I->ExecutionCount;
     }
     return Regions.drop_back(std::distance(++Active, End));
   }
@@ -1424,7 +1421,8 @@ static bool isExpansion(const CountedRegion &R, unsigned FileID) {
 }
 
 CoverageData CoverageMapping::getCoverageForFile(StringRef Filename) const {
-  CoverageData FileCoverage(Filename);
+  assert(SingleByteCoverage);
+  CoverageData FileCoverage(*SingleByteCoverage, Filename);
   std::vector<CountedRegion> Regions;
 
   // Look up the function records in the given file. Due to hash collisions on
@@ -1488,7 +1486,9 @@ CoverageMapping::getCoverageForFunction(const FunctionRecord &Function) const {
   if (!MainFileID)
     return CoverageData();
 
-  CoverageData FunctionCoverage(Function.Filenames[*MainFileID]);
+  assert(SingleByteCoverage);
+  CoverageData FunctionCoverage(*SingleByteCoverage,
+                                Function.Filenames[*MainFileID]);
   std::vector<CountedRegion> Regions;
   for (const auto &CR : Function.CountedRegions)
     if (CR.FileID == *MainFileID) {
@@ -1515,8 +1515,9 @@ CoverageMapping::getCoverageForFunction(const FunctionRecord &Function) const {
 
 CoverageData CoverageMapping::getCoverageForExpansion(
     const ExpansionRecord &Expansion) const {
+  assert(SingleByteCoverage);
   CoverageData ExpansionCoverage(
-      Expansion.Function.Filenames[Expansion.FileID]);
+      *SingleByteCoverage, Expansion.Function.Filenames[Expansion.FileID]);
   std::vector<CountedRegion> Regions;
   for (const auto &CR : Expansion.Function.CountedRegions)
     if (CR.FileID == Expansion.FileID) {
diff --git a/llvm/lib/ProfileData/MemProfReader.cpp b/llvm/lib/ProfileData/MemProfReader.cpp
index 10c36f2..6a4fecd 100644
--- a/llvm/lib/ProfileData/MemProfReader.cpp
+++ b/llvm/lib/ProfileData/MemProfReader.cpp
@@ -754,7 +754,7 @@ Error RawMemProfReader::readNextRecord(
 
 Expected<std::unique_ptr<YAMLMemProfReader>>
 YAMLMemProfReader::create(const Twine &Path) {
-  auto BufferOr = MemoryBuffer::getFileOrSTDIN(Path);
+  auto BufferOr = MemoryBuffer::getFileOrSTDIN(Path, /*IsText=*/true);
   if (std::error_code EC = BufferOr.getError())
     return report(errorCodeToError(EC), Path.getSingleStringRef());
 
@@ -770,7 +770,7 @@ YAMLMemProfReader::create(std::unique_ptr<MemoryBuffer> Buffer) {
 }
 
 bool YAMLMemProfReader::hasFormat(const StringRef Path) {
-  auto BufferOr = MemoryBuffer::getFileOrSTDIN(Path);
+  auto BufferOr = MemoryBuffer::getFileOrSTDIN(Path, /*IsText=*/true);
   if (!BufferOr)
     return false;
 
diff --git a/llvm/lib/Support/Windows/Path.inc b/llvm/lib/Support/Windows/Path.inc
index 17db114c..5b311e7 100644
--- a/llvm/lib/Support/Windows/Path.inc
+++ b/llvm/lib/Support/Windows/Path.inc
@@ -1373,9 +1373,11 @@ std::error_code closeFile(file_t &F) {
 }
 
 std::error_code remove_directories(const Twine &path, bool IgnoreErrors) {
+  SmallString<128> NativePath;
+  llvm::sys::path::native(path, NativePath, path::Style::windows_backslash);
   // Convert to utf-16.
   SmallVector<wchar_t, 128> Path16;
-  std::error_code EC = widenPath(path, Path16);
+  std::error_code EC = widenPath(NativePath, Path16);
   if (EC && !IgnoreErrors)
     return EC;
 
diff --git a/llvm/lib/TableGen/TGLexer.cpp b/llvm/lib/TableGen/TGLexer.cpp
index eee4251..e23aec6 100644
--- a/llvm/lib/TableGen/TGLexer.cpp
+++ b/llvm/lib/TableGen/TGLexer.cpp
@@ -81,8 +81,7 @@ TGLexer::TGLexer(SourceMgr &SM, ArrayRef<std::string> Macros) : SrcMgr(SM) {
   TokStart = nullptr;
 
   // Pretend that we enter the "top-level" include file.
-  PrepIncludeStack.push_back(
-      std::make_unique<std::vector<PreprocessorControlDesc>>());
+  PrepIncludeStack.emplace_back();
 
   // Add all macros defined on the command line to the DefinedMacros set.
   // Check invalid macro names and print fatal error if we find one.
@@ -453,8 +452,7 @@ bool TGLexer::LexInclude() {
   CurBuf = SrcMgr.getMemoryBuffer(CurBuffer)->getBuffer();
   CurPtr = CurBuf.begin();
 
-  PrepIncludeStack.push_back(
-      std::make_unique<std::vector<PreprocessorControlDesc>>());
+  PrepIncludeStack.emplace_back();
   return false;
 }
 
@@ -656,17 +654,13 @@ tgtok::TokKind TGLexer::LexExclaim() {
 bool TGLexer::prepExitInclude(bool IncludeStackMustBeEmpty) {
   // Report an error, if preprocessor control stack for the current
   // file is not empty.
-  if (!PrepIncludeStack.back()->empty()) {
+  if (!PrepIncludeStack.back().empty()) {
     prepReportPreprocessorStackError();
 
     return false;
   }
 
   // Pop the preprocessing controls from the include stack.
-  if (PrepIncludeStack.empty()) {
-    PrintFatalError("preprocessor include stack is empty");
-  }
-
   PrepIncludeStack.pop_back();
 
   if (IncludeStackMustBeEmpty) {
@@ -761,7 +755,7 @@ tgtok::TokKind TGLexer::lexPreprocessor(tgtok::TokKind Kind,
     // Regardless of whether we are processing tokens or not,
     // we put the #ifdef control on stack.
     // Note that MacroIsDefined has been canonicalized against ifdef.
-    PrepIncludeStack.back()->push_back(
+    PrepIncludeStack.back().push_back(
         {tgtok::Ifdef, MacroIsDefined, SMLoc::getFromPointer(TokStart)});
 
     if (!prepSkipDirectiveEnd())
@@ -789,10 +783,10 @@ tgtok::TokKind TGLexer::lexPreprocessor(tgtok::TokKind Kind,
   } else if (Kind == tgtok::Else) {
     // Check if this #else is correct before calling prepSkipDirectiveEnd(),
     // which will move CurPtr away from the beginning of #else.
-    if (PrepIncludeStack.back()->empty())
+    if (PrepIncludeStack.back().empty())
       return ReturnError(TokStart, "#else without #ifdef or #ifndef");
 
-    PreprocessorControlDesc IfdefEntry = PrepIncludeStack.back()->back();
+    PreprocessorControlDesc IfdefEntry = PrepIncludeStack.back().back();
 
     if (IfdefEntry.Kind != tgtok::Ifdef) {
       PrintError(TokStart, "double #else");
@@ -801,9 +795,8 @@ tgtok::TokKind TGLexer::lexPreprocessor(tgtok::TokKind Kind,
 
     // Replace the corresponding #ifdef's control with its negation
     // on the control stack.
-    PrepIncludeStack.back()->pop_back();
-    PrepIncludeStack.back()->push_back(
-        {Kind, !IfdefEntry.IsDefined, SMLoc::getFromPointer(TokStart)});
+    PrepIncludeStack.back().back() = {Kind, !IfdefEntry.IsDefined,
+                                      SMLoc::getFromPointer(TokStart)};
 
     if (!prepSkipDirectiveEnd())
       return ReturnError(CurPtr, "only comments are supported after #else");
@@ -822,10 +815,10 @@ tgtok::TokKind TGLexer::lexPreprocessor(tgtok::TokKind Kind,
   } else if (Kind == tgtok::Endif) {
     // Check if this #endif is correct before calling prepSkipDirectiveEnd(),
     // which will move CurPtr away from the beginning of #endif.
-    if (PrepIncludeStack.back()->empty())
+    if (PrepIncludeStack.back().empty())
       return ReturnError(TokStart, "#endif without #ifdef");
 
-    auto &IfdefOrElseEntry = PrepIncludeStack.back()->back();
+    auto &IfdefOrElseEntry = PrepIncludeStack.back().back();
 
     if (IfdefOrElseEntry.Kind != tgtok::Ifdef &&
         IfdefOrElseEntry.Kind != tgtok::Else) {
@@ -836,7 +829,7 @@ tgtok::TokKind TGLexer::lexPreprocessor(tgtok::TokKind Kind,
     if (!prepSkipDirectiveEnd())
       return ReturnError(CurPtr, "only comments are supported after #endif");
 
-    PrepIncludeStack.back()->pop_back();
+    PrepIncludeStack.back().pop_back();
 
     // If we were processing tokens before this #endif, then
     // we should continue it.
@@ -1055,20 +1048,16 @@ bool TGLexer::prepSkipDirectiveEnd() {
 }
 
 bool TGLexer::prepIsProcessingEnabled() {
-  for (const PreprocessorControlDesc &I :
-       llvm::reverse(*PrepIncludeStack.back()))
-    if (!I.IsDefined)
-      return false;
-
-  return true;
+  return all_of(PrepIncludeStack.back(),
+                [](const PreprocessorControlDesc &I) { return I.IsDefined; });
 }
 
 void TGLexer::prepReportPreprocessorStackError() {
-  if (PrepIncludeStack.back()->empty())
+  if (PrepIncludeStack.back().empty())
     PrintFatalError("prepReportPreprocessorStackError() called with "
                     "empty control stack");
 
-  auto &PrepControl = PrepIncludeStack.back()->back();
+  auto &PrepControl = PrepIncludeStack.back().back();
   PrintError(CurBuf.end(), "reached EOF without matching #endif");
   PrintError(PrepControl.SrcPos, "the latest preprocessor control is here");
 
diff --git a/llvm/lib/TableGen/TGLexer.h b/llvm/lib/TableGen/TGLexer.h
index 963d75e..f8b32dc 100644
--- a/llvm/lib/TableGen/TGLexer.h
+++ b/llvm/lib/TableGen/TGLexer.h
@@ -13,6 +13,7 @@
 #ifndef LLVM_LIB_TABLEGEN_TGLEXER_H
 #define LLVM_LIB_TABLEGEN_TGLEXER_H
 
+#include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/ADT/StringSet.h"
 #include "llvm/Support/DataTypes.h"
@@ -21,7 +22,6 @@
 #include <memory>
 #include <set>
 #include <string>
-#include <vector>
 
 namespace llvm {
 template <typename T> class ArrayRef;
@@ -323,8 +323,7 @@ private:
   // preprocessing control stacks for the current file and all its
   // parent files.  The back() element is the preprocessing control
   // stack for the current file.
-  std::vector<std::unique_ptr<std::vector<PreprocessorControlDesc>>>
-      PrepIncludeStack;
+  SmallVector<SmallVector<PreprocessorControlDesc>> PrepIncludeStack;
 
   // Validate that the current preprocessing control stack is empty,
   // since we are about to exit a file, and pop the include stack.
diff --git a/llvm/lib/TableGen/TGParser.cpp b/llvm/lib/TableGen/TGParser.cpp
index e867943..60ae11b 100644
--- a/llvm/lib/TableGen/TGParser.cpp
+++ b/llvm/lib/TableGen/TGParser.cpp
@@ -776,13 +776,14 @@ ParseSubClassReference(Record *CurRec, bool isDefm) {
     return Result;
   }
 
-  if (ParseTemplateArgValueList(Result.TemplateArgs, CurRec, Result.Rec)) {
+  SmallVector<SMLoc> ArgLocs;
+  if (ParseTemplateArgValueList(Result.TemplateArgs, ArgLocs, CurRec,
+                                Result.Rec)) {
     Result.Rec = nullptr; // Error parsing value list.
     return Result;
   }
 
-  if (CheckTemplateArgValues(Result.TemplateArgs, Result.RefRange.Start,
-                             Result.Rec)) {
+  if (CheckTemplateArgValues(Result.TemplateArgs, ArgLocs, Result.Rec)) {
     Result.Rec = nullptr; // Error checking value list.
     return Result;
   }
@@ -812,7 +813,8 @@ ParseSubMultiClassReference(MultiClass *CurMC) {
     return Result;
   }
 
-  if (ParseTemplateArgValueList(Result.TemplateArgs, &CurMC->Rec,
+  SmallVector<SMLoc> ArgLocs;
+  if (ParseTemplateArgValueList(Result.TemplateArgs, ArgLocs, &CurMC->Rec,
                                 &Result.MC->Rec)) {
     Result.MC = nullptr; // Error parsing value list.
     return Result;
@@ -2722,11 +2724,12 @@ const Init *TGParser::ParseSimpleValue(Record *CurRec, const RecTy *ItemType,
     }
 
     SmallVector<const ArgumentInit *, 8> Args;
+    SmallVector<SMLoc> ArgLocs;
     Lex.Lex(); // consume the <
-    if (ParseTemplateArgValueList(Args, CurRec, Class))
+    if (ParseTemplateArgValueList(Args, ArgLocs, CurRec, Class))
       return nullptr; // Error parsing value list.
 
-    if (CheckTemplateArgValues(Args, NameLoc.Start, Class))
+    if (CheckTemplateArgValues(Args, ArgLocs, Class))
       return nullptr; // Error checking template argument values.
 
     if (resolveArguments(Class, Args, NameLoc.Start))
@@ -3201,8 +3204,8 @@ void TGParser::ParseValueList(SmallVectorImpl<const Init *> &Result,
 //   PostionalArgValueList ::= [Value {',' Value}*]
 //   NamedArgValueList ::= [NameValue '=' Value {',' NameValue '=' Value}*]
 bool TGParser::ParseTemplateArgValueList(
-    SmallVectorImpl<const ArgumentInit *> &Result, Record *CurRec,
-    const Record *ArgsRec) {
+    SmallVectorImpl<const ArgumentInit *> &Result,
+    SmallVectorImpl<SMLoc> &ArgLocs, Record *CurRec, const Record *ArgsRec) {
   assert(Result.empty() && "Result vector is not empty");
   ArrayRef<const Init *> TArgs = ArgsRec->getTemplateArgs();
 
@@ -3217,7 +3220,7 @@ bool TGParser::ParseTemplateArgValueList(
       return true;
     }
 
-    SMLoc ValueLoc = Lex.getLoc();
+    SMLoc ValueLoc = ArgLocs.emplace_back(Lex.getLoc());
     // If we are parsing named argument, we don't need to know the argument name
     // and argument type will be resolved after we know the name.
     const Init *Value = ParseValue(
@@ -4417,11 +4420,15 @@ bool TGParser::ParseFile() {
 // If necessary, replace an argument with a cast to the required type.
 // The argument count has already been checked.
 bool TGParser::CheckTemplateArgValues(
-    SmallVectorImpl<const ArgumentInit *> &Values, SMLoc Loc,
+    SmallVectorImpl<const ArgumentInit *> &Values, ArrayRef<SMLoc> ValuesLocs,
     const Record *ArgsRec) {
+  assert(Values.size() == ValuesLocs.size() &&
+         "expected as many values as locations");
+
   ArrayRef<const Init *> TArgs = ArgsRec->getTemplateArgs();
 
-  for (const ArgumentInit *&Value : Values) {
+  bool HasError = false;
+  for (auto [Value, Loc] : llvm::zip_equal(Values, ValuesLocs)) {
     const Init *ArgName = nullptr;
     if (Value->isPositional())
       ArgName = TArgs[Value->getIndex()];
@@ -4439,16 +4446,16 @@ bool TGParser::CheckTemplateArgValues(
                "result of template arg value cast has wrong type");
         Value = Value->cloneWithValue(CastValue);
       } else {
-        PrintFatalError(Loc, "Value specified for template argument '" +
-                                 Arg->getNameInitAsString() + "' is of type " +
-                                 ArgValue->getType()->getAsString() +
-                                 "; expected type " + ArgType->getAsString() +
-                                 ": " + ArgValue->getAsString());
+        HasError |= Error(
+            Loc, "Value specified for template argument '" +
+                     Arg->getNameInitAsString() + "' is of type " +
+                     ArgValue->getType()->getAsString() + "; expected type " +
+                     ArgType->getAsString() + ": " + ArgValue->getAsString());
       }
     }
   }
 
-  return false;
+  return HasError;
 }
 
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
diff --git a/llvm/lib/TableGen/TGParser.h b/llvm/lib/TableGen/TGParser.h
index cac1ba8..4509893 100644
--- a/llvm/lib/TableGen/TGParser.h
+++ b/llvm/lib/TableGen/TGParser.h
@@ -296,6 +296,7 @@ private:  // Parser methods.
   void ParseValueList(SmallVectorImpl<const Init *> &Result, Record *CurRec,
                       const RecTy *ItemType = nullptr);
   bool ParseTemplateArgValueList(SmallVectorImpl<const ArgumentInit *> &Result,
+                                 SmallVectorImpl<SMLoc> &ArgLocs,
                                  Record *CurRec, const Record *ArgsRec);
   void ParseDagArgList(
       SmallVectorImpl<std::pair<const Init *, const StringInit *>> &Result,
@@ -321,7 +322,8 @@ private:  // Parser methods.
   bool ApplyLetStack(Record *CurRec);
   bool ApplyLetStack(RecordsEntry &Entry);
   bool CheckTemplateArgValues(SmallVectorImpl<const ArgumentInit *> &Values,
-                              SMLoc Loc, const Record *ArgsRec);
+                              ArrayRef<SMLoc> ValuesLocs,
+                              const Record *ArgsRec);
 };
 
 } // end namespace llvm
diff --git a/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp b/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp
index 69d07f2..9bec782 100644
--- a/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp
+++ b/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp
@@ -2730,6 +2730,54 @@ void AArch64AsmPrinter::emitInstruction(const MachineInstr *MI) {
     EmitToStreamer(*OutStreamer, TmpInstSB);
     return;
   }
+  case AArch64::TLSDESC_AUTH_CALLSEQ: {
+    /// lower this to:
+    ///    adrp  x0, :tlsdesc_auth:var
+    ///    ldr   x16, [x0, #:tlsdesc_auth_lo12:var]
+    ///    add   x0, x0, #:tlsdesc_auth_lo12:var
+    ///    blraa x16, x0
+    ///    (TPIDR_EL0 offset now in x0)
+    const MachineOperand &MO_Sym = MI->getOperand(0);
+    MachineOperand MO_TLSDESC_LO12(MO_Sym), MO_TLSDESC(MO_Sym);
+    MCOperand SymTLSDescLo12, SymTLSDesc;
+    MO_TLSDESC_LO12.setTargetFlags(AArch64II::MO_TLS | AArch64II::MO_PAGEOFF);
+    MO_TLSDESC.setTargetFlags(AArch64II::MO_TLS | AArch64II::MO_PAGE);
+    MCInstLowering.lowerOperand(MO_TLSDESC_LO12, SymTLSDescLo12);
+    MCInstLowering.lowerOperand(MO_TLSDESC, SymTLSDesc);
+
+    MCInst Adrp;
+    Adrp.setOpcode(AArch64::ADRP);
+    Adrp.addOperand(MCOperand::createReg(AArch64::X0));
+    Adrp.addOperand(SymTLSDesc);
+    EmitToStreamer(*OutStreamer, Adrp);
+
+    MCInst Ldr;
+    Ldr.setOpcode(AArch64::LDRXui);
+    Ldr.addOperand(MCOperand::createReg(AArch64::X16));
+    Ldr.addOperand(MCOperand::createReg(AArch64::X0));
+    Ldr.addOperand(SymTLSDescLo12);
+    Ldr.addOperand(MCOperand::createImm(0));
+    EmitToStreamer(*OutStreamer, Ldr);
+
+    MCInst Add;
+    Add.setOpcode(AArch64::ADDXri);
+    Add.addOperand(MCOperand::createReg(AArch64::X0));
+    Add.addOperand(MCOperand::createReg(AArch64::X0));
+    Add.addOperand(SymTLSDescLo12);
+    Add.addOperand(MCOperand::createImm(AArch64_AM::getShiftValue(0)));
+    EmitToStreamer(*OutStreamer, Add);
+
+    // Authenticated TLSDESC accesses are not relaxed.
+    // Thus, do not emit .tlsdesccall for AUTH TLSDESC.
+
+    MCInst Blraa;
+    Blraa.setOpcode(AArch64::BLRAA);
+    Blraa.addOperand(MCOperand::createReg(AArch64::X16));
+    Blraa.addOperand(MCOperand::createReg(AArch64::X0));
+    EmitToStreamer(*OutStreamer, Blraa);
+
+    return;
+  }
   case AArch64::TLSDESC_CALLSEQ: {
     /// lower this to:
     ///    adrp  x0, :tlsdesc:var
diff --git a/llvm/lib/Target/AArch64/AArch64Combine.td b/llvm/lib/Target/AArch64/AArch64Combine.td
index 1b1d81f..ce19806 100644
--- a/llvm/lib/Target/AArch64/AArch64Combine.td
+++ b/llvm/lib/Target/AArch64/AArch64Combine.td
@@ -131,6 +131,15 @@ def ext: GICombineRule <
   (apply [{ applyEXT(*${root}, ${matchinfo}); }])
 >;
 
+def fullrev: GICombineRule <
+  (defs root:$root, shuffle_matchdata:$matchinfo),
+  (match (G_IMPLICIT_DEF $src2),
+         (G_SHUFFLE_VECTOR $src, $src1, $src2, $mask):$root,
+         [{ return ShuffleVectorInst::isReverseMask(${mask}.getShuffleMask(),
+                                                    ${mask}.getShuffleMask().size()); }]),
+  (apply [{ applyFullRev(*${root}, MRI); }])
+>;
+
 def insertelt_nonconst: GICombineRule <
   (defs root:$root, shuffle_matchdata:$matchinfo),
   (match (wip_match_opcode G_INSERT_VECTOR_ELT):$root,
@@ -163,7 +172,7 @@ def form_duplane : GICombineRule <
   (apply [{ applyDupLane(*${root}, MRI, B, ${matchinfo}); }])
 >;
 
-def shuffle_vector_lowering : GICombineGroup<[dup, rev, ext, zip, uzp, trn,
+def shuffle_vector_lowering : GICombineGroup<[dup, rev, ext, zip, uzp, trn, fullrev,
                                               form_duplane, shuf_to_ins]>;
 
 // Turn G_UNMERGE_VALUES -> G_EXTRACT_VECTOR_ELT's
diff --git a/llvm/lib/Target/AArch64/AArch64FMV.td b/llvm/lib/Target/AArch64/AArch64FMV.td
index fc7a94a..e0f56fd 100644
--- a/llvm/lib/Target/AArch64/AArch64FMV.td
+++ b/llvm/lib/Target/AArch64/AArch64FMV.td
@@ -22,64 +22,65 @@
 
 
 // Something you can add to target_version or target_clones.
-class FMVExtension<string n, string b, int p> {
+class FMVExtension<string name, string enumeration> {
     // Name, as spelled in target_version or target_clones. e.g. "memtag".
-    string Name = n;
+    string Name = name;
 
     // A C++ expression giving the number of the bit in the FMV ABI.
     // Currently this is given as a value from the enum "CPUFeatures".
-    string Bit = b;
+    string FeatureBit = "FEAT_" # enumeration;
 
     // SubtargetFeature enabled for codegen when this FMV feature is present.
-    string BackendFeature = n;
+    string BackendFeature = name;
 
-    // The FMV priority.
-    int Priority = p;
+    // A C++ expression giving the number of the priority bit.
+    // Currently this is given as a value from the enum "FeatPriorities".
+    string PriorityBit = "PRIOR_" # enumeration;
 }
 
-def : FMVExtension<"aes", "FEAT_PMULL", 150>;
-def : FMVExtension<"bf16", "FEAT_BF16", 280>;
-def : FMVExtension<"bti", "FEAT_BTI", 510>;
-def : FMVExtension<"crc", "FEAT_CRC", 110>;
-def : FMVExtension<"dit", "FEAT_DIT", 180>;
-def : FMVExtension<"dotprod", "FEAT_DOTPROD", 104>;
-let BackendFeature = "ccpp" in def : FMVExtension<"dpb", "FEAT_DPB", 190>;
-let BackendFeature = "ccdp" in def : FMVExtension<"dpb2", "FEAT_DPB2", 200>;
-def : FMVExtension<"f32mm", "FEAT_SVE_F32MM", 350>;
-def : FMVExtension<"f64mm", "FEAT_SVE_F64MM", 360>;
-def : FMVExtension<"fcma", "FEAT_FCMA", 220>;
-def : FMVExtension<"flagm", "FEAT_FLAGM", 20>;
-let BackendFeature = "altnzcv" in def : FMVExtension<"flagm2", "FEAT_FLAGM2", 30>;
-def : FMVExtension<"fp", "FEAT_FP", 90>;
-def : FMVExtension<"fp16", "FEAT_FP16", 170>;
-def : FMVExtension<"fp16fml", "FEAT_FP16FML", 175>;
-let BackendFeature = "fptoint" in def : FMVExtension<"frintts", "FEAT_FRINTTS", 250>;
-def : FMVExtension<"i8mm", "FEAT_I8MM", 270>;
-def : FMVExtension<"jscvt", "FEAT_JSCVT", 210>;
-def : FMVExtension<"ls64", "FEAT_LS64_ACCDATA", 520>;
-def : FMVExtension<"lse", "FEAT_LSE", 80>;
-def : FMVExtension<"memtag", "FEAT_MEMTAG2", 440>;
-def : FMVExtension<"mops", "FEAT_MOPS", 650>;
-def : FMVExtension<"predres", "FEAT_PREDRES", 480>;
-def : FMVExtension<"rcpc", "FEAT_RCPC", 230>;
-let BackendFeature = "rcpc-immo" in def : FMVExtension<"rcpc2", "FEAT_RCPC2", 240>;
-def : FMVExtension<"rcpc3", "FEAT_RCPC3", 241>;
-def : FMVExtension<"rdm", "FEAT_RDM", 108>;
-def : FMVExtension<"rng", "FEAT_RNG", 10>;
-def : FMVExtension<"sb", "FEAT_SB", 470>;
-def : FMVExtension<"sha2", "FEAT_SHA2", 130>;
-def : FMVExtension<"sha3", "FEAT_SHA3", 140>;
-def : FMVExtension<"simd", "FEAT_SIMD", 100>;
-def : FMVExtension<"sm4", "FEAT_SM4", 106>;
-def : FMVExtension<"sme", "FEAT_SME", 430>;
-def : FMVExtension<"sme-f64f64", "FEAT_SME_F64", 560>;
-def : FMVExtension<"sme-i16i64", "FEAT_SME_I64", 570>;
-def : FMVExtension<"sme2", "FEAT_SME2", 580>;
-def : FMVExtension<"ssbs", "FEAT_SSBS2", 490>;
-def : FMVExtension<"sve", "FEAT_SVE", 310>;
-def : FMVExtension<"sve2", "FEAT_SVE2", 370>;
-def : FMVExtension<"sve2-aes", "FEAT_SVE_PMULL128", 380>;
-def : FMVExtension<"sve2-bitperm", "FEAT_SVE_BITPERM", 400>;
-def : FMVExtension<"sve2-sha3", "FEAT_SVE_SHA3", 410>;
-def : FMVExtension<"sve2-sm4", "FEAT_SVE_SM4", 420>;
-def : FMVExtension<"wfxt", "FEAT_WFXT", 550>;
+def : FMVExtension<"aes", "PMULL">;
+def : FMVExtension<"bf16", "BF16">;
+def : FMVExtension<"bti", "BTI">;
+def : FMVExtension<"crc", "CRC">;
+def : FMVExtension<"dit", "DIT">;
+def : FMVExtension<"dotprod", "DOTPROD">;
+let BackendFeature = "ccpp" in def : FMVExtension<"dpb", "DPB">;
+let BackendFeature = "ccdp" in def : FMVExtension<"dpb2", "DPB2">;
+def : FMVExtension<"f32mm", "SVE_F32MM">;
+def : FMVExtension<"f64mm", "SVE_F64MM">;
+def : FMVExtension<"fcma", "FCMA">;
+def : FMVExtension<"flagm", "FLAGM">;
+let BackendFeature = "altnzcv" in def : FMVExtension<"flagm2", "FLAGM2">;
+def : FMVExtension<"fp", "FP">;
+def : FMVExtension<"fp16", "FP16">;
+def : FMVExtension<"fp16fml", "FP16FML">;
+let BackendFeature = "fptoint" in def : FMVExtension<"frintts", "FRINTTS">;
+def : FMVExtension<"i8mm", "I8MM">;
+def : FMVExtension<"jscvt", "JSCVT">;
+def : FMVExtension<"ls64", "LS64_ACCDATA">;
+def : FMVExtension<"lse", "LSE">;
+def : FMVExtension<"memtag", "MEMTAG2">;
+def : FMVExtension<"mops", "MOPS">;
+def : FMVExtension<"predres", "PREDRES">;
+def : FMVExtension<"rcpc", "RCPC">;
+let BackendFeature = "rcpc-immo" in def : FMVExtension<"rcpc2", "RCPC2">;
+def : FMVExtension<"rcpc3", "RCPC3">;
+def : FMVExtension<"rdm", "RDM">;
+def : FMVExtension<"rng", "RNG">;
+def : FMVExtension<"sb", "SB">;
+def : FMVExtension<"sha2", "SHA2">;
+def : FMVExtension<"sha3", "SHA3">;
+def : FMVExtension<"simd", "SIMD">;
+def : FMVExtension<"sm4", "SM4">;
+def : FMVExtension<"sme", "SME">;
+def : FMVExtension<"sme-f64f64", "SME_F64">;
+def : FMVExtension<"sme-i16i64", "SME_I64">;
+def : FMVExtension<"sme2", "SME2">;
+def : FMVExtension<"ssbs", "SSBS2">;
+def : FMVExtension<"sve", "SVE">;
+def : FMVExtension<"sve2", "SVE2">;
+def : FMVExtension<"sve2-aes", "SVE_PMULL128">;
+def : FMVExtension<"sve2-bitperm", "SVE_BITPERM">;
+def : FMVExtension<"sve2-sha3", "SVE_SHA3">;
+def : FMVExtension<"sve2-sm4", "SVE_SM4">;
+def : FMVExtension<"wfxt", "WFXT">;
diff --git a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
index ff3ca8a..6aa8cd4 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
@@ -228,6 +228,8 @@ public:
     return false;
   }
 
+  bool SelectAny(SDValue) { return true; }
+
   bool SelectDupZero(SDValue N) {
     switch(N->getOpcode()) {
     case AArch64ISD::DUP:
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index a6f8f47..3ad2905 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -753,6 +753,14 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
     setOperationAction(Op, MVT::v8bf16, Expand);
   }
 
+  // For bf16, fpextend is custom lowered to be optionally expanded into shifts.
+  setOperationAction(ISD::FP_EXTEND, MVT::f32, Custom);
+  setOperationAction(ISD::FP_EXTEND, MVT::f64, Custom);
+  setOperationAction(ISD::FP_EXTEND, MVT::v4f32, Custom);
+  setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f32, Custom);
+  setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f64, Custom);
+  setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v4f32, Custom);
+
   auto LegalizeNarrowFP = [this](MVT ScalarVT) {
     for (auto Op : {
              ISD::SETCC,
@@ -893,10 +901,6 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
       setOperationAction(Op, MVT::f16, Legal);
   }
 
-  // Strict conversion to a larger type is legal
-  for (auto VT : {MVT::f32, MVT::f64})
-    setOperationAction(ISD::STRICT_FP_EXTEND, VT, Legal);
-
   setOperationAction(ISD::PREFETCH, MVT::Other, Custom);
 
   setOperationAction(ISD::GET_ROUNDING, MVT::i32, Custom);
@@ -1183,6 +1187,8 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
   setMaxDivRemBitWidthSupported(128);
 
   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
+  if (Subtarget->hasSME())
+    setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i1, Custom);
 
   if (Subtarget->isNeonAvailable()) {
     // FIXME: v1f64 shouldn't be legal if we can avoid it, because it leads to
@@ -2669,6 +2675,7 @@ const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const {
     MAKE_CASE(AArch64ISD::CSINC)
     MAKE_CASE(AArch64ISD::THREAD_POINTER)
     MAKE_CASE(AArch64ISD::TLSDESC_CALLSEQ)
+    MAKE_CASE(AArch64ISD::TLSDESC_AUTH_CALLSEQ)
     MAKE_CASE(AArch64ISD::PROBED_ALLOCA)
     MAKE_CASE(AArch64ISD::ABDS_PRED)
     MAKE_CASE(AArch64ISD::ABDU_PRED)
@@ -4495,6 +4502,54 @@ SDValue AArch64TargetLowering::LowerFP_EXTEND(SDValue Op,
   if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()))
     return LowerFixedLengthFPExtendToSVE(Op, DAG);
 
+  bool IsStrict = Op->isStrictFPOpcode();
+  SDValue Op0 = Op.getOperand(IsStrict ? 1 : 0);
+  EVT Op0VT = Op0.getValueType();
+  if (VT == MVT::f64) {
+    // FP16->FP32 extends are legal for v32 and v4f32.
+    if (Op0VT == MVT::f32 || Op0VT == MVT::f16)
+      return Op;
+    // Split bf16->f64 extends into two fpextends.
+    if (Op0VT == MVT::bf16 && IsStrict) {
+      SDValue Ext1 =
+          DAG.getNode(ISD::STRICT_FP_EXTEND, SDLoc(Op), {MVT::f32, MVT::Other},
+                      {Op0, Op.getOperand(0)});
+      return DAG.getNode(ISD::STRICT_FP_EXTEND, SDLoc(Op), {VT, MVT::Other},
+                         {Ext1, Ext1.getValue(1)});
+    }
+    if (Op0VT == MVT::bf16)
+      return DAG.getNode(ISD::FP_EXTEND, SDLoc(Op), VT,
+                         DAG.getNode(ISD::FP_EXTEND, SDLoc(Op), MVT::f32, Op0));
+    return SDValue();
+  }
+
+  if (VT.getScalarType() == MVT::f32) {
+    // FP16->FP32 extends are legal for v32 and v4f32.
+    if (Op0VT.getScalarType() == MVT::f16)
+      return Op;
+    if (Op0VT.getScalarType() == MVT::bf16) {
+      SDLoc DL(Op);
+      EVT IVT = VT.changeTypeToInteger();
+      if (!Op0VT.isVector()) {
+        Op0 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4bf16, Op0);
+        IVT = MVT::v4i32;
+      }
+
+      EVT Op0IVT = Op0.getValueType().changeTypeToInteger();
+      SDValue Ext =
+          DAG.getNode(ISD::ANY_EXTEND, DL, IVT, DAG.getBitcast(Op0IVT, Op0));
+      SDValue Shift =
+          DAG.getNode(ISD::SHL, DL, IVT, Ext, DAG.getConstant(16, DL, IVT));
+      if (!Op0VT.isVector())
+        Shift = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Shift,
+                            DAG.getConstant(0, DL, MVT::i64));
+      Shift = DAG.getBitcast(VT, Shift);
+      return IsStrict ? DAG.getMergeValues({Shift, Op.getOperand(0)}, DL)
+                      : Shift;
+    }
+    return SDValue();
+  }
+
   assert(Op.getValueType() == MVT::f128 && "Unexpected lowering");
   return SDValue();
 }
@@ -7342,6 +7397,7 @@ SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
   case ISD::STRICT_FP_ROUND:
     return LowerFP_ROUND(Op, DAG);
   case ISD::FP_EXTEND:
+  case ISD::STRICT_FP_EXTEND:
     return LowerFP_EXTEND(Op, DAG);
   case ISD::FRAMEADDR:
     return LowerFRAMEADDR(Op, DAG);
@@ -10123,8 +10179,11 @@ SDValue AArch64TargetLowering::LowerELFTLSDescCallSeq(SDValue SymAddr,
   SDValue Chain = DAG.getEntryNode();
   SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
 
-  Chain =
-      DAG.getNode(AArch64ISD::TLSDESC_CALLSEQ, DL, NodeTys, {Chain, SymAddr});
+  unsigned Opcode =
+      DAG.getMachineFunction().getInfo<AArch64FunctionInfo>()->hasELFSignedGOT()
+          ? AArch64ISD::TLSDESC_AUTH_CALLSEQ
+          : AArch64ISD::TLSDESC_CALLSEQ;
+  Chain = DAG.getNode(Opcode, DL, NodeTys, {Chain, SymAddr});
   SDValue Glue = Chain.getValue(1);
 
   return DAG.getCopyFromReg(Chain, DL, AArch64::X0, PtrVT, Glue);
@@ -10136,8 +10195,12 @@ AArch64TargetLowering::LowerELFGlobalTLSAddress(SDValue Op,
   assert(Subtarget->isTargetELF() && "This function expects an ELF target");
 
   const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
+  AArch64FunctionInfo *MFI =
+      DAG.getMachineFunction().getInfo<AArch64FunctionInfo>();
 
-  TLSModel::Model Model = getTargetMachine().getTLSModel(GA->getGlobal());
+  TLSModel::Model Model = MFI->hasELFSignedGOT()
+                              ? TLSModel::GeneralDynamic
+                              : getTargetMachine().getTLSModel(GA->getGlobal());
 
   if (!EnableAArch64ELFLocalDynamicTLSGeneration) {
     if (Model == TLSModel::LocalDynamic)
@@ -10174,8 +10237,6 @@ AArch64TargetLowering::LowerELFGlobalTLSAddress(SDValue Op,
     // calculation.
 
     // These accesses will need deduplicating if there's more than one.
-    AArch64FunctionInfo *MFI =
-        DAG.getMachineFunction().getInfo<AArch64FunctionInfo>();
     MFI->incNumLocalDynamicTLSAccesses();
 
     // The call needs a relocation too for linker relaxation. It doesn't make
@@ -18424,7 +18485,7 @@ static SDValue performUADDVAddCombine(SDValue A, SelectionDAG &DAG) {
     EVT VT = A.getValueType();
     SDValue Op0 = A.getOperand(0);
     SDValue Op1 = A.getOperand(1);
-    if (Op0.getOpcode() != Op0.getOpcode() ||
+    if (Op0.getOpcode() != Op1.getOpcode() ||
         (Op0.getOpcode() != ISD::ZERO_EXTEND &&
          Op0.getOpcode() != ISD::SIGN_EXTEND))
       return SDValue();
@@ -21981,21 +22042,35 @@ SDValue tryLowerPartialReductionToDot(SDNode *N,
   SDLoc DL(N);
 
   SDValue Op2 = N->getOperand(2);
-  if (Op2->getOpcode() != ISD::MUL ||
-      !ISD::isExtOpcode(Op2->getOperand(0)->getOpcode()) ||
-      !ISD::isExtOpcode(Op2->getOperand(1)->getOpcode()))
-    return SDValue();
+  unsigned Op2Opcode = Op2->getOpcode();
+  SDValue MulOpLHS, MulOpRHS;
+  bool MulOpLHSIsSigned, MulOpRHSIsSigned;
+  if (ISD::isExtOpcode(Op2Opcode)) {
+    MulOpLHSIsSigned = MulOpRHSIsSigned = (Op2Opcode == ISD::SIGN_EXTEND);
+    MulOpLHS = Op2->getOperand(0);
+    MulOpRHS = DAG.getConstant(1, DL, MulOpLHS.getValueType());
+  } else if (Op2Opcode == ISD::MUL) {
+    SDValue ExtMulOpLHS = Op2->getOperand(0);
+    SDValue ExtMulOpRHS = Op2->getOperand(1);
+
+    unsigned ExtMulOpLHSOpcode = ExtMulOpLHS->getOpcode();
+    unsigned ExtMulOpRHSOpcode = ExtMulOpRHS->getOpcode();
+    if (!ISD::isExtOpcode(ExtMulOpLHSOpcode) ||
+        !ISD::isExtOpcode(ExtMulOpRHSOpcode))
+      return SDValue();
 
-  SDValue Acc = N->getOperand(1);
-  SDValue Mul = N->getOperand(2);
-  SDValue ExtMulOpLHS = Mul->getOperand(0);
-  SDValue ExtMulOpRHS = Mul->getOperand(1);
+    MulOpLHSIsSigned = ExtMulOpLHSOpcode == ISD::SIGN_EXTEND;
+    MulOpRHSIsSigned = ExtMulOpRHSOpcode == ISD::SIGN_EXTEND;
+
+    MulOpLHS = ExtMulOpLHS->getOperand(0);
+    MulOpRHS = ExtMulOpRHS->getOperand(0);
 
-  SDValue MulOpLHS = ExtMulOpLHS->getOperand(0);
-  SDValue MulOpRHS = ExtMulOpRHS->getOperand(0);
-  if (MulOpLHS.getValueType() != MulOpRHS.getValueType())
+    if (MulOpLHS.getValueType() != MulOpRHS.getValueType())
+      return SDValue();
+  } else
     return SDValue();
 
+  SDValue Acc = N->getOperand(1);
   EVT ReducedVT = N->getValueType(0);
   EVT MulSrcVT = MulOpLHS.getValueType();
 
@@ -22009,8 +22084,6 @@ SDValue tryLowerPartialReductionToDot(SDNode *N,
       !(ReducedVT == MVT::v2i32 && MulSrcVT == MVT::v8i8))
     return SDValue();
 
-  bool MulOpLHSIsSigned = ExtMulOpLHS->getOpcode() == ISD::SIGN_EXTEND;
-  bool MulOpRHSIsSigned = ExtMulOpRHS->getOpcode() == ISD::SIGN_EXTEND;
   // If the extensions are mixed, we should lower it to a usdot instead
   unsigned Opcode = 0;
   if (MulOpLHSIsSigned != MulOpRHSIsSigned) {
@@ -22026,10 +22099,8 @@ SDValue tryLowerPartialReductionToDot(SDNode *N,
     // USDOT expects the signed operand to be last
     if (!MulOpRHSIsSigned)
       std::swap(MulOpLHS, MulOpRHS);
-  } else if (MulOpLHSIsSigned)
-    Opcode = AArch64ISD::SDOT;
-  else
-    Opcode = AArch64ISD::UDOT;
+  } else
+    Opcode = MulOpLHSIsSigned ? AArch64ISD::SDOT : AArch64ISD::UDOT;
 
   // Partial reduction lowering for (nx)v16i8 to (nx)v4i64 requires an i32 dot
   // product followed by a zero / sign extension
@@ -27413,6 +27484,15 @@ void AArch64TargetLowering::ReplaceNodeResults(
       Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, V));
       return;
     }
+    case Intrinsic::aarch64_sme_in_streaming_mode: {
+      SDLoc DL(N);
+      SDValue Chain = DAG.getEntryNode();
+      SDValue RuntimePStateSM =
+          getRuntimePStateSM(DAG, Chain, DL, N->getValueType(0));
+      Results.push_back(
+          DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, RuntimePStateSM));
+      return;
+    }
     case Intrinsic::experimental_vector_match:
     case Intrinsic::get_active_lane_mask: {
       if (!VT.isFixedLengthVector() || VT.getVectorElementType() != MVT::i1)
@@ -29648,9 +29728,16 @@ bool AArch64TargetLowering::isComplexDeinterleavingOperationSupported(
 
   if (ScalarTy->isIntegerTy() && Subtarget->hasSVE2() && VTy->isScalableTy()) {
     unsigned ScalarWidth = ScalarTy->getScalarSizeInBits();
+
+    if (Operation == ComplexDeinterleavingOperation::CDot)
+      return ScalarWidth == 32 || ScalarWidth == 64;
     return 8 <= ScalarWidth && ScalarWidth <= 64;
   }
 
+  // CDot is not supported outside of scalable/sve scopes
+  if (Operation == ComplexDeinterleavingOperation::CDot)
+    return false;
+
   return (ScalarTy->isHalfTy() && Subtarget->hasFullFP16()) ||
          ScalarTy->isFloatTy() || ScalarTy->isDoubleTy();
 }
@@ -29660,6 +29747,8 @@ Value *AArch64TargetLowering::createComplexDeinterleavingIR(
     ComplexDeinterleavingRotation Rotation, Value *InputA, Value *InputB,
     Value *Accumulator) const {
   VectorType *Ty = cast<VectorType>(InputA->getType());
+  if (Accumulator == nullptr)
+    Accumulator = Constant::getNullValue(Ty);
   bool IsScalable = Ty->isScalableTy();
   bool IsInt = Ty->getElementType()->isIntegerTy();
 
@@ -29671,6 +29760,10 @@ Value *AArch64TargetLowering::createComplexDeinterleavingIR(
 
   if (TyWidth > 128) {
     int Stride = Ty->getElementCount().getKnownMinValue() / 2;
+    int AccStride = cast<VectorType>(Accumulator->getType())
+                        ->getElementCount()
+                        .getKnownMinValue() /
+                    2;
     auto *HalfTy = VectorType::getHalfElementsVectorType(Ty);
     auto *LowerSplitA = B.CreateExtractVector(HalfTy, InputA, B.getInt64(0));
     auto *LowerSplitB = B.CreateExtractVector(HalfTy, InputB, B.getInt64(0));
@@ -29680,25 +29773,26 @@ Value *AArch64TargetLowering::createComplexDeinterleavingIR(
         B.CreateExtractVector(HalfTy, InputB, B.getInt64(Stride));
     Value *LowerSplitAcc = nullptr;
     Value *UpperSplitAcc = nullptr;
-    if (Accumulator) {
-      LowerSplitAcc = B.CreateExtractVector(HalfTy, Accumulator, B.getInt64(0));
-      UpperSplitAcc =
-          B.CreateExtractVector(HalfTy, Accumulator, B.getInt64(Stride));
-    }
+    Type *FullTy = Ty;
+    FullTy = Accumulator->getType();
+    auto *HalfAccTy = VectorType::getHalfElementsVectorType(
+        cast<VectorType>(Accumulator->getType()));
+    LowerSplitAcc =
+        B.CreateExtractVector(HalfAccTy, Accumulator, B.getInt64(0));
+    UpperSplitAcc =
+        B.CreateExtractVector(HalfAccTy, Accumulator, B.getInt64(AccStride));
     auto *LowerSplitInt = createComplexDeinterleavingIR(
         B, OperationType, Rotation, LowerSplitA, LowerSplitB, LowerSplitAcc);
     auto *UpperSplitInt = createComplexDeinterleavingIR(
         B, OperationType, Rotation, UpperSplitA, UpperSplitB, UpperSplitAcc);
 
-    auto *Result = B.CreateInsertVector(Ty, PoisonValue::get(Ty), LowerSplitInt,
-                                        B.getInt64(0));
-    return B.CreateInsertVector(Ty, Result, UpperSplitInt, B.getInt64(Stride));
+    auto *Result = B.CreateInsertVector(FullTy, PoisonValue::get(FullTy),
+                                        LowerSplitInt, B.getInt64(0));
+    return B.CreateInsertVector(FullTy, Result, UpperSplitInt,
+                                B.getInt64(AccStride));
   }
 
   if (OperationType == ComplexDeinterleavingOperation::CMulPartial) {
-    if (Accumulator == nullptr)
-      Accumulator = Constant::getNullValue(Ty);
-
     if (IsScalable) {
       if (IsInt)
         return B.CreateIntrinsic(
@@ -29750,6 +29844,13 @@ Value *AArch64TargetLowering::createComplexDeinterleavingIR(
     return B.CreateIntrinsic(IntId, Ty, {InputA, InputB});
   }
 
+  if (OperationType == ComplexDeinterleavingOperation::CDot && IsInt &&
+      IsScalable) {
+    return B.CreateIntrinsic(
+        Intrinsic::aarch64_sve_cdot, Accumulator->getType(),
+        {Accumulator, InputA, InputB, B.getInt32((int)Rotation * 90)});
+  }
+
   return nullptr;
 }
 
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index 1b7f328..85b62be 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -83,6 +83,7 @@ enum NodeType : unsigned {
   // Produces the full sequence of instructions for getting the thread pointer
   // offset of a variable into X0, using the TLSDesc model.
   TLSDESC_CALLSEQ,
+  TLSDESC_AUTH_CALLSEQ,
   ADRP,     // Page address of a TargetGlobalAddress operand.
   ADR,      // ADR
   ADDlow,   // Add the low 12 bits of a TargetGlobalAddress operand.
diff --git a/llvm/lib/Target/AArch64/AArch64InstrFormats.td b/llvm/lib/Target/AArch64/AArch64InstrFormats.td
index 47c4c6c..f527f7e 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrFormats.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrFormats.td
@@ -1804,7 +1804,9 @@ class TMSystemException<bits<3> op1, string asm, list<dag> pattern>
 }
 
 class APASI : SimpleSystemI<0, (ins GPR64:$Xt), "apas", "\t$Xt">, Sched<[]> {
+  bits<5> Xt;
   let Inst{20-5} = 0b0111001110000000;
+  let Inst{4-0} = Xt;
   let DecoderNamespace = "APAS";
 }
 
@@ -2768,6 +2770,8 @@ class MulHi<bits<3> opc, string asm, SDNode OpNode>
   let Inst{23-21} = opc;
   let Inst{20-16} = Rm;
   let Inst{15}    = 0;
+  let Inst{14-10} = 0b11111;
+  let Unpredictable{14-10} = 0b11111;
   let Inst{9-5}   = Rn;
   let Inst{4-0}   = Rd;
 
@@ -4920,6 +4924,8 @@ class LoadExclusivePair<bits<2> sz, bit o2, bit L, bit o1, bit o0,
   bits<5> Rt;
   bits<5> Rt2;
   bits<5> Rn;
+  let Inst{20-16} = 0b11111;
+  let Unpredictable{20-16} = 0b11111;
   let Inst{14-10} = Rt2;
   let Inst{9-5} = Rn;
   let Inst{4-0} = Rt;
@@ -4935,6 +4941,7 @@ class BaseLoadStoreExclusiveLSUI<bits<2> sz, bit L, bit o0,
   let Inst{31-30} = sz;
   let Inst{29-23} = 0b0010010;
   let Inst{22}    = L;
+  let Inst{21}    = 0b0;
   let Inst{15}    = o0;
 }
 
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index b37f4a0..c6f5cdc 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -381,9 +381,6 @@ def NoUseScalarIncVL : Predicate<"!Subtarget->useScalarIncVL()">;
 
 def UseSVEFPLD1R : Predicate<"!Subtarget->noSVEFPLD1R()">;
 
-def UseUnaryUndefPseudos
-  : Predicate<"!(Subtarget->isSVEorStreamingSVEAvailable() && (Subtarget->hasSVE2p2() || Subtarget->hasSME2p2()))">;
-
 def AArch64LocalRecover : SDNode<"ISD::LOCAL_RECOVER",
                                   SDTypeProfile<1, 1, [SDTCisSameAs<0, 1>,
                                                        SDTCisInt<1>]>>;
@@ -886,6 +883,9 @@ def AArch64tlsdesc_callseq : SDNode<"AArch64ISD::TLSDESC_CALLSEQ",
                                     SDT_AArch64TLSDescCallSeq,
                                     [SDNPOutGlue, SDNPHasChain, SDNPVariadic]>;
 
+def AArch64tlsdesc_auth_callseq : SDNode<"AArch64ISD::TLSDESC_AUTH_CALLSEQ",
+                                    SDT_AArch64TLSDescCallSeq,
+                                    [SDNPOutGlue, SDNPHasChain, SDNPVariadic]>;
 
 def AArch64WrapperLarge : SDNode<"AArch64ISD::WrapperLarge",
                                  SDT_AArch64WrapperLarge>;
@@ -3315,8 +3315,16 @@ def TLSDESC_CALLSEQ
     : Pseudo<(outs), (ins i64imm:$sym),
              [(AArch64tlsdesc_callseq tglobaltlsaddr:$sym)]>,
       Sched<[WriteI, WriteLD, WriteI, WriteBrReg]>;
+let isCall = 1, Defs = [NZCV, LR, X0, X16], hasSideEffects = 1, Size = 16,
+    isCodeGenOnly = 1 in
+def TLSDESC_AUTH_CALLSEQ
+    : Pseudo<(outs), (ins i64imm:$sym),
+             [(AArch64tlsdesc_auth_callseq tglobaltlsaddr:$sym)]>,
+      Sched<[WriteI, WriteLD, WriteI, WriteBrReg]>;
 def : Pat<(AArch64tlsdesc_callseq texternalsym:$sym),
           (TLSDESC_CALLSEQ texternalsym:$sym)>;
+def : Pat<(AArch64tlsdesc_auth_callseq texternalsym:$sym),
+          (TLSDESC_AUTH_CALLSEQ texternalsym:$sym)>;
 
 //===----------------------------------------------------------------------===//
 // Conditional branch (immediate) instruction.
@@ -5115,22 +5123,6 @@ let Predicates = [HasFullFP16] in {
 //===----------------------------------------------------------------------===//
 
 defm FCVT : FPConversion<"fcvt">;
-// Helper to get bf16 into fp32.
-def cvt_bf16_to_fp32 :
-  OutPatFrag<(ops node:$Rn),
-             (f32 (COPY_TO_REGCLASS
-	         (i32 (UBFMWri
-		   (i32 (COPY_TO_REGCLASS (INSERT_SUBREG (f32 (IMPLICIT_DEF)),
-		                           node:$Rn, hsub), GPR32)),
-	           (i64 (i32shift_a (i64 16))),
-                   (i64 (i32shift_b (i64 16))))),
-	         FPR32))>;
-// Pattern for bf16 -> fp32.
-def : Pat<(f32 (any_fpextend (bf16 FPR16:$Rn))),
-          (cvt_bf16_to_fp32 FPR16:$Rn)>;
-// Pattern for bf16 -> fp64.
-def : Pat<(f64 (any_fpextend (bf16 FPR16:$Rn))),
-          (FCVTDSr (f32 (cvt_bf16_to_fp32 FPR16:$Rn)))>;
 
 //===----------------------------------------------------------------------===//
 // Floating point single operand instructions.
@@ -8325,8 +8317,6 @@ def : Pat<(v4i32 (anyext (v4i16 V64:$Rn))), (USHLLv4i16_shift V64:$Rn, (i32 0))>
 def : Pat<(v2i64 (sext   (v2i32 V64:$Rn))), (SSHLLv2i32_shift V64:$Rn, (i32 0))>;
 def : Pat<(v2i64 (zext   (v2i32 V64:$Rn))), (USHLLv2i32_shift V64:$Rn, (i32 0))>;
 def : Pat<(v2i64 (anyext (v2i32 V64:$Rn))), (USHLLv2i32_shift V64:$Rn, (i32 0))>;
-// Vector bf16 -> fp32 is implemented morally as a zext + shift.
-def : Pat<(v4f32 (any_fpextend (v4bf16 V64:$Rn))), (SHLLv4i16 V64:$Rn)>;
 // Also match an extend from the upper half of a 128 bit source register.
 def : Pat<(v8i16 (anyext (v8i8 (extract_high_v16i8 (v16i8 V128:$Rn)) ))),
           (USHLLv16i8_shift V128:$Rn, (i32 0))>;
diff --git a/llvm/lib/Target/AArch64/AArch64PointerAuth.cpp b/llvm/lib/Target/AArch64/AArch64PointerAuth.cpp
index a290a51..c3bc70a 100644
--- a/llvm/lib/Target/AArch64/AArch64PointerAuth.cpp
+++ b/llvm/lib/Target/AArch64/AArch64PointerAuth.cpp
@@ -144,20 +144,20 @@ void AArch64PointerAuth::signLR(MachineFunction &MF,
   // No SEH opcode for this one; it doesn't materialize into an
   // instruction on Windows.
   if (MFnI.branchProtectionPAuthLR() && Subtarget->hasPAuthLR()) {
+    emitPACCFI(*Subtarget, MBB, MBBI, DL, MachineInstr::FrameSetup, EmitCFI);
     BuildMI(MBB, MBBI, DL,
             TII->get(MFnI.shouldSignWithBKey() ? AArch64::PACIBSPPC
                                                : AArch64::PACIASPPC))
         .setMIFlag(MachineInstr::FrameSetup)
         ->setPreInstrSymbol(MF, MFnI.getSigningInstrLabel());
-    emitPACCFI(*Subtarget, MBB, MBBI, DL, MachineInstr::FrameSetup, EmitCFI);
   } else {
     BuildPACM(*Subtarget, MBB, MBBI, DL, MachineInstr::FrameSetup);
+    emitPACCFI(*Subtarget, MBB, MBBI, DL, MachineInstr::FrameSetup, EmitCFI);
     BuildMI(MBB, MBBI, DL,
             TII->get(MFnI.shouldSignWithBKey() ? AArch64::PACIBSP
                                                : AArch64::PACIASP))
         .setMIFlag(MachineInstr::FrameSetup)
         ->setPreInstrSymbol(MF, MFnI.getSigningInstrLabel());
-    emitPACCFI(*Subtarget, MBB, MBBI, DL, MachineInstr::FrameSetup, EmitCFI);
   }
 
   if (!EmitCFI && NeedsWinCFI) {
@@ -212,19 +212,19 @@ void AArch64PointerAuth::authenticateLR(
     if (MFnI->branchProtectionPAuthLR() && Subtarget->hasPAuthLR()) {
       assert(PACSym && "No PAC instruction to refer to");
       emitPACSymOffsetIntoX16(*TII, MBB, MBBI, DL, PACSym);
+      emitPACCFI(*Subtarget, MBB, MBBI, DL, MachineInstr::FrameDestroy,
+                 EmitAsyncCFI);
       BuildMI(MBB, MBBI, DL,
               TII->get(UseBKey ? AArch64::AUTIBSPPCi : AArch64::AUTIASPPCi))
           .addSym(PACSym)
           .setMIFlag(MachineInstr::FrameDestroy);
-      emitPACCFI(*Subtarget, MBB, MBBI, DL, MachineInstr::FrameDestroy,
-                 EmitAsyncCFI);
     } else {
       BuildPACM(*Subtarget, MBB, MBBI, DL, MachineInstr::FrameDestroy, PACSym);
+      emitPACCFI(*Subtarget, MBB, MBBI, DL, MachineInstr::FrameDestroy,
+                 EmitAsyncCFI);
       BuildMI(MBB, MBBI, DL,
               TII->get(UseBKey ? AArch64::AUTIBSP : AArch64::AUTIASP))
           .setMIFlag(MachineInstr::FrameDestroy);
-      emitPACCFI(*Subtarget, MBB, MBBI, DL, MachineInstr::FrameDestroy,
-                 EmitAsyncCFI);
     }
 
     if (NeedsWinCFI) {
diff --git a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
index 8b8d73d..aee54ed 100644
--- a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
@@ -979,8 +979,7 @@ defm FSCALE_2ZZ   : sme2_fp_sve_destructive_vector_vg2_single<"fscale", 0b001100
 defm FSCALE_4ZZ   : sme2_fp_sve_destructive_vector_vg4_single<"fscale", 0b0011000>;
 defm FSCALE_2Z2Z  : sme2_fp_sve_destructive_vector_vg2_multi<"fscale",  0b0011000>;
 defm FSCALE_4Z4Z  : sme2_fp_sve_destructive_vector_vg4_multi<"fscale",  0b0011000>;
-
-} // [HasSME2, HasFP8]
+}
 
 let Predicates = [HasSME2, HasFAMINMAX] in {
 defm FAMAX_2Z2Z : sme2_fp_sve_destructive_vector_vg2_multi<"famax", 0b0010100>;
@@ -988,17 +987,16 @@ defm FAMIN_2Z2Z : sme2_fp_sve_destructive_vector_vg2_multi<"famin", 0b0010101>;
 
 defm FAMAX_4Z4Z : sme2_fp_sve_destructive_vector_vg4_multi<"famax", 0b0010100>;
 defm FAMIN_4Z4Z : sme2_fp_sve_destructive_vector_vg4_multi<"famin", 0b0010101>;
-} //[HasSME2, HasFAMINMAX]
-
+}
 
 let Predicates = [HasSME_LUTv2] in {
 defm MOVT_TIZ : sme2_movt_zt_to_zt<"movt",  0b0011111, int_aarch64_sme_write_lane_zt, int_aarch64_sme_write_zt>;
 def LUTI4_4ZZT2Z    : sme2_luti4_vector_vg4<0b00, 0b00,"luti4">;
-} //[HasSME_LUTv2]
+}
 
 let Predicates = [HasSME2p1, HasSME_LUTv2] in {
 def LUTI4_S_4ZZT2Z  : sme2_luti4_vector_vg4_strided<0b00, 0b00, "luti4">;
-} //[HasSME2p1, HasSME_LUTv2]
+}
 
 let Predicates = [HasSMEF8F16] in {
 defm FVDOT_VG2_M2ZZI_BtoH : sme2_fp8_fdot_index_za16_vg1x2<"fvdot", 0b110, int_aarch64_sme_fp8_fvdot_lane_za16_vg1x2>;
@@ -1014,17 +1012,15 @@ defm FMLAL_MZZI_BtoH      : sme2_fp8_fmlal_index_za16<"fmlal",      int_aarch64_
 defm FMLAL_VG2_M2ZZI_BtoH : sme2_fp8_fmlal_index_za16_vgx2<"fmlal", int_aarch64_sme_fp8_fmlal_lane_za16_vg2x2>;
 defm FMLAL_VG4_M4ZZI_BtoH : sme2_fp8_fmlal_index_za16_vgx4<"fmlal", int_aarch64_sme_fp8_fmlal_lane_za16_vg2x4>;
 
-// FP8 FMLAL (single)
 defm FMLAL_VG2_MZZ_BtoH  : sme2_fp8_fmlal_single_za16<"fmlal", int_aarch64_sme_fp8_fmlal_single_za16_vg2x1>;
-defm FMLAL_VG2_M2ZZ_BtoH : sme2_fp_mla_long_array_vg2_single<"fmlal",  0b001, MatrixOp16, ZZ_b, ZPR4b8, nxv16i8,  int_aarch64_sme_fp8_fmlal_single_za16_vg2x2, [FPMR, FPCR]>;
+defm FMLAL_VG2_M2ZZ_BtoH : sme2_fp_mla_long_array_vg2_single<"fmlal", 0b001, MatrixOp16, ZZ_b, ZPR4b8, nxv16i8,   int_aarch64_sme_fp8_fmlal_single_za16_vg2x2, [FPMR, FPCR]>;
 defm FMLAL_VG4_M4ZZ_BtoH : sme2_fp_mla_long_array_vg4_single<"fmlal", 0b001, MatrixOp16, ZZZZ_b, ZPR4b8, nxv16i8, int_aarch64_sme_fp8_fmlal_single_za16_vg2x4, [FPMR, FPCR]>;
 
-// FP8 FMLALL (multi)
 defm FMLAL_VG2_M2Z2Z_BtoH : sme2_fp_mla_long_array_vg2_multi<"fmlal", 0b100, MatrixOp16, ZZ_b_mul_r, nxv16i8,   int_aarch64_sme_fp8_fmlal_multi_za16_vg2x2, [FPMR, FPCR]>;
 defm FMLAL_VG4_M4Z4Z_BtoH : sme2_fp_mla_long_array_vg4_multi<"fmlal", 0b100, MatrixOp16, ZZZZ_b_mul_r, nxv16i8, int_aarch64_sme_fp8_fmlal_multi_za16_vg2x4, [FPMR, FPCR]>;
 
 defm FMOPA_MPPZZ_BtoH : sme2_fp8_fmopa_za16<"fmopa", int_aarch64_sme_fp8_fmopa_za16>;
-} //[HasSMEF8F16]
+}
 
 let Predicates = [HasSMEF8F32] in {
 defm FDOT_VG2_M2ZZI_BtoS : sme2_fp8_fdot_index_za32_vg1x2<"fdot", int_aarch64_sme_fp8_fdot_lane_za32_vg1x2>;
@@ -1042,17 +1038,15 @@ defm FMLALL_MZZI_BtoS      : sme2_mla_ll_array_index_32b<"fmlall",     0b01, 0b0
 defm FMLALL_VG2_M2ZZI_BtoS : sme2_mla_ll_array_vg2_index_32b<"fmlall", 0b10, 0b100,  int_aarch64_sme_fp8_fmlall_lane_za32_vg4x2, [FPMR, FPCR]>;
 defm FMLALL_VG4_M4ZZI_BtoS : sme2_mla_ll_array_vg4_index_32b<"fmlall", 0b00, 0b1000, int_aarch64_sme_fp8_fmlall_lane_za32_vg4x4, [FPMR, FPCR]>;
 
-// FP8 FMLALL (single)
 defm FMLALL_MZZ_BtoS       : sme2_mla_ll_array_single<"fmlall", 0b01000, MatrixOp32, ZPR8, ZPR4b8, nxv16i8, int_aarch64_sme_fp8_fmlall_single_za32_vg4x1, [FPMR, FPCR]>;
 defm FMLALL_VG2_M2ZZ_BtoS  : sme2_mla_ll_array_vg2_single<"fmlall", 0b000001, MatrixOp32, ZZ_b, ZPR4b8, nxv16i8, int_aarch64_sme_fp8_fmlall_single_za32_vg4x2, [FPMR, FPCR]>;
 defm FMLALL_VG4_M4ZZ_BtoS  : sme2_mla_ll_array_vg4_single<"fmlall", 0b010001, MatrixOp32, ZZZZ_b, ZPR4b8, nxv16i8, int_aarch64_sme_fp8_fmlall_single_za32_vg4x4, [FPMR, FPCR]>;
 
-// FP8 FMLALL (multi)
 defm FMLALL_VG2_M2Z2Z_BtoS : sme2_mla_ll_array_vg2_multi<"fmlall", 0b01000, MatrixOp32, ZZ_b_mul_r, nxv16i8,   int_aarch64_sme_fp8_fmlall_multi_za32_vg4x2, [FPMR, FPCR]>;
 defm FMLALL_VG4_M4Z4Z_BtoS : sme2_mla_ll_array_vg4_multi<"fmlall", 0b01000, MatrixOp32, ZZZZ_b_mul_r, nxv16i8, int_aarch64_sme_fp8_fmlall_multi_za32_vg4x4, [FPMR, FPCR]>;
 
 defm FMOPA_MPPZZ_BtoS : sme2_fp8_fmopa_za32<"fmopa", int_aarch64_sme_fp8_fmopa_za32>;
-} //[HasSMEF8F32]
+}
 
 let Predicates = [HasSME2, HasSVEBFSCALE] in {
   defm BFSCALE : sme2_bfscale_single<"bfscale">;
@@ -1077,31 +1071,31 @@ let Predicates = [HasSME2p2] in {
 
   defm FMOP4A : sme2_fmop4as_fp16_fp32_widening<0, "fmop4a">;
   defm FMOP4S : sme2_fmop4as_fp16_fp32_widening<1, "fmop4s">;
-} // [HasSME2p2]
+}
 
 let Predicates = [HasSME2p2, HasSMEB16B16] in {
   def BFTMOPA_M2ZZZI_HtoH : sme_tmopa_16b<0b11001, ZZ_h_mul_r, ZPR16, "bftmopa">;
-} // [HasSME2p2, HasSMEB16B16]
+}
 
 let Predicates = [HasSME2p2, HasSMEF8F32], Uses = [FPMR, FPCR] in {
   def FTMOPA_M2ZZZI_BtoS : sme_tmopa_32b<0b01000, ZZ_b_mul_r, ZPR8, "ftmopa">;
-} // [HasSME2p2, HasSMEF8F32], Uses = [FPMR, FPCR]
+}
 
 let Predicates = [HasSME2p2, HasSMEF8F16], Uses = [FPMR, FPCR] in {
   def FTMOPA_M2ZZZI_BtoH : sme_tmopa_16b<0b01001, ZZ_b_mul_r, ZPR8, "ftmopa">;
   defm FMOP4A : sme2_fmop4a_fp8_fp16_2way<"fmop4a">;
-} // [HasSME2p2, HasSMEF8F16],  Uses = [FPMR, FPCR]
+}
 
 let Predicates = [HasSME2p2, HasSMEF16F16] in {
   def FTMOPA_M2ZZZI_HtoH : sme_tmopa_16b<0b10001, ZZ_h_mul_r, ZPR16, "ftmopa">;
   defm FMOP4A : sme2_fmop4as_fp16_non_widening<0, "fmop4a">;
   defm FMOP4S : sme2_fmop4as_fp16_non_widening<1, "fmop4s">;
-} // [HasSME2p2, HasSMEF16F16]
+}
 
 let Predicates = [HasSME2, HasSVEBFSCALE] in {
   defm BFMUL : sme2_bfmul_single<"bfmul">;
   defm BFMUL : sme2_bfmul_multi<"bfmul">;
-} //[HasSME2, HasSVEBFSCALE]
+}
 
 let Uses = [FPMR, FPCR] in {
 let Predicates = [HasSME2p2, HasSMEF8F32] in {
diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
index c8892de..7dd6d49 100644
--- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -675,14 +675,6 @@ let Predicates = [HasSVEorSME] in {
   defm FABS_ZPmZ : sve_int_un_pred_arit_bitwise_fp<0b100, "fabs", AArch64fabs_mt>;
   defm FNEG_ZPmZ : sve_int_un_pred_arit_bitwise_fp<0b101, "fneg", AArch64fneg_mt>;
 
-  let Predicates = [HasSVEorSME, UseUnaryUndefPseudos] in {
-    defm FABS_ZPmZ : sve_fp_un_pred_arit_hsd<AArch64fabs_mt>;
-    defm FNEG_ZPmZ : sve_fp_un_pred_arit_hsd<AArch64fneg_mt>;
-
-    defm ABS_ZPmZ : sve_int_un_pred_arit_bhsd<AArch64abs_mt>;
-    defm NEG_ZPmZ : sve_int_un_pred_arit_bhsd<AArch64neg_mt>;
-  }
-
   foreach VT = [nxv2bf16, nxv4bf16, nxv8bf16] in {
     // No dedicated instruction, so just clear the sign bit.
     def : Pat<(VT (fabs VT:$op)),
diff --git a/llvm/lib/Target/AArch64/AArch64SchedNeoverseN2.td b/llvm/lib/Target/AArch64/AArch64SchedNeoverseN2.td
index 737fc73..e23daec 100644
--- a/llvm/lib/Target/AArch64/AArch64SchedNeoverseN2.td
+++ b/llvm/lib/Target/AArch64/AArch64SchedNeoverseN2.td
@@ -512,6 +512,12 @@ def N2Write_8c_3L_4V : SchedWriteRes<[N2UnitL, N2UnitL, N2UnitL,
   let NumMicroOps = 7;
 }
 
+def N2Write_7c_7V0 : SchedWriteRes<[N2UnitV0]> {
+  let Latency     = 7;
+  let NumMicroOps = 7;
+  let ReleaseAtCycles = [7];
+}
+
 //===----------------------------------------------------------------------===//
 // Define generic 8 micro-op types
 
@@ -548,6 +554,15 @@ def N2Write_9c_4L_4V : SchedWriteRes<[N2UnitL, N2UnitL, N2UnitL, N2UnitL,
 }
 
 //===----------------------------------------------------------------------===//
+// Define generic 9 micro-op types
+
+def N2Write_9c_9V0 : SchedWriteRes<[N2UnitV0]> {
+  let Latency     = 9;
+  let NumMicroOps = 9;
+  let ReleaseAtCycles = [9];
+}
+
+//===----------------------------------------------------------------------===//
 // Define generic 10 micro-op types
 
 def N2Write_7c_5L01_5V : SchedWriteRes<[N2UnitL01, N2UnitL01, N2UnitL01,
@@ -557,6 +572,12 @@ def N2Write_7c_5L01_5V : SchedWriteRes<[N2UnitL01, N2UnitL01, N2UnitL01,
   let NumMicroOps = 10;
 }
 
+def N2Write_10c_10V0 : SchedWriteRes<[N2UnitV0]> {
+  let Latency     = 10;
+  let NumMicroOps = 10;
+  let ReleaseAtCycles = [10];
+}
+
 //===----------------------------------------------------------------------===//
 // Define generic 12 micro-op types
 
@@ -580,6 +601,21 @@ def N2Write_7c_5L01_5S_5V : SchedWriteRes<[N2UnitL01, N2UnitL01, N2UnitL01,
   let NumMicroOps = 15;
 }
 
+def N2Write_15c_15V0 : SchedWriteRes<[N2UnitV0]> {
+  let Latency     = 15;
+  let NumMicroOps = 15;
+  let ReleaseAtCycles = [15];
+}
+
+//===----------------------------------------------------------------------===//
+// Define generic 16 micro-op types
+
+def N2Write_16c_16V0 : SchedWriteRes<[N2UnitV0]> {
+  let Latency     = 16;
+  let NumMicroOps = 16;
+  let ReleaseAtCycles = [16];
+}
+
 //===----------------------------------------------------------------------===//
 // Define generic 18 micro-op types
 
@@ -795,22 +831,26 @@ def : SchedAlias<WriteF,     N2Write_2c_1V>;
 // FP compare
 def : SchedAlias<WriteFCmp,  N2Write_2c_1V0>;
 
+// FP divide and square root operations are performed using an iterative
+// algorithm and block subsequent similar operations to the same pipeline
+// until complete (Arm Neoverse N2 Software Optimization Guide, 3.14).
+
 // FP divide, square root
-def : SchedAlias<WriteFDiv,  N2Write_7c_1V0>;
+def : SchedAlias<WriteFDiv,  N2Write_7c_7V0>;
 
 // FP divide, H-form
-def : InstRW<[N2Write_7c_1V0],  (instrs FDIVHrr)>;
+def : InstRW<[N2Write_7c_7V0],  (instrs FDIVHrr)>;
 // FP divide, S-form
-def : InstRW<[N2Write_10c_1V0], (instrs FDIVSrr)>;
+def : InstRW<[N2Write_10c_10V0], (instrs FDIVSrr)>;
 // FP divide, D-form
-def : InstRW<[N2Write_15c_1V0], (instrs FDIVDrr)>;
+def : InstRW<[N2Write_15c_15V0], (instrs FDIVDrr)>;
 
 // FP square root, H-form
-def : InstRW<[N2Write_7c_1V0],  (instrs FSQRTHr)>;
+def : InstRW<[N2Write_7c_7V0],  (instrs FSQRTHr)>;
 // FP square root, S-form
-def : InstRW<[N2Write_9c_1V0],  (instrs FSQRTSr)>;
+def : InstRW<[N2Write_9c_9V0],  (instrs FSQRTSr)>;
 // FP square root, D-form
-def : InstRW<[N2Write_16c_1V0], (instrs FSQRTDr)>;
+def : InstRW<[N2Write_16c_16V0], (instrs FSQRTDr)>;
 
 // FP multiply
 def : WriteRes<WriteFMul, [N2UnitV]> { let Latency = 3; }
diff --git a/llvm/lib/Target/AArch64/AArch64SystemOperands.td b/llvm/lib/Target/AArch64/AArch64SystemOperands.td
index f22e024..355a9d2 100644
--- a/llvm/lib/Target/AArch64/AArch64SystemOperands.td
+++ b/llvm/lib/Target/AArch64/AArch64SystemOperands.td
@@ -42,10 +42,7 @@ def HasCONTEXTIDREL2
 //===----------------------------------------------------------------------===//
 
 class AT<string name, bits<3> op1, bits<4> crn, bits<4> crm,
-         bits<3> op2> : SearchableTable {
-  let SearchableFields = ["Name", "Encoding"];
-  let EnumValueField = "Encoding";
-
+         bits<3> op2> {
   string Name = name;
   bits<14> Encoding;
   let Encoding{13-11} = op1;
@@ -55,6 +52,27 @@ class AT<string name, bits<3> op1, bits<4> crn, bits<4> crm,
   code Requires = [{ {} }];
 }
 
+def ATValues : GenericEnum {
+  let FilterClass = "AT";
+  let NameField = "Name";
+  let ValueField = "Encoding";
+}
+
+def ATsList : GenericTable {
+  let FilterClass = "AT";
+  let Fields = ["Name", "Encoding", "Requires"];
+}
+
+def lookupATByName : SearchIndex {
+  let Table = ATsList;
+  let Key = ["Name"];
+}
+
+def lookupATByEncoding : SearchIndex {
+  let Table = ATsList;
+  let Key = ["Encoding"];
+}
+
 def : AT<"S1E1R",  0b000, 0b0111, 0b1000, 0b000>;
 def : AT<"S1E2R",  0b100, 0b0111, 0b1000, 0b000>;
 def : AT<"S1E3R",  0b110, 0b0111, 0b1000, 0b000>;
@@ -82,14 +100,32 @@ def : AT<"S1E3A", 0b110, 0b0111, 0b1001, 0b010>;
 // DMB/DSB (data barrier) instruction options.
 //===----------------------------------------------------------------------===//
 
-class DB<string name, bits<4> encoding> : SearchableTable {
-  let SearchableFields = ["Name", "Encoding"];
-  let EnumValueField = "Encoding";
-
+class DB<string name, bits<4> encoding> {
   string Name = name;
   bits<4> Encoding = encoding;
 }
 
+def DBValues : GenericEnum {
+  let FilterClass = "DB";
+  let NameField = "Name";
+  let ValueField = "Encoding";
+}
+
+def DBsList : GenericTable {
+  let FilterClass = "DB";
+  let Fields = ["Name", "Encoding"];
+}
+
+def lookupDBByName : SearchIndex {
+  let Table = DBsList;
+  let Key = ["Name"];
+}
+
+def lookupDBByEncoding : SearchIndex {
+  let Table = DBsList;
+  let Key = ["Encoding"];
+}
+
 def : DB<"oshld", 0x1>;
 def : DB<"oshst", 0x2>;
 def : DB<"osh",   0x3>;
@@ -103,16 +139,39 @@ def : DB<"ld",    0xd>;
 def : DB<"st",    0xe>;
 def : DB<"sy",    0xf>;
 
-class DBnXS<string name, bits<4> encoding, bits<5> immValue> : SearchableTable {
-  let SearchableFields = ["Name", "Encoding", "ImmValue"];
-  let EnumValueField = "Encoding";
-
+class DBnXS<string name, bits<4> encoding, bits<5> immValue> {
   string Name = name;
   bits<4> Encoding = encoding;
   bits<5> ImmValue = immValue;
   code Requires = [{ {AArch64::FeatureXS} }];
 }
 
+def DBnXSValues : GenericEnum {
+  let FilterClass = "DBnXS";
+  let NameField = "Name";
+  let ValueField = "Encoding";
+}
+
+def DBnXSsList : GenericTable {
+  let FilterClass = "DBnXS";
+  let Fields = ["Name", "Encoding", "ImmValue", "Requires"];
+}
+
+def lookupDBnXSByName : SearchIndex {
+  let Table = DBnXSsList;
+  let Key = ["Name"];
+}
+
+def lookupDBnXSByEncoding : SearchIndex {
+  let Table = DBnXSsList;
+  let Key = ["Encoding"];
+}
+
+def lookupDBnXSByImmValue : SearchIndex {
+  let Table = DBnXSsList;
+  let Key = ["ImmValue"];
+}
+
 def : DBnXS<"oshnxs", 0x3, 0x10>;
 def : DBnXS<"nshnxs", 0x7, 0x14>;
 def : DBnXS<"ishnxs", 0xb, 0x18>;
@@ -123,10 +182,7 @@ def : DBnXS<"synxs",  0xf, 0x1c>;
 //===----------------------------------------------------------------------===//
 
 class DC<string name, bits<3> op1, bits<4> crn, bits<4> crm,
-         bits<3> op2> : SearchableTable {
-  let SearchableFields = ["Name", "Encoding"];
-  let EnumValueField = "Encoding";
-
+         bits<3> op2> {
   string Name = name;
   bits<14> Encoding;
   let Encoding{13-11} = op1;
@@ -136,6 +192,27 @@ class DC<string name, bits<3> op1, bits<4> crn, bits<4> crm,
   code Requires = [{ {} }];
 }
 
+def DCValues : GenericEnum {
+  let FilterClass = "DC";
+  let NameField = "Name";
+  let ValueField = "Encoding";
+}
+
+def DCsList : GenericTable {
+  let FilterClass = "DC";
+  let Fields = ["Name", "Encoding", "Requires"];
+}
+
+def lookupDCByName : SearchIndex {
+  let Table = DCsList;
+  let Key = ["Name"];
+}
+
+def lookupDCByEncoding : SearchIndex {
+  let Table = DCsList;
+  let Key = ["Encoding"];
+}
+
 def : DC<"ZVA",   0b011, 0b0111, 0b0100, 0b001>;
 def : DC<"IVAC",  0b000, 0b0111, 0b0110, 0b001>;
 def : DC<"ISW",   0b000, 0b0111, 0b0110, 0b010>;
@@ -193,10 +270,7 @@ def : DC<"CGDVAOC",  0b011, 0b0111, 0b1011, 0b111>;
 //===----------------------------------------------------------------------===//
 
 class IC<string name, bits<3> op1, bits<4> crn, bits<4> crm, bits<3> op2,
-         bit needsreg> : SearchableTable {
-  let SearchableFields = ["Name", "Encoding"];
-  let EnumValueField = "Encoding";
-
+         bit needsreg> {
   string Name = name;
   bits<14> Encoding;
   let Encoding{13-11} = op1;
@@ -206,6 +280,27 @@ class IC<string name, bits<3> op1, bits<4> crn, bits<4> crm, bits<3> op2,
   bit NeedsReg = needsreg;
 }
 
+def ICValues : GenericEnum {
+  let FilterClass = "IC";
+  let NameField = "Name";
+  let ValueField = "Encoding";
+}
+
+def ICsList : GenericTable {
+  let FilterClass = "IC";
+  let Fields = ["Name", "Encoding", "NeedsReg"];
+}
+
+def lookupICByName : SearchIndex {
+  let Table = ICsList;
+  let Key = ["Name"];
+}
+
+def lookupICByEncoding : SearchIndex {
+  let Table = ICsList;
+  let Key = ["Encoding"];
+}
+
 def : IC<"IALLUIS", 0b000, 0b0111, 0b0001, 0b000, 0>;
 def : IC<"IALLU",   0b000, 0b0111, 0b0101, 0b000, 0>;
 def : IC<"IVAU",    0b011, 0b0111, 0b0101, 0b001, 1>;
@@ -214,25 +309,40 @@ def : IC<"IVAU",    0b011, 0b0111, 0b0101, 0b001, 1>;
 // ISB (instruction-fetch barrier) instruction options.
 //===----------------------------------------------------------------------===//
 
-class ISB<string name, bits<4> encoding> : SearchableTable{
-  let SearchableFields = ["Name", "Encoding"];
-  let EnumValueField = "Encoding";
-
+class ISB<string name, bits<4> encoding> {
   string Name = name;
   bits<4> Encoding;
   let Encoding = encoding;
 }
 
+def ISBValues : GenericEnum {
+  let FilterClass = "ISB";
+  let NameField = "Name";
+  let ValueField = "Encoding";
+}
+
+def ISBsList : GenericTable {
+  let FilterClass = "ISB";
+  let Fields = ["Name", "Encoding"];
+}
+
+def lookupISBByName : SearchIndex {
+  let Table = ISBsList;
+  let Key = ["Name"];
+}
+
+def lookupISBByEncoding : SearchIndex {
+  let Table = ISBsList;
+  let Key = ["Encoding"];
+}
+
 def : ISB<"sy", 0xf>;
 
 //===----------------------------------------------------------------------===//
 // TSB (Trace synchronization barrier) instruction options.
 //===----------------------------------------------------------------------===//
 
-class TSB<string name, bits<4> encoding> : SearchableTable{
-  let SearchableFields = ["Name", "Encoding"];
-  let EnumValueField = "Encoding";
-
+class TSB<string name, bits<4> encoding> {
   string Name = name;
   bits<4> Encoding;
   let Encoding = encoding;
@@ -240,6 +350,27 @@ class TSB<string name, bits<4> encoding> : SearchableTable{
   code Requires = [{ {AArch64::FeatureTRACEV8_4} }];
 }
 
+def TSBValues : GenericEnum {
+  let FilterClass = "TSB";
+  let NameField = "Name";
+  let ValueField = "Encoding";
+}
+
+def TSBsList : GenericTable {
+  let FilterClass = "TSB";
+  let Fields = ["Name", "Encoding", "Requires"];
+}
+
+def lookupTSBByName : SearchIndex {
+  let Table = TSBsList;
+  let Key = ["Name"];
+}
+
+def lookupTSBByEncoding : SearchIndex {
+  let Table = TSBsList;
+  let Key = ["Encoding"];
+}
+
 def : TSB<"csync", 0>;
 
 //===----------------------------------------------------------------------===//
@@ -248,10 +379,7 @@ def : TSB<"csync", 0>;
 
 class PRFM<string type,   bits<2> type_encoding,
            string target, bits<2> target_encoding,
-           string policy, bits<1> policy_encoding> : SearchableTable {
-  let SearchableFields = ["Name", "Encoding"];
-  let EnumValueField = "Encoding";
-
+           string policy, bits<1> policy_encoding> {
   string Name = type # target # policy;
   bits<5> Encoding;
   let Encoding{4-3} = type_encoding;
@@ -261,6 +389,27 @@ class PRFM<string type,   bits<2> type_encoding,
   code Requires = [{ {} }];
 }
 
+def PRFMValues : GenericEnum {
+  let FilterClass = "PRFM";
+  let NameField = "Name";
+  let ValueField = "Encoding";
+}
+
+def PRFMsList : GenericTable {
+  let FilterClass = "PRFM";
+  let Fields = ["Name", "Encoding", "Requires"];
+}
+
+def lookupPRFMByName : SearchIndex {
+  let Table = PRFMsList;
+  let Key = ["Name"];
+}
+
+def lookupPRFMByEncoding : SearchIndex {
+  let Table = PRFMsList;
+  let Key = ["Encoding"];
+}
+
 def : PRFM<"pld", 0b00, "l1",  0b00, "keep", 0b0>;
 def : PRFM<"pld", 0b00, "l1",  0b00, "strm", 0b1>;
 def : PRFM<"pld", 0b00, "l2",  0b01, "keep", 0b0>;
@@ -296,16 +445,34 @@ def : PRFM<"pst", 0b10, "slc", 0b11, "strm", 0b1>;
 // SVE Prefetch instruction options.
 //===----------------------------------------------------------------------===//
 
-class SVEPRFM<string name, bits<4> encoding> : SearchableTable {
-  let SearchableFields = ["Name", "Encoding"];
-  let EnumValueField = "Encoding";
-
+class SVEPRFM<string name, bits<4> encoding> {
   string Name = name;
   bits<4> Encoding;
   let Encoding = encoding;
   code Requires = [{ {} }];
 }
 
+def SVEPRFMValues : GenericEnum {
+  let FilterClass = "SVEPRFM";
+  let NameField = "Name";
+  let ValueField = "Encoding";
+}
+
+def SVEPRFMsList : GenericTable {
+  let FilterClass = "SVEPRFM";
+  let Fields = ["Name", "Encoding", "Requires"];
+}
+
+def lookupSVEPRFMByName : SearchIndex {
+  let Table = SVEPRFMsList;
+  let Key = ["Name"];
+}
+
+def lookupSVEPRFMByEncoding : SearchIndex {
+  let Table = SVEPRFMsList;
+  let Key = ["Encoding"];
+}
+
 let Requires = [{ {AArch64::FeatureSVE} }] in {
 def : SVEPRFM<"pldl1keep", 0x00>;
 def : SVEPRFM<"pldl1strm", 0x01>;
@@ -325,10 +492,7 @@ def : SVEPRFM<"pstl3strm", 0x0d>;
 // RPRFM (prefetch) instruction options.
 //===----------------------------------------------------------------------===//
 
-class RPRFM<string name, bits<1> type_encoding, bits<5> policy_encoding> : SearchableTable {
-  let SearchableFields = ["Name", "Encoding"];
-  let EnumValueField = "Encoding";
-
+class RPRFM<string name, bits<1> type_encoding, bits<5> policy_encoding> {
   string Name = name;
   bits<6> Encoding;
   let Encoding{0} = type_encoding;
@@ -336,6 +500,27 @@ class RPRFM<string name, bits<1> type_encoding, bits<5> policy_encoding> : Searc
   code Requires = [{ {} }];
 }
 
+def RPRFMValues : GenericEnum {
+  let FilterClass = "RPRFM";
+  let NameField = "Name";
+  let ValueField = "Encoding";
+}
+
+def RPRFMsList : GenericTable {
+  let FilterClass = "RPRFM";
+  let Fields = ["Name", "Encoding", "Requires"];
+}
+
+def lookupRPRFMByName : SearchIndex {
+  let Table = RPRFMsList;
+  let Key = ["Name"];
+}
+
+def lookupRPRFMByEncoding : SearchIndex {
+  let Table = RPRFMsList;
+  let Key = ["Encoding"];
+}
+
 def : RPRFM<"pldkeep", 0b0, 0b00000>;
 def : RPRFM<"pstkeep", 0b1, 0b00000>;
 def : RPRFM<"pldstrm", 0b0, 0b00010>;
@@ -345,15 +530,33 @@ def : RPRFM<"pststrm", 0b1, 0b00010>;
 // SVE Predicate patterns
 //===----------------------------------------------------------------------===//
 
-class SVEPREDPAT<string name, bits<5> encoding> : SearchableTable {
-  let SearchableFields = ["Name", "Encoding"];
-  let EnumValueField = "Encoding";
-
+class SVEPREDPAT<string name, bits<5> encoding> {
   string Name = name;
   bits<5> Encoding;
   let Encoding = encoding;
 }
 
+def SVEPREDPATValues : GenericEnum {
+  let FilterClass = "SVEPREDPAT";
+  let NameField = "Name";
+  let ValueField = "Encoding";
+}
+
+def SVEPREDPATsList : GenericTable {
+  let FilterClass = "SVEPREDPAT";
+  let Fields = ["Name", "Encoding"];
+}
+
+def lookupSVEPREDPATByName : SearchIndex {
+  let Table = SVEPREDPATsList;
+  let Key = ["Name"];
+}
+
+def lookupSVEPREDPATByEncoding : SearchIndex {
+  let Table = SVEPREDPATsList;
+  let Key = ["Encoding"];
+}
+
 def : SVEPREDPAT<"pow2",  0x00>;
 def : SVEPREDPAT<"vl1",   0x01>;
 def : SVEPREDPAT<"vl2",   0x02>;
@@ -376,15 +579,33 @@ def : SVEPREDPAT<"all",   0x1f>;
 // SVE Predicate-as-counter patterns
 //===----------------------------------------------------------------------===//
 
-class SVEVECLENSPECIFIER<string name, bits<1> encoding> : SearchableTable {
-  let SearchableFields = ["Name", "Encoding"];
-  let EnumValueField = "Encoding";
-
+class SVEVECLENSPECIFIER<string name, bits<1> encoding> {
   string Name = name;
   bits<1> Encoding;
   let Encoding = encoding;
 }
 
+def SVEVECLENSPECIFIERValues : GenericEnum {
+  let FilterClass = "SVEVECLENSPECIFIER";
+  let NameField = "Name";
+  let ValueField = "Encoding";
+}
+
+def SVEVECLENSPECIFIERsList : GenericTable {
+  let FilterClass = "SVEVECLENSPECIFIER";
+  let Fields = ["Name", "Encoding"];
+}
+
+def lookupSVEVECLENSPECIFIERByName : SearchIndex {
+  let Table = SVEVECLENSPECIFIERsList;
+  let Key = ["Name"];
+}
+
+def lookupSVEVECLENSPECIFIERByEncoding : SearchIndex {
+  let Table = SVEVECLENSPECIFIERsList;
+  let Key = ["Encoding"];
+}
+
 def : SVEVECLENSPECIFIER<"vlx2", 0x0>;
 def : SVEVECLENSPECIFIER<"vlx4", 0x1>;
 
@@ -395,15 +616,28 @@ def : SVEVECLENSPECIFIER<"vlx4", 0x1>;
 // is used for a few instructions that only accept a limited set of exact FP
 // immediates values.
 //===----------------------------------------------------------------------===//
-class ExactFPImm<string name, string repr, bits<4> enum > : SearchableTable {
-  let SearchableFields = ["Enum", "Repr"];
-  let EnumValueField = "Enum";
-
+class ExactFPImm<string name, string repr, bits<4> enum > {
   string Name = name;
   bits<4> Enum = enum;
   string Repr = repr;
 }
 
+def ExactFPImmValues : GenericEnum {
+  let FilterClass = "ExactFPImm";
+  let NameField = "Name";
+  let ValueField = "Enum";
+}
+
+def ExactFPImmsList : GenericTable {
+  let FilterClass = "ExactFPImm";
+  let Fields = ["Enum", "Repr"];
+}
+
+def lookupExactFPImmByEnum : SearchIndex {
+  let Table = ExactFPImmsList;
+  let Key = ["Enum"];
+}
+
 def : ExactFPImm<"zero", "0.0", 0x0>;
 def : ExactFPImm<"half", "0.5", 0x1>;
 def : ExactFPImm<"one",  "1.0", 0x2>;
@@ -413,10 +647,7 @@ def : ExactFPImm<"two",  "2.0", 0x3>;
 // PState instruction options.
 //===----------------------------------------------------------------------===//
 
-class PStateImm0_15<string name, bits<3> op1, bits<3> op2> : SearchableTable {
-  let SearchableFields = ["Name", "Encoding"];
-  let EnumValueField = "Encoding";
-
+class PStateImm0_15<string name, bits<3> op1, bits<3> op2> {
   string Name = name;
   bits<6> Encoding;
   let Encoding{5-3} = op1;
@@ -424,10 +655,28 @@ class PStateImm0_15<string name, bits<3> op1, bits<3> op2> : SearchableTable {
   code Requires = [{ {} }];
 }
 
-class PStateImm0_1<string name, bits<3> op1, bits<3> op2, bits<3> crm_high> : SearchableTable {
-  let SearchableFields = ["Name", "Encoding"];
-  let EnumValueField = "Encoding";
+def PStateImm0_15Values : GenericEnum {
+  let FilterClass = "PStateImm0_15";
+  let NameField = "Name";
+  let ValueField = "Encoding";
+}
+
+def PStateImm0_15sList : GenericTable {
+  let FilterClass = "PStateImm0_15";
+  let Fields = ["Name", "Encoding", "Requires"];
+}
 
+def lookupPStateImm0_15ByName : SearchIndex {
+  let Table = PStateImm0_15sList;
+  let Key = ["Name"];
+}
+
+def lookupPStateImm0_15ByEncoding : SearchIndex {
+  let Table = PStateImm0_15sList;
+  let Key = ["Encoding"];
+}
+
+class PStateImm0_1<string name, bits<3> op1, bits<3> op2, bits<3> crm_high> {
   string Name = name;
   bits<9> Encoding;
   let Encoding{8-6} = crm_high;
@@ -436,6 +685,27 @@ class PStateImm0_1<string name, bits<3> op1, bits<3> op2, bits<3> crm_high> : Se
   code Requires = [{ {} }];
 }
 
+def PStateImm0_1Values : GenericEnum {
+  let FilterClass = "PStateImm0_1";
+  let NameField = "Name";
+  let ValueField = "Encoding";
+}
+
+def PStateImm0_1sList : GenericTable {
+  let FilterClass = "PStateImm0_1";
+  let Fields = ["Name", "Encoding", "Requires"];
+}
+
+def lookupPStateImm0_1ByName : SearchIndex {
+  let Table = PStateImm0_1sList;
+  let Key = ["Name"];
+}
+
+def lookupPStateImm0_1ByEncoding : SearchIndex {
+  let Table = PStateImm0_1sList;
+  let Key = ["Encoding"];
+}
+
 //                   Name,     Op1,   Op2
 def : PStateImm0_15<"SPSel",   0b000, 0b101>;
 def : PStateImm0_15<"DAIFSet", 0b011, 0b110>;
@@ -467,16 +737,34 @@ def : PStateImm0_1<"PM",       0b001, 0b000, 0b001>;
 // SVCR instruction options.
 //===----------------------------------------------------------------------===//
 
-class SVCR<string name, bits<3> encoding> : SearchableTable {
-  let SearchableFields = ["Name", "Encoding"];
-  let EnumValueField = "Encoding";
-
+class SVCR<string name, bits<3> encoding> {
   string Name = name;
   bits<3> Encoding;
   let Encoding = encoding;
   code Requires = [{ {} }];
 }
 
+def SVCRValues : GenericEnum {
+  let FilterClass = "SVCR";
+  let NameField = "Name";
+  let ValueField = "Encoding";
+}
+
+def SVCRsList : GenericTable {
+  let FilterClass = "SVCR";
+  let Fields = ["Name", "Encoding", "Requires"];
+}
+
+def lookupSVCRByName : SearchIndex {
+  let Table = SVCRsList;
+  let Key = ["Name"];
+}
+
+def lookupSVCRByEncoding : SearchIndex {
+  let Table = SVCRsList;
+  let Key = ["Encoding"];
+}
+
 let Requires = [{ {AArch64::FeatureSME} }] in {
 def : SVCR<"SVCRSM",   0b001>;
 def : SVCR<"SVCRZA",   0b010>;
@@ -487,30 +775,66 @@ def : SVCR<"SVCRSMZA", 0b011>;
 // PSB instruction options.
 //===----------------------------------------------------------------------===//
 
-class PSB<string name, bits<5> encoding> : SearchableTable {
-  let SearchableFields = ["Name", "Encoding"];
-  let EnumValueField = "Encoding";
-
+class PSB<string name, bits<5> encoding> {
   string Name = name;
   bits<5> Encoding;
   let Encoding = encoding;
 }
 
+def PSBValues : GenericEnum {
+  let FilterClass = "PSB";
+  let NameField = "Name";
+  let ValueField = "Encoding";
+}
+
+def PSBsList : GenericTable {
+  let FilterClass = "PSB";
+  let Fields = ["Name", "Encoding"];
+}
+
+def lookupPSBByName : SearchIndex {
+  let Table = PSBsList;
+  let Key = ["Name"];
+}
+
+def lookupPSBByEncoding : SearchIndex {
+  let Table = PSBsList;
+  let Key = ["Encoding"];
+}
+
 def : PSB<"csync", 0x11>;
 
 //===----------------------------------------------------------------------===//
 // BTI instruction options.
 //===----------------------------------------------------------------------===//
 
-class BTI<string name, bits<3> encoding> : SearchableTable {
-  let SearchableFields = ["Name", "Encoding"];
-  let EnumValueField = "Encoding";
-
+class BTI<string name, bits<3> encoding> {
   string Name = name;
   bits<3> Encoding;
   let Encoding = encoding;
 }
 
+def BTIValues : GenericEnum {
+  let FilterClass = "BTI";
+  let NameField = "Name";
+  let ValueField = "Encoding";
+}
+
+def BTIsList : GenericTable {
+  let FilterClass = "BTI";
+  let Fields = ["Name", "Encoding"];
+}
+
+def lookupBTIByName : SearchIndex {
+  let Table = BTIsList;
+  let Key = ["Name"];
+}
+
+def lookupBTIByEncoding : SearchIndex {
+  let Table = BTIsList;
+  let Key = ["Encoding"];
+}
+
 def : BTI<"c",  0b010>;
 def : BTI<"j",  0b100>;
 def : BTI<"jc", 0b110>;
@@ -667,12 +991,8 @@ defm : TLBI<"VMALLWS2E1OS",  0b100, 0b1000, 0b0101, 0b010, 0>;
 //===----------------------------------------------------------------------===//
 
 class SysReg<string name, bits<2> op0, bits<3> op1, bits<4> crn, bits<4> crm,
-             bits<3> op2> : SearchableTable {
-  let SearchableFields = ["Name", "Encoding"];
-  let EnumValueField = "Encoding";
-
+             bits<3> op2> {
   string Name = name;
-  string AltName = name;
   bits<16> Encoding;
   let Encoding{15-14} = op0;
   let Encoding{13-11} = op1;
@@ -684,6 +1004,26 @@ class SysReg<string name, bits<2> op0, bits<3> op1, bits<4> crn, bits<4> crm,
   code Requires = [{ {} }];
 }
 
+def SysRegValues : GenericEnum {
+  let FilterClass = "SysReg";
+  let NameField = "Name";
+  let ValueField = "Encoding";
+}
+
+def SysRegsList : GenericTable {
+  let FilterClass = "SysReg";
+  let Fields = ["Name", "Encoding", "Readable", "Writeable", "Requires"];
+
+  let PrimaryKey = ["Encoding"];
+  let PrimaryKeyName = "lookupSysRegByEncoding";
+  let PrimaryKeyReturnRange = true;
+}
+
+def lookupSysRegByName : SearchIndex {
+  let Table = SysRegsList;
+  let Key = ["Name"];
+}
+
 class RWSysReg<string name, bits<2> op0, bits<3> op1, bits<4> crn, bits<4> crm,
                bits<3> op2>
     : SysReg<name, op0, op1, crn, crm, op2> {
@@ -969,9 +1309,7 @@ def : RWSysReg<"TTBR0_EL1",          0b11, 0b000, 0b0010, 0b0000, 0b000>;
 def : RWSysReg<"TTBR0_EL3",          0b11, 0b110, 0b0010, 0b0000, 0b000>;
 
 let Requires = [{ {AArch64::FeatureEL2VMSA} }] in {
-def : RWSysReg<"TTBR0_EL2",          0b11, 0b100, 0b0010, 0b0000, 0b000> {
-  let AltName = "VSCTLR_EL2";
-}
+def : RWSysReg<"TTBR0_EL2",          0b11, 0b100, 0b0010, 0b0000, 0b000>;
 def : RWSysReg<"VTTBR_EL2",          0b11, 0b100, 0b0010, 0b0001, 0b000>;
 }
 
@@ -1358,9 +1696,7 @@ def : RWSysReg<"ICH_LR15_EL2",       0b11, 0b100, 0b1100, 0b1101, 0b111>;
 let Requires = [{ {AArch64::HasV8_0rOps} }] in {
 //Virtualization System Control Register
 //                                 Op0   Op1    CRn     CRm     Op2
-def : RWSysReg<"VSCTLR_EL2",       0b11, 0b100, 0b0010, 0b0000, 0b000> {
-  let AltName = "TTBR0_EL2";
-}
+def : RWSysReg<"VSCTLR_EL2",       0b11, 0b100, 0b0010, 0b0000, 0b000>;
 
 //MPU Type Register
 //                                 Op0   Op1    CRn     CRm     Op2
@@ -2026,12 +2362,8 @@ def : RWSysReg<"ACTLRALIAS_EL1",  0b11, 0b000, 0b0001, 0b0100, 0b101>;
 //===----------------------------------------------------------------------===//
 
 class PHint<bits<2> op0, bits<3> op1, bits<4> crn, bits<4> crm,
-              bits<3> op2, string name> : SearchableTable {
-  let SearchableFields = ["Name", "Encoding"];
-  let EnumValueField = "Encoding";
-
+              bits<3> op2, string name> {
   string Name = name;
-  string AltName = name;
   bits<16> Encoding;
   let Encoding{15-14} = op0;
   let Encoding{13-11} = op1;
@@ -2041,6 +2373,27 @@ class PHint<bits<2> op0, bits<3> op1, bits<4> crn, bits<4> crm,
   code Requires = [{ {} }];
 }
 
+def PHintValues : GenericEnum {
+  let FilterClass = "PHint";
+  let NameField = "Name";
+  let ValueField = "Encoding";
+}
+
+def PHintsList : GenericTable {
+  let FilterClass = "PHint";
+  let Fields = ["Name", "Encoding", "Requires"];
+}
+
+def lookupPHintByName : SearchIndex {
+  let Table = PHintsList;
+  let Key = ["Name"];
+}
+
+def lookupPHintByEncoding : SearchIndex {
+  let Table = PHintsList;
+  let Key = ["Encoding"];
+}
+
 let Requires = [{ {AArch64::FeaturePCDPHINT} }] in {
   def KEEP : PHint<0b00, 0b000, 0b0000, 0b0000, 0b000, "keep">;
   def STRM : PHint<0b00, 0b000, 0b0000, 0b0000, 0b001, "strm">;
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 0566a87..25b6731 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -35,6 +35,9 @@ using namespace llvm::PatternMatch;
 static cl::opt<bool> EnableFalkorHWPFUnrollFix("enable-falkor-hwpf-unroll-fix",
                                                cl::init(true), cl::Hidden);
 
+static cl::opt<bool> SVEPreferFixedOverScalableIfEqualCost(
+    "sve-prefer-fixed-over-scalable-if-equal", cl::Hidden);
+
 static cl::opt<unsigned> SVEGatherOverhead("sve-gather-overhead", cl::init(10),
                                            cl::Hidden);
 
@@ -256,7 +259,7 @@ bool AArch64TTIImpl::areInlineCompatible(const Function *Caller,
     CalleeAttrs.set(SMEAttrs::SM_Enabled, true);
   }
 
-  if (CalleeAttrs.isNewZA())
+  if (CalleeAttrs.isNewZA() || CalleeAttrs.isNewZT0())
     return false;
 
   if (CallerAttrs.requiresLazySave(CalleeAttrs) ||
@@ -1635,10 +1638,8 @@ instCombineSVEVectorBinOp(InstCombiner &IC, IntrinsicInst &II) {
       !match(OpPredicate, m_Intrinsic<Intrinsic::aarch64_sve_ptrue>(
                               m_ConstantInt<AArch64SVEPredPattern::all>())))
     return std::nullopt;
-  IRBuilderBase::FastMathFlagGuard FMFGuard(IC.Builder);
-  IC.Builder.setFastMathFlags(II.getFastMathFlags());
-  auto BinOp =
-      IC.Builder.CreateBinOp(BinOpCode, II.getOperand(1), II.getOperand(2));
+  auto BinOp = IC.Builder.CreateBinOpFMF(
+      BinOpCode, II.getOperand(1), II.getOperand(2), II.getFastMathFlags());
   return IC.replaceInstUsesWith(II, BinOp);
 }
 
@@ -2760,6 +2761,21 @@ InstructionCost AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
     return AdjustCost(
         BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
 
+  static const TypeConversionCostTblEntry BF16Tbl[] = {
+      {ISD::FP_ROUND, MVT::bf16, MVT::f32, 1},     // bfcvt
+      {ISD::FP_ROUND, MVT::bf16, MVT::f64, 1},     // bfcvt
+      {ISD::FP_ROUND, MVT::v4bf16, MVT::v4f32, 1}, // bfcvtn
+      {ISD::FP_ROUND, MVT::v8bf16, MVT::v8f32, 2}, // bfcvtn+bfcvtn2
+      {ISD::FP_ROUND, MVT::v2bf16, MVT::v2f64, 2}, // bfcvtn+fcvtn
+      {ISD::FP_ROUND, MVT::v4bf16, MVT::v4f64, 3}, // fcvtn+fcvtl2+bfcvtn
+      {ISD::FP_ROUND, MVT::v8bf16, MVT::v8f64, 6}, // 2 * fcvtn+fcvtn2+bfcvtn
+  };
+
+  if (ST->hasBF16())
+    if (const auto *Entry = ConvertCostTableLookup(
+            BF16Tbl, ISD, DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
+      return AdjustCost(Entry->Cost);
+
   static const TypeConversionCostTblEntry ConversionTbl[] = {
       {ISD::TRUNCATE, MVT::v2i8, MVT::v2i64, 1},    // xtn
       {ISD::TRUNCATE, MVT::v2i16, MVT::v2i64, 1},   // xtn
@@ -2847,6 +2863,14 @@ InstructionCost AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
       {ISD::FP_EXTEND, MVT::v2f64, MVT::v2f16, 2}, // fcvtl+fcvtl
       {ISD::FP_EXTEND, MVT::v4f64, MVT::v4f16, 3}, // fcvtl+fcvtl2+fcvtl
       {ISD::FP_EXTEND, MVT::v8f64, MVT::v8f16, 6}, // 2 * fcvtl+fcvtl2+fcvtl
+      //   BF16 (uses shift)
+      {ISD::FP_EXTEND, MVT::f32, MVT::bf16, 1},     // shl
+      {ISD::FP_EXTEND, MVT::f64, MVT::bf16, 2},     // shl+fcvt
+      {ISD::FP_EXTEND, MVT::v4f32, MVT::v4bf16, 1}, // shll
+      {ISD::FP_EXTEND, MVT::v8f32, MVT::v8bf16, 2}, // shll+shll2
+      {ISD::FP_EXTEND, MVT::v2f64, MVT::v2bf16, 2}, // shll+fcvtl
+      {ISD::FP_EXTEND, MVT::v4f64, MVT::v4bf16, 3}, // shll+fcvtl+fcvtl2
+      {ISD::FP_EXTEND, MVT::v8f64, MVT::v8bf16, 6}, // 2 * shll+fcvtl+fcvtl2
       // FP Ext and trunc
       {ISD::FP_ROUND, MVT::f32, MVT::f64, 1},     // fcvt
       {ISD::FP_ROUND, MVT::v2f32, MVT::v2f64, 1}, // fcvtn
@@ -2859,6 +2883,15 @@ InstructionCost AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
       {ISD::FP_ROUND, MVT::v2f16, MVT::v2f64, 2}, // fcvtn+fcvtn
       {ISD::FP_ROUND, MVT::v4f16, MVT::v4f64, 3}, // fcvtn+fcvtn2+fcvtn
       {ISD::FP_ROUND, MVT::v8f16, MVT::v8f64, 6}, // 2 * fcvtn+fcvtn2+fcvtn
+      //   BF16 (more complex, with +bf16 is handled above)
+      {ISD::FP_ROUND, MVT::bf16, MVT::f32, 8}, // Expansion is ~8 insns
+      {ISD::FP_ROUND, MVT::bf16, MVT::f64, 9}, // fcvtn + above
+      {ISD::FP_ROUND, MVT::v2bf16, MVT::v2f32, 8},
+      {ISD::FP_ROUND, MVT::v4bf16, MVT::v4f32, 8},
+      {ISD::FP_ROUND, MVT::v8bf16, MVT::v8f32, 15},
+      {ISD::FP_ROUND, MVT::v2bf16, MVT::v2f64, 9},
+      {ISD::FP_ROUND, MVT::v4bf16, MVT::v4f64, 10},
+      {ISD::FP_ROUND, MVT::v8bf16, MVT::v8f64, 19},
 
       // LowerVectorINT_TO_FP:
       {ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i32, 1},
@@ -4705,10 +4738,21 @@ InstructionCost AArch64TTIImpl::getShuffleCost(
   }
 
   Kind = improveShuffleKindFromMask(Kind, Mask, Tp, Index, SubTp);
-  // Treat extractsubvector as single op permutation.
   bool IsExtractSubvector = Kind == TTI::SK_ExtractSubvector;
-  if (IsExtractSubvector && LT.second.isFixedLengthVector())
+  // A sebvector extract can be implemented with a ext (or trivial extract, if
+  // from lane 0). This currently only handles low or high extracts to prevent
+  // SLP vectorizer regressions.
+  if (IsExtractSubvector && LT.second.isFixedLengthVector()) {
+    if (LT.second.is128BitVector() &&
+        cast<FixedVectorType>(SubTp)->getNumElements() ==
+            LT.second.getVectorNumElements() / 2) {
+      if (Index == 0)
+        return 0;
+      if (Index == (int)LT.second.getVectorNumElements() / 2)
+        return 1;
+    }
     Kind = TTI::SK_PermuteSingleSrc;
+  }
 
   // Check for broadcast loads, which are supported by the LD1R instruction.
   // In terms of code-size, the shuffle vector is free when a load + dup get
@@ -4919,6 +4963,12 @@ static bool containsDecreasingPointers(Loop *TheLoop,
   return false;
 }
 
+bool AArch64TTIImpl::preferFixedOverScalableIfEqualCost() const {
+  if (SVEPreferFixedOverScalableIfEqualCost.getNumOccurrences())
+    return SVEPreferFixedOverScalableIfEqualCost;
+  return ST->useFixedOverScalableIfEqualCost();
+}
+
 unsigned AArch64TTIImpl::getEpilogueVectorizationMinVF() const {
   return ST->getEpilogueVectorizationMinVF();
 }
@@ -5283,11 +5333,17 @@ bool AArch64TTIImpl::isProfitableToSinkOperands(
     }
   }
 
-  // Sink vscales closer to uses for better isel
+  auto ShouldSinkCondition = [](Value *Cond) -> bool {
+    auto *II = dyn_cast<IntrinsicInst>(Cond);
+    return II && II->getIntrinsicID() == Intrinsic::vector_reduce_or &&
+           isa<ScalableVectorType>(II->getOperand(0)->getType());
+  };
+
   switch (I->getOpcode()) {
   case Instruction::GetElementPtr:
   case Instruction::Add:
   case Instruction::Sub:
+    // Sink vscales closer to uses for better isel
     for (unsigned Op = 0; Op < I->getNumOperands(); ++Op) {
       if (shouldSinkVScale(I->getOperand(Op), Ops)) {
         Ops.push_back(&I->getOperandUse(Op));
@@ -5295,6 +5351,23 @@ bool AArch64TTIImpl::isProfitableToSinkOperands(
       }
     }
     break;
+  case Instruction::Select: {
+    if (!ShouldSinkCondition(I->getOperand(0)))
+      return false;
+
+    Ops.push_back(&I->getOperandUse(0));
+    return true;
+  }
+  case Instruction::Br: {
+    if (cast<BranchInst>(I)->isUnconditional())
+      return false;
+
+    if (!ShouldSinkCondition(cast<BranchInst>(I)->getCondition()))
+      return false;
+
+    Ops.push_back(&I->getOperandUse(0));
+    return true;
+  }
   default:
     break;
   }
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
index 83b86e3..214fb4e 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -387,9 +387,7 @@ public:
     return TailFoldingStyle::DataWithoutLaneMask;
   }
 
-  bool preferFixedOverScalableIfEqualCost() const {
-    return ST->useFixedOverScalableIfEqualCost();
-  }
+  bool preferFixedOverScalableIfEqualCost() const;
 
   unsigned getEpilogueVectorizationMinVF() const;
 
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
index 4b7d415..93461e3 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
@@ -454,6 +454,7 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
           {nxv2s64, p0, nxv2s64, 8},
       })
       .clampScalar(0, s8, s64)
+      .minScalarOrElt(0, s8)
       .lowerIf([=](const LegalityQuery &Query) {
         return Query.Types[0].isScalar() &&
                Query.Types[0] != Query.MMODescrs[0].MemoryTy;
@@ -466,14 +467,19 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
       .clampMaxNumElements(0, p0, 2)
       .lowerIfMemSizeNotPow2()
       // TODO: Use BITCAST for v2i8, v2i16 after G_TRUNC gets sorted out
-      .bitcastIf(typeInSet(0, {v4s8}),
+      .bitcastIf(all(typeInSet(0, {v4s8}),
+                     LegalityPredicate([=](const LegalityQuery &Query) {
+                       return Query.Types[0].getSizeInBits() ==
+                              Query.MMODescrs[0].MemoryTy.getSizeInBits();
+                     })),
                  [=](const LegalityQuery &Query) {
                    const LLT VecTy = Query.Types[0];
                    return std::pair(0, LLT::scalar(VecTy.getSizeInBits()));
                  })
       .customIf(IsPtrVecPred)
       .scalarizeIf(typeInSet(0, {v2s16, v2s8}), 0)
-      .scalarizeIf(scalarOrEltWiderThan(0, 64), 0);
+      .scalarizeIf(scalarOrEltWiderThan(0, 64), 0)
+      .lower();
 
   getActionDefinitionsBuilder(G_INDEXED_STORE)
       // Idx 0 == Ptr, Idx 1 == Val
@@ -861,6 +867,13 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
       .legalForCartesianProduct({s32, v2s16, v4s8})
       .legalForCartesianProduct({s64, v8s8, v4s16, v2s32})
       .legalForCartesianProduct({s128, v16s8, v8s16, v4s32, v2s64, v2p0})
+      .customIf([=](const LegalityQuery &Query) {
+        // Handle casts from i1 vectors to scalars.
+        LLT DstTy = Query.Types[0];
+        LLT SrcTy = Query.Types[1];
+        return DstTy.isScalar() && SrcTy.isVector() &&
+               SrcTy.getScalarSizeInBits() == 1;
+      })
       .lowerIf([=](const LegalityQuery &Query) {
         return Query.Types[0].isVector() != Query.Types[1].isVector();
       })
@@ -1062,10 +1075,11 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
         return llvm::is_contained(
             {v2s64, v2s32, v4s32, v4s16, v16s8, v8s8, v8s16}, DstTy);
       })
-      // G_SHUFFLE_VECTOR can have scalar sources (from 1 x s vectors), we
-      // just want those lowered into G_BUILD_VECTOR
+      // G_SHUFFLE_VECTOR can have scalar sources (from 1 x s vectors) or scalar
+      // destinations, we just want those lowered into G_BUILD_VECTOR or
+      // G_EXTRACT_ELEMENT.
       .lowerIf([=](const LegalityQuery &Query) {
-        return !Query.Types[1].isVector();
+        return !Query.Types[0].isVector() || !Query.Types[1].isVector();
       })
       .moreElementsIf(
           [](const LegalityQuery &Query) {
@@ -1404,11 +1418,28 @@ bool AArch64LegalizerInfo::legalizeCustom(
     return Helper.lowerAbsToCNeg(MI);
   case TargetOpcode::G_ICMP:
     return legalizeICMP(MI, MRI, MIRBuilder);
+  case TargetOpcode::G_BITCAST:
+    return legalizeBitcast(MI, Helper);
   }
 
   llvm_unreachable("expected switch to return");
 }
 
+bool AArch64LegalizerInfo::legalizeBitcast(MachineInstr &MI,
+                                           LegalizerHelper &Helper) const {
+  assert(MI.getOpcode() == TargetOpcode::G_BITCAST && "Unexpected opcode");
+  auto [DstReg, DstTy, SrcReg, SrcTy] = MI.getFirst2RegLLTs();
+  // We're trying to handle casts from i1 vectors to scalars but reloading from
+  // stack.
+  if (!DstTy.isScalar() || !SrcTy.isVector() ||
+      SrcTy.getElementType() != LLT::scalar(1))
+    return false;
+
+  Helper.createStackStoreLoad(DstReg, SrcReg);
+  MI.eraseFromParent();
+  return true;
+}
+
 bool AArch64LegalizerInfo::legalizeFunnelShift(MachineInstr &MI,
                                                MachineRegisterInfo &MRI,
                                                MachineIRBuilder &MIRBuilder,
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.h b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.h
index 00d85a3..bcb29432 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.h
+++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.h
@@ -66,6 +66,7 @@ private:
                                 LegalizerHelper &Helper) const;
   bool legalizeDynStackAlloc(MachineInstr &MI, LegalizerHelper &Helper) const;
   bool legalizePrefetch(MachineInstr &MI, LegalizerHelper &Helper) const;
+  bool legalizeBitcast(MachineInstr &MI, LegalizerHelper &Helper) const;
   const AArch64Subtarget *ST;
 };
 } // End llvm namespace.
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerLowering.cpp b/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerLowering.cpp
index 5fe2e3c..6bba70d 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerLowering.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerLowering.cpp
@@ -405,6 +405,19 @@ void applyEXT(MachineInstr &MI, ShuffleVectorPseudo &MatchInfo) {
   MI.eraseFromParent();
 }
 
+void applyFullRev(MachineInstr &MI, MachineRegisterInfo &MRI) {
+  Register Dst = MI.getOperand(0).getReg();
+  Register Src = MI.getOperand(1).getReg();
+  LLT DstTy = MRI.getType(Dst);
+  assert(DstTy.getSizeInBits() == 128 &&
+         "Expected 128bit vector in applyFullRev");
+  MachineIRBuilder MIRBuilder(MI);
+  auto Cst = MIRBuilder.buildConstant(LLT::scalar(32), 8);
+  auto Rev = MIRBuilder.buildInstr(AArch64::G_REV64, {DstTy}, {Src});
+  MIRBuilder.buildInstr(AArch64::G_EXT, {Dst}, {Rev, Rev, Cst});
+  MI.eraseFromParent();
+}
+
 bool matchNonConstInsert(MachineInstr &MI, MachineRegisterInfo &MRI) {
   assert(MI.getOpcode() == TargetOpcode::G_INSERT_VECTOR_ELT);
 
diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.cpp
index ae84bc9..875b505 100644
--- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.cpp
+++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.cpp
@@ -1874,26 +1874,25 @@ void AArch64InstPrinter::printBarriernXSOption(const MCInst *MI, unsigned OpNo,
     markup(O, Markup::Immediate) << "#" << Val;
 }
 
-static bool isValidSysReg(const AArch64SysReg::SysReg *Reg, bool Read,
+static bool isValidSysReg(const AArch64SysReg::SysReg &Reg, bool Read,
                           const MCSubtargetInfo &STI) {
-  return (Reg && (Read ? Reg->Readable : Reg->Writeable) &&
-          Reg->haveFeatures(STI.getFeatureBits()));
+  return (Read ? Reg.Readable : Reg.Writeable) &&
+         Reg.haveFeatures(STI.getFeatureBits());
 }
 
-// Looks up a system register either by encoding or by name. Some system
+// Looks up a system register either by encoding. Some system
 // registers share the same encoding between different architectures,
-// therefore a tablegen lookup by encoding will return an entry regardless
-// of the register's predication on a specific subtarget feature. To work
-// around this problem we keep an alternative name for such registers and
-// look them up by that name if the first lookup was unsuccessful.
+// to work around this tablegen will return a range of registers with the same
+// encodings. We need to check each register in the range to see if it valid.
 static const AArch64SysReg::SysReg *lookupSysReg(unsigned Val, bool Read,
                                                  const MCSubtargetInfo &STI) {
-  const AArch64SysReg::SysReg *Reg = AArch64SysReg::lookupSysRegByEncoding(Val);
-
-  if (Reg && !isValidSysReg(Reg, Read, STI))
-    Reg = AArch64SysReg::lookupSysRegByName(Reg->AltName);
+  auto Range = AArch64SysReg::lookupSysRegByEncoding(Val);
+  for (auto &Reg : Range) {
+    if (isValidSysReg(Reg, Read, STI))
+      return &Reg;
+  }
 
-  return Reg;
+  return nullptr;
 }
 
 void AArch64InstPrinter::printMRSSystemRegister(const MCInst *MI, unsigned OpNo,
@@ -1917,7 +1916,7 @@ void AArch64InstPrinter::printMRSSystemRegister(const MCInst *MI, unsigned OpNo,
 
   const AArch64SysReg::SysReg *Reg = lookupSysReg(Val, true /*Read*/, STI);
 
-  if (isValidSysReg(Reg, true /*Read*/, STI))
+  if (Reg)
     O << Reg->Name;
   else
     O << AArch64SysReg::genericRegisterString(Val);
@@ -1944,7 +1943,7 @@ void AArch64InstPrinter::printMSRSystemRegister(const MCInst *MI, unsigned OpNo,
 
   const AArch64SysReg::SysReg *Reg = lookupSysReg(Val, false /*Read*/, STI);
 
-  if (isValidSysReg(Reg, false /*Read*/, STI))
+  if (Reg)
     O << Reg->Name;
   else
     O << AArch64SysReg::genericRegisterString(Val);
diff --git a/llvm/lib/Target/AArch64/SVEInstrFormats.td b/llvm/lib/Target/AArch64/SVEInstrFormats.td
index a831de8..0ef862f 100644
--- a/llvm/lib/Target/AArch64/SVEInstrFormats.td
+++ b/llvm/lib/Target/AArch64/SVEInstrFormats.td
@@ -484,6 +484,7 @@ let Predicates = [HasSVEorSME] in {
 //===----------------------------------------------------------------------===//
 def SVEDup0 : ComplexPattern<vAny, 0, "SelectDupZero", []>;
 def SVEDup0Undef : ComplexPattern<vAny, 0, "SelectDupZeroOrUndef", []>;
+def SVEAny : ComplexPattern<vAny, 0, "SelectAny", []>;
 
 class SVE_1_Op_Pat<ValueType vtd, SDPatternOperator op, ValueType vt1,
                    Instruction inst>
@@ -504,10 +505,15 @@ multiclass SVE_1_Op_PassthruUndef_Pat<ValueType vtd, SDPatternOperator op, Value
             (inst $Op3, $Op1, $Op2)>;
 }
 
-class SVE_1_Op_PassthruUndefZero_Pat<ValueType vtd, SDPatternOperator op, ValueType pg,
-                   ValueType vts, Instruction inst>
-  : Pat<(vtd (op pg:$Op1, vts:$Op2, (vtd (SVEDup0Undef)))),
-        (inst $Op1, $Op2)>;
+multiclass SVE_1_Op_PassthruUndefZero_Pat<ValueType vtd, SDPatternOperator op, ValueType pg,
+                   ValueType vts, Instruction inst> {
+  let AddedComplexity = 1 in {
+    def : Pat<(vtd (op pg:$Op1, vts:$Op2, (vtd (SVEDup0Undef)))),
+          (inst $Op1, $Op2)>;
+    def : Pat<(vtd (op (pg (SVEAllActive:$Op1)), vts:$Op2, (vtd (SVEAny)))),
+          (inst $Op1, $Op2)>;
+  }
+}
 
 // Used to match FP_ROUND_MERGE_PASSTHRU, which has an additional flag for the
 // type of rounding. This is matched by timm0_1 in pattern below and ignored.
@@ -576,10 +582,15 @@ multiclass SVE_3_Op_Undef_Pat<ValueType vtd, SDPatternOperator op, ValueType vt1
             (inst $Op1, $Op2, $Op3)>;
 }
 
-class SVE_3_Op_UndefZero_Pat<ValueType vtd, SDPatternOperator op, ValueType vt1,
-                             ValueType vt2, ValueType vt3, Instruction inst>
-  : Pat<(vtd (op (vt1 (SVEDup0Undef)), vt2:$Op1, vt3:$Op2)),
-        (inst $Op1, $Op2)>;
+multiclass SVE_3_Op_UndefZero_Pat<ValueType vtd, SDPatternOperator op, ValueType vt1,
+                             ValueType vt2, ValueType vt3, Instruction inst>  {
+  let AddedComplexity = 1 in {
+    def : Pat<(vtd (op (vt1 (SVEDup0Undef)), vt2:$Op1, vt3:$Op2)),
+              (inst $Op1, $Op2)>;
+    def : Pat<(vtd (op (vt1 (SVEAny)), (vt2 (SVEAllActive:$Op2)), vt3:$Op3)),
+              (inst $Op2, $Op3)>;
+  }
+}
 
 class SVE_4_Op_Pat<ValueType vtd, SDPatternOperator op, ValueType vt1,
                    ValueType vt2, ValueType vt3, ValueType vt4,
@@ -2840,8 +2851,8 @@ multiclass sve2_fp_convert_up_long_z<string asm, string op> {
   def _HtoS : sve2_fp_convert_precision<0b1001, 0b0, asm, ZPR32, ZPR16>;
   def _StoD : sve2_fp_convert_precision<0b1111, 0b0, asm, ZPR64, ZPR32>;
 
-  def : SVE_3_Op_UndefZero_Pat<nxv4f32, !cast<SDPatternOperator>(op # _f32f16), nxv4f32, nxv4i1, nxv8f16, !cast<Instruction>(NAME # _HtoS)>;
-  def : SVE_3_Op_UndefZero_Pat<nxv2f64, !cast<SDPatternOperator>(op # _f64f32), nxv2f64, nxv2i1, nxv4f32, !cast<Instruction>(NAME # _StoD)>;
+  defm : SVE_3_Op_UndefZero_Pat<nxv4f32, !cast<SDPatternOperator>(op # _f32f16), nxv4f32, nxv4i1, nxv8f16, !cast<Instruction>(NAME # _HtoS)>;
+  defm : SVE_3_Op_UndefZero_Pat<nxv2f64, !cast<SDPatternOperator>(op # _f64f32), nxv2f64, nxv2i1, nxv4f32, !cast<Instruction>(NAME # _StoD)>;
 }
 
 multiclass sve2_fp_convert_down_narrow_z<string asm> {
@@ -3256,7 +3267,7 @@ class sve_fp_z2op_p_zd<bits<7> opc,string asm, RegisterOperand i_zprtype,
 multiclass sve_fp_z2op_p_zd<string asm, SDPatternOperator op> {
   def _DtoS : sve_fp_z2op_p_zd<0b0001010, asm, ZPR64, ZPR32>;
 
-  def : SVE_3_Op_UndefZero_Pat<nxv4f32, op, nxv4f32, nxv2i1, nxv2f64, !cast<Instruction>(NAME # _DtoS)>;
+  defm : SVE_3_Op_UndefZero_Pat<nxv4f32, op, nxv4f32, nxv2i1, nxv2f64, !cast<Instruction>(NAME # _DtoS)>;
 }
 
 multiclass sve_fp_z2op_p_zd_hsd<bits<5> opc, string asm> {
@@ -3273,7 +3284,7 @@ multiclass sve_fp_z2op_p_zd_frint<bits<2> opc, string asm> {
 multiclass sve_fp_z2op_p_zd_bfcvt<string asm, SDPatternOperator op> {
   def NAME : sve_fp_z2op_p_zd<0b1001010, asm, ZPR32, ZPR16>;
 
-  def : SVE_3_Op_UndefZero_Pat<nxv8bf16, op, nxv8bf16, nxv4i1, nxv4f32, !cast<Instruction>(NAME)>;
+  defm : SVE_3_Op_UndefZero_Pat<nxv8bf16, op, nxv8bf16, nxv4i1, nxv4f32, !cast<Instruction>(NAME)>;
 }
 
 multiclass sve_fp_z2op_p_zd_d<bit U, string asm, string int_op, SDPatternOperator ir_op> {
@@ -3285,14 +3296,14 @@ multiclass sve_fp_z2op_p_zd_d<bit U, string asm, string int_op, SDPatternOperato
   def _DtoS : sve_fp_z2op_p_zd<{ 0b111100, U }, asm, ZPR64, ZPR32>;
   def _DtoD : sve_fp_z2op_p_zd<{ 0b111111, U }, asm, ZPR64, ZPR64>;
 
-  def : SVE_3_Op_UndefZero_Pat<nxv4i32, !cast<SDPatternOperator>(int_op # _i32f64), nxv4i32, nxv2i1, nxv2f64, !cast<Instruction>(NAME # _DtoS)>;
-  def : SVE_3_Op_UndefZero_Pat<nxv2i64, !cast<SDPatternOperator>(int_op # _i64f32), nxv2i64, nxv2i1, nxv4f32, !cast<Instruction>(NAME # _StoD)>;
-  def : SVE_3_Op_UndefZero_Pat<nxv4i32, !cast<SDPatternOperator>(int_op # _i32f16), nxv4i32, nxv4i1, nxv8f16, !cast<Instruction>(NAME # _HtoS)>;
-  def : SVE_3_Op_UndefZero_Pat<nxv2i64, !cast<SDPatternOperator>(int_op # _i64f16), nxv2i64, nxv2i1, nxv8f16, !cast<Instruction>(NAME # _HtoD)>;
+  defm : SVE_3_Op_UndefZero_Pat<nxv4i32, !cast<SDPatternOperator>(int_op # _i32f64), nxv4i32, nxv2i1, nxv2f64, !cast<Instruction>(NAME # _DtoS)>;
+  defm : SVE_3_Op_UndefZero_Pat<nxv2i64, !cast<SDPatternOperator>(int_op # _i64f32), nxv2i64, nxv2i1, nxv4f32, !cast<Instruction>(NAME # _StoD)>;
+  defm : SVE_3_Op_UndefZero_Pat<nxv4i32, !cast<SDPatternOperator>(int_op # _i32f16), nxv4i32, nxv4i1, nxv8f16, !cast<Instruction>(NAME # _HtoS)>;
+  defm : SVE_3_Op_UndefZero_Pat<nxv2i64, !cast<SDPatternOperator>(int_op # _i64f16), nxv2i64, nxv2i1, nxv8f16, !cast<Instruction>(NAME # _HtoD)>;
 
-  def : SVE_1_Op_PassthruUndefZero_Pat<nxv8i16, ir_op, nxv8i1, nxv8f16, !cast<Instruction>(NAME # _HtoH)>;
-  def : SVE_1_Op_PassthruUndefZero_Pat<nxv4i32, ir_op, nxv4i1, nxv4f32, !cast<Instruction>(NAME # _StoS)>;
-  def : SVE_1_Op_PassthruUndefZero_Pat<nxv2i64, ir_op, nxv2i1, nxv2f64, !cast<Instruction>(NAME # _DtoD)>;
+  defm : SVE_1_Op_PassthruUndefZero_Pat<nxv8i16, ir_op, nxv8i1, nxv8f16, !cast<Instruction>(NAME # _HtoH)>;
+  defm : SVE_1_Op_PassthruUndefZero_Pat<nxv4i32, ir_op, nxv4i1, nxv4f32, !cast<Instruction>(NAME # _StoS)>;
+  defm : SVE_1_Op_PassthruUndefZero_Pat<nxv2i64, ir_op, nxv2i1, nxv2f64, !cast<Instruction>(NAME # _DtoD)>;
 }
 
 multiclass sve_fp_z2op_p_zd_c<bit U, string asm> {
@@ -3319,12 +3330,12 @@ multiclass sve_fp_z2op_p_zd_b_0<string asm, string op> {
   def _DtoS : sve_fp_z2op_p_zd<0b1101010, asm, ZPR64, ZPR32>;
   def _StoD : sve_fp_z2op_p_zd<0b1101011, asm, ZPR32, ZPR64>;
 
-  def : SVE_3_Op_UndefZero_Pat<nxv8f16, !cast<SDPatternOperator>(op # _f16f32), nxv8f16, nxv4i1, nxv4f32, !cast<Instruction>(NAME # _StoH)>;
-  def : SVE_3_Op_UndefZero_Pat<nxv8f16, !cast<SDPatternOperator>(op # _f16f64), nxv8f16, nxv2i1, nxv2f64, !cast<Instruction>(NAME # _DtoH)>;
-  def : SVE_3_Op_UndefZero_Pat<nxv4f32, !cast<SDPatternOperator>(op # _f32f64), nxv4f32, nxv2i1, nxv2f64, !cast<Instruction>(NAME # _DtoS)>;
-  def : SVE_3_Op_UndefZero_Pat<nxv4f32, !cast<SDPatternOperator>(op # _f32f16), nxv4f32, nxv4i1, nxv8f16, !cast<Instruction>(NAME # _HtoS)>;
-  def : SVE_3_Op_UndefZero_Pat<nxv2f64, !cast<SDPatternOperator>(op # _f64f16), nxv2f64, nxv2i1, nxv8f16, !cast<Instruction>(NAME # _HtoD)>;
-  def : SVE_3_Op_UndefZero_Pat<nxv2f64, !cast<SDPatternOperator>(op # _f64f32), nxv2f64, nxv2i1, nxv4f32, !cast<Instruction>(NAME # _StoD)>;
+  defm : SVE_3_Op_UndefZero_Pat<nxv8f16, !cast<SDPatternOperator>(op # _f16f32), nxv8f16, nxv4i1, nxv4f32, !cast<Instruction>(NAME # _StoH)>;
+  defm : SVE_3_Op_UndefZero_Pat<nxv8f16, !cast<SDPatternOperator>(op # _f16f64), nxv8f16, nxv2i1, nxv2f64, !cast<Instruction>(NAME # _DtoH)>;
+  defm : SVE_3_Op_UndefZero_Pat<nxv4f32, !cast<SDPatternOperator>(op # _f32f64), nxv4f32, nxv2i1, nxv2f64, !cast<Instruction>(NAME # _DtoS)>;
+  defm : SVE_3_Op_UndefZero_Pat<nxv4f32, !cast<SDPatternOperator>(op # _f32f16), nxv4f32, nxv4i1, nxv8f16, !cast<Instruction>(NAME # _HtoS)>;
+  defm : SVE_3_Op_UndefZero_Pat<nxv2f64, !cast<SDPatternOperator>(op # _f64f16), nxv2f64, nxv2i1, nxv8f16, !cast<Instruction>(NAME # _HtoD)>;
+  defm : SVE_3_Op_UndefZero_Pat<nxv2f64, !cast<SDPatternOperator>(op # _f64f32), nxv2f64, nxv2i1, nxv4f32, !cast<Instruction>(NAME # _StoD)>;
 }
 
 //===----------------------------------------------------------------------===//
@@ -4842,6 +4853,16 @@ multiclass sve_int_un_pred_arit<bits<3> opc, string asm,
   def : SVE_1_Op_Passthru_Pat<nxv8i16, op, nxv8i1,  nxv8i16, !cast<Instruction>(NAME # _H)>;
   def : SVE_1_Op_Passthru_Pat<nxv4i32, op, nxv4i1,  nxv4i32, !cast<Instruction>(NAME # _S)>;
   def : SVE_1_Op_Passthru_Pat<nxv2i64, op, nxv2i1,  nxv2i64, !cast<Instruction>(NAME # _D)>;
+
+  def _B_UNDEF : PredOneOpPassthruPseudo<NAME # _B, ZPR8>;
+  def _H_UNDEF : PredOneOpPassthruPseudo<NAME # _H, ZPR16>;
+  def _S_UNDEF : PredOneOpPassthruPseudo<NAME # _S, ZPR32>;
+  def _D_UNDEF : PredOneOpPassthruPseudo<NAME # _D, ZPR64>;
+
+  defm : SVE_1_Op_PassthruUndef_Pat<nxv16i8, op, nxv16i1, nxv16i8, !cast<Pseudo>(NAME # _B_UNDEF)>;
+  defm : SVE_1_Op_PassthruUndef_Pat<nxv8i16, op, nxv8i1,  nxv8i16, !cast<Pseudo>(NAME # _H_UNDEF)>;
+  defm : SVE_1_Op_PassthruUndef_Pat<nxv4i32, op, nxv4i1,  nxv4i32, !cast<Pseudo>(NAME # _S_UNDEF)>;
+  defm : SVE_1_Op_PassthruUndef_Pat<nxv2i64, op, nxv2i1,  nxv2i64, !cast<Pseudo>(NAME # _D_UNDEF)>;
 }
 
 multiclass sve_int_un_pred_arit_z<bits<3> opc, string asm, SDPatternOperator op> {
@@ -4850,10 +4871,10 @@ multiclass sve_int_un_pred_arit_z<bits<3> opc, string asm, SDPatternOperator op>
   def _S : sve_int_un_pred_arit_z<0b10, { opc, 0b0 }, asm, ZPR32>;
   def _D : sve_int_un_pred_arit_z<0b11, { opc, 0b0 }, asm, ZPR64>;
 
-  def : SVE_1_Op_PassthruUndefZero_Pat<nxv16i8, op, nxv16i1, nxv16i8, !cast<Instruction>(NAME # _B)>;
-  def : SVE_1_Op_PassthruUndefZero_Pat<nxv8i16, op, nxv8i1,  nxv8i16, !cast<Instruction>(NAME # _H)>;
-  def : SVE_1_Op_PassthruUndefZero_Pat<nxv4i32, op, nxv4i1,  nxv4i32, !cast<Instruction>(NAME # _S)>;
-  def : SVE_1_Op_PassthruUndefZero_Pat<nxv2i64, op, nxv2i1,  nxv2i64, !cast<Instruction>(NAME # _D)>;
+  defm : SVE_1_Op_PassthruUndefZero_Pat<nxv16i8, op, nxv16i1, nxv16i8, !cast<Instruction>(NAME # _B)>;
+  defm : SVE_1_Op_PassthruUndefZero_Pat<nxv8i16, op, nxv8i1,  nxv8i16, !cast<Instruction>(NAME # _H)>;
+  defm : SVE_1_Op_PassthruUndefZero_Pat<nxv4i32, op, nxv4i1,  nxv4i32, !cast<Instruction>(NAME # _S)>;
+  defm : SVE_1_Op_PassthruUndefZero_Pat<nxv2i64, op, nxv2i1,  nxv2i64, !cast<Instruction>(NAME # _D)>;
 }
 
 multiclass sve_int_un_pred_arit_h<bits<3> opc, string asm,
@@ -4967,6 +4988,17 @@ multiclass sve_int_un_pred_arit_bitwise_fp<bits<3> opc, string asm,
   def : SVE_1_Op_Passthru_Pat<nxv4f32, op, nxv4i1, nxv4f32, !cast<Instruction>(NAME # _S)>;
   def : SVE_1_Op_Passthru_Pat<nxv2f32, op, nxv2i1, nxv2f32, !cast<Instruction>(NAME # _S)>;
   def : SVE_1_Op_Passthru_Pat<nxv2f64, op, nxv2i1, nxv2f64, !cast<Instruction>(NAME # _D)>;
+
+  def _H_UNDEF : PredOneOpPassthruPseudo<NAME # _H, ZPR16>;
+  def _S_UNDEF : PredOneOpPassthruPseudo<NAME # _S, ZPR32>;
+  def _D_UNDEF : PredOneOpPassthruPseudo<NAME # _D, ZPR64>;
+
+  defm : SVE_1_Op_PassthruUndef_Pat<nxv8f16, op, nxv8i1, nxv8f16, !cast<Pseudo>(NAME # _H_UNDEF)>;
+  defm : SVE_1_Op_PassthruUndef_Pat<nxv4f16, op, nxv4i1, nxv4f16, !cast<Pseudo>(NAME # _H_UNDEF)>;
+  defm : SVE_1_Op_PassthruUndef_Pat<nxv2f16, op, nxv2i1, nxv2f16, !cast<Pseudo>(NAME # _H_UNDEF)>;
+  defm : SVE_1_Op_PassthruUndef_Pat<nxv4f32, op, nxv4i1, nxv4f32, !cast<Pseudo>(NAME # _S_UNDEF)>;
+  defm : SVE_1_Op_PassthruUndef_Pat<nxv2f32, op, nxv2i1, nxv2f32, !cast<Pseudo>(NAME # _S_UNDEF)>;
+  defm : SVE_1_Op_PassthruUndef_Pat<nxv2f64, op, nxv2i1, nxv2f64, !cast<Pseudo>(NAME # _D_UNDEF)>;
 }
 
 multiclass sve_int_un_pred_arit_bitwise_fp_z<bits<3> opc, string asm, SDPatternOperator op> {
@@ -4974,12 +5006,12 @@ multiclass sve_int_un_pred_arit_bitwise_fp_z<bits<3> opc, string asm, SDPatternO
   def _S : sve_int_un_pred_arit_z<0b10, { opc, 0b1 }, asm, ZPR32>;
   def _D : sve_int_un_pred_arit_z<0b11, { opc, 0b1 }, asm, ZPR64>;
 
-  def : SVE_1_Op_PassthruUndefZero_Pat<nxv8f16, op, nxv8i1, nxv8f16, !cast<Instruction>(NAME # _H)>;
-  def : SVE_1_Op_PassthruUndefZero_Pat<nxv4f16, op, nxv4i1, nxv4f16, !cast<Instruction>(NAME # _H)>;
-  def : SVE_1_Op_PassthruUndefZero_Pat<nxv2f16, op, nxv2i1, nxv2f16, !cast<Instruction>(NAME # _H)>;
-  def : SVE_1_Op_PassthruUndefZero_Pat<nxv4f32, op, nxv4i1, nxv4f32, !cast<Instruction>(NAME # _S)>;
-  def : SVE_1_Op_PassthruUndefZero_Pat<nxv2f32, op, nxv2i1, nxv2f32, !cast<Instruction>(NAME # _S)>;
-  def : SVE_1_Op_PassthruUndefZero_Pat<nxv2f64, op, nxv2i1, nxv2f64, !cast<Instruction>(NAME # _D)>;
+  defm : SVE_1_Op_PassthruUndefZero_Pat<nxv8f16, op, nxv8i1, nxv8f16, !cast<Instruction>(NAME # _H)>;
+  defm : SVE_1_Op_PassthruUndefZero_Pat<nxv4f16, op, nxv4i1, nxv4f16, !cast<Instruction>(NAME # _H)>;
+  defm : SVE_1_Op_PassthruUndefZero_Pat<nxv2f16, op, nxv2i1, nxv2f16, !cast<Instruction>(NAME # _H)>;
+  defm : SVE_1_Op_PassthruUndefZero_Pat<nxv4f32, op, nxv4i1, nxv4f32, !cast<Instruction>(NAME # _S)>;
+  defm : SVE_1_Op_PassthruUndefZero_Pat<nxv2f32, op, nxv2i1, nxv2f32, !cast<Instruction>(NAME # _S)>;
+  defm : SVE_1_Op_PassthruUndefZero_Pat<nxv2f64, op, nxv2i1, nxv2f64, !cast<Instruction>(NAME # _D)>;
 }
 
 multiclass sve_fp_un_pred_arit_hsd<SDPatternOperator op> {
diff --git a/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.cpp b/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.cpp
index d83c22e..7767028 100644
--- a/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.cpp
+++ b/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.cpp
@@ -18,7 +18,7 @@ using namespace llvm;
 
 namespace llvm {
   namespace AArch64AT {
-#define GET_AT_IMPL
+#define GET_ATsList_IMPL
 #include "AArch64GenSystemOperands.inc"
   }
 }
@@ -26,128 +26,121 @@ namespace llvm {
 
 namespace llvm {
   namespace AArch64DBnXS {
-#define GET_DBNXS_IMPL
+#define GET_DBnXSsList_IMPL
 #include "AArch64GenSystemOperands.inc"
   }
 }
 
 namespace llvm {
   namespace AArch64DB {
-#define GET_DB_IMPL
+#define GET_DBsList_IMPL
 #include "AArch64GenSystemOperands.inc"
   }
 }
 
 namespace llvm {
   namespace AArch64DC {
-#define GET_DC_IMPL
+#define GET_DCsList_IMPL
 #include "AArch64GenSystemOperands.inc"
   }
 }
 
 namespace llvm {
   namespace AArch64IC {
-#define GET_IC_IMPL
+#define GET_ICsList_IMPL
 #include "AArch64GenSystemOperands.inc"
   }
 }
 
 namespace llvm {
   namespace AArch64ISB {
-#define GET_ISB_IMPL
+#define GET_ISBsList_IMPL
 #include "AArch64GenSystemOperands.inc"
   }
 }
 
 namespace llvm {
   namespace AArch64TSB {
-#define GET_TSB_IMPL
-#include "AArch64GenSystemOperands.inc"
-  }
-}
-
-namespace llvm {
-  namespace AArch64PRCTX {
-#define GET_PRCTX_IMPL
+#define GET_TSBsList_IMPL
 #include "AArch64GenSystemOperands.inc"
   }
 }
 
 namespace llvm {
   namespace AArch64PRFM {
-#define GET_PRFM_IMPL
+#define GET_PRFMsList_IMPL
 #include "AArch64GenSystemOperands.inc"
   }
 }
 
 namespace llvm {
   namespace AArch64SVEPRFM {
-#define GET_SVEPRFM_IMPL
+#define GET_SVEPRFMsList_IMPL
 #include "AArch64GenSystemOperands.inc"
   }
 }
 
 namespace llvm {
   namespace AArch64RPRFM {
-#define GET_RPRFM_IMPL
+#define GET_RPRFMsList_IMPL
 #include "AArch64GenSystemOperands.inc"
   } // namespace AArch64RPRFM
 } // namespace llvm
 
 namespace llvm {
   namespace AArch64SVEPredPattern {
-#define GET_SVEPREDPAT_IMPL
+#define GET_SVEPREDPATsList_IMPL
 #include "AArch64GenSystemOperands.inc"
   }
 }
 
 namespace llvm {
 namespace AArch64SVEVecLenSpecifier {
-#define GET_SVEVECLENSPECIFIER_IMPL
+#define GET_SVEVECLENSPECIFIERsList_IMPL
 #include "AArch64GenSystemOperands.inc"
 } // namespace AArch64SVEVecLenSpecifier
 } // namespace llvm
 
 namespace llvm {
   namespace AArch64ExactFPImm {
-#define GET_EXACTFPIMM_IMPL
+#define GET_ExactFPImmsList_IMPL
 #include "AArch64GenSystemOperands.inc"
   }
 }
 
 namespace llvm {
   namespace AArch64PState {
-#define GET_PSTATEIMM0_15_IMPL
+#define GET_PStateImm0_15sList_IMPL
 #include "AArch64GenSystemOperands.inc"
-#define GET_PSTATEIMM0_1_IMPL
+#define GET_PStateImm0_1sList_IMPL
 #include "AArch64GenSystemOperands.inc"
   }
 }
 
 namespace llvm {
   namespace AArch64PSBHint {
-#define GET_PSB_IMPL
+#define GET_PSBsList_IMPL
 #include "AArch64GenSystemOperands.inc"
   }
 }
 
 namespace llvm {
 namespace AArch64PHint {
-#define GET_PHINT_IMPL
+#define GET_PHintsList_IMPL
 #include "AArch64GenSystemOperands.inc"
 } // namespace AArch64PHint
 } // namespace llvm
 
 namespace llvm {
   namespace AArch64BTIHint {
-#define GET_BTI_IMPL
+#define GET_BTIsList_IMPL
 #include "AArch64GenSystemOperands.inc"
   }
 }
 
 namespace llvm {
   namespace AArch64SysReg {
-#define GET_SYSREG_IMPL
+#define GET_SysRegsList_IMPL
 #include "AArch64GenSystemOperands.inc"
   }
 }
@@ -194,7 +187,7 @@ namespace llvm {
 
 namespace llvm {
   namespace AArch64SVCR {
-#define GET_SVCR_IMPL
+#define GET_SVCRsList_IMPL
 #include "AArch64GenSystemOperands.inc"
   }
 }
diff --git a/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h b/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h
index e0ccba4..b8d3236 100644
--- a/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h
+++ b/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h
@@ -371,79 +371,89 @@ namespace AArch64SVCR {
   struct SVCR : SysAlias{
     using SysAlias::SysAlias;
   };
-  #define GET_SVCR_DECL
-  #include "AArch64GenSystemOperands.inc"
+#define GET_SVCRValues_DECL
+#define GET_SVCRsList_DECL
+#include "AArch64GenSystemOperands.inc"
 }
 
 namespace AArch64AT{
   struct AT : SysAlias {
     using SysAlias::SysAlias;
   };
-  #define GET_AT_DECL
-  #include "AArch64GenSystemOperands.inc"
+#define GET_ATValues_DECL
+#define GET_ATsList_DECL
+#include "AArch64GenSystemOperands.inc"
 }
 
 namespace AArch64DB {
   struct DB : SysAlias {
     using SysAlias::SysAlias;
   };
-  #define GET_DB_DECL
-  #include "AArch64GenSystemOperands.inc"
+#define GET_DBValues_DECL
+#define GET_DBsList_DECL
+#include "AArch64GenSystemOperands.inc"
 }
 
 namespace AArch64DBnXS {
   struct DBnXS : SysAliasImm {
     using SysAliasImm::SysAliasImm;
   };
-  #define GET_DBNXS_DECL
-  #include "AArch64GenSystemOperands.inc"
+#define GET_DBnXSValues_DECL
+#define GET_DBnXSsList_DECL
+#include "AArch64GenSystemOperands.inc"
 }
 
 namespace  AArch64DC {
   struct DC : SysAlias {
     using SysAlias::SysAlias;
   };
-  #define GET_DC_DECL
-  #include "AArch64GenSystemOperands.inc"
+#define GET_DCValues_DECL
+#define GET_DCsList_DECL
+#include "AArch64GenSystemOperands.inc"
 }
 
 namespace  AArch64IC {
   struct IC : SysAliasReg {
     using SysAliasReg::SysAliasReg;
   };
-  #define GET_IC_DECL
-  #include "AArch64GenSystemOperands.inc"
+#define GET_ICValues_DECL
+#define GET_ICsList_DECL
+#include "AArch64GenSystemOperands.inc"
 }
 
 namespace  AArch64ISB {
   struct ISB : SysAlias {
     using SysAlias::SysAlias;
   };
-  #define GET_ISB_DECL
-  #include "AArch64GenSystemOperands.inc"
+#define GET_ISBValues_DECL
+#define GET_ISBsList_DECL
+#include "AArch64GenSystemOperands.inc"
 }
 
 namespace  AArch64TSB {
   struct TSB : SysAlias {
     using SysAlias::SysAlias;
   };
-  #define GET_TSB_DECL
-  #include "AArch64GenSystemOperands.inc"
+#define GET_TSBValues_DECL
+#define GET_TSBsList_DECL
+#include "AArch64GenSystemOperands.inc"
 }
 
 namespace AArch64PRFM {
   struct PRFM : SysAlias {
     using SysAlias::SysAlias;
   };
-  #define GET_PRFM_DECL
-  #include "AArch64GenSystemOperands.inc"
+#define GET_PRFMValues_DECL
+#define GET_PRFMsList_DECL
+#include "AArch64GenSystemOperands.inc"
 }
 
 namespace AArch64SVEPRFM {
   struct SVEPRFM : SysAlias {
     using SysAlias::SysAlias;
   };
-#define GET_SVEPRFM_DECL
+#define GET_SVEPRFMValues_DECL
+#define GET_SVEPRFMsList_DECL
 #include "AArch64GenSystemOperands.inc"
 }
 
@@ -451,7 +461,8 @@ namespace AArch64RPRFM {
 struct RPRFM : SysAlias {
   using SysAlias::SysAlias;
 };
-#define GET_RPRFM_DECL
+#define GET_RPRFMValues_DECL
+#define GET_RPRFMsList_DECL
 #include "AArch64GenSystemOperands.inc"
 } // namespace AArch64RPRFM
 
@@ -460,7 +471,8 @@ namespace AArch64SVEPredPattern {
     const char *Name;
     uint16_t Encoding;
   };
-#define GET_SVEPREDPAT_DECL
+#define GET_SVEPREDPATValues_DECL
+#define GET_SVEPREDPATsList_DECL
 #include "AArch64GenSystemOperands.inc"
 }
 
@@ -469,7 +481,8 @@ namespace AArch64SVEVecLenSpecifier {
     const char *Name;
     uint16_t Encoding;
   };
-#define GET_SVEVECLENSPECIFIER_DECL
+#define GET_SVEVECLENSPECIFIERValues_DECL
+#define GET_SVEVECLENSPECIFIERsList_DECL
 #include "AArch64GenSystemOperands.inc"
 } // namespace AArch64SVEVecLenSpecifier
 
@@ -551,12 +564,12 @@ LLVM_DECLARE_ENUM_AS_BITMASK(TailFoldingOpts,
                              /* LargestValue */ (long)TailFoldingOpts::Reverse);
 
 namespace AArch64ExactFPImm {
-  struct ExactFPImm {
-    const char *Name;
-    int Enum;
-    const char *Repr;
-  };
-#define GET_EXACTFPIMM_DECL
+struct ExactFPImm {
+  int Enum;
+  const char *Repr;
+};
+#define GET_ExactFPImmValues_DECL
+#define GET_ExactFPImmsList_DECL
 #include "AArch64GenSystemOperands.inc"
 }
 
@@ -564,28 +577,30 @@ namespace AArch64PState {
   struct PStateImm0_15 : SysAlias{
     using SysAlias::SysAlias;
   };
-  #define GET_PSTATEIMM0_15_DECL
-  #include "AArch64GenSystemOperands.inc"
+#define GET_PStateImm0_15Values_DECL
+#define GET_PStateImm0_15sList_DECL
+#include "AArch64GenSystemOperands.inc"
 
   struct PStateImm0_1 : SysAlias{
     using SysAlias::SysAlias;
   };
-  #define GET_PSTATEIMM0_1_DECL
-  #include "AArch64GenSystemOperands.inc"
+#define GET_PStateImm0_1Values_DECL
+#define GET_PStateImm0_1sList_DECL
+#include "AArch64GenSystemOperands.inc"
 }
 
 namespace AArch64PSBHint {
   struct PSB : SysAlias {
     using SysAlias::SysAlias;
   };
-  #define GET_PSB_DECL
-  #include "AArch64GenSystemOperands.inc"
+#define GET_PSBValues_DECL
+#define GET_PSBsList_DECL
+#include "AArch64GenSystemOperands.inc"
 }
 
 namespace AArch64PHint {
 struct PHint {
   const char *Name;
-  const char *AltName;
   unsigned Encoding;
   FeatureBitset FeaturesRequired;
 
@@ -595,7 +610,8 @@ struct PHint {
   }
 };
 
-#define GET_PHINT_DECL
+#define GET_PHintValues_DECL
+#define GET_PHintsList_DECL
 #include "AArch64GenSystemOperands.inc"
 
 const PHint *lookupPHintByName(StringRef);
@@ -606,8 +622,9 @@ namespace AArch64BTIHint {
   struct BTI : SysAlias {
     using SysAlias::SysAlias;
   };
-  #define GET_BTI_DECL
-  #include "AArch64GenSystemOperands.inc"
+#define GET_BTIValues_DECL
+#define GET_BTIsList_DECL
+#include "AArch64GenSystemOperands.inc"
 }
 
 namespace AArch64SME {
@@ -701,7 +718,6 @@ AArch64StringToVectorLayout(StringRef LayoutStr) {
 namespace AArch64SysReg {
   struct SysReg {
     const char Name[32];
-    const char AltName[32];
     unsigned Encoding;
     bool Readable;
     bool Writeable;
@@ -713,11 +729,9 @@ namespace AArch64SysReg {
     }
   };
 
-  #define GET_SYSREG_DECL
-  #include "AArch64GenSystemOperands.inc"
-
-  const SysReg *lookupSysRegByName(StringRef);
-  const SysReg *lookupSysRegByEncoding(uint16_t);
+#define GET_SysRegsList_DECL
+#define GET_SysRegValues_DECL
+#include "AArch64GenSystemOperands.inc"
 
   uint32_t parseGenericRegister(StringRef Name);
   std::string genericRegisterString(uint32_t Bits);
@@ -731,14 +745,6 @@ namespace AArch64TLBI {
   #include "AArch64GenSystemOperands.inc"
 }
 
-namespace AArch64PRCTX {
-  struct PRCTX : SysAliasReg {
-    using SysAliasReg::SysAliasReg;
-  };
-  #define GET_PRCTX_DECL
-  #include "AArch64GenSystemOperands.inc"
-}
-
 namespace AArch64II {
 /// Target Operand Flag enum.
 enum TOF {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
index e844904..0f97988 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
@@ -1523,7 +1523,8 @@ Value *AMDGPUCodeGenPrepareImpl::shrinkDivRem64(IRBuilder<> &Builder,
   bool IsDiv = Opc == Instruction::SDiv || Opc == Instruction::UDiv;
   bool IsSigned = Opc == Instruction::SDiv || Opc == Instruction::SRem;
 
-  int NumDivBits = getDivNumBits(I, Num, Den, 32, IsSigned);
+  unsigned BitWidth = Num->getType()->getScalarSizeInBits();
+  int NumDivBits = getDivNumBits(I, Num, Den, BitWidth - 32, IsSigned);
   if (NumDivBits == -1)
     return nullptr;
 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
index 985fa8f..da47aaf 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
@@ -124,6 +124,16 @@ def sign_extension_in_reg : GICombineRule<
          [{ return matchCombineSignExtendInReg(*${sign_inreg}, ${matchinfo}); }]),
   (apply [{ applyCombineSignExtendInReg(*${sign_inreg}, ${matchinfo}); }])>;
 
+// Do the following combines :
+// fmul x, select(y, A, B)   -> fldexp (x, select i32 (y, a, b))
+// fmul x, select(y, -A, -B) -> fldexp ((fneg x), select i32 (y, a, b))
+def combine_fmul_with_select_to_fldexp : GICombineRule<
+  (defs root:$root, build_fn_matchinfo:$matchinfo),
+  (match  (G_FMUL $dst, $x, $select):$root,
+          (G_SELECT $select, $y, $A, $B):$sel,
+          [{ return Helper.matchCombineFmulWithSelectToFldexp(*${root}, *${sel}, ${matchinfo}); }]),
+  (apply  [{ Helper.applyBuildFn(*${root}, ${matchinfo}); }])>;
+
 
 let Predicates = [Has16BitInsts, NotHasMed3_16] in {
 // For gfx8, expand f16-fmed3-as-f32 into a min/max f16 sequence. This
@@ -153,13 +163,13 @@ def gfx8_combines : GICombineGroup<[expand_promoted_fmed3]>;
 
 def AMDGPUPreLegalizerCombiner: GICombiner<
   "AMDGPUPreLegalizerCombinerImpl",
-  [all_combines, clamp_i64_to_i16, foldable_fneg]> {
+  [all_combines, combine_fmul_with_select_to_fldexp, clamp_i64_to_i16, foldable_fneg]> {
   let CombineAllMethodName = "tryCombineAllImpl";
 }
 
 def AMDGPUPostLegalizerCombiner: GICombiner<
   "AMDGPUPostLegalizerCombinerImpl",
-  [all_combines, gfx6gfx7_combines, gfx8_combines,
+  [all_combines, gfx6gfx7_combines, gfx8_combines, combine_fmul_with_select_to_fldexp,
    uchar_to_float, cvt_f32_ubyteN, remove_fcanonicalize, foldable_fneg,
    rcp_sqrt_to_rsq, fdiv_by_sqrt_to_rsq_f16, sign_extension_in_reg, smulu64]> {
   let CombineAllMethodName = "tryCombineAllImpl";
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCombinerHelper.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCombinerHelper.cpp
index e5a376a..f6f9f4b 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCombinerHelper.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCombinerHelper.cpp
@@ -17,6 +17,13 @@
 using namespace llvm;
 using namespace MIPatternMatch;
 
+AMDGPUCombinerHelper::AMDGPUCombinerHelper(
+    GISelChangeObserver &Observer, MachineIRBuilder &B, bool IsPreLegalize,
+    GISelKnownBits *KB, MachineDominatorTree *MDT, const LegalizerInfo *LI,
+    const GCNSubtarget &STI)
+    : CombinerHelper(Observer, B, IsPreLegalize, KB, MDT, LI), STI(STI),
+      TII(*STI.getInstrInfo()) {}
+
 LLVM_READNONE
 static bool fnegFoldsIntoMI(const MachineInstr &MI) {
   switch (MI.getOpcode()) {
@@ -445,3 +452,67 @@ void AMDGPUCombinerHelper::applyExpandPromotedF16FMed3(MachineInstr &MI,
   Builder.buildFMinNumIEEE(MI.getOperand(0), B1, C1);
   MI.eraseFromParent();
 }
+
+bool AMDGPUCombinerHelper::matchCombineFmulWithSelectToFldexp(
+    MachineInstr &MI, MachineInstr &Sel,
+    std::function<void(MachineIRBuilder &)> &MatchInfo) {
+  assert(MI.getOpcode() == TargetOpcode::G_FMUL);
+  assert(Sel.getOpcode() == TargetOpcode::G_SELECT);
+  assert(MI.getOperand(2).getReg() == Sel.getOperand(0).getReg());
+
+  Register Dst = MI.getOperand(0).getReg();
+  LLT DestTy = MRI.getType(Dst);
+  LLT ScalarDestTy = DestTy.getScalarType();
+
+  if ((ScalarDestTy != LLT::float64() && ScalarDestTy != LLT::float32() &&
+       ScalarDestTy != LLT::float16()) ||
+      !MRI.hasOneNonDBGUse(Sel.getOperand(0).getReg()))
+    return false;
+
+  Register SelectCondReg = Sel.getOperand(1).getReg();
+  MachineInstr *SelectTrue = MRI.getVRegDef(Sel.getOperand(2).getReg());
+  MachineInstr *SelectFalse = MRI.getVRegDef(Sel.getOperand(3).getReg());
+
+  const auto SelectTrueVal =
+      isConstantOrConstantSplatVectorFP(*SelectTrue, MRI);
+  if (!SelectTrueVal)
+    return false;
+  const auto SelectFalseVal =
+      isConstantOrConstantSplatVectorFP(*SelectFalse, MRI);
+  if (!SelectFalseVal)
+    return false;
+
+  if (SelectTrueVal->isNegative() != SelectFalseVal->isNegative())
+    return false;
+
+  // For f32, only non-inline constants should be transformed.
+  if (ScalarDestTy == LLT::float32() && TII.isInlineConstant(*SelectTrueVal) &&
+      TII.isInlineConstant(*SelectFalseVal))
+    return false;
+
+  int SelectTrueLog2Val = SelectTrueVal->getExactLog2Abs();
+  if (SelectTrueLog2Val == INT_MIN)
+    return false;
+  int SelectFalseLog2Val = SelectFalseVal->getExactLog2Abs();
+  if (SelectFalseLog2Val == INT_MIN)
+    return false;
+
+  MatchInfo = [=, &MI](MachineIRBuilder &Builder) {
+    LLT IntDestTy = DestTy.changeElementType(LLT::scalar(32));
+    auto NewSel = Builder.buildSelect(
+        IntDestTy, SelectCondReg,
+        Builder.buildConstant(IntDestTy, SelectTrueLog2Val),
+        Builder.buildConstant(IntDestTy, SelectFalseLog2Val));
+
+    Register XReg = MI.getOperand(1).getReg();
+    if (SelectTrueVal->isNegative()) {
+      auto NegX =
+          Builder.buildFNeg(DestTy, XReg, MRI.getVRegDef(XReg)->getFlags());
+      Builder.buildFLdexp(Dst, NegX, NewSel, MI.getFlags());
+    } else {
+      Builder.buildFLdexp(Dst, XReg, NewSel, MI.getFlags());
+    }
+  };
+
+  return true;
+}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCombinerHelper.h b/llvm/lib/Target/AMDGPU/AMDGPUCombinerHelper.h
index 6510abe..893b3f5 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCombinerHelper.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCombinerHelper.h
@@ -15,13 +15,22 @@
 #ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUCOMBINERHELPER_H
 #define LLVM_LIB_TARGET_AMDGPU_AMDGPUCOMBINERHELPER_H
 
+#include "GCNSubtarget.h"
 #include "llvm/CodeGen/GlobalISel/Combiner.h"
 #include "llvm/CodeGen/GlobalISel/CombinerHelper.h"
 
 namespace llvm {
 class AMDGPUCombinerHelper : public CombinerHelper {
+protected:
+  const GCNSubtarget &STI;
+  const SIInstrInfo &TII;
+
 public:
   using CombinerHelper::CombinerHelper;
+  AMDGPUCombinerHelper(GISelChangeObserver &Observer, MachineIRBuilder &B,
+                       bool IsPreLegalize, GISelKnownBits *KB,
+                       MachineDominatorTree *MDT, const LegalizerInfo *LI,
+                       const GCNSubtarget &STI);
 
   bool matchFoldableFneg(MachineInstr &MI, MachineInstr *&MatchInfo);
   void applyFoldableFneg(MachineInstr &MI, MachineInstr *&MatchInfo);
@@ -30,6 +39,10 @@ public:
                                    Register Src1, Register Src2);
   void applyExpandPromotedF16FMed3(MachineInstr &MI, Register Src0,
                                    Register Src1, Register Src2);
+
+  bool matchCombineFmulWithSelectToFldexp(
+      MachineInstr &MI, MachineInstr &Sel,
+      std::function<void(MachineIRBuilder &)> &MatchInfo);
 };
 
 } // namespace llvm
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index d9eaf82..27e9018 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -1997,7 +1997,7 @@ bool AMDGPUDAGToDAGISel::SelectScratchSVAddr(SDNode *N, SDValue Addr,
   if (checkFlatScratchSVSSwizzleBug(VAddr, SAddr, ImmOffset))
     return false;
   SAddr = SelectSAddrFI(CurDAG, SAddr);
-  Offset = CurDAG->getTargetConstant(ImmOffset, SDLoc(), MVT::i32);
+  Offset = CurDAG->getSignedTargetConstant(ImmOffset, SDLoc(), MVT::i32);
   return true;
 }
 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index 3be865f..041b9b4 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -1125,8 +1125,9 @@ static int getV_CMPOpcode(CmpInst::Predicate P, unsigned Size,
                           unsigned FakeS16Opc, unsigned S32Opc,
                           unsigned S64Opc) {
     if (Size == 16)
+      // FIXME-TRUE16 use TrueS16Opc when realtrue16 is supported for CMP code
       return ST.hasTrue16BitInsts()
-                 ? ST.useRealTrue16Insts() ? TrueS16Opc : FakeS16Opc
+                 ? ST.useRealTrue16Insts() ? FakeS16Opc : FakeS16Opc
                  : S16Opc;
     if (Size == 32)
       return S32Opc;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp
index 54d927c..888817e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp
@@ -134,7 +134,7 @@ AMDGPUPostLegalizerCombinerImpl::AMDGPUPostLegalizerCombinerImpl(
     const GCNSubtarget &STI, MachineDominatorTree *MDT, const LegalizerInfo *LI)
     : Combiner(MF, CInfo, TPC, &KB, CSEInfo), RuleConfig(RuleConfig), STI(STI),
       TII(*STI.getInstrInfo()),
-      Helper(Observer, B, /*IsPreLegalize*/ false, &KB, MDT, LI),
+      Helper(Observer, B, /*IsPreLegalize*/ false, &KB, MDT, LI, STI),
 #define GET_GICOMBINER_CONSTRUCTOR_INITS
 #include "AMDGPUGenPostLegalizeGICombiner.inc"
 #undef GET_GICOMBINER_CONSTRUCTOR_INITS
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp
index ff8189c..e1564d5 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp
@@ -94,7 +94,7 @@ AMDGPUPreLegalizerCombinerImpl::AMDGPUPreLegalizerCombinerImpl(
     const AMDGPUPreLegalizerCombinerImplRuleConfig &RuleConfig,
     const GCNSubtarget &STI, MachineDominatorTree *MDT, const LegalizerInfo *LI)
     : Combiner(MF, CInfo, TPC, &KB, CSEInfo), RuleConfig(RuleConfig), STI(STI),
-      Helper(Observer, B, /*IsPreLegalize*/ true, &KB, MDT, LI),
+      Helper(Observer, B, /*IsPreLegalize*/ true, &KB, MDT, LI, STI),
 #define GET_GICOMBINER_CONSTRUCTOR_INITS
 #include "AMDGPUGenPreLegalizeGICombiner.inc"
 #undef GET_GICOMBINER_CONSTRUCTOR_INITS
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index d94c400..08e23cb 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -1190,9 +1190,13 @@ bool AMDGPURegisterBankInfo::applyMappingDynStackAlloc(
 
   const RegisterBank *SizeBank = getRegBank(AllocSize, MRI, *TRI);
 
-  // TODO: Need to emit a wave reduction to get the maximum size.
-  if (SizeBank != &AMDGPU::SGPRRegBank)
-    return false;
+  if (SizeBank != &AMDGPU::SGPRRegBank) {
+    auto WaveReduction =
+        B.buildIntrinsic(Intrinsic::amdgcn_wave_reduce_umax, {LLT::scalar(32)})
+            .addUse(AllocSize)
+            .addImm(0);
+    AllocSize = WaveReduction.getReg(0);
+  }
 
   LLT PtrTy = MRI.getType(Dst);
   LLT IntPtrTy = LLT::scalar(PtrTy.getSizeInBits());
diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
index ed956a1..d8f441d 100644
--- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
+++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
@@ -9760,10 +9760,14 @@ unsigned AMDGPUAsmParser::validateTargetOperandClass(MCParsedAsmOperand &Op,
   case MCK_SReg_64:
   case MCK_SReg_64_XEXEC:
     // Null is defined as a 32-bit register but
-    // it should also be enabled with 64-bit operands.
-    // The following code enables it for SReg_64 operands
+    // it should also be enabled with 64-bit operands or larger.
+    // The following code enables it for SReg_64 and larger operands
     // used as source and destination. Remaining source
     // operands are handled in isInlinableImm.
+  case MCK_SReg_96:
+  case MCK_SReg_128:
+  case MCK_SReg_256:
+  case MCK_SReg_512:
     return Operand.isNull() ? Match_Success : Match_InvalidOperand;
   default:
     return Match_InvalidOperand;
diff --git a/llvm/lib/Target/AMDGPU/BUFInstructions.td b/llvm/lib/Target/AMDGPU/BUFInstructions.td
index a351f45..f2686bd 100644
--- a/llvm/lib/Target/AMDGPU/BUFInstructions.td
+++ b/llvm/lib/Target/AMDGPU/BUFInstructions.td
@@ -168,7 +168,7 @@ class getMTBUFInsDA<list<RegisterClass> vdataList,
   dag SOffset = !if(hasRestrictedSOffset, (ins SReg_32:$soffset),
                                  (ins SCSrc_b32:$soffset));
 
-  dag NonVaddrInputs = !con((ins SReg_128:$srsrc), SOffset,
+  dag NonVaddrInputs = !con((ins SReg_128_XNULL:$srsrc), SOffset,
                             (ins Offset:$offset, FORMAT:$format, CPol_0:$cpol, i1imm_0:$swz));
 
   dag Inputs = !if(!empty(vaddrList),
@@ -418,7 +418,7 @@ class getMUBUFInsDA<list<RegisterClass> vdataList,
   RegisterOperand vdata_op = getLdStVDataRegisterOperand<vdataClass, isTFE>.ret;
 
   dag SOffset = !if(hasRestrictedSOffset, (ins SReg_32:$soffset), (ins SCSrc_b32:$soffset));
-  dag NonVaddrInputs = !con((ins SReg_128:$srsrc), SOffset, (ins Offset:$offset, CPol_0:$cpol, i1imm_0:$swz));
+  dag NonVaddrInputs = !con((ins SReg_128_XNULL:$srsrc), SOffset, (ins Offset:$offset, CPol_0:$cpol, i1imm_0:$swz));
 
   dag Inputs = !if(!empty(vaddrList), NonVaddrInputs, !con((ins vaddrClass:$vaddr), NonVaddrInputs));
   dag ret = !if(!empty(vdataList), Inputs, !con((ins vdata_op:$vdata), Inputs));
@@ -680,7 +680,7 @@ multiclass MUBUF_Pseudo_Stores<string opName, ValueType store_vt = i32> {
 class MUBUF_Pseudo_Store_Lds<string opName>
   : MUBUF_Pseudo<opName,
                  (outs),
-                 (ins SReg_128:$srsrc, SCSrc_b32:$soffset, Offset:$offset, CPol:$cpol, i1imm:$swz),
+                 (ins SReg_128_XNULL:$srsrc, SCSrc_b32:$soffset, Offset:$offset, CPol:$cpol, i1imm:$swz),
                  " $srsrc, $soffset$offset lds$cpol"> {
   let LGKM_CNT = 1;
   let mayLoad = 1;
@@ -703,7 +703,7 @@ class getMUBUFAtomicInsDA<RegisterClass vdataClass, bit vdata_in, bit hasRestric
   dag VData = !if(vdata_in, (ins vdata_op:$vdata_in), (ins vdata_op:$vdata));
   dag Data = !if(!empty(vaddrList), VData, !con(VData, (ins vaddrClass:$vaddr)));
   dag SOffset = !if(hasRestrictedSOffset, (ins SReg_32:$soffset), (ins SCSrc_b32:$soffset));
-  dag MainInputs = !con((ins SReg_128:$srsrc), SOffset, (ins Offset:$offset));
+  dag MainInputs = !con((ins SReg_128_XNULL:$srsrc), SOffset, (ins Offset:$offset));
   dag CPol = !if(vdata_in, (ins CPol_GLC_WithDefault:$cpol),
                            (ins CPol_NonGLC_WithDefault:$cpol));
 
diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
index 5908351..d236327 100644
--- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
+++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
@@ -279,7 +279,9 @@ DECODE_OPERAND_REG_7(SReg_64_XEXEC, OPW64)
 DECODE_OPERAND_REG_7(SReg_64_XEXEC_XNULL, OPW64)
 DECODE_OPERAND_REG_7(SReg_96, OPW96)
 DECODE_OPERAND_REG_7(SReg_128, OPW128)
+DECODE_OPERAND_REG_7(SReg_128_XNULL, OPW128)
 DECODE_OPERAND_REG_7(SReg_256, OPW256)
+DECODE_OPERAND_REG_7(SReg_256_XNULL, OPW256)
 DECODE_OPERAND_REG_7(SReg_512, OPW512)
 
 DECODE_OPERAND_REG_8(AGPR_32)
@@ -1692,6 +1694,11 @@ AMDGPUDisassembler::decodeNonVGPRSrcOp(const OpWidthTy Width, unsigned Val,
   case OPW64:
   case OPWV232:
     return decodeSpecialReg64(Val);
+  case OPW96:
+  case OPW128:
+  case OPW256:
+  case OPW512:
+    return decodeSpecialReg96Plus(Val);
   default:
     llvm_unreachable("unexpected immediate type");
   }
@@ -1778,6 +1785,24 @@ MCOperand AMDGPUDisassembler::decodeSpecialReg64(unsigned Val) const {
   return errOperand(Val, "unknown operand encoding " + Twine(Val));
 }
 
+MCOperand AMDGPUDisassembler::decodeSpecialReg96Plus(unsigned Val) const {
+  using namespace AMDGPU;
+
+  switch (Val) {
+  case 124:
+    if (isGFX11Plus())
+      return createRegOperand(SGPR_NULL);
+    break;
+  case 125:
+    if (!isGFX11Plus())
+      return createRegOperand(SGPR_NULL);
+    break;
+  default:
+    break;
+  }
+  return errOperand(Val, "unknown operand encoding " + Twine(Val));
+}
+
 MCOperand
 AMDGPUDisassembler::decodeSDWASrc(const OpWidthTy Width, const unsigned Val,
                                   unsigned ImmWidth,
diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
index b19e4b7..9a06cc3 100644
--- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
+++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
@@ -259,6 +259,7 @@ public:
   MCOperand decodeVOPDDstYOp(MCInst &Inst, unsigned Val) const;
   MCOperand decodeSpecialReg32(unsigned Val) const;
   MCOperand decodeSpecialReg64(unsigned Val) const;
+  MCOperand decodeSpecialReg96Plus(unsigned Val) const;
 
   MCOperand decodeSDWASrc(const OpWidthTy Width, unsigned Val,
                           unsigned ImmWidth,
diff --git a/llvm/lib/Target/AMDGPU/MIMGInstructions.td b/llvm/lib/Target/AMDGPU/MIMGInstructions.td
index 4722d33..1b94d6c 100644
--- a/llvm/lib/Target/AMDGPU/MIMGInstructions.td
+++ b/llvm/lib/Target/AMDGPU/MIMGInstructions.td
@@ -422,7 +422,7 @@ class MIMG_NoSampler_Helper <mimgopc op, string asm,
                              RegisterClass addr_rc,
                              string dns="">
   : MIMG_gfx6789 <op.GFX10M, (outs dst_rc:$vdata), dns> {
-  let InOperandList = !con((ins addr_rc:$vaddr, SReg_256:$srsrc,
+  let InOperandList = !con((ins addr_rc:$vaddr, SReg_256_XNULL:$srsrc,
                                 DMask:$dmask, UNorm:$unorm, CPol:$cpol,
                                 R128A16:$r128, TFE:$tfe, LWE:$lwe, DA:$da),
                            !if(BaseOpcode.HasD16, (ins D16:$d16), (ins)));
@@ -435,7 +435,7 @@ class MIMG_NoSampler_Helper_gfx90a <mimgopc op, string asm,
                                     RegisterClass addr_rc,
                                     string dns="">
   : MIMG_gfx90a <op.GFX10M, (outs getLdStRegisterOperand<dst_rc>.ret:$vdata), dns> {
-  let InOperandList = !con((ins addr_rc:$vaddr, SReg_256:$srsrc,
+  let InOperandList = !con((ins addr_rc:$vaddr, SReg_256_XNULL:$srsrc,
                                 DMask:$dmask, UNorm:$unorm, CPol:$cpol,
                                 R128A16:$r128, LWE:$lwe, DA:$da),
                            !if(BaseOpcode.HasD16, (ins D16:$d16), (ins)));
@@ -447,7 +447,7 @@ class MIMG_NoSampler_gfx10<mimgopc op, string opcode,
                            RegisterClass DataRC, RegisterClass AddrRC,
                            string dns="">
   : MIMG_gfx10<op.GFX10M, (outs DataRC:$vdata), dns> {
-  let InOperandList = !con((ins AddrRC:$vaddr0, SReg_256:$srsrc, DMask:$dmask,
+  let InOperandList = !con((ins AddrRC:$vaddr0, SReg_256_XNULL:$srsrc, DMask:$dmask,
                                 Dim:$dim, UNorm:$unorm, CPol:$cpol,
                                 R128A16:$r128, A16:$a16, TFE:$tfe, LWE:$lwe),
                            !if(BaseOpcode.HasD16, (ins D16:$d16), (ins)));
@@ -460,7 +460,7 @@ class MIMG_NoSampler_nsa_gfx10<mimgopc op, string opcode,
                                string dns="">
   : MIMG_nsa_gfx10<op.GFX10M, (outs DataRC:$vdata), num_addrs, dns> {
   let InOperandList = !con(AddrIns,
-                           (ins SReg_256:$srsrc, DMask:$dmask,
+                           (ins SReg_256_XNULL:$srsrc, DMask:$dmask,
                                 Dim:$dim, UNorm:$unorm, CPol:$cpol,
                                 R128A16:$r128, A16:$a16, TFE:$tfe, LWE:$lwe),
                            !if(BaseOpcode.HasD16, (ins D16:$d16), (ins)));
@@ -472,7 +472,7 @@ class MIMG_NoSampler_gfx11<mimgopc op, string opcode,
                            RegisterClass DataRC, RegisterClass AddrRC,
                            string dns="">
   : MIMG_gfx11<op.GFX11, (outs DataRC:$vdata), dns> {
-  let InOperandList = !con((ins AddrRC:$vaddr0, SReg_256:$srsrc, DMask:$dmask,
+  let InOperandList = !con((ins AddrRC:$vaddr0, SReg_256_XNULL:$srsrc, DMask:$dmask,
                                 Dim:$dim, UNorm:$unorm, CPol:$cpol,
                                 R128A16:$r128, A16:$a16, TFE:$tfe, LWE:$lwe),
                            !if(BaseOpcode.HasD16, (ins D16:$d16), (ins)));
@@ -485,7 +485,7 @@ class MIMG_NoSampler_nsa_gfx11<mimgopc op, string opcode,
                                string dns="">
   : MIMG_nsa_gfx11<op.GFX11, (outs DataRC:$vdata), num_addrs, dns> {
   let InOperandList = !con(AddrIns,
-                           (ins SReg_256:$srsrc, DMask:$dmask,
+                           (ins SReg_256_XNULL:$srsrc, DMask:$dmask,
                                 Dim:$dim, UNorm:$unorm, CPol:$cpol,
                                 R128A16:$r128, A16:$a16, TFE:$tfe, LWE:$lwe),
                            !if(BaseOpcode.HasD16, (ins D16:$d16), (ins)));
@@ -498,7 +498,7 @@ class VIMAGE_NoSampler_gfx12<mimgopc op, string opcode,
                              string dns="">
   : VIMAGE_gfx12<op.GFX12, (outs DataRC:$vdata), num_addrs, dns> {
   let InOperandList = !con(AddrIns,
-                           (ins SReg_256:$rsrc, DMask:$dmask, Dim:$dim,
+                           (ins SReg_256_XNULL:$rsrc, DMask:$dmask, Dim:$dim,
                                 CPol:$cpol, R128A16:$r128, A16:$a16, TFE:$tfe),
                            !if(BaseOpcode.HasD16, (ins D16:$d16), (ins)));
   let AsmString = opcode#" $vdata, "#AddrAsm#", $rsrc$dmask$dim$cpol$r128$a16$tfe"
@@ -510,8 +510,8 @@ class VSAMPLE_Sampler_gfx12<mimgopc op, string opcode, RegisterClass DataRC,
                             string dns="">
   : VSAMPLE_gfx12<op.GFX12, (outs DataRC:$vdata), num_addrs, dns, Addr3RC> {
   let InOperandList = !con(AddrIns,
-                           (ins SReg_256:$rsrc),
-                           !if(BaseOpcode.Sampler, (ins SReg_128:$samp), (ins)),
+                           (ins SReg_256_XNULL:$rsrc),
+                           !if(BaseOpcode.Sampler, (ins SReg_128_XNULL:$samp), (ins)),
                            (ins DMask:$dmask, Dim:$dim, UNorm:$unorm,
                                 CPol:$cpol, R128A16:$r128, A16:$a16, TFE:$tfe,
                                 LWE:$lwe),
@@ -527,8 +527,8 @@ class VSAMPLE_Sampler_nortn_gfx12<mimgopc op, string opcode,
                             string dns="">
   : VSAMPLE_gfx12<op.GFX12, (outs), num_addrs, dns, Addr3RC> {
   let InOperandList = !con(AddrIns,
-                           (ins SReg_256:$rsrc),
-                           !if(BaseOpcode.Sampler, (ins SReg_128:$samp), (ins)),
+                           (ins SReg_256_XNULL:$rsrc),
+                           !if(BaseOpcode.Sampler, (ins SReg_128_XNULL:$samp), (ins)),
                            (ins DMask:$dmask, Dim:$dim, UNorm:$unorm,
                                 CPol:$cpol, R128A16:$r128, A16:$a16, TFE:$tfe,
                                 LWE:$lwe),
@@ -679,7 +679,7 @@ class MIMG_Store_Helper <mimgopc op, string asm,
                          RegisterClass addr_rc,
                          string dns = "">
   : MIMG_gfx6789<op.GFX10M, (outs), dns> {
-  let InOperandList = !con((ins data_rc:$vdata, addr_rc:$vaddr, SReg_256:$srsrc,
+  let InOperandList = !con((ins data_rc:$vdata, addr_rc:$vaddr, SReg_256_XNULL:$srsrc,
                                 DMask:$dmask, UNorm:$unorm, CPol:$cpol,
                                 R128A16:$r128, TFE:$tfe, LWE:$lwe, DA:$da),
                            !if(BaseOpcode.HasD16, (ins D16:$d16), (ins)));
@@ -693,7 +693,7 @@ class MIMG_Store_Helper_gfx90a <mimgopc op, string asm,
                                 string dns = "">
   : MIMG_gfx90a<op.GFX10M, (outs), dns> {
   let InOperandList = !con((ins getLdStRegisterOperand<data_rc>.ret:$vdata,
-                                addr_rc:$vaddr, SReg_256:$srsrc,
+                                addr_rc:$vaddr, SReg_256_XNULL:$srsrc,
                                 DMask:$dmask, UNorm:$unorm, CPol:$cpol,
                                 R128A16:$r128, LWE:$lwe, DA:$da),
                            !if(BaseOpcode.HasD16, (ins D16:$d16), (ins)));
@@ -705,7 +705,7 @@ class MIMG_Store_gfx10<mimgopc op, string opcode,
                        RegisterClass DataRC, RegisterClass AddrRC,
                        string dns="">
   : MIMG_gfx10<op.GFX10M, (outs), dns> {
-  let InOperandList = !con((ins DataRC:$vdata, AddrRC:$vaddr0, SReg_256:$srsrc,
+  let InOperandList = !con((ins DataRC:$vdata, AddrRC:$vaddr0, SReg_256_XNULL:$srsrc,
                                 DMask:$dmask, Dim:$dim, UNorm:$unorm, CPol:$cpol,
                                 R128A16:$r128, A16:$a16, TFE:$tfe, LWE:$lwe),
                            !if(BaseOpcode.HasD16, (ins D16:$d16), (ins)));
@@ -719,7 +719,7 @@ class MIMG_Store_nsa_gfx10<mimgopc op, string opcode,
   : MIMG_nsa_gfx10<op.GFX10M, (outs), num_addrs, dns> {
   let InOperandList = !con((ins DataRC:$vdata),
                            AddrIns,
-                           (ins SReg_256:$srsrc, DMask:$dmask,
+                           (ins SReg_256_XNULL:$srsrc, DMask:$dmask,
                                 Dim:$dim, UNorm:$unorm, CPol:$cpol,
                                 R128A16:$r128, A16:$a16, TFE:$tfe, LWE:$lwe),
                            !if(BaseOpcode.HasD16, (ins D16:$d16), (ins)));
@@ -731,7 +731,7 @@ class MIMG_Store_gfx11<mimgopc op, string opcode,
                        RegisterClass DataRC, RegisterClass AddrRC,
                        string dns="">
   : MIMG_gfx11<op.GFX11, (outs), dns> {
-  let InOperandList = !con((ins DataRC:$vdata, AddrRC:$vaddr0, SReg_256:$srsrc,
+  let InOperandList = !con((ins DataRC:$vdata, AddrRC:$vaddr0, SReg_256_XNULL:$srsrc,
                                 DMask:$dmask, Dim:$dim, UNorm:$unorm, CPol:$cpol,
                                 R128A16:$r128, A16:$a16, TFE:$tfe, LWE:$lwe),
                            !if(BaseOpcode.HasD16, (ins D16:$d16), (ins)));
@@ -745,7 +745,7 @@ class MIMG_Store_nsa_gfx11<mimgopc op, string opcode,
   : MIMG_nsa_gfx11<op.GFX11, (outs), num_addrs, dns> {
   let InOperandList = !con((ins DataRC:$vdata),
                            AddrIns,
-                           (ins SReg_256:$srsrc, DMask:$dmask,
+                           (ins SReg_256_XNULL:$srsrc, DMask:$dmask,
                                 Dim:$dim, UNorm:$unorm, CPol:$cpol,
                                 R128A16:$r128, A16:$a16, TFE:$tfe, LWE:$lwe),
                            !if(BaseOpcode.HasD16, (ins D16:$d16), (ins)));
@@ -759,7 +759,7 @@ class VIMAGE_Store_gfx12<mimgopc op, string opcode,
   : VIMAGE_gfx12<op.GFX12, (outs), num_addrs, dns> {
   let InOperandList = !con((ins DataRC:$vdata),
                            AddrIns,
-                           (ins SReg_256:$rsrc, DMask:$dmask, Dim:$dim,
+                           (ins SReg_256_XNULL:$rsrc, DMask:$dmask, Dim:$dim,
                                 CPol:$cpol, R128A16:$r128, A16:$a16, TFE:$tfe),
                            !if(BaseOpcode.HasD16, (ins D16:$d16), (ins)));
   let AsmString = opcode#" $vdata, "#AddrAsm#", $rsrc$dmask$dim$cpol$r128$a16$tfe"
@@ -875,7 +875,7 @@ class MIMG_Atomic_gfx6789_base <bits<8> op, string asm, RegisterClass data_rc,
   : MIMG_gfx6789 <op, (outs data_rc:$vdst), dns> {
   let Constraints = "$vdst = $vdata";
 
-  let InOperandList = (ins data_rc:$vdata, addr_rc:$vaddr, SReg_256:$srsrc,
+  let InOperandList = (ins data_rc:$vdata, addr_rc:$vaddr, SReg_256_XNULL:$srsrc,
                            DMask:$dmask, UNorm:$unorm, CPol:$cpol,
                            R128A16:$r128, TFE:$tfe, LWE:$lwe, DA:$da);
   let AsmString = asm#" $vdst, $vaddr, $srsrc$dmask$unorm$cpol$r128$tfe$lwe$da";
@@ -887,7 +887,7 @@ class MIMG_Atomic_gfx90a_base <bits<8> op, string asm, RegisterClass data_rc,
   let Constraints = "$vdst = $vdata";
 
   let InOperandList = (ins getLdStRegisterOperand<data_rc>.ret:$vdata,
-                           addr_rc:$vaddr, SReg_256:$srsrc,
+                           addr_rc:$vaddr, SReg_256_XNULL:$srsrc,
                            DMask:$dmask, UNorm:$unorm, CPol:$cpol,
                            R128A16:$r128, LWE:$lwe, DA:$da);
   let AsmString = asm#" $vdst, $vaddr, $srsrc$dmask$unorm$cpol$r128$lwe$da";
@@ -921,7 +921,7 @@ class MIMG_Atomic_gfx10<mimgopc op, string opcode,
                !if(enableDisasm, "GFX10", "")> {
   let Constraints = "$vdst = $vdata";
 
-  let InOperandList = (ins DataRC:$vdata, AddrRC:$vaddr0, SReg_256:$srsrc,
+  let InOperandList = (ins DataRC:$vdata, AddrRC:$vaddr0, SReg_256_XNULL:$srsrc,
                            DMask:$dmask, Dim:$dim, UNorm:$unorm, CPol:$cpol,
                            R128A16:$r128, A16:$a16, TFE:$tfe, LWE:$lwe);
   let AsmString = opcode#" $vdst, $vaddr0, $srsrc$dmask$dim$unorm$cpol$r128$a16$tfe$lwe";
@@ -936,7 +936,7 @@ class MIMG_Atomic_nsa_gfx10<mimgopc op, string opcode,
 
   let InOperandList = !con((ins DataRC:$vdata),
                            AddrIns,
-                           (ins SReg_256:$srsrc, DMask:$dmask,
+                           (ins SReg_256_XNULL:$srsrc, DMask:$dmask,
                                 Dim:$dim, UNorm:$unorm, CPol:$cpol,
                                 R128A16:$r128, A16:$a16, TFE:$tfe, LWE:$lwe));
   let AsmString = opcode#" $vdata, "#AddrAsm#", $srsrc$dmask$dim$unorm$cpol$r128$a16$tfe$lwe";
@@ -949,7 +949,7 @@ class MIMG_Atomic_gfx11<mimgopc op, string opcode,
                !if(enableDisasm, "GFX11", "")> {
   let Constraints = "$vdst = $vdata";
 
-  let InOperandList = (ins DataRC:$vdata, AddrRC:$vaddr0, SReg_256:$srsrc,
+  let InOperandList = (ins DataRC:$vdata, AddrRC:$vaddr0, SReg_256_XNULL:$srsrc,
                            DMask:$dmask, Dim:$dim, UNorm:$unorm, CPol:$cpol,
                            R128A16:$r128, A16:$a16, TFE:$tfe, LWE:$lwe);
   let AsmString = opcode#" $vdst, $vaddr0, $srsrc$dmask$dim$unorm$cpol$r128$a16$tfe$lwe";
@@ -964,7 +964,7 @@ class MIMG_Atomic_nsa_gfx11<mimgopc op, string opcode,
 
   let InOperandList = !con((ins DataRC:$vdata),
                            AddrIns,
-                           (ins SReg_256:$srsrc, DMask:$dmask,
+                           (ins SReg_256_XNULL:$srsrc, DMask:$dmask,
                                 Dim:$dim, UNorm:$unorm, CPol:$cpol,
                                 R128A16:$r128, A16:$a16, TFE:$tfe, LWE:$lwe));
   let AsmString = opcode#" $vdata, "#AddrAsm#", $srsrc$dmask$dim$unorm$cpol$r128$a16$tfe$lwe";
@@ -978,7 +978,7 @@ class VIMAGE_Atomic_gfx12<mimgopc op, string opcode, RegisterClass DataRC,
 
   let InOperandList = !con((ins DataRC:$vdata),
                            AddrIns,
-                           (ins SReg_256:$rsrc, DMask:$dmask, Dim:$dim,
+                           (ins SReg_256_XNULL:$rsrc, DMask:$dmask, Dim:$dim,
                                  CPol:$cpol, R128A16:$r128, A16:$a16, TFE:$tfe));
   let AsmString = !if(!empty(renamed), opcode, renamed)#" $vdata, "#AddrAsm#
                   ", $rsrc$dmask$dim$cpol$r128$a16$tfe";
@@ -1128,7 +1128,7 @@ multiclass MIMG_Atomic_Renamed <mimgopc op, string asm, string renamed,
 class MIMG_Sampler_Helper <mimgopc op, string asm, RegisterClass dst_rc,
                            RegisterClass src_rc, string dns="">
   : MIMG_gfx6789 <op.VI, (outs dst_rc:$vdata), dns> {
-  let InOperandList = !con((ins src_rc:$vaddr, SReg_256:$srsrc, SReg_128:$ssamp,
+  let InOperandList = !con((ins src_rc:$vaddr, SReg_256_XNULL:$srsrc, SReg_128_XNULL:$ssamp,
                                 DMask:$dmask, UNorm:$unorm, CPol:$cpol,
                                 R128A16:$r128, TFE:$tfe, LWE:$lwe, DA:$da),
                            !if(BaseOpcode.HasD16, (ins D16:$d16), (ins)));
@@ -1139,7 +1139,7 @@ class MIMG_Sampler_Helper <mimgopc op, string asm, RegisterClass dst_rc,
 class MIMG_Sampler_gfx90a<mimgopc op, string asm, RegisterClass dst_rc,
                           RegisterClass src_rc, string dns="">
   : MIMG_gfx90a<op.GFX10M, (outs getLdStRegisterOperand<dst_rc>.ret:$vdata), dns> {
-  let InOperandList = !con((ins src_rc:$vaddr, SReg_256:$srsrc, SReg_128:$ssamp,
+  let InOperandList = !con((ins src_rc:$vaddr, SReg_256_XNULL:$srsrc, SReg_128_XNULL:$ssamp,
                                 DMask:$dmask, UNorm:$unorm, CPol:$cpol,
                                 R128A16:$r128, LWE:$lwe, DA:$da),
                            !if(BaseOpcode.HasD16, (ins D16:$d16), (ins)));
@@ -1149,7 +1149,7 @@ class MIMG_Sampler_gfx90a<mimgopc op, string asm, RegisterClass dst_rc,
 
 class MIMG_Sampler_OpList_gfx10p<dag OpPrefix, bit HasD16> {
   dag ret = !con(OpPrefix,
-                 (ins SReg_256:$srsrc, SReg_128:$ssamp,
+                 (ins SReg_256_XNULL:$srsrc, SReg_128_XNULL:$ssamp,
                   DMask:$dmask, Dim:$dim, UNorm:$unorm, CPol:$cpol,
                   R128A16:$r128, A16:$a16, TFE:$tfe, LWE:$lwe),
                  !if(HasD16, (ins D16:$d16), (ins)));
@@ -1524,7 +1524,7 @@ class MIMG_IntersectRay_Helper<bit Is64, bit IsA16> {
 
 class MIMG_IntersectRay_gfx10<mimgopc op, string opcode, RegisterClass AddrRC>
     : MIMG_gfx10<op.GFX10M, (outs VReg_128:$vdata), "GFX10"> {
-  let InOperandList = (ins AddrRC:$vaddr0, SReg_128:$srsrc, A16:$a16);
+  let InOperandList = (ins AddrRC:$vaddr0, SReg_128_XNULL:$srsrc, A16:$a16);
   let AsmString = opcode#" $vdata, $vaddr0, $srsrc$a16";
 
   let nsa = 0;
@@ -1532,13 +1532,13 @@ class MIMG_IntersectRay_gfx10<mimgopc op, string opcode, RegisterClass AddrRC>
 
 class MIMG_IntersectRay_nsa_gfx10<mimgopc op, string opcode, int num_addrs>
     : MIMG_nsa_gfx10<op.GFX10M, (outs VReg_128:$vdata), num_addrs, "GFX10"> {
-  let InOperandList = !con(nsah.AddrIns, (ins SReg_128:$srsrc, A16:$a16));
+  let InOperandList = !con(nsah.AddrIns, (ins SReg_128_XNULL:$srsrc, A16:$a16));
   let AsmString = opcode#" $vdata, "#nsah.AddrAsm#", $srsrc$a16";
 }
 
 class MIMG_IntersectRay_gfx11<mimgopc op, string opcode, RegisterClass AddrRC>
     : MIMG_gfx11<op.GFX11, (outs VReg_128:$vdata), "GFX11"> {
-  let InOperandList = (ins AddrRC:$vaddr0, SReg_128:$srsrc, A16:$a16);
+  let InOperandList = (ins AddrRC:$vaddr0, SReg_128_XNULL:$srsrc, A16:$a16);
   let AsmString = opcode#" $vdata, $vaddr0, $srsrc$a16";
 
   let nsa = 0;
@@ -1548,7 +1548,7 @@ class MIMG_IntersectRay_nsa_gfx11<mimgopc op, string opcode, int num_addrs,
                                   list<RegisterClass> addr_types>
     : MIMG_nsa_gfx11<op.GFX11, (outs VReg_128:$vdata), num_addrs, "GFX11",
                      addr_types> {
-  let InOperandList = !con(nsah.AddrIns, (ins SReg_128:$srsrc, A16:$a16));
+  let InOperandList = !con(nsah.AddrIns, (ins SReg_128_XNULL:$srsrc, A16:$a16));
   let AsmString = opcode#" $vdata, "#nsah.AddrAsm#", $srsrc$a16";
 }
 
@@ -1556,7 +1556,7 @@ class VIMAGE_IntersectRay_gfx12<mimgopc op, string opcode, int num_addrs,
                                 list<RegisterClass> addr_types>
     : VIMAGE_gfx12<op.GFX12, (outs VReg_128:$vdata),
                    num_addrs, "GFX12", addr_types> {
-  let InOperandList = !con(nsah.AddrIns, (ins SReg_128:$rsrc, A16:$a16));
+  let InOperandList = !con(nsah.AddrIns, (ins SReg_128_XNULL:$rsrc, A16:$a16));
   let AsmString = opcode#" $vdata, "#nsah.AddrAsm#", $rsrc$a16";
 }
 
diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
index 4fb5cb0..2bc1913 100644
--- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -199,7 +199,7 @@ static unsigned macToMad(unsigned Opc) {
   case AMDGPU::V_FMAC_F16_e64:
     return AMDGPU::V_FMA_F16_gfx9_e64;
   case AMDGPU::V_FMAC_F16_fake16_e64:
-    return AMDGPU::V_FMA_F16_gfx9_e64;
+    return AMDGPU::V_FMA_F16_gfx9_fake16_e64;
   case AMDGPU::V_FMAC_LEGACY_F32_e64:
     return AMDGPU::V_FMA_LEGACY_F32_e64;
   case AMDGPU::V_FMAC_F64_e64:
@@ -1096,21 +1096,8 @@ void SIFoldOperandsImpl::foldOperand(
           B.addImm(Defs[I].second);
         }
         LLVM_DEBUG(dbgs() << "Folded " << *UseMI);
-        return;
       }
 
-      if (Size != 4)
-        return;
-
-      Register Reg0 = UseMI->getOperand(0).getReg();
-      Register Reg1 = UseMI->getOperand(1).getReg();
-      if (TRI->isAGPR(*MRI, Reg0) && TRI->isVGPR(*MRI, Reg1))
-        UseMI->setDesc(TII->get(AMDGPU::V_ACCVGPR_WRITE_B32_e64));
-      else if (TRI->isVGPR(*MRI, Reg0) && TRI->isAGPR(*MRI, Reg1))
-        UseMI->setDesc(TII->get(AMDGPU::V_ACCVGPR_READ_B32_e64));
-      else if (ST->hasGFX90AInsts() && TRI->isAGPR(*MRI, Reg0) &&
-               TRI->isAGPR(*MRI, Reg1))
-        UseMI->setDesc(TII->get(AMDGPU::V_ACCVGPR_MOV_B32));
       return;
     }
 
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 58b061f..0ac84f4 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -4017,29 +4017,26 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
 }
 
 // This is similar to the default implementation in ExpandDYNAMIC_STACKALLOC,
-// except for stack growth direction(default: downwards, AMDGPU: upwards) and
-// applying the wave size scale to the increment amount.
-SDValue SITargetLowering::lowerDYNAMIC_STACKALLOCImpl(SDValue Op,
-                                                      SelectionDAG &DAG) const {
+// except for:
+// 1. Stack growth direction(default: downwards, AMDGPU: upwards), and
+// 2. Scale size where, scale = wave-reduction(alloca-size) * wave-size
+SDValue SITargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
+                                                  SelectionDAG &DAG) const {
   const MachineFunction &MF = DAG.getMachineFunction();
   const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
 
   SDLoc dl(Op);
   EVT VT = Op.getValueType();
-  SDValue Tmp1 = Op;
-  SDValue Tmp2 = Op.getValue(1);
-  SDValue Tmp3 = Op.getOperand(2);
-  SDValue Chain = Tmp1.getOperand(0);
-
+  SDValue Chain = Op.getOperand(0);
   Register SPReg = Info->getStackPtrOffsetReg();
 
   // Chain the dynamic stack allocation so that it doesn't modify the stack
   // pointer when other instructions are using the stack.
   Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
 
-  SDValue Size = Tmp2.getOperand(1);
+  SDValue Size = Op.getOperand(1);
   SDValue BaseAddr = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
-  Align Alignment = cast<ConstantSDNode>(Tmp3)->getAlignValue();
+  Align Alignment = cast<ConstantSDNode>(Op.getOperand(2))->getAlignValue();
 
   const TargetFrameLowering *TFL = Subtarget->getFrameLowering();
   assert(TFL->getStackGrowthDirection() == TargetFrameLowering::StackGrowsUp &&
@@ -4057,30 +4054,36 @@ SDValue SITargetLowering::lowerDYNAMIC_STACKALLOCImpl(SDValue Op,
                            DAG.getSignedConstant(-ScaledAlignment, dl, VT));
   }
 
-  SDValue ScaledSize = DAG.getNode(
-      ISD::SHL, dl, VT, Size,
-      DAG.getConstant(Subtarget->getWavefrontSizeLog2(), dl, MVT::i32));
-
-  SDValue NewSP = DAG.getNode(ISD::ADD, dl, VT, BaseAddr, ScaledSize); // Value
+  assert(Size.getValueType() == MVT::i32 && "Size must be 32-bit");
+  SDValue NewSP;
+  if (isa<ConstantSDNode>(Size)) {
+    // For constant sized alloca, scale alloca size by wave-size
+    SDValue ScaledSize = DAG.getNode(
+        ISD::SHL, dl, VT, Size,
+        DAG.getConstant(Subtarget->getWavefrontSizeLog2(), dl, MVT::i32));
+    NewSP = DAG.getNode(ISD::ADD, dl, VT, BaseAddr, ScaledSize); // Value
+  } else {
+    // For dynamic sized alloca, perform wave-wide reduction to get max of
+    // alloca size(divergent) and then scale it by wave-size
+    SDValue WaveReduction =
+        DAG.getTargetConstant(Intrinsic::amdgcn_wave_reduce_umax, dl, MVT::i32);
+    Size = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::i32, WaveReduction,
+                       Size, DAG.getConstant(0, dl, MVT::i32));
+    SDValue ScaledSize = DAG.getNode(
+        ISD::SHL, dl, VT, Size,
+        DAG.getConstant(Subtarget->getWavefrontSizeLog2(), dl, MVT::i32));
+    NewSP =
+        DAG.getNode(ISD::ADD, dl, VT, BaseAddr, ScaledSize); // Value in vgpr.
+    SDValue ReadFirstLaneID =
+        DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, dl, MVT::i32);
+    NewSP = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::i32, ReadFirstLaneID,
+                        NewSP);
+  }
 
   Chain = DAG.getCopyToReg(Chain, dl, SPReg, NewSP); // Output chain
-  Tmp2 = DAG.getCALLSEQ_END(Chain, 0, 0, SDValue(), dl);
+  SDValue CallSeqEnd = DAG.getCALLSEQ_END(Chain, 0, 0, SDValue(), dl);
 
-  return DAG.getMergeValues({BaseAddr, Tmp2}, dl);
-}
-
-SDValue SITargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
-                                                  SelectionDAG &DAG) const {
-  // We only handle constant sizes here to allow non-entry block, static sized
-  // allocas. A truly dynamic value is more difficult to support because we
-  // don't know if the size value is uniform or not. If the size isn't uniform,
-  // we would need to do a wave reduction to get the maximum size to know how
-  // much to increment the uniform stack pointer.
-  SDValue Size = Op.getOperand(1);
-  if (isa<ConstantSDNode>(Size))
-    return lowerDYNAMIC_STACKALLOCImpl(Op, DAG); // Use "generic" expansion.
-
-  return AMDGPUTargetLowering::LowerDYNAMIC_STACKALLOC(Op, DAG);
+  return DAG.getMergeValues({BaseAddr, CallSeqEnd}, dl);
 }
 
 SDValue SITargetLowering::LowerSTACKSAVE(SDValue Op, SelectionDAG &DAG) const {
@@ -13982,6 +13985,43 @@ SDValue SITargetLowering::tryFoldToMad64_32(SDNode *N,
   return Accum;
 }
 
+SDValue
+SITargetLowering::foldAddSub64WithZeroLowBitsTo32(SDNode *N,
+                                                  DAGCombinerInfo &DCI) const {
+  SDValue RHS = N->getOperand(1);
+  auto *CRHS = dyn_cast<ConstantSDNode>(RHS);
+  if (!CRHS)
+    return SDValue();
+
+  // TODO: Worth using computeKnownBits? Maybe expensive since it's so
+  // common.
+  uint64_t Val = CRHS->getZExtValue();
+  if (countr_zero(Val) >= 32) {
+    SelectionDAG &DAG = DCI.DAG;
+    SDLoc SL(N);
+    SDValue LHS = N->getOperand(0);
+
+    // Avoid carry machinery if we know the low half of the add does not
+    // contribute to the final result.
+    //
+    // add i64:x, K if computeTrailingZeros(K) >= 32
+    //  => build_pair (add x.hi, K.hi), x.lo
+
+    // Breaking the 64-bit add here with this strange constant is unlikely
+    // to interfere with addressing mode patterns.
+
+    SDValue Hi = getHiHalf64(LHS, DAG);
+    SDValue ConstHi32 = DAG.getConstant(Hi_32(Val), SL, MVT::i32);
+    SDValue AddHi =
+        DAG.getNode(N->getOpcode(), SL, MVT::i32, Hi, ConstHi32, N->getFlags());
+
+    SDValue Lo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LHS);
+    return DAG.getNode(ISD::BUILD_PAIR, SL, MVT::i64, Lo, AddHi);
+  }
+
+  return SDValue();
+}
+
 // Collect the ultimate src of each of the mul node's operands, and confirm
 // each operand is 8 bytes.
 static std::optional<ByteProvider<SDValue>>
@@ -14258,6 +14298,11 @@ SDValue SITargetLowering::performAddCombine(SDNode *N,
     return V;
   }
 
+  if (VT == MVT::i64) {
+    if (SDValue Folded = foldAddSub64WithZeroLowBitsTo32(N, DCI))
+      return Folded;
+  }
+
   if ((isMul(LHS) || isMul(RHS)) && Subtarget->hasDot7Insts() &&
       (Subtarget->hasDot1Insts() || Subtarget->hasDot8Insts())) {
     SDValue TempNode(N, 0);
@@ -14443,6 +14488,11 @@ SDValue SITargetLowering::performSubCombine(SDNode *N,
   SelectionDAG &DAG = DCI.DAG;
   EVT VT = N->getValueType(0);
 
+  if (VT == MVT::i64) {
+    if (SDValue Folded = foldAddSub64WithZeroLowBitsTo32(N, DCI))
+      return Folded;
+  }
+
   if (VT != MVT::i32)
     return SDValue();
 
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h
index 631f265..299c8f5 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -212,6 +212,9 @@ private:
   unsigned getFusedOpcode(const SelectionDAG &DAG,
                           const SDNode *N0, const SDNode *N1) const;
   SDValue tryFoldToMad64_32(SDNode *N, DAGCombinerInfo &DCI) const;
+  SDValue foldAddSub64WithZeroLowBitsTo32(SDNode *N,
+                                          DAGCombinerInfo &DCI) const;
+
   SDValue performAddCombine(SDNode *N, DAGCombinerInfo &DCI) const;
   SDValue performAddCarrySubCarryCombine(SDNode *N, DAGCombinerInfo &DCI) const;
   SDValue performSubCombine(SDNode *N, DAGCombinerInfo &DCI) const;
@@ -421,7 +424,6 @@ public:
   SDValue LowerCall(CallLoweringInfo &CLI,
                     SmallVectorImpl<SDValue> &InVals) const override;
 
-  SDValue lowerDYNAMIC_STACKALLOCImpl(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerSTACKSAVE(SDValue Op, SelectionDAG &DAG) const;
   SDValue lowerGET_ROUNDING(SDValue Op, SelectionDAG &DAG) const;
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index f97ea40..e6f333f 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -3805,6 +3805,36 @@ static void updateLiveVariables(LiveVariables *LV, MachineInstr &MI,
   }
 }
 
+static unsigned getNewFMAInst(const GCNSubtarget &ST, unsigned Opc) {
+  switch (Opc) {
+  case AMDGPU::V_MAC_F16_e32:
+  case AMDGPU::V_MAC_F16_e64:
+    return AMDGPU::V_MAD_F16_e64;
+  case AMDGPU::V_MAC_F32_e32:
+  case AMDGPU::V_MAC_F32_e64:
+    return AMDGPU::V_MAD_F32_e64;
+  case AMDGPU::V_MAC_LEGACY_F32_e32:
+  case AMDGPU::V_MAC_LEGACY_F32_e64:
+    return AMDGPU::V_MAD_LEGACY_F32_e64;
+  case AMDGPU::V_FMAC_LEGACY_F32_e32:
+  case AMDGPU::V_FMAC_LEGACY_F32_e64:
+    return AMDGPU::V_FMA_LEGACY_F32_e64;
+  case AMDGPU::V_FMAC_F16_e32:
+  case AMDGPU::V_FMAC_F16_e64:
+  case AMDGPU::V_FMAC_F16_fake16_e64:
+    return ST.hasTrue16BitInsts() ? AMDGPU::V_FMA_F16_gfx9_fake16_e64
+                                  : AMDGPU::V_FMA_F16_gfx9_e64;
+  case AMDGPU::V_FMAC_F32_e32:
+  case AMDGPU::V_FMAC_F32_e64:
+    return AMDGPU::V_FMA_F32_e64;
+  case AMDGPU::V_FMAC_F64_e32:
+  case AMDGPU::V_FMAC_F64_e64:
+    return AMDGPU::V_FMA_F64_e64;
+  default:
+    llvm_unreachable("invalid instruction");
+  }
+}
+
 MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI,
                                                  LiveVariables *LV,
                                                  LiveIntervals *LIS) const {
@@ -4040,14 +4070,8 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI,
   if (Src0Literal && !ST.hasVOP3Literal())
     return nullptr;
 
-  unsigned NewOpc = IsFMA ? IsF16 ? AMDGPU::V_FMA_F16_gfx9_e64
-                                  : IsF64 ? AMDGPU::V_FMA_F64_e64
-                                          : IsLegacy
-                                                ? AMDGPU::V_FMA_LEGACY_F32_e64
-                                                : AMDGPU::V_FMA_F32_e64
-                          : IsF16 ? AMDGPU::V_MAD_F16_e64
-                                  : IsLegacy ? AMDGPU::V_MAD_LEGACY_F32_e64
-                                             : AMDGPU::V_MAD_F32_e64;
+  unsigned NewOpc = getNewFMAInst(ST, Opc);
+
   if (pseudoToMCOpcode(NewOpc) == -1)
     return nullptr;
 
@@ -6866,9 +6890,8 @@ SIInstrInfo::legalizeOperands(MachineInstr &MI,
       AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::srsrc);
   if (RsrcIdx != -1) {
     MachineOperand *Rsrc = &MI.getOperand(RsrcIdx);
-    if (Rsrc->isReg() && !RI.isSGPRClass(MRI.getRegClass(Rsrc->getReg()))) {
+    if (Rsrc->isReg() && !RI.isSGPRReg(MRI, Rsrc->getReg()))
       isRsrcLegal = false;
-    }
   }
 
   // The operands are legal.
@@ -9294,6 +9317,7 @@ static bool isRenamedInGFX9(int Opcode) {
   case AMDGPU::V_DIV_FIXUP_F16_gfx9_e64:
   case AMDGPU::V_DIV_FIXUP_F16_gfx9_fake16_e64:
   case AMDGPU::V_FMA_F16_gfx9_e64:
+  case AMDGPU::V_FMA_F16_gfx9_fake16_e64:
   case AMDGPU::V_INTERP_P2_F16:
   case AMDGPU::V_MAD_F16_e64:
   case AMDGPU::V_MAD_U16_e64:
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 789ce88..ee83dff 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -2674,8 +2674,8 @@ let OtherPredicates = [NotHasTrue16BitInsts] in {
 } // end OtherPredicates = [NotHasTrue16BitInsts]
 
 let OtherPredicates = [HasTrue16BitInsts] in {
-  def : FPToI1Pat<V_CMP_EQ_F16_t16_e64, CONST.FP16_ONE, i16, f16, fp_to_uint>;
-  def : FPToI1Pat<V_CMP_EQ_F16_t16_e64, CONST.FP16_NEG_ONE, i16, f16, fp_to_sint>;
+  def : FPToI1Pat<V_CMP_EQ_F16_fake16_e64, CONST.FP16_ONE, i16, f16, fp_to_uint>;
+  def : FPToI1Pat<V_CMP_EQ_F16_fake16_e64, CONST.FP16_NEG_ONE, i16, f16, fp_to_sint>;
 } // end OtherPredicates = [HasTrue16BitInsts]
 
 def : FPToI1Pat<V_CMP_EQ_F32_e64, CONST.FP32_ONE, i32, f32, fp_to_uint>;
@@ -3055,7 +3055,7 @@ def : GCNPat<
    (V_BFREV_B32_e64 (i32 (EXTRACT_SUBREG VReg_64:$a, sub0))), sub1)>;
 
 // If fcanonicalize's operand is implicitly canonicalized, we only need a copy.
-let AddedComplexity = 1000 in {
+let AddedComplexity = 8 in {
 foreach vt = [f16, v2f16, f32, v2f32, f64] in {
   def : GCNPat<
     (fcanonicalize (vt is_canonicalized:$src)),
@@ -3710,12 +3710,15 @@ def : IntMinMaxPat<V_MAXMIN_U32_e64, umin, umax_oneuse>;
 def : IntMinMaxPat<V_MINMAX_U32_e64, umax, umin_oneuse>;
 def : FPMinMaxPat<V_MINMAX_F32_e64, f32, fmaxnum_like, fminnum_like_oneuse>;
 def : FPMinMaxPat<V_MAXMIN_F32_e64, f32, fminnum_like, fmaxnum_like_oneuse>;
-def : FPMinMaxPat<V_MINMAX_F16_e64, f16, fmaxnum_like, fminnum_like_oneuse>;
-def : FPMinMaxPat<V_MAXMIN_F16_e64, f16, fminnum_like, fmaxnum_like_oneuse>;
 def : FPMinCanonMaxPat<V_MINMAX_F32_e64, f32, fmaxnum_like, fminnum_like_oneuse>;
 def : FPMinCanonMaxPat<V_MAXMIN_F32_e64, f32, fminnum_like, fmaxnum_like_oneuse>;
-def : FPMinCanonMaxPat<V_MINMAX_F16_e64, f16, fmaxnum_like, fminnum_like_oneuse>;
-def : FPMinCanonMaxPat<V_MAXMIN_F16_e64, f16, fminnum_like, fmaxnum_like_oneuse>;
+}
+
+let True16Predicate = UseFakeTrue16Insts in {
+def : FPMinMaxPat<V_MINMAX_F16_fake16_e64, f16, fmaxnum_like, fminnum_like_oneuse>;
+def : FPMinMaxPat<V_MAXMIN_F16_fake16_e64, f16, fminnum_like, fmaxnum_like_oneuse>;
+def : FPMinCanonMaxPat<V_MINMAX_F16_fake16_e64, f16, fmaxnum_like, fminnum_like_oneuse>;
+def : FPMinCanonMaxPat<V_MAXMIN_F16_fake16_e64, f16, fminnum_like, fmaxnum_like_oneuse>;
 }
 
 let SubtargetPredicate = isGFX9Plus in {
@@ -3723,6 +3726,10 @@ let True16Predicate = NotHasTrue16BitInsts in {
   defm : Int16Med3Pat<V_MED3_I16_e64, smin, smax, VSrc_b16>;
   defm : Int16Med3Pat<V_MED3_U16_e64, umin, umax, VSrc_b16>;
 }
+let True16Predicate = UseRealTrue16Insts in {
+  defm : Int16Med3Pat<V_MED3_I16_t16_e64, smin, smax, VSrcT_b16>;
+  defm : Int16Med3Pat<V_MED3_U16_t16_e64, umin, umax, VSrcT_b16>;
+}
 let True16Predicate = UseFakeTrue16Insts in {
   defm : Int16Med3Pat<V_MED3_I16_fake16_e64, smin, smax, VSrc_b16>;
   defm : Int16Med3Pat<V_MED3_U16_fake16_e64, umin, umax, VSrc_b16>;
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
index 16a7a9c..7c98ccd 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
@@ -153,14 +153,16 @@ class SIRegisterClass <string n, list<ValueType> rTypes, int Align, dag rList>
 }
 
 multiclass SIRegLoHi16 <string n, bits<8> regIdx, bit ArtificialHigh = 1,
-                        bit isVGPR = 0, bit isAGPR = 0> {
+                        bit isVGPR = 0, bit isAGPR = 0,
+                        list<int> DwarfEncodings = [-1, -1]> {
   def _LO16 : SIReg<n#".l", regIdx, isVGPR, isAGPR>;
   def _HI16 : SIReg<!if(ArtificialHigh, "", n#".h"), regIdx, isVGPR, isAGPR,
                     /* isHi16 */ 1> {
     let isArtificial = ArtificialHigh;
   }
   def "" : RegisterWithSubRegs<n, [!cast<Register>(NAME#"_LO16"),
-                                   !cast<Register>(NAME#"_HI16")]> {
+                                   !cast<Register>(NAME#"_HI16")]>,
+           DwarfRegNum<DwarfEncodings> {
     let Namespace = "AMDGPU";
     let SubRegIndices = [lo16, hi16];
     let CoveredBySubRegs = !not(ArtificialHigh);
@@ -197,7 +199,8 @@ def VCC : RegisterWithSubRegs<"vcc", [VCC_LO, VCC_HI]> {
   let HWEncoding = VCC_LO.HWEncoding;
 }
 
-defm EXEC_LO : SIRegLoHi16<"exec_lo", 126>, DwarfRegNum<[1, 1]>;
+defm EXEC_LO : SIRegLoHi16<"exec_lo", 126, /*ArtificialHigh=*/1, /*isVGPR=*/0,
+                           /*isAGPR=*/0, /*DwarfEncodings=*/[1, 1]>;
 defm EXEC_HI : SIRegLoHi16<"exec_hi", 127>;
 
 def EXEC : RegisterWithSubRegs<"exec", [EXEC_LO, EXEC_HI]>, DwarfRegNum<[17, 1]> {
@@ -337,25 +340,26 @@ def FLAT_SCR : FlatReg<FLAT_SCR_LO, FLAT_SCR_HI, 0>;
 // SGPR registers
 foreach Index = 0...105 in {
   defm SGPR#Index :
-     SIRegLoHi16 <"s"#Index, Index>,
-     DwarfRegNum<[!if(!le(Index, 63), !add(Index, 32), !add(Index, 1024)),
-                  !if(!le(Index, 63), !add(Index, 32), !add(Index, 1024))]>;
+     SIRegLoHi16 <"s"#Index, Index, /*ArtificialHigh=*/1,
+                  /*isVGPR=*/0, /*isAGPR=*/0, /*DwarfEncodings=*/
+                  [!if(!le(Index, 63), !add(Index, 32), !add(Index, 1024)),
+                   !if(!le(Index, 63), !add(Index, 32), !add(Index, 1024))]>;
 }
 
 // VGPR registers
 foreach Index = 0...255 in {
   defm VGPR#Index :
-    SIRegLoHi16 <"v"#Index, Index, /* ArtificialHigh= */ 0,
-                 /* isVGPR= */ 1, /* isAGPR= */ 0>,
-    DwarfRegNum<[!add(Index, 2560), !add(Index, 1536)]>;
+    SIRegLoHi16 <"v"#Index, Index, /*ArtificialHigh=*/ 0,
+                 /*isVGPR=*/ 1, /*isAGPR=*/ 0, /*DwarfEncodings=*/
+                 [!add(Index, 2560), !add(Index, 1536)]>;
 }
 
 // AccVGPR registers
 foreach Index = 0...255 in {
   defm AGPR#Index :
-      SIRegLoHi16 <"a"#Index, Index, /* ArtificialHigh= */ 1,
-                   /* isVGPR= */ 0, /* isAGPR= */ 1>,
-      DwarfRegNum<[!add(Index, 3072), !add(Index, 2048)]>;
+      SIRegLoHi16 <"a"#Index, Index, /*ArtificialHigh=*/ 1,
+                   /*isVGPR=*/ 0, /*isAGPR=*/ 1, /*DwarfEncodings=*/
+                   [!add(Index, 3072), !add(Index, 2048)]>;
 }
 
 //===----------------------------------------------------------------------===//
@@ -809,6 +813,9 @@ def SReg_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, bf16, v2i16, v2f16,
   let BaseClassOrder = 32;
 }
 
+def SGPR_NULL128 : SIReg<"null">;
+def SGPR_NULL256 : SIReg<"null">;
+
 let GeneratePressureSet = 0 in {
 def SRegOrLds_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, bf16, v2i16, v2f16, v2bf16], 32,
   (add SReg_32, LDS_DIRECT_CLASS)> {
@@ -885,6 +892,7 @@ multiclass SRegClass<int numRegs,
                      list<ValueType> regTypes,
                      SIRegisterTuples regList,
                      SIRegisterTuples ttmpList = regList,
+                     bit hasNull = 0,
                      int copyCost = !sra(!add(numRegs, 1), 1)> {
   defvar hasTTMP = !ne(regList, ttmpList);
   defvar suffix = !cast<string>(!mul(numRegs, 32));
@@ -901,7 +909,7 @@ multiclass SRegClass<int numRegs,
       }
     }
 
-    def SReg_ # suffix :
+    def SReg_ # suffix # !if(hasNull, "_XNULL", ""):
       SIRegisterClass<"AMDGPU", regTypes, 32,
                       !con((add !cast<RegisterClass>(sgprName)),
                            !if(hasTTMP,
@@ -910,15 +918,24 @@ multiclass SRegClass<int numRegs,
       let isAllocatable = 0;
       let BaseClassOrder = !mul(numRegs, 32);
     }
+
+    if hasNull then {
+      def SReg_ # suffix :
+        SIRegisterClass<"AMDGPU", regTypes, 32,
+                        (add !cast<RegisterClass>("SReg_" # suffix # "_XNULL"), !cast<Register>("SGPR_NULL" # suffix))> {
+        let isAllocatable = 0;
+        let BaseClassOrder = !mul(numRegs, 32);
+      }
+    }
   }
 }
 
 defm "" : SRegClass<3, Reg96Types.types, SGPR_96Regs, TTMP_96Regs>;
-defm "" : SRegClass<4, Reg128Types.types, SGPR_128Regs, TTMP_128Regs>;
+defm "" : SRegClass<4, Reg128Types.types, SGPR_128Regs, TTMP_128Regs, /*hasNull*/ true>;
 defm "" : SRegClass<5, [v5i32, v5f32], SGPR_160Regs, TTMP_160Regs>;
 defm "" : SRegClass<6, [v6i32, v6f32, v3i64, v3f64], SGPR_192Regs, TTMP_192Regs>;
 defm "" : SRegClass<7, [v7i32, v7f32], SGPR_224Regs, TTMP_224Regs>;
-defm "" : SRegClass<8, [v8i32, v8f32, v4i64, v4f64, v16i16, v16f16, v16bf16], SGPR_256Regs, TTMP_256Regs>;
+defm "" : SRegClass<8, [v8i32, v8f32, v4i64, v4f64, v16i16, v16f16, v16bf16], SGPR_256Regs, TTMP_256Regs, /*hasNull*/ true>;
 defm "" : SRegClass<9, [v9i32, v9f32], SGPR_288Regs, TTMP_288Regs>;
 defm "" : SRegClass<10, [v10i32, v10f32], SGPR_320Regs, TTMP_320Regs>;
 defm "" : SRegClass<11, [v11i32, v11f32], SGPR_352Regs, TTMP_352Regs>;
diff --git a/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp b/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
index 42df457..979812e 100644
--- a/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
+++ b/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
@@ -455,6 +455,7 @@ void SIShrinkInstructions::shrinkMadFma(MachineInstr &MI) const {
       break;
     case AMDGPU::V_FMA_F16_e64:
     case AMDGPU::V_FMA_F16_gfx9_e64:
+    case AMDGPU::V_FMA_F16_gfx9_fake16_e64:
       NewOpcode = ST->hasTrue16BitInsts() ? AMDGPU::V_FMAAK_F16_fake16
                                           : AMDGPU::V_FMAAK_F16;
       break;
@@ -484,6 +485,7 @@ void SIShrinkInstructions::shrinkMadFma(MachineInstr &MI) const {
       break;
     case AMDGPU::V_FMA_F16_e64:
     case AMDGPU::V_FMA_F16_gfx9_e64:
+    case AMDGPU::V_FMA_F16_gfx9_fake16_e64:
       NewOpcode = ST->hasTrue16BitInsts() ? AMDGPU::V_FMAMK_F16_fake16
                                           : AMDGPU::V_FMAMK_F16;
       break;
@@ -956,7 +958,8 @@ bool SIShrinkInstructions::run(MachineFunction &MF) {
           MI.getOpcode() == AMDGPU::V_FMA_F32_e64 ||
           MI.getOpcode() == AMDGPU::V_MAD_F16_e64 ||
           MI.getOpcode() == AMDGPU::V_FMA_F16_e64 ||
-          MI.getOpcode() == AMDGPU::V_FMA_F16_gfx9_e64) {
+          MI.getOpcode() == AMDGPU::V_FMA_F16_gfx9_e64 ||
+          MI.getOpcode() == AMDGPU::V_FMA_F16_gfx9_fake16_e64) {
         shrinkMadFma(MI);
         continue;
       }
diff --git a/llvm/lib/Target/AMDGPU/SMInstructions.td b/llvm/lib/Target/AMDGPU/SMInstructions.td
index 1aeb4e8..37dcc100 100644
--- a/llvm/lib/Target/AMDGPU/SMInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SMInstructions.td
@@ -332,19 +332,19 @@ defm S_LOAD_I16      : SM_Pseudo_Loads <SReg_64, SReg_32_XM0_XEXEC>;
 defm S_LOAD_U16      : SM_Pseudo_Loads <SReg_64, SReg_32_XM0_XEXEC>;
 
 let is_buffer = 1 in {
-defm S_BUFFER_LOAD_DWORD : SM_Pseudo_Loads <SReg_128, SReg_32_XM0_XEXEC>;
+defm S_BUFFER_LOAD_DWORD : SM_Pseudo_Loads <SReg_128_XNULL, SReg_32_XM0_XEXEC>;
 // FIXME: exec_lo/exec_hi appear to be allowed for SMRD loads on
 // SI/CI, bit disallowed for SMEM on VI.
-defm S_BUFFER_LOAD_DWORDX2 : SM_Pseudo_Loads <SReg_128, SReg_64_XEXEC>;
+defm S_BUFFER_LOAD_DWORDX2 : SM_Pseudo_Loads <SReg_128_XNULL, SReg_64_XEXEC>;
 let SubtargetPredicate = HasScalarDwordx3Loads in
-  defm S_BUFFER_LOAD_DWORDX3 : SM_Pseudo_Loads <SReg_128, SReg_96>;
-defm S_BUFFER_LOAD_DWORDX4 : SM_Pseudo_Loads <SReg_128, SReg_128>;
-defm S_BUFFER_LOAD_DWORDX8 : SM_Pseudo_Loads <SReg_128, SReg_256>;
-defm S_BUFFER_LOAD_DWORDX16 : SM_Pseudo_Loads <SReg_128, SReg_512>;
-defm S_BUFFER_LOAD_I8 : SM_Pseudo_Loads <SReg_128, SReg_32_XM0_XEXEC>;
-defm S_BUFFER_LOAD_U8 : SM_Pseudo_Loads <SReg_128, SReg_32_XM0_XEXEC>;
-defm S_BUFFER_LOAD_I16 : SM_Pseudo_Loads <SReg_128, SReg_32_XM0_XEXEC>;
-defm S_BUFFER_LOAD_U16 : SM_Pseudo_Loads <SReg_128, SReg_32_XM0_XEXEC>;
+  defm S_BUFFER_LOAD_DWORDX3 : SM_Pseudo_Loads <SReg_128_XNULL, SReg_96>;
+defm S_BUFFER_LOAD_DWORDX4 : SM_Pseudo_Loads <SReg_128_XNULL, SReg_128>;
+defm S_BUFFER_LOAD_DWORDX8 : SM_Pseudo_Loads <SReg_128_XNULL, SReg_256>;
+defm S_BUFFER_LOAD_DWORDX16 : SM_Pseudo_Loads <SReg_128_XNULL, SReg_512>;
+defm S_BUFFER_LOAD_I8 : SM_Pseudo_Loads <SReg_128_XNULL, SReg_32_XM0_XEXEC>;
+defm S_BUFFER_LOAD_U8 : SM_Pseudo_Loads <SReg_128_XNULL, SReg_32_XM0_XEXEC>;
+defm S_BUFFER_LOAD_I16 : SM_Pseudo_Loads <SReg_128_XNULL, SReg_32_XM0_XEXEC>;
+defm S_BUFFER_LOAD_U16 : SM_Pseudo_Loads <SReg_128_XNULL, SReg_32_XM0_XEXEC>;
 }
 
 let SubtargetPredicate = HasScalarStores in {
@@ -353,9 +353,9 @@ defm S_STORE_DWORDX2 : SM_Pseudo_Stores <SReg_64, SReg_64_XEXEC>;
 defm S_STORE_DWORDX4 : SM_Pseudo_Stores <SReg_64, SReg_128>;
 
 let is_buffer = 1 in {
-defm S_BUFFER_STORE_DWORD : SM_Pseudo_Stores <SReg_128, SReg_32_XM0_XEXEC>;
-defm S_BUFFER_STORE_DWORDX2 : SM_Pseudo_Stores <SReg_128, SReg_64_XEXEC>;
-defm S_BUFFER_STORE_DWORDX4 : SM_Pseudo_Stores <SReg_128, SReg_128>;
+defm S_BUFFER_STORE_DWORD : SM_Pseudo_Stores <SReg_128_XNULL, SReg_32_XM0_XEXEC>;
+defm S_BUFFER_STORE_DWORDX2 : SM_Pseudo_Stores <SReg_128_XNULL, SReg_64_XEXEC>;
+defm S_BUFFER_STORE_DWORDX4 : SM_Pseudo_Stores <SReg_128_XNULL, SReg_128>;
 }
 } // End SubtargetPredicate = HasScalarStores
 
@@ -375,7 +375,7 @@ def S_DCACHE_WB_VOL : SM_Inval_Pseudo <"s_dcache_wb_vol", int_amdgcn_s_dcache_wb
 
 defm S_ATC_PROBE        : SM_Pseudo_Probe <SReg_64>;
 let is_buffer = 1 in {
-defm S_ATC_PROBE_BUFFER : SM_Pseudo_Probe <SReg_128>;
+defm S_ATC_PROBE_BUFFER : SM_Pseudo_Probe <SReg_128_XNULL>;
 }
 } // SubtargetPredicate = isGFX8Plus
 
@@ -401,33 +401,33 @@ defm S_SCRATCH_STORE_DWORDX4 : SM_Pseudo_Stores <SReg_64, SReg_128>;
 let SubtargetPredicate = HasScalarAtomics in {
 
 let is_buffer = 1 in {
-defm S_BUFFER_ATOMIC_SWAP         : SM_Pseudo_Atomics <SReg_128, SReg_32_XM0_XEXEC>;
-defm S_BUFFER_ATOMIC_CMPSWAP      : SM_Pseudo_Atomics <SReg_128, SReg_64_XEXEC>;
-defm S_BUFFER_ATOMIC_ADD          : SM_Pseudo_Atomics <SReg_128, SReg_32_XM0_XEXEC>;
-defm S_BUFFER_ATOMIC_SUB          : SM_Pseudo_Atomics <SReg_128, SReg_32_XM0_XEXEC>;
-defm S_BUFFER_ATOMIC_SMIN         : SM_Pseudo_Atomics <SReg_128, SReg_32_XM0_XEXEC>;
-defm S_BUFFER_ATOMIC_UMIN         : SM_Pseudo_Atomics <SReg_128, SReg_32_XM0_XEXEC>;
-defm S_BUFFER_ATOMIC_SMAX         : SM_Pseudo_Atomics <SReg_128, SReg_32_XM0_XEXEC>;
-defm S_BUFFER_ATOMIC_UMAX         : SM_Pseudo_Atomics <SReg_128, SReg_32_XM0_XEXEC>;
-defm S_BUFFER_ATOMIC_AND          : SM_Pseudo_Atomics <SReg_128, SReg_32_XM0_XEXEC>;
-defm S_BUFFER_ATOMIC_OR           : SM_Pseudo_Atomics <SReg_128, SReg_32_XM0_XEXEC>;
-defm S_BUFFER_ATOMIC_XOR          : SM_Pseudo_Atomics <SReg_128, SReg_32_XM0_XEXEC>;
-defm S_BUFFER_ATOMIC_INC          : SM_Pseudo_Atomics <SReg_128, SReg_32_XM0_XEXEC>;
-defm S_BUFFER_ATOMIC_DEC          : SM_Pseudo_Atomics <SReg_128, SReg_32_XM0_XEXEC>;
-
-defm S_BUFFER_ATOMIC_SWAP_X2      : SM_Pseudo_Atomics <SReg_128, SReg_64_XEXEC>;
-defm S_BUFFER_ATOMIC_CMPSWAP_X2   : SM_Pseudo_Atomics <SReg_128, SReg_128>;
-defm S_BUFFER_ATOMIC_ADD_X2       : SM_Pseudo_Atomics <SReg_128, SReg_64_XEXEC>;
-defm S_BUFFER_ATOMIC_SUB_X2       : SM_Pseudo_Atomics <SReg_128, SReg_64_XEXEC>;
-defm S_BUFFER_ATOMIC_SMIN_X2      : SM_Pseudo_Atomics <SReg_128, SReg_64_XEXEC>;
-defm S_BUFFER_ATOMIC_UMIN_X2      : SM_Pseudo_Atomics <SReg_128, SReg_64_XEXEC>;
-defm S_BUFFER_ATOMIC_SMAX_X2      : SM_Pseudo_Atomics <SReg_128, SReg_64_XEXEC>;
-defm S_BUFFER_ATOMIC_UMAX_X2      : SM_Pseudo_Atomics <SReg_128, SReg_64_XEXEC>;
-defm S_BUFFER_ATOMIC_AND_X2       : SM_Pseudo_Atomics <SReg_128, SReg_64_XEXEC>;
-defm S_BUFFER_ATOMIC_OR_X2        : SM_Pseudo_Atomics <SReg_128, SReg_64_XEXEC>;
-defm S_BUFFER_ATOMIC_XOR_X2       : SM_Pseudo_Atomics <SReg_128, SReg_64_XEXEC>;
-defm S_BUFFER_ATOMIC_INC_X2       : SM_Pseudo_Atomics <SReg_128, SReg_64_XEXEC>;
-defm S_BUFFER_ATOMIC_DEC_X2       : SM_Pseudo_Atomics <SReg_128, SReg_64_XEXEC>;
+defm S_BUFFER_ATOMIC_SWAP         : SM_Pseudo_Atomics <SReg_128_XNULL, SReg_32_XM0_XEXEC>;
+defm S_BUFFER_ATOMIC_CMPSWAP      : SM_Pseudo_Atomics <SReg_128_XNULL, SReg_64_XEXEC>;
+defm S_BUFFER_ATOMIC_ADD          : SM_Pseudo_Atomics <SReg_128_XNULL, SReg_32_XM0_XEXEC>;
+defm S_BUFFER_ATOMIC_SUB          : SM_Pseudo_Atomics <SReg_128_XNULL, SReg_32_XM0_XEXEC>;
+defm S_BUFFER_ATOMIC_SMIN         : SM_Pseudo_Atomics <SReg_128_XNULL, SReg_32_XM0_XEXEC>;
+defm S_BUFFER_ATOMIC_UMIN         : SM_Pseudo_Atomics <SReg_128_XNULL, SReg_32_XM0_XEXEC>;
+defm S_BUFFER_ATOMIC_SMAX         : SM_Pseudo_Atomics <SReg_128_XNULL, SReg_32_XM0_XEXEC>;
+defm S_BUFFER_ATOMIC_UMAX         : SM_Pseudo_Atomics <SReg_128_XNULL, SReg_32_XM0_XEXEC>;
+defm S_BUFFER_ATOMIC_AND          : SM_Pseudo_Atomics <SReg_128_XNULL, SReg_32_XM0_XEXEC>;
+defm S_BUFFER_ATOMIC_OR           : SM_Pseudo_Atomics <SReg_128_XNULL, SReg_32_XM0_XEXEC>;
+defm S_BUFFER_ATOMIC_XOR          : SM_Pseudo_Atomics <SReg_128_XNULL, SReg_32_XM0_XEXEC>;
+defm S_BUFFER_ATOMIC_INC          : SM_Pseudo_Atomics <SReg_128_XNULL, SReg_32_XM0_XEXEC>;
+defm S_BUFFER_ATOMIC_DEC          : SM_Pseudo_Atomics <SReg_128_XNULL, SReg_32_XM0_XEXEC>;
+
+defm S_BUFFER_ATOMIC_SWAP_X2      : SM_Pseudo_Atomics <SReg_128_XNULL, SReg_64_XEXEC>;
+defm S_BUFFER_ATOMIC_CMPSWAP_X2   : SM_Pseudo_Atomics <SReg_128_XNULL, SReg_128>;
+defm S_BUFFER_ATOMIC_ADD_X2       : SM_Pseudo_Atomics <SReg_128_XNULL, SReg_64_XEXEC>;
+defm S_BUFFER_ATOMIC_SUB_X2       : SM_Pseudo_Atomics <SReg_128_XNULL, SReg_64_XEXEC>;
+defm S_BUFFER_ATOMIC_SMIN_X2      : SM_Pseudo_Atomics <SReg_128_XNULL, SReg_64_XEXEC>;
+defm S_BUFFER_ATOMIC_UMIN_X2      : SM_Pseudo_Atomics <SReg_128_XNULL, SReg_64_XEXEC>;
+defm S_BUFFER_ATOMIC_SMAX_X2      : SM_Pseudo_Atomics <SReg_128_XNULL, SReg_64_XEXEC>;
+defm S_BUFFER_ATOMIC_UMAX_X2      : SM_Pseudo_Atomics <SReg_128_XNULL, SReg_64_XEXEC>;
+defm S_BUFFER_ATOMIC_AND_X2       : SM_Pseudo_Atomics <SReg_128_XNULL, SReg_64_XEXEC>;
+defm S_BUFFER_ATOMIC_OR_X2        : SM_Pseudo_Atomics <SReg_128_XNULL, SReg_64_XEXEC>;
+defm S_BUFFER_ATOMIC_XOR_X2       : SM_Pseudo_Atomics <SReg_128_XNULL, SReg_64_XEXEC>;
+defm S_BUFFER_ATOMIC_INC_X2       : SM_Pseudo_Atomics <SReg_128_XNULL, SReg_64_XEXEC>;
+defm S_BUFFER_ATOMIC_DEC_X2       : SM_Pseudo_Atomics <SReg_128_XNULL, SReg_64_XEXEC>;
 }
 
 defm S_ATOMIC_SWAP                : SM_Pseudo_Atomics <SReg_64, SReg_32_XM0_XEXEC>;
@@ -470,7 +470,7 @@ def S_PREFETCH_INST        : SM_Prefetch_Pseudo <"s_prefetch_inst", SReg_64, 1>;
 def S_PREFETCH_INST_PC_REL : SM_Prefetch_Pseudo <"s_prefetch_inst_pc_rel", SReg_64, 0>;
 def S_PREFETCH_DATA        : SM_Prefetch_Pseudo <"s_prefetch_data", SReg_64, 1>;
 def S_PREFETCH_DATA_PC_REL : SM_Prefetch_Pseudo <"s_prefetch_data_pc_rel", SReg_64, 0>;
-def S_BUFFER_PREFETCH_DATA : SM_Prefetch_Pseudo <"s_buffer_prefetch_data", SReg_128, 1> {
+def S_BUFFER_PREFETCH_DATA : SM_Prefetch_Pseudo <"s_buffer_prefetch_data", SReg_128_XNULL, 1> {
   let is_buffer = 1;
 }
 } // end let SubtargetPredicate = isGFX12Plus
diff --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td
index 1dd39be..b9c73e6 100644
--- a/llvm/lib/Target/AMDGPU/VOP1Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP1Instructions.td
@@ -1018,9 +1018,9 @@ defm V_CLS_I32             : VOP1_Real_FULL_with_name_gfx11_gfx12<0x03b,
 defm V_SWAP_B16              : VOP1Only_Real_gfx11_gfx12<0x066>;
 defm V_PERMLANE64_B32        : VOP1Only_Real_gfx11_gfx12<0x067>;
 defm V_MOV_B16_t16           : VOP1_Real_FULL_t16_gfx11_gfx12<0x01c, "v_mov_b16">;
-defm V_NOT_B16_fake16        : VOP1_Real_FULL_t16_gfx11_gfx12<0x069, "v_not_b16">;
-defm V_CVT_I32_I16_fake16    : VOP1_Real_FULL_t16_gfx11_gfx12<0x06a, "v_cvt_i32_i16">;
-defm V_CVT_U32_U16_fake16    : VOP1_Real_FULL_t16_gfx11_gfx12<0x06b, "v_cvt_u32_u16">;
+defm V_NOT_B16               : VOP1_Real_FULL_t16_and_fake16_gfx11_gfx12<0x069, "v_not_b16">;
+defm V_CVT_I32_I16           : VOP1_Real_FULL_t16_and_fake16_gfx11_gfx12<0x06a, "v_cvt_i32_i16">;
+defm V_CVT_U32_U16           : VOP1_Real_FULL_t16_and_fake16_gfx11_gfx12<0x06b, "v_cvt_u32_u16">;
 
 defm V_CVT_F16_U16           : VOP1_Real_FULL_t16_and_fake16_gfx11_gfx12<0x050, "v_cvt_f16_u16">;
 defm V_CVT_F16_I16           : VOP1_Real_FULL_t16_and_fake16_gfx11_gfx12<0x051, "v_cvt_f16_i16">;
@@ -1036,18 +1036,18 @@ defm V_LOG_F16_t16           : VOP1_Real_FULL_t16_gfx11_gfx12<0x057, "v_log_f16"
 defm V_LOG_F16_fake16        : VOP1_Real_FULL_t16_gfx11_gfx12<0x057, "v_log_f16">;
 defm V_EXP_F16_t16           : VOP1_Real_FULL_t16_gfx11_gfx12<0x058, "v_exp_f16">;
 defm V_EXP_F16_fake16        : VOP1_Real_FULL_t16_gfx11_gfx12<0x058, "v_exp_f16">;
-defm V_FREXP_MANT_F16_fake16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x059, "v_frexp_mant_f16">;
+defm V_FREXP_MANT_F16        : VOP1_Real_FULL_t16_and_fake16_gfx11_gfx12<0x059, "v_frexp_mant_f16">;
 defm V_FREXP_EXP_I16_F16     : VOP1_Real_FULL_t16_and_fake16_gfx11_gfx12<0x05a, "v_frexp_exp_i16_f16">;
 defm V_FLOOR_F16_t16         : VOP1_Real_FULL_t16_gfx11_gfx12<0x05b, "v_floor_f16">;
 defm V_FLOOR_F16_fake16      : VOP1_Real_FULL_t16_gfx11_gfx12<0x05b, "v_floor_f16">;
 defm V_CEIL_F16_t16          : VOP1_Real_FULL_t16_gfx11_gfx12<0x05c, "v_ceil_f16">;
 defm V_CEIL_F16_fake16       : VOP1_Real_FULL_t16_gfx11_gfx12<0x05c, "v_ceil_f16">;
-defm V_TRUNC_F16_fake16      : VOP1_Real_FULL_t16_gfx11_gfx12<0x05d, "v_trunc_f16">;
-defm V_RNDNE_F16_fake16      : VOP1_Real_FULL_t16_gfx11_gfx12<0x05e, "v_rndne_f16">;
-defm V_FRACT_F16_fake16      : VOP1_Real_FULL_t16_gfx11_gfx12<0x05f, "v_fract_f16">;
-defm V_SIN_F16_fake16        : VOP1_Real_FULL_t16_gfx11_gfx12<0x060, "v_sin_f16">;
-defm V_COS_F16_fake16        : VOP1_Real_FULL_t16_gfx11_gfx12<0x061, "v_cos_f16">;
-defm V_SAT_PK_U8_I16_fake16  : VOP1_Real_FULL_t16_gfx11_gfx12<0x062, "v_sat_pk_u8_i16">;
+defm V_TRUNC_F16             : VOP1_Real_FULL_t16_and_fake16_gfx11_gfx12<0x05d, "v_trunc_f16">;
+defm V_RNDNE_F16             : VOP1_Real_FULL_t16_and_fake16_gfx11_gfx12<0x05e, "v_rndne_f16">;
+defm V_FRACT_F16             : VOP1_Real_FULL_t16_and_fake16_gfx11_gfx12<0x05f, "v_fract_f16">;
+defm V_SIN_F16               : VOP1_Real_FULL_t16_and_fake16_gfx11_gfx12<0x060, "v_sin_f16">;
+defm V_COS_F16               : VOP1_Real_FULL_t16_and_fake16_gfx11_gfx12<0x061, "v_cos_f16">;
+defm V_SAT_PK_U8_I16         : VOP1_Real_FULL_t16_and_fake16_gfx11_gfx12<0x062, "v_sat_pk_u8_i16">;
 defm V_CVT_NORM_I16_F16      : VOP1_Real_FULL_t16_and_fake16_gfx11_gfx12<0x063, "v_cvt_norm_i16_f16">;
 defm V_CVT_NORM_U16_F16      : VOP1_Real_FULL_t16_and_fake16_gfx11_gfx12<0x064, "v_cvt_norm_u16_f16">;
 
diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
index 22e4576..24a2eed 100644
--- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
@@ -340,7 +340,7 @@ let FPDPRounding = 1 in {
 
   let SubtargetPredicate = isGFX9Plus in {
     defm V_DIV_FIXUP_F16_gfx9 : VOP3Inst_t16 <"v_div_fixup_f16_gfx9", VOP_F16_F16_F16_F16, AMDGPUdiv_fixup>;
-    defm V_FMA_F16_gfx9 : VOP3Inst <"v_fma_f16_gfx9", VOP3_Profile<VOP_F16_F16_F16_F16, VOP3_OPSEL>, any_fma>;
+    defm V_FMA_F16_gfx9 : VOP3Inst_t16 <"v_fma_f16_gfx9", VOP_F16_F16_F16_F16, any_fma>;
   } // End SubtargetPredicate = isGFX9Plus
 } // End FPDPRounding = 1
 
@@ -1374,8 +1374,8 @@ class VOP3_DOT_Profile_fake16<VOPProfile P, VOP3Features Features = VOP3_REGULAR
 let SubtargetPredicate = isGFX11Plus in {
   defm V_MAXMIN_F32     : VOP3Inst<"v_maxmin_f32", VOP3_Profile<VOP_F32_F32_F32_F32>>;
   defm V_MINMAX_F32     : VOP3Inst<"v_minmax_f32", VOP3_Profile<VOP_F32_F32_F32_F32>>;
-  defm V_MAXMIN_F16     : VOP3Inst<"v_maxmin_f16", VOP3_Profile<VOP_F16_F16_F16_F16>>;
-  defm V_MINMAX_F16     : VOP3Inst<"v_minmax_f16", VOP3_Profile<VOP_F16_F16_F16_F16>>;
+  defm V_MAXMIN_F16     : VOP3Inst_t16<"v_maxmin_f16", VOP_F16_F16_F16_F16>;
+  defm V_MINMAX_F16     : VOP3Inst_t16<"v_minmax_f16", VOP_F16_F16_F16_F16>;
   defm V_MAXMIN_U32     : VOP3Inst<"v_maxmin_u32", VOP3_Profile<VOP_I32_I32_I32_I32>>;
   defm V_MINMAX_U32     : VOP3Inst<"v_minmax_u32", VOP3_Profile<VOP_I32_I32_I32_I32>>;
   defm V_MAXMIN_I32     : VOP3Inst<"v_maxmin_i32", VOP3_Profile<VOP_I32_I32_I32_I32>>;
@@ -1578,8 +1578,8 @@ def : MinimumMaximumByMinimum3Maximum3<fmaximum, f32, V_MAXIMUM3_F32_e64>;
 
 defm V_MIN3_NUM_F32       : VOP3_Realtriple_with_name_gfx12<0x229, "V_MIN3_F32", "v_min3_num_f32">;
 defm V_MAX3_NUM_F32       : VOP3_Realtriple_with_name_gfx12<0x22a, "V_MAX3_F32", "v_max3_num_f32">;
-defm V_MIN3_NUM_F16       : VOP3_Realtriple_with_name_gfx12<0x22b, "V_MIN3_F16", "v_min3_num_f16">;
-defm V_MAX3_NUM_F16       : VOP3_Realtriple_with_name_gfx12<0x22c, "V_MAX3_F16", "v_max3_num_f16">;
+defm V_MIN3_NUM_F16       : VOP3_Realtriple_t16_and_fake16_gfx12<0x22b, "v_min3_num_f16", "V_MIN3_F16", "v_min3_f16">;
+defm V_MAX3_NUM_F16       : VOP3_Realtriple_t16_and_fake16_gfx12<0x22c, "v_max3_num_f16", "V_MAX3_F16", "v_max3_f16">;
 defm V_MINIMUM3_F32       : VOP3Only_Realtriple_gfx12<0x22d>;
 defm V_MAXIMUM3_F32       : VOP3Only_Realtriple_gfx12<0x22e>;
 defm V_MINIMUM3_F16       : VOP3Only_Realtriple_t16_gfx12<0x22f>;
@@ -1588,8 +1588,8 @@ defm V_MED3_NUM_F32       : VOP3_Realtriple_with_name_gfx12<0x231, "V_MED3_F32",
 defm V_MED3_NUM_F16       : VOP3_Realtriple_t16_and_fake16_gfx12<0x232, "v_med3_num_f16", "V_MED3_F16", "v_med3_f16">;
 defm V_MINMAX_NUM_F32     : VOP3_Realtriple_with_name_gfx12<0x268, "V_MINMAX_F32", "v_minmax_num_f32">;
 defm V_MAXMIN_NUM_F32     : VOP3_Realtriple_with_name_gfx12<0x269, "V_MAXMIN_F32", "v_maxmin_num_f32">;
-defm V_MINMAX_NUM_F16     : VOP3_Realtriple_with_name_gfx12<0x26a, "V_MINMAX_F16", "v_minmax_num_f16">;
-defm V_MAXMIN_NUM_F16     : VOP3_Realtriple_with_name_gfx12<0x26b, "V_MAXMIN_F16", "v_maxmin_num_f16">;
+defm V_MINMAX_NUM_F16     : VOP3_Realtriple_t16_and_fake16_gfx12<0x26a, "v_minmax_num_f16", "V_MINMAX_F16", "v_minmax_f16">;
+defm V_MAXMIN_NUM_F16     : VOP3_Realtriple_t16_and_fake16_gfx12<0x26b, "v_maxmin_num_f16", "V_MAXMIN_F16", "v_maxmin_f16">;
 defm V_MINIMUMMAXIMUM_F32 : VOP3Only_Realtriple_gfx12<0x26c>;
 defm V_MAXIMUMMINIMUM_F32 : VOP3Only_Realtriple_gfx12<0x26d>;
 defm V_MINIMUMMAXIMUM_F16 : VOP3Only_Realtriple_t16_gfx12<0x26e>;
@@ -1708,7 +1708,7 @@ defm V_PERM_B32            : VOP3_Realtriple_gfx11_gfx12<0x244>;
 defm V_XAD_U32             : VOP3_Realtriple_gfx11_gfx12<0x245>;
 defm V_LSHL_ADD_U32        : VOP3_Realtriple_gfx11_gfx12<0x246>;
 defm V_ADD_LSHL_U32        : VOP3_Realtriple_gfx11_gfx12<0x247>;
-defm V_FMA_F16             : VOP3_Realtriple_with_name_gfx11_gfx12<0x248, "V_FMA_F16_gfx9", "v_fma_f16">;
+defm V_FMA_F16             : VOP3_Realtriple_t16_and_fake16_gfx11_gfx12<0x248, "v_fma_f16", "V_FMA_F16_gfx9">;
 defm V_MIN3_F16            : VOP3Only_Realtriple_t16_and_fake16_gfx11<0x249, "v_min3_f16">;
 defm V_MIN3_I16            : VOP3_Realtriple_t16_and_fake16_gfx11_gfx12<0x24a, "v_min3_i16">;
 defm V_MIN3_U16            : VOP3_Realtriple_t16_and_fake16_gfx11_gfx12<0x24b, "v_min3_u16">;
@@ -1730,8 +1730,8 @@ defm V_PERMLANE16_B32      : VOP3_Real_Base_gfx11_gfx12<0x25b>;
 defm V_PERMLANEX16_B32     : VOP3_Real_Base_gfx11_gfx12<0x25c>;
 defm V_MAXMIN_F32          : VOP3_Realtriple_gfx11<0x25e>;
 defm V_MINMAX_F32          : VOP3_Realtriple_gfx11<0x25f>;
-defm V_MAXMIN_F16          : VOP3_Realtriple_gfx11<0x260>;
-defm V_MINMAX_F16          : VOP3_Realtriple_gfx11<0x261>;
+defm V_MAXMIN_F16          : VOP3_Realtriple_t16_and_fake16_gfx11<0x260, "v_maxmin_f16">;
+defm V_MINMAX_F16          : VOP3_Realtriple_t16_and_fake16_gfx11<0x261, "v_minmax_f16">;
 defm V_MAXMIN_U32          : VOP3_Realtriple_gfx11_gfx12<0x262>;
 defm V_MINMAX_U32          : VOP3_Realtriple_gfx11_gfx12<0x263>;
 defm V_MAXMIN_I32          : VOP3_Realtriple_gfx11_gfx12<0x264>;
diff --git a/llvm/lib/Target/AMDGPU/VOPCInstructions.td b/llvm/lib/Target/AMDGPU/VOPCInstructions.td
index 9bf043e..8589d59 100644
--- a/llvm/lib/Target/AMDGPU/VOPCInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOPCInstructions.td
@@ -1130,20 +1130,20 @@ defm : ICMP_Pattern <COND_SGE, V_CMP_GE_I64_e64, i64>;
 defm : ICMP_Pattern <COND_SLT, V_CMP_LT_I64_e64, i64>;
 defm : ICMP_Pattern <COND_SLE, V_CMP_LE_I64_e64, i64>;
 
-let OtherPredicates = [HasTrue16BitInsts] in {
-defm : ICMP_Pattern <COND_EQ, V_CMP_EQ_U16_t16_e64, i16>;
-defm : ICMP_Pattern <COND_NE, V_CMP_NE_U16_t16_e64, i16>;
-defm : ICMP_Pattern <COND_UGT, V_CMP_GT_U16_t16_e64, i16>;
-defm : ICMP_Pattern <COND_UGE, V_CMP_GE_U16_t16_e64, i16>;
-defm : ICMP_Pattern <COND_ULT, V_CMP_LT_U16_t16_e64, i16>;
-defm : ICMP_Pattern <COND_ULE, V_CMP_LE_U16_t16_e64, i16>;
-defm : ICMP_Pattern <COND_SGT, V_CMP_GT_I16_t16_e64, i16>;
-defm : ICMP_Pattern <COND_SGE, V_CMP_GE_I16_t16_e64, i16>;
-defm : ICMP_Pattern <COND_SLT, V_CMP_LT_I16_t16_e64, i16>;
-defm : ICMP_Pattern <COND_SLE, V_CMP_LE_I16_t16_e64, i16>;
-} // End OtherPredicates = [HasTrue16BitInsts]
-
-let OtherPredicates = [NotHasTrue16BitInsts] in {
+let True16Predicate = UseFakeTrue16Insts in {
+defm : ICMP_Pattern <COND_EQ, V_CMP_EQ_U16_fake16_e64, i16>;
+defm : ICMP_Pattern <COND_NE, V_CMP_NE_U16_fake16_e64, i16>;
+defm : ICMP_Pattern <COND_UGT, V_CMP_GT_U16_fake16_e64, i16>;
+defm : ICMP_Pattern <COND_UGE, V_CMP_GE_U16_fake16_e64, i16>;
+defm : ICMP_Pattern <COND_ULT, V_CMP_LT_U16_fake16_e64, i16>;
+defm : ICMP_Pattern <COND_ULE, V_CMP_LE_U16_fake16_e64, i16>;
+defm : ICMP_Pattern <COND_SGT, V_CMP_GT_I16_fake16_e64, i16>;
+defm : ICMP_Pattern <COND_SGE, V_CMP_GE_I16_fake16_e64, i16>;
+defm : ICMP_Pattern <COND_SLT, V_CMP_LT_I16_fake16_e64, i16>;
+defm : ICMP_Pattern <COND_SLE, V_CMP_LE_I16_fake16_e64, i16>;
+} // End True16Predicate = UseFakeTrue16Insts
+
+let True16Predicate = NotHasTrue16BitInsts in {
 defm : ICMP_Pattern <COND_EQ, V_CMP_EQ_U16_e64, i16>;
 defm : ICMP_Pattern <COND_NE, V_CMP_NE_U16_e64, i16>;
 defm : ICMP_Pattern <COND_UGT, V_CMP_GT_U16_e64, i16>;
@@ -1154,7 +1154,7 @@ defm : ICMP_Pattern <COND_SGT, V_CMP_GT_I16_e64, i16>;
 defm : ICMP_Pattern <COND_SGE, V_CMP_GE_I16_e64, i16>;
 defm : ICMP_Pattern <COND_SLT, V_CMP_LT_I16_e64, i16>;
 defm : ICMP_Pattern <COND_SLE, V_CMP_LE_I16_e64, i16>;
-} // End OtherPredicates = [NotHasTrue16BitInsts]
+} // End True16Predicate = NotHasTrue16BitInsts
 
 multiclass FCMP_Pattern <PatFrags cond, Instruction inst, ValueType vt> {
   let WaveSizePredicate = isWave64 in
@@ -1215,25 +1215,25 @@ defm : FCMP_Pattern <COND_UGE, V_CMP_NLT_F64_e64, f64>;
 defm : FCMP_Pattern <COND_ULT, V_CMP_NGE_F64_e64, f64>;
 defm : FCMP_Pattern <COND_ULE, V_CMP_NGT_F64_e64, f64>;
 
-let OtherPredicates = [HasTrue16BitInsts] in {
-defm : FCMP_Pattern <COND_O, V_CMP_O_F16_t16_e64, f16>;
-defm : FCMP_Pattern <COND_UO, V_CMP_U_F16_t16_e64, f16>;
-defm : FCMP_Pattern <COND_OEQ, V_CMP_EQ_F16_t16_e64, f16>;
-defm : FCMP_Pattern <COND_ONE, V_CMP_NEQ_F16_t16_e64, f16>;
-defm : FCMP_Pattern <COND_OGT, V_CMP_GT_F16_t16_e64, f16>;
-defm : FCMP_Pattern <COND_OGE, V_CMP_GE_F16_t16_e64, f16>;
-defm : FCMP_Pattern <COND_OLT, V_CMP_LT_F16_t16_e64, f16>;
-defm : FCMP_Pattern <COND_OLE, V_CMP_LE_F16_t16_e64, f16>;
-
-defm : FCMP_Pattern <COND_UEQ, V_CMP_NLG_F16_t16_e64, f16>;
-defm : FCMP_Pattern <COND_UNE, V_CMP_NEQ_F16_t16_e64, f16>;
-defm : FCMP_Pattern <COND_UGT, V_CMP_NLE_F16_t16_e64, f16>;
-defm : FCMP_Pattern <COND_UGE, V_CMP_NLT_F16_t16_e64, f16>;
-defm : FCMP_Pattern <COND_ULT, V_CMP_NGE_F16_t16_e64, f16>;
-defm : FCMP_Pattern <COND_ULE, V_CMP_NGT_F16_t16_e64, f16>;
-} // End OtherPredicates = [HasTrue16BitInsts]
-
-let OtherPredicates = [NotHasTrue16BitInsts] in {
+let True16Predicate = UseFakeTrue16Insts in {
+defm : FCMP_Pattern <COND_O, V_CMP_O_F16_fake16_e64, f16>;
+defm : FCMP_Pattern <COND_UO, V_CMP_U_F16_fake16_e64, f16>;
+defm : FCMP_Pattern <COND_OEQ, V_CMP_EQ_F16_fake16_e64, f16>;
+defm : FCMP_Pattern <COND_ONE, V_CMP_NEQ_F16_fake16_e64, f16>;
+defm : FCMP_Pattern <COND_OGT, V_CMP_GT_F16_fake16_e64, f16>;
+defm : FCMP_Pattern <COND_OGE, V_CMP_GE_F16_fake16_e64, f16>;
+defm : FCMP_Pattern <COND_OLT, V_CMP_LT_F16_fake16_e64, f16>;
+defm : FCMP_Pattern <COND_OLE, V_CMP_LE_F16_fake16_e64, f16>;
+
+defm : FCMP_Pattern <COND_UEQ, V_CMP_NLG_F16_fake16_e64, f16>;
+defm : FCMP_Pattern <COND_UNE, V_CMP_NEQ_F16_fake16_e64, f16>;
+defm : FCMP_Pattern <COND_UGT, V_CMP_NLE_F16_fake16_e64, f16>;
+defm : FCMP_Pattern <COND_UGE, V_CMP_NLT_F16_fake16_e64, f16>;
+defm : FCMP_Pattern <COND_ULT, V_CMP_NGE_F16_fake16_e64, f16>;
+defm : FCMP_Pattern <COND_ULE, V_CMP_NGT_F16_fake16_e64, f16>;
+} // End True16Predicate = UseFakeTrue16Insts
+
+let True16Predicate = NotHasTrue16BitInsts in {
 defm : FCMP_Pattern <COND_O, V_CMP_O_F16_e64, f16>;
 defm : FCMP_Pattern <COND_UO, V_CMP_U_F16_e64, f16>;
 defm : FCMP_Pattern <COND_OEQ, V_CMP_EQ_F16_e64, f16>;
@@ -1249,7 +1249,7 @@ defm : FCMP_Pattern <COND_UGT, V_CMP_NLE_F16_e64, f16>;
 defm : FCMP_Pattern <COND_UGE, V_CMP_NLT_F16_e64, f16>;
 defm : FCMP_Pattern <COND_ULT, V_CMP_NGE_F16_e64, f16>;
 defm : FCMP_Pattern <COND_ULE, V_CMP_NGT_F16_e64, f16>;
-} // End OtherPredicates = [NotHasTrue16BitInsts]
+} // End True16Predicate = NotHasTrue16BitInsts
 
 //===----------------------------------------------------------------------===//
 // DPP Encodings
@@ -1707,23 +1707,6 @@ multiclass VOPCX_Real_t16_gfx11_gfx12<bits<9> op, string asm_name,
   VOPCX_Real_t16<GFX11Gen, op, asm_name, OpName, pseudo_mnemonic>,
   VOPCX_Real_t16<GFX12Gen, op, asm_name, OpName, pseudo_mnemonic>;
 
-defm V_CMP_F_F16_t16      : VOPC_Real_t16_gfx11<0x000, "v_cmp_f_f16">;
-defm V_CMP_LT_F16_t16     : VOPC_Real_t16_gfx11_gfx12<0x001, "v_cmp_lt_f16">;
-defm V_CMP_EQ_F16_t16     : VOPC_Real_t16_gfx11_gfx12<0x002, "v_cmp_eq_f16">;
-defm V_CMP_LE_F16_t16     : VOPC_Real_t16_gfx11_gfx12<0x003, "v_cmp_le_f16">;
-defm V_CMP_GT_F16_t16     : VOPC_Real_t16_gfx11_gfx12<0x004, "v_cmp_gt_f16">;
-defm V_CMP_LG_F16_t16     : VOPC_Real_t16_gfx11_gfx12<0x005, "v_cmp_lg_f16">;
-defm V_CMP_GE_F16_t16     : VOPC_Real_t16_gfx11_gfx12<0x006, "v_cmp_ge_f16">;
-defm V_CMP_O_F16_t16      : VOPC_Real_t16_gfx11_gfx12<0x007, "v_cmp_o_f16">;
-defm V_CMP_U_F16_t16      : VOPC_Real_t16_gfx11_gfx12<0x008, "v_cmp_u_f16">;
-defm V_CMP_NGE_F16_t16    : VOPC_Real_t16_gfx11_gfx12<0x009, "v_cmp_nge_f16">;
-defm V_CMP_NLG_F16_t16    : VOPC_Real_t16_gfx11_gfx12<0x00a, "v_cmp_nlg_f16">;
-defm V_CMP_NGT_F16_t16    : VOPC_Real_t16_gfx11_gfx12<0x00b, "v_cmp_ngt_f16">;
-defm V_CMP_NLE_F16_t16    : VOPC_Real_t16_gfx11_gfx12<0x00c, "v_cmp_nle_f16">;
-defm V_CMP_NEQ_F16_t16    : VOPC_Real_t16_gfx11_gfx12<0x00d, "v_cmp_neq_f16">;
-defm V_CMP_NLT_F16_t16    : VOPC_Real_t16_gfx11_gfx12<0x00e, "v_cmp_nlt_f16">;
-defm V_CMP_T_F16_t16      : VOPC_Real_t16_gfx11<0x00f, "v_cmp_t_f16", "V_CMP_TRU_F16_t16", "v_cmp_tru_f16">;
-
 defm V_CMP_F_F16_fake16      : VOPC_Real_t16_gfx11<0x000, "v_cmp_f_f16">;
 defm V_CMP_LT_F16_fake16     : VOPC_Real_t16_gfx11_gfx12<0x001, "v_cmp_lt_f16">;
 defm V_CMP_EQ_F16_fake16     : VOPC_Real_t16_gfx11_gfx12<0x002, "v_cmp_eq_f16">;
@@ -1759,19 +1742,6 @@ defm V_CMP_NLT_F32    : VOPC_Real_gfx11_gfx12<0x01e>;
 defm V_CMP_T_F32      : VOPC_Real_with_name_gfx11<0x01f, "V_CMP_TRU_F32", "v_cmp_t_f32">;
 defm V_CMP_T_F64      : VOPC_Real_with_name_gfx11<0x02f, "V_CMP_TRU_F64", "v_cmp_t_f64">;
 
-defm V_CMP_LT_I16_t16     : VOPC_Real_t16_gfx11_gfx12<0x031, "v_cmp_lt_i16">;
-defm V_CMP_EQ_I16_t16     : VOPC_Real_t16_gfx11_gfx12<0x032, "v_cmp_eq_i16">;
-defm V_CMP_LE_I16_t16     : VOPC_Real_t16_gfx11_gfx12<0x033, "v_cmp_le_i16">;
-defm V_CMP_GT_I16_t16     : VOPC_Real_t16_gfx11_gfx12<0x034, "v_cmp_gt_i16">;
-defm V_CMP_NE_I16_t16     : VOPC_Real_t16_gfx11_gfx12<0x035, "v_cmp_ne_i16">;
-defm V_CMP_GE_I16_t16     : VOPC_Real_t16_gfx11_gfx12<0x036, "v_cmp_ge_i16">;
-defm V_CMP_LT_U16_t16     : VOPC_Real_t16_gfx11_gfx12<0x039, "v_cmp_lt_u16">;
-defm V_CMP_EQ_U16_t16     : VOPC_Real_t16_gfx11_gfx12<0x03a, "v_cmp_eq_u16">;
-defm V_CMP_LE_U16_t16     : VOPC_Real_t16_gfx11_gfx12<0x03b, "v_cmp_le_u16">;
-defm V_CMP_GT_U16_t16     : VOPC_Real_t16_gfx11_gfx12<0x03c, "v_cmp_gt_u16">;
-defm V_CMP_NE_U16_t16     : VOPC_Real_t16_gfx11_gfx12<0x03d, "v_cmp_ne_u16">;
-defm V_CMP_GE_U16_t16     : VOPC_Real_t16_gfx11_gfx12<0x03e, "v_cmp_ge_u16">;
-
 defm V_CMP_LT_I16_fake16     : VOPC_Real_t16_gfx11_gfx12<0x031, "v_cmp_lt_i16">;
 defm V_CMP_EQ_I16_fake16     : VOPC_Real_t16_gfx11_gfx12<0x032, "v_cmp_eq_i16">;
 defm V_CMP_LE_I16_fake16     : VOPC_Real_t16_gfx11_gfx12<0x033, "v_cmp_le_i16">;
@@ -1819,28 +1789,10 @@ defm V_CMP_NE_U64     : VOPC_Real_gfx11_gfx12<0x05d>;
 defm V_CMP_GE_U64     : VOPC_Real_gfx11_gfx12<0x05e>;
 defm V_CMP_T_U64      : VOPC_Real_gfx11<0x05f>;
 
-defm V_CMP_CLASS_F16_t16 : VOPC_Real_t16_gfx11_gfx12<0x07d, "v_cmp_class_f16">;
 defm V_CMP_CLASS_F16_fake16 : VOPC_Real_t16_gfx11_gfx12<0x07d, "v_cmp_class_f16">;
 defm V_CMP_CLASS_F32     : VOPC_Real_gfx11_gfx12<0x07e>;
 defm V_CMP_CLASS_F64     : VOPC_Real_gfx11_gfx12<0x07f>;
 
-defm V_CMPX_F_F16_t16     : VOPCX_Real_t16_gfx11<0x080, "v_cmpx_f_f16">;
-defm V_CMPX_LT_F16_t16    : VOPCX_Real_t16_gfx11_gfx12<0x081, "v_cmpx_lt_f16">;
-defm V_CMPX_EQ_F16_t16    : VOPCX_Real_t16_gfx11_gfx12<0x082, "v_cmpx_eq_f16">;
-defm V_CMPX_LE_F16_t16    : VOPCX_Real_t16_gfx11_gfx12<0x083, "v_cmpx_le_f16">;
-defm V_CMPX_GT_F16_t16    : VOPCX_Real_t16_gfx11_gfx12<0x084, "v_cmpx_gt_f16">;
-defm V_CMPX_LG_F16_t16    : VOPCX_Real_t16_gfx11_gfx12<0x085, "v_cmpx_lg_f16">;
-defm V_CMPX_GE_F16_t16    : VOPCX_Real_t16_gfx11_gfx12<0x086, "v_cmpx_ge_f16">;
-defm V_CMPX_O_F16_t16     : VOPCX_Real_t16_gfx11_gfx12<0x087, "v_cmpx_o_f16">;
-defm V_CMPX_U_F16_t16     : VOPCX_Real_t16_gfx11_gfx12<0x088, "v_cmpx_u_f16">;
-defm V_CMPX_NGE_F16_t16   : VOPCX_Real_t16_gfx11_gfx12<0x089, "v_cmpx_nge_f16">;
-defm V_CMPX_NLG_F16_t16   : VOPCX_Real_t16_gfx11_gfx12<0x08a, "v_cmpx_nlg_f16">;
-defm V_CMPX_NGT_F16_t16   : VOPCX_Real_t16_gfx11_gfx12<0x08b, "v_cmpx_ngt_f16">;
-defm V_CMPX_NLE_F16_t16   : VOPCX_Real_t16_gfx11_gfx12<0x08c, "v_cmpx_nle_f16">;
-defm V_CMPX_NEQ_F16_t16   : VOPCX_Real_t16_gfx11_gfx12<0x08d, "v_cmpx_neq_f16">;
-defm V_CMPX_NLT_F16_t16   : VOPCX_Real_t16_gfx11_gfx12<0x08e, "v_cmpx_nlt_f16">;
-defm V_CMPX_T_F16_t16     : VOPCX_Real_with_name_gfx11<0x08f, "V_CMPX_TRU_F16_t16", "v_cmpx_t_f16", "v_cmpx_tru_f16">;
-
 defm V_CMPX_F_F16_fake16     : VOPCX_Real_t16_gfx11<0x080, "v_cmpx_f_f16">;
 defm V_CMPX_LT_F16_fake16    : VOPCX_Real_t16_gfx11_gfx12<0x081, "v_cmpx_lt_f16">;
 defm V_CMPX_EQ_F16_fake16    : VOPCX_Real_t16_gfx11_gfx12<0x082, "v_cmpx_eq_f16">;
@@ -1892,19 +1844,6 @@ defm V_CMPX_NEQ_F64   : VOPCX_Real_gfx11_gfx12<0x0ad>;
 defm V_CMPX_NLT_F64   : VOPCX_Real_gfx11_gfx12<0x0ae>;
 defm V_CMPX_T_F64     : VOPCX_Real_with_name_gfx11<0x0af, "V_CMPX_TRU_F64", "v_cmpx_t_f64">;
 
-defm V_CMPX_LT_I16_t16    : VOPCX_Real_t16_gfx11_gfx12<0x0b1, "v_cmpx_lt_i16">;
-defm V_CMPX_EQ_I16_t16    : VOPCX_Real_t16_gfx11_gfx12<0x0b2, "v_cmpx_eq_i16">;
-defm V_CMPX_LE_I16_t16    : VOPCX_Real_t16_gfx11_gfx12<0x0b3, "v_cmpx_le_i16">;
-defm V_CMPX_GT_I16_t16    : VOPCX_Real_t16_gfx11_gfx12<0x0b4, "v_cmpx_gt_i16">;
-defm V_CMPX_NE_I16_t16    : VOPCX_Real_t16_gfx11_gfx12<0x0b5, "v_cmpx_ne_i16">;
-defm V_CMPX_GE_I16_t16    : VOPCX_Real_t16_gfx11_gfx12<0x0b6, "v_cmpx_ge_i16">;
-defm V_CMPX_LT_U16_t16    : VOPCX_Real_t16_gfx11_gfx12<0x0b9, "v_cmpx_lt_u16">;
-defm V_CMPX_EQ_U16_t16    : VOPCX_Real_t16_gfx11_gfx12<0x0ba, "v_cmpx_eq_u16">;
-defm V_CMPX_LE_U16_t16    : VOPCX_Real_t16_gfx11_gfx12<0x0bb, "v_cmpx_le_u16">;
-defm V_CMPX_GT_U16_t16    : VOPCX_Real_t16_gfx11_gfx12<0x0bc, "v_cmpx_gt_u16">;
-defm V_CMPX_NE_U16_t16    : VOPCX_Real_t16_gfx11_gfx12<0x0bd, "v_cmpx_ne_u16">;
-defm V_CMPX_GE_U16_t16    : VOPCX_Real_t16_gfx11_gfx12<0x0be, "v_cmpx_ge_u16">;
-
 defm V_CMPX_LT_I16_fake16    : VOPCX_Real_t16_gfx11_gfx12<0x0b1, "v_cmpx_lt_i16">;
 defm V_CMPX_EQ_I16_fake16    : VOPCX_Real_t16_gfx11_gfx12<0x0b2, "v_cmpx_eq_i16">;
 defm V_CMPX_LE_I16_fake16    : VOPCX_Real_t16_gfx11_gfx12<0x0b3, "v_cmpx_le_i16">;
@@ -1951,7 +1890,6 @@ defm V_CMPX_GT_U64    : VOPCX_Real_gfx11_gfx12<0x0dc>;
 defm V_CMPX_NE_U64    : VOPCX_Real_gfx11_gfx12<0x0dd>;
 defm V_CMPX_GE_U64    : VOPCX_Real_gfx11_gfx12<0x0de>;
 defm V_CMPX_T_U64     : VOPCX_Real_gfx11<0x0df>;
-defm V_CMPX_CLASS_F16_t16 : VOPCX_Real_t16_gfx11_gfx12<0x0fd, "v_cmpx_class_f16">;
 defm V_CMPX_CLASS_F16_fake16 : VOPCX_Real_t16_gfx11_gfx12<0x0fd, "v_cmpx_class_f16">;
 defm V_CMPX_CLASS_F32     : VOPCX_Real_gfx11_gfx12<0x0fe>;
 defm V_CMPX_CLASS_F64     : VOPCX_Real_gfx11_gfx12<0x0ff>;
diff --git a/llvm/lib/Target/AMDGPU/VOPInstructions.td b/llvm/lib/Target/AMDGPU/VOPInstructions.td
index d236907..930ed9a 100644
--- a/llvm/lib/Target/AMDGPU/VOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOPInstructions.td
@@ -1909,8 +1909,8 @@ multiclass VOP3_Realtriple_t16_gfx11<bits<10> op, string asmName, string opName
 
 multiclass VOP3_Realtriple_t16_and_fake16_gfx11<bits<10> op, string asmName, string opName = NAME,
                                                 string pseudo_mnemonic = "", bit isSingle = 0> {
-  defm _t16: VOP3_Realtriple_t16_gfx11<op, opName#"_t16", asmName, pseudo_mnemonic, isSingle>;
-  defm _fake16: VOP3_Realtriple_t16_gfx11<op, opName#"_fake16", asmName, pseudo_mnemonic, isSingle>;
+  defm _t16: VOP3_Realtriple_t16_gfx11<op, asmName, opName#"_t16", pseudo_mnemonic, isSingle>;
+  defm _fake16: VOP3_Realtriple_t16_gfx11<op, asmName, opName#"_fake16", pseudo_mnemonic, isSingle>;
 }
 
 multiclass VOP3Only_Realtriple_t16_gfx11<bits<10> op, string asmName,
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
index 5ec2d83..2e517c2 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -806,7 +806,9 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
       setOperationAction(ISD::BITCAST, MVT::bf16, Custom);
   } else {
     setOperationAction(ISD::BF16_TO_FP, MVT::f32, Expand);
+    setOperationAction(ISD::BF16_TO_FP, MVT::f64, Expand);
     setOperationAction(ISD::FP_TO_BF16, MVT::f32, Custom);
+    setOperationAction(ISD::FP_TO_BF16, MVT::f64, Custom);
   }
 
   for (MVT VT : MVT::fixedlen_vector_valuetypes()) {
diff --git a/llvm/lib/Target/ARM/ARMInstrInfo.td b/llvm/lib/Target/ARM/ARMInstrInfo.td
index c67177c..009b60c 100644
--- a/llvm/lib/Target/ARM/ARMInstrInfo.td
+++ b/llvm/lib/Target/ARM/ARMInstrInfo.td
@@ -3320,7 +3320,7 @@ def STRH_preidx: ARMPseudoInst<(outs GPR:$Rn_wb),
 }
 
 
-
+let mayStore = 1, hasSideEffects = 0 in {
 def STRH_PRE  : AI3ldstidx<0b1011, 0, 1, (outs GPR:$Rn_wb),
                            (ins GPR:$Rt, addrmode3_pre:$addr), IndexModePre,
                            StMiscFrm, IIC_iStore_bh_ru,
@@ -3352,6 +3352,7 @@ def STRH_POST : AI3ldstidx<0b1011, 0, 0, (outs GPR:$Rn_wb),
   let Inst{3-0}   = offset{3-0};    // imm3_0/Rm
   let DecoderMethod = "DecodeAddrMode3Instruction";
 }
+} // mayStore = 1, hasSideEffects = 0
 
 let mayStore = 1, hasSideEffects = 0, hasExtraSrcRegAllocReq = 1 in {
 def STRD_PRE : AI3ldstidx<0b1111, 0, 1, (outs GPR:$Rn_wb),
diff --git a/llvm/lib/Target/ARM/ARMSystemRegister.td b/llvm/lib/Target/ARM/ARMSystemRegister.td
index c03db15..3afc410 100644
--- a/llvm/lib/Target/ARM/ARMSystemRegister.td
+++ b/llvm/lib/Target/ARM/ARMSystemRegister.td
@@ -19,17 +19,13 @@ class MClassSysReg<bits<1> UniqMask1,
                    bits<1> UniqMask2,
                    bits<1> UniqMask3,
                    bits<12> Enc12,
-                   string name> : SearchableTable {
-  let SearchableFields = ["Name", "M1Encoding12", "M2M3Encoding8", "Encoding"];
+                   string name> {
   string Name;
   bits<13> M1Encoding12;
   bits<10> M2M3Encoding8;
   bits<12> Encoding;
 
   let Name = name;
-  let EnumValueField = "M1Encoding12";
-  let EnumValueField = "M2M3Encoding8";
-  let EnumValueField = "Encoding";
 
   let M1Encoding12{12}    = UniqMask1;
   let M1Encoding12{11-00} = Enc12;
@@ -41,6 +37,27 @@ class MClassSysReg<bits<1> UniqMask1,
   code Requires           = [{ {} }];
 }
 
+def MClassSysRegsList : GenericTable {
+  let FilterClass = "MClassSysReg";
+  let Fields = ["Name", "M1Encoding12", "M2M3Encoding8", "Encoding",
+                "Requires"];
+}
+
+def lookupMClassSysRegByName : SearchIndex {
+  let Table = MClassSysRegsList;
+  let Key = ["Name"];
+}
+
+def lookupMClassSysRegByM1Encoding12 : SearchIndex {
+  let Table = MClassSysRegsList;
+  let Key = ["M1Encoding12"];
+}
+
+def lookupMClassSysRegByM2M3Encoding8 : SearchIndex {
+  let Table = MClassSysRegsList;
+  let Key = ["M2M3Encoding8"];
+}
+
 // [|i|e|x]apsr_nzcvq has alias [|i|e|x]apsr.
 //                 Mask1 Mask2 Mask3 Enc12, Name
 let Requires = [{ {ARM::FeatureDSP} }] in {
@@ -127,15 +144,29 @@ def : MClassSysReg<0,    0,    1,    0x8a7, "pac_key_u_3_ns">;
 
 // Banked Registers
 //
-class BankedReg<string name,  bits<8> enc>
-               : SearchableTable {
+class BankedReg<string name,  bits<8> enc> {
   string Name;
   bits<8> Encoding;
   let Name = name;
   let Encoding = enc;
-  let SearchableFields = ["Name", "Encoding"];
 }
 
+def BankedRegsList : GenericTable {
+  let FilterClass = "BankedReg";
+  let Fields = ["Name", "Encoding"];
+}
+
+def lookupBankedRegByName : SearchIndex {
+  let Table = BankedRegsList;
+  let Key = ["Name"];
+}
+
+def lookupBankedRegByEncoding : SearchIndex {
+  let Table = BankedRegsList;
+  let Key = ["Encoding"];
+}
+
+
 // The values here come from B9.2.3 of the ARM ARM, where bits 4-0 are SysM
 // and bit 5 is R.
 def : BankedReg<"r8_usr",   0x00>;
diff --git a/llvm/lib/Target/ARM/MVEGatherScatterLowering.cpp b/llvm/lib/Target/ARM/MVEGatherScatterLowering.cpp
index ab3a445..7efd298 100644
--- a/llvm/lib/Target/ARM/MVEGatherScatterLowering.cpp
+++ b/llvm/lib/Target/ARM/MVEGatherScatterLowering.cpp
@@ -752,7 +752,7 @@ Instruction *MVEGatherScatterLowering::tryCreateIncrementingGatScat(
   // The gep was in charge of making sure the offsets are scaled correctly
   // - calculate that factor so it can be applied by hand
   int TypeScale =
-      computeScale(DL->getTypeSizeInBits(GEP->getOperand(0)->getType()),
+      computeScale(DL->getTypeSizeInBits(GEP->getSourceElementType()),
                    DL->getTypeSizeInBits(GEP->getType()) /
                        cast<FixedVectorType>(GEP->getType())->getNumElements());
   if (TypeScale == -1)
diff --git a/llvm/lib/Target/ARM/Utils/ARMBaseInfo.cpp b/llvm/lib/Target/ARM/Utils/ARMBaseInfo.cpp
index 494c67d..e76a70b 100644
--- a/llvm/lib/Target/ARM/Utils/ARMBaseInfo.cpp
+++ b/llvm/lib/Target/ARM/Utils/ARMBaseInfo.cpp
@@ -62,13 +62,13 @@ const MClassSysReg *lookupMClassSysRegBy8bitSYSmValue(unsigned SYSm) {
   return ARMSysReg::lookupMClassSysRegByM2M3Encoding8((1<<8)|(SYSm & 0xFF));
 }
 
-#define GET_MCLASSSYSREG_IMPL
+#define GET_MClassSysRegsList_IMPL
 #include "ARMGenSystemRegister.inc"
 
 } // end namespace ARMSysReg
 
 namespace ARMBankedReg {
-#define GET_BANKEDREG_IMPL
+#define GET_BankedRegsList_IMPL
 #include "ARMGenSystemRegister.inc"
 } // end namespce ARMSysReg
 } // end namespace llvm
diff --git a/llvm/lib/Target/ARM/Utils/ARMBaseInfo.h b/llvm/lib/Target/ARM/Utils/ARMBaseInfo.h
index 5562572..dc4f811 100644
--- a/llvm/lib/Target/ARM/Utils/ARMBaseInfo.h
+++ b/llvm/lib/Target/ARM/Utils/ARMBaseInfo.h
@@ -206,8 +206,8 @@ namespace ARMSysReg {
     }
   };
 
-  #define GET_MCLASSSYSREG_DECL
-  #include "ARMGenSystemRegister.inc"
+#define GET_MClassSysRegsList_DECL
+#include "ARMGenSystemRegister.inc"
 
   // lookup system register using 12-bit SYSm value.
   // Note: the search is uniqued using M1 mask
@@ -228,8 +228,8 @@ namespace ARMBankedReg {
     const char *Name;
     uint16_t Encoding;
   };
-  #define GET_BANKEDREG_DECL
-  #include "ARMGenSystemRegister.inc"
+#define GET_BankedRegsList_DECL
+#include "ARMGenSystemRegister.inc"
 } // end namespace ARMBankedReg
 
 } // end namespace llvm
diff --git a/llvm/lib/Target/AVR/AVRDevices.td b/llvm/lib/Target/AVR/AVRDevices.td
index 5eca92ab..56147bb 100644
--- a/llvm/lib/Target/AVR/AVRDevices.td
+++ b/llvm/lib/Target/AVR/AVRDevices.td
@@ -60,6 +60,18 @@ def FeatureSmallStack
                        "The device has an 8-bit "
                        "stack pointer">;
 
+// The device potentially requires emitting rjmp that wraps across the flash
+// boundary.
+//
+// We enable this for devices that have exactly 8 kB of flash memory and don't
+// support the `jmp` instruction - with this feature enabled, we try to convert
+// out-of-bounds relative jumps into in-bounds by wrapping the offset, e.g.
+// `rjmp +5000` becomes `rjmp -3192`.
+def FeatureWrappingRjmp
+    : SubtargetFeature<"wrappingrjmp", "HasWrappingRjmp", "true",
+                       "The device potentially requires emitting rjmp that "
+                       "wraps across the flash boundary">;
+
 // The device supports the 16-bit GPR pair MOVW instruction.
 def FeatureMOVW : SubtargetFeature<"movw", "HasMOVW", "true",
                                    "The device supports the 16-bit MOVW "
@@ -274,11 +286,11 @@ def : Device<"at86rf401", FamilyAVR2, ELFArchAVR25, [FeatureMOVW, FeatureLPMX]>;
 def : Device<"at90s4414", FamilyAVR2, ELFArchAVR2, [FeatureSmallStack]>;
 def : Device<"at90s4433", FamilyAVR2, ELFArchAVR2, [FeatureSmallStack]>;
 def : Device<"at90s4434", FamilyAVR2, ELFArchAVR2, [FeatureSmallStack]>;
-def : Device<"at90s8515", FamilyAVR2, ELFArchAVR2>;
-def : Device<"at90c8534", FamilyAVR2, ELFArchAVR2>;
-def : Device<"at90s8535", FamilyAVR2, ELFArchAVR2>;
-def : Device<"ata5272", FamilyAVR25, ELFArchAVR25>;
-def : Device<"ata6616c", FamilyAVR25, ELFArchAVR25>;
+def : Device<"at90s8515", FamilyAVR2, ELFArchAVR2, [FeatureWrappingRjmp]>;
+def : Device<"at90c8534", FamilyAVR2, ELFArchAVR2, [FeatureWrappingRjmp]>;
+def : Device<"at90s8535", FamilyAVR2, ELFArchAVR2, [FeatureWrappingRjmp]>;
+def : Device<"ata5272", FamilyAVR25, ELFArchAVR25, [FeatureWrappingRjmp]>;
+def : Device<"ata6616c", FamilyAVR25, ELFArchAVR25, [FeatureWrappingRjmp]>;
 def : Device<"attiny13", FamilyAVR25, ELFArchAVR25, [FeatureSmallStack]>;
 def : Device<"attiny13a", FamilyAVR25, ELFArchAVR25, [FeatureSmallStack]>;
 def : Device<"attiny2313", FamilyAVR25, ELFArchAVR25, [FeatureSmallStack]>;
@@ -288,24 +300,24 @@ def : Device<"attiny24a", FamilyAVR25, ELFArchAVR25, [FeatureSmallStack]>;
 def : Device<"attiny4313", FamilyAVR25, ELFArchAVR25>;
 def : Device<"attiny44", FamilyAVR25, ELFArchAVR25>;
 def : Device<"attiny44a", FamilyAVR25, ELFArchAVR25>;
-def : Device<"attiny84", FamilyAVR25, ELFArchAVR25>;
-def : Device<"attiny84a", FamilyAVR25, ELFArchAVR25>;
+def : Device<"attiny84", FamilyAVR25, ELFArchAVR25, [FeatureWrappingRjmp]>;
+def : Device<"attiny84a", FamilyAVR25, ELFArchAVR25, [FeatureWrappingRjmp]>;
 def : Device<"attiny25", FamilyAVR25, ELFArchAVR25, [FeatureSmallStack]>;
 def : Device<"attiny45", FamilyAVR25, ELFArchAVR25>;
-def : Device<"attiny85", FamilyAVR25, ELFArchAVR25>;
+def : Device<"attiny85", FamilyAVR25, ELFArchAVR25, [FeatureWrappingRjmp]>;
 def : Device<"attiny261", FamilyAVR25, ELFArchAVR25, [FeatureSmallStack]>;
 def : Device<"attiny261a", FamilyAVR25, ELFArchAVR25, [FeatureSmallStack]>;
 def : Device<"attiny441", FamilyAVR25, ELFArchAVR25>;
 def : Device<"attiny461", FamilyAVR25, ELFArchAVR25>;
 def : Device<"attiny461a", FamilyAVR25, ELFArchAVR25>;
-def : Device<"attiny841", FamilyAVR25, ELFArchAVR25>;
-def : Device<"attiny861", FamilyAVR25, ELFArchAVR25>;
-def : Device<"attiny861a", FamilyAVR25, ELFArchAVR25>;
-def : Device<"attiny87", FamilyAVR25, ELFArchAVR25>;
+def : Device<"attiny841", FamilyAVR25, ELFArchAVR25, [FeatureWrappingRjmp]>;
+def : Device<"attiny861", FamilyAVR25, ELFArchAVR25, [FeatureWrappingRjmp]>;
+def : Device<"attiny861a", FamilyAVR25, ELFArchAVR25, [FeatureWrappingRjmp]>;
+def : Device<"attiny87", FamilyAVR25, ELFArchAVR25, [FeatureWrappingRjmp]>;
 def : Device<"attiny43u", FamilyAVR25, ELFArchAVR25>;
 def : Device<"attiny48", FamilyAVR25, ELFArchAVR25>;
-def : Device<"attiny88", FamilyAVR25, ELFArchAVR25>;
-def : Device<"attiny828", FamilyAVR25, ELFArchAVR25>;
+def : Device<"attiny88", FamilyAVR25, ELFArchAVR25, [FeatureWrappingRjmp]>;
+def : Device<"attiny828", FamilyAVR25, ELFArchAVR25, [FeatureWrappingRjmp]>;
 def : Device<"at43usb355", FamilyAVR3, ELFArchAVR3>;
 def : Device<"at76c711", FamilyAVR3, ELFArchAVR3>;
 def : Device<"atmega103", FamilyAVR31, ELFArchAVR31>;
@@ -321,11 +333,11 @@ def : Device<"atmega16u2", FamilyAVR35, ELFArchAVR35>;
 def : Device<"atmega32u2", FamilyAVR35, ELFArchAVR35>;
 def : Device<"attiny1634", FamilyAVR35, ELFArchAVR35>;
 def : Device<"atmega8", FamilyAVR2, ELFArchAVR4,
-             [FeatureMultiplication, FeatureMOVW, FeatureLPMX, FeatureSPM]>;
+             [FeatureMultiplication, FeatureMOVW, FeatureLPMX, FeatureSPM, FeatureWrappingRjmp]>;
 def : Device<"ata6289", FamilyAVR4, ELFArchAVR4>;
 def : Device<"atmega8a", FamilyAVR2, ELFArchAVR4,
-             [FeatureMultiplication, FeatureMOVW, FeatureLPMX, FeatureSPM]>;
-def : Device<"ata6285", FamilyAVR4, ELFArchAVR4>;
+             [FeatureMultiplication, FeatureMOVW, FeatureLPMX, FeatureSPM, FeatureWrappingRjmp]>;
+def : Device<"ata6285", FamilyAVR4, ELFArchAVR4, [FeatureWrappingRjmp]>;
 def : Device<"ata6286", FamilyAVR4, ELFArchAVR4>;
 def : Device<"ata6612c", FamilyAVR4, ELFArchAVR4>;
 def : Device<"atmega48", FamilyAVR4, ELFArchAVR4>;
@@ -339,9 +351,9 @@ def : Device<"atmega88p", FamilyAVR4, ELFArchAVR4>;
 def : Device<"atmega88pa", FamilyAVR4, ELFArchAVR4>;
 def : Device<"atmega88pb", FamilyAVR4, ELFArchAVR4>;
 def : Device<"atmega8515", FamilyAVR2, ELFArchAVR4,
-             [FeatureMultiplication, FeatureMOVW, FeatureLPMX, FeatureSPM]>;
+             [FeatureMultiplication, FeatureMOVW, FeatureLPMX, FeatureSPM, FeatureWrappingRjmp]>;
 def : Device<"atmega8535", FamilyAVR2, ELFArchAVR4,
-             [FeatureMultiplication, FeatureMOVW, FeatureLPMX, FeatureSPM]>;
+             [FeatureMultiplication, FeatureMOVW, FeatureLPMX, FeatureSPM, FeatureWrappingRjmp]>;
 def : Device<"atmega8hva", FamilyAVR4, ELFArchAVR4>;
 def : Device<"at90pwm1", FamilyAVR4, ELFArchAVR4>;
 def : Device<"at90pwm2", FamilyAVR4, ELFArchAVR4>;
diff --git a/llvm/lib/Target/AVR/MCTargetDesc/AVRAsmBackend.cpp b/llvm/lib/Target/AVR/MCTargetDesc/AVRAsmBackend.cpp
index 17c48c2..fd35f8f 100644
--- a/llvm/lib/Target/AVR/MCTargetDesc/AVRAsmBackend.cpp
+++ b/llvm/lib/Target/AVR/MCTargetDesc/AVRAsmBackend.cpp
@@ -27,16 +27,13 @@
 #include "llvm/Support/MathExtras.h"
 #include "llvm/Support/raw_ostream.h"
 
-// FIXME: we should be doing checks to make sure asm operands
-// are not out of bounds.
-
 namespace adjust {
 
 using namespace llvm;
 
 static void signed_width(unsigned Width, uint64_t Value,
                          std::string Description, const MCFixup &Fixup,
-                         MCContext *Ctx = nullptr) {
+                         MCContext *Ctx) {
   if (!isIntN(Width, Value)) {
     std::string Diagnostic = "out of range " + Description;
 
@@ -46,17 +43,13 @@ static void signed_width(unsigned Width, uint64_t Value,
     Diagnostic += " (expected an integer in the range " + std::to_string(Min) +
                   " to " + std::to_string(Max) + ")";
 
-    if (Ctx) {
-      Ctx->reportError(Fixup.getLoc(), Diagnostic);
-    } else {
-      llvm_unreachable(Diagnostic.c_str());
-    }
+    Ctx->reportError(Fixup.getLoc(), Diagnostic);
   }
 }
 
 static void unsigned_width(unsigned Width, uint64_t Value,
                            std::string Description, const MCFixup &Fixup,
-                           MCContext *Ctx = nullptr) {
+                           MCContext *Ctx) {
   if (!isUIntN(Width, Value)) {
     std::string Diagnostic = "out of range " + Description;
 
@@ -65,17 +58,13 @@ static void unsigned_width(unsigned Width, uint64_t Value,
     Diagnostic +=
         " (expected an integer in the range 0 to " + std::to_string(Max) + ")";
 
-    if (Ctx) {
-      Ctx->reportError(Fixup.getLoc(), Diagnostic);
-    } else {
-      llvm_unreachable(Diagnostic.c_str());
-    }
+    Ctx->reportError(Fixup.getLoc(), Diagnostic);
   }
 }
 
 /// Adjusts the value of a branch target before fixup application.
 static void adjustBranch(unsigned Size, const MCFixup &Fixup, uint64_t &Value,
-                         MCContext *Ctx = nullptr) {
+                         MCContext *Ctx) {
   // We have one extra bit of precision because the value is rightshifted by
   // one.
   unsigned_width(Size + 1, Value, std::string("branch target"), Fixup, Ctx);
@@ -86,13 +75,28 @@ static void adjustBranch(unsigned Size, const MCFixup &Fixup, uint64_t &Value,
 
 /// Adjusts the value of a relative branch target before fixup application.
 static void adjustRelativeBranch(unsigned Size, const MCFixup &Fixup,
-                                 uint64_t &Value, MCContext *Ctx = nullptr) {
+                                 uint64_t &Value, MCContext *Ctx) {
   // Jumps are relative to the current instruction.
   Value -= 2;
 
   // We have one extra bit of precision because the value is rightshifted by
   // one.
-  signed_width(Size + 1, Value, std::string("branch target"), Fixup, Ctx);
+  Size += 1;
+
+  if (!isIntN(Size, Value) &&
+      Ctx->getSubtargetInfo()->hasFeature(AVR::FeatureWrappingRjmp)) {
+    const int32_t FlashSize = 0x2000;
+    int32_t SignedValue = Value;
+
+    uint64_t WrappedValue = SignedValue > 0 ? (uint64_t)(Value - FlashSize)
+                                            : (uint64_t)(FlashSize + Value);
+
+    if (isIntN(Size, WrappedValue)) {
+      Value = WrappedValue;
+    }
+  }
+
+  signed_width(Size, Value, std::string("branch target"), Fixup, Ctx);
 
   // Rightshifts the value by one.
   AVR::fixups::adjustBranchTarget(Value);
@@ -105,7 +109,7 @@ static void adjustRelativeBranch(unsigned Size, const MCFixup &Fixup,
 ///
 /// Offset of 0 (so the result is left shifted by 3 bits before application).
 static void fixup_call(unsigned Size, const MCFixup &Fixup, uint64_t &Value,
-                       MCContext *Ctx = nullptr) {
+                       MCContext *Ctx) {
   adjustBranch(Size, Fixup, Value, Ctx);
 
   auto top = Value & (0xf00000 << 6);   // the top four bits
@@ -121,7 +125,7 @@ static void fixup_call(unsigned Size, const MCFixup &Fixup, uint64_t &Value,
 /// 0000 00kk kkkk k000
 /// Offset of 0 (so the result is left shifted by 3 bits before application).
 static void fixup_7_pcrel(unsigned Size, const MCFixup &Fixup, uint64_t &Value,
-                          MCContext *Ctx = nullptr) {
+                          MCContext *Ctx) {
   adjustRelativeBranch(Size, Fixup, Value, Ctx);
 
   // Because the value may be negative, we must mask out the sign bits
@@ -135,7 +139,7 @@ static void fixup_7_pcrel(unsigned Size, const MCFixup &Fixup, uint64_t &Value,
 /// 0000 kkkk kkkk kkkk
 /// Offset of 0 (so the result isn't left-shifted before application).
 static void fixup_13_pcrel(unsigned Size, const MCFixup &Fixup, uint64_t &Value,
-                           MCContext *Ctx = nullptr) {
+                           MCContext *Ctx) {
   adjustRelativeBranch(Size, Fixup, Value, Ctx);
 
   // Because the value may be negative, we must mask out the sign bits
@@ -147,8 +151,7 @@ static void fixup_13_pcrel(unsigned Size, const MCFixup &Fixup, uint64_t &Value,
 ///
 /// Resolves to:
 /// 10q0 qq10 0000 1qqq
-static void fixup_6(const MCFixup &Fixup, uint64_t &Value,
-                    MCContext *Ctx = nullptr) {
+static void fixup_6(const MCFixup &Fixup, uint64_t &Value, MCContext *Ctx) {
   unsigned_width(6, Value, std::string("immediate"), Fixup, Ctx);
 
   Value = ((Value & 0x20) << 8) | ((Value & 0x18) << 7) | (Value & 0x07);
@@ -160,7 +163,7 @@ static void fixup_6(const MCFixup &Fixup, uint64_t &Value,
 /// Resolves to:
 /// 0000 0000 kk00 kkkk
 static void fixup_6_adiw(const MCFixup &Fixup, uint64_t &Value,
-                         MCContext *Ctx = nullptr) {
+                         MCContext *Ctx) {
   unsigned_width(6, Value, std::string("immediate"), Fixup, Ctx);
 
   Value = ((Value & 0x30) << 2) | (Value & 0x0f);
@@ -170,8 +173,7 @@ static void fixup_6_adiw(const MCFixup &Fixup, uint64_t &Value,
 ///
 /// Resolves to:
 /// 0000 0000 AAAA A000
-static void fixup_port5(const MCFixup &Fixup, uint64_t &Value,
-                        MCContext *Ctx = nullptr) {
+static void fixup_port5(const MCFixup &Fixup, uint64_t &Value, MCContext *Ctx) {
   unsigned_width(5, Value, std::string("port number"), Fixup, Ctx);
 
   Value &= 0x1f;
@@ -183,8 +185,7 @@ static void fixup_port5(const MCFixup &Fixup, uint64_t &Value,
 ///
 /// Resolves to:
 /// 1011 0AAd dddd AAAA
-static void fixup_port6(const MCFixup &Fixup, uint64_t &Value,
-                        MCContext *Ctx = nullptr) {
+static void fixup_port6(const MCFixup &Fixup, uint64_t &Value, MCContext *Ctx) {
   unsigned_width(6, Value, std::string("port number"), Fixup, Ctx);
 
   Value = ((Value & 0x30) << 5) | (Value & 0x0f);
@@ -195,7 +196,7 @@ static void fixup_port6(const MCFixup &Fixup, uint64_t &Value,
 /// Resolves to:
 /// 1010 ikkk dddd kkkk
 static void fixup_lds_sts_16(const MCFixup &Fixup, uint64_t &Value,
-                             MCContext *Ctx = nullptr) {
+                             MCContext *Ctx) {
   unsigned_width(7, Value, std::string("immediate"), Fixup, Ctx);
   Value = ((Value & 0x70) << 8) | (Value & 0x0f);
 }
@@ -213,7 +214,7 @@ namespace ldi {
 /// 0000 KKKK 0000 KKKK
 /// Offset of 0 (so the result isn't left-shifted before application).
 static void fixup(unsigned Size, const MCFixup &Fixup, uint64_t &Value,
-                  MCContext *Ctx = nullptr) {
+                  MCContext *Ctx) {
   uint64_t upper = Value & 0xf0;
   uint64_t lower = Value & 0x0f;
 
@@ -223,25 +224,25 @@ static void fixup(unsigned Size, const MCFixup &Fixup, uint64_t &Value,
 static void neg(uint64_t &Value) { Value *= -1; }
 
 static void lo8(unsigned Size, const MCFixup &Fixup, uint64_t &Value,
-                MCContext *Ctx = nullptr) {
+                MCContext *Ctx) {
   Value &= 0xff;
   ldi::fixup(Size, Fixup, Value, Ctx);
 }
 
 static void hi8(unsigned Size, const MCFixup &Fixup, uint64_t &Value,
-                MCContext *Ctx = nullptr) {
+                MCContext *Ctx) {
   Value = (Value & 0xff00) >> 8;
   ldi::fixup(Size, Fixup, Value, Ctx);
 }
 
 static void hh8(unsigned Size, const MCFixup &Fixup, uint64_t &Value,
-                MCContext *Ctx = nullptr) {
+                MCContext *Ctx) {
   Value = (Value & 0xff0000) >> 16;
   ldi::fixup(Size, Fixup, Value, Ctx);
 }
 
 static void ms8(unsigned Size, const MCFixup &Fixup, uint64_t &Value,
-                MCContext *Ctx = nullptr) {
+                MCContext *Ctx) {
   Value = (Value & 0xff000000) >> 24;
   ldi::fixup(Size, Fixup, Value, Ctx);
 }
diff --git a/llvm/lib/Target/DirectX/DXIL.td b/llvm/lib/Target/DirectX/DXIL.td
index 5d865a3..62b5b70 100644
--- a/llvm/lib/Target/DirectX/DXIL.td
+++ b/llvm/lib/Target/DirectX/DXIL.td
@@ -42,8 +42,10 @@ def FloatTy : DXILOpParamType;
 def DoubleTy : DXILOpParamType;
 def ResRetHalfTy : DXILOpParamType;
 def ResRetFloatTy : DXILOpParamType;
+def ResRetDoubleTy : DXILOpParamType;
 def ResRetInt16Ty : DXILOpParamType;
 def ResRetInt32Ty : DXILOpParamType;
+def ResRetInt64Ty : DXILOpParamType;
 def HandleTy : DXILOpParamType;
 def ResBindTy : DXILOpParamType;
 def ResPropsTy : DXILOpParamType;
@@ -890,6 +892,23 @@ def SplitDouble :  DXILOp<102, splitDouble> {
   let attributes = [Attributes<DXIL1_0, [ReadNone]>];
 }
 
+def RawBufferLoad : DXILOp<139, rawBufferLoad> {
+  let Doc = "reads from a raw buffer and structured buffer";
+  // Handle, Coord0, Coord1, Mask, Alignment
+  let arguments = [HandleTy, Int32Ty, Int32Ty, Int8Ty, Int32Ty];
+  let result = OverloadTy;
+  let overloads = [
+    Overloads<DXIL1_2,
+              [ResRetHalfTy, ResRetFloatTy, ResRetInt16Ty, ResRetInt32Ty]>,
+    Overloads<DXIL1_3,
+              [
+                ResRetHalfTy, ResRetFloatTy, ResRetDoubleTy, ResRetInt16Ty,
+                ResRetInt32Ty, ResRetInt64Ty
+              ]>
+  ];
+  let stages = [Stages<DXIL1_2, [all_stages]>];
+}
+
 def Dot4AddI8Packed : DXILOp<163, dot4AddPacked> {
   let Doc = "signed dot product of 4 x i8 vectors packed into i32, with "
             "accumulate to i32";
diff --git a/llvm/lib/Target/DirectX/DXILOpBuilder.cpp b/llvm/lib/Target/DirectX/DXILOpBuilder.cpp
index 5d5bb3e..9f88ccd 100644
--- a/llvm/lib/Target/DirectX/DXILOpBuilder.cpp
+++ b/llvm/lib/Target/DirectX/DXILOpBuilder.cpp
@@ -263,10 +263,14 @@ static Type *getTypeFromOpParamType(OpParamType Kind, LLVMContext &Ctx,
     return getResRetType(Type::getHalfTy(Ctx));
   case OpParamType::ResRetFloatTy:
     return getResRetType(Type::getFloatTy(Ctx));
+  case OpParamType::ResRetDoubleTy:
+    return getResRetType(Type::getDoubleTy(Ctx));
   case OpParamType::ResRetInt16Ty:
     return getResRetType(Type::getInt16Ty(Ctx));
   case OpParamType::ResRetInt32Ty:
     return getResRetType(Type::getInt32Ty(Ctx));
+  case OpParamType::ResRetInt64Ty:
+    return getResRetType(Type::getInt64Ty(Ctx));
   case OpParamType::HandleTy:
     return getHandleType(Ctx);
   case OpParamType::ResBindTy:
diff --git a/llvm/lib/Target/DirectX/DXILOpLowering.cpp b/llvm/lib/Target/DirectX/DXILOpLowering.cpp
index 4e01dd1..f43815b 100644
--- a/llvm/lib/Target/DirectX/DXILOpLowering.cpp
+++ b/llvm/lib/Target/DirectX/DXILOpLowering.cpp
@@ -415,8 +415,16 @@ public:
         }
       }
 
-      OldResult = cast<Instruction>(
-          IRB.CreateExtractValue(Op, 0, OldResult->getName()));
+      if (OldResult->use_empty()) {
+        // Only the check bit was used, so we're done here.
+        OldResult->eraseFromParent();
+        return Error::success();
+      }
+
+      assert(OldResult->hasOneUse() &&
+             isa<ExtractValueInst>(*OldResult->user_begin()) &&
+             "Expected only use to be extract of first element");
+      OldResult = cast<Instruction>(*OldResult->user_begin());
       OldTy = ST->getElementType(0);
     }
 
@@ -534,6 +542,48 @@ public:
     });
   }
 
+  [[nodiscard]] bool lowerRawBufferLoad(Function &F) {
+    Triple TT(Triple(M.getTargetTriple()));
+    VersionTuple DXILVersion = TT.getDXILVersion();
+    const DataLayout &DL = F.getDataLayout();
+    IRBuilder<> &IRB = OpBuilder.getIRB();
+    Type *Int8Ty = IRB.getInt8Ty();
+    Type *Int32Ty = IRB.getInt32Ty();
+
+    return replaceFunction(F, [&](CallInst *CI) -> Error {
+      IRB.SetInsertPoint(CI);
+
+      Type *OldTy = cast<StructType>(CI->getType())->getElementType(0);
+      Type *ScalarTy = OldTy->getScalarType();
+      Type *NewRetTy = OpBuilder.getResRetType(ScalarTy);
+
+      Value *Handle =
+          createTmpHandleCast(CI->getArgOperand(0), OpBuilder.getHandleType());
+      Value *Index0 = CI->getArgOperand(1);
+      Value *Index1 = CI->getArgOperand(2);
+      uint64_t NumElements =
+          DL.getTypeSizeInBits(OldTy) / DL.getTypeSizeInBits(ScalarTy);
+      Value *Mask = ConstantInt::get(Int8Ty, ~(~0U << NumElements));
+      Value *Align =
+          ConstantInt::get(Int32Ty, DL.getPrefTypeAlign(ScalarTy).value());
+
+      Expected<CallInst *> OpCall =
+          DXILVersion >= VersionTuple(1, 2)
+              ? OpBuilder.tryCreateOp(OpCode::RawBufferLoad,
+                                      {Handle, Index0, Index1, Mask, Align},
+                                      CI->getName(), NewRetTy)
+              : OpBuilder.tryCreateOp(OpCode::BufferLoad,
+                                      {Handle, Index0, Index1}, CI->getName(),
+                                      NewRetTy);
+      if (Error E = OpCall.takeError())
+        return E;
+      if (Error E = replaceResRetUses(CI, *OpCall, /*HasCheckBit=*/true))
+        return E;
+
+      return Error::success();
+    });
+  }
+
   [[nodiscard]] bool lowerUpdateCounter(Function &F) {
     IRBuilder<> &IRB = OpBuilder.getIRB();
     Type *Int32Ty = IRB.getInt32Ty();
@@ -723,14 +773,14 @@ public:
         HasErrors |= lowerGetPointer(F);
         break;
       case Intrinsic::dx_resource_load_typedbuffer:
-        HasErrors |= lowerTypedBufferLoad(F, /*HasCheckBit=*/false);
-        break;
-      case Intrinsic::dx_resource_loadchecked_typedbuffer:
         HasErrors |= lowerTypedBufferLoad(F, /*HasCheckBit=*/true);
         break;
       case Intrinsic::dx_resource_store_typedbuffer:
         HasErrors |= lowerTypedBufferStore(F);
         break;
+      case Intrinsic::dx_resource_load_rawbuffer:
+        HasErrors |= lowerRawBufferLoad(F);
+        break;
       case Intrinsic::dx_resource_updatecounter:
         HasErrors |= lowerUpdateCounter(F);
         break;
diff --git a/llvm/lib/Target/DirectX/DXILResourceAccess.cpp b/llvm/lib/Target/DirectX/DXILResourceAccess.cpp
index 1ff8f09..8376249 100644
--- a/llvm/lib/Target/DirectX/DXILResourceAccess.cpp
+++ b/llvm/lib/Target/DirectX/DXILResourceAccess.cpp
@@ -30,6 +30,9 @@ static void replaceTypedBufferAccess(IntrinsicInst *II,
          "Unexpected typed buffer type");
   Type *ContainedType = HandleType->getTypeParameter(0);
 
+  Type *LoadType =
+      StructType::get(ContainedType, Type::getInt1Ty(II->getContext()));
+
   // We need the size of an element in bytes so that we can calculate the offset
   // in elements given a total offset in bytes later.
   Type *ScalarType = ContainedType->getScalarType();
@@ -81,13 +84,15 @@ static void replaceTypedBufferAccess(IntrinsicInst *II,
         // We're storing a scalar, so we need to load the current value and only
         // replace the relevant part.
         auto *Load = Builder.CreateIntrinsic(
-            ContainedType, Intrinsic::dx_resource_load_typedbuffer,
+            LoadType, Intrinsic::dx_resource_load_typedbuffer,
             {II->getOperand(0), II->getOperand(1)});
+        auto *Struct = Builder.CreateExtractValue(Load, {0});
+
         // If we have an offset from seeing a GEP earlier, use it.
         Value *IndexOp = Current.Index
                              ? Current.Index
                              : ConstantInt::get(Builder.getInt32Ty(), 0);
-        V = Builder.CreateInsertElement(Load, V, IndexOp);
+        V = Builder.CreateInsertElement(Struct, V, IndexOp);
       } else {
         llvm_unreachable("Store to typed resource has invalid type");
       }
@@ -101,8 +106,10 @@ static void replaceTypedBufferAccess(IntrinsicInst *II,
     } else if (auto *LI = dyn_cast<LoadInst>(Current.Access)) {
       IRBuilder<> Builder(LI);
       Value *V = Builder.CreateIntrinsic(
-          ContainedType, Intrinsic::dx_resource_load_typedbuffer,
+          LoadType, Intrinsic::dx_resource_load_typedbuffer,
           {II->getOperand(0), II->getOperand(1)});
+      V = Builder.CreateExtractValue(V, {0});
+
       if (Current.Index)
         V = Builder.CreateExtractElement(V, Current.Index);
 
diff --git a/llvm/lib/Target/DirectX/DXILWriter/DXILBitcodeWriter.cpp b/llvm/lib/Target/DirectX/DXILWriter/DXILBitcodeWriter.cpp
index 45aadac..be68d46 100644
--- a/llvm/lib/Target/DirectX/DXILWriter/DXILBitcodeWriter.cpp
+++ b/llvm/lib/Target/DirectX/DXILWriter/DXILBitcodeWriter.cpp
@@ -749,8 +749,8 @@ uint64_t DXILBitcodeWriter::getOptimizationFlags(const Value *V) {
     if (PEO->isExact())
       Flags |= 1 << bitc::PEO_EXACT;
   } else if (const auto *FPMO = dyn_cast<FPMathOperator>(V)) {
-    if (FPMO->hasAllowReassoc())
-      Flags |= bitc::AllowReassoc;
+    if (FPMO->hasAllowReassoc() || FPMO->hasAllowContract())
+      Flags |= bitc::UnsafeAlgebra;
     if (FPMO->hasNoNaNs())
       Flags |= bitc::NoNaNs;
     if (FPMO->hasNoInfs())
@@ -759,10 +759,6 @@ uint64_t DXILBitcodeWriter::getOptimizationFlags(const Value *V) {
       Flags |= bitc::NoSignedZeros;
     if (FPMO->hasAllowReciprocal())
       Flags |= bitc::AllowReciprocal;
-    if (FPMO->hasAllowContract())
-      Flags |= bitc::AllowContract;
-    if (FPMO->hasApproxFunc())
-      Flags |= bitc::ApproxFunc;
   }
 
   return Flags;
diff --git a/llvm/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp b/llvm/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp
index 46a8ab3..991ee5b 100644
--- a/llvm/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp
@@ -1796,6 +1796,8 @@ bool PolynomialMultiplyRecognize::recognize() {
     IterCount = CV->getValue()->getZExtValue() + 1;
 
   Value *CIV = getCountIV(LoopB);
+  if (CIV == nullptr)
+    return false;
   ParsedValues PV;
   Simplifier PreSimp;
   PV.IterCount = IterCount;
diff --git a/llvm/lib/Target/LoongArch/LoongArchExpandPseudoInsts.cpp b/llvm/lib/Target/LoongArch/LoongArchExpandPseudoInsts.cpp
index 30742c7..0218934 100644
--- a/llvm/lib/Target/LoongArch/LoongArchExpandPseudoInsts.cpp
+++ b/llvm/lib/Target/LoongArch/LoongArchExpandPseudoInsts.cpp
@@ -352,11 +352,13 @@ bool LoongArchPreRAExpandPseudo::expandLoadAddressTLSLE(
     MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
     MachineBasicBlock::iterator &NextMBBI) {
   // Code Sequence:
+  // lu12i.w $rd, %le_hi20_r(sym)
+  // add.w/d $rd, $rd, $tp, %le_add_r(sym)
+  // addi.w/d $rd, $rd, %le_lo12_r(sym)
+  //
+  // Code Sequence while using the large code model:
   // lu12i.w $rd, %le_hi20(sym)
   // ori $rd, $rd, %le_lo12(sym)
-  //
-  // And additionally if generating code using the large code model:
-  //
   // lu32i.d $rd, %le64_lo20(sym)
   // lu52i.d $rd, $rd, %le64_hi12(sym)
   MachineFunction *MF = MBB.getParent();
@@ -366,20 +368,35 @@ bool LoongArchPreRAExpandPseudo::expandLoadAddressTLSLE(
   bool Large = MF->getTarget().getCodeModel() == CodeModel::Large;
   Register DestReg = MI.getOperand(0).getReg();
   Register Parts01 =
-      Large ? MF->getRegInfo().createVirtualRegister(&LoongArch::GPRRegClass)
-            : DestReg;
+      MF->getRegInfo().createVirtualRegister(&LoongArch::GPRRegClass);
   Register Part1 =
       MF->getRegInfo().createVirtualRegister(&LoongArch::GPRRegClass);
   MachineOperand &Symbol = MI.getOperand(1);
 
-  BuildMI(MBB, MBBI, DL, TII->get(LoongArch::LU12I_W), Part1)
-      .addDisp(Symbol, 0, LoongArchII::MO_LE_HI);
+  if (!Large) {
+    BuildMI(MBB, MBBI, DL, TII->get(LoongArch::LU12I_W), Part1)
+        .addDisp(Symbol, 0, LoongArchII::MO_LE_HI_R);
 
-  BuildMI(MBB, MBBI, DL, TII->get(LoongArch::ORI), Parts01)
-      .addReg(Part1, RegState::Kill)
-      .addDisp(Symbol, 0, LoongArchII::MO_LE_LO);
+    const auto &STI = MF->getSubtarget<LoongArchSubtarget>();
+    unsigned AddOp = STI.is64Bit() ? LoongArch::PseudoAddTPRel_D
+                                   : LoongArch::PseudoAddTPRel_W;
+    BuildMI(MBB, MBBI, DL, TII->get(AddOp), Parts01)
+        .addReg(Part1, RegState::Kill)
+        .addReg(LoongArch::R2)
+        .addDisp(Symbol, 0, LoongArchII::MO_LE_ADD_R);
+
+    unsigned AddiOp = STI.is64Bit() ? LoongArch::ADDI_D : LoongArch::ADDI_W;
+    BuildMI(MBB, MBBI, DL, TII->get(AddiOp), DestReg)
+        .addReg(Parts01, RegState::Kill)
+        .addDisp(Symbol, 0, LoongArchII::MO_LE_LO_R);
+  } else {
+    BuildMI(MBB, MBBI, DL, TII->get(LoongArch::LU12I_W), Part1)
+        .addDisp(Symbol, 0, LoongArchII::MO_LE_HI);
+
+    BuildMI(MBB, MBBI, DL, TII->get(LoongArch::ORI), Parts01)
+        .addReg(Part1, RegState::Kill)
+        .addDisp(Symbol, 0, LoongArchII::MO_LE_LO);
 
-  if (Large) {
     Register Parts012 =
         MF->getRegInfo().createVirtualRegister(&LoongArch::GPRRegClass);
 
diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
index 7f67def..96e6f71 100644
--- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
+++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
@@ -1866,9 +1866,17 @@ SDValue LoongArchTargetLowering::getStaticTLSAddr(GlobalAddressSDNode *N,
   // PseudoLA_*_LARGE nodes.
   SDValue Tmp = DAG.getConstant(0, DL, Ty);
   SDValue Addr = DAG.getTargetGlobalAddress(N->getGlobal(), DL, Ty, 0, 0);
-  SDValue Offset = Large
+
+  // Only IE needs an extra argument for large code model.
+  SDValue Offset = Opc == LoongArch::PseudoLA_TLS_IE_LARGE
                        ? SDValue(DAG.getMachineNode(Opc, DL, Ty, Tmp, Addr), 0)
                        : SDValue(DAG.getMachineNode(Opc, DL, Ty, Addr), 0);
+
+  // If it is LE for normal/medium code model, the add tp operation will occur
+  // during the pseudo-instruction expansion.
+  if (Opc == LoongArch::PseudoLA_TLS_LE && !Large)
+    return Offset;
+
   if (UseGOT) {
     // Mark the load instruction as invariant to enable hoisting in MachineLICM.
     MachineFunction &MF = DAG.getMachineFunction();
@@ -1989,7 +1997,7 @@ LoongArchTargetLowering::lowerGlobalTLSAddress(SDValue Op,
     //
     // This node doesn't need an extra argument for the large code model.
     return getStaticTLSAddr(N, DAG, LoongArch::PseudoLA_TLS_LE,
-                            /*UseGOT=*/false);
+                            /*UseGOT=*/false, Large);
   }
 
   return getTLSDescAddr(N, DAG,
diff --git a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.cpp b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.cpp
index 363cacf..32bc8bb 100644
--- a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.cpp
+++ b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.cpp
@@ -154,6 +154,9 @@ void LoongArchInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
                                               Register VReg) const {
   MachineFunction *MF = MBB.getParent();
   MachineFrameInfo &MFI = MF->getFrameInfo();
+  DebugLoc DL;
+  if (I != MBB.end())
+    DL = I->getDebugLoc();
 
   unsigned Opcode;
   if (LoongArch::GPRRegClass.hasSubClassEq(RC))
@@ -177,7 +180,7 @@ void LoongArchInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
       MachinePointerInfo::getFixedStack(*MF, FI), MachineMemOperand::MOLoad,
       MFI.getObjectSize(FI), MFI.getObjectAlign(FI));
 
-  BuildMI(MBB, I, DebugLoc(), get(Opcode), DstReg)
+  BuildMI(MBB, I, DL, get(Opcode), DstReg)
       .addFrameIndex(FI)
       .addImm(0)
       .addMemOperand(MMO);
@@ -406,6 +409,11 @@ bool LoongArchInstrInfo::isSchedulingBoundary(const MachineInstr &MI,
   //   lu32i.d $a1, %ie64_pc_lo20(s)
   //   lu52i.d $a1, $a1, %ie64_pc_hi12(s)
   //
+  // * pcalau12i $a0, %desc_pc_hi20(s)
+  //   addi.d $a1, $zero, %desc_pc_lo12(s)
+  //   lu32i.d $a1, %desc64_pc_lo20(s)
+  //   lu52i.d $a1, $a1, %desc64_pc_hi12(s)
+  //
   // For simplicity, only pcalau12i and lu52i.d are marked as scheduling
   // boundaries, and the instructions between them are guaranteed to be
   // ordered according to data dependencies.
@@ -430,12 +438,16 @@ bool LoongArchInstrInfo::isSchedulingBoundary(const MachineInstr &MI,
     if (MO0 == LoongArchII::MO_IE_PC_HI && MO1 == LoongArchII::MO_IE_PC_LO &&
         MO2 == LoongArchII::MO_IE_PC64_LO)
       return true;
+    if (MO0 == LoongArchII::MO_DESC_PC_HI &&
+        MO1 == LoongArchII::MO_DESC_PC_LO &&
+        MO2 == LoongArchII::MO_DESC64_PC_LO)
+      return true;
     break;
   }
   case LoongArch::LU52I_D: {
     auto MO = MI.getOperand(2).getTargetFlags();
     if (MO == LoongArchII::MO_PCREL64_HI || MO == LoongArchII::MO_GOT_PC64_HI ||
-        MO == LoongArchII::MO_IE_PC64_HI)
+        MO == LoongArchII::MO_IE_PC64_HI || MO == LoongArchII::MO_DESC64_PC_HI)
       return true;
     break;
   }
@@ -651,7 +663,10 @@ LoongArchInstrInfo::getSerializableDirectMachineOperandTargetFlags() const {
       {MO_DESC_LD, "loongarch-desc-ld"},
       {MO_DESC_CALL, "loongarch-desc-call"},
       {MO_LD_PC_HI, "loongarch-ld-pc-hi"},
-      {MO_GD_PC_HI, "loongarch-gd-pc-hi"}};
+      {MO_GD_PC_HI, "loongarch-gd-pc-hi"},
+      {MO_LE_HI_R, "loongarch-le-hi-r"},
+      {MO_LE_ADD_R, "loongarch-le-add-r"},
+      {MO_LE_LO_R, "loongarch-le-lo-r"}};
   return ArrayRef(TargetFlags);
 }
 
diff --git a/llvm/lib/Target/LoongArch/LoongArchMCInstLower.cpp b/llvm/lib/Target/LoongArch/LoongArchMCInstLower.cpp
index 2bacc12..d1de060 100644
--- a/llvm/lib/Target/LoongArch/LoongArchMCInstLower.cpp
+++ b/llvm/lib/Target/LoongArch/LoongArchMCInstLower.cpp
@@ -114,6 +114,15 @@ static MCOperand lowerSymbolOperand(const MachineOperand &MO, MCSymbol *Sym,
   case LoongArchII::MO_DESC_CALL:
     Kind = LoongArchMCExpr::VK_LoongArch_TLS_DESC_CALL;
     break;
+  case LoongArchII::MO_LE_HI_R:
+    Kind = LoongArchMCExpr::VK_LoongArch_TLS_LE_HI20_R;
+    break;
+  case LoongArchII::MO_LE_ADD_R:
+    Kind = LoongArchMCExpr::VK_LoongArch_TLS_LE_ADD_R;
+    break;
+  case LoongArchII::MO_LE_LO_R:
+    Kind = LoongArchMCExpr::VK_LoongArch_TLS_LE_LO12_R;
+    break;
     // TODO: Handle more target-flags.
   }
 
diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchBaseInfo.h b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchBaseInfo.h
index bd63c5e..2369904 100644
--- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchBaseInfo.h
+++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchBaseInfo.h
@@ -54,6 +54,9 @@ enum {
   MO_DESC64_PC_LO,
   MO_DESC_LD,
   MO_DESC_CALL,
+  MO_LE_HI_R,
+  MO_LE_ADD_R,
+  MO_LE_LO_R,
   // TODO: Add more flags.
 };
 
diff --git a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp
index 65e1893..d34f45f 100644
--- a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp
+++ b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp
@@ -14,7 +14,7 @@
 #include "NVPTX.h"
 #include "NVPTXUtilities.h"
 #include "llvm/ADT/StringRef.h"
-#include "llvm/IR/NVVMIntrinsicFlags.h"
+#include "llvm/IR/NVVMIntrinsicUtils.h"
 #include "llvm/MC/MCExpr.h"
 #include "llvm/MC/MCInst.h"
 #include "llvm/MC/MCInstrInfo.h"
diff --git a/llvm/lib/Target/NVPTX/NVPTXCtorDtorLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXCtorDtorLowering.cpp
index f940dc0..c03ef8d 100644
--- a/llvm/lib/Target/NVPTX/NVPTXCtorDtorLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXCtorDtorLowering.cpp
@@ -14,6 +14,7 @@
 #include "MCTargetDesc/NVPTXBaseInfo.h"
 #include "NVPTX.h"
 #include "llvm/ADT/StringExtras.h"
+#include "llvm/IR/CallingConv.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/GlobalVariable.h"
@@ -49,39 +50,34 @@ static std::string getHash(StringRef Str) {
   return llvm::utohexstr(Hash.low(), /*LowerCase=*/true);
 }
 
-static void addKernelMetadata(Module &M, GlobalValue *GV) {
+static void addKernelMetadata(Module &M, Function *F) {
   llvm::LLVMContext &Ctx = M.getContext();
 
   // Get "nvvm.annotations" metadata node.
   llvm::NamedMDNode *MD = M.getOrInsertNamedMetadata("nvvm.annotations");
 
-  llvm::Metadata *KernelMDVals[] = {
-      llvm::ConstantAsMetadata::get(GV), llvm::MDString::get(Ctx, "kernel"),
-      llvm::ConstantAsMetadata::get(
-          llvm::ConstantInt::get(llvm::Type::getInt32Ty(Ctx), 1))};
-
   // This kernel is only to be called single-threaded.
   llvm::Metadata *ThreadXMDVals[] = {
-      llvm::ConstantAsMetadata::get(GV), llvm::MDString::get(Ctx, "maxntidx"),
+      llvm::ConstantAsMetadata::get(F), llvm::MDString::get(Ctx, "maxntidx"),
       llvm::ConstantAsMetadata::get(
           llvm::ConstantInt::get(llvm::Type::getInt32Ty(Ctx), 1))};
   llvm::Metadata *ThreadYMDVals[] = {
-      llvm::ConstantAsMetadata::get(GV), llvm::MDString::get(Ctx, "maxntidy"),
+      llvm::ConstantAsMetadata::get(F), llvm::MDString::get(Ctx, "maxntidy"),
       llvm::ConstantAsMetadata::get(
           llvm::ConstantInt::get(llvm::Type::getInt32Ty(Ctx), 1))};
   llvm::Metadata *ThreadZMDVals[] = {
-      llvm::ConstantAsMetadata::get(GV), llvm::MDString::get(Ctx, "maxntidz"),
+      llvm::ConstantAsMetadata::get(F), llvm::MDString::get(Ctx, "maxntidz"),
       llvm::ConstantAsMetadata::get(
           llvm::ConstantInt::get(llvm::Type::getInt32Ty(Ctx), 1))};
 
   llvm::Metadata *BlockMDVals[] = {
-      llvm::ConstantAsMetadata::get(GV),
+      llvm::ConstantAsMetadata::get(F),
       llvm::MDString::get(Ctx, "maxclusterrank"),
       llvm::ConstantAsMetadata::get(
           llvm::ConstantInt::get(llvm::Type::getInt32Ty(Ctx), 1))};
 
   // Append metadata to nvvm.annotations.
-  MD->addOperand(llvm::MDNode::get(Ctx, KernelMDVals));
+  F->setCallingConv(CallingConv::PTX_Kernel);
   MD->addOperand(llvm::MDNode::get(Ctx, ThreadXMDVals));
   MD->addOperand(llvm::MDNode::get(Ctx, ThreadYMDVals));
   MD->addOperand(llvm::MDNode::get(Ctx, ThreadZMDVals));
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
index c51729e..ef97844 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
@@ -14,10 +14,11 @@
 #include "NVPTXUtilities.h"
 #include "llvm/Analysis/ValueTracking.h"
 #include "llvm/CodeGen/ISDOpcodes.h"
+#include "llvm/CodeGen/SelectionDAGNodes.h"
 #include "llvm/IR/GlobalValue.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/IntrinsicsNVPTX.h"
-#include "llvm/IR/NVVMIntrinsicFlags.h"
+#include "llvm/IR/NVVMIntrinsicUtils.h"
 #include "llvm/Support/AtomicOrdering.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/ErrorHandling.h"
@@ -2449,6 +2450,11 @@ bool NVPTXDAGToDAGISel::tryBFE(SDNode *N) {
   return true;
 }
 
+static inline bool isAddLike(const SDValue V) {
+  return V.getOpcode() == ISD::ADD ||
+         (V->getOpcode() == ISD::OR && V->getFlags().hasDisjoint());
+}
+
 // SelectDirectAddr - Match a direct address for DAG.
 // A direct address could be a globaladdress or externalsymbol.
 bool NVPTXDAGToDAGISel::SelectDirectAddr(SDValue N, SDValue &Address) {
@@ -2475,7 +2481,7 @@ bool NVPTXDAGToDAGISel::SelectDirectAddr(SDValue N, SDValue &Address) {
 // symbol+offset
 bool NVPTXDAGToDAGISel::SelectADDRsi_imp(
     SDNode *OpNode, SDValue Addr, SDValue &Base, SDValue &Offset, MVT mvt) {
-  if (Addr.getOpcode() == ISD::ADD) {
+  if (isAddLike(Addr)) {
     if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Addr.getOperand(1))) {
       SDValue base = Addr.getOperand(0);
       if (SelectDirectAddr(base, Base)) {
@@ -2512,7 +2518,7 @@ bool NVPTXDAGToDAGISel::SelectADDRri_imp(
       Addr.getOpcode() == ISD::TargetGlobalAddress)
     return false; // direct calls.
 
-  if (Addr.getOpcode() == ISD::ADD) {
+  if (isAddLike(Addr)) {
     if (SelectDirectAddr(Addr.getOperand(0), Addr)) {
       return false;
     }
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
index 4a98fe2..c9b7e87 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
@@ -261,6 +261,9 @@ public:
     return true;
   }
 
+  bool isFAbsFree(EVT VT) const override { return true; }
+  bool isFNegFree(EVT VT) const override { return true; }
+
 private:
   const NVPTXSubtarget &STI; // cache the subtarget here
   SDValue getParamSymbol(SelectionDAG &DAG, int idx, EVT) const;
diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
index 711cd67..c3e72d6 100644
--- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -733,12 +733,12 @@ def fpround_oneuse : PatFrag<(ops node:$a), (fpround node:$a), [{
 
 def : Pat<(v2bf16 (build_vector (bf16 (fpround_oneuse f32:$lo)),
                                 (bf16 (fpround_oneuse f32:$hi)))),
-          (CVT_bf16x2_f32 Float32Regs:$hi, Float32Regs:$lo, CvtRN)>,
+          (CVT_bf16x2_f32 $hi, $lo, CvtRN)>,
       Requires<[hasPTX<70>, hasSM<80>, hasBF16Math]>;
 
 def : Pat<(v2f16 (build_vector (f16 (fpround_oneuse f32:$lo)),
                                (f16 (fpround_oneuse f32:$hi)))),
-          (CVT_f16x2_f32 Float32Regs:$hi, Float32Regs:$lo, CvtRN)>,
+          (CVT_f16x2_f32 $hi, $lo, CvtRN)>,
       Requires<[hasPTX<70>, hasSM<80>, useFP16Math]>;
 
 //-----------------------------------
@@ -813,7 +813,7 @@ defm SELP_f64 : SELP_PATTERN<"f64", f64, Float64Regs, f64imm, fpimm>;
 
 foreach vt = [v2f16, v2bf16, v2i16, v4i8] in {
 def : Pat<(vt (select i1:$p, vt:$a, vt:$b)),
-          (SELP_b32rr Int32Regs:$a, Int32Regs:$b, Int1Regs:$p)>;
+          (SELP_b32rr $a, $b, $p)>;
 }
 
 //-----------------------------------
@@ -952,29 +952,29 @@ def mul_wide_unsigned : SDNode<"NVPTXISD::MUL_WIDE_UNSIGNED", SDTMulWide>;
 
 // Matchers for signed, unsigned mul.wide ISD nodes.
 def : Pat<(i32 (mul_wide_signed i16:$a, i16:$b)),
-          (MULWIDES32 i16:$a, i16:$b)>,
+          (MULWIDES32 $a, $b)>,
       Requires<[doMulWide]>;
 def : Pat<(i32 (mul_wide_signed i16:$a, imm:$b)),
-          (MULWIDES32Imm Int16Regs:$a, imm:$b)>,
+          (MULWIDES32Imm $a, imm:$b)>,
       Requires<[doMulWide]>;
 def : Pat<(i32 (mul_wide_unsigned i16:$a, i16:$b)),
-          (MULWIDEU32 Int16Regs:$a, Int16Regs:$b)>,
+          (MULWIDEU32 $a, $b)>,
       Requires<[doMulWide]>;
 def : Pat<(i32 (mul_wide_unsigned i16:$a, imm:$b)),
-          (MULWIDEU32Imm Int16Regs:$a, imm:$b)>,
+          (MULWIDEU32Imm $a, imm:$b)>,
       Requires<[doMulWide]>;
 
 def : Pat<(i64 (mul_wide_signed i32:$a, i32:$b)),
-          (MULWIDES64 Int32Regs:$a, Int32Regs:$b)>,
+          (MULWIDES64 $a, $b)>,
       Requires<[doMulWide]>;
 def : Pat<(i64 (mul_wide_signed i32:$a, imm:$b)),
-          (MULWIDES64Imm Int32Regs:$a, imm:$b)>,
+          (MULWIDES64Imm $a, imm:$b)>,
       Requires<[doMulWide]>;
 def : Pat<(i64 (mul_wide_unsigned i32:$a, i32:$b)),
-          (MULWIDEU64 Int32Regs:$a, Int32Regs:$b)>,
+          (MULWIDEU64 $a, $b)>,
       Requires<[doMulWide]>;
 def : Pat<(i64 (mul_wide_unsigned i32:$a, imm:$b)),
-          (MULWIDEU64Imm Int32Regs:$a, imm:$b)>,
+          (MULWIDEU64Imm $a, imm:$b)>,
       Requires<[doMulWide]>;
 
 // Predicates used for converting some patterns to mul.wide.
@@ -1024,46 +1024,46 @@ def SHL2MUL16 : SDNodeXForm<imm, [{
 
 // Convert "sign/zero-extend, then shift left by an immediate" to mul.wide.
 def : Pat<(shl (sext i32:$a), (i32 IntConst_0_30:$b)),
-          (MULWIDES64Imm Int32Regs:$a, (SHL2MUL32 node:$b))>,
+          (MULWIDES64Imm $a, (SHL2MUL32 $b))>,
       Requires<[doMulWide]>;
 def : Pat<(shl (zext i32:$a), (i32 IntConst_0_30:$b)),
-          (MULWIDEU64Imm Int32Regs:$a, (SHL2MUL32 node:$b))>,
+          (MULWIDEU64Imm $a, (SHL2MUL32 $b))>,
       Requires<[doMulWide]>;
 
 def : Pat<(shl (sext i16:$a), (i16 IntConst_0_14:$b)),
-          (MULWIDES32Imm Int16Regs:$a, (SHL2MUL16 node:$b))>,
+          (MULWIDES32Imm $a, (SHL2MUL16 $b))>,
       Requires<[doMulWide]>;
 def : Pat<(shl (zext i16:$a), (i16 IntConst_0_14:$b)),
-          (MULWIDEU32Imm Int16Regs:$a, (SHL2MUL16 node:$b))>,
+          (MULWIDEU32Imm $a, (SHL2MUL16 $b))>,
       Requires<[doMulWide]>;
 
 // Convert "sign/zero-extend then multiply" to mul.wide.
 def : Pat<(mul (sext i32:$a), (sext i32:$b)),
-          (MULWIDES64 Int32Regs:$a, Int32Regs:$b)>,
+          (MULWIDES64 $a, $b)>,
       Requires<[doMulWide]>;
 def : Pat<(mul (sext i32:$a), (i64 SInt32Const:$b)),
-          (MULWIDES64Imm64 Int32Regs:$a, (i64 SInt32Const:$b))>,
+          (MULWIDES64Imm64 $a, (i64 SInt32Const:$b))>,
       Requires<[doMulWide]>;
 
 def : Pat<(mul (zext i32:$a), (zext i32:$b)),
-          (MULWIDEU64 Int32Regs:$a, Int32Regs:$b)>,
+          (MULWIDEU64 $a, $b)>,
       Requires<[doMulWide]>;
 def : Pat<(mul (zext i32:$a), (i64 UInt32Const:$b)),
-          (MULWIDEU64Imm64 Int32Regs:$a, (i64 UInt32Const:$b))>,
+          (MULWIDEU64Imm64 $a, (i64 UInt32Const:$b))>,
       Requires<[doMulWide]>;
 
 def : Pat<(mul (sext i16:$a), (sext i16:$b)),
-          (MULWIDES32 Int16Regs:$a, Int16Regs:$b)>,
+          (MULWIDES32 $a, $b)>,
       Requires<[doMulWide]>;
 def : Pat<(mul (sext i16:$a), (i32 SInt16Const:$b)),
-          (MULWIDES32Imm32 Int16Regs:$a, (i32 SInt16Const:$b))>,
+          (MULWIDES32Imm32 $a, (i32 SInt16Const:$b))>,
       Requires<[doMulWide]>;
 
 def : Pat<(mul (zext i16:$a), (zext i16:$b)),
-          (MULWIDEU32 Int16Regs:$a, Int16Regs:$b)>,
+          (MULWIDEU32 $a, $b)>,
       Requires<[doMulWide]>;
 def : Pat<(mul (zext i16:$a), (i32 UInt16Const:$b)),
-          (MULWIDEU32Imm32 Int16Regs:$a, (i32 UInt16Const:$b))>,
+          (MULWIDEU32Imm32 $a, (i32 UInt16Const:$b))>,
       Requires<[doMulWide]>;
 
 //
@@ -1242,7 +1242,7 @@ def FDIV64ri :
 // fdiv will be converted to rcp
 // fneg (fdiv 1.0, X) => fneg (rcp.rn X)
 def : Pat<(fdiv DoubleConstNeg1:$a, f64:$b),
-          (FNEGf64 (FDIV641r (NegDoubleConst node:$a), Float64Regs:$b))>;
+          (FNEGf64 (FDIV641r (NegDoubleConst node:$a), $b))>;
 
 //
 // F32 Approximate reciprocal
@@ -1436,83 +1436,83 @@ def COSF:  NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$src),
 
 // frem - f32 FTZ
 def : Pat<(frem f32:$x, f32:$y),
-          (FSUBf32rr_ftz Float32Regs:$x, (FMULf32rr_ftz (CVT_f32_f32
-            (FDIV32rr_prec_ftz Float32Regs:$x, Float32Regs:$y), CvtRZI_FTZ),
-             Float32Regs:$y))>,
+          (FSUBf32rr_ftz $x, (FMULf32rr_ftz (CVT_f32_f32
+            (FDIV32rr_prec_ftz $x, $y), CvtRZI_FTZ),
+             $y))>,
           Requires<[doF32FTZ, allowUnsafeFPMath]>;
 def : Pat<(frem f32:$x, fpimm:$y),
-          (FSUBf32rr_ftz Float32Regs:$x, (FMULf32ri_ftz (CVT_f32_f32
-            (FDIV32ri_prec_ftz Float32Regs:$x, fpimm:$y), CvtRZI_FTZ),
+          (FSUBf32rr_ftz $x, (FMULf32ri_ftz (CVT_f32_f32
+            (FDIV32ri_prec_ftz $x, fpimm:$y), CvtRZI_FTZ),
              fpimm:$y))>,
           Requires<[doF32FTZ, allowUnsafeFPMath]>;
 
-def : Pat<(frem f32:$x, Float32Regs:$y),
-          (SELP_f32rr Float32Regs:$x,
-            (FSUBf32rr_ftz Float32Regs:$x, (FMULf32rr_ftz (CVT_f32_f32
-              (FDIV32rr_prec_ftz Float32Regs:$x, Float32Regs:$y), CvtRZI_FTZ),
-              Float32Regs:$y)),
-            (TESTINF_f32r Float32Regs:$y))>,
+def : Pat<(frem f32:$x, f32:$y),
+          (SELP_f32rr $x,
+            (FSUBf32rr_ftz $x, (FMULf32rr_ftz (CVT_f32_f32
+              (FDIV32rr_prec_ftz $x, $y), CvtRZI_FTZ),
+              $y)),
+            (TESTINF_f32r $y))>,
           Requires<[doF32FTZ, noUnsafeFPMath]>;
 def : Pat<(frem f32:$x, fpimm:$y),
-          (SELP_f32rr Float32Regs:$x,
-            (FSUBf32rr_ftz Float32Regs:$x, (FMULf32ri_ftz (CVT_f32_f32
-              (FDIV32ri_prec_ftz Float32Regs:$x, fpimm:$y), CvtRZI_FTZ),
+          (SELP_f32rr $x,
+            (FSUBf32rr_ftz $x, (FMULf32ri_ftz (CVT_f32_f32
+              (FDIV32ri_prec_ftz $x, fpimm:$y), CvtRZI_FTZ),
               fpimm:$y)),
             (TESTINF_f32i fpimm:$y))>,
           Requires<[doF32FTZ, noUnsafeFPMath]>;
 
 // frem - f32
 def : Pat<(frem f32:$x, f32:$y),
-          (FSUBf32rr Float32Regs:$x, (FMULf32rr (CVT_f32_f32
-            (FDIV32rr_prec Float32Regs:$x, Float32Regs:$y), CvtRZI),
-             Float32Regs:$y))>,
+          (FSUBf32rr $x, (FMULf32rr (CVT_f32_f32
+            (FDIV32rr_prec $x, $y), CvtRZI),
+             $y))>,
           Requires<[allowUnsafeFPMath]>;
 def : Pat<(frem f32:$x, fpimm:$y),
-          (FSUBf32rr Float32Regs:$x, (FMULf32ri (CVT_f32_f32
-            (FDIV32ri_prec Float32Regs:$x, fpimm:$y), CvtRZI),
+          (FSUBf32rr $x, (FMULf32ri (CVT_f32_f32
+            (FDIV32ri_prec $x, fpimm:$y), CvtRZI),
              fpimm:$y))>,
           Requires<[allowUnsafeFPMath]>;
 
 def : Pat<(frem f32:$x, f32:$y),
-          (SELP_f32rr Float32Regs:$x,
-            (FSUBf32rr Float32Regs:$x, (FMULf32rr (CVT_f32_f32
-              (FDIV32rr_prec Float32Regs:$x, Float32Regs:$y), CvtRZI),
-              Float32Regs:$y)),
+          (SELP_f32rr $x,
+            (FSUBf32rr $x, (FMULf32rr (CVT_f32_f32
+              (FDIV32rr_prec $x, $y), CvtRZI),
+              $y)),
             (TESTINF_f32r Float32Regs:$y))>,
           Requires<[noUnsafeFPMath]>;
 def : Pat<(frem f32:$x, fpimm:$y),
-          (SELP_f32rr Float32Regs:$x,
-            (FSUBf32rr Float32Regs:$x, (FMULf32ri (CVT_f32_f32
-              (FDIV32ri_prec Float32Regs:$x, fpimm:$y), CvtRZI),
+          (SELP_f32rr $x,
+            (FSUBf32rr $x, (FMULf32ri (CVT_f32_f32
+              (FDIV32ri_prec $x, fpimm:$y), CvtRZI),
               fpimm:$y)),
             (TESTINF_f32i fpimm:$y))>,
           Requires<[noUnsafeFPMath]>;
 
 // frem - f64
 def : Pat<(frem f64:$x, f64:$y),
-          (FSUBf64rr Float64Regs:$x, (FMULf64rr (CVT_f64_f64
-            (FDIV64rr Float64Regs:$x, Float64Regs:$y), CvtRZI),
-             Float64Regs:$y))>,
+          (FSUBf64rr $x, (FMULf64rr (CVT_f64_f64
+            (FDIV64rr $x, $y), CvtRZI),
+             $y))>,
           Requires<[allowUnsafeFPMath]>;
 def : Pat<(frem f64:$x, fpimm:$y),
-          (FSUBf64rr Float64Regs:$x, (FMULf64ri (CVT_f64_f64
-            (FDIV64ri Float64Regs:$x, fpimm:$y), CvtRZI),
+          (FSUBf64rr $x, (FMULf64ri (CVT_f64_f64
+            (FDIV64ri $x, fpimm:$y), CvtRZI),
              fpimm:$y))>,
           Requires<[allowUnsafeFPMath]>;
 
 def : Pat<(frem f64:$x, f64:$y),
-          (SELP_f64rr Float64Regs:$x,
-            (FSUBf64rr Float64Regs:$x, (FMULf64rr (CVT_f64_f64
-              (FDIV64rr Float64Regs:$x, Float64Regs:$y), CvtRZI),
-               Float64Regs:$y)),
+          (SELP_f64rr $x,
+            (FSUBf64rr $x, (FMULf64rr (CVT_f64_f64
+              (FDIV64rr $x, $y), CvtRZI),
+               $y)),
             (TESTINF_f64r Float64Regs:$y))>,
           Requires<[noUnsafeFPMath]>;
 def : Pat<(frem f64:$x, fpimm:$y),
-          (SELP_f64rr Float64Regs:$x,
-            (FSUBf64rr Float64Regs:$x, (FMULf64ri (CVT_f64_f64
-              (FDIV64ri Float64Regs:$x, fpimm:$y), CvtRZI),
+          (SELP_f64rr $x,
+            (FSUBf64rr $x, (FMULf64ri (CVT_f64_f64
+              (FDIV64ri $x, fpimm:$y), CvtRZI),
               fpimm:$y)),
-            (TESTINF_f64r Float64Regs:$y))>,
+            (TESTINF_f64r $y))>,
           Requires<[noUnsafeFPMath]>;
 
 //-----------------------------------
@@ -1561,32 +1561,32 @@ defm AND : BITWISE<"and", and>;
 defm XOR : BITWISE<"xor", xor>;
 
 // PTX does not support mul on predicates, convert to and instructions
-def : Pat<(mul i1:$a, i1:$b), (ANDb1rr Int1Regs:$a, Int1Regs:$b)>;
-def : Pat<(mul i1:$a, imm:$b), (ANDb1ri Int1Regs:$a, imm:$b)>;
+def : Pat<(mul i1:$a, i1:$b), (ANDb1rr $a, $b)>;
+def : Pat<(mul i1:$a, imm:$b), (ANDb1ri $a, imm:$b)>;
 
 // These transformations were once reliably performed by instcombine, but thanks
 // to poison semantics they are no longer safe for LLVM IR, perform them here
 // instead.
-def : Pat<(select i1:$a, i1:$b, 0), (ANDb1rr Int1Regs:$a, Int1Regs:$b)>;
-def : Pat<(select i1:$a, 1, i1:$b), (ORb1rr Int1Regs:$a, Int1Regs:$b)>;
+def : Pat<(select i1:$a, i1:$b, 0), (ANDb1rr $a, $b)>;
+def : Pat<(select i1:$a, 1, i1:$b), (ORb1rr $a, $b)>;
 
 // Lower logical v2i16/v4i8 ops as bitwise ops on b32.
 foreach vt = [v2i16, v4i8] in {
   def: Pat<(or vt:$a, vt:$b),
-           (ORb32rr Int32Regs:$a, Int32Regs:$b)>;
+           (ORb32rr $a, $b)>;
   def: Pat<(xor vt:$a, vt:$b),
-           (XORb32rr Int32Regs:$a, Int32Regs:$b)>;
+           (XORb32rr $a, $b)>;
   def: Pat<(and vt:$a, vt:$b),
-           (ANDb32rr Int32Regs:$a, Int32Regs:$b)>;
+           (ANDb32rr $a, $b)>;
 
   // The constants get legalized into a bitcast from i32, so that's what we need
   // to match here.
   def: Pat<(or vt:$a, (vt (bitconvert (i32 imm:$b)))),
-           (ORb32ri Int32Regs:$a, imm:$b)>;
+           (ORb32ri $a, imm:$b)>;
   def: Pat<(xor vt:$a, (vt (bitconvert (i32 imm:$b)))),
-           (XORb32ri Int32Regs:$a, imm:$b)>;
+           (XORb32ri $a, imm:$b)>;
   def: Pat<(and vt:$a, (vt (bitconvert (i32 imm:$b)))),
-           (ANDb32ri Int32Regs:$a, imm:$b)>;
+           (ANDb32ri $a, imm:$b)>;
 }
 
 def NOT1  : NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$src),
@@ -1770,34 +1770,34 @@ let hasSideEffects = false in {
 
 // byte extraction + signed/unsigned extension to i32.
 def : Pat<(i32 (sext_inreg (bfe i32:$s, i32:$o, 8), i8)),
-          (BFE_S32rri Int32Regs:$s, Int32Regs:$o, 8)>;
+          (BFE_S32rri $s, $o, 8)>;
 def : Pat<(i32 (sext_inreg (bfe i32:$s, imm:$o, 8), i8)),
-          (BFE_S32rii Int32Regs:$s, imm:$o, 8)>;
+          (BFE_S32rii $s, imm:$o, 8)>;
 def : Pat<(i32 (and (bfe i32:$s, i32:$o, 8), 255)),
-          (BFE_U32rri Int32Regs:$s, Int32Regs:$o, 8)>;
+          (BFE_U32rri $s, $o, 8)>;
 def : Pat<(i32 (and (bfe i32:$s, imm:$o, 8), 255)),
-          (BFE_U32rii Int32Regs:$s, imm:$o, 8)>;
+          (BFE_U32rii $s, imm:$o, 8)>;
 
 // byte extraction + signed extension to i16
 def : Pat<(i16 (sext_inreg (trunc (bfe i32:$s, imm:$o, 8)), i8)),
-          (CVT_s8_s32 (BFE_S32rii i32:$s, imm:$o, 8), CvtNONE)>;
+          (CVT_s8_s32 (BFE_S32rii $s, imm:$o, 8), CvtNONE)>;
 
 
 // Byte extraction via shift/trunc/sext
 def : Pat<(i16 (sext_inreg (trunc i32:$s), i8)),
-          (CVT_s8_s32 Int32Regs:$s, CvtNONE)>;
+          (CVT_s8_s32 $s, CvtNONE)>;
 def : Pat<(i16 (sext_inreg (trunc (srl i32:$s,  (i32 imm:$o))), i8)),
-          (CVT_s8_s32 (BFE_S32rii Int32Regs:$s, imm:$o, 8), CvtNONE)>;
+          (CVT_s8_s32 (BFE_S32rii $s, imm:$o, 8), CvtNONE)>;
 def : Pat<(sext_inreg (srl i32:$s,  (i32 imm:$o)), i8),
-          (BFE_S32rii Int32Regs:$s, imm:$o, 8)>;
+          (BFE_S32rii $s, imm:$o, 8)>;
 def : Pat<(i16 (sra (i16 (trunc i32:$s)), (i32 8))),
-          (CVT_s8_s32 (BFE_S32rii Int32Regs:$s, 8, 8), CvtNONE)>;
+          (CVT_s8_s32 (BFE_S32rii $s, 8, 8), CvtNONE)>;
 def : Pat<(sext_inreg (srl i64:$s,  (i32 imm:$o)), i8),
-          (BFE_S64rii Int64Regs:$s, imm:$o, 8)>;
+          (BFE_S64rii $s, imm:$o, 8)>;
 def : Pat<(i16 (sext_inreg (trunc i64:$s), i8)),
-          (CVT_s8_s64 Int64Regs:$s, CvtNONE)>;
+          (CVT_s8_s64 $s, CvtNONE)>;
 def : Pat<(i16 (sext_inreg (trunc (srl i64:$s,  (i32 imm:$o))), i8)),
-          (CVT_s8_s64 (BFE_S64rii Int64Regs:$s, imm:$o, 8), CvtNONE)>;
+          (CVT_s8_s64 (BFE_S64rii $s, imm:$o, 8), CvtNONE)>;
 
 //-----------------------------------
 // Comparison instructions (setp, set)
@@ -2032,47 +2032,47 @@ multiclass ISET_FORMAT<PatFrag OpNode, PatLeaf Mode,
                        Instruction set_64ir> {
   // i16 -> pred
   def : Pat<(i1 (OpNode i16:$a, i16:$b)),
-            (setp_16rr Int16Regs:$a, Int16Regs:$b, Mode)>;
+            (setp_16rr $a, $b, Mode)>;
   def : Pat<(i1 (OpNode i16:$a, imm:$b)),
-            (setp_16ri Int16Regs:$a, imm:$b, Mode)>;
+            (setp_16ri $a, imm:$b, Mode)>;
   def : Pat<(i1 (OpNode imm:$a, i16:$b)),
-            (setp_16ir imm:$a, Int16Regs:$b, Mode)>;
+            (setp_16ir imm:$a, $b, Mode)>;
   // i32 -> pred
   def : Pat<(i1 (OpNode i32:$a, i32:$b)),
-            (setp_32rr Int32Regs:$a, Int32Regs:$b, Mode)>;
+            (setp_32rr $a, $b, Mode)>;
   def : Pat<(i1 (OpNode i32:$a, imm:$b)),
-            (setp_32ri Int32Regs:$a, imm:$b, Mode)>;
+            (setp_32ri $a, imm:$b, Mode)>;
   def : Pat<(i1 (OpNode imm:$a, i32:$b)),
-            (setp_32ir imm:$a, Int32Regs:$b, Mode)>;
+            (setp_32ir imm:$a, $b, Mode)>;
   // i64 -> pred
   def : Pat<(i1 (OpNode i64:$a, i64:$b)),
-            (setp_64rr Int64Regs:$a, Int64Regs:$b, Mode)>;
+            (setp_64rr $a, $b, Mode)>;
   def : Pat<(i1 (OpNode i64:$a, imm:$b)),
-            (setp_64ri Int64Regs:$a, imm:$b, Mode)>;
+            (setp_64ri $a, imm:$b, Mode)>;
   def : Pat<(i1 (OpNode imm:$a, i64:$b)),
-            (setp_64ir imm:$a, Int64Regs:$b, Mode)>;
+            (setp_64ir imm:$a, $b, Mode)>;
 
   // i16 -> i32
   def : Pat<(i32 (OpNode i16:$a, i16:$b)),
-            (set_16rr Int16Regs:$a, Int16Regs:$b, Mode)>;
+            (set_16rr $a, $b, Mode)>;
   def : Pat<(i32 (OpNode i16:$a, imm:$b)),
-            (set_16ri Int16Regs:$a, imm:$b, Mode)>;
+            (set_16ri $a, imm:$b, Mode)>;
   def : Pat<(i32 (OpNode imm:$a, i16:$b)),
-            (set_16ir imm:$a, Int16Regs:$b, Mode)>;
+            (set_16ir imm:$a, $b, Mode)>;
   // i32 -> i32
   def : Pat<(i32 (OpNode i32:$a, i32:$b)),
-            (set_32rr Int32Regs:$a, Int32Regs:$b, Mode)>;
+            (set_32rr $a, $b, Mode)>;
   def : Pat<(i32 (OpNode i32:$a, imm:$b)),
-            (set_32ri Int32Regs:$a, imm:$b, Mode)>;
+            (set_32ri $a, imm:$b, Mode)>;
   def : Pat<(i32 (OpNode imm:$a, i32:$b)),
-            (set_32ir imm:$a, Int32Regs:$b, Mode)>;
+            (set_32ir imm:$a, $b, Mode)>;
   // i64 -> i32
   def : Pat<(i32 (OpNode i64:$a, Int64Regs:$b)),
-            (set_64rr Int64Regs:$a, Int64Regs:$b, Mode)>;
+            (set_64rr $a, $b, Mode)>;
   def : Pat<(i32 (OpNode i64:$a, imm:$b)),
-            (set_64ri Int64Regs:$a, imm:$b, Mode)>;
+            (set_64ri $a, imm:$b, Mode)>;
   def : Pat<(i32 (OpNode imm:$a, i64:$b)),
-            (set_64ir imm:$a, Int64Regs:$b, Mode)>;
+            (set_64ir imm:$a, $b, Mode)>;
 }
 
 multiclass ISET_FORMAT_SIGNED<PatFrag OpNode, PatLeaf Mode>
@@ -2179,94 +2179,94 @@ def: Pat<(setne (i16 (and (trunc (bfe Int32Regs:$a, imm:$oa, 8)), 255)),
 
 // i1 compare -> i32
 def : Pat<(i32 (setne i1:$a, i1:$b)),
-          (SELP_u32ii -1, 0, (XORb1rr Int1Regs:$a, Int1Regs:$b))>;
+          (SELP_u32ii -1, 0, (XORb1rr $a, $b))>;
 def : Pat<(i32 (setne i1:$a, i1:$b)),
-          (SELP_u32ii 0, -1, (XORb1rr Int1Regs:$a, Int1Regs:$b))>;
+          (SELP_u32ii 0, -1, (XORb1rr $a, $b))>;
 
 
 
 multiclass FSET_FORMAT<PatFrag OpNode, PatLeaf Mode, PatLeaf ModeFTZ> {
   // f16 -> pred
   def : Pat<(i1 (OpNode f16:$a, f16:$b)),
-            (SETP_f16rr Int16Regs:$a, Int16Regs:$b, ModeFTZ)>,
+            (SETP_f16rr $a, $b, ModeFTZ)>,
         Requires<[useFP16Math,doF32FTZ]>;
   def : Pat<(i1 (OpNode f16:$a, f16:$b)),
-            (SETP_f16rr Int16Regs:$a, Int16Regs:$b, Mode)>,
+            (SETP_f16rr $a, $b, Mode)>,
         Requires<[useFP16Math]>;
 
   // bf16 -> pred
   def : Pat<(i1 (OpNode bf16:$a, bf16:$b)),
-            (SETP_bf16rr Int16Regs:$a, Int16Regs:$b, ModeFTZ)>,
+            (SETP_bf16rr $a, $b, ModeFTZ)>,
         Requires<[hasBF16Math,doF32FTZ]>;
   def : Pat<(i1 (OpNode bf16:$a, bf16:$b)),
-            (SETP_bf16rr Int16Regs:$a, Int16Regs:$b, Mode)>,
+            (SETP_bf16rr $a, $b, Mode)>,
         Requires<[hasBF16Math]>;
 
   // f32 -> pred
   def : Pat<(i1 (OpNode f32:$a, f32:$b)),
-            (SETP_f32rr Float32Regs:$a, Float32Regs:$b, ModeFTZ)>,
+            (SETP_f32rr $a, $b, ModeFTZ)>,
         Requires<[doF32FTZ]>;
   def : Pat<(i1 (OpNode f32:$a, f32:$b)),
-            (SETP_f32rr Float32Regs:$a, Float32Regs:$b, Mode)>;
+            (SETP_f32rr $a, $b, Mode)>;
   def : Pat<(i1 (OpNode Float32Regs:$a, fpimm:$b)),
-            (SETP_f32ri Float32Regs:$a, fpimm:$b, ModeFTZ)>,
+            (SETP_f32ri $a, fpimm:$b, ModeFTZ)>,
         Requires<[doF32FTZ]>;
   def : Pat<(i1 (OpNode f32:$a, fpimm:$b)),
-            (SETP_f32ri Float32Regs:$a, fpimm:$b, Mode)>;
+            (SETP_f32ri $a, fpimm:$b, Mode)>;
   def : Pat<(i1 (OpNode fpimm:$a, f32:$b)),
-            (SETP_f32ir fpimm:$a, Float32Regs:$b, ModeFTZ)>,
+            (SETP_f32ir fpimm:$a, $b, ModeFTZ)>,
         Requires<[doF32FTZ]>;
   def : Pat<(i1 (OpNode fpimm:$a, f32:$b)),
-            (SETP_f32ir fpimm:$a, Float32Regs:$b, Mode)>;
+            (SETP_f32ir fpimm:$a, $b, Mode)>;
 
   // f64 -> pred
   def : Pat<(i1 (OpNode f64:$a, f64:$b)),
-            (SETP_f64rr Float64Regs:$a, Float64Regs:$b, Mode)>;
+            (SETP_f64rr $a, $b, Mode)>;
   def : Pat<(i1 (OpNode f64:$a, fpimm:$b)),
-            (SETP_f64ri Float64Regs:$a, fpimm:$b, Mode)>;
+            (SETP_f64ri $a, fpimm:$b, Mode)>;
   def : Pat<(i1 (OpNode fpimm:$a, f64:$b)),
-            (SETP_f64ir fpimm:$a, Float64Regs:$b, Mode)>;
+            (SETP_f64ir fpimm:$a, $b, Mode)>;
 
   // f16 -> i32
   def : Pat<(i32 (OpNode f16:$a, f16:$b)),
-            (SET_f16rr Int16Regs:$a, Int16Regs:$b, ModeFTZ)>,
+            (SET_f16rr $a, $b, ModeFTZ)>,
         Requires<[useFP16Math, doF32FTZ]>;
   def : Pat<(i32 (OpNode f16:$a, f16:$b)),
-            (SET_f16rr Int16Regs:$a, Int16Regs:$b, Mode)>,
+            (SET_f16rr $a, $b, Mode)>,
         Requires<[useFP16Math]>;
 
   // bf16 -> i32
   def : Pat<(i32 (OpNode bf16:$a, bf16:$b)),
-            (SET_bf16rr Int16Regs:$a, Int16Regs:$b, ModeFTZ)>,
+            (SET_bf16rr $a, $b, ModeFTZ)>,
         Requires<[hasBF16Math, doF32FTZ]>;
   def : Pat<(i32 (OpNode bf16:$a, bf16:$b)),
-            (SET_bf16rr Int16Regs:$a, Int16Regs:$b, Mode)>,
+            (SET_bf16rr $a, $b, Mode)>,
         Requires<[hasBF16Math]>;
 
   // f32 -> i32
   def : Pat<(i32 (OpNode f32:$a, f32:$b)),
-            (SET_f32rr Float32Regs:$a, Float32Regs:$b, ModeFTZ)>,
+            (SET_f32rr $a, $b, ModeFTZ)>,
         Requires<[doF32FTZ]>;
   def : Pat<(i32 (OpNode f32:$a, f32:$b)),
-            (SET_f32rr Float32Regs:$a, Float32Regs:$b, Mode)>;
+            (SET_f32rr $a, $b, Mode)>;
   def : Pat<(i32 (OpNode f32:$a, fpimm:$b)),
-            (SET_f32ri Float32Regs:$a, fpimm:$b, ModeFTZ)>,
+            (SET_f32ri $a, fpimm:$b, ModeFTZ)>,
         Requires<[doF32FTZ]>;
   def : Pat<(i32 (OpNode f32:$a, fpimm:$b)),
-            (SET_f32ri Float32Regs:$a, fpimm:$b, Mode)>;
+            (SET_f32ri $a, fpimm:$b, Mode)>;
   def : Pat<(i32 (OpNode fpimm:$a, f32:$b)),
-            (SET_f32ir fpimm:$a, Float32Regs:$b, ModeFTZ)>,
+            (SET_f32ir fpimm:$a, $b, ModeFTZ)>,
         Requires<[doF32FTZ]>;
   def : Pat<(i32 (OpNode fpimm:$a, f32:$b)),
-            (SET_f32ir fpimm:$a, Float32Regs:$b, Mode)>;
+            (SET_f32ir fpimm:$a, $b, Mode)>;
 
   // f64 -> i32
   def : Pat<(i32 (OpNode f64:$a, f64:$b)),
-            (SET_f64rr Float64Regs:$a, Float64Regs:$b, Mode)>;
+            (SET_f64rr $a, $b, Mode)>;
   def : Pat<(i32 (OpNode f64:$a, fpimm:$b)),
-            (SET_f64ri Float64Regs:$a, fpimm:$b, Mode)>;
+            (SET_f64ri $a, fpimm:$b, Mode)>;
   def : Pat<(i32 (OpNode fpimm:$a, f64:$b)),
-            (SET_f64ir fpimm:$a, Float64Regs:$b, Mode)>;
+            (SET_f64ir fpimm:$a, $b, Mode)>;
 }
 
 defm FSetOGT : FSET_FORMAT<setogt, CmpGT, CmpGT_FTZ>;
@@ -2722,11 +2722,11 @@ def ProxyRegF32   : ProxyRegInst<"f32",  f32, Float32Regs>;
 def ProxyRegF64   : ProxyRegInst<"f64",  f64, Float64Regs>;
 
 foreach vt = [f16, bf16] in {
-  def: Pat<(vt (ProxyReg  vt:$src)), (ProxyRegI16 Int16Regs:$src)>;
+  def: Pat<(vt (ProxyReg  vt:$src)), (ProxyRegI16 $src)>;
 }
 
 foreach vt = [v2f16, v2bf16, v2i16, v4i8] in {
-  def: Pat<(vt (ProxyReg  vt:$src)), (ProxyRegI32 Int32Regs:$src)>;
+  def: Pat<(vt (ProxyReg  vt:$src)), (ProxyRegI32 $src)>;
 }
 
 //
@@ -3029,9 +3029,9 @@ def BITCONVERT_64_F2I : F_BITCONVERT<"64", f64, i64>;
 
 foreach vt = [v2f16, v2bf16, v2i16, v4i8] in {
 def: Pat<(vt (bitconvert (f32 Float32Regs:$a))),
-         (BITCONVERT_32_F2I Float32Regs:$a)>;
+         (BITCONVERT_32_F2I $a)>;
 def: Pat<(f32 (bitconvert vt:$a)),
-         (BITCONVERT_32_I2F Int32Regs:$a)>;
+         (BITCONVERT_32_I2F $a)>;
 }
 foreach vt = [f16, bf16] in {
   def: Pat<(vt (bitconvert i16:$a)),
@@ -3056,280 +3056,280 @@ foreach ta = [v2f16, v2bf16, v2i16, v4i8, i32] in {
 
 // sint -> f16
 def : Pat<(f16 (sint_to_fp i1:$a)),
-          (CVT_f16_s32 (SELP_s32ii -1, 0, Int1Regs:$a), CvtRN)>;
+          (CVT_f16_s32 (SELP_s32ii -1, 0, $a), CvtRN)>;
 def : Pat<(f16 (sint_to_fp Int16Regs:$a)),
-          (CVT_f16_s16 i16:$a, CvtRN)>;
+          (CVT_f16_s16 $a, CvtRN)>;
 def : Pat<(f16 (sint_to_fp i32:$a)),
-          (CVT_f16_s32 i32:$a, CvtRN)>;
+          (CVT_f16_s32 $a, CvtRN)>;
 def : Pat<(f16 (sint_to_fp i64:$a)),
-          (CVT_f16_s64 i64:$a, CvtRN)>;
+          (CVT_f16_s64 $a, CvtRN)>;
 
 // uint -> f16
 def : Pat<(f16 (uint_to_fp i1:$a)),
-          (CVT_f16_u32 (SELP_u32ii 1, 0, Int1Regs:$a), CvtRN)>;
+          (CVT_f16_u32 (SELP_u32ii 1, 0, $a), CvtRN)>;
 def : Pat<(f16 (uint_to_fp Int16Regs:$a)),
-          (CVT_f16_u16 i16:$a, CvtRN)>;
+          (CVT_f16_u16 $a, CvtRN)>;
 def : Pat<(f16 (uint_to_fp i32:$a)),
-          (CVT_f16_u32 i32:$a, CvtRN)>;
+          (CVT_f16_u32 $a, CvtRN)>;
 def : Pat<(f16 (uint_to_fp i64:$a)),
-          (CVT_f16_u64 i64:$a, CvtRN)>;
+          (CVT_f16_u64 $a, CvtRN)>;
 
 // sint -> bf16
 def : Pat<(bf16 (sint_to_fp i1:$a)),
-          (CVT_bf16_s32 (SELP_u32ii 1, 0, Int1Regs:$a), CvtRN)>, Requires<[hasPTX<78>, hasSM<90>]>;
+          (CVT_bf16_s32 (SELP_u32ii 1, 0, $a), CvtRN)>, Requires<[hasPTX<78>, hasSM<90>]>;
 def : Pat<(bf16 (sint_to_fp i16:$a)),
-          (CVT_bf16_s16 i16:$a, CvtRN)>, Requires<[hasPTX<78>, hasSM<90>]>;
+          (CVT_bf16_s16 $a, CvtRN)>, Requires<[hasPTX<78>, hasSM<90>]>;
 def : Pat<(bf16 (sint_to_fp i32:$a)),
-          (CVT_bf16_s32 i32:$a, CvtRN)>, Requires<[hasPTX<78>, hasSM<90>]>;
+          (CVT_bf16_s32 $a, CvtRN)>, Requires<[hasPTX<78>, hasSM<90>]>;
 def : Pat<(bf16 (sint_to_fp i64:$a)),
-          (CVT_bf16_s64 i64:$a, CvtRN)>, Requires<[hasPTX<78>, hasSM<90>]>;
+          (CVT_bf16_s64 $a, CvtRN)>, Requires<[hasPTX<78>, hasSM<90>]>;
 
 // uint -> bf16
 def : Pat<(bf16 (uint_to_fp i1:$a)),
-          (CVT_bf16_u32 (SELP_u32ii 1, 0, Int1Regs:$a), CvtRN)>, Requires<[hasPTX<78>, hasSM<90>]>;
+          (CVT_bf16_u32 (SELP_u32ii 1, 0, $a), CvtRN)>, Requires<[hasPTX<78>, hasSM<90>]>;
 def : Pat<(bf16 (uint_to_fp i16:$a)),
-          (CVT_bf16_u16 i16:$a, CvtRN)>, Requires<[hasPTX<78>, hasSM<90>]>;
+          (CVT_bf16_u16 $a, CvtRN)>, Requires<[hasPTX<78>, hasSM<90>]>;
 def : Pat<(bf16 (uint_to_fp i32:$a)),
-          (CVT_bf16_u32 i32:$a, CvtRN)>, Requires<[hasPTX<78>, hasSM<90>]>;
+          (CVT_bf16_u32 $a, CvtRN)>, Requires<[hasPTX<78>, hasSM<90>]>;
 def : Pat<(bf16 (uint_to_fp i64:$a)),
-          (CVT_bf16_u64 i64:$a, CvtRN)>, Requires<[hasPTX<78>, hasSM<90>]>;
+          (CVT_bf16_u64 $a, CvtRN)>, Requires<[hasPTX<78>, hasSM<90>]>;
 
 // sint -> f32
 def : Pat<(f32 (sint_to_fp i1:$a)),
-          (CVT_f32_s32 (SELP_s32ii -1, 0, Int1Regs:$a), CvtRN)>;
+          (CVT_f32_s32 (SELP_s32ii -1, 0, $a), CvtRN)>;
 def : Pat<(f32 (sint_to_fp i16:$a)),
-          (CVT_f32_s16 i16:$a, CvtRN)>;
+          (CVT_f32_s16 $a, CvtRN)>;
 def : Pat<(f32 (sint_to_fp i32:$a)),
-          (CVT_f32_s32 i32:$a, CvtRN)>;
+          (CVT_f32_s32 $a, CvtRN)>;
 def : Pat<(f32 (sint_to_fp i64:$a)),
-          (CVT_f32_s64 i64:$a, CvtRN)>;
+          (CVT_f32_s64 $a, CvtRN)>;
 
 // uint -> f32
 def : Pat<(f32 (uint_to_fp i1:$a)),
-          (CVT_f32_u32 (SELP_u32ii 1, 0, Int1Regs:$a), CvtRN)>;
+          (CVT_f32_u32 (SELP_u32ii 1, 0, $a), CvtRN)>;
 def : Pat<(f32 (uint_to_fp i16:$a)),
-          (CVT_f32_u16 Int16Regs:$a, CvtRN)>;
+          (CVT_f32_u16 $a, CvtRN)>;
 def : Pat<(f32 (uint_to_fp i32:$a)),
-          (CVT_f32_u32 i32:$a, CvtRN)>;
+          (CVT_f32_u32 $a, CvtRN)>;
 def : Pat<(f32 (uint_to_fp i64:$a)),
-          (CVT_f32_u64 i64:$a, CvtRN)>;
+          (CVT_f32_u64 $a, CvtRN)>;
 
 // sint -> f64
 def : Pat<(f64 (sint_to_fp i1:$a)),
-          (CVT_f64_s32 (SELP_s32ii -1, 0, Int1Regs:$a), CvtRN)>;
+          (CVT_f64_s32 (SELP_s32ii -1, 0, $a), CvtRN)>;
 def : Pat<(f64 (sint_to_fp i16:$a)),
-          (CVT_f64_s16 Int16Regs:$a, CvtRN)>;
+          (CVT_f64_s16 $a, CvtRN)>;
 def : Pat<(f64 (sint_to_fp i32:$a)),
-          (CVT_f64_s32 i32:$a, CvtRN)>;
+          (CVT_f64_s32 $a, CvtRN)>;
 def : Pat<(f64 (sint_to_fp i64:$a)),
-          (CVT_f64_s64 i64:$a, CvtRN)>;
+          (CVT_f64_s64 $a, CvtRN)>;
 
 // uint -> f64
 def : Pat<(f64 (uint_to_fp i1:$a)),
-          (CVT_f64_u32 (SELP_u32ii 1, 0, Int1Regs:$a), CvtRN)>;
+          (CVT_f64_u32 (SELP_u32ii 1, 0, $a), CvtRN)>;
 def : Pat<(f64 (uint_to_fp i16:$a)),
-          (CVT_f64_u16 Int16Regs:$a, CvtRN)>;
+          (CVT_f64_u16 $a, CvtRN)>;
 def : Pat<(f64 (uint_to_fp i32:$a)),
-          (CVT_f64_u32 i32:$a, CvtRN)>;
+          (CVT_f64_u32 $a, CvtRN)>;
 def : Pat<(f64 (uint_to_fp i64:$a)),
-          (CVT_f64_u64 i64:$a, CvtRN)>;
+          (CVT_f64_u64 $a, CvtRN)>;
 
 
 // f16 -> sint
 def : Pat<(i1 (fp_to_sint f16:$a)),
-          (SETP_b16ri Int16Regs:$a, 0, CmpEQ)>;
+          (SETP_b16ri $a, 0, CmpEQ)>;
 def : Pat<(i16 (fp_to_sint f16:$a)),
-          (CVT_s16_f16 Int16Regs:$a, CvtRZI)>;
+          (CVT_s16_f16 $a, CvtRZI)>;
 def : Pat<(i32 (fp_to_sint f16:$a)),
-          (CVT_s32_f16 Int16Regs:$a, CvtRZI)>;
+          (CVT_s32_f16 $a, CvtRZI)>;
 def : Pat<(i64 (fp_to_sint f16:$a)),
-          (CVT_s64_f16 Int16Regs:$a, CvtRZI)>;
+          (CVT_s64_f16 $a, CvtRZI)>;
 
 // f16 -> uint
 def : Pat<(i1 (fp_to_uint f16:$a)),
-          (SETP_b16ri Int16Regs:$a, 0, CmpEQ)>;
+          (SETP_b16ri $a, 0, CmpEQ)>;
 def : Pat<(i16 (fp_to_uint f16:$a)),
-          (CVT_u16_f16 Int16Regs:$a, CvtRZI)>;
+          (CVT_u16_f16 $a, CvtRZI)>;
 def : Pat<(i32 (fp_to_uint f16:$a)),
-          (CVT_u32_f16 Int16Regs:$a, CvtRZI)>;
+          (CVT_u32_f16 $a, CvtRZI)>;
 def : Pat<(i64 (fp_to_uint f16:$a)),
-          (CVT_u64_f16 Int16Regs:$a, CvtRZI)>;
+          (CVT_u64_f16 $a, CvtRZI)>;
 
 // bf16 -> sint
 def : Pat<(i1 (fp_to_sint bf16:$a)),
-          (SETP_b16ri Int16Regs:$a, 0, CmpEQ)>;
+          (SETP_b16ri $a, 0, CmpEQ)>;
 def : Pat<(i16 (fp_to_sint bf16:$a)),
-          (CVT_s16_bf16 Int16Regs:$a, CvtRZI)>;
+          (CVT_s16_bf16 $a, CvtRZI)>;
 def : Pat<(i32 (fp_to_sint bf16:$a)),
-          (CVT_s32_bf16 Int16Regs:$a, CvtRZI)>;
+          (CVT_s32_bf16 $a, CvtRZI)>;
 def : Pat<(i64 (fp_to_sint bf16:$a)),
-          (CVT_s64_bf16 Int16Regs:$a, CvtRZI)>;
+          (CVT_s64_bf16 $a, CvtRZI)>;
 
 // bf16 -> uint
 def : Pat<(i1 (fp_to_uint bf16:$a)),
-          (SETP_b16ri Int16Regs:$a, 0, CmpEQ)>;
+          (SETP_b16ri $a, 0, CmpEQ)>;
 def : Pat<(i16 (fp_to_uint bf16:$a)),
-          (CVT_u16_bf16 Int16Regs:$a, CvtRZI)>;
+          (CVT_u16_bf16 $a, CvtRZI)>;
 def : Pat<(i32 (fp_to_uint bf16:$a)),
-          (CVT_u32_bf16 Int16Regs:$a, CvtRZI)>;
+          (CVT_u32_bf16 $a, CvtRZI)>;
 def : Pat<(i64 (fp_to_uint bf16:$a)),
-          (CVT_u64_bf16 Int16Regs:$a, CvtRZI)>;
+          (CVT_u64_bf16 $a, CvtRZI)>;
 // f32 -> sint
 def : Pat<(i1 (fp_to_sint f32:$a)),
-          (SETP_b32ri (BITCONVERT_32_F2I Float32Regs:$a), 0, CmpEQ)>;
+          (SETP_b32ri (BITCONVERT_32_F2I $a), 0, CmpEQ)>;
 def : Pat<(i16 (fp_to_sint f32:$a)),
-          (CVT_s16_f32 Float32Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>;
+          (CVT_s16_f32 $a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>;
 def : Pat<(i16 (fp_to_sint f32:$a)),
-          (CVT_s16_f32 Float32Regs:$a, CvtRZI)>;
+          (CVT_s16_f32 $a, CvtRZI)>;
 def : Pat<(i32 (fp_to_sint f32:$a)),
-          (CVT_s32_f32 Float32Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>;
+          (CVT_s32_f32 $a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>;
 def : Pat<(i32 (fp_to_sint f32:$a)),
-          (CVT_s32_f32 Float32Regs:$a, CvtRZI)>;
+          (CVT_s32_f32 $a, CvtRZI)>;
 def : Pat<(i64 (fp_to_sint f32:$a)),
-          (CVT_s64_f32 Float32Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>;
+          (CVT_s64_f32 $a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>;
 def : Pat<(i64 (fp_to_sint f32:$a)),
-          (CVT_s64_f32 Float32Regs:$a, CvtRZI)>;
+          (CVT_s64_f32 $a, CvtRZI)>;
 
 // f32 -> uint
 def : Pat<(i1 (fp_to_uint f32:$a)),
-          (SETP_b32ri (BITCONVERT_32_F2I Float32Regs:$a), 0, CmpEQ)>;
+          (SETP_b32ri (BITCONVERT_32_F2I $a), 0, CmpEQ)>;
 def : Pat<(i16 (fp_to_uint f32:$a)),
-          (CVT_u16_f32 Float32Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>;
+          (CVT_u16_f32 $a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>;
 def : Pat<(i16 (fp_to_uint f32:$a)),
-          (CVT_u16_f32 Float32Regs:$a, CvtRZI)>;
+          (CVT_u16_f32 $a, CvtRZI)>;
 def : Pat<(i32 (fp_to_uint f32:$a)),
-          (CVT_u32_f32 Float32Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>;
+          (CVT_u32_f32 $a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>;
 def : Pat<(i32 (fp_to_uint f32:$a)),
-          (CVT_u32_f32 Float32Regs:$a, CvtRZI)>;
+          (CVT_u32_f32 $a, CvtRZI)>;
 def : Pat<(i64 (fp_to_uint f32:$a)),
-          (CVT_u64_f32 Float32Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>;
+          (CVT_u64_f32 $a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>;
 def : Pat<(i64 (fp_to_uint f32:$a)),
-          (CVT_u64_f32 Float32Regs:$a, CvtRZI)>;
+          (CVT_u64_f32 $a, CvtRZI)>;
 
 // f64 -> sint
 def : Pat<(i1 (fp_to_sint f64:$a)),
-          (SETP_b64ri (BITCONVERT_64_F2I Float64Regs:$a), 0, CmpEQ)>;
+          (SETP_b64ri (BITCONVERT_64_F2I $a), 0, CmpEQ)>;
 def : Pat<(i16 (fp_to_sint f64:$a)),
-          (CVT_s16_f64 Float64Regs:$a, CvtRZI)>;
+          (CVT_s16_f64 $a, CvtRZI)>;
 def : Pat<(i32 (fp_to_sint f64:$a)),
-          (CVT_s32_f64 Float64Regs:$a, CvtRZI)>;
+          (CVT_s32_f64 $a, CvtRZI)>;
 def : Pat<(i64 (fp_to_sint f64:$a)),
-          (CVT_s64_f64 Float64Regs:$a, CvtRZI)>;
+          (CVT_s64_f64 $a, CvtRZI)>;
 
 // f64 -> uint
 def : Pat<(i1 (fp_to_uint f64:$a)),
-          (SETP_b64ri (BITCONVERT_64_F2I Float64Regs:$a), 0, CmpEQ)>;
+          (SETP_b64ri (BITCONVERT_64_F2I $a), 0, CmpEQ)>;
 def : Pat<(i16 (fp_to_uint f64:$a)),
-          (CVT_u16_f64 Float64Regs:$a, CvtRZI)>;
+          (CVT_u16_f64 $a, CvtRZI)>;
 def : Pat<(i32 (fp_to_uint f64:$a)),
-          (CVT_u32_f64 Float64Regs:$a, CvtRZI)>;
+          (CVT_u32_f64 $a, CvtRZI)>;
 def : Pat<(i64 (fp_to_uint f64:$a)),
-          (CVT_u64_f64 Float64Regs:$a, CvtRZI)>;
+          (CVT_u64_f64 $a, CvtRZI)>;
 
 // sext i1
 def : Pat<(i16 (sext i1:$a)),
-          (SELP_s16ii -1, 0, Int1Regs:$a)>;
+          (SELP_s16ii -1, 0, $a)>;
 def : Pat<(i32 (sext i1:$a)),
-          (SELP_s32ii -1, 0, Int1Regs:$a)>;
+          (SELP_s32ii -1, 0, $a)>;
 def : Pat<(i64 (sext i1:$a)),
-          (SELP_s64ii -1, 0, Int1Regs:$a)>;
+          (SELP_s64ii -1, 0, $a)>;
 
 // zext i1
 def : Pat<(i16 (zext i1:$a)),
-          (SELP_u16ii 1, 0, Int1Regs:$a)>;
+          (SELP_u16ii 1, 0, $a)>;
 def : Pat<(i32 (zext i1:$a)),
-          (SELP_u32ii 1, 0, Int1Regs:$a)>;
+          (SELP_u32ii 1, 0, $a)>;
 def : Pat<(i64 (zext i1:$a)),
-          (SELP_u64ii 1, 0, Int1Regs:$a)>;
+          (SELP_u64ii 1, 0, $a)>;
 
 // anyext i1
 def : Pat<(i16 (anyext i1:$a)),
-          (SELP_u16ii -1, 0, Int1Regs:$a)>;
+          (SELP_u16ii -1, 0, $a)>;
 def : Pat<(i32 (anyext i1:$a)),
-          (SELP_u32ii -1, 0, Int1Regs:$a)>;
+          (SELP_u32ii -1, 0, $a)>;
 def : Pat<(i64 (anyext i1:$a)),
-          (SELP_u64ii -1, 0, Int1Regs:$a)>;
+          (SELP_u64ii -1, 0, $a)>;
 
 // sext i16
 def : Pat<(i32 (sext i16:$a)),
-          (CVT_s32_s16 Int16Regs:$a, CvtNONE)>;
+          (CVT_s32_s16 $a, CvtNONE)>;
 def : Pat<(i64 (sext i16:$a)),
-          (CVT_s64_s16 Int16Regs:$a, CvtNONE)>;
+          (CVT_s64_s16 $a, CvtNONE)>;
 
 // zext i16
 def : Pat<(i32 (zext i16:$a)),
-          (CVT_u32_u16 Int16Regs:$a, CvtNONE)>;
+          (CVT_u32_u16 $a, CvtNONE)>;
 def : Pat<(i64 (zext i16:$a)),
-          (CVT_u64_u16 Int16Regs:$a, CvtNONE)>;
+          (CVT_u64_u16 $a, CvtNONE)>;
 
 // anyext i16
 def : Pat<(i32 (anyext i16:$a)),
-          (CVT_u32_u16 Int16Regs:$a, CvtNONE)>;
+          (CVT_u32_u16 $a, CvtNONE)>;
 def : Pat<(i64 (anyext i16:$a)),
-          (CVT_u64_u16 Int16Regs:$a, CvtNONE)>;
+          (CVT_u64_u16 $a, CvtNONE)>;
 
 // sext i32
 def : Pat<(i64 (sext i32:$a)),
-          (CVT_s64_s32 Int32Regs:$a, CvtNONE)>;
+          (CVT_s64_s32 $a, CvtNONE)>;
 
 // zext i32
 def : Pat<(i64 (zext i32:$a)),
-          (CVT_u64_u32 Int32Regs:$a, CvtNONE)>;
+          (CVT_u64_u32 $a, CvtNONE)>;
 
 // anyext i32
 def : Pat<(i64 (anyext i32:$a)),
-          (CVT_u64_u32 Int32Regs:$a, CvtNONE)>;
+          (CVT_u64_u32 $a, CvtNONE)>;
 
 
 // truncate i64
 def : Pat<(i32 (trunc i64:$a)),
-          (CVT_u32_u64 Int64Regs:$a, CvtNONE)>;
+          (CVT_u32_u64 $a, CvtNONE)>;
 def : Pat<(i16 (trunc i64:$a)),
-          (CVT_u16_u64 Int64Regs:$a, CvtNONE)>;
+          (CVT_u16_u64 $a, CvtNONE)>;
 def : Pat<(i1 (trunc i64:$a)),
-          (SETP_b64ri (ANDb64ri Int64Regs:$a, 1), 1, CmpEQ)>;
+          (SETP_b64ri (ANDb64ri $a, 1), 1, CmpEQ)>;
 
 // truncate i32
 def : Pat<(i16 (trunc i32:$a)),
-          (CVT_u16_u32 Int32Regs:$a, CvtNONE)>;
+          (CVT_u16_u32 $a, CvtNONE)>;
 def : Pat<(i1 (trunc i32:$a)),
-          (SETP_b32ri (ANDb32ri Int32Regs:$a, 1), 1, CmpEQ)>;
+          (SETP_b32ri (ANDb32ri $a, 1), 1, CmpEQ)>;
 
 // truncate i16
 def : Pat<(i1 (trunc i16:$a)),
-          (SETP_b16ri (ANDb16ri Int16Regs:$a, 1), 1, CmpEQ)>;
+          (SETP_b16ri (ANDb16ri $a, 1), 1, CmpEQ)>;
 
 // sext_inreg
-def : Pat<(sext_inreg i16:$a, i8), (CVT_INREG_s16_s8 Int16Regs:$a)>;
-def : Pat<(sext_inreg i32:$a, i8), (CVT_INREG_s32_s8 Int32Regs:$a)>;
-def : Pat<(sext_inreg i32:$a, i16), (CVT_INREG_s32_s16 Int32Regs:$a)>;
-def : Pat<(sext_inreg i64:$a, i8), (CVT_INREG_s64_s8 Int64Regs:$a)>;
-def : Pat<(sext_inreg i64:$a, i16), (CVT_INREG_s64_s16 Int64Regs:$a)>;
-def : Pat<(sext_inreg i64:$a, i32), (CVT_INREG_s64_s32 Int64Regs:$a)>;
+def : Pat<(sext_inreg i16:$a, i8), (CVT_INREG_s16_s8 $a)>;
+def : Pat<(sext_inreg i32:$a, i8), (CVT_INREG_s32_s8 $a)>;
+def : Pat<(sext_inreg i32:$a, i16), (CVT_INREG_s32_s16 $a)>;
+def : Pat<(sext_inreg i64:$a, i8), (CVT_INREG_s64_s8 $a)>;
+def : Pat<(sext_inreg i64:$a, i16), (CVT_INREG_s64_s16 $a)>;
+def : Pat<(sext_inreg i64:$a, i32), (CVT_INREG_s64_s32 $a)>;
 
 
 // Select instructions with 32-bit predicates
 def : Pat<(select i32:$pred, i16:$a, i16:$b),
-          (SELP_b16rr Int16Regs:$a, Int16Regs:$b,
-          (SETP_b32ri (ANDb32ri Int32Regs:$pred, 1), 1, CmpEQ))>;
+          (SELP_b16rr $a, $b,
+          (SETP_b32ri (ANDb32ri $pred, 1), 1, CmpEQ))>;
 def : Pat<(select i32:$pred, i32:$a, i32:$b),
-          (SELP_b32rr Int32Regs:$a, Int32Regs:$b,
-          (SETP_b32ri (ANDb32ri Int32Regs:$pred, 1), 1, CmpEQ))>;
+          (SELP_b32rr $a, $b,
+          (SETP_b32ri (ANDb32ri $pred, 1), 1, CmpEQ))>;
 def : Pat<(select i32:$pred, i64:$a, i64:$b),
-          (SELP_b64rr Int64Regs:$a, Int64Regs:$b,
-          (SETP_b32ri (ANDb32ri Int32Regs:$pred, 1), 1, CmpEQ))>;
+          (SELP_b64rr $a, $b,
+          (SETP_b32ri (ANDb32ri $pred, 1), 1, CmpEQ))>;
 def : Pat<(select i32:$pred, f16:$a, f16:$b),
-          (SELP_f16rr Int16Regs:$a, Int16Regs:$b,
-          (SETP_b32ri (ANDb32ri Int32Regs:$pred, 1), 1, CmpEQ))>;
+          (SELP_f16rr $a, $b,
+          (SETP_b32ri (ANDb32ri $pred, 1), 1, CmpEQ))>;
 def : Pat<(select i32:$pred, bf16:$a, bf16:$b),
-          (SELP_bf16rr Int16Regs:$a, Int16Regs:$b,
-          (SETP_b32ri (ANDb32ri Int32Regs:$pred, 1), 1, CmpEQ))>;
+          (SELP_bf16rr $a, $b,
+          (SETP_b32ri (ANDb32ri $pred, 1), 1, CmpEQ))>;
 def : Pat<(select i32:$pred, f32:$a, f32:$b),
-          (SELP_f32rr Float32Regs:$a, Float32Regs:$b,
-          (SETP_b32ri (ANDb32ri Int32Regs:$pred, 1), 1, CmpEQ))>;
+          (SELP_f32rr $a, $b,
+          (SETP_b32ri (ANDb32ri $pred, 1), 1, CmpEQ))>;
 def : Pat<(select i32:$pred, f64:$a, f64:$b),
-          (SELP_f64rr Float64Regs:$a, Float64Regs:$b,
-          (SETP_b32ri (ANDb32ri Int32Regs:$pred, 1), 1, CmpEQ))>;
+          (SELP_f64rr $a, $b,
+          (SETP_b32ri (ANDb32ri $pred, 1), 1, CmpEQ))>;
 
 
 let hasSideEffects = false in {
@@ -3391,32 +3391,32 @@ let hasSideEffects = false in {
 // Using partial vectorized move produces better SASS code for extraction of
 // upper/lower parts of an integer.
 def : Pat<(i16 (trunc (srl i32:$s, (i32 16)))),
-          (I32toI16H Int32Regs:$s)>;
+          (I32toI16H $s)>;
 def : Pat<(i16 (trunc (sra i32:$s, (i32 16)))),
-          (I32toI16H Int32Regs:$s)>;
+          (I32toI16H $s)>;
 def : Pat<(i32 (trunc (srl i64:$s, (i32 32)))),
-          (I64toI32H Int64Regs:$s)>;
+          (I64toI32H $s)>;
 def : Pat<(i32 (trunc (sra i64:$s, (i32 32)))),
-          (I64toI32H Int64Regs:$s)>;
+          (I64toI32H $s)>;
 
 def: Pat<(i32 (sext (extractelt v2i16:$src, 0))),
-         (CVT_INREG_s32_s16 Int32Regs:$src)>;
+         (CVT_INREG_s32_s16 $src)>;
 
 foreach vt = [v2f16, v2bf16, v2i16] in {
 def : Pat<(extractelt vt:$src, 0),
-          (I32toI16L Int32Regs:$src)>;
+          (I32toI16L $src)>;
 def : Pat<(extractelt vt:$src, 1),
-          (I32toI16H Int32Regs:$src)>;
+          (I32toI16H $src)>;
 }
 def : Pat<(v2f16 (build_vector f16:$a, f16:$b)),
-          (V2I16toI32 Int16Regs:$a, Int16Regs:$b)>;
+          (V2I16toI32 $a, $b)>;
 def : Pat<(v2bf16 (build_vector bf16:$a, bf16:$b)),
-          (V2I16toI32 Int16Regs:$a, Int16Regs:$b)>;
+          (V2I16toI32 $a, $b)>;
 def : Pat<(v2i16 (build_vector i16:$a, i16:$b)),
-          (V2I16toI32 Int16Regs:$a, Int16Regs:$b)>;
+          (V2I16toI32 $a, $b)>;
 
 def: Pat<(v2i16 (scalar_to_vector i16:$a)),
-         (CVT_u32_u16 Int16Regs:$a, CvtNONE)>;
+         (CVT_u32_u16 $a, CvtNONE)>;
 
 //
 // Funnel-Shift
@@ -3455,13 +3455,13 @@ let hasSideEffects = false in {
 }
 
 def : Pat<(i32 (int_nvvm_fshl_clamp i32:$hi, i32:$lo, i32:$amt)),
-          (SHF_L_CLAMP_r Int32Regs:$lo, Int32Regs:$hi, Int32Regs:$amt)>;
+          (SHF_L_CLAMP_r $lo, $hi, $amt)>;
 def : Pat<(i32 (int_nvvm_fshl_clamp i32:$hi, i32:$lo, (i32 imm:$amt))),
-          (SHF_L_CLAMP_i Int32Regs:$lo, Int32Regs:$hi, imm:$amt)>;
+          (SHF_L_CLAMP_i $lo, $hi, imm:$amt)>;
 def : Pat<(i32 (int_nvvm_fshr_clamp i32:$hi, i32:$lo, i32:$amt)),
-          (SHF_R_CLAMP_r Int32Regs:$lo, Int32Regs:$hi, Int32Regs:$amt)>;
+          (SHF_R_CLAMP_r $lo, $hi, $amt)>;
 def : Pat<(i32 (int_nvvm_fshr_clamp i32:$hi, i32:$lo, (i32 imm:$amt))),
-          (SHF_R_CLAMP_i Int32Regs:$lo, Int32Regs:$hi, imm:$amt)>;
+          (SHF_R_CLAMP_i $lo, $hi, imm:$amt)>;
 
 // Count leading zeros
 let hasSideEffects = false in {
@@ -3472,14 +3472,14 @@ let hasSideEffects = false in {
 }
 
 // 32-bit has a direct PTX instruction
-def : Pat<(i32 (ctlz i32:$a)), (CLZr32 i32:$a)>;
+def : Pat<(i32 (ctlz i32:$a)), (CLZr32 $a)>;
 
 // The return type of the ctlz ISD node is the same as its input, but the PTX
 // ctz instruction always returns a 32-bit value.  For ctlz.i64, convert the
 // ptx value to 64 bits to match the ISD node's semantics, unless we know we're
 // truncating back down to 32 bits.
-def : Pat<(i64 (ctlz i64:$a)), (CVT_u64_u32 (CLZr64 Int64Regs:$a), CvtNONE)>;
-def : Pat<(i32 (trunc (i64 (ctlz i64:$a)))), (CLZr64 Int64Regs:$a)>;
+def : Pat<(i64 (ctlz i64:$a)), (CVT_u64_u32 (CLZr64 $a), CvtNONE)>;
+def : Pat<(i32 (trunc (i64 (ctlz i64:$a)))), (CLZr64 $a)>;
 
 // For 16-bit ctlz, we zero-extend to 32-bit, perform the count, then trunc the
 // result back to 16-bits if necessary.  We also need to subtract 16 because
@@ -3497,9 +3497,9 @@ def : Pat<(i32 (trunc (i64 (ctlz i64:$a)))), (CLZr64 Int64Regs:$a)>;
 // "mov b32reg, {b16imm, b16reg}", so we don't do this optimization.
 def : Pat<(i16 (ctlz i16:$a)),
           (SUBi16ri (CVT_u16_u32
-           (CLZr32 (CVT_u32_u16 Int16Regs:$a, CvtNONE)), CvtNONE), 16)>;
+           (CLZr32 (CVT_u32_u16 $a, CvtNONE)), CvtNONE), 16)>;
 def : Pat<(i32 (zext (i16 (ctlz i16:$a)))),
-          (SUBi32ri (CLZr32 (CVT_u32_u16 Int16Regs:$a, CvtNONE)), 16)>;
+          (SUBi32ri (CLZr32 (CVT_u32_u16 $a, CvtNONE)), 16)>;
 
 // Population count
 let hasSideEffects = false in {
@@ -3510,67 +3510,67 @@ let hasSideEffects = false in {
 }
 
 // 32-bit has a direct PTX instruction
-def : Pat<(i32 (ctpop i32:$a)), (POPCr32 Int32Regs:$a)>;
+def : Pat<(i32 (ctpop i32:$a)), (POPCr32 $a)>;
 
 // For 64-bit, the result in PTX is actually 32-bit so we zero-extend to 64-bit
 // to match the LLVM semantics.  Just as with ctlz.i64, we provide a second
 // pattern that avoids the type conversion if we're truncating the result to
 // i32 anyway.
-def : Pat<(ctpop i64:$a), (CVT_u64_u32 (POPCr64 Int64Regs:$a), CvtNONE)>;
-def : Pat<(i32 (trunc (i64 (ctpop i64:$a)))), (POPCr64 Int64Regs:$a)>;
+def : Pat<(ctpop i64:$a), (CVT_u64_u32 (POPCr64 $a), CvtNONE)>;
+def : Pat<(i32 (trunc (i64 (ctpop i64:$a)))), (POPCr64 $a)>;
 
 // For 16-bit, we zero-extend to 32-bit, then trunc the result back to 16-bits.
 // If we know that we're storing into an i32, we can avoid the final trunc.
 def : Pat<(ctpop i16:$a),
-          (CVT_u16_u32 (POPCr32 (CVT_u32_u16 Int16Regs:$a, CvtNONE)), CvtNONE)>;
+          (CVT_u16_u32 (POPCr32 (CVT_u32_u16 $a, CvtNONE)), CvtNONE)>;
 def : Pat<(i32 (zext (i16 (ctpop i16:$a)))),
-          (POPCr32 (CVT_u32_u16 Int16Regs:$a, CvtNONE))>;
+          (POPCr32 (CVT_u32_u16 $a, CvtNONE))>;
 
 // fpround f32 -> f16
 def : Pat<(f16 (fpround f32:$a)),
-          (CVT_f16_f32 Float32Regs:$a, CvtRN)>;
+          (CVT_f16_f32 $a, CvtRN)>;
 
 // fpround f32 -> bf16
 def : Pat<(bf16 (fpround f32:$a)),
-          (CVT_bf16_f32 Float32Regs:$a, CvtRN)>, Requires<[hasPTX<70>, hasSM<80>]>;
+          (CVT_bf16_f32 $a, CvtRN)>, Requires<[hasPTX<70>, hasSM<80>]>;
 
 // fpround f64 -> f16
 def : Pat<(f16 (fpround f64:$a)),
-          (CVT_f16_f64 Float64Regs:$a, CvtRN)>;
+          (CVT_f16_f64 $a, CvtRN)>;
 
 // fpround f64 -> bf16
 def : Pat<(bf16 (fpround f64:$a)),
-          (CVT_bf16_f64 Float64Regs:$a, CvtRN)>, Requires<[hasPTX<78>, hasSM<90>]>;
+          (CVT_bf16_f64 $a, CvtRN)>, Requires<[hasPTX<78>, hasSM<90>]>;
 // fpround f64 -> f32
 def : Pat<(f32 (fpround f64:$a)),
-          (CVT_f32_f64 Float64Regs:$a, CvtRN_FTZ)>, Requires<[doF32FTZ]>;
+          (CVT_f32_f64 $a, CvtRN_FTZ)>, Requires<[doF32FTZ]>;
 def : Pat<(f32 (fpround f64:$a)),
-          (CVT_f32_f64 Float64Regs:$a, CvtRN)>;
+          (CVT_f32_f64 $a, CvtRN)>;
 
 // fpextend f16 -> f32
 def : Pat<(f32 (fpextend f16:$a)),
-          (CVT_f32_f16 Int16Regs:$a, CvtNONE_FTZ)>, Requires<[doF32FTZ]>;
+          (CVT_f32_f16 $a, CvtNONE_FTZ)>, Requires<[doF32FTZ]>;
 def : Pat<(f32 (fpextend f16:$a)),
-          (CVT_f32_f16 Int16Regs:$a, CvtNONE)>;
+          (CVT_f32_f16 $a, CvtNONE)>;
 // fpextend bf16 -> f32
 def : Pat<(f32 (fpextend bf16:$a)),
-          (CVT_f32_bf16 Int16Regs:$a, CvtNONE_FTZ)>, Requires<[doF32FTZ]>;
+          (CVT_f32_bf16 $a, CvtNONE_FTZ)>, Requires<[doF32FTZ]>;
 def : Pat<(f32 (fpextend bf16:$a)),
-          (CVT_f32_bf16 Int16Regs:$a, CvtNONE)>, Requires<[hasPTX<71>, hasSM<80>]>;
+          (CVT_f32_bf16 $a, CvtNONE)>, Requires<[hasPTX<71>, hasSM<80>]>;
 
 // fpextend f16 -> f64
 def : Pat<(f64 (fpextend f16:$a)),
-          (CVT_f64_f16 Int16Regs:$a, CvtNONE)>;
+          (CVT_f64_f16 $a, CvtNONE)>;
 
 // fpextend bf16 -> f64
 def : Pat<(f64 (fpextend bf16:$a)),
-          (CVT_f64_bf16 Int16Regs:$a, CvtNONE)>, Requires<[hasPTX<78>, hasSM<90>]>;
+          (CVT_f64_bf16 $a, CvtNONE)>, Requires<[hasPTX<78>, hasSM<90>]>;
 
 // fpextend f32 -> f64
 def : Pat<(f64 (fpextend f32:$a)),
-          (CVT_f64_f32 Float32Regs:$a, CvtNONE_FTZ)>, Requires<[doF32FTZ]>;
+          (CVT_f64_f32 $a, CvtNONE_FTZ)>, Requires<[doF32FTZ]>;
 def : Pat<(f64 (fpextend f32:$a)),
-          (CVT_f64_f32 Float32Regs:$a, CvtNONE)>;
+          (CVT_f64_f32 $a, CvtNONE)>;
 
 def retglue : SDNode<"NVPTXISD::RET_GLUE", SDTNone,
                      [SDNPHasChain, SDNPOptInGlue]>;
@@ -3579,15 +3579,15 @@ def retglue : SDNode<"NVPTXISD::RET_GLUE", SDTNone,
 
 multiclass CVT_ROUND<SDNode OpNode, PatLeaf Mode, PatLeaf ModeFTZ> {
   def : Pat<(OpNode f16:$a),
-            (CVT_f16_f16 Int16Regs:$a, Mode)>;
+            (CVT_f16_f16 $a, Mode)>;
   def : Pat<(OpNode bf16:$a),
-            (CVT_bf16_bf16 Int16Regs:$a, Mode)>;
+            (CVT_bf16_bf16 $a, Mode)>;
   def : Pat<(OpNode f32:$a),
-            (CVT_f32_f32 Float32Regs:$a, ModeFTZ)>, Requires<[doF32FTZ]>;
+            (CVT_f32_f32 $a, ModeFTZ)>, Requires<[doF32FTZ]>;
   def : Pat<(OpNode f32:$a),
-            (CVT_f32_f32 Float32Regs:$a, Mode)>, Requires<[doNoF32FTZ]>;
+            (CVT_f32_f32 $a, Mode)>, Requires<[doNoF32FTZ]>;
   def : Pat<(OpNode f64:$a),
-            (CVT_f64_f64 Float64Regs:$a, Mode)>;
+            (CVT_f64_f64 $a, Mode)>;
 }
 
 defm : CVT_ROUND<fceil, CvtRPI, CvtRPI_FTZ>;
@@ -3624,7 +3624,7 @@ let isTerminator=1 in {
 }
 
 def : Pat<(brcond i32:$a, bb:$target),
-          (CBranch (SETP_u32ri Int32Regs:$a, 0, CmpNE), bb:$target)>;
+          (CBranch (SETP_u32ri $a, 0, CmpNE), bb:$target)>;
 
 // SelectionDAGBuilder::visitSWitchCase() will invert the condition of a
 // conditional branch if the target block is the next block so that the code
@@ -3632,7 +3632,7 @@ def : Pat<(brcond i32:$a, bb:$target),
 // condition, 1', which will be translated to (setne condition, -1).  Since ptx
 // supports '@!pred bra target', we should use it.
 def : Pat<(brcond (i1 (setne i1:$a, -1)), bb:$target),
-          (CBranchOther i1:$a, bb:$target)>;
+          (CBranchOther $a, bb:$target)>;
 
 // Call
 def SDT_NVPTXCallSeqStart : SDCallSeqStart<[SDTCisVT<0, i32>,
@@ -3830,17 +3830,17 @@ include "NVPTXIntrinsics.td"
 
 def : Pat <
   (i32 (bswap i32:$a)),
-  (INT_NVVM_PRMT Int32Regs:$a, (i32 0), (i32 0x0123))>;
+  (INT_NVVM_PRMT $a, (i32 0), (i32 0x0123))>;
 
 def : Pat <
   (v2i16 (bswap v2i16:$a)),
-  (INT_NVVM_PRMT Int32Regs:$a, (i32 0), (i32 0x2301))>;
+  (INT_NVVM_PRMT $a, (i32 0), (i32 0x2301))>;
 
 def : Pat <
   (i64 (bswap i64:$a)),
   (V2I32toI64
-    (INT_NVVM_PRMT (I64toI32H Int64Regs:$a), (i32 0), (i32 0x0123)),
-    (INT_NVVM_PRMT (I64toI32L Int64Regs:$a), (i32 0), (i32 0x0123)))>;
+    (INT_NVVM_PRMT (I64toI32H $a), (i32 0), (i32 0x0123)),
+    (INT_NVVM_PRMT (I64toI32L $a), (i32 0), (i32 0x0123)))>;
 
 
 ////////////////////////////////////////////////////////////////////////////////
@@ -3910,18 +3910,18 @@ def FMARELU_BF16X2 : NVPTXInst_rrr<Int32Regs, "fma.rn.relu.bf16x2", [hasBF16Math
 
 // FTZ
 def : Pat<(f16 (NVPTX_fmaxnum_nsz (NVPTX_fma_oneuse_and_nnan f16:$a, f16:$b, f16:$c), fpimm_any_zero)),
-  (FMARELU_F16_FTZ Int16Regs:$a, Int16Regs:$b, Int16Regs:$c)>,
+  (FMARELU_F16_FTZ $a, $b, $c)>,
   Requires<[doF32FTZ]>;
 def : Pat<(v2f16 (NVPTX_fmaxnum_nsz (NVPTX_fma_oneuse_and_nnan v2f16:$a, v2f16:$b, v2f16:$c), fpimm_positive_zero_v2f16)),
-  (FMARELU_F16X2_FTZ Int32Regs:$a, Int32Regs:$b, Int32Regs:$c)>,
+  (FMARELU_F16X2_FTZ $a, $b, $c)>,
   Requires<[doF32FTZ]>;
 
 // NO FTZ
 def : Pat<(f16 (NVPTX_fmaxnum_nsz (NVPTX_fma_oneuse_and_nnan f16:$a, f16:$b, f16:$c), fpimm_any_zero)),
-  (FMARELU_F16 Int16Regs:$a, Int16Regs:$b, Int16Regs:$c)>;
+  (FMARELU_F16 $a, $b, $c)>;
 def : Pat<(bf16 (NVPTX_fmaxnum_nsz (NVPTX_fma_oneuse_and_nnan bf16:$a, bf16:$b, bf16:$c), fpimm_any_zero)),
-  (FMARELU_BF16 Int16Regs:$a, Int16Regs:$b, Int16Regs:$c)>;
+  (FMARELU_BF16 $a, $b, $c)>;
 def : Pat<(v2f16 (NVPTX_fmaxnum_nsz (NVPTX_fma_oneuse_and_nnan v2f16:$a, v2f16:$b, v2f16:$c), fpimm_positive_zero_v2f16)),
-  (FMARELU_F16X2 Int32Regs:$a, Int32Regs:$b, Int32Regs:$c)>;
+  (FMARELU_F16X2 $a, $b, $c)>;
 def : Pat<(v2bf16 (NVPTX_fmaxnum_nsz (NVPTX_fma_oneuse_and_nnan v2bf16:$a, v2bf16:$b, v2bf16:$c), fpimm_positive_zero_v2bf16)),
-  (FMARELU_BF16X2 Int32Regs:$a, Int32Regs:$b, Int32Regs:$c)>;
+  (FMARELU_BF16X2 $a, $b, $c)>;
diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
index 0773c1b..8ede1ec 100644
--- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
+++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
@@ -824,29 +824,29 @@ def MBARRIER_PENDING_COUNT :
 
 def : Pat<(int_nvvm_fmin_f immFloat1,
             (int_nvvm_fmax_f immFloat0, f32:$a)),
-          (CVT_f32_f32 Float32Regs:$a, CvtSAT)>;
+          (CVT_f32_f32 $a, CvtSAT)>;
 def : Pat<(int_nvvm_fmin_f immFloat1,
             (int_nvvm_fmax_f f32:$a, immFloat0)),
-          (CVT_f32_f32 Float32Regs:$a, CvtSAT)>;
+          (CVT_f32_f32 $a, CvtSAT)>;
 def : Pat<(int_nvvm_fmin_f
             (int_nvvm_fmax_f immFloat0, f32:$a), immFloat1),
-          (CVT_f32_f32 Float32Regs:$a, CvtSAT)>;
+          (CVT_f32_f32 $a, CvtSAT)>;
 def : Pat<(int_nvvm_fmin_f
             (int_nvvm_fmax_f f32:$a, immFloat0), immFloat1),
-          (CVT_f32_f32 Float32Regs:$a, CvtSAT)>;
+          (CVT_f32_f32 $a, CvtSAT)>;
 
 def : Pat<(int_nvvm_fmin_d immDouble1,
             (int_nvvm_fmax_d immDouble0, f64:$a)),
-          (CVT_f64_f64 Float64Regs:$a, CvtSAT)>;
+          (CVT_f64_f64 $a, CvtSAT)>;
 def : Pat<(int_nvvm_fmin_d immDouble1,
             (int_nvvm_fmax_d f64:$a, immDouble0)),
-          (CVT_f64_f64 Float64Regs:$a, CvtSAT)>;
+          (CVT_f64_f64 $a, CvtSAT)>;
 def : Pat<(int_nvvm_fmin_d
             (int_nvvm_fmax_d immDouble0, f64:$a), immDouble1),
-          (CVT_f64_f64 Float64Regs:$a, CvtSAT)>;
+          (CVT_f64_f64 $a, CvtSAT)>;
 def : Pat<(int_nvvm_fmin_d
             (int_nvvm_fmax_d f64:$a, immDouble0), immDouble1),
-          (CVT_f64_f64 Float64Regs:$a, CvtSAT)>;
+          (CVT_f64_f64 $a, CvtSAT)>;
 
 
 // We need a full string for OpcStr here because we need to deal with case like
@@ -1125,16 +1125,16 @@ def INT_NVVM_DIV_RP_D : F_MATH_2<"div.rp.f64 \t$dst, $src0, $src1;",
   Float64Regs, Float64Regs, Float64Regs, int_nvvm_div_rp_d>;
 
 def : Pat<(int_nvvm_div_full f32:$a, f32:$b),
-          (FDIV32rr Float32Regs:$a, Float32Regs:$b)>;
+          (FDIV32rr $a, $b)>;
 
 def : Pat<(int_nvvm_div_full f32:$a, fpimm:$b),
-          (FDIV32ri Float32Regs:$a, f32imm:$b)>;
+          (FDIV32ri $a, f32imm:$b)>;
 
 def : Pat<(int_nvvm_div_full_ftz f32:$a, f32:$b),
-          (FDIV32rr_ftz Float32Regs:$a, Float32Regs:$b)>;
+          (FDIV32rr_ftz $a, $b)>;
 
 def : Pat<(int_nvvm_div_full_ftz f32:$a, fpimm:$b),
-          (FDIV32ri_ftz Float32Regs:$a, f32imm:$b)>;
+          (FDIV32ri_ftz $a, f32imm:$b)>;
 
 //
 // Sad
@@ -1158,18 +1158,18 @@ def INT_NVVM_SAD_ULL : F_MATH_3<"sad.u64 \t$dst, $src0, $src1, $src2;",
 //
 
 def : Pat<(int_nvvm_floor_ftz_f f32:$a),
-          (CVT_f32_f32 Float32Regs:$a, CvtRMI_FTZ)>;
+          (CVT_f32_f32 $a, CvtRMI_FTZ)>;
 def : Pat<(int_nvvm_floor_f f32:$a),
-          (CVT_f32_f32 Float32Regs:$a, CvtRMI)>;
+          (CVT_f32_f32 $a, CvtRMI)>;
 def : Pat<(int_nvvm_floor_d f64:$a),
-          (CVT_f64_f64 Float64Regs:$a, CvtRMI)>;
+          (CVT_f64_f64 $a, CvtRMI)>;
 
 def : Pat<(int_nvvm_ceil_ftz_f f32:$a),
-          (CVT_f32_f32 Float32Regs:$a, CvtRPI_FTZ)>;
+          (CVT_f32_f32 $a, CvtRPI_FTZ)>;
 def : Pat<(int_nvvm_ceil_f f32:$a),
-          (CVT_f32_f32 Float32Regs:$a, CvtRPI)>;
+          (CVT_f32_f32 $a, CvtRPI)>;
 def : Pat<(int_nvvm_ceil_d f64:$a),
-          (CVT_f64_f64 Float64Regs:$a, CvtRPI)>;
+          (CVT_f64_f64 $a, CvtRPI)>;
 
 //
 // Abs
@@ -1217,33 +1217,33 @@ def INT_NVVM_NEG_BF16X2 : F_MATH_1<"neg.bf16x2 \t$dst, $src0;", Int32Regs,
 //
 
 def : Pat<(int_nvvm_round_ftz_f f32:$a),
-          (CVT_f32_f32 Float32Regs:$a, CvtRNI_FTZ)>;
+          (CVT_f32_f32 $a, CvtRNI_FTZ)>;
 def : Pat<(int_nvvm_round_f f32:$a),
-          (CVT_f32_f32 Float32Regs:$a, CvtRNI)>;
+          (CVT_f32_f32 $a, CvtRNI)>;
 def : Pat<(int_nvvm_round_d f64:$a),
-          (CVT_f64_f64 Float64Regs:$a, CvtRNI)>;
+          (CVT_f64_f64 $a, CvtRNI)>;
 
 //
 // Trunc
 //
 
 def : Pat<(int_nvvm_trunc_ftz_f f32:$a),
-          (CVT_f32_f32 Float32Regs:$a, CvtRZI_FTZ)>;
+          (CVT_f32_f32 $a, CvtRZI_FTZ)>;
 def : Pat<(int_nvvm_trunc_f f32:$a),
-          (CVT_f32_f32 Float32Regs:$a, CvtRZI)>;
+          (CVT_f32_f32 $a, CvtRZI)>;
 def : Pat<(int_nvvm_trunc_d f64:$a),
-          (CVT_f64_f64 Float64Regs:$a, CvtRZI)>;
+          (CVT_f64_f64 $a, CvtRZI)>;
 
 //
 // Saturate
 //
 
 def : Pat<(int_nvvm_saturate_ftz_f f32:$a),
-          (CVT_f32_f32 Float32Regs:$a, CvtSAT_FTZ)>;
+          (CVT_f32_f32 $a, CvtSAT_FTZ)>;
 def : Pat<(int_nvvm_saturate_f f32:$a),
-          (CVT_f32_f32 Float32Regs:$a, CvtSAT)>;
+          (CVT_f32_f32 $a, CvtSAT)>;
 def : Pat<(int_nvvm_saturate_d f64:$a),
-          (CVT_f64_f64 Float64Regs:$a, CvtSAT)>;
+          (CVT_f64_f64 $a, CvtSAT)>;
 
 //
 // Exp2  Log2
@@ -1430,13 +1430,13 @@ def INT_NVVM_SQRT_RP_D : F_MATH_1<"sqrt.rp.f64 \t$dst, $src0;", Float64Regs,
 
 // nvvm_sqrt intrinsic
 def : Pat<(int_nvvm_sqrt_f f32:$a),
-          (INT_NVVM_SQRT_RN_FTZ_F Float32Regs:$a)>, Requires<[doF32FTZ, do_SQRTF32_RN]>;
+          (INT_NVVM_SQRT_RN_FTZ_F $a)>, Requires<[doF32FTZ, do_SQRTF32_RN]>;
 def : Pat<(int_nvvm_sqrt_f f32:$a),
-          (INT_NVVM_SQRT_RN_F Float32Regs:$a)>, Requires<[do_SQRTF32_RN]>;
+          (INT_NVVM_SQRT_RN_F $a)>, Requires<[do_SQRTF32_RN]>;
 def : Pat<(int_nvvm_sqrt_f f32:$a),
-          (INT_NVVM_SQRT_APPROX_FTZ_F Float32Regs:$a)>, Requires<[doF32FTZ]>;
+          (INT_NVVM_SQRT_APPROX_FTZ_F $a)>, Requires<[doF32FTZ]>;
 def : Pat<(int_nvvm_sqrt_f f32:$a),
-          (INT_NVVM_SQRT_APPROX_F Float32Regs:$a)>;
+          (INT_NVVM_SQRT_APPROX_F $a)>;
 
 //
 // Rsqrt
@@ -1456,24 +1456,24 @@ def INT_NVVM_RSQRT_APPROX_D : F_MATH_1<"rsqrt.approx.f64 \t$dst, $src0;",
 
 // 1.0f / sqrt_approx -> rsqrt_approx
 def: Pat<(fdiv FloatConst1, (int_nvvm_sqrt_approx_f f32:$a)),
-         (INT_NVVM_RSQRT_APPROX_F Float32Regs:$a)>,
+         (INT_NVVM_RSQRT_APPROX_F $a)>,
          Requires<[doRsqrtOpt]>;
 def: Pat<(fdiv FloatConst1, (int_nvvm_sqrt_approx_ftz_f f32:$a)),
-         (INT_NVVM_RSQRT_APPROX_FTZ_F Float32Regs:$a)>,
+         (INT_NVVM_RSQRT_APPROX_FTZ_F $a)>,
          Requires<[doRsqrtOpt]>;
 // same for int_nvvm_sqrt_f when non-precision sqrt is requested
 def: Pat<(fdiv FloatConst1, (int_nvvm_sqrt_f f32:$a)),
-         (INT_NVVM_RSQRT_APPROX_F Float32Regs:$a)>,
+         (INT_NVVM_RSQRT_APPROX_F $a)>,
          Requires<[doRsqrtOpt, do_SQRTF32_APPROX, doNoF32FTZ]>;
 def: Pat<(fdiv FloatConst1, (int_nvvm_sqrt_f f32:$a)),
-         (INT_NVVM_RSQRT_APPROX_FTZ_F Float32Regs:$a)>,
+         (INT_NVVM_RSQRT_APPROX_FTZ_F $a)>,
          Requires<[doRsqrtOpt, do_SQRTF32_APPROX, doF32FTZ]>;
 
 def: Pat<(fdiv FloatConst1, (fsqrt f32:$a)),
-         (INT_NVVM_RSQRT_APPROX_F Float32Regs:$a)>,
+         (INT_NVVM_RSQRT_APPROX_F $a)>,
          Requires<[doRsqrtOpt, do_SQRTF32_APPROX, doNoF32FTZ]>;
 def: Pat<(fdiv FloatConst1, (fsqrt f32:$a)),
-         (INT_NVVM_RSQRT_APPROX_FTZ_F Float32Regs:$a)>,
+         (INT_NVVM_RSQRT_APPROX_FTZ_F $a)>,
          Requires<[doRsqrtOpt, do_SQRTF32_APPROX, doF32FTZ]>;
 //
 // Add
@@ -1529,136 +1529,136 @@ foreach t = [I32RT, I64RT] in {
 //
 
 def : Pat<(int_nvvm_d2f_rn_ftz f64:$a),
-          (CVT_f32_f64 Float64Regs:$a, CvtRN_FTZ)>;
+          (CVT_f32_f64 $a, CvtRN_FTZ)>;
 def : Pat<(int_nvvm_d2f_rn f64:$a),
-          (CVT_f32_f64 Float64Regs:$a, CvtRN)>;
+          (CVT_f32_f64 $a, CvtRN)>;
 def : Pat<(int_nvvm_d2f_rz_ftz f64:$a),
-          (CVT_f32_f64 Float64Regs:$a, CvtRZ_FTZ)>;
+          (CVT_f32_f64 $a, CvtRZ_FTZ)>;
 def : Pat<(int_nvvm_d2f_rz f64:$a),
-          (CVT_f32_f64 Float64Regs:$a, CvtRZ)>;
+          (CVT_f32_f64 $a, CvtRZ)>;
 def : Pat<(int_nvvm_d2f_rm_ftz f64:$a),
-          (CVT_f32_f64 Float64Regs:$a, CvtRM_FTZ)>;
+          (CVT_f32_f64 $a, CvtRM_FTZ)>;
 def : Pat<(int_nvvm_d2f_rm f64:$a),
-          (CVT_f32_f64 Float64Regs:$a, CvtRM)>;
+          (CVT_f32_f64 $a, CvtRM)>;
 def : Pat<(int_nvvm_d2f_rp_ftz f64:$a),
-          (CVT_f32_f64 Float64Regs:$a, CvtRP_FTZ)>;
+          (CVT_f32_f64 $a, CvtRP_FTZ)>;
 def : Pat<(int_nvvm_d2f_rp f64:$a),
-          (CVT_f32_f64 Float64Regs:$a, CvtRP)>;
+          (CVT_f32_f64 $a, CvtRP)>;
 
 def : Pat<(int_nvvm_d2i_rn f64:$a),
-          (CVT_s32_f64 Float64Regs:$a, CvtRNI)>;
+          (CVT_s32_f64 $a, CvtRNI)>;
 def : Pat<(int_nvvm_d2i_rz f64:$a),
-          (CVT_s32_f64 Float64Regs:$a, CvtRZI)>;
+          (CVT_s32_f64 $a, CvtRZI)>;
 def : Pat<(int_nvvm_d2i_rm f64:$a),
-          (CVT_s32_f64 Float64Regs:$a, CvtRMI)>;
+          (CVT_s32_f64 $a, CvtRMI)>;
 def : Pat<(int_nvvm_d2i_rp f64:$a),
-          (CVT_s32_f64 Float64Regs:$a, CvtRPI)>;
+          (CVT_s32_f64 $a, CvtRPI)>;
 
 def : Pat<(int_nvvm_d2ui_rn f64:$a),
-          (CVT_u32_f64 Float64Regs:$a, CvtRNI)>;
+          (CVT_u32_f64 $a, CvtRNI)>;
 def : Pat<(int_nvvm_d2ui_rz f64:$a),
-          (CVT_u32_f64 Float64Regs:$a, CvtRZI)>;
+          (CVT_u32_f64 $a, CvtRZI)>;
 def : Pat<(int_nvvm_d2ui_rm f64:$a),
-          (CVT_u32_f64 Float64Regs:$a, CvtRMI)>;
+          (CVT_u32_f64 $a, CvtRMI)>;
 def : Pat<(int_nvvm_d2ui_rp f64:$a),
-          (CVT_u32_f64 Float64Regs:$a, CvtRPI)>;
+          (CVT_u32_f64 $a, CvtRPI)>;
 
 def : Pat<(int_nvvm_i2d_rn i32:$a),
-          (CVT_f64_s32 Int32Regs:$a, CvtRN)>;
+          (CVT_f64_s32 $a, CvtRN)>;
 def : Pat<(int_nvvm_i2d_rz i32:$a),
-          (CVT_f64_s32 Int32Regs:$a, CvtRZ)>;
+          (CVT_f64_s32 $a, CvtRZ)>;
 def : Pat<(int_nvvm_i2d_rm i32:$a),
-          (CVT_f64_s32 Int32Regs:$a, CvtRM)>;
+          (CVT_f64_s32 $a, CvtRM)>;
 def : Pat<(int_nvvm_i2d_rp i32:$a),
-          (CVT_f64_s32 Int32Regs:$a, CvtRP)>;
+          (CVT_f64_s32 $a, CvtRP)>;
 
 def : Pat<(int_nvvm_ui2d_rn i32:$a),
-          (CVT_f64_u32 Int32Regs:$a, CvtRN)>;
+          (CVT_f64_u32 $a, CvtRN)>;
 def : Pat<(int_nvvm_ui2d_rz i32:$a),
-          (CVT_f64_u32 Int32Regs:$a, CvtRZ)>;
+          (CVT_f64_u32 $a, CvtRZ)>;
 def : Pat<(int_nvvm_ui2d_rm i32:$a),
-          (CVT_f64_u32 Int32Regs:$a, CvtRM)>;
+          (CVT_f64_u32 $a, CvtRM)>;
 def : Pat<(int_nvvm_ui2d_rp i32:$a),
-          (CVT_f64_u32 Int32Regs:$a, CvtRP)>;
+          (CVT_f64_u32 $a, CvtRP)>;
 
 def : Pat<(int_nvvm_f2i_rn_ftz f32:$a),
-          (CVT_s32_f32 Float32Regs:$a, CvtRNI_FTZ)>;
+          (CVT_s32_f32 $a, CvtRNI_FTZ)>;
 def : Pat<(int_nvvm_f2i_rn f32:$a),
-          (CVT_s32_f32 Float32Regs:$a, CvtRNI)>;
+          (CVT_s32_f32 $a, CvtRNI)>;
 def : Pat<(int_nvvm_f2i_rz_ftz f32:$a),
-          (CVT_s32_f32 Float32Regs:$a, CvtRZI_FTZ)>;
+          (CVT_s32_f32 $a, CvtRZI_FTZ)>;
 def : Pat<(int_nvvm_f2i_rz f32:$a),
-          (CVT_s32_f32 Float32Regs:$a, CvtRZI)>;
+          (CVT_s32_f32 $a, CvtRZI)>;
 def : Pat<(int_nvvm_f2i_rm_ftz f32:$a),
-          (CVT_s32_f32 Float32Regs:$a, CvtRMI_FTZ)>;
+          (CVT_s32_f32 $a, CvtRMI_FTZ)>;
 def : Pat<(int_nvvm_f2i_rm f32:$a),
-          (CVT_s32_f32 Float32Regs:$a, CvtRMI)>;
+          (CVT_s32_f32 $a, CvtRMI)>;
 def : Pat<(int_nvvm_f2i_rp_ftz f32:$a),
-          (CVT_s32_f32 Float32Regs:$a, CvtRPI_FTZ)>;
+          (CVT_s32_f32 $a, CvtRPI_FTZ)>;
 def : Pat<(int_nvvm_f2i_rp f32:$a),
-          (CVT_s32_f32 Float32Regs:$a, CvtRPI)>;
+          (CVT_s32_f32 $a, CvtRPI)>;
 
 def : Pat<(int_nvvm_f2ui_rn_ftz f32:$a),
-          (CVT_u32_f32 Float32Regs:$a, CvtRNI_FTZ)>;
+          (CVT_u32_f32 $a, CvtRNI_FTZ)>;
 def : Pat<(int_nvvm_f2ui_rn f32:$a),
-          (CVT_u32_f32 Float32Regs:$a, CvtRNI)>;
+          (CVT_u32_f32 $a, CvtRNI)>;
 def : Pat<(int_nvvm_f2ui_rz_ftz f32:$a),
-          (CVT_u32_f32 Float32Regs:$a, CvtRZI_FTZ)>;
+          (CVT_u32_f32 $a, CvtRZI_FTZ)>;
 def : Pat<(int_nvvm_f2ui_rz f32:$a),
-          (CVT_u32_f32 Float32Regs:$a, CvtRZI)>;
+          (CVT_u32_f32 $a, CvtRZI)>;
 def : Pat<(int_nvvm_f2ui_rm_ftz f32:$a),
-          (CVT_u32_f32 Float32Regs:$a, CvtRMI_FTZ)>;
+          (CVT_u32_f32 $a, CvtRMI_FTZ)>;
 def : Pat<(int_nvvm_f2ui_rm f32:$a),
-          (CVT_u32_f32 Float32Regs:$a, CvtRMI)>;
+          (CVT_u32_f32 $a, CvtRMI)>;
 def : Pat<(int_nvvm_f2ui_rp_ftz f32:$a),
-          (CVT_u32_f32 Float32Regs:$a, CvtRPI_FTZ)>;
+          (CVT_u32_f32 $a, CvtRPI_FTZ)>;
 def : Pat<(int_nvvm_f2ui_rp f32:$a),
-          (CVT_u32_f32 Float32Regs:$a, CvtRPI)>;
+          (CVT_u32_f32 $a, CvtRPI)>;
 
 def : Pat<(int_nvvm_i2f_rn i32:$a),
-          (CVT_f32_s32 Int32Regs:$a, CvtRN)>;
+          (CVT_f32_s32 $a, CvtRN)>;
 def : Pat<(int_nvvm_i2f_rz i32:$a),
-          (CVT_f32_s32 Int32Regs:$a, CvtRZ)>;
+          (CVT_f32_s32 $a, CvtRZ)>;
 def : Pat<(int_nvvm_i2f_rm i32:$a),
-          (CVT_f32_s32 Int32Regs:$a, CvtRM)>;
+          (CVT_f32_s32 $a, CvtRM)>;
 def : Pat<(int_nvvm_i2f_rp i32:$a),
-          (CVT_f32_s32 Int32Regs:$a, CvtRP)>;
+          (CVT_f32_s32 $a, CvtRP)>;
 
 def : Pat<(int_nvvm_ui2f_rn i32:$a),
-          (CVT_f32_u32 Int32Regs:$a, CvtRN)>;
+          (CVT_f32_u32 $a, CvtRN)>;
 def : Pat<(int_nvvm_ui2f_rz i32:$a),
-          (CVT_f32_u32 Int32Regs:$a, CvtRZ)>;
+          (CVT_f32_u32 $a, CvtRZ)>;
 def : Pat<(int_nvvm_ui2f_rm i32:$a),
-          (CVT_f32_u32 Int32Regs:$a, CvtRM)>;
+          (CVT_f32_u32 $a, CvtRM)>;
 def : Pat<(int_nvvm_ui2f_rp i32:$a),
-          (CVT_f32_u32 Int32Regs:$a, CvtRP)>;
+          (CVT_f32_u32 $a, CvtRP)>;
 
 def : Pat<(int_nvvm_ff2bf16x2_rn f32:$a, f32:$b),
-          (CVT_bf16x2_f32 Float32Regs:$a, Float32Regs:$b, CvtRN)>;
+          (CVT_bf16x2_f32 $a, $b, CvtRN)>;
 def : Pat<(int_nvvm_ff2bf16x2_rn_relu f32:$a, f32:$b),
-          (CVT_bf16x2_f32 Float32Regs:$a, Float32Regs:$b, CvtRN_RELU)>;
+          (CVT_bf16x2_f32 $a, $b, CvtRN_RELU)>;
 def : Pat<(int_nvvm_ff2bf16x2_rz f32:$a, f32:$b),
-          (CVT_bf16x2_f32 Float32Regs:$a, Float32Regs:$b, CvtRZ)>;
+          (CVT_bf16x2_f32 $a, $b, CvtRZ)>;
 def : Pat<(int_nvvm_ff2bf16x2_rz_relu f32:$a, f32:$b),
-          (CVT_bf16x2_f32 Float32Regs:$a, Float32Regs:$b, CvtRZ_RELU)>;
+          (CVT_bf16x2_f32 $a, $b, CvtRZ_RELU)>;
 
 def : Pat<(int_nvvm_ff2f16x2_rn f32:$a, f32:$b),
-          (CVT_f16x2_f32 Float32Regs:$a, Float32Regs:$b, CvtRN)>;
+          (CVT_f16x2_f32 $a, $b, CvtRN)>;
 def : Pat<(int_nvvm_ff2f16x2_rn_relu f32:$a, f32:$b),
-          (CVT_f16x2_f32 Float32Regs:$a, Float32Regs:$b, CvtRN_RELU)>;
+          (CVT_f16x2_f32 $a, $b, CvtRN_RELU)>;
 def : Pat<(int_nvvm_ff2f16x2_rz f32:$a, f32:$b),
-          (CVT_f16x2_f32 Float32Regs:$a, Float32Regs:$b, CvtRZ)>;
+          (CVT_f16x2_f32 $a, $b, CvtRZ)>;
 def : Pat<(int_nvvm_ff2f16x2_rz_relu f32:$a, f32:$b),
-          (CVT_f16x2_f32 Float32Regs:$a, Float32Regs:$b, CvtRZ_RELU)>;
+          (CVT_f16x2_f32 $a, $b, CvtRZ_RELU)>;
 
 def : Pat<(int_nvvm_f2bf16_rn f32:$a),
-          (CVT_bf16_f32 Float32Regs:$a, CvtRN)>;
+          (CVT_bf16_f32 $a, CvtRN)>;
 def : Pat<(int_nvvm_f2bf16_rn_relu f32:$a),
-          (CVT_bf16_f32 Float32Regs:$a, CvtRN_RELU)>;
+          (CVT_bf16_f32 $a, CvtRN_RELU)>;
 def : Pat<(int_nvvm_f2bf16_rz f32:$a),
-          (CVT_bf16_f32 Float32Regs:$a, CvtRZ)>;
+          (CVT_bf16_f32 $a, CvtRZ)>;
 def : Pat<(int_nvvm_f2bf16_rz_relu f32:$a),
-          (CVT_bf16_f32 Float32Regs:$a, CvtRZ_RELU)>;
+          (CVT_bf16_f32 $a, CvtRZ_RELU)>;
 
 def CVT_tf32_f32 :
    NVPTXInst<(outs Int32Regs:$dest), (ins Float32Regs:$a),
@@ -1682,125 +1682,125 @@ def INT_NVVM_D2I_HI : F_MATH_1<
   Int32Regs, Float64Regs, int_nvvm_d2i_hi>;
 
 def : Pat<(int_nvvm_f2ll_rn_ftz f32:$a),
-          (CVT_s64_f32 Float32Regs:$a, CvtRNI_FTZ)>;
+          (CVT_s64_f32 $a, CvtRNI_FTZ)>;
 def : Pat<(int_nvvm_f2ll_rn f32:$a),
-          (CVT_s64_f32 Float32Regs:$a, CvtRNI)>;
+          (CVT_s64_f32 $a, CvtRNI)>;
 def : Pat<(int_nvvm_f2ll_rz_ftz f32:$a),
-          (CVT_s64_f32 Float32Regs:$a, CvtRZI_FTZ)>;
+          (CVT_s64_f32 $a, CvtRZI_FTZ)>;
 def : Pat<(int_nvvm_f2ll_rz f32:$a),
-          (CVT_s64_f32 Float32Regs:$a, CvtRZI)>;
+          (CVT_s64_f32 $a, CvtRZI)>;
 def : Pat<(int_nvvm_f2ll_rm_ftz f32:$a),
-          (CVT_s64_f32 Float32Regs:$a, CvtRMI_FTZ)>;
+          (CVT_s64_f32 $a, CvtRMI_FTZ)>;
 def : Pat<(int_nvvm_f2ll_rm f32:$a),
-          (CVT_s64_f32 Float32Regs:$a, CvtRMI)>;
+          (CVT_s64_f32 $a, CvtRMI)>;
 def : Pat<(int_nvvm_f2ll_rp_ftz f32:$a),
-          (CVT_s64_f32 Float32Regs:$a, CvtRPI_FTZ)>;
+          (CVT_s64_f32 $a, CvtRPI_FTZ)>;
 def : Pat<(int_nvvm_f2ll_rp f32:$a),
-          (CVT_s64_f32 Float32Regs:$a, CvtRPI)>;
+          (CVT_s64_f32 $a, CvtRPI)>;
 
 def : Pat<(int_nvvm_f2ull_rn_ftz f32:$a),
-          (CVT_u64_f32 Float32Regs:$a, CvtRNI_FTZ)>;
+          (CVT_u64_f32 $a, CvtRNI_FTZ)>;
 def : Pat<(int_nvvm_f2ull_rn f32:$a),
-          (CVT_u64_f32 Float32Regs:$a, CvtRNI)>;
+          (CVT_u64_f32 $a, CvtRNI)>;
 def : Pat<(int_nvvm_f2ull_rz_ftz f32:$a),
-          (CVT_u64_f32 Float32Regs:$a, CvtRZI_FTZ)>;
+          (CVT_u64_f32 $a, CvtRZI_FTZ)>;
 def : Pat<(int_nvvm_f2ull_rz f32:$a),
-          (CVT_u64_f32 Float32Regs:$a, CvtRZI)>;
+          (CVT_u64_f32 $a, CvtRZI)>;
 def : Pat<(int_nvvm_f2ull_rm_ftz f32:$a),
-          (CVT_u64_f32 Float32Regs:$a, CvtRMI_FTZ)>;
+          (CVT_u64_f32 $a, CvtRMI_FTZ)>;
 def : Pat<(int_nvvm_f2ull_rm f32:$a),
-          (CVT_u64_f32 Float32Regs:$a, CvtRMI)>;
+          (CVT_u64_f32 $a, CvtRMI)>;
 def : Pat<(int_nvvm_f2ull_rp_ftz f32:$a),
-          (CVT_u64_f32 Float32Regs:$a, CvtRPI_FTZ)>;
+          (CVT_u64_f32 $a, CvtRPI_FTZ)>;
 def : Pat<(int_nvvm_f2ull_rp f32:$a),
-          (CVT_u64_f32 Float32Regs:$a, CvtRPI)>;
+          (CVT_u64_f32 $a, CvtRPI)>;
 
 def : Pat<(int_nvvm_d2ll_rn f64:$a),
-          (CVT_s64_f64 Float64Regs:$a, CvtRNI)>;
+          (CVT_s64_f64 $a, CvtRNI)>;
 def : Pat<(int_nvvm_d2ll_rz f64:$a),
-          (CVT_s64_f64 Float64Regs:$a, CvtRZI)>;
+          (CVT_s64_f64 $a, CvtRZI)>;
 def : Pat<(int_nvvm_d2ll_rm f64:$a),
-          (CVT_s64_f64 Float64Regs:$a, CvtRMI)>;
+          (CVT_s64_f64 $a, CvtRMI)>;
 def : Pat<(int_nvvm_d2ll_rp f64:$a),
-          (CVT_s64_f64 Float64Regs:$a, CvtRPI)>;
+          (CVT_s64_f64 $a, CvtRPI)>;
 
 def : Pat<(int_nvvm_d2ull_rn f64:$a),
-          (CVT_u64_f64 Float64Regs:$a, CvtRNI)>;
+          (CVT_u64_f64 $a, CvtRNI)>;
 def : Pat<(int_nvvm_d2ull_rz f64:$a),
-          (CVT_u64_f64 Float64Regs:$a, CvtRZI)>;
+          (CVT_u64_f64 $a, CvtRZI)>;
 def : Pat<(int_nvvm_d2ull_rm f64:$a),
-          (CVT_u64_f64 Float64Regs:$a, CvtRMI)>;
+          (CVT_u64_f64 $a, CvtRMI)>;
 def : Pat<(int_nvvm_d2ull_rp f64:$a),
-          (CVT_u64_f64 Float64Regs:$a, CvtRPI)>;
+          (CVT_u64_f64 $a, CvtRPI)>;
 
 def : Pat<(int_nvvm_ll2f_rn i64:$a),
-          (CVT_f32_s64 Int64Regs:$a, CvtRN)>;
+          (CVT_f32_s64 $a, CvtRN)>;
 def : Pat<(int_nvvm_ll2f_rz i64:$a),
-          (CVT_f32_s64 Int64Regs:$a, CvtRZ)>;
+          (CVT_f32_s64 $a, CvtRZ)>;
 def : Pat<(int_nvvm_ll2f_rm i64:$a),
-          (CVT_f32_s64 Int64Regs:$a, CvtRM)>;
+          (CVT_f32_s64 $a, CvtRM)>;
 def : Pat<(int_nvvm_ll2f_rp i64:$a),
-          (CVT_f32_s64 Int64Regs:$a, CvtRP)>;
+          (CVT_f32_s64 $a, CvtRP)>;
 
 def : Pat<(int_nvvm_ull2f_rn i64:$a),
-          (CVT_f32_u64 Int64Regs:$a, CvtRN)>;
+          (CVT_f32_u64 $a, CvtRN)>;
 def : Pat<(int_nvvm_ull2f_rz i64:$a),
-          (CVT_f32_u64 Int64Regs:$a, CvtRZ)>;
+          (CVT_f32_u64 $a, CvtRZ)>;
 def : Pat<(int_nvvm_ull2f_rm i64:$a),
-          (CVT_f32_u64 Int64Regs:$a, CvtRM)>;
+          (CVT_f32_u64 $a, CvtRM)>;
 def : Pat<(int_nvvm_ull2f_rp i64:$a),
-          (CVT_f32_u64 Int64Regs:$a, CvtRP)>;
+          (CVT_f32_u64 $a, CvtRP)>;
 
 def : Pat<(int_nvvm_ll2d_rn i64:$a),
-          (CVT_f64_s64 Int64Regs:$a, CvtRN)>;
+          (CVT_f64_s64 $a, CvtRN)>;
 def : Pat<(int_nvvm_ll2d_rz i64:$a),
-          (CVT_f64_s64 Int64Regs:$a, CvtRZ)>;
+          (CVT_f64_s64 $a, CvtRZ)>;
 def : Pat<(int_nvvm_ll2d_rm i64:$a),
-          (CVT_f64_s64 Int64Regs:$a, CvtRM)>;
+          (CVT_f64_s64 $a, CvtRM)>;
 def : Pat<(int_nvvm_ll2d_rp i64:$a),
-          (CVT_f64_s64 Int64Regs:$a, CvtRP)>;
+          (CVT_f64_s64 $a, CvtRP)>;
 
 def : Pat<(int_nvvm_ull2d_rn i64:$a),
-          (CVT_f64_u64 Int64Regs:$a, CvtRN)>;
+          (CVT_f64_u64 $a, CvtRN)>;
 def : Pat<(int_nvvm_ull2d_rz i64:$a),
-          (CVT_f64_u64 Int64Regs:$a, CvtRZ)>;
+          (CVT_f64_u64 $a, CvtRZ)>;
 def : Pat<(int_nvvm_ull2d_rm i64:$a),
-          (CVT_f64_u64 Int64Regs:$a, CvtRM)>;
+          (CVT_f64_u64 $a, CvtRM)>;
 def : Pat<(int_nvvm_ull2d_rp i64:$a),
-          (CVT_f64_u64 Int64Regs:$a, CvtRP)>;
+          (CVT_f64_u64 $a, CvtRP)>;
 
 
 def : Pat<(int_nvvm_f2h_rn_ftz f32:$a),
-          (CVT_f16_f32 Float32Regs:$a, CvtRN_FTZ)>;
+          (CVT_f16_f32 $a, CvtRN_FTZ)>;
 def : Pat<(int_nvvm_f2h_rn f32:$a),
-          (CVT_f16_f32 Float32Regs:$a, CvtRN)>;
+          (CVT_f16_f32 $a, CvtRN)>;
 
 def : Pat<(int_nvvm_ff_to_e4m3x2_rn f32:$a, f32:$b),
-          (CVT_e4m3x2_f32 Float32Regs:$a, Float32Regs:$b, CvtRN)>;
+          (CVT_e4m3x2_f32 $a, $b, CvtRN)>;
 def : Pat<(int_nvvm_ff_to_e4m3x2_rn_relu f32:$a, f32:$b),
-          (CVT_e4m3x2_f32 Float32Regs:$a, Float32Regs:$b, CvtRN_RELU)>;
+          (CVT_e4m3x2_f32 $a, $b, CvtRN_RELU)>;
 def : Pat<(int_nvvm_ff_to_e5m2x2_rn f32:$a, f32:$b),
-          (CVT_e5m2x2_f32 Float32Regs:$a, Float32Regs:$b, CvtRN)>;
+          (CVT_e5m2x2_f32 $a, $b, CvtRN)>;
 def : Pat<(int_nvvm_ff_to_e5m2x2_rn_relu f32:$a, f32:$b),
-          (CVT_e5m2x2_f32 Float32Regs:$a, Float32Regs:$b, CvtRN_RELU)>;
+          (CVT_e5m2x2_f32 $a, $b, CvtRN_RELU)>;
 
 def : Pat<(int_nvvm_f16x2_to_e4m3x2_rn Int32Regs:$a),
-          (CVT_e4m3x2_f16x2 Int32Regs:$a, CvtRN)>;
+          (CVT_e4m3x2_f16x2 $a, CvtRN)>;
 def : Pat<(int_nvvm_f16x2_to_e4m3x2_rn_relu Int32Regs:$a),
-          (CVT_e4m3x2_f16x2 Int32Regs:$a, CvtRN_RELU)>;
+          (CVT_e4m3x2_f16x2 $a, CvtRN_RELU)>;
 def : Pat<(int_nvvm_f16x2_to_e5m2x2_rn Int32Regs:$a),
-          (CVT_e5m2x2_f16x2 Int32Regs:$a, CvtRN)>;
+          (CVT_e5m2x2_f16x2 $a, CvtRN)>;
 def : Pat<(int_nvvm_f16x2_to_e5m2x2_rn_relu Int32Regs:$a),
-          (CVT_e5m2x2_f16x2 Int32Regs:$a, CvtRN_RELU)>;
+          (CVT_e5m2x2_f16x2 $a, CvtRN_RELU)>;
 
 def : Pat<(int_nvvm_e4m3x2_to_f16x2_rn Int16Regs:$a),
-          (CVT_f16x2_e4m3x2 Int16Regs:$a, CvtRN)>;
+          (CVT_f16x2_e4m3x2 $a, CvtRN)>;
 def : Pat<(int_nvvm_e4m3x2_to_f16x2_rn_relu Int16Regs:$a),
-          (CVT_f16x2_e4m3x2 Int16Regs:$a, CvtRN_RELU)>;
+          (CVT_f16x2_e4m3x2 $a, CvtRN_RELU)>;
 def : Pat<(int_nvvm_e5m2x2_to_f16x2_rn Int16Regs:$a),
-          (CVT_f16x2_e5m2x2 Int16Regs:$a, CvtRN)>;
+          (CVT_f16x2_e5m2x2 $a, CvtRN)>;
 def : Pat<(int_nvvm_e5m2x2_to_f16x2_rn_relu Int16Regs:$a),
-          (CVT_f16x2_e5m2x2 Int16Regs:$a, CvtRN_RELU)>;
+          (CVT_f16x2_e5m2x2 $a, CvtRN_RELU)>;
 
 //
 // FNS
@@ -1823,9 +1823,9 @@ def INT_FNS_rii : INT_FNS_MBO<(ins Int32Regs:$mask,    i32imm:$base,    i32imm:$
 def INT_FNS_irr : INT_FNS_MBO<(ins    i32imm:$mask, Int32Regs:$base, Int32Regs:$offset),
                      (int_nvvm_fns       imm:$mask, i32:$base, i32:$offset)>;
 def INT_FNS_iri : INT_FNS_MBO<(ins    i32imm:$mask, Int32Regs:$base,    i32imm:$offset),
-                     (int_nvvm_fns       imm:$mask, Int32Regs:$base,       imm:$offset)>;
+                     (int_nvvm_fns       imm:$mask, i32:$base,       imm:$offset)>;
 def INT_FNS_iir : INT_FNS_MBO<(ins    i32imm:$mask,    i32imm:$base, Int32Regs:$offset),
-                     (int_nvvm_fns       imm:$mask,       imm:$base, Int32Regs:$offset)>;
+                     (int_nvvm_fns       imm:$mask,       imm:$base, i32:$offset)>;
 def INT_FNS_iii : INT_FNS_MBO<(ins    i32imm:$mask,    i32imm:$base,    i32imm:$offset),
                      (int_nvvm_fns       imm:$mask,       imm:$base,       imm:$offset)>;
 
@@ -2796,10 +2796,10 @@ defm cvta_to_const  : G_TO_NG<"const">;
 defm cvta_param : NG_TO_G<"param">;
 
 def : Pat<(int_nvvm_ptr_param_to_gen i32:$src),
-          (cvta_param Int32Regs:$src)>;
+          (cvta_param $src)>;
 
 def : Pat<(int_nvvm_ptr_param_to_gen i64:$src),
-          (cvta_param_64 Int64Regs:$src)>;
+          (cvta_param_64 $src)>;
 
 // nvvm.ptr.gen.to.param
 def : Pat<(int_nvvm_ptr_gen_to_param i32:$src),
@@ -2933,8 +2933,8 @@ def : Pat<(int_nvvm_read_ptx_sreg_envreg31), (MOV_SPECIAL ENVREG31)>;
 
 
 def : Pat<(int_nvvm_swap_lo_hi_b64 i64:$src),
-          (V2I32toI64 (I64toI32H Int64Regs:$src),
-                      (I64toI32L Int64Regs:$src))> ;
+          (V2I32toI64 (I64toI32H $src),
+                      (I64toI32L $src))> ;
 
 //-----------------------------------
 // Texture Intrinsics
@@ -5040,21 +5040,21 @@ def TXQ_NUM_MIPMAP_LEVELS_I
 }
 
 def : Pat<(int_nvvm_txq_channel_order i64:$a),
-          (TXQ_CHANNEL_ORDER_R i64:$a)>;
+          (TXQ_CHANNEL_ORDER_R $a)>;
 def : Pat<(int_nvvm_txq_channel_data_type i64:$a),
-          (TXQ_CHANNEL_DATA_TYPE_R i64:$a)>;
+          (TXQ_CHANNEL_DATA_TYPE_R $a)>;
 def : Pat<(int_nvvm_txq_width i64:$a),
-          (TXQ_WIDTH_R i64:$a)>;
+          (TXQ_WIDTH_R $a)>;
 def : Pat<(int_nvvm_txq_height i64:$a),
-          (TXQ_HEIGHT_R i64:$a)>;
+          (TXQ_HEIGHT_R $a)>;
 def : Pat<(int_nvvm_txq_depth i64:$a),
-          (TXQ_DEPTH_R i64:$a)>;
+          (TXQ_DEPTH_R $a)>;
 def : Pat<(int_nvvm_txq_array_size i64:$a),
-          (TXQ_ARRAY_SIZE_R i64:$a)>;
+          (TXQ_ARRAY_SIZE_R $a)>;
 def : Pat<(int_nvvm_txq_num_samples i64:$a),
-          (TXQ_NUM_SAMPLES_R i64:$a)>;
+          (TXQ_NUM_SAMPLES_R $a)>;
 def : Pat<(int_nvvm_txq_num_mipmap_levels i64:$a),
-          (TXQ_NUM_MIPMAP_LEVELS_R i64:$a)>;
+          (TXQ_NUM_MIPMAP_LEVELS_R $a)>;
 
 
 //-----------------------------------
@@ -5113,17 +5113,17 @@ def SUQ_ARRAY_SIZE_I
 }
 
 def : Pat<(int_nvvm_suq_channel_order i64:$a),
-          (SUQ_CHANNEL_ORDER_R Int64Regs:$a)>;
+          (SUQ_CHANNEL_ORDER_R $a)>;
 def : Pat<(int_nvvm_suq_channel_data_type i64:$a),
-          (SUQ_CHANNEL_DATA_TYPE_R Int64Regs:$a)>;
+          (SUQ_CHANNEL_DATA_TYPE_R $a)>;
 def : Pat<(int_nvvm_suq_width i64:$a),
-          (SUQ_WIDTH_R Int64Regs:$a)>;
+          (SUQ_WIDTH_R $a)>;
 def : Pat<(int_nvvm_suq_height i64:$a),
-          (SUQ_HEIGHT_R Int64Regs:$a)>;
+          (SUQ_HEIGHT_R $a)>;
 def : Pat<(int_nvvm_suq_depth i64:$a),
-          (SUQ_DEPTH_R Int64Regs:$a)>;
+          (SUQ_DEPTH_R $a)>;
 def : Pat<(int_nvvm_suq_array_size i64:$a),
-          (SUQ_ARRAY_SIZE_R Int64Regs:$a)>;
+          (SUQ_ARRAY_SIZE_R $a)>;
 
 
 //===- Handle Query -------------------------------------------------------===//
diff --git a/llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp b/llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp
index 42043ad..74ce6a9 100644
--- a/llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp
@@ -34,19 +34,18 @@ void NVPTXSubtarget::anchor() {}
 
 NVPTXSubtarget &NVPTXSubtarget::initializeSubtargetDependencies(StringRef CPU,
                                                                 StringRef FS) {
-    // Provide the default CPU if we don't have one.
-    TargetName = std::string(CPU.empty() ? "sm_30" : CPU);
+  TargetName = std::string(CPU);
 
-    ParseSubtargetFeatures(TargetName, /*TuneCPU*/ TargetName, FS);
+  ParseSubtargetFeatures(getTargetName(), /*TuneCPU=*/getTargetName(), FS);
 
-    // Re-map SM version numbers, SmVersion carries the regular SMs which do
-    // have relative order, while FullSmVersion allows distinguishing sm_90 from
-    // sm_90a, which would *not* be a subset of sm_91.
-    SmVersion = getSmVersion();
+  // Re-map SM version numbers, SmVersion carries the regular SMs which do
+  // have relative order, while FullSmVersion allows distinguishing sm_90 from
+  // sm_90a, which would *not* be a subset of sm_91.
+  SmVersion = getSmVersion();
 
-    // Set default to PTX 6.0 (CUDA 9.0)
-    if (PTXVersion == 0) {
-      PTXVersion = 60;
+  // Set default to PTX 6.0 (CUDA 9.0)
+  if (PTXVersion == 0) {
+    PTXVersion = 60;
   }
 
   return *this;
diff --git a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h
index 7555a23..bbc1cca 100644
--- a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h
+++ b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h
@@ -111,7 +111,12 @@ public:
   // - 0 represents base GPU model,
   // - non-zero value identifies particular architecture-accelerated variant.
   bool hasAAFeatures() const { return getFullSmVersion() % 10; }
-  std::string getTargetName() const { return TargetName; }
+
+  // If the user did not provide a target we default to the `sm_30` target.
+  std::string getTargetName() const {
+    return TargetName.empty() ? "sm_30" : TargetName;
+  }
+  bool hasTargetName() const { return !TargetName.empty(); }
 
   // Get maximum value of required alignments among the supported data types.
   // From the PTX ISA doc, section 8.2.3:
diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp b/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp
index b3b2880..6d4b82a 100644
--- a/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp
@@ -255,7 +255,10 @@ void NVPTXTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) {
   PB.registerPipelineStartEPCallback(
       [this](ModulePassManager &PM, OptimizationLevel Level) {
         FunctionPassManager FPM;
-        FPM.addPass(NVVMReflectPass(Subtarget.getSmVersion()));
+        // We do not want to fold out calls to nvvm.reflect early if the user
+        // has not provided a target architecture just yet.
+        if (Subtarget.hasTargetName())
+          FPM.addPass(NVVMReflectPass(Subtarget.getSmVersion()));
         // Note: NVVMIntrRangePass was causing numerical discrepancies at one
         // point, if issues crop up, consider disabling.
         FPM.addPass(NVVMIntrRangePass());
diff --git a/llvm/lib/Target/NVPTX/NVPTXUtilities.cpp b/llvm/lib/Target/NVPTX/NVPTXUtilities.cpp
index 98bffd9..0f2bec7 100644
--- a/llvm/lib/Target/NVPTX/NVPTXUtilities.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXUtilities.cpp
@@ -311,11 +311,13 @@ std::optional<unsigned> getMaxNReg(const Function &F) {
 }
 
 bool isKernelFunction(const Function &F) {
+  if (F.getCallingConv() == CallingConv::PTX_Kernel)
+    return true;
+
   if (const auto X = findOneNVVMAnnotation(&F, "kernel"))
     return (*X == 1);
 
-  // There is no NVVM metadata, check the calling convention
-  return F.getCallingConv() == CallingConv::PTX_Kernel;
+  return false;
 }
 
 MaybeAlign getAlign(const Function &F, unsigned Index) {
diff --git a/llvm/lib/Target/NVPTX/NVVMReflect.cpp b/llvm/lib/Target/NVPTX/NVVMReflect.cpp
index 56525a1..0cd584c 100644
--- a/llvm/lib/Target/NVPTX/NVVMReflect.cpp
+++ b/llvm/lib/Target/NVPTX/NVVMReflect.cpp
@@ -21,6 +21,7 @@
 #include "NVPTX.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/Analysis/ConstantFolding.h"
+#include "llvm/CodeGen/CommandFlags.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/DerivedTypes.h"
 #include "llvm/IR/Function.h"
@@ -219,7 +220,13 @@ bool NVVMReflect::runOnFunction(Function &F) {
   return runNVVMReflect(F, SmVersion);
 }
 
-NVVMReflectPass::NVVMReflectPass() : NVVMReflectPass(0) {}
+NVVMReflectPass::NVVMReflectPass() {
+  // Get the CPU string from the command line if not provided.
+  std::string MCPU = codegen::getMCPU();
+  StringRef SM = MCPU;
+  if (!SM.consume_front("sm_") || SM.consumeInteger(10, SmVersion))
+    SmVersion = 0;
+}
 
 PreservedAnalyses NVVMReflectPass::run(Function &F,
                                        FunctionAnalysisManager &AM) {
diff --git a/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp b/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp
index 073857e..162d110 100644
--- a/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp
+++ b/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp
@@ -2230,10 +2230,6 @@ void PPCLinuxAsmPrinter::emitFunctionBodyEnd() {
 
 void PPCAIXAsmPrinter::emitLinkage(const GlobalValue *GV,
                                    MCSymbol *GVSym) const {
-
-  assert(MAI->hasVisibilityOnlyWithLinkage() &&
-         "AIX's linkage directives take a visibility setting.");
-
   MCSymbolAttr LinkageAttr = MCSA_Invalid;
   switch (GV->getLinkage()) {
   case GlobalValue::ExternalLinkage:
@@ -3251,9 +3247,15 @@ void PPCAIXAsmPrinter::emitInstruction(const MachineInstr *MI) {
 
 bool PPCAIXAsmPrinter::doFinalization(Module &M) {
   // Do streamer related finalization for DWARF.
-  if (!MAI->usesDwarfFileAndLocDirectives() && hasDebugInfo())
-    OutStreamer->doFinalizationAtSectionEnd(
-        OutStreamer->getContext().getObjectFileInfo()->getTextSection());
+  if (hasDebugInfo()) {
+    // Emit section end. This is used to tell the debug line section where the
+    // end is for a text section if we don't use .loc to represent the debug
+    // line.
+    auto *Sec = OutContext.getObjectFileInfo()->getTextSection();
+    OutStreamer->switchSectionNoPrint(Sec);
+    MCSymbol *Sym = Sec->getEndSymbol(OutContext);
+    OutStreamer->emitLabel(Sym);
+  }
 
   for (MCSymbol *Sym : ExtSymSDNodeSymbols)
     OutStreamer->emitSymbolAttribute(Sym, MCSA_Extern);
diff --git a/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp b/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
index 2e0ee59..d1daf7c 100644
--- a/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
@@ -5473,10 +5473,10 @@ void PPCDAGToDAGISel::Select(SDNode *N) {
     // generate secure plt code for TLS symbols.
     getGlobalBaseReg();
   } break;
-  case PPCISD::CALL: {
-    if (PPCLowering->getPointerTy(CurDAG->getDataLayout()) != MVT::i32 ||
-        !TM.isPositionIndependent() || !Subtarget->isSecurePlt() ||
-        !Subtarget->isTargetELF())
+  case PPCISD::CALL:
+  case PPCISD::CALL_RM: {
+    if (Subtarget->isPPC64() || !TM.isPositionIndependent() ||
+        !Subtarget->isSecurePlt() || !Subtarget->isTargetELF())
       break;
 
     SDValue Op = N->getOperand(1);
@@ -5489,8 +5489,7 @@ void PPCDAGToDAGISel::Select(SDNode *N) {
       if (ES->getTargetFlags() == PPCII::MO_PLT)
         getGlobalBaseReg();
     }
-  }
-    break;
+  } break;
 
   case PPCISD::GlobalBaseReg:
     ReplaceNode(N, getGlobalBaseReg());
diff --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp b/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp
index 44f6db5..fa45a7f 100644
--- a/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp
+++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp
@@ -643,8 +643,8 @@ bool PPCInstrInfo::shouldReduceRegisterPressure(
   };
 
   // For now we only care about float and double type fma.
-  unsigned VSSRCLimit = TRI->getRegPressureSetLimit(
-      *MBB->getParent(), PPC::RegisterPressureSets::VSSRC);
+  unsigned VSSRCLimit =
+      RegClassInfo->getRegPressureSetLimit(PPC::RegisterPressureSets::VSSRC);
 
   // Only reduce register pressure when pressure is high.
   return GetMBBPressure(MBB)[PPC::RegisterPressureSets::VSSRC] >
diff --git a/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp b/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp
index 9dcf2e9..2205c67 100644
--- a/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp
+++ b/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp
@@ -734,6 +734,16 @@ public:
            VK == RISCVMCExpr::VK_RISCV_None;
   }
 
+  bool isUImm5GT3() const {
+    if (!isImm())
+      return false;
+    RISCVMCExpr::VariantKind VK = RISCVMCExpr::VK_RISCV_None;
+    int64_t Imm;
+    bool IsConstantImm = evaluateConstantImm(getImm(), Imm, VK);
+    return IsConstantImm && isUInt<5>(Imm) && (Imm > 3) &&
+           VK == RISCVMCExpr::VK_RISCV_None;
+  }
+
   bool isUImm8GE32() const {
     int64_t Imm;
     RISCVMCExpr::VariantKind VK = RISCVMCExpr::VK_RISCV_None;
@@ -1520,6 +1530,8 @@ bool RISCVAsmParser::matchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
     return generateImmOutOfRangeError(Operands, ErrorInfo, 0, (1 << 5) - 1);
   case Match_InvalidUImm5NonZero:
     return generateImmOutOfRangeError(Operands, ErrorInfo, 1, (1 << 5) - 1);
+  case Match_InvalidUImm5GT3:
+    return generateImmOutOfRangeError(Operands, ErrorInfo, 4, (1 << 5) - 1);
   case Match_InvalidUImm6:
     return generateImmOutOfRangeError(Operands, ErrorInfo, 0, (1 << 6) - 1);
   case Match_InvalidUImm7:
@@ -1903,6 +1915,8 @@ ParseStatus RISCVAsmParser::parseCSRSystemRegister(OperandVector &Operands) {
         // Accept an immediate representing a named Sys Reg if it satisfies the
         // the required features.
         for (auto &Reg : Range) {
+          if (Reg.IsAltName || Reg.IsDeprecatedName)
+            continue;
           if (Reg.haveRequiredFeatures(STI->getFeatureBits()))
             return RISCVOperand::createSysReg(Reg.Name, S, Imm);
         }
@@ -1940,22 +1954,27 @@ ParseStatus RISCVAsmParser::parseCSRSystemRegister(OperandVector &Operands) {
       return ParseStatus::Failure;
 
     const auto *SysReg = RISCVSysReg::lookupSysRegByName(Identifier);
-    if (!SysReg)
-      SysReg = RISCVSysReg::lookupSysRegByAltName(Identifier);
-    if (!SysReg)
-      if ((SysReg = RISCVSysReg::lookupSysRegByDeprecatedName(Identifier)))
-        Warning(S, "'" + Identifier + "' is a deprecated alias for '" +
-                       SysReg->Name + "'");
-
-    // Accept a named Sys Reg if the required features are present.
+
     if (SysReg) {
+      if (SysReg->IsDeprecatedName) {
+        // Lookup the undeprecated name.
+        auto Range = RISCVSysReg::lookupSysRegByEncoding(SysReg->Encoding);
+        for (auto &Reg : Range) {
+          if (Reg.IsAltName || Reg.IsDeprecatedName)
+            continue;
+          Warning(S, "'" + Identifier + "' is a deprecated alias for '" +
+                         Reg.Name + "'");
+        }
+      }
+
+      // Accept a named Sys Reg if the required features are present.
       const auto &FeatureBits = getSTI().getFeatureBits();
       if (!SysReg->haveRequiredFeatures(FeatureBits)) {
         const auto *Feature = llvm::find_if(RISCVFeatureKV, [&](auto Feature) {
           return SysReg->FeaturesRequired[Feature.Value];
         });
         auto ErrorMsg = std::string("system register '") + SysReg->Name + "' ";
-        if (SysReg->isRV32Only && FeatureBits[RISCV::Feature64Bit]) {
+        if (SysReg->IsRV32Only && FeatureBits[RISCV::Feature64Bit]) {
           ErrorMsg += "is RV32 only";
           if (Feature != std::end(RISCVFeatureKV))
             ErrorMsg += " and ";
diff --git a/llvm/lib/Target/RISCV/CMakeLists.txt b/llvm/lib/Target/RISCV/CMakeLists.txt
index 4466164..98d3615 100644
--- a/llvm/lib/Target/RISCV/CMakeLists.txt
+++ b/llvm/lib/Target/RISCV/CMakeLists.txt
@@ -15,6 +15,7 @@ tablegen(LLVM RISCVGenRegisterBank.inc -gen-register-bank)
 tablegen(LLVM RISCVGenRegisterInfo.inc -gen-register-info)
 tablegen(LLVM RISCVGenSearchableTables.inc -gen-searchable-tables)
 tablegen(LLVM RISCVGenSubtargetInfo.inc -gen-subtarget)
+tablegen(LLVM RISCVGenExegesis.inc -gen-exegesis)
 
 set(LLVM_TARGET_DEFINITIONS RISCVGISel.td)
 tablegen(LLVM RISCVGenGlobalISel.inc -gen-global-isel)
diff --git a/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp b/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp
index 9901719..a490910 100644
--- a/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp
+++ b/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp
@@ -692,6 +692,14 @@ DecodeStatus RISCVDisassembler::getInstruction32(MCInst &MI, uint64_t &Size,
                         "Qualcomm uC Conditional Select custom opcode table");
   TRY_TO_DECODE_FEATURE(RISCV::FeatureVendorXqcilsm, DecoderTableXqcilsm32,
                         "Qualcomm uC Load Store Multiple custom opcode table");
+  TRY_TO_DECODE_FEATURE(
+      RISCV::FeatureVendorXqciac, DecoderTableXqciac32,
+      "Qualcomm uC Load-Store Address Calculation custom opcode table");
+  TRY_TO_DECODE_FEATURE(
+      RISCV::FeatureVendorXqcicli, DecoderTableXqcicli32,
+      "Qualcomm uC Conditional Load Immediate custom opcode table");
+  TRY_TO_DECODE_FEATURE(RISCV::FeatureVendorXqcicm, DecoderTableXqcicm32,
+                        "Qualcomm uC Conditional Move custom opcode table");
   TRY_TO_DECODE(true, DecoderTable32, "RISCV32 table");
 
   return MCDisassembler::Fail;
@@ -718,6 +726,12 @@ DecodeStatus RISCVDisassembler::getInstruction16(MCInst &MI, uint64_t &Size,
   TRY_TO_DECODE_FEATURE(
       RISCV::FeatureStdExtZcmp, DecoderTableRVZcmp16,
       "Zcmp table (16-bit Push/Pop & Double Move Instructions)");
+  TRY_TO_DECODE_FEATURE(
+      RISCV::FeatureVendorXqciac, DecoderTableXqciac16,
+      "Qualcomm uC Load-Store Address Calculation custom 16bit opcode table");
+  TRY_TO_DECODE_FEATURE(
+      RISCV::FeatureVendorXqcicm, DecoderTableXqcicm16,
+      "Qualcomm uC Conditional Move custom 16bit opcode table");
   TRY_TO_DECODE_AND_ADD_SP(STI.hasFeature(RISCV::FeatureVendorXwchc),
                            DecoderTableXwchc16,
                            "WCH QingKe XW custom opcode table");
diff --git a/llvm/lib/Target/RISCV/GISel/RISCVInstructionSelector.cpp b/llvm/lib/Target/RISCV/GISel/RISCVInstructionSelector.cpp
index ef85057..3f1539d 100644
--- a/llvm/lib/Target/RISCV/GISel/RISCVInstructionSelector.cpp
+++ b/llvm/lib/Target/RISCV/GISel/RISCVInstructionSelector.cpp
@@ -80,7 +80,6 @@ private:
   bool selectFPCompare(MachineInstr &MI, MachineIRBuilder &MIB) const;
   void emitFence(AtomicOrdering FenceOrdering, SyncScope::ID FenceSSID,
                  MachineIRBuilder &MIB) const;
-  bool selectMergeValues(MachineInstr &MI, MachineIRBuilder &MIB) const;
   bool selectUnmergeValues(MachineInstr &MI, MachineIRBuilder &MIB) const;
 
   ComplexRendererFns selectShiftMask(MachineOperand &Root,
@@ -732,8 +731,6 @@ bool RISCVInstructionSelector::select(MachineInstr &MI) {
   }
   case TargetOpcode::G_IMPLICIT_DEF:
     return selectImplicitDef(MI, MIB);
-  case TargetOpcode::G_MERGE_VALUES:
-    return selectMergeValues(MI, MIB);
   case TargetOpcode::G_UNMERGE_VALUES:
     return selectUnmergeValues(MI, MIB);
   default:
@@ -741,26 +738,13 @@ bool RISCVInstructionSelector::select(MachineInstr &MI) {
   }
 }
 
-bool RISCVInstructionSelector::selectMergeValues(MachineInstr &MI,
-                                                 MachineIRBuilder &MIB) const {
-  assert(MI.getOpcode() == TargetOpcode::G_MERGE_VALUES);
-
-  // Build a F64 Pair from operands
-  if (MI.getNumOperands() != 3)
-    return false;
-  Register Dst = MI.getOperand(0).getReg();
-  Register Lo = MI.getOperand(1).getReg();
-  Register Hi = MI.getOperand(2).getReg();
-  if (!isRegInFprb(Dst) || !isRegInGprb(Lo) || !isRegInGprb(Hi))
-    return false;
-  MI.setDesc(TII.get(RISCV::BuildPairF64Pseudo));
-  return constrainSelectedInstRegOperands(MI, TII, TRI, RBI);
-}
-
 bool RISCVInstructionSelector::selectUnmergeValues(
     MachineInstr &MI, MachineIRBuilder &MIB) const {
   assert(MI.getOpcode() == TargetOpcode::G_UNMERGE_VALUES);
 
+  if (!Subtarget->hasStdExtZfa())
+    return false;
+
   // Split F64 Src into two s32 parts
   if (MI.getNumOperands() != 3)
     return false;
@@ -769,8 +753,17 @@ bool RISCVInstructionSelector::selectUnmergeValues(
   Register Hi = MI.getOperand(1).getReg();
   if (!isRegInFprb(Src) || !isRegInGprb(Lo) || !isRegInGprb(Hi))
     return false;
-  MI.setDesc(TII.get(RISCV::SplitF64Pseudo));
-  return constrainSelectedInstRegOperands(MI, TII, TRI, RBI);
+
+  MachineInstr *ExtractLo = MIB.buildInstr(RISCV::FMV_X_W_FPR64, {Lo}, {Src});
+  if (!constrainSelectedInstRegOperands(*ExtractLo, TII, TRI, RBI))
+    return false;
+
+  MachineInstr *ExtractHi = MIB.buildInstr(RISCV::FMVH_X_D, {Hi}, {Src});
+  if (!constrainSelectedInstRegOperands(*ExtractHi, TII, TRI, RBI))
+    return false;
+
+  MI.eraseFromParent();
+  return true;
 }
 
 bool RISCVInstructionSelector::replacePtrWithInt(MachineOperand &Op,
diff --git a/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp b/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp
index 8284737..6f06459 100644
--- a/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp
+++ b/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp
@@ -21,6 +21,7 @@
 #include "llvm/CodeGen/MachineConstantPool.h"
 #include "llvm/CodeGen/MachineJumpTableInfo.h"
 #include "llvm/CodeGen/MachineMemOperand.h"
+#include "llvm/CodeGen/MachineOperand.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/TargetOpcodes.h"
 #include "llvm/CodeGen/ValueTypes.h"
@@ -132,7 +133,14 @@ RISCVLegalizerInfo::RISCVLegalizerInfo(const RISCVSubtarget &ST)
 
   auto PtrVecTys = {nxv1p0, nxv2p0, nxv4p0, nxv8p0, nxv16p0};
 
-  getActionDefinitionsBuilder({G_ADD, G_SUB, G_AND, G_OR, G_XOR})
+  getActionDefinitionsBuilder({G_ADD, G_SUB})
+      .legalFor({sXLen})
+      .legalIf(typeIsLegalIntOrFPVec(0, IntOrFPVecTys, ST))
+      .customFor(ST.is64Bit(), {s32})
+      .widenScalarToNextPow2(0)
+      .clampScalar(0, sXLen, sXLen);
+
+  getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
       .legalFor({sXLen})
       .legalIf(typeIsLegalIntOrFPVec(0, IntOrFPVecTys, ST))
       .widenScalarToNextPow2(0)
@@ -1330,6 +1338,24 @@ bool RISCVLegalizerInfo::legalizeCustom(
       return true;
     return Helper.lowerConstant(MI);
   }
+  case TargetOpcode::G_SUB:
+  case TargetOpcode::G_ADD: {
+    Helper.Observer.changingInstr(MI);
+    Helper.widenScalarSrc(MI, sXLen, 1, TargetOpcode::G_ANYEXT);
+    Helper.widenScalarSrc(MI, sXLen, 2, TargetOpcode::G_ANYEXT);
+
+    Register DstALU = MRI.createGenericVirtualRegister(sXLen);
+
+    MachineOperand &MO = MI.getOperand(0);
+    MIRBuilder.setInsertPt(MIRBuilder.getMBB(), ++MIRBuilder.getInsertPt());
+    auto DstSext = MIRBuilder.buildSExtInReg(sXLen, DstALU, 32);
+
+    MIRBuilder.buildInstr(TargetOpcode::G_TRUNC, {MO}, {DstSext});
+    MO.setReg(DstALU);
+
+    Helper.Observer.changedInstr(MI);
+    return true;
+  }
   case TargetOpcode::G_SEXT_INREG: {
     LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
     int64_t SizeInBits = MI.getOperand(2).getImm();
diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp
index eab4a5e..0cb1ef0 100644
--- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp
+++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp
@@ -38,9 +38,12 @@ std::optional<MCFixupKind> RISCVAsmBackend::getFixupKind(StringRef Name) const {
   if (STI.getTargetTriple().isOSBinFormatELF()) {
     unsigned Type;
     Type = llvm::StringSwitch<unsigned>(Name)
-#define ELF_RELOC(X, Y) .Case(#X, Y)
+#define ELF_RELOC(NAME, ID) .Case(#NAME, ID)
 #include "llvm/BinaryFormat/ELFRelocs/RISCV.def"
 #undef ELF_RELOC
+#define ELF_RISCV_NONSTANDARD_RELOC(_VENDOR, NAME, ID) .Case(#NAME, ID)
+#include "llvm/BinaryFormat/ELFRelocs/RISCV_nonstandard.def"
+#undef ELF_RISCV_NONSTANDARD_RELOC
                .Case("BFD_RELOC_NONE", ELF::R_RISCV_NONE)
                .Case("BFD_RELOC_32", ELF::R_RISCV_32)
                .Case("BFD_RELOC_64", ELF::R_RISCV_64)
diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h
index b9f4db0..7048e40 100644
--- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h
+++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h
@@ -302,6 +302,7 @@ enum OperandType : unsigned {
   OPERAND_UIMM4,
   OPERAND_UIMM5,
   OPERAND_UIMM5_NONZERO,
+  OPERAND_UIMM5_GT3,
   OPERAND_UIMM5_LSB0,
   OPERAND_UIMM6,
   OPERAND_UIMM6_LSB0,
@@ -453,8 +454,6 @@ int getLoadFPImm(APFloat FPImm);
 namespace RISCVSysReg {
 struct SysReg {
   const char Name[32];
-  const char AltName[32];
-  const char DeprecatedName[32];
   unsigned Encoding;
   // FIXME: add these additional fields when needed.
   // Privilege Access: Read, Write, Read-Only.
@@ -466,11 +465,13 @@ struct SysReg {
   // Register number without the privilege bits.
   // unsigned Number;
   FeatureBitset FeaturesRequired;
-  bool isRV32Only;
+  bool IsRV32Only;
+  bool IsAltName;
+  bool IsDeprecatedName;
 
   bool haveRequiredFeatures(const FeatureBitset &ActiveFeatures) const {
     // Not in 32-bit mode.
-    if (isRV32Only && ActiveFeatures[RISCV::Feature64Bit])
+    if (IsRV32Only && ActiveFeatures[RISCV::Feature64Bit])
       return false;
     // No required feature associated with the system register.
     if (FeaturesRequired.none())
@@ -479,6 +480,7 @@ struct SysReg {
   }
 };
 
+#define GET_SysRegEncodings_DECL
 #define GET_SysRegsList_DECL
 #include "RISCVGenSearchableTables.inc"
 } // end namespace RISCVSysReg
diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVInstPrinter.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVInstPrinter.cpp
index d36c0d7..d525471 100644
--- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVInstPrinter.cpp
+++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVInstPrinter.cpp
@@ -121,6 +121,8 @@ void RISCVInstPrinter::printCSRSystemRegister(const MCInst *MI, unsigned OpNo,
   unsigned Imm = MI->getOperand(OpNo).getImm();
   auto Range = RISCVSysReg::lookupSysRegByEncoding(Imm);
   for (auto &Reg : Range) {
+    if (Reg.IsAltName || Reg.IsDeprecatedName)
+      continue;
     if (Reg.haveRequiredFeatures(STI.getFeatureBits())) {
       markup(O, Markup::Register) << Reg.Name;
       return;
diff --git a/llvm/lib/Target/RISCV/RISCV.td b/llvm/lib/Target/RISCV/RISCV.td
index 9631241..4e0c64a 100644
--- a/llvm/lib/Target/RISCV/RISCV.td
+++ b/llvm/lib/Target/RISCV/RISCV.td
@@ -64,6 +64,12 @@ include "RISCVSchedXiangShanNanHu.td"
 include "RISCVProcessors.td"
 
 //===----------------------------------------------------------------------===//
+// Pfm Counters
+//===----------------------------------------------------------------------===//
+
+include "RISCVPfmCounters.td"
+
+//===----------------------------------------------------------------------===//
 // Define the RISC-V target.
 //===----------------------------------------------------------------------===//
 
diff --git a/llvm/lib/Target/RISCV/RISCVCombine.td b/llvm/lib/Target/RISCV/RISCVCombine.td
index 030613a..995dd0c 100644
--- a/llvm/lib/Target/RISCV/RISCVCombine.td
+++ b/llvm/lib/Target/RISCV/RISCVCombine.td
@@ -25,5 +25,5 @@ def RISCVPostLegalizerCombiner
     : GICombiner<"RISCVPostLegalizerCombinerImpl",
                  [sub_to_add, combines_for_extload, redundant_and,
                   identity_combines, shift_immed_chain,
-                  commute_constant_to_rhs]> {
+                  commute_constant_to_rhs, simplify_neg_minmax]> {
 }
diff --git a/llvm/lib/Target/RISCV/RISCVFeatures.td b/llvm/lib/Target/RISCV/RISCVFeatures.td
index dfc56588..01bc5387 100644
--- a/llvm/lib/Target/RISCV/RISCVFeatures.td
+++ b/llvm/lib/Target/RISCV/RISCVFeatures.td
@@ -844,6 +844,10 @@ def HasStdExtH : Predicate<"Subtarget->hasStdExtH()">,
 
 // Supervisor extensions
 
+def FeatureStdExtSdext : RISCVExperimentalExtension<1, 0, "External debugger">;
+
+def FeatureStdExtSdtrig : RISCVExperimentalExtension<1, 0, "Debugger triggers">;
+
 def FeatureStdExtShgatpa
     : RISCVExtension<1, 0,
                      "SvNNx4 mode supported for all modes supported by satp, as well as Bare">;
@@ -1274,6 +1278,30 @@ def HasVendorXqcilsm
       AssemblerPredicate<(all_of FeatureVendorXqcilsm),
                          "'Xqcilsm' (Qualcomm uC Load Store Multiple Extension)">;
 
+def FeatureVendorXqciac
+    : RISCVExperimentalExtension<0, 2, "Qualcomm uC Load-Store Address Calculation Extension",
+                                 [FeatureStdExtZca]>;
+def HasVendorXqciac
+    : Predicate<"Subtarget->hasVendorXqciac()">,
+      AssemblerPredicate<(all_of FeatureVendorXqciac),
+                         "'Xqciac' (Qualcomm uC Load-Store Address Calculation Extension)">;
+
+def FeatureVendorXqcicli
+    : RISCVExperimentalExtension<0, 2,
+                                 "Qualcomm uC Conditional Load Immediate Extension">;
+def HasVendorXqcicli
+    : Predicate<"Subtarget->hasVendorXqcicli()">,
+      AssemblerPredicate<(all_of FeatureVendorXqcicli),
+                         "'Xqcicli' (Qualcomm uC Conditional Load Immediate Extension)">;
+
+def FeatureVendorXqcicm
+    : RISCVExperimentalExtension<0, 2, "Qualcomm uC Conditional Move Extension",
+                                 [FeatureStdExtZca]>;
+def HasVendorXqcicm
+    : Predicate<"Subtarget->hasVendorXqcicm()">,
+      AssemblerPredicate<(all_of FeatureVendorXqcicm),
+                         "'Xqcicm' (Qualcomm uC Conditional Move Extension)">;
+
 //===----------------------------------------------------------------------===//
 // LLVM specific features and extensions
 //===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index cda64ae..6c58989 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -5104,6 +5104,7 @@ static SDValue lowerShuffleViaVRegSplitting(ShuffleVectorSDNode *SVN,
   SDValue V1 = SVN->getOperand(0);
   SDValue V2 = SVN->getOperand(1);
   ArrayRef<int> Mask = SVN->getMask();
+  unsigned NumElts = VT.getVectorNumElements();
 
   // If we don't know exact data layout, not much we can do.  If this
   // is already m1 or smaller, no point in splitting further.
@@ -5120,70 +5121,58 @@ static SDValue lowerShuffleViaVRegSplitting(ShuffleVectorSDNode *SVN,
 
   MVT ElemVT = VT.getVectorElementType();
   unsigned ElemsPerVReg = *VLen / ElemVT.getFixedSizeInBits();
+  unsigned VRegsPerSrc = NumElts / ElemsPerVReg;
+
+  SmallVector<std::pair<int, SmallVector<int>>>
+    OutMasks(VRegsPerSrc, {-1, {}});
+
+  // Check if our mask can be done as a 1-to-1 mapping from source
+  // to destination registers in the group without needing to
+  // write each destination more than once.
+  for (unsigned DstIdx = 0; DstIdx < Mask.size(); DstIdx++) {
+    int DstVecIdx = DstIdx / ElemsPerVReg;
+    int DstSubIdx = DstIdx % ElemsPerVReg;
+    int SrcIdx = Mask[DstIdx];
+    if (SrcIdx < 0 || (unsigned)SrcIdx >= 2 * NumElts)
+      continue;
+    int SrcVecIdx = SrcIdx / ElemsPerVReg;
+    int SrcSubIdx = SrcIdx % ElemsPerVReg;
+    if (OutMasks[DstVecIdx].first == -1)
+      OutMasks[DstVecIdx].first = SrcVecIdx;
+    if (OutMasks[DstVecIdx].first != SrcVecIdx)
+      // Note: This case could easily be handled by keeping track of a chain
+      // of source values and generating two element shuffles below.  This is
+      // less an implementation question, and more a profitability one.
+      return SDValue();
+
+    OutMasks[DstVecIdx].second.resize(ElemsPerVReg, -1);
+    OutMasks[DstVecIdx].second[DstSubIdx] = SrcSubIdx;
+  }
 
   EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT, Subtarget);
   MVT OneRegVT = MVT::getVectorVT(ElemVT, ElemsPerVReg);
   MVT M1VT = getContainerForFixedLengthVector(DAG, OneRegVT, Subtarget);
   assert(M1VT == getLMUL1VT(M1VT));
   unsigned NumOpElts = M1VT.getVectorMinNumElements();
-  unsigned NormalizedVF = ContainerVT.getVectorMinNumElements();
-  unsigned NumOfSrcRegs = NormalizedVF / NumOpElts;
-  unsigned NumOfDestRegs = NormalizedVF / NumOpElts;
+  SDValue Vec = DAG.getUNDEF(ContainerVT);
   // The following semantically builds up a fixed length concat_vector
   // of the component shuffle_vectors.  We eagerly lower to scalable here
   // to avoid DAG combining it back to a large shuffle_vector again.
   V1 = convertToScalableVector(ContainerVT, V1, DAG, Subtarget);
   V2 = convertToScalableVector(ContainerVT, V2, DAG, Subtarget);
-  SmallVector<SDValue> SubRegs(NumOfDestRegs);
-  unsigned RegCnt = 0;
-  unsigned PrevCnt = 0;
-  processShuffleMasks(
-      Mask, NumOfSrcRegs, NumOfDestRegs, NumOfDestRegs,
-      [&]() {
-        PrevCnt = RegCnt;
-        ++RegCnt;
-      },
-      [&, &DAG = DAG](ArrayRef<int> SrcSubMask, unsigned SrcVecIdx,
-                      unsigned DstVecIdx) {
-        SDValue SrcVec = SrcVecIdx >= NumOfSrcRegs ? V2 : V1;
-        unsigned ExtractIdx = (SrcVecIdx % NumOfSrcRegs) * NumOpElts;
-        SDValue SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, M1VT, SrcVec,
-                                     DAG.getVectorIdxConstant(ExtractIdx, DL));
-        SubVec = convertFromScalableVector(OneRegVT, SubVec, DAG, Subtarget);
-        SubVec = DAG.getVectorShuffle(OneRegVT, DL, SubVec, SubVec, SrcSubMask);
-        SubRegs[RegCnt] = convertToScalableVector(M1VT, SubVec, DAG, Subtarget);
-        PrevCnt = RegCnt;
-        ++RegCnt;
-      },
-      [&, &DAG = DAG](ArrayRef<int> SrcSubMask, unsigned Idx1, unsigned Idx2) {
-        if (PrevCnt + 1 == RegCnt)
-          ++RegCnt;
-        SDValue SubVec1 = SubRegs[PrevCnt + 1];
-        if (!SubVec1) {
-          SDValue SrcVec = Idx1 >= NumOfSrcRegs ? V2 : V1;
-          unsigned ExtractIdx = (Idx1 % NumOfSrcRegs) * NumOpElts;
-          SubVec1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, M1VT, SrcVec,
-                                DAG.getVectorIdxConstant(ExtractIdx, DL));
-        }
-        SubVec1 = convertFromScalableVector(OneRegVT, SubVec1, DAG, Subtarget);
-        SDValue SrcVec = Idx2 >= NumOfSrcRegs ? V2 : V1;
-        unsigned ExtractIdx = (Idx2 % NumOfSrcRegs) * NumOpElts;
-        SDValue SubVec2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, M1VT, SrcVec,
-                                      DAG.getVectorIdxConstant(ExtractIdx, DL));
-        SubVec2 = convertFromScalableVector(OneRegVT, SubVec2, DAG, Subtarget);
-        SubVec1 =
-            DAG.getVectorShuffle(OneRegVT, DL, SubVec1, SubVec2, SrcSubMask);
-        SubVec1 = convertToScalableVector(M1VT, SubVec1, DAG, Subtarget);
-        SubRegs[PrevCnt + 1] = SubVec1;
-      });
-  assert(RegCnt == NumOfDestRegs && "Whole vector must be processed");
-  SDValue Vec = DAG.getUNDEF(ContainerVT);
-  for (auto [I, V] : enumerate(SubRegs)) {
-    if (!V)
+  for (unsigned DstVecIdx = 0 ; DstVecIdx < OutMasks.size(); DstVecIdx++) {
+    auto &[SrcVecIdx, SrcSubMask] = OutMasks[DstVecIdx];
+    if (SrcVecIdx == -1)
       continue;
-    unsigned InsertIdx = I * NumOpElts;
-
-    Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, ContainerVT, Vec, V,
+    unsigned ExtractIdx = (SrcVecIdx % VRegsPerSrc) * NumOpElts;
+    SDValue SrcVec = (unsigned)SrcVecIdx >= VRegsPerSrc ? V2 : V1;
+    SDValue SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, M1VT, SrcVec,
+                                 DAG.getVectorIdxConstant(ExtractIdx, DL));
+    SubVec = convertFromScalableVector(OneRegVT, SubVec, DAG, Subtarget);
+    SubVec = DAG.getVectorShuffle(OneRegVT, DL, SubVec, SubVec, SrcSubMask);
+    SubVec = convertToScalableVector(M1VT, SubVec, DAG, Subtarget);
+    unsigned InsertIdx = DstVecIdx * NumOpElts;
+    Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, ContainerVT, Vec, SubVec,
                       DAG.getVectorIdxConstant(InsertIdx, DL));
   }
   return convertFromScalableVector(VT, Vec, DAG, Subtarget);
@@ -10165,7 +10154,10 @@ SDValue RISCVTargetLowering::lowerVectorMaskVecReduction(SDValue Op,
   case ISD::VP_REDUCE_AND: {
     // vcpop ~x == 0
     SDValue TrueMask = DAG.getNode(RISCVISD::VMSET_VL, DL, ContainerVT, VL);
-    Vec = DAG.getNode(RISCVISD::VMXOR_VL, DL, ContainerVT, Vec, TrueMask, VL);
+    if (IsVP || VecVT.isFixedLengthVector())
+      Vec = DAG.getNode(RISCVISD::VMXOR_VL, DL, ContainerVT, Vec, TrueMask, VL);
+    else
+      Vec = DAG.getNode(ISD::XOR, DL, ContainerVT, Vec, TrueMask);
     Vec = DAG.getNode(RISCVISD::VCPOP_VL, DL, XLenVT, Vec, Mask, VL);
     CC = ISD::SETEQ;
     break;
@@ -12674,8 +12666,7 @@ SDValue RISCVTargetLowering::lowerGET_ROUNDING(SDValue Op,
   const MVT XLenVT = Subtarget.getXLenVT();
   SDLoc DL(Op);
   SDValue Chain = Op->getOperand(0);
-  SDValue SysRegNo = DAG.getTargetConstant(
-      RISCVSysReg::lookupSysRegByName("FRM")->Encoding, DL, XLenVT);
+  SDValue SysRegNo = DAG.getTargetConstant(RISCVSysReg::frm, DL, XLenVT);
   SDVTList VTs = DAG.getVTList(XLenVT, MVT::Other);
   SDValue RM = DAG.getNode(RISCVISD::READ_CSR, DL, VTs, Chain, SysRegNo);
 
@@ -12706,8 +12697,7 @@ SDValue RISCVTargetLowering::lowerSET_ROUNDING(SDValue Op,
   SDLoc DL(Op);
   SDValue Chain = Op->getOperand(0);
   SDValue RMValue = Op->getOperand(1);
-  SDValue SysRegNo = DAG.getTargetConstant(
-      RISCVSysReg::lookupSysRegByName("FRM")->Encoding, DL, XLenVT);
+  SDValue SysRegNo = DAG.getTargetConstant(RISCVSysReg::frm, DL, XLenVT);
 
   // Encoding used for rounding mode in RISC-V differs from that used in
   // FLT_ROUNDS. To convert it the C rounding mode is used as an index in
@@ -12910,15 +12900,11 @@ void RISCVTargetLowering::ReplaceNodeResults(SDNode *N,
     SDValue LoCounter, HiCounter;
     MVT XLenVT = Subtarget.getXLenVT();
     if (N->getOpcode() == ISD::READCYCLECOUNTER) {
-      LoCounter = DAG.getTargetConstant(
-          RISCVSysReg::lookupSysRegByName("CYCLE")->Encoding, DL, XLenVT);
-      HiCounter = DAG.getTargetConstant(
-          RISCVSysReg::lookupSysRegByName("CYCLEH")->Encoding, DL, XLenVT);
+      LoCounter = DAG.getTargetConstant(RISCVSysReg::cycle, DL, XLenVT);
+      HiCounter = DAG.getTargetConstant(RISCVSysReg::cycleh, DL, XLenVT);
     } else {
-      LoCounter = DAG.getTargetConstant(
-          RISCVSysReg::lookupSysRegByName("TIME")->Encoding, DL, XLenVT);
-      HiCounter = DAG.getTargetConstant(
-          RISCVSysReg::lookupSysRegByName("TIMEH")->Encoding, DL, XLenVT);
+      LoCounter = DAG.getTargetConstant(RISCVSysReg::time, DL, XLenVT);
+      HiCounter = DAG.getTargetConstant(RISCVSysReg::timeh, DL, XLenVT);
     }
     SDVTList VTs = DAG.getVTList(MVT::i32, MVT::i32, MVT::Other);
     SDValue RCW = DAG.getNode(RISCVISD::READ_COUNTER_WIDE, DL, VTs,
@@ -18397,6 +18383,15 @@ bool RISCVTargetLowering::isDesirableToCommuteWithShift(
 
     auto *C1 = dyn_cast<ConstantSDNode>(N0->getOperand(1));
     auto *C2 = dyn_cast<ConstantSDNode>(N->getOperand(1));
+
+    // Bail if we might break a sh{1,2,3}add pattern.
+    if (Subtarget.hasStdExtZba() && C2 && C2->getZExtValue() >= 1 &&
+        C2->getZExtValue() <= 3 && N->hasOneUse() &&
+        N->user_begin()->getOpcode() == ISD::ADD &&
+        !isUsedByLdSt(*N->user_begin(), nullptr) &&
+        !isa<ConstantSDNode>(N->user_begin()->getOperand(1)))
+      return false;
+
     if (C1 && C2) {
       const APInt &C1Int = C1->getAPIntValue();
       APInt ShiftedC1Int = C1Int << C2->getAPIntValue();
@@ -20278,13 +20273,11 @@ SDValue RISCVTargetLowering::LowerCall(CallLoweringInfo &CLI,
   for (auto &Reg : RegsToPass)
     Ops.push_back(DAG.getRegister(Reg.first, Reg.second.getValueType()));
 
-  if (!IsTailCall) {
-    // Add a register mask operand representing the call-preserved registers.
-    const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
-    const uint32_t *Mask = TRI->getCallPreservedMask(MF, CallConv);
-    assert(Mask && "Missing call preserved mask for calling convention");
-    Ops.push_back(DAG.getRegisterMask(Mask));
-  }
+  // Add a register mask operand representing the call-preserved registers.
+  const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
+  const uint32_t *Mask = TRI->getCallPreservedMask(MF, CallConv);
+  assert(Mask && "Missing call preserved mask for calling convention");
+  Ops.push_back(DAG.getRegisterMask(Mask));
 
   // Glue the call to the argument copies, if any.
   if (Glue.getNode())
diff --git a/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp b/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp
index 7598583..1fd130d 100644
--- a/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp
+++ b/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp
@@ -627,7 +627,7 @@ public:
     return MI;
   }
 
-  void setAVL(VSETVLIInfo Info) {
+  void setAVL(const VSETVLIInfo &Info) {
     assert(Info.isValid());
     if (Info.isUnknown())
       setUnknown();
@@ -1223,7 +1223,8 @@ bool RISCVInsertVSETVLI::needVSETVLI(const DemandedFields &Used,
 // If we don't use LMUL or the SEW/LMUL ratio, then adjust LMUL so that we
 // maintain the SEW/LMUL ratio. This allows us to eliminate VL toggles in more
 // places.
-static VSETVLIInfo adjustIncoming(VSETVLIInfo PrevInfo, VSETVLIInfo NewInfo,
+static VSETVLIInfo adjustIncoming(const VSETVLIInfo &PrevInfo,
+                                  const VSETVLIInfo &NewInfo,
                                   DemandedFields &Demanded) {
   VSETVLIInfo Info = NewInfo;
 
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoD.td b/llvm/lib/Target/RISCV/RISCVInstrInfoD.td
index ae969bff8..349bc36 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoD.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoD.td
@@ -23,7 +23,9 @@ def SDT_RISCVSplitF64     : SDTypeProfile<2, 1, [SDTCisVT<0, i32>,
                                                  SDTCisVT<2, f64>]>;
 
 def RISCVBuildPairF64 : SDNode<"RISCVISD::BuildPairF64", SDT_RISCVBuildPairF64>;
+def : GINodeEquiv<G_MERGE_VALUES, RISCVBuildPairF64>;
 def RISCVSplitF64     : SDNode<"RISCVISD::SplitF64", SDT_RISCVSplitF64>;
+def : GINodeEquiv<G_UNMERGE_VALUES, RISCVSplitF64>;
 
 def AddrRegImmINX : ComplexPattern<iPTR, 2, "SelectAddrRegImmRV32Zdinx">;
 
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoXTHead.td b/llvm/lib/Target/RISCV/RISCVInstrInfoXTHead.td
index 99186ec..942ced8 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoXTHead.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoXTHead.td
@@ -536,27 +536,23 @@ multiclass VPatTernaryVMAQA_VV_VX<string intrinsic, string instruction,
 //===----------------------------------------------------------------------===//
 
 let Predicates = [HasVendorXTHeadBa] in {
-def : Pat<(add (XLenVT GPR:$rs1), (shl GPR:$rs2, uimm2:$uimm2)),
+def : Pat<(add_like_non_imm12 (shl GPR:$rs2, uimm2:$uimm2), (XLenVT GPR:$rs1)),
+          (TH_ADDSL GPR:$rs1, GPR:$rs2, uimm2:$uimm2)>;
+def : Pat<(XLenVT (riscv_shl_add GPR:$rs2, uimm2:$uimm2, GPR:$rs1)),
           (TH_ADDSL GPR:$rs1, GPR:$rs2, uimm2:$uimm2)>;
-def : Pat<(XLenVT (riscv_shl_add GPR:$rs1, uimm2:$uimm2, GPR:$rs2)),
-          (TH_ADDSL GPR:$rs2, GPR:$rs1, uimm2:$uimm2)>;
 
 // Reuse complex patterns from StdExtZba
-def : Pat<(add_non_imm12 sh1add_op:$rs1, (XLenVT GPR:$rs2)),
-          (TH_ADDSL GPR:$rs2, sh1add_op:$rs1, 1)>;
-def : Pat<(add_non_imm12 sh2add_op:$rs1, (XLenVT GPR:$rs2)),
-          (TH_ADDSL GPR:$rs2, sh2add_op:$rs1, 2)>;
-def : Pat<(add_non_imm12 sh3add_op:$rs1, (XLenVT GPR:$rs2)),
-          (TH_ADDSL GPR:$rs2, sh3add_op:$rs1, 3)>;
-
-def : Pat<(add (XLenVT GPR:$r), CSImm12MulBy4:$i),
-          (TH_ADDSL GPR:$r, (XLenVT (ADDI (XLenVT X0), (SimmShiftRightBy2XForm CSImm12MulBy4:$i))), 2)>;
-def : Pat<(add (XLenVT GPR:$r), CSImm12MulBy8:$i),
-          (TH_ADDSL GPR:$r, (XLenVT (ADDI (XLenVT X0), (SimmShiftRightBy3XForm CSImm12MulBy8:$i))), 3)>;
-
-def : Pat<(mul_const_oneuse GPR:$r, (XLenVT 200)),
-          (SLLI (XLenVT (TH_ADDSL (XLenVT (TH_ADDSL GPR:$r, GPR:$r, 2)),
-                                  (XLenVT (TH_ADDSL GPR:$r, GPR:$r, 2)), 2)), 3)>;
+def : Pat<(add_like_non_imm12 sh1add_op:$rs2, (XLenVT GPR:$rs1)),
+          (TH_ADDSL GPR:$rs1, sh1add_op:$rs2, 1)>;
+def : Pat<(add_like_non_imm12 sh2add_op:$rs2, (XLenVT GPR:$rs1)),
+          (TH_ADDSL GPR:$rs1, sh2add_op:$rs2, 2)>;
+def : Pat<(add_like_non_imm12 sh3add_op:$rs2, (XLenVT GPR:$rs1)),
+          (TH_ADDSL GPR:$rs1, sh3add_op:$rs2, 3)>;
+
+def : Pat<(add_like (XLenVT GPR:$r), CSImm12MulBy4:$i),
+          (TH_ADDSL GPR:$r, (XLenVT (ADDI (XLenVT X0), CSImm12MulBy4:$i)), 2)>;
+def : Pat<(add_like (XLenVT GPR:$r), CSImm12MulBy8:$i),
+          (TH_ADDSL GPR:$r, (XLenVT (ADDI (XLenVT X0), CSImm12MulBy8:$i)), 3)>;
 } // Predicates = [HasVendorXTHeadBa]
 
 let Predicates = [HasVendorXTHeadBb] in {
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td b/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td
index 05b5591..6f15646 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td
@@ -21,6 +21,13 @@ def uimm5nonzero : RISCVOp<XLenVT>,
   let OperandType = "OPERAND_UIMM5_NONZERO";
 }
 
+def uimm5gt3 : RISCVOp<XLenVT>, ImmLeaf<XLenVT,
+  [{return (Imm > 3) && isUInt<5>(Imm);}]> {
+  let ParserMatchClass = UImmAsmOperand<5, "GT3">;
+  let DecoderMethod = "decodeUImmOperand<5>";
+  let OperandType = "OPERAND_UIMM5_GT3";
+}
+
 def uimm11 : RISCVUImmLeafOp<11>;
 
 //===----------------------------------------------------------------------===//
@@ -132,6 +139,33 @@ class QCIStoreMultiple<bits<2> funct2, DAGOperand InTyRs2, string opcodestr>
   let Inst{31-25} = {funct2, imm{6-2}};
 }
 
+let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in
+class QCILICC<bits<3> funct3, bits<2> funct2, DAGOperand InTyRs2, string opcodestr>
+    : RVInstRBase<funct3, OPC_CUSTOM_2, (outs GPRNoX0:$rd_wb),
+                  (ins GPRNoX0:$rd, GPRNoX0:$rs1, InTyRs2:$rs2, simm5:$simm),
+                  opcodestr, "$rd, $rs1, $rs2, $simm"> {
+  let Constraints = "$rd = $rd_wb";
+  bits<5> simm;
+
+  let Inst{31-25} = {simm, funct2};
+}
+
+let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in
+class QCIMVCC<bits<3> funct3, string opcodestr>
+    : RVInstR4<0b00, funct3, OPC_CUSTOM_2, (outs GPRNoX0:$rd),
+               (ins GPRNoX0:$rs1, GPRNoX0:$rs2, GPRNoX0:$rs3),
+               opcodestr, "$rd, $rs1, $rs2, $rs3">;
+
+let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in
+class QCIMVCCI<bits<3> funct3, string opcodestr, DAGOperand immType>
+    : RVInstR4<0b10, funct3, OPC_CUSTOM_2, (outs GPRNoX0:$rd),
+               (ins GPRNoX0:$rs1, immType:$imm, GPRNoX0:$rs3),
+               opcodestr, "$rd, $rs1, $imm, $rs3"> {
+  bits<5> imm;
+
+  let rs2 = imm;
+}
+
 //===----------------------------------------------------------------------===//
 // Instructions
 //===----------------------------------------------------------------------===//
@@ -184,6 +218,37 @@ let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in {
 } // hasSideEffects = 0, mayLoad = 0, mayStore = 0
 } // Predicates = [HasVendorXqcia, IsRV32], DecoderNamespace = "Xqcia"
 
+let Predicates = [HasVendorXqciac, IsRV32], DecoderNamespace = "Xqciac" in {
+let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in {
+  def QC_C_MULADDI : RVInst16CL<0b001, 0b10, (outs GPRC:$rd_wb),
+                               (ins GPRC:$rd, GPRC:$rs1, uimm5:$uimm),
+                               "qc.c.muladdi", "$rd, $rs1, $uimm"> {
+    let Constraints = "$rd = $rd_wb";
+    bits<5> uimm;
+
+    let Inst{12-10} = uimm{3-1};
+    let Inst{6} = uimm{0};
+    let Inst{5} = uimm{4};
+  }
+
+  def QC_MULADDI : RVInstI<0b110, OPC_CUSTOM_0, (outs GPRNoX0:$rd_wb),
+                           (ins GPRNoX0:$rd, GPRNoX0:$rs1, simm12:$imm12),
+                           "qc.muladdi", "$rd, $rs1, $imm12"> {
+    let Constraints = "$rd = $rd_wb";
+  }
+
+  def QC_SHLADD : RVInstRBase<0b011, OPC_CUSTOM_0, (outs GPRNoX0:$rd),
+                              (ins GPRNoX0:$rs1, GPRNoX0:$rs2, uimm5gt3:$shamt),
+                              "qc.shladd", "$rd, $rs1, $rs2, $shamt"> {
+    bits<5> shamt;
+
+    let Inst{31-30} = 0b01;
+    let Inst{29-25} = shamt;
+  }
+
+} // hasSideEffects = 0, mayLoad = 0, mayStore = 0
+} // Predicates = [HasVendorXqciac, IsRV32], DecoderNamespace = "Xqciac"
+
 let Predicates = [HasVendorXqcics, IsRV32], DecoderNamespace = "Xqcics" in {
   def QC_SELECTIIEQ : QCISELECTIICC <0b010, "qc.selectiieq">;
   def QC_SELECTIINE : QCISELECTIICC <0b011, "qc.selectiine">;
@@ -205,6 +270,48 @@ let Predicates = [HasVendorXqcilsm, IsRV32], DecoderNamespace = "Xqcilsm" in {
     def QC_LWMI : QCILoadMultiple<0b01, uimm5nonzero, "qc.lwmi">;
 } // Predicates = [HasVendorXqcilsm, IsRV32], DecoderNamespace = "Xqcilsm"
 
+let Predicates = [HasVendorXqcicli, IsRV32], DecoderNamespace = "Xqcicli" in {
+  def QC_LIEQ    : QCILICC<0b000, 0b01, GPRNoX0, "qc.lieq">;
+  def QC_LINE    : QCILICC<0b001, 0b01, GPRNoX0, "qc.line">;
+  def QC_LILT    : QCILICC<0b100, 0b01, GPRNoX0, "qc.lilt">;
+  def QC_LIGE    : QCILICC<0b101, 0b01, GPRNoX0, "qc.lige">;
+  def QC_LILTU   : QCILICC<0b110, 0b01, GPRNoX0, "qc.liltu">;
+  def QC_LIGEU   : QCILICC<0b111, 0b01, GPRNoX0, "qc.ligeu">;
+
+  def QC_LIEQI   : QCILICC<0b000, 0b11, simm5, "qc.lieqi">;
+  def QC_LINEI   : QCILICC<0b001, 0b11, simm5, "qc.linei">;
+  def QC_LILTI   : QCILICC<0b100, 0b11, simm5, "qc.lilti">;
+  def QC_LIGEI   : QCILICC<0b101, 0b11, simm5, "qc.ligei">;
+  def QC_LILTUI  : QCILICC<0b110, 0b11, uimm5, "qc.liltui">;
+  def QC_LIGEUI  : QCILICC<0b111, 0b11, uimm5, "qc.ligeui">;
+} // Predicates = [HasVendorXqcicli, IsRV32], DecoderNamespace = "Xqcicli"
+
+let Predicates = [HasVendorXqcicm, IsRV32], DecoderNamespace = "Xqcicm" in {
+let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in
+  def QC_C_MVEQZ   : RVInst16CL<0b101, 0b10, (outs GPRC:$rd_wb),
+                              (ins GPRC:$rd, GPRC:$rs1),
+                              "qc.c.mveqz", "$rd, $rs1"> {
+    let Constraints = "$rd = $rd_wb";
+
+    let Inst{12-10} = 0b011;
+    let Inst{6-5} = 0b00;
+  }
+
+  def QC_MVEQ    : QCIMVCC<0b000, "qc.mveq">;
+  def QC_MVNE    : QCIMVCC<0b001, "qc.mvne">;
+  def QC_MVLT    : QCIMVCC<0b100, "qc.mvlt">;
+  def QC_MVGE    : QCIMVCC<0b101, "qc.mvge">;
+  def QC_MVLTU   : QCIMVCC<0b110, "qc.mvltu">;
+  def QC_MVGEU   : QCIMVCC<0b111, "qc.mvgeu">;
+
+  def QC_MVEQI   : QCIMVCCI<0b000, "qc.mveqi", simm5>;
+  def QC_MVNEI   : QCIMVCCI<0b001, "qc.mvnei", simm5>;
+  def QC_MVLTI   : QCIMVCCI<0b100, "qc.mvlti", simm5>;
+  def QC_MVGEI   : QCIMVCCI<0b101, "qc.mvgei", simm5>;
+  def QC_MVLTUI  : QCIMVCCI<0b110, "qc.mvltui", uimm5>;
+  def QC_MVGEUI  : QCIMVCCI<0b111, "qc.mvgeui", uimm5>;
+} // Predicates = [HasVendorXqcicm, IsRV32], DecoderNamespace = "Xqcicm"
+
 //===----------------------------------------------------------------------===//
 // Aliases
 //===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/RISCV/RISCVPfmCounters.td b/llvm/lib/Target/RISCV/RISCVPfmCounters.td
new file mode 100644
index 0000000..013e789
--- /dev/null
+++ b/llvm/lib/Target/RISCV/RISCVPfmCounters.td
@@ -0,0 +1,18 @@
+//===---- RISCVPfmCounters.td - RISC-V Hardware Counters ---*- tablegen -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This describes the available hardware counters for RISC-V.
+//
+//===----------------------------------------------------------------------===//
+
+def CpuCyclesPfmCounter : PfmCounter<"CYCLES">;
+
+def DefaultPfmCounters : ProcPfmCounters {
+  let CycleCounter = CpuCyclesPfmCounter;
+}
+def : PfmCountersDefaultBinding<DefaultPfmCounters>;
diff --git a/llvm/lib/Target/RISCV/RISCVProcessors.td b/llvm/lib/Target/RISCV/RISCVProcessors.td
index 61c7c21..6dfed7dd 100644
--- a/llvm/lib/Target/RISCV/RISCVProcessors.td
+++ b/llvm/lib/Target/RISCV/RISCVProcessors.td
@@ -321,6 +321,25 @@ def SIFIVE_P470 : RISCVProcessorModel<"sifive-p470", SiFiveP400Model,
                                                   [TuneNoSinkSplatOperands,
                                                    TuneVXRMPipelineFlush])>;
 
+defvar SiFiveP500TuneFeatures = [TuneNoDefaultUnroll,
+                                 TuneConditionalCompressedMoveFusion,
+                                 TuneLUIADDIFusion,
+                                 TuneAUIPCADDIFusion,
+                                 TunePostRAScheduler];
+
+def SIFIVE_P550 : RISCVProcessorModel<"sifive-p550", NoSchedModel,
+                                      [Feature64Bit,
+                                       FeatureStdExtI,
+                                       FeatureStdExtZifencei,
+                                       FeatureStdExtM,
+                                       FeatureStdExtA,
+                                       FeatureStdExtF,
+                                       FeatureStdExtD,
+                                       FeatureStdExtC,
+                                       FeatureStdExtZba,
+                                       FeatureStdExtZbb],
+                                      SiFiveP500TuneFeatures>;
+
 def SIFIVE_P670 : RISCVProcessorModel<"sifive-p670", SiFiveP600Model,
                                       !listconcat(RVA22U64Features,
                                       [FeatureStdExtV,
diff --git a/llvm/lib/Target/RISCV/RISCVSchedSiFiveP400.td b/llvm/lib/Target/RISCV/RISCVSchedSiFiveP400.td
index a86c255..396cbe2c 100644
--- a/llvm/lib/Target/RISCV/RISCVSchedSiFiveP400.td
+++ b/llvm/lib/Target/RISCV/RISCVSchedSiFiveP400.td
@@ -182,7 +182,7 @@ def P400WriteCMOV : SchedWriteRes<[SiFiveP400Branch, SiFiveP400IEXQ1]> {
 }
 def : InstRW<[P400WriteCMOV], (instrs PseudoCCMOVGPRNoX0)>;
 
-let Latency = 3 in {
+let Latency = 2 in {
 // Integer multiplication
 def : WriteRes<WriteIMul, [SiFiveP400MulDiv]>;
 def : WriteRes<WriteIMul32, [SiFiveP400MulDiv]>;
diff --git a/llvm/lib/Target/RISCV/RISCVSchedule.td b/llvm/lib/Target/RISCV/RISCVSchedule.td
index 7946a74..ceaeb85 100644
--- a/llvm/lib/Target/RISCV/RISCVSchedule.td
+++ b/llvm/lib/Target/RISCV/RISCVSchedule.td
@@ -237,6 +237,7 @@ def : ReadAdvance<ReadFCvtF16ToI32, 0>;
 def : ReadAdvance<ReadFDiv16, 0>;
 def : ReadAdvance<ReadFCmp16, 0>;
 def : ReadAdvance<ReadFMA16, 0>;
+def : ReadAdvance<ReadFMA16Addend, 0>;
 def : ReadAdvance<ReadFMinMax16, 0>;
 def : ReadAdvance<ReadFMul16, 0>;
 def : ReadAdvance<ReadFSGNJ16, 0>;
diff --git a/llvm/lib/Target/RISCV/RISCVSystemOperands.td b/llvm/lib/Target/RISCV/RISCVSystemOperands.td
index d85b4a9..4c86103 100644
--- a/llvm/lib/Target/RISCV/RISCVSystemOperands.td
+++ b/llvm/lib/Target/RISCV/RISCVSystemOperands.td
@@ -19,12 +19,6 @@ include "llvm/TableGen/SearchableTable.td"
 
 class SysReg<string name, bits<12> op> {
   string Name = name;
-  // A maximum of one alias is supported right now.
-  string AltName = name;
-  // A maximum of one deprecated name is supported right now.  Unlike the
-  // `AltName` alias, a `DeprecatedName` generates a diagnostic when the name is
-  // used to encourage software to migrate away from the name.
-  string DeprecatedName = "";
   bits<12> Encoding = op;
   // FIXME: add these additional fields when needed.
   // Privilege Access: Read and Write = 0, 1, 2; Read-Only = 3.
@@ -37,14 +31,16 @@ class SysReg<string name, bits<12> op> {
   // bits<6> Number = op{5 - 0};
   code FeaturesRequired = [{ {} }];
   bit isRV32Only = 0;
+  bit isAltName = 0;
+  bit isDeprecatedName = 0;
 }
 
 def SysRegsList : GenericTable {
   let FilterClass = "SysReg";
   // FIXME: add "ReadWrite", "Mode", "Extra", "Number" fields when needed.
   let Fields = [
-    "Name", "AltName", "DeprecatedName", "Encoding", "FeaturesRequired",
-    "isRV32Only",
+    "Name", "Encoding", "FeaturesRequired",
+    "isRV32Only", "isAltName", "isDeprecatedName"
   ];
 
   let PrimaryKey = [ "Encoding" ];
@@ -52,19 +48,15 @@ def SysRegsList : GenericTable {
   let PrimaryKeyReturnRange = true;
 }
 
-def lookupSysRegByName : SearchIndex {
-  let Table = SysRegsList;
-  let Key = [ "Name" ];
-}
-
-def lookupSysRegByAltName : SearchIndex {
-  let Table = SysRegsList;
-  let Key = [ "AltName" ];
+def SysRegEncodings : GenericEnum {
+  let FilterClass = "SysReg";
+  let NameField = "Name";
+  let ValueField = "Encoding";
 }
 
-def lookupSysRegByDeprecatedName : SearchIndex {
+def lookupSysRegByName : SearchIndex {
   let Table = SysRegsList;
-  let Key = [ "DeprecatedName" ];
+  let Key = [ "Name" ];
 }
 
 // The following CSR encodings match those given in Tables 2.2,
@@ -123,15 +115,17 @@ def : SysReg<"senvcfg", 0x10A>;
 def : SysReg<"sscratch", 0x140>;
 def : SysReg<"sepc", 0x141>;
 def : SysReg<"scause", 0x142>;
-let DeprecatedName = "sbadaddr" in
 def : SysReg<"stval", 0x143>;
+let isDeprecatedName = 1 in
+def : SysReg<"sbadaddr", 0x143>;
 def : SysReg<"sip", 0x144>;
 
 //===----------------------------------------------------------------------===//
 // Supervisor Protection and Translation
 //===----------------------------------------------------------------------===//
-let DeprecatedName = "sptbr" in
 def : SysReg<"satp", 0x180>;
+let isDeprecatedName = 1 in
+def : SysReg<"sptbr", 0x180>;
 
 //===----------------------------------------------------------------------===//
 // Quality-of-Service(QoS) Identifiers (Ssqosid)
@@ -245,8 +239,9 @@ def : SysReg<"mstatush", 0x310>;
 def : SysReg<"mscratch", 0x340>;
 def : SysReg<"mepc", 0x341>;
 def : SysReg<"mcause", 0x342>;
-let DeprecatedName = "mbadaddr" in
 def : SysReg<"mtval", 0x343>;
+let isDeprecatedName = 1 in
+def : SysReg<"mbadaddr", 0x343>;
 def : SysReg<"mip", 0x344>;
 def : SysReg<"mtinst", 0x34A>;
 def : SysReg<"mtval2", 0x34B>;
@@ -298,8 +293,9 @@ foreach i = 3...31 in
 //===----------------------------------------------------------------------===//
 // Machine Counter Setup
 //===----------------------------------------------------------------------===//
-let AltName = "mucounteren" in // Privileged spec v1.9.1 Name
 def : SysReg<"mcountinhibit", 0x320>;
+let isAltName = 1 in
+def : SysReg<"mucounteren", 0x320>;
 
 // mhpmevent3-mhpmevent31 at 0x323-0x33F.
 foreach i = 3...31 in
@@ -323,7 +319,10 @@ def : SysReg<"tselect", 0x7A0>;
 def : SysReg<"tdata1", 0x7A1>;
 def : SysReg<"tdata2", 0x7A2>;
 def : SysReg<"tdata3", 0x7A3>;
+def : SysReg<"tinfo", 0x7A4>;
+def : SysReg<"tcontrol", 0x7A5>;
 def : SysReg<"mcontext", 0x7A8>;
+def : SysReg<"mscontext", 0x7AA>;
 
 //===----------------------------------------------------------------------===//
 // Debug Mode Registers
@@ -333,8 +332,9 @@ def : SysReg<"dpc", 0x7B1>;
 
 // "dscratch" is an alternative name for "dscratch0" which appeared in earlier
 // drafts of the RISC-V debug spec
-let AltName = "dscratch" in
 def : SysReg<"dscratch0", 0x7B2>;
+let isAltName = 1 in
+def : SysReg<"dscratch", 0x7B2>;
 def : SysReg<"dscratch1", 0x7B3>;
 
 //===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
index 49192bd..850d624 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
@@ -1663,7 +1663,7 @@ InstructionCost RISCVTTIImpl::getStoreImmCost(Type *Ty,
     return 0;
 
   if (OpInfo.isUniform())
-    // vmv.x.i, vmv.v.x, or vfmv.v.f
+    // vmv.v.i, vmv.v.x, or vfmv.v.f
     // We ignore the cost of the scalar constant materialization to be consistent
     // with how we treat scalar constants themselves just above.
     return 1;
@@ -2329,6 +2329,15 @@ unsigned RISCVTTIImpl::getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
   return std::max<unsigned>(1U, RegWidth.getFixedValue() / ElemWidth);
 }
 
+TTI::AddressingModeKind
+RISCVTTIImpl::getPreferredAddressingMode(const Loop *L,
+                                         ScalarEvolution *SE) const {
+  if (ST->hasVendorXCVmem() && !ST->is64Bit())
+    return TTI::AMK_PostIndexed;
+
+  return BasicTTIImplBase::getPreferredAddressingMode(L, SE);
+}
+
 bool RISCVTTIImpl::isLSRCostLess(const TargetTransformInfo::LSRCost &C1,
                                  const TargetTransformInfo::LSRCost &C2) {
   // RISC-V specific here are "instruction number 1st priority".
@@ -2549,16 +2558,21 @@ RISCVTTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
   TTI::MemCmpExpansionOptions Options;
   // TODO: Enable expansion when unaligned access is not supported after we fix
   // issues in ExpandMemcmp.
-  if (!(ST->enableUnalignedScalarMem() &&
-        (ST->hasStdExtZbb() || ST->hasStdExtZbkb() || IsZeroCmp)))
+  if (!ST->enableUnalignedScalarMem())
+    return Options;
+
+  if (!ST->hasStdExtZbb() && !ST->hasStdExtZbkb() && !IsZeroCmp)
     return Options;
 
   Options.AllowOverlappingLoads = true;
   Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize);
   Options.NumLoadsPerBlock = Options.MaxNumLoads;
-  if (ST->is64Bit())
+  if (ST->is64Bit()) {
     Options.LoadSizes = {8, 4, 2, 1};
-  else
+    Options.AllowedTailExpansions = {3, 5, 6};
+  } else {
     Options.LoadSizes = {4, 2, 1};
+    Options.AllowedTailExpansions = {3};
+  }
   return Options;
 }
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
index bd90bfe..9b36439 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
@@ -388,6 +388,9 @@ public:
     llvm_unreachable("unknown register class");
   }
 
+  TTI::AddressingModeKind getPreferredAddressingMode(const Loop *L,
+                                                     ScalarEvolution *SE) const;
+
   unsigned getRegisterClassForType(bool Vector, Type *Ty = nullptr) const {
     if (Vector)
       return RISCVRegisterClass::VRRC;
diff --git a/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp b/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp
index 4e3212c..ad61a77 100644
--- a/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp
+++ b/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp
@@ -50,7 +50,10 @@ public:
   StringRef getPassName() const override { return PASS_NAME; }
 
 private:
-  bool checkUsers(const MachineOperand *&CommonVL, MachineInstr &MI);
+  std::optional<MachineOperand> getMinimumVLForUser(MachineOperand &UserOp);
+  /// Returns the largest common VL MachineOperand that may be used to optimize
+  /// MI. Returns std::nullopt if it failed to find a suitable VL.
+  std::optional<MachineOperand> checkUsers(MachineInstr &MI);
   bool tryReduceVL(MachineInstr &MI);
   bool isCandidate(const MachineInstr &MI) const;
 };
@@ -76,11 +79,6 @@ static bool isVectorRegClass(Register R, const MachineRegisterInfo *MRI) {
 
 /// Represents the EMUL and EEW of a MachineOperand.
 struct OperandInfo {
-  enum class State {
-    Unknown,
-    Known,
-  } S;
-
   // Represent as 1,2,4,8, ... and fractional indicator. This is because
   // EMUL can take on values that don't map to RISCVII::VLMUL values exactly.
   // For example, a mask operand can have an EMUL less than MF8.
@@ -89,34 +87,32 @@ struct OperandInfo {
   unsigned Log2EEW;
 
   OperandInfo(RISCVII::VLMUL EMUL, unsigned Log2EEW)
-      : S(State::Known), EMUL(RISCVVType::decodeVLMUL(EMUL)), Log2EEW(Log2EEW) {
-  }
+      : EMUL(RISCVVType::decodeVLMUL(EMUL)), Log2EEW(Log2EEW) {}
 
   OperandInfo(std::pair<unsigned, bool> EMUL, unsigned Log2EEW)
-      : S(State::Known), EMUL(EMUL), Log2EEW(Log2EEW) {}
+      : EMUL(EMUL), Log2EEW(Log2EEW) {}
 
-  OperandInfo() : S(State::Unknown) {}
+  OperandInfo(unsigned Log2EEW) : Log2EEW(Log2EEW) {}
 
-  bool isUnknown() const { return S == State::Unknown; }
-  bool isKnown() const { return S == State::Known; }
+  OperandInfo() = delete;
 
   static bool EMULAndEEWAreEqual(const OperandInfo &A, const OperandInfo &B) {
-    assert(A.isKnown() && B.isKnown() && "Both operands must be known");
-
     return A.Log2EEW == B.Log2EEW && A.EMUL->first == B.EMUL->first &&
            A.EMUL->second == B.EMUL->second;
   }
 
+  static bool EEWAreEqual(const OperandInfo &A, const OperandInfo &B) {
+    return A.Log2EEW == B.Log2EEW;
+  }
+
   void print(raw_ostream &OS) const {
-    if (isUnknown()) {
-      OS << "Unknown";
-      return;
-    }
-    assert(EMUL && "Expected EMUL to have value");
-    OS << "EMUL: m";
-    if (EMUL->second)
-      OS << "f";
-    OS << EMUL->first;
+    if (EMUL) {
+      OS << "EMUL: m";
+      if (EMUL->second)
+        OS << "f";
+      OS << EMUL->first;
+    } else
+      OS << "EMUL: unknown\n";
     OS << ", EEW: " << (1 << Log2EEW);
   }
 };
@@ -127,30 +123,18 @@ static raw_ostream &operator<<(raw_ostream &OS, const OperandInfo &OI) {
   return OS;
 }
 
-namespace llvm {
-namespace RISCVVType {
-/// Return the RISCVII::VLMUL that is two times VLMul.
-/// Precondition: VLMul is not LMUL_RESERVED or LMUL_8.
-static RISCVII::VLMUL twoTimesVLMUL(RISCVII::VLMUL VLMul) {
-  switch (VLMul) {
-  case RISCVII::VLMUL::LMUL_F8:
-    return RISCVII::VLMUL::LMUL_F4;
-  case RISCVII::VLMUL::LMUL_F4:
-    return RISCVII::VLMUL::LMUL_F2;
-  case RISCVII::VLMUL::LMUL_F2:
-    return RISCVII::VLMUL::LMUL_1;
-  case RISCVII::VLMUL::LMUL_1:
-    return RISCVII::VLMUL::LMUL_2;
-  case RISCVII::VLMUL::LMUL_2:
-    return RISCVII::VLMUL::LMUL_4;
-  case RISCVII::VLMUL::LMUL_4:
-    return RISCVII::VLMUL::LMUL_8;
-  case RISCVII::VLMUL::LMUL_8:
-  default:
-    llvm_unreachable("Could not multiply VLMul by 2");
-  }
+LLVM_ATTRIBUTE_UNUSED
+static raw_ostream &operator<<(raw_ostream &OS,
+                               const std::optional<OperandInfo> &OI) {
+  if (OI)
+    OI->print(OS);
+  else
+    OS << "nullopt";
+  return OS;
 }
 
+namespace llvm {
+namespace RISCVVType {
 /// Return EMUL = (EEW / SEW) * LMUL where EEW comes from Log2EEW and LMUL and
 /// SEW are from the TSFlags of MI.
 static std::pair<unsigned, bool>
@@ -180,24 +164,22 @@ getEMULEqualsEEWDivSEWTimesLMUL(unsigned Log2EEW, const MachineInstr &MI) {
 } // end namespace RISCVVType
 } // end namespace llvm
 
-/// Dest has EEW=SEW and EMUL=LMUL. Source EEW=SEW/Factor (i.e. F2 => EEW/2).
-/// Source has EMUL=(EEW/SEW)*LMUL. LMUL and SEW comes from TSFlags of MI.
-static OperandInfo getIntegerExtensionOperandInfo(unsigned Factor,
-                                                  const MachineInstr &MI,
-                                                  const MachineOperand &MO) {
-  RISCVII::VLMUL MIVLMul = RISCVII::getLMul(MI.getDesc().TSFlags);
+/// Dest has EEW=SEW. Source EEW=SEW/Factor (i.e. F2 => EEW/2).
+/// SEW comes from TSFlags of MI.
+static unsigned getIntegerExtensionOperandEEW(unsigned Factor,
+                                              const MachineInstr &MI,
+                                              const MachineOperand &MO) {
   unsigned MILog2SEW =
       MI.getOperand(RISCVII::getSEWOpNum(MI.getDesc())).getImm();
 
   if (MO.getOperandNo() == 0)
-    return OperandInfo(MIVLMul, MILog2SEW);
+    return MILog2SEW;
 
   unsigned MISEW = 1 << MILog2SEW;
   unsigned EEW = MISEW / Factor;
   unsigned Log2EEW = Log2_32(EEW);
 
-  return OperandInfo(RISCVVType::getEMULEqualsEEWDivSEWTimesLMUL(Log2EEW, MI),
-                     Log2EEW);
+  return Log2EEW;
 }
 
 /// Check whether MO is a mask operand of MI.
@@ -211,18 +193,15 @@ static bool isMaskOperand(const MachineInstr &MI, const MachineOperand &MO,
   return Desc.operands()[MO.getOperandNo()].RegClass == RISCV::VMV0RegClassID;
 }
 
-/// Return the OperandInfo for MO.
-static OperandInfo getOperandInfo(const MachineOperand &MO,
-                                  const MachineRegisterInfo *MRI) {
+static std::optional<unsigned>
+getOperandLog2EEW(const MachineOperand &MO, const MachineRegisterInfo *MRI) {
   const MachineInstr &MI = *MO.getParent();
   const RISCVVPseudosTable::PseudoInfo *RVV =
       RISCVVPseudosTable::getPseudoInfo(MI.getOpcode());
   assert(RVV && "Could not find MI in PseudoTable");
 
-  // MI has a VLMUL and SEW associated with it. The RVV specification defines
-  // the LMUL and SEW of each operand and definition in relation to MI.VLMUL and
-  // MI.SEW.
-  RISCVII::VLMUL MIVLMul = RISCVII::getLMul(MI.getDesc().TSFlags);
+  // MI has a SEW associated with it. The RVV specification defines
+  // the EEW of each operand and definition in relation to MI.SEW.
   unsigned MILog2SEW =
       MI.getOperand(RISCVII::getSEWOpNum(MI.getDesc())).getImm();
 
@@ -233,13 +212,13 @@ static OperandInfo getOperandInfo(const MachineOperand &MO,
   // since they must preserve the entire register content.
   if (HasPassthru && MO.getOperandNo() == MI.getNumExplicitDefs() &&
       (MO.getReg() != RISCV::NoRegister))
-    return {};
+    return std::nullopt;
 
   bool IsMODef = MO.getOperandNo() == 0;
 
-  // All mask operands have EEW=1, EMUL=(EEW/SEW)*LMUL
+  // All mask operands have EEW=1
   if (isMaskOperand(MI, MO, MRI))
-    return OperandInfo(RISCVVType::getEMULEqualsEEWDivSEWTimesLMUL(0, MI), 0);
+    return 0;
 
   // switch against BaseInstr to reduce number of cases that need to be
   // considered.
@@ -256,55 +235,65 @@ static OperandInfo getOperandInfo(const MachineOperand &MO,
   // Vector Loads and Stores
   // Vector Unit-Stride Instructions
   // Vector Strided Instructions
-  /// Dest EEW encoded in the instruction and EMUL=(EEW/SEW)*LMUL
+  /// Dest EEW encoded in the instruction
+  case RISCV::VLM_V:
+  case RISCV::VSM_V:
+    return 0;
+  case RISCV::VLE8_V:
   case RISCV::VSE8_V:
+  case RISCV::VLSE8_V:
   case RISCV::VSSE8_V:
-    return OperandInfo(RISCVVType::getEMULEqualsEEWDivSEWTimesLMUL(3, MI), 3);
+    return 3;
+  case RISCV::VLE16_V:
   case RISCV::VSE16_V:
+  case RISCV::VLSE16_V:
   case RISCV::VSSE16_V:
-    return OperandInfo(RISCVVType::getEMULEqualsEEWDivSEWTimesLMUL(4, MI), 4);
+    return 4;
+  case RISCV::VLE32_V:
   case RISCV::VSE32_V:
+  case RISCV::VLSE32_V:
   case RISCV::VSSE32_V:
-    return OperandInfo(RISCVVType::getEMULEqualsEEWDivSEWTimesLMUL(5, MI), 5);
+    return 5;
+  case RISCV::VLE64_V:
   case RISCV::VSE64_V:
+  case RISCV::VLSE64_V:
   case RISCV::VSSE64_V:
-    return OperandInfo(RISCVVType::getEMULEqualsEEWDivSEWTimesLMUL(6, MI), 6);
+    return 6;
 
   // Vector Indexed Instructions
   // vs(o|u)xei<eew>.v
-  // Dest/Data (operand 0) EEW=SEW, EMUL=LMUL. Source EEW=<eew> and
-  // EMUL=(EEW/SEW)*LMUL.
+  // Dest/Data (operand 0) EEW=SEW.  Source EEW=<eew>.
   case RISCV::VLUXEI8_V:
   case RISCV::VLOXEI8_V:
   case RISCV::VSUXEI8_V:
   case RISCV::VSOXEI8_V: {
     if (MO.getOperandNo() == 0)
-      return OperandInfo(MIVLMul, MILog2SEW);
-    return OperandInfo(RISCVVType::getEMULEqualsEEWDivSEWTimesLMUL(3, MI), 3);
+      return MILog2SEW;
+    return 3;
   }
   case RISCV::VLUXEI16_V:
   case RISCV::VLOXEI16_V:
   case RISCV::VSUXEI16_V:
   case RISCV::VSOXEI16_V: {
     if (MO.getOperandNo() == 0)
-      return OperandInfo(MIVLMul, MILog2SEW);
-    return OperandInfo(RISCVVType::getEMULEqualsEEWDivSEWTimesLMUL(4, MI), 4);
+      return MILog2SEW;
+    return 4;
   }
   case RISCV::VLUXEI32_V:
   case RISCV::VLOXEI32_V:
   case RISCV::VSUXEI32_V:
   case RISCV::VSOXEI32_V: {
     if (MO.getOperandNo() == 0)
-      return OperandInfo(MIVLMul, MILog2SEW);
-    return OperandInfo(RISCVVType::getEMULEqualsEEWDivSEWTimesLMUL(5, MI), 5);
+      return MILog2SEW;
+    return 5;
   }
   case RISCV::VLUXEI64_V:
   case RISCV::VLOXEI64_V:
   case RISCV::VSUXEI64_V:
   case RISCV::VSOXEI64_V: {
     if (MO.getOperandNo() == 0)
-      return OperandInfo(MIVLMul, MILog2SEW);
-    return OperandInfo(RISCVVType::getEMULEqualsEEWDivSEWTimesLMUL(6, MI), 6);
+      return MILog2SEW;
+    return 6;
   }
 
   // Vector Integer Arithmetic Instructions
@@ -318,7 +307,7 @@ static OperandInfo getOperandInfo(const MachineOperand &MO,
   case RISCV::VRSUB_VX:
   // Vector Bitwise Logical Instructions
   // Vector Single-Width Shift Instructions
-  // EEW=SEW. EMUL=LMUL.
+  // EEW=SEW.
   case RISCV::VAND_VI:
   case RISCV::VAND_VV:
   case RISCV::VAND_VX:
@@ -338,7 +327,7 @@ static OperandInfo getOperandInfo(const MachineOperand &MO,
   case RISCV::VSRA_VV:
   case RISCV::VSRA_VX:
   // Vector Integer Min/Max Instructions
-  // EEW=SEW. EMUL=LMUL.
+  // EEW=SEW.
   case RISCV::VMINU_VV:
   case RISCV::VMINU_VX:
   case RISCV::VMIN_VV:
@@ -348,7 +337,7 @@ static OperandInfo getOperandInfo(const MachineOperand &MO,
   case RISCV::VMAX_VV:
   case RISCV::VMAX_VX:
   // Vector Single-Width Integer Multiply Instructions
-  // Source and Dest EEW=SEW and EMUL=LMUL.
+  // Source and Dest EEW=SEW.
   case RISCV::VMUL_VV:
   case RISCV::VMUL_VX:
   case RISCV::VMULH_VV:
@@ -358,7 +347,7 @@ static OperandInfo getOperandInfo(const MachineOperand &MO,
   case RISCV::VMULHSU_VV:
   case RISCV::VMULHSU_VX:
   // Vector Integer Divide Instructions
-  // EEW=SEW. EMUL=LMUL.
+  // EEW=SEW.
   case RISCV::VDIVU_VV:
   case RISCV::VDIVU_VX:
   case RISCV::VDIV_VV:
@@ -368,7 +357,7 @@ static OperandInfo getOperandInfo(const MachineOperand &MO,
   case RISCV::VREM_VV:
   case RISCV::VREM_VX:
   // Vector Single-Width Integer Multiply-Add Instructions
-  // EEW=SEW. EMUL=LMUL.
+  // EEW=SEW.
   case RISCV::VMACC_VV:
   case RISCV::VMACC_VX:
   case RISCV::VNMSAC_VV:
@@ -379,8 +368,8 @@ static OperandInfo getOperandInfo(const MachineOperand &MO,
   case RISCV::VNMSUB_VX:
   // Vector Integer Merge Instructions
   // Vector Integer Add-with-Carry / Subtract-with-Borrow Instructions
-  // EEW=SEW and EMUL=LMUL, except the mask operand has EEW=1 and EMUL=
-  // (EEW/SEW)*LMUL. Mask operand is handled before this switch.
+  // EEW=SEW, except the mask operand has EEW=1. Mask operand is handled
+  // before this switch.
   case RISCV::VMERGE_VIM:
   case RISCV::VMERGE_VVM:
   case RISCV::VMERGE_VXM:
@@ -393,7 +382,7 @@ static OperandInfo getOperandInfo(const MachineOperand &MO,
   // Vector Fixed-Point Arithmetic Instructions
   // Vector Single-Width Saturating Add and Subtract
   // Vector Single-Width Averaging Add and Subtract
-  // EEW=SEW. EMUL=LMUL.
+  // EEW=SEW.
   case RISCV::VMV_V_I:
   case RISCV::VMV_V_V:
   case RISCV::VMV_V_X:
@@ -415,8 +404,13 @@ static OperandInfo getOperandInfo(const MachineOperand &MO,
   case RISCV::VASUBU_VX:
   case RISCV::VASUB_VV:
   case RISCV::VASUB_VX:
+  // Vector Single-Width Fractional Multiply with Rounding and Saturation
+  // EEW=SEW. The instruction produces 2*SEW product internally but
+  // saturates to fit into SEW bits.
+  case RISCV::VSMUL_VV:
+  case RISCV::VSMUL_VX:
   // Vector Single-Width Scaling Shift Instructions
-  // EEW=SEW. EMUL=LMUL.
+  // EEW=SEW.
   case RISCV::VSSRL_VI:
   case RISCV::VSSRL_VV:
   case RISCV::VSSRL_VX:
@@ -426,13 +420,13 @@ static OperandInfo getOperandInfo(const MachineOperand &MO,
   // Vector Permutation Instructions
   // Integer Scalar Move Instructions
   // Floating-Point Scalar Move Instructions
-  // EMUL=LMUL. EEW=SEW.
+  // EEW=SEW.
   case RISCV::VMV_X_S:
   case RISCV::VMV_S_X:
   case RISCV::VFMV_F_S:
   case RISCV::VFMV_S_F:
   // Vector Slide Instructions
-  // EMUL=LMUL. EEW=SEW.
+  // EEW=SEW.
   case RISCV::VSLIDEUP_VI:
   case RISCV::VSLIDEUP_VX:
   case RISCV::VSLIDEDOWN_VI:
@@ -442,19 +436,62 @@ static OperandInfo getOperandInfo(const MachineOperand &MO,
   case RISCV::VSLIDE1DOWN_VX:
   case RISCV::VFSLIDE1DOWN_VF:
   // Vector Register Gather Instructions
-  // EMUL=LMUL. EEW=SEW. For mask operand, EMUL=1 and EEW=1.
+  // EEW=SEW. For mask operand, EEW=1.
   case RISCV::VRGATHER_VI:
   case RISCV::VRGATHER_VV:
   case RISCV::VRGATHER_VX:
   // Vector Compress Instruction
-  // EMUL=LMUL. EEW=SEW.
+  // EEW=SEW.
   case RISCV::VCOMPRESS_VM:
   // Vector Element Index Instruction
   case RISCV::VID_V:
-    return OperandInfo(MIVLMul, MILog2SEW);
+  // Vector Single-Width Floating-Point Add/Subtract Instructions
+  case RISCV::VFADD_VF:
+  case RISCV::VFADD_VV:
+  case RISCV::VFSUB_VF:
+  case RISCV::VFSUB_VV:
+  case RISCV::VFRSUB_VF:
+  // Vector Single-Width Floating-Point Multiply/Divide Instructions
+  case RISCV::VFMUL_VF:
+  case RISCV::VFMUL_VV:
+  case RISCV::VFDIV_VF:
+  case RISCV::VFDIV_VV:
+  case RISCV::VFRDIV_VF:
+  // Vector Floating-Point Square-Root Instruction
+  case RISCV::VFSQRT_V:
+  // Vector Floating-Point Reciprocal Square-Root Estimate Instruction
+  case RISCV::VFRSQRT7_V:
+  // Vector Floating-Point Reciprocal Estimate Instruction
+  case RISCV::VFREC7_V:
+  // Vector Floating-Point MIN/MAX Instructions
+  case RISCV::VFMIN_VF:
+  case RISCV::VFMIN_VV:
+  case RISCV::VFMAX_VF:
+  case RISCV::VFMAX_VV:
+  // Vector Floating-Point Sign-Injection Instructions
+  case RISCV::VFSGNJ_VF:
+  case RISCV::VFSGNJ_VV:
+  case RISCV::VFSGNJN_VV:
+  case RISCV::VFSGNJN_VF:
+  case RISCV::VFSGNJX_VF:
+  case RISCV::VFSGNJX_VV:
+  // Vector Floating-Point Classify Instruction
+  case RISCV::VFCLASS_V:
+  // Vector Floating-Point Move Instruction
+  case RISCV::VFMV_V_F:
+  // Single-Width Floating-Point/Integer Type-Convert Instructions
+  case RISCV::VFCVT_XU_F_V:
+  case RISCV::VFCVT_X_F_V:
+  case RISCV::VFCVT_RTZ_XU_F_V:
+  case RISCV::VFCVT_RTZ_X_F_V:
+  case RISCV::VFCVT_F_XU_V:
+  case RISCV::VFCVT_F_X_V:
+  // Vector Floating-Point Merge Instruction
+  case RISCV::VFMERGE_VFM:
+    return MILog2SEW;
 
   // Vector Widening Integer Add/Subtract
-  // Def uses EEW=2*SEW and EMUL=2*LMUL. Operands use EEW=SEW and EMUL=LMUL.
+  // Def uses EEW=2*SEW . Operands use EEW=SEW.
   case RISCV::VWADDU_VV:
   case RISCV::VWADDU_VX:
   case RISCV::VWSUBU_VV:
@@ -465,7 +502,7 @@ static OperandInfo getOperandInfo(const MachineOperand &MO,
   case RISCV::VWSUB_VX:
   case RISCV::VWSLL_VI:
   // Vector Widening Integer Multiply Instructions
-  // Source and Destination EMUL=LMUL. Destination EEW=2*SEW. Source EEW=SEW.
+  // Destination EEW=2*SEW. Source EEW=SEW.
   case RISCV::VWMUL_VV:
   case RISCV::VWMUL_VX:
   case RISCV::VWMULSU_VV:
@@ -473,7 +510,7 @@ static OperandInfo getOperandInfo(const MachineOperand &MO,
   case RISCV::VWMULU_VV:
   case RISCV::VWMULU_VX:
   // Vector Widening Integer Multiply-Add Instructions
-  // Destination EEW=2*SEW and EMUL=2*LMUL. Source EEW=SEW and EMUL=LMUL.
+  // Destination EEW=2*SEW. Source EEW=SEW.
   // A SEW-bit*SEW-bit multiply of the sources forms a 2*SEW-bit value, which
   // is then added to the 2*SEW-bit Dest. These instructions never have a
   // passthru operand.
@@ -483,14 +520,38 @@ static OperandInfo getOperandInfo(const MachineOperand &MO,
   case RISCV::VWMACC_VX:
   case RISCV::VWMACCSU_VV:
   case RISCV::VWMACCSU_VX:
-  case RISCV::VWMACCUS_VX: {
+  case RISCV::VWMACCUS_VX:
+  // Vector Widening Floating-Point Fused Multiply-Add Instructions
+  case RISCV::VFWMACC_VF:
+  case RISCV::VFWMACC_VV:
+  case RISCV::VFWNMACC_VF:
+  case RISCV::VFWNMACC_VV:
+  case RISCV::VFWMSAC_VF:
+  case RISCV::VFWMSAC_VV:
+  case RISCV::VFWNMSAC_VF:
+  case RISCV::VFWNMSAC_VV:
+  // Vector Widening Floating-Point Add/Subtract Instructions
+  // Dest EEW=2*SEW. Source EEW=SEW.
+  case RISCV::VFWADD_VV:
+  case RISCV::VFWADD_VF:
+  case RISCV::VFWSUB_VV:
+  case RISCV::VFWSUB_VF:
+  // Vector Widening Floating-Point Multiply
+  case RISCV::VFWMUL_VF:
+  case RISCV::VFWMUL_VV:
+  // Widening Floating-Point/Integer Type-Convert Instructions
+  case RISCV::VFWCVT_XU_F_V:
+  case RISCV::VFWCVT_X_F_V:
+  case RISCV::VFWCVT_RTZ_XU_F_V:
+  case RISCV::VFWCVT_RTZ_X_F_V:
+  case RISCV::VFWCVT_F_XU_V:
+  case RISCV::VFWCVT_F_X_V:
+  case RISCV::VFWCVT_F_F_V: {
     unsigned Log2EEW = IsMODef ? MILog2SEW + 1 : MILog2SEW;
-    RISCVII::VLMUL EMUL =
-        IsMODef ? RISCVVType::twoTimesVLMUL(MIVLMul) : MIVLMul;
-    return OperandInfo(EMUL, Log2EEW);
+    return Log2EEW;
   }
 
-  // Def and Op1 uses EEW=2*SEW and EMUL=2*LMUL. Op2 uses EEW=SEW and EMUL=LMUL
+  // Def and Op1 uses EEW=2*SEW. Op2 uses EEW=SEW.
   case RISCV::VWADDU_WV:
   case RISCV::VWADDU_WX:
   case RISCV::VWSUBU_WV:
@@ -498,29 +559,31 @@ static OperandInfo getOperandInfo(const MachineOperand &MO,
   case RISCV::VWADD_WV:
   case RISCV::VWADD_WX:
   case RISCV::VWSUB_WV:
-  case RISCV::VWSUB_WX: {
+  case RISCV::VWSUB_WX:
+  // Vector Widening Floating-Point Add/Subtract Instructions
+  case RISCV::VFWADD_WF:
+  case RISCV::VFWADD_WV:
+  case RISCV::VFWSUB_WF:
+  case RISCV::VFWSUB_WV: {
     bool IsOp1 = HasPassthru ? MO.getOperandNo() == 2 : MO.getOperandNo() == 1;
     bool TwoTimes = IsMODef || IsOp1;
     unsigned Log2EEW = TwoTimes ? MILog2SEW + 1 : MILog2SEW;
-    RISCVII::VLMUL EMUL =
-        TwoTimes ? RISCVVType::twoTimesVLMUL(MIVLMul) : MIVLMul;
-    return OperandInfo(EMUL, Log2EEW);
+    return Log2EEW;
   }
 
   // Vector Integer Extension
   case RISCV::VZEXT_VF2:
   case RISCV::VSEXT_VF2:
-    return getIntegerExtensionOperandInfo(2, MI, MO);
+    return getIntegerExtensionOperandEEW(2, MI, MO);
   case RISCV::VZEXT_VF4:
   case RISCV::VSEXT_VF4:
-    return getIntegerExtensionOperandInfo(4, MI, MO);
+    return getIntegerExtensionOperandEEW(4, MI, MO);
   case RISCV::VZEXT_VF8:
   case RISCV::VSEXT_VF8:
-    return getIntegerExtensionOperandInfo(8, MI, MO);
+    return getIntegerExtensionOperandEEW(8, MI, MO);
 
   // Vector Narrowing Integer Right Shift Instructions
-  // Destination EEW=SEW and EMUL=LMUL, Op 1 has EEW=2*SEW EMUL=2*LMUL. Op2 has
-  // EEW=SEW EMUL=LMUL.
+  // Destination EEW=SEW, Op 1 has EEW=2*SEW. Op2 has EEW=SEW
   case RISCV::VNSRL_WX:
   case RISCV::VNSRL_WI:
   case RISCV::VNSRL_WV:
@@ -528,19 +591,26 @@ static OperandInfo getOperandInfo(const MachineOperand &MO,
   case RISCV::VNSRA_WV:
   case RISCV::VNSRA_WX:
   // Vector Narrowing Fixed-Point Clip Instructions
-  // Destination and Op1 EEW=SEW and EMUL=LMUL. Op2 EEW=2*SEW and EMUL=2*LMUL
+  // Destination and Op1 EEW=SEW. Op2 EEW=2*SEW.
   case RISCV::VNCLIPU_WI:
   case RISCV::VNCLIPU_WV:
   case RISCV::VNCLIPU_WX:
   case RISCV::VNCLIP_WI:
   case RISCV::VNCLIP_WV:
-  case RISCV::VNCLIP_WX: {
+  case RISCV::VNCLIP_WX:
+  // Narrowing Floating-Point/Integer Type-Convert Instructions
+  case RISCV::VFNCVT_XU_F_W:
+  case RISCV::VFNCVT_X_F_W:
+  case RISCV::VFNCVT_RTZ_XU_F_W:
+  case RISCV::VFNCVT_RTZ_X_F_W:
+  case RISCV::VFNCVT_F_XU_W:
+  case RISCV::VFNCVT_F_X_W:
+  case RISCV::VFNCVT_F_F_W:
+  case RISCV::VFNCVT_ROD_F_F_W: {
     bool IsOp1 = HasPassthru ? MO.getOperandNo() == 2 : MO.getOperandNo() == 1;
     bool TwoTimes = IsOp1;
     unsigned Log2EEW = TwoTimes ? MILog2SEW + 1 : MILog2SEW;
-    RISCVII::VLMUL EMUL =
-        TwoTimes ? RISCVVType::twoTimesVLMUL(MIVLMul) : MIVLMul;
-    return OperandInfo(EMUL, Log2EEW);
+    return Log2EEW;
   }
 
   // Vector Mask Instructions
@@ -548,7 +618,7 @@ static OperandInfo getOperandInfo(const MachineOperand &MO,
   // vmsbf.m set-before-first mask bit
   // vmsif.m set-including-first mask bit
   // vmsof.m set-only-first mask bit
-  // EEW=1 and EMUL=(EEW/SEW)*LMUL
+  // EEW=1
   // We handle the cases when operand is a v0 mask operand above the switch,
   // but these instructions may use non-v0 mask operands and need to be handled
   // specifically.
@@ -563,20 +633,20 @@ static OperandInfo getOperandInfo(const MachineOperand &MO,
   case RISCV::VMSBF_M:
   case RISCV::VMSIF_M:
   case RISCV::VMSOF_M: {
-    return OperandInfo(RISCVVType::getEMULEqualsEEWDivSEWTimesLMUL(0, MI), 0);
+    return 0;
   }
 
   // Vector Iota Instruction
-  // EEW=SEW and EMUL=LMUL, except the mask operand has EEW=1 and EMUL=
-  // (EEW/SEW)*LMUL. Mask operand is not handled before this switch.
+  // EEW=SEW, except the mask operand has EEW=1. Mask operand is not handled
+  // before this switch.
   case RISCV::VIOTA_M: {
     if (IsMODef || MO.getOperandNo() == 1)
-      return OperandInfo(MIVLMul, MILog2SEW);
-    return OperandInfo(RISCVVType::getEMULEqualsEEWDivSEWTimesLMUL(0, MI), 0);
+      return MILog2SEW;
+    return 0;
   }
 
   // Vector Integer Compare Instructions
-  // Dest EEW=1 and EMUL=(EEW/SEW)*LMUL. Source EEW=SEW and EMUL=LMUL.
+  // Dest EEW=1. Source EEW=SEW.
   case RISCV::VMSEQ_VI:
   case RISCV::VMSEQ_VV:
   case RISCV::VMSEQ_VX:
@@ -598,29 +668,87 @@ static OperandInfo getOperandInfo(const MachineOperand &MO,
   case RISCV::VMSGT_VI:
   case RISCV::VMSGT_VX:
   // Vector Integer Add-with-Carry / Subtract-with-Borrow Instructions
-  // Dest EEW=1 and EMUL=(EEW/SEW)*LMUL. Source EEW=SEW and EMUL=LMUL. Mask
-  // source operand handled above this switch.
+  // Dest EEW=1. Source EEW=SEW. Mask source operand handled above this switch.
   case RISCV::VMADC_VIM:
   case RISCV::VMADC_VVM:
   case RISCV::VMADC_VXM:
   case RISCV::VMSBC_VVM:
   case RISCV::VMSBC_VXM:
-  // Dest EEW=1 and EMUL=(EEW/SEW)*LMUL. Source EEW=SEW and EMUL=LMUL.
+  // Dest EEW=1. Source EEW=SEW.
   case RISCV::VMADC_VV:
   case RISCV::VMADC_VI:
   case RISCV::VMADC_VX:
   case RISCV::VMSBC_VV:
-  case RISCV::VMSBC_VX: {
+  case RISCV::VMSBC_VX:
+  // 13.13. Vector Floating-Point Compare Instructions
+  // Dest EEW=1. Source EEW=SEW
+  case RISCV::VMFEQ_VF:
+  case RISCV::VMFEQ_VV:
+  case RISCV::VMFNE_VF:
+  case RISCV::VMFNE_VV:
+  case RISCV::VMFLT_VF:
+  case RISCV::VMFLT_VV:
+  case RISCV::VMFLE_VF:
+  case RISCV::VMFLE_VV:
+  case RISCV::VMFGT_VF:
+  case RISCV::VMFGE_VF: {
     if (IsMODef)
-      return OperandInfo(RISCVVType::getEMULEqualsEEWDivSEWTimesLMUL(0, MI), 0);
-    return OperandInfo(MIVLMul, MILog2SEW);
+      return 0;
+    return MILog2SEW;
+  }
+
+  // Vector Reduction Operations
+  // Vector Single-Width Integer Reduction Instructions
+  case RISCV::VREDAND_VS:
+  case RISCV::VREDMAX_VS:
+  case RISCV::VREDMAXU_VS:
+  case RISCV::VREDMIN_VS:
+  case RISCV::VREDMINU_VS:
+  case RISCV::VREDOR_VS:
+  case RISCV::VREDSUM_VS:
+  case RISCV::VREDXOR_VS: {
+    return MILog2SEW;
   }
 
   default:
-    return {};
+    return std::nullopt;
   }
 }
 
+static std::optional<OperandInfo>
+getOperandInfo(const MachineOperand &MO, const MachineRegisterInfo *MRI) {
+  const MachineInstr &MI = *MO.getParent();
+  const RISCVVPseudosTable::PseudoInfo *RVV =
+      RISCVVPseudosTable::getPseudoInfo(MI.getOpcode());
+  assert(RVV && "Could not find MI in PseudoTable");
+
+  std::optional<unsigned> Log2EEW = getOperandLog2EEW(MO, MRI);
+  if (!Log2EEW)
+    return std::nullopt;
+
+  switch (RVV->BaseInstr) {
+  // Vector Reduction Operations
+  // Vector Single-Width Integer Reduction Instructions
+  // The Dest and VS1 only read element 0 of the vector register. Return just
+  // the EEW for these.
+  case RISCV::VREDAND_VS:
+  case RISCV::VREDMAX_VS:
+  case RISCV::VREDMAXU_VS:
+  case RISCV::VREDMIN_VS:
+  case RISCV::VREDMINU_VS:
+  case RISCV::VREDOR_VS:
+  case RISCV::VREDSUM_VS:
+  case RISCV::VREDXOR_VS:
+    if (MO.getOperandNo() != 2)
+      return OperandInfo(*Log2EEW);
+    break;
+  };
+
+  // All others have EMUL=EEW/SEW*LMUL
+  return OperandInfo(RISCVVType::getEMULEqualsEEWDivSEWTimesLMUL(*Log2EEW, MI),
+                     *Log2EEW);
+}
+
 /// Return true if this optimization should consider MI for VL reduction. This
 /// white-list approach simplifies this optimization for instructions that may
 /// have more complex semantics with relation to how it uses VL.
@@ -632,6 +760,32 @@ static bool isSupportedInstr(const MachineInstr &MI) {
     return false;
 
   switch (RVV->BaseInstr) {
+  // Vector Unit-Stride Instructions
+  // Vector Strided Instructions
+  case RISCV::VLM_V:
+  case RISCV::VLE8_V:
+  case RISCV::VLSE8_V:
+  case RISCV::VLE16_V:
+  case RISCV::VLSE16_V:
+  case RISCV::VLE32_V:
+  case RISCV::VLSE32_V:
+  case RISCV::VLE64_V:
+  case RISCV::VLSE64_V:
+  // Vector Indexed Instructions
+  case RISCV::VLUXEI8_V:
+  case RISCV::VLOXEI8_V:
+  case RISCV::VLUXEI16_V:
+  case RISCV::VLOXEI16_V:
+  case RISCV::VLUXEI32_V:
+  case RISCV::VLOXEI32_V:
+  case RISCV::VLUXEI64_V:
+  case RISCV::VLOXEI64_V: {
+    for (const MachineMemOperand *MMO : MI.memoperands())
+      if (MMO->isVolatile())
+        return false;
+    return true;
+  }
+
   // Vector Single-Width Integer Add and Subtract
   case RISCV::VADD_VI:
   case RISCV::VADD_VV:
@@ -801,6 +955,30 @@ static bool isSupportedInstr(const MachineInstr &MI) {
   case RISCV::VMSOF_M:
   case RISCV::VIOTA_M:
   case RISCV::VID_V:
+  // Single-Width Floating-Point/Integer Type-Convert Instructions
+  case RISCV::VFCVT_XU_F_V:
+  case RISCV::VFCVT_X_F_V:
+  case RISCV::VFCVT_RTZ_XU_F_V:
+  case RISCV::VFCVT_RTZ_X_F_V:
+  case RISCV::VFCVT_F_XU_V:
+  case RISCV::VFCVT_F_X_V:
+  // Widening Floating-Point/Integer Type-Convert Instructions
+  case RISCV::VFWCVT_XU_F_V:
+  case RISCV::VFWCVT_X_F_V:
+  case RISCV::VFWCVT_RTZ_XU_F_V:
+  case RISCV::VFWCVT_RTZ_X_F_V:
+  case RISCV::VFWCVT_F_XU_V:
+  case RISCV::VFWCVT_F_X_V:
+  case RISCV::VFWCVT_F_F_V:
+  // Narrowing Floating-Point/Integer Type-Convert Instructions
+  case RISCV::VFNCVT_XU_F_W:
+  case RISCV::VFNCVT_X_F_W:
+  case RISCV::VFNCVT_RTZ_XU_F_W:
+  case RISCV::VFNCVT_RTZ_X_F_W:
+  case RISCV::VFNCVT_F_XU_W:
+  case RISCV::VFNCVT_F_X_W:
+  case RISCV::VFNCVT_F_F_W:
+  case RISCV::VFNCVT_ROD_F_F_W:
     return true;
   }
 
@@ -835,6 +1013,9 @@ static bool isVectorOpUsedAsScalarOp(MachineOperand &MO) {
   case RISCV::VFWREDOSUM_VS:
   case RISCV::VFWREDUSUM_VS:
     return MO.getOperandNo() == 3;
+  case RISCV::VMV_X_S:
+  case RISCV::VFMV_F_S:
+    return MO.getOperandNo() == 1;
   default:
     return false;
   }
@@ -904,6 +1085,11 @@ bool RISCVVLOptimizer::isCandidate(const MachineInstr &MI) const {
     return false;
   }
 
+  if (MI.mayRaiseFPException()) {
+    LLVM_DEBUG(dbgs() << "Not a candidate because may raise FP exception\n");
+    return false;
+  }
+
   // Some instructions that produce vectors have semantics that make it more
   // difficult to determine whether the VL can be reduced. For example, some
   // instructions, such as reductions, may write lanes past VL to a scalar
@@ -925,79 +1111,103 @@ bool RISCVVLOptimizer::isCandidate(const MachineInstr &MI) const {
   return true;
 }
 
-bool RISCVVLOptimizer::checkUsers(const MachineOperand *&CommonVL,
-                                  MachineInstr &MI) {
+std::optional<MachineOperand>
+RISCVVLOptimizer::getMinimumVLForUser(MachineOperand &UserOp) {
+  const MachineInstr &UserMI = *UserOp.getParent();
+  const MCInstrDesc &Desc = UserMI.getDesc();
+
+  if (!RISCVII::hasVLOp(Desc.TSFlags) || !RISCVII::hasSEWOp(Desc.TSFlags)) {
+    LLVM_DEBUG(dbgs() << "    Abort due to lack of VL, assume that"
+                         " use VLMAX\n");
+    return std::nullopt;
+  }
+
+  // Instructions like reductions may use a vector register as a scalar
+  // register. In this case, we should treat it as only reading the first lane.
+  if (isVectorOpUsedAsScalarOp(UserOp)) {
+    [[maybe_unused]] Register R = UserOp.getReg();
+    [[maybe_unused]] const TargetRegisterClass *RC = MRI->getRegClass(R);
+    assert(RISCV::VRRegClass.hasSubClassEq(RC) &&
+           "Expect LMUL 1 register class for vector as scalar operands!");
+    LLVM_DEBUG(dbgs() << "    Used this operand as a scalar operand\n");
+
+    return MachineOperand::CreateImm(1);
+  }
+
+  unsigned VLOpNum = RISCVII::getVLOpNum(Desc);
+  const MachineOperand &VLOp = UserMI.getOperand(VLOpNum);
+  // Looking for an immediate or a register VL that isn't X0.
+  assert((!VLOp.isReg() || VLOp.getReg() != RISCV::X0) &&
+         "Did not expect X0 VL");
+  return VLOp;
+}
+
+std::optional<MachineOperand> RISCVVLOptimizer::checkUsers(MachineInstr &MI) {
   // FIXME: Avoid visiting each user for each time we visit something on the
   // worklist, combined with an extra visit from the outer loop. Restructure
   // along lines of an instcombine style worklist which integrates the outer
   // pass.
-  bool CanReduceVL = true;
+  std::optional<MachineOperand> CommonVL;
   for (auto &UserOp : MRI->use_operands(MI.getOperand(0).getReg())) {
     const MachineInstr &UserMI = *UserOp.getParent();
     LLVM_DEBUG(dbgs() << "  Checking user: " << UserMI << "\n");
-
-    // Instructions like reductions may use a vector register as a scalar
-    // register. In this case, we should treat it like a scalar register which
-    // does not impact the decision on whether to optimize VL.
-    // TODO: Treat it like a scalar register instead of bailing out.
-    if (isVectorOpUsedAsScalarOp(UserOp)) {
-      CanReduceVL = false;
-      break;
-    }
-
     if (mayReadPastVL(UserMI)) {
       LLVM_DEBUG(dbgs() << "    Abort because used by unsafe instruction\n");
-      CanReduceVL = false;
-      break;
+      return std::nullopt;
     }
 
     // Tied operands might pass through.
     if (UserOp.isTied()) {
       LLVM_DEBUG(dbgs() << "    Abort because user used as tied operand\n");
-      CanReduceVL = false;
-      break;
-    }
-
-    const MCInstrDesc &Desc = UserMI.getDesc();
-    if (!RISCVII::hasVLOp(Desc.TSFlags) || !RISCVII::hasSEWOp(Desc.TSFlags)) {
-      LLVM_DEBUG(dbgs() << "    Abort due to lack of VL or SEW, assume that"
-                           " use VLMAX\n");
-      CanReduceVL = false;
-      break;
+      return std::nullopt;
     }
 
-    unsigned VLOpNum = RISCVII::getVLOpNum(Desc);
-    const MachineOperand &VLOp = UserMI.getOperand(VLOpNum);
-
-    // Looking for an immediate or a register VL that isn't X0.
-    assert((!VLOp.isReg() || VLOp.getReg() != RISCV::X0) &&
-           "Did not expect X0 VL");
+    auto VLOp = getMinimumVLForUser(UserOp);
+    if (!VLOp)
+      return std::nullopt;
 
     // Use the largest VL among all the users. If we cannot determine this
     // statically, then we cannot optimize the VL.
-    if (!CommonVL || RISCV::isVLKnownLE(*CommonVL, VLOp)) {
-      CommonVL = &VLOp;
+    if (!CommonVL || RISCV::isVLKnownLE(*CommonVL, *VLOp)) {
+      CommonVL = *VLOp;
       LLVM_DEBUG(dbgs() << "    User VL is: " << VLOp << "\n");
-    } else if (!RISCV::isVLKnownLE(VLOp, *CommonVL)) {
+    } else if (!RISCV::isVLKnownLE(*VLOp, *CommonVL)) {
       LLVM_DEBUG(dbgs() << "    Abort because cannot determine a common VL\n");
-      CanReduceVL = false;
-      break;
+      return std::nullopt;
+    }
+
+    if (!RISCVII::hasSEWOp(UserMI.getDesc().TSFlags)) {
+      LLVM_DEBUG(dbgs() << "    Abort due to lack of SEW operand\n");
+      return std::nullopt;
+    }
+
+    std::optional<OperandInfo> ConsumerInfo = getOperandInfo(UserOp, MRI);
+    std::optional<OperandInfo> ProducerInfo =
+        getOperandInfo(MI.getOperand(0), MRI);
+    if (!ConsumerInfo || !ProducerInfo) {
+      LLVM_DEBUG(dbgs() << "    Abort due to unknown operand information.\n");
+      LLVM_DEBUG(dbgs() << "      ConsumerInfo is: " << ConsumerInfo << "\n");
+      LLVM_DEBUG(dbgs() << "      ProducerInfo is: " << ProducerInfo << "\n");
+      return std::nullopt;
     }
 
-    // The SEW and LMUL of destination and source registers need to match.
-    OperandInfo ConsumerInfo = getOperandInfo(UserOp, MRI);
-    OperandInfo ProducerInfo = getOperandInfo(MI.getOperand(0), MRI);
-    if (ConsumerInfo.isUnknown() || ProducerInfo.isUnknown() ||
-        !OperandInfo::EMULAndEEWAreEqual(ConsumerInfo, ProducerInfo)) {
-      LLVM_DEBUG(dbgs() << "    Abort due to incompatible or unknown "
-                           "information for EMUL or EEW.\n");
+    // If the operand is used as a scalar operand, then the EEW must be
+    // compatible. Otherwise, the EMUL *and* EEW must be compatible.
+    bool IsVectorOpUsedAsScalarOp = isVectorOpUsedAsScalarOp(UserOp);
+    if ((IsVectorOpUsedAsScalarOp &&
+         !OperandInfo::EEWAreEqual(*ConsumerInfo, *ProducerInfo)) ||
+        (!IsVectorOpUsedAsScalarOp &&
+         !OperandInfo::EMULAndEEWAreEqual(*ConsumerInfo, *ProducerInfo))) {
+      LLVM_DEBUG(
+          dbgs()
+          << "    Abort due to incompatible information for EMUL or EEW.\n");
       LLVM_DEBUG(dbgs() << "      ConsumerInfo is: " << ConsumerInfo << "\n");
       LLVM_DEBUG(dbgs() << "      ProducerInfo is: " << ProducerInfo << "\n");
-      CanReduceVL = false;
-      break;
+      return std::nullopt;
     }
   }
-  return CanReduceVL;
+
+  return CommonVL;
 }
 
 bool RISCVVLOptimizer::tryReduceVL(MachineInstr &OrigMI) {
@@ -1009,12 +1219,11 @@ bool RISCVVLOptimizer::tryReduceVL(MachineInstr &OrigMI) {
     MachineInstr &MI = *Worklist.pop_back_val();
     LLVM_DEBUG(dbgs() << "Trying to reduce VL for " << MI << "\n");
 
-    const MachineOperand *CommonVL = nullptr;
-    bool CanReduceVL = true;
-    if (isVectorRegClass(MI.getOperand(0).getReg(), MRI))
-      CanReduceVL = checkUsers(CommonVL, MI);
+    if (!isVectorRegClass(MI.getOperand(0).getReg(), MRI))
+      continue;
 
-    if (!CanReduceVL || !CommonVL)
+    auto CommonVL = checkUsers(MI);
+    if (!CommonVL)
       continue;
 
     assert((CommonVL->isImm() || CommonVL->getReg().isVirtual()) &&
diff --git a/llvm/lib/Target/SPIRV/CMakeLists.txt b/llvm/lib/Target/SPIRV/CMakeLists.txt
index aa83d99..a79e19f 100644
--- a/llvm/lib/Target/SPIRV/CMakeLists.txt
+++ b/llvm/lib/Target/SPIRV/CMakeLists.txt
@@ -20,7 +20,6 @@ add_llvm_target(SPIRVCodeGen
   SPIRVCallLowering.cpp
   SPIRVInlineAsmLowering.cpp
   SPIRVCommandLine.cpp
-  SPIRVDuplicatesTracker.cpp
   SPIRVEmitIntrinsics.cpp
   SPIRVGlobalRegistry.cpp
   SPIRVInstrInfo.cpp
diff --git a/llvm/lib/Target/SPIRV/SPIRVAsmPrinter.cpp b/llvm/lib/Target/SPIRV/SPIRVAsmPrinter.cpp
index 4012bd7..78add92 100644
--- a/llvm/lib/Target/SPIRV/SPIRVAsmPrinter.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVAsmPrinter.cpp
@@ -274,7 +274,7 @@ void SPIRVAsmPrinter::emitInstruction(const MachineInstr *MI) {
 }
 
 void SPIRVAsmPrinter::outputModuleSection(SPIRV::ModuleSectionType MSType) {
-  for (MachineInstr *MI : MAI->getMSInstrs(MSType))
+  for (const MachineInstr *MI : MAI->getMSInstrs(MSType))
     outputInstruction(MI);
 }
 
@@ -326,7 +326,7 @@ void SPIRVAsmPrinter::outputOpMemoryModel() {
 void SPIRVAsmPrinter::outputEntryPoints() {
   // Find all OpVariable IDs with required StorageClass.
   DenseSet<Register> InterfaceIDs;
-  for (MachineInstr *MI : MAI->GlobalVarList) {
+  for (const MachineInstr *MI : MAI->GlobalVarList) {
     assert(MI->getOpcode() == SPIRV::OpVariable);
     auto SC = static_cast<SPIRV::StorageClass::StorageClass>(
         MI->getOperand(2).getImm());
@@ -336,14 +336,14 @@ void SPIRVAsmPrinter::outputEntryPoints() {
     // declaring all global variables referenced by the entry point call tree.
     if (ST->isAtLeastSPIRVVer(VersionTuple(1, 4)) ||
         SC == SPIRV::StorageClass::Input || SC == SPIRV::StorageClass::Output) {
-      MachineFunction *MF = MI->getMF();
+      const MachineFunction *MF = MI->getMF();
       Register Reg = MAI->getRegisterAlias(MF, MI->getOperand(0).getReg());
       InterfaceIDs.insert(Reg);
     }
   }
 
   // Output OpEntryPoints adding interface args to all of them.
-  for (MachineInstr *MI : MAI->getMSInstrs(SPIRV::MB_EntryPoints)) {
+  for (const MachineInstr *MI : MAI->getMSInstrs(SPIRV::MB_EntryPoints)) {
     SPIRVMCInstLower MCInstLowering;
     MCInst TmpInst;
     MCInstLowering.lower(MI, TmpInst, MAI);
@@ -381,9 +381,8 @@ void SPIRVAsmPrinter::outputGlobalRequirements() {
 
 void SPIRVAsmPrinter::outputExtFuncDecls() {
   // Insert OpFunctionEnd after each declaration.
-  SmallVectorImpl<MachineInstr *>::iterator
-      I = MAI->getMSInstrs(SPIRV::MB_ExtFuncDecls).begin(),
-      E = MAI->getMSInstrs(SPIRV::MB_ExtFuncDecls).end();
+  auto I = MAI->getMSInstrs(SPIRV::MB_ExtFuncDecls).begin(),
+       E = MAI->getMSInstrs(SPIRV::MB_ExtFuncDecls).end();
   for (; I != E; ++I) {
     outputInstruction(*I);
     if ((I + 1) == E || (*(I + 1))->getOpcode() == SPIRV::OpFunction)
diff --git a/llvm/lib/Target/SPIRV/SPIRVCallLowering.cpp b/llvm/lib/Target/SPIRV/SPIRVCallLowering.cpp
index fa37313f..44b6f5f8 100644
--- a/llvm/lib/Target/SPIRV/SPIRVCallLowering.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVCallLowering.cpp
@@ -418,6 +418,7 @@ bool SPIRVCallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder,
                                .addImm(FuncControl)
                                .addUse(GR->getSPIRVTypeID(FuncTy));
   GR->recordFunctionDefinition(&F, &MB.getInstr()->getOperand(0));
+  GR->addGlobalObject(&F, &MIRBuilder.getMF(), FuncVReg);
 
   // Add OpFunctionParameter instructions
   int i = 0;
@@ -431,6 +432,7 @@ bool SPIRVCallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder,
         .addUse(GR->getSPIRVTypeID(ArgTypeVRegs[i]));
     if (F.isDeclaration())
       GR->add(&Arg, &MIRBuilder.getMF(), ArgReg);
+    GR->addGlobalObject(&Arg, &MIRBuilder.getMF(), ArgReg);
     i++;
   }
   // Name the function.
diff --git a/llvm/lib/Target/SPIRV/SPIRVDuplicatesTracker.cpp b/llvm/lib/Target/SPIRV/SPIRVDuplicatesTracker.cpp
deleted file mode 100644
index 48df845..0000000
--- a/llvm/lib/Target/SPIRV/SPIRVDuplicatesTracker.cpp
+++ /dev/null
@@ -1,136 +0,0 @@
-//===-- SPIRVDuplicatesTracker.cpp - SPIR-V Duplicates Tracker --*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// General infrastructure for keeping track of the values that according to
-// the SPIR-V binary layout should be global to the whole module.
-//
-//===----------------------------------------------------------------------===//
-
-#include "SPIRVDuplicatesTracker.h"
-#include "SPIRVInstrInfo.h"
-
-#define DEBUG_TYPE "build-dep-graph"
-
-using namespace llvm;
-
-template <typename T>
-void SPIRVGeneralDuplicatesTracker::prebuildReg2Entry(
-    SPIRVDuplicatesTracker<T> &DT, SPIRVReg2EntryTy &Reg2Entry,
-    const SPIRVInstrInfo *TII) {
-  for (auto &TPair : DT.getAllUses()) {
-    for (auto &RegPair : TPair.second) {
-      const MachineFunction *MF = RegPair.first;
-      Register R = RegPair.second;
-      MachineInstr *MI = MF->getRegInfo().getUniqueVRegDef(R);
-      if (!MI || (TPair.second.getIsConst() && !TII->isConstantInstr(*MI)))
-        continue;
-      Reg2Entry[&MI->getOperand(0)] = &TPair.second;
-    }
-  }
-}
-
-void SPIRVGeneralDuplicatesTracker::buildDepsGraph(
-    std::vector<SPIRV::DTSortableEntry *> &Graph, const SPIRVInstrInfo *TII,
-    MachineModuleInfo *MMI = nullptr) {
-  SPIRVReg2EntryTy Reg2Entry;
-  prebuildReg2Entry(TT, Reg2Entry, TII);
-  prebuildReg2Entry(CT, Reg2Entry, TII);
-  prebuildReg2Entry(GT, Reg2Entry, TII);
-  prebuildReg2Entry(FT, Reg2Entry, TII);
-  prebuildReg2Entry(AT, Reg2Entry, TII);
-  prebuildReg2Entry(MT, Reg2Entry, TII);
-  prebuildReg2Entry(ST, Reg2Entry, TII);
-
-  for (auto &Op2E : Reg2Entry) {
-    SPIRV::DTSortableEntry *E = Op2E.second;
-    Graph.push_back(E);
-    for (auto &U : *E) {
-      const MachineRegisterInfo &MRI = U.first->getRegInfo();
-      MachineInstr *MI = MRI.getUniqueVRegDef(U.second);
-      if (!MI)
-        continue;
-      assert(MI && MI->getParent() && "No MachineInstr created yet");
-      for (auto i = MI->getNumDefs(); i < MI->getNumOperands(); i++) {
-        MachineOperand &Op = MI->getOperand(i);
-        if (!Op.isReg())
-          continue;
-        MachineInstr *VRegDef = MRI.getVRegDef(Op.getReg());
-        // References to a function via function pointers generate virtual
-        // registers without a definition. We are able to resolve this
-        // reference using Globar Register info into an OpFunction instruction
-        // but do not expect to find it in Reg2Entry.
-        if (MI->getOpcode() == SPIRV::OpConstantFunctionPointerINTEL && i == 2)
-          continue;
-        MachineOperand *RegOp = &VRegDef->getOperand(0);
-        if (Reg2Entry.count(RegOp) == 0 &&
-            (MI->getOpcode() != SPIRV::OpVariable || i != 3)) {
-          // try to repair the unexpected code pattern
-          bool IsFixed = false;
-          if (VRegDef->getOpcode() == TargetOpcode::G_CONSTANT &&
-              RegOp->isReg() && MRI.getType(RegOp->getReg()).isScalar()) {
-            const Constant *C = VRegDef->getOperand(1).getCImm();
-            add(C, MI->getParent()->getParent(), RegOp->getReg());
-            auto Iter = CT.Storage.find(C);
-            if (Iter != CT.Storage.end()) {
-              SPIRV::DTSortableEntry &MissedEntry = Iter->second;
-              Reg2Entry[RegOp] = &MissedEntry;
-              IsFixed = true;
-            }
-          }
-          if (!IsFixed) {
-            std::string DiagMsg;
-            raw_string_ostream OS(DiagMsg);
-            OS << "Unexpected pattern while building a dependency "
-                  "graph.\nInstruction: ";
-            MI->print(OS);
-            OS << "Operand: ";
-            Op.print(OS);
-            OS << "\nOperand definition: ";
-            VRegDef->print(OS);
-            report_fatal_error(DiagMsg.c_str());
-          }
-        }
-        if (Reg2Entry.count(RegOp))
-          E->addDep(Reg2Entry[RegOp]);
-      }
-
-      if (E->getIsFunc()) {
-        MachineInstr *Next = MI->getNextNode();
-        if (Next && (Next->getOpcode() == SPIRV::OpFunction ||
-                     Next->getOpcode() == SPIRV::OpFunctionParameter)) {
-          E->addDep(Reg2Entry[&Next->getOperand(0)]);
-        }
-      }
-    }
-  }
-
-#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
-  if (MMI) {
-    const Module *M = MMI->getModule();
-    for (auto F = M->begin(), E = M->end(); F != E; ++F) {
-      const MachineFunction *MF = MMI->getMachineFunction(*F);
-      if (!MF)
-        continue;
-      for (const MachineBasicBlock &MBB : *MF) {
-        for (const MachineInstr &CMI : MBB) {
-          MachineInstr &MI = const_cast<MachineInstr &>(CMI);
-          MI.dump();
-          if (MI.getNumExplicitDefs() > 0 &&
-              Reg2Entry.count(&MI.getOperand(0))) {
-            dbgs() << "\t[";
-            for (SPIRV::DTSortableEntry *D :
-                 Reg2Entry.lookup(&MI.getOperand(0))->getDeps())
-              dbgs() << Register::virtReg2Index(D->lookup(MF)) << ", ";
-            dbgs() << "]\n";
-          }
-        }
-      }
-    }
-  }
-#endif
-}
diff --git a/llvm/lib/Target/SPIRV/SPIRVDuplicatesTracker.h b/llvm/lib/Target/SPIRV/SPIRVDuplicatesTracker.h
index 6847da0..e574892 100644
--- a/llvm/lib/Target/SPIRV/SPIRVDuplicatesTracker.h
+++ b/llvm/lib/Target/SPIRV/SPIRVDuplicatesTracker.h
@@ -211,23 +211,7 @@ class SPIRVGeneralDuplicatesTracker {
   SPIRVDuplicatesTracker<MachineInstr> MT;
   SPIRVDuplicatesTracker<SPIRV::SpecialTypeDescriptor> ST;
 
-  // NOTE: using MOs instead of regs to get rid of MF dependency to be able
-  // to use flat data structure.
-  // NOTE: replacing DenseMap with MapVector doesn't affect overall correctness
-  // but makes LITs more stable, should prefer DenseMap still due to
-  // significant perf difference.
-  using SPIRVReg2EntryTy =
-      MapVector<MachineOperand *, SPIRV::DTSortableEntry *>;
-
-  template <typename T>
-  void prebuildReg2Entry(SPIRVDuplicatesTracker<T> &DT,
-                         SPIRVReg2EntryTy &Reg2Entry,
-                         const SPIRVInstrInfo *TII);
-
 public:
-  void buildDepsGraph(std::vector<SPIRV::DTSortableEntry *> &Graph,
-                      const SPIRVInstrInfo *TII, MachineModuleInfo *MMI);
-
   void add(const Type *Ty, const MachineFunction *MF, Register R) {
     TT.add(unifyPtrType(Ty), MF, R);
   }
diff --git a/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp b/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp
index 77b5421..d2b14d6 100644
--- a/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp
@@ -1841,20 +1841,20 @@ void SPIRVEmitIntrinsics::processGlobalValue(GlobalVariable &GV,
   // Skip special artifical variable llvm.global.annotations.
   if (GV.getName() == "llvm.global.annotations")
     return;
-  if (GV.hasInitializer() && !isa<UndefValue>(GV.getInitializer())) {
+  Constant *Init = nullptr;
+  if (hasInitializer(&GV)) {
     // Deduce element type and store results in Global Registry.
     // Result is ignored, because TypedPointerType is not supported
     // by llvm IR general logic.
     deduceElementTypeHelper(&GV, false);
-    Constant *Init = GV.getInitializer();
+    Init = GV.getInitializer();
     Type *Ty = isAggrConstForceInt32(Init) ? B.getInt32Ty() : Init->getType();
     Constant *Const = isAggrConstForceInt32(Init) ? B.getInt32(1) : Init;
     auto *InitInst = B.CreateIntrinsic(Intrinsic::spv_init_global,
                                        {GV.getType(), Ty}, {&GV, Const});
     InitInst->setArgOperand(1, Init);
   }
-  if ((!GV.hasInitializer() || isa<UndefValue>(GV.getInitializer())) &&
-      GV.getNumUses() == 0)
+  if (!Init && GV.getNumUses() == 0)
     B.CreateIntrinsic(Intrinsic::spv_unref_global, GV.getType(), &GV);
 }
 
diff --git a/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp b/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp
index 0c424477..a06c62e 100644
--- a/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp
@@ -721,6 +721,7 @@ Register SPIRVGlobalRegistry::buildGlobalVariable(
   }
   Reg = MIB->getOperand(0).getReg();
   DT.add(GVar, &MIRBuilder.getMF(), Reg);
+  addGlobalObject(GVar, &MIRBuilder.getMF(), Reg);
 
   // Set to Reg the same type as ResVReg has.
   auto MRI = MIRBuilder.getMRI();
diff --git a/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.h b/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.h
index ec2386fa..528baf5 100644
--- a/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.h
+++ b/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.h
@@ -89,6 +89,9 @@ class SPIRVGlobalRegistry {
   // Intrinsic::spv_assign_ptr_type instructions.
   DenseMap<Value *, CallInst *> AssignPtrTypeInstr;
 
+  // Maps OpVariable and OpFunction-related v-regs to its LLVM IR definition.
+  DenseMap<std::pair<const MachineFunction *, Register>, const Value *> Reg2GO;
+
   // Add a new OpTypeXXX instruction without checking for duplicates.
   SPIRVType *createSPIRVType(const Type *Type, MachineIRBuilder &MIRBuilder,
                              SPIRV::AccessQualifier::AccessQualifier AQ =
@@ -151,15 +154,17 @@ public:
     return DT.find(F, MF);
   }
 
-  void buildDepsGraph(std::vector<SPIRV::DTSortableEntry *> &Graph,
-                      const SPIRVInstrInfo *TII,
-                      MachineModuleInfo *MMI = nullptr) {
-    DT.buildDepsGraph(Graph, TII, MMI);
-  }
-
   void setBound(unsigned V) { Bound = V; }
   unsigned getBound() { return Bound; }
 
+  void addGlobalObject(const Value *V, const MachineFunction *MF, Register R) {
+    Reg2GO[std::make_pair(MF, R)] = V;
+  }
+  const Value *getGlobalObject(const MachineFunction *MF, Register R) {
+    auto It = Reg2GO.find(std::make_pair(MF, R));
+    return It == Reg2GO.end() ? nullptr : It->second;
+  }
+
   // Add a record to the map of function return pointer types.
   void addReturnType(const Function *ArgF, TypedPointerType *DerivedTy) {
     FunResPointerTypes[ArgF] = DerivedTy;
diff --git a/llvm/lib/Target/SPIRV/SPIRVInstrInfo.cpp b/llvm/lib/Target/SPIRV/SPIRVInstrInfo.cpp
index bd9e77e..9a140e7 100644
--- a/llvm/lib/Target/SPIRV/SPIRVInstrInfo.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVInstrInfo.cpp
@@ -47,6 +47,19 @@ bool SPIRVInstrInfo::isConstantInstr(const MachineInstr &MI) const {
   }
 }
 
+bool SPIRVInstrInfo::isSpecConstantInstr(const MachineInstr &MI) const {
+  switch (MI.getOpcode()) {
+  case SPIRV::OpSpecConstantTrue:
+  case SPIRV::OpSpecConstantFalse:
+  case SPIRV::OpSpecConstant:
+  case SPIRV::OpSpecConstantComposite:
+  case SPIRV::OpSpecConstantOp:
+    return true;
+  default:
+    return false;
+  }
+}
+
 bool SPIRVInstrInfo::isInlineAsmDefInstr(const MachineInstr &MI) const {
   switch (MI.getOpcode()) {
   case SPIRV::OpAsmTargetINTEL:
diff --git a/llvm/lib/Target/SPIRV/SPIRVInstrInfo.h b/llvm/lib/Target/SPIRV/SPIRVInstrInfo.h
index 67d2d97..4e5059b 100644
--- a/llvm/lib/Target/SPIRV/SPIRVInstrInfo.h
+++ b/llvm/lib/Target/SPIRV/SPIRVInstrInfo.h
@@ -30,6 +30,7 @@ public:
   const SPIRVRegisterInfo &getRegisterInfo() const { return RI; }
   bool isHeaderInstr(const MachineInstr &MI) const;
   bool isConstantInstr(const MachineInstr &MI) const;
+  bool isSpecConstantInstr(const MachineInstr &MI) const;
   bool isInlineAsmDefInstr(const MachineInstr &MI) const;
   bool isTypeDeclInstr(const MachineInstr &MI) const;
   bool isDecorationInstr(const MachineInstr &MI) const;
diff --git a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp
index 289d5f3..28c9b81 100644
--- a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp
@@ -1105,6 +1105,7 @@ bool SPIRVInstructionSelector::selectMemOperation(Register ResVReg,
                                             Constant::getNullValue(LLVMArrTy));
     Register VarReg = MRI->createGenericVirtualRegister(LLT::scalar(64));
     GR.add(GV, GR.CurMF, VarReg);
+    GR.addGlobalObject(GV, GR.CurMF, VarReg);
 
     Result &=
         BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(SPIRV::OpVariable))
@@ -2881,6 +2882,14 @@ bool SPIRVInstructionSelector::selectIntrinsic(Register ResVReg,
     // translated to a `LocalInvocationId` builtin variable
     return loadVec3BuiltinInputID(SPIRV::BuiltIn::LocalInvocationId, ResVReg,
                                   ResType, I);
+  case Intrinsic::spv_group_id:
+    // The HLSL SV_GroupId semantic is lowered to
+    // llvm.spv.group.id intrinsic in LLVM IR for SPIR-V backend.
+    //
+    // In SPIR-V backend, llvm.spv.group.id is now translated to a `WorkgroupId`
+    // builtin variable
+    return loadVec3BuiltinInputID(SPIRV::BuiltIn::WorkgroupId, ResVReg, ResType,
+                                  I);
   case Intrinsic::spv_fdot:
     return selectFloatDot(ResVReg, ResType, I);
   case Intrinsic::spv_udot:
@@ -2906,6 +2915,8 @@ bool SPIRVInstructionSelector::selectIntrinsic(Register ResVReg,
     return selectAny(ResVReg, ResType, I);
   case Intrinsic::spv_cross:
     return selectExtInst(ResVReg, ResType, I, CL::cross, GL::Cross);
+  case Intrinsic::spv_distance:
+    return selectExtInst(ResVReg, ResType, I, CL::distance, GL::Distance);
   case Intrinsic::spv_lerp:
     return selectExtInst(ResVReg, ResType, I, CL::mix, GL::FMix);
   case Intrinsic::spv_length:
@@ -3450,7 +3461,7 @@ bool SPIRVInstructionSelector::selectGlobalValue(
       ID = UnnamedGlobalIDs.size();
     GlobalIdent = "__unnamed_" + Twine(ID).str();
   } else {
-    GlobalIdent = GV->getGlobalIdentifier();
+    GlobalIdent = GV->getName();
   }
 
   // Behaviour of functions as operands depends on availability of the
@@ -3482,18 +3493,25 @@ bool SPIRVInstructionSelector::selectGlobalValue(
         // References to a function via function pointers generate virtual
         // registers without a definition. We will resolve it later, during
         // module analysis stage.
+        Register ResTypeReg = GR.getSPIRVTypeID(ResType);
         MachineRegisterInfo *MRI = MIRBuilder.getMRI();
-        Register FuncVReg = MRI->createGenericVirtualRegister(LLT::scalar(64));
-        MRI->setRegClass(FuncVReg, &SPIRV::iIDRegClass);
-        MachineInstrBuilder MB =
+        Register FuncVReg =
+            MRI->createGenericVirtualRegister(GR.getRegType(ResType));
+        MRI->setRegClass(FuncVReg, &SPIRV::pIDRegClass);
+        MachineInstrBuilder MIB1 =
+            BuildMI(BB, I, I.getDebugLoc(), TII.get(SPIRV::OpUndef))
+                .addDef(FuncVReg)
+                .addUse(ResTypeReg);
+        MachineInstrBuilder MIB2 =
             BuildMI(BB, I, I.getDebugLoc(),
                     TII.get(SPIRV::OpConstantFunctionPointerINTEL))
                 .addDef(NewReg)
-                .addUse(GR.getSPIRVTypeID(ResType))
+                .addUse(ResTypeReg)
                 .addUse(FuncVReg);
         // mapping the function pointer to the used Function
-        GR.recordFunctionPointer(&MB.getInstr()->getOperand(2), GVFun);
-        return MB.constrainAllUses(TII, TRI, RBI);
+        GR.recordFunctionPointer(&MIB2.getInstr()->getOperand(2), GVFun);
+        return MIB1.constrainAllUses(TII, TRI, RBI) &&
+               MIB2.constrainAllUses(TII, TRI, RBI);
       }
       return BuildMI(BB, I, I.getDebugLoc(), TII.get(SPIRV::OpConstantNull))
           .addDef(NewReg)
@@ -3506,18 +3524,16 @@ bool SPIRVInstructionSelector::selectGlobalValue(
   auto GlobalVar = cast<GlobalVariable>(GV);
   assert(GlobalVar->getName() != "llvm.global.annotations");
 
-  bool HasInit = GlobalVar->hasInitializer() &&
-                 !isa<UndefValue>(GlobalVar->getInitializer());
-  // Skip empty declaration for GVs with initilaizers till we get the decl with
+  // Skip empty declaration for GVs with initializers till we get the decl with
   // passed initializer.
-  if (HasInit && !Init)
+  if (hasInitializer(GlobalVar) && !Init)
     return true;
 
-  bool HasLnkTy = GV->getLinkage() != GlobalValue::InternalLinkage;
+  bool HasLnkTy = !GV->hasInternalLinkage() && !GV->hasPrivateLinkage();
   SPIRV::LinkageType::LinkageType LnkType =
-      (GV->isDeclaration() || GV->hasAvailableExternallyLinkage())
+      GV->isDeclarationForLinker()
           ? SPIRV::LinkageType::Import
-          : (GV->getLinkage() == GlobalValue::LinkOnceODRLinkage &&
+          : (GV->hasLinkOnceODRLinkage() &&
                      STI.canUseExtension(SPIRV::Extension::SPV_KHR_linkonce_odr)
                  ? SPIRV::LinkageType::LinkOnceODR
                  : SPIRV::LinkageType::Export);
diff --git a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp
index 6371c67..63adf54 100644
--- a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp
@@ -216,102 +216,262 @@ void SPIRVModuleAnalysis::setBaseInfo(const Module &M) {
   }
 }
 
-// Collect MI which defines the register in the given machine function.
-static void collectDefInstr(Register Reg, const MachineFunction *MF,
-                            SPIRV::ModuleAnalysisInfo *MAI,
-                            SPIRV::ModuleSectionType MSType,
-                            bool DoInsert = true) {
-  assert(MAI->hasRegisterAlias(MF, Reg) && "Cannot find register alias");
-  MachineInstr *MI = MF->getRegInfo().getUniqueVRegDef(Reg);
-  assert(MI && "There should be an instruction that defines the register");
-  MAI->setSkipEmission(MI);
-  if (DoInsert)
-    MAI->MS[MSType].push_back(MI);
+// Returns a representation of an instruction as a vector of MachineOperand
+// hash values, see llvm::hash_value(const MachineOperand &MO) for details.
+// This creates a signature of the instruction with the same content
+// that MachineOperand::isIdenticalTo uses for comparison.
+static InstrSignature instrToSignature(const MachineInstr &MI,
+                                       SPIRV::ModuleAnalysisInfo &MAI,
+                                       bool UseDefReg) {
+  InstrSignature Signature{MI.getOpcode()};
+  for (unsigned i = 0; i < MI.getNumOperands(); ++i) {
+    const MachineOperand &MO = MI.getOperand(i);
+    size_t h;
+    if (MO.isReg()) {
+      if (!UseDefReg && MO.isDef())
+        continue;
+      Register RegAlias = MAI.getRegisterAlias(MI.getMF(), MO.getReg());
+      if (!RegAlias.isValid()) {
+        LLVM_DEBUG({
+          dbgs() << "Unexpectedly, no global id found for the operand ";
+          MO.print(dbgs());
+          dbgs() << "\nInstruction: ";
+          MI.print(dbgs());
+          dbgs() << "\n";
+        });
+        report_fatal_error("All v-regs must have been mapped to global id's");
+      }
+      // mimic llvm::hash_value(const MachineOperand &MO)
+      h = hash_combine(MO.getType(), (unsigned)RegAlias, MO.getSubReg(),
+                       MO.isDef());
+    } else {
+      h = hash_value(MO);
+    }
+    Signature.push_back(h);
+  }
+  return Signature;
 }
 
-void SPIRVModuleAnalysis::collectGlobalEntities(
-    const std::vector<SPIRV::DTSortableEntry *> &DepsGraph,
-    SPIRV::ModuleSectionType MSType,
-    std::function<bool(const SPIRV::DTSortableEntry *)> Pred,
-    bool UsePreOrder = false) {
-  DenseSet<const SPIRV::DTSortableEntry *> Visited;
-  for (const auto *E : DepsGraph) {
-    std::function<void(const SPIRV::DTSortableEntry *)> RecHoistUtil;
-    // NOTE: here we prefer recursive approach over iterative because
-    // we don't expect depchains long enough to cause SO.
-    RecHoistUtil = [MSType, UsePreOrder, &Visited, &Pred,
-                    &RecHoistUtil](const SPIRV::DTSortableEntry *E) {
-      if (Visited.count(E) || !Pred(E))
-        return;
-      Visited.insert(E);
-
-      // Traversing deps graph in post-order allows us to get rid of
-      // register aliases preprocessing.
-      // But pre-order is required for correct processing of function
-      // declaration and arguments processing.
-      if (!UsePreOrder)
-        for (auto *S : E->getDeps())
-          RecHoistUtil(S);
-
-      Register GlobalReg = Register::index2VirtReg(MAI.getNextID());
-      bool IsFirst = true;
-      for (auto &U : *E) {
-        const MachineFunction *MF = U.first;
-        Register Reg = U.second;
-        MAI.setRegisterAlias(MF, Reg, GlobalReg);
-        if (!MF->getRegInfo().getUniqueVRegDef(Reg))
-          continue;
-        collectDefInstr(Reg, MF, &MAI, MSType, IsFirst);
-        IsFirst = false;
-        if (E->getIsGV())
-          MAI.GlobalVarList.push_back(MF->getRegInfo().getUniqueVRegDef(Reg));
-      }
+bool SPIRVModuleAnalysis::isDeclSection(const MachineRegisterInfo &MRI,
+                                        const MachineInstr &MI) {
+  unsigned Opcode = MI.getOpcode();
+  switch (Opcode) {
+  case SPIRV::OpTypeForwardPointer:
+    // omit now, collect later
+    return false;
+  case SPIRV::OpVariable:
+    return static_cast<SPIRV::StorageClass::StorageClass>(
+               MI.getOperand(2).getImm()) != SPIRV::StorageClass::Function;
+  case SPIRV::OpFunction:
+  case SPIRV::OpFunctionParameter:
+    return true;
+  }
+  if (GR->hasConstFunPtr() && Opcode == SPIRV::OpUndef) {
+    Register DefReg = MI.getOperand(0).getReg();
+    for (MachineInstr &UseMI : MRI.use_instructions(DefReg)) {
+      if (UseMI.getOpcode() != SPIRV::OpConstantFunctionPointerINTEL)
+        continue;
+      // it's a dummy definition, FP constant refers to a function,
+      // and this is resolved in another way; let's skip this definition
+      assert(UseMI.getOperand(2).isReg() &&
+             UseMI.getOperand(2).getReg() == DefReg);
+      MAI.setSkipEmission(&MI);
+      return false;
+    }
+  }
+  return TII->isTypeDeclInstr(MI) || TII->isConstantInstr(MI) ||
+         TII->isInlineAsmDefInstr(MI);
+}
 
-      if (UsePreOrder)
-        for (auto *S : E->getDeps())
-          RecHoistUtil(S);
-    };
-    RecHoistUtil(E);
+// This is a special case of a function pointer refering to a possibly
+// forward function declaration. The operand is a dummy OpUndef that
+// requires a special treatment.
+void SPIRVModuleAnalysis::visitFunPtrUse(
+    Register OpReg, InstrGRegsMap &SignatureToGReg,
+    std::map<const Value *, unsigned> &GlobalToGReg, const MachineFunction *MF,
+    const MachineInstr &MI) {
+  const MachineOperand *OpFunDef =
+      GR->getFunctionDefinitionByUse(&MI.getOperand(2));
+  assert(OpFunDef && OpFunDef->isReg());
+  // find the actual function definition and number it globally in advance
+  const MachineInstr *OpDefMI = OpFunDef->getParent();
+  assert(OpDefMI && OpDefMI->getOpcode() == SPIRV::OpFunction);
+  const MachineFunction *FunDefMF = OpDefMI->getParent()->getParent();
+  const MachineRegisterInfo &FunDefMRI = FunDefMF->getRegInfo();
+  do {
+    visitDecl(FunDefMRI, SignatureToGReg, GlobalToGReg, FunDefMF, *OpDefMI);
+    OpDefMI = OpDefMI->getNextNode();
+  } while (OpDefMI && (OpDefMI->getOpcode() == SPIRV::OpFunction ||
+                       OpDefMI->getOpcode() == SPIRV::OpFunctionParameter));
+  // associate the function pointer with the newly assigned global number
+  Register GlobalFunDefReg = MAI.getRegisterAlias(FunDefMF, OpFunDef->getReg());
+  assert(GlobalFunDefReg.isValid() &&
+         "Function definition must refer to a global register");
+  MAI.setRegisterAlias(MF, OpReg, GlobalFunDefReg);
+}
+
+// Depth first recursive traversal of dependencies. Repeated visits are guarded
+// by MAI.hasRegisterAlias().
+void SPIRVModuleAnalysis::visitDecl(
+    const MachineRegisterInfo &MRI, InstrGRegsMap &SignatureToGReg,
+    std::map<const Value *, unsigned> &GlobalToGReg, const MachineFunction *MF,
+    const MachineInstr &MI) {
+  unsigned Opcode = MI.getOpcode();
+  DenseSet<Register> Deps;
+
+  // Process each operand of the instruction to resolve dependencies
+  for (const MachineOperand &MO : MI.operands()) {
+    if (!MO.isReg() || MO.isDef())
+      continue;
+    Register OpReg = MO.getReg();
+    // Handle function pointers special case
+    if (Opcode == SPIRV::OpConstantFunctionPointerINTEL &&
+        MRI.getRegClass(OpReg) == &SPIRV::pIDRegClass) {
+      visitFunPtrUse(OpReg, SignatureToGReg, GlobalToGReg, MF, MI);
+      continue;
+    }
+    // Skip already processed instructions
+    if (MAI.hasRegisterAlias(MF, MO.getReg()))
+      continue;
+    // Recursively visit dependencies
+    if (const MachineInstr *OpDefMI = MRI.getUniqueVRegDef(OpReg)) {
+      if (isDeclSection(MRI, *OpDefMI))
+        visitDecl(MRI, SignatureToGReg, GlobalToGReg, MF, *OpDefMI);
+      continue;
+    }
+    // Handle the unexpected case of no unique definition for the SPIR-V
+    // instruction
+    LLVM_DEBUG({
+      dbgs() << "Unexpectedly, no unique definition for the operand ";
+      MO.print(dbgs());
+      dbgs() << "\nInstruction: ";
+      MI.print(dbgs());
+      dbgs() << "\n";
+    });
+    report_fatal_error(
+        "No unique definition is found for the virtual register");
   }
+
+  Register GReg;
+  bool IsFunDef = false;
+  if (TII->isSpecConstantInstr(MI)) {
+    GReg = Register::index2VirtReg(MAI.getNextID());
+    MAI.MS[SPIRV::MB_TypeConstVars].push_back(&MI);
+  } else if (Opcode == SPIRV::OpFunction ||
+             Opcode == SPIRV::OpFunctionParameter) {
+    GReg = handleFunctionOrParameter(MF, MI, GlobalToGReg, IsFunDef);
+  } else if (TII->isTypeDeclInstr(MI) || TII->isConstantInstr(MI) ||
+             TII->isInlineAsmDefInstr(MI)) {
+    GReg = handleTypeDeclOrConstant(MI, SignatureToGReg);
+  } else if (Opcode == SPIRV::OpVariable) {
+    GReg = handleVariable(MF, MI, GlobalToGReg);
+  } else {
+    LLVM_DEBUG({
+      dbgs() << "\nInstruction: ";
+      MI.print(dbgs());
+      dbgs() << "\n";
+    });
+    llvm_unreachable("Unexpected instruction is visited");
+  }
+  MAI.setRegisterAlias(MF, MI.getOperand(0).getReg(), GReg);
+  if (!IsFunDef)
+    MAI.setSkipEmission(&MI);
 }
 
-// The function initializes global register alias table for types, consts,
-// global vars and func decls and collects these instruction for output
-// at module level. Also it collects explicit OpExtension/OpCapability
-// instructions.
-void SPIRVModuleAnalysis::processDefInstrs(const Module &M) {
-  std::vector<SPIRV::DTSortableEntry *> DepsGraph;
+Register SPIRVModuleAnalysis::handleFunctionOrParameter(
+    const MachineFunction *MF, const MachineInstr &MI,
+    std::map<const Value *, unsigned> &GlobalToGReg, bool &IsFunDef) {
+  const Value *GObj = GR->getGlobalObject(MF, MI.getOperand(0).getReg());
+  assert(GObj && "Unregistered global definition");
+  const Function *F = dyn_cast<Function>(GObj);
+  if (!F)
+    F = dyn_cast<Argument>(GObj)->getParent();
+  assert(F && "Expected a reference to a function or an argument");
+  IsFunDef = !F->isDeclaration();
+  auto It = GlobalToGReg.find(GObj);
+  if (It != GlobalToGReg.end())
+    return It->second;
+  Register GReg = Register::index2VirtReg(MAI.getNextID());
+  GlobalToGReg[GObj] = GReg;
+  if (!IsFunDef)
+    MAI.MS[SPIRV::MB_ExtFuncDecls].push_back(&MI);
+  return GReg;
+}
 
-  GR->buildDepsGraph(DepsGraph, TII, SPVDumpDeps ? MMI : nullptr);
+Register
+SPIRVModuleAnalysis::handleTypeDeclOrConstant(const MachineInstr &MI,
+                                              InstrGRegsMap &SignatureToGReg) {
+  InstrSignature MISign = instrToSignature(MI, MAI, false);
+  auto It = SignatureToGReg.find(MISign);
+  if (It != SignatureToGReg.end())
+    return It->second;
+  Register GReg = Register::index2VirtReg(MAI.getNextID());
+  SignatureToGReg[MISign] = GReg;
+  MAI.MS[SPIRV::MB_TypeConstVars].push_back(&MI);
+  return GReg;
+}
 
-  collectGlobalEntities(
-      DepsGraph, SPIRV::MB_TypeConstVars,
-      [](const SPIRV::DTSortableEntry *E) { return !E->getIsFunc(); });
+Register SPIRVModuleAnalysis::handleVariable(
+    const MachineFunction *MF, const MachineInstr &MI,
+    std::map<const Value *, unsigned> &GlobalToGReg) {
+  MAI.GlobalVarList.push_back(&MI);
+  const Value *GObj = GR->getGlobalObject(MF, MI.getOperand(0).getReg());
+  assert(GObj && "Unregistered global definition");
+  auto It = GlobalToGReg.find(GObj);
+  if (It != GlobalToGReg.end())
+    return It->second;
+  Register GReg = Register::index2VirtReg(MAI.getNextID());
+  GlobalToGReg[GObj] = GReg;
+  MAI.MS[SPIRV::MB_TypeConstVars].push_back(&MI);
+  return GReg;
+}
 
+void SPIRVModuleAnalysis::collectDeclarations(const Module &M) {
+  InstrGRegsMap SignatureToGReg;
+  std::map<const Value *, unsigned> GlobalToGReg;
   for (auto F = M.begin(), E = M.end(); F != E; ++F) {
     MachineFunction *MF = MMI->getMachineFunction(*F);
     if (!MF)
       continue;
-    // Iterate through and collect OpExtension/OpCapability instructions.
+    const MachineRegisterInfo &MRI = MF->getRegInfo();
+    unsigned PastHeader = 0;
     for (MachineBasicBlock &MBB : *MF) {
       for (MachineInstr &MI : MBB) {
-        if (MI.getOpcode() == SPIRV::OpExtension) {
-          // Here, OpExtension just has a single enum operand, not a string.
-          auto Ext = SPIRV::Extension::Extension(MI.getOperand(0).getImm());
-          MAI.Reqs.addExtension(Ext);
+        if (MI.getNumOperands() == 0)
+          continue;
+        unsigned Opcode = MI.getOpcode();
+        if (Opcode == SPIRV::OpFunction) {
+          if (PastHeader == 0) {
+            PastHeader = 1;
+            continue;
+          }
+        } else if (Opcode == SPIRV::OpFunctionParameter) {
+          if (PastHeader < 2)
+            continue;
+        } else if (PastHeader > 0) {
+          PastHeader = 2;
+        }
+
+        const MachineOperand &DefMO = MI.getOperand(0);
+        switch (Opcode) {
+        case SPIRV::OpExtension:
+          MAI.Reqs.addExtension(SPIRV::Extension::Extension(DefMO.getImm()));
           MAI.setSkipEmission(&MI);
-        } else if (MI.getOpcode() == SPIRV::OpCapability) {
-          auto Cap = SPIRV::Capability::Capability(MI.getOperand(0).getImm());
-          MAI.Reqs.addCapability(Cap);
+          break;
+        case SPIRV::OpCapability:
+          MAI.Reqs.addCapability(SPIRV::Capability::Capability(DefMO.getImm()));
           MAI.setSkipEmission(&MI);
+          if (PastHeader > 0)
+            PastHeader = 2;
+          break;
+        default:
+          if (DefMO.isReg() && isDeclSection(MRI, MI) &&
+              !MAI.hasRegisterAlias(MF, DefMO.getReg()))
+            visitDecl(MRI, SignatureToGReg, GlobalToGReg, MF, MI);
         }
       }
     }
   }
-
-  collectGlobalEntities(
-      DepsGraph, SPIRV::MB_ExtFuncDecls,
-      [](const SPIRV::DTSortableEntry *E) { return E->getIsFunc(); }, true);
 }
 
 // Look for IDs declared with Import linkage, and map the corresponding function
@@ -342,58 +502,6 @@ void SPIRVModuleAnalysis::collectFuncNames(MachineInstr &MI,
   }
 }
 
-// References to a function via function pointers generate virtual
-// registers without a definition. We are able to resolve this
-// reference using Globar Register info into an OpFunction instruction
-// and replace dummy operands by the corresponding global register references.
-void SPIRVModuleAnalysis::collectFuncPtrs() {
-  for (auto &MI : MAI.MS[SPIRV::MB_TypeConstVars])
-    if (MI->getOpcode() == SPIRV::OpConstantFunctionPointerINTEL)
-      collectFuncPtrs(MI);
-}
-
-void SPIRVModuleAnalysis::collectFuncPtrs(MachineInstr *MI) {
-  const MachineOperand *FunUse = &MI->getOperand(2);
-  if (const MachineOperand *FunDef = GR->getFunctionDefinitionByUse(FunUse)) {
-    const MachineInstr *FunDefMI = FunDef->getParent();
-    assert(FunDefMI->getOpcode() == SPIRV::OpFunction &&
-           "Constant function pointer must refer to function definition");
-    Register FunDefReg = FunDef->getReg();
-    Register GlobalFunDefReg =
-        MAI.getRegisterAlias(FunDefMI->getMF(), FunDefReg);
-    assert(GlobalFunDefReg.isValid() &&
-           "Function definition must refer to a global register");
-    Register FunPtrReg = FunUse->getReg();
-    MAI.setRegisterAlias(MI->getMF(), FunPtrReg, GlobalFunDefReg);
-  }
-}
-
-using InstrSignature = SmallVector<size_t>;
-using InstrTraces = std::set<InstrSignature>;
-
-// Returns a representation of an instruction as a vector of MachineOperand
-// hash values, see llvm::hash_value(const MachineOperand &MO) for details.
-// This creates a signature of the instruction with the same content
-// that MachineOperand::isIdenticalTo uses for comparison.
-static InstrSignature instrToSignature(MachineInstr &MI,
-                                       SPIRV::ModuleAnalysisInfo &MAI) {
-  InstrSignature Signature;
-  for (unsigned i = 0; i < MI.getNumOperands(); ++i) {
-    const MachineOperand &MO = MI.getOperand(i);
-    size_t h;
-    if (MO.isReg()) {
-      Register RegAlias = MAI.getRegisterAlias(MI.getMF(), MO.getReg());
-      // mimic llvm::hash_value(const MachineOperand &MO)
-      h = hash_combine(MO.getType(), (unsigned)RegAlias, MO.getSubReg(),
-                       MO.isDef());
-    } else {
-      h = hash_value(MO);
-    }
-    Signature.push_back(h);
-  }
-  return Signature;
-}
-
 // Collect the given instruction in the specified MS. We assume global register
 // numbering has already occurred by this point. We can directly compare reg
 // arguments when detecting duplicates.
@@ -401,7 +509,7 @@ static void collectOtherInstr(MachineInstr &MI, SPIRV::ModuleAnalysisInfo &MAI,
                               SPIRV::ModuleSectionType MSType, InstrTraces &IS,
                               bool Append = true) {
   MAI.setSkipEmission(&MI);
-  InstrSignature MISign = instrToSignature(MI, MAI);
+  InstrSignature MISign = instrToSignature(MI, MAI, true);
   auto FoundMI = IS.insert(MISign);
   if (!FoundMI.second)
     return; // insert failed, so we found a duplicate; don't add it to MAI.MS
@@ -465,7 +573,7 @@ void SPIRVModuleAnalysis::processOtherInstrs(const Module &M) {
 
 // Number registers in all functions globally from 0 onwards and store
 // the result in global register alias table. Some registers are already
-// numbered in collectGlobalEntities.
+// numbered.
 void SPIRVModuleAnalysis::numberRegistersGlobally(const Module &M) {
   for (auto F = M.begin(), E = M.end(); F != E; ++F) {
     if ((*F).isDeclaration())
@@ -1835,15 +1943,11 @@ bool SPIRVModuleAnalysis::runOnModule(Module &M) {
 
   // Process type/const/global var/func decl instructions, number their
   // destination registers from 0 to N, collect Extensions and Capabilities.
-  processDefInstrs(M);
+  collectDeclarations(M);
 
   // Number rest of registers from N+1 onwards.
   numberRegistersGlobally(M);
 
-  // Update references to OpFunction instructions to use Global Registers
-  if (GR->hasConstFunPtr())
-    collectFuncPtrs();
-
   // Collect OpName, OpEntryPoint, OpDecorate etc, process other instructions.
   processOtherInstrs(M);
 
diff --git a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.h b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.h
index ee2aaf1..79b5444 100644
--- a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.h
+++ b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.h
@@ -124,7 +124,7 @@ public:
                           const Capability::Capability IfPresent);
 };
 
-using InstrList = SmallVector<MachineInstr *>;
+using InstrList = SmallVector<const MachineInstr *>;
 // Maps a local register to the corresponding global alias.
 using LocalToGlobalRegTable = std::map<Register, Register>;
 using RegisterAliasMapTy =
@@ -142,12 +142,12 @@ struct ModuleAnalysisInfo {
   // Maps ExtInstSet to corresponding ID register.
   DenseMap<unsigned, Register> ExtInstSetMap;
   // Contains the list of all global OpVariables in the module.
-  SmallVector<MachineInstr *, 4> GlobalVarList;
+  SmallVector<const MachineInstr *, 4> GlobalVarList;
   // Maps functions to corresponding function ID registers.
   DenseMap<const Function *, Register> FuncMap;
   // The set contains machine instructions which are necessary
   // for correct MIR but will not be emitted in function bodies.
-  DenseSet<MachineInstr *> InstrsToDelete;
+  DenseSet<const MachineInstr *> InstrsToDelete;
   // The table contains global aliases of local registers for each machine
   // function. The aliases are used to substitute local registers during
   // code emission.
@@ -167,7 +167,7 @@ struct ModuleAnalysisInfo {
   }
   Register getExtInstSetReg(unsigned SetNum) { return ExtInstSetMap[SetNum]; }
   InstrList &getMSInstrs(unsigned MSType) { return MS[MSType]; }
-  void setSkipEmission(MachineInstr *MI) { InstrsToDelete.insert(MI); }
+  void setSkipEmission(const MachineInstr *MI) { InstrsToDelete.insert(MI); }
   bool getSkipEmission(const MachineInstr *MI) {
     return InstrsToDelete.contains(MI);
   }
@@ -204,6 +204,10 @@ struct ModuleAnalysisInfo {
 };
 } // namespace SPIRV
 
+using InstrSignature = SmallVector<size_t>;
+using InstrTraces = std::set<InstrSignature>;
+using InstrGRegsMap = std::map<SmallVector<size_t>, unsigned>;
+
 struct SPIRVModuleAnalysis : public ModulePass {
   static char ID;
 
@@ -216,17 +220,27 @@ public:
 
 private:
   void setBaseInfo(const Module &M);
-  void collectGlobalEntities(
-      const std::vector<SPIRV::DTSortableEntry *> &DepsGraph,
-      SPIRV::ModuleSectionType MSType,
-      std::function<bool(const SPIRV::DTSortableEntry *)> Pred,
-      bool UsePreOrder);
-  void processDefInstrs(const Module &M);
   void collectFuncNames(MachineInstr &MI, const Function *F);
   void processOtherInstrs(const Module &M);
   void numberRegistersGlobally(const Module &M);
-  void collectFuncPtrs();
-  void collectFuncPtrs(MachineInstr *MI);
+
+  // analyze dependencies to collect module scope definitions
+  void collectDeclarations(const Module &M);
+  void visitDecl(const MachineRegisterInfo &MRI, InstrGRegsMap &SignatureToGReg,
+                 std::map<const Value *, unsigned> &GlobalToGReg,
+                 const MachineFunction *MF, const MachineInstr &MI);
+  Register handleVariable(const MachineFunction *MF, const MachineInstr &MI,
+                          std::map<const Value *, unsigned> &GlobalToGReg);
+  Register handleTypeDeclOrConstant(const MachineInstr &MI,
+                                    InstrGRegsMap &SignatureToGReg);
+  Register
+  handleFunctionOrParameter(const MachineFunction *MF, const MachineInstr &MI,
+                            std::map<const Value *, unsigned> &GlobalToGReg,
+                            bool &IsFunDef);
+  void visitFunPtrUse(Register OpReg, InstrGRegsMap &SignatureToGReg,
+                      std::map<const Value *, unsigned> &GlobalToGReg,
+                      const MachineFunction *MF, const MachineInstr &MI);
+  bool isDeclSection(const MachineRegisterInfo &MRI, const MachineInstr &MI);
 
   const SPIRVSubtarget *ST;
   SPIRVGlobalRegistry *GR;
diff --git a/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp b/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp
index 8357c30..5b4c849 100644
--- a/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp
@@ -58,9 +58,10 @@ addConstantsToTrack(MachineFunction &MF, SPIRVGlobalRegistry *GR,
                              ->getValue());
       if (auto *GV = dyn_cast<GlobalValue>(Const)) {
         Register Reg = GR->find(GV, &MF);
-        if (!Reg.isValid())
+        if (!Reg.isValid()) {
           GR->add(GV, &MF, SrcReg);
-        else
+          GR->addGlobalObject(GV, &MF, SrcReg);
+        } else
           RegsAlreadyAddedToDT[&MI] = Reg;
       } else {
         Register Reg = GR->find(Const, &MF);
diff --git a/llvm/lib/Target/SPIRV/SPIRVUtils.h b/llvm/lib/Target/SPIRV/SPIRVUtils.h
index da2e24c..60649ea 100644
--- a/llvm/lib/Target/SPIRV/SPIRVUtils.h
+++ b/llvm/lib/Target/SPIRV/SPIRVUtils.h
@@ -17,6 +17,7 @@
 #include "llvm/Analysis/LoopInfo.h"
 #include "llvm/CodeGen/MachineBasicBlock.h"
 #include "llvm/IR/Dominators.h"
+#include "llvm/IR/GlobalVariable.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/TypedPointerType.h"
 #include <queue>
@@ -236,6 +237,10 @@ Type *parseBasicTypeName(StringRef &TypeName, LLVMContext &Ctx);
 // Returns true if the function was changed.
 bool sortBlocks(Function &F);
 
+inline bool hasInitializer(const GlobalVariable *GV) {
+  return GV->hasInitializer() && !isa<UndefValue>(GV->getInitializer());
+}
+
 // True if this is an instance of TypedPointerType.
 inline bool isTypedPointerTy(const Type *T) {
   return T && T->getTypeID() == Type::TypedPointerTyID;
diff --git a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmInfo.cpp b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmInfo.cpp
index 06d7d42..16e7d05 100644
--- a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmInfo.cpp
+++ b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmInfo.cpp
@@ -29,19 +29,14 @@ SystemZMCAsmInfoGOFF::SystemZMCAsmInfoGOFF(const Triple &TT) {
   AllowAtInName = true;
   AllowAtAtStartOfIdentifier = true;
   AllowDollarAtStartOfIdentifier = true;
-  AllowHashAtStartOfIdentifier = true;
   AssemblerDialect = AD_HLASM;
   CalleeSaveStackSlotSize = 8;
   CodePointerSize = 8;
   CommentString = "*";
-  DotIsPC = false;
-  EmitGNUAsmStartIndentationMarker = false;
-  EmitLabelsInUpperCase = true;
   ExceptionsType = ExceptionHandling::ZOS;
+  IsHLASM = true;
   IsLittleEndian = false;
   MaxInstLength = 6;
-  RestrictCommentStringToStartOfStatement = true;
-  StarIsPC = true;
   SupportsDebugInformation = true;
 }
 
diff --git a/llvm/lib/Target/TargetMachine.cpp b/llvm/lib/Target/TargetMachine.cpp
index c0985f3..d5365f3 100644
--- a/llvm/lib/Target/TargetMachine.cpp
+++ b/llvm/lib/Target/TargetMachine.cpp
@@ -204,7 +204,7 @@ bool TargetMachine::shouldAssumeDSOLocal(const GlobalValue *GV) const {
     // don't assume the variables to be DSO local unless we actually know
     // that for sure. This only has to be done for variables; for functions
     // the linker can insert thunks for calling functions from another DLL.
-    if (TT.isWindowsGNUEnvironment() && GV->isDeclarationForLinker() &&
+    if (TT.isOSCygMing() && GV->isDeclarationForLinker() &&
         isa<GlobalVariable>(GV))
       return false;
 
diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86ATTInstPrinter.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86ATTInstPrinter.cpp
index b67c573..abe0cc6 100644
--- a/llvm/lib/Target/X86/MCTargetDesc/X86ATTInstPrinter.cpp
+++ b/llvm/lib/Target/X86/MCTargetDesc/X86ATTInstPrinter.cpp
@@ -140,8 +140,8 @@ bool X86ATTInstPrinter::printVecCompareInstr(const MCInst *MI,
   case X86::VCMPPSZ128rmik:  case X86::VCMPPSZ128rrik:
   case X86::VCMPPSZ256rmik:  case X86::VCMPPSZ256rrik:
   case X86::VCMPPSZrmik:     case X86::VCMPPSZrrik:
-  case X86::VCMPSDZrmi_Intk: case X86::VCMPSDZrri_Intk:
-  case X86::VCMPSSZrmi_Intk: case X86::VCMPSSZrri_Intk:
+  case X86::VCMPSDZrmik_Int: case X86::VCMPSDZrrik_Int:
+  case X86::VCMPSSZrmik_Int: case X86::VCMPSSZrrik_Int:
   case X86::VCMPPDZ128rmbi:  case X86::VCMPPDZ128rmbik:
   case X86::VCMPPDZ256rmbi:  case X86::VCMPPDZ256rmbik:
   case X86::VCMPPDZrmbi:     case X86::VCMPPDZrmbik:
@@ -150,8 +150,8 @@ bool X86ATTInstPrinter::printVecCompareInstr(const MCInst *MI,
   case X86::VCMPPSZrmbi:     case X86::VCMPPSZrmbik:
   case X86::VCMPPDZrrib:     case X86::VCMPPDZrribk:
   case X86::VCMPPSZrrib:     case X86::VCMPPSZrribk:
-  case X86::VCMPSDZrrib_Int: case X86::VCMPSDZrrib_Intk:
-  case X86::VCMPSSZrrib_Int: case X86::VCMPSSZrrib_Intk:
+  case X86::VCMPSDZrrib_Int: case X86::VCMPSDZrribk_Int:
+  case X86::VCMPSSZrrib_Int: case X86::VCMPSSZrribk_Int:
   case X86::VCMPPHZ128rmi:   case X86::VCMPPHZ128rri:
   case X86::VCMPPHZ256rmi:   case X86::VCMPPHZ256rri:
   case X86::VCMPPHZrmi:      case X86::VCMPPHZrri:
@@ -160,12 +160,12 @@ bool X86ATTInstPrinter::printVecCompareInstr(const MCInst *MI,
   case X86::VCMPPHZ128rmik:  case X86::VCMPPHZ128rrik:
   case X86::VCMPPHZ256rmik:  case X86::VCMPPHZ256rrik:
   case X86::VCMPPHZrmik:     case X86::VCMPPHZrrik:
-  case X86::VCMPSHZrmi_Intk: case X86::VCMPSHZrri_Intk:
+  case X86::VCMPSHZrmik_Int: case X86::VCMPSHZrrik_Int:
   case X86::VCMPPHZ128rmbi:  case X86::VCMPPHZ128rmbik:
   case X86::VCMPPHZ256rmbi:  case X86::VCMPPHZ256rmbik:
   case X86::VCMPPHZrmbi:     case X86::VCMPPHZrmbik:
   case X86::VCMPPHZrrib:     case X86::VCMPPHZrribk:
-  case X86::VCMPSHZrrib_Int: case X86::VCMPSHZrrib_Intk:
+  case X86::VCMPSHZrrib_Int: case X86::VCMPSHZrribk_Int:
   case X86::VCMPPBF16Z128rmi:  case X86::VCMPPBF16Z128rri:
   case X86::VCMPPBF16Z256rmi:  case X86::VCMPPBF16Z256rri:
   case X86::VCMPPBF16Zrmi:     case X86::VCMPPBF16Zrri:
diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86InstComments.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86InstComments.cpp
index 9f8bc57..681d0da 100644
--- a/llvm/lib/Target/X86/MCTargetDesc/X86InstComments.cpp
+++ b/llvm/lib/Target/X86/MCTargetDesc/X86InstComments.cpp
@@ -40,6 +40,17 @@ using namespace llvm;
   CASE_MASK_INS_COMMON(Inst, Suffix, src)         \
   CASE_MASKZ_INS_COMMON(Inst, Suffix, src)
 
+#define CASE_MASK_INS_COMMON_INT(Inst, Suffix, src) \
+  case X86::V##Inst##Suffix##src##k_Int:
+
+#define CASE_MASKZ_INS_COMMON_INT(Inst, Suffix, src) \
+  case X86::V##Inst##Suffix##src##kz_Int:
+
+#define CASE_AVX512_INS_COMMON_INT(Inst, Suffix, src) \
+  CASE_AVX_INS_COMMON(Inst, Suffix, src##_Int)        \
+  CASE_MASK_INS_COMMON_INT(Inst, Suffix, src)         \
+  CASE_MASKZ_INS_COMMON_INT(Inst, Suffix, src)
+
 #define CASE_FPCLASS_PACKED(Inst, src)    \
   CASE_AVX_INS_COMMON(Inst, Z, src##i)    \
   CASE_AVX_INS_COMMON(Inst, Z256, src##i) \
@@ -196,8 +207,8 @@ using namespace llvm;
   CASE_AVX_INS_COMMON(Inst##SS, , r_Int)          \
   CASE_AVX_INS_COMMON(Inst##SD, Z, r)             \
   CASE_AVX_INS_COMMON(Inst##SS, Z, r)             \
-  CASE_AVX512_INS_COMMON(Inst##SD, Z, r_Int)      \
-  CASE_AVX512_INS_COMMON(Inst##SS, Z, r_Int)
+  CASE_AVX512_INS_COMMON_INT(Inst##SD, Z, r)      \
+  CASE_AVX512_INS_COMMON_INT(Inst##SS, Z, r)
 
 #define CASE_FMA_SCALAR_MEM(Inst)                 \
   CASE_AVX_INS_COMMON(Inst##SD, , m)              \
@@ -206,8 +217,8 @@ using namespace llvm;
   CASE_AVX_INS_COMMON(Inst##SS, , m_Int)          \
   CASE_AVX_INS_COMMON(Inst##SD, Z, m)             \
   CASE_AVX_INS_COMMON(Inst##SS, Z, m)             \
-  CASE_AVX512_INS_COMMON(Inst##SD, Z, m_Int)      \
-  CASE_AVX512_INS_COMMON(Inst##SS, Z, m_Int)
+  CASE_AVX512_INS_COMMON_INT(Inst##SD, Z, m)      \
+  CASE_AVX512_INS_COMMON_INT(Inst##SS, Z, m)
 
 #define CASE_FMA4(Inst, suf)                      \
   CASE_AVX_INS_COMMON(Inst, 4, suf)               \
diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.cpp
index fafcc73..01e2d4ac 100644
--- a/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.cpp
+++ b/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.cpp
@@ -277,8 +277,8 @@ void X86InstPrinterCommon::printCMPMnemonic(const MCInst *MI, bool IsVCmp,
   case X86::VCMPSDrmi_Int:   case X86::VCMPSDrri_Int:
   case X86::VCMPSDZrmi:      case X86::VCMPSDZrri:
   case X86::VCMPSDZrmi_Int:  case X86::VCMPSDZrri_Int:
-  case X86::VCMPSDZrmi_Intk: case X86::VCMPSDZrri_Intk:
-  case X86::VCMPSDZrrib_Int: case X86::VCMPSDZrrib_Intk:
+  case X86::VCMPSDZrmik_Int: case X86::VCMPSDZrrik_Int:
+  case X86::VCMPSDZrrib_Int: case X86::VCMPSDZrribk_Int:
     OS << "sd\t";
     break;
   case X86::CMPSSrmi:        case X86::CMPSSrri:
@@ -287,8 +287,8 @@ void X86InstPrinterCommon::printCMPMnemonic(const MCInst *MI, bool IsVCmp,
   case X86::VCMPSSrmi_Int:   case X86::VCMPSSrri_Int:
   case X86::VCMPSSZrmi:      case X86::VCMPSSZrri:
   case X86::VCMPSSZrmi_Int:  case X86::VCMPSSZrri_Int:
-  case X86::VCMPSSZrmi_Intk: case X86::VCMPSSZrri_Intk:
-  case X86::VCMPSSZrrib_Int: case X86::VCMPSSZrrib_Intk:
+  case X86::VCMPSSZrmik_Int: case X86::VCMPSSZrrik_Int:
+  case X86::VCMPSSZrrib_Int: case X86::VCMPSSZrribk_Int:
     OS << "ss\t";
     break;
   case X86::VCMPPHZ128rmi:  case X86::VCMPPHZ128rri:
@@ -305,8 +305,8 @@ void X86InstPrinterCommon::printCMPMnemonic(const MCInst *MI, bool IsVCmp,
     break;
   case X86::VCMPSHZrmi:      case X86::VCMPSHZrri:
   case X86::VCMPSHZrmi_Int:  case X86::VCMPSHZrri_Int:
-  case X86::VCMPSHZrrib_Int: case X86::VCMPSHZrrib_Intk:
-  case X86::VCMPSHZrmi_Intk: case X86::VCMPSHZrri_Intk:
+  case X86::VCMPSHZrrib_Int: case X86::VCMPSHZrribk_Int:
+  case X86::VCMPSHZrmik_Int: case X86::VCMPSHZrrik_Int:
     OS << "sh\t";
     break;
   case X86::VCMPPBF16Z128rmi:  case X86::VCMPPBF16Z128rri:
diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86IntelInstPrinter.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86IntelInstPrinter.cpp
index 6800926..c26dc2c 100644
--- a/llvm/lib/Target/X86/MCTargetDesc/X86IntelInstPrinter.cpp
+++ b/llvm/lib/Target/X86/MCTargetDesc/X86IntelInstPrinter.cpp
@@ -119,8 +119,8 @@ bool X86IntelInstPrinter::printVecCompareInstr(const MCInst *MI, raw_ostream &OS
   case X86::VCMPPSZ128rmik:  case X86::VCMPPSZ128rrik:
   case X86::VCMPPSZ256rmik:  case X86::VCMPPSZ256rrik:
   case X86::VCMPPSZrmik:     case X86::VCMPPSZrrik:
-  case X86::VCMPSDZrmi_Intk: case X86::VCMPSDZrri_Intk:
-  case X86::VCMPSSZrmi_Intk: case X86::VCMPSSZrri_Intk:
+  case X86::VCMPSDZrmik_Int: case X86::VCMPSDZrrik_Int:
+  case X86::VCMPSSZrmik_Int: case X86::VCMPSSZrrik_Int:
   case X86::VCMPPDZ128rmbi:  case X86::VCMPPDZ128rmbik:
   case X86::VCMPPDZ256rmbi:  case X86::VCMPPDZ256rmbik:
   case X86::VCMPPDZrmbi:     case X86::VCMPPDZrmbik:
@@ -129,8 +129,8 @@ bool X86IntelInstPrinter::printVecCompareInstr(const MCInst *MI, raw_ostream &OS
   case X86::VCMPPSZrmbi:     case X86::VCMPPSZrmbik:
   case X86::VCMPPDZrrib:     case X86::VCMPPDZrribk:
   case X86::VCMPPSZrrib:     case X86::VCMPPSZrribk:
-  case X86::VCMPSDZrrib_Int: case X86::VCMPSDZrrib_Intk:
-  case X86::VCMPSSZrrib_Int: case X86::VCMPSSZrrib_Intk:
+  case X86::VCMPSDZrrib_Int: case X86::VCMPSDZrribk_Int:
+  case X86::VCMPSSZrrib_Int: case X86::VCMPSSZrribk_Int:
   case X86::VCMPPHZ128rmi:   case X86::VCMPPHZ128rri:
   case X86::VCMPPHZ256rmi:   case X86::VCMPPHZ256rri:
   case X86::VCMPPHZrmi:      case X86::VCMPPHZrri:
@@ -139,12 +139,12 @@ bool X86IntelInstPrinter::printVecCompareInstr(const MCInst *MI, raw_ostream &OS
   case X86::VCMPPHZ128rmik:  case X86::VCMPPHZ128rrik:
   case X86::VCMPPHZ256rmik:  case X86::VCMPPHZ256rrik:
   case X86::VCMPPHZrmik:     case X86::VCMPPHZrrik:
-  case X86::VCMPSHZrmi_Intk: case X86::VCMPSHZrri_Intk:
+  case X86::VCMPSHZrmik_Int: case X86::VCMPSHZrrik_Int:
   case X86::VCMPPHZ128rmbi:  case X86::VCMPPHZ128rmbik:
   case X86::VCMPPHZ256rmbi:  case X86::VCMPPHZ256rmbik:
   case X86::VCMPPHZrmbi:     case X86::VCMPPHZrmbik:
   case X86::VCMPPHZrrib:     case X86::VCMPPHZrribk:
-  case X86::VCMPSHZrrib_Int: case X86::VCMPSHZrrib_Intk:
+  case X86::VCMPSHZrrib_Int: case X86::VCMPSHZrribk_Int:
   case X86::VCMPPBF16Z128rmi:  case X86::VCMPPBF16Z128rri:
   case X86::VCMPPBF16Z256rmi:  case X86::VCMPPBF16Z256rri:
   case X86::VCMPPBF16Zrmi:     case X86::VCMPPBF16Zrri:
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 3b260a8..6b0eb38 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -94,7 +94,7 @@ static cl::opt<int> BrMergingCcmpBias(
 
 static cl::opt<bool>
     WidenShift("x86-widen-shift", cl::init(true),
-               cl::desc("Replacte narrow shifts with wider shifts."),
+               cl::desc("Replace narrow shifts with wider shifts."),
                cl::Hidden);
 
 static cl::opt<int> BrMergingLikelyBias(
@@ -341,8 +341,17 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
     }
   }
   if (Subtarget.hasAVX10_2()) {
-    setOperationAction(ISD::FP_TO_UINT_SAT, MVT::i32, Legal);
-    setOperationAction(ISD::FP_TO_SINT_SAT, MVT::i32, Legal);
+    setOperationAction(ISD::FP_TO_UINT_SAT, MVT::v2i32, Custom);
+    setOperationAction(ISD::FP_TO_SINT_SAT, MVT::v2i32, Custom);
+    for (MVT VT : {MVT::i32, MVT::v4i32, MVT::v8i32, MVT::v16i32, MVT::v2i64,
+                   MVT::v4i64}) {
+      setOperationAction(ISD::FP_TO_UINT_SAT, VT, Legal);
+      setOperationAction(ISD::FP_TO_SINT_SAT, VT, Legal);
+    }
+    if (Subtarget.hasAVX10_2_512()) {
+      setOperationAction(ISD::FP_TO_UINT_SAT, MVT::v8i64, Legal);
+      setOperationAction(ISD::FP_TO_SINT_SAT, MVT::v8i64, Legal);
+    }
     if (Subtarget.is64Bit()) {
       setOperationAction(ISD::FP_TO_UINT_SAT, MVT::i64, Legal);
       setOperationAction(ISD::FP_TO_SINT_SAT, MVT::i64, Legal);
@@ -623,6 +632,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
     setOperationAction(ISD::FMAXNUM, VT, Action);
     setOperationAction(ISD::FMINIMUM, VT, Action);
     setOperationAction(ISD::FMAXIMUM, VT, Action);
+    setOperationAction(ISD::FMINIMUMNUM, VT, Action);
+    setOperationAction(ISD::FMAXIMUMNUM, VT, Action);
     setOperationAction(ISD::FSIN, VT, Action);
     setOperationAction(ISD::FCOS, VT, Action);
     setOperationAction(ISD::FSINCOS, VT, Action);
@@ -1066,6 +1077,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
 
     setOperationAction(ISD::FMAXIMUM,           MVT::f32, Custom);
     setOperationAction(ISD::FMINIMUM,           MVT::f32, Custom);
+    setOperationAction(ISD::FMAXIMUMNUM,        MVT::f32, Custom);
+    setOperationAction(ISD::FMINIMUMNUM,        MVT::f32, Custom);
 
     setOperationAction(ISD::FNEG,               MVT::v4f32, Custom);
     setOperationAction(ISD::FABS,               MVT::v4f32, Custom);
@@ -1108,6 +1121,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
     for (auto VT : { MVT::f64, MVT::v4f32, MVT::v2f64 }) {
       setOperationAction(ISD::FMAXIMUM, VT, Custom);
       setOperationAction(ISD::FMINIMUM, VT, Custom);
+      setOperationAction(ISD::FMAXIMUMNUM, VT, Custom);
+      setOperationAction(ISD::FMINIMUMNUM, VT, Custom);
     }
 
     for (auto VT : { MVT::v2i8, MVT::v4i8, MVT::v8i8,
@@ -1473,6 +1488,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
 
       setOperationAction(ISD::FMAXIMUM,          VT, Custom);
       setOperationAction(ISD::FMINIMUM,          VT, Custom);
+      setOperationAction(ISD::FMAXIMUMNUM,       VT, Custom);
+      setOperationAction(ISD::FMINIMUMNUM,       VT, Custom);
       setOperationAction(ISD::FCANONICALIZE, VT, Custom);
     }
 
@@ -1818,6 +1835,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
     for (MVT VT : { MVT::v16f32, MVT::v8f64 }) {
       setOperationAction(ISD::FMAXIMUM, VT, Custom);
       setOperationAction(ISD::FMINIMUM, VT, Custom);
+      setOperationAction(ISD::FMAXIMUMNUM, VT, Custom);
+      setOperationAction(ISD::FMINIMUMNUM, VT, Custom);
       setOperationAction(ISD::FNEG,  VT, Custom);
       setOperationAction(ISD::FABS,  VT, Custom);
       setOperationAction(ISD::FMA,   VT, Legal);
@@ -2289,6 +2308,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
     setOperationAction(ISD::STRICT_FP_ROUND,      MVT::f16, Custom);
     setOperationAction(ISD::FMAXIMUM,             MVT::f16, Custom);
     setOperationAction(ISD::FMINIMUM,             MVT::f16, Custom);
+    setOperationAction(ISD::FMAXIMUMNUM,          MVT::f16, Custom);
+    setOperationAction(ISD::FMINIMUMNUM,          MVT::f16, Custom);
     setOperationAction(ISD::FP_EXTEND,            MVT::f32, Legal);
     setOperationAction(ISD::STRICT_FP_EXTEND,     MVT::f32, Legal);
 
@@ -2333,6 +2354,11 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
 
       setLoadExtAction(ISD::EXTLOAD, MVT::v8f64,  MVT::v8f16,  Legal);
       setLoadExtAction(ISD::EXTLOAD, MVT::v16f32, MVT::v16f16, Legal);
+
+      setOperationAction(ISD::FMINIMUM, MVT::v32f16, Custom);
+      setOperationAction(ISD::FMAXIMUM, MVT::v32f16, Custom);
+      setOperationAction(ISD::FMINIMUMNUM, MVT::v32f16, Custom);
+      setOperationAction(ISD::FMAXIMUMNUM, MVT::v32f16, Custom);
     }
 
     if (Subtarget.hasVLX()) {
@@ -2377,6 +2403,16 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
       // Need to custom widen these to prevent scalarization.
       setOperationAction(ISD::LOAD,  MVT::v4f16, Custom);
       setOperationAction(ISD::STORE, MVT::v4f16, Custom);
+
+      setOperationAction(ISD::FMINIMUM, MVT::v8f16, Custom);
+      setOperationAction(ISD::FMAXIMUM, MVT::v8f16, Custom);
+      setOperationAction(ISD::FMINIMUMNUM, MVT::v8f16, Custom);
+      setOperationAction(ISD::FMAXIMUMNUM, MVT::v8f16, Custom);
+
+      setOperationAction(ISD::FMINIMUM, MVT::v16f16, Custom);
+      setOperationAction(ISD::FMAXIMUM, MVT::v16f16, Custom);
+      setOperationAction(ISD::FMINIMUMNUM, MVT::v16f16, Custom);
+      setOperationAction(ISD::FMAXIMUMNUM, MVT::v16f16, Custom);
     }
   }
 
@@ -2433,6 +2469,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
       setOperationAction(ISD::FSQRT, VT, Legal);
       setOperationAction(ISD::FMA, VT, Legal);
       setOperationAction(ISD::SETCC, VT, Custom);
+      setOperationAction(ISD::FMINIMUM, VT, Custom);
+      setOperationAction(ISD::FMAXIMUM, VT, Custom);
+      setOperationAction(ISD::FMINIMUMNUM, VT, Custom);
+      setOperationAction(ISD::FMAXIMUMNUM, VT, Custom);
     }
     if (Subtarget.hasAVX10_2_512()) {
       setOperationAction(ISD::FADD, MVT::v32bf16, Legal);
@@ -2442,6 +2482,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
       setOperationAction(ISD::FSQRT, MVT::v32bf16, Legal);
       setOperationAction(ISD::FMA, MVT::v32bf16, Legal);
       setOperationAction(ISD::SETCC, MVT::v32bf16, Custom);
+      setOperationAction(ISD::FMINIMUM, MVT::v32bf16, Custom);
+      setOperationAction(ISD::FMAXIMUM, MVT::v32bf16, Custom);
+      setOperationAction(ISD::FMINIMUMNUM, MVT::v32bf16, Custom);
+      setOperationAction(ISD::FMAXIMUMNUM, MVT::v32bf16, Custom);
     }
     for (auto VT : {MVT::f16, MVT::f32, MVT::f64}) {
       setCondCodeAction(ISD::SETOEQ, VT, Custom);
@@ -2643,6 +2687,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
                        ISD::UINT_TO_FP,
                        ISD::STRICT_SINT_TO_FP,
                        ISD::STRICT_UINT_TO_FP,
+                       ISD::FP_TO_SINT_SAT,
+                       ISD::FP_TO_UINT_SAT,
                        ISD::SETCC,
                        ISD::MUL,
                        ISD::XOR,
@@ -28826,19 +28872,35 @@ static SDValue LowerMINMAX(SDValue Op, const X86Subtarget &Subtarget,
 
 static SDValue LowerFMINIMUM_FMAXIMUM(SDValue Op, const X86Subtarget &Subtarget,
                                       SelectionDAG &DAG) {
-  assert((Op.getOpcode() == ISD::FMAXIMUM || Op.getOpcode() == ISD::FMINIMUM) &&
-         "Expected FMAXIMUM or FMINIMUM opcode");
   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   EVT VT = Op.getValueType();
   SDValue X = Op.getOperand(0);
   SDValue Y = Op.getOperand(1);
   SDLoc DL(Op);
+  bool IsMaxOp =
+      Op.getOpcode() == ISD::FMAXIMUM || Op.getOpcode() == ISD::FMAXIMUMNUM;
+  bool IsNum =
+      Op.getOpcode() == ISD::FMINIMUMNUM || Op.getOpcode() == ISD::FMAXIMUMNUM;
+  if (Subtarget.hasAVX10_2() && TLI.isTypeLegal(VT)) {
+    unsigned Opc = 0;
+    if (VT.isVector())
+      Opc = X86ISD::VMINMAX;
+    else if (VT == MVT::f16 || VT == MVT::f32 || VT == MVT::f64)
+      Opc = X86ISD::VMINMAXS;
+
+    if (Opc) {
+      SDValue Imm =
+          DAG.getTargetConstant(IsMaxOp + (IsNum ? 16 : 0), DL, MVT::i32);
+      return DAG.getNode(Opc, DL, VT, X, Y, Imm, Op->getFlags());
+    }
+  }
+
   uint64_t SizeInBits = VT.getScalarSizeInBits();
   APInt PreferredZero = APInt::getZero(SizeInBits);
   APInt OppositeZero = PreferredZero;
   EVT IVT = VT.changeTypeToInteger();
   X86ISD::NodeType MinMaxOp;
-  if (Op.getOpcode() == ISD::FMAXIMUM) {
+  if (IsMaxOp) {
     MinMaxOp = X86ISD::FMAX;
     OppositeZero.setSignBit();
   } else {
@@ -28968,7 +29030,9 @@ static SDValue LowerFMINIMUM_FMAXIMUM(SDValue Op, const X86Subtarget &Subtarget,
   if (IgnoreNaN || DAG.isKnownNeverNaN(NewX))
     return MinMax;
 
-  SDValue IsNaN = DAG.getSetCC(DL, SetCCType, NewX, NewX, ISD::SETUO);
+  SDValue IsNaN =
+      DAG.getSetCC(DL, SetCCType, NewX, NewX, IsNum ? ISD::SETO : ISD::SETUO);
+
   return DAG.getSelect(DL, VT, IsNaN, NewX, MinMax);
 }
 
@@ -33226,6 +33290,8 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
   case ISD::UMIN:               return LowerMINMAX(Op, Subtarget, DAG);
   case ISD::FMINIMUM:
   case ISD::FMAXIMUM:
+  case ISD::FMINIMUMNUM:
+  case ISD::FMAXIMUMNUM:
     return LowerFMINIMUM_FMAXIMUM(Op, Subtarget, DAG);
   case ISD::ABS:                return LowerABS(Op, Subtarget, DAG);
   case ISD::ABDS:
@@ -33638,6 +33704,26 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
     }
     return;
   }
+  case ISD::FP_TO_SINT_SAT:
+  case ISD::FP_TO_UINT_SAT: {
+    if (!Subtarget.hasAVX10_2())
+      return;
+
+    bool IsSigned = Opc == ISD::FP_TO_SINT_SAT;
+    EVT VT = N->getValueType(0);
+    SDValue Op = N->getOperand(0);
+    EVT OpVT = Op.getValueType();
+    SDValue Res;
+
+    if (VT == MVT::v2i32 && OpVT == MVT::v2f64) {
+      if (IsSigned)
+        Res = DAG.getNode(X86ISD::FP_TO_SINT_SAT, dl, MVT::v4i32, Op);
+      else
+        Res = DAG.getNode(X86ISD::FP_TO_UINT_SAT, dl, MVT::v4i32, Op);
+      Results.push_back(Res);
+    }
+    return;
+  }
   case ISD::FP_TO_SINT:
   case ISD::STRICT_FP_TO_SINT:
   case ISD::FP_TO_UINT:
@@ -34618,6 +34704,8 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
   NODE_NAME_CASE(VPERMV3)
   NODE_NAME_CASE(VPERMI)
   NODE_NAME_CASE(VPTERNLOG)
+  NODE_NAME_CASE(FP_TO_SINT_SAT)
+  NODE_NAME_CASE(FP_TO_UINT_SAT)
   NODE_NAME_CASE(VFIXUPIMM)
   NODE_NAME_CASE(VFIXUPIMM_SAE)
   NODE_NAME_CASE(VFIXUPIMMS)
@@ -41606,6 +41694,8 @@ static SDValue combineTargetShuffle(SDValue N, const SDLoc &DL,
                                     TargetLowering::DAGCombinerInfo &DCI,
                                     const X86Subtarget &Subtarget) {
   MVT VT = N.getSimpleValueType();
+  unsigned NumElts = VT.getVectorNumElements();
+
   SmallVector<int, 4> Mask;
   unsigned Opcode = N.getOpcode();
   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
@@ -41891,7 +41981,7 @@ static SDValue combineTargetShuffle(SDValue N, const SDLoc &DL,
       APInt Mask = APInt::getHighBitsSet(64, 32);
       if (DAG.MaskedValueIsZero(In, Mask)) {
         SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, In);
-        MVT VecVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() * 2);
+        MVT VecVT = MVT::getVectorVT(MVT::i32, NumElts * 2);
         SDValue SclVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Trunc);
         SDValue Movl = DAG.getNode(X86ISD::VZEXT_MOVL, DL, VecVT, SclVec);
         return DAG.getBitcast(VT, Movl);
@@ -41906,7 +41996,6 @@ static SDValue combineTargetShuffle(SDValue N, const SDLoc &DL,
         // Create a vector constant - scalar constant followed by zeros.
         EVT ScalarVT = N0.getOperand(0).getValueType();
         Type *ScalarTy = ScalarVT.getTypeForEVT(*DAG.getContext());
-        unsigned NumElts = VT.getVectorNumElements();
         Constant *Zero = ConstantInt::getNullValue(ScalarTy);
         SmallVector<Constant *, 32> ConstantVec(NumElts, Zero);
         ConstantVec[0] = const_cast<ConstantInt *>(C->getConstantIntValue());
@@ -41957,9 +42046,8 @@ static SDValue combineTargetShuffle(SDValue N, const SDLoc &DL,
         MVT SrcVT = N0.getOperand(0).getSimpleValueType();
         unsigned SrcBits = SrcVT.getScalarSizeInBits();
         if ((EltBits % SrcBits) == 0 && SrcBits >= 32) {
-          unsigned Size = VT.getVectorNumElements();
           unsigned NewSize = SrcVT.getVectorNumElements();
-          APInt BlendMask = N.getConstantOperandAPInt(2).zextOrTrunc(Size);
+          APInt BlendMask = N.getConstantOperandAPInt(2).zextOrTrunc(NumElts);
           APInt NewBlendMask = APIntOps::ScaleBitMask(BlendMask, NewSize);
           return DAG.getBitcast(
               VT, DAG.getNode(X86ISD::BLENDI, DL, SrcVT, N0.getOperand(0),
@@ -42372,7 +42460,7 @@ static SDValue combineTargetShuffle(SDValue N, const SDLoc &DL,
       int DOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 2;
       DMask[DOffset + 0] = DOffset + 1;
       DMask[DOffset + 1] = DOffset + 0;
-      MVT DVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
+      MVT DVT = MVT::getVectorVT(MVT::i32, NumElts / 2);
       V = DAG.getBitcast(DVT, V);
       V = DAG.getNode(X86ISD::PSHUFD, DL, DVT, V,
                       getV4X86ShuffleImm8ForMask(DMask, DL, DAG));
@@ -45967,6 +46055,8 @@ static SDValue scalarizeExtEltFP(SDNode *ExtElt, SelectionDAG &DAG,
   case ISD::FMAXNUM_IEEE:
   case ISD::FMAXIMUM:
   case ISD::FMINIMUM:
+  case ISD::FMAXIMUMNUM:
+  case ISD::FMINIMUMNUM:
   case X86ISD::FMAX:
   case X86ISD::FMIN:
   case ISD::FABS: // Begin 1 operand
@@ -56175,6 +56265,33 @@ static SDValue combineSIntToFP(SDNode *N, SelectionDAG &DAG,
   return SDValue();
 }
 
+// Custom handling for VCVTTPS2QQS/VCVTTPS2UQQS
+static SDValue combineFP_TO_xINT_SAT(SDNode *N, SelectionDAG &DAG,
+                                     const X86Subtarget &Subtarget) {
+  if (!Subtarget.hasAVX10_2())
+    return SDValue();
+
+  bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT_SAT;
+  EVT SrcVT = N->getOperand(0).getValueType();
+  EVT DstVT = N->getValueType(0);
+  SDLoc dl(N);
+
+  if (SrcVT == MVT::v2f32 && DstVT == MVT::v2i64) {
+    SDValue V2F32Value = DAG.getUNDEF(SrcVT);
+
+    // Concatenate the original v2f32 input and V2F32Value to create v4f32
+    SDValue NewSrc = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
+                                 N->getOperand(0), V2F32Value);
+
+    // Select the FP_TO_SINT_SAT/FP_TO_UINT_SAT node
+    if (IsSigned)
+      return DAG.getNode(X86ISD::FP_TO_SINT_SAT, dl, MVT::v2i64, NewSrc);
+
+    return DAG.getNode(X86ISD::FP_TO_UINT_SAT, dl, MVT::v2i64, NewSrc);
+  }
+  return SDValue();
+}
+
 static bool needCarryOrOverflowFlag(SDValue Flags) {
   assert(Flags.getValueType() == MVT::i32 && "Unexpected VT!");
 
@@ -59288,6 +59405,8 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
   case ISD::INTRINSIC_WO_CHAIN:  return combineINTRINSIC_WO_CHAIN(N, DAG, DCI);
   case ISD::INTRINSIC_W_CHAIN:  return combineINTRINSIC_W_CHAIN(N, DAG, DCI);
   case ISD::INTRINSIC_VOID:  return combineINTRINSIC_VOID(N, DAG, DCI);
+  case ISD::FP_TO_SINT_SAT:
+  case ISD::FP_TO_UINT_SAT: return combineFP_TO_xINT_SAT(N, DAG, Subtarget);
     // clang-format on
   }
 
diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h
index 2b7a8ea..eaedaa0 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.h
+++ b/llvm/lib/Target/X86/X86ISelLowering.h
@@ -908,6 +908,10 @@ namespace llvm {
     // Load x87 FPU environment from memory.
     FLDENVm,
 
+    // Custom handling for FP_TO_xINT_SAT
+    FP_TO_SINT_SAT,
+    FP_TO_UINT_SAT,
+
     /// This instruction implements FP_TO_SINT with the
     /// integer destination in memory and a FP reg source.  This corresponds
     /// to the X86::FIST*m instructions and the rounding mode change stuff. It
diff --git a/llvm/lib/Target/X86/X86InstrAVX10.td b/llvm/lib/Target/X86/X86InstrAVX10.td
index 0301c07..1270161 100644
--- a/llvm/lib/Target/X86/X86InstrAVX10.td
+++ b/llvm/lib/Target/X86/X86InstrAVX10.td
@@ -403,28 +403,45 @@ multiclass avx10_minmax_scalar<string OpStr, X86VectorVTInfo _, SDNode OpNode,
                                 SDNode OpNodeSAE> {
   let ExeDomain = _.ExeDomain, Predicates = [HasAVX10_2] in {
     let mayRaiseFPException = 1 in {
+      let isCodeGenOnly = 1 in {
+        def rri : AVX512Ii8<0x53, MRMSrcReg, (outs _.FRC:$dst),
+                            (ins _.FRC:$src1, _.FRC:$src2, i32u8imm:$src3),
+                             !strconcat(OpStr, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"),
+                             [(set _.FRC:$dst, (OpNode _.FRC:$src1, _.FRC:$src2, (i32 timm:$src3)))]>,
+                       Sched<[WriteFMAX]>;
+
+        def rmi : AVX512Ii8<0x53, MRMSrcMem, (outs _.FRC:$dst),
+                            (ins _.FRC:$src1, _.ScalarMemOp:$src2, i32u8imm:$src3),
+                             !strconcat(OpStr, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"),
+                             [(set _.FRC:$dst, (OpNode _.FRC:$src1, (_.ScalarLdFrag addr:$src2),
+                                                       (i32 timm:$src3)))]>,
+                       Sched<[WriteFMAX.Folded, WriteFMAX.ReadAfterFold]>;
+      }
       defm rri : AVX512_maskable<0x53, MRMSrcReg, _, (outs VR128X:$dst),
-                               (ins VR128X:$src1, VR128X:$src2, i32u8imm:$src3),
-                                OpStr, "$src3, $src2, $src1", "$src1, $src2, $src3",
-                                (_.VT (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2),
-                                              (i32 timm:$src3)))>,
-                                Sched<[WriteFMAX]>;
+                                 (ins VR128X:$src1, VR128X:$src2, i32u8imm:$src3),
+                                  OpStr, "$src3, $src2, $src1", "$src1, $src2, $src3",
+                                  (_.VT (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2),
+                                                (i32 timm:$src3))),
+                                 0, 0, 0, vselect_mask, "", "_Int">,
+                       Sched<[WriteFMAX]>;
 
       defm rmi : AVX512_maskable<0x53, MRMSrcMem, _, (outs VR128X:$dst),
-                       (ins VR128X:$src1, _.ScalarMemOp:$src2, i32u8imm:$src3),
-                       OpStr, "$src3, $src2, $src1", "$src1, $src2, $src3",
-                       (_.VT (OpNode (_.VT _.RC:$src1), (_.ScalarIntMemFrags addr:$src2),
-                                     (i32 timm:$src3)))>,
+                                 (ins VR128X:$src1, _.ScalarMemOp:$src2, i32u8imm:$src3),
+                                  OpStr, "$src3, $src2, $src1", "$src1, $src2, $src3",
+                                  (_.VT (OpNode (_.VT _.RC:$src1), (_.ScalarIntMemFrags addr:$src2),
+                                                (i32 timm:$src3))),
+                                 0, 0, 0, vselect_mask, "", "_Int">,
                        Sched<[WriteFMAX.Folded, WriteFMAX.ReadAfterFold]>;
     }
     let Uses = []<Register>, mayRaiseFPException = 0 in
       defm rrib : AVX512_maskable<0x53, MRMSrcReg, _, (outs VR128X:$dst),
-                        (ins VR128X:$src1, VR128X:$src2, i32u8imm:$src3),
-                        OpStr, "$src3, {sae}, $src2, $src1",
-                        "$src1, $src2, {sae}, $src3",
-                        (_.VT (OpNodeSAE (_.VT _.RC:$src1), (_.VT _.RC:$src2),
-                                         (i32 timm:$src3)))>,
-                        Sched<[WriteFMAX]>, EVEX_B;
+                                  (ins VR128X:$src1, VR128X:$src2, i32u8imm:$src3),
+                                   OpStr, "$src3, {sae}, $src2, $src1",
+                                   "$src1, $src2, {sae}, $src3",
+                                   (_.VT (OpNodeSAE (_.VT _.RC:$src1), (_.VT _.RC:$src2),
+                                                    (i32 timm:$src3))),
+                                  0, 0, 0, vselect_mask, "", "_Int">,
+                       Sched<[WriteFMAX]>, EVEX_B;
   }
 }
 
@@ -817,6 +834,70 @@ let Predicates = [HasAVX10_2] in {
 // patterns have been disabled with null_frag.
 // Patterns VCVTTPD2DQSZ128
 
+// VCVTTPD2DQS
+def : Pat<(v4i32(X86fp2sisat(v2f64 VR128X:$src))),
+          (VCVTTPD2DQSZ128rr VR128X:$src)>;
+def : Pat<(v4i32(fp_to_sint_sat(v4f64 VR256X:$src), i32)),
+          (VCVTTPD2DQSZ256rr VR256X:$src)>;
+def : Pat<(v8i32(fp_to_sint_sat(v8f64 VR512:$src), i32)),
+          (VCVTTPD2DQSZrr VR512:$src)>;
+
+// VCVTTPD2QQS
+def : Pat<(v2i64(fp_to_sint_sat(v2f64 VR128X:$src), i64)),
+          (VCVTTPD2QQSZ128rr VR128X:$src)>;
+def : Pat<(v4i64(fp_to_sint_sat(v4f64 VR256X:$src), i64)),
+          (VCVTTPD2QQSZ256rr VR256X:$src)>;
+def : Pat<(v8i64(fp_to_sint_sat(v8f64 VR512:$src), i64)),
+          (VCVTTPD2QQSZrr VR512:$src)>;
+
+// VCVTTPD2UDQS
+def : Pat<(v4i32(X86fp2uisat(v2f64 VR128X:$src))),
+          (VCVTTPD2UDQSZ128rr VR128X:$src)>;
+def : Pat<(v4i32(fp_to_uint_sat(v4f64 VR256X:$src), i32)),
+          (VCVTTPD2UDQSZ256rr VR256X:$src)>;
+def : Pat<(v8i32(fp_to_uint_sat(v8f64 VR512:$src), i32)),
+          (VCVTTPD2UDQSZrr VR512:$src)>;
+
+// VCVTTPD2UQQS
+def : Pat<(v2i64(fp_to_uint_sat(v2f64 VR128X:$src), i64)),
+          (VCVTTPD2UQQSZ128rr VR128X:$src)>;
+def : Pat<(v4i64(fp_to_uint_sat(v4f64 VR256X:$src), i64)),
+          (VCVTTPD2UQQSZ256rr VR256X:$src)>;
+def : Pat<(v8i64(fp_to_uint_sat(v8f64 VR512:$src), i64)),
+          (VCVTTPD2UQQSZrr VR512:$src)>;
+
+// VCVTTPS2DQS
+def : Pat<(v4i32(fp_to_sint_sat(v4f32 VR128X:$src), i32)),
+          (VCVTTPS2DQSZ128rr VR128X:$src)>;
+def : Pat<(v8i32(fp_to_sint_sat(v8f32 VR256X:$src), i32)),
+          (VCVTTPS2DQSZ256rr VR256X:$src)>;
+def : Pat<(v16i32(fp_to_sint_sat(v16f32 VR512:$src), i32)),
+          (VCVTTPS2DQSZrr VR512:$src)>;
+
+// VCVTTPS2QQS
+def : Pat<(v2i64(X86fp2sisat(v4f32 VR128X:$src))),
+          (VCVTTPS2QQSZ128rr VR128X:$src)>;
+def : Pat<(v4i64(fp_to_sint_sat(v4f32 VR128X:$src), i64)),
+          (VCVTTPS2QQSZ256rr VR128X:$src)>;
+def : Pat<(v8i64(fp_to_sint_sat(v8f32 VR256X:$src), i64)),
+          (VCVTTPS2QQSZrr VR256X:$src)>;
+
+// VCVTTPS2UDQS
+def : Pat<(v4i32(fp_to_uint_sat(v4f32 VR128X:$src), i32)),
+          (VCVTTPS2UDQSZ128rr VR128X:$src)>;
+def : Pat<(v8i32(fp_to_uint_sat(v8f32 VR256X:$src), i32)),
+          (VCVTTPS2UDQSZ256rr VR256X:$src)>;
+def : Pat<(v16i32(fp_to_uint_sat(v16f32 VR512:$src), i32)),
+          (VCVTTPS2UDQSZrr VR512:$src)>;
+
+// VCVTTPS2UQQS
+def : Pat<(v2i64(X86fp2uisat(v4f32 VR128X:$src))),
+          (VCVTTPS2UQQSZ128rr VR128X:$src)>;
+def : Pat<(v4i64(fp_to_uint_sat(v4f32 VR128X:$src), i64)),
+          (VCVTTPS2UQQSZ256rr VR128X:$src)>;
+def : Pat<(v8i64(fp_to_uint_sat(v8f32 VR256X:$src), i64)),
+          (VCVTTPS2UQQSZrr VR256X:$src)>;
+
 def : Pat<(v4i32 (X86cvttp2sis (v2f64 VR128X:$src))),
           (VCVTTPD2DQSZ128rr VR128X:$src)>;
 def : Pat<(v4i32 (X86cvttp2sis (loadv2f64 addr:$src))),
diff --git a/llvm/lib/Target/X86/X86InstrAVX512.td b/llvm/lib/Target/X86/X86InstrAVX512.td
index e899807..d6ca4b1 100644
--- a/llvm/lib/Target/X86/X86InstrAVX512.td
+++ b/llvm/lib/Target/X86/X86InstrAVX512.td
@@ -28,19 +28,20 @@ multiclass AVX512_maskable_custom<bits<8> O, Format F,
                                   bit IsCommutable = 0,
                                   bit IsKCommutable = 0,
                                   bit IsKZCommutable = IsCommutable,
-                                  string ClobberConstraint = ""> {
+                                  string ClobberConstraint = "",
+                                  string Suffix = ""> {
   let isCommutable = IsCommutable, Constraints = ClobberConstraint in
-    def NAME: AVX512<O, F, Outs, Ins,
-                       OpcodeStr#"\t{"#AttSrcAsm#", $dst|"#
-                                     "$dst, "#IntelSrcAsm#"}",
-                       Pattern>;
+    def Suffix: AVX512<O, F, Outs, Ins,
+                            OpcodeStr#"\t{"#AttSrcAsm#", $dst|"#
+                                          "$dst, "#IntelSrcAsm#"}",
+                            Pattern>;
 
   // Prefer over VMOV*rrk Pat<>
   let isCommutable = IsKCommutable in
-    def NAME#k: AVX512<O, F, Outs, MaskingIns,
-                       OpcodeStr#"\t{"#AttSrcAsm#", $dst {${mask}}|"#
-                                     "$dst {${mask}}, "#IntelSrcAsm#"}",
-                       MaskingPattern>,
+    def k#Suffix: AVX512<O, F, Outs, MaskingIns,
+                              OpcodeStr#"\t{"#AttSrcAsm#", $dst {${mask}}|"#
+                                            "$dst {${mask}}, "#IntelSrcAsm#"}",
+                              MaskingPattern>,
               EVEX_K {
       // In case of the 3src subclass this is overridden with a let.
       string Constraints = !if(!eq(ClobberConstraint, ""), MaskingConstraint,
@@ -52,10 +53,10 @@ multiclass AVX512_maskable_custom<bits<8> O, Format F,
   // So, it is Ok to use IsCommutable instead of IsKCommutable.
   let isCommutable = IsKZCommutable, // Prefer over VMOV*rrkz Pat<>
       Constraints = ClobberConstraint in
-    def NAME#kz: AVX512<O, F, Outs, ZeroMaskingIns,
-                       OpcodeStr#"\t{"#AttSrcAsm#", $dst {${mask}} {z}|"#
-                                     "$dst {${mask}} {z}, "#IntelSrcAsm#"}",
-                       ZeroMaskingPattern>,
+    def kz#Suffix: AVX512<O, F, Outs, ZeroMaskingIns,
+                               OpcodeStr#"\t{"#AttSrcAsm#", $dst {${mask}} {z}|"#
+                                             "$dst {${mask}} {z}, "#IntelSrcAsm#"}",
+                               ZeroMaskingPattern>,
               EVEX_KZ;
 }
 
@@ -72,7 +73,8 @@ multiclass AVX512_maskable_common<bits<8> O, Format F, X86VectorVTInfo _,
                                   bit IsCommutable = 0,
                                   bit IsKCommutable = 0,
                                   bit IsKZCommutable = IsCommutable,
-                                  string ClobberConstraint = ""> :
+                                  string ClobberConstraint = "",
+                                  string Suffix = ""> :
   AVX512_maskable_custom<O, F, Outs, Ins, MaskingIns, ZeroMaskingIns, OpcodeStr,
                          AttSrcAsm, IntelSrcAsm,
                          [(set _.RC:$dst, RHS)],
@@ -80,7 +82,8 @@ multiclass AVX512_maskable_common<bits<8> O, Format F, X86VectorVTInfo _,
                          [(set _.RC:$dst,
                                (Select _.KRCWM:$mask, RHS, _.ImmAllZerosV))],
                          MaskingConstraint, IsCommutable,
-                         IsKCommutable, IsKZCommutable, ClobberConstraint>;
+                         IsKCommutable, IsKZCommutable, ClobberConstraint,
+                         Suffix>;
 
 // This multiclass generates the unconditional/non-masking, the masking and
 // the zero-masking variant of the vector instruction.  In the masking case, the
@@ -115,23 +118,24 @@ multiclass AVX512_maskable<bits<8> O, Format F, X86VectorVTInfo _,
                            bit IsCommutable = 0, bit IsKCommutable = 0,
                            bit IsKZCommutable = IsCommutable,
                            SDPatternOperator Select = vselect_mask,
-                           string ClobberConstraint = ""> :
+                           string ClobberConstraint = "",
+                           string Suffix = ""> :
    AVX512_maskable_common<O, F, _, Outs, Ins,
                           !con((ins _.RC:$src0, _.KRCWM:$mask), Ins),
                           !con((ins _.KRCWM:$mask), Ins),
                           OpcodeStr, AttSrcAsm, IntelSrcAsm, RHS,
                           (Select _.KRCWM:$mask, RHS, _.RC:$src0),
                           Select, "$src0 = $dst", IsCommutable, IsKCommutable,
-                          IsKZCommutable, ClobberConstraint>;
+                          IsKZCommutable, ClobberConstraint, Suffix>;
 
 // This multiclass generates the unconditional/non-masking, the masking and
 // the zero-masking variant of the scalar instruction.
 multiclass AVX512_maskable_scalar<bits<8> O, Format F, X86VectorVTInfo _,
                            dag Outs, dag Ins, string OpcodeStr,
                            string AttSrcAsm, string IntelSrcAsm,
-                           dag RHS> :
+                           dag RHS, string Suffix = ""> :
    AVX512_maskable<O, F, _, Outs, Ins, OpcodeStr, AttSrcAsm, IntelSrcAsm,
-                   RHS, 0, 0, 0, X86selects_mask>;
+                   RHS, 0, 0, 0, X86selects_mask, "", Suffix>;
 
 // Similar to AVX512_maskable but in this case one of the source operands
 // ($src1) is already tied to $dst so we just use that for the preserved
@@ -144,7 +148,7 @@ multiclass AVX512_maskable_3src<bits<8> O, Format F, X86VectorVTInfo _,
                                 bit IsCommutable = 0,
                                 bit IsKCommutable = 0,
                                 SDPatternOperator Select = vselect_mask,
-                                bit MaskOnly = 0> :
+                                bit MaskOnly = 0, string Suffix = ""> :
    AVX512_maskable_common<O, F, _, Outs,
                           !con((ins _.RC:$src1), NonTiedIns),
                           !con((ins _.RC:$src1, _.KRCWM:$mask), NonTiedIns),
@@ -152,7 +156,8 @@ multiclass AVX512_maskable_3src<bits<8> O, Format F, X86VectorVTInfo _,
                           OpcodeStr, AttSrcAsm, IntelSrcAsm,
                           !if(MaskOnly, (null_frag), RHS),
                           (Select _.KRCWM:$mask, RHS, _.RC:$src1),
-                          Select, "", IsCommutable, IsKCommutable>;
+                          Select, "", IsCommutable, IsKCommutable,
+                          IsCommutable, "", Suffix>;
 
 // Similar to AVX512_maskable_3src but in this case the input VT for the tied
 // operand differs from the output VT. This requires a bitconvert on
@@ -178,10 +183,10 @@ multiclass AVX512_maskable_3src_scalar<bits<8> O, Format F, X86VectorVTInfo _,
                                      dag RHS,
                                      bit IsCommutable = 0,
                                      bit IsKCommutable = 0,
-                                     bit MaskOnly = 0> :
+                                     bit MaskOnly = 0, string Suffix = ""> :
    AVX512_maskable_3src<O, F, _, Outs, NonTiedIns, OpcodeStr, AttSrcAsm,
                         IntelSrcAsm, RHS, IsCommutable, IsKCommutable,
-                        X86selects_mask, MaskOnly>;
+                        X86selects_mask, MaskOnly, Suffix>;
 
 multiclass AVX512_maskable_in_asm<bits<8> O, Format F, X86VectorVTInfo _,
                                   dag Outs, dag Ins,
@@ -215,17 +220,18 @@ multiclass AVX512_maskable_custom_cmp<bits<8> O, Format F,
                                   string AttSrcAsm, string IntelSrcAsm,
                                   list<dag> Pattern,
                                   list<dag> MaskingPattern,
-                                  bit IsCommutable = 0> {
+                                  bit IsCommutable = 0,
+                                  string Suffix = ""> {
     let isCommutable = IsCommutable in {
-    def NAME: AVX512<O, F, Outs, Ins,
+    def Suffix: AVX512<O, F, Outs, Ins,
                        OpcodeStr#"\t{"#AttSrcAsm#", $dst|"#
                                      "$dst, "#IntelSrcAsm#"}",
                        Pattern>;
 
-    def NAME#k: AVX512<O, F, Outs, MaskingIns,
-                       OpcodeStr#"\t{"#AttSrcAsm#", $dst {${mask}}|"#
-                                     "$dst {${mask}}, "#IntelSrcAsm#"}",
-                       MaskingPattern>, EVEX_K;
+    def k#Suffix: AVX512<O, F, Outs, MaskingIns,
+                         OpcodeStr#"\t{"#AttSrcAsm#", $dst {${mask}}|"#
+                                       "$dst {${mask}}, "#IntelSrcAsm#"}",
+                         MaskingPattern>, EVEX_K;
     }
 }
 
@@ -235,20 +241,22 @@ multiclass AVX512_maskable_common_cmp<bits<8> O, Format F, X86VectorVTInfo _,
                                   string OpcodeStr,
                                   string AttSrcAsm, string IntelSrcAsm,
                                   dag RHS, dag MaskingRHS,
-                                  bit IsCommutable = 0> :
+                                  bit IsCommutable = 0,
+                                  string Suffix = ""> :
   AVX512_maskable_custom_cmp<O, F, Outs, Ins, MaskingIns, OpcodeStr,
                          AttSrcAsm, IntelSrcAsm,
                          [(set _.KRC:$dst, RHS)],
-                         [(set _.KRC:$dst, MaskingRHS)], IsCommutable>;
+                         [(set _.KRC:$dst, MaskingRHS)], IsCommutable, Suffix>;
 
 multiclass AVX512_maskable_cmp<bits<8> O, Format F, X86VectorVTInfo _,
                            dag Outs, dag Ins, string OpcodeStr,
                            string AttSrcAsm, string IntelSrcAsm,
-                           dag RHS, dag RHS_su, bit IsCommutable = 0> :
+                           dag RHS, dag RHS_su, bit IsCommutable = 0,
+                           string Suffix = ""> :
    AVX512_maskable_common_cmp<O, F, _, Outs, Ins,
                           !con((ins _.KRCWM:$mask), Ins),
                           OpcodeStr, AttSrcAsm, IntelSrcAsm, RHS,
-                          (and _.KRCWM:$mask, RHS_su), IsCommutable>;
+                          (and _.KRCWM:$mask, RHS_su), IsCommutable, Suffix>;
 
 // Used by conversion instructions.
 multiclass AVX512_maskable_cvt<bits<8> O, Format F, X86VectorVTInfo _,
@@ -1937,37 +1945,37 @@ defm VPBLENDMW : blendmask_bw<0x66, "vpblendmw", SchedWriteVarBlend,
 multiclass avx512_cmp_scalar<X86VectorVTInfo _, SDNode OpNode, SDNode OpNodeSAE,
                              PatFrag OpNode_su, PatFrag OpNodeSAE_su,
                              X86FoldableSchedWrite sched> {
-  defm  rri_Int  : AVX512_maskable_cmp<0xC2, MRMSrcReg, _,
-                                       (outs _.KRC:$dst),
-                                       (ins _.RC:$src1, _.RC:$src2, u8imm:$cc),
-                                       "vcmp"#_.Suffix,
-                                       "$cc, $src2, $src1", "$src1, $src2, $cc",
-                                       (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2), timm:$cc),
-                                       (OpNode_su (_.VT _.RC:$src1), (_.VT _.RC:$src2), timm:$cc)>,
-                                       EVEX, VVVV, VEX_LIG, Sched<[sched]>, SIMD_EXC;
+  defm  rri  : AVX512_maskable_cmp<0xC2, MRMSrcReg, _,
+                                   (outs _.KRC:$dst),
+                                   (ins _.RC:$src1, _.RC:$src2, u8imm:$cc),
+                                   "vcmp"#_.Suffix,
+                                   "$cc, $src2, $src1", "$src1, $src2, $cc",
+                                   (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2), timm:$cc),
+                                   (OpNode_su (_.VT _.RC:$src1), (_.VT _.RC:$src2), timm:$cc), 0, "_Int">,
+                                   EVEX, VVVV, VEX_LIG, Sched<[sched]>, SIMD_EXC;
   let mayLoad = 1 in
-  defm  rmi_Int  : AVX512_maskable_cmp<0xC2, MRMSrcMem, _,
-                                       (outs _.KRC:$dst),
-                                       (ins _.RC:$src1, _.IntScalarMemOp:$src2, u8imm:$cc),
-                                       "vcmp"#_.Suffix,
-                                       "$cc, $src2, $src1", "$src1, $src2, $cc",
-                                       (OpNode (_.VT _.RC:$src1), (_.ScalarIntMemFrags addr:$src2),
-                                           timm:$cc),
-                                       (OpNode_su (_.VT _.RC:$src1), (_.ScalarIntMemFrags addr:$src2),
-                                           timm:$cc)>, EVEX, VVVV, VEX_LIG, EVEX_CD8<_.EltSize, CD8VT1>,
-                                       Sched<[sched.Folded, sched.ReadAfterFold]>, SIMD_EXC;
+  defm  rmi  : AVX512_maskable_cmp<0xC2, MRMSrcMem, _,
+                                   (outs _.KRC:$dst),
+                                   (ins _.RC:$src1, _.IntScalarMemOp:$src2, u8imm:$cc),
+                                   "vcmp"#_.Suffix,
+                                   "$cc, $src2, $src1", "$src1, $src2, $cc",
+                                   (OpNode (_.VT _.RC:$src1), (_.ScalarIntMemFrags addr:$src2),
+                                       timm:$cc),
+                                   (OpNode_su (_.VT _.RC:$src1), (_.ScalarIntMemFrags addr:$src2),
+                                       timm:$cc), 0, "_Int">, EVEX, VVVV, VEX_LIG, EVEX_CD8<_.EltSize, CD8VT1>,
+                                   Sched<[sched.Folded, sched.ReadAfterFold]>, SIMD_EXC;
 
   let Uses = [MXCSR] in
-  defm  rrib_Int  : AVX512_maskable_cmp<0xC2, MRMSrcReg, _,
-                                        (outs _.KRC:$dst),
-                                        (ins _.RC:$src1, _.RC:$src2, u8imm:$cc),
-                                        "vcmp"#_.Suffix,
-                                        "$cc, {sae}, $src2, $src1","$src1, $src2, {sae}, $cc",
-                                        (OpNodeSAE (_.VT _.RC:$src1), (_.VT _.RC:$src2),
-                                                   timm:$cc),
-                                        (OpNodeSAE_su (_.VT _.RC:$src1), (_.VT _.RC:$src2),
-                                                      timm:$cc)>,
-                                        EVEX, VVVV, VEX_LIG, EVEX_B, Sched<[sched]>;
+  defm  rrib  : AVX512_maskable_cmp<0xC2, MRMSrcReg, _,
+                                    (outs _.KRC:$dst),
+                                    (ins _.RC:$src1, _.RC:$src2, u8imm:$cc),
+                                    "vcmp"#_.Suffix,
+                                    "$cc, {sae}, $src2, $src1","$src1, $src2, {sae}, $cc",
+                                    (OpNodeSAE (_.VT _.RC:$src1), (_.VT _.RC:$src2),
+                                               timm:$cc),
+                                    (OpNodeSAE_su (_.VT _.RC:$src1), (_.VT _.RC:$src2),
+                                                  timm:$cc), 0, "_Int">,
+                                    EVEX, VVVV, VEX_LIG, EVEX_B, Sched<[sched]>;
 
   let isCodeGenOnly = 1 in {
     let isCommutable = 1 in
@@ -5354,17 +5362,17 @@ multiclass avx512_fp_scalar<bits<8> opc, string OpcodeStr,X86VectorVTInfo _,
                             SDPatternOperator OpNode, SDNode VecNode,
                             X86FoldableSchedWrite sched, bit IsCommutable> {
   let ExeDomain = _.ExeDomain, Uses = [MXCSR], mayRaiseFPException = 1 in {
-  defm rr_Int : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
+  defm rr : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
                            (ins _.RC:$src1, _.RC:$src2), OpcodeStr,
                            "$src2, $src1", "$src1, $src2",
-                           (_.VT (VecNode _.RC:$src1, _.RC:$src2))>,
+                           (_.VT (VecNode _.RC:$src1, _.RC:$src2)), "_Int">,
                            Sched<[sched]>;
 
-  defm rm_Int : AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst),
+  defm rm : AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst),
                          (ins _.RC:$src1, _.IntScalarMemOp:$src2), OpcodeStr,
                          "$src2, $src1", "$src1, $src2",
                          (_.VT (VecNode _.RC:$src1,
-                                        (_.ScalarIntMemFrags addr:$src2)))>,
+                                        (_.ScalarIntMemFrags addr:$src2))), "_Int">,
                          Sched<[sched.Folded, sched.ReadAfterFold]>;
   let isCodeGenOnly = 1, Predicates = [HasAVX512] in {
   def rr : I< opc, MRMSrcReg, (outs _.FRC:$dst),
@@ -5387,28 +5395,28 @@ multiclass avx512_fp_scalar<bits<8> opc, string OpcodeStr,X86VectorVTInfo _,
 multiclass avx512_fp_scalar_round<bits<8> opc, string OpcodeStr,X86VectorVTInfo _,
                                   SDNode VecNode, X86FoldableSchedWrite sched> {
   let ExeDomain = _.ExeDomain, Uses = [MXCSR] in
-  defm rrb_Int : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
+  defm rrb : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
                           (ins _.RC:$src1, _.RC:$src2, AVX512RC:$rc), OpcodeStr,
                           "$rc, $src2, $src1", "$src1, $src2, $rc",
                           (VecNode (_.VT _.RC:$src1), (_.VT _.RC:$src2),
-                          (i32 timm:$rc))>,
+                          (i32 timm:$rc)), "_Int">,
                           EVEX_B, EVEX_RC, Sched<[sched]>;
 }
 multiclass avx512_fp_scalar_sae<bits<8> opc, string OpcodeStr,X86VectorVTInfo _,
                                 SDPatternOperator OpNode, SDNode VecNode, SDNode SaeNode,
                                 X86FoldableSchedWrite sched, bit IsCommutable> {
   let ExeDomain = _.ExeDomain in {
-  defm rr_Int : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
+  defm rr : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
                            (ins _.RC:$src1, _.RC:$src2), OpcodeStr,
                            "$src2, $src1", "$src1, $src2",
-                           (_.VT (VecNode _.RC:$src1, _.RC:$src2))>,
+                           (_.VT (VecNode _.RC:$src1, _.RC:$src2)), "_Int">,
                            Sched<[sched]>, SIMD_EXC;
 
-  defm rm_Int : AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst),
+  defm rm : AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst),
                          (ins _.RC:$src1, _.IntScalarMemOp:$src2), OpcodeStr,
                          "$src2, $src1", "$src1, $src2",
                          (_.VT (VecNode _.RC:$src1,
-                                        (_.ScalarIntMemFrags addr:$src2)))>,
+                                        (_.ScalarIntMemFrags addr:$src2))), "_Int">,
                          Sched<[sched.Folded, sched.ReadAfterFold]>, SIMD_EXC;
 
   let isCodeGenOnly = 1, Predicates = [HasAVX512],
@@ -5429,10 +5437,10 @@ multiclass avx512_fp_scalar_sae<bits<8> opc, string OpcodeStr,X86VectorVTInfo _,
   }
 
   let Uses = [MXCSR] in
-  defm rrb_Int : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
+  defm rrb : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
                             (ins _.RC:$src1, _.RC:$src2), OpcodeStr,
                             "{sae}, $src2, $src1", "$src1, $src2, {sae}",
-                            (SaeNode (_.VT _.RC:$src1), (_.VT _.RC:$src2))>,
+                            (SaeNode (_.VT _.RC:$src1), (_.VT _.RC:$src2)), "_Int">,
                             EVEX_B, Sched<[sched]>;
   }
 }
@@ -6835,22 +6843,22 @@ defm VFNMSUB132   : avx512_fma3p_132_f<0x9E, "vfnmsub132", X86any_Fnmsub,
 multiclass avx512_fma3s_common<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
                                dag RHS_r, dag RHS_m, dag RHS_b, bit MaskOnlyReg> {
 let Constraints = "$src1 = $dst", hasSideEffects = 0 in {
-  defm r_Int: AVX512_maskable_3src_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
+  defm r: AVX512_maskable_3src_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
           (ins _.RC:$src2, _.RC:$src3), OpcodeStr,
-          "$src3, $src2", "$src2, $src3", (null_frag), 1, 1>,
+          "$src3, $src2", "$src2, $src3", (null_frag), 1, 1, 0, "_Int">,
           EVEX, VVVV, Sched<[SchedWriteFMA.Scl]>, SIMD_EXC;
 
   let mayLoad = 1 in
-  defm m_Int: AVX512_maskable_3src_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst),
+  defm m: AVX512_maskable_3src_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst),
           (ins _.RC:$src2, _.IntScalarMemOp:$src3), OpcodeStr,
-          "$src3, $src2", "$src2, $src3", (null_frag), 1, 1>,
+          "$src3, $src2", "$src2, $src3", (null_frag), 1, 1, 0, "_Int">,
           EVEX, VVVV, Sched<[SchedWriteFMA.Scl.Folded, SchedWriteFMA.Scl.ReadAfterFold,
                           SchedWriteFMA.Scl.ReadAfterFold]>, SIMD_EXC;
 
   let Uses = [MXCSR] in
-  defm rb_Int: AVX512_maskable_3src_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
+  defm rb: AVX512_maskable_3src_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
          (ins _.RC:$src2, _.RC:$src3, AVX512RC:$rc),
-         OpcodeStr, "$rc, $src3, $src2", "$src2, $src3, $rc", (null_frag), 1, 1>,
+         OpcodeStr, "$rc, $src3, $src2", "$src2, $src3, $rc", (null_frag), 1, 1, 0, "_Int">,
          EVEX, VVVV, EVEX_B, EVEX_RC, Sched<[SchedWriteFMA.Scl]>;
 
   let isCodeGenOnly = 1, isCommutable = 1 in {
@@ -6982,7 +6990,7 @@ multiclass avx512_scalar_fma_patterns<SDPatternOperator Op, SDNode MaskedOp,
                     (_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0))),
                     _.FRC:$src3),
                 (_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0)))))))),
-              (!cast<I>(Prefix#"213"#Suffix#"Zr_Intk")
+              (!cast<I>(Prefix#"213"#Suffix#"Zrk_Int")
                VR128X:$src1, VK1WM:$mask,
                (_.VT (COPY_TO_REGCLASS _.FRC:$src2, VR128X)),
                (_.VT (COPY_TO_REGCLASS _.FRC:$src3, VR128X)))>;
@@ -6993,7 +7001,7 @@ multiclass avx512_scalar_fma_patterns<SDPatternOperator Op, SDNode MaskedOp,
                     (_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0))),
                     (_.ScalarLdFrag addr:$src3)),
                 (_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0)))))))),
-              (!cast<I>(Prefix#"213"#Suffix#"Zm_Intk")
+              (!cast<I>(Prefix#"213"#Suffix#"Zmk_Int")
                VR128X:$src1, VK1WM:$mask,
                (_.VT (COPY_TO_REGCLASS _.FRC:$src2, VR128X)), addr:$src3)>;
 
@@ -7002,7 +7010,7 @@ multiclass avx512_scalar_fma_patterns<SDPatternOperator Op, SDNode MaskedOp,
                 (MaskedOp (_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0))),
                           (_.ScalarLdFrag addr:$src3), _.FRC:$src2),
                 (_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0)))))))),
-              (!cast<I>(Prefix#"132"#Suffix#"Zm_Intk")
+              (!cast<I>(Prefix#"132"#Suffix#"Zmk_Int")
                VR128X:$src1, VK1WM:$mask,
                (_.VT (COPY_TO_REGCLASS _.FRC:$src2, VR128X)), addr:$src3)>;
 
@@ -7011,7 +7019,7 @@ multiclass avx512_scalar_fma_patterns<SDPatternOperator Op, SDNode MaskedOp,
                 (MaskedOp _.FRC:$src2, _.FRC:$src3,
                           (_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0)))),
                 (_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0)))))))),
-              (!cast<I>(Prefix#"231"#Suffix#"Zr_Intk")
+              (!cast<I>(Prefix#"231"#Suffix#"Zrk_Int")
                VR128X:$src1, VK1WM:$mask,
                (_.VT (COPY_TO_REGCLASS _.FRC:$src2, VR128X)),
                (_.VT (COPY_TO_REGCLASS _.FRC:$src3, VR128X)))>;
@@ -7021,7 +7029,7 @@ multiclass avx512_scalar_fma_patterns<SDPatternOperator Op, SDNode MaskedOp,
                 (MaskedOp _.FRC:$src2, (_.ScalarLdFrag addr:$src3),
                           (_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0)))),
                 (_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0)))))))),
-              (!cast<I>(Prefix#"231"#Suffix#"Zm_Intk")
+              (!cast<I>(Prefix#"231"#Suffix#"Zmk_Int")
                VR128X:$src1, VK1WM:$mask,
                (_.VT (COPY_TO_REGCLASS _.FRC:$src2, VR128X)), addr:$src3)>;
 
@@ -7031,7 +7039,7 @@ multiclass avx512_scalar_fma_patterns<SDPatternOperator Op, SDNode MaskedOp,
                           (_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0))),
                           _.FRC:$src3),
                 (_.EltVT ZeroFP)))))),
-              (!cast<I>(Prefix#"213"#Suffix#"Zr_Intkz")
+              (!cast<I>(Prefix#"213"#Suffix#"Zrkz_Int")
                VR128X:$src1, VK1WM:$mask,
                (_.VT (COPY_TO_REGCLASS _.FRC:$src2, VR128X)),
                (_.VT (COPY_TO_REGCLASS _.FRC:$src3, VR128X)))>;
@@ -7041,7 +7049,7 @@ multiclass avx512_scalar_fma_patterns<SDPatternOperator Op, SDNode MaskedOp,
                 (MaskedOp _.FRC:$src2, _.FRC:$src3,
                           (_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0)))),
                 (_.EltVT ZeroFP)))))),
-              (!cast<I>(Prefix#"231"#Suffix#"Zr_Intkz")
+              (!cast<I>(Prefix#"231"#Suffix#"Zrkz_Int")
                VR128X:$src1, VK1WM:$mask,
                (_.VT (COPY_TO_REGCLASS _.FRC:$src2, VR128X)),
                (_.VT (COPY_TO_REGCLASS _.FRC:$src3, VR128X)))>;
@@ -7052,7 +7060,7 @@ multiclass avx512_scalar_fma_patterns<SDPatternOperator Op, SDNode MaskedOp,
                           (_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0))),
                           (_.ScalarLdFrag addr:$src3)),
                 (_.EltVT ZeroFP)))))),
-              (!cast<I>(Prefix#"213"#Suffix#"Zm_Intkz")
+              (!cast<I>(Prefix#"213"#Suffix#"Zmkz_Int")
                VR128X:$src1, VK1WM:$mask,
                (_.VT (COPY_TO_REGCLASS _.FRC:$src2, VR128X)), addr:$src3)>;
 
@@ -7061,7 +7069,7 @@ multiclass avx512_scalar_fma_patterns<SDPatternOperator Op, SDNode MaskedOp,
                 (MaskedOp (_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0))),
                           _.FRC:$src2, (_.ScalarLdFrag addr:$src3)),
                 (_.EltVT ZeroFP)))))),
-              (!cast<I>(Prefix#"132"#Suffix#"Zm_Intkz")
+              (!cast<I>(Prefix#"132"#Suffix#"Zmkz_Int")
                VR128X:$src1, VK1WM:$mask,
                (_.VT (COPY_TO_REGCLASS _.FRC:$src2, VR128X)), addr:$src3)>;
 
@@ -7070,7 +7078,7 @@ multiclass avx512_scalar_fma_patterns<SDPatternOperator Op, SDNode MaskedOp,
                 (MaskedOp _.FRC:$src2, (_.ScalarLdFrag addr:$src3),
                           (_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0)))),
                 (_.EltVT ZeroFP)))))),
-              (!cast<I>(Prefix#"231"#Suffix#"Zm_Intkz")
+              (!cast<I>(Prefix#"231"#Suffix#"Zmkz_Int")
                VR128X:$src1, VK1WM:$mask,
                (_.VT (COPY_TO_REGCLASS _.FRC:$src2, VR128X)), addr:$src3)>;
 
@@ -7097,7 +7105,7 @@ multiclass avx512_scalar_fma_patterns<SDPatternOperator Op, SDNode MaskedOp,
                        (_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0))),
                        _.FRC:$src3, (i32 timm:$rc)),
                 (_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0)))))))),
-              (!cast<I>(Prefix#"213"#Suffix#"Zrb_Intk")
+              (!cast<I>(Prefix#"213"#Suffix#"Zrbk_Int")
                VR128X:$src1, VK1WM:$mask,
                (_.VT (COPY_TO_REGCLASS _.FRC:$src2, VR128X)),
                (_.VT (COPY_TO_REGCLASS _.FRC:$src3, VR128X)), AVX512RC:$rc)>;
@@ -7108,7 +7116,7 @@ multiclass avx512_scalar_fma_patterns<SDPatternOperator Op, SDNode MaskedOp,
                        (_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0))),
                        (i32 timm:$rc)),
                 (_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0)))))))),
-              (!cast<I>(Prefix#"231"#Suffix#"Zrb_Intk")
+              (!cast<I>(Prefix#"231"#Suffix#"Zrbk_Int")
                VR128X:$src1, VK1WM:$mask,
                (_.VT (COPY_TO_REGCLASS _.FRC:$src2, VR128X)),
                (_.VT (COPY_TO_REGCLASS _.FRC:$src3, VR128X)), AVX512RC:$rc)>;
@@ -7119,7 +7127,7 @@ multiclass avx512_scalar_fma_patterns<SDPatternOperator Op, SDNode MaskedOp,
                        (_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0))),
                        _.FRC:$src3, (i32 timm:$rc)),
                 (_.EltVT ZeroFP)))))),
-              (!cast<I>(Prefix#"213"#Suffix#"Zrb_Intkz")
+              (!cast<I>(Prefix#"213"#Suffix#"Zrbkz_Int")
                VR128X:$src1, VK1WM:$mask,
                (_.VT (COPY_TO_REGCLASS _.FRC:$src2, VR128X)),
                (_.VT (COPY_TO_REGCLASS _.FRC:$src3, VR128X)), AVX512RC:$rc)>;
@@ -7130,7 +7138,7 @@ multiclass avx512_scalar_fma_patterns<SDPatternOperator Op, SDNode MaskedOp,
                        (_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0))),
                        (i32 timm:$rc)),
                 (_.EltVT ZeroFP)))))),
-              (!cast<I>(Prefix#"231"#Suffix#"Zrb_Intkz")
+              (!cast<I>(Prefix#"231"#Suffix#"Zrbkz_Int")
                VR128X:$src1, VK1WM:$mask,
                (_.VT (COPY_TO_REGCLASS _.FRC:$src2, VR128X)),
                (_.VT (COPY_TO_REGCLASS _.FRC:$src3, VR128X)), AVX512RC:$rc)>;
@@ -7628,17 +7636,17 @@ let Uses = [MXCSR], mayRaiseFPException = 1 in
 multiclass avx512_cvt_fp_scalar<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
                                 X86VectorVTInfo _Src, SDNode OpNode,
                                 X86FoldableSchedWrite sched> {
-  defm rr_Int : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
+  defm rr : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
                          (ins _.RC:$src1, _Src.RC:$src2), OpcodeStr,
                          "$src2, $src1", "$src1, $src2",
                          (_.VT (OpNode (_.VT _.RC:$src1),
-                                       (_Src.VT _Src.RC:$src2)))>,
+                                       (_Src.VT _Src.RC:$src2))), "_Int">,
                          EVEX, VVVV, VEX_LIG, Sched<[sched]>;
-  defm rm_Int : AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst),
+  defm rm : AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst),
                          (ins _.RC:$src1, _Src.IntScalarMemOp:$src2), OpcodeStr,
                          "$src2, $src1", "$src1, $src2",
                          (_.VT (OpNode (_.VT _.RC:$src1),
-                                  (_Src.ScalarIntMemFrags addr:$src2)))>,
+                                  (_Src.ScalarIntMemFrags addr:$src2))), "_Int">,
                          EVEX, VVVV, VEX_LIG,
                          Sched<[sched.Folded, sched.ReadAfterFold]>;
 
@@ -7660,11 +7668,11 @@ multiclass avx512_cvt_fp_sae_scalar<bits<8> opc, string OpcodeStr, X86VectorVTIn
                                     X86VectorVTInfo _Src, SDNode OpNodeSAE,
                                     X86FoldableSchedWrite sched> {
   let Uses = [MXCSR] in
-  defm rrb_Int : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
+  defm rrb : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
                         (ins _.RC:$src1, _Src.RC:$src2), OpcodeStr,
                         "{sae}, $src2, $src1", "$src1, $src2, {sae}",
                         (_.VT (OpNodeSAE (_.VT _.RC:$src1),
-                                         (_Src.VT _Src.RC:$src2)))>,
+                                         (_Src.VT _Src.RC:$src2))), "_Int">,
                         EVEX, VVVV, VEX_LIG, EVEX_B, Sched<[sched]>;
 }
 
@@ -7673,11 +7681,11 @@ multiclass avx512_cvt_fp_rc_scalar<bits<8> opc, string OpcodeStr, X86VectorVTInf
                                    X86VectorVTInfo _Src, SDNode OpNodeRnd,
                                    X86FoldableSchedWrite sched> {
   let Uses = [MXCSR] in
-  defm rrb_Int : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
+  defm rrb : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
                         (ins _.RC:$src1, _Src.RC:$src2, AVX512RC:$rc), OpcodeStr,
                         "$rc, $src2, $src1", "$src1, $src2, $rc",
                         (_.VT (OpNodeRnd (_.VT _.RC:$src1),
-                                         (_Src.VT _Src.RC:$src2), (i32 timm:$rc)))>,
+                                         (_Src.VT _Src.RC:$src2), (i32 timm:$rc))), "_Int">,
                         EVEX, VVVV, VEX_LIG, Sched<[sched]>,
                         EVEX_B, EVEX_RC;
 }
@@ -9531,25 +9539,25 @@ multiclass avx512_sqrt_packed_all_round<bits<8> opc, string OpcodeStr,
 multiclass avx512_sqrt_scalar<bits<8> opc, string OpcodeStr, X86FoldableSchedWrite sched,
                               X86VectorVTInfo _, string Name, Predicate prd = HasAVX512> {
   let ExeDomain = _.ExeDomain, Predicates = [prd] in {
-    defm r_Int : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
+    defm r : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
                          (ins _.RC:$src1, _.RC:$src2), OpcodeStr,
                          "$src2, $src1", "$src1, $src2",
                          (X86fsqrts (_.VT _.RC:$src1),
-                                    (_.VT _.RC:$src2))>,
+                                    (_.VT _.RC:$src2)), "_Int">,
                          Sched<[sched]>, SIMD_EXC;
-    defm m_Int : AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst),
+    defm m : AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst),
                          (ins _.RC:$src1, _.IntScalarMemOp:$src2), OpcodeStr,
                          "$src2, $src1", "$src1, $src2",
                          (X86fsqrts (_.VT _.RC:$src1),
-                                    (_.ScalarIntMemFrags addr:$src2))>,
+                                    (_.ScalarIntMemFrags addr:$src2)), "_Int">,
                          Sched<[sched.Folded, sched.ReadAfterFold]>, SIMD_EXC;
     let Uses = [MXCSR] in
-    defm rb_Int : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
+    defm rb : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
                          (ins _.RC:$src1, _.RC:$src2, AVX512RC:$rc), OpcodeStr,
                          "$rc, $src2, $src1", "$src1, $src2, $rc",
                          (X86fsqrtRnds (_.VT _.RC:$src1),
                                      (_.VT _.RC:$src2),
-                                     (i32 timm:$rc))>,
+                                     (i32 timm:$rc)), "_Int">,
                          EVEX_B, EVEX_RC, Sched<[sched]>;
 
     let isCodeGenOnly = 1, hasSideEffects = 0 in {
@@ -9596,27 +9604,27 @@ defm VSQRT : avx512_sqrt_scalar_all<0x51, "vsqrt", SchedWriteFSqrtSizes>, VEX_LI
 multiclass avx512_rndscale_scalar<bits<8> opc, string OpcodeStr,
                                   X86FoldableSchedWrite sched, X86VectorVTInfo _> {
   let ExeDomain = _.ExeDomain in {
-  defm rri_Int : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
+  defm rri : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
                            (ins _.RC:$src1, _.RC:$src2, i32u8imm:$src3), OpcodeStr,
                            "$src3, $src2, $src1", "$src1, $src2, $src3",
                            (_.VT (X86RndScales (_.VT _.RC:$src1), (_.VT _.RC:$src2),
-                           (i32 timm:$src3)))>,
+                           (i32 timm:$src3))), "_Int">,
                            Sched<[sched]>, SIMD_EXC;
 
   let Uses = [MXCSR] in
-  defm rrib_Int : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
+  defm rrib : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
                          (ins _.RC:$src1, _.RC:$src2, i32u8imm:$src3), OpcodeStr,
                          "$src3, {sae}, $src2, $src1", "$src1, $src2, {sae}, $src3",
                          (_.VT (X86RndScalesSAE (_.VT _.RC:$src1), (_.VT _.RC:$src2),
-                         (i32 timm:$src3)))>, EVEX_B,
+                         (i32 timm:$src3))), "_Int">, EVEX_B,
                          Sched<[sched]>;
 
-  defm rmi_Int : AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst),
+  defm rmi : AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst),
                          (ins _.RC:$src1, _.IntScalarMemOp:$src2, i32u8imm:$src3),
                          OpcodeStr,
                          "$src3, $src2, $src1", "$src1, $src2, $src3",
                          (_.VT (X86RndScales _.RC:$src1,
-                                (_.ScalarIntMemFrags addr:$src2), (i32 timm:$src3)))>,
+                                (_.ScalarIntMemFrags addr:$src2), (i32 timm:$src3))), "_Int">,
                          Sched<[sched.Folded, sched.ReadAfterFold]>, SIMD_EXC;
 
   let isCodeGenOnly = 1, hasSideEffects = 0, Predicates = [HasAVX512] in {
@@ -9669,13 +9677,13 @@ multiclass avx512_masked_scalar<SDNode OpNode, string OpcPrefix, SDNode Move,
     def : Pat<(Move _.VT:$src1, (scalar_to_vector (X86selects_mask Mask,
                (OpNode (extractelt _.VT:$src2, (iPTR 0))),
                (extractelt _.VT:$dst, (iPTR 0))))),
-              (!cast<Instruction>("V"#OpcPrefix#r_Intk)
+              (!cast<Instruction>("V"#OpcPrefix#rk_Int)
                _.VT:$dst, OutMask, _.VT:$src2, _.VT:$src1)>;
 
     def : Pat<(Move _.VT:$src1, (scalar_to_vector (X86selects_mask Mask,
                (OpNode (extractelt _.VT:$src2, (iPTR 0))),
                ZeroFP))),
-              (!cast<Instruction>("V"#OpcPrefix#r_Intkz)
+              (!cast<Instruction>("V"#OpcPrefix#rkz_Int)
                OutMask, _.VT:$src2, _.VT:$src1)>;
   }
 }
@@ -12174,7 +12182,7 @@ multiclass AVX512_scalar_math_fp_patterns<SDPatternOperator Op, SDNode MaskedOp,
                                        (extractelt (_.VT VR128X:$src1), (iPTR 0))),
                                       _.FRC:$src2),
                             _.FRC:$src0))),
-              (!cast<Instruction>("V"#OpcPrefix#"Zrr_Intk")
+              (!cast<Instruction>("V"#OpcPrefix#"Zrrk_Int")
                (_.VT (COPY_TO_REGCLASS _.FRC:$src0, VR128X)),
                VK1WM:$mask, _.VT:$src1,
                (_.VT (COPY_TO_REGCLASS _.FRC:$src2, VR128X)))>;
@@ -12185,7 +12193,7 @@ multiclass AVX512_scalar_math_fp_patterns<SDPatternOperator Op, SDNode MaskedOp,
                                        (extractelt (_.VT VR128X:$src1), (iPTR 0))),
                                       (_.ScalarLdFrag addr:$src2)),
                             _.FRC:$src0))),
-              (!cast<Instruction>("V"#OpcPrefix#"Zrm_Intk")
+              (!cast<Instruction>("V"#OpcPrefix#"Zrmk_Int")
                (_.VT (COPY_TO_REGCLASS _.FRC:$src0, VR128X)),
                VK1WM:$mask, _.VT:$src1, addr:$src2)>;
 
@@ -12196,7 +12204,7 @@ multiclass AVX512_scalar_math_fp_patterns<SDPatternOperator Op, SDNode MaskedOp,
                             (MaskedOp (_.EltVT
                                        (extractelt (_.VT VR128X:$src1), (iPTR 0))),
                                       _.FRC:$src2), (_.EltVT ZeroFP)))),
-      (!cast<I>("V"#OpcPrefix#"Zrr_Intkz")
+      (!cast<I>("V"#OpcPrefix#"Zrrkz_Int")
           VK1WM:$mask, _.VT:$src1,
           (_.VT (COPY_TO_REGCLASS _.FRC:$src2, VR128X)))>;
     def : Pat<(MoveNode (_.VT VR128X:$src1),
@@ -12205,7 +12213,7 @@ multiclass AVX512_scalar_math_fp_patterns<SDPatternOperator Op, SDNode MaskedOp,
                             (MaskedOp (_.EltVT
                                        (extractelt (_.VT VR128X:$src1), (iPTR 0))),
                                       (_.ScalarLdFrag addr:$src2)), (_.EltVT ZeroFP)))),
-      (!cast<I>("V"#OpcPrefix#"Zrm_Intkz") VK1WM:$mask, _.VT:$src1, addr:$src2)>;
+      (!cast<I>("V"#OpcPrefix#"Zrmkz_Int") VK1WM:$mask, _.VT:$src1, addr:$src2)>;
   }
 }
 
diff --git a/llvm/lib/Target/X86/X86InstrFMA3Info.cpp b/llvm/lib/Target/X86/X86InstrFMA3Info.cpp
index 090ec68..0da4857 100644
--- a/llvm/lib/Target/X86/X86InstrFMA3Info.cpp
+++ b/llvm/lib/Target/X86/X86InstrFMA3Info.cpp
@@ -27,6 +27,11 @@ using namespace llvm;
   FMA3GROUP(Name, Suf##k, Attrs | X86InstrFMA3Group::KMergeMasked) \
   FMA3GROUP(Name, Suf##kz, Attrs | X86InstrFMA3Group::KZeroMasked)
 
+#define FMA3GROUP_MASKED_INT(Name, Suf, Attrs) \
+  FMA3GROUP(Name, Suf##_Int, Attrs) \
+  FMA3GROUP(Name, Suf##k_Int, Attrs | X86InstrFMA3Group::KMergeMasked) \
+  FMA3GROUP(Name, Suf##kz_Int, Attrs | X86InstrFMA3Group::KZeroMasked)
+
 #define FMA3GROUP_PACKED_WIDTHS_Z(Name, Suf, Attrs) \
   FMA3GROUP_MASKED(Name, Suf##Z128m, Attrs) \
   FMA3GROUP_MASKED(Name, Suf##Z128r, Attrs) \
@@ -52,9 +57,9 @@ using namespace llvm;
 
 #define FMA3GROUP_SCALAR_WIDTHS_Z(Name, Suf, Attrs) \
   FMA3GROUP(Name, Suf##Zm, Attrs) \
-  FMA3GROUP_MASKED(Name, Suf##Zm_Int, Attrs | X86InstrFMA3Group::Intrinsic) \
+  FMA3GROUP_MASKED_INT(Name, Suf##Zm, Attrs | X86InstrFMA3Group::Intrinsic) \
   FMA3GROUP(Name, Suf##Zr, Attrs) \
-  FMA3GROUP_MASKED(Name, Suf##Zr_Int, Attrs | X86InstrFMA3Group::Intrinsic) \
+  FMA3GROUP_MASKED_INT(Name, Suf##Zr, Attrs | X86InstrFMA3Group::Intrinsic) \
 
 #define FMA3GROUP_SCALAR_WIDTHS_ALL(Name, Suf, Attrs) \
   FMA3GROUP_SCALAR_WIDTHS_Z(Name, Suf, Attrs) \
@@ -108,11 +113,11 @@ static const X86InstrFMA3Group Groups[] = {
 
 #define FMA3GROUP_SCALAR_AVX512_ROUND(Name, Suf, Attrs) \
   FMA3GROUP(Name, SDZ##Suf, Attrs) \
-  FMA3GROUP_MASKED(Name, SDZ##Suf##_Int, Attrs) \
+  FMA3GROUP_MASKED_INT(Name, SDZ##Suf, Attrs) \
   FMA3GROUP(Name, SHZ##Suf, Attrs) \
-  FMA3GROUP_MASKED(Name, SHZ##Suf##_Int, Attrs) \
+  FMA3GROUP_MASKED_INT(Name, SHZ##Suf, Attrs) \
   FMA3GROUP(Name, SSZ##Suf, Attrs) \
-  FMA3GROUP_MASKED(Name, SSZ##Suf##_Int, Attrs)
+  FMA3GROUP_MASKED_INT(Name, SSZ##Suf, Attrs)
 
 static const X86InstrFMA3Group BroadcastGroups[] = {
   FMA3GROUP_PACKED_AVX512_ALL(VFMADD, mb, 0)
diff --git a/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td b/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td
index f6231b7..af0267a 100644
--- a/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td
+++ b/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td
@@ -390,6 +390,13 @@ def SDTFmaRound : SDTypeProfile<1, 4, [SDTCisSameAs<0,1>,
                            SDTCisSameAs<1,2>, SDTCisSameAs<1,3>,
                            SDTCisFP<0>, SDTCisVT<4, i32>]>;
 
+def SDTFPToxIntSatOp
+    : SDTypeProfile<1,
+                    1, [SDTCisVec<0>, SDTCisVec<1>, SDTCisInt<0>, SDTCisFP<1>]>;
+
+def X86fp2sisat : SDNode<"X86ISD::FP_TO_SINT_SAT", SDTFPToxIntSatOp>;
+def X86fp2uisat : SDNode<"X86ISD::FP_TO_UINT_SAT", SDTFPToxIntSatOp>;
+
 def X86PAlignr : SDNode<"X86ISD::PALIGNR",
                         SDTypeProfile<1, 3, [SDTCVecEltisVT<0, i8>,
                                              SDTCisSameAs<0,1>,
diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp
index 5a6ea11..30a5161 100644
--- a/llvm/lib/Target/X86/X86InstrInfo.cpp
+++ b/llvm/lib/Target/X86/X86InstrInfo.cpp
@@ -7646,8 +7646,8 @@ static bool isNonFoldablePartialRegisterLoad(const MachineInstr &LoadMI,
     case X86::CVTSS2SDrr_Int:
     case X86::VCVTSS2SDrr_Int:
     case X86::VCVTSS2SDZrr_Int:
-    case X86::VCVTSS2SDZrr_Intk:
-    case X86::VCVTSS2SDZrr_Intkz:
+    case X86::VCVTSS2SDZrrk_Int:
+    case X86::VCVTSS2SDZrrkz_Int:
     case X86::CVTSS2SIrr_Int:
     case X86::CVTSS2SI64rr_Int:
     case X86::VCVTSS2SIrr_Int:
@@ -7700,21 +7700,21 @@ static bool isNonFoldablePartialRegisterLoad(const MachineInstr &LoadMI,
     case X86::SUBSSrr_Int:
     case X86::VSUBSSrr_Int:
     case X86::VSUBSSZrr_Int:
-    case X86::VADDSSZrr_Intk:
-    case X86::VADDSSZrr_Intkz:
-    case X86::VCMPSSZrri_Intk:
-    case X86::VDIVSSZrr_Intk:
-    case X86::VDIVSSZrr_Intkz:
-    case X86::VMAXSSZrr_Intk:
-    case X86::VMAXSSZrr_Intkz:
-    case X86::VMINSSZrr_Intk:
-    case X86::VMINSSZrr_Intkz:
-    case X86::VMULSSZrr_Intk:
-    case X86::VMULSSZrr_Intkz:
-    case X86::VSQRTSSZr_Intk:
-    case X86::VSQRTSSZr_Intkz:
-    case X86::VSUBSSZrr_Intk:
-    case X86::VSUBSSZrr_Intkz:
+    case X86::VADDSSZrrk_Int:
+    case X86::VADDSSZrrkz_Int:
+    case X86::VCMPSSZrrik_Int:
+    case X86::VDIVSSZrrk_Int:
+    case X86::VDIVSSZrrkz_Int:
+    case X86::VMAXSSZrrk_Int:
+    case X86::VMAXSSZrrkz_Int:
+    case X86::VMINSSZrrk_Int:
+    case X86::VMINSSZrrkz_Int:
+    case X86::VMULSSZrrk_Int:
+    case X86::VMULSSZrrkz_Int:
+    case X86::VSQRTSSZrk_Int:
+    case X86::VSQRTSSZrkz_Int:
+    case X86::VSUBSSZrrk_Int:
+    case X86::VSUBSSZrrkz_Int:
     case X86::VFMADDSS4rr_Int:
     case X86::VFNMADDSS4rr_Int:
     case X86::VFMSUBSS4rr_Int:
@@ -7743,30 +7743,30 @@ static bool isNonFoldablePartialRegisterLoad(const MachineInstr &LoadMI,
     case X86::VFNMSUB213SSZr_Int:
     case X86::VFMSUB231SSZr_Int:
     case X86::VFNMSUB231SSZr_Int:
-    case X86::VFMADD132SSZr_Intk:
-    case X86::VFNMADD132SSZr_Intk:
-    case X86::VFMADD213SSZr_Intk:
-    case X86::VFNMADD213SSZr_Intk:
-    case X86::VFMADD231SSZr_Intk:
-    case X86::VFNMADD231SSZr_Intk:
-    case X86::VFMSUB132SSZr_Intk:
-    case X86::VFNMSUB132SSZr_Intk:
-    case X86::VFMSUB213SSZr_Intk:
-    case X86::VFNMSUB213SSZr_Intk:
-    case X86::VFMSUB231SSZr_Intk:
-    case X86::VFNMSUB231SSZr_Intk:
-    case X86::VFMADD132SSZr_Intkz:
-    case X86::VFNMADD132SSZr_Intkz:
-    case X86::VFMADD213SSZr_Intkz:
-    case X86::VFNMADD213SSZr_Intkz:
-    case X86::VFMADD231SSZr_Intkz:
-    case X86::VFNMADD231SSZr_Intkz:
-    case X86::VFMSUB132SSZr_Intkz:
-    case X86::VFNMSUB132SSZr_Intkz:
-    case X86::VFMSUB213SSZr_Intkz:
-    case X86::VFNMSUB213SSZr_Intkz:
-    case X86::VFMSUB231SSZr_Intkz:
-    case X86::VFNMSUB231SSZr_Intkz:
+    case X86::VFMADD132SSZrk_Int:
+    case X86::VFNMADD132SSZrk_Int:
+    case X86::VFMADD213SSZrk_Int:
+    case X86::VFNMADD213SSZrk_Int:
+    case X86::VFMADD231SSZrk_Int:
+    case X86::VFNMADD231SSZrk_Int:
+    case X86::VFMSUB132SSZrk_Int:
+    case X86::VFNMSUB132SSZrk_Int:
+    case X86::VFMSUB213SSZrk_Int:
+    case X86::VFNMSUB213SSZrk_Int:
+    case X86::VFMSUB231SSZrk_Int:
+    case X86::VFNMSUB231SSZrk_Int:
+    case X86::VFMADD132SSZrkz_Int:
+    case X86::VFNMADD132SSZrkz_Int:
+    case X86::VFMADD213SSZrkz_Int:
+    case X86::VFNMADD213SSZrkz_Int:
+    case X86::VFMADD231SSZrkz_Int:
+    case X86::VFNMADD231SSZrkz_Int:
+    case X86::VFMSUB132SSZrkz_Int:
+    case X86::VFNMSUB132SSZrkz_Int:
+    case X86::VFMSUB213SSZrkz_Int:
+    case X86::VFNMSUB213SSZrkz_Int:
+    case X86::VFMSUB231SSZrkz_Int:
+    case X86::VFNMSUB231SSZrkz_Int:
     case X86::VFIXUPIMMSSZrri:
     case X86::VFIXUPIMMSSZrrik:
     case X86::VFIXUPIMMSSZrrikz:
@@ -7791,8 +7791,8 @@ static bool isNonFoldablePartialRegisterLoad(const MachineInstr &LoadMI,
     case X86::VREDUCESSZrrik:
     case X86::VREDUCESSZrrikz:
     case X86::VRNDSCALESSZrri_Int:
-    case X86::VRNDSCALESSZrri_Intk:
-    case X86::VRNDSCALESSZrri_Intkz:
+    case X86::VRNDSCALESSZrrik_Int:
+    case X86::VRNDSCALESSZrrikz_Int:
     case X86::VRSQRT14SSZrr:
     case X86::VRSQRT14SSZrrk:
     case X86::VRSQRT14SSZrrkz:
@@ -7819,8 +7819,8 @@ static bool isNonFoldablePartialRegisterLoad(const MachineInstr &LoadMI,
     case X86::CVTSD2SSrr_Int:
     case X86::VCVTSD2SSrr_Int:
     case X86::VCVTSD2SSZrr_Int:
-    case X86::VCVTSD2SSZrr_Intk:
-    case X86::VCVTSD2SSZrr_Intkz:
+    case X86::VCVTSD2SSZrrk_Int:
+    case X86::VCVTSD2SSZrrkz_Int:
     case X86::CVTSD2SIrr_Int:
     case X86::CVTSD2SI64rr_Int:
     case X86::VCVTSD2SIrr_Int:
@@ -7869,21 +7869,21 @@ static bool isNonFoldablePartialRegisterLoad(const MachineInstr &LoadMI,
     case X86::SUBSDrr_Int:
     case X86::VSUBSDrr_Int:
     case X86::VSUBSDZrr_Int:
-    case X86::VADDSDZrr_Intk:
-    case X86::VADDSDZrr_Intkz:
-    case X86::VCMPSDZrri_Intk:
-    case X86::VDIVSDZrr_Intk:
-    case X86::VDIVSDZrr_Intkz:
-    case X86::VMAXSDZrr_Intk:
-    case X86::VMAXSDZrr_Intkz:
-    case X86::VMINSDZrr_Intk:
-    case X86::VMINSDZrr_Intkz:
-    case X86::VMULSDZrr_Intk:
-    case X86::VMULSDZrr_Intkz:
-    case X86::VSQRTSDZr_Intk:
-    case X86::VSQRTSDZr_Intkz:
-    case X86::VSUBSDZrr_Intk:
-    case X86::VSUBSDZrr_Intkz:
+    case X86::VADDSDZrrk_Int:
+    case X86::VADDSDZrrkz_Int:
+    case X86::VCMPSDZrrik_Int:
+    case X86::VDIVSDZrrk_Int:
+    case X86::VDIVSDZrrkz_Int:
+    case X86::VMAXSDZrrk_Int:
+    case X86::VMAXSDZrrkz_Int:
+    case X86::VMINSDZrrk_Int:
+    case X86::VMINSDZrrkz_Int:
+    case X86::VMULSDZrrk_Int:
+    case X86::VMULSDZrrkz_Int:
+    case X86::VSQRTSDZrk_Int:
+    case X86::VSQRTSDZrkz_Int:
+    case X86::VSUBSDZrrk_Int:
+    case X86::VSUBSDZrrkz_Int:
     case X86::VFMADDSD4rr_Int:
     case X86::VFNMADDSD4rr_Int:
     case X86::VFMSUBSD4rr_Int:
@@ -7912,30 +7912,30 @@ static bool isNonFoldablePartialRegisterLoad(const MachineInstr &LoadMI,
     case X86::VFNMSUB213SDZr_Int:
     case X86::VFMSUB231SDZr_Int:
     case X86::VFNMSUB231SDZr_Int:
-    case X86::VFMADD132SDZr_Intk:
-    case X86::VFNMADD132SDZr_Intk:
-    case X86::VFMADD213SDZr_Intk:
-    case X86::VFNMADD213SDZr_Intk:
-    case X86::VFMADD231SDZr_Intk:
-    case X86::VFNMADD231SDZr_Intk:
-    case X86::VFMSUB132SDZr_Intk:
-    case X86::VFNMSUB132SDZr_Intk:
-    case X86::VFMSUB213SDZr_Intk:
-    case X86::VFNMSUB213SDZr_Intk:
-    case X86::VFMSUB231SDZr_Intk:
-    case X86::VFNMSUB231SDZr_Intk:
-    case X86::VFMADD132SDZr_Intkz:
-    case X86::VFNMADD132SDZr_Intkz:
-    case X86::VFMADD213SDZr_Intkz:
-    case X86::VFNMADD213SDZr_Intkz:
-    case X86::VFMADD231SDZr_Intkz:
-    case X86::VFNMADD231SDZr_Intkz:
-    case X86::VFMSUB132SDZr_Intkz:
-    case X86::VFNMSUB132SDZr_Intkz:
-    case X86::VFMSUB213SDZr_Intkz:
-    case X86::VFNMSUB213SDZr_Intkz:
-    case X86::VFMSUB231SDZr_Intkz:
-    case X86::VFNMSUB231SDZr_Intkz:
+    case X86::VFMADD132SDZrk_Int:
+    case X86::VFNMADD132SDZrk_Int:
+    case X86::VFMADD213SDZrk_Int:
+    case X86::VFNMADD213SDZrk_Int:
+    case X86::VFMADD231SDZrk_Int:
+    case X86::VFNMADD231SDZrk_Int:
+    case X86::VFMSUB132SDZrk_Int:
+    case X86::VFNMSUB132SDZrk_Int:
+    case X86::VFMSUB213SDZrk_Int:
+    case X86::VFNMSUB213SDZrk_Int:
+    case X86::VFMSUB231SDZrk_Int:
+    case X86::VFNMSUB231SDZrk_Int:
+    case X86::VFMADD132SDZrkz_Int:
+    case X86::VFNMADD132SDZrkz_Int:
+    case X86::VFMADD213SDZrkz_Int:
+    case X86::VFNMADD213SDZrkz_Int:
+    case X86::VFMADD231SDZrkz_Int:
+    case X86::VFNMADD231SDZrkz_Int:
+    case X86::VFMSUB132SDZrkz_Int:
+    case X86::VFNMSUB132SDZrkz_Int:
+    case X86::VFMSUB213SDZrkz_Int:
+    case X86::VFNMSUB213SDZrkz_Int:
+    case X86::VFMSUB231SDZrkz_Int:
+    case X86::VFNMSUB231SDZrkz_Int:
     case X86::VFIXUPIMMSDZrri:
     case X86::VFIXUPIMMSDZrrik:
     case X86::VFIXUPIMMSDZrrikz:
@@ -7960,8 +7960,8 @@ static bool isNonFoldablePartialRegisterLoad(const MachineInstr &LoadMI,
     case X86::VREDUCESDZrrik:
     case X86::VREDUCESDZrrikz:
     case X86::VRNDSCALESDZrri_Int:
-    case X86::VRNDSCALESDZrri_Intk:
-    case X86::VRNDSCALESDZrri_Intkz:
+    case X86::VRNDSCALESDZrrik_Int:
+    case X86::VRNDSCALESDZrrikz_Int:
     case X86::VRSQRT14SDZrr:
     case X86::VRSQRT14SDZrrk:
     case X86::VRSQRT14SDZrrkz:
@@ -7989,19 +7989,19 @@ static bool isNonFoldablePartialRegisterLoad(const MachineInstr &LoadMI,
     case X86::VMINSHZrr_Int:
     case X86::VMULSHZrr_Int:
     case X86::VSUBSHZrr_Int:
-    case X86::VADDSHZrr_Intk:
-    case X86::VADDSHZrr_Intkz:
-    case X86::VCMPSHZrri_Intk:
-    case X86::VDIVSHZrr_Intk:
-    case X86::VDIVSHZrr_Intkz:
-    case X86::VMAXSHZrr_Intk:
-    case X86::VMAXSHZrr_Intkz:
-    case X86::VMINSHZrr_Intk:
-    case X86::VMINSHZrr_Intkz:
-    case X86::VMULSHZrr_Intk:
-    case X86::VMULSHZrr_Intkz:
-    case X86::VSUBSHZrr_Intk:
-    case X86::VSUBSHZrr_Intkz:
+    case X86::VADDSHZrrk_Int:
+    case X86::VADDSHZrrkz_Int:
+    case X86::VCMPSHZrrik_Int:
+    case X86::VDIVSHZrrk_Int:
+    case X86::VDIVSHZrrkz_Int:
+    case X86::VMAXSHZrrk_Int:
+    case X86::VMAXSHZrrkz_Int:
+    case X86::VMINSHZrrk_Int:
+    case X86::VMINSHZrrkz_Int:
+    case X86::VMULSHZrrk_Int:
+    case X86::VMULSHZrrkz_Int:
+    case X86::VSUBSHZrrk_Int:
+    case X86::VSUBSHZrrkz_Int:
     case X86::VFMADD132SHZr_Int:
     case X86::VFNMADD132SHZr_Int:
     case X86::VFMADD213SHZr_Int:
@@ -8014,30 +8014,30 @@ static bool isNonFoldablePartialRegisterLoad(const MachineInstr &LoadMI,
     case X86::VFNMSUB213SHZr_Int:
     case X86::VFMSUB231SHZr_Int:
     case X86::VFNMSUB231SHZr_Int:
-    case X86::VFMADD132SHZr_Intk:
-    case X86::VFNMADD132SHZr_Intk:
-    case X86::VFMADD213SHZr_Intk:
-    case X86::VFNMADD213SHZr_Intk:
-    case X86::VFMADD231SHZr_Intk:
-    case X86::VFNMADD231SHZr_Intk:
-    case X86::VFMSUB132SHZr_Intk:
-    case X86::VFNMSUB132SHZr_Intk:
-    case X86::VFMSUB213SHZr_Intk:
-    case X86::VFNMSUB213SHZr_Intk:
-    case X86::VFMSUB231SHZr_Intk:
-    case X86::VFNMSUB231SHZr_Intk:
-    case X86::VFMADD132SHZr_Intkz:
-    case X86::VFNMADD132SHZr_Intkz:
-    case X86::VFMADD213SHZr_Intkz:
-    case X86::VFNMADD213SHZr_Intkz:
-    case X86::VFMADD231SHZr_Intkz:
-    case X86::VFNMADD231SHZr_Intkz:
-    case X86::VFMSUB132SHZr_Intkz:
-    case X86::VFNMSUB132SHZr_Intkz:
-    case X86::VFMSUB213SHZr_Intkz:
-    case X86::VFNMSUB213SHZr_Intkz:
-    case X86::VFMSUB231SHZr_Intkz:
-    case X86::VFNMSUB231SHZr_Intkz:
+    case X86::VFMADD132SHZrk_Int:
+    case X86::VFNMADD132SHZrk_Int:
+    case X86::VFMADD213SHZrk_Int:
+    case X86::VFNMADD213SHZrk_Int:
+    case X86::VFMADD231SHZrk_Int:
+    case X86::VFNMADD231SHZrk_Int:
+    case X86::VFMSUB132SHZrk_Int:
+    case X86::VFNMSUB132SHZrk_Int:
+    case X86::VFMSUB213SHZrk_Int:
+    case X86::VFNMSUB213SHZrk_Int:
+    case X86::VFMSUB231SHZrk_Int:
+    case X86::VFNMSUB231SHZrk_Int:
+    case X86::VFMADD132SHZrkz_Int:
+    case X86::VFNMADD132SHZrkz_Int:
+    case X86::VFMADD213SHZrkz_Int:
+    case X86::VFNMADD213SHZrkz_Int:
+    case X86::VFMADD231SHZrkz_Int:
+    case X86::VFNMADD231SHZrkz_Int:
+    case X86::VFMSUB132SHZrkz_Int:
+    case X86::VFNMSUB132SHZrkz_Int:
+    case X86::VFMSUB213SHZrkz_Int:
+    case X86::VFNMSUB213SHZrkz_Int:
+    case X86::VFMSUB231SHZrkz_Int:
+    case X86::VFNMSUB231SHZrkz_Int:
       return false;
     default:
       return true;
@@ -9489,25 +9489,25 @@ bool X86InstrInfo::isHighLatencyDef(int opc) const {
   case X86::VDIVSDZrm:
   case X86::VDIVSDZrr:
   case X86::VDIVSDZrm_Int:
-  case X86::VDIVSDZrm_Intk:
-  case X86::VDIVSDZrm_Intkz:
+  case X86::VDIVSDZrmk_Int:
+  case X86::VDIVSDZrmkz_Int:
   case X86::VDIVSDZrr_Int:
-  case X86::VDIVSDZrr_Intk:
-  case X86::VDIVSDZrr_Intkz:
+  case X86::VDIVSDZrrk_Int:
+  case X86::VDIVSDZrrkz_Int:
   case X86::VDIVSDZrrb_Int:
-  case X86::VDIVSDZrrb_Intk:
-  case X86::VDIVSDZrrb_Intkz:
+  case X86::VDIVSDZrrbk_Int:
+  case X86::VDIVSDZrrbkz_Int:
   case X86::VDIVSSZrm:
   case X86::VDIVSSZrr:
   case X86::VDIVSSZrm_Int:
-  case X86::VDIVSSZrm_Intk:
-  case X86::VDIVSSZrm_Intkz:
+  case X86::VDIVSSZrmk_Int:
+  case X86::VDIVSSZrmkz_Int:
   case X86::VDIVSSZrr_Int:
-  case X86::VDIVSSZrr_Intk:
-  case X86::VDIVSSZrr_Intkz:
+  case X86::VDIVSSZrrk_Int:
+  case X86::VDIVSSZrrkz_Int:
   case X86::VDIVSSZrrb_Int:
-  case X86::VDIVSSZrrb_Intk:
-  case X86::VDIVSSZrrb_Intkz:
+  case X86::VDIVSSZrrbk_Int:
+  case X86::VDIVSSZrrbkz_Int:
   case X86::VSQRTPDZ128m:
   case X86::VSQRTPDZ128mb:
   case X86::VSQRTPDZ128mbk:
@@ -9570,26 +9570,26 @@ bool X86InstrInfo::isHighLatencyDef(int opc) const {
   case X86::VSQRTPSZrkz:
   case X86::VSQRTSDZm:
   case X86::VSQRTSDZm_Int:
-  case X86::VSQRTSDZm_Intk:
-  case X86::VSQRTSDZm_Intkz:
+  case X86::VSQRTSDZmk_Int:
+  case X86::VSQRTSDZmkz_Int:
   case X86::VSQRTSDZr:
   case X86::VSQRTSDZr_Int:
-  case X86::VSQRTSDZr_Intk:
-  case X86::VSQRTSDZr_Intkz:
+  case X86::VSQRTSDZrk_Int:
+  case X86::VSQRTSDZrkz_Int:
   case X86::VSQRTSDZrb_Int:
-  case X86::VSQRTSDZrb_Intk:
-  case X86::VSQRTSDZrb_Intkz:
+  case X86::VSQRTSDZrbk_Int:
+  case X86::VSQRTSDZrbkz_Int:
   case X86::VSQRTSSZm:
   case X86::VSQRTSSZm_Int:
-  case X86::VSQRTSSZm_Intk:
-  case X86::VSQRTSSZm_Intkz:
+  case X86::VSQRTSSZmk_Int:
+  case X86::VSQRTSSZmkz_Int:
   case X86::VSQRTSSZr:
   case X86::VSQRTSSZr_Int:
-  case X86::VSQRTSSZr_Intk:
-  case X86::VSQRTSSZr_Intkz:
+  case X86::VSQRTSSZrk_Int:
+  case X86::VSQRTSSZrkz_Int:
   case X86::VSQRTSSZrb_Int:
-  case X86::VSQRTSSZrb_Intk:
-  case X86::VSQRTSSZrb_Intkz:
+  case X86::VSQRTSSZrbk_Int:
+  case X86::VSQRTSSZrbkz_Int:
 
   case X86::VGATHERDPDYrm:
   case X86::VGATHERDPDZ128rm:
diff --git a/llvm/lib/Target/X86/X86LoadValueInjectionRetHardening.cpp b/llvm/lib/Target/X86/X86LoadValueInjectionRetHardening.cpp
index 3b370d8..64728a2 100644
--- a/llvm/lib/Target/X86/X86LoadValueInjectionRetHardening.cpp
+++ b/llvm/lib/Target/X86/X86LoadValueInjectionRetHardening.cpp
@@ -57,8 +57,6 @@ char X86LoadValueInjectionRetHardeningPass::ID = 0;
 
 bool X86LoadValueInjectionRetHardeningPass::runOnMachineFunction(
     MachineFunction &MF) {
-  LLVM_DEBUG(dbgs() << "***** " << getPassName() << " : " << MF.getName()
-                    << " *****\n");
   const X86Subtarget *Subtarget = &MF.getSubtarget<X86Subtarget>();
   if (!Subtarget->useLVIControlFlowIntegrity() || !Subtarget->is64Bit())
     return false; // FIXME: support 32-bit
@@ -68,6 +66,8 @@ bool X86LoadValueInjectionRetHardeningPass::runOnMachineFunction(
   if (!F.hasOptNone() && skipFunction(F))
     return false;
 
+  LLVM_DEBUG(dbgs() << "***** " << getPassName() << " : " << MF.getName()
+                    << " *****\n");
   ++NumFunctionsConsidered;
   const X86RegisterInfo *TRI = Subtarget->getRegisterInfo();
   const X86InstrInfo *TII = Subtarget->getInstrInfo();
diff --git a/llvm/lib/Target/X86/X86SchedSapphireRapids.td b/llvm/lib/Target/X86/X86SchedSapphireRapids.td
index e04ff68..4f0d366 100644
--- a/llvm/lib/Target/X86/X86SchedSapphireRapids.td
+++ b/llvm/lib/Target/X86/X86SchedSapphireRapids.td
@@ -669,7 +669,7 @@ def : InstRW<[SPRWriteResGroup12], (instregex "^ADD_F(P?)rST0$",
                                               "^VALIGN(D|Q)Z256rri((k|kz)?)$",
                                               "^VCMPP(D|H|S)Z(128|256)rri(k?)$",
                                               "^VCMPS(D|H|S)Zrri$",
-                                              "^VCMPS(D|H|S)Zrr(b?)i_Int(k?)$",
+                                              "^VCMPS(D|H|S)Zrr(b?)i(k?)_Int$",
                                               "^VFPCLASSP(D|H|S)Z(128|256)ri(k?)$",
                                               "^VFPCLASSS(D|H|S)Zri(k?)$",
                                               "^VPACK(S|U)S(DW|WB)Yrr$",
@@ -977,7 +977,7 @@ def SPRWriteResGroup49 : SchedWriteRes<[SPRPort00, SPRPort02_03_10]> {
   let NumMicroOps = 2;
 }
 def : InstRW<[SPRWriteResGroup49], (instregex "^DIV_F(32|64)m$")>;
-def : InstRW<[SPRWriteResGroup49, ReadAfterVecLd], (instregex "^VSQRTSHZm_Int((k|kz)?)$")>;
+def : InstRW<[SPRWriteResGroup49, ReadAfterVecLd], (instregex "^VSQRTSHZm((k|kz)?)_Int$")>;
 def : InstRW<[SPRWriteResGroup49, ReadAfterVecLd], (instrs VSQRTSHZm)>;
 
 def SPRWriteResGroup50 : SchedWriteRes<[SPRPort00, SPRPort02_03_10, SPRPort05]> {
@@ -1166,11 +1166,11 @@ def : InstRW<[SPRWriteResGroup73, ReadAfterVecXLd], (instregex "^(V?)GF2P8AFFINE
 def : InstRW<[SPRWriteResGroup73, ReadAfterVecXLd], (instrs VGETEXPPHZ128mbkz,
                                                             VGF2P8MULBZ128rm)>;
 def : InstRW<[SPRWriteResGroup73, ReadAfterVecLd], (instregex "^V(ADD|SUB)SHZrm$",
-                                                              "^V(ADD|SUB)SHZrm_Int((k|kz)?)$",
+                                                              "^V(ADD|SUB)SHZrm((k|kz)?)_Int$",
                                                               "^VCVTSH2SSZrm((_Int)?)$",
                                                               "^VM(AX|IN)CSHZrm$",
                                                               "^VM(AX|IN|UL)SHZrm$",
-                                                              "^VM(AX|IN|UL)SHZrm_Int((k|kz)?)$")>;
+                                                              "^VM(AX|IN|UL)SHZrm((k|kz)?)_Int$")>;
 def : InstRW<[SPRWriteResGroup73, ReadAfterVecYLd], (instregex "^VGF2P8AFFINE((INV)?)QBYrmi$",
                                                                "^VGF2P8AFFINE((INV)?)QBZ256rm(b?)i$",
                                                                "^VGF2P8MULB(Y|Z256)rm$")>;
@@ -1181,7 +1181,7 @@ def : InstRW<[SPRWriteResGroup73, ReadAfterVecXLd, ReadAfterVecXLd], (instregex
                                                                                 "^VFMSUBADD(132|213|231)PHZ128m((b|k|bk|kz)?)$",
                                                                                 "^VFMSUBADD(132|213|231)PHZ128mbkz$")>;
 def : InstRW<[SPRWriteResGroup73, ReadAfterVecLd, ReadAfterVecLd], (instregex "^VF(N?)M(ADD|SUB)(132|213|231)SHZm$",
-                                                                              "^VF(N?)M(ADD|SUB)(132|213|231)SHZm_Int((k|kz)?)$")>;
+                                                                              "^VF(N?)M(ADD|SUB)(132|213|231)SHZm((k|kz)?)_Int$")>;
 def : InstRW<[SPRWriteResGroup73, ReadAfterVecYLd, ReadAfterVecYLd], (instregex "^VPMADD52(H|L)UQZ256m((b|k|bk|kz)?)$",
                                                                                 "^VPMADD52(H|L)UQZ256mbkz$")>;
 
@@ -2301,7 +2301,7 @@ def : InstRW<[SPRWriteResGroup218, ReadAfterVecXLd], (instregex "^(V?)ROUNDS(D|S
                                                                 "^VRNDSCALEP(D|S)Z128rmbik(z?)$",
                                                                 "^VRNDSCALEP(D|S)Z128rmi((kz)?)$",
                                                                 "^VRNDSCALES(D|S)Zrmi$",
-                                                                "^VRNDSCALES(D|S)Zrmi_Int((k|kz)?)$")>;
+                                                                "^VRNDSCALES(D|S)Zrmi((k|kz)?)_Int$")>;
 
 def SPRWriteResGroup219 : SchedWriteRes<[SPRPort00_01]> {
   let ReleaseAtCycles = [2];
@@ -2313,7 +2313,7 @@ def : InstRW<[SPRWriteResGroup219], (instregex "^(V?)ROUND(PD|SS)ri$",
                                                "^(V?)ROUNDS(D|S)ri_Int$",
                                                "^VRNDSCALEP(D|S)Z(128|256)rri((k|kz)?)$",
                                                "^VRNDSCALES(D|S)Zrri$",
-                                               "^VRNDSCALES(D|S)Zrri(b?)_Int((k|kz)?)$",
+                                               "^VRNDSCALES(D|S)Zrri(b?)((k|kz)?)_Int$",
                                                "^VROUNDP(D|S)Yri$")>;
 
 def SPRWriteResGroup220 : SchedWriteRes<[SPRPort00_06]> {
@@ -2530,7 +2530,7 @@ def SPRWriteResGroup249 : SchedWriteRes<[SPRPort01_05]> {
   let Latency = 4;
 }
 def : InstRW<[SPRWriteResGroup249], (instregex "^V(ADD|SUB)P(D|S)Z(128|256)rrkz$",
-                                               "^V(ADD|SUB)S(D|S)Zrr(b?)_Intkz$")>;
+                                               "^V(ADD|SUB)S(D|S)Zrr(b?)kz_Int$")>;
 
 def SPRWriteResGroup250 : SchedWriteRes<[SPRPort00_05]> {
   let Latency = 3;
@@ -2545,11 +2545,11 @@ def SPRWriteResGroup251 : SchedWriteRes<[SPRPort00_01]> {
   let Latency = 6;
 }
 def : InstRW<[SPRWriteResGroup251], (instregex "^V(ADD|SUB)PHZ(128|256)rrk(z?)$",
-                                               "^V(ADD|SUB)SHZrr(b?)_Intk(z?)$",
+                                               "^V(ADD|SUB)SHZrr(b?)k(z?)_Int$",
                                                "^VCVT(T?)PH2(U?)WZ(128|256)rrk(z?)$",
                                                "^VCVT(U?)W2PHZ(128|256)rrk(z?)$",
                                                "^VF(N?)M(ADD|SUB)(132|213|231)PHZ(128|256)rk(z?)$",
-                                               "^VF(N?)M(ADD|SUB)(132|213|231)SHZr(b?)_Intk(z?)$",
+                                               "^VF(N?)M(ADD|SUB)(132|213|231)SHZr(b?)k(z?)_Int$",
                                                "^VFMADDSUB(132|213|231)PHZ(128|256)rk(z?)$",
                                                "^VFMSUBADD(132|213|231)PHZ(128|256)rk(z?)$",
                                                "^VGETEXPPHZ(128|256)rk(z?)$",
@@ -2560,7 +2560,7 @@ def : InstRW<[SPRWriteResGroup251], (instregex "^V(ADD|SUB)PHZ(128|256)rrk(z?)$"
                                                "^VGETMANTSHZrri(k|bkz)$",
                                                "^VM(AX|IN)CPHZ(128|256)rrk(z?)$",
                                                "^VM(AX|IN|UL)PHZ(128|256)rrk(z?)$",
-                                               "^VM(AX|IN|UL)SHZrr(b?)_Intk(z?)$")>;
+                                               "^VM(AX|IN|UL)SHZrr(b?)k(z?)_Int$")>;
 
 def SPRWriteResGroup252 : SchedWriteRes<[SPRPort00]> {
   let Latency = 5;
@@ -2745,7 +2745,7 @@ def : InstRW<[SPRWriteResGroup263, ReadAfterVecYLd], (instregex "^VCMPP(D|H|S)Z(
                                                                 "^VPTEST(N?)M(B|D|Q|W)Z((256)?)rm(k?)$",
                                                                 "^VPTEST(N?)M(D|Q)Z((256)?)rmb(k?)$")>;
 def : InstRW<[SPRWriteResGroup263, ReadAfterVecLd], (instregex "^VCMPS(D|H|S)Zrmi$",
-                                                               "^VCMPS(D|H|S)Zrmi_Int(k?)$",
+                                                               "^VCMPS(D|H|S)Zrmi(k?)_Int$",
                                                                "^VFPCLASSS(D|H|S)Zmik$")>;
 
 def SPRWriteResGroup264 : SchedWriteRes<[SPRPort00, SPRPort02_03_10]> {
@@ -3171,7 +3171,7 @@ def : InstRW<[SPRWriteResGroup314], (instregex "^VCVT(T?)PD2(U?)QQZ(128|256)rr((
                                                "^VPLZCNT(D|Q)Z(128|256)rr((k|kz)?)$",
                                                "^VPMADD52(H|L)UQZ(128|256)r((k|kz)?)$",
                                                "^VSCALEFS(D|S)Zrr((k|kz)?)$",
-                                               "^VSCALEFS(D|S)Zrrb_Int((k|kz)?)$")>;
+                                               "^VSCALEFS(D|S)Zrrb((k|kz)?)_Int$")>;
 def : InstRW<[SPRWriteResGroup314, ReadAfterVecLd], (instregex "^VFIXUPIMMS(D|S)Zrrib((k|kz)?)$")>;
 
 def SPRWriteResGroup315 : SchedWriteRes<[SPRPort00_01, SPRPort02_03_10, SPRPort05]> {
@@ -3300,7 +3300,7 @@ def SPRWriteResGroup331 : SchedWriteRes<[SPRPort00_01, SPRPort02_03_10]> {
   let NumMicroOps = 2;
 }
 def : InstRW<[SPRWriteResGroup331], (instregex "^VCVTPH2PSZ(128|256)rmk(z?)$")>;
-def : InstRW<[SPRWriteResGroup331, ReadAfterVecLd], (instregex "^VCVTSH2SSZrm_Intk(z?)$")>;
+def : InstRW<[SPRWriteResGroup331, ReadAfterVecLd], (instregex "^VCVTSH2SSZrmk(z?)_Int$")>;
 def : InstRW<[SPRWriteResGroup331, ReadAfterVecXLd], (instregex "^VPMADDUBSWZ128rmk(z?)$",
                                                                 "^VPMULH((U|RS)?)WZ128rmk(z?)$",
                                                                 "^VPMULLWZ128rmk(z?)$")>;
@@ -3460,7 +3460,7 @@ def SPRWriteResGroup353 : SchedWriteRes<[SPRPort00_01, SPRPort00_01_05, SPRPort0
   let Latency = 21;
   let NumMicroOps = 7;
 }
-def : InstRW<[SPRWriteResGroup353, ReadAfterVecLd], (instregex "^VCVTSD2SHZrm_Intk(z?)$")>;
+def : InstRW<[SPRWriteResGroup353, ReadAfterVecLd], (instregex "^VCVTSD2SHZrmk(z?)_Int$")>;
 
 def SPRWriteResGroup354 : SchedWriteRes<[SPRPort00_01, SPRPort00_01_05, SPRPort05]> {
   let ReleaseAtCycles = [2, 1, 1];
@@ -3475,7 +3475,7 @@ def SPRWriteResGroup355 : SchedWriteRes<[SPRPort00_01, SPRPort00_01_05, SPRPort0
   let Latency = 14;
   let NumMicroOps = 4;
 }
-def : InstRW<[SPRWriteResGroup355], (instregex "^VCVTSD2SHZrr(b?)_Intk(z?)$")>;
+def : InstRW<[SPRWriteResGroup355], (instregex "^VCVTSD2SHZrr(b?)k(z?)_Int$")>;
 
 def SPRWriteResGroup356 : SchedWriteRes<[SPRPort00_01, SPRPort02_03_10, SPRPort05]> {
   let ReleaseAtCycles = [2, 1, 1];
@@ -3489,7 +3489,7 @@ def SPRWriteResGroup357 : SchedWriteRes<[SPRPort00_01, SPRPort02_03_10, SPRPort0
   let Latency = 20;
   let NumMicroOps = 4;
 }
-def : InstRW<[SPRWriteResGroup357, ReadAfterVecLd], (instregex "^VCVTSH2SDZrm_Intk(z?)$")>;
+def : InstRW<[SPRWriteResGroup357, ReadAfterVecLd], (instregex "^VCVTSH2SDZrmk(z?)_Int$")>;
 
 def SPRWriteResGroup358 : SchedWriteRes<[SPRPort00_01, SPRPort05]> {
   let ReleaseAtCycles = [2, 1];
@@ -3504,7 +3504,7 @@ def SPRWriteResGroup359 : SchedWriteRes<[SPRPort00_01, SPRPort05]> {
   let Latency = 13;
   let NumMicroOps = 3;
 }
-def : InstRW<[SPRWriteResGroup359], (instregex "^VCVTSH2SDZrr(b?)_Intk(z?)$")>;
+def : InstRW<[SPRWriteResGroup359], (instregex "^VCVTSH2SDZrr(b?)k(z?)_Int$")>;
 
 def SPRWriteResGroup360 : SchedWriteRes<[SPRPort00, SPRPort00_01, SPRPort02_03_10]> {
   let Latency = 13;
@@ -3523,7 +3523,7 @@ def : InstRW<[SPRWriteResGroup361], (instregex "^VCVT(T?)SH2(U?)SI((64)?)Zrr(b?)
 def SPRWriteResGroup362 : SchedWriteRes<[SPRPort00_01]> {
   let Latency = 8;
 }
-def : InstRW<[SPRWriteResGroup362], (instregex "^VCVTSH2SSZrr(b?)_Intk(z?)$")>;
+def : InstRW<[SPRWriteResGroup362], (instregex "^VCVTSH2SSZrr(b?)k(z?)_Int$")>;
 
 def SPRWriteResGroup363 : SchedWriteRes<[SPRPort00_01, SPRPort00_01_05, SPRPort02_03_10]> {
   let Latency = 14;
@@ -3536,7 +3536,7 @@ def SPRWriteResGroup364 : SchedWriteRes<[SPRPort00_01, SPRPort00_01_05, SPRPort0
   let Latency = 16;
   let NumMicroOps = 3;
 }
-def : InstRW<[SPRWriteResGroup364, ReadAfterVecLd], (instregex "^VCVTSS2SHZrm_Intk(z?)$")>;
+def : InstRW<[SPRWriteResGroup364, ReadAfterVecLd], (instregex "^VCVTSS2SHZrmk(z?)_Int$")>;
 
 def SPRWriteResGroup365 : SchedWriteRes<[SPRPort00_01, SPRPort00_01_05]> {
   let Latency = 6;
@@ -3549,7 +3549,7 @@ def SPRWriteResGroup366 : SchedWriteRes<[SPRPort00_01, SPRPort00_01_05]> {
   let Latency = 9;
   let NumMicroOps = 2;
 }
-def : InstRW<[SPRWriteResGroup366], (instregex "^VCVTSS2SHZrr(b?)_Intk(z?)$")>;
+def : InstRW<[SPRWriteResGroup366], (instregex "^VCVTSS2SHZrr(b?)k(z?)_Int$")>;
 
 def SPRWriteResGroup367 : SchedWriteRes<[SPRPort05]> {
   let Latency = 5;
@@ -3667,7 +3667,7 @@ def SPRWriteResGroup380 : SchedWriteRes<[SPRPort00, SPRPort02_03_10]> {
   let Latency = 21;
   let NumMicroOps = 2;
 }
-def : InstRW<[SPRWriteResGroup380, ReadAfterVecLd], (instregex "^VDIVSHZrm_Int((k|kz)?)$")>;
+def : InstRW<[SPRWriteResGroup380, ReadAfterVecLd], (instregex "^VDIVSHZrm((k|kz)?)_Int$")>;
 def : InstRW<[SPRWriteResGroup380, ReadAfterVecLd], (instrs VDIVSHZrm)>;
 
 def SPRWriteResGroup381 : SchedWriteRes<[SPRPort00]> {
@@ -4884,7 +4884,7 @@ def SPRWriteResGroup534 : SchedWriteRes<[SPRPort00_01, SPRPort02_03_10]> {
   let NumMicroOps = 3;
 }
 def : InstRW<[SPRWriteResGroup534, ReadAfterVecXLd], (instregex "^VRNDSCALEPHZ128rm(b?)ik(z?)$",
-                                                                "^VRNDSCALESHZrmi_Intk(z?)$",
+                                                                "^VRNDSCALESHZrmik(z?)_Int$",
                                                                 "^VSCALEFPHZ128rm(bk|kz)$",
                                                                 "^VSCALEFPHZ128rm(k|bkz)$")>;
 def : InstRW<[SPRWriteResGroup534, ReadAfterVecYLd], (instregex "^VRNDSCALEPHZ256rm(b?)ik(z?)$",
@@ -4898,9 +4898,9 @@ def SPRWriteResGroup535 : SchedWriteRes<[SPRPort00_01]> {
   let NumMicroOps = 2;
 }
 def : InstRW<[SPRWriteResGroup535], (instregex "^VRNDSCALEPHZ(128|256)rrik(z?)$",
-                                               "^VRNDSCALESHZrri(b?)_Intk(z?)$",
+                                               "^VRNDSCALESHZrri(b?)k(z?)_Int$",
                                                "^VSCALEFPHZ(128|256)rrk(z?)$",
-                                               "^VSCALEFSHZrrb_Intk(z?)$",
+                                               "^VSCALEFSHZrrbk(z?)_Int$",
                                                "^VSCALEFSHZrrk(z?)$")>;
 
 def SPRWriteResGroup536 : SchedWriteRes<[SPRPort00, SPRPort02_03_10]> {
@@ -4944,7 +4944,7 @@ def SPRWriteResGroup540 : SchedWriteRes<[SPRPort00, SPRPort02_03_10]> {
 }
 def : InstRW<[SPRWriteResGroup540, ReadAfterVecXLd], (instregex "^VSQRTPDZ128m(bk|kz)$",
                                                                 "^VSQRTPDZ128m(k|bkz)$")>;
-def : InstRW<[SPRWriteResGroup540, ReadAfterVecLd], (instregex "^VSQRTSDZm_Intk(z?)$")>;
+def : InstRW<[SPRWriteResGroup540, ReadAfterVecLd], (instregex "^VSQRTSDZmk(z?)_Int$")>;
 
 def SPRWriteResGroup541 : SchedWriteRes<[SPRPort00, SPRPort00_05, SPRPort02_03_10]> {
   let ReleaseAtCycles = [2, 1, 1];
diff --git a/llvm/lib/Target/X86/X86ScheduleZnver4.td b/llvm/lib/Target/X86/X86ScheduleZnver4.td
index 38f9b5e..c5478dd 100644
--- a/llvm/lib/Target/X86/X86ScheduleZnver4.td
+++ b/llvm/lib/Target/X86/X86ScheduleZnver4.td
@@ -1545,7 +1545,7 @@ def Zn4WriteSCALErr: SchedWriteRes<[Zn4FPFMisc23]> {
   let NumMicroOps = 2;
 }
 def : InstRW<[Zn4WriteSCALErr], (instregex
-        "V(SCALEF|REDUCE)(S|P)(S|D)(Z?|Z128?|Z256?)(rr|rrb|rrkz|rrik|rrikz|rri)(_Int?|_Intkz?)",
+        "V(SCALEF|REDUCE)(S|P)(S|D)(Z?|Z128?|Z256?)(rr|rrb|rrkz|rrik|rrikz|rri)(_Int?)",
         "(V?)REDUCE(PD|PS|SD|SS)(Z?|Z128?)(rri|rrikz|rrib)"
 	)>;
 
@@ -1585,7 +1585,7 @@ def : InstRW<[Zn4WriteSHIFTrr], (instregex
         "(V?)P(ROL|ROR)(D|Q|VD|VQ)(Z?|Z128?|Z256?)(rr|rrk|rrkz)",
         "(V?)P(ROL|ROR)(D|Q|VD|VQ)(Z256?)(ri|rik|rikz)",
         "(V?)P(ROL|ROR)(D|Q)(Z?|Z128?)(ri|rik|rikz)",
-	"VPSHUFBITQMBZ128rr", "VFMSUB231SSZr_Intkz"
+	"VPSHUFBITQMBZ128rr", "VFMSUB231SSZrkz_Int"
 	)>;
 
 def Zn4WriteSHIFTri: SchedWriteRes<[Zn4FPFMisc01]> {
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
index 808f48e..c19bcfc 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -1650,6 +1650,13 @@ InstructionCost X86TTIImpl::getShuffleCost(
         return MatchingTypes ? TTI::TCC_Free : SubLT.first;
     }
 
+    // Attempt to match MOVSS (Idx == 0) or INSERTPS pattern. This will have
+    // been matched by improveShuffleKindFromMask as a SK_InsertSubvector of
+    // v1f32 (legalised to f32) into a v4f32.
+    if (LT.first == 1 && LT.second == MVT::v4f32 && SubLT.first == 1 &&
+        SubLT.second == MVT::f32 && (Index == 0 || ST->hasSSE41()))
+      return 1;
+
     // If the insertion isn't aligned, treat it like a 2-op shuffle.
     Kind = TTI::SK_PermuteTwoSrc;
   }
@@ -1698,8 +1705,7 @@ InstructionCost X86TTIImpl::getShuffleCost(
   // We are going to permute multiple sources and the result will be in multiple
   // destinations. Providing an accurate cost only for splits where the element
   // type remains the same.
-  if ((Kind == TTI::SK_PermuteSingleSrc || Kind == TTI::SK_PermuteTwoSrc) &&
-      LT.first != 1) {
+  if (LT.first != 1) {
     MVT LegalVT = LT.second;
     if (LegalVT.isVector() &&
         LegalVT.getVectorElementType().getSizeInBits() ==
@@ -2227,9 +2233,18 @@ InstructionCost X86TTIImpl::getShuffleCost(
     { TTI::SK_PermuteTwoSrc,    MVT::v4f32, 2 }, // 2*shufps
   };
 
-  if (ST->hasSSE1())
+  if (ST->hasSSE1()) {
+    if (LT.first == 1 && LT.second == MVT::v4f32 && Mask.size() == 4) {
+      // SHUFPS: both pairs must come from the same source register.
+      auto MatchSHUFPS = [](int X, int Y) {
+        return X < 0 || Y < 0 || ((X & 4) == (Y & 4));
+      };
+      if (MatchSHUFPS(Mask[0], Mask[1]) && MatchSHUFPS(Mask[2], Mask[3]))
+        return 1;
+    }
     if (const auto *Entry = CostTableLookup(SSE1ShuffleTbl, Kind, LT.second))
       return LT.first * Entry->Cost;
+  }
 
   return BaseT::getShuffleCost(Kind, BaseTp, Mask, CostKind, Index, SubTp);
 }
@@ -4789,9 +4804,12 @@ InstructionCost X86TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
     MVT MScalarTy = LT.second.getScalarType();
     auto IsCheapPInsrPExtrInsertPS = [&]() {
       // Assume pinsr/pextr XMM <-> GPR is relatively cheap on all targets.
+      // Inserting f32 into index0 is just movss.
       // Also, assume insertps is relatively cheap on all >= SSE41 targets.
       return (MScalarTy == MVT::i16 && ST->hasSSE2()) ||
              (MScalarTy.isInteger() && ST->hasSSE41()) ||
+             (MScalarTy == MVT::f32 && ST->hasSSE1() && Index == 0 &&
+              Opcode == Instruction::InsertElement) ||
              (MScalarTy == MVT::f32 && ST->hasSSE41() &&
               Opcode == Instruction::InsertElement);
     };
diff --git a/llvm/lib/TargetParser/AArch64TargetParser.cpp b/llvm/lib/TargetParser/AArch64TargetParser.cpp
index 50c9a56..7d0b8c3 100644
--- a/llvm/lib/TargetParser/AArch64TargetParser.cpp
+++ b/llvm/lib/TargetParser/AArch64TargetParser.cpp
@@ -48,17 +48,12 @@ std::optional<AArch64::ArchInfo> AArch64::ArchInfo::findBySubArch(StringRef SubA
   return {};
 }
 
-unsigned AArch64::getFMVPriority(ArrayRef<StringRef> Features) {
-  constexpr unsigned MaxFMVPriority = 1000;
-  unsigned Priority = 0;
-  unsigned NumFeatures = 0;
-  for (StringRef Feature : Features) {
-    if (auto Ext = parseFMVExtension(Feature)) {
-      Priority = std::max(Priority, Ext->Priority);
-      NumFeatures++;
-    }
-  }
-  return Priority + MaxFMVPriority * NumFeatures;
+uint64_t AArch64::getFMVPriority(ArrayRef<StringRef> Features) {
+  uint64_t Priority = 0;
+  for (StringRef Feature : Features)
+    if (std::optional<FMVInfo> Info = parseFMVExtension(Feature))
+      Priority |= (1ULL << Info->PriorityBit);
+  return Priority;
 }
 
 uint64_t AArch64::getCpuSupportsMask(ArrayRef<StringRef> Features) {
@@ -73,7 +68,7 @@ uint64_t AArch64::getCpuSupportsMask(ArrayRef<StringRef> Features) {
   uint64_t FeaturesMask = 0;
   for (const FMVInfo &Info : getFMVInfo())
     if (Info.ID && FeatureBits.Enabled.test(*Info.ID))
-      FeaturesMask |= (1ULL << Info.Bit);
+      FeaturesMask |= (1ULL << Info.FeatureBit);
 
   return FeaturesMask;
 }
diff --git a/llvm/lib/TargetParser/Host.cpp b/llvm/lib/TargetParser/Host.cpp
index 45b4caf..9d1b7b8b 100644
--- a/llvm/lib/TargetParser/Host.cpp
+++ b/llvm/lib/TargetParser/Host.cpp
@@ -173,7 +173,7 @@ StringRef sys::detail::getHostCPUNameForARM(StringRef ProcCpuinfoContent) {
   // Read 32 lines from /proc/cpuinfo, which should contain the CPU part line
   // in all cases.
   SmallVector<StringRef, 32> Lines;
-  ProcCpuinfoContent.split(Lines, "\n");
+  ProcCpuinfoContent.split(Lines, '\n');
 
   // Look for the CPU implementer line.
   StringRef Implementer;
@@ -436,7 +436,7 @@ StringRef sys::detail::getHostCPUNameForS390x(StringRef ProcCpuinfoContent) {
   // The "processor 0:" line comes after a fair amount of other information,
   // including a cache breakdown, but this should be plenty.
   SmallVector<StringRef, 32> Lines;
-  ProcCpuinfoContent.split(Lines, "\n");
+  ProcCpuinfoContent.split(Lines, '\n');
 
   // Look for the CPU features.
   SmallVector<StringRef, 32> CPUFeatures;
@@ -478,7 +478,7 @@ StringRef sys::detail::getHostCPUNameForS390x(StringRef ProcCpuinfoContent) {
 StringRef sys::detail::getHostCPUNameForRISCV(StringRef ProcCpuinfoContent) {
   // There are 24 lines in /proc/cpuinfo
   SmallVector<StringRef> Lines;
-  ProcCpuinfoContent.split(Lines, "\n");
+  ProcCpuinfoContent.split(Lines, '\n');
 
   // Look for uarch line to determine cpu name
   StringRef UArch;
@@ -1630,7 +1630,7 @@ StringRef sys::getHostCPUName() {
 #if defined(__linux__)
 StringRef sys::detail::getHostCPUNameForSPARC(StringRef ProcCpuinfoContent) {
   SmallVector<StringRef> Lines;
-  ProcCpuinfoContent.split(Lines, "\n");
+  ProcCpuinfoContent.split(Lines, '\n');
 
   // Look for cpu line to determine cpu name
   StringRef Cpu;
@@ -1970,7 +1970,7 @@ const StringMap<bool> sys::getHostCPUFeatures() {
     return Features;
 
   SmallVector<StringRef, 32> Lines;
-  P->getBuffer().split(Lines, "\n");
+  P->getBuffer().split(Lines, '\n');
 
   SmallVector<StringRef, 32> CPUFeatures;
 
diff --git a/llvm/lib/TargetParser/RISCVISAInfo.cpp b/llvm/lib/TargetParser/RISCVISAInfo.cpp
index cafc9d3..d6e1eac 100644
--- a/llvm/lib/TargetParser/RISCVISAInfo.cpp
+++ b/llvm/lib/TargetParser/RISCVISAInfo.cpp
@@ -742,7 +742,8 @@ Error RISCVISAInfo::checkDependency() {
   bool HasZvl = MinVLen != 0;
   bool HasZcmt = Exts.count("zcmt") != 0;
   static constexpr StringLiteral XqciExts[] = {
-      {"xqcia"}, {"xqcics"}, {"xqcicsr"}, {"xqcilsm"}, {"xqcisls"}};
+      {"xqcia"},  {"xqciac"},  {"xqcicli"}, {"xqcicm"},
+      {"xqcics"}, {"xqcicsr"}, {"xqcilsm"}, {"xqcisls"}};
 
   if (HasI && HasE)
     return getIncompatibleError("i", "e");
diff --git a/llvm/lib/ToolDrivers/llvm-dlltool/DlltoolDriver.cpp b/llvm/lib/ToolDrivers/llvm-dlltool/DlltoolDriver.cpp
index 58ff720..1782e24 100644
--- a/llvm/lib/ToolDrivers/llvm-dlltool/DlltoolDriver.cpp
+++ b/llvm/lib/ToolDrivers/llvm-dlltool/DlltoolDriver.cpp
@@ -79,6 +79,7 @@ MachineTypes getEmulation(StringRef S) {
       .Case("arm", IMAGE_FILE_MACHINE_ARMNT)
       .Case("arm64", IMAGE_FILE_MACHINE_ARM64)
       .Case("arm64ec", IMAGE_FILE_MACHINE_ARM64EC)
+      .Case("r4000", IMAGE_FILE_MACHINE_R4000)
       .Default(IMAGE_FILE_MACHINE_UNKNOWN);
 }
 
@@ -93,6 +94,8 @@ MachineTypes getMachine(Triple T) {
   case Triple::aarch64:
     return T.isWindowsArm64EC() ? COFF::IMAGE_FILE_MACHINE_ARM64EC
                                 : COFF::IMAGE_FILE_MACHINE_ARM64;
+  case Triple::mipsel:
+    return COFF::IMAGE_FILE_MACHINE_R4000;
   default:
     return COFF::IMAGE_FILE_MACHINE_UNKNOWN;
   }
@@ -173,7 +176,8 @@ int llvm::dlltoolDriverMain(llvm::ArrayRef<const char *> ArgsArr) {
       (!Args.hasArgNoClaim(OPT_d) && !Args.hasArgNoClaim(OPT_l))) {
     Table.printHelp(outs(), "llvm-dlltool [options] file...", "llvm-dlltool",
                     false);
-    llvm::outs() << "\nTARGETS: i386, i386:x86-64, arm, arm64, arm64ec\n";
+    llvm::outs()
+        << "\nTARGETS: i386, i386:x86-64, arm, arm64, arm64ec, r4000\n";
     return 1;
   }
 
diff --git a/llvm/lib/ToolDrivers/llvm-lib/LibDriver.cpp b/llvm/lib/ToolDrivers/llvm-lib/LibDriver.cpp
index 138d9fc..6ce06b4 100644
--- a/llvm/lib/ToolDrivers/llvm-lib/LibDriver.cpp
+++ b/llvm/lib/ToolDrivers/llvm-lib/LibDriver.cpp
@@ -171,6 +171,7 @@ static Expected<COFF::MachineTypes> getCOFFFileMachine(MemoryBufferRef MB) {
   uint16_t Machine = (*Obj)->getMachine();
   if (Machine != COFF::IMAGE_FILE_MACHINE_I386 &&
       Machine != COFF::IMAGE_FILE_MACHINE_AMD64 &&
+      Machine != COFF::IMAGE_FILE_MACHINE_R4000 &&
       Machine != COFF::IMAGE_FILE_MACHINE_ARMNT && !COFF::isAnyArm64(Machine)) {
     return createStringError(inconvertibleErrorCode(),
                              "unknown machine: " + std::to_string(Machine));
@@ -195,6 +196,8 @@ static Expected<COFF::MachineTypes> getBitcodeFileMachine(MemoryBufferRef MB) {
   case Triple::aarch64:
     return T.isWindowsArm64EC() ? COFF::IMAGE_FILE_MACHINE_ARM64EC
                                 : COFF::IMAGE_FILE_MACHINE_ARM64;
+  case Triple::mipsel:
+    return COFF::IMAGE_FILE_MACHINE_R4000;
   default:
     return createStringError(inconvertibleErrorCode(),
                              "unknown arch in target triple: " + *TripleStr);
diff --git a/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp b/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
index 45ee2d4..fe7b3b1 100644
--- a/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
+++ b/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
@@ -181,6 +181,7 @@ static bool foldGuardedFunnelShift(Instruction &I, const DominatorTree &DT) {
 /// the bit indexes (Mask) needed by a masked compare. If we're matching a chain
 /// of 'and' ops, then we also need to capture the fact that we saw an
 /// "and X, 1", so that's an extra return value for that case.
+namespace {
 struct MaskOps {
   Value *Root = nullptr;
   APInt Mask;
@@ -190,6 +191,7 @@ struct MaskOps {
   MaskOps(unsigned BitWidth, bool MatchAnds)
       : Mask(APInt::getZero(BitWidth)), MatchAndChain(MatchAnds) {}
 };
+} // namespace
 
 /// This is a recursive helper for foldAnyOrAllBitsSet() that walks through a
 /// chain of 'and' or 'or' instructions looking for shift ops of a common source
@@ -423,11 +425,8 @@ static bool foldSqrt(CallInst *Call, LibFunc Func, TargetTransformInfo &TTI,
            Arg, 0,
            SimplifyQuery(Call->getDataLayout(), &TLI, &DT, &AC, Call)))) {
     IRBuilder<> Builder(Call);
-    IRBuilderBase::FastMathFlagGuard Guard(Builder);
-    Builder.setFastMathFlags(Call->getFastMathFlags());
-
-    Value *NewSqrt = Builder.CreateIntrinsic(Intrinsic::sqrt, Ty, Arg,
-                                             /*FMFSource=*/nullptr, "sqrt");
+    Value *NewSqrt =
+        Builder.CreateIntrinsic(Intrinsic::sqrt, Ty, Arg, Call, "sqrt");
     Call->replaceAllUsesWith(NewSqrt);
 
     // Explicitly erase the old call because a call with side effects is not
diff --git a/llvm/lib/Transforms/Coroutines/Coroutines.cpp b/llvm/lib/Transforms/Coroutines/Coroutines.cpp
index 240d089..7b59c39 100644
--- a/llvm/lib/Transforms/Coroutines/Coroutines.cpp
+++ b/llvm/lib/Transforms/Coroutines/Coroutines.cpp
@@ -69,7 +69,6 @@ static const char *const CoroIntrinsics[] = {
     "llvm.coro.async.context.dealloc",
     "llvm.coro.async.resume",
     "llvm.coro.async.size.replace",
-    "llvm.coro.async.store_resume",
     "llvm.coro.await.suspend.bool",
     "llvm.coro.await.suspend.handle",
     "llvm.coro.await.suspend.void",
diff --git a/llvm/lib/Transforms/IPO/FunctionImport.cpp b/llvm/lib/Transforms/IPO/FunctionImport.cpp
index fde43bb..c3d0a1a 100644
--- a/llvm/lib/Transforms/IPO/FunctionImport.cpp
+++ b/llvm/lib/Transforms/IPO/FunctionImport.cpp
@@ -1950,9 +1950,8 @@ Expected<bool> FunctionImporter::importFunctions(
     SrcModule->setPartialSampleProfileRatio(Index);
 
     // Link in the specified functions.
-    if (renameModuleForThinLTO(*SrcModule, Index, ClearDSOLocalOnDeclarations,
-                               &GlobalsToImport))
-      return true;
+    renameModuleForThinLTO(*SrcModule, Index, ClearDSOLocalOnDeclarations,
+                           &GlobalsToImport);
 
     if (PrintImports) {
       for (const auto *GV : GlobalsToImport)
@@ -2026,11 +2025,8 @@ static bool doImportingForModuleForTest(
 
   // Next we need to promote to global scope and rename any local values that
   // are potentially exported to other modules.
-  if (renameModuleForThinLTO(M, *Index, /*ClearDSOLocalOnDeclarations=*/false,
-                             /*GlobalsToImport=*/nullptr)) {
-    errs() << "Error renaming module\n";
-    return true;
-  }
+  renameModuleForThinLTO(M, *Index, /*ClearDSOLocalOnDeclarations=*/false,
+                         /*GlobalsToImport=*/nullptr);
 
   // Perform the import now.
   auto ModuleLoader = [&M](StringRef Identifier) {
diff --git a/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp b/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp
index 96956481..449d64d 100644
--- a/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp
+++ b/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp
@@ -66,19 +66,19 @@ static cl::opt<unsigned> MaxCodeSizeGrowth(
     "Maximum codesize growth allowed per function"));
 
 static cl::opt<unsigned> MinCodeSizeSavings(
-    "funcspec-min-codesize-savings", cl::init(20), cl::Hidden, cl::desc(
-    "Reject specializations whose codesize savings are less than this"
-    "much percent of the original function size"));
+    "funcspec-min-codesize-savings", cl::init(20), cl::Hidden,
+    cl::desc("Reject specializations whose codesize savings are less than this "
+             "much percent of the original function size"));
 
 static cl::opt<unsigned> MinLatencySavings(
     "funcspec-min-latency-savings", cl::init(40), cl::Hidden,
-    cl::desc("Reject specializations whose latency savings are less than this"
+    cl::desc("Reject specializations whose latency savings are less than this "
              "much percent of the original function size"));
 
 static cl::opt<unsigned> MinInliningBonus(
-    "funcspec-min-inlining-bonus", cl::init(300), cl::Hidden, cl::desc(
-    "Reject specializations whose inlining bonus is less than this"
-    "much percent of the original function size"));
+    "funcspec-min-inlining-bonus", cl::init(300), cl::Hidden,
+    cl::desc("Reject specializations whose inlining bonus is less than this "
+             "much percent of the original function size"));
 
 static cl::opt<bool> SpecializeOnAddress(
     "funcspec-on-address", cl::init(false), cl::Hidden, cl::desc(
diff --git a/llvm/lib/Transforms/IPO/GlobalOpt.cpp b/llvm/lib/Transforms/IPO/GlobalOpt.cpp
index 16a80e9..78cd249 100644
--- a/llvm/lib/Transforms/IPO/GlobalOpt.cpp
+++ b/llvm/lib/Transforms/IPO/GlobalOpt.cpp
@@ -105,7 +105,7 @@ static cl::opt<int> ColdCCRelFreq(
     "coldcc-rel-freq", cl::Hidden, cl::init(2),
     cl::desc(
         "Maximum block frequency, expressed as a percentage of caller's "
-        "entry frequency, for a call site to be considered cold for enabling"
+        "entry frequency, for a call site to be considered cold for enabling "
         "coldcc"));
 
 /// Is this global variable possibly used by a leak checker as a root?  If so,
diff --git a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
index 1bf7ff4..016db55 100644
--- a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
+++ b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
@@ -122,6 +122,20 @@ static cl::opt<unsigned>
                         cl::desc("Max depth to recursively search for missing "
                                  "frames through tail calls."));
 
+// By default enable cloning of callsites involved with recursive cycles
+static cl::opt<bool> AllowRecursiveCallsites(
+    "memprof-allow-recursive-callsites", cl::init(true), cl::Hidden,
+    cl::desc("Allow cloning of callsites involved in recursive cycles"));
+
+// When disabled, try to detect and prevent cloning of recursive contexts.
+// This is only necessary until we support cloning through recursive cycles.
+// Leave on by default for now, as disabling requires a little bit of compile
+// time overhead and doesn't affect correctness, it will just inflate the cold
+// hinted bytes reporting a bit when -memprof-report-hinted-sizes is enabled.
+static cl::opt<bool> AllowRecursiveContexts(
+    "memprof-allow-recursive-contexts", cl::init(true), cl::Hidden,
+    cl::desc("Allow cloning of contexts through recursive cycles"));
+
 namespace llvm {
 cl::opt<bool> EnableMemProfContextDisambiguation(
     "enable-memprof-context-disambiguation", cl::init(false), cl::Hidden,
@@ -1236,9 +1250,13 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::addStackNodesForMIB(
       StackEntryIdToContextNodeMap[StackId] = StackNode;
       StackNode->OrigStackOrAllocId = StackId;
     }
-    auto Ins = StackIdSet.insert(StackId);
-    if (!Ins.second)
-      StackNode->Recursive = true;
+    // Marking a node recursive will prevent its cloning completely, even for
+    // non-recursive contexts flowing through it.
+    if (!AllowRecursiveCallsites) {
+      auto Ins = StackIdSet.insert(StackId);
+      if (!Ins.second)
+        StackNode->Recursive = true;
+    }
     StackNode->AllocTypes |= (uint8_t)AllocType;
     PrevNode->addOrUpdateCallerEdge(StackNode, AllocType, LastContextId);
     PrevNode = StackNode;
@@ -1375,8 +1393,11 @@ static void checkNode(const ContextNode<DerivedCCG, FuncTy, CallTy> *Node,
       set_union(CallerEdgeContextIds, Edge->ContextIds);
     }
     // Node can have more context ids than callers if some contexts terminate at
-    // node and some are longer.
-    assert(NodeContextIds == CallerEdgeContextIds ||
+    // node and some are longer. If we are allowing recursive callsites but
+    // haven't disabled recursive contexts, this will be violated for
+    // incompletely cloned recursive cycles, so skip the checking in that case.
+    assert((AllowRecursiveCallsites && AllowRecursiveContexts) ||
+           NodeContextIds == CallerEdgeContextIds ||
            set_is_subset(CallerEdgeContextIds, NodeContextIds));
   }
   if (Node->CalleeEdges.size()) {
@@ -3370,6 +3391,21 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::identifyClones(
 
   assert(Node->AllocTypes != (uint8_t)AllocationType::None);
 
+  DenseSet<uint32_t> RecursiveContextIds;
+  // If we are allowing recursive callsites, but have also disabled recursive
+  // contexts, look for context ids that show up in multiple caller edges.
+  if (AllowRecursiveCallsites && !AllowRecursiveContexts) {
+    DenseSet<uint32_t> AllCallerContextIds;
+    for (auto &CE : Node->CallerEdges) {
+      // Resize to the largest set of caller context ids, since we know the
+      // final set will be at least that large.
+      AllCallerContextIds.reserve(CE->getContextIds().size());
+      for (auto Id : CE->getContextIds())
+        if (!AllCallerContextIds.insert(Id).second)
+          RecursiveContextIds.insert(Id);
+    }
+  }
+
   // Iterate until we find no more opportunities for disambiguating the alloc
   // types via cloning. In most cases this loop will terminate once the Node
   // has a single allocation type, in which case no more cloning is needed.
@@ -3394,6 +3430,9 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::identifyClones(
     // allocation.
     auto CallerEdgeContextsForAlloc =
         set_intersection(CallerEdge->getContextIds(), AllocContextIds);
+    if (!RecursiveContextIds.empty())
+      CallerEdgeContextsForAlloc =
+          set_difference(CallerEdgeContextsForAlloc, RecursiveContextIds);
     if (CallerEdgeContextsForAlloc.empty()) {
       ++EI;
       continue;
diff --git a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
index b40ab35..67585e9 100644
--- a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
+++ b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
@@ -129,7 +129,7 @@ static cl::opt<bool> PrintModuleBeforeOptimizations(
 
 static cl::opt<bool> AlwaysInlineDeviceFunctions(
     "openmp-opt-inline-device",
-    cl::desc("Inline all applicible functions on the device."), cl::Hidden,
+    cl::desc("Inline all applicable functions on the device."), cl::Hidden,
     cl::init(false));
 
 static cl::opt<bool>
diff --git a/llvm/lib/Transforms/IPO/SampleProfile.cpp b/llvm/lib/Transforms/IPO/SampleProfile.cpp
index 603beb3..b978c54 100644
--- a/llvm/lib/Transforms/IPO/SampleProfile.cpp
+++ b/llvm/lib/Transforms/IPO/SampleProfile.cpp
@@ -162,7 +162,7 @@ static cl::opt<bool> ProfileSampleBlockAccurate(
 static cl::opt<bool> ProfileAccurateForSymsInList(
     "profile-accurate-for-symsinlist", cl::Hidden, cl::init(true),
     cl::desc("For symbols in profile symbol list, regard their profiles to "
-             "be accurate. It may be overriden by profile-sample-accurate. "));
+             "be accurate. It may be overridden by profile-sample-accurate. "));
 
 static cl::opt<bool> ProfileMergeInlinee(
     "sample-profile-merge-inlinee", cl::Hidden, cl::init(true),
@@ -193,9 +193,10 @@ static cl::opt<bool> ProfileSizeInline(
 // and inline the hot functions (that are skipped in this pass).
 static cl::opt<bool> DisableSampleLoaderInlining(
     "disable-sample-loader-inlining", cl::Hidden, cl::init(false),
-    cl::desc("If true, artifically skip inline transformation in sample-loader "
-             "pass, and merge (or scale) profiles (as configured by "
-             "--sample-profile-merge-inlinee)."));
+    cl::desc(
+        "If true, artificially skip inline transformation in sample-loader "
+        "pass, and merge (or scale) profiles (as configured by "
+        "--sample-profile-merge-inlinee)."));
 
 namespace llvm {
 cl::opt<bool>
@@ -255,7 +256,7 @@ static cl::opt<unsigned> PrecentMismatchForStalenessError(
 
 static cl::opt<bool> CallsitePrioritizedInline(
     "sample-profile-prioritized-inline", cl::Hidden,
-    cl::desc("Use call site prioritized inlining for sample profile loader."
+    cl::desc("Use call site prioritized inlining for sample profile loader. "
              "Currently only CSSPGO is supported."));
 
 static cl::opt<bool> UsePreInlinerDecision(
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp b/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp
index 7a184a1..73876d0 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp
@@ -1326,6 +1326,18 @@ Instruction *InstCombinerImpl::foldAddLikeCommutative(Value *LHS, Value *RHS,
     R->setHasNoUnsignedWrap(NUWOut);
     return R;
   }
+
+  // ((X s/ C1) << C2) + X => X s% -C1 where -C1 is 1 << C2
+  const APInt *C1, *C2;
+  if (match(LHS, m_Shl(m_SDiv(m_Specific(RHS), m_APInt(C1)), m_APInt(C2)))) {
+    APInt One(C2->getBitWidth(), 1);
+    APInt MinusC1 = -(*C1);
+    if (MinusC1 == (One << *C2)) {
+      Constant *NewRHS = ConstantInt::get(RHS->getType(), MinusC1);
+      return BinaryOperator::CreateSRem(RHS, NewRHS);
+    }
+  }
+
   return nullptr;
 }
 
@@ -1623,17 +1635,7 @@ Instruction *InstCombinerImpl::visitAdd(BinaryOperator &I) {
   // X % C0 + (( X / C0 ) % C1) * C0 => X % (C0 * C1)
   if (Value *V = SimplifyAddWithRemainder(I)) return replaceInstUsesWith(I, V);
 
-  // ((X s/ C1) << C2) + X => X s% -C1 where -C1 is 1 << C2
-  const APInt *C1, *C2;
-  if (match(LHS, m_Shl(m_SDiv(m_Specific(RHS), m_APInt(C1)), m_APInt(C2)))) {
-    APInt one(C2->getBitWidth(), 1);
-    APInt minusC1 = -(*C1);
-    if (minusC1 == (one << *C2)) {
-      Constant *NewRHS = ConstantInt::get(RHS->getType(), minusC1);
-      return BinaryOperator::CreateSRem(RHS, NewRHS);
-    }
-  }
-
+  const APInt *C1;
   // (A & 2^C1) + A => A & (2^C1 - 1) iff bit C1 in A is a sign bit
   if (match(&I, m_c_Add(m_And(m_Value(A), m_APInt(C1)), m_Deferred(A))) &&
       C1->isPowerOf2() && (ComputeNumSignBits(A) > C1->countl_zero())) {
@@ -2845,12 +2847,11 @@ Instruction *InstCombinerImpl::hoistFNegAboveFMulFDiv(Value *FNegOp,
     // Make sure to preserve flags and metadata on the call.
     if (II->getIntrinsicID() == Intrinsic::ldexp) {
       FastMathFlags FMF = FMFSource.getFastMathFlags() | II->getFastMathFlags();
-      IRBuilder<>::FastMathFlagGuard FMFGuard(Builder);
-      Builder.setFastMathFlags(FMF);
-
-      CallInst *New = Builder.CreateCall(
-          II->getCalledFunction(),
-          {Builder.CreateFNeg(II->getArgOperand(0)), II->getArgOperand(1)});
+      CallInst *New =
+          Builder.CreateCall(II->getCalledFunction(),
+                             {Builder.CreateFNegFMF(II->getArgOperand(0), FMF),
+                              II->getArgOperand(1)});
+      New->setFastMathFlags(FMF);
       New->copyMetadata(*II);
       return New;
     }
@@ -2932,12 +2933,8 @@ Instruction *InstCombinerImpl::visitFNeg(UnaryOperator &I) {
     // flags the copysign doesn't also have.
     FastMathFlags FMF = I.getFastMathFlags();
     FMF &= cast<FPMathOperator>(OneUse)->getFastMathFlags();
-
-    IRBuilder<>::FastMathFlagGuard FMFGuard(Builder);
-    Builder.setFastMathFlags(FMF);
-
-    Value *NegY = Builder.CreateFNeg(Y);
-    Value *NewCopySign = Builder.CreateCopySign(X, NegY);
+    Value *NegY = Builder.CreateFNegFMF(Y, FMF);
+    Value *NewCopySign = Builder.CreateCopySign(X, NegY, FMF);
     return replaceInstUsesWith(I, NewCopySign);
   }
 
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp b/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
index e576eea4..f82a557 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
@@ -39,11 +39,11 @@ static Value *getNewICmpValue(unsigned Code, bool Sign, Value *LHS, Value *RHS,
 /// This is the complement of getFCmpCode, which turns an opcode and two
 /// operands into either a FCmp instruction, or a true/false constant.
 static Value *getFCmpValue(unsigned Code, Value *LHS, Value *RHS,
-                           InstCombiner::BuilderTy &Builder) {
+                           InstCombiner::BuilderTy &Builder, FMFSource FMF) {
   FCmpInst::Predicate NewPred;
   if (Constant *TorF = getPredForFCmpCode(Code, LHS->getType(), NewPred))
     return TorF;
-  return Builder.CreateFCmp(NewPred, LHS, RHS);
+  return Builder.CreateFCmpFMF(NewPred, LHS, RHS, FMF);
 }
 
 /// Emit a computation of: (V >= Lo && V < Hi) if Inside is true, otherwise
@@ -513,7 +513,8 @@ static Value *foldLogOpOfMaskedICmpsAsymmetric(
 /// into a single (icmp(A & X) ==/!= Y).
 static Value *foldLogOpOfMaskedICmps(ICmpInst *LHS, ICmpInst *RHS, bool IsAnd,
                                      bool IsLogical,
-                                     InstCombiner::BuilderTy &Builder) {
+                                     InstCombiner::BuilderTy &Builder,
+                                     const SimplifyQuery &Q) {
   Value *A = nullptr, *B = nullptr, *C = nullptr, *D = nullptr, *E = nullptr;
   ICmpInst::Predicate PredL = LHS->getPredicate(), PredR = RHS->getPredicate();
   std::optional<std::pair<unsigned, unsigned>> MaskPair =
@@ -586,93 +587,107 @@ static Value *foldLogOpOfMaskedICmps(ICmpInst *LHS, ICmpInst *RHS, bool IsAnd,
     return Builder.CreateICmp(NewCC, NewAnd2, A);
   }
 
-  // Remaining cases assume at least that B and D are constant, and depend on
-  // their actual values. This isn't strictly necessary, just a "handle the
-  // easy cases for now" decision.
   const APInt *ConstB, *ConstD;
-  if (!match(B, m_APInt(ConstB)) || !match(D, m_APInt(ConstD)))
-    return nullptr;
-
-  if (Mask & (Mask_NotAllZeros | BMask_NotAllOnes)) {
-    // (icmp ne (A & B), 0) & (icmp ne (A & D), 0) and
-    // (icmp ne (A & B), B) & (icmp ne (A & D), D)
-    //     -> (icmp ne (A & B), 0) or (icmp ne (A & D), 0)
-    // Only valid if one of the masks is a superset of the other (check "B&D" is
-    // the same as either B or D).
-    APInt NewMask = *ConstB & *ConstD;
-    if (NewMask == *ConstB)
-      return LHS;
-    else if (NewMask == *ConstD)
-      return RHS;
-  }
-
-  if (Mask & AMask_NotAllOnes) {
-    // (icmp ne (A & B), B) & (icmp ne (A & D), D)
-    //     -> (icmp ne (A & B), A) or (icmp ne (A & D), A)
-    // Only valid if one of the masks is a superset of the other (check "B|D" is
-    // the same as either B or D).
-    APInt NewMask = *ConstB | *ConstD;
-    if (NewMask == *ConstB)
-      return LHS;
-    else if (NewMask == *ConstD)
-      return RHS;
-  }
-
-  if (Mask & (BMask_Mixed | BMask_NotMixed)) {
-    // Mixed:
-    // (icmp eq (A & B), C) & (icmp eq (A & D), E)
-    // We already know that B & C == C && D & E == E.
-    // If we can prove that (B & D) & (C ^ E) == 0, that is, the bits of
-    // C and E, which are shared by both the mask B and the mask D, don't
-    // contradict, then we can transform to
-    // -> (icmp eq (A & (B|D)), (C|E))
-    // Currently, we only handle the case of B, C, D, and E being constant.
-    // We can't simply use C and E because we might actually handle
-    //   (icmp ne (A & B), B) & (icmp eq (A & D), D)
-    // with B and D, having a single bit set.
-
-    // NotMixed:
-    // (icmp ne (A & B), C) & (icmp ne (A & D), E)
-    // -> (icmp ne (A & (B & D)), (C & E))
-    // Check the intersection (B & D) for inequality.
-    // Assume that (B & D) == B || (B & D) == D, i.e B/D is a subset of D/B
-    // and (B & D) & (C ^ E) == 0, bits of C and E, which are shared by both the
-    // B and the D, don't contradict.
-    // Note that we can assume (~B & C) == 0 && (~D & E) == 0, previous
-    // operation should delete these icmps if it hadn't been met.
-
-    const APInt *OldConstC, *OldConstE;
-    if (!match(C, m_APInt(OldConstC)) || !match(E, m_APInt(OldConstE)))
-      return nullptr;
-
-    auto FoldBMixed = [&](ICmpInst::Predicate CC, bool IsNot) -> Value * {
-      CC = IsNot ? CmpInst::getInversePredicate(CC) : CC;
-      const APInt ConstC = PredL != CC ? *ConstB ^ *OldConstC : *OldConstC;
-      const APInt ConstE = PredR != CC ? *ConstD ^ *OldConstE : *OldConstE;
+  if (match(B, m_APInt(ConstB)) && match(D, m_APInt(ConstD))) {
+    if (Mask & (Mask_NotAllZeros | BMask_NotAllOnes)) {
+      // (icmp ne (A & B), 0) & (icmp ne (A & D), 0) and
+      // (icmp ne (A & B), B) & (icmp ne (A & D), D)
+      //     -> (icmp ne (A & B), 0) or (icmp ne (A & D), 0)
+      // Only valid if one of the masks is a superset of the other (check "B&D"
+      // is the same as either B or D).
+      APInt NewMask = *ConstB & *ConstD;
+      if (NewMask == *ConstB)
+        return LHS;
+      if (NewMask == *ConstD)
+        return RHS;
+    }
 
-      if (((*ConstB & *ConstD) & (ConstC ^ ConstE)).getBoolValue())
-        return IsNot ? nullptr : ConstantInt::get(LHS->getType(), !IsAnd);
+    if (Mask & AMask_NotAllOnes) {
+      // (icmp ne (A & B), B) & (icmp ne (A & D), D)
+      //     -> (icmp ne (A & B), A) or (icmp ne (A & D), A)
+      // Only valid if one of the masks is a superset of the other (check "B|D"
+      // is the same as either B or D).
+      APInt NewMask = *ConstB | *ConstD;
+      if (NewMask == *ConstB)
+        return LHS;
+      if (NewMask == *ConstD)
+        return RHS;
+    }
 
-      if (IsNot && !ConstB->isSubsetOf(*ConstD) && !ConstD->isSubsetOf(*ConstB))
+    if (Mask & (BMask_Mixed | BMask_NotMixed)) {
+      // Mixed:
+      // (icmp eq (A & B), C) & (icmp eq (A & D), E)
+      // We already know that B & C == C && D & E == E.
+      // If we can prove that (B & D) & (C ^ E) == 0, that is, the bits of
+      // C and E, which are shared by both the mask B and the mask D, don't
+      // contradict, then we can transform to
+      // -> (icmp eq (A & (B|D)), (C|E))
+      // Currently, we only handle the case of B, C, D, and E being constant.
+      // We can't simply use C and E because we might actually handle
+      //   (icmp ne (A & B), B) & (icmp eq (A & D), D)
+      // with B and D, having a single bit set.
+
+      // NotMixed:
+      // (icmp ne (A & B), C) & (icmp ne (A & D), E)
+      // -> (icmp ne (A & (B & D)), (C & E))
+      // Check the intersection (B & D) for inequality.
+      // Assume that (B & D) == B || (B & D) == D, i.e B/D is a subset of D/B
+      // and (B & D) & (C ^ E) == 0, bits of C and E, which are shared by both
+      // the B and the D, don't contradict. Note that we can assume (~B & C) ==
+      // 0 && (~D & E) == 0, previous operation should delete these icmps if it
+      // hadn't been met.
+
+      const APInt *OldConstC, *OldConstE;
+      if (!match(C, m_APInt(OldConstC)) || !match(E, m_APInt(OldConstE)))
         return nullptr;
 
-      APInt BD, CE;
-      if (IsNot) {
-        BD = *ConstB & *ConstD;
-        CE = ConstC & ConstE;
-      } else {
-        BD = *ConstB | *ConstD;
-        CE = ConstC | ConstE;
-      }
-      Value *NewAnd = Builder.CreateAnd(A, BD);
-      Value *CEVal = ConstantInt::get(A->getType(), CE);
-      return Builder.CreateICmp(CC, CEVal, NewAnd);
-    };
+      auto FoldBMixed = [&](ICmpInst::Predicate CC, bool IsNot) -> Value * {
+        CC = IsNot ? CmpInst::getInversePredicate(CC) : CC;
+        const APInt ConstC = PredL != CC ? *ConstB ^ *OldConstC : *OldConstC;
+        const APInt ConstE = PredR != CC ? *ConstD ^ *OldConstE : *OldConstE;
+
+        if (((*ConstB & *ConstD) & (ConstC ^ ConstE)).getBoolValue())
+          return IsNot ? nullptr : ConstantInt::get(LHS->getType(), !IsAnd);
+
+        if (IsNot && !ConstB->isSubsetOf(*ConstD) &&
+            !ConstD->isSubsetOf(*ConstB))
+          return nullptr;
+
+        APInt BD, CE;
+        if (IsNot) {
+          BD = *ConstB & *ConstD;
+          CE = ConstC & ConstE;
+        } else {
+          BD = *ConstB | *ConstD;
+          CE = ConstC | ConstE;
+        }
+        Value *NewAnd = Builder.CreateAnd(A, BD);
+        Value *CEVal = ConstantInt::get(A->getType(), CE);
+        return Builder.CreateICmp(CC, CEVal, NewAnd);
+      };
+
+      if (Mask & BMask_Mixed)
+        return FoldBMixed(NewCC, false);
+      if (Mask & BMask_NotMixed) // can be else also
+        return FoldBMixed(NewCC, true);
+    }
+  }
 
-    if (Mask & BMask_Mixed)
-      return FoldBMixed(NewCC, false);
-    if (Mask & BMask_NotMixed) // can be else also
-      return FoldBMixed(NewCC, true);
+  // (icmp eq (A & B), 0) | (icmp eq (A & D), 0)
+  // -> (icmp ne (A & (B|D)), (B|D))
+  // (icmp ne (A & B), 0) & (icmp ne (A & D), 0)
+  // -> (icmp eq (A & (B|D)), (B|D))
+  // iff B and D is known to be a power of two
+  if (Mask & Mask_NotAllZeros &&
+      isKnownToBeAPowerOfTwo(B, /*OrZero=*/false, /*Depth=*/0, Q) &&
+      isKnownToBeAPowerOfTwo(D, /*OrZero=*/false, /*Depth=*/0, Q)) {
+    // If this is a logical and/or, then we must prevent propagation of a
+    // poison value from the RHS by inserting freeze.
+    if (IsLogical)
+      D = Builder.CreateFreeze(D);
+    Value *Mask = Builder.CreateOr(B, D);
+    Value *Masked = Builder.CreateAnd(A, Mask);
+    return Builder.CreateICmp(NewCC, Masked, Mask);
   }
   return nullptr;
 }
@@ -775,46 +790,6 @@ foldAndOrOfICmpsWithPow2AndWithZero(InstCombiner::BuilderTy &Builder,
   return Builder.CreateICmp(Pred, And, Op);
 }
 
-// Fold (iszero(A & K1) | iszero(A & K2)) -> (A & (K1 | K2)) != (K1 | K2)
-// Fold (!iszero(A & K1) & !iszero(A & K2)) -> (A & (K1 | K2)) == (K1 | K2)
-Value *InstCombinerImpl::foldAndOrOfICmpsOfAndWithPow2(ICmpInst *LHS,
-                                                       ICmpInst *RHS,
-                                                       Instruction *CxtI,
-                                                       bool IsAnd,
-                                                       bool IsLogical) {
-  CmpInst::Predicate Pred = IsAnd ? CmpInst::ICMP_NE : CmpInst::ICMP_EQ;
-  if (LHS->getPredicate() != Pred || RHS->getPredicate() != Pred)
-    return nullptr;
-
-  if (!match(LHS->getOperand(1), m_Zero()) ||
-      !match(RHS->getOperand(1), m_Zero()))
-    return nullptr;
-
-  Value *L1, *L2, *R1, *R2;
-  if (match(LHS->getOperand(0), m_And(m_Value(L1), m_Value(L2))) &&
-      match(RHS->getOperand(0), m_And(m_Value(R1), m_Value(R2)))) {
-    if (L1 == R2 || L2 == R2)
-      std::swap(R1, R2);
-    if (L2 == R1)
-      std::swap(L1, L2);
-
-    if (L1 == R1 &&
-        isKnownToBeAPowerOfTwo(L2, false, 0, CxtI) &&
-        isKnownToBeAPowerOfTwo(R2, false, 0, CxtI)) {
-      // If this is a logical and/or, then we must prevent propagation of a
-      // poison value from the RHS by inserting freeze.
-      if (IsLogical)
-        R2 = Builder.CreateFreeze(R2);
-      Value *Mask = Builder.CreateOr(L2, R2);
-      Value *Masked = Builder.CreateAnd(L1, Mask);
-      auto NewPred = IsAnd ? CmpInst::ICMP_EQ : CmpInst::ICMP_NE;
-      return Builder.CreateICmp(NewPred, Masked, Mask);
-    }
-  }
-
-  return nullptr;
-}
-
 /// General pattern:
 ///   X & Y
 ///
@@ -1429,12 +1404,8 @@ static Value *matchIsFiniteTest(InstCombiner::BuilderTy &Builder, FCmpInst *LHS,
       !matchUnorderedInfCompare(PredR, RHS0, RHS1))
     return nullptr;
 
-  IRBuilder<>::FastMathFlagGuard FMFG(Builder);
-  FastMathFlags FMF = LHS->getFastMathFlags();
-  FMF &= RHS->getFastMathFlags();
-  Builder.setFastMathFlags(FMF);
-
-  return Builder.CreateFCmp(FCmpInst::getOrderedPredicate(PredR), RHS0, RHS1);
+  return Builder.CreateFCmpFMF(FCmpInst::getOrderedPredicate(PredR), RHS0, RHS1,
+                               FMFSource::intersect(LHS, RHS));
 }
 
 Value *InstCombinerImpl::foldLogicOfFCmps(FCmpInst *LHS, FCmpInst *RHS,
@@ -1470,12 +1441,8 @@ Value *InstCombinerImpl::foldLogicOfFCmps(FCmpInst *LHS, FCmpInst *RHS,
 
     // Intersect the fast math flags.
     // TODO: We can union the fast math flags unless this is a logical select.
-    IRBuilder<>::FastMathFlagGuard FMFG(Builder);
-    FastMathFlags FMF = LHS->getFastMathFlags();
-    FMF &= RHS->getFastMathFlags();
-    Builder.setFastMathFlags(FMF);
-
-    return getFCmpValue(NewPred, LHS0, LHS1, Builder);
+    return getFCmpValue(NewPred, LHS0, LHS1, Builder,
+                        FMFSource::intersect(LHS, RHS));
   }
 
   // This transform is not valid for a logical select.
@@ -1492,10 +1459,8 @@ Value *InstCombinerImpl::foldLogicOfFCmps(FCmpInst *LHS, FCmpInst *RHS,
       // Ignore the constants because they are obviously not NANs:
       // (fcmp ord x, 0.0) & (fcmp ord y, 0.0)  -> (fcmp ord x, y)
       // (fcmp uno x, 0.0) | (fcmp uno y, 0.0)  -> (fcmp uno x, y)
-      IRBuilder<>::FastMathFlagGuard FMFG(Builder);
-      Builder.setFastMathFlags(LHS->getFastMathFlags() &
-                               RHS->getFastMathFlags());
-      return Builder.CreateFCmp(PredL, LHS0, RHS0);
+      return Builder.CreateFCmpFMF(PredL, LHS0, RHS0,
+                                   FMFSource::intersect(LHS, RHS));
     }
   }
 
@@ -1557,15 +1522,14 @@ Value *InstCombinerImpl::foldLogicOfFCmps(FCmpInst *LHS, FCmpInst *RHS,
       std::swap(PredL, PredR);
     }
     if (IsLessThanOrLessEqual(IsAnd ? PredL : PredR)) {
-      BuilderTy::FastMathFlagGuard Guard(Builder);
       FastMathFlags NewFlag = LHS->getFastMathFlags();
       if (!IsLogicalSelect)
         NewFlag |= RHS->getFastMathFlags();
-      Builder.setFastMathFlags(NewFlag);
 
-      Value *FAbs = Builder.CreateUnaryIntrinsic(Intrinsic::fabs, LHS0);
-      return Builder.CreateFCmp(PredL, FAbs,
-                                ConstantFP::get(LHS0->getType(), *LHSC));
+      Value *FAbs =
+          Builder.CreateUnaryIntrinsic(Intrinsic::fabs, LHS0, NewFlag);
+      return Builder.CreateFCmpFMF(
+          PredL, FAbs, ConstantFP::get(LHS0->getType(), *LHSC), NewFlag);
     }
   }
 
@@ -2372,6 +2336,26 @@ static Value *simplifyAndOrWithOpReplaced(Value *V, Value *Op, Value *RepOp,
   return IC.Builder.CreateBinOp(I->getOpcode(), NewOp0, NewOp1);
 }
 
+/// Reassociate and/or expressions to see if we can fold the inner and/or ops.
+/// TODO: Make this recursive; it's a little tricky because an arbitrary
+/// number of and/or instructions might have to be created.
+Value *InstCombinerImpl::reassociateBooleanAndOr(Value *LHS, Value *X, Value *Y,
+                                                 Instruction &I, bool IsAnd,
+                                                 bool RHSIsLogical) {
+  Instruction::BinaryOps Opcode = IsAnd ? Instruction::And : Instruction::Or;
+  // LHS bop (X lop Y) --> (LHS bop X) lop Y
+  // LHS bop (X bop Y) --> (LHS bop X) bop Y
+  if (Value *Res = foldBooleanAndOr(LHS, X, I, IsAnd, /*IsLogical=*/false))
+    return RHSIsLogical ? Builder.CreateLogicalOp(Opcode, Res, Y)
+                        : Builder.CreateBinOp(Opcode, Res, Y);
+  // LHS bop (X bop Y) --> X bop (LHS bop Y)
+  // LHS bop (X lop Y) --> X lop (LHS bop Y)
+  if (Value *Res = foldBooleanAndOr(LHS, Y, I, IsAnd, /*IsLogical=*/false))
+    return RHSIsLogical ? Builder.CreateLogicalOp(Opcode, X, Res)
+                        : Builder.CreateBinOp(Opcode, X, Res);
+  return nullptr;
+}
+
 // FIXME: We use commutative matchers (m_c_*) for some, but not all, matches
 // here. We should standardize that construct where it is needed or choose some
 // other way to ensure that commutated variants of patterns are not missed.
@@ -2755,31 +2739,17 @@ Instruction *InstCombinerImpl::visitAnd(BinaryOperator &I) {
           foldBooleanAndOr(Op0, Op1, I, /*IsAnd=*/true, /*IsLogical=*/false))
     return replaceInstUsesWith(I, Res);
 
-  // TODO: Make this recursive; it's a little tricky because an arbitrary
-  // number of 'and' instructions might have to be created.
   if (match(Op1, m_OneUse(m_LogicalAnd(m_Value(X), m_Value(Y))))) {
     bool IsLogical = isa<SelectInst>(Op1);
-    // Op0 & (X && Y) --> (Op0 && X) && Y
-    if (Value *Res = foldBooleanAndOr(Op0, X, I, /* IsAnd */ true, IsLogical))
-      return replaceInstUsesWith(I, IsLogical ? Builder.CreateLogicalAnd(Res, Y)
-                                              : Builder.CreateAnd(Res, Y));
-    // Op0 & (X && Y) --> X && (Op0 & Y)
-    if (Value *Res = foldBooleanAndOr(Op0, Y, I, /* IsAnd */ true,
-                                      /* IsLogical */ false))
-      return replaceInstUsesWith(I, IsLogical ? Builder.CreateLogicalAnd(X, Res)
-                                              : Builder.CreateAnd(X, Res));
+    if (auto *V = reassociateBooleanAndOr(Op0, X, Y, I, /*IsAnd=*/true,
+                                          /*RHSIsLogical=*/IsLogical))
+      return replaceInstUsesWith(I, V);
   }
   if (match(Op0, m_OneUse(m_LogicalAnd(m_Value(X), m_Value(Y))))) {
     bool IsLogical = isa<SelectInst>(Op0);
-    // (X && Y) & Op1 --> (X && Op1) && Y
-    if (Value *Res = foldBooleanAndOr(X, Op1, I, /* IsAnd */ true, IsLogical))
-      return replaceInstUsesWith(I, IsLogical ? Builder.CreateLogicalAnd(Res, Y)
-                                              : Builder.CreateAnd(Res, Y));
-    // (X && Y) & Op1 --> X && (Y & Op1)
-    if (Value *Res = foldBooleanAndOr(Y, Op1, I, /* IsAnd */ true,
-                                      /* IsLogical */ false))
-      return replaceInstUsesWith(I, IsLogical ? Builder.CreateLogicalAnd(X, Res)
-                                              : Builder.CreateAnd(X, Res));
+    if (auto *V = reassociateBooleanAndOr(Op1, X, Y, I, /*IsAnd=*/true,
+                                          /*RHSIsLogical=*/IsLogical))
+      return replaceInstUsesWith(I, V);
   }
 
   if (Instruction *FoldedFCmps = reassociateFCmps(I, Builder))
@@ -3330,12 +3300,6 @@ Value *InstCombinerImpl::foldAndOrOfICmps(ICmpInst *LHS, ICmpInst *RHS,
                                           bool IsLogical) {
   const SimplifyQuery Q = SQ.getWithInstruction(&I);
 
-  // Fold (iszero(A & K1) | iszero(A & K2)) ->  (A & (K1 | K2)) != (K1 | K2)
-  // Fold (!iszero(A & K1) & !iszero(A & K2)) ->  (A & (K1 | K2)) == (K1 | K2)
-  // if K1 and K2 are a one-bit mask.
-  if (Value *V = foldAndOrOfICmpsOfAndWithPow2(LHS, RHS, &I, IsAnd, IsLogical))
-    return V;
-
   ICmpInst::Predicate PredL = LHS->getPredicate(), PredR = RHS->getPredicate();
   Value *LHS0 = LHS->getOperand(0), *RHS0 = RHS->getOperand(0);
   Value *LHS1 = LHS->getOperand(1), *RHS1 = RHS->getOperand(1);
@@ -3362,7 +3326,7 @@ Value *InstCombinerImpl::foldAndOrOfICmps(ICmpInst *LHS, ICmpInst *RHS,
   // handle (roughly):
   // (icmp ne (A & B), C) | (icmp ne (A & D), E)
   // (icmp eq (A & B), C) & (icmp eq (A & D), E)
-  if (Value *V = foldLogOpOfMaskedICmps(LHS, RHS, IsAnd, IsLogical, Builder))
+  if (Value *V = foldLogOpOfMaskedICmps(LHS, RHS, IsAnd, IsLogical, Builder, Q))
     return V;
 
   if (Value *V =
@@ -3840,31 +3804,17 @@ Instruction *InstCombinerImpl::visitOr(BinaryOperator &I) {
           foldBooleanAndOr(Op0, Op1, I, /*IsAnd=*/false, /*IsLogical=*/false))
     return replaceInstUsesWith(I, Res);
 
-  // TODO: Make this recursive; it's a little tricky because an arbitrary
-  // number of 'or' instructions might have to be created.
   if (match(Op1, m_OneUse(m_LogicalOr(m_Value(X), m_Value(Y))))) {
     bool IsLogical = isa<SelectInst>(Op1);
-    // Op0 | (X || Y) --> (Op0 || X) || Y
-    if (Value *Res = foldBooleanAndOr(Op0, X, I, /* IsAnd */ false, IsLogical))
-      return replaceInstUsesWith(I, IsLogical ? Builder.CreateLogicalOr(Res, Y)
-                                              : Builder.CreateOr(Res, Y));
-    // Op0 | (X || Y) --> X || (Op0 | Y)
-    if (Value *Res = foldBooleanAndOr(Op0, Y, I, /* IsAnd */ false,
-                                      /* IsLogical */ false))
-      return replaceInstUsesWith(I, IsLogical ? Builder.CreateLogicalOr(X, Res)
-                                              : Builder.CreateOr(X, Res));
+    if (auto *V = reassociateBooleanAndOr(Op0, X, Y, I, /*IsAnd=*/false,
+                                          /*RHSIsLogical=*/IsLogical))
+      return replaceInstUsesWith(I, V);
   }
   if (match(Op0, m_OneUse(m_LogicalOr(m_Value(X), m_Value(Y))))) {
     bool IsLogical = isa<SelectInst>(Op0);
-    // (X || Y) | Op1 --> (X || Op1) || Y
-    if (Value *Res = foldBooleanAndOr(X, Op1, I, /* IsAnd */ false, IsLogical))
-      return replaceInstUsesWith(I, IsLogical ? Builder.CreateLogicalOr(Res, Y)
-                                              : Builder.CreateOr(Res, Y));
-    // (X || Y) | Op1 --> X || (Y | Op1)
-    if (Value *Res = foldBooleanAndOr(Y, Op1, I, /* IsAnd */ false,
-                                      /* IsLogical */ false))
-      return replaceInstUsesWith(I, IsLogical ? Builder.CreateLogicalOr(X, Res)
-                                              : Builder.CreateOr(X, Res));
+    if (auto *V = reassociateBooleanAndOr(Op1, X, Y, I, /*IsAnd=*/false,
+                                          /*RHSIsLogical=*/IsLogical))
+      return replaceInstUsesWith(I, V);
   }
 
   if (Instruction *FoldedFCmps = reassociateFCmps(I, Builder))
@@ -4981,8 +4931,8 @@ Instruction *InstCombinerImpl::visitXor(BinaryOperator &I) {
 
   // (A & B) ^ (A | C) --> A ? ~B : C -- There are 4 commuted variants.
   if (I.getType()->isIntOrIntVectorTy(1) &&
-      match(Op0, m_OneUse(m_LogicalAnd(m_Value(A), m_Value(B)))) &&
-      match(Op1, m_OneUse(m_LogicalOr(m_Value(C), m_Value(D))))) {
+      match(&I, m_c_Xor(m_OneUse(m_LogicalAnd(m_Value(A), m_Value(B))),
+                        m_OneUse(m_LogicalOr(m_Value(C), m_Value(D)))))) {
     bool NeedFreeze = isa<SelectInst>(Op0) && isa<SelectInst>(Op1) && B == D;
     if (B == C || B == D)
       std::swap(A, B);
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
index fd38738..c55c40c 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -839,6 +839,35 @@ InstCombinerImpl::foldIntrinsicWithOverflowCommon(IntrinsicInst *II) {
   if (OptimizeOverflowCheck(WO->getBinaryOp(), WO->isSigned(), WO->getLHS(),
                             WO->getRHS(), *WO, OperationResult, OverflowResult))
     return createOverflowTuple(WO, OperationResult, OverflowResult);
+
+  // See whether we can optimize the overflow check with assumption information.
+  for (User *U : WO->users()) {
+    if (!match(U, m_ExtractValue<1>(m_Value())))
+      continue;
+
+    for (auto &AssumeVH : AC.assumptionsFor(U)) {
+      if (!AssumeVH)
+        continue;
+      CallInst *I = cast<CallInst>(AssumeVH);
+      if (!match(I->getArgOperand(0), m_Not(m_Specific(U))))
+        continue;
+      if (!isValidAssumeForContext(I, II, /*DT=*/nullptr,
+                                   /*AllowEphemerals=*/true))
+        continue;
+      Value *Result =
+          Builder.CreateBinOp(WO->getBinaryOp(), WO->getLHS(), WO->getRHS());
+      Result->takeName(WO);
+      if (auto *Inst = dyn_cast<Instruction>(Result)) {
+        if (WO->isSigned())
+          Inst->setHasNoSignedWrap();
+        else
+          Inst->setHasNoUnsignedWrap();
+      }
+      return createOverflowTuple(WO, Result,
+                                 ConstantInt::getFalse(U->getType()));
+    }
+  }
+
   return nullptr;
 }
 
@@ -2644,8 +2673,11 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) {
     // Propagate sign argument through nested calls:
     // copysign Mag, (copysign ?, X) --> copysign Mag, X
     Value *X;
-    if (match(Sign, m_Intrinsic<Intrinsic::copysign>(m_Value(), m_Value(X))))
-      return replaceOperand(*II, 1, X);
+    if (match(Sign, m_Intrinsic<Intrinsic::copysign>(m_Value(), m_Value(X)))) {
+      Value *CopySign =
+          Builder.CreateCopySign(Mag, X, FMFSource::intersect(II, Sign));
+      return replaceInstUsesWith(*II, CopySign);
+    }
 
     // Clear sign-bit of constant magnitude:
     // copysign -MagC, X --> copysign MagC, X
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
index 0b93799..4ec1af3 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
@@ -1852,15 +1852,13 @@ Instruction *InstCombinerImpl::visitFPTrunc(FPTruncInst &FPT) {
   Value *X;
   Instruction *Op = dyn_cast<Instruction>(FPT.getOperand(0));
   if (Op && Op->hasOneUse()) {
-    IRBuilder<>::FastMathFlagGuard FMFG(Builder);
     FastMathFlags FMF = FPT.getFastMathFlags();
     if (auto *FPMO = dyn_cast<FPMathOperator>(Op))
       FMF &= FPMO->getFastMathFlags();
-    Builder.setFastMathFlags(FMF);
 
     if (match(Op, m_FNeg(m_Value(X)))) {
-      Value *InnerTrunc = Builder.CreateFPTrunc(X, Ty);
-      Value *Neg = Builder.CreateFNeg(InnerTrunc);
+      Value *InnerTrunc = Builder.CreateFPTruncFMF(X, Ty, FMF);
+      Value *Neg = Builder.CreateFNegFMF(InnerTrunc, FMF);
       return replaceInstUsesWith(FPT, Neg);
     }
 
@@ -1870,15 +1868,17 @@ Instruction *InstCombinerImpl::visitFPTrunc(FPTruncInst &FPT) {
     if (match(Op, m_Select(m_Value(Cond), m_FPExt(m_Value(X)), m_Value(Y))) &&
         X->getType() == Ty) {
       // fptrunc (select Cond, (fpext X), Y --> select Cond, X, (fptrunc Y)
-      Value *NarrowY = Builder.CreateFPTrunc(Y, Ty);
-      Value *Sel = Builder.CreateSelect(Cond, X, NarrowY, "narrow.sel", Op);
+      Value *NarrowY = Builder.CreateFPTruncFMF(Y, Ty, FMF);
+      Value *Sel =
+          Builder.CreateSelectFMF(Cond, X, NarrowY, FMF, "narrow.sel", Op);
       return replaceInstUsesWith(FPT, Sel);
     }
     if (match(Op, m_Select(m_Value(Cond), m_Value(Y), m_FPExt(m_Value(X)))) &&
         X->getType() == Ty) {
       // fptrunc (select Cond, Y, (fpext X) --> select Cond, (fptrunc Y), X
-      Value *NarrowY = Builder.CreateFPTrunc(Y, Ty);
-      Value *Sel = Builder.CreateSelect(Cond, NarrowY, X, "narrow.sel", Op);
+      Value *NarrowY = Builder.CreateFPTruncFMF(Y, Ty, FMF);
+      Value *Sel =
+          Builder.CreateSelectFMF(Cond, NarrowY, X, FMF, "narrow.sel", Op);
       return replaceInstUsesWith(FPT, Sel);
     }
   }
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
index d6fdade..2e45725 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
@@ -747,6 +747,8 @@ Instruction *InstCombinerImpl::foldGEPICmp(GEPOperator *GEPLHS, Value *RHS,
                         ConstantExpr::getPointerBitCastOrAddrSpaceCast(
                             cast<Constant>(RHS), Base->getType()));
   } else if (GEPOperator *GEPRHS = dyn_cast<GEPOperator>(RHS)) {
+    GEPNoWrapFlags NW = GEPLHS->getNoWrapFlags() & GEPRHS->getNoWrapFlags();
+
     // If the base pointers are different, but the indices are the same, just
     // compare the base pointer.
     if (PtrBase != GEPRHS->getOperand(0)) {
@@ -764,7 +766,8 @@ Instruction *InstCombinerImpl::foldGEPICmp(GEPOperator *GEPLHS, Value *RHS,
 
       // If all indices are the same, just compare the base pointers.
       Type *BaseType = GEPLHS->getOperand(0)->getType();
-      if (IndicesTheSame && CmpInst::makeCmpResultType(BaseType) == I.getType())
+      if (IndicesTheSame &&
+          CmpInst::makeCmpResultType(BaseType) == I.getType() && CanFold(NW))
         return new ICmpInst(Cond, GEPLHS->getOperand(0), GEPRHS->getOperand(0));
 
       // If we're comparing GEPs with two base pointers that only differ in type
@@ -804,7 +807,6 @@ Instruction *InstCombinerImpl::foldGEPICmp(GEPOperator *GEPLHS, Value *RHS,
       return transformToIndexedCompare(GEPLHS, RHS, Cond, DL, *this);
     }
 
-    GEPNoWrapFlags NW = GEPLHS->getNoWrapFlags() & GEPRHS->getNoWrapFlags();
     if (GEPLHS->getNumOperands() == GEPRHS->getNumOperands() &&
         GEPLHS->getSourceElementType() == GEPRHS->getSourceElementType()) {
       // If the GEPs only differ by one index, compare it.
@@ -2483,9 +2485,8 @@ Instruction *InstCombinerImpl::foldICmpShlConstant(ICmpInst &Cmp,
       // icmp ule i64 (shl X, 32), 8589934592 ->
       // icmp ule i32 (trunc X, i32), 2 ->
       // icmp ult i32 (trunc X, i32), 3
-      if (auto FlippedStrictness =
-              InstCombiner::getFlippedStrictnessPredicateAndConstant(
-                  Pred, ConstantInt::get(ShType->getContext(), C))) {
+      if (auto FlippedStrictness = getFlippedStrictnessPredicateAndConstant(
+              Pred, ConstantInt::get(ShType->getContext(), C))) {
         CmpPred = FlippedStrictness->first;
         RHSC = cast<ConstantInt>(FlippedStrictness->second)->getValue();
       }
@@ -3089,12 +3090,12 @@ Instruction *InstCombinerImpl::foldICmpAddConstant(ICmpInst &Cmp,
     unsigned BW = C.getBitWidth();
     std::bitset<4> Table;
     auto ComputeTable = [&](bool Op0Val, bool Op1Val) {
-      int Res = 0;
+      APInt Res(BW, 0);
       if (Op0Val)
-        Res += isa<ZExtInst>(Ext0) ? 1 : -1;
+        Res += APInt(BW, isa<ZExtInst>(Ext0) ? 1 : -1, /*isSigned=*/true);
       if (Op1Val)
-        Res += isa<ZExtInst>(Ext1) ? 1 : -1;
-      return ICmpInst::compare(APInt(BW, Res, true), C, Pred);
+        Res += APInt(BW, isa<ZExtInst>(Ext1) ? 1 : -1, /*isSigned=*/true);
+      return ICmpInst::compare(Res, C, Pred);
     };
 
     Table[0] = ComputeTable(false, false);
@@ -3278,8 +3279,7 @@ bool InstCombinerImpl::matchThreeWayIntCompare(SelectInst *SI, Value *&LHS,
   if (PredB == ICmpInst::ICMP_SGT && isa<Constant>(RHS2)) {
     // x sgt C-1  <-->  x sge C  <-->  not(x slt C)
     auto FlippedStrictness =
-        InstCombiner::getFlippedStrictnessPredicateAndConstant(
-            PredB, cast<Constant>(RHS2));
+        getFlippedStrictnessPredicateAndConstant(PredB, cast<Constant>(RHS2));
     if (!FlippedStrictness)
       return false;
     assert(FlippedStrictness->first == ICmpInst::ICMP_SGE &&
@@ -6906,79 +6906,6 @@ Instruction *InstCombinerImpl::foldICmpUsingBoolRange(ICmpInst &I) {
   return nullptr;
 }
 
-std::optional<std::pair<CmpPredicate, Constant *>>
-InstCombiner::getFlippedStrictnessPredicateAndConstant(CmpPredicate Pred,
-                                                       Constant *C) {
-  assert(ICmpInst::isRelational(Pred) && ICmpInst::isIntPredicate(Pred) &&
-         "Only for relational integer predicates.");
-
-  Type *Type = C->getType();
-  bool IsSigned = ICmpInst::isSigned(Pred);
-
-  CmpInst::Predicate UnsignedPred = ICmpInst::getUnsignedPredicate(Pred);
-  bool WillIncrement =
-      UnsignedPred == ICmpInst::ICMP_ULE || UnsignedPred == ICmpInst::ICMP_UGT;
-
-  // Check if the constant operand can be safely incremented/decremented
-  // without overflowing/underflowing.
-  auto ConstantIsOk = [WillIncrement, IsSigned](ConstantInt *C) {
-    return WillIncrement ? !C->isMaxValue(IsSigned) : !C->isMinValue(IsSigned);
-  };
-
-  Constant *SafeReplacementConstant = nullptr;
-  if (auto *CI = dyn_cast<ConstantInt>(C)) {
-    // Bail out if the constant can't be safely incremented/decremented.
-    if (!ConstantIsOk(CI))
-      return std::nullopt;
-  } else if (auto *FVTy = dyn_cast<FixedVectorType>(Type)) {
-    unsigned NumElts = FVTy->getNumElements();
-    for (unsigned i = 0; i != NumElts; ++i) {
-      Constant *Elt = C->getAggregateElement(i);
-      if (!Elt)
-        return std::nullopt;
-
-      if (isa<UndefValue>(Elt))
-        continue;
-
-      // Bail out if we can't determine if this constant is min/max or if we
-      // know that this constant is min/max.
-      auto *CI = dyn_cast<ConstantInt>(Elt);
-      if (!CI || !ConstantIsOk(CI))
-        return std::nullopt;
-
-      if (!SafeReplacementConstant)
-        SafeReplacementConstant = CI;
-    }
-  } else if (isa<VectorType>(C->getType())) {
-    // Handle scalable splat
-    Value *SplatC = C->getSplatValue();
-    auto *CI = dyn_cast_or_null<ConstantInt>(SplatC);
-    // Bail out if the constant can't be safely incremented/decremented.
-    if (!CI || !ConstantIsOk(CI))
-      return std::nullopt;
-  } else {
-    // ConstantExpr?
-    return std::nullopt;
-  }
-
-  // It may not be safe to change a compare predicate in the presence of
-  // undefined elements, so replace those elements with the first safe constant
-  // that we found.
-  // TODO: in case of poison, it is safe; let's replace undefs only.
-  if (C->containsUndefOrPoisonElement()) {
-    assert(SafeReplacementConstant && "Replacement constant not set");
-    C = Constant::replaceUndefsWith(C, SafeReplacementConstant);
-  }
-
-  CmpInst::Predicate NewPred = CmpInst::getFlippedStrictnessPredicate(Pred);
-
-  // Increment or decrement the constant.
-  Constant *OneOrNegOne = ConstantInt::get(Type, WillIncrement ? 1 : -1, true);
-  Constant *NewC = ConstantExpr::getAdd(C, OneOrNegOne);
-
-  return std::make_pair(NewPred, NewC);
-}
-
 /// If we have an icmp le or icmp ge instruction with a constant operand, turn
 /// it into the appropriate icmp lt or icmp gt instruction. This transform
 /// allows them to be folded in visitICmpInst.
@@ -6994,8 +6921,7 @@ static ICmpInst *canonicalizeCmpWithConstant(ICmpInst &I) {
   if (!Op1C)
     return nullptr;
 
-  auto FlippedStrictness =
-      InstCombiner::getFlippedStrictnessPredicateAndConstant(Pred, Op1C);
+  auto FlippedStrictness = getFlippedStrictnessPredicateAndConstant(Pred, Op1C);
   if (!FlippedStrictness)
     return nullptr;
 
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
index 3a074ee..f6992119 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
+++ b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
@@ -429,12 +429,12 @@ private:
   Value *foldBooleanAndOr(Value *LHS, Value *RHS, Instruction &I, bool IsAnd,
                           bool IsLogical);
 
+  Value *reassociateBooleanAndOr(Value *LHS, Value *X, Value *Y, Instruction &I,
+                                 bool IsAnd, bool RHSIsLogical);
+
   Instruction *
   canonicalizeConditionalNegationViaMathToSelect(BinaryOperator &i);
 
-  Value *foldAndOrOfICmpsOfAndWithPow2(ICmpInst *LHS, ICmpInst *RHS,
-                                       Instruction *CxtI, bool IsAnd,
-                                       bool IsLogical = false);
   Value *matchSelectFromAndOr(Value *A, Value *B, Value *C, Value *D,
                               bool InvertFalseVal = false);
   Value *getSelectCondition(Value *A, Value *B, bool ABIsTheSame);
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp b/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp
index f85a3c9..0c34cf0 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp
@@ -121,21 +121,17 @@ static Value *foldMulSelectToNegate(BinaryOperator &I,
   // fmul OtherOp, (select Cond, 1.0, -1.0) --> select Cond, OtherOp, -OtherOp
   if (match(&I, m_c_FMul(m_OneUse(m_Select(m_Value(Cond), m_SpecificFP(1.0),
                                            m_SpecificFP(-1.0))),
-                         m_Value(OtherOp)))) {
-    IRBuilder<>::FastMathFlagGuard FMFGuard(Builder);
-    Builder.setFastMathFlags(I.getFastMathFlags());
-    return Builder.CreateSelect(Cond, OtherOp, Builder.CreateFNeg(OtherOp));
-  }
+                         m_Value(OtherOp))))
+    return Builder.CreateSelectFMF(Cond, OtherOp,
+                                   Builder.CreateFNegFMF(OtherOp, &I), &I);
 
   // fmul (select Cond, -1.0, 1.0), OtherOp --> select Cond, -OtherOp, OtherOp
   // fmul OtherOp, (select Cond, -1.0, 1.0) --> select Cond, -OtherOp, OtherOp
   if (match(&I, m_c_FMul(m_OneUse(m_Select(m_Value(Cond), m_SpecificFP(-1.0),
                                            m_SpecificFP(1.0))),
-                         m_Value(OtherOp)))) {
-    IRBuilder<>::FastMathFlagGuard FMFGuard(Builder);
-    Builder.setFastMathFlags(I.getFastMathFlags());
-    return Builder.CreateSelect(Cond, Builder.CreateFNeg(OtherOp), OtherOp);
-  }
+                         m_Value(OtherOp))))
+    return Builder.CreateSelectFMF(Cond, Builder.CreateFNegFMF(OtherOp, &I),
+                                   OtherOp, &I);
 
   return nullptr;
 }
@@ -590,11 +586,9 @@ Instruction *InstCombinerImpl::foldFPSignBitOps(BinaryOperator &I) {
   // fabs(X) / fabs(Y) --> fabs(X / Y)
   if (match(Op0, m_FAbs(m_Value(X))) && match(Op1, m_FAbs(m_Value(Y))) &&
       (Op0->hasOneUse() || Op1->hasOneUse())) {
-    IRBuilder<>::FastMathFlagGuard FMFGuard(Builder);
-    Builder.setFastMathFlags(I.getFastMathFlags());
-    Value *XY = Builder.CreateBinOp(Opcode, X, Y);
-    Value *Fabs = Builder.CreateUnaryIntrinsic(Intrinsic::fabs, XY);
-    Fabs->takeName(&I);
+    Value *XY = Builder.CreateBinOpFMF(Opcode, X, Y, &I);
+    Value *Fabs =
+        Builder.CreateUnaryIntrinsic(Intrinsic::fabs, XY, &I, I.getName());
     return replaceInstUsesWith(I, Fabs);
   }
 
@@ -685,8 +679,6 @@ Instruction *InstCombinerImpl::foldFMulReassoc(BinaryOperator &I) {
       match(Op0, m_AllowReassoc(m_BinOp(Op0BinOp)))) {
     // Everything in this scope folds I with Op0, intersecting their FMF.
     FastMathFlags FMF = I.getFastMathFlags() & Op0BinOp->getFastMathFlags();
-    IRBuilder<>::FastMathFlagGuard FMFGuard(Builder);
-    Builder.setFastMathFlags(FMF);
     Constant *C1;
     if (match(Op0, m_OneUse(m_FDiv(m_Constant(C1), m_Value(X))))) {
       // (C1 / X) * C --> (C * C1) / X
@@ -718,7 +710,7 @@ Instruction *InstCombinerImpl::foldFMulReassoc(BinaryOperator &I) {
       // (X + C1) * C --> (X * C) + (C * C1)
       if (Constant *CC1 =
               ConstantFoldBinaryOpOperands(Instruction::FMul, C, C1, DL)) {
-        Value *XC = Builder.CreateFMul(X, C);
+        Value *XC = Builder.CreateFMulFMF(X, C, FMF);
         return BinaryOperator::CreateFAddFMF(XC, CC1, FMF);
       }
     }
@@ -726,7 +718,7 @@ Instruction *InstCombinerImpl::foldFMulReassoc(BinaryOperator &I) {
       // (C1 - X) * C --> (C * C1) - (X * C)
       if (Constant *CC1 =
               ConstantFoldBinaryOpOperands(Instruction::FMul, C, C1, DL)) {
-        Value *XC = Builder.CreateFMul(X, C);
+        Value *XC = Builder.CreateFMulFMF(X, C, FMF);
         return BinaryOperator::CreateFSubFMF(CC1, XC, FMF);
       }
     }
@@ -740,9 +732,7 @@ Instruction *InstCombinerImpl::foldFMulReassoc(BinaryOperator &I) {
     FastMathFlags FMF = I.getFastMathFlags() & DivOp->getFastMathFlags();
     if (FMF.allowReassoc()) {
       // Sink division: (X / Y) * Z --> (X * Z) / Y
-      IRBuilder<>::FastMathFlagGuard FMFGuard(Builder);
-      Builder.setFastMathFlags(FMF);
-      auto *NewFMul = Builder.CreateFMul(X, Z);
+      auto *NewFMul = Builder.CreateFMulFMF(X, Z, FMF);
       return BinaryOperator::CreateFDivFMF(NewFMul, Y, FMF);
     }
   }
@@ -2066,14 +2056,18 @@ static Instruction *simplifyIRemMulShl(BinaryOperator &I,
   bool ShiftByX = false;
 
   // If V is not nullptr, it will be matched using m_Specific.
-  auto MatchShiftOrMulXC = [](Value *Op, Value *&V, APInt &C) -> bool {
+  auto MatchShiftOrMulXC = [](Value *Op, Value *&V, APInt &C,
+                              bool &PreserveNSW) -> bool {
     const APInt *Tmp = nullptr;
     if ((!V && match(Op, m_Mul(m_Value(V), m_APInt(Tmp)))) ||
         (V && match(Op, m_Mul(m_Specific(V), m_APInt(Tmp)))))
       C = *Tmp;
     else if ((!V && match(Op, m_Shl(m_Value(V), m_APInt(Tmp)))) ||
-             (V && match(Op, m_Shl(m_Specific(V), m_APInt(Tmp)))))
+             (V && match(Op, m_Shl(m_Specific(V), m_APInt(Tmp))))) {
       C = APInt(Tmp->getBitWidth(), 1) << *Tmp;
+      // We cannot preserve NSW when shifting by BW - 1.
+      PreserveNSW = Tmp->ult(Tmp->getBitWidth() - 1);
+    }
     if (Tmp != nullptr)
       return true;
 
@@ -2095,7 +2089,9 @@ static Instruction *simplifyIRemMulShl(BinaryOperator &I,
     return false;
   };
 
-  if (MatchShiftOrMulXC(Op0, X, Y) && MatchShiftOrMulXC(Op1, X, Z)) {
+  bool Op0PreserveNSW = true, Op1PreserveNSW = true;
+  if (MatchShiftOrMulXC(Op0, X, Y, Op0PreserveNSW) &&
+      MatchShiftOrMulXC(Op1, X, Z, Op1PreserveNSW)) {
     // pass
   } else if (MatchShiftCX(Op0, Y, X) && MatchShiftCX(Op1, Z, X)) {
     ShiftByX = true;
@@ -2108,7 +2104,7 @@ static Instruction *simplifyIRemMulShl(BinaryOperator &I,
   OverflowingBinaryOperator *BO0 = cast<OverflowingBinaryOperator>(Op0);
   // TODO: We may be able to deduce more about nsw/nuw of BO0/BO1 based on Y >=
   // Z or Z >= Y.
-  bool BO0HasNSW = BO0->hasNoSignedWrap();
+  bool BO0HasNSW = Op0PreserveNSW && BO0->hasNoSignedWrap();
   bool BO0HasNUW = BO0->hasNoUnsignedWrap();
   bool BO0NoWrap = IsSRem ? BO0HasNSW : BO0HasNUW;
 
@@ -2131,7 +2127,7 @@ static Instruction *simplifyIRemMulShl(BinaryOperator &I,
   };
 
   OverflowingBinaryOperator *BO1 = cast<OverflowingBinaryOperator>(Op1);
-  bool BO1HasNSW = BO1->hasNoSignedWrap();
+  bool BO1HasNSW = Op1PreserveNSW && BO1->hasNoSignedWrap();
   bool BO1HasNUW = BO1->hasNoUnsignedWrap();
   bool BO1NoWrap = IsSRem ? BO1HasNSW : BO1HasNUW;
   // (rem (mul X, Y), (mul nuw/nsw X, Z))
diff --git a/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp b/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp
index 1fcf1c5..80308bf 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp
@@ -765,30 +765,14 @@ Instruction *InstCombinerImpl::foldPHIArgLoadIntoPHI(PHINode &PN) {
   NewPN->addIncoming(InVal, PN.getIncomingBlock(0));
   LoadInst *NewLI =
       new LoadInst(FirstLI->getType(), NewPN, "", IsVolatile, LoadAlignment);
-
-  unsigned KnownIDs[] = {
-    LLVMContext::MD_tbaa,
-    LLVMContext::MD_range,
-    LLVMContext::MD_invariant_load,
-    LLVMContext::MD_alias_scope,
-    LLVMContext::MD_noalias,
-    LLVMContext::MD_nonnull,
-    LLVMContext::MD_align,
-    LLVMContext::MD_dereferenceable,
-    LLVMContext::MD_dereferenceable_or_null,
-    LLVMContext::MD_access_group,
-    LLVMContext::MD_noundef,
-  };
-
-  for (unsigned ID : KnownIDs)
-    NewLI->setMetadata(ID, FirstLI->getMetadata(ID));
+  NewLI->copyMetadata(*FirstLI);
 
   // Add all operands to the new PHI and combine TBAA metadata.
   for (auto Incoming : drop_begin(zip(PN.blocks(), PN.incoming_values()))) {
     BasicBlock *BB = std::get<0>(Incoming);
     Value *V = std::get<1>(Incoming);
     LoadInst *LI = cast<LoadInst>(V);
-    combineMetadata(NewLI, LI, KnownIDs, true);
+    combineMetadataForCSE(NewLI, LI, true);
     Value *NewInVal = LI->getOperand(0);
     if (NewInVal != InVal)
       InVal = nullptr;
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp
index 3d251d6..1eca177 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp
@@ -1225,8 +1225,12 @@ static Value *foldSelectCttzCtlz(ICmpInst *ICI, Value *TrueVal, Value *FalseVal,
   // zext/trunc) have one use (ending at the select), the cttz/ctlz result will
   // not be used if the input is zero. Relax to 'zero is poison' for that case.
   if (II->hasOneUse() && SelectArg->hasOneUse() &&
-      !match(II->getArgOperand(1), m_One()))
+      !match(II->getArgOperand(1), m_One())) {
     II->setArgOperand(1, ConstantInt::getTrue(II->getContext()));
+    // noundef attribute on the intrinsic may no longer be valid.
+    II->dropUBImplyingAttrsAndMetadata();
+    IC.addToWorklist(II);
+  }
 
   return nullptr;
 }
@@ -1685,8 +1689,7 @@ tryToReuseConstantFromSelectInComparison(SelectInst &Sel, ICmpInst &Cmp,
     return nullptr;
 
   // Check the constant we'd have with flipped-strictness predicate.
-  auto FlippedStrictness =
-      InstCombiner::getFlippedStrictnessPredicateAndConstant(Pred, C0);
+  auto FlippedStrictness = getFlippedStrictnessPredicateAndConstant(Pred, C0);
   if (!FlippedStrictness)
     return nullptr;
 
@@ -1966,8 +1969,7 @@ static Value *foldSelectWithConstOpToBinOp(ICmpInst *Cmp, Value *TrueVal,
   Value *RHS;
   SelectPatternFlavor SPF;
   const DataLayout &DL = BOp->getDataLayout();
-  auto Flipped =
-      InstCombiner::getFlippedStrictnessPredicateAndConstant(Predicate, C1);
+  auto Flipped = getFlippedStrictnessPredicateAndConstant(Predicate, C1);
 
   if (C3 == ConstantFoldBinaryOpOperands(Opcode, C1, C2, DL)) {
     SPF = getSelectPattern(Predicate).Flavor;
@@ -2819,9 +2821,9 @@ static Instruction *foldSelectWithSRem(SelectInst &SI, InstCombinerImpl &IC,
   // %cnd = icmp slt i32 %rem, 0
   // %add = add i32 %rem, %n
   // %sel = select i1 %cnd, i32 %add, i32 %rem
-  if (match(TrueVal, m_Add(m_Specific(RemRes), m_Value(Remainder))) &&
+  if (match(TrueVal, m_c_Add(m_Specific(RemRes), m_Value(Remainder))) &&
       match(RemRes, m_SRem(m_Value(Op), m_Specific(Remainder))) &&
-      IC.isKnownToBeAPowerOfTwo(Remainder, /*OrZero*/ true) &&
+      IC.isKnownToBeAPowerOfTwo(Remainder, /*OrZero=*/true) &&
       FalseVal == RemRes)
     return FoldToBitwiseAnd(Remainder);
 
@@ -3769,22 +3771,9 @@ static Value *foldSelectIntoAddConstant(SelectInst &SI,
   if (!SIFOp || !SIFOp->hasNoSignedZeros() || !SIFOp->hasNoNaNs())
     return nullptr;
 
-  // select((fcmp Pred, X, 0), (fadd X, C), C)
-  //      => fadd((select (fcmp Pred, X, 0), X, 0), C)
-  //
-  // Pred := OGT, OGE, OLT, OLE, UGT, UGE, ULT, and ULE
-  Instruction *FAdd;
-  Constant *C;
-  Value *X, *Z;
-  CmpPredicate Pred;
-
-  // Note: OneUse check for `Cmp` is necessary because it makes sure that other
-  // InstCombine folds don't undo this transformation and cause an infinite
-  // loop. Furthermore, it could also increase the operation count.
-  if (match(&SI, m_Select(m_OneUse(m_FCmp(Pred, m_Value(X), m_Value(Z))),
-                          m_OneUse(m_Instruction(FAdd)), m_Constant(C))) ||
-      match(&SI, m_Select(m_OneUse(m_FCmp(Pred, m_Value(X), m_Value(Z))),
-                          m_Constant(C), m_OneUse(m_Instruction(FAdd))))) {
+  auto TryFoldIntoAddConstant =
+      [&Builder, &SI](CmpInst::Predicate Pred, Value *X, Value *Z,
+                      Instruction *FAdd, Constant *C, bool Swapped) -> Value * {
     // Only these relational predicates can be transformed into maxnum/minnum
     // intrinsic.
     if (!CmpInst::isRelational(Pred) || !match(Z, m_AnyZeroFP()))
@@ -3793,7 +3782,8 @@ static Value *foldSelectIntoAddConstant(SelectInst &SI,
     if (!match(FAdd, m_FAdd(m_Specific(X), m_Specific(C))))
       return nullptr;
 
-    Value *NewSelect = Builder.CreateSelect(SI.getCondition(), X, Z, "", &SI);
+    Value *NewSelect = Builder.CreateSelect(SI.getCondition(), Swapped ? Z : X,
+                                            Swapped ? X : Z, "", &SI);
     NewSelect->takeName(&SI);
 
     Value *NewFAdd = Builder.CreateFAdd(NewSelect, C);
@@ -3808,7 +3798,27 @@ static Value *foldSelectIntoAddConstant(SelectInst &SI,
     cast<Instruction>(NewSelect)->setFastMathFlags(NewFMF);
 
     return NewFAdd;
-  }
+  };
+
+  // select((fcmp Pred, X, 0), (fadd X, C), C)
+  //      => fadd((select (fcmp Pred, X, 0), X, 0), C)
+  //
+  // Pred := OGT, OGE, OLT, OLE, UGT, UGE, ULT, and ULE
+  Instruction *FAdd;
+  Constant *C;
+  Value *X, *Z;
+  CmpPredicate Pred;
+
+  // Note: OneUse check for `Cmp` is necessary because it makes sure that other
+  // InstCombine folds don't undo this transformation and cause an infinite
+  // loop. Furthermore, it could also increase the operation count.
+  if (match(&SI, m_Select(m_OneUse(m_FCmp(Pred, m_Value(X), m_Value(Z))),
+                          m_OneUse(m_Instruction(FAdd)), m_Constant(C))))
+    return TryFoldIntoAddConstant(Pred, X, Z, FAdd, C, /*Swapped=*/false);
+
+  if (match(&SI, m_Select(m_OneUse(m_FCmp(Pred, m_Value(X), m_Value(Z))),
+                          m_Constant(C), m_OneUse(m_Instruction(FAdd)))))
+    return TryFoldIntoAddConstant(Pred, X, Z, FAdd, C, /*Swapped=*/true);
 
   return nullptr;
 }
@@ -3902,12 +3912,11 @@ Instruction *InstCombinerImpl::visitSelectInst(SelectInst &SI) {
       // (X ugt Y) ? X : Y -> (X ole Y) ? Y : X
       if (FCmp->hasOneUse() && FCmpInst::isUnordered(Pred)) {
         FCmpInst::Predicate InvPred = FCmp->getInversePredicate();
-        IRBuilder<>::FastMathFlagGuard FMFG(Builder);
         // FIXME: The FMF should propagate from the select, not the fcmp.
-        Builder.setFastMathFlags(FCmp->getFastMathFlags());
-        Value *NewCond = Builder.CreateFCmp(InvPred, Cmp0, Cmp1,
-                                            FCmp->getName() + ".inv");
-        Value *NewSel = Builder.CreateSelect(NewCond, FalseVal, TrueVal);
+        Value *NewCond = Builder.CreateFCmpFMF(InvPred, Cmp0, Cmp1, FCmp,
+                                               FCmp->getName() + ".inv");
+        Value *NewSel =
+            Builder.CreateSelectFMF(NewCond, FalseVal, TrueVal, FCmp);
         return replaceInstUsesWith(SI, NewSel);
       }
     }
@@ -4072,15 +4081,11 @@ Instruction *InstCombinerImpl::visitSelectInst(SelectInst &SI) {
         CmpInst::Predicate MinMaxPred = getMinMaxPred(SPF, SPR.Ordered);
 
         Value *Cmp;
-        if (CmpInst::isIntPredicate(MinMaxPred)) {
+        if (CmpInst::isIntPredicate(MinMaxPred))
           Cmp = Builder.CreateICmp(MinMaxPred, LHS, RHS);
-        } else {
-          IRBuilder<>::FastMathFlagGuard FMFG(Builder);
-          auto FMF =
-              cast<FPMathOperator>(SI.getCondition())->getFastMathFlags();
-          Builder.setFastMathFlags(FMF);
-          Cmp = Builder.CreateFCmp(MinMaxPred, LHS, RHS);
-        }
+        else
+          Cmp = Builder.CreateFCmpFMF(MinMaxPred, LHS, RHS,
+                                      cast<Instruction>(SI.getCondition()));
 
         Value *NewSI = Builder.CreateSelect(Cmp, LHS, RHS, SI.getName(), &SI);
         if (!IsCastNeeded)
diff --git a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
index 934156f..2fb60ef 100644
--- a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
@@ -939,12 +939,11 @@ Instruction *InstCombinerImpl::foldBinOpShiftWithShift(BinaryOperator &I) {
                m_OneUse(m_Shift(m_Value(Y), m_Value(Shift)))))
       return nullptr;
     if (!match(I.getOperand(1 - ShOpnum),
-               m_BinOp(m_Value(ShiftedX), m_Value(Mask))))
+               m_c_BinOp(m_CombineAnd(
+                             m_OneUse(m_Shift(m_Value(X), m_Specific(Shift))),
+                             m_Value(ShiftedX)),
+                         m_Value(Mask))))
       return nullptr;
-
-    if (!match(ShiftedX, m_OneUse(m_Shift(m_Value(X), m_Specific(Shift)))))
-      return nullptr;
-
     // Make sure we are matching instruction shifts and not ConstantExpr
     auto *IY = dyn_cast<Instruction>(I.getOperand(ShOpnum));
     auto *IX = dyn_cast<Instruction>(ShiftedX);
@@ -1822,12 +1821,29 @@ Instruction *InstCombinerImpl::foldOpIntoPhi(Instruction &I, PHINode *PN,
       continue;
     }
 
-    // If the only use of phi is comparing it with a constant then we can
-    // put this comparison in the incoming BB directly after a ucmp/scmp call
-    // because we know that it will simplify to a single icmp.
-    const APInt *Ignored;
-    if (isa<CmpIntrinsic>(InVal) && InVal->hasOneUser() &&
-        match(&I, m_ICmp(m_Specific(PN), m_APInt(Ignored)))) {
+    // Handle some cases that can't be fully simplified, but where we know that
+    // the two instructions will fold into one.
+    auto WillFold = [&]() {
+      if (!InVal->hasOneUser())
+        return false;
+
+      // icmp of ucmp/scmp with constant will fold to icmp.
+      const APInt *Ignored;
+      if (isa<CmpIntrinsic>(InVal) &&
+          match(&I, m_ICmp(m_Specific(PN), m_APInt(Ignored))))
+        return true;
+
+      // icmp eq zext(bool), 0 will fold to !bool.
+      if (isa<ZExtInst>(InVal) &&
+          cast<ZExtInst>(InVal)->getSrcTy()->isIntOrIntVectorTy(1) &&
+          match(&I,
+                m_SpecificICmp(ICmpInst::ICMP_EQ, m_Specific(PN), m_Zero())))
+        return true;
+
+      return false;
+    };
+
+    if (WillFold()) {
       OpsToMoveUseToIncomingBB.push_back(i);
       NewPhiValues.push_back(nullptr);
       continue;
@@ -2782,6 +2798,7 @@ static Instruction *foldGEPOfPhi(GetElementPtrInst &GEP, PHINode *PN,
   // loop iteration).
   if (Op1 == &GEP)
     return nullptr;
+  GEPNoWrapFlags NW = Op1->getNoWrapFlags();
 
   int DI = -1;
 
@@ -2838,6 +2855,8 @@ static Instruction *foldGEPOfPhi(GetElementPtrInst &GEP, PHINode *PN,
         }
       }
     }
+
+    NW &= Op2->getNoWrapFlags();
   }
 
   // If not all GEPs are identical we'll have to create a new PHI node.
@@ -2847,6 +2866,8 @@ static Instruction *foldGEPOfPhi(GetElementPtrInst &GEP, PHINode *PN,
     return nullptr;
 
   auto *NewGEP = cast<GetElementPtrInst>(Op1->clone());
+  NewGEP->setNoWrapFlags(NW);
+
   if (DI == -1) {
     // All the GEPs feeding the PHI are identical. Clone one down into our
     // BB so that it can be merged with the current GEP.
diff --git a/llvm/lib/Transforms/Instrumentation/GCOVProfiling.cpp b/llvm/lib/Transforms/Instrumentation/GCOVProfiling.cpp
index f9be7f9..6e86ffd 100644
--- a/llvm/lib/Transforms/Instrumentation/GCOVProfiling.cpp
+++ b/llvm/lib/Transforms/Instrumentation/GCOVProfiling.cpp
@@ -61,7 +61,7 @@ enum : uint32_t {
 };
 
 static cl::opt<std::string> DefaultGCOVVersion("default-gcov-version",
-                                               cl::init("408*"), cl::Hidden,
+                                               cl::init("0000"), cl::Hidden,
                                                cl::ValueRequired);
 
 static cl::opt<bool> AtomicCounter("gcov-atomic-counter", cl::Hidden,
@@ -154,6 +154,7 @@ private:
   GCOVOptions Options;
   llvm::endianness Endian;
   raw_ostream *os;
+  int Version = 0;
 
   // Checksum, produced by hash of EdgeDestinations
   SmallVector<uint32_t, 4> FileChecksums;
@@ -334,12 +335,9 @@ namespace {
         : GCOVRecord(P), SP(SP), EndLine(EndLine), Ident(Ident),
           Version(Version), EntryBlock(P, 0), ReturnBlock(P, 1) {
       LLVM_DEBUG(dbgs() << "Function: " << getFunctionName(SP) << "\n");
-      bool ExitBlockBeforeBody = Version >= 48;
-      uint32_t i = ExitBlockBeforeBody ? 2 : 1;
+      uint32_t i = 2;
       for (BasicBlock &BB : *F)
         Blocks.insert(std::make_pair(&BB, GCOVBlock(P, i++)));
-      if (!ExitBlockBeforeBody)
-        ReturnBlock.Number = i;
 
       std::string FunctionNameAndLine;
       raw_string_ostream FNLOS(FunctionNameAndLine);
@@ -363,44 +361,28 @@ namespace {
     void writeOut(uint32_t CfgChecksum) {
       write(GCOV_TAG_FUNCTION);
       SmallString<128> Filename = getFilename(SP);
-      uint32_t BlockLen =
-          2 + (Version >= 47) + wordsOfString(getFunctionName(SP));
-      if (Version < 80)
-        BlockLen += wordsOfString(Filename) + 1;
-      else
-        BlockLen += 1 + wordsOfString(Filename) + 3 + (Version >= 90);
+      uint32_t BlockLen = 3 + wordsOfString(getFunctionName(SP));
+      BlockLen += 1 + wordsOfString(Filename) + 4;
 
       write(BlockLen);
       write(Ident);
       write(FuncChecksum);
-      if (Version >= 47)
-        write(CfgChecksum);
+      write(CfgChecksum);
       writeString(getFunctionName(SP));
-      if (Version < 80) {
-        writeString(Filename);
-        write(SP->getLine());
-      } else {
-        write(SP->isArtificial()); // artificial
-        writeString(Filename);
-        write(SP->getLine()); // start_line
-        write(0);             // start_column
-        // EndLine is the last line with !dbg. It is not the } line as in GCC,
-        // but good enough.
-        write(EndLine);
-        if (Version >= 90)
-          write(0); // end_column
-      }
+
+      write(SP->isArtificial()); // artificial
+      writeString(Filename);
+      write(SP->getLine()); // start_line
+      write(0);             // start_column
+      // EndLine is the last line with !dbg. It is not the } line as in GCC,
+      // but good enough.
+      write(EndLine);
+      write(0); // end_column
 
       // Emit count of blocks.
       write(GCOV_TAG_BLOCKS);
-      if (Version < 80) {
-        write(Blocks.size() + 2);
-        for (int i = Blocks.size() + 2; i; --i)
-          write(0);
-      } else {
-        write(1);
-        write(Blocks.size() + 2);
-      }
+      write(1);
+      write(Blocks.size() + 2);
       LLVM_DEBUG(dbgs() << (Blocks.size() + 1) << " blocks\n");
 
       // Emit edges between blocks.
@@ -767,7 +749,6 @@ bool GCOVProfiler::emitProfileNotes(
     function_ref<BlockFrequencyInfo *(Function &F)> GetBFI,
     function_ref<BranchProbabilityInfo *(Function &F)> GetBPI,
     function_ref<const TargetLibraryInfo &(Function &F)> GetTLI) {
-  int Version;
   {
     uint8_t c3 = Options.Version[0];
     uint8_t c2 = Options.Version[1];
@@ -775,6 +756,11 @@ bool GCOVProfiler::emitProfileNotes(
     Version = c3 >= 'A' ? (c3 - 'A') * 100 + (c2 - '0') * 10 + c1 - '0'
                         : (c3 - '0') * 10 + c1 - '0';
   }
+  // Emit .gcno files that are compatible with GCC 11.1.
+  if (Version < 111) {
+    Version = 111;
+    memcpy(Options.Version, "B11*", 4);
+  }
 
   bool EmitGCDA = Options.EmitData;
   for (unsigned i = 0, e = CUNode->getNumOperands(); i != e; ++i) {
@@ -973,10 +959,8 @@ bool GCOVProfiler::emitProfileNotes(
         out.write(Tmp, 4);
       }
       write(Stamp);
-      if (Version >= 90)
-        writeString(""); // unuseful current_working_directory
-      if (Version >= 80)
-        write(0); // unuseful has_unexecuted_blocks
+      writeString("."); // unuseful current_working_directory
+      write(0);         // unuseful has_unexecuted_blocks
 
       for (auto &Func : Funcs)
         Func->writeOut(Stamp);
diff --git a/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp
index 530061e..2031728 100644
--- a/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp
@@ -192,7 +192,7 @@ static cl::opt<bool>
                    cl::Hidden);
 
 static cl::opt<int> ClHotPercentileCutoff("hwasan-percentile-cutoff-hot",
-                                          cl::desc("Hot percentile cuttoff."));
+                                          cl::desc("Hot percentile cutoff."));
 
 static cl::opt<float>
     ClRandomSkipRate("hwasan-random-rate",
diff --git a/llvm/lib/Transforms/Instrumentation/LowerAllowCheckPass.cpp b/llvm/lib/Transforms/Instrumentation/LowerAllowCheckPass.cpp
index 2418030..f27798c 100644
--- a/llvm/lib/Transforms/Instrumentation/LowerAllowCheckPass.cpp
+++ b/llvm/lib/Transforms/Instrumentation/LowerAllowCheckPass.cpp
@@ -30,7 +30,7 @@ using namespace llvm;
 
 static cl::opt<int>
     HotPercentileCutoff("lower-allow-check-percentile-cutoff-hot",
-                        cl::desc("Hot percentile cuttoff."));
+                        cl::desc("Hot percentile cutoff."));
 
 static cl::opt<float>
     RandomRate("lower-allow-check-random-rate",
diff --git a/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp
index 471086c..db4d62e 100644
--- a/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp
+++ b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp
@@ -158,11 +158,11 @@ STATISTIC(NumCoveredBlocks, "Number of basic blocks that were executed");
 
 // Command line option to specify the file to read profile from. This is
 // mainly used for testing.
-static cl::opt<std::string>
-    PGOTestProfileFile("pgo-test-profile-file", cl::init(""), cl::Hidden,
-                       cl::value_desc("filename"),
-                       cl::desc("Specify the path of profile data file. This is"
-                                "mainly for test purpose."));
+static cl::opt<std::string> PGOTestProfileFile(
+    "pgo-test-profile-file", cl::init(""), cl::Hidden,
+    cl::value_desc("filename"),
+    cl::desc("Specify the path of profile data file. This is "
+             "mainly for test purpose."));
 static cl::opt<std::string> PGOTestProfileRemappingFile(
     "pgo-test-profile-remapping-file", cl::init(""), cl::Hidden,
     cl::value_desc("filename"),
@@ -186,7 +186,7 @@ static cl::opt<unsigned> MaxNumAnnotations(
 // to write to the metadata for a single memop intrinsic.
 static cl::opt<unsigned> MaxNumMemOPAnnotations(
     "memop-max-annotations", cl::init(4), cl::Hidden,
-    cl::desc("Max number of preicise value annotations for a single memop"
+    cl::desc("Max number of precise value annotations for a single memop"
              "intrinsic"));
 
 // Command line option to control appending FunctionHash to the name of a COMDAT
@@ -291,13 +291,13 @@ static cl::opt<bool> PGOVerifyHotBFI(
     cl::desc("Print out the non-match BFI count if a hot raw profile count "
              "becomes non-hot, or a cold raw profile count becomes hot. "
              "The print is enabled under -Rpass-analysis=pgo, or "
-             "internal option -pass-remakrs-analysis=pgo."));
+             "internal option -pass-remarks-analysis=pgo."));
 
 static cl::opt<bool> PGOVerifyBFI(
     "pgo-verify-bfi", cl::init(false), cl::Hidden,
     cl::desc("Print out mismatched BFI counts after setting profile metadata "
              "The print is enabled under -Rpass-analysis=pgo, or "
-             "internal option -pass-remakrs-analysis=pgo."));
+             "internal option -pass-remarks-analysis=pgo."));
 
 static cl::opt<unsigned> PGOVerifyBFIRatio(
     "pgo-verify-bfi-ratio", cl::init(2), cl::Hidden,
diff --git a/llvm/lib/Transforms/Instrumentation/TypeSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/TypeSanitizer.cpp
index 1961095..9cd81f3 100644
--- a/llvm/lib/Transforms/Instrumentation/TypeSanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/TypeSanitizer.cpp
@@ -70,7 +70,7 @@ namespace {
 /// violations.
 struct TypeSanitizer {
   TypeSanitizer(Module &M);
-  bool run(Function &F, const TargetLibraryInfo &TLI);
+  bool sanitizeFunction(Function &F, const TargetLibraryInfo &TLI);
   void instrumentGlobals(Module &M);
 
 private:
@@ -510,7 +510,8 @@ void collectMemAccessInfo(
   }
 }
 
-bool TypeSanitizer::run(Function &F, const TargetLibraryInfo &TLI) {
+bool TypeSanitizer::sanitizeFunction(Function &F,
+                                     const TargetLibraryInfo &TLI) {
   // This is required to prevent instrumenting call to __tysan_init from within
   // the module constructor.
   if (&F == TysanCtorFunction.getCallee() || &F == TysanGlobalsSetTypeFunction)
@@ -876,15 +877,8 @@ bool TypeSanitizer::instrumentMemInst(Value *V, Instruction *ShadowBase,
   return true;
 }
 
-PreservedAnalyses TypeSanitizerPass::run(Function &F,
-                                         FunctionAnalysisManager &FAM) {
-  TypeSanitizer TySan(*F.getParent());
-  TySan.run(F, FAM.getResult<TargetLibraryAnalysis>(F));
-  return PreservedAnalyses::none();
-}
-
-PreservedAnalyses ModuleTypeSanitizerPass::run(Module &M,
-                                               ModuleAnalysisManager &AM) {
+PreservedAnalyses TypeSanitizerPass::run(Module &M,
+                                         ModuleAnalysisManager &MAM) {
   Function *TysanCtorFunction;
   std::tie(TysanCtorFunction, std::ignore) =
       createSanitizerCtorAndInitFunctions(M, kTysanModuleCtorName,
@@ -894,5 +888,12 @@ PreservedAnalyses ModuleTypeSanitizerPass::run(Module &M,
   TypeSanitizer TySan(M);
   TySan.instrumentGlobals(M);
   appendToGlobalCtors(M, TysanCtorFunction, 0);
+
+  auto &FAM = MAM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
+  for (Function &F : M) {
+    const TargetLibraryInfo &TLI = FAM.getResult<TargetLibraryAnalysis>(F);
+    TySan.sanitizeFunction(F, TLI);
+  }
+
   return PreservedAnalyses::none();
 }
diff --git a/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp b/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp
index ead07ed..91a3c3f 100644
--- a/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp
+++ b/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp
@@ -216,7 +216,7 @@ struct StackEntry {
   StackEntry(unsigned NumIn, unsigned NumOut, bool IsSigned,
              SmallVector<Value *, 2> ValuesToRelease)
       : NumIn(NumIn), NumOut(NumOut), IsSigned(IsSigned),
-        ValuesToRelease(ValuesToRelease) {}
+        ValuesToRelease(std::move(ValuesToRelease)) {}
 };
 
 struct ConstraintTy {
@@ -726,8 +726,8 @@ ConstraintInfo::getConstraint(CmpInst::Predicate Pred, Value *Op0, Value *Op1,
   }
 
   for (const auto &KV : VariablesB) {
-    if (SubOverflow(R[GetOrAddIndex(KV.Variable)], KV.Coefficient,
-                    R[GetOrAddIndex(KV.Variable)]))
+    auto &Coeff = R[GetOrAddIndex(KV.Variable)];
+    if (SubOverflow(Coeff, KV.Coefficient, Coeff))
       return {};
     auto I =
         KnownNonNegativeVariables.insert({KV.Variable, KV.IsKnownNonNegative});
@@ -759,9 +759,9 @@ ConstraintInfo::getConstraint(CmpInst::Predicate Pred, Value *Op0, Value *Op1,
     if (!KV.second ||
         (!Value2Index.contains(KV.first) && !NewIndexMap.contains(KV.first)))
       continue;
-    SmallVector<int64_t, 8> C(Value2Index.size() + NewVariables.size() + 1, 0);
+    auto &C = Res.ExtraInfo.emplace_back(
+        Value2Index.size() + NewVariables.size() + 1, 0);
     C[GetOrAddIndex(KV.first)] = -1;
-    Res.ExtraInfo.push_back(C);
   }
   return Res;
 }
@@ -1591,53 +1591,52 @@ void ConstraintInfo::addFact(CmpInst::Predicate Pred, Value *A, Value *B,
 
   LLVM_DEBUG(dbgs() << "Adding '"; dumpUnpackedICmp(dbgs(), Pred, A, B);
              dbgs() << "'\n");
-  bool Added = false;
   auto &CSToUse = getCS(R.IsSigned);
   if (R.Coefficients.empty())
     return;
 
-  Added |= CSToUse.addVariableRowFill(R.Coefficients);
+  bool Added = CSToUse.addVariableRowFill(R.Coefficients);
+  if (!Added)
+    return;
 
   // If R has been added to the system, add the new variables and queue it for
   // removal once it goes out-of-scope.
-  if (Added) {
-    SmallVector<Value *, 2> ValuesToRelease;
-    auto &Value2Index = getValue2Index(R.IsSigned);
-    for (Value *V : NewVariables) {
-      Value2Index.insert({V, Value2Index.size() + 1});
-      ValuesToRelease.push_back(V);
-    }
-
-    LLVM_DEBUG({
-      dbgs() << "  constraint: ";
-      dumpConstraint(R.Coefficients, getValue2Index(R.IsSigned));
-      dbgs() << "\n";
-    });
+  SmallVector<Value *, 2> ValuesToRelease;
+  auto &Value2Index = getValue2Index(R.IsSigned);
+  for (Value *V : NewVariables) {
+    Value2Index.insert({V, Value2Index.size() + 1});
+    ValuesToRelease.push_back(V);
+  }
 
-    DFSInStack.emplace_back(NumIn, NumOut, R.IsSigned,
-                            std::move(ValuesToRelease));
-
-    if (!R.IsSigned) {
-      for (Value *V : NewVariables) {
-        ConstraintTy VarPos(SmallVector<int64_t, 8>(Value2Index.size() + 1, 0),
-                            false, false, false);
-        VarPos.Coefficients[Value2Index[V]] = -1;
-        CSToUse.addVariableRow(VarPos.Coefficients);
-        DFSInStack.emplace_back(NumIn, NumOut, R.IsSigned,
-                                SmallVector<Value *, 2>());
-      }
-    }
+  LLVM_DEBUG({
+    dbgs() << "  constraint: ";
+    dumpConstraint(R.Coefficients, getValue2Index(R.IsSigned));
+    dbgs() << "\n";
+  });
 
-    if (R.isEq()) {
-      // Also add the inverted constraint for equality constraints.
-      for (auto &Coeff : R.Coefficients)
-        Coeff *= -1;
-      CSToUse.addVariableRowFill(R.Coefficients);
+  DFSInStack.emplace_back(NumIn, NumOut, R.IsSigned,
+                          std::move(ValuesToRelease));
 
+  if (!R.IsSigned) {
+    for (Value *V : NewVariables) {
+      ConstraintTy VarPos(SmallVector<int64_t, 8>(Value2Index.size() + 1, 0),
+                          false, false, false);
+      VarPos.Coefficients[Value2Index[V]] = -1;
+      CSToUse.addVariableRow(VarPos.Coefficients);
       DFSInStack.emplace_back(NumIn, NumOut, R.IsSigned,
                               SmallVector<Value *, 2>());
     }
   }
+
+  if (R.isEq()) {
+    // Also add the inverted constraint for equality constraints.
+    for (auto &Coeff : R.Coefficients)
+      Coeff *= -1;
+    CSToUse.addVariableRowFill(R.Coefficients);
+
+    DFSInStack.emplace_back(NumIn, NumOut, R.IsSigned,
+                            SmallVector<Value *, 2>());
+  }
 }
 
 static bool replaceSubOverflowUses(IntrinsicInst *II, Value *A, Value *B,
diff --git a/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp b/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp
index 3c4a40f..8a5c506 100644
--- a/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp
+++ b/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp
@@ -109,7 +109,7 @@ static cl::opt<unsigned> MaxNumVisitiedPaths(
     "dfa-max-num-visited-paths",
     cl::desc(
         "Max number of blocks visited while enumerating paths around a switch"),
-    cl::Hidden, cl::init(2000));
+    cl::Hidden, cl::init(2500));
 
 static cl::opt<unsigned>
     MaxNumPaths("dfa-max-num-paths",
@@ -754,17 +754,15 @@ private:
     return Res;
   }
 
-  /// Walk the use-def chain and collect all the state-defining instructions.
-  ///
-  /// Return an empty map if unpredictable values encountered inside the basic
-  /// blocks of \p LoopPaths.
+  /// Walk the use-def chain and collect all the state-defining blocks and the
+  /// PHI nodes in those blocks that define the state.
   StateDefMap getStateDefMap() const {
     StateDefMap Res;
-    Value *FirstDef = Switch->getOperand(0);
-    assert(isa<PHINode>(FirstDef) && "The first definition must be a phi.");
+    PHINode *FirstDef = dyn_cast<PHINode>(Switch->getOperand(0));
+    assert(FirstDef && "The first definition must be a phi.");
 
     SmallVector<PHINode *, 8> Stack;
-    Stack.push_back(dyn_cast<PHINode>(FirstDef));
+    Stack.push_back(FirstDef);
     SmallSet<Value *, 16> SeenValues;
 
     while (!Stack.empty()) {
@@ -774,18 +772,15 @@ private:
       SeenValues.insert(CurPhi);
 
       for (BasicBlock *IncomingBB : CurPhi->blocks()) {
-        Value *Incoming = CurPhi->getIncomingValueForBlock(IncomingBB);
+        PHINode *IncomingPhi =
+            dyn_cast<PHINode>(CurPhi->getIncomingValueForBlock(IncomingBB));
+        if (!IncomingPhi)
+          continue;
         bool IsOutsideLoops = !SwitchOuterLoop->contains(IncomingBB);
-        if (Incoming == FirstDef || isa<ConstantInt>(Incoming) ||
-            SeenValues.contains(Incoming) || IsOutsideLoops) {
+        if (SeenValues.contains(IncomingPhi) || IsOutsideLoops)
           continue;
-        }
-
-        // Any unpredictable value inside the loops means we must bail out.
-        if (!isa<PHINode>(Incoming))
-          return StateDefMap();
 
-        Stack.push_back(cast<PHINode>(Incoming));
+        Stack.push_back(IncomingPhi);
       }
     }
 
diff --git a/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp b/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
index ba1c224..3c82eed 100644
--- a/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
@@ -128,7 +128,7 @@ static cl::opt<bool, true>
 
 static cl::opt<bool> UseLIRCodeSizeHeurs(
     "use-lir-code-size-heurs",
-    cl::desc("Use loop idiom recognition code size heuristics when compiling"
+    cl::desc("Use loop idiom recognition code size heuristics when compiling "
              "with -Os/-Oz"),
     cl::init(true), cl::Hidden);
 
diff --git a/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp b/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
index 260cc72..0903488 100644
--- a/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
@@ -104,7 +104,7 @@ static cl::opt<unsigned> UnrollMaxPercentThresholdBoost(
 
 static cl::opt<unsigned> UnrollMaxIterationsCountToAnalyze(
     "unroll-max-iteration-count-to-analyze", cl::init(10), cl::Hidden,
-    cl::desc("Don't allow loop unrolling to simulate more than this number of"
+    cl::desc("Don't allow loop unrolling to simulate more than this number of "
              "iterations when checking full unroll profitability"));
 
 static cl::opt<unsigned> UnrollCount(
diff --git a/llvm/lib/Transforms/Scalar/LoopVersioningLICM.cpp b/llvm/lib/Transforms/Scalar/LoopVersioningLICM.cpp
index f58dcb5..6e91c4f 100644
--- a/llvm/lib/Transforms/Scalar/LoopVersioningLICM.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopVersioningLICM.cpp
@@ -95,7 +95,7 @@ static const char *LICMVersioningMetaData = "llvm.loop.licm_versioning.disable";
 /// invariant instructions in a loop.
 static cl::opt<float>
     LVInvarThreshold("licm-versioning-invariant-threshold",
-                     cl::desc("LoopVersioningLICM's minimum allowed percentage"
+                     cl::desc("LoopVersioningLICM's minimum allowed percentage "
                               "of possible invariant instructions per loop"),
                      cl::init(25), cl::Hidden);
 
diff --git a/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
index bb98b3d..5f7cb92 100644
--- a/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
+++ b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
@@ -345,10 +345,14 @@ static bool writtenBetween(MemorySSA *MSSA, BatchAAResults &AA,
 static void combineAAMetadata(Instruction *ReplInst, Instruction *I) {
   // FIXME: MD_tbaa_struct and MD_mem_parallel_loop_access should also be
   // handled here, but combineMetadata doesn't support them yet
-  unsigned KnownIDs[] = {LLVMContext::MD_tbaa, LLVMContext::MD_alias_scope,
-                         LLVMContext::MD_noalias,
-                         LLVMContext::MD_invariant_group,
-                         LLVMContext::MD_access_group};
+  unsigned KnownIDs[] = {
+      LLVMContext::MD_tbaa,         LLVMContext::MD_alias_scope,
+      LLVMContext::MD_noalias,      LLVMContext::MD_invariant_group,
+      LLVMContext::MD_access_group, LLVMContext::MD_prof,
+      LLVMContext::MD_memprof,      LLVMContext::MD_callsite};
+  // FIXME: https://github.com/llvm/llvm-project/issues/121495
+  // Use custom AA metadata combining handling instead of combineMetadata, which
+  // is meant for CSE and will drop any metadata not in the KnownIDs list.
   combineMetadata(ReplInst, I, KnownIDs, true);
 }
 
diff --git a/llvm/lib/Transforms/Utils/AssumeBundleBuilder.cpp b/llvm/lib/Transforms/Utils/AssumeBundleBuilder.cpp
index 1d4f561..b499ef8 100644
--- a/llvm/lib/Transforms/Utils/AssumeBundleBuilder.cpp
+++ b/llvm/lib/Transforms/Utils/AssumeBundleBuilder.cpp
@@ -28,8 +28,8 @@ using namespace llvm;
 namespace llvm {
 cl::opt<bool> ShouldPreserveAllAttributes(
     "assume-preserve-all", cl::init(false), cl::Hidden,
-    cl::desc("enable preservation of all attrbitues. even those that are "
-             "unlikely to be usefull"));
+    cl::desc("enable preservation of all attributes. even those that are "
+             "unlikely to be useful"));
 
 cl::opt<bool> EnableKnowledgeRetention(
     "enable-knowledge-retention", cl::init(false), cl::Hidden,
diff --git a/llvm/lib/Transforms/Utils/EntryExitInstrumenter.cpp b/llvm/lib/Transforms/Utils/EntryExitInstrumenter.cpp
index 47bb319..d47f1b4 100644
--- a/llvm/lib/Transforms/Utils/EntryExitInstrumenter.cpp
+++ b/llvm/lib/Transforms/Utils/EntryExitInstrumenter.cpp
@@ -48,6 +48,21 @@ static void insertCall(Function &CurFn, StringRef Func,
                                                   /*isVarArg=*/false)),
           {GV}, "", InsertionPt);
       Call->setDebugLoc(DL);
+    } else if (TargetTriple.isRISCV() || TargetTriple.isAArch64() ||
+               TargetTriple.isLoongArch()) {
+      // On RISC-V, AArch64, and LoongArch, the `_mcount` function takes
+      // `__builtin_return_address(0)` as an argument since
+      // `__builtin_return_address(1)` is not available on these platforms.
+      Instruction *RetAddr = CallInst::Create(
+          Intrinsic::getOrInsertDeclaration(&M, Intrinsic::returnaddress),
+          ConstantInt::get(Type::getInt32Ty(C), 0), "", InsertionPt);
+      RetAddr->setDebugLoc(DL);
+
+      FunctionCallee Fn = M.getOrInsertFunction(
+          Func, FunctionType::get(Type::getVoidTy(C), PointerType::getUnqual(C),
+                                  false));
+      CallInst *Call = CallInst::Create(Fn, RetAddr, "", InsertionPt);
+      Call->setDebugLoc(DL);
     } else {
       FunctionCallee Fn = M.getOrInsertFunction(Func, Type::getVoidTy(C));
       CallInst *Call = CallInst::Create(Fn, "", InsertionPt);
@@ -88,6 +103,12 @@ static bool runOnFunction(Function &F, bool PostInlining) {
   if (F.hasFnAttribute(Attribute::Naked))
     return false;
 
+  // available_externally functions may not have definitions external to the
+  // module (e.g. gnu::always_inline). Instrumenting them might lead to linker
+  // errors if they are optimized out. Skip them like GCC.
+  if (F.hasAvailableExternallyLinkage())
+    return false;
+
   StringRef EntryAttr = PostInlining ? "instrument-function-entry-inlined"
                                      : "instrument-function-entry";
 
diff --git a/llvm/lib/Transforms/Utils/FunctionImportUtils.cpp b/llvm/lib/Transforms/Utils/FunctionImportUtils.cpp
index 766c750..ae1af943 100644
--- a/llvm/lib/Transforms/Utils/FunctionImportUtils.cpp
+++ b/llvm/lib/Transforms/Utils/FunctionImportUtils.cpp
@@ -331,15 +331,12 @@ void FunctionImportGlobalProcessing::processGlobalsForThinLTO() {
       }
 }
 
-bool FunctionImportGlobalProcessing::run() {
-  processGlobalsForThinLTO();
-  return false;
-}
+void FunctionImportGlobalProcessing::run() { processGlobalsForThinLTO(); }
 
-bool llvm::renameModuleForThinLTO(Module &M, const ModuleSummaryIndex &Index,
+void llvm::renameModuleForThinLTO(Module &M, const ModuleSummaryIndex &Index,
                                   bool ClearDSOLocalOnDeclarations,
                                   SetVector<GlobalValue *> *GlobalsToImport) {
   FunctionImportGlobalProcessing ThinLTOProcessing(M, Index, GlobalsToImport,
                                                    ClearDSOLocalOnDeclarations);
-  return ThinLTOProcessing.run();
+  ThinLTOProcessing.run();
 }
diff --git a/llvm/lib/Transforms/Utils/Local.cpp b/llvm/lib/Transforms/Utils/Local.cpp
index a3af96d..1e4061c 100644
--- a/llvm/lib/Transforms/Utils/Local.cpp
+++ b/llvm/lib/Transforms/Utils/Local.cpp
@@ -3308,6 +3308,9 @@ bool llvm::removeUnreachableBlocks(Function &F, DomTreeUpdater *DTU,
   return Changed;
 }
 
+// FIXME: https://github.com/llvm/llvm-project/issues/121495
+// Once external callers of this function are removed, either inline into
+// combineMetadataForCSE, or internalize and remove KnownIDs parameter.
 void llvm::combineMetadata(Instruction *K, const Instruction *J,
                            ArrayRef<unsigned> KnownIDs, bool DoesKMove) {
   SmallVector<std::pair<unsigned, MDNode *>, 4> Metadata;
@@ -3320,6 +3323,10 @@ void llvm::combineMetadata(Instruction *K, const Instruction *J,
 
     switch (Kind) {
       default:
+        // FIXME: https://github.com/llvm/llvm-project/issues/121495
+        // Change to removing only explicitly listed other metadata, and assert
+        // on unknown metadata, to avoid inadvertently dropping newly added
+        // metadata types.
         K->setMetadata(Kind, nullptr); // Remove unknown metadata
         break;
       case LLVMContext::MD_dbg:
@@ -3379,6 +3386,12 @@ void llvm::combineMetadata(Instruction *K, const Instruction *J,
           K->setMetadata(Kind,
             MDNode::getMostGenericAlignmentOrDereferenceable(JMD, KMD));
         break;
+      case LLVMContext::MD_memprof:
+        K->setMetadata(Kind, MDNode::getMergedMemProfMetadata(KMD, JMD));
+        break;
+      case LLVMContext::MD_callsite:
+        K->setMetadata(Kind, MDNode::getMergedCallsiteMetadata(KMD, JMD));
+        break;
       case LLVMContext::MD_preserve_access_index:
         // Preserve !preserve.access.index in K.
         break;
@@ -3442,7 +3455,9 @@ void llvm::combineMetadataForCSE(Instruction *K, const Instruction *J,
                          LLVMContext::MD_nontemporal,
                          LLVMContext::MD_noundef,
                          LLVMContext::MD_mmra,
-                         LLVMContext::MD_noalias_addrspace};
+                         LLVMContext::MD_noalias_addrspace,
+                         LLVMContext::MD_memprof,
+                         LLVMContext::MD_callsite};
   combineMetadata(K, J, KnownIDs, KDominatesJ);
 }
 
diff --git a/llvm/lib/Transforms/Utils/LoopSimplify.cpp b/llvm/lib/Transforms/Utils/LoopSimplify.cpp
index d829864..b3f9f76 100644
--- a/llvm/lib/Transforms/Utils/LoopSimplify.cpp
+++ b/llvm/lib/Transforms/Utils/LoopSimplify.cpp
@@ -778,7 +778,7 @@ INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
 INITIALIZE_PASS_END(LoopSimplify, "loop-simplify", "Canonicalize natural loops",
-                    false, true)
+                    false, false)
 
 // Publicly exposed interface to pass...
 char &llvm::LoopSimplifyID = LoopSimplify::ID;
diff --git a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
index 17f4b39..e367b01 100644
--- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
+++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
@@ -96,8 +96,9 @@ using namespace PatternMatch;
 cl::opt<bool> llvm::RequireAndPreserveDomTree(
     "simplifycfg-require-and-preserve-domtree", cl::Hidden,
 
-    cl::desc("Temorary development switch used to gradually uplift SimplifyCFG "
-             "into preserving DomTree,"));
+    cl::desc(
+        "Temporary development switch used to gradually uplift SimplifyCFG "
+        "into preserving DomTree,"));
 
 // Chosen as 2 so as to be cheap, but still to have enough power to fold
 // a select, so the "clamp" idiom (of a min followed by a max) will be caught.
@@ -126,7 +127,7 @@ static cl::opt<bool> HoistLoadsStoresWithCondFaulting(
 
 static cl::opt<unsigned> HoistLoadsStoresWithCondFaultingThreshold(
     "hoist-loads-stores-with-cond-faulting-threshold", cl::Hidden, cl::init(6),
-    cl::desc("Control the maximal conditonal load/store that we are willing "
+    cl::desc("Control the maximal conditional load/store that we are willing "
              "to speculatively execute to eliminate conditional branch "
              "(default = 6)"));
 
@@ -2153,12 +2154,9 @@ bool SimplifyCFGOpt::hoistSuccIdenticalTerminatorToSwitchOrIf(
         SelectInst *&SI = InsertedSelects[std::make_pair(BB1V, BB2V)];
         if (!SI) {
           // Propagate fast-math-flags from phi node to its replacement select.
-          IRBuilder<>::FastMathFlagGuard FMFGuard(Builder);
-          if (isa<FPMathOperator>(PN))
-            Builder.setFastMathFlags(PN.getFastMathFlags());
-
-          SI = cast<SelectInst>(Builder.CreateSelect(
+          SI = cast<SelectInst>(Builder.CreateSelectFMF(
               BI->getCondition(), BB1V, BB2V,
+              isa<FPMathOperator>(PN) ? &PN : nullptr,
               BB1V->getName() + "." + BB2V->getName(), BI));
         }
 
@@ -3898,16 +3896,14 @@ static bool foldTwoEntryPHINode(PHINode *PN, const TargetTransformInfo &TTI,
 
   IRBuilder<NoFolder> Builder(DomBI);
   // Propagate fast-math-flags from phi nodes to replacement selects.
-  IRBuilder<>::FastMathFlagGuard FMFGuard(Builder);
   while (PHINode *PN = dyn_cast<PHINode>(BB->begin())) {
-    if (isa<FPMathOperator>(PN))
-      Builder.setFastMathFlags(PN->getFastMathFlags());
-
     // Change the PHI node into a select instruction.
     Value *TrueVal = PN->getIncomingValueForBlock(IfTrue);
     Value *FalseVal = PN->getIncomingValueForBlock(IfFalse);
 
-    Value *Sel = Builder.CreateSelect(IfCond, TrueVal, FalseVal, "", DomBI);
+    Value *Sel = Builder.CreateSelectFMF(IfCond, TrueVal, FalseVal,
+                                         isa<FPMathOperator>(PN) ? PN : nullptr,
+                                         "", DomBI);
     PN->replaceAllUsesWith(Sel);
     Sel->takeName(PN);
     PN->eraseFromParent();
@@ -6531,8 +6527,8 @@ SwitchLookupTable::SwitchLookupTable(
     uint64_t Idx = (CaseVal->getValue() - Offset->getValue()).getLimitedValue();
     TableContents[Idx] = CaseRes;
 
-    if (CaseRes != SingleValue)
-      SingleValue = nullptr;
+    if (SingleValue && !isa<PoisonValue>(CaseRes) && CaseRes != SingleValue)
+      SingleValue = isa<PoisonValue>(SingleValue) ? CaseRes : nullptr;
   }
 
   // Fill in any holes in the table with the default result.
@@ -6545,7 +6541,10 @@ SwitchLookupTable::SwitchLookupTable(
         TableContents[I] = DefaultValue;
     }
 
-    if (DefaultValue != SingleValue)
+    // If the default value is poison, all the holes are poison.
+    bool DefaultValueIsPoison = isa<PoisonValue>(DefaultValue);
+
+    if (DefaultValue != SingleValue && !DefaultValueIsPoison)
       SingleValue = nullptr;
   }
 
@@ -6569,6 +6568,16 @@ SwitchLookupTable::SwitchLookupTable(
     // Check if there is the same distance between two consecutive values.
     for (uint64_t I = 0; I < TableSize; ++I) {
       ConstantInt *ConstVal = dyn_cast<ConstantInt>(TableContents[I]);
+
+      if (!ConstVal && isa<PoisonValue>(TableContents[I])) {
+        // This is an poison, so it's (probably) a lookup table hole.
+        // To prevent any regressions from before we switched to using poison as
+        // the default value, holes will fall back to using the first value.
+        // This can be removed once we add proper handling for poisons in lookup
+        // tables.
+        ConstVal = dyn_cast<ConstantInt>(Values[0].second);
+      }
+
       if (!ConstVal) {
         // This is an undef. We could deal with it, but undefs in lookup tables
         // are very seldom. It's probably not worth the additional complexity.
@@ -7003,8 +7012,8 @@ static bool switchToLookupTable(SwitchInst *SI, IRBuilder<> &Builder,
 
   // If the table has holes but the default destination doesn't produce any
   // constant results, the lookup table entries corresponding to the holes will
-  // contain undefined values.
-  bool AllHolesAreUndefined = TableHasHoles && !HasDefaultResults;
+  // contain poison.
+  bool AllHolesArePoison = TableHasHoles && !HasDefaultResults;
 
   // If the default destination doesn't produce a constant result but is still
   // reachable, and the lookup table has holes, we need to use a mask to
@@ -7012,7 +7021,7 @@ static bool switchToLookupTable(SwitchInst *SI, IRBuilder<> &Builder,
   // to the default case.
   // The mask is unnecessary if the table has holes but the default destination
   // is unreachable, as in that case the holes must also be unreachable.
-  bool NeedMask = AllHolesAreUndefined && DefaultIsReachable;
+  bool NeedMask = AllHolesArePoison && DefaultIsReachable;
   if (NeedMask) {
     // As an extra penalty for the validity test we require more cases.
     if (SI->getNumCases() < 4) // FIXME: Find best threshold value (benchmark).
@@ -7157,9 +7166,11 @@ static bool switchToLookupTable(SwitchInst *SI, IRBuilder<> &Builder,
   for (PHINode *PHI : PHIs) {
     const ResultListTy &ResultList = ResultLists[PHI];
 
+    Type *ResultType = ResultList.begin()->second->getType();
+
     // Use any value to fill the lookup table holes.
     Constant *DV =
-        AllHolesAreUndefined ? ResultLists[PHI][0].second : DefaultResults[PHI];
+        AllHolesArePoison ? PoisonValue::get(ResultType) : DefaultResults[PHI];
     StringRef FuncName = Fn->getName();
     SwitchLookupTable Table(Mod, TableSize, TableIndexOffset, ResultList, DV,
                             DL, FuncName);
diff --git a/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp b/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp
index 737818b..2b2b467 100644
--- a/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp
+++ b/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp
@@ -2005,28 +2005,21 @@ Value *LibCallSimplifier::optimizeCAbs(CallInst *CI, IRBuilderBase &B) {
         AbsOp = Real;
     }
 
-    if (AbsOp) {
-      IRBuilderBase::FastMathFlagGuard Guard(B);
-      B.setFastMathFlags(CI->getFastMathFlags());
-
+    if (AbsOp)
       return copyFlags(
-          *CI, B.CreateUnaryIntrinsic(Intrinsic::fabs, AbsOp, nullptr, "cabs"));
-    }
+          *CI, B.CreateUnaryIntrinsic(Intrinsic::fabs, AbsOp, CI, "cabs"));
 
     if (!CI->isFast())
       return nullptr;
   }
 
   // Propagate fast-math flags from the existing call to new instructions.
-  IRBuilderBase::FastMathFlagGuard Guard(B);
-  B.setFastMathFlags(CI->getFastMathFlags());
-
-  Value *RealReal = B.CreateFMul(Real, Real);
-  Value *ImagImag = B.CreateFMul(Imag, Imag);
-
-  return copyFlags(*CI, B.CreateUnaryIntrinsic(Intrinsic::sqrt,
-                                               B.CreateFAdd(RealReal, ImagImag),
-                                               nullptr, "cabs"));
+  Value *RealReal = B.CreateFMulFMF(Real, Real, CI);
+  Value *ImagImag = B.CreateFMulFMF(Imag, Imag, CI);
+  return copyFlags(
+      *CI, B.CreateUnaryIntrinsic(Intrinsic::sqrt,
+                                  B.CreateFAddFMF(RealReal, ImagImag, CI), CI,
+                                  "cabs"));
 }
 
 // Return a properly extended integer (DstWidth bits wide) if the operation is
@@ -2480,15 +2473,13 @@ Value *LibCallSimplifier::optimizeFMinFMax(CallInst *CI, IRBuilderBase &B) {
   // "Ideally, fmax would be sensitive to the sign of zero, for example
   // fmax(-0.0, +0.0) would return +0; however, implementation in software
   // might be impractical."
-  IRBuilderBase::FastMathFlagGuard Guard(B);
   FastMathFlags FMF = CI->getFastMathFlags();
   FMF.setNoSignedZeros();
-  B.setFastMathFlags(FMF);
 
   Intrinsic::ID IID = Callee->getName().starts_with("fmin") ? Intrinsic::minnum
                                                             : Intrinsic::maxnum;
   return copyFlags(*CI, B.CreateBinaryIntrinsic(IID, CI->getArgOperand(0),
-                                                CI->getArgOperand(1)));
+                                                CI->getArgOperand(1), FMF));
 }
 
 Value *LibCallSimplifier::optimizeLog(CallInst *Log, IRBuilderBase &B) {
@@ -2783,20 +2774,18 @@ Value *LibCallSimplifier::optimizeSqrt(CallInst *CI, IRBuilderBase &B) {
 
   // Fast math flags for any created instructions should match the sqrt
   // and multiply.
-  IRBuilderBase::FastMathFlagGuard Guard(B);
-  B.setFastMathFlags(I->getFastMathFlags());
 
   // If we found a repeated factor, hoist it out of the square root and
   // replace it with the fabs of that factor.
   Value *FabsCall =
-      B.CreateUnaryIntrinsic(Intrinsic::fabs, RepeatOp, nullptr, "fabs");
+      B.CreateUnaryIntrinsic(Intrinsic::fabs, RepeatOp, I, "fabs");
   if (OtherOp) {
     // If we found a non-repeated factor, we still need to get its square
     // root. We then multiply that by the value that was simplified out
     // of the square root calculation.
     Value *SqrtCall =
-        B.CreateUnaryIntrinsic(Intrinsic::sqrt, OtherOp, nullptr, "sqrt");
-    return copyFlags(*CI, B.CreateFMul(FabsCall, SqrtCall));
+        B.CreateUnaryIntrinsic(Intrinsic::sqrt, OtherOp, I, "sqrt");
+    return copyFlags(*CI, B.CreateFMulFMF(FabsCall, SqrtCall, I));
   }
   return copyFlags(*CI, FabsCall);
 }
@@ -2951,26 +2940,23 @@ static Value *optimizeSymmetricCall(CallInst *CI, bool IsEven,
   Value *Src = CI->getArgOperand(0);
 
   if (match(Src, m_OneUse(m_FNeg(m_Value(X))))) {
-    IRBuilderBase::FastMathFlagGuard Guard(B);
-    B.setFastMathFlags(CI->getFastMathFlags());
-
-    auto *CallInst = copyFlags(*CI, B.CreateCall(CI->getCalledFunction(), {X}));
+    auto *Call = B.CreateCall(CI->getCalledFunction(), {X});
+    Call->copyFastMathFlags(CI);
+    auto *CallInst = copyFlags(*CI, Call);
     if (IsEven) {
       // Even function: f(-x) = f(x)
       return CallInst;
     }
     // Odd function: f(-x) = -f(x)
-    return B.CreateFNeg(CallInst);
+    return B.CreateFNegFMF(CallInst, CI);
   }
 
   // Even function: f(abs(x)) = f(x), f(copysign(x, y)) = f(x)
   if (IsEven && (match(Src, m_FAbs(m_Value(X))) ||
                  match(Src, m_CopySign(m_Value(X), m_Value())))) {
-    IRBuilderBase::FastMathFlagGuard Guard(B);
-    B.setFastMathFlags(CI->getFastMathFlags());
-
-    auto *CallInst = copyFlags(*CI, B.CreateCall(CI->getCalledFunction(), {X}));
-    return CallInst;
+    auto *Call = B.CreateCall(CI->getCalledFunction(), {X});
+    Call->copyFastMathFlags(CI);
+    return copyFlags(*CI, Call);
   }
 
   return nullptr;
diff --git a/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp b/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
index 02ec1d5..9e81573 100644
--- a/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
@@ -324,6 +324,11 @@ private:
       Instruction *ChainElem, Instruction *ChainBegin,
       const DenseMap<Instruction *, APInt /*OffsetFromLeader*/> &ChainOffsets);
 
+  /// Merges the equivalence classes if they have underlying objects that differ
+  /// by one level of indirection (i.e., one is a getelementptr and the other is
+  /// the base pointer in that getelementptr).
+  void mergeEquivalenceClasses(EquivalenceClassMap &EQClasses) const;
+
   /// Collects loads and stores grouped by "equivalence class", where:
   ///   - all elements in an eq class are a load or all are a store,
   ///   - they all load/store the same element size (it's OK to have e.g. i8 and
@@ -1305,6 +1310,119 @@ std::optional<APInt> Vectorizer::getConstantOffsetSelects(
   return std::nullopt;
 }
 
+void Vectorizer::mergeEquivalenceClasses(EquivalenceClassMap &EQClasses) const {
+  if (EQClasses.size() < 2) // There is nothing to merge.
+    return;
+
+  // The reduced key has all elements of the ECClassKey except the underlying
+  // object. Check that EqClassKey has 4 elements and define the reduced key.
+  static_assert(std::tuple_size_v<EqClassKey> == 4,
+                "EqClassKey has changed - EqClassReducedKey needs changes too");
+  using EqClassReducedKey =
+      std::tuple<std::tuple_element_t<1, EqClassKey> /* AddrSpace */,
+                 std::tuple_element_t<2, EqClassKey> /* Element size */,
+                 std::tuple_element_t<3, EqClassKey> /* IsLoad; */>;
+  using ECReducedKeyToUnderlyingObjectMap =
+      MapVector<EqClassReducedKey,
+                SmallPtrSet<std::tuple_element_t<0, EqClassKey>, 4>>;
+
+  // Form a map from the reduced key (without the underlying object) to the
+  // underlying objects: 1 reduced key to many underlying objects, to form
+  // groups of potentially merge-able equivalence classes.
+  ECReducedKeyToUnderlyingObjectMap RedKeyToUOMap;
+  bool FoundPotentiallyOptimizableEC = false;
+  for (const auto &EC : EQClasses) {
+    const auto &Key = EC.first;
+    EqClassReducedKey RedKey{std::get<1>(Key), std::get<2>(Key),
+                             std::get<3>(Key)};
+    RedKeyToUOMap[RedKey].insert(std::get<0>(Key));
+    if (RedKeyToUOMap[RedKey].size() > 1)
+      FoundPotentiallyOptimizableEC = true;
+  }
+  if (!FoundPotentiallyOptimizableEC)
+    return;
+
+  LLVM_DEBUG({
+    dbgs() << "LSV: mergeEquivalenceClasses: before merging:\n";
+    for (const auto &EC : EQClasses) {
+      dbgs() << "  Key: {" << EC.first << "}\n";
+      for (const auto &Inst : EC.second)
+        dbgs() << "    Inst: " << *Inst << '\n';
+    }
+  });
+  LLVM_DEBUG({
+    dbgs() << "LSV: mergeEquivalenceClasses: RedKeyToUOMap:\n";
+    for (const auto &RedKeyToUO : RedKeyToUOMap) {
+      dbgs() << "  Reduced key: {" << std::get<0>(RedKeyToUO.first) << ", "
+             << std::get<1>(RedKeyToUO.first) << ", "
+             << static_cast<int>(std::get<2>(RedKeyToUO.first)) << "} --> "
+             << RedKeyToUO.second.size() << " underlying objects:\n";
+      for (auto UObject : RedKeyToUO.second)
+        dbgs() << "    " << *UObject << '\n';
+    }
+  });
+
+  using UObjectToUObjectMap = DenseMap<const Value *, const Value *>;
+
+  // Compute the ultimate targets for a set of underlying objects.
+  auto GetUltimateTargets =
+      [](SmallPtrSetImpl<const Value *> &UObjects) -> UObjectToUObjectMap {
+    UObjectToUObjectMap IndirectionMap;
+    for (const auto *UObject : UObjects) {
+      const unsigned MaxLookupDepth = 1; // look for 1-level indirections only
+      const auto *UltimateTarget = getUnderlyingObject(UObject, MaxLookupDepth);
+      if (UltimateTarget != UObject)
+        IndirectionMap[UObject] = UltimateTarget;
+    }
+    UObjectToUObjectMap UltimateTargetsMap;
+    for (const auto *UObject : UObjects) {
+      auto Target = UObject;
+      auto It = IndirectionMap.find(Target);
+      for (; It != IndirectionMap.end(); It = IndirectionMap.find(Target))
+        Target = It->second;
+      UltimateTargetsMap[UObject] = Target;
+    }
+    return UltimateTargetsMap;
+  };
+
+  // For each item in RedKeyToUOMap, if it has more than one underlying object,
+  // try to merge the equivalence classes.
+  for (auto &[RedKey, UObjects] : RedKeyToUOMap) {
+    if (UObjects.size() < 2)
+      continue;
+    auto UTMap = GetUltimateTargets(UObjects);
+    for (const auto &[UObject, UltimateTarget] : UTMap) {
+      if (UObject == UltimateTarget)
+        continue;
+
+      EqClassKey KeyFrom{UObject, std::get<0>(RedKey), std::get<1>(RedKey),
+                         std::get<2>(RedKey)};
+      EqClassKey KeyTo{UltimateTarget, std::get<0>(RedKey), std::get<1>(RedKey),
+                       std::get<2>(RedKey)};
+      // The entry for KeyFrom is guarantted to exist, unlike KeyTo. Thus,
+      // request the reference to the instructions vector for KeyTo first.
+      const auto &VecTo = EQClasses[KeyTo];
+      const auto &VecFrom = EQClasses[KeyFrom];
+      SmallVector<Instruction *, 8> MergedVec;
+      std::merge(VecFrom.begin(), VecFrom.end(), VecTo.begin(), VecTo.end(),
+                 std::back_inserter(MergedVec),
+                 [](Instruction *A, Instruction *B) {
+                   return A && B && A->comesBefore(B);
+                 });
+      EQClasses[KeyTo] = std::move(MergedVec);
+      EQClasses.erase(KeyFrom);
+    }
+  }
+  LLVM_DEBUG({
+    dbgs() << "LSV: mergeEquivalenceClasses: after merging:\n";
+    for (const auto &EC : EQClasses) {
+      dbgs() << "  Key: {" << EC.first << "}\n";
+      for (const auto &Inst : EC.second)
+        dbgs() << "    Inst: " << *Inst << '\n';
+    }
+  });
+}
+
 EquivalenceClassMap
 Vectorizer::collectEquivalenceClasses(BasicBlock::iterator Begin,
                                       BasicBlock::iterator End) {
@@ -1377,6 +1495,7 @@ Vectorizer::collectEquivalenceClasses(BasicBlock::iterator Begin,
         .emplace_back(&I);
   }
 
+  mergeEquivalenceClasses(Ret);
   return Ret;
 }
 
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
index 650a485..bc44ec1 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
@@ -231,17 +231,21 @@ public:
         new VPInstruction(Ptr, Offset, GEPNoWrapFlags::inBounds(), DL, Name));
   }
 
+  /// Convert the input value \p Current to the corresponding value of an
+  /// induction with \p Start and \p Step values, using \p Start + \p Current *
+  /// \p Step.
   VPDerivedIVRecipe *createDerivedIV(InductionDescriptor::InductionKind Kind,
                                      FPMathOperator *FPBinOp, VPValue *Start,
-                                     VPCanonicalIVPHIRecipe *CanonicalIV,
-                                     VPValue *Step, const Twine &Name = "") {
+                                     VPValue *Current, VPValue *Step,
+                                     const Twine &Name = "") {
     return tryInsertInstruction(
-        new VPDerivedIVRecipe(Kind, FPBinOp, Start, CanonicalIV, Step, Name));
+        new VPDerivedIVRecipe(Kind, FPBinOp, Start, Current, Step, Name));
   }
 
   VPScalarCastRecipe *createScalarCast(Instruction::CastOps Opcode, VPValue *Op,
-                                       Type *ResultTy) {
-    return tryInsertInstruction(new VPScalarCastRecipe(Opcode, Op, ResultTy));
+                                       Type *ResultTy, DebugLoc DL) {
+    return tryInsertInstruction(
+        new VPScalarCastRecipe(Opcode, Op, ResultTy, DL));
   }
 
   VPWidenCastRecipe *createWidenCast(Instruction::CastOps Opcode, VPValue *Op,
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 10b998f..47866da 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -479,7 +479,8 @@ public:
         AC(AC), ORE(ORE), VF(VecWidth),
         MinProfitableTripCount(MinProfitableTripCount), UF(UnrollFactor),
         Builder(PSE.getSE()->getContext()), Legal(LVL), Cost(CM), BFI(BFI),
-        PSI(PSI), RTChecks(RTChecks), Plan(Plan) {
+        PSI(PSI), RTChecks(RTChecks), Plan(Plan),
+        VectorPHVPB(Plan.getEntry()->getSingleSuccessor()) {
     // Query this against the original loop and save it here because the profile
     // of the original loop header may change as the transformation happens.
     OptForSizeBasedOnProfile = llvm::shouldOptimizeForSize(
@@ -517,22 +518,6 @@ public:
   /// Fix the non-induction PHIs in \p Plan.
   void fixNonInductionPHIs(VPTransformState &State);
 
-  /// Create a ResumePHI VPInstruction for the induction \p InductionPhiIRI to
-  /// resume iteration count in the scalar epilogue from where the vectorized
-  /// loop left off, and add it to the scalar preheader of VPlan. Also creates
-  /// the induction resume value, and the value for the bypass block, if needed.
-  /// \p Step is the SCEV-expanded induction step to use. In cases where the
-  /// loop skeleton is more complicated (i.e., epilogue vectorization) and the
-  /// resume values can come from an additional bypass block,
-  /// \p MainVectorTripCount provides the trip count of the main vector loop,
-  /// used to compute the resume value reaching the scalar loop preheader
-  /// directly from this additional bypass block.
-  void createInductionResumeVPValue(VPIRInstruction *InductionPhiIRI,
-                                    const InductionDescriptor &ID, Value *Step,
-                                    ArrayRef<BasicBlock *> BypassBlocks,
-                                    VPBuilder &ScalarPHBuilder,
-                                    Value *MainVectorTripCount = nullptr);
-
   /// Returns the original loop trip count.
   Value *getTripCount() const { return TripCount; }
 
@@ -588,23 +573,21 @@ protected:
   /// vector loop preheader, middle block and scalar preheader.
   void createVectorLoopSkeleton(StringRef Prefix);
 
-  /// Create new phi nodes for the induction variables to resume iteration count
-  /// in the scalar epilogue, from where the vectorized loop left off.
-  /// In cases where the loop skeleton is more complicated (i.e. epilogue
-  /// vectorization), \p MainVectorTripCount provides the trip count of the main
-  /// loop, used to compute these resume values. If \p IVSubset is provided, it
-  /// contains the phi nodes for which resume values are needed, because they
-  /// will generate wide induction phis in the epilogue loop.
-  void
-  createInductionResumeVPValues(const SCEV2ValueTy &ExpandedSCEVs,
-                                Value *MainVectorTripCount = nullptr,
-                                SmallPtrSetImpl<PHINode *> *IVSubset = nullptr);
+  /// Create and record the values for induction variables to resume coming from
+  /// the additional bypass block.
+  void createInductionAdditionalBypassValues(const SCEV2ValueTy &ExpandedSCEVs,
+                                             Value *MainVectorTripCount);
 
   /// Allow subclasses to override and print debug traces before/after vplan
   /// execution, when trace information is requested.
   virtual void printDebugTracesAtStart() {}
   virtual void printDebugTracesAtEnd() {}
 
+  /// Introduces a new VPIRBasicBlock for \p CheckIRBB to Plan between the
+  /// vector preheader and its predecessor, also connecting the new block to the
+  /// scalar preheader.
+  void introduceCheckBlockInVPlan(BasicBlock *CheckIRBB);
+
   /// The original loop.
   Loop *OrigLoop;
 
@@ -699,6 +682,10 @@ protected:
   BasicBlock *AdditionalBypassBlock = nullptr;
 
   VPlan &Plan;
+
+  /// The vector preheader block of \p Plan, used as target for check blocks
+  /// introduced during skeleton creation.
+  VPBlockBase *VectorPHVPB;
 };
 
 /// Encapsulate information regarding vectorization of a loop and its epilogue.
@@ -1744,7 +1731,8 @@ private:
   bool needsExtract(Value *V, ElementCount VF) const {
     Instruction *I = dyn_cast<Instruction>(V);
     if (VF.isScalar() || !I || !TheLoop->contains(I) ||
-        TheLoop->isLoopInvariant(I))
+        TheLoop->isLoopInvariant(I) ||
+        getWideningDecision(I, VF) == CM_Scalarize)
       return false;
 
     // Assume we can vectorize V (and hence we need extraction) if the
@@ -2406,12 +2394,12 @@ void InnerLoopVectorizer::scalarizeInstruction(const Instruction *Instr,
   // End if-block.
   VPRegionBlock *Parent = RepRecipe->getParent()->getParent();
   bool IfPredicateInstr = Parent ? Parent->isReplicator() : false;
-  assert((Parent || all_of(RepRecipe->operands(),
-                           [](VPValue *Op) {
-                             return Op->isDefinedOutsideLoopRegions();
-                           })) &&
-         "Expected a recipe is either within a region or all of its operands "
-         "are defined outside the vectorized region.");
+  assert(
+      (Parent || !RepRecipe->getParent()->getPlan()->getVectorLoopRegion() ||
+       all_of(RepRecipe->operands(),
+              [](VPValue *Op) { return Op->isDefinedOutsideLoopRegions(); })) &&
+      "Expected a recipe is either within a region or all of its operands "
+      "are defined outside the vectorized region.");
   if (IfPredicateInstr)
     PredicatedInstructions.push_back(Cloned);
 }
@@ -2466,19 +2454,15 @@ InnerLoopVectorizer::getOrCreateVectorTripCount(BasicBlock *InsertBlock) {
   return VectorTripCount;
 }
 
-/// Introduces a new VPIRBasicBlock for \p CheckIRBB to \p Plan between the
-/// vector preheader and its predecessor, also connecting the new block to the
-/// scalar preheader.
-static void introduceCheckBlockInVPlan(VPlan &Plan, BasicBlock *CheckIRBB) {
+void InnerLoopVectorizer::introduceCheckBlockInVPlan(BasicBlock *CheckIRBB) {
   VPBlockBase *ScalarPH = Plan.getScalarPreheader();
-  VPBlockBase *VectorPH = Plan.getVectorPreheader();
-  VPBlockBase *PreVectorPH = VectorPH->getSinglePredecessor();
+  VPBlockBase *PreVectorPH = VectorPHVPB->getSinglePredecessor();
   if (PreVectorPH->getNumSuccessors() != 1) {
     assert(PreVectorPH->getNumSuccessors() == 2 && "Expected 2 successors");
     assert(PreVectorPH->getSuccessors()[0] == ScalarPH &&
            "Unexpected successor");
-    VPIRBasicBlock *CheckVPIRBB = VPIRBasicBlock::fromBasicBlock(CheckIRBB);
-    VPBlockUtils::insertOnEdge(PreVectorPH, VectorPH, CheckVPIRBB);
+    VPIRBasicBlock *CheckVPIRBB = Plan.createVPIRBasicBlock(CheckIRBB);
+    VPBlockUtils::insertOnEdge(PreVectorPH, VectorPHVPB, CheckVPIRBB);
     PreVectorPH = CheckVPIRBB;
   }
   VPBlockUtils::connectBlocks(PreVectorPH, ScalarPH);
@@ -2567,7 +2551,7 @@ void InnerLoopVectorizer::emitIterationCountCheck(BasicBlock *Bypass) {
   LoopBypassBlocks.push_back(TCCheckBlock);
 
   // TODO: Wrap LoopVectorPreHeader in VPIRBasicBlock here.
-  introduceCheckBlockInVPlan(Plan, TCCheckBlock);
+  introduceCheckBlockInVPlan(TCCheckBlock);
 }
 
 BasicBlock *InnerLoopVectorizer::emitSCEVChecks(BasicBlock *Bypass) {
@@ -2585,7 +2569,7 @@ BasicBlock *InnerLoopVectorizer::emitSCEVChecks(BasicBlock *Bypass) {
   LoopBypassBlocks.push_back(SCEVCheckBlock);
   AddedSafetyChecks = true;
 
-  introduceCheckBlockInVPlan(Plan, SCEVCheckBlock);
+  introduceCheckBlockInVPlan(SCEVCheckBlock);
   return SCEVCheckBlock;
 }
 
@@ -2622,10 +2606,25 @@ BasicBlock *InnerLoopVectorizer::emitMemRuntimeChecks(BasicBlock *Bypass) {
 
   AddedSafetyChecks = true;
 
-  introduceCheckBlockInVPlan(Plan, MemCheckBlock);
+  introduceCheckBlockInVPlan(MemCheckBlock);
   return MemCheckBlock;
 }
 
+/// Replace \p VPBB with a VPIRBasicBlock wrapping \p IRBB. All recipes from \p
+/// VPBB are moved to the end of the newly created VPIRBasicBlock. VPBB must
+/// have a single predecessor, which is rewired to the new VPIRBasicBlock. All
+/// successors of VPBB, if any, are rewired to the new VPIRBasicBlock.
+static void replaceVPBBWithIRVPBB(VPBasicBlock *VPBB, BasicBlock *IRBB) {
+  VPIRBasicBlock *IRVPBB = VPBB->getPlan()->createVPIRBasicBlock(IRBB);
+  for (auto &R : make_early_inc_range(*VPBB)) {
+    assert(!R.isPhi() && "Tried to move phi recipe to end of block");
+    R.moveBefore(*IRVPBB, IRVPBB->end());
+  }
+
+  VPBlockUtils::reassociateBlocks(VPBB, IRVPBB);
+  // VPBB is now dead and will be cleaned up when the plan gets destroyed.
+}
+
 void InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) {
   LoopVectorPreHeader = OrigLoop->getLoopPreheader();
   assert(LoopVectorPreHeader && "Invalid loop structure");
@@ -2636,64 +2635,11 @@ void InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) {
   LoopMiddleBlock =
       SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
                  LI, nullptr, Twine(Prefix) + "middle.block");
+  replaceVPBBWithIRVPBB(Plan.getMiddleBlock(), LoopMiddleBlock);
   LoopScalarPreHeader =
       SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI,
                  nullptr, Twine(Prefix) + "scalar.ph");
-}
-
-void InnerLoopVectorizer::createInductionResumeVPValue(
-    VPIRInstruction *InductionPhiRI, const InductionDescriptor &II, Value *Step,
-    ArrayRef<BasicBlock *> BypassBlocks, VPBuilder &ScalarPHBuilder,
-    Value *MainVectorTripCount) {
-  // TODO: Move to LVP or general VPlan construction, once no IR values are
-  // generated.
-  auto *OrigPhi = cast<PHINode>(&InductionPhiRI->getInstruction());
-  Value *VectorTripCount = getOrCreateVectorTripCount(LoopVectorPreHeader);
-  assert(VectorTripCount && "Expected valid arguments");
-
-  Instruction *OldInduction = Legal->getPrimaryInduction();
-  // For the primary induction the end values are known.
-  Value *EndValue = VectorTripCount;
-  Value *EndValueFromAdditionalBypass = MainVectorTripCount;
-  // Otherwise compute them accordingly.
-  if (OrigPhi != OldInduction) {
-    IRBuilder<> B(LoopVectorPreHeader->getTerminator());
-
-    // Fast-math-flags propagate from the original induction instruction.
-    if (isa_and_nonnull<FPMathOperator>(II.getInductionBinOp()))
-      B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags());
-
-    EndValue = emitTransformedIndex(B, VectorTripCount, II.getStartValue(),
-                                    Step, II.getKind(), II.getInductionBinOp());
-    EndValue->setName("ind.end");
-
-    // Compute the end value for the additional bypass (if applicable).
-    if (MainVectorTripCount) {
-      B.SetInsertPoint(getAdditionalBypassBlock(),
-                       getAdditionalBypassBlock()->getFirstInsertionPt());
-      EndValueFromAdditionalBypass =
-          emitTransformedIndex(B, MainVectorTripCount, II.getStartValue(), Step,
-                               II.getKind(), II.getInductionBinOp());
-      EndValueFromAdditionalBypass->setName("ind.end");
-    }
-  }
-
-  auto *ResumePhiRecipe = ScalarPHBuilder.createNaryOp(
-      VPInstruction::ResumePhi,
-      {Plan.getOrAddLiveIn(EndValue), Plan.getOrAddLiveIn(II.getStartValue())},
-      OrigPhi->getDebugLoc(), "bc.resume.val");
-  assert(InductionPhiRI->getNumOperands() == 0 &&
-         "InductionPhiRI should not have any operands");
-  InductionPhiRI->addOperand(ResumePhiRecipe);
-
-  if (EndValueFromAdditionalBypass) {
-    // Store the bypass value here, as it needs to be added as operand to its
-    // scalar preheader phi node after the epilogue skeleton has been created.
-    // TODO: Directly add as extra operand to the VPResumePHI recipe.
-    assert(!Induction2AdditionalBypassValue.contains(OrigPhi) &&
-           "entry for OrigPhi already exits");
-    Induction2AdditionalBypassValue[OrigPhi] = EndValueFromAdditionalBypass;
-  }
+  replaceVPBBWithIRVPBB(Plan.getScalarPreheader(), LoopScalarPreHeader);
 }
 
 /// Return the expanded step for \p ID using \p ExpandedSCEVs to look up SCEV
@@ -2733,46 +2679,40 @@ static void addFullyUnrolledInstructionsToIgnore(
   }
 }
 
-void InnerLoopVectorizer::createInductionResumeVPValues(
-    const SCEV2ValueTy &ExpandedSCEVs, Value *MainVectorTripCount,
-    SmallPtrSetImpl<PHINode *> *IVSubset) {
-  // We are going to resume the execution of the scalar loop.
-  // Go over all of the induction variable PHIs of the scalar loop header and
-  // fix their starting values, which depend on the counter of the last
-  // iteration of the vectorized loop. If we come from one of the
-  // LoopBypassBlocks then we need to start from the original start value.
-  // Otherwise we provide the trip count from the main vector loop.
-  VPBasicBlock *ScalarPHVPBB = Plan.getScalarPreheader();
-  VPBuilder ScalarPHBuilder(ScalarPHVPBB, ScalarPHVPBB->begin());
-  bool HasCanonical = false;
-  for (VPRecipeBase &R : *Plan.getScalarHeader()) {
-    auto *PhiR = cast<VPIRInstruction>(&R);
-    auto *Phi = dyn_cast<PHINode>(&PhiR->getInstruction());
-    if (!Phi)
-      break;
-    if (!Legal->getInductionVars().contains(Phi) ||
-        (IVSubset && !IVSubset->contains(Phi)))
-      continue;
-    const InductionDescriptor &II = Legal->getInductionVars().find(Phi)->second;
-    createInductionResumeVPValue(PhiR, II, getExpandedStep(II, ExpandedSCEVs),
-                                 LoopBypassBlocks, ScalarPHBuilder,
-                                 MainVectorTripCount);
-    auto *ConstStart = dyn_cast<ConstantInt>(II.getStartValue());
-    auto *ConstStep = II.getConstIntStepValue();
-    if (Phi->getType() == VectorTripCount->getType() && ConstStart &&
-        ConstStart->isZero() && ConstStep && ConstStep->isOne())
-      HasCanonical = true;
-  }
-
-  if (!IVSubset || HasCanonical)
-    return;
-  // When vectorizing the epilogue, create a resume phi for the canonical IV if
-  // no suitable resume phi was already created.
-  ScalarPHBuilder.createNaryOp(
-      VPInstruction::ResumePhi,
-      {&Plan.getVectorTripCount(),
-       Plan.getOrAddLiveIn(ConstantInt::get(VectorTripCount->getType(), 0))},
-      {}, "vec.epilog.resume.val");
+void InnerLoopVectorizer::createInductionAdditionalBypassValues(
+    const SCEV2ValueTy &ExpandedSCEVs, Value *MainVectorTripCount) {
+  assert(MainVectorTripCount && "Must have bypass information");
+
+  Instruction *OldInduction = Legal->getPrimaryInduction();
+  IRBuilder<> BypassBuilder(getAdditionalBypassBlock(),
+                            getAdditionalBypassBlock()->getFirstInsertionPt());
+  for (const auto &InductionEntry : Legal->getInductionVars()) {
+    PHINode *OrigPhi = InductionEntry.first;
+    const InductionDescriptor &II = InductionEntry.second;
+    Value *Step = getExpandedStep(II, ExpandedSCEVs);
+    // For the primary induction the additional bypass end value is known.
+    // Otherwise it is computed.
+    Value *EndValueFromAdditionalBypass = MainVectorTripCount;
+    if (OrigPhi != OldInduction) {
+      auto *BinOp = II.getInductionBinOp();
+      // Fast-math-flags propagate from the original induction instruction.
+      if (isa_and_nonnull<FPMathOperator>(BinOp))
+        BypassBuilder.setFastMathFlags(BinOp->getFastMathFlags());
+
+      // Compute the end value for the additional bypass.
+      EndValueFromAdditionalBypass =
+          emitTransformedIndex(BypassBuilder, MainVectorTripCount,
+                               II.getStartValue(), Step, II.getKind(), BinOp);
+      EndValueFromAdditionalBypass->setName("ind.end");
+    }
+
+    // Store the bypass value here, as it needs to be added as operand to its
+    // scalar preheader phi node after the epilogue skeleton has been created.
+    // TODO: Directly add as extra operand to the VPResumePHI recipe.
+    assert(!Induction2AdditionalBypassValue.contains(OrigPhi) &&
+           "entry for OrigPhi already exits");
+    Induction2AdditionalBypassValue[OrigPhi] = EndValueFromAdditionalBypass;
+  }
 }
 
 BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton(
@@ -2832,9 +2772,6 @@ BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton(
   // faster.
   emitMemRuntimeChecks(LoopScalarPreHeader);
 
-  // Emit phis for the new starting index of the scalar loop.
-  createInductionResumeVPValues(ExpandedSCEVs);
-
   return LoopVectorPreHeader;
 }
 
@@ -3048,22 +2985,6 @@ void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State) {
   PSE.getSE()->forgetLoop(OrigLoop);
   PSE.getSE()->forgetBlockAndLoopDispositions();
 
-  // When dealing with uncountable early exits we create middle.split blocks
-  // between the vector loop region and the exit block. These blocks need
-  // adding to any outer loop.
-  VPRegionBlock *VectorRegion = State.Plan->getVectorLoopRegion();
-  Loop *OuterLoop = OrigLoop->getParentLoop();
-  if (Legal->hasUncountableEarlyExit() && OuterLoop) {
-    VPBasicBlock *MiddleVPBB = State.Plan->getMiddleBlock();
-    VPBlockBase *PredVPBB = MiddleVPBB->getSinglePredecessor();
-    while (PredVPBB && PredVPBB != VectorRegion) {
-      BasicBlock *MiddleSplitBB =
-          State.CFG.VPBB2IRBB[cast<VPBasicBlock>(PredVPBB)];
-      OuterLoop->addBasicBlockToLoop(MiddleSplitBB, *LI);
-      PredVPBB = PredVPBB->getSinglePredecessor();
-    }
-  }
-
   // After vectorization, the exit blocks of the original loop will have
   // additional predecessors. Invalidate SCEVs for the exit phis in case SE
   // looked through single-entry phis.
@@ -3091,9 +3012,15 @@ void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State) {
                    getOrCreateVectorTripCount(nullptr), LoopMiddleBlock, State);
   }
 
+  // Don't apply optimizations below when no vector region remains, as they all
+  // require a vector loop at the moment.
+  if (!State.Plan->getVectorLoopRegion())
+    return;
+
   for (Instruction *PI : PredicatedInstructions)
     sinkScalarOperands(&*PI);
 
+  VPRegionBlock *VectorRegion = State.Plan->getVectorLoopRegion();
   VPBasicBlock *HeaderVPBB = VectorRegion->getEntryBasicBlock();
   BasicBlock *HeaderBB = State.CFG.VPBB2IRBB[HeaderVPBB];
 
@@ -4768,7 +4695,6 @@ VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() {
                  !isMoreProfitable(ChosenFactor, ScalarCost)) dbgs()
              << "LV: Vectorization seems to be not beneficial, "
              << "but was forced by a user.\n");
-  LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << ChosenFactor.Width << ".\n");
   return ChosenFactor;
 }
 #endif
@@ -7697,6 +7623,7 @@ VectorizationFactor LoopVectorizationPlanner::computeBestVF() {
          "when vectorizing, the scalar cost must be computed.");
 #endif
 
+  LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << BestFactor.Width << ".\n");
   return BestFactor;
 }
 
@@ -7802,7 +7729,8 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
 
   // Perform the actual loop transformation.
   VPTransformState State(&TTI, BestVF, BestUF, LI, DT, ILV.Builder, &ILV,
-                         &BestVPlan, Legal->getWidestInductionType());
+                         &BestVPlan, OrigLoop->getParentLoop(),
+                         Legal->getWidestInductionType());
 
 #ifdef EXPENSIVE_CHECKS
   assert(DT->verify(DominatorTree::VerificationLevel::Fast));
@@ -7810,11 +7738,9 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
 
   // 0. Generate SCEV-dependent code in the entry, including TripCount, before
   // making any changes to the CFG.
-  if (!BestVPlan.getEntry()->empty()) {
-    State.CFG.PrevBB = OrigLoop->getLoopPreheader();
-    State.Builder.SetInsertPoint(OrigLoop->getLoopPreheader()->getTerminator());
+  if (!BestVPlan.getEntry()->empty())
     BestVPlan.getEntry()->execute(&State);
-  }
+
   if (!ILV.getTripCount())
     ILV.setTripCount(State.get(BestVPlan.getTripCount(), VPLane(0)));
   else
@@ -7823,6 +7749,8 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
 
   // 1. Set up the skeleton for vectorization, including vector pre-header and
   // middle block. The vector loop is created during VPlan execution.
+  VPBasicBlock *VectorPH =
+      cast<VPBasicBlock>(BestVPlan.getEntry()->getSingleSuccessor());
   State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton(
       ExpandedSCEVs ? *ExpandedSCEVs : State.ExpandedSCEVs);
   if (VectorizingEpilogue)
@@ -7860,19 +7788,20 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
   BestVPlan.prepareToExecute(
       ILV.getTripCount(),
       ILV.getOrCreateVectorTripCount(ILV.LoopVectorPreHeader), State);
+  replaceVPBBWithIRVPBB(VectorPH, State.CFG.PrevBB);
 
   BestVPlan.execute(&State);
 
-  auto *ExitVPBB = BestVPlan.getMiddleBlock();
+  auto *MiddleVPBB = BestVPlan.getMiddleBlock();
   // 2.5 When vectorizing the epilogue, fix reduction and induction resume
   // values from the additional bypass block.
   if (VectorizingEpilogue) {
     assert(!ILV.Legal->hasUncountableEarlyExit() &&
            "Epilogue vectorisation not yet supported with early exits");
     BasicBlock *BypassBlock = ILV.getAdditionalBypassBlock();
-    for (VPRecipeBase &R : *ExitVPBB) {
+    for (VPRecipeBase &R : *MiddleVPBB) {
       fixReductionScalarResumeWhenVectorizingEpilog(
-          &R, State, State.CFG.VPBB2IRBB[ExitVPBB], BypassBlock);
+          &R, State, State.CFG.VPBB2IRBB[MiddleVPBB], BypassBlock);
     }
     BasicBlock *PH = OrigLoop->getLoopPreheader();
     for (const auto &[IVPhi, _] : Legal->getInductionVars()) {
@@ -7885,30 +7814,31 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
   // 2.6. Maintain Loop Hints
   // Keep all loop hints from the original loop on the vector loop (we'll
   // replace the vectorizer-specific hints below).
-  MDNode *OrigLoopID = OrigLoop->getLoopID();
+  if (auto *LoopRegion = BestVPlan.getVectorLoopRegion()) {
+    MDNode *OrigLoopID = OrigLoop->getLoopID();
 
-  std::optional<MDNode *> VectorizedLoopID =
-      makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
-                                      LLVMLoopVectorizeFollowupVectorized});
-
-  VPBasicBlock *HeaderVPBB =
-      BestVPlan.getVectorLoopRegion()->getEntryBasicBlock();
-  Loop *L = LI->getLoopFor(State.CFG.VPBB2IRBB[HeaderVPBB]);
-  if (VectorizedLoopID)
-    L->setLoopID(*VectorizedLoopID);
-  else {
-    // Keep all loop hints from the original loop on the vector loop (we'll
-    // replace the vectorizer-specific hints below).
-    if (MDNode *LID = OrigLoop->getLoopID())
-      L->setLoopID(LID);
-
-    LoopVectorizeHints Hints(L, true, *ORE);
-    Hints.setAlreadyVectorized();
+    std::optional<MDNode *> VectorizedLoopID =
+        makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
+                                        LLVMLoopVectorizeFollowupVectorized});
+
+    VPBasicBlock *HeaderVPBB = LoopRegion->getEntryBasicBlock();
+    Loop *L = LI->getLoopFor(State.CFG.VPBB2IRBB[HeaderVPBB]);
+    if (VectorizedLoopID) {
+      L->setLoopID(*VectorizedLoopID);
+    } else {
+      // Keep all loop hints from the original loop on the vector loop (we'll
+      // replace the vectorizer-specific hints below).
+      if (MDNode *LID = OrigLoop->getLoopID())
+        L->setLoopID(LID);
+
+      LoopVectorizeHints Hints(L, true, *ORE);
+      Hints.setAlreadyVectorized();
+    }
+    TargetTransformInfo::UnrollingPreferences UP;
+    TTI.getUnrollingPreferences(L, *PSE.getSE(), UP, ORE);
+    if (!UP.UnrollVectorizedLoop || VectorizingEpilogue)
+      addRuntimeUnrollDisableMetaData(L);
   }
-  TargetTransformInfo::UnrollingPreferences UP;
-  TTI.getUnrollingPreferences(L, *PSE.getSE(), UP, ORE);
-  if (!UP.UnrollVectorizedLoop || VectorizingEpilogue)
-    addRuntimeUnrollDisableMetaData(L);
 
   // 3. Fix the vectorized code: take care of header phi's, live-outs,
   //    predication, updating analyses.
@@ -7917,15 +7847,18 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
   ILV.printDebugTracesAtEnd();
 
   // 4. Adjust branch weight of the branch in the middle block.
-  auto *MiddleTerm =
-      cast<BranchInst>(State.CFG.VPBB2IRBB[ExitVPBB]->getTerminator());
-  if (MiddleTerm->isConditional() &&
-      hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator())) {
-    // Assume that `Count % VectorTripCount` is equally distributed.
-    unsigned TripCount = BestVPlan.getUF() * State.VF.getKnownMinValue();
-    assert(TripCount > 0 && "trip count should not be zero");
-    const uint32_t Weights[] = {1, TripCount - 1};
-    setBranchWeights(*MiddleTerm, Weights, /*IsExpected=*/false);
+  if (BestVPlan.getVectorLoopRegion()) {
+    auto *MiddleVPBB = BestVPlan.getMiddleBlock();
+    auto *MiddleTerm =
+        cast<BranchInst>(State.CFG.VPBB2IRBB[MiddleVPBB]->getTerminator());
+    if (MiddleTerm->isConditional() &&
+        hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator())) {
+      // Assume that `Count % VectorTripCount` is equally distributed.
+      unsigned TripCount = BestVPlan.getUF() * State.VF.getKnownMinValue();
+      assert(TripCount > 0 && "trip count should not be zero");
+      const uint32_t Weights[] = {1, TripCount - 1};
+      setBranchWeights(*MiddleTerm, Weights, /*IsExpected=*/false);
+    }
   }
 
   return State.ExpandedSCEVs;
@@ -7968,17 +7901,6 @@ BasicBlock *EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton(
   // Generate the induction variable.
   EPI.VectorTripCount = getOrCreateVectorTripCount(LoopVectorPreHeader);
 
-  // Generate VPValues and ResumePhi recipes for wide inductions in the epilogue
-  // plan only. Other inductions only need a resume value for the canonical
-  // induction, which will get created during epilogue skeleton construction.
-  SmallPtrSet<PHINode *, 4> WideIVs;
-  for (VPRecipeBase &H :
-       EPI.EpiloguePlan.getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
-    if (auto *WideIV = dyn_cast<VPWidenInductionRecipe>(&H))
-      WideIVs.insert(WideIV->getPHINode());
-  }
-  createInductionResumeVPValues(ExpandedSCEVs, nullptr, &WideIVs);
-
   return LoopVectorPreHeader;
 }
 
@@ -8048,7 +7970,7 @@ EpilogueVectorizerMainLoop::emitIterationCountCheck(BasicBlock *Bypass,
     setBranchWeights(BI, MinItersBypassWeights, /*IsExpected=*/false);
   ReplaceInstWithInst(TCCheckBlock->getTerminator(), &BI);
 
-  introduceCheckBlockInVPlan(Plan, TCCheckBlock);
+  introduceCheckBlockInVPlan(TCCheckBlock);
   return TCCheckBlock;
 }
 
@@ -8128,14 +8050,11 @@ EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton(
       Phi->removeIncomingValue(EPI.MemSafetyCheck);
   }
 
-  // Generate induction resume values. These variables save the new starting
-  // indexes for the scalar loop. They are used to test if there are any tail
-  // iterations left once the vector loop has completed.
-  // Note that when the vectorized epilogue is skipped due to iteration count
-  // check, then the resume value for the induction variable comes from
-  // the trip count of the main vector loop, passed as the second argument.
-  createInductionResumeVPValues(ExpandedSCEVs, EPI.VectorTripCount);
-
+  // Generate bypass values from the additional bypass block. Note that when the
+  // vectorized epilogue is skipped due to iteration count check, then the
+  // resume value for the induction variable comes from the trip count of the
+  // main vector loop, passed as the second argument.
+  createInductionAdditionalBypassValues(ExpandedSCEVs, EPI.VectorTripCount);
   return LoopVectorPreHeader;
 }
 
@@ -8185,13 +8104,13 @@ EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck(
 
   // A new entry block has been created for the epilogue VPlan. Hook it in, as
   // otherwise we would try to modify the entry to the main vector loop.
-  VPIRBasicBlock *NewEntry = VPIRBasicBlock::fromBasicBlock(Insert);
+  VPIRBasicBlock *NewEntry = Plan.createVPIRBasicBlock(Insert);
   VPBasicBlock *OldEntry = Plan.getEntry();
   VPBlockUtils::reassociateBlocks(OldEntry, NewEntry);
   Plan.setEntry(NewEntry);
-  delete OldEntry;
+  // OldEntry is now dead and will be cleaned up when the plan gets destroyed.
 
-  introduceCheckBlockInVPlan(Plan, Insert);
+  introduceCheckBlockInVPlan(Insert);
   return Insert;
 }
 
@@ -8435,17 +8354,22 @@ VPRecipeBuilder::tryToWidenMemory(Instruction *I, ArrayRef<VPValue *> Operands,
     auto *GEP = dyn_cast<GetElementPtrInst>(
         Ptr->getUnderlyingValue()->stripPointerCasts());
     VPSingleDefRecipe *VectorPtr;
-    if (Reverse)
+    if (Reverse) {
+      // When folding the tail, we may compute an address that we don't in the
+      // original scalar loop and it may not be inbounds. Drop Inbounds in that
+      // case.
+      GEPNoWrapFlags Flags =
+          (CM.foldTailByMasking() || !GEP || !GEP->isInBounds())
+              ? GEPNoWrapFlags::none()
+              : GEPNoWrapFlags::inBounds();
       VectorPtr = new VPReverseVectorPointerRecipe(
-          Ptr, &Plan.getVF(), getLoadStoreType(I),
-          GEP && GEP->isInBounds() ? GEPNoWrapFlags::inBounds()
-                                   : GEPNoWrapFlags::none(),
-          I->getDebugLoc());
-    else
+          Ptr, &Plan.getVF(), getLoadStoreType(I), Flags, I->getDebugLoc());
+    } else {
       VectorPtr = new VPVectorPointerRecipe(Ptr, getLoadStoreType(I),
                                             GEP ? GEP->getNoWrapFlags()
                                                 : GEPNoWrapFlags::none(),
                                             I->getDebugLoc());
+    }
     Builder.getInsertBlock()->appendRecipe(VectorPtr);
     Ptr = VectorPtr;
   }
@@ -8955,14 +8879,56 @@ static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, bool HasNUW,
                        {CanonicalIVIncrement, &Plan.getVectorTripCount()}, DL);
 }
 
-/// Create resume phis in the scalar preheader for first-order recurrences and
-/// reductions and update the VPIRInstructions wrapping the original phis in the
-/// scalar header.
+/// Create and return a ResumePhi for \p WideIV, unless it is truncated. If the
+/// induction recipe is not canonical, creates a VPDerivedIVRecipe to compute
+/// the end value of the induction.
+static VPValue *addResumePhiRecipeForInduction(VPWidenInductionRecipe *WideIV,
+                                               VPBuilder &VectorPHBuilder,
+                                               VPBuilder &ScalarPHBuilder,
+                                               VPTypeAnalysis &TypeInfo,
+                                               VPValue *VectorTC) {
+  auto *WideIntOrFp = dyn_cast<VPWidenIntOrFpInductionRecipe>(WideIV);
+  // Truncated wide inductions resume from the last lane of their vector value
+  // in the last vector iteration which is handled elsewhere.
+  if (WideIntOrFp && WideIntOrFp->getTruncInst())
+    return nullptr;
+
+  VPValue *Start = WideIV->getStartValue();
+  VPValue *Step = WideIV->getStepValue();
+  const InductionDescriptor &ID = WideIV->getInductionDescriptor();
+  VPValue *EndValue = VectorTC;
+  if (!WideIntOrFp || !WideIntOrFp->isCanonical()) {
+    EndValue = VectorPHBuilder.createDerivedIV(
+        ID.getKind(), dyn_cast_or_null<FPMathOperator>(ID.getInductionBinOp()),
+        Start, VectorTC, Step);
+  }
+
+  // EndValue is derived from the vector trip count (which has the same type as
+  // the widest induction) and thus may be wider than the induction here.
+  Type *ScalarTypeOfWideIV = TypeInfo.inferScalarType(WideIV);
+  if (ScalarTypeOfWideIV != TypeInfo.inferScalarType(EndValue)) {
+    EndValue = VectorPHBuilder.createScalarCast(Instruction::Trunc, EndValue,
+                                                ScalarTypeOfWideIV,
+                                                WideIV->getDebugLoc());
+  }
+
+  auto *ResumePhiRecipe =
+      ScalarPHBuilder.createNaryOp(VPInstruction::ResumePhi, {EndValue, Start},
+                                   WideIV->getDebugLoc(), "bc.resume.val");
+  return ResumePhiRecipe;
+}
+
+/// Create resume phis in the scalar preheader for first-order recurrences,
+/// reductions and inductions, and update the VPIRInstructions wrapping the
+/// original phis in the scalar header.
 static void addScalarResumePhis(VPRecipeBuilder &Builder, VPlan &Plan) {
+  VPTypeAnalysis TypeInfo(Plan.getCanonicalIV()->getScalarType());
   auto *ScalarPH = Plan.getScalarPreheader();
   auto *MiddleVPBB = cast<VPBasicBlock>(ScalarPH->getSinglePredecessor());
-  VPBuilder ScalarPHBuilder(ScalarPH);
+  VPBuilder VectorPHBuilder(
+      cast<VPBasicBlock>(Plan.getVectorLoopRegion()->getSinglePredecessor()));
   VPBuilder MiddleBuilder(MiddleVPBB, MiddleVPBB->getFirstNonPhi());
+  VPBuilder ScalarPHBuilder(ScalarPH);
   VPValue *OneVPV = Plan.getOrAddLiveIn(
       ConstantInt::get(Plan.getCanonicalIV()->getScalarType(), 1));
   for (VPRecipeBase &ScalarPhiR : *Plan.getScalarHeader()) {
@@ -8970,9 +8936,23 @@ static void addScalarResumePhis(VPRecipeBuilder &Builder, VPlan &Plan) {
     auto *ScalarPhiI = dyn_cast<PHINode>(&ScalarPhiIRI->getInstruction());
     if (!ScalarPhiI)
       break;
+
     auto *VectorPhiR = cast<VPHeaderPHIRecipe>(Builder.getRecipe(ScalarPhiI));
-    if (!isa<VPFirstOrderRecurrencePHIRecipe, VPReductionPHIRecipe>(VectorPhiR))
+    if (auto *WideIVR = dyn_cast<VPWidenInductionRecipe>(VectorPhiR)) {
+      if (VPValue *ResumePhi = addResumePhiRecipeForInduction(
+              WideIVR, VectorPHBuilder, ScalarPHBuilder, TypeInfo,
+              &Plan.getVectorTripCount())) {
+        ScalarPhiIRI->addOperand(ResumePhi);
+        continue;
+      }
+      // TODO: Also handle truncated inductions here. Computing end-values
+      // separately should be done as VPlan-to-VPlan optimization, after
+      // legalizing all resume values to use the last lane from the loop.
+      assert(cast<VPWidenIntOrFpInductionRecipe>(VectorPhiR)->getTruncInst() &&
+             "should only skip truncated wide inductions");
       continue;
+    }
+
     // The backedge value provides the value to resume coming out of a loop,
     // which for FORs is a vector whose last element needs to be extracted. The
     // start value provides the value if the loop is bypassed.
@@ -8990,14 +8970,73 @@ static void addScalarResumePhis(VPRecipeBuilder &Builder, VPlan &Plan) {
   }
 }
 
+/// Return true if \p VPV is an optimizable IV or IV use. That is, if \p VPV is
+/// either an untruncated wide induction, or if it increments a wide induction
+/// by its step.
+static bool isOptimizableIVOrUse(VPValue *VPV) {
+  VPRecipeBase *Def = VPV->getDefiningRecipe();
+  if (!Def)
+    return false;
+  auto *WideIV = dyn_cast<VPWidenInductionRecipe>(Def);
+  if (WideIV) {
+    // VPV itself is a wide induction, separately compute the end value for exit
+    // users if it is not a truncated IV.
+    return isa<VPWidenPointerInductionRecipe>(WideIV) ||
+           !cast<VPWidenIntOrFpInductionRecipe>(WideIV)->getTruncInst();
+  }
+
+  // Check if VPV is an optimizable induction increment.
+  if (Def->getNumOperands() != 2)
+    return false;
+  WideIV = dyn_cast<VPWidenInductionRecipe>(Def->getOperand(0));
+  if (!WideIV)
+    WideIV = dyn_cast<VPWidenInductionRecipe>(Def->getOperand(1));
+  if (!WideIV)
+    return false;
+
+  using namespace VPlanPatternMatch;
+  auto &ID = WideIV->getInductionDescriptor();
+
+  // Check if VPV increments the induction by the induction step.
+  VPValue *IVStep = WideIV->getStepValue();
+  switch (ID.getInductionOpcode()) {
+  case Instruction::Add:
+    return match(VPV, m_c_Binary<Instruction::Add>(m_Specific(WideIV),
+                                                   m_Specific(IVStep)));
+  case Instruction::FAdd:
+    return match(VPV, m_c_Binary<Instruction::FAdd>(m_Specific(WideIV),
+                                                    m_Specific(IVStep)));
+  case Instruction::FSub:
+    return match(VPV, m_Binary<Instruction::FSub>(m_Specific(WideIV),
+                                                  m_Specific(IVStep)));
+  case Instruction::Sub: {
+    // IVStep will be the negated step of the subtraction. Check if Step == -1 *
+    // IVStep.
+    VPValue *Step;
+    if (!match(VPV, m_Binary<Instruction::Sub>(m_VPValue(), m_VPValue(Step))) ||
+        !Step->isLiveIn() || !IVStep->isLiveIn())
+      return false;
+    auto *StepCI = dyn_cast<ConstantInt>(Step->getLiveInIRValue());
+    auto *IVStepCI = dyn_cast<ConstantInt>(IVStep->getLiveInIRValue());
+    return StepCI && IVStepCI &&
+           StepCI->getValue() == (-1 * IVStepCI->getValue());
+  }
+  default:
+    return ID.getKind() == InductionDescriptor::IK_PtrInduction &&
+           match(VPV, m_GetElementPtr(m_Specific(WideIV),
+                                      m_Specific(WideIV->getStepValue())));
+  }
+  llvm_unreachable("should have been covered by switch above");
+}
+
 // Collect VPIRInstructions for phis in the exit blocks that are modeled
 // in VPlan and add the exiting VPValue as operand. Some exiting values are not
 // modeled explicitly yet and won't be included. Those are un-truncated
 // VPWidenIntOrFpInductionRecipe, VPWidenPointerInductionRecipe and induction
 // increments.
-static SetVector<VPIRInstruction *> collectUsersInExitBlocks(
-    Loop *OrigLoop, VPRecipeBuilder &Builder, VPlan &Plan,
-    const MapVector<PHINode *, InductionDescriptor> &Inductions) {
+static SetVector<VPIRInstruction *>
+collectUsersInExitBlocks(Loop *OrigLoop, VPRecipeBuilder &Builder,
+                         VPlan &Plan) {
   auto *MiddleVPBB = Plan.getMiddleBlock();
   SetVector<VPIRInstruction *> ExitUsersToFix;
   for (VPIRBasicBlock *ExitVPBB : Plan.getExitBlocks()) {
@@ -9022,18 +9061,9 @@ static SetVector<VPIRInstruction *> collectUsersInExitBlocks(
         // Exit values for inductions are computed and updated outside of VPlan
         // and independent of induction recipes.
         // TODO: Compute induction exit values in VPlan.
-        if ((isa<VPWidenIntOrFpInductionRecipe>(V) &&
-             !cast<VPWidenIntOrFpInductionRecipe>(V)->getTruncInst()) ||
-            isa<VPWidenPointerInductionRecipe>(V) ||
-            (isa<Instruction>(IncomingValue) &&
-             OrigLoop->contains(cast<Instruction>(IncomingValue)) &&
-             any_of(IncomingValue->users(), [&Inductions](User *U) {
-               auto *P = dyn_cast<PHINode>(U);
-               return P && Inductions.contains(P);
-             }))) {
-          if (ExitVPBB->getSinglePredecessor() == MiddleVPBB)
-            continue;
-        }
+        if (isOptimizableIVOrUse(V) &&
+            ExitVPBB->getSinglePredecessor() == MiddleVPBB)
+          continue;
         ExitUsersToFix.insert(ExitIRI);
         ExitIRI->addOperand(V);
       }
@@ -9335,7 +9365,7 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
         VPBB->appendRecipe(Recipe);
     }
 
-    VPBlockUtils::insertBlockAfter(new VPBasicBlock(), VPBB);
+    VPBlockUtils::insertBlockAfter(Plan->createVPBasicBlock(""), VPBB);
     VPBB = cast<VPBasicBlock>(VPBB->getSingleSuccessor());
   }
 
@@ -9348,14 +9378,28 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
          "VPBasicBlock");
   RecipeBuilder.fixHeaderPhis();
 
+  // Update wide induction increments to use the same step as the corresponding
+  // wide induction. This enables detecting induction increments directly in
+  // VPlan and removes redundant splats.
+  for (const auto &[Phi, ID] : Legal->getInductionVars()) {
+    auto *IVInc = cast<Instruction>(
+        Phi->getIncomingValueForBlock(OrigLoop->getLoopLatch()));
+    if (IVInc->getOperand(0) != Phi || IVInc->getOpcode() != Instruction::Add)
+      continue;
+    VPWidenInductionRecipe *WideIV =
+        cast<VPWidenInductionRecipe>(RecipeBuilder.getRecipe(Phi));
+    VPRecipeBase *R = RecipeBuilder.getRecipe(IVInc);
+    R->setOperand(1, WideIV->getStepValue());
+  }
+
   if (auto *UncountableExitingBlock =
           Legal->getUncountableEarlyExitingBlock()) {
     VPlanTransforms::handleUncountableEarlyExit(
         *Plan, *PSE.getSE(), OrigLoop, UncountableExitingBlock, RecipeBuilder);
   }
   addScalarResumePhis(RecipeBuilder, *Plan);
-  SetVector<VPIRInstruction *> ExitUsersToFix = collectUsersInExitBlocks(
-      OrigLoop, RecipeBuilder, *Plan, Legal->getInductionVars());
+  SetVector<VPIRInstruction *> ExitUsersToFix =
+      collectUsersInExitBlocks(OrigLoop, RecipeBuilder, *Plan);
   addExitUsersForFirstOrderRecurrences(*Plan, ExitUsersToFix);
   if (!addUsersInExitBlocks(*Plan, ExitUsersToFix)) {
     reportVectorizationFailure(
@@ -9474,6 +9518,18 @@ VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
   bool HasNUW = true;
   addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), HasNUW,
                         DebugLoc());
+
+  // Collect mapping of IR header phis to header phi recipes, to be used in
+  // addScalarResumePhis.
+  VPRecipeBuilder RecipeBuilder(*Plan, OrigLoop, TLI, Legal, CM, PSE, Builder);
+  for (auto &R : Plan->getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
+    if (isa<VPCanonicalIVPHIRecipe>(&R))
+      continue;
+    auto *HeaderR = cast<VPHeaderPHIRecipe>(&R);
+    RecipeBuilder.setRecipe(HeaderR->getUnderlyingInstr(), HeaderR);
+  }
+  addScalarResumePhis(RecipeBuilder, *Plan);
+
   assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid");
   return Plan;
 }
@@ -9611,9 +9667,9 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
       if (CM.blockNeedsPredicationForAnyReason(BB))
         CondOp = RecipeBuilder.getBlockInMask(BB);
 
-      VPReductionRecipe *RedRecipe =
-          new VPReductionRecipe(RdxDesc, CurrentLinkI, PreviousLink, VecOp,
-                                CondOp, CM.useOrderedReductions(RdxDesc));
+      auto *RedRecipe = new VPReductionRecipe(
+          RdxDesc, CurrentLinkI, PreviousLink, VecOp, CondOp,
+          CM.useOrderedReductions(RdxDesc), CurrentLinkI->getDebugLoc());
       // Append the recipe to the end of the VPBasicBlock because we need to
       // ensure that it comes after all of it's inputs, including CondOp.
       // Note that this transformation may leave over dead recipes (including
@@ -9762,13 +9818,18 @@ void VPDerivedIVRecipe::execute(VPTransformState &State) {
     State.Builder.setFastMathFlags(FPBinOp->getFastMathFlags());
 
   Value *Step = State.get(getStepValue(), VPLane(0));
-  Value *CanonicalIV = State.get(getOperand(1), VPLane(0));
+  Value *Index = State.get(getOperand(1), VPLane(0));
   Value *DerivedIV = emitTransformedIndex(
-      State.Builder, CanonicalIV, getStartValue()->getLiveInIRValue(), Step,
-      Kind, cast_if_present<BinaryOperator>(FPBinOp));
+      State.Builder, Index, getStartValue()->getLiveInIRValue(), Step, Kind,
+      cast_if_present<BinaryOperator>(FPBinOp));
   DerivedIV->setName(Name);
-  assert(DerivedIV != CanonicalIV && "IV didn't need transforming?");
-
+  // If index is the vector trip count, the concrete value will only be set in
+  // prepareToExecute, leading to missed simplifications, e.g. if it is 0.
+  // TODO: Remove the special case for the vector trip count once it is computed
+  // in VPlan and can be used during VPlan simplification.
+  assert((DerivedIV != Index ||
+          getOperand(1) == &getParent()->getPlan()->getVectorTripCount()) &&
+         "IV didn't need transforming?");
   State.set(this, DerivedIV, VPLane(0));
 }
 
@@ -10078,6 +10139,57 @@ LoopVectorizePass::LoopVectorizePass(LoopVectorizeOptions Opts)
       VectorizeOnlyWhenForced(Opts.VectorizeOnlyWhenForced ||
                               !EnableLoopVectorization) {}
 
+/// Prepare \p MainPlan for vectorizing the main vector loop during epilogue
+/// vectorization. Remove ResumePhis from \p MainPlan for inductions that
+/// don't have a corresponding wide induction in \p EpiPlan.
+static void preparePlanForMainVectorLoop(VPlan &MainPlan, VPlan &EpiPlan) {
+  // Collect PHI nodes of widened phis in the VPlan for the epilogue. Those
+  // will need their resume-values computed in the main vector loop. Others
+  // can be removed from the main VPlan.
+  SmallPtrSet<PHINode *, 2> EpiWidenedPhis;
+  for (VPRecipeBase &R :
+       EpiPlan.getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
+    if (isa<VPCanonicalIVPHIRecipe>(&R))
+      continue;
+    EpiWidenedPhis.insert(
+        cast<PHINode>(R.getVPSingleValue()->getUnderlyingValue()));
+  }
+  for (VPRecipeBase &R : make_early_inc_range(
+           *cast<VPIRBasicBlock>(MainPlan.getScalarHeader()))) {
+    auto *VPIRInst = cast<VPIRInstruction>(&R);
+    auto *IRI = dyn_cast<PHINode>(&VPIRInst->getInstruction());
+    if (!IRI)
+      break;
+    if (EpiWidenedPhis.contains(IRI))
+      continue;
+    // There is no corresponding wide induction in the epilogue plan that would
+    // need a resume value. Remove the VPIRInst wrapping the scalar header phi
+    // together with the corresponding ResumePhi. The resume values for the
+    // scalar loop will be created during execution of EpiPlan.
+    VPRecipeBase *ResumePhi = VPIRInst->getOperand(0)->getDefiningRecipe();
+    VPIRInst->eraseFromParent();
+    ResumePhi->eraseFromParent();
+  }
+  VPlanTransforms::removeDeadRecipes(MainPlan);
+
+  using namespace VPlanPatternMatch;
+  VPBasicBlock *MainScalarPH = MainPlan.getScalarPreheader();
+  VPValue *VectorTC = &MainPlan.getVectorTripCount();
+  // If there is a suitable resume value for the canonical induction in the
+  // scalar (which will become vector) epilogue loop we are done. Otherwise
+  // create it below.
+  if (any_of(*MainScalarPH, [VectorTC](VPRecipeBase &R) {
+        return match(&R, m_VPInstruction<VPInstruction::ResumePhi>(
+                             m_Specific(VectorTC), m_SpecificInt(0)));
+      }))
+    return;
+  VPBuilder ScalarPHBuilder(MainScalarPH, MainScalarPH->begin());
+  ScalarPHBuilder.createNaryOp(
+      VPInstruction::ResumePhi,
+      {VectorTC, MainPlan.getCanonicalIV()->getStartValue()}, {},
+      "vec.epilog.resume.val");
+}
+
 /// Prepare \p Plan for vectorizing the epilogue loop. That is, re-use expanded
 /// SCEVs from \p ExpandedSCEVs and set resume values for header recipes.
 static void
@@ -10542,12 +10654,12 @@ bool LoopVectorizePass::processLoop(Loop *L) {
         // to be vectorized by executing the plan (potentially with a different
         // factor) again shortly afterwards.
         VPlan &BestEpiPlan = LVP.getPlanFor(EpilogueVF.Width);
+        preparePlanForMainVectorLoop(*BestMainPlan, BestEpiPlan);
         EpilogueLoopVectorizationInfo EPI(VF.Width, IC, EpilogueVF.Width, 1,
                                           BestEpiPlan);
         EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TLI, TTI, AC, ORE,
                                            EPI, &LVL, &CM, BFI, PSI, Checks,
                                            *BestMainPlan);
-
         auto ExpandedSCEVs = LVP.executePlan(EPI.MainLoopVF, EPI.MainLoopUF,
                                              *BestMainPlan, MainILV, DT, false);
         ++LoopsVectorized;
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index b5d68c0..36fed89 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -104,6 +104,7 @@
 using namespace llvm;
 using namespace llvm::PatternMatch;
 using namespace slpvectorizer;
+using namespace std::placeholders;
 
 #define SV_NAME "slp-vectorizer"
 #define DEBUG_TYPE "SLP"
@@ -702,7 +703,8 @@ static SmallBitVector isUndefVector(const Value *V,
 /// TODO: Can we split off and reuse the shuffle mask detection from
 /// ShuffleVectorInst/getShuffleCost?
 static std::optional<TargetTransformInfo::ShuffleKind>
-isFixedVectorShuffle(ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask) {
+isFixedVectorShuffle(ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask,
+                     AssumptionCache *AC) {
   const auto *It = find_if(VL, IsaPred<ExtractElementInst>);
   if (It == VL.end())
     return std::nullopt;
@@ -719,14 +721,14 @@ isFixedVectorShuffle(ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask) {
 
   Value *Vec1 = nullptr;
   Value *Vec2 = nullptr;
-  bool HasNonUndefVec = any_of(VL, [](Value *V) {
+  bool HasNonUndefVec = any_of(VL, [&](Value *V) {
     auto *EE = dyn_cast<ExtractElementInst>(V);
     if (!EE)
       return false;
     Value *Vec = EE->getVectorOperand();
     if (isa<UndefValue>(Vec))
       return false;
-    return isGuaranteedNotToBePoison(Vec);
+    return isGuaranteedNotToBePoison(Vec, AC);
   });
   enum ShuffleMode { Unknown, Select, Permute };
   ShuffleMode CommonShuffleMode = Unknown;
@@ -815,27 +817,34 @@ class InstructionsState {
   Instruction *AltOp = nullptr;
 
 public:
-  Instruction *getMainOp() const { return MainOp; }
+  Instruction *getMainOp() const {
+    assert(valid() && "InstructionsState is invalid.");
+    return MainOp;
+  }
 
-  Instruction *getAltOp() const { return AltOp; }
+  Instruction *getAltOp() const {
+    assert(valid() && "InstructionsState is invalid.");
+    return AltOp;
+  }
 
   /// The main/alternate opcodes for the list of instructions.
-  unsigned getOpcode() const {
-    return MainOp ? MainOp->getOpcode() : 0;
-  }
+  unsigned getOpcode() const { return getMainOp()->getOpcode(); }
 
-  unsigned getAltOpcode() const {
-    return AltOp ? AltOp->getOpcode() : 0;
-  }
+  unsigned getAltOpcode() const { return getAltOp()->getOpcode(); }
 
   /// Some of the instructions in the list have alternate opcodes.
-  bool isAltShuffle() const { return AltOp != MainOp; }
+  bool isAltShuffle() const { return getMainOp() != getAltOp(); }
 
   bool isOpcodeOrAlt(Instruction *I) const {
     unsigned CheckedOpcode = I->getOpcode();
     return getOpcode() == CheckedOpcode || getAltOpcode() == CheckedOpcode;
   }
 
+  /// Checks if the current state is valid, i.e. has non-null MainOp
+  bool valid() const { return MainOp && AltOp; }
+
+  explicit operator bool() const { return valid(); }
+
   InstructionsState() = delete;
   InstructionsState(Instruction *MainOp, Instruction *AltOp)
       : MainOp(MainOp), AltOp(AltOp) {}
@@ -868,8 +877,8 @@ static bool areCompatibleCmpOps(Value *BaseOp0, Value *BaseOp1, Value *Op0,
          (!isa<Instruction>(BaseOp0) && !isa<Instruction>(Op0) &&
           !isa<Instruction>(BaseOp1) && !isa<Instruction>(Op1)) ||
          BaseOp0 == Op0 || BaseOp1 == Op1 ||
-         getSameOpcode({BaseOp0, Op0}, TLI).getOpcode() ||
-         getSameOpcode({BaseOp1, Op1}, TLI).getOpcode();
+         getSameOpcode({BaseOp0, Op0}, TLI) ||
+         getSameOpcode({BaseOp1, Op1}, TLI);
 }
 
 /// \returns true if a compare instruction \p CI has similar "look" and
@@ -1846,7 +1855,7 @@ public:
         InstructionsState S = getSameOpcode(Ops, TLI);
         // Note: Only consider instructions with <= 2 operands to avoid
         // complexity explosion.
-        if (S.getOpcode() &&
+        if (S &&
             (S.getMainOp()->getNumOperands() <= 2 || !MainAltOps.empty() ||
              !S.isAltShuffle()) &&
             all_of(Ops, [&S](Value *V) {
@@ -2381,7 +2390,7 @@ public:
         // Use Boyer-Moore majority voting for finding the majority opcode and
         // the number of times it occurs.
         if (auto *I = dyn_cast<Instruction>(OpData.V)) {
-          if (!OpcodeI || !getSameOpcode({OpcodeI, I}, TLI).getOpcode() ||
+          if (!OpcodeI || !getSameOpcode({OpcodeI, I}, TLI) ||
               I->getParent() != Parent) {
             if (NumOpsWithSameOpcodeParent == 0) {
               NumOpsWithSameOpcodeParent = 1;
@@ -2500,8 +2509,7 @@ public:
                 // 2.1. If we have only 2 lanes, need to check that value in the
                 // next lane does not build same opcode sequence.
                 (Lns == 2 &&
-                 !getSameOpcode({Op, getValue((OpI + 1) % OpE, Ln)}, TLI)
-                      .getOpcode() &&
+                 !getSameOpcode({Op, getValue((OpI + 1) % OpE, Ln)}, TLI) &&
                  isa<Constant>(Data.V)))) ||
               // 3. The operand in the current lane is loop invariant (can be
               // hoisted out) and another operand is also a loop invariant
@@ -2510,7 +2518,7 @@ public:
               // FIXME: need to teach the cost model about this case for better
               // estimation.
               (IsInvariant && !isa<Constant>(Data.V) &&
-               !getSameOpcode({Op, Data.V}, TLI).getOpcode() &&
+               !getSameOpcode({Op, Data.V}, TLI) &&
                L->isLoopInvariant(Data.V))) {
             FoundCandidate = true;
             Data.IsUsed = Data.V == Op;
@@ -2540,7 +2548,7 @@ public:
                 return true;
               Value *OpILn = getValue(OpI, Ln);
               return (L && L->isLoopInvariant(OpILn)) ||
-                     (getSameOpcode({Op, OpILn}, TLI).getOpcode() &&
+                     (getSameOpcode({Op, OpILn}, TLI) &&
                       allSameBlock({Op, OpILn}));
             }))
           return true;
@@ -2697,7 +2705,7 @@ public:
                 OperandData &AltOp = getData(OpIdx, Lane);
                 InstructionsState OpS =
                     getSameOpcode({MainAltOps[OpIdx].front(), AltOp.V}, TLI);
-                if (OpS.getOpcode() && OpS.isAltShuffle())
+                if (OpS && OpS.isAltShuffle())
                   MainAltOps[OpIdx].push_back(AltOp.V);
               }
             }
@@ -3310,7 +3318,7 @@ private:
 
     /// For gather/buildvector/alt opcode (TODO) nodes, which are combined from
     /// other nodes as a series of insertvector instructions.
-    SmallVector<std::pair<unsigned, unsigned>, 0> CombinedEntriesWithIndices;
+    SmallVector<std::pair<unsigned, unsigned>, 2> CombinedEntriesWithIndices;
 
   private:
     /// The operands of each instruction in each lane Operands[op_index][lane].
@@ -3399,6 +3407,7 @@ private:
     }
 
     void setOperations(const InstructionsState &S) {
+      assert(S && "InstructionsState is invalid.");
       MainOp = S.getMainOp();
       AltOp = S.getAltOp();
     }
@@ -3544,6 +3553,13 @@ private:
       for (const auto &EInfo : UserTreeIndices)
         dbgs() << EInfo << ", ";
       dbgs() << "\n";
+      if (!CombinedEntriesWithIndices.empty()) {
+        dbgs() << "Combined entries: ";
+        interleaveComma(CombinedEntriesWithIndices, dbgs(), [&](const auto &P) {
+          dbgs() << "Entry index " << P.first << " with offset " << P.second;
+        });
+        dbgs() << "\n";
+      }
     }
 #endif
   };
@@ -3592,7 +3608,7 @@ private:
            "Need to vectorize gather entry?");
     // Gathered loads still gathered? Do not create entry, use the original one.
     if (GatheredLoadsEntriesFirst.has_value() &&
-        EntryState == TreeEntry::NeedToGather &&
+        EntryState == TreeEntry::NeedToGather && S &&
         S.getOpcode() == Instruction::Load && UserTreeIdx.EdgeIdx == UINT_MAX &&
         !UserTreeIdx.UserTE)
       return nullptr;
@@ -3610,7 +3626,8 @@ private:
                                      ReuseShuffleIndices.end());
     if (ReorderIndices.empty()) {
       Last->Scalars.assign(VL.begin(), VL.end());
-      Last->setOperations(S);
+      if (S)
+        Last->setOperations(S);
     } else {
       // Reorder scalars and build final mask.
       Last->Scalars.assign(VL.size(), nullptr);
@@ -3621,7 +3638,8 @@ private:
                   return VL[Idx];
                 });
       InstructionsState S = getSameOpcode(Last->Scalars, *TLI);
-      Last->setOperations(S);
+      if (S)
+        Last->setOperations(S);
       Last->ReorderIndices.append(ReorderIndices.begin(), ReorderIndices.end());
     }
     if (!Last->isGather()) {
@@ -4766,8 +4784,7 @@ static bool arePointersCompatible(Value *Ptr1, Value *Ptr2,
            (!GEP2 || isConstant(GEP2->getOperand(1)))) ||
           !CompareOpcodes ||
           (GEP1 && GEP2 &&
-           getSameOpcode({GEP1->getOperand(1), GEP2->getOperand(1)}, TLI)
-               .getOpcode()));
+           getSameOpcode({GEP1->getOperand(1), GEP2->getOperand(1)}, TLI)));
 }
 
 /// Calculates minimal alignment as a common alignment.
@@ -4939,6 +4956,37 @@ getShuffleCost(const TargetTransformInfo &TTI, TTI::ShuffleKind Kind,
   return TTI.getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp, Args);
 }
 
+/// Correctly creates insert_subvector, checking that the index is multiple of
+/// the subvectors length. Otherwise, generates shuffle using \p Generator or
+/// using default shuffle.
+static Value *createInsertVector(
+    IRBuilderBase &Builder, Value *Vec, Value *V, unsigned Index,
+    function_ref<Value *(Value *, Value *, ArrayRef<int>)> Generator = {}) {
+  const unsigned SubVecVF = getNumElements(V->getType());
+  if (Index % SubVecVF == 0) {
+    Vec = Builder.CreateInsertVector(Vec->getType(), Vec, V,
+                                     Builder.getInt64(Index));
+  } else {
+    // Create shuffle, insertvector requires that index is multiple of
+    // the subvector length.
+    const unsigned VecVF = getNumElements(Vec->getType());
+    SmallVector<int> Mask(VecVF, PoisonMaskElem);
+    std::iota(Mask.begin(), std::next(Mask.begin(), Index), 0);
+    for (unsigned I : seq<unsigned>(SubVecVF))
+      Mask[I + Index] = I + VecVF;
+    if (Generator) {
+      Vec = Generator(Vec, V, Mask);
+    } else {
+      // 1. Resize V to the size of Vec.
+      SmallVector<int> ResizeMask(VecVF, PoisonMaskElem);
+      std::iota(ResizeMask.begin(), std::next(ResizeMask.begin(), SubVecVF), 0);
+      V = Builder.CreateShuffleVector(V, ResizeMask);
+      Vec = Builder.CreateShuffleVector(Vec, V, Mask);
+    }
+  }
+  return Vec;
+}
+
 BoUpSLP::LoadsState
 BoUpSLP::canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
                            SmallVectorImpl<unsigned> &Order,
@@ -5339,11 +5387,10 @@ static bool clusterSortPtrAccesses(ArrayRef<Value *> VL,
     SmallPtrSet<Value *, 13> SecondPointers;
     Value *P1 = Ptr1;
     Value *P2 = Ptr2;
-    if (P1 == P2)
-      return false;
     unsigned Depth = 0;
-    while (!FirstPointers.contains(P2) && !SecondPointers.contains(P1) &&
-           Depth <= RecursionMaxDepth) {
+    while (!FirstPointers.contains(P2) && !SecondPointers.contains(P1)) {
+      if (P1 == P2 || Depth > RecursionMaxDepth)
+        return false;
       FirstPointers.insert(P1);
       SecondPointers.insert(P2);
       P1 = getUnderlyingObject(P1, /*MaxLookup=*/1);
@@ -5635,8 +5682,11 @@ BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) {
     auto PHICompare = [&](unsigned I1, unsigned I2) {
       Value *V1 = TE.Scalars[I1];
       Value *V2 = TE.Scalars[I2];
-      if (V1 == V2 || (V1->getNumUses() == 0 && V2->getNumUses() == 0) ||
-          isa<PoisonValue>(V1) || isa<PoisonValue>(V2))
+      if (V1 == V2 || (V1->getNumUses() == 0 && V2->getNumUses() == 0))
+        return false;
+      if (isa<PoisonValue>(V1))
+        return true;
+      if (isa<PoisonValue>(V2))
         return false;
       if (V1->getNumUses() < V2->getNumUses())
         return true;
@@ -7489,7 +7539,7 @@ bool BoUpSLP::areAltOperandsProfitable(const InstructionsState &S,
                  [&](ArrayRef<Value *> Op) {
                    if (allConstant(Op) ||
                        (!isSplat(Op) && allSameBlock(Op) && allSameType(Op) &&
-                        getSameOpcode(Op, *TLI).getMainOp()))
+                        getSameOpcode(Op, *TLI)))
                      return false;
                    DenseMap<Value *, unsigned> Uniques;
                    for (Value *V : Op) {
@@ -8060,15 +8110,14 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
   // Don't go into catchswitch blocks, which can happen with PHIs.
   // Such blocks can only have PHIs and the catchswitch.  There is no
   // place to insert a shuffle if we need to, so just avoid that issue.
-  if (S.getMainOp() &&
-      isa<CatchSwitchInst>(S.getMainOp()->getParent()->getTerminator())) {
+  if (S && isa<CatchSwitchInst>(S.getMainOp()->getParent()->getTerminator())) {
     LLVM_DEBUG(dbgs() << "SLP: bundle in catchswitch block.\n");
     newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
     return;
   }
 
   // Check if this is a duplicate of another entry.
-  if (S.getOpcode()) {
+  if (S) {
     if (TreeEntry *E = getTreeEntry(S.getMainOp())) {
       LLVM_DEBUG(dbgs() << "SLP: \tChecking bundle: " << *S.getMainOp()
                         << ".\n");
@@ -8129,13 +8178,12 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
   // a load), in which case peek through to include it in the tree, without
   // ballooning over-budget.
   if (Depth >= RecursionMaxDepth &&
-      !(S.getMainOp() && !S.isAltShuffle() && VL.size() >= 4 &&
+      !(S && !S.isAltShuffle() && VL.size() >= 4 &&
         (match(S.getMainOp(), m_Load(m_Value())) ||
          all_of(VL, [&S](const Value *I) {
            return match(I,
                         m_OneUse(m_ZExtOrSExt(m_OneUse(m_Load(m_Value()))))) &&
-                  cast<Instruction>(I)->getOpcode() ==
-                      S.getMainOp()->getOpcode();
+                  cast<Instruction>(I)->getOpcode() == S.getOpcode();
          })))) {
     LLVM_DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n");
     if (TryToFindDuplicates(S))
@@ -8145,7 +8193,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
   }
 
   // Don't handle scalable vectors
-  if (S.getOpcode() == Instruction::ExtractElement &&
+  if (S && S.getOpcode() == Instruction::ExtractElement &&
       isa<ScalableVectorType>(
           cast<ExtractElementInst>(S.getMainOp())->getVectorOperandType())) {
     LLVM_DEBUG(dbgs() << "SLP: Gathering due to scalable vector type.\n");
@@ -8169,7 +8217,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
   // vectorize.
   auto &&NotProfitableForVectorization = [&S, this,
                                           Depth](ArrayRef<Value *> VL) {
-    if (!S.getOpcode() || !S.isAltShuffle() || VL.size() > 2)
+    if (!S || !S.isAltShuffle() || VL.size() > 2)
       return false;
     if (VectorizableTree.size() < MinTreeSize)
       return false;
@@ -8224,7 +8272,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
   bool IsScatterVectorizeUserTE =
       UserTreeIdx.UserTE &&
       UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize;
-  bool AreAllSameBlock = S.getOpcode() && allSameBlock(VL);
+  bool AreAllSameBlock = S && allSameBlock(VL);
   bool AreScatterAllGEPSameBlock =
       (IsScatterVectorizeUserTE && VL.front()->getType()->isPointerTy() &&
        VL.size() > 2 &&
@@ -8241,8 +8289,9 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
        sortPtrAccesses(VL, UserTreeIdx.UserTE->getMainOp()->getType(), *DL, *SE,
                        SortedIndices));
   bool AreAllSameInsts = AreAllSameBlock || AreScatterAllGEPSameBlock;
-  if (!AreAllSameInsts || (!S.getOpcode() && allConstant(VL)) || isSplat(VL) ||
-      (isa_and_present<InsertElementInst, ExtractValueInst, ExtractElementInst>(
+  if (!AreAllSameInsts || (!S && allConstant(VL)) || isSplat(VL) ||
+      (S &&
+       isa<InsertElementInst, ExtractValueInst, ExtractElementInst>(
            S.getMainOp()) &&
        !all_of(VL, isVectorLikeInstWithConstOps)) ||
       NotProfitableForVectorization(VL)) {
@@ -8254,7 +8303,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
   }
 
   // Don't vectorize ephemeral values.
-  if (S.getOpcode() && !EphValues.empty()) {
+  if (S && !EphValues.empty()) {
     for (Value *V : VL) {
       if (EphValues.count(V)) {
         LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *V
@@ -8313,7 +8362,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
   Instruction *VL0 = S.getMainOp();
   BB = VL0->getParent();
 
-  if (S.getMainOp() &&
+  if (S &&
       (BB->isEHPad() || isa_and_nonnull<UnreachableInst>(BB->getTerminator()) ||
        !DT->isReachableFromEntry(BB))) {
     // Don't go into unreachable blocks. They may contain instructions with
@@ -8367,8 +8416,8 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
   }
   LLVM_DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n");
 
-  unsigned ShuffleOrOp = S.isAltShuffle() ?
-                (unsigned) Instruction::ShuffleVector : S.getOpcode();
+  unsigned ShuffleOrOp =
+      S.isAltShuffle() ? (unsigned)Instruction::ShuffleVector : S.getOpcode();
   auto CreateOperandNodes = [&](TreeEntry *TE, const auto &Operands) {
     // Postpone PHI nodes creation
     SmallVector<unsigned> PHIOps;
@@ -8377,7 +8426,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
       if (Op.empty())
         continue;
       InstructionsState S = getSameOpcode(Op, *TLI);
-      if (S.getOpcode() != Instruction::PHI || S.isAltShuffle())
+      if ((!S || S.getOpcode() != Instruction::PHI) || S.isAltShuffle())
         buildTree_rec(Op, Depth + 1, {TE, I});
       else
         PHIOps.push_back(I);
@@ -9760,7 +9809,7 @@ void BoUpSLP::transformNodes() {
             if (IsSplat)
               continue;
             InstructionsState S = getSameOpcode(Slice, *TLI);
-            if (!S.getOpcode() || S.isAltShuffle() || !allSameBlock(Slice) ||
+            if (!S || S.isAltShuffle() || !allSameBlock(Slice) ||
                 (S.getOpcode() == Instruction::Load &&
                  areKnownNonVectorizableLoads(Slice)) ||
                 (S.getOpcode() != Instruction::Load && !has_single_bit(VF)))
@@ -10765,23 +10814,21 @@ public:
     }
     if (ForExtracts) {
       // No need to add vectors here, already handled them in adjustExtracts.
-      assert(
-          InVectors.size() == 1 && isa<const TreeEntry *>(InVectors.front()) &&
-          !CommonMask.empty() &&
-          all_of(enumerate(CommonMask),
-                 [&](auto P) {
-                   Value *Scalar =
-                       InVectors.front().get<const TreeEntry *>()->getOrdered(
-                           P.index());
-                   if (P.value() == PoisonMaskElem)
-                     return P.value() == Mask[P.index()] ||
-                            isa<UndefValue>(Scalar);
-                   if (isa<Constant>(V1))
-                     return true;
-                   auto *EI = cast<ExtractElementInst>(Scalar);
-                   return EI->getVectorOperand() == V1;
-                 }) &&
-          "Expected only tree entry for extractelement vectors.");
+      assert(InVectors.size() == 1 && isa<const TreeEntry *>(InVectors[0]) &&
+             !CommonMask.empty() &&
+             all_of(enumerate(CommonMask),
+                    [&](auto P) {
+                      Value *Scalar = cast<const TreeEntry *>(InVectors[0])
+                                          ->getOrdered(P.index());
+                      if (P.value() == PoisonMaskElem)
+                        return P.value() == Mask[P.index()] ||
+                               isa<UndefValue>(Scalar);
+                      if (isa<Constant>(V1))
+                        return true;
+                      auto *EI = cast<ExtractElementInst>(Scalar);
+                      return EI->getVectorOperand() == V1;
+                    }) &&
+             "Expected only tree entry for extractelement vectors.");
       return;
     }
     assert(!InVectors.empty() && !CommonMask.empty() &&
@@ -11077,7 +11124,8 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
     if (const TreeEntry *OpTE = getTreeEntry(V))
       return getCastContextHint(*OpTE);
     InstructionsState SrcState = getSameOpcode(E->getOperand(0), *TLI);
-    if (SrcState.getOpcode() == Instruction::Load && !SrcState.isAltShuffle())
+    if (SrcState && SrcState.getOpcode() == Instruction::Load &&
+        !SrcState.isAltShuffle())
       return TTI::CastContextHint::GatherScatter;
     return TTI::CastContextHint::None;
   };
@@ -11872,7 +11920,7 @@ bool BoUpSLP::isFullyVectorizableTinyTree(bool ForReduction) const {
             TE->Scalars.size() < Limit ||
             ((TE->getOpcode() == Instruction::ExtractElement ||
               all_of(TE->Scalars, IsaPred<ExtractElementInst, UndefValue>)) &&
-             isFixedVectorShuffle(TE->Scalars, Mask)) ||
+             isFixedVectorShuffle(TE->Scalars, Mask, AC)) ||
             (TE->getOpcode() == Instruction::Load && !TE->isAltShuffle()) ||
             any_of(TE->Scalars, IsaPred<LoadInst>));
   };
@@ -12937,7 +12985,7 @@ BoUpSLP::tryToGatherSingleRegisterExtractElements(
   // Check that gather of extractelements can be represented as just a
   // shuffle of a single/two vectors the scalars are extracted from.
   std::optional<TTI::ShuffleKind> Res =
-      isFixedVectorShuffle(GatheredExtracts, Mask);
+      isFixedVectorShuffle(GatheredExtracts, Mask, AC);
   if (!Res || all_of(Mask, [](int Idx) { return Idx == PoisonMaskElem; })) {
     // TODO: try to check other subsets if possible.
     // Restore the original VL if attempt was not successful.
@@ -13195,6 +13243,7 @@ BoUpSLP::isGatherShuffledSingleRegisterEntry(
     // No perfect match, just shuffle, so choose the first tree node from the
     // tree.
     Entries.push_back(FirstEntries.front());
+    VF = FirstEntries.front()->getVectorFactor();
   } else {
     // Try to find nodes with the same vector factor.
     assert(UsedTEs.size() == 2 && "Expected at max 2 permuted entries.");
@@ -13235,6 +13284,8 @@ BoUpSLP::isGatherShuffledSingleRegisterEntry(
       Entries.push_back(SecondEntries.front());
       VF = std::max(Entries.front()->getVectorFactor(),
                     Entries.back()->getVectorFactor());
+    } else {
+      VF = Entries.front()->getVectorFactor();
     }
   }
 
@@ -13253,7 +13304,7 @@ BoUpSLP::isGatherShuffledSingleRegisterEntry(
       Value *In1 = PHI1->getIncomingValue(I);
       if (isConstant(In) && isConstant(In1))
         continue;
-      if (!getSameOpcode({In, In1}, *TLI).getOpcode())
+      if (!getSameOpcode({In, In1}, *TLI))
         return false;
       if (cast<Instruction>(In)->getParent() !=
           cast<Instruction>(In1)->getParent())
@@ -13281,7 +13332,7 @@ BoUpSLP::isGatherShuffledSingleRegisterEntry(
     if (It != UsedValuesEntry.end())
       UsedInSameVTE = It->second == UsedValuesEntry.find(V)->second;
     return V != V1 && MightBeIgnored(V1) && !UsedInSameVTE &&
-           getSameOpcode({V, V1}, *TLI).getOpcode() &&
+           getSameOpcode({V, V1}, *TLI) &&
            cast<Instruction>(V)->getParent() ==
                cast<Instruction>(V1)->getParent() &&
            (!isa<PHINode>(V1) || AreCompatiblePHIs(V, V1));
@@ -13346,17 +13397,141 @@ BoUpSLP::isGatherShuffledSingleRegisterEntry(
                   : Entries[Pair.first]->findLaneForValue(VL[Pair.second]));
     IsIdentity &= Mask[Idx] == Pair.second;
   }
-  switch (Entries.size()) {
-  case 1:
-    if (IsIdentity || EntryLanes.size() > 1 || VL.size() <= 2)
-      return TargetTransformInfo::SK_PermuteSingleSrc;
-    break;
-  case 2:
-    if (EntryLanes.size() > 2 || VL.size() <= 2)
-      return TargetTransformInfo::SK_PermuteTwoSrc;
-    break;
-  default:
-    break;
+  if (ForOrder || IsIdentity || Entries.empty()) {
+    switch (Entries.size()) {
+    case 1:
+      if (IsIdentity || EntryLanes.size() > 1 || VL.size() <= 2)
+        return TargetTransformInfo::SK_PermuteSingleSrc;
+      break;
+    case 2:
+      if (EntryLanes.size() > 2 || VL.size() <= 2)
+        return TargetTransformInfo::SK_PermuteTwoSrc;
+      break;
+    default:
+      break;
+    }
+  } else if (!isa<VectorType>(VL.front()->getType()) &&
+             (EntryLanes.size() > Entries.size() || VL.size() <= 2)) {
+    // Do the cost estimation if shuffle beneficial than buildvector.
+    SmallVector<int> SubMask(std::next(Mask.begin(), Part * VL.size()),
+                             std::next(Mask.begin(), (Part + 1) * VL.size()));
+    int MinElement = SubMask.front(), MaxElement = SubMask.front();
+    for (int Idx : SubMask) {
+      if (Idx == PoisonMaskElem)
+        continue;
+      if (MinElement == PoisonMaskElem || MinElement % VF > Idx % VF)
+        MinElement = Idx;
+      if (MaxElement == PoisonMaskElem || MaxElement % VF < Idx % VF)
+        MaxElement = Idx;
+    }
+    assert(MaxElement >= 0 && MinElement >= 0 &&
+           MaxElement % VF >= MinElement % VF &&
+           "Expected at least single element.");
+    unsigned NewVF = std::max<unsigned>(
+        VL.size(), getFullVectorNumberOfElements(*TTI, VL.front()->getType(),
+                                                 (MaxElement % VF) -
+                                                     (MinElement % VF) + 1));
+    if (NewVF < VF) {
+      for_each(SubMask, [&](int &Idx) {
+        if (Idx == PoisonMaskElem)
+          return;
+        Idx = (Idx % VF) - (MinElement % VF) +
+              (Idx >= static_cast<int>(VF) ? NewVF : 0);
+      });
+      VF = NewVF;
+    }
+
+    constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
+    auto *VecTy = getWidenedType(VL.front()->getType(), VF);
+    auto *MaskVecTy = getWidenedType(VL.front()->getType(), SubMask.size());
+    auto GetShuffleCost = [&,
+                           &TTI = *TTI](ArrayRef<int> Mask,
+                                        ArrayRef<const TreeEntry *> Entries,
+                                        VectorType *VecTy) -> InstructionCost {
+      if (Entries.size() == 1 && Entries.front()->getInterleaveFactor() > 0 &&
+          ShuffleVectorInst::isDeInterleaveMaskOfFactor(
+              Mask, Entries.front()->getInterleaveFactor()))
+        return TTI::TCC_Free;
+      return ::getShuffleCost(TTI,
+                              Entries.size() > 1 ? TTI::SK_PermuteTwoSrc
+                                                 : TTI::SK_PermuteSingleSrc,
+                              VecTy, Mask, CostKind);
+    };
+    InstructionCost ShuffleCost = GetShuffleCost(SubMask, Entries, VecTy);
+    InstructionCost FirstShuffleCost = 0;
+    SmallVector<int> FirstMask(SubMask.begin(), SubMask.end());
+    if (Entries.size() == 1 || !Entries[0]->isGather()) {
+      FirstShuffleCost = ShuffleCost;
+    } else {
+      // Transform mask to include only first entry.
+      APInt DemandedElts = APInt::getAllOnes(SubMask.size());
+      bool IsIdentity = true;
+      for (auto [I, Idx] : enumerate(FirstMask)) {
+        if (Idx >= static_cast<int>(VF)) {
+          Idx = PoisonMaskElem;
+        } else {
+          DemandedElts.clearBit(I);
+          if (Idx != PoisonMaskElem)
+            IsIdentity &= static_cast<int>(I) == Idx;
+        }
+      }
+      if (!IsIdentity)
+        FirstShuffleCost = GetShuffleCost(FirstMask, Entries.front(), VecTy);
+      FirstShuffleCost += TTI->getScalarizationOverhead(
+          MaskVecTy, DemandedElts, /*Insert=*/true,
+          /*Extract=*/false, CostKind);
+    }
+    InstructionCost SecondShuffleCost = 0;
+    SmallVector<int> SecondMask(SubMask.begin(), SubMask.end());
+    if (Entries.size() == 1 || !Entries[1]->isGather()) {
+      SecondShuffleCost = ShuffleCost;
+    } else {
+      // Transform mask to include only first entry.
+      APInt DemandedElts = APInt::getAllOnes(SubMask.size());
+      bool IsIdentity = true;
+      for (auto [I, Idx] : enumerate(SecondMask)) {
+        if (Idx < static_cast<int>(VF) && Idx >= 0) {
+          Idx = PoisonMaskElem;
+        } else {
+          DemandedElts.clearBit(I);
+          if (Idx != PoisonMaskElem) {
+            Idx -= VF;
+            IsIdentity &= static_cast<int>(I) == Idx;
+          }
+        }
+      }
+      if (!IsIdentity)
+        SecondShuffleCost = GetShuffleCost(SecondMask, Entries[1], VecTy);
+      SecondShuffleCost += TTI->getScalarizationOverhead(
+          MaskVecTy, DemandedElts, /*Insert=*/true,
+          /*Extract=*/false, CostKind);
+    }
+    APInt DemandedElts = APInt::getAllOnes(SubMask.size());
+    for (auto [I, Idx] : enumerate(SubMask))
+      if (Idx == PoisonMaskElem)
+        DemandedElts.clearBit(I);
+    InstructionCost BuildVectorCost =
+        TTI->getScalarizationOverhead(MaskVecTy, DemandedElts, /*Insert=*/true,
+                                      /*Extract=*/false, CostKind);
+    const TreeEntry *BestEntry = nullptr;
+    if (FirstShuffleCost < ShuffleCost) {
+      copy(FirstMask, std::next(Mask.begin(), Part * VL.size()));
+      BestEntry = Entries.front();
+      ShuffleCost = FirstShuffleCost;
+    }
+    if (SecondShuffleCost < ShuffleCost) {
+      copy(SecondMask, std::next(Mask.begin(), Part * VL.size()));
+      BestEntry = Entries[1];
+      ShuffleCost = SecondShuffleCost;
+    }
+    if (BuildVectorCost >= ShuffleCost) {
+      if (BestEntry) {
+        Entries.clear();
+        Entries.push_back(BestEntry);
+      }
+      return Entries.size() > 1 ? TargetTransformInfo::SK_PermuteTwoSrc
+                                : TargetTransformInfo::SK_PermuteSingleSrc;
+    }
   }
   Entries.clear();
   // Clear the corresponding mask elements.
@@ -13740,9 +13915,8 @@ Value *BoUpSLP::gather(
     Instruction *InsElt;
     if (auto *VecTy = dyn_cast<FixedVectorType>(Scalar->getType())) {
       assert(SLPReVec && "FixedVectorType is not expected.");
-      Vec = InsElt = Builder.CreateInsertVector(
-          Vec->getType(), Vec, Scalar,
-          Builder.getInt64(Pos * VecTy->getNumElements()));
+      Vec = InsElt = cast<Instruction>(createInsertVector(
+          Builder, Vec, Scalar, Pos * getNumElements(VecTy)));
       auto *II = dyn_cast<IntrinsicInst>(InsElt);
       if (!II || II->getIntrinsicID() != Intrinsic::vector_insert)
         return Vec;
@@ -14342,23 +14516,10 @@ public:
                                          V, SimplifyQuery(*R.DL));
                                    }));
           unsigned InsertionIndex = Idx * ScalarTyNumElements;
-          const unsigned SubVecVF =
-              cast<FixedVectorType>(V->getType())->getNumElements();
-          if (InsertionIndex % SubVecVF == 0) {
-            Vec = Builder.CreateInsertVector(Vec->getType(), Vec, V,
-                                             Builder.getInt64(InsertionIndex));
-          } else {
-            // Create shuffle, insertvector requires that index is multiple of
-            // the subvectors length.
-            const unsigned VecVF =
-                cast<FixedVectorType>(Vec->getType())->getNumElements();
-            SmallVector<int> Mask(VecVF, PoisonMaskElem);
-            std::iota(Mask.begin(), Mask.end(), 0);
-            for (unsigned I : seq<unsigned>(
-                     InsertionIndex, (Idx + SubVecVF) * ScalarTyNumElements))
-              Mask[I] = I - Idx + VecVF;
-            Vec = createShuffle(Vec, V, Mask);
-          }
+          Vec = createInsertVector(
+              Builder, Vec, V, InsertionIndex,
+              std::bind(&ShuffleInstructionBuilder::createShuffle, this, _1, _2,
+                        _3));
           if (!CommonMask.empty()) {
             std::iota(
                 std::next(CommonMask.begin(), InsertionIndex),
@@ -14424,12 +14585,12 @@ BoUpSLP::TreeEntry *BoUpSLP::getMatchedVectorizedOperand(const TreeEntry *E,
   ArrayRef<Value *> VL = E->getOperand(NodeIdx);
   InstructionsState S = getSameOpcode(VL, *TLI);
   // Special processing for GEPs bundle, which may include non-gep values.
-  if (!S.getOpcode() && VL.front()->getType()->isPointerTy()) {
+  if (!S && VL.front()->getType()->isPointerTy()) {
     const auto *It = find_if(VL, IsaPred<GetElementPtrInst>);
     if (It != VL.end())
       S = getSameOpcode(*It, *TLI);
   }
-  if (!S.getOpcode())
+  if (!S)
     return nullptr;
   auto CheckSameVE = [&](const TreeEntry *VE) {
     return VE->isSame(VL) &&
@@ -14825,7 +14986,7 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy,
       // non-poisonous, or by freezing the incoming scalar value first.
       auto *It = find_if(Scalars, [this, E](Value *V) {
         return !isa<UndefValue>(V) &&
-               (getTreeEntry(V) || isGuaranteedNotToBePoison(V) ||
+               (getTreeEntry(V) || isGuaranteedNotToBePoison(V, AC) ||
                 (E->UserTreeIndices.size() == 1 &&
                  any_of(V->uses(), [E](const Use &U) {
                    // Check if the value already used in the same operation in
@@ -14897,11 +15058,11 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy,
       }
       if (Vec2) {
         IsUsedInExpr = false;
-        IsNonPoisoned &=
-            isGuaranteedNotToBePoison(Vec1) && isGuaranteedNotToBePoison(Vec2);
+        IsNonPoisoned &= isGuaranteedNotToBePoison(Vec1, AC) &&
+                         isGuaranteedNotToBePoison(Vec2, AC);
         ShuffleBuilder.add(Vec1, Vec2, ExtractMask);
       } else if (Vec1) {
-        bool IsNotPoisonedVec = isGuaranteedNotToBePoison(Vec1);
+        bool IsNotPoisonedVec = isGuaranteedNotToBePoison(Vec1, AC);
         IsUsedInExpr &= FindReusedSplat(
             ExtractMask,
             cast<FixedVectorType>(Vec1->getType())->getNumElements(), 0,
@@ -14932,7 +15093,7 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy,
         if (TEs.size() == 1) {
           bool IsNotPoisonedVec =
               TEs.front()->VectorizedValue
-                  ? isGuaranteedNotToBePoison(TEs.front()->VectorizedValue)
+                  ? isGuaranteedNotToBePoison(TEs.front()->VectorizedValue, AC)
                   : true;
           IsUsedInExpr &=
               FindReusedSplat(VecMask, TEs.front()->getVectorFactor(), I,
@@ -14944,8 +15105,8 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy,
           ShuffleBuilder.add(*TEs.front(), *TEs.back(), VecMask);
           if (TEs.front()->VectorizedValue && TEs.back()->VectorizedValue)
             IsNonPoisoned &=
-                isGuaranteedNotToBePoison(TEs.front()->VectorizedValue) &&
-                isGuaranteedNotToBePoison(TEs.back()->VectorizedValue);
+                isGuaranteedNotToBePoison(TEs.front()->VectorizedValue, AC) &&
+                isGuaranteedNotToBePoison(TEs.back()->VectorizedValue, AC);
         }
       }
     }
@@ -15280,7 +15441,8 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) {
       }
       if (!IsIdentity || NumElts != NumScalars) {
         Value *V2 = nullptr;
-        bool IsVNonPoisonous = isGuaranteedNotToBePoison(V) && !isConstant(V);
+        bool IsVNonPoisonous =
+            !isConstant(V) && isGuaranteedNotToBePoison(V, AC);
         SmallVector<int> InsertMask(Mask);
         if (NumElts != NumScalars && Offset == 0) {
           // Follow all insert element instructions from the current buildvector
@@ -17603,7 +17765,6 @@ bool BoUpSLP::collectValuesToDemote(
     BitWidth = std::max(BitWidth, BitWidth1);
     return BitWidth > 0 && OrigBitWidth >= (BitWidth * 2);
   };
-  using namespace std::placeholders;
   auto FinalAnalysis = [&]() {
     if (!IsProfitableToDemote)
       return false;
@@ -18409,8 +18570,7 @@ SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R,
         hasFullVectorsOrPowerOf2(*TTI, ValOps.front()->getType(),
                                  ValOps.size()) ||
         (VectorizeNonPowerOf2 && has_single_bit(ValOps.size() + 1));
-    if ((!IsAllowedSize && S.getOpcode() &&
-         S.getOpcode() != Instruction::Load &&
+    if ((!IsAllowedSize && S && S.getOpcode() != Instruction::Load &&
          (!S.getMainOp()->isSafeToRemove() ||
           any_of(ValOps.getArrayRef(),
                  [&](Value *V) {
@@ -18420,8 +18580,8 @@ SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R,
                              return !Stores.contains(U);
                            }));
                  }))) ||
-        (ValOps.size() > Chain.size() / 2 && !S.getOpcode())) {
-      Size = (!IsAllowedSize && S.getOpcode()) ? 1 : 2;
+        (ValOps.size() > Chain.size() / 2 && !S)) {
+      Size = (!IsAllowedSize && S) ? 1 : 2;
       return false;
     }
   }
@@ -18444,7 +18604,7 @@ SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R,
   R.computeMinimumValueSizes();
 
   Size = R.getCanonicalGraphSize();
-  if (S.getOpcode() == Instruction::Load)
+  if (S && S.getOpcode() == Instruction::Load)
     Size = 2; // cut off masked gather small trees
   InstructionCost Cost = R.getTreeCost();
 
@@ -18945,7 +19105,7 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
   // Check that all of the parts are instructions of the same type,
   // we permit an alternate opcode via InstructionsState.
   InstructionsState S = getSameOpcode(VL, *TLI);
-  if (!S.getOpcode())
+  if (!S)
     return false;
 
   Instruction *I0 = S.getMainOp();
@@ -19635,7 +19795,7 @@ public:
 
   /// Attempt to vectorize the tree found by matchAssociativeReduction.
   Value *tryToReduce(BoUpSLP &V, const DataLayout &DL, TargetTransformInfo *TTI,
-                     const TargetLibraryInfo &TLI) {
+                     const TargetLibraryInfo &TLI, AssumptionCache *AC) {
     const unsigned ReductionLimit = VectorizeNonPowerOf2 ? 3 : 4;
     constexpr unsigned RegMaxNumber = 4;
     constexpr unsigned RedValsMaxNumber = 128;
@@ -19679,20 +19839,35 @@ public:
       return cast<Instruction>(ScalarCond);
     };
 
+    bool AnyBoolLogicOp = any_of(ReductionOps.back(), [](Value *V) {
+      return isBoolLogicOp(cast<Instruction>(V));
+    });
     // Return new VectorizedTree, based on previous value.
     auto GetNewVectorizedTree = [&](Value *VectorizedTree, Value *Res) {
       if (VectorizedTree) {
         // Update the final value in the reduction.
         Builder.SetCurrentDebugLocation(
             cast<Instruction>(ReductionOps.front().front())->getDebugLoc());
-        if ((isa<PoisonValue>(VectorizedTree) && !isa<PoisonValue>(Res)) ||
-            (isGuaranteedNotToBePoison(Res) &&
-             !isGuaranteedNotToBePoison(VectorizedTree))) {
-          auto It = ReducedValsToOps.find(Res);
-          if (It != ReducedValsToOps.end() &&
-              any_of(It->getSecond(),
-                     [](Instruction *I) { return isBoolLogicOp(I); }))
+        if (AnyBoolLogicOp) {
+          auto It = ReducedValsToOps.find(VectorizedTree);
+          auto It1 = ReducedValsToOps.find(Res);
+          if ((It == ReducedValsToOps.end() && It1 == ReducedValsToOps.end()) ||
+              isGuaranteedNotToBePoison(VectorizedTree, AC) ||
+              (It != ReducedValsToOps.end() &&
+               any_of(It->getSecond(), [&](Instruction *I) {
+                 return isBoolLogicOp(I) &&
+                        getRdxOperand(I, 0) == VectorizedTree;
+               }))) {
+            ;
+          } else if (isGuaranteedNotToBePoison(Res, AC) ||
+                     (It1 != ReducedValsToOps.end() &&
+                     any_of(It1->getSecond(), [&](Instruction *I) {
+                       return isBoolLogicOp(I) && getRdxOperand(I, 0) == Res;
+                     }))) {
             std::swap(VectorizedTree, Res);
+          } else {
+            VectorizedTree = Builder.CreateFreeze(VectorizedTree);
+          }
         }
 
         return createOp(Builder, RdxKind, VectorizedTree, Res, "op.rdx",
@@ -19701,9 +19876,6 @@ public:
       // Initialize the final value in the reduction.
       return Res;
     };
-    bool AnyBoolLogicOp = any_of(ReductionOps.back(), [](Value *V) {
-      return isBoolLogicOp(cast<Instruction>(V));
-    });
     SmallDenseSet<Value *> IgnoreList(ReductionOps.size() *
                                       ReductionOps.front().size());
     for (ReductionOpsType &RdxOps : ReductionOps)
@@ -19757,16 +19929,16 @@ public:
         // Also check if the instruction was folded to constant/other value.
         auto *Inst = dyn_cast<Instruction>(RdxVal);
         if ((Inst && isVectorLikeInstWithConstOps(Inst) &&
-             (!S.getOpcode() || !S.isOpcodeOrAlt(Inst))) ||
-            (S.getOpcode() && !Inst))
+             (!S || !S.isOpcodeOrAlt(Inst))) ||
+            (S && !Inst))
           continue;
         Candidates.push_back(RdxVal);
         TrackedToOrig.try_emplace(RdxVal, OrigReducedVals[Cnt]);
       }
       bool ShuffledExtracts = false;
       // Try to handle shuffled extractelements.
-      if (S.getOpcode() == Instruction::ExtractElement && !S.isAltShuffle() &&
-          I + 1 < E) {
+      if (S && S.getOpcode() == Instruction::ExtractElement &&
+          !S.isAltShuffle() && I + 1 < E) {
         SmallVector<Value *> CommonCandidates(Candidates);
         for (Value *RV : ReducedVals[I + 1]) {
           Value *RdxVal = TrackedVals.at(RV);
@@ -19780,7 +19952,7 @@ public:
           TrackedToOrig.try_emplace(RdxVal, RV);
         }
         SmallVector<int> Mask;
-        if (isFixedVectorShuffle(CommonCandidates, Mask)) {
+        if (isFixedVectorShuffle(CommonCandidates, Mask, AC)) {
           ++I;
           Candidates.swap(CommonCandidates);
           ShuffledExtracts = true;
@@ -20095,7 +20267,7 @@ public:
         // To prevent poison from leaking across what used to be sequential,
         // safe, scalar boolean logic operations, the reduction operand must be
         // frozen.
-        if (AnyBoolLogicOp && !isGuaranteedNotToBePoison(VectorizedRoot))
+        if (AnyBoolLogicOp && !isGuaranteedNotToBePoison(VectorizedRoot, AC))
           VectorizedRoot = Builder.CreateFreeze(VectorizedRoot);
 
         // Emit code to correctly handle reused reduced values, if required.
@@ -20202,13 +20374,13 @@ public:
                                                    bool InitStep) {
         if (!AnyBoolLogicOp)
           return;
-        if (isBoolLogicOp(RedOp1) &&
-            ((!InitStep && LHS == VectorizedTree) ||
-             getRdxOperand(RedOp1, 0) == LHS || isGuaranteedNotToBePoison(LHS)))
+        if (isBoolLogicOp(RedOp1) && ((!InitStep && LHS == VectorizedTree) ||
+                                      getRdxOperand(RedOp1, 0) == LHS ||
+                                      isGuaranteedNotToBePoison(LHS, AC)))
           return;
         if (isBoolLogicOp(RedOp2) && ((!InitStep && RHS == VectorizedTree) ||
                                       getRdxOperand(RedOp2, 0) == RHS ||
-                                      isGuaranteedNotToBePoison(RHS))) {
+                                      isGuaranteedNotToBePoison(RHS, AC))) {
           std::swap(LHS, RHS);
           return;
         }
@@ -20856,7 +21028,7 @@ bool SLPVectorizerPass::vectorizeHorReduction(
     HorizontalReduction HorRdx;
     if (!HorRdx.matchAssociativeReduction(R, Inst, *SE, *DL, *TLI))
       return nullptr;
-    return HorRdx.tryToReduce(R, *DL, TTI, *TLI);
+    return HorRdx.tryToReduce(R, *DL, TTI, *TLI, AC);
   };
   auto TryAppendToPostponedInsts = [&](Instruction *FutureSeed) {
     if (TryOperandsAsNewSeeds && FutureSeed == Root) {
@@ -20962,8 +21134,8 @@ bool SLPVectorizerPass::vectorizeInsertElementInst(InsertElementInst *IEI,
   SmallVector<Value *, 16> BuildVectorOpds;
   SmallVector<int> Mask;
   if (!findBuildAggregate(IEI, TTI, BuildVectorOpds, BuildVectorInsts, R) ||
-      (llvm::all_of(BuildVectorOpds, IsaPred<ExtractElementInst, UndefValue>) &&
-       isFixedVectorShuffle(BuildVectorOpds, Mask)))
+      (all_of(BuildVectorOpds, IsaPred<ExtractElementInst, UndefValue>) &&
+       isFixedVectorShuffle(BuildVectorOpds, Mask, AC)))
     return false;
 
   if (MaxVFOnly && BuildVectorInsts.size() == 2) {
@@ -21161,7 +21333,7 @@ static bool compareCmp(Value *V, Value *V2, TargetLibraryInfo &TLI,
             return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
         }
         InstructionsState S = getSameOpcode({I1, I2}, TLI);
-        if (S.getOpcode() && (IsCompatibility || !S.isAltShuffle()))
+        if (S && (IsCompatibility || !S.isAltShuffle()))
           continue;
         if (IsCompatibility)
           return false;
@@ -21319,7 +21491,7 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
           if (NodeI1 != NodeI2)
             return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
           InstructionsState S = getSameOpcode({I1, I2}, *TLI);
-          if (S.getOpcode() && !S.isAltShuffle())
+          if (S && !S.isAltShuffle())
             continue;
           return I1->getOpcode() < I2->getOpcode();
         }
@@ -21382,8 +21554,7 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
             return false;
           if (I1->getParent() != I2->getParent())
             return false;
-          InstructionsState S = getSameOpcode({I1, I2}, *TLI);
-          if (S.getOpcode())
+          if (getSameOpcode({I1, I2}, *TLI))
             continue;
           return false;
         }
@@ -21721,9 +21892,6 @@ bool SLPVectorizerPass::vectorizeStoreChains(BoUpSLP &R) {
         V2->getValueOperand()->getType()->getScalarSizeInBits())
       return false;
     // UndefValues are compatible with all other values.
-    if (isa<UndefValue>(V->getValueOperand()) ||
-        isa<UndefValue>(V2->getValueOperand()))
-      return false;
     if (auto *I1 = dyn_cast<Instruction>(V->getValueOperand()))
       if (auto *I2 = dyn_cast<Instruction>(V2->getValueOperand())) {
         DomTreeNodeBase<llvm::BasicBlock> *NodeI1 =
@@ -21737,14 +21905,8 @@ bool SLPVectorizerPass::vectorizeStoreChains(BoUpSLP &R) {
                "Different nodes should have different DFS numbers");
         if (NodeI1 != NodeI2)
           return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
-        InstructionsState S = getSameOpcode({I1, I2}, *TLI);
-        if (S.getOpcode())
-          return false;
         return I1->getOpcode() < I2->getOpcode();
       }
-    if (isa<Constant>(V->getValueOperand()) &&
-        isa<Constant>(V2->getValueOperand()))
-      return false;
     return V->getValueOperand()->getValueID() <
            V2->getValueOperand()->getValueID();
   };
@@ -21764,8 +21926,7 @@ bool SLPVectorizerPass::vectorizeStoreChains(BoUpSLP &R) {
       if (auto *I2 = dyn_cast<Instruction>(V2->getValueOperand())) {
         if (I1->getParent() != I2->getParent())
           return false;
-        InstructionsState S = getSameOpcode({I1, I2}, *TLI);
-        return S.getOpcode() > 0;
+        return getSameOpcode({I1, I2}, *TLI).valid();
       }
     if (isa<Constant>(V1->getValueOperand()) &&
         isa<Constant>(V2->getValueOperand()))
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp
index 71f43ab..e804f81 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -205,11 +205,6 @@ VPBlockBase *VPBlockBase::getEnclosingBlockWithPredecessors() {
   return Parent->getEnclosingBlockWithPredecessors();
 }
 
-void VPBlockBase::deleteCFG(VPBlockBase *Entry) {
-  for (VPBlockBase *Block : to_vector(vp_depth_first_shallow(Entry)))
-    delete Block;
-}
-
 VPBasicBlock::iterator VPBasicBlock::getFirstNonPhi() {
   iterator It = begin();
   while (It != end() && It->isPhi())
@@ -221,9 +216,10 @@ VPTransformState::VPTransformState(const TargetTransformInfo *TTI,
                                    ElementCount VF, unsigned UF, LoopInfo *LI,
                                    DominatorTree *DT, IRBuilderBase &Builder,
                                    InnerLoopVectorizer *ILV, VPlan *Plan,
-                                   Type *CanonicalIVTy)
+                                   Loop *CurrentParentLoop, Type *CanonicalIVTy)
     : TTI(TTI), VF(VF), CFG(DT), LI(LI), Builder(Builder), ILV(ILV), Plan(Plan),
-      LVer(nullptr), TypeAnalysis(CanonicalIVTy) {}
+      CurrentParentLoop(CurrentParentLoop), LVer(nullptr),
+      TypeAnalysis(CanonicalIVTy) {}
 
 Value *VPTransformState::get(VPValue *Def, const VPLane &Lane) {
   if (Def->isLiveIn())
@@ -474,6 +470,13 @@ void VPIRBasicBlock::execute(VPTransformState *State) {
   connectToPredecessors(State->CFG);
 }
 
+VPIRBasicBlock *VPIRBasicBlock::clone() {
+  auto *NewBlock = getPlan()->createEmptyVPIRBasicBlock(IRBB);
+  for (VPRecipeBase &R : Recipes)
+    NewBlock->appendRecipe(R.clone());
+  return NewBlock;
+}
+
 void VPBasicBlock::execute(VPTransformState *State) {
   bool Replica = bool(State->Lane);
   BasicBlock *NewBB = State->CFG.PrevBB; // Reuse it if possible.
@@ -484,11 +487,9 @@ void VPBasicBlock::execute(VPTransformState *State) {
   };
 
   // 1. Create an IR basic block.
-  if (this == getPlan()->getVectorPreheader() ||
-      (Replica && this == getParent()->getEntry()) ||
+  if ((Replica && this == getParent()->getEntry()) ||
       IsReplicateRegion(getSingleHierarchicalPredecessor())) {
     // Reuse the previous basic block if the current VPBB is either
-    //  * the vector preheader,
     //  * the entry to a replicate region, or
     //  * the exit of a replicate region.
     State->CFG.VPBB2IRBB[this] = NewBB;
@@ -500,8 +501,8 @@ void VPBasicBlock::execute(VPTransformState *State) {
     UnreachableInst *Terminator = State->Builder.CreateUnreachable();
     // Register NewBB in its loop. In innermost loops its the same for all
     // BB's.
-    if (State->CurrentVectorLoop)
-      State->CurrentVectorLoop->addBasicBlockToLoop(NewBB, *State->LI);
+    if (State->CurrentParentLoop)
+      State->CurrentParentLoop->addBasicBlockToLoop(NewBB, *State->LI);
     State->Builder.SetInsertPoint(Terminator);
 
     State->CFG.PrevBB = NewBB;
@@ -513,14 +514,11 @@ void VPBasicBlock::execute(VPTransformState *State) {
   executeRecipes(State, NewBB);
 }
 
-void VPBasicBlock::dropAllReferences(VPValue *NewValue) {
-  for (VPRecipeBase &R : Recipes) {
-    for (auto *Def : R.definedValues())
-      Def->replaceAllUsesWith(NewValue);
-
-    for (unsigned I = 0, E = R.getNumOperands(); I != E; I++)
-      R.setOperand(I, NewValue);
-  }
+VPBasicBlock *VPBasicBlock::clone() {
+  auto *NewBlock = getPlan()->createVPBasicBlock(getName());
+  for (VPRecipeBase &R : *this)
+    NewBlock->appendRecipe(R.clone());
+  return NewBlock;
 }
 
 void VPBasicBlock::executeRecipes(VPTransformState *State, BasicBlock *BB) {
@@ -541,7 +539,7 @@ VPBasicBlock *VPBasicBlock::splitAt(iterator SplitAt) {
 
   SmallVector<VPBlockBase *, 2> Succs(successors());
   // Create new empty block after the block to split.
-  auto *SplitBlock = new VPBasicBlock(getName() + ".split");
+  auto *SplitBlock = getPlan()->createVPBasicBlock(getName() + ".split");
   VPBlockUtils::insertBlockAfter(SplitBlock, this);
 
   // Finally, move the recipes starting at SplitAt to new block.
@@ -557,7 +555,9 @@ VPBasicBlock *VPBasicBlock::splitAt(iterator SplitAt) {
 template <typename T> static T *getEnclosingLoopRegionForRegion(T *P) {
   if (P && P->isReplicator()) {
     P = P->getParent();
-    assert(!cast<VPRegionBlock>(P)->isReplicator() &&
+    // Multiple loop regions can be nested, but replicate regions can only be
+    // nested inside a loop region or must be outside any other region.
+    assert((!P || !cast<VPRegionBlock>(P)->isReplicator()) &&
            "unexpected nested replicate regions");
   }
   return P;
@@ -701,37 +701,30 @@ static std::pair<VPBlockBase *, VPBlockBase *> cloneFrom(VPBlockBase *Entry) {
 
 VPRegionBlock *VPRegionBlock::clone() {
   const auto &[NewEntry, NewExiting] = cloneFrom(getEntry());
-  auto *NewRegion =
-      new VPRegionBlock(NewEntry, NewExiting, getName(), isReplicator());
+  auto *NewRegion = getPlan()->createVPRegionBlock(NewEntry, NewExiting,
+                                                   getName(), isReplicator());
   for (VPBlockBase *Block : vp_depth_first_shallow(NewEntry))
     Block->setParent(NewRegion);
   return NewRegion;
 }
 
-void VPRegionBlock::dropAllReferences(VPValue *NewValue) {
-  for (VPBlockBase *Block : vp_depth_first_shallow(Entry))
-    // Drop all references in VPBasicBlocks and replace all uses with
-    // DummyValue.
-    Block->dropAllReferences(NewValue);
-}
-
 void VPRegionBlock::execute(VPTransformState *State) {
   ReversePostOrderTraversal<VPBlockShallowTraversalWrapper<VPBlockBase *>>
       RPOT(Entry);
 
   if (!isReplicator()) {
     // Create and register the new vector loop.
-    Loop *PrevLoop = State->CurrentVectorLoop;
-    State->CurrentVectorLoop = State->LI->AllocateLoop();
+    Loop *PrevLoop = State->CurrentParentLoop;
+    State->CurrentParentLoop = State->LI->AllocateLoop();
     BasicBlock *VectorPH = State->CFG.VPBB2IRBB[getPreheaderVPBB()];
     Loop *ParentLoop = State->LI->getLoopFor(VectorPH);
 
     // Insert the new loop into the loop nest and register the new basic blocks
     // before calling any utilities such as SCEV that require valid LoopInfo.
     if (ParentLoop)
-      ParentLoop->addChildLoop(State->CurrentVectorLoop);
+      ParentLoop->addChildLoop(State->CurrentParentLoop);
     else
-      State->LI->addTopLevelLoop(State->CurrentVectorLoop);
+      State->LI->addTopLevelLoop(State->CurrentParentLoop);
 
     // Visit the VPBlocks connected to "this", starting from it.
     for (VPBlockBase *Block : RPOT) {
@@ -739,7 +732,7 @@ void VPRegionBlock::execute(VPTransformState *State) {
       Block->execute(State);
     }
 
-    State->CurrentVectorLoop = PrevLoop;
+    State->CurrentParentLoop = PrevLoop;
     return;
   }
 
@@ -822,17 +815,26 @@ void VPRegionBlock::print(raw_ostream &O, const Twine &Indent,
 #endif
 
 VPlan::VPlan(Loop *L) {
-  setEntry(VPIRBasicBlock::fromBasicBlock(L->getLoopPreheader()));
-  ScalarHeader = VPIRBasicBlock::fromBasicBlock(L->getHeader());
+  setEntry(createVPIRBasicBlock(L->getLoopPreheader()));
+  ScalarHeader = createVPIRBasicBlock(L->getHeader());
 }
 
 VPlan::~VPlan() {
-  if (Entry) {
-    VPValue DummyValue;
-    for (VPBlockBase *Block : vp_depth_first_shallow(Entry))
-      Block->dropAllReferences(&DummyValue);
-
-    VPBlockBase::deleteCFG(Entry);
+  VPValue DummyValue;
+
+  for (auto *VPB : CreatedBlocks) {
+    if (auto *VPBB = dyn_cast<VPBasicBlock>(VPB)) {
+      // Replace all operands of recipes and all VPValues defined in VPBB with
+      // DummyValue so the block can be deleted.
+      for (VPRecipeBase &R : *VPBB) {
+        for (auto *Def : R.definedValues())
+          Def->replaceAllUsesWith(&DummyValue);
+
+        for (unsigned I = 0, E = R.getNumOperands(); I != E; I++)
+          R.setOperand(I, &DummyValue);
+      }
+    }
+    delete VPB;
   }
   for (VPValue *VPV : VPLiveInsToFree)
     delete VPV;
@@ -840,14 +842,6 @@ VPlan::~VPlan() {
     delete BackedgeTakenCount;
 }
 
-VPIRBasicBlock *VPIRBasicBlock::fromBasicBlock(BasicBlock *IRBB) {
-  auto *VPIRBB = new VPIRBasicBlock(IRBB);
-  for (Instruction &I :
-       make_range(IRBB->begin(), IRBB->getTerminator()->getIterator()))
-    VPIRBB->appendRecipe(new VPIRInstruction(I));
-  return VPIRBB;
-}
-
 VPlanPtr VPlan::createInitialVPlan(Type *InductionTy,
                                    PredicatedScalarEvolution &PSE,
                                    bool RequiresScalarEpilogueCheck,
@@ -861,7 +855,7 @@ VPlanPtr VPlan::createInitialVPlan(Type *InductionTy,
   // an epilogue vector loop, the original entry block here will be replaced by
   // a new VPIRBasicBlock wrapping the entry to the epilogue vector loop after
   // generating code for the main vector loop.
-  VPBasicBlock *VecPreheader = new VPBasicBlock("vector.ph");
+  VPBasicBlock *VecPreheader = Plan->createVPBasicBlock("vector.ph");
   VPBlockUtils::connectBlocks(Plan->getEntry(), VecPreheader);
 
   // Create SCEV and VPValue for the trip count.
@@ -878,17 +872,17 @@ VPlanPtr VPlan::createInitialVPlan(Type *InductionTy,
 
   // Create VPRegionBlock, with empty header and latch blocks, to be filled
   // during processing later.
-  VPBasicBlock *HeaderVPBB = new VPBasicBlock("vector.body");
-  VPBasicBlock *LatchVPBB = new VPBasicBlock("vector.latch");
+  VPBasicBlock *HeaderVPBB = Plan->createVPBasicBlock("vector.body");
+  VPBasicBlock *LatchVPBB = Plan->createVPBasicBlock("vector.latch");
   VPBlockUtils::insertBlockAfter(LatchVPBB, HeaderVPBB);
-  auto *TopRegion = new VPRegionBlock(HeaderVPBB, LatchVPBB, "vector loop",
-                                      false /*isReplicator*/);
+  auto *TopRegion = Plan->createVPRegionBlock(
+      HeaderVPBB, LatchVPBB, "vector loop", false /*isReplicator*/);
 
   VPBlockUtils::insertBlockAfter(TopRegion, VecPreheader);
-  VPBasicBlock *MiddleVPBB = new VPBasicBlock("middle.block");
+  VPBasicBlock *MiddleVPBB = Plan->createVPBasicBlock("middle.block");
   VPBlockUtils::insertBlockAfter(MiddleVPBB, TopRegion);
 
-  VPBasicBlock *ScalarPH = new VPBasicBlock("scalar.ph");
+  VPBasicBlock *ScalarPH = Plan->createVPBasicBlock("scalar.ph");
   VPBlockUtils::connectBlocks(ScalarPH, ScalarHeader);
   if (!RequiresScalarEpilogueCheck) {
     VPBlockUtils::connectBlocks(MiddleVPBB, ScalarPH);
@@ -904,7 +898,7 @@ VPlanPtr VPlan::createInitialVPlan(Type *InductionTy,
   //    we unconditionally branch to the scalar preheader.  Do nothing.
   // 3) Otherwise, construct a runtime check.
   BasicBlock *IRExitBlock = TheLoop->getUniqueLatchExitBlock();
-  auto *VPExitBlock = VPIRBasicBlock::fromBasicBlock(IRExitBlock);
+  auto *VPExitBlock = Plan->createVPIRBasicBlock(IRExitBlock);
   // The connection order corresponds to the operands of the conditional branch.
   VPBlockUtils::insertBlockAfter(VPExitBlock, MiddleVPBB);
   VPBlockUtils::connectBlocks(MiddleVPBB, ScalarPH);
@@ -942,7 +936,8 @@ void VPlan::prepareToExecute(Value *TripCountV, Value *VectorTripCountV,
 
   IRBuilder<> Builder(State.CFG.PrevBB->getTerminator());
   // FIXME: Model VF * UF computation completely in VPlan.
-  assert(VFxUF.getNumUsers() && "VFxUF expected to always have users");
+  assert((!getVectorLoopRegion() || VFxUF.getNumUsers()) &&
+         "VFxUF expected to always have users");
   unsigned UF = getUF();
   if (VF.getNumUsers()) {
     Value *RuntimeVF = getRuntimeVF(Builder, TCTy, State.VF);
@@ -955,22 +950,6 @@ void VPlan::prepareToExecute(Value *TripCountV, Value *VectorTripCountV,
   }
 }
 
-/// Replace \p VPBB with a VPIRBasicBlock wrapping \p IRBB. All recipes from \p
-/// VPBB are moved to the end of the newly created VPIRBasicBlock. VPBB must
-/// have a single predecessor, which is rewired to the new VPIRBasicBlock. All
-/// successors of VPBB, if any, are rewired to the new VPIRBasicBlock.
-static void replaceVPBBWithIRVPBB(VPBasicBlock *VPBB, BasicBlock *IRBB) {
-  VPIRBasicBlock *IRVPBB = VPIRBasicBlock::fromBasicBlock(IRBB);
-  for (auto &R : make_early_inc_range(*VPBB)) {
-    assert(!R.isPhi() && "Tried to move phi recipe to end of block");
-    R.moveBefore(*IRVPBB, IRVPBB->end());
-  }
-
-  VPBlockUtils::reassociateBlocks(VPBB, IRVPBB);
-
-  delete VPBB;
-}
-
 /// Generate the code inside the preheader and body of the vectorized loop.
 /// Assumes a single pre-header basic-block was created for this. Introduce
 /// additional basic-blocks as needed, and fill them all.
@@ -978,30 +957,13 @@ void VPlan::execute(VPTransformState *State) {
   // Initialize CFG state.
   State->CFG.PrevVPBB = nullptr;
   State->CFG.ExitBB = State->CFG.PrevBB->getSingleSuccessor();
-  BasicBlock *VectorPreHeader = State->CFG.PrevBB;
-  State->Builder.SetInsertPoint(VectorPreHeader->getTerminator());
 
   // Disconnect VectorPreHeader from ExitBB in both the CFG and DT.
+  BasicBlock *VectorPreHeader = State->CFG.PrevBB;
   cast<BranchInst>(VectorPreHeader->getTerminator())->setSuccessor(0, nullptr);
   State->CFG.DTU.applyUpdates(
       {{DominatorTree::Delete, VectorPreHeader, State->CFG.ExitBB}});
 
-  // Replace regular VPBB's for the vector preheader, middle and scalar
-  // preheader blocks with VPIRBasicBlocks wrapping their IR blocks. The IR
-  // blocks are created during skeleton creation, so we can only create the
-  // VPIRBasicBlocks now during VPlan execution rather than earlier during VPlan
-  // construction.
-  BasicBlock *MiddleBB = State->CFG.ExitBB;
-  BasicBlock *ScalarPh = MiddleBB->getSingleSuccessor();
-  replaceVPBBWithIRVPBB(getVectorPreheader(), VectorPreHeader);
-  replaceVPBBWithIRVPBB(getMiddleBlock(), MiddleBB);
-  replaceVPBBWithIRVPBB(getScalarPreheader(), ScalarPh);
-
-  LLVM_DEBUG(dbgs() << "Executing best plan with VF=" << State->VF
-                    << ", UF=" << getUF() << '\n');
-  setName("Final VPlan");
-  LLVM_DEBUG(dump());
-
   LLVM_DEBUG(dbgs() << "Executing best plan with VF=" << State->VF
                     << ", UF=" << getUF() << '\n');
   setName("Final VPlan");
@@ -1010,6 +972,8 @@ void VPlan::execute(VPTransformState *State) {
   // Disconnect the middle block from its single successor (the scalar loop
   // header) in both the CFG and DT. The branch will be recreated during VPlan
   // execution.
+  BasicBlock *MiddleBB = State->CFG.ExitBB;
+  BasicBlock *ScalarPh = MiddleBB->getSingleSuccessor();
   auto *BrInst = new UnreachableInst(MiddleBB->getContext());
   BrInst->insertBefore(MiddleBB->getTerminator());
   MiddleBB->getTerminator()->eraseFromParent();
@@ -1027,12 +991,18 @@ void VPlan::execute(VPTransformState *State) {
   for (VPBlockBase *Block : RPOT)
     Block->execute(State);
 
-  VPBasicBlock *LatchVPBB = getVectorLoopRegion()->getExitingBasicBlock();
+  State->CFG.DTU.flush();
+
+  auto *LoopRegion = getVectorLoopRegion();
+  if (!LoopRegion)
+    return;
+
+  VPBasicBlock *LatchVPBB = LoopRegion->getExitingBasicBlock();
   BasicBlock *VectorLatchBB = State->CFG.VPBB2IRBB[LatchVPBB];
 
   // Fix the latch value of canonical, reduction and first-order recurrences
   // phis in the vector loop.
-  VPBasicBlock *Header = getVectorLoopRegion()->getEntryBasicBlock();
+  VPBasicBlock *Header = LoopRegion->getEntryBasicBlock();
   for (VPRecipeBase &R : Header->phis()) {
     // Skip phi-like recipes that generate their backedege values themselves.
     if (isa<VPWidenPHIRecipe>(&R))
@@ -1071,8 +1041,6 @@ void VPlan::execute(VPTransformState *State) {
     Value *Val = State->get(PhiR->getBackedgeValue(), NeedsScalar);
     cast<PHINode>(Phi)->addIncoming(Val, VectorLatchBB);
   }
-
-  State->CFG.DTU.flush();
 }
 
 InstructionCost VPlan::cost(ElementCount VF, VPCostContext &Ctx) {
@@ -1085,14 +1053,14 @@ VPRegionBlock *VPlan::getVectorLoopRegion() {
   // TODO: Cache if possible.
   for (VPBlockBase *B : vp_depth_first_shallow(getEntry()))
     if (auto *R = dyn_cast<VPRegionBlock>(B))
-      return R;
+      return R->isReplicator() ? nullptr : R;
   return nullptr;
 }
 
 const VPRegionBlock *VPlan::getVectorLoopRegion() const {
   for (const VPBlockBase *B : vp_depth_first_shallow(getEntry()))
     if (auto *R = dyn_cast<VPRegionBlock>(B))
-      return R;
+      return R->isReplicator() ? nullptr : R;
   return nullptr;
 }
 
@@ -1222,6 +1190,7 @@ static void remapOperands(VPBlockBase *Entry, VPBlockBase *NewEntry,
 }
 
 VPlan *VPlan::duplicate() {
+  unsigned NumBlocksBeforeCloning = CreatedBlocks.size();
   // Clone blocks.
   const auto &[NewEntry, __] = cloneFrom(Entry);
 
@@ -1262,9 +1231,32 @@ VPlan *VPlan::duplicate() {
   assert(Old2NewVPValues.contains(TripCount) &&
          "TripCount must have been added to Old2NewVPValues");
   NewPlan->TripCount = Old2NewVPValues[TripCount];
+
+  // Transfer all cloned blocks (the second half of all current blocks) from
+  // current to new VPlan.
+  unsigned NumBlocksAfterCloning = CreatedBlocks.size();
+  for (unsigned I :
+       seq<unsigned>(NumBlocksBeforeCloning, NumBlocksAfterCloning))
+    NewPlan->CreatedBlocks.push_back(this->CreatedBlocks[I]);
+  CreatedBlocks.truncate(NumBlocksBeforeCloning);
+
   return NewPlan;
 }
 
+VPIRBasicBlock *VPlan::createEmptyVPIRBasicBlock(BasicBlock *IRBB) {
+  auto *VPIRBB = new VPIRBasicBlock(IRBB);
+  CreatedBlocks.push_back(VPIRBB);
+  return VPIRBB;
+}
+
+VPIRBasicBlock *VPlan::createVPIRBasicBlock(BasicBlock *IRBB) {
+  auto *VPIRBB = createEmptyVPIRBasicBlock(IRBB);
+  for (Instruction &I :
+       make_range(IRBB->begin(), IRBB->getTerminator()->getIterator()))
+    VPIRBB->appendRecipe(new VPIRInstruction(I));
+  return VPIRBB;
+}
+
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
 
 Twine VPlanPrinter::getUID(const VPBlockBase *Block) {
@@ -1414,11 +1406,17 @@ void VPlanIngredient::print(raw_ostream &O) const {
 
 #endif
 
-bool VPValue::isDefinedOutsideLoopRegions() const {
-  return !hasDefiningRecipe() ||
-         !getDefiningRecipe()->getParent()->getEnclosingLoopRegion();
+/// Returns true if there is a vector loop region and \p VPV is defined in a
+/// loop region.
+static bool isDefinedInsideLoopRegions(const VPValue *VPV) {
+  const VPRecipeBase *DefR = VPV->getDefiningRecipe();
+  return DefR && (!DefR->getParent()->getPlan()->getVectorLoopRegion() ||
+                  DefR->getParent()->getEnclosingLoopRegion());
 }
 
+bool VPValue::isDefinedOutsideLoopRegions() const {
+  return !isDefinedInsideLoopRegions(this);
+}
 void VPValue::replaceAllUsesWith(VPValue *New) {
   replaceUsesWithIf(New, [](VPUser &, unsigned) { return true; });
 }
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 6486c67..cfbb4ad 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -236,7 +236,8 @@ public:
 struct VPTransformState {
   VPTransformState(const TargetTransformInfo *TTI, ElementCount VF, unsigned UF,
                    LoopInfo *LI, DominatorTree *DT, IRBuilderBase &Builder,
-                   InnerLoopVectorizer *ILV, VPlan *Plan, Type *CanonicalIVTy);
+                   InnerLoopVectorizer *ILV, VPlan *Plan,
+                   Loop *CurrentParentLoop, Type *CanonicalIVTy);
   /// Target Transform Info.
   const TargetTransformInfo *TTI;
 
@@ -373,8 +374,8 @@ struct VPTransformState {
   /// Pointer to the VPlan code is generated for.
   VPlan *Plan;
 
-  /// The loop object for the current parent region, or nullptr.
-  Loop *CurrentVectorLoop = nullptr;
+  /// The parent loop object for the current scope, or nullptr.
+  Loop *CurrentParentLoop = nullptr;
 
   /// LoopVersioning.  It's only set up (non-null) if memchecks were
   /// used.
@@ -636,9 +637,6 @@ public:
   /// Return the cost of the block.
   virtual InstructionCost cost(ElementCount VF, VPCostContext &Ctx) = 0;
 
-  /// Delete all blocks reachable from a given VPBlockBase, inclusive.
-  static void deleteCFG(VPBlockBase *Entry);
-
   /// Return true if it is legal to hoist instructions into this block.
   bool isLegalToHoistInto() {
     // There are currently no constraints that prevent an instruction to be
@@ -646,10 +644,6 @@ public:
     return true;
   }
 
-  /// Replace all operands of VPUsers in the block with \p NewValue and also
-  /// replaces all uses of VPValues defined in the block with NewValue.
-  virtual void dropAllReferences(VPValue *NewValue) = 0;
-
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
   void printAsOperand(raw_ostream &OS, bool PrintType = false) const {
     OS << getName();
@@ -1357,6 +1351,9 @@ public:
     }
   }
 
+  /// Returns true if the underlying opcode may read from or write to memory.
+  bool opcodeMayReadOrWriteFromMemory() const;
+
   /// Returns true if the recipe only uses the first lane of operand \p Op.
   bool onlyFirstLaneUsed(const VPValue *Op) const override;
 
@@ -1586,14 +1583,16 @@ class VPScalarCastRecipe : public VPSingleDefRecipe {
   Value *generate(VPTransformState &State);
 
 public:
-  VPScalarCastRecipe(Instruction::CastOps Opcode, VPValue *Op, Type *ResultTy)
-      : VPSingleDefRecipe(VPDef::VPScalarCastSC, {Op}), Opcode(Opcode),
+  VPScalarCastRecipe(Instruction::CastOps Opcode, VPValue *Op, Type *ResultTy,
+                     DebugLoc DL)
+      : VPSingleDefRecipe(VPDef::VPScalarCastSC, {Op}, DL), Opcode(Opcode),
         ResultTy(ResultTy) {}
 
   ~VPScalarCastRecipe() override = default;
 
   VPScalarCastRecipe *clone() override {
-    return new VPScalarCastRecipe(Opcode, getOperand(0), ResultTy);
+    return new VPScalarCastRecipe(Opcode, getOperand(0), ResultTy,
+                                  getDebugLoc());
   }
 
   VP_CLASSOF_IMPL(VPDef::VPScalarCastSC)
@@ -1653,7 +1652,7 @@ public:
   VPWidenIntrinsicRecipe(Intrinsic::ID VectorIntrinsicID,
                          ArrayRef<VPValue *> CallArguments, Type *Ty,
                          DebugLoc DL = {})
-      : VPRecipeWithIRFlags(VPDef::VPWidenIntrinsicSC, CallArguments),
+      : VPRecipeWithIRFlags(VPDef::VPWidenIntrinsicSC, CallArguments, DL),
         VectorIntrinsicID(VectorIntrinsicID), ResultTy(Ty) {
     LLVMContext &Ctx = Ty->getContext();
     AttributeList Attrs = Intrinsic::getAttributes(Ctx, VectorIntrinsicID);
@@ -2101,6 +2100,15 @@ public:
            R->getVPDefID() == VPDef::VPWidenPointerInductionSC;
   }
 
+  static inline bool classof(const VPValue *V) {
+    auto *R = V->getDefiningRecipe();
+    return R && classof(R);
+  }
+
+  static inline bool classof(const VPHeaderPHIRecipe *R) {
+    return classof(static_cast<const VPRecipeBase *>(R));
+  }
+
   virtual void execute(VPTransformState &State) override = 0;
 
   /// Returns the step value of the induction.
@@ -2597,8 +2605,9 @@ class VPReductionRecipe : public VPSingleDefRecipe {
 protected:
   VPReductionRecipe(const unsigned char SC, const RecurrenceDescriptor &R,
                     Instruction *I, ArrayRef<VPValue *> Operands,
-                    VPValue *CondOp, bool IsOrdered)
-      : VPSingleDefRecipe(SC, Operands, I), RdxDesc(R), IsOrdered(IsOrdered) {
+                    VPValue *CondOp, bool IsOrdered, DebugLoc DL)
+      : VPSingleDefRecipe(SC, Operands, I, DL), RdxDesc(R),
+        IsOrdered(IsOrdered) {
     if (CondOp) {
       IsConditional = true;
       addOperand(CondOp);
@@ -2608,16 +2617,17 @@ protected:
 public:
   VPReductionRecipe(const RecurrenceDescriptor &R, Instruction *I,
                     VPValue *ChainOp, VPValue *VecOp, VPValue *CondOp,
-                    bool IsOrdered)
+                    bool IsOrdered, DebugLoc DL = {})
       : VPReductionRecipe(VPDef::VPReductionSC, R, I,
                           ArrayRef<VPValue *>({ChainOp, VecOp}), CondOp,
-                          IsOrdered) {}
+                          IsOrdered, DL) {}
 
   ~VPReductionRecipe() override = default;
 
   VPReductionRecipe *clone() override {
     return new VPReductionRecipe(RdxDesc, getUnderlyingInstr(), getChainOp(),
-                                 getVecOp(), getCondOp(), IsOrdered);
+                                 getVecOp(), getCondOp(), IsOrdered,
+                                 getDebugLoc());
   }
 
   static inline bool classof(const VPRecipeBase *R) {
@@ -2672,7 +2682,7 @@ public:
             VPDef::VPReductionEVLSC, R.getRecurrenceDescriptor(),
             cast_or_null<Instruction>(R.getUnderlyingValue()),
             ArrayRef<VPValue *>({R.getChainOp(), R.getVecOp(), &EVL}), CondOp,
-            R.isOrdered()) {}
+            R.isOrdered(), R.getDebugLoc()) {}
 
   ~VPReductionEVLRecipe() override = default;
 
@@ -3554,8 +3564,6 @@ public:
     return make_range(begin(), getFirstNonPhi());
   }
 
-  void dropAllReferences(VPValue *NewValue) override;
-
   /// Split current block at \p SplitAt by inserting a new block between the
   /// current block and its successors and moving all recipes starting at
   /// SplitAt to the new block. Returns the new block.
@@ -3585,12 +3593,7 @@ public:
 
   /// Clone the current block and it's recipes, without updating the operands of
   /// the cloned recipes.
-  VPBasicBlock *clone() override {
-    auto *NewBlock = new VPBasicBlock(getName());
-    for (VPRecipeBase &R : *this)
-      NewBlock->appendRecipe(R.clone());
-    return NewBlock;
-  }
+  VPBasicBlock *clone() override;
 
 protected:
   /// Execute the recipes in the IR basic block \p BB.
@@ -3626,20 +3629,11 @@ public:
     return V->getVPBlockID() == VPBlockBase::VPIRBasicBlockSC;
   }
 
-  /// Create a VPIRBasicBlock from \p IRBB containing VPIRInstructions for all
-  /// instructions in \p IRBB, except its terminator which is managed in VPlan.
-  static VPIRBasicBlock *fromBasicBlock(BasicBlock *IRBB);
-
   /// The method which generates the output IR instructions that correspond to
   /// this VPBasicBlock, thereby "executing" the VPlan.
   void execute(VPTransformState *State) override;
 
-  VPIRBasicBlock *clone() override {
-    auto *NewBlock = new VPIRBasicBlock(IRBB);
-    for (VPRecipeBase &R : Recipes)
-      NewBlock->appendRecipe(R.clone());
-    return NewBlock;
-  }
+  VPIRBasicBlock *clone() override;
 
   BasicBlock *getIRBasicBlock() const { return IRBB; }
 };
@@ -3678,13 +3672,7 @@ public:
       : VPBlockBase(VPRegionBlockSC, Name), Entry(nullptr), Exiting(nullptr),
         IsReplicator(IsReplicator) {}
 
-  ~VPRegionBlock() override {
-    if (Entry) {
-      VPValue DummyValue;
-      Entry->dropAllReferences(&DummyValue);
-      deleteCFG(Entry);
-    }
-  }
+  ~VPRegionBlock() override {}
 
   /// Method to support type inquiry through isa, cast, and dyn_cast.
   static inline bool classof(const VPBlockBase *V) {
@@ -3732,8 +3720,6 @@ public:
   // Return the cost of this region.
   InstructionCost cost(ElementCount VF, VPCostContext &Ctx) override;
 
-  void dropAllReferences(VPValue *NewValue) override;
-
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
   /// Print this VPRegionBlock to \p O (recursively), prefixing all lines with
   /// \p Indent. \p SlotTracker is used to print unnamed VPValue's using
@@ -3810,7 +3796,10 @@ class VPlan {
   /// been modeled in VPlan directly.
   DenseMap<const SCEV *, VPValue *> SCEVToExpansion;
 
-public:
+  /// Blocks allocated and owned by the VPlan. They will be deleted once the
+  /// VPlan is destroyed.
+  SmallVector<VPBlockBase *> CreatedBlocks;
+
   /// Construct a VPlan with \p Entry to the plan and with \p ScalarHeader
   /// wrapping the original header of the scalar loop.
   VPlan(VPBasicBlock *Entry, VPIRBasicBlock *ScalarHeader)
@@ -3820,18 +3809,20 @@ public:
            "scalar header must be a leaf node");
   }
 
-  /// Construct a VPlan with \p Entry entering the plan, trip count \p TC and
-  /// with \p ScalarHeader wrapping the original header of the scalar loop.
-  VPlan(VPBasicBlock *Entry, VPValue *TC, VPIRBasicBlock *ScalarHeader)
-      : VPlan(Entry, ScalarHeader) {
-    TripCount = TC;
-  }
-
+public:
   /// Construct a VPlan for \p L. This will create VPIRBasicBlocks wrapping the
   /// original preheader and scalar header of \p L, to be used as entry and
   /// scalar header blocks of the new VPlan.
   VPlan(Loop *L);
 
+  /// Construct a VPlan with a new VPBasicBlock as entry, a VPIRBasicBlock
+  /// wrapping \p ScalarHeaderBB and a trip count of \p TC.
+  VPlan(BasicBlock *ScalarHeaderBB, VPValue *TC) {
+    setEntry(createVPBasicBlock("preheader"));
+    ScalarHeader = createVPIRBasicBlock(ScalarHeaderBB);
+    TripCount = TC;
+  }
+
   ~VPlan();
 
   void setEntry(VPBasicBlock *VPBB) {
@@ -3867,9 +3858,13 @@ public:
   VPBasicBlock *getEntry() { return Entry; }
   const VPBasicBlock *getEntry() const { return Entry; }
 
-  /// Returns the preheader of the vector loop region.
+  /// Returns the preheader of the vector loop region, if one exists, or null
+  /// otherwise.
   VPBasicBlock *getVectorPreheader() {
-    return cast<VPBasicBlock>(getVectorLoopRegion()->getSinglePredecessor());
+    VPRegionBlock *VectorRegion = getVectorLoopRegion();
+    return VectorRegion
+               ? cast<VPBasicBlock>(VectorRegion->getSinglePredecessor())
+               : nullptr;
   }
 
   /// Returns the VPRegionBlock of the vector loop.
@@ -4026,6 +4021,49 @@ public:
   /// Clone the current VPlan, update all VPValues of the new VPlan and cloned
   /// recipes to refer to the clones, and return it.
   VPlan *duplicate();
+
+  /// Create a new VPBasicBlock with \p Name and containing \p Recipe if
+  /// present. The returned block is owned by the VPlan and deleted once the
+  /// VPlan is destroyed.
+  VPBasicBlock *createVPBasicBlock(const Twine &Name,
+                                   VPRecipeBase *Recipe = nullptr) {
+    auto *VPB = new VPBasicBlock(Name, Recipe);
+    CreatedBlocks.push_back(VPB);
+    return VPB;
+  }
+
+  /// Create a new VPRegionBlock with \p Entry, \p Exiting and \p Name. If \p
+  /// IsReplicator is true, the region is a replicate region. The returned block
+  /// is owned by the VPlan and deleted once the VPlan is destroyed.
+  VPRegionBlock *createVPRegionBlock(VPBlockBase *Entry, VPBlockBase *Exiting,
+                                     const std::string &Name = "",
+                                     bool IsReplicator = false) {
+    auto *VPB = new VPRegionBlock(Entry, Exiting, Name, IsReplicator);
+    CreatedBlocks.push_back(VPB);
+    return VPB;
+  }
+
+  /// Create a new VPRegionBlock with \p Name and entry and exiting blocks set
+  /// to nullptr. If \p IsReplicator is true, the region is a replicate region.
+  /// The returned block is owned by the VPlan and deleted once the VPlan is
+  /// destroyed.
+  VPRegionBlock *createVPRegionBlock(const std::string &Name = "",
+                                     bool IsReplicator = false) {
+    auto *VPB = new VPRegionBlock(Name, IsReplicator);
+    CreatedBlocks.push_back(VPB);
+    return VPB;
+  }
+
+  /// Create a VPIRBasicBlock wrapping \p IRBB, but do not create
+  /// VPIRInstructions wrapping the instructions in t\p IRBB.  The returned
+  /// block is owned by the VPlan and deleted once the VPlan is destroyed.
+  VPIRBasicBlock *createEmptyVPIRBasicBlock(BasicBlock *IRBB);
+
+  /// Create a VPIRBasicBlock from \p IRBB containing VPIRInstructions for all
+  /// instructions in \p IRBB, except its terminator which is managed by the
+  /// successors of the block in VPlan. The returned block is owned by the VPlan
+  /// and deleted once the VPlan is destroyed.
+  VPIRBasicBlock *createVPIRBasicBlock(BasicBlock *IRBB);
 };
 
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
diff --git a/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp b/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp
index 6e63373..76ed578 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp
@@ -182,7 +182,7 @@ VPBasicBlock *PlainCFGBuilder::getOrCreateVPBB(BasicBlock *BB) {
   // Create new VPBB.
   StringRef Name = isHeaderBB(BB, TheLoop) ? "vector.body" : BB->getName();
   LLVM_DEBUG(dbgs() << "Creating VPBasicBlock for " << Name << "\n");
-  VPBasicBlock *VPBB = new VPBasicBlock(Name);
+  VPBasicBlock *VPBB = Plan.createVPBasicBlock(Name);
   BB2VPBB[BB] = VPBB;
 
   // Get or create a region for the loop containing BB.
@@ -204,7 +204,7 @@ VPBasicBlock *PlainCFGBuilder::getOrCreateVPBB(BasicBlock *BB) {
   if (LoopOfBB == TheLoop) {
     RegionOfVPBB = Plan.getVectorLoopRegion();
   } else {
-    RegionOfVPBB = new VPRegionBlock(Name.str(), false /*isReplicator*/);
+    RegionOfVPBB = Plan.createVPRegionBlock(Name.str(), false /*isReplicator*/);
     RegionOfVPBB->setParent(Loop2Region[LoopOfBB->getParentLoop()]);
   }
   RegionOfVPBB->setEntry(VPBB);
@@ -357,12 +357,10 @@ void PlainCFGBuilder::buildPlainCFG() {
   BB2VPBB[TheLoop->getHeader()] = VectorHeaderVPBB;
   VectorHeaderVPBB->clearSuccessors();
   VectorLatchVPBB->clearPredecessors();
-  if (TheLoop->getHeader() != TheLoop->getLoopLatch()) {
+  if (TheLoop->getHeader() != TheLoop->getLoopLatch())
     BB2VPBB[TheLoop->getLoopLatch()] = VectorLatchVPBB;
-  } else {
+  else
     TheRegion->setExiting(VectorHeaderVPBB);
-    delete VectorLatchVPBB;
-  }
 
   // 1. Scan the body of the loop in a topological order to visit each basic
   // block after having visited its predecessor basic blocks. Create a VPBB for
diff --git a/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.h b/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.h
index 9e8f9f3..ad6e2ad 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.h
@@ -32,11 +32,11 @@ class Loop;
 class LoopInfo;
 class VPRegionBlock;
 class VPlan;
-class VPlanTestBase;
+class VPlanTestIRBase;
 
 /// Main class to build the VPlan H-CFG for an incoming IR.
 class VPlanHCFGBuilder {
-  friend VPlanTestBase;
+  friend VPlanTestIRBase;
 
 private:
   // The outermost loop of the input loop nest considered for vectorization.
diff --git a/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h b/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h
index ec3c203..4866426 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h
@@ -139,7 +139,8 @@ struct MatchRecipeAndOpcode<Opcode, RecipeTy> {
     if constexpr (std::is_same<RecipeTy, VPScalarIVStepsRecipe>::value ||
                   std::is_same<RecipeTy, VPCanonicalIVPHIRecipe>::value ||
                   std::is_same<RecipeTy, VPWidenSelectRecipe>::value ||
-                  std::is_same<RecipeTy, VPDerivedIVRecipe>::value)
+                  std::is_same<RecipeTy, VPDerivedIVRecipe>::value ||
+                  std::is_same<RecipeTy, VPWidenGEPRecipe>::value)
       return DefR;
     else
       return DefR && DefR->getOpcode() == Opcode;
@@ -309,6 +310,12 @@ m_Binary(const Op0_t &Op0, const Op1_t &Op1) {
   return AllBinaryRecipe_match<Op0_t, Op1_t, Opcode, Commutative>(Op0, Op1);
 }
 
+template <unsigned Opcode, typename Op0_t, typename Op1_t>
+inline AllBinaryRecipe_match<Op0_t, Op1_t, Opcode, true>
+m_c_Binary(const Op0_t &Op0, const Op1_t &Op1) {
+  return AllBinaryRecipe_match<Op0_t, Op1_t, Opcode, true>(Op0, Op1);
+}
+
 template <typename Op0_t, typename Op1_t>
 inline AllBinaryRecipe_match<Op0_t, Op1_t, Instruction::Mul>
 m_Mul(const Op0_t &Op0, const Op1_t &Op1) {
@@ -339,6 +346,18 @@ m_c_BinaryOr(const Op0_t &Op0, const Op1_t &Op1) {
   return m_BinaryOr<Op0_t, Op1_t, /*Commutative*/ true>(Op0, Op1);
 }
 
+template <typename Op0_t, typename Op1_t>
+using GEPLikeRecipe_match =
+    BinaryRecipe_match<Op0_t, Op1_t, Instruction::GetElementPtr, false,
+                       VPWidenRecipe, VPReplicateRecipe, VPWidenGEPRecipe,
+                       VPInstruction>;
+
+template <typename Op0_t, typename Op1_t>
+inline GEPLikeRecipe_match<Op0_t, Op1_t> m_GetElementPtr(const Op0_t &Op0,
+                                                         const Op1_t &Op1) {
+  return GEPLikeRecipe_match<Op0_t, Op1_t>(Op0, Op1);
+}
+
 template <typename Op0_t, typename Op1_t, typename Op2_t, unsigned Opcode>
 using AllTernaryRecipe_match =
     Recipe_match<std::tuple<Op0_t, Op1_t, Op2_t>, Opcode, false,
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 8be2b89..e54df8bd 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -51,24 +51,7 @@ extern cl::opt<unsigned> ForceTargetInstructionCost;
 bool VPRecipeBase::mayWriteToMemory() const {
   switch (getVPDefID()) {
   case VPInstructionSC:
-    if (Instruction::isBinaryOp(cast<VPInstruction>(this)->getOpcode()))
-      return false;
-    switch (cast<VPInstruction>(this)->getOpcode()) {
-    case Instruction::Or:
-    case Instruction::ICmp:
-    case Instruction::Select:
-    case VPInstruction::AnyOf:
-    case VPInstruction::Not:
-    case VPInstruction::CalculateTripCountMinusVF:
-    case VPInstruction::CanonicalIVIncrementForPart:
-    case VPInstruction::ExtractFromEnd:
-    case VPInstruction::FirstOrderRecurrenceSplice:
-    case VPInstruction::LogicalAnd:
-    case VPInstruction::PtrAdd:
-      return false;
-    default:
-      return true;
-    }
+    return cast<VPInstruction>(this)->opcodeMayReadOrWriteFromMemory();
   case VPInterleaveSC:
     return cast<VPInterleaveRecipe>(this)->getNumStoreOperands() > 0;
   case VPWidenStoreEVLSC:
@@ -115,6 +98,8 @@ bool VPRecipeBase::mayWriteToMemory() const {
 
 bool VPRecipeBase::mayReadFromMemory() const {
   switch (getVPDefID()) {
+  case VPInstructionSC:
+    return cast<VPInstruction>(this)->opcodeMayReadOrWriteFromMemory();
   case VPWidenLoadEVLSC:
   case VPWidenLoadSC:
     return true;
@@ -707,6 +692,26 @@ void VPInstruction::execute(VPTransformState &State) {
             /*IsScalar*/ GeneratesPerFirstLaneOnly);
 }
 
+bool VPInstruction::opcodeMayReadOrWriteFromMemory() const {
+  if (Instruction::isBinaryOp(getOpcode()))
+    return false;
+  switch (getOpcode()) {
+  case Instruction::ICmp:
+  case Instruction::Select:
+  case VPInstruction::AnyOf:
+  case VPInstruction::CalculateTripCountMinusVF:
+  case VPInstruction::CanonicalIVIncrementForPart:
+  case VPInstruction::ExtractFromEnd:
+  case VPInstruction::FirstOrderRecurrenceSplice:
+  case VPInstruction::LogicalAnd:
+  case VPInstruction::Not:
+  case VPInstruction::PtrAdd:
+    return false;
+  default:
+    return true;
+  }
+}
+
 bool VPInstruction::onlyFirstLaneUsed(const VPValue *Op) const {
   assert(is_contained(operands(), Op) && "Op must be an operand of the recipe");
   if (Instruction::isBinaryOp(getOpcode()))
@@ -1352,10 +1357,9 @@ void VPWidenRecipe::execute(VPTransformState &State) {
     Value *C = nullptr;
     if (FCmp) {
       // Propagate fast math flags.
-      IRBuilder<>::FastMathFlagGuard FMFG(Builder);
-      if (auto *I = dyn_cast_or_null<Instruction>(getUnderlyingValue()))
-        Builder.setFastMathFlags(I->getFastMathFlags());
-      C = Builder.CreateFCmp(getPredicate(), A, B);
+      C = Builder.CreateFCmpFMF(
+          getPredicate(), A, B,
+          dyn_cast_or_null<Instruction>(getUnderlyingValue()));
     } else {
       C = Builder.CreateICmp(getPredicate(), A, B);
     }
@@ -1582,7 +1586,7 @@ void VPWidenCastRecipe::print(raw_ostream &O, const Twine &Indent,
                               VPSlotTracker &SlotTracker) const {
   O << Indent << "WIDEN-CAST ";
   printAsOperand(O, SlotTracker);
-  O << " = " << Instruction::getOpcodeName(Opcode) << " ";
+  O << " = " << Instruction::getOpcodeName(Opcode);
   printFlags(O);
   printOperands(O, SlotTracker);
   O << " to " << *getResultType();
@@ -2001,7 +2005,6 @@ void VPReverseVectorPointerRecipe::print(raw_ostream &O, const Twine &Indent,
   printAsOperand(O, SlotTracker);
   O << " = reverse-vector-pointer";
   printFlags(O);
-  O << " ";
   printOperands(O, SlotTracker);
 }
 #endif
@@ -2116,6 +2119,7 @@ void VPReductionRecipe::execute(VPTransformState &State) {
   // Propagate the fast-math flags carried by the underlying instruction.
   IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder);
   State.Builder.setFastMathFlags(RdxDesc.getFastMathFlags());
+  State.setDebugLocFrom(getDebugLoc());
   Value *NewVecOp = State.get(getVecOp());
   if (VPValue *Cond = getCondOp()) {
     Value *NewCond = State.get(Cond, State.VF.isScalar());
@@ -2328,6 +2332,7 @@ void VPReplicateRecipe::print(raw_ostream &O, const Twine &Indent,
 #endif
 
 Value *VPScalarCastRecipe ::generate(VPTransformState &State) {
+  State.setDebugLocFrom(getDebugLoc());
   assert(vputils::onlyFirstLaneUsed(this) &&
          "Codegen only implemented for first lane.");
   switch (Opcode) {
@@ -3365,7 +3370,7 @@ void VPReductionPHIRecipe::execute(VPTransformState &State) {
                           : VectorType::get(StartV->getType(), State.VF);
 
   BasicBlock *HeaderBB = State.CFG.PrevBB;
-  assert(State.CurrentVectorLoop->getHeader() == HeaderBB &&
+  assert(State.CurrentParentLoop->getHeader() == HeaderBB &&
          "recipe must be in the vector loop header");
   auto *Phi = PHINode::Create(VecTy, 2, "vec.phi");
   Phi->insertBefore(HeaderBB->getFirstInsertionPt());
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 0b809c2..3e3f5ad 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -217,7 +217,7 @@ static VPBasicBlock *getPredicatedThenBlock(VPRegionBlock *R) {
 // is connected to a successor replicate region with the same predicate by a
 // single, empty VPBasicBlock.
 static bool mergeReplicateRegionsIntoSuccessors(VPlan &Plan) {
-  SetVector<VPRegionBlock *> DeletedRegions;
+  SmallPtrSet<VPRegionBlock *, 4> TransformedRegions;
 
   // Collect replicate regions followed by an empty block, followed by another
   // replicate region with matching masks to process front. This is to avoid
@@ -248,7 +248,7 @@ static bool mergeReplicateRegionsIntoSuccessors(VPlan &Plan) {
 
   // Move recipes from Region1 to its successor region, if both are triangles.
   for (VPRegionBlock *Region1 : WorkList) {
-    if (DeletedRegions.contains(Region1))
+    if (TransformedRegions.contains(Region1))
       continue;
     auto *MiddleBasicBlock = cast<VPBasicBlock>(Region1->getSingleSuccessor());
     auto *Region2 = cast<VPRegionBlock>(MiddleBasicBlock->getSingleSuccessor());
@@ -294,12 +294,10 @@ static bool mergeReplicateRegionsIntoSuccessors(VPlan &Plan) {
       VPBlockUtils::connectBlocks(Pred, MiddleBasicBlock);
     }
     VPBlockUtils::disconnectBlocks(Region1, MiddleBasicBlock);
-    DeletedRegions.insert(Region1);
+    TransformedRegions.insert(Region1);
   }
 
-  for (VPRegionBlock *ToDelete : DeletedRegions)
-    delete ToDelete;
-  return !DeletedRegions.empty();
+  return !TransformedRegions.empty();
 }
 
 static VPRegionBlock *createReplicateRegion(VPReplicateRecipe *PredRecipe,
@@ -310,7 +308,8 @@ static VPRegionBlock *createReplicateRegion(VPReplicateRecipe *PredRecipe,
   assert(Instr->getParent() && "Predicated instruction not in any basic block");
   auto *BlockInMask = PredRecipe->getMask();
   auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask);
-  auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe);
+  auto *Entry =
+      Plan.createVPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe);
 
   // Replace predicated replicate recipe with a replicate recipe without a
   // mask but in the replicate region.
@@ -318,7 +317,8 @@ static VPRegionBlock *createReplicateRegion(VPReplicateRecipe *PredRecipe,
       PredRecipe->getUnderlyingInstr(),
       make_range(PredRecipe->op_begin(), std::prev(PredRecipe->op_end())),
       PredRecipe->isUniform());
-  auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", RecipeWithoutMask);
+  auto *Pred =
+      Plan.createVPBasicBlock(Twine(RegionName) + ".if", RecipeWithoutMask);
 
   VPPredInstPHIRecipe *PHIRecipe = nullptr;
   if (PredRecipe->getNumUsers() != 0) {
@@ -328,8 +328,10 @@ static VPRegionBlock *createReplicateRegion(VPReplicateRecipe *PredRecipe,
     PHIRecipe->setOperand(0, RecipeWithoutMask);
   }
   PredRecipe->eraseFromParent();
-  auto *Exiting = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe);
-  VPRegionBlock *Region = new VPRegionBlock(Entry, Exiting, RegionName, true);
+  auto *Exiting =
+      Plan.createVPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe);
+  VPRegionBlock *Region =
+      Plan.createVPRegionBlock(Entry, Exiting, RegionName, true);
 
   // Note: first set Entry as region entry and then connect successors starting
   // from it in order, to propagate the "parent" of each VPBasicBlock.
@@ -396,7 +398,7 @@ static bool mergeBlocksIntoPredecessors(VPlan &Plan) {
       VPBlockUtils::disconnectBlocks(VPBB, Succ);
       VPBlockUtils::connectBlocks(PredVPBB, Succ);
     }
-    delete VPBB;
+    // VPBB is now dead and will be cleaned up when the plan gets destroyed.
   }
   return !WorkList.empty();
 }
@@ -525,7 +527,8 @@ static VPScalarIVStepsRecipe *
 createScalarIVSteps(VPlan &Plan, InductionDescriptor::InductionKind Kind,
                     Instruction::BinaryOps InductionOpcode,
                     FPMathOperator *FPBinOp, Instruction *TruncI,
-                    VPValue *StartV, VPValue *Step, VPBuilder &Builder) {
+                    VPValue *StartV, VPValue *Step, DebugLoc DL,
+                    VPBuilder &Builder) {
   VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
   VPCanonicalIVPHIRecipe *CanonicalIV = Plan.getCanonicalIV();
   VPSingleDefRecipe *BaseIV = Builder.createDerivedIV(
@@ -540,7 +543,7 @@ createScalarIVSteps(VPlan &Plan, InductionDescriptor::InductionKind Kind,
     assert(ResultTy->getScalarSizeInBits() > TruncTy->getScalarSizeInBits() &&
            "Not truncating.");
     assert(ResultTy->isIntegerTy() && "Truncation requires an integer type");
-    BaseIV = Builder.createScalarCast(Instruction::Trunc, BaseIV, TruncTy);
+    BaseIV = Builder.createScalarCast(Instruction::Trunc, BaseIV, TruncTy, DL);
     ResultTy = TruncTy;
   }
 
@@ -554,26 +557,68 @@ createScalarIVSteps(VPlan &Plan, InductionDescriptor::InductionKind Kind,
         cast<VPBasicBlock>(HeaderVPBB->getSingleHierarchicalPredecessor());
     VPBuilder::InsertPointGuard Guard(Builder);
     Builder.setInsertPoint(VecPreheader);
-    Step = Builder.createScalarCast(Instruction::Trunc, Step, ResultTy);
+    Step = Builder.createScalarCast(Instruction::Trunc, Step, ResultTy, DL);
   }
   return Builder.createScalarIVSteps(InductionOpcode, FPBinOp, BaseIV, Step);
 }
 
+static SmallVector<VPUser *> collectUsersRecursively(VPValue *V) {
+  SetVector<VPUser *> Users(V->user_begin(), V->user_end());
+  for (unsigned I = 0; I != Users.size(); ++I) {
+    VPRecipeBase *Cur = cast<VPRecipeBase>(Users[I]);
+    if (isa<VPHeaderPHIRecipe>(Cur))
+      continue;
+    for (VPValue *V : Cur->definedValues())
+      Users.insert(V->user_begin(), V->user_end());
+  }
+  return Users.takeVector();
+}
+
 /// Legalize VPWidenPointerInductionRecipe, by replacing it with a PtrAdd
 /// (IndStart, ScalarIVSteps (0, Step)) if only its scalar values are used, as
 /// VPWidenPointerInductionRecipe will generate vectors only. If some users
 /// require vectors while other require scalars, the scalar uses need to extract
 /// the scalars from the generated vectors (Note that this is different to how
-/// int/fp inductions are handled). Also optimize VPWidenIntOrFpInductionRecipe,
-/// if any of its users needs scalar values, by providing them scalar steps
-/// built on the canonical scalar IV and update the original IV's users. This is
-/// an optional optimization to reduce the needs of vector extracts.
+/// int/fp inductions are handled). Legalize extract-from-ends using uniform
+/// VPReplicateRecipe of wide inductions to use regular VPReplicateRecipe, so
+/// the correct end value is available. Also optimize
+/// VPWidenIntOrFpInductionRecipe, if any of its users needs scalar values, by
+/// providing them scalar steps built on the canonical scalar IV and update the
+/// original IV's users. This is an optional optimization to reduce the needs of
+/// vector extracts.
 static void legalizeAndOptimizeInductions(VPlan &Plan) {
+  using namespace llvm::VPlanPatternMatch;
   SmallVector<VPRecipeBase *> ToRemove;
   VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
   bool HasOnlyVectorVFs = !Plan.hasVF(ElementCount::getFixed(1));
   VPBuilder Builder(HeaderVPBB, HeaderVPBB->getFirstNonPhi());
   for (VPRecipeBase &Phi : HeaderVPBB->phis()) {
+    auto *PhiR = dyn_cast<VPHeaderPHIRecipe>(&Phi);
+    if (!PhiR)
+      break;
+
+    // Check if any uniform VPReplicateRecipes using the phi recipe are used by
+    // ExtractFromEnd. Those must be replaced by a regular VPReplicateRecipe to
+    // ensure the final value is available.
+    // TODO: Remove once uniformity analysis is done on VPlan.
+    for (VPUser *U : collectUsersRecursively(PhiR)) {
+      auto *ExitIRI = dyn_cast<VPIRInstruction>(U);
+      VPValue *Op;
+      if (!ExitIRI || !match(ExitIRI->getOperand(0),
+                             m_VPInstruction<VPInstruction::ExtractFromEnd>(
+                                 m_VPValue(Op), m_VPValue())))
+        continue;
+      auto *RepR = dyn_cast<VPReplicateRecipe>(Op);
+      if (!RepR || !RepR->isUniform())
+        continue;
+      assert(!RepR->isPredicated() && "RepR must not be predicated");
+      Instruction *I = RepR->getUnderlyingInstr();
+      auto *Clone =
+          new VPReplicateRecipe(I, RepR->operands(), /*IsUniform*/ false);
+      Clone->insertAfter(RepR);
+      RepR->replaceAllUsesWith(Clone);
+    }
+
     // Replace wide pointer inductions which have only their scalars used by
     // PtrAdd(IndStart, ScalarIVSteps (0, Step)).
     if (auto *PtrIV = dyn_cast<VPWidenPointerInductionRecipe>(&Phi)) {
@@ -586,7 +631,7 @@ static void legalizeAndOptimizeInductions(VPlan &Plan) {
       VPValue *StepV = PtrIV->getOperand(1);
       VPScalarIVStepsRecipe *Steps = createScalarIVSteps(
           Plan, InductionDescriptor::IK_IntInduction, Instruction::Add, nullptr,
-          nullptr, StartV, StepV, Builder);
+          nullptr, StartV, StepV, PtrIV->getDebugLoc(), Builder);
 
       VPValue *PtrAdd = Builder.createPtrAdd(PtrIV->getStartValue(), Steps,
                                              PtrIV->getDebugLoc(), "next.gep");
@@ -610,7 +655,7 @@ static void legalizeAndOptimizeInductions(VPlan &Plan) {
         Plan, ID.getKind(), ID.getInductionOpcode(),
         dyn_cast_or_null<FPMathOperator>(ID.getInductionBinOp()),
         WideIV->getTruncInst(), WideIV->getStartValue(), WideIV->getStepValue(),
-        Builder);
+        WideIV->getDebugLoc(), Builder);
 
     // Update scalar users of IV to use Step instead.
     if (!HasOnlyVectorVFs)
@@ -660,13 +705,158 @@ static void recursivelyDeleteDeadRecipes(VPValue *V) {
   }
 }
 
+/// Try to simplify recipe \p R.
+static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) {
+  using namespace llvm::VPlanPatternMatch;
+
+  if (auto *Blend = dyn_cast<VPBlendRecipe>(&R)) {
+    // Try to remove redundant blend recipes.
+    SmallPtrSet<VPValue *, 4> UniqueValues;
+    if (Blend->isNormalized() || !match(Blend->getMask(0), m_False()))
+      UniqueValues.insert(Blend->getIncomingValue(0));
+    for (unsigned I = 1; I != Blend->getNumIncomingValues(); ++I)
+      if (!match(Blend->getMask(I), m_False()))
+        UniqueValues.insert(Blend->getIncomingValue(I));
+
+    if (UniqueValues.size() == 1) {
+      Blend->replaceAllUsesWith(*UniqueValues.begin());
+      Blend->eraseFromParent();
+      return;
+    }
+
+    if (Blend->isNormalized())
+      return;
+
+    // Normalize the blend so its first incoming value is used as the initial
+    // value with the others blended into it.
+
+    unsigned StartIndex = 0;
+    for (unsigned I = 0; I != Blend->getNumIncomingValues(); ++I) {
+      // If a value's mask is used only by the blend then is can be deadcoded.
+      // TODO: Find the most expensive mask that can be deadcoded, or a mask
+      // that's used by multiple blends where it can be removed from them all.
+      VPValue *Mask = Blend->getMask(I);
+      if (Mask->getNumUsers() == 1 && !match(Mask, m_False())) {
+        StartIndex = I;
+        break;
+      }
+    }
+
+    SmallVector<VPValue *, 4> OperandsWithMask;
+    OperandsWithMask.push_back(Blend->getIncomingValue(StartIndex));
+
+    for (unsigned I = 0; I != Blend->getNumIncomingValues(); ++I) {
+      if (I == StartIndex)
+        continue;
+      OperandsWithMask.push_back(Blend->getIncomingValue(I));
+      OperandsWithMask.push_back(Blend->getMask(I));
+    }
+
+    auto *NewBlend = new VPBlendRecipe(
+        cast<PHINode>(Blend->getUnderlyingValue()), OperandsWithMask);
+    NewBlend->insertBefore(&R);
+
+    VPValue *DeadMask = Blend->getMask(StartIndex);
+    Blend->replaceAllUsesWith(NewBlend);
+    Blend->eraseFromParent();
+    recursivelyDeleteDeadRecipes(DeadMask);
+    return;
+  }
+
+  VPValue *A;
+  if (match(&R, m_Trunc(m_ZExtOrSExt(m_VPValue(A))))) {
+    VPValue *Trunc = R.getVPSingleValue();
+    Type *TruncTy = TypeInfo.inferScalarType(Trunc);
+    Type *ATy = TypeInfo.inferScalarType(A);
+    if (TruncTy == ATy) {
+      Trunc->replaceAllUsesWith(A);
+    } else {
+      // Don't replace a scalarizing recipe with a widened cast.
+      if (isa<VPReplicateRecipe>(&R))
+        return;
+      if (ATy->getScalarSizeInBits() < TruncTy->getScalarSizeInBits()) {
+
+        unsigned ExtOpcode = match(R.getOperand(0), m_SExt(m_VPValue()))
+                                 ? Instruction::SExt
+                                 : Instruction::ZExt;
+        auto *VPC =
+            new VPWidenCastRecipe(Instruction::CastOps(ExtOpcode), A, TruncTy);
+        if (auto *UnderlyingExt = R.getOperand(0)->getUnderlyingValue()) {
+          // UnderlyingExt has distinct return type, used to retain legacy cost.
+          VPC->setUnderlyingValue(UnderlyingExt);
+        }
+        VPC->insertBefore(&R);
+        Trunc->replaceAllUsesWith(VPC);
+      } else if (ATy->getScalarSizeInBits() > TruncTy->getScalarSizeInBits()) {
+        auto *VPC = new VPWidenCastRecipe(Instruction::Trunc, A, TruncTy);
+        VPC->insertBefore(&R);
+        Trunc->replaceAllUsesWith(VPC);
+      }
+    }
+#ifndef NDEBUG
+    // Verify that the cached type info is for both A and its users is still
+    // accurate by comparing it to freshly computed types.
+    VPTypeAnalysis TypeInfo2(
+        R.getParent()->getPlan()->getCanonicalIV()->getScalarType());
+    assert(TypeInfo.inferScalarType(A) == TypeInfo2.inferScalarType(A));
+    for (VPUser *U : A->users()) {
+      auto *R = cast<VPRecipeBase>(U);
+      for (VPValue *VPV : R->definedValues())
+        assert(TypeInfo.inferScalarType(VPV) == TypeInfo2.inferScalarType(VPV));
+    }
+#endif
+  }
+
+  // Simplify (X && Y) || (X && !Y) -> X.
+  // TODO: Split up into simpler, modular combines: (X && Y) || (X && Z) into X
+  // && (Y || Z) and (X || !X) into true. This requires queuing newly created
+  // recipes to be visited during simplification.
+  VPValue *X, *Y, *X1, *Y1;
+  if (match(&R,
+            m_c_BinaryOr(m_LogicalAnd(m_VPValue(X), m_VPValue(Y)),
+                         m_LogicalAnd(m_VPValue(X1), m_Not(m_VPValue(Y1))))) &&
+      X == X1 && Y == Y1) {
+    R.getVPSingleValue()->replaceAllUsesWith(X);
+    R.eraseFromParent();
+    return;
+  }
+
+  if (match(&R, m_c_Mul(m_VPValue(A), m_SpecificInt(1))))
+    return R.getVPSingleValue()->replaceAllUsesWith(A);
+
+  if (match(&R, m_Not(m_Not(m_VPValue(A)))))
+    return R.getVPSingleValue()->replaceAllUsesWith(A);
+
+  // Remove redundant DerviedIVs, that is 0 + A * 1 -> A and 0 + 0 * x -> 0.
+  if ((match(&R,
+             m_DerivedIV(m_SpecificInt(0), m_VPValue(A), m_SpecificInt(1))) ||
+       match(&R,
+             m_DerivedIV(m_SpecificInt(0), m_SpecificInt(0), m_VPValue()))) &&
+      TypeInfo.inferScalarType(R.getOperand(1)) ==
+          TypeInfo.inferScalarType(R.getVPSingleValue()))
+    return R.getVPSingleValue()->replaceAllUsesWith(R.getOperand(1));
+}
+
+/// Try to simplify the recipes in \p Plan. Use \p CanonicalIVTy as type for all
+/// un-typed live-ins in VPTypeAnalysis.
+static void simplifyRecipes(VPlan &Plan, Type *CanonicalIVTy) {
+  ReversePostOrderTraversal<VPBlockDeepTraversalWrapper<VPBlockBase *>> RPOT(
+      Plan.getEntry());
+  VPTypeAnalysis TypeInfo(CanonicalIVTy);
+  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(RPOT)) {
+    for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
+      simplifyRecipe(R, TypeInfo);
+    }
+  }
+}
+
 void VPlanTransforms::optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF,
                                          unsigned BestUF,
                                          PredicatedScalarEvolution &PSE) {
   assert(Plan.hasVF(BestVF) && "BestVF is not available in Plan");
   assert(Plan.hasUF(BestUF) && "BestUF is not available in Plan");
-  VPBasicBlock *ExitingVPBB =
-      Plan.getVectorLoopRegion()->getExitingBasicBlock();
+  VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
+  VPBasicBlock *ExitingVPBB = VectorRegion->getExitingBasicBlock();
   auto *Term = &ExitingVPBB->back();
   // Try to simplify the branch condition if TC <= VF * UF when preparing to
   // execute the plan for the main vector loop. We only do this if the
@@ -690,16 +880,44 @@ void VPlanTransforms::optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF,
       !SE.isKnownPredicate(CmpInst::ICMP_ULE, TripCount, C))
     return;
 
-  LLVMContext &Ctx = SE.getContext();
-  auto *BOC = new VPInstruction(
-      VPInstruction::BranchOnCond,
-      {Plan.getOrAddLiveIn(ConstantInt::getTrue(Ctx))}, Term->getDebugLoc());
+  // The vector loop region only executes once. If possible, completely remove
+  // the region, otherwise replace the terminator controlling the latch with
+  // (BranchOnCond true).
+  auto *Header = cast<VPBasicBlock>(VectorRegion->getEntry());
+  auto *CanIVTy = Plan.getCanonicalIV()->getScalarType();
+  if (all_of(
+          Header->phis(),
+          IsaPred<VPCanonicalIVPHIRecipe, VPFirstOrderRecurrencePHIRecipe>)) {
+    for (VPRecipeBase &HeaderR : make_early_inc_range(Header->phis())) {
+      auto *HeaderPhiR = cast<VPHeaderPHIRecipe>(&HeaderR);
+      HeaderPhiR->replaceAllUsesWith(HeaderPhiR->getStartValue());
+      HeaderPhiR->eraseFromParent();
+    }
+
+    VPBlockBase *Preheader = VectorRegion->getSinglePredecessor();
+    VPBlockBase *Exit = VectorRegion->getSingleSuccessor();
+    VPBlockUtils::disconnectBlocks(Preheader, VectorRegion);
+    VPBlockUtils::disconnectBlocks(VectorRegion, Exit);
+
+    for (VPBlockBase *B : vp_depth_first_shallow(VectorRegion->getEntry()))
+      B->setParent(nullptr);
+
+    VPBlockUtils::connectBlocks(Preheader, Header);
+    VPBlockUtils::connectBlocks(ExitingVPBB, Exit);
+    simplifyRecipes(Plan, CanIVTy);
+  } else {
+    // The vector region contains header phis for which we cannot remove the
+    // loop region yet.
+    LLVMContext &Ctx = SE.getContext();
+    auto *BOC = new VPInstruction(
+        VPInstruction::BranchOnCond,
+        {Plan.getOrAddLiveIn(ConstantInt::getTrue(Ctx))}, Term->getDebugLoc());
+    ExitingVPBB->appendRecipe(BOC);
+  }
 
-  SmallVector<VPValue *> PossiblyDead(Term->operands());
   Term->eraseFromParent();
-  for (VPValue *Op : PossiblyDead)
-    recursivelyDeleteDeadRecipes(Op);
-  ExitingVPBB->appendRecipe(BOC);
+  VPlanTransforms::removeDeadRecipes(Plan);
+
   Plan.setVF(BestVF);
   Plan.setUF(BestUF);
   // TODO: Further simplifications are possible
@@ -910,18 +1128,6 @@ bool VPlanTransforms::adjustFixedOrderRecurrences(VPlan &Plan,
   return true;
 }
 
-static SmallVector<VPUser *> collectUsersRecursively(VPValue *V) {
-  SetVector<VPUser *> Users(V->user_begin(), V->user_end());
-  for (unsigned I = 0; I != Users.size(); ++I) {
-    VPRecipeBase *Cur = cast<VPRecipeBase>(Users[I]);
-    if (isa<VPHeaderPHIRecipe>(Cur))
-      continue;
-    for (VPValue *V : Cur->definedValues())
-      Users.insert(V->user_begin(), V->user_end());
-  }
-  return Users.takeVector();
-}
-
 void VPlanTransforms::clearReductionWrapFlags(VPlan &Plan) {
   for (VPRecipeBase &R :
        Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
@@ -940,138 +1146,6 @@ void VPlanTransforms::clearReductionWrapFlags(VPlan &Plan) {
   }
 }
 
-/// Try to simplify recipe \p R.
-static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) {
-  using namespace llvm::VPlanPatternMatch;
-
-  if (auto *Blend = dyn_cast<VPBlendRecipe>(&R)) {
-    // Try to remove redundant blend recipes.
-    SmallPtrSet<VPValue *, 4> UniqueValues;
-    if (Blend->isNormalized() || !match(Blend->getMask(0), m_False()))
-      UniqueValues.insert(Blend->getIncomingValue(0));
-    for (unsigned I = 1; I != Blend->getNumIncomingValues(); ++I)
-      if (!match(Blend->getMask(I), m_False()))
-        UniqueValues.insert(Blend->getIncomingValue(I));
-
-    if (UniqueValues.size() == 1) {
-      Blend->replaceAllUsesWith(*UniqueValues.begin());
-      Blend->eraseFromParent();
-      return;
-    }
-
-    if (Blend->isNormalized())
-      return;
-
-    // Normalize the blend so its first incoming value is used as the initial
-    // value with the others blended into it.
-
-    unsigned StartIndex = 0;
-    for (unsigned I = 0; I != Blend->getNumIncomingValues(); ++I) {
-      // If a value's mask is used only by the blend then is can be deadcoded.
-      // TODO: Find the most expensive mask that can be deadcoded, or a mask
-      // that's used by multiple blends where it can be removed from them all.
-      VPValue *Mask = Blend->getMask(I);
-      if (Mask->getNumUsers() == 1 && !match(Mask, m_False())) {
-        StartIndex = I;
-        break;
-      }
-    }
-
-    SmallVector<VPValue *, 4> OperandsWithMask;
-    OperandsWithMask.push_back(Blend->getIncomingValue(StartIndex));
-
-    for (unsigned I = 0; I != Blend->getNumIncomingValues(); ++I) {
-      if (I == StartIndex)
-        continue;
-      OperandsWithMask.push_back(Blend->getIncomingValue(I));
-      OperandsWithMask.push_back(Blend->getMask(I));
-    }
-
-    auto *NewBlend = new VPBlendRecipe(
-        cast<PHINode>(Blend->getUnderlyingValue()), OperandsWithMask);
-    NewBlend->insertBefore(&R);
-
-    VPValue *DeadMask = Blend->getMask(StartIndex);
-    Blend->replaceAllUsesWith(NewBlend);
-    Blend->eraseFromParent();
-    recursivelyDeleteDeadRecipes(DeadMask);
-    return;
-  }
-
-  VPValue *A;
-  if (match(&R, m_Trunc(m_ZExtOrSExt(m_VPValue(A))))) {
-    VPValue *Trunc = R.getVPSingleValue();
-    Type *TruncTy = TypeInfo.inferScalarType(Trunc);
-    Type *ATy = TypeInfo.inferScalarType(A);
-    if (TruncTy == ATy) {
-      Trunc->replaceAllUsesWith(A);
-    } else {
-      // Don't replace a scalarizing recipe with a widened cast.
-      if (isa<VPReplicateRecipe>(&R))
-        return;
-      if (ATy->getScalarSizeInBits() < TruncTy->getScalarSizeInBits()) {
-
-        unsigned ExtOpcode = match(R.getOperand(0), m_SExt(m_VPValue()))
-                                 ? Instruction::SExt
-                                 : Instruction::ZExt;
-        auto *VPC =
-            new VPWidenCastRecipe(Instruction::CastOps(ExtOpcode), A, TruncTy);
-        if (auto *UnderlyingExt = R.getOperand(0)->getUnderlyingValue()) {
-          // UnderlyingExt has distinct return type, used to retain legacy cost.
-          VPC->setUnderlyingValue(UnderlyingExt);
-        }
-        VPC->insertBefore(&R);
-        Trunc->replaceAllUsesWith(VPC);
-      } else if (ATy->getScalarSizeInBits() > TruncTy->getScalarSizeInBits()) {
-        auto *VPC = new VPWidenCastRecipe(Instruction::Trunc, A, TruncTy);
-        VPC->insertBefore(&R);
-        Trunc->replaceAllUsesWith(VPC);
-      }
-    }
-#ifndef NDEBUG
-    // Verify that the cached type info is for both A and its users is still
-    // accurate by comparing it to freshly computed types.
-    VPTypeAnalysis TypeInfo2(
-        R.getParent()->getPlan()->getCanonicalIV()->getScalarType());
-    assert(TypeInfo.inferScalarType(A) == TypeInfo2.inferScalarType(A));
-    for (VPUser *U : A->users()) {
-      auto *R = cast<VPRecipeBase>(U);
-      for (VPValue *VPV : R->definedValues())
-        assert(TypeInfo.inferScalarType(VPV) == TypeInfo2.inferScalarType(VPV));
-    }
-#endif
-  }
-
-  // Simplify (X && Y) || (X && !Y) -> X.
-  // TODO: Split up into simpler, modular combines: (X && Y) || (X && Z) into X
-  // && (Y || Z) and (X || !X) into true. This requires queuing newly created
-  // recipes to be visited during simplification.
-  VPValue *X, *Y, *X1, *Y1;
-  if (match(&R,
-            m_c_BinaryOr(m_LogicalAnd(m_VPValue(X), m_VPValue(Y)),
-                         m_LogicalAnd(m_VPValue(X1), m_Not(m_VPValue(Y1))))) &&
-      X == X1 && Y == Y1) {
-    R.getVPSingleValue()->replaceAllUsesWith(X);
-    R.eraseFromParent();
-    return;
-  }
-
-  if (match(&R, m_c_Mul(m_VPValue(A), m_SpecificInt(1))))
-    return R.getVPSingleValue()->replaceAllUsesWith(A);
-
-  if (match(&R, m_Not(m_Not(m_VPValue(A)))))
-    return R.getVPSingleValue()->replaceAllUsesWith(A);
-
-  // Remove redundant DerviedIVs, that is 0 + A * 1 -> A and 0 + 0 * x -> 0.
-  if ((match(&R,
-             m_DerivedIV(m_SpecificInt(0), m_VPValue(A), m_SpecificInt(1))) ||
-       match(&R,
-             m_DerivedIV(m_SpecificInt(0), m_SpecificInt(0), m_VPValue()))) &&
-      TypeInfo.inferScalarType(R.getOperand(1)) ==
-          TypeInfo.inferScalarType(R.getVPSingleValue()))
-    return R.getVPSingleValue()->replaceAllUsesWith(R.getOperand(1));
-}
-
 /// Move loop-invariant recipes out of the vector loop region in \p Plan.
 static void licm(VPlan &Plan) {
   VPBasicBlock *Preheader = Plan.getVectorPreheader();
@@ -1106,19 +1180,6 @@ static void licm(VPlan &Plan) {
   }
 }
 
-/// Try to simplify the recipes in \p Plan.
-static void simplifyRecipes(VPlan &Plan) {
-  ReversePostOrderTraversal<VPBlockDeepTraversalWrapper<VPBlockBase *>> RPOT(
-      Plan.getEntry());
-  Type *CanonicalIVType = Plan.getCanonicalIV()->getScalarType();
-  VPTypeAnalysis TypeInfo(CanonicalIVType);
-  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(RPOT)) {
-    for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
-      simplifyRecipe(R, TypeInfo);
-    }
-  }
-}
-
 void VPlanTransforms::truncateToMinimalBitwidths(
     VPlan &Plan, const MapVector<Instruction *, uint64_t> &MinBWs) {
 #ifndef NDEBUG
@@ -1256,10 +1317,10 @@ void VPlanTransforms::optimize(VPlan &Plan) {
   removeRedundantCanonicalIVs(Plan);
   removeRedundantInductionCasts(Plan);
 
-  simplifyRecipes(Plan);
+  simplifyRecipes(Plan, Plan.getCanonicalIV()->getScalarType());
   legalizeAndOptimizeInductions(Plan);
   removeRedundantExpandSCEVRecipes(Plan);
-  simplifyRecipes(Plan);
+  simplifyRecipes(Plan, Plan.getCanonicalIV()->getScalarType());
   removeDeadRecipes(Plan);
 
   createAndOptimizeReplicateRegions(Plan);
@@ -1496,10 +1557,13 @@ static VPRecipeBase *createEVLRecipe(VPValue *HeaderMask,
               auto *CastR = cast<VPWidenCastRecipe>(CR);
               VPID = VPIntrinsic::getForOpcode(CastR->getOpcode());
             }
-            assert(VPID != Intrinsic::not_intrinsic && "Expected VP intrinsic");
+
+            // Not all intrinsics have a corresponding VP intrinsic.
+            if (VPID == Intrinsic::not_intrinsic)
+              return nullptr;
             assert(VPIntrinsic::getMaskParamPos(VPID) &&
                    VPIntrinsic::getVectorLengthParamPos(VPID) &&
-                   "Expected VP intrinsic");
+                   "Expected VP intrinsic to have mask and EVL");
 
             SmallVector<VPValue *> Ops(CR->operands());
             Ops.push_back(&AllOneMask);
@@ -1656,9 +1720,9 @@ bool VPlanTransforms::tryAddExplicitVectorLength(
   VPSingleDefRecipe *OpVPEVL = VPEVL;
   if (unsigned IVSize = CanonicalIVPHI->getScalarType()->getScalarSizeInBits();
       IVSize != 32) {
-    OpVPEVL = new VPScalarCastRecipe(IVSize < 32 ? Instruction::Trunc
-                                                 : Instruction::ZExt,
-                                     OpVPEVL, CanonicalIVPHI->getScalarType());
+    OpVPEVL = new VPScalarCastRecipe(
+        IVSize < 32 ? Instruction::Trunc : Instruction::ZExt, OpVPEVL,
+        CanonicalIVPHI->getScalarType(), CanonicalIVIncrement->getDebugLoc());
     OpVPEVL->insertBefore(CanonicalIVIncrement);
   }
   auto *NextEVLIV =
@@ -1898,7 +1962,7 @@ void VPlanTransforms::handleUncountableEarlyExit(
   if (OrigLoop->getUniqueExitBlock()) {
     VPEarlyExitBlock = cast<VPIRBasicBlock>(MiddleVPBB->getSuccessors()[0]);
   } else {
-    VPEarlyExitBlock = VPIRBasicBlock::fromBasicBlock(
+    VPEarlyExitBlock = Plan.createVPIRBasicBlock(
         !OrigLoop->contains(TrueSucc) ? TrueSucc : FalseSucc);
   }
 
@@ -1908,7 +1972,7 @@ void VPlanTransforms::handleUncountableEarlyExit(
   IsEarlyExitTaken =
       Builder.createNaryOp(VPInstruction::AnyOf, {EarlyExitTakenCond});
 
-  VPBasicBlock *NewMiddle = new VPBasicBlock("middle.split");
+  VPBasicBlock *NewMiddle = Plan.createVPBasicBlock("middle.split");
   VPBlockUtils::insertOnEdge(LoopRegion, MiddleVPBB, NewMiddle);
   VPBlockUtils::connectBlocks(NewMiddle, VPEarlyExitBlock);
   NewMiddle->swapSuccessors();
diff --git a/llvm/lib/Transforms/Vectorize/VPlanUtils.h b/llvm/lib/Transforms/Vectorize/VPlanUtils.h
index 9657770..7779442 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanUtils.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanUtils.h
@@ -49,7 +49,8 @@ inline bool isUniformAfterVectorization(const VPValue *VPV) {
     return all_of(GEP->operands(), isUniformAfterVectorization);
   if (auto *VPI = dyn_cast<VPInstruction>(Def))
     return VPI->isSingleScalar() || VPI->isVectorToScalar();
-  return false;
+  // VPExpandSCEVRecipes must be placed in the entry and are alway uniform.
+  return isa<VPExpandSCEVRecipe>(Def);
 }
 
 /// Return true if \p V is a header mask in \p Plan.
diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index ecbc13d..1a669b5 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -128,6 +128,8 @@ private:
   bool shrinkType(Instruction &I);
 
   void replaceValue(Value &Old, Value &New) {
+    LLVM_DEBUG(dbgs() << "VC: Replacing: " << Old << '\n');
+    LLVM_DEBUG(dbgs() << "         With: " << New << '\n');
     Old.replaceAllUsesWith(&New);
     if (auto *NewI = dyn_cast<Instruction>(&New)) {
       New.takeName(&Old);
@@ -139,10 +141,17 @@ private:
 
   void eraseInstruction(Instruction &I) {
     LLVM_DEBUG(dbgs() << "VC: Erasing: " << I << '\n');
-    for (Value *Op : I.operands())
-      Worklist.pushValue(Op);
+    SmallVector<Value *> Ops(I.operands());
     Worklist.remove(&I);
     I.eraseFromParent();
+
+    // Push remaining users of the operands and then the operand itself - allows
+    // further folds that were hindered by OneUse limits.
+    for (Value *Op : Ops)
+      if (auto *OpI = dyn_cast<Instruction>(Op)) {
+        Worklist.pushUsersToWorkList(*OpI);
+        Worklist.pushValue(OpI);
+      }
   }
 };
 } // namespace
@@ -696,7 +705,8 @@ bool VectorCombine::foldInsExtFNeg(Instruction &I) {
 
   InstructionCost NewCost =
       TTI.getArithmeticInstrCost(Instruction::FNeg, VecTy, CostKind) +
-      TTI.getShuffleCost(TargetTransformInfo::SK_Select, VecTy, Mask, CostKind);
+      TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, VecTy, Mask,
+                         CostKind);
 
   bool NeedLenChg = SrcVecTy->getNumElements() != NumElts;
   // If the lengths of the two vectors are not equal,
@@ -1335,6 +1345,10 @@ bool VectorCombine::foldSingleElementStore(Instruction &I) {
                              MemoryLocation::get(SI), AA))
       return false;
 
+    // Ensure we add the load back to the worklist BEFORE its users so they can
+    // erased in the correct order.
+    Worklist.push(Load);
+
     if (ScalarizableIdx.isSafeWithFreeze())
       ScalarizableIdx.freeze(Builder, *cast<Instruction>(Idx));
     Value *GEP = Builder.CreateInBoundsGEP(
@@ -1360,8 +1374,8 @@ bool VectorCombine::scalarizeLoadExtract(Instruction &I) {
   if (!match(&I, m_Load(m_Value(Ptr))))
     return false;
 
-  auto *VecTy = cast<VectorType>(I.getType());
   auto *LI = cast<LoadInst>(&I);
+  auto *VecTy = cast<VectorType>(LI->getType());
   if (LI->isVolatile() || !DL->typeSizeEqualsStoreSize(VecTy->getScalarType()))
     return false;
 
@@ -1401,7 +1415,8 @@ bool VectorCombine::scalarizeLoadExtract(Instruction &I) {
       LastCheckedInst = UI;
     }
 
-    auto ScalarIdx = canScalarizeAccess(VecTy, UI->getOperand(1), &I, AC, DT);
+    auto ScalarIdx =
+        canScalarizeAccess(VecTy, UI->getIndexOperand(), LI, AC, DT);
     if (ScalarIdx.isUnsafe())
       return false;
     if (ScalarIdx.isSafeWithFreeze()) {
@@ -1409,7 +1424,7 @@ bool VectorCombine::scalarizeLoadExtract(Instruction &I) {
       ScalarIdx.discard();
     }
 
-    auto *Index = dyn_cast<ConstantInt>(UI->getOperand(1));
+    auto *Index = dyn_cast<ConstantInt>(UI->getIndexOperand());
     OriginalCost +=
         TTI.getVectorInstrCost(Instruction::ExtractElement, VecTy, CostKind,
                                Index ? Index->getZExtValue() : -1);
@@ -1422,10 +1437,14 @@ bool VectorCombine::scalarizeLoadExtract(Instruction &I) {
   if (ScalarizedCost >= OriginalCost)
     return false;
 
+  // Ensure we add the load back to the worklist BEFORE its users so they can
+  // erased in the correct order.
+  Worklist.push(LI);
+
   // Replace extracts with narrow scalar loads.
   for (User *U : LI->users()) {
     auto *EI = cast<ExtractElementInst>(U);
-    Value *Idx = EI->getOperand(1);
+    Value *Idx = EI->getIndexOperand();
 
     // Insert 'freeze' for poison indexes.
     auto It = NeedFreeze.find(EI);
@@ -1669,7 +1688,8 @@ bool VectorCombine::foldShuffleOfBinops(Instruction &I) {
 
   Value *X, *Y, *Z, *W;
   bool IsCommutative = false;
-  CmpPredicate Pred = CmpInst::BAD_ICMP_PREDICATE;
+  CmpPredicate PredLHS = CmpInst::BAD_ICMP_PREDICATE;
+  CmpPredicate PredRHS = CmpInst::BAD_ICMP_PREDICATE;
   if (match(LHS, m_BinOp(m_Value(X), m_Value(Y))) &&
       match(RHS, m_BinOp(m_Value(Z), m_Value(W)))) {
     auto *BO = cast<BinaryOperator>(LHS);
@@ -1677,8 +1697,9 @@ bool VectorCombine::foldShuffleOfBinops(Instruction &I) {
     if (llvm::is_contained(OldMask, PoisonMaskElem) && BO->isIntDivRem())
       return false;
     IsCommutative = BinaryOperator::isCommutative(BO->getOpcode());
-  } else if (match(LHS, m_Cmp(Pred, m_Value(X), m_Value(Y))) &&
-             match(RHS, m_SpecificCmp(Pred, m_Value(Z), m_Value(W)))) {
+  } else if (match(LHS, m_Cmp(PredLHS, m_Value(X), m_Value(Y))) &&
+             match(RHS, m_Cmp(PredRHS, m_Value(Z), m_Value(W))) &&
+             (CmpInst::Predicate)PredLHS == (CmpInst::Predicate)PredRHS) {
     IsCommutative = cast<CmpInst>(LHS)->isCommutative();
   } else
     return false;
@@ -1723,18 +1744,48 @@ bool VectorCombine::foldShuffleOfBinops(Instruction &I) {
       TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, BinResTy,
                          OldMask, CostKind, 0, nullptr, {LHS, RHS}, &I);
 
+  // Handle shuffle(binop(shuffle(x),y),binop(z,shuffle(w))) style patterns
+  // where one use shuffles have gotten split across the binop/cmp. These
+  // often allow a major reduction in total cost that wouldn't happen as
+  // individual folds.
+  auto MergeInner = [&](Value *&Op, int Offset, MutableArrayRef<int> Mask,
+                        TTI::TargetCostKind CostKind) -> bool {
+    Value *InnerOp;
+    ArrayRef<int> InnerMask;
+    if (match(Op, m_OneUse(m_Shuffle(m_Value(InnerOp), m_Undef(),
+                                     m_Mask(InnerMask)))) &&
+        InnerOp->getType() == Op->getType() &&
+        all_of(InnerMask,
+               [NumSrcElts](int M) { return M < (int)NumSrcElts; })) {
+      for (int &M : Mask)
+        if (Offset <= M && M < (int)(Offset + NumSrcElts)) {
+          M = InnerMask[M - Offset];
+          M = 0 <= M ? M + Offset : M;
+        }
+      OldCost += TTI.getInstructionCost(cast<Instruction>(Op), CostKind);
+      Op = InnerOp;
+      return true;
+    }
+    return false;
+  };
+  bool ReducedInstCount = false;
+  ReducedInstCount |= MergeInner(X, 0, NewMask0, CostKind);
+  ReducedInstCount |= MergeInner(Y, 0, NewMask1, CostKind);
+  ReducedInstCount |= MergeInner(Z, NumSrcElts, NewMask0, CostKind);
+  ReducedInstCount |= MergeInner(W, NumSrcElts, NewMask1, CostKind);
+
   InstructionCost NewCost =
       TTI.getShuffleCost(SK0, BinOpTy, NewMask0, CostKind, 0, nullptr, {X, Z}) +
       TTI.getShuffleCost(SK1, BinOpTy, NewMask1, CostKind, 0, nullptr, {Y, W});
 
-  if (Pred == CmpInst::BAD_ICMP_PREDICATE) {
+  if (PredLHS == CmpInst::BAD_ICMP_PREDICATE) {
     NewCost +=
         TTI.getArithmeticInstrCost(LHS->getOpcode(), ShuffleDstTy, CostKind);
   } else {
     auto *ShuffleCmpTy =
         FixedVectorType::get(BinOpTy->getElementType(), ShuffleDstTy);
     NewCost += TTI.getCmpSelInstrCost(LHS->getOpcode(), ShuffleCmpTy,
-                                      ShuffleDstTy, Pred, CostKind);
+                                      ShuffleDstTy, PredLHS, CostKind);
   }
 
   LLVM_DEBUG(dbgs() << "Found a shuffle feeding two binops: " << I
@@ -1743,17 +1794,17 @@ bool VectorCombine::foldShuffleOfBinops(Instruction &I) {
 
   // If either shuffle will constant fold away, then fold for the same cost as
   // we will reduce the instruction count.
-  bool ReducedInstCount = (isa<Constant>(X) && isa<Constant>(Z)) ||
-                          (isa<Constant>(Y) && isa<Constant>(W));
+  ReducedInstCount |= (isa<Constant>(X) && isa<Constant>(Z)) ||
+                      (isa<Constant>(Y) && isa<Constant>(W));
   if (ReducedInstCount ? (NewCost > OldCost) : (NewCost >= OldCost))
     return false;
 
   Value *Shuf0 = Builder.CreateShuffleVector(X, Z, NewMask0);
   Value *Shuf1 = Builder.CreateShuffleVector(Y, W, NewMask1);
-  Value *NewBO = Pred == CmpInst::BAD_ICMP_PREDICATE
+  Value *NewBO = PredLHS == CmpInst::BAD_ICMP_PREDICATE
                      ? Builder.CreateBinOp(
                            cast<BinaryOperator>(LHS)->getOpcode(), Shuf0, Shuf1)
-                     : Builder.CreateCmp(Pred, Shuf0, Shuf1);
+                     : Builder.CreateCmp(PredLHS, Shuf0, Shuf1);
 
   // Intersect flags from the old binops.
   if (auto *NewInst = dyn_cast<Instruction>(NewBO)) {
@@ -1972,9 +2023,7 @@ bool VectorCombine::foldShuffleOfShuffles(Instruction &I) {
   if (Match1)
     InnerCost1 = TTI.getInstructionCost(cast<Instruction>(OuterV1), CostKind);
 
-  InstructionCost OuterCost = TTI.getShuffleCost(
-      TargetTransformInfo::SK_PermuteTwoSrc, ShuffleImmTy, OuterMask, CostKind,
-      0, nullptr, {OuterV0, OuterV1}, &I);
+  InstructionCost OuterCost = TTI.getInstructionCost(&I, CostKind);
 
   InstructionCost OldCost = InnerCost0 + InnerCost1 + OuterCost;
 
@@ -3047,12 +3096,16 @@ bool VectorCombine::foldInsExtVectorToShuffle(Instruction &I) {
       TTI.getVectorInstrCost(*Ext, VecTy, CostKind, ExtIdx);
   InstructionCost OldCost = ExtCost + InsCost;
 
-  InstructionCost NewCost = TTI.getShuffleCost(SK, VecTy, Mask, CostKind, 0,
-                                               nullptr, {DstVec, SrcVec});
+  // Ignore 'free' identity insertion shuffle.
+  // TODO: getShuffleCost should return TCC_Free for Identity shuffles.
+  InstructionCost NewCost = 0;
+  if (!ShuffleVectorInst::isIdentityMask(Mask, NumElts))
+    NewCost += TTI.getShuffleCost(SK, VecTy, Mask, CostKind, 0, nullptr,
+                                  {DstVec, SrcVec});
   if (!Ext->hasOneUse())
     NewCost += ExtCost;
 
-  LLVM_DEBUG(dbgs() << "Found a insert/extract shuffle-like pair : " << I
+  LLVM_DEBUG(dbgs() << "Found a insert/extract shuffle-like pair: " << I
                     << "\n  OldCost: " << OldCost << " vs NewCost: " << NewCost
                     << "\n");
 
diff --git a/llvm/test/Analysis/CostModel/AArch64/cast.ll b/llvm/test/Analysis/CostModel/AArch64/cast.ll
index c363b58..b82c2f1 100644
--- a/llvm/test/Analysis/CostModel/AArch64/cast.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/cast.ll
@@ -1,6 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
 ; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=aarch64 %s | FileCheck --check-prefixes=CHECK,CHECK-NOFP16 %s
 ; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=aarch64 -mattr=+fullfp16 %s | FileCheck --check-prefixes=CHECK,CHECK-FP16 %s
+; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=aarch64 -mattr=+fullfp16,+bf16 %s | FileCheck --check-prefixes=CHECK,CHECK-FP16,CHECK-BF16 %s
 
 target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
 
@@ -1237,28 +1238,51 @@ define void @fp16cast() {
 }
 
 define void @bf16cast() {
-; CHECK-LABEL: 'bf16cast'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %extf16f32 = fpext bfloat undef to float
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %extv2f16f32 = fpext <2 x bfloat> undef to <2 x float>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %extv4f16f32 = fpext <4 x bfloat> undef to <4 x float>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %extv8f16f32 = fpext <8 x bfloat> undef to <8 x float>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %extv16f16f32 = fpext <16 x bfloat> undef to <16 x float>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %extf16f64 = fpext bfloat undef to double
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %extv2f16f64 = fpext <2 x bfloat> undef to <2 x double>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %extv4f16f64 = fpext <4 x bfloat> undef to <4 x double>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %extv8f16f64 = fpext <8 x bfloat> undef to <8 x double>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %extv16f16f64 = fpext <16 x bfloat> undef to <16 x double>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %truncf16f32 = fptrunc float undef to bfloat
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %truncv2f16f32 = fptrunc <2 x float> undef to <2 x bfloat>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %truncv4f16f32 = fptrunc <4 x float> undef to <4 x bfloat>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %truncv8f16f32 = fptrunc <8 x float> undef to <8 x bfloat>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %truncv16f16f32 = fptrunc <16 x float> undef to <16 x bfloat>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %truncf16f64 = fptrunc double undef to bfloat
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %truncv2f16f64 = fptrunc <2 x double> undef to <2 x bfloat>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %truncv4f16f64 = fptrunc <4 x double> undef to <4 x bfloat>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %truncv8f16f64 = fptrunc <8 x double> undef to <8 x bfloat>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %truncv16f16f64 = fptrunc <16 x double> undef to <16 x bfloat>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-NOFP16-LABEL: 'bf16cast'
+; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %extf16f32 = fpext bfloat undef to float
+; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %extv2f16f32 = fpext <2 x bfloat> undef to <2 x float>
+; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %extv4f16f32 = fpext <4 x bfloat> undef to <4 x float>
+; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %extv8f16f32 = fpext <8 x bfloat> undef to <8 x float>
+; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %extv16f16f32 = fpext <16 x bfloat> undef to <16 x float>
+; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %extf16f64 = fpext bfloat undef to double
+; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %extv2f16f64 = fpext <2 x bfloat> undef to <2 x double>
+; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %extv4f16f64 = fpext <4 x bfloat> undef to <4 x double>
+; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %extv8f16f64 = fpext <8 x bfloat> undef to <8 x double>
+; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %extv16f16f64 = fpext <16 x bfloat> undef to <16 x double>
+; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %truncf16f32 = fptrunc float undef to bfloat
+; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %truncv2f16f32 = fptrunc <2 x float> undef to <2 x bfloat>
+; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %truncv4f16f32 = fptrunc <4 x float> undef to <4 x bfloat>
+; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %truncv8f16f32 = fptrunc <8 x float> undef to <8 x bfloat>
+; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %truncv16f16f32 = fptrunc <16 x float> undef to <16 x bfloat>
+; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %truncf16f64 = fptrunc double undef to bfloat
+; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %truncv2f16f64 = fptrunc <2 x double> undef to <2 x bfloat>
+; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %truncv4f16f64 = fptrunc <4 x double> undef to <4 x bfloat>
+; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %truncv8f16f64 = fptrunc <8 x double> undef to <8 x bfloat>
+; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %truncv16f16f64 = fptrunc <16 x double> undef to <16 x bfloat>
+; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; CHECK-BF16-LABEL: 'bf16cast'
+; CHECK-BF16-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %extf16f32 = fpext bfloat undef to float
+; CHECK-BF16-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %extv2f16f32 = fpext <2 x bfloat> undef to <2 x float>
+; CHECK-BF16-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %extv4f16f32 = fpext <4 x bfloat> undef to <4 x float>
+; CHECK-BF16-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %extv8f16f32 = fpext <8 x bfloat> undef to <8 x float>
+; CHECK-BF16-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %extv16f16f32 = fpext <16 x bfloat> undef to <16 x float>
+; CHECK-BF16-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %extf16f64 = fpext bfloat undef to double
+; CHECK-BF16-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %extv2f16f64 = fpext <2 x bfloat> undef to <2 x double>
+; CHECK-BF16-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %extv4f16f64 = fpext <4 x bfloat> undef to <4 x double>
+; CHECK-BF16-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %extv8f16f64 = fpext <8 x bfloat> undef to <8 x double>
+; CHECK-BF16-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %extv16f16f64 = fpext <16 x bfloat> undef to <16 x double>
+; CHECK-BF16-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %truncf16f32 = fptrunc float undef to bfloat
+; CHECK-BF16-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %truncv2f16f32 = fptrunc <2 x float> undef to <2 x bfloat>
+; CHECK-BF16-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %truncv4f16f32 = fptrunc <4 x float> undef to <4 x bfloat>
+; CHECK-BF16-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %truncv8f16f32 = fptrunc <8 x float> undef to <8 x bfloat>
+; CHECK-BF16-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %truncv16f16f32 = fptrunc <16 x float> undef to <16 x bfloat>
+; CHECK-BF16-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %truncf16f64 = fptrunc double undef to bfloat
+; CHECK-BF16-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %truncv2f16f64 = fptrunc <2 x double> undef to <2 x bfloat>
+; CHECK-BF16-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %truncv4f16f64 = fptrunc <4 x double> undef to <4 x bfloat>
+; CHECK-BF16-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %truncv8f16f64 = fptrunc <8 x double> undef to <8 x bfloat>
+; CHECK-BF16-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %truncv16f16f64 = fptrunc <16 x double> undef to <16 x bfloat>
+; CHECK-BF16-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
   %extf16f32 = fpext bfloat undef to float
   %extv2f16f32 = fpext <2 x bfloat> undef to <2 x float>
diff --git a/llvm/test/Analysis/CostModel/AArch64/div.ll b/llvm/test/Analysis/CostModel/AArch64/div.ll
index ada0be6..ef52d0d 100644
--- a/llvm/test/Analysis/CostModel/AArch64/div.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/div.ll
@@ -11,14 +11,20 @@ define i32 @sdiv() {
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V4i64 = sdiv <4 x i64> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V8i64 = sdiv <8 x i64> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = sdiv i32 undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V2i32 = sdiv <2 x i32> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V4i32 = sdiv <4 x i32> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V8i32 = sdiv <8 x i32> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %V16i32 = sdiv <16 x i32> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = sdiv i16 undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V2i16 = sdiv <2 x i16> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V4i16 = sdiv <4 x i16> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 88 for instruction: %V8i16 = sdiv <8 x i16> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 176 for instruction: %V16i16 = sdiv <16 x i16> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 352 for instruction: %V32i16 = sdiv <32 x i16> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = sdiv i8 undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V2i8 = sdiv <2 x i8> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V4i8 = sdiv <4 x i8> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 88 for instruction: %V8i8 = sdiv <8 x i8> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 168 for instruction: %V16i8 = sdiv <16 x i8> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 336 for instruction: %V32i8 = sdiv <32 x i8> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 672 for instruction: %V64i8 = sdiv <64 x i8> undef, undef
@@ -32,16 +38,22 @@ define i32 @sdiv() {
   %V8i64 = sdiv <8 x i64> undef, undef
 
   %I32 = sdiv i32 undef, undef
+  %V2i32 = sdiv <2 x i32> undef, undef
   %V4i32 = sdiv <4 x i32> undef, undef
   %V8i32 = sdiv <8 x i32> undef, undef
   %V16i32 = sdiv <16 x i32> undef, undef
 
   %I16 = sdiv i16 undef, undef
+  %V2i16 = sdiv <2 x i16> undef, undef
+  %V4i16 = sdiv <4 x i16> undef, undef
   %V8i16 = sdiv <8 x i16> undef, undef
   %V16i16 = sdiv <16 x i16> undef, undef
   %V32i16 = sdiv <32 x i16> undef, undef
 
   %I8 = sdiv i8 undef, undef
+  %V2i8 = sdiv <2 x i8> undef, undef
+  %V4i8 = sdiv <4 x i8> undef, undef
+  %V8i8 = sdiv <8 x i8> undef, undef
   %V16i8 = sdiv <16 x i8> undef, undef
   %V32i8 = sdiv <32 x i8> undef, undef
   %V64i8 = sdiv <64 x i8> undef, undef
@@ -57,14 +69,20 @@ define i32 @udiv() {
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V4i64 = udiv <4 x i64> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V8i64 = udiv <8 x i64> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = udiv i32 undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V2i32 = udiv <2 x i32> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V4i32 = udiv <4 x i32> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V8i32 = udiv <8 x i32> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %V16i32 = udiv <16 x i32> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = udiv i16 undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V2i16 = udiv <2 x i16> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V4i16 = udiv <4 x i16> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 88 for instruction: %V8i16 = udiv <8 x i16> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 176 for instruction: %V16i16 = udiv <16 x i16> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 352 for instruction: %V32i16 = udiv <32 x i16> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = udiv i8 undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V2i8 = udiv <2 x i8> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V4i8 = udiv <4 x i8> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 88 for instruction: %V8i8 = udiv <8 x i8> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 168 for instruction: %V16i8 = udiv <16 x i8> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 336 for instruction: %V32i8 = udiv <32 x i8> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 672 for instruction: %V64i8 = udiv <64 x i8> undef, undef
@@ -78,16 +96,22 @@ define i32 @udiv() {
   %V8i64 = udiv <8 x i64> undef, undef
 
   %I32 = udiv i32 undef, undef
+  %V2i32 = udiv <2 x i32> undef, undef
   %V4i32 = udiv <4 x i32> undef, undef
   %V8i32 = udiv <8 x i32> undef, undef
   %V16i32 = udiv <16 x i32> undef, undef
 
   %I16 = udiv i16 undef, undef
+  %V2i16 = udiv <2 x i16> undef, undef
+  %V4i16 = udiv <4 x i16> undef, undef
   %V8i16 = udiv <8 x i16> undef, undef
   %V16i16 = udiv <16 x i16> undef, undef
   %V32i16 = udiv <32 x i16> undef, undef
 
   %I8 = udiv i8 undef, undef
+  %V2i8 = udiv <2 x i8> undef, undef
+  %V4i8 = udiv <4 x i8> undef, undef
+  %V8i8 = udiv <8 x i8> undef, undef
   %V16i8 = udiv <16 x i8> undef, undef
   %V32i8 = udiv <32 x i8> undef, undef
   %V64i8 = udiv <64 x i8> undef, undef
@@ -103,14 +127,20 @@ define i32 @sdiv_const() {
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V4i64 = sdiv <4 x i64> undef, <i64 4, i64 5, i64 6, i64 7>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V8i64 = sdiv <8 x i64> undef, <i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = sdiv i32 undef, 7
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V2i32 = sdiv <2 x i32> undef, <i32 4, i32 5>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V4i32 = sdiv <4 x i32> undef, <i32 4, i32 5, i32 6, i32 7>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V8i32 = sdiv <8 x i32> undef, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %V16i32 = sdiv <16 x i32> undef, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = sdiv i16 undef, 7
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V2i16 = sdiv <2 x i16> undef, <i16 4, i16 5>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V4i16 = sdiv <4 x i16> undef, <i16 4, i16 5, i16 6, i16 7>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 88 for instruction: %V8i16 = sdiv <8 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 176 for instruction: %V16i16 = sdiv <16 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 352 for instruction: %V32i16 = sdiv <32 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = sdiv i8 undef, 7
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V2i8 = sdiv <2 x i8> undef, <i8 4, i8 5>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V4i8 = sdiv <4 x i8> undef, <i8 4, i8 5, i8 6, i8 7>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 88 for instruction: %V8i8 = sdiv <8 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 168 for instruction: %V16i8 = sdiv <16 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 336 for instruction: %V32i8 = sdiv <32 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 672 for instruction: %V64i8 = sdiv <64 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
@@ -124,16 +154,22 @@ define i32 @sdiv_const() {
   %V8i64 = sdiv <8 x i64> undef, <i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11>
 
   %I32 = sdiv i32 undef, 7
+  %V2i32 = sdiv <2 x i32> undef, <i32 4, i32 5>
   %V4i32 = sdiv <4 x i32> undef, <i32 4, i32 5, i32 6, i32 7>
   %V8i32 = sdiv <8 x i32> undef, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
   %V16i32 = sdiv <16 x i32> undef, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19>
 
   %I16 = sdiv i16 undef, 7
+  %V2i16 = sdiv <2 x i16> undef, <i16 4, i16 5>
+  %V4i16 = sdiv <4 x i16> undef, <i16 4, i16 5, i16 6, i16 7>
   %V8i16 = sdiv <8 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11>
   %V16i16 = sdiv <16 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
   %V32i16 = sdiv <32 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
 
   %I8 = sdiv i8 undef, 7
+  %V2i8 = sdiv <2 x i8> undef, <i8 4, i8 5>
+  %V4i8 = sdiv <4 x i8> undef, <i8 4, i8 5, i8 6, i8 7>
+  %V8i8 = sdiv <8 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11>
   %V16i8 = sdiv <16 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
   %V32i8 = sdiv <32 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
   %V64i8 = sdiv <64 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
@@ -149,14 +185,20 @@ define i32 @udiv_const() {
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V4i64 = udiv <4 x i64> undef, <i64 4, i64 5, i64 6, i64 7>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V8i64 = udiv <8 x i64> undef, <i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = udiv i32 undef, 7
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V2i32 = udiv <2 x i32> undef, <i32 4, i32 5>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V4i32 = udiv <4 x i32> undef, <i32 4, i32 5, i32 6, i32 7>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V8i32 = udiv <8 x i32> undef, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %V16i32 = udiv <16 x i32> undef, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = udiv i16 undef, 7
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V2i16 = udiv <2 x i16> undef, <i16 4, i16 5>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V4i16 = udiv <4 x i16> undef, <i16 4, i16 5, i16 6, i16 7>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 88 for instruction: %V8i16 = udiv <8 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 176 for instruction: %V16i16 = udiv <16 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 352 for instruction: %V32i16 = udiv <32 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = udiv i8 undef, 7
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V2i8 = udiv <2 x i8> undef, <i8 4, i8 5>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V4i8 = udiv <4 x i8> undef, <i8 4, i8 5, i8 6, i8 7>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 88 for instruction: %V8i8 = udiv <8 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 168 for instruction: %V16i8 = udiv <16 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 336 for instruction: %V32i8 = udiv <32 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 672 for instruction: %V64i8 = udiv <64 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
@@ -171,16 +213,22 @@ define i32 @udiv_const() {
   %V8i64 = udiv <8 x i64> undef, <i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11>
 
   %I32 = udiv i32 undef, 7
+  %V2i32 = udiv <2 x i32> undef, <i32 4, i32 5>
   %V4i32 = udiv <4 x i32> undef, <i32 4, i32 5, i32 6, i32 7>
   %V8i32 = udiv <8 x i32> undef, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
   %V16i32 = udiv <16 x i32> undef, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19>
 
   %I16 = udiv i16 undef, 7
+  %V2i16 = udiv <2 x i16> undef, <i16 4, i16 5>
+  %V4i16 = udiv <4 x i16> undef, <i16 4, i16 5, i16 6, i16 7>
   %V8i16 = udiv <8 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11>
   %V16i16 = udiv <16 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
   %V32i16 = udiv <32 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
 
   %I8 = udiv i8 undef, 7
+  %V2i8 = udiv <2 x i8> undef, <i8 4, i8 5>
+  %V4i8 = udiv <4 x i8> undef, <i8 4, i8 5, i8 6, i8 7>
+  %V8i8 = udiv <8 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11>
   %V16i8 = udiv <16 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
   %V32i8 = udiv <32 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
   %V64i8 = udiv <64 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
@@ -196,14 +244,20 @@ define i32 @sdiv_uniformconst() {
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4i64 = sdiv <4 x i64> undef, splat (i64 7)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V8i64 = sdiv <8 x i64> undef, splat (i64 7)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = sdiv i32 undef, 7
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2i32 = sdiv <2 x i32> undef, splat (i32 7)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4i32 = sdiv <4 x i32> undef, splat (i32 7)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V8i32 = sdiv <8 x i32> undef, splat (i32 7)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V16i32 = sdiv <16 x i32> undef, splat (i32 7)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = sdiv i16 undef, 7
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2i16 = sdiv <2 x i16> undef, splat (i16 7)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4i16 = sdiv <4 x i16> undef, splat (i16 7)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8i16 = sdiv <8 x i16> undef, splat (i16 7)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V16i16 = sdiv <16 x i16> undef, splat (i16 7)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %V32i16 = sdiv <32 x i16> undef, splat (i16 7)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = sdiv i8 undef, 7
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2i8 = sdiv <2 x i8> undef, splat (i8 7)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4i8 = sdiv <4 x i8> undef, splat (i8 7)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V8i8 = sdiv <8 x i8> undef, splat (i8 7)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16i8 = sdiv <16 x i8> undef, splat (i8 7)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %V32i8 = sdiv <32 x i8> undef, splat (i8 7)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %V64i8 = sdiv <64 x i8> undef, splat (i8 7)
@@ -217,16 +271,22 @@ define i32 @sdiv_uniformconst() {
   %V8i64 = sdiv <8 x i64> undef, <i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7>
 
   %I32 = sdiv i32 undef, 7
+  %V2i32 = sdiv <2 x i32> undef, <i32 7, i32 7>
   %V4i32 = sdiv <4 x i32> undef, <i32 7, i32 7, i32 7, i32 7>
   %V8i32 = sdiv <8 x i32> undef, <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
   %V16i32 = sdiv <16 x i32> undef, <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
 
   %I16 = sdiv i16 undef, 7
+  %V2i16 = sdiv <2 x i16> undef, <i16 7, i16 7>
+  %V4i16 = sdiv <4 x i16> undef, <i16 7, i16 7, i16 7, i16 7>
   %V8i16 = sdiv <8 x i16> undef, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
   %V16i16 = sdiv <16 x i16> undef, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
   %V32i16 = sdiv <32 x i16> undef, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
 
   %I8 = sdiv i8 undef, 7
+  %V2i8 = sdiv <2 x i8> undef, <i8 7, i8 7>
+  %V4i8 = sdiv <4 x i8> undef, <i8 7, i8 7, i8 7, i8 7>
+  %V8i8 = sdiv <8 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
   %V16i8 = sdiv <16 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
   %V32i8 = sdiv <32 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
   %V64i8 = sdiv <64 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
@@ -242,14 +302,20 @@ define i32 @udiv_uniformconst() {
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4i64 = udiv <4 x i64> undef, splat (i64 7)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V8i64 = udiv <8 x i64> undef, splat (i64 7)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = udiv i32 undef, 7
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2i32 = udiv <2 x i32> undef, splat (i32 7)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4i32 = udiv <4 x i32> undef, splat (i32 7)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V8i32 = udiv <8 x i32> undef, splat (i32 7)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V16i32 = udiv <16 x i32> undef, splat (i32 7)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = udiv i16 undef, 7
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2i16 = udiv <2 x i16> undef, splat (i16 7)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4i16 = udiv <4 x i16> undef, splat (i16 7)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8i16 = udiv <8 x i16> undef, splat (i16 7)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V16i16 = udiv <16 x i16> undef, splat (i16 7)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %V32i16 = udiv <32 x i16> undef, splat (i16 7)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = udiv i8 undef, 7
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2i8 = udiv <2 x i8> undef, splat (i8 7)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4i8 = udiv <4 x i8> undef, splat (i8 7)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V8i8 = udiv <8 x i8> undef, splat (i8 7)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16i8 = udiv <16 x i8> undef, splat (i8 7)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %V32i8 = udiv <32 x i8> undef, splat (i8 7)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %V64i8 = udiv <64 x i8> undef, splat (i8 7)
@@ -263,16 +329,22 @@ define i32 @udiv_uniformconst() {
   %V8i64 = udiv <8 x i64> undef, <i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7>
 
   %I32 = udiv i32 undef, 7
+  %V2i32 = udiv <2 x i32> undef, <i32 7, i32 7>
   %V4i32 = udiv <4 x i32> undef, <i32 7, i32 7, i32 7, i32 7>
   %V8i32 = udiv <8 x i32> undef, <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
   %V16i32 = udiv <16 x i32> undef, <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
 
   %I16 = udiv i16 undef, 7
+  %V2i16 = udiv <2 x i16> undef, <i16 7, i16 7>
+  %V4i16 = udiv <4 x i16> undef, <i16 7, i16 7, i16 7, i16 7>
   %V8i16 = udiv <8 x i16> undef, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
   %V16i16 = udiv <16 x i16> undef, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
   %V32i16 = udiv <32 x i16> undef, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
 
   %I8 = udiv i8 undef, 7
+  %V2i8 = udiv <2 x i8> undef, <i8 7, i8 7>
+  %V4i8 = udiv <4 x i8> undef, <i8 7, i8 7, i8 7, i8 7>
+  %V8i8 = udiv <8 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
   %V16i8 = udiv <16 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
   %V32i8 = udiv <32 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
   %V64i8 = udiv <64 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
@@ -288,14 +360,20 @@ define i32 @sdiv_constpow2() {
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V4i64 = sdiv <4 x i64> undef, <i64 2, i64 4, i64 8, i64 16>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V8i64 = sdiv <8 x i64> undef, <i64 2, i64 4, i64 8, i64 16, i64 32, i64 64, i64 128, i64 256>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I32 = sdiv i32 undef, 16
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V2i32 = sdiv <2 x i32> undef, <i32 2, i32 4>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V4i32 = sdiv <4 x i32> undef, <i32 2, i32 4, i32 8, i32 16>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V8i32 = sdiv <8 x i32> undef, <i32 2, i32 4, i32 8, i32 16, i32 32, i32 64, i32 128, i32 256>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %V16i32 = sdiv <16 x i32> undef, <i32 2, i32 4, i32 8, i32 16, i32 32, i32 64, i32 128, i32 256, i32 2, i32 4, i32 8, i32 16, i32 32, i32 64, i32 128, i32 256>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I16 = sdiv i16 undef, 16
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V2i16 = sdiv <2 x i16> undef, <i16 2, i16 4>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V4i16 = sdiv <4 x i16> undef, <i16 2, i16 4, i16 8, i16 16>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 88 for instruction: %V8i16 = sdiv <8 x i16> undef, <i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 176 for instruction: %V16i16 = sdiv <16 x i16> undef, <i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 352 for instruction: %V32i16 = sdiv <32 x i16> undef, <i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I8 = sdiv i8 undef, 16
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V2i8 = sdiv <2 x i8> undef, <i8 2, i8 4>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V4i8 = sdiv <4 x i8> undef, <i8 2, i8 4, i8 8, i8 16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 88 for instruction: %V8i8 = sdiv <8 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 168 for instruction: %V16i8 = sdiv <16 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 336 for instruction: %V32i8 = sdiv <32 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 672 for instruction: %V64i8 = sdiv <64 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
@@ -309,16 +387,22 @@ define i32 @sdiv_constpow2() {
   %V8i64 = sdiv <8 x i64> undef, <i64 2, i64 4, i64 8, i64 16, i64 32, i64 64, i64 128, i64 256>
 
   %I32 = sdiv i32 undef, 16
+  %V2i32 = sdiv <2 x i32> undef, <i32 2, i32 4>
   %V4i32 = sdiv <4 x i32> undef, <i32 2, i32 4, i32 8, i32 16>
   %V8i32 = sdiv <8 x i32> undef, <i32 2, i32 4, i32 8, i32 16, i32 32, i32 64, i32 128, i32 256>
   %V16i32 = sdiv <16 x i32> undef, <i32 2, i32 4, i32 8, i32 16, i32 32, i32 64, i32 128, i32 256, i32 2, i32 4, i32 8, i32 16, i32 32, i32 64, i32 128, i32 256>
 
   %I16 = sdiv i16 undef, 16
+  %V2i16 = sdiv <2 x i16> undef, <i16 2, i16 4>
+  %V4i16 = sdiv <4 x i16> undef, <i16 2, i16 4, i16 8, i16 16>
   %V8i16 = sdiv <8 x i16> undef, <i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256>
   %V16i16 = sdiv <16 x i16> undef, <i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256>
   %V32i16 = sdiv <32 x i16> undef, <i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256>
 
   %I8 = sdiv i8 undef, 16
+  %V2i8 = sdiv <2 x i8> undef, <i8 2, i8 4>
+  %V4i8 = sdiv <4 x i8> undef, <i8 2, i8 4, i8 8, i8 16>
+  %V8i8 = sdiv <8 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
   %V16i8 = sdiv <16 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
   %V32i8 = sdiv <32 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
   %V64i8 = sdiv <64 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
@@ -334,14 +418,20 @@ define i32 @udiv_constpow2() {
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V4i64 = udiv <4 x i64> undef, <i64 2, i64 4, i64 8, i64 16>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V8i64 = udiv <8 x i64> undef, <i64 2, i64 4, i64 8, i64 16, i64 32, i64 64, i64 128, i64 256>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = udiv i32 undef, 16
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V2i32 = udiv <2 x i32> undef, <i32 2, i32 4>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V4i32 = udiv <4 x i32> undef, <i32 2, i32 4, i32 8, i32 16>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V8i32 = udiv <8 x i32> undef, <i32 2, i32 4, i32 8, i32 16, i32 32, i32 64, i32 128, i32 256>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %V16i32 = udiv <16 x i32> undef, <i32 2, i32 4, i32 8, i32 16, i32 32, i32 64, i32 128, i32 256, i32 2, i32 4, i32 8, i32 16, i32 32, i32 64, i32 128, i32 256>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = udiv i16 undef, 16
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V2i16 = udiv <2 x i16> undef, <i16 2, i16 4>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V4i16 = udiv <4 x i16> undef, <i16 2, i16 4, i16 8, i16 16>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 88 for instruction: %V8i16 = udiv <8 x i16> undef, <i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 176 for instruction: %V16i16 = udiv <16 x i16> undef, <i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 352 for instruction: %V32i16 = udiv <32 x i16> undef, <i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = udiv i8 undef, 16
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V2i8 = udiv <2 x i8> undef, <i8 2, i8 4>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V4i8 = udiv <4 x i8> undef, <i8 2, i8 4, i8 8, i8 16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 88 for instruction: %V8i8 = udiv <8 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 168 for instruction: %V16i8 = udiv <16 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 336 for instruction: %V32i8 = udiv <32 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 672 for instruction: %V64i8 = udiv <64 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
@@ -355,16 +445,22 @@ define i32 @udiv_constpow2() {
   %V8i64 = udiv <8 x i64> undef, <i64 2, i64 4, i64 8, i64 16, i64 32, i64 64, i64 128, i64 256>
 
   %I32 = udiv i32 undef, 16
+  %V2i32 = udiv <2 x i32> undef, <i32 2, i32 4>
   %V4i32 = udiv <4 x i32> undef, <i32 2, i32 4, i32 8, i32 16>
   %V8i32 = udiv <8 x i32> undef, <i32 2, i32 4, i32 8, i32 16, i32 32, i32 64, i32 128, i32 256>
   %V16i32 = udiv <16 x i32> undef, <i32 2, i32 4, i32 8, i32 16, i32 32, i32 64, i32 128, i32 256, i32 2, i32 4, i32 8, i32 16, i32 32, i32 64, i32 128, i32 256>
 
   %I16 = udiv i16 undef, 16
+  %V2i16 = udiv <2 x i16> undef, <i16 2, i16 4>
+  %V4i16 = udiv <4 x i16> undef, <i16 2, i16 4, i16 8, i16 16>
   %V8i16 = udiv <8 x i16> undef, <i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256>
   %V16i16 = udiv <16 x i16> undef, <i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256>
   %V32i16 = udiv <32 x i16> undef, <i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256>
 
   %I8 = udiv i8 undef, 16
+  %V2i8 = udiv <2 x i8> undef, <i8 2, i8 4>
+  %V4i8 = udiv <4 x i8> undef, <i8 2, i8 4, i8 8, i8 16>
+  %V8i8 = udiv <8 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
   %V16i8 = udiv <16 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
   %V32i8 = udiv <32 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
   %V64i8 = udiv <64 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
@@ -380,14 +476,20 @@ define i32 @sdiv_uniformconstpow2() {
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V4i64 = sdiv <4 x i64> undef, splat (i64 16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %V8i64 = sdiv <8 x i64> undef, splat (i64 16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I32 = sdiv i32 undef, 16
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V2i32 = sdiv <2 x i32> undef, splat (i32 16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V4i32 = sdiv <4 x i32> undef, splat (i32 16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %V8i32 = sdiv <8 x i32> undef, splat (i32 16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 108 for instruction: %V16i32 = sdiv <16 x i32> undef, splat (i32 16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I16 = sdiv i16 undef, 16
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V2i16 = sdiv <2 x i16> undef, splat (i16 16)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V4i16 = sdiv <4 x i16> undef, splat (i16 16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %V8i16 = sdiv <8 x i16> undef, splat (i16 16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 102 for instruction: %V16i16 = sdiv <16 x i16> undef, splat (i16 16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 204 for instruction: %V32i16 = sdiv <32 x i16> undef, splat (i16 16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I8 = sdiv i8 undef, 16
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V2i8 = sdiv <2 x i8> undef, splat (i8 16)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V4i8 = sdiv <4 x i8> undef, splat (i8 16)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %V8i8 = sdiv <8 x i8> undef, splat (i8 16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 99 for instruction: %V16i8 = sdiv <16 x i8> undef, splat (i8 16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 198 for instruction: %V32i8 = sdiv <32 x i8> undef, splat (i8 16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 396 for instruction: %V64i8 = sdiv <64 x i8> undef, splat (i8 16)
@@ -401,16 +503,22 @@ define i32 @sdiv_uniformconstpow2() {
   %V8i64 = sdiv <8 x i64> undef, <i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16>
 
   %I32 = sdiv i32 undef, 16
+  %V2i32 = sdiv <2 x i32> undef, <i32 16, i32 16>
   %V4i32 = sdiv <4 x i32> undef, <i32 16, i32 16, i32 16, i32 16>
   %V8i32 = sdiv <8 x i32> undef, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
   %V16i32 = sdiv <16 x i32> undef, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
 
   %I16 = sdiv i16 undef, 16
+  %V2i16 = sdiv <2 x i16> undef, <i16 16, i16 16>
+  %V4i16 = sdiv <4 x i16> undef, <i16 16, i16 16, i16 16, i16 16>
   %V8i16 = sdiv <8 x i16> undef, <i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16>
   %V16i16 = sdiv <16 x i16> undef, <i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16>
   %V32i16 = sdiv <32 x i16> undef, <i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16>
 
   %I8 = sdiv i8 undef, 16
+  %V2i8 = sdiv <2 x i8> undef, <i8 16, i8 16>
+  %V4i8 = sdiv <4 x i8> undef, <i8 16, i8 16, i8 16, i8 16>
+  %V8i8 = sdiv <8 x i8> undef, <i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16>
   %V16i8 = sdiv <16 x i8> undef, <i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16>
   %V32i8 = sdiv <32 x i8> undef, <i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16>
   %V64i8 = sdiv <64 x i8> undef, <i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16>
@@ -426,14 +534,20 @@ define i32 @udiv_uniformconstpow2() {
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4i64 = udiv <4 x i64> undef, splat (i64 16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V8i64 = udiv <8 x i64> undef, splat (i64 16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = udiv i32 undef, 16
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2i32 = udiv <2 x i32> undef, splat (i32 16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4i32 = udiv <4 x i32> undef, splat (i32 16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V8i32 = udiv <8 x i32> undef, splat (i32 16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V16i32 = udiv <16 x i32> undef, splat (i32 16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = udiv i16 undef, 16
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2i16 = udiv <2 x i16> undef, splat (i16 16)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4i16 = udiv <4 x i16> undef, splat (i16 16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8i16 = udiv <8 x i16> undef, splat (i16 16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V16i16 = udiv <16 x i16> undef, splat (i16 16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %V32i16 = udiv <32 x i16> undef, splat (i16 16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = udiv i8 undef, 16
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2i8 = udiv <2 x i8> undef, splat (i8 16)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4i8 = udiv <4 x i8> undef, splat (i8 16)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V8i8 = udiv <8 x i8> undef, splat (i8 16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16i8 = udiv <16 x i8> undef, splat (i8 16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %V32i8 = udiv <32 x i8> undef, splat (i8 16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %V64i8 = udiv <64 x i8> undef, splat (i8 16)
@@ -447,16 +561,22 @@ define i32 @udiv_uniformconstpow2() {
   %V8i64 = udiv <8 x i64> undef, <i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16>
 
   %I32 = udiv i32 undef, 16
+  %V2i32 = udiv <2 x i32> undef, <i32 16, i32 16>
   %V4i32 = udiv <4 x i32> undef, <i32 16, i32 16, i32 16, i32 16>
   %V8i32 = udiv <8 x i32> undef, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
   %V16i32 = udiv <16 x i32> undef, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
 
   %I16 = udiv i16 undef, 16
+  %V2i16 = udiv <2 x i16> undef, <i16 16, i16 16>
+  %V4i16 = udiv <4 x i16> undef, <i16 16, i16 16, i16 16, i16 16>
   %V8i16 = udiv <8 x i16> undef, <i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16>
   %V16i16 = udiv <16 x i16> undef, <i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16>
   %V32i16 = udiv <32 x i16> undef, <i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16>
 
   %I8 = udiv i8 undef, 16
+  %V2i8 = udiv <2 x i8> undef, <i8 16, i8 16>
+  %V4i8 = udiv <4 x i8> undef, <i8 16, i8 16, i8 16, i8 16>
+  %V8i8 = udiv <8 x i8> undef, <i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16>
   %V16i8 = udiv <16 x i8> undef, <i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16>
   %V32i8 = udiv <32 x i8> undef, <i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16>
   %V64i8 = udiv <64 x i8> undef, <i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16>
@@ -472,14 +592,20 @@ define i32 @sdiv_constnegpow2() {
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V4i64 = sdiv <4 x i64> undef, <i64 -2, i64 -4, i64 -8, i64 -16>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V8i64 = sdiv <8 x i64> undef, <i64 -2, i64 -4, i64 -8, i64 -16, i64 -32, i64 -64, i64 -128, i64 -256>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = sdiv i32 undef, -16
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V2i32 = sdiv <2 x i32> undef, <i32 -2, i32 -4>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V4i32 = sdiv <4 x i32> undef, <i32 -2, i32 -4, i32 -8, i32 -16>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V8i32 = sdiv <8 x i32> undef, <i32 -2, i32 -4, i32 -8, i32 -16, i32 -32, i32 -64, i32 -128, i32 -256>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %V16i32 = sdiv <16 x i32> undef, <i32 -2, i32 -4, i32 -8, i32 -16, i32 -32, i32 -64, i32 -128, i32 -256, i32 -2, i32 -4, i32 -8, i32 -16, i32 -32, i32 -64, i32 -128, i32 -256>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = sdiv i16 undef, -16
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V2i16 = sdiv <2 x i16> undef, <i16 -2, i16 -4>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V4i16 = sdiv <4 x i16> undef, <i16 -2, i16 -4, i16 -8, i16 -16>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 88 for instruction: %V8i16 = sdiv <8 x i16> undef, <i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 176 for instruction: %V16i16 = sdiv <16 x i16> undef, <i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256, i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 352 for instruction: %V32i16 = sdiv <32 x i16> undef, <i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256, i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256, i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256, i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = sdiv i8 undef, -16
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V2i8 = sdiv <2 x i8> undef, <i8 -2, i8 -4>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V4i8 = sdiv <4 x i8> undef, <i8 -2, i8 -4, i8 -8, i8 -16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 88 for instruction: %V8i8 = sdiv <8 x i8> undef, <i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 168 for instruction: %V16i8 = sdiv <16 x i8> undef, <i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 336 for instruction: %V32i8 = sdiv <32 x i8> undef, <i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 672 for instruction: %V64i8 = sdiv <64 x i8> undef, <i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16>
@@ -493,16 +619,22 @@ define i32 @sdiv_constnegpow2() {
   %V8i64 = sdiv <8 x i64> undef, <i64 -2, i64 -4, i64 -8, i64 -16, i64 -32, i64 -64, i64 -128, i64 -256>
 
   %I32 = sdiv i32 undef, -16
+  %V2i32 = sdiv <2 x i32> undef, <i32 -2, i32 -4>
   %V4i32 = sdiv <4 x i32> undef, <i32 -2, i32 -4, i32 -8, i32 -16>
   %V8i32 = sdiv <8 x i32> undef, <i32 -2, i32 -4, i32 -8, i32 -16, i32 -32, i32 -64, i32 -128, i32 -256>
   %V16i32 = sdiv <16 x i32> undef, <i32 -2, i32 -4, i32 -8, i32 -16, i32 -32, i32 -64, i32 -128, i32 -256, i32 -2, i32 -4, i32 -8, i32 -16, i32 -32, i32 -64, i32 -128, i32 -256>
 
   %I16 = sdiv i16 undef, -16
+  %V2i16 = sdiv <2 x i16> undef, <i16 -2, i16 -4>
+  %V4i16 = sdiv <4 x i16> undef, <i16 -2, i16 -4, i16 -8, i16 -16>
   %V8i16 = sdiv <8 x i16> undef, <i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256>
   %V16i16 = sdiv <16 x i16> undef, <i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256, i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256>
   %V32i16 = sdiv <32 x i16> undef, <i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256, i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256, i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256, i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256>
 
   %I8 = sdiv i8 undef, -16
+  %V2i8 = sdiv <2 x i8> undef, <i8 -2, i8 -4>
+  %V4i8 = sdiv <4 x i8> undef, <i8 -2, i8 -4, i8 -8, i8 -16>
+  %V8i8 = sdiv <8 x i8> undef, <i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16>
   %V16i8 = sdiv <16 x i8> undef, <i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16>
   %V32i8 = sdiv <32 x i8> undef, <i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16>
   %V64i8 = sdiv <64 x i8> undef, <i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16>
@@ -518,14 +650,20 @@ define i32 @udiv_constnegpow2() {
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V4i64 = udiv <4 x i64> undef, <i64 -2, i64 -4, i64 -8, i64 -16>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V8i64 = udiv <8 x i64> undef, <i64 -2, i64 -4, i64 -8, i64 -16, i64 -32, i64 -64, i64 -128, i64 -256>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = udiv i32 undef, -16
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V2i32 = udiv <2 x i32> undef, <i32 -2, i32 -4>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V4i32 = udiv <4 x i32> undef, <i32 -2, i32 -4, i32 -8, i32 -16>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V8i32 = udiv <8 x i32> undef, <i32 -2, i32 -4, i32 -8, i32 -16, i32 -32, i32 -64, i32 -128, i32 -256>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %V16i32 = udiv <16 x i32> undef, <i32 -2, i32 -4, i32 -8, i32 -16, i32 -32, i32 -64, i32 -128, i32 -256, i32 -2, i32 -4, i32 -8, i32 -16, i32 -32, i32 -64, i32 -128, i32 -256>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = udiv i16 undef, -16
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V2i16 = udiv <2 x i16> undef, <i16 -2, i16 -4>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V4i16 = udiv <4 x i16> undef, <i16 -2, i16 -4, i16 -8, i16 -16>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 88 for instruction: %V8i16 = udiv <8 x i16> undef, <i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 176 for instruction: %V16i16 = udiv <16 x i16> undef, <i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256, i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 352 for instruction: %V32i16 = udiv <32 x i16> undef, <i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256, i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256, i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256, i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = udiv i8 undef, -16
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V2i8 = udiv <2 x i8> undef, <i8 -2, i8 -4>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V4i8 = udiv <4 x i8> undef, <i8 -2, i8 -4, i8 -8, i8 -16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 88 for instruction: %V8i8 = udiv <8 x i8> undef, <i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 168 for instruction: %V16i8 = udiv <16 x i8> undef, <i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 336 for instruction: %V32i8 = udiv <32 x i8> undef, <i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 672 for instruction: %V64i8 = udiv <64 x i8> undef, <i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16>
@@ -539,16 +677,22 @@ define i32 @udiv_constnegpow2() {
   %V8i64 = udiv <8 x i64> undef, <i64 -2, i64 -4, i64 -8, i64 -16, i64 -32, i64 -64, i64 -128, i64 -256>
 
   %I32 = udiv i32 undef, -16
+  %V2i32 = udiv <2 x i32> undef, <i32 -2, i32 -4>
   %V4i32 = udiv <4 x i32> undef, <i32 -2, i32 -4, i32 -8, i32 -16>
   %V8i32 = udiv <8 x i32> undef, <i32 -2, i32 -4, i32 -8, i32 -16, i32 -32, i32 -64, i32 -128, i32 -256>
   %V16i32 = udiv <16 x i32> undef, <i32 -2, i32 -4, i32 -8, i32 -16, i32 -32, i32 -64, i32 -128, i32 -256, i32 -2, i32 -4, i32 -8, i32 -16, i32 -32, i32 -64, i32 -128, i32 -256>
 
   %I16 = udiv i16 undef, -16
+  %V2i16 = udiv <2 x i16> undef, <i16 -2, i16 -4>
+  %V4i16 = udiv <4 x i16> undef, <i16 -2, i16 -4, i16 -8, i16 -16>
   %V8i16 = udiv <8 x i16> undef, <i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256>
   %V16i16 = udiv <16 x i16> undef, <i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256, i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256>
   %V32i16 = udiv <32 x i16> undef, <i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256, i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256, i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256, i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256>
 
   %I8 = udiv i8 undef, -16
+  %V2i8 = udiv <2 x i8> undef, <i8 -2, i8 -4>
+  %V4i8 = udiv <4 x i8> undef, <i8 -2, i8 -4, i8 -8, i8 -16>
+  %V8i8 = udiv <8 x i8> undef, <i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16>
   %V16i8 = udiv <16 x i8> undef, <i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16>
   %V32i8 = udiv <32 x i8> undef, <i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16>
   %V64i8 = udiv <64 x i8> undef, <i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16>
@@ -564,14 +708,20 @@ define i32 @sdiv_uniformconstnegpow2() {
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4i64 = sdiv <4 x i64> undef, splat (i64 -16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V8i64 = sdiv <8 x i64> undef, splat (i64 -16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = sdiv i32 undef, -16
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2i32 = sdiv <2 x i32> undef, splat (i32 -16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4i32 = sdiv <4 x i32> undef, splat (i32 -16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V8i32 = sdiv <8 x i32> undef, splat (i32 -16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V16i32 = sdiv <16 x i32> undef, splat (i32 -16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = sdiv i16 undef, -16
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2i16 = sdiv <2 x i16> undef, splat (i16 -16)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4i16 = sdiv <4 x i16> undef, splat (i16 -16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8i16 = sdiv <8 x i16> undef, splat (i16 -16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V16i16 = sdiv <16 x i16> undef, splat (i16 -16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %V32i16 = sdiv <32 x i16> undef, splat (i16 -16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = sdiv i8 undef, -16
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2i8 = sdiv <2 x i8> undef, splat (i8 -16)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4i8 = sdiv <4 x i8> undef, splat (i8 -16)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V8i8 = sdiv <8 x i8> undef, splat (i8 -16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16i8 = sdiv <16 x i8> undef, splat (i8 -16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %V32i8 = sdiv <32 x i8> undef, splat (i8 -16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %V64i8 = sdiv <64 x i8> undef, splat (i8 -16)
@@ -585,16 +735,22 @@ define i32 @sdiv_uniformconstnegpow2() {
   %V8i64 = sdiv <8 x i64> undef, <i64 -16, i64 -16, i64 -16, i64 -16, i64 -16, i64 -16, i64 -16, i64 -16>
 
   %I32 = sdiv i32 undef, -16
+  %V2i32 = sdiv <2 x i32> undef, <i32 -16, i32 -16>
   %V4i32 = sdiv <4 x i32> undef, <i32 -16, i32 -16, i32 -16, i32 -16>
   %V8i32 = sdiv <8 x i32> undef, <i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16>
   %V16i32 = sdiv <16 x i32> undef, <i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16>
 
   %I16 = sdiv i16 undef, -16
+  %V2i16 = sdiv <2 x i16> undef, <i16 -16, i16 -16>
+  %V4i16 = sdiv <4 x i16> undef, <i16 -16, i16 -16, i16 -16, i16 -16>
   %V8i16 = sdiv <8 x i16> undef, <i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16>
   %V16i16 = sdiv <16 x i16> undef, <i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16>
   %V32i16 = sdiv <32 x i16> undef, <i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16>
 
   %I8 = sdiv i8 undef, -16
+  %V2i8 = sdiv <2 x i8> undef, <i8 -16, i8 -16>
+  %V4i8 = sdiv <4 x i8> undef, <i8 -16, i8 -16, i8 -16, i8 -16>
+  %V8i8 = sdiv <8 x i8> undef, <i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16>
   %V16i8 = sdiv <16 x i8> undef, <i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16>
   %V32i8 = sdiv <32 x i8> undef, <i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16>
   %V64i8 = sdiv <64 x i8> undef, <i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16>
@@ -610,14 +766,20 @@ define i32 @udiv_uniformconstnegpow2() {
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4i64 = udiv <4 x i64> undef, splat (i64 -16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V8i64 = udiv <8 x i64> undef, splat (i64 -16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = udiv i32 undef, -16
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2i32 = udiv <2 x i32> undef, splat (i32 -16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4i32 = udiv <4 x i32> undef, splat (i32 -16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V8i32 = udiv <8 x i32> undef, splat (i32 -16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V16i32 = udiv <16 x i32> undef, splat (i32 -16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = udiv i16 undef, -16
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2i16 = udiv <2 x i16> undef, splat (i16 -16)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4i16 = udiv <4 x i16> undef, splat (i16 -16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8i16 = udiv <8 x i16> undef, splat (i16 -16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V16i16 = udiv <16 x i16> undef, splat (i16 -16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %V32i16 = udiv <32 x i16> undef, splat (i16 -16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = udiv i8 undef, -16
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2i8 = udiv <2 x i8> undef, splat (i8 -16)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4i8 = udiv <4 x i8> undef, splat (i8 -16)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V8i8 = udiv <8 x i8> undef, splat (i8 -16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16i8 = udiv <16 x i8> undef, splat (i8 -16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %V32i8 = udiv <32 x i8> undef, splat (i8 -16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %V64i8 = udiv <64 x i8> undef, splat (i8 -16)
@@ -631,16 +793,22 @@ define i32 @udiv_uniformconstnegpow2() {
   %V8i64 = udiv <8 x i64> undef, <i64 -16, i64 -16, i64 -16, i64 -16, i64 -16, i64 -16, i64 -16, i64 -16>
 
   %I32 = udiv i32 undef, -16
+  %V2i32 = udiv <2 x i32> undef, <i32 -16, i32 -16>
   %V4i32 = udiv <4 x i32> undef, <i32 -16, i32 -16, i32 -16, i32 -16>
   %V8i32 = udiv <8 x i32> undef, <i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16>
   %V16i32 = udiv <16 x i32> undef, <i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16>
 
   %I16 = udiv i16 undef, -16
+  %V2i16 = udiv <2 x i16> undef, <i16 -16, i16 -16>
+  %V4i16 = udiv <4 x i16> undef, <i16 -16, i16 -16, i16 -16, i16 -16>
   %V8i16 = udiv <8 x i16> undef, <i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16>
   %V16i16 = udiv <16 x i16> undef, <i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16>
   %V32i16 = udiv <32 x i16> undef, <i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16>
 
   %I8 = udiv i8 undef, -16
+  %V2i8 = udiv <2 x i8> undef, <i8 -16, i8 -16>
+  %V4i8 = udiv <4 x i8> undef, <i8 -16, i8 -16, i8 -16, i8 -16>
+  %V8i8 = udiv <8 x i8> undef, <i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16>
   %V16i8 = udiv <16 x i8> undef, <i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16>
   %V32i8 = udiv <32 x i8> undef, <i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16>
   %V64i8 = udiv <64 x i8> undef, <i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16>
diff --git a/llvm/test/Analysis/CostModel/AArch64/rem.ll b/llvm/test/Analysis/CostModel/AArch64/rem.ll
index 2f1e8c8..06c05ae 100644
--- a/llvm/test/Analysis/CostModel/AArch64/rem.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/rem.ll
@@ -5,40 +5,55 @@ target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
 
 define i32 @srem() {
 ; CHECK-LABEL: 'srem'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %I128 = srem i128 undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I64 = srem i64 undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V2i64 = srem <2 x i64> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4i64 = srem <4 x i64> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8i64 = srem <8 x i64> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = srem i32 undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V2i32 = srem <2 x i32> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4i32 = srem <4 x i32> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8i32 = srem <8 x i32> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V16i32 = srem <16 x i32> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I16 = srem i16 undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V2i16 = srem <2 x i16> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4i16 = srem <4 x i16> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8i16 = srem <8 x i16> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V16i16 = srem <16 x i16> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 224 for instruction: %V32i16 = srem <32 x i16> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = srem i8 undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V2i8 = srem <2 x i8> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4i8 = srem <4 x i8> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8i8 = srem <8 x i8> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V16i8 = srem <16 x i8> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 224 for instruction: %V32i8 = srem <32 x i8> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 448 for instruction: %V64i8 = srem <64 x i8> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
+  %I128 = srem i128 undef, undef
+
   %I64 = srem i64 undef, undef
   %V2i64 = srem <2 x i64> undef, undef
   %V4i64 = srem <4 x i64> undef, undef
   %V8i64 = srem <8 x i64> undef, undef
 
   %I32 = srem i32 undef, undef
+  %V2i32 = srem <2 x i32> undef, undef
   %V4i32 = srem <4 x i32> undef, undef
   %V8i32 = srem <8 x i32> undef, undef
   %V16i32 = srem <16 x i32> undef, undef
 
   %I16 = srem i16 undef, undef
+  %V2i16 = srem <2 x i16> undef, undef
+  %V4i16 = srem <4 x i16> undef, undef
   %V8i16 = srem <8 x i16> undef, undef
   %V16i16 = srem <16 x i16> undef, undef
   %V32i16 = srem <32 x i16> undef, undef
 
   %I8 = srem i8 undef, undef
+  %V2i8 = srem <2 x i8> undef, undef
+  %V4i8 = srem <4 x i8> undef, undef
+  %V8i8 = srem <8 x i8> undef, undef
   %V16i8 = srem <16 x i8> undef, undef
   %V32i8 = srem <32 x i8> undef, undef
   %V64i8 = srem <64 x i8> undef, undef
@@ -48,40 +63,55 @@ define i32 @srem() {
 
 define i32 @urem() {
 ; CHECK-LABEL: 'urem'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %I128 = urem i128 undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I64 = urem i64 undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V2i64 = urem <2 x i64> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4i64 = urem <4 x i64> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8i64 = urem <8 x i64> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = urem i32 undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V2i32 = urem <2 x i32> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4i32 = urem <4 x i32> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8i32 = urem <8 x i32> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V16i32 = urem <16 x i32> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I16 = urem i16 undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V2i16 = urem <2 x i16> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4i16 = urem <4 x i16> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8i16 = urem <8 x i16> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V16i16 = urem <16 x i16> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 224 for instruction: %V32i16 = urem <32 x i16> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = urem i8 undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V2i8 = urem <2 x i8> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4i8 = urem <4 x i8> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8i8 = urem <8 x i8> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V16i8 = urem <16 x i8> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 224 for instruction: %V32i8 = urem <32 x i8> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 448 for instruction: %V64i8 = urem <64 x i8> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
+  %I128 = urem i128 undef, undef
+
   %I64 = urem i64 undef, undef
   %V2i64 = urem <2 x i64> undef, undef
   %V4i64 = urem <4 x i64> undef, undef
   %V8i64 = urem <8 x i64> undef, undef
 
   %I32 = urem i32 undef, undef
+  %V2i32 = urem <2 x i32> undef, undef
   %V4i32 = urem <4 x i32> undef, undef
   %V8i32 = urem <8 x i32> undef, undef
   %V16i32 = urem <16 x i32> undef, undef
 
   %I16 = urem i16 undef, undef
+  %V2i16 = urem <2 x i16> undef, undef
+  %V4i16 = urem <4 x i16> undef, undef
   %V8i16 = urem <8 x i16> undef, undef
   %V16i16 = urem <16 x i16> undef, undef
   %V32i16 = urem <32 x i16> undef, undef
 
   %I8 = urem i8 undef, undef
+  %V2i8 = urem <2 x i8> undef, undef
+  %V4i8 = urem <4 x i8> undef, undef
+  %V8i8 = urem <8 x i8> undef, undef
   %V16i8 = urem <16 x i8> undef, undef
   %V32i8 = urem <32 x i8> undef, undef
   %V64i8 = urem <64 x i8> undef, undef
@@ -91,40 +121,55 @@ define i32 @urem() {
 
 define i32 @srem_const() {
 ; CHECK-LABEL: 'srem_const'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %I128 = srem i128 undef, 7
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %I64 = srem i64 undef, 7
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V2i64 = srem <2 x i64> undef, <i64 6, i64 7>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4i64 = srem <4 x i64> undef, <i64 4, i64 5, i64 6, i64 7>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8i64 = srem <8 x i64> undef, <i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = srem i32 undef, 7
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V2i32 = srem <2 x i32> undef, <i32 4, i32 5>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4i32 = srem <4 x i32> undef, <i32 4, i32 5, i32 6, i32 7>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8i32 = srem <8 x i32> undef, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V16i32 = srem <16 x i32> undef, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I16 = srem i16 undef, 7
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V2i16 = srem <2 x i16> undef, <i16 4, i16 5>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4i16 = srem <4 x i16> undef, <i16 4, i16 5, i16 6, i16 7>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8i16 = srem <8 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V16i16 = srem <16 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 224 for instruction: %V32i16 = srem <32 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = srem i8 undef, 7
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V2i8 = srem <2 x i8> undef, <i8 4, i8 5>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4i8 = srem <4 x i8> undef, <i8 4, i8 5, i8 6, i8 7>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8i8 = srem <8 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V16i8 = srem <16 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 224 for instruction: %V32i8 = srem <32 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 448 for instruction: %V64i8 = srem <64 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
+  %I128 = srem i128 undef, 7
+
   %I64 = srem i64 undef, 7
   %V2i64 = srem <2 x i64> undef, <i64 6, i64 7>
   %V4i64 = srem <4 x i64> undef, <i64 4, i64 5, i64 6, i64 7>
   %V8i64 = srem <8 x i64> undef, <i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11>
 
   %I32 = srem i32 undef, 7
+  %V2i32 = srem <2 x i32> undef, <i32 4, i32 5>
   %V4i32 = srem <4 x i32> undef, <i32 4, i32 5, i32 6, i32 7>
   %V8i32 = srem <8 x i32> undef, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
   %V16i32 = srem <16 x i32> undef, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19>
 
   %I16 = srem i16 undef, 7
+  %V2i16 = srem <2 x i16> undef, <i16 4, i16 5>
+  %V4i16 = srem <4 x i16> undef, <i16 4, i16 5, i16 6, i16 7>
   %V8i16 = srem <8 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11>
   %V16i16 = srem <16 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
   %V32i16 = srem <32 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
 
   %I8 = srem i8 undef, 7
+  %V2i8 = srem <2 x i8> undef, <i8 4, i8 5>
+  %V4i8 = srem <4 x i8> undef, <i8 4, i8 5, i8 6, i8 7>
+  %V8i8 = srem <8 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11>
   %V16i8 = srem <16 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
   %V32i8 = srem <32 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
   %V64i8 = srem <64 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
@@ -134,40 +179,56 @@ define i32 @srem_const() {
 
 define i32 @urem_const() {
 ; CHECK-LABEL: 'urem_const'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %I128 = urem i128 undef, 7
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %I64 = urem i64 undef, 7
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V2i64 = urem <2 x i64> undef, <i64 6, i64 7>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4i64 = urem <4 x i64> undef, <i64 4, i64 5, i64 6, i64 7>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8i64 = urem <8 x i64> undef, <i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = urem i32 undef, 7
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V2i32 = urem <2 x i32> undef, <i32 4, i32 5>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4i32 = urem <4 x i32> undef, <i32 4, i32 5, i32 6, i32 7>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8i32 = urem <8 x i32> undef, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V16i32 = urem <16 x i32> undef, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I16 = urem i16 undef, 7
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V2i16 = urem <2 x i16> undef, <i16 4, i16 5>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4i16 = urem <4 x i16> undef, <i16 4, i16 5, i16 6, i16 7>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8i16 = urem <8 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V16i16 = urem <16 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 224 for instruction: %V32i16 = urem <32 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = urem i8 undef, 7
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V2i8 = urem <2 x i8> undef, <i8 4, i8 5>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4i8 = urem <4 x i8> undef, <i8 4, i8 5, i8 6, i8 7>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8i8 = urem <8 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V16i8 = urem <16 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 224 for instruction: %V32i8 = urem <32 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 448 for instruction: %V64i8 = urem <64 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
+
+  %I128 = urem i128 undef, 7
+
   %I64 = urem i64 undef, 7
   %V2i64 = urem <2 x i64> undef, <i64 6, i64 7>
   %V4i64 = urem <4 x i64> undef, <i64 4, i64 5, i64 6, i64 7>
   %V8i64 = urem <8 x i64> undef, <i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11>
 
   %I32 = urem i32 undef, 7
+  %V2i32 = urem <2 x i32> undef, <i32 4, i32 5>
   %V4i32 = urem <4 x i32> undef, <i32 4, i32 5, i32 6, i32 7>
   %V8i32 = urem <8 x i32> undef, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
   %V16i32 = urem <16 x i32> undef, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19>
 
   %I16 = urem i16 undef, 7
+  %V2i16 = urem <2 x i16> undef, <i16 4, i16 5>
+  %V4i16 = urem <4 x i16> undef, <i16 4, i16 5, i16 6, i16 7>
   %V8i16 = urem <8 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11>
   %V16i16 = urem <16 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
   %V32i16 = urem <32 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
 
   %I8 = urem i8 undef, 7
+  %V2i8 = urem <2 x i8> undef, <i8 4, i8 5>
+  %V4i8 = urem <4 x i8> undef, <i8 4, i8 5, i8 6, i8 7>
+  %V8i8 = urem <8 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11>
   %V16i8 = urem <16 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
   %V32i8 = urem <32 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
   %V64i8 = urem <64 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
@@ -177,40 +238,55 @@ define i32 @urem_const() {
 
 define i32 @srem_uniformconst() {
 ; CHECK-LABEL: 'srem_uniformconst'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %I128 = srem i128 undef, 7
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %I64 = srem i64 undef, 7
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V2i64 = srem <2 x i64> undef, splat (i64 7)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V4i64 = srem <4 x i64> undef, splat (i64 7)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 104 for instruction: %V8i64 = srem <8 x i64> undef, splat (i64 7)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = srem i32 undef, 7
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V2i32 = srem <2 x i32> undef, splat (i32 7)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4i32 = srem <4 x i32> undef, splat (i32 7)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8i32 = srem <8 x i32> undef, splat (i32 7)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V16i32 = srem <16 x i32> undef, splat (i32 7)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I16 = srem i16 undef, 7
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V2i16 = srem <2 x i16> undef, splat (i16 7)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4i16 = srem <4 x i16> undef, splat (i16 7)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8i16 = srem <8 x i16> undef, splat (i16 7)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V16i16 = srem <16 x i16> undef, splat (i16 7)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 224 for instruction: %V32i16 = srem <32 x i16> undef, splat (i16 7)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = srem i8 undef, 7
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V2i8 = srem <2 x i8> undef, splat (i8 7)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4i8 = srem <4 x i8> undef, splat (i8 7)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8i8 = srem <8 x i8> undef, splat (i8 7)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V16i8 = srem <16 x i8> undef, splat (i8 7)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 224 for instruction: %V32i8 = srem <32 x i8> undef, splat (i8 7)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 448 for instruction: %V64i8 = srem <64 x i8> undef, splat (i8 7)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
+  %I128 = srem i128 undef, 7
+
   %I64 = srem i64 undef, 7
   %V2i64 = srem <2 x i64> undef, <i64 7, i64 7>
   %V4i64 = srem <4 x i64> undef, <i64 7, i64 7, i64 7, i64 7>
   %V8i64 = srem <8 x i64> undef, <i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7>
 
   %I32 = srem i32 undef, 7
+  %V2i32 = srem <2 x i32> undef, <i32 7, i32 7>
   %V4i32 = srem <4 x i32> undef, <i32 7, i32 7, i32 7, i32 7>
   %V8i32 = srem <8 x i32> undef, <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
   %V16i32 = srem <16 x i32> undef, <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
 
   %I16 = srem i16 undef, 7
+  %V2i16 = srem <2 x i16> undef, <i16 7, i16 7>
+  %V4i16 = srem <4 x i16> undef, <i16 7, i16 7, i16 7, i16 7>
   %V8i16 = srem <8 x i16> undef, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
   %V16i16 = srem <16 x i16> undef, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
   %V32i16 = srem <32 x i16> undef, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
 
   %I8 = srem i8 undef, 7
+  %V2i8 = srem <2 x i8> undef, <i8 7, i8 7>
+  %V4i8 = srem <4 x i8> undef, <i8 7, i8 7, i8 7, i8 7>
+  %V8i8 = srem <8 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
   %V16i8 = srem <16 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
   %V32i8 = srem <32 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
   %V64i8 = srem <64 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
@@ -220,40 +296,55 @@ define i32 @srem_uniformconst() {
 
 define i32 @urem_uniformconst() {
 ; CHECK-LABEL: 'urem_uniformconst'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %I128 = urem i128 undef, 7
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %I64 = urem i64 undef, 7
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V2i64 = urem <2 x i64> undef, splat (i64 7)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V4i64 = urem <4 x i64> undef, splat (i64 7)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 104 for instruction: %V8i64 = urem <8 x i64> undef, splat (i64 7)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = urem i32 undef, 7
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V2i32 = urem <2 x i32> undef, splat (i32 7)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4i32 = urem <4 x i32> undef, splat (i32 7)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8i32 = urem <8 x i32> undef, splat (i32 7)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V16i32 = urem <16 x i32> undef, splat (i32 7)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I16 = urem i16 undef, 7
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V2i16 = urem <2 x i16> undef, splat (i16 7)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4i16 = urem <4 x i16> undef, splat (i16 7)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8i16 = urem <8 x i16> undef, splat (i16 7)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V16i16 = urem <16 x i16> undef, splat (i16 7)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 224 for instruction: %V32i16 = urem <32 x i16> undef, splat (i16 7)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = urem i8 undef, 7
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V2i8 = urem <2 x i8> undef, splat (i8 7)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4i8 = urem <4 x i8> undef, splat (i8 7)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8i8 = urem <8 x i8> undef, splat (i8 7)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V16i8 = urem <16 x i8> undef, splat (i8 7)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 224 for instruction: %V32i8 = urem <32 x i8> undef, splat (i8 7)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 448 for instruction: %V64i8 = urem <64 x i8> undef, splat (i8 7)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
+  %I128 = urem i128 undef, 7
+
   %I64 = urem i64 undef, 7
   %V2i64 = urem <2 x i64> undef, <i64 7, i64 7>
   %V4i64 = urem <4 x i64> undef, <i64 7, i64 7, i64 7, i64 7>
   %V8i64 = urem <8 x i64> undef, <i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7>
 
   %I32 = urem i32 undef, 7
+  %V2i32 = urem <2 x i32> undef, <i32 7, i32 7>
   %V4i32 = urem <4 x i32> undef, <i32 7, i32 7, i32 7, i32 7>
   %V8i32 = urem <8 x i32> undef, <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
   %V16i32 = urem <16 x i32> undef, <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
 
   %I16 = urem i16 undef, 7
+  %V2i16 = urem <2 x i16> undef, <i16 7, i16 7>
+  %V4i16 = urem <4 x i16> undef, <i16 7, i16 7, i16 7, i16 7>
   %V8i16 = urem <8 x i16> undef, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
   %V16i16 = urem <16 x i16> undef, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
   %V32i16 = urem <32 x i16> undef, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
 
   %I8 = urem i8 undef, 7
+  %V2i8 = urem <2 x i8> undef, <i8 7, i8 7>
+  %V4i8 = urem <4 x i8> undef, <i8 7, i8 7, i8 7, i8 7>
+  %V8i8 = urem <8 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
   %V16i8 = urem <16 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
   %V32i8 = urem <32 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
   %V64i8 = urem <64 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
@@ -263,40 +354,55 @@ define i32 @urem_uniformconst() {
 
 define i32 @srem_constpow2() {
 ; CHECK-LABEL: 'srem_constpow2'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %I128 = srem i128 undef, 16
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I64 = srem i64 undef, 16
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V2i64 = srem <2 x i64> undef, <i64 8, i64 16>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4i64 = srem <4 x i64> undef, <i64 2, i64 4, i64 8, i64 16>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8i64 = srem <8 x i64> undef, <i64 2, i64 4, i64 8, i64 16, i64 32, i64 64, i64 128, i64 256>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I32 = srem i32 undef, 16
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V2i32 = srem <2 x i32> undef, <i32 2, i32 4>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4i32 = srem <4 x i32> undef, <i32 2, i32 4, i32 8, i32 16>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8i32 = srem <8 x i32> undef, <i32 2, i32 4, i32 8, i32 16, i32 32, i32 64, i32 128, i32 256>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V16i32 = srem <16 x i32> undef, <i32 2, i32 4, i32 8, i32 16, i32 32, i32 64, i32 128, i32 256, i32 2, i32 4, i32 8, i32 16, i32 32, i32 64, i32 128, i32 256>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I16 = srem i16 undef, 16
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V2i16 = srem <2 x i16> undef, <i16 2, i16 4>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4i16 = srem <4 x i16> undef, <i16 2, i16 4, i16 8, i16 16>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8i16 = srem <8 x i16> undef, <i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V16i16 = srem <16 x i16> undef, <i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 224 for instruction: %V32i16 = srem <32 x i16> undef, <i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I8 = srem i8 undef, 16
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V2i8 = srem <2 x i8> undef, <i8 2, i8 4>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4i8 = srem <4 x i8> undef, <i8 2, i8 4, i8 8, i8 16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8i8 = srem <8 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V16i8 = srem <16 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 224 for instruction: %V32i8 = srem <32 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 448 for instruction: %V64i8 = srem <64 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
+  %I128 = srem i128 undef, 16
+
   %I64 = srem i64 undef, 16
   %V2i64 = srem <2 x i64> undef, <i64 8, i64 16>
   %V4i64 = srem <4 x i64> undef, <i64 2, i64 4, i64 8, i64 16>
   %V8i64 = srem <8 x i64> undef, <i64 2, i64 4, i64 8, i64 16, i64 32, i64 64, i64 128, i64 256>
 
   %I32 = srem i32 undef, 16
+  %V2i32 = srem <2 x i32> undef, <i32 2, i32 4>
   %V4i32 = srem <4 x i32> undef, <i32 2, i32 4, i32 8, i32 16>
   %V8i32 = srem <8 x i32> undef, <i32 2, i32 4, i32 8, i32 16, i32 32, i32 64, i32 128, i32 256>
   %V16i32 = srem <16 x i32> undef, <i32 2, i32 4, i32 8, i32 16, i32 32, i32 64, i32 128, i32 256, i32 2, i32 4, i32 8, i32 16, i32 32, i32 64, i32 128, i32 256>
 
   %I16 = srem i16 undef, 16
+  %V2i16 = srem <2 x i16> undef, <i16 2, i16 4>
+  %V4i16 = srem <4 x i16> undef, <i16 2, i16 4, i16 8, i16 16>
   %V8i16 = srem <8 x i16> undef, <i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256>
   %V16i16 = srem <16 x i16> undef, <i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256>
   %V32i16 = srem <32 x i16> undef, <i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256>
 
   %I8 = srem i8 undef, 16
+  %V2i8 = srem <2 x i8> undef, <i8 2, i8 4>
+  %V4i8 = srem <4 x i8> undef, <i8 2, i8 4, i8 8, i8 16>
+  %V8i8 = srem <8 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
   %V16i8 = srem <16 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
   %V32i8 = srem <32 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
   %V64i8 = srem <64 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
@@ -306,40 +412,55 @@ define i32 @srem_constpow2() {
 
 define i32 @urem_constpow2() {
 ; CHECK-LABEL: 'urem_constpow2'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %I128 = urem i128 undef, 16
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %I64 = urem i64 undef, 16
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V2i64 = urem <2 x i64> undef, <i64 8, i64 16>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4i64 = urem <4 x i64> undef, <i64 2, i64 4, i64 8, i64 16>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8i64 = urem <8 x i64> undef, <i64 2, i64 4, i64 8, i64 16, i64 32, i64 64, i64 128, i64 256>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = urem i32 undef, 16
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V2i32 = urem <2 x i32> undef, <i32 2, i32 4>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4i32 = urem <4 x i32> undef, <i32 2, i32 4, i32 8, i32 16>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8i32 = urem <8 x i32> undef, <i32 2, i32 4, i32 8, i32 16, i32 32, i32 64, i32 128, i32 256>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V16i32 = urem <16 x i32> undef, <i32 2, i32 4, i32 8, i32 16, i32 32, i32 64, i32 128, i32 256, i32 2, i32 4, i32 8, i32 16, i32 32, i32 64, i32 128, i32 256>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I16 = urem i16 undef, 16
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V2i16 = urem <2 x i16> undef, <i16 2, i16 4>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4i16 = urem <4 x i16> undef, <i16 2, i16 4, i16 8, i16 16>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8i16 = urem <8 x i16> undef, <i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V16i16 = urem <16 x i16> undef, <i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 224 for instruction: %V32i16 = urem <32 x i16> undef, <i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = urem i8 undef, 16
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V2i8 = urem <2 x i8> undef, <i8 2, i8 4>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4i8 = urem <4 x i8> undef, <i8 2, i8 4, i8 8, i8 16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8i8 = urem <8 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V16i8 = urem <16 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 224 for instruction: %V32i8 = urem <32 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 448 for instruction: %V64i8 = urem <64 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
+  %I128 = urem i128 undef, 16
+
   %I64 = urem i64 undef, 16
   %V2i64 = urem <2 x i64> undef, <i64 8, i64 16>
   %V4i64 = urem <4 x i64> undef, <i64 2, i64 4, i64 8, i64 16>
   %V8i64 = urem <8 x i64> undef, <i64 2, i64 4, i64 8, i64 16, i64 32, i64 64, i64 128, i64 256>
 
   %I32 = urem i32 undef, 16
+  %V2i32 = urem <2 x i32> undef, <i32 2, i32 4>
   %V4i32 = urem <4 x i32> undef, <i32 2, i32 4, i32 8, i32 16>
   %V8i32 = urem <8 x i32> undef, <i32 2, i32 4, i32 8, i32 16, i32 32, i32 64, i32 128, i32 256>
   %V16i32 = urem <16 x i32> undef, <i32 2, i32 4, i32 8, i32 16, i32 32, i32 64, i32 128, i32 256, i32 2, i32 4, i32 8, i32 16, i32 32, i32 64, i32 128, i32 256>
 
   %I16 = urem i16 undef, 16
+  %V2i16 = urem <2 x i16> undef, <i16 2, i16 4>
+  %V4i16 = urem <4 x i16> undef, <i16 2, i16 4, i16 8, i16 16>
   %V8i16 = urem <8 x i16> undef, <i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256>
   %V16i16 = urem <16 x i16> undef, <i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256>
   %V32i16 = urem <32 x i16> undef, <i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256>
 
   %I8 = urem i8 undef, 16
+  %V2i8 = urem <2 x i8> undef, <i8 2, i8 4>
+  %V4i8 = urem <4 x i8> undef, <i8 2, i8 4, i8 8, i8 16>
+  %V8i8 = urem <8 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
   %V16i8 = urem <16 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
   %V32i8 = urem <32 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
   %V64i8 = urem <64 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
@@ -349,40 +470,55 @@ define i32 @urem_constpow2() {
 
 define i32 @srem_uniformconstpow2() {
 ; CHECK-LABEL: 'srem_uniformconstpow2'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %I128 = srem i128 undef, 16
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I64 = srem i64 undef, 16
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V2i64 = srem <2 x i64> undef, splat (i64 16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %V4i64 = srem <4 x i64> undef, splat (i64 16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 88 for instruction: %V8i64 = srem <8 x i64> undef, splat (i64 16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I32 = srem i32 undef, 16
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V2i32 = srem <2 x i32> undef, splat (i32 16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %V4i32 = srem <4 x i32> undef, splat (i32 16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 88 for instruction: %V8i32 = srem <8 x i32> undef, splat (i32 16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 176 for instruction: %V16i32 = srem <16 x i32> undef, splat (i32 16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I16 = srem i16 undef, 16
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V2i16 = srem <2 x i16> undef, splat (i16 16)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %V4i16 = srem <4 x i16> undef, splat (i16 16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 88 for instruction: %V8i16 = srem <8 x i16> undef, splat (i16 16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 176 for instruction: %V16i16 = srem <16 x i16> undef, splat (i16 16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 352 for instruction: %V32i16 = srem <32 x i16> undef, splat (i16 16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I8 = srem i8 undef, 16
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V2i8 = srem <2 x i8> undef, splat (i8 16)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %V4i8 = srem <4 x i8> undef, splat (i8 16)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 88 for instruction: %V8i8 = srem <8 x i8> undef, splat (i8 16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 176 for instruction: %V16i8 = srem <16 x i8> undef, splat (i8 16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 352 for instruction: %V32i8 = srem <32 x i8> undef, splat (i8 16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 704 for instruction: %V64i8 = srem <64 x i8> undef, splat (i8 16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
+  %I128 = srem i128 undef, 16
+
   %I64 = srem i64 undef, 16
   %V2i64 = srem <2 x i64> undef, <i64 16, i64 16>
   %V4i64 = srem <4 x i64> undef, <i64 16, i64 16, i64 16, i64 16>
   %V8i64 = srem <8 x i64> undef, <i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16>
 
   %I32 = srem i32 undef, 16
+  %V2i32 = srem <2 x i32> undef, <i32 16, i32 16>
   %V4i32 = srem <4 x i32> undef, <i32 16, i32 16, i32 16, i32 16>
   %V8i32 = srem <8 x i32> undef, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
   %V16i32 = srem <16 x i32> undef, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
 
   %I16 = srem i16 undef, 16
+  %V2i16 = srem <2 x i16> undef, <i16 16, i16 16>
+  %V4i16 = srem <4 x i16> undef, <i16 16, i16 16, i16 16, i16 16>
   %V8i16 = srem <8 x i16> undef, <i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16>
   %V16i16 = srem <16 x i16> undef, <i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16>
   %V32i16 = srem <32 x i16> undef, <i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16>
 
   %I8 = srem i8 undef, 16
+  %V2i8 = srem <2 x i8> undef, <i8 16, i8 16>
+  %V4i8 = srem <4 x i8> undef, <i8 16, i8 16, i8 16, i8 16>
+  %V8i8 = srem <8 x i8> undef, <i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16>
   %V16i8 = srem <16 x i8> undef, <i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16>
   %V32i8 = srem <32 x i8> undef, <i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16>
   %V64i8 = srem <64 x i8> undef, <i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16>
@@ -392,40 +528,55 @@ define i32 @srem_uniformconstpow2() {
 
 define i32 @urem_uniformconstpow2() {
 ; CHECK-LABEL: 'urem_uniformconstpow2'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %I128 = urem i128 undef, 16
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %I64 = urem i64 undef, 16
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V2i64 = urem <2 x i64> undef, splat (i64 16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V4i64 = urem <4 x i64> undef, splat (i64 16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 104 for instruction: %V8i64 = urem <8 x i64> undef, splat (i64 16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = urem i32 undef, 16
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V2i32 = urem <2 x i32> undef, splat (i32 16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4i32 = urem <4 x i32> undef, splat (i32 16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8i32 = urem <8 x i32> undef, splat (i32 16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V16i32 = urem <16 x i32> undef, splat (i32 16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I16 = urem i16 undef, 16
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V2i16 = urem <2 x i16> undef, splat (i16 16)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4i16 = urem <4 x i16> undef, splat (i16 16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8i16 = urem <8 x i16> undef, splat (i16 16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V16i16 = urem <16 x i16> undef, splat (i16 16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 224 for instruction: %V32i16 = urem <32 x i16> undef, splat (i16 16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = urem i8 undef, 16
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V2i8 = urem <2 x i8> undef, splat (i8 16)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4i8 = urem <4 x i8> undef, splat (i8 16)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8i8 = urem <8 x i8> undef, splat (i8 16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V16i8 = urem <16 x i8> undef, splat (i8 16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 224 for instruction: %V32i8 = urem <32 x i8> undef, splat (i8 16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 448 for instruction: %V64i8 = urem <64 x i8> undef, splat (i8 16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
+  %I128 = urem i128 undef, 16
+
   %I64 = urem i64 undef, 16
   %V2i64 = urem <2 x i64> undef, <i64 16, i64 16>
   %V4i64 = urem <4 x i64> undef, <i64 16, i64 16, i64 16, i64 16>
   %V8i64 = urem <8 x i64> undef, <i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16>
 
   %I32 = urem i32 undef, 16
+  %V2i32 = urem <2 x i32> undef, <i32 16, i32 16>
   %V4i32 = urem <4 x i32> undef, <i32 16, i32 16, i32 16, i32 16>
   %V8i32 = urem <8 x i32> undef, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
   %V16i32 = urem <16 x i32> undef, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
 
   %I16 = urem i16 undef, 16
+  %V2i16 = urem <2 x i16> undef, <i16 16, i16 16>
+  %V4i16 = urem <4 x i16> undef, <i16 16, i16 16, i16 16, i16 16>
   %V8i16 = urem <8 x i16> undef, <i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16>
   %V16i16 = urem <16 x i16> undef, <i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16>
   %V32i16 = urem <32 x i16> undef, <i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16>
 
   %I8 = urem i8 undef, 16
+  %V2i8 = urem <2 x i8> undef, <i8 16, i8 16>
+  %V4i8 = urem <4 x i8> undef, <i8 16, i8 16, i8 16, i8 16>
+  %V8i8 = urem <8 x i8> undef, <i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16>
   %V16i8 = urem <16 x i8> undef, <i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16>
   %V32i8 = urem <32 x i8> undef, <i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16>
   %V64i8 = urem <64 x i8> undef, <i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16>
@@ -435,40 +586,55 @@ define i32 @urem_uniformconstpow2() {
 
 define i32 @srem_constnegpow2() {
 ; CHECK-LABEL: 'srem_constnegpow2'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %I128 = srem i128 undef, -16
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %I64 = srem i64 undef, -16
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V2i64 = srem <2 x i64> undef, <i64 -8, i64 -16>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4i64 = srem <4 x i64> undef, <i64 -2, i64 -4, i64 -8, i64 -16>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8i64 = srem <8 x i64> undef, <i64 -2, i64 -4, i64 -8, i64 -16, i64 -32, i64 -64, i64 -128, i64 -256>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = srem i32 undef, -16
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V2i32 = srem <2 x i32> undef, <i32 -2, i32 -4>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4i32 = srem <4 x i32> undef, <i32 -2, i32 -4, i32 -8, i32 -16>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8i32 = srem <8 x i32> undef, <i32 -2, i32 -4, i32 -8, i32 -16, i32 -32, i32 -64, i32 -128, i32 -256>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V16i32 = srem <16 x i32> undef, <i32 -2, i32 -4, i32 -8, i32 -16, i32 -32, i32 -64, i32 -128, i32 -256, i32 -2, i32 -4, i32 -8, i32 -16, i32 -32, i32 -64, i32 -128, i32 -256>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I16 = srem i16 undef, -16
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V2i16 = srem <2 x i16> undef, <i16 -2, i16 -4>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4i16 = srem <4 x i16> undef, <i16 -2, i16 -4, i16 -8, i16 -16>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8i16 = srem <8 x i16> undef, <i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V16i16 = srem <16 x i16> undef, <i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256, i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 224 for instruction: %V32i16 = srem <32 x i16> undef, <i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256, i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256, i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256, i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = srem i8 undef, -16
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V2i8 = srem <2 x i8> undef, <i8 -2, i8 -4>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4i8 = srem <4 x i8> undef, <i8 -2, i8 -4, i8 -8, i8 -16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8i8 = srem <8 x i8> undef, <i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V16i8 = srem <16 x i8> undef, <i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 224 for instruction: %V32i8 = srem <32 x i8> undef, <i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 448 for instruction: %V64i8 = srem <64 x i8> undef, <i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
+  %I128 = srem i128 undef, -16
+
   %I64 = srem i64 undef, -16
   %V2i64 = srem <2 x i64> undef, <i64 -8, i64 -16>
   %V4i64 = srem <4 x i64> undef, <i64 -2, i64 -4, i64 -8, i64 -16>
   %V8i64 = srem <8 x i64> undef, <i64 -2, i64 -4, i64 -8, i64 -16, i64 -32, i64 -64, i64 -128, i64 -256>
 
   %I32 = srem i32 undef, -16
+  %V2i32 = srem <2 x i32> undef, <i32 -2, i32 -4>
   %V4i32 = srem <4 x i32> undef, <i32 -2, i32 -4, i32 -8, i32 -16>
   %V8i32 = srem <8 x i32> undef, <i32 -2, i32 -4, i32 -8, i32 -16, i32 -32, i32 -64, i32 -128, i32 -256>
   %V16i32 = srem <16 x i32> undef, <i32 -2, i32 -4, i32 -8, i32 -16, i32 -32, i32 -64, i32 -128, i32 -256, i32 -2, i32 -4, i32 -8, i32 -16, i32 -32, i32 -64, i32 -128, i32 -256>
 
   %I16 = srem i16 undef, -16
+  %V2i16 = srem <2 x i16> undef, <i16 -2, i16 -4>
+  %V4i16 = srem <4 x i16> undef, <i16 -2, i16 -4, i16 -8, i16 -16>
   %V8i16 = srem <8 x i16> undef, <i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256>
   %V16i16 = srem <16 x i16> undef, <i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256, i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256>
   %V32i16 = srem <32 x i16> undef, <i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256, i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256, i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256, i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256>
 
   %I8 = srem i8 undef, -16
+  %V2i8 = srem <2 x i8> undef, <i8 -2, i8 -4>
+  %V4i8 = srem <4 x i8> undef, <i8 -2, i8 -4, i8 -8, i8 -16>
+  %V8i8 = srem <8 x i8> undef, <i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16>
   %V16i8 = srem <16 x i8> undef, <i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16>
   %V32i8 = srem <32 x i8> undef, <i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16>
   %V64i8 = srem <64 x i8> undef, <i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16>
@@ -478,40 +644,55 @@ define i32 @srem_constnegpow2() {
 
 define i32 @urem_constnegpow2() {
 ; CHECK-LABEL: 'urem_constnegpow2'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %I128 = urem i128 undef, -16
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %I64 = urem i64 undef, -16
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V2i64 = urem <2 x i64> undef, <i64 -8, i64 -16>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4i64 = urem <4 x i64> undef, <i64 -2, i64 -4, i64 -8, i64 -16>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8i64 = urem <8 x i64> undef, <i64 -2, i64 -4, i64 -8, i64 -16, i64 -32, i64 -64, i64 -128, i64 -256>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = urem i32 undef, -16
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V2i32 = urem <2 x i32> undef, <i32 -2, i32 -4>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4i32 = urem <4 x i32> undef, <i32 -2, i32 -4, i32 -8, i32 -16>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8i32 = urem <8 x i32> undef, <i32 -2, i32 -4, i32 -8, i32 -16, i32 -32, i32 -64, i32 -128, i32 -256>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V16i32 = urem <16 x i32> undef, <i32 -2, i32 -4, i32 -8, i32 -16, i32 -32, i32 -64, i32 -128, i32 -256, i32 -2, i32 -4, i32 -8, i32 -16, i32 -32, i32 -64, i32 -128, i32 -256>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I16 = urem i16 undef, -16
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V2i16 = urem <2 x i16> undef, <i16 -2, i16 -4>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4i16 = urem <4 x i16> undef, <i16 -2, i16 -4, i16 -8, i16 -16>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8i16 = urem <8 x i16> undef, <i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V16i16 = urem <16 x i16> undef, <i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256, i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 224 for instruction: %V32i16 = urem <32 x i16> undef, <i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256, i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256, i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256, i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = urem i8 undef, -16
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V2i8 = urem <2 x i8> undef, <i8 -2, i8 -4>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4i8 = urem <4 x i8> undef, <i8 -2, i8 -4, i8 -8, i8 -16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8i8 = urem <8 x i8> undef, <i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V16i8 = urem <16 x i8> undef, <i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 224 for instruction: %V32i8 = urem <32 x i8> undef, <i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 448 for instruction: %V64i8 = urem <64 x i8> undef, <i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
+  %I128 = urem i128 undef, -16
+
   %I64 = urem i64 undef, -16
   %V2i64 = urem <2 x i64> undef, <i64 -8, i64 -16>
   %V4i64 = urem <4 x i64> undef, <i64 -2, i64 -4, i64 -8, i64 -16>
   %V8i64 = urem <8 x i64> undef, <i64 -2, i64 -4, i64 -8, i64 -16, i64 -32, i64 -64, i64 -128, i64 -256>
 
   %I32 = urem i32 undef, -16
+  %V2i32 = urem <2 x i32> undef, <i32 -2, i32 -4>
   %V4i32 = urem <4 x i32> undef, <i32 -2, i32 -4, i32 -8, i32 -16>
   %V8i32 = urem <8 x i32> undef, <i32 -2, i32 -4, i32 -8, i32 -16, i32 -32, i32 -64, i32 -128, i32 -256>
   %V16i32 = urem <16 x i32> undef, <i32 -2, i32 -4, i32 -8, i32 -16, i32 -32, i32 -64, i32 -128, i32 -256, i32 -2, i32 -4, i32 -8, i32 -16, i32 -32, i32 -64, i32 -128, i32 -256>
 
   %I16 = urem i16 undef, -16
+  %V2i16 = urem <2 x i16> undef, <i16 -2, i16 -4>
+  %V4i16 = urem <4 x i16> undef, <i16 -2, i16 -4, i16 -8, i16 -16>
   %V8i16 = urem <8 x i16> undef, <i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256>
   %V16i16 = urem <16 x i16> undef, <i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256, i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256>
   %V32i16 = urem <32 x i16> undef, <i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256, i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256, i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256, i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256>
 
   %I8 = urem i8 undef, -16
+  %V2i8 = urem <2 x i8> undef, <i8 -2, i8 -4>
+  %V4i8 = urem <4 x i8> undef, <i8 -2, i8 -4, i8 -8, i8 -16>
+  %V8i8 = urem <8 x i8> undef, <i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16>
   %V16i8 = urem <16 x i8> undef, <i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16>
   %V32i8 = urem <32 x i8> undef, <i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16>
   %V64i8 = urem <64 x i8> undef, <i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16>
@@ -521,40 +702,55 @@ define i32 @urem_constnegpow2() {
 
 define i32 @srem_uniformconstnegpow2() {
 ; CHECK-LABEL: 'srem_uniformconstnegpow2'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %I128 = srem i128 undef, -16
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %I64 = srem i64 undef, -16
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V2i64 = srem <2 x i64> undef, splat (i64 -16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V4i64 = srem <4 x i64> undef, splat (i64 -16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 104 for instruction: %V8i64 = srem <8 x i64> undef, splat (i64 -16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = srem i32 undef, -16
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V2i32 = srem <2 x i32> undef, splat (i32 -16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4i32 = srem <4 x i32> undef, splat (i32 -16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8i32 = srem <8 x i32> undef, splat (i32 -16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V16i32 = srem <16 x i32> undef, splat (i32 -16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I16 = srem i16 undef, -16
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V2i16 = srem <2 x i16> undef, splat (i16 -16)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4i16 = srem <4 x i16> undef, splat (i16 -16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8i16 = srem <8 x i16> undef, splat (i16 -16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V16i16 = srem <16 x i16> undef, splat (i16 -16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 224 for instruction: %V32i16 = srem <32 x i16> undef, splat (i16 -16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = srem i8 undef, -16
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V2i8 = srem <2 x i8> undef, splat (i8 -16)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4i8 = srem <4 x i8> undef, splat (i8 -16)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8i8 = srem <8 x i8> undef, splat (i8 -16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V16i8 = srem <16 x i8> undef, splat (i8 -16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 224 for instruction: %V32i8 = srem <32 x i8> undef, splat (i8 -16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 448 for instruction: %V64i8 = srem <64 x i8> undef, splat (i8 -16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
+  %I128 = srem i128 undef, -16
+
   %I64 = srem i64 undef, -16
   %V2i64 = srem <2 x i64> undef, <i64 -16, i64 -16>
   %V4i64 = srem <4 x i64> undef, <i64 -16, i64 -16, i64 -16, i64 -16>
   %V8i64 = srem <8 x i64> undef, <i64 -16, i64 -16, i64 -16, i64 -16, i64 -16, i64 -16, i64 -16, i64 -16>
 
   %I32 = srem i32 undef, -16
+  %V2i32 = srem <2 x i32> undef, <i32 -16, i32 -16>
   %V4i32 = srem <4 x i32> undef, <i32 -16, i32 -16, i32 -16, i32 -16>
   %V8i32 = srem <8 x i32> undef, <i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16>
   %V16i32 = srem <16 x i32> undef, <i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16>
 
   %I16 = srem i16 undef, -16
+  %V2i16 = srem <2 x i16> undef, <i16 -16, i16 -16>
+  %V4i16 = srem <4 x i16> undef, <i16 -16, i16 -16, i16 -16, i16 -16>
   %V8i16 = srem <8 x i16> undef, <i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16>
   %V16i16 = srem <16 x i16> undef, <i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16>
   %V32i16 = srem <32 x i16> undef, <i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16>
 
   %I8 = srem i8 undef, -16
+  %V2i8 = srem <2 x i8> undef, <i8 -16, i8 -16>
+  %V4i8 = srem <4 x i8> undef, <i8 -16, i8 -16, i8 -16, i8 -16>
+  %V8i8 = srem <8 x i8> undef, <i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16>
   %V16i8 = srem <16 x i8> undef, <i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16>
   %V32i8 = srem <32 x i8> undef, <i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16>
   %V64i8 = srem <64 x i8> undef, <i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16>
@@ -564,40 +760,55 @@ define i32 @srem_uniformconstnegpow2() {
 
 define i32 @urem_uniformconstnegpow2() {
 ; CHECK-LABEL: 'urem_uniformconstnegpow2'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %I128 = urem i128 undef, -16
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %I64 = urem i64 undef, -16
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V2i64 = urem <2 x i64> undef, splat (i64 -16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V4i64 = urem <4 x i64> undef, splat (i64 -16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 104 for instruction: %V8i64 = urem <8 x i64> undef, splat (i64 -16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = urem i32 undef, -16
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V2i32 = urem <2 x i32> undef, splat (i32 -16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4i32 = urem <4 x i32> undef, splat (i32 -16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8i32 = urem <8 x i32> undef, splat (i32 -16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V16i32 = urem <16 x i32> undef, splat (i32 -16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I16 = urem i16 undef, -16
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V2i16 = urem <2 x i16> undef, splat (i16 -16)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4i16 = urem <4 x i16> undef, splat (i16 -16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8i16 = urem <8 x i16> undef, splat (i16 -16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V16i16 = urem <16 x i16> undef, splat (i16 -16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 224 for instruction: %V32i16 = urem <32 x i16> undef, splat (i16 -16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = urem i8 undef, -16
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V2i8 = urem <2 x i8> undef, splat (i8 -16)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4i8 = urem <4 x i8> undef, splat (i8 -16)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8i8 = urem <8 x i8> undef, splat (i8 -16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V16i8 = urem <16 x i8> undef, splat (i8 -16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 224 for instruction: %V32i8 = urem <32 x i8> undef, splat (i8 -16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 448 for instruction: %V64i8 = urem <64 x i8> undef, splat (i8 -16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
+  %I128 = urem i128 undef, -16
+
   %I64 = urem i64 undef, -16
   %V2i64 = urem <2 x i64> undef, <i64 -16, i64 -16>
   %V4i64 = urem <4 x i64> undef, <i64 -16, i64 -16, i64 -16, i64 -16>
   %V8i64 = urem <8 x i64> undef, <i64 -16, i64 -16, i64 -16, i64 -16, i64 -16, i64 -16, i64 -16, i64 -16>
 
   %I32 = urem i32 undef, -16
+  %V2i32 = urem <2 x i32> undef, <i32 -16, i32 -16>
   %V4i32 = urem <4 x i32> undef, <i32 -16, i32 -16, i32 -16, i32 -16>
   %V8i32 = urem <8 x i32> undef, <i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16>
   %V16i32 = urem <16 x i32> undef, <i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16>
 
   %I16 = urem i16 undef, -16
+  %V2i16 = urem <2 x i16> undef, <i16 -16, i16 -16>
+  %V4i16 = urem <4 x i16> undef, <i16 -16, i16 -16, i16 -16, i16 -16>
   %V8i16 = urem <8 x i16> undef, <i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16>
   %V16i16 = urem <16 x i16> undef, <i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16>
   %V32i16 = urem <32 x i16> undef, <i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16>
 
   %I8 = urem i8 undef, -16
+  %V2i8 = urem <2 x i8> undef, <i8 -16, i8 -16>
+  %V4i8 = urem <4 x i8> undef, <i8 -16, i8 -16, i8 -16, i8 -16>
+  %V8i8 = urem <8 x i8> undef, <i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16>
   %V16i8 = urem <16 x i8> undef, <i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16>
   %V32i8 = urem <32 x i8> undef, <i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16>
   %V64i8 = urem <64 x i8> undef, <i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16>
diff --git a/llvm/test/Analysis/CostModel/AArch64/shuffle-extract.ll b/llvm/test/Analysis/CostModel/AArch64/shuffle-extract.ll
new file mode 100644
index 0000000..b81b6a9
--- /dev/null
+++ b/llvm/test/Analysis/CostModel/AArch64/shuffle-extract.ll
@@ -0,0 +1,174 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
+; RUN: opt < %s -mtriple=aarch64--linux-gnu -passes="print<cost-model>" 2>&1 -disable-output | FileCheck %s
+
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+
+define void @extract_half() {
+; CHECK-LABEL: 'extract_half'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v2i8_lo = shufflevector <2 x i8> poison, <2 x i8> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i8_hi = shufflevector <2 x i8> poison, <2 x i8> poison, <1 x i32> <i32 1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v4i8_lo = shufflevector <4 x i8> poison, <4 x i8> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4i8_mi = shufflevector <4 x i8> poison, <4 x i8> poison, <2 x i32> <i32 1, i32 2>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4i8_hi = shufflevector <4 x i8> poison, <4 x i8> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v8i8_lo = shufflevector <8 x i8> poison, <8 x i8> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8i8_mi = shufflevector <8 x i8> poison, <8 x i8> poison, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8i8_hi = shufflevector <8 x i8> poison, <8 x i8> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v16i8_lo = shufflevector <16 x i8> poison, <16 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16i8_mi = shufflevector <16 x i8> poison, <16 x i8> poison, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i8_hi = shufflevector <16 x i8> poison, <16 x i8> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v2i16_lo = shufflevector <2 x i16> poison, <2 x i16> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i16_hi = shufflevector <2 x i16> poison, <2 x i16> poison, <1 x i32> <i32 1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v4i16_lo = shufflevector <4 x i16> poison, <4 x i16> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4i16_mi = shufflevector <4 x i16> poison, <4 x i16> poison, <2 x i32> <i32 1, i32 2>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4i16_hi = shufflevector <4 x i16> poison, <4 x i16> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v8i16_lo = shufflevector <8 x i16> poison, <8 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8i16_mi = shufflevector <8 x i16> poison, <8 x i16> poison, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i16_hi = shufflevector <8 x i16> poison, <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v16i16_lo = shufflevector <16 x i16> poison, <16 x i16> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16i16_mi = shufflevector <16 x i16> poison, <16 x i16> poison, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16i16_hi = shufflevector <16 x i16> poison, <16 x i16> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v2i32_lo = shufflevector <2 x i32> poison, <2 x i32> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i32_hi = shufflevector <2 x i32> poison, <2 x i32> poison, <1 x i32> <i32 1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v4i32_lo = shufflevector <4 x i32> poison, <4 x i32> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4i32_mi = shufflevector <4 x i32> poison, <4 x i32> poison, <2 x i32> <i32 1, i32 2>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i32_hi = shufflevector <4 x i32> poison, <4 x i32> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v8i32_lo = shufflevector <8 x i32> poison, <8 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v8i32_mi = shufflevector <8 x i32> poison, <8 x i32> poison, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v8i32_hi = shufflevector <8 x i32> poison, <8 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v16i32_lo = shufflevector <16 x i32> poison, <16 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v16i32_mi = shufflevector <16 x i32> poison, <16 x i32> poison, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v16i32_hi = shufflevector <16 x i32> poison, <16 x i32> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v2i64_lo = shufflevector <2 x i64> poison, <2 x i64> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i64_hi = shufflevector <2 x i64> poison, <2 x i64> poison, <1 x i32> <i32 1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v4i64_lo = shufflevector <4 x i64> poison, <4 x i64> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i64_mi = shufflevector <4 x i64> poison, <4 x i64> poison, <2 x i32> <i32 1, i32 2>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i64_hi = shufflevector <4 x i64> poison, <4 x i64> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v8i64_lo = shufflevector <8 x i64> poison, <8 x i64> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8i64_mi = shufflevector <8 x i64> poison, <8 x i64> poison, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8i64_hi = shufflevector <8 x i64> poison, <8 x i64> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+  %v2i8_lo = shufflevector <2 x i8> poison, <2 x i8> poison, <1 x i32> <i32 0>
+  %v2i8_hi = shufflevector <2 x i8> poison, <2 x i8> poison, <1 x i32> <i32 1>
+  %v4i8_lo = shufflevector <4 x i8> poison, <4 x i8> poison, <2 x i32> <i32 0, i32 1>
+  %v4i8_mi = shufflevector <4 x i8> poison, <4 x i8> poison, <2 x i32> <i32 1, i32 2>
+  %v4i8_hi = shufflevector <4 x i8> poison, <4 x i8> poison, <2 x i32> <i32 2, i32 3>
+  %v8i8_lo = shufflevector <8 x i8> poison, <8 x i8> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %v8i8_mi = shufflevector <8 x i8> poison, <8 x i8> poison, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+  %v8i8_hi = shufflevector <8 x i8> poison, <8 x i8> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %v16i8_lo = shufflevector <16 x i8> poison, <16 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %v16i8_mi = shufflevector <16 x i8> poison, <16 x i8> poison, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+  %v16i8_hi = shufflevector <16 x i8> poison, <16 x i8> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+
+  %v2i16_lo = shufflevector <2 x i16> poison, <2 x i16> poison, <1 x i32> <i32 0>
+  %v2i16_hi = shufflevector <2 x i16> poison, <2 x i16> poison, <1 x i32> <i32 1>
+  %v4i16_lo = shufflevector <4 x i16> poison, <4 x i16> poison, <2 x i32> <i32 0, i32 1>
+  %v4i16_mi = shufflevector <4 x i16> poison, <4 x i16> poison, <2 x i32> <i32 1, i32 2>
+  %v4i16_hi = shufflevector <4 x i16> poison, <4 x i16> poison, <2 x i32> <i32 2, i32 3>
+  %v8i16_lo = shufflevector <8 x i16> poison, <8 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %v8i16_mi = shufflevector <8 x i16> poison, <8 x i16> poison, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+  %v8i16_hi = shufflevector <8 x i16> poison, <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %v16i16_lo = shufflevector <16 x i16> poison, <16 x i16> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %v16i16_mi = shufflevector <16 x i16> poison, <16 x i16> poison, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+  %v16i16_hi = shufflevector <16 x i16> poison, <16 x i16> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+
+  %v2i32_lo = shufflevector <2 x i32> poison, <2 x i32> poison, <1 x i32> <i32 0>
+  %v2i32_hi = shufflevector <2 x i32> poison, <2 x i32> poison, <1 x i32> <i32 1>
+  %v4i32_lo = shufflevector <4 x i32> poison, <4 x i32> poison, <2 x i32> <i32 0, i32 1>
+  %v4i32_mi = shufflevector <4 x i32> poison, <4 x i32> poison, <2 x i32> <i32 1, i32 2>
+  %v4i32_hi = shufflevector <4 x i32> poison, <4 x i32> poison, <2 x i32> <i32 2, i32 3>
+  %v8i32_lo = shufflevector <8 x i32> poison, <8 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %v8i32_mi = shufflevector <8 x i32> poison, <8 x i32> poison, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+  %v8i32_hi = shufflevector <8 x i32> poison, <8 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %v16i32_lo = shufflevector <16 x i32> poison, <16 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %v16i32_mi = shufflevector <16 x i32> poison, <16 x i32> poison, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+  %v16i32_hi = shufflevector <16 x i32> poison, <16 x i32> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+
+  %v2i64_lo = shufflevector <2 x i64> poison, <2 x i64> poison, <1 x i32> <i32 0>
+  %v2i64_hi = shufflevector <2 x i64> poison, <2 x i64> poison, <1 x i32> <i32 1>
+  %v4i64_lo = shufflevector <4 x i64> poison, <4 x i64> poison, <2 x i32> <i32 0, i32 1>
+  %v4i64_mi = shufflevector <4 x i64> poison, <4 x i64> poison, <2 x i32> <i32 1, i32 2>
+  %v4i64_hi = shufflevector <4 x i64> poison, <4 x i64> poison, <2 x i32> <i32 2, i32 3>
+  %v8i64_lo = shufflevector <8 x i64> poison, <8 x i64> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %v8i64_mi = shufflevector <8 x i64> poison, <8 x i64> poison, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+  %v8i64_hi = shufflevector <8 x i64> poison, <8 x i64> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+
+  ret void
+}
+
+define void @extract_qtr() {
+; CHECK-LABEL: 'extract_qtr'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v4i8_lo = shufflevector <4 x i8> poison, <4 x i8> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4i8_mi = shufflevector <4 x i8> poison, <4 x i8> poison, <1 x i32> <i32 1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4i8_hi = shufflevector <4 x i8> poison, <4 x i8> poison, <1 x i32> <i32 2>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v8i8_lo = shufflevector <8 x i8> poison, <8 x i8> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8i8_mi = shufflevector <8 x i8> poison, <8 x i8> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8i8_hi = shufflevector <8 x i8> poison, <8 x i8> poison, <2 x i32> <i32 4, i32 5>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v16i8_lo = shufflevector <16 x i8> poison, <16 x i8> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16i8_mi = shufflevector <16 x i8> poison, <16 x i8> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16i8_hi = shufflevector <16 x i8> poison, <16 x i8> poison, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v4i16_lo = shufflevector <4 x i16> poison, <4 x i16> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4i16_mi = shufflevector <4 x i16> poison, <4 x i16> poison, <1 x i32> <i32 1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4i16_hi = shufflevector <4 x i16> poison, <4 x i16> poison, <1 x i32> <i32 2>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v8i16_lo = shufflevector <8 x i16> poison, <8 x i16> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8i16_mi = shufflevector <8 x i16> poison, <8 x i16> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8i16_hi = shufflevector <8 x i16> poison, <8 x i16> poison, <2 x i32> <i32 4, i32 5>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v16i16_lo = shufflevector <16 x i16> poison, <16 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i16_mi = shufflevector <16 x i16> poison, <16 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16i16_hi = shufflevector <16 x i16> poison, <16 x i16> poison, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v4i32_lo = shufflevector <4 x i32> poison, <4 x i32> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4i32_mi = shufflevector <4 x i32> poison, <4 x i32> poison, <1 x i32> <i32 1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4i32_hi = shufflevector <4 x i32> poison, <4 x i32> poison, <1 x i32> <i32 2>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v8i32_lo = shufflevector <8 x i32> poison, <8 x i32> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i32_mi = shufflevector <8 x i32> poison, <8 x i32> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v8i32_hi = shufflevector <8 x i32> poison, <8 x i32> poison, <2 x i32> <i32 4, i32 5>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v16i32_lo = shufflevector <16 x i32> poison, <16 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v16i32_mi = shufflevector <16 x i32> poison, <16 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v16i32_hi = shufflevector <16 x i32> poison, <16 x i32> poison, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v4i64_lo = shufflevector <4 x i64> poison, <4 x i64> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i64_mi = shufflevector <4 x i64> poison, <4 x i64> poison, <1 x i32> <i32 1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i64_hi = shufflevector <4 x i64> poison, <4 x i64> poison, <1 x i32> <i32 2>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v8i64_lo = shufflevector <8 x i64> poison, <8 x i64> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8i64_mi = shufflevector <8 x i64> poison, <8 x i64> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8i64_hi = shufflevector <8 x i64> poison, <8 x i64> poison, <2 x i32> <i32 4, i32 5>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+  %v4i8_lo = shufflevector <4 x i8> poison, <4 x i8> poison, <1 x i32> <i32 0>
+  %v4i8_mi = shufflevector <4 x i8> poison, <4 x i8> poison, <1 x i32> <i32 1>
+  %v4i8_hi = shufflevector <4 x i8> poison, <4 x i8> poison, <1 x i32> <i32 2>
+  %v8i8_lo = shufflevector <8 x i8> poison, <8 x i8> poison, <2 x i32> <i32 0, i32 1>
+  %v8i8_mi = shufflevector <8 x i8> poison, <8 x i8> poison, <2 x i32> <i32 2, i32 3>
+  %v8i8_hi = shufflevector <8 x i8> poison, <8 x i8> poison, <2 x i32> <i32 4, i32 5>
+  %v16i8_lo = shufflevector <16 x i8> poison, <16 x i8> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %v16i8_mi = shufflevector <16 x i8> poison, <16 x i8> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %v16i8_hi = shufflevector <16 x i8> poison, <16 x i8> poison, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+
+  %v4i16_lo = shufflevector <4 x i16> poison, <4 x i16> poison, <1 x i32> <i32 0>
+  %v4i16_mi = shufflevector <4 x i16> poison, <4 x i16> poison, <1 x i32> <i32 1>
+  %v4i16_hi = shufflevector <4 x i16> poison, <4 x i16> poison, <1 x i32> <i32 2>
+  %v8i16_lo = shufflevector <8 x i16> poison, <8 x i16> poison, <2 x i32> <i32 0, i32 1>
+  %v8i16_mi = shufflevector <8 x i16> poison, <8 x i16> poison, <2 x i32> <i32 2, i32 3>
+  %v8i16_hi = shufflevector <8 x i16> poison, <8 x i16> poison, <2 x i32> <i32 4, i32 5>
+  %v16i16_lo = shufflevector <16 x i16> poison, <16 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %v16i16_mi = shufflevector <16 x i16> poison, <16 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %v16i16_hi = shufflevector <16 x i16> poison, <16 x i16> poison, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+
+  %v4i32_lo = shufflevector <4 x i32> poison, <4 x i32> poison, <1 x i32> <i32 0>
+  %v4i32_mi = shufflevector <4 x i32> poison, <4 x i32> poison, <1 x i32> <i32 1>
+  %v4i32_hi = shufflevector <4 x i32> poison, <4 x i32> poison, <1 x i32> <i32 2>
+  %v8i32_lo = shufflevector <8 x i32> poison, <8 x i32> poison, <2 x i32> <i32 0, i32 1>
+  %v8i32_mi = shufflevector <8 x i32> poison, <8 x i32> poison, <2 x i32> <i32 2, i32 3>
+  %v8i32_hi = shufflevector <8 x i32> poison, <8 x i32> poison, <2 x i32> <i32 4, i32 5>
+  %v16i32_lo = shufflevector <16 x i32> poison, <16 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %v16i32_mi = shufflevector <16 x i32> poison, <16 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %v16i32_hi = shufflevector <16 x i32> poison, <16 x i32> poison, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+
+  %v4i64_lo = shufflevector <4 x i64> poison, <4 x i64> poison, <1 x i32> <i32 0>
+  %v4i64_mi = shufflevector <4 x i64> poison, <4 x i64> poison, <1 x i32> <i32 1>
+  %v4i64_hi = shufflevector <4 x i64> poison, <4 x i64> poison, <1 x i32> <i32 2>
+  %v8i64_lo = shufflevector <8 x i64> poison, <8 x i64> poison, <2 x i32> <i32 0, i32 1>
+  %v8i64_mi = shufflevector <8 x i64> poison, <8 x i64> poison, <2 x i32> <i32 2, i32 3>
+  %v8i64_hi = shufflevector <8 x i64> poison, <8 x i64> poison, <2 x i32> <i32 4, i32 5>
+
+  ret void
+}
diff --git a/llvm/test/Analysis/CostModel/AArch64/shuffle-select.ll b/llvm/test/Analysis/CostModel/AArch64/shuffle-select.ll
index 075397a..9c573c7 100644
--- a/llvm/test/Analysis/CostModel/AArch64/shuffle-select.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/shuffle-select.ll
@@ -1,97 +1,112 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 5
 ; RUN: opt < %s -mtriple=aarch64--linux-gnu -passes="print<cost-model>" 2>&1 -disable-output | FileCheck %s --check-prefix=COST
-; RUN: llc < %s -mtriple=aarch64--linux-gnu | FileCheck %s --check-prefix=CODE
 
 target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
 
-; COST-LABEL: sel.v8i8
-; COST:       Found an estimated cost of 28 for instruction: %tmp0 = shufflevector <8 x i8> %v0, <8 x i8> %v1, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
-; CODE-LABEL: sel.v8i8
-; CODE:       tbl v0.8b, { v0.16b }, v1.8b
-define <8 x i8> @sel.v8i8(<8 x i8> %v0, <8 x i8> %v1) {
+define <8 x i8> @sel_v8i8(<8 x i8> %v0, <8 x i8> %v1) {
+; COST-LABEL: 'sel_v8i8'
+; COST-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %tmp0 = shufflevector <8 x i8> %v0, <8 x i8> %v1, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
+; COST-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i8> %tmp0
+;
   %tmp0 = shufflevector <8 x i8> %v0, <8 x i8> %v1, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
   ret <8 x i8> %tmp0
 }
 
-; COST-LABEL: sel.v16i8
-; COST:       Found an estimated cost of 60 for instruction: %tmp0 = shufflevector <16 x i8> %v0, <16 x i8> %v1, <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 21, i32 6, i32 23, i32 8, i32 25, i32 10, i32 27, i32 12, i32 29, i32 14, i32 31>
-; CODE-LABEL: sel.v16i8
-; CODE:       tbl v0.16b, { v0.16b, v1.16b }, v2.16b
-define <16 x i8> @sel.v16i8(<16 x i8> %v0, <16 x i8> %v1) {
+define <16 x i8> @sel_v16i8(<16 x i8> %v0, <16 x i8> %v1) {
+; COST-LABEL: 'sel_v16i8'
+; COST-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %tmp0 = shufflevector <16 x i8> %v0, <16 x i8> %v1, <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 21, i32 6, i32 23, i32 8, i32 25, i32 10, i32 27, i32 12, i32 29, i32 14, i32 31>
+; COST-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i8> %tmp0
+;
   %tmp0 = shufflevector <16 x i8> %v0, <16 x i8> %v1, <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 21, i32 6, i32 23, i32 8, i32 25, i32 10, i32 27, i32 12, i32 29, i32 14, i32 31>
   ret <16 x i8> %tmp0
 }
 
-; COST-LABEL: sel.v4i16
-; COST:       Found an estimated cost of 2 for instruction: %tmp0 = shufflevector <4 x i16> %v0, <4 x i16> %v1, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
-; CODE-LABEL: sel.v4i16
-; CODE:       rev32 v0.4h, v0.4h
-; CODE:       trn2 v0.4h, v0.4h, v1.4h
-define <4 x i16> @sel.v4i16(<4 x i16> %v0, <4 x i16> %v1) {
+define <4 x i16> @sel_v4i16(<4 x i16> %v0, <4 x i16> %v1) {
+; COST-LABEL: 'sel_v4i16'
+; COST-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %tmp0 = shufflevector <4 x i16> %v0, <4 x i16> %v1, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+; COST-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i16> %tmp0
+;
   %tmp0 = shufflevector <4 x i16> %v0, <4 x i16> %v1, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
   ret <4 x i16> %tmp0
 }
 
-; COST-LABEL: sel.v8i16
-; COST:       Found an estimated cost of 28 for instruction: %tmp0 = shufflevector <8 x i16> %v0, <8 x i16> %v1, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
-; CODE-LABEL: sel.v8i16
-; CODE:       tbl v0.16b, { v0.16b, v1.16b }, v2.16b
-define <8 x i16> @sel.v8i16(<8 x i16> %v0, <8 x i16> %v1) {
+define <8 x i16> @sel_v8i16(<8 x i16> %v0, <8 x i16> %v1) {
+; COST-LABEL: 'sel_v8i16'
+; COST-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %tmp0 = shufflevector <8 x i16> %v0, <8 x i16> %v1, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
+; COST-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i16> %tmp0
+;
   %tmp0 = shufflevector <8 x i16> %v0, <8 x i16> %v1, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
   ret <8 x i16> %tmp0
 }
 
-; COST-LABEL: sel.v2i32
-; COST:        Found an estimated cost of 1 for instruction: %tmp0 = shufflevector <2 x i32> %v0, <2 x i32> %v1, <2 x i32> <i32 0, i32 3>
-; CODE-LABEL: sel.v2i32
-; CODE:       mov v0.s[1], v1.s[1]
-define <2 x i32> @sel.v2i32(<2 x i32> %v0, <2 x i32> %v1) {
+define <2 x i32> @sel_v2i32(<2 x i32> %v0, <2 x i32> %v1) {
+; COST-LABEL: 'sel_v2i32'
+; COST-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %tmp0 = shufflevector <2 x i32> %v0, <2 x i32> %v1, <2 x i32> <i32 0, i32 3>
+; COST-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i32> %tmp0
+;
   %tmp0 = shufflevector <2 x i32> %v0, <2 x i32> %v1, <2 x i32> <i32 0, i32 3>
   ret <2 x i32> %tmp0
 }
 
-; COST-LABEL: sel.v4i32
-; COST:       Found an estimated cost of 2 for instruction: %tmp0 = shufflevector <4 x i32> %v0, <4 x i32> %v1, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
-; CODE-LABEL: sel.v4i32
-; CODE:       rev64 v0.4s, v0.4s
-; CODE:       trn2 v0.4s, v0.4s, v1.4s
-define <4 x i32> @sel.v4i32(<4 x i32> %v0, <4 x i32> %v1) {
+define <4 x i32> @sel_v4i32(<4 x i32> %v0, <4 x i32> %v1) {
+; COST-LABEL: 'sel_v4i32'
+; COST-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %tmp0 = shufflevector <4 x i32> %v0, <4 x i32> %v1, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+; COST-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %tmp0
+;
   %tmp0 = shufflevector <4 x i32> %v0, <4 x i32> %v1, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
   ret <4 x i32> %tmp0
 }
 
-; COST-LABEL: sel.v2i64
-; COST:       Found an estimated cost of 1 for instruction: %tmp0 = shufflevector <2 x i64> %v0, <2 x i64> %v1, <2 x i32> <i32 0, i32 3>
-; CODE-LABEL: sel.v2i64
-; CODE:       mov v0.d[1], v1.d[1]
-define <2 x i64> @sel.v2i64(<2 x i64> %v0, <2 x i64> %v1) {
+define <2 x i64> @sel_v2i64(<2 x i64> %v0, <2 x i64> %v1) {
+; COST-LABEL: 'sel_v2i64'
+; COST-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %tmp0 = shufflevector <2 x i64> %v0, <2 x i64> %v1, <2 x i32> <i32 0, i32 3>
+; COST-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i64> %tmp0
+;
   %tmp0 = shufflevector <2 x i64> %v0, <2 x i64> %v1, <2 x i32> <i32 0, i32 3>
   ret <2 x i64> %tmp0
 }
 
-; COST-LABEL: sel.v2f32
-; COST:       Found an estimated cost of 1 for instruction: %tmp0 = shufflevector <2 x float> %v0, <2 x float> %v1, <2 x i32> <i32 0, i32 3>
-; CODE-LABEL: sel.v2f32
-; CODE:       mov v0.s[1], v1.s[1]
-define <2 x float> @sel.v2f32(<2 x float> %v0, <2 x float> %v1) {
+define <4 x half> @sel_v4f16(<4 x half> %v0, <4 x half> %v1) {
+; COST-LABEL: 'sel_v4f16'
+; COST-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %tmp0 = shufflevector <4 x half> %v0, <4 x half> %v1, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+; COST-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x half> %tmp0
+;
+  %tmp0 = shufflevector <4 x half> %v0, <4 x half> %v1, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+  ret <4 x half> %tmp0
+}
+
+define <8 x half> @sel_v8f16(<8 x half> %v0, <8 x half> %v1) {
+; COST-LABEL: 'sel_v8f16'
+; COST-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %tmp0 = shufflevector <8 x half> %v0, <8 x half> %v1, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
+; COST-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x half> %tmp0
+;
+  %tmp0 = shufflevector <8 x half> %v0, <8 x half> %v1, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
+  ret <8 x half> %tmp0
+}
+
+define <2 x float> @sel_v2f32(<2 x float> %v0, <2 x float> %v1) {
+; COST-LABEL: 'sel_v2f32'
+; COST-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %tmp0 = shufflevector <2 x float> %v0, <2 x float> %v1, <2 x i32> <i32 0, i32 3>
+; COST-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <2 x float> %tmp0
+;
   %tmp0 = shufflevector <2 x float> %v0, <2 x float> %v1, <2 x i32> <i32 0, i32 3>
   ret <2 x float> %tmp0
 }
 
-; COST-LABEL: sel.v4f32
-; COST:       Found an estimated cost of 2 for instruction: %tmp0 = shufflevector <4 x float> %v0, <4 x float> %v1, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
-; CODE-LABEL: sel.v4f32
-; CODE:       rev64 v0.4s, v0.4s
-; CODE:       trn2 v0.4s, v0.4s, v1.4s
-define <4 x float> @sel.v4f32(<4 x float> %v0, <4 x float> %v1) {
+define <4 x float> @sel_v4f32(<4 x float> %v0, <4 x float> %v1) {
+; COST-LABEL: 'sel_v4f32'
+; COST-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %tmp0 = shufflevector <4 x float> %v0, <4 x float> %v1, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+; COST-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x float> %tmp0
+;
   %tmp0 = shufflevector <4 x float> %v0, <4 x float> %v1, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
   ret <4 x float> %tmp0
 }
 
-; COST-LABEL: sel.v2f64
-; COST:       Found an estimated cost of 1 for instruction: %tmp0 = shufflevector <2 x double> %v0, <2 x double> %v1, <2 x i32> <i32 0, i32 3>
-; CODE-LABEL: sel.v2f64
-; CODE:       mov v0.d[1], v1.d[1]
-define <2 x double> @sel.v2f64(<2 x double> %v0, <2 x double> %v1) {
+define <2 x double> @sel_v2f64(<2 x double> %v0, <2 x double> %v1) {
+; COST-LABEL: 'sel_v2f64'
+; COST-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %tmp0 = shufflevector <2 x double> %v0, <2 x double> %v1, <2 x i32> <i32 0, i32 3>
+; COST-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <2 x double> %tmp0
+;
   %tmp0 = shufflevector <2 x double> %v0, <2 x double> %v1, <2 x i32> <i32 0, i32 3>
   ret <2 x double> %tmp0
 }
diff --git a/llvm/test/Analysis/CostModel/AMDGPU/shufflevector.ll b/llvm/test/Analysis/CostModel/AMDGPU/shufflevector.ll
index a181567..7107d2b 100644
--- a/llvm/test/Analysis/CostModel/AMDGPU/shufflevector.ll
+++ b/llvm/test/Analysis/CostModel/AMDGPU/shufflevector.ll
@@ -399,13 +399,13 @@ define amdgpu_kernel void @shufflevector_i8(<2 x i8> %vec1, <2 x i8> %vec2) {
 ; ALL-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %shuf00 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> zeroinitializer
 ; ALL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %shuf01 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 0, i32 1>
 ; ALL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %shuf10 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 1, i32 0>
-; ALL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %shuf11 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 1, i32 1>
+; ALL-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %shuf11 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 1, i32 1>
 ; ALL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %shuf02 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 0, i32 2>
 ; ALL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf20 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 2, i32 0>
 ; ALL-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %shuf22 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 2, i32 2>
 ; ALL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %shuf03 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 0, i32 3>
 ; ALL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %shuf30 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 3, i32 0>
-; ALL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %shuf33 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 3, i32 3>
+; ALL-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %shuf33 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 3, i32 3>
 ; ALL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %shuf12 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 1, i32 2>
 ; ALL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %shuf21 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 2, i32 1>
 ; ALL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %shuf13 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 1, i32 3>
@@ -436,13 +436,13 @@ define amdgpu_kernel void @shufflevector_i8(<2 x i8> %vec1, <2 x i8> %vec2) {
 ; ALL-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %shuf00_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> zeroinitializer
 ; ALL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %shuf01_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 0, i32 1>
 ; ALL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %shuf10_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 1, i32 0>
-; ALL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %shuf11_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 1, i32 1>
+; ALL-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %shuf11_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 1, i32 1>
 ; ALL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %shuf02_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 0, i32 2>
 ; ALL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf20_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 2, i32 0>
 ; ALL-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %shuf22_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 2, i32 2>
 ; ALL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %shuf03_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 0, i32 3>
 ; ALL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %shuf30_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 3, i32 0>
-; ALL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %shuf33_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 3, i32 3>
+; ALL-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %shuf33_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 3, i32 3>
 ; ALL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %shuf12_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 1, i32 2>
 ; ALL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %shuf21_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 2, i32 1>
 ; ALL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %shuf13_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 1, i32 3>
@@ -476,13 +476,13 @@ define amdgpu_kernel void @shufflevector_i8(<2 x i8> %vec1, <2 x i8> %vec2) {
 ; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %shuf00 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> zeroinitializer
 ; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %shuf01 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 0, i32 1>
 ; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %shuf10 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 1, i32 0>
-; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %shuf11 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 1, i32 1>
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %shuf11 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 1, i32 1>
 ; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %shuf02 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 0, i32 2>
 ; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf20 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 2, i32 0>
 ; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %shuf22 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 2, i32 2>
 ; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %shuf03 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 0, i32 3>
 ; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %shuf30 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 3, i32 0>
-; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %shuf33 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 3, i32 3>
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %shuf33 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 3, i32 3>
 ; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %shuf12 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 1, i32 2>
 ; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %shuf21 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 2, i32 1>
 ; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %shuf13 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 1, i32 3>
@@ -513,13 +513,13 @@ define amdgpu_kernel void @shufflevector_i8(<2 x i8> %vec1, <2 x i8> %vec2) {
 ; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %shuf00_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> zeroinitializer
 ; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %shuf01_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 0, i32 1>
 ; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %shuf10_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 1, i32 0>
-; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %shuf11_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 1, i32 1>
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %shuf11_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 1, i32 1>
 ; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %shuf02_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 0, i32 2>
 ; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf20_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 2, i32 0>
 ; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %shuf22_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 2, i32 2>
 ; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %shuf03_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 0, i32 3>
 ; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %shuf30_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 3, i32 0>
-; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %shuf33_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 3, i32 3>
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %shuf33_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 3, i32 3>
 ; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %shuf12_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 1, i32 2>
 ; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %shuf21_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 2, i32 1>
 ; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %shuf13_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 1, i32 3>
diff --git a/llvm/test/Analysis/CostModel/RISCV/rvv-cmp.ll b/llvm/test/Analysis/CostModel/RISCV/rvv-cmp.ll
index 56f9e18..d1b230c 100644
--- a/llvm/test/Analysis/CostModel/RISCV/rvv-cmp.ll
+++ b/llvm/test/Analysis/CostModel/RISCV/rvv-cmp.ll
@@ -875,15 +875,6 @@ define void @icmp_sle() {
 
 define void @fcmp_oeq() {
 ; CHECK-LABEL: 'fcmp_oeq'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = fcmp oeq <2 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4f16 = fcmp oeq <4 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8f16 = fcmp oeq <8 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16f16 = fcmp oeq <16 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1f16 = fcmp oeq <vscale x 1 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2f16 = fcmp oeq <vscale x 2 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv4f16 = fcmp oeq <vscale x 4 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv8f16 = fcmp oeq <vscale x 8 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv16f16 = fcmp oeq <vscale x 16 x half> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f32 = fcmp oeq <2 x float> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4f32 = fcmp oeq <4 x float> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f32 = fcmp oeq <8 x float> undef, undef
@@ -902,16 +893,7 @@ define void @fcmp_oeq() {
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv8f64 = fcmp oeq <vscale x 8 x double> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
-  %v2f16 = fcmp oeq <2 x half> undef, undef
-  %v4f16 = fcmp oeq <4 x half> undef, undef
-  %v8f16 = fcmp oeq <8 x half> undef, undef
-  %v16f16 = fcmp oeq <16 x half> undef, undef
 
-  %nxv1f16 = fcmp oeq <vscale x 1 x half> undef, undef
-  %nxv2f16 = fcmp oeq <vscale x 2 x half> undef, undef
-  %nxv4f16 = fcmp oeq <vscale x 4 x half> undef, undef
-  %nxv8f16 = fcmp oeq <vscale x 8 x half> undef, undef
-  %nxv16f16 = fcmp oeq <vscale x 16 x half> undef, undef
 
   %v2f32 = fcmp oeq <2 x float> undef, undef
   %v4f32 = fcmp oeq <4 x float> undef, undef
@@ -938,15 +920,6 @@ define void @fcmp_oeq() {
 
 define void @fcmp_one() {
 ; CHECK-LABEL: 'fcmp_one'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2f16 = fcmp one <2 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4f16 = fcmp one <4 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v8f16 = fcmp one <8 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v16f16 = fcmp one <16 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv1f16 = fcmp one <vscale x 1 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv2f16 = fcmp one <vscale x 2 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv4f16 = fcmp one <vscale x 4 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %nxv8f16 = fcmp one <vscale x 8 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %nxv16f16 = fcmp one <vscale x 16 x half> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2f32 = fcmp one <2 x float> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4f32 = fcmp one <4 x float> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v8f32 = fcmp one <8 x float> undef, undef
@@ -965,16 +938,7 @@ define void @fcmp_one() {
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %nxv8f64 = fcmp one <vscale x 8 x double> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
-  %v2f16 = fcmp one <2 x half> undef, undef
-  %v4f16 = fcmp one <4 x half> undef, undef
-  %v8f16 = fcmp one <8 x half> undef, undef
-  %v16f16 = fcmp one <16 x half> undef, undef
 
-  %nxv1f16 = fcmp one <vscale x 1 x half> undef, undef
-  %nxv2f16 = fcmp one <vscale x 2 x half> undef, undef
-  %nxv4f16 = fcmp one <vscale x 4 x half> undef, undef
-  %nxv8f16 = fcmp one <vscale x 8 x half> undef, undef
-  %nxv16f16 = fcmp one <vscale x 16 x half> undef, undef
 
   %v2f32 = fcmp one <2 x float> undef, undef
   %v4f32 = fcmp one <4 x float> undef, undef
@@ -1001,15 +965,6 @@ define void @fcmp_one() {
 
 define void @fcmp_olt() {
 ; CHECK-LABEL: 'fcmp_olt'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = fcmp olt <2 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4f16 = fcmp olt <4 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8f16 = fcmp olt <8 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16f16 = fcmp olt <16 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1f16 = fcmp olt <vscale x 1 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2f16 = fcmp olt <vscale x 2 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv4f16 = fcmp olt <vscale x 4 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv8f16 = fcmp olt <vscale x 8 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv16f16 = fcmp olt <vscale x 16 x half> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f32 = fcmp olt <2 x float> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4f32 = fcmp olt <4 x float> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f32 = fcmp olt <8 x float> undef, undef
@@ -1028,16 +983,7 @@ define void @fcmp_olt() {
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv8f64 = fcmp olt <vscale x 8 x double> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
-  %v2f16 = fcmp olt <2 x half> undef, undef
-  %v4f16 = fcmp olt <4 x half> undef, undef
-  %v8f16 = fcmp olt <8 x half> undef, undef
-  %v16f16 = fcmp olt <16 x half> undef, undef
 
-  %nxv1f16 = fcmp olt <vscale x 1 x half> undef, undef
-  %nxv2f16 = fcmp olt <vscale x 2 x half> undef, undef
-  %nxv4f16 = fcmp olt <vscale x 4 x half> undef, undef
-  %nxv8f16 = fcmp olt <vscale x 8 x half> undef, undef
-  %nxv16f16 = fcmp olt <vscale x 16 x half> undef, undef
 
   %v2f32 = fcmp olt <2 x float> undef, undef
   %v4f32 = fcmp olt <4 x float> undef, undef
@@ -1064,15 +1010,6 @@ define void @fcmp_olt() {
 
 define void @fcmp_ole() {
 ; CHECK-LABEL: 'fcmp_ole'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = fcmp ole <2 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4f16 = fcmp ole <4 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8f16 = fcmp ole <8 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16f16 = fcmp ole <16 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1f16 = fcmp ole <vscale x 1 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2f16 = fcmp ole <vscale x 2 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv4f16 = fcmp ole <vscale x 4 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv8f16 = fcmp ole <vscale x 8 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv16f16 = fcmp ole <vscale x 16 x half> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f32 = fcmp ole <2 x float> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4f32 = fcmp ole <4 x float> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f32 = fcmp ole <8 x float> undef, undef
@@ -1091,16 +1028,7 @@ define void @fcmp_ole() {
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv8f64 = fcmp ole <vscale x 8 x double> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
-  %v2f16 = fcmp ole <2 x half> undef, undef
-  %v4f16 = fcmp ole <4 x half> undef, undef
-  %v8f16 = fcmp ole <8 x half> undef, undef
-  %v16f16 = fcmp ole <16 x half> undef, undef
 
-  %nxv1f16 = fcmp ole <vscale x 1 x half> undef, undef
-  %nxv2f16 = fcmp ole <vscale x 2 x half> undef, undef
-  %nxv4f16 = fcmp ole <vscale x 4 x half> undef, undef
-  %nxv8f16 = fcmp ole <vscale x 8 x half> undef, undef
-  %nxv16f16 = fcmp ole <vscale x 16 x half> undef, undef
 
   %v2f32 = fcmp ole <2 x float> undef, undef
   %v4f32 = fcmp ole <4 x float> undef, undef
@@ -1127,15 +1055,6 @@ define void @fcmp_ole() {
 
 define void @fcmp_ogt() {
 ; CHECK-LABEL: 'fcmp_ogt'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = fcmp ogt <2 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4f16 = fcmp ogt <4 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8f16 = fcmp ogt <8 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16f16 = fcmp ogt <16 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1f16 = fcmp ogt <vscale x 1 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2f16 = fcmp ogt <vscale x 2 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv4f16 = fcmp ogt <vscale x 4 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv8f16 = fcmp ogt <vscale x 8 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv16f16 = fcmp ogt <vscale x 16 x half> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f32 = fcmp ogt <2 x float> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4f32 = fcmp ogt <4 x float> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f32 = fcmp ogt <8 x float> undef, undef
@@ -1154,16 +1073,7 @@ define void @fcmp_ogt() {
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv8f64 = fcmp ogt <vscale x 8 x double> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
-  %v2f16 = fcmp ogt <2 x half> undef, undef
-  %v4f16 = fcmp ogt <4 x half> undef, undef
-  %v8f16 = fcmp ogt <8 x half> undef, undef
-  %v16f16 = fcmp ogt <16 x half> undef, undef
 
-  %nxv1f16 = fcmp ogt <vscale x 1 x half> undef, undef
-  %nxv2f16 = fcmp ogt <vscale x 2 x half> undef, undef
-  %nxv4f16 = fcmp ogt <vscale x 4 x half> undef, undef
-  %nxv8f16 = fcmp ogt <vscale x 8 x half> undef, undef
-  %nxv16f16 = fcmp ogt <vscale x 16 x half> undef, undef
 
   %v2f32 = fcmp ogt <2 x float> undef, undef
   %v4f32 = fcmp ogt <4 x float> undef, undef
@@ -1190,15 +1100,6 @@ define void @fcmp_ogt() {
 
 define void @fcmp_oge() {
 ; CHECK-LABEL: 'fcmp_oge'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = fcmp oge <2 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4f16 = fcmp oge <4 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8f16 = fcmp oge <8 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16f16 = fcmp oge <16 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1f16 = fcmp oge <vscale x 1 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2f16 = fcmp oge <vscale x 2 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv4f16 = fcmp oge <vscale x 4 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv8f16 = fcmp oge <vscale x 8 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv16f16 = fcmp oge <vscale x 16 x half> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f32 = fcmp oge <2 x float> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4f32 = fcmp oge <4 x float> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f32 = fcmp oge <8 x float> undef, undef
@@ -1217,16 +1118,7 @@ define void @fcmp_oge() {
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv8f64 = fcmp oge <vscale x 8 x double> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
-  %v2f16 = fcmp oge <2 x half> undef, undef
-  %v4f16 = fcmp oge <4 x half> undef, undef
-  %v8f16 = fcmp oge <8 x half> undef, undef
-  %v16f16 = fcmp oge <16 x half> undef, undef
 
-  %nxv1f16 = fcmp oge <vscale x 1 x half> undef, undef
-  %nxv2f16 = fcmp oge <vscale x 2 x half> undef, undef
-  %nxv4f16 = fcmp oge <vscale x 4 x half> undef, undef
-  %nxv8f16 = fcmp oge <vscale x 8 x half> undef, undef
-  %nxv16f16 = fcmp oge <vscale x 16 x half> undef, undef
 
   %v2f32 = fcmp oge <2 x float> undef, undef
   %v4f32 = fcmp oge <4 x float> undef, undef
@@ -1253,15 +1145,6 @@ define void @fcmp_oge() {
 
 define void @fcmp_ueq() {
 ; CHECK-LABEL: 'fcmp_ueq'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2f16 = fcmp ueq <2 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4f16 = fcmp ueq <4 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v8f16 = fcmp ueq <8 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v16f16 = fcmp ueq <16 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv1f16 = fcmp ueq <vscale x 1 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv2f16 = fcmp ueq <vscale x 2 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv4f16 = fcmp ueq <vscale x 4 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %nxv8f16 = fcmp ueq <vscale x 8 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %nxv16f16 = fcmp ueq <vscale x 16 x half> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2f32 = fcmp ueq <2 x float> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4f32 = fcmp ueq <4 x float> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v8f32 = fcmp ueq <8 x float> undef, undef
@@ -1280,16 +1163,7 @@ define void @fcmp_ueq() {
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %nxv8f64 = fcmp ueq <vscale x 8 x double> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
-  %v2f16 = fcmp ueq <2 x half> undef, undef
-  %v4f16 = fcmp ueq <4 x half> undef, undef
-  %v8f16 = fcmp ueq <8 x half> undef, undef
-  %v16f16 = fcmp ueq <16 x half> undef, undef
 
-  %nxv1f16 = fcmp ueq <vscale x 1 x half> undef, undef
-  %nxv2f16 = fcmp ueq <vscale x 2 x half> undef, undef
-  %nxv4f16 = fcmp ueq <vscale x 4 x half> undef, undef
-  %nxv8f16 = fcmp ueq <vscale x 8 x half> undef, undef
-  %nxv16f16 = fcmp ueq <vscale x 16 x half> undef, undef
 
   %v2f32 = fcmp ueq <2 x float> undef, undef
   %v4f32 = fcmp ueq <4 x float> undef, undef
@@ -1316,15 +1190,6 @@ define void @fcmp_ueq() {
 
 define void @fcmp_une() {
 ; CHECK-LABEL: 'fcmp_une'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = fcmp une <2 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4f16 = fcmp une <4 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8f16 = fcmp une <8 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16f16 = fcmp une <16 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1f16 = fcmp une <vscale x 1 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2f16 = fcmp une <vscale x 2 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv4f16 = fcmp une <vscale x 4 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv8f16 = fcmp une <vscale x 8 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv16f16 = fcmp une <vscale x 16 x half> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f32 = fcmp une <2 x float> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4f32 = fcmp une <4 x float> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f32 = fcmp une <8 x float> undef, undef
@@ -1343,16 +1208,7 @@ define void @fcmp_une() {
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv8f64 = fcmp une <vscale x 8 x double> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
-  %v2f16 = fcmp une <2 x half> undef, undef
-  %v4f16 = fcmp une <4 x half> undef, undef
-  %v8f16 = fcmp une <8 x half> undef, undef
-  %v16f16 = fcmp une <16 x half> undef, undef
 
-  %nxv1f16 = fcmp une <vscale x 1 x half> undef, undef
-  %nxv2f16 = fcmp une <vscale x 2 x half> undef, undef
-  %nxv4f16 = fcmp une <vscale x 4 x half> undef, undef
-  %nxv8f16 = fcmp une <vscale x 8 x half> undef, undef
-  %nxv16f16 = fcmp une <vscale x 16 x half> undef, undef
 
   %v2f32 = fcmp une <2 x float> undef, undef
   %v4f32 = fcmp une <4 x float> undef, undef
@@ -1379,15 +1235,6 @@ define void @fcmp_une() {
 
 define void @fcmp_ult() {
 ; CHECK-LABEL: 'fcmp_ult'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = fcmp ult <2 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = fcmp ult <4 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f16 = fcmp ult <8 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v16f16 = fcmp ult <16 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv1f16 = fcmp ult <vscale x 1 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2f16 = fcmp ult <vscale x 2 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv4f16 = fcmp ult <vscale x 4 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv8f16 = fcmp ult <vscale x 8 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %nxv16f16 = fcmp ult <vscale x 16 x half> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f32 = fcmp ult <2 x float> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f32 = fcmp ult <4 x float> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v8f32 = fcmp ult <8 x float> undef, undef
@@ -1406,16 +1253,7 @@ define void @fcmp_ult() {
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %nxv8f64 = fcmp ult <vscale x 8 x double> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
-  %v2f16 = fcmp ult <2 x half> undef, undef
-  %v4f16 = fcmp ult <4 x half> undef, undef
-  %v8f16 = fcmp ult <8 x half> undef, undef
-  %v16f16 = fcmp ult <16 x half> undef, undef
 
-  %nxv1f16 = fcmp ult <vscale x 1 x half> undef, undef
-  %nxv2f16 = fcmp ult <vscale x 2 x half> undef, undef
-  %nxv4f16 = fcmp ult <vscale x 4 x half> undef, undef
-  %nxv8f16 = fcmp ult <vscale x 8 x half> undef, undef
-  %nxv16f16 = fcmp ult <vscale x 16 x half> undef, undef
 
   %v2f32 = fcmp ult <2 x float> undef, undef
   %v4f32 = fcmp ult <4 x float> undef, undef
@@ -1442,15 +1280,6 @@ define void @fcmp_ult() {
 
 define void @fcmp_ule() {
 ; CHECK-LABEL: 'fcmp_ule'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = fcmp ule <2 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = fcmp ule <4 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f16 = fcmp ule <8 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v16f16 = fcmp ule <16 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv1f16 = fcmp ule <vscale x 1 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2f16 = fcmp ule <vscale x 2 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv4f16 = fcmp ule <vscale x 4 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv8f16 = fcmp ule <vscale x 8 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %nxv16f16 = fcmp ule <vscale x 16 x half> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f32 = fcmp ule <2 x float> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f32 = fcmp ule <4 x float> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v8f32 = fcmp ule <8 x float> undef, undef
@@ -1469,16 +1298,7 @@ define void @fcmp_ule() {
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %nxv8f64 = fcmp ule <vscale x 8 x double> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
-  %v2f16 = fcmp ule <2 x half> undef, undef
-  %v4f16 = fcmp ule <4 x half> undef, undef
-  %v8f16 = fcmp ule <8 x half> undef, undef
-  %v16f16 = fcmp ule <16 x half> undef, undef
 
-  %nxv1f16 = fcmp ule <vscale x 1 x half> undef, undef
-  %nxv2f16 = fcmp ule <vscale x 2 x half> undef, undef
-  %nxv4f16 = fcmp ule <vscale x 4 x half> undef, undef
-  %nxv8f16 = fcmp ule <vscale x 8 x half> undef, undef
-  %nxv16f16 = fcmp ule <vscale x 16 x half> undef, undef
 
   %v2f32 = fcmp ule <2 x float> undef, undef
   %v4f32 = fcmp ule <4 x float> undef, undef
@@ -1505,15 +1325,6 @@ define void @fcmp_ule() {
 
 define void @fcmp_ugt() {
 ; CHECK-LABEL: 'fcmp_ugt'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = fcmp ugt <2 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = fcmp ugt <4 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f16 = fcmp ugt <8 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v16f16 = fcmp ugt <16 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv1f16 = fcmp ugt <vscale x 1 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2f16 = fcmp ugt <vscale x 2 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv4f16 = fcmp ugt <vscale x 4 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv8f16 = fcmp ugt <vscale x 8 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %nxv16f16 = fcmp ugt <vscale x 16 x half> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f32 = fcmp ugt <2 x float> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f32 = fcmp ugt <4 x float> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v8f32 = fcmp ugt <8 x float> undef, undef
@@ -1532,16 +1343,7 @@ define void @fcmp_ugt() {
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %nxv8f64 = fcmp ugt <vscale x 8 x double> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
-  %v2f16 = fcmp ugt <2 x half> undef, undef
-  %v4f16 = fcmp ugt <4 x half> undef, undef
-  %v8f16 = fcmp ugt <8 x half> undef, undef
-  %v16f16 = fcmp ugt <16 x half> undef, undef
 
-  %nxv1f16 = fcmp ugt <vscale x 1 x half> undef, undef
-  %nxv2f16 = fcmp ugt <vscale x 2 x half> undef, undef
-  %nxv4f16 = fcmp ugt <vscale x 4 x half> undef, undef
-  %nxv8f16 = fcmp ugt <vscale x 8 x half> undef, undef
-  %nxv16f16 = fcmp ugt <vscale x 16 x half> undef, undef
 
   %v2f32 = fcmp ugt <2 x float> undef, undef
   %v4f32 = fcmp ugt <4 x float> undef, undef
@@ -1568,15 +1370,6 @@ define void @fcmp_ugt() {
 
 define void @fcmp_uge() {
 ; CHECK-LABEL: 'fcmp_uge'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = fcmp uge <2 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = fcmp uge <4 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f16 = fcmp uge <8 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v16f16 = fcmp uge <16 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv1f16 = fcmp uge <vscale x 1 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2f16 = fcmp uge <vscale x 2 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv4f16 = fcmp uge <vscale x 4 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv8f16 = fcmp uge <vscale x 8 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %nxv16f16 = fcmp uge <vscale x 16 x half> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f32 = fcmp uge <2 x float> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f32 = fcmp uge <4 x float> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v8f32 = fcmp uge <8 x float> undef, undef
@@ -1595,16 +1388,7 @@ define void @fcmp_uge() {
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %nxv8f64 = fcmp uge <vscale x 8 x double> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
-  %v2f16 = fcmp uge <2 x half> undef, undef
-  %v4f16 = fcmp uge <4 x half> undef, undef
-  %v8f16 = fcmp uge <8 x half> undef, undef
-  %v16f16 = fcmp uge <16 x half> undef, undef
 
-  %nxv1f16 = fcmp uge <vscale x 1 x half> undef, undef
-  %nxv2f16 = fcmp uge <vscale x 2 x half> undef, undef
-  %nxv4f16 = fcmp uge <vscale x 4 x half> undef, undef
-  %nxv8f16 = fcmp uge <vscale x 8 x half> undef, undef
-  %nxv16f16 = fcmp uge <vscale x 16 x half> undef, undef
 
   %v2f32 = fcmp uge <2 x float> undef, undef
   %v4f32 = fcmp uge <4 x float> undef, undef
@@ -1631,15 +1415,6 @@ define void @fcmp_uge() {
 
 define void @fcmp_true() {
 ; CHECK-LABEL: 'fcmp_true'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = fcmp true <2 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4f16 = fcmp true <4 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8f16 = fcmp true <8 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16f16 = fcmp true <16 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1f16 = fcmp true <vscale x 1 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2f16 = fcmp true <vscale x 2 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv4f16 = fcmp true <vscale x 4 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv8f16 = fcmp true <vscale x 8 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv16f16 = fcmp true <vscale x 16 x half> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f32 = fcmp true <2 x float> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4f32 = fcmp true <4 x float> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8f32 = fcmp true <8 x float> undef, undef
@@ -1658,16 +1433,7 @@ define void @fcmp_true() {
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv8f64 = fcmp true <vscale x 8 x double> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
-  %v2f16 = fcmp true <2 x half> undef, undef
-  %v4f16 = fcmp true <4 x half> undef, undef
-  %v8f16 = fcmp true <8 x half> undef, undef
-  %v16f16 = fcmp true <16 x half> undef, undef
 
-  %nxv1f16 = fcmp true <vscale x 1 x half> undef, undef
-  %nxv2f16 = fcmp true <vscale x 2 x half> undef, undef
-  %nxv4f16 = fcmp true <vscale x 4 x half> undef, undef
-  %nxv8f16 = fcmp true <vscale x 8 x half> undef, undef
-  %nxv16f16 = fcmp true <vscale x 16 x half> undef, undef
 
   %v2f32 = fcmp true <2 x float> undef, undef
   %v4f32 = fcmp true <4 x float> undef, undef
@@ -1694,15 +1460,6 @@ define void @fcmp_true() {
 
 define void @fcmp_false() {
 ; CHECK-LABEL: 'fcmp_false'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = fcmp false <2 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4f16 = fcmp false <4 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8f16 = fcmp false <8 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16f16 = fcmp false <16 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1f16 = fcmp false <vscale x 1 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2f16 = fcmp false <vscale x 2 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv4f16 = fcmp false <vscale x 4 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv8f16 = fcmp false <vscale x 8 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv16f16 = fcmp false <vscale x 16 x half> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f32 = fcmp false <2 x float> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4f32 = fcmp false <4 x float> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8f32 = fcmp false <8 x float> undef, undef
@@ -1721,16 +1478,7 @@ define void @fcmp_false() {
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv8f64 = fcmp false <vscale x 8 x double> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
-  %v2f16 = fcmp false <2 x half> undef, undef
-  %v4f16 = fcmp false <4 x half> undef, undef
-  %v8f16 = fcmp false <8 x half> undef, undef
-  %v16f16 = fcmp false <16 x half> undef, undef
-
-  %nxv1f16 = fcmp false <vscale x 1 x half> undef, undef
-  %nxv2f16 = fcmp false <vscale x 2 x half> undef, undef
-  %nxv4f16 = fcmp false <vscale x 4 x half> undef, undef
-  %nxv8f16 = fcmp false <vscale x 8 x half> undef, undef
-  %nxv16f16 = fcmp false <vscale x 16 x half> undef, undef
+
 
   %v2f32 = fcmp false <2 x float> undef, undef
   %v4f32 = fcmp false <4 x float> undef, undef
diff --git a/llvm/test/Analysis/CostModel/RISCV/rvv-fcmp-f16.ll b/llvm/test/Analysis/CostModel/RISCV/rvv-fcmp-f16.ll
new file mode 100644
index 0000000..8396e80
--- /dev/null
+++ b/llvm/test/Analysis/CostModel/RISCV/rvv-fcmp-f16.ll
@@ -0,0 +1,677 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 4
+; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=riscv64 -mattr=+v,+f,+d,+zfh -riscv-v-vector-bits-min=-1 < %s | FileCheck %s --check-prefix=NOF16
+; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=riscv64 -mattr=+v,+f,+d,+zfh,+zvfh -riscv-v-vector-bits-min=-1 < %s | FileCheck %s --check-prefix=VFH
+; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=riscv64 -mattr=+v,+f,+d,+zfh,+zvfhmin -riscv-v-vector-bits-min=-1 < %s | FileCheck %s --check-prefix=VFHMIN
+
+define void @fcmp_oeq() {
+; NOF16-LABEL: 'fcmp_oeq'
+; NOF16-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = fcmp oeq <2 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4f16 = fcmp oeq <4 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8f16 = fcmp oeq <8 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16f16 = fcmp oeq <16 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Invalid cost for instruction: %nxv1f16 = fcmp oeq <vscale x 1 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Invalid cost for instruction: %nxv2f16 = fcmp oeq <vscale x 2 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Invalid cost for instruction: %nxv4f16 = fcmp oeq <vscale x 4 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Invalid cost for instruction: %nxv8f16 = fcmp oeq <vscale x 8 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Invalid cost for instruction: %nxv16f16 = fcmp oeq <vscale x 16 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; VFH-LABEL: 'fcmp_oeq'
+; VFH-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = fcmp oeq <2 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4f16 = fcmp oeq <4 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8f16 = fcmp oeq <8 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16f16 = fcmp oeq <16 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1f16 = fcmp oeq <vscale x 1 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2f16 = fcmp oeq <vscale x 2 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv4f16 = fcmp oeq <vscale x 4 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv8f16 = fcmp oeq <vscale x 8 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv16f16 = fcmp oeq <vscale x 16 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; VFHMIN-LABEL: 'fcmp_oeq'
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = fcmp oeq <2 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4f16 = fcmp oeq <4 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8f16 = fcmp oeq <8 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16f16 = fcmp oeq <16 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1f16 = fcmp oeq <vscale x 1 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2f16 = fcmp oeq <vscale x 2 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv4f16 = fcmp oeq <vscale x 4 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv8f16 = fcmp oeq <vscale x 8 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv16f16 = fcmp oeq <vscale x 16 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+  %v2f16 = fcmp oeq <2 x half> undef, undef
+  %v4f16 = fcmp oeq <4 x half> undef, undef
+  %v8f16 = fcmp oeq <8 x half> undef, undef
+  %v16f16 = fcmp oeq <16 x half> undef, undef
+  %nxv1f16 = fcmp oeq <vscale x 1 x half> undef, undef
+  %nxv2f16 = fcmp oeq <vscale x 2 x half> undef, undef
+  %nxv4f16 = fcmp oeq <vscale x 4 x half> undef, undef
+  %nxv8f16 = fcmp oeq <vscale x 8 x half> undef, undef
+  %nxv16f16 = fcmp oeq <vscale x 16 x half> undef, undef
+  ret void
+}
+define void @fcmp_one() {
+; NOF16-LABEL: 'fcmp_one'
+; NOF16-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = fcmp one <2 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4f16 = fcmp one <4 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8f16 = fcmp one <8 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16f16 = fcmp one <16 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Invalid cost for instruction: %nxv1f16 = fcmp one <vscale x 1 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Invalid cost for instruction: %nxv2f16 = fcmp one <vscale x 2 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Invalid cost for instruction: %nxv4f16 = fcmp one <vscale x 4 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Invalid cost for instruction: %nxv8f16 = fcmp one <vscale x 8 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Invalid cost for instruction: %nxv16f16 = fcmp one <vscale x 16 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; VFH-LABEL: 'fcmp_one'
+; VFH-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2f16 = fcmp one <2 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4f16 = fcmp one <4 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v8f16 = fcmp one <8 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v16f16 = fcmp one <16 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv1f16 = fcmp one <vscale x 1 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv2f16 = fcmp one <vscale x 2 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv4f16 = fcmp one <vscale x 4 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %nxv8f16 = fcmp one <vscale x 8 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %nxv16f16 = fcmp one <vscale x 16 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; VFHMIN-LABEL: 'fcmp_one'
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = fcmp one <2 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4f16 = fcmp one <4 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8f16 = fcmp one <8 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16f16 = fcmp one <16 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1f16 = fcmp one <vscale x 1 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2f16 = fcmp one <vscale x 2 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv4f16 = fcmp one <vscale x 4 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv8f16 = fcmp one <vscale x 8 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv16f16 = fcmp one <vscale x 16 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+  %v2f16 = fcmp one <2 x half> undef, undef
+  %v4f16 = fcmp one <4 x half> undef, undef
+  %v8f16 = fcmp one <8 x half> undef, undef
+  %v16f16 = fcmp one <16 x half> undef, undef
+  %nxv1f16 = fcmp one <vscale x 1 x half> undef, undef
+  %nxv2f16 = fcmp one <vscale x 2 x half> undef, undef
+  %nxv4f16 = fcmp one <vscale x 4 x half> undef, undef
+  %nxv8f16 = fcmp one <vscale x 8 x half> undef, undef
+  %nxv16f16 = fcmp one <vscale x 16 x half> undef, undef
+  ret void
+}
+define void @fcmp_olt() {
+; NOF16-LABEL: 'fcmp_olt'
+; NOF16-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = fcmp olt <2 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4f16 = fcmp olt <4 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8f16 = fcmp olt <8 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16f16 = fcmp olt <16 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Invalid cost for instruction: %nxv1f16 = fcmp olt <vscale x 1 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Invalid cost for instruction: %nxv2f16 = fcmp olt <vscale x 2 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Invalid cost for instruction: %nxv4f16 = fcmp olt <vscale x 4 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Invalid cost for instruction: %nxv8f16 = fcmp olt <vscale x 8 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Invalid cost for instruction: %nxv16f16 = fcmp olt <vscale x 16 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; VFH-LABEL: 'fcmp_olt'
+; VFH-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = fcmp olt <2 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4f16 = fcmp olt <4 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8f16 = fcmp olt <8 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16f16 = fcmp olt <16 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1f16 = fcmp olt <vscale x 1 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2f16 = fcmp olt <vscale x 2 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv4f16 = fcmp olt <vscale x 4 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv8f16 = fcmp olt <vscale x 8 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv16f16 = fcmp olt <vscale x 16 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; VFHMIN-LABEL: 'fcmp_olt'
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = fcmp olt <2 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4f16 = fcmp olt <4 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8f16 = fcmp olt <8 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16f16 = fcmp olt <16 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1f16 = fcmp olt <vscale x 1 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2f16 = fcmp olt <vscale x 2 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv4f16 = fcmp olt <vscale x 4 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv8f16 = fcmp olt <vscale x 8 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv16f16 = fcmp olt <vscale x 16 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+  %v2f16 = fcmp olt <2 x half> undef, undef
+  %v4f16 = fcmp olt <4 x half> undef, undef
+  %v8f16 = fcmp olt <8 x half> undef, undef
+  %v16f16 = fcmp olt <16 x half> undef, undef
+  %nxv1f16 = fcmp olt <vscale x 1 x half> undef, undef
+  %nxv2f16 = fcmp olt <vscale x 2 x half> undef, undef
+  %nxv4f16 = fcmp olt <vscale x 4 x half> undef, undef
+  %nxv8f16 = fcmp olt <vscale x 8 x half> undef, undef
+  %nxv16f16 = fcmp olt <vscale x 16 x half> undef, undef
+  ret void
+}
+define void @fcmp_ole() {
+; NOF16-LABEL: 'fcmp_ole'
+; NOF16-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = fcmp ole <2 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4f16 = fcmp ole <4 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8f16 = fcmp ole <8 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16f16 = fcmp ole <16 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Invalid cost for instruction: %nxv1f16 = fcmp ole <vscale x 1 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Invalid cost for instruction: %nxv2f16 = fcmp ole <vscale x 2 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Invalid cost for instruction: %nxv4f16 = fcmp ole <vscale x 4 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Invalid cost for instruction: %nxv8f16 = fcmp ole <vscale x 8 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Invalid cost for instruction: %nxv16f16 = fcmp ole <vscale x 16 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; VFH-LABEL: 'fcmp_ole'
+; VFH-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = fcmp ole <2 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4f16 = fcmp ole <4 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8f16 = fcmp ole <8 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16f16 = fcmp ole <16 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1f16 = fcmp ole <vscale x 1 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2f16 = fcmp ole <vscale x 2 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv4f16 = fcmp ole <vscale x 4 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv8f16 = fcmp ole <vscale x 8 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv16f16 = fcmp ole <vscale x 16 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; VFHMIN-LABEL: 'fcmp_ole'
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = fcmp ole <2 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4f16 = fcmp ole <4 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8f16 = fcmp ole <8 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16f16 = fcmp ole <16 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1f16 = fcmp ole <vscale x 1 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2f16 = fcmp ole <vscale x 2 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv4f16 = fcmp ole <vscale x 4 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv8f16 = fcmp ole <vscale x 8 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv16f16 = fcmp ole <vscale x 16 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+  %v2f16 = fcmp ole <2 x half> undef, undef
+  %v4f16 = fcmp ole <4 x half> undef, undef
+  %v8f16 = fcmp ole <8 x half> undef, undef
+  %v16f16 = fcmp ole <16 x half> undef, undef
+  %nxv1f16 = fcmp ole <vscale x 1 x half> undef, undef
+  %nxv2f16 = fcmp ole <vscale x 2 x half> undef, undef
+  %nxv4f16 = fcmp ole <vscale x 4 x half> undef, undef
+  %nxv8f16 = fcmp ole <vscale x 8 x half> undef, undef
+  %nxv16f16 = fcmp ole <vscale x 16 x half> undef, undef
+  ret void
+}
+define void @fcmp_ogt() {
+; NOF16-LABEL: 'fcmp_ogt'
+; NOF16-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = fcmp ogt <2 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4f16 = fcmp ogt <4 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8f16 = fcmp ogt <8 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16f16 = fcmp ogt <16 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Invalid cost for instruction: %nxv1f16 = fcmp ogt <vscale x 1 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Invalid cost for instruction: %nxv2f16 = fcmp ogt <vscale x 2 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Invalid cost for instruction: %nxv4f16 = fcmp ogt <vscale x 4 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Invalid cost for instruction: %nxv8f16 = fcmp ogt <vscale x 8 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Invalid cost for instruction: %nxv16f16 = fcmp ogt <vscale x 16 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; VFH-LABEL: 'fcmp_ogt'
+; VFH-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = fcmp ogt <2 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4f16 = fcmp ogt <4 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8f16 = fcmp ogt <8 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16f16 = fcmp ogt <16 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1f16 = fcmp ogt <vscale x 1 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2f16 = fcmp ogt <vscale x 2 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv4f16 = fcmp ogt <vscale x 4 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv8f16 = fcmp ogt <vscale x 8 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv16f16 = fcmp ogt <vscale x 16 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; VFHMIN-LABEL: 'fcmp_ogt'
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = fcmp ogt <2 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4f16 = fcmp ogt <4 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8f16 = fcmp ogt <8 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16f16 = fcmp ogt <16 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1f16 = fcmp ogt <vscale x 1 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2f16 = fcmp ogt <vscale x 2 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv4f16 = fcmp ogt <vscale x 4 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv8f16 = fcmp ogt <vscale x 8 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv16f16 = fcmp ogt <vscale x 16 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+  %v2f16 = fcmp ogt <2 x half> undef, undef
+  %v4f16 = fcmp ogt <4 x half> undef, undef
+  %v8f16 = fcmp ogt <8 x half> undef, undef
+  %v16f16 = fcmp ogt <16 x half> undef, undef
+  %nxv1f16 = fcmp ogt <vscale x 1 x half> undef, undef
+  %nxv2f16 = fcmp ogt <vscale x 2 x half> undef, undef
+  %nxv4f16 = fcmp ogt <vscale x 4 x half> undef, undef
+  %nxv8f16 = fcmp ogt <vscale x 8 x half> undef, undef
+  %nxv16f16 = fcmp ogt <vscale x 16 x half> undef, undef
+  ret void
+}
+define void @fcmp_oge() {
+; NOF16-LABEL: 'fcmp_oge'
+; NOF16-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = fcmp oge <2 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4f16 = fcmp oge <4 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8f16 = fcmp oge <8 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16f16 = fcmp oge <16 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Invalid cost for instruction: %nxv1f16 = fcmp oge <vscale x 1 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Invalid cost for instruction: %nxv2f16 = fcmp oge <vscale x 2 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Invalid cost for instruction: %nxv4f16 = fcmp oge <vscale x 4 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Invalid cost for instruction: %nxv8f16 = fcmp oge <vscale x 8 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Invalid cost for instruction: %nxv16f16 = fcmp oge <vscale x 16 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; VFH-LABEL: 'fcmp_oge'
+; VFH-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = fcmp oge <2 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4f16 = fcmp oge <4 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8f16 = fcmp oge <8 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16f16 = fcmp oge <16 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1f16 = fcmp oge <vscale x 1 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2f16 = fcmp oge <vscale x 2 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv4f16 = fcmp oge <vscale x 4 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv8f16 = fcmp oge <vscale x 8 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv16f16 = fcmp oge <vscale x 16 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; VFHMIN-LABEL: 'fcmp_oge'
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = fcmp oge <2 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4f16 = fcmp oge <4 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8f16 = fcmp oge <8 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16f16 = fcmp oge <16 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1f16 = fcmp oge <vscale x 1 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2f16 = fcmp oge <vscale x 2 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv4f16 = fcmp oge <vscale x 4 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv8f16 = fcmp oge <vscale x 8 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv16f16 = fcmp oge <vscale x 16 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+  %v2f16 = fcmp oge <2 x half> undef, undef
+  %v4f16 = fcmp oge <4 x half> undef, undef
+  %v8f16 = fcmp oge <8 x half> undef, undef
+  %v16f16 = fcmp oge <16 x half> undef, undef
+  %nxv1f16 = fcmp oge <vscale x 1 x half> undef, undef
+  %nxv2f16 = fcmp oge <vscale x 2 x half> undef, undef
+  %nxv4f16 = fcmp oge <vscale x 4 x half> undef, undef
+  %nxv8f16 = fcmp oge <vscale x 8 x half> undef, undef
+  %nxv16f16 = fcmp oge <vscale x 16 x half> undef, undef
+  ret void
+}
+define void @fcmp_ueq() {
+; NOF16-LABEL: 'fcmp_ueq'
+; NOF16-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = fcmp ueq <2 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4f16 = fcmp ueq <4 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8f16 = fcmp ueq <8 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16f16 = fcmp ueq <16 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Invalid cost for instruction: %nxv1f16 = fcmp ueq <vscale x 1 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Invalid cost for instruction: %nxv2f16 = fcmp ueq <vscale x 2 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Invalid cost for instruction: %nxv4f16 = fcmp ueq <vscale x 4 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Invalid cost for instruction: %nxv8f16 = fcmp ueq <vscale x 8 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Invalid cost for instruction: %nxv16f16 = fcmp ueq <vscale x 16 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; VFH-LABEL: 'fcmp_ueq'
+; VFH-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2f16 = fcmp ueq <2 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4f16 = fcmp ueq <4 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v8f16 = fcmp ueq <8 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v16f16 = fcmp ueq <16 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv1f16 = fcmp ueq <vscale x 1 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv2f16 = fcmp ueq <vscale x 2 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv4f16 = fcmp ueq <vscale x 4 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %nxv8f16 = fcmp ueq <vscale x 8 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %nxv16f16 = fcmp ueq <vscale x 16 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; VFHMIN-LABEL: 'fcmp_ueq'
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = fcmp ueq <2 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4f16 = fcmp ueq <4 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8f16 = fcmp ueq <8 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16f16 = fcmp ueq <16 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1f16 = fcmp ueq <vscale x 1 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2f16 = fcmp ueq <vscale x 2 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv4f16 = fcmp ueq <vscale x 4 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv8f16 = fcmp ueq <vscale x 8 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv16f16 = fcmp ueq <vscale x 16 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+  %v2f16 = fcmp ueq <2 x half> undef, undef
+  %v4f16 = fcmp ueq <4 x half> undef, undef
+  %v8f16 = fcmp ueq <8 x half> undef, undef
+  %v16f16 = fcmp ueq <16 x half> undef, undef
+  %nxv1f16 = fcmp ueq <vscale x 1 x half> undef, undef
+  %nxv2f16 = fcmp ueq <vscale x 2 x half> undef, undef
+  %nxv4f16 = fcmp ueq <vscale x 4 x half> undef, undef
+  %nxv8f16 = fcmp ueq <vscale x 8 x half> undef, undef
+  %nxv16f16 = fcmp ueq <vscale x 16 x half> undef, undef
+  ret void
+}
+define void @fcmp_une() {
+; NOF16-LABEL: 'fcmp_une'
+; NOF16-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = fcmp une <2 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4f16 = fcmp une <4 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8f16 = fcmp une <8 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16f16 = fcmp une <16 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Invalid cost for instruction: %nxv1f16 = fcmp une <vscale x 1 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Invalid cost for instruction: %nxv2f16 = fcmp une <vscale x 2 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Invalid cost for instruction: %nxv4f16 = fcmp une <vscale x 4 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Invalid cost for instruction: %nxv8f16 = fcmp une <vscale x 8 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Invalid cost for instruction: %nxv16f16 = fcmp une <vscale x 16 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; VFH-LABEL: 'fcmp_une'
+; VFH-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = fcmp une <2 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4f16 = fcmp une <4 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8f16 = fcmp une <8 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16f16 = fcmp une <16 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1f16 = fcmp une <vscale x 1 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2f16 = fcmp une <vscale x 2 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv4f16 = fcmp une <vscale x 4 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv8f16 = fcmp une <vscale x 8 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv16f16 = fcmp une <vscale x 16 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; VFHMIN-LABEL: 'fcmp_une'
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = fcmp une <2 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4f16 = fcmp une <4 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8f16 = fcmp une <8 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16f16 = fcmp une <16 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1f16 = fcmp une <vscale x 1 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2f16 = fcmp une <vscale x 2 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv4f16 = fcmp une <vscale x 4 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv8f16 = fcmp une <vscale x 8 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv16f16 = fcmp une <vscale x 16 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+  %v2f16 = fcmp une <2 x half> undef, undef
+  %v4f16 = fcmp une <4 x half> undef, undef
+  %v8f16 = fcmp une <8 x half> undef, undef
+  %v16f16 = fcmp une <16 x half> undef, undef
+  %nxv1f16 = fcmp une <vscale x 1 x half> undef, undef
+  %nxv2f16 = fcmp une <vscale x 2 x half> undef, undef
+  %nxv4f16 = fcmp une <vscale x 4 x half> undef, undef
+  %nxv8f16 = fcmp une <vscale x 8 x half> undef, undef
+  %nxv16f16 = fcmp une <vscale x 16 x half> undef, undef
+  ret void
+}
+define void @fcmp_ult() {
+; NOF16-LABEL: 'fcmp_ult'
+; NOF16-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = fcmp ult <2 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4f16 = fcmp ult <4 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8f16 = fcmp ult <8 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16f16 = fcmp ult <16 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Invalid cost for instruction: %nxv1f16 = fcmp ult <vscale x 1 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Invalid cost for instruction: %nxv2f16 = fcmp ult <vscale x 2 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Invalid cost for instruction: %nxv4f16 = fcmp ult <vscale x 4 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Invalid cost for instruction: %nxv8f16 = fcmp ult <vscale x 8 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Invalid cost for instruction: %nxv16f16 = fcmp ult <vscale x 16 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; VFH-LABEL: 'fcmp_ult'
+; VFH-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = fcmp ult <2 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = fcmp ult <4 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f16 = fcmp ult <8 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v16f16 = fcmp ult <16 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv1f16 = fcmp ult <vscale x 1 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2f16 = fcmp ult <vscale x 2 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv4f16 = fcmp ult <vscale x 4 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv8f16 = fcmp ult <vscale x 8 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %nxv16f16 = fcmp ult <vscale x 16 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; VFHMIN-LABEL: 'fcmp_ult'
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = fcmp ult <2 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4f16 = fcmp ult <4 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8f16 = fcmp ult <8 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16f16 = fcmp ult <16 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1f16 = fcmp ult <vscale x 1 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2f16 = fcmp ult <vscale x 2 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv4f16 = fcmp ult <vscale x 4 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv8f16 = fcmp ult <vscale x 8 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv16f16 = fcmp ult <vscale x 16 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+  %v2f16 = fcmp ult <2 x half> undef, undef
+  %v4f16 = fcmp ult <4 x half> undef, undef
+  %v8f16 = fcmp ult <8 x half> undef, undef
+  %v16f16 = fcmp ult <16 x half> undef, undef
+  %nxv1f16 = fcmp ult <vscale x 1 x half> undef, undef
+  %nxv2f16 = fcmp ult <vscale x 2 x half> undef, undef
+  %nxv4f16 = fcmp ult <vscale x 4 x half> undef, undef
+  %nxv8f16 = fcmp ult <vscale x 8 x half> undef, undef
+  %nxv16f16 = fcmp ult <vscale x 16 x half> undef, undef
+  ret void
+}
+define void @fcmp_ule() {
+; NOF16-LABEL: 'fcmp_ule'
+; NOF16-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = fcmp ule <2 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4f16 = fcmp ule <4 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8f16 = fcmp ule <8 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16f16 = fcmp ule <16 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Invalid cost for instruction: %nxv1f16 = fcmp ule <vscale x 1 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Invalid cost for instruction: %nxv2f16 = fcmp ule <vscale x 2 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Invalid cost for instruction: %nxv4f16 = fcmp ule <vscale x 4 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Invalid cost for instruction: %nxv8f16 = fcmp ule <vscale x 8 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Invalid cost for instruction: %nxv16f16 = fcmp ule <vscale x 16 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; VFH-LABEL: 'fcmp_ule'
+; VFH-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = fcmp ule <2 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = fcmp ule <4 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f16 = fcmp ule <8 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v16f16 = fcmp ule <16 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv1f16 = fcmp ule <vscale x 1 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2f16 = fcmp ule <vscale x 2 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv4f16 = fcmp ule <vscale x 4 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv8f16 = fcmp ule <vscale x 8 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %nxv16f16 = fcmp ule <vscale x 16 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; VFHMIN-LABEL: 'fcmp_ule'
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = fcmp ule <2 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4f16 = fcmp ule <4 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8f16 = fcmp ule <8 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16f16 = fcmp ule <16 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1f16 = fcmp ule <vscale x 1 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2f16 = fcmp ule <vscale x 2 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv4f16 = fcmp ule <vscale x 4 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv8f16 = fcmp ule <vscale x 8 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv16f16 = fcmp ule <vscale x 16 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+  %v2f16 = fcmp ule <2 x half> undef, undef
+  %v4f16 = fcmp ule <4 x half> undef, undef
+  %v8f16 = fcmp ule <8 x half> undef, undef
+  %v16f16 = fcmp ule <16 x half> undef, undef
+  %nxv1f16 = fcmp ule <vscale x 1 x half> undef, undef
+  %nxv2f16 = fcmp ule <vscale x 2 x half> undef, undef
+  %nxv4f16 = fcmp ule <vscale x 4 x half> undef, undef
+  %nxv8f16 = fcmp ule <vscale x 8 x half> undef, undef
+  %nxv16f16 = fcmp ule <vscale x 16 x half> undef, undef
+  ret void
+}
+define void @fcmp_ugt() {
+; NOF16-LABEL: 'fcmp_ugt'
+; NOF16-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = fcmp ugt <2 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4f16 = fcmp ugt <4 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8f16 = fcmp ugt <8 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16f16 = fcmp ugt <16 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Invalid cost for instruction: %nxv1f16 = fcmp ugt <vscale x 1 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Invalid cost for instruction: %nxv2f16 = fcmp ugt <vscale x 2 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Invalid cost for instruction: %nxv4f16 = fcmp ugt <vscale x 4 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Invalid cost for instruction: %nxv8f16 = fcmp ugt <vscale x 8 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Invalid cost for instruction: %nxv16f16 = fcmp ugt <vscale x 16 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; VFH-LABEL: 'fcmp_ugt'
+; VFH-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = fcmp ugt <2 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = fcmp ugt <4 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f16 = fcmp ugt <8 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v16f16 = fcmp ugt <16 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv1f16 = fcmp ugt <vscale x 1 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2f16 = fcmp ugt <vscale x 2 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv4f16 = fcmp ugt <vscale x 4 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv8f16 = fcmp ugt <vscale x 8 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %nxv16f16 = fcmp ugt <vscale x 16 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; VFHMIN-LABEL: 'fcmp_ugt'
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = fcmp ugt <2 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4f16 = fcmp ugt <4 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8f16 = fcmp ugt <8 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16f16 = fcmp ugt <16 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1f16 = fcmp ugt <vscale x 1 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2f16 = fcmp ugt <vscale x 2 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv4f16 = fcmp ugt <vscale x 4 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv8f16 = fcmp ugt <vscale x 8 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv16f16 = fcmp ugt <vscale x 16 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+  %v2f16 = fcmp ugt <2 x half> undef, undef
+  %v4f16 = fcmp ugt <4 x half> undef, undef
+  %v8f16 = fcmp ugt <8 x half> undef, undef
+  %v16f16 = fcmp ugt <16 x half> undef, undef
+  %nxv1f16 = fcmp ugt <vscale x 1 x half> undef, undef
+  %nxv2f16 = fcmp ugt <vscale x 2 x half> undef, undef
+  %nxv4f16 = fcmp ugt <vscale x 4 x half> undef, undef
+  %nxv8f16 = fcmp ugt <vscale x 8 x half> undef, undef
+  %nxv16f16 = fcmp ugt <vscale x 16 x half> undef, undef
+  ret void
+}
+define void @fcmp_uge() {
+; NOF16-LABEL: 'fcmp_uge'
+; NOF16-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = fcmp uge <2 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4f16 = fcmp uge <4 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8f16 = fcmp uge <8 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16f16 = fcmp uge <16 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Invalid cost for instruction: %nxv1f16 = fcmp uge <vscale x 1 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Invalid cost for instruction: %nxv2f16 = fcmp uge <vscale x 2 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Invalid cost for instruction: %nxv4f16 = fcmp uge <vscale x 4 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Invalid cost for instruction: %nxv8f16 = fcmp uge <vscale x 8 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Invalid cost for instruction: %nxv16f16 = fcmp uge <vscale x 16 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; VFH-LABEL: 'fcmp_uge'
+; VFH-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = fcmp uge <2 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = fcmp uge <4 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f16 = fcmp uge <8 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v16f16 = fcmp uge <16 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv1f16 = fcmp uge <vscale x 1 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2f16 = fcmp uge <vscale x 2 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv4f16 = fcmp uge <vscale x 4 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv8f16 = fcmp uge <vscale x 8 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %nxv16f16 = fcmp uge <vscale x 16 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; VFHMIN-LABEL: 'fcmp_uge'
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = fcmp uge <2 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4f16 = fcmp uge <4 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8f16 = fcmp uge <8 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16f16 = fcmp uge <16 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1f16 = fcmp uge <vscale x 1 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2f16 = fcmp uge <vscale x 2 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv4f16 = fcmp uge <vscale x 4 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv8f16 = fcmp uge <vscale x 8 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv16f16 = fcmp uge <vscale x 16 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+  %v2f16 = fcmp uge <2 x half> undef, undef
+  %v4f16 = fcmp uge <4 x half> undef, undef
+  %v8f16 = fcmp uge <8 x half> undef, undef
+  %v16f16 = fcmp uge <16 x half> undef, undef
+  %nxv1f16 = fcmp uge <vscale x 1 x half> undef, undef
+  %nxv2f16 = fcmp uge <vscale x 2 x half> undef, undef
+  %nxv4f16 = fcmp uge <vscale x 4 x half> undef, undef
+  %nxv8f16 = fcmp uge <vscale x 8 x half> undef, undef
+  %nxv16f16 = fcmp uge <vscale x 16 x half> undef, undef
+  ret void
+}
+define void @fcmp_true() {
+; NOF16-LABEL: 'fcmp_true'
+; NOF16-NEXT:  Cost Model: Invalid cost for instruction: %v2f16 = fcmp true <2 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Invalid cost for instruction: %v4f16 = fcmp true <4 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Invalid cost for instruction: %v8f16 = fcmp true <8 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Invalid cost for instruction: %v16f16 = fcmp true <16 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1f16 = fcmp true <vscale x 1 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2f16 = fcmp true <vscale x 2 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv4f16 = fcmp true <vscale x 4 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv8f16 = fcmp true <vscale x 8 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv16f16 = fcmp true <vscale x 16 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; VFH-LABEL: 'fcmp_true'
+; VFH-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = fcmp true <2 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4f16 = fcmp true <4 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8f16 = fcmp true <8 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16f16 = fcmp true <16 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1f16 = fcmp true <vscale x 1 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2f16 = fcmp true <vscale x 2 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv4f16 = fcmp true <vscale x 4 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv8f16 = fcmp true <vscale x 8 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv16f16 = fcmp true <vscale x 16 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; VFHMIN-LABEL: 'fcmp_true'
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = fcmp true <2 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4f16 = fcmp true <4 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8f16 = fcmp true <8 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16f16 = fcmp true <16 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1f16 = fcmp true <vscale x 1 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2f16 = fcmp true <vscale x 2 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv4f16 = fcmp true <vscale x 4 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv8f16 = fcmp true <vscale x 8 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv16f16 = fcmp true <vscale x 16 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+  %v2f16 = fcmp true <2 x half> undef, undef
+  %v4f16 = fcmp true <4 x half> undef, undef
+  %v8f16 = fcmp true <8 x half> undef, undef
+  %v16f16 = fcmp true <16 x half> undef, undef
+  %nxv1f16 = fcmp true <vscale x 1 x half> undef, undef
+  %nxv2f16 = fcmp true <vscale x 2 x half> undef, undef
+  %nxv4f16 = fcmp true <vscale x 4 x half> undef, undef
+  %nxv8f16 = fcmp true <vscale x 8 x half> undef, undef
+  %nxv16f16 = fcmp true <vscale x 16 x half> undef, undef
+  ret void
+}
+define void @fcmp_false() {
+; NOF16-LABEL: 'fcmp_false'
+; NOF16-NEXT:  Cost Model: Invalid cost for instruction: %v2f16 = fcmp false <2 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Invalid cost for instruction: %v4f16 = fcmp false <4 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Invalid cost for instruction: %v8f16 = fcmp false <8 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Invalid cost for instruction: %v16f16 = fcmp false <16 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1f16 = fcmp false <vscale x 1 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2f16 = fcmp false <vscale x 2 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv4f16 = fcmp false <vscale x 4 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv8f16 = fcmp false <vscale x 8 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv16f16 = fcmp false <vscale x 16 x half> undef, undef
+; NOF16-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; VFH-LABEL: 'fcmp_false'
+; VFH-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = fcmp false <2 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4f16 = fcmp false <4 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8f16 = fcmp false <8 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16f16 = fcmp false <16 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1f16 = fcmp false <vscale x 1 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2f16 = fcmp false <vscale x 2 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv4f16 = fcmp false <vscale x 4 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv8f16 = fcmp false <vscale x 8 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv16f16 = fcmp false <vscale x 16 x half> undef, undef
+; VFH-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; VFHMIN-LABEL: 'fcmp_false'
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = fcmp false <2 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4f16 = fcmp false <4 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8f16 = fcmp false <8 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16f16 = fcmp false <16 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1f16 = fcmp false <vscale x 1 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2f16 = fcmp false <vscale x 2 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv4f16 = fcmp false <vscale x 4 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv8f16 = fcmp false <vscale x 8 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv16f16 = fcmp false <vscale x 16 x half> undef, undef
+; VFHMIN-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+  %v2f16 = fcmp false <2 x half> undef, undef
+  %v4f16 = fcmp false <4 x half> undef, undef
+  %v8f16 = fcmp false <8 x half> undef, undef
+  %v16f16 = fcmp false <16 x half> undef, undef
+  %nxv1f16 = fcmp false <vscale x 1 x half> undef, undef
+  %nxv2f16 = fcmp false <vscale x 2 x half> undef, undef
+  %nxv4f16 = fcmp false <vscale x 4 x half> undef, undef
+  %nxv8f16 = fcmp false <vscale x 8 x half> undef, undef
+  %nxv16f16 = fcmp false <vscale x 16 x half> undef, undef
+  ret void
+}
diff --git a/llvm/test/Analysis/CostModel/RISCV/shuffle-exact-vlen.ll b/llvm/test/Analysis/CostModel/RISCV/shuffle-exact-vlen.ll
index 30bae7e..cada8ab 100644
--- a/llvm/test/Analysis/CostModel/RISCV/shuffle-exact-vlen.ll
+++ b/llvm/test/Analysis/CostModel/RISCV/shuffle-exact-vlen.ll
@@ -734,7 +734,7 @@ define void @shuffle2() vscale_range(2,2) {
 define void @multipart() vscale_range(2,2) {
 ; RV32-LABEL: 'multipart'
 ; RV32-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %v16a = shufflevector <8 x i16> poison, <8 x i16> poison, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v16b = shufflevector <8 x i16> poison, <8 x i16> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16b = shufflevector <8 x i16> poison, <8 x i16> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
 ; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16c = shufflevector <16 x i16> poison, <16 x i16> poison, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
 ; RV32-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %v16d = shufflevector <16 x i16> poison, <16 x i16> poison, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
 ; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v32a = shufflevector <4 x i32> poison, <4 x i32> poison, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
@@ -743,18 +743,18 @@ define void @multipart() vscale_range(2,2) {
 ; RV32-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %v32many = shufflevector <16 x i32> poison, <16 x i32> poison, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28, i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30>
 ; RV32-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %v32many2 = shufflevector <16 x i32> poison, <16 x i32> poison, <16 x i32> <i32 1, i32 4, i32 8, i32 12, i32 17, i32 20, i32 24, i32 28, i32 2, i32 6, i32 11, i32 14, i32 18, i32 22, i32 27, i32 30>
 ; RV32-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %v323 = shufflevector <3 x i32> poison, <3 x i32> poison, <3 x i32> <i32 2, i32 3, i32 0>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v64a = shufflevector <2 x i64> poison, <2 x i64> poison, <2 x i32> <i32 1, i32 1>
+; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64a = shufflevector <2 x i64> poison, <2 x i64> poison, <2 x i32> <i32 1, i32 1>
 ; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64b = shufflevector <2 x i64> poison, <2 x i64> poison, <2 x i32> zeroinitializer
 ; RV32-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v64ab = shufflevector <4 x i64> poison, <4 x i64> poison, <4 x i32> <i32 1, i32 1, i32 0, i32 0>
 ; RV32-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %v64d = shufflevector <4 x i64> poison, <4 x i64> poison, <4 x i32> <i32 1, i32 1, i32 4, i32 4>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %f64a = shufflevector <2 x double> poison, <2 x double> poison, <2 x i32> <i32 1, i32 1>
+; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f64a = shufflevector <2 x double> poison, <2 x double> poison, <2 x i32> <i32 1, i32 1>
 ; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f64b = shufflevector <2 x double> poison, <2 x double> poison, <2 x i32> zeroinitializer
 ; RV32-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %f64ab = shufflevector <4 x double> poison, <4 x double> poison, <4 x i32> <i32 1, i32 1, i32 0, i32 0>
 ; RV32-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; RV64-LABEL: 'multipart'
 ; RV64-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %v16a = shufflevector <8 x i16> poison, <8 x i16> poison, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v16b = shufflevector <8 x i16> poison, <8 x i16> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16b = shufflevector <8 x i16> poison, <8 x i16> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
 ; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16c = shufflevector <16 x i16> poison, <16 x i16> poison, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
 ; RV64-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %v16d = shufflevector <16 x i16> poison, <16 x i16> poison, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
 ; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v32a = shufflevector <4 x i32> poison, <4 x i32> poison, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
@@ -763,18 +763,18 @@ define void @multipart() vscale_range(2,2) {
 ; RV64-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %v32many = shufflevector <16 x i32> poison, <16 x i32> poison, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28, i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30>
 ; RV64-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %v32many2 = shufflevector <16 x i32> poison, <16 x i32> poison, <16 x i32> <i32 1, i32 4, i32 8, i32 12, i32 17, i32 20, i32 24, i32 28, i32 2, i32 6, i32 11, i32 14, i32 18, i32 22, i32 27, i32 30>
 ; RV64-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %v323 = shufflevector <3 x i32> poison, <3 x i32> poison, <3 x i32> <i32 2, i32 3, i32 0>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v64a = shufflevector <2 x i64> poison, <2 x i64> poison, <2 x i32> <i32 1, i32 1>
+; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64a = shufflevector <2 x i64> poison, <2 x i64> poison, <2 x i32> <i32 1, i32 1>
 ; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64b = shufflevector <2 x i64> poison, <2 x i64> poison, <2 x i32> zeroinitializer
 ; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v64ab = shufflevector <4 x i64> poison, <4 x i64> poison, <4 x i32> <i32 1, i32 1, i32 0, i32 0>
 ; RV64-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %v64d = shufflevector <4 x i64> poison, <4 x i64> poison, <4 x i32> <i32 1, i32 1, i32 4, i32 4>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %f64a = shufflevector <2 x double> poison, <2 x double> poison, <2 x i32> <i32 1, i32 1>
+; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f64a = shufflevector <2 x double> poison, <2 x double> poison, <2 x i32> <i32 1, i32 1>
 ; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f64b = shufflevector <2 x double> poison, <2 x double> poison, <2 x i32> zeroinitializer
 ; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %f64ab = shufflevector <4 x double> poison, <4 x double> poison, <4 x i32> <i32 1, i32 1, i32 0, i32 0>
 ; RV64-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; CHECK-SIZE-LABEL: 'multipart'
 ; CHECK-SIZE-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %v16a = shufflevector <8 x i16> poison, <8 x i16> poison, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
-; CHECK-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v16b = shufflevector <8 x i16> poison, <8 x i16> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+; CHECK-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16b = shufflevector <8 x i16> poison, <8 x i16> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
 ; CHECK-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v16c = shufflevector <16 x i16> poison, <16 x i16> poison, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
 ; CHECK-SIZE-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %v16d = shufflevector <16 x i16> poison, <16 x i16> poison, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
 ; CHECK-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v32a = shufflevector <4 x i32> poison, <4 x i32> poison, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
@@ -783,11 +783,11 @@ define void @multipart() vscale_range(2,2) {
 ; CHECK-SIZE-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %v32many = shufflevector <16 x i32> poison, <16 x i32> poison, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28, i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30>
 ; CHECK-SIZE-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %v32many2 = shufflevector <16 x i32> poison, <16 x i32> poison, <16 x i32> <i32 1, i32 4, i32 8, i32 12, i32 17, i32 20, i32 24, i32 28, i32 2, i32 6, i32 11, i32 14, i32 18, i32 22, i32 27, i32 30>
 ; CHECK-SIZE-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %v323 = shufflevector <3 x i32> poison, <3 x i32> poison, <3 x i32> <i32 2, i32 3, i32 0>
-; CHECK-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v64a = shufflevector <2 x i64> poison, <2 x i64> poison, <2 x i32> <i32 1, i32 1>
+; CHECK-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64a = shufflevector <2 x i64> poison, <2 x i64> poison, <2 x i32> <i32 1, i32 1>
 ; CHECK-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64b = shufflevector <2 x i64> poison, <2 x i64> poison, <2 x i32> zeroinitializer
 ; CHECK-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v64ab = shufflevector <4 x i64> poison, <4 x i64> poison, <4 x i32> <i32 1, i32 1, i32 0, i32 0>
 ; CHECK-SIZE-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %v64d = shufflevector <4 x i64> poison, <4 x i64> poison, <4 x i32> <i32 1, i32 1, i32 4, i32 4>
-; CHECK-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %f64a = shufflevector <2 x double> poison, <2 x double> poison, <2 x i32> <i32 1, i32 1>
+; CHECK-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f64a = shufflevector <2 x double> poison, <2 x double> poison, <2 x i32> <i32 1, i32 1>
 ; CHECK-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f64b = shufflevector <2 x double> poison, <2 x double> poison, <2 x i32> zeroinitializer
 ; CHECK-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %f64ab = shufflevector <4 x double> poison, <4 x double> poison, <4 x i32> <i32 1, i32 1, i32 0, i32 0>
 ; CHECK-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
diff --git a/llvm/test/Analysis/CostModel/X86/alternate-shuffle-cost.ll b/llvm/test/Analysis/CostModel/X86/alternate-shuffle-cost.ll
index 20d2736..033537d 100644
--- a/llvm/test/Analysis/CostModel/X86/alternate-shuffle-cost.ll
+++ b/llvm/test/Analysis/CostModel/X86/alternate-shuffle-cost.ll
@@ -247,21 +247,9 @@ define <4 x float> @test_v4f32_2(<4 x float> %a, <4 x float> %b) {
 }
 
 define <4 x float> @test_v4f32_3(<4 x float> %a, <4 x float> %b) {
-; SSE2-LABEL: 'test_v4f32_3'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x float> %1
-;
-; SSSE3-LABEL: 'test_v4f32_3'
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x float> %1
-;
-; SSE42-LABEL: 'test_v4f32_3'
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x float> %1
-;
-; AVX-LABEL: 'test_v4f32_3'
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x float> %1
+; CHECK-LABEL: 'test_v4f32_3'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x float> %1
 ;
   %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
   ret <4 x float> %1
@@ -294,13 +282,9 @@ define <4 x i64> @test_v4i64_2(<4 x i64> %a, <4 x i64> %b) {
 }
 
 define <4 x i64> @test_v4i64_3(<4 x i64> %a, <4 x i64> %b) {
-; SSE-LABEL: 'test_v4i64_3'
-; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %1 = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i64> %1
-;
-; AVX-LABEL: 'test_v4i64_3'
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %1 = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i64> %1
+; CHECK-LABEL: 'test_v4i64_3'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %1 = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i64> %1
 ;
   %1 = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
   ret <4 x i64> %1
@@ -333,13 +317,9 @@ define <4 x double> @test_v4f64_2(<4 x double> %a, <4 x double> %b) {
 }
 
 define <4 x double> @test_v4f64_3(<4 x double> %a, <4 x double> %b) {
-; SSE-LABEL: 'test_v4f64_3'
-; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %1 = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 3>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x double> %1
-;
-; AVX-LABEL: 'test_v4f64_3'
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %1 = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 3>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x double> %1
+; CHECK-LABEL: 'test_v4f64_3'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %1 = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 3>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x double> %1
 ;
   %1 = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 3>
   ret <4 x double> %1
diff --git a/llvm/test/Analysis/CostModel/X86/reduction.ll b/llvm/test/Analysis/CostModel/X86/reduction.ll
index c555459..d7cf8e6 100644
--- a/llvm/test/Analysis/CostModel/X86/reduction.ll
+++ b/llvm/test/Analysis/CostModel/X86/reduction.ll
@@ -926,8 +926,8 @@ define fastcc double @pairwise_reduction4double(<4 x double> %rdx, double %f1) {
 
 define fastcc float @pairwise_reduction8float(<8 x float> %rdx, float %f1) {
 ; SSE2-LABEL: 'pairwise_reduction8float'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf.0.0 = shufflevector <8 x float> %rdx, <8 x float> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf.0.1 = shufflevector <8 x float> %rdx, <8 x float> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.0.0 = shufflevector <8 x float> %rdx, <8 x float> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.0.1 = shufflevector <8 x float> %rdx, <8 x float> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 poison, i32 poison, i32 poison, i32 poison>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx = fadd <8 x float> %rdx.shuf.0.0, %rdx.shuf.0.1
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.0 = shufflevector <8 x float> %bin.rdx, <8 x float> undef, <8 x i32> <i32 0, i32 2, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <8 x float> %bin.rdx, <8 x float> undef, <8 x i32> <i32 1, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
@@ -939,8 +939,8 @@ define fastcc float @pairwise_reduction8float(<8 x float> %rdx, float %f1) {
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret float %r
 ;
 ; SSSE3-LABEL: 'pairwise_reduction8float'
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf.0.0 = shufflevector <8 x float> %rdx, <8 x float> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf.0.1 = shufflevector <8 x float> %rdx, <8 x float> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.0.0 = shufflevector <8 x float> %rdx, <8 x float> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.0.1 = shufflevector <8 x float> %rdx, <8 x float> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 poison, i32 poison, i32 poison, i32 poison>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx = fadd <8 x float> %rdx.shuf.0.0, %rdx.shuf.0.1
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.0 = shufflevector <8 x float> %bin.rdx, <8 x float> undef, <8 x i32> <i32 0, i32 2, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <8 x float> %bin.rdx, <8 x float> undef, <8 x i32> <i32 1, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
@@ -952,8 +952,8 @@ define fastcc float @pairwise_reduction8float(<8 x float> %rdx, float %f1) {
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret float %r
 ;
 ; SSE42-LABEL: 'pairwise_reduction8float'
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf.0.0 = shufflevector <8 x float> %rdx, <8 x float> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf.0.1 = shufflevector <8 x float> %rdx, <8 x float> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.0.0 = shufflevector <8 x float> %rdx, <8 x float> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.0.1 = shufflevector <8 x float> %rdx, <8 x float> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 poison, i32 poison, i32 poison, i32 poison>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx = fadd <8 x float> %rdx.shuf.0.0, %rdx.shuf.0.1
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.0 = shufflevector <8 x float> %bin.rdx, <8 x float> undef, <8 x i32> <i32 0, i32 2, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <8 x float> %bin.rdx, <8 x float> undef, <8 x i32> <i32 1, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
@@ -991,8 +991,8 @@ define fastcc float @pairwise_reduction8float(<8 x float> %rdx, float %f1) {
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret float %r
 ;
 ; SLM-LABEL: 'pairwise_reduction8float'
-; SLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf.0.0 = shufflevector <8 x float> %rdx, <8 x float> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 poison, i32 poison, i32 poison, i32 poison>
-; SLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf.0.1 = shufflevector <8 x float> %rdx, <8 x float> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 poison, i32 poison, i32 poison, i32 poison>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.0.0 = shufflevector <8 x float> %rdx, <8 x float> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 poison, i32 poison, i32 poison, i32 poison>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.0.1 = shufflevector <8 x float> %rdx, <8 x float> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 poison, i32 poison, i32 poison, i32 poison>
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx = fadd <8 x float> %rdx.shuf.0.0, %rdx.shuf.0.1
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.0 = shufflevector <8 x float> %bin.rdx, <8 x float> undef, <8 x i32> <i32 0, i32 2, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <8 x float> %bin.rdx, <8 x float> undef, <8 x i32> <i32 1, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
diff --git a/llvm/test/Analysis/CostModel/X86/shuffle-insert_subvector-codesize.ll b/llvm/test/Analysis/CostModel/X86/shuffle-insert_subvector-codesize.ll
index 4e42351..11ee433 100644
--- a/llvm/test/Analysis/CostModel/X86/shuffle-insert_subvector-codesize.ll
+++ b/llvm/test/Analysis/CostModel/X86/shuffle-insert_subvector-codesize.ll
@@ -21,13 +21,13 @@ define void @test_vXf64(<2 x double> %src128, <4 x double> %src256, <8 x double>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src128_256 = shufflevector <2 x double> %src128, <2 x double> undef, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src128_512 = shufflevector <2 x double> %src128, <2 x double> undef, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src256_512 = shufflevector <4 x double> %src256, <4 x double> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256_01 = shufflevector <4 x double> %src256, <4 x double> %src128_256, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_01 = shufflevector <4 x double> %src256, <4 x double> %src128_256, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_23 = shufflevector <4 x double> %src256, <4 x double> %src128_256, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512_01 = shufflevector <8 x double> %src512, <8 x double> %src128_512, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_01 = shufflevector <8 x double> %src512, <8 x double> %src128_512, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_23 = shufflevector <8 x double> %src512, <8 x double> %src128_512, <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 4, i32 5, i32 6, i32 7>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_45 = shufflevector <8 x double> %src512, <8 x double> %src128_512, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 6, i32 7>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_67 = shufflevector <8 x double> %src512, <8 x double> %src128_512, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512_0123 = shufflevector <8 x double> %src512, <8 x double> %src256_512, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_0123 = shufflevector <8 x double> %src512, <8 x double> %src256_512, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_4567 = shufflevector <8 x double> %src512, <8 x double> %src256_512, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
@@ -37,11 +37,11 @@ define void @test_vXf64(<2 x double> %src128, <4 x double> %src256, <8 x double>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src256_512 = shufflevector <4 x double> %src256, <4 x double> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_01 = shufflevector <4 x double> %src256, <4 x double> %src128_256, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_23 = shufflevector <4 x double> %src256, <4 x double> %src128_256, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_01 = shufflevector <8 x double> %src512, <8 x double> %src128_512, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_01 = shufflevector <8 x double> %src512, <8 x double> %src128_512, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_23 = shufflevector <8 x double> %src512, <8 x double> %src128_512, <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 4, i32 5, i32 6, i32 7>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_45 = shufflevector <8 x double> %src512, <8 x double> %src128_512, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 6, i32 7>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_67 = shufflevector <8 x double> %src512, <8 x double> %src128_512, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_0123 = shufflevector <8 x double> %src512, <8 x double> %src256_512, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_0123 = shufflevector <8 x double> %src512, <8 x double> %src256_512, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_4567 = shufflevector <8 x double> %src512, <8 x double> %src256_512, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
@@ -79,13 +79,13 @@ define void @test_vXi64(<2 x i64> %src128, <4 x i64> %src256, <8 x i64> %src512)
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src128_256 = shufflevector <2 x i64> %src128, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src128_512 = shufflevector <2 x i64> %src128, <2 x i64> undef, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src256_512 = shufflevector <4 x i64> %src256, <4 x i64> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256_01 = shufflevector <4 x i64> %src256, <4 x i64> %src128_256, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_01 = shufflevector <4 x i64> %src256, <4 x i64> %src128_256, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_23 = shufflevector <4 x i64> %src256, <4 x i64> %src128_256, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512_01 = shufflevector <8 x i64> %src512, <8 x i64> %src128_512, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_01 = shufflevector <8 x i64> %src512, <8 x i64> %src128_512, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_23 = shufflevector <8 x i64> %src512, <8 x i64> %src128_512, <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 4, i32 5, i32 6, i32 7>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_45 = shufflevector <8 x i64> %src512, <8 x i64> %src128_512, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 6, i32 7>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_67 = shufflevector <8 x i64> %src512, <8 x i64> %src128_512, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512_0123 = shufflevector <8 x i64> %src512, <8 x i64> %src256_512, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_0123 = shufflevector <8 x i64> %src512, <8 x i64> %src256_512, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_4567 = shufflevector <8 x i64> %src512, <8 x i64> %src256_512, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
@@ -95,11 +95,11 @@ define void @test_vXi64(<2 x i64> %src128, <4 x i64> %src256, <8 x i64> %src512)
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src256_512 = shufflevector <4 x i64> %src256, <4 x i64> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_01 = shufflevector <4 x i64> %src256, <4 x i64> %src128_256, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_23 = shufflevector <4 x i64> %src256, <4 x i64> %src128_256, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_01 = shufflevector <8 x i64> %src512, <8 x i64> %src128_512, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_01 = shufflevector <8 x i64> %src512, <8 x i64> %src128_512, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_23 = shufflevector <8 x i64> %src512, <8 x i64> %src128_512, <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 4, i32 5, i32 6, i32 7>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_45 = shufflevector <8 x i64> %src512, <8 x i64> %src128_512, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 6, i32 7>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_67 = shufflevector <8 x i64> %src512, <8 x i64> %src128_512, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_0123 = shufflevector <8 x i64> %src512, <8 x i64> %src256_512, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_0123 = shufflevector <8 x i64> %src512, <8 x i64> %src256_512, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_4567 = shufflevector <8 x i64> %src512, <8 x i64> %src256_512, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
@@ -133,98 +133,36 @@ define void @test_vXi64(<2 x i64> %src128, <4 x i64> %src256, <8 x i64> %src512)
 }
 
 define void @test_vXf32(<2 x float> %src64, <4 x float> %src128, <8 x float> %src256, <16 x float> %src512) {
-; SSE2-LABEL: 'test_vXf32'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src64_128 = shufflevector <2 x float> %src64, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src64_256 = shufflevector <2 x float> %src64, <2 x float> undef, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src64_512 = shufflevector <2 x float> %src64, <2 x float> undef, <16 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src128_256 = shufflevector <4 x float> %src128, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src128_512 = shufflevector <4 x float> %src128, <4 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src256_512 = shufflevector <8 x float> %src256, <8 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V128_01 = shufflevector <4 x float> %src128, <4 x float> %src64_128, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V128_23 = shufflevector <4 x float> %src128, <4 x float> %src64_128, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V256_01 = shufflevector <8 x float> %src256, <8 x float> %src64_256, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256_23 = shufflevector <8 x float> %src256, <8 x float> %src64_256, <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 4, i32 5, i32 6, i32 7>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_45 = shufflevector <8 x float> %src256, <8 x float> %src64_256, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 6, i32 7>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256_67 = shufflevector <8 x float> %src256, <8 x float> %src64_256, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V256_0123 = shufflevector <8 x float> %src256, <8 x float> %src128_256, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_4567 = shufflevector <8 x float> %src256, <8 x float> %src128_256, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V512_01 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_23 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 16, i32 17, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_45 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_67 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 16, i32 17, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_89 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_AB = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 16, i32 17, i32 12, i32 13, i32 14, i32 15>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_CD = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 14, i32 15>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_EF = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 16, i32 17>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V512_0123 = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_4567 = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_89AB = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 12, i32 13, i32 14, i32 15>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_CDEF = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V512_01234567 = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_89ABCDEF = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
-;
-; SSSE3-LABEL: 'test_vXf32'
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src64_128 = shufflevector <2 x float> %src64, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src64_256 = shufflevector <2 x float> %src64, <2 x float> undef, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src64_512 = shufflevector <2 x float> %src64, <2 x float> undef, <16 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src128_256 = shufflevector <4 x float> %src128, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src128_512 = shufflevector <4 x float> %src128, <4 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src256_512 = shufflevector <8 x float> %src256, <8 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V128_01 = shufflevector <4 x float> %src128, <4 x float> %src64_128, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V128_23 = shufflevector <4 x float> %src128, <4 x float> %src64_128, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V256_01 = shufflevector <8 x float> %src256, <8 x float> %src64_256, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256_23 = shufflevector <8 x float> %src256, <8 x float> %src64_256, <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 4, i32 5, i32 6, i32 7>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_45 = shufflevector <8 x float> %src256, <8 x float> %src64_256, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 6, i32 7>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256_67 = shufflevector <8 x float> %src256, <8 x float> %src64_256, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V256_0123 = shufflevector <8 x float> %src256, <8 x float> %src128_256, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_4567 = shufflevector <8 x float> %src256, <8 x float> %src128_256, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V512_01 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_23 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 16, i32 17, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_45 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_67 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 16, i32 17, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_89 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_AB = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 16, i32 17, i32 12, i32 13, i32 14, i32 15>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_CD = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 14, i32 15>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_EF = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 16, i32 17>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V512_0123 = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_4567 = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_89AB = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 12, i32 13, i32 14, i32 15>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_CDEF = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V512_01234567 = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_89ABCDEF = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
-;
-; SSE42-LABEL: 'test_vXf32'
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src64_128 = shufflevector <2 x float> %src64, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src64_256 = shufflevector <2 x float> %src64, <2 x float> undef, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src64_512 = shufflevector <2 x float> %src64, <2 x float> undef, <16 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src128_256 = shufflevector <4 x float> %src128, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src128_512 = shufflevector <4 x float> %src128, <4 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src256_512 = shufflevector <8 x float> %src256, <8 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128_01 = shufflevector <4 x float> %src128, <4 x float> %src64_128, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V128_23 = shufflevector <4 x float> %src128, <4 x float> %src64_128, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256_01 = shufflevector <8 x float> %src256, <8 x float> %src64_256, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256_23 = shufflevector <8 x float> %src256, <8 x float> %src64_256, <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 4, i32 5, i32 6, i32 7>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_45 = shufflevector <8 x float> %src256, <8 x float> %src64_256, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 6, i32 7>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256_67 = shufflevector <8 x float> %src256, <8 x float> %src64_256, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256_0123 = shufflevector <8 x float> %src256, <8 x float> %src128_256, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_4567 = shufflevector <8 x float> %src256, <8 x float> %src128_256, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512_01 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_23 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 16, i32 17, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_45 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_67 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 16, i32 17, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_89 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_AB = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 16, i32 17, i32 12, i32 13, i32 14, i32 15>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_CD = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 14, i32 15>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_EF = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 16, i32 17>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512_0123 = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_4567 = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_89AB = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 12, i32 13, i32 14, i32 15>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_CDEF = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512_01234567 = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_89ABCDEF = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+; SSE-LABEL: 'test_vXf32'
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src64_128 = shufflevector <2 x float> %src64, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src64_256 = shufflevector <2 x float> %src64, <2 x float> undef, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src64_512 = shufflevector <2 x float> %src64, <2 x float> undef, <16 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src128_256 = shufflevector <4 x float> %src128, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src128_512 = shufflevector <4 x float> %src128, <4 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src256_512 = shufflevector <8 x float> %src256, <8 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128_01 = shufflevector <4 x float> %src128, <4 x float> %src64_128, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128_23 = shufflevector <4 x float> %src128, <4 x float> %src64_128, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_01 = shufflevector <8 x float> %src256, <8 x float> %src64_256, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_23 = shufflevector <8 x float> %src256, <8 x float> %src64_256, <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 4, i32 5, i32 6, i32 7>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_45 = shufflevector <8 x float> %src256, <8 x float> %src64_256, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 6, i32 7>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_67 = shufflevector <8 x float> %src256, <8 x float> %src64_256, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_0123 = shufflevector <8 x float> %src256, <8 x float> %src128_256, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_4567 = shufflevector <8 x float> %src256, <8 x float> %src128_256, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_01 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_23 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 16, i32 17, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_45 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_67 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 16, i32 17, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_89 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_AB = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 16, i32 17, i32 12, i32 13, i32 14, i32 15>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_CD = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 14, i32 15>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_EF = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 16, i32 17>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_0123 = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_4567 = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_89AB = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 12, i32 13, i32 14, i32 15>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_CDEF = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_01234567 = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_89ABCDEF = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX1-LABEL: 'test_vXf32'
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src64_128 = shufflevector <2 x float> %src64, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
@@ -234,14 +172,14 @@ define void @test_vXf32(<2 x float> %src64, <4 x float> %src128, <8 x float> %sr
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src128_512 = shufflevector <4 x float> %src128, <4 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src256_512 = shufflevector <8 x float> %src256, <8 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128_01 = shufflevector <4 x float> %src128, <4 x float> %src64_128, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V128_23 = shufflevector <4 x float> %src128, <4 x float> %src64_128, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128_23 = shufflevector <4 x float> %src128, <4 x float> %src64_128, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_01 = shufflevector <8 x float> %src256, <8 x float> %src64_256, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256_23 = shufflevector <8 x float> %src256, <8 x float> %src64_256, <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 4, i32 5, i32 6, i32 7>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_45 = shufflevector <8 x float> %src256, <8 x float> %src64_256, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 6, i32 7>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V256_67 = shufflevector <8 x float> %src256, <8 x float> %src64_256, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_0123 = shufflevector <8 x float> %src256, <8 x float> %src128_256, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_4567 = shufflevector <8 x float> %src256, <8 x float> %src128_256, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_01 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_01 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_23 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 16, i32 17, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_45 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512_67 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 16, i32 17, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
@@ -249,11 +187,11 @@ define void @test_vXf32(<2 x float> %src64, <4 x float> %src128, <8 x float> %sr
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_AB = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 16, i32 17, i32 12, i32 13, i32 14, i32 15>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_CD = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 14, i32 15>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512_EF = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 16, i32 17>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_0123 = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_0123 = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_4567 = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_89AB = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 12, i32 13, i32 14, i32 15>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_CDEF = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_01234567 = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_01234567 = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_89ABCDEF = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
@@ -265,14 +203,14 @@ define void @test_vXf32(<2 x float> %src64, <4 x float> %src128, <8 x float> %sr
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src128_512 = shufflevector <4 x float> %src128, <4 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src256_512 = shufflevector <8 x float> %src256, <8 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128_01 = shufflevector <4 x float> %src128, <4 x float> %src64_128, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V128_23 = shufflevector <4 x float> %src128, <4 x float> %src64_128, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128_23 = shufflevector <4 x float> %src128, <4 x float> %src64_128, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_01 = shufflevector <8 x float> %src256, <8 x float> %src64_256, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256_23 = shufflevector <8 x float> %src256, <8 x float> %src64_256, <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 4, i32 5, i32 6, i32 7>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_45 = shufflevector <8 x float> %src256, <8 x float> %src64_256, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 6, i32 7>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V256_67 = shufflevector <8 x float> %src256, <8 x float> %src64_256, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_0123 = shufflevector <8 x float> %src256, <8 x float> %src128_256, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_4567 = shufflevector <8 x float> %src256, <8 x float> %src128_256, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_01 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_01 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_23 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 16, i32 17, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_45 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V512_67 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 16, i32 17, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
@@ -280,11 +218,11 @@ define void @test_vXf32(<2 x float> %src64, <4 x float> %src128, <8 x float> %sr
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_AB = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 16, i32 17, i32 12, i32 13, i32 14, i32 15>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_CD = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 14, i32 15>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V512_EF = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 16, i32 17>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_0123 = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_0123 = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_4567 = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_89AB = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 12, i32 13, i32 14, i32 15>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_CDEF = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_01234567 = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_01234567 = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_89ABCDEF = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
@@ -363,13 +301,13 @@ define void @test_vXi32(<2 x i32> %src64, <4 x i32> %src128, <8 x i32> %src256,
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src256_512 = shufflevector <8 x i32> %src256, <8 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V128_01 = shufflevector <4 x i32> %src128, <4 x i32> %src64_128, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V128_23 = shufflevector <4 x i32> %src128, <4 x i32> %src64_128, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V256_01 = shufflevector <8 x i32> %src256, <8 x i32> %src64_256, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_01 = shufflevector <8 x i32> %src256, <8 x i32> %src64_256, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256_23 = shufflevector <8 x i32> %src256, <8 x i32> %src64_256, <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 4, i32 5, i32 6, i32 7>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_45 = shufflevector <8 x i32> %src256, <8 x i32> %src64_256, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 6, i32 7>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256_67 = shufflevector <8 x i32> %src256, <8 x i32> %src64_256, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V256_0123 = shufflevector <8 x i32> %src256, <8 x i32> %src128_256, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_0123 = shufflevector <8 x i32> %src256, <8 x i32> %src128_256, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_4567 = shufflevector <8 x i32> %src256, <8 x i32> %src128_256, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V512_01 = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_01 = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_23 = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 0, i32 1, i32 16, i32 17, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_45 = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_67 = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 16, i32 17, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
@@ -377,11 +315,11 @@ define void @test_vXi32(<2 x i32> %src64, <4 x i32> %src128, <8 x i32> %src256,
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_AB = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 16, i32 17, i32 12, i32 13, i32 14, i32 15>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_CD = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 14, i32 15>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_EF = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 16, i32 17>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V512_0123 = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_0123 = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_4567 = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_89AB = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 12, i32 13, i32 14, i32 15>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_CDEF = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V512_01234567 = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_01234567 = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_89ABCDEF = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
@@ -394,13 +332,13 @@ define void @test_vXi32(<2 x i32> %src64, <4 x i32> %src128, <8 x i32> %src256,
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src256_512 = shufflevector <8 x i32> %src256, <8 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V128_01 = shufflevector <4 x i32> %src128, <4 x i32> %src64_128, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V128_23 = shufflevector <4 x i32> %src128, <4 x i32> %src64_128, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V256_01 = shufflevector <8 x i32> %src256, <8 x i32> %src64_256, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_01 = shufflevector <8 x i32> %src256, <8 x i32> %src64_256, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256_23 = shufflevector <8 x i32> %src256, <8 x i32> %src64_256, <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 4, i32 5, i32 6, i32 7>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_45 = shufflevector <8 x i32> %src256, <8 x i32> %src64_256, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 6, i32 7>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256_67 = shufflevector <8 x i32> %src256, <8 x i32> %src64_256, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V256_0123 = shufflevector <8 x i32> %src256, <8 x i32> %src128_256, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_0123 = shufflevector <8 x i32> %src256, <8 x i32> %src128_256, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_4567 = shufflevector <8 x i32> %src256, <8 x i32> %src128_256, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V512_01 = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_01 = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_23 = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 0, i32 1, i32 16, i32 17, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_45 = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_67 = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 16, i32 17, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
@@ -408,11 +346,11 @@ define void @test_vXi32(<2 x i32> %src64, <4 x i32> %src128, <8 x i32> %src256,
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_AB = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 16, i32 17, i32 12, i32 13, i32 14, i32 15>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_CD = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 14, i32 15>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_EF = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 16, i32 17>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V512_0123 = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_0123 = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_4567 = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_89AB = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 12, i32 13, i32 14, i32 15>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_CDEF = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V512_01234567 = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_01234567 = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_89ABCDEF = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
@@ -425,13 +363,13 @@ define void @test_vXi32(<2 x i32> %src64, <4 x i32> %src128, <8 x i32> %src256,
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src256_512 = shufflevector <8 x i32> %src256, <8 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128_01 = shufflevector <4 x i32> %src128, <4 x i32> %src64_128, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V128_23 = shufflevector <4 x i32> %src128, <4 x i32> %src64_128, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256_01 = shufflevector <8 x i32> %src256, <8 x i32> %src64_256, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_01 = shufflevector <8 x i32> %src256, <8 x i32> %src64_256, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256_23 = shufflevector <8 x i32> %src256, <8 x i32> %src64_256, <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 4, i32 5, i32 6, i32 7>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_45 = shufflevector <8 x i32> %src256, <8 x i32> %src64_256, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 6, i32 7>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256_67 = shufflevector <8 x i32> %src256, <8 x i32> %src64_256, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256_0123 = shufflevector <8 x i32> %src256, <8 x i32> %src128_256, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_0123 = shufflevector <8 x i32> %src256, <8 x i32> %src128_256, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_4567 = shufflevector <8 x i32> %src256, <8 x i32> %src128_256, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512_01 = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_01 = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_23 = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 0, i32 1, i32 16, i32 17, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_45 = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_67 = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 16, i32 17, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
@@ -439,11 +377,11 @@ define void @test_vXi32(<2 x i32> %src64, <4 x i32> %src128, <8 x i32> %src256,
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_AB = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 16, i32 17, i32 12, i32 13, i32 14, i32 15>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_CD = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 14, i32 15>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_EF = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 16, i32 17>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512_0123 = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_0123 = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_4567 = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_89AB = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 12, i32 13, i32 14, i32 15>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_CDEF = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512_01234567 = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_01234567 = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_89ABCDEF = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
@@ -462,7 +400,7 @@ define void @test_vXi32(<2 x i32> %src64, <4 x i32> %src128, <8 x i32> %src256,
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V256_67 = shufflevector <8 x i32> %src256, <8 x i32> %src64_256, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_0123 = shufflevector <8 x i32> %src256, <8 x i32> %src128_256, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_4567 = shufflevector <8 x i32> %src256, <8 x i32> %src128_256, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_01 = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_01 = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_23 = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 0, i32 1, i32 16, i32 17, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_45 = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512_67 = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 16, i32 17, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
@@ -470,11 +408,11 @@ define void @test_vXi32(<2 x i32> %src64, <4 x i32> %src128, <8 x i32> %src256,
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_AB = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 16, i32 17, i32 12, i32 13, i32 14, i32 15>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_CD = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 14, i32 15>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512_EF = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 16, i32 17>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_0123 = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_0123 = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_4567 = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_89AB = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 12, i32 13, i32 14, i32 15>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_CDEF = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_01234567 = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_01234567 = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_89ABCDEF = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
@@ -493,7 +431,7 @@ define void @test_vXi32(<2 x i32> %src64, <4 x i32> %src128, <8 x i32> %src256,
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V256_67 = shufflevector <8 x i32> %src256, <8 x i32> %src64_256, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_0123 = shufflevector <8 x i32> %src256, <8 x i32> %src128_256, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_4567 = shufflevector <8 x i32> %src256, <8 x i32> %src128_256, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_01 = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_01 = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_23 = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 0, i32 1, i32 16, i32 17, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_45 = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V512_67 = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 16, i32 17, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
@@ -501,11 +439,11 @@ define void @test_vXi32(<2 x i32> %src64, <4 x i32> %src128, <8 x i32> %src256,
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_AB = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 16, i32 17, i32 12, i32 13, i32 14, i32 15>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_CD = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 14, i32 15>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V512_EF = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 16, i32 17>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_0123 = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_0123 = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_4567 = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_89AB = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 12, i32 13, i32 14, i32 15>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_CDEF = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_01234567 = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_01234567 = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_89ABCDEF = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
@@ -594,7 +532,7 @@ define void @test_vXi16(<2 x i16> %src32, <4 x i16> %src64, <8 x i16> %src128, <
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V128_67 = shufflevector <8 x i16> %src128, <8 x i16> %src32_128, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V128_0123 = shufflevector <8 x i16> %src128, <8 x i16> %src32_128, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V128_4567 = shufflevector <8 x i16> %src128, <8 x i16> %src32_128, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V256_01 = shufflevector <16 x i16> %src256, <16 x i16> %src64_256, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_01 = shufflevector <16 x i16> %src256, <16 x i16> %src64_256, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V256_23 = shufflevector <16 x i16> %src256, <16 x i16> %src64_256, <16 x i32> <i32 0, i32 1, i32 16, i32 17, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V256_45 = shufflevector <16 x i16> %src256, <16 x i16> %src64_256, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V256_67 = shufflevector <16 x i16> %src256, <16 x i16> %src64_256, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 16, i32 17, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
@@ -602,11 +540,11 @@ define void @test_vXi16(<2 x i16> %src32, <4 x i16> %src64, <8 x i16> %src128, <
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V256_AB = shufflevector <16 x i16> %src256, <16 x i16> %src64_256, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 16, i32 17, i32 12, i32 13, i32 14, i32 15>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V256_CD = shufflevector <16 x i16> %src256, <16 x i16> %src64_256, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 14, i32 15>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V256_EF = shufflevector <16 x i16> %src256, <16 x i16> %src64_256, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 16, i32 17>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V256_0123 = shufflevector <16 x i16> %src256, <16 x i16> %src128_256, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_0123 = shufflevector <16 x i16> %src256, <16 x i16> %src128_256, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V256_4567 = shufflevector <16 x i16> %src256, <16 x i16> %src128_256, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_89AB = shufflevector <16 x i16> %src256, <16 x i16> %src128_256, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 12, i32 13, i32 14, i32 15>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V256_CDEF = shufflevector <16 x i16> %src256, <16 x i16> %src128_256, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V256_01234567 = shufflevector <16 x i16> %src256, <16 x i16> %src128_256, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_01234567 = shufflevector <16 x i16> %src256, <16 x i16> %src128_256, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_89ABCDEF = shufflevector <16 x i16> %src256, <16 x i16> %src128_256, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
@@ -629,7 +567,7 @@ define void @test_vXi16(<2 x i16> %src32, <4 x i16> %src64, <8 x i16> %src128, <
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V128_67 = shufflevector <8 x i16> %src128, <8 x i16> %src32_128, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V128_0123 = shufflevector <8 x i16> %src128, <8 x i16> %src32_128, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V128_4567 = shufflevector <8 x i16> %src128, <8 x i16> %src32_128, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V256_01 = shufflevector <16 x i16> %src256, <16 x i16> %src64_256, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_01 = shufflevector <16 x i16> %src256, <16 x i16> %src64_256, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V256_23 = shufflevector <16 x i16> %src256, <16 x i16> %src64_256, <16 x i32> <i32 0, i32 1, i32 16, i32 17, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V256_45 = shufflevector <16 x i16> %src256, <16 x i16> %src64_256, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V256_67 = shufflevector <16 x i16> %src256, <16 x i16> %src64_256, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 16, i32 17, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
@@ -637,11 +575,11 @@ define void @test_vXi16(<2 x i16> %src32, <4 x i16> %src64, <8 x i16> %src128, <
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V256_AB = shufflevector <16 x i16> %src256, <16 x i16> %src64_256, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 16, i32 17, i32 12, i32 13, i32 14, i32 15>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V256_CD = shufflevector <16 x i16> %src256, <16 x i16> %src64_256, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 14, i32 15>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V256_EF = shufflevector <16 x i16> %src256, <16 x i16> %src64_256, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 16, i32 17>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V256_0123 = shufflevector <16 x i16> %src256, <16 x i16> %src128_256, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_0123 = shufflevector <16 x i16> %src256, <16 x i16> %src128_256, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V256_4567 = shufflevector <16 x i16> %src256, <16 x i16> %src128_256, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_89AB = shufflevector <16 x i16> %src256, <16 x i16> %src128_256, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 12, i32 13, i32 14, i32 15>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V256_CDEF = shufflevector <16 x i16> %src256, <16 x i16> %src128_256, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V256_01234567 = shufflevector <16 x i16> %src256, <16 x i16> %src128_256, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_01234567 = shufflevector <16 x i16> %src256, <16 x i16> %src128_256, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_89ABCDEF = shufflevector <16 x i16> %src256, <16 x i16> %src128_256, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
@@ -664,7 +602,7 @@ define void @test_vXi16(<2 x i16> %src32, <4 x i16> %src64, <8 x i16> %src128, <
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V128_67 = shufflevector <8 x i16> %src128, <8 x i16> %src32_128, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128_0123 = shufflevector <8 x i16> %src128, <8 x i16> %src32_128, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V128_4567 = shufflevector <8 x i16> %src128, <8 x i16> %src32_128, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256_01 = shufflevector <16 x i16> %src256, <16 x i16> %src64_256, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_01 = shufflevector <16 x i16> %src256, <16 x i16> %src64_256, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V256_23 = shufflevector <16 x i16> %src256, <16 x i16> %src64_256, <16 x i32> <i32 0, i32 1, i32 16, i32 17, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V256_45 = shufflevector <16 x i16> %src256, <16 x i16> %src64_256, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V256_67 = shufflevector <16 x i16> %src256, <16 x i16> %src64_256, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 16, i32 17, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
@@ -672,11 +610,11 @@ define void @test_vXi16(<2 x i16> %src32, <4 x i16> %src64, <8 x i16> %src128, <
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V256_AB = shufflevector <16 x i16> %src256, <16 x i16> %src64_256, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 16, i32 17, i32 12, i32 13, i32 14, i32 15>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V256_CD = shufflevector <16 x i16> %src256, <16 x i16> %src64_256, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 14, i32 15>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V256_EF = shufflevector <16 x i16> %src256, <16 x i16> %src64_256, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 16, i32 17>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256_0123 = shufflevector <16 x i16> %src256, <16 x i16> %src128_256, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_0123 = shufflevector <16 x i16> %src256, <16 x i16> %src128_256, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V256_4567 = shufflevector <16 x i16> %src256, <16 x i16> %src128_256, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_89AB = shufflevector <16 x i16> %src256, <16 x i16> %src128_256, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 12, i32 13, i32 14, i32 15>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V256_CDEF = shufflevector <16 x i16> %src256, <16 x i16> %src128_256, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256_01234567 = shufflevector <16 x i16> %src256, <16 x i16> %src128_256, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_01234567 = shufflevector <16 x i16> %src256, <16 x i16> %src128_256, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_89ABCDEF = shufflevector <16 x i16> %src256, <16 x i16> %src128_256, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
diff --git a/llvm/test/Analysis/CostModel/X86/shuffle-insert_subvector-latency.ll b/llvm/test/Analysis/CostModel/X86/shuffle-insert_subvector-latency.ll
index 61978ba..ac407cb 100644
--- a/llvm/test/Analysis/CostModel/X86/shuffle-insert_subvector-latency.ll
+++ b/llvm/test/Analysis/CostModel/X86/shuffle-insert_subvector-latency.ll
@@ -21,13 +21,13 @@ define void @test_vXf64(<2 x double> %src128, <4 x double> %src256, <8 x double>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src128_256 = shufflevector <2 x double> %src128, <2 x double> undef, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src128_512 = shufflevector <2 x double> %src128, <2 x double> undef, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src256_512 = shufflevector <4 x double> %src256, <4 x double> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256_01 = shufflevector <4 x double> %src256, <4 x double> %src128_256, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_01 = shufflevector <4 x double> %src256, <4 x double> %src128_256, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_23 = shufflevector <4 x double> %src256, <4 x double> %src128_256, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512_01 = shufflevector <8 x double> %src512, <8 x double> %src128_512, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_01 = shufflevector <8 x double> %src512, <8 x double> %src128_512, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_23 = shufflevector <8 x double> %src512, <8 x double> %src128_512, <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 4, i32 5, i32 6, i32 7>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_45 = shufflevector <8 x double> %src512, <8 x double> %src128_512, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 6, i32 7>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_67 = shufflevector <8 x double> %src512, <8 x double> %src128_512, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512_0123 = shufflevector <8 x double> %src512, <8 x double> %src256_512, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_0123 = shufflevector <8 x double> %src512, <8 x double> %src256_512, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_4567 = shufflevector <8 x double> %src512, <8 x double> %src256_512, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
@@ -37,11 +37,11 @@ define void @test_vXf64(<2 x double> %src128, <4 x double> %src256, <8 x double>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src256_512 = shufflevector <4 x double> %src256, <4 x double> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_01 = shufflevector <4 x double> %src256, <4 x double> %src128_256, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_23 = shufflevector <4 x double> %src256, <4 x double> %src128_256, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_01 = shufflevector <8 x double> %src512, <8 x double> %src128_512, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_01 = shufflevector <8 x double> %src512, <8 x double> %src128_512, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_23 = shufflevector <8 x double> %src512, <8 x double> %src128_512, <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 4, i32 5, i32 6, i32 7>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_45 = shufflevector <8 x double> %src512, <8 x double> %src128_512, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 6, i32 7>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_67 = shufflevector <8 x double> %src512, <8 x double> %src128_512, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_0123 = shufflevector <8 x double> %src512, <8 x double> %src256_512, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_0123 = shufflevector <8 x double> %src512, <8 x double> %src256_512, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_4567 = shufflevector <8 x double> %src512, <8 x double> %src256_512, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
@@ -79,13 +79,13 @@ define void @test_vXi64(<2 x i64> %src128, <4 x i64> %src256, <8 x i64> %src512)
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src128_256 = shufflevector <2 x i64> %src128, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src128_512 = shufflevector <2 x i64> %src128, <2 x i64> undef, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src256_512 = shufflevector <4 x i64> %src256, <4 x i64> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256_01 = shufflevector <4 x i64> %src256, <4 x i64> %src128_256, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_01 = shufflevector <4 x i64> %src256, <4 x i64> %src128_256, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_23 = shufflevector <4 x i64> %src256, <4 x i64> %src128_256, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512_01 = shufflevector <8 x i64> %src512, <8 x i64> %src128_512, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_01 = shufflevector <8 x i64> %src512, <8 x i64> %src128_512, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_23 = shufflevector <8 x i64> %src512, <8 x i64> %src128_512, <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 4, i32 5, i32 6, i32 7>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_45 = shufflevector <8 x i64> %src512, <8 x i64> %src128_512, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 6, i32 7>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_67 = shufflevector <8 x i64> %src512, <8 x i64> %src128_512, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512_0123 = shufflevector <8 x i64> %src512, <8 x i64> %src256_512, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_0123 = shufflevector <8 x i64> %src512, <8 x i64> %src256_512, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_4567 = shufflevector <8 x i64> %src512, <8 x i64> %src256_512, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
@@ -95,11 +95,11 @@ define void @test_vXi64(<2 x i64> %src128, <4 x i64> %src256, <8 x i64> %src512)
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src256_512 = shufflevector <4 x i64> %src256, <4 x i64> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_01 = shufflevector <4 x i64> %src256, <4 x i64> %src128_256, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_23 = shufflevector <4 x i64> %src256, <4 x i64> %src128_256, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_01 = shufflevector <8 x i64> %src512, <8 x i64> %src128_512, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_01 = shufflevector <8 x i64> %src512, <8 x i64> %src128_512, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_23 = shufflevector <8 x i64> %src512, <8 x i64> %src128_512, <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 4, i32 5, i32 6, i32 7>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_45 = shufflevector <8 x i64> %src512, <8 x i64> %src128_512, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 6, i32 7>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_67 = shufflevector <8 x i64> %src512, <8 x i64> %src128_512, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_0123 = shufflevector <8 x i64> %src512, <8 x i64> %src256_512, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_0123 = shufflevector <8 x i64> %src512, <8 x i64> %src256_512, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_4567 = shufflevector <8 x i64> %src512, <8 x i64> %src256_512, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
@@ -133,98 +133,36 @@ define void @test_vXi64(<2 x i64> %src128, <4 x i64> %src256, <8 x i64> %src512)
 }
 
 define void @test_vXf32(<2 x float> %src64, <4 x float> %src128, <8 x float> %src256, <16 x float> %src512) {
-; SSE2-LABEL: 'test_vXf32'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src64_128 = shufflevector <2 x float> %src64, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src64_256 = shufflevector <2 x float> %src64, <2 x float> undef, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src64_512 = shufflevector <2 x float> %src64, <2 x float> undef, <16 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src128_256 = shufflevector <4 x float> %src128, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src128_512 = shufflevector <4 x float> %src128, <4 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src256_512 = shufflevector <8 x float> %src256, <8 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V128_01 = shufflevector <4 x float> %src128, <4 x float> %src64_128, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V128_23 = shufflevector <4 x float> %src128, <4 x float> %src64_128, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V256_01 = shufflevector <8 x float> %src256, <8 x float> %src64_256, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256_23 = shufflevector <8 x float> %src256, <8 x float> %src64_256, <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 4, i32 5, i32 6, i32 7>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_45 = shufflevector <8 x float> %src256, <8 x float> %src64_256, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 6, i32 7>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256_67 = shufflevector <8 x float> %src256, <8 x float> %src64_256, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V256_0123 = shufflevector <8 x float> %src256, <8 x float> %src128_256, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_4567 = shufflevector <8 x float> %src256, <8 x float> %src128_256, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V512_01 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_23 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 16, i32 17, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_45 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_67 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 16, i32 17, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_89 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_AB = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 16, i32 17, i32 12, i32 13, i32 14, i32 15>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_CD = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 14, i32 15>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_EF = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 16, i32 17>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V512_0123 = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_4567 = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_89AB = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 12, i32 13, i32 14, i32 15>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_CDEF = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V512_01234567 = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_89ABCDEF = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
-;
-; SSSE3-LABEL: 'test_vXf32'
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src64_128 = shufflevector <2 x float> %src64, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src64_256 = shufflevector <2 x float> %src64, <2 x float> undef, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src64_512 = shufflevector <2 x float> %src64, <2 x float> undef, <16 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src128_256 = shufflevector <4 x float> %src128, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src128_512 = shufflevector <4 x float> %src128, <4 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src256_512 = shufflevector <8 x float> %src256, <8 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V128_01 = shufflevector <4 x float> %src128, <4 x float> %src64_128, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V128_23 = shufflevector <4 x float> %src128, <4 x float> %src64_128, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V256_01 = shufflevector <8 x float> %src256, <8 x float> %src64_256, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256_23 = shufflevector <8 x float> %src256, <8 x float> %src64_256, <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 4, i32 5, i32 6, i32 7>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_45 = shufflevector <8 x float> %src256, <8 x float> %src64_256, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 6, i32 7>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256_67 = shufflevector <8 x float> %src256, <8 x float> %src64_256, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V256_0123 = shufflevector <8 x float> %src256, <8 x float> %src128_256, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_4567 = shufflevector <8 x float> %src256, <8 x float> %src128_256, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V512_01 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_23 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 16, i32 17, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_45 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_67 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 16, i32 17, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_89 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_AB = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 16, i32 17, i32 12, i32 13, i32 14, i32 15>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_CD = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 14, i32 15>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_EF = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 16, i32 17>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V512_0123 = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_4567 = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_89AB = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 12, i32 13, i32 14, i32 15>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_CDEF = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V512_01234567 = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_89ABCDEF = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
-;
-; SSE42-LABEL: 'test_vXf32'
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src64_128 = shufflevector <2 x float> %src64, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src64_256 = shufflevector <2 x float> %src64, <2 x float> undef, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src64_512 = shufflevector <2 x float> %src64, <2 x float> undef, <16 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src128_256 = shufflevector <4 x float> %src128, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src128_512 = shufflevector <4 x float> %src128, <4 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src256_512 = shufflevector <8 x float> %src256, <8 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128_01 = shufflevector <4 x float> %src128, <4 x float> %src64_128, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V128_23 = shufflevector <4 x float> %src128, <4 x float> %src64_128, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256_01 = shufflevector <8 x float> %src256, <8 x float> %src64_256, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256_23 = shufflevector <8 x float> %src256, <8 x float> %src64_256, <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 4, i32 5, i32 6, i32 7>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_45 = shufflevector <8 x float> %src256, <8 x float> %src64_256, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 6, i32 7>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256_67 = shufflevector <8 x float> %src256, <8 x float> %src64_256, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256_0123 = shufflevector <8 x float> %src256, <8 x float> %src128_256, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_4567 = shufflevector <8 x float> %src256, <8 x float> %src128_256, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512_01 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_23 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 16, i32 17, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_45 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_67 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 16, i32 17, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_89 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_AB = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 16, i32 17, i32 12, i32 13, i32 14, i32 15>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_CD = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 14, i32 15>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_EF = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 16, i32 17>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512_0123 = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_4567 = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_89AB = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 12, i32 13, i32 14, i32 15>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_CDEF = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512_01234567 = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_89ABCDEF = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+; SSE-LABEL: 'test_vXf32'
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src64_128 = shufflevector <2 x float> %src64, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src64_256 = shufflevector <2 x float> %src64, <2 x float> undef, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src64_512 = shufflevector <2 x float> %src64, <2 x float> undef, <16 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src128_256 = shufflevector <4 x float> %src128, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src128_512 = shufflevector <4 x float> %src128, <4 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src256_512 = shufflevector <8 x float> %src256, <8 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128_01 = shufflevector <4 x float> %src128, <4 x float> %src64_128, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128_23 = shufflevector <4 x float> %src128, <4 x float> %src64_128, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_01 = shufflevector <8 x float> %src256, <8 x float> %src64_256, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_23 = shufflevector <8 x float> %src256, <8 x float> %src64_256, <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 4, i32 5, i32 6, i32 7>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_45 = shufflevector <8 x float> %src256, <8 x float> %src64_256, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 6, i32 7>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_67 = shufflevector <8 x float> %src256, <8 x float> %src64_256, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_0123 = shufflevector <8 x float> %src256, <8 x float> %src128_256, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_4567 = shufflevector <8 x float> %src256, <8 x float> %src128_256, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_01 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_23 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 16, i32 17, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_45 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_67 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 16, i32 17, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_89 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_AB = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 16, i32 17, i32 12, i32 13, i32 14, i32 15>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_CD = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 14, i32 15>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_EF = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 16, i32 17>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_0123 = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_4567 = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_89AB = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 12, i32 13, i32 14, i32 15>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_CDEF = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_01234567 = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_89ABCDEF = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX1-LABEL: 'test_vXf32'
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src64_128 = shufflevector <2 x float> %src64, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
@@ -234,14 +172,14 @@ define void @test_vXf32(<2 x float> %src64, <4 x float> %src128, <8 x float> %sr
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src128_512 = shufflevector <4 x float> %src128, <4 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src256_512 = shufflevector <8 x float> %src256, <8 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128_01 = shufflevector <4 x float> %src128, <4 x float> %src64_128, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V128_23 = shufflevector <4 x float> %src128, <4 x float> %src64_128, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128_23 = shufflevector <4 x float> %src128, <4 x float> %src64_128, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_01 = shufflevector <8 x float> %src256, <8 x float> %src64_256, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256_23 = shufflevector <8 x float> %src256, <8 x float> %src64_256, <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 4, i32 5, i32 6, i32 7>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_45 = shufflevector <8 x float> %src256, <8 x float> %src64_256, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 6, i32 7>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V256_67 = shufflevector <8 x float> %src256, <8 x float> %src64_256, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_0123 = shufflevector <8 x float> %src256, <8 x float> %src128_256, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_4567 = shufflevector <8 x float> %src256, <8 x float> %src128_256, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_01 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_01 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_23 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 16, i32 17, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_45 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512_67 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 16, i32 17, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
@@ -249,11 +187,11 @@ define void @test_vXf32(<2 x float> %src64, <4 x float> %src128, <8 x float> %sr
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_AB = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 16, i32 17, i32 12, i32 13, i32 14, i32 15>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_CD = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 14, i32 15>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512_EF = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 16, i32 17>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_0123 = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_0123 = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_4567 = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_89AB = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 12, i32 13, i32 14, i32 15>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_CDEF = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_01234567 = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_01234567 = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_89ABCDEF = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
@@ -265,14 +203,14 @@ define void @test_vXf32(<2 x float> %src64, <4 x float> %src128, <8 x float> %sr
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src128_512 = shufflevector <4 x float> %src128, <4 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src256_512 = shufflevector <8 x float> %src256, <8 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128_01 = shufflevector <4 x float> %src128, <4 x float> %src64_128, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V128_23 = shufflevector <4 x float> %src128, <4 x float> %src64_128, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128_23 = shufflevector <4 x float> %src128, <4 x float> %src64_128, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_01 = shufflevector <8 x float> %src256, <8 x float> %src64_256, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256_23 = shufflevector <8 x float> %src256, <8 x float> %src64_256, <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 4, i32 5, i32 6, i32 7>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_45 = shufflevector <8 x float> %src256, <8 x float> %src64_256, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 6, i32 7>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V256_67 = shufflevector <8 x float> %src256, <8 x float> %src64_256, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_0123 = shufflevector <8 x float> %src256, <8 x float> %src128_256, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_4567 = shufflevector <8 x float> %src256, <8 x float> %src128_256, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_01 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_01 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_23 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 16, i32 17, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_45 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V512_67 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 16, i32 17, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
@@ -280,11 +218,11 @@ define void @test_vXf32(<2 x float> %src64, <4 x float> %src128, <8 x float> %sr
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_AB = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 16, i32 17, i32 12, i32 13, i32 14, i32 15>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_CD = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 14, i32 15>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V512_EF = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 16, i32 17>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_0123 = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_0123 = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_4567 = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_89AB = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 12, i32 13, i32 14, i32 15>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_CDEF = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_01234567 = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_01234567 = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_89ABCDEF = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
@@ -363,13 +301,13 @@ define void @test_vXi32(<2 x i32> %src64, <4 x i32> %src128, <8 x i32> %src256,
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src256_512 = shufflevector <8 x i32> %src256, <8 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V128_01 = shufflevector <4 x i32> %src128, <4 x i32> %src64_128, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V128_23 = shufflevector <4 x i32> %src128, <4 x i32> %src64_128, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V256_01 = shufflevector <8 x i32> %src256, <8 x i32> %src64_256, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_01 = shufflevector <8 x i32> %src256, <8 x i32> %src64_256, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256_23 = shufflevector <8 x i32> %src256, <8 x i32> %src64_256, <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 4, i32 5, i32 6, i32 7>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_45 = shufflevector <8 x i32> %src256, <8 x i32> %src64_256, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 6, i32 7>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256_67 = shufflevector <8 x i32> %src256, <8 x i32> %src64_256, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V256_0123 = shufflevector <8 x i32> %src256, <8 x i32> %src128_256, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_0123 = shufflevector <8 x i32> %src256, <8 x i32> %src128_256, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_4567 = shufflevector <8 x i32> %src256, <8 x i32> %src128_256, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V512_01 = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_01 = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_23 = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 0, i32 1, i32 16, i32 17, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_45 = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_67 = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 16, i32 17, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
@@ -377,11 +315,11 @@ define void @test_vXi32(<2 x i32> %src64, <4 x i32> %src128, <8 x i32> %src256,
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_AB = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 16, i32 17, i32 12, i32 13, i32 14, i32 15>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_CD = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 14, i32 15>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_EF = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 16, i32 17>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V512_0123 = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_0123 = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_4567 = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_89AB = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 12, i32 13, i32 14, i32 15>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_CDEF = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V512_01234567 = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_01234567 = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_89ABCDEF = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
@@ -394,13 +332,13 @@ define void @test_vXi32(<2 x i32> %src64, <4 x i32> %src128, <8 x i32> %src256,
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src256_512 = shufflevector <8 x i32> %src256, <8 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V128_01 = shufflevector <4 x i32> %src128, <4 x i32> %src64_128, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V128_23 = shufflevector <4 x i32> %src128, <4 x i32> %src64_128, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V256_01 = shufflevector <8 x i32> %src256, <8 x i32> %src64_256, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_01 = shufflevector <8 x i32> %src256, <8 x i32> %src64_256, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256_23 = shufflevector <8 x i32> %src256, <8 x i32> %src64_256, <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 4, i32 5, i32 6, i32 7>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_45 = shufflevector <8 x i32> %src256, <8 x i32> %src64_256, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 6, i32 7>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256_67 = shufflevector <8 x i32> %src256, <8 x i32> %src64_256, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V256_0123 = shufflevector <8 x i32> %src256, <8 x i32> %src128_256, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_0123 = shufflevector <8 x i32> %src256, <8 x i32> %src128_256, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_4567 = shufflevector <8 x i32> %src256, <8 x i32> %src128_256, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V512_01 = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_01 = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_23 = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 0, i32 1, i32 16, i32 17, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_45 = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_67 = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 16, i32 17, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
@@ -408,11 +346,11 @@ define void @test_vXi32(<2 x i32> %src64, <4 x i32> %src128, <8 x i32> %src256,
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_AB = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 16, i32 17, i32 12, i32 13, i32 14, i32 15>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_CD = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 14, i32 15>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_EF = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 16, i32 17>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V512_0123 = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_0123 = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_4567 = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_89AB = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 12, i32 13, i32 14, i32 15>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_CDEF = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V512_01234567 = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_01234567 = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_89ABCDEF = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
@@ -425,13 +363,13 @@ define void @test_vXi32(<2 x i32> %src64, <4 x i32> %src128, <8 x i32> %src256,
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src256_512 = shufflevector <8 x i32> %src256, <8 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128_01 = shufflevector <4 x i32> %src128, <4 x i32> %src64_128, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V128_23 = shufflevector <4 x i32> %src128, <4 x i32> %src64_128, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256_01 = shufflevector <8 x i32> %src256, <8 x i32> %src64_256, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_01 = shufflevector <8 x i32> %src256, <8 x i32> %src64_256, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256_23 = shufflevector <8 x i32> %src256, <8 x i32> %src64_256, <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 4, i32 5, i32 6, i32 7>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_45 = shufflevector <8 x i32> %src256, <8 x i32> %src64_256, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 6, i32 7>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256_67 = shufflevector <8 x i32> %src256, <8 x i32> %src64_256, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256_0123 = shufflevector <8 x i32> %src256, <8 x i32> %src128_256, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_0123 = shufflevector <8 x i32> %src256, <8 x i32> %src128_256, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_4567 = shufflevector <8 x i32> %src256, <8 x i32> %src128_256, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512_01 = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_01 = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_23 = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 0, i32 1, i32 16, i32 17, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_45 = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_67 = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 16, i32 17, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
@@ -439,11 +377,11 @@ define void @test_vXi32(<2 x i32> %src64, <4 x i32> %src128, <8 x i32> %src256,
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_AB = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 16, i32 17, i32 12, i32 13, i32 14, i32 15>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_CD = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 14, i32 15>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_EF = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 16, i32 17>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512_0123 = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_0123 = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_4567 = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_89AB = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 12, i32 13, i32 14, i32 15>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_CDEF = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512_01234567 = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_01234567 = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_89ABCDEF = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
@@ -462,7 +400,7 @@ define void @test_vXi32(<2 x i32> %src64, <4 x i32> %src128, <8 x i32> %src256,
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V256_67 = shufflevector <8 x i32> %src256, <8 x i32> %src64_256, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_0123 = shufflevector <8 x i32> %src256, <8 x i32> %src128_256, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_4567 = shufflevector <8 x i32> %src256, <8 x i32> %src128_256, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_01 = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_01 = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_23 = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 0, i32 1, i32 16, i32 17, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_45 = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512_67 = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 16, i32 17, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
@@ -470,11 +408,11 @@ define void @test_vXi32(<2 x i32> %src64, <4 x i32> %src128, <8 x i32> %src256,
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_AB = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 16, i32 17, i32 12, i32 13, i32 14, i32 15>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_CD = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 14, i32 15>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512_EF = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 16, i32 17>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_0123 = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_0123 = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_4567 = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_89AB = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 12, i32 13, i32 14, i32 15>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_CDEF = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_01234567 = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_01234567 = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_89ABCDEF = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
@@ -493,7 +431,7 @@ define void @test_vXi32(<2 x i32> %src64, <4 x i32> %src128, <8 x i32> %src256,
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V256_67 = shufflevector <8 x i32> %src256, <8 x i32> %src64_256, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_0123 = shufflevector <8 x i32> %src256, <8 x i32> %src128_256, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_4567 = shufflevector <8 x i32> %src256, <8 x i32> %src128_256, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_01 = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_01 = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_23 = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 0, i32 1, i32 16, i32 17, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_45 = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V512_67 = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 16, i32 17, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
@@ -501,11 +439,11 @@ define void @test_vXi32(<2 x i32> %src64, <4 x i32> %src128, <8 x i32> %src256,
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_AB = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 16, i32 17, i32 12, i32 13, i32 14, i32 15>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_CD = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 14, i32 15>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V512_EF = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 16, i32 17>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_0123 = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_0123 = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_4567 = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_89AB = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 12, i32 13, i32 14, i32 15>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_CDEF = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_01234567 = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_01234567 = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_89ABCDEF = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
@@ -594,7 +532,7 @@ define void @test_vXi16(<2 x i16> %src32, <4 x i16> %src64, <8 x i16> %src128, <
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V128_67 = shufflevector <8 x i16> %src128, <8 x i16> %src32_128, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V128_0123 = shufflevector <8 x i16> %src128, <8 x i16> %src32_128, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V128_4567 = shufflevector <8 x i16> %src128, <8 x i16> %src32_128, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V256_01 = shufflevector <16 x i16> %src256, <16 x i16> %src64_256, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_01 = shufflevector <16 x i16> %src256, <16 x i16> %src64_256, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V256_23 = shufflevector <16 x i16> %src256, <16 x i16> %src64_256, <16 x i32> <i32 0, i32 1, i32 16, i32 17, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V256_45 = shufflevector <16 x i16> %src256, <16 x i16> %src64_256, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V256_67 = shufflevector <16 x i16> %src256, <16 x i16> %src64_256, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 16, i32 17, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
@@ -602,11 +540,11 @@ define void @test_vXi16(<2 x i16> %src32, <4 x i16> %src64, <8 x i16> %src128, <
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V256_AB = shufflevector <16 x i16> %src256, <16 x i16> %src64_256, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 16, i32 17, i32 12, i32 13, i32 14, i32 15>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V256_CD = shufflevector <16 x i16> %src256, <16 x i16> %src64_256, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 14, i32 15>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V256_EF = shufflevector <16 x i16> %src256, <16 x i16> %src64_256, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 16, i32 17>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V256_0123 = shufflevector <16 x i16> %src256, <16 x i16> %src128_256, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_0123 = shufflevector <16 x i16> %src256, <16 x i16> %src128_256, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V256_4567 = shufflevector <16 x i16> %src256, <16 x i16> %src128_256, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_89AB = shufflevector <16 x i16> %src256, <16 x i16> %src128_256, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 12, i32 13, i32 14, i32 15>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V256_CDEF = shufflevector <16 x i16> %src256, <16 x i16> %src128_256, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V256_01234567 = shufflevector <16 x i16> %src256, <16 x i16> %src128_256, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_01234567 = shufflevector <16 x i16> %src256, <16 x i16> %src128_256, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_89ABCDEF = shufflevector <16 x i16> %src256, <16 x i16> %src128_256, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
@@ -629,7 +567,7 @@ define void @test_vXi16(<2 x i16> %src32, <4 x i16> %src64, <8 x i16> %src128, <
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V128_67 = shufflevector <8 x i16> %src128, <8 x i16> %src32_128, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V128_0123 = shufflevector <8 x i16> %src128, <8 x i16> %src32_128, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V128_4567 = shufflevector <8 x i16> %src128, <8 x i16> %src32_128, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V256_01 = shufflevector <16 x i16> %src256, <16 x i16> %src64_256, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_01 = shufflevector <16 x i16> %src256, <16 x i16> %src64_256, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V256_23 = shufflevector <16 x i16> %src256, <16 x i16> %src64_256, <16 x i32> <i32 0, i32 1, i32 16, i32 17, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V256_45 = shufflevector <16 x i16> %src256, <16 x i16> %src64_256, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V256_67 = shufflevector <16 x i16> %src256, <16 x i16> %src64_256, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 16, i32 17, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
@@ -637,11 +575,11 @@ define void @test_vXi16(<2 x i16> %src32, <4 x i16> %src64, <8 x i16> %src128, <
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V256_AB = shufflevector <16 x i16> %src256, <16 x i16> %src64_256, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 16, i32 17, i32 12, i32 13, i32 14, i32 15>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V256_CD = shufflevector <16 x i16> %src256, <16 x i16> %src64_256, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 14, i32 15>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V256_EF = shufflevector <16 x i16> %src256, <16 x i16> %src64_256, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 16, i32 17>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V256_0123 = shufflevector <16 x i16> %src256, <16 x i16> %src128_256, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_0123 = shufflevector <16 x i16> %src256, <16 x i16> %src128_256, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V256_4567 = shufflevector <16 x i16> %src256, <16 x i16> %src128_256, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_89AB = shufflevector <16 x i16> %src256, <16 x i16> %src128_256, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 12, i32 13, i32 14, i32 15>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V256_CDEF = shufflevector <16 x i16> %src256, <16 x i16> %src128_256, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V256_01234567 = shufflevector <16 x i16> %src256, <16 x i16> %src128_256, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_01234567 = shufflevector <16 x i16> %src256, <16 x i16> %src128_256, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_89ABCDEF = shufflevector <16 x i16> %src256, <16 x i16> %src128_256, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
@@ -664,7 +602,7 @@ define void @test_vXi16(<2 x i16> %src32, <4 x i16> %src64, <8 x i16> %src128, <
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V128_67 = shufflevector <8 x i16> %src128, <8 x i16> %src32_128, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128_0123 = shufflevector <8 x i16> %src128, <8 x i16> %src32_128, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V128_4567 = shufflevector <8 x i16> %src128, <8 x i16> %src32_128, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256_01 = shufflevector <16 x i16> %src256, <16 x i16> %src64_256, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_01 = shufflevector <16 x i16> %src256, <16 x i16> %src64_256, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V256_23 = shufflevector <16 x i16> %src256, <16 x i16> %src64_256, <16 x i32> <i32 0, i32 1, i32 16, i32 17, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V256_45 = shufflevector <16 x i16> %src256, <16 x i16> %src64_256, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V256_67 = shufflevector <16 x i16> %src256, <16 x i16> %src64_256, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 16, i32 17, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
@@ -672,11 +610,11 @@ define void @test_vXi16(<2 x i16> %src32, <4 x i16> %src64, <8 x i16> %src128, <
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V256_AB = shufflevector <16 x i16> %src256, <16 x i16> %src64_256, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 16, i32 17, i32 12, i32 13, i32 14, i32 15>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V256_CD = shufflevector <16 x i16> %src256, <16 x i16> %src64_256, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 14, i32 15>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V256_EF = shufflevector <16 x i16> %src256, <16 x i16> %src64_256, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 16, i32 17>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256_0123 = shufflevector <16 x i16> %src256, <16 x i16> %src128_256, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_0123 = shufflevector <16 x i16> %src256, <16 x i16> %src128_256, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V256_4567 = shufflevector <16 x i16> %src256, <16 x i16> %src128_256, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_89AB = shufflevector <16 x i16> %src256, <16 x i16> %src128_256, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 12, i32 13, i32 14, i32 15>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V256_CDEF = shufflevector <16 x i16> %src256, <16 x i16> %src128_256, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256_01234567 = shufflevector <16 x i16> %src256, <16 x i16> %src128_256, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_01234567 = shufflevector <16 x i16> %src256, <16 x i16> %src128_256, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_89ABCDEF = shufflevector <16 x i16> %src256, <16 x i16> %src128_256, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
diff --git a/llvm/test/Analysis/CostModel/X86/shuffle-insert_subvector-sizelatency.ll b/llvm/test/Analysis/CostModel/X86/shuffle-insert_subvector-sizelatency.ll
index 94e3bc3..46d9e46 100644
--- a/llvm/test/Analysis/CostModel/X86/shuffle-insert_subvector-sizelatency.ll
+++ b/llvm/test/Analysis/CostModel/X86/shuffle-insert_subvector-sizelatency.ll
@@ -21,13 +21,13 @@ define void @test_vXf64(<2 x double> %src128, <4 x double> %src256, <8 x double>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src128_256 = shufflevector <2 x double> %src128, <2 x double> undef, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src128_512 = shufflevector <2 x double> %src128, <2 x double> undef, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src256_512 = shufflevector <4 x double> %src256, <4 x double> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256_01 = shufflevector <4 x double> %src256, <4 x double> %src128_256, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_01 = shufflevector <4 x double> %src256, <4 x double> %src128_256, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_23 = shufflevector <4 x double> %src256, <4 x double> %src128_256, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512_01 = shufflevector <8 x double> %src512, <8 x double> %src128_512, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_01 = shufflevector <8 x double> %src512, <8 x double> %src128_512, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_23 = shufflevector <8 x double> %src512, <8 x double> %src128_512, <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 4, i32 5, i32 6, i32 7>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_45 = shufflevector <8 x double> %src512, <8 x double> %src128_512, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 6, i32 7>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_67 = shufflevector <8 x double> %src512, <8 x double> %src128_512, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512_0123 = shufflevector <8 x double> %src512, <8 x double> %src256_512, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_0123 = shufflevector <8 x double> %src512, <8 x double> %src256_512, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_4567 = shufflevector <8 x double> %src512, <8 x double> %src256_512, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
@@ -37,11 +37,11 @@ define void @test_vXf64(<2 x double> %src128, <4 x double> %src256, <8 x double>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src256_512 = shufflevector <4 x double> %src256, <4 x double> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_01 = shufflevector <4 x double> %src256, <4 x double> %src128_256, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_23 = shufflevector <4 x double> %src256, <4 x double> %src128_256, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_01 = shufflevector <8 x double> %src512, <8 x double> %src128_512, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_01 = shufflevector <8 x double> %src512, <8 x double> %src128_512, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_23 = shufflevector <8 x double> %src512, <8 x double> %src128_512, <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 4, i32 5, i32 6, i32 7>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_45 = shufflevector <8 x double> %src512, <8 x double> %src128_512, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 6, i32 7>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_67 = shufflevector <8 x double> %src512, <8 x double> %src128_512, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_0123 = shufflevector <8 x double> %src512, <8 x double> %src256_512, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_0123 = shufflevector <8 x double> %src512, <8 x double> %src256_512, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_4567 = shufflevector <8 x double> %src512, <8 x double> %src256_512, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
@@ -79,13 +79,13 @@ define void @test_vXi64(<2 x i64> %src128, <4 x i64> %src256, <8 x i64> %src512)
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src128_256 = shufflevector <2 x i64> %src128, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src128_512 = shufflevector <2 x i64> %src128, <2 x i64> undef, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src256_512 = shufflevector <4 x i64> %src256, <4 x i64> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256_01 = shufflevector <4 x i64> %src256, <4 x i64> %src128_256, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_01 = shufflevector <4 x i64> %src256, <4 x i64> %src128_256, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_23 = shufflevector <4 x i64> %src256, <4 x i64> %src128_256, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512_01 = shufflevector <8 x i64> %src512, <8 x i64> %src128_512, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_01 = shufflevector <8 x i64> %src512, <8 x i64> %src128_512, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_23 = shufflevector <8 x i64> %src512, <8 x i64> %src128_512, <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 4, i32 5, i32 6, i32 7>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_45 = shufflevector <8 x i64> %src512, <8 x i64> %src128_512, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 6, i32 7>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_67 = shufflevector <8 x i64> %src512, <8 x i64> %src128_512, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512_0123 = shufflevector <8 x i64> %src512, <8 x i64> %src256_512, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_0123 = shufflevector <8 x i64> %src512, <8 x i64> %src256_512, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_4567 = shufflevector <8 x i64> %src512, <8 x i64> %src256_512, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
@@ -95,11 +95,11 @@ define void @test_vXi64(<2 x i64> %src128, <4 x i64> %src256, <8 x i64> %src512)
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src256_512 = shufflevector <4 x i64> %src256, <4 x i64> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_01 = shufflevector <4 x i64> %src256, <4 x i64> %src128_256, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_23 = shufflevector <4 x i64> %src256, <4 x i64> %src128_256, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_01 = shufflevector <8 x i64> %src512, <8 x i64> %src128_512, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_01 = shufflevector <8 x i64> %src512, <8 x i64> %src128_512, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_23 = shufflevector <8 x i64> %src512, <8 x i64> %src128_512, <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 4, i32 5, i32 6, i32 7>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_45 = shufflevector <8 x i64> %src512, <8 x i64> %src128_512, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 6, i32 7>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_67 = shufflevector <8 x i64> %src512, <8 x i64> %src128_512, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_0123 = shufflevector <8 x i64> %src512, <8 x i64> %src256_512, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_0123 = shufflevector <8 x i64> %src512, <8 x i64> %src256_512, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_4567 = shufflevector <8 x i64> %src512, <8 x i64> %src256_512, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
@@ -133,98 +133,36 @@ define void @test_vXi64(<2 x i64> %src128, <4 x i64> %src256, <8 x i64> %src512)
 }
 
 define void @test_vXf32(<2 x float> %src64, <4 x float> %src128, <8 x float> %src256, <16 x float> %src512) {
-; SSE2-LABEL: 'test_vXf32'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src64_128 = shufflevector <2 x float> %src64, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src64_256 = shufflevector <2 x float> %src64, <2 x float> undef, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src64_512 = shufflevector <2 x float> %src64, <2 x float> undef, <16 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src128_256 = shufflevector <4 x float> %src128, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src128_512 = shufflevector <4 x float> %src128, <4 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src256_512 = shufflevector <8 x float> %src256, <8 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V128_01 = shufflevector <4 x float> %src128, <4 x float> %src64_128, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V128_23 = shufflevector <4 x float> %src128, <4 x float> %src64_128, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V256_01 = shufflevector <8 x float> %src256, <8 x float> %src64_256, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256_23 = shufflevector <8 x float> %src256, <8 x float> %src64_256, <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 4, i32 5, i32 6, i32 7>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_45 = shufflevector <8 x float> %src256, <8 x float> %src64_256, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 6, i32 7>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256_67 = shufflevector <8 x float> %src256, <8 x float> %src64_256, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V256_0123 = shufflevector <8 x float> %src256, <8 x float> %src128_256, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_4567 = shufflevector <8 x float> %src256, <8 x float> %src128_256, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V512_01 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_23 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 16, i32 17, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_45 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_67 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 16, i32 17, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_89 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_AB = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 16, i32 17, i32 12, i32 13, i32 14, i32 15>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_CD = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 14, i32 15>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_EF = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 16, i32 17>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V512_0123 = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_4567 = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_89AB = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 12, i32 13, i32 14, i32 15>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_CDEF = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V512_01234567 = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_89ABCDEF = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
-;
-; SSSE3-LABEL: 'test_vXf32'
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src64_128 = shufflevector <2 x float> %src64, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src64_256 = shufflevector <2 x float> %src64, <2 x float> undef, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src64_512 = shufflevector <2 x float> %src64, <2 x float> undef, <16 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src128_256 = shufflevector <4 x float> %src128, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src128_512 = shufflevector <4 x float> %src128, <4 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src256_512 = shufflevector <8 x float> %src256, <8 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V128_01 = shufflevector <4 x float> %src128, <4 x float> %src64_128, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V128_23 = shufflevector <4 x float> %src128, <4 x float> %src64_128, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V256_01 = shufflevector <8 x float> %src256, <8 x float> %src64_256, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256_23 = shufflevector <8 x float> %src256, <8 x float> %src64_256, <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 4, i32 5, i32 6, i32 7>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_45 = shufflevector <8 x float> %src256, <8 x float> %src64_256, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 6, i32 7>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256_67 = shufflevector <8 x float> %src256, <8 x float> %src64_256, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V256_0123 = shufflevector <8 x float> %src256, <8 x float> %src128_256, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_4567 = shufflevector <8 x float> %src256, <8 x float> %src128_256, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V512_01 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_23 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 16, i32 17, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_45 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_67 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 16, i32 17, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_89 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_AB = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 16, i32 17, i32 12, i32 13, i32 14, i32 15>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_CD = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 14, i32 15>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_EF = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 16, i32 17>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V512_0123 = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_4567 = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_89AB = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 12, i32 13, i32 14, i32 15>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_CDEF = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V512_01234567 = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_89ABCDEF = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
-;
-; SSE42-LABEL: 'test_vXf32'
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src64_128 = shufflevector <2 x float> %src64, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src64_256 = shufflevector <2 x float> %src64, <2 x float> undef, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src64_512 = shufflevector <2 x float> %src64, <2 x float> undef, <16 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src128_256 = shufflevector <4 x float> %src128, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src128_512 = shufflevector <4 x float> %src128, <4 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src256_512 = shufflevector <8 x float> %src256, <8 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128_01 = shufflevector <4 x float> %src128, <4 x float> %src64_128, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V128_23 = shufflevector <4 x float> %src128, <4 x float> %src64_128, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256_01 = shufflevector <8 x float> %src256, <8 x float> %src64_256, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256_23 = shufflevector <8 x float> %src256, <8 x float> %src64_256, <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 4, i32 5, i32 6, i32 7>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_45 = shufflevector <8 x float> %src256, <8 x float> %src64_256, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 6, i32 7>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256_67 = shufflevector <8 x float> %src256, <8 x float> %src64_256, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256_0123 = shufflevector <8 x float> %src256, <8 x float> %src128_256, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_4567 = shufflevector <8 x float> %src256, <8 x float> %src128_256, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512_01 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_23 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 16, i32 17, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_45 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_67 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 16, i32 17, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_89 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_AB = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 16, i32 17, i32 12, i32 13, i32 14, i32 15>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_CD = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 14, i32 15>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_EF = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 16, i32 17>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512_0123 = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_4567 = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_89AB = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 12, i32 13, i32 14, i32 15>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_CDEF = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512_01234567 = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_89ABCDEF = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+; SSE-LABEL: 'test_vXf32'
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src64_128 = shufflevector <2 x float> %src64, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src64_256 = shufflevector <2 x float> %src64, <2 x float> undef, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src64_512 = shufflevector <2 x float> %src64, <2 x float> undef, <16 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src128_256 = shufflevector <4 x float> %src128, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src128_512 = shufflevector <4 x float> %src128, <4 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src256_512 = shufflevector <8 x float> %src256, <8 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128_01 = shufflevector <4 x float> %src128, <4 x float> %src64_128, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128_23 = shufflevector <4 x float> %src128, <4 x float> %src64_128, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_01 = shufflevector <8 x float> %src256, <8 x float> %src64_256, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_23 = shufflevector <8 x float> %src256, <8 x float> %src64_256, <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 4, i32 5, i32 6, i32 7>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_45 = shufflevector <8 x float> %src256, <8 x float> %src64_256, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 6, i32 7>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_67 = shufflevector <8 x float> %src256, <8 x float> %src64_256, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_0123 = shufflevector <8 x float> %src256, <8 x float> %src128_256, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_4567 = shufflevector <8 x float> %src256, <8 x float> %src128_256, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_01 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_23 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 16, i32 17, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_45 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_67 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 16, i32 17, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_89 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_AB = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 16, i32 17, i32 12, i32 13, i32 14, i32 15>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_CD = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 14, i32 15>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_EF = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 16, i32 17>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_0123 = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_4567 = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_89AB = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 12, i32 13, i32 14, i32 15>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_CDEF = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_01234567 = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_89ABCDEF = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX1-LABEL: 'test_vXf32'
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src64_128 = shufflevector <2 x float> %src64, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
@@ -234,14 +172,14 @@ define void @test_vXf32(<2 x float> %src64, <4 x float> %src128, <8 x float> %sr
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src128_512 = shufflevector <4 x float> %src128, <4 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src256_512 = shufflevector <8 x float> %src256, <8 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128_01 = shufflevector <4 x float> %src128, <4 x float> %src64_128, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V128_23 = shufflevector <4 x float> %src128, <4 x float> %src64_128, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128_23 = shufflevector <4 x float> %src128, <4 x float> %src64_128, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_01 = shufflevector <8 x float> %src256, <8 x float> %src64_256, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256_23 = shufflevector <8 x float> %src256, <8 x float> %src64_256, <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 4, i32 5, i32 6, i32 7>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_45 = shufflevector <8 x float> %src256, <8 x float> %src64_256, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 6, i32 7>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V256_67 = shufflevector <8 x float> %src256, <8 x float> %src64_256, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_0123 = shufflevector <8 x float> %src256, <8 x float> %src128_256, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_4567 = shufflevector <8 x float> %src256, <8 x float> %src128_256, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_01 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_01 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_23 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 16, i32 17, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_45 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512_67 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 16, i32 17, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
@@ -249,11 +187,11 @@ define void @test_vXf32(<2 x float> %src64, <4 x float> %src128, <8 x float> %sr
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_AB = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 16, i32 17, i32 12, i32 13, i32 14, i32 15>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_CD = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 14, i32 15>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512_EF = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 16, i32 17>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_0123 = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_0123 = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_4567 = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_89AB = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 12, i32 13, i32 14, i32 15>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_CDEF = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_01234567 = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_01234567 = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_89ABCDEF = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
@@ -265,14 +203,14 @@ define void @test_vXf32(<2 x float> %src64, <4 x float> %src128, <8 x float> %sr
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src128_512 = shufflevector <4 x float> %src128, <4 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src256_512 = shufflevector <8 x float> %src256, <8 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128_01 = shufflevector <4 x float> %src128, <4 x float> %src64_128, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V128_23 = shufflevector <4 x float> %src128, <4 x float> %src64_128, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128_23 = shufflevector <4 x float> %src128, <4 x float> %src64_128, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_01 = shufflevector <8 x float> %src256, <8 x float> %src64_256, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256_23 = shufflevector <8 x float> %src256, <8 x float> %src64_256, <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 4, i32 5, i32 6, i32 7>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_45 = shufflevector <8 x float> %src256, <8 x float> %src64_256, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 6, i32 7>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V256_67 = shufflevector <8 x float> %src256, <8 x float> %src64_256, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_0123 = shufflevector <8 x float> %src256, <8 x float> %src128_256, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_4567 = shufflevector <8 x float> %src256, <8 x float> %src128_256, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_01 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_01 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_23 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 16, i32 17, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_45 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V512_67 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 16, i32 17, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
@@ -280,11 +218,11 @@ define void @test_vXf32(<2 x float> %src64, <4 x float> %src128, <8 x float> %sr
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_AB = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 16, i32 17, i32 12, i32 13, i32 14, i32 15>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_CD = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 14, i32 15>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V512_EF = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 16, i32 17>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_0123 = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_0123 = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_4567 = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_89AB = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 12, i32 13, i32 14, i32 15>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_CDEF = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_01234567 = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_01234567 = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_89ABCDEF = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
@@ -363,13 +301,13 @@ define void @test_vXi32(<2 x i32> %src64, <4 x i32> %src128, <8 x i32> %src256,
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src256_512 = shufflevector <8 x i32> %src256, <8 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V128_01 = shufflevector <4 x i32> %src128, <4 x i32> %src64_128, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V128_23 = shufflevector <4 x i32> %src128, <4 x i32> %src64_128, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V256_01 = shufflevector <8 x i32> %src256, <8 x i32> %src64_256, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_01 = shufflevector <8 x i32> %src256, <8 x i32> %src64_256, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256_23 = shufflevector <8 x i32> %src256, <8 x i32> %src64_256, <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 4, i32 5, i32 6, i32 7>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_45 = shufflevector <8 x i32> %src256, <8 x i32> %src64_256, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 6, i32 7>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256_67 = shufflevector <8 x i32> %src256, <8 x i32> %src64_256, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V256_0123 = shufflevector <8 x i32> %src256, <8 x i32> %src128_256, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_0123 = shufflevector <8 x i32> %src256, <8 x i32> %src128_256, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_4567 = shufflevector <8 x i32> %src256, <8 x i32> %src128_256, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V512_01 = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_01 = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_23 = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 0, i32 1, i32 16, i32 17, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_45 = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_67 = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 16, i32 17, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
@@ -377,11 +315,11 @@ define void @test_vXi32(<2 x i32> %src64, <4 x i32> %src128, <8 x i32> %src256,
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_AB = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 16, i32 17, i32 12, i32 13, i32 14, i32 15>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_CD = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 14, i32 15>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_EF = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 16, i32 17>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V512_0123 = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_0123 = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_4567 = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_89AB = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 12, i32 13, i32 14, i32 15>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_CDEF = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V512_01234567 = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_01234567 = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_89ABCDEF = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
@@ -394,13 +332,13 @@ define void @test_vXi32(<2 x i32> %src64, <4 x i32> %src128, <8 x i32> %src256,
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src256_512 = shufflevector <8 x i32> %src256, <8 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V128_01 = shufflevector <4 x i32> %src128, <4 x i32> %src64_128, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V128_23 = shufflevector <4 x i32> %src128, <4 x i32> %src64_128, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V256_01 = shufflevector <8 x i32> %src256, <8 x i32> %src64_256, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_01 = shufflevector <8 x i32> %src256, <8 x i32> %src64_256, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256_23 = shufflevector <8 x i32> %src256, <8 x i32> %src64_256, <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 4, i32 5, i32 6, i32 7>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_45 = shufflevector <8 x i32> %src256, <8 x i32> %src64_256, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 6, i32 7>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256_67 = shufflevector <8 x i32> %src256, <8 x i32> %src64_256, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V256_0123 = shufflevector <8 x i32> %src256, <8 x i32> %src128_256, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_0123 = shufflevector <8 x i32> %src256, <8 x i32> %src128_256, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_4567 = shufflevector <8 x i32> %src256, <8 x i32> %src128_256, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V512_01 = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_01 = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_23 = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 0, i32 1, i32 16, i32 17, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_45 = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_67 = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 16, i32 17, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
@@ -408,11 +346,11 @@ define void @test_vXi32(<2 x i32> %src64, <4 x i32> %src128, <8 x i32> %src256,
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_AB = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 16, i32 17, i32 12, i32 13, i32 14, i32 15>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_CD = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 14, i32 15>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_EF = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 16, i32 17>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V512_0123 = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_0123 = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_4567 = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_89AB = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 12, i32 13, i32 14, i32 15>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_CDEF = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V512_01234567 = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_01234567 = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_89ABCDEF = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
@@ -425,13 +363,13 @@ define void @test_vXi32(<2 x i32> %src64, <4 x i32> %src128, <8 x i32> %src256,
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src256_512 = shufflevector <8 x i32> %src256, <8 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128_01 = shufflevector <4 x i32> %src128, <4 x i32> %src64_128, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V128_23 = shufflevector <4 x i32> %src128, <4 x i32> %src64_128, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256_01 = shufflevector <8 x i32> %src256, <8 x i32> %src64_256, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_01 = shufflevector <8 x i32> %src256, <8 x i32> %src64_256, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256_23 = shufflevector <8 x i32> %src256, <8 x i32> %src64_256, <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 4, i32 5, i32 6, i32 7>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_45 = shufflevector <8 x i32> %src256, <8 x i32> %src64_256, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 6, i32 7>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256_67 = shufflevector <8 x i32> %src256, <8 x i32> %src64_256, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256_0123 = shufflevector <8 x i32> %src256, <8 x i32> %src128_256, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_0123 = shufflevector <8 x i32> %src256, <8 x i32> %src128_256, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_4567 = shufflevector <8 x i32> %src256, <8 x i32> %src128_256, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512_01 = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_01 = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_23 = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 0, i32 1, i32 16, i32 17, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_45 = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_67 = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 16, i32 17, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
@@ -439,11 +377,11 @@ define void @test_vXi32(<2 x i32> %src64, <4 x i32> %src128, <8 x i32> %src256,
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_AB = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 16, i32 17, i32 12, i32 13, i32 14, i32 15>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_CD = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 14, i32 15>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_EF = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 16, i32 17>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512_0123 = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_0123 = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_4567 = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_89AB = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 12, i32 13, i32 14, i32 15>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_CDEF = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512_01234567 = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_01234567 = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_89ABCDEF = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
@@ -462,7 +400,7 @@ define void @test_vXi32(<2 x i32> %src64, <4 x i32> %src128, <8 x i32> %src256,
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V256_67 = shufflevector <8 x i32> %src256, <8 x i32> %src64_256, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_0123 = shufflevector <8 x i32> %src256, <8 x i32> %src128_256, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_4567 = shufflevector <8 x i32> %src256, <8 x i32> %src128_256, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_01 = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_01 = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_23 = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 0, i32 1, i32 16, i32 17, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_45 = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512_67 = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 16, i32 17, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
@@ -470,11 +408,11 @@ define void @test_vXi32(<2 x i32> %src64, <4 x i32> %src128, <8 x i32> %src256,
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_AB = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 16, i32 17, i32 12, i32 13, i32 14, i32 15>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_CD = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 14, i32 15>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512_EF = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 16, i32 17>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_0123 = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_0123 = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_4567 = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_89AB = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 12, i32 13, i32 14, i32 15>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_CDEF = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_01234567 = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_01234567 = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_89ABCDEF = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
@@ -493,7 +431,7 @@ define void @test_vXi32(<2 x i32> %src64, <4 x i32> %src128, <8 x i32> %src256,
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V256_67 = shufflevector <8 x i32> %src256, <8 x i32> %src64_256, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_0123 = shufflevector <8 x i32> %src256, <8 x i32> %src128_256, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_4567 = shufflevector <8 x i32> %src256, <8 x i32> %src128_256, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_01 = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_01 = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_23 = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 0, i32 1, i32 16, i32 17, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_45 = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V512_67 = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 16, i32 17, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
@@ -501,11 +439,11 @@ define void @test_vXi32(<2 x i32> %src64, <4 x i32> %src128, <8 x i32> %src256,
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_AB = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 16, i32 17, i32 12, i32 13, i32 14, i32 15>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_CD = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 14, i32 15>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V512_EF = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 16, i32 17>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_0123 = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_0123 = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_4567 = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_89AB = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 12, i32 13, i32 14, i32 15>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_CDEF = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_01234567 = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_01234567 = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_89ABCDEF = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
@@ -594,7 +532,7 @@ define void @test_vXi16(<2 x i16> %src32, <4 x i16> %src64, <8 x i16> %src128, <
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V128_67 = shufflevector <8 x i16> %src128, <8 x i16> %src32_128, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V128_0123 = shufflevector <8 x i16> %src128, <8 x i16> %src32_128, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V128_4567 = shufflevector <8 x i16> %src128, <8 x i16> %src32_128, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V256_01 = shufflevector <16 x i16> %src256, <16 x i16> %src64_256, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_01 = shufflevector <16 x i16> %src256, <16 x i16> %src64_256, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V256_23 = shufflevector <16 x i16> %src256, <16 x i16> %src64_256, <16 x i32> <i32 0, i32 1, i32 16, i32 17, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V256_45 = shufflevector <16 x i16> %src256, <16 x i16> %src64_256, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V256_67 = shufflevector <16 x i16> %src256, <16 x i16> %src64_256, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 16, i32 17, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
@@ -602,11 +540,11 @@ define void @test_vXi16(<2 x i16> %src32, <4 x i16> %src64, <8 x i16> %src128, <
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V256_AB = shufflevector <16 x i16> %src256, <16 x i16> %src64_256, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 16, i32 17, i32 12, i32 13, i32 14, i32 15>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V256_CD = shufflevector <16 x i16> %src256, <16 x i16> %src64_256, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 14, i32 15>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V256_EF = shufflevector <16 x i16> %src256, <16 x i16> %src64_256, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 16, i32 17>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V256_0123 = shufflevector <16 x i16> %src256, <16 x i16> %src128_256, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_0123 = shufflevector <16 x i16> %src256, <16 x i16> %src128_256, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V256_4567 = shufflevector <16 x i16> %src256, <16 x i16> %src128_256, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_89AB = shufflevector <16 x i16> %src256, <16 x i16> %src128_256, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 12, i32 13, i32 14, i32 15>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V256_CDEF = shufflevector <16 x i16> %src256, <16 x i16> %src128_256, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V256_01234567 = shufflevector <16 x i16> %src256, <16 x i16> %src128_256, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_01234567 = shufflevector <16 x i16> %src256, <16 x i16> %src128_256, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_89ABCDEF = shufflevector <16 x i16> %src256, <16 x i16> %src128_256, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
@@ -629,7 +567,7 @@ define void @test_vXi16(<2 x i16> %src32, <4 x i16> %src64, <8 x i16> %src128, <
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V128_67 = shufflevector <8 x i16> %src128, <8 x i16> %src32_128, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V128_0123 = shufflevector <8 x i16> %src128, <8 x i16> %src32_128, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V128_4567 = shufflevector <8 x i16> %src128, <8 x i16> %src32_128, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V256_01 = shufflevector <16 x i16> %src256, <16 x i16> %src64_256, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_01 = shufflevector <16 x i16> %src256, <16 x i16> %src64_256, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V256_23 = shufflevector <16 x i16> %src256, <16 x i16> %src64_256, <16 x i32> <i32 0, i32 1, i32 16, i32 17, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V256_45 = shufflevector <16 x i16> %src256, <16 x i16> %src64_256, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V256_67 = shufflevector <16 x i16> %src256, <16 x i16> %src64_256, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 16, i32 17, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
@@ -637,11 +575,11 @@ define void @test_vXi16(<2 x i16> %src32, <4 x i16> %src64, <8 x i16> %src128, <
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V256_AB = shufflevector <16 x i16> %src256, <16 x i16> %src64_256, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 16, i32 17, i32 12, i32 13, i32 14, i32 15>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V256_CD = shufflevector <16 x i16> %src256, <16 x i16> %src64_256, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 14, i32 15>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V256_EF = shufflevector <16 x i16> %src256, <16 x i16> %src64_256, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 16, i32 17>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V256_0123 = shufflevector <16 x i16> %src256, <16 x i16> %src128_256, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_0123 = shufflevector <16 x i16> %src256, <16 x i16> %src128_256, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V256_4567 = shufflevector <16 x i16> %src256, <16 x i16> %src128_256, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_89AB = shufflevector <16 x i16> %src256, <16 x i16> %src128_256, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 12, i32 13, i32 14, i32 15>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V256_CDEF = shufflevector <16 x i16> %src256, <16 x i16> %src128_256, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V256_01234567 = shufflevector <16 x i16> %src256, <16 x i16> %src128_256, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_01234567 = shufflevector <16 x i16> %src256, <16 x i16> %src128_256, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_89ABCDEF = shufflevector <16 x i16> %src256, <16 x i16> %src128_256, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
@@ -664,7 +602,7 @@ define void @test_vXi16(<2 x i16> %src32, <4 x i16> %src64, <8 x i16> %src128, <
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V128_67 = shufflevector <8 x i16> %src128, <8 x i16> %src32_128, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128_0123 = shufflevector <8 x i16> %src128, <8 x i16> %src32_128, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V128_4567 = shufflevector <8 x i16> %src128, <8 x i16> %src32_128, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256_01 = shufflevector <16 x i16> %src256, <16 x i16> %src64_256, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_01 = shufflevector <16 x i16> %src256, <16 x i16> %src64_256, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V256_23 = shufflevector <16 x i16> %src256, <16 x i16> %src64_256, <16 x i32> <i32 0, i32 1, i32 16, i32 17, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V256_45 = shufflevector <16 x i16> %src256, <16 x i16> %src64_256, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V256_67 = shufflevector <16 x i16> %src256, <16 x i16> %src64_256, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 16, i32 17, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
@@ -672,11 +610,11 @@ define void @test_vXi16(<2 x i16> %src32, <4 x i16> %src64, <8 x i16> %src128, <
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V256_AB = shufflevector <16 x i16> %src256, <16 x i16> %src64_256, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 16, i32 17, i32 12, i32 13, i32 14, i32 15>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V256_CD = shufflevector <16 x i16> %src256, <16 x i16> %src64_256, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 14, i32 15>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V256_EF = shufflevector <16 x i16> %src256, <16 x i16> %src64_256, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 16, i32 17>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256_0123 = shufflevector <16 x i16> %src256, <16 x i16> %src128_256, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_0123 = shufflevector <16 x i16> %src256, <16 x i16> %src128_256, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V256_4567 = shufflevector <16 x i16> %src256, <16 x i16> %src128_256, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_89AB = shufflevector <16 x i16> %src256, <16 x i16> %src128_256, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 12, i32 13, i32 14, i32 15>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V256_CDEF = shufflevector <16 x i16> %src256, <16 x i16> %src128_256, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256_01234567 = shufflevector <16 x i16> %src256, <16 x i16> %src128_256, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_01234567 = shufflevector <16 x i16> %src256, <16 x i16> %src128_256, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_89ABCDEF = shufflevector <16 x i16> %src256, <16 x i16> %src128_256, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
diff --git a/llvm/test/Analysis/CostModel/X86/shuffle-insert_subvector.ll b/llvm/test/Analysis/CostModel/X86/shuffle-insert_subvector.ll
index fe3e61d..bc3af51 100644
--- a/llvm/test/Analysis/CostModel/X86/shuffle-insert_subvector.ll
+++ b/llvm/test/Analysis/CostModel/X86/shuffle-insert_subvector.ll
@@ -21,13 +21,13 @@ define void @test_vXf64(<2 x double> %src128, <4 x double> %src256, <8 x double>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src128_256 = shufflevector <2 x double> %src128, <2 x double> undef, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src128_512 = shufflevector <2 x double> %src128, <2 x double> undef, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src256_512 = shufflevector <4 x double> %src256, <4 x double> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256_01 = shufflevector <4 x double> %src256, <4 x double> %src128_256, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_01 = shufflevector <4 x double> %src256, <4 x double> %src128_256, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_23 = shufflevector <4 x double> %src256, <4 x double> %src128_256, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512_01 = shufflevector <8 x double> %src512, <8 x double> %src128_512, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_01 = shufflevector <8 x double> %src512, <8 x double> %src128_512, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_23 = shufflevector <8 x double> %src512, <8 x double> %src128_512, <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 4, i32 5, i32 6, i32 7>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_45 = shufflevector <8 x double> %src512, <8 x double> %src128_512, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 6, i32 7>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_67 = shufflevector <8 x double> %src512, <8 x double> %src128_512, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512_0123 = shufflevector <8 x double> %src512, <8 x double> %src256_512, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_0123 = shufflevector <8 x double> %src512, <8 x double> %src256_512, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_4567 = shufflevector <8 x double> %src512, <8 x double> %src256_512, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
@@ -37,11 +37,11 @@ define void @test_vXf64(<2 x double> %src128, <4 x double> %src256, <8 x double>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src256_512 = shufflevector <4 x double> %src256, <4 x double> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_01 = shufflevector <4 x double> %src256, <4 x double> %src128_256, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_23 = shufflevector <4 x double> %src256, <4 x double> %src128_256, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_01 = shufflevector <8 x double> %src512, <8 x double> %src128_512, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_01 = shufflevector <8 x double> %src512, <8 x double> %src128_512, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_23 = shufflevector <8 x double> %src512, <8 x double> %src128_512, <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 4, i32 5, i32 6, i32 7>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_45 = shufflevector <8 x double> %src512, <8 x double> %src128_512, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 6, i32 7>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_67 = shufflevector <8 x double> %src512, <8 x double> %src128_512, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_0123 = shufflevector <8 x double> %src512, <8 x double> %src256_512, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_0123 = shufflevector <8 x double> %src512, <8 x double> %src256_512, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_4567 = shufflevector <8 x double> %src512, <8 x double> %src256_512, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
@@ -79,13 +79,13 @@ define void @test_vXi64(<2 x i64> %src128, <4 x i64> %src256, <8 x i64> %src512)
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src128_256 = shufflevector <2 x i64> %src128, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src128_512 = shufflevector <2 x i64> %src128, <2 x i64> undef, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src256_512 = shufflevector <4 x i64> %src256, <4 x i64> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256_01 = shufflevector <4 x i64> %src256, <4 x i64> %src128_256, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_01 = shufflevector <4 x i64> %src256, <4 x i64> %src128_256, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_23 = shufflevector <4 x i64> %src256, <4 x i64> %src128_256, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512_01 = shufflevector <8 x i64> %src512, <8 x i64> %src128_512, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_01 = shufflevector <8 x i64> %src512, <8 x i64> %src128_512, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_23 = shufflevector <8 x i64> %src512, <8 x i64> %src128_512, <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 4, i32 5, i32 6, i32 7>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_45 = shufflevector <8 x i64> %src512, <8 x i64> %src128_512, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 6, i32 7>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_67 = shufflevector <8 x i64> %src512, <8 x i64> %src128_512, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512_0123 = shufflevector <8 x i64> %src512, <8 x i64> %src256_512, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_0123 = shufflevector <8 x i64> %src512, <8 x i64> %src256_512, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_4567 = shufflevector <8 x i64> %src512, <8 x i64> %src256_512, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
@@ -95,11 +95,11 @@ define void @test_vXi64(<2 x i64> %src128, <4 x i64> %src256, <8 x i64> %src512)
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src256_512 = shufflevector <4 x i64> %src256, <4 x i64> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_01 = shufflevector <4 x i64> %src256, <4 x i64> %src128_256, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_23 = shufflevector <4 x i64> %src256, <4 x i64> %src128_256, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_01 = shufflevector <8 x i64> %src512, <8 x i64> %src128_512, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_01 = shufflevector <8 x i64> %src512, <8 x i64> %src128_512, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_23 = shufflevector <8 x i64> %src512, <8 x i64> %src128_512, <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 4, i32 5, i32 6, i32 7>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_45 = shufflevector <8 x i64> %src512, <8 x i64> %src128_512, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 6, i32 7>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_67 = shufflevector <8 x i64> %src512, <8 x i64> %src128_512, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_0123 = shufflevector <8 x i64> %src512, <8 x i64> %src256_512, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_0123 = shufflevector <8 x i64> %src512, <8 x i64> %src256_512, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_4567 = shufflevector <8 x i64> %src512, <8 x i64> %src256_512, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
@@ -133,98 +133,36 @@ define void @test_vXi64(<2 x i64> %src128, <4 x i64> %src256, <8 x i64> %src512)
 }
 
 define void @test_vXf32(<2 x float> %src64, <4 x float> %src128, <8 x float> %src256, <16 x float> %src512) {
-; SSE2-LABEL: 'test_vXf32'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src64_128 = shufflevector <2 x float> %src64, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src64_256 = shufflevector <2 x float> %src64, <2 x float> undef, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src64_512 = shufflevector <2 x float> %src64, <2 x float> undef, <16 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src128_256 = shufflevector <4 x float> %src128, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src128_512 = shufflevector <4 x float> %src128, <4 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src256_512 = shufflevector <8 x float> %src256, <8 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V128_01 = shufflevector <4 x float> %src128, <4 x float> %src64_128, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V128_23 = shufflevector <4 x float> %src128, <4 x float> %src64_128, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V256_01 = shufflevector <8 x float> %src256, <8 x float> %src64_256, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256_23 = shufflevector <8 x float> %src256, <8 x float> %src64_256, <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 4, i32 5, i32 6, i32 7>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_45 = shufflevector <8 x float> %src256, <8 x float> %src64_256, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 6, i32 7>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256_67 = shufflevector <8 x float> %src256, <8 x float> %src64_256, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V256_0123 = shufflevector <8 x float> %src256, <8 x float> %src128_256, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_4567 = shufflevector <8 x float> %src256, <8 x float> %src128_256, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V512_01 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_23 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 16, i32 17, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_45 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_67 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 16, i32 17, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_89 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_AB = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 16, i32 17, i32 12, i32 13, i32 14, i32 15>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_CD = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 14, i32 15>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_EF = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 16, i32 17>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V512_0123 = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_4567 = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_89AB = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 12, i32 13, i32 14, i32 15>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_CDEF = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V512_01234567 = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_89ABCDEF = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
-;
-; SSSE3-LABEL: 'test_vXf32'
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src64_128 = shufflevector <2 x float> %src64, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src64_256 = shufflevector <2 x float> %src64, <2 x float> undef, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src64_512 = shufflevector <2 x float> %src64, <2 x float> undef, <16 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src128_256 = shufflevector <4 x float> %src128, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src128_512 = shufflevector <4 x float> %src128, <4 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src256_512 = shufflevector <8 x float> %src256, <8 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V128_01 = shufflevector <4 x float> %src128, <4 x float> %src64_128, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V128_23 = shufflevector <4 x float> %src128, <4 x float> %src64_128, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V256_01 = shufflevector <8 x float> %src256, <8 x float> %src64_256, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256_23 = shufflevector <8 x float> %src256, <8 x float> %src64_256, <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 4, i32 5, i32 6, i32 7>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_45 = shufflevector <8 x float> %src256, <8 x float> %src64_256, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 6, i32 7>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256_67 = shufflevector <8 x float> %src256, <8 x float> %src64_256, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V256_0123 = shufflevector <8 x float> %src256, <8 x float> %src128_256, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_4567 = shufflevector <8 x float> %src256, <8 x float> %src128_256, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V512_01 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_23 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 16, i32 17, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_45 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_67 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 16, i32 17, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_89 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_AB = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 16, i32 17, i32 12, i32 13, i32 14, i32 15>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_CD = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 14, i32 15>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_EF = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 16, i32 17>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V512_0123 = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_4567 = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_89AB = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 12, i32 13, i32 14, i32 15>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_CDEF = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V512_01234567 = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_89ABCDEF = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
-;
-; SSE42-LABEL: 'test_vXf32'
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src64_128 = shufflevector <2 x float> %src64, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src64_256 = shufflevector <2 x float> %src64, <2 x float> undef, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src64_512 = shufflevector <2 x float> %src64, <2 x float> undef, <16 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src128_256 = shufflevector <4 x float> %src128, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src128_512 = shufflevector <4 x float> %src128, <4 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src256_512 = shufflevector <8 x float> %src256, <8 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128_01 = shufflevector <4 x float> %src128, <4 x float> %src64_128, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V128_23 = shufflevector <4 x float> %src128, <4 x float> %src64_128, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256_01 = shufflevector <8 x float> %src256, <8 x float> %src64_256, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256_23 = shufflevector <8 x float> %src256, <8 x float> %src64_256, <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 4, i32 5, i32 6, i32 7>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_45 = shufflevector <8 x float> %src256, <8 x float> %src64_256, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 6, i32 7>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256_67 = shufflevector <8 x float> %src256, <8 x float> %src64_256, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256_0123 = shufflevector <8 x float> %src256, <8 x float> %src128_256, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_4567 = shufflevector <8 x float> %src256, <8 x float> %src128_256, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512_01 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_23 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 16, i32 17, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_45 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_67 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 16, i32 17, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_89 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_AB = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 16, i32 17, i32 12, i32 13, i32 14, i32 15>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_CD = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 14, i32 15>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_EF = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 16, i32 17>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512_0123 = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_4567 = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_89AB = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 12, i32 13, i32 14, i32 15>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_CDEF = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512_01234567 = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_89ABCDEF = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+; SSE-LABEL: 'test_vXf32'
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src64_128 = shufflevector <2 x float> %src64, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src64_256 = shufflevector <2 x float> %src64, <2 x float> undef, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src64_512 = shufflevector <2 x float> %src64, <2 x float> undef, <16 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src128_256 = shufflevector <4 x float> %src128, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src128_512 = shufflevector <4 x float> %src128, <4 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src256_512 = shufflevector <8 x float> %src256, <8 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128_01 = shufflevector <4 x float> %src128, <4 x float> %src64_128, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128_23 = shufflevector <4 x float> %src128, <4 x float> %src64_128, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_01 = shufflevector <8 x float> %src256, <8 x float> %src64_256, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_23 = shufflevector <8 x float> %src256, <8 x float> %src64_256, <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 4, i32 5, i32 6, i32 7>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_45 = shufflevector <8 x float> %src256, <8 x float> %src64_256, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 6, i32 7>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_67 = shufflevector <8 x float> %src256, <8 x float> %src64_256, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_0123 = shufflevector <8 x float> %src256, <8 x float> %src128_256, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_4567 = shufflevector <8 x float> %src256, <8 x float> %src128_256, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_01 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_23 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 16, i32 17, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_45 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_67 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 16, i32 17, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_89 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_AB = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 16, i32 17, i32 12, i32 13, i32 14, i32 15>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_CD = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 14, i32 15>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_EF = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 16, i32 17>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_0123 = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_4567 = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_89AB = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 12, i32 13, i32 14, i32 15>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_CDEF = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_01234567 = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_89ABCDEF = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; AVX1-LABEL: 'test_vXf32'
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src64_128 = shufflevector <2 x float> %src64, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
@@ -234,14 +172,14 @@ define void @test_vXf32(<2 x float> %src64, <4 x float> %src128, <8 x float> %sr
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src128_512 = shufflevector <4 x float> %src128, <4 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src256_512 = shufflevector <8 x float> %src256, <8 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128_01 = shufflevector <4 x float> %src128, <4 x float> %src64_128, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V128_23 = shufflevector <4 x float> %src128, <4 x float> %src64_128, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128_23 = shufflevector <4 x float> %src128, <4 x float> %src64_128, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_01 = shufflevector <8 x float> %src256, <8 x float> %src64_256, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256_23 = shufflevector <8 x float> %src256, <8 x float> %src64_256, <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 4, i32 5, i32 6, i32 7>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_45 = shufflevector <8 x float> %src256, <8 x float> %src64_256, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 6, i32 7>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V256_67 = shufflevector <8 x float> %src256, <8 x float> %src64_256, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_0123 = shufflevector <8 x float> %src256, <8 x float> %src128_256, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_4567 = shufflevector <8 x float> %src256, <8 x float> %src128_256, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_01 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_01 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_23 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 16, i32 17, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_45 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512_67 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 16, i32 17, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
@@ -249,11 +187,11 @@ define void @test_vXf32(<2 x float> %src64, <4 x float> %src128, <8 x float> %sr
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_AB = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 16, i32 17, i32 12, i32 13, i32 14, i32 15>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_CD = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 14, i32 15>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512_EF = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 16, i32 17>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_0123 = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_0123 = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_4567 = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_89AB = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 12, i32 13, i32 14, i32 15>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_CDEF = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_01234567 = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_01234567 = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_89ABCDEF = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
@@ -265,14 +203,14 @@ define void @test_vXf32(<2 x float> %src64, <4 x float> %src128, <8 x float> %sr
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src128_512 = shufflevector <4 x float> %src128, <4 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src256_512 = shufflevector <8 x float> %src256, <8 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128_01 = shufflevector <4 x float> %src128, <4 x float> %src64_128, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V128_23 = shufflevector <4 x float> %src128, <4 x float> %src64_128, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128_23 = shufflevector <4 x float> %src128, <4 x float> %src64_128, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_01 = shufflevector <8 x float> %src256, <8 x float> %src64_256, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256_23 = shufflevector <8 x float> %src256, <8 x float> %src64_256, <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 4, i32 5, i32 6, i32 7>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_45 = shufflevector <8 x float> %src256, <8 x float> %src64_256, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 6, i32 7>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V256_67 = shufflevector <8 x float> %src256, <8 x float> %src64_256, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_0123 = shufflevector <8 x float> %src256, <8 x float> %src128_256, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_4567 = shufflevector <8 x float> %src256, <8 x float> %src128_256, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_01 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_01 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_23 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 16, i32 17, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_45 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V512_67 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 16, i32 17, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
@@ -280,11 +218,11 @@ define void @test_vXf32(<2 x float> %src64, <4 x float> %src128, <8 x float> %sr
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_AB = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 16, i32 17, i32 12, i32 13, i32 14, i32 15>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_CD = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 14, i32 15>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V512_EF = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 16, i32 17>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_0123 = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_0123 = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_4567 = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_89AB = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 12, i32 13, i32 14, i32 15>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_CDEF = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_01234567 = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_01234567 = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_89ABCDEF = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
@@ -363,13 +301,13 @@ define void @test_vXi32(<2 x i32> %src64, <4 x i32> %src128, <8 x i32> %src256,
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src256_512 = shufflevector <8 x i32> %src256, <8 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V128_01 = shufflevector <4 x i32> %src128, <4 x i32> %src64_128, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V128_23 = shufflevector <4 x i32> %src128, <4 x i32> %src64_128, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V256_01 = shufflevector <8 x i32> %src256, <8 x i32> %src64_256, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_01 = shufflevector <8 x i32> %src256, <8 x i32> %src64_256, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256_23 = shufflevector <8 x i32> %src256, <8 x i32> %src64_256, <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 4, i32 5, i32 6, i32 7>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_45 = shufflevector <8 x i32> %src256, <8 x i32> %src64_256, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 6, i32 7>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256_67 = shufflevector <8 x i32> %src256, <8 x i32> %src64_256, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V256_0123 = shufflevector <8 x i32> %src256, <8 x i32> %src128_256, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_0123 = shufflevector <8 x i32> %src256, <8 x i32> %src128_256, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_4567 = shufflevector <8 x i32> %src256, <8 x i32> %src128_256, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V512_01 = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_01 = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_23 = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 0, i32 1, i32 16, i32 17, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_45 = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_67 = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 16, i32 17, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
@@ -377,11 +315,11 @@ define void @test_vXi32(<2 x i32> %src64, <4 x i32> %src128, <8 x i32> %src256,
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_AB = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 16, i32 17, i32 12, i32 13, i32 14, i32 15>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_CD = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 14, i32 15>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_EF = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 16, i32 17>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V512_0123 = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_0123 = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_4567 = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_89AB = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 12, i32 13, i32 14, i32 15>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_CDEF = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V512_01234567 = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_01234567 = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_89ABCDEF = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
@@ -394,13 +332,13 @@ define void @test_vXi32(<2 x i32> %src64, <4 x i32> %src128, <8 x i32> %src256,
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src256_512 = shufflevector <8 x i32> %src256, <8 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V128_01 = shufflevector <4 x i32> %src128, <4 x i32> %src64_128, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V128_23 = shufflevector <4 x i32> %src128, <4 x i32> %src64_128, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V256_01 = shufflevector <8 x i32> %src256, <8 x i32> %src64_256, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_01 = shufflevector <8 x i32> %src256, <8 x i32> %src64_256, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256_23 = shufflevector <8 x i32> %src256, <8 x i32> %src64_256, <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 4, i32 5, i32 6, i32 7>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_45 = shufflevector <8 x i32> %src256, <8 x i32> %src64_256, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 6, i32 7>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256_67 = shufflevector <8 x i32> %src256, <8 x i32> %src64_256, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V256_0123 = shufflevector <8 x i32> %src256, <8 x i32> %src128_256, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_0123 = shufflevector <8 x i32> %src256, <8 x i32> %src128_256, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_4567 = shufflevector <8 x i32> %src256, <8 x i32> %src128_256, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V512_01 = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_01 = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_23 = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 0, i32 1, i32 16, i32 17, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_45 = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_67 = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 16, i32 17, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
@@ -408,11 +346,11 @@ define void @test_vXi32(<2 x i32> %src64, <4 x i32> %src128, <8 x i32> %src256,
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_AB = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 16, i32 17, i32 12, i32 13, i32 14, i32 15>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_CD = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 14, i32 15>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_EF = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 16, i32 17>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V512_0123 = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_0123 = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_4567 = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_89AB = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 12, i32 13, i32 14, i32 15>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_CDEF = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V512_01234567 = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_01234567 = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_89ABCDEF = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
@@ -425,13 +363,13 @@ define void @test_vXi32(<2 x i32> %src64, <4 x i32> %src128, <8 x i32> %src256,
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %src256_512 = shufflevector <8 x i32> %src256, <8 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128_01 = shufflevector <4 x i32> %src128, <4 x i32> %src64_128, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V128_23 = shufflevector <4 x i32> %src128, <4 x i32> %src64_128, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256_01 = shufflevector <8 x i32> %src256, <8 x i32> %src64_256, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_01 = shufflevector <8 x i32> %src256, <8 x i32> %src64_256, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256_23 = shufflevector <8 x i32> %src256, <8 x i32> %src64_256, <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 4, i32 5, i32 6, i32 7>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_45 = shufflevector <8 x i32> %src256, <8 x i32> %src64_256, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 6, i32 7>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256_67 = shufflevector <8 x i32> %src256, <8 x i32> %src64_256, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256_0123 = shufflevector <8 x i32> %src256, <8 x i32> %src128_256, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_0123 = shufflevector <8 x i32> %src256, <8 x i32> %src128_256, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_4567 = shufflevector <8 x i32> %src256, <8 x i32> %src128_256, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512_01 = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_01 = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_23 = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 0, i32 1, i32 16, i32 17, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_45 = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_67 = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 16, i32 17, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
@@ -439,11 +377,11 @@ define void @test_vXi32(<2 x i32> %src64, <4 x i32> %src128, <8 x i32> %src256,
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_AB = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 16, i32 17, i32 12, i32 13, i32 14, i32 15>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_CD = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 14, i32 15>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_EF = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 16, i32 17>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512_0123 = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_0123 = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_4567 = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_89AB = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 12, i32 13, i32 14, i32 15>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_CDEF = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512_01234567 = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_01234567 = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_89ABCDEF = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
@@ -462,7 +400,7 @@ define void @test_vXi32(<2 x i32> %src64, <4 x i32> %src128, <8 x i32> %src256,
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V256_67 = shufflevector <8 x i32> %src256, <8 x i32> %src64_256, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_0123 = shufflevector <8 x i32> %src256, <8 x i32> %src128_256, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_4567 = shufflevector <8 x i32> %src256, <8 x i32> %src128_256, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_01 = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_01 = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_23 = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 0, i32 1, i32 16, i32 17, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_45 = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512_67 = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 16, i32 17, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
@@ -470,11 +408,11 @@ define void @test_vXi32(<2 x i32> %src64, <4 x i32> %src128, <8 x i32> %src256,
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_AB = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 16, i32 17, i32 12, i32 13, i32 14, i32 15>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_CD = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 14, i32 15>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512_EF = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 16, i32 17>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_0123 = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_0123 = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_4567 = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_89AB = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 12, i32 13, i32 14, i32 15>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_CDEF = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_01234567 = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_01234567 = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_89ABCDEF = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
@@ -493,7 +431,7 @@ define void @test_vXi32(<2 x i32> %src64, <4 x i32> %src128, <8 x i32> %src256,
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V256_67 = shufflevector <8 x i32> %src256, <8 x i32> %src64_256, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_0123 = shufflevector <8 x i32> %src256, <8 x i32> %src128_256, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_4567 = shufflevector <8 x i32> %src256, <8 x i32> %src128_256, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_01 = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_01 = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_23 = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 0, i32 1, i32 16, i32 17, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_45 = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V512_67 = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 16, i32 17, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
@@ -501,11 +439,11 @@ define void @test_vXi32(<2 x i32> %src64, <4 x i32> %src128, <8 x i32> %src256,
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_AB = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 16, i32 17, i32 12, i32 13, i32 14, i32 15>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_CD = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 14, i32 15>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V512_EF = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 16, i32 17>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_0123 = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_0123 = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_4567 = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_89AB = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 12, i32 13, i32 14, i32 15>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_CDEF = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_01234567 = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_01234567 = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_89ABCDEF = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
@@ -594,7 +532,7 @@ define void @test_vXi16(<2 x i16> %src32, <4 x i16> %src64, <8 x i16> %src128, <
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V128_67 = shufflevector <8 x i16> %src128, <8 x i16> %src32_128, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V128_0123 = shufflevector <8 x i16> %src128, <8 x i16> %src32_128, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V128_4567 = shufflevector <8 x i16> %src128, <8 x i16> %src32_128, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V256_01 = shufflevector <16 x i16> %src256, <16 x i16> %src64_256, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_01 = shufflevector <16 x i16> %src256, <16 x i16> %src64_256, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V256_23 = shufflevector <16 x i16> %src256, <16 x i16> %src64_256, <16 x i32> <i32 0, i32 1, i32 16, i32 17, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V256_45 = shufflevector <16 x i16> %src256, <16 x i16> %src64_256, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V256_67 = shufflevector <16 x i16> %src256, <16 x i16> %src64_256, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 16, i32 17, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
@@ -602,11 +540,11 @@ define void @test_vXi16(<2 x i16> %src32, <4 x i16> %src64, <8 x i16> %src128, <
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V256_AB = shufflevector <16 x i16> %src256, <16 x i16> %src64_256, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 16, i32 17, i32 12, i32 13, i32 14, i32 15>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V256_CD = shufflevector <16 x i16> %src256, <16 x i16> %src64_256, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 14, i32 15>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V256_EF = shufflevector <16 x i16> %src256, <16 x i16> %src64_256, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 16, i32 17>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V256_0123 = shufflevector <16 x i16> %src256, <16 x i16> %src128_256, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_0123 = shufflevector <16 x i16> %src256, <16 x i16> %src128_256, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V256_4567 = shufflevector <16 x i16> %src256, <16 x i16> %src128_256, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_89AB = shufflevector <16 x i16> %src256, <16 x i16> %src128_256, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 12, i32 13, i32 14, i32 15>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V256_CDEF = shufflevector <16 x i16> %src256, <16 x i16> %src128_256, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V256_01234567 = shufflevector <16 x i16> %src256, <16 x i16> %src128_256, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_01234567 = shufflevector <16 x i16> %src256, <16 x i16> %src128_256, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_89ABCDEF = shufflevector <16 x i16> %src256, <16 x i16> %src128_256, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
@@ -629,7 +567,7 @@ define void @test_vXi16(<2 x i16> %src32, <4 x i16> %src64, <8 x i16> %src128, <
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V128_67 = shufflevector <8 x i16> %src128, <8 x i16> %src32_128, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V128_0123 = shufflevector <8 x i16> %src128, <8 x i16> %src32_128, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V128_4567 = shufflevector <8 x i16> %src128, <8 x i16> %src32_128, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V256_01 = shufflevector <16 x i16> %src256, <16 x i16> %src64_256, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_01 = shufflevector <16 x i16> %src256, <16 x i16> %src64_256, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V256_23 = shufflevector <16 x i16> %src256, <16 x i16> %src64_256, <16 x i32> <i32 0, i32 1, i32 16, i32 17, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V256_45 = shufflevector <16 x i16> %src256, <16 x i16> %src64_256, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V256_67 = shufflevector <16 x i16> %src256, <16 x i16> %src64_256, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 16, i32 17, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
@@ -637,11 +575,11 @@ define void @test_vXi16(<2 x i16> %src32, <4 x i16> %src64, <8 x i16> %src128, <
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V256_AB = shufflevector <16 x i16> %src256, <16 x i16> %src64_256, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 16, i32 17, i32 12, i32 13, i32 14, i32 15>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V256_CD = shufflevector <16 x i16> %src256, <16 x i16> %src64_256, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 14, i32 15>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V256_EF = shufflevector <16 x i16> %src256, <16 x i16> %src64_256, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 16, i32 17>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V256_0123 = shufflevector <16 x i16> %src256, <16 x i16> %src128_256, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_0123 = shufflevector <16 x i16> %src256, <16 x i16> %src128_256, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V256_4567 = shufflevector <16 x i16> %src256, <16 x i16> %src128_256, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_89AB = shufflevector <16 x i16> %src256, <16 x i16> %src128_256, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 12, i32 13, i32 14, i32 15>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V256_CDEF = shufflevector <16 x i16> %src256, <16 x i16> %src128_256, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V256_01234567 = shufflevector <16 x i16> %src256, <16 x i16> %src128_256, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_01234567 = shufflevector <16 x i16> %src256, <16 x i16> %src128_256, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_89ABCDEF = shufflevector <16 x i16> %src256, <16 x i16> %src128_256, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
@@ -664,7 +602,7 @@ define void @test_vXi16(<2 x i16> %src32, <4 x i16> %src64, <8 x i16> %src128, <
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V128_67 = shufflevector <8 x i16> %src128, <8 x i16> %src32_128, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128_0123 = shufflevector <8 x i16> %src128, <8 x i16> %src32_128, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V128_4567 = shufflevector <8 x i16> %src128, <8 x i16> %src32_128, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256_01 = shufflevector <16 x i16> %src256, <16 x i16> %src64_256, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_01 = shufflevector <16 x i16> %src256, <16 x i16> %src64_256, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V256_23 = shufflevector <16 x i16> %src256, <16 x i16> %src64_256, <16 x i32> <i32 0, i32 1, i32 16, i32 17, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V256_45 = shufflevector <16 x i16> %src256, <16 x i16> %src64_256, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V256_67 = shufflevector <16 x i16> %src256, <16 x i16> %src64_256, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 16, i32 17, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
@@ -672,11 +610,11 @@ define void @test_vXi16(<2 x i16> %src32, <4 x i16> %src64, <8 x i16> %src128, <
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V256_AB = shufflevector <16 x i16> %src256, <16 x i16> %src64_256, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 16, i32 17, i32 12, i32 13, i32 14, i32 15>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V256_CD = shufflevector <16 x i16> %src256, <16 x i16> %src64_256, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 14, i32 15>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V256_EF = shufflevector <16 x i16> %src256, <16 x i16> %src64_256, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 16, i32 17>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256_0123 = shufflevector <16 x i16> %src256, <16 x i16> %src128_256, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_0123 = shufflevector <16 x i16> %src256, <16 x i16> %src128_256, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V256_4567 = shufflevector <16 x i16> %src256, <16 x i16> %src128_256, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_89AB = shufflevector <16 x i16> %src256, <16 x i16> %src128_256, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 12, i32 13, i32 14, i32 15>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V256_CDEF = shufflevector <16 x i16> %src256, <16 x i16> %src128_256, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256_01234567 = shufflevector <16 x i16> %src256, <16 x i16> %src128_256, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_01234567 = shufflevector <16 x i16> %src256, <16 x i16> %src128_256, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256_89ABCDEF = shufflevector <16 x i16> %src256, <16 x i16> %src128_256, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
diff --git a/llvm/test/Analysis/CostModel/X86/shuffle-select-codesize.ll b/llvm/test/Analysis/CostModel/X86/shuffle-select-codesize.ll
index e66bce0..0173439 100644
--- a/llvm/test/Analysis/CostModel/X86/shuffle-select-codesize.ll
+++ b/llvm/test/Analysis/CostModel/X86/shuffle-select-codesize.ll
@@ -20,30 +20,30 @@
 define void @test_vXf64(<2 x double> %src128, <4 x double> %src256, <8 x double> %src512, <16 x double> %src1024, <2 x double> %src128_1, <4 x double> %src256_1, <8 x double> %src512_1, <16 x double> %src1024_1) {
 ; SSE-LABEL: 'test_vXf64'
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <2 x double> %src128, <2 x double> %src128_1, <2 x i32> <i32 0, i32 3>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <4 x double> %src256, <4 x double> %src256_1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <8 x double> %src512, <8 x double> %src512_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V1024 = shufflevector <16 x double> %src1024, <16 x double> %src1024_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <4 x double> %src256, <4 x double> %src256_1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512 = shufflevector <8 x double> %src512, <8 x double> %src512_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1024 = shufflevector <16 x double> %src1024, <16 x double> %src1024_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; XOP-LABEL: 'test_vXf64'
 ; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <2 x double> %src128, <2 x double> %src128_1, <2 x i32> <i32 0, i32 3>
 ; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <4 x double> %src256, <4 x double> %src256_1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
-; XOP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <8 x double> %src512, <8 x double> %src512_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
-; XOP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V1024 = shufflevector <16 x double> %src1024, <16 x double> %src1024_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <8 x double> %src512, <8 x double> %src512_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
+; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024 = shufflevector <16 x double> %src1024, <16 x double> %src1024_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX-LABEL: 'test_vXf64'
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <2 x double> %src128, <2 x double> %src128_1, <2 x i32> <i32 0, i32 3>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <4 x double> %src256, <4 x double> %src256_1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <8 x double> %src512, <8 x double> %src512_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V1024 = shufflevector <16 x double> %src1024, <16 x double> %src1024_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <8 x double> %src512, <8 x double> %src512_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024 = shufflevector <16 x double> %src1024, <16 x double> %src1024_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512-LABEL: 'test_vXf64'
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <2 x double> %src128, <2 x double> %src128_1, <2 x i32> <i32 0, i32 3>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <4 x double> %src256, <4 x double> %src256_1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <8 x double> %src512, <8 x double> %src512_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1024 = shufflevector <16 x double> %src1024, <16 x double> %src1024_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024 = shufflevector <16 x double> %src1024, <16 x double> %src1024_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
   %V128 = shufflevector <2 x double> %src128, <2 x double> %src128_1, <2 x i32> <i32 0, i32 3>
@@ -56,30 +56,30 @@ define void @test_vXf64(<2 x double> %src128, <4 x double> %src256, <8 x double>
 define void @test_vXi64(<2 x i64> %src128, <4 x i64> %src256, <8 x i64> %src512, <16 x i64> %src1024, <2 x i64> %src128_1, <4 x i64> %src256_1, <8 x i64> %src512_1, <16 x i64> %src1024_1) {
 ; SSE-LABEL: 'test_vXi64'
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <2 x i64> %src128, <2 x i64> %src128_1, <2 x i32> <i32 0, i32 3>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <4 x i64> %src256, <4 x i64> %src256_1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <8 x i64> %src512, <8 x i64> %src512_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V1024 = shufflevector <16 x i64> %src1024, <16 x i64> %src1024_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <4 x i64> %src256, <4 x i64> %src256_1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512 = shufflevector <8 x i64> %src512, <8 x i64> %src512_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1024 = shufflevector <16 x i64> %src1024, <16 x i64> %src1024_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; XOP-LABEL: 'test_vXi64'
 ; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <2 x i64> %src128, <2 x i64> %src128_1, <2 x i32> <i32 0, i32 3>
 ; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <4 x i64> %src256, <4 x i64> %src256_1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
-; XOP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <8 x i64> %src512, <8 x i64> %src512_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
-; XOP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V1024 = shufflevector <16 x i64> %src1024, <16 x i64> %src1024_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <8 x i64> %src512, <8 x i64> %src512_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
+; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024 = shufflevector <16 x i64> %src1024, <16 x i64> %src1024_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX-LABEL: 'test_vXi64'
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <2 x i64> %src128, <2 x i64> %src128_1, <2 x i32> <i32 0, i32 3>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <4 x i64> %src256, <4 x i64> %src256_1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <8 x i64> %src512, <8 x i64> %src512_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V1024 = shufflevector <16 x i64> %src1024, <16 x i64> %src1024_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <8 x i64> %src512, <8 x i64> %src512_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024 = shufflevector <16 x i64> %src1024, <16 x i64> %src1024_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512-LABEL: 'test_vXi64'
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <2 x i64> %src128, <2 x i64> %src128_1, <2 x i32> <i32 0, i32 3>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <4 x i64> %src256, <4 x i64> %src256_1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <8 x i64> %src512, <8 x i64> %src512_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1024 = shufflevector <16 x i64> %src1024, <16 x i64> %src1024_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024 = shufflevector <16 x i64> %src1024, <16 x i64> %src1024_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
   %V128 = shufflevector <2 x i64> %src128, <2 x i64> %src128_1, <2 x i32> <i32 0, i32 3>
@@ -93,41 +93,41 @@ define void @test_vXf32(<2 x float> %src64, <4 x float> %src128, <8 x float> %sr
 ; SSE2-LABEL: 'test_vXf32'
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64 = shufflevector <2 x float> %src64, <2 x float> %src64_1, <2 x i32> <i32 0, i32 3>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V128 = shufflevector <4 x float> %src128, <4 x float> %src128_1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V256 = shufflevector <8 x float> %src256, <8 x float> %src256_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V512 = shufflevector <16 x float> %src512, <16 x float> %src512_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V1024 = shufflevector <32 x float> %src1024, <32 x float> %src1024_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <8 x float> %src256, <8 x float> %src256_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <16 x float> %src512, <16 x float> %src512_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024 = shufflevector <32 x float> %src1024, <32 x float> %src1024_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; SSSE3-LABEL: 'test_vXf32'
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64 = shufflevector <2 x float> %src64, <2 x float> %src64_1, <2 x i32> <i32 0, i32 3>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V128 = shufflevector <4 x float> %src128, <4 x float> %src128_1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V256 = shufflevector <8 x float> %src256, <8 x float> %src256_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V512 = shufflevector <16 x float> %src512, <16 x float> %src512_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V1024 = shufflevector <32 x float> %src1024, <32 x float> %src1024_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <8 x float> %src256, <8 x float> %src256_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <16 x float> %src512, <16 x float> %src512_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024 = shufflevector <32 x float> %src1024, <32 x float> %src1024_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; SSE42-LABEL: 'test_vXf32'
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <2 x float> %src64, <2 x float> %src64_1, <2 x i32> <i32 0, i32 3>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <4 x float> %src128, <4 x float> %src128_1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <8 x float> %src256, <8 x float> %src256_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <16 x float> %src512, <16 x float> %src512_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V1024 = shufflevector <32 x float> %src1024, <32 x float> %src1024_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <8 x float> %src256, <8 x float> %src256_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <16 x float> %src512, <16 x float> %src512_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024 = shufflevector <32 x float> %src1024, <32 x float> %src1024_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; XOP-LABEL: 'test_vXf32'
 ; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <2 x float> %src64, <2 x float> %src64_1, <2 x i32> <i32 0, i32 3>
 ; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <4 x float> %src128, <4 x float> %src128_1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
 ; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <8 x float> %src256, <8 x float> %src256_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
-; XOP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <16 x float> %src512, <16 x float> %src512_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; XOP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V1024 = shufflevector <32 x float> %src1024, <32 x float> %src1024_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <16 x float> %src512, <16 x float> %src512_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024 = shufflevector <32 x float> %src1024, <32 x float> %src1024_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
 ; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX-LABEL: 'test_vXf32'
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <2 x float> %src64, <2 x float> %src64_1, <2 x i32> <i32 0, i32 3>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <4 x float> %src128, <4 x float> %src128_1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <8 x float> %src256, <8 x float> %src256_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <16 x float> %src512, <16 x float> %src512_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V1024 = shufflevector <32 x float> %src1024, <32 x float> %src1024_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <16 x float> %src512, <16 x float> %src512_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024 = shufflevector <32 x float> %src1024, <32 x float> %src1024_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512-LABEL: 'test_vXf32'
@@ -135,7 +135,7 @@ define void @test_vXf32(<2 x float> %src64, <4 x float> %src128, <8 x float> %sr
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <4 x float> %src128, <4 x float> %src128_1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <8 x float> %src256, <8 x float> %src256_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <16 x float> %src512, <16 x float> %src512_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1024 = shufflevector <32 x float> %src1024, <32 x float> %src1024_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024 = shufflevector <32 x float> %src1024, <32 x float> %src1024_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
   %V64 = shufflevector <2 x float> %src64, <2 x float> %src64_1, <2 x i32> <i32 0, i32 3>
@@ -150,41 +150,41 @@ define void @test_vXi32(<2 x i32> %src64, <4 x i32> %src128, <8 x i32> %src256,
 ; SSE2-LABEL: 'test_vXi32'
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64 = shufflevector <2 x i32> %src64, <2 x i32> %src64_1, <2 x i32> <i32 0, i32 3>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V128 = shufflevector <4 x i32> %src128, <4 x i32> %src128_1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V256 = shufflevector <8 x i32> %src256, <8 x i32> %src256_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V512 = shufflevector <16 x i32> %src512, <16 x i32> %src512_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V1024 = shufflevector <32 x i32> %src1024, <32 x i32> %src1024_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <8 x i32> %src256, <8 x i32> %src256_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <16 x i32> %src512, <16 x i32> %src512_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024 = shufflevector <32 x i32> %src1024, <32 x i32> %src1024_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; SSSE3-LABEL: 'test_vXi32'
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64 = shufflevector <2 x i32> %src64, <2 x i32> %src64_1, <2 x i32> <i32 0, i32 3>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V128 = shufflevector <4 x i32> %src128, <4 x i32> %src128_1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V256 = shufflevector <8 x i32> %src256, <8 x i32> %src256_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V512 = shufflevector <16 x i32> %src512, <16 x i32> %src512_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V1024 = shufflevector <32 x i32> %src1024, <32 x i32> %src1024_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <8 x i32> %src256, <8 x i32> %src256_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <16 x i32> %src512, <16 x i32> %src512_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024 = shufflevector <32 x i32> %src1024, <32 x i32> %src1024_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; SSE42-LABEL: 'test_vXi32'
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <2 x i32> %src64, <2 x i32> %src64_1, <2 x i32> <i32 0, i32 3>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <4 x i32> %src128, <4 x i32> %src128_1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <8 x i32> %src256, <8 x i32> %src256_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <16 x i32> %src512, <16 x i32> %src512_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V1024 = shufflevector <32 x i32> %src1024, <32 x i32> %src1024_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <8 x i32> %src256, <8 x i32> %src256_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <16 x i32> %src512, <16 x i32> %src512_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024 = shufflevector <32 x i32> %src1024, <32 x i32> %src1024_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; XOP-LABEL: 'test_vXi32'
 ; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <2 x i32> %src64, <2 x i32> %src64_1, <2 x i32> <i32 0, i32 3>
 ; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <4 x i32> %src128, <4 x i32> %src128_1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
 ; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <8 x i32> %src256, <8 x i32> %src256_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
-; XOP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <16 x i32> %src512, <16 x i32> %src512_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; XOP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V1024 = shufflevector <32 x i32> %src1024, <32 x i32> %src1024_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <16 x i32> %src512, <16 x i32> %src512_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024 = shufflevector <32 x i32> %src1024, <32 x i32> %src1024_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
 ; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX-LABEL: 'test_vXi32'
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <2 x i32> %src64, <2 x i32> %src64_1, <2 x i32> <i32 0, i32 3>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <4 x i32> %src128, <4 x i32> %src128_1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <8 x i32> %src256, <8 x i32> %src256_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <16 x i32> %src512, <16 x i32> %src512_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V1024 = shufflevector <32 x i32> %src1024, <32 x i32> %src1024_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <16 x i32> %src512, <16 x i32> %src512_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024 = shufflevector <32 x i32> %src1024, <32 x i32> %src1024_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512-LABEL: 'test_vXi32'
@@ -192,7 +192,7 @@ define void @test_vXi32(<2 x i32> %src64, <4 x i32> %src128, <8 x i32> %src256,
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <4 x i32> %src128, <4 x i32> %src128_1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <8 x i32> %src256, <8 x i32> %src256_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <16 x i32> %src512, <16 x i32> %src512_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1024 = shufflevector <32 x i32> %src1024, <32 x i32> %src1024_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024 = shufflevector <32 x i32> %src1024, <32 x i32> %src1024_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
   %V64 = shufflevector <2 x i32> %src64, <2 x i32> %src64_1, <2 x i32> <i32 0, i32 3>
@@ -208,27 +208,27 @@ define void @test_vXi16(<2 x i16> %src32, <4 x i16> %src64, <8 x i16> %src128, <
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V32 = shufflevector <2 x i16> %src32, <2 x i16> %src32_1, <2 x i32> <i32 0, i32 3>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V64 = shufflevector <4 x i16> %src64, <4 x i16> %src64_1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V128 = shufflevector <8 x i16> %src128, <8 x i16> %src128_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> %src256_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> %src512_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V1024 = shufflevector <64 x i16> %src1024, <64 x i16> %src1024_1, <64 x i32> <i32 64, i32 65, i32 66, i32 67, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> %src256_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> %src512_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024 = shufflevector <64 x i16> %src1024, <64 x i16> %src1024_1, <64 x i32> <i32 64, i32 65, i32 66, i32 67, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; SSSE3-LABEL: 'test_vXi16'
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V32 = shufflevector <2 x i16> %src32, <2 x i16> %src32_1, <2 x i32> <i32 0, i32 3>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V64 = shufflevector <4 x i16> %src64, <4 x i16> %src64_1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V128 = shufflevector <8 x i16> %src128, <8 x i16> %src128_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> %src256_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> %src512_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V1024 = shufflevector <64 x i16> %src1024, <64 x i16> %src1024_1, <64 x i32> <i32 64, i32 65, i32 66, i32 67, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> %src256_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> %src512_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024 = shufflevector <64 x i16> %src1024, <64 x i16> %src1024_1, <64 x i32> <i32 64, i32 65, i32 66, i32 67, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; SSE42-LABEL: 'test_vXi16'
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <2 x i16> %src32, <2 x i16> %src32_1, <2 x i32> <i32 0, i32 3>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <4 x i16> %src64, <4 x i16> %src64_1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x i16> %src128, <8 x i16> %src128_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> %src256_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> %src512_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V1024 = shufflevector <64 x i16> %src1024, <64 x i16> %src1024_1, <64 x i32> <i32 64, i32 65, i32 66, i32 67, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> %src256_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> %src512_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024 = shufflevector <64 x i16> %src1024, <64 x i16> %src1024_1, <64 x i32> <i32 64, i32 65, i32 66, i32 67, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; XOP-LABEL: 'test_vXi16'
@@ -236,8 +236,8 @@ define void @test_vXi16(<2 x i16> %src32, <4 x i16> %src64, <8 x i16> %src128, <
 ; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <4 x i16> %src64, <4 x i16> %src64_1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
 ; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x i16> %src128, <8 x i16> %src128_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
 ; XOP-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> %src256_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; XOP-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> %src512_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
-; XOP-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V1024 = shufflevector <64 x i16> %src1024, <64 x i16> %src1024_1, <64 x i32> <i32 64, i32 65, i32 66, i32 67, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> %src512_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024 = shufflevector <64 x i16> %src1024, <64 x i16> %src1024_1, <64 x i32> <i32 64, i32 65, i32 66, i32 67, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
 ; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX1-LABEL: 'test_vXi16'
@@ -245,8 +245,8 @@ define void @test_vXi16(<2 x i16> %src32, <4 x i16> %src64, <8 x i16> %src128, <
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <4 x i16> %src64, <4 x i16> %src64_1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x i16> %src128, <8 x i16> %src128_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> %src256_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> %src512_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V1024 = shufflevector <64 x i16> %src1024, <64 x i16> %src1024_1, <64 x i32> <i32 64, i32 65, i32 66, i32 67, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> %src512_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024 = shufflevector <64 x i16> %src1024, <64 x i16> %src1024_1, <64 x i32> <i32 64, i32 65, i32 66, i32 67, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX2-LABEL: 'test_vXi16'
@@ -254,8 +254,8 @@ define void @test_vXi16(<2 x i16> %src32, <4 x i16> %src64, <8 x i16> %src128, <
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <4 x i16> %src64, <4 x i16> %src64_1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x i16> %src128, <8 x i16> %src128_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> %src256_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> %src512_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V1024 = shufflevector <64 x i16> %src1024, <64 x i16> %src1024_1, <64 x i32> <i32 64, i32 65, i32 66, i32 67, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> %src512_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024 = shufflevector <64 x i16> %src1024, <64 x i16> %src1024_1, <64 x i32> <i32 64, i32 65, i32 66, i32 67, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512-LABEL: 'test_vXi16'
@@ -264,7 +264,7 @@ define void @test_vXi16(<2 x i16> %src32, <4 x i16> %src64, <8 x i16> %src128, <
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x i16> %src128, <8 x i16> %src128_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> %src256_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> %src512_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1024 = shufflevector <64 x i16> %src1024, <64 x i16> %src1024_1, <64 x i32> <i32 64, i32 65, i32 66, i32 67, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024 = shufflevector <64 x i16> %src1024, <64 x i16> %src1024_1, <64 x i32> <i32 64, i32 65, i32 66, i32 67, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
   %V32 = shufflevector <2 x i16> %src32, <2 x i16> %src32_1, <2 x i32> <i32 0, i32 3>
@@ -282,8 +282,8 @@ define void @test_vXi8(<2 x i8> %src16, <4 x i8> %src32, <8 x i8> %src64, <16 x
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V32 = shufflevector <4 x i8> %src32, <4 x i8> %src32_1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V64 = shufflevector <8 x i8> %src64, <8 x i8> %src64_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V128 = shufflevector <16 x i8> %src128, <16 x i8> %src128_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> %src256_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> %src512_1, <64 x i32> <i32 64, i32 65, i32 66, i32 67, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> %src256_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> %src512_1, <64 x i32> <i32 64, i32 65, i32 66, i32 67, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; SSSE3-LABEL: 'test_vXi8'
@@ -291,8 +291,8 @@ define void @test_vXi8(<2 x i8> %src16, <4 x i8> %src32, <8 x i8> %src64, <16 x
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V32 = shufflevector <4 x i8> %src32, <4 x i8> %src32_1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V64 = shufflevector <8 x i8> %src64, <8 x i8> %src64_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V128 = shufflevector <16 x i8> %src128, <16 x i8> %src128_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> %src256_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> %src512_1, <64 x i32> <i32 64, i32 65, i32 66, i32 67, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> %src256_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> %src512_1, <64 x i32> <i32 64, i32 65, i32 66, i32 67, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; SSE42-LABEL: 'test_vXi8'
@@ -300,8 +300,8 @@ define void @test_vXi8(<2 x i8> %src16, <4 x i8> %src32, <8 x i8> %src64, <16 x
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <4 x i8> %src32, <4 x i8> %src32_1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <8 x i8> %src64, <8 x i8> %src64_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <16 x i8> %src128, <16 x i8> %src128_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> %src256_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> %src512_1, <64 x i32> <i32 64, i32 65, i32 66, i32 67, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> %src256_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> %src512_1, <64 x i32> <i32 64, i32 65, i32 66, i32 67, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; XOP-LABEL: 'test_vXi8'
@@ -310,7 +310,7 @@ define void @test_vXi8(<2 x i8> %src16, <4 x i8> %src32, <8 x i8> %src64, <16 x
 ; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <8 x i8> %src64, <8 x i8> %src64_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
 ; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <16 x i8> %src128, <16 x i8> %src128_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; XOP-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> %src256_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
-; XOP-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> %src512_1, <64 x i32> <i32 64, i32 65, i32 66, i32 67, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> %src512_1, <64 x i32> <i32 64, i32 65, i32 66, i32 67, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
 ; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX1-LABEL: 'test_vXi8'
@@ -319,7 +319,7 @@ define void @test_vXi8(<2 x i8> %src16, <4 x i8> %src32, <8 x i8> %src64, <16 x
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <8 x i8> %src64, <8 x i8> %src64_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <16 x i8> %src128, <16 x i8> %src128_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> %src256_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> %src512_1, <64 x i32> <i32 64, i32 65, i32 66, i32 67, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> %src512_1, <64 x i32> <i32 64, i32 65, i32 66, i32 67, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX2-LABEL: 'test_vXi8'
@@ -328,7 +328,7 @@ define void @test_vXi8(<2 x i8> %src16, <4 x i8> %src32, <8 x i8> %src64, <16 x
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <8 x i8> %src64, <8 x i8> %src64_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <16 x i8> %src128, <16 x i8> %src128_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> %src256_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> %src512_1, <64 x i32> <i32 64, i32 65, i32 66, i32 67, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> %src512_1, <64 x i32> <i32 64, i32 65, i32 66, i32 67, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512-LABEL: 'test_vXi8'
diff --git a/llvm/test/Analysis/CostModel/X86/shuffle-select-latency.ll b/llvm/test/Analysis/CostModel/X86/shuffle-select-latency.ll
index 77f0072..d45c96c 100644
--- a/llvm/test/Analysis/CostModel/X86/shuffle-select-latency.ll
+++ b/llvm/test/Analysis/CostModel/X86/shuffle-select-latency.ll
@@ -20,30 +20,30 @@
 define void @test_vXf64(<2 x double> %src128, <4 x double> %src256, <8 x double> %src512, <16 x double> %src1024, <2 x double> %src128_1, <4 x double> %src256_1, <8 x double> %src512_1, <16 x double> %src1024_1) {
 ; SSE-LABEL: 'test_vXf64'
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <2 x double> %src128, <2 x double> %src128_1, <2 x i32> <i32 0, i32 3>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <4 x double> %src256, <4 x double> %src256_1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <8 x double> %src512, <8 x double> %src512_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V1024 = shufflevector <16 x double> %src1024, <16 x double> %src1024_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <4 x double> %src256, <4 x double> %src256_1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512 = shufflevector <8 x double> %src512, <8 x double> %src512_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1024 = shufflevector <16 x double> %src1024, <16 x double> %src1024_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; XOP-LABEL: 'test_vXf64'
 ; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <2 x double> %src128, <2 x double> %src128_1, <2 x i32> <i32 0, i32 3>
 ; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <4 x double> %src256, <4 x double> %src256_1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
-; XOP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <8 x double> %src512, <8 x double> %src512_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
-; XOP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V1024 = shufflevector <16 x double> %src1024, <16 x double> %src1024_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <8 x double> %src512, <8 x double> %src512_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
+; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024 = shufflevector <16 x double> %src1024, <16 x double> %src1024_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX-LABEL: 'test_vXf64'
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <2 x double> %src128, <2 x double> %src128_1, <2 x i32> <i32 0, i32 3>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <4 x double> %src256, <4 x double> %src256_1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <8 x double> %src512, <8 x double> %src512_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V1024 = shufflevector <16 x double> %src1024, <16 x double> %src1024_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <8 x double> %src512, <8 x double> %src512_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024 = shufflevector <16 x double> %src1024, <16 x double> %src1024_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512-LABEL: 'test_vXf64'
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <2 x double> %src128, <2 x double> %src128_1, <2 x i32> <i32 0, i32 3>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <4 x double> %src256, <4 x double> %src256_1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <8 x double> %src512, <8 x double> %src512_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1024 = shufflevector <16 x double> %src1024, <16 x double> %src1024_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024 = shufflevector <16 x double> %src1024, <16 x double> %src1024_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
   %V128 = shufflevector <2 x double> %src128, <2 x double> %src128_1, <2 x i32> <i32 0, i32 3>
@@ -56,30 +56,30 @@ define void @test_vXf64(<2 x double> %src128, <4 x double> %src256, <8 x double>
 define void @test_vXi64(<2 x i64> %src128, <4 x i64> %src256, <8 x i64> %src512, <16 x i64> %src1024, <2 x i64> %src128_1, <4 x i64> %src256_1, <8 x i64> %src512_1, <16 x i64> %src1024_1) {
 ; SSE-LABEL: 'test_vXi64'
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <2 x i64> %src128, <2 x i64> %src128_1, <2 x i32> <i32 0, i32 3>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <4 x i64> %src256, <4 x i64> %src256_1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <8 x i64> %src512, <8 x i64> %src512_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V1024 = shufflevector <16 x i64> %src1024, <16 x i64> %src1024_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <4 x i64> %src256, <4 x i64> %src256_1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512 = shufflevector <8 x i64> %src512, <8 x i64> %src512_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1024 = shufflevector <16 x i64> %src1024, <16 x i64> %src1024_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; XOP-LABEL: 'test_vXi64'
 ; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <2 x i64> %src128, <2 x i64> %src128_1, <2 x i32> <i32 0, i32 3>
 ; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <4 x i64> %src256, <4 x i64> %src256_1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
-; XOP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <8 x i64> %src512, <8 x i64> %src512_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
-; XOP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V1024 = shufflevector <16 x i64> %src1024, <16 x i64> %src1024_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <8 x i64> %src512, <8 x i64> %src512_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
+; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024 = shufflevector <16 x i64> %src1024, <16 x i64> %src1024_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX-LABEL: 'test_vXi64'
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <2 x i64> %src128, <2 x i64> %src128_1, <2 x i32> <i32 0, i32 3>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <4 x i64> %src256, <4 x i64> %src256_1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <8 x i64> %src512, <8 x i64> %src512_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V1024 = shufflevector <16 x i64> %src1024, <16 x i64> %src1024_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <8 x i64> %src512, <8 x i64> %src512_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024 = shufflevector <16 x i64> %src1024, <16 x i64> %src1024_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512-LABEL: 'test_vXi64'
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <2 x i64> %src128, <2 x i64> %src128_1, <2 x i32> <i32 0, i32 3>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <4 x i64> %src256, <4 x i64> %src256_1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <8 x i64> %src512, <8 x i64> %src512_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1024 = shufflevector <16 x i64> %src1024, <16 x i64> %src1024_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024 = shufflevector <16 x i64> %src1024, <16 x i64> %src1024_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
   %V128 = shufflevector <2 x i64> %src128, <2 x i64> %src128_1, <2 x i32> <i32 0, i32 3>
@@ -93,41 +93,41 @@ define void @test_vXf32(<2 x float> %src64, <4 x float> %src128, <8 x float> %sr
 ; SSE2-LABEL: 'test_vXf32'
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64 = shufflevector <2 x float> %src64, <2 x float> %src64_1, <2 x i32> <i32 0, i32 3>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V128 = shufflevector <4 x float> %src128, <4 x float> %src128_1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V256 = shufflevector <8 x float> %src256, <8 x float> %src256_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V512 = shufflevector <16 x float> %src512, <16 x float> %src512_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V1024 = shufflevector <32 x float> %src1024, <32 x float> %src1024_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <8 x float> %src256, <8 x float> %src256_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <16 x float> %src512, <16 x float> %src512_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024 = shufflevector <32 x float> %src1024, <32 x float> %src1024_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; SSSE3-LABEL: 'test_vXf32'
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64 = shufflevector <2 x float> %src64, <2 x float> %src64_1, <2 x i32> <i32 0, i32 3>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V128 = shufflevector <4 x float> %src128, <4 x float> %src128_1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V256 = shufflevector <8 x float> %src256, <8 x float> %src256_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V512 = shufflevector <16 x float> %src512, <16 x float> %src512_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V1024 = shufflevector <32 x float> %src1024, <32 x float> %src1024_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <8 x float> %src256, <8 x float> %src256_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <16 x float> %src512, <16 x float> %src512_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024 = shufflevector <32 x float> %src1024, <32 x float> %src1024_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; SSE42-LABEL: 'test_vXf32'
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <2 x float> %src64, <2 x float> %src64_1, <2 x i32> <i32 0, i32 3>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <4 x float> %src128, <4 x float> %src128_1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <8 x float> %src256, <8 x float> %src256_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <16 x float> %src512, <16 x float> %src512_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V1024 = shufflevector <32 x float> %src1024, <32 x float> %src1024_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <8 x float> %src256, <8 x float> %src256_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <16 x float> %src512, <16 x float> %src512_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024 = shufflevector <32 x float> %src1024, <32 x float> %src1024_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; XOP-LABEL: 'test_vXf32'
 ; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <2 x float> %src64, <2 x float> %src64_1, <2 x i32> <i32 0, i32 3>
 ; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <4 x float> %src128, <4 x float> %src128_1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
 ; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <8 x float> %src256, <8 x float> %src256_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
-; XOP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <16 x float> %src512, <16 x float> %src512_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; XOP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V1024 = shufflevector <32 x float> %src1024, <32 x float> %src1024_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <16 x float> %src512, <16 x float> %src512_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024 = shufflevector <32 x float> %src1024, <32 x float> %src1024_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
 ; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX-LABEL: 'test_vXf32'
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <2 x float> %src64, <2 x float> %src64_1, <2 x i32> <i32 0, i32 3>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <4 x float> %src128, <4 x float> %src128_1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <8 x float> %src256, <8 x float> %src256_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <16 x float> %src512, <16 x float> %src512_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V1024 = shufflevector <32 x float> %src1024, <32 x float> %src1024_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <16 x float> %src512, <16 x float> %src512_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024 = shufflevector <32 x float> %src1024, <32 x float> %src1024_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512-LABEL: 'test_vXf32'
@@ -135,7 +135,7 @@ define void @test_vXf32(<2 x float> %src64, <4 x float> %src128, <8 x float> %sr
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <4 x float> %src128, <4 x float> %src128_1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <8 x float> %src256, <8 x float> %src256_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <16 x float> %src512, <16 x float> %src512_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1024 = shufflevector <32 x float> %src1024, <32 x float> %src1024_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024 = shufflevector <32 x float> %src1024, <32 x float> %src1024_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
   %V64 = shufflevector <2 x float> %src64, <2 x float> %src64_1, <2 x i32> <i32 0, i32 3>
@@ -150,41 +150,41 @@ define void @test_vXi32(<2 x i32> %src64, <4 x i32> %src128, <8 x i32> %src256,
 ; SSE2-LABEL: 'test_vXi32'
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64 = shufflevector <2 x i32> %src64, <2 x i32> %src64_1, <2 x i32> <i32 0, i32 3>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V128 = shufflevector <4 x i32> %src128, <4 x i32> %src128_1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V256 = shufflevector <8 x i32> %src256, <8 x i32> %src256_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V512 = shufflevector <16 x i32> %src512, <16 x i32> %src512_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V1024 = shufflevector <32 x i32> %src1024, <32 x i32> %src1024_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <8 x i32> %src256, <8 x i32> %src256_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <16 x i32> %src512, <16 x i32> %src512_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024 = shufflevector <32 x i32> %src1024, <32 x i32> %src1024_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; SSSE3-LABEL: 'test_vXi32'
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64 = shufflevector <2 x i32> %src64, <2 x i32> %src64_1, <2 x i32> <i32 0, i32 3>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V128 = shufflevector <4 x i32> %src128, <4 x i32> %src128_1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V256 = shufflevector <8 x i32> %src256, <8 x i32> %src256_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V512 = shufflevector <16 x i32> %src512, <16 x i32> %src512_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V1024 = shufflevector <32 x i32> %src1024, <32 x i32> %src1024_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <8 x i32> %src256, <8 x i32> %src256_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <16 x i32> %src512, <16 x i32> %src512_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024 = shufflevector <32 x i32> %src1024, <32 x i32> %src1024_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; SSE42-LABEL: 'test_vXi32'
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <2 x i32> %src64, <2 x i32> %src64_1, <2 x i32> <i32 0, i32 3>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <4 x i32> %src128, <4 x i32> %src128_1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <8 x i32> %src256, <8 x i32> %src256_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <16 x i32> %src512, <16 x i32> %src512_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V1024 = shufflevector <32 x i32> %src1024, <32 x i32> %src1024_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <8 x i32> %src256, <8 x i32> %src256_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <16 x i32> %src512, <16 x i32> %src512_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024 = shufflevector <32 x i32> %src1024, <32 x i32> %src1024_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; XOP-LABEL: 'test_vXi32'
 ; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <2 x i32> %src64, <2 x i32> %src64_1, <2 x i32> <i32 0, i32 3>
 ; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <4 x i32> %src128, <4 x i32> %src128_1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
 ; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <8 x i32> %src256, <8 x i32> %src256_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
-; XOP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <16 x i32> %src512, <16 x i32> %src512_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; XOP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V1024 = shufflevector <32 x i32> %src1024, <32 x i32> %src1024_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <16 x i32> %src512, <16 x i32> %src512_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024 = shufflevector <32 x i32> %src1024, <32 x i32> %src1024_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
 ; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX-LABEL: 'test_vXi32'
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <2 x i32> %src64, <2 x i32> %src64_1, <2 x i32> <i32 0, i32 3>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <4 x i32> %src128, <4 x i32> %src128_1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <8 x i32> %src256, <8 x i32> %src256_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <16 x i32> %src512, <16 x i32> %src512_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V1024 = shufflevector <32 x i32> %src1024, <32 x i32> %src1024_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <16 x i32> %src512, <16 x i32> %src512_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024 = shufflevector <32 x i32> %src1024, <32 x i32> %src1024_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512-LABEL: 'test_vXi32'
@@ -192,7 +192,7 @@ define void @test_vXi32(<2 x i32> %src64, <4 x i32> %src128, <8 x i32> %src256,
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <4 x i32> %src128, <4 x i32> %src128_1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <8 x i32> %src256, <8 x i32> %src256_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <16 x i32> %src512, <16 x i32> %src512_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1024 = shufflevector <32 x i32> %src1024, <32 x i32> %src1024_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024 = shufflevector <32 x i32> %src1024, <32 x i32> %src1024_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
   %V64 = shufflevector <2 x i32> %src64, <2 x i32> %src64_1, <2 x i32> <i32 0, i32 3>
@@ -208,27 +208,27 @@ define void @test_vXi16(<2 x i16> %src32, <4 x i16> %src64, <8 x i16> %src128, <
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V32 = shufflevector <2 x i16> %src32, <2 x i16> %src32_1, <2 x i32> <i32 0, i32 3>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V64 = shufflevector <4 x i16> %src64, <4 x i16> %src64_1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V128 = shufflevector <8 x i16> %src128, <8 x i16> %src128_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> %src256_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> %src512_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V1024 = shufflevector <64 x i16> %src1024, <64 x i16> %src1024_1, <64 x i32> <i32 64, i32 65, i32 66, i32 67, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> %src256_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> %src512_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024 = shufflevector <64 x i16> %src1024, <64 x i16> %src1024_1, <64 x i32> <i32 64, i32 65, i32 66, i32 67, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; SSSE3-LABEL: 'test_vXi16'
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V32 = shufflevector <2 x i16> %src32, <2 x i16> %src32_1, <2 x i32> <i32 0, i32 3>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V64 = shufflevector <4 x i16> %src64, <4 x i16> %src64_1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V128 = shufflevector <8 x i16> %src128, <8 x i16> %src128_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> %src256_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> %src512_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V1024 = shufflevector <64 x i16> %src1024, <64 x i16> %src1024_1, <64 x i32> <i32 64, i32 65, i32 66, i32 67, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> %src256_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> %src512_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024 = shufflevector <64 x i16> %src1024, <64 x i16> %src1024_1, <64 x i32> <i32 64, i32 65, i32 66, i32 67, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; SSE42-LABEL: 'test_vXi16'
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <2 x i16> %src32, <2 x i16> %src32_1, <2 x i32> <i32 0, i32 3>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <4 x i16> %src64, <4 x i16> %src64_1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x i16> %src128, <8 x i16> %src128_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> %src256_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> %src512_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V1024 = shufflevector <64 x i16> %src1024, <64 x i16> %src1024_1, <64 x i32> <i32 64, i32 65, i32 66, i32 67, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> %src256_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> %src512_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024 = shufflevector <64 x i16> %src1024, <64 x i16> %src1024_1, <64 x i32> <i32 64, i32 65, i32 66, i32 67, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; XOP-LABEL: 'test_vXi16'
@@ -236,8 +236,8 @@ define void @test_vXi16(<2 x i16> %src32, <4 x i16> %src64, <8 x i16> %src128, <
 ; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <4 x i16> %src64, <4 x i16> %src64_1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
 ; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x i16> %src128, <8 x i16> %src128_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
 ; XOP-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> %src256_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; XOP-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> %src512_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
-; XOP-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V1024 = shufflevector <64 x i16> %src1024, <64 x i16> %src1024_1, <64 x i32> <i32 64, i32 65, i32 66, i32 67, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> %src512_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024 = shufflevector <64 x i16> %src1024, <64 x i16> %src1024_1, <64 x i32> <i32 64, i32 65, i32 66, i32 67, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
 ; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX1-LABEL: 'test_vXi16'
@@ -245,8 +245,8 @@ define void @test_vXi16(<2 x i16> %src32, <4 x i16> %src64, <8 x i16> %src128, <
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <4 x i16> %src64, <4 x i16> %src64_1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x i16> %src128, <8 x i16> %src128_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> %src256_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> %src512_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V1024 = shufflevector <64 x i16> %src1024, <64 x i16> %src1024_1, <64 x i32> <i32 64, i32 65, i32 66, i32 67, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> %src512_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024 = shufflevector <64 x i16> %src1024, <64 x i16> %src1024_1, <64 x i32> <i32 64, i32 65, i32 66, i32 67, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX2-LABEL: 'test_vXi16'
@@ -254,8 +254,8 @@ define void @test_vXi16(<2 x i16> %src32, <4 x i16> %src64, <8 x i16> %src128, <
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <4 x i16> %src64, <4 x i16> %src64_1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x i16> %src128, <8 x i16> %src128_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> %src256_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> %src512_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V1024 = shufflevector <64 x i16> %src1024, <64 x i16> %src1024_1, <64 x i32> <i32 64, i32 65, i32 66, i32 67, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> %src512_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024 = shufflevector <64 x i16> %src1024, <64 x i16> %src1024_1, <64 x i32> <i32 64, i32 65, i32 66, i32 67, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512-LABEL: 'test_vXi16'
@@ -264,7 +264,7 @@ define void @test_vXi16(<2 x i16> %src32, <4 x i16> %src64, <8 x i16> %src128, <
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x i16> %src128, <8 x i16> %src128_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> %src256_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> %src512_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1024 = shufflevector <64 x i16> %src1024, <64 x i16> %src1024_1, <64 x i32> <i32 64, i32 65, i32 66, i32 67, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024 = shufflevector <64 x i16> %src1024, <64 x i16> %src1024_1, <64 x i32> <i32 64, i32 65, i32 66, i32 67, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
   %V32 = shufflevector <2 x i16> %src32, <2 x i16> %src32_1, <2 x i32> <i32 0, i32 3>
@@ -282,8 +282,8 @@ define void @test_vXi8(<2 x i8> %src16, <4 x i8> %src32, <8 x i8> %src64, <16 x
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V32 = shufflevector <4 x i8> %src32, <4 x i8> %src32_1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V64 = shufflevector <8 x i8> %src64, <8 x i8> %src64_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V128 = shufflevector <16 x i8> %src128, <16 x i8> %src128_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> %src256_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> %src512_1, <64 x i32> <i32 64, i32 65, i32 66, i32 67, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> %src256_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> %src512_1, <64 x i32> <i32 64, i32 65, i32 66, i32 67, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; SSSE3-LABEL: 'test_vXi8'
@@ -291,8 +291,8 @@ define void @test_vXi8(<2 x i8> %src16, <4 x i8> %src32, <8 x i8> %src64, <16 x
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V32 = shufflevector <4 x i8> %src32, <4 x i8> %src32_1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V64 = shufflevector <8 x i8> %src64, <8 x i8> %src64_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V128 = shufflevector <16 x i8> %src128, <16 x i8> %src128_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> %src256_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> %src512_1, <64 x i32> <i32 64, i32 65, i32 66, i32 67, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> %src256_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> %src512_1, <64 x i32> <i32 64, i32 65, i32 66, i32 67, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; SSE42-LABEL: 'test_vXi8'
@@ -300,8 +300,8 @@ define void @test_vXi8(<2 x i8> %src16, <4 x i8> %src32, <8 x i8> %src64, <16 x
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <4 x i8> %src32, <4 x i8> %src32_1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <8 x i8> %src64, <8 x i8> %src64_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <16 x i8> %src128, <16 x i8> %src128_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> %src256_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> %src512_1, <64 x i32> <i32 64, i32 65, i32 66, i32 67, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> %src256_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> %src512_1, <64 x i32> <i32 64, i32 65, i32 66, i32 67, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; XOP-LABEL: 'test_vXi8'
@@ -310,7 +310,7 @@ define void @test_vXi8(<2 x i8> %src16, <4 x i8> %src32, <8 x i8> %src64, <16 x
 ; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <8 x i8> %src64, <8 x i8> %src64_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
 ; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <16 x i8> %src128, <16 x i8> %src128_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; XOP-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> %src256_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
-; XOP-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> %src512_1, <64 x i32> <i32 64, i32 65, i32 66, i32 67, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> %src512_1, <64 x i32> <i32 64, i32 65, i32 66, i32 67, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
 ; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX1-LABEL: 'test_vXi8'
@@ -319,7 +319,7 @@ define void @test_vXi8(<2 x i8> %src16, <4 x i8> %src32, <8 x i8> %src64, <16 x
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <8 x i8> %src64, <8 x i8> %src64_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <16 x i8> %src128, <16 x i8> %src128_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> %src256_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> %src512_1, <64 x i32> <i32 64, i32 65, i32 66, i32 67, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> %src512_1, <64 x i32> <i32 64, i32 65, i32 66, i32 67, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX2-LABEL: 'test_vXi8'
@@ -328,7 +328,7 @@ define void @test_vXi8(<2 x i8> %src16, <4 x i8> %src32, <8 x i8> %src64, <16 x
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <8 x i8> %src64, <8 x i8> %src64_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <16 x i8> %src128, <16 x i8> %src128_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> %src256_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> %src512_1, <64 x i32> <i32 64, i32 65, i32 66, i32 67, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> %src512_1, <64 x i32> <i32 64, i32 65, i32 66, i32 67, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512-LABEL: 'test_vXi8'
diff --git a/llvm/test/Analysis/CostModel/X86/shuffle-select-sizelatency.ll b/llvm/test/Analysis/CostModel/X86/shuffle-select-sizelatency.ll
index d01e011..0f8b32e 100644
--- a/llvm/test/Analysis/CostModel/X86/shuffle-select-sizelatency.ll
+++ b/llvm/test/Analysis/CostModel/X86/shuffle-select-sizelatency.ll
@@ -20,30 +20,30 @@
 define void @test_vXf64(<2 x double> %src128, <4 x double> %src256, <8 x double> %src512, <16 x double> %src1024, <2 x double> %src128_1, <4 x double> %src256_1, <8 x double> %src512_1, <16 x double> %src1024_1) {
 ; SSE-LABEL: 'test_vXf64'
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <2 x double> %src128, <2 x double> %src128_1, <2 x i32> <i32 0, i32 3>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <4 x double> %src256, <4 x double> %src256_1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <8 x double> %src512, <8 x double> %src512_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V1024 = shufflevector <16 x double> %src1024, <16 x double> %src1024_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <4 x double> %src256, <4 x double> %src256_1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512 = shufflevector <8 x double> %src512, <8 x double> %src512_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1024 = shufflevector <16 x double> %src1024, <16 x double> %src1024_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; XOP-LABEL: 'test_vXf64'
 ; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <2 x double> %src128, <2 x double> %src128_1, <2 x i32> <i32 0, i32 3>
 ; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <4 x double> %src256, <4 x double> %src256_1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
-; XOP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <8 x double> %src512, <8 x double> %src512_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
-; XOP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V1024 = shufflevector <16 x double> %src1024, <16 x double> %src1024_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <8 x double> %src512, <8 x double> %src512_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
+; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024 = shufflevector <16 x double> %src1024, <16 x double> %src1024_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX-LABEL: 'test_vXf64'
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <2 x double> %src128, <2 x double> %src128_1, <2 x i32> <i32 0, i32 3>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <4 x double> %src256, <4 x double> %src256_1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <8 x double> %src512, <8 x double> %src512_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V1024 = shufflevector <16 x double> %src1024, <16 x double> %src1024_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <8 x double> %src512, <8 x double> %src512_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024 = shufflevector <16 x double> %src1024, <16 x double> %src1024_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512-LABEL: 'test_vXf64'
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <2 x double> %src128, <2 x double> %src128_1, <2 x i32> <i32 0, i32 3>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <4 x double> %src256, <4 x double> %src256_1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <8 x double> %src512, <8 x double> %src512_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1024 = shufflevector <16 x double> %src1024, <16 x double> %src1024_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024 = shufflevector <16 x double> %src1024, <16 x double> %src1024_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
   %V128 = shufflevector <2 x double> %src128, <2 x double> %src128_1, <2 x i32> <i32 0, i32 3>
@@ -56,30 +56,30 @@ define void @test_vXf64(<2 x double> %src128, <4 x double> %src256, <8 x double>
 define void @test_vXi64(<2 x i64> %src128, <4 x i64> %src256, <8 x i64> %src512, <16 x i64> %src1024, <2 x i64> %src128_1, <4 x i64> %src256_1, <8 x i64> %src512_1, <16 x i64> %src1024_1) {
 ; SSE-LABEL: 'test_vXi64'
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <2 x i64> %src128, <2 x i64> %src128_1, <2 x i32> <i32 0, i32 3>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <4 x i64> %src256, <4 x i64> %src256_1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <8 x i64> %src512, <8 x i64> %src512_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V1024 = shufflevector <16 x i64> %src1024, <16 x i64> %src1024_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <4 x i64> %src256, <4 x i64> %src256_1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512 = shufflevector <8 x i64> %src512, <8 x i64> %src512_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1024 = shufflevector <16 x i64> %src1024, <16 x i64> %src1024_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; XOP-LABEL: 'test_vXi64'
 ; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <2 x i64> %src128, <2 x i64> %src128_1, <2 x i32> <i32 0, i32 3>
 ; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <4 x i64> %src256, <4 x i64> %src256_1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
-; XOP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <8 x i64> %src512, <8 x i64> %src512_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
-; XOP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V1024 = shufflevector <16 x i64> %src1024, <16 x i64> %src1024_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <8 x i64> %src512, <8 x i64> %src512_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
+; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024 = shufflevector <16 x i64> %src1024, <16 x i64> %src1024_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX-LABEL: 'test_vXi64'
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <2 x i64> %src128, <2 x i64> %src128_1, <2 x i32> <i32 0, i32 3>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <4 x i64> %src256, <4 x i64> %src256_1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <8 x i64> %src512, <8 x i64> %src512_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V1024 = shufflevector <16 x i64> %src1024, <16 x i64> %src1024_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <8 x i64> %src512, <8 x i64> %src512_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024 = shufflevector <16 x i64> %src1024, <16 x i64> %src1024_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512-LABEL: 'test_vXi64'
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <2 x i64> %src128, <2 x i64> %src128_1, <2 x i32> <i32 0, i32 3>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <4 x i64> %src256, <4 x i64> %src256_1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <8 x i64> %src512, <8 x i64> %src512_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1024 = shufflevector <16 x i64> %src1024, <16 x i64> %src1024_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024 = shufflevector <16 x i64> %src1024, <16 x i64> %src1024_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
   %V128 = shufflevector <2 x i64> %src128, <2 x i64> %src128_1, <2 x i32> <i32 0, i32 3>
@@ -93,41 +93,41 @@ define void @test_vXf32(<2 x float> %src64, <4 x float> %src128, <8 x float> %sr
 ; SSE2-LABEL: 'test_vXf32'
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64 = shufflevector <2 x float> %src64, <2 x float> %src64_1, <2 x i32> <i32 0, i32 3>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V128 = shufflevector <4 x float> %src128, <4 x float> %src128_1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V256 = shufflevector <8 x float> %src256, <8 x float> %src256_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V512 = shufflevector <16 x float> %src512, <16 x float> %src512_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V1024 = shufflevector <32 x float> %src1024, <32 x float> %src1024_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <8 x float> %src256, <8 x float> %src256_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <16 x float> %src512, <16 x float> %src512_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024 = shufflevector <32 x float> %src1024, <32 x float> %src1024_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; SSSE3-LABEL: 'test_vXf32'
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64 = shufflevector <2 x float> %src64, <2 x float> %src64_1, <2 x i32> <i32 0, i32 3>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V128 = shufflevector <4 x float> %src128, <4 x float> %src128_1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V256 = shufflevector <8 x float> %src256, <8 x float> %src256_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V512 = shufflevector <16 x float> %src512, <16 x float> %src512_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V1024 = shufflevector <32 x float> %src1024, <32 x float> %src1024_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <8 x float> %src256, <8 x float> %src256_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <16 x float> %src512, <16 x float> %src512_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024 = shufflevector <32 x float> %src1024, <32 x float> %src1024_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; SSE42-LABEL: 'test_vXf32'
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <2 x float> %src64, <2 x float> %src64_1, <2 x i32> <i32 0, i32 3>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <4 x float> %src128, <4 x float> %src128_1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <8 x float> %src256, <8 x float> %src256_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <16 x float> %src512, <16 x float> %src512_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V1024 = shufflevector <32 x float> %src1024, <32 x float> %src1024_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <8 x float> %src256, <8 x float> %src256_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <16 x float> %src512, <16 x float> %src512_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024 = shufflevector <32 x float> %src1024, <32 x float> %src1024_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; XOP-LABEL: 'test_vXf32'
 ; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <2 x float> %src64, <2 x float> %src64_1, <2 x i32> <i32 0, i32 3>
 ; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <4 x float> %src128, <4 x float> %src128_1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
 ; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <8 x float> %src256, <8 x float> %src256_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
-; XOP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <16 x float> %src512, <16 x float> %src512_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; XOP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V1024 = shufflevector <32 x float> %src1024, <32 x float> %src1024_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <16 x float> %src512, <16 x float> %src512_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024 = shufflevector <32 x float> %src1024, <32 x float> %src1024_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
 ; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX-LABEL: 'test_vXf32'
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <2 x float> %src64, <2 x float> %src64_1, <2 x i32> <i32 0, i32 3>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <4 x float> %src128, <4 x float> %src128_1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <8 x float> %src256, <8 x float> %src256_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <16 x float> %src512, <16 x float> %src512_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V1024 = shufflevector <32 x float> %src1024, <32 x float> %src1024_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <16 x float> %src512, <16 x float> %src512_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024 = shufflevector <32 x float> %src1024, <32 x float> %src1024_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512-LABEL: 'test_vXf32'
@@ -135,7 +135,7 @@ define void @test_vXf32(<2 x float> %src64, <4 x float> %src128, <8 x float> %sr
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <4 x float> %src128, <4 x float> %src128_1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <8 x float> %src256, <8 x float> %src256_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <16 x float> %src512, <16 x float> %src512_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1024 = shufflevector <32 x float> %src1024, <32 x float> %src1024_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024 = shufflevector <32 x float> %src1024, <32 x float> %src1024_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
   %V64 = shufflevector <2 x float> %src64, <2 x float> %src64_1, <2 x i32> <i32 0, i32 3>
@@ -150,41 +150,41 @@ define void @test_vXi32(<2 x i32> %src64, <4 x i32> %src128, <8 x i32> %src256,
 ; SSE2-LABEL: 'test_vXi32'
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64 = shufflevector <2 x i32> %src64, <2 x i32> %src64_1, <2 x i32> <i32 0, i32 3>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V128 = shufflevector <4 x i32> %src128, <4 x i32> %src128_1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V256 = shufflevector <8 x i32> %src256, <8 x i32> %src256_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V512 = shufflevector <16 x i32> %src512, <16 x i32> %src512_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V1024 = shufflevector <32 x i32> %src1024, <32 x i32> %src1024_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <8 x i32> %src256, <8 x i32> %src256_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <16 x i32> %src512, <16 x i32> %src512_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024 = shufflevector <32 x i32> %src1024, <32 x i32> %src1024_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; SSSE3-LABEL: 'test_vXi32'
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64 = shufflevector <2 x i32> %src64, <2 x i32> %src64_1, <2 x i32> <i32 0, i32 3>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V128 = shufflevector <4 x i32> %src128, <4 x i32> %src128_1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V256 = shufflevector <8 x i32> %src256, <8 x i32> %src256_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V512 = shufflevector <16 x i32> %src512, <16 x i32> %src512_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V1024 = shufflevector <32 x i32> %src1024, <32 x i32> %src1024_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <8 x i32> %src256, <8 x i32> %src256_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <16 x i32> %src512, <16 x i32> %src512_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024 = shufflevector <32 x i32> %src1024, <32 x i32> %src1024_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; SSE42-LABEL: 'test_vXi32'
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <2 x i32> %src64, <2 x i32> %src64_1, <2 x i32> <i32 0, i32 3>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <4 x i32> %src128, <4 x i32> %src128_1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <8 x i32> %src256, <8 x i32> %src256_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <16 x i32> %src512, <16 x i32> %src512_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V1024 = shufflevector <32 x i32> %src1024, <32 x i32> %src1024_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <8 x i32> %src256, <8 x i32> %src256_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <16 x i32> %src512, <16 x i32> %src512_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024 = shufflevector <32 x i32> %src1024, <32 x i32> %src1024_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; XOP-LABEL: 'test_vXi32'
 ; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <2 x i32> %src64, <2 x i32> %src64_1, <2 x i32> <i32 0, i32 3>
 ; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <4 x i32> %src128, <4 x i32> %src128_1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
 ; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <8 x i32> %src256, <8 x i32> %src256_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
-; XOP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <16 x i32> %src512, <16 x i32> %src512_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; XOP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V1024 = shufflevector <32 x i32> %src1024, <32 x i32> %src1024_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <16 x i32> %src512, <16 x i32> %src512_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024 = shufflevector <32 x i32> %src1024, <32 x i32> %src1024_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
 ; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX-LABEL: 'test_vXi32'
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <2 x i32> %src64, <2 x i32> %src64_1, <2 x i32> <i32 0, i32 3>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <4 x i32> %src128, <4 x i32> %src128_1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <8 x i32> %src256, <8 x i32> %src256_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <16 x i32> %src512, <16 x i32> %src512_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V1024 = shufflevector <32 x i32> %src1024, <32 x i32> %src1024_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <16 x i32> %src512, <16 x i32> %src512_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024 = shufflevector <32 x i32> %src1024, <32 x i32> %src1024_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512-LABEL: 'test_vXi32'
@@ -192,7 +192,7 @@ define void @test_vXi32(<2 x i32> %src64, <4 x i32> %src128, <8 x i32> %src256,
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <4 x i32> %src128, <4 x i32> %src128_1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <8 x i32> %src256, <8 x i32> %src256_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <16 x i32> %src512, <16 x i32> %src512_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1024 = shufflevector <32 x i32> %src1024, <32 x i32> %src1024_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024 = shufflevector <32 x i32> %src1024, <32 x i32> %src1024_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
   %V64 = shufflevector <2 x i32> %src64, <2 x i32> %src64_1, <2 x i32> <i32 0, i32 3>
@@ -208,27 +208,27 @@ define void @test_vXi16(<2 x i16> %src32, <4 x i16> %src64, <8 x i16> %src128, <
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V32 = shufflevector <2 x i16> %src32, <2 x i16> %src32_1, <2 x i32> <i32 0, i32 3>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V64 = shufflevector <4 x i16> %src64, <4 x i16> %src64_1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V128 = shufflevector <8 x i16> %src128, <8 x i16> %src128_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> %src256_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> %src512_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V1024 = shufflevector <64 x i16> %src1024, <64 x i16> %src1024_1, <64 x i32> <i32 64, i32 65, i32 66, i32 67, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> %src256_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> %src512_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024 = shufflevector <64 x i16> %src1024, <64 x i16> %src1024_1, <64 x i32> <i32 64, i32 65, i32 66, i32 67, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; SSSE3-LABEL: 'test_vXi16'
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V32 = shufflevector <2 x i16> %src32, <2 x i16> %src32_1, <2 x i32> <i32 0, i32 3>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V64 = shufflevector <4 x i16> %src64, <4 x i16> %src64_1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V128 = shufflevector <8 x i16> %src128, <8 x i16> %src128_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> %src256_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> %src512_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V1024 = shufflevector <64 x i16> %src1024, <64 x i16> %src1024_1, <64 x i32> <i32 64, i32 65, i32 66, i32 67, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> %src256_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> %src512_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024 = shufflevector <64 x i16> %src1024, <64 x i16> %src1024_1, <64 x i32> <i32 64, i32 65, i32 66, i32 67, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; SSE42-LABEL: 'test_vXi16'
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <2 x i16> %src32, <2 x i16> %src32_1, <2 x i32> <i32 0, i32 3>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <4 x i16> %src64, <4 x i16> %src64_1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x i16> %src128, <8 x i16> %src128_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> %src256_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> %src512_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V1024 = shufflevector <64 x i16> %src1024, <64 x i16> %src1024_1, <64 x i32> <i32 64, i32 65, i32 66, i32 67, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> %src256_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> %src512_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024 = shufflevector <64 x i16> %src1024, <64 x i16> %src1024_1, <64 x i32> <i32 64, i32 65, i32 66, i32 67, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; XOP-LABEL: 'test_vXi16'
@@ -236,8 +236,8 @@ define void @test_vXi16(<2 x i16> %src32, <4 x i16> %src64, <8 x i16> %src128, <
 ; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <4 x i16> %src64, <4 x i16> %src64_1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
 ; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x i16> %src128, <8 x i16> %src128_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
 ; XOP-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> %src256_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; XOP-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> %src512_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
-; XOP-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V1024 = shufflevector <64 x i16> %src1024, <64 x i16> %src1024_1, <64 x i32> <i32 64, i32 65, i32 66, i32 67, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> %src512_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024 = shufflevector <64 x i16> %src1024, <64 x i16> %src1024_1, <64 x i32> <i32 64, i32 65, i32 66, i32 67, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
 ; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX1-LABEL: 'test_vXi16'
@@ -245,8 +245,8 @@ define void @test_vXi16(<2 x i16> %src32, <4 x i16> %src64, <8 x i16> %src128, <
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <4 x i16> %src64, <4 x i16> %src64_1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x i16> %src128, <8 x i16> %src128_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> %src256_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> %src512_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V1024 = shufflevector <64 x i16> %src1024, <64 x i16> %src1024_1, <64 x i32> <i32 64, i32 65, i32 66, i32 67, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> %src512_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024 = shufflevector <64 x i16> %src1024, <64 x i16> %src1024_1, <64 x i32> <i32 64, i32 65, i32 66, i32 67, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX2-LABEL: 'test_vXi16'
@@ -254,8 +254,8 @@ define void @test_vXi16(<2 x i16> %src32, <4 x i16> %src64, <8 x i16> %src128, <
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <4 x i16> %src64, <4 x i16> %src64_1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x i16> %src128, <8 x i16> %src128_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> %src256_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> %src512_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V1024 = shufflevector <64 x i16> %src1024, <64 x i16> %src1024_1, <64 x i32> <i32 64, i32 65, i32 66, i32 67, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> %src512_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024 = shufflevector <64 x i16> %src1024, <64 x i16> %src1024_1, <64 x i32> <i32 64, i32 65, i32 66, i32 67, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512-LABEL: 'test_vXi16'
@@ -264,7 +264,7 @@ define void @test_vXi16(<2 x i16> %src32, <4 x i16> %src64, <8 x i16> %src128, <
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x i16> %src128, <8 x i16> %src128_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> %src256_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> %src512_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1024 = shufflevector <64 x i16> %src1024, <64 x i16> %src1024_1, <64 x i32> <i32 64, i32 65, i32 66, i32 67, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024 = shufflevector <64 x i16> %src1024, <64 x i16> %src1024_1, <64 x i32> <i32 64, i32 65, i32 66, i32 67, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
   %V32 = shufflevector <2 x i16> %src32, <2 x i16> %src32_1, <2 x i32> <i32 0, i32 3>
@@ -282,8 +282,8 @@ define void @test_vXi8(<2 x i8> %src16, <4 x i8> %src32, <8 x i8> %src64, <16 x
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V32 = shufflevector <4 x i8> %src32, <4 x i8> %src32_1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V64 = shufflevector <8 x i8> %src64, <8 x i8> %src64_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V128 = shufflevector <16 x i8> %src128, <16 x i8> %src128_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> %src256_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> %src512_1, <64 x i32> <i32 64, i32 65, i32 66, i32 67, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> %src256_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> %src512_1, <64 x i32> <i32 64, i32 65, i32 66, i32 67, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; SSSE3-LABEL: 'test_vXi8'
@@ -291,8 +291,8 @@ define void @test_vXi8(<2 x i8> %src16, <4 x i8> %src32, <8 x i8> %src64, <16 x
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V32 = shufflevector <4 x i8> %src32, <4 x i8> %src32_1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V64 = shufflevector <8 x i8> %src64, <8 x i8> %src64_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V128 = shufflevector <16 x i8> %src128, <16 x i8> %src128_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> %src256_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> %src512_1, <64 x i32> <i32 64, i32 65, i32 66, i32 67, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> %src256_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> %src512_1, <64 x i32> <i32 64, i32 65, i32 66, i32 67, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; SSE42-LABEL: 'test_vXi8'
@@ -300,8 +300,8 @@ define void @test_vXi8(<2 x i8> %src16, <4 x i8> %src32, <8 x i8> %src64, <16 x
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <4 x i8> %src32, <4 x i8> %src32_1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <8 x i8> %src64, <8 x i8> %src64_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <16 x i8> %src128, <16 x i8> %src128_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> %src256_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> %src512_1, <64 x i32> <i32 64, i32 65, i32 66, i32 67, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> %src256_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> %src512_1, <64 x i32> <i32 64, i32 65, i32 66, i32 67, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; XOP-LABEL: 'test_vXi8'
@@ -310,7 +310,7 @@ define void @test_vXi8(<2 x i8> %src16, <4 x i8> %src32, <8 x i8> %src64, <16 x
 ; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <8 x i8> %src64, <8 x i8> %src64_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
 ; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <16 x i8> %src128, <16 x i8> %src128_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; XOP-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> %src256_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
-; XOP-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> %src512_1, <64 x i32> <i32 64, i32 65, i32 66, i32 67, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> %src512_1, <64 x i32> <i32 64, i32 65, i32 66, i32 67, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
 ; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX1-LABEL: 'test_vXi8'
@@ -319,7 +319,7 @@ define void @test_vXi8(<2 x i8> %src16, <4 x i8> %src32, <8 x i8> %src64, <16 x
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <8 x i8> %src64, <8 x i8> %src64_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <16 x i8> %src128, <16 x i8> %src128_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> %src256_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> %src512_1, <64 x i32> <i32 64, i32 65, i32 66, i32 67, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> %src512_1, <64 x i32> <i32 64, i32 65, i32 66, i32 67, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX2-LABEL: 'test_vXi8'
@@ -328,7 +328,7 @@ define void @test_vXi8(<2 x i8> %src16, <4 x i8> %src32, <8 x i8> %src64, <16 x
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <8 x i8> %src64, <8 x i8> %src64_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <16 x i8> %src128, <16 x i8> %src128_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> %src256_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> %src512_1, <64 x i32> <i32 64, i32 65, i32 66, i32 67, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> %src512_1, <64 x i32> <i32 64, i32 65, i32 66, i32 67, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512-LABEL: 'test_vXi8'
diff --git a/llvm/test/Analysis/CostModel/X86/shuffle-select.ll b/llvm/test/Analysis/CostModel/X86/shuffle-select.ll
index 7b882e3..c483d5d 100644
--- a/llvm/test/Analysis/CostModel/X86/shuffle-select.ll
+++ b/llvm/test/Analysis/CostModel/X86/shuffle-select.ll
@@ -20,30 +20,30 @@
 define void @test_vXf64(<2 x double> %src128, <4 x double> %src256, <8 x double> %src512, <16 x double> %src1024, <2 x double> %src128_1, <4 x double> %src256_1, <8 x double> %src512_1, <16 x double> %src1024_1) {
 ; SSE-LABEL: 'test_vXf64'
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <2 x double> %src128, <2 x double> %src128_1, <2 x i32> <i32 0, i32 3>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <4 x double> %src256, <4 x double> %src256_1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <8 x double> %src512, <8 x double> %src512_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V1024 = shufflevector <16 x double> %src1024, <16 x double> %src1024_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <4 x double> %src256, <4 x double> %src256_1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512 = shufflevector <8 x double> %src512, <8 x double> %src512_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1024 = shufflevector <16 x double> %src1024, <16 x double> %src1024_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; XOP-LABEL: 'test_vXf64'
 ; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <2 x double> %src128, <2 x double> %src128_1, <2 x i32> <i32 0, i32 3>
 ; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <4 x double> %src256, <4 x double> %src256_1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
-; XOP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <8 x double> %src512, <8 x double> %src512_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
-; XOP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V1024 = shufflevector <16 x double> %src1024, <16 x double> %src1024_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <8 x double> %src512, <8 x double> %src512_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
+; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024 = shufflevector <16 x double> %src1024, <16 x double> %src1024_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; XOP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; AVX-LABEL: 'test_vXf64'
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <2 x double> %src128, <2 x double> %src128_1, <2 x i32> <i32 0, i32 3>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <4 x double> %src256, <4 x double> %src256_1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <8 x double> %src512, <8 x double> %src512_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V1024 = shufflevector <16 x double> %src1024, <16 x double> %src1024_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <8 x double> %src512, <8 x double> %src512_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024 = shufflevector <16 x double> %src1024, <16 x double> %src1024_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; AVX512-LABEL: 'test_vXf64'
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <2 x double> %src128, <2 x double> %src128_1, <2 x i32> <i32 0, i32 3>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <4 x double> %src256, <4 x double> %src256_1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <8 x double> %src512, <8 x double> %src512_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1024 = shufflevector <16 x double> %src1024, <16 x double> %src1024_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024 = shufflevector <16 x double> %src1024, <16 x double> %src1024_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
   %V128 = shufflevector <2 x double> %src128, <2 x double> %src128_1, <2 x i32> <i32 0, i32 3>
@@ -56,30 +56,30 @@ define void @test_vXf64(<2 x double> %src128, <4 x double> %src256, <8 x double>
 define void @test_vXi64(<2 x i64> %src128, <4 x i64> %src256, <8 x i64> %src512, <16 x i64> %src1024, <2 x i64> %src128_1, <4 x i64> %src256_1, <8 x i64> %src512_1, <16 x i64> %src1024_1) {
 ; SSE-LABEL: 'test_vXi64'
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <2 x i64> %src128, <2 x i64> %src128_1, <2 x i32> <i32 0, i32 3>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <4 x i64> %src256, <4 x i64> %src256_1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <8 x i64> %src512, <8 x i64> %src512_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V1024 = shufflevector <16 x i64> %src1024, <16 x i64> %src1024_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <4 x i64> %src256, <4 x i64> %src256_1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512 = shufflevector <8 x i64> %src512, <8 x i64> %src512_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1024 = shufflevector <16 x i64> %src1024, <16 x i64> %src1024_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; XOP-LABEL: 'test_vXi64'
 ; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <2 x i64> %src128, <2 x i64> %src128_1, <2 x i32> <i32 0, i32 3>
 ; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <4 x i64> %src256, <4 x i64> %src256_1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
-; XOP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <8 x i64> %src512, <8 x i64> %src512_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
-; XOP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V1024 = shufflevector <16 x i64> %src1024, <16 x i64> %src1024_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <8 x i64> %src512, <8 x i64> %src512_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
+; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024 = shufflevector <16 x i64> %src1024, <16 x i64> %src1024_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; XOP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; AVX-LABEL: 'test_vXi64'
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <2 x i64> %src128, <2 x i64> %src128_1, <2 x i32> <i32 0, i32 3>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <4 x i64> %src256, <4 x i64> %src256_1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <8 x i64> %src512, <8 x i64> %src512_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V1024 = shufflevector <16 x i64> %src1024, <16 x i64> %src1024_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <8 x i64> %src512, <8 x i64> %src512_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024 = shufflevector <16 x i64> %src1024, <16 x i64> %src1024_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; AVX512-LABEL: 'test_vXi64'
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <2 x i64> %src128, <2 x i64> %src128_1, <2 x i32> <i32 0, i32 3>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <4 x i64> %src256, <4 x i64> %src256_1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <8 x i64> %src512, <8 x i64> %src512_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1024 = shufflevector <16 x i64> %src1024, <16 x i64> %src1024_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024 = shufflevector <16 x i64> %src1024, <16 x i64> %src1024_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
   %V128 = shufflevector <2 x i64> %src128, <2 x i64> %src128_1, <2 x i32> <i32 0, i32 3>
@@ -93,41 +93,41 @@ define void @test_vXf32(<2 x float> %src64, <4 x float> %src128, <8 x float> %sr
 ; SSE2-LABEL: 'test_vXf32'
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64 = shufflevector <2 x float> %src64, <2 x float> %src64_1, <2 x i32> <i32 0, i32 3>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V128 = shufflevector <4 x float> %src128, <4 x float> %src128_1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V256 = shufflevector <8 x float> %src256, <8 x float> %src256_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V512 = shufflevector <16 x float> %src512, <16 x float> %src512_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V1024 = shufflevector <32 x float> %src1024, <32 x float> %src1024_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <8 x float> %src256, <8 x float> %src256_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <16 x float> %src512, <16 x float> %src512_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024 = shufflevector <32 x float> %src1024, <32 x float> %src1024_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; SSSE3-LABEL: 'test_vXf32'
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64 = shufflevector <2 x float> %src64, <2 x float> %src64_1, <2 x i32> <i32 0, i32 3>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V128 = shufflevector <4 x float> %src128, <4 x float> %src128_1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V256 = shufflevector <8 x float> %src256, <8 x float> %src256_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V512 = shufflevector <16 x float> %src512, <16 x float> %src512_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V1024 = shufflevector <32 x float> %src1024, <32 x float> %src1024_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <8 x float> %src256, <8 x float> %src256_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <16 x float> %src512, <16 x float> %src512_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024 = shufflevector <32 x float> %src1024, <32 x float> %src1024_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; SSE42-LABEL: 'test_vXf32'
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <2 x float> %src64, <2 x float> %src64_1, <2 x i32> <i32 0, i32 3>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <4 x float> %src128, <4 x float> %src128_1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <8 x float> %src256, <8 x float> %src256_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <16 x float> %src512, <16 x float> %src512_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V1024 = shufflevector <32 x float> %src1024, <32 x float> %src1024_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <8 x float> %src256, <8 x float> %src256_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <16 x float> %src512, <16 x float> %src512_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024 = shufflevector <32 x float> %src1024, <32 x float> %src1024_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; XOP-LABEL: 'test_vXf32'
 ; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <2 x float> %src64, <2 x float> %src64_1, <2 x i32> <i32 0, i32 3>
 ; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <4 x float> %src128, <4 x float> %src128_1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
 ; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <8 x float> %src256, <8 x float> %src256_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
-; XOP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <16 x float> %src512, <16 x float> %src512_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; XOP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V1024 = shufflevector <32 x float> %src1024, <32 x float> %src1024_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <16 x float> %src512, <16 x float> %src512_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024 = shufflevector <32 x float> %src1024, <32 x float> %src1024_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
 ; XOP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; AVX-LABEL: 'test_vXf32'
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <2 x float> %src64, <2 x float> %src64_1, <2 x i32> <i32 0, i32 3>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <4 x float> %src128, <4 x float> %src128_1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <8 x float> %src256, <8 x float> %src256_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <16 x float> %src512, <16 x float> %src512_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V1024 = shufflevector <32 x float> %src1024, <32 x float> %src1024_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <16 x float> %src512, <16 x float> %src512_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024 = shufflevector <32 x float> %src1024, <32 x float> %src1024_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; AVX512-LABEL: 'test_vXf32'
@@ -135,7 +135,7 @@ define void @test_vXf32(<2 x float> %src64, <4 x float> %src128, <8 x float> %sr
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <4 x float> %src128, <4 x float> %src128_1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <8 x float> %src256, <8 x float> %src256_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <16 x float> %src512, <16 x float> %src512_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1024 = shufflevector <32 x float> %src1024, <32 x float> %src1024_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024 = shufflevector <32 x float> %src1024, <32 x float> %src1024_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
   %V64 = shufflevector <2 x float> %src64, <2 x float> %src64_1, <2 x i32> <i32 0, i32 3>
@@ -150,41 +150,41 @@ define void @test_vXi32(<2 x i32> %src64, <4 x i32> %src128, <8 x i32> %src256,
 ; SSE2-LABEL: 'test_vXi32'
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64 = shufflevector <2 x i32> %src64, <2 x i32> %src64_1, <2 x i32> <i32 0, i32 3>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V128 = shufflevector <4 x i32> %src128, <4 x i32> %src128_1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V256 = shufflevector <8 x i32> %src256, <8 x i32> %src256_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V512 = shufflevector <16 x i32> %src512, <16 x i32> %src512_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V1024 = shufflevector <32 x i32> %src1024, <32 x i32> %src1024_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <8 x i32> %src256, <8 x i32> %src256_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <16 x i32> %src512, <16 x i32> %src512_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024 = shufflevector <32 x i32> %src1024, <32 x i32> %src1024_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; SSSE3-LABEL: 'test_vXi32'
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64 = shufflevector <2 x i32> %src64, <2 x i32> %src64_1, <2 x i32> <i32 0, i32 3>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V128 = shufflevector <4 x i32> %src128, <4 x i32> %src128_1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V256 = shufflevector <8 x i32> %src256, <8 x i32> %src256_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V512 = shufflevector <16 x i32> %src512, <16 x i32> %src512_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V1024 = shufflevector <32 x i32> %src1024, <32 x i32> %src1024_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <8 x i32> %src256, <8 x i32> %src256_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <16 x i32> %src512, <16 x i32> %src512_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024 = shufflevector <32 x i32> %src1024, <32 x i32> %src1024_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; SSE42-LABEL: 'test_vXi32'
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <2 x i32> %src64, <2 x i32> %src64_1, <2 x i32> <i32 0, i32 3>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <4 x i32> %src128, <4 x i32> %src128_1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <8 x i32> %src256, <8 x i32> %src256_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <16 x i32> %src512, <16 x i32> %src512_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V1024 = shufflevector <32 x i32> %src1024, <32 x i32> %src1024_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <8 x i32> %src256, <8 x i32> %src256_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <16 x i32> %src512, <16 x i32> %src512_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024 = shufflevector <32 x i32> %src1024, <32 x i32> %src1024_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; XOP-LABEL: 'test_vXi32'
 ; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <2 x i32> %src64, <2 x i32> %src64_1, <2 x i32> <i32 0, i32 3>
 ; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <4 x i32> %src128, <4 x i32> %src128_1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
 ; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <8 x i32> %src256, <8 x i32> %src256_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
-; XOP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <16 x i32> %src512, <16 x i32> %src512_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; XOP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V1024 = shufflevector <32 x i32> %src1024, <32 x i32> %src1024_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <16 x i32> %src512, <16 x i32> %src512_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024 = shufflevector <32 x i32> %src1024, <32 x i32> %src1024_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
 ; XOP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; AVX-LABEL: 'test_vXi32'
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <2 x i32> %src64, <2 x i32> %src64_1, <2 x i32> <i32 0, i32 3>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <4 x i32> %src128, <4 x i32> %src128_1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <8 x i32> %src256, <8 x i32> %src256_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <16 x i32> %src512, <16 x i32> %src512_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V1024 = shufflevector <32 x i32> %src1024, <32 x i32> %src1024_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <16 x i32> %src512, <16 x i32> %src512_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024 = shufflevector <32 x i32> %src1024, <32 x i32> %src1024_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; AVX512-LABEL: 'test_vXi32'
@@ -192,7 +192,7 @@ define void @test_vXi32(<2 x i32> %src64, <4 x i32> %src128, <8 x i32> %src256,
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <4 x i32> %src128, <4 x i32> %src128_1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <8 x i32> %src256, <8 x i32> %src256_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <16 x i32> %src512, <16 x i32> %src512_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1024 = shufflevector <32 x i32> %src1024, <32 x i32> %src1024_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024 = shufflevector <32 x i32> %src1024, <32 x i32> %src1024_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
   %V64 = shufflevector <2 x i32> %src64, <2 x i32> %src64_1, <2 x i32> <i32 0, i32 3>
@@ -208,27 +208,27 @@ define void @test_vXi16(<2 x i16> %src32, <4 x i16> %src64, <8 x i16> %src128, <
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V32 = shufflevector <2 x i16> %src32, <2 x i16> %src32_1, <2 x i32> <i32 0, i32 3>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V64 = shufflevector <4 x i16> %src64, <4 x i16> %src64_1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V128 = shufflevector <8 x i16> %src128, <8 x i16> %src128_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> %src256_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> %src512_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V1024 = shufflevector <64 x i16> %src1024, <64 x i16> %src1024_1, <64 x i32> <i32 64, i32 65, i32 66, i32 67, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> %src256_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> %src512_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024 = shufflevector <64 x i16> %src1024, <64 x i16> %src1024_1, <64 x i32> <i32 64, i32 65, i32 66, i32 67, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; SSSE3-LABEL: 'test_vXi16'
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V32 = shufflevector <2 x i16> %src32, <2 x i16> %src32_1, <2 x i32> <i32 0, i32 3>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V64 = shufflevector <4 x i16> %src64, <4 x i16> %src64_1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V128 = shufflevector <8 x i16> %src128, <8 x i16> %src128_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> %src256_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> %src512_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V1024 = shufflevector <64 x i16> %src1024, <64 x i16> %src1024_1, <64 x i32> <i32 64, i32 65, i32 66, i32 67, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> %src256_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> %src512_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024 = shufflevector <64 x i16> %src1024, <64 x i16> %src1024_1, <64 x i32> <i32 64, i32 65, i32 66, i32 67, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; SSE42-LABEL: 'test_vXi16'
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <2 x i16> %src32, <2 x i16> %src32_1, <2 x i32> <i32 0, i32 3>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <4 x i16> %src64, <4 x i16> %src64_1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x i16> %src128, <8 x i16> %src128_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> %src256_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> %src512_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V1024 = shufflevector <64 x i16> %src1024, <64 x i16> %src1024_1, <64 x i32> <i32 64, i32 65, i32 66, i32 67, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> %src256_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> %src512_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024 = shufflevector <64 x i16> %src1024, <64 x i16> %src1024_1, <64 x i32> <i32 64, i32 65, i32 66, i32 67, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; XOP-LABEL: 'test_vXi16'
@@ -236,8 +236,8 @@ define void @test_vXi16(<2 x i16> %src32, <4 x i16> %src64, <8 x i16> %src128, <
 ; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <4 x i16> %src64, <4 x i16> %src64_1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
 ; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x i16> %src128, <8 x i16> %src128_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
 ; XOP-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> %src256_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; XOP-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> %src512_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
-; XOP-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V1024 = shufflevector <64 x i16> %src1024, <64 x i16> %src1024_1, <64 x i32> <i32 64, i32 65, i32 66, i32 67, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> %src512_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024 = shufflevector <64 x i16> %src1024, <64 x i16> %src1024_1, <64 x i32> <i32 64, i32 65, i32 66, i32 67, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
 ; XOP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; AVX1-LABEL: 'test_vXi16'
@@ -245,8 +245,8 @@ define void @test_vXi16(<2 x i16> %src32, <4 x i16> %src64, <8 x i16> %src128, <
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <4 x i16> %src64, <4 x i16> %src64_1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x i16> %src128, <8 x i16> %src128_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> %src256_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> %src512_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V1024 = shufflevector <64 x i16> %src1024, <64 x i16> %src1024_1, <64 x i32> <i32 64, i32 65, i32 66, i32 67, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> %src512_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024 = shufflevector <64 x i16> %src1024, <64 x i16> %src1024_1, <64 x i32> <i32 64, i32 65, i32 66, i32 67, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; AVX2-LABEL: 'test_vXi16'
@@ -254,8 +254,8 @@ define void @test_vXi16(<2 x i16> %src32, <4 x i16> %src64, <8 x i16> %src128, <
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <4 x i16> %src64, <4 x i16> %src64_1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x i16> %src128, <8 x i16> %src128_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> %src256_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> %src512_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V1024 = shufflevector <64 x i16> %src1024, <64 x i16> %src1024_1, <64 x i32> <i32 64, i32 65, i32 66, i32 67, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> %src512_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024 = shufflevector <64 x i16> %src1024, <64 x i16> %src1024_1, <64 x i32> <i32 64, i32 65, i32 66, i32 67, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; AVX512-LABEL: 'test_vXi16'
@@ -264,7 +264,7 @@ define void @test_vXi16(<2 x i16> %src32, <4 x i16> %src64, <8 x i16> %src128, <
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x i16> %src128, <8 x i16> %src128_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> %src256_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> %src512_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1024 = shufflevector <64 x i16> %src1024, <64 x i16> %src1024_1, <64 x i32> <i32 64, i32 65, i32 66, i32 67, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024 = shufflevector <64 x i16> %src1024, <64 x i16> %src1024_1, <64 x i32> <i32 64, i32 65, i32 66, i32 67, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
   %V32 = shufflevector <2 x i16> %src32, <2 x i16> %src32_1, <2 x i32> <i32 0, i32 3>
@@ -282,8 +282,8 @@ define void @test_vXi8(<2 x i8> %src16, <4 x i8> %src32, <8 x i8> %src64, <16 x
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V32 = shufflevector <4 x i8> %src32, <4 x i8> %src32_1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V64 = shufflevector <8 x i8> %src64, <8 x i8> %src64_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V128 = shufflevector <16 x i8> %src128, <16 x i8> %src128_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> %src256_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> %src512_1, <64 x i32> <i32 64, i32 65, i32 66, i32 67, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> %src256_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> %src512_1, <64 x i32> <i32 64, i32 65, i32 66, i32 67, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; SSSE3-LABEL: 'test_vXi8'
@@ -291,8 +291,8 @@ define void @test_vXi8(<2 x i8> %src16, <4 x i8> %src32, <8 x i8> %src64, <16 x
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V32 = shufflevector <4 x i8> %src32, <4 x i8> %src32_1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V64 = shufflevector <8 x i8> %src64, <8 x i8> %src64_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V128 = shufflevector <16 x i8> %src128, <16 x i8> %src128_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> %src256_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> %src512_1, <64 x i32> <i32 64, i32 65, i32 66, i32 67, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> %src256_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> %src512_1, <64 x i32> <i32 64, i32 65, i32 66, i32 67, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; SSE42-LABEL: 'test_vXi8'
@@ -300,8 +300,8 @@ define void @test_vXi8(<2 x i8> %src16, <4 x i8> %src32, <8 x i8> %src64, <16 x
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <4 x i8> %src32, <4 x i8> %src32_1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <8 x i8> %src64, <8 x i8> %src64_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <16 x i8> %src128, <16 x i8> %src128_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> %src256_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> %src512_1, <64 x i32> <i32 64, i32 65, i32 66, i32 67, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> %src256_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> %src512_1, <64 x i32> <i32 64, i32 65, i32 66, i32 67, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; XOP-LABEL: 'test_vXi8'
@@ -310,7 +310,7 @@ define void @test_vXi8(<2 x i8> %src16, <4 x i8> %src32, <8 x i8> %src64, <16 x
 ; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <8 x i8> %src64, <8 x i8> %src64_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
 ; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <16 x i8> %src128, <16 x i8> %src128_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; XOP-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> %src256_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
-; XOP-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> %src512_1, <64 x i32> <i32 64, i32 65, i32 66, i32 67, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; XOP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> %src512_1, <64 x i32> <i32 64, i32 65, i32 66, i32 67, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
 ; XOP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; AVX1-LABEL: 'test_vXi8'
@@ -319,7 +319,7 @@ define void @test_vXi8(<2 x i8> %src16, <4 x i8> %src32, <8 x i8> %src64, <16 x
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <8 x i8> %src64, <8 x i8> %src64_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <16 x i8> %src128, <16 x i8> %src128_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> %src256_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> %src512_1, <64 x i32> <i32 64, i32 65, i32 66, i32 67, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> %src512_1, <64 x i32> <i32 64, i32 65, i32 66, i32 67, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; AVX2-LABEL: 'test_vXi8'
@@ -328,7 +328,7 @@ define void @test_vXi8(<2 x i8> %src16, <4 x i8> %src32, <8 x i8> %src64, <16 x
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <8 x i8> %src64, <8 x i8> %src64_1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <16 x i8> %src128, <16 x i8> %src128_1, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> %src256_1, <32 x i32> <i32 32, i32 33, i32 34, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> %src512_1, <64 x i32> <i32 64, i32 65, i32 66, i32 67, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> %src512_1, <64 x i32> <i32 64, i32 65, i32 66, i32 67, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; AVX512-LABEL: 'test_vXi8'
diff --git a/llvm/test/Analysis/CostModel/X86/shuffle-single-src-latency.ll b/llvm/test/Analysis/CostModel/X86/shuffle-single-src-latency.ll
index 3fa5458..330cbc0 100644
--- a/llvm/test/Analysis/CostModel/X86/shuffle-single-src-latency.ll
+++ b/llvm/test/Analysis/CostModel/X86/shuffle-single-src-latency.ll
@@ -47,7 +47,7 @@ define void @test_vXf64(<2 x double> %src128, <4 x double> %src256, <8 x double>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512-LABEL: 'test_vXf64'
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V128 = shufflevector <2 x double> %src128, <2 x double> undef, <2 x i32> <i32 1, i32 1>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <2 x double> %src128, <2 x double> undef, <2 x i32> <i32 1, i32 1>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V256 = shufflevector <4 x double> %src256, <4 x double> undef, <4 x i32> <i32 3, i32 3, i32 1, i32 0>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V512 = shufflevector <8 x double> %src512, <8 x double> undef, <8 x i32> <i32 7, i32 6, i32 6, i32 4, i32 3, i32 2, i32 1, i32 0>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V1024 = shufflevector <16 x double> %src1024, <16 x double> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 13, i32 11, i32 10, i32 8, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
@@ -86,7 +86,7 @@ define void @test_vXi64(<2 x i64> %src128, <4 x i64> %src256, <8 x i64> %src512)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512-LABEL: 'test_vXi64'
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V128 = shufflevector <2 x i64> %src128, <2 x i64> undef, <2 x i32> <i32 1, i32 1>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <2 x i64> %src128, <2 x i64> undef, <2 x i32> <i32 1, i32 1>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V256 = shufflevector <4 x i64> %src256, <4 x i64> undef, <4 x i32> <i32 3, i32 3, i32 1, i32 0>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V512 = shufflevector <8 x i64> %src512, <8 x i64> undef, <8 x i32> <i32 7, i32 6, i32 6, i32 4, i32 3, i32 2, i32 1, i32 0>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
@@ -127,7 +127,7 @@ define void @test_vXf32(<2 x float> %src64, <4 x float> %src128, <8 x float> %sr
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512-LABEL: 'test_vXf32'
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V64 = shufflevector <2 x float> %src64, <2 x float> undef, <2 x i32> <i32 1, i32 1>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <2 x float> %src64, <2 x float> undef, <2 x i32> <i32 1, i32 1>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V128 = shufflevector <4 x float> %src128, <4 x float> undef, <4 x i32> <i32 3, i32 3, i32 1, i32 0>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V256 = shufflevector <8 x float> %src256, <8 x float> undef, <8 x i32> <i32 7, i32 6, i32 6, i32 4, i32 3, i32 2, i32 1, i32 0>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V512 = shufflevector <16 x float> %src512, <16 x float> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 13, i32 11, i32 10, i32 8, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
@@ -174,7 +174,7 @@ define void @test_vXi32(<2 x i32> %src64, <4 x i32> %src128, <8 x i32> %src256,
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512-LABEL: 'test_vXi32'
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V64 = shufflevector <2 x i32> %src64, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <2 x i32> %src64, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V128 = shufflevector <4 x i32> %src128, <4 x i32> undef, <4 x i32> <i32 3, i32 3, i32 1, i32 0>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V256 = shufflevector <8 x i32> %src256, <8 x i32> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 5, i32 3, i32 2, i32 1, i32 0>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V512 = shufflevector <16 x i32> %src512, <16 x i32> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 13, i32 10, i32 9, i32 8, i32 8, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
@@ -336,7 +336,7 @@ define void @test_vXi8(<2 x i8> %src16, <4 x i8> %src32, <8 x i8> %src64, <16 x
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512F-LABEL: 'test_vXi8'
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V16 = shufflevector <2 x i8> %src16, <2 x i8> undef, <2 x i32> <i32 1, i32 1>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16 = shufflevector <2 x i8> %src16, <2 x i8> undef, <2 x i32> <i32 1, i32 1>
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V32 = shufflevector <4 x i8> %src32, <4 x i8> undef, <4 x i32> <i32 3, i32 3, i32 1, i32 1>
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V64 = shufflevector <8 x i8> %src64, <8 x i8> undef, <8 x i32> <i32 7, i32 7, i32 5, i32 5, i32 3, i32 3, i32 1, i32 1>
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V128 = shufflevector <16 x i8> %src128, <16 x i8> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 11, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
@@ -345,7 +345,7 @@ define void @test_vXi8(<2 x i8> %src16, <4 x i8> %src32, <8 x i8> %src64, <16 x
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512BW-LABEL: 'test_vXi8'
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V16 = shufflevector <2 x i8> %src16, <2 x i8> undef, <2 x i32> <i32 1, i32 1>
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16 = shufflevector <2 x i8> %src16, <2 x i8> undef, <2 x i32> <i32 1, i32 1>
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V32 = shufflevector <4 x i8> %src32, <4 x i8> undef, <4 x i32> <i32 3, i32 3, i32 1, i32 1>
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V64 = shufflevector <8 x i8> %src64, <8 x i8> undef, <8 x i32> <i32 7, i32 7, i32 5, i32 5, i32 3, i32 3, i32 1, i32 1>
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V128 = shufflevector <16 x i8> %src128, <16 x i8> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 11, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
@@ -354,7 +354,7 @@ define void @test_vXi8(<2 x i8> %src16, <4 x i8> %src32, <8 x i8> %src64, <16 x
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512VBMI-LABEL: 'test_vXi8'
-; AVX512VBMI-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V16 = shufflevector <2 x i8> %src16, <2 x i8> undef, <2 x i32> <i32 1, i32 1>
+; AVX512VBMI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16 = shufflevector <2 x i8> %src16, <2 x i8> undef, <2 x i32> <i32 1, i32 1>
 ; AVX512VBMI-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V32 = shufflevector <4 x i8> %src32, <4 x i8> undef, <4 x i32> <i32 3, i32 3, i32 1, i32 1>
 ; AVX512VBMI-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V64 = shufflevector <8 x i8> %src64, <8 x i8> undef, <8 x i32> <i32 7, i32 7, i32 5, i32 5, i32 3, i32 3, i32 1, i32 1>
 ; AVX512VBMI-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V128 = shufflevector <16 x i8> %src128, <16 x i8> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 11, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
diff --git a/llvm/test/Analysis/CostModel/X86/shuffle-splat-codesize.ll b/llvm/test/Analysis/CostModel/X86/shuffle-splat-codesize.ll
index 39c935f..0215f65 100644
--- a/llvm/test/Analysis/CostModel/X86/shuffle-splat-codesize.ll
+++ b/llvm/test/Analysis/CostModel/X86/shuffle-splat-codesize.ll
@@ -4,9 +4,9 @@
 ; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=code-size -mattr=+sse4.2 | FileCheck %s -check-prefixes=SSE,SSE42
 ; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=code-size -mattr=+avx | FileCheck %s -check-prefixes=AVX1
 ; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=code-size -mattr=+avx2 | FileCheck %s -check-prefixes=AVX2
-; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=code-size -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512,AVX512F
-; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=code-size -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX512,AVX512BW
-; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=code-size -mattr=+avx512f,+avx512bw,+avx512vbmi | FileCheck %s --check-prefixes=AVX512,AVX512VBMI
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=code-size -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=code-size -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX512
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=code-size -mattr=+avx512f,+avx512bw,+avx512vbmi | FileCheck %s --check-prefixes=AVX512
 ;
 ; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=code-size -mcpu=slm | FileCheck %s --check-prefixes=SSE,SSE42
 ; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=code-size -mcpu=goldmont | FileCheck %s --check-prefixes=SSE,SSE42
@@ -19,20 +19,20 @@
 define void @test_vXf64(<2 x double> %src128, <4 x double> %src256, <8 x double> %src512) {
 ; SSE-LABEL: 'test_vXf64'
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <2 x double> %src128, <2 x double> undef, <2 x i32> <i32 1, i32 1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <4 x double> %src256, <4 x double> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <8 x double> %src512, <8 x double> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <4 x double> %src256, <4 x double> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <8 x double> %src512, <8 x double> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX1-LABEL: 'test_vXf64'
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <2 x double> %src128, <2 x double> undef, <2 x i32> <i32 1, i32 1>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <4 x double> %src256, <4 x double> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <8 x double> %src512, <8 x double> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <8 x double> %src512, <8 x double> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX2-LABEL: 'test_vXf64'
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <2 x double> %src128, <2 x double> undef, <2 x i32> <i32 1, i32 1>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <4 x double> %src256, <4 x double> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <8 x double> %src512, <8 x double> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <8 x double> %src512, <8 x double> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512-LABEL: 'test_vXf64'
@@ -50,20 +50,20 @@ define void @test_vXf64(<2 x double> %src128, <4 x double> %src256, <8 x double>
 define void @test_vXi64(<2 x i64> %src128, <4 x i64> %src256, <8 x i64> %src512) {
 ; SSE-LABEL: 'test_vXi64'
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <2 x i64> %src128, <2 x i64> undef, <2 x i32> <i32 1, i32 1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <4 x i64> %src256, <4 x i64> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <8 x i64> %src512, <8 x i64> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <4 x i64> %src256, <4 x i64> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <8 x i64> %src512, <8 x i64> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX1-LABEL: 'test_vXi64'
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <2 x i64> %src128, <2 x i64> undef, <2 x i32> <i32 1, i32 1>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <4 x i64> %src256, <4 x i64> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <8 x i64> %src512, <8 x i64> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <8 x i64> %src512, <8 x i64> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX2-LABEL: 'test_vXi64'
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <2 x i64> %src128, <2 x i64> undef, <2 x i32> <i32 1, i32 1>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <4 x i64> %src256, <4 x i64> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <8 x i64> %src512, <8 x i64> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <8 x i64> %src512, <8 x i64> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512-LABEL: 'test_vXi64'
@@ -82,22 +82,22 @@ define void @test_vXf32(<2 x float> %src64, <4 x float> %src128, <8 x float> %sr
 ; SSE-LABEL: 'test_vXf32'
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <2 x float> %src64, <2 x float> undef, <2 x i32> <i32 1, i32 1>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <4 x float> %src128, <4 x float> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <8 x float> %src256, <8 x float> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <16 x float> %src512, <16 x float> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <8 x float> %src256, <8 x float> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <16 x float> %src512, <16 x float> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX1-LABEL: 'test_vXf32'
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <2 x float> %src64, <2 x float> undef, <2 x i32> <i32 1, i32 1>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <4 x float> %src128, <4 x float> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V256 = shufflevector <8 x float> %src256, <8 x float> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V512 = shufflevector <16 x float> %src512, <16 x float> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <8 x float> %src256, <8 x float> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <16 x float> %src512, <16 x float> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX2-LABEL: 'test_vXf32'
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <2 x float> %src64, <2 x float> undef, <2 x i32> <i32 1, i32 1>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <4 x float> %src128, <4 x float> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <8 x float> %src256, <8 x float> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <16 x float> %src512, <16 x float> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <16 x float> %src512, <16 x float> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512-LABEL: 'test_vXf32'
@@ -118,22 +118,22 @@ define void @test_vXi32(<2 x i32> %src64, <4 x i32> %src128, <8 x i32> %src256,
 ; SSE-LABEL: 'test_vXi32'
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <2 x i32> %src64, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <4 x i32> %src128, <4 x i32> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <8 x i32> %src256, <8 x i32> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <16 x i32> %src512, <16 x i32> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <8 x i32> %src256, <8 x i32> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <16 x i32> %src512, <16 x i32> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX1-LABEL: 'test_vXi32'
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <2 x i32> %src64, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <4 x i32> %src128, <4 x i32> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V256 = shufflevector <8 x i32> %src256, <8 x i32> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V512 = shufflevector <16 x i32> %src512, <16 x i32> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <8 x i32> %src256, <8 x i32> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <16 x i32> %src512, <16 x i32> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX2-LABEL: 'test_vXi32'
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <2 x i32> %src64, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <4 x i32> %src128, <4 x i32> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <8 x i32> %src256, <8 x i32> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <16 x i32> %src512, <16 x i32> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <16 x i32> %src512, <16 x i32> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512-LABEL: 'test_vXi32'
@@ -154,58 +154,42 @@ define void @test_vXf16(<2 x half> %src32, <4 x half> %src64, <8 x half> %src128
 ; SSE2-LABEL: 'test_vXf16'
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V32 = shufflevector <2 x half> %src32, <2 x half> undef, <2 x i32> <i32 1, i32 1>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V64 = shufflevector <4 x half> %src64, <4 x half> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V128 = shufflevector <8 x half> %src128, <8 x half> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V256 = shufflevector <16 x half> %src256, <16 x half> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V512 = shufflevector <32 x half> %src512, <32 x half> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V128 = shufflevector <8 x half> %src128, <8 x half> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <16 x half> %src256, <16 x half> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <32 x half> %src512, <32 x half> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; SSSE3-LABEL: 'test_vXf16'
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V32 = shufflevector <2 x half> %src32, <2 x half> undef, <2 x i32> <i32 1, i32 1>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V64 = shufflevector <4 x half> %src64, <4 x half> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x half> %src128, <8 x half> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <16 x half> %src256, <16 x half> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <32 x half> %src512, <32 x half> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <16 x half> %src256, <16 x half> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <32 x half> %src512, <32 x half> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; SSE42-LABEL: 'test_vXf16'
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V32 = shufflevector <2 x half> %src32, <2 x half> undef, <2 x i32> <i32 1, i32 1>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V64 = shufflevector <4 x half> %src64, <4 x half> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x half> %src128, <8 x half> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <16 x half> %src256, <16 x half> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <32 x half> %src512, <32 x half> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <16 x half> %src256, <16 x half> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <32 x half> %src512, <32 x half> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX2-LABEL: 'test_vXf16'
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V32 = shufflevector <2 x half> %src32, <2 x half> undef, <2 x i32> <i32 1, i32 1>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V64 = shufflevector <4 x half> %src64, <4 x half> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x half> %src128, <8 x half> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V256 = shufflevector <16 x half> %src256, <16 x half> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V512 = shufflevector <32 x half> %src512, <32 x half> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <16 x half> %src256, <16 x half> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <32 x half> %src512, <32 x half> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
-; AVX512F-LABEL: 'test_vXf16'
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <2 x half> %src32, <2 x half> undef, <2 x i32> <i32 1, i32 1>
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <4 x half> %src64, <4 x half> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x half> %src128, <8 x half> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V256 = shufflevector <16 x half> %src256, <16 x half> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V512 = shufflevector <32 x half> %src512, <32 x half> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
-;
-; AVX512BW-LABEL: 'test_vXf16'
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <2 x half> %src32, <2 x half> undef, <2 x i32> <i32 1, i32 1>
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <4 x half> %src64, <4 x half> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x half> %src128, <8 x half> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <16 x half> %src256, <16 x half> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <32 x half> %src512, <32 x half> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
-;
-; AVX512VBMI-LABEL: 'test_vXf16'
-; AVX512VBMI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <2 x half> %src32, <2 x half> undef, <2 x i32> <i32 1, i32 1>
-; AVX512VBMI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <4 x half> %src64, <4 x half> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-; AVX512VBMI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x half> %src128, <8 x half> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512VBMI-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <16 x half> %src256, <16 x half> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512VBMI-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <32 x half> %src512, <32 x half> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512VBMI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+; AVX512-LABEL: 'test_vXf16'
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <2 x half> %src32, <2 x half> undef, <2 x i32> <i32 1, i32 1>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <4 x half> %src64, <4 x half> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x half> %src128, <8 x half> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <16 x half> %src256, <16 x half> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <32 x half> %src512, <32 x half> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
   %V32  = shufflevector <2 x half> %src32, <2 x half> undef, <2 x i32> <i32 1, i32 1>
   %V64  = shufflevector <4 x half> %src64, <4 x half> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
@@ -260,66 +244,50 @@ define void @test_vXi16(<2 x i16> %src32, <4 x i16> %src64, <8 x i16> %src128, <
 ; SSE2-LABEL: 'test_vXi16'
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <2 x i16> %src32, <2 x i16> undef, <2 x i32> <i32 1, i32 1>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <4 x i16> %src64, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V128 = shufflevector <8 x i16> %src128, <8 x i16> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V128 = shufflevector <8 x i16> %src128, <8 x i16> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; SSSE3-LABEL: 'test_vXi16'
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <2 x i16> %src32, <2 x i16> undef, <2 x i32> <i32 1, i32 1>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <4 x i16> %src64, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x i16> %src128, <8 x i16> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; SSE42-LABEL: 'test_vXi16'
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <2 x i16> %src32, <2 x i16> undef, <2 x i32> <i32 1, i32 1>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <4 x i16> %src64, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x i16> %src128, <8 x i16> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX1-LABEL: 'test_vXi16'
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <2 x i16> %src32, <2 x i16> undef, <2 x i32> <i32 1, i32 1>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <4 x i16> %src64, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x i16> %src128, <8 x i16> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX2-LABEL: 'test_vXi16'
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <2 x i16> %src32, <2 x i16> undef, <2 x i32> <i32 1, i32 1>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <4 x i16> %src64, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x i16> %src128, <8 x i16> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
-; AVX512F-LABEL: 'test_vXi16'
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <2 x i16> %src32, <2 x i16> undef, <2 x i32> <i32 1, i32 1>
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <4 x i16> %src64, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x i16> %src128, <8 x i16> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
-;
-; AVX512BW-LABEL: 'test_vXi16'
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <2 x i16> %src32, <2 x i16> undef, <2 x i32> <i32 1, i32 1>
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <4 x i16> %src64, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x i16> %src128, <8 x i16> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
-;
-; AVX512VBMI-LABEL: 'test_vXi16'
-; AVX512VBMI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <2 x i16> %src32, <2 x i16> undef, <2 x i32> <i32 1, i32 1>
-; AVX512VBMI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <4 x i16> %src64, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-; AVX512VBMI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x i16> %src128, <8 x i16> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512VBMI-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512VBMI-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512VBMI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+; AVX512-LABEL: 'test_vXi16'
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <2 x i16> %src32, <2 x i16> undef, <2 x i32> <i32 1, i32 1>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <4 x i16> %src64, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x i16> %src128, <8 x i16> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
   %V32  = shufflevector <2 x i16> %src32, <2 x i16> undef, <2 x i32> <i32 1, i32 1>
   %V64  = shufflevector <4 x i16> %src64, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
@@ -332,11 +300,11 @@ define void @test_vXi16(<2 x i16> %src32, <4 x i16> %src64, <8 x i16> %src128, <
 define void @test_vXi8(<2 x i8> %src16, <4 x i8> %src32, <8 x i8> %src64, <16 x i8> %src128, <32 x i8> %src256, <64 x i8> %src512) {
 ; SSE2-LABEL: 'test_vXi8'
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16 = shufflevector <2 x i8> %src16, <2 x i8> undef, <2 x i32> <i32 1, i32 1>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V32 = shufflevector <4 x i8> %src32, <4 x i8> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V64 = shufflevector <8 x i8> %src64, <8 x i8> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V128 = shufflevector <16 x i8> %src128, <16 x i8> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32 = shufflevector <4 x i8> %src32, <4 x i8> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64 = shufflevector <8 x i8> %src64, <8 x i8> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V128 = shufflevector <16 x i8> %src128, <16 x i8> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; SSSE3-LABEL: 'test_vXi8'
@@ -344,8 +312,8 @@ define void @test_vXi8(<2 x i8> %src16, <4 x i8> %src32, <8 x i8> %src64, <16 x
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <4 x i8> %src32, <4 x i8> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <8 x i8> %src64, <8 x i8> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <16 x i8> %src128, <16 x i8> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; SSE42-LABEL: 'test_vXi8'
@@ -353,8 +321,8 @@ define void @test_vXi8(<2 x i8> %src16, <4 x i8> %src32, <8 x i8> %src64, <16 x
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <4 x i8> %src32, <4 x i8> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <8 x i8> %src64, <8 x i8> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <16 x i8> %src128, <16 x i8> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX1-LABEL: 'test_vXi8'
@@ -362,8 +330,8 @@ define void @test_vXi8(<2 x i8> %src16, <4 x i8> %src32, <8 x i8> %src64, <16 x
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <4 x i8> %src32, <4 x i8> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <8 x i8> %src64, <8 x i8> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <16 x i8> %src128, <16 x i8> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX2-LABEL: 'test_vXi8'
@@ -371,36 +339,18 @@ define void @test_vXi8(<2 x i8> %src16, <4 x i8> %src32, <8 x i8> %src64, <16 x
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <4 x i8> %src32, <4 x i8> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <8 x i8> %src64, <8 x i8> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <16 x i8> %src128, <16 x i8> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
-; AVX512F-LABEL: 'test_vXi8'
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16 = shufflevector <2 x i8> %src16, <2 x i8> undef, <2 x i32> <i32 1, i32 1>
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <4 x i8> %src32, <4 x i8> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <8 x i8> %src64, <8 x i8> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <16 x i8> %src128, <16 x i8> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
-;
-; AVX512BW-LABEL: 'test_vXi8'
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16 = shufflevector <2 x i8> %src16, <2 x i8> undef, <2 x i32> <i32 1, i32 1>
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <4 x i8> %src32, <4 x i8> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <8 x i8> %src64, <8 x i8> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <16 x i8> %src128, <16 x i8> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
-;
-; AVX512VBMI-LABEL: 'test_vXi8'
-; AVX512VBMI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16 = shufflevector <2 x i8> %src16, <2 x i8> undef, <2 x i32> <i32 1, i32 1>
-; AVX512VBMI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <4 x i8> %src32, <4 x i8> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-; AVX512VBMI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <8 x i8> %src64, <8 x i8> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512VBMI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <16 x i8> %src128, <16 x i8> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512VBMI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512VBMI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512VBMI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+; AVX512-LABEL: 'test_vXi8'
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16 = shufflevector <2 x i8> %src16, <2 x i8> undef, <2 x i32> <i32 1, i32 1>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <4 x i8> %src32, <4 x i8> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <8 x i8> %src64, <8 x i8> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <16 x i8> %src128, <16 x i8> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
   %V16  = shufflevector <2 x i8> %src16, <2 x i8> undef, <2 x i32> <i32 1, i32 1>
   %V32  = shufflevector <4 x i8> %src32, <4 x i8> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
@@ -415,10 +365,10 @@ define void @test_vXi1(<2 x i1> %src2, <4 x i1> %src4, <8 x i1> %src8, <16 x i1>
 ; SSE2-LABEL: 'test_vXi1'
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2 = shufflevector <2 x i1> %src2, <2 x i1> undef, <2 x i32> <i32 1, i32 1>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = shufflevector <4 x i1> %src4, <4 x i1> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8 = shufflevector <8 x i1> %src8, <8 x i1> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V16 = shufflevector <16 x i1> %src16, <16 x i1> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 480 for instruction: %V32 = shufflevector <32 x i1> %src32, <32 x i1> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 960 for instruction: %V64 = shufflevector <64 x i1> %src64, <64 x i1> undef, <64 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8 = shufflevector <8 x i1> %src8, <8 x i1> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V16 = shufflevector <16 x i1> %src16, <16 x i1> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V32 = shufflevector <32 x i1> %src32, <32 x i1> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V64 = shufflevector <64 x i1> %src64, <64 x i1> undef, <64 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; SSSE3-LABEL: 'test_vXi1'
@@ -426,8 +376,8 @@ define void @test_vXi1(<2 x i1> %src2, <4 x i1> %src4, <8 x i1> %src8, <16 x i1>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = shufflevector <4 x i1> %src4, <4 x i1> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = shufflevector <8 x i1> %src8, <8 x i1> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16 = shufflevector <16 x i1> %src16, <16 x i1> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %V32 = shufflevector <32 x i1> %src32, <32 x i1> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %V64 = shufflevector <64 x i1> %src64, <64 x i1> undef, <64 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <32 x i1> %src32, <32 x i1> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <64 x i1> %src64, <64 x i1> undef, <64 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; SSE42-LABEL: 'test_vXi1'
@@ -435,8 +385,8 @@ define void @test_vXi1(<2 x i1> %src2, <4 x i1> %src4, <8 x i1> %src8, <16 x i1>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = shufflevector <4 x i1> %src4, <4 x i1> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = shufflevector <8 x i1> %src8, <8 x i1> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16 = shufflevector <16 x i1> %src16, <16 x i1> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V32 = shufflevector <32 x i1> %src32, <32 x i1> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V64 = shufflevector <64 x i1> %src64, <64 x i1> undef, <64 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <32 x i1> %src32, <32 x i1> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <64 x i1> %src64, <64 x i1> undef, <64 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX1-LABEL: 'test_vXi1'
@@ -444,8 +394,8 @@ define void @test_vXi1(<2 x i1> %src2, <4 x i1> %src4, <8 x i1> %src8, <16 x i1>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = shufflevector <4 x i1> %src4, <4 x i1> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = shufflevector <8 x i1> %src8, <8 x i1> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16 = shufflevector <16 x i1> %src16, <16 x i1> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32 = shufflevector <32 x i1> %src32, <32 x i1> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %V64 = shufflevector <64 x i1> %src64, <64 x i1> undef, <64 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32 = shufflevector <32 x i1> %src32, <32 x i1> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64 = shufflevector <64 x i1> %src64, <64 x i1> undef, <64 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX2-LABEL: 'test_vXi1'
@@ -453,17 +403,17 @@ define void @test_vXi1(<2 x i1> %src2, <4 x i1> %src4, <8 x i1> %src8, <16 x i1>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = shufflevector <4 x i1> %src4, <4 x i1> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = shufflevector <8 x i1> %src8, <8 x i1> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16 = shufflevector <16 x i1> %src16, <16 x i1> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32 = shufflevector <32 x i1> %src32, <32 x i1> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %V64 = shufflevector <64 x i1> %src64, <64 x i1> undef, <64 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <32 x i1> %src32, <32 x i1> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <64 x i1> %src64, <64 x i1> undef, <64 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512-LABEL: 'test_vXi1'
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = shufflevector <2 x i1> %src2, <2 x i1> undef, <2 x i32> <i32 1, i32 1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = shufflevector <4 x i1> %src4, <4 x i1> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8 = shufflevector <8 x i1> %src8, <8 x i1> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16 = shufflevector <16 x i1> %src16, <16 x i1> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V32 = shufflevector <32 x i1> %src32, <32 x i1> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V64 = shufflevector <64 x i1> %src64, <64 x i1> undef, <64 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = shufflevector <2 x i1> %src2, <2 x i1> undef, <2 x i32> <i32 1, i32 1>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = shufflevector <4 x i1> %src4, <4 x i1> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V8 = shufflevector <8 x i1> %src8, <8 x i1> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V16 = shufflevector <16 x i1> %src16, <16 x i1> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V32 = shufflevector <32 x i1> %src32, <32 x i1> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: %V64 = shufflevector <64 x i1> %src64, <64 x i1> undef, <64 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
   %V2  = shufflevector <2 x i1> %src2, <2 x i1> undef, <2 x i32> <i32 1, i32 1>
@@ -483,22 +433,22 @@ define void @test_upper_vXf32(<2 x float> %a64, <2 x float> %b64, <4 x float> %a
 ; SSE-LABEL: 'test_upper_vXf32'
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <2 x float> %a64, <2 x float> %b64, <2 x i32> <i32 3, i32 3>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <4 x float> %a128, <4 x float> %b128, <4 x i32> <i32 6, i32 6, i32 6, i32 6>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <8 x float> %a256, <8 x float> %b256, <8 x i32> <i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <16 x float> %a512, <16 x float> %b512, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <8 x float> %a256, <8 x float> %b256, <8 x i32> <i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <16 x float> %a512, <16 x float> %b512, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX1-LABEL: 'test_upper_vXf32'
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <2 x float> %a64, <2 x float> %b64, <2 x i32> <i32 3, i32 3>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <4 x float> %a128, <4 x float> %b128, <4 x i32> <i32 6, i32 6, i32 6, i32 6>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V256 = shufflevector <8 x float> %a256, <8 x float> %b256, <8 x i32> <i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V512 = shufflevector <16 x float> %a512, <16 x float> %b512, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <8 x float> %a256, <8 x float> %b256, <8 x i32> <i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <16 x float> %a512, <16 x float> %b512, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX2-LABEL: 'test_upper_vXf32'
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <2 x float> %a64, <2 x float> %b64, <2 x i32> <i32 3, i32 3>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <4 x float> %a128, <4 x float> %b128, <4 x i32> <i32 6, i32 6, i32 6, i32 6>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <8 x float> %a256, <8 x float> %b256, <8 x i32> <i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <16 x float> %a512, <16 x float> %b512, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <16 x float> %a512, <16 x float> %b512, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512-LABEL: 'test_upper_vXf32'
diff --git a/llvm/test/Analysis/CostModel/X86/shuffle-splat-latency.ll b/llvm/test/Analysis/CostModel/X86/shuffle-splat-latency.ll
index 2a89924..b20986e 100644
--- a/llvm/test/Analysis/CostModel/X86/shuffle-splat-latency.ll
+++ b/llvm/test/Analysis/CostModel/X86/shuffle-splat-latency.ll
@@ -4,9 +4,9 @@
 ; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=latency -mattr=+sse4.2 | FileCheck %s -check-prefixes=SSE,SSE42
 ; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=latency -mattr=+avx | FileCheck %s -check-prefixes=AVX1
 ; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=latency -mattr=+avx2 | FileCheck %s -check-prefixes=AVX2
-; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=latency -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512,AVX512F
-; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=latency -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX512,AVX512BW
-; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=latency -mattr=+avx512f,+avx512bw,+avx512vbmi | FileCheck %s --check-prefixes=AVX512,AVX512VBMI
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=latency -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=latency -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX512
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=latency -mattr=+avx512f,+avx512bw,+avx512vbmi | FileCheck %s --check-prefixes=AVX512
 ;
 ; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=latency -mcpu=slm | FileCheck %s --check-prefixes=SSE,SSE42
 ; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=latency -mcpu=goldmont | FileCheck %s --check-prefixes=SSE,SSE42
@@ -19,26 +19,26 @@
 define void @test_vXf64(<2 x double> %src128, <4 x double> %src256, <8 x double> %src512) {
 ; SSE-LABEL: 'test_vXf64'
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <2 x double> %src128, <2 x double> undef, <2 x i32> <i32 1, i32 1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <4 x double> %src256, <4 x double> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <8 x double> %src512, <8 x double> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <4 x double> %src256, <4 x double> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <8 x double> %src512, <8 x double> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX1-LABEL: 'test_vXf64'
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <2 x double> %src128, <2 x double> undef, <2 x i32> <i32 1, i32 1>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <4 x double> %src256, <4 x double> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <8 x double> %src512, <8 x double> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <8 x double> %src512, <8 x double> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX2-LABEL: 'test_vXf64'
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <2 x double> %src128, <2 x double> undef, <2 x i32> <i32 1, i32 1>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <4 x double> %src256, <4 x double> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <8 x double> %src512, <8 x double> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <8 x double> %src512, <8 x double> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512-LABEL: 'test_vXf64'
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V128 = shufflevector <2 x double> %src128, <2 x double> undef, <2 x i32> <i32 1, i32 1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V256 = shufflevector <4 x double> %src256, <4 x double> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V512 = shufflevector <8 x double> %src512, <8 x double> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <2 x double> %src128, <2 x double> undef, <2 x i32> <i32 1, i32 1>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <4 x double> %src256, <4 x double> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <8 x double> %src512, <8 x double> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
   %V128 = shufflevector <2 x double> %src128, <2 x double> undef, <2 x i32> <i32 1, i32 1>
@@ -50,26 +50,26 @@ define void @test_vXf64(<2 x double> %src128, <4 x double> %src256, <8 x double>
 define void @test_vXi64(<2 x i64> %src128, <4 x i64> %src256, <8 x i64> %src512) {
 ; SSE-LABEL: 'test_vXi64'
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <2 x i64> %src128, <2 x i64> undef, <2 x i32> <i32 1, i32 1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <4 x i64> %src256, <4 x i64> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <8 x i64> %src512, <8 x i64> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <4 x i64> %src256, <4 x i64> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <8 x i64> %src512, <8 x i64> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX1-LABEL: 'test_vXi64'
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <2 x i64> %src128, <2 x i64> undef, <2 x i32> <i32 1, i32 1>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <4 x i64> %src256, <4 x i64> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <8 x i64> %src512, <8 x i64> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <8 x i64> %src512, <8 x i64> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX2-LABEL: 'test_vXi64'
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <2 x i64> %src128, <2 x i64> undef, <2 x i32> <i32 1, i32 1>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <4 x i64> %src256, <4 x i64> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <8 x i64> %src512, <8 x i64> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <8 x i64> %src512, <8 x i64> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512-LABEL: 'test_vXi64'
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V128 = shufflevector <2 x i64> %src128, <2 x i64> undef, <2 x i32> <i32 1, i32 1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V256 = shufflevector <4 x i64> %src256, <4 x i64> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V512 = shufflevector <8 x i64> %src512, <8 x i64> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <2 x i64> %src128, <2 x i64> undef, <2 x i32> <i32 1, i32 1>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <4 x i64> %src256, <4 x i64> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <8 x i64> %src512, <8 x i64> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
   %V128 = shufflevector <2 x i64> %src128, <2 x i64> undef, <2 x i32> <i32 1, i32 1>
@@ -82,29 +82,29 @@ define void @test_vXf32(<2 x float> %src64, <4 x float> %src128, <8 x float> %sr
 ; SSE-LABEL: 'test_vXf32'
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <2 x float> %src64, <2 x float> undef, <2 x i32> <i32 1, i32 1>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <4 x float> %src128, <4 x float> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <8 x float> %src256, <8 x float> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <16 x float> %src512, <16 x float> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <8 x float> %src256, <8 x float> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <16 x float> %src512, <16 x float> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX1-LABEL: 'test_vXf32'
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <2 x float> %src64, <2 x float> undef, <2 x i32> <i32 1, i32 1>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <4 x float> %src128, <4 x float> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V256 = shufflevector <8 x float> %src256, <8 x float> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V512 = shufflevector <16 x float> %src512, <16 x float> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <8 x float> %src256, <8 x float> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <16 x float> %src512, <16 x float> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX2-LABEL: 'test_vXf32'
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <2 x float> %src64, <2 x float> undef, <2 x i32> <i32 1, i32 1>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <4 x float> %src128, <4 x float> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <8 x float> %src256, <8 x float> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <16 x float> %src512, <16 x float> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <16 x float> %src512, <16 x float> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512-LABEL: 'test_vXf32'
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V64 = shufflevector <2 x float> %src64, <2 x float> undef, <2 x i32> <i32 1, i32 1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V128 = shufflevector <4 x float> %src128, <4 x float> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V256 = shufflevector <8 x float> %src256, <8 x float> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V512 = shufflevector <16 x float> %src512, <16 x float> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <2 x float> %src64, <2 x float> undef, <2 x i32> <i32 1, i32 1>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <4 x float> %src128, <4 x float> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <8 x float> %src256, <8 x float> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <16 x float> %src512, <16 x float> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
   %V64 = shufflevector <2 x float> %src64, <2 x float> undef, <2 x i32> <i32 1, i32 1>
@@ -118,29 +118,29 @@ define void @test_vXi32(<2 x i32> %src64, <4 x i32> %src128, <8 x i32> %src256,
 ; SSE-LABEL: 'test_vXi32'
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <2 x i32> %src64, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <4 x i32> %src128, <4 x i32> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <8 x i32> %src256, <8 x i32> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <16 x i32> %src512, <16 x i32> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <8 x i32> %src256, <8 x i32> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <16 x i32> %src512, <16 x i32> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX1-LABEL: 'test_vXi32'
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <2 x i32> %src64, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <4 x i32> %src128, <4 x i32> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V256 = shufflevector <8 x i32> %src256, <8 x i32> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V512 = shufflevector <16 x i32> %src512, <16 x i32> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <8 x i32> %src256, <8 x i32> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <16 x i32> %src512, <16 x i32> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX2-LABEL: 'test_vXi32'
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <2 x i32> %src64, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <4 x i32> %src128, <4 x i32> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <8 x i32> %src256, <8 x i32> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <16 x i32> %src512, <16 x i32> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <16 x i32> %src512, <16 x i32> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512-LABEL: 'test_vXi32'
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V64 = shufflevector <2 x i32> %src64, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V128 = shufflevector <4 x i32> %src128, <4 x i32> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V256 = shufflevector <8 x i32> %src256, <8 x i32> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V512 = shufflevector <16 x i32> %src512, <16 x i32> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <2 x i32> %src64, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <4 x i32> %src128, <4 x i32> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <8 x i32> %src256, <8 x i32> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <16 x i32> %src512, <16 x i32> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
   %V64 = shufflevector <2 x i32> %src64, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
@@ -154,58 +154,42 @@ define void @test_vXf16(<2 x half> %src32, <4 x half> %src64, <8 x half> %src128
 ; SSE2-LABEL: 'test_vXf16'
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V32 = shufflevector <2 x half> %src32, <2 x half> undef, <2 x i32> <i32 1, i32 1>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V64 = shufflevector <4 x half> %src64, <4 x half> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V128 = shufflevector <8 x half> %src128, <8 x half> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V256 = shufflevector <16 x half> %src256, <16 x half> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V512 = shufflevector <32 x half> %src512, <32 x half> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V128 = shufflevector <8 x half> %src128, <8 x half> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <16 x half> %src256, <16 x half> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <32 x half> %src512, <32 x half> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; SSSE3-LABEL: 'test_vXf16'
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V32 = shufflevector <2 x half> %src32, <2 x half> undef, <2 x i32> <i32 1, i32 1>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V64 = shufflevector <4 x half> %src64, <4 x half> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x half> %src128, <8 x half> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <16 x half> %src256, <16 x half> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <32 x half> %src512, <32 x half> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <16 x half> %src256, <16 x half> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <32 x half> %src512, <32 x half> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; SSE42-LABEL: 'test_vXf16'
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V32 = shufflevector <2 x half> %src32, <2 x half> undef, <2 x i32> <i32 1, i32 1>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V64 = shufflevector <4 x half> %src64, <4 x half> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x half> %src128, <8 x half> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <16 x half> %src256, <16 x half> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <32 x half> %src512, <32 x half> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <16 x half> %src256, <16 x half> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <32 x half> %src512, <32 x half> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX2-LABEL: 'test_vXf16'
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V32 = shufflevector <2 x half> %src32, <2 x half> undef, <2 x i32> <i32 1, i32 1>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V64 = shufflevector <4 x half> %src64, <4 x half> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x half> %src128, <8 x half> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V256 = shufflevector <16 x half> %src256, <16 x half> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V512 = shufflevector <32 x half> %src512, <32 x half> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <16 x half> %src256, <16 x half> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <32 x half> %src512, <32 x half> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
-; AVX512F-LABEL: 'test_vXf16'
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <2 x half> %src32, <2 x half> undef, <2 x i32> <i32 1, i32 1>
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <4 x half> %src64, <4 x half> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x half> %src128, <8 x half> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V256 = shufflevector <16 x half> %src256, <16 x half> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V512 = shufflevector <32 x half> %src512, <32 x half> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
-;
-; AVX512BW-LABEL: 'test_vXf16'
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <2 x half> %src32, <2 x half> undef, <2 x i32> <i32 1, i32 1>
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <4 x half> %src64, <4 x half> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x half> %src128, <8 x half> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <16 x half> %src256, <16 x half> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <32 x half> %src512, <32 x half> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
-;
-; AVX512VBMI-LABEL: 'test_vXf16'
-; AVX512VBMI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <2 x half> %src32, <2 x half> undef, <2 x i32> <i32 1, i32 1>
-; AVX512VBMI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <4 x half> %src64, <4 x half> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-; AVX512VBMI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x half> %src128, <8 x half> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512VBMI-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <16 x half> %src256, <16 x half> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512VBMI-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <32 x half> %src512, <32 x half> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512VBMI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+; AVX512-LABEL: 'test_vXf16'
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <2 x half> %src32, <2 x half> undef, <2 x i32> <i32 1, i32 1>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <4 x half> %src64, <4 x half> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x half> %src128, <8 x half> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <16 x half> %src256, <16 x half> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <32 x half> %src512, <32 x half> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
   %V32  = shufflevector <2 x half> %src32, <2 x half> undef, <2 x i32> <i32 1, i32 1>
   %V64  = shufflevector <4 x half> %src64, <4 x half> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
@@ -260,66 +244,50 @@ define void @test_vXi16(<2 x i16> %src32, <4 x i16> %src64, <8 x i16> %src128, <
 ; SSE2-LABEL: 'test_vXi16'
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <2 x i16> %src32, <2 x i16> undef, <2 x i32> <i32 1, i32 1>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <4 x i16> %src64, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V128 = shufflevector <8 x i16> %src128, <8 x i16> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V128 = shufflevector <8 x i16> %src128, <8 x i16> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; SSSE3-LABEL: 'test_vXi16'
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <2 x i16> %src32, <2 x i16> undef, <2 x i32> <i32 1, i32 1>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <4 x i16> %src64, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x i16> %src128, <8 x i16> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; SSE42-LABEL: 'test_vXi16'
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <2 x i16> %src32, <2 x i16> undef, <2 x i32> <i32 1, i32 1>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <4 x i16> %src64, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x i16> %src128, <8 x i16> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX1-LABEL: 'test_vXi16'
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <2 x i16> %src32, <2 x i16> undef, <2 x i32> <i32 1, i32 1>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <4 x i16> %src64, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x i16> %src128, <8 x i16> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX2-LABEL: 'test_vXi16'
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <2 x i16> %src32, <2 x i16> undef, <2 x i32> <i32 1, i32 1>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <4 x i16> %src64, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x i16> %src128, <8 x i16> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
-; AVX512F-LABEL: 'test_vXi16'
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <2 x i16> %src32, <2 x i16> undef, <2 x i32> <i32 1, i32 1>
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <4 x i16> %src64, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x i16> %src128, <8 x i16> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
-;
-; AVX512BW-LABEL: 'test_vXi16'
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <2 x i16> %src32, <2 x i16> undef, <2 x i32> <i32 1, i32 1>
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <4 x i16> %src64, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x i16> %src128, <8 x i16> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
-;
-; AVX512VBMI-LABEL: 'test_vXi16'
-; AVX512VBMI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <2 x i16> %src32, <2 x i16> undef, <2 x i32> <i32 1, i32 1>
-; AVX512VBMI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <4 x i16> %src64, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-; AVX512VBMI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x i16> %src128, <8 x i16> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512VBMI-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512VBMI-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512VBMI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+; AVX512-LABEL: 'test_vXi16'
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <2 x i16> %src32, <2 x i16> undef, <2 x i32> <i32 1, i32 1>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <4 x i16> %src64, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x i16> %src128, <8 x i16> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
   %V32  = shufflevector <2 x i16> %src32, <2 x i16> undef, <2 x i32> <i32 1, i32 1>
   %V64  = shufflevector <4 x i16> %src64, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
@@ -332,11 +300,11 @@ define void @test_vXi16(<2 x i16> %src32, <4 x i16> %src64, <8 x i16> %src128, <
 define void @test_vXi8(<2 x i8> %src16, <4 x i8> %src32, <8 x i8> %src64, <16 x i8> %src128, <32 x i8> %src256, <64 x i8> %src512) {
 ; SSE2-LABEL: 'test_vXi8'
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16 = shufflevector <2 x i8> %src16, <2 x i8> undef, <2 x i32> <i32 1, i32 1>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V32 = shufflevector <4 x i8> %src32, <4 x i8> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V64 = shufflevector <8 x i8> %src64, <8 x i8> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V128 = shufflevector <16 x i8> %src128, <16 x i8> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32 = shufflevector <4 x i8> %src32, <4 x i8> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64 = shufflevector <8 x i8> %src64, <8 x i8> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V128 = shufflevector <16 x i8> %src128, <16 x i8> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; SSSE3-LABEL: 'test_vXi8'
@@ -344,8 +312,8 @@ define void @test_vXi8(<2 x i8> %src16, <4 x i8> %src32, <8 x i8> %src64, <16 x
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <4 x i8> %src32, <4 x i8> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <8 x i8> %src64, <8 x i8> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <16 x i8> %src128, <16 x i8> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; SSE42-LABEL: 'test_vXi8'
@@ -353,8 +321,8 @@ define void @test_vXi8(<2 x i8> %src16, <4 x i8> %src32, <8 x i8> %src64, <16 x
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <4 x i8> %src32, <4 x i8> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <8 x i8> %src64, <8 x i8> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <16 x i8> %src128, <16 x i8> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX1-LABEL: 'test_vXi8'
@@ -362,8 +330,8 @@ define void @test_vXi8(<2 x i8> %src16, <4 x i8> %src32, <8 x i8> %src64, <16 x
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <4 x i8> %src32, <4 x i8> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <8 x i8> %src64, <8 x i8> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <16 x i8> %src128, <16 x i8> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX2-LABEL: 'test_vXi8'
@@ -371,36 +339,18 @@ define void @test_vXi8(<2 x i8> %src16, <4 x i8> %src32, <8 x i8> %src64, <16 x
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <4 x i8> %src32, <4 x i8> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <8 x i8> %src64, <8 x i8> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <16 x i8> %src128, <16 x i8> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
-; AVX512F-LABEL: 'test_vXi8'
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V16 = shufflevector <2 x i8> %src16, <2 x i8> undef, <2 x i32> <i32 1, i32 1>
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V32 = shufflevector <4 x i8> %src32, <4 x i8> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V64 = shufflevector <8 x i8> %src64, <8 x i8> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V128 = shufflevector <16 x i8> %src128, <16 x i8> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
-;
-; AVX512BW-LABEL: 'test_vXi8'
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V16 = shufflevector <2 x i8> %src16, <2 x i8> undef, <2 x i32> <i32 1, i32 1>
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V32 = shufflevector <4 x i8> %src32, <4 x i8> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V64 = shufflevector <8 x i8> %src64, <8 x i8> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V128 = shufflevector <16 x i8> %src128, <16 x i8> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
-;
-; AVX512VBMI-LABEL: 'test_vXi8'
-; AVX512VBMI-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V16 = shufflevector <2 x i8> %src16, <2 x i8> undef, <2 x i32> <i32 1, i32 1>
-; AVX512VBMI-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V32 = shufflevector <4 x i8> %src32, <4 x i8> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-; AVX512VBMI-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V64 = shufflevector <8 x i8> %src64, <8 x i8> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512VBMI-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V128 = shufflevector <16 x i8> %src128, <16 x i8> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512VBMI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512VBMI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512VBMI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+; AVX512-LABEL: 'test_vXi8'
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16 = shufflevector <2 x i8> %src16, <2 x i8> undef, <2 x i32> <i32 1, i32 1>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <4 x i8> %src32, <4 x i8> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <8 x i8> %src64, <8 x i8> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <16 x i8> %src128, <16 x i8> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
   %V16  = shufflevector <2 x i8> %src16, <2 x i8> undef, <2 x i32> <i32 1, i32 1>
   %V32  = shufflevector <4 x i8> %src32, <4 x i8> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
@@ -415,10 +365,10 @@ define void @test_vXi1(<2 x i1> %src2, <4 x i1> %src4, <8 x i1> %src8, <16 x i1>
 ; SSE2-LABEL: 'test_vXi1'
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2 = shufflevector <2 x i1> %src2, <2 x i1> undef, <2 x i32> <i32 1, i32 1>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = shufflevector <4 x i1> %src4, <4 x i1> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8 = shufflevector <8 x i1> %src8, <8 x i1> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V16 = shufflevector <16 x i1> %src16, <16 x i1> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 480 for instruction: %V32 = shufflevector <32 x i1> %src32, <32 x i1> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 960 for instruction: %V64 = shufflevector <64 x i1> %src64, <64 x i1> undef, <64 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8 = shufflevector <8 x i1> %src8, <8 x i1> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V16 = shufflevector <16 x i1> %src16, <16 x i1> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V32 = shufflevector <32 x i1> %src32, <32 x i1> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V64 = shufflevector <64 x i1> %src64, <64 x i1> undef, <64 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; SSSE3-LABEL: 'test_vXi1'
@@ -426,8 +376,8 @@ define void @test_vXi1(<2 x i1> %src2, <4 x i1> %src4, <8 x i1> %src8, <16 x i1>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = shufflevector <4 x i1> %src4, <4 x i1> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = shufflevector <8 x i1> %src8, <8 x i1> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16 = shufflevector <16 x i1> %src16, <16 x i1> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %V32 = shufflevector <32 x i1> %src32, <32 x i1> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %V64 = shufflevector <64 x i1> %src64, <64 x i1> undef, <64 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <32 x i1> %src32, <32 x i1> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <64 x i1> %src64, <64 x i1> undef, <64 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; SSE42-LABEL: 'test_vXi1'
@@ -435,8 +385,8 @@ define void @test_vXi1(<2 x i1> %src2, <4 x i1> %src4, <8 x i1> %src8, <16 x i1>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = shufflevector <4 x i1> %src4, <4 x i1> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = shufflevector <8 x i1> %src8, <8 x i1> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16 = shufflevector <16 x i1> %src16, <16 x i1> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V32 = shufflevector <32 x i1> %src32, <32 x i1> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V64 = shufflevector <64 x i1> %src64, <64 x i1> undef, <64 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <32 x i1> %src32, <32 x i1> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <64 x i1> %src64, <64 x i1> undef, <64 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX1-LABEL: 'test_vXi1'
@@ -444,8 +394,8 @@ define void @test_vXi1(<2 x i1> %src2, <4 x i1> %src4, <8 x i1> %src8, <16 x i1>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = shufflevector <4 x i1> %src4, <4 x i1> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = shufflevector <8 x i1> %src8, <8 x i1> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16 = shufflevector <16 x i1> %src16, <16 x i1> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32 = shufflevector <32 x i1> %src32, <32 x i1> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %V64 = shufflevector <64 x i1> %src64, <64 x i1> undef, <64 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32 = shufflevector <32 x i1> %src32, <32 x i1> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64 = shufflevector <64 x i1> %src64, <64 x i1> undef, <64 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX2-LABEL: 'test_vXi1'
@@ -453,17 +403,17 @@ define void @test_vXi1(<2 x i1> %src2, <4 x i1> %src4, <8 x i1> %src8, <16 x i1>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = shufflevector <4 x i1> %src4, <4 x i1> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = shufflevector <8 x i1> %src8, <8 x i1> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16 = shufflevector <16 x i1> %src16, <16 x i1> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32 = shufflevector <32 x i1> %src32, <32 x i1> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %V64 = shufflevector <64 x i1> %src64, <64 x i1> undef, <64 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <32 x i1> %src32, <32 x i1> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <64 x i1> %src64, <64 x i1> undef, <64 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512-LABEL: 'test_vXi1'
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = shufflevector <2 x i1> %src2, <2 x i1> undef, <2 x i32> <i32 1, i32 1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = shufflevector <4 x i1> %src4, <4 x i1> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8 = shufflevector <8 x i1> %src8, <8 x i1> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16 = shufflevector <16 x i1> %src16, <16 x i1> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V32 = shufflevector <32 x i1> %src32, <32 x i1> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V64 = shufflevector <64 x i1> %src64, <64 x i1> undef, <64 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = shufflevector <2 x i1> %src2, <2 x i1> undef, <2 x i32> <i32 1, i32 1>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = shufflevector <4 x i1> %src4, <4 x i1> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V8 = shufflevector <8 x i1> %src8, <8 x i1> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V16 = shufflevector <16 x i1> %src16, <16 x i1> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V32 = shufflevector <32 x i1> %src32, <32 x i1> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: %V64 = shufflevector <64 x i1> %src64, <64 x i1> undef, <64 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
   %V2  = shufflevector <2 x i1> %src2, <2 x i1> undef, <2 x i32> <i32 1, i32 1>
@@ -483,29 +433,29 @@ define void @test_upper_vXf32(<2 x float> %a64, <2 x float> %b64, <4 x float> %a
 ; SSE-LABEL: 'test_upper_vXf32'
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <2 x float> %a64, <2 x float> %b64, <2 x i32> <i32 3, i32 3>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <4 x float> %a128, <4 x float> %b128, <4 x i32> <i32 6, i32 6, i32 6, i32 6>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <8 x float> %a256, <8 x float> %b256, <8 x i32> <i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <16 x float> %a512, <16 x float> %b512, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <8 x float> %a256, <8 x float> %b256, <8 x i32> <i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <16 x float> %a512, <16 x float> %b512, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX1-LABEL: 'test_upper_vXf32'
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <2 x float> %a64, <2 x float> %b64, <2 x i32> <i32 3, i32 3>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <4 x float> %a128, <4 x float> %b128, <4 x i32> <i32 6, i32 6, i32 6, i32 6>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V256 = shufflevector <8 x float> %a256, <8 x float> %b256, <8 x i32> <i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V512 = shufflevector <16 x float> %a512, <16 x float> %b512, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <8 x float> %a256, <8 x float> %b256, <8 x i32> <i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <16 x float> %a512, <16 x float> %b512, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX2-LABEL: 'test_upper_vXf32'
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <2 x float> %a64, <2 x float> %b64, <2 x i32> <i32 3, i32 3>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <4 x float> %a128, <4 x float> %b128, <4 x i32> <i32 6, i32 6, i32 6, i32 6>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <8 x float> %a256, <8 x float> %b256, <8 x i32> <i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <16 x float> %a512, <16 x float> %b512, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <16 x float> %a512, <16 x float> %b512, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512-LABEL: 'test_upper_vXf32'
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V64 = shufflevector <2 x float> %a64, <2 x float> %b64, <2 x i32> <i32 3, i32 3>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V128 = shufflevector <4 x float> %a128, <4 x float> %b128, <4 x i32> <i32 6, i32 6, i32 6, i32 6>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V256 = shufflevector <8 x float> %a256, <8 x float> %b256, <8 x i32> <i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V512 = shufflevector <16 x float> %a512, <16 x float> %b512, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <2 x float> %a64, <2 x float> %b64, <2 x i32> <i32 3, i32 3>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <4 x float> %a128, <4 x float> %b128, <4 x i32> <i32 6, i32 6, i32 6, i32 6>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <8 x float> %a256, <8 x float> %b256, <8 x i32> <i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <16 x float> %a512, <16 x float> %b512, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
   %V64 = shufflevector <2 x float> %a64, <2 x float> %b64, <2 x i32> <i32 3, i32 3>
diff --git a/llvm/test/Analysis/CostModel/X86/shuffle-splat-sizelatency.ll b/llvm/test/Analysis/CostModel/X86/shuffle-splat-sizelatency.ll
index 848e7b4..56d8cad 100644
--- a/llvm/test/Analysis/CostModel/X86/shuffle-splat-sizelatency.ll
+++ b/llvm/test/Analysis/CostModel/X86/shuffle-splat-sizelatency.ll
@@ -4,9 +4,9 @@
 ; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=size-latency -mattr=+sse4.2 | FileCheck %s -check-prefixes=SSE,SSE42
 ; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=size-latency -mattr=+avx | FileCheck %s -check-prefixes=AVX1
 ; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=size-latency -mattr=+avx2 | FileCheck %s -check-prefixes=AVX2
-; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=size-latency -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512,AVX512F
-; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=size-latency -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX512,AVX512BW
-; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=size-latency -mattr=+avx512f,+avx512bw,+avx512vbmi | FileCheck %s --check-prefixes=AVX512,AVX512VBMI
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=size-latency -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=size-latency -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX512
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=size-latency -mattr=+avx512f,+avx512bw,+avx512vbmi | FileCheck %s --check-prefixes=AVX512
 ;
 ; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=size-latency -mcpu=slm | FileCheck %s --check-prefixes=SSE,SSE42
 ; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=size-latency -mcpu=goldmont | FileCheck %s --check-prefixes=SSE,SSE42
@@ -19,20 +19,20 @@
 define void @test_vXf64(<2 x double> %src128, <4 x double> %src256, <8 x double> %src512) {
 ; SSE-LABEL: 'test_vXf64'
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <2 x double> %src128, <2 x double> undef, <2 x i32> <i32 1, i32 1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <4 x double> %src256, <4 x double> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <8 x double> %src512, <8 x double> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <4 x double> %src256, <4 x double> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <8 x double> %src512, <8 x double> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX1-LABEL: 'test_vXf64'
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <2 x double> %src128, <2 x double> undef, <2 x i32> <i32 1, i32 1>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <4 x double> %src256, <4 x double> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <8 x double> %src512, <8 x double> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <8 x double> %src512, <8 x double> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX2-LABEL: 'test_vXf64'
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <2 x double> %src128, <2 x double> undef, <2 x i32> <i32 1, i32 1>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <4 x double> %src256, <4 x double> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <8 x double> %src512, <8 x double> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <8 x double> %src512, <8 x double> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512-LABEL: 'test_vXf64'
@@ -50,20 +50,20 @@ define void @test_vXf64(<2 x double> %src128, <4 x double> %src256, <8 x double>
 define void @test_vXi64(<2 x i64> %src128, <4 x i64> %src256, <8 x i64> %src512) {
 ; SSE-LABEL: 'test_vXi64'
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <2 x i64> %src128, <2 x i64> undef, <2 x i32> <i32 1, i32 1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <4 x i64> %src256, <4 x i64> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <8 x i64> %src512, <8 x i64> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <4 x i64> %src256, <4 x i64> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <8 x i64> %src512, <8 x i64> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX1-LABEL: 'test_vXi64'
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <2 x i64> %src128, <2 x i64> undef, <2 x i32> <i32 1, i32 1>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <4 x i64> %src256, <4 x i64> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <8 x i64> %src512, <8 x i64> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <8 x i64> %src512, <8 x i64> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX2-LABEL: 'test_vXi64'
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <2 x i64> %src128, <2 x i64> undef, <2 x i32> <i32 1, i32 1>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <4 x i64> %src256, <4 x i64> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <8 x i64> %src512, <8 x i64> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <8 x i64> %src512, <8 x i64> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512-LABEL: 'test_vXi64'
@@ -82,22 +82,22 @@ define void @test_vXf32(<2 x float> %src64, <4 x float> %src128, <8 x float> %sr
 ; SSE-LABEL: 'test_vXf32'
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <2 x float> %src64, <2 x float> undef, <2 x i32> <i32 1, i32 1>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <4 x float> %src128, <4 x float> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <8 x float> %src256, <8 x float> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <16 x float> %src512, <16 x float> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <8 x float> %src256, <8 x float> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <16 x float> %src512, <16 x float> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX1-LABEL: 'test_vXf32'
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <2 x float> %src64, <2 x float> undef, <2 x i32> <i32 1, i32 1>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <4 x float> %src128, <4 x float> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V256 = shufflevector <8 x float> %src256, <8 x float> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V512 = shufflevector <16 x float> %src512, <16 x float> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <8 x float> %src256, <8 x float> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <16 x float> %src512, <16 x float> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX2-LABEL: 'test_vXf32'
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <2 x float> %src64, <2 x float> undef, <2 x i32> <i32 1, i32 1>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <4 x float> %src128, <4 x float> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <8 x float> %src256, <8 x float> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <16 x float> %src512, <16 x float> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <16 x float> %src512, <16 x float> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512-LABEL: 'test_vXf32'
@@ -118,22 +118,22 @@ define void @test_vXi32(<2 x i32> %src64, <4 x i32> %src128, <8 x i32> %src256,
 ; SSE-LABEL: 'test_vXi32'
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <2 x i32> %src64, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <4 x i32> %src128, <4 x i32> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <8 x i32> %src256, <8 x i32> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <16 x i32> %src512, <16 x i32> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <8 x i32> %src256, <8 x i32> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <16 x i32> %src512, <16 x i32> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX1-LABEL: 'test_vXi32'
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <2 x i32> %src64, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <4 x i32> %src128, <4 x i32> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V256 = shufflevector <8 x i32> %src256, <8 x i32> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V512 = shufflevector <16 x i32> %src512, <16 x i32> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <8 x i32> %src256, <8 x i32> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <16 x i32> %src512, <16 x i32> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX2-LABEL: 'test_vXi32'
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <2 x i32> %src64, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <4 x i32> %src128, <4 x i32> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <8 x i32> %src256, <8 x i32> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <16 x i32> %src512, <16 x i32> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <16 x i32> %src512, <16 x i32> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512-LABEL: 'test_vXi32'
@@ -154,58 +154,42 @@ define void @test_vXf16(<2 x half> %src32, <4 x half> %src64, <8 x half> %src128
 ; SSE2-LABEL: 'test_vXf16'
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V32 = shufflevector <2 x half> %src32, <2 x half> undef, <2 x i32> <i32 1, i32 1>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V64 = shufflevector <4 x half> %src64, <4 x half> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V128 = shufflevector <8 x half> %src128, <8 x half> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V256 = shufflevector <16 x half> %src256, <16 x half> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V512 = shufflevector <32 x half> %src512, <32 x half> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V128 = shufflevector <8 x half> %src128, <8 x half> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <16 x half> %src256, <16 x half> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <32 x half> %src512, <32 x half> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; SSSE3-LABEL: 'test_vXf16'
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V32 = shufflevector <2 x half> %src32, <2 x half> undef, <2 x i32> <i32 1, i32 1>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V64 = shufflevector <4 x half> %src64, <4 x half> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x half> %src128, <8 x half> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <16 x half> %src256, <16 x half> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <32 x half> %src512, <32 x half> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <16 x half> %src256, <16 x half> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <32 x half> %src512, <32 x half> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; SSE42-LABEL: 'test_vXf16'
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V32 = shufflevector <2 x half> %src32, <2 x half> undef, <2 x i32> <i32 1, i32 1>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V64 = shufflevector <4 x half> %src64, <4 x half> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x half> %src128, <8 x half> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <16 x half> %src256, <16 x half> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <32 x half> %src512, <32 x half> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <16 x half> %src256, <16 x half> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <32 x half> %src512, <32 x half> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX2-LABEL: 'test_vXf16'
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V32 = shufflevector <2 x half> %src32, <2 x half> undef, <2 x i32> <i32 1, i32 1>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V64 = shufflevector <4 x half> %src64, <4 x half> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x half> %src128, <8 x half> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V256 = shufflevector <16 x half> %src256, <16 x half> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V512 = shufflevector <32 x half> %src512, <32 x half> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <16 x half> %src256, <16 x half> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <32 x half> %src512, <32 x half> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
-; AVX512F-LABEL: 'test_vXf16'
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <2 x half> %src32, <2 x half> undef, <2 x i32> <i32 1, i32 1>
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <4 x half> %src64, <4 x half> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x half> %src128, <8 x half> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V256 = shufflevector <16 x half> %src256, <16 x half> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V512 = shufflevector <32 x half> %src512, <32 x half> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
-;
-; AVX512BW-LABEL: 'test_vXf16'
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <2 x half> %src32, <2 x half> undef, <2 x i32> <i32 1, i32 1>
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <4 x half> %src64, <4 x half> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x half> %src128, <8 x half> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <16 x half> %src256, <16 x half> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <32 x half> %src512, <32 x half> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
-;
-; AVX512VBMI-LABEL: 'test_vXf16'
-; AVX512VBMI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <2 x half> %src32, <2 x half> undef, <2 x i32> <i32 1, i32 1>
-; AVX512VBMI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <4 x half> %src64, <4 x half> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-; AVX512VBMI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x half> %src128, <8 x half> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512VBMI-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <16 x half> %src256, <16 x half> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512VBMI-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <32 x half> %src512, <32 x half> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512VBMI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+; AVX512-LABEL: 'test_vXf16'
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <2 x half> %src32, <2 x half> undef, <2 x i32> <i32 1, i32 1>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <4 x half> %src64, <4 x half> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x half> %src128, <8 x half> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <16 x half> %src256, <16 x half> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <32 x half> %src512, <32 x half> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
   %V32  = shufflevector <2 x half> %src32, <2 x half> undef, <2 x i32> <i32 1, i32 1>
   %V64  = shufflevector <4 x half> %src64, <4 x half> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
@@ -260,66 +244,50 @@ define void @test_vXi16(<2 x i16> %src32, <4 x i16> %src64, <8 x i16> %src128, <
 ; SSE2-LABEL: 'test_vXi16'
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <2 x i16> %src32, <2 x i16> undef, <2 x i32> <i32 1, i32 1>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <4 x i16> %src64, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V128 = shufflevector <8 x i16> %src128, <8 x i16> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V128 = shufflevector <8 x i16> %src128, <8 x i16> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; SSSE3-LABEL: 'test_vXi16'
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <2 x i16> %src32, <2 x i16> undef, <2 x i32> <i32 1, i32 1>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <4 x i16> %src64, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x i16> %src128, <8 x i16> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; SSE42-LABEL: 'test_vXi16'
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <2 x i16> %src32, <2 x i16> undef, <2 x i32> <i32 1, i32 1>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <4 x i16> %src64, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x i16> %src128, <8 x i16> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX1-LABEL: 'test_vXi16'
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <2 x i16> %src32, <2 x i16> undef, <2 x i32> <i32 1, i32 1>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <4 x i16> %src64, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x i16> %src128, <8 x i16> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX2-LABEL: 'test_vXi16'
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <2 x i16> %src32, <2 x i16> undef, <2 x i32> <i32 1, i32 1>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <4 x i16> %src64, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x i16> %src128, <8 x i16> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
-; AVX512F-LABEL: 'test_vXi16'
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <2 x i16> %src32, <2 x i16> undef, <2 x i32> <i32 1, i32 1>
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <4 x i16> %src64, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x i16> %src128, <8 x i16> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
-;
-; AVX512BW-LABEL: 'test_vXi16'
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <2 x i16> %src32, <2 x i16> undef, <2 x i32> <i32 1, i32 1>
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <4 x i16> %src64, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x i16> %src128, <8 x i16> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
-;
-; AVX512VBMI-LABEL: 'test_vXi16'
-; AVX512VBMI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <2 x i16> %src32, <2 x i16> undef, <2 x i32> <i32 1, i32 1>
-; AVX512VBMI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <4 x i16> %src64, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-; AVX512VBMI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x i16> %src128, <8 x i16> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512VBMI-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512VBMI-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512VBMI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+; AVX512-LABEL: 'test_vXi16'
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <2 x i16> %src32, <2 x i16> undef, <2 x i32> <i32 1, i32 1>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <4 x i16> %src64, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x i16> %src128, <8 x i16> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
   %V32  = shufflevector <2 x i16> %src32, <2 x i16> undef, <2 x i32> <i32 1, i32 1>
   %V64  = shufflevector <4 x i16> %src64, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
@@ -332,11 +300,11 @@ define void @test_vXi16(<2 x i16> %src32, <4 x i16> %src64, <8 x i16> %src128, <
 define void @test_vXi8(<2 x i8> %src16, <4 x i8> %src32, <8 x i8> %src64, <16 x i8> %src128, <32 x i8> %src256, <64 x i8> %src512) {
 ; SSE2-LABEL: 'test_vXi8'
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16 = shufflevector <2 x i8> %src16, <2 x i8> undef, <2 x i32> <i32 1, i32 1>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V32 = shufflevector <4 x i8> %src32, <4 x i8> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V64 = shufflevector <8 x i8> %src64, <8 x i8> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V128 = shufflevector <16 x i8> %src128, <16 x i8> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32 = shufflevector <4 x i8> %src32, <4 x i8> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64 = shufflevector <8 x i8> %src64, <8 x i8> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V128 = shufflevector <16 x i8> %src128, <16 x i8> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; SSSE3-LABEL: 'test_vXi8'
@@ -344,8 +312,8 @@ define void @test_vXi8(<2 x i8> %src16, <4 x i8> %src32, <8 x i8> %src64, <16 x
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <4 x i8> %src32, <4 x i8> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <8 x i8> %src64, <8 x i8> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <16 x i8> %src128, <16 x i8> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; SSE42-LABEL: 'test_vXi8'
@@ -353,8 +321,8 @@ define void @test_vXi8(<2 x i8> %src16, <4 x i8> %src32, <8 x i8> %src64, <16 x
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <4 x i8> %src32, <4 x i8> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <8 x i8> %src64, <8 x i8> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <16 x i8> %src128, <16 x i8> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX1-LABEL: 'test_vXi8'
@@ -362,8 +330,8 @@ define void @test_vXi8(<2 x i8> %src16, <4 x i8> %src32, <8 x i8> %src64, <16 x
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <4 x i8> %src32, <4 x i8> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <8 x i8> %src64, <8 x i8> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <16 x i8> %src128, <16 x i8> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX2-LABEL: 'test_vXi8'
@@ -371,36 +339,18 @@ define void @test_vXi8(<2 x i8> %src16, <4 x i8> %src32, <8 x i8> %src64, <16 x
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <4 x i8> %src32, <4 x i8> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <8 x i8> %src64, <8 x i8> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <16 x i8> %src128, <16 x i8> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
-; AVX512F-LABEL: 'test_vXi8'
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16 = shufflevector <2 x i8> %src16, <2 x i8> undef, <2 x i32> <i32 1, i32 1>
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <4 x i8> %src32, <4 x i8> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <8 x i8> %src64, <8 x i8> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <16 x i8> %src128, <16 x i8> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
-;
-; AVX512BW-LABEL: 'test_vXi8'
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16 = shufflevector <2 x i8> %src16, <2 x i8> undef, <2 x i32> <i32 1, i32 1>
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <4 x i8> %src32, <4 x i8> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <8 x i8> %src64, <8 x i8> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <16 x i8> %src128, <16 x i8> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
-;
-; AVX512VBMI-LABEL: 'test_vXi8'
-; AVX512VBMI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16 = shufflevector <2 x i8> %src16, <2 x i8> undef, <2 x i32> <i32 1, i32 1>
-; AVX512VBMI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <4 x i8> %src32, <4 x i8> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-; AVX512VBMI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <8 x i8> %src64, <8 x i8> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512VBMI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <16 x i8> %src128, <16 x i8> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512VBMI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512VBMI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512VBMI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+; AVX512-LABEL: 'test_vXi8'
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16 = shufflevector <2 x i8> %src16, <2 x i8> undef, <2 x i32> <i32 1, i32 1>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <4 x i8> %src32, <4 x i8> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <8 x i8> %src64, <8 x i8> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <16 x i8> %src128, <16 x i8> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
   %V16  = shufflevector <2 x i8> %src16, <2 x i8> undef, <2 x i32> <i32 1, i32 1>
   %V32  = shufflevector <4 x i8> %src32, <4 x i8> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
@@ -415,10 +365,10 @@ define void @test_vXi1(<2 x i1> %src2, <4 x i1> %src4, <8 x i1> %src8, <16 x i1>
 ; SSE2-LABEL: 'test_vXi1'
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2 = shufflevector <2 x i1> %src2, <2 x i1> undef, <2 x i32> <i32 1, i32 1>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = shufflevector <4 x i1> %src4, <4 x i1> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8 = shufflevector <8 x i1> %src8, <8 x i1> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V16 = shufflevector <16 x i1> %src16, <16 x i1> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 480 for instruction: %V32 = shufflevector <32 x i1> %src32, <32 x i1> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 960 for instruction: %V64 = shufflevector <64 x i1> %src64, <64 x i1> undef, <64 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8 = shufflevector <8 x i1> %src8, <8 x i1> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V16 = shufflevector <16 x i1> %src16, <16 x i1> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V32 = shufflevector <32 x i1> %src32, <32 x i1> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V64 = shufflevector <64 x i1> %src64, <64 x i1> undef, <64 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; SSSE3-LABEL: 'test_vXi1'
@@ -426,8 +376,8 @@ define void @test_vXi1(<2 x i1> %src2, <4 x i1> %src4, <8 x i1> %src8, <16 x i1>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = shufflevector <4 x i1> %src4, <4 x i1> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = shufflevector <8 x i1> %src8, <8 x i1> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16 = shufflevector <16 x i1> %src16, <16 x i1> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %V32 = shufflevector <32 x i1> %src32, <32 x i1> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %V64 = shufflevector <64 x i1> %src64, <64 x i1> undef, <64 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <32 x i1> %src32, <32 x i1> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <64 x i1> %src64, <64 x i1> undef, <64 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; SSE42-LABEL: 'test_vXi1'
@@ -435,8 +385,8 @@ define void @test_vXi1(<2 x i1> %src2, <4 x i1> %src4, <8 x i1> %src8, <16 x i1>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = shufflevector <4 x i1> %src4, <4 x i1> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = shufflevector <8 x i1> %src8, <8 x i1> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16 = shufflevector <16 x i1> %src16, <16 x i1> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V32 = shufflevector <32 x i1> %src32, <32 x i1> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V64 = shufflevector <64 x i1> %src64, <64 x i1> undef, <64 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <32 x i1> %src32, <32 x i1> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <64 x i1> %src64, <64 x i1> undef, <64 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX1-LABEL: 'test_vXi1'
@@ -444,8 +394,8 @@ define void @test_vXi1(<2 x i1> %src2, <4 x i1> %src4, <8 x i1> %src8, <16 x i1>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = shufflevector <4 x i1> %src4, <4 x i1> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = shufflevector <8 x i1> %src8, <8 x i1> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16 = shufflevector <16 x i1> %src16, <16 x i1> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32 = shufflevector <32 x i1> %src32, <32 x i1> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %V64 = shufflevector <64 x i1> %src64, <64 x i1> undef, <64 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32 = shufflevector <32 x i1> %src32, <32 x i1> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64 = shufflevector <64 x i1> %src64, <64 x i1> undef, <64 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX2-LABEL: 'test_vXi1'
@@ -453,17 +403,17 @@ define void @test_vXi1(<2 x i1> %src2, <4 x i1> %src4, <8 x i1> %src8, <16 x i1>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = shufflevector <4 x i1> %src4, <4 x i1> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = shufflevector <8 x i1> %src8, <8 x i1> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16 = shufflevector <16 x i1> %src16, <16 x i1> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32 = shufflevector <32 x i1> %src32, <32 x i1> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %V64 = shufflevector <64 x i1> %src64, <64 x i1> undef, <64 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <32 x i1> %src32, <32 x i1> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <64 x i1> %src64, <64 x i1> undef, <64 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512-LABEL: 'test_vXi1'
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = shufflevector <2 x i1> %src2, <2 x i1> undef, <2 x i32> <i32 1, i32 1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = shufflevector <4 x i1> %src4, <4 x i1> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8 = shufflevector <8 x i1> %src8, <8 x i1> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16 = shufflevector <16 x i1> %src16, <16 x i1> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V32 = shufflevector <32 x i1> %src32, <32 x i1> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V64 = shufflevector <64 x i1> %src64, <64 x i1> undef, <64 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = shufflevector <2 x i1> %src2, <2 x i1> undef, <2 x i32> <i32 1, i32 1>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = shufflevector <4 x i1> %src4, <4 x i1> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V8 = shufflevector <8 x i1> %src8, <8 x i1> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V16 = shufflevector <16 x i1> %src16, <16 x i1> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V32 = shufflevector <32 x i1> %src32, <32 x i1> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: %V64 = shufflevector <64 x i1> %src64, <64 x i1> undef, <64 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
   %V2  = shufflevector <2 x i1> %src2, <2 x i1> undef, <2 x i32> <i32 1, i32 1>
@@ -483,22 +433,22 @@ define void @test_upper_vXf32(<2 x float> %a64, <2 x float> %b64, <4 x float> %a
 ; SSE-LABEL: 'test_upper_vXf32'
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <2 x float> %a64, <2 x float> %b64, <2 x i32> <i32 3, i32 3>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <4 x float> %a128, <4 x float> %b128, <4 x i32> <i32 6, i32 6, i32 6, i32 6>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <8 x float> %a256, <8 x float> %b256, <8 x i32> <i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <16 x float> %a512, <16 x float> %b512, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <8 x float> %a256, <8 x float> %b256, <8 x i32> <i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <16 x float> %a512, <16 x float> %b512, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX1-LABEL: 'test_upper_vXf32'
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <2 x float> %a64, <2 x float> %b64, <2 x i32> <i32 3, i32 3>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <4 x float> %a128, <4 x float> %b128, <4 x i32> <i32 6, i32 6, i32 6, i32 6>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V256 = shufflevector <8 x float> %a256, <8 x float> %b256, <8 x i32> <i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V512 = shufflevector <16 x float> %a512, <16 x float> %b512, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <8 x float> %a256, <8 x float> %b256, <8 x i32> <i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <16 x float> %a512, <16 x float> %b512, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX2-LABEL: 'test_upper_vXf32'
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <2 x float> %a64, <2 x float> %b64, <2 x i32> <i32 3, i32 3>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <4 x float> %a128, <4 x float> %b128, <4 x i32> <i32 6, i32 6, i32 6, i32 6>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <8 x float> %a256, <8 x float> %b256, <8 x i32> <i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <16 x float> %a512, <16 x float> %b512, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <16 x float> %a512, <16 x float> %b512, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512-LABEL: 'test_upper_vXf32'
diff --git a/llvm/test/Analysis/CostModel/X86/shuffle-splat.ll b/llvm/test/Analysis/CostModel/X86/shuffle-splat.ll
index 4c6d1ccd..56f56c3c 100644
--- a/llvm/test/Analysis/CostModel/X86/shuffle-splat.ll
+++ b/llvm/test/Analysis/CostModel/X86/shuffle-splat.ll
@@ -4,9 +4,9 @@
 ; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -mattr=+sse4.2 | FileCheck %s -check-prefixes=SSE,SSE42
 ; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -mattr=+avx | FileCheck %s -check-prefixes=AVX1
 ; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -mattr=+avx2 | FileCheck %s -check-prefixes=AVX2
-; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512,AVX512F
-; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX512,AVX512BW
-; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -mattr=+avx512f,+avx512bw,+avx512vbmi | FileCheck %s --check-prefixes=AVX512,AVX512VBMI
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX512
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -mattr=+avx512f,+avx512bw,+avx512vbmi | FileCheck %s --check-prefixes=AVX512
 ;
 ; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -mcpu=slm | FileCheck %s --check-prefixes=SSE,SSE42
 ; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -mcpu=goldmont | FileCheck %s --check-prefixes=SSE,SSE42
@@ -19,20 +19,20 @@
 define void @test_vXf64(<2 x double> %src128, <4 x double> %src256, <8 x double> %src512) {
 ; SSE-LABEL: 'test_vXf64'
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <2 x double> %src128, <2 x double> undef, <2 x i32> <i32 1, i32 1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <4 x double> %src256, <4 x double> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <8 x double> %src512, <8 x double> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <4 x double> %src256, <4 x double> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <8 x double> %src512, <8 x double> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; AVX1-LABEL: 'test_vXf64'
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <2 x double> %src128, <2 x double> undef, <2 x i32> <i32 1, i32 1>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <4 x double> %src256, <4 x double> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <8 x double> %src512, <8 x double> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <8 x double> %src512, <8 x double> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; AVX2-LABEL: 'test_vXf64'
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <2 x double> %src128, <2 x double> undef, <2 x i32> <i32 1, i32 1>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <4 x double> %src256, <4 x double> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <8 x double> %src512, <8 x double> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <8 x double> %src512, <8 x double> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; AVX512-LABEL: 'test_vXf64'
@@ -50,20 +50,20 @@ define void @test_vXf64(<2 x double> %src128, <4 x double> %src256, <8 x double>
 define void @test_vXi64(<2 x i64> %src128, <4 x i64> %src256, <8 x i64> %src512) {
 ; SSE-LABEL: 'test_vXi64'
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <2 x i64> %src128, <2 x i64> undef, <2 x i32> <i32 1, i32 1>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <4 x i64> %src256, <4 x i64> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <8 x i64> %src512, <8 x i64> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <4 x i64> %src256, <4 x i64> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <8 x i64> %src512, <8 x i64> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; AVX1-LABEL: 'test_vXi64'
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <2 x i64> %src128, <2 x i64> undef, <2 x i32> <i32 1, i32 1>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <4 x i64> %src256, <4 x i64> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <8 x i64> %src512, <8 x i64> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <8 x i64> %src512, <8 x i64> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; AVX2-LABEL: 'test_vXi64'
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <2 x i64> %src128, <2 x i64> undef, <2 x i32> <i32 1, i32 1>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <4 x i64> %src256, <4 x i64> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <8 x i64> %src512, <8 x i64> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <8 x i64> %src512, <8 x i64> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; AVX512-LABEL: 'test_vXi64'
@@ -82,22 +82,22 @@ define void @test_vXf32(<2 x float> %src64, <4 x float> %src128, <8 x float> %sr
 ; SSE-LABEL: 'test_vXf32'
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <2 x float> %src64, <2 x float> undef, <2 x i32> <i32 1, i32 1>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <4 x float> %src128, <4 x float> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <8 x float> %src256, <8 x float> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <16 x float> %src512, <16 x float> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <8 x float> %src256, <8 x float> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <16 x float> %src512, <16 x float> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; AVX1-LABEL: 'test_vXf32'
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <2 x float> %src64, <2 x float> undef, <2 x i32> <i32 1, i32 1>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <4 x float> %src128, <4 x float> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V256 = shufflevector <8 x float> %src256, <8 x float> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V512 = shufflevector <16 x float> %src512, <16 x float> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <8 x float> %src256, <8 x float> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <16 x float> %src512, <16 x float> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; AVX2-LABEL: 'test_vXf32'
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <2 x float> %src64, <2 x float> undef, <2 x i32> <i32 1, i32 1>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <4 x float> %src128, <4 x float> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <8 x float> %src256, <8 x float> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <16 x float> %src512, <16 x float> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <16 x float> %src512, <16 x float> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; AVX512-LABEL: 'test_vXf32'
@@ -118,22 +118,22 @@ define void @test_vXi32(<2 x i32> %src64, <4 x i32> %src128, <8 x i32> %src256,
 ; SSE-LABEL: 'test_vXi32'
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <2 x i32> %src64, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <4 x i32> %src128, <4 x i32> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <8 x i32> %src256, <8 x i32> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <16 x i32> %src512, <16 x i32> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <8 x i32> %src256, <8 x i32> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <16 x i32> %src512, <16 x i32> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; AVX1-LABEL: 'test_vXi32'
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <2 x i32> %src64, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <4 x i32> %src128, <4 x i32> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V256 = shufflevector <8 x i32> %src256, <8 x i32> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V512 = shufflevector <16 x i32> %src512, <16 x i32> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <8 x i32> %src256, <8 x i32> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <16 x i32> %src512, <16 x i32> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; AVX2-LABEL: 'test_vXi32'
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <2 x i32> %src64, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <4 x i32> %src128, <4 x i32> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <8 x i32> %src256, <8 x i32> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <16 x i32> %src512, <16 x i32> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <16 x i32> %src512, <16 x i32> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; AVX512-LABEL: 'test_vXi32'
@@ -154,58 +154,42 @@ define void @test_vXf16(<2 x half> %src32, <4 x half> %src64, <8 x half> %src128
 ; SSE2-LABEL: 'test_vXf16'
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V32 = shufflevector <2 x half> %src32, <2 x half> undef, <2 x i32> <i32 1, i32 1>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V64 = shufflevector <4 x half> %src64, <4 x half> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V128 = shufflevector <8 x half> %src128, <8 x half> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V256 = shufflevector <16 x half> %src256, <16 x half> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V512 = shufflevector <32 x half> %src512, <32 x half> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V128 = shufflevector <8 x half> %src128, <8 x half> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <16 x half> %src256, <16 x half> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <32 x half> %src512, <32 x half> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; SSSE3-LABEL: 'test_vXf16'
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V32 = shufflevector <2 x half> %src32, <2 x half> undef, <2 x i32> <i32 1, i32 1>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V64 = shufflevector <4 x half> %src64, <4 x half> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x half> %src128, <8 x half> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <16 x half> %src256, <16 x half> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <32 x half> %src512, <32 x half> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <16 x half> %src256, <16 x half> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <32 x half> %src512, <32 x half> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; SSE42-LABEL: 'test_vXf16'
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V32 = shufflevector <2 x half> %src32, <2 x half> undef, <2 x i32> <i32 1, i32 1>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V64 = shufflevector <4 x half> %src64, <4 x half> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x half> %src128, <8 x half> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <16 x half> %src256, <16 x half> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <32 x half> %src512, <32 x half> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <16 x half> %src256, <16 x half> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <32 x half> %src512, <32 x half> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; AVX2-LABEL: 'test_vXf16'
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V32 = shufflevector <2 x half> %src32, <2 x half> undef, <2 x i32> <i32 1, i32 1>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V64 = shufflevector <4 x half> %src64, <4 x half> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x half> %src128, <8 x half> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V256 = shufflevector <16 x half> %src256, <16 x half> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V512 = shufflevector <32 x half> %src512, <32 x half> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <16 x half> %src256, <16 x half> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <32 x half> %src512, <32 x half> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
-; AVX512F-LABEL: 'test_vXf16'
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <2 x half> %src32, <2 x half> undef, <2 x i32> <i32 1, i32 1>
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <4 x half> %src64, <4 x half> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x half> %src128, <8 x half> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V256 = shufflevector <16 x half> %src256, <16 x half> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V512 = shufflevector <32 x half> %src512, <32 x half> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
-;
-; AVX512BW-LABEL: 'test_vXf16'
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <2 x half> %src32, <2 x half> undef, <2 x i32> <i32 1, i32 1>
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <4 x half> %src64, <4 x half> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x half> %src128, <8 x half> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <16 x half> %src256, <16 x half> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <32 x half> %src512, <32 x half> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
-;
-; AVX512VBMI-LABEL: 'test_vXf16'
-; AVX512VBMI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <2 x half> %src32, <2 x half> undef, <2 x i32> <i32 1, i32 1>
-; AVX512VBMI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <4 x half> %src64, <4 x half> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-; AVX512VBMI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x half> %src128, <8 x half> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512VBMI-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <16 x half> %src256, <16 x half> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512VBMI-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <32 x half> %src512, <32 x half> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512VBMI-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+; AVX512-LABEL: 'test_vXf16'
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <2 x half> %src32, <2 x half> undef, <2 x i32> <i32 1, i32 1>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <4 x half> %src64, <4 x half> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x half> %src128, <8 x half> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <16 x half> %src256, <16 x half> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <32 x half> %src512, <32 x half> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
   %V32  = shufflevector <2 x half> %src32, <2 x half> undef, <2 x i32> <i32 1, i32 1>
   %V64  = shufflevector <4 x half> %src64, <4 x half> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
@@ -260,66 +244,50 @@ define void @test_vXi16(<2 x i16> %src32, <4 x i16> %src64, <8 x i16> %src128, <
 ; SSE2-LABEL: 'test_vXi16'
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <2 x i16> %src32, <2 x i16> undef, <2 x i32> <i32 1, i32 1>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <4 x i16> %src64, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V128 = shufflevector <8 x i16> %src128, <8 x i16> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V128 = shufflevector <8 x i16> %src128, <8 x i16> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; SSSE3-LABEL: 'test_vXi16'
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <2 x i16> %src32, <2 x i16> undef, <2 x i32> <i32 1, i32 1>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <4 x i16> %src64, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x i16> %src128, <8 x i16> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; SSE42-LABEL: 'test_vXi16'
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <2 x i16> %src32, <2 x i16> undef, <2 x i32> <i32 1, i32 1>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <4 x i16> %src64, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x i16> %src128, <8 x i16> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; AVX1-LABEL: 'test_vXi16'
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <2 x i16> %src32, <2 x i16> undef, <2 x i32> <i32 1, i32 1>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <4 x i16> %src64, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x i16> %src128, <8 x i16> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; AVX2-LABEL: 'test_vXi16'
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <2 x i16> %src32, <2 x i16> undef, <2 x i32> <i32 1, i32 1>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <4 x i16> %src64, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x i16> %src128, <8 x i16> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
-; AVX512F-LABEL: 'test_vXi16'
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <2 x i16> %src32, <2 x i16> undef, <2 x i32> <i32 1, i32 1>
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <4 x i16> %src64, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x i16> %src128, <8 x i16> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
-;
-; AVX512BW-LABEL: 'test_vXi16'
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <2 x i16> %src32, <2 x i16> undef, <2 x i32> <i32 1, i32 1>
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <4 x i16> %src64, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x i16> %src128, <8 x i16> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
-;
-; AVX512VBMI-LABEL: 'test_vXi16'
-; AVX512VBMI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <2 x i16> %src32, <2 x i16> undef, <2 x i32> <i32 1, i32 1>
-; AVX512VBMI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <4 x i16> %src64, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-; AVX512VBMI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x i16> %src128, <8 x i16> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512VBMI-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512VBMI-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512VBMI-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+; AVX512-LABEL: 'test_vXi16'
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <2 x i16> %src32, <2 x i16> undef, <2 x i32> <i32 1, i32 1>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <4 x i16> %src64, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x i16> %src128, <8 x i16> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
   %V32  = shufflevector <2 x i16> %src32, <2 x i16> undef, <2 x i32> <i32 1, i32 1>
   %V64  = shufflevector <4 x i16> %src64, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
@@ -332,11 +300,11 @@ define void @test_vXi16(<2 x i16> %src32, <4 x i16> %src64, <8 x i16> %src128, <
 define void @test_vXi8(<2 x i8> %src16, <4 x i8> %src32, <8 x i8> %src64, <16 x i8> %src128, <32 x i8> %src256, <64 x i8> %src512) {
 ; SSE2-LABEL: 'test_vXi8'
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16 = shufflevector <2 x i8> %src16, <2 x i8> undef, <2 x i32> <i32 1, i32 1>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V32 = shufflevector <4 x i8> %src32, <4 x i8> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V64 = shufflevector <8 x i8> %src64, <8 x i8> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V128 = shufflevector <16 x i8> %src128, <16 x i8> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32 = shufflevector <4 x i8> %src32, <4 x i8> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64 = shufflevector <8 x i8> %src64, <8 x i8> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V128 = shufflevector <16 x i8> %src128, <16 x i8> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; SSSE3-LABEL: 'test_vXi8'
@@ -344,8 +312,8 @@ define void @test_vXi8(<2 x i8> %src16, <4 x i8> %src32, <8 x i8> %src64, <16 x
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <4 x i8> %src32, <4 x i8> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <8 x i8> %src64, <8 x i8> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <16 x i8> %src128, <16 x i8> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; SSE42-LABEL: 'test_vXi8'
@@ -353,8 +321,8 @@ define void @test_vXi8(<2 x i8> %src16, <4 x i8> %src32, <8 x i8> %src64, <16 x
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <4 x i8> %src32, <4 x i8> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <8 x i8> %src64, <8 x i8> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <16 x i8> %src128, <16 x i8> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; AVX1-LABEL: 'test_vXi8'
@@ -362,8 +330,8 @@ define void @test_vXi8(<2 x i8> %src16, <4 x i8> %src32, <8 x i8> %src64, <16 x
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <4 x i8> %src32, <4 x i8> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <8 x i8> %src64, <8 x i8> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <16 x i8> %src128, <16 x i8> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; AVX2-LABEL: 'test_vXi8'
@@ -371,36 +339,18 @@ define void @test_vXi8(<2 x i8> %src16, <4 x i8> %src32, <8 x i8> %src64, <16 x
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <4 x i8> %src32, <4 x i8> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <8 x i8> %src64, <8 x i8> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <16 x i8> %src128, <16 x i8> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
-; AVX512F-LABEL: 'test_vXi8'
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16 = shufflevector <2 x i8> %src16, <2 x i8> undef, <2 x i32> <i32 1, i32 1>
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <4 x i8> %src32, <4 x i8> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <8 x i8> %src64, <8 x i8> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <16 x i8> %src128, <16 x i8> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
-;
-; AVX512BW-LABEL: 'test_vXi8'
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16 = shufflevector <2 x i8> %src16, <2 x i8> undef, <2 x i32> <i32 1, i32 1>
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <4 x i8> %src32, <4 x i8> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <8 x i8> %src64, <8 x i8> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <16 x i8> %src128, <16 x i8> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
-;
-; AVX512VBMI-LABEL: 'test_vXi8'
-; AVX512VBMI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16 = shufflevector <2 x i8> %src16, <2 x i8> undef, <2 x i32> <i32 1, i32 1>
-; AVX512VBMI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <4 x i8> %src32, <4 x i8> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-; AVX512VBMI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <8 x i8> %src64, <8 x i8> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512VBMI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <16 x i8> %src128, <16 x i8> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512VBMI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512VBMI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512VBMI-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+; AVX512-LABEL: 'test_vXi8'
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16 = shufflevector <2 x i8> %src16, <2 x i8> undef, <2 x i32> <i32 1, i32 1>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <4 x i8> %src32, <4 x i8> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <8 x i8> %src64, <8 x i8> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <16 x i8> %src128, <16 x i8> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
   %V16  = shufflevector <2 x i8> %src16, <2 x i8> undef, <2 x i32> <i32 1, i32 1>
   %V32  = shufflevector <4 x i8> %src32, <4 x i8> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
@@ -415,10 +365,10 @@ define void @test_vXi1(<2 x i1> %src2, <4 x i1> %src4, <8 x i1> %src8, <16 x i1>
 ; SSE2-LABEL: 'test_vXi1'
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2 = shufflevector <2 x i1> %src2, <2 x i1> undef, <2 x i32> <i32 1, i32 1>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = shufflevector <4 x i1> %src4, <4 x i1> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8 = shufflevector <8 x i1> %src8, <8 x i1> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V16 = shufflevector <16 x i1> %src16, <16 x i1> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 480 for instruction: %V32 = shufflevector <32 x i1> %src32, <32 x i1> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 960 for instruction: %V64 = shufflevector <64 x i1> %src64, <64 x i1> undef, <64 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8 = shufflevector <8 x i1> %src8, <8 x i1> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V16 = shufflevector <16 x i1> %src16, <16 x i1> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V32 = shufflevector <32 x i1> %src32, <32 x i1> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V64 = shufflevector <64 x i1> %src64, <64 x i1> undef, <64 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; SSSE3-LABEL: 'test_vXi1'
@@ -426,8 +376,8 @@ define void @test_vXi1(<2 x i1> %src2, <4 x i1> %src4, <8 x i1> %src8, <16 x i1>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = shufflevector <4 x i1> %src4, <4 x i1> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = shufflevector <8 x i1> %src8, <8 x i1> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16 = shufflevector <16 x i1> %src16, <16 x i1> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %V32 = shufflevector <32 x i1> %src32, <32 x i1> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %V64 = shufflevector <64 x i1> %src64, <64 x i1> undef, <64 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <32 x i1> %src32, <32 x i1> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <64 x i1> %src64, <64 x i1> undef, <64 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; SSE42-LABEL: 'test_vXi1'
@@ -435,8 +385,8 @@ define void @test_vXi1(<2 x i1> %src2, <4 x i1> %src4, <8 x i1> %src8, <16 x i1>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = shufflevector <4 x i1> %src4, <4 x i1> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = shufflevector <8 x i1> %src8, <8 x i1> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16 = shufflevector <16 x i1> %src16, <16 x i1> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V32 = shufflevector <32 x i1> %src32, <32 x i1> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V64 = shufflevector <64 x i1> %src64, <64 x i1> undef, <64 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <32 x i1> %src32, <32 x i1> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <64 x i1> %src64, <64 x i1> undef, <64 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; AVX1-LABEL: 'test_vXi1'
@@ -444,8 +394,8 @@ define void @test_vXi1(<2 x i1> %src2, <4 x i1> %src4, <8 x i1> %src8, <16 x i1>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = shufflevector <4 x i1> %src4, <4 x i1> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = shufflevector <8 x i1> %src8, <8 x i1> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16 = shufflevector <16 x i1> %src16, <16 x i1> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32 = shufflevector <32 x i1> %src32, <32 x i1> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %V64 = shufflevector <64 x i1> %src64, <64 x i1> undef, <64 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32 = shufflevector <32 x i1> %src32, <32 x i1> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64 = shufflevector <64 x i1> %src64, <64 x i1> undef, <64 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; AVX2-LABEL: 'test_vXi1'
@@ -453,17 +403,17 @@ define void @test_vXi1(<2 x i1> %src2, <4 x i1> %src4, <8 x i1> %src8, <16 x i1>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = shufflevector <4 x i1> %src4, <4 x i1> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = shufflevector <8 x i1> %src8, <8 x i1> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16 = shufflevector <16 x i1> %src16, <16 x i1> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32 = shufflevector <32 x i1> %src32, <32 x i1> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %V64 = shufflevector <64 x i1> %src64, <64 x i1> undef, <64 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <32 x i1> %src32, <32 x i1> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <64 x i1> %src64, <64 x i1> undef, <64 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; AVX512-LABEL: 'test_vXi1'
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = shufflevector <2 x i1> %src2, <2 x i1> undef, <2 x i32> <i32 1, i32 1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = shufflevector <4 x i1> %src4, <4 x i1> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8 = shufflevector <8 x i1> %src8, <8 x i1> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16 = shufflevector <16 x i1> %src16, <16 x i1> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V32 = shufflevector <32 x i1> %src32, <32 x i1> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V64 = shufflevector <64 x i1> %src64, <64 x i1> undef, <64 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = shufflevector <2 x i1> %src2, <2 x i1> undef, <2 x i32> <i32 1, i32 1>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4 = shufflevector <4 x i1> %src4, <4 x i1> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V8 = shufflevector <8 x i1> %src8, <8 x i1> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V16 = shufflevector <16 x i1> %src16, <16 x i1> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V32 = shufflevector <32 x i1> %src32, <32 x i1> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: %V64 = shufflevector <64 x i1> %src64, <64 x i1> undef, <64 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
   %V2  = shufflevector <2 x i1> %src2, <2 x i1> undef, <2 x i32> <i32 1, i32 1>
@@ -483,22 +433,22 @@ define void @test_upper_vXf32(<2 x float> %a64, <2 x float> %b64, <4 x float> %a
 ; SSE-LABEL: 'test_upper_vXf32'
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <2 x float> %a64, <2 x float> %b64, <2 x i32> <i32 3, i32 3>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <4 x float> %a128, <4 x float> %b128, <4 x i32> <i32 6, i32 6, i32 6, i32 6>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <8 x float> %a256, <8 x float> %b256, <8 x i32> <i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <16 x float> %a512, <16 x float> %b512, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <8 x float> %a256, <8 x float> %b256, <8 x i32> <i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <16 x float> %a512, <16 x float> %b512, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; AVX1-LABEL: 'test_upper_vXf32'
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <2 x float> %a64, <2 x float> %b64, <2 x i32> <i32 3, i32 3>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <4 x float> %a128, <4 x float> %b128, <4 x i32> <i32 6, i32 6, i32 6, i32 6>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V256 = shufflevector <8 x float> %a256, <8 x float> %b256, <8 x i32> <i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V512 = shufflevector <16 x float> %a512, <16 x float> %b512, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <8 x float> %a256, <8 x float> %b256, <8 x i32> <i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <16 x float> %a512, <16 x float> %b512, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; AVX2-LABEL: 'test_upper_vXf32'
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <2 x float> %a64, <2 x float> %b64, <2 x i32> <i32 3, i32 3>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <4 x float> %a128, <4 x float> %b128, <4 x i32> <i32 6, i32 6, i32 6, i32 6>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <8 x float> %a256, <8 x float> %b256, <8 x i32> <i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <16 x float> %a512, <16 x float> %b512, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <16 x float> %a512, <16 x float> %b512, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; AVX512-LABEL: 'test_upper_vXf32'
diff --git a/llvm/test/Analysis/CostModel/X86/shuffle-splice-codesize.ll b/llvm/test/Analysis/CostModel/X86/shuffle-splice-codesize.ll
index f67d681..3183331 100644
--- a/llvm/test/Analysis/CostModel/X86/shuffle-splice-codesize.ll
+++ b/llvm/test/Analysis/CostModel/X86/shuffle-splice-codesize.ll
@@ -71,7 +71,7 @@ define void @test_vXf32(<2 x float> %a64, <2 x float> %b64, <4 x float> %a128, <
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64 = shufflevector <2 x float> %a64, <2 x float> %b64, <2 x i32> <i32 1, i32 2>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V128 = shufflevector <4 x float> %a128, <4 x float> %b128, <4 x i32> <i32 3, i32 4, i32 5, i32 6>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V256 = shufflevector <8 x float> %a256, <8 x float> %b256, <8 x i32> <i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V512 = shufflevector <16 x float> %a512, <16 x float> %b512, <16 x i32> <i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <16 x float> %a512, <16 x float> %b512, <16 x i32> <i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; SSSE3-LABEL: 'test_vXf32'
@@ -120,21 +120,21 @@ define void @test_vXi32(<2 x i32> %a64, <2 x i32> %b64, <4 x i32> %a128, <4 x i3
 ; SSE2-LABEL: 'test_vXi32'
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64 = shufflevector <2 x i32> %a64, <2 x i32> %b64, <2 x i32> <i32 1, i32 2>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V128 = shufflevector <4 x i32> %a128, <4 x i32> %b128, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V256 = shufflevector <8 x i32> %a256, <8 x i32> %b256, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256 = shufflevector <8 x i32> %a256, <8 x i32> %b256, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V512 = shufflevector <16 x i32> %a512, <16 x i32> %b512, <16 x i32> <i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; SSSE3-LABEL: 'test_vXi32'
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <2 x i32> %a64, <2 x i32> %b64, <2 x i32> <i32 1, i32 2>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <4 x i32> %a128, <4 x i32> %b128, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <8 x i32> %a256, <8 x i32> %b256, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256 = shufflevector <8 x i32> %a256, <8 x i32> %b256, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <16 x i32> %a512, <16 x i32> %b512, <16 x i32> <i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; SSE42-LABEL: 'test_vXi32'
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <2 x i32> %a64, <2 x i32> %b64, <2 x i32> <i32 1, i32 2>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <4 x i32> %a128, <4 x i32> %b128, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <8 x i32> %a256, <8 x i32> %b256, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256 = shufflevector <8 x i32> %a256, <8 x i32> %b256, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <16 x i32> %a512, <16 x i32> %b512, <16 x i32> <i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
diff --git a/llvm/test/Analysis/CostModel/X86/shuffle-splice-latency.ll b/llvm/test/Analysis/CostModel/X86/shuffle-splice-latency.ll
index 8b02b82..4783f23 100644
--- a/llvm/test/Analysis/CostModel/X86/shuffle-splice-latency.ll
+++ b/llvm/test/Analysis/CostModel/X86/shuffle-splice-latency.ll
@@ -71,7 +71,7 @@ define void @test_vXf32(<2 x float> %a64, <2 x float> %b64, <4 x float> %a128, <
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64 = shufflevector <2 x float> %a64, <2 x float> %b64, <2 x i32> <i32 1, i32 2>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V128 = shufflevector <4 x float> %a128, <4 x float> %b128, <4 x i32> <i32 3, i32 4, i32 5, i32 6>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V256 = shufflevector <8 x float> %a256, <8 x float> %b256, <8 x i32> <i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V512 = shufflevector <16 x float> %a512, <16 x float> %b512, <16 x i32> <i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <16 x float> %a512, <16 x float> %b512, <16 x i32> <i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; SSSE3-LABEL: 'test_vXf32'
@@ -120,21 +120,21 @@ define void @test_vXi32(<2 x i32> %a64, <2 x i32> %b64, <4 x i32> %a128, <4 x i3
 ; SSE2-LABEL: 'test_vXi32'
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64 = shufflevector <2 x i32> %a64, <2 x i32> %b64, <2 x i32> <i32 1, i32 2>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V128 = shufflevector <4 x i32> %a128, <4 x i32> %b128, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V256 = shufflevector <8 x i32> %a256, <8 x i32> %b256, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256 = shufflevector <8 x i32> %a256, <8 x i32> %b256, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V512 = shufflevector <16 x i32> %a512, <16 x i32> %b512, <16 x i32> <i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; SSSE3-LABEL: 'test_vXi32'
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <2 x i32> %a64, <2 x i32> %b64, <2 x i32> <i32 1, i32 2>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <4 x i32> %a128, <4 x i32> %b128, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <8 x i32> %a256, <8 x i32> %b256, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256 = shufflevector <8 x i32> %a256, <8 x i32> %b256, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <16 x i32> %a512, <16 x i32> %b512, <16 x i32> <i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; SSE42-LABEL: 'test_vXi32'
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <2 x i32> %a64, <2 x i32> %b64, <2 x i32> <i32 1, i32 2>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <4 x i32> %a128, <4 x i32> %b128, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <8 x i32> %a256, <8 x i32> %b256, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256 = shufflevector <8 x i32> %a256, <8 x i32> %b256, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <16 x i32> %a512, <16 x i32> %b512, <16 x i32> <i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
diff --git a/llvm/test/Analysis/CostModel/X86/shuffle-splice-sizelatency.ll b/llvm/test/Analysis/CostModel/X86/shuffle-splice-sizelatency.ll
index 65558da..86eabc7 100644
--- a/llvm/test/Analysis/CostModel/X86/shuffle-splice-sizelatency.ll
+++ b/llvm/test/Analysis/CostModel/X86/shuffle-splice-sizelatency.ll
@@ -71,7 +71,7 @@ define void @test_vXf32(<2 x float> %a64, <2 x float> %b64, <4 x float> %a128, <
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64 = shufflevector <2 x float> %a64, <2 x float> %b64, <2 x i32> <i32 1, i32 2>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V128 = shufflevector <4 x float> %a128, <4 x float> %b128, <4 x i32> <i32 3, i32 4, i32 5, i32 6>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V256 = shufflevector <8 x float> %a256, <8 x float> %b256, <8 x i32> <i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V512 = shufflevector <16 x float> %a512, <16 x float> %b512, <16 x i32> <i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <16 x float> %a512, <16 x float> %b512, <16 x i32> <i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; SSSE3-LABEL: 'test_vXf32'
@@ -120,21 +120,21 @@ define void @test_vXi32(<2 x i32> %a64, <2 x i32> %b64, <4 x i32> %a128, <4 x i3
 ; SSE2-LABEL: 'test_vXi32'
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64 = shufflevector <2 x i32> %a64, <2 x i32> %b64, <2 x i32> <i32 1, i32 2>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V128 = shufflevector <4 x i32> %a128, <4 x i32> %b128, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V256 = shufflevector <8 x i32> %a256, <8 x i32> %b256, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256 = shufflevector <8 x i32> %a256, <8 x i32> %b256, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V512 = shufflevector <16 x i32> %a512, <16 x i32> %b512, <16 x i32> <i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; SSSE3-LABEL: 'test_vXi32'
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <2 x i32> %a64, <2 x i32> %b64, <2 x i32> <i32 1, i32 2>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <4 x i32> %a128, <4 x i32> %b128, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <8 x i32> %a256, <8 x i32> %b256, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256 = shufflevector <8 x i32> %a256, <8 x i32> %b256, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <16 x i32> %a512, <16 x i32> %b512, <16 x i32> <i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; SSE42-LABEL: 'test_vXi32'
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <2 x i32> %a64, <2 x i32> %b64, <2 x i32> <i32 1, i32 2>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <4 x i32> %a128, <4 x i32> %b128, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <8 x i32> %a256, <8 x i32> %b256, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256 = shufflevector <8 x i32> %a256, <8 x i32> %b256, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <16 x i32> %a512, <16 x i32> %b512, <16 x i32> <i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
diff --git a/llvm/test/Analysis/CostModel/X86/shuffle-splice.ll b/llvm/test/Analysis/CostModel/X86/shuffle-splice.ll
index b687df5..e0bf638 100644
--- a/llvm/test/Analysis/CostModel/X86/shuffle-splice.ll
+++ b/llvm/test/Analysis/CostModel/X86/shuffle-splice.ll
@@ -71,7 +71,7 @@ define void @test_vXf32(<2 x float> %a64, <2 x float> %b64, <4 x float> %a128, <
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64 = shufflevector <2 x float> %a64, <2 x float> %b64, <2 x i32> <i32 1, i32 2>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V128 = shufflevector <4 x float> %a128, <4 x float> %b128, <4 x i32> <i32 3, i32 4, i32 5, i32 6>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V256 = shufflevector <8 x float> %a256, <8 x float> %b256, <8 x i32> <i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V512 = shufflevector <16 x float> %a512, <16 x float> %b512, <16 x i32> <i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <16 x float> %a512, <16 x float> %b512, <16 x i32> <i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; SSSE3-LABEL: 'test_vXf32'
@@ -120,21 +120,21 @@ define void @test_vXi32(<2 x i32> %a64, <2 x i32> %b64, <4 x i32> %a128, <4 x i3
 ; SSE2-LABEL: 'test_vXi32'
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64 = shufflevector <2 x i32> %a64, <2 x i32> %b64, <2 x i32> <i32 1, i32 2>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V128 = shufflevector <4 x i32> %a128, <4 x i32> %b128, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V256 = shufflevector <8 x i32> %a256, <8 x i32> %b256, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256 = shufflevector <8 x i32> %a256, <8 x i32> %b256, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V512 = shufflevector <16 x i32> %a512, <16 x i32> %b512, <16 x i32> <i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; SSSE3-LABEL: 'test_vXi32'
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <2 x i32> %a64, <2 x i32> %b64, <2 x i32> <i32 1, i32 2>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <4 x i32> %a128, <4 x i32> %b128, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <8 x i32> %a256, <8 x i32> %b256, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256 = shufflevector <8 x i32> %a256, <8 x i32> %b256, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <16 x i32> %a512, <16 x i32> %b512, <16 x i32> <i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; SSE42-LABEL: 'test_vXi32'
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <2 x i32> %a64, <2 x i32> %b64, <2 x i32> <i32 1, i32 2>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <4 x i32> %a128, <4 x i32> %b128, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <8 x i32> %a256, <8 x i32> %b256, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256 = shufflevector <8 x i32> %a256, <8 x i32> %b256, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <16 x i32> %a512, <16 x i32> %b512, <16 x i32> <i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
diff --git a/llvm/test/Analysis/CostModel/X86/shuffle-two-src-codesize.ll b/llvm/test/Analysis/CostModel/X86/shuffle-two-src-codesize.ll
index 027af62..b672df5 100644
--- a/llvm/test/Analysis/CostModel/X86/shuffle-two-src-codesize.ll
+++ b/llvm/test/Analysis/CostModel/X86/shuffle-two-src-codesize.ll
@@ -124,7 +124,7 @@ define void @test_vXf32(<2 x float> %src64, <4 x float> %src128, <8 x float> %sr
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64 = shufflevector <2 x float> %src64, <2 x float> %src64_1, <2 x i32> <i32 3, i32 0>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V128 = shufflevector <4 x float> %src128, <4 x float> %src128_1, <4 x i32> <i32 3, i32 6, i32 1, i32 5>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V256 = shufflevector <8 x float> %src256, <8 x float> %src256_1, <8 x i32> <i32 7, i32 6, i32 8, i32 4, i32 3, i32 2, i32 12, i32 0>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V512 = shufflevector <16 x float> %src512, <16 x float> %src512_1, <16 x i32> <i32 15, i32 17, i32 13, i32 20, i32 11, i32 10, i32 8, i32 8, i32 7, i32 22, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V512 = shufflevector <16 x float> %src512, <16 x float> %src512_1, <16 x i32> <i32 15, i32 17, i32 13, i32 20, i32 11, i32 10, i32 8, i32 8, i32 7, i32 22, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V1024 = shufflevector <32 x float> %src1024, <32 x float> %src1024_1, <32 x i32> <i32 31, i32 33, i32 20, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 48, i32 13, i32 12, i32 11, i32 11, i32 9, i32 45, i32 7, i32 11, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
diff --git a/llvm/test/Analysis/CostModel/X86/shuffle-two-src-latency.ll b/llvm/test/Analysis/CostModel/X86/shuffle-two-src-latency.ll
index f9f045f..fc8c0cd 100644
--- a/llvm/test/Analysis/CostModel/X86/shuffle-two-src-latency.ll
+++ b/llvm/test/Analysis/CostModel/X86/shuffle-two-src-latency.ll
@@ -124,7 +124,7 @@ define void @test_vXf32(<2 x float> %src64, <4 x float> %src128, <8 x float> %sr
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64 = shufflevector <2 x float> %src64, <2 x float> %src64_1, <2 x i32> <i32 3, i32 0>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V128 = shufflevector <4 x float> %src128, <4 x float> %src128_1, <4 x i32> <i32 3, i32 6, i32 1, i32 5>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V256 = shufflevector <8 x float> %src256, <8 x float> %src256_1, <8 x i32> <i32 7, i32 6, i32 8, i32 4, i32 3, i32 2, i32 12, i32 0>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V512 = shufflevector <16 x float> %src512, <16 x float> %src512_1, <16 x i32> <i32 15, i32 17, i32 13, i32 20, i32 11, i32 10, i32 8, i32 8, i32 7, i32 22, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V512 = shufflevector <16 x float> %src512, <16 x float> %src512_1, <16 x i32> <i32 15, i32 17, i32 13, i32 20, i32 11, i32 10, i32 8, i32 8, i32 7, i32 22, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V1024 = shufflevector <32 x float> %src1024, <32 x float> %src1024_1, <32 x i32> <i32 31, i32 33, i32 20, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 48, i32 13, i32 12, i32 11, i32 11, i32 9, i32 45, i32 7, i32 11, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
diff --git a/llvm/test/Analysis/CostModel/X86/shuffle-two-src-sizelatency.ll b/llvm/test/Analysis/CostModel/X86/shuffle-two-src-sizelatency.ll
index 76690af..b48b620 100644
--- a/llvm/test/Analysis/CostModel/X86/shuffle-two-src-sizelatency.ll
+++ b/llvm/test/Analysis/CostModel/X86/shuffle-two-src-sizelatency.ll
@@ -124,7 +124,7 @@ define void @test_vXf32(<2 x float> %src64, <4 x float> %src128, <8 x float> %sr
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64 = shufflevector <2 x float> %src64, <2 x float> %src64_1, <2 x i32> <i32 3, i32 0>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V128 = shufflevector <4 x float> %src128, <4 x float> %src128_1, <4 x i32> <i32 3, i32 6, i32 1, i32 5>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V256 = shufflevector <8 x float> %src256, <8 x float> %src256_1, <8 x i32> <i32 7, i32 6, i32 8, i32 4, i32 3, i32 2, i32 12, i32 0>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V512 = shufflevector <16 x float> %src512, <16 x float> %src512_1, <16 x i32> <i32 15, i32 17, i32 13, i32 20, i32 11, i32 10, i32 8, i32 8, i32 7, i32 22, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V512 = shufflevector <16 x float> %src512, <16 x float> %src512_1, <16 x i32> <i32 15, i32 17, i32 13, i32 20, i32 11, i32 10, i32 8, i32 8, i32 7, i32 22, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V1024 = shufflevector <32 x float> %src1024, <32 x float> %src1024_1, <32 x i32> <i32 31, i32 33, i32 20, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 48, i32 13, i32 12, i32 11, i32 11, i32 9, i32 45, i32 7, i32 11, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
diff --git a/llvm/test/Analysis/CostModel/X86/shuffle-two-src.ll b/llvm/test/Analysis/CostModel/X86/shuffle-two-src.ll
index 034ec0a..efa0f2e 100644
--- a/llvm/test/Analysis/CostModel/X86/shuffle-two-src.ll
+++ b/llvm/test/Analysis/CostModel/X86/shuffle-two-src.ll
@@ -124,7 +124,7 @@ define void @test_vXf32(<2 x float> %src64, <4 x float> %src128, <8 x float> %sr
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64 = shufflevector <2 x float> %src64, <2 x float> %src64_1, <2 x i32> <i32 3, i32 0>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V128 = shufflevector <4 x float> %src128, <4 x float> %src128_1, <4 x i32> <i32 3, i32 6, i32 1, i32 5>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V256 = shufflevector <8 x float> %src256, <8 x float> %src256_1, <8 x i32> <i32 7, i32 6, i32 8, i32 4, i32 3, i32 2, i32 12, i32 0>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V512 = shufflevector <16 x float> %src512, <16 x float> %src512_1, <16 x i32> <i32 15, i32 17, i32 13, i32 20, i32 11, i32 10, i32 8, i32 8, i32 7, i32 22, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V512 = shufflevector <16 x float> %src512, <16 x float> %src512_1, <16 x i32> <i32 15, i32 17, i32 13, i32 20, i32 11, i32 10, i32 8, i32 8, i32 7, i32 22, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V1024 = shufflevector <32 x float> %src1024, <32 x float> %src1024_1, <32 x i32> <i32 31, i32 33, i32 20, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 48, i32 13, i32 12, i32 11, i32 11, i32 9, i32 45, i32 7, i32 11, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
diff --git a/llvm/test/Analysis/CostModel/X86/vector-insert-value.ll b/llvm/test/Analysis/CostModel/X86/vector-insert-value.ll
index 2524976..ee82e10 100644
--- a/llvm/test/Analysis/CostModel/X86/vector-insert-value.ll
+++ b/llvm/test/Analysis/CostModel/X86/vector-insert-value.ll
@@ -76,58 +76,58 @@ define i32 @insert_double(i32 %arg, double %val, <2 x double> %src128, <4 x doub
 define i32 @insert_float(i32 %arg, float %val, <2 x float> %src64, <4 x float> %src128, <8 x float> %src256, <16 x float> %src512) {
 ; SSE2-LABEL: 'insert_float'
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2f32_a = insertelement <2 x float> %src64, float %val, i32 %arg
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f32_0 = insertelement <2 x float> %src64, float %val, i32 0
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f32_0 = insertelement <2 x float> %src64, float %val, i32 0
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f32_1 = insertelement <2 x float> %src64, float %val, i32 1
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4f32_a = insertelement <4 x float> %src128, float %val, i32 %arg
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f32_0 = insertelement <4 x float> %src128, float %val, i32 0
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4f32_0 = insertelement <4 x float> %src128, float %val, i32 0
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f32_3 = insertelement <4 x float> %src128, float %val, i32 3
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v8f32_a = insertelement <8 x float> %src256, float %val, i32 %arg
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f32_0 = insertelement <8 x float> %src256, float %val, i32 0
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8f32_0 = insertelement <8 x float> %src256, float %val, i32 0
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f32_3 = insertelement <8 x float> %src256, float %val, i32 3
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f32_4 = insertelement <8 x float> %src256, float %val, i32 4
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8f32_4 = insertelement <8 x float> %src256, float %val, i32 4
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f32_7 = insertelement <8 x float> %src256, float %val, i32 7
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v16f32_a = insertelement <16 x float> %src512, float %val, i32 %arg
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16f32_0 = insertelement <16 x float> %src512, float %val, i32 0
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16f32_0 = insertelement <16 x float> %src512, float %val, i32 0
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16f32_3 = insertelement <16 x float> %src512, float %val, i32 3
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16f32_8 = insertelement <16 x float> %src512, float %val, i32 8
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16f32_8 = insertelement <16 x float> %src512, float %val, i32 8
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16f32_15 = insertelement <16 x float> %src512, float %val, i32 15
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSE3-LABEL: 'insert_float'
 ; SSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2f32_a = insertelement <2 x float> %src64, float %val, i32 %arg
-; SSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f32_0 = insertelement <2 x float> %src64, float %val, i32 0
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f32_0 = insertelement <2 x float> %src64, float %val, i32 0
 ; SSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f32_1 = insertelement <2 x float> %src64, float %val, i32 1
 ; SSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4f32_a = insertelement <4 x float> %src128, float %val, i32 %arg
-; SSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f32_0 = insertelement <4 x float> %src128, float %val, i32 0
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4f32_0 = insertelement <4 x float> %src128, float %val, i32 0
 ; SSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f32_3 = insertelement <4 x float> %src128, float %val, i32 3
 ; SSE3-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v8f32_a = insertelement <8 x float> %src256, float %val, i32 %arg
-; SSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f32_0 = insertelement <8 x float> %src256, float %val, i32 0
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8f32_0 = insertelement <8 x float> %src256, float %val, i32 0
 ; SSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f32_3 = insertelement <8 x float> %src256, float %val, i32 3
-; SSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f32_4 = insertelement <8 x float> %src256, float %val, i32 4
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8f32_4 = insertelement <8 x float> %src256, float %val, i32 4
 ; SSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f32_7 = insertelement <8 x float> %src256, float %val, i32 7
 ; SSE3-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v16f32_a = insertelement <16 x float> %src512, float %val, i32 %arg
-; SSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16f32_0 = insertelement <16 x float> %src512, float %val, i32 0
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16f32_0 = insertelement <16 x float> %src512, float %val, i32 0
 ; SSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16f32_3 = insertelement <16 x float> %src512, float %val, i32 3
-; SSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16f32_8 = insertelement <16 x float> %src512, float %val, i32 8
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16f32_8 = insertelement <16 x float> %src512, float %val, i32 8
 ; SSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16f32_15 = insertelement <16 x float> %src512, float %val, i32 15
 ; SSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSSE3-LABEL: 'insert_float'
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2f32_a = insertelement <2 x float> %src64, float %val, i32 %arg
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f32_0 = insertelement <2 x float> %src64, float %val, i32 0
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f32_0 = insertelement <2 x float> %src64, float %val, i32 0
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f32_1 = insertelement <2 x float> %src64, float %val, i32 1
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4f32_a = insertelement <4 x float> %src128, float %val, i32 %arg
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f32_0 = insertelement <4 x float> %src128, float %val, i32 0
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4f32_0 = insertelement <4 x float> %src128, float %val, i32 0
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f32_3 = insertelement <4 x float> %src128, float %val, i32 3
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v8f32_a = insertelement <8 x float> %src256, float %val, i32 %arg
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f32_0 = insertelement <8 x float> %src256, float %val, i32 0
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8f32_0 = insertelement <8 x float> %src256, float %val, i32 0
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f32_3 = insertelement <8 x float> %src256, float %val, i32 3
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f32_4 = insertelement <8 x float> %src256, float %val, i32 4
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8f32_4 = insertelement <8 x float> %src256, float %val, i32 4
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f32_7 = insertelement <8 x float> %src256, float %val, i32 7
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v16f32_a = insertelement <16 x float> %src512, float %val, i32 %arg
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16f32_0 = insertelement <16 x float> %src512, float %val, i32 0
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16f32_0 = insertelement <16 x float> %src512, float %val, i32 0
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16f32_3 = insertelement <16 x float> %src512, float %val, i32 3
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16f32_8 = insertelement <16 x float> %src512, float %val, i32 8
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16f32_8 = insertelement <16 x float> %src512, float %val, i32 8
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16f32_15 = insertelement <16 x float> %src512, float %val, i32 15
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
diff --git a/llvm/test/Analysis/Lint/abi-attrs.ll b/llvm/test/Analysis/Lint/abi-attrs.ll
new file mode 100644
index 0000000..5a3ece6
--- /dev/null
+++ b/llvm/test/Analysis/Lint/abi-attrs.ll
@@ -0,0 +1,106 @@
+; RUN: opt < %s -passes=lint -disable-output 2>&1 | FileCheck %s
+
+declare void @fn_nothing_i8(i8 %x)
+declare void @fn_zeroext(i8 zeroext %x)
+declare void @fn_signext(i8 signext %x)
+declare void @fn_inreg(i8 inreg %x)
+
+declare void @fn_nothing_ptr(ptr %x)
+declare void @fn_byval(ptr byval(i8) %x)
+declare void @fn_byref(ptr byref(i8) %x)
+declare void @fn_inalloca(ptr inalloca(i8) %x)
+declare void @fn_preallocated(ptr preallocated(i8) %x)
+declare void @fn_sret(ptr sret(i8) %x)
+
+define void @caller_zeroext(i8 %x) {
+; CHECK: Undefined behavior: ABI attribute zeroext not present on both function and call-site
+; CHECK:  call void @fn_zeroext(i8 %x)
+  call void @fn_zeroext(i8 %x)
+
+; CHECK: Undefined behavior: ABI attribute zeroext not present on both function and call-site
+; CHECK:  call void @fn_nothing_i8(i8 zeroext %x)
+  call void @fn_nothing_i8(i8 zeroext %x)
+  ret void
+}
+
+define void @caller_signext(i8 %x) {
+; CHECK: Undefined behavior: ABI attribute signext not present on both function and call-site
+; CHECK:  call void @fn_signext(i8 %x)
+  call void @fn_signext(i8 %x)
+
+; CHECK: Undefined behavior: ABI attribute signext not present on both function and call-site
+; CHECK:  call void @fn_nothing_i8(i8 signext %x)
+  call void @fn_nothing_i8(i8 signext %x)
+  ret void
+}
+
+define void @caller_inreg(i8 %x) {
+; CHECK: Undefined behavior: ABI attribute inreg not present on both function and call-site
+; CHECK:  call void @fn_inreg(i8 %x)
+  call void @fn_inreg(i8 %x)
+
+; CHECK: Undefined behavior: ABI attribute inreg not present on both function and call-site
+; CHECK:  call void @fn_nothing_i8(i8 inreg %x)
+  call void @fn_nothing_i8(i8 inreg %x)
+  ret void
+}
+
+define void @caller_byval(ptr %x) {
+; CHECK: Undefined behavior: ABI attribute byval not present on both function and call-site
+; CHECK:  call void @fn_byval(ptr %x)
+  call void @fn_byval(ptr %x)
+
+; CHECK: Undefined behavior: ABI attribute byval not present on both function and call-site
+; CHECK:  call void @fn_nothing_ptr(ptr byval(i8) %x)
+  call void @fn_nothing_ptr(ptr byval(i8) %x)
+
+; CHECK: Undefined behavior: ABI attribute byval does not have same argument for function and call-site
+; CHECK:  call void @fn_byval(ptr byval(i16) %x)
+  call void @fn_byval(ptr byval(i16) %x)
+  ret void
+}
+
+define void @caller_byref(ptr %x) {
+; CHECK: Undefined behavior: ABI attribute byref not present on both function and call-site
+; CHECK:  call void @fn_byref(ptr %x)
+  call void @fn_byref(ptr %x)
+
+; CHECK: Undefined behavior: ABI attribute byref not present on both function and call-site
+; CHECK:  call void @fn_nothing_ptr(ptr byref(i8) %x)
+  call void @fn_nothing_ptr(ptr byref(i8) %x)
+
+; CHECK: Undefined behavior: ABI attribute byref does not have same argument for function and call-site
+; CHECK:  call void @fn_byref(ptr byref(i16) %x)
+  call void @fn_byref(ptr byref(i16) %x)
+  ret void
+}
+
+define void @caller_inalloca(ptr %x) {
+; CHECK: Undefined behavior: ABI attribute inalloca not present on both function and call-site
+; CHECK:  call void @fn_inalloca(ptr %x)
+  call void @fn_inalloca(ptr %x)
+
+; CHECK: Undefined behavior: ABI attribute inalloca not present on both function and call-site
+; CHECK:  call void @fn_nothing_ptr(ptr inalloca(i8) %x)
+  call void @fn_nothing_ptr(ptr inalloca(i8) %x)
+
+; CHECK: Undefined behavior: ABI attribute inalloca does not have same argument for function and call-site
+; CHECK:  call void @fn_inalloca(ptr inalloca(i16) %x)
+  call void @fn_inalloca(ptr inalloca(i16) %x)
+  ret void
+}
+
+define void @caller_sret(ptr %x) {
+; CHECK: Undefined behavior: ABI attribute sret not present on both function and call-site
+; CHECK:  call void @fn_sret(ptr %x)
+  call void @fn_sret(ptr %x)
+
+; CHECK: Undefined behavior: ABI attribute sret not present on both function and call-site
+; CHECK:  call void @fn_nothing_ptr(ptr sret(i8) %x)
+  call void @fn_nothing_ptr(ptr sret(i8) %x)
+
+; CHECK: Undefined behavior: ABI attribute sret does not have same argument for function and call-site
+; CHECK:  call void @fn_sret(ptr sret(i16) %x)
+  call void @fn_sret(ptr sret(i16) %x)
+  ret void
+}
diff --git a/llvm/test/Analysis/ScalarEvolution/backedge-taken-count-guard-info-with-multiple-predecessors.ll b/llvm/test/Analysis/ScalarEvolution/backedge-taken-count-guard-info-with-multiple-predecessors.ll
index 81fe96a..46dccf4 100644
--- a/llvm/test/Analysis/ScalarEvolution/backedge-taken-count-guard-info-with-multiple-predecessors.ll
+++ b/llvm/test/Analysis/ScalarEvolution/backedge-taken-count-guard-info-with-multiple-predecessors.ll
@@ -310,3 +310,29 @@ inner.header:
 exit:
   ret void
 }
+
+; Checks correct traversal for loops without a unique predecessor
+; outside the loop.
+define void @pr120615() {
+; CHECK-LABEL: pr120615
+; CHECK-NEXT:  Determining loop execution counts for: @pr120615
+; CHECK-NEXT:  Loop %header: backedge-taken count is i32 0
+; CHECK-NEXT:  Loop %header: constant max backedge-taken count is i32 0
+; CHECK-NEXT:  Loop %header: symbolic max backedge-taken count is i32 0
+; CHECK-NEXT:  Loop %header: Trip multiple is 1
+entry:
+  br label %header
+
+bb:
+  br label %header
+
+header:
+  %0 = phi i32 [ %1, %header ], [ 0, %bb ], [ 0, %entry ]
+  %1 = add i32 %0, 1
+  %icmp = icmp slt i32 %0, 0
+  br i1 %icmp, label %header, label %exit
+
+exit:
+  ret void
+
+}
diff --git a/llvm/test/Analysis/UniformityAnalysis/NVPTX/daorder.ll b/llvm/test/Analysis/UniformityAnalysis/NVPTX/daorder.ll
index 89d8c5a..14f33d7 100644
--- a/llvm/test/Analysis/UniformityAnalysis/NVPTX/daorder.ll
+++ b/llvm/test/Analysis/UniformityAnalysis/NVPTX/daorder.ll
@@ -3,7 +3,7 @@
 target datalayout = "e-i64:64-v16:16-v32:32-n16:32:64"
 target triple = "nvptx64-nvidia-cuda"
 
-define i32 @daorder(i32 %n) {
+define ptx_kernel i32 @daorder(i32 %n) {
 ; CHECK-LABEL: for function 'daorder'
 entry:
   %tid = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
@@ -43,6 +43,3 @@ declare i32 @llvm.nvvm.read.ptx.sreg.tid.x()
 declare i32 @llvm.nvvm.read.ptx.sreg.tid.y()
 declare i32 @llvm.nvvm.read.ptx.sreg.tid.z()
 declare i32 @llvm.nvvm.read.ptx.sreg.laneid()
-
-!nvvm.annotations = !{!0}
-!0 = !{ptr @daorder, !"kernel", i32 1}
diff --git a/llvm/test/Analysis/UniformityAnalysis/NVPTX/diverge.ll b/llvm/test/Analysis/UniformityAnalysis/NVPTX/diverge.ll
index 0ac1b5f5..cf8ffad 100644
--- a/llvm/test/Analysis/UniformityAnalysis/NVPTX/diverge.ll
+++ b/llvm/test/Analysis/UniformityAnalysis/NVPTX/diverge.ll
@@ -4,7 +4,7 @@ target datalayout = "e-i64:64-v16:16-v32:32-n16:32:64"
 target triple = "nvptx64-nvidia-cuda"
 
 ; return (n < 0 ? a + threadIdx.x : b + threadIdx.x)
-define i32 @no_diverge(i32 %n, i32 %a, i32 %b) {
+define ptx_kernel i32 @no_diverge(i32 %n, i32 %a, i32 %b) {
 ; CHECK-LABEL: for function 'no_diverge'
 entry:
   %tid = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
@@ -27,7 +27,7 @@ merge:
 ; if (threadIdx.x < 5)    // divergent: data dependent
 ;   c = b;
 ; return c;               // c is divergent: sync dependent
-define i32 @sync(i32 %a, i32 %b) {
+define ptx_kernel i32 @sync(i32 %a, i32 %b) {
 ; CHECK-LABEL: for function 'sync'
 bb1:
   %tid = call i32 @llvm.nvvm.read.ptx.sreg.tid.y()
@@ -49,7 +49,7 @@ bb3:
 ; }
 ; // c here is divergent because it is sync dependent on threadIdx.x >= 5
 ; return c;
-define i32 @mixed(i32 %n, i32 %a, i32 %b) {
+define ptx_kernel i32 @mixed(i32 %n, i32 %a, i32 %b) {
 ; CHECK-LABEL: for function 'mixed'
 bb1:
   %tid = call i32 @llvm.nvvm.read.ptx.sreg.tid.z()
@@ -101,7 +101,7 @@ merge:
 ; return i == 10 ? 0 : 1; // i here is divergent
 ;
 ; The i defined in the loop is used outside.
-define i32 @loop() {
+define ptx_kernel i32 @loop() {
 ; CHECK-LABEL: for function 'loop'
 entry:
   %laneid = call i32 @llvm.nvvm.read.ptx.sreg.laneid()
@@ -149,7 +149,7 @@ else:
 }
 
 ; Verifies sync-dependence is computed correctly in the absense of loops.
-define i32 @sync_no_loop(i32 %arg) {
+define ptx_kernel i32 @sync_no_loop(i32 %arg) {
 ; CHECK-LABEL: for function 'sync_no_loop'
 entry:
   %0 = add i32 %arg, 1
@@ -174,9 +174,3 @@ declare i32 @llvm.nvvm.read.ptx.sreg.tid.y()
 declare i32 @llvm.nvvm.read.ptx.sreg.tid.z()
 declare i32 @llvm.nvvm.read.ptx.sreg.laneid()
 
-!nvvm.annotations = !{!0, !1, !2, !3, !4}
-!0 = !{ptr @no_diverge, !"kernel", i32 1}
-!1 = !{ptr @sync, !"kernel", i32 1}
-!2 = !{ptr @mixed, !"kernel", i32 1}
-!3 = !{ptr @loop, !"kernel", i32 1}
-!4 = !{ptr @sync_no_loop, !"kernel", i32 1}
diff --git a/llvm/test/Analysis/UniformityAnalysis/NVPTX/hidden_diverge.ll b/llvm/test/Analysis/UniformityAnalysis/NVPTX/hidden_diverge.ll
index e319211..65512bf 100644
--- a/llvm/test/Analysis/UniformityAnalysis/NVPTX/hidden_diverge.ll
+++ b/llvm/test/Analysis/UniformityAnalysis/NVPTX/hidden_diverge.ll
@@ -3,7 +3,7 @@
 target datalayout = "e-i64:64-v16:16-v32:32-n16:32:64"
 target triple = "nvptx64-nvidia-cuda"
 
-define i32 @hidden_diverge(i32 %n, i32 %a, i32 %b) {
+define ptx_kernel i32 @hidden_diverge(i32 %n, i32 %a, i32 %b) {
 ; CHECK-LABEL: for function 'hidden_diverge'
 entry:
   %tid = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
@@ -27,6 +27,3 @@ merge:
 }
 
 declare i32 @llvm.nvvm.read.ptx.sreg.tid.x()
-
-!nvvm.annotations = !{!0}
-!0 = !{ptr @hidden_diverge, !"kernel", i32 1}
diff --git a/llvm/test/Analysis/UniformityAnalysis/NVPTX/irreducible.ll b/llvm/test/Analysis/UniformityAnalysis/NVPTX/irreducible.ll
index cd729a9..e1ecc69 100644
--- a/llvm/test/Analysis/UniformityAnalysis/NVPTX/irreducible.ll
+++ b/llvm/test/Analysis/UniformityAnalysis/NVPTX/irreducible.ll
@@ -23,7 +23,7 @@ target triple = "nvptx64-nvidia-cuda"
 ;                             V
 ;                        if (i3 == 5) // divergent
 ; because sync dependent on (tid / i3).
-define i32 @unstructured_loop(i1 %entry_cond) {
+define ptx_kernel i32 @unstructured_loop(i1 %entry_cond) {
 ; CHECK-LABEL: for function 'unstructured_loop'
 entry:
   %tid = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
@@ -59,5 +59,3 @@ declare i32 @llvm.nvvm.read.ptx.sreg.tid.y()
 declare i32 @llvm.nvvm.read.ptx.sreg.tid.z()
 declare i32 @llvm.nvvm.read.ptx.sreg.laneid()
 
-!nvvm.annotations = !{!0}
-!0 = !{ptr @unstructured_loop, !"kernel", i32 1}
diff --git a/llvm/test/Analysis/ValueTracking/knownbits-trunc-with-min-max-clamp.ll b/llvm/test/Analysis/ValueTracking/knownbits-trunc-with-min-max-clamp.ll
new file mode 100644
index 0000000..52f12a6
--- /dev/null
+++ b/llvm/test/Analysis/ValueTracking/knownbits-trunc-with-min-max-clamp.ll
@@ -0,0 +1,388 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt < %s -passes=aggressive-instcombine -S | FileCheck %s
+
+; The LIT tests rely on i32, i16 and i8 being valid machine types.
+; The bounds checking tests require also i64 and i128.
+target datalayout = "n8:16:32:64:128"
+
+; This LIT test checks if TruncInstCombine pass correctly recognizes the
+; constraints from a signed min-max clamp. The clamp is a sequence of smin and
+; smax instructions limiting a variable into a range, smin <= x <= smax.
+;
+; Each LIT test (except the last ones) has two versions depending on the order
+; of smin and smax:
+; a) y = smax(smin(x, upper_limit), lower_limit)
+; b) y = smin(smax(x, lower_limit), upper_limit)
+;
+; The clamp is used in TruncInstCombine.cpp pass (as part of aggressive-instcombine)
+; to optimize extensions and truncations of lshr. This is what is tested here.
+; The pass also optimizes extensions and truncations of other binary operators,
+; but in such cases the smin-smax clamp may not be used.
+
+define i8 @test_0a(i16 %x) {
+; CHECK-LABEL: define i8 @test_0a(
+; CHECK-SAME: i16 [[X:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call i16 @llvm.smin.i16(i16 [[X]], i16 31)
+; CHECK-NEXT:    [[TMP2:%.*]] = tail call i16 @llvm.smax.i16(i16 [[TMP1]], i16 0)
+; CHECK-NEXT:    [[A:%.*]] = trunc i16 [[TMP2]] to i8
+; CHECK-NEXT:    [[B:%.*]] = lshr i8 [[A]], 2
+; CHECK-NEXT:    ret i8 [[B]]
+;
+  %1 = tail call i16 @llvm.smin.i16(i16 %x, i16 31)
+  %2 = tail call i16 @llvm.smax.i16(i16 %1, i16 0)
+  %a = sext i16 %2 to i32
+  %b = lshr i32 %a, 2
+  %b.trunc = trunc i32 %b to i8
+  ret i8 %b.trunc
+}
+
+define i8 @test_0b(i16 %x) {
+; CHECK-LABEL: define i8 @test_0b(
+; CHECK-SAME: i16 [[X:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call i16 @llvm.smax.i16(i16 [[X]], i16 0)
+; CHECK-NEXT:    [[TMP2:%.*]] = tail call i16 @llvm.smin.i16(i16 [[TMP1]], i16 31)
+; CHECK-NEXT:    [[A:%.*]] = trunc i16 [[TMP2]] to i8
+; CHECK-NEXT:    [[B:%.*]] = lshr i8 [[A]], 2
+; CHECK-NEXT:    ret i8 [[B]]
+;
+  %1 = tail call i16 @llvm.smax.i16(i16 %x, i16 0)
+  %2 = tail call i16 @llvm.smin.i16(i16 %1, i16 31)
+  %a = sext i16 %2 to i32
+  %b = lshr i32 %a, 2
+  %b.trunc = trunc i32 %b to i8
+  ret i8 %b.trunc
+}
+
+; The following two tests contain add instead of lshr.
+; The optimization works here as well.
+define i8 @test_1a(i16 %x) {
+; CHECK-LABEL: define i8 @test_1a(
+; CHECK-SAME: i16 [[X:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call i16 @llvm.smin.i16(i16 [[X]], i16 31)
+; CHECK-NEXT:    [[TMP2:%.*]] = tail call i16 @llvm.smax.i16(i16 [[TMP1]], i16 0)
+; CHECK-NEXT:    [[A:%.*]] = trunc i16 [[TMP2]] to i8
+; CHECK-NEXT:    [[B:%.*]] = add i8 [[A]], 2
+; CHECK-NEXT:    ret i8 [[B]]
+;
+  %1 = tail call i16 @llvm.smin.i16(i16 %x, i16 31)
+  %2 = tail call i16 @llvm.smax.i16(i16 %1, i16 0)
+  %a = sext i16 %2 to i32
+  %b = add i32 %a, 2
+  %b.trunc = trunc i32 %b to i8
+  ret i8 %b.trunc
+}
+
+define i8 @test_1b(i16 %x) {
+; CHECK-LABEL: define i8 @test_1b(
+; CHECK-SAME: i16 [[X:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call i16 @llvm.smax.i16(i16 [[X]], i16 0)
+; CHECK-NEXT:    [[TMP2:%.*]] = tail call i16 @llvm.smin.i16(i16 [[TMP1]], i16 31)
+; CHECK-NEXT:    [[A:%.*]] = trunc i16 [[TMP2]] to i8
+; CHECK-NEXT:    [[B:%.*]] = add i8 [[A]], 2
+; CHECK-NEXT:    ret i8 [[B]]
+;
+  %1 = tail call i16 @llvm.smax.i16(i16 %x, i16 0)
+  %2 = tail call i16 @llvm.smin.i16(i16 %1, i16 31)
+  %a = sext i16 %2 to i32
+  %b = add i32 %a, 2
+  %b.trunc = trunc i32 %b to i8
+  ret i8 %b.trunc
+}
+
+; Tests for clamping with negative min and max.
+
+; With sext no optimization occurs.
+define i8 @test_2a(i16 %x) {
+; CHECK-LABEL: define i8 @test_2a(
+; CHECK-SAME: i16 [[X:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call i16 @llvm.smin.i16(i16 [[X]], i16 -1)
+; CHECK-NEXT:    [[TMP2:%.*]] = tail call i16 @llvm.smax.i16(i16 [[TMP1]], i16 -31)
+; CHECK-NEXT:    [[A:%.*]] = sext i16 [[TMP2]] to i32
+; CHECK-NEXT:    [[B:%.*]] = lshr i32 [[A]], 2
+; CHECK-NEXT:    [[B_TRUNC:%.*]] = trunc i32 [[B]] to i8
+; CHECK-NEXT:    ret i8 [[B_TRUNC]]
+;
+  %1 = tail call i16 @llvm.smin.i16(i16 %x, i16 -1)
+  %2 = tail call i16 @llvm.smax.i16(i16 %1, i16 -31)
+  %a = sext i16 %2 to i32
+  %b = lshr i32 %a, 2
+  %b.trunc = trunc i32 %b to i8
+  ret i8 %b.trunc
+}
+
+define i8 @test_2b(i16 %x) {
+; CHECK-LABEL: define i8 @test_2b(
+; CHECK-SAME: i16 [[X:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call i16 @llvm.smax.i16(i16 [[X]], i16 -31)
+; CHECK-NEXT:    [[TMP2:%.*]] = tail call i16 @llvm.smin.i16(i16 [[TMP1]], i16 -1)
+; CHECK-NEXT:    [[A:%.*]] = sext i16 [[TMP2]] to i32
+; CHECK-NEXT:    [[B:%.*]] = lshr i32 [[A]], 2
+; CHECK-NEXT:    [[B_TRUNC:%.*]] = trunc i32 [[B]] to i8
+; CHECK-NEXT:    ret i8 [[B_TRUNC]]
+;
+  %1 = tail call i16 @llvm.smax.i16(i16 %x, i16 -31)
+  %2 = tail call i16 @llvm.smin.i16(i16 %1, i16 -1)
+  %a = sext i16 %2 to i32
+  %b = lshr i32 %a, 2
+  %b.trunc = trunc i32 %b to i8
+  ret i8 %b.trunc
+}
+
+; With zext the optimization occurs.
+define i8 @test_2c(i16 %x) {
+; CHECK-LABEL: define i8 @test_2c(
+; CHECK-SAME: i16 [[X:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call i16 @llvm.smin.i16(i16 [[X]], i16 -1)
+; CHECK-NEXT:    [[TMP2:%.*]] = tail call i16 @llvm.smax.i16(i16 [[TMP1]], i16 -31)
+; CHECK-NEXT:    [[B:%.*]] = lshr i16 [[TMP2]], 2
+; CHECK-NEXT:    [[B_TRUNC:%.*]] = trunc i16 [[B]] to i8
+; CHECK-NEXT:    ret i8 [[B_TRUNC]]
+;
+  %1 = tail call i16 @llvm.smin.i16(i16 %x, i16 -1)
+  %2 = tail call i16 @llvm.smax.i16(i16 %1, i16 -31)
+  %a = zext i16 %2 to i32
+  %b = lshr i32 %a, 2
+  %b.trunc = trunc i32 %b to i8
+  ret i8 %b.trunc
+}
+
+define i8 @test_2d(i16 %x) {
+; CHECK-LABEL: define i8 @test_2d(
+; CHECK-SAME: i16 [[X:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call i16 @llvm.smax.i16(i16 [[X]], i16 -31)
+; CHECK-NEXT:    [[TMP2:%.*]] = tail call i16 @llvm.smin.i16(i16 [[TMP1]], i16 -1)
+; CHECK-NEXT:    [[B:%.*]] = lshr i16 [[TMP2]], 2
+; CHECK-NEXT:    [[B_TRUNC:%.*]] = trunc i16 [[B]] to i8
+; CHECK-NEXT:    ret i8 [[B_TRUNC]]
+;
+  %1 = tail call i16 @llvm.smax.i16(i16 %x, i16 -31)
+  %2 = tail call i16 @llvm.smin.i16(i16 %1, i16 -1)
+  %a = zext i16 %2 to i32
+  %b = lshr i32 %a, 2
+  %b.trunc = trunc i32 %b to i8
+  ret i8 %b.trunc
+}
+
+; Tests for clamping with mixed-signed min and max.
+; With zext the optimization occurs.
+define i8 @test_3a(i16 %x) {
+; CHECK-LABEL: define i8 @test_3a(
+; CHECK-SAME: i16 [[X:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call i16 @llvm.smin.i16(i16 [[X]], i16 31)
+; CHECK-NEXT:    [[TMP2:%.*]] = tail call i16 @llvm.smax.i16(i16 [[TMP1]], i16 -31)
+; CHECK-NEXT:    [[B:%.*]] = lshr i16 [[TMP2]], 2
+; CHECK-NEXT:    [[B_TRUNC:%.*]] = trunc i16 [[B]] to i8
+; CHECK-NEXT:    ret i8 [[B_TRUNC]]
+;
+  %1 = tail call i16 @llvm.smin.i16(i16 %x, i16 31)
+  %2 = tail call i16 @llvm.smax.i16(i16 %1, i16 -31)
+  %a = zext i16 %2 to i32
+  %b = lshr i32 %a, 2
+  %b.trunc = trunc i32 %b to i8
+  ret i8 %b.trunc
+}
+
+define i8 @test_3b(i16 %x) {
+; CHECK-LABEL: define i8 @test_3b(
+; CHECK-SAME: i16 [[X:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call i16 @llvm.smax.i16(i16 [[X]], i16 -31)
+; CHECK-NEXT:    [[TMP2:%.*]] = tail call i16 @llvm.smin.i16(i16 [[TMP1]], i16 31)
+; CHECK-NEXT:    [[B:%.*]] = lshr i16 [[TMP2]], 2
+; CHECK-NEXT:    [[B_TRUNC:%.*]] = trunc i16 [[B]] to i8
+; CHECK-NEXT:    ret i8 [[B_TRUNC]]
+;
+  %1 = tail call i16 @llvm.smax.i16(i16 %x, i16 -31)
+  %2 = tail call i16 @llvm.smin.i16(i16 %1, i16 31)
+  %a = zext i16 %2 to i32
+  %b = lshr i32 %a, 2
+  %b.trunc = trunc i32 %b to i8
+  ret i8 %b.trunc
+}
+
+; Optimizations with vector types.
+define <16 x i8> @test_vec_1a(<16 x i16> %x) {
+; CHECK-LABEL: define <16 x i8> @test_vec_1a(
+; CHECK-SAME: <16 x i16> [[X:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call <16 x i16> @llvm.smin.v16i16(<16 x i16> [[X]], <16 x i16> splat (i16 127))
+; CHECK-NEXT:    [[TMP2:%.*]] = tail call <16 x i16> @llvm.smax.v16i16(<16 x i16> [[TMP1]], <16 x i16> zeroinitializer)
+; CHECK-NEXT:    [[A:%.*]] = trunc <16 x i16> [[TMP2]] to <16 x i8>
+; CHECK-NEXT:    [[B:%.*]] = lshr <16 x i8> [[A]], splat (i8 2)
+; CHECK-NEXT:    ret <16 x i8> [[B]]
+;
+  %1 = tail call <16 x i16> @llvm.smin.v16i16(<16 x i16> %x, <16 x i16> splat (i16 127))
+  %2 = tail call <16 x i16> @llvm.smax.v16i16(<16 x i16> %1, <16 x i16> zeroinitializer)
+  %a = sext <16 x i16> %2 to <16 x i32>
+  %b = lshr <16 x i32> %a, splat (i32 2)
+  %b.trunc = trunc <16 x i32> %b to <16 x i8>
+  ret <16 x i8> %b.trunc
+}
+
+define <16 x i8> @test_vec_1b(<16 x i16> %x) {
+; CHECK-LABEL: define <16 x i8> @test_vec_1b(
+; CHECK-SAME: <16 x i16> [[X:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call <16 x i16> @llvm.smax.v16i16(<16 x i16> [[X]], <16 x i16> zeroinitializer)
+; CHECK-NEXT:    [[TMP2:%.*]] = tail call <16 x i16> @llvm.smin.v16i16(<16 x i16> [[TMP1]], <16 x i16> splat (i16 127))
+; CHECK-NEXT:    [[A:%.*]] = trunc <16 x i16> [[TMP2]] to <16 x i8>
+; CHECK-NEXT:    [[B:%.*]] = lshr <16 x i8> [[A]], splat (i8 2)
+; CHECK-NEXT:    ret <16 x i8> [[B]]
+;
+  %1 = tail call <16 x i16> @llvm.smax.v16i16(<16 x i16> %x, <16 x i16> zeroinitializer)
+  %2 = tail call <16 x i16> @llvm.smin.v16i16(<16 x i16> %1, <16 x i16> splat (i16 127))
+  %a = sext <16 x i16> %2 to <16 x i32>
+  %b = lshr <16 x i32> %a, splat (i32 2)
+  %b.trunc = trunc <16 x i32> %b to <16 x i8>
+  ret <16 x i8> %b.trunc
+}
+
+; A longer test that was the original motivation for the smin-smax clamping.
+define i8 @test_final(i16 %x, i16 %y) {
+; CHECK-LABEL: define i8 @test_final(
+; CHECK-SAME: i16 [[X:%.*]], i16 [[Y:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call i16 @llvm.smin.i16(i16 [[X]], i16 127)
+; CHECK-NEXT:    [[TMP2:%.*]] = tail call i16 @llvm.smax.i16(i16 [[TMP1]], i16 0)
+; CHECK-NEXT:    [[TMP3:%.*]] = tail call i16 @llvm.smax.i16(i16 [[Y]], i16 0)
+; CHECK-NEXT:    [[TMP4:%.*]] = tail call i16 @llvm.smin.i16(i16 [[TMP3]], i16 127)
+; CHECK-NEXT:    [[MUL:%.*]] = mul i16 [[TMP2]], [[TMP4]]
+; CHECK-NEXT:    [[SHR:%.*]] = lshr i16 [[MUL]], 7
+; CHECK-NEXT:    [[TRUNC:%.*]] = trunc i16 [[SHR]] to i8
+; CHECK-NEXT:    ret i8 [[TRUNC]]
+;
+  %1 = tail call i16 @llvm.smin.i16(i16 %x, i16 127)
+  %2 = tail call i16 @llvm.smax.i16(i16 %1, i16 0)
+  %x.clamp = zext nneg i16 %2 to i32
+  %3 = tail call i16 @llvm.smax.i16(i16 %y, i16 0)
+  %4 = tail call i16 @llvm.smin.i16(i16 %3, i16 127)
+  %y.clamp = zext nneg i16 %4 to i32
+  %mul = mul nuw nsw i32 %x.clamp, %y.clamp
+  %shr = lshr i32 %mul, 7
+  %trunc= trunc nuw nsw i32 %shr to i8
+  ret i8 %trunc
+}
+
+; Range tests below check if the bounds are dealt with correctly.
+
+; This gets optimized.
+define i8 @test_bounds_1(i16 %x) {
+; CHECK-LABEL: define i8 @test_bounds_1(
+; CHECK-SAME: i16 [[X:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call i16 @llvm.smin.i16(i16 [[X]], i16 127)
+; CHECK-NEXT:    [[TMP2:%.*]] = tail call i16 @llvm.smax.i16(i16 [[TMP1]], i16 0)
+; CHECK-NEXT:    [[A:%.*]] = trunc i16 [[TMP2]] to i8
+; CHECK-NEXT:    [[B:%.*]] = lshr i8 [[A]], 7
+; CHECK-NEXT:    ret i8 [[B]]
+;
+  %1 = tail call i16 @llvm.smin.i16(i16 %x, i16 127)
+  %2 = tail call i16 @llvm.smax.i16(i16 %1, i16 0)
+  %a = sext i16 %2 to i32
+  %b = lshr i32 %a, 7
+  %b.trunc = trunc i32 %b to i8
+  ret i8 %b.trunc
+}
+
+; While this does not.
+define i8 @test_bounds_2(i16 %x) {
+; CHECK-LABEL: define i8 @test_bounds_2(
+; CHECK-SAME: i16 [[X:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call i16 @llvm.smin.i16(i16 [[X]], i16 128)
+; CHECK-NEXT:    [[TMP2:%.*]] = tail call i16 @llvm.smax.i16(i16 [[TMP1]], i16 0)
+; CHECK-NEXT:    [[A:%.*]] = trunc i16 [[TMP2]] to i8
+; CHECK-NEXT:    [[B:%.*]] = lshr i8 [[A]], 7
+; CHECK-NEXT:    ret i8 [[B]]
+;
+  %1 = tail call i16 @llvm.smin.i16(i16 %x, i16 128)
+  %2 = tail call i16 @llvm.smax.i16(i16 %1, i16 0)
+  %a = sext i16 %2 to i32
+  %b = lshr i32 %a, 7
+  %b.trunc = trunc i32 %b to i8
+  ret i8 %b.trunc
+}
+
+; This should get optimized. We test here if the optimization works correctly
+; if the upper limit is signed max int.
+define i8 @test_bounds_3(i16 %x) {
+; CHECK-LABEL: define i8 @test_bounds_3(
+; CHECK-SAME: i16 [[X:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call i16 @llvm.smin.i16(i16 [[X]], i16 32767)
+; CHECK-NEXT:    [[TMP2:%.*]] = tail call i16 @llvm.smax.i16(i16 [[TMP1]], i16 32752)
+; CHECK-NEXT:    [[B:%.*]] = lshr i16 [[TMP2]], 2
+; CHECK-NEXT:    [[B_TRUNC:%.*]] = trunc i16 [[B]] to i8
+; CHECK-NEXT:    ret i8 [[B_TRUNC]]
+;
+  %1 = tail call i16 @llvm.smin.i16(i16 %x, i16 32767)
+  %2 = tail call i16 @llvm.smax.i16(i16 %1, i16 32752)
+  %a = sext i16 %2 to i32
+  %b = lshr i32 %a, 2
+  %b.trunc = trunc i32 %b to i8
+  ret i8 %b.trunc
+}
+
+; Here min = 128 is greater than max = 0.
+define i8 @test_bounds_4(i16 %x) {
+; CHECK-LABEL: define i8 @test_bounds_4(
+; CHECK-SAME: i16 [[X:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call i16 @llvm.smin.i16(i16 [[X]], i16 0)
+; CHECK-NEXT:    [[TMP2:%.*]] = tail call i16 @llvm.smax.i16(i16 [[TMP1]], i16 128)
+; CHECK-NEXT:    [[B:%.*]] = lshr i16 [[TMP2]], 2
+; CHECK-NEXT:    [[B_TRUNC:%.*]] = trunc i16 [[B]] to i8
+; CHECK-NEXT:    ret i8 [[B_TRUNC]]
+;
+  %1 = tail call i16 @llvm.smin.i16(i16 %x, i16 0)
+  %2 = tail call i16 @llvm.smax.i16(i16 %1, i16 128)
+  %a = sext i16 %2 to i32
+  %b = lshr i32 %a, 2
+  %b.trunc = trunc i32 %b to i8
+  ret i8 %b.trunc
+}
+
+; The following 3 tests check the situation where min and max are minimal and
+; maximal signed values. No transformations should occur here.
+define i8 @test_bounds_5(i16 %x) {
+; CHECK-LABEL: define i8 @test_bounds_5(
+; CHECK-SAME: i16 [[X:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call i16 @llvm.smin.i16(i16 [[X]], i16 32767)
+; CHECK-NEXT:    [[TMP2:%.*]] = tail call i16 @llvm.smax.i16(i16 [[TMP1]], i16 -32768)
+; CHECK-NEXT:    [[B:%.*]] = lshr i16 [[TMP2]], 2
+; CHECK-NEXT:    [[B_TRUNC:%.*]] = trunc i16 [[B]] to i8
+; CHECK-NEXT:    ret i8 [[B_TRUNC]]
+;
+  %1 = tail call i16 @llvm.smin.i16(i16 %x, i16 32767)
+  %2 = tail call i16 @llvm.smax.i16(i16 %1, i16 -32768)
+  %a = zext i16 %2 to i32
+  %b = lshr i32 %a, 2
+  %b.trunc = trunc i32 %b to i8
+  ret i8 %b.trunc
+}
+
+define i8 @test_bounds_6(i32 %x) {
+; CHECK-LABEL: define i8 @test_bounds_6(
+; CHECK-SAME: i32 [[X:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call i32 @llvm.smin.i32(i32 [[X]], i32 2147483647)
+; CHECK-NEXT:    [[TMP2:%.*]] = tail call i32 @llvm.smax.i32(i32 [[TMP1]], i32 -2147483648)
+; CHECK-NEXT:    [[B:%.*]] = lshr i32 [[TMP2]], 2
+; CHECK-NEXT:    [[B_TRUNC:%.*]] = trunc i32 [[B]] to i8
+; CHECK-NEXT:    ret i8 [[B_TRUNC]]
+;
+  %1 = tail call i32 @llvm.smin.i32(i32 %x, i32 2147483647)
+  %2 = tail call i32 @llvm.smax.i32(i32 %1, i32 -2147483648)
+  %a = zext i32 %2 to i64
+  %b = lshr i64 %a, 2
+  %b.trunc = trunc i64 %b to i8
+  ret i8 %b.trunc
+}
+
+define i8 @test_bounds_7(i64 %x) {
+; CHECK-LABEL: define i8 @test_bounds_7(
+; CHECK-SAME: i64 [[X:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call i64 @llvm.smin.i64(i64 [[X]], i64 9223372036854775807)
+; CHECK-NEXT:    [[TMP2:%.*]] = tail call i64 @llvm.smax.i64(i64 [[TMP1]], i64 -9223372036854775808)
+; CHECK-NEXT:    [[B:%.*]] = lshr i64 [[TMP2]], 2
+; CHECK-NEXT:    [[B_TRUNC:%.*]] = trunc i64 [[B]] to i8
+; CHECK-NEXT:    ret i8 [[B_TRUNC]]
+;
+  %1 = tail call i64 @llvm.smin.i64(i64 %x, i64 9223372036854775807)
+  %2 = tail call i64 @llvm.smax.i64(i64 %1, i64 -9223372036854775808)
+  %a = zext i64 %2 to i128
+  %b = lshr i128 %a, 2
+  %b.trunc = trunc i128 %b to i8
+  ret i8 %b.trunc
+}
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/combine-select.mir b/llvm/test/CodeGen/AArch64/GlobalISel/combine-select.mir
index 86fa12a..4afa0d4 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/combine-select.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/combine-select.mir
@@ -436,6 +436,36 @@ body:             |
     $w0 = COPY %ext(s32)
 ...
 ---
+# select cond, 0, 64 --> (zext (!Cond)) << log2(Pow2)
+name:            select_cond_0_64_to_shift
+body:             |
+  bb.1:
+    liveins: $x0, $x1, $x2
+    ; CHECK-LABEL: name: select_cond_0_64_to_shift
+    ; CHECK: liveins: $x0, $x1, $x2
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x0
+    ; CHECK-NEXT: %c:_(s1) = G_TRUNC [[COPY]](s64)
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
+    ; CHECK-NEXT: [[XOR:%[0-9]+]]:_(s1) = G_XOR %c, [[C]]
+    ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s8) = G_ZEXT [[XOR]](s1)
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s8) = G_CONSTANT i8 6
+    ; CHECK-NEXT: %sel:_(s8) = G_SHL [[ZEXT]], [[C1]](s8)
+    ; CHECK-NEXT: %ext:_(s32) = G_ANYEXT %sel(s8)
+    ; CHECK-NEXT: $w0 = COPY %ext(s32)
+    %0:_(s64) = COPY $x0
+    %1:_(s64) = COPY $x1
+    %2:_(s64) = COPY $x2
+    %c:_(s1) = G_TRUNC %0
+    %t:_(s1) = G_TRUNC %1
+    %f:_(s1) = G_TRUNC %2
+    %two:_(s8) = G_CONSTANT i8 0
+    %one:_(s8) = G_CONSTANT i8 64
+    %sel:_(s8) = G_SELECT %c, %two, %one
+    %ext:_(s32) = G_ANYEXT %sel
+    $w0 = COPY %ext(s32)
+...
+---
 # select cond, -1, 0 --> sext Cond
 name:            select_cond_minus_1_0_to_sext_cond
 body:             |
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-bitcast.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-bitcast.mir
index e3a633c..4d461c9 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-bitcast.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-bitcast.mir
@@ -48,3 +48,73 @@ body:             |
     G_BR %bb.2
 
 ...
+---
+name:            boolean_vector_to_scalar
+tracksRegLiveness: true
+body:             |
+    bb.1:
+    ; CHECK-LABEL: name: boolean_vector_to_scalar
+    ; CHECK: [[DEF:%[0-9]+]]:_(<8 x s8>) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[FRAME_INDEX:%[0-9]+]]:_(p0) = G_FRAME_INDEX %stack.0
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+    ; CHECK-NEXT: [[EVEC:%[0-9]+]]:_(s8) = G_EXTRACT_VECTOR_ELT [[DEF]](<8 x s8>), [[C]](s64)
+    ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[EVEC]](s8)
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[ANYEXT]], [[C1]]
+    ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND]], [[C]](s64)
+    ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[C2]], [[SHL]]
+    ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
+    ; CHECK-NEXT: [[EVEC1:%[0-9]+]]:_(s8) = G_EXTRACT_VECTOR_ELT [[DEF]](<8 x s8>), [[C3]](s64)
+    ; CHECK-NEXT: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[EVEC1]](s8)
+    ; CHECK-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[ANYEXT1]], [[C1]]
+    ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C3]](s64)
+    ; CHECK-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]]
+    ; CHECK-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
+    ; CHECK-NEXT: [[EVEC2:%[0-9]+]]:_(s8) = G_EXTRACT_VECTOR_ELT [[DEF]](<8 x s8>), [[C4]](s64)
+    ; CHECK-NEXT: [[ANYEXT2:%[0-9]+]]:_(s32) = G_ANYEXT [[EVEC2]](s8)
+    ; CHECK-NEXT: [[AND2:%[0-9]+]]:_(s32) = G_AND [[ANYEXT2]], [[C1]]
+    ; CHECK-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND2]], [[C4]](s64)
+    ; CHECK-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[OR1]], [[SHL2]]
+    ; CHECK-NEXT: [[C5:%[0-9]+]]:_(s64) = G_CONSTANT i64 3
+    ; CHECK-NEXT: [[EVEC3:%[0-9]+]]:_(s8) = G_EXTRACT_VECTOR_ELT [[DEF]](<8 x s8>), [[C5]](s64)
+    ; CHECK-NEXT: [[ANYEXT3:%[0-9]+]]:_(s32) = G_ANYEXT [[EVEC3]](s8)
+    ; CHECK-NEXT: [[AND3:%[0-9]+]]:_(s32) = G_AND [[ANYEXT3]], [[C1]]
+    ; CHECK-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C5]](s64)
+    ; CHECK-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[OR2]], [[SHL3]]
+    ; CHECK-NEXT: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
+    ; CHECK-NEXT: [[EVEC4:%[0-9]+]]:_(s8) = G_EXTRACT_VECTOR_ELT [[DEF]](<8 x s8>), [[C6]](s64)
+    ; CHECK-NEXT: [[ANYEXT4:%[0-9]+]]:_(s32) = G_ANYEXT [[EVEC4]](s8)
+    ; CHECK-NEXT: [[AND4:%[0-9]+]]:_(s32) = G_AND [[ANYEXT4]], [[C1]]
+    ; CHECK-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[AND4]], [[C6]](s64)
+    ; CHECK-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[OR3]], [[SHL4]]
+    ; CHECK-NEXT: [[C7:%[0-9]+]]:_(s64) = G_CONSTANT i64 5
+    ; CHECK-NEXT: [[EVEC5:%[0-9]+]]:_(s8) = G_EXTRACT_VECTOR_ELT [[DEF]](<8 x s8>), [[C7]](s64)
+    ; CHECK-NEXT: [[ANYEXT5:%[0-9]+]]:_(s32) = G_ANYEXT [[EVEC5]](s8)
+    ; CHECK-NEXT: [[AND5:%[0-9]+]]:_(s32) = G_AND [[ANYEXT5]], [[C1]]
+    ; CHECK-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[C7]](s64)
+    ; CHECK-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[OR4]], [[SHL5]]
+    ; CHECK-NEXT: [[C8:%[0-9]+]]:_(s64) = G_CONSTANT i64 6
+    ; CHECK-NEXT: [[EVEC6:%[0-9]+]]:_(s8) = G_EXTRACT_VECTOR_ELT [[DEF]](<8 x s8>), [[C8]](s64)
+    ; CHECK-NEXT: [[ANYEXT6:%[0-9]+]]:_(s32) = G_ANYEXT [[EVEC6]](s8)
+    ; CHECK-NEXT: [[AND6:%[0-9]+]]:_(s32) = G_AND [[ANYEXT6]], [[C1]]
+    ; CHECK-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[AND6]], [[C8]](s64)
+    ; CHECK-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[OR5]], [[SHL6]]
+    ; CHECK-NEXT: [[C9:%[0-9]+]]:_(s64) = G_CONSTANT i64 7
+    ; CHECK-NEXT: [[EVEC7:%[0-9]+]]:_(s8) = G_EXTRACT_VECTOR_ELT [[DEF]](<8 x s8>), [[C9]](s64)
+    ; CHECK-NEXT: [[ANYEXT7:%[0-9]+]]:_(s32) = G_ANYEXT [[EVEC7]](s8)
+    ; CHECK-NEXT: [[AND7:%[0-9]+]]:_(s32) = G_AND [[ANYEXT7]], [[C1]]
+    ; CHECK-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[C9]](s64)
+    ; CHECK-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[OR6]], [[SHL7]]
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s8) = G_TRUNC [[OR7]](s32)
+    ; CHECK-NEXT: G_STORE [[TRUNC]](s8), [[FRAME_INDEX]](p0) :: (store (s8) into %stack.0)
+    ; CHECK-NEXT: %bc:_(s8) = G_LOAD [[FRAME_INDEX]](p0) :: (load (s8) from %stack.0)
+    ; CHECK-NEXT: %ext:_(s32) = G_ANYEXT %bc(s8)
+    ; CHECK-NEXT: $w0 = COPY %ext(s32)
+    ; CHECK-NEXT: RET_ReallyLR implicit $w0
+      %vec:_(<8 x s1>) = G_IMPLICIT_DEF
+      %bc:_(s8) = G_BITCAST %vec(<8 x s1>)
+      %ext:_(s32) = G_ANYEXT %bc(s8)
+      $w0 = COPY %ext(s32)
+      RET_ReallyLR implicit $w0
+...
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-shuffle-1x.ll b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-shuffle-1x.ll
new file mode 100644
index 0000000..b529577
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-shuffle-1x.ll
@@ -0,0 +1,43 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple aarch64 -O0 -global-isel -o - %s | FileCheck %s
+
+define <1 x i1> @shuffle_extract_4(<8 x i1> %a, <8 x i1> %b) {
+; CHECK-LABEL: shuffle_extract_4:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ushll v0.8h, v0.8b, #0
+; CHECK-NEXT:    umov w8, v0.h[4]
+; CHECK-NEXT:    and w0, w8, #0x1
+; CHECK-NEXT:    ret
+  %extractvec60 = shufflevector <8 x i1> %a, <8 x i1> %b, <1 x i32> <i32 4>
+  ret <1 x i1> %extractvec60
+}
+
+define <1 x i1> @shuffle_extract_12(<8 x i1> %a, <8 x i1> %b) {
+; CHECK-LABEL: shuffle_extract_12:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ushll v0.8h, v1.8b, #0
+; CHECK-NEXT:    umov w8, v0.h[4]
+; CHECK-NEXT:    and w0, w8, #0x1
+; CHECK-NEXT:    ret
+  %extractvec60 = shufflevector <8 x i1> %a, <8 x i1> %b, <1 x i32> <i32 12>
+  ret <1 x i1> %extractvec60
+}
+
+define <1 x i1> @shuffle_extract_p(<8 x i1> %a, <8 x i1> %b) {
+; CHECK-LABEL: shuffle_extract_p:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // implicit-def: $w8
+; CHECK-NEXT:    and w0, w8, #0x1
+; CHECK-NEXT:    ret
+  %extractvec60 = shufflevector <8 x i1> %a, <8 x i1> %b, <1 x i32> <i32 poison>
+  ret <1 x i1> %extractvec60
+}
+
+define <1 x i32> @shufflevector_v1i32(<1 x i32> %a, <1 x i32> %b) {
+; CHECK-LABEL: shufflevector_v1i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fmov d0, d1
+; CHECK-NEXT:    ret
+    %c = shufflevector <1 x i32> %a, <1 x i32> %b, <1 x i32> <i32 1>
+    ret <1 x i32> %c
+}
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-shuffle-vector.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-shuffle-vector.mir
index 2464026..af03a21 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-shuffle-vector.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-shuffle-vector.mir
@@ -618,3 +618,32 @@ body:             |
     RET_ReallyLR implicit $q0
 
 ...
+---
+name:            shuffle_v8i1_v1i8
+alignment:       4
+tracksRegLiveness: true
+body:             |
+  bb.1:
+    liveins: $d0, $d1
+    ; CHECK-LABEL: name: shuffle_v8i1_v1i8
+    ; CHECK: liveins: $d0, $d1
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<8 x s8>) = COPY $d1
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
+    ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(<8 x s16>) = G_ANYEXT [[COPY]](<8 x s8>)
+    ; CHECK-NEXT: [[EVEC:%[0-9]+]]:_(s16) = G_EXTRACT_VECTOR_ELT [[ANYEXT]](<8 x s16>), [[C]](s64)
+    ; CHECK-NEXT: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[EVEC]](s16)
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[ANYEXT1]], [[C1]]
+    ; CHECK-NEXT: $w0 = COPY [[AND]](s32)
+    ; CHECK-NEXT: RET_ReallyLR implicit $w0
+    %2:_(<8 x s8>) = COPY $d0
+    %0:_(<8 x s1>) = G_TRUNC %2:_(<8 x s8>)
+    %3:_(<8 x s8>) = COPY $d1
+    %1:_(<8 x s1>) = G_TRUNC %3:_(<8 x s8>)
+    %4:_(s1) = G_SHUFFLE_VECTOR %0:_(<8 x s1>), %1:_, shufflemask(12)
+    %5:_(s8) = G_ZEXT %4:_(s1)
+    %6:_(s32) = G_ANYEXT %5:_(s8)
+    $w0 = COPY %6:_(s32)
+    RET_ReallyLR implicit $w0
+...
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-store-vector-bools.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-store-vector-bools.mir
new file mode 100644
index 0000000..1df6297
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-store-vector-bools.mir
@@ -0,0 +1,86 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+# RUN: llc -O0 -mtriple=aarch64 -run-pass=legalizer -global-isel-abort=1 %s -o - | FileCheck %s
+---
+name:            store_8xs1
+tracksRegLiveness: true
+body:             |
+  bb.1:
+    liveins: $q0, $q1, $x0
+    ; CHECK-LABEL: name: store_8xs1
+    ; CHECK: liveins: $q0, $q1, $x0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $q0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<4 x s32>) = COPY $q1
+    ; CHECK-NEXT: %ptr:_(p0) = COPY $x0
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32)
+    ; CHECK-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32)
+    ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(<4 x s32>) = G_ICMP intpred(slt), [[COPY]](<4 x s32>), [[BUILD_VECTOR]]
+    ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:_(<4 x s32>) = G_ICMP intpred(slt), [[COPY1]](<4 x s32>), [[BUILD_VECTOR1]]
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(<4 x s16>) = G_TRUNC [[ICMP]](<4 x s32>)
+    ; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:_(<4 x s16>) = G_TRUNC [[ICMP1]](<4 x s32>)
+    ; CHECK-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<8 x s16>) = G_CONCAT_VECTORS [[TRUNC]](<4 x s16>), [[TRUNC1]](<4 x s16>)
+    ; CHECK-NEXT: [[TRUNC2:%[0-9]+]]:_(<8 x s8>) = G_TRUNC [[CONCAT_VECTORS]](<8 x s16>)
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+    ; CHECK-NEXT: [[EVEC:%[0-9]+]]:_(s8) = G_EXTRACT_VECTOR_ELT [[TRUNC2]](<8 x s8>), [[C1]](s64)
+    ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[EVEC]](s8)
+    ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[ANYEXT]], [[C2]]
+    ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND]], [[C1]](s64)
+    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY [[C]](s32)
+    ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY2]], [[SHL]]
+    ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
+    ; CHECK-NEXT: [[EVEC1:%[0-9]+]]:_(s8) = G_EXTRACT_VECTOR_ELT [[TRUNC2]](<8 x s8>), [[C3]](s64)
+    ; CHECK-NEXT: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[EVEC1]](s8)
+    ; CHECK-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[ANYEXT1]], [[C2]]
+    ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C3]](s64)
+    ; CHECK-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]]
+    ; CHECK-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
+    ; CHECK-NEXT: [[EVEC2:%[0-9]+]]:_(s8) = G_EXTRACT_VECTOR_ELT [[TRUNC2]](<8 x s8>), [[C4]](s64)
+    ; CHECK-NEXT: [[ANYEXT2:%[0-9]+]]:_(s32) = G_ANYEXT [[EVEC2]](s8)
+    ; CHECK-NEXT: [[AND2:%[0-9]+]]:_(s32) = G_AND [[ANYEXT2]], [[C2]]
+    ; CHECK-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND2]], [[C4]](s64)
+    ; CHECK-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[OR1]], [[SHL2]]
+    ; CHECK-NEXT: [[C5:%[0-9]+]]:_(s64) = G_CONSTANT i64 3
+    ; CHECK-NEXT: [[EVEC3:%[0-9]+]]:_(s8) = G_EXTRACT_VECTOR_ELT [[TRUNC2]](<8 x s8>), [[C5]](s64)
+    ; CHECK-NEXT: [[ANYEXT3:%[0-9]+]]:_(s32) = G_ANYEXT [[EVEC3]](s8)
+    ; CHECK-NEXT: [[AND3:%[0-9]+]]:_(s32) = G_AND [[ANYEXT3]], [[C2]]
+    ; CHECK-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C5]](s64)
+    ; CHECK-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[OR2]], [[SHL3]]
+    ; CHECK-NEXT: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
+    ; CHECK-NEXT: [[EVEC4:%[0-9]+]]:_(s8) = G_EXTRACT_VECTOR_ELT [[TRUNC2]](<8 x s8>), [[C6]](s64)
+    ; CHECK-NEXT: [[ANYEXT4:%[0-9]+]]:_(s32) = G_ANYEXT [[EVEC4]](s8)
+    ; CHECK-NEXT: [[AND4:%[0-9]+]]:_(s32) = G_AND [[ANYEXT4]], [[C2]]
+    ; CHECK-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[AND4]], [[C6]](s64)
+    ; CHECK-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[OR3]], [[SHL4]]
+    ; CHECK-NEXT: [[C7:%[0-9]+]]:_(s64) = G_CONSTANT i64 5
+    ; CHECK-NEXT: [[EVEC5:%[0-9]+]]:_(s8) = G_EXTRACT_VECTOR_ELT [[TRUNC2]](<8 x s8>), [[C7]](s64)
+    ; CHECK-NEXT: [[ANYEXT5:%[0-9]+]]:_(s32) = G_ANYEXT [[EVEC5]](s8)
+    ; CHECK-NEXT: [[AND5:%[0-9]+]]:_(s32) = G_AND [[ANYEXT5]], [[C2]]
+    ; CHECK-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[C7]](s64)
+    ; CHECK-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[OR4]], [[SHL5]]
+    ; CHECK-NEXT: [[C8:%[0-9]+]]:_(s64) = G_CONSTANT i64 6
+    ; CHECK-NEXT: [[EVEC6:%[0-9]+]]:_(s8) = G_EXTRACT_VECTOR_ELT [[TRUNC2]](<8 x s8>), [[C8]](s64)
+    ; CHECK-NEXT: [[ANYEXT6:%[0-9]+]]:_(s32) = G_ANYEXT [[EVEC6]](s8)
+    ; CHECK-NEXT: [[AND6:%[0-9]+]]:_(s32) = G_AND [[ANYEXT6]], [[C2]]
+    ; CHECK-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[AND6]], [[C8]](s64)
+    ; CHECK-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[OR5]], [[SHL6]]
+    ; CHECK-NEXT: [[C9:%[0-9]+]]:_(s64) = G_CONSTANT i64 7
+    ; CHECK-NEXT: [[EVEC7:%[0-9]+]]:_(s8) = G_EXTRACT_VECTOR_ELT [[TRUNC2]](<8 x s8>), [[C9]](s64)
+    ; CHECK-NEXT: [[ANYEXT7:%[0-9]+]]:_(s32) = G_ANYEXT [[EVEC7]](s8)
+    ; CHECK-NEXT: [[AND7:%[0-9]+]]:_(s32) = G_AND [[ANYEXT7]], [[C2]]
+    ; CHECK-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[C9]](s64)
+    ; CHECK-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[OR6]], [[SHL7]]
+    ; CHECK-NEXT: [[TRUNC3:%[0-9]+]]:_(s8) = G_TRUNC [[OR7]](s32)
+    ; CHECK-NEXT: G_STORE [[TRUNC3]](s8), %ptr(p0) :: (store (s8))
+    ; CHECK-NEXT: RET_ReallyLR
+    %1:_(<4 x s32>) = COPY $q0
+    %2:_(<4 x s32>) = COPY $q1
+    %ptr:_(p0) = COPY $x0
+    %0:_(<8 x s32>) = G_CONCAT_VECTORS %1(<4 x s32>), %2(<4 x s32>)
+    %4:_(s32) = G_CONSTANT i32 0
+    %3:_(<8 x s32>) = G_BUILD_VECTOR %4(s32), %4(s32), %4(s32), %4(s32), %4(s32), %4(s32), %4(s32), %4(s32)
+    %5:_(<8 x s1>) = G_ICMP intpred(slt), %0(<8 x s32>), %3
+    G_STORE %5(<8 x s1>), %ptr(p0) :: (store (<8 x s1>))
+    RET_ReallyLR
+...
diff --git a/llvm/test/CodeGen/AArch64/aarch64-dup-ext.ll b/llvm/test/CodeGen/AArch64/aarch64-dup-ext.ll
index ee9fff7..f0c9dcc 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-dup-ext.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-dup-ext.ll
@@ -440,11 +440,10 @@ define <8 x i16> @shufsext_v8i8_v8i16(<8 x i8> %src, <8 x i8> %b) {
 ;
 ; CHECK-GI-LABEL: shufsext_v8i8_v8i16:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    adrp x8, .LCPI14_0
-; CHECK-GI-NEXT:    sshll v2.8h, v0.8b, #0
+; CHECK-GI-NEXT:    sshll v0.8h, v0.8b, #0
 ; CHECK-GI-NEXT:    sshll v1.8h, v1.8b, #0
-; CHECK-GI-NEXT:    ldr q0, [x8, :lo12:.LCPI14_0]
-; CHECK-GI-NEXT:    tbl v0.16b, { v2.16b, v3.16b }, v0.16b
+; CHECK-GI-NEXT:    rev64 v0.8h, v0.8h
+; CHECK-GI-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
 ; CHECK-GI-NEXT:    mul v0.8h, v0.8h, v1.8h
 ; CHECK-GI-NEXT:    ret
 entry:
@@ -493,11 +492,10 @@ define <8 x i16> @shufzext_v8i8_v8i16(<8 x i8> %src, <8 x i8> %b) {
 ;
 ; CHECK-GI-LABEL: shufzext_v8i8_v8i16:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    adrp x8, .LCPI16_0
-; CHECK-GI-NEXT:    ushll v2.8h, v0.8b, #0
+; CHECK-GI-NEXT:    ushll v0.8h, v0.8b, #0
 ; CHECK-GI-NEXT:    ushll v1.8h, v1.8b, #0
-; CHECK-GI-NEXT:    ldr q0, [x8, :lo12:.LCPI16_0]
-; CHECK-GI-NEXT:    tbl v0.16b, { v2.16b, v3.16b }, v0.16b
+; CHECK-GI-NEXT:    rev64 v0.8h, v0.8h
+; CHECK-GI-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
 ; CHECK-GI-NEXT:    mul v0.8h, v0.8h, v1.8h
 ; CHECK-GI-NEXT:    ret
 entry:
diff --git a/llvm/test/CodeGen/AArch64/arm64-fast-isel-conversion-fallback.ll b/llvm/test/CodeGen/AArch64/arm64-fast-isel-conversion-fallback.ll
index 1aa28f5..9a1203f 100644
--- a/llvm/test/CodeGen/AArch64/arm64-fast-isel-conversion-fallback.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-fast-isel-conversion-fallback.ll
@@ -156,11 +156,10 @@ define i32 @fptosi_bf(bfloat %a) nounwind ssp {
 ; CHECK-LABEL: fptosi_bf:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    fmov s1, s0
-; CHECK-NEXT:    // implicit-def: $s0
+; CHECK-NEXT:    // implicit-def: $d0
 ; CHECK-NEXT:    fmov s0, s1
-; CHECK-NEXT:    fmov w8, s0
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    fmov s0, w8
+; CHECK-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-NEXT:    // kill: def $s0 killed $s0 killed $q0
 ; CHECK-NEXT:    fcvtzs w0, s0
 ; CHECK-NEXT:    ret
 entry:
@@ -173,11 +172,10 @@ define i32 @fptoui_sbf(bfloat %a) nounwind ssp {
 ; CHECK-LABEL: fptoui_sbf:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    fmov s1, s0
-; CHECK-NEXT:    // implicit-def: $s0
+; CHECK-NEXT:    // implicit-def: $d0
 ; CHECK-NEXT:    fmov s0, s1
-; CHECK-NEXT:    fmov w8, s0
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    fmov s0, w8
+; CHECK-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-NEXT:    // kill: def $s0 killed $s0 killed $q0
 ; CHECK-NEXT:    fcvtzu w0, s0
 ; CHECK-NEXT:    ret
 entry:
diff --git a/llvm/test/CodeGen/AArch64/atomicrmw-fadd.ll b/llvm/test/CodeGen/AArch64/atomicrmw-fadd.ll
index ed9c1b0..fb40dfc 100644
--- a/llvm/test/CodeGen/AArch64/atomicrmw-fadd.ll
+++ b/llvm/test/CodeGen/AArch64/atomicrmw-fadd.ll
@@ -182,17 +182,14 @@ define half @test_atomicrmw_fadd_f16_seq_cst_align4(ptr %ptr, half %value) #0 {
 define bfloat @test_atomicrmw_fadd_bf16_seq_cst_align2(ptr %ptr, bfloat %value) #0 {
 ; NOLSE-LABEL: test_atomicrmw_fadd_bf16_seq_cst_align2:
 ; NOLSE:       // %bb.0:
-; NOLSE-NEXT:    // kill: def $h0 killed $h0 def $s0
-; NOLSE-NEXT:    fmov w9, s0
+; NOLSE-NEXT:    // kill: def $h0 killed $h0 def $d0
+; NOLSE-NEXT:    shll v1.4s, v0.4h, #16
 ; NOLSE-NEXT:    mov w8, #32767 // =0x7fff
-; NOLSE-NEXT:    lsl w9, w9, #16
-; NOLSE-NEXT:    fmov s1, w9
 ; NOLSE-NEXT:  .LBB2_1: // %atomicrmw.start
 ; NOLSE-NEXT:    // =>This Inner Loop Header: Depth=1
 ; NOLSE-NEXT:    ldaxrh w9, [x0]
 ; NOLSE-NEXT:    fmov s0, w9
-; NOLSE-NEXT:    lsl w9, w9, #16
-; NOLSE-NEXT:    fmov s2, w9
+; NOLSE-NEXT:    shll v2.4s, v0.4h, #16
 ; NOLSE-NEXT:    fadd s2, s2, s1
 ; NOLSE-NEXT:    fmov w9, s2
 ; NOLSE-NEXT:    ubfx w10, w9, #16, #1
@@ -202,36 +199,34 @@ define bfloat @test_atomicrmw_fadd_bf16_seq_cst_align2(ptr %ptr, bfloat %value)
 ; NOLSE-NEXT:    stlxrh w10, w9, [x0]
 ; NOLSE-NEXT:    cbnz w10, .LBB2_1
 ; NOLSE-NEXT:  // %bb.2: // %atomicrmw.end
-; NOLSE-NEXT:    // kill: def $h0 killed $h0 killed $s0
+; NOLSE-NEXT:    // kill: def $h0 killed $h0 killed $d0
 ; NOLSE-NEXT:    ret
 ;
 ; LSE-LABEL: test_atomicrmw_fadd_bf16_seq_cst_align2:
 ; LSE:       // %bb.0:
-; LSE-NEXT:    // kill: def $h0 killed $h0 def $s0
-; LSE-NEXT:    fmov w9, s0
+; LSE-NEXT:    // kill: def $h0 killed $h0 def $d0
+; LSE-NEXT:    shll v1.4s, v0.4h, #16
 ; LSE-NEXT:    mov w8, #32767 // =0x7fff
 ; LSE-NEXT:    ldr h0, [x0]
-; LSE-NEXT:    lsl w9, w9, #16
-; LSE-NEXT:    fmov s1, w9
 ; LSE-NEXT:  .LBB2_1: // %atomicrmw.start
 ; LSE-NEXT:    // =>This Inner Loop Header: Depth=1
-; LSE-NEXT:    fmov w9, s0
-; LSE-NEXT:    lsl w9, w9, #16
-; LSE-NEXT:    fmov s2, w9
+; LSE-NEXT:    shll v2.4s, v0.4h, #16
 ; LSE-NEXT:    fadd s2, s2, s1
 ; LSE-NEXT:    fmov w9, s2
 ; LSE-NEXT:    ubfx w10, w9, #16, #1
 ; LSE-NEXT:    add w9, w9, w8
 ; LSE-NEXT:    add w9, w10, w9
-; LSE-NEXT:    fmov w10, s0
 ; LSE-NEXT:    lsr w9, w9, #16
-; LSE-NEXT:    mov w11, w10
-; LSE-NEXT:    casalh w11, w9, [x0]
+; LSE-NEXT:    fmov s2, w9
+; LSE-NEXT:    fmov w9, s0
+; LSE-NEXT:    fmov w10, s2
+; LSE-NEXT:    mov w11, w9
+; LSE-NEXT:    casalh w11, w10, [x0]
 ; LSE-NEXT:    fmov s0, w11
-; LSE-NEXT:    cmp w11, w10, uxth
+; LSE-NEXT:    cmp w11, w9, uxth
 ; LSE-NEXT:    b.ne .LBB2_1
 ; LSE-NEXT:  // %bb.2: // %atomicrmw.end
-; LSE-NEXT:    // kill: def $h0 killed $h0 killed $s0
+; LSE-NEXT:    // kill: def $h0 killed $h0 killed $d0
 ; LSE-NEXT:    ret
 ;
 ; SOFTFP-NOLSE-LABEL: test_atomicrmw_fadd_bf16_seq_cst_align2:
@@ -281,17 +276,14 @@ define bfloat @test_atomicrmw_fadd_bf16_seq_cst_align2(ptr %ptr, bfloat %value)
 define bfloat @test_atomicrmw_fadd_bf16_seq_cst_align4(ptr %ptr, bfloat %value) #0 {
 ; NOLSE-LABEL: test_atomicrmw_fadd_bf16_seq_cst_align4:
 ; NOLSE:       // %bb.0:
-; NOLSE-NEXT:    // kill: def $h0 killed $h0 def $s0
-; NOLSE-NEXT:    fmov w9, s0
+; NOLSE-NEXT:    // kill: def $h0 killed $h0 def $d0
+; NOLSE-NEXT:    shll v1.4s, v0.4h, #16
 ; NOLSE-NEXT:    mov w8, #32767 // =0x7fff
-; NOLSE-NEXT:    lsl w9, w9, #16
-; NOLSE-NEXT:    fmov s1, w9
 ; NOLSE-NEXT:  .LBB3_1: // %atomicrmw.start
 ; NOLSE-NEXT:    // =>This Inner Loop Header: Depth=1
 ; NOLSE-NEXT:    ldaxrh w9, [x0]
 ; NOLSE-NEXT:    fmov s0, w9
-; NOLSE-NEXT:    lsl w9, w9, #16
-; NOLSE-NEXT:    fmov s2, w9
+; NOLSE-NEXT:    shll v2.4s, v0.4h, #16
 ; NOLSE-NEXT:    fadd s2, s2, s1
 ; NOLSE-NEXT:    fmov w9, s2
 ; NOLSE-NEXT:    ubfx w10, w9, #16, #1
@@ -301,36 +293,34 @@ define bfloat @test_atomicrmw_fadd_bf16_seq_cst_align4(ptr %ptr, bfloat %value)
 ; NOLSE-NEXT:    stlxrh w10, w9, [x0]
 ; NOLSE-NEXT:    cbnz w10, .LBB3_1
 ; NOLSE-NEXT:  // %bb.2: // %atomicrmw.end
-; NOLSE-NEXT:    // kill: def $h0 killed $h0 killed $s0
+; NOLSE-NEXT:    // kill: def $h0 killed $h0 killed $d0
 ; NOLSE-NEXT:    ret
 ;
 ; LSE-LABEL: test_atomicrmw_fadd_bf16_seq_cst_align4:
 ; LSE:       // %bb.0:
-; LSE-NEXT:    // kill: def $h0 killed $h0 def $s0
-; LSE-NEXT:    fmov w9, s0
+; LSE-NEXT:    // kill: def $h0 killed $h0 def $d0
+; LSE-NEXT:    shll v1.4s, v0.4h, #16
 ; LSE-NEXT:    mov w8, #32767 // =0x7fff
 ; LSE-NEXT:    ldr h0, [x0]
-; LSE-NEXT:    lsl w9, w9, #16
-; LSE-NEXT:    fmov s1, w9
 ; LSE-NEXT:  .LBB3_1: // %atomicrmw.start
 ; LSE-NEXT:    // =>This Inner Loop Header: Depth=1
-; LSE-NEXT:    fmov w9, s0
-; LSE-NEXT:    lsl w9, w9, #16
-; LSE-NEXT:    fmov s2, w9
+; LSE-NEXT:    shll v2.4s, v0.4h, #16
 ; LSE-NEXT:    fadd s2, s2, s1
 ; LSE-NEXT:    fmov w9, s2
 ; LSE-NEXT:    ubfx w10, w9, #16, #1
 ; LSE-NEXT:    add w9, w9, w8
 ; LSE-NEXT:    add w9, w10, w9
-; LSE-NEXT:    fmov w10, s0
 ; LSE-NEXT:    lsr w9, w9, #16
-; LSE-NEXT:    mov w11, w10
-; LSE-NEXT:    casalh w11, w9, [x0]
+; LSE-NEXT:    fmov s2, w9
+; LSE-NEXT:    fmov w9, s0
+; LSE-NEXT:    fmov w10, s2
+; LSE-NEXT:    mov w11, w9
+; LSE-NEXT:    casalh w11, w10, [x0]
 ; LSE-NEXT:    fmov s0, w11
-; LSE-NEXT:    cmp w11, w10, uxth
+; LSE-NEXT:    cmp w11, w9, uxth
 ; LSE-NEXT:    b.ne .LBB3_1
 ; LSE-NEXT:  // %bb.2: // %atomicrmw.end
-; LSE-NEXT:    // kill: def $h0 killed $h0 killed $s0
+; LSE-NEXT:    // kill: def $h0 killed $h0 killed $d0
 ; LSE-NEXT:    ret
 ;
 ; SOFTFP-NOLSE-LABEL: test_atomicrmw_fadd_bf16_seq_cst_align4:
diff --git a/llvm/test/CodeGen/AArch64/atomicrmw-fmax.ll b/llvm/test/CodeGen/AArch64/atomicrmw-fmax.ll
index 888b795..818dcf3 100644
--- a/llvm/test/CodeGen/AArch64/atomicrmw-fmax.ll
+++ b/llvm/test/CodeGen/AArch64/atomicrmw-fmax.ll
@@ -184,17 +184,14 @@ define half @test_atomicrmw_fmax_f16_seq_cst_align4(ptr %ptr, half %value) #0 {
 define bfloat @test_atomicrmw_fmax_bf16_seq_cst_align2(ptr %ptr, bfloat %value) #0 {
 ; NOLSE-LABEL: test_atomicrmw_fmax_bf16_seq_cst_align2:
 ; NOLSE:       // %bb.0:
-; NOLSE-NEXT:    // kill: def $h0 killed $h0 def $s0
-; NOLSE-NEXT:    fmov w9, s0
+; NOLSE-NEXT:    // kill: def $h0 killed $h0 def $d0
+; NOLSE-NEXT:    shll v1.4s, v0.4h, #16
 ; NOLSE-NEXT:    mov w8, #32767 // =0x7fff
-; NOLSE-NEXT:    lsl w9, w9, #16
-; NOLSE-NEXT:    fmov s1, w9
 ; NOLSE-NEXT:  .LBB2_1: // %atomicrmw.start
 ; NOLSE-NEXT:    // =>This Inner Loop Header: Depth=1
 ; NOLSE-NEXT:    ldaxrh w9, [x0]
 ; NOLSE-NEXT:    fmov s0, w9
-; NOLSE-NEXT:    lsl w9, w9, #16
-; NOLSE-NEXT:    fmov s2, w9
+; NOLSE-NEXT:    shll v2.4s, v0.4h, #16
 ; NOLSE-NEXT:    fmaxnm s2, s2, s1
 ; NOLSE-NEXT:    fmov w9, s2
 ; NOLSE-NEXT:    ubfx w10, w9, #16, #1
@@ -204,36 +201,34 @@ define bfloat @test_atomicrmw_fmax_bf16_seq_cst_align2(ptr %ptr, bfloat %value)
 ; NOLSE-NEXT:    stlxrh w10, w9, [x0]
 ; NOLSE-NEXT:    cbnz w10, .LBB2_1
 ; NOLSE-NEXT:  // %bb.2: // %atomicrmw.end
-; NOLSE-NEXT:    // kill: def $h0 killed $h0 killed $s0
+; NOLSE-NEXT:    // kill: def $h0 killed $h0 killed $d0
 ; NOLSE-NEXT:    ret
 ;
 ; LSE-LABEL: test_atomicrmw_fmax_bf16_seq_cst_align2:
 ; LSE:       // %bb.0:
-; LSE-NEXT:    // kill: def $h0 killed $h0 def $s0
-; LSE-NEXT:    fmov w9, s0
+; LSE-NEXT:    // kill: def $h0 killed $h0 def $d0
+; LSE-NEXT:    shll v1.4s, v0.4h, #16
 ; LSE-NEXT:    mov w8, #32767 // =0x7fff
 ; LSE-NEXT:    ldr h0, [x0]
-; LSE-NEXT:    lsl w9, w9, #16
-; LSE-NEXT:    fmov s1, w9
 ; LSE-NEXT:  .LBB2_1: // %atomicrmw.start
 ; LSE-NEXT:    // =>This Inner Loop Header: Depth=1
-; LSE-NEXT:    fmov w9, s0
-; LSE-NEXT:    lsl w9, w9, #16
-; LSE-NEXT:    fmov s2, w9
+; LSE-NEXT:    shll v2.4s, v0.4h, #16
 ; LSE-NEXT:    fmaxnm s2, s2, s1
 ; LSE-NEXT:    fmov w9, s2
 ; LSE-NEXT:    ubfx w10, w9, #16, #1
 ; LSE-NEXT:    add w9, w9, w8
 ; LSE-NEXT:    add w9, w10, w9
-; LSE-NEXT:    fmov w10, s0
 ; LSE-NEXT:    lsr w9, w9, #16
-; LSE-NEXT:    mov w11, w10
-; LSE-NEXT:    casalh w11, w9, [x0]
+; LSE-NEXT:    fmov s2, w9
+; LSE-NEXT:    fmov w9, s0
+; LSE-NEXT:    fmov w10, s2
+; LSE-NEXT:    mov w11, w9
+; LSE-NEXT:    casalh w11, w10, [x0]
 ; LSE-NEXT:    fmov s0, w11
-; LSE-NEXT:    cmp w11, w10, uxth
+; LSE-NEXT:    cmp w11, w9, uxth
 ; LSE-NEXT:    b.ne .LBB2_1
 ; LSE-NEXT:  // %bb.2: // %atomicrmw.end
-; LSE-NEXT:    // kill: def $h0 killed $h0 killed $s0
+; LSE-NEXT:    // kill: def $h0 killed $h0 killed $d0
 ; LSE-NEXT:    ret
 ;
 ; SOFTFP-NOLSE-LABEL: test_atomicrmw_fmax_bf16_seq_cst_align2:
@@ -283,17 +278,14 @@ define bfloat @test_atomicrmw_fmax_bf16_seq_cst_align2(ptr %ptr, bfloat %value)
 define bfloat @test_atomicrmw_fmax_bf16_seq_cst_align4(ptr %ptr, bfloat %value) #0 {
 ; NOLSE-LABEL: test_atomicrmw_fmax_bf16_seq_cst_align4:
 ; NOLSE:       // %bb.0:
-; NOLSE-NEXT:    // kill: def $h0 killed $h0 def $s0
-; NOLSE-NEXT:    fmov w9, s0
+; NOLSE-NEXT:    // kill: def $h0 killed $h0 def $d0
+; NOLSE-NEXT:    shll v1.4s, v0.4h, #16
 ; NOLSE-NEXT:    mov w8, #32767 // =0x7fff
-; NOLSE-NEXT:    lsl w9, w9, #16
-; NOLSE-NEXT:    fmov s1, w9
 ; NOLSE-NEXT:  .LBB3_1: // %atomicrmw.start
 ; NOLSE-NEXT:    // =>This Inner Loop Header: Depth=1
 ; NOLSE-NEXT:    ldaxrh w9, [x0]
 ; NOLSE-NEXT:    fmov s0, w9
-; NOLSE-NEXT:    lsl w9, w9, #16
-; NOLSE-NEXT:    fmov s2, w9
+; NOLSE-NEXT:    shll v2.4s, v0.4h, #16
 ; NOLSE-NEXT:    fmaxnm s2, s2, s1
 ; NOLSE-NEXT:    fmov w9, s2
 ; NOLSE-NEXT:    ubfx w10, w9, #16, #1
@@ -303,36 +295,34 @@ define bfloat @test_atomicrmw_fmax_bf16_seq_cst_align4(ptr %ptr, bfloat %value)
 ; NOLSE-NEXT:    stlxrh w10, w9, [x0]
 ; NOLSE-NEXT:    cbnz w10, .LBB3_1
 ; NOLSE-NEXT:  // %bb.2: // %atomicrmw.end
-; NOLSE-NEXT:    // kill: def $h0 killed $h0 killed $s0
+; NOLSE-NEXT:    // kill: def $h0 killed $h0 killed $d0
 ; NOLSE-NEXT:    ret
 ;
 ; LSE-LABEL: test_atomicrmw_fmax_bf16_seq_cst_align4:
 ; LSE:       // %bb.0:
-; LSE-NEXT:    // kill: def $h0 killed $h0 def $s0
-; LSE-NEXT:    fmov w9, s0
+; LSE-NEXT:    // kill: def $h0 killed $h0 def $d0
+; LSE-NEXT:    shll v1.4s, v0.4h, #16
 ; LSE-NEXT:    mov w8, #32767 // =0x7fff
 ; LSE-NEXT:    ldr h0, [x0]
-; LSE-NEXT:    lsl w9, w9, #16
-; LSE-NEXT:    fmov s1, w9
 ; LSE-NEXT:  .LBB3_1: // %atomicrmw.start
 ; LSE-NEXT:    // =>This Inner Loop Header: Depth=1
-; LSE-NEXT:    fmov w9, s0
-; LSE-NEXT:    lsl w9, w9, #16
-; LSE-NEXT:    fmov s2, w9
+; LSE-NEXT:    shll v2.4s, v0.4h, #16
 ; LSE-NEXT:    fmaxnm s2, s2, s1
 ; LSE-NEXT:    fmov w9, s2
 ; LSE-NEXT:    ubfx w10, w9, #16, #1
 ; LSE-NEXT:    add w9, w9, w8
 ; LSE-NEXT:    add w9, w10, w9
-; LSE-NEXT:    fmov w10, s0
 ; LSE-NEXT:    lsr w9, w9, #16
-; LSE-NEXT:    mov w11, w10
-; LSE-NEXT:    casalh w11, w9, [x0]
+; LSE-NEXT:    fmov s2, w9
+; LSE-NEXT:    fmov w9, s0
+; LSE-NEXT:    fmov w10, s2
+; LSE-NEXT:    mov w11, w9
+; LSE-NEXT:    casalh w11, w10, [x0]
 ; LSE-NEXT:    fmov s0, w11
-; LSE-NEXT:    cmp w11, w10, uxth
+; LSE-NEXT:    cmp w11, w9, uxth
 ; LSE-NEXT:    b.ne .LBB3_1
 ; LSE-NEXT:  // %bb.2: // %atomicrmw.end
-; LSE-NEXT:    // kill: def $h0 killed $h0 killed $s0
+; LSE-NEXT:    // kill: def $h0 killed $h0 killed $d0
 ; LSE-NEXT:    ret
 ;
 ; SOFTFP-NOLSE-LABEL: test_atomicrmw_fmax_bf16_seq_cst_align4:
@@ -653,31 +643,23 @@ define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_seq_cst_align4(ptr %ptr, <2 x bf
 ; NOLSE-LABEL: test_atomicrmw_fmax_v2bf16_seq_cst_align4:
 ; NOLSE:       // %bb.0:
 ; NOLSE-NEXT:    // kill: def $d0 killed $d0 def $q0
-; NOLSE-NEXT:    mov h1, v0.h[1]
-; NOLSE-NEXT:    fmov w10, s0
+; NOLSE-NEXT:    dup v1.4h, v0.h[1]
 ; NOLSE-NEXT:    mov w8, #32767 // =0x7fff
-; NOLSE-NEXT:    lsl w10, w10, #16
-; NOLSE-NEXT:    fmov w9, s1
-; NOLSE-NEXT:    fmov s1, w10
-; NOLSE-NEXT:    lsl w9, w9, #16
-; NOLSE-NEXT:    fmov s0, w9
+; NOLSE-NEXT:    shll v0.4s, v0.4h, #16
+; NOLSE-NEXT:    shll v1.4s, v1.4h, #16
 ; NOLSE-NEXT:  .LBB7_1: // %atomicrmw.start
 ; NOLSE-NEXT:    // =>This Inner Loop Header: Depth=1
 ; NOLSE-NEXT:    ldaxr w9, [x0]
 ; NOLSE-NEXT:    fmov s2, w9
-; NOLSE-NEXT:    mov h3, v2.h[1]
-; NOLSE-NEXT:    fmov w11, s2
-; NOLSE-NEXT:    lsl w11, w11, #16
-; NOLSE-NEXT:    fmov w10, s3
-; NOLSE-NEXT:    fmov s3, w11
-; NOLSE-NEXT:    lsl w10, w10, #16
-; NOLSE-NEXT:    fmaxnm s3, s3, s1
-; NOLSE-NEXT:    fmov s2, w10
+; NOLSE-NEXT:    dup v3.4h, v2.h[1]
+; NOLSE-NEXT:    shll v2.4s, v2.4h, #16
 ; NOLSE-NEXT:    fmaxnm s2, s2, s0
-; NOLSE-NEXT:    fmov w11, s3
+; NOLSE-NEXT:    shll v3.4s, v3.4h, #16
+; NOLSE-NEXT:    fmaxnm s3, s3, s1
+; NOLSE-NEXT:    fmov w11, s2
 ; NOLSE-NEXT:    ubfx w13, w11, #16, #1
 ; NOLSE-NEXT:    add w11, w11, w8
-; NOLSE-NEXT:    fmov w10, s2
+; NOLSE-NEXT:    fmov w10, s3
 ; NOLSE-NEXT:    add w11, w13, w11
 ; NOLSE-NEXT:    lsr w11, w11, #16
 ; NOLSE-NEXT:    ubfx w12, w10, #16, #1
@@ -697,25 +679,17 @@ define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_seq_cst_align4(ptr %ptr, <2 x bf
 ; LSE-LABEL: test_atomicrmw_fmax_v2bf16_seq_cst_align4:
 ; LSE:       // %bb.0:
 ; LSE-NEXT:    // kill: def $d0 killed $d0 def $q0
-; LSE-NEXT:    mov h1, v0.h[1]
-; LSE-NEXT:    fmov w10, s0
+; LSE-NEXT:    dup v1.4h, v0.h[1]
+; LSE-NEXT:    shll v2.4s, v0.4h, #16
 ; LSE-NEXT:    mov w8, #32767 // =0x7fff
 ; LSE-NEXT:    ldr s0, [x0]
-; LSE-NEXT:    lsl w10, w10, #16
-; LSE-NEXT:    fmov w9, s1
-; LSE-NEXT:    fmov s2, w10
-; LSE-NEXT:    lsl w9, w9, #16
-; LSE-NEXT:    fmov s1, w9
+; LSE-NEXT:    shll v1.4s, v1.4h, #16
 ; LSE-NEXT:  .LBB7_1: // %atomicrmw.start
 ; LSE-NEXT:    // =>This Inner Loop Header: Depth=1
-; LSE-NEXT:    mov h3, v0.h[1]
-; LSE-NEXT:    fmov w10, s0
-; LSE-NEXT:    lsl w10, w10, #16
-; LSE-NEXT:    fmov w9, s3
-; LSE-NEXT:    fmov s4, w10
-; LSE-NEXT:    lsl w9, w9, #16
+; LSE-NEXT:    dup v3.4h, v0.h[1]
+; LSE-NEXT:    shll v4.4s, v0.4h, #16
 ; LSE-NEXT:    fmaxnm s4, s4, s2
-; LSE-NEXT:    fmov s3, w9
+; LSE-NEXT:    shll v3.4s, v3.4h, #16
 ; LSE-NEXT:    fmaxnm s3, s3, s1
 ; LSE-NEXT:    fmov w10, s4
 ; LSE-NEXT:    ubfx w12, w10, #16, #1
diff --git a/llvm/test/CodeGen/AArch64/atomicrmw-fmin.ll b/llvm/test/CodeGen/AArch64/atomicrmw-fmin.ll
index a3665c6..b969241e 100644
--- a/llvm/test/CodeGen/AArch64/atomicrmw-fmin.ll
+++ b/llvm/test/CodeGen/AArch64/atomicrmw-fmin.ll
@@ -184,17 +184,14 @@ define half @test_atomicrmw_fmin_f16_seq_cst_align4(ptr %ptr, half %value) #0 {
 define bfloat @test_atomicrmw_fmin_bf16_seq_cst_align2(ptr %ptr, bfloat %value) #0 {
 ; NOLSE-LABEL: test_atomicrmw_fmin_bf16_seq_cst_align2:
 ; NOLSE:       // %bb.0:
-; NOLSE-NEXT:    // kill: def $h0 killed $h0 def $s0
-; NOLSE-NEXT:    fmov w9, s0
+; NOLSE-NEXT:    // kill: def $h0 killed $h0 def $d0
+; NOLSE-NEXT:    shll v1.4s, v0.4h, #16
 ; NOLSE-NEXT:    mov w8, #32767 // =0x7fff
-; NOLSE-NEXT:    lsl w9, w9, #16
-; NOLSE-NEXT:    fmov s1, w9
 ; NOLSE-NEXT:  .LBB2_1: // %atomicrmw.start
 ; NOLSE-NEXT:    // =>This Inner Loop Header: Depth=1
 ; NOLSE-NEXT:    ldaxrh w9, [x0]
 ; NOLSE-NEXT:    fmov s0, w9
-; NOLSE-NEXT:    lsl w9, w9, #16
-; NOLSE-NEXT:    fmov s2, w9
+; NOLSE-NEXT:    shll v2.4s, v0.4h, #16
 ; NOLSE-NEXT:    fminnm s2, s2, s1
 ; NOLSE-NEXT:    fmov w9, s2
 ; NOLSE-NEXT:    ubfx w10, w9, #16, #1
@@ -204,36 +201,34 @@ define bfloat @test_atomicrmw_fmin_bf16_seq_cst_align2(ptr %ptr, bfloat %value)
 ; NOLSE-NEXT:    stlxrh w10, w9, [x0]
 ; NOLSE-NEXT:    cbnz w10, .LBB2_1
 ; NOLSE-NEXT:  // %bb.2: // %atomicrmw.end
-; NOLSE-NEXT:    // kill: def $h0 killed $h0 killed $s0
+; NOLSE-NEXT:    // kill: def $h0 killed $h0 killed $d0
 ; NOLSE-NEXT:    ret
 ;
 ; LSE-LABEL: test_atomicrmw_fmin_bf16_seq_cst_align2:
 ; LSE:       // %bb.0:
-; LSE-NEXT:    // kill: def $h0 killed $h0 def $s0
-; LSE-NEXT:    fmov w9, s0
+; LSE-NEXT:    // kill: def $h0 killed $h0 def $d0
+; LSE-NEXT:    shll v1.4s, v0.4h, #16
 ; LSE-NEXT:    mov w8, #32767 // =0x7fff
 ; LSE-NEXT:    ldr h0, [x0]
-; LSE-NEXT:    lsl w9, w9, #16
-; LSE-NEXT:    fmov s1, w9
 ; LSE-NEXT:  .LBB2_1: // %atomicrmw.start
 ; LSE-NEXT:    // =>This Inner Loop Header: Depth=1
-; LSE-NEXT:    fmov w9, s0
-; LSE-NEXT:    lsl w9, w9, #16
-; LSE-NEXT:    fmov s2, w9
+; LSE-NEXT:    shll v2.4s, v0.4h, #16
 ; LSE-NEXT:    fminnm s2, s2, s1
 ; LSE-NEXT:    fmov w9, s2
 ; LSE-NEXT:    ubfx w10, w9, #16, #1
 ; LSE-NEXT:    add w9, w9, w8
 ; LSE-NEXT:    add w9, w10, w9
-; LSE-NEXT:    fmov w10, s0
 ; LSE-NEXT:    lsr w9, w9, #16
-; LSE-NEXT:    mov w11, w10
-; LSE-NEXT:    casalh w11, w9, [x0]
+; LSE-NEXT:    fmov s2, w9
+; LSE-NEXT:    fmov w9, s0
+; LSE-NEXT:    fmov w10, s2
+; LSE-NEXT:    mov w11, w9
+; LSE-NEXT:    casalh w11, w10, [x0]
 ; LSE-NEXT:    fmov s0, w11
-; LSE-NEXT:    cmp w11, w10, uxth
+; LSE-NEXT:    cmp w11, w9, uxth
 ; LSE-NEXT:    b.ne .LBB2_1
 ; LSE-NEXT:  // %bb.2: // %atomicrmw.end
-; LSE-NEXT:    // kill: def $h0 killed $h0 killed $s0
+; LSE-NEXT:    // kill: def $h0 killed $h0 killed $d0
 ; LSE-NEXT:    ret
 ;
 ; SOFTFP-NOLSE-LABEL: test_atomicrmw_fmin_bf16_seq_cst_align2:
@@ -283,17 +278,14 @@ define bfloat @test_atomicrmw_fmin_bf16_seq_cst_align2(ptr %ptr, bfloat %value)
 define bfloat @test_atomicrmw_fmin_bf16_seq_cst_align4(ptr %ptr, bfloat %value) #0 {
 ; NOLSE-LABEL: test_atomicrmw_fmin_bf16_seq_cst_align4:
 ; NOLSE:       // %bb.0:
-; NOLSE-NEXT:    // kill: def $h0 killed $h0 def $s0
-; NOLSE-NEXT:    fmov w9, s0
+; NOLSE-NEXT:    // kill: def $h0 killed $h0 def $d0
+; NOLSE-NEXT:    shll v1.4s, v0.4h, #16
 ; NOLSE-NEXT:    mov w8, #32767 // =0x7fff
-; NOLSE-NEXT:    lsl w9, w9, #16
-; NOLSE-NEXT:    fmov s1, w9
 ; NOLSE-NEXT:  .LBB3_1: // %atomicrmw.start
 ; NOLSE-NEXT:    // =>This Inner Loop Header: Depth=1
 ; NOLSE-NEXT:    ldaxrh w9, [x0]
 ; NOLSE-NEXT:    fmov s0, w9
-; NOLSE-NEXT:    lsl w9, w9, #16
-; NOLSE-NEXT:    fmov s2, w9
+; NOLSE-NEXT:    shll v2.4s, v0.4h, #16
 ; NOLSE-NEXT:    fminnm s2, s2, s1
 ; NOLSE-NEXT:    fmov w9, s2
 ; NOLSE-NEXT:    ubfx w10, w9, #16, #1
@@ -303,36 +295,34 @@ define bfloat @test_atomicrmw_fmin_bf16_seq_cst_align4(ptr %ptr, bfloat %value)
 ; NOLSE-NEXT:    stlxrh w10, w9, [x0]
 ; NOLSE-NEXT:    cbnz w10, .LBB3_1
 ; NOLSE-NEXT:  // %bb.2: // %atomicrmw.end
-; NOLSE-NEXT:    // kill: def $h0 killed $h0 killed $s0
+; NOLSE-NEXT:    // kill: def $h0 killed $h0 killed $d0
 ; NOLSE-NEXT:    ret
 ;
 ; LSE-LABEL: test_atomicrmw_fmin_bf16_seq_cst_align4:
 ; LSE:       // %bb.0:
-; LSE-NEXT:    // kill: def $h0 killed $h0 def $s0
-; LSE-NEXT:    fmov w9, s0
+; LSE-NEXT:    // kill: def $h0 killed $h0 def $d0
+; LSE-NEXT:    shll v1.4s, v0.4h, #16
 ; LSE-NEXT:    mov w8, #32767 // =0x7fff
 ; LSE-NEXT:    ldr h0, [x0]
-; LSE-NEXT:    lsl w9, w9, #16
-; LSE-NEXT:    fmov s1, w9
 ; LSE-NEXT:  .LBB3_1: // %atomicrmw.start
 ; LSE-NEXT:    // =>This Inner Loop Header: Depth=1
-; LSE-NEXT:    fmov w9, s0
-; LSE-NEXT:    lsl w9, w9, #16
-; LSE-NEXT:    fmov s2, w9
+; LSE-NEXT:    shll v2.4s, v0.4h, #16
 ; LSE-NEXT:    fminnm s2, s2, s1
 ; LSE-NEXT:    fmov w9, s2
 ; LSE-NEXT:    ubfx w10, w9, #16, #1
 ; LSE-NEXT:    add w9, w9, w8
 ; LSE-NEXT:    add w9, w10, w9
-; LSE-NEXT:    fmov w10, s0
 ; LSE-NEXT:    lsr w9, w9, #16
-; LSE-NEXT:    mov w11, w10
-; LSE-NEXT:    casalh w11, w9, [x0]
+; LSE-NEXT:    fmov s2, w9
+; LSE-NEXT:    fmov w9, s0
+; LSE-NEXT:    fmov w10, s2
+; LSE-NEXT:    mov w11, w9
+; LSE-NEXT:    casalh w11, w10, [x0]
 ; LSE-NEXT:    fmov s0, w11
-; LSE-NEXT:    cmp w11, w10, uxth
+; LSE-NEXT:    cmp w11, w9, uxth
 ; LSE-NEXT:    b.ne .LBB3_1
 ; LSE-NEXT:  // %bb.2: // %atomicrmw.end
-; LSE-NEXT:    // kill: def $h0 killed $h0 killed $s0
+; LSE-NEXT:    // kill: def $h0 killed $h0 killed $d0
 ; LSE-NEXT:    ret
 ;
 ; SOFTFP-NOLSE-LABEL: test_atomicrmw_fmin_bf16_seq_cst_align4:
@@ -653,31 +643,23 @@ define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_seq_cst_align4(ptr %ptr, <2 x bf
 ; NOLSE-LABEL: test_atomicrmw_fmin_v2bf16_seq_cst_align4:
 ; NOLSE:       // %bb.0:
 ; NOLSE-NEXT:    // kill: def $d0 killed $d0 def $q0
-; NOLSE-NEXT:    mov h1, v0.h[1]
-; NOLSE-NEXT:    fmov w10, s0
+; NOLSE-NEXT:    dup v1.4h, v0.h[1]
 ; NOLSE-NEXT:    mov w8, #32767 // =0x7fff
-; NOLSE-NEXT:    lsl w10, w10, #16
-; NOLSE-NEXT:    fmov w9, s1
-; NOLSE-NEXT:    fmov s1, w10
-; NOLSE-NEXT:    lsl w9, w9, #16
-; NOLSE-NEXT:    fmov s0, w9
+; NOLSE-NEXT:    shll v0.4s, v0.4h, #16
+; NOLSE-NEXT:    shll v1.4s, v1.4h, #16
 ; NOLSE-NEXT:  .LBB7_1: // %atomicrmw.start
 ; NOLSE-NEXT:    // =>This Inner Loop Header: Depth=1
 ; NOLSE-NEXT:    ldaxr w9, [x0]
 ; NOLSE-NEXT:    fmov s2, w9
-; NOLSE-NEXT:    mov h3, v2.h[1]
-; NOLSE-NEXT:    fmov w11, s2
-; NOLSE-NEXT:    lsl w11, w11, #16
-; NOLSE-NEXT:    fmov w10, s3
-; NOLSE-NEXT:    fmov s3, w11
-; NOLSE-NEXT:    lsl w10, w10, #16
-; NOLSE-NEXT:    fminnm s3, s3, s1
-; NOLSE-NEXT:    fmov s2, w10
+; NOLSE-NEXT:    dup v3.4h, v2.h[1]
+; NOLSE-NEXT:    shll v2.4s, v2.4h, #16
 ; NOLSE-NEXT:    fminnm s2, s2, s0
-; NOLSE-NEXT:    fmov w11, s3
+; NOLSE-NEXT:    shll v3.4s, v3.4h, #16
+; NOLSE-NEXT:    fminnm s3, s3, s1
+; NOLSE-NEXT:    fmov w11, s2
 ; NOLSE-NEXT:    ubfx w13, w11, #16, #1
 ; NOLSE-NEXT:    add w11, w11, w8
-; NOLSE-NEXT:    fmov w10, s2
+; NOLSE-NEXT:    fmov w10, s3
 ; NOLSE-NEXT:    add w11, w13, w11
 ; NOLSE-NEXT:    lsr w11, w11, #16
 ; NOLSE-NEXT:    ubfx w12, w10, #16, #1
@@ -697,25 +679,17 @@ define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_seq_cst_align4(ptr %ptr, <2 x bf
 ; LSE-LABEL: test_atomicrmw_fmin_v2bf16_seq_cst_align4:
 ; LSE:       // %bb.0:
 ; LSE-NEXT:    // kill: def $d0 killed $d0 def $q0
-; LSE-NEXT:    mov h1, v0.h[1]
-; LSE-NEXT:    fmov w10, s0
+; LSE-NEXT:    dup v1.4h, v0.h[1]
+; LSE-NEXT:    shll v2.4s, v0.4h, #16
 ; LSE-NEXT:    mov w8, #32767 // =0x7fff
 ; LSE-NEXT:    ldr s0, [x0]
-; LSE-NEXT:    lsl w10, w10, #16
-; LSE-NEXT:    fmov w9, s1
-; LSE-NEXT:    fmov s2, w10
-; LSE-NEXT:    lsl w9, w9, #16
-; LSE-NEXT:    fmov s1, w9
+; LSE-NEXT:    shll v1.4s, v1.4h, #16
 ; LSE-NEXT:  .LBB7_1: // %atomicrmw.start
 ; LSE-NEXT:    // =>This Inner Loop Header: Depth=1
-; LSE-NEXT:    mov h3, v0.h[1]
-; LSE-NEXT:    fmov w10, s0
-; LSE-NEXT:    lsl w10, w10, #16
-; LSE-NEXT:    fmov w9, s3
-; LSE-NEXT:    fmov s4, w10
-; LSE-NEXT:    lsl w9, w9, #16
+; LSE-NEXT:    dup v3.4h, v0.h[1]
+; LSE-NEXT:    shll v4.4s, v0.4h, #16
 ; LSE-NEXT:    fminnm s4, s4, s2
-; LSE-NEXT:    fmov s3, w9
+; LSE-NEXT:    shll v3.4s, v3.4h, #16
 ; LSE-NEXT:    fminnm s3, s3, s1
 ; LSE-NEXT:    fmov w10, s4
 ; LSE-NEXT:    ubfx w12, w10, #16, #1
diff --git a/llvm/test/CodeGen/AArch64/atomicrmw-fsub.ll b/llvm/test/CodeGen/AArch64/atomicrmw-fsub.ll
index 7725ce0..e603337 100644
--- a/llvm/test/CodeGen/AArch64/atomicrmw-fsub.ll
+++ b/llvm/test/CodeGen/AArch64/atomicrmw-fsub.ll
@@ -182,17 +182,14 @@ define half @test_atomicrmw_fsub_f16_seq_cst_align4(ptr %ptr, half %value) #0 {
 define bfloat @test_atomicrmw_fsub_bf16_seq_cst_align2(ptr %ptr, bfloat %value) #0 {
 ; NOLSE-LABEL: test_atomicrmw_fsub_bf16_seq_cst_align2:
 ; NOLSE:       // %bb.0:
-; NOLSE-NEXT:    // kill: def $h0 killed $h0 def $s0
-; NOLSE-NEXT:    fmov w9, s0
+; NOLSE-NEXT:    // kill: def $h0 killed $h0 def $d0
+; NOLSE-NEXT:    shll v1.4s, v0.4h, #16
 ; NOLSE-NEXT:    mov w8, #32767 // =0x7fff
-; NOLSE-NEXT:    lsl w9, w9, #16
-; NOLSE-NEXT:    fmov s1, w9
 ; NOLSE-NEXT:  .LBB2_1: // %atomicrmw.start
 ; NOLSE-NEXT:    // =>This Inner Loop Header: Depth=1
 ; NOLSE-NEXT:    ldaxrh w9, [x0]
 ; NOLSE-NEXT:    fmov s0, w9
-; NOLSE-NEXT:    lsl w9, w9, #16
-; NOLSE-NEXT:    fmov s2, w9
+; NOLSE-NEXT:    shll v2.4s, v0.4h, #16
 ; NOLSE-NEXT:    fsub s2, s2, s1
 ; NOLSE-NEXT:    fmov w9, s2
 ; NOLSE-NEXT:    ubfx w10, w9, #16, #1
@@ -202,36 +199,34 @@ define bfloat @test_atomicrmw_fsub_bf16_seq_cst_align2(ptr %ptr, bfloat %value)
 ; NOLSE-NEXT:    stlxrh w10, w9, [x0]
 ; NOLSE-NEXT:    cbnz w10, .LBB2_1
 ; NOLSE-NEXT:  // %bb.2: // %atomicrmw.end
-; NOLSE-NEXT:    // kill: def $h0 killed $h0 killed $s0
+; NOLSE-NEXT:    // kill: def $h0 killed $h0 killed $d0
 ; NOLSE-NEXT:    ret
 ;
 ; LSE-LABEL: test_atomicrmw_fsub_bf16_seq_cst_align2:
 ; LSE:       // %bb.0:
-; LSE-NEXT:    // kill: def $h0 killed $h0 def $s0
-; LSE-NEXT:    fmov w9, s0
+; LSE-NEXT:    // kill: def $h0 killed $h0 def $d0
+; LSE-NEXT:    shll v1.4s, v0.4h, #16
 ; LSE-NEXT:    mov w8, #32767 // =0x7fff
 ; LSE-NEXT:    ldr h0, [x0]
-; LSE-NEXT:    lsl w9, w9, #16
-; LSE-NEXT:    fmov s1, w9
 ; LSE-NEXT:  .LBB2_1: // %atomicrmw.start
 ; LSE-NEXT:    // =>This Inner Loop Header: Depth=1
-; LSE-NEXT:    fmov w9, s0
-; LSE-NEXT:    lsl w9, w9, #16
-; LSE-NEXT:    fmov s2, w9
+; LSE-NEXT:    shll v2.4s, v0.4h, #16
 ; LSE-NEXT:    fsub s2, s2, s1
 ; LSE-NEXT:    fmov w9, s2
 ; LSE-NEXT:    ubfx w10, w9, #16, #1
 ; LSE-NEXT:    add w9, w9, w8
 ; LSE-NEXT:    add w9, w10, w9
-; LSE-NEXT:    fmov w10, s0
 ; LSE-NEXT:    lsr w9, w9, #16
-; LSE-NEXT:    mov w11, w10
-; LSE-NEXT:    casalh w11, w9, [x0]
+; LSE-NEXT:    fmov s2, w9
+; LSE-NEXT:    fmov w9, s0
+; LSE-NEXT:    fmov w10, s2
+; LSE-NEXT:    mov w11, w9
+; LSE-NEXT:    casalh w11, w10, [x0]
 ; LSE-NEXT:    fmov s0, w11
-; LSE-NEXT:    cmp w11, w10, uxth
+; LSE-NEXT:    cmp w11, w9, uxth
 ; LSE-NEXT:    b.ne .LBB2_1
 ; LSE-NEXT:  // %bb.2: // %atomicrmw.end
-; LSE-NEXT:    // kill: def $h0 killed $h0 killed $s0
+; LSE-NEXT:    // kill: def $h0 killed $h0 killed $d0
 ; LSE-NEXT:    ret
 ;
 ; SOFTFP-NOLSE-LABEL: test_atomicrmw_fsub_bf16_seq_cst_align2:
@@ -281,17 +276,14 @@ define bfloat @test_atomicrmw_fsub_bf16_seq_cst_align2(ptr %ptr, bfloat %value)
 define bfloat @test_atomicrmw_fsub_bf16_seq_cst_align4(ptr %ptr, bfloat %value) #0 {
 ; NOLSE-LABEL: test_atomicrmw_fsub_bf16_seq_cst_align4:
 ; NOLSE:       // %bb.0:
-; NOLSE-NEXT:    // kill: def $h0 killed $h0 def $s0
-; NOLSE-NEXT:    fmov w9, s0
+; NOLSE-NEXT:    // kill: def $h0 killed $h0 def $d0
+; NOLSE-NEXT:    shll v1.4s, v0.4h, #16
 ; NOLSE-NEXT:    mov w8, #32767 // =0x7fff
-; NOLSE-NEXT:    lsl w9, w9, #16
-; NOLSE-NEXT:    fmov s1, w9
 ; NOLSE-NEXT:  .LBB3_1: // %atomicrmw.start
 ; NOLSE-NEXT:    // =>This Inner Loop Header: Depth=1
 ; NOLSE-NEXT:    ldaxrh w9, [x0]
 ; NOLSE-NEXT:    fmov s0, w9
-; NOLSE-NEXT:    lsl w9, w9, #16
-; NOLSE-NEXT:    fmov s2, w9
+; NOLSE-NEXT:    shll v2.4s, v0.4h, #16
 ; NOLSE-NEXT:    fsub s2, s2, s1
 ; NOLSE-NEXT:    fmov w9, s2
 ; NOLSE-NEXT:    ubfx w10, w9, #16, #1
@@ -301,36 +293,34 @@ define bfloat @test_atomicrmw_fsub_bf16_seq_cst_align4(ptr %ptr, bfloat %value)
 ; NOLSE-NEXT:    stlxrh w10, w9, [x0]
 ; NOLSE-NEXT:    cbnz w10, .LBB3_1
 ; NOLSE-NEXT:  // %bb.2: // %atomicrmw.end
-; NOLSE-NEXT:    // kill: def $h0 killed $h0 killed $s0
+; NOLSE-NEXT:    // kill: def $h0 killed $h0 killed $d0
 ; NOLSE-NEXT:    ret
 ;
 ; LSE-LABEL: test_atomicrmw_fsub_bf16_seq_cst_align4:
 ; LSE:       // %bb.0:
-; LSE-NEXT:    // kill: def $h0 killed $h0 def $s0
-; LSE-NEXT:    fmov w9, s0
+; LSE-NEXT:    // kill: def $h0 killed $h0 def $d0
+; LSE-NEXT:    shll v1.4s, v0.4h, #16
 ; LSE-NEXT:    mov w8, #32767 // =0x7fff
 ; LSE-NEXT:    ldr h0, [x0]
-; LSE-NEXT:    lsl w9, w9, #16
-; LSE-NEXT:    fmov s1, w9
 ; LSE-NEXT:  .LBB3_1: // %atomicrmw.start
 ; LSE-NEXT:    // =>This Inner Loop Header: Depth=1
-; LSE-NEXT:    fmov w9, s0
-; LSE-NEXT:    lsl w9, w9, #16
-; LSE-NEXT:    fmov s2, w9
+; LSE-NEXT:    shll v2.4s, v0.4h, #16
 ; LSE-NEXT:    fsub s2, s2, s1
 ; LSE-NEXT:    fmov w9, s2
 ; LSE-NEXT:    ubfx w10, w9, #16, #1
 ; LSE-NEXT:    add w9, w9, w8
 ; LSE-NEXT:    add w9, w10, w9
-; LSE-NEXT:    fmov w10, s0
 ; LSE-NEXT:    lsr w9, w9, #16
-; LSE-NEXT:    mov w11, w10
-; LSE-NEXT:    casalh w11, w9, [x0]
+; LSE-NEXT:    fmov s2, w9
+; LSE-NEXT:    fmov w9, s0
+; LSE-NEXT:    fmov w10, s2
+; LSE-NEXT:    mov w11, w9
+; LSE-NEXT:    casalh w11, w10, [x0]
 ; LSE-NEXT:    fmov s0, w11
-; LSE-NEXT:    cmp w11, w10, uxth
+; LSE-NEXT:    cmp w11, w9, uxth
 ; LSE-NEXT:    b.ne .LBB3_1
 ; LSE-NEXT:  // %bb.2: // %atomicrmw.end
-; LSE-NEXT:    // kill: def $h0 killed $h0 killed $s0
+; LSE-NEXT:    // kill: def $h0 killed $h0 killed $d0
 ; LSE-NEXT:    ret
 ;
 ; SOFTFP-NOLSE-LABEL: test_atomicrmw_fsub_bf16_seq_cst_align4:
diff --git a/llvm/test/CodeGen/AArch64/bf16-instructions.ll b/llvm/test/CodeGen/AArch64/bf16-instructions.ll
index 3399761..bc06453 100644
--- a/llvm/test/CodeGen/AArch64/bf16-instructions.ll
+++ b/llvm/test/CodeGen/AArch64/bf16-instructions.ll
@@ -5,16 +5,12 @@
 define bfloat @test_fadd(bfloat %a, bfloat %b) #0 {
 ; CHECK-CVT-LABEL: test_fadd:
 ; CHECK-CVT:       // %bb.0:
-; CHECK-CVT-NEXT:    // kill: def $h1 killed $h1 def $s1
-; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-CVT-NEXT:    fmov w9, s1
-; CHECK-CVT-NEXT:    fmov w10, s0
+; CHECK-CVT-NEXT:    // kill: def $h1 killed $h1 def $d1
+; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $d0
 ; CHECK-CVT-NEXT:    mov w8, #32767 // =0x7fff
-; CHECK-CVT-NEXT:    lsl w9, w9, #16
-; CHECK-CVT-NEXT:    lsl w10, w10, #16
-; CHECK-CVT-NEXT:    fmov s0, w9
-; CHECK-CVT-NEXT:    fmov s1, w10
-; CHECK-CVT-NEXT:    fadd s0, s1, s0
+; CHECK-CVT-NEXT:    shll v1.4s, v1.4h, #16
+; CHECK-CVT-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-CVT-NEXT:    fadd s0, s0, s1
 ; CHECK-CVT-NEXT:    fmov w9, s0
 ; CHECK-CVT-NEXT:    ubfx w10, w9, #16, #1
 ; CHECK-CVT-NEXT:    add w8, w9, w8
@@ -26,15 +22,11 @@ define bfloat @test_fadd(bfloat %a, bfloat %b) #0 {
 ;
 ; CHECK-BF16-LABEL: test_fadd:
 ; CHECK-BF16:       // %bb.0:
-; CHECK-BF16-NEXT:    // kill: def $h1 killed $h1 def $s1
-; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-BF16-NEXT:    fmov w8, s1
-; CHECK-BF16-NEXT:    fmov w9, s0
-; CHECK-BF16-NEXT:    lsl w8, w8, #16
-; CHECK-BF16-NEXT:    lsl w9, w9, #16
-; CHECK-BF16-NEXT:    fmov s0, w8
-; CHECK-BF16-NEXT:    fmov s1, w9
-; CHECK-BF16-NEXT:    fadd s0, s1, s0
+; CHECK-BF16-NEXT:    // kill: def $h1 killed $h1 def $d1
+; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-BF16-NEXT:    shll v1.4s, v1.4h, #16
+; CHECK-BF16-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-BF16-NEXT:    fadd s0, s0, s1
 ; CHECK-BF16-NEXT:    bfcvt h0, s0
 ; CHECK-BF16-NEXT:    ret
   %r = fadd bfloat %a, %b
@@ -44,16 +36,12 @@ define bfloat @test_fadd(bfloat %a, bfloat %b) #0 {
 define bfloat @test_fsub(bfloat %a, bfloat %b) #0 {
 ; CHECK-CVT-LABEL: test_fsub:
 ; CHECK-CVT:       // %bb.0:
-; CHECK-CVT-NEXT:    // kill: def $h1 killed $h1 def $s1
-; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-CVT-NEXT:    fmov w9, s1
-; CHECK-CVT-NEXT:    fmov w10, s0
+; CHECK-CVT-NEXT:    // kill: def $h1 killed $h1 def $d1
+; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $d0
 ; CHECK-CVT-NEXT:    mov w8, #32767 // =0x7fff
-; CHECK-CVT-NEXT:    lsl w9, w9, #16
-; CHECK-CVT-NEXT:    lsl w10, w10, #16
-; CHECK-CVT-NEXT:    fmov s0, w9
-; CHECK-CVT-NEXT:    fmov s1, w10
-; CHECK-CVT-NEXT:    fsub s0, s1, s0
+; CHECK-CVT-NEXT:    shll v1.4s, v1.4h, #16
+; CHECK-CVT-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-CVT-NEXT:    fsub s0, s0, s1
 ; CHECK-CVT-NEXT:    fmov w9, s0
 ; CHECK-CVT-NEXT:    ubfx w10, w9, #16, #1
 ; CHECK-CVT-NEXT:    add w8, w9, w8
@@ -65,15 +53,11 @@ define bfloat @test_fsub(bfloat %a, bfloat %b) #0 {
 ;
 ; CHECK-BF16-LABEL: test_fsub:
 ; CHECK-BF16:       // %bb.0:
-; CHECK-BF16-NEXT:    // kill: def $h1 killed $h1 def $s1
-; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-BF16-NEXT:    fmov w8, s1
-; CHECK-BF16-NEXT:    fmov w9, s0
-; CHECK-BF16-NEXT:    lsl w8, w8, #16
-; CHECK-BF16-NEXT:    lsl w9, w9, #16
-; CHECK-BF16-NEXT:    fmov s0, w8
-; CHECK-BF16-NEXT:    fmov s1, w9
-; CHECK-BF16-NEXT:    fsub s0, s1, s0
+; CHECK-BF16-NEXT:    // kill: def $h1 killed $h1 def $d1
+; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-BF16-NEXT:    shll v1.4s, v1.4h, #16
+; CHECK-BF16-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-BF16-NEXT:    fsub s0, s0, s1
 ; CHECK-BF16-NEXT:    bfcvt h0, s0
 ; CHECK-BF16-NEXT:    ret
   %r = fsub bfloat %a, %b
@@ -83,16 +67,12 @@ define bfloat @test_fsub(bfloat %a, bfloat %b) #0 {
 define bfloat @test_fmul(bfloat %a, bfloat %b) #0 {
 ; CHECK-CVT-LABEL: test_fmul:
 ; CHECK-CVT:       // %bb.0:
-; CHECK-CVT-NEXT:    // kill: def $h1 killed $h1 def $s1
-; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-CVT-NEXT:    fmov w9, s1
-; CHECK-CVT-NEXT:    fmov w10, s0
+; CHECK-CVT-NEXT:    // kill: def $h1 killed $h1 def $d1
+; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $d0
 ; CHECK-CVT-NEXT:    mov w8, #32767 // =0x7fff
-; CHECK-CVT-NEXT:    lsl w9, w9, #16
-; CHECK-CVT-NEXT:    lsl w10, w10, #16
-; CHECK-CVT-NEXT:    fmov s0, w9
-; CHECK-CVT-NEXT:    fmov s1, w10
-; CHECK-CVT-NEXT:    fmul s0, s1, s0
+; CHECK-CVT-NEXT:    shll v1.4s, v1.4h, #16
+; CHECK-CVT-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-CVT-NEXT:    fmul s0, s0, s1
 ; CHECK-CVT-NEXT:    fmov w9, s0
 ; CHECK-CVT-NEXT:    ubfx w10, w9, #16, #1
 ; CHECK-CVT-NEXT:    add w8, w9, w8
@@ -104,15 +84,11 @@ define bfloat @test_fmul(bfloat %a, bfloat %b) #0 {
 ;
 ; CHECK-BF16-LABEL: test_fmul:
 ; CHECK-BF16:       // %bb.0:
-; CHECK-BF16-NEXT:    // kill: def $h1 killed $h1 def $s1
-; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-BF16-NEXT:    fmov w8, s1
-; CHECK-BF16-NEXT:    fmov w9, s0
-; CHECK-BF16-NEXT:    lsl w8, w8, #16
-; CHECK-BF16-NEXT:    lsl w9, w9, #16
-; CHECK-BF16-NEXT:    fmov s0, w8
-; CHECK-BF16-NEXT:    fmov s1, w9
-; CHECK-BF16-NEXT:    fmul s0, s1, s0
+; CHECK-BF16-NEXT:    // kill: def $h1 killed $h1 def $d1
+; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-BF16-NEXT:    shll v1.4s, v1.4h, #16
+; CHECK-BF16-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-BF16-NEXT:    fmul s0, s0, s1
 ; CHECK-BF16-NEXT:    bfcvt h0, s0
 ; CHECK-BF16-NEXT:    ret
   %r = fmul bfloat %a, %b
@@ -122,27 +98,21 @@ define bfloat @test_fmul(bfloat %a, bfloat %b) #0 {
 define bfloat @test_fmadd(bfloat %a, bfloat %b, bfloat %c) #0 {
 ; CHECK-CVT-LABEL: test_fmadd:
 ; CHECK-CVT:       // %bb.0:
-; CHECK-CVT-NEXT:    // kill: def $h1 killed $h1 def $s1
-; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-CVT-NEXT:    fmov w8, s1
-; CHECK-CVT-NEXT:    fmov w9, s0
+; CHECK-CVT-NEXT:    // kill: def $h1 killed $h1 def $d1
+; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $d0
 ; CHECK-CVT-NEXT:    mov w10, #32767 // =0x7fff
-; CHECK-CVT-NEXT:    // kill: def $h2 killed $h2 def $s2
-; CHECK-CVT-NEXT:    lsl w8, w8, #16
-; CHECK-CVT-NEXT:    lsl w9, w9, #16
-; CHECK-CVT-NEXT:    fmov s0, w8
-; CHECK-CVT-NEXT:    fmov s1, w9
-; CHECK-CVT-NEXT:    fmul s0, s1, s0
+; CHECK-CVT-NEXT:    // kill: def $h2 killed $h2 def $d2
+; CHECK-CVT-NEXT:    shll v1.4s, v1.4h, #16
+; CHECK-CVT-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-CVT-NEXT:    fmul s0, s0, s1
+; CHECK-CVT-NEXT:    shll v1.4s, v2.4h, #16
 ; CHECK-CVT-NEXT:    fmov w8, s0
 ; CHECK-CVT-NEXT:    ubfx w9, w8, #16, #1
 ; CHECK-CVT-NEXT:    add w8, w8, w10
 ; CHECK-CVT-NEXT:    add w8, w9, w8
-; CHECK-CVT-NEXT:    fmov w9, s2
 ; CHECK-CVT-NEXT:    lsr w8, w8, #16
-; CHECK-CVT-NEXT:    lsl w8, w8, #16
-; CHECK-CVT-NEXT:    lsl w9, w9, #16
 ; CHECK-CVT-NEXT:    fmov s0, w8
-; CHECK-CVT-NEXT:    fmov s1, w9
+; CHECK-CVT-NEXT:    shll v0.4s, v0.4h, #16
 ; CHECK-CVT-NEXT:    fadd s0, s0, s1
 ; CHECK-CVT-NEXT:    fmov w8, s0
 ; CHECK-CVT-NEXT:    ubfx w9, w8, #16, #1
@@ -155,23 +125,15 @@ define bfloat @test_fmadd(bfloat %a, bfloat %b, bfloat %c) #0 {
 ;
 ; CHECK-BF16-LABEL: test_fmadd:
 ; CHECK-BF16:       // %bb.0:
-; CHECK-BF16-NEXT:    // kill: def $h1 killed $h1 def $s1
-; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-BF16-NEXT:    fmov w8, s1
-; CHECK-BF16-NEXT:    fmov w9, s0
-; CHECK-BF16-NEXT:    // kill: def $h2 killed $h2 def $s2
-; CHECK-BF16-NEXT:    lsl w8, w8, #16
-; CHECK-BF16-NEXT:    lsl w9, w9, #16
-; CHECK-BF16-NEXT:    fmov s0, w8
-; CHECK-BF16-NEXT:    fmov s1, w9
-; CHECK-BF16-NEXT:    fmov w9, s2
-; CHECK-BF16-NEXT:    fmul s0, s1, s0
-; CHECK-BF16-NEXT:    lsl w9, w9, #16
-; CHECK-BF16-NEXT:    fmov s1, w9
+; CHECK-BF16-NEXT:    // kill: def $h1 killed $h1 def $d1
+; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-BF16-NEXT:    // kill: def $h2 killed $h2 def $d2
+; CHECK-BF16-NEXT:    shll v1.4s, v1.4h, #16
+; CHECK-BF16-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-BF16-NEXT:    fmul s0, s0, s1
+; CHECK-BF16-NEXT:    shll v1.4s, v2.4h, #16
 ; CHECK-BF16-NEXT:    bfcvt h0, s0
-; CHECK-BF16-NEXT:    fmov w8, s0
-; CHECK-BF16-NEXT:    lsl w8, w8, #16
-; CHECK-BF16-NEXT:    fmov s0, w8
+; CHECK-BF16-NEXT:    shll v0.4s, v0.4h, #16
 ; CHECK-BF16-NEXT:    fadd s0, s0, s1
 ; CHECK-BF16-NEXT:    bfcvt h0, s0
 ; CHECK-BF16-NEXT:    ret
@@ -183,16 +145,12 @@ define bfloat @test_fmadd(bfloat %a, bfloat %b, bfloat %c) #0 {
 define bfloat @test_fdiv(bfloat %a, bfloat %b) #0 {
 ; CHECK-CVT-LABEL: test_fdiv:
 ; CHECK-CVT:       // %bb.0:
-; CHECK-CVT-NEXT:    // kill: def $h1 killed $h1 def $s1
-; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-CVT-NEXT:    fmov w9, s1
-; CHECK-CVT-NEXT:    fmov w10, s0
+; CHECK-CVT-NEXT:    // kill: def $h1 killed $h1 def $d1
+; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $d0
 ; CHECK-CVT-NEXT:    mov w8, #32767 // =0x7fff
-; CHECK-CVT-NEXT:    lsl w9, w9, #16
-; CHECK-CVT-NEXT:    lsl w10, w10, #16
-; CHECK-CVT-NEXT:    fmov s0, w9
-; CHECK-CVT-NEXT:    fmov s1, w10
-; CHECK-CVT-NEXT:    fdiv s0, s1, s0
+; CHECK-CVT-NEXT:    shll v1.4s, v1.4h, #16
+; CHECK-CVT-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-CVT-NEXT:    fdiv s0, s0, s1
 ; CHECK-CVT-NEXT:    fmov w9, s0
 ; CHECK-CVT-NEXT:    ubfx w10, w9, #16, #1
 ; CHECK-CVT-NEXT:    add w8, w9, w8
@@ -204,15 +162,11 @@ define bfloat @test_fdiv(bfloat %a, bfloat %b) #0 {
 ;
 ; CHECK-BF16-LABEL: test_fdiv:
 ; CHECK-BF16:       // %bb.0:
-; CHECK-BF16-NEXT:    // kill: def $h1 killed $h1 def $s1
-; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-BF16-NEXT:    fmov w8, s1
-; CHECK-BF16-NEXT:    fmov w9, s0
-; CHECK-BF16-NEXT:    lsl w8, w8, #16
-; CHECK-BF16-NEXT:    lsl w9, w9, #16
-; CHECK-BF16-NEXT:    fmov s0, w8
-; CHECK-BF16-NEXT:    fmov s1, w9
-; CHECK-BF16-NEXT:    fdiv s0, s1, s0
+; CHECK-BF16-NEXT:    // kill: def $h1 killed $h1 def $d1
+; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-BF16-NEXT:    shll v1.4s, v1.4h, #16
+; CHECK-BF16-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-BF16-NEXT:    fdiv s0, s0, s1
 ; CHECK-BF16-NEXT:    bfcvt h0, s0
 ; CHECK-BF16-NEXT:    ret
   %r = fdiv bfloat %a, %b
@@ -223,14 +177,12 @@ define bfloat @test_frem(bfloat %a, bfloat %b) #0 {
 ; CHECK-CVT-LABEL: test_frem:
 ; CHECK-CVT:       // %bb.0:
 ; CHECK-CVT-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-CVT-NEXT:    // kill: def $h1 killed $h1 def $s1
-; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-CVT-NEXT:    fmov w8, s0
-; CHECK-CVT-NEXT:    fmov w9, s1
-; CHECK-CVT-NEXT:    lsl w8, w8, #16
-; CHECK-CVT-NEXT:    lsl w9, w9, #16
-; CHECK-CVT-NEXT:    fmov s0, w8
-; CHECK-CVT-NEXT:    fmov s1, w9
+; CHECK-CVT-NEXT:    // kill: def $h1 killed $h1 def $d1
+; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-CVT-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-CVT-NEXT:    shll v1.4s, v1.4h, #16
+; CHECK-CVT-NEXT:    // kill: def $s0 killed $s0 killed $q0
+; CHECK-CVT-NEXT:    // kill: def $s1 killed $s1 killed $q1
 ; CHECK-CVT-NEXT:    bl fmodf
 ; CHECK-CVT-NEXT:    fmov w9, s0
 ; CHECK-CVT-NEXT:    mov w8, #32767 // =0x7fff
@@ -246,14 +198,12 @@ define bfloat @test_frem(bfloat %a, bfloat %b) #0 {
 ; CHECK-BF16-LABEL: test_frem:
 ; CHECK-BF16:       // %bb.0:
 ; CHECK-BF16-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-BF16-NEXT:    // kill: def $h1 killed $h1 def $s1
-; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-BF16-NEXT:    fmov w8, s0
-; CHECK-BF16-NEXT:    fmov w9, s1
-; CHECK-BF16-NEXT:    lsl w8, w8, #16
-; CHECK-BF16-NEXT:    lsl w9, w9, #16
-; CHECK-BF16-NEXT:    fmov s0, w8
-; CHECK-BF16-NEXT:    fmov s1, w9
+; CHECK-BF16-NEXT:    // kill: def $h1 killed $h1 def $d1
+; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-BF16-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-BF16-NEXT:    shll v1.4s, v1.4h, #16
+; CHECK-BF16-NEXT:    // kill: def $s0 killed $s0 killed $q0
+; CHECK-BF16-NEXT:    // kill: def $s1 killed $s1 killed $q1
 ; CHECK-BF16-NEXT:    bl fmodf
 ; CHECK-BF16-NEXT:    bfcvt h0, s0
 ; CHECK-BF16-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
@@ -334,17 +284,13 @@ define bfloat @test_select(bfloat %a, bfloat %b, i1 zeroext %c) #0 {
 define bfloat @test_select_cc(bfloat %a, bfloat %b, bfloat %c, bfloat %d) #0 {
 ; CHECK-LABEL: test_select_cc:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $h3 killed $h3 def $s3
-; CHECK-NEXT:    // kill: def $h2 killed $h2 def $s2
-; CHECK-NEXT:    fmov w8, s3
-; CHECK-NEXT:    fmov w9, s2
+; CHECK-NEXT:    // kill: def $h3 killed $h3 def $d3
+; CHECK-NEXT:    // kill: def $h2 killed $h2 def $d2
 ; CHECK-NEXT:    // kill: def $h0 killed $h0 def $s0
 ; CHECK-NEXT:    // kill: def $h1 killed $h1 def $s1
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    fmov s2, w8
-; CHECK-NEXT:    fmov s3, w9
-; CHECK-NEXT:    fcmp s3, s2
+; CHECK-NEXT:    shll v3.4s, v3.4h, #16
+; CHECK-NEXT:    shll v2.4s, v2.4h, #16
+; CHECK-NEXT:    fcmp s2, s3
 ; CHECK-NEXT:    fcsel s0, s0, s1, ne
 ; CHECK-NEXT:    // kill: def $h0 killed $h0 killed $s0
 ; CHECK-NEXT:    ret
@@ -356,15 +302,11 @@ define bfloat @test_select_cc(bfloat %a, bfloat %b, bfloat %c, bfloat %d) #0 {
 define float @test_select_cc_f32_f16(float %a, float %b, bfloat %c, bfloat %d) #0 {
 ; CHECK-LABEL: test_select_cc_f32_f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $h3 killed $h3 def $s3
-; CHECK-NEXT:    // kill: def $h2 killed $h2 def $s2
-; CHECK-NEXT:    fmov w8, s3
-; CHECK-NEXT:    fmov w9, s2
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    fmov s2, w8
-; CHECK-NEXT:    fmov s3, w9
-; CHECK-NEXT:    fcmp s3, s2
+; CHECK-NEXT:    // kill: def $h3 killed $h3 def $d3
+; CHECK-NEXT:    // kill: def $h2 killed $h2 def $d2
+; CHECK-NEXT:    shll v3.4s, v3.4h, #16
+; CHECK-NEXT:    shll v2.4s, v2.4h, #16
+; CHECK-NEXT:    fcmp s2, s3
 ; CHECK-NEXT:    fcsel s0, s0, s1, ne
 ; CHECK-NEXT:    ret
   %cc = fcmp une bfloat %c, %d
@@ -389,15 +331,11 @@ define bfloat @test_select_cc_f16_f32(bfloat %a, bfloat %b, float %c, float %d)
 define i1 @test_fcmp_une(bfloat %a, bfloat %b) #0 {
 ; CHECK-LABEL: test_fcmp_une:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $h1 killed $h1 def $s1
-; CHECK-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-NEXT:    fmov w8, s1
-; CHECK-NEXT:    fmov w9, s0
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    fmov s0, w8
-; CHECK-NEXT:    fmov s1, w9
-; CHECK-NEXT:    fcmp s1, s0
+; CHECK-NEXT:    // kill: def $h1 killed $h1 def $d1
+; CHECK-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-NEXT:    shll v1.4s, v1.4h, #16
+; CHECK-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-NEXT:    fcmp s0, s1
 ; CHECK-NEXT:    cset w0, ne
 ; CHECK-NEXT:    ret
   %r = fcmp une bfloat %a, %b
@@ -407,15 +345,11 @@ define i1 @test_fcmp_une(bfloat %a, bfloat %b) #0 {
 define i1 @test_fcmp_ueq(bfloat %a, bfloat %b) #0 {
 ; CHECK-LABEL: test_fcmp_ueq:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $h1 killed $h1 def $s1
-; CHECK-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-NEXT:    fmov w8, s1
-; CHECK-NEXT:    fmov w9, s0
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    fmov s0, w8
-; CHECK-NEXT:    fmov s1, w9
-; CHECK-NEXT:    fcmp s1, s0
+; CHECK-NEXT:    // kill: def $h1 killed $h1 def $d1
+; CHECK-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-NEXT:    shll v1.4s, v1.4h, #16
+; CHECK-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-NEXT:    fcmp s0, s1
 ; CHECK-NEXT:    cset w8, eq
 ; CHECK-NEXT:    csinc w0, w8, wzr, vc
 ; CHECK-NEXT:    ret
@@ -426,15 +360,11 @@ define i1 @test_fcmp_ueq(bfloat %a, bfloat %b) #0 {
 define i1 @test_fcmp_ugt(bfloat %a, bfloat %b) #0 {
 ; CHECK-LABEL: test_fcmp_ugt:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $h1 killed $h1 def $s1
-; CHECK-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-NEXT:    fmov w8, s1
-; CHECK-NEXT:    fmov w9, s0
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    fmov s0, w8
-; CHECK-NEXT:    fmov s1, w9
-; CHECK-NEXT:    fcmp s1, s0
+; CHECK-NEXT:    // kill: def $h1 killed $h1 def $d1
+; CHECK-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-NEXT:    shll v1.4s, v1.4h, #16
+; CHECK-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-NEXT:    fcmp s0, s1
 ; CHECK-NEXT:    cset w0, hi
 ; CHECK-NEXT:    ret
   %r = fcmp ugt bfloat %a, %b
@@ -444,15 +374,11 @@ define i1 @test_fcmp_ugt(bfloat %a, bfloat %b) #0 {
 define i1 @test_fcmp_uge(bfloat %a, bfloat %b) #0 {
 ; CHECK-LABEL: test_fcmp_uge:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $h1 killed $h1 def $s1
-; CHECK-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-NEXT:    fmov w8, s1
-; CHECK-NEXT:    fmov w9, s0
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    fmov s0, w8
-; CHECK-NEXT:    fmov s1, w9
-; CHECK-NEXT:    fcmp s1, s0
+; CHECK-NEXT:    // kill: def $h1 killed $h1 def $d1
+; CHECK-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-NEXT:    shll v1.4s, v1.4h, #16
+; CHECK-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-NEXT:    fcmp s0, s1
 ; CHECK-NEXT:    cset w0, pl
 ; CHECK-NEXT:    ret
   %r = fcmp uge bfloat %a, %b
@@ -462,15 +388,11 @@ define i1 @test_fcmp_uge(bfloat %a, bfloat %b) #0 {
 define i1 @test_fcmp_ult(bfloat %a, bfloat %b) #0 {
 ; CHECK-LABEL: test_fcmp_ult:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $h1 killed $h1 def $s1
-; CHECK-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-NEXT:    fmov w8, s1
-; CHECK-NEXT:    fmov w9, s0
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    fmov s0, w8
-; CHECK-NEXT:    fmov s1, w9
-; CHECK-NEXT:    fcmp s1, s0
+; CHECK-NEXT:    // kill: def $h1 killed $h1 def $d1
+; CHECK-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-NEXT:    shll v1.4s, v1.4h, #16
+; CHECK-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-NEXT:    fcmp s0, s1
 ; CHECK-NEXT:    cset w0, lt
 ; CHECK-NEXT:    ret
   %r = fcmp ult bfloat %a, %b
@@ -480,15 +402,11 @@ define i1 @test_fcmp_ult(bfloat %a, bfloat %b) #0 {
 define i1 @test_fcmp_ule(bfloat %a, bfloat %b) #0 {
 ; CHECK-LABEL: test_fcmp_ule:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $h1 killed $h1 def $s1
-; CHECK-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-NEXT:    fmov w8, s1
-; CHECK-NEXT:    fmov w9, s0
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    fmov s0, w8
-; CHECK-NEXT:    fmov s1, w9
-; CHECK-NEXT:    fcmp s1, s0
+; CHECK-NEXT:    // kill: def $h1 killed $h1 def $d1
+; CHECK-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-NEXT:    shll v1.4s, v1.4h, #16
+; CHECK-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-NEXT:    fcmp s0, s1
 ; CHECK-NEXT:    cset w0, le
 ; CHECK-NEXT:    ret
   %r = fcmp ule bfloat %a, %b
@@ -498,15 +416,11 @@ define i1 @test_fcmp_ule(bfloat %a, bfloat %b) #0 {
 define i1 @test_fcmp_uno(bfloat %a, bfloat %b) #0 {
 ; CHECK-LABEL: test_fcmp_uno:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $h1 killed $h1 def $s1
-; CHECK-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-NEXT:    fmov w8, s1
-; CHECK-NEXT:    fmov w9, s0
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    fmov s0, w8
-; CHECK-NEXT:    fmov s1, w9
-; CHECK-NEXT:    fcmp s1, s0
+; CHECK-NEXT:    // kill: def $h1 killed $h1 def $d1
+; CHECK-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-NEXT:    shll v1.4s, v1.4h, #16
+; CHECK-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-NEXT:    fcmp s0, s1
 ; CHECK-NEXT:    cset w0, vs
 ; CHECK-NEXT:    ret
   %r = fcmp uno bfloat %a, %b
@@ -516,15 +430,11 @@ define i1 @test_fcmp_uno(bfloat %a, bfloat %b) #0 {
 define i1 @test_fcmp_one(bfloat %a, bfloat %b) #0 {
 ; CHECK-LABEL: test_fcmp_one:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $h1 killed $h1 def $s1
-; CHECK-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-NEXT:    fmov w8, s1
-; CHECK-NEXT:    fmov w9, s0
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    fmov s0, w8
-; CHECK-NEXT:    fmov s1, w9
-; CHECK-NEXT:    fcmp s1, s0
+; CHECK-NEXT:    // kill: def $h1 killed $h1 def $d1
+; CHECK-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-NEXT:    shll v1.4s, v1.4h, #16
+; CHECK-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-NEXT:    fcmp s0, s1
 ; CHECK-NEXT:    cset w8, mi
 ; CHECK-NEXT:    csinc w0, w8, wzr, le
 ; CHECK-NEXT:    ret
@@ -535,15 +445,11 @@ define i1 @test_fcmp_one(bfloat %a, bfloat %b) #0 {
 define i1 @test_fcmp_oeq(bfloat %a, bfloat %b) #0 {
 ; CHECK-LABEL: test_fcmp_oeq:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $h1 killed $h1 def $s1
-; CHECK-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-NEXT:    fmov w8, s1
-; CHECK-NEXT:    fmov w9, s0
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    fmov s0, w8
-; CHECK-NEXT:    fmov s1, w9
-; CHECK-NEXT:    fcmp s1, s0
+; CHECK-NEXT:    // kill: def $h1 killed $h1 def $d1
+; CHECK-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-NEXT:    shll v1.4s, v1.4h, #16
+; CHECK-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-NEXT:    fcmp s0, s1
 ; CHECK-NEXT:    cset w0, eq
 ; CHECK-NEXT:    ret
   %r = fcmp oeq bfloat %a, %b
@@ -553,15 +459,11 @@ define i1 @test_fcmp_oeq(bfloat %a, bfloat %b) #0 {
 define i1 @test_fcmp_ogt(bfloat %a, bfloat %b) #0 {
 ; CHECK-LABEL: test_fcmp_ogt:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $h1 killed $h1 def $s1
-; CHECK-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-NEXT:    fmov w8, s1
-; CHECK-NEXT:    fmov w9, s0
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    fmov s0, w8
-; CHECK-NEXT:    fmov s1, w9
-; CHECK-NEXT:    fcmp s1, s0
+; CHECK-NEXT:    // kill: def $h1 killed $h1 def $d1
+; CHECK-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-NEXT:    shll v1.4s, v1.4h, #16
+; CHECK-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-NEXT:    fcmp s0, s1
 ; CHECK-NEXT:    cset w0, gt
 ; CHECK-NEXT:    ret
   %r = fcmp ogt bfloat %a, %b
@@ -571,15 +473,11 @@ define i1 @test_fcmp_ogt(bfloat %a, bfloat %b) #0 {
 define i1 @test_fcmp_oge(bfloat %a, bfloat %b) #0 {
 ; CHECK-LABEL: test_fcmp_oge:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $h1 killed $h1 def $s1
-; CHECK-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-NEXT:    fmov w8, s1
-; CHECK-NEXT:    fmov w9, s0
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    fmov s0, w8
-; CHECK-NEXT:    fmov s1, w9
-; CHECK-NEXT:    fcmp s1, s0
+; CHECK-NEXT:    // kill: def $h1 killed $h1 def $d1
+; CHECK-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-NEXT:    shll v1.4s, v1.4h, #16
+; CHECK-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-NEXT:    fcmp s0, s1
 ; CHECK-NEXT:    cset w0, ge
 ; CHECK-NEXT:    ret
   %r = fcmp oge bfloat %a, %b
@@ -589,15 +487,11 @@ define i1 @test_fcmp_oge(bfloat %a, bfloat %b) #0 {
 define i1 @test_fcmp_olt(bfloat %a, bfloat %b) #0 {
 ; CHECK-LABEL: test_fcmp_olt:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $h1 killed $h1 def $s1
-; CHECK-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-NEXT:    fmov w8, s1
-; CHECK-NEXT:    fmov w9, s0
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    fmov s0, w8
-; CHECK-NEXT:    fmov s1, w9
-; CHECK-NEXT:    fcmp s1, s0
+; CHECK-NEXT:    // kill: def $h1 killed $h1 def $d1
+; CHECK-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-NEXT:    shll v1.4s, v1.4h, #16
+; CHECK-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-NEXT:    fcmp s0, s1
 ; CHECK-NEXT:    cset w0, mi
 ; CHECK-NEXT:    ret
   %r = fcmp olt bfloat %a, %b
@@ -607,15 +501,11 @@ define i1 @test_fcmp_olt(bfloat %a, bfloat %b) #0 {
 define i1 @test_fcmp_ole(bfloat %a, bfloat %b) #0 {
 ; CHECK-LABEL: test_fcmp_ole:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $h1 killed $h1 def $s1
-; CHECK-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-NEXT:    fmov w8, s1
-; CHECK-NEXT:    fmov w9, s0
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    fmov s0, w8
-; CHECK-NEXT:    fmov s1, w9
-; CHECK-NEXT:    fcmp s1, s0
+; CHECK-NEXT:    // kill: def $h1 killed $h1 def $d1
+; CHECK-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-NEXT:    shll v1.4s, v1.4h, #16
+; CHECK-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-NEXT:    fcmp s0, s1
 ; CHECK-NEXT:    cset w0, ls
 ; CHECK-NEXT:    ret
   %r = fcmp ole bfloat %a, %b
@@ -625,15 +515,11 @@ define i1 @test_fcmp_ole(bfloat %a, bfloat %b) #0 {
 define i1 @test_fcmp_ord(bfloat %a, bfloat %b) #0 {
 ; CHECK-LABEL: test_fcmp_ord:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $h1 killed $h1 def $s1
-; CHECK-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-NEXT:    fmov w8, s1
-; CHECK-NEXT:    fmov w9, s0
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    fmov s0, w8
-; CHECK-NEXT:    fmov s1, w9
-; CHECK-NEXT:    fcmp s1, s0
+; CHECK-NEXT:    // kill: def $h1 killed $h1 def $d1
+; CHECK-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-NEXT:    shll v1.4s, v1.4h, #16
+; CHECK-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-NEXT:    fcmp s0, s1
 ; CHECK-NEXT:    cset w0, vc
 ; CHECK-NEXT:    ret
   %r = fcmp ord bfloat %a, %b
@@ -643,13 +529,11 @@ define i1 @test_fcmp_ord(bfloat %a, bfloat %b) #0 {
 define void @test_fccmp(bfloat %in, ptr %out) {
 ; CHECK-LABEL: test_fccmp:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-NEXT:    fmov w8, s0
 ; CHECK-NEXT:    movi v1.2s, #69, lsl #24
-; CHECK-NEXT:    movi v3.2s, #72, lsl #24
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    fmov s2, w8
+; CHECK-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-NEXT:    shll v2.4s, v0.4h, #16
 ; CHECK-NEXT:    adrp x8, .LCPI29_0
+; CHECK-NEXT:    movi v3.2s, #72, lsl #24
 ; CHECK-NEXT:    fcmp s2, s1
 ; CHECK-NEXT:    ldr h1, [x8, :lo12:.LCPI29_0]
 ; CHECK-NEXT:    fccmp s2, s3, #4, mi
@@ -667,15 +551,11 @@ define void @test_fccmp(bfloat %in, ptr %out) {
 define void @test_br_cc(bfloat %a, bfloat %b, ptr %p1, ptr %p2) #0 {
 ; CHECK-LABEL: test_br_cc:
 ; CHECK:       // %bb.0: // %common.ret
-; CHECK-NEXT:    // kill: def $h1 killed $h1 def $s1
-; CHECK-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-NEXT:    fmov w8, s1
-; CHECK-NEXT:    fmov w9, s0
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    fmov s0, w8
-; CHECK-NEXT:    fmov s1, w9
-; CHECK-NEXT:    fcmp s1, s0
+; CHECK-NEXT:    // kill: def $h1 killed $h1 def $d1
+; CHECK-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-NEXT:    shll v1.4s, v1.4h, #16
+; CHECK-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-NEXT:    fcmp s0, s1
 ; CHECK-NEXT:    csel x8, x0, x1, pl
 ; CHECK-NEXT:    str wzr, [x8]
 ; CHECK-NEXT:    ret
@@ -725,10 +605,8 @@ declare i1 @test_dummy(ptr %p1) #0
 define i32 @test_fptosi_i32(bfloat %a) #0 {
 ; CHECK-LABEL: test_fptosi_i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-NEXT:    fmov w8, s0
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    fmov s0, w8
+; CHECK-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-NEXT:    shll v0.4s, v0.4h, #16
 ; CHECK-NEXT:    fcvtzs w0, s0
 ; CHECK-NEXT:    ret
   %r = fptosi bfloat %a to i32
@@ -738,10 +616,8 @@ define i32 @test_fptosi_i32(bfloat %a) #0 {
 define i64 @test_fptosi_i64(bfloat %a) #0 {
 ; CHECK-LABEL: test_fptosi_i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-NEXT:    fmov w8, s0
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    fmov s0, w8
+; CHECK-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-NEXT:    shll v0.4s, v0.4h, #16
 ; CHECK-NEXT:    fcvtzs x0, s0
 ; CHECK-NEXT:    ret
   %r = fptosi bfloat %a to i64
@@ -751,10 +627,8 @@ define i64 @test_fptosi_i64(bfloat %a) #0 {
 define i32 @test_fptoui_i32(bfloat %a) #0 {
 ; CHECK-LABEL: test_fptoui_i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-NEXT:    fmov w8, s0
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    fmov s0, w8
+; CHECK-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-NEXT:    shll v0.4s, v0.4h, #16
 ; CHECK-NEXT:    fcvtzu w0, s0
 ; CHECK-NEXT:    ret
   %r = fptoui bfloat %a to i32
@@ -764,10 +638,8 @@ define i32 @test_fptoui_i32(bfloat %a) #0 {
 define i64 @test_fptoui_i64(bfloat %a) #0 {
 ; CHECK-LABEL: test_fptoui_i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-NEXT:    fmov w8, s0
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    fmov s0, w8
+; CHECK-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-NEXT:    shll v0.4s, v0.4h, #16
 ; CHECK-NEXT:    fcvtzu x0, s0
 ; CHECK-NEXT:    ret
   %r = fptoui bfloat %a to i64
@@ -927,7 +799,8 @@ define bfloat @test_uitofp_i32_fadd(i32 %a, bfloat %b) #0 {
 ; CHECK-CVT:       // %bb.0:
 ; CHECK-CVT-NEXT:    ucvtf d1, w0
 ; CHECK-CVT-NEXT:    mov w8, #32767 // =0x7fff
-; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $s0
+; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-CVT-NEXT:    shll v0.4s, v0.4h, #16
 ; CHECK-CVT-NEXT:    fcvtxn s1, d1
 ; CHECK-CVT-NEXT:    fmov w9, s1
 ; CHECK-CVT-NEXT:    ubfx w10, w9, #16, #1
@@ -935,12 +808,7 @@ define bfloat @test_uitofp_i32_fadd(i32 %a, bfloat %b) #0 {
 ; CHECK-CVT-NEXT:    add w9, w10, w9
 ; CHECK-CVT-NEXT:    lsr w9, w9, #16
 ; CHECK-CVT-NEXT:    fmov s1, w9
-; CHECK-CVT-NEXT:    fmov w9, s0
-; CHECK-CVT-NEXT:    fmov w10, s1
-; CHECK-CVT-NEXT:    lsl w9, w9, #16
-; CHECK-CVT-NEXT:    fmov s0, w9
-; CHECK-CVT-NEXT:    lsl w10, w10, #16
-; CHECK-CVT-NEXT:    fmov s1, w10
+; CHECK-CVT-NEXT:    shll v1.4s, v1.4h, #16
 ; CHECK-CVT-NEXT:    fadd s0, s0, s1
 ; CHECK-CVT-NEXT:    fmov w9, s0
 ; CHECK-CVT-NEXT:    ubfx w10, w9, #16, #1
@@ -954,15 +822,11 @@ define bfloat @test_uitofp_i32_fadd(i32 %a, bfloat %b) #0 {
 ; CHECK-BF16-LABEL: test_uitofp_i32_fadd:
 ; CHECK-BF16:       // %bb.0:
 ; CHECK-BF16-NEXT:    ucvtf d1, w0
-; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-BF16-NEXT:    fmov w8, s0
-; CHECK-BF16-NEXT:    lsl w8, w8, #16
+; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-BF16-NEXT:    shll v0.4s, v0.4h, #16
 ; CHECK-BF16-NEXT:    fcvtxn s1, d1
-; CHECK-BF16-NEXT:    fmov s0, w8
 ; CHECK-BF16-NEXT:    bfcvt h1, s1
-; CHECK-BF16-NEXT:    fmov w9, s1
-; CHECK-BF16-NEXT:    lsl w9, w9, #16
-; CHECK-BF16-NEXT:    fmov s1, w9
+; CHECK-BF16-NEXT:    shll v1.4s, v1.4h, #16
 ; CHECK-BF16-NEXT:    fadd s0, s0, s1
 ; CHECK-BF16-NEXT:    bfcvt h0, s0
 ; CHECK-BF16-NEXT:    ret
@@ -976,7 +840,8 @@ define bfloat @test_sitofp_i32_fadd(i32 %a, bfloat %b) #0 {
 ; CHECK-CVT:       // %bb.0:
 ; CHECK-CVT-NEXT:    scvtf d1, w0
 ; CHECK-CVT-NEXT:    mov w8, #32767 // =0x7fff
-; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $s0
+; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-CVT-NEXT:    shll v0.4s, v0.4h, #16
 ; CHECK-CVT-NEXT:    fcvtxn s1, d1
 ; CHECK-CVT-NEXT:    fmov w9, s1
 ; CHECK-CVT-NEXT:    ubfx w10, w9, #16, #1
@@ -984,12 +849,7 @@ define bfloat @test_sitofp_i32_fadd(i32 %a, bfloat %b) #0 {
 ; CHECK-CVT-NEXT:    add w9, w10, w9
 ; CHECK-CVT-NEXT:    lsr w9, w9, #16
 ; CHECK-CVT-NEXT:    fmov s1, w9
-; CHECK-CVT-NEXT:    fmov w9, s0
-; CHECK-CVT-NEXT:    fmov w10, s1
-; CHECK-CVT-NEXT:    lsl w9, w9, #16
-; CHECK-CVT-NEXT:    fmov s0, w9
-; CHECK-CVT-NEXT:    lsl w10, w10, #16
-; CHECK-CVT-NEXT:    fmov s1, w10
+; CHECK-CVT-NEXT:    shll v1.4s, v1.4h, #16
 ; CHECK-CVT-NEXT:    fadd s0, s0, s1
 ; CHECK-CVT-NEXT:    fmov w9, s0
 ; CHECK-CVT-NEXT:    ubfx w10, w9, #16, #1
@@ -1003,15 +863,11 @@ define bfloat @test_sitofp_i32_fadd(i32 %a, bfloat %b) #0 {
 ; CHECK-BF16-LABEL: test_sitofp_i32_fadd:
 ; CHECK-BF16:       // %bb.0:
 ; CHECK-BF16-NEXT:    scvtf d1, w0
-; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-BF16-NEXT:    fmov w8, s0
-; CHECK-BF16-NEXT:    lsl w8, w8, #16
+; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-BF16-NEXT:    shll v0.4s, v0.4h, #16
 ; CHECK-BF16-NEXT:    fcvtxn s1, d1
-; CHECK-BF16-NEXT:    fmov s0, w8
 ; CHECK-BF16-NEXT:    bfcvt h1, s1
-; CHECK-BF16-NEXT:    fmov w9, s1
-; CHECK-BF16-NEXT:    lsl w9, w9, #16
-; CHECK-BF16-NEXT:    fmov s1, w9
+; CHECK-BF16-NEXT:    shll v1.4s, v1.4h, #16
 ; CHECK-BF16-NEXT:    fadd s0, s0, s1
 ; CHECK-BF16-NEXT:    bfcvt h0, s0
 ; CHECK-BF16-NEXT:    ret
@@ -1070,10 +926,9 @@ define bfloat @test_fptrunc_double(double %a) #0 {
 define float @test_fpext_float(bfloat %a) #0 {
 ; CHECK-LABEL: test_fpext_float:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-NEXT:    fmov w8, s0
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    fmov s0, w8
+; CHECK-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-NEXT:    // kill: def $s0 killed $s0 killed $q0
 ; CHECK-NEXT:    ret
   %r = fpext bfloat %a to float
   ret float %r
@@ -1082,10 +937,8 @@ define float @test_fpext_float(bfloat %a) #0 {
 define double @test_fpext_double(bfloat %a) #0 {
 ; CHECK-LABEL: test_fpext_double:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-NEXT:    fmov w8, s0
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    fmov s0, w8
+; CHECK-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-NEXT:    shll v0.4s, v0.4h, #16
 ; CHECK-NEXT:    fcvt d0, s0
 ; CHECK-NEXT:    ret
   %r = fpext bfloat %a to double
@@ -1148,11 +1001,9 @@ declare bfloat @llvm.fmuladd.f16(bfloat %a, bfloat %b, bfloat %c) #0
 define bfloat @test_sqrt(bfloat %a) #0 {
 ; CHECK-CVT-LABEL: test_sqrt:
 ; CHECK-CVT:       // %bb.0:
-; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-CVT-NEXT:    fmov w9, s0
+; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $d0
 ; CHECK-CVT-NEXT:    mov w8, #32767 // =0x7fff
-; CHECK-CVT-NEXT:    lsl w9, w9, #16
-; CHECK-CVT-NEXT:    fmov s0, w9
+; CHECK-CVT-NEXT:    shll v0.4s, v0.4h, #16
 ; CHECK-CVT-NEXT:    fsqrt s0, s0
 ; CHECK-CVT-NEXT:    fmov w9, s0
 ; CHECK-CVT-NEXT:    ubfx w10, w9, #16, #1
@@ -1165,10 +1016,8 @@ define bfloat @test_sqrt(bfloat %a) #0 {
 ;
 ; CHECK-BF16-LABEL: test_sqrt:
 ; CHECK-BF16:       // %bb.0:
-; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-BF16-NEXT:    fmov w8, s0
-; CHECK-BF16-NEXT:    lsl w8, w8, #16
-; CHECK-BF16-NEXT:    fmov s0, w8
+; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-BF16-NEXT:    shll v0.4s, v0.4h, #16
 ; CHECK-BF16-NEXT:    fsqrt s0, s0
 ; CHECK-BF16-NEXT:    bfcvt h0, s0
 ; CHECK-BF16-NEXT:    ret
@@ -1180,10 +1029,9 @@ define bfloat @test_powi(bfloat %a, i32 %b) #0 {
 ; CHECK-CVT-LABEL: test_powi:
 ; CHECK-CVT:       // %bb.0:
 ; CHECK-CVT-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-CVT-NEXT:    fmov w8, s0
-; CHECK-CVT-NEXT:    lsl w8, w8, #16
-; CHECK-CVT-NEXT:    fmov s0, w8
+; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-CVT-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-CVT-NEXT:    // kill: def $s0 killed $s0 killed $q0
 ; CHECK-CVT-NEXT:    bl __powisf2
 ; CHECK-CVT-NEXT:    fmov w9, s0
 ; CHECK-CVT-NEXT:    mov w8, #32767 // =0x7fff
@@ -1199,10 +1047,9 @@ define bfloat @test_powi(bfloat %a, i32 %b) #0 {
 ; CHECK-BF16-LABEL: test_powi:
 ; CHECK-BF16:       // %bb.0:
 ; CHECK-BF16-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-BF16-NEXT:    fmov w8, s0
-; CHECK-BF16-NEXT:    lsl w8, w8, #16
-; CHECK-BF16-NEXT:    fmov s0, w8
+; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-BF16-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-BF16-NEXT:    // kill: def $s0 killed $s0 killed $q0
 ; CHECK-BF16-NEXT:    bl __powisf2
 ; CHECK-BF16-NEXT:    bfcvt h0, s0
 ; CHECK-BF16-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
@@ -1216,10 +1063,9 @@ define bfloat @test_sin(bfloat %a) #0 {
 ; CHECK-CVT-LABEL: test_sin:
 ; CHECK-CVT:       // %bb.0:
 ; CHECK-CVT-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-CVT-NEXT:    fmov w8, s0
-; CHECK-CVT-NEXT:    lsl w8, w8, #16
-; CHECK-CVT-NEXT:    fmov s0, w8
+; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-CVT-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-CVT-NEXT:    // kill: def $s0 killed $s0 killed $q0
 ; CHECK-CVT-NEXT:    bl sinf
 ; CHECK-CVT-NEXT:    fmov w9, s0
 ; CHECK-CVT-NEXT:    mov w8, #32767 // =0x7fff
@@ -1235,10 +1081,9 @@ define bfloat @test_sin(bfloat %a) #0 {
 ; CHECK-BF16-LABEL: test_sin:
 ; CHECK-BF16:       // %bb.0:
 ; CHECK-BF16-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-BF16-NEXT:    fmov w8, s0
-; CHECK-BF16-NEXT:    lsl w8, w8, #16
-; CHECK-BF16-NEXT:    fmov s0, w8
+; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-BF16-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-BF16-NEXT:    // kill: def $s0 killed $s0 killed $q0
 ; CHECK-BF16-NEXT:    bl sinf
 ; CHECK-BF16-NEXT:    bfcvt h0, s0
 ; CHECK-BF16-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
@@ -1251,10 +1096,9 @@ define bfloat @test_cos(bfloat %a) #0 {
 ; CHECK-CVT-LABEL: test_cos:
 ; CHECK-CVT:       // %bb.0:
 ; CHECK-CVT-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-CVT-NEXT:    fmov w8, s0
-; CHECK-CVT-NEXT:    lsl w8, w8, #16
-; CHECK-CVT-NEXT:    fmov s0, w8
+; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-CVT-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-CVT-NEXT:    // kill: def $s0 killed $s0 killed $q0
 ; CHECK-CVT-NEXT:    bl cosf
 ; CHECK-CVT-NEXT:    fmov w9, s0
 ; CHECK-CVT-NEXT:    mov w8, #32767 // =0x7fff
@@ -1270,10 +1114,9 @@ define bfloat @test_cos(bfloat %a) #0 {
 ; CHECK-BF16-LABEL: test_cos:
 ; CHECK-BF16:       // %bb.0:
 ; CHECK-BF16-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-BF16-NEXT:    fmov w8, s0
-; CHECK-BF16-NEXT:    lsl w8, w8, #16
-; CHECK-BF16-NEXT:    fmov s0, w8
+; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-BF16-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-BF16-NEXT:    // kill: def $s0 killed $s0 killed $q0
 ; CHECK-BF16-NEXT:    bl cosf
 ; CHECK-BF16-NEXT:    bfcvt h0, s0
 ; CHECK-BF16-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
@@ -1286,10 +1129,9 @@ define bfloat @test_tan(bfloat %a) #0 {
 ; CHECK-CVT-LABEL: test_tan:
 ; CHECK-CVT:       // %bb.0:
 ; CHECK-CVT-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-CVT-NEXT:    fmov w8, s0
-; CHECK-CVT-NEXT:    lsl w8, w8, #16
-; CHECK-CVT-NEXT:    fmov s0, w8
+; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-CVT-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-CVT-NEXT:    // kill: def $s0 killed $s0 killed $q0
 ; CHECK-CVT-NEXT:    bl tanf
 ; CHECK-CVT-NEXT:    fmov w9, s0
 ; CHECK-CVT-NEXT:    mov w8, #32767 // =0x7fff
@@ -1305,10 +1147,9 @@ define bfloat @test_tan(bfloat %a) #0 {
 ; CHECK-BF16-LABEL: test_tan:
 ; CHECK-BF16:       // %bb.0:
 ; CHECK-BF16-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-BF16-NEXT:    fmov w8, s0
-; CHECK-BF16-NEXT:    lsl w8, w8, #16
-; CHECK-BF16-NEXT:    fmov s0, w8
+; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-BF16-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-BF16-NEXT:    // kill: def $s0 killed $s0 killed $q0
 ; CHECK-BF16-NEXT:    bl tanf
 ; CHECK-BF16-NEXT:    bfcvt h0, s0
 ; CHECK-BF16-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
@@ -1321,10 +1162,9 @@ define bfloat @test_acos(bfloat %a) #0 {
 ; CHECK-CVT-LABEL: test_acos:
 ; CHECK-CVT:       // %bb.0:
 ; CHECK-CVT-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-CVT-NEXT:    fmov w8, s0
-; CHECK-CVT-NEXT:    lsl w8, w8, #16
-; CHECK-CVT-NEXT:    fmov s0, w8
+; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-CVT-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-CVT-NEXT:    // kill: def $s0 killed $s0 killed $q0
 ; CHECK-CVT-NEXT:    bl acosf
 ; CHECK-CVT-NEXT:    fmov w9, s0
 ; CHECK-CVT-NEXT:    mov w8, #32767 // =0x7fff
@@ -1340,10 +1180,9 @@ define bfloat @test_acos(bfloat %a) #0 {
 ; CHECK-BF16-LABEL: test_acos:
 ; CHECK-BF16:       // %bb.0:
 ; CHECK-BF16-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-BF16-NEXT:    fmov w8, s0
-; CHECK-BF16-NEXT:    lsl w8, w8, #16
-; CHECK-BF16-NEXT:    fmov s0, w8
+; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-BF16-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-BF16-NEXT:    // kill: def $s0 killed $s0 killed $q0
 ; CHECK-BF16-NEXT:    bl acosf
 ; CHECK-BF16-NEXT:    bfcvt h0, s0
 ; CHECK-BF16-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
@@ -1356,10 +1195,9 @@ define bfloat @test_asin(bfloat %a) #0 {
 ; CHECK-CVT-LABEL: test_asin:
 ; CHECK-CVT:       // %bb.0:
 ; CHECK-CVT-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-CVT-NEXT:    fmov w8, s0
-; CHECK-CVT-NEXT:    lsl w8, w8, #16
-; CHECK-CVT-NEXT:    fmov s0, w8
+; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-CVT-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-CVT-NEXT:    // kill: def $s0 killed $s0 killed $q0
 ; CHECK-CVT-NEXT:    bl asinf
 ; CHECK-CVT-NEXT:    fmov w9, s0
 ; CHECK-CVT-NEXT:    mov w8, #32767 // =0x7fff
@@ -1375,10 +1213,9 @@ define bfloat @test_asin(bfloat %a) #0 {
 ; CHECK-BF16-LABEL: test_asin:
 ; CHECK-BF16:       // %bb.0:
 ; CHECK-BF16-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-BF16-NEXT:    fmov w8, s0
-; CHECK-BF16-NEXT:    lsl w8, w8, #16
-; CHECK-BF16-NEXT:    fmov s0, w8
+; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-BF16-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-BF16-NEXT:    // kill: def $s0 killed $s0 killed $q0
 ; CHECK-BF16-NEXT:    bl asinf
 ; CHECK-BF16-NEXT:    bfcvt h0, s0
 ; CHECK-BF16-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
@@ -1391,10 +1228,9 @@ define bfloat @test_atan(bfloat %a) #0 {
 ; CHECK-CVT-LABEL: test_atan:
 ; CHECK-CVT:       // %bb.0:
 ; CHECK-CVT-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-CVT-NEXT:    fmov w8, s0
-; CHECK-CVT-NEXT:    lsl w8, w8, #16
-; CHECK-CVT-NEXT:    fmov s0, w8
+; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-CVT-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-CVT-NEXT:    // kill: def $s0 killed $s0 killed $q0
 ; CHECK-CVT-NEXT:    bl atanf
 ; CHECK-CVT-NEXT:    fmov w9, s0
 ; CHECK-CVT-NEXT:    mov w8, #32767 // =0x7fff
@@ -1410,10 +1246,9 @@ define bfloat @test_atan(bfloat %a) #0 {
 ; CHECK-BF16-LABEL: test_atan:
 ; CHECK-BF16:       // %bb.0:
 ; CHECK-BF16-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-BF16-NEXT:    fmov w8, s0
-; CHECK-BF16-NEXT:    lsl w8, w8, #16
-; CHECK-BF16-NEXT:    fmov s0, w8
+; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-BF16-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-BF16-NEXT:    // kill: def $s0 killed $s0 killed $q0
 ; CHECK-BF16-NEXT:    bl atanf
 ; CHECK-BF16-NEXT:    bfcvt h0, s0
 ; CHECK-BF16-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
@@ -1426,14 +1261,12 @@ define bfloat @test_atan2(bfloat %a, bfloat %b) #0 {
 ; CHECK-CVT-LABEL: test_atan2:
 ; CHECK-CVT:       // %bb.0:
 ; CHECK-CVT-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-CVT-NEXT:    // kill: def $h1 killed $h1 def $s1
-; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-CVT-NEXT:    fmov w8, s0
-; CHECK-CVT-NEXT:    fmov w9, s1
-; CHECK-CVT-NEXT:    lsl w8, w8, #16
-; CHECK-CVT-NEXT:    lsl w9, w9, #16
-; CHECK-CVT-NEXT:    fmov s0, w8
-; CHECK-CVT-NEXT:    fmov s1, w9
+; CHECK-CVT-NEXT:    // kill: def $h1 killed $h1 def $d1
+; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-CVT-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-CVT-NEXT:    shll v1.4s, v1.4h, #16
+; CHECK-CVT-NEXT:    // kill: def $s0 killed $s0 killed $q0
+; CHECK-CVT-NEXT:    // kill: def $s1 killed $s1 killed $q1
 ; CHECK-CVT-NEXT:    bl atan2f
 ; CHECK-CVT-NEXT:    fmov w9, s0
 ; CHECK-CVT-NEXT:    mov w8, #32767 // =0x7fff
@@ -1449,14 +1282,12 @@ define bfloat @test_atan2(bfloat %a, bfloat %b) #0 {
 ; CHECK-BF16-LABEL: test_atan2:
 ; CHECK-BF16:       // %bb.0:
 ; CHECK-BF16-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-BF16-NEXT:    // kill: def $h1 killed $h1 def $s1
-; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-BF16-NEXT:    fmov w8, s0
-; CHECK-BF16-NEXT:    fmov w9, s1
-; CHECK-BF16-NEXT:    lsl w8, w8, #16
-; CHECK-BF16-NEXT:    lsl w9, w9, #16
-; CHECK-BF16-NEXT:    fmov s0, w8
-; CHECK-BF16-NEXT:    fmov s1, w9
+; CHECK-BF16-NEXT:    // kill: def $h1 killed $h1 def $d1
+; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-BF16-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-BF16-NEXT:    shll v1.4s, v1.4h, #16
+; CHECK-BF16-NEXT:    // kill: def $s0 killed $s0 killed $q0
+; CHECK-BF16-NEXT:    // kill: def $s1 killed $s1 killed $q1
 ; CHECK-BF16-NEXT:    bl atan2f
 ; CHECK-BF16-NEXT:    bfcvt h0, s0
 ; CHECK-BF16-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
@@ -1469,10 +1300,9 @@ define bfloat @test_cosh(bfloat %a) #0 {
 ; CHECK-CVT-LABEL: test_cosh:
 ; CHECK-CVT:       // %bb.0:
 ; CHECK-CVT-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-CVT-NEXT:    fmov w8, s0
-; CHECK-CVT-NEXT:    lsl w8, w8, #16
-; CHECK-CVT-NEXT:    fmov s0, w8
+; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-CVT-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-CVT-NEXT:    // kill: def $s0 killed $s0 killed $q0
 ; CHECK-CVT-NEXT:    bl coshf
 ; CHECK-CVT-NEXT:    fmov w9, s0
 ; CHECK-CVT-NEXT:    mov w8, #32767 // =0x7fff
@@ -1488,10 +1318,9 @@ define bfloat @test_cosh(bfloat %a) #0 {
 ; CHECK-BF16-LABEL: test_cosh:
 ; CHECK-BF16:       // %bb.0:
 ; CHECK-BF16-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-BF16-NEXT:    fmov w8, s0
-; CHECK-BF16-NEXT:    lsl w8, w8, #16
-; CHECK-BF16-NEXT:    fmov s0, w8
+; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-BF16-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-BF16-NEXT:    // kill: def $s0 killed $s0 killed $q0
 ; CHECK-BF16-NEXT:    bl coshf
 ; CHECK-BF16-NEXT:    bfcvt h0, s0
 ; CHECK-BF16-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
@@ -1504,10 +1333,9 @@ define bfloat @test_sinh(bfloat %a) #0 {
 ; CHECK-CVT-LABEL: test_sinh:
 ; CHECK-CVT:       // %bb.0:
 ; CHECK-CVT-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-CVT-NEXT:    fmov w8, s0
-; CHECK-CVT-NEXT:    lsl w8, w8, #16
-; CHECK-CVT-NEXT:    fmov s0, w8
+; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-CVT-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-CVT-NEXT:    // kill: def $s0 killed $s0 killed $q0
 ; CHECK-CVT-NEXT:    bl sinhf
 ; CHECK-CVT-NEXT:    fmov w9, s0
 ; CHECK-CVT-NEXT:    mov w8, #32767 // =0x7fff
@@ -1523,10 +1351,9 @@ define bfloat @test_sinh(bfloat %a) #0 {
 ; CHECK-BF16-LABEL: test_sinh:
 ; CHECK-BF16:       // %bb.0:
 ; CHECK-BF16-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-BF16-NEXT:    fmov w8, s0
-; CHECK-BF16-NEXT:    lsl w8, w8, #16
-; CHECK-BF16-NEXT:    fmov s0, w8
+; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-BF16-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-BF16-NEXT:    // kill: def $s0 killed $s0 killed $q0
 ; CHECK-BF16-NEXT:    bl sinhf
 ; CHECK-BF16-NEXT:    bfcvt h0, s0
 ; CHECK-BF16-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
@@ -1539,10 +1366,9 @@ define bfloat @test_tanh(bfloat %a) #0 {
 ; CHECK-CVT-LABEL: test_tanh:
 ; CHECK-CVT:       // %bb.0:
 ; CHECK-CVT-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-CVT-NEXT:    fmov w8, s0
-; CHECK-CVT-NEXT:    lsl w8, w8, #16
-; CHECK-CVT-NEXT:    fmov s0, w8
+; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-CVT-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-CVT-NEXT:    // kill: def $s0 killed $s0 killed $q0
 ; CHECK-CVT-NEXT:    bl tanhf
 ; CHECK-CVT-NEXT:    fmov w9, s0
 ; CHECK-CVT-NEXT:    mov w8, #32767 // =0x7fff
@@ -1558,10 +1384,9 @@ define bfloat @test_tanh(bfloat %a) #0 {
 ; CHECK-BF16-LABEL: test_tanh:
 ; CHECK-BF16:       // %bb.0:
 ; CHECK-BF16-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-BF16-NEXT:    fmov w8, s0
-; CHECK-BF16-NEXT:    lsl w8, w8, #16
-; CHECK-BF16-NEXT:    fmov s0, w8
+; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-BF16-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-BF16-NEXT:    // kill: def $s0 killed $s0 killed $q0
 ; CHECK-BF16-NEXT:    bl tanhf
 ; CHECK-BF16-NEXT:    bfcvt h0, s0
 ; CHECK-BF16-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
@@ -1574,14 +1399,12 @@ define bfloat @test_pow(bfloat %a, bfloat %b) #0 {
 ; CHECK-CVT-LABEL: test_pow:
 ; CHECK-CVT:       // %bb.0:
 ; CHECK-CVT-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-CVT-NEXT:    // kill: def $h1 killed $h1 def $s1
-; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-CVT-NEXT:    fmov w8, s0
-; CHECK-CVT-NEXT:    fmov w9, s1
-; CHECK-CVT-NEXT:    lsl w8, w8, #16
-; CHECK-CVT-NEXT:    lsl w9, w9, #16
-; CHECK-CVT-NEXT:    fmov s0, w8
-; CHECK-CVT-NEXT:    fmov s1, w9
+; CHECK-CVT-NEXT:    // kill: def $h1 killed $h1 def $d1
+; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-CVT-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-CVT-NEXT:    shll v1.4s, v1.4h, #16
+; CHECK-CVT-NEXT:    // kill: def $s0 killed $s0 killed $q0
+; CHECK-CVT-NEXT:    // kill: def $s1 killed $s1 killed $q1
 ; CHECK-CVT-NEXT:    bl powf
 ; CHECK-CVT-NEXT:    fmov w9, s0
 ; CHECK-CVT-NEXT:    mov w8, #32767 // =0x7fff
@@ -1597,14 +1420,12 @@ define bfloat @test_pow(bfloat %a, bfloat %b) #0 {
 ; CHECK-BF16-LABEL: test_pow:
 ; CHECK-BF16:       // %bb.0:
 ; CHECK-BF16-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-BF16-NEXT:    // kill: def $h1 killed $h1 def $s1
-; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-BF16-NEXT:    fmov w8, s0
-; CHECK-BF16-NEXT:    fmov w9, s1
-; CHECK-BF16-NEXT:    lsl w8, w8, #16
-; CHECK-BF16-NEXT:    lsl w9, w9, #16
-; CHECK-BF16-NEXT:    fmov s0, w8
-; CHECK-BF16-NEXT:    fmov s1, w9
+; CHECK-BF16-NEXT:    // kill: def $h1 killed $h1 def $d1
+; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-BF16-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-BF16-NEXT:    shll v1.4s, v1.4h, #16
+; CHECK-BF16-NEXT:    // kill: def $s0 killed $s0 killed $q0
+; CHECK-BF16-NEXT:    // kill: def $s1 killed $s1 killed $q1
 ; CHECK-BF16-NEXT:    bl powf
 ; CHECK-BF16-NEXT:    bfcvt h0, s0
 ; CHECK-BF16-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
@@ -1617,10 +1438,9 @@ define bfloat @test_exp(bfloat %a) #0 {
 ; CHECK-CVT-LABEL: test_exp:
 ; CHECK-CVT:       // %bb.0:
 ; CHECK-CVT-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-CVT-NEXT:    fmov w8, s0
-; CHECK-CVT-NEXT:    lsl w8, w8, #16
-; CHECK-CVT-NEXT:    fmov s0, w8
+; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-CVT-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-CVT-NEXT:    // kill: def $s0 killed $s0 killed $q0
 ; CHECK-CVT-NEXT:    bl expf
 ; CHECK-CVT-NEXT:    fmov w9, s0
 ; CHECK-CVT-NEXT:    mov w8, #32767 // =0x7fff
@@ -1636,10 +1456,9 @@ define bfloat @test_exp(bfloat %a) #0 {
 ; CHECK-BF16-LABEL: test_exp:
 ; CHECK-BF16:       // %bb.0:
 ; CHECK-BF16-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-BF16-NEXT:    fmov w8, s0
-; CHECK-BF16-NEXT:    lsl w8, w8, #16
-; CHECK-BF16-NEXT:    fmov s0, w8
+; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-BF16-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-BF16-NEXT:    // kill: def $s0 killed $s0 killed $q0
 ; CHECK-BF16-NEXT:    bl expf
 ; CHECK-BF16-NEXT:    bfcvt h0, s0
 ; CHECK-BF16-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
@@ -1652,10 +1471,9 @@ define bfloat @test_exp2(bfloat %a) #0 {
 ; CHECK-CVT-LABEL: test_exp2:
 ; CHECK-CVT:       // %bb.0:
 ; CHECK-CVT-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-CVT-NEXT:    fmov w8, s0
-; CHECK-CVT-NEXT:    lsl w8, w8, #16
-; CHECK-CVT-NEXT:    fmov s0, w8
+; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-CVT-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-CVT-NEXT:    // kill: def $s0 killed $s0 killed $q0
 ; CHECK-CVT-NEXT:    bl exp2f
 ; CHECK-CVT-NEXT:    fmov w9, s0
 ; CHECK-CVT-NEXT:    mov w8, #32767 // =0x7fff
@@ -1671,10 +1489,9 @@ define bfloat @test_exp2(bfloat %a) #0 {
 ; CHECK-BF16-LABEL: test_exp2:
 ; CHECK-BF16:       // %bb.0:
 ; CHECK-BF16-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-BF16-NEXT:    fmov w8, s0
-; CHECK-BF16-NEXT:    lsl w8, w8, #16
-; CHECK-BF16-NEXT:    fmov s0, w8
+; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-BF16-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-BF16-NEXT:    // kill: def $s0 killed $s0 killed $q0
 ; CHECK-BF16-NEXT:    bl exp2f
 ; CHECK-BF16-NEXT:    bfcvt h0, s0
 ; CHECK-BF16-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
@@ -1687,10 +1504,9 @@ define bfloat @test_log(bfloat %a) #0 {
 ; CHECK-CVT-LABEL: test_log:
 ; CHECK-CVT:       // %bb.0:
 ; CHECK-CVT-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-CVT-NEXT:    fmov w8, s0
-; CHECK-CVT-NEXT:    lsl w8, w8, #16
-; CHECK-CVT-NEXT:    fmov s0, w8
+; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-CVT-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-CVT-NEXT:    // kill: def $s0 killed $s0 killed $q0
 ; CHECK-CVT-NEXT:    bl logf
 ; CHECK-CVT-NEXT:    fmov w9, s0
 ; CHECK-CVT-NEXT:    mov w8, #32767 // =0x7fff
@@ -1706,10 +1522,9 @@ define bfloat @test_log(bfloat %a) #0 {
 ; CHECK-BF16-LABEL: test_log:
 ; CHECK-BF16:       // %bb.0:
 ; CHECK-BF16-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-BF16-NEXT:    fmov w8, s0
-; CHECK-BF16-NEXT:    lsl w8, w8, #16
-; CHECK-BF16-NEXT:    fmov s0, w8
+; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-BF16-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-BF16-NEXT:    // kill: def $s0 killed $s0 killed $q0
 ; CHECK-BF16-NEXT:    bl logf
 ; CHECK-BF16-NEXT:    bfcvt h0, s0
 ; CHECK-BF16-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
@@ -1722,10 +1537,9 @@ define bfloat @test_log10(bfloat %a) #0 {
 ; CHECK-CVT-LABEL: test_log10:
 ; CHECK-CVT:       // %bb.0:
 ; CHECK-CVT-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-CVT-NEXT:    fmov w8, s0
-; CHECK-CVT-NEXT:    lsl w8, w8, #16
-; CHECK-CVT-NEXT:    fmov s0, w8
+; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-CVT-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-CVT-NEXT:    // kill: def $s0 killed $s0 killed $q0
 ; CHECK-CVT-NEXT:    bl log10f
 ; CHECK-CVT-NEXT:    fmov w9, s0
 ; CHECK-CVT-NEXT:    mov w8, #32767 // =0x7fff
@@ -1741,10 +1555,9 @@ define bfloat @test_log10(bfloat %a) #0 {
 ; CHECK-BF16-LABEL: test_log10:
 ; CHECK-BF16:       // %bb.0:
 ; CHECK-BF16-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-BF16-NEXT:    fmov w8, s0
-; CHECK-BF16-NEXT:    lsl w8, w8, #16
-; CHECK-BF16-NEXT:    fmov s0, w8
+; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-BF16-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-BF16-NEXT:    // kill: def $s0 killed $s0 killed $q0
 ; CHECK-BF16-NEXT:    bl log10f
 ; CHECK-BF16-NEXT:    bfcvt h0, s0
 ; CHECK-BF16-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
@@ -1757,10 +1570,9 @@ define bfloat @test_log2(bfloat %a) #0 {
 ; CHECK-CVT-LABEL: test_log2:
 ; CHECK-CVT:       // %bb.0:
 ; CHECK-CVT-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-CVT-NEXT:    fmov w8, s0
-; CHECK-CVT-NEXT:    lsl w8, w8, #16
-; CHECK-CVT-NEXT:    fmov s0, w8
+; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-CVT-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-CVT-NEXT:    // kill: def $s0 killed $s0 killed $q0
 ; CHECK-CVT-NEXT:    bl log2f
 ; CHECK-CVT-NEXT:    fmov w9, s0
 ; CHECK-CVT-NEXT:    mov w8, #32767 // =0x7fff
@@ -1776,10 +1588,9 @@ define bfloat @test_log2(bfloat %a) #0 {
 ; CHECK-BF16-LABEL: test_log2:
 ; CHECK-BF16:       // %bb.0:
 ; CHECK-BF16-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-BF16-NEXT:    fmov w8, s0
-; CHECK-BF16-NEXT:    lsl w8, w8, #16
-; CHECK-BF16-NEXT:    fmov s0, w8
+; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-BF16-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-BF16-NEXT:    // kill: def $s0 killed $s0 killed $q0
 ; CHECK-BF16-NEXT:    bl log2f
 ; CHECK-BF16-NEXT:    bfcvt h0, s0
 ; CHECK-BF16-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
@@ -1791,20 +1602,14 @@ define bfloat @test_log2(bfloat %a) #0 {
 define bfloat @test_fma(bfloat %a, bfloat %b, bfloat %c) #0 {
 ; CHECK-CVT-LABEL: test_fma:
 ; CHECK-CVT:       // %bb.0:
-; CHECK-CVT-NEXT:    // kill: def $h2 killed $h2 def $s2
-; CHECK-CVT-NEXT:    // kill: def $h1 killed $h1 def $s1
-; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-CVT-NEXT:    fmov w8, s2
-; CHECK-CVT-NEXT:    fmov w9, s1
-; CHECK-CVT-NEXT:    fmov w10, s0
-; CHECK-CVT-NEXT:    lsl w8, w8, #16
-; CHECK-CVT-NEXT:    lsl w9, w9, #16
-; CHECK-CVT-NEXT:    lsl w10, w10, #16
-; CHECK-CVT-NEXT:    fmov s0, w8
-; CHECK-CVT-NEXT:    fmov s1, w9
-; CHECK-CVT-NEXT:    fmov s2, w10
+; CHECK-CVT-NEXT:    // kill: def $h2 killed $h2 def $d2
+; CHECK-CVT-NEXT:    // kill: def $h1 killed $h1 def $d1
+; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $d0
 ; CHECK-CVT-NEXT:    mov w10, #32767 // =0x7fff
-; CHECK-CVT-NEXT:    fmadd s0, s2, s1, s0
+; CHECK-CVT-NEXT:    shll v2.4s, v2.4h, #16
+; CHECK-CVT-NEXT:    shll v1.4s, v1.4h, #16
+; CHECK-CVT-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-CVT-NEXT:    fmadd s0, s0, s1, s2
 ; CHECK-CVT-NEXT:    fmov w8, s0
 ; CHECK-CVT-NEXT:    ubfx w9, w8, #16, #1
 ; CHECK-CVT-NEXT:    add w8, w8, w10
@@ -1816,19 +1621,13 @@ define bfloat @test_fma(bfloat %a, bfloat %b, bfloat %c) #0 {
 ;
 ; CHECK-BF16-LABEL: test_fma:
 ; CHECK-BF16:       // %bb.0:
-; CHECK-BF16-NEXT:    // kill: def $h2 killed $h2 def $s2
-; CHECK-BF16-NEXT:    // kill: def $h1 killed $h1 def $s1
-; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-BF16-NEXT:    fmov w8, s2
-; CHECK-BF16-NEXT:    fmov w9, s1
-; CHECK-BF16-NEXT:    fmov w10, s0
-; CHECK-BF16-NEXT:    lsl w8, w8, #16
-; CHECK-BF16-NEXT:    lsl w9, w9, #16
-; CHECK-BF16-NEXT:    lsl w10, w10, #16
-; CHECK-BF16-NEXT:    fmov s0, w8
-; CHECK-BF16-NEXT:    fmov s1, w9
-; CHECK-BF16-NEXT:    fmov s2, w10
-; CHECK-BF16-NEXT:    fmadd s0, s2, s1, s0
+; CHECK-BF16-NEXT:    // kill: def $h2 killed $h2 def $d2
+; CHECK-BF16-NEXT:    // kill: def $h1 killed $h1 def $d1
+; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-BF16-NEXT:    shll v2.4s, v2.4h, #16
+; CHECK-BF16-NEXT:    shll v1.4s, v1.4h, #16
+; CHECK-BF16-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-BF16-NEXT:    fmadd s0, s0, s1, s2
 ; CHECK-BF16-NEXT:    bfcvt h0, s0
 ; CHECK-BF16-NEXT:    ret
   %r = call bfloat @llvm.fma.f16(bfloat %a, bfloat %b, bfloat %c)
@@ -1851,16 +1650,12 @@ define bfloat @test_fabs(bfloat %a) #0 {
 define bfloat @test_minnum(bfloat %a, bfloat %b) #0 {
 ; CHECK-CVT-LABEL: test_minnum:
 ; CHECK-CVT:       // %bb.0:
-; CHECK-CVT-NEXT:    // kill: def $h1 killed $h1 def $s1
-; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-CVT-NEXT:    fmov w9, s1
-; CHECK-CVT-NEXT:    fmov w10, s0
+; CHECK-CVT-NEXT:    // kill: def $h1 killed $h1 def $d1
+; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $d0
 ; CHECK-CVT-NEXT:    mov w8, #32767 // =0x7fff
-; CHECK-CVT-NEXT:    lsl w9, w9, #16
-; CHECK-CVT-NEXT:    lsl w10, w10, #16
-; CHECK-CVT-NEXT:    fmov s0, w9
-; CHECK-CVT-NEXT:    fmov s1, w10
-; CHECK-CVT-NEXT:    fminnm s0, s1, s0
+; CHECK-CVT-NEXT:    shll v1.4s, v1.4h, #16
+; CHECK-CVT-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-CVT-NEXT:    fminnm s0, s0, s1
 ; CHECK-CVT-NEXT:    fmov w9, s0
 ; CHECK-CVT-NEXT:    ubfx w10, w9, #16, #1
 ; CHECK-CVT-NEXT:    add w8, w9, w8
@@ -1872,15 +1667,11 @@ define bfloat @test_minnum(bfloat %a, bfloat %b) #0 {
 ;
 ; CHECK-BF16-LABEL: test_minnum:
 ; CHECK-BF16:       // %bb.0:
-; CHECK-BF16-NEXT:    // kill: def $h1 killed $h1 def $s1
-; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-BF16-NEXT:    fmov w8, s1
-; CHECK-BF16-NEXT:    fmov w9, s0
-; CHECK-BF16-NEXT:    lsl w8, w8, #16
-; CHECK-BF16-NEXT:    lsl w9, w9, #16
-; CHECK-BF16-NEXT:    fmov s0, w8
-; CHECK-BF16-NEXT:    fmov s1, w9
-; CHECK-BF16-NEXT:    fminnm s0, s1, s0
+; CHECK-BF16-NEXT:    // kill: def $h1 killed $h1 def $d1
+; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-BF16-NEXT:    shll v1.4s, v1.4h, #16
+; CHECK-BF16-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-BF16-NEXT:    fminnm s0, s0, s1
 ; CHECK-BF16-NEXT:    bfcvt h0, s0
 ; CHECK-BF16-NEXT:    ret
   %r = call bfloat @llvm.minnum.f16(bfloat %a, bfloat %b)
@@ -1890,16 +1681,12 @@ define bfloat @test_minnum(bfloat %a, bfloat %b) #0 {
 define bfloat @test_maxnum(bfloat %a, bfloat %b) #0 {
 ; CHECK-CVT-LABEL: test_maxnum:
 ; CHECK-CVT:       // %bb.0:
-; CHECK-CVT-NEXT:    // kill: def $h1 killed $h1 def $s1
-; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-CVT-NEXT:    fmov w9, s1
-; CHECK-CVT-NEXT:    fmov w10, s0
+; CHECK-CVT-NEXT:    // kill: def $h1 killed $h1 def $d1
+; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $d0
 ; CHECK-CVT-NEXT:    mov w8, #32767 // =0x7fff
-; CHECK-CVT-NEXT:    lsl w9, w9, #16
-; CHECK-CVT-NEXT:    lsl w10, w10, #16
-; CHECK-CVT-NEXT:    fmov s0, w9
-; CHECK-CVT-NEXT:    fmov s1, w10
-; CHECK-CVT-NEXT:    fmaxnm s0, s1, s0
+; CHECK-CVT-NEXT:    shll v1.4s, v1.4h, #16
+; CHECK-CVT-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-CVT-NEXT:    fmaxnm s0, s0, s1
 ; CHECK-CVT-NEXT:    fmov w9, s0
 ; CHECK-CVT-NEXT:    ubfx w10, w9, #16, #1
 ; CHECK-CVT-NEXT:    add w8, w9, w8
@@ -1911,15 +1698,11 @@ define bfloat @test_maxnum(bfloat %a, bfloat %b) #0 {
 ;
 ; CHECK-BF16-LABEL: test_maxnum:
 ; CHECK-BF16:       // %bb.0:
-; CHECK-BF16-NEXT:    // kill: def $h1 killed $h1 def $s1
-; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-BF16-NEXT:    fmov w8, s1
-; CHECK-BF16-NEXT:    fmov w9, s0
-; CHECK-BF16-NEXT:    lsl w8, w8, #16
-; CHECK-BF16-NEXT:    lsl w9, w9, #16
-; CHECK-BF16-NEXT:    fmov s0, w8
-; CHECK-BF16-NEXT:    fmov s1, w9
-; CHECK-BF16-NEXT:    fmaxnm s0, s1, s0
+; CHECK-BF16-NEXT:    // kill: def $h1 killed $h1 def $d1
+; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-BF16-NEXT:    shll v1.4s, v1.4h, #16
+; CHECK-BF16-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-BF16-NEXT:    fmaxnm s0, s0, s1
 ; CHECK-BF16-NEXT:    bfcvt h0, s0
 ; CHECK-BF16-NEXT:    ret
   %r = call bfloat @llvm.maxnum.f16(bfloat %a, bfloat %b)
@@ -1929,16 +1712,12 @@ define bfloat @test_maxnum(bfloat %a, bfloat %b) #0 {
 define bfloat @test_copysign(bfloat %a, bfloat %b) #0 {
 ; CHECK-CVT-LABEL: test_copysign:
 ; CHECK-CVT:       // %bb.0:
-; CHECK-CVT-NEXT:    // kill: def $h1 killed $h1 def $s1
-; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-CVT-NEXT:    fmov w8, s1
-; CHECK-CVT-NEXT:    fmov w9, s0
+; CHECK-CVT-NEXT:    // kill: def $h1 killed $h1 def $d1
+; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $d0
 ; CHECK-CVT-NEXT:    mvni v2.4s, #128, lsl #24
-; CHECK-CVT-NEXT:    lsl w8, w8, #16
-; CHECK-CVT-NEXT:    lsl w9, w9, #16
-; CHECK-CVT-NEXT:    fmov s0, w8
-; CHECK-CVT-NEXT:    fmov s1, w9
-; CHECK-CVT-NEXT:    bit v0.16b, v1.16b, v2.16b
+; CHECK-CVT-NEXT:    shll v1.4s, v1.4h, #16
+; CHECK-CVT-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-CVT-NEXT:    bif v0.16b, v1.16b, v2.16b
 ; CHECK-CVT-NEXT:    fmov w8, s0
 ; CHECK-CVT-NEXT:    lsr w8, w8, #16
 ; CHECK-CVT-NEXT:    fmov s0, w8
@@ -1947,16 +1726,12 @@ define bfloat @test_copysign(bfloat %a, bfloat %b) #0 {
 ;
 ; CHECK-BF16-LABEL: test_copysign:
 ; CHECK-BF16:       // %bb.0:
-; CHECK-BF16-NEXT:    // kill: def $h1 killed $h1 def $s1
-; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-BF16-NEXT:    fmov w8, s1
-; CHECK-BF16-NEXT:    fmov w9, s0
+; CHECK-BF16-NEXT:    // kill: def $h1 killed $h1 def $d1
+; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $d0
 ; CHECK-BF16-NEXT:    mvni v2.4s, #128, lsl #24
-; CHECK-BF16-NEXT:    lsl w8, w8, #16
-; CHECK-BF16-NEXT:    lsl w9, w9, #16
-; CHECK-BF16-NEXT:    fmov s0, w8
-; CHECK-BF16-NEXT:    fmov s1, w9
-; CHECK-BF16-NEXT:    bit v0.16b, v1.16b, v2.16b
+; CHECK-BF16-NEXT:    shll v1.4s, v1.4h, #16
+; CHECK-BF16-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-BF16-NEXT:    bif v0.16b, v1.16b, v2.16b
 ; CHECK-BF16-NEXT:    bfcvt h0, s0
 ; CHECK-BF16-NEXT:    ret
   %r = call bfloat @llvm.copysign.f16(bfloat %a, bfloat %b)
@@ -1966,12 +1741,10 @@ define bfloat @test_copysign(bfloat %a, bfloat %b) #0 {
 define bfloat @test_copysign_f32(bfloat %a, float %b) #0 {
 ; CHECK-CVT-LABEL: test_copysign_f32:
 ; CHECK-CVT:       // %bb.0:
-; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-CVT-NEXT:    fmov w8, s0
+; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $d0
 ; CHECK-CVT-NEXT:    mvni v2.4s, #128, lsl #24
 ; CHECK-CVT-NEXT:    // kill: def $s1 killed $s1 def $q1
-; CHECK-CVT-NEXT:    lsl w8, w8, #16
-; CHECK-CVT-NEXT:    fmov s0, w8
+; CHECK-CVT-NEXT:    shll v0.4s, v0.4h, #16
 ; CHECK-CVT-NEXT:    bif v0.16b, v1.16b, v2.16b
 ; CHECK-CVT-NEXT:    fmov w8, s0
 ; CHECK-CVT-NEXT:    lsr w8, w8, #16
@@ -1981,12 +1754,10 @@ define bfloat @test_copysign_f32(bfloat %a, float %b) #0 {
 ;
 ; CHECK-BF16-LABEL: test_copysign_f32:
 ; CHECK-BF16:       // %bb.0:
-; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-BF16-NEXT:    fmov w8, s0
+; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $d0
 ; CHECK-BF16-NEXT:    mvni v2.4s, #128, lsl #24
 ; CHECK-BF16-NEXT:    // kill: def $s1 killed $s1 def $q1
-; CHECK-BF16-NEXT:    lsl w8, w8, #16
-; CHECK-BF16-NEXT:    fmov s0, w8
+; CHECK-BF16-NEXT:    shll v0.4s, v0.4h, #16
 ; CHECK-BF16-NEXT:    bif v0.16b, v1.16b, v2.16b
 ; CHECK-BF16-NEXT:    bfcvt h0, s0
 ; CHECK-BF16-NEXT:    ret
@@ -1998,12 +1769,10 @@ define bfloat @test_copysign_f32(bfloat %a, float %b) #0 {
 define bfloat @test_copysign_f64(bfloat %a, double %b) #0 {
 ; CHECK-CVT-LABEL: test_copysign_f64:
 ; CHECK-CVT:       // %bb.0:
-; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-CVT-NEXT:    fmov w8, s0
+; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $d0
 ; CHECK-CVT-NEXT:    fcvt s1, d1
 ; CHECK-CVT-NEXT:    mvni v2.4s, #128, lsl #24
-; CHECK-CVT-NEXT:    lsl w8, w8, #16
-; CHECK-CVT-NEXT:    fmov s0, w8
+; CHECK-CVT-NEXT:    shll v0.4s, v0.4h, #16
 ; CHECK-CVT-NEXT:    bif v0.16b, v1.16b, v2.16b
 ; CHECK-CVT-NEXT:    fmov w8, s0
 ; CHECK-CVT-NEXT:    lsr w8, w8, #16
@@ -2013,12 +1782,10 @@ define bfloat @test_copysign_f64(bfloat %a, double %b) #0 {
 ;
 ; CHECK-BF16-LABEL: test_copysign_f64:
 ; CHECK-BF16:       // %bb.0:
-; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-BF16-NEXT:    fmov w8, s0
+; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $d0
 ; CHECK-BF16-NEXT:    fcvt s1, d1
 ; CHECK-BF16-NEXT:    mvni v2.4s, #128, lsl #24
-; CHECK-BF16-NEXT:    lsl w8, w8, #16
-; CHECK-BF16-NEXT:    fmov s0, w8
+; CHECK-BF16-NEXT:    shll v0.4s, v0.4h, #16
 ; CHECK-BF16-NEXT:    bif v0.16b, v1.16b, v2.16b
 ; CHECK-BF16-NEXT:    bfcvt h0, s0
 ; CHECK-BF16-NEXT:    ret
@@ -2032,34 +1799,33 @@ define bfloat @test_copysign_f64(bfloat %a, double %b) #0 {
 define float @test_copysign_extended(bfloat %a, bfloat %b) #0 {
 ; CHECK-CVT-LABEL: test_copysign_extended:
 ; CHECK-CVT:       // %bb.0:
-; CHECK-CVT-NEXT:    // kill: def $h1 killed $h1 def $s1
-; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-CVT-NEXT:    fmov w8, s1
-; CHECK-CVT-NEXT:    fmov w9, s0
+; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-CVT-NEXT:    movi v2.4s, #16
+; CHECK-CVT-NEXT:    // kill: def $h1 killed $h1 def $d1
+; CHECK-CVT-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-CVT-NEXT:    shll v1.4s, v1.4h, #16
+; CHECK-CVT-NEXT:    ushl v0.4s, v0.4s, v2.4s
 ; CHECK-CVT-NEXT:    mvni v2.4s, #128, lsl #24
-; CHECK-CVT-NEXT:    lsl w8, w8, #16
-; CHECK-CVT-NEXT:    lsl w9, w9, #16
-; CHECK-CVT-NEXT:    fmov s0, w8
-; CHECK-CVT-NEXT:    fmov s1, w9
-; CHECK-CVT-NEXT:    bit v0.16b, v1.16b, v2.16b
+; CHECK-CVT-NEXT:    bif v0.16b, v1.16b, v2.16b
 ; CHECK-CVT-NEXT:    fmov w8, s0
 ; CHECK-CVT-NEXT:    lsr w8, w8, #16
-; CHECK-CVT-NEXT:    lsl w8, w8, #16
 ; CHECK-CVT-NEXT:    fmov s0, w8
+; CHECK-CVT-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-CVT-NEXT:    // kill: def $s0 killed $s0 killed $q0
 ; CHECK-CVT-NEXT:    ret
 ;
 ; CHECK-BF16-LABEL: test_copysign_extended:
 ; CHECK-BF16:       // %bb.0:
-; CHECK-BF16-NEXT:    // kill: def $h1 killed $h1 def $s1
-; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-BF16-NEXT:    fmov w8, s1
-; CHECK-BF16-NEXT:    fmov w9, s0
+; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-BF16-NEXT:    movi v2.4s, #16
+; CHECK-BF16-NEXT:    // kill: def $h1 killed $h1 def $d1
+; CHECK-BF16-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-BF16-NEXT:    shll v1.4s, v1.4h, #16
+; CHECK-BF16-NEXT:    ushl v0.4s, v0.4s, v2.4s
 ; CHECK-BF16-NEXT:    mvni v2.4s, #128, lsl #24
-; CHECK-BF16-NEXT:    lsl w8, w8, #16
-; CHECK-BF16-NEXT:    lsl w9, w9, #16
-; CHECK-BF16-NEXT:    fmov s0, w8
-; CHECK-BF16-NEXT:    fmov s1, w9
-; CHECK-BF16-NEXT:    bit v0.16b, v1.16b, v2.16b
+; CHECK-BF16-NEXT:    bif v0.16b, v1.16b, v2.16b
+; CHECK-BF16-NEXT:    bfcvt h0, s0
+; CHECK-BF16-NEXT:    shll v0.4s, v0.4h, #16
 ; CHECK-BF16-NEXT:    // kill: def $s0 killed $s0 killed $q0
 ; CHECK-BF16-NEXT:    ret
   %r = call bfloat @llvm.copysign.f16(bfloat %a, bfloat %b)
@@ -2070,11 +1836,9 @@ define float @test_copysign_extended(bfloat %a, bfloat %b) #0 {
 define bfloat @test_floor(bfloat %a) #0 {
 ; CHECK-CVT-LABEL: test_floor:
 ; CHECK-CVT:       // %bb.0:
-; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-CVT-NEXT:    fmov w9, s0
+; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $d0
 ; CHECK-CVT-NEXT:    mov w8, #32767 // =0x7fff
-; CHECK-CVT-NEXT:    lsl w9, w9, #16
-; CHECK-CVT-NEXT:    fmov s0, w9
+; CHECK-CVT-NEXT:    shll v0.4s, v0.4h, #16
 ; CHECK-CVT-NEXT:    frintm s0, s0
 ; CHECK-CVT-NEXT:    fmov w9, s0
 ; CHECK-CVT-NEXT:    ubfx w10, w9, #16, #1
@@ -2087,10 +1851,8 @@ define bfloat @test_floor(bfloat %a) #0 {
 ;
 ; CHECK-BF16-LABEL: test_floor:
 ; CHECK-BF16:       // %bb.0:
-; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-BF16-NEXT:    fmov w8, s0
-; CHECK-BF16-NEXT:    lsl w8, w8, #16
-; CHECK-BF16-NEXT:    fmov s0, w8
+; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-BF16-NEXT:    shll v0.4s, v0.4h, #16
 ; CHECK-BF16-NEXT:    frintm s0, s0
 ; CHECK-BF16-NEXT:    bfcvt h0, s0
 ; CHECK-BF16-NEXT:    ret
@@ -2101,11 +1863,9 @@ define bfloat @test_floor(bfloat %a) #0 {
 define bfloat @test_ceil(bfloat %a) #0 {
 ; CHECK-CVT-LABEL: test_ceil:
 ; CHECK-CVT:       // %bb.0:
-; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-CVT-NEXT:    fmov w9, s0
+; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $d0
 ; CHECK-CVT-NEXT:    mov w8, #32767 // =0x7fff
-; CHECK-CVT-NEXT:    lsl w9, w9, #16
-; CHECK-CVT-NEXT:    fmov s0, w9
+; CHECK-CVT-NEXT:    shll v0.4s, v0.4h, #16
 ; CHECK-CVT-NEXT:    frintp s0, s0
 ; CHECK-CVT-NEXT:    fmov w9, s0
 ; CHECK-CVT-NEXT:    ubfx w10, w9, #16, #1
@@ -2118,10 +1878,8 @@ define bfloat @test_ceil(bfloat %a) #0 {
 ;
 ; CHECK-BF16-LABEL: test_ceil:
 ; CHECK-BF16:       // %bb.0:
-; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-BF16-NEXT:    fmov w8, s0
-; CHECK-BF16-NEXT:    lsl w8, w8, #16
-; CHECK-BF16-NEXT:    fmov s0, w8
+; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-BF16-NEXT:    shll v0.4s, v0.4h, #16
 ; CHECK-BF16-NEXT:    frintp s0, s0
 ; CHECK-BF16-NEXT:    bfcvt h0, s0
 ; CHECK-BF16-NEXT:    ret
@@ -2132,11 +1890,9 @@ define bfloat @test_ceil(bfloat %a) #0 {
 define bfloat @test_trunc(bfloat %a) #0 {
 ; CHECK-CVT-LABEL: test_trunc:
 ; CHECK-CVT:       // %bb.0:
-; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-CVT-NEXT:    fmov w9, s0
+; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $d0
 ; CHECK-CVT-NEXT:    mov w8, #32767 // =0x7fff
-; CHECK-CVT-NEXT:    lsl w9, w9, #16
-; CHECK-CVT-NEXT:    fmov s0, w9
+; CHECK-CVT-NEXT:    shll v0.4s, v0.4h, #16
 ; CHECK-CVT-NEXT:    frintz s0, s0
 ; CHECK-CVT-NEXT:    fmov w9, s0
 ; CHECK-CVT-NEXT:    ubfx w10, w9, #16, #1
@@ -2149,10 +1905,8 @@ define bfloat @test_trunc(bfloat %a) #0 {
 ;
 ; CHECK-BF16-LABEL: test_trunc:
 ; CHECK-BF16:       // %bb.0:
-; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-BF16-NEXT:    fmov w8, s0
-; CHECK-BF16-NEXT:    lsl w8, w8, #16
-; CHECK-BF16-NEXT:    fmov s0, w8
+; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-BF16-NEXT:    shll v0.4s, v0.4h, #16
 ; CHECK-BF16-NEXT:    frintz s0, s0
 ; CHECK-BF16-NEXT:    bfcvt h0, s0
 ; CHECK-BF16-NEXT:    ret
@@ -2163,11 +1917,9 @@ define bfloat @test_trunc(bfloat %a) #0 {
 define bfloat @test_rint(bfloat %a) #0 {
 ; CHECK-CVT-LABEL: test_rint:
 ; CHECK-CVT:       // %bb.0:
-; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-CVT-NEXT:    fmov w9, s0
+; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $d0
 ; CHECK-CVT-NEXT:    mov w8, #32767 // =0x7fff
-; CHECK-CVT-NEXT:    lsl w9, w9, #16
-; CHECK-CVT-NEXT:    fmov s0, w9
+; CHECK-CVT-NEXT:    shll v0.4s, v0.4h, #16
 ; CHECK-CVT-NEXT:    frintx s0, s0
 ; CHECK-CVT-NEXT:    fmov w9, s0
 ; CHECK-CVT-NEXT:    ubfx w10, w9, #16, #1
@@ -2180,10 +1932,8 @@ define bfloat @test_rint(bfloat %a) #0 {
 ;
 ; CHECK-BF16-LABEL: test_rint:
 ; CHECK-BF16:       // %bb.0:
-; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-BF16-NEXT:    fmov w8, s0
-; CHECK-BF16-NEXT:    lsl w8, w8, #16
-; CHECK-BF16-NEXT:    fmov s0, w8
+; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-BF16-NEXT:    shll v0.4s, v0.4h, #16
 ; CHECK-BF16-NEXT:    frintx s0, s0
 ; CHECK-BF16-NEXT:    bfcvt h0, s0
 ; CHECK-BF16-NEXT:    ret
@@ -2194,11 +1944,9 @@ define bfloat @test_rint(bfloat %a) #0 {
 define bfloat @test_nearbyint(bfloat %a) #0 {
 ; CHECK-CVT-LABEL: test_nearbyint:
 ; CHECK-CVT:       // %bb.0:
-; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-CVT-NEXT:    fmov w9, s0
+; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $d0
 ; CHECK-CVT-NEXT:    mov w8, #32767 // =0x7fff
-; CHECK-CVT-NEXT:    lsl w9, w9, #16
-; CHECK-CVT-NEXT:    fmov s0, w9
+; CHECK-CVT-NEXT:    shll v0.4s, v0.4h, #16
 ; CHECK-CVT-NEXT:    frinti s0, s0
 ; CHECK-CVT-NEXT:    fmov w9, s0
 ; CHECK-CVT-NEXT:    ubfx w10, w9, #16, #1
@@ -2211,10 +1959,8 @@ define bfloat @test_nearbyint(bfloat %a) #0 {
 ;
 ; CHECK-BF16-LABEL: test_nearbyint:
 ; CHECK-BF16:       // %bb.0:
-; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-BF16-NEXT:    fmov w8, s0
-; CHECK-BF16-NEXT:    lsl w8, w8, #16
-; CHECK-BF16-NEXT:    fmov s0, w8
+; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-BF16-NEXT:    shll v0.4s, v0.4h, #16
 ; CHECK-BF16-NEXT:    frinti s0, s0
 ; CHECK-BF16-NEXT:    bfcvt h0, s0
 ; CHECK-BF16-NEXT:    ret
@@ -2225,11 +1971,9 @@ define bfloat @test_nearbyint(bfloat %a) #0 {
 define bfloat @test_round(bfloat %a) #0 {
 ; CHECK-CVT-LABEL: test_round:
 ; CHECK-CVT:       // %bb.0:
-; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-CVT-NEXT:    fmov w9, s0
+; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $d0
 ; CHECK-CVT-NEXT:    mov w8, #32767 // =0x7fff
-; CHECK-CVT-NEXT:    lsl w9, w9, #16
-; CHECK-CVT-NEXT:    fmov s0, w9
+; CHECK-CVT-NEXT:    shll v0.4s, v0.4h, #16
 ; CHECK-CVT-NEXT:    frinta s0, s0
 ; CHECK-CVT-NEXT:    fmov w9, s0
 ; CHECK-CVT-NEXT:    ubfx w10, w9, #16, #1
@@ -2242,10 +1986,8 @@ define bfloat @test_round(bfloat %a) #0 {
 ;
 ; CHECK-BF16-LABEL: test_round:
 ; CHECK-BF16:       // %bb.0:
-; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-BF16-NEXT:    fmov w8, s0
-; CHECK-BF16-NEXT:    lsl w8, w8, #16
-; CHECK-BF16-NEXT:    fmov s0, w8
+; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-BF16-NEXT:    shll v0.4s, v0.4h, #16
 ; CHECK-BF16-NEXT:    frinta s0, s0
 ; CHECK-BF16-NEXT:    bfcvt h0, s0
 ; CHECK-BF16-NEXT:    ret
@@ -2256,11 +1998,9 @@ define bfloat @test_round(bfloat %a) #0 {
 define bfloat @test_roundeven(bfloat %a) #0 {
 ; CHECK-CVT-LABEL: test_roundeven:
 ; CHECK-CVT:       // %bb.0:
-; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-CVT-NEXT:    fmov w9, s0
+; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $d0
 ; CHECK-CVT-NEXT:    mov w8, #32767 // =0x7fff
-; CHECK-CVT-NEXT:    lsl w9, w9, #16
-; CHECK-CVT-NEXT:    fmov s0, w9
+; CHECK-CVT-NEXT:    shll v0.4s, v0.4h, #16
 ; CHECK-CVT-NEXT:    frintn s0, s0
 ; CHECK-CVT-NEXT:    fmov w9, s0
 ; CHECK-CVT-NEXT:    ubfx w10, w9, #16, #1
@@ -2273,10 +2013,8 @@ define bfloat @test_roundeven(bfloat %a) #0 {
 ;
 ; CHECK-BF16-LABEL: test_roundeven:
 ; CHECK-BF16:       // %bb.0:
-; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-BF16-NEXT:    fmov w8, s0
-; CHECK-BF16-NEXT:    lsl w8, w8, #16
-; CHECK-BF16-NEXT:    fmov s0, w8
+; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-BF16-NEXT:    shll v0.4s, v0.4h, #16
 ; CHECK-BF16-NEXT:    frintn s0, s0
 ; CHECK-BF16-NEXT:    bfcvt h0, s0
 ; CHECK-BF16-NEXT:    ret
@@ -2287,27 +2025,21 @@ define bfloat @test_roundeven(bfloat %a) #0 {
 define bfloat @test_fmuladd(bfloat %a, bfloat %b, bfloat %c) #0 {
 ; CHECK-CVT-LABEL: test_fmuladd:
 ; CHECK-CVT:       // %bb.0:
-; CHECK-CVT-NEXT:    // kill: def $h1 killed $h1 def $s1
-; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-CVT-NEXT:    fmov w8, s1
-; CHECK-CVT-NEXT:    fmov w9, s0
+; CHECK-CVT-NEXT:    // kill: def $h1 killed $h1 def $d1
+; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $d0
 ; CHECK-CVT-NEXT:    mov w10, #32767 // =0x7fff
-; CHECK-CVT-NEXT:    // kill: def $h2 killed $h2 def $s2
-; CHECK-CVT-NEXT:    lsl w8, w8, #16
-; CHECK-CVT-NEXT:    lsl w9, w9, #16
-; CHECK-CVT-NEXT:    fmov s0, w8
-; CHECK-CVT-NEXT:    fmov s1, w9
-; CHECK-CVT-NEXT:    fmul s0, s1, s0
+; CHECK-CVT-NEXT:    // kill: def $h2 killed $h2 def $d2
+; CHECK-CVT-NEXT:    shll v1.4s, v1.4h, #16
+; CHECK-CVT-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-CVT-NEXT:    fmul s0, s0, s1
+; CHECK-CVT-NEXT:    shll v1.4s, v2.4h, #16
 ; CHECK-CVT-NEXT:    fmov w8, s0
 ; CHECK-CVT-NEXT:    ubfx w9, w8, #16, #1
 ; CHECK-CVT-NEXT:    add w8, w8, w10
 ; CHECK-CVT-NEXT:    add w8, w9, w8
-; CHECK-CVT-NEXT:    fmov w9, s2
 ; CHECK-CVT-NEXT:    lsr w8, w8, #16
-; CHECK-CVT-NEXT:    lsl w8, w8, #16
-; CHECK-CVT-NEXT:    lsl w9, w9, #16
 ; CHECK-CVT-NEXT:    fmov s0, w8
-; CHECK-CVT-NEXT:    fmov s1, w9
+; CHECK-CVT-NEXT:    shll v0.4s, v0.4h, #16
 ; CHECK-CVT-NEXT:    fadd s0, s0, s1
 ; CHECK-CVT-NEXT:    fmov w8, s0
 ; CHECK-CVT-NEXT:    ubfx w9, w8, #16, #1
@@ -2320,23 +2052,15 @@ define bfloat @test_fmuladd(bfloat %a, bfloat %b, bfloat %c) #0 {
 ;
 ; CHECK-BF16-LABEL: test_fmuladd:
 ; CHECK-BF16:       // %bb.0:
-; CHECK-BF16-NEXT:    // kill: def $h1 killed $h1 def $s1
-; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-BF16-NEXT:    fmov w8, s1
-; CHECK-BF16-NEXT:    fmov w9, s0
-; CHECK-BF16-NEXT:    // kill: def $h2 killed $h2 def $s2
-; CHECK-BF16-NEXT:    lsl w8, w8, #16
-; CHECK-BF16-NEXT:    lsl w9, w9, #16
-; CHECK-BF16-NEXT:    fmov s0, w8
-; CHECK-BF16-NEXT:    fmov s1, w9
-; CHECK-BF16-NEXT:    fmov w9, s2
-; CHECK-BF16-NEXT:    fmul s0, s1, s0
-; CHECK-BF16-NEXT:    lsl w9, w9, #16
-; CHECK-BF16-NEXT:    fmov s1, w9
+; CHECK-BF16-NEXT:    // kill: def $h1 killed $h1 def $d1
+; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-BF16-NEXT:    // kill: def $h2 killed $h2 def $d2
+; CHECK-BF16-NEXT:    shll v1.4s, v1.4h, #16
+; CHECK-BF16-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-BF16-NEXT:    fmul s0, s0, s1
+; CHECK-BF16-NEXT:    shll v1.4s, v2.4h, #16
 ; CHECK-BF16-NEXT:    bfcvt h0, s0
-; CHECK-BF16-NEXT:    fmov w8, s0
-; CHECK-BF16-NEXT:    lsl w8, w8, #16
-; CHECK-BF16-NEXT:    fmov s0, w8
+; CHECK-BF16-NEXT:    shll v0.4s, v0.4h, #16
 ; CHECK-BF16-NEXT:    fadd s0, s0, s1
 ; CHECK-BF16-NEXT:    bfcvt h0, s0
 ; CHECK-BF16-NEXT:    ret
diff --git a/llvm/test/CodeGen/AArch64/bf16-v8-instructions.ll b/llvm/test/CodeGen/AArch64/bf16-v8-instructions.ll
index c03e2e5..a609e33 100644
--- a/llvm/test/CodeGen/AArch64/bf16-v8-instructions.ll
+++ b/llvm/test/CodeGen/AArch64/bf16-v8-instructions.ll
@@ -272,9 +272,8 @@ define <8 x bfloat> @d_to_h(<8 x double> %a) {
 define <8 x float> @h_to_s(<8 x bfloat> %a) {
 ; CHECK-LABEL: h_to_s:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
+; CHECK-NEXT:    shll2 v1.4s, v0.8h, #16
 ; CHECK-NEXT:    shll v0.4s, v0.4h, #16
-; CHECK-NEXT:    shll v1.4s, v1.4h, #16
 ; CHECK-NEXT:    ret
   %1 = fpext <8 x bfloat> %a to <8 x float>
   ret <8 x float> %1
@@ -283,13 +282,12 @@ define <8 x float> @h_to_s(<8 x bfloat> %a) {
 define <8 x double> @h_to_d(<8 x bfloat> %a) {
 ; CHECK-LABEL: h_to_d:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
-; CHECK-NEXT:    shll v2.4s, v0.4h, #16
-; CHECK-NEXT:    fcvtl v0.2d, v2.2s
-; CHECK-NEXT:    shll v4.4s, v1.4h, #16
-; CHECK-NEXT:    fcvtl2 v1.2d, v2.4s
-; CHECK-NEXT:    fcvtl2 v3.2d, v4.4s
-; CHECK-NEXT:    fcvtl v2.2d, v4.2s
+; CHECK-NEXT:    shll v1.4s, v0.4h, #16
+; CHECK-NEXT:    shll2 v2.4s, v0.8h, #16
+; CHECK-NEXT:    fcvtl v0.2d, v1.2s
+; CHECK-NEXT:    fcvtl2 v3.2d, v2.4s
+; CHECK-NEXT:    fcvtl2 v1.2d, v1.4s
+; CHECK-NEXT:    fcvtl v2.2d, v2.2s
 ; CHECK-NEXT:    ret
   %1 = fpext <8 x bfloat> %a to <8 x double>
   ret <8 x double> %1
@@ -788,11 +786,10 @@ define void @test_insert_at_zero(bfloat %a, ptr %b) #0 {
 define <8 x i8> @fptosi_i8(<8 x bfloat> %a) #0 {
 ; CHECK-LABEL: fptosi_i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
+; CHECK-NEXT:    shll2 v1.4s, v0.8h, #16
 ; CHECK-NEXT:    shll v0.4s, v0.4h, #16
-; CHECK-NEXT:    fcvtzs v0.4s, v0.4s
-; CHECK-NEXT:    shll v1.4s, v1.4h, #16
 ; CHECK-NEXT:    fcvtzs v1.4s, v1.4s
+; CHECK-NEXT:    fcvtzs v0.4s, v0.4s
 ; CHECK-NEXT:    uzp1 v0.8h, v0.8h, v1.8h
 ; CHECK-NEXT:    xtn v0.8b, v0.8h
 ; CHECK-NEXT:    ret
@@ -803,11 +800,10 @@ define <8 x i8> @fptosi_i8(<8 x bfloat> %a) #0 {
 define <8 x i16> @fptosi_i16(<8 x bfloat> %a) #0 {
 ; CHECK-LABEL: fptosi_i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
+; CHECK-NEXT:    shll2 v1.4s, v0.8h, #16
 ; CHECK-NEXT:    shll v0.4s, v0.4h, #16
-; CHECK-NEXT:    fcvtzs v0.4s, v0.4s
-; CHECK-NEXT:    shll v1.4s, v1.4h, #16
 ; CHECK-NEXT:    fcvtzs v1.4s, v1.4s
+; CHECK-NEXT:    fcvtzs v0.4s, v0.4s
 ; CHECK-NEXT:    uzp1 v0.8h, v0.8h, v1.8h
 ; CHECK-NEXT:    ret
   %1 = fptosi<8 x bfloat> %a to <8 x i16>
@@ -817,11 +813,10 @@ define <8 x i16> @fptosi_i16(<8 x bfloat> %a) #0 {
 define <8 x i8> @fptoui_i8(<8 x bfloat> %a) #0 {
 ; CHECK-LABEL: fptoui_i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
+; CHECK-NEXT:    shll2 v1.4s, v0.8h, #16
 ; CHECK-NEXT:    shll v0.4s, v0.4h, #16
-; CHECK-NEXT:    fcvtzu v0.4s, v0.4s
-; CHECK-NEXT:    shll v1.4s, v1.4h, #16
 ; CHECK-NEXT:    fcvtzu v1.4s, v1.4s
+; CHECK-NEXT:    fcvtzu v0.4s, v0.4s
 ; CHECK-NEXT:    uzp1 v0.8h, v0.8h, v1.8h
 ; CHECK-NEXT:    xtn v0.8b, v0.8h
 ; CHECK-NEXT:    ret
@@ -832,11 +827,10 @@ define <8 x i8> @fptoui_i8(<8 x bfloat> %a) #0 {
 define <8 x i16> @fptoui_i16(<8 x bfloat> %a) #0 {
 ; CHECK-LABEL: fptoui_i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
+; CHECK-NEXT:    shll2 v1.4s, v0.8h, #16
 ; CHECK-NEXT:    shll v0.4s, v0.4h, #16
-; CHECK-NEXT:    fcvtzu v0.4s, v0.4s
-; CHECK-NEXT:    shll v1.4s, v1.4h, #16
 ; CHECK-NEXT:    fcvtzu v1.4s, v1.4s
+; CHECK-NEXT:    fcvtzu v0.4s, v0.4s
 ; CHECK-NEXT:    uzp1 v0.8h, v0.8h, v1.8h
 ; CHECK-NEXT:    ret
   %1 = fptoui<8 x bfloat> %a to <8 x i16>
@@ -846,90 +840,58 @@ define <8 x i16> @fptoui_i16(<8 x bfloat> %a) #0 {
 define <8 x i1> @test_fcmp_une(<8 x bfloat> %a, <8 x bfloat> %b) #0 {
 ; CHECK-LABEL: test_fcmp_une:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov h2, v1.h[1]
-; CHECK-NEXT:    mov h3, v0.h[1]
-; CHECK-NEXT:    fmov w10, s1
-; CHECK-NEXT:    fmov w8, s2
-; CHECK-NEXT:    fmov w9, s3
-; CHECK-NEXT:    mov h2, v1.h[2]
-; CHECK-NEXT:    mov h3, v0.h[2]
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    fmov s4, w8
-; CHECK-NEXT:    fmov w8, s0
-; CHECK-NEXT:    fmov s5, w9
-; CHECK-NEXT:    lsl w9, w10, #16
-; CHECK-NEXT:    fmov w10, s3
-; CHECK-NEXT:    mov h3, v1.h[4]
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    fcmp s5, s4
-; CHECK-NEXT:    fmov s5, w9
-; CHECK-NEXT:    mov h4, v1.h[3]
-; CHECK-NEXT:    lsl w10, w10, #16
-; CHECK-NEXT:    fmov s6, w8
-; CHECK-NEXT:    fmov w8, s2
-; CHECK-NEXT:    csetm w9, ne
-; CHECK-NEXT:    fmov s16, w10
-; CHECK-NEXT:    fcmp s6, s5
-; CHECK-NEXT:    mov h5, v0.h[3]
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    fmov w10, s4
-; CHECK-NEXT:    mov h6, v0.h[4]
-; CHECK-NEXT:    mov h4, v1.h[5]
-; CHECK-NEXT:    fmov s7, w8
+; CHECK-NEXT:    dup v2.4h, v1.h[1]
+; CHECK-NEXT:    dup v3.4h, v0.h[1]
+; CHECK-NEXT:    dup v4.4h, v1.h[2]
+; CHECK-NEXT:    dup v5.4h, v0.h[2]
+; CHECK-NEXT:    dup v6.4h, v0.h[3]
+; CHECK-NEXT:    shll v2.4s, v2.4h, #16
+; CHECK-NEXT:    shll v3.4s, v3.4h, #16
+; CHECK-NEXT:    fcmp s3, s2
+; CHECK-NEXT:    shll v2.4s, v1.4h, #16
+; CHECK-NEXT:    shll v3.4s, v0.4h, #16
 ; CHECK-NEXT:    csetm w8, ne
-; CHECK-NEXT:    fmov s2, w8
-; CHECK-NEXT:    fmov w8, s5
-; CHECK-NEXT:    mov h5, v0.h[5]
-; CHECK-NEXT:    fcmp s16, s7
-; CHECK-NEXT:    mov v2.h[1], w9
-; CHECK-NEXT:    lsl w9, w10, #16
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    fmov w10, s3
-; CHECK-NEXT:    fmov s3, w9
-; CHECK-NEXT:    fmov w9, s6
-; CHECK-NEXT:    fmov s7, w8
+; CHECK-NEXT:    fcmp s3, s2
+; CHECK-NEXT:    shll v3.4s, v4.4h, #16
+; CHECK-NEXT:    shll v4.4s, v5.4h, #16
+; CHECK-NEXT:    dup v5.4h, v1.h[3]
+; CHECK-NEXT:    csetm w9, ne
+; CHECK-NEXT:    fmov s2, w9
+; CHECK-NEXT:    fcmp s4, s3
+; CHECK-NEXT:    shll v4.4s, v6.4h, #16
+; CHECK-NEXT:    shll v3.4s, v5.4h, #16
+; CHECK-NEXT:    dup v5.8h, v1.h[4]
+; CHECK-NEXT:    dup v6.8h, v0.h[4]
+; CHECK-NEXT:    mov v2.h[1], w8
 ; CHECK-NEXT:    csetm w8, ne
+; CHECK-NEXT:    fcmp s4, s3
+; CHECK-NEXT:    shll v3.4s, v5.4h, #16
+; CHECK-NEXT:    shll v4.4s, v6.4h, #16
+; CHECK-NEXT:    dup v5.8h, v1.h[5]
+; CHECK-NEXT:    dup v6.8h, v0.h[5]
 ; CHECK-NEXT:    mov v2.h[2], w8
-; CHECK-NEXT:    lsl w8, w10, #16
-; CHECK-NEXT:    fmov w10, s4
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    fcmp s7, s3
-; CHECK-NEXT:    mov h3, v1.h[6]
-; CHECK-NEXT:    fmov s4, w8
-; CHECK-NEXT:    mov h1, v1.h[7]
-; CHECK-NEXT:    fmov s6, w9
-; CHECK-NEXT:    fmov w9, s5
 ; CHECK-NEXT:    csetm w8, ne
+; CHECK-NEXT:    fcmp s4, s3
+; CHECK-NEXT:    shll v3.4s, v5.4h, #16
+; CHECK-NEXT:    shll v4.4s, v6.4h, #16
+; CHECK-NEXT:    dup v5.8h, v1.h[6]
+; CHECK-NEXT:    dup v6.8h, v0.h[6]
+; CHECK-NEXT:    dup v1.8h, v1.h[7]
+; CHECK-NEXT:    dup v0.8h, v0.h[7]
 ; CHECK-NEXT:    mov v2.h[3], w8
-; CHECK-NEXT:    lsl w8, w10, #16
-; CHECK-NEXT:    fcmp s6, s4
-; CHECK-NEXT:    mov h4, v0.h[6]
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    fmov s5, w8
-; CHECK-NEXT:    mov h0, v0.h[7]
-; CHECK-NEXT:    fmov s6, w9
 ; CHECK-NEXT:    csetm w8, ne
+; CHECK-NEXT:    fcmp s4, s3
+; CHECK-NEXT:    shll v3.4s, v5.4h, #16
+; CHECK-NEXT:    shll v4.4s, v6.4h, #16
+; CHECK-NEXT:    shll v1.4s, v1.4h, #16
+; CHECK-NEXT:    shll v0.4s, v0.4h, #16
 ; CHECK-NEXT:    mov v2.h[4], w8
-; CHECK-NEXT:    fmov w8, s3
-; CHECK-NEXT:    fmov w9, s4
-; CHECK-NEXT:    fcmp s6, s5
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    csetm w10, ne
-; CHECK-NEXT:    fmov s3, w8
-; CHECK-NEXT:    fmov s4, w9
-; CHECK-NEXT:    fmov w8, s1
-; CHECK-NEXT:    fmov w9, s0
-; CHECK-NEXT:    mov v2.h[5], w10
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    fcmp s4, s3
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    fmov s0, w8
-; CHECK-NEXT:    fmov s1, w9
 ; CHECK-NEXT:    csetm w8, ne
+; CHECK-NEXT:    fcmp s4, s3
+; CHECK-NEXT:    mov v2.h[5], w8
+; CHECK-NEXT:    csetm w8, ne
+; CHECK-NEXT:    fcmp s0, s1
 ; CHECK-NEXT:    mov v2.h[6], w8
-; CHECK-NEXT:    fcmp s1, s0
 ; CHECK-NEXT:    csetm w8, ne
 ; CHECK-NEXT:    mov v2.h[7], w8
 ; CHECK-NEXT:    xtn v0.8b, v2.8h
@@ -941,96 +903,64 @@ define <8 x i1> @test_fcmp_une(<8 x bfloat> %a, <8 x bfloat> %b) #0 {
 define <8 x i1> @test_fcmp_ueq(<8 x bfloat> %a, <8 x bfloat> %b) #0 {
 ; CHECK-LABEL: test_fcmp_ueq:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov h2, v1.h[1]
-; CHECK-NEXT:    mov h3, v0.h[1]
-; CHECK-NEXT:    fmov w10, s1
-; CHECK-NEXT:    fmov w11, s0
-; CHECK-NEXT:    fmov w8, s2
-; CHECK-NEXT:    fmov w9, s3
-; CHECK-NEXT:    mov h2, v1.h[2]
-; CHECK-NEXT:    mov h3, v0.h[2]
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    fmov s4, w8
-; CHECK-NEXT:    fmov s5, w9
-; CHECK-NEXT:    lsl w8, w10, #16
-; CHECK-NEXT:    lsl w9, w11, #16
-; CHECK-NEXT:    fmov s6, w8
-; CHECK-NEXT:    fmov w8, s2
-; CHECK-NEXT:    fcmp s5, s4
-; CHECK-NEXT:    fmov s7, w9
-; CHECK-NEXT:    fmov w9, s3
-; CHECK-NEXT:    mov h4, v1.h[3]
-; CHECK-NEXT:    mov h5, v0.h[3]
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    csetm w10, eq
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    csinv w10, w10, wzr, vc
-; CHECK-NEXT:    fcmp s7, s6
-; CHECK-NEXT:    fmov s2, w8
-; CHECK-NEXT:    fmov s3, w9
-; CHECK-NEXT:    fmov w11, s4
-; CHECK-NEXT:    fmov w8, s5
-; CHECK-NEXT:    mov h4, v0.h[4]
-; CHECK-NEXT:    mov h7, v1.h[5]
+; CHECK-NEXT:    dup v2.4h, v1.h[1]
+; CHECK-NEXT:    dup v3.4h, v0.h[1]
+; CHECK-NEXT:    dup v4.4h, v1.h[2]
+; CHECK-NEXT:    dup v5.4h, v0.h[2]
+; CHECK-NEXT:    dup v6.4h, v0.h[3]
+; CHECK-NEXT:    shll v2.4s, v2.4h, #16
+; CHECK-NEXT:    shll v3.4s, v3.4h, #16
+; CHECK-NEXT:    fcmp s3, s2
+; CHECK-NEXT:    shll v2.4s, v1.4h, #16
+; CHECK-NEXT:    shll v3.4s, v0.4h, #16
+; CHECK-NEXT:    csetm w8, eq
+; CHECK-NEXT:    csinv w8, w8, wzr, vc
+; CHECK-NEXT:    fcmp s3, s2
+; CHECK-NEXT:    shll v3.4s, v4.4h, #16
+; CHECK-NEXT:    shll v4.4s, v5.4h, #16
+; CHECK-NEXT:    dup v5.4h, v1.h[3]
 ; CHECK-NEXT:    csetm w9, eq
 ; CHECK-NEXT:    csinv w9, w9, wzr, vc
-; CHECK-NEXT:    fcmp s3, s2
-; CHECK-NEXT:    mov h3, v1.h[4]
+; CHECK-NEXT:    fcmp s4, s3
+; CHECK-NEXT:    shll v4.4s, v6.4h, #16
 ; CHECK-NEXT:    fmov s2, w9
-; CHECK-NEXT:    lsl w11, w11, #16
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    fmov s5, w11
-; CHECK-NEXT:    fmov s6, w8
+; CHECK-NEXT:    shll v3.4s, v5.4h, #16
+; CHECK-NEXT:    dup v5.8h, v1.h[4]
+; CHECK-NEXT:    dup v6.8h, v0.h[4]
+; CHECK-NEXT:    mov v2.h[1], w8
 ; CHECK-NEXT:    csetm w8, eq
-; CHECK-NEXT:    mov v2.h[1], w10
-; CHECK-NEXT:    fmov w9, s3
-; CHECK-NEXT:    fmov w10, s4
 ; CHECK-NEXT:    csinv w8, w8, wzr, vc
-; CHECK-NEXT:    mov h3, v1.h[6]
-; CHECK-NEXT:    mov h1, v1.h[7]
-; CHECK-NEXT:    fcmp s6, s5
-; CHECK-NEXT:    mov h5, v0.h[5]
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    lsl w10, w10, #16
+; CHECK-NEXT:    fcmp s4, s3
+; CHECK-NEXT:    shll v3.4s, v5.4h, #16
+; CHECK-NEXT:    shll v4.4s, v6.4h, #16
+; CHECK-NEXT:    dup v5.8h, v1.h[5]
+; CHECK-NEXT:    dup v6.8h, v0.h[5]
 ; CHECK-NEXT:    mov v2.h[2], w8
-; CHECK-NEXT:    fmov s4, w9
-; CHECK-NEXT:    fmov s6, w10
 ; CHECK-NEXT:    csetm w8, eq
-; CHECK-NEXT:    fmov w9, s7
-; CHECK-NEXT:    fmov w10, s5
 ; CHECK-NEXT:    csinv w8, w8, wzr, vc
-; CHECK-NEXT:    fcmp s6, s4
-; CHECK-NEXT:    mov h4, v0.h[6]
+; CHECK-NEXT:    fcmp s4, s3
+; CHECK-NEXT:    shll v3.4s, v5.4h, #16
+; CHECK-NEXT:    shll v4.4s, v6.4h, #16
+; CHECK-NEXT:    dup v5.8h, v1.h[6]
+; CHECK-NEXT:    dup v6.8h, v0.h[6]
+; CHECK-NEXT:    dup v1.8h, v1.h[7]
+; CHECK-NEXT:    dup v0.8h, v0.h[7]
 ; CHECK-NEXT:    mov v2.h[3], w8
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    lsl w10, w10, #16
-; CHECK-NEXT:    mov h0, v0.h[7]
-; CHECK-NEXT:    fmov s5, w9
-; CHECK-NEXT:    fmov s6, w10
-; CHECK-NEXT:    fmov w9, s3
-; CHECK-NEXT:    fmov w10, s4
 ; CHECK-NEXT:    csetm w8, eq
 ; CHECK-NEXT:    csinv w8, w8, wzr, vc
+; CHECK-NEXT:    fcmp s4, s3
+; CHECK-NEXT:    shll v3.4s, v5.4h, #16
+; CHECK-NEXT:    shll v4.4s, v6.4h, #16
+; CHECK-NEXT:    shll v1.4s, v1.4h, #16
+; CHECK-NEXT:    shll v0.4s, v0.4h, #16
 ; CHECK-NEXT:    mov v2.h[4], w8
-; CHECK-NEXT:    lsl w8, w9, #16
-; CHECK-NEXT:    fcmp s6, s5
-; CHECK-NEXT:    lsl w9, w10, #16
-; CHECK-NEXT:    fmov s3, w8
-; CHECK-NEXT:    fmov w8, s1
-; CHECK-NEXT:    fmov s4, w9
-; CHECK-NEXT:    fmov w9, s0
-; CHECK-NEXT:    csetm w10, eq
-; CHECK-NEXT:    csinv w10, w10, wzr, vc
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    mov v2.h[5], w10
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    fcmp s4, s3
-; CHECK-NEXT:    fmov s0, w8
-; CHECK-NEXT:    fmov s1, w9
 ; CHECK-NEXT:    csetm w8, eq
 ; CHECK-NEXT:    csinv w8, w8, wzr, vc
-; CHECK-NEXT:    fcmp s1, s0
+; CHECK-NEXT:    fcmp s4, s3
+; CHECK-NEXT:    mov v2.h[5], w8
+; CHECK-NEXT:    csetm w8, eq
+; CHECK-NEXT:    csinv w8, w8, wzr, vc
+; CHECK-NEXT:    fcmp s0, s1
 ; CHECK-NEXT:    mov v2.h[6], w8
 ; CHECK-NEXT:    csetm w8, eq
 ; CHECK-NEXT:    csinv w8, w8, wzr, vc
@@ -1044,90 +974,58 @@ define <8 x i1> @test_fcmp_ueq(<8 x bfloat> %a, <8 x bfloat> %b) #0 {
 define <8 x i1> @test_fcmp_ugt(<8 x bfloat> %a, <8 x bfloat> %b) #0 {
 ; CHECK-LABEL: test_fcmp_ugt:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov h2, v1.h[1]
-; CHECK-NEXT:    mov h3, v0.h[1]
-; CHECK-NEXT:    fmov w10, s1
-; CHECK-NEXT:    fmov w8, s2
-; CHECK-NEXT:    fmov w9, s3
-; CHECK-NEXT:    mov h2, v1.h[2]
-; CHECK-NEXT:    mov h3, v0.h[2]
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    fmov s4, w8
-; CHECK-NEXT:    fmov w8, s0
-; CHECK-NEXT:    fmov s5, w9
-; CHECK-NEXT:    lsl w9, w10, #16
-; CHECK-NEXT:    fmov w10, s3
-; CHECK-NEXT:    mov h3, v1.h[4]
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    fcmp s5, s4
-; CHECK-NEXT:    fmov s5, w9
-; CHECK-NEXT:    mov h4, v1.h[3]
-; CHECK-NEXT:    lsl w10, w10, #16
-; CHECK-NEXT:    fmov s6, w8
-; CHECK-NEXT:    fmov w8, s2
-; CHECK-NEXT:    csetm w9, hi
-; CHECK-NEXT:    fmov s16, w10
-; CHECK-NEXT:    fcmp s6, s5
-; CHECK-NEXT:    mov h5, v0.h[3]
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    fmov w10, s4
-; CHECK-NEXT:    mov h6, v0.h[4]
-; CHECK-NEXT:    mov h4, v1.h[5]
-; CHECK-NEXT:    fmov s7, w8
+; CHECK-NEXT:    dup v2.4h, v1.h[1]
+; CHECK-NEXT:    dup v3.4h, v0.h[1]
+; CHECK-NEXT:    dup v4.4h, v1.h[2]
+; CHECK-NEXT:    dup v5.4h, v0.h[2]
+; CHECK-NEXT:    dup v6.4h, v0.h[3]
+; CHECK-NEXT:    shll v2.4s, v2.4h, #16
+; CHECK-NEXT:    shll v3.4s, v3.4h, #16
+; CHECK-NEXT:    fcmp s3, s2
+; CHECK-NEXT:    shll v2.4s, v1.4h, #16
+; CHECK-NEXT:    shll v3.4s, v0.4h, #16
 ; CHECK-NEXT:    csetm w8, hi
-; CHECK-NEXT:    fmov s2, w8
-; CHECK-NEXT:    fmov w8, s5
-; CHECK-NEXT:    mov h5, v0.h[5]
-; CHECK-NEXT:    fcmp s16, s7
-; CHECK-NEXT:    mov v2.h[1], w9
-; CHECK-NEXT:    lsl w9, w10, #16
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    fmov w10, s3
-; CHECK-NEXT:    fmov s3, w9
-; CHECK-NEXT:    fmov w9, s6
-; CHECK-NEXT:    fmov s7, w8
+; CHECK-NEXT:    fcmp s3, s2
+; CHECK-NEXT:    shll v3.4s, v4.4h, #16
+; CHECK-NEXT:    shll v4.4s, v5.4h, #16
+; CHECK-NEXT:    dup v5.4h, v1.h[3]
+; CHECK-NEXT:    csetm w9, hi
+; CHECK-NEXT:    fmov s2, w9
+; CHECK-NEXT:    fcmp s4, s3
+; CHECK-NEXT:    shll v4.4s, v6.4h, #16
+; CHECK-NEXT:    shll v3.4s, v5.4h, #16
+; CHECK-NEXT:    dup v5.8h, v1.h[4]
+; CHECK-NEXT:    dup v6.8h, v0.h[4]
+; CHECK-NEXT:    mov v2.h[1], w8
 ; CHECK-NEXT:    csetm w8, hi
+; CHECK-NEXT:    fcmp s4, s3
+; CHECK-NEXT:    shll v3.4s, v5.4h, #16
+; CHECK-NEXT:    shll v4.4s, v6.4h, #16
+; CHECK-NEXT:    dup v5.8h, v1.h[5]
+; CHECK-NEXT:    dup v6.8h, v0.h[5]
 ; CHECK-NEXT:    mov v2.h[2], w8
-; CHECK-NEXT:    lsl w8, w10, #16
-; CHECK-NEXT:    fmov w10, s4
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    fcmp s7, s3
-; CHECK-NEXT:    mov h3, v1.h[6]
-; CHECK-NEXT:    fmov s4, w8
-; CHECK-NEXT:    mov h1, v1.h[7]
-; CHECK-NEXT:    fmov s6, w9
-; CHECK-NEXT:    fmov w9, s5
 ; CHECK-NEXT:    csetm w8, hi
+; CHECK-NEXT:    fcmp s4, s3
+; CHECK-NEXT:    shll v3.4s, v5.4h, #16
+; CHECK-NEXT:    shll v4.4s, v6.4h, #16
+; CHECK-NEXT:    dup v5.8h, v1.h[6]
+; CHECK-NEXT:    dup v6.8h, v0.h[6]
+; CHECK-NEXT:    dup v1.8h, v1.h[7]
+; CHECK-NEXT:    dup v0.8h, v0.h[7]
 ; CHECK-NEXT:    mov v2.h[3], w8
-; CHECK-NEXT:    lsl w8, w10, #16
-; CHECK-NEXT:    fcmp s6, s4
-; CHECK-NEXT:    mov h4, v0.h[6]
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    fmov s5, w8
-; CHECK-NEXT:    mov h0, v0.h[7]
-; CHECK-NEXT:    fmov s6, w9
 ; CHECK-NEXT:    csetm w8, hi
+; CHECK-NEXT:    fcmp s4, s3
+; CHECK-NEXT:    shll v3.4s, v5.4h, #16
+; CHECK-NEXT:    shll v4.4s, v6.4h, #16
+; CHECK-NEXT:    shll v1.4s, v1.4h, #16
+; CHECK-NEXT:    shll v0.4s, v0.4h, #16
 ; CHECK-NEXT:    mov v2.h[4], w8
-; CHECK-NEXT:    fmov w8, s3
-; CHECK-NEXT:    fmov w9, s4
-; CHECK-NEXT:    fcmp s6, s5
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    csetm w10, hi
-; CHECK-NEXT:    fmov s3, w8
-; CHECK-NEXT:    fmov s4, w9
-; CHECK-NEXT:    fmov w8, s1
-; CHECK-NEXT:    fmov w9, s0
-; CHECK-NEXT:    mov v2.h[5], w10
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    fcmp s4, s3
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    fmov s0, w8
-; CHECK-NEXT:    fmov s1, w9
 ; CHECK-NEXT:    csetm w8, hi
+; CHECK-NEXT:    fcmp s4, s3
+; CHECK-NEXT:    mov v2.h[5], w8
+; CHECK-NEXT:    csetm w8, hi
+; CHECK-NEXT:    fcmp s0, s1
 ; CHECK-NEXT:    mov v2.h[6], w8
-; CHECK-NEXT:    fcmp s1, s0
 ; CHECK-NEXT:    csetm w8, hi
 ; CHECK-NEXT:    mov v2.h[7], w8
 ; CHECK-NEXT:    xtn v0.8b, v2.8h
@@ -1139,90 +1037,58 @@ define <8 x i1> @test_fcmp_ugt(<8 x bfloat> %a, <8 x bfloat> %b) #0 {
 define <8 x i1> @test_fcmp_uge(<8 x bfloat> %a, <8 x bfloat> %b) #0 {
 ; CHECK-LABEL: test_fcmp_uge:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov h2, v1.h[1]
-; CHECK-NEXT:    mov h3, v0.h[1]
-; CHECK-NEXT:    fmov w10, s1
-; CHECK-NEXT:    fmov w8, s2
-; CHECK-NEXT:    fmov w9, s3
-; CHECK-NEXT:    mov h2, v1.h[2]
-; CHECK-NEXT:    mov h3, v0.h[2]
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    fmov s4, w8
-; CHECK-NEXT:    fmov w8, s0
-; CHECK-NEXT:    fmov s5, w9
-; CHECK-NEXT:    lsl w9, w10, #16
-; CHECK-NEXT:    fmov w10, s3
-; CHECK-NEXT:    mov h3, v1.h[4]
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    fcmp s5, s4
-; CHECK-NEXT:    fmov s5, w9
-; CHECK-NEXT:    mov h4, v1.h[3]
-; CHECK-NEXT:    lsl w10, w10, #16
-; CHECK-NEXT:    fmov s6, w8
-; CHECK-NEXT:    fmov w8, s2
-; CHECK-NEXT:    csetm w9, pl
-; CHECK-NEXT:    fmov s16, w10
-; CHECK-NEXT:    fcmp s6, s5
-; CHECK-NEXT:    mov h5, v0.h[3]
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    fmov w10, s4
-; CHECK-NEXT:    mov h6, v0.h[4]
-; CHECK-NEXT:    mov h4, v1.h[5]
-; CHECK-NEXT:    fmov s7, w8
+; CHECK-NEXT:    dup v2.4h, v1.h[1]
+; CHECK-NEXT:    dup v3.4h, v0.h[1]
+; CHECK-NEXT:    dup v4.4h, v1.h[2]
+; CHECK-NEXT:    dup v5.4h, v0.h[2]
+; CHECK-NEXT:    dup v6.4h, v0.h[3]
+; CHECK-NEXT:    shll v2.4s, v2.4h, #16
+; CHECK-NEXT:    shll v3.4s, v3.4h, #16
+; CHECK-NEXT:    fcmp s3, s2
+; CHECK-NEXT:    shll v2.4s, v1.4h, #16
+; CHECK-NEXT:    shll v3.4s, v0.4h, #16
 ; CHECK-NEXT:    csetm w8, pl
-; CHECK-NEXT:    fmov s2, w8
-; CHECK-NEXT:    fmov w8, s5
-; CHECK-NEXT:    mov h5, v0.h[5]
-; CHECK-NEXT:    fcmp s16, s7
-; CHECK-NEXT:    mov v2.h[1], w9
-; CHECK-NEXT:    lsl w9, w10, #16
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    fmov w10, s3
-; CHECK-NEXT:    fmov s3, w9
-; CHECK-NEXT:    fmov w9, s6
-; CHECK-NEXT:    fmov s7, w8
+; CHECK-NEXT:    fcmp s3, s2
+; CHECK-NEXT:    shll v3.4s, v4.4h, #16
+; CHECK-NEXT:    shll v4.4s, v5.4h, #16
+; CHECK-NEXT:    dup v5.4h, v1.h[3]
+; CHECK-NEXT:    csetm w9, pl
+; CHECK-NEXT:    fmov s2, w9
+; CHECK-NEXT:    fcmp s4, s3
+; CHECK-NEXT:    shll v4.4s, v6.4h, #16
+; CHECK-NEXT:    shll v3.4s, v5.4h, #16
+; CHECK-NEXT:    dup v5.8h, v1.h[4]
+; CHECK-NEXT:    dup v6.8h, v0.h[4]
+; CHECK-NEXT:    mov v2.h[1], w8
 ; CHECK-NEXT:    csetm w8, pl
+; CHECK-NEXT:    fcmp s4, s3
+; CHECK-NEXT:    shll v3.4s, v5.4h, #16
+; CHECK-NEXT:    shll v4.4s, v6.4h, #16
+; CHECK-NEXT:    dup v5.8h, v1.h[5]
+; CHECK-NEXT:    dup v6.8h, v0.h[5]
 ; CHECK-NEXT:    mov v2.h[2], w8
-; CHECK-NEXT:    lsl w8, w10, #16
-; CHECK-NEXT:    fmov w10, s4
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    fcmp s7, s3
-; CHECK-NEXT:    mov h3, v1.h[6]
-; CHECK-NEXT:    fmov s4, w8
-; CHECK-NEXT:    mov h1, v1.h[7]
-; CHECK-NEXT:    fmov s6, w9
-; CHECK-NEXT:    fmov w9, s5
 ; CHECK-NEXT:    csetm w8, pl
+; CHECK-NEXT:    fcmp s4, s3
+; CHECK-NEXT:    shll v3.4s, v5.4h, #16
+; CHECK-NEXT:    shll v4.4s, v6.4h, #16
+; CHECK-NEXT:    dup v5.8h, v1.h[6]
+; CHECK-NEXT:    dup v6.8h, v0.h[6]
+; CHECK-NEXT:    dup v1.8h, v1.h[7]
+; CHECK-NEXT:    dup v0.8h, v0.h[7]
 ; CHECK-NEXT:    mov v2.h[3], w8
-; CHECK-NEXT:    lsl w8, w10, #16
-; CHECK-NEXT:    fcmp s6, s4
-; CHECK-NEXT:    mov h4, v0.h[6]
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    fmov s5, w8
-; CHECK-NEXT:    mov h0, v0.h[7]
-; CHECK-NEXT:    fmov s6, w9
 ; CHECK-NEXT:    csetm w8, pl
+; CHECK-NEXT:    fcmp s4, s3
+; CHECK-NEXT:    shll v3.4s, v5.4h, #16
+; CHECK-NEXT:    shll v4.4s, v6.4h, #16
+; CHECK-NEXT:    shll v1.4s, v1.4h, #16
+; CHECK-NEXT:    shll v0.4s, v0.4h, #16
 ; CHECK-NEXT:    mov v2.h[4], w8
-; CHECK-NEXT:    fmov w8, s3
-; CHECK-NEXT:    fmov w9, s4
-; CHECK-NEXT:    fcmp s6, s5
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    csetm w10, pl
-; CHECK-NEXT:    fmov s3, w8
-; CHECK-NEXT:    fmov s4, w9
-; CHECK-NEXT:    fmov w8, s1
-; CHECK-NEXT:    fmov w9, s0
-; CHECK-NEXT:    mov v2.h[5], w10
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    fcmp s4, s3
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    fmov s0, w8
-; CHECK-NEXT:    fmov s1, w9
 ; CHECK-NEXT:    csetm w8, pl
+; CHECK-NEXT:    fcmp s4, s3
+; CHECK-NEXT:    mov v2.h[5], w8
+; CHECK-NEXT:    csetm w8, pl
+; CHECK-NEXT:    fcmp s0, s1
 ; CHECK-NEXT:    mov v2.h[6], w8
-; CHECK-NEXT:    fcmp s1, s0
 ; CHECK-NEXT:    csetm w8, pl
 ; CHECK-NEXT:    mov v2.h[7], w8
 ; CHECK-NEXT:    xtn v0.8b, v2.8h
@@ -1234,90 +1100,58 @@ define <8 x i1> @test_fcmp_uge(<8 x bfloat> %a, <8 x bfloat> %b) #0 {
 define <8 x i1> @test_fcmp_ult(<8 x bfloat> %a, <8 x bfloat> %b) #0 {
 ; CHECK-LABEL: test_fcmp_ult:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov h2, v1.h[1]
-; CHECK-NEXT:    mov h3, v0.h[1]
-; CHECK-NEXT:    fmov w10, s1
-; CHECK-NEXT:    fmov w8, s2
-; CHECK-NEXT:    fmov w9, s3
-; CHECK-NEXT:    mov h2, v1.h[2]
-; CHECK-NEXT:    mov h3, v0.h[2]
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    fmov s4, w8
-; CHECK-NEXT:    fmov w8, s0
-; CHECK-NEXT:    fmov s5, w9
-; CHECK-NEXT:    lsl w9, w10, #16
-; CHECK-NEXT:    fmov w10, s3
-; CHECK-NEXT:    mov h3, v1.h[4]
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    fcmp s5, s4
-; CHECK-NEXT:    fmov s5, w9
-; CHECK-NEXT:    mov h4, v1.h[3]
-; CHECK-NEXT:    lsl w10, w10, #16
-; CHECK-NEXT:    fmov s6, w8
-; CHECK-NEXT:    fmov w8, s2
-; CHECK-NEXT:    csetm w9, lt
-; CHECK-NEXT:    fmov s16, w10
-; CHECK-NEXT:    fcmp s6, s5
-; CHECK-NEXT:    mov h5, v0.h[3]
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    fmov w10, s4
-; CHECK-NEXT:    mov h6, v0.h[4]
-; CHECK-NEXT:    mov h4, v1.h[5]
-; CHECK-NEXT:    fmov s7, w8
+; CHECK-NEXT:    dup v2.4h, v1.h[1]
+; CHECK-NEXT:    dup v3.4h, v0.h[1]
+; CHECK-NEXT:    dup v4.4h, v1.h[2]
+; CHECK-NEXT:    dup v5.4h, v0.h[2]
+; CHECK-NEXT:    dup v6.4h, v0.h[3]
+; CHECK-NEXT:    shll v2.4s, v2.4h, #16
+; CHECK-NEXT:    shll v3.4s, v3.4h, #16
+; CHECK-NEXT:    fcmp s3, s2
+; CHECK-NEXT:    shll v2.4s, v1.4h, #16
+; CHECK-NEXT:    shll v3.4s, v0.4h, #16
 ; CHECK-NEXT:    csetm w8, lt
-; CHECK-NEXT:    fmov s2, w8
-; CHECK-NEXT:    fmov w8, s5
-; CHECK-NEXT:    mov h5, v0.h[5]
-; CHECK-NEXT:    fcmp s16, s7
-; CHECK-NEXT:    mov v2.h[1], w9
-; CHECK-NEXT:    lsl w9, w10, #16
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    fmov w10, s3
-; CHECK-NEXT:    fmov s3, w9
-; CHECK-NEXT:    fmov w9, s6
-; CHECK-NEXT:    fmov s7, w8
+; CHECK-NEXT:    fcmp s3, s2
+; CHECK-NEXT:    shll v3.4s, v4.4h, #16
+; CHECK-NEXT:    shll v4.4s, v5.4h, #16
+; CHECK-NEXT:    dup v5.4h, v1.h[3]
+; CHECK-NEXT:    csetm w9, lt
+; CHECK-NEXT:    fmov s2, w9
+; CHECK-NEXT:    fcmp s4, s3
+; CHECK-NEXT:    shll v4.4s, v6.4h, #16
+; CHECK-NEXT:    shll v3.4s, v5.4h, #16
+; CHECK-NEXT:    dup v5.8h, v1.h[4]
+; CHECK-NEXT:    dup v6.8h, v0.h[4]
+; CHECK-NEXT:    mov v2.h[1], w8
 ; CHECK-NEXT:    csetm w8, lt
+; CHECK-NEXT:    fcmp s4, s3
+; CHECK-NEXT:    shll v3.4s, v5.4h, #16
+; CHECK-NEXT:    shll v4.4s, v6.4h, #16
+; CHECK-NEXT:    dup v5.8h, v1.h[5]
+; CHECK-NEXT:    dup v6.8h, v0.h[5]
 ; CHECK-NEXT:    mov v2.h[2], w8
-; CHECK-NEXT:    lsl w8, w10, #16
-; CHECK-NEXT:    fmov w10, s4
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    fcmp s7, s3
-; CHECK-NEXT:    mov h3, v1.h[6]
-; CHECK-NEXT:    fmov s4, w8
-; CHECK-NEXT:    mov h1, v1.h[7]
-; CHECK-NEXT:    fmov s6, w9
-; CHECK-NEXT:    fmov w9, s5
 ; CHECK-NEXT:    csetm w8, lt
+; CHECK-NEXT:    fcmp s4, s3
+; CHECK-NEXT:    shll v3.4s, v5.4h, #16
+; CHECK-NEXT:    shll v4.4s, v6.4h, #16
+; CHECK-NEXT:    dup v5.8h, v1.h[6]
+; CHECK-NEXT:    dup v6.8h, v0.h[6]
+; CHECK-NEXT:    dup v1.8h, v1.h[7]
+; CHECK-NEXT:    dup v0.8h, v0.h[7]
 ; CHECK-NEXT:    mov v2.h[3], w8
-; CHECK-NEXT:    lsl w8, w10, #16
-; CHECK-NEXT:    fcmp s6, s4
-; CHECK-NEXT:    mov h4, v0.h[6]
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    fmov s5, w8
-; CHECK-NEXT:    mov h0, v0.h[7]
-; CHECK-NEXT:    fmov s6, w9
 ; CHECK-NEXT:    csetm w8, lt
+; CHECK-NEXT:    fcmp s4, s3
+; CHECK-NEXT:    shll v3.4s, v5.4h, #16
+; CHECK-NEXT:    shll v4.4s, v6.4h, #16
+; CHECK-NEXT:    shll v1.4s, v1.4h, #16
+; CHECK-NEXT:    shll v0.4s, v0.4h, #16
 ; CHECK-NEXT:    mov v2.h[4], w8
-; CHECK-NEXT:    fmov w8, s3
-; CHECK-NEXT:    fmov w9, s4
-; CHECK-NEXT:    fcmp s6, s5
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    csetm w10, lt
-; CHECK-NEXT:    fmov s3, w8
-; CHECK-NEXT:    fmov s4, w9
-; CHECK-NEXT:    fmov w8, s1
-; CHECK-NEXT:    fmov w9, s0
-; CHECK-NEXT:    mov v2.h[5], w10
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    fcmp s4, s3
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    fmov s0, w8
-; CHECK-NEXT:    fmov s1, w9
 ; CHECK-NEXT:    csetm w8, lt
+; CHECK-NEXT:    fcmp s4, s3
+; CHECK-NEXT:    mov v2.h[5], w8
+; CHECK-NEXT:    csetm w8, lt
+; CHECK-NEXT:    fcmp s0, s1
 ; CHECK-NEXT:    mov v2.h[6], w8
-; CHECK-NEXT:    fcmp s1, s0
 ; CHECK-NEXT:    csetm w8, lt
 ; CHECK-NEXT:    mov v2.h[7], w8
 ; CHECK-NEXT:    xtn v0.8b, v2.8h
@@ -1329,90 +1163,58 @@ define <8 x i1> @test_fcmp_ult(<8 x bfloat> %a, <8 x bfloat> %b) #0 {
 define <8 x i1> @test_fcmp_ule(<8 x bfloat> %a, <8 x bfloat> %b) #0 {
 ; CHECK-LABEL: test_fcmp_ule:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov h2, v1.h[1]
-; CHECK-NEXT:    mov h3, v0.h[1]
-; CHECK-NEXT:    fmov w10, s1
-; CHECK-NEXT:    fmov w8, s2
-; CHECK-NEXT:    fmov w9, s3
-; CHECK-NEXT:    mov h2, v1.h[2]
-; CHECK-NEXT:    mov h3, v0.h[2]
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    fmov s4, w8
-; CHECK-NEXT:    fmov w8, s0
-; CHECK-NEXT:    fmov s5, w9
-; CHECK-NEXT:    lsl w9, w10, #16
-; CHECK-NEXT:    fmov w10, s3
-; CHECK-NEXT:    mov h3, v1.h[4]
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    fcmp s5, s4
-; CHECK-NEXT:    fmov s5, w9
-; CHECK-NEXT:    mov h4, v1.h[3]
-; CHECK-NEXT:    lsl w10, w10, #16
-; CHECK-NEXT:    fmov s6, w8
-; CHECK-NEXT:    fmov w8, s2
-; CHECK-NEXT:    csetm w9, le
-; CHECK-NEXT:    fmov s16, w10
-; CHECK-NEXT:    fcmp s6, s5
-; CHECK-NEXT:    mov h5, v0.h[3]
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    fmov w10, s4
-; CHECK-NEXT:    mov h6, v0.h[4]
-; CHECK-NEXT:    mov h4, v1.h[5]
-; CHECK-NEXT:    fmov s7, w8
+; CHECK-NEXT:    dup v2.4h, v1.h[1]
+; CHECK-NEXT:    dup v3.4h, v0.h[1]
+; CHECK-NEXT:    dup v4.4h, v1.h[2]
+; CHECK-NEXT:    dup v5.4h, v0.h[2]
+; CHECK-NEXT:    dup v6.4h, v0.h[3]
+; CHECK-NEXT:    shll v2.4s, v2.4h, #16
+; CHECK-NEXT:    shll v3.4s, v3.4h, #16
+; CHECK-NEXT:    fcmp s3, s2
+; CHECK-NEXT:    shll v2.4s, v1.4h, #16
+; CHECK-NEXT:    shll v3.4s, v0.4h, #16
 ; CHECK-NEXT:    csetm w8, le
-; CHECK-NEXT:    fmov s2, w8
-; CHECK-NEXT:    fmov w8, s5
-; CHECK-NEXT:    mov h5, v0.h[5]
-; CHECK-NEXT:    fcmp s16, s7
-; CHECK-NEXT:    mov v2.h[1], w9
-; CHECK-NEXT:    lsl w9, w10, #16
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    fmov w10, s3
-; CHECK-NEXT:    fmov s3, w9
-; CHECK-NEXT:    fmov w9, s6
-; CHECK-NEXT:    fmov s7, w8
+; CHECK-NEXT:    fcmp s3, s2
+; CHECK-NEXT:    shll v3.4s, v4.4h, #16
+; CHECK-NEXT:    shll v4.4s, v5.4h, #16
+; CHECK-NEXT:    dup v5.4h, v1.h[3]
+; CHECK-NEXT:    csetm w9, le
+; CHECK-NEXT:    fmov s2, w9
+; CHECK-NEXT:    fcmp s4, s3
+; CHECK-NEXT:    shll v4.4s, v6.4h, #16
+; CHECK-NEXT:    shll v3.4s, v5.4h, #16
+; CHECK-NEXT:    dup v5.8h, v1.h[4]
+; CHECK-NEXT:    dup v6.8h, v0.h[4]
+; CHECK-NEXT:    mov v2.h[1], w8
 ; CHECK-NEXT:    csetm w8, le
+; CHECK-NEXT:    fcmp s4, s3
+; CHECK-NEXT:    shll v3.4s, v5.4h, #16
+; CHECK-NEXT:    shll v4.4s, v6.4h, #16
+; CHECK-NEXT:    dup v5.8h, v1.h[5]
+; CHECK-NEXT:    dup v6.8h, v0.h[5]
 ; CHECK-NEXT:    mov v2.h[2], w8
-; CHECK-NEXT:    lsl w8, w10, #16
-; CHECK-NEXT:    fmov w10, s4
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    fcmp s7, s3
-; CHECK-NEXT:    mov h3, v1.h[6]
-; CHECK-NEXT:    fmov s4, w8
-; CHECK-NEXT:    mov h1, v1.h[7]
-; CHECK-NEXT:    fmov s6, w9
-; CHECK-NEXT:    fmov w9, s5
 ; CHECK-NEXT:    csetm w8, le
+; CHECK-NEXT:    fcmp s4, s3
+; CHECK-NEXT:    shll v3.4s, v5.4h, #16
+; CHECK-NEXT:    shll v4.4s, v6.4h, #16
+; CHECK-NEXT:    dup v5.8h, v1.h[6]
+; CHECK-NEXT:    dup v6.8h, v0.h[6]
+; CHECK-NEXT:    dup v1.8h, v1.h[7]
+; CHECK-NEXT:    dup v0.8h, v0.h[7]
 ; CHECK-NEXT:    mov v2.h[3], w8
-; CHECK-NEXT:    lsl w8, w10, #16
-; CHECK-NEXT:    fcmp s6, s4
-; CHECK-NEXT:    mov h4, v0.h[6]
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    fmov s5, w8
-; CHECK-NEXT:    mov h0, v0.h[7]
-; CHECK-NEXT:    fmov s6, w9
 ; CHECK-NEXT:    csetm w8, le
+; CHECK-NEXT:    fcmp s4, s3
+; CHECK-NEXT:    shll v3.4s, v5.4h, #16
+; CHECK-NEXT:    shll v4.4s, v6.4h, #16
+; CHECK-NEXT:    shll v1.4s, v1.4h, #16
+; CHECK-NEXT:    shll v0.4s, v0.4h, #16
 ; CHECK-NEXT:    mov v2.h[4], w8
-; CHECK-NEXT:    fmov w8, s3
-; CHECK-NEXT:    fmov w9, s4
-; CHECK-NEXT:    fcmp s6, s5
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    csetm w10, le
-; CHECK-NEXT:    fmov s3, w8
-; CHECK-NEXT:    fmov s4, w9
-; CHECK-NEXT:    fmov w8, s1
-; CHECK-NEXT:    fmov w9, s0
-; CHECK-NEXT:    mov v2.h[5], w10
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    fcmp s4, s3
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    fmov s0, w8
-; CHECK-NEXT:    fmov s1, w9
 ; CHECK-NEXT:    csetm w8, le
+; CHECK-NEXT:    fcmp s4, s3
+; CHECK-NEXT:    mov v2.h[5], w8
+; CHECK-NEXT:    csetm w8, le
+; CHECK-NEXT:    fcmp s0, s1
 ; CHECK-NEXT:    mov v2.h[6], w8
-; CHECK-NEXT:    fcmp s1, s0
 ; CHECK-NEXT:    csetm w8, le
 ; CHECK-NEXT:    mov v2.h[7], w8
 ; CHECK-NEXT:    xtn v0.8b, v2.8h
@@ -1424,90 +1226,58 @@ define <8 x i1> @test_fcmp_ule(<8 x bfloat> %a, <8 x bfloat> %b) #0 {
 define <8 x i1> @test_fcmp_uno(<8 x bfloat> %a, <8 x bfloat> %b) #0 {
 ; CHECK-LABEL: test_fcmp_uno:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov h2, v1.h[1]
-; CHECK-NEXT:    mov h3, v0.h[1]
-; CHECK-NEXT:    fmov w10, s1
-; CHECK-NEXT:    fmov w8, s2
-; CHECK-NEXT:    fmov w9, s3
-; CHECK-NEXT:    mov h2, v1.h[2]
-; CHECK-NEXT:    mov h3, v0.h[2]
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    fmov s4, w8
-; CHECK-NEXT:    fmov w8, s0
-; CHECK-NEXT:    fmov s5, w9
-; CHECK-NEXT:    lsl w9, w10, #16
-; CHECK-NEXT:    fmov w10, s3
-; CHECK-NEXT:    mov h3, v1.h[4]
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    fcmp s5, s4
-; CHECK-NEXT:    fmov s5, w9
-; CHECK-NEXT:    mov h4, v1.h[3]
-; CHECK-NEXT:    lsl w10, w10, #16
-; CHECK-NEXT:    fmov s6, w8
-; CHECK-NEXT:    fmov w8, s2
-; CHECK-NEXT:    csetm w9, vs
-; CHECK-NEXT:    fmov s16, w10
-; CHECK-NEXT:    fcmp s6, s5
-; CHECK-NEXT:    mov h5, v0.h[3]
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    fmov w10, s4
-; CHECK-NEXT:    mov h6, v0.h[4]
-; CHECK-NEXT:    mov h4, v1.h[5]
-; CHECK-NEXT:    fmov s7, w8
+; CHECK-NEXT:    dup v2.4h, v1.h[1]
+; CHECK-NEXT:    dup v3.4h, v0.h[1]
+; CHECK-NEXT:    dup v4.4h, v1.h[2]
+; CHECK-NEXT:    dup v5.4h, v0.h[2]
+; CHECK-NEXT:    dup v6.4h, v0.h[3]
+; CHECK-NEXT:    shll v2.4s, v2.4h, #16
+; CHECK-NEXT:    shll v3.4s, v3.4h, #16
+; CHECK-NEXT:    fcmp s3, s2
+; CHECK-NEXT:    shll v2.4s, v1.4h, #16
+; CHECK-NEXT:    shll v3.4s, v0.4h, #16
 ; CHECK-NEXT:    csetm w8, vs
-; CHECK-NEXT:    fmov s2, w8
-; CHECK-NEXT:    fmov w8, s5
-; CHECK-NEXT:    mov h5, v0.h[5]
-; CHECK-NEXT:    fcmp s16, s7
-; CHECK-NEXT:    mov v2.h[1], w9
-; CHECK-NEXT:    lsl w9, w10, #16
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    fmov w10, s3
-; CHECK-NEXT:    fmov s3, w9
-; CHECK-NEXT:    fmov w9, s6
-; CHECK-NEXT:    fmov s7, w8
+; CHECK-NEXT:    fcmp s3, s2
+; CHECK-NEXT:    shll v3.4s, v4.4h, #16
+; CHECK-NEXT:    shll v4.4s, v5.4h, #16
+; CHECK-NEXT:    dup v5.4h, v1.h[3]
+; CHECK-NEXT:    csetm w9, vs
+; CHECK-NEXT:    fmov s2, w9
+; CHECK-NEXT:    fcmp s4, s3
+; CHECK-NEXT:    shll v4.4s, v6.4h, #16
+; CHECK-NEXT:    shll v3.4s, v5.4h, #16
+; CHECK-NEXT:    dup v5.8h, v1.h[4]
+; CHECK-NEXT:    dup v6.8h, v0.h[4]
+; CHECK-NEXT:    mov v2.h[1], w8
 ; CHECK-NEXT:    csetm w8, vs
+; CHECK-NEXT:    fcmp s4, s3
+; CHECK-NEXT:    shll v3.4s, v5.4h, #16
+; CHECK-NEXT:    shll v4.4s, v6.4h, #16
+; CHECK-NEXT:    dup v5.8h, v1.h[5]
+; CHECK-NEXT:    dup v6.8h, v0.h[5]
 ; CHECK-NEXT:    mov v2.h[2], w8
-; CHECK-NEXT:    lsl w8, w10, #16
-; CHECK-NEXT:    fmov w10, s4
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    fcmp s7, s3
-; CHECK-NEXT:    mov h3, v1.h[6]
-; CHECK-NEXT:    fmov s4, w8
-; CHECK-NEXT:    mov h1, v1.h[7]
-; CHECK-NEXT:    fmov s6, w9
-; CHECK-NEXT:    fmov w9, s5
 ; CHECK-NEXT:    csetm w8, vs
+; CHECK-NEXT:    fcmp s4, s3
+; CHECK-NEXT:    shll v3.4s, v5.4h, #16
+; CHECK-NEXT:    shll v4.4s, v6.4h, #16
+; CHECK-NEXT:    dup v5.8h, v1.h[6]
+; CHECK-NEXT:    dup v6.8h, v0.h[6]
+; CHECK-NEXT:    dup v1.8h, v1.h[7]
+; CHECK-NEXT:    dup v0.8h, v0.h[7]
 ; CHECK-NEXT:    mov v2.h[3], w8
-; CHECK-NEXT:    lsl w8, w10, #16
-; CHECK-NEXT:    fcmp s6, s4
-; CHECK-NEXT:    mov h4, v0.h[6]
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    fmov s5, w8
-; CHECK-NEXT:    mov h0, v0.h[7]
-; CHECK-NEXT:    fmov s6, w9
 ; CHECK-NEXT:    csetm w8, vs
+; CHECK-NEXT:    fcmp s4, s3
+; CHECK-NEXT:    shll v3.4s, v5.4h, #16
+; CHECK-NEXT:    shll v4.4s, v6.4h, #16
+; CHECK-NEXT:    shll v1.4s, v1.4h, #16
+; CHECK-NEXT:    shll v0.4s, v0.4h, #16
 ; CHECK-NEXT:    mov v2.h[4], w8
-; CHECK-NEXT:    fmov w8, s3
-; CHECK-NEXT:    fmov w9, s4
-; CHECK-NEXT:    fcmp s6, s5
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    csetm w10, vs
-; CHECK-NEXT:    fmov s3, w8
-; CHECK-NEXT:    fmov s4, w9
-; CHECK-NEXT:    fmov w8, s1
-; CHECK-NEXT:    fmov w9, s0
-; CHECK-NEXT:    mov v2.h[5], w10
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    fcmp s4, s3
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    fmov s0, w8
-; CHECK-NEXT:    fmov s1, w9
 ; CHECK-NEXT:    csetm w8, vs
+; CHECK-NEXT:    fcmp s4, s3
+; CHECK-NEXT:    mov v2.h[5], w8
+; CHECK-NEXT:    csetm w8, vs
+; CHECK-NEXT:    fcmp s0, s1
 ; CHECK-NEXT:    mov v2.h[6], w8
-; CHECK-NEXT:    fcmp s1, s0
 ; CHECK-NEXT:    csetm w8, vs
 ; CHECK-NEXT:    mov v2.h[7], w8
 ; CHECK-NEXT:    xtn v0.8b, v2.8h
@@ -1519,96 +1289,64 @@ define <8 x i1> @test_fcmp_uno(<8 x bfloat> %a, <8 x bfloat> %b) #0 {
 define <8 x i1> @test_fcmp_one(<8 x bfloat> %a, <8 x bfloat> %b) #0 {
 ; CHECK-LABEL: test_fcmp_one:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov h2, v1.h[1]
-; CHECK-NEXT:    mov h3, v0.h[1]
-; CHECK-NEXT:    fmov w10, s1
-; CHECK-NEXT:    fmov w11, s0
-; CHECK-NEXT:    fmov w8, s2
-; CHECK-NEXT:    fmov w9, s3
-; CHECK-NEXT:    mov h2, v1.h[2]
-; CHECK-NEXT:    mov h3, v0.h[2]
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    fmov s4, w8
-; CHECK-NEXT:    fmov s5, w9
-; CHECK-NEXT:    lsl w8, w10, #16
-; CHECK-NEXT:    lsl w9, w11, #16
-; CHECK-NEXT:    fmov s6, w8
-; CHECK-NEXT:    fmov w8, s2
-; CHECK-NEXT:    fcmp s5, s4
-; CHECK-NEXT:    fmov s7, w9
-; CHECK-NEXT:    fmov w9, s3
-; CHECK-NEXT:    mov h4, v1.h[3]
-; CHECK-NEXT:    mov h5, v0.h[3]
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    csetm w10, mi
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    csinv w10, w10, wzr, le
-; CHECK-NEXT:    fcmp s7, s6
-; CHECK-NEXT:    fmov s2, w8
-; CHECK-NEXT:    fmov s3, w9
-; CHECK-NEXT:    fmov w11, s4
-; CHECK-NEXT:    fmov w8, s5
-; CHECK-NEXT:    mov h4, v0.h[4]
-; CHECK-NEXT:    mov h7, v1.h[5]
+; CHECK-NEXT:    dup v2.4h, v1.h[1]
+; CHECK-NEXT:    dup v3.4h, v0.h[1]
+; CHECK-NEXT:    dup v4.4h, v1.h[2]
+; CHECK-NEXT:    dup v5.4h, v0.h[2]
+; CHECK-NEXT:    dup v6.4h, v0.h[3]
+; CHECK-NEXT:    shll v2.4s, v2.4h, #16
+; CHECK-NEXT:    shll v3.4s, v3.4h, #16
+; CHECK-NEXT:    fcmp s3, s2
+; CHECK-NEXT:    shll v2.4s, v1.4h, #16
+; CHECK-NEXT:    shll v3.4s, v0.4h, #16
+; CHECK-NEXT:    csetm w8, mi
+; CHECK-NEXT:    csinv w8, w8, wzr, le
+; CHECK-NEXT:    fcmp s3, s2
+; CHECK-NEXT:    shll v3.4s, v4.4h, #16
+; CHECK-NEXT:    shll v4.4s, v5.4h, #16
+; CHECK-NEXT:    dup v5.4h, v1.h[3]
 ; CHECK-NEXT:    csetm w9, mi
 ; CHECK-NEXT:    csinv w9, w9, wzr, le
-; CHECK-NEXT:    fcmp s3, s2
-; CHECK-NEXT:    mov h3, v1.h[4]
+; CHECK-NEXT:    fcmp s4, s3
+; CHECK-NEXT:    shll v4.4s, v6.4h, #16
 ; CHECK-NEXT:    fmov s2, w9
-; CHECK-NEXT:    lsl w11, w11, #16
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    fmov s5, w11
-; CHECK-NEXT:    fmov s6, w8
+; CHECK-NEXT:    shll v3.4s, v5.4h, #16
+; CHECK-NEXT:    dup v5.8h, v1.h[4]
+; CHECK-NEXT:    dup v6.8h, v0.h[4]
+; CHECK-NEXT:    mov v2.h[1], w8
 ; CHECK-NEXT:    csetm w8, mi
-; CHECK-NEXT:    mov v2.h[1], w10
-; CHECK-NEXT:    fmov w9, s3
-; CHECK-NEXT:    fmov w10, s4
 ; CHECK-NEXT:    csinv w8, w8, wzr, le
-; CHECK-NEXT:    mov h3, v1.h[6]
-; CHECK-NEXT:    mov h1, v1.h[7]
-; CHECK-NEXT:    fcmp s6, s5
-; CHECK-NEXT:    mov h5, v0.h[5]
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    lsl w10, w10, #16
+; CHECK-NEXT:    fcmp s4, s3
+; CHECK-NEXT:    shll v3.4s, v5.4h, #16
+; CHECK-NEXT:    shll v4.4s, v6.4h, #16
+; CHECK-NEXT:    dup v5.8h, v1.h[5]
+; CHECK-NEXT:    dup v6.8h, v0.h[5]
 ; CHECK-NEXT:    mov v2.h[2], w8
-; CHECK-NEXT:    fmov s4, w9
-; CHECK-NEXT:    fmov s6, w10
 ; CHECK-NEXT:    csetm w8, mi
-; CHECK-NEXT:    fmov w9, s7
-; CHECK-NEXT:    fmov w10, s5
 ; CHECK-NEXT:    csinv w8, w8, wzr, le
-; CHECK-NEXT:    fcmp s6, s4
-; CHECK-NEXT:    mov h4, v0.h[6]
+; CHECK-NEXT:    fcmp s4, s3
+; CHECK-NEXT:    shll v3.4s, v5.4h, #16
+; CHECK-NEXT:    shll v4.4s, v6.4h, #16
+; CHECK-NEXT:    dup v5.8h, v1.h[6]
+; CHECK-NEXT:    dup v6.8h, v0.h[6]
+; CHECK-NEXT:    dup v1.8h, v1.h[7]
+; CHECK-NEXT:    dup v0.8h, v0.h[7]
 ; CHECK-NEXT:    mov v2.h[3], w8
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    lsl w10, w10, #16
-; CHECK-NEXT:    mov h0, v0.h[7]
-; CHECK-NEXT:    fmov s5, w9
-; CHECK-NEXT:    fmov s6, w10
-; CHECK-NEXT:    fmov w9, s3
-; CHECK-NEXT:    fmov w10, s4
 ; CHECK-NEXT:    csetm w8, mi
 ; CHECK-NEXT:    csinv w8, w8, wzr, le
+; CHECK-NEXT:    fcmp s4, s3
+; CHECK-NEXT:    shll v3.4s, v5.4h, #16
+; CHECK-NEXT:    shll v4.4s, v6.4h, #16
+; CHECK-NEXT:    shll v1.4s, v1.4h, #16
+; CHECK-NEXT:    shll v0.4s, v0.4h, #16
 ; CHECK-NEXT:    mov v2.h[4], w8
-; CHECK-NEXT:    lsl w8, w9, #16
-; CHECK-NEXT:    fcmp s6, s5
-; CHECK-NEXT:    lsl w9, w10, #16
-; CHECK-NEXT:    fmov s3, w8
-; CHECK-NEXT:    fmov w8, s1
-; CHECK-NEXT:    fmov s4, w9
-; CHECK-NEXT:    fmov w9, s0
-; CHECK-NEXT:    csetm w10, mi
-; CHECK-NEXT:    csinv w10, w10, wzr, le
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    mov v2.h[5], w10
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    fcmp s4, s3
-; CHECK-NEXT:    fmov s0, w8
-; CHECK-NEXT:    fmov s1, w9
 ; CHECK-NEXT:    csetm w8, mi
 ; CHECK-NEXT:    csinv w8, w8, wzr, le
-; CHECK-NEXT:    fcmp s1, s0
+; CHECK-NEXT:    fcmp s4, s3
+; CHECK-NEXT:    mov v2.h[5], w8
+; CHECK-NEXT:    csetm w8, mi
+; CHECK-NEXT:    csinv w8, w8, wzr, le
+; CHECK-NEXT:    fcmp s0, s1
 ; CHECK-NEXT:    mov v2.h[6], w8
 ; CHECK-NEXT:    csetm w8, mi
 ; CHECK-NEXT:    csinv w8, w8, wzr, le
@@ -1622,90 +1360,58 @@ define <8 x i1> @test_fcmp_one(<8 x bfloat> %a, <8 x bfloat> %b) #0 {
 define <8 x i1> @test_fcmp_oeq(<8 x bfloat> %a, <8 x bfloat> %b) #0 {
 ; CHECK-LABEL: test_fcmp_oeq:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov h2, v1.h[1]
-; CHECK-NEXT:    mov h3, v0.h[1]
-; CHECK-NEXT:    fmov w10, s1
-; CHECK-NEXT:    fmov w8, s2
-; CHECK-NEXT:    fmov w9, s3
-; CHECK-NEXT:    mov h2, v1.h[2]
-; CHECK-NEXT:    mov h3, v0.h[2]
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    fmov s4, w8
-; CHECK-NEXT:    fmov w8, s0
-; CHECK-NEXT:    fmov s5, w9
-; CHECK-NEXT:    lsl w9, w10, #16
-; CHECK-NEXT:    fmov w10, s3
-; CHECK-NEXT:    mov h3, v1.h[4]
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    fcmp s5, s4
-; CHECK-NEXT:    fmov s5, w9
-; CHECK-NEXT:    mov h4, v1.h[3]
-; CHECK-NEXT:    lsl w10, w10, #16
-; CHECK-NEXT:    fmov s6, w8
-; CHECK-NEXT:    fmov w8, s2
-; CHECK-NEXT:    csetm w9, eq
-; CHECK-NEXT:    fmov s16, w10
-; CHECK-NEXT:    fcmp s6, s5
-; CHECK-NEXT:    mov h5, v0.h[3]
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    fmov w10, s4
-; CHECK-NEXT:    mov h6, v0.h[4]
-; CHECK-NEXT:    mov h4, v1.h[5]
-; CHECK-NEXT:    fmov s7, w8
+; CHECK-NEXT:    dup v2.4h, v1.h[1]
+; CHECK-NEXT:    dup v3.4h, v0.h[1]
+; CHECK-NEXT:    dup v4.4h, v1.h[2]
+; CHECK-NEXT:    dup v5.4h, v0.h[2]
+; CHECK-NEXT:    dup v6.4h, v0.h[3]
+; CHECK-NEXT:    shll v2.4s, v2.4h, #16
+; CHECK-NEXT:    shll v3.4s, v3.4h, #16
+; CHECK-NEXT:    fcmp s3, s2
+; CHECK-NEXT:    shll v2.4s, v1.4h, #16
+; CHECK-NEXT:    shll v3.4s, v0.4h, #16
 ; CHECK-NEXT:    csetm w8, eq
-; CHECK-NEXT:    fmov s2, w8
-; CHECK-NEXT:    fmov w8, s5
-; CHECK-NEXT:    mov h5, v0.h[5]
-; CHECK-NEXT:    fcmp s16, s7
-; CHECK-NEXT:    mov v2.h[1], w9
-; CHECK-NEXT:    lsl w9, w10, #16
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    fmov w10, s3
-; CHECK-NEXT:    fmov s3, w9
-; CHECK-NEXT:    fmov w9, s6
-; CHECK-NEXT:    fmov s7, w8
+; CHECK-NEXT:    fcmp s3, s2
+; CHECK-NEXT:    shll v3.4s, v4.4h, #16
+; CHECK-NEXT:    shll v4.4s, v5.4h, #16
+; CHECK-NEXT:    dup v5.4h, v1.h[3]
+; CHECK-NEXT:    csetm w9, eq
+; CHECK-NEXT:    fmov s2, w9
+; CHECK-NEXT:    fcmp s4, s3
+; CHECK-NEXT:    shll v4.4s, v6.4h, #16
+; CHECK-NEXT:    shll v3.4s, v5.4h, #16
+; CHECK-NEXT:    dup v5.8h, v1.h[4]
+; CHECK-NEXT:    dup v6.8h, v0.h[4]
+; CHECK-NEXT:    mov v2.h[1], w8
 ; CHECK-NEXT:    csetm w8, eq
+; CHECK-NEXT:    fcmp s4, s3
+; CHECK-NEXT:    shll v3.4s, v5.4h, #16
+; CHECK-NEXT:    shll v4.4s, v6.4h, #16
+; CHECK-NEXT:    dup v5.8h, v1.h[5]
+; CHECK-NEXT:    dup v6.8h, v0.h[5]
 ; CHECK-NEXT:    mov v2.h[2], w8
-; CHECK-NEXT:    lsl w8, w10, #16
-; CHECK-NEXT:    fmov w10, s4
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    fcmp s7, s3
-; CHECK-NEXT:    mov h3, v1.h[6]
-; CHECK-NEXT:    fmov s4, w8
-; CHECK-NEXT:    mov h1, v1.h[7]
-; CHECK-NEXT:    fmov s6, w9
-; CHECK-NEXT:    fmov w9, s5
 ; CHECK-NEXT:    csetm w8, eq
+; CHECK-NEXT:    fcmp s4, s3
+; CHECK-NEXT:    shll v3.4s, v5.4h, #16
+; CHECK-NEXT:    shll v4.4s, v6.4h, #16
+; CHECK-NEXT:    dup v5.8h, v1.h[6]
+; CHECK-NEXT:    dup v6.8h, v0.h[6]
+; CHECK-NEXT:    dup v1.8h, v1.h[7]
+; CHECK-NEXT:    dup v0.8h, v0.h[7]
 ; CHECK-NEXT:    mov v2.h[3], w8
-; CHECK-NEXT:    lsl w8, w10, #16
-; CHECK-NEXT:    fcmp s6, s4
-; CHECK-NEXT:    mov h4, v0.h[6]
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    fmov s5, w8
-; CHECK-NEXT:    mov h0, v0.h[7]
-; CHECK-NEXT:    fmov s6, w9
 ; CHECK-NEXT:    csetm w8, eq
+; CHECK-NEXT:    fcmp s4, s3
+; CHECK-NEXT:    shll v3.4s, v5.4h, #16
+; CHECK-NEXT:    shll v4.4s, v6.4h, #16
+; CHECK-NEXT:    shll v1.4s, v1.4h, #16
+; CHECK-NEXT:    shll v0.4s, v0.4h, #16
 ; CHECK-NEXT:    mov v2.h[4], w8
-; CHECK-NEXT:    fmov w8, s3
-; CHECK-NEXT:    fmov w9, s4
-; CHECK-NEXT:    fcmp s6, s5
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    csetm w10, eq
-; CHECK-NEXT:    fmov s3, w8
-; CHECK-NEXT:    fmov s4, w9
-; CHECK-NEXT:    fmov w8, s1
-; CHECK-NEXT:    fmov w9, s0
-; CHECK-NEXT:    mov v2.h[5], w10
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    fcmp s4, s3
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    fmov s0, w8
-; CHECK-NEXT:    fmov s1, w9
 ; CHECK-NEXT:    csetm w8, eq
+; CHECK-NEXT:    fcmp s4, s3
+; CHECK-NEXT:    mov v2.h[5], w8
+; CHECK-NEXT:    csetm w8, eq
+; CHECK-NEXT:    fcmp s0, s1
 ; CHECK-NEXT:    mov v2.h[6], w8
-; CHECK-NEXT:    fcmp s1, s0
 ; CHECK-NEXT:    csetm w8, eq
 ; CHECK-NEXT:    mov v2.h[7], w8
 ; CHECK-NEXT:    xtn v0.8b, v2.8h
@@ -1717,90 +1423,58 @@ define <8 x i1> @test_fcmp_oeq(<8 x bfloat> %a, <8 x bfloat> %b) #0 {
 define <8 x i1> @test_fcmp_ogt(<8 x bfloat> %a, <8 x bfloat> %b) #0 {
 ; CHECK-LABEL: test_fcmp_ogt:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov h2, v1.h[1]
-; CHECK-NEXT:    mov h3, v0.h[1]
-; CHECK-NEXT:    fmov w10, s1
-; CHECK-NEXT:    fmov w8, s2
-; CHECK-NEXT:    fmov w9, s3
-; CHECK-NEXT:    mov h2, v1.h[2]
-; CHECK-NEXT:    mov h3, v0.h[2]
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    fmov s4, w8
-; CHECK-NEXT:    fmov w8, s0
-; CHECK-NEXT:    fmov s5, w9
-; CHECK-NEXT:    lsl w9, w10, #16
-; CHECK-NEXT:    fmov w10, s3
-; CHECK-NEXT:    mov h3, v1.h[4]
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    fcmp s5, s4
-; CHECK-NEXT:    fmov s5, w9
-; CHECK-NEXT:    mov h4, v1.h[3]
-; CHECK-NEXT:    lsl w10, w10, #16
-; CHECK-NEXT:    fmov s6, w8
-; CHECK-NEXT:    fmov w8, s2
-; CHECK-NEXT:    csetm w9, gt
-; CHECK-NEXT:    fmov s16, w10
-; CHECK-NEXT:    fcmp s6, s5
-; CHECK-NEXT:    mov h5, v0.h[3]
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    fmov w10, s4
-; CHECK-NEXT:    mov h6, v0.h[4]
-; CHECK-NEXT:    mov h4, v1.h[5]
-; CHECK-NEXT:    fmov s7, w8
+; CHECK-NEXT:    dup v2.4h, v1.h[1]
+; CHECK-NEXT:    dup v3.4h, v0.h[1]
+; CHECK-NEXT:    dup v4.4h, v1.h[2]
+; CHECK-NEXT:    dup v5.4h, v0.h[2]
+; CHECK-NEXT:    dup v6.4h, v0.h[3]
+; CHECK-NEXT:    shll v2.4s, v2.4h, #16
+; CHECK-NEXT:    shll v3.4s, v3.4h, #16
+; CHECK-NEXT:    fcmp s3, s2
+; CHECK-NEXT:    shll v2.4s, v1.4h, #16
+; CHECK-NEXT:    shll v3.4s, v0.4h, #16
 ; CHECK-NEXT:    csetm w8, gt
-; CHECK-NEXT:    fmov s2, w8
-; CHECK-NEXT:    fmov w8, s5
-; CHECK-NEXT:    mov h5, v0.h[5]
-; CHECK-NEXT:    fcmp s16, s7
-; CHECK-NEXT:    mov v2.h[1], w9
-; CHECK-NEXT:    lsl w9, w10, #16
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    fmov w10, s3
-; CHECK-NEXT:    fmov s3, w9
-; CHECK-NEXT:    fmov w9, s6
-; CHECK-NEXT:    fmov s7, w8
+; CHECK-NEXT:    fcmp s3, s2
+; CHECK-NEXT:    shll v3.4s, v4.4h, #16
+; CHECK-NEXT:    shll v4.4s, v5.4h, #16
+; CHECK-NEXT:    dup v5.4h, v1.h[3]
+; CHECK-NEXT:    csetm w9, gt
+; CHECK-NEXT:    fmov s2, w9
+; CHECK-NEXT:    fcmp s4, s3
+; CHECK-NEXT:    shll v4.4s, v6.4h, #16
+; CHECK-NEXT:    shll v3.4s, v5.4h, #16
+; CHECK-NEXT:    dup v5.8h, v1.h[4]
+; CHECK-NEXT:    dup v6.8h, v0.h[4]
+; CHECK-NEXT:    mov v2.h[1], w8
 ; CHECK-NEXT:    csetm w8, gt
+; CHECK-NEXT:    fcmp s4, s3
+; CHECK-NEXT:    shll v3.4s, v5.4h, #16
+; CHECK-NEXT:    shll v4.4s, v6.4h, #16
+; CHECK-NEXT:    dup v5.8h, v1.h[5]
+; CHECK-NEXT:    dup v6.8h, v0.h[5]
 ; CHECK-NEXT:    mov v2.h[2], w8
-; CHECK-NEXT:    lsl w8, w10, #16
-; CHECK-NEXT:    fmov w10, s4
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    fcmp s7, s3
-; CHECK-NEXT:    mov h3, v1.h[6]
-; CHECK-NEXT:    fmov s4, w8
-; CHECK-NEXT:    mov h1, v1.h[7]
-; CHECK-NEXT:    fmov s6, w9
-; CHECK-NEXT:    fmov w9, s5
 ; CHECK-NEXT:    csetm w8, gt
+; CHECK-NEXT:    fcmp s4, s3
+; CHECK-NEXT:    shll v3.4s, v5.4h, #16
+; CHECK-NEXT:    shll v4.4s, v6.4h, #16
+; CHECK-NEXT:    dup v5.8h, v1.h[6]
+; CHECK-NEXT:    dup v6.8h, v0.h[6]
+; CHECK-NEXT:    dup v1.8h, v1.h[7]
+; CHECK-NEXT:    dup v0.8h, v0.h[7]
 ; CHECK-NEXT:    mov v2.h[3], w8
-; CHECK-NEXT:    lsl w8, w10, #16
-; CHECK-NEXT:    fcmp s6, s4
-; CHECK-NEXT:    mov h4, v0.h[6]
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    fmov s5, w8
-; CHECK-NEXT:    mov h0, v0.h[7]
-; CHECK-NEXT:    fmov s6, w9
 ; CHECK-NEXT:    csetm w8, gt
+; CHECK-NEXT:    fcmp s4, s3
+; CHECK-NEXT:    shll v3.4s, v5.4h, #16
+; CHECK-NEXT:    shll v4.4s, v6.4h, #16
+; CHECK-NEXT:    shll v1.4s, v1.4h, #16
+; CHECK-NEXT:    shll v0.4s, v0.4h, #16
 ; CHECK-NEXT:    mov v2.h[4], w8
-; CHECK-NEXT:    fmov w8, s3
-; CHECK-NEXT:    fmov w9, s4
-; CHECK-NEXT:    fcmp s6, s5
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    csetm w10, gt
-; CHECK-NEXT:    fmov s3, w8
-; CHECK-NEXT:    fmov s4, w9
-; CHECK-NEXT:    fmov w8, s1
-; CHECK-NEXT:    fmov w9, s0
-; CHECK-NEXT:    mov v2.h[5], w10
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    fcmp s4, s3
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    fmov s0, w8
-; CHECK-NEXT:    fmov s1, w9
 ; CHECK-NEXT:    csetm w8, gt
+; CHECK-NEXT:    fcmp s4, s3
+; CHECK-NEXT:    mov v2.h[5], w8
+; CHECK-NEXT:    csetm w8, gt
+; CHECK-NEXT:    fcmp s0, s1
 ; CHECK-NEXT:    mov v2.h[6], w8
-; CHECK-NEXT:    fcmp s1, s0
 ; CHECK-NEXT:    csetm w8, gt
 ; CHECK-NEXT:    mov v2.h[7], w8
 ; CHECK-NEXT:    xtn v0.8b, v2.8h
@@ -1812,90 +1486,58 @@ define <8 x i1> @test_fcmp_ogt(<8 x bfloat> %a, <8 x bfloat> %b) #0 {
 define <8 x i1> @test_fcmp_oge(<8 x bfloat> %a, <8 x bfloat> %b) #0 {
 ; CHECK-LABEL: test_fcmp_oge:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov h2, v1.h[1]
-; CHECK-NEXT:    mov h3, v0.h[1]
-; CHECK-NEXT:    fmov w10, s1
-; CHECK-NEXT:    fmov w8, s2
-; CHECK-NEXT:    fmov w9, s3
-; CHECK-NEXT:    mov h2, v1.h[2]
-; CHECK-NEXT:    mov h3, v0.h[2]
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    fmov s4, w8
-; CHECK-NEXT:    fmov w8, s0
-; CHECK-NEXT:    fmov s5, w9
-; CHECK-NEXT:    lsl w9, w10, #16
-; CHECK-NEXT:    fmov w10, s3
-; CHECK-NEXT:    mov h3, v1.h[4]
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    fcmp s5, s4
-; CHECK-NEXT:    fmov s5, w9
-; CHECK-NEXT:    mov h4, v1.h[3]
-; CHECK-NEXT:    lsl w10, w10, #16
-; CHECK-NEXT:    fmov s6, w8
-; CHECK-NEXT:    fmov w8, s2
-; CHECK-NEXT:    csetm w9, ge
-; CHECK-NEXT:    fmov s16, w10
-; CHECK-NEXT:    fcmp s6, s5
-; CHECK-NEXT:    mov h5, v0.h[3]
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    fmov w10, s4
-; CHECK-NEXT:    mov h6, v0.h[4]
-; CHECK-NEXT:    mov h4, v1.h[5]
-; CHECK-NEXT:    fmov s7, w8
+; CHECK-NEXT:    dup v2.4h, v1.h[1]
+; CHECK-NEXT:    dup v3.4h, v0.h[1]
+; CHECK-NEXT:    dup v4.4h, v1.h[2]
+; CHECK-NEXT:    dup v5.4h, v0.h[2]
+; CHECK-NEXT:    dup v6.4h, v0.h[3]
+; CHECK-NEXT:    shll v2.4s, v2.4h, #16
+; CHECK-NEXT:    shll v3.4s, v3.4h, #16
+; CHECK-NEXT:    fcmp s3, s2
+; CHECK-NEXT:    shll v2.4s, v1.4h, #16
+; CHECK-NEXT:    shll v3.4s, v0.4h, #16
 ; CHECK-NEXT:    csetm w8, ge
-; CHECK-NEXT:    fmov s2, w8
-; CHECK-NEXT:    fmov w8, s5
-; CHECK-NEXT:    mov h5, v0.h[5]
-; CHECK-NEXT:    fcmp s16, s7
-; CHECK-NEXT:    mov v2.h[1], w9
-; CHECK-NEXT:    lsl w9, w10, #16
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    fmov w10, s3
-; CHECK-NEXT:    fmov s3, w9
-; CHECK-NEXT:    fmov w9, s6
-; CHECK-NEXT:    fmov s7, w8
+; CHECK-NEXT:    fcmp s3, s2
+; CHECK-NEXT:    shll v3.4s, v4.4h, #16
+; CHECK-NEXT:    shll v4.4s, v5.4h, #16
+; CHECK-NEXT:    dup v5.4h, v1.h[3]
+; CHECK-NEXT:    csetm w9, ge
+; CHECK-NEXT:    fmov s2, w9
+; CHECK-NEXT:    fcmp s4, s3
+; CHECK-NEXT:    shll v4.4s, v6.4h, #16
+; CHECK-NEXT:    shll v3.4s, v5.4h, #16
+; CHECK-NEXT:    dup v5.8h, v1.h[4]
+; CHECK-NEXT:    dup v6.8h, v0.h[4]
+; CHECK-NEXT:    mov v2.h[1], w8
 ; CHECK-NEXT:    csetm w8, ge
+; CHECK-NEXT:    fcmp s4, s3
+; CHECK-NEXT:    shll v3.4s, v5.4h, #16
+; CHECK-NEXT:    shll v4.4s, v6.4h, #16
+; CHECK-NEXT:    dup v5.8h, v1.h[5]
+; CHECK-NEXT:    dup v6.8h, v0.h[5]
 ; CHECK-NEXT:    mov v2.h[2], w8
-; CHECK-NEXT:    lsl w8, w10, #16
-; CHECK-NEXT:    fmov w10, s4
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    fcmp s7, s3
-; CHECK-NEXT:    mov h3, v1.h[6]
-; CHECK-NEXT:    fmov s4, w8
-; CHECK-NEXT:    mov h1, v1.h[7]
-; CHECK-NEXT:    fmov s6, w9
-; CHECK-NEXT:    fmov w9, s5
 ; CHECK-NEXT:    csetm w8, ge
+; CHECK-NEXT:    fcmp s4, s3
+; CHECK-NEXT:    shll v3.4s, v5.4h, #16
+; CHECK-NEXT:    shll v4.4s, v6.4h, #16
+; CHECK-NEXT:    dup v5.8h, v1.h[6]
+; CHECK-NEXT:    dup v6.8h, v0.h[6]
+; CHECK-NEXT:    dup v1.8h, v1.h[7]
+; CHECK-NEXT:    dup v0.8h, v0.h[7]
 ; CHECK-NEXT:    mov v2.h[3], w8
-; CHECK-NEXT:    lsl w8, w10, #16
-; CHECK-NEXT:    fcmp s6, s4
-; CHECK-NEXT:    mov h4, v0.h[6]
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    fmov s5, w8
-; CHECK-NEXT:    mov h0, v0.h[7]
-; CHECK-NEXT:    fmov s6, w9
 ; CHECK-NEXT:    csetm w8, ge
+; CHECK-NEXT:    fcmp s4, s3
+; CHECK-NEXT:    shll v3.4s, v5.4h, #16
+; CHECK-NEXT:    shll v4.4s, v6.4h, #16
+; CHECK-NEXT:    shll v1.4s, v1.4h, #16
+; CHECK-NEXT:    shll v0.4s, v0.4h, #16
 ; CHECK-NEXT:    mov v2.h[4], w8
-; CHECK-NEXT:    fmov w8, s3
-; CHECK-NEXT:    fmov w9, s4
-; CHECK-NEXT:    fcmp s6, s5
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    csetm w10, ge
-; CHECK-NEXT:    fmov s3, w8
-; CHECK-NEXT:    fmov s4, w9
-; CHECK-NEXT:    fmov w8, s1
-; CHECK-NEXT:    fmov w9, s0
-; CHECK-NEXT:    mov v2.h[5], w10
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    fcmp s4, s3
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    fmov s0, w8
-; CHECK-NEXT:    fmov s1, w9
 ; CHECK-NEXT:    csetm w8, ge
+; CHECK-NEXT:    fcmp s4, s3
+; CHECK-NEXT:    mov v2.h[5], w8
+; CHECK-NEXT:    csetm w8, ge
+; CHECK-NEXT:    fcmp s0, s1
 ; CHECK-NEXT:    mov v2.h[6], w8
-; CHECK-NEXT:    fcmp s1, s0
 ; CHECK-NEXT:    csetm w8, ge
 ; CHECK-NEXT:    mov v2.h[7], w8
 ; CHECK-NEXT:    xtn v0.8b, v2.8h
@@ -1907,90 +1549,58 @@ define <8 x i1> @test_fcmp_oge(<8 x bfloat> %a, <8 x bfloat> %b) #0 {
 define <8 x i1> @test_fcmp_olt(<8 x bfloat> %a, <8 x bfloat> %b) #0 {
 ; CHECK-LABEL: test_fcmp_olt:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov h2, v1.h[1]
-; CHECK-NEXT:    mov h3, v0.h[1]
-; CHECK-NEXT:    fmov w10, s1
-; CHECK-NEXT:    fmov w8, s2
-; CHECK-NEXT:    fmov w9, s3
-; CHECK-NEXT:    mov h2, v1.h[2]
-; CHECK-NEXT:    mov h3, v0.h[2]
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    fmov s4, w8
-; CHECK-NEXT:    fmov w8, s0
-; CHECK-NEXT:    fmov s5, w9
-; CHECK-NEXT:    lsl w9, w10, #16
-; CHECK-NEXT:    fmov w10, s3
-; CHECK-NEXT:    mov h3, v1.h[4]
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    fcmp s5, s4
-; CHECK-NEXT:    fmov s5, w9
-; CHECK-NEXT:    mov h4, v1.h[3]
-; CHECK-NEXT:    lsl w10, w10, #16
-; CHECK-NEXT:    fmov s6, w8
-; CHECK-NEXT:    fmov w8, s2
-; CHECK-NEXT:    csetm w9, mi
-; CHECK-NEXT:    fmov s16, w10
-; CHECK-NEXT:    fcmp s6, s5
-; CHECK-NEXT:    mov h5, v0.h[3]
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    fmov w10, s4
-; CHECK-NEXT:    mov h6, v0.h[4]
-; CHECK-NEXT:    mov h4, v1.h[5]
-; CHECK-NEXT:    fmov s7, w8
+; CHECK-NEXT:    dup v2.4h, v1.h[1]
+; CHECK-NEXT:    dup v3.4h, v0.h[1]
+; CHECK-NEXT:    dup v4.4h, v1.h[2]
+; CHECK-NEXT:    dup v5.4h, v0.h[2]
+; CHECK-NEXT:    dup v6.4h, v0.h[3]
+; CHECK-NEXT:    shll v2.4s, v2.4h, #16
+; CHECK-NEXT:    shll v3.4s, v3.4h, #16
+; CHECK-NEXT:    fcmp s3, s2
+; CHECK-NEXT:    shll v2.4s, v1.4h, #16
+; CHECK-NEXT:    shll v3.4s, v0.4h, #16
 ; CHECK-NEXT:    csetm w8, mi
-; CHECK-NEXT:    fmov s2, w8
-; CHECK-NEXT:    fmov w8, s5
-; CHECK-NEXT:    mov h5, v0.h[5]
-; CHECK-NEXT:    fcmp s16, s7
-; CHECK-NEXT:    mov v2.h[1], w9
-; CHECK-NEXT:    lsl w9, w10, #16
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    fmov w10, s3
-; CHECK-NEXT:    fmov s3, w9
-; CHECK-NEXT:    fmov w9, s6
-; CHECK-NEXT:    fmov s7, w8
+; CHECK-NEXT:    fcmp s3, s2
+; CHECK-NEXT:    shll v3.4s, v4.4h, #16
+; CHECK-NEXT:    shll v4.4s, v5.4h, #16
+; CHECK-NEXT:    dup v5.4h, v1.h[3]
+; CHECK-NEXT:    csetm w9, mi
+; CHECK-NEXT:    fmov s2, w9
+; CHECK-NEXT:    fcmp s4, s3
+; CHECK-NEXT:    shll v4.4s, v6.4h, #16
+; CHECK-NEXT:    shll v3.4s, v5.4h, #16
+; CHECK-NEXT:    dup v5.8h, v1.h[4]
+; CHECK-NEXT:    dup v6.8h, v0.h[4]
+; CHECK-NEXT:    mov v2.h[1], w8
 ; CHECK-NEXT:    csetm w8, mi
+; CHECK-NEXT:    fcmp s4, s3
+; CHECK-NEXT:    shll v3.4s, v5.4h, #16
+; CHECK-NEXT:    shll v4.4s, v6.4h, #16
+; CHECK-NEXT:    dup v5.8h, v1.h[5]
+; CHECK-NEXT:    dup v6.8h, v0.h[5]
 ; CHECK-NEXT:    mov v2.h[2], w8
-; CHECK-NEXT:    lsl w8, w10, #16
-; CHECK-NEXT:    fmov w10, s4
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    fcmp s7, s3
-; CHECK-NEXT:    mov h3, v1.h[6]
-; CHECK-NEXT:    fmov s4, w8
-; CHECK-NEXT:    mov h1, v1.h[7]
-; CHECK-NEXT:    fmov s6, w9
-; CHECK-NEXT:    fmov w9, s5
 ; CHECK-NEXT:    csetm w8, mi
+; CHECK-NEXT:    fcmp s4, s3
+; CHECK-NEXT:    shll v3.4s, v5.4h, #16
+; CHECK-NEXT:    shll v4.4s, v6.4h, #16
+; CHECK-NEXT:    dup v5.8h, v1.h[6]
+; CHECK-NEXT:    dup v6.8h, v0.h[6]
+; CHECK-NEXT:    dup v1.8h, v1.h[7]
+; CHECK-NEXT:    dup v0.8h, v0.h[7]
 ; CHECK-NEXT:    mov v2.h[3], w8
-; CHECK-NEXT:    lsl w8, w10, #16
-; CHECK-NEXT:    fcmp s6, s4
-; CHECK-NEXT:    mov h4, v0.h[6]
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    fmov s5, w8
-; CHECK-NEXT:    mov h0, v0.h[7]
-; CHECK-NEXT:    fmov s6, w9
 ; CHECK-NEXT:    csetm w8, mi
+; CHECK-NEXT:    fcmp s4, s3
+; CHECK-NEXT:    shll v3.4s, v5.4h, #16
+; CHECK-NEXT:    shll v4.4s, v6.4h, #16
+; CHECK-NEXT:    shll v1.4s, v1.4h, #16
+; CHECK-NEXT:    shll v0.4s, v0.4h, #16
 ; CHECK-NEXT:    mov v2.h[4], w8
-; CHECK-NEXT:    fmov w8, s3
-; CHECK-NEXT:    fmov w9, s4
-; CHECK-NEXT:    fcmp s6, s5
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    csetm w10, mi
-; CHECK-NEXT:    fmov s3, w8
-; CHECK-NEXT:    fmov s4, w9
-; CHECK-NEXT:    fmov w8, s1
-; CHECK-NEXT:    fmov w9, s0
-; CHECK-NEXT:    mov v2.h[5], w10
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    fcmp s4, s3
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    fmov s0, w8
-; CHECK-NEXT:    fmov s1, w9
 ; CHECK-NEXT:    csetm w8, mi
+; CHECK-NEXT:    fcmp s4, s3
+; CHECK-NEXT:    mov v2.h[5], w8
+; CHECK-NEXT:    csetm w8, mi
+; CHECK-NEXT:    fcmp s0, s1
 ; CHECK-NEXT:    mov v2.h[6], w8
-; CHECK-NEXT:    fcmp s1, s0
 ; CHECK-NEXT:    csetm w8, mi
 ; CHECK-NEXT:    mov v2.h[7], w8
 ; CHECK-NEXT:    xtn v0.8b, v2.8h
@@ -2002,90 +1612,58 @@ define <8 x i1> @test_fcmp_olt(<8 x bfloat> %a, <8 x bfloat> %b) #0 {
 define <8 x i1> @test_fcmp_ole(<8 x bfloat> %a, <8 x bfloat> %b) #0 {
 ; CHECK-LABEL: test_fcmp_ole:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov h2, v1.h[1]
-; CHECK-NEXT:    mov h3, v0.h[1]
-; CHECK-NEXT:    fmov w10, s1
-; CHECK-NEXT:    fmov w8, s2
-; CHECK-NEXT:    fmov w9, s3
-; CHECK-NEXT:    mov h2, v1.h[2]
-; CHECK-NEXT:    mov h3, v0.h[2]
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    fmov s4, w8
-; CHECK-NEXT:    fmov w8, s0
-; CHECK-NEXT:    fmov s5, w9
-; CHECK-NEXT:    lsl w9, w10, #16
-; CHECK-NEXT:    fmov w10, s3
-; CHECK-NEXT:    mov h3, v1.h[4]
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    fcmp s5, s4
-; CHECK-NEXT:    fmov s5, w9
-; CHECK-NEXT:    mov h4, v1.h[3]
-; CHECK-NEXT:    lsl w10, w10, #16
-; CHECK-NEXT:    fmov s6, w8
-; CHECK-NEXT:    fmov w8, s2
-; CHECK-NEXT:    csetm w9, ls
-; CHECK-NEXT:    fmov s16, w10
-; CHECK-NEXT:    fcmp s6, s5
-; CHECK-NEXT:    mov h5, v0.h[3]
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    fmov w10, s4
-; CHECK-NEXT:    mov h6, v0.h[4]
-; CHECK-NEXT:    mov h4, v1.h[5]
-; CHECK-NEXT:    fmov s7, w8
+; CHECK-NEXT:    dup v2.4h, v1.h[1]
+; CHECK-NEXT:    dup v3.4h, v0.h[1]
+; CHECK-NEXT:    dup v4.4h, v1.h[2]
+; CHECK-NEXT:    dup v5.4h, v0.h[2]
+; CHECK-NEXT:    dup v6.4h, v0.h[3]
+; CHECK-NEXT:    shll v2.4s, v2.4h, #16
+; CHECK-NEXT:    shll v3.4s, v3.4h, #16
+; CHECK-NEXT:    fcmp s3, s2
+; CHECK-NEXT:    shll v2.4s, v1.4h, #16
+; CHECK-NEXT:    shll v3.4s, v0.4h, #16
 ; CHECK-NEXT:    csetm w8, ls
-; CHECK-NEXT:    fmov s2, w8
-; CHECK-NEXT:    fmov w8, s5
-; CHECK-NEXT:    mov h5, v0.h[5]
-; CHECK-NEXT:    fcmp s16, s7
-; CHECK-NEXT:    mov v2.h[1], w9
-; CHECK-NEXT:    lsl w9, w10, #16
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    fmov w10, s3
-; CHECK-NEXT:    fmov s3, w9
-; CHECK-NEXT:    fmov w9, s6
-; CHECK-NEXT:    fmov s7, w8
+; CHECK-NEXT:    fcmp s3, s2
+; CHECK-NEXT:    shll v3.4s, v4.4h, #16
+; CHECK-NEXT:    shll v4.4s, v5.4h, #16
+; CHECK-NEXT:    dup v5.4h, v1.h[3]
+; CHECK-NEXT:    csetm w9, ls
+; CHECK-NEXT:    fmov s2, w9
+; CHECK-NEXT:    fcmp s4, s3
+; CHECK-NEXT:    shll v4.4s, v6.4h, #16
+; CHECK-NEXT:    shll v3.4s, v5.4h, #16
+; CHECK-NEXT:    dup v5.8h, v1.h[4]
+; CHECK-NEXT:    dup v6.8h, v0.h[4]
+; CHECK-NEXT:    mov v2.h[1], w8
 ; CHECK-NEXT:    csetm w8, ls
+; CHECK-NEXT:    fcmp s4, s3
+; CHECK-NEXT:    shll v3.4s, v5.4h, #16
+; CHECK-NEXT:    shll v4.4s, v6.4h, #16
+; CHECK-NEXT:    dup v5.8h, v1.h[5]
+; CHECK-NEXT:    dup v6.8h, v0.h[5]
 ; CHECK-NEXT:    mov v2.h[2], w8
-; CHECK-NEXT:    lsl w8, w10, #16
-; CHECK-NEXT:    fmov w10, s4
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    fcmp s7, s3
-; CHECK-NEXT:    mov h3, v1.h[6]
-; CHECK-NEXT:    fmov s4, w8
-; CHECK-NEXT:    mov h1, v1.h[7]
-; CHECK-NEXT:    fmov s6, w9
-; CHECK-NEXT:    fmov w9, s5
 ; CHECK-NEXT:    csetm w8, ls
+; CHECK-NEXT:    fcmp s4, s3
+; CHECK-NEXT:    shll v3.4s, v5.4h, #16
+; CHECK-NEXT:    shll v4.4s, v6.4h, #16
+; CHECK-NEXT:    dup v5.8h, v1.h[6]
+; CHECK-NEXT:    dup v6.8h, v0.h[6]
+; CHECK-NEXT:    dup v1.8h, v1.h[7]
+; CHECK-NEXT:    dup v0.8h, v0.h[7]
 ; CHECK-NEXT:    mov v2.h[3], w8
-; CHECK-NEXT:    lsl w8, w10, #16
-; CHECK-NEXT:    fcmp s6, s4
-; CHECK-NEXT:    mov h4, v0.h[6]
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    fmov s5, w8
-; CHECK-NEXT:    mov h0, v0.h[7]
-; CHECK-NEXT:    fmov s6, w9
 ; CHECK-NEXT:    csetm w8, ls
+; CHECK-NEXT:    fcmp s4, s3
+; CHECK-NEXT:    shll v3.4s, v5.4h, #16
+; CHECK-NEXT:    shll v4.4s, v6.4h, #16
+; CHECK-NEXT:    shll v1.4s, v1.4h, #16
+; CHECK-NEXT:    shll v0.4s, v0.4h, #16
 ; CHECK-NEXT:    mov v2.h[4], w8
-; CHECK-NEXT:    fmov w8, s3
-; CHECK-NEXT:    fmov w9, s4
-; CHECK-NEXT:    fcmp s6, s5
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    csetm w10, ls
-; CHECK-NEXT:    fmov s3, w8
-; CHECK-NEXT:    fmov s4, w9
-; CHECK-NEXT:    fmov w8, s1
-; CHECK-NEXT:    fmov w9, s0
-; CHECK-NEXT:    mov v2.h[5], w10
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    fcmp s4, s3
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    fmov s0, w8
-; CHECK-NEXT:    fmov s1, w9
 ; CHECK-NEXT:    csetm w8, ls
+; CHECK-NEXT:    fcmp s4, s3
+; CHECK-NEXT:    mov v2.h[5], w8
+; CHECK-NEXT:    csetm w8, ls
+; CHECK-NEXT:    fcmp s0, s1
 ; CHECK-NEXT:    mov v2.h[6], w8
-; CHECK-NEXT:    fcmp s1, s0
 ; CHECK-NEXT:    csetm w8, ls
 ; CHECK-NEXT:    mov v2.h[7], w8
 ; CHECK-NEXT:    xtn v0.8b, v2.8h
@@ -2097,90 +1675,58 @@ define <8 x i1> @test_fcmp_ole(<8 x bfloat> %a, <8 x bfloat> %b) #0 {
 define <8 x i1> @test_fcmp_ord(<8 x bfloat> %a, <8 x bfloat> %b) #0 {
 ; CHECK-LABEL: test_fcmp_ord:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov h2, v1.h[1]
-; CHECK-NEXT:    mov h3, v0.h[1]
-; CHECK-NEXT:    fmov w10, s1
-; CHECK-NEXT:    fmov w8, s2
-; CHECK-NEXT:    fmov w9, s3
-; CHECK-NEXT:    mov h2, v1.h[2]
-; CHECK-NEXT:    mov h3, v0.h[2]
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    fmov s4, w8
-; CHECK-NEXT:    fmov w8, s0
-; CHECK-NEXT:    fmov s5, w9
-; CHECK-NEXT:    lsl w9, w10, #16
-; CHECK-NEXT:    fmov w10, s3
-; CHECK-NEXT:    mov h3, v1.h[4]
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    fcmp s5, s4
-; CHECK-NEXT:    fmov s5, w9
-; CHECK-NEXT:    mov h4, v1.h[3]
-; CHECK-NEXT:    lsl w10, w10, #16
-; CHECK-NEXT:    fmov s6, w8
-; CHECK-NEXT:    fmov w8, s2
-; CHECK-NEXT:    csetm w9, vc
-; CHECK-NEXT:    fmov s16, w10
-; CHECK-NEXT:    fcmp s6, s5
-; CHECK-NEXT:    mov h5, v0.h[3]
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    fmov w10, s4
-; CHECK-NEXT:    mov h6, v0.h[4]
-; CHECK-NEXT:    mov h4, v1.h[5]
-; CHECK-NEXT:    fmov s7, w8
+; CHECK-NEXT:    dup v2.4h, v1.h[1]
+; CHECK-NEXT:    dup v3.4h, v0.h[1]
+; CHECK-NEXT:    dup v4.4h, v1.h[2]
+; CHECK-NEXT:    dup v5.4h, v0.h[2]
+; CHECK-NEXT:    dup v6.4h, v0.h[3]
+; CHECK-NEXT:    shll v2.4s, v2.4h, #16
+; CHECK-NEXT:    shll v3.4s, v3.4h, #16
+; CHECK-NEXT:    fcmp s3, s2
+; CHECK-NEXT:    shll v2.4s, v1.4h, #16
+; CHECK-NEXT:    shll v3.4s, v0.4h, #16
 ; CHECK-NEXT:    csetm w8, vc
-; CHECK-NEXT:    fmov s2, w8
-; CHECK-NEXT:    fmov w8, s5
-; CHECK-NEXT:    mov h5, v0.h[5]
-; CHECK-NEXT:    fcmp s16, s7
-; CHECK-NEXT:    mov v2.h[1], w9
-; CHECK-NEXT:    lsl w9, w10, #16
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    fmov w10, s3
-; CHECK-NEXT:    fmov s3, w9
-; CHECK-NEXT:    fmov w9, s6
-; CHECK-NEXT:    fmov s7, w8
+; CHECK-NEXT:    fcmp s3, s2
+; CHECK-NEXT:    shll v3.4s, v4.4h, #16
+; CHECK-NEXT:    shll v4.4s, v5.4h, #16
+; CHECK-NEXT:    dup v5.4h, v1.h[3]
+; CHECK-NEXT:    csetm w9, vc
+; CHECK-NEXT:    fmov s2, w9
+; CHECK-NEXT:    fcmp s4, s3
+; CHECK-NEXT:    shll v4.4s, v6.4h, #16
+; CHECK-NEXT:    shll v3.4s, v5.4h, #16
+; CHECK-NEXT:    dup v5.8h, v1.h[4]
+; CHECK-NEXT:    dup v6.8h, v0.h[4]
+; CHECK-NEXT:    mov v2.h[1], w8
 ; CHECK-NEXT:    csetm w8, vc
+; CHECK-NEXT:    fcmp s4, s3
+; CHECK-NEXT:    shll v3.4s, v5.4h, #16
+; CHECK-NEXT:    shll v4.4s, v6.4h, #16
+; CHECK-NEXT:    dup v5.8h, v1.h[5]
+; CHECK-NEXT:    dup v6.8h, v0.h[5]
 ; CHECK-NEXT:    mov v2.h[2], w8
-; CHECK-NEXT:    lsl w8, w10, #16
-; CHECK-NEXT:    fmov w10, s4
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    fcmp s7, s3
-; CHECK-NEXT:    mov h3, v1.h[6]
-; CHECK-NEXT:    fmov s4, w8
-; CHECK-NEXT:    mov h1, v1.h[7]
-; CHECK-NEXT:    fmov s6, w9
-; CHECK-NEXT:    fmov w9, s5
 ; CHECK-NEXT:    csetm w8, vc
+; CHECK-NEXT:    fcmp s4, s3
+; CHECK-NEXT:    shll v3.4s, v5.4h, #16
+; CHECK-NEXT:    shll v4.4s, v6.4h, #16
+; CHECK-NEXT:    dup v5.8h, v1.h[6]
+; CHECK-NEXT:    dup v6.8h, v0.h[6]
+; CHECK-NEXT:    dup v1.8h, v1.h[7]
+; CHECK-NEXT:    dup v0.8h, v0.h[7]
 ; CHECK-NEXT:    mov v2.h[3], w8
-; CHECK-NEXT:    lsl w8, w10, #16
-; CHECK-NEXT:    fcmp s6, s4
-; CHECK-NEXT:    mov h4, v0.h[6]
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    fmov s5, w8
-; CHECK-NEXT:    mov h0, v0.h[7]
-; CHECK-NEXT:    fmov s6, w9
 ; CHECK-NEXT:    csetm w8, vc
+; CHECK-NEXT:    fcmp s4, s3
+; CHECK-NEXT:    shll v3.4s, v5.4h, #16
+; CHECK-NEXT:    shll v4.4s, v6.4h, #16
+; CHECK-NEXT:    shll v1.4s, v1.4h, #16
+; CHECK-NEXT:    shll v0.4s, v0.4h, #16
 ; CHECK-NEXT:    mov v2.h[4], w8
-; CHECK-NEXT:    fmov w8, s3
-; CHECK-NEXT:    fmov w9, s4
-; CHECK-NEXT:    fcmp s6, s5
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    csetm w10, vc
-; CHECK-NEXT:    fmov s3, w8
-; CHECK-NEXT:    fmov s4, w9
-; CHECK-NEXT:    fmov w8, s1
-; CHECK-NEXT:    fmov w9, s0
-; CHECK-NEXT:    mov v2.h[5], w10
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    fcmp s4, s3
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    fmov s0, w8
-; CHECK-NEXT:    fmov s1, w9
 ; CHECK-NEXT:    csetm w8, vc
+; CHECK-NEXT:    fcmp s4, s3
+; CHECK-NEXT:    mov v2.h[5], w8
+; CHECK-NEXT:    csetm w8, vc
+; CHECK-NEXT:    fcmp s0, s1
 ; CHECK-NEXT:    mov v2.h[6], w8
-; CHECK-NEXT:    fcmp s1, s0
 ; CHECK-NEXT:    csetm w8, vc
 ; CHECK-NEXT:    mov v2.h[7], w8
 ; CHECK-NEXT:    xtn v0.8b, v2.8h
diff --git a/llvm/test/CodeGen/AArch64/cgdata-no-merge-attached-call-garget.ll b/llvm/test/CodeGen/AArch64/cgdata-no-merge-attached-call-garget.ll
new file mode 100644
index 0000000..1163314
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/cgdata-no-merge-attached-call-garget.ll
@@ -0,0 +1,37 @@
+; This test verifies that two similar functions, f1 and f2, are not merged
+; when their attached call targets differ, since these targets cannot be parameterized.
+
+; RUN: llc -mtriple=arm64-apple-darwin -enable-global-merge-func=true < %s | FileCheck %s
+
+; CHECK-NOT: _f1.Tgm
+; CHECK-NOT: _f2.Tgm
+
+target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
+target triple = "arm64-apple-darwin"
+
+define i64 @f1(ptr %0) {
+  %2 = call ptr @g1(ptr %0, i32 0) minsize [ "clang.arc.attachedcall"(ptr @llvm.objc.unsafeClaimAutoreleasedReturnValue) ]
+  tail call void (...) @llvm.objc.clang.arc.noop.use(ptr %2)
+  %3 = call i64 @g2(ptr %2)
+  tail call void @objc_release(ptr %2)
+  %4 = tail call i64 @g3(i64 %3)
+  ret i64 %4
+}
+
+define i64 @f2(ptr %0) {
+  %2 = call ptr @g1(ptr %0, i32 0) minsize [ "clang.arc.attachedcall"(ptr @llvm.objc.retainAutoreleasedReturnValue) ]
+  tail call void (...) @llvm.objc.clang.arc.noop.use(ptr %2)
+  %3 = call i64 @g2(ptr %2)
+  tail call void @objc_release(ptr %2)
+  %4 = tail call i64 @g3(i64 %3)
+  ret i64 %4
+}
+
+declare ptr @g1(ptr, i32)
+declare i64 @g2(ptr)
+declare i64 @g3(i64)
+
+declare void @llvm.objc.clang.arc.noop.use(...)
+declare ptr @llvm.objc.unsafeClaimAutoreleasedReturnValue(ptr)
+declare ptr @llvm.objc.retainAutoreleasedReturnValue(ptr)
+declare void @objc_release(ptr)
diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-cdot.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-cdot.ll
new file mode 100644
index 0000000..11cf4c3
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-cdot.ll
@@ -0,0 +1,1136 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -S --passes=complex-deinterleaving %s --mattr=+sve2 -o - | FileCheck %s --check-prefix=CHECK-SVE2
+; RUN: opt -S --passes=complex-deinterleaving %s --mattr=+sve -o - | FileCheck %s --check-prefix=CHECK-SVE
+; RUN: opt -S --passes=complex-deinterleaving %s -o - | FileCheck %s --check-prefix=CHECK-NOSVE
+
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64-none-unknown-elf"
+
+define i32 @cdotp_i8_rot0(<vscale x 32 x i8> %a, <vscale x 32 x i8> %b) {
+; CHECK-SVE2-LABEL: define i32 @cdotp_i8_rot0(
+; CHECK-SVE2-SAME: <vscale x 32 x i8> [[A:%.*]], <vscale x 32 x i8> [[B:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-SVE2-NEXT:  [[ENTRY:.*]]:
+; CHECK-SVE2-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK-SVE2:       [[VECTOR_BODY]]:
+; CHECK-SVE2-NEXT:    [[TMP11:%.*]] = phi <vscale x 8 x i32> [ zeroinitializer, %[[ENTRY]] ], [ [[TMP10:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-SVE2-NEXT:    [[TMP1:%.*]] = call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[A]], i64 0)
+; CHECK-SVE2-NEXT:    [[TMP2:%.*]] = call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[B]], i64 0)
+; CHECK-SVE2-NEXT:    [[TMP3:%.*]] = call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[A]], i64 16)
+; CHECK-SVE2-NEXT:    [[TMP4:%.*]] = call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[B]], i64 16)
+; CHECK-SVE2-NEXT:    [[TMP5:%.*]] = call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[TMP11]], i64 0)
+; CHECK-SVE2-NEXT:    [[TMP6:%.*]] = call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[TMP11]], i64 4)
+; CHECK-SVE2-NEXT:    [[TMP7:%.*]] = call <vscale x 4 x i32> @llvm.aarch64.sve.cdot.nxv4i32(<vscale x 4 x i32> [[TMP5]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], i32 0)
+; CHECK-SVE2-NEXT:    [[TMP8:%.*]] = call <vscale x 4 x i32> @llvm.aarch64.sve.cdot.nxv4i32(<vscale x 4 x i32> [[TMP6]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[TMP4]], i32 0)
+; CHECK-SVE2-NEXT:    [[TMP9:%.*]] = call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> poison, <vscale x 4 x i32> [[TMP7]], i64 0)
+; CHECK-SVE2-NEXT:    [[TMP10]] = call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> [[TMP9]], <vscale x 4 x i32> [[TMP8]], i64 4)
+; CHECK-SVE2-NEXT:    br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]]
+; CHECK-SVE2:       [[MIDDLE_BLOCK]]:
+; CHECK-SVE2-NEXT:    [[TMP0:%.*]] = call i32 @llvm.vector.reduce.add.nxv8i32(<vscale x 8 x i32> [[TMP10]])
+; CHECK-SVE2-NEXT:    ret i32 [[TMP0]]
+;
+; CHECK-SVE-LABEL: define i32 @cdotp_i8_rot0(
+; CHECK-SVE-SAME: <vscale x 32 x i8> [[A:%.*]], <vscale x 32 x i8> [[B:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-SVE-NEXT:  [[ENTRY:.*]]:
+; CHECK-SVE-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK-SVE:       [[VECTOR_BODY]]:
+; CHECK-SVE-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, %[[ENTRY]] ], [ [[PARTIAL_REDUCE_SUB:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-SVE-NEXT:    [[A_DEINTERLEAVED:%.*]] = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> [[A]])
+; CHECK-SVE-NEXT:    [[B_DEINTERLEAVED:%.*]] = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> [[B]])
+; CHECK-SVE-NEXT:    [[A_REAL:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[A_DEINTERLEAVED]], 0
+; CHECK-SVE-NEXT:    [[A_IMAG:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[A_DEINTERLEAVED]], 1
+; CHECK-SVE-NEXT:    [[B_REAL:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[B_DEINTERLEAVED]], 0
+; CHECK-SVE-NEXT:    [[B_IMAG:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[B_DEINTERLEAVED]], 1
+; CHECK-SVE-NEXT:    [[A_REAL_EXT:%.*]] = sext <vscale x 16 x i8> [[A_REAL]] to <vscale x 16 x i32>
+; CHECK-SVE-NEXT:    [[A_IMAG_EXT:%.*]] = sext <vscale x 16 x i8> [[A_IMAG]] to <vscale x 16 x i32>
+; CHECK-SVE-NEXT:    [[B_REAL_EXT:%.*]] = sext <vscale x 16 x i8> [[B_REAL]] to <vscale x 16 x i32>
+; CHECK-SVE-NEXT:    [[B_IMAG_EXT:%.*]] = sext <vscale x 16 x i8> [[B_IMAG]] to <vscale x 16 x i32>
+; CHECK-SVE-NEXT:    [[REAL_MUL:%.*]] = mul <vscale x 16 x i32> [[B_REAL_EXT]], [[A_REAL_EXT]]
+; CHECK-SVE-NEXT:    [[REAL_MUL_REDUCED:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI]], <vscale x 16 x i32> [[REAL_MUL]])
+; CHECK-SVE-NEXT:    [[IMAG_MUL:%.*]] = mul <vscale x 16 x i32> [[B_IMAG_EXT]], [[A_IMAG_EXT]]
+; CHECK-SVE-NEXT:    [[IMAG_MUL_NEG:%.*]] = sub <vscale x 16 x i32> zeroinitializer, [[IMAG_MUL]]
+; CHECK-SVE-NEXT:    [[PARTIAL_REDUCE_SUB]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[REAL_MUL_REDUCED]], <vscale x 16 x i32> [[IMAG_MUL_NEG]])
+; CHECK-SVE-NEXT:    br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]]
+; CHECK-SVE:       [[MIDDLE_BLOCK]]:
+; CHECK-SVE-NEXT:    [[TMP11:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[PARTIAL_REDUCE_SUB]])
+; CHECK-SVE-NEXT:    ret i32 [[TMP11]]
+;
+; CHECK-NOSVE-LABEL: define i32 @cdotp_i8_rot0(
+; CHECK-NOSVE-SAME: <vscale x 32 x i8> [[A:%.*]], <vscale x 32 x i8> [[B:%.*]]) {
+; CHECK-NOSVE-NEXT:  [[ENTRY:.*]]:
+; CHECK-NOSVE-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK-NOSVE:       [[VECTOR_BODY]]:
+; CHECK-NOSVE-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, %[[ENTRY]] ], [ [[PARTIAL_REDUCE_SUB:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NOSVE-NEXT:    [[A_DEINTERLEAVED:%.*]] = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> [[A]])
+; CHECK-NOSVE-NEXT:    [[B_DEINTERLEAVED:%.*]] = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> [[B]])
+; CHECK-NOSVE-NEXT:    [[A_REAL:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[A_DEINTERLEAVED]], 0
+; CHECK-NOSVE-NEXT:    [[A_IMAG:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[A_DEINTERLEAVED]], 1
+; CHECK-NOSVE-NEXT:    [[B_REAL:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[B_DEINTERLEAVED]], 0
+; CHECK-NOSVE-NEXT:    [[B_IMAG:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[B_DEINTERLEAVED]], 1
+; CHECK-NOSVE-NEXT:    [[A_REAL_EXT:%.*]] = sext <vscale x 16 x i8> [[A_REAL]] to <vscale x 16 x i32>
+; CHECK-NOSVE-NEXT:    [[A_IMAG_EXT:%.*]] = sext <vscale x 16 x i8> [[A_IMAG]] to <vscale x 16 x i32>
+; CHECK-NOSVE-NEXT:    [[B_REAL_EXT:%.*]] = sext <vscale x 16 x i8> [[B_REAL]] to <vscale x 16 x i32>
+; CHECK-NOSVE-NEXT:    [[B_IMAG_EXT:%.*]] = sext <vscale x 16 x i8> [[B_IMAG]] to <vscale x 16 x i32>
+; CHECK-NOSVE-NEXT:    [[REAL_MUL:%.*]] = mul <vscale x 16 x i32> [[B_REAL_EXT]], [[A_REAL_EXT]]
+; CHECK-NOSVE-NEXT:    [[REAL_MUL_REDUCED:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI]], <vscale x 16 x i32> [[REAL_MUL]])
+; CHECK-NOSVE-NEXT:    [[IMAG_MUL:%.*]] = mul <vscale x 16 x i32> [[B_IMAG_EXT]], [[A_IMAG_EXT]]
+; CHECK-NOSVE-NEXT:    [[IMAG_MUL_NEG:%.*]] = sub <vscale x 16 x i32> zeroinitializer, [[IMAG_MUL]]
+; CHECK-NOSVE-NEXT:    [[PARTIAL_REDUCE_SUB]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[REAL_MUL_REDUCED]], <vscale x 16 x i32> [[IMAG_MUL_NEG]])
+; CHECK-NOSVE-NEXT:    br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]]
+; CHECK-NOSVE:       [[MIDDLE_BLOCK]]:
+; CHECK-NOSVE-NEXT:    [[TMP0:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[PARTIAL_REDUCE_SUB]])
+; CHECK-NOSVE-NEXT:    ret i32 [[TMP0]]
+;
+entry:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %entry
+  %vec.phi = phi <vscale x 4 x i32> [ zeroinitializer, %entry ], [ %partial.reduce.sub, %vector.body ]
+  %a.deinterleaved = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.v32i8(<vscale x 32 x i8> %a)
+  %b.deinterleaved = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.v32i8(<vscale x 32 x i8> %b)
+  %a.real = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %a.deinterleaved, 0
+  %a.imag = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %a.deinterleaved, 1
+  %b.real = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %b.deinterleaved, 0
+  %b.imag = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %b.deinterleaved, 1
+  %a.real.ext = sext <vscale x 16 x i8> %a.real to <vscale x 16 x i32>
+  %a.imag.ext = sext <vscale x 16 x i8> %a.imag to <vscale x 16 x i32>
+  %b.real.ext = sext <vscale x 16 x i8> %b.real to <vscale x 16 x i32>
+  %b.imag.ext = sext <vscale x 16 x i8> %b.imag to <vscale x 16 x i32>
+  %real.mul = mul <vscale x 16 x i32> %b.real.ext, %a.real.ext
+  %real.mul.reduced = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> %vec.phi, <vscale x 16 x i32> %real.mul)
+  %imag.mul = mul <vscale x 16 x i32> %b.imag.ext, %a.imag.ext
+  %imag.mul.neg = sub <vscale x 16 x i32> zeroinitializer, %imag.mul
+  %partial.reduce.sub = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> %real.mul.reduced, <vscale x 16 x i32> %imag.mul.neg)
+  br i1 true, label %middle.block, label %vector.body
+
+middle.block:                                     ; preds = %vector.body
+  %0 = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> %partial.reduce.sub)
+  ret i32 %0
+}
+
+define i32 @cdotp_i8_rot90(<vscale x 32 x i8> %a, <vscale x 32 x i8> %b) {
+; CHECK-SVE2-LABEL: define i32 @cdotp_i8_rot90(
+; CHECK-SVE2-SAME: <vscale x 32 x i8> [[A:%.*]], <vscale x 32 x i8> [[B:%.*]]) #[[ATTR0]] {
+; CHECK-SVE2-NEXT:  [[ENTRY:.*]]:
+; CHECK-SVE2-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK-SVE2:       [[VECTOR_BODY]]:
+; CHECK-SVE2-NEXT:    [[TMP11:%.*]] = phi <vscale x 8 x i32> [ zeroinitializer, %[[ENTRY]] ], [ [[TMP10:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-SVE2-NEXT:    [[TMP1:%.*]] = call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[A]], i64 0)
+; CHECK-SVE2-NEXT:    [[TMP2:%.*]] = call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[B]], i64 0)
+; CHECK-SVE2-NEXT:    [[TMP3:%.*]] = call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[A]], i64 16)
+; CHECK-SVE2-NEXT:    [[TMP4:%.*]] = call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[B]], i64 16)
+; CHECK-SVE2-NEXT:    [[TMP5:%.*]] = call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[TMP11]], i64 0)
+; CHECK-SVE2-NEXT:    [[TMP6:%.*]] = call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[TMP11]], i64 4)
+; CHECK-SVE2-NEXT:    [[TMP7:%.*]] = call <vscale x 4 x i32> @llvm.aarch64.sve.cdot.nxv4i32(<vscale x 4 x i32> [[TMP5]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], i32 90)
+; CHECK-SVE2-NEXT:    [[TMP8:%.*]] = call <vscale x 4 x i32> @llvm.aarch64.sve.cdot.nxv4i32(<vscale x 4 x i32> [[TMP6]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[TMP4]], i32 90)
+; CHECK-SVE2-NEXT:    [[TMP9:%.*]] = call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> poison, <vscale x 4 x i32> [[TMP7]], i64 0)
+; CHECK-SVE2-NEXT:    [[TMP10]] = call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> [[TMP9]], <vscale x 4 x i32> [[TMP8]], i64 4)
+; CHECK-SVE2-NEXT:    br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]]
+; CHECK-SVE2:       [[MIDDLE_BLOCK]]:
+; CHECK-SVE2-NEXT:    [[TMP0:%.*]] = call i32 @llvm.vector.reduce.add.nxv8i32(<vscale x 8 x i32> [[TMP10]])
+; CHECK-SVE2-NEXT:    ret i32 [[TMP0]]
+;
+; CHECK-SVE-LABEL: define i32 @cdotp_i8_rot90(
+; CHECK-SVE-SAME: <vscale x 32 x i8> [[A:%.*]], <vscale x 32 x i8> [[B:%.*]]) #[[ATTR0]] {
+; CHECK-SVE-NEXT:  [[ENTRY:.*]]:
+; CHECK-SVE-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK-SVE:       [[VECTOR_BODY]]:
+; CHECK-SVE-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, %[[ENTRY]] ], [ [[PARTIAL_REDUCE_SUB:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-SVE-NEXT:    [[A_DEINTERLEAVED:%.*]] = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> [[A]])
+; CHECK-SVE-NEXT:    [[B_DEINTERLEAVED:%.*]] = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> [[B]])
+; CHECK-SVE-NEXT:    [[A_REAL:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[A_DEINTERLEAVED]], 0
+; CHECK-SVE-NEXT:    [[A_IMAG:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[A_DEINTERLEAVED]], 1
+; CHECK-SVE-NEXT:    [[B_REAL:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[B_DEINTERLEAVED]], 0
+; CHECK-SVE-NEXT:    [[B_IMAG:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[B_DEINTERLEAVED]], 1
+; CHECK-SVE-NEXT:    [[A_REAL_EXT:%.*]] = sext <vscale x 16 x i8> [[A_REAL]] to <vscale x 16 x i32>
+; CHECK-SVE-NEXT:    [[A_IMAG_EXT:%.*]] = sext <vscale x 16 x i8> [[A_IMAG]] to <vscale x 16 x i32>
+; CHECK-SVE-NEXT:    [[B_REAL_EXT:%.*]] = sext <vscale x 16 x i8> [[B_REAL]] to <vscale x 16 x i32>
+; CHECK-SVE-NEXT:    [[B_IMAG_EXT:%.*]] = sext <vscale x 16 x i8> [[B_IMAG]] to <vscale x 16 x i32>
+; CHECK-SVE-NEXT:    [[REAL_MUL:%.*]] = mul <vscale x 16 x i32> [[B_REAL_EXT]], [[A_IMAG_EXT]]
+; CHECK-SVE-NEXT:    [[REAL_MUL_REDUCED:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI]], <vscale x 16 x i32> [[REAL_MUL]])
+; CHECK-SVE-NEXT:    [[IMAG_MUL:%.*]] = mul <vscale x 16 x i32> [[B_IMAG_EXT]], [[A_REAL_EXT]]
+; CHECK-SVE-NEXT:    [[PARTIAL_REDUCE_SUB]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[REAL_MUL_REDUCED]], <vscale x 16 x i32> [[IMAG_MUL]])
+; CHECK-SVE-NEXT:    br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]]
+; CHECK-SVE:       [[MIDDLE_BLOCK]]:
+; CHECK-SVE-NEXT:    [[TMP11:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[PARTIAL_REDUCE_SUB]])
+; CHECK-SVE-NEXT:    ret i32 [[TMP11]]
+;
+; CHECK-NOSVE-LABEL: define i32 @cdotp_i8_rot90(
+; CHECK-NOSVE-SAME: <vscale x 32 x i8> [[A:%.*]], <vscale x 32 x i8> [[B:%.*]]) {
+; CHECK-NOSVE-NEXT:  [[ENTRY:.*]]:
+; CHECK-NOSVE-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK-NOSVE:       [[VECTOR_BODY]]:
+; CHECK-NOSVE-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, %[[ENTRY]] ], [ [[PARTIAL_REDUCE_SUB:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NOSVE-NEXT:    [[A_DEINTERLEAVED:%.*]] = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> [[A]])
+; CHECK-NOSVE-NEXT:    [[B_DEINTERLEAVED:%.*]] = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> [[B]])
+; CHECK-NOSVE-NEXT:    [[A_REAL:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[A_DEINTERLEAVED]], 0
+; CHECK-NOSVE-NEXT:    [[A_IMAG:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[A_DEINTERLEAVED]], 1
+; CHECK-NOSVE-NEXT:    [[B_REAL:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[B_DEINTERLEAVED]], 0
+; CHECK-NOSVE-NEXT:    [[B_IMAG:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[B_DEINTERLEAVED]], 1
+; CHECK-NOSVE-NEXT:    [[A_REAL_EXT:%.*]] = sext <vscale x 16 x i8> [[A_REAL]] to <vscale x 16 x i32>
+; CHECK-NOSVE-NEXT:    [[A_IMAG_EXT:%.*]] = sext <vscale x 16 x i8> [[A_IMAG]] to <vscale x 16 x i32>
+; CHECK-NOSVE-NEXT:    [[B_REAL_EXT:%.*]] = sext <vscale x 16 x i8> [[B_REAL]] to <vscale x 16 x i32>
+; CHECK-NOSVE-NEXT:    [[B_IMAG_EXT:%.*]] = sext <vscale x 16 x i8> [[B_IMAG]] to <vscale x 16 x i32>
+; CHECK-NOSVE-NEXT:    [[REAL_MUL:%.*]] = mul <vscale x 16 x i32> [[B_REAL_EXT]], [[A_IMAG_EXT]]
+; CHECK-NOSVE-NEXT:    [[REAL_MUL_REDUCED:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI]], <vscale x 16 x i32> [[REAL_MUL]])
+; CHECK-NOSVE-NEXT:    [[IMAG_MUL:%.*]] = mul <vscale x 16 x i32> [[B_IMAG_EXT]], [[A_REAL_EXT]]
+; CHECK-NOSVE-NEXT:    [[PARTIAL_REDUCE_SUB]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[REAL_MUL_REDUCED]], <vscale x 16 x i32> [[IMAG_MUL]])
+; CHECK-NOSVE-NEXT:    br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]]
+; CHECK-NOSVE:       [[MIDDLE_BLOCK]]:
+; CHECK-NOSVE-NEXT:    [[TMP0:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[PARTIAL_REDUCE_SUB]])
+; CHECK-NOSVE-NEXT:    ret i32 [[TMP0]]
+;
+entry:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %entry
+  %vec.phi = phi <vscale x 4 x i32> [ zeroinitializer, %entry ], [ %partial.reduce.sub, %vector.body ]
+  %a.deinterleaved = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.v32i8(<vscale x 32 x i8> %a)
+  %b.deinterleaved = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.v32i8(<vscale x 32 x i8> %b)
+  %a.real = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %a.deinterleaved, 0
+  %a.imag = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %a.deinterleaved, 1
+  %b.real = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %b.deinterleaved, 0
+  %b.imag = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %b.deinterleaved, 1
+  %a.real.ext = sext <vscale x 16 x i8> %a.real to <vscale x 16 x i32>
+  %a.imag.ext = sext <vscale x 16 x i8> %a.imag to <vscale x 16 x i32>
+  %b.real.ext = sext <vscale x 16 x i8> %b.real to <vscale x 16 x i32>
+  %b.imag.ext = sext <vscale x 16 x i8> %b.imag to <vscale x 16 x i32>
+  %real.mul = mul <vscale x 16 x i32> %b.real.ext, %a.imag.ext
+  %real.mul.reduced = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> %vec.phi, <vscale x 16 x i32> %real.mul)
+  %imag.mul = mul <vscale x 16 x i32> %b.imag.ext, %a.real.ext
+  %partial.reduce.sub = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> %real.mul.reduced, <vscale x 16 x i32> %imag.mul)
+  br i1 true, label %middle.block, label %vector.body
+
+middle.block:                                     ; preds = %vector.body
+  %0 = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> %partial.reduce.sub)
+  ret i32 %0
+}
+
+define i32 @cdotp_i8_rot180(<vscale x 32 x i8> %a, <vscale x 32 x i8> %b) {
+; CHECK-SVE2-LABEL: define i32 @cdotp_i8_rot180(
+; CHECK-SVE2-SAME: <vscale x 32 x i8> [[A:%.*]], <vscale x 32 x i8> [[B:%.*]]) #[[ATTR0]] {
+; CHECK-SVE2-NEXT:  [[ENTRY:.*]]:
+; CHECK-SVE2-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK-SVE2:       [[VECTOR_BODY]]:
+; CHECK-SVE2-NEXT:    [[TMP11:%.*]] = phi <vscale x 8 x i32> [ zeroinitializer, %[[ENTRY]] ], [ [[TMP10:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-SVE2-NEXT:    [[TMP1:%.*]] = call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[A]], i64 0)
+; CHECK-SVE2-NEXT:    [[TMP2:%.*]] = call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[B]], i64 0)
+; CHECK-SVE2-NEXT:    [[TMP3:%.*]] = call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[A]], i64 16)
+; CHECK-SVE2-NEXT:    [[TMP4:%.*]] = call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[B]], i64 16)
+; CHECK-SVE2-NEXT:    [[TMP5:%.*]] = call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[TMP11]], i64 0)
+; CHECK-SVE2-NEXT:    [[TMP6:%.*]] = call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[TMP11]], i64 4)
+; CHECK-SVE2-NEXT:    [[TMP7:%.*]] = call <vscale x 4 x i32> @llvm.aarch64.sve.cdot.nxv4i32(<vscale x 4 x i32> [[TMP5]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], i32 180)
+; CHECK-SVE2-NEXT:    [[TMP8:%.*]] = call <vscale x 4 x i32> @llvm.aarch64.sve.cdot.nxv4i32(<vscale x 4 x i32> [[TMP6]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[TMP4]], i32 180)
+; CHECK-SVE2-NEXT:    [[TMP9:%.*]] = call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> poison, <vscale x 4 x i32> [[TMP7]], i64 0)
+; CHECK-SVE2-NEXT:    [[TMP10]] = call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> [[TMP9]], <vscale x 4 x i32> [[TMP8]], i64 4)
+; CHECK-SVE2-NEXT:    br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]]
+; CHECK-SVE2:       [[MIDDLE_BLOCK]]:
+; CHECK-SVE2-NEXT:    [[TMP0:%.*]] = call i32 @llvm.vector.reduce.add.nxv8i32(<vscale x 8 x i32> [[TMP10]])
+; CHECK-SVE2-NEXT:    ret i32 [[TMP0]]
+;
+; CHECK-SVE-LABEL: define i32 @cdotp_i8_rot180(
+; CHECK-SVE-SAME: <vscale x 32 x i8> [[A:%.*]], <vscale x 32 x i8> [[B:%.*]]) #[[ATTR0]] {
+; CHECK-SVE-NEXT:  [[ENTRY:.*]]:
+; CHECK-SVE-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK-SVE:       [[VECTOR_BODY]]:
+; CHECK-SVE-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, %[[ENTRY]] ], [ [[PARTIAL_REDUCE_SUB:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-SVE-NEXT:    [[A_DEINTERLEAVED:%.*]] = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> [[A]])
+; CHECK-SVE-NEXT:    [[B_DEINTERLEAVED:%.*]] = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> [[B]])
+; CHECK-SVE-NEXT:    [[A_REAL:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[A_DEINTERLEAVED]], 0
+; CHECK-SVE-NEXT:    [[A_IMAG:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[A_DEINTERLEAVED]], 1
+; CHECK-SVE-NEXT:    [[B_REAL:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[B_DEINTERLEAVED]], 0
+; CHECK-SVE-NEXT:    [[B_IMAG:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[B_DEINTERLEAVED]], 1
+; CHECK-SVE-NEXT:    [[A_REAL_EXT:%.*]] = sext <vscale x 16 x i8> [[A_REAL]] to <vscale x 16 x i32>
+; CHECK-SVE-NEXT:    [[A_IMAG_EXT:%.*]] = sext <vscale x 16 x i8> [[A_IMAG]] to <vscale x 16 x i32>
+; CHECK-SVE-NEXT:    [[B_REAL_EXT:%.*]] = sext <vscale x 16 x i8> [[B_REAL]] to <vscale x 16 x i32>
+; CHECK-SVE-NEXT:    [[B_IMAG_EXT:%.*]] = sext <vscale x 16 x i8> [[B_IMAG]] to <vscale x 16 x i32>
+; CHECK-SVE-NEXT:    [[REAL_MUL:%.*]] = mul <vscale x 16 x i32> [[B_REAL_EXT]], [[A_REAL_EXT]]
+; CHECK-SVE-NEXT:    [[REAL_MUL_REDUCED:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI]], <vscale x 16 x i32> [[REAL_MUL]])
+; CHECK-SVE-NEXT:    [[IMAG_MUL:%.*]] = mul <vscale x 16 x i32> [[B_IMAG_EXT]], [[A_IMAG_EXT]]
+; CHECK-SVE-NEXT:    [[PARTIAL_REDUCE_SUB]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[REAL_MUL_REDUCED]], <vscale x 16 x i32> [[IMAG_MUL]])
+; CHECK-SVE-NEXT:    br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]]
+; CHECK-SVE:       [[MIDDLE_BLOCK]]:
+; CHECK-SVE-NEXT:    [[TMP11:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[PARTIAL_REDUCE_SUB]])
+; CHECK-SVE-NEXT:    ret i32 [[TMP11]]
+;
+; CHECK-NOSVE-LABEL: define i32 @cdotp_i8_rot180(
+; CHECK-NOSVE-SAME: <vscale x 32 x i8> [[A:%.*]], <vscale x 32 x i8> [[B:%.*]]) {
+; CHECK-NOSVE-NEXT:  [[ENTRY:.*]]:
+; CHECK-NOSVE-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK-NOSVE:       [[VECTOR_BODY]]:
+; CHECK-NOSVE-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, %[[ENTRY]] ], [ [[PARTIAL_REDUCE_SUB:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NOSVE-NEXT:    [[A_DEINTERLEAVED:%.*]] = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> [[A]])
+; CHECK-NOSVE-NEXT:    [[B_DEINTERLEAVED:%.*]] = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> [[B]])
+; CHECK-NOSVE-NEXT:    [[A_REAL:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[A_DEINTERLEAVED]], 0
+; CHECK-NOSVE-NEXT:    [[A_IMAG:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[A_DEINTERLEAVED]], 1
+; CHECK-NOSVE-NEXT:    [[B_REAL:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[B_DEINTERLEAVED]], 0
+; CHECK-NOSVE-NEXT:    [[B_IMAG:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[B_DEINTERLEAVED]], 1
+; CHECK-NOSVE-NEXT:    [[A_REAL_EXT:%.*]] = sext <vscale x 16 x i8> [[A_REAL]] to <vscale x 16 x i32>
+; CHECK-NOSVE-NEXT:    [[A_IMAG_EXT:%.*]] = sext <vscale x 16 x i8> [[A_IMAG]] to <vscale x 16 x i32>
+; CHECK-NOSVE-NEXT:    [[B_REAL_EXT:%.*]] = sext <vscale x 16 x i8> [[B_REAL]] to <vscale x 16 x i32>
+; CHECK-NOSVE-NEXT:    [[B_IMAG_EXT:%.*]] = sext <vscale x 16 x i8> [[B_IMAG]] to <vscale x 16 x i32>
+; CHECK-NOSVE-NEXT:    [[REAL_MUL:%.*]] = mul <vscale x 16 x i32> [[B_REAL_EXT]], [[A_REAL_EXT]]
+; CHECK-NOSVE-NEXT:    [[REAL_MUL_REDUCED:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI]], <vscale x 16 x i32> [[REAL_MUL]])
+; CHECK-NOSVE-NEXT:    [[IMAG_MUL:%.*]] = mul <vscale x 16 x i32> [[B_IMAG_EXT]], [[A_IMAG_EXT]]
+; CHECK-NOSVE-NEXT:    [[PARTIAL_REDUCE_SUB]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[REAL_MUL_REDUCED]], <vscale x 16 x i32> [[IMAG_MUL]])
+; CHECK-NOSVE-NEXT:    br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]]
+; CHECK-NOSVE:       [[MIDDLE_BLOCK]]:
+; CHECK-NOSVE-NEXT:    [[TMP0:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[PARTIAL_REDUCE_SUB]])
+; CHECK-NOSVE-NEXT:    ret i32 [[TMP0]]
+;
+entry:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %entry
+  %vec.phi = phi <vscale x 4 x i32> [ zeroinitializer, %entry ], [ %partial.reduce.sub, %vector.body ]
+  %a.deinterleaved = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.v32i8(<vscale x 32 x i8> %a)
+  %b.deinterleaved = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.v32i8(<vscale x 32 x i8> %b)
+  %a.real = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %a.deinterleaved, 0
+  %a.imag = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %a.deinterleaved, 1
+  %b.real = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %b.deinterleaved, 0
+  %b.imag = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %b.deinterleaved, 1
+  %a.real.ext = sext <vscale x 16 x i8> %a.real to <vscale x 16 x i32>
+  %a.imag.ext = sext <vscale x 16 x i8> %a.imag to <vscale x 16 x i32>
+  %b.real.ext = sext <vscale x 16 x i8> %b.real to <vscale x 16 x i32>
+  %b.imag.ext = sext <vscale x 16 x i8> %b.imag to <vscale x 16 x i32>
+  %real.mul = mul <vscale x 16 x i32> %b.real.ext, %a.real.ext
+  %real.mul.reduced = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> %vec.phi, <vscale x 16 x i32> %real.mul)
+  %imag.mul = mul <vscale x 16 x i32> %b.imag.ext, %a.imag.ext
+  %partial.reduce.sub = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> %real.mul.reduced, <vscale x 16 x i32> %imag.mul)
+  br i1 true, label %middle.block, label %vector.body
+
+middle.block:                                     ; preds = %vector.body
+  %0 = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> %partial.reduce.sub)
+  ret i32 %0
+}
+
+define i32 @cdotp_i8_rot270(<vscale x 32 x i8> %a, <vscale x 32 x i8> %b) {
+; CHECK-SVE2-LABEL: define i32 @cdotp_i8_rot270(
+; CHECK-SVE2-SAME: <vscale x 32 x i8> [[A:%.*]], <vscale x 32 x i8> [[B:%.*]]) #[[ATTR0]] {
+; CHECK-SVE2-NEXT:  [[ENTRY:.*]]:
+; CHECK-SVE2-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK-SVE2:       [[VECTOR_BODY]]:
+; CHECK-SVE2-NEXT:    [[TMP11:%.*]] = phi <vscale x 8 x i32> [ zeroinitializer, %[[ENTRY]] ], [ [[TMP10:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-SVE2-NEXT:    [[TMP1:%.*]] = call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[A]], i64 0)
+; CHECK-SVE2-NEXT:    [[TMP2:%.*]] = call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[B]], i64 0)
+; CHECK-SVE2-NEXT:    [[TMP3:%.*]] = call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[A]], i64 16)
+; CHECK-SVE2-NEXT:    [[TMP4:%.*]] = call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[B]], i64 16)
+; CHECK-SVE2-NEXT:    [[TMP5:%.*]] = call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[TMP11]], i64 0)
+; CHECK-SVE2-NEXT:    [[TMP6:%.*]] = call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[TMP11]], i64 4)
+; CHECK-SVE2-NEXT:    [[TMP7:%.*]] = call <vscale x 4 x i32> @llvm.aarch64.sve.cdot.nxv4i32(<vscale x 4 x i32> [[TMP5]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], i32 270)
+; CHECK-SVE2-NEXT:    [[TMP8:%.*]] = call <vscale x 4 x i32> @llvm.aarch64.sve.cdot.nxv4i32(<vscale x 4 x i32> [[TMP6]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[TMP4]], i32 270)
+; CHECK-SVE2-NEXT:    [[TMP9:%.*]] = call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> poison, <vscale x 4 x i32> [[TMP7]], i64 0)
+; CHECK-SVE2-NEXT:    [[TMP10]] = call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> [[TMP9]], <vscale x 4 x i32> [[TMP8]], i64 4)
+; CHECK-SVE2-NEXT:    br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]]
+; CHECK-SVE2:       [[MIDDLE_BLOCK]]:
+; CHECK-SVE2-NEXT:    [[TMP0:%.*]] = call i32 @llvm.vector.reduce.add.nxv8i32(<vscale x 8 x i32> [[TMP10]])
+; CHECK-SVE2-NEXT:    ret i32 [[TMP0]]
+;
+; CHECK-SVE-LABEL: define i32 @cdotp_i8_rot270(
+; CHECK-SVE-SAME: <vscale x 32 x i8> [[A:%.*]], <vscale x 32 x i8> [[B:%.*]]) #[[ATTR0]] {
+; CHECK-SVE-NEXT:  [[ENTRY:.*]]:
+; CHECK-SVE-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK-SVE:       [[VECTOR_BODY]]:
+; CHECK-SVE-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, %[[ENTRY]] ], [ [[PARTIAL_REDUCE_SUB:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-SVE-NEXT:    [[A_DEINTERLEAVED:%.*]] = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> [[A]])
+; CHECK-SVE-NEXT:    [[B_DEINTERLEAVED:%.*]] = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> [[B]])
+; CHECK-SVE-NEXT:    [[A_REAL:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[A_DEINTERLEAVED]], 0
+; CHECK-SVE-NEXT:    [[A_IMAG:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[A_DEINTERLEAVED]], 1
+; CHECK-SVE-NEXT:    [[B_REAL:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[B_DEINTERLEAVED]], 0
+; CHECK-SVE-NEXT:    [[B_IMAG:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[B_DEINTERLEAVED]], 1
+; CHECK-SVE-NEXT:    [[A_REAL_EXT:%.*]] = sext <vscale x 16 x i8> [[A_REAL]] to <vscale x 16 x i32>
+; CHECK-SVE-NEXT:    [[A_IMAG_EXT:%.*]] = sext <vscale x 16 x i8> [[A_IMAG]] to <vscale x 16 x i32>
+; CHECK-SVE-NEXT:    [[B_REAL_EXT:%.*]] = sext <vscale x 16 x i8> [[B_REAL]] to <vscale x 16 x i32>
+; CHECK-SVE-NEXT:    [[B_IMAG_EXT:%.*]] = sext <vscale x 16 x i8> [[B_IMAG]] to <vscale x 16 x i32>
+; CHECK-SVE-NEXT:    [[REAL_MUL:%.*]] = mul <vscale x 16 x i32> [[B_REAL_EXT]], [[A_IMAG_EXT]]
+; CHECK-SVE-NEXT:    [[REAL_MUL_NEG:%.*]] = sub <vscale x 16 x i32> zeroinitializer, [[REAL_MUL]]
+; CHECK-SVE-NEXT:    [[REAL_MUL_REDUCED:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI]], <vscale x 16 x i32> [[REAL_MUL_NEG]])
+; CHECK-SVE-NEXT:    [[IMAG_MUL:%.*]] = mul <vscale x 16 x i32> [[B_IMAG_EXT]], [[A_REAL_EXT]]
+; CHECK-SVE-NEXT:    [[PARTIAL_REDUCE_SUB]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[REAL_MUL_REDUCED]], <vscale x 16 x i32> [[IMAG_MUL]])
+; CHECK-SVE-NEXT:    br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]]
+; CHECK-SVE:       [[MIDDLE_BLOCK]]:
+; CHECK-SVE-NEXT:    [[TMP11:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[PARTIAL_REDUCE_SUB]])
+; CHECK-SVE-NEXT:    ret i32 [[TMP11]]
+;
+; CHECK-NOSVE-LABEL: define i32 @cdotp_i8_rot270(
+; CHECK-NOSVE-SAME: <vscale x 32 x i8> [[A:%.*]], <vscale x 32 x i8> [[B:%.*]]) {
+; CHECK-NOSVE-NEXT:  [[ENTRY:.*]]:
+; CHECK-NOSVE-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK-NOSVE:       [[VECTOR_BODY]]:
+; CHECK-NOSVE-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, %[[ENTRY]] ], [ [[PARTIAL_REDUCE_SUB:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NOSVE-NEXT:    [[A_DEINTERLEAVED:%.*]] = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> [[A]])
+; CHECK-NOSVE-NEXT:    [[B_DEINTERLEAVED:%.*]] = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> [[B]])
+; CHECK-NOSVE-NEXT:    [[A_REAL:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[A_DEINTERLEAVED]], 0
+; CHECK-NOSVE-NEXT:    [[A_IMAG:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[A_DEINTERLEAVED]], 1
+; CHECK-NOSVE-NEXT:    [[B_REAL:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[B_DEINTERLEAVED]], 0
+; CHECK-NOSVE-NEXT:    [[B_IMAG:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[B_DEINTERLEAVED]], 1
+; CHECK-NOSVE-NEXT:    [[A_REAL_EXT:%.*]] = sext <vscale x 16 x i8> [[A_REAL]] to <vscale x 16 x i32>
+; CHECK-NOSVE-NEXT:    [[A_IMAG_EXT:%.*]] = sext <vscale x 16 x i8> [[A_IMAG]] to <vscale x 16 x i32>
+; CHECK-NOSVE-NEXT:    [[B_REAL_EXT:%.*]] = sext <vscale x 16 x i8> [[B_REAL]] to <vscale x 16 x i32>
+; CHECK-NOSVE-NEXT:    [[B_IMAG_EXT:%.*]] = sext <vscale x 16 x i8> [[B_IMAG]] to <vscale x 16 x i32>
+; CHECK-NOSVE-NEXT:    [[REAL_MUL:%.*]] = mul <vscale x 16 x i32> [[B_REAL_EXT]], [[A_IMAG_EXT]]
+; CHECK-NOSVE-NEXT:    [[REAL_MUL_NEG:%.*]] = sub <vscale x 16 x i32> zeroinitializer, [[REAL_MUL]]
+; CHECK-NOSVE-NEXT:    [[REAL_MUL_REDUCED:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI]], <vscale x 16 x i32> [[REAL_MUL_NEG]])
+; CHECK-NOSVE-NEXT:    [[IMAG_MUL:%.*]] = mul <vscale x 16 x i32> [[B_IMAG_EXT]], [[A_REAL_EXT]]
+; CHECK-NOSVE-NEXT:    [[PARTIAL_REDUCE_SUB]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[REAL_MUL_REDUCED]], <vscale x 16 x i32> [[IMAG_MUL]])
+; CHECK-NOSVE-NEXT:    br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]]
+; CHECK-NOSVE:       [[MIDDLE_BLOCK]]:
+; CHECK-NOSVE-NEXT:    [[TMP0:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[PARTIAL_REDUCE_SUB]])
+; CHECK-NOSVE-NEXT:    ret i32 [[TMP0]]
+;
+entry:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %entry
+  %vec.phi = phi <vscale x 4 x i32> [ zeroinitializer, %entry ], [ %partial.reduce.sub, %vector.body ]
+  %a.deinterleaved = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.v32i8(<vscale x 32 x i8> %a)
+  %b.deinterleaved = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.v32i8(<vscale x 32 x i8> %b)
+  %a.real = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %a.deinterleaved, 0
+  %a.imag = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %a.deinterleaved, 1
+  %b.real = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %b.deinterleaved, 0
+  %b.imag = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %b.deinterleaved, 1
+  %a.real.ext = sext <vscale x 16 x i8> %a.real to <vscale x 16 x i32>
+  %a.imag.ext = sext <vscale x 16 x i8> %a.imag to <vscale x 16 x i32>
+  %b.real.ext = sext <vscale x 16 x i8> %b.real to <vscale x 16 x i32>
+  %b.imag.ext = sext <vscale x 16 x i8> %b.imag to <vscale x 16 x i32>
+  %real.mul = mul <vscale x 16 x i32> %b.real.ext, %a.imag.ext
+  %real.mul.neg = sub <vscale x 16 x i32> zeroinitializer, %real.mul
+  %real.mul.reduced = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> %vec.phi, <vscale x 16 x i32> %real.mul.neg)
+  %imag.mul = mul <vscale x 16 x i32> %b.imag.ext, %a.real.ext
+  %partial.reduce.sub = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> %real.mul.reduced, <vscale x 16 x i32> %imag.mul)
+  br i1 true, label %middle.block, label %vector.body
+
+middle.block:                                     ; preds = %vector.body
+  %0 = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> %partial.reduce.sub)
+  ret i32 %0
+}
+
+define i64 @cdotp_i16_rot0(<vscale x 16 x i16> %a, <vscale x 16 x i16> %b) {
+; CHECK-SVE2-LABEL: define i64 @cdotp_i16_rot0(
+; CHECK-SVE2-SAME: <vscale x 16 x i16> [[A:%.*]], <vscale x 16 x i16> [[B:%.*]]) #[[ATTR0]] {
+; CHECK-SVE2-NEXT:  [[ENTRY:.*]]:
+; CHECK-SVE2-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK-SVE2:       [[VECTOR_BODY]]:
+; CHECK-SVE2-NEXT:    [[TMP11:%.*]] = phi <vscale x 4 x i64> [ zeroinitializer, %[[ENTRY]] ], [ [[TMP10:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-SVE2-NEXT:    [[TMP1:%.*]] = call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[A]], i64 0)
+; CHECK-SVE2-NEXT:    [[TMP2:%.*]] = call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[B]], i64 0)
+; CHECK-SVE2-NEXT:    [[TMP3:%.*]] = call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[A]], i64 8)
+; CHECK-SVE2-NEXT:    [[TMP4:%.*]] = call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[B]], i64 8)
+; CHECK-SVE2-NEXT:    [[TMP5:%.*]] = call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv4i64(<vscale x 4 x i64> [[TMP11]], i64 0)
+; CHECK-SVE2-NEXT:    [[TMP6:%.*]] = call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv4i64(<vscale x 4 x i64> [[TMP11]], i64 2)
+; CHECK-SVE2-NEXT:    [[TMP7:%.*]] = call <vscale x 2 x i64> @llvm.aarch64.sve.cdot.nxv2i64(<vscale x 2 x i64> [[TMP5]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], i32 0)
+; CHECK-SVE2-NEXT:    [[TMP8:%.*]] = call <vscale x 2 x i64> @llvm.aarch64.sve.cdot.nxv2i64(<vscale x 2 x i64> [[TMP6]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[TMP4]], i32 0)
+; CHECK-SVE2-NEXT:    [[TMP9:%.*]] = call <vscale x 4 x i64> @llvm.vector.insert.nxv4i64.nxv2i64(<vscale x 4 x i64> poison, <vscale x 2 x i64> [[TMP7]], i64 0)
+; CHECK-SVE2-NEXT:    [[TMP10]] = call <vscale x 4 x i64> @llvm.vector.insert.nxv4i64.nxv2i64(<vscale x 4 x i64> [[TMP9]], <vscale x 2 x i64> [[TMP8]], i64 2)
+; CHECK-SVE2-NEXT:    br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]]
+; CHECK-SVE2:       [[MIDDLE_BLOCK]]:
+; CHECK-SVE2-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vector.reduce.add.nxv4i64(<vscale x 4 x i64> [[TMP10]])
+; CHECK-SVE2-NEXT:    ret i64 [[TMP0]]
+;
+; CHECK-SVE-LABEL: define i64 @cdotp_i16_rot0(
+; CHECK-SVE-SAME: <vscale x 16 x i16> [[A:%.*]], <vscale x 16 x i16> [[B:%.*]]) #[[ATTR0]] {
+; CHECK-SVE-NEXT:  [[ENTRY:.*]]:
+; CHECK-SVE-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK-SVE:       [[VECTOR_BODY]]:
+; CHECK-SVE-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 2 x i64> [ zeroinitializer, %[[ENTRY]] ], [ [[PARTIAL_REDUCE_SUB:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-SVE-NEXT:    [[A_DEINTERLEAVED:%.*]] = call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.vector.deinterleave2.nxv16i16(<vscale x 16 x i16> [[A]])
+; CHECK-SVE-NEXT:    [[B_DEINTERLEAVED:%.*]] = call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.vector.deinterleave2.nxv16i16(<vscale x 16 x i16> [[B]])
+; CHECK-SVE-NEXT:    [[A_REAL:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[A_DEINTERLEAVED]], 0
+; CHECK-SVE-NEXT:    [[A_IMAG:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[A_DEINTERLEAVED]], 1
+; CHECK-SVE-NEXT:    [[B_REAL:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[B_DEINTERLEAVED]], 0
+; CHECK-SVE-NEXT:    [[B_IMAG:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[B_DEINTERLEAVED]], 1
+; CHECK-SVE-NEXT:    [[A_REAL_EXT:%.*]] = sext <vscale x 8 x i16> [[A_REAL]] to <vscale x 8 x i64>
+; CHECK-SVE-NEXT:    [[A_IMAG_EXT:%.*]] = sext <vscale x 8 x i16> [[A_IMAG]] to <vscale x 8 x i64>
+; CHECK-SVE-NEXT:    [[B_REAL_EXT:%.*]] = sext <vscale x 8 x i16> [[B_REAL]] to <vscale x 8 x i64>
+; CHECK-SVE-NEXT:    [[B_IMAG_EXT:%.*]] = sext <vscale x 8 x i16> [[B_IMAG]] to <vscale x 8 x i64>
+; CHECK-SVE-NEXT:    [[REAL_MUL:%.*]] = mul <vscale x 8 x i64> [[B_REAL_EXT]], [[A_REAL_EXT]]
+; CHECK-SVE-NEXT:    [[REAL_MUL_REDUCED:%.*]] = call <vscale x 2 x i64> @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64(<vscale x 2 x i64> [[VEC_PHI]], <vscale x 8 x i64> [[REAL_MUL]])
+; CHECK-SVE-NEXT:    [[IMAG_MUL:%.*]] = mul <vscale x 8 x i64> [[B_IMAG_EXT]], [[A_IMAG_EXT]]
+; CHECK-SVE-NEXT:    [[IMAG_MUL_NEG:%.*]] = sub <vscale x 8 x i64> zeroinitializer, [[IMAG_MUL]]
+; CHECK-SVE-NEXT:    [[PARTIAL_REDUCE_SUB]] = call <vscale x 2 x i64> @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64(<vscale x 2 x i64> [[REAL_MUL_REDUCED]], <vscale x 8 x i64> [[IMAG_MUL_NEG]])
+; CHECK-SVE-NEXT:    br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]]
+; CHECK-SVE:       [[MIDDLE_BLOCK]]:
+; CHECK-SVE-NEXT:    [[TMP11:%.*]] = call i64 @llvm.vector.reduce.add.nxv2i64(<vscale x 2 x i64> [[PARTIAL_REDUCE_SUB]])
+; CHECK-SVE-NEXT:    ret i64 [[TMP11]]
+;
+; CHECK-NOSVE-LABEL: define i64 @cdotp_i16_rot0(
+; CHECK-NOSVE-SAME: <vscale x 16 x i16> [[A:%.*]], <vscale x 16 x i16> [[B:%.*]]) {
+; CHECK-NOSVE-NEXT:  [[ENTRY:.*]]:
+; CHECK-NOSVE-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK-NOSVE:       [[VECTOR_BODY]]:
+; CHECK-NOSVE-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 2 x i64> [ zeroinitializer, %[[ENTRY]] ], [ [[PARTIAL_REDUCE_SUB:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NOSVE-NEXT:    [[A_DEINTERLEAVED:%.*]] = call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.vector.deinterleave2.nxv16i16(<vscale x 16 x i16> [[A]])
+; CHECK-NOSVE-NEXT:    [[B_DEINTERLEAVED:%.*]] = call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.vector.deinterleave2.nxv16i16(<vscale x 16 x i16> [[B]])
+; CHECK-NOSVE-NEXT:    [[A_REAL:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[A_DEINTERLEAVED]], 0
+; CHECK-NOSVE-NEXT:    [[A_IMAG:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[A_DEINTERLEAVED]], 1
+; CHECK-NOSVE-NEXT:    [[B_REAL:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[B_DEINTERLEAVED]], 0
+; CHECK-NOSVE-NEXT:    [[B_IMAG:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[B_DEINTERLEAVED]], 1
+; CHECK-NOSVE-NEXT:    [[A_REAL_EXT:%.*]] = sext <vscale x 8 x i16> [[A_REAL]] to <vscale x 8 x i64>
+; CHECK-NOSVE-NEXT:    [[A_IMAG_EXT:%.*]] = sext <vscale x 8 x i16> [[A_IMAG]] to <vscale x 8 x i64>
+; CHECK-NOSVE-NEXT:    [[B_REAL_EXT:%.*]] = sext <vscale x 8 x i16> [[B_REAL]] to <vscale x 8 x i64>
+; CHECK-NOSVE-NEXT:    [[B_IMAG_EXT:%.*]] = sext <vscale x 8 x i16> [[B_IMAG]] to <vscale x 8 x i64>
+; CHECK-NOSVE-NEXT:    [[REAL_MUL:%.*]] = mul <vscale x 8 x i64> [[B_REAL_EXT]], [[A_REAL_EXT]]
+; CHECK-NOSVE-NEXT:    [[REAL_MUL_REDUCED:%.*]] = call <vscale x 2 x i64> @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64(<vscale x 2 x i64> [[VEC_PHI]], <vscale x 8 x i64> [[REAL_MUL]])
+; CHECK-NOSVE-NEXT:    [[IMAG_MUL:%.*]] = mul <vscale x 8 x i64> [[B_IMAG_EXT]], [[A_IMAG_EXT]]
+; CHECK-NOSVE-NEXT:    [[IMAG_MUL_NEG:%.*]] = sub <vscale x 8 x i64> zeroinitializer, [[IMAG_MUL]]
+; CHECK-NOSVE-NEXT:    [[PARTIAL_REDUCE_SUB]] = call <vscale x 2 x i64> @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64(<vscale x 2 x i64> [[REAL_MUL_REDUCED]], <vscale x 8 x i64> [[IMAG_MUL_NEG]])
+; CHECK-NOSVE-NEXT:    br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]]
+; CHECK-NOSVE:       [[MIDDLE_BLOCK]]:
+; CHECK-NOSVE-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vector.reduce.add.nxv2i64(<vscale x 2 x i64> [[PARTIAL_REDUCE_SUB]])
+; CHECK-NOSVE-NEXT:    ret i64 [[TMP0]]
+;
+entry:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %entry
+  %vec.phi = phi <vscale x 2 x i64> [ zeroinitializer, %entry ], [ %partial.reduce.sub, %vector.body ]
+  %a.deinterleaved = call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.vector.deinterleave2.v16i16(<vscale x 16 x i16> %a)
+  %b.deinterleaved = call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.vector.deinterleave2.v16i16(<vscale x 16 x i16> %b)
+  %a.real = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %a.deinterleaved, 0
+  %a.imag = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %a.deinterleaved, 1
+  %b.real = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %b.deinterleaved, 0
+  %b.imag = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %b.deinterleaved, 1
+  %a.real.ext = sext <vscale x 8 x i16> %a.real to <vscale x 8 x i64>
+  %a.imag.ext = sext <vscale x 8 x i16> %a.imag to <vscale x 8 x i64>
+  %b.real.ext = sext <vscale x 8 x i16> %b.real to <vscale x 8 x i64>
+  %b.imag.ext = sext <vscale x 8 x i16> %b.imag to <vscale x 8 x i64>
+  %real.mul = mul <vscale x 8 x i64> %b.real.ext, %a.real.ext
+  %real.mul.reduced = call <vscale x 2 x i64> @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64(<vscale x 2 x i64> %vec.phi, <vscale x 8 x i64> %real.mul)
+  %imag.mul = mul <vscale x 8 x i64> %b.imag.ext, %a.imag.ext
+  %imag.mul.neg = sub <vscale x 8 x i64> zeroinitializer, %imag.mul
+  %partial.reduce.sub = call <vscale x 2 x i64> @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64(<vscale x 2 x i64> %real.mul.reduced, <vscale x 8 x i64> %imag.mul.neg)
+  br i1 true, label %middle.block, label %vector.body
+
+middle.block:                                     ; preds = %vector.body
+  %0 = call i64 @llvm.vector.reduce.add.nxv2i64(<vscale x 2 x i64> %partial.reduce.sub)
+  ret i64 %0
+}
+
+define i64 @cdotp_i16_rot90(<vscale x 16 x i16> %a, <vscale x 16 x i16> %b) {
+; CHECK-SVE2-LABEL: define i64 @cdotp_i16_rot90(
+; CHECK-SVE2-SAME: <vscale x 16 x i16> [[A:%.*]], <vscale x 16 x i16> [[B:%.*]]) #[[ATTR0]] {
+; CHECK-SVE2-NEXT:  [[ENTRY:.*]]:
+; CHECK-SVE2-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK-SVE2:       [[VECTOR_BODY]]:
+; CHECK-SVE2-NEXT:    [[TMP11:%.*]] = phi <vscale x 4 x i64> [ zeroinitializer, %[[ENTRY]] ], [ [[TMP10:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-SVE2-NEXT:    [[TMP1:%.*]] = call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[A]], i64 0)
+; CHECK-SVE2-NEXT:    [[TMP2:%.*]] = call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[B]], i64 0)
+; CHECK-SVE2-NEXT:    [[TMP3:%.*]] = call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[A]], i64 8)
+; CHECK-SVE2-NEXT:    [[TMP4:%.*]] = call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[B]], i64 8)
+; CHECK-SVE2-NEXT:    [[TMP5:%.*]] = call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv4i64(<vscale x 4 x i64> [[TMP11]], i64 0)
+; CHECK-SVE2-NEXT:    [[TMP6:%.*]] = call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv4i64(<vscale x 4 x i64> [[TMP11]], i64 2)
+; CHECK-SVE2-NEXT:    [[TMP7:%.*]] = call <vscale x 2 x i64> @llvm.aarch64.sve.cdot.nxv2i64(<vscale x 2 x i64> [[TMP5]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], i32 90)
+; CHECK-SVE2-NEXT:    [[TMP8:%.*]] = call <vscale x 2 x i64> @llvm.aarch64.sve.cdot.nxv2i64(<vscale x 2 x i64> [[TMP6]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[TMP4]], i32 90)
+; CHECK-SVE2-NEXT:    [[TMP9:%.*]] = call <vscale x 4 x i64> @llvm.vector.insert.nxv4i64.nxv2i64(<vscale x 4 x i64> poison, <vscale x 2 x i64> [[TMP7]], i64 0)
+; CHECK-SVE2-NEXT:    [[TMP10]] = call <vscale x 4 x i64> @llvm.vector.insert.nxv4i64.nxv2i64(<vscale x 4 x i64> [[TMP9]], <vscale x 2 x i64> [[TMP8]], i64 2)
+; CHECK-SVE2-NEXT:    br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]]
+; CHECK-SVE2:       [[MIDDLE_BLOCK]]:
+; CHECK-SVE2-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vector.reduce.add.nxv4i64(<vscale x 4 x i64> [[TMP10]])
+; CHECK-SVE2-NEXT:    ret i64 [[TMP0]]
+;
+; CHECK-SVE-LABEL: define i64 @cdotp_i16_rot90(
+; CHECK-SVE-SAME: <vscale x 16 x i16> [[A:%.*]], <vscale x 16 x i16> [[B:%.*]]) #[[ATTR0]] {
+; CHECK-SVE-NEXT:  [[ENTRY:.*]]:
+; CHECK-SVE-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK-SVE:       [[VECTOR_BODY]]:
+; CHECK-SVE-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 2 x i64> [ zeroinitializer, %[[ENTRY]] ], [ [[PARTIAL_REDUCE_SUB:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-SVE-NEXT:    [[A_DEINTERLEAVED:%.*]] = call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.vector.deinterleave2.nxv16i16(<vscale x 16 x i16> [[A]])
+; CHECK-SVE-NEXT:    [[B_DEINTERLEAVED:%.*]] = call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.vector.deinterleave2.nxv16i16(<vscale x 16 x i16> [[B]])
+; CHECK-SVE-NEXT:    [[A_REAL:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[A_DEINTERLEAVED]], 0
+; CHECK-SVE-NEXT:    [[A_IMAG:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[A_DEINTERLEAVED]], 1
+; CHECK-SVE-NEXT:    [[B_REAL:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[B_DEINTERLEAVED]], 0
+; CHECK-SVE-NEXT:    [[B_IMAG:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[B_DEINTERLEAVED]], 1
+; CHECK-SVE-NEXT:    [[A_REAL_EXT:%.*]] = sext <vscale x 8 x i16> [[A_REAL]] to <vscale x 8 x i64>
+; CHECK-SVE-NEXT:    [[A_IMAG_EXT:%.*]] = sext <vscale x 8 x i16> [[A_IMAG]] to <vscale x 8 x i64>
+; CHECK-SVE-NEXT:    [[B_REAL_EXT:%.*]] = sext <vscale x 8 x i16> [[B_REAL]] to <vscale x 8 x i64>
+; CHECK-SVE-NEXT:    [[B_IMAG_EXT:%.*]] = sext <vscale x 8 x i16> [[B_IMAG]] to <vscale x 8 x i64>
+; CHECK-SVE-NEXT:    [[REAL_MUL:%.*]] = mul <vscale x 8 x i64> [[B_REAL_EXT]], [[A_IMAG_EXT]]
+; CHECK-SVE-NEXT:    [[REAL_MUL_REDUCED:%.*]] = call <vscale x 2 x i64> @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64(<vscale x 2 x i64> [[VEC_PHI]], <vscale x 8 x i64> [[REAL_MUL]])
+; CHECK-SVE-NEXT:    [[IMAG_MUL:%.*]] = mul <vscale x 8 x i64> [[B_IMAG_EXT]], [[A_REAL_EXT]]
+; CHECK-SVE-NEXT:    [[PARTIAL_REDUCE_SUB]] = call <vscale x 2 x i64> @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64(<vscale x 2 x i64> [[REAL_MUL_REDUCED]], <vscale x 8 x i64> [[IMAG_MUL]])
+; CHECK-SVE-NEXT:    br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]]
+; CHECK-SVE:       [[MIDDLE_BLOCK]]:
+; CHECK-SVE-NEXT:    [[TMP11:%.*]] = call i64 @llvm.vector.reduce.add.nxv2i64(<vscale x 2 x i64> [[PARTIAL_REDUCE_SUB]])
+; CHECK-SVE-NEXT:    ret i64 [[TMP11]]
+;
+; CHECK-NOSVE-LABEL: define i64 @cdotp_i16_rot90(
+; CHECK-NOSVE-SAME: <vscale x 16 x i16> [[A:%.*]], <vscale x 16 x i16> [[B:%.*]]) {
+; CHECK-NOSVE-NEXT:  [[ENTRY:.*]]:
+; CHECK-NOSVE-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK-NOSVE:       [[VECTOR_BODY]]:
+; CHECK-NOSVE-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 2 x i64> [ zeroinitializer, %[[ENTRY]] ], [ [[PARTIAL_REDUCE_SUB:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NOSVE-NEXT:    [[A_DEINTERLEAVED:%.*]] = call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.vector.deinterleave2.nxv16i16(<vscale x 16 x i16> [[A]])
+; CHECK-NOSVE-NEXT:    [[B_DEINTERLEAVED:%.*]] = call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.vector.deinterleave2.nxv16i16(<vscale x 16 x i16> [[B]])
+; CHECK-NOSVE-NEXT:    [[A_REAL:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[A_DEINTERLEAVED]], 0
+; CHECK-NOSVE-NEXT:    [[A_IMAG:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[A_DEINTERLEAVED]], 1
+; CHECK-NOSVE-NEXT:    [[B_REAL:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[B_DEINTERLEAVED]], 0
+; CHECK-NOSVE-NEXT:    [[B_IMAG:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[B_DEINTERLEAVED]], 1
+; CHECK-NOSVE-NEXT:    [[A_REAL_EXT:%.*]] = sext <vscale x 8 x i16> [[A_REAL]] to <vscale x 8 x i64>
+; CHECK-NOSVE-NEXT:    [[A_IMAG_EXT:%.*]] = sext <vscale x 8 x i16> [[A_IMAG]] to <vscale x 8 x i64>
+; CHECK-NOSVE-NEXT:    [[B_REAL_EXT:%.*]] = sext <vscale x 8 x i16> [[B_REAL]] to <vscale x 8 x i64>
+; CHECK-NOSVE-NEXT:    [[B_IMAG_EXT:%.*]] = sext <vscale x 8 x i16> [[B_IMAG]] to <vscale x 8 x i64>
+; CHECK-NOSVE-NEXT:    [[REAL_MUL:%.*]] = mul <vscale x 8 x i64> [[B_REAL_EXT]], [[A_IMAG_EXT]]
+; CHECK-NOSVE-NEXT:    [[REAL_MUL_REDUCED:%.*]] = call <vscale x 2 x i64> @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64(<vscale x 2 x i64> [[VEC_PHI]], <vscale x 8 x i64> [[REAL_MUL]])
+; CHECK-NOSVE-NEXT:    [[IMAG_MUL:%.*]] = mul <vscale x 8 x i64> [[B_IMAG_EXT]], [[A_REAL_EXT]]
+; CHECK-NOSVE-NEXT:    [[PARTIAL_REDUCE_SUB]] = call <vscale x 2 x i64> @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64(<vscale x 2 x i64> [[REAL_MUL_REDUCED]], <vscale x 8 x i64> [[IMAG_MUL]])
+; CHECK-NOSVE-NEXT:    br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]]
+; CHECK-NOSVE:       [[MIDDLE_BLOCK]]:
+; CHECK-NOSVE-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vector.reduce.add.nxv2i64(<vscale x 2 x i64> [[PARTIAL_REDUCE_SUB]])
+; CHECK-NOSVE-NEXT:    ret i64 [[TMP0]]
+;
+entry:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %entry
+  %vec.phi = phi <vscale x 2 x i64> [ zeroinitializer, %entry ], [ %partial.reduce.sub, %vector.body ]
+  %a.deinterleaved = call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.vector.deinterleave2.v16i16(<vscale x 16 x i16> %a)
+  %b.deinterleaved = call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.vector.deinterleave2.v16i16(<vscale x 16 x i16> %b)
+  %a.real = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %a.deinterleaved, 0
+  %a.imag = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %a.deinterleaved, 1
+  %b.real = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %b.deinterleaved, 0
+  %b.imag = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %b.deinterleaved, 1
+  %a.real.ext = sext <vscale x 8 x i16> %a.real to <vscale x 8 x i64>
+  %a.imag.ext = sext <vscale x 8 x i16> %a.imag to <vscale x 8 x i64>
+  %b.real.ext = sext <vscale x 8 x i16> %b.real to <vscale x 8 x i64>
+  %b.imag.ext = sext <vscale x 8 x i16> %b.imag to <vscale x 8 x i64>
+  %real.mul = mul <vscale x 8 x i64> %b.real.ext, %a.imag.ext
+  %real.mul.reduced = call <vscale x 2 x i64> @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64(<vscale x 2 x i64> %vec.phi, <vscale x 8 x i64> %real.mul)
+  %imag.mul = mul <vscale x 8 x i64> %b.imag.ext, %a.real.ext
+  %partial.reduce.sub = call <vscale x 2 x i64> @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64(<vscale x 2 x i64> %real.mul.reduced, <vscale x 8 x i64> %imag.mul)
+  br i1 true, label %middle.block, label %vector.body
+
+middle.block:                                     ; preds = %vector.body
+  %0 = call i64 @llvm.vector.reduce.add.nxv2i64(<vscale x 2 x i64> %partial.reduce.sub)
+  ret i64 %0
+}
+
+define i64 @cdotp_i16_rot180(<vscale x 16 x i16> %a, <vscale x 16 x i16> %b) {
+; CHECK-SVE2-LABEL: define i64 @cdotp_i16_rot180(
+; CHECK-SVE2-SAME: <vscale x 16 x i16> [[A:%.*]], <vscale x 16 x i16> [[B:%.*]]) #[[ATTR0]] {
+; CHECK-SVE2-NEXT:  [[ENTRY:.*]]:
+; CHECK-SVE2-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK-SVE2:       [[VECTOR_BODY]]:
+; CHECK-SVE2-NEXT:    [[TMP11:%.*]] = phi <vscale x 4 x i64> [ zeroinitializer, %[[ENTRY]] ], [ [[TMP10:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-SVE2-NEXT:    [[TMP1:%.*]] = call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[A]], i64 0)
+; CHECK-SVE2-NEXT:    [[TMP2:%.*]] = call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[B]], i64 0)
+; CHECK-SVE2-NEXT:    [[TMP3:%.*]] = call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[A]], i64 8)
+; CHECK-SVE2-NEXT:    [[TMP4:%.*]] = call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[B]], i64 8)
+; CHECK-SVE2-NEXT:    [[TMP5:%.*]] = call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv4i64(<vscale x 4 x i64> [[TMP11]], i64 0)
+; CHECK-SVE2-NEXT:    [[TMP6:%.*]] = call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv4i64(<vscale x 4 x i64> [[TMP11]], i64 2)
+; CHECK-SVE2-NEXT:    [[TMP7:%.*]] = call <vscale x 2 x i64> @llvm.aarch64.sve.cdot.nxv2i64(<vscale x 2 x i64> [[TMP5]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], i32 180)
+; CHECK-SVE2-NEXT:    [[TMP8:%.*]] = call <vscale x 2 x i64> @llvm.aarch64.sve.cdot.nxv2i64(<vscale x 2 x i64> [[TMP6]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[TMP4]], i32 180)
+; CHECK-SVE2-NEXT:    [[TMP9:%.*]] = call <vscale x 4 x i64> @llvm.vector.insert.nxv4i64.nxv2i64(<vscale x 4 x i64> poison, <vscale x 2 x i64> [[TMP7]], i64 0)
+; CHECK-SVE2-NEXT:    [[TMP10]] = call <vscale x 4 x i64> @llvm.vector.insert.nxv4i64.nxv2i64(<vscale x 4 x i64> [[TMP9]], <vscale x 2 x i64> [[TMP8]], i64 2)
+; CHECK-SVE2-NEXT:    br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]]
+; CHECK-SVE2:       [[MIDDLE_BLOCK]]:
+; CHECK-SVE2-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vector.reduce.add.nxv4i64(<vscale x 4 x i64> [[TMP10]])
+; CHECK-SVE2-NEXT:    ret i64 [[TMP0]]
+;
+; CHECK-SVE-LABEL: define i64 @cdotp_i16_rot180(
+; CHECK-SVE-SAME: <vscale x 16 x i16> [[A:%.*]], <vscale x 16 x i16> [[B:%.*]]) #[[ATTR0]] {
+; CHECK-SVE-NEXT:  [[ENTRY:.*]]:
+; CHECK-SVE-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK-SVE:       [[VECTOR_BODY]]:
+; CHECK-SVE-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 2 x i64> [ zeroinitializer, %[[ENTRY]] ], [ [[PARTIAL_REDUCE_SUB:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-SVE-NEXT:    [[A_DEINTERLEAVED:%.*]] = call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.vector.deinterleave2.nxv16i16(<vscale x 16 x i16> [[A]])
+; CHECK-SVE-NEXT:    [[B_DEINTERLEAVED:%.*]] = call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.vector.deinterleave2.nxv16i16(<vscale x 16 x i16> [[B]])
+; CHECK-SVE-NEXT:    [[A_REAL:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[A_DEINTERLEAVED]], 0
+; CHECK-SVE-NEXT:    [[A_IMAG:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[A_DEINTERLEAVED]], 1
+; CHECK-SVE-NEXT:    [[B_REAL:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[B_DEINTERLEAVED]], 0
+; CHECK-SVE-NEXT:    [[B_IMAG:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[B_DEINTERLEAVED]], 1
+; CHECK-SVE-NEXT:    [[A_REAL_EXT:%.*]] = sext <vscale x 8 x i16> [[A_REAL]] to <vscale x 8 x i64>
+; CHECK-SVE-NEXT:    [[A_IMAG_EXT:%.*]] = sext <vscale x 8 x i16> [[A_IMAG]] to <vscale x 8 x i64>
+; CHECK-SVE-NEXT:    [[B_REAL_EXT:%.*]] = sext <vscale x 8 x i16> [[B_REAL]] to <vscale x 8 x i64>
+; CHECK-SVE-NEXT:    [[B_IMAG_EXT:%.*]] = sext <vscale x 8 x i16> [[B_IMAG]] to <vscale x 8 x i64>
+; CHECK-SVE-NEXT:    [[REAL_MUL:%.*]] = mul <vscale x 8 x i64> [[B_REAL_EXT]], [[A_REAL_EXT]]
+; CHECK-SVE-NEXT:    [[REAL_MUL_REDUCED:%.*]] = call <vscale x 2 x i64> @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64(<vscale x 2 x i64> [[VEC_PHI]], <vscale x 8 x i64> [[REAL_MUL]])
+; CHECK-SVE-NEXT:    [[IMAG_MUL:%.*]] = mul <vscale x 8 x i64> [[B_IMAG_EXT]], [[A_IMAG_EXT]]
+; CHECK-SVE-NEXT:    [[PARTIAL_REDUCE_SUB]] = call <vscale x 2 x i64> @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64(<vscale x 2 x i64> [[REAL_MUL_REDUCED]], <vscale x 8 x i64> [[IMAG_MUL]])
+; CHECK-SVE-NEXT:    br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]]
+; CHECK-SVE:       [[MIDDLE_BLOCK]]:
+; CHECK-SVE-NEXT:    [[TMP11:%.*]] = call i64 @llvm.vector.reduce.add.nxv2i64(<vscale x 2 x i64> [[PARTIAL_REDUCE_SUB]])
+; CHECK-SVE-NEXT:    ret i64 [[TMP11]]
+;
+; CHECK-NOSVE-LABEL: define i64 @cdotp_i16_rot180(
+; CHECK-NOSVE-SAME: <vscale x 16 x i16> [[A:%.*]], <vscale x 16 x i16> [[B:%.*]]) {
+; CHECK-NOSVE-NEXT:  [[ENTRY:.*]]:
+; CHECK-NOSVE-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK-NOSVE:       [[VECTOR_BODY]]:
+; CHECK-NOSVE-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 2 x i64> [ zeroinitializer, %[[ENTRY]] ], [ [[PARTIAL_REDUCE_SUB:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NOSVE-NEXT:    [[A_DEINTERLEAVED:%.*]] = call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.vector.deinterleave2.nxv16i16(<vscale x 16 x i16> [[A]])
+; CHECK-NOSVE-NEXT:    [[B_DEINTERLEAVED:%.*]] = call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.vector.deinterleave2.nxv16i16(<vscale x 16 x i16> [[B]])
+; CHECK-NOSVE-NEXT:    [[A_REAL:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[A_DEINTERLEAVED]], 0
+; CHECK-NOSVE-NEXT:    [[A_IMAG:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[A_DEINTERLEAVED]], 1
+; CHECK-NOSVE-NEXT:    [[B_REAL:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[B_DEINTERLEAVED]], 0
+; CHECK-NOSVE-NEXT:    [[B_IMAG:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[B_DEINTERLEAVED]], 1
+; CHECK-NOSVE-NEXT:    [[A_REAL_EXT:%.*]] = sext <vscale x 8 x i16> [[A_REAL]] to <vscale x 8 x i64>
+; CHECK-NOSVE-NEXT:    [[A_IMAG_EXT:%.*]] = sext <vscale x 8 x i16> [[A_IMAG]] to <vscale x 8 x i64>
+; CHECK-NOSVE-NEXT:    [[B_REAL_EXT:%.*]] = sext <vscale x 8 x i16> [[B_REAL]] to <vscale x 8 x i64>
+; CHECK-NOSVE-NEXT:    [[B_IMAG_EXT:%.*]] = sext <vscale x 8 x i16> [[B_IMAG]] to <vscale x 8 x i64>
+; CHECK-NOSVE-NEXT:    [[REAL_MUL:%.*]] = mul <vscale x 8 x i64> [[B_REAL_EXT]], [[A_REAL_EXT]]
+; CHECK-NOSVE-NEXT:    [[REAL_MUL_REDUCED:%.*]] = call <vscale x 2 x i64> @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64(<vscale x 2 x i64> [[VEC_PHI]], <vscale x 8 x i64> [[REAL_MUL]])
+; CHECK-NOSVE-NEXT:    [[IMAG_MUL:%.*]] = mul <vscale x 8 x i64> [[B_IMAG_EXT]], [[A_IMAG_EXT]]
+; CHECK-NOSVE-NEXT:    [[PARTIAL_REDUCE_SUB]] = call <vscale x 2 x i64> @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64(<vscale x 2 x i64> [[REAL_MUL_REDUCED]], <vscale x 8 x i64> [[IMAG_MUL]])
+; CHECK-NOSVE-NEXT:    br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]]
+; CHECK-NOSVE:       [[MIDDLE_BLOCK]]:
+; CHECK-NOSVE-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vector.reduce.add.nxv2i64(<vscale x 2 x i64> [[PARTIAL_REDUCE_SUB]])
+; CHECK-NOSVE-NEXT:    ret i64 [[TMP0]]
+;
+entry:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %entry
+  %vec.phi = phi <vscale x 2 x i64> [ zeroinitializer, %entry ], [ %partial.reduce.sub, %vector.body ]
+  %a.deinterleaved = call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.vector.deinterleave2.v16i16(<vscale x 16 x i16> %a)
+  %b.deinterleaved = call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.vector.deinterleave2.v16i16(<vscale x 16 x i16> %b)
+  %a.real = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %a.deinterleaved, 0
+  %a.imag = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %a.deinterleaved, 1
+  %b.real = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %b.deinterleaved, 0
+  %b.imag = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %b.deinterleaved, 1
+  %a.real.ext = sext <vscale x 8 x i16> %a.real to <vscale x 8 x i64>
+  %a.imag.ext = sext <vscale x 8 x i16> %a.imag to <vscale x 8 x i64>
+  %b.real.ext = sext <vscale x 8 x i16> %b.real to <vscale x 8 x i64>
+  %b.imag.ext = sext <vscale x 8 x i16> %b.imag to <vscale x 8 x i64>
+  %real.mul = mul <vscale x 8 x i64> %b.real.ext, %a.real.ext
+  %real.mul.reduced = call <vscale x 2 x i64> @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64(<vscale x 2 x i64> %vec.phi, <vscale x 8 x i64> %real.mul)
+  %imag.mul = mul <vscale x 8 x i64> %b.imag.ext, %a.imag.ext
+  %partial.reduce.sub = call <vscale x 2 x i64> @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64(<vscale x 2 x i64> %real.mul.reduced, <vscale x 8 x i64> %imag.mul)
+  br i1 true, label %middle.block, label %vector.body
+
+middle.block:                                     ; preds = %vector.body
+  %0 = call i64 @llvm.vector.reduce.add.nxv2i64(<vscale x 2 x i64> %partial.reduce.sub)
+  ret i64 %0
+}
+
+define i64 @cdotp_i16_rot270(<vscale x 16 x i16> %a, <vscale x 16 x i16> %b) {
+; CHECK-SVE2-LABEL: define i64 @cdotp_i16_rot270(
+; CHECK-SVE2-SAME: <vscale x 16 x i16> [[A:%.*]], <vscale x 16 x i16> [[B:%.*]]) #[[ATTR0]] {
+; CHECK-SVE2-NEXT:  [[ENTRY:.*]]:
+; CHECK-SVE2-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK-SVE2:       [[VECTOR_BODY]]:
+; CHECK-SVE2-NEXT:    [[TMP11:%.*]] = phi <vscale x 4 x i64> [ zeroinitializer, %[[ENTRY]] ], [ [[TMP10:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-SVE2-NEXT:    [[TMP1:%.*]] = call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[A]], i64 0)
+; CHECK-SVE2-NEXT:    [[TMP2:%.*]] = call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[B]], i64 0)
+; CHECK-SVE2-NEXT:    [[TMP3:%.*]] = call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[A]], i64 8)
+; CHECK-SVE2-NEXT:    [[TMP4:%.*]] = call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[B]], i64 8)
+; CHECK-SVE2-NEXT:    [[TMP5:%.*]] = call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv4i64(<vscale x 4 x i64> [[TMP11]], i64 0)
+; CHECK-SVE2-NEXT:    [[TMP6:%.*]] = call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv4i64(<vscale x 4 x i64> [[TMP11]], i64 2)
+; CHECK-SVE2-NEXT:    [[TMP7:%.*]] = call <vscale x 2 x i64> @llvm.aarch64.sve.cdot.nxv2i64(<vscale x 2 x i64> [[TMP5]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], i32 270)
+; CHECK-SVE2-NEXT:    [[TMP8:%.*]] = call <vscale x 2 x i64> @llvm.aarch64.sve.cdot.nxv2i64(<vscale x 2 x i64> [[TMP6]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[TMP4]], i32 270)
+; CHECK-SVE2-NEXT:    [[TMP9:%.*]] = call <vscale x 4 x i64> @llvm.vector.insert.nxv4i64.nxv2i64(<vscale x 4 x i64> poison, <vscale x 2 x i64> [[TMP7]], i64 0)
+; CHECK-SVE2-NEXT:    [[TMP10]] = call <vscale x 4 x i64> @llvm.vector.insert.nxv4i64.nxv2i64(<vscale x 4 x i64> [[TMP9]], <vscale x 2 x i64> [[TMP8]], i64 2)
+; CHECK-SVE2-NEXT:    br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]]
+; CHECK-SVE2:       [[MIDDLE_BLOCK]]:
+; CHECK-SVE2-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vector.reduce.add.nxv4i64(<vscale x 4 x i64> [[TMP10]])
+; CHECK-SVE2-NEXT:    ret i64 [[TMP0]]
+;
+; CHECK-SVE-LABEL: define i64 @cdotp_i16_rot270(
+; CHECK-SVE-SAME: <vscale x 16 x i16> [[A:%.*]], <vscale x 16 x i16> [[B:%.*]]) #[[ATTR0]] {
+; CHECK-SVE-NEXT:  [[ENTRY:.*]]:
+; CHECK-SVE-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK-SVE:       [[VECTOR_BODY]]:
+; CHECK-SVE-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 2 x i64> [ zeroinitializer, %[[ENTRY]] ], [ [[PARTIAL_REDUCE_SUB:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-SVE-NEXT:    [[A_DEINTERLEAVED:%.*]] = call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.vector.deinterleave2.nxv16i16(<vscale x 16 x i16> [[A]])
+; CHECK-SVE-NEXT:    [[B_DEINTERLEAVED:%.*]] = call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.vector.deinterleave2.nxv16i16(<vscale x 16 x i16> [[B]])
+; CHECK-SVE-NEXT:    [[A_REAL:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[A_DEINTERLEAVED]], 0
+; CHECK-SVE-NEXT:    [[A_IMAG:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[A_DEINTERLEAVED]], 1
+; CHECK-SVE-NEXT:    [[B_REAL:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[B_DEINTERLEAVED]], 0
+; CHECK-SVE-NEXT:    [[B_IMAG:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[B_DEINTERLEAVED]], 1
+; CHECK-SVE-NEXT:    [[A_REAL_EXT:%.*]] = sext <vscale x 8 x i16> [[A_REAL]] to <vscale x 8 x i64>
+; CHECK-SVE-NEXT:    [[A_IMAG_EXT:%.*]] = sext <vscale x 8 x i16> [[A_IMAG]] to <vscale x 8 x i64>
+; CHECK-SVE-NEXT:    [[B_REAL_EXT:%.*]] = sext <vscale x 8 x i16> [[B_REAL]] to <vscale x 8 x i64>
+; CHECK-SVE-NEXT:    [[B_IMAG_EXT:%.*]] = sext <vscale x 8 x i16> [[B_IMAG]] to <vscale x 8 x i64>
+; CHECK-SVE-NEXT:    [[REAL_MUL:%.*]] = mul <vscale x 8 x i64> [[B_REAL_EXT]], [[A_IMAG_EXT]]
+; CHECK-SVE-NEXT:    [[REAL_MUL_NEG:%.*]] = sub <vscale x 8 x i64> zeroinitializer, [[REAL_MUL]]
+; CHECK-SVE-NEXT:    [[REAL_MUL_REDUCED:%.*]] = call <vscale x 2 x i64> @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64(<vscale x 2 x i64> [[VEC_PHI]], <vscale x 8 x i64> [[REAL_MUL_NEG]])
+; CHECK-SVE-NEXT:    [[IMAG_MUL:%.*]] = mul <vscale x 8 x i64> [[B_IMAG_EXT]], [[A_REAL_EXT]]
+; CHECK-SVE-NEXT:    [[PARTIAL_REDUCE_SUB]] = call <vscale x 2 x i64> @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64(<vscale x 2 x i64> [[REAL_MUL_REDUCED]], <vscale x 8 x i64> [[IMAG_MUL]])
+; CHECK-SVE-NEXT:    br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]]
+; CHECK-SVE:       [[MIDDLE_BLOCK]]:
+; CHECK-SVE-NEXT:    [[TMP11:%.*]] = call i64 @llvm.vector.reduce.add.nxv2i64(<vscale x 2 x i64> [[PARTIAL_REDUCE_SUB]])
+; CHECK-SVE-NEXT:    ret i64 [[TMP11]]
+;
+; CHECK-NOSVE-LABEL: define i64 @cdotp_i16_rot270(
+; CHECK-NOSVE-SAME: <vscale x 16 x i16> [[A:%.*]], <vscale x 16 x i16> [[B:%.*]]) {
+; CHECK-NOSVE-NEXT:  [[ENTRY:.*]]:
+; CHECK-NOSVE-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK-NOSVE:       [[VECTOR_BODY]]:
+; CHECK-NOSVE-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 2 x i64> [ zeroinitializer, %[[ENTRY]] ], [ [[PARTIAL_REDUCE_SUB:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NOSVE-NEXT:    [[A_DEINTERLEAVED:%.*]] = call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.vector.deinterleave2.nxv16i16(<vscale x 16 x i16> [[A]])
+; CHECK-NOSVE-NEXT:    [[B_DEINTERLEAVED:%.*]] = call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.vector.deinterleave2.nxv16i16(<vscale x 16 x i16> [[B]])
+; CHECK-NOSVE-NEXT:    [[A_REAL:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[A_DEINTERLEAVED]], 0
+; CHECK-NOSVE-NEXT:    [[A_IMAG:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[A_DEINTERLEAVED]], 1
+; CHECK-NOSVE-NEXT:    [[B_REAL:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[B_DEINTERLEAVED]], 0
+; CHECK-NOSVE-NEXT:    [[B_IMAG:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[B_DEINTERLEAVED]], 1
+; CHECK-NOSVE-NEXT:    [[A_REAL_EXT:%.*]] = sext <vscale x 8 x i16> [[A_REAL]] to <vscale x 8 x i64>
+; CHECK-NOSVE-NEXT:    [[A_IMAG_EXT:%.*]] = sext <vscale x 8 x i16> [[A_IMAG]] to <vscale x 8 x i64>
+; CHECK-NOSVE-NEXT:    [[B_REAL_EXT:%.*]] = sext <vscale x 8 x i16> [[B_REAL]] to <vscale x 8 x i64>
+; CHECK-NOSVE-NEXT:    [[B_IMAG_EXT:%.*]] = sext <vscale x 8 x i16> [[B_IMAG]] to <vscale x 8 x i64>
+; CHECK-NOSVE-NEXT:    [[REAL_MUL:%.*]] = mul <vscale x 8 x i64> [[B_REAL_EXT]], [[A_IMAG_EXT]]
+; CHECK-NOSVE-NEXT:    [[REAL_MUL_NEG:%.*]] = sub <vscale x 8 x i64> zeroinitializer, [[REAL_MUL]]
+; CHECK-NOSVE-NEXT:    [[REAL_MUL_REDUCED:%.*]] = call <vscale x 2 x i64> @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64(<vscale x 2 x i64> [[VEC_PHI]], <vscale x 8 x i64> [[REAL_MUL_NEG]])
+; CHECK-NOSVE-NEXT:    [[IMAG_MUL:%.*]] = mul <vscale x 8 x i64> [[B_IMAG_EXT]], [[A_REAL_EXT]]
+; CHECK-NOSVE-NEXT:    [[PARTIAL_REDUCE_SUB]] = call <vscale x 2 x i64> @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64(<vscale x 2 x i64> [[REAL_MUL_REDUCED]], <vscale x 8 x i64> [[IMAG_MUL]])
+; CHECK-NOSVE-NEXT:    br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]]
+; CHECK-NOSVE:       [[MIDDLE_BLOCK]]:
+; CHECK-NOSVE-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vector.reduce.add.nxv2i64(<vscale x 2 x i64> [[PARTIAL_REDUCE_SUB]])
+; CHECK-NOSVE-NEXT:    ret i64 [[TMP0]]
+;
+entry:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %entry
+  %vec.phi = phi <vscale x 2 x i64> [ zeroinitializer, %entry ], [ %partial.reduce.sub, %vector.body ]
+  %a.deinterleaved = call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.vector.deinterleave2.v16i16(<vscale x 16 x i16> %a)
+  %b.deinterleaved = call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.vector.deinterleave2.v16i16(<vscale x 16 x i16> %b)
+  %a.real = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %a.deinterleaved, 0
+  %a.imag = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %a.deinterleaved, 1
+  %b.real = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %b.deinterleaved, 0
+  %b.imag = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %b.deinterleaved, 1
+  %a.real.ext = sext <vscale x 8 x i16> %a.real to <vscale x 8 x i64>
+  %a.imag.ext = sext <vscale x 8 x i16> %a.imag to <vscale x 8 x i64>
+  %b.real.ext = sext <vscale x 8 x i16> %b.real to <vscale x 8 x i64>
+  %b.imag.ext = sext <vscale x 8 x i16> %b.imag to <vscale x 8 x i64>
+  %real.mul = mul <vscale x 8 x i64> %b.real.ext, %a.imag.ext
+  %real.mul.neg = sub <vscale x 8 x i64> zeroinitializer, %real.mul
+  %real.mul.reduced = call <vscale x 2 x i64> @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64(<vscale x 2 x i64> %vec.phi, <vscale x 8 x i64> %real.mul.neg)
+  %imag.mul = mul <vscale x 8 x i64> %b.imag.ext, %a.real.ext
+  %partial.reduce.sub = call <vscale x 2 x i64> @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64(<vscale x 2 x i64> %real.mul.reduced, <vscale x 8 x i64> %imag.mul)
+  br i1 true, label %middle.block, label %vector.body
+
+middle.block:                                     ; preds = %vector.body
+  %0 = call i64 @llvm.vector.reduce.add.nxv2i64(<vscale x 2 x i64> %partial.reduce.sub)
+  ret i64 %0
+}
+
+
+define i32 @not_cdotp(<vscale x 32 x i8> %a, <vscale x 32 x i8> %b) {
+; CHECK-SVE2-LABEL: define i32 @not_cdotp(
+; CHECK-SVE2-SAME: <vscale x 32 x i8> [[A:%.*]], <vscale x 32 x i8> [[B:%.*]]) #[[ATTR0]] {
+; CHECK-SVE2-NEXT:  [[ENTRY:.*]]:
+; CHECK-SVE2-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK-SVE2:       [[VECTOR_BODY]]:
+; CHECK-SVE2-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, %[[ENTRY]] ], [ [[PARTIAL_REDUCE_SUB:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-SVE2-NEXT:    [[A_DEINTERLEAVED:%.*]] = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> [[A]])
+; CHECK-SVE2-NEXT:    [[B_DEINTERLEAVED:%.*]] = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> [[B]])
+; CHECK-SVE2-NEXT:    [[A_REAL:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[A_DEINTERLEAVED]], 0
+; CHECK-SVE2-NEXT:    [[A_IMAG:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[A_DEINTERLEAVED]], 1
+; CHECK-SVE2-NEXT:    [[B_REAL:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[B_DEINTERLEAVED]], 0
+; CHECK-SVE2-NEXT:    [[B_IMAG:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[B_DEINTERLEAVED]], 1
+; CHECK-SVE2-NEXT:    [[A_REAL_EXT:%.*]] = sext <vscale x 16 x i8> [[A_REAL]] to <vscale x 16 x i32>
+; CHECK-SVE2-NEXT:    [[A_IMAG_EXT:%.*]] = sext <vscale x 16 x i8> [[A_IMAG]] to <vscale x 16 x i32>
+; CHECK-SVE2-NEXT:    [[B_REAL_EXT:%.*]] = sext <vscale x 16 x i8> [[B_REAL]] to <vscale x 16 x i32>
+; CHECK-SVE2-NEXT:    [[B_IMAG_EXT:%.*]] = sext <vscale x 16 x i8> [[B_IMAG]] to <vscale x 16 x i32>
+; CHECK-SVE2-NEXT:    [[REAL_MUL:%.*]] = mul <vscale x 16 x i32> [[B_REAL_EXT]], [[A_REAL_EXT]]
+; CHECK-SVE2-NEXT:    [[REAL_MUL_NEG:%.*]] = sub <vscale x 16 x i32> zeroinitializer, [[REAL_MUL]]
+; CHECK-SVE2-NEXT:    [[REAL_MUL_REDUCED:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI]], <vscale x 16 x i32> [[REAL_MUL_NEG]])
+; CHECK-SVE2-NEXT:    [[IMAG_MUL:%.*]] = mul <vscale x 16 x i32> [[B_IMAG_EXT]], [[A_IMAG_EXT]]
+; CHECK-SVE2-NEXT:    [[IMAG_MUL_NEG:%.*]] = sub <vscale x 16 x i32> zeroinitializer, [[IMAG_MUL]]
+; CHECK-SVE2-NEXT:    [[PARTIAL_REDUCE_SUB]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[REAL_MUL_REDUCED]], <vscale x 16 x i32> [[IMAG_MUL_NEG]])
+; CHECK-SVE2-NEXT:    br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]]
+; CHECK-SVE2:       [[MIDDLE_BLOCK]]:
+; CHECK-SVE2-NEXT:    [[TMP0:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[PARTIAL_REDUCE_SUB]])
+; CHECK-SVE2-NEXT:    ret i32 [[TMP0]]
+;
+; CHECK-SVE-LABEL: define i32 @not_cdotp(
+; CHECK-SVE-SAME: <vscale x 32 x i8> [[A:%.*]], <vscale x 32 x i8> [[B:%.*]]) #[[ATTR0]] {
+; CHECK-SVE-NEXT:  [[ENTRY:.*]]:
+; CHECK-SVE-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK-SVE:       [[VECTOR_BODY]]:
+; CHECK-SVE-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, %[[ENTRY]] ], [ [[PARTIAL_REDUCE_SUB:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-SVE-NEXT:    [[A_DEINTERLEAVED:%.*]] = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> [[A]])
+; CHECK-SVE-NEXT:    [[B_DEINTERLEAVED:%.*]] = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> [[B]])
+; CHECK-SVE-NEXT:    [[A_REAL:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[A_DEINTERLEAVED]], 0
+; CHECK-SVE-NEXT:    [[A_IMAG:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[A_DEINTERLEAVED]], 1
+; CHECK-SVE-NEXT:    [[B_REAL:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[B_DEINTERLEAVED]], 0
+; CHECK-SVE-NEXT:    [[B_IMAG:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[B_DEINTERLEAVED]], 1
+; CHECK-SVE-NEXT:    [[A_REAL_EXT:%.*]] = sext <vscale x 16 x i8> [[A_REAL]] to <vscale x 16 x i32>
+; CHECK-SVE-NEXT:    [[A_IMAG_EXT:%.*]] = sext <vscale x 16 x i8> [[A_IMAG]] to <vscale x 16 x i32>
+; CHECK-SVE-NEXT:    [[B_REAL_EXT:%.*]] = sext <vscale x 16 x i8> [[B_REAL]] to <vscale x 16 x i32>
+; CHECK-SVE-NEXT:    [[B_IMAG_EXT:%.*]] = sext <vscale x 16 x i8> [[B_IMAG]] to <vscale x 16 x i32>
+; CHECK-SVE-NEXT:    [[REAL_MUL:%.*]] = mul <vscale x 16 x i32> [[B_REAL_EXT]], [[A_REAL_EXT]]
+; CHECK-SVE-NEXT:    [[REAL_MUL_NEG:%.*]] = sub <vscale x 16 x i32> zeroinitializer, [[REAL_MUL]]
+; CHECK-SVE-NEXT:    [[REAL_MUL_REDUCED:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI]], <vscale x 16 x i32> [[REAL_MUL_NEG]])
+; CHECK-SVE-NEXT:    [[IMAG_MUL:%.*]] = mul <vscale x 16 x i32> [[B_IMAG_EXT]], [[A_IMAG_EXT]]
+; CHECK-SVE-NEXT:    [[IMAG_MUL_NEG:%.*]] = sub <vscale x 16 x i32> zeroinitializer, [[IMAG_MUL]]
+; CHECK-SVE-NEXT:    [[PARTIAL_REDUCE_SUB]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[REAL_MUL_REDUCED]], <vscale x 16 x i32> [[IMAG_MUL_NEG]])
+; CHECK-SVE-NEXT:    br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]]
+; CHECK-SVE:       [[MIDDLE_BLOCK]]:
+; CHECK-SVE-NEXT:    [[TMP0:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[PARTIAL_REDUCE_SUB]])
+; CHECK-SVE-NEXT:    ret i32 [[TMP0]]
+;
+; CHECK-NOSVE-LABEL: define i32 @not_cdotp(
+; CHECK-NOSVE-SAME: <vscale x 32 x i8> [[A:%.*]], <vscale x 32 x i8> [[B:%.*]]) {
+; CHECK-NOSVE-NEXT:  [[ENTRY:.*]]:
+; CHECK-NOSVE-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK-NOSVE:       [[VECTOR_BODY]]:
+; CHECK-NOSVE-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, %[[ENTRY]] ], [ [[PARTIAL_REDUCE_SUB:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NOSVE-NEXT:    [[A_DEINTERLEAVED:%.*]] = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> [[A]])
+; CHECK-NOSVE-NEXT:    [[B_DEINTERLEAVED:%.*]] = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> [[B]])
+; CHECK-NOSVE-NEXT:    [[A_REAL:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[A_DEINTERLEAVED]], 0
+; CHECK-NOSVE-NEXT:    [[A_IMAG:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[A_DEINTERLEAVED]], 1
+; CHECK-NOSVE-NEXT:    [[B_REAL:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[B_DEINTERLEAVED]], 0
+; CHECK-NOSVE-NEXT:    [[B_IMAG:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[B_DEINTERLEAVED]], 1
+; CHECK-NOSVE-NEXT:    [[A_REAL_EXT:%.*]] = sext <vscale x 16 x i8> [[A_REAL]] to <vscale x 16 x i32>
+; CHECK-NOSVE-NEXT:    [[A_IMAG_EXT:%.*]] = sext <vscale x 16 x i8> [[A_IMAG]] to <vscale x 16 x i32>
+; CHECK-NOSVE-NEXT:    [[B_REAL_EXT:%.*]] = sext <vscale x 16 x i8> [[B_REAL]] to <vscale x 16 x i32>
+; CHECK-NOSVE-NEXT:    [[B_IMAG_EXT:%.*]] = sext <vscale x 16 x i8> [[B_IMAG]] to <vscale x 16 x i32>
+; CHECK-NOSVE-NEXT:    [[REAL_MUL:%.*]] = mul <vscale x 16 x i32> [[B_REAL_EXT]], [[A_REAL_EXT]]
+; CHECK-NOSVE-NEXT:    [[REAL_MUL_NEG:%.*]] = sub <vscale x 16 x i32> zeroinitializer, [[REAL_MUL]]
+; CHECK-NOSVE-NEXT:    [[REAL_MUL_REDUCED:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI]], <vscale x 16 x i32> [[REAL_MUL_NEG]])
+; CHECK-NOSVE-NEXT:    [[IMAG_MUL:%.*]] = mul <vscale x 16 x i32> [[B_IMAG_EXT]], [[A_IMAG_EXT]]
+; CHECK-NOSVE-NEXT:    [[IMAG_MUL_NEG:%.*]] = sub <vscale x 16 x i32> zeroinitializer, [[IMAG_MUL]]
+; CHECK-NOSVE-NEXT:    [[PARTIAL_REDUCE_SUB]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[REAL_MUL_REDUCED]], <vscale x 16 x i32> [[IMAG_MUL_NEG]])
+; CHECK-NOSVE-NEXT:    br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]]
+; CHECK-NOSVE:       [[MIDDLE_BLOCK]]:
+; CHECK-NOSVE-NEXT:    [[TMP0:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[PARTIAL_REDUCE_SUB]])
+; CHECK-NOSVE-NEXT:    ret i32 [[TMP0]]
+;
+entry:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %entry
+  %vec.phi = phi <vscale x 4 x i32> [ zeroinitializer, %entry ], [ %partial.reduce.sub, %vector.body ]
+  %a.deinterleaved = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.v32i8(<vscale x 32 x i8> %a)
+  %b.deinterleaved = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.v32i8(<vscale x 32 x i8> %b)
+  %a.real = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %a.deinterleaved, 0
+  %a.imag = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %a.deinterleaved, 1
+  %b.real = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %b.deinterleaved, 0
+  %b.imag = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %b.deinterleaved, 1
+  %a.real.ext = sext <vscale x 16 x i8> %a.real to <vscale x 16 x i32>
+  %a.imag.ext = sext <vscale x 16 x i8> %a.imag to <vscale x 16 x i32>
+  %b.real.ext = sext <vscale x 16 x i8> %b.real to <vscale x 16 x i32>
+  %b.imag.ext = sext <vscale x 16 x i8> %b.imag to <vscale x 16 x i32>
+  %real.mul = mul <vscale x 16 x i32> %b.real.ext, %a.real.ext
+  %real.mul.neg = sub <vscale x 16 x i32> zeroinitializer, %real.mul
+  %real.mul.reduced = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> %vec.phi, <vscale x 16 x i32> %real.mul.neg)
+  %imag.mul = mul <vscale x 16 x i32> %b.imag.ext, %a.imag.ext
+  %imag.mul.neg = sub <vscale x 16 x i32> zeroinitializer, %imag.mul
+  %partial.reduce.sub = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> %real.mul.reduced, <vscale x 16 x i32> %imag.mul.neg)
+  br i1 true, label %middle.block, label %vector.body
+
+middle.block:                                     ; preds = %vector.body
+  %0 = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> %partial.reduce.sub)
+  ret i32 %0
+}
+
+define i16 @invalid_type(<vscale x 32 x i8> %a, <vscale x 32 x i8> %b) {
+; CHECK-SVE2-LABEL: define i16 @invalid_type(
+; CHECK-SVE2-SAME: <vscale x 32 x i8> [[A:%.*]], <vscale x 32 x i8> [[B:%.*]]) #[[ATTR0]] {
+; CHECK-SVE2-NEXT:  [[ENTRY:.*]]:
+; CHECK-SVE2-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK-SVE2:       [[VECTOR_BODY]]:
+; CHECK-SVE2-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 8 x i16> [ zeroinitializer, %[[ENTRY]] ], [ [[PARTIAL_REDUCE_SUB:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-SVE2-NEXT:    [[A_DEINTERLEAVED:%.*]] = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> [[A]])
+; CHECK-SVE2-NEXT:    [[B_DEINTERLEAVED:%.*]] = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> [[B]])
+; CHECK-SVE2-NEXT:    [[A_REAL:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[A_DEINTERLEAVED]], 0
+; CHECK-SVE2-NEXT:    [[A_IMAG:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[A_DEINTERLEAVED]], 1
+; CHECK-SVE2-NEXT:    [[B_REAL:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[B_DEINTERLEAVED]], 0
+; CHECK-SVE2-NEXT:    [[B_IMAG:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[B_DEINTERLEAVED]], 1
+; CHECK-SVE2-NEXT:    [[A_REAL_EXT:%.*]] = sext <vscale x 16 x i8> [[A_REAL]] to <vscale x 16 x i32>
+; CHECK-SVE2-NEXT:    [[A_IMAG_EXT:%.*]] = sext <vscale x 16 x i8> [[A_IMAG]] to <vscale x 16 x i32>
+; CHECK-SVE2-NEXT:    [[B_REAL_EXT:%.*]] = sext <vscale x 16 x i8> [[B_REAL]] to <vscale x 16 x i32>
+; CHECK-SVE2-NEXT:    [[B_IMAG_EXT:%.*]] = sext <vscale x 16 x i8> [[B_IMAG]] to <vscale x 16 x i32>
+; CHECK-SVE2-NEXT:    [[REAL_MUL:%.*]] = mul <vscale x 16 x i32> [[B_REAL_EXT]], [[A_REAL_EXT]]
+; CHECK-SVE2-NEXT:    [[REAL_MUL_REDUCED:%.*]] = call <vscale x 8 x i16> @llvm.experimental.vector.partial.reduce.add.nxv8i16.nxv16i32(<vscale x 8 x i16> [[VEC_PHI]], <vscale x 16 x i32> [[REAL_MUL]])
+; CHECK-SVE2-NEXT:    [[IMAG_MUL:%.*]] = mul <vscale x 16 x i32> [[B_IMAG_EXT]], [[A_IMAG_EXT]]
+; CHECK-SVE2-NEXT:    [[IMAG_MUL_NEG:%.*]] = sub <vscale x 16 x i32> zeroinitializer, [[IMAG_MUL]]
+; CHECK-SVE2-NEXT:    [[PARTIAL_REDUCE_SUB]] = call <vscale x 8 x i16> @llvm.experimental.vector.partial.reduce.add.nxv8i16.nxv16i32(<vscale x 8 x i16> [[REAL_MUL_REDUCED]], <vscale x 16 x i32> [[IMAG_MUL_NEG]])
+; CHECK-SVE2-NEXT:    br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]]
+; CHECK-SVE2:       [[MIDDLE_BLOCK]]:
+; CHECK-SVE2-NEXT:    [[TMP0:%.*]] = call i16 @llvm.vector.reduce.add.nxv8i16(<vscale x 8 x i16> [[PARTIAL_REDUCE_SUB]])
+; CHECK-SVE2-NEXT:    ret i16 [[TMP0]]
+;
+; CHECK-SVE-LABEL: define i16 @invalid_type(
+; CHECK-SVE-SAME: <vscale x 32 x i8> [[A:%.*]], <vscale x 32 x i8> [[B:%.*]]) #[[ATTR0]] {
+; CHECK-SVE-NEXT:  [[ENTRY:.*]]:
+; CHECK-SVE-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK-SVE:       [[VECTOR_BODY]]:
+; CHECK-SVE-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 8 x i16> [ zeroinitializer, %[[ENTRY]] ], [ [[PARTIAL_REDUCE_SUB:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-SVE-NEXT:    [[A_DEINTERLEAVED:%.*]] = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> [[A]])
+; CHECK-SVE-NEXT:    [[B_DEINTERLEAVED:%.*]] = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> [[B]])
+; CHECK-SVE-NEXT:    [[A_REAL:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[A_DEINTERLEAVED]], 0
+; CHECK-SVE-NEXT:    [[A_IMAG:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[A_DEINTERLEAVED]], 1
+; CHECK-SVE-NEXT:    [[B_REAL:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[B_DEINTERLEAVED]], 0
+; CHECK-SVE-NEXT:    [[B_IMAG:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[B_DEINTERLEAVED]], 1
+; CHECK-SVE-NEXT:    [[A_REAL_EXT:%.*]] = sext <vscale x 16 x i8> [[A_REAL]] to <vscale x 16 x i32>
+; CHECK-SVE-NEXT:    [[A_IMAG_EXT:%.*]] = sext <vscale x 16 x i8> [[A_IMAG]] to <vscale x 16 x i32>
+; CHECK-SVE-NEXT:    [[B_REAL_EXT:%.*]] = sext <vscale x 16 x i8> [[B_REAL]] to <vscale x 16 x i32>
+; CHECK-SVE-NEXT:    [[B_IMAG_EXT:%.*]] = sext <vscale x 16 x i8> [[B_IMAG]] to <vscale x 16 x i32>
+; CHECK-SVE-NEXT:    [[REAL_MUL:%.*]] = mul <vscale x 16 x i32> [[B_REAL_EXT]], [[A_REAL_EXT]]
+; CHECK-SVE-NEXT:    [[REAL_MUL_REDUCED:%.*]] = call <vscale x 8 x i16> @llvm.experimental.vector.partial.reduce.add.nxv8i16.nxv16i32(<vscale x 8 x i16> [[VEC_PHI]], <vscale x 16 x i32> [[REAL_MUL]])
+; CHECK-SVE-NEXT:    [[IMAG_MUL:%.*]] = mul <vscale x 16 x i32> [[B_IMAG_EXT]], [[A_IMAG_EXT]]
+; CHECK-SVE-NEXT:    [[IMAG_MUL_NEG:%.*]] = sub <vscale x 16 x i32> zeroinitializer, [[IMAG_MUL]]
+; CHECK-SVE-NEXT:    [[PARTIAL_REDUCE_SUB]] = call <vscale x 8 x i16> @llvm.experimental.vector.partial.reduce.add.nxv8i16.nxv16i32(<vscale x 8 x i16> [[REAL_MUL_REDUCED]], <vscale x 16 x i32> [[IMAG_MUL_NEG]])
+; CHECK-SVE-NEXT:    br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]]
+; CHECK-SVE:       [[MIDDLE_BLOCK]]:
+; CHECK-SVE-NEXT:    [[TMP0:%.*]] = call i16 @llvm.vector.reduce.add.nxv8i16(<vscale x 8 x i16> [[PARTIAL_REDUCE_SUB]])
+; CHECK-SVE-NEXT:    ret i16 [[TMP0]]
+;
+; CHECK-NOSVE-LABEL: define i16 @invalid_type(
+; CHECK-NOSVE-SAME: <vscale x 32 x i8> [[A:%.*]], <vscale x 32 x i8> [[B:%.*]]) {
+; CHECK-NOSVE-NEXT:  [[ENTRY:.*]]:
+; CHECK-NOSVE-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK-NOSVE:       [[VECTOR_BODY]]:
+; CHECK-NOSVE-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 8 x i16> [ zeroinitializer, %[[ENTRY]] ], [ [[PARTIAL_REDUCE_SUB:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NOSVE-NEXT:    [[A_DEINTERLEAVED:%.*]] = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> [[A]])
+; CHECK-NOSVE-NEXT:    [[B_DEINTERLEAVED:%.*]] = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> [[B]])
+; CHECK-NOSVE-NEXT:    [[A_REAL:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[A_DEINTERLEAVED]], 0
+; CHECK-NOSVE-NEXT:    [[A_IMAG:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[A_DEINTERLEAVED]], 1
+; CHECK-NOSVE-NEXT:    [[B_REAL:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[B_DEINTERLEAVED]], 0
+; CHECK-NOSVE-NEXT:    [[B_IMAG:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[B_DEINTERLEAVED]], 1
+; CHECK-NOSVE-NEXT:    [[A_REAL_EXT:%.*]] = sext <vscale x 16 x i8> [[A_REAL]] to <vscale x 16 x i32>
+; CHECK-NOSVE-NEXT:    [[A_IMAG_EXT:%.*]] = sext <vscale x 16 x i8> [[A_IMAG]] to <vscale x 16 x i32>
+; CHECK-NOSVE-NEXT:    [[B_REAL_EXT:%.*]] = sext <vscale x 16 x i8> [[B_REAL]] to <vscale x 16 x i32>
+; CHECK-NOSVE-NEXT:    [[B_IMAG_EXT:%.*]] = sext <vscale x 16 x i8> [[B_IMAG]] to <vscale x 16 x i32>
+; CHECK-NOSVE-NEXT:    [[REAL_MUL:%.*]] = mul <vscale x 16 x i32> [[B_REAL_EXT]], [[A_REAL_EXT]]
+; CHECK-NOSVE-NEXT:    [[REAL_MUL_REDUCED:%.*]] = call <vscale x 8 x i16> @llvm.experimental.vector.partial.reduce.add.nxv8i16.nxv16i32(<vscale x 8 x i16> [[VEC_PHI]], <vscale x 16 x i32> [[REAL_MUL]])
+; CHECK-NOSVE-NEXT:    [[IMAG_MUL:%.*]] = mul <vscale x 16 x i32> [[B_IMAG_EXT]], [[A_IMAG_EXT]]
+; CHECK-NOSVE-NEXT:    [[IMAG_MUL_NEG:%.*]] = sub <vscale x 16 x i32> zeroinitializer, [[IMAG_MUL]]
+; CHECK-NOSVE-NEXT:    [[PARTIAL_REDUCE_SUB]] = call <vscale x 8 x i16> @llvm.experimental.vector.partial.reduce.add.nxv8i16.nxv16i32(<vscale x 8 x i16> [[REAL_MUL_REDUCED]], <vscale x 16 x i32> [[IMAG_MUL_NEG]])
+; CHECK-NOSVE-NEXT:    br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]]
+; CHECK-NOSVE:       [[MIDDLE_BLOCK]]:
+; CHECK-NOSVE-NEXT:    [[TMP0:%.*]] = call i16 @llvm.vector.reduce.add.nxv8i16(<vscale x 8 x i16> [[PARTIAL_REDUCE_SUB]])
+; CHECK-NOSVE-NEXT:    ret i16 [[TMP0]]
+;
+entry:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %entry
+  %vec.phi = phi <vscale x 8 x i16> [ zeroinitializer, %entry ], [ %partial.reduce.sub, %vector.body ]
+  %a.deinterleaved = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.v32i8(<vscale x 32 x i8> %a)
+  %b.deinterleaved = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.v32i8(<vscale x 32 x i8> %b)
+  %a.real = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %a.deinterleaved, 0
+  %a.imag = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %a.deinterleaved, 1
+  %b.real = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %b.deinterleaved, 0
+  %b.imag = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %b.deinterleaved, 1
+  %a.real.ext = sext <vscale x 16 x i8> %a.real to <vscale x 16 x i32>
+  %a.imag.ext = sext <vscale x 16 x i8> %a.imag to <vscale x 16 x i32>
+  %b.real.ext = sext <vscale x 16 x i8> %b.real to <vscale x 16 x i32>
+  %b.imag.ext = sext <vscale x 16 x i8> %b.imag to <vscale x 16 x i32>
+  %real.mul = mul <vscale x 16 x i32> %b.real.ext, %a.real.ext
+  %real.mul.reduced = call <vscale x 8 x i16> @llvm.experimental.vector.partial.reduce.add.nxv8i16.nxv16i32(<vscale x 8 x i16> %vec.phi, <vscale x 16 x i32> %real.mul)
+  %imag.mul = mul <vscale x 16 x i32> %b.imag.ext, %a.imag.ext
+  %imag.mul.neg = sub <vscale x 16 x i32> zeroinitializer, %imag.mul
+  %partial.reduce.sub = call <vscale x 8 x i16> @llvm.experimental.vector.partial.reduce.add.nxv8i16.nxv16i32(<vscale x 8 x i16> %real.mul.reduced, <vscale x 16 x i32> %imag.mul.neg)
+  br i1 true, label %middle.block, label %vector.body
+
+middle.block:                                     ; preds = %vector.body
+  %0 = call i16 @llvm.vector.reduce.add.nxv8i16(<vscale x 8 x i16> %partial.reduce.sub)
+  ret i16 %0
+}
+
+define i32 @not_cdotp_i8_rot0_fixed_length(<32 x i8> %a, <32 x i8> %b) {
+; CHECK-SVE2-LABEL: define i32 @not_cdotp_i8_rot0_fixed_length(
+; CHECK-SVE2-SAME: <32 x i8> [[A:%.*]], <32 x i8> [[B:%.*]]) #[[ATTR0]] {
+; CHECK-SVE2-NEXT:  [[ENTRY:.*]]:
+; CHECK-SVE2-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK-SVE2:       [[VECTOR_BODY]]:
+; CHECK-SVE2-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, %[[ENTRY]] ], [ [[PARTIAL_REDUCE_SUB:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-SVE2-NEXT:    [[A_DEINTERLEAVED:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.vector.deinterleave2.v32i8(<32 x i8> [[A]])
+; CHECK-SVE2-NEXT:    [[B_DEINTERLEAVED:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.vector.deinterleave2.v32i8(<32 x i8> [[B]])
+; CHECK-SVE2-NEXT:    [[A_REAL:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[A_DEINTERLEAVED]], 0
+; CHECK-SVE2-NEXT:    [[A_IMAG:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[A_DEINTERLEAVED]], 1
+; CHECK-SVE2-NEXT:    [[B_REAL:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[B_DEINTERLEAVED]], 0
+; CHECK-SVE2-NEXT:    [[B_IMAG:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[B_DEINTERLEAVED]], 1
+; CHECK-SVE2-NEXT:    [[A_REAL_EXT:%.*]] = sext <16 x i8> [[A_REAL]] to <16 x i32>
+; CHECK-SVE2-NEXT:    [[A_IMAG_EXT:%.*]] = sext <16 x i8> [[A_IMAG]] to <16 x i32>
+; CHECK-SVE2-NEXT:    [[B_REAL_EXT:%.*]] = sext <16 x i8> [[B_REAL]] to <16 x i32>
+; CHECK-SVE2-NEXT:    [[B_IMAG_EXT:%.*]] = sext <16 x i8> [[B_IMAG]] to <16 x i32>
+; CHECK-SVE2-NEXT:    [[REAL_MUL:%.*]] = mul <16 x i32> [[B_REAL_EXT]], [[A_REAL_EXT]]
+; CHECK-SVE2-NEXT:    [[REAL_MUL_REDUCED:%.*]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[REAL_MUL]])
+; CHECK-SVE2-NEXT:    [[IMAG_MUL:%.*]] = mul <16 x i32> [[B_IMAG_EXT]], [[A_IMAG_EXT]]
+; CHECK-SVE2-NEXT:    [[IMAG_MUL_NEG:%.*]] = sub <16 x i32> zeroinitializer, [[IMAG_MUL]]
+; CHECK-SVE2-NEXT:    [[PARTIAL_REDUCE_SUB]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[REAL_MUL_REDUCED]], <16 x i32> [[IMAG_MUL_NEG]])
+; CHECK-SVE2-NEXT:    br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]]
+; CHECK-SVE2:       [[MIDDLE_BLOCK]]:
+; CHECK-SVE2-NEXT:    [[TMP0:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE_SUB]])
+; CHECK-SVE2-NEXT:    ret i32 [[TMP0]]
+;
+; CHECK-SVE-LABEL: define i32 @not_cdotp_i8_rot0_fixed_length(
+; CHECK-SVE-SAME: <32 x i8> [[A:%.*]], <32 x i8> [[B:%.*]]) #[[ATTR0]] {
+; CHECK-SVE-NEXT:  [[ENTRY:.*]]:
+; CHECK-SVE-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK-SVE:       [[VECTOR_BODY]]:
+; CHECK-SVE-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, %[[ENTRY]] ], [ [[PARTIAL_REDUCE_SUB:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-SVE-NEXT:    [[A_DEINTERLEAVED:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.vector.deinterleave2.v32i8(<32 x i8> [[A]])
+; CHECK-SVE-NEXT:    [[B_DEINTERLEAVED:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.vector.deinterleave2.v32i8(<32 x i8> [[B]])
+; CHECK-SVE-NEXT:    [[A_REAL:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[A_DEINTERLEAVED]], 0
+; CHECK-SVE-NEXT:    [[A_IMAG:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[A_DEINTERLEAVED]], 1
+; CHECK-SVE-NEXT:    [[B_REAL:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[B_DEINTERLEAVED]], 0
+; CHECK-SVE-NEXT:    [[B_IMAG:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[B_DEINTERLEAVED]], 1
+; CHECK-SVE-NEXT:    [[A_REAL_EXT:%.*]] = sext <16 x i8> [[A_REAL]] to <16 x i32>
+; CHECK-SVE-NEXT:    [[A_IMAG_EXT:%.*]] = sext <16 x i8> [[A_IMAG]] to <16 x i32>
+; CHECK-SVE-NEXT:    [[B_REAL_EXT:%.*]] = sext <16 x i8> [[B_REAL]] to <16 x i32>
+; CHECK-SVE-NEXT:    [[B_IMAG_EXT:%.*]] = sext <16 x i8> [[B_IMAG]] to <16 x i32>
+; CHECK-SVE-NEXT:    [[REAL_MUL:%.*]] = mul <16 x i32> [[B_REAL_EXT]], [[A_REAL_EXT]]
+; CHECK-SVE-NEXT:    [[REAL_MUL_REDUCED:%.*]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[REAL_MUL]])
+; CHECK-SVE-NEXT:    [[IMAG_MUL:%.*]] = mul <16 x i32> [[B_IMAG_EXT]], [[A_IMAG_EXT]]
+; CHECK-SVE-NEXT:    [[IMAG_MUL_NEG:%.*]] = sub <16 x i32> zeroinitializer, [[IMAG_MUL]]
+; CHECK-SVE-NEXT:    [[PARTIAL_REDUCE_SUB]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[REAL_MUL_REDUCED]], <16 x i32> [[IMAG_MUL_NEG]])
+; CHECK-SVE-NEXT:    br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]]
+; CHECK-SVE:       [[MIDDLE_BLOCK]]:
+; CHECK-SVE-NEXT:    [[TMP0:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE_SUB]])
+; CHECK-SVE-NEXT:    ret i32 [[TMP0]]
+;
+; CHECK-NOSVE-LABEL: define i32 @not_cdotp_i8_rot0_fixed_length(
+; CHECK-NOSVE-SAME: <32 x i8> [[A:%.*]], <32 x i8> [[B:%.*]]) {
+; CHECK-NOSVE-NEXT:  [[ENTRY:.*]]:
+; CHECK-NOSVE-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK-NOSVE:       [[VECTOR_BODY]]:
+; CHECK-NOSVE-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, %[[ENTRY]] ], [ [[PARTIAL_REDUCE_SUB:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NOSVE-NEXT:    [[A_DEINTERLEAVED:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.vector.deinterleave2.v32i8(<32 x i8> [[A]])
+; CHECK-NOSVE-NEXT:    [[B_DEINTERLEAVED:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.vector.deinterleave2.v32i8(<32 x i8> [[B]])
+; CHECK-NOSVE-NEXT:    [[A_REAL:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[A_DEINTERLEAVED]], 0
+; CHECK-NOSVE-NEXT:    [[A_IMAG:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[A_DEINTERLEAVED]], 1
+; CHECK-NOSVE-NEXT:    [[B_REAL:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[B_DEINTERLEAVED]], 0
+; CHECK-NOSVE-NEXT:    [[B_IMAG:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[B_DEINTERLEAVED]], 1
+; CHECK-NOSVE-NEXT:    [[A_REAL_EXT:%.*]] = sext <16 x i8> [[A_REAL]] to <16 x i32>
+; CHECK-NOSVE-NEXT:    [[A_IMAG_EXT:%.*]] = sext <16 x i8> [[A_IMAG]] to <16 x i32>
+; CHECK-NOSVE-NEXT:    [[B_REAL_EXT:%.*]] = sext <16 x i8> [[B_REAL]] to <16 x i32>
+; CHECK-NOSVE-NEXT:    [[B_IMAG_EXT:%.*]] = sext <16 x i8> [[B_IMAG]] to <16 x i32>
+; CHECK-NOSVE-NEXT:    [[REAL_MUL:%.*]] = mul <16 x i32> [[B_REAL_EXT]], [[A_REAL_EXT]]
+; CHECK-NOSVE-NEXT:    [[REAL_MUL_REDUCED:%.*]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[REAL_MUL]])
+; CHECK-NOSVE-NEXT:    [[IMAG_MUL:%.*]] = mul <16 x i32> [[B_IMAG_EXT]], [[A_IMAG_EXT]]
+; CHECK-NOSVE-NEXT:    [[IMAG_MUL_NEG:%.*]] = sub <16 x i32> zeroinitializer, [[IMAG_MUL]]
+; CHECK-NOSVE-NEXT:    [[PARTIAL_REDUCE_SUB]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[REAL_MUL_REDUCED]], <16 x i32> [[IMAG_MUL_NEG]])
+; CHECK-NOSVE-NEXT:    br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]]
+; CHECK-NOSVE:       [[MIDDLE_BLOCK]]:
+; CHECK-NOSVE-NEXT:    [[TMP0:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE_SUB]])
+; CHECK-NOSVE-NEXT:    ret i32 [[TMP0]]
+;
+entry:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %entry
+  %vec.phi = phi <4 x i32> [ zeroinitializer, %entry ], [ %partial.reduce.sub, %vector.body ]
+  %a.deinterleaved = call { <16 x i8>, <16 x i8> } @llvm.vector.deinterleave2.v32i8(<32 x i8> %a)
+  %b.deinterleaved = call { <16 x i8>, <16 x i8> } @llvm.vector.deinterleave2.v32i8(<32 x i8> %b)
+  %a.real = extractvalue { <16 x i8>, <16 x i8> } %a.deinterleaved, 0
+  %a.imag = extractvalue { <16 x i8>, <16 x i8> } %a.deinterleaved, 1
+  %b.real = extractvalue { <16 x i8>, <16 x i8> } %b.deinterleaved, 0
+  %b.imag = extractvalue { <16 x i8>, <16 x i8> } %b.deinterleaved, 1
+  %a.real.ext = sext <16 x i8> %a.real to <16 x i32>
+  %a.imag.ext = sext <16 x i8> %a.imag to <16 x i32>
+  %b.real.ext = sext <16 x i8> %b.real to <16 x i32>
+  %b.imag.ext = sext <16 x i8> %b.imag to <16 x i32>
+  %real.mul = mul <16 x i32> %b.real.ext, %a.real.ext
+  %real.mul.reduced = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> %vec.phi, <16 x i32> %real.mul)
+  %imag.mul = mul <16 x i32> %b.imag.ext, %a.imag.ext
+  %imag.mul.neg = sub <16 x i32> zeroinitializer, %imag.mul
+  %partial.reduce.sub = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> %real.mul.reduced, <16 x i32> %imag.mul.neg)
+  br i1 true, label %middle.block, label %vector.body
+
+middle.block:                                     ; preds = %vector.body
+  %0 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %partial.reduce.sub)
+  ret i32 %0
+}
+
+declare <vscale x 8 x i16> @llvm.experimental.vector.partial.reduce.add.nxv8i16.nxv16i32(<vscale x 8 x i16>, <vscale x 16 x i32>)
+declare <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32>, <vscale x 16 x i32>)
+declare <vscale x 2 x i64> @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i32(<vscale x 2 x i64>, <vscale x 16 x i32>)
+
+declare <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32>, <16 x i32>)
+declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>)
+
+declare i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32>)
+declare i64 @llvm.vector.reduce.add.nxv2i64(<vscale x 2 x i64>)
diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-crash.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-crash.ll
index 68cb29f..7542e9c 100644
--- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-crash.ll
+++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-crash.ll
@@ -29,3 +29,92 @@ bb193:                                            ; preds = %bb173
   store volatile i32 0, ptr null, align 4
   unreachable
 }
+
+; Check that the deinterleaving pass doesn't try to transform isolated patterns without a relevant deinterleaving pattern
+define i32 @check_deinterleaving_has_deinterleave(ptr %a) {
+; CHECK-LABEL: check_deinterleaving_has_deinterleave:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    movi v0.2d, #0000000000000000
+; CHECK-NEXT:    movi v1.4s, #1
+; CHECK-NEXT:    add x8, x0, #16
+; CHECK-NEXT:    movi v3.2d, #0000000000000000
+; CHECK-NEXT:    movi v2.2d, #0000000000000000
+; CHECK-NEXT:    mov w9, #32 // =0x20
+; CHECK-NEXT:    movi v4.2d, #0000000000000000
+; CHECK-NEXT:    movi v5.2d, #0000000000000000
+; CHECK-NEXT:    movi v7.2d, #0000000000000000
+; CHECK-NEXT:    movi v6.2d, #0000000000000000
+; CHECK-NEXT:    movi v16.2d, #0000000000000000
+; CHECK-NEXT:  .LBB1_1: // %vector.body
+; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    ldp q17, q18, [x8, #-16]
+; CHECK-NEXT:    subs x9, x9, #32
+; CHECK-NEXT:    add x8, x8, #32
+; CHECK-NEXT:    cmeq v17.16b, v17.16b, #0
+; CHECK-NEXT:    cmeq v18.16b, v18.16b, #0
+; CHECK-NEXT:    ushll2 v19.8h, v17.16b, #0
+; CHECK-NEXT:    ushll v17.8h, v17.8b, #0
+; CHECK-NEXT:    ushll2 v20.8h, v18.16b, #0
+; CHECK-NEXT:    ushll v18.8h, v18.8b, #0
+; CHECK-NEXT:    ushll v21.4s, v19.4h, #0
+; CHECK-NEXT:    ushll2 v19.4s, v19.8h, #0
+; CHECK-NEXT:    ushll v22.4s, v17.4h, #0
+; CHECK-NEXT:    ushll2 v17.4s, v17.8h, #0
+; CHECK-NEXT:    ushll2 v23.4s, v20.8h, #0
+; CHECK-NEXT:    ushll v24.4s, v18.4h, #0
+; CHECK-NEXT:    ushll2 v18.4s, v18.8h, #0
+; CHECK-NEXT:    ushll v20.4s, v20.4h, #0
+; CHECK-NEXT:    and v21.16b, v21.16b, v1.16b
+; CHECK-NEXT:    and v19.16b, v19.16b, v1.16b
+; CHECK-NEXT:    and v22.16b, v22.16b, v1.16b
+; CHECK-NEXT:    and v17.16b, v17.16b, v1.16b
+; CHECK-NEXT:    and v23.16b, v23.16b, v1.16b
+; CHECK-NEXT:    and v24.16b, v24.16b, v1.16b
+; CHECK-NEXT:    and v18.16b, v18.16b, v1.16b
+; CHECK-NEXT:    and v20.16b, v20.16b, v1.16b
+; CHECK-NEXT:    add v4.4s, v4.4s, v19.4s
+; CHECK-NEXT:    add v2.4s, v2.4s, v21.4s
+; CHECK-NEXT:    add v0.4s, v0.4s, v22.4s
+; CHECK-NEXT:    add v3.4s, v3.4s, v17.4s
+; CHECK-NEXT:    add v16.4s, v16.4s, v23.4s
+; CHECK-NEXT:    add v5.4s, v5.4s, v24.4s
+; CHECK-NEXT:    add v6.4s, v6.4s, v20.4s
+; CHECK-NEXT:    add v7.4s, v7.4s, v18.4s
+; CHECK-NEXT:    b.ne .LBB1_1
+; CHECK-NEXT:  // %bb.2: // %middle.block
+; CHECK-NEXT:    add v1.4s, v7.4s, v3.4s
+; CHECK-NEXT:    add v3.4s, v16.4s, v4.4s
+; CHECK-NEXT:    add v0.4s, v5.4s, v0.4s
+; CHECK-NEXT:    add v2.4s, v6.4s, v2.4s
+; CHECK-NEXT:    add v1.4s, v1.4s, v3.4s
+; CHECK-NEXT:    add v0.4s, v0.4s, v2.4s
+; CHECK-NEXT:    add v0.4s, v0.4s, v1.4s
+; CHECK-NEXT:    addv s0, v0.4s
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
+entry:
+    br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %entry
+  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
+  %vec.phi = phi <16 x i32> [ zeroinitializer, %entry ], [ %9, %vector.body ]
+  %vec.phi50 = phi <16 x i32> [ zeroinitializer, %entry ], [ %10, %vector.body ]
+  %next.gep = getelementptr i8, ptr %a, i64 %index
+  %4 = getelementptr i8, ptr %next.gep, i64 16
+  %wide.load = load <16 x i8>, ptr %next.gep, align 1
+  %wide.load51 = load <16 x i8>, ptr %4, align 1
+  %5 = icmp eq <16 x i8> %wide.load, zeroinitializer
+  %6 = icmp eq <16 x i8> %wide.load51, zeroinitializer
+  %7 = zext <16 x i1> %5 to <16 x i32>
+  %8 = zext <16 x i1> %6 to <16 x i32>
+  %9 = add <16 x i32> %vec.phi, %7
+  %10 = add <16 x i32> %vec.phi50, %8
+  %index.next = add nuw i64 %index, 32
+  %11 = icmp eq i64 %index.next, 32
+  br i1 %11, label %middle.block, label %vector.body
+
+middle.block:
+  %bin.rdx = add <16 x i32> %10, %9
+  %12 = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %bin.rdx)
+  ret i32 %12
+}
diff --git a/llvm/test/CodeGen/AArch64/csel-subs-swapped.ll b/llvm/test/CodeGen/AArch64/csel-subs-swapped.ll
new file mode 100644
index 0000000..7c628cf
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/csel-subs-swapped.ll
@@ -0,0 +1,322 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s
+
+define i32 @eq_i32(i32 %x) {
+; CHECK-LABEL: eq_i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w8, #-2097152 // =0xffe00000
+; CHECK-NEXT:    cmn w0, #512, lsl #12 // =2097152
+; CHECK-NEXT:    sub w8, w8, w0
+; CHECK-NEXT:    csel w0, w0, w8, eq
+; CHECK-NEXT:    ret
+  %cmp = icmp eq i32 %x, -2097152
+  %sub = sub i32 -2097152, %x
+  %retval.0 = select i1 %cmp, i32 %x, i32 %sub
+  ret i32 %retval.0
+}
+
+define i32 @ne_i32(i32 %x) {
+; CHECK-LABEL: ne_i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w8, #-2097152 // =0xffe00000
+; CHECK-NEXT:    cmn w0, #512, lsl #12 // =2097152
+; CHECK-NEXT:    sub w8, w8, w0
+; CHECK-NEXT:    csel w0, w0, w8, ne
+; CHECK-NEXT:    ret
+  %cmp = icmp ne i32 %x, -2097152
+  %sub = sub i32 -2097152, %x
+  %retval.0 = select i1 %cmp, i32 %x, i32 %sub
+  ret i32 %retval.0
+}
+
+define i32 @sgt_i32(i32 %x) {
+; CHECK-LABEL: sgt_i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w8, #-2097152 // =0xffe00000
+; CHECK-NEXT:    cmn w0, #512, lsl #12 // =2097152
+; CHECK-NEXT:    sub w8, w8, w0
+; CHECK-NEXT:    csel w0, w0, w8, gt
+; CHECK-NEXT:    ret
+  %cmp = icmp sgt i32 %x, -2097152
+  %sub = sub i32 -2097152, %x
+  %retval.0 = select i1 %cmp, i32 %x, i32 %sub
+  ret i32 %retval.0
+}
+
+define i32 @sge_i32(i32 %x) {
+; CHECK-LABEL: sge_i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w8, #-2097152 // =0xffe00000
+; CHECK-NEXT:    mov w9, #-2097153 // =0xffdfffff
+; CHECK-NEXT:    sub w8, w8, w0
+; CHECK-NEXT:    cmp w0, w9
+; CHECK-NEXT:    csel w0, w0, w8, gt
+; CHECK-NEXT:    ret
+  %cmp = icmp sge i32 %x, -2097152
+  %sub = sub i32 -2097152, %x
+  %retval.0 = select i1 %cmp, i32 %x, i32 %sub
+  ret i32 %retval.0
+}
+
+define i32 @slt_i32(i32 %x) {
+; CHECK-LABEL: slt_i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w8, #-2097152 // =0xffe00000
+; CHECK-NEXT:    cmn w0, #512, lsl #12 // =2097152
+; CHECK-NEXT:    sub w8, w8, w0
+; CHECK-NEXT:    csel w0, w0, w8, lt
+; CHECK-NEXT:    ret
+  %cmp = icmp slt i32 %x, -2097152
+  %sub = sub i32 -2097152, %x
+  %retval.0 = select i1 %cmp, i32 %x, i32 %sub
+  ret i32 %retval.0
+}
+
+define i32 @sle_i32(i32 %x) {
+; CHECK-LABEL: sle_i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w8, #-2097152 // =0xffe00000
+; CHECK-NEXT:    mov w9, #-2097151 // =0xffe00001
+; CHECK-NEXT:    sub w8, w8, w0
+; CHECK-NEXT:    cmp w0, w9
+; CHECK-NEXT:    csel w0, w0, w8, lt
+; CHECK-NEXT:    ret
+  %cmp = icmp sle i32 %x, -2097152
+  %sub = sub i32 -2097152, %x
+  %retval.0 = select i1 %cmp, i32 %x, i32 %sub
+  ret i32 %retval.0
+}
+
+define i32 @ugt_i32(i32 %x) {
+; CHECK-LABEL: ugt_i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w8, #-2097152 // =0xffe00000
+; CHECK-NEXT:    cmn w0, #512, lsl #12 // =2097152
+; CHECK-NEXT:    sub w8, w8, w0
+; CHECK-NEXT:    csel w0, w0, w8, hi
+; CHECK-NEXT:    ret
+  %cmp = icmp ugt i32 %x, -2097152
+  %sub = sub i32 -2097152, %x
+  %retval.0 = select i1 %cmp, i32 %x, i32 %sub
+  ret i32 %retval.0
+}
+
+define i32 @uge_i32(i32 %x) {
+; CHECK-LABEL: uge_i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    lsr w9, w0, #21
+; CHECK-NEXT:    mov w8, #-2097152 // =0xffe00000
+; CHECK-NEXT:    sub w8, w8, w0
+; CHECK-NEXT:    cmp w9, #2046
+; CHECK-NEXT:    csel w0, w0, w8, hi
+; CHECK-NEXT:    ret
+  %cmp = icmp uge i32 %x, -2097152
+  %sub = sub i32 -2097152, %x
+  %retval.0 = select i1 %cmp, i32 %x, i32 %sub
+  ret i32 %retval.0
+}
+
+define i32 @ult_i32(i32 %x) {
+; CHECK-LABEL: ult_i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w8, #-2097152 // =0xffe00000
+; CHECK-NEXT:    cmn w0, #512, lsl #12 // =2097152
+; CHECK-NEXT:    sub w8, w8, w0
+; CHECK-NEXT:    csel w0, w0, w8, lo
+; CHECK-NEXT:    ret
+  %cmp = icmp ult i32 %x, -2097152
+  %sub = sub i32 -2097152, %x
+  %retval.0 = select i1 %cmp, i32 %x, i32 %sub
+  ret i32 %retval.0
+}
+
+define i32 @ule_i32(i32 %x) {
+; CHECK-LABEL: ule_i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w8, #-2097152 // =0xffe00000
+; CHECK-NEXT:    mov w9, #-2097151 // =0xffe00001
+; CHECK-NEXT:    sub w8, w8, w0
+; CHECK-NEXT:    cmp w0, w9
+; CHECK-NEXT:    csel w0, w0, w8, lo
+; CHECK-NEXT:    ret
+  %cmp = icmp ule i32 %x, -2097152
+  %sub = sub i32 -2097152, %x
+  %retval.0 = select i1 %cmp, i32 %x, i32 %sub
+  ret i32 %retval.0
+}
+
+
+define i64 @eq_i64(i64 %x) {
+; CHECK-LABEL: eq_i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w8, #100 // =0x64
+; CHECK-NEXT:    cmp x0, #100
+; CHECK-NEXT:    sub x8, x8, x0
+; CHECK-NEXT:    csel x0, x0, x8, eq
+; CHECK-NEXT:    ret
+  %cmp = icmp eq i64 %x, 100
+  %sub = sub i64 100, %x
+  %retval.0 = select i1 %cmp, i64 %x, i64 %sub
+  ret i64 %retval.0
+}
+
+define i64 @ne_i64(i64 %x) {
+; CHECK-LABEL: ne_i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w8, #100 // =0x64
+; CHECK-NEXT:    cmp x0, #100
+; CHECK-NEXT:    sub x8, x8, x0
+; CHECK-NEXT:    csel x0, x0, x8, ne
+; CHECK-NEXT:    ret
+  %cmp = icmp ne i64 %x, 100
+  %sub = sub i64 100, %x
+  %retval.0 = select i1 %cmp, i64 %x, i64 %sub
+  ret i64 %retval.0
+}
+
+define i64 @sgt_i64(i64 %x) {
+; CHECK-LABEL: sgt_i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w8, #100 // =0x64
+; CHECK-NEXT:    cmp x0, #100
+; CHECK-NEXT:    sub x8, x8, x0
+; CHECK-NEXT:    csel x0, x0, x8, gt
+; CHECK-NEXT:    ret
+  %cmp = icmp sgt i64 %x, 100
+  %sub = sub i64 100, %x
+  %retval.0 = select i1 %cmp, i64 %x, i64 %sub
+  ret i64 %retval.0
+}
+
+define i64 @sge_i64(i64 %x) {
+; CHECK-LABEL: sge_i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w8, #100 // =0x64
+; CHECK-NEXT:    cmp x0, #99
+; CHECK-NEXT:    sub x8, x8, x0
+; CHECK-NEXT:    csel x0, x0, x8, gt
+; CHECK-NEXT:    ret
+  %cmp = icmp sge i64 %x, 100
+  %sub = sub i64 100, %x
+  %retval.0 = select i1 %cmp, i64 %x, i64 %sub
+  ret i64 %retval.0
+}
+
+define i64 @slt_i64(i64 %x) {
+; CHECK-LABEL: slt_i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w8, #100 // =0x64
+; CHECK-NEXT:    cmp x0, #100
+; CHECK-NEXT:    sub x8, x8, x0
+; CHECK-NEXT:    csel x0, x0, x8, lt
+; CHECK-NEXT:    ret
+  %cmp = icmp slt i64 %x, 100
+  %sub = sub i64 100, %x
+  %retval.0 = select i1 %cmp, i64 %x, i64 %sub
+  ret i64 %retval.0
+}
+
+define i64 @sle_i64(i64 %x) {
+; CHECK-LABEL: sle_i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w8, #100 // =0x64
+; CHECK-NEXT:    cmp x0, #101
+; CHECK-NEXT:    sub x8, x8, x0
+; CHECK-NEXT:    csel x0, x0, x8, lt
+; CHECK-NEXT:    ret
+  %cmp = icmp sle i64 %x, 100
+  %sub = sub i64 100, %x
+  %retval.0 = select i1 %cmp, i64 %x, i64 %sub
+  ret i64 %retval.0
+}
+
+define i64 @ugt_i64(i64 %x) {
+; CHECK-LABEL: ugt_i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w8, #100 // =0x64
+; CHECK-NEXT:    cmp x0, #100
+; CHECK-NEXT:    sub x8, x8, x0
+; CHECK-NEXT:    csel x0, x0, x8, hi
+; CHECK-NEXT:    ret
+  %cmp = icmp ugt i64 %x, 100
+  %sub = sub i64 100, %x
+  %retval.0 = select i1 %cmp, i64 %x, i64 %sub
+  ret i64 %retval.0
+}
+
+define i64 @uge_i64(i64 %x) {
+; CHECK-LABEL: uge_i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w8, #100 // =0x64
+; CHECK-NEXT:    cmp x0, #99
+; CHECK-NEXT:    sub x8, x8, x0
+; CHECK-NEXT:    csel x0, x0, x8, hi
+; CHECK-NEXT:    ret
+  %cmp = icmp uge i64 %x, 100
+  %sub = sub i64 100, %x
+  %retval.0 = select i1 %cmp, i64 %x, i64 %sub
+  ret i64 %retval.0
+}
+
+define i64 @ult_i64(i64 %x) {
+; CHECK-LABEL: ult_i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w8, #100 // =0x64
+; CHECK-NEXT:    cmp x0, #100
+; CHECK-NEXT:    sub x8, x8, x0
+; CHECK-NEXT:    csel x0, x0, x8, lo
+; CHECK-NEXT:    ret
+  %cmp = icmp ult i64 %x, 100
+  %sub = sub i64 100, %x
+  %retval.0 = select i1 %cmp, i64 %x, i64 %sub
+  ret i64 %retval.0
+}
+
+define i64 @ule_i64(i64 %x) {
+; CHECK-LABEL: ule_i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w8, #100 // =0x64
+; CHECK-NEXT:    cmp x0, #101
+; CHECK-NEXT:    sub x8, x8, x0
+; CHECK-NEXT:    csel x0, x0, x8, lo
+; CHECK-NEXT:    ret
+  %cmp = icmp ule i64 %x, 100
+  %sub = sub i64 100, %x
+  %retval.0 = select i1 %cmp, i64 %x, i64 %sub
+  ret i64 %retval.0
+}
+
+
+define i64 @both(i64 %x) {
+; CHECK-LABEL: both:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w8, #100 // =0x64
+; CHECK-NEXT:    sub x9, x0, #100
+; CHECK-NEXT:    cmp x0, #101
+; CHECK-NEXT:    sub x8, x8, x0
+; CHECK-NEXT:    csel x0, x8, x9, lo
+; CHECK-NEXT:    ret
+  %cmp = icmp ule i64 %x, 100
+  %sub1 = sub i64 100, %x
+  %sub2 = sub i64 %x, 100
+  %retval.0 = select i1 %cmp, i64 %sub1, i64 %sub2
+  ret i64 %retval.0
+}
+
+define i32 @qabs(i32 %0) {
+; CHECK-LABEL: qabs:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w8, #-2147483648 // =0x80000000
+; CHECK-NEXT:    cmp w0, w8
+; CHECK-NEXT:    mov w8, #2147483647 // =0x7fffffff
+; CHECK-NEXT:    csneg w8, w8, w0, eq
+; CHECK-NEXT:    cmp w0, #0
+; CHECK-NEXT:    csel w0, w0, w8, gt
+; CHECK-NEXT:    ret
+  %cmp1 = icmp sgt i32 %0, 0
+  %cmp2 = icmp eq i32 %0, -2147483648
+  %sub = sub nsw i32 0, %0
+  %cond = select i1 %cmp2, i32 2147483647, i32 %sub
+  %cond6 = select i1 %cmp1, i32 %0, i32 %cond
+  ret i32 %cond6
+}
diff --git a/llvm/test/CodeGen/AArch64/cvt-fp-int-fp.ll b/llvm/test/CodeGen/AArch64/cvt-fp-int-fp.ll
index 40684b0..e3263252 100644
--- a/llvm/test/CodeGen/AArch64/cvt-fp-int-fp.ll
+++ b/llvm/test/CodeGen/AArch64/cvt-fp-int-fp.ll
@@ -76,11 +76,9 @@ entry:
 define bfloat @t7(bfloat %x)  {
 ; CHECK-LABEL: t7:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-NEXT:    fmov w9, s0
+; CHECK-NEXT:    // kill: def $h0 killed $h0 def $d0
 ; CHECK-NEXT:    mov w8, #32767 // =0x7fff
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    fmov s0, w9
+; CHECK-NEXT:    shll v0.4s, v0.4h, #16
 ; CHECK-NEXT:    fcvtzs w9, s0
 ; CHECK-NEXT:    scvtf d0, w9
 ; CHECK-NEXT:    fcvtxn s0, d0
@@ -101,11 +99,9 @@ entry:
 define bfloat @t8(bfloat %x)  {
 ; CHECK-LABEL: t8:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-NEXT:    fmov w9, s0
+; CHECK-NEXT:    // kill: def $h0 killed $h0 def $d0
 ; CHECK-NEXT:    mov w8, #32767 // =0x7fff
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    fmov s0, w9
+; CHECK-NEXT:    shll v0.4s, v0.4h, #16
 ; CHECK-NEXT:    fcvtzu w9, s0
 ; CHECK-NEXT:    ucvtf d0, w9
 ; CHECK-NEXT:    fcvtxn s0, d0
@@ -198,11 +194,9 @@ entry:
 define bfloat @t7_strict(bfloat %x) #0 {
 ; CHECK-LABEL: t7_strict:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-NEXT:    fmov w9, s0
+; CHECK-NEXT:    // kill: def $h0 killed $h0 def $d0
 ; CHECK-NEXT:    mov w8, #32767 // =0x7fff
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    fmov s0, w9
+; CHECK-NEXT:    shll v0.4s, v0.4h, #16
 ; CHECK-NEXT:    fcvtzs w9, s0
 ; CHECK-NEXT:    scvtf d0, w9
 ; CHECK-NEXT:    fcvtxn s0, d0
@@ -223,11 +217,9 @@ entry:
 define bfloat @t8_strict(bfloat %x) #0 {
 ; CHECK-LABEL: t8_strict:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-NEXT:    fmov w9, s0
+; CHECK-NEXT:    // kill: def $h0 killed $h0 def $d0
 ; CHECK-NEXT:    mov w8, #32767 // =0x7fff
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    fmov s0, w9
+; CHECK-NEXT:    shll v0.4s, v0.4h, #16
 ; CHECK-NEXT:    fcvtzu w9, s0
 ; CHECK-NEXT:    ucvtf d0, w9
 ; CHECK-NEXT:    fcvtxn s0, d0
diff --git a/llvm/test/CodeGen/AArch64/hwasan-zero-ptr.ll b/llvm/test/CodeGen/AArch64/hwasan-zero-ptr.ll
new file mode 100644
index 0000000..dca39fe
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/hwasan-zero-ptr.ll
@@ -0,0 +1,65 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -filetype asm -o - %s | FileCheck %s
+
+; This shows that when dereferencing a null pointer, HWASan will call
+; __hwasan_check_x4294967071_19_fixed_0_short_v2
+; (N.B. 4294967071 == 2**32 - 239 + 14 == 2**32 - X0 + XZR
+;
+; The source was generated from llvm/test/Instrumentation/HWAddressSanitizer/zero-ptr.ll.
+
+; ModuleID = '<stdin>'
+source_filename = "<stdin>"
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64--linux-android10000"
+
+$hwasan.module_ctor = comdat any
+
+@__start_hwasan_globals = external hidden constant [0 x i8]
+@__stop_hwasan_globals = external hidden constant [0 x i8]
+@hwasan.note = private constant { i32, i32, i32, [8 x i8], i32, i32 } { i32 8, i32 8, i32 3, [8 x i8] c"LLVM\00\00\00\00", i32 trunc (i64 sub (i64 ptrtoint (ptr @__start_hwasan_globals to i64), i64 ptrtoint (ptr @hwasan.note to i64)) to i32), i32 trunc (i64 sub (i64 ptrtoint (ptr @__stop_hwasan_globals to i64), i64 ptrtoint (ptr @hwasan.note to i64)) to i32) }, section ".note.hwasan.globals", comdat($hwasan.module_ctor), align 4
+
+; Function Attrs: sanitize_hwaddress
+define void @test_store_to_zeroptr() #0 {
+; CHECK-LABEL: test_store_to_zeroptr:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    .cfi_offset w30, -16
+; CHECK-NEXT:    bl __hwasan_check_x4294967071_19_fixed_0_short_v2
+; CHECK-NEXT:    mov x8, xzr
+; CHECK-NEXT:    mov w9, #42 // =0x2a
+; CHECK-NEXT:    str x9, [x8]
+; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+entry:
+  %.hwasan.shadow = call ptr asm "", "=r,0"(ptr null)
+  %b = inttoptr i64 0 to ptr
+  call void @llvm.hwasan.check.memaccess.shortgranules.fixedshadow(ptr %b, i32 19, i64 0)
+  store i64 42, ptr %b, align 8
+  ret void
+}
+
+; Function Attrs: nounwind
+declare void @llvm.hwasan.check.memaccess.shortgranules.fixedshadow(ptr, i32 immarg, i64 immarg) #1
+
+attributes #0 = { sanitize_hwaddress }
+attributes #1 = { nounwind }
+
+declare void @__hwasan_init()
+
+; Function Attrs: nounwind
+define internal void @hwasan.module_ctor() #1 comdat {
+; CHECK-LABEL: hwasan.module_ctor:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    bl __hwasan_init
+; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+  call void @__hwasan_init()
+  ret void
+}
+
+!llvm.module.flags = !{!1}
+
+!0 = !{ptr @hwasan.note}
+!1 = !{i32 4, !"nosanitize_hwaddress", i32 1}
diff --git a/llvm/test/CodeGen/AArch64/machine-combiner.ll b/llvm/test/CodeGen/AArch64/machine-combiner.ll
index 70a6388..c8df283 100644
--- a/llvm/test/CodeGen/AArch64/machine-combiner.ll
+++ b/llvm/test/CodeGen/AArch64/machine-combiner.ll
@@ -262,8 +262,8 @@ define half @reassociate_adds_half(half %x0, half %x1, half %x2, half %x3) {
 ; CHECK-UNSAFE-LABEL: reassociate_adds_half:
 ; CHECK-UNSAFE:       // %bb.0:
 ; CHECK-UNSAFE-NEXT:    fdiv h0, h0, h1
-; CHECK-UNSAFE-NEXT:    fadd h1, h3, h2
-; CHECK-UNSAFE-NEXT:    fadd h0, h1, h0
+; CHECK-UNSAFE-NEXT:    fadd h2, h3, h2
+; CHECK-UNSAFE-NEXT:    fadd h0, h2, h0
 ; CHECK-UNSAFE-NEXT:    ret
   %t0 = fdiv half %x0, %x1
   %t1 = fadd half %x2, %t0
@@ -284,8 +284,8 @@ define half @reassociate_muls_half(half %x0, half %x1, half %x2, half %x3) {
 ; CHECK-UNSAFE-LABEL: reassociate_muls_half:
 ; CHECK-UNSAFE:       // %bb.0:
 ; CHECK-UNSAFE-NEXT:    fdiv h0, h0, h1
-; CHECK-UNSAFE-NEXT:    fmul h1, h3, h2
-; CHECK-UNSAFE-NEXT:    fmul h0, h1, h0
+; CHECK-UNSAFE-NEXT:    fmul h2, h3, h2
+; CHECK-UNSAFE-NEXT:    fmul h0, h2, h0
 ; CHECK-UNSAFE-NEXT:    ret
   %t0 = fdiv half %x0, %x1
   %t1 = fmul half %x2, %t0
diff --git a/llvm/test/CodeGen/AArch64/machine-licm-hoist-load.ll b/llvm/test/CodeGen/AArch64/machine-licm-hoist-load.ll
index 17f8263..a32c53a 100644
--- a/llvm/test/CodeGen/AArch64/machine-licm-hoist-load.ll
+++ b/llvm/test/CodeGen/AArch64/machine-licm-hoist-load.ll
@@ -313,9 +313,8 @@ define void @one_dimensional_with_store(ptr %a, ptr %b, ptr %c, i32 %N) {
 ; CHECK-NEXT:    rev w9, w9
 ; CHECK-NEXT:    cmp w9, w10
 ; CHECK-NEXT:    cset w9, hi
-; CHECK-NEXT:    cset w10, lo
+; CHECK-NEXT:    csinv w9, w9, wzr, hs
 ; CHECK-NEXT:    subs x8, x8, #1
-; CHECK-NEXT:    sub w9, w9, w10
 ; CHECK-NEXT:    strb w9, [x2], #1
 ; CHECK-NEXT:    b.ne .LBB4_1
 ; CHECK-NEXT:  // %bb.2: // %for.exit
diff --git a/llvm/test/CodeGen/AArch64/machine-outliner-retaddr-sign-cfi.ll b/llvm/test/CodeGen/AArch64/machine-outliner-retaddr-sign-cfi.ll
index 4bbbe40..e7de540 100644
--- a/llvm/test/CodeGen/AArch64/machine-outliner-retaddr-sign-cfi.ll
+++ b/llvm/test/CodeGen/AArch64/machine-outliner-retaddr-sign-cfi.ll
@@ -9,9 +9,9 @@ define void @a() "sign-return-address"="all" "sign-return-address-key"="b_key" {
 ; CHECK-LABEL:         a:                     // @a
 ; CHECK:               // %bb.0:
 ; CHECK-NEXT:          .cfi_b_key_frame
+; CHECK-NEXT:          .cfi_negate_ra_state
 ; V8A-NEXT:            hint #27
 ; V83A-NEXT:           pacibsp
-; CHECK-NEXT:          .cfi_negate_ra_state
   %1 = alloca i32, align 4
   %2 = alloca i32, align 4
   %3 = alloca i32, align 4
diff --git a/llvm/test/CodeGen/AArch64/machine-outliner-retaddr-sign-diff-scope-same-key.ll b/llvm/test/CodeGen/AArch64/machine-outliner-retaddr-sign-diff-scope-same-key.ll
index 6a11bef..a26dda1 100644
--- a/llvm/test/CodeGen/AArch64/machine-outliner-retaddr-sign-diff-scope-same-key.ll
+++ b/llvm/test/CodeGen/AArch64/machine-outliner-retaddr-sign-diff-scope-same-key.ll
@@ -5,9 +5,9 @@
 
 define void @a() "sign-return-address"="all" {
 ; CHECK-LABEL:      a:                                     // @a
-; V8A:              hint #25
-; V83A:             paciasp
-; CHECK-NEXT:      .cfi_negate_ra_state
+; CHECK:      .cfi_negate_ra_state
+; V8A-NEXT:              hint #25
+; V83A-NEXT:             paciasp
   %1 = alloca i32, align 4
   %2 = alloca i32, align 4
   %3 = alloca i32, align 4
@@ -52,9 +52,9 @@ define void @b() "sign-return-address"="non-leaf" {
 
 define void @c() "sign-return-address"="all" {
 ; CHECK-LABEL:         c:              // @c
-; V8A:                 hint #25
-; V83A:                paciasp
-; CHECK-NEXT          .cfi_negate_ra_state
+; CHECK:      .cfi_negate_ra_state
+; V8A-NEXT:              hint #25
+; V83A-NEXT:             paciasp
   %1 = alloca i32, align 4
   %2 = alloca i32, align 4
   %3 = alloca i32, align 4
diff --git a/llvm/test/CodeGen/AArch64/machine-outliner-retaddr-sign-non-leaf.ll b/llvm/test/CodeGen/AArch64/machine-outliner-retaddr-sign-non-leaf.ll
index 1e72246..064b2b7 100644
--- a/llvm/test/CodeGen/AArch64/machine-outliner-retaddr-sign-non-leaf.ll
+++ b/llvm/test/CodeGen/AArch64/machine-outliner-retaddr-sign-non-leaf.ll
@@ -8,8 +8,8 @@ define i64 @a(i64 %x) "sign-return-address"="non-leaf" "sign-return-address-key"
 ; V8A-LABEL: a:
 ; V8A:       // %bb.0:
 ; V8A-NEXT:    .cfi_b_key_frame
-; V8A-NEXT:    hint #27
 ; V8A-NEXT:    .cfi_negate_ra_state
+; V8A-NEXT:    hint #27
 ; V8A-NEXT:    sub sp, sp, #32
 ; V8A-NEXT:    str x30, [sp, #16] // 8-byte Folded Spill
 ; V8A-NEXT:    .cfi_def_cfa_offset 32
@@ -26,8 +26,8 @@ define i64 @a(i64 %x) "sign-return-address"="non-leaf" "sign-return-address-key"
 ; V83A-LABEL: a:
 ; V83A:       // %bb.0:
 ; V83A-NEXT:    .cfi_b_key_frame
-; V83A-NEXT:    pacibsp
 ; V83A-NEXT:    .cfi_negate_ra_state
+; V83A-NEXT:    pacibsp
 ; V83A-NEXT:    sub sp, sp, #32
 ; V83A-NEXT:    str x30, [sp, #16] // 8-byte Folded Spill
 ; V83A-NEXT:    .cfi_def_cfa_offset 32
@@ -59,8 +59,8 @@ define i64 @b(i64 %x) "sign-return-address"="non-leaf" "sign-return-address-key"
 ; V8A-LABEL: b:
 ; V8A:       // %bb.0:
 ; V8A-NEXT:    .cfi_b_key_frame
-; V8A-NEXT:    hint #27
 ; V8A-NEXT:    .cfi_negate_ra_state
+; V8A-NEXT:    hint #27
 ; V8A-NEXT:    sub sp, sp, #32
 ; V8A-NEXT:    str x30, [sp, #16] // 8-byte Folded Spill
 ; V8A-NEXT:    .cfi_def_cfa_offset 32
@@ -77,8 +77,8 @@ define i64 @b(i64 %x) "sign-return-address"="non-leaf" "sign-return-address-key"
 ; V83A-LABEL: b:
 ; V83A:       // %bb.0:
 ; V83A-NEXT:    .cfi_b_key_frame
-; V83A-NEXT:    pacibsp
 ; V83A-NEXT:    .cfi_negate_ra_state
+; V83A-NEXT:    pacibsp
 ; V83A-NEXT:    sub sp, sp, #32
 ; V83A-NEXT:    str x30, [sp, #16] // 8-byte Folded Spill
 ; V83A-NEXT:    .cfi_def_cfa_offset 32
@@ -110,8 +110,8 @@ define i64 @c(i64 %x) "sign-return-address"="non-leaf" "sign-return-address-key"
 ; V8A-LABEL: c:
 ; V8A:       // %bb.0:
 ; V8A-NEXT:    .cfi_b_key_frame
-; V8A-NEXT:    hint #27
 ; V8A-NEXT:    .cfi_negate_ra_state
+; V8A-NEXT:    hint #27
 ; V8A-NEXT:    sub sp, sp, #32
 ; V8A-NEXT:    str x30, [sp, #16] // 8-byte Folded Spill
 ; V8A-NEXT:    .cfi_def_cfa_offset 32
@@ -128,8 +128,8 @@ define i64 @c(i64 %x) "sign-return-address"="non-leaf" "sign-return-address-key"
 ; V83A-LABEL: c:
 ; V83A:       // %bb.0:
 ; V83A-NEXT:    .cfi_b_key_frame
-; V83A-NEXT:    pacibsp
 ; V83A-NEXT:    .cfi_negate_ra_state
+; V83A-NEXT:    pacibsp
 ; V83A-NEXT:    sub sp, sp, #32
 ; V83A-NEXT:    str x30, [sp, #16] // 8-byte Folded Spill
 ; V83A-NEXT:    .cfi_def_cfa_offset 32
diff --git a/llvm/test/CodeGen/AArch64/machine-outliner-retaddr-sign-regsave.mir b/llvm/test/CodeGen/AArch64/machine-outliner-retaddr-sign-regsave.mir
index 9a983cb..218ee66 100644
--- a/llvm/test/CodeGen/AArch64/machine-outliner-retaddr-sign-regsave.mir
+++ b/llvm/test/CodeGen/AArch64/machine-outliner-retaddr-sign-regsave.mir
@@ -81,8 +81,8 @@ body:             |
 # CHECK:         name:            bar
 # CHECK:          bb.0:
 # CHECK:            frame-setup EMITBKEY
-# CHECK-NEXT:       frame-setup PACIBSP implicit-def $lr, implicit $lr, implicit $sp
 # CHECK-NEXT:       frame-setup CFI_INSTRUCTION negate_ra_sign_state
+# CHECK-NEXT:       frame-setup PACIBSP implicit-def $lr, implicit $lr, implicit $sp
 # CHECK-NOT:        OUTLINED_FUNCTION_
 # CHECK:          bb.1:
 # CHECK-NOT:        OUTLINED_FUNCTION_
diff --git a/llvm/test/CodeGen/AArch64/machine-outliner-retaddr-sign-same-scope-diff-key.ll b/llvm/test/CodeGen/AArch64/machine-outliner-retaddr-sign-same-scope-diff-key.ll
index 87771f5..5c45373 100644
--- a/llvm/test/CodeGen/AArch64/machine-outliner-retaddr-sign-same-scope-diff-key.ll
+++ b/llvm/test/CodeGen/AArch64/machine-outliner-retaddr-sign-same-scope-diff-key.ll
@@ -7,8 +7,8 @@
 define void @a() "sign-return-address"="all" {
 ; V8A-LABEL: a:
 ; V8A:       // %bb.0:
-; V8A-NEXT:    hint #25
 ; V8A-NEXT:    .cfi_negate_ra_state
+; V8A-NEXT:    hint #25
 ; V8A-NEXT:    sub sp, sp, #32
 ; V8A-NEXT:    .cfi_def_cfa_offset 32
 ; V8A-NEXT:    mov w8, #1 // =0x1
@@ -26,8 +26,8 @@ define void @a() "sign-return-address"="all" {
 ;
 ; V83A-LABEL: a:
 ; V83A:       // %bb.0:
-; V83A-NEXT:    paciasp
 ; V83A-NEXT:    .cfi_negate_ra_state
+; V83A-NEXT:    paciasp
 ; V83A-NEXT:    sub sp, sp, #32
 ; V83A-NEXT:    .cfi_def_cfa_offset 32
 ; V83A-NEXT:    mov w8, #1 // =0x1
@@ -60,8 +60,8 @@ define void @b() "sign-return-address"="all" "sign-return-address-key"="b_key" {
 ; V8A-LABEL: b:
 ; V8A:       // %bb.0:
 ; V8A-NEXT:    .cfi_b_key_frame
-; V8A-NEXT:    hint #27
 ; V8A-NEXT:    .cfi_negate_ra_state
+; V8A-NEXT:    hint #27
 ; V8A-NEXT:    sub sp, sp, #32
 ; V8A-NEXT:    .cfi_def_cfa_offset 32
 ; V8A-NEXT:    mov w8, #1 // =0x1
@@ -80,8 +80,8 @@ define void @b() "sign-return-address"="all" "sign-return-address-key"="b_key" {
 ; V83A-LABEL: b:
 ; V83A:       // %bb.0:
 ; V83A-NEXT:    .cfi_b_key_frame
-; V83A-NEXT:    pacibsp
 ; V83A-NEXT:    .cfi_negate_ra_state
+; V83A-NEXT:    pacibsp
 ; V83A-NEXT:    sub sp, sp, #32
 ; V83A-NEXT:    .cfi_def_cfa_offset 32
 ; V83A-NEXT:    mov w8, #1 // =0x1
@@ -113,8 +113,8 @@ define void @b() "sign-return-address"="all" "sign-return-address-key"="b_key" {
 define void @c() "sign-return-address"="all" {
 ; V8A-LABEL: c:
 ; V8A:       // %bb.0:
-; V8A-NEXT:    hint #25
 ; V8A-NEXT:    .cfi_negate_ra_state
+; V8A-NEXT:    hint #25
 ; V8A-NEXT:    sub sp, sp, #32
 ; V8A-NEXT:    .cfi_def_cfa_offset 32
 ; V8A-NEXT:    mov w8, #1 // =0x1
@@ -132,8 +132,8 @@ define void @c() "sign-return-address"="all" {
 ;
 ; V83A-LABEL: c:
 ; V83A:       // %bb.0:
-; V83A-NEXT:    paciasp
 ; V83A-NEXT:    .cfi_negate_ra_state
+; V83A-NEXT:    paciasp
 ; V83A-NEXT:    sub sp, sp, #32
 ; V83A-NEXT:    .cfi_def_cfa_offset 32
 ; V83A-NEXT:    mov w8, #1 // =0x1
diff --git a/llvm/test/CodeGen/AArch64/machine-outliner-retaddr-sign-sp-mod.mir b/llvm/test/CodeGen/AArch64/machine-outliner-retaddr-sign-sp-mod.mir
index 22e5ede..d4a4b88 100644
--- a/llvm/test/CodeGen/AArch64/machine-outliner-retaddr-sign-sp-mod.mir
+++ b/llvm/test/CodeGen/AArch64/machine-outliner-retaddr-sign-sp-mod.mir
@@ -86,11 +86,11 @@ body:             |
 # CHECK:          body:             |
 # CHECK-NEXT:         bb.0 (%ir-block.0):
 # CHECK-NEXT:           liveins: $lr
-# CHECK:                frame-setup PACIASP implicit-def $lr, implicit $lr, implicit $sp
-# CHECK-NEXT:           frame-setup CFI_INSTRUCTION negate_ra_sign_state
+# CHECK:                frame-setup CFI_INSTRUCTION negate_ra_sign_state
+# CHECK-NEXT:           frame-setup PACIASP implicit-def $lr, implicit $lr, implicit $sp
 # CHECK:                BL @[[OUTLINED_FUNC:OUTLINED_FUNCTION_[0-9]+]]
-# CHECK:                frame-destroy AUTIASP implicit-def $lr, implicit $lr, implicit $sp
-# CHECK-NEXT:           frame-destroy CFI_INSTRUCTION negate_ra_sign_state
+# CHECK:                frame-destroy CFI_INSTRUCTION negate_ra_sign_state
+# CHECK-NEXT:           frame-destroy AUTIASP implicit-def $lr, implicit $lr, implicit $sp
 # CHECK-NEXT:           RET undef $lr
 
 ...
@@ -119,11 +119,11 @@ body:             |
 # CHECK:          body:             |
 # CHECK-NEXT:         bb.0 (%ir-block.0):
 # CHECK-NEXT:           liveins: $lr
-# CHECK:                frame-setup PACIASP implicit-def $lr, implicit $lr, implicit $sp
-# CHECK-NEXT:           frame-setup CFI_INSTRUCTION negate_ra_sign_state
+# CHECK:                frame-setup CFI_INSTRUCTION negate_ra_sign_state
+# CHECK-NEXT:           frame-setup PACIASP implicit-def $lr, implicit $lr, implicit $sp
 # CHECK:                BL @[[OUTLINED_FUNC]]
-# CHECK:                frame-destroy AUTIASP implicit-def $lr, implicit $lr, implicit $sp
-# CHECK-NEXT:           frame-destroy CFI_INSTRUCTION negate_ra_sign_state
+# CHECK:                frame-destroy CFI_INSTRUCTION negate_ra_sign_state
+# CHECK-NEXT:           frame-destroy AUTIASP implicit-def $lr, implicit $lr, implicit $sp
 # CHECK-NEXT:           RET undef $lr
 
 ...
@@ -174,22 +174,22 @@ body:             |
 # CHECK:          body:             |
 # CHECK-NEXT:         bb.0 (%ir-block.0):
 # CHECK-NEXT:           liveins: $lr
-# CHECK:                frame-setup PACIASP implicit-def $lr, implicit $lr, implicit $sp
-# CHECK-NEXT:           frame-setup CFI_INSTRUCTION negate_ra_sign_state
+# CHECK:                frame-setup CFI_INSTRUCTION negate_ra_sign_state
+# CHECK-NEXT:           frame-setup PACIASP implicit-def $lr, implicit $lr, implicit $sp
 # CHECK-NOT:            BL @OUTLINED_FUNCTION_{{.*}}
-# CHECK:                frame-destroy AUTIASP implicit-def $lr, implicit $lr, implicit $sp
-# CHECK-NEXT:           frame-destroy CFI_INSTRUCTION negate_ra_sign_state
+# CHECK:                frame-destroy CFI_INSTRUCTION negate_ra_sign_state
+# CHECK-NEXT:           frame-destroy AUTIASP implicit-def $lr, implicit $lr, implicit $sp
 # CHECK-NEXT:           RET undef $lr
 
 # CHECK-LABEL:    name:            illegal1
 # CHECK:          body:             |
 # CHECK-NEXT:         bb.0 (%ir-block.0):
 # CHECK-NEXT:           liveins: $lr
-# CHECK:                frame-setup PACIASP implicit-def $lr, implicit $lr, implicit $sp
-# CHECK-NEXT:           frame-setup CFI_INSTRUCTION negate_ra_sign_state
+# CHECK:                frame-setup CFI_INSTRUCTION negate_ra_sign_state
+# CHECK-NEXT:           frame-setup PACIASP implicit-def $lr, implicit $lr, implicit $sp
 # CHECK-NOT:            BL @OUTLINED_FUNCTION_{{.*}}
-# CHECK:                frame-destroy AUTIASP implicit-def $lr, implicit $lr, implicit $sp
-# CHECK-NEXT:           frame-destroy CFI_INSTRUCTION negate_ra_sign_state
+# CHECK:                frame-destroy CFI_INSTRUCTION negate_ra_sign_state
+# CHECK-NEXT:           frame-destroy AUTIASP implicit-def $lr, implicit $lr, implicit $sp
 # CHECK-NEXT:           RET undef $lr
 
 # Outlined function that contains only legal sp modifications
@@ -198,8 +198,8 @@ body:             |
 # CHECK-NEXT:       bb.0:
 # CHECK-NEXT: liveins: $lr
 # CHECK-NEXT: {{^  $}}
-# CHECK-NEXT:         frame-setup PACIASP implicit-def $lr, implicit $lr, implicit $sp
 # CHECK-NEXT:         frame-setup CFI_INSTRUCTION negate_ra_sign_state
+# CHECK-NEXT:         frame-setup PACIASP implicit-def $lr, implicit $lr, implicit $sp
 # CHECK-NEXT:         $sp = frame-setup SUBXri $sp, 16, 0
 # CHECK:              $sp = frame-destroy ADDXri $sp, 16, 0
 # CHECK-NEXT:         frame-destroy AUTIASP implicit-def $lr, implicit $lr, implicit $sp
diff --git a/llvm/test/CodeGen/AArch64/machine-outliner-retaddr-sign-subtarget.ll b/llvm/test/CodeGen/AArch64/machine-outliner-retaddr-sign-subtarget.ll
index a7ea329..cb43b3b 100644
--- a/llvm/test/CodeGen/AArch64/machine-outliner-retaddr-sign-subtarget.ll
+++ b/llvm/test/CodeGen/AArch64/machine-outliner-retaddr-sign-subtarget.ll
@@ -9,8 +9,8 @@ define void @a() #0 {
 ; CHECK-LABEL:      a:                                     // @a
 ; CHECK:            // %bb.0:
 ; CHECK-NEXT:               .cfi_b_key_frame
-; CHECK-NEXT:               pacibsp
 ; CHECK-NEXT:               .cfi_negate_ra_state
+; CHECK-NEXT:               pacibsp
 ; CHECK-NOT:                OUTLINED_FUNCTION_
   %1 = alloca i32, align 4
   %2 = alloca i32, align 4
@@ -33,8 +33,8 @@ define void @b() #0 {
 ; CHECK-LABEL:      b:                                     // @b
 ; CHECK:            // %bb.0:
 ; CHECK-NEXT:               .cfi_b_key_frame
-; CHECK-NEXT:               pacibsp
 ; CHECK-NEXT:               .cfi_negate_ra_state
+; CHECK-NEXT:               pacibsp
 ; CHECK-NOT:                OUTLINED_FUNCTION_
   %1 = alloca i32, align 4
   %2 = alloca i32, align 4
@@ -57,8 +57,8 @@ define void @c() #1 {
 ; CHECK-LABEL:      c:                                     // @c
 ; CHECK:            // %bb.0:
 ; CHECK-NEXT:               .cfi_b_key_frame
-; CHECK-NEXT:               hint #27
 ; CHECK-NEXT:               .cfi_negate_ra_state
+; CHECK-NEXT:               hint #27
 ; CHECK-NOT:                OUTLINED_FUNCTION_
   %1 = alloca i32, align 4
   %2 = alloca i32, align 4
diff --git a/llvm/test/CodeGen/AArch64/machine-outliner-retaddr-sign-thunk.ll b/llvm/test/CodeGen/AArch64/machine-outliner-retaddr-sign-thunk.ll
index da68ea5..0ba4455 100644
--- a/llvm/test/CodeGen/AArch64/machine-outliner-retaddr-sign-thunk.ll
+++ b/llvm/test/CodeGen/AArch64/machine-outliner-retaddr-sign-thunk.ll
@@ -10,8 +10,8 @@ declare i32 @thunk_called_fn(i32, i32, i32, i32)
 define i32 @a() #0 {
 ; V8A-LABEL: a:
 ; V8A:       // %bb.0: // %entry
-; V8A-NEXT:    hint #25
 ; V8A-NEXT:    .cfi_negate_ra_state
+; V8A-NEXT:    hint #25
 ; V8A-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
 ; V8A-NEXT:    .cfi_def_cfa_offset 16
 ; V8A-NEXT:    .cfi_offset w30, -16
@@ -27,8 +27,8 @@ define i32 @a() #0 {
 ;
 ; V83A-LABEL: a:
 ; V83A:       // %bb.0: // %entry
-; V83A-NEXT:    paciasp
 ; V83A-NEXT:    .cfi_negate_ra_state
+; V83A-NEXT:    paciasp
 ; V83A-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
 ; V83A-NEXT:    .cfi_def_cfa_offset 16
 ; V83A-NEXT:    .cfi_offset w30, -16
@@ -49,8 +49,8 @@ entry:
 define i32 @b() #0 {
 ; V8A-LABEL: b:
 ; V8A:       // %bb.0: // %entry
-; V8A-NEXT:    hint #25
 ; V8A-NEXT:    .cfi_negate_ra_state
+; V8A-NEXT:    hint #25
 ; V8A-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
 ; V8A-NEXT:    .cfi_def_cfa_offset 16
 ; V8A-NEXT:    .cfi_offset w30, -16
@@ -66,8 +66,8 @@ define i32 @b() #0 {
 ;
 ; V83A-LABEL: b:
 ; V83A:       // %bb.0: // %entry
-; V83A-NEXT:    paciasp
 ; V83A-NEXT:    .cfi_negate_ra_state
+; V83A-NEXT:    paciasp
 ; V83A-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
 ; V83A-NEXT:    .cfi_def_cfa_offset 16
 ; V83A-NEXT:    .cfi_offset w30, -16
@@ -88,8 +88,8 @@ entry:
 define hidden i32 @c(ptr %fptr) #0 {
 ; V8A-LABEL: c:
 ; V8A:       // %bb.0: // %entry
-; V8A-NEXT:    hint #25
 ; V8A-NEXT:    .cfi_negate_ra_state
+; V8A-NEXT:    hint #25
 ; V8A-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
 ; V8A-NEXT:    .cfi_def_cfa_offset 16
 ; V8A-NEXT:    .cfi_offset w30, -16
@@ -106,8 +106,8 @@ define hidden i32 @c(ptr %fptr) #0 {
 ;
 ; V83A-LABEL: c:
 ; V83A:       // %bb.0: // %entry
-; V83A-NEXT:    paciasp
 ; V83A-NEXT:    .cfi_negate_ra_state
+; V83A-NEXT:    paciasp
 ; V83A-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
 ; V83A-NEXT:    .cfi_def_cfa_offset 16
 ; V83A-NEXT:    .cfi_offset w30, -16
@@ -129,8 +129,8 @@ entry:
 define hidden i32 @d(ptr %fptr) #0 {
 ; V8A-LABEL: d:
 ; V8A:       // %bb.0: // %entry
-; V8A-NEXT:    hint #25
 ; V8A-NEXT:    .cfi_negate_ra_state
+; V8A-NEXT:    hint #25
 ; V8A-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
 ; V8A-NEXT:    .cfi_def_cfa_offset 16
 ; V8A-NEXT:    .cfi_offset w30, -16
@@ -147,8 +147,8 @@ define hidden i32 @d(ptr %fptr) #0 {
 ;
 ; V83A-LABEL: d:
 ; V83A:       // %bb.0: // %entry
-; V83A-NEXT:    paciasp
 ; V83A-NEXT:    .cfi_negate_ra_state
+; V83A-NEXT:    paciasp
 ; V83A-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
 ; V83A-NEXT:    .cfi_def_cfa_offset 16
 ; V83A-NEXT:    .cfi_offset w30, -16
@@ -176,3 +176,5 @@ attributes #0 = { "sign-return-address"="non-leaf" minsize }
 ; CHECK-NOT:         .cfi_negate_ra_state
 ; CHECK-NOT:         auti{{[a,b]}}sp
 ; CHECK-NOT:         hint #{{[29,31]}}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; CHECK: {{.*}}
diff --git a/llvm/test/CodeGen/AArch64/memcmp.ll b/llvm/test/CodeGen/AArch64/memcmp.ll
index 4da7c8c..98ea86b 100644
--- a/llvm/test/CodeGen/AArch64/memcmp.ll
+++ b/llvm/test/CodeGen/AArch64/memcmp.ll
@@ -162,8 +162,7 @@ define i32 @length3(ptr %X, ptr %Y) nounwind {
 ; CHECK-NEXT:    rev w9, w9
 ; CHECK-NEXT:    cmp w8, w9
 ; CHECK-NEXT:    cset w8, hi
-; CHECK-NEXT:    cset w9, lo
-; CHECK-NEXT:    sub w0, w8, w9
+; CHECK-NEXT:    csinv w0, w8, wzr, hs
 ; CHECK-NEXT:    ret
   %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 3) nounwind
   ret i32 %m
@@ -194,8 +193,7 @@ define i32 @length4(ptr %X, ptr %Y) nounwind {
 ; CHECK-NEXT:    rev w9, w9
 ; CHECK-NEXT:    cmp w8, w9
 ; CHECK-NEXT:    cset w8, hi
-; CHECK-NEXT:    cset w9, lo
-; CHECK-NEXT:    sub w0, w8, w9
+; CHECK-NEXT:    csinv w0, w8, wzr, hs
 ; CHECK-NEXT:    ret
   %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 4) nounwind
   ret i32 %m
@@ -259,6 +257,36 @@ define i1 @length4_gt(ptr %X, ptr %Y) nounwind {
   ret i1 %c
 }
 
+define i1 @length4_le(ptr %X, ptr %Y) nounwind {
+; CHECK-LABEL: length4_le:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr w8, [x0]
+; CHECK-NEXT:    ldr w9, [x1]
+; CHECK-NEXT:    rev w8, w8
+; CHECK-NEXT:    rev w9, w9
+; CHECK-NEXT:    cmp w8, w9
+; CHECK-NEXT:    cset w0, ls
+; CHECK-NEXT:    ret
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 4) nounwind
+  %c = icmp slt i32 %m, 1
+  ret i1 %c
+}
+
+define i1 @length4_ge(ptr %X, ptr %Y) nounwind {
+; CHECK-LABEL: length4_ge:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr w8, [x0]
+; CHECK-NEXT:    ldr w9, [x1]
+; CHECK-NEXT:    rev w8, w8
+; CHECK-NEXT:    rev w9, w9
+; CHECK-NEXT:    cmp w8, w9
+; CHECK-NEXT:    cset w0, hs
+; CHECK-NEXT:    ret
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 4) nounwind
+  %c = icmp sgt i32 %m, -1
+  ret i1 %c
+}
+
 define i1 @length4_eq_const(ptr %X) nounwind {
 ; CHECK-LABEL: length4_eq_const:
 ; CHECK:       // %bb.0:
@@ -286,8 +314,7 @@ define i32 @length5(ptr %X, ptr %Y) nounwind {
 ; CHECK-NEXT:    rev x9, x9
 ; CHECK-NEXT:    cmp x8, x9
 ; CHECK-NEXT:    cset w8, hi
-; CHECK-NEXT:    cset w9, lo
-; CHECK-NEXT:    sub w0, w8, w9
+; CHECK-NEXT:    csinv w0, w8, wzr, hs
 ; CHECK-NEXT:    ret
   %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 5) nounwind
   ret i32 %m
@@ -341,8 +368,7 @@ define i32 @length6(ptr %X, ptr %Y) nounwind {
 ; CHECK-NEXT:    rev x9, x9
 ; CHECK-NEXT:    cmp x8, x9
 ; CHECK-NEXT:    cset w8, hi
-; CHECK-NEXT:    cset w9, lo
-; CHECK-NEXT:    sub w0, w8, w9
+; CHECK-NEXT:    csinv w0, w8, wzr, hs
 ; CHECK-NEXT:    ret
   %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 6) nounwind
   ret i32 %m
@@ -375,18 +401,18 @@ define i32 @length7(ptr %X, ptr %Y) nounwind {
 ; CHECK-NEXT:    rev w8, w8
 ; CHECK-NEXT:    rev w9, w9
 ; CHECK-NEXT:    cmp w8, w9
-; CHECK-NEXT:    b.ne .LBB24_3
+; CHECK-NEXT:    b.ne .LBB26_3
 ; CHECK-NEXT:  // %bb.1: // %loadbb1
 ; CHECK-NEXT:    ldur w8, [x0, #3]
 ; CHECK-NEXT:    ldur w9, [x1, #3]
 ; CHECK-NEXT:    rev w8, w8
 ; CHECK-NEXT:    rev w9, w9
 ; CHECK-NEXT:    cmp w8, w9
-; CHECK-NEXT:    b.ne .LBB24_3
+; CHECK-NEXT:    b.ne .LBB26_3
 ; CHECK-NEXT:  // %bb.2:
 ; CHECK-NEXT:    mov w0, wzr
 ; CHECK-NEXT:    ret
-; CHECK-NEXT:  .LBB24_3: // %res_block
+; CHECK-NEXT:  .LBB26_3: // %res_block
 ; CHECK-NEXT:    cmp w8, w9
 ; CHECK-NEXT:    mov w8, #-1 // =0xffffffff
 ; CHECK-NEXT:    cneg w0, w8, hs
@@ -403,18 +429,18 @@ define i1 @length7_lt(ptr %X, ptr %Y) nounwind {
 ; CHECK-NEXT:    rev w8, w8
 ; CHECK-NEXT:    rev w9, w9
 ; CHECK-NEXT:    cmp w8, w9
-; CHECK-NEXT:    b.ne .LBB25_3
+; CHECK-NEXT:    b.ne .LBB27_3
 ; CHECK-NEXT:  // %bb.1: // %loadbb1
 ; CHECK-NEXT:    ldur w8, [x0, #3]
 ; CHECK-NEXT:    ldur w9, [x1, #3]
 ; CHECK-NEXT:    rev w8, w8
 ; CHECK-NEXT:    rev w9, w9
 ; CHECK-NEXT:    cmp w8, w9
-; CHECK-NEXT:    b.ne .LBB25_3
+; CHECK-NEXT:    b.ne .LBB27_3
 ; CHECK-NEXT:  // %bb.2:
 ; CHECK-NEXT:    lsr w0, wzr, #31
 ; CHECK-NEXT:    ret
-; CHECK-NEXT:  .LBB25_3: // %res_block
+; CHECK-NEXT:  .LBB27_3: // %res_block
 ; CHECK-NEXT:    cmp w8, w9
 ; CHECK-NEXT:    mov w8, #-1 // =0xffffffff
 ; CHECK-NEXT:    cneg w8, w8, hs
@@ -450,8 +476,7 @@ define i32 @length8(ptr %X, ptr %Y) nounwind {
 ; CHECK-NEXT:    rev x9, x9
 ; CHECK-NEXT:    cmp x8, x9
 ; CHECK-NEXT:    cset w8, hi
-; CHECK-NEXT:    cset w9, lo
-; CHECK-NEXT:    sub w0, w8, w9
+; CHECK-NEXT:    csinv w0, w8, wzr, hs
 ; CHECK-NEXT:    ret
   %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 8) nounwind
   ret i32 %m
@@ -494,13 +519,13 @@ define i32 @length9(ptr %X, ptr %Y) nounwind {
 ; CHECK-NEXT:    rev x8, x8
 ; CHECK-NEXT:    rev x9, x9
 ; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB30_2
+; CHECK-NEXT:    b.ne .LBB32_2
 ; CHECK-NEXT:  // %bb.1: // %loadbb1
 ; CHECK-NEXT:    ldrb w8, [x0, #8]
 ; CHECK-NEXT:    ldrb w9, [x1, #8]
 ; CHECK-NEXT:    sub w0, w8, w9
 ; CHECK-NEXT:    ret
-; CHECK-NEXT:  .LBB30_2: // %res_block
+; CHECK-NEXT:  .LBB32_2: // %res_block
 ; CHECK-NEXT:    mov w8, #-1 // =0xffffffff
 ; CHECK-NEXT:    cneg w0, w8, hs
 ; CHECK-NEXT:    ret
@@ -532,7 +557,7 @@ define i32 @length10(ptr %X, ptr %Y) nounwind {
 ; CHECK-NEXT:    rev x8, x8
 ; CHECK-NEXT:    rev x9, x9
 ; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB32_3
+; CHECK-NEXT:    b.ne .LBB34_3
 ; CHECK-NEXT:  // %bb.1: // %loadbb1
 ; CHECK-NEXT:    ldrh w8, [x0, #8]
 ; CHECK-NEXT:    ldrh w9, [x1, #8]
@@ -541,11 +566,11 @@ define i32 @length10(ptr %X, ptr %Y) nounwind {
 ; CHECK-NEXT:    lsr w8, w8, #16
 ; CHECK-NEXT:    lsr w9, w9, #16
 ; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB32_3
+; CHECK-NEXT:    b.ne .LBB34_3
 ; CHECK-NEXT:  // %bb.2:
 ; CHECK-NEXT:    mov w0, wzr
 ; CHECK-NEXT:    ret
-; CHECK-NEXT:  .LBB32_3: // %res_block
+; CHECK-NEXT:  .LBB34_3: // %res_block
 ; CHECK-NEXT:    cmp x8, x9
 ; CHECK-NEXT:    mov w8, #-1 // =0xffffffff
 ; CHECK-NEXT:    cneg w0, w8, hs
@@ -578,18 +603,18 @@ define i32 @length11(ptr %X, ptr %Y) nounwind {
 ; CHECK-NEXT:    rev x8, x8
 ; CHECK-NEXT:    rev x9, x9
 ; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB34_3
+; CHECK-NEXT:    b.ne .LBB36_3
 ; CHECK-NEXT:  // %bb.1: // %loadbb1
 ; CHECK-NEXT:    ldur x8, [x0, #3]
 ; CHECK-NEXT:    ldur x9, [x1, #3]
 ; CHECK-NEXT:    rev x8, x8
 ; CHECK-NEXT:    rev x9, x9
 ; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB34_3
+; CHECK-NEXT:    b.ne .LBB36_3
 ; CHECK-NEXT:  // %bb.2:
 ; CHECK-NEXT:    mov w0, wzr
 ; CHECK-NEXT:    ret
-; CHECK-NEXT:  .LBB34_3: // %res_block
+; CHECK-NEXT:  .LBB36_3: // %res_block
 ; CHECK-NEXT:    cmp x8, x9
 ; CHECK-NEXT:    mov w8, #-1 // =0xffffffff
 ; CHECK-NEXT:    cneg w0, w8, hs
@@ -638,18 +663,18 @@ define i32 @length12(ptr %X, ptr %Y) nounwind {
 ; CHECK-NEXT:    rev x8, x8
 ; CHECK-NEXT:    rev x9, x9
 ; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB37_3
+; CHECK-NEXT:    b.ne .LBB39_3
 ; CHECK-NEXT:  // %bb.1: // %loadbb1
 ; CHECK-NEXT:    ldr w8, [x0, #8]
 ; CHECK-NEXT:    ldr w9, [x1, #8]
 ; CHECK-NEXT:    rev w8, w8
 ; CHECK-NEXT:    rev w9, w9
 ; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB37_3
+; CHECK-NEXT:    b.ne .LBB39_3
 ; CHECK-NEXT:  // %bb.2:
 ; CHECK-NEXT:    mov w0, wzr
 ; CHECK-NEXT:    ret
-; CHECK-NEXT:  .LBB37_3: // %res_block
+; CHECK-NEXT:  .LBB39_3: // %res_block
 ; CHECK-NEXT:    cmp x8, x9
 ; CHECK-NEXT:    mov w8, #-1 // =0xffffffff
 ; CHECK-NEXT:    cneg w0, w8, hs
@@ -698,18 +723,18 @@ define i32 @length15(ptr %X, ptr %Y) nounwind {
 ; CHECK-NEXT:    rev x8, x8
 ; CHECK-NEXT:    rev x9, x9
 ; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB40_3
+; CHECK-NEXT:    b.ne .LBB42_3
 ; CHECK-NEXT:  // %bb.1: // %loadbb1
 ; CHECK-NEXT:    ldur x8, [x0, #7]
 ; CHECK-NEXT:    ldur x9, [x1, #7]
 ; CHECK-NEXT:    rev x8, x8
 ; CHECK-NEXT:    rev x9, x9
 ; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB40_3
+; CHECK-NEXT:    b.ne .LBB42_3
 ; CHECK-NEXT:  // %bb.2:
 ; CHECK-NEXT:    mov w0, wzr
 ; CHECK-NEXT:    ret
-; CHECK-NEXT:  .LBB40_3: // %res_block
+; CHECK-NEXT:  .LBB42_3: // %res_block
 ; CHECK-NEXT:    cmp x8, x9
 ; CHECK-NEXT:    mov w8, #-1 // =0xffffffff
 ; CHECK-NEXT:    cneg w0, w8, hs
@@ -726,18 +751,18 @@ define i1 @length15_lt(ptr %X, ptr %Y) nounwind {
 ; CHECK-NEXT:    rev x8, x8
 ; CHECK-NEXT:    rev x9, x9
 ; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB41_3
+; CHECK-NEXT:    b.ne .LBB43_3
 ; CHECK-NEXT:  // %bb.1: // %loadbb1
 ; CHECK-NEXT:    ldur x8, [x0, #7]
 ; CHECK-NEXT:    ldur x9, [x1, #7]
 ; CHECK-NEXT:    rev x8, x8
 ; CHECK-NEXT:    rev x9, x9
 ; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB41_3
+; CHECK-NEXT:    b.ne .LBB43_3
 ; CHECK-NEXT:  // %bb.2:
 ; CHECK-NEXT:    lsr w0, wzr, #31
 ; CHECK-NEXT:    ret
-; CHECK-NEXT:  .LBB41_3: // %res_block
+; CHECK-NEXT:  .LBB43_3: // %res_block
 ; CHECK-NEXT:    cmp x8, x9
 ; CHECK-NEXT:    mov w8, #-1 // =0xffffffff
 ; CHECK-NEXT:    cneg w8, w8, hs
@@ -758,7 +783,7 @@ define i32 @length15_const(ptr %X, ptr %Y) nounwind {
 ; CHECK-NEXT:    rev x9, x9
 ; CHECK-NEXT:    movk x8, #12594, lsl #48
 ; CHECK-NEXT:    cmp x9, x8
-; CHECK-NEXT:    b.ne .LBB42_3
+; CHECK-NEXT:    b.ne .LBB44_3
 ; CHECK-NEXT:  // %bb.1: // %loadbb1
 ; CHECK-NEXT:    mov x8, #13365 // =0x3435
 ; CHECK-NEXT:    ldur x9, [x0, #7]
@@ -767,11 +792,11 @@ define i32 @length15_const(ptr %X, ptr %Y) nounwind {
 ; CHECK-NEXT:    rev x9, x9
 ; CHECK-NEXT:    movk x8, #14393, lsl #48
 ; CHECK-NEXT:    cmp x9, x8
-; CHECK-NEXT:    b.ne .LBB42_3
+; CHECK-NEXT:    b.ne .LBB44_3
 ; CHECK-NEXT:  // %bb.2:
 ; CHECK-NEXT:    mov w0, wzr
 ; CHECK-NEXT:    ret
-; CHECK-NEXT:  .LBB42_3: // %res_block
+; CHECK-NEXT:  .LBB44_3: // %res_block
 ; CHECK-NEXT:    cmp x9, x8
 ; CHECK-NEXT:    mov w8, #-1 // =0xffffffff
 ; CHECK-NEXT:    cneg w0, w8, hs
@@ -806,7 +831,7 @@ define i1 @length15_gt_const(ptr %X, ptr %Y) nounwind {
 ; CHECK-NEXT:    rev x9, x9
 ; CHECK-NEXT:    movk x8, #12594, lsl #48
 ; CHECK-NEXT:    cmp x9, x8
-; CHECK-NEXT:    b.ne .LBB44_3
+; CHECK-NEXT:    b.ne .LBB46_3
 ; CHECK-NEXT:  // %bb.1: // %loadbb1
 ; CHECK-NEXT:    mov x8, #13365 // =0x3435
 ; CHECK-NEXT:    ldur x9, [x0, #7]
@@ -815,15 +840,15 @@ define i1 @length15_gt_const(ptr %X, ptr %Y) nounwind {
 ; CHECK-NEXT:    rev x9, x9
 ; CHECK-NEXT:    movk x8, #14393, lsl #48
 ; CHECK-NEXT:    cmp x9, x8
-; CHECK-NEXT:    b.ne .LBB44_3
+; CHECK-NEXT:    b.ne .LBB46_3
 ; CHECK-NEXT:  // %bb.2:
 ; CHECK-NEXT:    mov w8, wzr
-; CHECK-NEXT:    b .LBB44_4
-; CHECK-NEXT:  .LBB44_3: // %res_block
+; CHECK-NEXT:    b .LBB46_4
+; CHECK-NEXT:  .LBB46_3: // %res_block
 ; CHECK-NEXT:    cmp x9, x8
 ; CHECK-NEXT:    mov w8, #-1 // =0xffffffff
 ; CHECK-NEXT:    cneg w8, w8, hs
-; CHECK-NEXT:  .LBB44_4: // %endblock
+; CHECK-NEXT:  .LBB46_4: // %endblock
 ; CHECK-NEXT:    cmp w8, #0
 ; CHECK-NEXT:    cset w0, gt
 ; CHECK-NEXT:    ret
@@ -841,18 +866,18 @@ define i32 @length16(ptr %X, ptr %Y) nounwind {
 ; CHECK-NEXT:    rev x8, x8
 ; CHECK-NEXT:    rev x9, x9
 ; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB45_3
+; CHECK-NEXT:    b.ne .LBB47_3
 ; CHECK-NEXT:  // %bb.1: // %loadbb1
 ; CHECK-NEXT:    ldr x8, [x0, #8]
 ; CHECK-NEXT:    ldr x9, [x1, #8]
 ; CHECK-NEXT:    rev x8, x8
 ; CHECK-NEXT:    rev x9, x9
 ; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB45_3
+; CHECK-NEXT:    b.ne .LBB47_3
 ; CHECK-NEXT:  // %bb.2:
 ; CHECK-NEXT:    mov w0, wzr
 ; CHECK-NEXT:    ret
-; CHECK-NEXT:  .LBB45_3: // %res_block
+; CHECK-NEXT:  .LBB47_3: // %res_block
 ; CHECK-NEXT:    cmp x8, x9
 ; CHECK-NEXT:    mov w8, #-1 // =0xffffffff
 ; CHECK-NEXT:    cneg w0, w8, hs
@@ -883,18 +908,18 @@ define i1 @length16_lt(ptr %x, ptr %y) nounwind {
 ; CHECK-NEXT:    rev x8, x8
 ; CHECK-NEXT:    rev x9, x9
 ; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB47_3
+; CHECK-NEXT:    b.ne .LBB49_3
 ; CHECK-NEXT:  // %bb.1: // %loadbb1
 ; CHECK-NEXT:    ldr x8, [x0, #8]
 ; CHECK-NEXT:    ldr x9, [x1, #8]
 ; CHECK-NEXT:    rev x8, x8
 ; CHECK-NEXT:    rev x9, x9
 ; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB47_3
+; CHECK-NEXT:    b.ne .LBB49_3
 ; CHECK-NEXT:  // %bb.2:
 ; CHECK-NEXT:    lsr w0, wzr, #31
 ; CHECK-NEXT:    ret
-; CHECK-NEXT:  .LBB47_3: // %res_block
+; CHECK-NEXT:  .LBB49_3: // %res_block
 ; CHECK-NEXT:    cmp x8, x9
 ; CHECK-NEXT:    mov w8, #-1 // =0xffffffff
 ; CHECK-NEXT:    cneg w8, w8, hs
@@ -913,22 +938,22 @@ define i1 @length16_gt(ptr %x, ptr %y) nounwind {
 ; CHECK-NEXT:    rev x8, x8
 ; CHECK-NEXT:    rev x9, x9
 ; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB48_3
+; CHECK-NEXT:    b.ne .LBB50_3
 ; CHECK-NEXT:  // %bb.1: // %loadbb1
 ; CHECK-NEXT:    ldr x8, [x0, #8]
 ; CHECK-NEXT:    ldr x9, [x1, #8]
 ; CHECK-NEXT:    rev x8, x8
 ; CHECK-NEXT:    rev x9, x9
 ; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB48_3
+; CHECK-NEXT:    b.ne .LBB50_3
 ; CHECK-NEXT:  // %bb.2:
 ; CHECK-NEXT:    mov w8, wzr
-; CHECK-NEXT:    b .LBB48_4
-; CHECK-NEXT:  .LBB48_3: // %res_block
+; CHECK-NEXT:    b .LBB50_4
+; CHECK-NEXT:  .LBB50_3: // %res_block
 ; CHECK-NEXT:    cmp x8, x9
 ; CHECK-NEXT:    mov w8, #-1 // =0xffffffff
 ; CHECK-NEXT:    cneg w8, w8, hs
-; CHECK-NEXT:  .LBB48_4: // %endblock
+; CHECK-NEXT:  .LBB50_4: // %endblock
 ; CHECK-NEXT:    cmp w8, #0
 ; CHECK-NEXT:    cset w0, gt
 ; CHECK-NEXT:    ret
@@ -967,25 +992,25 @@ define i32 @length24(ptr %X, ptr %Y) nounwind {
 ; CHECK-NEXT:    rev x8, x8
 ; CHECK-NEXT:    rev x9, x9
 ; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB50_4
+; CHECK-NEXT:    b.ne .LBB52_4
 ; CHECK-NEXT:  // %bb.1: // %loadbb1
 ; CHECK-NEXT:    ldr x8, [x0, #8]
 ; CHECK-NEXT:    ldr x9, [x1, #8]
 ; CHECK-NEXT:    rev x8, x8
 ; CHECK-NEXT:    rev x9, x9
 ; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB50_4
+; CHECK-NEXT:    b.ne .LBB52_4
 ; CHECK-NEXT:  // %bb.2: // %loadbb2
 ; CHECK-NEXT:    ldr x8, [x0, #16]
 ; CHECK-NEXT:    ldr x9, [x1, #16]
 ; CHECK-NEXT:    rev x8, x8
 ; CHECK-NEXT:    rev x9, x9
 ; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB50_4
+; CHECK-NEXT:    b.ne .LBB52_4
 ; CHECK-NEXT:  // %bb.3:
 ; CHECK-NEXT:    mov w0, wzr
 ; CHECK-NEXT:    ret
-; CHECK-NEXT:  .LBB50_4: // %res_block
+; CHECK-NEXT:  .LBB52_4: // %res_block
 ; CHECK-NEXT:    cmp x8, x9
 ; CHECK-NEXT:    mov w8, #-1 // =0xffffffff
 ; CHECK-NEXT:    cneg w0, w8, hs
@@ -1019,25 +1044,25 @@ define i1 @length24_lt(ptr %x, ptr %y) nounwind {
 ; CHECK-NEXT:    rev x8, x8
 ; CHECK-NEXT:    rev x9, x9
 ; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB52_4
+; CHECK-NEXT:    b.ne .LBB54_4
 ; CHECK-NEXT:  // %bb.1: // %loadbb1
 ; CHECK-NEXT:    ldr x8, [x0, #8]
 ; CHECK-NEXT:    ldr x9, [x1, #8]
 ; CHECK-NEXT:    rev x8, x8
 ; CHECK-NEXT:    rev x9, x9
 ; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB52_4
+; CHECK-NEXT:    b.ne .LBB54_4
 ; CHECK-NEXT:  // %bb.2: // %loadbb2
 ; CHECK-NEXT:    ldr x8, [x0, #16]
 ; CHECK-NEXT:    ldr x9, [x1, #16]
 ; CHECK-NEXT:    rev x8, x8
 ; CHECK-NEXT:    rev x9, x9
 ; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB52_4
+; CHECK-NEXT:    b.ne .LBB54_4
 ; CHECK-NEXT:  // %bb.3:
 ; CHECK-NEXT:    lsr w0, wzr, #31
 ; CHECK-NEXT:    ret
-; CHECK-NEXT:  .LBB52_4: // %res_block
+; CHECK-NEXT:  .LBB54_4: // %res_block
 ; CHECK-NEXT:    cmp x8, x9
 ; CHECK-NEXT:    mov w8, #-1 // =0xffffffff
 ; CHECK-NEXT:    cneg w8, w8, hs
@@ -1056,29 +1081,29 @@ define i1 @length24_gt(ptr %x, ptr %y) nounwind {
 ; CHECK-NEXT:    rev x8, x8
 ; CHECK-NEXT:    rev x9, x9
 ; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB53_4
+; CHECK-NEXT:    b.ne .LBB55_4
 ; CHECK-NEXT:  // %bb.1: // %loadbb1
 ; CHECK-NEXT:    ldr x8, [x0, #8]
 ; CHECK-NEXT:    ldr x9, [x1, #8]
 ; CHECK-NEXT:    rev x8, x8
 ; CHECK-NEXT:    rev x9, x9
 ; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB53_4
+; CHECK-NEXT:    b.ne .LBB55_4
 ; CHECK-NEXT:  // %bb.2: // %loadbb2
 ; CHECK-NEXT:    ldr x8, [x0, #16]
 ; CHECK-NEXT:    ldr x9, [x1, #16]
 ; CHECK-NEXT:    rev x8, x8
 ; CHECK-NEXT:    rev x9, x9
 ; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB53_4
+; CHECK-NEXT:    b.ne .LBB55_4
 ; CHECK-NEXT:  // %bb.3:
 ; CHECK-NEXT:    mov w8, wzr
-; CHECK-NEXT:    b .LBB53_5
-; CHECK-NEXT:  .LBB53_4: // %res_block
+; CHECK-NEXT:    b .LBB55_5
+; CHECK-NEXT:  .LBB55_4: // %res_block
 ; CHECK-NEXT:    cmp x8, x9
 ; CHECK-NEXT:    mov w8, #-1 // =0xffffffff
 ; CHECK-NEXT:    cneg w8, w8, hs
-; CHECK-NEXT:  .LBB53_5: // %endblock
+; CHECK-NEXT:  .LBB55_5: // %endblock
 ; CHECK-NEXT:    cmp w8, #0
 ; CHECK-NEXT:    cset w0, gt
 ; CHECK-NEXT:    ret
@@ -1122,32 +1147,32 @@ define i32 @length31(ptr %X, ptr %Y) nounwind {
 ; CHECK-NEXT:    rev x8, x8
 ; CHECK-NEXT:    rev x9, x9
 ; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB55_5
+; CHECK-NEXT:    b.ne .LBB57_5
 ; CHECK-NEXT:  // %bb.1: // %loadbb1
 ; CHECK-NEXT:    ldr x8, [x0, #8]
 ; CHECK-NEXT:    ldr x9, [x1, #8]
 ; CHECK-NEXT:    rev x8, x8
 ; CHECK-NEXT:    rev x9, x9
 ; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB55_5
+; CHECK-NEXT:    b.ne .LBB57_5
 ; CHECK-NEXT:  // %bb.2: // %loadbb2
 ; CHECK-NEXT:    ldr x8, [x0, #16]
 ; CHECK-NEXT:    ldr x9, [x1, #16]
 ; CHECK-NEXT:    rev x8, x8
 ; CHECK-NEXT:    rev x9, x9
 ; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB55_5
+; CHECK-NEXT:    b.ne .LBB57_5
 ; CHECK-NEXT:  // %bb.3: // %loadbb3
 ; CHECK-NEXT:    ldur x8, [x0, #23]
 ; CHECK-NEXT:    ldur x9, [x1, #23]
 ; CHECK-NEXT:    rev x8, x8
 ; CHECK-NEXT:    rev x9, x9
 ; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB55_5
+; CHECK-NEXT:    b.ne .LBB57_5
 ; CHECK-NEXT:  // %bb.4:
 ; CHECK-NEXT:    mov w0, wzr
 ; CHECK-NEXT:    ret
-; CHECK-NEXT:  .LBB55_5: // %res_block
+; CHECK-NEXT:  .LBB57_5: // %res_block
 ; CHECK-NEXT:    cmp x8, x9
 ; CHECK-NEXT:    mov w8, #-1 // =0xffffffff
 ; CHECK-NEXT:    cneg w0, w8, hs
@@ -1184,32 +1209,32 @@ define i1 @length31_lt(ptr %x, ptr %y) nounwind {
 ; CHECK-NEXT:    rev x8, x8
 ; CHECK-NEXT:    rev x9, x9
 ; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB57_5
+; CHECK-NEXT:    b.ne .LBB59_5
 ; CHECK-NEXT:  // %bb.1: // %loadbb1
 ; CHECK-NEXT:    ldr x8, [x0, #8]
 ; CHECK-NEXT:    ldr x9, [x1, #8]
 ; CHECK-NEXT:    rev x8, x8
 ; CHECK-NEXT:    rev x9, x9
 ; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB57_5
+; CHECK-NEXT:    b.ne .LBB59_5
 ; CHECK-NEXT:  // %bb.2: // %loadbb2
 ; CHECK-NEXT:    ldr x8, [x0, #16]
 ; CHECK-NEXT:    ldr x9, [x1, #16]
 ; CHECK-NEXT:    rev x8, x8
 ; CHECK-NEXT:    rev x9, x9
 ; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB57_5
+; CHECK-NEXT:    b.ne .LBB59_5
 ; CHECK-NEXT:  // %bb.3: // %loadbb3
 ; CHECK-NEXT:    ldur x8, [x0, #23]
 ; CHECK-NEXT:    ldur x9, [x1, #23]
 ; CHECK-NEXT:    rev x8, x8
 ; CHECK-NEXT:    rev x9, x9
 ; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB57_5
+; CHECK-NEXT:    b.ne .LBB59_5
 ; CHECK-NEXT:  // %bb.4:
 ; CHECK-NEXT:    lsr w0, wzr, #31
 ; CHECK-NEXT:    ret
-; CHECK-NEXT:  .LBB57_5: // %res_block
+; CHECK-NEXT:  .LBB59_5: // %res_block
 ; CHECK-NEXT:    cmp x8, x9
 ; CHECK-NEXT:    mov w8, #-1 // =0xffffffff
 ; CHECK-NEXT:    cneg w8, w8, hs
@@ -1228,36 +1253,36 @@ define i1 @length31_gt(ptr %x, ptr %y) nounwind {
 ; CHECK-NEXT:    rev x8, x8
 ; CHECK-NEXT:    rev x9, x9
 ; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB58_5
+; CHECK-NEXT:    b.ne .LBB60_5
 ; CHECK-NEXT:  // %bb.1: // %loadbb1
 ; CHECK-NEXT:    ldr x8, [x0, #8]
 ; CHECK-NEXT:    ldr x9, [x1, #8]
 ; CHECK-NEXT:    rev x8, x8
 ; CHECK-NEXT:    rev x9, x9
 ; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB58_5
+; CHECK-NEXT:    b.ne .LBB60_5
 ; CHECK-NEXT:  // %bb.2: // %loadbb2
 ; CHECK-NEXT:    ldr x8, [x0, #16]
 ; CHECK-NEXT:    ldr x9, [x1, #16]
 ; CHECK-NEXT:    rev x8, x8
 ; CHECK-NEXT:    rev x9, x9
 ; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB58_5
+; CHECK-NEXT:    b.ne .LBB60_5
 ; CHECK-NEXT:  // %bb.3: // %loadbb3
 ; CHECK-NEXT:    ldur x8, [x0, #23]
 ; CHECK-NEXT:    ldur x9, [x1, #23]
 ; CHECK-NEXT:    rev x8, x8
 ; CHECK-NEXT:    rev x9, x9
 ; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB58_5
+; CHECK-NEXT:    b.ne .LBB60_5
 ; CHECK-NEXT:  // %bb.4:
 ; CHECK-NEXT:    mov w8, wzr
-; CHECK-NEXT:    b .LBB58_6
-; CHECK-NEXT:  .LBB58_5: // %res_block
+; CHECK-NEXT:    b .LBB60_6
+; CHECK-NEXT:  .LBB60_5: // %res_block
 ; CHECK-NEXT:    cmp x8, x9
 ; CHECK-NEXT:    mov w8, #-1 // =0xffffffff
 ; CHECK-NEXT:    cneg w8, w8, hs
-; CHECK-NEXT:  .LBB58_6: // %endblock
+; CHECK-NEXT:  .LBB60_6: // %endblock
 ; CHECK-NEXT:    cmp w8, #0
 ; CHECK-NEXT:    cset w0, gt
 ; CHECK-NEXT:    ret
@@ -1327,32 +1352,32 @@ define i32 @length32(ptr %X, ptr %Y) nounwind {
 ; CHECK-NEXT:    rev x8, x8
 ; CHECK-NEXT:    rev x9, x9
 ; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB61_5
+; CHECK-NEXT:    b.ne .LBB63_5
 ; CHECK-NEXT:  // %bb.1: // %loadbb1
 ; CHECK-NEXT:    ldr x8, [x0, #8]
 ; CHECK-NEXT:    ldr x9, [x1, #8]
 ; CHECK-NEXT:    rev x8, x8
 ; CHECK-NEXT:    rev x9, x9
 ; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB61_5
+; CHECK-NEXT:    b.ne .LBB63_5
 ; CHECK-NEXT:  // %bb.2: // %loadbb2
 ; CHECK-NEXT:    ldr x8, [x0, #16]
 ; CHECK-NEXT:    ldr x9, [x1, #16]
 ; CHECK-NEXT:    rev x8, x8
 ; CHECK-NEXT:    rev x9, x9
 ; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB61_5
+; CHECK-NEXT:    b.ne .LBB63_5
 ; CHECK-NEXT:  // %bb.3: // %loadbb3
 ; CHECK-NEXT:    ldr x8, [x0, #24]
 ; CHECK-NEXT:    ldr x9, [x1, #24]
 ; CHECK-NEXT:    rev x8, x8
 ; CHECK-NEXT:    rev x9, x9
 ; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB61_5
+; CHECK-NEXT:    b.ne .LBB63_5
 ; CHECK-NEXT:  // %bb.4:
 ; CHECK-NEXT:    mov w0, wzr
 ; CHECK-NEXT:    ret
-; CHECK-NEXT:  .LBB61_5: // %res_block
+; CHECK-NEXT:  .LBB63_5: // %res_block
 ; CHECK-NEXT:    cmp x8, x9
 ; CHECK-NEXT:    mov w8, #-1 // =0xffffffff
 ; CHECK-NEXT:    cneg w0, w8, hs
@@ -1388,32 +1413,32 @@ define i1 @length32_lt(ptr %x, ptr %y) nounwind {
 ; CHECK-NEXT:    rev x8, x8
 ; CHECK-NEXT:    rev x9, x9
 ; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB63_5
+; CHECK-NEXT:    b.ne .LBB65_5
 ; CHECK-NEXT:  // %bb.1: // %loadbb1
 ; CHECK-NEXT:    ldr x8, [x0, #8]
 ; CHECK-NEXT:    ldr x9, [x1, #8]
 ; CHECK-NEXT:    rev x8, x8
 ; CHECK-NEXT:    rev x9, x9
 ; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB63_5
+; CHECK-NEXT:    b.ne .LBB65_5
 ; CHECK-NEXT:  // %bb.2: // %loadbb2
 ; CHECK-NEXT:    ldr x8, [x0, #16]
 ; CHECK-NEXT:    ldr x9, [x1, #16]
 ; CHECK-NEXT:    rev x8, x8
 ; CHECK-NEXT:    rev x9, x9
 ; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB63_5
+; CHECK-NEXT:    b.ne .LBB65_5
 ; CHECK-NEXT:  // %bb.3: // %loadbb3
 ; CHECK-NEXT:    ldr x8, [x0, #24]
 ; CHECK-NEXT:    ldr x9, [x1, #24]
 ; CHECK-NEXT:    rev x8, x8
 ; CHECK-NEXT:    rev x9, x9
 ; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB63_5
+; CHECK-NEXT:    b.ne .LBB65_5
 ; CHECK-NEXT:  // %bb.4:
 ; CHECK-NEXT:    lsr w0, wzr, #31
 ; CHECK-NEXT:    ret
-; CHECK-NEXT:  .LBB63_5: // %res_block
+; CHECK-NEXT:  .LBB65_5: // %res_block
 ; CHECK-NEXT:    cmp x8, x9
 ; CHECK-NEXT:    mov w8, #-1 // =0xffffffff
 ; CHECK-NEXT:    cneg w8, w8, hs
@@ -1432,36 +1457,36 @@ define i1 @length32_gt(ptr %x, ptr %y) nounwind {
 ; CHECK-NEXT:    rev x8, x8
 ; CHECK-NEXT:    rev x9, x9
 ; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB64_5
+; CHECK-NEXT:    b.ne .LBB66_5
 ; CHECK-NEXT:  // %bb.1: // %loadbb1
 ; CHECK-NEXT:    ldr x8, [x0, #8]
 ; CHECK-NEXT:    ldr x9, [x1, #8]
 ; CHECK-NEXT:    rev x8, x8
 ; CHECK-NEXT:    rev x9, x9
 ; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB64_5
+; CHECK-NEXT:    b.ne .LBB66_5
 ; CHECK-NEXT:  // %bb.2: // %loadbb2
 ; CHECK-NEXT:    ldr x8, [x0, #16]
 ; CHECK-NEXT:    ldr x9, [x1, #16]
 ; CHECK-NEXT:    rev x8, x8
 ; CHECK-NEXT:    rev x9, x9
 ; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB64_5
+; CHECK-NEXT:    b.ne .LBB66_5
 ; CHECK-NEXT:  // %bb.3: // %loadbb3
 ; CHECK-NEXT:    ldr x8, [x0, #24]
 ; CHECK-NEXT:    ldr x9, [x1, #24]
 ; CHECK-NEXT:    rev x8, x8
 ; CHECK-NEXT:    rev x9, x9
 ; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB64_5
+; CHECK-NEXT:    b.ne .LBB66_5
 ; CHECK-NEXT:  // %bb.4:
 ; CHECK-NEXT:    mov w8, wzr
-; CHECK-NEXT:    b .LBB64_6
-; CHECK-NEXT:  .LBB64_5: // %res_block
+; CHECK-NEXT:    b .LBB66_6
+; CHECK-NEXT:  .LBB66_5: // %res_block
 ; CHECK-NEXT:    cmp x8, x9
 ; CHECK-NEXT:    mov w8, #-1 // =0xffffffff
 ; CHECK-NEXT:    cneg w8, w8, hs
-; CHECK-NEXT:  .LBB64_6: // %endblock
+; CHECK-NEXT:  .LBB66_6: // %endblock
 ; CHECK-NEXT:    cmp w8, #0
 ; CHECK-NEXT:    cset w0, gt
 ; CHECK-NEXT:    ret
@@ -1528,46 +1553,46 @@ define i32 @length48(ptr %X, ptr %Y) nounwind {
 ; CHECK-NEXT:    rev x8, x8
 ; CHECK-NEXT:    rev x9, x9
 ; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB67_7
+; CHECK-NEXT:    b.ne .LBB69_7
 ; CHECK-NEXT:  // %bb.1: // %loadbb1
 ; CHECK-NEXT:    ldr x8, [x0, #8]
 ; CHECK-NEXT:    ldr x9, [x1, #8]
 ; CHECK-NEXT:    rev x8, x8
 ; CHECK-NEXT:    rev x9, x9
 ; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB67_7
+; CHECK-NEXT:    b.ne .LBB69_7
 ; CHECK-NEXT:  // %bb.2: // %loadbb2
 ; CHECK-NEXT:    ldr x8, [x0, #16]
 ; CHECK-NEXT:    ldr x9, [x1, #16]
 ; CHECK-NEXT:    rev x8, x8
 ; CHECK-NEXT:    rev x9, x9
 ; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB67_7
+; CHECK-NEXT:    b.ne .LBB69_7
 ; CHECK-NEXT:  // %bb.3: // %loadbb3
 ; CHECK-NEXT:    ldr x8, [x0, #24]
 ; CHECK-NEXT:    ldr x9, [x1, #24]
 ; CHECK-NEXT:    rev x8, x8
 ; CHECK-NEXT:    rev x9, x9
 ; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB67_7
+; CHECK-NEXT:    b.ne .LBB69_7
 ; CHECK-NEXT:  // %bb.4: // %loadbb4
 ; CHECK-NEXT:    ldr x8, [x0, #32]
 ; CHECK-NEXT:    ldr x9, [x1, #32]
 ; CHECK-NEXT:    rev x8, x8
 ; CHECK-NEXT:    rev x9, x9
 ; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB67_7
+; CHECK-NEXT:    b.ne .LBB69_7
 ; CHECK-NEXT:  // %bb.5: // %loadbb5
 ; CHECK-NEXT:    ldr x8, [x0, #40]
 ; CHECK-NEXT:    ldr x9, [x1, #40]
 ; CHECK-NEXT:    rev x8, x8
 ; CHECK-NEXT:    rev x9, x9
 ; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB67_7
+; CHECK-NEXT:    b.ne .LBB69_7
 ; CHECK-NEXT:  // %bb.6:
 ; CHECK-NEXT:    mov w0, wzr
 ; CHECK-NEXT:    ret
-; CHECK-NEXT:  .LBB67_7: // %res_block
+; CHECK-NEXT:  .LBB69_7: // %res_block
 ; CHECK-NEXT:    cmp x8, x9
 ; CHECK-NEXT:    mov w8, #-1 // =0xffffffff
 ; CHECK-NEXT:    cneg w0, w8, hs
@@ -1606,46 +1631,46 @@ define i1 @length48_lt(ptr %x, ptr %y) nounwind {
 ; CHECK-NEXT:    rev x8, x8
 ; CHECK-NEXT:    rev x9, x9
 ; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB69_7
+; CHECK-NEXT:    b.ne .LBB71_7
 ; CHECK-NEXT:  // %bb.1: // %loadbb1
 ; CHECK-NEXT:    ldr x8, [x0, #8]
 ; CHECK-NEXT:    ldr x9, [x1, #8]
 ; CHECK-NEXT:    rev x8, x8
 ; CHECK-NEXT:    rev x9, x9
 ; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB69_7
+; CHECK-NEXT:    b.ne .LBB71_7
 ; CHECK-NEXT:  // %bb.2: // %loadbb2
 ; CHECK-NEXT:    ldr x8, [x0, #16]
 ; CHECK-NEXT:    ldr x9, [x1, #16]
 ; CHECK-NEXT:    rev x8, x8
 ; CHECK-NEXT:    rev x9, x9
 ; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB69_7
+; CHECK-NEXT:    b.ne .LBB71_7
 ; CHECK-NEXT:  // %bb.3: // %loadbb3
 ; CHECK-NEXT:    ldr x8, [x0, #24]
 ; CHECK-NEXT:    ldr x9, [x1, #24]
 ; CHECK-NEXT:    rev x8, x8
 ; CHECK-NEXT:    rev x9, x9
 ; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB69_7
+; CHECK-NEXT:    b.ne .LBB71_7
 ; CHECK-NEXT:  // %bb.4: // %loadbb4
 ; CHECK-NEXT:    ldr x8, [x0, #32]
 ; CHECK-NEXT:    ldr x9, [x1, #32]
 ; CHECK-NEXT:    rev x8, x8
 ; CHECK-NEXT:    rev x9, x9
 ; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB69_7
+; CHECK-NEXT:    b.ne .LBB71_7
 ; CHECK-NEXT:  // %bb.5: // %loadbb5
 ; CHECK-NEXT:    ldr x8, [x0, #40]
 ; CHECK-NEXT:    ldr x9, [x1, #40]
 ; CHECK-NEXT:    rev x8, x8
 ; CHECK-NEXT:    rev x9, x9
 ; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB69_7
+; CHECK-NEXT:    b.ne .LBB71_7
 ; CHECK-NEXT:  // %bb.6:
 ; CHECK-NEXT:    lsr w0, wzr, #31
 ; CHECK-NEXT:    ret
-; CHECK-NEXT:  .LBB69_7: // %res_block
+; CHECK-NEXT:  .LBB71_7: // %res_block
 ; CHECK-NEXT:    cmp x8, x9
 ; CHECK-NEXT:    mov w8, #-1 // =0xffffffff
 ; CHECK-NEXT:    cneg w8, w8, hs
@@ -1664,50 +1689,50 @@ define i1 @length48_gt(ptr %x, ptr %y) nounwind {
 ; CHECK-NEXT:    rev x8, x8
 ; CHECK-NEXT:    rev x9, x9
 ; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB70_7
+; CHECK-NEXT:    b.ne .LBB72_7
 ; CHECK-NEXT:  // %bb.1: // %loadbb1
 ; CHECK-NEXT:    ldr x8, [x0, #8]
 ; CHECK-NEXT:    ldr x9, [x1, #8]
 ; CHECK-NEXT:    rev x8, x8
 ; CHECK-NEXT:    rev x9, x9
 ; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB70_7
+; CHECK-NEXT:    b.ne .LBB72_7
 ; CHECK-NEXT:  // %bb.2: // %loadbb2
 ; CHECK-NEXT:    ldr x8, [x0, #16]
 ; CHECK-NEXT:    ldr x9, [x1, #16]
 ; CHECK-NEXT:    rev x8, x8
 ; CHECK-NEXT:    rev x9, x9
 ; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB70_7
+; CHECK-NEXT:    b.ne .LBB72_7
 ; CHECK-NEXT:  // %bb.3: // %loadbb3
 ; CHECK-NEXT:    ldr x8, [x0, #24]
 ; CHECK-NEXT:    ldr x9, [x1, #24]
 ; CHECK-NEXT:    rev x8, x8
 ; CHECK-NEXT:    rev x9, x9
 ; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB70_7
+; CHECK-NEXT:    b.ne .LBB72_7
 ; CHECK-NEXT:  // %bb.4: // %loadbb4
 ; CHECK-NEXT:    ldr x8, [x0, #32]
 ; CHECK-NEXT:    ldr x9, [x1, #32]
 ; CHECK-NEXT:    rev x8, x8
 ; CHECK-NEXT:    rev x9, x9
 ; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB70_7
+; CHECK-NEXT:    b.ne .LBB72_7
 ; CHECK-NEXT:  // %bb.5: // %loadbb5
 ; CHECK-NEXT:    ldr x8, [x0, #40]
 ; CHECK-NEXT:    ldr x9, [x1, #40]
 ; CHECK-NEXT:    rev x8, x8
 ; CHECK-NEXT:    rev x9, x9
 ; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB70_7
+; CHECK-NEXT:    b.ne .LBB72_7
 ; CHECK-NEXT:  // %bb.6:
 ; CHECK-NEXT:    mov w8, wzr
-; CHECK-NEXT:    b .LBB70_8
-; CHECK-NEXT:  .LBB70_7: // %res_block
+; CHECK-NEXT:    b .LBB72_8
+; CHECK-NEXT:  .LBB72_7: // %res_block
 ; CHECK-NEXT:    cmp x8, x9
 ; CHECK-NEXT:    mov w8, #-1 // =0xffffffff
 ; CHECK-NEXT:    cneg w8, w8, hs
-; CHECK-NEXT:  .LBB70_8: // %endblock
+; CHECK-NEXT:  .LBB72_8: // %endblock
 ; CHECK-NEXT:    cmp w8, #0
 ; CHECK-NEXT:    cset w0, gt
 ; CHECK-NEXT:    ret
@@ -1785,60 +1810,60 @@ define i32 @length63(ptr %X, ptr %Y) nounwind {
 ; CHECK-NEXT:    rev x8, x8
 ; CHECK-NEXT:    rev x9, x9
 ; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB73_9
+; CHECK-NEXT:    b.ne .LBB75_9
 ; CHECK-NEXT:  // %bb.1: // %loadbb1
 ; CHECK-NEXT:    ldr x8, [x0, #8]
 ; CHECK-NEXT:    ldr x9, [x1, #8]
 ; CHECK-NEXT:    rev x8, x8
 ; CHECK-NEXT:    rev x9, x9
 ; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB73_9
+; CHECK-NEXT:    b.ne .LBB75_9
 ; CHECK-NEXT:  // %bb.2: // %loadbb2
 ; CHECK-NEXT:    ldr x8, [x0, #16]
 ; CHECK-NEXT:    ldr x9, [x1, #16]
 ; CHECK-NEXT:    rev x8, x8
 ; CHECK-NEXT:    rev x9, x9
 ; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB73_9
+; CHECK-NEXT:    b.ne .LBB75_9
 ; CHECK-NEXT:  // %bb.3: // %loadbb3
 ; CHECK-NEXT:    ldr x8, [x0, #24]
 ; CHECK-NEXT:    ldr x9, [x1, #24]
 ; CHECK-NEXT:    rev x8, x8
 ; CHECK-NEXT:    rev x9, x9
 ; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB73_9
+; CHECK-NEXT:    b.ne .LBB75_9
 ; CHECK-NEXT:  // %bb.4: // %loadbb4
 ; CHECK-NEXT:    ldr x8, [x0, #32]
 ; CHECK-NEXT:    ldr x9, [x1, #32]
 ; CHECK-NEXT:    rev x8, x8
 ; CHECK-NEXT:    rev x9, x9
 ; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB73_9
+; CHECK-NEXT:    b.ne .LBB75_9
 ; CHECK-NEXT:  // %bb.5: // %loadbb5
 ; CHECK-NEXT:    ldr x8, [x0, #40]
 ; CHECK-NEXT:    ldr x9, [x1, #40]
 ; CHECK-NEXT:    rev x8, x8
 ; CHECK-NEXT:    rev x9, x9
 ; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB73_9
+; CHECK-NEXT:    b.ne .LBB75_9
 ; CHECK-NEXT:  // %bb.6: // %loadbb6
 ; CHECK-NEXT:    ldr x8, [x0, #48]
 ; CHECK-NEXT:    ldr x9, [x1, #48]
 ; CHECK-NEXT:    rev x8, x8
 ; CHECK-NEXT:    rev x9, x9
 ; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB73_9
+; CHECK-NEXT:    b.ne .LBB75_9
 ; CHECK-NEXT:  // %bb.7: // %loadbb7
 ; CHECK-NEXT:    ldur x8, [x0, #55]
 ; CHECK-NEXT:    ldur x9, [x1, #55]
 ; CHECK-NEXT:    rev x8, x8
 ; CHECK-NEXT:    rev x9, x9
 ; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB73_9
+; CHECK-NEXT:    b.ne .LBB75_9
 ; CHECK-NEXT:  // %bb.8:
 ; CHECK-NEXT:    mov w0, wzr
 ; CHECK-NEXT:    ret
-; CHECK-NEXT:  .LBB73_9: // %res_block
+; CHECK-NEXT:  .LBB75_9: // %res_block
 ; CHECK-NEXT:    cmp x8, x9
 ; CHECK-NEXT:    mov w8, #-1 // =0xffffffff
 ; CHECK-NEXT:    cneg w0, w8, hs
@@ -1883,60 +1908,60 @@ define i1 @length63_lt(ptr %x, ptr %y) nounwind {
 ; CHECK-NEXT:    rev x8, x8
 ; CHECK-NEXT:    rev x9, x9
 ; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB75_9
+; CHECK-NEXT:    b.ne .LBB77_9
 ; CHECK-NEXT:  // %bb.1: // %loadbb1
 ; CHECK-NEXT:    ldr x8, [x0, #8]
 ; CHECK-NEXT:    ldr x9, [x1, #8]
 ; CHECK-NEXT:    rev x8, x8
 ; CHECK-NEXT:    rev x9, x9
 ; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB75_9
+; CHECK-NEXT:    b.ne .LBB77_9
 ; CHECK-NEXT:  // %bb.2: // %loadbb2
 ; CHECK-NEXT:    ldr x8, [x0, #16]
 ; CHECK-NEXT:    ldr x9, [x1, #16]
 ; CHECK-NEXT:    rev x8, x8
 ; CHECK-NEXT:    rev x9, x9
 ; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB75_9
+; CHECK-NEXT:    b.ne .LBB77_9
 ; CHECK-NEXT:  // %bb.3: // %loadbb3
 ; CHECK-NEXT:    ldr x8, [x0, #24]
 ; CHECK-NEXT:    ldr x9, [x1, #24]
 ; CHECK-NEXT:    rev x8, x8
 ; CHECK-NEXT:    rev x9, x9
 ; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB75_9
+; CHECK-NEXT:    b.ne .LBB77_9
 ; CHECK-NEXT:  // %bb.4: // %loadbb4
 ; CHECK-NEXT:    ldr x8, [x0, #32]
 ; CHECK-NEXT:    ldr x9, [x1, #32]
 ; CHECK-NEXT:    rev x8, x8
 ; CHECK-NEXT:    rev x9, x9
 ; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB75_9
+; CHECK-NEXT:    b.ne .LBB77_9
 ; CHECK-NEXT:  // %bb.5: // %loadbb5
 ; CHECK-NEXT:    ldr x8, [x0, #40]
 ; CHECK-NEXT:    ldr x9, [x1, #40]
 ; CHECK-NEXT:    rev x8, x8
 ; CHECK-NEXT:    rev x9, x9
 ; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB75_9
+; CHECK-NEXT:    b.ne .LBB77_9
 ; CHECK-NEXT:  // %bb.6: // %loadbb6
 ; CHECK-NEXT:    ldr x8, [x0, #48]
 ; CHECK-NEXT:    ldr x9, [x1, #48]
 ; CHECK-NEXT:    rev x8, x8
 ; CHECK-NEXT:    rev x9, x9
 ; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB75_9
+; CHECK-NEXT:    b.ne .LBB77_9
 ; CHECK-NEXT:  // %bb.7: // %loadbb7
 ; CHECK-NEXT:    ldur x8, [x0, #55]
 ; CHECK-NEXT:    ldur x9, [x1, #55]
 ; CHECK-NEXT:    rev x8, x8
 ; CHECK-NEXT:    rev x9, x9
 ; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB75_9
+; CHECK-NEXT:    b.ne .LBB77_9
 ; CHECK-NEXT:  // %bb.8:
 ; CHECK-NEXT:    lsr w0, wzr, #31
 ; CHECK-NEXT:    ret
-; CHECK-NEXT:  .LBB75_9: // %res_block
+; CHECK-NEXT:  .LBB77_9: // %res_block
 ; CHECK-NEXT:    cmp x8, x9
 ; CHECK-NEXT:    mov w8, #-1 // =0xffffffff
 ; CHECK-NEXT:    cneg w8, w8, hs
@@ -1955,64 +1980,64 @@ define i1 @length63_gt(ptr %x, ptr %y) nounwind {
 ; CHECK-NEXT:    rev x8, x8
 ; CHECK-NEXT:    rev x9, x9
 ; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB76_9
+; CHECK-NEXT:    b.ne .LBB78_9
 ; CHECK-NEXT:  // %bb.1: // %loadbb1
 ; CHECK-NEXT:    ldr x8, [x0, #8]
 ; CHECK-NEXT:    ldr x9, [x1, #8]
 ; CHECK-NEXT:    rev x8, x8
 ; CHECK-NEXT:    rev x9, x9
 ; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB76_9
+; CHECK-NEXT:    b.ne .LBB78_9
 ; CHECK-NEXT:  // %bb.2: // %loadbb2
 ; CHECK-NEXT:    ldr x8, [x0, #16]
 ; CHECK-NEXT:    ldr x9, [x1, #16]
 ; CHECK-NEXT:    rev x8, x8
 ; CHECK-NEXT:    rev x9, x9
 ; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB76_9
+; CHECK-NEXT:    b.ne .LBB78_9
 ; CHECK-NEXT:  // %bb.3: // %loadbb3
 ; CHECK-NEXT:    ldr x8, [x0, #24]
 ; CHECK-NEXT:    ldr x9, [x1, #24]
 ; CHECK-NEXT:    rev x8, x8
 ; CHECK-NEXT:    rev x9, x9
 ; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB76_9
+; CHECK-NEXT:    b.ne .LBB78_9
 ; CHECK-NEXT:  // %bb.4: // %loadbb4
 ; CHECK-NEXT:    ldr x8, [x0, #32]
 ; CHECK-NEXT:    ldr x9, [x1, #32]
 ; CHECK-NEXT:    rev x8, x8
 ; CHECK-NEXT:    rev x9, x9
 ; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB76_9
+; CHECK-NEXT:    b.ne .LBB78_9
 ; CHECK-NEXT:  // %bb.5: // %loadbb5
 ; CHECK-NEXT:    ldr x8, [x0, #40]
 ; CHECK-NEXT:    ldr x9, [x1, #40]
 ; CHECK-NEXT:    rev x8, x8
 ; CHECK-NEXT:    rev x9, x9
 ; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB76_9
+; CHECK-NEXT:    b.ne .LBB78_9
 ; CHECK-NEXT:  // %bb.6: // %loadbb6
 ; CHECK-NEXT:    ldr x8, [x0, #48]
 ; CHECK-NEXT:    ldr x9, [x1, #48]
 ; CHECK-NEXT:    rev x8, x8
 ; CHECK-NEXT:    rev x9, x9
 ; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB76_9
+; CHECK-NEXT:    b.ne .LBB78_9
 ; CHECK-NEXT:  // %bb.7: // %loadbb7
 ; CHECK-NEXT:    ldur x8, [x0, #55]
 ; CHECK-NEXT:    ldur x9, [x1, #55]
 ; CHECK-NEXT:    rev x8, x8
 ; CHECK-NEXT:    rev x9, x9
 ; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB76_9
+; CHECK-NEXT:    b.ne .LBB78_9
 ; CHECK-NEXT:  // %bb.8:
 ; CHECK-NEXT:    mov w8, wzr
-; CHECK-NEXT:    b .LBB76_10
-; CHECK-NEXT:  .LBB76_9: // %res_block
+; CHECK-NEXT:    b .LBB78_10
+; CHECK-NEXT:  .LBB78_9: // %res_block
 ; CHECK-NEXT:    cmp x8, x9
 ; CHECK-NEXT:    mov w8, #-1 // =0xffffffff
 ; CHECK-NEXT:    cneg w8, w8, hs
-; CHECK-NEXT:  .LBB76_10: // %endblock
+; CHECK-NEXT:  .LBB78_10: // %endblock
 ; CHECK-NEXT:    cmp w8, #0
 ; CHECK-NEXT:    cset w0, gt
 ; CHECK-NEXT:    ret
@@ -2076,60 +2101,60 @@ define i32 @length64(ptr %X, ptr %Y) nounwind {
 ; CHECK-NEXT:    rev x8, x8
 ; CHECK-NEXT:    rev x9, x9
 ; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB78_9
+; CHECK-NEXT:    b.ne .LBB80_9
 ; CHECK-NEXT:  // %bb.1: // %loadbb1
 ; CHECK-NEXT:    ldr x8, [x0, #8]
 ; CHECK-NEXT:    ldr x9, [x1, #8]
 ; CHECK-NEXT:    rev x8, x8
 ; CHECK-NEXT:    rev x9, x9
 ; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB78_9
+; CHECK-NEXT:    b.ne .LBB80_9
 ; CHECK-NEXT:  // %bb.2: // %loadbb2
 ; CHECK-NEXT:    ldr x8, [x0, #16]
 ; CHECK-NEXT:    ldr x9, [x1, #16]
 ; CHECK-NEXT:    rev x8, x8
 ; CHECK-NEXT:    rev x9, x9
 ; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB78_9
+; CHECK-NEXT:    b.ne .LBB80_9
 ; CHECK-NEXT:  // %bb.3: // %loadbb3
 ; CHECK-NEXT:    ldr x8, [x0, #24]
 ; CHECK-NEXT:    ldr x9, [x1, #24]
 ; CHECK-NEXT:    rev x8, x8
 ; CHECK-NEXT:    rev x9, x9
 ; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB78_9
+; CHECK-NEXT:    b.ne .LBB80_9
 ; CHECK-NEXT:  // %bb.4: // %loadbb4
 ; CHECK-NEXT:    ldr x8, [x0, #32]
 ; CHECK-NEXT:    ldr x9, [x1, #32]
 ; CHECK-NEXT:    rev x8, x8
 ; CHECK-NEXT:    rev x9, x9
 ; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB78_9
+; CHECK-NEXT:    b.ne .LBB80_9
 ; CHECK-NEXT:  // %bb.5: // %loadbb5
 ; CHECK-NEXT:    ldr x8, [x0, #40]
 ; CHECK-NEXT:    ldr x9, [x1, #40]
 ; CHECK-NEXT:    rev x8, x8
 ; CHECK-NEXT:    rev x9, x9
 ; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB78_9
+; CHECK-NEXT:    b.ne .LBB80_9
 ; CHECK-NEXT:  // %bb.6: // %loadbb6
 ; CHECK-NEXT:    ldr x8, [x0, #48]
 ; CHECK-NEXT:    ldr x9, [x1, #48]
 ; CHECK-NEXT:    rev x8, x8
 ; CHECK-NEXT:    rev x9, x9
 ; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB78_9
+; CHECK-NEXT:    b.ne .LBB80_9
 ; CHECK-NEXT:  // %bb.7: // %loadbb7
 ; CHECK-NEXT:    ldr x8, [x0, #56]
 ; CHECK-NEXT:    ldr x9, [x1, #56]
 ; CHECK-NEXT:    rev x8, x8
 ; CHECK-NEXT:    rev x9, x9
 ; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB78_9
+; CHECK-NEXT:    b.ne .LBB80_9
 ; CHECK-NEXT:  // %bb.8:
 ; CHECK-NEXT:    mov w0, wzr
 ; CHECK-NEXT:    ret
-; CHECK-NEXT:  .LBB78_9: // %res_block
+; CHECK-NEXT:  .LBB80_9: // %res_block
 ; CHECK-NEXT:    cmp x8, x9
 ; CHECK-NEXT:    mov w8, #-1 // =0xffffffff
 ; CHECK-NEXT:    cneg w0, w8, hs
@@ -2172,60 +2197,60 @@ define i1 @length64_lt(ptr %x, ptr %y) nounwind {
 ; CHECK-NEXT:    rev x8, x8
 ; CHECK-NEXT:    rev x9, x9
 ; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB80_9
+; CHECK-NEXT:    b.ne .LBB82_9
 ; CHECK-NEXT:  // %bb.1: // %loadbb1
 ; CHECK-NEXT:    ldr x8, [x0, #8]
 ; CHECK-NEXT:    ldr x9, [x1, #8]
 ; CHECK-NEXT:    rev x8, x8
 ; CHECK-NEXT:    rev x9, x9
 ; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB80_9
+; CHECK-NEXT:    b.ne .LBB82_9
 ; CHECK-NEXT:  // %bb.2: // %loadbb2
 ; CHECK-NEXT:    ldr x8, [x0, #16]
 ; CHECK-NEXT:    ldr x9, [x1, #16]
 ; CHECK-NEXT:    rev x8, x8
 ; CHECK-NEXT:    rev x9, x9
 ; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB80_9
+; CHECK-NEXT:    b.ne .LBB82_9
 ; CHECK-NEXT:  // %bb.3: // %loadbb3
 ; CHECK-NEXT:    ldr x8, [x0, #24]
 ; CHECK-NEXT:    ldr x9, [x1, #24]
 ; CHECK-NEXT:    rev x8, x8
 ; CHECK-NEXT:    rev x9, x9
 ; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB80_9
+; CHECK-NEXT:    b.ne .LBB82_9
 ; CHECK-NEXT:  // %bb.4: // %loadbb4
 ; CHECK-NEXT:    ldr x8, [x0, #32]
 ; CHECK-NEXT:    ldr x9, [x1, #32]
 ; CHECK-NEXT:    rev x8, x8
 ; CHECK-NEXT:    rev x9, x9
 ; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB80_9
+; CHECK-NEXT:    b.ne .LBB82_9
 ; CHECK-NEXT:  // %bb.5: // %loadbb5
 ; CHECK-NEXT:    ldr x8, [x0, #40]
 ; CHECK-NEXT:    ldr x9, [x1, #40]
 ; CHECK-NEXT:    rev x8, x8
 ; CHECK-NEXT:    rev x9, x9
 ; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB80_9
+; CHECK-NEXT:    b.ne .LBB82_9
 ; CHECK-NEXT:  // %bb.6: // %loadbb6
 ; CHECK-NEXT:    ldr x8, [x0, #48]
 ; CHECK-NEXT:    ldr x9, [x1, #48]
 ; CHECK-NEXT:    rev x8, x8
 ; CHECK-NEXT:    rev x9, x9
 ; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB80_9
+; CHECK-NEXT:    b.ne .LBB82_9
 ; CHECK-NEXT:  // %bb.7: // %loadbb7
 ; CHECK-NEXT:    ldr x8, [x0, #56]
 ; CHECK-NEXT:    ldr x9, [x1, #56]
 ; CHECK-NEXT:    rev x8, x8
 ; CHECK-NEXT:    rev x9, x9
 ; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB80_9
+; CHECK-NEXT:    b.ne .LBB82_9
 ; CHECK-NEXT:  // %bb.8:
 ; CHECK-NEXT:    lsr w0, wzr, #31
 ; CHECK-NEXT:    ret
-; CHECK-NEXT:  .LBB80_9: // %res_block
+; CHECK-NEXT:  .LBB82_9: // %res_block
 ; CHECK-NEXT:    cmp x8, x9
 ; CHECK-NEXT:    mov w8, #-1 // =0xffffffff
 ; CHECK-NEXT:    cneg w8, w8, hs
@@ -2244,64 +2269,64 @@ define i1 @length64_gt(ptr %x, ptr %y) nounwind {
 ; CHECK-NEXT:    rev x8, x8
 ; CHECK-NEXT:    rev x9, x9
 ; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB81_9
+; CHECK-NEXT:    b.ne .LBB83_9
 ; CHECK-NEXT:  // %bb.1: // %loadbb1
 ; CHECK-NEXT:    ldr x8, [x0, #8]
 ; CHECK-NEXT:    ldr x9, [x1, #8]
 ; CHECK-NEXT:    rev x8, x8
 ; CHECK-NEXT:    rev x9, x9
 ; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB81_9
+; CHECK-NEXT:    b.ne .LBB83_9
 ; CHECK-NEXT:  // %bb.2: // %loadbb2
 ; CHECK-NEXT:    ldr x8, [x0, #16]
 ; CHECK-NEXT:    ldr x9, [x1, #16]
 ; CHECK-NEXT:    rev x8, x8
 ; CHECK-NEXT:    rev x9, x9
 ; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB81_9
+; CHECK-NEXT:    b.ne .LBB83_9
 ; CHECK-NEXT:  // %bb.3: // %loadbb3
 ; CHECK-NEXT:    ldr x8, [x0, #24]
 ; CHECK-NEXT:    ldr x9, [x1, #24]
 ; CHECK-NEXT:    rev x8, x8
 ; CHECK-NEXT:    rev x9, x9
 ; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB81_9
+; CHECK-NEXT:    b.ne .LBB83_9
 ; CHECK-NEXT:  // %bb.4: // %loadbb4
 ; CHECK-NEXT:    ldr x8, [x0, #32]
 ; CHECK-NEXT:    ldr x9, [x1, #32]
 ; CHECK-NEXT:    rev x8, x8
 ; CHECK-NEXT:    rev x9, x9
 ; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB81_9
+; CHECK-NEXT:    b.ne .LBB83_9
 ; CHECK-NEXT:  // %bb.5: // %loadbb5
 ; CHECK-NEXT:    ldr x8, [x0, #40]
 ; CHECK-NEXT:    ldr x9, [x1, #40]
 ; CHECK-NEXT:    rev x8, x8
 ; CHECK-NEXT:    rev x9, x9
 ; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB81_9
+; CHECK-NEXT:    b.ne .LBB83_9
 ; CHECK-NEXT:  // %bb.6: // %loadbb6
 ; CHECK-NEXT:    ldr x8, [x0, #48]
 ; CHECK-NEXT:    ldr x9, [x1, #48]
 ; CHECK-NEXT:    rev x8, x8
 ; CHECK-NEXT:    rev x9, x9
 ; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB81_9
+; CHECK-NEXT:    b.ne .LBB83_9
 ; CHECK-NEXT:  // %bb.7: // %loadbb7
 ; CHECK-NEXT:    ldr x8, [x0, #56]
 ; CHECK-NEXT:    ldr x9, [x1, #56]
 ; CHECK-NEXT:    rev x8, x8
 ; CHECK-NEXT:    rev x9, x9
 ; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB81_9
+; CHECK-NEXT:    b.ne .LBB83_9
 ; CHECK-NEXT:  // %bb.8:
 ; CHECK-NEXT:    mov w8, wzr
-; CHECK-NEXT:    b .LBB81_10
-; CHECK-NEXT:  .LBB81_9: // %res_block
+; CHECK-NEXT:    b .LBB83_10
+; CHECK-NEXT:  .LBB83_9: // %res_block
 ; CHECK-NEXT:    cmp x8, x9
 ; CHECK-NEXT:    mov w8, #-1 // =0xffffffff
 ; CHECK-NEXT:    cneg w8, w8, hs
-; CHECK-NEXT:  .LBB81_10: // %endblock
+; CHECK-NEXT:  .LBB83_10: // %endblock
 ; CHECK-NEXT:    cmp w8, #0
 ; CHECK-NEXT:    cset w0, gt
 ; CHECK-NEXT:    ret
diff --git a/llvm/test/CodeGen/AArch64/neon-partial-reduce-dot-product.ll b/llvm/test/CodeGen/AArch64/neon-partial-reduce-dot-product.ll
index c1b9a4c..9ece9ed 100644
--- a/llvm/test/CodeGen/AArch64/neon-partial-reduce-dot-product.ll
+++ b/llvm/test/CodeGen/AArch64/neon-partial-reduce-dot-product.ll
@@ -367,6 +367,166 @@ entry:
   ret <4 x i64> %partial.reduce
 }
 
+define <4 x i32> @udot_no_bin_op(<4 x i32> %acc, <16 x i8> %a){
+; CHECK-DOT-LABEL: udot_no_bin_op:
+; CHECK-DOT:       // %bb.0:
+; CHECK-DOT-NEXT:    movi v2.16b, #1
+; CHECK-DOT-NEXT:    udot v0.4s, v1.16b, v2.16b
+; CHECK-DOT-NEXT:    ret
+;
+; CHECK-NODOT-LABEL: udot_no_bin_op:
+; CHECK-NODOT:       // %bb.0:
+; CHECK-NODOT-NEXT:    ushll v2.8h, v1.8b, #0
+; CHECK-NODOT-NEXT:    ushll2 v1.8h, v1.16b, #0
+; CHECK-NODOT-NEXT:    ushll v3.4s, v1.4h, #0
+; CHECK-NODOT-NEXT:    uaddw v0.4s, v0.4s, v2.4h
+; CHECK-NODOT-NEXT:    uaddw2 v2.4s, v3.4s, v2.8h
+; CHECK-NODOT-NEXT:    uaddw2 v0.4s, v0.4s, v1.8h
+; CHECK-NODOT-NEXT:    add v0.4s, v2.4s, v0.4s
+; CHECK-NODOT-NEXT:    ret
+  %a.wide = zext <16 x i8> %a to <16 x i32>
+  %partial.reduce = tail call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> %acc, <16 x i32> %a.wide)
+  ret <4 x i32> %partial.reduce
+}
+
+define <4 x i32> @sdot_no_bin_op(<4 x i32> %acc, <16 x i8> %a){
+; CHECK-DOT-LABEL: sdot_no_bin_op:
+; CHECK-DOT:       // %bb.0:
+; CHECK-DOT-NEXT:    movi v2.16b, #1
+; CHECK-DOT-NEXT:    sdot v0.4s, v1.16b, v2.16b
+; CHECK-DOT-NEXT:    ret
+;
+; CHECK-NODOT-LABEL: sdot_no_bin_op:
+; CHECK-NODOT:       // %bb.0:
+; CHECK-NODOT-NEXT:    sshll v2.8h, v1.8b, #0
+; CHECK-NODOT-NEXT:    sshll2 v1.8h, v1.16b, #0
+; CHECK-NODOT-NEXT:    sshll v3.4s, v1.4h, #0
+; CHECK-NODOT-NEXT:    saddw v0.4s, v0.4s, v2.4h
+; CHECK-NODOT-NEXT:    saddw2 v2.4s, v3.4s, v2.8h
+; CHECK-NODOT-NEXT:    saddw2 v0.4s, v0.4s, v1.8h
+; CHECK-NODOT-NEXT:    add v0.4s, v2.4s, v0.4s
+; CHECK-NODOT-NEXT:    ret
+  %a.wide = sext <16 x i8> %a to <16 x i32>
+  %partial.reduce = tail call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> %acc, <16 x i32> %a.wide)
+  ret <4 x i32> %partial.reduce
+}
+
+define <2 x i32> @udot_no_bin_op_narrow(<2 x i32> %acc, <8 x i8> %a){
+; CHECK-DOT-LABEL: udot_no_bin_op_narrow:
+; CHECK-DOT:       // %bb.0:
+; CHECK-DOT-NEXT:    movi v2.8b, #1
+; CHECK-DOT-NEXT:    udot v0.2s, v1.8b, v2.8b
+; CHECK-DOT-NEXT:    ret
+;
+; CHECK-NODOT-LABEL: udot_no_bin_op_narrow:
+; CHECK-NODOT:       // %bb.0:
+; CHECK-NODOT-NEXT:    ushll v1.8h, v1.8b, #0
+; CHECK-NODOT-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NODOT-NEXT:    ushll v2.4s, v1.4h, #0
+; CHECK-NODOT-NEXT:    ushll2 v3.4s, v1.8h, #0
+; CHECK-NODOT-NEXT:    ext v4.16b, v1.16b, v1.16b, #8
+; CHECK-NODOT-NEXT:    uaddw v0.4s, v0.4s, v1.4h
+; CHECK-NODOT-NEXT:    ext v3.16b, v3.16b, v3.16b, #8
+; CHECK-NODOT-NEXT:    ext v2.16b, v2.16b, v2.16b, #8
+; CHECK-NODOT-NEXT:    add v0.2s, v3.2s, v0.2s
+; CHECK-NODOT-NEXT:    uaddw v1.4s, v2.4s, v4.4h
+; CHECK-NODOT-NEXT:    add v0.2s, v1.2s, v0.2s
+; CHECK-NODOT-NEXT:    ret
+  %a.wide = zext <8 x i8> %a to <8 x i32>
+  %partial.reduce = tail call <2 x i32> @llvm.experimental.vector.partial.reduce.add.v2i32.v8i32(<2 x i32> %acc, <8 x i32> %a.wide)
+  ret <2 x i32> %partial.reduce
+}
+
+define <2 x i32> @sdot_no_bin_op_narrow(<2 x i32> %acc, <8 x i8> %a){
+; CHECK-DOT-LABEL: sdot_no_bin_op_narrow:
+; CHECK-DOT:       // %bb.0:
+; CHECK-DOT-NEXT:    movi v2.8b, #1
+; CHECK-DOT-NEXT:    sdot v0.2s, v1.8b, v2.8b
+; CHECK-DOT-NEXT:    ret
+;
+; CHECK-NODOT-LABEL: sdot_no_bin_op_narrow:
+; CHECK-NODOT:       // %bb.0:
+; CHECK-NODOT-NEXT:    sshll v1.8h, v1.8b, #0
+; CHECK-NODOT-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NODOT-NEXT:    sshll v2.4s, v1.4h, #0
+; CHECK-NODOT-NEXT:    sshll2 v3.4s, v1.8h, #0
+; CHECK-NODOT-NEXT:    ext v4.16b, v1.16b, v1.16b, #8
+; CHECK-NODOT-NEXT:    saddw v0.4s, v0.4s, v1.4h
+; CHECK-NODOT-NEXT:    ext v3.16b, v3.16b, v3.16b, #8
+; CHECK-NODOT-NEXT:    ext v2.16b, v2.16b, v2.16b, #8
+; CHECK-NODOT-NEXT:    add v0.2s, v3.2s, v0.2s
+; CHECK-NODOT-NEXT:    saddw v1.4s, v2.4s, v4.4h
+; CHECK-NODOT-NEXT:    add v0.2s, v1.2s, v0.2s
+; CHECK-NODOT-NEXT:    ret
+  %a.wide = sext <8 x i8> %a to <8 x i32>
+  %partial.reduce = tail call <2 x i32> @llvm.experimental.vector.partial.reduce.add.v2i32.v8i32(<2 x i32> %acc, <8 x i32> %a.wide)
+  ret <2 x i32> %partial.reduce
+}
+
+define <4 x i64> @udot_no_bin_op_8to64(<4 x i64> %acc, <16 x i8> %a){
+; CHECK-DOT-LABEL: udot_no_bin_op_8to64:
+; CHECK-DOT:       // %bb.0:
+; CHECK-DOT-NEXT:    movi v3.16b, #1
+; CHECK-DOT-NEXT:    movi v4.2d, #0000000000000000
+; CHECK-DOT-NEXT:    udot v4.4s, v2.16b, v3.16b
+; CHECK-DOT-NEXT:    saddw2 v1.2d, v1.2d, v4.4s
+; CHECK-DOT-NEXT:    saddw v0.2d, v0.2d, v4.2s
+; CHECK-DOT-NEXT:    ret
+;
+; CHECK-NODOT-LABEL: udot_no_bin_op_8to64:
+; CHECK-NODOT:       // %bb.0:
+; CHECK-NODOT-NEXT:    ushll v3.8h, v2.8b, #0
+; CHECK-NODOT-NEXT:    ushll2 v2.8h, v2.16b, #0
+; CHECK-NODOT-NEXT:    ushll v4.4s, v3.4h, #0
+; CHECK-NODOT-NEXT:    ushll v5.4s, v2.4h, #0
+; CHECK-NODOT-NEXT:    ushll2 v3.4s, v3.8h, #0
+; CHECK-NODOT-NEXT:    ushll2 v2.4s, v2.8h, #0
+; CHECK-NODOT-NEXT:    uaddw2 v1.2d, v1.2d, v4.4s
+; CHECK-NODOT-NEXT:    uaddw v0.2d, v0.2d, v4.2s
+; CHECK-NODOT-NEXT:    uaddl2 v4.2d, v3.4s, v5.4s
+; CHECK-NODOT-NEXT:    uaddl v3.2d, v3.2s, v5.2s
+; CHECK-NODOT-NEXT:    uaddw2 v1.2d, v1.2d, v2.4s
+; CHECK-NODOT-NEXT:    uaddw v0.2d, v0.2d, v2.2s
+; CHECK-NODOT-NEXT:    add v1.2d, v4.2d, v1.2d
+; CHECK-NODOT-NEXT:    add v0.2d, v3.2d, v0.2d
+; CHECK-NODOT-NEXT:    ret
+  %a.wide = zext <16 x i8> %a to <16 x i64>
+  %partial.reduce = tail call <4 x i64> @llvm.experimental.vector.partial.reduce.add.v4i64.v16i64(<4 x i64> %acc, <16 x i64> %a.wide)
+  ret <4 x i64> %partial.reduce
+}
+
+define <4 x i64> @sdot_no_bin_op_8to64(<4 x i64> %acc, <16 x i8> %a){
+; CHECK-DOT-LABEL: sdot_no_bin_op_8to64:
+; CHECK-DOT:       // %bb.0:
+; CHECK-DOT-NEXT:    movi v3.16b, #1
+; CHECK-DOT-NEXT:    movi v4.2d, #0000000000000000
+; CHECK-DOT-NEXT:    sdot v4.4s, v2.16b, v3.16b
+; CHECK-DOT-NEXT:    saddw2 v1.2d, v1.2d, v4.4s
+; CHECK-DOT-NEXT:    saddw v0.2d, v0.2d, v4.2s
+; CHECK-DOT-NEXT:    ret
+;
+; CHECK-NODOT-LABEL: sdot_no_bin_op_8to64:
+; CHECK-NODOT:       // %bb.0:
+; CHECK-NODOT-NEXT:    sshll v3.8h, v2.8b, #0
+; CHECK-NODOT-NEXT:    sshll2 v2.8h, v2.16b, #0
+; CHECK-NODOT-NEXT:    sshll v4.4s, v3.4h, #0
+; CHECK-NODOT-NEXT:    sshll v5.4s, v2.4h, #0
+; CHECK-NODOT-NEXT:    sshll2 v3.4s, v3.8h, #0
+; CHECK-NODOT-NEXT:    sshll2 v2.4s, v2.8h, #0
+; CHECK-NODOT-NEXT:    saddw2 v1.2d, v1.2d, v4.4s
+; CHECK-NODOT-NEXT:    saddw v0.2d, v0.2d, v4.2s
+; CHECK-NODOT-NEXT:    saddl2 v4.2d, v3.4s, v5.4s
+; CHECK-NODOT-NEXT:    saddl v3.2d, v3.2s, v5.2s
+; CHECK-NODOT-NEXT:    saddw2 v1.2d, v1.2d, v2.4s
+; CHECK-NODOT-NEXT:    saddw v0.2d, v0.2d, v2.2s
+; CHECK-NODOT-NEXT:    add v1.2d, v4.2d, v1.2d
+; CHECK-NODOT-NEXT:    add v0.2d, v3.2d, v0.2d
+; CHECK-NODOT-NEXT:    ret
+  %a.wide = sext <16 x i8> %a to <16 x i64>
+  %partial.reduce = tail call <4 x i64> @llvm.experimental.vector.partial.reduce.add.v4i64.v16i64(<4 x i64> %acc, <16 x i64> %a.wide)
+  ret <4 x i64> %partial.reduce
+}
+
 define <4 x i32> @not_udot(<4 x i32> %acc, <8 x i8> %u, <8 x i8> %s) #0{
 ; CHECK-LABEL: not_udot:
 ; CHECK:       // %bb.0:
@@ -398,3 +558,91 @@ define <2 x i32> @not_udot_narrow(<2 x i32> %acc, <4 x i8> %u, <4 x i8> %s) {
   %partial.reduce = tail call <2 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<2 x i32> %acc, <4 x i32> %mult)
   ret <2 x i32> %partial.reduce
 }
+
+define <2 x i64> @udot_different_types(<2 x i64> %acc, <8 x i16> %a, <8 x i8> %b){
+; CHECK-LABEL: udot_different_types:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ushll v2.8h, v2.8b, #0
+; CHECK-NEXT:    ushll v3.4s, v1.4h, #0
+; CHECK-NEXT:    ushll2 v1.4s, v1.8h, #0
+; CHECK-NEXT:    ushll v4.4s, v2.4h, #0
+; CHECK-NEXT:    ushll2 v2.4s, v2.8h, #0
+; CHECK-NEXT:    umull v5.2d, v1.2s, v2.2s
+; CHECK-NEXT:    umlal v0.2d, v3.2s, v4.2s
+; CHECK-NEXT:    umlal2 v0.2d, v1.4s, v2.4s
+; CHECK-NEXT:    umlal2 v5.2d, v3.4s, v4.4s
+; CHECK-NEXT:    add v0.2d, v5.2d, v0.2d
+; CHECK-NEXT:    ret
+entry:
+  %a.wide = zext <8 x i16> %a to <8 x i64>
+  %b.wide = zext <8 x i8> %b to <8 x i64>
+  %mult = mul nuw nsw <8 x i64> %a.wide, %b.wide
+  %partial.reduce = tail call <2 x i64> @llvm.experimental.vector.partial.reduce.add.v2i64.v8i64(<2 x i64> %acc, <8 x i64> %mult)
+  ret <2 x i64> %partial.reduce
+}
+
+define <2 x i64> @sdot_different_types(<2 x i64> %acc, <8 x i16> %a, <8 x i8> %b){
+; CHECK-LABEL: sdot_different_types:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    sshll v2.8h, v2.8b, #0
+; CHECK-NEXT:    sshll v3.4s, v1.4h, #0
+; CHECK-NEXT:    sshll2 v1.4s, v1.8h, #0
+; CHECK-NEXT:    sshll v4.4s, v2.4h, #0
+; CHECK-NEXT:    sshll2 v2.4s, v2.8h, #0
+; CHECK-NEXT:    smull v5.2d, v1.2s, v2.2s
+; CHECK-NEXT:    smlal v0.2d, v3.2s, v4.2s
+; CHECK-NEXT:    smlal2 v0.2d, v1.4s, v2.4s
+; CHECK-NEXT:    smlal2 v5.2d, v3.4s, v4.4s
+; CHECK-NEXT:    add v0.2d, v5.2d, v0.2d
+; CHECK-NEXT:    ret
+entry:
+  %a.wide = sext <8 x i16> %a to <8 x i64>
+  %b.wide = sext <8 x i8> %b to <8 x i64>
+  %mult = mul nuw nsw <8 x i64> %a.wide, %b.wide
+  %partial.reduce = tail call <2 x i64> @llvm.experimental.vector.partial.reduce.add.v2i64.v8i64(<2 x i64> %acc, <8 x i64> %mult)
+  ret <2 x i64> %partial.reduce
+}
+
+define <2 x i64> @usdot_different_types(<2 x i64> %acc, <8 x i16> %a, <8 x i8> %b){
+; CHECK-LABEL: usdot_different_types:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    sshll v2.8h, v2.8b, #0
+; CHECK-NEXT:    ushll v3.4s, v1.4h, #0
+; CHECK-NEXT:    ushll2 v1.4s, v1.8h, #0
+; CHECK-NEXT:    sshll v4.4s, v2.4h, #0
+; CHECK-NEXT:    sshll2 v2.4s, v2.8h, #0
+; CHECK-NEXT:    smull v5.2d, v1.2s, v2.2s
+; CHECK-NEXT:    smlal v0.2d, v3.2s, v4.2s
+; CHECK-NEXT:    smlal2 v0.2d, v1.4s, v2.4s
+; CHECK-NEXT:    smlal2 v5.2d, v3.4s, v4.4s
+; CHECK-NEXT:    add v0.2d, v5.2d, v0.2d
+; CHECK-NEXT:    ret
+entry:
+  %a.wide = zext <8 x i16> %a to <8 x i64>
+  %b.wide = sext <8 x i8> %b to <8 x i64>
+  %mult = mul nuw nsw <8 x i64> %a.wide, %b.wide
+  %partial.reduce = tail call <2 x i64> @llvm.experimental.vector.partial.reduce.add.v2i64.v8i64(<2 x i64> %acc, <8 x i64> %mult)
+  ret <2 x i64> %partial.reduce
+}
+
+define <2 x i64> @sudot_different_types(<2 x i64> %acc, <8 x i16> %a, <8 x i8> %b){
+; CHECK-LABEL: sudot_different_types:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ushll v2.8h, v2.8b, #0
+; CHECK-NEXT:    sshll v3.4s, v1.4h, #0
+; CHECK-NEXT:    sshll2 v1.4s, v1.8h, #0
+; CHECK-NEXT:    ushll v4.4s, v2.4h, #0
+; CHECK-NEXT:    ushll2 v2.4s, v2.8h, #0
+; CHECK-NEXT:    smull v5.2d, v1.2s, v2.2s
+; CHECK-NEXT:    smlal v0.2d, v3.2s, v4.2s
+; CHECK-NEXT:    smlal2 v0.2d, v1.4s, v2.4s
+; CHECK-NEXT:    smlal2 v5.2d, v3.4s, v4.4s
+; CHECK-NEXT:    add v0.2d, v5.2d, v0.2d
+; CHECK-NEXT:    ret
+entry:
+  %a.wide = sext <8 x i16> %a to <8 x i64>
+  %b.wide = zext <8 x i8> %b to <8 x i64>
+  %mult = mul nuw nsw <8 x i64> %a.wide, %b.wide
+  %partial.reduce = tail call <2 x i64> @llvm.experimental.vector.partial.reduce.add.v2i64.v8i64(<2 x i64> %acc, <8 x i64> %mult)
+  ret <2 x i64> %partial.reduce
+}
diff --git a/llvm/test/CodeGen/AArch64/neon-reverseshuffle.ll b/llvm/test/CodeGen/AArch64/neon-reverseshuffle.ll
index abdfb99..db5b932 100644
--- a/llvm/test/CodeGen/AArch64/neon-reverseshuffle.ll
+++ b/llvm/test/CodeGen/AArch64/neon-reverseshuffle.ll
@@ -23,19 +23,11 @@ entry:
 }
 
 define <4 x i32> @v4i32(<4 x i32> %a) {
-; CHECK-SD-LABEL: v4i32:
-; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    rev64 v0.4s, v0.4s
-; CHECK-SD-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: v4i32:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    adrp x8, .LCPI2_0
-; CHECK-GI-NEXT:    // kill: def $q0 killed $q0 def $q0_q1
-; CHECK-GI-NEXT:    ldr q2, [x8, :lo12:.LCPI2_0]
-; CHECK-GI-NEXT:    tbl v0.16b, { v0.16b, v1.16b }, v2.16b
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: v4i32:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    rev64 v0.4s, v0.4s
+; CHECK-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
+; CHECK-NEXT:    ret
 entry:
   %V128 = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
   ret <4 x i32> %V128
@@ -52,19 +44,11 @@ entry:
 }
 
 define <8 x i16> @v8i16(<8 x i16> %a) {
-; CHECK-SD-LABEL: v8i16:
-; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    rev64 v0.8h, v0.8h
-; CHECK-SD-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: v8i16:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    adrp x8, .LCPI4_0
-; CHECK-GI-NEXT:    // kill: def $q0 killed $q0 def $q0_q1
-; CHECK-GI-NEXT:    ldr q2, [x8, :lo12:.LCPI4_0]
-; CHECK-GI-NEXT:    tbl v0.16b, { v0.16b, v1.16b }, v2.16b
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: v8i16:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    rev64 v0.8h, v0.8h
+; CHECK-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
+; CHECK-NEXT:    ret
 entry:
   %V128 = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
   ret <8 x i16> %V128
@@ -93,6 +77,22 @@ entry:
   ret <8 x i16> %V128
 }
 
+define <4 x i16> @v8i16_3(<8 x i16> %a) {
+; CHECK-SD-LABEL: v8i16_3:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    rev64 v0.4h, v0.4h
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: v8i16_3:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    rev64 v0.8h, v0.8h
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-GI-NEXT:    ret
+entry:
+  %V128 = shufflevector <8 x i16> %a, <8 x i16> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+  ret <4 x i16> %V128
+}
+
 define <4 x i16> @v4i16(<4 x i16> %a) {
 ; CHECK-LABEL: v4i16:
 ; CHECK:       // %bb.0: // %entry
@@ -104,19 +104,11 @@ entry:
 }
 
 define <16 x i8> @v16i8(<16 x i8> %a) {
-; CHECK-SD-LABEL: v16i8:
-; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    rev64 v0.16b, v0.16b
-; CHECK-SD-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: v16i8:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    adrp x8, .LCPI7_0
-; CHECK-GI-NEXT:    // kill: def $q0 killed $q0 def $q0_q1
-; CHECK-GI-NEXT:    ldr q2, [x8, :lo12:.LCPI7_0]
-; CHECK-GI-NEXT:    tbl v0.16b, { v0.16b, v1.16b }, v2.16b
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: v16i8:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    rev64 v0.16b, v0.16b
+; CHECK-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
+; CHECK-NEXT:    ret
 entry:
   %V128 = shufflevector <16 x i8> %a, <16 x i8> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
   ret <16 x i8> %V128
@@ -125,18 +117,18 @@ entry:
 define <16 x i8> @v16i8_2(<8 x i8> %a, <8 x i8> %b) {
 ; CHECK-SD-LABEL: v16i8_2:
 ; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    adrp x8, .LCPI8_0
+; CHECK-SD-NEXT:    adrp x8, .LCPI9_0
 ; CHECK-SD-NEXT:    // kill: def $d1 killed $d1 killed $q0_q1 def $q0_q1
-; CHECK-SD-NEXT:    ldr q2, [x8, :lo12:.LCPI8_0]
+; CHECK-SD-NEXT:    ldr q2, [x8, :lo12:.LCPI9_0]
 ; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 killed $q0_q1 def $q0_q1
 ; CHECK-SD-NEXT:    tbl v0.16b, { v0.16b, v1.16b }, v2.16b
 ; CHECK-SD-NEXT:    ret
 ;
 ; CHECK-GI-LABEL: v16i8_2:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    adrp x8, .LCPI8_0
+; CHECK-GI-NEXT:    adrp x8, .LCPI9_0
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0_q1 def $q0_q1
-; CHECK-GI-NEXT:    ldr q2, [x8, :lo12:.LCPI8_0]
+; CHECK-GI-NEXT:    ldr q2, [x8, :lo12:.LCPI9_0]
 ; CHECK-GI-NEXT:    // kill: def $d1 killed $d1 killed $q0_q1 def $q0_q1
 ; CHECK-GI-NEXT:    tbl v0.16b, { v0.16b, v1.16b }, v2.16b
 ; CHECK-GI-NEXT:    ret
@@ -166,19 +158,11 @@ entry:
 }
 
 define <4 x float> @v4f32(<4 x float> %a) {
-; CHECK-SD-LABEL: v4f32:
-; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    rev64 v0.4s, v0.4s
-; CHECK-SD-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: v4f32:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    adrp x8, .LCPI11_0
-; CHECK-GI-NEXT:    // kill: def $q0 killed $q0 def $q0_q1
-; CHECK-GI-NEXT:    ldr q2, [x8, :lo12:.LCPI11_0]
-; CHECK-GI-NEXT:    tbl v0.16b, { v0.16b, v1.16b }, v2.16b
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: v4f32:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    rev64 v0.4s, v0.4s
+; CHECK-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
+; CHECK-NEXT:    ret
 entry:
   %V128 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
   ret <4 x float> %V128
@@ -195,19 +179,11 @@ entry:
 }
 
 define <8 x half> @v8f16(<8 x half> %a) {
-; CHECK-SD-LABEL: v8f16:
-; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    rev64 v0.8h, v0.8h
-; CHECK-SD-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: v8f16:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    adrp x8, .LCPI13_0
-; CHECK-GI-NEXT:    // kill: def $q0 killed $q0 def $q0_q1
-; CHECK-GI-NEXT:    ldr q2, [x8, :lo12:.LCPI13_0]
-; CHECK-GI-NEXT:    tbl v0.16b, { v0.16b, v1.16b }, v2.16b
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: v8f16:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    rev64 v0.8h, v0.8h
+; CHECK-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
+; CHECK-NEXT:    ret
 entry:
   %V128 = shufflevector <8 x half> %a, <8 x half> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
   ret <8 x half> %V128
diff --git a/llvm/test/CodeGen/AArch64/pacbti-llvm-generated-funcs-2.ll b/llvm/test/CodeGen/AArch64/pacbti-llvm-generated-funcs-2.ll
index 373c496..f823d2a 100644
--- a/llvm/test/CodeGen/AArch64/pacbti-llvm-generated-funcs-2.ll
+++ b/llvm/test/CodeGen/AArch64/pacbti-llvm-generated-funcs-2.ll
@@ -34,8 +34,8 @@ entry:
 }
 ;; CHECK-LABEL: __llvm_gcov_writeout:
 ;; CHECK:       .cfi_b_key_frame
-;; CHECK-NEXT:  pacibsp
 ;; CHECK-NEXT:  .cfi_negate_ra_state
+;; CHECK-NEXT:  pacibsp
 
 define internal void @__llvm_gcov_reset() unnamed_addr #2 {
 entry:
@@ -54,9 +54,9 @@ entry:
 }
 ;; CHECK-LABEL: __llvm_gcov_init:
 ;; CHECK:      .cfi_b_key_frame
-;; CHECK-NEXT:  pacibsp
 ;; CHECK-NEXT:  .cfi_negate_ra_state
 ;; CHECK-NOT:   .cfi_
+;; CHECK-NEXT:  pacibsp
 ;; CHECK:       .cfi_endproc
 
 attributes #0 = { norecurse nounwind readnone "sign-return-address"="all" "sign-return-address-key"="b_key" }
diff --git a/llvm/test/CodeGen/AArch64/ptrauth-arm64-tls-dynamics.ll b/llvm/test/CodeGen/AArch64/ptrauth-arm64-tls-dynamics.ll
new file mode 100644
index 0000000..89731e6
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/ptrauth-arm64-tls-dynamics.ll
@@ -0,0 +1,114 @@
+; RUN: llc -mtriple=aarch64-unknown-linux-gnu -mattr=+pauth -relocation-model=pic \
+; RUN:   -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-unknown-linux-gnu -mattr=+pauth -relocation-model=pic \
+; RUN:   -filetype=obj < %s | llvm-readelf -r -s - | FileCheck --check-prefix=CHECK-OBJ %s
+; RUN: not --crash llc -mtriple=aarch64-unknown-linux-gnu -mattr=+pauth -relocation-model=pic \
+; RUN:   -global-isel=1 < %s 2>&1 | FileCheck --check-prefix=CHECK-ERR %s
+
+@general_dynamic_var = external thread_local global i32
+
+define i32 @test_generaldynamic() {
+; CHECK-LABEL: test_generaldynamic:
+
+  %val = load i32, ptr @general_dynamic_var
+  ret i32 %val
+
+; CHECK: adrp x[[TLSDESC_HI:[0-9]+]], :tlsdesc_auth:general_dynamic_var
+; CHECK-NEXT: ldr x16, [x[[TLSDESC_HI]], :tlsdesc_auth_lo12:general_dynamic_var]
+; CHECK-NEXT: add x0, x[[TLSDESC_HI]], :tlsdesc_auth_lo12:general_dynamic_var
+; CHECK-NEXT: blraa x16, x0
+; CHECK-NEXT: mrs x[[TPIDR:[0-9]+]], TPIDR_EL0
+; CHECK-NEXT: ldr w0, [x[[TPIDR]], x0]
+
+; CHECK-OBJ: R_AARCH64_AUTH_TLSDESC_ADR_PAGE21
+; CHECK-OBJ: R_AARCH64_AUTH_TLSDESC_LD64_LO12
+; CHECK-OBJ: R_AARCH64_AUTH_TLSDESC_ADD_LO12
+; CHECK-OBJ-NOT: R_AARCH64_TLSDESC_CALL
+
+; CHECK-ERR: LLVM ERROR: cannot select: %1:gpr64sp(p0) = G_GLOBAL_VALUE @general_dynamic_var (in function: test_generaldynamic)
+}
+
+define ptr @test_generaldynamic_addr() {
+; CHECK-LABEL: test_generaldynamic_addr:
+
+  ret ptr @general_dynamic_var
+
+; CHECK: adrp x[[TLSDESC_HI:[0-9]+]], :tlsdesc_auth:general_dynamic_var
+; CHECK-NEXT: ldr x16, [x[[TLSDESC_HI]], :tlsdesc_auth_lo12:general_dynamic_var]
+; CHECK-NEXT: add x0, x[[TLSDESC_HI]], :tlsdesc_auth_lo12:general_dynamic_var
+; CHECK-NEXT: blraa x16, x0
+; CHECK-NEXT: mrs [[TP:x[0-9]+]], TPIDR_EL0
+; CHECK-NEXT: add x0, [[TP]], x0
+
+; CHECK-OBJ: R_AARCH64_AUTH_TLSDESC_ADR_PAGE21
+; CHECK-OBJ: R_AARCH64_AUTH_TLSDESC_LD64_LO12
+; CHECK-OBJ: R_AARCH64_AUTH_TLSDESC_ADD_LO12
+; CHECK-OBJ-NOT: R_AARCH64_TLSDESC_CALL
+}
+
+;; Note: with signed TLSDESC, general dynamic model is always used,
+;; even when local dynamic is requested.
+
+@local_dynamic_var = external thread_local(localdynamic) global i32
+
+define i32 @test_localdynamic() {
+; CHECK-LABEL: test_localdynamic:
+
+  %val = load i32, ptr @local_dynamic_var
+  ret i32 %val
+
+; CHECK: adrp x[[TLSDESC_HI:[0-9]+]], :tlsdesc_auth:local_dynamic_var
+; CHECK-NEXT: ldr x16, [x[[TLSDESC_HI]], :tlsdesc_auth_lo12:local_dynamic_var]
+; CHECK-NEXT: add x0, x[[TLSDESC_HI]], :tlsdesc_auth_lo12:local_dynamic_var
+; CHECK-NEXT: blraa x16, x0
+; CHECK-NEXT: mrs x[[TPIDR:[0-9]+]], TPIDR_EL0
+; CHECK-NEXT: ldr w0, [x[[TPIDR]], x0]
+
+; CHECK-OBJ: R_AARCH64_AUTH_TLSDESC_ADR_PAGE21
+; CHECK-OBJ: R_AARCH64_AUTH_TLSDESC_LD64_LO12
+; CHECK-OBJ: R_AARCH64_AUTH_TLSDESC_ADD_LO12
+; CHECK-OBJ-NOT: R_AARCH64_TLSDESC_CALL
+}
+
+define ptr @test_localdynamic_addr() {
+; CHECK-LABEL: test_localdynamic_addr:
+
+  ret ptr @local_dynamic_var
+
+; CHECK: adrp x[[TLSDESC_HI:[0-9]+]], :tlsdesc_auth:local_dynamic_var
+; CHECK-NEXT: ldr x16, [x[[TLSDESC_HI]], :tlsdesc_auth_lo12:local_dynamic_var]
+; CHECK-NEXT: add x0, x[[TLSDESC_HI]], :tlsdesc_auth_lo12:local_dynamic_var
+; CHECK-NEXT: blraa x16, x0
+; CHECK-NEXT: mrs x[[TPIDR:[0-9]+]], TPIDR_EL0
+; CHECK-NEXT: add x0, x[[TPIDR]], x0
+
+; CHECK-OBJ: R_AARCH64_AUTH_TLSDESC_ADR_PAGE21
+; CHECK-OBJ: R_AARCH64_AUTH_TLSDESC_LD64_LO12
+; CHECK-OBJ: R_AARCH64_AUTH_TLSDESC_ADD_LO12
+; CHECK-OBJ-NOT: R_AARCH64_TLSDESC_CALL
+}
+
+@extern_weak_var = extern_weak thread_local global i32
+
+define i32 @test_extern_weak() {
+; CHECK-LABEL: test_extern_weak:
+
+  %val = load i32, ptr @extern_weak_var
+  ret i32 %val
+
+; CHECK: adrp x[[TLSDESC_HI:[0-9]+]], :tlsdesc_auth:extern_weak_var
+; CHECK-NEXT: ldr x16, [x[[TLSDESC_HI]], :tlsdesc_auth_lo12:extern_weak_var]
+; CHECK-NEXT: add x0, x[[TLSDESC_HI]], :tlsdesc_auth_lo12:extern_weak_var
+; CHECK-NEXT: blraa x16, x0
+; CHECK-NEXT: mrs x[[TPIDR:[0-9]+]], TPIDR_EL0
+; CHECK-NEXT: ldr w0, [x[[TPIDR]], x0]
+
+; CHECK-OBJ: R_AARCH64_AUTH_TLSDESC_ADR_PAGE21
+; CHECK-OBJ: R_AARCH64_AUTH_TLSDESC_LD64_LO12
+; CHECK-OBJ: R_AARCH64_AUTH_TLSDESC_ADD_LO12
+; CHECK-OBJ-NOT: R_AARCH64_TLSDESC_CALL
+; CHECK-OBJ: 0000000000000000     0 TLS     WEAK   DEFAULT   UND extern_weak_var
+}
+
+!llvm.module.flags = !{!0}
+!0 = !{i32 8, !"ptrauth-elf-got", i32 1}
diff --git a/llvm/test/CodeGen/AArch64/reduce-or-opt.ll b/llvm/test/CodeGen/AArch64/reduce-or-opt.ll
new file mode 100644
index 0000000..f5df5ea
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/reduce-or-opt.ll
@@ -0,0 +1,193 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=aarch64-none-linux-gnu -mattr=+sve | FileCheck %s
+
+define i64 @select_or_reduce_v2i1(ptr nocapture noundef readonly %src) {
+; CHECK-LABEL: select_or_reduce_v2i1:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mov x8, xzr
+; CHECK-NEXT:  .LBB0_1: // %vector.body
+; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    ldr q0, [x0, x8]
+; CHECK-NEXT:    cmeq v0.2d, v0.2d, #0
+; CHECK-NEXT:    umaxv s0, v0.4s
+; CHECK-NEXT:    fmov w9, s0
+; CHECK-NEXT:    tbnz w9, #0, .LBB0_3
+; CHECK-NEXT:  // %bb.2: // %vector.body
+; CHECK-NEXT:    // in Loop: Header=BB0_1 Depth=1
+; CHECK-NEXT:    cmp x8, #16
+; CHECK-NEXT:    add x8, x8, #16
+; CHECK-NEXT:    b.ne .LBB0_1
+; CHECK-NEXT:  .LBB0_3: // %middle.split
+; CHECK-NEXT:    and x0, x9, #0x1
+; CHECK-NEXT:    ret
+entry:
+  br label %vector.body
+
+vector.body:
+  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
+  %arrayidx = getelementptr inbounds ptr, ptr %src, i64 %index
+  %wide.load = load <2 x ptr>, ptr %arrayidx, align 8
+  %cond = icmp eq <2 x ptr> %wide.load, splat(ptr zeroinitializer)
+  %index.next = add nuw i64 %index, 2
+  %or.reduc = tail call i1 @llvm.vector.reduce.or.v2i1(<2 x i1> %cond)
+  %iv.cmp = icmp eq i64 %index.next, 4
+  %exit.cond = or i1 %or.reduc, %iv.cmp
+  br i1 %exit.cond, label %middle.split, label %vector.body
+
+middle.split:
+  %sel = select i1 %or.reduc, i64 1, i64 0
+  ret i64 %sel
+}
+
+define i64 @br_or_reduce_v2i1(ptr nocapture noundef readonly %src, ptr noundef readnone %p) {
+; CHECK-LABEL: br_or_reduce_v2i1:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mov x8, xzr
+; CHECK-NEXT:  .LBB1_1: // %vector.body
+; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    ldr q0, [x0, x8]
+; CHECK-NEXT:    cmeq v0.2d, v0.2d, #0
+; CHECK-NEXT:    umaxv s0, v0.4s
+; CHECK-NEXT:    fmov w9, s0
+; CHECK-NEXT:    tbnz w9, #0, .LBB1_3
+; CHECK-NEXT:  // %bb.2: // %vector.body
+; CHECK-NEXT:    // in Loop: Header=BB1_1 Depth=1
+; CHECK-NEXT:    cmp x8, #16
+; CHECK-NEXT:    add x8, x8, #16
+; CHECK-NEXT:    b.ne .LBB1_1
+; CHECK-NEXT:  .LBB1_3: // %middle.split
+; CHECK-NEXT:    tbz w9, #0, .LBB1_5
+; CHECK-NEXT:  // %bb.4: // %found
+; CHECK-NEXT:    mov w8, #56 // =0x38
+; CHECK-NEXT:    mov w0, #1 // =0x1
+; CHECK-NEXT:    str x8, [x1]
+; CHECK-NEXT:    ret
+; CHECK-NEXT:  .LBB1_5:
+; CHECK-NEXT:    mov x0, xzr
+; CHECK-NEXT:    ret
+entry:
+  br label %vector.body
+
+vector.body:
+  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
+  %arrayidx = getelementptr inbounds ptr, ptr %src, i64 %index
+  %wide.load = load <2 x ptr>, ptr %arrayidx, align 8
+  %cond = icmp eq <2 x ptr> %wide.load, splat(ptr zeroinitializer)
+  %index.next = add nuw i64 %index, 2
+  %or.reduc = tail call i1 @llvm.vector.reduce.or.v2i1(<2 x i1> %cond)
+  %iv.cmp = icmp eq i64 %index.next, 4
+  %exit.cond = or i1 %or.reduc, %iv.cmp
+  br i1 %exit.cond, label %middle.split, label %vector.body
+
+middle.split:
+  br i1 %or.reduc, label %found, label %notfound
+
+found:
+  store i64 56, ptr %p, align 8
+  ret i64 1
+
+notfound:
+  ret i64 0
+}
+
+define i64 @select_or_reduce_nxv2i1(ptr nocapture noundef readonly %src) {
+; CHECK-LABEL: select_or_reduce_nxv2i1:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    cntd x8
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    mov x9, xzr
+; CHECK-NEXT:    neg x10, x8
+; CHECK-NEXT:    add x10, x10, #4
+; CHECK-NEXT:  .LBB2_1: // %vector.body
+; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0, x9, lsl #3]
+; CHECK-NEXT:    cmpeq p1.d, p0/z, z0.d, #0
+; CHECK-NEXT:    b.ne .LBB2_3
+; CHECK-NEXT:  // %bb.2: // %vector.body
+; CHECK-NEXT:    // in Loop: Header=BB2_1 Depth=1
+; CHECK-NEXT:    cmp x10, x9
+; CHECK-NEXT:    add x9, x9, x8
+; CHECK-NEXT:    b.ne .LBB2_1
+; CHECK-NEXT:  .LBB2_3: // %middle.split
+; CHECK-NEXT:    ptest p0, p1.b
+; CHECK-NEXT:    cset w0, ne
+; CHECK-NEXT:    ret
+entry:
+  %vscale = tail call i64 @llvm.vscale.i64()
+  %vf = shl nuw nsw i64 %vscale, 1
+  br label %vector.body
+
+vector.body:
+  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
+  %arrayidx = getelementptr inbounds ptr, ptr %src, i64 %index
+  %wide.load = load <vscale x 2 x ptr>, ptr %arrayidx, align 8
+  %cond = icmp eq <vscale x 2 x ptr> %wide.load, splat(ptr zeroinitializer)
+  %index.next = add nuw i64 %index, %vf
+  %or.reduc = tail call i1 @llvm.vector.reduce.or.nxv2i1(<vscale x 2 x i1> %cond)
+  %iv.cmp = icmp eq i64 %index.next, 4
+  %exit.cond = or i1 %or.reduc, %iv.cmp
+  br i1 %exit.cond, label %middle.split, label %vector.body
+
+middle.split:
+  %sel = select i1 %or.reduc, i64 1, i64 0
+  ret i64 %sel
+}
+
+define i64 @br_or_reduce_nxv2i1(ptr nocapture noundef readonly %src, ptr noundef readnone %p) {
+; CHECK-LABEL: br_or_reduce_nxv2i1:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    cntd x8
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    mov x9, xzr
+; CHECK-NEXT:    neg x10, x8
+; CHECK-NEXT:    add x10, x10, #4
+; CHECK-NEXT:  .LBB3_1: // %vector.body
+; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0, x9, lsl #3]
+; CHECK-NEXT:    cmpeq p1.d, p0/z, z0.d, #0
+; CHECK-NEXT:    b.ne .LBB3_3
+; CHECK-NEXT:  // %bb.2: // %vector.body
+; CHECK-NEXT:    // in Loop: Header=BB3_1 Depth=1
+; CHECK-NEXT:    cmp x10, x9
+; CHECK-NEXT:    add x9, x9, x8
+; CHECK-NEXT:    b.ne .LBB3_1
+; CHECK-NEXT:  .LBB3_3: // %middle.split
+; CHECK-NEXT:    ptest p0, p1.b
+; CHECK-NEXT:    b.eq .LBB3_5
+; CHECK-NEXT:  // %bb.4: // %found
+; CHECK-NEXT:    mov w8, #56 // =0x38
+; CHECK-NEXT:    mov w0, #1 // =0x1
+; CHECK-NEXT:    str x8, [x1]
+; CHECK-NEXT:    ret
+; CHECK-NEXT:  .LBB3_5:
+; CHECK-NEXT:    mov x0, xzr
+; CHECK-NEXT:    ret
+entry:
+  %vscale = tail call i64 @llvm.vscale.i64()
+  %vf = shl nuw nsw i64 %vscale, 1
+  br label %vector.body
+
+vector.body:
+  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
+  %arrayidx = getelementptr inbounds ptr, ptr %src, i64 %index
+  %wide.load = load <vscale x 2 x ptr>, ptr %arrayidx, align 8
+  %cond = icmp eq <vscale x 2 x ptr> %wide.load, splat(ptr zeroinitializer)
+  %index.next = add nuw i64 %index, %vf
+  %or.reduc = tail call i1 @llvm.vector.reduce.or.nxv2i1(<vscale x 2 x i1> %cond)
+  %iv.cmp = icmp eq i64 %index.next, 4
+  %exit.cond = or i1 %or.reduc, %iv.cmp
+  br i1 %exit.cond, label %middle.split, label %vector.body
+
+middle.split:
+  br i1 %or.reduc, label %found, label %notfound
+
+found:
+  store i64 56, ptr %p, align 8
+  ret i64 1
+
+notfound:
+  ret i64 0
+}
+
+declare i1 @llvm.vector.reduce.or.v2i1(<2 x i1>)
+declare i1 @llvm.vector.reduce.or.nxv2i1(<vscale x 2 x i1>)
diff --git a/llvm/test/CodeGen/AArch64/register-coalesce-update-subranges-remat.mir b/llvm/test/CodeGen/AArch64/register-coalesce-update-subranges-remat.mir
index b61fa4b..08fc47d 100644
--- a/llvm/test/CodeGen/AArch64/register-coalesce-update-subranges-remat.mir
+++ b/llvm/test/CodeGen/AArch64/register-coalesce-update-subranges-remat.mir
@@ -1,5 +1,5 @@
+# RUN: llc -mtriple=aarch64 -o /dev/null -run-pass=register-coalescer -aarch64-enable-subreg-liveness-tracking -debug-only=regalloc %s 2>&1 | FileCheck %s --check-prefix=CHECK-DBG
 # RUN: llc -mtriple=aarch64 -verify-machineinstrs -o - -run-pass=register-coalescer -aarch64-enable-subreg-liveness-tracking %s | FileCheck %s --check-prefix=CHECK
-# RUN: llc -mtriple=aarch64 -verify-machineinstrs -o /dev/null -run-pass=register-coalescer -aarch64-enable-subreg-liveness-tracking -debug-only=regalloc %s 2>&1 | FileCheck %s --check-prefix=CHECK-DBG
 # REQUIRES: asserts
 
 # CHECK-DBG: ********** REGISTER COALESCER **********
@@ -36,3 +36,94 @@ body:             |
     RET_ReallyLR
 
 ...
+# CHECK-DBG: ********** REGISTER COALESCER **********
+# CHECK-DBG: ********** Function: reproducer
+# CHECK-DBG: ********** JOINING INTERVALS ***********
+# CHECK-DBG: ********** INTERVALS **********
+# CHECK-DBG: %1 [32r,48B:2)[48B,320r:0)[320r,368B:1) 0@48B-phi 1@320r 2@32r
+# CHECK-DBG-SAME: weight:0.000000e+00
+# CHECK-DBG: %3 [80r,160B:2)[240r,272B:1)[288r,304B:0)[304B,320r:3) 0@288r 1@240r 2@80r 3@304B-phi
+# CHECK-DBG-SAME: L0000000000000080 [288r,304B:0)[304B,320r:3) 0@288r 1@x 2@x 3@304B-phi
+# CHECK-DBG-SAME: L0000000000000040 [80r,160B:2)[240r,272B:1)[288r,304B:0)[304B,320r:3) 0@288r 1@240r 2@80r 3@304B-phi
+# CHECK-DBG-SAME: weight:0.000000e+00
+---
+name:              reproducer
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    %0:gpr32 = MOVi32imm 1
+    %1:gpr64 = IMPLICIT_DEF
+
+  bb.1:
+
+  bb.2:
+    %3:gpr64all = SUBREG_TO_REG 0, %0, %subreg.sub_32
+
+  bb.3:
+    $nzcv = IMPLICIT_DEF
+    %4:gpr64 = COPY killed %3
+    Bcc 1, %bb.7, implicit killed $nzcv
+
+  bb.4:
+    $nzcv = IMPLICIT_DEF
+    Bcc 1, %bb.6, implicit killed $nzcv
+
+  bb.5:
+    %5:gpr64all = SUBREG_TO_REG 0, %0, %subreg.sub_32
+    %4:gpr64 = COPY killed %5
+    B %bb.7
+
+  bb.6:
+    %4:gpr64 = COPY $xzr
+
+  bb.7:
+    %7:gpr64 = ADDXrs killed %1, killed %4, 1
+    %1:gpr64 = COPY killed %7
+    B %bb.1
+
+...
+# CHECK-DBG: ********** REGISTER COALESCER **********
+# CHECK-DBG: ********** Function: reproducer2
+# CHECK-DBG: ********** JOINING INTERVALS ***********
+# CHECK-DBG: ********** INTERVALS **********
+# CHECK-DBG: %1 [32r,48B:2)[48B,304r:0)[304r,352B:1) 0@48B-phi 1@304r 2@32r
+# CHECK-DBG-SAME: weight:0.000000e+00
+# CHECK-DBG: %3 [80r,160B:2)[224r,256B:1)[272r,288B:0)[288B,304r:3) 0@272r 1@224r 2@80r 3@288B-phi
+# CHECK-DBG-SAME: L0000000000000080 [224r,256B:1)[272r,288B:0)[288B,304r:3) 0@272r 1@224r 2@x 3@288B-phi
+# CHECK-DBG-SAME: L0000000000000040 [80r,160B:2)[224r,256B:1)[272r,288B:0)[288B,304r:3) 0@272r 1@224r 2@80r 3@288B-phi
+# CHECK-DBG-SAME: weight:0.000000e+00
+---
+name:              reproducer2
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    %0:gpr32 = MOVi32imm 1
+    %1:gpr64 = IMPLICIT_DEF
+
+  bb.1:
+
+  bb.2:
+    %3:gpr64all = SUBREG_TO_REG 0, %0, %subreg.sub_32
+
+  bb.3:
+    $nzcv = IMPLICIT_DEF
+    %4:gpr64 = COPY killed %3
+    Bcc 1, %bb.7, implicit killed $nzcv
+
+  bb.4:
+    $nzcv = IMPLICIT_DEF
+    Bcc 1, %bb.6, implicit killed $nzcv
+
+  bb.5:
+    %4:gpr64 = IMPLICIT_DEF
+    B %bb.7
+
+  bb.6:
+    %4:gpr64 = COPY $xzr
+
+  bb.7:
+    %5:gpr64 = ADDXrs killed %1, killed %4, 1
+    %1:gpr64 = COPY killed %5
+    B %bb.1
+
+...
diff --git a/llvm/test/CodeGen/AArch64/round-fptosi-sat-scalar.ll b/llvm/test/CodeGen/AArch64/round-fptosi-sat-scalar.ll
index ec7548e..b7fae2b 100644
--- a/llvm/test/CodeGen/AArch64/round-fptosi-sat-scalar.ll
+++ b/llvm/test/CodeGen/AArch64/round-fptosi-sat-scalar.ll
@@ -7,19 +7,17 @@
 define i32 @testmswbf(bfloat %a) {
 ; CHECK-LABEL: testmswbf:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-NEXT:    fmov w9, s0
+; CHECK-NEXT:    // kill: def $h0 killed $h0 def $d0
 ; CHECK-NEXT:    mov w8, #32767 // =0x7fff
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    fmov s0, w9
+; CHECK-NEXT:    shll v0.4s, v0.4h, #16
 ; CHECK-NEXT:    frintm s0, s0
 ; CHECK-NEXT:    fmov w9, s0
 ; CHECK-NEXT:    ubfx w10, w9, #16, #1
 ; CHECK-NEXT:    add w8, w9, w8
 ; CHECK-NEXT:    add w8, w10, w8
 ; CHECK-NEXT:    lsr w8, w8, #16
-; CHECK-NEXT:    lsl w8, w8, #16
 ; CHECK-NEXT:    fmov s0, w8
+; CHECK-NEXT:    shll v0.4s, v0.4h, #16
 ; CHECK-NEXT:    fcvtzs w0, s0
 ; CHECK-NEXT:    ret
 entry:
@@ -31,19 +29,17 @@ entry:
 define i64 @testmsxbf(bfloat %a) {
 ; CHECK-LABEL: testmsxbf:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-NEXT:    fmov w9, s0
+; CHECK-NEXT:    // kill: def $h0 killed $h0 def $d0
 ; CHECK-NEXT:    mov w8, #32767 // =0x7fff
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    fmov s0, w9
+; CHECK-NEXT:    shll v0.4s, v0.4h, #16
 ; CHECK-NEXT:    frintm s0, s0
 ; CHECK-NEXT:    fmov w9, s0
 ; CHECK-NEXT:    ubfx w10, w9, #16, #1
 ; CHECK-NEXT:    add w8, w9, w8
 ; CHECK-NEXT:    add w8, w10, w8
 ; CHECK-NEXT:    lsr w8, w8, #16
-; CHECK-NEXT:    lsl w8, w8, #16
 ; CHECK-NEXT:    fmov s0, w8
+; CHECK-NEXT:    shll v0.4s, v0.4h, #16
 ; CHECK-NEXT:    fcvtzs x0, s0
 ; CHECK-NEXT:    ret
 entry:
@@ -141,19 +137,17 @@ entry:
 define i32 @testpswbf(bfloat %a) {
 ; CHECK-LABEL: testpswbf:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-NEXT:    fmov w9, s0
+; CHECK-NEXT:    // kill: def $h0 killed $h0 def $d0
 ; CHECK-NEXT:    mov w8, #32767 // =0x7fff
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    fmov s0, w9
+; CHECK-NEXT:    shll v0.4s, v0.4h, #16
 ; CHECK-NEXT:    frintp s0, s0
 ; CHECK-NEXT:    fmov w9, s0
 ; CHECK-NEXT:    ubfx w10, w9, #16, #1
 ; CHECK-NEXT:    add w8, w9, w8
 ; CHECK-NEXT:    add w8, w10, w8
 ; CHECK-NEXT:    lsr w8, w8, #16
-; CHECK-NEXT:    lsl w8, w8, #16
 ; CHECK-NEXT:    fmov s0, w8
+; CHECK-NEXT:    shll v0.4s, v0.4h, #16
 ; CHECK-NEXT:    fcvtzs w0, s0
 ; CHECK-NEXT:    ret
 entry:
@@ -165,19 +159,17 @@ entry:
 define i64 @testpsxbf(bfloat %a) {
 ; CHECK-LABEL: testpsxbf:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-NEXT:    fmov w9, s0
+; CHECK-NEXT:    // kill: def $h0 killed $h0 def $d0
 ; CHECK-NEXT:    mov w8, #32767 // =0x7fff
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    fmov s0, w9
+; CHECK-NEXT:    shll v0.4s, v0.4h, #16
 ; CHECK-NEXT:    frintp s0, s0
 ; CHECK-NEXT:    fmov w9, s0
 ; CHECK-NEXT:    ubfx w10, w9, #16, #1
 ; CHECK-NEXT:    add w8, w9, w8
 ; CHECK-NEXT:    add w8, w10, w8
 ; CHECK-NEXT:    lsr w8, w8, #16
-; CHECK-NEXT:    lsl w8, w8, #16
 ; CHECK-NEXT:    fmov s0, w8
+; CHECK-NEXT:    shll v0.4s, v0.4h, #16
 ; CHECK-NEXT:    fcvtzs x0, s0
 ; CHECK-NEXT:    ret
 entry:
diff --git a/llvm/test/CodeGen/AArch64/selectopt.ll b/llvm/test/CodeGen/AArch64/selectopt.ll
index 54309dc..d72a956 100644
--- a/llvm/test/CodeGen/AArch64/selectopt.ll
+++ b/llvm/test/CodeGen/AArch64/selectopt.ll
@@ -875,3 +875,124 @@ if.end:
   %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
   br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
 }
+
+declare i64 @payload(i64, ptr, ptr, i64)
+
+define void @outer_latch_heuristic(ptr %dst, ptr %src, i64 %p, i64 %dim) {
+; CHECKOO-LABEL: @outer_latch_heuristic(
+; CHECKOO-NEXT:  entry:
+; CHECKOO-NEXT:    br label [[OUTER_LOOP:%.*]]
+; CHECKOO:       outer.loop:
+; CHECKOO-NEXT:    [[K_020_US:%.*]] = phi i64 [ [[INC7_US:%.*]], [[SELECT_END:%.*]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECKOO-NEXT:    [[J:%.*]] = phi i64 [ [[J_NEXT:%.*]], [[SELECT_END]] ], [ 0, [[ENTRY]] ]
+; CHECKOO-NEXT:    [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[SELECT_END]] ], [ 0, [[ENTRY]] ]
+; CHECKOO-NEXT:    [[ARRAYIDX_US:%.*]] = getelementptr inbounds ptr, ptr [[SRC:%.*]], i64 [[I]]
+; CHECKOO-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[ARRAYIDX_US]], align 8
+; CHECKOO-NEXT:    [[ARRAYIDX1_US:%.*]] = getelementptr inbounds ptr, ptr [[SRC]], i64 [[J]]
+; CHECKOO-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[ARRAYIDX1_US]], align 8
+; CHECKOO-NEXT:    br label [[INNER_LOOP:%.*]]
+; CHECKOO:       inner.loop:
+; CHECKOO-NEXT:    [[LSR_IV:%.*]] = phi i64 [ [[DIM:%.*]], [[OUTER_LOOP]] ], [ [[LSR_IV_NEXT:%.*]], [[INNER_LOOP]] ]
+; CHECKOO-NEXT:    [[DIFF_04_I_US:%.*]] = phi i64 [ [[CALL_I_US:%.*]], [[INNER_LOOP]] ], [ 0, [[OUTER_LOOP]] ]
+; CHECKOO-NEXT:    [[CALL_I_US]] = tail call i64 @payload(i64 [[DIFF_04_I_US]], ptr [[TMP0]], ptr [[TMP1]], i64 [[P:%.*]])
+; CHECKOO-NEXT:    [[LSR_IV_NEXT]] = add i64 [[LSR_IV]], -1
+; CHECKOO-NEXT:    [[EXITCOND_NOT_I_US:%.*]] = icmp eq i64 [[LSR_IV_NEXT]], 0
+; CHECKOO-NEXT:    br i1 [[EXITCOND_NOT_I_US]], label [[LATCH:%.*]], label [[INNER_LOOP]]
+; CHECKOO:       latch:
+; CHECKOO-NEXT:    [[CMP2_US:%.*]] = icmp sgt i64 [[CALL_I_US]], -1
+; CHECKOO-NEXT:    [[DIFF_0_LCSSA_I_LOBIT_US:%.*]] = lshr i64 [[CALL_I_US]], 63
+; CHECKOO-NEXT:    [[CMP2_US_FROZEN:%.*]] = freeze i1 [[CMP2_US]]
+; CHECKOO-NEXT:    br i1 [[CMP2_US_FROZEN]], label [[SELECT_TRUE_SINK:%.*]], label [[SELECT_FALSE_SINK:%.*]]
+; CHECKOO:       select.true.sink:
+; CHECKOO-NEXT:    [[TMP2:%.*]] = add nsw i64 [[J]], 1
+; CHECKOO-NEXT:    br label [[SELECT_END]]
+; CHECKOO:       select.false.sink:
+; CHECKOO-NEXT:    [[TMP3:%.*]] = add nsw i64 1, [[I]]
+; CHECKOO-NEXT:    br label [[SELECT_END]]
+; CHECKOO:       select.end:
+; CHECKOO-NEXT:    [[I_NEXT]] = phi i64 [ [[I]], [[SELECT_TRUE_SINK]] ], [ [[TMP3]], [[SELECT_FALSE_SINK]] ]
+; CHECKOO-NEXT:    [[J_NEXT]] = phi i64 [ [[TMP2]], [[SELECT_TRUE_SINK]] ], [ [[J]], [[SELECT_FALSE_SINK]] ]
+; CHECKOO-NEXT:    [[COND_IN_US:%.*]] = phi ptr [ [[ARRAYIDX1_US]], [[SELECT_TRUE_SINK]] ], [ [[ARRAYIDX_US]], [[SELECT_FALSE_SINK]] ]
+; CHECKOO-NEXT:    [[INC4_US:%.*]] = zext i1 [[CMP2_US]] to i64
+; CHECKOO-NEXT:    [[COND_US:%.*]] = load ptr, ptr [[COND_IN_US]], align 8
+; CHECKOO-NEXT:    [[ARRAYIDX6_US:%.*]] = getelementptr inbounds ptr, ptr [[DST:%.*]], i64 [[K_020_US]]
+; CHECKOO-NEXT:    store ptr [[COND_US]], ptr [[ARRAYIDX6_US]], align 8
+; CHECKOO-NEXT:    [[INC7_US]] = add i64 [[K_020_US]], 1
+; CHECKOO-NEXT:    [[EXITCOND23_NOT:%.*]] = icmp eq i64 [[K_020_US]], 1000
+; CHECKOO-NEXT:    br i1 [[EXITCOND23_NOT]], label [[EXIT:%.*]], label [[OUTER_LOOP]]
+; CHECKOO:       exit:
+; CHECKOO-NEXT:    ret void
+;
+; CHECKII-LABEL: @outer_latch_heuristic(
+; CHECKII-NEXT:  entry:
+; CHECKII-NEXT:    br label [[OUTER_LOOP:%.*]]
+; CHECKII:       outer.loop:
+; CHECKII-NEXT:    [[K_020_US:%.*]] = phi i64 [ [[INC7_US:%.*]], [[LATCH:%.*]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECKII-NEXT:    [[J:%.*]] = phi i64 [ [[J_NEXT:%.*]], [[LATCH]] ], [ 0, [[ENTRY]] ]
+; CHECKII-NEXT:    [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[LATCH]] ], [ 0, [[ENTRY]] ]
+; CHECKII-NEXT:    [[ARRAYIDX_US:%.*]] = getelementptr inbounds ptr, ptr [[SRC:%.*]], i64 [[I]]
+; CHECKII-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[ARRAYIDX_US]], align 8
+; CHECKII-NEXT:    [[ARRAYIDX1_US:%.*]] = getelementptr inbounds ptr, ptr [[SRC]], i64 [[J]]
+; CHECKII-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[ARRAYIDX1_US]], align 8
+; CHECKII-NEXT:    br label [[INNER_LOOP:%.*]]
+; CHECKII:       inner.loop:
+; CHECKII-NEXT:    [[LSR_IV:%.*]] = phi i64 [ [[DIM:%.*]], [[OUTER_LOOP]] ], [ [[LSR_IV_NEXT:%.*]], [[INNER_LOOP]] ]
+; CHECKII-NEXT:    [[DIFF_04_I_US:%.*]] = phi i64 [ [[CALL_I_US:%.*]], [[INNER_LOOP]] ], [ 0, [[OUTER_LOOP]] ]
+; CHECKII-NEXT:    [[CALL_I_US]] = tail call i64 @payload(i64 [[DIFF_04_I_US]], ptr [[TMP0]], ptr [[TMP1]], i64 [[P:%.*]])
+; CHECKII-NEXT:    [[LSR_IV_NEXT]] = add i64 [[LSR_IV]], -1
+; CHECKII-NEXT:    [[EXITCOND_NOT_I_US:%.*]] = icmp eq i64 [[LSR_IV_NEXT]], 0
+; CHECKII-NEXT:    br i1 [[EXITCOND_NOT_I_US]], label [[LATCH]], label [[INNER_LOOP]]
+; CHECKII:       latch:
+; CHECKII-NEXT:    [[CMP2_US:%.*]] = icmp sgt i64 [[CALL_I_US]], -1
+; CHECKII-NEXT:    [[DIFF_0_LCSSA_I_LOBIT_US:%.*]] = lshr i64 [[CALL_I_US]], 63
+; CHECKII-NEXT:    [[I_NEXT]] = add nsw i64 [[DIFF_0_LCSSA_I_LOBIT_US]], [[I]]
+; CHECKII-NEXT:    [[INC4_US:%.*]] = zext i1 [[CMP2_US]] to i64
+; CHECKII-NEXT:    [[J_NEXT]] = add nsw i64 [[J]], [[INC4_US]]
+; CHECKII-NEXT:    [[COND_IN_US:%.*]] = select i1 [[CMP2_US]], ptr [[ARRAYIDX1_US]], ptr [[ARRAYIDX_US]]
+; CHECKII-NEXT:    [[COND_US:%.*]] = load ptr, ptr [[COND_IN_US]], align 8
+; CHECKII-NEXT:    [[ARRAYIDX6_US:%.*]] = getelementptr inbounds ptr, ptr [[DST:%.*]], i64 [[K_020_US]]
+; CHECKII-NEXT:    store ptr [[COND_US]], ptr [[ARRAYIDX6_US]], align 8
+; CHECKII-NEXT:    [[INC7_US]] = add i64 [[K_020_US]], 1
+; CHECKII-NEXT:    [[EXITCOND23_NOT:%.*]] = icmp eq i64 [[K_020_US]], 1000
+; CHECKII-NEXT:    br i1 [[EXITCOND23_NOT]], label [[EXIT:%.*]], label [[OUTER_LOOP]]
+; CHECKII:       exit:
+; CHECKII-NEXT:    ret void
+;
+entry:
+  br label %outer.loop
+
+outer.loop:
+  %k.020.us = phi i64 [ %inc7.us, %latch ], [ 0, %entry ]
+  %j = phi i64 [ %j.next, %latch ], [ 0, %entry ]
+  %i = phi i64 [ %i.next, %latch ], [ 0, %entry ]
+  %arrayidx.us = getelementptr inbounds ptr, ptr %src, i64 %i
+  %4 = load ptr, ptr %arrayidx.us, align 8
+  %arrayidx1.us = getelementptr inbounds ptr, ptr %src, i64 %j
+  %5 = load ptr, ptr %arrayidx1.us, align 8
+  br label %inner.loop
+
+inner.loop:
+  %lsr.iv = phi i64 [ %dim, %outer.loop ], [ %lsr.iv.next, %inner.loop ]
+  %diff.04.i.us = phi i64 [ %call.i.us, %inner.loop ], [ 0, %outer.loop ]
+  %call.i.us = tail call i64 @payload(i64 %diff.04.i.us, ptr %4, ptr %5, i64 %p)
+  %lsr.iv.next = add i64 %lsr.iv, -1
+  %exitcond.not.i.us = icmp eq i64 %lsr.iv.next, 0
+  br i1 %exitcond.not.i.us, label %latch, label %inner.loop
+
+latch:
+  %cmp2.us = icmp sgt i64 %call.i.us, -1
+  %diff.0.lcssa.i.lobit.us = lshr i64 %call.i.us, 63
+  %i.next = add nsw i64 %diff.0.lcssa.i.lobit.us, %i
+  %inc4.us = zext i1 %cmp2.us to i64
+  %j.next = add nsw i64 %j, %inc4.us
+  %cond.in.us = select i1 %cmp2.us, ptr %arrayidx1.us, ptr %arrayidx.us
+  %cond.us = load ptr, ptr %cond.in.us, align 8
+  %arrayidx6.us = getelementptr inbounds ptr, ptr %dst, i64 %k.020.us
+  store ptr %cond.us, ptr %arrayidx6.us, align 8
+  %inc7.us = add i64 %k.020.us, 1
+  %exitcond23.not = icmp eq i64 %k.020.us, 1000
+  br i1 %exitcond23.not, label %exit, label %outer.loop
+
+exit:
+  ret void
+}
diff --git a/llvm/test/CodeGen/AArch64/shuffle-select.ll b/llvm/test/CodeGen/AArch64/shuffle-select.ll
new file mode 100644
index 0000000..eeccaa1
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/shuffle-select.ll
@@ -0,0 +1,163 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mtriple=aarch64--linux-gnu | FileCheck %s
+
+define <8 x i8> @sel_v8i8(<8 x i8> %v0, <8 x i8> %v1) {
+; CHECK-LABEL: sel_v8i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT:    adrp x8, .LCPI0_0
+; CHECK-NEXT:    mov v0.d[1], v1.d[0]
+; CHECK-NEXT:    ldr d1, [x8, :lo12:.LCPI0_0]
+; CHECK-NEXT:    tbl v0.8b, { v0.16b }, v1.8b
+; CHECK-NEXT:    ret
+  %tmp0 = shufflevector <8 x i8> %v0, <8 x i8> %v1, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
+  ret <8 x i8> %tmp0
+}
+
+define <16 x i8> @sel_v16i8(<16 x i8> %v0, <16 x i8> %v1) {
+; CHECK-LABEL: sel_v16i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    adrp x8, .LCPI1_0
+; CHECK-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
+; CHECK-NEXT:    ldr q2, [x8, :lo12:.LCPI1_0]
+; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
+; CHECK-NEXT:    tbl v0.16b, { v0.16b, v1.16b }, v2.16b
+; CHECK-NEXT:    ret
+  %tmp0 = shufflevector <16 x i8> %v0, <16 x i8> %v1, <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 21, i32 6, i32 23, i32 8, i32 25, i32 10, i32 27, i32 12, i32 29, i32 14, i32 31>
+  ret <16 x i8> %tmp0
+}
+
+define <16 x i8> @sel_v16i8_poison(<16 x i8> %v0, <16 x i8> %v1) {
+; CHECK-LABEL: sel_v16i8_poison:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    adrp x8, .LCPI2_0
+; CHECK-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
+; CHECK-NEXT:    ldr q2, [x8, :lo12:.LCPI2_0]
+; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
+; CHECK-NEXT:    tbl v0.16b, { v0.16b, v1.16b }, v2.16b
+; CHECK-NEXT:    ret
+  %tmp0 = shufflevector <16 x i8> %v0, <16 x i8> %v1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 22, i32 23, i32 24, i32 25, i32 26, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+  ret <16 x i8> %tmp0
+}
+
+define <16 x i8> @sel_v16i8_unregular(<16 x i8> %v0, <16 x i8> %v1) {
+; CHECK-LABEL: sel_v16i8_unregular:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    adrp x8, .LCPI3_0
+; CHECK-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
+; CHECK-NEXT:    ldr q2, [x8, :lo12:.LCPI3_0]
+; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
+; CHECK-NEXT:    tbl v0.16b, { v0.16b, v1.16b }, v2.16b
+; CHECK-NEXT:    ret
+  %tmp0 = shufflevector <16 x i8> %v0, <16 x i8> %v1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 27, i32 28, i32 29, i32 30, i32 31>
+  ret <16 x i8> %tmp0
+}
+
+define <4 x i16> @sel_v4i16(<4 x i16> %v0, <4 x i16> %v1) {
+; CHECK-LABEL: sel_v4i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    rev32 v0.4h, v0.4h
+; CHECK-NEXT:    trn2 v0.4h, v0.4h, v1.4h
+; CHECK-NEXT:    ret
+  %tmp0 = shufflevector <4 x i16> %v0, <4 x i16> %v1, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+  ret <4 x i16> %tmp0
+}
+
+define <8 x i16> @sel_v8i16(<8 x i16> %v0, <8 x i16> %v1) {
+; CHECK-LABEL: sel_v8i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    adrp x8, .LCPI5_0
+; CHECK-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
+; CHECK-NEXT:    ldr q2, [x8, :lo12:.LCPI5_0]
+; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
+; CHECK-NEXT:    tbl v0.16b, { v0.16b, v1.16b }, v2.16b
+; CHECK-NEXT:    ret
+  %tmp0 = shufflevector <8 x i16> %v0, <8 x i16> %v1, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
+  ret <8 x i16> %tmp0
+}
+
+define <2 x i32> @sel_v2i32(<2 x i32> %v0, <2 x i32> %v1) {
+; CHECK-LABEL: sel_v2i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT:    mov v0.s[1], v1.s[1]
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT:    ret
+  %tmp0 = shufflevector <2 x i32> %v0, <2 x i32> %v1, <2 x i32> <i32 0, i32 3>
+  ret <2 x i32> %tmp0
+}
+
+define <4 x i32> @sel_v4i32(<4 x i32> %v0, <4 x i32> %v1) {
+; CHECK-LABEL: sel_v4i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    rev64 v0.4s, v0.4s
+; CHECK-NEXT:    trn2 v0.4s, v0.4s, v1.4s
+; CHECK-NEXT:    ret
+  %tmp0 = shufflevector <4 x i32> %v0, <4 x i32> %v1, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+  ret <4 x i32> %tmp0
+}
+
+define <2 x i64> @sel_v2i64(<2 x i64> %v0, <2 x i64> %v1) {
+; CHECK-LABEL: sel_v2i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov v0.d[1], v1.d[1]
+; CHECK-NEXT:    ret
+  %tmp0 = shufflevector <2 x i64> %v0, <2 x i64> %v1, <2 x i32> <i32 0, i32 3>
+  ret <2 x i64> %tmp0
+}
+
+define <4 x half> @sel_v4f16(<4 x half> %v0, <4 x half> %v1) {
+; CHECK-LABEL: sel_v4f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    rev32 v0.4h, v0.4h
+; CHECK-NEXT:    trn2 v0.4h, v0.4h, v1.4h
+; CHECK-NEXT:    ret
+  %tmp0 = shufflevector <4 x half> %v0, <4 x half> %v1, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+  ret <4 x half> %tmp0
+}
+
+define <8 x half> @sel_v8f16(<8 x half> %v0, <8 x half> %v1) {
+; CHECK-LABEL: sel_v8f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    adrp x8, .LCPI10_0
+; CHECK-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
+; CHECK-NEXT:    ldr q2, [x8, :lo12:.LCPI10_0]
+; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
+; CHECK-NEXT:    tbl v0.16b, { v0.16b, v1.16b }, v2.16b
+; CHECK-NEXT:    ret
+  %tmp0 = shufflevector <8 x half> %v0, <8 x half> %v1, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
+  ret <8 x half> %tmp0
+}
+
+define <2 x float> @sel_v2f32(<2 x float> %v0, <2 x float> %v1) {
+; CHECK-LABEL: sel_v2f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT:    mov v0.s[1], v1.s[1]
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT:    ret
+  %tmp0 = shufflevector <2 x float> %v0, <2 x float> %v1, <2 x i32> <i32 0, i32 3>
+  ret <2 x float> %tmp0
+}
+
+define <4 x float> @sel_v4f32(<4 x float> %v0, <4 x float> %v1) {
+; CHECK-LABEL: sel_v4f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    rev64 v0.4s, v0.4s
+; CHECK-NEXT:    trn2 v0.4s, v0.4s, v1.4s
+; CHECK-NEXT:    ret
+  %tmp0 = shufflevector <4 x float> %v0, <4 x float> %v1, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+  ret <4 x float> %tmp0
+}
+
+define <2 x double> @sel_v2f64(<2 x double> %v0, <2 x double> %v1) {
+; CHECK-LABEL: sel_v2f64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov v0.d[1], v1.d[1]
+; CHECK-NEXT:    ret
+  %tmp0 = shufflevector <2 x double> %v0, <2 x double> %v1, <2 x i32> <i32 0, i32 3>
+  ret <2 x double> %tmp0
+}
diff --git a/llvm/test/CodeGen/AArch64/sign-return-address-cfi-negate-ra-state.ll b/llvm/test/CodeGen/AArch64/sign-return-address-cfi-negate-ra-state.ll
index 4d4b7c2..6ea0728 100644
--- a/llvm/test/CodeGen/AArch64/sign-return-address-cfi-negate-ra-state.ll
+++ b/llvm/test/CodeGen/AArch64/sign-return-address-cfi-negate-ra-state.ll
@@ -9,8 +9,8 @@
 define dso_local i32 @_Z3fooi(i32 %x) #0 {
 ; CHECK-V8A-LABEL: _Z3fooi:
 ; CHECK-V8A:       // %bb.0: // %entry
-; CHECK-V8A-NEXT:    hint #25
 ; CHECK-V8A-NEXT:    .cfi_negate_ra_state
+; CHECK-V8A-NEXT:    hint #25
 ; CHECK-V8A-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK-V8A-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-V8A-NEXT:    .cfi_offset w30, -16
@@ -27,8 +27,8 @@ define dso_local i32 @_Z3fooi(i32 %x) #0 {
 ;
 ; CHECK-V83A-LABEL: _Z3fooi:
 ; CHECK-V83A:       // %bb.0: // %entry
-; CHECK-V83A-NEXT:    paciasp
 ; CHECK-V83A-NEXT:    .cfi_negate_ra_state
+; CHECK-V83A-NEXT:    paciasp
 ; CHECK-V83A-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK-V83A-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-V83A-NEXT:    .cfi_offset w30, -16
@@ -62,8 +62,8 @@ return:                                           ; No predecessors!
 define hidden noundef i32 @baz_async(i32 noundef %a) #0 uwtable(async) {
 ; CHECK-V8A-LABEL: baz_async:
 ; CHECK-V8A:       // %bb.0: // %entry
-; CHECK-V8A-NEXT:    hint #25
 ; CHECK-V8A-NEXT:    .cfi_negate_ra_state
+; CHECK-V8A-NEXT:    hint #25
 ; CHECK-V8A-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK-V8A-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-V8A-NEXT:    .cfi_offset w30, -16
@@ -74,8 +74,8 @@ define hidden noundef i32 @baz_async(i32 noundef %a) #0 uwtable(async) {
 ; CHECK-V8A-NEXT:    bl _Z3bari
 ; CHECK-V8A-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
 ; CHECK-V8A-NEXT:    .cfi_def_cfa_offset 0
-; CHECK-V8A-NEXT:    hint #29
 ; CHECK-V8A-NEXT:    .cfi_negate_ra_state
+; CHECK-V8A-NEXT:    hint #29
 ; CHECK-V8A-NEXT:    .cfi_restore w30
 ; CHECK-V8A-NEXT:    b _Z3bari
 ; CHECK-V8A-NEXT:  .LBB1_2: // %if.else
@@ -84,15 +84,15 @@ define hidden noundef i32 @baz_async(i32 noundef %a) #0 uwtable(async) {
 ; CHECK-V8A-NEXT:    add w0, w0, #1
 ; CHECK-V8A-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
 ; CHECK-V8A-NEXT:    .cfi_def_cfa_offset 0
-; CHECK-V8A-NEXT:    hint #29
 ; CHECK-V8A-NEXT:    .cfi_negate_ra_state
+; CHECK-V8A-NEXT:    hint #29
 ; CHECK-V8A-NEXT:    .cfi_restore w30
 ; CHECK-V8A-NEXT:    ret
 ;
 ; CHECK-V83A-LABEL: baz_async:
 ; CHECK-V83A:       // %bb.0: // %entry
-; CHECK-V83A-NEXT:    paciasp
 ; CHECK-V83A-NEXT:    .cfi_negate_ra_state
+; CHECK-V83A-NEXT:    paciasp
 ; CHECK-V83A-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK-V83A-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-V83A-NEXT:    .cfi_offset w30, -16
@@ -103,8 +103,8 @@ define hidden noundef i32 @baz_async(i32 noundef %a) #0 uwtable(async) {
 ; CHECK-V83A-NEXT:    bl _Z3bari
 ; CHECK-V83A-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
 ; CHECK-V83A-NEXT:    .cfi_def_cfa_offset 0
-; CHECK-V83A-NEXT:    autiasp
 ; CHECK-V83A-NEXT:    .cfi_negate_ra_state
+; CHECK-V83A-NEXT:    autiasp
 ; CHECK-V83A-NEXT:    .cfi_restore w30
 ; CHECK-V83A-NEXT:    b _Z3bari
 ; CHECK-V83A-NEXT:  .LBB1_2: // %if.else
@@ -143,8 +143,8 @@ return:                                           ; preds = %if.else, %if.then
 define hidden noundef i32 @baz_sync(i32 noundef %a) #0 uwtable(sync) {
 ; CHECK-V8A-LABEL: baz_sync:
 ; CHECK-V8A:       // %bb.0: // %entry
-; CHECK-V8A-NEXT:    hint #25
 ; CHECK-V8A-NEXT:    .cfi_negate_ra_state
+; CHECK-V8A-NEXT:    hint #25
 ; CHECK-V8A-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK-V8A-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-V8A-NEXT:    .cfi_offset w30, -16
@@ -164,8 +164,8 @@ define hidden noundef i32 @baz_sync(i32 noundef %a) #0 uwtable(sync) {
 ;
 ; CHECK-V83A-LABEL: baz_sync:
 ; CHECK-V83A:       // %bb.0: // %entry
-; CHECK-V83A-NEXT:    paciasp
 ; CHECK-V83A-NEXT:    .cfi_negate_ra_state
+; CHECK-V83A-NEXT:    paciasp
 ; CHECK-V83A-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK-V83A-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-V83A-NEXT:    .cfi_offset w30, -16
@@ -216,7 +216,7 @@ attributes #0 = { "sign-return-address"="all" }
 ; CHECK-DUMP-NOT: DW_CFA_remember_state
 ; CHECK-DUMP-NOT: DW_CFA_restore_state
 
-; CHECK-DUMP: CFA=WSP{{$}}
+; CHECK-DUMP: CFA=WSP
 ; CHECK-DUMP: reg34=1
 ; CHECK-DUMP-NOT: reg34=0
 
@@ -229,6 +229,7 @@ attributes #0 = { "sign-return-address"="all" }
 ; CHECK-DUMP:   DW_CFA_restore_state:
 ; CHECK-DUMP:   DW_CFA_AARCH64_negate_ra_state:
 
+; CHECK-DUMP: CFA=WSP
 ;; First DW_CFA_AARCH64_negate_ra_state:
 ; CHECK-DUMP: reg34=1
 ;; Second DW_CFA_AARCH64_negate_ra_state:
@@ -237,7 +238,6 @@ attributes #0 = { "sign-return-address"="all" }
 ; CHECK-DUMP: reg34=1
 ;; Third DW_CFA_AARCH64_negate_ra_state:
 ; CHECK-DUMP: reg34=0
-; CHECK-DUMP-NOT: reg34=1
 
 ; baz_sync
 ; CHECK-DUMP-LABEL: FDE
@@ -246,6 +246,6 @@ attributes #0 = { "sign-return-address"="all" }
 ; CHECK-DUMP-NOT: DW_CFA_remember_state
 ; CHECK-DUMP-NOT: DW_CFA_restore_state
 
-; CHECK-DUMP: CFA=WSP{{$}}
+; CHECK-DUMP: CFA=WSP
 ; CHECK-DUMP: reg34=1
 ; CHECK-DUMP-NOT: reg34=0
diff --git a/llvm/test/CodeGen/AArch64/sign-return-address-pauth-lr.ll b/llvm/test/CodeGen/AArch64/sign-return-address-pauth-lr.ll
index fa689d2..f37f122 100644
--- a/llvm/test/CodeGen/AArch64/sign-return-address-pauth-lr.ll
+++ b/llvm/test/CodeGen/AArch64/sign-return-address-pauth-lr.ll
@@ -60,9 +60,9 @@ define i32 @leaf_sign_all(i32 %x) "branch-protection-pauth-lr" "sign-return-addr
 ; COMPAT-LABEL: leaf_sign_all:
 ; COMPAT:       // %bb.0:
 ; COMPAT-NEXT:    hint #39
+; COMPAT-NEXT:    .cfi_negate_ra_state_with_pc
 ; COMPAT-NEXT:  .Ltmp0:
 ; COMPAT-NEXT:    hint #25
-; COMPAT-NEXT:    .cfi_negate_ra_state_with_pc
 ; COMPAT-NEXT:    adrp x16, .Ltmp0
 ; COMPAT-NEXT:    add x16, x16, :lo12:.Ltmp0
 ; COMPAT-NEXT:    hint #39
@@ -72,9 +72,9 @@ define i32 @leaf_sign_all(i32 %x) "branch-protection-pauth-lr" "sign-return-addr
 ; V83A-LABEL: leaf_sign_all:
 ; V83A:       // %bb.0:
 ; V83A-NEXT:    hint #39
+; V83A-NEXT:    .cfi_negate_ra_state_with_pc
 ; V83A-NEXT:  .Ltmp0:
 ; V83A-NEXT:    paciasp
-; V83A-NEXT:    .cfi_negate_ra_state_with_pc
 ; V83A-NEXT:    adrp x16, .Ltmp0
 ; V83A-NEXT:    add x16, x16, :lo12:.Ltmp0
 ; V83A-NEXT:    hint #39
@@ -82,9 +82,9 @@ define i32 @leaf_sign_all(i32 %x) "branch-protection-pauth-lr" "sign-return-addr
 ;
 ; PAUTHLR-LABEL: leaf_sign_all:
 ; PAUTHLR:       // %bb.0:
+; PAUTHLR-NEXT:    .cfi_negate_ra_state_with_pc
 ; PAUTHLR-NEXT:  .Ltmp0:
 ; PAUTHLR-NEXT:    paciasppc
-; PAUTHLR-NEXT:    .cfi_negate_ra_state_with_pc
 ; PAUTHLR-NEXT:    adrp x16, .Ltmp0
 ; PAUTHLR-NEXT:    add x16, x16, :lo12:.Ltmp0
 ; PAUTHLR-NEXT:    retaasppc .Ltmp0
@@ -95,9 +95,9 @@ define i64 @leaf_clobbers_lr(i64 %x) "branch-protection-pauth-lr" "sign-return-a
 ; COMPAT-LABEL: leaf_clobbers_lr:
 ; COMPAT:       // %bb.0:
 ; COMPAT-NEXT:    hint #39
+; COMPAT-NEXT:    .cfi_negate_ra_state_with_pc
 ; COMPAT-NEXT:  .Ltmp1:
 ; COMPAT-NEXT:    hint #25
-; COMPAT-NEXT:    .cfi_negate_ra_state_with_pc
 ; COMPAT-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
 ; COMPAT-NEXT:    .cfi_def_cfa_offset 16
 ; COMPAT-NEXT:    .cfi_offset w30, -16
@@ -114,9 +114,9 @@ define i64 @leaf_clobbers_lr(i64 %x) "branch-protection-pauth-lr" "sign-return-a
 ; V83A-LABEL: leaf_clobbers_lr:
 ; V83A:       // %bb.0:
 ; V83A-NEXT:    hint #39
+; V83A-NEXT:    .cfi_negate_ra_state_with_pc
 ; V83A-NEXT:  .Ltmp1:
 ; V83A-NEXT:    paciasp
-; V83A-NEXT:    .cfi_negate_ra_state_with_pc
 ; V83A-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
 ; V83A-NEXT:    .cfi_def_cfa_offset 16
 ; V83A-NEXT:    .cfi_offset w30, -16
@@ -131,9 +131,9 @@ define i64 @leaf_clobbers_lr(i64 %x) "branch-protection-pauth-lr" "sign-return-a
 ;
 ; PAUTHLR-LABEL: leaf_clobbers_lr:
 ; PAUTHLR:       // %bb.0:
+; PAUTHLR-NEXT:    .cfi_negate_ra_state_with_pc
 ; PAUTHLR-NEXT:  .Ltmp1:
 ; PAUTHLR-NEXT:    paciasppc
-; PAUTHLR-NEXT:    .cfi_negate_ra_state_with_pc
 ; PAUTHLR-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
 ; PAUTHLR-NEXT:    .cfi_def_cfa_offset 16
 ; PAUTHLR-NEXT:    .cfi_offset w30, -16
@@ -154,9 +154,9 @@ define i32 @non_leaf_sign_all(i32 %x) "branch-protection-pauth-lr" "sign-return-
 ; COMPAT-LABEL: non_leaf_sign_all:
 ; COMPAT:       // %bb.0:
 ; COMPAT-NEXT:    hint #39
+; COMPAT-NEXT:    .cfi_negate_ra_state_with_pc
 ; COMPAT-NEXT:  .Ltmp2:
 ; COMPAT-NEXT:    hint #25
-; COMPAT-NEXT:    .cfi_negate_ra_state_with_pc
 ; COMPAT-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
 ; COMPAT-NEXT:    .cfi_def_cfa_offset 16
 ; COMPAT-NEXT:    .cfi_offset w30, -16
@@ -171,9 +171,9 @@ define i32 @non_leaf_sign_all(i32 %x) "branch-protection-pauth-lr" "sign-return-
 ; V83A-LABEL: non_leaf_sign_all:
 ; V83A:       // %bb.0:
 ; V83A-NEXT:    hint #39
+; V83A-NEXT:    .cfi_negate_ra_state_with_pc
 ; V83A-NEXT:  .Ltmp2:
 ; V83A-NEXT:    paciasp
-; V83A-NEXT:    .cfi_negate_ra_state_with_pc
 ; V83A-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
 ; V83A-NEXT:    .cfi_def_cfa_offset 16
 ; V83A-NEXT:    .cfi_offset w30, -16
@@ -186,9 +186,9 @@ define i32 @non_leaf_sign_all(i32 %x) "branch-protection-pauth-lr" "sign-return-
 ;
 ; PAUTHLR-LABEL: non_leaf_sign_all:
 ; PAUTHLR:       // %bb.0:
+; PAUTHLR-NEXT:    .cfi_negate_ra_state_with_pc
 ; PAUTHLR-NEXT:  .Ltmp2:
 ; PAUTHLR-NEXT:    paciasppc
-; PAUTHLR-NEXT:    .cfi_negate_ra_state_with_pc
 ; PAUTHLR-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
 ; PAUTHLR-NEXT:    .cfi_def_cfa_offset 16
 ; PAUTHLR-NEXT:    .cfi_offset w30, -16
@@ -205,9 +205,9 @@ define i32 @non_leaf_sign_non_leaf(i32 %x) "branch-protection-pauth-lr" "sign-re
 ; COMPAT-LABEL: non_leaf_sign_non_leaf:
 ; COMPAT:       // %bb.0:
 ; COMPAT-NEXT:    hint #39
+; COMPAT-NEXT:    .cfi_negate_ra_state_with_pc
 ; COMPAT-NEXT:  .Ltmp3:
 ; COMPAT-NEXT:    hint #25
-; COMPAT-NEXT:    .cfi_negate_ra_state_with_pc
 ; COMPAT-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
 ; COMPAT-NEXT:    .cfi_def_cfa_offset 16
 ; COMPAT-NEXT:    .cfi_offset w30, -16
@@ -222,9 +222,9 @@ define i32 @non_leaf_sign_non_leaf(i32 %x) "branch-protection-pauth-lr" "sign-re
 ; V83A-LABEL: non_leaf_sign_non_leaf:
 ; V83A:       // %bb.0:
 ; V83A-NEXT:    hint #39
+; V83A-NEXT:    .cfi_negate_ra_state_with_pc
 ; V83A-NEXT:  .Ltmp3:
 ; V83A-NEXT:    paciasp
-; V83A-NEXT:    .cfi_negate_ra_state_with_pc
 ; V83A-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
 ; V83A-NEXT:    .cfi_def_cfa_offset 16
 ; V83A-NEXT:    .cfi_offset w30, -16
@@ -237,9 +237,9 @@ define i32 @non_leaf_sign_non_leaf(i32 %x) "branch-protection-pauth-lr" "sign-re
 ;
 ; PAUTHLR-LABEL: non_leaf_sign_non_leaf:
 ; PAUTHLR:       // %bb.0:
+; PAUTHLR-NEXT:    .cfi_negate_ra_state_with_pc
 ; PAUTHLR-NEXT:  .Ltmp3:
 ; PAUTHLR-NEXT:    paciasppc
-; PAUTHLR-NEXT:    .cfi_negate_ra_state_with_pc
 ; PAUTHLR-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
 ; PAUTHLR-NEXT:    .cfi_def_cfa_offset 16
 ; PAUTHLR-NEXT:    .cfi_offset w30, -16
@@ -259,9 +259,9 @@ define i32 @non_leaf_scs(i32 %x) "branch-protection-pauth-lr" "sign-return-addre
 ; CHECK-NEXT:    str x30, [x18], #8
 ; CHECK-NEXT:    .cfi_escape 0x16, 0x12, 0x02, 0x82, 0x78 //
 ; CHECK-NEXT:    hint #39
+; CHECK-NEXT:    .cfi_negate_ra_state_with_pc
 ; CHECK-NEXT:  .Ltmp4:
 ; CHECK-NEXT:    paciasp
-; CHECK-NEXT:    .cfi_negate_ra_state_with_pc
 ; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    .cfi_offset w30, -16
@@ -278,9 +278,9 @@ define i32 @non_leaf_scs(i32 %x) "branch-protection-pauth-lr" "sign-return-addre
 ; PAUTHLR:       // %bb.0:
 ; PAUTHLR-NEXT:    str x30, [x18], #8
 ; PAUTHLR-NEXT:    .cfi_escape 0x16, 0x12, 0x02, 0x82, 0x78 //
+; PAUTHLR-NEXT:    .cfi_negate_ra_state_with_pc
 ; PAUTHLR-NEXT:  .Ltmp4:
 ; PAUTHLR-NEXT:    paciasppc
-; PAUTHLR-NEXT:    .cfi_negate_ra_state_with_pc
 ; PAUTHLR-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
 ; PAUTHLR-NEXT:    .cfi_def_cfa_offset 16
 ; PAUTHLR-NEXT:    .cfi_offset w30, -16
@@ -299,9 +299,9 @@ define i32 @leaf_sign_all_v83(i32 %x) "branch-protection-pauth-lr" "sign-return-
 ; CHECK-LABEL: leaf_sign_all_v83:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    hint #39
+; CHECK-NEXT:    .cfi_negate_ra_state_with_pc
 ; CHECK-NEXT:  .Ltmp5:
 ; CHECK-NEXT:    paciasp
-; CHECK-NEXT:    .cfi_negate_ra_state_with_pc
 ; CHECK-NEXT:    adrp x16, .Ltmp5
 ; CHECK-NEXT:    add x16, x16, :lo12:.Ltmp5
 ; CHECK-NEXT:    hint #39
@@ -309,9 +309,9 @@ define i32 @leaf_sign_all_v83(i32 %x) "branch-protection-pauth-lr" "sign-return-
 ;
 ; PAUTHLR-LABEL: leaf_sign_all_v83:
 ; PAUTHLR:       // %bb.0:
+; PAUTHLR-NEXT:    .cfi_negate_ra_state_with_pc
 ; PAUTHLR-NEXT:  .Ltmp5:
 ; PAUTHLR-NEXT:    paciasppc
-; PAUTHLR-NEXT:    .cfi_negate_ra_state_with_pc
 ; PAUTHLR-NEXT:    adrp x16, .Ltmp5
 ; PAUTHLR-NEXT:    add x16, x16, :lo12:.Ltmp5
 ; PAUTHLR-NEXT:    retaasppc .Ltmp5
@@ -324,9 +324,9 @@ define fastcc void @spill_lr_and_tail_call(i64 %x) "branch-protection-pauth-lr"
 ; COMPAT-LABEL: spill_lr_and_tail_call:
 ; COMPAT:       // %bb.0:
 ; COMPAT-NEXT:    hint #39
+; COMPAT-NEXT:    .cfi_negate_ra_state_with_pc
 ; COMPAT-NEXT:  .Ltmp6:
 ; COMPAT-NEXT:    hint #25
-; COMPAT-NEXT:    .cfi_negate_ra_state_with_pc
 ; COMPAT-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
 ; COMPAT-NEXT:    .cfi_def_cfa_offset 16
 ; COMPAT-NEXT:    .cfi_offset w30, -16
@@ -343,9 +343,9 @@ define fastcc void @spill_lr_and_tail_call(i64 %x) "branch-protection-pauth-lr"
 ; V83A-LABEL: spill_lr_and_tail_call:
 ; V83A:       // %bb.0:
 ; V83A-NEXT:    hint #39
+; V83A-NEXT:    .cfi_negate_ra_state_with_pc
 ; V83A-NEXT:  .Ltmp6:
 ; V83A-NEXT:    paciasp
-; V83A-NEXT:    .cfi_negate_ra_state_with_pc
 ; V83A-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
 ; V83A-NEXT:    .cfi_def_cfa_offset 16
 ; V83A-NEXT:    .cfi_offset w30, -16
@@ -361,9 +361,9 @@ define fastcc void @spill_lr_and_tail_call(i64 %x) "branch-protection-pauth-lr"
 ;
 ; PAUTHLR-LABEL: spill_lr_and_tail_call:
 ; PAUTHLR:       // %bb.0:
+; PAUTHLR-NEXT:    .cfi_negate_ra_state_with_pc
 ; PAUTHLR-NEXT:  .Ltmp6:
 ; PAUTHLR-NEXT:    paciasppc
-; PAUTHLR-NEXT:    .cfi_negate_ra_state_with_pc
 ; PAUTHLR-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
 ; PAUTHLR-NEXT:    .cfi_def_cfa_offset 16
 ; PAUTHLR-NEXT:    .cfi_offset w30, -16
@@ -384,9 +384,9 @@ define i32 @leaf_sign_all_a_key(i32 %x) "branch-protection-pauth-lr" "sign-retur
 ; COMPAT-LABEL: leaf_sign_all_a_key:
 ; COMPAT:       // %bb.0:
 ; COMPAT-NEXT:    hint #39
+; COMPAT-NEXT:    .cfi_negate_ra_state_with_pc
 ; COMPAT-NEXT:  .Ltmp7:
 ; COMPAT-NEXT:    hint #25
-; COMPAT-NEXT:    .cfi_negate_ra_state_with_pc
 ; COMPAT-NEXT:    adrp x16, .Ltmp7
 ; COMPAT-NEXT:    add x16, x16, :lo12:.Ltmp7
 ; COMPAT-NEXT:    hint #39
@@ -396,9 +396,9 @@ define i32 @leaf_sign_all_a_key(i32 %x) "branch-protection-pauth-lr" "sign-retur
 ; V83A-LABEL: leaf_sign_all_a_key:
 ; V83A:       // %bb.0:
 ; V83A-NEXT:    hint #39
+; V83A-NEXT:    .cfi_negate_ra_state_with_pc
 ; V83A-NEXT:  .Ltmp7:
 ; V83A-NEXT:    paciasp
-; V83A-NEXT:    .cfi_negate_ra_state_with_pc
 ; V83A-NEXT:    adrp x16, .Ltmp7
 ; V83A-NEXT:    add x16, x16, :lo12:.Ltmp7
 ; V83A-NEXT:    hint #39
@@ -406,9 +406,9 @@ define i32 @leaf_sign_all_a_key(i32 %x) "branch-protection-pauth-lr" "sign-retur
 ;
 ; PAUTHLR-LABEL: leaf_sign_all_a_key:
 ; PAUTHLR:       // %bb.0:
+; PAUTHLR-NEXT:    .cfi_negate_ra_state_with_pc
 ; PAUTHLR-NEXT:  .Ltmp7:
 ; PAUTHLR-NEXT:    paciasppc
-; PAUTHLR-NEXT:    .cfi_negate_ra_state_with_pc
 ; PAUTHLR-NEXT:    adrp x16, .Ltmp7
 ; PAUTHLR-NEXT:    add x16, x16, :lo12:.Ltmp7
 ; PAUTHLR-NEXT:    retaasppc .Ltmp7
@@ -420,9 +420,9 @@ define i32 @leaf_sign_all_b_key(i32 %x) "branch-protection-pauth-lr" "sign-retur
 ; COMPAT:       // %bb.0:
 ; COMPAT-NEXT:    .cfi_b_key_frame
 ; COMPAT-NEXT:    hint #39
+; COMPAT-NEXT:    .cfi_negate_ra_state_with_pc
 ; COMPAT-NEXT:  .Ltmp8:
 ; COMPAT-NEXT:    hint #27
-; COMPAT-NEXT:    .cfi_negate_ra_state_with_pc
 ; COMPAT-NEXT:    adrp x16, .Ltmp8
 ; COMPAT-NEXT:    add x16, x16, :lo12:.Ltmp8
 ; COMPAT-NEXT:    hint #39
@@ -433,9 +433,9 @@ define i32 @leaf_sign_all_b_key(i32 %x) "branch-protection-pauth-lr" "sign-retur
 ; V83A:       // %bb.0:
 ; V83A-NEXT:    .cfi_b_key_frame
 ; V83A-NEXT:    hint #39
+; V83A-NEXT:    .cfi_negate_ra_state_with_pc
 ; V83A-NEXT:  .Ltmp8:
 ; V83A-NEXT:    pacibsp
-; V83A-NEXT:    .cfi_negate_ra_state_with_pc
 ; V83A-NEXT:    adrp x16, .Ltmp8
 ; V83A-NEXT:    add x16, x16, :lo12:.Ltmp8
 ; V83A-NEXT:    hint #39
@@ -444,9 +444,9 @@ define i32 @leaf_sign_all_b_key(i32 %x) "branch-protection-pauth-lr" "sign-retur
 ; PAUTHLR-LABEL: leaf_sign_all_b_key:
 ; PAUTHLR:       // %bb.0:
 ; PAUTHLR-NEXT:    .cfi_b_key_frame
+; PAUTHLR-NEXT:    .cfi_negate_ra_state_with_pc
 ; PAUTHLR-NEXT:  .Ltmp8:
 ; PAUTHLR-NEXT:    pacibsppc
-; PAUTHLR-NEXT:    .cfi_negate_ra_state_with_pc
 ; PAUTHLR-NEXT:    adrp x16, .Ltmp8
 ; PAUTHLR-NEXT:    add x16, x16, :lo12:.Ltmp8
 ; PAUTHLR-NEXT:    retabsppc .Ltmp8
@@ -458,9 +458,9 @@ define i32 @leaf_sign_all_v83_b_key(i32 %x) "branch-protection-pauth-lr" "sign-r
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    .cfi_b_key_frame
 ; CHECK-NEXT:    hint #39
+; CHECK-NEXT:    .cfi_negate_ra_state_with_pc
 ; CHECK-NEXT:  .Ltmp9:
 ; CHECK-NEXT:    pacibsp
-; CHECK-NEXT:    .cfi_negate_ra_state_with_pc
 ; CHECK-NEXT:    adrp x16, .Ltmp9
 ; CHECK-NEXT:    add x16, x16, :lo12:.Ltmp9
 ; CHECK-NEXT:    hint #39
@@ -469,9 +469,9 @@ define i32 @leaf_sign_all_v83_b_key(i32 %x) "branch-protection-pauth-lr" "sign-r
 ; PAUTHLR-LABEL: leaf_sign_all_v83_b_key:
 ; PAUTHLR:       // %bb.0:
 ; PAUTHLR-NEXT:    .cfi_b_key_frame
+; PAUTHLR-NEXT:    .cfi_negate_ra_state_with_pc
 ; PAUTHLR-NEXT:  .Ltmp9:
 ; PAUTHLR-NEXT:    pacibsppc
-; PAUTHLR-NEXT:    .cfi_negate_ra_state_with_pc
 ; PAUTHLR-NEXT:    adrp x16, .Ltmp9
 ; PAUTHLR-NEXT:    add x16, x16, :lo12:.Ltmp9
 ; PAUTHLR-NEXT:    retabsppc .Ltmp9
@@ -484,9 +484,9 @@ define i32 @leaf_sign_all_a_key_bti(i32 %x) "branch-protection-pauth-lr" "sign-r
 ; COMPAT:       // %bb.0:
 ; COMPAT-NEXT:    hint #34
 ; COMPAT-NEXT:    hint #39
+; COMPAT-NEXT:    .cfi_negate_ra_state_with_pc
 ; COMPAT-NEXT:  .Ltmp10:
 ; COMPAT-NEXT:    hint #25
-; COMPAT-NEXT:    .cfi_negate_ra_state_with_pc
 ; COMPAT-NEXT:    adrp x16, .Ltmp10
 ; COMPAT-NEXT:    add x16, x16, :lo12:.Ltmp10
 ; COMPAT-NEXT:    hint #39
@@ -497,9 +497,9 @@ define i32 @leaf_sign_all_a_key_bti(i32 %x) "branch-protection-pauth-lr" "sign-r
 ; V83A:       // %bb.0:
 ; V83A-NEXT:    hint #34
 ; V83A-NEXT:    hint #39
+; V83A-NEXT:    .cfi_negate_ra_state_with_pc
 ; V83A-NEXT:  .Ltmp10:
 ; V83A-NEXT:    paciasp
-; V83A-NEXT:    .cfi_negate_ra_state_with_pc
 ; V83A-NEXT:    adrp x16, .Ltmp10
 ; V83A-NEXT:    add x16, x16, :lo12:.Ltmp10
 ; V83A-NEXT:    hint #39
@@ -508,9 +508,9 @@ define i32 @leaf_sign_all_a_key_bti(i32 %x) "branch-protection-pauth-lr" "sign-r
 ; PAUTHLR-LABEL: leaf_sign_all_a_key_bti:
 ; PAUTHLR:       // %bb.0:
 ; PAUTHLR-NEXT:    bti c
+; PAUTHLR-NEXT:    .cfi_negate_ra_state_with_pc
 ; PAUTHLR-NEXT:  .Ltmp10:
 ; PAUTHLR-NEXT:    paciasppc
-; PAUTHLR-NEXT:    .cfi_negate_ra_state_with_pc
 ; PAUTHLR-NEXT:    adrp x16, .Ltmp10
 ; PAUTHLR-NEXT:    add x16, x16, :lo12:.Ltmp10
 ; PAUTHLR-NEXT:    retaasppc .Ltmp10
@@ -524,9 +524,9 @@ define i32 @leaf_sign_all_b_key_bti(i32 %x) "branch-protection-pauth-lr" "sign-r
 ; COMPAT-NEXT:    hint #34
 ; COMPAT-NEXT:    .cfi_b_key_frame
 ; COMPAT-NEXT:    hint #39
+; COMPAT-NEXT:    .cfi_negate_ra_state_with_pc
 ; COMPAT-NEXT:  .Ltmp11:
 ; COMPAT-NEXT:    hint #27
-; COMPAT-NEXT:    .cfi_negate_ra_state_with_pc
 ; COMPAT-NEXT:    adrp x16, .Ltmp11
 ; COMPAT-NEXT:    add x16, x16, :lo12:.Ltmp11
 ; COMPAT-NEXT:    hint #39
@@ -538,9 +538,9 @@ define i32 @leaf_sign_all_b_key_bti(i32 %x) "branch-protection-pauth-lr" "sign-r
 ; V83A-NEXT:    hint #34
 ; V83A-NEXT:    .cfi_b_key_frame
 ; V83A-NEXT:    hint #39
+; V83A-NEXT:    .cfi_negate_ra_state_with_pc
 ; V83A-NEXT:  .Ltmp11:
 ; V83A-NEXT:    pacibsp
-; V83A-NEXT:    .cfi_negate_ra_state_with_pc
 ; V83A-NEXT:    adrp x16, .Ltmp11
 ; V83A-NEXT:    add x16, x16, :lo12:.Ltmp11
 ; V83A-NEXT:    hint #39
@@ -550,9 +550,9 @@ define i32 @leaf_sign_all_b_key_bti(i32 %x) "branch-protection-pauth-lr" "sign-r
 ; PAUTHLR:       // %bb.0:
 ; PAUTHLR-NEXT:    bti c
 ; PAUTHLR-NEXT:    .cfi_b_key_frame
+; PAUTHLR-NEXT:    .cfi_negate_ra_state_with_pc
 ; PAUTHLR-NEXT:  .Ltmp11:
 ; PAUTHLR-NEXT:    pacibsppc
-; PAUTHLR-NEXT:    .cfi_negate_ra_state_with_pc
 ; PAUTHLR-NEXT:    adrp x16, .Ltmp11
 ; PAUTHLR-NEXT:    add x16, x16, :lo12:.Ltmp11
 ; PAUTHLR-NEXT:    retabsppc .Ltmp11
@@ -566,9 +566,9 @@ define i32 @leaf_sign_all_v83_b_key_bti(i32 %x) "branch-protection-pauth-lr" "si
 ; CHECK-NEXT:    hint #34
 ; CHECK-NEXT:    .cfi_b_key_frame
 ; CHECK-NEXT:    hint #39
+; CHECK-NEXT:    .cfi_negate_ra_state_with_pc
 ; CHECK-NEXT:  .Ltmp12:
 ; CHECK-NEXT:    pacibsp
-; CHECK-NEXT:    .cfi_negate_ra_state_with_pc
 ; CHECK-NEXT:    adrp x16, .Ltmp12
 ; CHECK-NEXT:    add x16, x16, :lo12:.Ltmp12
 ; CHECK-NEXT:    hint #39
@@ -578,9 +578,9 @@ define i32 @leaf_sign_all_v83_b_key_bti(i32 %x) "branch-protection-pauth-lr" "si
 ; PAUTHLR:       // %bb.0:
 ; PAUTHLR-NEXT:    bti c
 ; PAUTHLR-NEXT:    .cfi_b_key_frame
+; PAUTHLR-NEXT:    .cfi_negate_ra_state_with_pc
 ; PAUTHLR-NEXT:  .Ltmp12:
 ; PAUTHLR-NEXT:    pacibsppc
-; PAUTHLR-NEXT:    .cfi_negate_ra_state_with_pc
 ; PAUTHLR-NEXT:    adrp x16, .Ltmp12
 ; PAUTHLR-NEXT:    add x16, x16, :lo12:.Ltmp12
 ; PAUTHLR-NEXT:    retabsppc .Ltmp12
diff --git a/llvm/test/CodeGen/AArch64/sign-return-address.ll b/llvm/test/CodeGen/AArch64/sign-return-address.ll
index dafe0d7..e0ee0d8 100644
--- a/llvm/test/CodeGen/AArch64/sign-return-address.ll
+++ b/llvm/test/CodeGen/AArch64/sign-return-address.ll
@@ -29,15 +29,15 @@ define i32 @leaf_sign_non_leaf(i32 %x) "sign-return-address"="non-leaf"  {
 define i32 @leaf_sign_all(i32 %x) "sign-return-address"="all" {
 ; COMPAT-LABEL: leaf_sign_all:
 ; COMPAT:       // %bb.0:
-; COMPAT-NEXT:    hint #25
 ; COMPAT-NEXT:    .cfi_negate_ra_state
+; COMPAT-NEXT:    hint #25
 ; COMPAT-NEXT:    hint #29
 ; COMPAT-NEXT:    ret
 ;
 ; V83A-LABEL: leaf_sign_all:
 ; V83A:       // %bb.0:
-; V83A-NEXT:    paciasp
 ; V83A-NEXT:    .cfi_negate_ra_state
+; V83A-NEXT:    paciasp
 ; V83A-NEXT:    retaa
   ret i32 %x
 }
@@ -45,8 +45,8 @@ define i32 @leaf_sign_all(i32 %x) "sign-return-address"="all" {
 define i64 @leaf_clobbers_lr(i64 %x) "sign-return-address"="non-leaf"  {
 ; COMPAT-LABEL: leaf_clobbers_lr:
 ; COMPAT:       // %bb.0:
-; COMPAT-NEXT:    hint #25
 ; COMPAT-NEXT:    .cfi_negate_ra_state
+; COMPAT-NEXT:    hint #25
 ; COMPAT-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
 ; COMPAT-NEXT:    .cfi_def_cfa_offset 16
 ; COMPAT-NEXT:    .cfi_offset w30, -16
@@ -59,8 +59,8 @@ define i64 @leaf_clobbers_lr(i64 %x) "sign-return-address"="non-leaf"  {
 ;
 ; V83A-LABEL: leaf_clobbers_lr:
 ; V83A:       // %bb.0:
-; V83A-NEXT:    paciasp
 ; V83A-NEXT:    .cfi_negate_ra_state
+; V83A-NEXT:    paciasp
 ; V83A-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
 ; V83A-NEXT:    .cfi_def_cfa_offset 16
 ; V83A-NEXT:    .cfi_offset w30, -16
@@ -78,8 +78,8 @@ declare i32 @foo(i32)
 define i32 @non_leaf_sign_all(i32 %x) "sign-return-address"="all" {
 ; COMPAT-LABEL: non_leaf_sign_all:
 ; COMPAT:       // %bb.0:
-; COMPAT-NEXT:    hint #25
 ; COMPAT-NEXT:    .cfi_negate_ra_state
+; COMPAT-NEXT:    hint #25
 ; COMPAT-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
 ; COMPAT-NEXT:    .cfi_def_cfa_offset 16
 ; COMPAT-NEXT:    .cfi_offset w30, -16
@@ -90,8 +90,8 @@ define i32 @non_leaf_sign_all(i32 %x) "sign-return-address"="all" {
 ;
 ; V83A-LABEL: non_leaf_sign_all:
 ; V83A:       // %bb.0:
-; V83A-NEXT:    paciasp
 ; V83A-NEXT:    .cfi_negate_ra_state
+; V83A-NEXT:    paciasp
 ; V83A-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
 ; V83A-NEXT:    .cfi_def_cfa_offset 16
 ; V83A-NEXT:    .cfi_offset w30, -16
@@ -105,8 +105,8 @@ define i32 @non_leaf_sign_all(i32 %x) "sign-return-address"="all" {
 define i32 @non_leaf_sign_non_leaf(i32 %x) "sign-return-address"="non-leaf"  {
 ; COMPAT-LABEL: non_leaf_sign_non_leaf:
 ; COMPAT:       // %bb.0:
-; COMPAT-NEXT:    hint #25
 ; COMPAT-NEXT:    .cfi_negate_ra_state
+; COMPAT-NEXT:    hint #25
 ; COMPAT-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
 ; COMPAT-NEXT:    .cfi_def_cfa_offset 16
 ; COMPAT-NEXT:    .cfi_offset w30, -16
@@ -117,8 +117,8 @@ define i32 @non_leaf_sign_non_leaf(i32 %x) "sign-return-address"="non-leaf"  {
 ;
 ; V83A-LABEL: non_leaf_sign_non_leaf:
 ; V83A:       // %bb.0:
-; V83A-NEXT:    paciasp
 ; V83A-NEXT:    .cfi_negate_ra_state
+; V83A-NEXT:    paciasp
 ; V83A-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
 ; V83A-NEXT:    .cfi_def_cfa_offset 16
 ; V83A-NEXT:    .cfi_offset w30, -16
@@ -135,8 +135,8 @@ define i32 @non_leaf_scs(i32 %x) "sign-return-address"="non-leaf" shadowcallstac
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    str x30, [x18], #8
 ; CHECK-NEXT:    .cfi_escape 0x16, 0x12, 0x02, 0x82, 0x78 //
-; CHECK-NEXT:    paciasp
 ; CHECK-NEXT:    .cfi_negate_ra_state
+; CHECK-NEXT:    paciasp
 ; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    .cfi_offset w30, -16
@@ -152,8 +152,8 @@ define i32 @non_leaf_scs(i32 %x) "sign-return-address"="non-leaf" shadowcallstac
 define i32 @leaf_sign_all_v83(i32 %x) "sign-return-address"="all" "target-features"="+v8.3a" {
 ; CHECK-LABEL: leaf_sign_all_v83:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    paciasp
 ; CHECK-NEXT:    .cfi_negate_ra_state
+; CHECK-NEXT:    paciasp
 ; CHECK-NEXT:    retaa
   ret i32 %x
 }
@@ -163,8 +163,8 @@ declare fastcc i64 @bar(i64)
 define fastcc void @spill_lr_and_tail_call(i64 %x) "sign-return-address"="all" {
 ; COMPAT-LABEL: spill_lr_and_tail_call:
 ; COMPAT:       // %bb.0:
-; COMPAT-NEXT:    hint #25
 ; COMPAT-NEXT:    .cfi_negate_ra_state
+; COMPAT-NEXT:    hint #25
 ; COMPAT-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
 ; COMPAT-NEXT:    .cfi_def_cfa_offset 16
 ; COMPAT-NEXT:    .cfi_offset w30, -16
@@ -177,8 +177,8 @@ define fastcc void @spill_lr_and_tail_call(i64 %x) "sign-return-address"="all" {
 ;
 ; V83A-LABEL: spill_lr_and_tail_call:
 ; V83A:       // %bb.0:
-; V83A-NEXT:    paciasp
 ; V83A-NEXT:    .cfi_negate_ra_state
+; V83A-NEXT:    paciasp
 ; V83A-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
 ; V83A-NEXT:    .cfi_def_cfa_offset 16
 ; V83A-NEXT:    .cfi_offset w30, -16
@@ -196,15 +196,15 @@ define fastcc void @spill_lr_and_tail_call(i64 %x) "sign-return-address"="all" {
 define i32 @leaf_sign_all_a_key(i32 %x) "sign-return-address"="all" "sign-return-address-key"="a_key" {
 ; COMPAT-LABEL: leaf_sign_all_a_key:
 ; COMPAT:       // %bb.0:
-; COMPAT-NEXT:    hint #25
 ; COMPAT-NEXT:    .cfi_negate_ra_state
+; COMPAT-NEXT:    hint #25
 ; COMPAT-NEXT:    hint #29
 ; COMPAT-NEXT:    ret
 ;
 ; V83A-LABEL: leaf_sign_all_a_key:
 ; V83A:       // %bb.0:
-; V83A-NEXT:    paciasp
 ; V83A-NEXT:    .cfi_negate_ra_state
+; V83A-NEXT:    paciasp
 ; V83A-NEXT:    retaa
   ret i32 %x
 }
@@ -213,16 +213,16 @@ define i32 @leaf_sign_all_b_key(i32 %x) "sign-return-address"="all" "sign-return
 ; COMPAT-LABEL: leaf_sign_all_b_key:
 ; COMPAT:       // %bb.0:
 ; COMPAT-NEXT:    .cfi_b_key_frame
-; COMPAT-NEXT:    hint #27
 ; COMPAT-NEXT:    .cfi_negate_ra_state
+; COMPAT-NEXT:    hint #27
 ; COMPAT-NEXT:    hint #31
 ; COMPAT-NEXT:    ret
 ;
 ; V83A-LABEL: leaf_sign_all_b_key:
 ; V83A:       // %bb.0:
 ; V83A-NEXT:    .cfi_b_key_frame
-; V83A-NEXT:    pacibsp
 ; V83A-NEXT:    .cfi_negate_ra_state
+; V83A-NEXT:    pacibsp
 ; V83A-NEXT:    retab
   ret i32 %x
 }
@@ -231,8 +231,8 @@ define i32 @leaf_sign_all_v83_b_key(i32 %x) "sign-return-address"="all" "target-
 ; CHECK-LABEL: leaf_sign_all_v83_b_key:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    .cfi_b_key_frame
-; CHECK-NEXT:    pacibsp
 ; CHECK-NEXT:    .cfi_negate_ra_state
+; CHECK-NEXT:    pacibsp
 ; CHECK-NEXT:    retab
   ret i32 %x
 }
@@ -241,15 +241,15 @@ define i32 @leaf_sign_all_v83_b_key(i32 %x) "sign-return-address"="all" "target-
 define i32 @leaf_sign_all_a_key_bti(i32 %x) "sign-return-address"="all" "sign-return-address-key"="a_key" "branch-target-enforcement"{
 ; COMPAT-LABEL: leaf_sign_all_a_key_bti:
 ; COMPAT:       // %bb.0:
-; COMPAT-NEXT:    hint #25
 ; COMPAT-NEXT:    .cfi_negate_ra_state
+; COMPAT-NEXT:    hint #25
 ; COMPAT-NEXT:    hint #29
 ; COMPAT-NEXT:    ret
 ;
 ; V83A-LABEL: leaf_sign_all_a_key_bti:
 ; V83A:       // %bb.0:
-; V83A-NEXT:    paciasp
 ; V83A-NEXT:    .cfi_negate_ra_state
+; V83A-NEXT:    paciasp
 ; V83A-NEXT:    retaa
   ret i32 %x
 }
@@ -259,16 +259,16 @@ define i32 @leaf_sign_all_b_key_bti(i32 %x) "sign-return-address"="all" "sign-re
 ; COMPAT-LABEL: leaf_sign_all_b_key_bti:
 ; COMPAT:       // %bb.0:
 ; COMPAT-NEXT:    .cfi_b_key_frame
-; COMPAT-NEXT:    hint #27
 ; COMPAT-NEXT:    .cfi_negate_ra_state
+; COMPAT-NEXT:    hint #27
 ; COMPAT-NEXT:    hint #31
 ; COMPAT-NEXT:    ret
 ;
 ; V83A-LABEL: leaf_sign_all_b_key_bti:
 ; V83A:       // %bb.0:
 ; V83A-NEXT:    .cfi_b_key_frame
-; V83A-NEXT:    pacibsp
 ; V83A-NEXT:    .cfi_negate_ra_state
+; V83A-NEXT:    pacibsp
 ; V83A-NEXT:    retab
   ret i32 %x
 }
@@ -278,8 +278,8 @@ define i32 @leaf_sign_all_v83_b_key_bti(i32 %x) "sign-return-address"="all" "tar
 ; CHECK-LABEL: leaf_sign_all_v83_b_key_bti:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    .cfi_b_key_frame
-; CHECK-NEXT:    pacibsp
 ; CHECK-NEXT:    .cfi_negate_ra_state
+; CHECK-NEXT:    pacibsp
 ; CHECK-NEXT:    retab
   ret i32 %x
 }
diff --git a/llvm/test/CodeGen/AArch64/sme-intrinsics-state.ll b/llvm/test/CodeGen/AArch64/sme-intrinsics-state.ll
new file mode 100644
index 0000000..5037772
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sme-intrinsics-state.ll
@@ -0,0 +1,18 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme -verify-machineinstrs < %s | FileCheck %s
+
+
+define i1 @streaming_mode_streaming_compatible() #0 {
+; CHECK-LABEL: streaming_mode_streaming_compatible:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    bl __arm_sme_state
+; CHECK-NEXT:    and w0, w0, #0x1
+; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+  %mode = tail call noundef i1 @llvm.aarch64.sme.in.streaming.mode()
+  ret i1 %mode
+}
+
+
+attributes #0 = {nounwind memory(none) "aarch64_pstate_sm_compatible"}
diff --git a/llvm/test/CodeGen/AArch64/sve-partial-reduce-dot-product.ll b/llvm/test/CodeGen/AArch64/sve-partial-reduce-dot-product.ll
index 66d6e03..66f83c6 100644
--- a/llvm/test/CodeGen/AArch64/sve-partial-reduce-dot-product.ll
+++ b/llvm/test/CodeGen/AArch64/sve-partial-reduce-dot-product.ll
@@ -316,6 +316,84 @@ entry:
   ret <vscale x 4 x i64> %partial.reduce
 }
 
+define <vscale x 4 x i32> @udot_no_bin_op(<vscale x 4 x i32> %acc, <vscale x 16 x i8> %a){
+; CHECK-LABEL: udot_no_bin_op:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z2.b, #1 // =0x1
+; CHECK-NEXT:    udot z0.s, z1.b, z2.b
+; CHECK-NEXT:    ret
+  %a.ext = zext <vscale x 16 x i8> %a to <vscale x 16 x i32>
+  %partial.reduce = tail call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> %acc, <vscale x 16 x i32> %a.ext)
+  ret <vscale x 4 x i32> %partial.reduce
+}
+
+define <vscale x 4 x i32> @sdot_no_bin_op(<vscale x 4 x i32> %acc, <vscale x 16 x i8> %a){
+; CHECK-LABEL: sdot_no_bin_op:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z2.b, #1 // =0x1
+; CHECK-NEXT:    sdot z0.s, z1.b, z2.b
+; CHECK-NEXT:    ret
+  %a.ext = sext <vscale x 16 x i8> %a to <vscale x 16 x i32>
+  %partial.reduce = tail call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> %acc, <vscale x 16 x i32> %a.ext)
+  ret <vscale x 4 x i32> %partial.reduce
+}
+
+define <vscale x 2 x i64> @udot_no_bin_op_wide(<vscale x 2 x i64> %acc, <vscale x 8 x i16> %a, <vscale x 8 x i16> %b){
+; CHECK-LABEL: udot_no_bin_op_wide:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mov z2.h, #1 // =0x1
+; CHECK-NEXT:    udot z0.d, z1.h, z2.h
+; CHECK-NEXT:    ret
+entry:
+  %a.wide = zext <vscale x 8 x i16> %a to <vscale x 8 x i64>
+  %partial.reduce = tail call <vscale x 2 x i64> @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64(<vscale x 2 x i64> %acc, <vscale x 8 x i64> %a.wide)
+  ret <vscale x 2 x i64> %partial.reduce
+}
+
+define <vscale x 2 x i64> @sdot_no_bin_op_wide(<vscale x 2 x i64> %acc, <vscale x 8 x i16> %a, <vscale x 8 x i16> %b){
+; CHECK-LABEL: sdot_no_bin_op_wide:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mov z2.h, #1 // =0x1
+; CHECK-NEXT:    sdot z0.d, z1.h, z2.h
+; CHECK-NEXT:    ret
+entry:
+  %a.wide = sext <vscale x 8 x i16> %a to <vscale x 8 x i64>
+  %partial.reduce = tail call <vscale x 2 x i64> @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64(<vscale x 2 x i64> %acc, <vscale x 8 x i64> %a.wide)
+  ret <vscale x 2 x i64> %partial.reduce
+}
+
+define <vscale x 4 x i64> @udot_no_bin_op_8to64(<vscale x 4 x i64> %acc, <vscale x 16 x i8> %a){
+; CHECK-LABEL: udot_no_bin_op_8to64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z3.b, #1 // =0x1
+; CHECK-NEXT:    mov z4.s, #0 // =0x0
+; CHECK-NEXT:    udot z4.s, z2.b, z3.b
+; CHECK-NEXT:    sunpklo z2.d, z4.s
+; CHECK-NEXT:    sunpkhi z3.d, z4.s
+; CHECK-NEXT:    add z0.d, z0.d, z2.d
+; CHECK-NEXT:    add z1.d, z1.d, z3.d
+; CHECK-NEXT:    ret
+  %a.ext = zext <vscale x 16 x i8> %a to <vscale x 16 x i64>
+  %partial.reduce = tail call <vscale x 4 x i64> @llvm.experimental.vector.partial.reduce.add.nxv4i64.nxv16i64(<vscale x 4 x i64> %acc, <vscale x 16 x i64> %a.ext)
+  ret <vscale x 4 x i64> %partial.reduce
+}
+
+define <vscale x 4 x i64> @sdot_no_bin_op_8to64(<vscale x 4 x i64> %acc, <vscale x 16 x i8> %a){
+; CHECK-LABEL: sdot_no_bin_op_8to64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z3.b, #1 // =0x1
+; CHECK-NEXT:    mov z4.s, #0 // =0x0
+; CHECK-NEXT:    sdot z4.s, z2.b, z3.b
+; CHECK-NEXT:    sunpklo z2.d, z4.s
+; CHECK-NEXT:    sunpkhi z3.d, z4.s
+; CHECK-NEXT:    add z0.d, z0.d, z2.d
+; CHECK-NEXT:    add z1.d, z1.d, z3.d
+; CHECK-NEXT:    ret
+  %a.ext = sext <vscale x 16 x i8> %a to <vscale x 16 x i64>
+  %partial.reduce = tail call <vscale x 4 x i64> @llvm.experimental.vector.partial.reduce.add.nxv4i64.nxv16i64(<vscale x 4 x i64> %acc, <vscale x 16 x i64> %a.ext)
+  ret <vscale x 4 x i64> %partial.reduce
+}
+
 define <vscale x 4 x i32> @not_udot(<vscale x 4 x i32> %acc, <vscale x 8 x i8> %a, <vscale x 8 x i8> %b) {
 ; CHECK-LABEL: not_udot:
 ; CHECK:       // %bb.0: // %entry
@@ -419,3 +497,133 @@ entry:
   %partial.reduce = tail call <vscale x 2 x i64> @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64(<vscale x 2 x i64> %acc, <vscale x 8 x i64> %mult)
   ret <vscale x 2 x i64> %partial.reduce
 }
+
+define <vscale x 2 x i64> @udot_different_types(<vscale x 2 x i64> %acc, <vscale x 8 x i16> %a, <vscale x 8 x i8> %b){
+; CHECK-LABEL: udot_different_types:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    and z2.h, z2.h, #0xff
+; CHECK-NEXT:    uunpklo z3.s, z1.h
+; CHECK-NEXT:    uunpkhi z1.s, z1.h
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    uunpklo z4.s, z2.h
+; CHECK-NEXT:    uunpkhi z2.s, z2.h
+; CHECK-NEXT:    uunpklo z5.d, z3.s
+; CHECK-NEXT:    uunpkhi z3.d, z3.s
+; CHECK-NEXT:    uunpklo z7.d, z1.s
+; CHECK-NEXT:    uunpkhi z1.d, z1.s
+; CHECK-NEXT:    uunpklo z6.d, z4.s
+; CHECK-NEXT:    uunpkhi z4.d, z4.s
+; CHECK-NEXT:    uunpklo z24.d, z2.s
+; CHECK-NEXT:    uunpkhi z2.d, z2.s
+; CHECK-NEXT:    mul z3.d, z3.d, z4.d
+; CHECK-NEXT:    mla z0.d, p0/m, z5.d, z6.d
+; CHECK-NEXT:    mla z0.d, p0/m, z1.d, z2.d
+; CHECK-NEXT:    movprfx z1, z3
+; CHECK-NEXT:    mla z1.d, p0/m, z7.d, z24.d
+; CHECK-NEXT:    add z0.d, z1.d, z0.d
+; CHECK-NEXT:    ret
+entry:
+  %a.wide = zext <vscale x 8 x i16> %a to <vscale x 8 x i64>
+  %b.wide = zext <vscale x 8 x i8> %b to <vscale x 8 x i64>
+  %mult = mul nuw nsw <vscale x 8 x i64> %a.wide, %b.wide
+  %partial.reduce = tail call <vscale x 2 x i64> @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64(<vscale x 2 x i64> %acc, <vscale x 8 x i64> %mult)
+  ret <vscale x 2 x i64> %partial.reduce
+}
+
+define <vscale x 2 x i64> @sdot_different_types(<vscale x 2 x i64> %acc, <vscale x 8 x i16> %a, <vscale x 8 x i8> %b){
+; CHECK-LABEL: sdot_different_types:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    sunpklo z3.s, z1.h
+; CHECK-NEXT:    sunpkhi z1.s, z1.h
+; CHECK-NEXT:    sxtb z2.h, p0/m, z2.h
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    sunpklo z5.d, z3.s
+; CHECK-NEXT:    sunpkhi z3.d, z3.s
+; CHECK-NEXT:    sunpklo z7.d, z1.s
+; CHECK-NEXT:    sunpklo z4.s, z2.h
+; CHECK-NEXT:    sunpkhi z2.s, z2.h
+; CHECK-NEXT:    sunpkhi z1.d, z1.s
+; CHECK-NEXT:    sunpklo z6.d, z4.s
+; CHECK-NEXT:    sunpkhi z4.d, z4.s
+; CHECK-NEXT:    sunpklo z24.d, z2.s
+; CHECK-NEXT:    sunpkhi z2.d, z2.s
+; CHECK-NEXT:    mul z3.d, z3.d, z4.d
+; CHECK-NEXT:    mla z0.d, p0/m, z5.d, z6.d
+; CHECK-NEXT:    mla z0.d, p0/m, z1.d, z2.d
+; CHECK-NEXT:    movprfx z1, z3
+; CHECK-NEXT:    mla z1.d, p0/m, z7.d, z24.d
+; CHECK-NEXT:    add z0.d, z1.d, z0.d
+; CHECK-NEXT:    ret
+entry:
+  %a.wide = sext <vscale x 8 x i16> %a to <vscale x 8 x i64>
+  %b.wide = sext <vscale x 8 x i8> %b to <vscale x 8 x i64>
+  %mult = mul nuw nsw <vscale x 8 x i64> %a.wide, %b.wide
+  %partial.reduce = tail call <vscale x 2 x i64> @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64(<vscale x 2 x i64> %acc, <vscale x 8 x i64> %mult)
+  ret <vscale x 2 x i64> %partial.reduce
+}
+
+define <vscale x 2 x i64> @usdot_different_types(<vscale x 2 x i64> %acc, <vscale x 8 x i16> %a, <vscale x 8 x i8> %b){
+; CHECK-LABEL: usdot_different_types:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    uunpklo z3.s, z1.h
+; CHECK-NEXT:    uunpkhi z1.s, z1.h
+; CHECK-NEXT:    sxtb z2.h, p0/m, z2.h
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    uunpklo z5.d, z3.s
+; CHECK-NEXT:    uunpkhi z3.d, z3.s
+; CHECK-NEXT:    uunpklo z7.d, z1.s
+; CHECK-NEXT:    sunpklo z4.s, z2.h
+; CHECK-NEXT:    sunpkhi z2.s, z2.h
+; CHECK-NEXT:    uunpkhi z1.d, z1.s
+; CHECK-NEXT:    sunpklo z6.d, z4.s
+; CHECK-NEXT:    sunpkhi z4.d, z4.s
+; CHECK-NEXT:    sunpklo z24.d, z2.s
+; CHECK-NEXT:    sunpkhi z2.d, z2.s
+; CHECK-NEXT:    mul z3.d, z3.d, z4.d
+; CHECK-NEXT:    mla z0.d, p0/m, z5.d, z6.d
+; CHECK-NEXT:    mla z0.d, p0/m, z1.d, z2.d
+; CHECK-NEXT:    movprfx z1, z3
+; CHECK-NEXT:    mla z1.d, p0/m, z7.d, z24.d
+; CHECK-NEXT:    add z0.d, z1.d, z0.d
+; CHECK-NEXT:    ret
+entry:
+  %a.wide = zext <vscale x 8 x i16> %a to <vscale x 8 x i64>
+  %b.wide = sext <vscale x 8 x i8> %b to <vscale x 8 x i64>
+  %mult = mul nuw nsw <vscale x 8 x i64> %a.wide, %b.wide
+  %partial.reduce = tail call <vscale x 2 x i64> @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64(<vscale x 2 x i64> %acc, <vscale x 8 x i64> %mult)
+  ret <vscale x 2 x i64> %partial.reduce
+}
+
+define <vscale x 2 x i64> @sudot_different_types(<vscale x 2 x i64> %acc, <vscale x 8 x i16> %a, <vscale x 8 x i8> %b){
+; CHECK-LABEL: sudot_different_types:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    and z2.h, z2.h, #0xff
+; CHECK-NEXT:    sunpklo z3.s, z1.h
+; CHECK-NEXT:    sunpkhi z1.s, z1.h
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    uunpklo z4.s, z2.h
+; CHECK-NEXT:    uunpkhi z2.s, z2.h
+; CHECK-NEXT:    sunpklo z5.d, z3.s
+; CHECK-NEXT:    sunpkhi z3.d, z3.s
+; CHECK-NEXT:    sunpklo z7.d, z1.s
+; CHECK-NEXT:    sunpkhi z1.d, z1.s
+; CHECK-NEXT:    uunpklo z6.d, z4.s
+; CHECK-NEXT:    uunpkhi z4.d, z4.s
+; CHECK-NEXT:    uunpklo z24.d, z2.s
+; CHECK-NEXT:    uunpkhi z2.d, z2.s
+; CHECK-NEXT:    mul z3.d, z3.d, z4.d
+; CHECK-NEXT:    mla z0.d, p0/m, z5.d, z6.d
+; CHECK-NEXT:    mla z0.d, p0/m, z1.d, z2.d
+; CHECK-NEXT:    movprfx z1, z3
+; CHECK-NEXT:    mla z1.d, p0/m, z7.d, z24.d
+; CHECK-NEXT:    add z0.d, z1.d, z0.d
+; CHECK-NEXT:    ret
+entry:
+  %a.wide = sext <vscale x 8 x i16> %a to <vscale x 8 x i64>
+  %b.wide = zext <vscale x 8 x i8> %b to <vscale x 8 x i64>
+  %mult = mul nuw nsw <vscale x 8 x i64> %a.wide, %b.wide
+  %partial.reduce = tail call <vscale x 2 x i64> @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64(<vscale x 2 x i64> %acc, <vscale x 8 x i64> %mult)
+  ret <vscale x 2 x i64> %partial.reduce
+}
diff --git a/llvm/test/CodeGen/AArch64/vec-combine-compare-to-bitmask.ll b/llvm/test/CodeGen/AArch64/vec-combine-compare-to-bitmask.ll
index 7f2eefe..7f3c1fd 100644
--- a/llvm/test/CodeGen/AArch64/vec-combine-compare-to-bitmask.ll
+++ b/llvm/test/CodeGen/AArch64/vec-combine-compare-to-bitmask.ll
@@ -1,26 +1,86 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -mtriple=aarch64-apple-darwin -mattr=+neon -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-apple-darwin -mattr=+neon -aarch64-enable-collect-loh=false -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,SDAG
+; RUN: llc -mtriple=aarch64-apple-darwin -mattr=+neon -aarch64-enable-collect-loh=false -global-isel -global-isel-abort=2 -verify-machineinstrs < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,GISEL
 
 ; Basic tests from input vector to bitmask
 ; IR generated from clang for:
 ; __builtin_convertvector + reinterpret_cast<uint16&>
 
+; GISEL: warning: Instruction selection used fallback path for clang_builtins_undef_concat_convert_to_bitmask4
+; GISEL-NEXT: warning: Instruction selection used fallback path for convert_to_bitmask_2xi32
+; GISEL-NEXT: warning: Instruction selection used fallback path for convert_to_bitmask_8xi2
+; GISEL-NEXT: warning: Instruction selection used fallback path for no_direct_convert_for_bad_concat
+
 define i16 @convert_to_bitmask16(<16 x i8> %vec) {
 ; Bits used in mask
-; CHECK-LABEL: convert_to_bitmask16:
-; CHECK:       ; %bb.0:
-; CHECK-NEXT:  Lloh0:
-; CHECK-NEXT:    adrp x8, lCPI0_0@PAGE
-; CHECK-NEXT:    cmeq.16b v0, v0, #0
-; CHECK-NEXT:  Lloh1:
-; CHECK-NEXT:    ldr q1, [x8, lCPI0_0@PAGEOFF]
-; CHECK-NEXT:    bic.16b v0, v1, v0
-; CHECK-NEXT:    ext.16b v1, v0, v0, #8
-; CHECK-NEXT:    zip1.16b v0, v0, v1
-; CHECK-NEXT:    addv.8h h0, v0
-; CHECK-NEXT:    fmov w0, s0
-; CHECK-NEXT:    ret
-; CHECK-NEXT:    .loh AdrpLdr Lloh0, Lloh1
+; SDAG-LABEL: convert_to_bitmask16:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    adrp x8, lCPI0_0@PAGE
+; SDAG-NEXT:    cmeq.16b v0, v0, #0
+; SDAG-NEXT:    ldr q1, [x8, lCPI0_0@PAGEOFF]
+; SDAG-NEXT:    bic.16b v0, v1, v0
+; SDAG-NEXT:    ext.16b v1, v0, v0, #8
+; SDAG-NEXT:    zip1.16b v0, v0, v1
+; SDAG-NEXT:    addv.8h h0, v0
+; SDAG-NEXT:    fmov w0, s0
+; SDAG-NEXT:    ret
+;
+; GISEL-LABEL: convert_to_bitmask16:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    sub sp, sp, #16
+; GISEL-NEXT:    .cfi_def_cfa_offset 16
+; GISEL-NEXT:    cmeq.16b v0, v0, #0
+; GISEL-NEXT:    mvn.16b v0, v0
+; GISEL-NEXT:    umov.b w8, v0[1]
+; GISEL-NEXT:    umov.b w9, v0[0]
+; GISEL-NEXT:    umov.b w10, v0[2]
+; GISEL-NEXT:    umov.b w11, v0[3]
+; GISEL-NEXT:    and w8, w8, #0x1
+; GISEL-NEXT:    bfi w9, w8, #1, #31
+; GISEL-NEXT:    and w8, w10, #0x1
+; GISEL-NEXT:    umov.b w10, v0[4]
+; GISEL-NEXT:    orr w8, w9, w8, lsl #2
+; GISEL-NEXT:    and w9, w11, #0x1
+; GISEL-NEXT:    umov.b w11, v0[5]
+; GISEL-NEXT:    orr w8, w8, w9, lsl #3
+; GISEL-NEXT:    and w9, w10, #0x1
+; GISEL-NEXT:    umov.b w10, v0[6]
+; GISEL-NEXT:    orr w8, w8, w9, lsl #4
+; GISEL-NEXT:    and w9, w11, #0x1
+; GISEL-NEXT:    umov.b w11, v0[7]
+; GISEL-NEXT:    orr w8, w8, w9, lsl #5
+; GISEL-NEXT:    and w9, w10, #0x1
+; GISEL-NEXT:    umov.b w10, v0[8]
+; GISEL-NEXT:    orr w8, w8, w9, lsl #6
+; GISEL-NEXT:    and w9, w11, #0x1
+; GISEL-NEXT:    umov.b w11, v0[9]
+; GISEL-NEXT:    orr w8, w8, w9, lsl #7
+; GISEL-NEXT:    and w9, w10, #0x1
+; GISEL-NEXT:    umov.b w10, v0[10]
+; GISEL-NEXT:    orr w8, w8, w9, lsl #8
+; GISEL-NEXT:    and w9, w11, #0x1
+; GISEL-NEXT:    umov.b w11, v0[11]
+; GISEL-NEXT:    orr w8, w8, w9, lsl #9
+; GISEL-NEXT:    and w9, w10, #0x1
+; GISEL-NEXT:    umov.b w10, v0[12]
+; GISEL-NEXT:    orr w8, w8, w9, lsl #10
+; GISEL-NEXT:    and w9, w11, #0x1
+; GISEL-NEXT:    umov.b w11, v0[13]
+; GISEL-NEXT:    orr w8, w8, w9, lsl #11
+; GISEL-NEXT:    and w9, w10, #0x1
+; GISEL-NEXT:    umov.b w10, v0[14]
+; GISEL-NEXT:    orr w8, w8, w9, lsl #12
+; GISEL-NEXT:    and w9, w11, #0x1
+; GISEL-NEXT:    umov.b w11, v0[15]
+; GISEL-NEXT:    orr w8, w8, w9, lsl #13
+; GISEL-NEXT:    and w9, w10, #0x1
+; GISEL-NEXT:    orr w8, w8, w9, lsl #14
+; GISEL-NEXT:    and w9, w11, #0x1
+; GISEL-NEXT:    orr w8, w8, w9, lsl #15
+; GISEL-NEXT:    strh w8, [sp, #14]
+; GISEL-NEXT:    and w0, w8, #0xffff
+; GISEL-NEXT:    add sp, sp, #16
+; GISEL-NEXT:    ret
 
 ; Actual conversion
 
@@ -30,19 +90,50 @@ define i16 @convert_to_bitmask16(<16 x i8> %vec) {
 }
 
 define i16 @convert_to_bitmask8(<8 x i16> %vec) {
-; CHECK-LABEL: convert_to_bitmask8:
-; CHECK:       ; %bb.0:
-; CHECK-NEXT:  Lloh2:
-; CHECK-NEXT:    adrp x8, lCPI1_0@PAGE
-; CHECK-NEXT:    cmeq.8h v0, v0, #0
-; CHECK-NEXT:  Lloh3:
-; CHECK-NEXT:    ldr q1, [x8, lCPI1_0@PAGEOFF]
-; CHECK-NEXT:    bic.16b v0, v1, v0
-; CHECK-NEXT:    addv.8h h0, v0
-; CHECK-NEXT:    fmov w8, s0
-; CHECK-NEXT:    and w0, w8, #0xff
-; CHECK-NEXT:    ret
-; CHECK-NEXT:    .loh AdrpLdr Lloh2, Lloh3
+; SDAG-LABEL: convert_to_bitmask8:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    adrp x8, lCPI1_0@PAGE
+; SDAG-NEXT:    cmeq.8h v0, v0, #0
+; SDAG-NEXT:    ldr q1, [x8, lCPI1_0@PAGEOFF]
+; SDAG-NEXT:    bic.16b v0, v1, v0
+; SDAG-NEXT:    addv.8h h0, v0
+; SDAG-NEXT:    fmov w8, s0
+; SDAG-NEXT:    and w0, w8, #0xff
+; SDAG-NEXT:    ret
+;
+; GISEL-LABEL: convert_to_bitmask8:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    sub sp, sp, #16
+; GISEL-NEXT:    .cfi_def_cfa_offset 16
+; GISEL-NEXT:    cmeq.8h v0, v0, #0
+; GISEL-NEXT:    mvn.16b v0, v0
+; GISEL-NEXT:    xtn.8b v0, v0
+; GISEL-NEXT:    umov.b w8, v0[1]
+; GISEL-NEXT:    umov.b w9, v0[0]
+; GISEL-NEXT:    umov.b w10, v0[2]
+; GISEL-NEXT:    umov.b w11, v0[3]
+; GISEL-NEXT:    and w8, w8, #0x1
+; GISEL-NEXT:    bfi w9, w8, #1, #31
+; GISEL-NEXT:    and w8, w10, #0x1
+; GISEL-NEXT:    umov.b w10, v0[4]
+; GISEL-NEXT:    orr w8, w9, w8, lsl #2
+; GISEL-NEXT:    and w9, w11, #0x1
+; GISEL-NEXT:    umov.b w11, v0[5]
+; GISEL-NEXT:    orr w8, w8, w9, lsl #3
+; GISEL-NEXT:    and w9, w10, #0x1
+; GISEL-NEXT:    umov.b w10, v0[6]
+; GISEL-NEXT:    orr w8, w8, w9, lsl #4
+; GISEL-NEXT:    and w9, w11, #0x1
+; GISEL-NEXT:    umov.b w11, v0[7]
+; GISEL-NEXT:    orr w8, w8, w9, lsl #5
+; GISEL-NEXT:    and w9, w10, #0x1
+; GISEL-NEXT:    orr w8, w8, w9, lsl #6
+; GISEL-NEXT:    and w9, w11, #0x1
+; GISEL-NEXT:    orr w8, w8, w9, lsl #7
+; GISEL-NEXT:    strb w8, [sp, #15]
+; GISEL-NEXT:    and w0, w8, #0xff
+; GISEL-NEXT:    add sp, sp, #16
+; GISEL-NEXT:    ret
 
 
   %cmp_result = icmp ne <8 x i16> %vec, zeroinitializer
@@ -52,18 +143,36 @@ define i16 @convert_to_bitmask8(<8 x i16> %vec) {
 }
 
 define i4 @convert_to_bitmask4(<4 x i32> %vec) {
-; CHECK-LABEL: convert_to_bitmask4:
-; CHECK:       ; %bb.0:
-; CHECK-NEXT:  Lloh4:
-; CHECK-NEXT:    adrp x8, lCPI2_0@PAGE
-; CHECK-NEXT:    cmeq.4s v0, v0, #0
-; CHECK-NEXT:  Lloh5:
-; CHECK-NEXT:    ldr q1, [x8, lCPI2_0@PAGEOFF]
-; CHECK-NEXT:    bic.16b v0, v1, v0
-; CHECK-NEXT:    addv.4s s0, v0
-; CHECK-NEXT:    fmov w0, s0
-; CHECK-NEXT:    ret
-; CHECK-NEXT:    .loh AdrpLdr Lloh4, Lloh5
+; SDAG-LABEL: convert_to_bitmask4:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    adrp x8, lCPI2_0@PAGE
+; SDAG-NEXT:    cmeq.4s v0, v0, #0
+; SDAG-NEXT:    ldr q1, [x8, lCPI2_0@PAGEOFF]
+; SDAG-NEXT:    bic.16b v0, v1, v0
+; SDAG-NEXT:    addv.4s s0, v0
+; SDAG-NEXT:    fmov w0, s0
+; SDAG-NEXT:    ret
+;
+; GISEL-LABEL: convert_to_bitmask4:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    sub sp, sp, #16
+; GISEL-NEXT:    .cfi_def_cfa_offset 16
+; GISEL-NEXT:    cmeq.4s v0, v0, #0
+; GISEL-NEXT:    mvn.16b v0, v0
+; GISEL-NEXT:    mov.s w8, v0[1]
+; GISEL-NEXT:    mov.s w9, v0[2]
+; GISEL-NEXT:    fmov w11, s0
+; GISEL-NEXT:    mov.s w10, v0[3]
+; GISEL-NEXT:    and w8, w8, #0x1
+; GISEL-NEXT:    bfi w11, w8, #1, #31
+; GISEL-NEXT:    and w8, w9, #0x1
+; GISEL-NEXT:    and w9, w10, #0x1
+; GISEL-NEXT:    orr w8, w11, w8, lsl #2
+; GISEL-NEXT:    orr w8, w8, w9, lsl #3
+; GISEL-NEXT:    strb w8, [sp, #15]
+; GISEL-NEXT:    and w0, w8, #0xff
+; GISEL-NEXT:    add sp, sp, #16
+; GISEL-NEXT:    ret
 
 
   %cmp_result = icmp ne <4 x i32> %vec, zeroinitializer
@@ -74,17 +183,14 @@ define i4 @convert_to_bitmask4(<4 x i32> %vec) {
 define i8 @convert_to_bitmask2(<2 x i64> %vec) {
 ; CHECK-LABEL: convert_to_bitmask2:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:  Lloh6:
 ; CHECK-NEXT:    adrp x8, lCPI3_0@PAGE
 ; CHECK-NEXT:    cmeq.2d v0, v0, #0
-; CHECK-NEXT:  Lloh7:
 ; CHECK-NEXT:    ldr q1, [x8, lCPI3_0@PAGEOFF]
 ; CHECK-NEXT:    bic.16b v0, v1, v0
 ; CHECK-NEXT:    addp.2d d0, v0
 ; CHECK-NEXT:    fmov w8, s0
 ; CHECK-NEXT:    and w0, w8, #0x3
 ; CHECK-NEXT:    ret
-; CHECK-NEXT:    .loh AdrpLdr Lloh6, Lloh7
 
 
   %cmp_result = icmp ne <2 x i64> %vec, zeroinitializer
@@ -97,16 +203,13 @@ define i8 @convert_to_bitmask2(<2 x i64> %vec) {
 define i8 @clang_builtins_undef_concat_convert_to_bitmask4(<4 x i32> %vec) {
 ; CHECK-LABEL: clang_builtins_undef_concat_convert_to_bitmask4:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:  Lloh8:
 ; CHECK-NEXT:    adrp x8, lCPI4_0@PAGE
 ; CHECK-NEXT:    cmeq.4s v0, v0, #0
-; CHECK-NEXT:  Lloh9:
 ; CHECK-NEXT:    ldr q1, [x8, lCPI4_0@PAGEOFF]
 ; CHECK-NEXT:    bic.16b v0, v1, v0
 ; CHECK-NEXT:    addv.4s s0, v0
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
-; CHECK-NEXT:    .loh AdrpLdr Lloh8, Lloh9
 
 
   %cmp_result = icmp ne <4 x i32> %vec, zeroinitializer
@@ -117,20 +220,37 @@ define i8 @clang_builtins_undef_concat_convert_to_bitmask4(<4 x i32> %vec) {
 
 
 define i4 @convert_to_bitmask_no_compare(<4 x i32> %vec1, <4 x i32> %vec2) {
-; CHECK-LABEL: convert_to_bitmask_no_compare:
-; CHECK:       ; %bb.0:
-; CHECK-NEXT:    and.16b v0, v0, v1
-; CHECK-NEXT:  Lloh10:
-; CHECK-NEXT:    adrp x8, lCPI5_0@PAGE
-; CHECK-NEXT:  Lloh11:
-; CHECK-NEXT:    ldr q1, [x8, lCPI5_0@PAGEOFF]
-; CHECK-NEXT:    shl.4s v0, v0, #31
-; CHECK-NEXT:    cmlt.4s v0, v0, #0
-; CHECK-NEXT:    and.16b v0, v0, v1
-; CHECK-NEXT:    addv.4s s0, v0
-; CHECK-NEXT:    fmov w0, s0
-; CHECK-NEXT:    ret
-; CHECK-NEXT:    .loh AdrpLdr Lloh10, Lloh11
+; SDAG-LABEL: convert_to_bitmask_no_compare:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    and.16b v0, v0, v1
+; SDAG-NEXT:    adrp x8, lCPI5_0@PAGE
+; SDAG-NEXT:    ldr q1, [x8, lCPI5_0@PAGEOFF]
+; SDAG-NEXT:    shl.4s v0, v0, #31
+; SDAG-NEXT:    cmlt.4s v0, v0, #0
+; SDAG-NEXT:    and.16b v0, v0, v1
+; SDAG-NEXT:    addv.4s s0, v0
+; SDAG-NEXT:    fmov w0, s0
+; SDAG-NEXT:    ret
+;
+; GISEL-LABEL: convert_to_bitmask_no_compare:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    sub sp, sp, #16
+; GISEL-NEXT:    .cfi_def_cfa_offset 16
+; GISEL-NEXT:    and.16b v0, v0, v1
+; GISEL-NEXT:    mov.s w8, v0[1]
+; GISEL-NEXT:    mov.s w9, v0[2]
+; GISEL-NEXT:    fmov w11, s0
+; GISEL-NEXT:    mov.s w10, v0[3]
+; GISEL-NEXT:    and w8, w8, #0x1
+; GISEL-NEXT:    bfi w11, w8, #1, #31
+; GISEL-NEXT:    and w8, w9, #0x1
+; GISEL-NEXT:    and w9, w10, #0x1
+; GISEL-NEXT:    orr w8, w11, w8, lsl #2
+; GISEL-NEXT:    orr w8, w8, w9, lsl #3
+; GISEL-NEXT:    strb w8, [sp, #15]
+; GISEL-NEXT:    and w0, w8, #0xff
+; GISEL-NEXT:    add sp, sp, #16
+; GISEL-NEXT:    ret
 
 
   %cmp = and <4 x i32> %vec1, %vec2
@@ -140,20 +260,39 @@ define i4 @convert_to_bitmask_no_compare(<4 x i32> %vec1, <4 x i32> %vec2) {
 }
 
 define i4 @convert_to_bitmask_with_compare_chain(<4 x i32> %vec1, <4 x i32> %vec2) {
-; CHECK-LABEL: convert_to_bitmask_with_compare_chain:
-; CHECK:       ; %bb.0:
-; CHECK-NEXT:    cmeq.4s v2, v0, #0
-; CHECK-NEXT:    cmeq.4s v0, v0, v1
-; CHECK-NEXT:  Lloh12:
-; CHECK-NEXT:    adrp x8, lCPI6_0@PAGE
-; CHECK-NEXT:  Lloh13:
-; CHECK-NEXT:    ldr q1, [x8, lCPI6_0@PAGEOFF]
-; CHECK-NEXT:    bic.16b v0, v0, v2
-; CHECK-NEXT:    and.16b v0, v0, v1
-; CHECK-NEXT:    addv.4s s0, v0
-; CHECK-NEXT:    fmov w0, s0
-; CHECK-NEXT:    ret
-; CHECK-NEXT:    .loh AdrpLdr Lloh12, Lloh13
+; SDAG-LABEL: convert_to_bitmask_with_compare_chain:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    cmeq.4s v2, v0, #0
+; SDAG-NEXT:    cmeq.4s v0, v0, v1
+; SDAG-NEXT:    adrp x8, lCPI6_0@PAGE
+; SDAG-NEXT:    ldr q1, [x8, lCPI6_0@PAGEOFF]
+; SDAG-NEXT:    bic.16b v0, v0, v2
+; SDAG-NEXT:    and.16b v0, v0, v1
+; SDAG-NEXT:    addv.4s s0, v0
+; SDAG-NEXT:    fmov w0, s0
+; SDAG-NEXT:    ret
+;
+; GISEL-LABEL: convert_to_bitmask_with_compare_chain:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    sub sp, sp, #16
+; GISEL-NEXT:    .cfi_def_cfa_offset 16
+; GISEL-NEXT:    cmeq.4s v2, v0, #0
+; GISEL-NEXT:    cmeq.4s v0, v0, v1
+; GISEL-NEXT:    bic.16b v0, v0, v2
+; GISEL-NEXT:    mov.s w8, v0[1]
+; GISEL-NEXT:    mov.s w9, v0[2]
+; GISEL-NEXT:    fmov w11, s0
+; GISEL-NEXT:    mov.s w10, v0[3]
+; GISEL-NEXT:    and w8, w8, #0x1
+; GISEL-NEXT:    bfi w11, w8, #1, #31
+; GISEL-NEXT:    and w8, w9, #0x1
+; GISEL-NEXT:    and w9, w10, #0x1
+; GISEL-NEXT:    orr w8, w11, w8, lsl #2
+; GISEL-NEXT:    orr w8, w8, w9, lsl #3
+; GISEL-NEXT:    strb w8, [sp, #15]
+; GISEL-NEXT:    and w0, w8, #0xff
+; GISEL-NEXT:    add sp, sp, #16
+; GISEL-NEXT:    ret
 
 
   %cmp1 = icmp ne <4 x i32> %vec1, zeroinitializer
@@ -164,21 +303,39 @@ define i4 @convert_to_bitmask_with_compare_chain(<4 x i32> %vec1, <4 x i32> %vec
 }
 
 define i4 @convert_to_bitmask_with_trunc_in_chain(<4 x i32> %vec1, <4 x i32> %vec2) {
-; CHECK-LABEL: convert_to_bitmask_with_trunc_in_chain:
-; CHECK:       ; %bb.0:
-; CHECK-NEXT:    cmeq.4s v0, v0, #0
-; CHECK-NEXT:  Lloh14:
-; CHECK-NEXT:    adrp x8, lCPI7_0@PAGE
-; CHECK-NEXT:    bic.16b v0, v1, v0
-; CHECK-NEXT:  Lloh15:
-; CHECK-NEXT:    ldr q1, [x8, lCPI7_0@PAGEOFF]
-; CHECK-NEXT:    shl.4s v0, v0, #31
-; CHECK-NEXT:    cmlt.4s v0, v0, #0
-; CHECK-NEXT:    and.16b v0, v0, v1
-; CHECK-NEXT:    addv.4s s0, v0
-; CHECK-NEXT:    fmov w0, s0
-; CHECK-NEXT:    ret
-; CHECK-NEXT:    .loh AdrpLdr Lloh14, Lloh15
+; SDAG-LABEL: convert_to_bitmask_with_trunc_in_chain:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    cmeq.4s v0, v0, #0
+; SDAG-NEXT:    adrp x8, lCPI7_0@PAGE
+; SDAG-NEXT:    bic.16b v0, v1, v0
+; SDAG-NEXT:    ldr q1, [x8, lCPI7_0@PAGEOFF]
+; SDAG-NEXT:    shl.4s v0, v0, #31
+; SDAG-NEXT:    cmlt.4s v0, v0, #0
+; SDAG-NEXT:    and.16b v0, v0, v1
+; SDAG-NEXT:    addv.4s s0, v0
+; SDAG-NEXT:    fmov w0, s0
+; SDAG-NEXT:    ret
+;
+; GISEL-LABEL: convert_to_bitmask_with_trunc_in_chain:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    sub sp, sp, #16
+; GISEL-NEXT:    .cfi_def_cfa_offset 16
+; GISEL-NEXT:    cmeq.4s v0, v0, #0
+; GISEL-NEXT:    bic.16b v0, v1, v0
+; GISEL-NEXT:    mov.s w8, v0[1]
+; GISEL-NEXT:    mov.s w9, v0[2]
+; GISEL-NEXT:    fmov w11, s0
+; GISEL-NEXT:    mov.s w10, v0[3]
+; GISEL-NEXT:    and w8, w8, #0x1
+; GISEL-NEXT:    bfi w11, w8, #1, #31
+; GISEL-NEXT:    and w8, w9, #0x1
+; GISEL-NEXT:    and w9, w10, #0x1
+; GISEL-NEXT:    orr w8, w11, w8, lsl #2
+; GISEL-NEXT:    orr w8, w8, w9, lsl #3
+; GISEL-NEXT:    strb w8, [sp, #15]
+; GISEL-NEXT:    and w0, w8, #0xff
+; GISEL-NEXT:    add sp, sp, #16
+; GISEL-NEXT:    ret
 
 
   %cmp1 = icmp ne <4 x i32> %vec1, zeroinitializer
@@ -189,33 +346,82 @@ define i4 @convert_to_bitmask_with_trunc_in_chain(<4 x i32> %vec1, <4 x i32> %ve
 }
 
 define i4 @convert_to_bitmask_with_unknown_type_in_long_chain(<4 x i32> %vec1, <4 x i32> %vec2) {
-; CHECK-LABEL: convert_to_bitmask_with_unknown_type_in_long_chain:
-; CHECK:       ; %bb.0:
-; CHECK-NEXT:    cmeq.4s v0, v0, #0
-; CHECK-NEXT:    cmeq.4s v1, v1, #0
-; CHECK-NEXT:  Lloh16:
-; CHECK-NEXT:    adrp x8, lCPI8_0@PAGE
-; CHECK-NEXT:    movi d2, #0x000000ffffffff
-; CHECK-NEXT:    movi d3, #0x00ffffffffffff
-; CHECK-NEXT:    bic.16b v0, v1, v0
-; CHECK-NEXT:    movi d1, #0xffff0000ffff0000
-; CHECK-NEXT:    xtn.4h v0, v0
-; CHECK-NEXT:    orr.8b v0, v0, v2
-; CHECK-NEXT:    movi d2, #0x00ffffffff0000
-; CHECK-NEXT:    eor.8b v1, v0, v1
-; CHECK-NEXT:    eor.8b v0, v0, v2
-; CHECK-NEXT:    mov.h v1[2], wzr
-; CHECK-NEXT:    orr.8b v0, v0, v3
-; CHECK-NEXT:    orr.8b v0, v1, v0
-; CHECK-NEXT:  Lloh17:
-; CHECK-NEXT:    ldr d1, [x8, lCPI8_0@PAGEOFF]
-; CHECK-NEXT:    shl.4h v0, v0, #15
-; CHECK-NEXT:    cmlt.4h v0, v0, #0
-; CHECK-NEXT:    and.8b v0, v0, v1
-; CHECK-NEXT:    addv.4h h0, v0
-; CHECK-NEXT:    fmov w0, s0
-; CHECK-NEXT:    ret
-; CHECK-NEXT:    .loh AdrpLdr Lloh16, Lloh17
+; SDAG-LABEL: convert_to_bitmask_with_unknown_type_in_long_chain:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    cmeq.4s v0, v0, #0
+; SDAG-NEXT:    cmeq.4s v1, v1, #0
+; SDAG-NEXT:    adrp x8, lCPI8_0@PAGE
+; SDAG-NEXT:    movi d2, #0x000000ffffffff
+; SDAG-NEXT:    movi d3, #0x00ffffffffffff
+; SDAG-NEXT:    bic.16b v0, v1, v0
+; SDAG-NEXT:    movi d1, #0xffff0000ffff0000
+; SDAG-NEXT:    xtn.4h v0, v0
+; SDAG-NEXT:    orr.8b v0, v0, v2
+; SDAG-NEXT:    movi d2, #0x00ffffffff0000
+; SDAG-NEXT:    eor.8b v1, v0, v1
+; SDAG-NEXT:    eor.8b v0, v0, v2
+; SDAG-NEXT:    mov.h v1[2], wzr
+; SDAG-NEXT:    orr.8b v0, v0, v3
+; SDAG-NEXT:    orr.8b v0, v1, v0
+; SDAG-NEXT:    ldr d1, [x8, lCPI8_0@PAGEOFF]
+; SDAG-NEXT:    shl.4h v0, v0, #15
+; SDAG-NEXT:    cmlt.4h v0, v0, #0
+; SDAG-NEXT:    and.8b v0, v0, v1
+; SDAG-NEXT:    addv.4h h0, v0
+; SDAG-NEXT:    fmov w0, s0
+; SDAG-NEXT:    ret
+;
+; GISEL-LABEL: convert_to_bitmask_with_unknown_type_in_long_chain:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    sub sp, sp, #16
+; GISEL-NEXT:    .cfi_def_cfa_offset 16
+; GISEL-NEXT:    mov w8, #1 ; =0x1
+; GISEL-NEXT:    mov w9, #0 ; =0x0
+; GISEL-NEXT:    cmeq.4s v5, v0, #0
+; GISEL-NEXT:    fmov s2, w8
+; GISEL-NEXT:    fmov s4, w9
+; GISEL-NEXT:    cmeq.4s v1, v1, #0
+; GISEL-NEXT:    mov.16b v3, v2
+; GISEL-NEXT:    mov.16b v0, v4
+; GISEL-NEXT:    mov.h v4[1], w8
+; GISEL-NEXT:    bic.16b v1, v1, v5
+; GISEL-NEXT:    mov.16b v5, v2
+; GISEL-NEXT:    mov.h v2[1], w8
+; GISEL-NEXT:    mov.h v3[1], w8
+; GISEL-NEXT:    mov.h v0[1], w8
+; GISEL-NEXT:    mov.h v5[1], w8
+; GISEL-NEXT:    mov.h v4[2], w8
+; GISEL-NEXT:    xtn.4h v1, v1
+; GISEL-NEXT:    mov.h v2[2], w8
+; GISEL-NEXT:    mov.h v3[2], w9
+; GISEL-NEXT:    mov.h v0[2], w9
+; GISEL-NEXT:    mov.h v5[2], w9
+; GISEL-NEXT:    mov.h v4[3], w9
+; GISEL-NEXT:    mov.h v2[3], w9
+; GISEL-NEXT:    mov.h v3[3], w9
+; GISEL-NEXT:    mov.h v0[3], w8
+; GISEL-NEXT:    mov.h v5[3], w8
+; GISEL-NEXT:    orr.8b v1, v1, v3
+; GISEL-NEXT:    eor.8b v0, v1, v0
+; GISEL-NEXT:    eor.8b v1, v4, v1
+; GISEL-NEXT:    and.8b v0, v0, v5
+; GISEL-NEXT:    orr.8b v1, v2, v1
+; GISEL-NEXT:    orr.8b v0, v0, v1
+; GISEL-NEXT:    ushll.4s v0, v0, #0
+; GISEL-NEXT:    mov.s w8, v0[1]
+; GISEL-NEXT:    mov.s w9, v0[2]
+; GISEL-NEXT:    fmov w11, s0
+; GISEL-NEXT:    mov.s w10, v0[3]
+; GISEL-NEXT:    and w8, w8, #0x1
+; GISEL-NEXT:    bfi w11, w8, #1, #31
+; GISEL-NEXT:    and w8, w9, #0x1
+; GISEL-NEXT:    and w9, w10, #0x1
+; GISEL-NEXT:    orr w8, w11, w8, lsl #2
+; GISEL-NEXT:    orr w8, w8, w9, lsl #3
+; GISEL-NEXT:    strb w8, [sp, #15]
+; GISEL-NEXT:    and w0, w8, #0xff
+; GISEL-NEXT:    add sp, sp, #16
+; GISEL-NEXT:    ret
 
 
   %cmp1 = icmp ne <4 x i32> %vec1, zeroinitializer
@@ -234,21 +440,42 @@ define i4 @convert_to_bitmask_with_unknown_type_in_long_chain(<4 x i32> %vec1, <
 }
 
 define i4 @convert_to_bitmask_with_different_types_in_chain(<4 x i16> %vec1, <4 x i32> %vec2) {
-; CHECK-LABEL: convert_to_bitmask_with_different_types_in_chain:
-; CHECK:       ; %bb.0:
-; CHECK-NEXT:    cmeq.4s v1, v1, #0
-; CHECK-NEXT:    cmeq.4h v0, v0, #0
-; CHECK-NEXT:  Lloh18:
-; CHECK-NEXT:    adrp x8, lCPI9_0@PAGE
-; CHECK-NEXT:    xtn.4h v1, v1
-; CHECK-NEXT:    orn.8b v0, v1, v0
-; CHECK-NEXT:  Lloh19:
-; CHECK-NEXT:    ldr d1, [x8, lCPI9_0@PAGEOFF]
-; CHECK-NEXT:    and.8b v0, v0, v1
-; CHECK-NEXT:    addv.4h h0, v0
-; CHECK-NEXT:    fmov w0, s0
-; CHECK-NEXT:    ret
-; CHECK-NEXT:    .loh AdrpLdr Lloh18, Lloh19
+; SDAG-LABEL: convert_to_bitmask_with_different_types_in_chain:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    cmeq.4s v1, v1, #0
+; SDAG-NEXT:    cmeq.4h v0, v0, #0
+; SDAG-NEXT:    adrp x8, lCPI9_0@PAGE
+; SDAG-NEXT:    xtn.4h v1, v1
+; SDAG-NEXT:    orn.8b v0, v1, v0
+; SDAG-NEXT:    ldr d1, [x8, lCPI9_0@PAGEOFF]
+; SDAG-NEXT:    and.8b v0, v0, v1
+; SDAG-NEXT:    addv.4h h0, v0
+; SDAG-NEXT:    fmov w0, s0
+; SDAG-NEXT:    ret
+;
+; GISEL-LABEL: convert_to_bitmask_with_different_types_in_chain:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    sub sp, sp, #16
+; GISEL-NEXT:    .cfi_def_cfa_offset 16
+; GISEL-NEXT:    cmeq.4s v1, v1, #0
+; GISEL-NEXT:    cmeq.4h v0, v0, #0
+; GISEL-NEXT:    xtn.4h v1, v1
+; GISEL-NEXT:    orn.8b v0, v1, v0
+; GISEL-NEXT:    ushll.4s v0, v0, #0
+; GISEL-NEXT:    mov.s w8, v0[1]
+; GISEL-NEXT:    mov.s w9, v0[2]
+; GISEL-NEXT:    fmov w11, s0
+; GISEL-NEXT:    mov.s w10, v0[3]
+; GISEL-NEXT:    and w8, w8, #0x1
+; GISEL-NEXT:    bfi w11, w8, #1, #31
+; GISEL-NEXT:    and w8, w9, #0x1
+; GISEL-NEXT:    and w9, w10, #0x1
+; GISEL-NEXT:    orr w8, w11, w8, lsl #2
+; GISEL-NEXT:    orr w8, w8, w9, lsl #3
+; GISEL-NEXT:    strb w8, [sp, #15]
+; GISEL-NEXT:    and w0, w8, #0xff
+; GISEL-NEXT:    add sp, sp, #16
+; GISEL-NEXT:    ret
 
 
   %cmp1 = icmp ne <4 x i16> %vec1, zeroinitializer
@@ -259,21 +486,73 @@ define i4 @convert_to_bitmask_with_different_types_in_chain(<4 x i16> %vec1, <4
 }
 
 define i16 @convert_to_bitmask_without_knowing_type(<16 x i1> %vec) {
-; CHECK-LABEL: convert_to_bitmask_without_knowing_type:
-; CHECK:       ; %bb.0:
-; CHECK-NEXT:    shl.16b v0, v0, #7
-; CHECK-NEXT:  Lloh20:
-; CHECK-NEXT:    adrp x8, lCPI10_0@PAGE
-; CHECK-NEXT:  Lloh21:
-; CHECK-NEXT:    ldr q1, [x8, lCPI10_0@PAGEOFF]
-; CHECK-NEXT:    cmlt.16b v0, v0, #0
-; CHECK-NEXT:    and.16b v0, v0, v1
-; CHECK-NEXT:    ext.16b v1, v0, v0, #8
-; CHECK-NEXT:    zip1.16b v0, v0, v1
-; CHECK-NEXT:    addv.8h h0, v0
-; CHECK-NEXT:    fmov w0, s0
-; CHECK-NEXT:    ret
-; CHECK-NEXT:    .loh AdrpLdr Lloh20, Lloh21
+; SDAG-LABEL: convert_to_bitmask_without_knowing_type:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    shl.16b v0, v0, #7
+; SDAG-NEXT:    adrp x8, lCPI10_0@PAGE
+; SDAG-NEXT:    ldr q1, [x8, lCPI10_0@PAGEOFF]
+; SDAG-NEXT:    cmlt.16b v0, v0, #0
+; SDAG-NEXT:    and.16b v0, v0, v1
+; SDAG-NEXT:    ext.16b v1, v0, v0, #8
+; SDAG-NEXT:    zip1.16b v0, v0, v1
+; SDAG-NEXT:    addv.8h h0, v0
+; SDAG-NEXT:    fmov w0, s0
+; SDAG-NEXT:    ret
+;
+; GISEL-LABEL: convert_to_bitmask_without_knowing_type:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    sub sp, sp, #16
+; GISEL-NEXT:    .cfi_def_cfa_offset 16
+; GISEL-NEXT:    umov.b w8, v0[1]
+; GISEL-NEXT:    umov.b w9, v0[0]
+; GISEL-NEXT:    umov.b w10, v0[2]
+; GISEL-NEXT:    umov.b w11, v0[3]
+; GISEL-NEXT:    and w8, w8, #0x1
+; GISEL-NEXT:    bfi w9, w8, #1, #31
+; GISEL-NEXT:    and w8, w10, #0x1
+; GISEL-NEXT:    umov.b w10, v0[4]
+; GISEL-NEXT:    orr w8, w9, w8, lsl #2
+; GISEL-NEXT:    and w9, w11, #0x1
+; GISEL-NEXT:    umov.b w11, v0[5]
+; GISEL-NEXT:    orr w8, w8, w9, lsl #3
+; GISEL-NEXT:    and w9, w10, #0x1
+; GISEL-NEXT:    umov.b w10, v0[6]
+; GISEL-NEXT:    orr w8, w8, w9, lsl #4
+; GISEL-NEXT:    and w9, w11, #0x1
+; GISEL-NEXT:    umov.b w11, v0[7]
+; GISEL-NEXT:    orr w8, w8, w9, lsl #5
+; GISEL-NEXT:    and w9, w10, #0x1
+; GISEL-NEXT:    umov.b w10, v0[8]
+; GISEL-NEXT:    orr w8, w8, w9, lsl #6
+; GISEL-NEXT:    and w9, w11, #0x1
+; GISEL-NEXT:    umov.b w11, v0[9]
+; GISEL-NEXT:    orr w8, w8, w9, lsl #7
+; GISEL-NEXT:    and w9, w10, #0x1
+; GISEL-NEXT:    umov.b w10, v0[10]
+; GISEL-NEXT:    orr w8, w8, w9, lsl #8
+; GISEL-NEXT:    and w9, w11, #0x1
+; GISEL-NEXT:    umov.b w11, v0[11]
+; GISEL-NEXT:    orr w8, w8, w9, lsl #9
+; GISEL-NEXT:    and w9, w10, #0x1
+; GISEL-NEXT:    umov.b w10, v0[12]
+; GISEL-NEXT:    orr w8, w8, w9, lsl #10
+; GISEL-NEXT:    and w9, w11, #0x1
+; GISEL-NEXT:    umov.b w11, v0[13]
+; GISEL-NEXT:    orr w8, w8, w9, lsl #11
+; GISEL-NEXT:    and w9, w10, #0x1
+; GISEL-NEXT:    umov.b w10, v0[14]
+; GISEL-NEXT:    orr w8, w8, w9, lsl #12
+; GISEL-NEXT:    and w9, w11, #0x1
+; GISEL-NEXT:    umov.b w11, v0[15]
+; GISEL-NEXT:    orr w8, w8, w9, lsl #13
+; GISEL-NEXT:    and w9, w10, #0x1
+; GISEL-NEXT:    orr w8, w8, w9, lsl #14
+; GISEL-NEXT:    and w9, w11, #0x1
+; GISEL-NEXT:    orr w8, w8, w9, lsl #15
+; GISEL-NEXT:    strh w8, [sp, #14]
+; GISEL-NEXT:    and w0, w8, #0xffff
+; GISEL-NEXT:    add sp, sp, #16
+; GISEL-NEXT:    ret
 
   %bitmask = bitcast <16 x i1> %vec to i16
   ret i16 %bitmask
@@ -282,16 +561,13 @@ define i16 @convert_to_bitmask_without_knowing_type(<16 x i1> %vec) {
 define i2 @convert_to_bitmask_2xi32(<2 x i32> %vec) {
 ; CHECK-LABEL: convert_to_bitmask_2xi32:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:  Lloh22:
 ; CHECK-NEXT:    adrp x8, lCPI11_0@PAGE
 ; CHECK-NEXT:    cmeq.2s v0, v0, #0
-; CHECK-NEXT:  Lloh23:
 ; CHECK-NEXT:    ldr d1, [x8, lCPI11_0@PAGEOFF]
 ; CHECK-NEXT:    bic.8b v0, v1, v0
 ; CHECK-NEXT:    addp.2s v0, v0, v0
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
-; CHECK-NEXT:    .loh AdrpLdr Lloh22, Lloh23
 
   %cmp_result = icmp ne <2 x i32> %vec, zeroinitializer
   %bitmask = bitcast <2 x i1> %cmp_result to i2
@@ -299,19 +575,51 @@ define i2 @convert_to_bitmask_2xi32(<2 x i32> %vec) {
 }
 
 define i4 @convert_to_bitmask_4xi8(<4 x i8> %vec) {
-; CHECK-LABEL: convert_to_bitmask_4xi8:
-; CHECK:       ; %bb.0:
-; CHECK-NEXT:    bic.4h v0, #255, lsl #8
-; CHECK-NEXT:  Lloh24:
-; CHECK-NEXT:    adrp x8, lCPI12_0@PAGE
-; CHECK-NEXT:  Lloh25:
-; CHECK-NEXT:    ldr d1, [x8, lCPI12_0@PAGEOFF]
-; CHECK-NEXT:    cmeq.4h v0, v0, #0
-; CHECK-NEXT:    bic.8b v0, v1, v0
-; CHECK-NEXT:    addv.4h h0, v0
-; CHECK-NEXT:    fmov w0, s0
-; CHECK-NEXT:    ret
-; CHECK-NEXT:    .loh AdrpLdr Lloh24, Lloh25
+; SDAG-LABEL: convert_to_bitmask_4xi8:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    bic.4h v0, #255, lsl #8
+; SDAG-NEXT:    adrp x8, lCPI12_0@PAGE
+; SDAG-NEXT:    ldr d1, [x8, lCPI12_0@PAGEOFF]
+; SDAG-NEXT:    cmeq.4h v0, v0, #0
+; SDAG-NEXT:    bic.8b v0, v1, v0
+; SDAG-NEXT:    addv.4h h0, v0
+; SDAG-NEXT:    fmov w0, s0
+; SDAG-NEXT:    ret
+;
+; GISEL-LABEL: convert_to_bitmask_4xi8:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    sub sp, sp, #16
+; GISEL-NEXT:    .cfi_def_cfa_offset 16
+; GISEL-NEXT:    mov w8, #0 ; =0x0
+; GISEL-NEXT:    uzp1.8b v0, v0, v0
+; GISEL-NEXT:    fmov s1, w8
+; GISEL-NEXT:    mov.b v1[1], w8
+; GISEL-NEXT:    mov.b v1[2], w8
+; GISEL-NEXT:    mov.b v1[3], w8
+; GISEL-NEXT:    cmeq.8b v0, v0, v1
+; GISEL-NEXT:    mvn.8b v0, v0
+; GISEL-NEXT:    umov.b w8, v0[0]
+; GISEL-NEXT:    umov.b w9, v0[1]
+; GISEL-NEXT:    mov.s v1[0], w8
+; GISEL-NEXT:    umov.b w8, v0[2]
+; GISEL-NEXT:    mov.s v1[1], w9
+; GISEL-NEXT:    umov.b w9, v0[3]
+; GISEL-NEXT:    mov.s v1[2], w8
+; GISEL-NEXT:    mov.s v1[3], w9
+; GISEL-NEXT:    mov.s w8, v1[1]
+; GISEL-NEXT:    mov.s w9, v1[2]
+; GISEL-NEXT:    fmov w11, s1
+; GISEL-NEXT:    mov.s w10, v1[3]
+; GISEL-NEXT:    and w8, w8, #0x1
+; GISEL-NEXT:    bfi w11, w8, #1, #31
+; GISEL-NEXT:    and w8, w9, #0x1
+; GISEL-NEXT:    and w9, w10, #0x1
+; GISEL-NEXT:    orr w8, w11, w8, lsl #2
+; GISEL-NEXT:    orr w8, w8, w9, lsl #3
+; GISEL-NEXT:    strb w8, [sp, #15]
+; GISEL-NEXT:    and w0, w8, #0xff
+; GISEL-NEXT:    add sp, sp, #16
+; GISEL-NEXT:    ret
 
   %cmp_result = icmp ne <4 x i8> %vec, zeroinitializer
   %bitmask = bitcast <4 x i1> %cmp_result to i4
@@ -322,17 +630,14 @@ define i8 @convert_to_bitmask_8xi2(<8 x i2> %vec) {
 ; CHECK-LABEL: convert_to_bitmask_8xi2:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    movi.8b v1, #3
-; CHECK-NEXT:  Lloh26:
 ; CHECK-NEXT:    adrp x8, lCPI13_0@PAGE
 ; CHECK-NEXT:    and.8b v0, v0, v1
-; CHECK-NEXT:  Lloh27:
 ; CHECK-NEXT:    ldr d1, [x8, lCPI13_0@PAGEOFF]
 ; CHECK-NEXT:    cmeq.8b v0, v0, #0
 ; CHECK-NEXT:    bic.8b v0, v1, v0
 ; CHECK-NEXT:    addv.8b b0, v0
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
-; CHECK-NEXT:    .loh AdrpLdr Lloh26, Lloh27
 
   %cmp_result = icmp ne <8 x i2> %vec, zeroinitializer
   %bitmask = bitcast <8 x i1> %cmp_result to i8
@@ -340,20 +645,39 @@ define i8 @convert_to_bitmask_8xi2(<8 x i2> %vec) {
 }
 
 define i4 @convert_to_bitmask_float(<4 x float> %vec) {
-; CHECK-LABEL: convert_to_bitmask_float:
-; CHECK:       ; %bb.0:
-; CHECK-NEXT:    fcmgt.4s v1, v0, #0.0
-; CHECK-NEXT:    fcmlt.4s v0, v0, #0.0
-; CHECK-NEXT:  Lloh28:
-; CHECK-NEXT:    adrp x8, lCPI14_0@PAGE
-; CHECK-NEXT:    orr.16b v0, v0, v1
-; CHECK-NEXT:  Lloh29:
-; CHECK-NEXT:    ldr q1, [x8, lCPI14_0@PAGEOFF]
-; CHECK-NEXT:    and.16b v0, v0, v1
-; CHECK-NEXT:    addv.4s s0, v0
-; CHECK-NEXT:    fmov w0, s0
-; CHECK-NEXT:    ret
-; CHECK-NEXT:    .loh AdrpLdr Lloh28, Lloh29
+; SDAG-LABEL: convert_to_bitmask_float:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    fcmgt.4s v1, v0, #0.0
+; SDAG-NEXT:    fcmlt.4s v0, v0, #0.0
+; SDAG-NEXT:    adrp x8, lCPI14_0@PAGE
+; SDAG-NEXT:    orr.16b v0, v0, v1
+; SDAG-NEXT:    ldr q1, [x8, lCPI14_0@PAGEOFF]
+; SDAG-NEXT:    and.16b v0, v0, v1
+; SDAG-NEXT:    addv.4s s0, v0
+; SDAG-NEXT:    fmov w0, s0
+; SDAG-NEXT:    ret
+;
+; GISEL-LABEL: convert_to_bitmask_float:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    sub sp, sp, #16
+; GISEL-NEXT:    .cfi_def_cfa_offset 16
+; GISEL-NEXT:    fcmgt.4s v1, v0, #0.0
+; GISEL-NEXT:    fcmlt.4s v0, v0, #0.0
+; GISEL-NEXT:    orr.16b v0, v0, v1
+; GISEL-NEXT:    mov.s w8, v0[1]
+; GISEL-NEXT:    mov.s w9, v0[2]
+; GISEL-NEXT:    fmov w11, s0
+; GISEL-NEXT:    mov.s w10, v0[3]
+; GISEL-NEXT:    and w8, w8, #0x1
+; GISEL-NEXT:    bfi w11, w8, #1, #31
+; GISEL-NEXT:    and w8, w9, #0x1
+; GISEL-NEXT:    and w9, w10, #0x1
+; GISEL-NEXT:    orr w8, w11, w8, lsl #2
+; GISEL-NEXT:    orr w8, w8, w9, lsl #3
+; GISEL-NEXT:    strb w8, [sp, #15]
+; GISEL-NEXT:    and w0, w8, #0xff
+; GISEL-NEXT:    add sp, sp, #16
+; GISEL-NEXT:    ret
 
 
   %cmp_result = fcmp one <4 x float> %vec, zeroinitializer
@@ -364,24 +688,58 @@ define i4 @convert_to_bitmask_float(<4 x float> %vec) {
 ; Larger vector types don't map directly, but the can be split/truncated and then converted.
 ; After the comparison against 0, this is truncated to <8 x i16>, which is valid again.
 define i8 @convert_large_vector(<8 x i32> %vec) {
-; CHECK-LABEL: convert_large_vector:
-; CHECK:       ; %bb.0:
-; CHECK-NEXT:    sub sp, sp, #16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    cmeq.4s v1, v1, #0
-; CHECK-NEXT:    cmeq.4s v0, v0, #0
-; CHECK-NEXT:  Lloh30:
-; CHECK-NEXT:    adrp x8, lCPI15_0@PAGE
-; CHECK-NEXT:    uzp1.8h v0, v0, v1
-; CHECK-NEXT:  Lloh31:
-; CHECK-NEXT:    ldr q1, [x8, lCPI15_0@PAGEOFF]
-; CHECK-NEXT:    bic.16b v0, v1, v0
-; CHECK-NEXT:    addv.8h h0, v0
-; CHECK-NEXT:    fmov w8, s0
-; CHECK-NEXT:    and w0, w8, #0xff
-; CHECK-NEXT:    add sp, sp, #16
-; CHECK-NEXT:    ret
-; CHECK-NEXT:    .loh AdrpLdr Lloh30, Lloh31
+; SDAG-LABEL: convert_large_vector:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    sub sp, sp, #16
+; SDAG-NEXT:    .cfi_def_cfa_offset 16
+; SDAG-NEXT:    cmeq.4s v1, v1, #0
+; SDAG-NEXT:    cmeq.4s v0, v0, #0
+; SDAG-NEXT:    adrp x8, lCPI15_0@PAGE
+; SDAG-NEXT:    uzp1.8h v0, v0, v1
+; SDAG-NEXT:    ldr q1, [x8, lCPI15_0@PAGEOFF]
+; SDAG-NEXT:    bic.16b v0, v1, v0
+; SDAG-NEXT:    addv.8h h0, v0
+; SDAG-NEXT:    fmov w8, s0
+; SDAG-NEXT:    and w0, w8, #0xff
+; SDAG-NEXT:    add sp, sp, #16
+; SDAG-NEXT:    ret
+;
+; GISEL-LABEL: convert_large_vector:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    sub sp, sp, #16
+; GISEL-NEXT:    .cfi_def_cfa_offset 16
+; GISEL-NEXT:    cmeq.4s v0, v0, #0
+; GISEL-NEXT:    cmeq.4s v1, v1, #0
+; GISEL-NEXT:    mvn.16b v0, v0
+; GISEL-NEXT:    mvn.16b v1, v1
+; GISEL-NEXT:    uzp1.8h v0, v0, v1
+; GISEL-NEXT:    xtn.8b v0, v0
+; GISEL-NEXT:    umov.b w8, v0[1]
+; GISEL-NEXT:    umov.b w9, v0[0]
+; GISEL-NEXT:    umov.b w10, v0[2]
+; GISEL-NEXT:    umov.b w11, v0[3]
+; GISEL-NEXT:    and w8, w8, #0x1
+; GISEL-NEXT:    bfi w9, w8, #1, #31
+; GISEL-NEXT:    and w8, w10, #0x1
+; GISEL-NEXT:    umov.b w10, v0[4]
+; GISEL-NEXT:    orr w8, w9, w8, lsl #2
+; GISEL-NEXT:    and w9, w11, #0x1
+; GISEL-NEXT:    umov.b w11, v0[5]
+; GISEL-NEXT:    orr w8, w8, w9, lsl #3
+; GISEL-NEXT:    and w9, w10, #0x1
+; GISEL-NEXT:    umov.b w10, v0[6]
+; GISEL-NEXT:    orr w8, w8, w9, lsl #4
+; GISEL-NEXT:    and w9, w11, #0x1
+; GISEL-NEXT:    umov.b w11, v0[7]
+; GISEL-NEXT:    orr w8, w8, w9, lsl #5
+; GISEL-NEXT:    and w9, w10, #0x1
+; GISEL-NEXT:    orr w8, w8, w9, lsl #6
+; GISEL-NEXT:    and w9, w11, #0x1
+; GISEL-NEXT:    orr w8, w8, w9, lsl #7
+; GISEL-NEXT:    strb w8, [sp, #15]
+; GISEL-NEXT:    and w0, w8, #0xff
+; GISEL-NEXT:    add sp, sp, #16
+; GISEL-NEXT:    ret
 
 
    %cmp_result = icmp ne <8 x i32> %vec, zeroinitializer
@@ -390,20 +748,40 @@ define i8 @convert_large_vector(<8 x i32> %vec) {
 }
 
 define i4 @convert_legalized_illegal_element_size(<4 x i22> %vec) {
-; CHECK-LABEL: convert_legalized_illegal_element_size:
-; CHECK:       ; %bb.0:
-; CHECK-NEXT:    movi.4s v1, #63, msl #16
-; CHECK-NEXT:  Lloh32:
-; CHECK-NEXT:    adrp x8, lCPI16_0@PAGE
-; CHECK-NEXT:    cmtst.4s v0, v0, v1
-; CHECK-NEXT:  Lloh33:
-; CHECK-NEXT:    ldr d1, [x8, lCPI16_0@PAGEOFF]
-; CHECK-NEXT:    xtn.4h v0, v0
-; CHECK-NEXT:    and.8b v0, v0, v1
-; CHECK-NEXT:    addv.4h h0, v0
-; CHECK-NEXT:    fmov w0, s0
-; CHECK-NEXT:    ret
-; CHECK-NEXT:    .loh AdrpLdr Lloh32, Lloh33
+; SDAG-LABEL: convert_legalized_illegal_element_size:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    movi.4s v1, #63, msl #16
+; SDAG-NEXT:    adrp x8, lCPI16_0@PAGE
+; SDAG-NEXT:    cmtst.4s v0, v0, v1
+; SDAG-NEXT:    ldr d1, [x8, lCPI16_0@PAGEOFF]
+; SDAG-NEXT:    xtn.4h v0, v0
+; SDAG-NEXT:    and.8b v0, v0, v1
+; SDAG-NEXT:    addv.4h h0, v0
+; SDAG-NEXT:    fmov w0, s0
+; SDAG-NEXT:    ret
+;
+; GISEL-LABEL: convert_legalized_illegal_element_size:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    sub sp, sp, #16
+; GISEL-NEXT:    .cfi_def_cfa_offset 16
+; GISEL-NEXT:    movi.4s v1, #63, msl #16
+; GISEL-NEXT:    and.16b v0, v0, v1
+; GISEL-NEXT:    cmeq.4s v0, v0, #0
+; GISEL-NEXT:    mvn.16b v0, v0
+; GISEL-NEXT:    mov.s w8, v0[1]
+; GISEL-NEXT:    mov.s w9, v0[2]
+; GISEL-NEXT:    fmov w11, s0
+; GISEL-NEXT:    mov.s w10, v0[3]
+; GISEL-NEXT:    and w8, w8, #0x1
+; GISEL-NEXT:    bfi w11, w8, #1, #31
+; GISEL-NEXT:    and w8, w9, #0x1
+; GISEL-NEXT:    and w9, w10, #0x1
+; GISEL-NEXT:    orr w8, w11, w8, lsl #2
+; GISEL-NEXT:    orr w8, w8, w9, lsl #3
+; GISEL-NEXT:    strb w8, [sp, #15]
+; GISEL-NEXT:    and w0, w8, #0xff
+; GISEL-NEXT:    add sp, sp, #16
+; GISEL-NEXT:    ret
 
   %cmp_result = icmp ne <4 x i22> %vec, zeroinitializer
   %bitmask = bitcast <4 x i1> %cmp_result to i4
@@ -415,7 +793,6 @@ define i8 @no_direct_convert_for_bad_concat(<4 x i32> %vec) {
 ; CHECK-LABEL: no_direct_convert_for_bad_concat:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    cmtst.4s v0, v0, v0
-; CHECK-NEXT:  Lloh34:
 ; CHECK-NEXT:    adrp x8, lCPI17_0@PAGE
 ; CHECK-NEXT:    xtn.4h v0, v0
 ; CHECK-NEXT:    umov.h w9, v0[0]
@@ -427,14 +804,12 @@ define i8 @no_direct_convert_for_bad_concat(<4 x i32> %vec) {
 ; CHECK-NEXT:    umov.h w9, v0[3]
 ; CHECK-NEXT:    mov.b v1[7], w9
 ; CHECK-NEXT:    shl.8b v0, v1, #7
-; CHECK-NEXT:  Lloh35:
 ; CHECK-NEXT:    ldr d1, [x8, lCPI17_0@PAGEOFF]
 ; CHECK-NEXT:    cmlt.8b v0, v0, #0
 ; CHECK-NEXT:    and.8b v0, v0, v1
 ; CHECK-NEXT:    addv.8b b0, v0
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
-; CHECK-NEXT:    .loh AdrpLdr Lloh34, Lloh35
 
   %cmp_result = icmp ne <4 x i32> %vec, zeroinitializer
   %vector_pad = shufflevector <4 x i1> poison, <4 x i1> %cmp_result, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 4, i32 5, i32 6, i32 7>
@@ -443,47 +818,101 @@ define i8 @no_direct_convert_for_bad_concat(<4 x i32> %vec) {
 }
 
 define <8 x i1> @no_convert_without_direct_bitcast(<8 x i16> %vec) {
-; CHECK-LABEL: no_convert_without_direct_bitcast:
-; CHECK:       ; %bb.0:
-; CHECK-NEXT:    cmtst.8h v0, v0, v0
-; CHECK-NEXT:    xtn.8b v0, v0
-; CHECK-NEXT:    ret
+; SDAG-LABEL: no_convert_without_direct_bitcast:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    cmtst.8h v0, v0, v0
+; SDAG-NEXT:    xtn.8b v0, v0
+; SDAG-NEXT:    ret
+;
+; GISEL-LABEL: no_convert_without_direct_bitcast:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    cmeq.8h v0, v0, #0
+; GISEL-NEXT:    mvn.16b v0, v0
+; GISEL-NEXT:    xtn.8b v0, v0
+; GISEL-NEXT:    ret
 
    %cmp_result = icmp ne <8 x i16> %vec, zeroinitializer
    ret <8 x i1> %cmp_result
 }
 
 define i6 @no_combine_illegal_num_elements(<6 x i32> %vec) {
-; CHECK-LABEL: no_combine_illegal_num_elements:
-; CHECK:       ; %bb.0:
-; CHECK-NEXT:    sub sp, sp, #16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    fmov s0, w0
-; CHECK-NEXT:    fmov s1, w4
-; CHECK-NEXT:    mov.s v0[1], w1
-; CHECK-NEXT:    mov.s v1[1], w5
-; CHECK-NEXT:    mov.s v0[2], w2
-; CHECK-NEXT:    cmeq.4s v1, v1, #0
-; CHECK-NEXT:    mov.s v0[3], w3
-; CHECK-NEXT:    cmeq.4s v0, v0, #0
-; CHECK-NEXT:    uzp1.8h v0, v0, v1
-; CHECK-NEXT:    mvn.16b v0, v0
-; CHECK-NEXT:    xtn.8b v0, v0
-; CHECK-NEXT:    umov.b w8, v0[0]
-; CHECK-NEXT:    umov.b w9, v0[1]
-; CHECK-NEXT:    umov.b w10, v0[2]
-; CHECK-NEXT:    and w8, w8, #0x1
-; CHECK-NEXT:    bfi w8, w9, #1, #1
-; CHECK-NEXT:    umov.b w9, v0[3]
-; CHECK-NEXT:    bfi w8, w10, #2, #1
-; CHECK-NEXT:    umov.b w10, v0[4]
-; CHECK-NEXT:    bfi w8, w9, #3, #1
-; CHECK-NEXT:    umov.b w9, v0[5]
-; CHECK-NEXT:    bfi w8, w10, #4, #1
-; CHECK-NEXT:    orr w8, w8, w9, lsl #5
-; CHECK-NEXT:    and w0, w8, #0x3f
-; CHECK-NEXT:    add sp, sp, #16
-; CHECK-NEXT:    ret
+; SDAG-LABEL: no_combine_illegal_num_elements:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    sub sp, sp, #16
+; SDAG-NEXT:    .cfi_def_cfa_offset 16
+; SDAG-NEXT:    fmov s0, w0
+; SDAG-NEXT:    fmov s1, w4
+; SDAG-NEXT:    mov.s v0[1], w1
+; SDAG-NEXT:    mov.s v1[1], w5
+; SDAG-NEXT:    mov.s v0[2], w2
+; SDAG-NEXT:    cmeq.4s v1, v1, #0
+; SDAG-NEXT:    mov.s v0[3], w3
+; SDAG-NEXT:    cmeq.4s v0, v0, #0
+; SDAG-NEXT:    uzp1.8h v0, v0, v1
+; SDAG-NEXT:    mvn.16b v0, v0
+; SDAG-NEXT:    xtn.8b v0, v0
+; SDAG-NEXT:    umov.b w8, v0[0]
+; SDAG-NEXT:    umov.b w9, v0[1]
+; SDAG-NEXT:    umov.b w10, v0[2]
+; SDAG-NEXT:    and w8, w8, #0x1
+; SDAG-NEXT:    bfi w8, w9, #1, #1
+; SDAG-NEXT:    umov.b w9, v0[3]
+; SDAG-NEXT:    bfi w8, w10, #2, #1
+; SDAG-NEXT:    umov.b w10, v0[4]
+; SDAG-NEXT:    bfi w8, w9, #3, #1
+; SDAG-NEXT:    umov.b w9, v0[5]
+; SDAG-NEXT:    bfi w8, w10, #4, #1
+; SDAG-NEXT:    orr w8, w8, w9, lsl #5
+; SDAG-NEXT:    and w0, w8, #0x3f
+; SDAG-NEXT:    add sp, sp, #16
+; SDAG-NEXT:    ret
+;
+; GISEL-LABEL: no_combine_illegal_num_elements:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    sub sp, sp, #16
+; GISEL-NEXT:    .cfi_def_cfa_offset 16
+; GISEL-NEXT:    mov.s v0[0], w0
+; GISEL-NEXT:    mov.s v1[0], w4
+; GISEL-NEXT:    mov.s v2[0], wzr
+; GISEL-NEXT:    mov.s v0[1], w1
+; GISEL-NEXT:    mov.s v1[1], w5
+; GISEL-NEXT:    mov.s v2[1], wzr
+; GISEL-NEXT:    mov.s v0[2], w2
+; GISEL-NEXT:    cmeq.4s v1, v1, v2
+; GISEL-NEXT:    mvn.16b v1, v1
+; GISEL-NEXT:    mov.s v0[3], w3
+; GISEL-NEXT:    cmeq.4s v0, v0, #0
+; GISEL-NEXT:    mvn.16b v0, v0
+; GISEL-NEXT:    mov.s w8, v0[1]
+; GISEL-NEXT:    mov.s w9, v0[2]
+; GISEL-NEXT:    mov.s w10, v0[3]
+; GISEL-NEXT:    mov.h v0[1], w8
+; GISEL-NEXT:    mov.s w8, v1[1]
+; GISEL-NEXT:    mov.h v0[2], w9
+; GISEL-NEXT:    mov.h v0[3], w10
+; GISEL-NEXT:    mov.h v0[4], v1[0]
+; GISEL-NEXT:    mov.h v0[5], w8
+; GISEL-NEXT:    umov.h w8, v0[1]
+; GISEL-NEXT:    umov.h w9, v0[0]
+; GISEL-NEXT:    umov.h w10, v0[2]
+; GISEL-NEXT:    umov.h w11, v0[3]
+; GISEL-NEXT:    and w8, w8, #0x1
+; GISEL-NEXT:    bfi w9, w8, #1, #31
+; GISEL-NEXT:    and w8, w10, #0x1
+; GISEL-NEXT:    umov.h w10, v0[4]
+; GISEL-NEXT:    orr w8, w9, w8, lsl #2
+; GISEL-NEXT:    and w9, w11, #0x1
+; GISEL-NEXT:    umov.h w11, v0[5]
+; GISEL-NEXT:    orr w8, w8, w9, lsl #3
+; GISEL-NEXT:    and w9, w10, #0x1
+; GISEL-NEXT:    orr w8, w8, w9, lsl #4
+; GISEL-NEXT:    and w9, w11, #0x1
+; GISEL-NEXT:    orr w8, w8, w9, lsl #5
+; GISEL-NEXT:    and w8, w8, #0x3f
+; GISEL-NEXT:    strb w8, [sp, #15]
+; GISEL-NEXT:    and w0, w8, #0xff
+; GISEL-NEXT:    add sp, sp, #16
+; GISEL-NEXT:    ret
 
   %cmp_result = icmp ne <6 x i32> %vec, zeroinitializer
   %bitmask = bitcast <6 x i1> %cmp_result to i6
@@ -492,28 +921,220 @@ define i6 @no_combine_illegal_num_elements(<6 x i32> %vec) {
 
 ; Only apply the combine when casting a vector to a scalar.
 define <2 x i8> @vector_to_vector_cast(<16 x i1> %arg) nounwind {
-; CHECK-LABEL: vector_to_vector_cast:
-; CHECK:       ; %bb.0:
-; CHECK-NEXT:    sub sp, sp, #16
-; CHECK-NEXT:    shl.16b v0, v0, #7
-; CHECK-NEXT:  Lloh36:
-; CHECK-NEXT:    adrp x8, lCPI20_0@PAGE
-; CHECK-NEXT:  Lloh37:
-; CHECK-NEXT:    ldr q1, [x8, lCPI20_0@PAGEOFF]
-; CHECK-NEXT:    add x8, sp, #14
-; CHECK-NEXT:    cmlt.16b v0, v0, #0
-; CHECK-NEXT:    and.16b v0, v0, v1
-; CHECK-NEXT:    ext.16b v1, v0, v0, #8
-; CHECK-NEXT:    zip1.16b v0, v0, v1
-; CHECK-NEXT:    addv.8h h0, v0
-; CHECK-NEXT:    str h0, [sp, #14]
-; CHECK-NEXT:    ld1.b { v0 }[0], [x8]
-; CHECK-NEXT:    orr x8, x8, #0x1
-; CHECK-NEXT:    ld1.b { v0 }[4], [x8]
-; CHECK-NEXT:    ; kill: def $d0 killed $d0 killed $q0
-; CHECK-NEXT:    add sp, sp, #16
-; CHECK-NEXT:    ret
-; CHECK-NEXT:    .loh AdrpLdr Lloh36, Lloh37
+; SDAG-LABEL: vector_to_vector_cast:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    sub sp, sp, #16
+; SDAG-NEXT:    shl.16b v0, v0, #7
+; SDAG-NEXT:    adrp x8, lCPI20_0@PAGE
+; SDAG-NEXT:    ldr q1, [x8, lCPI20_0@PAGEOFF]
+; SDAG-NEXT:    add x8, sp, #14
+; SDAG-NEXT:    cmlt.16b v0, v0, #0
+; SDAG-NEXT:    and.16b v0, v0, v1
+; SDAG-NEXT:    ext.16b v1, v0, v0, #8
+; SDAG-NEXT:    zip1.16b v0, v0, v1
+; SDAG-NEXT:    addv.8h h0, v0
+; SDAG-NEXT:    str h0, [sp, #14]
+; SDAG-NEXT:    ld1.b { v0 }[0], [x8]
+; SDAG-NEXT:    orr x8, x8, #0x1
+; SDAG-NEXT:    ld1.b { v0 }[4], [x8]
+; SDAG-NEXT:    ; kill: def $d0 killed $d0 killed $q0
+; SDAG-NEXT:    add sp, sp, #16
+; SDAG-NEXT:    ret
+;
+; GISEL-LABEL: vector_to_vector_cast:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    sub sp, sp, #16
+; GISEL-NEXT:    umov.b w8, v0[1]
+; GISEL-NEXT:    mov d1, v0[1]
+; GISEL-NEXT:    umov.b w10, v0[1]
+; GISEL-NEXT:    umov.b w9, v0[0]
+; GISEL-NEXT:    umov.b w13, v0[0]
+; GISEL-NEXT:    umov.b w14, v0[2]
+; GISEL-NEXT:    umov.b w15, v0[3]
+; GISEL-NEXT:    umov.b w11, v0[2]
+; GISEL-NEXT:    umov.b w16, v0[4]
+; GISEL-NEXT:    umov.b w17, v0[5]
+; GISEL-NEXT:    umov.b w12, v0[3]
+; GISEL-NEXT:    and w8, w8, #0x1
+; GISEL-NEXT:    and w10, w10, #0x1
+; GISEL-NEXT:    umov.b w0, v1[1]
+; GISEL-NEXT:    bfi w9, w8, #1, #31
+; GISEL-NEXT:    bfi w13, w10, #1, #31
+; GISEL-NEXT:    and w14, w14, #0x1
+; GISEL-NEXT:    umov.b w8, v1[0]
+; GISEL-NEXT:    umov.b w10, v1[2]
+; GISEL-NEXT:    and w15, w15, #0x1
+; GISEL-NEXT:    orr w13, w13, w14, lsl #2
+; GISEL-NEXT:    umov.b w14, v1[3]
+; GISEL-NEXT:    and w11, w11, #0x1
+; GISEL-NEXT:    and w0, w0, #0x1
+; GISEL-NEXT:    and w16, w16, #0x1
+; GISEL-NEXT:    orr w9, w9, w11, lsl #2
+; GISEL-NEXT:    orr w13, w13, w15, lsl #3
+; GISEL-NEXT:    umov.b w15, v1[4]
+; GISEL-NEXT:    umov.b w11, v0[6]
+; GISEL-NEXT:    bfi w8, w0, #1, #31
+; GISEL-NEXT:    and w10, w10, #0x1
+; GISEL-NEXT:    and w17, w17, #0x1
+; GISEL-NEXT:    orr w13, w13, w16, lsl #4
+; GISEL-NEXT:    and w14, w14, #0x1
+; GISEL-NEXT:    umov.b w0, v0[7]
+; GISEL-NEXT:    orr w8, w8, w10, lsl #2
+; GISEL-NEXT:    umov.b w10, v1[5]
+; GISEL-NEXT:    umov.b w16, v1[6]
+; GISEL-NEXT:    orr w13, w13, w17, lsl #5
+; GISEL-NEXT:    umov.b w17, v0[4]
+; GISEL-NEXT:    and w15, w15, #0x1
+; GISEL-NEXT:    orr w8, w8, w14, lsl #3
+; GISEL-NEXT:    and w12, w12, #0x1
+; GISEL-NEXT:    and w11, w11, #0x1
+; GISEL-NEXT:    umov.b w14, v1[7]
+; GISEL-NEXT:    orr w9, w9, w12, lsl #3
+; GISEL-NEXT:    orr w11, w13, w11, lsl #6
+; GISEL-NEXT:    orr w8, w8, w15, lsl #4
+; GISEL-NEXT:    umov.b w15, v0[5]
+; GISEL-NEXT:    and w10, w10, #0x1
+; GISEL-NEXT:    and w0, w0, #0x1
+; GISEL-NEXT:    and w12, w17, #0x1
+; GISEL-NEXT:    umov.b w13, v0[1]
+; GISEL-NEXT:    orr w8, w8, w10, lsl #5
+; GISEL-NEXT:    and w16, w16, #0x1
+; GISEL-NEXT:    orr w9, w9, w12, lsl #4
+; GISEL-NEXT:    umov.b w10, v0[0]
+; GISEL-NEXT:    orr w11, w11, w0, lsl #7
+; GISEL-NEXT:    and w14, w14, #0x1
+; GISEL-NEXT:    and w12, w15, #0x1
+; GISEL-NEXT:    umov.b w15, v0[2]
+; GISEL-NEXT:    orr w8, w8, w16, lsl #6
+; GISEL-NEXT:    orr w9, w9, w12, lsl #5
+; GISEL-NEXT:    umov.b w12, v0[6]
+; GISEL-NEXT:    strb w11, [sp, #8]
+; GISEL-NEXT:    and w11, w13, #0x1
+; GISEL-NEXT:    umov.b w13, v0[3]
+; GISEL-NEXT:    orr w8, w8, w14, lsl #7
+; GISEL-NEXT:    umov.b w14, v0[7]
+; GISEL-NEXT:    ldr b0, [sp, #8]
+; GISEL-NEXT:    bfi w10, w11, #1, #31
+; GISEL-NEXT:    and w11, w15, #0x1
+; GISEL-NEXT:    strb w8, [sp, #9]
+; GISEL-NEXT:    umov.b w15, v0[4]
+; GISEL-NEXT:    and w8, w12, #0x1
+; GISEL-NEXT:    orr w10, w10, w11, lsl #2
+; GISEL-NEXT:    orr w8, w9, w8, lsl #6
+; GISEL-NEXT:    and w9, w13, #0x1
+; GISEL-NEXT:    umov.b w11, v0[1]
+; GISEL-NEXT:    orr w9, w10, w9, lsl #3
+; GISEL-NEXT:    umov.b w10, v0[5]
+; GISEL-NEXT:    umov.b w12, v0[0]
+; GISEL-NEXT:    and w13, w14, #0x1
+; GISEL-NEXT:    umov.b w16, v0[2]
+; GISEL-NEXT:    umov.b w17, v0[3]
+; GISEL-NEXT:    and w14, w15, #0x1
+; GISEL-NEXT:    umov.b w15, v0[2]
+; GISEL-NEXT:    orr w8, w8, w13, lsl #7
+; GISEL-NEXT:    orr w9, w9, w14, lsl #4
+; GISEL-NEXT:    umov.b w13, v0[6]
+; GISEL-NEXT:    and w11, w11, #0x1
+; GISEL-NEXT:    umov.b w14, v0[3]
+; GISEL-NEXT:    strb w8, [sp, #10]
+; GISEL-NEXT:    and w8, w10, #0x1
+; GISEL-NEXT:    bfi w12, w11, #1, #31
+; GISEL-NEXT:    orr w8, w9, w8, lsl #5
+; GISEL-NEXT:    umov.b w10, v0[4]
+; GISEL-NEXT:    and w9, w15, #0x1
+; GISEL-NEXT:    umov.b w11, v0[7]
+; GISEL-NEXT:    umov.b w15, v0[1]
+; GISEL-NEXT:    orr w9, w12, w9, lsl #2
+; GISEL-NEXT:    umov.b w12, v0[5]
+; GISEL-NEXT:    and w13, w13, #0x1
+; GISEL-NEXT:    and w14, w14, #0x1
+; GISEL-NEXT:    orr w8, w8, w13, lsl #6
+; GISEL-NEXT:    umov.b w13, v0[0]
+; GISEL-NEXT:    orr w9, w9, w14, lsl #3
+; GISEL-NEXT:    and w10, w10, #0x1
+; GISEL-NEXT:    umov.b w14, v0[6]
+; GISEL-NEXT:    and w11, w11, #0x1
+; GISEL-NEXT:    and w15, w15, #0x1
+; GISEL-NEXT:    umov.b w0, v0[3]
+; GISEL-NEXT:    orr w9, w9, w10, lsl #4
+; GISEL-NEXT:    and w10, w12, #0x1
+; GISEL-NEXT:    umov.b w12, v0[7]
+; GISEL-NEXT:    orr w8, w8, w11, lsl #7
+; GISEL-NEXT:    bfi w13, w15, #1, #31
+; GISEL-NEXT:    and w11, w16, #0x1
+; GISEL-NEXT:    orr w9, w9, w10, lsl #5
+; GISEL-NEXT:    and w10, w14, #0x1
+; GISEL-NEXT:    umov.b w14, v0[4]
+; GISEL-NEXT:    strb w8, [sp, #11]
+; GISEL-NEXT:    umov.b w15, v0[1]
+; GISEL-NEXT:    umov.b w16, v0[3]
+; GISEL-NEXT:    orr w8, w9, w10, lsl #6
+; GISEL-NEXT:    orr w9, w13, w11, lsl #2
+; GISEL-NEXT:    and w10, w12, #0x1
+; GISEL-NEXT:    and w11, w17, #0x1
+; GISEL-NEXT:    umov.b w12, v0[5]
+; GISEL-NEXT:    umov.b w17, v0[0]
+; GISEL-NEXT:    orr w8, w8, w10, lsl #7
+; GISEL-NEXT:    orr w9, w9, w11, lsl #3
+; GISEL-NEXT:    umov.b w10, v0[1]
+; GISEL-NEXT:    and w11, w14, #0x1
+; GISEL-NEXT:    umov.b w14, v0[0]
+; GISEL-NEXT:    and w15, w15, #0x1
+; GISEL-NEXT:    orr w9, w9, w11, lsl #4
+; GISEL-NEXT:    umov.b w11, v0[2]
+; GISEL-NEXT:    umov.b w13, v0[6]
+; GISEL-NEXT:    and w12, w12, #0x1
+; GISEL-NEXT:    bfi w17, w15, #1, #31
+; GISEL-NEXT:    umov.b w15, v0[5]
+; GISEL-NEXT:    orr w9, w9, w12, lsl #5
+; GISEL-NEXT:    and w10, w10, #0x1
+; GISEL-NEXT:    umov.b w12, v0[2]
+; GISEL-NEXT:    bfi w14, w10, #1, #31
+; GISEL-NEXT:    umov.b w10, v0[4]
+; GISEL-NEXT:    ldr b1, [sp, #9]
+; GISEL-NEXT:    and w11, w11, #0x1
+; GISEL-NEXT:    and w13, w13, #0x1
+; GISEL-NEXT:    strb w8, [sp, #12]
+; GISEL-NEXT:    orr w11, w14, w11, lsl #2
+; GISEL-NEXT:    and w14, w16, #0x1
+; GISEL-NEXT:    umov.b w16, v0[4]
+; GISEL-NEXT:    and w12, w12, #0x1
+; GISEL-NEXT:    and w15, w15, #0x1
+; GISEL-NEXT:    orr w9, w9, w13, lsl #6
+; GISEL-NEXT:    orr w11, w11, w14, lsl #3
+; GISEL-NEXT:    orr w12, w17, w12, lsl #2
+; GISEL-NEXT:    and w10, w10, #0x1
+; GISEL-NEXT:    and w17, w0, #0x1
+; GISEL-NEXT:    umov.b w0, v0[5]
+; GISEL-NEXT:    umov.b w14, v0[6]
+; GISEL-NEXT:    orr w10, w11, w10, lsl #4
+; GISEL-NEXT:    orr w12, w12, w17, lsl #3
+; GISEL-NEXT:    umov.b w11, v0[7]
+; GISEL-NEXT:    and w16, w16, #0x1
+; GISEL-NEXT:    umov.b w17, v0[6]
+; GISEL-NEXT:    orr w10, w10, w15, lsl #5
+; GISEL-NEXT:    umov.b w15, v0[7]
+; GISEL-NEXT:    orr w12, w12, w16, lsl #4
+; GISEL-NEXT:    and w16, w0, #0x1
+; GISEL-NEXT:    umov.b w0, v0[7]
+; GISEL-NEXT:    and w14, w14, #0x1
+; GISEL-NEXT:    orr w12, w12, w16, lsl #5
+; GISEL-NEXT:    orr w10, w10, w14, lsl #6
+; GISEL-NEXT:    and w11, w11, #0x1
+; GISEL-NEXT:    and w13, w17, #0x1
+; GISEL-NEXT:    orr w9, w9, w11, lsl #7
+; GISEL-NEXT:    mov.s v0[1], v1[0]
+; GISEL-NEXT:    orr w11, w12, w13, lsl #6
+; GISEL-NEXT:    and w12, w15, #0x1
+; GISEL-NEXT:    ; kill: def $d0 killed $d0 killed $q0
+; GISEL-NEXT:    orr w8, w10, w12, lsl #7
+; GISEL-NEXT:    and w10, w0, #0x1
+; GISEL-NEXT:    strb w9, [sp, #13]
+; GISEL-NEXT:    orr w9, w11, w10, lsl #7
+; GISEL-NEXT:    strb w8, [sp, #14]
+; GISEL-NEXT:    strb w9, [sp, #15]
+; GISEL-NEXT:    add sp, sp, #16
+; GISEL-NEXT:    ret
   %bc = bitcast <16 x i1> %arg to <2 x i8>
   ret <2 x i8> %bc
 }
diff --git a/llvm/test/CodeGen/AArch64/vecreduce-add.ll b/llvm/test/CodeGen/AArch64/vecreduce-add.ll
index 8473f45..5d6b523 100644
--- a/llvm/test/CodeGen/AArch64/vecreduce-add.ll
+++ b/llvm/test/CodeGen/AArch64/vecreduce-add.ll
@@ -72,6 +72,24 @@ entry:
   ret i64 %z
 }
 
+define i64 @add_v4i32_v4i64_zsext(<4 x i32> %xi) {
+; CHECK-LABEL: add_v4i32_v4i64_zsext:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ushll v1.2d, v0.2s, #0
+; CHECK-NEXT:    saddw2 v0.2d, v1.2d, v0.4s
+; CHECK-NEXT:    addp d0, v0.2d
+; CHECK-NEXT:    fmov x0, d0
+; CHECK-NEXT:    ret
+entry:
+  %x = shufflevector <4 x i32> %xi, <4 x i32> %xi, <2 x i32> <i32 0, i32 1>
+  %y = shufflevector <4 x i32> %xi, <4 x i32> %xi, <2 x i32> <i32 2, i32 3>
+  %xx = zext <2 x i32> %x to <2 x i64>
+  %yy = sext <2 x i32> %y to <2 x i64>
+  %zz = add <2 x i64> %xx, %yy
+  %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %zz)
+  ret i64 %z
+}
+
 define i64 @add_v2i32_v2i64_zext(<2 x i32> %x) {
 ; CHECK-LABEL: add_v2i32_v2i64_zext:
 ; CHECK:       // %bb.0: // %entry
diff --git a/llvm/test/CodeGen/AArch64/zeroing-forms-abs-neg.ll b/llvm/test/CodeGen/AArch64/zeroing-forms-abs-neg.ll
index 1caee99..510d457 100644
--- a/llvm/test/CodeGen/AArch64/zeroing-forms-abs-neg.ll
+++ b/llvm/test/CodeGen/AArch64/zeroing-forms-abs-neg.ll
@@ -18,7 +18,7 @@ define <vscale x 2 x double> @test_svabs_f64_x_1(<vscale x 2 x i1> %pg, <vscale
 ; CHECK-2p2-NEXT:    fabs z0.d, p0/z, z0.d
 ; CHECK-2p2-NEXT:    ret
 entry:
-  %0 = tail call <vscale x 2 x double> @llvm.aarch64.sve.fabs.nxv2f64(<vscale x 2 x double> undef, <vscale x 2 x i1> %pg, <vscale x 2 x double> %x)
+  %0 = tail call <vscale x 2 x double> @llvm.aarch64.sve.fabs.nxv2f64(<vscale x 2 x double> poison, <vscale x 2 x i1> %pg, <vscale x 2 x double> %x)
   ret <vscale x 2 x double> %0
 }
 
@@ -34,7 +34,7 @@ define <vscale x 2 x double> @test_svabs_f64_x_2(<vscale x 2 x i1> %pg, double %
 ; CHECK-2p2-NEXT:    fabs z0.d, p0/z, z1.d
 ; CHECK-2p2-NEXT:    ret
 entry:
-  %0 = tail call <vscale x 2 x double> @llvm.aarch64.sve.fabs.nxv2f64(<vscale x 2 x double> undef, <vscale x 2 x i1> %pg, <vscale x 2 x double> %x)
+  %0 = tail call <vscale x 2 x double> @llvm.aarch64.sve.fabs.nxv2f64(<vscale x 2 x double> poison, <vscale x 2 x i1> %pg, <vscale x 2 x double> %x)
   ret <vscale x 2 x double> %0
 }
 
@@ -65,7 +65,7 @@ define <vscale x 4 x float> @test_svabs_f32_x_1(<vscale x 4 x i1> %pg, <vscale x
 ; CHECK-2p2-NEXT:    fabs z0.s, p0/z, z0.s
 ; CHECK-2p2-NEXT:    ret
 entry:
-  %0 = tail call <vscale x 4 x float> @llvm.aarch64.sve.fabs.nxv4f32(<vscale x 4 x float> undef, <vscale x 4 x i1> %pg, <vscale x 4 x float> %x)
+  %0 = tail call <vscale x 4 x float> @llvm.aarch64.sve.fabs.nxv4f32(<vscale x 4 x float> poison, <vscale x 4 x i1> %pg, <vscale x 4 x float> %x)
   ret <vscale x 4 x float> %0
 }
 
@@ -81,7 +81,7 @@ define <vscale x 4 x float> @test_svabs_f32_x_2(<vscale x 4 x i1> %pg, double %z
 ; CHECK-2p2-NEXT:    fabs z0.s, p0/z, z1.s
 ; CHECK-2p2-NEXT:    ret
 entry:
-  %0 = tail call <vscale x 4 x float> @llvm.aarch64.sve.fabs.nxv4f32(<vscale x 4 x float> undef, <vscale x 4 x i1> %pg, <vscale x 4 x float> %x)
+  %0 = tail call <vscale x 4 x float> @llvm.aarch64.sve.fabs.nxv4f32(<vscale x 4 x float> poison, <vscale x 4 x i1> %pg, <vscale x 4 x float> %x)
   ret <vscale x 4 x float> %0
 }
 
@@ -112,7 +112,7 @@ define <vscale x 8 x half> @test_svabs_f16_x_1(<vscale x 8 x i1> %pg, <vscale x
 ; CHECK-2p2-NEXT:    fabs z0.h, p0/z, z0.h
 ; CHECK-2p2-NEXT:    ret
 entry:
-  %0 = tail call <vscale x 8 x half> @llvm.aarch64.sve.fabs.nxv8f16(<vscale x 8 x half> undef, <vscale x 8 x i1> %pg, <vscale x 8 x half> %x)
+  %0 = tail call <vscale x 8 x half> @llvm.aarch64.sve.fabs.nxv8f16(<vscale x 8 x half> poison, <vscale x 8 x i1> %pg, <vscale x 8 x half> %x)
   ret <vscale x 8 x half> %0
 }
 
@@ -128,7 +128,7 @@ define <vscale x 8 x half> @test_svabs_f16_x_2(<vscale x 8 x i1> %pg, double %z0
 ; CHECK-2p2-NEXT:    fabs z0.h, p0/z, z1.h
 ; CHECK-2p2-NEXT:    ret
 entry:
-  %0 = tail call <vscale x 8 x half> @llvm.aarch64.sve.fabs.nxv8f16(<vscale x 8 x half> undef, <vscale x 8 x i1> %pg, <vscale x 8 x half> %x)
+  %0 = tail call <vscale x 8 x half> @llvm.aarch64.sve.fabs.nxv8f16(<vscale x 8 x half> poison, <vscale x 8 x i1> %pg, <vscale x 8 x half> %x)
   ret <vscale x 8 x half> %0
 }
 
@@ -159,7 +159,7 @@ define <vscale x 16 x i8> @test_svabs_s8_x_1(<vscale x 16 x i1> %pg, <vscale x 1
 ; CHECK-2p2-NEXT:    abs z0.b, p0/z, z0.b
 ; CHECK-2p2-NEXT:    ret
 entry:
-  %0 = tail call <vscale x 16 x i8> @llvm.aarch64.sve.abs.nxv16i8(<vscale x 16 x i8> undef, <vscale x 16 x i1> %pg, <vscale x 16 x i8> %x)
+  %0 = tail call <vscale x 16 x i8> @llvm.aarch64.sve.abs.nxv16i8(<vscale x 16 x i8> poison, <vscale x 16 x i1> %pg, <vscale x 16 x i8> %x)
   ret <vscale x 16 x i8> %0
 }
 
@@ -175,8 +175,8 @@ define <vscale x 16 x i8> @test_svabs_s8_x_2(<vscale x 16 x i1> %pg, double %z0,
 ; CHECK-2p2-NEXT:    abs z0.b, p0/z, z1.b
 ; CHECK-2p2-NEXT:    ret
 entry:
-  %1 = tail call <vscale x 16 x i8> @llvm.aarch64.sve.abs.nxv16i8(<vscale x 16 x i8> undef, <vscale x 16 x i1> %pg, <vscale x 16 x i8> %x)
-  ret <vscale x 16 x i8> %1
+  %0 = tail call <vscale x 16 x i8> @llvm.aarch64.sve.abs.nxv16i8(<vscale x 16 x i8> poison, <vscale x 16 x i1> %pg, <vscale x 16 x i8> %x)
+  ret <vscale x 16 x i8> %0
 }
 
 define <vscale x 16 x i8> @test_svabs_s8_z(<vscale x 16 x i1> %pg, double %z0, <vscale x 16 x i8> %x) {
@@ -191,8 +191,8 @@ define <vscale x 16 x i8> @test_svabs_s8_z(<vscale x 16 x i1> %pg, double %z0, <
 ; CHECK-2p2-NEXT:    abs z0.b, p0/z, z1.b
 ; CHECK-2p2-NEXT:    ret
 entry:
-  %1 = tail call <vscale x 16 x i8> @llvm.aarch64.sve.abs.nxv16i8(<vscale x 16 x i8> zeroinitializer, <vscale x 16 x i1> %pg, <vscale x 16 x i8> %x)
-  ret <vscale x 16 x i8> %1
+  %0 = tail call <vscale x 16 x i8> @llvm.aarch64.sve.abs.nxv16i8(<vscale x 16 x i8> zeroinitializer, <vscale x 16 x i1> %pg, <vscale x 16 x i8> %x)
+  ret <vscale x 16 x i8> %0
 }
 
 define <vscale x 8 x i16> @test_svabs_s16_x_1(<vscale x 8 x i1> %pg, <vscale x 8 x i16> %x) {
@@ -206,7 +206,7 @@ define <vscale x 8 x i16> @test_svabs_s16_x_1(<vscale x 8 x i1> %pg, <vscale x 8
 ; CHECK-2p2-NEXT:    abs z0.h, p0/z, z0.h
 ; CHECK-2p2-NEXT:    ret
 entry:
-  %0 = tail call <vscale x 8 x i16> @llvm.aarch64.sve.abs.nxv8i16(<vscale x 8 x i16> undef, <vscale x 8 x i1> %pg, <vscale x 8 x i16> %x)
+  %0 = tail call <vscale x 8 x i16> @llvm.aarch64.sve.abs.nxv8i16(<vscale x 8 x i16> poison, <vscale x 8 x i1> %pg, <vscale x 8 x i16> %x)
   ret <vscale x 8 x i16> %0
 }
 
@@ -222,7 +222,7 @@ define <vscale x 8 x i16> @test_svabs_s16_x_2(<vscale x 8 x i1> %pg, double %z0,
 ; CHECK-2p2-NEXT:    abs z0.h, p0/z, z1.h
 ; CHECK-2p2-NEXT:    ret
 entry:
-  %0 = tail call <vscale x 8 x i16> @llvm.aarch64.sve.abs.nxv8i16(<vscale x 8 x i16> undef, <vscale x 8 x i1> %pg, <vscale x 8 x i16> %x)
+  %0 = tail call <vscale x 8 x i16> @llvm.aarch64.sve.abs.nxv8i16(<vscale x 8 x i16> poison, <vscale x 8 x i1> %pg, <vscale x 8 x i16> %x)
   ret <vscale x 8 x i16> %0
 }
 
@@ -253,7 +253,7 @@ define <vscale x 4 x i32> @test_svabs_s32_x_1(<vscale x 4 x i1> %pg, <vscale x 4
 ; CHECK-2p2-NEXT:    abs z0.s, p0/z, z0.s
 ; CHECK-2p2-NEXT:    ret
 entry:
-  %0 = tail call <vscale x 4 x i32> @llvm.aarch64.sve.abs.nxv4i32(<vscale x 4 x i32> undef, <vscale x 4 x i1> %pg, <vscale x 4 x i32> %x)
+  %0 = tail call <vscale x 4 x i32> @llvm.aarch64.sve.abs.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i1> %pg, <vscale x 4 x i32> %x)
   ret <vscale x 4 x i32> %0
 }
 
@@ -269,7 +269,7 @@ define <vscale x 4 x i32> @test_svabs_s32_x_2(<vscale x 4 x i1> %pg, double %z0,
 ; CHECK-2p2-NEXT:    abs z0.s, p0/z, z1.s
 ; CHECK-2p2-NEXT:    ret
 entry:
-  %0 = tail call <vscale x 4 x i32> @llvm.aarch64.sve.abs.nxv4i32(<vscale x 4 x i32> undef, <vscale x 4 x i1> %pg, <vscale x 4 x i32> %x)
+  %0 = tail call <vscale x 4 x i32> @llvm.aarch64.sve.abs.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i1> %pg, <vscale x 4 x i32> %x)
   ret <vscale x 4 x i32> %0
 }
 
@@ -300,7 +300,7 @@ define <vscale x 2 x i64> @test_svabs_s64_x_1(<vscale x 2 x i1> %pg, <vscale x 2
 ; CHECK-2p2-NEXT:    abs z0.d, p0/z, z0.d
 ; CHECK-2p2-NEXT:    ret
 entry:
-  %0 = tail call <vscale x 2 x i64> @llvm.aarch64.sve.abs.nxv2i64(<vscale x 2 x i64> undef, <vscale x 2 x i1> %pg, <vscale x 2 x i64> %x)
+  %0 = tail call <vscale x 2 x i64> @llvm.aarch64.sve.abs.nxv2i64(<vscale x 2 x i64> poison, <vscale x 2 x i1> %pg, <vscale x 2 x i64> %x)
   ret <vscale x 2 x i64> %0
 }
 
@@ -316,7 +316,7 @@ define <vscale x 2 x i64> @test_svabs_s64_x_2(<vscale x 2 x i1> %pg, double %z0,
 ; CHECK-2p2-NEXT:    abs z0.d, p0/z, z1.d
 ; CHECK-2p2-NEXT:    ret
 entry:
-  %0 = tail call <vscale x 2 x i64> @llvm.aarch64.sve.abs.nxv2i64(<vscale x 2 x i64> undef, <vscale x 2 x i1> %pg, <vscale x 2 x i64> %x)
+  %0 = tail call <vscale x 2 x i64> @llvm.aarch64.sve.abs.nxv2i64(<vscale x 2 x i64> poison, <vscale x 2 x i1> %pg, <vscale x 2 x i64> %x)
   ret <vscale x 2 x i64> %0
 }
 
@@ -347,7 +347,7 @@ define <vscale x 2 x double> @test_svneg_f64_x_1(<vscale x 2 x i1> %pg, <vscale
 ; CHECK-2p2-NEXT:    fneg z0.d, p0/z, z0.d
 ; CHECK-2p2-NEXT:    ret
 entry:
-  %0 = tail call <vscale x 2 x double> @llvm.aarch64.sve.fneg.nxv2f64(<vscale x 2 x double> undef, <vscale x 2 x i1> %pg, <vscale x 2 x double> %x)
+  %0 = tail call <vscale x 2 x double> @llvm.aarch64.sve.fneg.nxv2f64(<vscale x 2 x double> poison, <vscale x 2 x i1> %pg, <vscale x 2 x double> %x)
   ret <vscale x 2 x double> %0
 }
 
@@ -363,7 +363,7 @@ define <vscale x 2 x double> @test_svneg_f64_x_2(<vscale x 2 x i1> %pg, double %
 ; CHECK-2p2-NEXT:    fneg z0.d, p0/z, z1.d
 ; CHECK-2p2-NEXT:    ret
 entry:
-  %0 = tail call <vscale x 2 x double> @llvm.aarch64.sve.fneg.nxv2f64(<vscale x 2 x double> undef, <vscale x 2 x i1> %pg, <vscale x 2 x double> %x)
+  %0 = tail call <vscale x 2 x double> @llvm.aarch64.sve.fneg.nxv2f64(<vscale x 2 x double> poison, <vscale x 2 x i1> %pg, <vscale x 2 x double> %x)
   ret <vscale x 2 x double> %0
 }
 
@@ -394,7 +394,7 @@ define <vscale x 4 x float> @test_svneg_f32_x_1(<vscale x 4 x i1> %pg, <vscale x
 ; CHECK-2p2-NEXT:    fneg z0.s, p0/z, z0.s
 ; CHECK-2p2-NEXT:    ret
 entry:
-  %0 = tail call <vscale x 4 x float> @llvm.aarch64.sve.fneg.nxv4f32(<vscale x 4 x float> undef, <vscale x 4 x i1> %pg, <vscale x 4 x float> %x)
+  %0 = tail call <vscale x 4 x float> @llvm.aarch64.sve.fneg.nxv4f32(<vscale x 4 x float> poison, <vscale x 4 x i1> %pg, <vscale x 4 x float> %x)
   ret <vscale x 4 x float> %0
 }
 
@@ -410,7 +410,7 @@ define <vscale x 4 x float> @test_svneg_f32_x_2(<vscale x 4 x i1> %pg, double %z
 ; CHECK-2p2-NEXT:    fneg z0.s, p0/z, z1.s
 ; CHECK-2p2-NEXT:    ret
 entry:
-  %0 = tail call <vscale x 4 x float> @llvm.aarch64.sve.fneg.nxv4f32(<vscale x 4 x float> undef, <vscale x 4 x i1> %pg, <vscale x 4 x float> %x)
+  %0 = tail call <vscale x 4 x float> @llvm.aarch64.sve.fneg.nxv4f32(<vscale x 4 x float> poison, <vscale x 4 x i1> %pg, <vscale x 4 x float> %x)
   ret <vscale x 4 x float> %0
 }
 
@@ -441,7 +441,7 @@ define <vscale x 8 x half> @test_svneg_f16_x_1(<vscale x 8 x i1> %pg, <vscale x
 ; CHECK-2p2-NEXT:    fneg z0.h, p0/z, z0.h
 ; CHECK-2p2-NEXT:    ret
 entry:
-  %0 = tail call <vscale x 8 x half> @llvm.aarch64.sve.fneg.nxv8f16(<vscale x 8 x half> undef, <vscale x 8 x i1> %pg, <vscale x 8 x half> %x)
+  %0 = tail call <vscale x 8 x half> @llvm.aarch64.sve.fneg.nxv8f16(<vscale x 8 x half> poison, <vscale x 8 x i1> %pg, <vscale x 8 x half> %x)
   ret <vscale x 8 x half> %0
 }
 
@@ -457,7 +457,7 @@ define <vscale x 8 x half> @test_svneg_f16_x_2(<vscale x 8 x i1> %pg, double %z0
 ; CHECK-2p2-NEXT:    fneg z0.h, p0/z, z1.h
 ; CHECK-2p2-NEXT:    ret
 entry:
-  %0 = tail call <vscale x 8 x half> @llvm.aarch64.sve.fneg.nxv8f16(<vscale x 8 x half> undef, <vscale x 8 x i1> %pg, <vscale x 8 x half> %x)
+  %0 = tail call <vscale x 8 x half> @llvm.aarch64.sve.fneg.nxv8f16(<vscale x 8 x half> poison, <vscale x 8 x i1> %pg, <vscale x 8 x half> %x)
   ret <vscale x 8 x half> %0
 }
 
@@ -488,7 +488,7 @@ define <vscale x 16 x i8> @test_svneg_s8_x_1(<vscale x 16 x i1> %pg, <vscale x 1
 ; CHECK-2p2-NEXT:    neg z0.b, p0/z, z0.b
 ; CHECK-2p2-NEXT:    ret
 entry:
-  %0 = tail call <vscale x 16 x i8> @llvm.aarch64.sve.neg.nxv16i8(<vscale x 16 x i8> undef, <vscale x 16 x i1> %pg, <vscale x 16 x i8> %x)
+  %0 = tail call <vscale x 16 x i8> @llvm.aarch64.sve.neg.nxv16i8(<vscale x 16 x i8> poison, <vscale x 16 x i1> %pg, <vscale x 16 x i8> %x)
   ret <vscale x 16 x i8> %0
 }
 
@@ -504,8 +504,8 @@ define <vscale x 16 x i8> @test_svneg_s8_x_2(<vscale x 16 x i1> %pg, double %z0,
 ; CHECK-2p2-NEXT:    neg z0.b, p0/z, z1.b
 ; CHECK-2p2-NEXT:    ret
 entry:
-  %1 = tail call <vscale x 16 x i8> @llvm.aarch64.sve.neg.nxv16i8(<vscale x 16 x i8> undef, <vscale x 16 x i1> %pg, <vscale x 16 x i8> %x)
-  ret <vscale x 16 x i8> %1
+  %0 = tail call <vscale x 16 x i8> @llvm.aarch64.sve.neg.nxv16i8(<vscale x 16 x i8> poison, <vscale x 16 x i1> %pg, <vscale x 16 x i8> %x)
+  ret <vscale x 16 x i8> %0
 }
 
 define <vscale x 16 x i8> @test_svneg_s8_z(<vscale x 16 x i1> %pg, double %z0, <vscale x 16 x i8> %x) {
@@ -520,8 +520,8 @@ define <vscale x 16 x i8> @test_svneg_s8_z(<vscale x 16 x i1> %pg, double %z0, <
 ; CHECK-2p2-NEXT:    neg z0.b, p0/z, z1.b
 ; CHECK-2p2-NEXT:    ret
 entry:
-  %1 = tail call <vscale x 16 x i8> @llvm.aarch64.sve.neg.nxv16i8(<vscale x 16 x i8> zeroinitializer, <vscale x 16 x i1> %pg, <vscale x 16 x i8> %x)
-  ret <vscale x 16 x i8> %1
+  %0 = tail call <vscale x 16 x i8> @llvm.aarch64.sve.neg.nxv16i8(<vscale x 16 x i8> zeroinitializer, <vscale x 16 x i1> %pg, <vscale x 16 x i8> %x)
+  ret <vscale x 16 x i8> %0
 }
 
 define <vscale x 8 x i16> @test_svneg_s16_x_1(<vscale x 8 x i1> %pg, <vscale x 8 x i16> %x) {
@@ -535,7 +535,7 @@ define <vscale x 8 x i16> @test_svneg_s16_x_1(<vscale x 8 x i1> %pg, <vscale x 8
 ; CHECK-2p2-NEXT:    neg z0.h, p0/z, z0.h
 ; CHECK-2p2-NEXT:    ret
 entry:
-  %0 = tail call <vscale x 8 x i16> @llvm.aarch64.sve.neg.nxv8i16(<vscale x 8 x i16> undef, <vscale x 8 x i1> %pg, <vscale x 8 x i16> %x)
+  %0 = tail call <vscale x 8 x i16> @llvm.aarch64.sve.neg.nxv8i16(<vscale x 8 x i16> poison, <vscale x 8 x i1> %pg, <vscale x 8 x i16> %x)
   ret <vscale x 8 x i16> %0
 }
 
@@ -551,7 +551,7 @@ define <vscale x 8 x i16> @test_svneg_s16_x_2(<vscale x 8 x i1> %pg, double %z0,
 ; CHECK-2p2-NEXT:    neg z0.h, p0/z, z1.h
 ; CHECK-2p2-NEXT:    ret
 entry:
-  %0 = tail call <vscale x 8 x i16> @llvm.aarch64.sve.neg.nxv8i16(<vscale x 8 x i16> undef, <vscale x 8 x i1> %pg, <vscale x 8 x i16> %x)
+  %0 = tail call <vscale x 8 x i16> @llvm.aarch64.sve.neg.nxv8i16(<vscale x 8 x i16> poison, <vscale x 8 x i1> %pg, <vscale x 8 x i16> %x)
   ret <vscale x 8 x i16> %0
 }
 
@@ -582,7 +582,7 @@ define <vscale x 4 x i32> @test_svneg_s32_x_1(<vscale x 4 x i1> %pg, <vscale x 4
 ; CHECK-2p2-NEXT:    neg z0.s, p0/z, z0.s
 ; CHECK-2p2-NEXT:    ret
 entry:
-  %0 = tail call <vscale x 4 x i32> @llvm.aarch64.sve.neg.nxv4i32(<vscale x 4 x i32> undef, <vscale x 4 x i1> %pg, <vscale x 4 x i32> %x)
+  %0 = tail call <vscale x 4 x i32> @llvm.aarch64.sve.neg.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i1> %pg, <vscale x 4 x i32> %x)
   ret <vscale x 4 x i32> %0
 }
 
@@ -598,7 +598,7 @@ define <vscale x 4 x i32> @test_svneg_s32_x_2(<vscale x 4 x i1> %pg, double %z0,
 ; CHECK-2p2-NEXT:    neg z0.s, p0/z, z1.s
 ; CHECK-2p2-NEXT:    ret
 entry:
-  %0 = tail call <vscale x 4 x i32> @llvm.aarch64.sve.neg.nxv4i32(<vscale x 4 x i32> undef, <vscale x 4 x i1> %pg, <vscale x 4 x i32> %x)
+  %0 = tail call <vscale x 4 x i32> @llvm.aarch64.sve.neg.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i1> %pg, <vscale x 4 x i32> %x)
   ret <vscale x 4 x i32> %0
 }
 
@@ -629,7 +629,7 @@ define <vscale x 2 x i64> @test_svneg_s64_x_1(<vscale x 2 x i1> %pg, <vscale x 2
 ; CHECK-2p2-NEXT:    neg z0.d, p0/z, z0.d
 ; CHECK-2p2-NEXT:    ret
 entry:
-  %0 = tail call <vscale x 2 x i64> @llvm.aarch64.sve.neg.nxv2i64(<vscale x 2 x i64> undef, <vscale x 2 x i1> %pg, <vscale x 2 x i64> %x)
+  %0 = tail call <vscale x 2 x i64> @llvm.aarch64.sve.neg.nxv2i64(<vscale x 2 x i64> poison, <vscale x 2 x i1> %pg, <vscale x 2 x i64> %x)
   ret <vscale x 2 x i64> %0
 }
 
@@ -645,7 +645,7 @@ define <vscale x 2 x i64> @test_svneg_s64_x_2(<vscale x 2 x i1> %pg, double %z0,
 ; CHECK-2p2-NEXT:    neg z0.d, p0/z, z1.d
 ; CHECK-2p2-NEXT:    ret
 entry:
-  %0 = tail call <vscale x 2 x i64> @llvm.aarch64.sve.neg.nxv2i64(<vscale x 2 x i64> undef, <vscale x 2 x i1> %pg, <vscale x 2 x i64> %x)
+  %0 = tail call <vscale x 2 x i64> @llvm.aarch64.sve.neg.nxv2i64(<vscale x 2 x i64> poison, <vscale x 2 x i1> %pg, <vscale x 2 x i64> %x)
   ret <vscale x 2 x i64> %0
 }
 
@@ -664,3 +664,535 @@ entry:
   %0 = tail call <vscale x 2 x i64> @llvm.aarch64.sve.neg.nxv2i64(<vscale x 2 x i64> zeroinitializer, <vscale x 2 x i1> %pg, <vscale x 2 x i64> %x)
   ret <vscale x 2 x i64> %0
 }
+
+define <vscale x 2 x double> @test_svfabs_f64_ptrue_u(double %z0, <vscale x 2 x double> %x) {
+; CHECK-LABEL: test_svfabs_f64_ptrue_u:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    movprfx z0, z1
+; CHECK-NEXT:    fabs z0.d, p0/m, z1.d
+; CHECK-NEXT:    ret
+;
+; CHECK-2p2-LABEL: test_svfabs_f64_ptrue_u:
+; CHECK-2p2:       // %bb.0: // %entry
+; CHECK-2p2-NEXT:    ptrue p0.d
+; CHECK-2p2-NEXT:    fabs z0.d, p0/z, z1.d
+; CHECK-2p2-NEXT:    ret
+entry:
+  %pg = call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+  %0 = tail call <vscale x 2 x double> @llvm.aarch64.sve.fabs.nxv2f64(<vscale x 2 x double> poison, <vscale x 2 x i1> %pg, <vscale x 2 x double> %x)
+  ret <vscale x 2 x double> %0
+}
+
+define <vscale x 2 x double> @test_svfabs_f64_ptrue(double %z0, <vscale x 2 x double> %x, <vscale x 2 x double> %y) {
+; CHECK-LABEL: test_svfabs_f64_ptrue:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    movprfx z0, z2
+; CHECK-NEXT:    fabs z0.d, p0/m, z2.d
+; CHECK-NEXT:    ret
+;
+; CHECK-2p2-LABEL: test_svfabs_f64_ptrue:
+; CHECK-2p2:       // %bb.0: // %entry
+; CHECK-2p2-NEXT:    ptrue p0.d
+; CHECK-2p2-NEXT:    fabs z0.d, p0/z, z2.d
+; CHECK-2p2-NEXT:    ret
+entry:
+  %pg = call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
+  %0 = tail call <vscale x 2 x double> @llvm.aarch64.sve.fabs.nxv2f64(<vscale x 2 x double> %x, <vscale x 2 x i1> %pg, <vscale x 2 x double> %y)
+  ret <vscale x 2 x double> %0
+}
+
+define <vscale x 4 x float> @test_svfabs_f32_ptrue_u(double %z0, <vscale x 4 x float> %x) {
+; CHECK-LABEL: test_svfabs_f32_ptrue_u:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    movprfx z0, z1
+; CHECK-NEXT:    fabs z0.s, p0/m, z1.s
+; CHECK-NEXT:    ret
+;
+; CHECK-2p2-LABEL: test_svfabs_f32_ptrue_u:
+; CHECK-2p2:       // %bb.0: // %entry
+; CHECK-2p2-NEXT:    ptrue p0.s
+; CHECK-2p2-NEXT:    fabs z0.s, p0/z, z1.s
+; CHECK-2p2-NEXT:    ret
+entry:
+  %pg = call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
+  %0 = tail call <vscale x 4 x float> @llvm.aarch64.sve.fabs.nxv4f32(<vscale x 4 x float> poison, <vscale x 4 x i1> %pg, <vscale x 4 x float> %x)
+  ret <vscale x 4 x float> %0
+}
+
+define <vscale x 4 x float> @test_svfabs_f32_ptrue(double %z0, <vscale x 4 x float> %x, <vscale x 4 x float> %y) {
+; CHECK-LABEL: test_svfabs_f32_ptrue:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    movprfx z0, z2
+; CHECK-NEXT:    fabs z0.s, p0/m, z2.s
+; CHECK-NEXT:    ret
+;
+; CHECK-2p2-LABEL: test_svfabs_f32_ptrue:
+; CHECK-2p2:       // %bb.0: // %entry
+; CHECK-2p2-NEXT:    ptrue p0.s
+; CHECK-2p2-NEXT:    fabs z0.s, p0/z, z2.s
+; CHECK-2p2-NEXT:    ret
+entry:
+  %pg = call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
+  %0 = tail call <vscale x 4 x float> @llvm.aarch64.sve.fabs.nxv4f32(<vscale x 4 x float> %x, <vscale x 4 x i1> %pg, <vscale x 4 x float> %y)
+  ret <vscale x 4 x float> %0
+}
+
+define <vscale x 8 x half> @test_svfabs_f16_ptrue_u(double %z0, <vscale x 8 x half> %x) {
+; CHECK-LABEL: test_svfabs_f16_ptrue_u:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    movprfx z0, z1
+; CHECK-NEXT:    fabs z0.h, p0/m, z1.h
+; CHECK-NEXT:    ret
+;
+; CHECK-2p2-LABEL: test_svfabs_f16_ptrue_u:
+; CHECK-2p2:       // %bb.0: // %entry
+; CHECK-2p2-NEXT:    ptrue p0.h
+; CHECK-2p2-NEXT:    fabs z0.h, p0/z, z1.h
+; CHECK-2p2-NEXT:    ret
+entry:
+  %pg = call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
+  %0 = tail call <vscale x 8 x half> @llvm.aarch64.sve.fabs.nxv8f16(<vscale x 8 x half> poison, <vscale x 8 x i1> %pg, <vscale x 8 x half> %x)
+  ret <vscale x 8 x half> %0
+}
+
+define <vscale x 8 x half> @test_svfabs_f16_ptrue(double %z0, <vscale x 8 x half> %x, <vscale x 8 x half> %y) {
+; CHECK-LABEL: test_svfabs_f16_ptrue:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    movprfx z0, z2
+; CHECK-NEXT:    fabs z0.h, p0/m, z2.h
+; CHECK-NEXT:    ret
+;
+; CHECK-2p2-LABEL: test_svfabs_f16_ptrue:
+; CHECK-2p2:       // %bb.0: // %entry
+; CHECK-2p2-NEXT:    ptrue p0.h
+; CHECK-2p2-NEXT:    fabs z0.h, p0/z, z2.h
+; CHECK-2p2-NEXT:    ret
+entry:
+  %pg = call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
+  %0 = tail call <vscale x 8 x half> @llvm.aarch64.sve.fabs.nxv8f16(<vscale x 8 x half> %x, <vscale x 8 x i1> %pg, <vscale x 8 x half> %y)
+  ret <vscale x 8 x half> %0
+}
+
+define <vscale x 16 x i8> @test_svabs_s8_ptrue_u(double %z0, <vscale x 16 x i8> %x) {
+; CHECK-LABEL: test_svabs_s8_ptrue_u:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ptrue p0.b
+; CHECK-NEXT:    movprfx z0, z1
+; CHECK-NEXT:    abs z0.b, p0/m, z1.b
+; CHECK-NEXT:    ret
+;
+; CHECK-2p2-LABEL: test_svabs_s8_ptrue_u:
+; CHECK-2p2:       // %bb.0: // %entry
+; CHECK-2p2-NEXT:    ptrue p0.b
+; CHECK-2p2-NEXT:    abs z0.b, p0/z, z1.b
+; CHECK-2p2-NEXT:    ret
+entry:
+  %pg = call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 31)
+  %0 = tail call <vscale x 16 x i8> @llvm.aarch64.sve.abs.nxv16i8(<vscale x 16 x i8> poison, <vscale x 16 x i1> %pg, <vscale x 16 x i8> %x)
+  ret <vscale x 16 x i8> %0
+}
+
+define <vscale x 16 x i8> @test_svabs_s8_ptrue(double %z0, <vscale x 16 x i8> %x, <vscale x 16 x i8> %y) {
+; CHECK-LABEL: test_svabs_s8_ptrue:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ptrue p0.b
+; CHECK-NEXT:    movprfx z0, z2
+; CHECK-NEXT:    abs z0.b, p0/m, z2.b
+; CHECK-NEXT:    ret
+;
+; CHECK-2p2-LABEL: test_svabs_s8_ptrue:
+; CHECK-2p2:       // %bb.0: // %entry
+; CHECK-2p2-NEXT:    ptrue p0.b
+; CHECK-2p2-NEXT:    abs z0.b, p0/z, z2.b
+; CHECK-2p2-NEXT:    ret
+entry:
+  %pg = call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 31)
+  %0 = tail call <vscale x 16 x i8> @llvm.aarch64.sve.abs.nxv16i8(<vscale x 16 x i8> %x, <vscale x 16 x i1> %pg, <vscale x 16 x i8> %y)
+  ret <vscale x 16 x i8> %0
+}
+
+define <vscale x 8 x i16> @test_svabs_s16_ptrue_u(double %z0, <vscale x 8 x i16> %x) {
+; CHECK-LABEL: test_svabs_s16_ptrue_u:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    movprfx z0, z1
+; CHECK-NEXT:    abs z0.h, p0/m, z1.h
+; CHECK-NEXT:    ret
+;
+; CHECK-2p2-LABEL: test_svabs_s16_ptrue_u:
+; CHECK-2p2:       // %bb.0: // %entry
+; CHECK-2p2-NEXT:    ptrue p0.h
+; CHECK-2p2-NEXT:    abs z0.h, p0/z, z1.h
+; CHECK-2p2-NEXT:    ret
+entry:
+  %pg = call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nx84i1(i32 31)
+  %0 = tail call <vscale x 8 x i16> @llvm.aarch64.sve.abs.nxv8i16(<vscale x 8 x i16> poison, <vscale x 8 x i1> %pg, <vscale x 8 x i16> %x)
+  ret <vscale x 8 x i16> %0
+}
+
+define <vscale x 8 x i16> @test_svabs_s16_ptrue(double %z0, <vscale x 8 x i16> %x, <vscale x 8 x i16> %y) {
+; CHECK-LABEL: test_svabs_s16_ptrue:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    movprfx z0, z2
+; CHECK-NEXT:    abs z0.h, p0/m, z2.h
+; CHECK-NEXT:    ret
+;
+; CHECK-2p2-LABEL: test_svabs_s16_ptrue:
+; CHECK-2p2:       // %bb.0: // %entry
+; CHECK-2p2-NEXT:    ptrue p0.h
+; CHECK-2p2-NEXT:    abs z0.h, p0/z, z2.h
+; CHECK-2p2-NEXT:    ret
+entry:
+  %pg = call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
+  %0 = tail call <vscale x 8 x i16> @llvm.aarch64.sve.abs.nxv8i16(<vscale x 8 x i16> %x, <vscale x 8 x i1> %pg, <vscale x 8 x i16> %y)
+  ret <vscale x 8 x i16> %0
+}
+
+define <vscale x 4 x i32> @test_svabs_s32_ptrue_u(double %z0, <vscale x 4 x i32> %x) {
+; CHECK-LABEL: test_svabs_s32_ptrue_u:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    movprfx z0, z1
+; CHECK-NEXT:    abs z0.s, p0/m, z1.s
+; CHECK-NEXT:    ret
+;
+; CHECK-2p2-LABEL: test_svabs_s32_ptrue_u:
+; CHECK-2p2:       // %bb.0: // %entry
+; CHECK-2p2-NEXT:    ptrue p0.s
+; CHECK-2p2-NEXT:    abs z0.s, p0/z, z1.s
+; CHECK-2p2-NEXT:    ret
+entry:
+  %pg = call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
+  %0 = tail call <vscale x 4 x i32> @llvm.aarch64.sve.abs.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i1> %pg, <vscale x 4 x i32> %x)
+  ret <vscale x 4 x i32> %0
+}
+
+define <vscale x 4 x i32> @test_svabs_s32_ptrue(double %z0, <vscale x 4 x i32> %x, <vscale x 4 x i32> %y) {
+; CHECK-LABEL: test_svabs_s32_ptrue:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    movprfx z0, z2
+; CHECK-NEXT:    abs z0.s, p0/m, z2.s
+; CHECK-NEXT:    ret
+;
+; CHECK-2p2-LABEL: test_svabs_s32_ptrue:
+; CHECK-2p2:       // %bb.0: // %entry
+; CHECK-2p2-NEXT:    ptrue p0.s
+; CHECK-2p2-NEXT:    abs z0.s, p0/z, z2.s
+; CHECK-2p2-NEXT:    ret
+entry:
+  %pg = call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
+  %0 = tail call <vscale x 4 x i32> @llvm.aarch64.sve.abs.nxv4i32(<vscale x 4 x i32> %x, <vscale x 4 x i1> %pg, <vscale x 4 x i32> %y)
+  ret <vscale x 4 x i32> %0
+}
+
+define <vscale x 2 x i64> @test_svabs_s64_ptrue_u(double %z0, <vscale x 2 x i64> %x) {
+; CHECK-LABEL: test_svabs_s64_ptrue_u:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    movprfx z0, z1
+; CHECK-NEXT:    abs z0.d, p0/m, z1.d
+; CHECK-NEXT:    ret
+;
+; CHECK-2p2-LABEL: test_svabs_s64_ptrue_u:
+; CHECK-2p2:       // %bb.0: // %entry
+; CHECK-2p2-NEXT:    ptrue p0.d
+; CHECK-2p2-NEXT:    abs z0.d, p0/z, z1.d
+; CHECK-2p2-NEXT:    ret
+entry:
+  %pg = call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+  %0 = tail call <vscale x 2 x i64> @llvm.aarch64.sve.abs.nxv2i64(<vscale x 2 x i64> poison, <vscale x 2 x i1> %pg, <vscale x 2 x i64> %x)
+  ret <vscale x 2 x i64> %0
+}
+
+define <vscale x 2 x i64> @test_svabs_s64_ptrue(double %z0, <vscale x 2 x i64> %x, <vscale x 2 x i64> %y) {
+; CHECK-LABEL: test_svabs_s64_ptrue:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    movprfx z0, z2
+; CHECK-NEXT:    abs z0.d, p0/m, z2.d
+; CHECK-NEXT:    ret
+;
+; CHECK-2p2-LABEL: test_svabs_s64_ptrue:
+; CHECK-2p2:       // %bb.0: // %entry
+; CHECK-2p2-NEXT:    ptrue p0.d
+; CHECK-2p2-NEXT:    abs z0.d, p0/z, z2.d
+; CHECK-2p2-NEXT:    ret
+entry:
+  %pg = call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+  %0 = tail call <vscale x 2 x i64> @llvm.aarch64.sve.abs.nxv2i64(<vscale x 2 x i64> %x, <vscale x 2 x i1> %pg, <vscale x 2 x i64> %y)
+  ret <vscale x 2 x i64> %0
+}
+
+define <vscale x 2 x double> @test_svfneg_f64_ptrue_u(double %z0, <vscale x 2 x double> %x) {
+; CHECK-LABEL: test_svfneg_f64_ptrue_u:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    movprfx z0, z1
+; CHECK-NEXT:    fneg z0.d, p0/m, z1.d
+; CHECK-NEXT:    ret
+;
+; CHECK-2p2-LABEL: test_svfneg_f64_ptrue_u:
+; CHECK-2p2:       // %bb.0: // %entry
+; CHECK-2p2-NEXT:    ptrue p0.d
+; CHECK-2p2-NEXT:    fneg z0.d, p0/z, z1.d
+; CHECK-2p2-NEXT:    ret
+entry:
+  %pg = call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+  %0 = tail call <vscale x 2 x double> @llvm.aarch64.sve.fneg.nxv2f64(<vscale x 2 x double> poison, <vscale x 2 x i1> %pg, <vscale x 2 x double> %x)
+  ret <vscale x 2 x double> %0
+}
+
+define <vscale x 2 x double> @test_svfneg_f64_ptrue(double %z0, <vscale x 2 x double> %x, <vscale x 2 x double> %y) {
+; CHECK-LABEL: test_svfneg_f64_ptrue:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    movprfx z0, z2
+; CHECK-NEXT:    fneg z0.d, p0/m, z2.d
+; CHECK-NEXT:    ret
+;
+; CHECK-2p2-LABEL: test_svfneg_f64_ptrue:
+; CHECK-2p2:       // %bb.0: // %entry
+; CHECK-2p2-NEXT:    ptrue p0.d
+; CHECK-2p2-NEXT:    fneg z0.d, p0/z, z2.d
+; CHECK-2p2-NEXT:    ret
+entry:
+  %pg = call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+  %0 = tail call <vscale x 2 x double> @llvm.aarch64.sve.fneg.nxv2f64(<vscale x 2 x double> %x, <vscale x 2 x i1> %pg, <vscale x 2 x double> %y)
+  ret <vscale x 2 x double> %0
+}
+
+define <vscale x 4 x float> @test_svfneg_f32_ptrue_u(double %z0, <vscale x 4 x float> %x) {
+; CHECK-LABEL: test_svfneg_f32_ptrue_u:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    movprfx z0, z1
+; CHECK-NEXT:    fneg z0.s, p0/m, z1.s
+; CHECK-NEXT:    ret
+;
+; CHECK-2p2-LABEL: test_svfneg_f32_ptrue_u:
+; CHECK-2p2:       // %bb.0: // %entry
+; CHECK-2p2-NEXT:    ptrue p0.s
+; CHECK-2p2-NEXT:    fneg z0.s, p0/z, z1.s
+; CHECK-2p2-NEXT:    ret
+entry:
+  %pg = call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
+  %0 = tail call <vscale x 4 x float> @llvm.aarch64.sve.fneg.nxv4f32(<vscale x 4 x float> poison, <vscale x 4 x i1> %pg, <vscale x 4 x float> %x)
+  ret <vscale x 4 x float> %0
+}
+
+define <vscale x 4 x float> @test_svfneg_f32_ptrue(double %z0, <vscale x 4 x float> %x, <vscale x 4 x float> %y) {
+; CHECK-LABEL: test_svfneg_f32_ptrue:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    movprfx z0, z2
+; CHECK-NEXT:    fneg z0.s, p0/m, z2.s
+; CHECK-NEXT:    ret
+;
+; CHECK-2p2-LABEL: test_svfneg_f32_ptrue:
+; CHECK-2p2:       // %bb.0: // %entry
+; CHECK-2p2-NEXT:    ptrue p0.s
+; CHECK-2p2-NEXT:    fneg z0.s, p0/z, z2.s
+; CHECK-2p2-NEXT:    ret
+entry:
+  %pg = call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
+  %0 = tail call <vscale x 4 x float> @llvm.aarch64.sve.fneg.nxv4f32(<vscale x 4 x float> %x, <vscale x 4 x i1> %pg, <vscale x 4 x float> %y)
+  ret <vscale x 4 x float> %0
+}
+
+define <vscale x 8 x half> @test_svfneg_f16_ptrue_u(double %z0, <vscale x 8 x half> %x) {
+; CHECK-LABEL: test_svfneg_f16_ptrue_u:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    movprfx z0, z1
+; CHECK-NEXT:    fneg z0.h, p0/m, z1.h
+; CHECK-NEXT:    ret
+;
+; CHECK-2p2-LABEL: test_svfneg_f16_ptrue_u:
+; CHECK-2p2:       // %bb.0: // %entry
+; CHECK-2p2-NEXT:    ptrue p0.h
+; CHECK-2p2-NEXT:    fneg z0.h, p0/z, z1.h
+; CHECK-2p2-NEXT:    ret
+entry:
+  %pg = call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
+  %0 = tail call <vscale x 8 x half> @llvm.aarch64.sve.fneg.nxv8f16(<vscale x 8 x half> poison, <vscale x 8 x i1> %pg, <vscale x 8 x half> %x)
+  ret <vscale x 8 x half> %0
+}
+
+define <vscale x 8 x half> @test_svfneg_f16_ptrue(double %z0, <vscale x 8 x half> %x, <vscale x 8 x half> %y) {
+; CHECK-LABEL: test_svfneg_f16_ptrue:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    movprfx z0, z2
+; CHECK-NEXT:    fneg z0.h, p0/m, z2.h
+; CHECK-NEXT:    ret
+;
+; CHECK-2p2-LABEL: test_svfneg_f16_ptrue:
+; CHECK-2p2:       // %bb.0: // %entry
+; CHECK-2p2-NEXT:    ptrue p0.h
+; CHECK-2p2-NEXT:    fneg z0.h, p0/z, z2.h
+; CHECK-2p2-NEXT:    ret
+entry:
+  %pg = call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
+  %0 = tail call <vscale x 8 x half> @llvm.aarch64.sve.fneg.nxv8f16(<vscale x 8 x half> %x, <vscale x 8 x i1> %pg, <vscale x 8 x half> %y)
+  ret <vscale x 8 x half> %0
+}
+
+define <vscale x 16 x i8> @test_svneg_s8_ptrue_u(double %z0, <vscale x 16 x i8> %x) {
+; CHECK-LABEL: test_svneg_s8_ptrue_u:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ptrue p0.b
+; CHECK-NEXT:    movprfx z0, z1
+; CHECK-NEXT:    neg z0.b, p0/m, z1.b
+; CHECK-NEXT:    ret
+;
+; CHECK-2p2-LABEL: test_svneg_s8_ptrue_u:
+; CHECK-2p2:       // %bb.0: // %entry
+; CHECK-2p2-NEXT:    ptrue p0.b
+; CHECK-2p2-NEXT:    neg z0.b, p0/z, z1.b
+; CHECK-2p2-NEXT:    ret
+entry:
+  %pg = call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 31)
+  %0 = tail call <vscale x 16 x i8> @llvm.aarch64.sve.neg.nxv16i8(<vscale x 16 x i8> poison, <vscale x 16 x i1> %pg, <vscale x 16 x i8> %x)
+  ret <vscale x 16 x i8> %0
+}
+
+define <vscale x 16 x i8> @test_svneg_s8_ptrue(double %z0, <vscale x 16 x i8> %x, <vscale x 16 x i8> %y) {
+; CHECK-LABEL: test_svneg_s8_ptrue:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ptrue p0.b
+; CHECK-NEXT:    movprfx z0, z2
+; CHECK-NEXT:    neg z0.b, p0/m, z2.b
+; CHECK-NEXT:    ret
+;
+; CHECK-2p2-LABEL: test_svneg_s8_ptrue:
+; CHECK-2p2:       // %bb.0: // %entry
+; CHECK-2p2-NEXT:    ptrue p0.b
+; CHECK-2p2-NEXT:    neg z0.b, p0/z, z2.b
+; CHECK-2p2-NEXT:    ret
+entry:
+  %pg = call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 31)
+  %0 = tail call <vscale x 16 x i8> @llvm.aarch64.sve.neg.nxv16i8(<vscale x 16 x i8> %x, <vscale x 16 x i1> %pg, <vscale x 16 x i8> %y)
+  ret <vscale x 16 x i8> %0
+}
+
+define <vscale x 8 x i16> @test_svneg_s16_ptrue_u(double %z0, <vscale x 8 x i16> %x) {
+; CHECK-LABEL: test_svneg_s16_ptrue_u:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    movprfx z0, z1
+; CHECK-NEXT:    neg z0.h, p0/m, z1.h
+; CHECK-NEXT:    ret
+;
+; CHECK-2p2-LABEL: test_svneg_s16_ptrue_u:
+; CHECK-2p2:       // %bb.0: // %entry
+; CHECK-2p2-NEXT:    ptrue p0.h
+; CHECK-2p2-NEXT:    neg z0.h, p0/z, z1.h
+; CHECK-2p2-NEXT:    ret
+entry:
+  %pg = call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
+  %0 = tail call <vscale x 8 x i16> @llvm.aarch64.sve.neg.nxv8i16(<vscale x 8 x i16> poison, <vscale x 8 x i1> %pg, <vscale x 8 x i16> %x)
+  ret <vscale x 8 x i16> %0
+}
+
+define <vscale x 8 x i16> @test_svneg_s16_ptrue(double %z0, <vscale x 8 x i16> %x, <vscale x 8 x i16> %y) {
+; CHECK-LABEL: test_svneg_s16_ptrue:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    movprfx z0, z2
+; CHECK-NEXT:    neg z0.h, p0/m, z2.h
+; CHECK-NEXT:    ret
+;
+; CHECK-2p2-LABEL: test_svneg_s16_ptrue:
+; CHECK-2p2:       // %bb.0: // %entry
+; CHECK-2p2-NEXT:    ptrue p0.h
+; CHECK-2p2-NEXT:    neg z0.h, p0/z, z2.h
+; CHECK-2p2-NEXT:    ret
+entry:
+  %pg = call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
+  %0 = tail call <vscale x 8 x i16> @llvm.aarch64.sve.neg.nxv8i16(<vscale x 8 x i16> %x, <vscale x 8 x i1> %pg, <vscale x 8 x i16> %y)
+  ret <vscale x 8 x i16> %0
+}
+
+define <vscale x 4 x i32> @test_svneg_s32_ptrue_u(double %z0, <vscale x 4 x i32> %x) {
+; CHECK-LABEL: test_svneg_s32_ptrue_u:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    movprfx z0, z1
+; CHECK-NEXT:    neg z0.s, p0/m, z1.s
+; CHECK-NEXT:    ret
+;
+; CHECK-2p2-LABEL: test_svneg_s32_ptrue_u:
+; CHECK-2p2:       // %bb.0: // %entry
+; CHECK-2p2-NEXT:    ptrue p0.s
+; CHECK-2p2-NEXT:    neg z0.s, p0/z, z1.s
+; CHECK-2p2-NEXT:    ret
+entry:
+  %pg = call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
+  %0 = tail call <vscale x 4 x i32> @llvm.aarch64.sve.neg.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i1> %pg, <vscale x 4 x i32> %x)
+  ret <vscale x 4 x i32> %0
+}
+
+define <vscale x 4 x i32> @test_svneg_s32_ptrue(double %z0, <vscale x 4 x i32> %x, <vscale x 4 x i32> %y) {
+; CHECK-LABEL: test_svneg_s32_ptrue:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    movprfx z0, z2
+; CHECK-NEXT:    neg z0.s, p0/m, z2.s
+; CHECK-NEXT:    ret
+;
+; CHECK-2p2-LABEL: test_svneg_s32_ptrue:
+; CHECK-2p2:       // %bb.0: // %entry
+; CHECK-2p2-NEXT:    ptrue p0.s
+; CHECK-2p2-NEXT:    neg z0.s, p0/z, z2.s
+; CHECK-2p2-NEXT:    ret
+entry:
+  %pg = call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
+  %0 = tail call <vscale x 4 x i32> @llvm.aarch64.sve.neg.nxv4i32(<vscale x 4 x i32> %x, <vscale x 4 x i1> %pg, <vscale x 4 x i32> %y)
+  ret <vscale x 4 x i32> %0
+}
+
+define <vscale x 2 x i64> @test_svneg_s64_ptrue_u(double %z0, <vscale x 2 x i64> %x) {
+; CHECK-LABEL: test_svneg_s64_ptrue_u:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    movprfx z0, z1
+; CHECK-NEXT:    neg z0.d, p0/m, z1.d
+; CHECK-NEXT:    ret
+;
+; CHECK-2p2-LABEL: test_svneg_s64_ptrue_u:
+; CHECK-2p2:       // %bb.0: // %entry
+; CHECK-2p2-NEXT:    ptrue p0.d
+; CHECK-2p2-NEXT:    neg z0.d, p0/z, z1.d
+; CHECK-2p2-NEXT:    ret
+entry:
+  %pg = call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+  %0 = tail call <vscale x 2 x i64> @llvm.aarch64.sve.neg.nxv2i64(<vscale x 2 x i64> poison, <vscale x 2 x i1> %pg, <vscale x 2 x i64> %x)
+  ret <vscale x 2 x i64> %0
+}
+
+define <vscale x 2 x i64> @test_svneg_s64_ptrue(double %z0, <vscale x 2 x i64> %x, <vscale x 2 x i64> %y) {
+; CHECK-LABEL: test_svneg_s64_ptrue:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    movprfx z0, z2
+; CHECK-NEXT:    neg z0.d, p0/m, z2.d
+; CHECK-NEXT:    ret
+;
+; CHECK-2p2-LABEL: test_svneg_s64_ptrue:
+; CHECK-2p2:       // %bb.0: // %entry
+; CHECK-2p2-NEXT:    ptrue p0.d
+; CHECK-2p2-NEXT:    neg z0.d, p0/z, z2.d
+; CHECK-2p2-NEXT:    ret
+entry:
+  %pg = call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+  %0 = tail call <vscale x 2 x i64> @llvm.aarch64.sve.neg.nxv2i64(<vscale x 2 x i64> %x, <vscale x 2 x i1> %pg, <vscale x 2 x i64> %y)
+  ret <vscale x 2 x i64> %0
+}
diff --git a/llvm/test/CodeGen/AArch64/zeroing-forms-fcvt-bfcvt.ll b/llvm/test/CodeGen/AArch64/zeroing-forms-fcvt-bfcvt.ll
index cf9ac49..855bf9a 100644
--- a/llvm/test/CodeGen/AArch64/zeroing-forms-fcvt-bfcvt.ll
+++ b/llvm/test/CodeGen/AArch64/zeroing-forms-fcvt-bfcvt.ll
@@ -18,7 +18,7 @@ define <vscale x 8 x half> @test_svcvt_f16_f32_x_1(<vscale x 4 x i1> %pg, <vscal
 ; CHECK-2p2-NEXT:    fcvt z0.h, p0/z, z0.s
 ; CHECK-2p2-NEXT:    ret
 entry:
-  %0 = tail call <vscale x 8 x half> @llvm.aarch64.sve.fcvt.f16f32(<vscale x 8 x half> undef, <vscale x 4 x i1> %pg, <vscale x 4 x float> %x)
+  %0 = tail call <vscale x 8 x half> @llvm.aarch64.sve.fcvt.f16f32(<vscale x 8 x half> poison, <vscale x 4 x i1> %pg, <vscale x 4 x float> %x)
   ret <vscale x 8 x half> %0
 }
 
@@ -33,7 +33,7 @@ define <vscale x 8 x half> @test_svcvt_f16_f32_x_2(<vscale x 4 x i1> %pg, double
 ; CHECK-2p2-NEXT:    fcvt z0.h, p0/z, z1.s
 ; CHECK-2p2-NEXT:    ret
 entry:
-  %0 = tail call <vscale x 8 x half> @llvm.aarch64.sve.fcvt.f16f32(<vscale x 8 x half> undef, <vscale x 4 x i1> %pg, <vscale x 4 x float> %x)
+  %0 = tail call <vscale x 8 x half> @llvm.aarch64.sve.fcvt.f16f32(<vscale x 8 x half> poison, <vscale x 4 x i1> %pg, <vscale x 4 x float> %x)
   ret <vscale x 8 x half> %0
 }
 
@@ -64,7 +64,7 @@ define <vscale x 8 x bfloat> @test_svcvt_bf16_f32_x_1(<vscale x 4 x i1> %pg, <vs
 ; CHECK-2p2-NEXT:    bfcvt z0.h, p0/z, z0.s
 ; CHECK-2p2-NEXT:    ret
 entry:
-  %0 = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fcvt.bf16f32.v2(<vscale x 8 x bfloat> undef, <vscale x 4 x i1> %pg, <vscale x 4 x float> %x)
+  %0 = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fcvt.bf16f32.v2(<vscale x 8 x bfloat> poison, <vscale x 4 x i1> %pg, <vscale x 4 x float> %x)
   ret <vscale x 8 x bfloat> %0
 }
 
@@ -79,7 +79,7 @@ define <vscale x 8 x bfloat> @test_svcvt_bf16_f32_x_2(<vscale x 4 x i1> %pg, dou
 ; CHECK-2p2-NEXT:    bfcvt z0.h, p0/z, z1.s
 ; CHECK-2p2-NEXT:    ret
 entry:
-  %0 = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fcvt.bf16f32.v2(<vscale x 8 x bfloat> undef, <vscale x 4 x i1> %pg, <vscale x 4 x float> %x)
+  %0 = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fcvt.bf16f32.v2(<vscale x 8 x bfloat> poison, <vscale x 4 x i1> %pg, <vscale x 4 x float> %x)
   ret <vscale x 8 x bfloat> %0
 }
 
@@ -110,7 +110,7 @@ define <vscale x 8 x half> @test_svcvt_f16_f64_x_1(<vscale x 2 x i1> %pg, <vscal
 ; CHECK-2p2-NEXT:    fcvt z0.h, p0/z, z0.d
 ; CHECK-2p2-NEXT:    ret
 entry:
-  %0 = tail call <vscale x 8 x half> @llvm.aarch64.sve.fcvt.f16f64(<vscale x 8 x half> undef, <vscale x 2 x i1> %pg, <vscale x 2 x double> %x)
+  %0 = tail call <vscale x 8 x half> @llvm.aarch64.sve.fcvt.f16f64(<vscale x 8 x half> poison, <vscale x 2 x i1> %pg, <vscale x 2 x double> %x)
   ret <vscale x 8 x half> %0
 }
 
@@ -125,7 +125,7 @@ define <vscale x 8 x half> @test_svcvt_f16_f64_x_2(<vscale x 2 x i1> %pg, double
 ; CHECK-2p2-NEXT:    fcvt z0.h, p0/z, z1.d
 ; CHECK-2p2-NEXT:    ret
 entry:
-  %0 = tail call <vscale x 8 x half> @llvm.aarch64.sve.fcvt.f16f64(<vscale x 8 x half> undef, <vscale x 2 x i1> %pg, <vscale x 2 x double> %x)
+  %0 = tail call <vscale x 8 x half> @llvm.aarch64.sve.fcvt.f16f64(<vscale x 8 x half> poison, <vscale x 2 x i1> %pg, <vscale x 2 x double> %x)
   ret <vscale x 8 x half> %0
 }
 
@@ -156,7 +156,7 @@ define <vscale x 4 x float> @test_svcvt_f32_f64_x_1(<vscale x 2 x i1> %pg, <vsca
 ; CHECK-2p2-NEXT:    fcvt z0.s, p0/z, z0.d
 ; CHECK-2p2-NEXT:    ret
 entry:
-  %0 = tail call <vscale x 4 x float> @llvm.aarch64.sve.fcvt.f32f64(<vscale x 4 x float> undef, <vscale x 2 x i1> %pg, <vscale x 2 x double> %x)
+  %0 = tail call <vscale x 4 x float> @llvm.aarch64.sve.fcvt.f32f64(<vscale x 4 x float> poison, <vscale x 2 x i1> %pg, <vscale x 2 x double> %x)
   ret <vscale x 4 x float> %0
 }
 
@@ -171,7 +171,7 @@ define <vscale x 4 x float> @test_svcvt_f32_f64_x_2(<vscale x 2 x i1> %pg, doubl
 ; CHECK-2p2-NEXT:    fcvt z0.s, p0/z, z1.d
 ; CHECK-2p2-NEXT:    ret
 entry:
-  %0 = tail call <vscale x 4 x float> @llvm.aarch64.sve.fcvt.f32f64(<vscale x 4 x float> undef, <vscale x 2 x i1> %pg, <vscale x 2 x double> %x)
+  %0 = tail call <vscale x 4 x float> @llvm.aarch64.sve.fcvt.f32f64(<vscale x 4 x float> poison, <vscale x 2 x i1> %pg, <vscale x 2 x double> %x)
   ret <vscale x 4 x float> %0
 }
 
@@ -202,7 +202,7 @@ define <vscale x 4 x float> @test_svcvt_f32_f16_x_1(<vscale x 4 x i1> %pg, <vsca
 ; CHECK-2p2-NEXT:    fcvt z0.s, p0/z, z0.h
 ; CHECK-2p2-NEXT:    ret
 entry:
-  %0 = tail call <vscale x 4 x float> @llvm.aarch64.sve.fcvt.f32f16(<vscale x 4 x float> undef, <vscale x 4 x i1> %pg, <vscale x 8 x half> %x)
+  %0 = tail call <vscale x 4 x float> @llvm.aarch64.sve.fcvt.f32f16(<vscale x 4 x float> poison, <vscale x 4 x i1> %pg, <vscale x 8 x half> %x)
   ret <vscale x 4 x float> %0
 }
 
@@ -217,7 +217,7 @@ define <vscale x 4 x float> @test_svcvt_f32_f16_x_2(<vscale x 4 x i1> %pg, doubl
 ; CHECK-2p2-NEXT:    fcvt z0.s, p0/z, z1.h
 ; CHECK-2p2-NEXT:    ret
 entry:
-  %0 = tail call <vscale x 4 x float> @llvm.aarch64.sve.fcvt.f32f16(<vscale x 4 x float> undef, <vscale x 4 x i1> %pg, <vscale x 8 x half> %x)
+  %0 = tail call <vscale x 4 x float> @llvm.aarch64.sve.fcvt.f32f16(<vscale x 4 x float> poison, <vscale x 4 x i1> %pg, <vscale x 8 x half> %x)
   ret <vscale x 4 x float> %0
 }
 
@@ -248,7 +248,7 @@ define <vscale x 2 x double> @test_svcvt_f64_f16_x_1(<vscale x 2 x i1> %pg, <vsc
 ; CHECK-2p2-NEXT:    fcvt z0.d, p0/z, z0.h
 ; CHECK-2p2-NEXT:    ret
 entry:
-  %0 = tail call <vscale x 2 x double> @llvm.aarch64.sve.fcvt.f64f16(<vscale x 2 x double> undef, <vscale x 2 x i1> %pg, <vscale x 8 x half> %x)
+  %0 = tail call <vscale x 2 x double> @llvm.aarch64.sve.fcvt.f64f16(<vscale x 2 x double> poison, <vscale x 2 x i1> %pg, <vscale x 8 x half> %x)
   ret <vscale x 2 x double> %0
 }
 
@@ -263,7 +263,7 @@ define <vscale x 2 x double> @test_svcvt_f64_f16_x_2(<vscale x 2 x i1> %pg, doub
 ; CHECK-2p2-NEXT:    fcvt z0.d, p0/z, z1.h
 ; CHECK-2p2-NEXT:    ret
 entry:
-  %0 = tail call <vscale x 2 x double> @llvm.aarch64.sve.fcvt.f64f16(<vscale x 2 x double> undef, <vscale x 2 x i1> %pg, <vscale x 8 x half> %x)
+  %0 = tail call <vscale x 2 x double> @llvm.aarch64.sve.fcvt.f64f16(<vscale x 2 x double> poison, <vscale x 2 x i1> %pg, <vscale x 8 x half> %x)
   ret <vscale x 2 x double> %0
 }
 
@@ -294,7 +294,7 @@ define <vscale x 2 x double> @test_svcvt_f64_f32_x_1(<vscale x 2 x i1> %pg, <vsc
 ; CHECK-2p2-NEXT:    fcvt z0.d, p0/z, z0.s
 ; CHECK-2p2-NEXT:    ret
 entry:
-  %0 = tail call <vscale x 2 x double> @llvm.aarch64.sve.fcvt.f64f32(<vscale x 2 x double> undef, <vscale x 2 x i1> %pg, <vscale x 4 x float> %x)
+  %0 = tail call <vscale x 2 x double> @llvm.aarch64.sve.fcvt.f64f32(<vscale x 2 x double> poison, <vscale x 2 x i1> %pg, <vscale x 4 x float> %x)
   ret <vscale x 2 x double> %0
 }
 
@@ -309,7 +309,7 @@ define <vscale x 2 x double> @test_svcvt_f64_f32_x_2(<vscale x 2 x i1> %pg, doub
 ; CHECK-2p2-NEXT:    fcvt z0.d, p0/z, z1.s
 ; CHECK-2p2-NEXT:    ret
 entry:
-  %0 = tail call <vscale x 2 x double> @llvm.aarch64.sve.fcvt.f64f32(<vscale x 2 x double> undef, <vscale x 2 x i1> %pg, <vscale x 4 x float> %x)
+  %0 = tail call <vscale x 2 x double> @llvm.aarch64.sve.fcvt.f64f32(<vscale x 2 x double> poison, <vscale x 2 x i1> %pg, <vscale x 4 x float> %x)
   ret <vscale x 2 x double> %0
 }
 
@@ -328,3 +328,262 @@ entry:
   %0 = tail call <vscale x 2 x double> @llvm.aarch64.sve.fcvt.f64f32(<vscale x 2 x double> zeroinitializer, <vscale x 2 x i1> %pg, <vscale x 4 x float> %x)
   ret <vscale x 2 x double> %0
 }
+
+define <vscale x 8 x half> @test_svcvt_f16_f32_ptrue_u(double %z0, <vscale x 4 x float> %x) {
+; CHECK-LABEL: test_svcvt_f16_f32_ptrue_u:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    fcvt z0.h, p0/m, z1.s
+; CHECK-NEXT:    ret
+;
+; CHECK-2p2-LABEL: test_svcvt_f16_f32_ptrue_u:
+; CHECK-2p2:       // %bb.0: // %entry
+; CHECK-2p2-NEXT:    ptrue p0.s
+; CHECK-2p2-NEXT:    fcvt z0.h, p0/z, z1.s
+; CHECK-2p2-NEXT:    ret
+entry:
+  %pg = call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
+  %0 = tail call <vscale x 8 x half> @llvm.aarch64.sve.fcvt.f16f32(<vscale x 8 x half> poison, <vscale x 4 x i1> %pg, <vscale x 4 x float> %x)
+  ret <vscale x 8 x half> %0
+}
+
+define <vscale x 8 x half> @test_svcvt_f16_f32_ptrue(double %z0, <vscale x 8 x half> %x, <vscale x 4 x float> %y ) {
+; CHECK-LABEL: test_svcvt_f16_f32_ptrue:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mov z0.d, z1.d
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    fcvt z0.h, p0/m, z2.s
+; CHECK-NEXT:    ret
+;
+; CHECK-2p2-LABEL: test_svcvt_f16_f32_ptrue:
+; CHECK-2p2:       // %bb.0: // %entry
+; CHECK-2p2-NEXT:    ptrue p0.s
+; CHECK-2p2-NEXT:    fcvt z0.h, p0/z, z2.s
+; CHECK-2p2-NEXT:    ret
+entry:
+  %pg = call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
+  %0 = tail call <vscale x 8 x half> @llvm.aarch64.sve.fcvt.f16f32(<vscale x 8 x half> %x, <vscale x 4 x i1> %pg, <vscale x 4 x float> %y)
+  ret <vscale x 8 x half> %0
+}
+
+define <vscale x 8 x bfloat> @test_svcvt_bf16_f32_ptrue_u(double %z0, <vscale x 4 x float> %x) {
+; CHECK-LABEL: test_svcvt_bf16_f32_ptrue_u:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    bfcvt z0.h, p0/m, z1.s
+; CHECK-NEXT:    ret
+;
+; CHECK-2p2-LABEL: test_svcvt_bf16_f32_ptrue_u:
+; CHECK-2p2:       // %bb.0: // %entry
+; CHECK-2p2-NEXT:    ptrue p0.s
+; CHECK-2p2-NEXT:    bfcvt z0.h, p0/z, z1.s
+; CHECK-2p2-NEXT:    ret
+entry:
+  %pg = call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
+  %0 = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fcvt.bf16f32.v2(<vscale x 8 x bfloat> poison, <vscale x 4 x i1> %pg, <vscale x 4 x float> %x)
+  ret <vscale x 8 x bfloat> %0
+}
+
+define <vscale x 8 x bfloat> @test_svcvt_bf16_f32_ptrue(double %z0, <vscale x 8 x bfloat> %x, <vscale x 4 x float> %y ) {
+; CHECK-LABEL: test_svcvt_bf16_f32_ptrue:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mov z0.d, z1.d
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    bfcvt z0.h, p0/m, z2.s
+; CHECK-NEXT:    ret
+;
+; CHECK-2p2-LABEL: test_svcvt_bf16_f32_ptrue:
+; CHECK-2p2:       // %bb.0: // %entry
+; CHECK-2p2-NEXT:    ptrue p0.s
+; CHECK-2p2-NEXT:    bfcvt z0.h, p0/z, z2.s
+; CHECK-2p2-NEXT:    ret
+entry:
+  %pg = call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
+  %0 = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fcvt.bf16f32.v2(<vscale x 8 x bfloat> %x, <vscale x 4 x i1> %pg, <vscale x 4 x float> %y)
+  ret <vscale x 8 x bfloat> %0
+}
+
+define <vscale x 8 x half> @test_svcvt_f16_f64_ptrue_u(double %z0, <vscale x 2 x double> %x) {
+; CHECK-LABEL: test_svcvt_f16_f64_ptrue_u:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    fcvt z0.h, p0/m, z1.d
+; CHECK-NEXT:    ret
+;
+; CHECK-2p2-LABEL: test_svcvt_f16_f64_ptrue_u:
+; CHECK-2p2:       // %bb.0: // %entry
+; CHECK-2p2-NEXT:    ptrue p0.d
+; CHECK-2p2-NEXT:    fcvt z0.h, p0/z, z1.d
+; CHECK-2p2-NEXT:    ret
+entry:
+  %pg = call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+  %0 = tail call <vscale x 8 x half> @llvm.aarch64.sve.fcvt.f16f64(<vscale x 8 x half> poison, <vscale x 2 x i1> %pg, <vscale x 2 x double> %x)
+  ret <vscale x 8 x half> %0
+}
+
+define <vscale x 8 x half> @test_svcvt_f16_f64_ptrue(double %z0, <vscale x 8 x half> %x, <vscale x 2 x double> %y ) {
+; CHECK-LABEL: test_svcvt_f16_f64_ptrue:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mov z0.d, z1.d
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    fcvt z0.h, p0/m, z2.d
+; CHECK-NEXT:    ret
+;
+; CHECK-2p2-LABEL: test_svcvt_f16_f64_ptrue:
+; CHECK-2p2:       // %bb.0: // %entry
+; CHECK-2p2-NEXT:    ptrue p0.d
+; CHECK-2p2-NEXT:    fcvt z0.h, p0/z, z2.d
+; CHECK-2p2-NEXT:    ret
+entry:
+  %pg = call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+  %0 = tail call <vscale x 8 x half> @llvm.aarch64.sve.fcvt.f16f64(<vscale x 8 x half> %x, <vscale x 2 x i1> %pg, <vscale x 2 x double> %y)
+  ret <vscale x 8 x half> %0
+}
+
+define <vscale x 4 x float> @test_svcvt_f32_f64_ptrue_u(double %z0, <vscale x 2 x double> %x) {
+; CHECK-LABEL: test_svcvt_f32_f64_ptrue_u:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    fcvt z0.s, p0/m, z1.d
+; CHECK-NEXT:    ret
+;
+; CHECK-2p2-LABEL: test_svcvt_f32_f64_ptrue_u:
+; CHECK-2p2:       // %bb.0: // %entry
+; CHECK-2p2-NEXT:    ptrue p0.d
+; CHECK-2p2-NEXT:    fcvt z0.s, p0/z, z1.d
+; CHECK-2p2-NEXT:    ret
+entry:
+  %pg = call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+  %0 = tail call <vscale x 4 x float> @llvm.aarch64.sve.fcvt.f32f64(<vscale x 4 x float> poison, <vscale x 2 x i1> %pg, <vscale x 2 x double> %x)
+  ret <vscale x 4 x float> %0
+}
+
+define <vscale x 4 x float> @test_svcvt_f32_f64_ptrue(double %z0, <vscale x 4 x float> %x, <vscale x 2 x double> %y ) {
+; CHECK-LABEL: test_svcvt_f32_f64_ptrue:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mov z0.d, z1.d
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    fcvt z0.s, p0/m, z2.d
+; CHECK-NEXT:    ret
+;
+; CHECK-2p2-LABEL: test_svcvt_f32_f64_ptrue:
+; CHECK-2p2:       // %bb.0: // %entry
+; CHECK-2p2-NEXT:    ptrue p0.d
+; CHECK-2p2-NEXT:    fcvt z0.s, p0/z, z2.d
+; CHECK-2p2-NEXT:    ret
+entry:
+  %pg = call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+  %0 = tail call <vscale x 4 x float> @llvm.aarch64.sve.fcvt.f32f64(<vscale x 4 x float> %x, <vscale x 2 x i1> %pg, <vscale x 2 x double> %y)
+  ret <vscale x 4 x float> %0
+}
+
+define <vscale x 4 x float> @test_svcvt_f32_f16_ptrue_u(double %z0, <vscale x 8 x half> %x) {
+; CHECK-LABEL: test_svcvt_f32_f16_ptrue_u:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    fcvt z0.s, p0/m, z1.h
+; CHECK-NEXT:    ret
+;
+; CHECK-2p2-LABEL: test_svcvt_f32_f16_ptrue_u:
+; CHECK-2p2:       // %bb.0: // %entry
+; CHECK-2p2-NEXT:    ptrue p0.s
+; CHECK-2p2-NEXT:    fcvt z0.s, p0/z, z1.h
+; CHECK-2p2-NEXT:    ret
+entry:
+  %pg = call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
+  %0 = tail call <vscale x 4 x float> @llvm.aarch64.sve.fcvt.f32f16(<vscale x 4 x float> poison, <vscale x 4 x i1> %pg, <vscale x 8 x half> %x)
+  ret <vscale x 4 x float> %0
+}
+
+define <vscale x 4 x float> @test_svcvt_f32_f16_ptrue(double %z0, <vscale x 4 x float> %x, <vscale x 8 x half> %y ) {
+; CHECK-LABEL: test_svcvt_f32_f16_ptrue:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mov z0.d, z1.d
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    fcvt z0.s, p0/m, z2.h
+; CHECK-NEXT:    ret
+;
+; CHECK-2p2-LABEL: test_svcvt_f32_f16_ptrue:
+; CHECK-2p2:       // %bb.0: // %entry
+; CHECK-2p2-NEXT:    ptrue p0.s
+; CHECK-2p2-NEXT:    fcvt z0.s, p0/z, z2.h
+; CHECK-2p2-NEXT:    ret
+entry:
+  %pg = call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
+  %0 = tail call <vscale x 4 x float> @llvm.aarch64.sve.fcvt.f32f16(<vscale x 4 x float> %x, <vscale x 4 x i1> %pg, <vscale x 8 x half> %y)
+  ret <vscale x 4 x float> %0
+}
+
+define <vscale x 2 x double> @test_svcvt_f64_f16_ptrue_u(double %z0, <vscale x 8 x half> %x) {
+; CHECK-LABEL: test_svcvt_f64_f16_ptrue_u:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    fcvt z0.d, p0/m, z1.h
+; CHECK-NEXT:    ret
+;
+; CHECK-2p2-LABEL: test_svcvt_f64_f16_ptrue_u:
+; CHECK-2p2:       // %bb.0: // %entry
+; CHECK-2p2-NEXT:    ptrue p0.d
+; CHECK-2p2-NEXT:    fcvt z0.d, p0/z, z1.h
+; CHECK-2p2-NEXT:    ret
+entry:
+  %pg = call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+  %0 = tail call <vscale x 2 x double> @llvm.aarch64.sve.fcvt.f64f16(<vscale x 2 x double> poison, <vscale x 2 x i1> %pg, <vscale x 8 x half> %x)
+  ret <vscale x 2 x double> %0
+}
+
+define <vscale x 2 x double> @test_svcvt_f64_f16_ptrue(double %z0, <vscale x 2 x double> %x, <vscale x 8 x half> %y ) {
+; CHECK-LABEL: test_svcvt_f64_f16_ptrue:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mov z0.d, z1.d
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    fcvt z0.d, p0/m, z2.h
+; CHECK-NEXT:    ret
+;
+; CHECK-2p2-LABEL: test_svcvt_f64_f16_ptrue:
+; CHECK-2p2:       // %bb.0: // %entry
+; CHECK-2p2-NEXT:    ptrue p0.d
+; CHECK-2p2-NEXT:    fcvt z0.d, p0/z, z2.h
+; CHECK-2p2-NEXT:    ret
+entry:
+  %pg = call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+  %0 = tail call <vscale x 2 x double> @llvm.aarch64.sve.fcvt.f64f16(<vscale x 2 x double> %x, <vscale x 2 x i1> %pg, <vscale x 8 x half> %y)
+  ret <vscale x 2 x double> %0
+}
+
+define <vscale x 2 x double> @test_svcvt_f64_f32_ptrue_u(double %z0, <vscale x 4 x float> %x) {
+; CHECK-LABEL: test_svcvt_f64_f32_ptrue_u:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    fcvt z0.d, p0/m, z1.s
+; CHECK-NEXT:    ret
+;
+; CHECK-2p2-LABEL: test_svcvt_f64_f32_ptrue_u:
+; CHECK-2p2:       // %bb.0: // %entry
+; CHECK-2p2-NEXT:    ptrue p0.d
+; CHECK-2p2-NEXT:    fcvt z0.d, p0/z, z1.s
+; CHECK-2p2-NEXT:    ret
+entry:
+  %pg = call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+  %0 = tail call <vscale x 2 x double> @llvm.aarch64.sve.fcvt.f64f32(<vscale x 2 x double> poison, <vscale x 2 x i1> %pg, <vscale x 4 x float> %x)
+  ret <vscale x 2 x double> %0
+}
+
+define <vscale x 2 x double> @test_svcvt_f64_f32_ptrue(double %z0, <vscale x 2 x double> %x, <vscale x 4 x float> %y ) {
+; CHECK-LABEL: test_svcvt_f64_f32_ptrue:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mov z0.d, z1.d
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    fcvt z0.d, p0/m, z2.s
+; CHECK-NEXT:    ret
+;
+; CHECK-2p2-LABEL: test_svcvt_f64_f32_ptrue:
+; CHECK-2p2:       // %bb.0: // %entry
+; CHECK-2p2-NEXT:    ptrue p0.d
+; CHECK-2p2-NEXT:    fcvt z0.d, p0/z, z2.s
+; CHECK-2p2-NEXT:    ret
+entry:
+  %pg = call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+  %0 = tail call <vscale x 2 x double> @llvm.aarch64.sve.fcvt.f64f32(<vscale x 2 x double> %x, <vscale x 2 x i1> %pg, <vscale x 4 x float> %y)
+  ret <vscale x 2 x double> %0
+}
diff --git a/llvm/test/CodeGen/AArch64/zeroing-forms-fcvtlt-fcvtx.ll b/llvm/test/CodeGen/AArch64/zeroing-forms-fcvtlt-fcvtx.ll
index 60879b1..c7431e1 100644
--- a/llvm/test/CodeGen/AArch64/zeroing-forms-fcvtlt-fcvtx.ll
+++ b/llvm/test/CodeGen/AArch64/zeroing-forms-fcvtlt-fcvtx.ll
@@ -18,7 +18,7 @@ define <vscale x 4 x float> @test_svcvtlt_f32_f16_x_1(<vscale x 4 x i1> %pg, <vs
 ; CHECK-2p2-NEXT:    fcvtlt z0.s, p0/z, z0.h
 ; CHECK-2p2-NEXT:    ret
 entry:
-  %0 = tail call <vscale x 4 x float> @llvm.aarch64.sve.fcvtlt.f32f16(<vscale x 4 x float> undef, <vscale x 4 x i1> %pg, <vscale x 8 x half> %x)
+  %0 = tail call <vscale x 4 x float> @llvm.aarch64.sve.fcvtlt.f32f16(<vscale x 4 x float> poison, <vscale x 4 x i1> %pg, <vscale x 8 x half> %x)
   ret <vscale x 4 x float> %0
 }
 
@@ -33,7 +33,7 @@ define <vscale x 4 x float> @test_svcvtlt_f32_f16_x_2(<vscale x 4 x i1> %pg, dou
 ; CHECK-2p2-NEXT:    fcvtlt z0.s, p0/z, z1.h
 ; CHECK-2p2-NEXT:    ret
 entry:
-  %0 = tail call <vscale x 4 x float> @llvm.aarch64.sve.fcvtlt.f32f16(<vscale x 4 x float> undef, <vscale x 4 x i1> %pg, <vscale x 8 x half> %x)
+  %0 = tail call <vscale x 4 x float> @llvm.aarch64.sve.fcvtlt.f32f16(<vscale x 4 x float> poison, <vscale x 4 x i1> %pg, <vscale x 8 x half> %x)
   ret <vscale x 4 x float> %0
 }
 
@@ -64,7 +64,7 @@ define <vscale x 2 x double> @test_svcvtlt_f64_f32_x_1(<vscale x 2 x i1> %pg, <v
 ; CHECK-2p2-NEXT:    fcvtlt z0.d, p0/z, z0.s
 ; CHECK-2p2-NEXT:    ret
 entry:
-  %0 = tail call <vscale x 2 x double> @llvm.aarch64.sve.fcvtlt.f64f32(<vscale x 2 x double> undef, <vscale x 2 x i1> %pg, <vscale x 4 x float> %x)
+  %0 = tail call <vscale x 2 x double> @llvm.aarch64.sve.fcvtlt.f64f32(<vscale x 2 x double> poison, <vscale x 2 x i1> %pg, <vscale x 4 x float> %x)
   ret <vscale x 2 x double> %0
 }
 
@@ -79,7 +79,7 @@ define <vscale x 2 x double> @test_svcvtlt_f64_f32_x_2(<vscale x 2 x i1> %pg, do
 ; CHECK-2p2-NEXT:    fcvtlt z0.d, p0/z, z1.s
 ; CHECK-2p2-NEXT:    ret
 entry:
-  %0 = tail call <vscale x 2 x double> @llvm.aarch64.sve.fcvtlt.f64f32(<vscale x 2 x double> undef, <vscale x 2 x i1> %pg, <vscale x 4 x float> %x)
+  %0 = tail call <vscale x 2 x double> @llvm.aarch64.sve.fcvtlt.f64f32(<vscale x 2 x double> poison, <vscale x 2 x i1> %pg, <vscale x 4 x float> %x)
   ret <vscale x 2 x double> %0
 }
 
@@ -110,7 +110,7 @@ define <vscale x 4 x float> @test_svcvtx_f32_f64_x_1(<vscale x 2 x i1> %pg, <vsc
 ; CHECK-2p2-NEXT:    fcvtx z0.s, p0/z, z0.d
 ; CHECK-2p2-NEXT:    ret
 entry:
-  %0 = tail call <vscale x 4 x float> @llvm.aarch64.sve.fcvtx.f32f64(<vscale x 4 x float> undef, <vscale x 2 x i1> %pg, <vscale x 2 x double> %x)
+  %0 = tail call <vscale x 4 x float> @llvm.aarch64.sve.fcvtx.f32f64(<vscale x 4 x float> poison, <vscale x 2 x i1> %pg, <vscale x 2 x double> %x)
   ret <vscale x 4 x float> %0
 }
 
@@ -125,7 +125,7 @@ define <vscale x 4 x float> @test_svcvtx_f32_f64_x_2(<vscale x 2 x i1> %pg, doub
 ; CHECK-2p2-NEXT:    fcvtx z0.s, p0/z, z1.d
 ; CHECK-2p2-NEXT:    ret
 entry:
-  %0 = tail call <vscale x 4 x float> @llvm.aarch64.sve.fcvtx.f32f64(<vscale x 4 x float> undef, <vscale x 2 x i1> %pg, <vscale x 2 x double> %x)
+  %0 = tail call <vscale x 4 x float> @llvm.aarch64.sve.fcvtx.f32f64(<vscale x 4 x float> poison, <vscale x 2 x i1> %pg, <vscale x 2 x double> %x)
   ret <vscale x 4 x float> %0
 }
 
@@ -144,3 +144,114 @@ entry:
   %0 = tail call <vscale x 4 x float> @llvm.aarch64.sve.fcvtx.f32f64(<vscale x 4 x float> zeroinitializer, <vscale x 2 x i1> %pg, <vscale x 2 x double> %x)
   ret <vscale x 4 x float> %0
 }
+
+define <vscale x 4 x float> @test_svcvtlt_f32_f16_ptrue_u(double %z0, <vscale x 8 x half> %x) {
+; CHECK-LABEL: test_svcvtlt_f32_f16_ptrue_u:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    fcvtlt z0.s, p0/m, z1.h
+; CHECK-NEXT:    ret
+;
+; CHECK-2p2-LABEL: test_svcvtlt_f32_f16_ptrue_u:
+; CHECK-2p2:       // %bb.0: // %entry
+; CHECK-2p2-NEXT:    ptrue p0.s
+; CHECK-2p2-NEXT:    fcvtlt z0.s, p0/z, z1.h
+; CHECK-2p2-NEXT:    ret
+entry:
+  %pg = call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
+  %0 = tail call <vscale x 4 x float> @llvm.aarch64.sve.fcvtlt.f32f16(<vscale x 4 x float> poison, <vscale x 4 x i1> %pg, <vscale x 8 x half> %x)
+  ret <vscale x 4 x float> %0
+}
+
+define <vscale x 4 x float> @test_svcvtlt_f32_f16_ptrue(double %z0, <vscale x 4 x float> %x, <vscale x 8 x half> %y) {
+; CHECK-LABEL: test_svcvtlt_f32_f16_ptrue:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mov z0.d, z1.d
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    fcvtlt z0.s, p0/m, z2.h
+; CHECK-NEXT:    ret
+;
+; CHECK-2p2-LABEL: test_svcvtlt_f32_f16_ptrue:
+; CHECK-2p2:       // %bb.0: // %entry
+; CHECK-2p2-NEXT:    ptrue p0.s
+; CHECK-2p2-NEXT:    fcvtlt z0.s, p0/z, z2.h
+; CHECK-2p2-NEXT:    ret
+entry:
+  %pg = call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
+  %0 = tail call <vscale x 4 x float> @llvm.aarch64.sve.fcvtlt.f32f16(<vscale x 4 x float> %x, <vscale x 4 x i1> %pg, <vscale x 8 x half> %y)
+  ret <vscale x 4 x float> %0
+}
+
+define <vscale x 2 x double> @test_svcvtlt_f64_f32_ptrue_u(double %z0, <vscale x 4 x float> %x) {
+; CHECK-LABEL: test_svcvtlt_f64_f32_ptrue_u:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    fcvtlt z0.d, p0/m, z1.s
+; CHECK-NEXT:    ret
+;
+; CHECK-2p2-LABEL: test_svcvtlt_f64_f32_ptrue_u:
+; CHECK-2p2:       // %bb.0: // %entry
+; CHECK-2p2-NEXT:    ptrue p0.d
+; CHECK-2p2-NEXT:    fcvtlt z0.d, p0/z, z1.s
+; CHECK-2p2-NEXT:    ret
+entry:
+  %pg = call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+  %0 = tail call <vscale x 2 x double> @llvm.aarch64.sve.fcvtlt.f64f32(<vscale x 2 x double> poison, <vscale x 2 x i1> %pg, <vscale x 4 x float> %x)
+  ret <vscale x 2 x double> %0
+}
+
+define <vscale x 2 x double> @test_svcvtlt_f64_f32_ptrue(double %z0, <vscale x 2 x double> %x, <vscale x 4 x float> %y) {
+; CHECK-LABEL: test_svcvtlt_f64_f32_ptrue:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mov z0.d, z1.d
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    fcvtlt z0.d, p0/m, z2.s
+; CHECK-NEXT:    ret
+;
+; CHECK-2p2-LABEL: test_svcvtlt_f64_f32_ptrue:
+; CHECK-2p2:       // %bb.0: // %entry
+; CHECK-2p2-NEXT:    ptrue p0.d
+; CHECK-2p2-NEXT:    fcvtlt z0.d, p0/z, z2.s
+; CHECK-2p2-NEXT:    ret
+entry:
+  %pg = call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+  %0 = tail call <vscale x 2 x double> @llvm.aarch64.sve.fcvtlt.f64f32(<vscale x 2 x double> %x, <vscale x 2 x i1> %pg, <vscale x 4 x float> %y)
+  ret <vscale x 2 x double> %0
+}
+
+define <vscale x 4 x float> @test_svcvtx_f32_f64_ptrue_u(double %z0, <vscale x 2 x double> %x) {
+; CHECK-LABEL: test_svcvtx_f32_f64_ptrue_u:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    fcvtx z0.s, p0/m, z1.d
+; CHECK-NEXT:    ret
+;
+; CHECK-2p2-LABEL: test_svcvtx_f32_f64_ptrue_u:
+; CHECK-2p2:       // %bb.0: // %entry
+; CHECK-2p2-NEXT:    ptrue p0.d
+; CHECK-2p2-NEXT:    fcvtx z0.s, p0/z, z1.d
+; CHECK-2p2-NEXT:    ret
+entry:
+  %pg = call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+  %0 = tail call <vscale x 4 x float> @llvm.aarch64.sve.fcvtx.f32f64(<vscale x 4 x float> poison, <vscale x 2 x i1> %pg, <vscale x 2 x double> %x)
+  ret <vscale x 4 x float> %0
+}
+
+define <vscale x 4 x float> @test_svcvtx_f32_f64_ptrue(double %z0, <vscale x 4 x float> %x, <vscale x 2 x double> %y) {
+; CHECK-LABEL: test_svcvtx_f32_f64_ptrue:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mov z0.d, z1.d
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    fcvtx z0.s, p0/m, z2.d
+; CHECK-NEXT:    ret
+;
+; CHECK-2p2-LABEL: test_svcvtx_f32_f64_ptrue:
+; CHECK-2p2:       // %bb.0: // %entry
+; CHECK-2p2-NEXT:    ptrue p0.d
+; CHECK-2p2-NEXT:    fcvtx z0.s, p0/z, z2.d
+; CHECK-2p2-NEXT:    ret
+entry:
+  %pg = call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+  %0 = tail call <vscale x 4 x float> @llvm.aarch64.sve.fcvtx.f32f64(<vscale x 4 x float> %x, <vscale x 2 x i1> %pg, <vscale x 2 x double> %y)
+  ret <vscale x 4 x float> %0
+}
diff --git a/llvm/test/CodeGen/AArch64/zeroing-forms-fcvtzsu.ll b/llvm/test/CodeGen/AArch64/zeroing-forms-fcvtzsu.ll
index b8b36d3..7259502 100644
--- a/llvm/test/CodeGen/AArch64/zeroing-forms-fcvtzsu.ll
+++ b/llvm/test/CodeGen/AArch64/zeroing-forms-fcvtzsu.ll
@@ -18,7 +18,7 @@ define <vscale x 4 x i32> @test_fcvtzs_s32_f64_x_1(<vscale x 2 x i1> %pg, <vscal
 ; CHECK-2p2-NEXT:    fcvtzs z0.s, p0/z, z0.d
 ; CHECK-2p2-NEXT:    ret
 entry:
-  %0 = tail call <vscale x 4 x i32> @llvm.aarch64.sve.fcvtzs.i32f64(<vscale x 4 x i32> undef, <vscale x 2 x i1> %pg, <vscale x 2 x double> %x)
+  %0 = tail call <vscale x 4 x i32> @llvm.aarch64.sve.fcvtzs.i32f64(<vscale x 4 x i32> poison, <vscale x 2 x i1> %pg, <vscale x 2 x double> %x)
   ret <vscale x 4 x i32> %0
 }
 
@@ -33,7 +33,7 @@ define <vscale x 4 x i32> @test_fcvtzs_s32_f64_x_2(<vscale x 2 x i1> %pg, double
 ; CHECK-2p2-NEXT:    fcvtzs z0.s, p0/z, z1.d
 ; CHECK-2p2-NEXT:    ret
 entry:
-  %0 = tail call <vscale x 4 x i32> @llvm.aarch64.sve.fcvtzs.i32f64(<vscale x 4 x i32> undef, <vscale x 2 x i1> %pg, <vscale x 2 x double> %x)
+  %0 = tail call <vscale x 4 x i32> @llvm.aarch64.sve.fcvtzs.i32f64(<vscale x 4 x i32> poison, <vscale x 2 x i1> %pg, <vscale x 2 x double> %x)
   ret <vscale x 4 x i32> %0
 }
 
@@ -64,7 +64,7 @@ define <vscale x 2 x i64> @test_fcvtzs_s64_f32_x_1(<vscale x 2 x i1> %pg, <vscal
 ; CHECK-2p2-NEXT:    fcvtzs z0.d, p0/z, z0.s
 ; CHECK-2p2-NEXT:    ret
 entry:
-  %0 = tail call <vscale x 2 x i64> @llvm.aarch64.sve.fcvtzs.i64f32(<vscale x 2 x i64> undef, <vscale x 2 x i1> %pg, <vscale x 4 x float> %x)
+  %0 = tail call <vscale x 2 x i64> @llvm.aarch64.sve.fcvtzs.i64f32(<vscale x 2 x i64> poison, <vscale x 2 x i1> %pg, <vscale x 4 x float> %x)
   ret <vscale x 2 x i64> %0
 }
 
@@ -79,7 +79,7 @@ define <vscale x 2 x i64> @test_fcvtzs_s64_f32_x_2(<vscale x 2 x i1> %pg, double
 ; CHECK-2p2-NEXT:    fcvtzs z0.d, p0/z, z1.s
 ; CHECK-2p2-NEXT:    ret
 entry:
-  %0 = tail call <vscale x 2 x i64> @llvm.aarch64.sve.fcvtzs.i64f32(<vscale x 2 x i64> undef, <vscale x 2 x i1> %pg, <vscale x 4 x float> %x)
+  %0 = tail call <vscale x 2 x i64> @llvm.aarch64.sve.fcvtzs.i64f32(<vscale x 2 x i64> poison, <vscale x 2 x i1> %pg, <vscale x 4 x float> %x)
   ret <vscale x 2 x i64> %0
 }
 
@@ -110,7 +110,7 @@ define <vscale x 4 x i32> @test_fcvtzs_s32_f16_x_1(<vscale x 4 x i1> %pg, <vscal
 ; CHECK-2p2-NEXT:    fcvtzs z0.s, p0/z, z0.h
 ; CHECK-2p2-NEXT:    ret
 entry:
-  %0 = tail call <vscale x 4 x i32> @llvm.aarch64.sve.fcvtzs.i32f16(<vscale x 4 x i32> undef, <vscale x 4 x i1> %pg, <vscale x 8 x half> %x)
+  %0 = tail call <vscale x 4 x i32> @llvm.aarch64.sve.fcvtzs.i32f16(<vscale x 4 x i32> poison, <vscale x 4 x i1> %pg, <vscale x 8 x half> %x)
   ret <vscale x 4 x i32> %0
 }
 
@@ -125,7 +125,7 @@ define <vscale x 4 x i32> @test_fcvtzs_s32_f16_x_2(<vscale x 4 x i1> %pg, double
 ; CHECK-2p2-NEXT:    fcvtzs z0.s, p0/z, z1.h
 ; CHECK-2p2-NEXT:    ret
 entry:
-  %0 = tail call <vscale x 4 x i32> @llvm.aarch64.sve.fcvtzs.i32f16(<vscale x 4 x i32> undef, <vscale x 4 x i1> %pg, <vscale x 8 x half> %x)
+  %0 = tail call <vscale x 4 x i32> @llvm.aarch64.sve.fcvtzs.i32f16(<vscale x 4 x i32> poison, <vscale x 4 x i1> %pg, <vscale x 8 x half> %x)
   ret <vscale x 4 x i32> %0
 }
 
@@ -156,7 +156,7 @@ define <vscale x 2 x i64> @test_fcvtzs_s64_f16_x_1(<vscale x 2 x i1> %pg, <vscal
 ; CHECK-2p2-NEXT:    fcvtzs z0.d, p0/z, z0.h
 ; CHECK-2p2-NEXT:    ret
 entry:
-  %0 = tail call <vscale x 2 x i64> @llvm.aarch64.sve.fcvtzs.i64f16(<vscale x 2 x i64> undef, <vscale x 2 x i1> %pg, <vscale x 8 x half> %x)
+  %0 = tail call <vscale x 2 x i64> @llvm.aarch64.sve.fcvtzs.i64f16(<vscale x 2 x i64> poison, <vscale x 2 x i1> %pg, <vscale x 8 x half> %x)
   ret <vscale x 2 x i64> %0
 }
 
@@ -171,7 +171,7 @@ define <vscale x 2 x i64> @test_fcvtzs_s64_f16_x_2(<vscale x 2 x i1> %pg, double
 ; CHECK-2p2-NEXT:    fcvtzs z0.d, p0/z, z1.h
 ; CHECK-2p2-NEXT:    ret
 entry:
-  %0 = tail call <vscale x 2 x i64> @llvm.aarch64.sve.fcvtzs.i64f16(<vscale x 2 x i64> undef, <vscale x 2 x i1> %pg, <vscale x 8 x half> %x)
+  %0 = tail call <vscale x 2 x i64> @llvm.aarch64.sve.fcvtzs.i64f16(<vscale x 2 x i64> poison, <vscale x 2 x i1> %pg, <vscale x 8 x half> %x)
   ret <vscale x 2 x i64> %0
 }
 
@@ -202,7 +202,7 @@ define <vscale x 4 x i32> @test_fcvtzu_u32_f64_x_1(<vscale x 2 x i1> %pg, <vscal
 ; CHECK-2p2-NEXT:    fcvtzu z0.s, p0/z, z0.d
 ; CHECK-2p2-NEXT:    ret
 entry:
-  %0 = tail call <vscale x 4 x i32> @llvm.aarch64.sve.fcvtzu.i32f64(<vscale x 4 x i32> undef, <vscale x 2 x i1> %pg, <vscale x 2 x double> %x)
+  %0 = tail call <vscale x 4 x i32> @llvm.aarch64.sve.fcvtzu.i32f64(<vscale x 4 x i32> poison, <vscale x 2 x i1> %pg, <vscale x 2 x double> %x)
   ret <vscale x 4 x i32> %0
 }
 
@@ -217,7 +217,7 @@ define <vscale x 4 x i32> @test_fcvtzu_u32_f64_x_2(<vscale x 2 x i1> %pg, double
 ; CHECK-2p2-NEXT:    fcvtzu z0.s, p0/z, z1.d
 ; CHECK-2p2-NEXT:    ret
 entry:
-  %0 = tail call <vscale x 4 x i32> @llvm.aarch64.sve.fcvtzu.i32f64(<vscale x 4 x i32> undef, <vscale x 2 x i1> %pg, <vscale x 2 x double> %x)
+  %0 = tail call <vscale x 4 x i32> @llvm.aarch64.sve.fcvtzu.i32f64(<vscale x 4 x i32> poison, <vscale x 2 x i1> %pg, <vscale x 2 x double> %x)
   ret <vscale x 4 x i32> %0
 }
 
@@ -248,7 +248,7 @@ define <vscale x 2 x i64> @test_fcvtzu_u64_f32_x_1(<vscale x 2 x i1> %pg, <vscal
 ; CHECK-2p2-NEXT:    fcvtzu z0.d, p0/z, z0.s
 ; CHECK-2p2-NEXT:    ret
 entry:
-  %0 = tail call <vscale x 2 x i64> @llvm.aarch64.sve.fcvtzu.i64f32(<vscale x 2 x i64> undef, <vscale x 2 x i1> %pg, <vscale x 4 x float> %x)
+  %0 = tail call <vscale x 2 x i64> @llvm.aarch64.sve.fcvtzu.i64f32(<vscale x 2 x i64> poison, <vscale x 2 x i1> %pg, <vscale x 4 x float> %x)
   ret <vscale x 2 x i64> %0
 }
 
@@ -263,7 +263,7 @@ define <vscale x 2 x i64> @test_fcvtzu_u64_f32_x_2(<vscale x 2 x i1> %pg, double
 ; CHECK-2p2-NEXT:    fcvtzu z0.d, p0/z, z1.s
 ; CHECK-2p2-NEXT:    ret
 entry:
-  %0 = tail call <vscale x 2 x i64> @llvm.aarch64.sve.fcvtzu.i64f32(<vscale x 2 x i64> undef, <vscale x 2 x i1> %pg, <vscale x 4 x float> %x)
+  %0 = tail call <vscale x 2 x i64> @llvm.aarch64.sve.fcvtzu.i64f32(<vscale x 2 x i64> poison, <vscale x 2 x i1> %pg, <vscale x 4 x float> %x)
   ret <vscale x 2 x i64> %0
 }
 
@@ -294,7 +294,7 @@ define <vscale x 4 x i32> @test_fcvtzu_u32_f16_x_1(<vscale x 4 x i1> %pg, <vscal
 ; CHECK-2p2-NEXT:    fcvtzu z0.s, p0/z, z0.h
 ; CHECK-2p2-NEXT:    ret
 entry:
-  %0 = tail call <vscale x 4 x i32> @llvm.aarch64.sve.fcvtzu.i32f16(<vscale x 4 x i32> undef, <vscale x 4 x i1> %pg, <vscale x 8 x half> %x)
+  %0 = tail call <vscale x 4 x i32> @llvm.aarch64.sve.fcvtzu.i32f16(<vscale x 4 x i32> poison, <vscale x 4 x i1> %pg, <vscale x 8 x half> %x)
   ret <vscale x 4 x i32> %0
 }
 
@@ -309,7 +309,7 @@ define <vscale x 4 x i32> @test_fcvtzu_u32_f16_x_2(<vscale x 4 x i1> %pg, double
 ; CHECK-2p2-NEXT:    fcvtzu z0.s, p0/z, z1.h
 ; CHECK-2p2-NEXT:    ret
 entry:
-  %0 = tail call <vscale x 4 x i32> @llvm.aarch64.sve.fcvtzu.i32f16(<vscale x 4 x i32> undef, <vscale x 4 x i1> %pg, <vscale x 8 x half> %x)
+  %0 = tail call <vscale x 4 x i32> @llvm.aarch64.sve.fcvtzu.i32f16(<vscale x 4 x i32> poison, <vscale x 4 x i1> %pg, <vscale x 8 x half> %x)
   ret <vscale x 4 x i32> %0
 }
 
@@ -340,7 +340,7 @@ define <vscale x 2 x i64> @test_fcvtzu_u64_f16_x_1(<vscale x 2 x i1> %pg, <vscal
 ; CHECK-2p2-NEXT:    fcvtzu z0.d, p0/z, z0.h
 ; CHECK-2p2-NEXT:    ret
 entry:
-  %0 = tail call <vscale x 2 x i64> @llvm.aarch64.sve.fcvtzu.i64f16(<vscale x 2 x i64> undef, <vscale x 2 x i1> %pg, <vscale x 8 x half> %x)
+  %0 = tail call <vscale x 2 x i64> @llvm.aarch64.sve.fcvtzu.i64f16(<vscale x 2 x i64> poison, <vscale x 2 x i1> %pg, <vscale x 8 x half> %x)
   ret <vscale x 2 x i64> %0
 }
 
@@ -355,7 +355,7 @@ define <vscale x 2 x i64> @test_fcvtzu_u64_f16_x_2(<vscale x 2 x i1> %pg, double
 ; CHECK-2p2-NEXT:    fcvtzu z0.d, p0/z, z1.h
 ; CHECK-2p2-NEXT:    ret
 entry:
-  %0 = tail call <vscale x 2 x i64> @llvm.aarch64.sve.fcvtzu.i64f16(<vscale x 2 x i64> undef, <vscale x 2 x i1> %pg, <vscale x 8 x half> %x)
+  %0 = tail call <vscale x 2 x i64> @llvm.aarch64.sve.fcvtzu.i64f16(<vscale x 2 x i64> poison, <vscale x 2 x i1> %pg, <vscale x 8 x half> %x)
   ret <vscale x 2 x i64> %0
 }
 
@@ -387,7 +387,7 @@ define <vscale x 8 x i16> @test_svcvt_s16_f16_x_1(<vscale x 8 x i1> %pg, <vscale
 ; CHECK-2p2-NEXT:    fcvtzs z0.h, p0/z, z0.h
 ; CHECK-2p2-NEXT:    ret
 entry:
-  %0 = tail call <vscale x 8 x i16> @llvm.aarch64.sve.fcvtzs.nxv8i16.nxv8f16(<vscale x 8 x i16> undef, <vscale x 8 x i1> %pg, <vscale x 8 x half> %x)
+  %0 = tail call <vscale x 8 x i16> @llvm.aarch64.sve.fcvtzs.nxv8i16.nxv8f16(<vscale x 8 x i16> poison, <vscale x 8 x i1> %pg, <vscale x 8 x half> %x)
   ret <vscale x 8 x i16> %0
 }
 
@@ -403,7 +403,7 @@ define <vscale x 8 x i16> @test_svcvt_s16_f16_x_2(<vscale x 8 x i1> %pg, double
 ; CHECK-2p2-NEXT:    fcvtzs z0.h, p0/z, z1.h
 ; CHECK-2p2-NEXT:    ret
 entry:
-  %0 = tail call <vscale x 8 x i16> @llvm.aarch64.sve.fcvtzs.nxv8i16.nxv8f16(<vscale x 8 x i16> undef, <vscale x 8 x i1> %pg, <vscale x 8 x half> %x)
+  %0 = tail call <vscale x 8 x i16> @llvm.aarch64.sve.fcvtzs.nxv8i16.nxv8f16(<vscale x 8 x i16> poison, <vscale x 8 x i1> %pg, <vscale x 8 x half> %x)
   ret <vscale x 8 x i16> %0
 }
 
@@ -434,7 +434,7 @@ define <vscale x 8 x i16> @test_svcvt_u16_f16_x_1(<vscale x 8 x i1> %pg, <vscale
 ; CHECK-2p2-NEXT:    fcvtzu z0.h, p0/z, z0.h
 ; CHECK-2p2-NEXT:    ret
 entry:
-  %0 = tail call <vscale x 8 x i16> @llvm.aarch64.sve.fcvtzu.nxv8i16.nxv8f16(<vscale x 8 x i16> undef, <vscale x 8 x i1> %pg, <vscale x 8 x half> %x)
+  %0 = tail call <vscale x 8 x i16> @llvm.aarch64.sve.fcvtzu.nxv8i16.nxv8f16(<vscale x 8 x i16> poison, <vscale x 8 x i1> %pg, <vscale x 8 x half> %x)
   ret <vscale x 8 x i16> %0
 }
 
@@ -450,7 +450,7 @@ define <vscale x 8 x i16> @test_svcvt_u16_f16_x_2(<vscale x 8 x i1> %pg, double
 ; CHECK-2p2-NEXT:    fcvtzu z0.h, p0/z, z1.h
 ; CHECK-2p2-NEXT:    ret
 entry:
-  %0 = tail call <vscale x 8 x i16> @llvm.aarch64.sve.fcvtzu.nxv8i16.nxv8f16(<vscale x 8 x i16> undef, <vscale x 8 x i1> %pg, <vscale x 8 x half> %x)
+  %0 = tail call <vscale x 8 x i16> @llvm.aarch64.sve.fcvtzu.nxv8i16.nxv8f16(<vscale x 8 x i16> poison, <vscale x 8 x i1> %pg, <vscale x 8 x half> %x)
   ret <vscale x 8 x i16> %0
 }
 
@@ -481,7 +481,7 @@ define <vscale x 4 x i32> @test_svcvt_s32_f32_x_1(<vscale x 4 x i1> %pg, <vscale
 ; CHECK-2p2-NEXT:    fcvtzs z0.s, p0/z, z0.s
 ; CHECK-2p2-NEXT:    ret
 entry:
-  %0 = tail call <vscale x 4 x i32> @llvm.aarch64.sve.fcvtzs.nxv4i32.nxv4f32(<vscale x 4 x i32> undef, <vscale x 4 x i1> %pg, <vscale x 4 x float> %x)
+  %0 = tail call <vscale x 4 x i32> @llvm.aarch64.sve.fcvtzs.nxv4i32.nxv4f32(<vscale x 4 x i32> poison, <vscale x 4 x i1> %pg, <vscale x 4 x float> %x)
   ret <vscale x 4 x i32> %0
 }
 
@@ -497,7 +497,7 @@ define <vscale x 4 x i32> @test_svcvt_s32_f32_x_2(<vscale x 4 x i1> %pg, double
 ; CHECK-2p2-NEXT:    fcvtzs z0.s, p0/z, z1.s
 ; CHECK-2p2-NEXT:    ret
 entry:
-  %0 = tail call <vscale x 4 x i32> @llvm.aarch64.sve.fcvtzs.nxv4i32.nxv4f32(<vscale x 4 x i32> undef, <vscale x 4 x i1> %pg, <vscale x 4 x float> %x)
+  %0 = tail call <vscale x 4 x i32> @llvm.aarch64.sve.fcvtzs.nxv4i32.nxv4f32(<vscale x 4 x i32> poison, <vscale x 4 x i1> %pg, <vscale x 4 x float> %x)
   ret <vscale x 4 x i32> %0
 }
 
@@ -528,7 +528,7 @@ define <vscale x 4 x i32> @test_svcvt_u32_f32_x_1(<vscale x 4 x i1> %pg, <vscale
 ; CHECK-2p2-NEXT:    fcvtzu z0.s, p0/z, z0.s
 ; CHECK-2p2-NEXT:    ret
 entry:
-  %0 = tail call <vscale x 4 x i32> @llvm.aarch64.sve.fcvtzu.nxv4i32.nxv4f32(<vscale x 4 x i32> undef, <vscale x 4 x i1> %pg, <vscale x 4 x float> %x)
+  %0 = tail call <vscale x 4 x i32> @llvm.aarch64.sve.fcvtzu.nxv4i32.nxv4f32(<vscale x 4 x i32> poison, <vscale x 4 x i1> %pg, <vscale x 4 x float> %x)
   ret <vscale x 4 x i32> %0
 }
 
@@ -544,7 +544,7 @@ define <vscale x 4 x i32> @test_svcvt_u32_f32_x_2(<vscale x 4 x i1> %pg, double
 ; CHECK-2p2-NEXT:    fcvtzu z0.s, p0/z, z1.s
 ; CHECK-2p2-NEXT:    ret
 entry:
-  %0 = tail call <vscale x 4 x i32> @llvm.aarch64.sve.fcvtzu.nxv4i32.nxv4f32(<vscale x 4 x i32> undef, <vscale x 4 x i1> %pg, <vscale x 4 x float> %x)
+  %0 = tail call <vscale x 4 x i32> @llvm.aarch64.sve.fcvtzu.nxv4i32.nxv4f32(<vscale x 4 x i32> poison, <vscale x 4 x i1> %pg, <vscale x 4 x float> %x)
   ret <vscale x 4 x i32> %0
 }
 
@@ -575,7 +575,7 @@ define <vscale x 2 x i64> @test_svcvt_s64_f64_x_1(<vscale x 2 x i1> %pg, <vscale
 ; CHECK-2p2-NEXT:    fcvtzs z0.d, p0/z, z0.d
 ; CHECK-2p2-NEXT:    ret
 entry:
-  %0 = tail call <vscale x 2 x i64> @llvm.aarch64.sve.fcvtzs.nxv2i64.nxv2f64(<vscale x 2 x i64> undef, <vscale x 2 x i1> %pg, <vscale x 2 x double> %x)
+  %0 = tail call <vscale x 2 x i64> @llvm.aarch64.sve.fcvtzs.nxv2i64.nxv2f64(<vscale x 2 x i64> poison, <vscale x 2 x i1> %pg, <vscale x 2 x double> %x)
   ret <vscale x 2 x i64> %0
 }
 
@@ -591,7 +591,7 @@ define <vscale x 2 x i64> @test_svcvt_s64_f64_x_2(<vscale x 2 x i1> %pg, double
 ; CHECK-2p2-NEXT:    fcvtzs z0.d, p0/z, z1.d
 ; CHECK-2p2-NEXT:    ret
 entry:
-  %0 = tail call <vscale x 2 x i64> @llvm.aarch64.sve.fcvtzs.nxv2i64.nxv2f64(<vscale x 2 x i64> undef, <vscale x 2 x i1> %pg, <vscale x 2 x double> %x)
+  %0 = tail call <vscale x 2 x i64> @llvm.aarch64.sve.fcvtzs.nxv2i64.nxv2f64(<vscale x 2 x i64> poison, <vscale x 2 x i1> %pg, <vscale x 2 x double> %x)
   ret <vscale x 2 x i64> %0
 }
 
@@ -622,7 +622,7 @@ define <vscale x 2 x i64> @test_svcvt_u64_f64_x_1(<vscale x 2 x i1> %pg, <vscale
 ; CHECK-2p2-NEXT:    fcvtzu z0.d, p0/z, z0.d
 ; CHECK-2p2-NEXT:    ret
 entry:
-  %0 = tail call <vscale x 2 x i64> @llvm.aarch64.sve.fcvtzu.nxv2i64.nxv2f64(<vscale x 2 x i64> undef, <vscale x 2 x i1> %pg, <vscale x 2 x double> %x)
+  %0 = tail call <vscale x 2 x i64> @llvm.aarch64.sve.fcvtzu.nxv2i64.nxv2f64(<vscale x 2 x i64> poison, <vscale x 2 x i1> %pg, <vscale x 2 x double> %x)
   ret <vscale x 2 x i64> %0
 }
 
@@ -638,7 +638,7 @@ define <vscale x 2 x i64> @test_svcvt_u64_f64_x_2(<vscale x 2 x i1> %pg, double
 ; CHECK-2p2-NEXT:    fcvtzu z0.d, p0/z, z1.d
 ; CHECK-2p2-NEXT:    ret
 entry:
-  %0 = tail call <vscale x 2 x i64> @llvm.aarch64.sve.fcvtzu.nxv2i64.nxv2f64(<vscale x 2 x i64> undef, <vscale x 2 x i1> %pg, <vscale x 2 x double> %x)
+  %0 = tail call <vscale x 2 x i64> @llvm.aarch64.sve.fcvtzu.nxv2i64.nxv2f64(<vscale x 2 x i64> poison, <vscale x 2 x i1> %pg, <vscale x 2 x double> %x)
   ret <vscale x 2 x i64> %0
 }
 
@@ -657,3 +657,527 @@ entry:
   %0 = tail call <vscale x 2 x i64> @llvm.aarch64.sve.fcvtzu.nxv2i64.nxv2f64(<vscale x 2 x i64> zeroinitializer, <vscale x 2 x i1> %pg, <vscale x 2 x double> %x)
   ret <vscale x 2 x i64> %0
 }
+
+define <vscale x 4 x i32> @test_fcvtzs_i32_f64_ptrue_u(double %z0, <vscale x 2 x double> %x) {
+; CHECK-LABEL: test_fcvtzs_i32_f64_ptrue_u:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    fcvtzs z0.s, p0/m, z1.d
+; CHECK-NEXT:    ret
+;
+; CHECK-2p2-LABEL: test_fcvtzs_i32_f64_ptrue_u:
+; CHECK-2p2:       // %bb.0: // %entry
+; CHECK-2p2-NEXT:    ptrue p0.d
+; CHECK-2p2-NEXT:    fcvtzs z0.s, p0/z, z1.d
+; CHECK-2p2-NEXT:    ret
+entry:
+  %pg = call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+  %0 = tail call <vscale x 4 x i32> @llvm.aarch64.sve.fcvtzs.i32f64(<vscale x 4 x i32> poison, <vscale x 2 x i1> %pg, <vscale x 2 x double> %x)
+  ret <vscale x 4 x i32> %0
+}
+
+define <vscale x 4 x i32> @test_fcvtzs_i32_f64_ptrue(double %z0, <vscale x 4 x i32> %x, <vscale x 2 x double> %y) {
+; CHECK-LABEL: test_fcvtzs_i32_f64_ptrue:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mov z0.d, z1.d
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    fcvtzs z0.s, p0/m, z2.d
+; CHECK-NEXT:    ret
+;
+; CHECK-2p2-LABEL: test_fcvtzs_i32_f64_ptrue:
+; CHECK-2p2:       // %bb.0: // %entry
+; CHECK-2p2-NEXT:    ptrue p0.d
+; CHECK-2p2-NEXT:    fcvtzs z0.s, p0/z, z2.d
+; CHECK-2p2-NEXT:    ret
+entry:
+  %pg = call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+  %0 = tail call <vscale x 4 x i32> @llvm.aarch64.sve.fcvtzs.i32f64(<vscale x 4 x i32> %x, <vscale x 2 x i1> %pg, <vscale x 2 x double> %y)
+  ret <vscale x 4 x i32> %0
+}
+
+define <vscale x 4 x i32> @test_fcvtzu_i32_f64_ptrue_u(double %z0, <vscale x 2 x double> %x) {
+; CHECK-LABEL: test_fcvtzu_i32_f64_ptrue_u:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    fcvtzu z0.s, p0/m, z1.d
+; CHECK-NEXT:    ret
+;
+; CHECK-2p2-LABEL: test_fcvtzu_i32_f64_ptrue_u:
+; CHECK-2p2:       // %bb.0: // %entry
+; CHECK-2p2-NEXT:    ptrue p0.d
+; CHECK-2p2-NEXT:    fcvtzu z0.s, p0/z, z1.d
+; CHECK-2p2-NEXT:    ret
+entry:
+  %pg = call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+  %0 = tail call <vscale x 4 x i32> @llvm.aarch64.sve.fcvtzu.i32f64(<vscale x 4 x i32> poison, <vscale x 2 x i1> %pg, <vscale x 2 x double> %x)
+  ret <vscale x 4 x i32> %0
+}
+
+define <vscale x 4 x i32> @test_fcvtzu_i32_f64_ptrue(double %z0, <vscale x 4 x i32> %x, <vscale x 2 x double> %y) {
+; CHECK-LABEL: test_fcvtzu_i32_f64_ptrue:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mov z0.d, z1.d
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    fcvtzu z0.s, p0/m, z2.d
+; CHECK-NEXT:    ret
+;
+; CHECK-2p2-LABEL: test_fcvtzu_i32_f64_ptrue:
+; CHECK-2p2:       // %bb.0: // %entry
+; CHECK-2p2-NEXT:    ptrue p0.d
+; CHECK-2p2-NEXT:    fcvtzu z0.s, p0/z, z2.d
+; CHECK-2p2-NEXT:    ret
+entry:
+  %pg = call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+  %0 = tail call <vscale x 4 x i32> @llvm.aarch64.sve.fcvtzu.i32f64(<vscale x 4 x i32> %x, <vscale x 2 x i1> %pg, <vscale x 2 x double> %y)
+  ret <vscale x 4 x i32> %0
+}
+
+define <vscale x 2 x i64> @test_fcvtzs_i64_f32_ptrue_u(double %z0, <vscale x 4 x float> %x) {
+; CHECK-LABEL: test_fcvtzs_i64_f32_ptrue_u:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    fcvtzs z0.d, p0/m, z1.s
+; CHECK-NEXT:    ret
+;
+; CHECK-2p2-LABEL: test_fcvtzs_i64_f32_ptrue_u:
+; CHECK-2p2:       // %bb.0: // %entry
+; CHECK-2p2-NEXT:    ptrue p0.d
+; CHECK-2p2-NEXT:    fcvtzs z0.d, p0/z, z1.s
+; CHECK-2p2-NEXT:    ret
+entry:
+  %pg = call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+  %0 = tail call <vscale x 2 x i64> @llvm.aarch64.sve.fcvtzs.i64f32(<vscale x 2 x i64> poison, <vscale x 2 x i1> %pg, <vscale x 4 x float> %x)
+  ret <vscale x 2 x i64> %0
+}
+
+define <vscale x 2 x i64> @test_fcvtzs_i64_f32_ptrue(double %z0, <vscale x 2 x i64> %x, <vscale x 4 x float> %y) {
+; CHECK-LABEL: test_fcvtzs_i64_f32_ptrue:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mov z0.d, z1.d
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    fcvtzs z0.d, p0/m, z2.s
+; CHECK-NEXT:    ret
+;
+; CHECK-2p2-LABEL: test_fcvtzs_i64_f32_ptrue:
+; CHECK-2p2:       // %bb.0: // %entry
+; CHECK-2p2-NEXT:    ptrue p0.d
+; CHECK-2p2-NEXT:    fcvtzs z0.d, p0/z, z2.s
+; CHECK-2p2-NEXT:    ret
+entry:
+  %pg = call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+  %0 = tail call <vscale x 2 x i64> @llvm.aarch64.sve.fcvtzs.i64f32(<vscale x 2 x i64> %x, <vscale x 2 x i1> %pg, <vscale x 4 x float> %y)
+  ret <vscale x 2 x i64> %0
+}
+
+define <vscale x 2 x i64> @test_fcvtzu_i64_f32_ptrue_u(double %z0, <vscale x 4 x float> %x) {
+; CHECK-LABEL: test_fcvtzu_i64_f32_ptrue_u:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    fcvtzu z0.d, p0/m, z1.s
+; CHECK-NEXT:    ret
+;
+; CHECK-2p2-LABEL: test_fcvtzu_i64_f32_ptrue_u:
+; CHECK-2p2:       // %bb.0: // %entry
+; CHECK-2p2-NEXT:    ptrue p0.d
+; CHECK-2p2-NEXT:    fcvtzu z0.d, p0/z, z1.s
+; CHECK-2p2-NEXT:    ret
+entry:
+  %pg = call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+  %0 = tail call <vscale x 2 x i64> @llvm.aarch64.sve.fcvtzu.i64f32(<vscale x 2 x i64> poison, <vscale x 2 x i1> %pg, <vscale x 4 x float> %x)
+  ret <vscale x 2 x i64> %0
+}
+
+define <vscale x 2 x i64> @test_fcvtzu_i64_f32_ptrue(double %z0, <vscale x 2 x i64> %x, <vscale x 4 x float> %y) {
+; CHECK-LABEL: test_fcvtzu_i64_f32_ptrue:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mov z0.d, z1.d
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    fcvtzu z0.d, p0/m, z2.s
+; CHECK-NEXT:    ret
+;
+; CHECK-2p2-LABEL: test_fcvtzu_i64_f32_ptrue:
+; CHECK-2p2:       // %bb.0: // %entry
+; CHECK-2p2-NEXT:    ptrue p0.d
+; CHECK-2p2-NEXT:    fcvtzu z0.d, p0/z, z2.s
+; CHECK-2p2-NEXT:    ret
+entry:
+  %pg = call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+  %0 = tail call <vscale x 2 x i64> @llvm.aarch64.sve.fcvtzu.i64f32(<vscale x 2 x i64> %x, <vscale x 2 x i1> %pg, <vscale x 4 x float> %y)
+  ret <vscale x 2 x i64> %0
+}
+
+define <vscale x 4 x i32> @test_fcvtzs_i32_f16_ptrue_u(double %z0, <vscale x 8 x half> %x) {
+; CHECK-LABEL: test_fcvtzs_i32_f16_ptrue_u:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    fcvtzs z0.s, p0/m, z1.h
+; CHECK-NEXT:    ret
+;
+; CHECK-2p2-LABEL: test_fcvtzs_i32_f16_ptrue_u:
+; CHECK-2p2:       // %bb.0: // %entry
+; CHECK-2p2-NEXT:    ptrue p0.s
+; CHECK-2p2-NEXT:    fcvtzs z0.s, p0/z, z1.h
+; CHECK-2p2-NEXT:    ret
+entry:
+  %pg = call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
+  %0 = tail call <vscale x 4 x i32> @llvm.aarch64.sve.fcvtzs.i32f16(<vscale x 4 x i32> poison, <vscale x 4 x i1> %pg, <vscale x 8 x half> %x)
+  ret <vscale x 4 x i32> %0
+}
+
+define <vscale x 4 x i32> @test_fcvtzs_i32_f16_ptrue(double %z0, <vscale x 4 x i32> %x, <vscale x 8 x half> %y) {
+; CHECK-LABEL: test_fcvtzs_i32_f16_ptrue:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mov z0.d, z1.d
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    fcvtzs z0.s, p0/m, z2.h
+; CHECK-NEXT:    ret
+;
+; CHECK-2p2-LABEL: test_fcvtzs_i32_f16_ptrue:
+; CHECK-2p2:       // %bb.0: // %entry
+; CHECK-2p2-NEXT:    ptrue p0.s
+; CHECK-2p2-NEXT:    fcvtzs z0.s, p0/z, z2.h
+; CHECK-2p2-NEXT:    ret
+entry:
+  %pg = call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
+  %0 = tail call <vscale x 4 x i32> @llvm.aarch64.sve.fcvtzs.i32f16(<vscale x 4 x i32> %x, <vscale x 4 x i1> %pg, <vscale x 8 x half> %y)
+  ret <vscale x 4 x i32> %0
+}
+
+define <vscale x 4 x i32> @test_fcvtzu_i32_f16_ptrue_u(double %z0, <vscale x 8 x half> %x) {
+; CHECK-LABEL: test_fcvtzu_i32_f16_ptrue_u:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    fcvtzu z0.s, p0/m, z1.h
+; CHECK-NEXT:    ret
+;
+; CHECK-2p2-LABEL: test_fcvtzu_i32_f16_ptrue_u:
+; CHECK-2p2:       // %bb.0: // %entry
+; CHECK-2p2-NEXT:    ptrue p0.s
+; CHECK-2p2-NEXT:    fcvtzu z0.s, p0/z, z1.h
+; CHECK-2p2-NEXT:    ret
+entry:
+  %pg = call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
+  %0 = tail call <vscale x 4 x i32> @llvm.aarch64.sve.fcvtzu.i32f16(<vscale x 4 x i32> poison, <vscale x 4 x i1> %pg, <vscale x 8 x half> %x)
+  ret <vscale x 4 x i32> %0
+}
+
+define <vscale x 4 x i32> @test_fcvtzu_i32_f16_ptrue(double %z0, <vscale x 4 x i32> %x, <vscale x 8 x half> %y) {
+; CHECK-LABEL: test_fcvtzu_i32_f16_ptrue:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mov z0.d, z1.d
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    fcvtzu z0.s, p0/m, z2.h
+; CHECK-NEXT:    ret
+;
+; CHECK-2p2-LABEL: test_fcvtzu_i32_f16_ptrue:
+; CHECK-2p2:       // %bb.0: // %entry
+; CHECK-2p2-NEXT:    ptrue p0.s
+; CHECK-2p2-NEXT:    fcvtzu z0.s, p0/z, z2.h
+; CHECK-2p2-NEXT:    ret
+entry:
+  %pg = call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
+  %0 = tail call <vscale x 4 x i32> @llvm.aarch64.sve.fcvtzu.i32f16(<vscale x 4 x i32> %x, <vscale x 4 x i1> %pg, <vscale x 8 x half> %y)
+  ret <vscale x 4 x i32> %0
+}
+
+define <vscale x 2 x i64> @test_fcvtzs_i64_f16_ptrue_u(double %z0, <vscale x 8 x half> %x) {
+; CHECK-LABEL: test_fcvtzs_i64_f16_ptrue_u:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    fcvtzs z0.d, p0/m, z1.h
+; CHECK-NEXT:    ret
+;
+; CHECK-2p2-LABEL: test_fcvtzs_i64_f16_ptrue_u:
+; CHECK-2p2:       // %bb.0: // %entry
+; CHECK-2p2-NEXT:    ptrue p0.d
+; CHECK-2p2-NEXT:    fcvtzs z0.d, p0/z, z1.h
+; CHECK-2p2-NEXT:    ret
+entry:
+  %pg = call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+  %0 = tail call <vscale x 2 x i64> @llvm.aarch64.sve.fcvtzs.i64f16(<vscale x 2 x i64> poison, <vscale x 2 x i1> %pg, <vscale x 8 x half> %x)
+  ret <vscale x 2 x i64> %0
+}
+
+define <vscale x 2 x i64> @test_fcvtzs_i64_f16_ptrue(double %z0, <vscale x 2 x i64> %x, <vscale x 8 x half> %y) {
+; CHECK-LABEL: test_fcvtzs_i64_f16_ptrue:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mov z0.d, z1.d
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    fcvtzs z0.d, p0/m, z2.h
+; CHECK-NEXT:    ret
+;
+; CHECK-2p2-LABEL: test_fcvtzs_i64_f16_ptrue:
+; CHECK-2p2:       // %bb.0: // %entry
+; CHECK-2p2-NEXT:    ptrue p0.d
+; CHECK-2p2-NEXT:    fcvtzs z0.d, p0/z, z2.h
+; CHECK-2p2-NEXT:    ret
+entry:
+  %pg = call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+  %0 = tail call <vscale x 2 x i64> @llvm.aarch64.sve.fcvtzs.i64f16(<vscale x 2 x i64> %x, <vscale x 2 x i1> %pg, <vscale x 8 x half> %y)
+  ret <vscale x 2 x i64> %0
+}
+
+define <vscale x 2 x i64> @test_fcvtzu_i64_f16_ptrue_u(double %z0, <vscale x 8 x half> %x) {
+; CHECK-LABEL: test_fcvtzu_i64_f16_ptrue_u:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    fcvtzu z0.d, p0/m, z1.h
+; CHECK-NEXT:    ret
+;
+; CHECK-2p2-LABEL: test_fcvtzu_i64_f16_ptrue_u:
+; CHECK-2p2:       // %bb.0: // %entry
+; CHECK-2p2-NEXT:    ptrue p0.d
+; CHECK-2p2-NEXT:    fcvtzu z0.d, p0/z, z1.h
+; CHECK-2p2-NEXT:    ret
+entry:
+  %pg = call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+  %0 = tail call <vscale x 2 x i64> @llvm.aarch64.sve.fcvtzu.i64f16(<vscale x 2 x i64> poison, <vscale x 2 x i1> %pg, <vscale x 8 x half> %x)
+  ret <vscale x 2 x i64> %0
+}
+
+define <vscale x 2 x i64> @test_fcvtzu_i64_f16_ptrue(double %z0, <vscale x 2 x i64> %x, <vscale x 8 x half> %y) {
+; CHECK-LABEL: test_fcvtzu_i64_f16_ptrue:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mov z0.d, z1.d
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    fcvtzu z0.d, p0/m, z2.h
+; CHECK-NEXT:    ret
+;
+; CHECK-2p2-LABEL: test_fcvtzu_i64_f16_ptrue:
+; CHECK-2p2:       // %bb.0: // %entry
+; CHECK-2p2-NEXT:    ptrue p0.d
+; CHECK-2p2-NEXT:    fcvtzu z0.d, p0/z, z2.h
+; CHECK-2p2-NEXT:    ret
+entry:
+  %pg = call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+  %0 = tail call <vscale x 2 x i64> @llvm.aarch64.sve.fcvtzu.i64f16(<vscale x 2 x i64> %x, <vscale x 2 x i1> %pg, <vscale x 8 x half> %y)
+  ret <vscale x 2 x i64> %0
+}
+
+define <vscale x 8 x i16> @test_fcvtzs_i16_f16_ptrue_u(double %z0, <vscale x 8 x half> %x) {
+; CHECK-LABEL: test_fcvtzs_i16_f16_ptrue_u:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    movprfx z0, z1
+; CHECK-NEXT:    fcvtzs z0.h, p0/m, z1.h
+; CHECK-NEXT:    ret
+;
+; CHECK-2p2-LABEL: test_fcvtzs_i16_f16_ptrue_u:
+; CHECK-2p2:       // %bb.0: // %entry
+; CHECK-2p2-NEXT:    ptrue p0.h
+; CHECK-2p2-NEXT:    fcvtzs z0.h, p0/z, z1.h
+; CHECK-2p2-NEXT:    ret
+entry:
+  %pg = call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
+  %0 = tail call <vscale x 8 x i16> @llvm.aarch64.sve.fcvtzs.nxv8i16.nxv8f16(<vscale x 8 x i16> poison, <vscale x 8 x i1> %pg, <vscale x 8 x half> %x)
+  ret <vscale x 8 x i16> %0
+}
+
+define <vscale x 8 x i16> @test_fcvtzs_i16_f16_ptrue(double %z0, <vscale x 8 x i16> %x, <vscale x 8 x half> %y) {
+; CHECK-LABEL: test_fcvtzs_i16_f16_ptrue:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    movprfx z0, z2
+; CHECK-NEXT:    fcvtzs z0.h, p0/m, z2.h
+; CHECK-NEXT:    ret
+;
+; CHECK-2p2-LABEL: test_fcvtzs_i16_f16_ptrue:
+; CHECK-2p2:       // %bb.0: // %entry
+; CHECK-2p2-NEXT:    ptrue p0.h
+; CHECK-2p2-NEXT:    fcvtzs z0.h, p0/z, z2.h
+; CHECK-2p2-NEXT:    ret
+entry:
+  %pg = call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
+  %0 = tail call <vscale x 8 x i16> @llvm.aarch64.sve.fcvtzs.nxv8i16.nxv8f16(<vscale x 8 x i16> %x, <vscale x 8 x i1> %pg, <vscale x 8 x half> %y)
+  ret <vscale x 8 x i16> %0
+}
+
+define <vscale x 8 x i16> @test_fcvtzu_i16_f16_ptrue_u(double %z0, <vscale x 8 x half> %x) {
+; CHECK-LABEL: test_fcvtzu_i16_f16_ptrue_u:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    movprfx z0, z1
+; CHECK-NEXT:    fcvtzu z0.h, p0/m, z1.h
+; CHECK-NEXT:    ret
+;
+; CHECK-2p2-LABEL: test_fcvtzu_i16_f16_ptrue_u:
+; CHECK-2p2:       // %bb.0: // %entry
+; CHECK-2p2-NEXT:    ptrue p0.h
+; CHECK-2p2-NEXT:    fcvtzu z0.h, p0/z, z1.h
+; CHECK-2p2-NEXT:    ret
+entry:
+  %pg = call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
+  %0 = tail call <vscale x 8 x i16> @llvm.aarch64.sve.fcvtzu.nxv8i16.nxv8f16(<vscale x 8 x i16> poison, <vscale x 8 x i1> %pg, <vscale x 8 x half> %x)
+  ret <vscale x 8 x i16> %0
+}
+
+define <vscale x 8 x i16> @test_fcvtzu_i16_f16_ptrue(double %z0, <vscale x 8 x i16> %x, <vscale x 8 x half> %y) {
+; CHECK-LABEL: test_fcvtzu_i16_f16_ptrue:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    movprfx z0, z2
+; CHECK-NEXT:    fcvtzu z0.h, p0/m, z2.h
+; CHECK-NEXT:    ret
+;
+; CHECK-2p2-LABEL: test_fcvtzu_i16_f16_ptrue:
+; CHECK-2p2:       // %bb.0: // %entry
+; CHECK-2p2-NEXT:    ptrue p0.h
+; CHECK-2p2-NEXT:    fcvtzu z0.h, p0/z, z2.h
+; CHECK-2p2-NEXT:    ret
+entry:
+  %pg = call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
+  %0 = tail call <vscale x 8 x i16> @llvm.aarch64.sve.fcvtzu.nxv8i16.nxv8f16(<vscale x 8 x i16> %x, <vscale x 8 x i1> %pg, <vscale x 8 x half> %y)
+  ret <vscale x 8 x i16> %0
+}
+
+define <vscale x 4 x i32> @test_fcvtzs_i32_f32_ptrue_u(double %z0, <vscale x 4 x float> %x) {
+; CHECK-LABEL: test_fcvtzs_i32_f32_ptrue_u:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    movprfx z0, z1
+; CHECK-NEXT:    fcvtzs z0.s, p0/m, z1.s
+; CHECK-NEXT:    ret
+;
+; CHECK-2p2-LABEL: test_fcvtzs_i32_f32_ptrue_u:
+; CHECK-2p2:       // %bb.0: // %entry
+; CHECK-2p2-NEXT:    ptrue p0.s
+; CHECK-2p2-NEXT:    fcvtzs z0.s, p0/z, z1.s
+; CHECK-2p2-NEXT:    ret
+entry:
+  %pg = call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
+  %0 = tail call <vscale x 4 x i32> @llvm.aarch64.sve.fcvtzs.nxv4i32.nxv4f32(<vscale x 4 x i32> poison, <vscale x 4 x i1> %pg, <vscale x 4 x float> %x)
+  ret <vscale x 4 x i32> %0
+}
+
+define <vscale x 4 x i32> @test_fcvtzs_i32_f32_ptrue(double %z0, <vscale x 4 x i32> %x, <vscale x 4 x float> %y) {
+; CHECK-LABEL: test_fcvtzs_i32_f32_ptrue:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    movprfx z0, z2
+; CHECK-NEXT:    fcvtzs z0.s, p0/m, z2.s
+; CHECK-NEXT:    ret
+;
+; CHECK-2p2-LABEL: test_fcvtzs_i32_f32_ptrue:
+; CHECK-2p2:       // %bb.0: // %entry
+; CHECK-2p2-NEXT:    ptrue p0.s
+; CHECK-2p2-NEXT:    fcvtzs z0.s, p0/z, z2.s
+; CHECK-2p2-NEXT:    ret
+entry:
+  %pg = call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
+  %0 = tail call <vscale x 4 x i32> @llvm.aarch64.sve.fcvtzs.nxv4i32.nxv4f32(<vscale x 4 x i32> %x, <vscale x 4 x i1> %pg, <vscale x 4 x float> %y)
+  ret <vscale x 4 x i32> %0
+}
+
+define <vscale x 4 x i32> @test_fcvtzu_i32_f32_ptrue_u(double %z0, <vscale x 4 x float> %x) {
+; CHECK-LABEL: test_fcvtzu_i32_f32_ptrue_u:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    movprfx z0, z1
+; CHECK-NEXT:    fcvtzu z0.s, p0/m, z1.s
+; CHECK-NEXT:    ret
+;
+; CHECK-2p2-LABEL: test_fcvtzu_i32_f32_ptrue_u:
+; CHECK-2p2:       // %bb.0: // %entry
+; CHECK-2p2-NEXT:    ptrue p0.s
+; CHECK-2p2-NEXT:    fcvtzu z0.s, p0/z, z1.s
+; CHECK-2p2-NEXT:    ret
+entry:
+  %pg = call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
+  %0 = tail call <vscale x 4 x i32> @llvm.aarch64.sve.fcvtzu.nxv4i32.nxv4f32(<vscale x 4 x i32> poison, <vscale x 4 x i1> %pg, <vscale x 4 x float> %x)
+  ret <vscale x 4 x i32> %0
+}
+
+define <vscale x 4 x i32> @test_fcvtzu_i32_f32_ptrue(double %z0, <vscale x 4 x i32> %x, <vscale x 4 x float> %y) {
+; CHECK-LABEL: test_fcvtzu_i32_f32_ptrue:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    movprfx z0, z2
+; CHECK-NEXT:    fcvtzu z0.s, p0/m, z2.s
+; CHECK-NEXT:    ret
+;
+; CHECK-2p2-LABEL: test_fcvtzu_i32_f32_ptrue:
+; CHECK-2p2:       // %bb.0: // %entry
+; CHECK-2p2-NEXT:    ptrue p0.s
+; CHECK-2p2-NEXT:    fcvtzu z0.s, p0/z, z2.s
+; CHECK-2p2-NEXT:    ret
+entry:
+  %pg = call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
+  %0 = tail call <vscale x 4 x i32> @llvm.aarch64.sve.fcvtzu.nxv4i32.nxv4f32(<vscale x 4 x i32> %x, <vscale x 4 x i1> %pg, <vscale x 4 x float> %y)
+  ret <vscale x 4 x i32> %0
+}
+
+define <vscale x 2 x i64> @test_fcvtzs_i64_f64_ptrue_u(double %z0, <vscale x 2 x double> %x) {
+; CHECK-LABEL: test_fcvtzs_i64_f64_ptrue_u:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    movprfx z0, z1
+; CHECK-NEXT:    fcvtzs z0.d, p0/m, z1.d
+; CHECK-NEXT:    ret
+;
+; CHECK-2p2-LABEL: test_fcvtzs_i64_f64_ptrue_u:
+; CHECK-2p2:       // %bb.0: // %entry
+; CHECK-2p2-NEXT:    ptrue p0.d
+; CHECK-2p2-NEXT:    fcvtzs z0.d, p0/z, z1.d
+; CHECK-2p2-NEXT:    ret
+entry:
+  %pg = call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+  %0 = tail call <vscale x 2 x i64> @llvm.aarch64.sve.fcvtzs.nxv2i64.nxv2f64(<vscale x 2 x i64> poison, <vscale x 2 x i1> %pg, <vscale x 2 x double> %x)
+  ret <vscale x 2 x i64> %0
+}
+
+define <vscale x 2 x i64> @test_fcvtzs_i64_f64_ptrue(double %z0, <vscale x 2 x i64> %x, <vscale x 2 x double> %y) {
+; CHECK-LABEL: test_fcvtzs_i64_f64_ptrue:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    movprfx z0, z2
+; CHECK-NEXT:    fcvtzs z0.d, p0/m, z2.d
+; CHECK-NEXT:    ret
+;
+; CHECK-2p2-LABEL: test_fcvtzs_i64_f64_ptrue:
+; CHECK-2p2:       // %bb.0: // %entry
+; CHECK-2p2-NEXT:    ptrue p0.d
+; CHECK-2p2-NEXT:    fcvtzs z0.d, p0/z, z2.d
+; CHECK-2p2-NEXT:    ret
+entry:
+  %pg = call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+  %0 = tail call <vscale x 2 x i64> @llvm.aarch64.sve.fcvtzs.nxv2i64.nxv2f64(<vscale x 2 x i64> %x, <vscale x 2 x i1> %pg, <vscale x 2 x double> %y)
+  ret <vscale x 2 x i64> %0
+}
+
+define <vscale x 2 x i64> @test_fcvtzu_i64_f64_ptrue_u(double %z0, <vscale x 2 x double> %x) {
+; CHECK-LABEL: test_fcvtzu_i64_f64_ptrue_u:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    movprfx z0, z1
+; CHECK-NEXT:    fcvtzu z0.d, p0/m, z1.d
+; CHECK-NEXT:    ret
+;
+; CHECK-2p2-LABEL: test_fcvtzu_i64_f64_ptrue_u:
+; CHECK-2p2:       // %bb.0: // %entry
+; CHECK-2p2-NEXT:    ptrue p0.d
+; CHECK-2p2-NEXT:    fcvtzu z0.d, p0/z, z1.d
+; CHECK-2p2-NEXT:    ret
+entry:
+  %pg = call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+  %0 = tail call <vscale x 2 x i64> @llvm.aarch64.sve.fcvtzu.nxv2i64.nxv2f64(<vscale x 2 x i64> poison, <vscale x 2 x i1> %pg, <vscale x 2 x double> %x)
+  ret <vscale x 2 x i64> %0
+}
+
+define <vscale x 2 x i64> @test_fcvtzu_i64_f64_ptrue(double %z0, <vscale x 2 x i64> %x, <vscale x 2 x double> %y) {
+; CHECK-LABEL: test_fcvtzu_i64_f64_ptrue:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    movprfx z0, z2
+; CHECK-NEXT:    fcvtzu z0.d, p0/m, z2.d
+; CHECK-NEXT:    ret
+;
+; CHECK-2p2-LABEL: test_fcvtzu_i64_f64_ptrue:
+; CHECK-2p2:       // %bb.0: // %entry
+; CHECK-2p2-NEXT:    ptrue p0.d
+; CHECK-2p2-NEXT:    fcvtzu z0.d, p0/z, z2.d
+; CHECK-2p2-NEXT:    ret
+entry:
+  %pg = call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+  %0 = tail call <vscale x 2 x i64> @llvm.aarch64.sve.fcvtzu.nxv2i64.nxv2f64(<vscale x 2 x i64> %x, <vscale x 2 x i1> %pg, <vscale x 2 x double> %y)
+  ret <vscale x 2 x i64> %0
+}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/dynamic-alloca-divergent.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/dynamic-alloca-divergent.ll
deleted file mode 100644
index aefcad4..0000000
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/dynamic-alloca-divergent.ll
+++ /dev/null
@@ -1,72 +0,0 @@
-; RUN: not llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -global-isel-abort=2 -pass-remarks-missed="gisel.*" -verify-machineinstrs=0 -o /dev/null 2>&1 %s | FileCheck -check-prefix=ERR %s
-
-; ERR: remark: <unknown>:0:0: cannot select: %{{[0-9]+}}:sreg_32(p5) = G_DYN_STACKALLOC %{{[0-9]+}}:vgpr(s32), 1 (in function: kernel_dynamic_stackalloc_vgpr_align4)
-; ERR-NEXT: warning: Instruction selection used fallback path for kernel_dynamic_stackalloc_vgpr_align4
-; ERR-NEXT: error: <unknown>:0:0: in function kernel_dynamic_stackalloc_vgpr_align4 void (ptr addrspace(1)): unsupported dynamic alloca
-
-define amdgpu_kernel void @kernel_dynamic_stackalloc_vgpr_align4(ptr addrspace(1) %ptr) {
-  %id = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep = getelementptr i32, ptr addrspace(1) %ptr, i32 %id
-  %n = load i32, ptr addrspace(1) %gep
-  %alloca = alloca i32, i32 %n, align 4, addrspace(5)
-  store volatile i32 123, ptr addrspace(5) %alloca
-  ret void
-}
-
-; ERR: remark: <unknown>:0:0: cannot select: %{{[0-9]+}}:sreg_32(p5) = G_DYN_STACKALLOC %{{[0-9]+}}:vgpr(s32), 1 (in function: kernel_dynamic_stackalloc_vgpr_default_align)
-; ERR-NEXT: warning: Instruction selection used fallback path for kernel_dynamic_stackalloc_vgpr_default_align
-; ERR-NEXT: error: <unknown>:0:0: in function kernel_dynamic_stackalloc_vgpr_default_align void (ptr addrspace(1)): unsupported dynamic alloca
-
-define amdgpu_kernel void @kernel_dynamic_stackalloc_vgpr_default_align(ptr addrspace(1) %ptr) {
-  %id = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep = getelementptr i32, ptr addrspace(1) %ptr, i32 %id
-  %n = load i32, ptr addrspace(1) %gep
-  %alloca = alloca i32, i32 %n, addrspace(5)
-  store volatile i32 123, ptr addrspace(5) %alloca
-  ret void
-}
-; ERR: remark: <unknown>:0:0: cannot select: %{{[0-9]+}}:sreg_32(p5) = G_DYN_STACKALLOC %{{[0-9]+}}:vgpr(s32), 64 (in function: kernel_dynamic_stackalloc_vgpr_align64)
-; ERR-NEXT: warning: Instruction selection used fallback path for kernel_dynamic_stackalloc_vgpr_align64
-; ERR-NEXT: error: <unknown>:0:0: in function kernel_dynamic_stackalloc_vgpr_align64 void (ptr addrspace(1)): unsupported dynamic alloca
-
-define amdgpu_kernel void @kernel_dynamic_stackalloc_vgpr_align64(ptr addrspace(1) %ptr) {
-  %id = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep = getelementptr i32, ptr addrspace(1) %ptr, i32 %id
-  %n = load i32, ptr addrspace(1) %gep
-  %alloca = alloca i32, i32 %n, align 64, addrspace(5)
-  store volatile i32 123, ptr addrspace(5) %alloca
-  ret void
-}
-
-; ERR: remark: <unknown>:0:0: cannot select: %{{[0-9]+}}:sreg_32(p5) = G_DYN_STACKALLOC %{{[0-9]+}}:vgpr(s32), 1 (in function: func_dynamic_stackalloc_vgpr_align4)
-; ERR-NEXT: warning: Instruction selection used fallback path for func_dynamic_stackalloc_vgpr_align4
-; ERR-NEXT: error: <unknown>:0:0: in function func_dynamic_stackalloc_vgpr_align4 void (i32): unsupported dynamic alloca
-
-define void @func_dynamic_stackalloc_vgpr_align4(i32 %n) {
-  %alloca = alloca i32, i32 %n, align 4, addrspace(5)
-  store volatile i32 456, ptr addrspace(5) %alloca
-  ret void
-}
-
-; ERR: remark: <unknown>:0:0: cannot select: %{{[0-9]+}}:sreg_32(p5) = G_DYN_STACKALLOC %{{[0-9]+}}:vgpr(s32), 1 (in function: func_dynamic_stackalloc_vgpr_default_align)
-; ERR-NEXT: warning: Instruction selection used fallback path for func_dynamic_stackalloc_vgpr_default_align
-; ERR-NEXT: error: <unknown>:0:0: in function func_dynamic_stackalloc_vgpr_default_align void (i32): unsupported dynamic alloca
-
-define void @func_dynamic_stackalloc_vgpr_default_align(i32 %n) {
-  %alloca = alloca i32, i32 %n, addrspace(5)
-  store volatile i32 456, ptr addrspace(5) %alloca
-  ret void
-}
-; ERR: remark: <unknown>:0:0: cannot select: %{{[0-9]+}}:sreg_32(p5) = G_DYN_STACKALLOC %{{[0-9]+}}:vgpr(s32), 64 (in function: func_dynamic_stackalloc_vgpr_align64)
-; ERR-NEXT: warning: Instruction selection used fallback path for func_dynamic_stackalloc_vgpr_align64
-; ERR-NEXT: error: <unknown>:0:0: in function func_dynamic_stackalloc_vgpr_align64 void (i32): unsupported dynamic alloca
-
-define void @func_dynamic_stackalloc_vgpr_align64(i32 %n) {
-  %alloca = alloca i32, i32 %n, align 64, addrspace(5)
-  store volatile i32 456, ptr addrspace(5) %alloca
-  ret void
-}
-
-declare i32 @llvm.amdgcn.workitem.id.x() #0
-
-attributes #0 = { nounwind readnone speculatable }
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll
index b0f2aac..7cafa2f 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll
@@ -3990,6 +3990,116 @@ bb:
   ret void
 }
 
+define amdgpu_gs void @sgpr_base_plus_sgpr_plus_vgpr_plus_negative_imm_offset(ptr addrspace(5) inreg %sgpr_base, i32 inreg %sidx, i32 %vidx) {
+; GFX9-LABEL: sgpr_base_plus_sgpr_plus_vgpr_plus_negative_imm_offset:
+; GFX9:       ; %bb.0: ; %bb
+; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s0, s5
+; GFX9-NEXT:    v_add_u32_e32 v0, s3, v0
+; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s1, 0
+; GFX9-NEXT:    v_add3_u32 v0, s2, v0, -16
+; GFX9-NEXT:    v_mov_b32_e32 v1, 15
+; GFX9-NEXT:    scratch_store_dword v0, v1, off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_endpgm
+;
+; GFX10-LABEL: sgpr_base_plus_sgpr_plus_vgpr_plus_negative_imm_offset:
+; GFX10:       ; %bb.0: ; %bb
+; GFX10-NEXT:    s_add_u32 s0, s0, s5
+; GFX10-NEXT:    s_addc_u32 s1, s1, 0
+; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
+; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
+; GFX10-NEXT:    v_add_nc_u32_e32 v0, s3, v0
+; GFX10-NEXT:    v_mov_b32_e32 v1, 15
+; GFX10-NEXT:    v_add_nc_u32_e32 v0, s2, v0
+; GFX10-NEXT:    scratch_store_dword v0, v1, off offset:-16
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    s_endpgm
+;
+; GFX940-LABEL: sgpr_base_plus_sgpr_plus_vgpr_plus_negative_imm_offset:
+; GFX940:       ; %bb.0: ; %bb
+; GFX940-NEXT:    v_add_u32_e32 v0, s1, v0
+; GFX940-NEXT:    v_add3_u32 v0, s0, v0, -16
+; GFX940-NEXT:    v_mov_b32_e32 v1, 15
+; GFX940-NEXT:    scratch_store_dword v0, v1, off sc0 sc1
+; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    s_endpgm
+;
+; GFX11-LABEL: sgpr_base_plus_sgpr_plus_vgpr_plus_negative_imm_offset:
+; GFX11:       ; %bb.0: ; %bb
+; GFX11-NEXT:    v_dual_mov_b32 v1, 15 :: v_dual_add_nc_u32 v0, s1, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_add_nc_u32_e32 v0, s0, v0
+; GFX11-NEXT:    scratch_store_b32 v0, v1, off offset:-16 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: sgpr_base_plus_sgpr_plus_vgpr_plus_negative_imm_offset:
+; GFX12:       ; %bb.0: ; %bb
+; GFX12-NEXT:    v_dual_mov_b32 v1, 15 :: v_dual_add_nc_u32 v0, s1, v0
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT:    v_add_nc_u32_e32 v0, s0, v0
+; GFX12-NEXT:    scratch_store_b32 v0, v1, off offset:-16 scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_storecnt 0x0
+; GFX12-NEXT:    s_endpgm
+;
+; UNALIGNED_GFX9-LABEL: sgpr_base_plus_sgpr_plus_vgpr_plus_negative_imm_offset:
+; UNALIGNED_GFX9:       ; %bb.0: ; %bb
+; UNALIGNED_GFX9-NEXT:    s_add_u32 flat_scratch_lo, s0, s5
+; UNALIGNED_GFX9-NEXT:    v_add_u32_e32 v0, s3, v0
+; UNALIGNED_GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s1, 0
+; UNALIGNED_GFX9-NEXT:    v_add3_u32 v0, s2, v0, -16
+; UNALIGNED_GFX9-NEXT:    v_mov_b32_e32 v1, 15
+; UNALIGNED_GFX9-NEXT:    scratch_store_dword v0, v1, off
+; UNALIGNED_GFX9-NEXT:    s_waitcnt vmcnt(0)
+; UNALIGNED_GFX9-NEXT:    s_endpgm
+;
+; UNALIGNED_GFX10-LABEL: sgpr_base_plus_sgpr_plus_vgpr_plus_negative_imm_offset:
+; UNALIGNED_GFX10:       ; %bb.0: ; %bb
+; UNALIGNED_GFX10-NEXT:    s_add_u32 s0, s0, s5
+; UNALIGNED_GFX10-NEXT:    s_addc_u32 s1, s1, 0
+; UNALIGNED_GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
+; UNALIGNED_GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
+; UNALIGNED_GFX10-NEXT:    v_add_nc_u32_e32 v0, s3, v0
+; UNALIGNED_GFX10-NEXT:    v_mov_b32_e32 v1, 15
+; UNALIGNED_GFX10-NEXT:    v_add_nc_u32_e32 v0, s2, v0
+; UNALIGNED_GFX10-NEXT:    scratch_store_dword v0, v1, off offset:-16
+; UNALIGNED_GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; UNALIGNED_GFX10-NEXT:    s_endpgm
+;
+; UNALIGNED_GFX940-LABEL: sgpr_base_plus_sgpr_plus_vgpr_plus_negative_imm_offset:
+; UNALIGNED_GFX940:       ; %bb.0: ; %bb
+; UNALIGNED_GFX940-NEXT:    v_add_u32_e32 v0, s1, v0
+; UNALIGNED_GFX940-NEXT:    v_add3_u32 v0, s0, v0, -16
+; UNALIGNED_GFX940-NEXT:    v_mov_b32_e32 v1, 15
+; UNALIGNED_GFX940-NEXT:    scratch_store_dword v0, v1, off sc0 sc1
+; UNALIGNED_GFX940-NEXT:    s_waitcnt vmcnt(0)
+; UNALIGNED_GFX940-NEXT:    s_endpgm
+;
+; UNALIGNED_GFX11-LABEL: sgpr_base_plus_sgpr_plus_vgpr_plus_negative_imm_offset:
+; UNALIGNED_GFX11:       ; %bb.0: ; %bb
+; UNALIGNED_GFX11-NEXT:    v_dual_mov_b32 v1, 15 :: v_dual_add_nc_u32 v0, s1, v0
+; UNALIGNED_GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; UNALIGNED_GFX11-NEXT:    v_add_nc_u32_e32 v0, s0, v0
+; UNALIGNED_GFX11-NEXT:    scratch_store_b32 v0, v1, off offset:-16 dlc
+; UNALIGNED_GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; UNALIGNED_GFX11-NEXT:    s_endpgm
+;
+; UNALIGNED_GFX12-LABEL: sgpr_base_plus_sgpr_plus_vgpr_plus_negative_imm_offset:
+; UNALIGNED_GFX12:       ; %bb.0: ; %bb
+; UNALIGNED_GFX12-NEXT:    v_dual_mov_b32 v1, 15 :: v_dual_add_nc_u32 v0, s1, v0
+; UNALIGNED_GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; UNALIGNED_GFX12-NEXT:    v_add_nc_u32_e32 v0, s0, v0
+; UNALIGNED_GFX12-NEXT:    scratch_store_b32 v0, v1, off offset:-16 scope:SCOPE_SYS
+; UNALIGNED_GFX12-NEXT:    s_wait_storecnt 0x0
+; UNALIGNED_GFX12-NEXT:    s_endpgm
+bb:
+  %add1 = add nsw i32 %sidx, %vidx
+  %add2 = add nsw i32 %add1, -16
+  %gep = getelementptr inbounds [16 x i8], ptr addrspace(5) %sgpr_base, i32 0, i32 %add2
+  store volatile i32 15, ptr addrspace(5) %gep, align 4
+  ret void
+}
+
 define amdgpu_gs void @sgpr_base_negative_offset(ptr addrspace(1) %out, ptr addrspace(5) inreg %scevgep) {
 ; GFX9-LABEL: sgpr_base_negative_offset:
 ; GFX9:       ; %bb.0: ; %entry
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fpow.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fpow.ll
index 0577117..d81faf9 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fpow.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fpow.ll
@@ -10,10 +10,10 @@ define float @v_pow_f32(float %x, float %y) {
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_mov_b32_e32 v2, 0x800000
-; GFX6-NEXT:    v_mov_b32_e32 v3, 0x4f800000
 ; GFX6-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v2
-; GFX6-NEXT:    v_cndmask_b32_e32 v2, 1.0, v3, vcc
-; GFX6-NEXT:    v_mul_f32_e32 v0, v0, v2
+; GFX6-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 5, v2
+; GFX6-NEXT:    v_ldexp_f32_e32 v0, v0, v2
 ; GFX6-NEXT:    v_log_f32_e32 v0, v0
 ; GFX6-NEXT:    v_mov_b32_e32 v2, 0x42000000
 ; GFX6-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
@@ -25,19 +25,19 @@ define float @v_pow_f32(float %x, float %y) {
 ; GFX6-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
 ; GFX6-NEXT:    v_add_f32_e32 v0, v0, v1
 ; GFX6-NEXT:    v_exp_f32_e32 v0, v0
-; GFX6-NEXT:    v_mov_b32_e32 v1, 0x1f800000
-; GFX6-NEXT:    v_cndmask_b32_e32 v1, 1.0, v1, vcc
-; GFX6-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX6-NEXT:    v_not_b32_e32 v1, 63
+; GFX6-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX6-NEXT:    v_ldexp_f32_e32 v0, v0, v1
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_pow_f32:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v2, 0x800000
-; GFX8-NEXT:    v_mov_b32_e32 v3, 0x4f800000
 ; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v2
-; GFX8-NEXT:    v_cndmask_b32_e32 v2, 1.0, v3, vcc
-; GFX8-NEXT:    v_mul_f32_e32 v0, v0, v2
+; GFX8-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 5, v2
+; GFX8-NEXT:    v_ldexp_f32 v0, v0, v2
 ; GFX8-NEXT:    v_log_f32_e32 v0, v0
 ; GFX8-NEXT:    v_mov_b32_e32 v2, 0x42000000
 ; GFX8-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
@@ -49,19 +49,19 @@ define float @v_pow_f32(float %x, float %y) {
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
 ; GFX8-NEXT:    v_add_f32_e32 v0, v0, v1
 ; GFX8-NEXT:    v_exp_f32_e32 v0, v0
-; GFX8-NEXT:    v_mov_b32_e32 v1, 0x1f800000
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, 1.0, v1, vcc
-; GFX8-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX8-NEXT:    v_not_b32_e32 v1, 63
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX8-NEXT:    v_ldexp_f32 v0, v0, v1
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_pow_f32:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0x800000
-; GFX9-NEXT:    v_mov_b32_e32 v3, 0x4f800000
 ; GFX9-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v2
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, 1.0, v3, vcc
-; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v2
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 5, v2
+; GFX9-NEXT:    v_ldexp_f32 v0, v0, v2
 ; GFX9-NEXT:    v_log_f32_e32 v0, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0x42000000
 ; GFX9-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
@@ -73,17 +73,18 @@ define float @v_pow_f32(float %x, float %y) {
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
 ; GFX9-NEXT:    v_add_f32_e32 v0, v0, v1
 ; GFX9-NEXT:    v_exp_f32_e32 v0, v0
-; GFX9-NEXT:    v_mov_b32_e32 v1, 0x1f800000
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, 1.0, v1, vcc
-; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX9-NEXT:    v_not_b32_e32 v1, 63
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX9-NEXT:    v_ldexp_f32 v0, v0, v1
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_pow_f32:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
-; GFX10-NEXT:    v_cndmask_b32_e64 v2, 1.0, 0x4f800000, vcc_lo
-; GFX10-NEXT:    v_mul_f32_e32 v0, v0, v2
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc_lo
+; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 5, v2
+; GFX10-NEXT:    v_ldexp_f32 v0, v0, v2
 ; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, 0x42000000, vcc_lo
 ; GFX10-NEXT:    v_log_f32_e32 v0, v0
 ; GFX10-NEXT:    v_sub_f32_e32 v0, v0, v2
@@ -91,32 +92,34 @@ define float @v_pow_f32(float %x, float %y) {
 ; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0, 0x42800000, vcc_lo
 ; GFX10-NEXT:    v_add_f32_e32 v0, v0, v1
-; GFX10-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x1f800000, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0, 0xffffffc0, vcc_lo
 ; GFX10-NEXT:    v_exp_f32_e32 v0, v0
-; GFX10-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX10-NEXT:    v_ldexp_f32 v0, v0, v1
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_pow_f32:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
-; GFX11-NEXT:    v_cndmask_b32_e64 v2, 1.0, 0x4f800000, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_mul_f32_e32 v0, v0, v2
+; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 5, v2
+; GFX11-NEXT:    v_ldexp_f32 v0, v0, v2
 ; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 0x42000000, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_log_f32_e32 v0, v0
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX11-NEXT:    v_sub_f32_e32 v0, v0, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_mul_dx9_zero_f32_e32 v0, v0, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0
 ; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, 0x42800000, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_add_f32_e32 v0, v0, v1
-; GFX11-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x1f800000, vcc_lo
+; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, 0xffffffc0, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-NEXT:    v_exp_f32_e32 v0, v0
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
-; GFX11-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX11-NEXT:    v_ldexp_f32 v0, v0, v1
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %pow = call float @llvm.pow.f32(float %x, float %y)
   ret float %pow
@@ -127,111 +130,114 @@ define <2 x float> @v_pow_v2f32(<2 x float> %x, <2 x float> %y) {
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_mov_b32_e32 v4, 0x800000
-; GFX6-NEXT:    v_mov_b32_e32 v5, 0x4f800000
 ; GFX6-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v4
-; GFX6-NEXT:    v_cndmask_b32_e32 v6, 1.0, v5, vcc
+; GFX6-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
 ; GFX6-NEXT:    v_cmp_lt_f32_e64 s[4:5], v1, v4
-; GFX6-NEXT:    v_mul_f32_e32 v0, v0, v6
-; GFX6-NEXT:    v_cndmask_b32_e64 v4, 1.0, v5, s[4:5]
+; GFX6-NEXT:    v_lshlrev_b32_e32 v5, 5, v5
+; GFX6-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s[4:5]
+; GFX6-NEXT:    v_ldexp_f32_e32 v0, v0, v5
+; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 5, v4
 ; GFX6-NEXT:    v_log_f32_e32 v0, v0
-; GFX6-NEXT:    v_mul_f32_e32 v1, v1, v4
+; GFX6-NEXT:    v_ldexp_f32_e32 v1, v1, v4
 ; GFX6-NEXT:    v_log_f32_e32 v1, v1
-; GFX6-NEXT:    v_mov_b32_e32 v6, 0x42000000
-; GFX6-NEXT:    v_cndmask_b32_e32 v7, 0, v6, vcc
-; GFX6-NEXT:    v_sub_f32_e32 v0, v0, v7
-; GFX6-NEXT:    v_cndmask_b32_e64 v5, 0, v6, s[4:5]
+; GFX6-NEXT:    v_mov_b32_e32 v5, 0x42000000
+; GFX6-NEXT:    v_cndmask_b32_e32 v6, 0, v5, vcc
+; GFX6-NEXT:    v_sub_f32_e32 v0, v0, v6
+; GFX6-NEXT:    v_cndmask_b32_e64 v5, 0, v5, s[4:5]
 ; GFX6-NEXT:    v_mul_legacy_f32_e32 v0, v0, v2
 ; GFX6-NEXT:    v_mov_b32_e32 v2, 0xc2fc0000
 ; GFX6-NEXT:    v_sub_f32_e32 v1, v1, v5
-; GFX6-NEXT:    v_mov_b32_e32 v7, 0x42800000
+; GFX6-NEXT:    v_mov_b32_e32 v6, 0x42800000
 ; GFX6-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v2
 ; GFX6-NEXT:    v_mul_legacy_f32_e32 v1, v1, v3
-; GFX6-NEXT:    v_cndmask_b32_e32 v8, 0, v7, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v7, 0, v6, vcc
 ; GFX6-NEXT:    v_cmp_lt_f32_e64 s[4:5], v1, v2
-; GFX6-NEXT:    v_add_f32_e32 v0, v0, v8
-; GFX6-NEXT:    v_cndmask_b32_e64 v2, 0, v7, s[4:5]
+; GFX6-NEXT:    v_add_f32_e32 v0, v0, v7
+; GFX6-NEXT:    v_cndmask_b32_e64 v2, 0, v6, s[4:5]
 ; GFX6-NEXT:    v_exp_f32_e32 v0, v0
 ; GFX6-NEXT:    v_add_f32_e32 v1, v1, v2
 ; GFX6-NEXT:    v_exp_f32_e32 v1, v1
-; GFX6-NEXT:    v_mov_b32_e32 v4, 0x1f800000
-; GFX6-NEXT:    v_cndmask_b32_e32 v2, 1.0, v4, vcc
-; GFX6-NEXT:    v_mul_f32_e32 v0, v0, v2
-; GFX6-NEXT:    v_cndmask_b32_e64 v2, 1.0, v4, s[4:5]
-; GFX6-NEXT:    v_mul_f32_e32 v1, v1, v2
+; GFX6-NEXT:    v_not_b32_e32 v4, 63
+; GFX6-NEXT:    v_cndmask_b32_e32 v2, 0, v4, vcc
+; GFX6-NEXT:    v_ldexp_f32_e32 v0, v0, v2
+; GFX6-NEXT:    v_cndmask_b32_e64 v2, 0, v4, s[4:5]
+; GFX6-NEXT:    v_ldexp_f32_e32 v1, v1, v2
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_pow_v2f32:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v4, 0x800000
-; GFX8-NEXT:    v_mov_b32_e32 v5, 0x4f800000
 ; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v4
-; GFX8-NEXT:    v_cndmask_b32_e32 v6, 1.0, v5, vcc
+; GFX8-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
 ; GFX8-NEXT:    v_cmp_lt_f32_e64 s[4:5], v1, v4
-; GFX8-NEXT:    v_mul_f32_e32 v0, v0, v6
-; GFX8-NEXT:    v_cndmask_b32_e64 v4, 1.0, v5, s[4:5]
+; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 5, v5
+; GFX8-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s[4:5]
+; GFX8-NEXT:    v_ldexp_f32 v0, v0, v5
+; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 5, v4
 ; GFX8-NEXT:    v_log_f32_e32 v0, v0
-; GFX8-NEXT:    v_mul_f32_e32 v1, v1, v4
+; GFX8-NEXT:    v_ldexp_f32 v1, v1, v4
 ; GFX8-NEXT:    v_log_f32_e32 v1, v1
-; GFX8-NEXT:    v_mov_b32_e32 v6, 0x42000000
-; GFX8-NEXT:    v_cndmask_b32_e32 v7, 0, v6, vcc
-; GFX8-NEXT:    v_sub_f32_e32 v0, v0, v7
-; GFX8-NEXT:    v_cndmask_b32_e64 v5, 0, v6, s[4:5]
+; GFX8-NEXT:    v_mov_b32_e32 v5, 0x42000000
+; GFX8-NEXT:    v_cndmask_b32_e32 v6, 0, v5, vcc
+; GFX8-NEXT:    v_sub_f32_e32 v0, v0, v6
+; GFX8-NEXT:    v_cndmask_b32_e64 v5, 0, v5, s[4:5]
 ; GFX8-NEXT:    v_mul_legacy_f32_e32 v0, v0, v2
 ; GFX8-NEXT:    v_mov_b32_e32 v2, 0xc2fc0000
 ; GFX8-NEXT:    v_sub_f32_e32 v1, v1, v5
-; GFX8-NEXT:    v_mov_b32_e32 v7, 0x42800000
+; GFX8-NEXT:    v_mov_b32_e32 v6, 0x42800000
 ; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v2
 ; GFX8-NEXT:    v_mul_legacy_f32_e32 v1, v1, v3
-; GFX8-NEXT:    v_cndmask_b32_e32 v8, 0, v7, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v7, 0, v6, vcc
 ; GFX8-NEXT:    v_cmp_lt_f32_e64 s[4:5], v1, v2
-; GFX8-NEXT:    v_add_f32_e32 v0, v0, v8
-; GFX8-NEXT:    v_cndmask_b32_e64 v2, 0, v7, s[4:5]
+; GFX8-NEXT:    v_add_f32_e32 v0, v0, v7
+; GFX8-NEXT:    v_cndmask_b32_e64 v2, 0, v6, s[4:5]
 ; GFX8-NEXT:    v_exp_f32_e32 v0, v0
 ; GFX8-NEXT:    v_add_f32_e32 v1, v1, v2
 ; GFX8-NEXT:    v_exp_f32_e32 v1, v1
-; GFX8-NEXT:    v_mov_b32_e32 v4, 0x1f800000
-; GFX8-NEXT:    v_cndmask_b32_e32 v2, 1.0, v4, vcc
-; GFX8-NEXT:    v_mul_f32_e32 v0, v0, v2
-; GFX8-NEXT:    v_cndmask_b32_e64 v2, 1.0, v4, s[4:5]
-; GFX8-NEXT:    v_mul_f32_e32 v1, v1, v2
+; GFX8-NEXT:    v_not_b32_e32 v4, 63
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, 0, v4, vcc
+; GFX8-NEXT:    v_ldexp_f32 v0, v0, v2
+; GFX8-NEXT:    v_cndmask_b32_e64 v2, 0, v4, s[4:5]
+; GFX8-NEXT:    v_ldexp_f32 v1, v1, v2
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_pow_v2f32:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0x800000
-; GFX9-NEXT:    v_mov_b32_e32 v5, 0x4f800000
 ; GFX9-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v4
-; GFX9-NEXT:    v_cndmask_b32_e32 v6, 1.0, v5, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
 ; GFX9-NEXT:    v_cmp_lt_f32_e64 s[4:5], v1, v4
-; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v6
-; GFX9-NEXT:    v_cndmask_b32_e64 v4, 1.0, v5, s[4:5]
+; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 5, v5
+; GFX9-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s[4:5]
+; GFX9-NEXT:    v_ldexp_f32 v0, v0, v5
+; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 5, v4
 ; GFX9-NEXT:    v_log_f32_e32 v0, v0
-; GFX9-NEXT:    v_mul_f32_e32 v1, v1, v4
+; GFX9-NEXT:    v_ldexp_f32 v1, v1, v4
 ; GFX9-NEXT:    v_log_f32_e32 v1, v1
-; GFX9-NEXT:    v_mov_b32_e32 v6, 0x42000000
-; GFX9-NEXT:    v_cndmask_b32_e32 v7, 0, v6, vcc
-; GFX9-NEXT:    v_sub_f32_e32 v0, v0, v7
-; GFX9-NEXT:    v_cndmask_b32_e64 v5, 0, v6, s[4:5]
+; GFX9-NEXT:    v_mov_b32_e32 v5, 0x42000000
+; GFX9-NEXT:    v_cndmask_b32_e32 v6, 0, v5, vcc
+; GFX9-NEXT:    v_sub_f32_e32 v0, v0, v6
+; GFX9-NEXT:    v_cndmask_b32_e64 v5, 0, v5, s[4:5]
 ; GFX9-NEXT:    v_mul_legacy_f32_e32 v0, v0, v2
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0xc2fc0000
 ; GFX9-NEXT:    v_sub_f32_e32 v1, v1, v5
-; GFX9-NEXT:    v_mov_b32_e32 v7, 0x42800000
+; GFX9-NEXT:    v_mov_b32_e32 v6, 0x42800000
 ; GFX9-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v2
 ; GFX9-NEXT:    v_mul_legacy_f32_e32 v1, v1, v3
-; GFX9-NEXT:    v_cndmask_b32_e32 v8, 0, v7, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v7, 0, v6, vcc
 ; GFX9-NEXT:    v_cmp_lt_f32_e64 s[4:5], v1, v2
-; GFX9-NEXT:    v_add_f32_e32 v0, v0, v8
-; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, v7, s[4:5]
+; GFX9-NEXT:    v_add_f32_e32 v0, v0, v7
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, v6, s[4:5]
 ; GFX9-NEXT:    v_exp_f32_e32 v0, v0
 ; GFX9-NEXT:    v_add_f32_e32 v1, v1, v2
 ; GFX9-NEXT:    v_exp_f32_e32 v1, v1
-; GFX9-NEXT:    v_mov_b32_e32 v4, 0x1f800000
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, 1.0, v4, vcc
-; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v2
-; GFX9-NEXT:    v_cndmask_b32_e64 v2, 1.0, v4, s[4:5]
-; GFX9-NEXT:    v_mul_f32_e32 v1, v1, v2
+; GFX9-NEXT:    v_not_b32_e32 v4, 63
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, 0, v4, vcc
+; GFX9-NEXT:    v_ldexp_f32 v0, v0, v2
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, v4, s[4:5]
+; GFX9-NEXT:    v_ldexp_f32 v1, v1, v2
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_pow_v2f32:
@@ -239,10 +245,12 @@ define <2 x float> @v_pow_v2f32(<2 x float> %x, <2 x float> %y) {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
 ; GFX10-NEXT:    v_cmp_gt_f32_e64 s4, 0x800000, v1
-; GFX10-NEXT:    v_cndmask_b32_e64 v4, 1.0, 0x4f800000, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e64 v5, 1.0, 0x4f800000, s4
-; GFX10-NEXT:    v_mul_f32_e32 v0, v0, v4
-; GFX10-NEXT:    v_mul_f32_e32 v1, v1, v5
+; GFX10-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s4
+; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 5, v4
+; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 5, v5
+; GFX10-NEXT:    v_ldexp_f32 v0, v0, v4
+; GFX10-NEXT:    v_ldexp_f32 v1, v1, v5
 ; GFX10-NEXT:    v_cndmask_b32_e64 v4, 0, 0x42000000, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e64 v5, 0, 0x42000000, s4
 ; GFX10-NEXT:    v_log_f32_e32 v0, v0
@@ -257,46 +265,54 @@ define <2 x float> @v_pow_v2f32(<2 x float> %x, <2 x float> %y) {
 ; GFX10-NEXT:    v_cndmask_b32_e64 v3, 0, 0x42800000, s4
 ; GFX10-NEXT:    v_add_f32_e32 v0, v0, v2
 ; GFX10-NEXT:    v_add_f32_e32 v1, v1, v3
-; GFX10-NEXT:    v_cndmask_b32_e64 v2, 1.0, 0x1f800000, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e64 v3, 1.0, 0x1f800000, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, 0xffffffc0, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v3, 0, 0xffffffc0, s4
 ; GFX10-NEXT:    v_exp_f32_e32 v0, v0
 ; GFX10-NEXT:    v_exp_f32_e32 v1, v1
-; GFX10-NEXT:    v_mul_f32_e32 v0, v0, v2
-; GFX10-NEXT:    v_mul_f32_e32 v1, v1, v3
+; GFX10-NEXT:    v_ldexp_f32 v0, v0, v2
+; GFX10-NEXT:    v_ldexp_f32 v1, v1, v3
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_pow_v2f32:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
 ; GFX11-NEXT:    v_cmp_gt_f32_e64 s0, 0x800000, v1
-; GFX11-NEXT:    v_cndmask_b32_e64 v4, 1.0, 0x4f800000, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_cndmask_b32_e64 v5, 1.0, 0x4f800000, s0
-; GFX11-NEXT:    v_dual_mul_f32 v0, v0, v4 :: v_dual_mul_f32 v1, v1, v5
-; GFX11-NEXT:    v_cndmask_b32_e64 v4, 0, 0x42000000, vcc_lo
+; GFX11-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s0
+; GFX11-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc_lo
+; GFX11-NEXT:    v_lshlrev_b32_e32 v5, 5, v5
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_ldexp_f32 v1, v1, v5
 ; GFX11-NEXT:    v_cndmask_b32_e64 v5, 0, 0x42000000, s0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_log_f32_e32 v0, v0
 ; GFX11-NEXT:    v_log_f32_e32 v1, v1
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
-; GFX11-NEXT:    v_dual_sub_f32 v0, v0, v4 :: v_dual_sub_f32 v1, v1, v5
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_dual_mul_dx9_zero_f32 v0, v0, v2 :: v_dual_mul_dx9_zero_f32 v1, v1, v3
-; GFX11-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_dual_sub_f32 v1, v1, v5 :: v_dual_lshlrev_b32 v4, 5, v4
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_ldexp_f32 v0, v0, v4
+; GFX11-NEXT:    v_cndmask_b32_e64 v4, 0, 0x42000000, vcc_lo
+; GFX11-NEXT:    v_mul_dx9_zero_f32_e32 v1, v1, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_log_f32_e32 v0, v0
 ; GFX11-NEXT:    v_cmp_gt_f32_e64 s0, 0xc2fc0000, v1
-; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 0x42800000, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_cndmask_b32_e64 v3, 0, 0x42800000, s0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_dual_add_f32 v0, v0, v2 :: v_dual_add_f32 v1, v1, v3
-; GFX11-NEXT:    v_cndmask_b32_e64 v2, 1.0, 0x1f800000, vcc_lo
-; GFX11-NEXT:    v_cndmask_b32_e64 v3, 1.0, 0x1f800000, s0
-; GFX11-NEXT:    v_exp_f32_e32 v0, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX11-NEXT:    s_waitcnt_depctr 0xfff
+; GFX11-NEXT:    v_dual_sub_f32 v0, v0, v4 :: v_dual_add_f32 v1, v1, v3
+; GFX11-NEXT:    v_mul_dx9_zero_f32_e32 v0, v0, v2
+; GFX11-NEXT:    v_cndmask_b32_e64 v3, 0, 0xffffffc0, s0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_exp_f32_e32 v1, v1
+; GFX11-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0
+; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 0x42800000, vcc_lo
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
-; GFX11-NEXT:    v_dual_mul_f32 v0, v0, v2 :: v_dual_mul_f32 v1, v1, v3
+; GFX11-NEXT:    v_ldexp_f32 v1, v1, v3
+; GFX11-NEXT:    v_add_f32_e32 v0, v0, v2
+; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 0xffffffc0, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT:    v_exp_f32_e32 v0, v0
+; GFX11-NEXT:    s_waitcnt_depctr 0xfff
+; GFX11-NEXT:    v_ldexp_f32 v0, v0, v2
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %pow = call <2 x float> @llvm.pow.v2f32(<2 x float> %x, <2 x float> %y)
   ret <2 x float> %pow
@@ -316,9 +332,9 @@ define half @v_pow_f16(half %x, half %y) {
 ; GFX6-NEXT:    v_cndmask_b32_e32 v1, 0, v3, vcc
 ; GFX6-NEXT:    v_add_f32_e32 v0, v0, v1
 ; GFX6-NEXT:    v_exp_f32_e32 v0, v0
-; GFX6-NEXT:    v_mov_b32_e32 v1, 0x1f800000
-; GFX6-NEXT:    v_cndmask_b32_e32 v1, 1.0, v1, vcc
-; GFX6-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX6-NEXT:    v_not_b32_e32 v1, 63
+; GFX6-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX6-NEXT:    v_ldexp_f32_e32 v0, v0, v1
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -388,18 +404,18 @@ define <2 x half> @v_pow_v2f16(<2 x half> %x, <2 x half> %y) {
 ; GFX6-NEXT:    v_cndmask_b32_e32 v2, 0, v5, vcc
 ; GFX6-NEXT:    v_add_f32_e32 v0, v0, v2
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v2, v3
-; GFX6-NEXT:    v_mov_b32_e32 v3, 0x1f800000
-; GFX6-NEXT:    v_cndmask_b32_e32 v6, 1.0, v3, vcc
+; GFX6-NEXT:    v_not_b32_e32 v3, 63
+; GFX6-NEXT:    v_cndmask_b32_e32 v6, 0, v3, vcc
 ; GFX6-NEXT:    v_exp_f32_e32 v0, v0
 ; GFX6-NEXT:    v_mul_legacy_f32_e32 v1, v1, v2
 ; GFX6-NEXT:    v_cmp_lt_f32_e32 vcc, v1, v4
 ; GFX6-NEXT:    v_cndmask_b32_e32 v2, 0, v5, vcc
 ; GFX6-NEXT:    v_add_f32_e32 v1, v1, v2
 ; GFX6-NEXT:    v_exp_f32_e32 v1, v1
-; GFX6-NEXT:    v_cndmask_b32_e32 v2, 1.0, v3, vcc
-; GFX6-NEXT:    v_mul_f32_e32 v0, v0, v6
+; GFX6-NEXT:    v_cndmask_b32_e32 v2, 0, v3, vcc
+; GFX6-NEXT:    v_ldexp_f32_e32 v0, v0, v6
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX6-NEXT:    v_mul_f32_e32 v1, v1, v2
+; GFX6-NEXT:    v_ldexp_f32_e32 v1, v1, v2
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -508,17 +524,17 @@ define <2 x half> @v_pow_v2f16_fneg_lhs(<2 x half> %x, <2 x half> %y) {
 ; GFX6-NEXT:    v_cmp_lt_f32_e32 vcc, v1, v2
 ; GFX6-NEXT:    v_cndmask_b32_e32 v5, 0, v4, vcc
 ; GFX6-NEXT:    v_add_f32_e32 v1, v1, v5
-; GFX6-NEXT:    v_mov_b32_e32 v5, 0x1f800000
+; GFX6-NEXT:    v_not_b32_e32 v5, 63
 ; GFX6-NEXT:    v_mul_legacy_f32_e32 v0, v0, v3
-; GFX6-NEXT:    v_cndmask_b32_e32 v6, 1.0, v5, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v6, 0, v5, vcc
 ; GFX6-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v2
 ; GFX6-NEXT:    v_cndmask_b32_e32 v2, 0, v4, vcc
 ; GFX6-NEXT:    v_exp_f32_e32 v1, v1
 ; GFX6-NEXT:    v_add_f32_e32 v0, v0, v2
 ; GFX6-NEXT:    v_exp_f32_e32 v2, v0
-; GFX6-NEXT:    v_mul_f32_e32 v0, v1, v6
-; GFX6-NEXT:    v_cndmask_b32_e32 v1, 1.0, v5, vcc
-; GFX6-NEXT:    v_mul_f32_e32 v1, v2, v1
+; GFX6-NEXT:    v_ldexp_f32_e32 v0, v1, v6
+; GFX6-NEXT:    v_cndmask_b32_e32 v1, 0, v5, vcc
+; GFX6-NEXT:    v_ldexp_f32_e32 v1, v2, v1
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
@@ -634,17 +650,17 @@ define <2 x half> @v_pow_v2f16_fneg_rhs(<2 x half> %x, <2 x half> %y) {
 ; GFX6-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v3
 ; GFX6-NEXT:    v_cndmask_b32_e32 v5, 0, v4, vcc
 ; GFX6-NEXT:    v_add_f32_e32 v0, v0, v5
-; GFX6-NEXT:    v_mov_b32_e32 v5, 0x1f800000
+; GFX6-NEXT:    v_not_b32_e32 v5, 63
 ; GFX6-NEXT:    v_mul_legacy_f32_e32 v1, v1, v2
-; GFX6-NEXT:    v_cndmask_b32_e32 v6, 1.0, v5, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v6, 0, v5, vcc
 ; GFX6-NEXT:    v_cmp_lt_f32_e32 vcc, v1, v3
 ; GFX6-NEXT:    v_cndmask_b32_e32 v2, 0, v4, vcc
 ; GFX6-NEXT:    v_add_f32_e32 v1, v1, v2
 ; GFX6-NEXT:    v_exp_f32_e32 v0, v0
 ; GFX6-NEXT:    v_exp_f32_e32 v1, v1
-; GFX6-NEXT:    v_cndmask_b32_e32 v2, 1.0, v5, vcc
-; GFX6-NEXT:    v_mul_f32_e32 v0, v0, v6
-; GFX6-NEXT:    v_mul_f32_e32 v1, v1, v2
+; GFX6-NEXT:    v_cndmask_b32_e32 v2, 0, v5, vcc
+; GFX6-NEXT:    v_ldexp_f32_e32 v0, v0, v6
+; GFX6-NEXT:    v_ldexp_f32_e32 v1, v1, v2
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
@@ -764,17 +780,17 @@ define <2 x half> @v_pow_v2f16_fneg_lhs_rhs(<2 x half> %x, <2 x half> %y) {
 ; GFX6-NEXT:    v_cmp_lt_f32_e32 vcc, v2, v3
 ; GFX6-NEXT:    v_cndmask_b32_e32 v5, 0, v4, vcc
 ; GFX6-NEXT:    v_add_f32_e32 v2, v2, v5
-; GFX6-NEXT:    v_mov_b32_e32 v5, 0x1f800000
+; GFX6-NEXT:    v_not_b32_e32 v5, 63
 ; GFX6-NEXT:    v_mul_legacy_f32_e32 v0, v0, v1
-; GFX6-NEXT:    v_cndmask_b32_e32 v6, 1.0, v5, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v6, 0, v5, vcc
 ; GFX6-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v3
 ; GFX6-NEXT:    v_cndmask_b32_e32 v1, 0, v4, vcc
 ; GFX6-NEXT:    v_exp_f32_e32 v2, v2
 ; GFX6-NEXT:    v_add_f32_e32 v0, v0, v1
 ; GFX6-NEXT:    v_exp_f32_e32 v1, v0
-; GFX6-NEXT:    v_mul_f32_e32 v0, v2, v6
-; GFX6-NEXT:    v_cndmask_b32_e32 v2, 1.0, v5, vcc
-; GFX6-NEXT:    v_mul_f32_e32 v1, v1, v2
+; GFX6-NEXT:    v_ldexp_f32_e32 v0, v2, v6
+; GFX6-NEXT:    v_cndmask_b32_e32 v2, 0, v5, vcc
+; GFX6-NEXT:    v_ldexp_f32_e32 v1, v1, v2
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
@@ -885,10 +901,10 @@ define float @v_pow_f32_fabs_lhs(float %x, float %y) {
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_mov_b32_e32 v2, 0x800000
-; GFX6-NEXT:    v_mov_b32_e32 v3, 0x4f800000
 ; GFX6-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, v2
-; GFX6-NEXT:    v_cndmask_b32_e32 v2, 1.0, v3, vcc
-; GFX6-NEXT:    v_mul_f32_e64 v0, |v0|, v2
+; GFX6-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 5, v2
+; GFX6-NEXT:    v_ldexp_f32_e64 v0, |v0|, v2
 ; GFX6-NEXT:    v_log_f32_e32 v0, v0
 ; GFX6-NEXT:    v_mov_b32_e32 v2, 0x42000000
 ; GFX6-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
@@ -900,19 +916,19 @@ define float @v_pow_f32_fabs_lhs(float %x, float %y) {
 ; GFX6-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
 ; GFX6-NEXT:    v_add_f32_e32 v0, v0, v1
 ; GFX6-NEXT:    v_exp_f32_e32 v0, v0
-; GFX6-NEXT:    v_mov_b32_e32 v1, 0x1f800000
-; GFX6-NEXT:    v_cndmask_b32_e32 v1, 1.0, v1, vcc
-; GFX6-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX6-NEXT:    v_not_b32_e32 v1, 63
+; GFX6-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX6-NEXT:    v_ldexp_f32_e32 v0, v0, v1
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_pow_f32_fabs_lhs:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v2, 0x800000
-; GFX8-NEXT:    v_mov_b32_e32 v3, 0x4f800000
 ; GFX8-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, v2
-; GFX8-NEXT:    v_cndmask_b32_e32 v2, 1.0, v3, vcc
-; GFX8-NEXT:    v_mul_f32_e64 v0, |v0|, v2
+; GFX8-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 5, v2
+; GFX8-NEXT:    v_ldexp_f32 v0, |v0|, v2
 ; GFX8-NEXT:    v_log_f32_e32 v0, v0
 ; GFX8-NEXT:    v_mov_b32_e32 v2, 0x42000000
 ; GFX8-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
@@ -924,19 +940,19 @@ define float @v_pow_f32_fabs_lhs(float %x, float %y) {
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
 ; GFX8-NEXT:    v_add_f32_e32 v0, v0, v1
 ; GFX8-NEXT:    v_exp_f32_e32 v0, v0
-; GFX8-NEXT:    v_mov_b32_e32 v1, 0x1f800000
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, 1.0, v1, vcc
-; GFX8-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX8-NEXT:    v_not_b32_e32 v1, 63
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX8-NEXT:    v_ldexp_f32 v0, v0, v1
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_pow_f32_fabs_lhs:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0x800000
-; GFX9-NEXT:    v_mov_b32_e32 v3, 0x4f800000
 ; GFX9-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, v2
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, 1.0, v3, vcc
-; GFX9-NEXT:    v_mul_f32_e64 v0, |v0|, v2
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 5, v2
+; GFX9-NEXT:    v_ldexp_f32 v0, |v0|, v2
 ; GFX9-NEXT:    v_log_f32_e32 v0, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0x42000000
 ; GFX9-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
@@ -948,17 +964,18 @@ define float @v_pow_f32_fabs_lhs(float %x, float %y) {
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
 ; GFX9-NEXT:    v_add_f32_e32 v0, v0, v1
 ; GFX9-NEXT:    v_exp_f32_e32 v0, v0
-; GFX9-NEXT:    v_mov_b32_e32 v1, 0x1f800000
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, 1.0, v1, vcc
-; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX9-NEXT:    v_not_b32_e32 v1, 63
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX9-NEXT:    v_ldexp_f32 v0, v0, v1
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_pow_f32_fabs_lhs:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_cmp_gt_f32_e64 s4, 0x800000, |v0|
-; GFX10-NEXT:    v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s4
-; GFX10-NEXT:    v_mul_f32_e64 v0, |v0|, v2
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s4
+; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 5, v2
+; GFX10-NEXT:    v_ldexp_f32 v0, |v0|, v2
 ; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, 0x42000000, s4
 ; GFX10-NEXT:    v_log_f32_e32 v0, v0
 ; GFX10-NEXT:    v_sub_f32_e32 v0, v0, v2
@@ -966,9 +983,9 @@ define float @v_pow_f32_fabs_lhs(float %x, float %y) {
 ; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0, 0x42800000, vcc_lo
 ; GFX10-NEXT:    v_add_f32_e32 v0, v0, v1
-; GFX10-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x1f800000, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0, 0xffffffc0, vcc_lo
 ; GFX10-NEXT:    v_exp_f32_e32 v0, v0
-; GFX10-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX10-NEXT:    v_ldexp_f32 v0, v0, v1
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_pow_f32_fabs_lhs:
@@ -976,23 +993,24 @@ define float @v_pow_f32_fabs_lhs(float %x, float %y) {
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_cmp_gt_f32_e64 s0, 0x800000, |v0|
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s0
-; GFX11-NEXT:    v_mul_f32_e64 v0, |v0|, v2
+; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 5, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_ldexp_f32 v0, |v0|, v2
 ; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 0x42000000, s0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_log_f32_e32 v0, v0
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX11-NEXT:    v_sub_f32_e32 v0, v0, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_mul_dx9_zero_f32_e32 v0, v0, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0
 ; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, 0x42800000, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_add_f32_e32 v0, v0, v1
-; GFX11-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x1f800000, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, 0xffffffc0, vcc_lo
 ; GFX11-NEXT:    v_exp_f32_e32 v0, v0
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
-; GFX11-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX11-NEXT:    v_ldexp_f32 v0, v0, v1
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %fabs.x = call float @llvm.fabs.f32(float %x)
   %pow = call float @llvm.pow.f32(float %fabs.x, float %y)
@@ -1004,10 +1022,10 @@ define float @v_pow_f32_fabs_rhs(float %x, float %y) {
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_mov_b32_e32 v2, 0x800000
-; GFX6-NEXT:    v_mov_b32_e32 v3, 0x4f800000
 ; GFX6-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v2
-; GFX6-NEXT:    v_cndmask_b32_e32 v2, 1.0, v3, vcc
-; GFX6-NEXT:    v_mul_f32_e32 v0, v0, v2
+; GFX6-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 5, v2
+; GFX6-NEXT:    v_ldexp_f32_e32 v0, v0, v2
 ; GFX6-NEXT:    v_log_f32_e32 v0, v0
 ; GFX6-NEXT:    v_mov_b32_e32 v2, 0x42000000
 ; GFX6-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
@@ -1019,19 +1037,19 @@ define float @v_pow_f32_fabs_rhs(float %x, float %y) {
 ; GFX6-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
 ; GFX6-NEXT:    v_add_f32_e32 v0, v0, v1
 ; GFX6-NEXT:    v_exp_f32_e32 v0, v0
-; GFX6-NEXT:    v_mov_b32_e32 v1, 0x1f800000
-; GFX6-NEXT:    v_cndmask_b32_e32 v1, 1.0, v1, vcc
-; GFX6-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX6-NEXT:    v_not_b32_e32 v1, 63
+; GFX6-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX6-NEXT:    v_ldexp_f32_e32 v0, v0, v1
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_pow_f32_fabs_rhs:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v2, 0x800000
-; GFX8-NEXT:    v_mov_b32_e32 v3, 0x4f800000
 ; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v2
-; GFX8-NEXT:    v_cndmask_b32_e32 v2, 1.0, v3, vcc
-; GFX8-NEXT:    v_mul_f32_e32 v0, v0, v2
+; GFX8-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 5, v2
+; GFX8-NEXT:    v_ldexp_f32 v0, v0, v2
 ; GFX8-NEXT:    v_log_f32_e32 v0, v0
 ; GFX8-NEXT:    v_mov_b32_e32 v2, 0x42000000
 ; GFX8-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
@@ -1043,19 +1061,19 @@ define float @v_pow_f32_fabs_rhs(float %x, float %y) {
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
 ; GFX8-NEXT:    v_add_f32_e32 v0, v0, v1
 ; GFX8-NEXT:    v_exp_f32_e32 v0, v0
-; GFX8-NEXT:    v_mov_b32_e32 v1, 0x1f800000
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, 1.0, v1, vcc
-; GFX8-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX8-NEXT:    v_not_b32_e32 v1, 63
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX8-NEXT:    v_ldexp_f32 v0, v0, v1
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_pow_f32_fabs_rhs:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0x800000
-; GFX9-NEXT:    v_mov_b32_e32 v3, 0x4f800000
 ; GFX9-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v2
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, 1.0, v3, vcc
-; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v2
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 5, v2
+; GFX9-NEXT:    v_ldexp_f32 v0, v0, v2
 ; GFX9-NEXT:    v_log_f32_e32 v0, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0x42000000
 ; GFX9-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
@@ -1067,17 +1085,18 @@ define float @v_pow_f32_fabs_rhs(float %x, float %y) {
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
 ; GFX9-NEXT:    v_add_f32_e32 v0, v0, v1
 ; GFX9-NEXT:    v_exp_f32_e32 v0, v0
-; GFX9-NEXT:    v_mov_b32_e32 v1, 0x1f800000
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, 1.0, v1, vcc
-; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX9-NEXT:    v_not_b32_e32 v1, 63
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX9-NEXT:    v_ldexp_f32 v0, v0, v1
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_pow_f32_fabs_rhs:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
-; GFX10-NEXT:    v_cndmask_b32_e64 v2, 1.0, 0x4f800000, vcc_lo
-; GFX10-NEXT:    v_mul_f32_e32 v0, v0, v2
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc_lo
+; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 5, v2
+; GFX10-NEXT:    v_ldexp_f32 v0, v0, v2
 ; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, 0x42000000, vcc_lo
 ; GFX10-NEXT:    v_log_f32_e32 v0, v0
 ; GFX10-NEXT:    v_sub_f32_e32 v0, v0, v2
@@ -1085,32 +1104,34 @@ define float @v_pow_f32_fabs_rhs(float %x, float %y) {
 ; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0, 0x42800000, vcc_lo
 ; GFX10-NEXT:    v_add_f32_e32 v0, v0, v1
-; GFX10-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x1f800000, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0, 0xffffffc0, vcc_lo
 ; GFX10-NEXT:    v_exp_f32_e32 v0, v0
-; GFX10-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX10-NEXT:    v_ldexp_f32 v0, v0, v1
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_pow_f32_fabs_rhs:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
-; GFX11-NEXT:    v_cndmask_b32_e64 v2, 1.0, 0x4f800000, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_mul_f32_e32 v0, v0, v2
+; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 5, v2
+; GFX11-NEXT:    v_ldexp_f32 v0, v0, v2
 ; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 0x42000000, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_log_f32_e32 v0, v0
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX11-NEXT:    v_sub_f32_e32 v0, v0, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_mul_dx9_zero_f32_e64 v0, v0, |v1|
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0
 ; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, 0x42800000, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_add_f32_e32 v0, v0, v1
-; GFX11-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x1f800000, vcc_lo
+; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, 0xffffffc0, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-NEXT:    v_exp_f32_e32 v0, v0
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
-; GFX11-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX11-NEXT:    v_ldexp_f32 v0, v0, v1
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %fabs.y = call float @llvm.fabs.f32(float %y)
   %pow = call float @llvm.pow.f32(float %x, float %fabs.y)
@@ -1122,10 +1143,10 @@ define float @v_pow_f32_fabs_lhs_rhs(float %x, float %y) {
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_mov_b32_e32 v2, 0x800000
-; GFX6-NEXT:    v_mov_b32_e32 v3, 0x4f800000
 ; GFX6-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, v2
-; GFX6-NEXT:    v_cndmask_b32_e32 v2, 1.0, v3, vcc
-; GFX6-NEXT:    v_mul_f32_e64 v0, |v0|, v2
+; GFX6-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 5, v2
+; GFX6-NEXT:    v_ldexp_f32_e64 v0, |v0|, v2
 ; GFX6-NEXT:    v_log_f32_e32 v0, v0
 ; GFX6-NEXT:    v_mov_b32_e32 v2, 0x42000000
 ; GFX6-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
@@ -1137,19 +1158,19 @@ define float @v_pow_f32_fabs_lhs_rhs(float %x, float %y) {
 ; GFX6-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
 ; GFX6-NEXT:    v_add_f32_e32 v0, v0, v1
 ; GFX6-NEXT:    v_exp_f32_e32 v0, v0
-; GFX6-NEXT:    v_mov_b32_e32 v1, 0x1f800000
-; GFX6-NEXT:    v_cndmask_b32_e32 v1, 1.0, v1, vcc
-; GFX6-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX6-NEXT:    v_not_b32_e32 v1, 63
+; GFX6-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX6-NEXT:    v_ldexp_f32_e32 v0, v0, v1
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_pow_f32_fabs_lhs_rhs:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v2, 0x800000
-; GFX8-NEXT:    v_mov_b32_e32 v3, 0x4f800000
 ; GFX8-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, v2
-; GFX8-NEXT:    v_cndmask_b32_e32 v2, 1.0, v3, vcc
-; GFX8-NEXT:    v_mul_f32_e64 v0, |v0|, v2
+; GFX8-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 5, v2
+; GFX8-NEXT:    v_ldexp_f32 v0, |v0|, v2
 ; GFX8-NEXT:    v_log_f32_e32 v0, v0
 ; GFX8-NEXT:    v_mov_b32_e32 v2, 0x42000000
 ; GFX8-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
@@ -1161,19 +1182,19 @@ define float @v_pow_f32_fabs_lhs_rhs(float %x, float %y) {
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
 ; GFX8-NEXT:    v_add_f32_e32 v0, v0, v1
 ; GFX8-NEXT:    v_exp_f32_e32 v0, v0
-; GFX8-NEXT:    v_mov_b32_e32 v1, 0x1f800000
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, 1.0, v1, vcc
-; GFX8-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX8-NEXT:    v_not_b32_e32 v1, 63
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX8-NEXT:    v_ldexp_f32 v0, v0, v1
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_pow_f32_fabs_lhs_rhs:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0x800000
-; GFX9-NEXT:    v_mov_b32_e32 v3, 0x4f800000
 ; GFX9-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, v2
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, 1.0, v3, vcc
-; GFX9-NEXT:    v_mul_f32_e64 v0, |v0|, v2
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 5, v2
+; GFX9-NEXT:    v_ldexp_f32 v0, |v0|, v2
 ; GFX9-NEXT:    v_log_f32_e32 v0, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0x42000000
 ; GFX9-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
@@ -1185,17 +1206,18 @@ define float @v_pow_f32_fabs_lhs_rhs(float %x, float %y) {
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
 ; GFX9-NEXT:    v_add_f32_e32 v0, v0, v1
 ; GFX9-NEXT:    v_exp_f32_e32 v0, v0
-; GFX9-NEXT:    v_mov_b32_e32 v1, 0x1f800000
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, 1.0, v1, vcc
-; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX9-NEXT:    v_not_b32_e32 v1, 63
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX9-NEXT:    v_ldexp_f32 v0, v0, v1
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_pow_f32_fabs_lhs_rhs:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_cmp_gt_f32_e64 s4, 0x800000, |v0|
-; GFX10-NEXT:    v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s4
-; GFX10-NEXT:    v_mul_f32_e64 v0, |v0|, v2
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s4
+; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 5, v2
+; GFX10-NEXT:    v_ldexp_f32 v0, |v0|, v2
 ; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, 0x42000000, s4
 ; GFX10-NEXT:    v_log_f32_e32 v0, v0
 ; GFX10-NEXT:    v_sub_f32_e32 v0, v0, v2
@@ -1203,9 +1225,9 @@ define float @v_pow_f32_fabs_lhs_rhs(float %x, float %y) {
 ; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0, 0x42800000, vcc_lo
 ; GFX10-NEXT:    v_add_f32_e32 v0, v0, v1
-; GFX10-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x1f800000, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0, 0xffffffc0, vcc_lo
 ; GFX10-NEXT:    v_exp_f32_e32 v0, v0
-; GFX10-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX10-NEXT:    v_ldexp_f32 v0, v0, v1
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_pow_f32_fabs_lhs_rhs:
@@ -1213,23 +1235,24 @@ define float @v_pow_f32_fabs_lhs_rhs(float %x, float %y) {
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_cmp_gt_f32_e64 s0, 0x800000, |v0|
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s0
-; GFX11-NEXT:    v_mul_f32_e64 v0, |v0|, v2
+; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 5, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_ldexp_f32 v0, |v0|, v2
 ; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 0x42000000, s0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_log_f32_e32 v0, v0
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX11-NEXT:    v_sub_f32_e32 v0, v0, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_mul_dx9_zero_f32_e64 v0, v0, |v1|
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0
 ; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, 0x42800000, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_add_f32_e32 v0, v0, v1
-; GFX11-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x1f800000, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, 0xffffffc0, vcc_lo
 ; GFX11-NEXT:    v_exp_f32_e32 v0, v0
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
-; GFX11-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX11-NEXT:    v_ldexp_f32 v0, v0, v1
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %fabs.x = call float @llvm.fabs.f32(float %x)
   %fabs.y = call float @llvm.fabs.f32(float %y)
@@ -1241,10 +1264,10 @@ define amdgpu_ps float @v_pow_f32_sgpr_vgpr(float inreg %x, float %y) {
 ; GFX6-LABEL: v_pow_f32_sgpr_vgpr:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    v_mov_b32_e32 v1, 0x800000
-; GFX6-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; GFX6-NEXT:    v_cmp_lt_f32_e32 vcc, s0, v1
-; GFX6-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; GFX6-NEXT:    v_mul_f32_e32 v1, s0, v1
+; GFX6-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; GFX6-NEXT:    v_ldexp_f32_e32 v1, s0, v1
 ; GFX6-NEXT:    v_log_f32_e32 v1, v1
 ; GFX6-NEXT:    v_mov_b32_e32 v2, 0x42000000
 ; GFX6-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
@@ -1256,18 +1279,18 @@ define amdgpu_ps float @v_pow_f32_sgpr_vgpr(float inreg %x, float %y) {
 ; GFX6-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
 ; GFX6-NEXT:    v_add_f32_e32 v0, v0, v1
 ; GFX6-NEXT:    v_exp_f32_e32 v0, v0
-; GFX6-NEXT:    v_mov_b32_e32 v1, 0x1f800000
-; GFX6-NEXT:    v_cndmask_b32_e32 v1, 1.0, v1, vcc
-; GFX6-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX6-NEXT:    v_not_b32_e32 v1, 63
+; GFX6-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX6-NEXT:    v_ldexp_f32_e32 v0, v0, v1
 ; GFX6-NEXT:    ; return to shader part epilog
 ;
 ; GFX8-LABEL: v_pow_f32_sgpr_vgpr:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    v_mov_b32_e32 v1, 0x800000
-; GFX8-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, s0, v1
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; GFX8-NEXT:    v_mul_f32_e32 v1, s0, v1
+; GFX8-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; GFX8-NEXT:    v_ldexp_f32 v1, s0, v1
 ; GFX8-NEXT:    v_log_f32_e32 v1, v1
 ; GFX8-NEXT:    v_mov_b32_e32 v2, 0x42000000
 ; GFX8-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
@@ -1279,18 +1302,18 @@ define amdgpu_ps float @v_pow_f32_sgpr_vgpr(float inreg %x, float %y) {
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
 ; GFX8-NEXT:    v_add_f32_e32 v0, v0, v1
 ; GFX8-NEXT:    v_exp_f32_e32 v0, v0
-; GFX8-NEXT:    v_mov_b32_e32 v1, 0x1f800000
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, 1.0, v1, vcc
-; GFX8-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX8-NEXT:    v_not_b32_e32 v1, 63
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX8-NEXT:    v_ldexp_f32 v0, v0, v1
 ; GFX8-NEXT:    ; return to shader part epilog
 ;
 ; GFX9-LABEL: v_pow_f32_sgpr_vgpr:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0x800000
-; GFX9-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; GFX9-NEXT:    v_cmp_lt_f32_e32 vcc, s0, v1
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; GFX9-NEXT:    v_mul_f32_e32 v1, s0, v1
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; GFX9-NEXT:    v_ldexp_f32 v1, s0, v1
 ; GFX9-NEXT:    v_log_f32_e32 v1, v1
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0x42000000
 ; GFX9-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
@@ -1302,49 +1325,51 @@ define amdgpu_ps float @v_pow_f32_sgpr_vgpr(float inreg %x, float %y) {
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
 ; GFX9-NEXT:    v_add_f32_e32 v0, v0, v1
 ; GFX9-NEXT:    v_exp_f32_e32 v0, v0
-; GFX9-NEXT:    v_mov_b32_e32 v1, 0x1f800000
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, 1.0, v1, vcc
-; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX9-NEXT:    v_not_b32_e32 v1, 63
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX9-NEXT:    v_ldexp_f32 v0, v0, v1
 ; GFX9-NEXT:    ; return to shader part epilog
 ;
 ; GFX10-LABEL: v_pow_f32_sgpr_vgpr:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    v_cmp_gt_f32_e64 s1, 0x800000, s0
-; GFX10-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s1
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s1
 ; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, 0x42000000, s1
-; GFX10-NEXT:    v_mul_f32_e32 v1, s0, v1
+; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; GFX10-NEXT:    v_ldexp_f32 v1, s0, v1
 ; GFX10-NEXT:    v_log_f32_e32 v1, v1
 ; GFX10-NEXT:    v_sub_f32_e32 v1, v1, v2
 ; GFX10-NEXT:    v_mul_legacy_f32_e32 v0, v1, v0
 ; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0, 0x42800000, vcc_lo
 ; GFX10-NEXT:    v_add_f32_e32 v0, v0, v1
-; GFX10-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x1f800000, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0, 0xffffffc0, vcc_lo
 ; GFX10-NEXT:    v_exp_f32_e32 v0, v0
-; GFX10-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX10-NEXT:    v_ldexp_f32 v0, v0, v1
 ; GFX10-NEXT:    ; return to shader part epilog
 ;
 ; GFX11-LABEL: v_pow_f32_sgpr_vgpr:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    v_cmp_gt_f32_e64 s1, 0x800000, s0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s1
+; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s1
 ; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 0x42000000, s1
-; GFX11-NEXT:    v_mul_f32_e32 v1, s0, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_ldexp_f32 v1, s0, v1
 ; GFX11-NEXT:    v_log_f32_e32 v1, v1
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX11-NEXT:    v_sub_f32_e32 v1, v1, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_mul_dx9_zero_f32_e32 v0, v1, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0
 ; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, 0x42800000, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_add_f32_e32 v0, v0, v1
-; GFX11-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x1f800000, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, 0xffffffc0, vcc_lo
 ; GFX11-NEXT:    v_exp_f32_e32 v0, v0
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
-; GFX11-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX11-NEXT:    v_ldexp_f32 v0, v0, v1
 ; GFX11-NEXT:    ; return to shader part epilog
   %pow = call float @llvm.pow.f32(float %x, float %y)
   ret float %pow
@@ -1354,10 +1379,10 @@ define amdgpu_ps float @v_pow_f32_vgpr_sgpr(float %x, float inreg %y) {
 ; GFX6-LABEL: v_pow_f32_vgpr_sgpr:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    v_mov_b32_e32 v1, 0x800000
-; GFX6-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; GFX6-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
-; GFX6-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; GFX6-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX6-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; GFX6-NEXT:    v_ldexp_f32_e32 v0, v0, v1
 ; GFX6-NEXT:    v_log_f32_e32 v0, v0
 ; GFX6-NEXT:    v_mov_b32_e32 v1, 0x42000000
 ; GFX6-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
@@ -1369,18 +1394,18 @@ define amdgpu_ps float @v_pow_f32_vgpr_sgpr(float %x, float inreg %y) {
 ; GFX6-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
 ; GFX6-NEXT:    v_add_f32_e32 v0, v0, v1
 ; GFX6-NEXT:    v_exp_f32_e32 v0, v0
-; GFX6-NEXT:    v_mov_b32_e32 v1, 0x1f800000
-; GFX6-NEXT:    v_cndmask_b32_e32 v1, 1.0, v1, vcc
-; GFX6-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX6-NEXT:    v_not_b32_e32 v1, 63
+; GFX6-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX6-NEXT:    v_ldexp_f32_e32 v0, v0, v1
 ; GFX6-NEXT:    ; return to shader part epilog
 ;
 ; GFX8-LABEL: v_pow_f32_vgpr_sgpr:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    v_mov_b32_e32 v1, 0x800000
-; GFX8-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; GFX8-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX8-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; GFX8-NEXT:    v_ldexp_f32 v0, v0, v1
 ; GFX8-NEXT:    v_log_f32_e32 v0, v0
 ; GFX8-NEXT:    v_mov_b32_e32 v1, 0x42000000
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
@@ -1392,18 +1417,18 @@ define amdgpu_ps float @v_pow_f32_vgpr_sgpr(float %x, float inreg %y) {
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
 ; GFX8-NEXT:    v_add_f32_e32 v0, v0, v1
 ; GFX8-NEXT:    v_exp_f32_e32 v0, v0
-; GFX8-NEXT:    v_mov_b32_e32 v1, 0x1f800000
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, 1.0, v1, vcc
-; GFX8-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX8-NEXT:    v_not_b32_e32 v1, 63
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX8-NEXT:    v_ldexp_f32 v0, v0, v1
 ; GFX8-NEXT:    ; return to shader part epilog
 ;
 ; GFX9-LABEL: v_pow_f32_vgpr_sgpr:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0x800000
-; GFX9-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; GFX9-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; GFX9-NEXT:    v_ldexp_f32 v0, v0, v1
 ; GFX9-NEXT:    v_log_f32_e32 v0, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0x42000000
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
@@ -1415,16 +1440,17 @@ define amdgpu_ps float @v_pow_f32_vgpr_sgpr(float %x, float inreg %y) {
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
 ; GFX9-NEXT:    v_add_f32_e32 v0, v0, v1
 ; GFX9-NEXT:    v_exp_f32_e32 v0, v0
-; GFX9-NEXT:    v_mov_b32_e32 v1, 0x1f800000
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, 1.0, v1, vcc
-; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX9-NEXT:    v_not_b32_e32 v1, 63
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX9-NEXT:    v_ldexp_f32 v0, v0, v1
 ; GFX9-NEXT:    ; return to shader part epilog
 ;
 ; GFX10-LABEL: v_pow_f32_vgpr_sgpr:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
-; GFX10-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo
-; GFX10-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
+; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; GFX10-NEXT:    v_ldexp_f32 v0, v0, v1
 ; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo
 ; GFX10-NEXT:    v_log_f32_e32 v0, v0
 ; GFX10-NEXT:    v_sub_f32_e32 v0, v0, v1
@@ -1432,31 +1458,33 @@ define amdgpu_ps float @v_pow_f32_vgpr_sgpr(float %x, float inreg %y) {
 ; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0, 0x42800000, vcc_lo
 ; GFX10-NEXT:    v_add_f32_e32 v0, v0, v1
-; GFX10-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x1f800000, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0, 0xffffffc0, vcc_lo
 ; GFX10-NEXT:    v_exp_f32_e32 v0, v0
-; GFX10-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX10-NEXT:    v_ldexp_f32 v0, v0, v1
 ; GFX10-NEXT:    ; return to shader part epilog
 ;
 ; GFX11-LABEL: v_pow_f32_vgpr_sgpr:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
-; GFX11-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; GFX11-NEXT:    v_ldexp_f32 v0, v0, v1
 ; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_log_f32_e32 v0, v0
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX11-NEXT:    v_sub_f32_e32 v0, v0, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_mul_dx9_zero_f32_e32 v0, s0, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0
 ; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, 0x42800000, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_add_f32_e32 v0, v0, v1
-; GFX11-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x1f800000, vcc_lo
+; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, 0xffffffc0, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-NEXT:    v_exp_f32_e32 v0, v0
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
-; GFX11-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX11-NEXT:    v_ldexp_f32 v0, v0, v1
 ; GFX11-NEXT:    ; return to shader part epilog
   %pow = call float @llvm.pow.f32(float %x, float %y)
   ret float %pow
@@ -1466,10 +1494,10 @@ define amdgpu_ps float @v_pow_f32_sgpr_sgpr(float inreg %x, float inreg %y) {
 ; GFX6-LABEL: v_pow_f32_sgpr_sgpr:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    v_mov_b32_e32 v0, 0x800000
-; GFX6-NEXT:    v_mov_b32_e32 v1, 0x4f800000
 ; GFX6-NEXT:    v_cmp_lt_f32_e32 vcc, s0, v0
-; GFX6-NEXT:    v_cndmask_b32_e32 v0, 1.0, v1, vcc
-; GFX6-NEXT:    v_mul_f32_e32 v0, s0, v0
+; GFX6-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 5, v0
+; GFX6-NEXT:    v_ldexp_f32_e32 v0, s0, v0
 ; GFX6-NEXT:    v_log_f32_e32 v0, v0
 ; GFX6-NEXT:    v_mov_b32_e32 v1, 0x42000000
 ; GFX6-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
@@ -1481,18 +1509,18 @@ define amdgpu_ps float @v_pow_f32_sgpr_sgpr(float inreg %x, float inreg %y) {
 ; GFX6-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
 ; GFX6-NEXT:    v_add_f32_e32 v0, v0, v1
 ; GFX6-NEXT:    v_exp_f32_e32 v0, v0
-; GFX6-NEXT:    v_mov_b32_e32 v1, 0x1f800000
-; GFX6-NEXT:    v_cndmask_b32_e32 v1, 1.0, v1, vcc
-; GFX6-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX6-NEXT:    v_not_b32_e32 v1, 63
+; GFX6-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX6-NEXT:    v_ldexp_f32_e32 v0, v0, v1
 ; GFX6-NEXT:    ; return to shader part epilog
 ;
 ; GFX8-LABEL: v_pow_f32_sgpr_sgpr:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    v_mov_b32_e32 v0, 0x800000
-; GFX8-NEXT:    v_mov_b32_e32 v1, 0x4f800000
 ; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, s0, v0
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, 1.0, v1, vcc
-; GFX8-NEXT:    v_mul_f32_e32 v0, s0, v0
+; GFX8-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 5, v0
+; GFX8-NEXT:    v_ldexp_f32 v0, s0, v0
 ; GFX8-NEXT:    v_log_f32_e32 v0, v0
 ; GFX8-NEXT:    v_mov_b32_e32 v1, 0x42000000
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
@@ -1504,18 +1532,18 @@ define amdgpu_ps float @v_pow_f32_sgpr_sgpr(float inreg %x, float inreg %y) {
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
 ; GFX8-NEXT:    v_add_f32_e32 v0, v0, v1
 ; GFX8-NEXT:    v_exp_f32_e32 v0, v0
-; GFX8-NEXT:    v_mov_b32_e32 v1, 0x1f800000
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, 1.0, v1, vcc
-; GFX8-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX8-NEXT:    v_not_b32_e32 v1, 63
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX8-NEXT:    v_ldexp_f32 v0, v0, v1
 ; GFX8-NEXT:    ; return to shader part epilog
 ;
 ; GFX9-LABEL: v_pow_f32_sgpr_sgpr:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0x800000
-; GFX9-NEXT:    v_mov_b32_e32 v1, 0x4f800000
 ; GFX9-NEXT:    v_cmp_lt_f32_e32 vcc, s0, v0
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, 1.0, v1, vcc
-; GFX9-NEXT:    v_mul_f32_e32 v0, s0, v0
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 5, v0
+; GFX9-NEXT:    v_ldexp_f32 v0, s0, v0
 ; GFX9-NEXT:    v_log_f32_e32 v0, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0x42000000
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
@@ -1527,49 +1555,51 @@ define amdgpu_ps float @v_pow_f32_sgpr_sgpr(float inreg %x, float inreg %y) {
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
 ; GFX9-NEXT:    v_add_f32_e32 v0, v0, v1
 ; GFX9-NEXT:    v_exp_f32_e32 v0, v0
-; GFX9-NEXT:    v_mov_b32_e32 v1, 0x1f800000
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, 1.0, v1, vcc
-; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX9-NEXT:    v_not_b32_e32 v1, 63
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX9-NEXT:    v_ldexp_f32 v0, v0, v1
 ; GFX9-NEXT:    ; return to shader part epilog
 ;
 ; GFX10-LABEL: v_pow_f32_sgpr_sgpr:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    v_cmp_gt_f32_e64 s2, 0x800000, s0
-; GFX10-NEXT:    v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s2
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s2
 ; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0, 0x42000000, s2
-; GFX10-NEXT:    v_mul_f32_e32 v0, s0, v0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 5, v0
+; GFX10-NEXT:    v_ldexp_f32 v0, s0, v0
 ; GFX10-NEXT:    v_log_f32_e32 v0, v0
 ; GFX10-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; GFX10-NEXT:    v_mul_legacy_f32_e32 v0, s1, v0
 ; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0, 0x42800000, vcc_lo
 ; GFX10-NEXT:    v_add_f32_e32 v0, v0, v1
-; GFX10-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x1f800000, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0, 0xffffffc0, vcc_lo
 ; GFX10-NEXT:    v_exp_f32_e32 v0, v0
-; GFX10-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX10-NEXT:    v_ldexp_f32 v0, v0, v1
 ; GFX10-NEXT:    ; return to shader part epilog
 ;
 ; GFX11-LABEL: v_pow_f32_sgpr_sgpr:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    v_cmp_gt_f32_e64 s2, 0x800000, s0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s2
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s2
 ; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, 0x42000000, s2
-; GFX11-NEXT:    v_mul_f32_e32 v0, s0, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 5, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_ldexp_f32 v0, s0, v0
 ; GFX11-NEXT:    v_log_f32_e32 v0, v0
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX11-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_mul_dx9_zero_f32_e32 v0, s1, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0
 ; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, 0x42800000, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_add_f32_e32 v0, v0, v1
-; GFX11-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x1f800000, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, 0xffffffc0, vcc_lo
 ; GFX11-NEXT:    v_exp_f32_e32 v0, v0
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
-; GFX11-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX11-NEXT:    v_ldexp_f32 v0, v0, v1
 ; GFX11-NEXT:    ; return to shader part epilog
   %pow = call float @llvm.pow.f32(float %x, float %y)
   ret float %pow
@@ -1580,10 +1610,10 @@ define float @v_pow_f32_fneg_lhs(float %x, float %y) {
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_mov_b32_e32 v2, 0x800000
-; GFX6-NEXT:    v_mov_b32_e32 v3, 0x4f800000
 ; GFX6-NEXT:    v_cmp_lt_f32_e64 vcc, -v0, v2
-; GFX6-NEXT:    v_cndmask_b32_e32 v2, 1.0, v3, vcc
-; GFX6-NEXT:    v_mul_f32_e64 v0, -v0, v2
+; GFX6-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 5, v2
+; GFX6-NEXT:    v_ldexp_f32_e64 v0, -v0, v2
 ; GFX6-NEXT:    v_log_f32_e32 v0, v0
 ; GFX6-NEXT:    v_mov_b32_e32 v2, 0x42000000
 ; GFX6-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
@@ -1595,19 +1625,19 @@ define float @v_pow_f32_fneg_lhs(float %x, float %y) {
 ; GFX6-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
 ; GFX6-NEXT:    v_add_f32_e32 v0, v0, v1
 ; GFX6-NEXT:    v_exp_f32_e32 v0, v0
-; GFX6-NEXT:    v_mov_b32_e32 v1, 0x1f800000
-; GFX6-NEXT:    v_cndmask_b32_e32 v1, 1.0, v1, vcc
-; GFX6-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX6-NEXT:    v_not_b32_e32 v1, 63
+; GFX6-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX6-NEXT:    v_ldexp_f32_e32 v0, v0, v1
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_pow_f32_fneg_lhs:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v2, 0x800000
-; GFX8-NEXT:    v_mov_b32_e32 v3, 0x4f800000
 ; GFX8-NEXT:    v_cmp_lt_f32_e64 vcc, -v0, v2
-; GFX8-NEXT:    v_cndmask_b32_e32 v2, 1.0, v3, vcc
-; GFX8-NEXT:    v_mul_f32_e64 v0, -v0, v2
+; GFX8-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 5, v2
+; GFX8-NEXT:    v_ldexp_f32 v0, -v0, v2
 ; GFX8-NEXT:    v_log_f32_e32 v0, v0
 ; GFX8-NEXT:    v_mov_b32_e32 v2, 0x42000000
 ; GFX8-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
@@ -1619,19 +1649,19 @@ define float @v_pow_f32_fneg_lhs(float %x, float %y) {
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
 ; GFX8-NEXT:    v_add_f32_e32 v0, v0, v1
 ; GFX8-NEXT:    v_exp_f32_e32 v0, v0
-; GFX8-NEXT:    v_mov_b32_e32 v1, 0x1f800000
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, 1.0, v1, vcc
-; GFX8-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX8-NEXT:    v_not_b32_e32 v1, 63
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX8-NEXT:    v_ldexp_f32 v0, v0, v1
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_pow_f32_fneg_lhs:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0x800000
-; GFX9-NEXT:    v_mov_b32_e32 v3, 0x4f800000
 ; GFX9-NEXT:    v_cmp_lt_f32_e64 vcc, -v0, v2
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, 1.0, v3, vcc
-; GFX9-NEXT:    v_mul_f32_e64 v0, -v0, v2
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 5, v2
+; GFX9-NEXT:    v_ldexp_f32 v0, -v0, v2
 ; GFX9-NEXT:    v_log_f32_e32 v0, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0x42000000
 ; GFX9-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
@@ -1643,17 +1673,18 @@ define float @v_pow_f32_fneg_lhs(float %x, float %y) {
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
 ; GFX9-NEXT:    v_add_f32_e32 v0, v0, v1
 ; GFX9-NEXT:    v_exp_f32_e32 v0, v0
-; GFX9-NEXT:    v_mov_b32_e32 v1, 0x1f800000
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, 1.0, v1, vcc
-; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX9-NEXT:    v_not_b32_e32 v1, 63
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX9-NEXT:    v_ldexp_f32 v0, v0, v1
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_pow_f32_fneg_lhs:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_cmp_gt_f32_e64 s4, 0x800000, -v0
-; GFX10-NEXT:    v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s4
-; GFX10-NEXT:    v_mul_f32_e64 v0, -v0, v2
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s4
+; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 5, v2
+; GFX10-NEXT:    v_ldexp_f32 v0, -v0, v2
 ; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, 0x42000000, s4
 ; GFX10-NEXT:    v_log_f32_e32 v0, v0
 ; GFX10-NEXT:    v_sub_f32_e32 v0, v0, v2
@@ -1661,9 +1692,9 @@ define float @v_pow_f32_fneg_lhs(float %x, float %y) {
 ; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0, 0x42800000, vcc_lo
 ; GFX10-NEXT:    v_add_f32_e32 v0, v0, v1
-; GFX10-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x1f800000, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0, 0xffffffc0, vcc_lo
 ; GFX10-NEXT:    v_exp_f32_e32 v0, v0
-; GFX10-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX10-NEXT:    v_ldexp_f32 v0, v0, v1
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_pow_f32_fneg_lhs:
@@ -1671,23 +1702,24 @@ define float @v_pow_f32_fneg_lhs(float %x, float %y) {
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_cmp_gt_f32_e64 s0, 0x800000, -v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s0
-; GFX11-NEXT:    v_mul_f32_e64 v0, -v0, v2
+; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 5, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_ldexp_f32 v0, -v0, v2
 ; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 0x42000000, s0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_log_f32_e32 v0, v0
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX11-NEXT:    v_sub_f32_e32 v0, v0, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_mul_dx9_zero_f32_e32 v0, v0, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0
 ; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, 0x42800000, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_add_f32_e32 v0, v0, v1
-; GFX11-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x1f800000, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, 0xffffffc0, vcc_lo
 ; GFX11-NEXT:    v_exp_f32_e32 v0, v0
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
-; GFX11-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX11-NEXT:    v_ldexp_f32 v0, v0, v1
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %neg.x = fneg float %x
   %pow = call float @llvm.pow.f32(float %neg.x, float %y)
@@ -1699,10 +1731,10 @@ define float @v_pow_f32_fneg_rhs(float %x, float %y) {
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_mov_b32_e32 v2, 0x800000
-; GFX6-NEXT:    v_mov_b32_e32 v3, 0x4f800000
 ; GFX6-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v2
-; GFX6-NEXT:    v_cndmask_b32_e32 v2, 1.0, v3, vcc
-; GFX6-NEXT:    v_mul_f32_e32 v0, v0, v2
+; GFX6-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 5, v2
+; GFX6-NEXT:    v_ldexp_f32_e32 v0, v0, v2
 ; GFX6-NEXT:    v_log_f32_e32 v0, v0
 ; GFX6-NEXT:    v_mov_b32_e32 v2, 0x42000000
 ; GFX6-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
@@ -1714,19 +1746,19 @@ define float @v_pow_f32_fneg_rhs(float %x, float %y) {
 ; GFX6-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
 ; GFX6-NEXT:    v_add_f32_e32 v0, v0, v1
 ; GFX6-NEXT:    v_exp_f32_e32 v0, v0
-; GFX6-NEXT:    v_mov_b32_e32 v1, 0x1f800000
-; GFX6-NEXT:    v_cndmask_b32_e32 v1, 1.0, v1, vcc
-; GFX6-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX6-NEXT:    v_not_b32_e32 v1, 63
+; GFX6-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX6-NEXT:    v_ldexp_f32_e32 v0, v0, v1
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_pow_f32_fneg_rhs:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v2, 0x800000
-; GFX8-NEXT:    v_mov_b32_e32 v3, 0x4f800000
 ; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v2
-; GFX8-NEXT:    v_cndmask_b32_e32 v2, 1.0, v3, vcc
-; GFX8-NEXT:    v_mul_f32_e32 v0, v0, v2
+; GFX8-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 5, v2
+; GFX8-NEXT:    v_ldexp_f32 v0, v0, v2
 ; GFX8-NEXT:    v_log_f32_e32 v0, v0
 ; GFX8-NEXT:    v_mov_b32_e32 v2, 0x42000000
 ; GFX8-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
@@ -1738,19 +1770,19 @@ define float @v_pow_f32_fneg_rhs(float %x, float %y) {
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
 ; GFX8-NEXT:    v_add_f32_e32 v0, v0, v1
 ; GFX8-NEXT:    v_exp_f32_e32 v0, v0
-; GFX8-NEXT:    v_mov_b32_e32 v1, 0x1f800000
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, 1.0, v1, vcc
-; GFX8-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX8-NEXT:    v_not_b32_e32 v1, 63
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX8-NEXT:    v_ldexp_f32 v0, v0, v1
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_pow_f32_fneg_rhs:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0x800000
-; GFX9-NEXT:    v_mov_b32_e32 v3, 0x4f800000
 ; GFX9-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v2
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, 1.0, v3, vcc
-; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v2
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 5, v2
+; GFX9-NEXT:    v_ldexp_f32 v0, v0, v2
 ; GFX9-NEXT:    v_log_f32_e32 v0, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0x42000000
 ; GFX9-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
@@ -1762,17 +1794,18 @@ define float @v_pow_f32_fneg_rhs(float %x, float %y) {
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
 ; GFX9-NEXT:    v_add_f32_e32 v0, v0, v1
 ; GFX9-NEXT:    v_exp_f32_e32 v0, v0
-; GFX9-NEXT:    v_mov_b32_e32 v1, 0x1f800000
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, 1.0, v1, vcc
-; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX9-NEXT:    v_not_b32_e32 v1, 63
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX9-NEXT:    v_ldexp_f32 v0, v0, v1
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_pow_f32_fneg_rhs:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
-; GFX10-NEXT:    v_cndmask_b32_e64 v2, 1.0, 0x4f800000, vcc_lo
-; GFX10-NEXT:    v_mul_f32_e32 v0, v0, v2
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc_lo
+; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 5, v2
+; GFX10-NEXT:    v_ldexp_f32 v0, v0, v2
 ; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, 0x42000000, vcc_lo
 ; GFX10-NEXT:    v_log_f32_e32 v0, v0
 ; GFX10-NEXT:    v_sub_f32_e32 v0, v0, v2
@@ -1780,32 +1813,34 @@ define float @v_pow_f32_fneg_rhs(float %x, float %y) {
 ; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0, 0x42800000, vcc_lo
 ; GFX10-NEXT:    v_add_f32_e32 v0, v0, v1
-; GFX10-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x1f800000, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0, 0xffffffc0, vcc_lo
 ; GFX10-NEXT:    v_exp_f32_e32 v0, v0
-; GFX10-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX10-NEXT:    v_ldexp_f32 v0, v0, v1
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_pow_f32_fneg_rhs:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
-; GFX11-NEXT:    v_cndmask_b32_e64 v2, 1.0, 0x4f800000, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_mul_f32_e32 v0, v0, v2
+; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 5, v2
+; GFX11-NEXT:    v_ldexp_f32 v0, v0, v2
 ; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 0x42000000, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_log_f32_e32 v0, v0
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX11-NEXT:    v_sub_f32_e32 v0, v0, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_mul_dx9_zero_f32_e64 v0, v0, -v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0
 ; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, 0x42800000, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_add_f32_e32 v0, v0, v1
-; GFX11-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x1f800000, vcc_lo
+; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, 0xffffffc0, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-NEXT:    v_exp_f32_e32 v0, v0
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
-; GFX11-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX11-NEXT:    v_ldexp_f32 v0, v0, v1
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %neg.y = fneg float %y
   %pow = call float @llvm.pow.f32(float %x, float %neg.y)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.fcmp.constants.w32.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.fcmp.constants.w32.mir
index 55015c6d..cdb67ca 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.fcmp.constants.w32.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.fcmp.constants.w32.mir
@@ -20,8 +20,8 @@ body: |
     ; GFX11-TRUE16-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_CVT_F16_F32_t16_e64_]]
     ; GFX11-TRUE16-NEXT: [[V_CVT_F16_F32_t16_e64_1:%[0-9]+]]:vgpr_16 = nofpexcept V_CVT_F16_F32_t16_e64 0, [[COPY1]], 0, 0, 0, implicit $mode, implicit $exec
     ; GFX11-TRUE16-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[V_CVT_F16_F32_t16_e64_1]]
-    ; GFX11-TRUE16-NEXT: [[V_CMP_F_F16_t16_e64_:%[0-9]+]]:sreg_32 = V_CMP_F_F16_t16_e64 0, [[COPY2]], 0, [[COPY3]], 0, implicit $mode, implicit $exec
-    ; GFX11-TRUE16-NEXT: S_ENDPGM 0, implicit [[V_CMP_F_F16_t16_e64_]]
+    ; GFX11-TRUE16-NEXT: [[V_CMP_F_F16_fake16_e64_:%[0-9]+]]:sreg_32 = V_CMP_F_F16_fake16_e64 0, [[COPY2]], 0, [[COPY3]], 0, implicit $mode, implicit $exec
+    ; GFX11-TRUE16-NEXT: S_ENDPGM 0, implicit [[V_CMP_F_F16_fake16_e64_]]
     ;
     ; GFX11-FAKE16-LABEL: name: fcmp_false_f16
     ; GFX11-FAKE16: liveins: $vgpr0, $vgpr1
@@ -58,8 +58,8 @@ body: |
     ; GFX11-TRUE16-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_CVT_F16_F32_t16_e64_]]
     ; GFX11-TRUE16-NEXT: [[V_CVT_F16_F32_t16_e64_1:%[0-9]+]]:vgpr_16 = nofpexcept V_CVT_F16_F32_t16_e64 0, [[COPY1]], 0, 0, 0, implicit $mode, implicit $exec
     ; GFX11-TRUE16-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[V_CVT_F16_F32_t16_e64_1]]
-    ; GFX11-TRUE16-NEXT: [[V_CMP_TRU_F16_t16_e64_:%[0-9]+]]:sreg_32 = V_CMP_TRU_F16_t16_e64 0, [[COPY2]], 0, [[COPY3]], 0, implicit $mode, implicit $exec
-    ; GFX11-TRUE16-NEXT: S_ENDPGM 0, implicit [[V_CMP_TRU_F16_t16_e64_]]
+    ; GFX11-TRUE16-NEXT: [[V_CMP_TRU_F16_fake16_e64_:%[0-9]+]]:sreg_32 = V_CMP_TRU_F16_fake16_e64 0, [[COPY2]], 0, [[COPY3]], 0, implicit $mode, implicit $exec
+    ; GFX11-TRUE16-NEXT: S_ENDPGM 0, implicit [[V_CMP_TRU_F16_fake16_e64_]]
     ;
     ; GFX11-FAKE16-LABEL: name: fcmp_true_f16
     ; GFX11-FAKE16: liveins: $vgpr0, $vgpr1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.fcmp.constants.w64.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.fcmp.constants.w64.mir
index 4241f94..ed811d3 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.fcmp.constants.w64.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.fcmp.constants.w64.mir
@@ -20,8 +20,8 @@ body: |
     ; GFX11-TRUE16-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_CVT_F16_F32_t16_e64_]]
     ; GFX11-TRUE16-NEXT: [[V_CVT_F16_F32_t16_e64_1:%[0-9]+]]:vgpr_16 = nofpexcept V_CVT_F16_F32_t16_e64 0, [[COPY1]], 0, 0, 0, implicit $mode, implicit $exec
     ; GFX11-TRUE16-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[V_CVT_F16_F32_t16_e64_1]]
-    ; GFX11-TRUE16-NEXT: [[V_CMP_F_F16_t16_e64_:%[0-9]+]]:sreg_64 = V_CMP_F_F16_t16_e64 0, [[COPY2]], 0, [[COPY3]], 0, implicit $mode, implicit $exec
-    ; GFX11-TRUE16-NEXT: S_ENDPGM 0, implicit [[V_CMP_F_F16_t16_e64_]]
+    ; GFX11-TRUE16-NEXT: [[V_CMP_F_F16_fake16_e64_:%[0-9]+]]:sreg_64 = V_CMP_F_F16_fake16_e64 0, [[COPY2]], 0, [[COPY3]], 0, implicit $mode, implicit $exec
+    ; GFX11-TRUE16-NEXT: S_ENDPGM 0, implicit [[V_CMP_F_F16_fake16_e64_]]
     ;
     ; GFX11-FAKE16-LABEL: name: fcmp_false_f16
     ; GFX11-FAKE16: liveins: $vgpr0, $vgpr1
@@ -58,8 +58,8 @@ body: |
     ; GFX11-TRUE16-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_CVT_F16_F32_t16_e64_]]
     ; GFX11-TRUE16-NEXT: [[V_CVT_F16_F32_t16_e64_1:%[0-9]+]]:vgpr_16 = nofpexcept V_CVT_F16_F32_t16_e64 0, [[COPY1]], 0, 0, 0, implicit $mode, implicit $exec
     ; GFX11-TRUE16-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[V_CVT_F16_F32_t16_e64_1]]
-    ; GFX11-TRUE16-NEXT: [[V_CMP_TRU_F16_t16_e64_:%[0-9]+]]:sreg_64 = V_CMP_TRU_F16_t16_e64 0, [[COPY2]], 0, [[COPY3]], 0, implicit $mode, implicit $exec
-    ; GFX11-TRUE16-NEXT: S_ENDPGM 0, implicit [[V_CMP_TRU_F16_t16_e64_]]
+    ; GFX11-TRUE16-NEXT: [[V_CMP_TRU_F16_fake16_e64_:%[0-9]+]]:sreg_64 = V_CMP_TRU_F16_fake16_e64 0, [[COPY2]], 0, [[COPY3]], 0, implicit $mode, implicit $exec
+    ; GFX11-TRUE16-NEXT: S_ENDPGM 0, implicit [[V_CMP_TRU_F16_fake16_e64_]]
     ;
     ; GFX11-FAKE16-LABEL: name: fcmp_true_f16
     ; GFX11-FAKE16: liveins: $vgpr0, $vgpr1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.powi.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.powi.ll
index eeb7b13..fe002d6 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.powi.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.powi.ll
@@ -18,9 +18,9 @@ define i16 @v_powi_f16(i16 %l, i32 %r) {
 ; GFX7-NEXT:    v_cndmask_b32_e32 v1, 0, v3, vcc
 ; GFX7-NEXT:    v_add_f32_e32 v0, v0, v1
 ; GFX7-NEXT:    v_exp_f32_e32 v0, v0
-; GFX7-NEXT:    v_mov_b32_e32 v1, 0x1f800000
-; GFX7-NEXT:    v_cndmask_b32_e32 v1, 1.0, v1, vcc
-; GFX7-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX7-NEXT:    v_not_b32_e32 v1, 63
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX7-NEXT:    v_ldexp_f32_e32 v0, v0, v1
 ; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -75,53 +75,80 @@ define i16 @v_powi_f16(i16 %l, i32 %r) {
 }
 
 define float @v_powi_f32(float %l, i32 %r) {
-; GFX78-LABEL: v_powi_f32:
-; GFX78:       ; %bb.0:
-; GFX78-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX78-NEXT:    v_mov_b32_e32 v2, 0x800000
-; GFX78-NEXT:    v_mov_b32_e32 v3, 0x4f800000
-; GFX78-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v2
-; GFX78-NEXT:    v_cndmask_b32_e32 v2, 1.0, v3, vcc
-; GFX78-NEXT:    v_mul_f32_e32 v0, v0, v2
-; GFX78-NEXT:    v_log_f32_e32 v0, v0
-; GFX78-NEXT:    v_cvt_f32_i32_e32 v1, v1
-; GFX78-NEXT:    v_mov_b32_e32 v2, 0x42000000
-; GFX78-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
-; GFX78-NEXT:    v_sub_f32_e32 v0, v0, v2
-; GFX78-NEXT:    v_mul_legacy_f32_e32 v0, v0, v1
-; GFX78-NEXT:    v_mov_b32_e32 v1, 0xc2fc0000
-; GFX78-NEXT:    v_mov_b32_e32 v2, 0x42800000
-; GFX78-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
-; GFX78-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
-; GFX78-NEXT:    v_add_f32_e32 v0, v0, v1
-; GFX78-NEXT:    v_exp_f32_e32 v0, v0
-; GFX78-NEXT:    v_mov_b32_e32 v1, 0x1f800000
-; GFX78-NEXT:    v_cndmask_b32_e32 v1, 1.0, v1, vcc
-; GFX78-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX78-NEXT:    s_setpc_b64 s[30:31]
+; GFX7-LABEL: v_powi_f32:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v2, 0x800000
+; GFX7-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v2
+; GFX7-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 5, v2
+; GFX7-NEXT:    v_ldexp_f32_e32 v0, v0, v2
+; GFX7-NEXT:    v_log_f32_e32 v0, v0
+; GFX7-NEXT:    v_cvt_f32_i32_e32 v1, v1
+; GFX7-NEXT:    v_mov_b32_e32 v2, 0x42000000
+; GFX7-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
+; GFX7-NEXT:    v_sub_f32_e32 v0, v0, v2
+; GFX7-NEXT:    v_mul_legacy_f32_e32 v0, v0, v1
+; GFX7-NEXT:    v_mov_b32_e32 v1, 0xc2fc0000
+; GFX7-NEXT:    v_mov_b32_e32 v2, 0x42800000
+; GFX7-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
+; GFX7-NEXT:    v_add_f32_e32 v0, v0, v1
+; GFX7-NEXT:    v_exp_f32_e32 v0, v0
+; GFX7-NEXT:    v_not_b32_e32 v1, 63
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX7-NEXT:    v_ldexp_f32_e32 v0, v0, v1
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_powi_f32:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_mov_b32_e32 v2, 0x800000
+; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v2
+; GFX8-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 5, v2
+; GFX8-NEXT:    v_ldexp_f32 v0, v0, v2
+; GFX8-NEXT:    v_log_f32_e32 v0, v0
+; GFX8-NEXT:    v_cvt_f32_i32_e32 v1, v1
+; GFX8-NEXT:    v_mov_b32_e32 v2, 0x42000000
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
+; GFX8-NEXT:    v_sub_f32_e32 v0, v0, v2
+; GFX8-NEXT:    v_mul_legacy_f32_e32 v0, v0, v1
+; GFX8-NEXT:    v_mov_b32_e32 v1, 0xc2fc0000
+; GFX8-NEXT:    v_mov_b32_e32 v2, 0x42800000
+; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
+; GFX8-NEXT:    v_add_f32_e32 v0, v0, v1
+; GFX8-NEXT:    v_exp_f32_e32 v0, v0
+; GFX8-NEXT:    v_not_b32_e32 v1, 63
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX8-NEXT:    v_ldexp_f32 v0, v0, v1
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_powi_f32:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
 ; GFX11-NEXT:    v_cvt_f32_i32_e32 v1, v1
-; GFX11-NEXT:    v_cndmask_b32_e64 v2, 1.0, 0x4f800000, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_mul_f32_e32 v0, v0, v2
+; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 5, v2
+; GFX11-NEXT:    v_ldexp_f32 v0, v0, v2
 ; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 0x42000000, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_log_f32_e32 v0, v0
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX11-NEXT:    v_sub_f32_e32 v0, v0, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_mul_dx9_zero_f32_e32 v0, v0, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0
 ; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, 0x42800000, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_add_f32_e32 v0, v0, v1
-; GFX11-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x1f800000, vcc_lo
+; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, 0xffffffc0, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-NEXT:    v_exp_f32_e32 v0, v0
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
-; GFX11-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX11-NEXT:    v_ldexp_f32 v0, v0, v1
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %res = call float @llvm.powi.f32.i32(float %l, i32 %r)
   ret float %res
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-dyn-stackalloc.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-dyn-stackalloc.mir
index 5378ce2..10517a4 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-dyn-stackalloc.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-dyn-stackalloc.mir
@@ -491,3 +491,132 @@ body: |
     %1:_(p5) = G_DYN_STACKALLOC %0, 32
     S_ENDPGM 0, implicit %1
 ...
+
+---
+name: test_dyn_stackalloc_vgpr_align4
+legalized:       true
+frameInfo:
+  maxAlignment: 4
+stack:
+  - { id: 0, type: variable-sized, alignment: 4 }
+body: |
+  bb.0:
+    liveins: $vgpr0
+
+    ; WAVE64-LABEL: name: test_dyn_stackalloc_vgpr_align4
+    ; WAVE64: liveins: $vgpr0
+    ; WAVE64-NEXT: {{  $}}
+    ; WAVE64-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
+    ; WAVE64-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.wave.reduce.umax), [[COPY]](s32), 0
+    ; WAVE64-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 6
+    ; WAVE64-NEXT: [[SHL:%[0-9]+]]:sgpr(s32) = G_SHL [[INTRINSIC_CONVERGENT]], [[C]](s32)
+    ; WAVE64-NEXT: [[COPY1:%[0-9]+]]:sgpr(p5) = COPY $sp_reg
+    ; WAVE64-NEXT: [[COPY2:%[0-9]+]]:sgpr(p5) = COPY [[COPY1]](p5)
+    ; WAVE64-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p5) = G_PTR_ADD [[COPY2]], [[SHL]](s32)
+    ; WAVE64-NEXT: $sp_reg = COPY [[PTR_ADD]](p5)
+    ; WAVE64-NEXT: S_ENDPGM 0, implicit [[COPY2]](p5)
+    ;
+    ; WAVE32-LABEL: name: test_dyn_stackalloc_vgpr_align4
+    ; WAVE32: liveins: $vgpr0
+    ; WAVE32-NEXT: {{  $}}
+    ; WAVE32-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
+    ; WAVE32-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.wave.reduce.umax), [[COPY]](s32), 0
+    ; WAVE32-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 5
+    ; WAVE32-NEXT: [[SHL:%[0-9]+]]:sgpr(s32) = G_SHL [[INTRINSIC_CONVERGENT]], [[C]](s32)
+    ; WAVE32-NEXT: [[COPY1:%[0-9]+]]:sgpr(p5) = COPY $sp_reg
+    ; WAVE32-NEXT: [[COPY2:%[0-9]+]]:sgpr(p5) = COPY [[COPY1]](p5)
+    ; WAVE32-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p5) = G_PTR_ADD [[COPY2]], [[SHL]](s32)
+    ; WAVE32-NEXT: $sp_reg = COPY [[PTR_ADD]](p5)
+    ; WAVE32-NEXT: S_ENDPGM 0, implicit [[COPY2]](p5)
+    %0:_(s32) = COPY $vgpr0
+    %1:_(p5) = G_DYN_STACKALLOC %0, 4
+    S_ENDPGM 0, implicit %1
+...
+
+---
+name: test_dyn_stackalloc_vgpr_align16
+legalized:       true
+frameInfo:
+  maxAlignment: 16
+stack:
+  - { id: 0, type: variable-sized, alignment: 16 }
+body: |
+  bb.0:
+    liveins: $vgpr0
+
+    ; WAVE64-LABEL: name: test_dyn_stackalloc_vgpr_align16
+    ; WAVE64: liveins: $vgpr0
+    ; WAVE64-NEXT: {{  $}}
+    ; WAVE64-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
+    ; WAVE64-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.wave.reduce.umax), [[COPY]](s32), 0
+    ; WAVE64-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 6
+    ; WAVE64-NEXT: [[SHL:%[0-9]+]]:sgpr(s32) = G_SHL [[INTRINSIC_CONVERGENT]], [[C]](s32)
+    ; WAVE64-NEXT: [[COPY1:%[0-9]+]]:sgpr(p5) = COPY $sp_reg
+    ; WAVE64-NEXT: [[COPY2:%[0-9]+]]:sgpr(p5) = COPY [[COPY1]](p5)
+    ; WAVE64-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p5) = G_PTR_ADD [[COPY2]], [[SHL]](s32)
+    ; WAVE64-NEXT: $sp_reg = COPY [[PTR_ADD]](p5)
+    ; WAVE64-NEXT: S_ENDPGM 0, implicit [[COPY2]](p5)
+    ;
+    ; WAVE32-LABEL: name: test_dyn_stackalloc_vgpr_align16
+    ; WAVE32: liveins: $vgpr0
+    ; WAVE32-NEXT: {{  $}}
+    ; WAVE32-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
+    ; WAVE32-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.wave.reduce.umax), [[COPY]](s32), 0
+    ; WAVE32-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 5
+    ; WAVE32-NEXT: [[SHL:%[0-9]+]]:sgpr(s32) = G_SHL [[INTRINSIC_CONVERGENT]], [[C]](s32)
+    ; WAVE32-NEXT: [[COPY1:%[0-9]+]]:sgpr(p5) = COPY $sp_reg
+    ; WAVE32-NEXT: [[COPY2:%[0-9]+]]:sgpr(p5) = COPY [[COPY1]](p5)
+    ; WAVE32-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p5) = G_PTR_ADD [[COPY2]], [[SHL]](s32)
+    ; WAVE32-NEXT: $sp_reg = COPY [[PTR_ADD]](p5)
+    ; WAVE32-NEXT: S_ENDPGM 0, implicit [[COPY2]](p5)
+    %0:_(s32) = COPY $vgpr0
+    %1:_(p5) = G_DYN_STACKALLOC %0, 16
+    S_ENDPGM 0, implicit %1
+...
+
+---
+name: test_dyn_stackalloc_vgpr_align64
+legalized:       true
+frameInfo:
+  maxAlignment: 64
+stack:
+  - { id: 0, type: variable-sized, alignment: 64 }
+body: |
+  bb.0:
+    liveins: $vgpr0
+
+    ; WAVE64-LABEL: name: test_dyn_stackalloc_vgpr_align64
+    ; WAVE64: liveins: $vgpr0
+    ; WAVE64-NEXT: {{  $}}
+    ; WAVE64-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
+    ; WAVE64-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.wave.reduce.umax), [[COPY]](s32), 0
+    ; WAVE64-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 6
+    ; WAVE64-NEXT: [[SHL:%[0-9]+]]:sgpr(s32) = G_SHL [[INTRINSIC_CONVERGENT]], [[C]](s32)
+    ; WAVE64-NEXT: [[COPY1:%[0-9]+]]:sgpr(p5) = COPY $sp_reg
+    ; WAVE64-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4095
+    ; WAVE64-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p5) = G_PTR_ADD [[COPY1]], [[C1]](s32)
+    ; WAVE64-NEXT: [[C2:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 -4096
+    ; WAVE64-NEXT: [[PTRMASK:%[0-9]+]]:sgpr(p5) = G_PTRMASK [[PTR_ADD]], [[C2]](s32)
+    ; WAVE64-NEXT: [[PTR_ADD1:%[0-9]+]]:sgpr(p5) = G_PTR_ADD [[PTRMASK]], [[SHL]](s32)
+    ; WAVE64-NEXT: $sp_reg = COPY [[PTR_ADD1]](p5)
+    ; WAVE64-NEXT: S_ENDPGM 0, implicit [[PTRMASK]](p5)
+    ;
+    ; WAVE32-LABEL: name: test_dyn_stackalloc_vgpr_align64
+    ; WAVE32: liveins: $vgpr0
+    ; WAVE32-NEXT: {{  $}}
+    ; WAVE32-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
+    ; WAVE32-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.wave.reduce.umax), [[COPY]](s32), 0
+    ; WAVE32-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 5
+    ; WAVE32-NEXT: [[SHL:%[0-9]+]]:sgpr(s32) = G_SHL [[INTRINSIC_CONVERGENT]], [[C]](s32)
+    ; WAVE32-NEXT: [[COPY1:%[0-9]+]]:sgpr(p5) = COPY $sp_reg
+    ; WAVE32-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 2047
+    ; WAVE32-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p5) = G_PTR_ADD [[COPY1]], [[C1]](s32)
+    ; WAVE32-NEXT: [[C2:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 -2048
+    ; WAVE32-NEXT: [[PTRMASK:%[0-9]+]]:sgpr(p5) = G_PTRMASK [[PTR_ADD]], [[C2]](s32)
+    ; WAVE32-NEXT: [[PTR_ADD1:%[0-9]+]]:sgpr(p5) = G_PTR_ADD [[PTRMASK]], [[SHL]](s32)
+    ; WAVE32-NEXT: $sp_reg = COPY [[PTR_ADD1]](p5)
+    ; WAVE32-NEXT: S_ENDPGM 0, implicit [[PTRMASK]](p5)
+    %0:_(s32) = COPY $vgpr0
+    %1:_(p5) = G_DYN_STACKALLOC %0, 64
+    S_ENDPGM 0, implicit %1
+...
diff --git a/llvm/test/CodeGen/AMDGPU/add64-low-32-bits-known-zero.ll b/llvm/test/CodeGen/AMDGPU/add64-low-32-bits-known-zero.ll
new file mode 100644
index 0000000..52259c4
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/add64-low-32-bits-known-zero.ll
@@ -0,0 +1,193 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s
+
+; Reduce a 64-bit add by a constant if we know the low 32-bits are all
+; zero.
+
+; add i64:x, K if computeTrailingZeros(K) >= 32
+; => build_pair (add x.hi, K.hi), x.lo
+
+define amdgpu_ps i64 @s_add_i64_const_low_bits_known0_0(i64 inreg %reg) {
+; GFX9-LABEL: s_add_i64_const_low_bits_known0_0:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_add_i32 s1, s1, 0x40000
+; GFX9-NEXT:    ; return to shader part epilog
+  %add = add i64 %reg, 1125899906842624 ; (1 << 50)
+  ret i64 %add
+}
+
+define amdgpu_ps i64 @s_add_i64_const_low_bits_known0_1(i64 inreg %reg) {
+; GFX9-LABEL: s_add_i64_const_low_bits_known0_1:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_add_i32 s1, s1, 1
+; GFX9-NEXT:    ; return to shader part epilog
+  %add = add i64 %reg, 4294967296 ; (1 << 32)
+  ret i64 %add
+}
+
+define amdgpu_ps i64 @s_add_i64_const_low_bits_known0_2(i64 inreg %reg) {
+; GFX9-LABEL: s_add_i64_const_low_bits_known0_2:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_add_i32 s1, s1, 2
+; GFX9-NEXT:    ; return to shader part epilog
+  %add = add i64 %reg, 8589934592 ; (1 << 33)
+  ret i64 %add
+}
+
+define amdgpu_ps i64 @s_add_i64_const_low_bits_known0_3(i64 inreg %reg) {
+; GFX9-LABEL: s_add_i64_const_low_bits_known0_3:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_add_i32 s1, s1, 0x80000000
+; GFX9-NEXT:    ; return to shader part epilog
+  %add = add i64 %reg, -9223372036854775808 ; (1 << 63)
+  ret i64 %add
+}
+
+define amdgpu_ps i64 @s_add_i64_const_low_bits_known0_4(i64 inreg %reg) {
+; GFX9-LABEL: s_add_i64_const_low_bits_known0_4:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_add_i32 s1, s1, -1
+; GFX9-NEXT:    ; return to shader part epilog
+  %add = add i64 %reg, -4294967296 ; 0xffffffff00000000
+  ret i64 %add
+}
+
+define i64 @v_add_i64_const_low_bits_known0_0(i64 %reg) {
+; GFX9-LABEL: v_add_i64_const_low_bits_known0_0:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_add_u32_e32 v1, 0x40000, v1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+  %add = add i64 %reg, 1125899906842624 ; (1 << 50)
+  ret i64 %add
+}
+
+define i64 @v_add_i64_const_low_bits_known0_1(i64 %reg) {
+; GFX9-LABEL: v_add_i64_const_low_bits_known0_1:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_add_u32_e32 v1, 1, v1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+  %add = add i64 %reg, 4294967296 ; (1 << 32)
+  ret i64 %add
+}
+
+define i64 @v_add_i64_const_low_bits_known0_2(i64 %reg) {
+; GFX9-LABEL: v_add_i64_const_low_bits_known0_2:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_add_u32_e32 v1, 2, v1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+  %add = add i64 %reg, 8589934592 ; (1 << 33)
+  ret i64 %add
+}
+
+define i64 @v_add_i64_const_low_bits_known0_3(i64 %reg) {
+; GFX9-LABEL: v_add_i64_const_low_bits_known0_3:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_add_u32_e32 v1, 0x80000000, v1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+  %add = add i64 %reg, -9223372036854775808 ; (1 << 63)
+  ret i64 %add
+}
+
+define i64 @v_add_i64_const_low_bits_known0_4(i64 %reg) {
+; GFX9-LABEL: v_add_i64_const_low_bits_known0_4:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_add_u32_e32 v1, -1, v1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+  %add = add i64 %reg, -4294967296 ; 0xffffffff00000000
+  ret i64 %add
+}
+
+define amdgpu_ps i64 @s_add_i64_const_high_bits_known0_0(i64 inreg %reg) {
+; GFX9-LABEL: s_add_i64_const_high_bits_known0_0:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_add_u32 s0, s0, -1
+; GFX9-NEXT:    s_addc_u32 s1, s1, 0
+; GFX9-NEXT:    ; return to shader part epilog
+  %add = add i64 %reg, 4294967295 ; (1 << 31)
+  ret i64 %add
+}
+
+define i64 @v_add_i64_const_high_bits_known0_0(i64 %reg) {
+; GFX9-LABEL: v_add_i64_const_high_bits_known0_0:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, -1, v0
+; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+  %add = add i64 %reg, 4294967295 ; (1 << 31)
+  ret i64 %add
+}
+
+define <2 x i64> @v_add_v2i64_splat_const_low_bits_known0_0(<2 x i64> %reg) {
+; GFX9-LABEL: v_add_v2i64_splat_const_low_bits_known0_0:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_add_u32_e32 v1, 1, v1
+; GFX9-NEXT:    v_add_u32_e32 v3, 1, v3
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+  %add = add <2 x i64> %reg, <i64 4294967296, i64 4294967296> ; (1 << 32)
+  ret <2 x i64> %add
+}
+
+define <2 x i64> @v_add_v2i64_nonsplat_const_low_bits_known0_0(<2 x i64> %reg) {
+; GFX9-LABEL: v_add_v2i64_nonsplat_const_low_bits_known0_0:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_add_u32_e32 v1, 1, v1
+; GFX9-NEXT:    v_add_u32_e32 v3, 2, v3
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+  %add = add <2 x i64> %reg, <i64 4294967296, i64 8589934592> ; (1 << 32), (1 << 33)
+  ret <2 x i64> %add
+}
+
+define amdgpu_ps <2 x i64> @s_add_v2i64_splat_const_low_bits_known0_0(<2 x i64> inreg %reg) {
+; GFX9-LABEL: s_add_v2i64_splat_const_low_bits_known0_0:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_add_i32 s1, s1, 1
+; GFX9-NEXT:    s_add_i32 s3, s3, 1
+; GFX9-NEXT:    ; return to shader part epilog
+  %add = add <2 x i64> %reg, <i64 4294967296, i64 4294967296> ; (1 << 32)
+  ret <2 x i64> %add
+}
+
+define amdgpu_ps <2 x i64> @s_add_v2i64_nonsplat_const_low_bits_known0_0(<2 x i64> inreg %reg) {
+; GFX9-LABEL: s_add_v2i64_nonsplat_const_low_bits_known0_0:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_add_i32 s1, s1, 1
+; GFX9-NEXT:    s_add_i32 s3, s3, 2
+; GFX9-NEXT:    ; return to shader part epilog
+  %add = add <2 x i64> %reg, <i64 4294967296, i64 8589934592> ; (1 << 32), (1 << 33)
+  ret <2 x i64> %add
+}
+
+; We could reduce this to use a 32-bit add if we use computeKnownBits
+define i64 @v_add_i64_variable_high_bits_known0_0(i64 %reg, i32 %offset.hi32) {
+; GFX9-LABEL: v_add_i64_variable_high_bits_known0_0:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0, v0
+; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v2, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+  %zext.offset.hi32 = zext i32 %offset.hi32 to i64
+  %in.high.bits = shl i64 %zext.offset.hi32, 32
+  %add = add i64 %reg, %in.high.bits
+  ret i64 %add
+}
+
+; We could reduce this to use a 32-bit add if we use computeKnownBits
+define amdgpu_ps i64 @s_add_i64_variable_high_bits_known0_0(i64 inreg %reg, i32 inreg %offset.hi32) {
+; GFX9-LABEL: s_add_i64_variable_high_bits_known0_0:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_add_u32 s0, s0, 0
+; GFX9-NEXT:    s_addc_u32 s1, s1, s2
+; GFX9-NEXT:    ; return to shader part epilog
+  %zext.offset.hi32 = zext i32 %offset.hi32 to i64
+  %in.high.bits = shl i64 %zext.offset.hi32, 32
+  %add = add i64 %reg, %in.high.bits
+  ret i64 %add
+}
diff --git a/llvm/test/CodeGen/AMDGPU/dagcombine-fmul-sel.ll b/llvm/test/CodeGen/AMDGPU/dagcombine-fmul-sel.ll
index 5b72795..b128be2 100644
--- a/llvm/test/CodeGen/AMDGPU/dagcombine-fmul-sel.ll
+++ b/llvm/test/CodeGen/AMDGPU/dagcombine-fmul-sel.ll
@@ -1,8 +1,12 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-;RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 < %s | FileCheck -check-prefix=GFX7 %s
-;RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s
-;RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 < %s | FileCheck -check-prefix=GFX1030 %s
-;RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck -check-prefix=GFX1100 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 < %s | FileCheck -check-prefixes=GFX7,GFX7-SDAG %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 < %s | FileCheck -check-prefixes=GFX7,GFX7-GISEL %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9,GFX9-SDAG %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9,GFX9-GISEL %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 < %s | FileCheck -check-prefixes=GFX10,GFX10-SDAG %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 < %s | FileCheck -check-prefixes=GFX10,GFX10-GISEL %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11,GFX11-SDAG %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11,GFX11-GISEL %s
 
 define float @fmul_select_f32_test1(float %x, i32 %bool.arg1, i32 %bool.arg2) {
 ; GFX7-LABEL: fmul_select_f32_test1:
@@ -21,22 +25,22 @@ define float @fmul_select_f32_test1(float %x, i32 %bool.arg1, i32 %bool.arg2) {
 ; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v1
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX1030-LABEL: fmul_select_f32_test1:
-; GFX1030:       ; %bb.0:
-; GFX1030-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1030-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX1030-NEXT:    v_cndmask_b32_e64 v1, 1.0, 2.0, vcc_lo
-; GFX1030-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX1030-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX1100-LABEL: fmul_select_f32_test1:
-; GFX1100:       ; %bb.0:
-; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX1100-NEXT:    v_cndmask_b32_e64 v1, 1.0, 2.0, vcc_lo
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1100-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX1100-NEXT:    s_setpc_b64 s[30:31]
+; GFX10-LABEL: fmul_select_f32_test1:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, 1.0, 2.0, vcc_lo
+; GFX10-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: fmul_select_f32_test1:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-NEXT:    v_cndmask_b32_e64 v1, 1.0, 2.0, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %bool = icmp eq i32 %bool.arg1, %bool.arg2
   %y = select i1 %bool, float 2.000000e+00, float 1.000000e+00
   %ldexp = fmul float %x, %y
@@ -60,22 +64,22 @@ define float @fmul_select_f32_test2(float %x, i32 %bool.arg1, i32 %bool.arg2) {
 ; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v1
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX1030-LABEL: fmul_select_f32_test2:
-; GFX1030:       ; %bb.0:
-; GFX1030-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1030-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX1030-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0.5, vcc_lo
-; GFX1030-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX1030-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX1100-LABEL: fmul_select_f32_test2:
-; GFX1100:       ; %bb.0:
-; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX1100-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0.5, vcc_lo
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1100-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX1100-NEXT:    s_setpc_b64 s[30:31]
+; GFX10-LABEL: fmul_select_f32_test2:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0.5, vcc_lo
+; GFX10-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: fmul_select_f32_test2:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0.5, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %bool = icmp eq i32 %bool.arg1, %bool.arg2
   %y = select i1 %bool, float 5.000000e-01, float 1.000000e+00
   %ldexp = fmul float %x, %y
@@ -83,49 +87,71 @@ define float @fmul_select_f32_test2(float %x, i32 %bool.arg1, i32 %bool.arg2) {
 }
 
 define <2 x float> @fmul_select_v2f32_test3(<2 x float> %x, <2 x i32> %bool.arg1, <2 x i32> %bool.arg2) {
-; GFX7-LABEL: fmul_select_v2f32_test3:
-; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
-; GFX7-NEXT:    v_cndmask_b32_e64 v3, 1.0, 2.0, vcc
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v4
-; GFX7-NEXT:    v_cndmask_b32_e64 v2, 1.0, 2.0, vcc
-; GFX7-NEXT:    v_mul_f32_e32 v0, v0, v2
-; GFX7-NEXT:    v_mul_f32_e32 v1, v1, v3
-; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: fmul_select_v2f32_test3:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
-; GFX9-NEXT:    v_cndmask_b32_e64 v3, 1.0, 2.0, vcc
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v4
-; GFX9-NEXT:    v_cndmask_b32_e64 v2, 1.0, 2.0, vcc
-; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v2
-; GFX9-NEXT:    v_mul_f32_e32 v1, v1, v3
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX1030-LABEL: fmul_select_v2f32_test3:
-; GFX1030:       ; %bb.0:
-; GFX1030-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1030-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v4
-; GFX1030-NEXT:    v_cndmask_b32_e64 v2, 1.0, 2.0, vcc_lo
-; GFX1030-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v5
-; GFX1030-NEXT:    v_mul_f32_e32 v0, v0, v2
-; GFX1030-NEXT:    v_cndmask_b32_e64 v3, 1.0, 2.0, vcc_lo
-; GFX1030-NEXT:    v_mul_f32_e32 v1, v1, v3
-; GFX1030-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX1100-LABEL: fmul_select_v2f32_test3:
-; GFX1100:       ; %bb.0:
-; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v4
-; GFX1100-NEXT:    v_cndmask_b32_e64 v2, 1.0, 2.0, vcc_lo
-; GFX1100-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v5
-; GFX1100-NEXT:    v_cndmask_b32_e64 v3, 1.0, 2.0, vcc_lo
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1100-NEXT:    v_dual_mul_f32 v0, v0, v2 :: v_dual_mul_f32 v1, v1, v3
-; GFX1100-NEXT:    s_setpc_b64 s[30:31]
+; GFX7-SDAG-LABEL: fmul_select_v2f32_test3:
+; GFX7-SDAG:       ; %bb.0:
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX7-SDAG-NEXT:    v_cndmask_b32_e64 v3, 1.0, 2.0, vcc
+; GFX7-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v4
+; GFX7-SDAG-NEXT:    v_cndmask_b32_e64 v2, 1.0, 2.0, vcc
+; GFX7-SDAG-NEXT:    v_mul_f32_e32 v0, v0, v2
+; GFX7-SDAG-NEXT:    v_mul_f32_e32 v1, v1, v3
+; GFX7-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-GISEL-LABEL: fmul_select_v2f32_test3:
+; GFX7-GISEL:       ; %bb.0:
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v4
+; GFX7-GISEL-NEXT:    v_cndmask_b32_e64 v2, 1.0, 2.0, vcc
+; GFX7-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX7-GISEL-NEXT:    v_cndmask_b32_e64 v3, 1.0, 2.0, vcc
+; GFX7-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v2
+; GFX7-GISEL-NEXT:    v_mul_f32_e32 v1, v1, v3
+; GFX7-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-SDAG-LABEL: fmul_select_v2f32_test3:
+; GFX9-SDAG:       ; %bb.0:
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX9-SDAG-NEXT:    v_cndmask_b32_e64 v3, 1.0, 2.0, vcc
+; GFX9-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v4
+; GFX9-SDAG-NEXT:    v_cndmask_b32_e64 v2, 1.0, 2.0, vcc
+; GFX9-SDAG-NEXT:    v_mul_f32_e32 v0, v0, v2
+; GFX9-SDAG-NEXT:    v_mul_f32_e32 v1, v1, v3
+; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: fmul_select_v2f32_test3:
+; GFX9-GISEL:       ; %bb.0:
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v4
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v2, 1.0, 2.0, vcc
+; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v3, 1.0, 2.0, vcc
+; GFX9-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v2
+; GFX9-GISEL-NEXT:    v_mul_f32_e32 v1, v1, v3
+; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: fmul_select_v2f32_test3:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v4
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, 1.0, 2.0, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v5
+; GFX10-NEXT:    v_mul_f32_e32 v0, v0, v2
+; GFX10-NEXT:    v_cndmask_b32_e64 v3, 1.0, 2.0, vcc_lo
+; GFX10-NEXT:    v_mul_f32_e32 v1, v1, v3
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: fmul_select_v2f32_test3:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v4
+; GFX11-NEXT:    v_cndmask_b32_e64 v2, 1.0, 2.0, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v5
+; GFX11-NEXT:    v_cndmask_b32_e64 v3, 1.0, 2.0, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_dual_mul_f32 v0, v0, v2 :: v_dual_mul_f32 v1, v1, v3
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %bool = icmp eq <2 x i32> %bool.arg1, %bool.arg2
   %y = select <2 x i1> %bool, <2 x float> <float 2.000000e+00, float 2.000000e+00>, <2 x float> <float 1.000000e+00, float 1.000000e+00>
   %ldexp = fmul <2 x float> %x, %y
@@ -133,49 +159,71 @@ define <2 x float> @fmul_select_v2f32_test3(<2 x float> %x, <2 x i32> %bool.arg1
 }
 
 define <2 x float> @fmul_select_v2f32_test4(<2 x float> %x, <2 x i32> %bool.arg1, <2 x i32> %bool.arg2) {
-; GFX7-LABEL: fmul_select_v2f32_test4:
-; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
-; GFX7-NEXT:    v_cndmask_b32_e64 v3, 1.0, 0.5, vcc
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v4
-; GFX7-NEXT:    v_cndmask_b32_e64 v2, 1.0, 0.5, vcc
-; GFX7-NEXT:    v_mul_f32_e32 v0, v0, v2
-; GFX7-NEXT:    v_mul_f32_e32 v1, v1, v3
-; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: fmul_select_v2f32_test4:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
-; GFX9-NEXT:    v_cndmask_b32_e64 v3, 1.0, 0.5, vcc
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v4
-; GFX9-NEXT:    v_cndmask_b32_e64 v2, 1.0, 0.5, vcc
-; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v2
-; GFX9-NEXT:    v_mul_f32_e32 v1, v1, v3
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX1030-LABEL: fmul_select_v2f32_test4:
-; GFX1030:       ; %bb.0:
-; GFX1030-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1030-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v4
-; GFX1030-NEXT:    v_cndmask_b32_e64 v2, 1.0, 0.5, vcc_lo
-; GFX1030-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v5
-; GFX1030-NEXT:    v_mul_f32_e32 v0, v0, v2
-; GFX1030-NEXT:    v_cndmask_b32_e64 v3, 1.0, 0.5, vcc_lo
-; GFX1030-NEXT:    v_mul_f32_e32 v1, v1, v3
-; GFX1030-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX1100-LABEL: fmul_select_v2f32_test4:
-; GFX1100:       ; %bb.0:
-; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v4
-; GFX1100-NEXT:    v_cndmask_b32_e64 v2, 1.0, 0.5, vcc_lo
-; GFX1100-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v5
-; GFX1100-NEXT:    v_cndmask_b32_e64 v3, 1.0, 0.5, vcc_lo
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1100-NEXT:    v_dual_mul_f32 v0, v0, v2 :: v_dual_mul_f32 v1, v1, v3
-; GFX1100-NEXT:    s_setpc_b64 s[30:31]
+; GFX7-SDAG-LABEL: fmul_select_v2f32_test4:
+; GFX7-SDAG:       ; %bb.0:
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX7-SDAG-NEXT:    v_cndmask_b32_e64 v3, 1.0, 0.5, vcc
+; GFX7-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v4
+; GFX7-SDAG-NEXT:    v_cndmask_b32_e64 v2, 1.0, 0.5, vcc
+; GFX7-SDAG-NEXT:    v_mul_f32_e32 v0, v0, v2
+; GFX7-SDAG-NEXT:    v_mul_f32_e32 v1, v1, v3
+; GFX7-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-GISEL-LABEL: fmul_select_v2f32_test4:
+; GFX7-GISEL:       ; %bb.0:
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v4
+; GFX7-GISEL-NEXT:    v_cndmask_b32_e64 v2, 1.0, 0.5, vcc
+; GFX7-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX7-GISEL-NEXT:    v_cndmask_b32_e64 v3, 1.0, 0.5, vcc
+; GFX7-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v2
+; GFX7-GISEL-NEXT:    v_mul_f32_e32 v1, v1, v3
+; GFX7-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-SDAG-LABEL: fmul_select_v2f32_test4:
+; GFX9-SDAG:       ; %bb.0:
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX9-SDAG-NEXT:    v_cndmask_b32_e64 v3, 1.0, 0.5, vcc
+; GFX9-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v4
+; GFX9-SDAG-NEXT:    v_cndmask_b32_e64 v2, 1.0, 0.5, vcc
+; GFX9-SDAG-NEXT:    v_mul_f32_e32 v0, v0, v2
+; GFX9-SDAG-NEXT:    v_mul_f32_e32 v1, v1, v3
+; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: fmul_select_v2f32_test4:
+; GFX9-GISEL:       ; %bb.0:
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v4
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v2, 1.0, 0.5, vcc
+; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v3, 1.0, 0.5, vcc
+; GFX9-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v2
+; GFX9-GISEL-NEXT:    v_mul_f32_e32 v1, v1, v3
+; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: fmul_select_v2f32_test4:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v4
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, 1.0, 0.5, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v5
+; GFX10-NEXT:    v_mul_f32_e32 v0, v0, v2
+; GFX10-NEXT:    v_cndmask_b32_e64 v3, 1.0, 0.5, vcc_lo
+; GFX10-NEXT:    v_mul_f32_e32 v1, v1, v3
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: fmul_select_v2f32_test4:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v4
+; GFX11-NEXT:    v_cndmask_b32_e64 v2, 1.0, 0.5, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v5
+; GFX11-NEXT:    v_cndmask_b32_e64 v3, 1.0, 0.5, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_dual_mul_f32 v0, v0, v2 :: v_dual_mul_f32 v1, v1, v3
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %bool = icmp eq <2 x i32> %bool.arg1, %bool.arg2
   %y = select <2 x i1> %bool, <2 x float> <float 5.000000e-01, float 5.000000e-01>, <2 x float> <float 1.000000e+00, float 1.000000e+00>
   %ldexp = fmul <2 x float> %x, %y
@@ -199,22 +247,22 @@ define float @fmul_select_f32_test5(float %x, i32 %bool.arg1, i32 %bool.arg2) {
 ; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v1
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX1030-LABEL: fmul_select_f32_test5:
-; GFX1030:       ; %bb.0:
-; GFX1030-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1030-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX1030-NEXT:    v_cndmask_b32_e64 v1, -1.0, -2.0, vcc_lo
-; GFX1030-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX1030-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX1100-LABEL: fmul_select_f32_test5:
-; GFX1100:       ; %bb.0:
-; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX1100-NEXT:    v_cndmask_b32_e64 v1, -1.0, -2.0, vcc_lo
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1100-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX1100-NEXT:    s_setpc_b64 s[30:31]
+; GFX10-LABEL: fmul_select_f32_test5:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, -1.0, -2.0, vcc_lo
+; GFX10-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: fmul_select_f32_test5:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-NEXT:    v_cndmask_b32_e64 v1, -1.0, -2.0, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %bool = icmp eq i32 %bool.arg1, %bool.arg2
   %y = select i1 %bool, float -2.000000e+00, float -1.000000e+00
   %ldexp = fmul float %x, %y
@@ -222,44 +270,83 @@ define float @fmul_select_f32_test5(float %x, i32 %bool.arg1, i32 %bool.arg2) {
 }
 
 define float @fmul_select_f32_test6(float %x, i32 %bool.arg1, i32 %bool.arg2) {
-; GFX7-LABEL: fmul_select_f32_test6:
-; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v3, 0x41000000
-; GFX7-NEXT:    v_mov_b32_e32 v4, 0xc0400000
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX7-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
-; GFX7-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: fmul_select_f32_test6:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v3, 0x41000000
-; GFX9-NEXT:    v_mov_b32_e32 v4, 0xc0400000
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
-; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX1030-LABEL: fmul_select_f32_test6:
-; GFX1030:       ; %bb.0:
-; GFX1030-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1030-NEXT:    v_mov_b32_e32 v3, 0xc0400000
-; GFX1030-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX1030-NEXT:    v_cndmask_b32_e32 v1, 0x41000000, v3, vcc_lo
-; GFX1030-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX1030-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX1100-LABEL: fmul_select_f32_test6:
-; GFX1100:       ; %bb.0:
-; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-NEXT:    v_mov_b32_e32 v3, 0xc0400000
-; GFX1100-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-NEXT:    v_cndmask_b32_e32 v1, 0x41000000, v3, vcc_lo
-; GFX1100-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX1100-NEXT:    s_setpc_b64 s[30:31]
+; GFX7-SDAG-LABEL: fmul_select_f32_test6:
+; GFX7-SDAG:       ; %bb.0:
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-SDAG-NEXT:    v_mov_b32_e32 v3, 0x41000000
+; GFX7-SDAG-NEXT:    v_mov_b32_e32 v4, 0xc0400000
+; GFX7-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX7-SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
+; GFX7-SDAG-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX7-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-GISEL-LABEL: fmul_select_f32_test6:
+; GFX7-GISEL:       ; %bb.0:
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-GISEL-NEXT:    v_mov_b32_e32 v3, 0xc0400000
+; GFX7-GISEL-NEXT:    v_mov_b32_e32 v4, 0x41000000
+; GFX7-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX7-GISEL-NEXT:    v_cndmask_b32_e32 v1, v4, v3, vcc
+; GFX7-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX7-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-SDAG-LABEL: fmul_select_f32_test6:
+; GFX9-SDAG:       ; %bb.0:
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v3, 0x41000000
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v4, 0xc0400000
+; GFX9-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX9-SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
+; GFX9-SDAG-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: fmul_select_f32_test6:
+; GFX9-GISEL:       ; %bb.0:
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0xc0400000
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v4, 0x41000000
+; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v1, v4, v3, vcc
+; GFX9-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-SDAG-LABEL: fmul_select_f32_test6:
+; GFX10-SDAG:       ; %bb.0:
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SDAG-NEXT:    v_mov_b32_e32 v3, 0xc0400000
+; GFX10-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX10-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0x41000000, v3, vcc_lo
+; GFX10-SDAG-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-GISEL-LABEL: fmul_select_f32_test6:
+; GFX10-GISEL:       ; %bb.0:
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-GISEL-NEXT:    v_mov_b32_e32 v3, 0x41000000
+; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v1, v3, 0xc0400000, vcc_lo
+; GFX10-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: fmul_select_f32_test6:
+; GFX11-SDAG:       ; %bb.0:
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT:    v_mov_b32_e32 v3, 0xc0400000
+; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0x41000000, v3, vcc_lo
+; GFX11-SDAG-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: fmul_select_f32_test6:
+; GFX11-GISEL:       ; %bb.0:
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT:    v_mov_b32_e32 v3, 0x41000000
+; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v1, v3, 0xc0400000, vcc_lo
+; GFX11-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %bool = icmp eq i32 %bool.arg1, %bool.arg2
   %y = select i1 %bool, float -3.000000e+00, float 8.000000e+00
   %ldexp = fmul float %x, %y
@@ -285,22 +372,22 @@ define float @fmul_select_f32_test7_sel_log2val_pos59_pos92(float %x, i32 %bool.
 ; GFX9-NEXT:    v_ldexp_f32 v0, v0, v1
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX1030-LABEL: fmul_select_f32_test7_sel_log2val_pos59_pos92:
-; GFX1030:       ; %bb.0:
-; GFX1030-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1030-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX1030-NEXT:    v_cndmask_b32_e64 v1, 0x5c, 59, vcc_lo
-; GFX1030-NEXT:    v_ldexp_f32 v0, v0, v1
-; GFX1030-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX1100-LABEL: fmul_select_f32_test7_sel_log2val_pos59_pos92:
-; GFX1100:       ; %bb.0:
-; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX1100-NEXT:    v_cndmask_b32_e64 v1, 0x5c, 59, vcc_lo
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1100-NEXT:    v_ldexp_f32 v0, v0, v1
-; GFX1100-NEXT:    s_setpc_b64 s[30:31]
+; GFX10-LABEL: fmul_select_f32_test7_sel_log2val_pos59_pos92:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0x5c, 59, vcc_lo
+; GFX10-NEXT:    v_ldexp_f32 v0, v0, v1
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: fmul_select_f32_test7_sel_log2val_pos59_pos92:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0x5c, 59, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_ldexp_f32 v0, v0, v1
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %bool = icmp eq i32 %bool.arg1, %bool.arg2
   %y = select i1 %bool, float 0x43A0000000000000, float 0x45B0000000000000
   %ldexp = fmul float %x, %y
@@ -308,44 +395,83 @@ define float @fmul_select_f32_test7_sel_log2val_pos59_pos92(float %x, i32 %bool.
 }
 
 define float @fmul_select_f32_test8(float %x, i32 %bool.arg1, i32 %bool.arg2) {
-; GFX7-LABEL: fmul_select_f32_test8:
-; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v3, 0xc1000000
-; GFX7-NEXT:    v_mov_b32_e32 v4, 0x41800000
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX7-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
-; GFX7-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: fmul_select_f32_test8:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v3, 0xc1000000
-; GFX9-NEXT:    v_mov_b32_e32 v4, 0x41800000
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
-; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX1030-LABEL: fmul_select_f32_test8:
-; GFX1030:       ; %bb.0:
-; GFX1030-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1030-NEXT:    v_mov_b32_e32 v3, 0x41800000
-; GFX1030-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX1030-NEXT:    v_cndmask_b32_e32 v1, 0xc1000000, v3, vcc_lo
-; GFX1030-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX1030-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX1100-LABEL: fmul_select_f32_test8:
-; GFX1100:       ; %bb.0:
-; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-NEXT:    v_mov_b32_e32 v3, 0x41800000
-; GFX1100-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-NEXT:    v_cndmask_b32_e32 v1, 0xc1000000, v3, vcc_lo
-; GFX1100-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX1100-NEXT:    s_setpc_b64 s[30:31]
+; GFX7-SDAG-LABEL: fmul_select_f32_test8:
+; GFX7-SDAG:       ; %bb.0:
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-SDAG-NEXT:    v_mov_b32_e32 v3, 0xc1000000
+; GFX7-SDAG-NEXT:    v_mov_b32_e32 v4, 0x41800000
+; GFX7-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX7-SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
+; GFX7-SDAG-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX7-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-GISEL-LABEL: fmul_select_f32_test8:
+; GFX7-GISEL:       ; %bb.0:
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-GISEL-NEXT:    v_mov_b32_e32 v3, 0x41800000
+; GFX7-GISEL-NEXT:    v_mov_b32_e32 v4, 0xc1000000
+; GFX7-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX7-GISEL-NEXT:    v_cndmask_b32_e32 v1, v4, v3, vcc
+; GFX7-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX7-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-SDAG-LABEL: fmul_select_f32_test8:
+; GFX9-SDAG:       ; %bb.0:
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v3, 0xc1000000
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v4, 0x41800000
+; GFX9-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX9-SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
+; GFX9-SDAG-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: fmul_select_f32_test8:
+; GFX9-GISEL:       ; %bb.0:
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0x41800000
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v4, 0xc1000000
+; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v1, v4, v3, vcc
+; GFX9-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-SDAG-LABEL: fmul_select_f32_test8:
+; GFX10-SDAG:       ; %bb.0:
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SDAG-NEXT:    v_mov_b32_e32 v3, 0x41800000
+; GFX10-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX10-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0xc1000000, v3, vcc_lo
+; GFX10-SDAG-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-GISEL-LABEL: fmul_select_f32_test8:
+; GFX10-GISEL:       ; %bb.0:
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-GISEL-NEXT:    v_mov_b32_e32 v3, 0xc1000000
+; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v1, v3, 0x41800000, vcc_lo
+; GFX10-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: fmul_select_f32_test8:
+; GFX11-SDAG:       ; %bb.0:
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT:    v_mov_b32_e32 v3, 0x41800000
+; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0xc1000000, v3, vcc_lo
+; GFX11-SDAG-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: fmul_select_f32_test8:
+; GFX11-GISEL:       ; %bb.0:
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT:    v_mov_b32_e32 v3, 0xc1000000
+; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v1, v3, 0x41800000, vcc_lo
+; GFX11-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %bool = icmp eq i32 %bool.arg1, %bool.arg2
   %y = select i1 %bool, float 1.600000e+01, float -8.000000e+00
   %ldexp = fmul float %x, %y
@@ -369,22 +495,22 @@ define float @fmul_select_f32_test9(float %x, i32 %bool.arg1, i32 %bool.arg2) {
 ; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v1
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX1030-LABEL: fmul_select_f32_test9:
-; GFX1030:       ; %bb.0:
-; GFX1030-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1030-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX1030-NEXT:    v_cndmask_b32_e64 v1, 2.0, 0, vcc_lo
-; GFX1030-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX1030-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX1100-LABEL: fmul_select_f32_test9:
-; GFX1100:       ; %bb.0:
-; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX1100-NEXT:    v_cndmask_b32_e64 v1, 2.0, 0, vcc_lo
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1100-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX1100-NEXT:    s_setpc_b64 s[30:31]
+; GFX10-LABEL: fmul_select_f32_test9:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, 2.0, 0, vcc_lo
+; GFX10-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: fmul_select_f32_test9:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-NEXT:    v_cndmask_b32_e64 v1, 2.0, 0, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %bool = icmp eq i32 %bool.arg1, %bool.arg2
   %y = select i1 %bool, float 0.000000e+00, float 2.000000e+00
   %ldexp = fmul float %x, %y
@@ -410,22 +536,22 @@ define float @fmul_select_f32_test10(float %x, i32 %bool.arg1, i32 %bool.arg2) {
 ; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v1
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX1030-LABEL: fmul_select_f32_test10:
-; GFX1030:       ; %bb.0:
-; GFX1030-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1030-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX1030-NEXT:    v_cndmask_b32_e64 v1, 0, 0x80000000, vcc_lo
-; GFX1030-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX1030-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX1100-LABEL: fmul_select_f32_test10:
-; GFX1100:       ; %bb.0:
-; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX1100-NEXT:    v_cndmask_b32_e64 v1, 0, 0x80000000, vcc_lo
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1100-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX1100-NEXT:    s_setpc_b64 s[30:31]
+; GFX10-LABEL: fmul_select_f32_test10:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0, 0x80000000, vcc_lo
+; GFX10-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: fmul_select_f32_test10:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, 0x80000000, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %bool = icmp eq i32 %bool.arg1, %bool.arg2
   %y = select i1 %bool, float -0.000000e+00, float 0.000000e+00
   %ldexp = fmul float %x, %y
@@ -451,22 +577,22 @@ define float @fmul_select_f32_test11_sel_log2val_pos78_pos56(float %x, i32 %bool
 ; GFX9-NEXT:    v_ldexp_f32 v0, -v0, v1
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX1030-LABEL: fmul_select_f32_test11_sel_log2val_pos78_pos56:
-; GFX1030:       ; %bb.0:
-; GFX1030-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1030-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX1030-NEXT:    v_cndmask_b32_e64 v1, 56, 0x4e, vcc_lo
-; GFX1030-NEXT:    v_ldexp_f32 v0, -v0, v1
-; GFX1030-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX1100-LABEL: fmul_select_f32_test11_sel_log2val_pos78_pos56:
-; GFX1100:       ; %bb.0:
-; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX1100-NEXT:    v_cndmask_b32_e64 v1, 56, 0x4e, vcc_lo
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1100-NEXT:    v_ldexp_f32 v0, -v0, v1
-; GFX1100-NEXT:    s_setpc_b64 s[30:31]
+; GFX10-LABEL: fmul_select_f32_test11_sel_log2val_pos78_pos56:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, 56, 0x4e, vcc_lo
+; GFX10-NEXT:    v_ldexp_f32 v0, -v0, v1
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: fmul_select_f32_test11_sel_log2val_pos78_pos56:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-NEXT:    v_cndmask_b32_e64 v1, 56, 0x4e, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_ldexp_f32 v0, -v0, v1
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %bool = icmp eq i32 %bool.arg1, %bool.arg2
   %y = select i1 %bool, float 0xC4D0000000000000, float 0xC370000000000000
   %ldexp = fmul float %x, %y
@@ -474,44 +600,83 @@ define float @fmul_select_f32_test11_sel_log2val_pos78_pos56(float %x, i32 %bool
 }
 
 define float @fmul_select_f32_test12_sel_log2val_neg48_pos68(float %x, i32 %bool.arg1, i32 %bool.arg2) {
-; GFX7-LABEL: fmul_select_f32_test12_sel_log2val_neg48_pos68:
-; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v3, 0x44
-; GFX7-NEXT:    v_not_b32_e32 v4, 47
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX7-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
-; GFX7-NEXT:    v_ldexp_f32_e32 v0, v0, v1
-; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: fmul_select_f32_test12_sel_log2val_neg48_pos68:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v3, 0x44
-; GFX9-NEXT:    v_not_b32_e32 v4, 47
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
-; GFX9-NEXT:    v_ldexp_f32 v0, v0, v1
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX1030-LABEL: fmul_select_f32_test12_sel_log2val_neg48_pos68:
-; GFX1030:       ; %bb.0:
-; GFX1030-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1030-NEXT:    v_not_b32_e32 v3, 47
-; GFX1030-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX1030-NEXT:    v_cndmask_b32_e32 v1, 0x44, v3, vcc_lo
-; GFX1030-NEXT:    v_ldexp_f32 v0, v0, v1
-; GFX1030-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX1100-LABEL: fmul_select_f32_test12_sel_log2val_neg48_pos68:
-; GFX1100:       ; %bb.0:
-; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-NEXT:    v_not_b32_e32 v3, 47
-; GFX1100-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-NEXT:    v_cndmask_b32_e32 v1, 0x44, v3, vcc_lo
-; GFX1100-NEXT:    v_ldexp_f32 v0, v0, v1
-; GFX1100-NEXT:    s_setpc_b64 s[30:31]
+; GFX7-SDAG-LABEL: fmul_select_f32_test12_sel_log2val_neg48_pos68:
+; GFX7-SDAG:       ; %bb.0:
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-SDAG-NEXT:    v_mov_b32_e32 v3, 0x44
+; GFX7-SDAG-NEXT:    v_not_b32_e32 v4, 47
+; GFX7-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX7-SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
+; GFX7-SDAG-NEXT:    v_ldexp_f32_e32 v0, v0, v1
+; GFX7-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-GISEL-LABEL: fmul_select_f32_test12_sel_log2val_neg48_pos68:
+; GFX7-GISEL:       ; %bb.0:
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-GISEL-NEXT:    v_not_b32_e32 v3, 47
+; GFX7-GISEL-NEXT:    v_mov_b32_e32 v4, 0x44
+; GFX7-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX7-GISEL-NEXT:    v_cndmask_b32_e32 v1, v4, v3, vcc
+; GFX7-GISEL-NEXT:    v_ldexp_f32_e32 v0, v0, v1
+; GFX7-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-SDAG-LABEL: fmul_select_f32_test12_sel_log2val_neg48_pos68:
+; GFX9-SDAG:       ; %bb.0:
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v3, 0x44
+; GFX9-SDAG-NEXT:    v_not_b32_e32 v4, 47
+; GFX9-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX9-SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
+; GFX9-SDAG-NEXT:    v_ldexp_f32 v0, v0, v1
+; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: fmul_select_f32_test12_sel_log2val_neg48_pos68:
+; GFX9-GISEL:       ; %bb.0:
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:    v_not_b32_e32 v3, 47
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v4, 0x44
+; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v1, v4, v3, vcc
+; GFX9-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
+; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-SDAG-LABEL: fmul_select_f32_test12_sel_log2val_neg48_pos68:
+; GFX10-SDAG:       ; %bb.0:
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SDAG-NEXT:    v_not_b32_e32 v3, 47
+; GFX10-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX10-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0x44, v3, vcc_lo
+; GFX10-SDAG-NEXT:    v_ldexp_f32 v0, v0, v1
+; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-GISEL-LABEL: fmul_select_f32_test12_sel_log2val_neg48_pos68:
+; GFX10-GISEL:       ; %bb.0:
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-GISEL-NEXT:    v_mov_b32_e32 v3, 0x44
+; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v1, v3, 0xffffffd0, vcc_lo
+; GFX10-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
+; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: fmul_select_f32_test12_sel_log2val_neg48_pos68:
+; GFX11-SDAG:       ; %bb.0:
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT:    v_not_b32_e32 v3, 47
+; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0x44, v3, vcc_lo
+; GFX11-SDAG-NEXT:    v_ldexp_f32 v0, v0, v1
+; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: fmul_select_f32_test12_sel_log2val_neg48_pos68:
+; GFX11-GISEL:       ; %bb.0:
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT:    v_mov_b32_e32 v3, 0x44
+; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v1, v3, 0xffffffd0, vcc_lo
+; GFX11-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
+; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %bool = icmp eq i32 %bool.arg1, %bool.arg2
   %y = select i1 %bool, float 0x3CF0000000000000, float 0x4430000000000000
   %ldexp = fmul float %x, %y
@@ -535,22 +700,22 @@ define double @fmul_select_f64_test1(double %x, i32 %bool.arg1, i32 %bool.arg2)
 ; GFX9-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX1030-LABEL: fmul_select_f64_test1:
-; GFX1030:       ; %bb.0:
-; GFX1030-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1030-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX1030-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc_lo
-; GFX1030-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
-; GFX1030-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX1100-LABEL: fmul_select_f64_test1:
-; GFX1100:       ; %bb.0:
-; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX1100-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc_lo
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1100-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
-; GFX1100-NEXT:    s_setpc_b64 s[30:31]
+; GFX10-LABEL: fmul_select_f64_test1:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc_lo
+; GFX10-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: fmul_select_f64_test1:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %bool = icmp eq i32 %bool.arg1, %bool.arg2
   %y = select i1 %bool, double 2.000000e+00, double 1.000000e+00
   %ldexp = fmul double %x, %y
@@ -574,22 +739,22 @@ define double @fmul_select_f64_test2(double %x, i32 %bool.arg1, i32 %bool.arg2)
 ; GFX9-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX1030-LABEL: fmul_select_f64_test2:
-; GFX1030:       ; %bb.0:
-; GFX1030-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1030-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX1030-NEXT:    v_cndmask_b32_e64 v2, 0, -1, vcc_lo
-; GFX1030-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
-; GFX1030-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX1100-LABEL: fmul_select_f64_test2:
-; GFX1100:       ; %bb.0:
-; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX1100-NEXT:    v_cndmask_b32_e64 v2, 0, -1, vcc_lo
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1100-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
-; GFX1100-NEXT:    s_setpc_b64 s[30:31]
+; GFX10-LABEL: fmul_select_f64_test2:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, -1, vcc_lo
+; GFX10-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: fmul_select_f64_test2:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, -1, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %bool = icmp eq i32 %bool.arg1, %bool.arg2
   %y = select i1 %bool, double 5.000000e-01, double 1.000000e+00
   %ldexp = fmul double %x, %y
@@ -619,28 +784,28 @@ define <2 x double> @fmul_select_v2f64_test3(<2 x double> %x, <2 x i32> %bool.ar
 ; GFX9-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX1030-LABEL: fmul_select_v2f64_test3:
-; GFX1030:       ; %bb.0:
-; GFX1030-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1030-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v6
-; GFX1030-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc_lo
-; GFX1030-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v5, v7
-; GFX1030-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v4
-; GFX1030-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc_lo
-; GFX1030-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v5
-; GFX1030-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX1100-LABEL: fmul_select_v2f64_test3:
-; GFX1100:       ; %bb.0:
-; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v6
-; GFX1100-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc_lo
-; GFX1100-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v5, v7
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX1100-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v4
-; GFX1100-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc_lo
-; GFX1100-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v5
-; GFX1100-NEXT:    s_setpc_b64 s[30:31]
+; GFX10-LABEL: fmul_select_v2f64_test3:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v6
+; GFX10-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v5, v7
+; GFX10-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v4
+; GFX10-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc_lo
+; GFX10-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v5
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: fmul_select_v2f64_test3:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v6
+; GFX11-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v5, v7
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v4
+; GFX11-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc_lo
+; GFX11-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v5
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %bool = icmp eq <2 x i32> %bool.arg1, %bool.arg2
   %y = select <2 x i1> %bool, <2 x double> <double 2.000000e+00, double 2.000000e+00>, <2 x double> <double 1.000000e+00, double 1.000000e+00>
   %ldexp = fmul <2 x double> %x, %y
@@ -670,28 +835,28 @@ define <2 x double> @fmul_select_v2f64_test4(<2 x double> %x, <2 x i32> %bool.ar
 ; GFX9-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX1030-LABEL: fmul_select_v2f64_test4:
-; GFX1030:       ; %bb.0:
-; GFX1030-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1030-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v6
-; GFX1030-NEXT:    v_cndmask_b32_e64 v4, 0, -1, vcc_lo
-; GFX1030-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v5, v7
-; GFX1030-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v4
-; GFX1030-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc_lo
-; GFX1030-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v5
-; GFX1030-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX1100-LABEL: fmul_select_v2f64_test4:
-; GFX1100:       ; %bb.0:
-; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v6
-; GFX1100-NEXT:    v_cndmask_b32_e64 v4, 0, -1, vcc_lo
-; GFX1100-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v5, v7
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX1100-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v4
-; GFX1100-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc_lo
-; GFX1100-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v5
-; GFX1100-NEXT:    s_setpc_b64 s[30:31]
+; GFX10-LABEL: fmul_select_v2f64_test4:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v6
+; GFX10-NEXT:    v_cndmask_b32_e64 v4, 0, -1, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v5, v7
+; GFX10-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v4
+; GFX10-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc_lo
+; GFX10-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v5
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: fmul_select_v2f64_test4:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v6
+; GFX11-NEXT:    v_cndmask_b32_e64 v4, 0, -1, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v5, v7
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v4
+; GFX11-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc_lo
+; GFX11-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v5
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %bool = icmp eq <2 x i32> %bool.arg1, %bool.arg2
   %y = select <2 x i1> %bool, <2 x double> <double 5.000000e-01, double 5.000000e-01>, <2 x double> <double 1.000000e+00, double 1.000000e+00>
   %ldexp = fmul <2 x double> %x, %y
@@ -715,22 +880,22 @@ define double @fmul_select_f64_test5(double %x, i32 %bool.arg1, i32 %bool.arg2)
 ; GFX9-NEXT:    v_ldexp_f64 v[0:1], -v[0:1], v2
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX1030-LABEL: fmul_select_f64_test5:
-; GFX1030:       ; %bb.0:
-; GFX1030-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1030-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX1030-NEXT:    v_cndmask_b32_e64 v2, 0, -1, vcc_lo
-; GFX1030-NEXT:    v_ldexp_f64 v[0:1], -v[0:1], v2
-; GFX1030-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX1100-LABEL: fmul_select_f64_test5:
-; GFX1100:       ; %bb.0:
-; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX1100-NEXT:    v_cndmask_b32_e64 v2, 0, -1, vcc_lo
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1100-NEXT:    v_ldexp_f64 v[0:1], -v[0:1], v2
-; GFX1100-NEXT:    s_setpc_b64 s[30:31]
+; GFX10-LABEL: fmul_select_f64_test5:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, -1, vcc_lo
+; GFX10-NEXT:    v_ldexp_f64 v[0:1], -v[0:1], v2
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: fmul_select_f64_test5:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, -1, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_ldexp_f64 v[0:1], -v[0:1], v2
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %bool = icmp eq i32 %bool.arg1, %bool.arg2
   %y = select i1 %bool, double -5.000000e-01, double -1.000000e+00
   %ldexp = fmul double %x, %y
@@ -754,22 +919,22 @@ define double @fmul_select_f64_test6(double %x, i32 %bool.arg1, i32 %bool.arg2)
 ; GFX9-NEXT:    v_ldexp_f64 v[0:1], -v[0:1], v2
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX1030-LABEL: fmul_select_f64_test6:
-; GFX1030:       ; %bb.0:
-; GFX1030-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1030-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX1030-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc_lo
-; GFX1030-NEXT:    v_ldexp_f64 v[0:1], -v[0:1], v2
-; GFX1030-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX1100-LABEL: fmul_select_f64_test6:
-; GFX1100:       ; %bb.0:
-; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX1100-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc_lo
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1100-NEXT:    v_ldexp_f64 v[0:1], -v[0:1], v2
-; GFX1100-NEXT:    s_setpc_b64 s[30:31]
+; GFX10-LABEL: fmul_select_f64_test6:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc_lo
+; GFX10-NEXT:    v_ldexp_f64 v[0:1], -v[0:1], v2
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: fmul_select_f64_test6:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_ldexp_f64 v[0:1], -v[0:1], v2
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %bool = icmp eq i32 %bool.arg1, %bool.arg2
   %y = select i1 %bool, double -2.000000e+00, double -1.000000e+00
   %ldexp = fmul double %x, %y
@@ -777,44 +942,64 @@ define double @fmul_select_f64_test6(double %x, i32 %bool.arg1, i32 %bool.arg2)
 }
 
 define double @fmul_select_f64_test7(double %x, i32 %bool.arg1, i32 %bool.arg2) {
-; GFX7-LABEL: fmul_select_f64_test7:
-; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v4, 0xbff00000
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
-; GFX7-NEXT:    v_cndmask_b32_e64 v3, v4, 2.0, vcc
-; GFX7-NEXT:    v_mov_b32_e32 v2, 0
-; GFX7-NEXT:    v_mul_f64 v[0:1], v[0:1], v[2:3]
-; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: fmul_select_f64_test7:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v4, 0xbff00000
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
-; GFX9-NEXT:    v_cndmask_b32_e64 v3, v4, 2.0, vcc
-; GFX9-NEXT:    v_mov_b32_e32 v2, 0
-; GFX9-NEXT:    v_mul_f64 v[0:1], v[0:1], v[2:3]
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX1030-LABEL: fmul_select_f64_test7:
-; GFX1030:       ; %bb.0:
-; GFX1030-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1030-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX1030-NEXT:    v_mov_b32_e32 v4, 0
-; GFX1030-NEXT:    v_cndmask_b32_e64 v5, 0xbff00000, 2.0, vcc_lo
-; GFX1030-NEXT:    v_mul_f64 v[0:1], v[0:1], v[4:5]
-; GFX1030-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX1100-LABEL: fmul_select_f64_test7:
-; GFX1100:       ; %bb.0:
-; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX1100-NEXT:    v_mov_b32_e32 v4, 0
-; GFX1100-NEXT:    v_cndmask_b32_e64 v5, 0xbff00000, 2.0, vcc_lo
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1100-NEXT:    v_mul_f64 v[0:1], v[0:1], v[4:5]
-; GFX1100-NEXT:    s_setpc_b64 s[30:31]
+; GFX7-SDAG-LABEL: fmul_select_f64_test7:
+; GFX7-SDAG:       ; %bb.0:
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-SDAG-NEXT:    v_mov_b32_e32 v4, 0xbff00000
+; GFX7-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX7-SDAG-NEXT:    v_cndmask_b32_e64 v3, v4, 2.0, vcc
+; GFX7-SDAG-NEXT:    v_mov_b32_e32 v2, 0
+; GFX7-SDAG-NEXT:    v_mul_f64 v[0:1], v[0:1], v[2:3]
+; GFX7-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-GISEL-LABEL: fmul_select_f64_test7:
+; GFX7-GISEL:       ; %bb.0:
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-GISEL-NEXT:    v_mov_b32_e32 v5, 0xbff00000
+; GFX7-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX7-GISEL-NEXT:    v_mov_b32_e32 v4, 0
+; GFX7-GISEL-NEXT:    v_cndmask_b32_e64 v5, v5, 2.0, vcc
+; GFX7-GISEL-NEXT:    v_mul_f64 v[0:1], v[0:1], v[4:5]
+; GFX7-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-SDAG-LABEL: fmul_select_f64_test7:
+; GFX9-SDAG:       ; %bb.0:
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v4, 0xbff00000
+; GFX9-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX9-SDAG-NEXT:    v_cndmask_b32_e64 v3, v4, 2.0, vcc
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-SDAG-NEXT:    v_mul_f64 v[0:1], v[0:1], v[2:3]
+; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: fmul_select_f64_test7:
+; GFX9-GISEL:       ; %bb.0:
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v5, 0xbff00000
+; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v4, 0
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v5, v5, 2.0, vcc
+; GFX9-GISEL-NEXT:    v_mul_f64 v[0:1], v[0:1], v[4:5]
+; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: fmul_select_f64_test7:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX10-NEXT:    v_mov_b32_e32 v4, 0
+; GFX10-NEXT:    v_cndmask_b32_e64 v5, 0xbff00000, 2.0, vcc_lo
+; GFX10-NEXT:    v_mul_f64 v[0:1], v[0:1], v[4:5]
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: fmul_select_f64_test7:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-NEXT:    v_mov_b32_e32 v4, 0
+; GFX11-NEXT:    v_cndmask_b32_e64 v5, 0xbff00000, 2.0, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_mul_f64 v[0:1], v[0:1], v[4:5]
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %bool = icmp eq i32 %bool.arg1, %bool.arg2
   %y = select i1 %bool, double 2.000000e+00, double -1.000000e+00
   %ldexp = fmul double %x, %y
@@ -838,22 +1023,22 @@ define double @fmul_select_f64_test8(double %x, i32 %bool.arg1, i32 %bool.arg2)
 ; GFX9-NEXT:    v_ldexp_f64 v[0:1], -v[0:1], v2
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX1030-LABEL: fmul_select_f64_test8:
-; GFX1030:       ; %bb.0:
-; GFX1030-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1030-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX1030-NEXT:    v_cndmask_b32_e64 v2, 5, 2, vcc_lo
-; GFX1030-NEXT:    v_ldexp_f64 v[0:1], -v[0:1], v2
-; GFX1030-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX1100-LABEL: fmul_select_f64_test8:
-; GFX1100:       ; %bb.0:
-; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX1100-NEXT:    v_cndmask_b32_e64 v2, 5, 2, vcc_lo
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1100-NEXT:    v_ldexp_f64 v[0:1], -v[0:1], v2
-; GFX1100-NEXT:    s_setpc_b64 s[30:31]
+; GFX10-LABEL: fmul_select_f64_test8:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, 5, 2, vcc_lo
+; GFX10-NEXT:    v_ldexp_f64 v[0:1], -v[0:1], v2
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: fmul_select_f64_test8:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-NEXT:    v_cndmask_b32_e64 v2, 5, 2, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_ldexp_f64 v[0:1], -v[0:1], v2
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %bool = icmp eq i32 %bool.arg1, %bool.arg2
   %y = select i1 %bool, double -4.000000e+00, double -3.200000e+01
   %ldexp = fmul double %x, %y
@@ -883,28 +1068,28 @@ define <2 x double> @fmul_select_v2f64_test9(<2 x double> %x, <2 x i32> %bool.ar
 ; GFX9-NEXT:    v_ldexp_f64 v[2:3], -v[2:3], v4
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX1030-LABEL: fmul_select_v2f64_test9:
-; GFX1030:       ; %bb.0:
-; GFX1030-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1030-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v6
-; GFX1030-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc_lo
-; GFX1030-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v5, v7
-; GFX1030-NEXT:    v_ldexp_f64 v[0:1], -v[0:1], v4
-; GFX1030-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc_lo
-; GFX1030-NEXT:    v_ldexp_f64 v[2:3], -v[2:3], v5
-; GFX1030-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX1100-LABEL: fmul_select_v2f64_test9:
-; GFX1100:       ; %bb.0:
-; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v6
-; GFX1100-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc_lo
-; GFX1100-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v5, v7
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX1100-NEXT:    v_ldexp_f64 v[0:1], -v[0:1], v4
-; GFX1100-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc_lo
-; GFX1100-NEXT:    v_ldexp_f64 v[2:3], -v[2:3], v5
-; GFX1100-NEXT:    s_setpc_b64 s[30:31]
+; GFX10-LABEL: fmul_select_v2f64_test9:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v6
+; GFX10-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v5, v7
+; GFX10-NEXT:    v_ldexp_f64 v[0:1], -v[0:1], v4
+; GFX10-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc_lo
+; GFX10-NEXT:    v_ldexp_f64 v[2:3], -v[2:3], v5
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: fmul_select_v2f64_test9:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v6
+; GFX11-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v5, v7
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_ldexp_f64 v[0:1], -v[0:1], v4
+; GFX11-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc_lo
+; GFX11-NEXT:    v_ldexp_f64 v[2:3], -v[2:3], v5
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %bool = icmp eq <2 x i32> %bool.arg1, %bool.arg2
   %y = select <2 x i1> %bool, <2 x double> <double -2.000000e+00, double -2.000000e+00>, <2 x double> <double -1.000000e+00, double -1.000000e+00>
   %ldexp = fmul <2 x double> %x, %y
@@ -912,60 +1097,115 @@ define <2 x double> @fmul_select_v2f64_test9(<2 x double> %x, <2 x i32> %bool.ar
 }
 
 define <2 x double> @fmul_select_v2f64_test10(<2 x double> %x, <2 x i32> %bool.arg1, <2 x i32> %bool.arg2) {
-; GFX7-LABEL: fmul_select_v2f64_test10:
-; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v8, 0xbff00000
-; GFX7-NEXT:    v_mov_b32_e32 v9, 0x3fe00000
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v6
-; GFX7-NEXT:    v_cndmask_b32_e32 v9, v8, v9, vcc
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v7
-; GFX7-NEXT:    v_mov_b32_e32 v8, 0
-; GFX7-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
-; GFX7-NEXT:    v_mul_f64 v[0:1], v[0:1], v[8:9]
-; GFX7-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
-; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: fmul_select_v2f64_test10:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v8, 0xbff00000
-; GFX9-NEXT:    v_mov_b32_e32 v9, 0x3fe00000
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v6
-; GFX9-NEXT:    v_cndmask_b32_e32 v9, v8, v9, vcc
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v7
-; GFX9-NEXT:    v_mov_b32_e32 v8, 0
-; GFX9-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
-; GFX9-NEXT:    v_mul_f64 v[0:1], v[0:1], v[8:9]
-; GFX9-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX1030-LABEL: fmul_select_v2f64_test10:
-; GFX1030:       ; %bb.0:
-; GFX1030-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1030-NEXT:    v_mov_b32_e32 v8, 0x3fe00000
-; GFX1030-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v6
-; GFX1030-NEXT:    v_cndmask_b32_e32 v9, 0xbff00000, v8, vcc_lo
-; GFX1030-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v5, v7
-; GFX1030-NEXT:    v_mov_b32_e32 v8, 0
-; GFX1030-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc_lo
-; GFX1030-NEXT:    v_mul_f64 v[0:1], v[0:1], v[8:9]
-; GFX1030-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
-; GFX1030-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX1100-LABEL: fmul_select_v2f64_test10:
-; GFX1100:       ; %bb.0:
-; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-NEXT:    v_mov_b32_e32 v8, 0x3fe00000
-; GFX1100-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v6
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1100-NEXT:    v_dual_cndmask_b32 v9, 0xbff00000, v8 :: v_dual_mov_b32 v8, 0
-; GFX1100-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v5, v7
-; GFX1100-NEXT:    v_mul_f64 v[0:1], v[0:1], v[8:9]
-; GFX1100-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc_lo
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1100-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
-; GFX1100-NEXT:    s_setpc_b64 s[30:31]
+; GFX7-SDAG-LABEL: fmul_select_v2f64_test10:
+; GFX7-SDAG:       ; %bb.0:
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-SDAG-NEXT:    v_mov_b32_e32 v8, 0xbff00000
+; GFX7-SDAG-NEXT:    v_mov_b32_e32 v9, 0x3fe00000
+; GFX7-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v6
+; GFX7-SDAG-NEXT:    v_cndmask_b32_e32 v9, v8, v9, vcc
+; GFX7-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v7
+; GFX7-SDAG-NEXT:    v_mov_b32_e32 v8, 0
+; GFX7-SDAG-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
+; GFX7-SDAG-NEXT:    v_mul_f64 v[0:1], v[0:1], v[8:9]
+; GFX7-SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; GFX7-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-GISEL-LABEL: fmul_select_v2f64_test10:
+; GFX7-GISEL:       ; %bb.0:
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-GISEL-NEXT:    v_mov_b32_e32 v9, 0x3fe00000
+; GFX7-GISEL-NEXT:    v_mov_b32_e32 v10, 0xbff00000
+; GFX7-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v6
+; GFX7-GISEL-NEXT:    v_cndmask_b32_e32 v9, v10, v9, vcc
+; GFX7-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v7
+; GFX7-GISEL-NEXT:    v_mov_b32_e32 v8, 0
+; GFX7-GISEL-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
+; GFX7-GISEL-NEXT:    v_mul_f64 v[0:1], v[0:1], v[8:9]
+; GFX7-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; GFX7-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-SDAG-LABEL: fmul_select_v2f64_test10:
+; GFX9-SDAG:       ; %bb.0:
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v8, 0xbff00000
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v9, 0x3fe00000
+; GFX9-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v6
+; GFX9-SDAG-NEXT:    v_cndmask_b32_e32 v9, v8, v9, vcc
+; GFX9-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v7
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v8, 0
+; GFX9-SDAG-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
+; GFX9-SDAG-NEXT:    v_mul_f64 v[0:1], v[0:1], v[8:9]
+; GFX9-SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: fmul_select_v2f64_test10:
+; GFX9-GISEL:       ; %bb.0:
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v9, 0x3fe00000
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v10, 0xbff00000
+; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v6
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v9, v10, v9, vcc
+; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v7
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v8, 0
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
+; GFX9-GISEL-NEXT:    v_mul_f64 v[0:1], v[0:1], v[8:9]
+; GFX9-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-SDAG-LABEL: fmul_select_v2f64_test10:
+; GFX10-SDAG:       ; %bb.0:
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SDAG-NEXT:    v_mov_b32_e32 v8, 0x3fe00000
+; GFX10-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v6
+; GFX10-SDAG-NEXT:    v_cndmask_b32_e32 v9, 0xbff00000, v8, vcc_lo
+; GFX10-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v5, v7
+; GFX10-SDAG-NEXT:    v_mov_b32_e32 v8, 0
+; GFX10-SDAG-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc_lo
+; GFX10-SDAG-NEXT:    v_mul_f64 v[0:1], v[0:1], v[8:9]
+; GFX10-SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-GISEL-LABEL: fmul_select_v2f64_test10:
+; GFX10-GISEL:       ; %bb.0:
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-GISEL-NEXT:    v_mov_b32_e32 v9, 0xbff00000
+; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v6
+; GFX10-GISEL-NEXT:    v_mov_b32_e32 v8, 0
+; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v9, v9, 0x3fe00000, vcc_lo
+; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v5, v7
+; GFX10-GISEL-NEXT:    v_mul_f64 v[0:1], v[0:1], v[8:9]
+; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc_lo
+; GFX10-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: fmul_select_v2f64_test10:
+; GFX11-SDAG:       ; %bb.0:
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT:    v_mov_b32_e32 v8, 0x3fe00000
+; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v6
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-SDAG-NEXT:    v_dual_cndmask_b32 v9, 0xbff00000, v8 :: v_dual_mov_b32 v8, 0
+; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v5, v7
+; GFX11-SDAG-NEXT:    v_mul_f64 v[0:1], v[0:1], v[8:9]
+; GFX11-SDAG-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc_lo
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: fmul_select_v2f64_test10:
+; GFX11-GISEL:       ; %bb.0:
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT:    v_dual_mov_b32 v9, 0xbff00000 :: v_dual_mov_b32 v8, 0
+; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v6
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v9, v9, 0x3fe00000, vcc_lo
+; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v5, v7
+; GFX11-GISEL-NEXT:    v_mul_f64 v[0:1], v[0:1], v[8:9]
+; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc_lo
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %bool = icmp eq <2 x i32> %bool.arg1, %bool.arg2
   %y = select <2 x i1> %bool, <2 x double> <double 5.000000e-01, double 2.000000e+00>, <2 x double> <double -1.000000e+00, double 1.000000e+00>
   %ldexp = fmul <2 x double> %x, %y
@@ -973,44 +1213,64 @@ define <2 x double> @fmul_select_v2f64_test10(<2 x double> %x, <2 x i32> %bool.a
 }
 
 define double @fmul_select_f64_test11(double %x, i32 %bool.arg1, i32 %bool.arg2) {
-; GFX7-LABEL: fmul_select_f64_test11:
-; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_bfrev_b32_e32 v4, 1
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
-; GFX7-NEXT:    v_cndmask_b32_e64 v3, v4, -2.0, vcc
-; GFX7-NEXT:    v_mov_b32_e32 v2, 0
-; GFX7-NEXT:    v_mul_f64 v[0:1], v[0:1], v[2:3]
-; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: fmul_select_f64_test11:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_bfrev_b32_e32 v4, 1
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
-; GFX9-NEXT:    v_cndmask_b32_e64 v3, v4, -2.0, vcc
-; GFX9-NEXT:    v_mov_b32_e32 v2, 0
-; GFX9-NEXT:    v_mul_f64 v[0:1], v[0:1], v[2:3]
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX1030-LABEL: fmul_select_f64_test11:
-; GFX1030:       ; %bb.0:
-; GFX1030-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1030-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX1030-NEXT:    v_mov_b32_e32 v4, 0
-; GFX1030-NEXT:    v_cndmask_b32_e64 v5, 0x80000000, -2.0, vcc_lo
-; GFX1030-NEXT:    v_mul_f64 v[0:1], v[0:1], v[4:5]
-; GFX1030-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX1100-LABEL: fmul_select_f64_test11:
-; GFX1100:       ; %bb.0:
-; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX1100-NEXT:    v_mov_b32_e32 v4, 0
-; GFX1100-NEXT:    v_cndmask_b32_e64 v5, 0x80000000, -2.0, vcc_lo
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1100-NEXT:    v_mul_f64 v[0:1], v[0:1], v[4:5]
-; GFX1100-NEXT:    s_setpc_b64 s[30:31]
+; GFX7-SDAG-LABEL: fmul_select_f64_test11:
+; GFX7-SDAG:       ; %bb.0:
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-SDAG-NEXT:    v_bfrev_b32_e32 v4, 1
+; GFX7-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX7-SDAG-NEXT:    v_cndmask_b32_e64 v3, v4, -2.0, vcc
+; GFX7-SDAG-NEXT:    v_mov_b32_e32 v2, 0
+; GFX7-SDAG-NEXT:    v_mul_f64 v[0:1], v[0:1], v[2:3]
+; GFX7-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-GISEL-LABEL: fmul_select_f64_test11:
+; GFX7-GISEL:       ; %bb.0:
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-GISEL-NEXT:    v_bfrev_b32_e32 v5, 1
+; GFX7-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX7-GISEL-NEXT:    v_mov_b32_e32 v4, 0
+; GFX7-GISEL-NEXT:    v_cndmask_b32_e64 v5, v5, -2.0, vcc
+; GFX7-GISEL-NEXT:    v_mul_f64 v[0:1], v[0:1], v[4:5]
+; GFX7-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-SDAG-LABEL: fmul_select_f64_test11:
+; GFX9-SDAG:       ; %bb.0:
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT:    v_bfrev_b32_e32 v4, 1
+; GFX9-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX9-SDAG-NEXT:    v_cndmask_b32_e64 v3, v4, -2.0, vcc
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-SDAG-NEXT:    v_mul_f64 v[0:1], v[0:1], v[2:3]
+; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: fmul_select_f64_test11:
+; GFX9-GISEL:       ; %bb.0:
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:    v_bfrev_b32_e32 v5, 1
+; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v4, 0
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v5, v5, -2.0, vcc
+; GFX9-GISEL-NEXT:    v_mul_f64 v[0:1], v[0:1], v[4:5]
+; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: fmul_select_f64_test11:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX10-NEXT:    v_mov_b32_e32 v4, 0
+; GFX10-NEXT:    v_cndmask_b32_e64 v5, 0x80000000, -2.0, vcc_lo
+; GFX10-NEXT:    v_mul_f64 v[0:1], v[0:1], v[4:5]
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: fmul_select_f64_test11:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-NEXT:    v_mov_b32_e32 v4, 0
+; GFX11-NEXT:    v_cndmask_b32_e64 v5, 0x80000000, -2.0, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_mul_f64 v[0:1], v[0:1], v[4:5]
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %bool = icmp eq i32 %bool.arg1, %bool.arg2
   %y = select i1 %bool, double -2.000000e+00, double -0.000000e+00
   %ldexp = fmul double %x, %y
@@ -1018,45 +1278,84 @@ define double @fmul_select_f64_test11(double %x, i32 %bool.arg1, i32 %bool.arg2)
 }
 
 define double @fmul_select_f64_test12(double %x, i32 %bool.arg1, i32 %bool.arg2) {
-; GFX7-LABEL: fmul_select_f64_test12:
-; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_cmp_ne_u32_e32 vcc, v2, v3
-; GFX7-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 31, v2
-; GFX7-NEXT:    v_mov_b32_e32 v2, 0
-; GFX7-NEXT:    v_mul_f64 v[0:1], v[0:1], v[2:3]
-; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: fmul_select_f64_test12:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, v2, v3
-; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 31, v2
-; GFX9-NEXT:    v_mov_b32_e32 v2, 0
-; GFX9-NEXT:    v_mul_f64 v[0:1], v[0:1], v[2:3]
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX1030-LABEL: fmul_select_f64_test12:
-; GFX1030:       ; %bb.0:
-; GFX1030-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1030-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v2, v3
-; GFX1030-NEXT:    v_mov_b32_e32 v2, 0
-; GFX1030-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc_lo
-; GFX1030-NEXT:    v_lshlrev_b32_e32 v3, 31, v3
-; GFX1030-NEXT:    v_mul_f64 v[0:1], v[0:1], v[2:3]
-; GFX1030-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX1100-LABEL: fmul_select_f64_test12:
-; GFX1100:       ; %bb.0:
-; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v2, v3
-; GFX1100-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc_lo
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 31, v3
-; GFX1100-NEXT:    v_mul_f64 v[0:1], v[0:1], v[2:3]
-; GFX1100-NEXT:    s_setpc_b64 s[30:31]
+; GFX7-SDAG-LABEL: fmul_select_f64_test12:
+; GFX7-SDAG:       ; %bb.0:
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-SDAG-NEXT:    v_cmp_ne_u32_e32 vcc, v2, v3
+; GFX7-SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; GFX7-SDAG-NEXT:    v_lshlrev_b32_e32 v3, 31, v2
+; GFX7-SDAG-NEXT:    v_mov_b32_e32 v2, 0
+; GFX7-SDAG-NEXT:    v_mul_f64 v[0:1], v[0:1], v[2:3]
+; GFX7-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-GISEL-LABEL: fmul_select_f64_test12:
+; GFX7-GISEL:       ; %bb.0:
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-GISEL-NEXT:    v_bfrev_b32_e32 v5, 1
+; GFX7-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX7-GISEL-NEXT:    v_mov_b32_e32 v4, 0
+; GFX7-GISEL-NEXT:    v_cndmask_b32_e64 v5, v5, 0, vcc
+; GFX7-GISEL-NEXT:    v_mul_f64 v[0:1], v[0:1], v[4:5]
+; GFX7-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-SDAG-LABEL: fmul_select_f64_test12:
+; GFX9-SDAG:       ; %bb.0:
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT:    v_cmp_ne_u32_e32 vcc, v2, v3
+; GFX9-SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; GFX9-SDAG-NEXT:    v_lshlrev_b32_e32 v3, 31, v2
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-SDAG-NEXT:    v_mul_f64 v[0:1], v[0:1], v[2:3]
+; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: fmul_select_f64_test12:
+; GFX9-GISEL:       ; %bb.0:
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:    v_bfrev_b32_e32 v5, 1
+; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v4, 0
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v5, v5, 0, vcc
+; GFX9-GISEL-NEXT:    v_mul_f64 v[0:1], v[0:1], v[4:5]
+; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-SDAG-LABEL: fmul_select_f64_test12:
+; GFX10-SDAG:       ; %bb.0:
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SDAG-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v2, v3
+; GFX10-SDAG-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-SDAG-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc_lo
+; GFX10-SDAG-NEXT:    v_lshlrev_b32_e32 v3, 31, v3
+; GFX10-SDAG-NEXT:    v_mul_f64 v[0:1], v[0:1], v[2:3]
+; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-GISEL-LABEL: fmul_select_f64_test12:
+; GFX10-GISEL:       ; %bb.0:
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX10-GISEL-NEXT:    v_mov_b32_e32 v4, 0
+; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v5, 0x80000000, 0, vcc_lo
+; GFX10-GISEL-NEXT:    v_mul_f64 v[0:1], v[0:1], v[4:5]
+; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: fmul_select_f64_test12:
+; GFX11-SDAG:       ; %bb.0:
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v2, v3
+; GFX11-SDAG-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc_lo
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 31, v3
+; GFX11-SDAG-NEXT:    v_mul_f64 v[0:1], v[0:1], v[2:3]
+; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: fmul_select_f64_test12:
+; GFX11-GISEL:       ; %bb.0:
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-GISEL-NEXT:    v_mov_b32_e32 v4, 0
+; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v5, 0x80000000, 0, vcc_lo
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_mul_f64 v[0:1], v[0:1], v[4:5]
+; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %bool = icmp eq i32 %bool.arg1, %bool.arg2
   %y = select i1 %bool, double 0.000000e+00, double -0.000000e+00
   %ldexp = fmul double %x, %y
@@ -1084,24 +1383,24 @@ define double @fmul_select_f64_test13(double %x, i32 %bool.arg1, i32 %bool.arg2)
 ; GFX9-NEXT:    v_mul_f64 v[0:1], v[0:1], v[4:5]
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX1030-LABEL: fmul_select_f64_test13:
-; GFX1030:       ; %bb.0:
-; GFX1030-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1030-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX1030-NEXT:    v_mov_b32_e32 v4, 0
-; GFX1030-NEXT:    v_cndmask_b32_e64 v5, 0x40300000, 0, vcc_lo
-; GFX1030-NEXT:    v_mul_f64 v[0:1], v[0:1], v[4:5]
-; GFX1030-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX1100-LABEL: fmul_select_f64_test13:
-; GFX1100:       ; %bb.0:
-; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX1100-NEXT:    v_mov_b32_e32 v4, 0
-; GFX1100-NEXT:    v_cndmask_b32_e64 v5, 0x40300000, 0, vcc_lo
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1100-NEXT:    v_mul_f64 v[0:1], v[0:1], v[4:5]
-; GFX1100-NEXT:    s_setpc_b64 s[30:31]
+; GFX10-LABEL: fmul_select_f64_test13:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX10-NEXT:    v_mov_b32_e32 v4, 0
+; GFX10-NEXT:    v_cndmask_b32_e64 v5, 0x40300000, 0, vcc_lo
+; GFX10-NEXT:    v_mul_f64 v[0:1], v[0:1], v[4:5]
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: fmul_select_f64_test13:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-NEXT:    v_mov_b32_e32 v4, 0
+; GFX11-NEXT:    v_cndmask_b32_e64 v5, 0x40300000, 0, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_mul_f64 v[0:1], v[0:1], v[4:5]
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %bool = icmp eq i32 %bool.arg1, %bool.arg2
   %y = select i1 %bool, double 0.000000e+00, double 1.600000e+01
   %ldexp = fmul double %x, %y
@@ -1109,44 +1408,83 @@ define double @fmul_select_f64_test13(double %x, i32 %bool.arg1, i32 %bool.arg2)
 }
 
 define double @fmul_select_f64_test14_sel_log2val_pos92_neg27(double %x, i32 %bool.arg1, i32 %bool.arg2) {
-; GFX7-LABEL: fmul_select_f64_test14_sel_log2val_pos92_neg27:
-; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_not_b32_e32 v4, 26
-; GFX7-NEXT:    v_mov_b32_e32 v5, 0x5c
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
-; GFX7-NEXT:    v_cndmask_b32_e32 v2, v4, v5, vcc
-; GFX7-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
-; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: fmul_select_f64_test14_sel_log2val_pos92_neg27:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_not_b32_e32 v4, 26
-; GFX9-NEXT:    v_mov_b32_e32 v5, 0x5c
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, v4, v5, vcc
-; GFX9-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX1030-LABEL: fmul_select_f64_test14_sel_log2val_pos92_neg27:
-; GFX1030:       ; %bb.0:
-; GFX1030-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1030-NEXT:    v_mov_b32_e32 v4, 0x5c
-; GFX1030-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX1030-NEXT:    v_cndmask_b32_e32 v2, 0xffffffe5, v4, vcc_lo
-; GFX1030-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
-; GFX1030-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX1100-LABEL: fmul_select_f64_test14_sel_log2val_pos92_neg27:
-; GFX1100:       ; %bb.0:
-; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-NEXT:    v_mov_b32_e32 v4, 0x5c
-; GFX1100-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-NEXT:    v_cndmask_b32_e32 v2, 0xffffffe5, v4, vcc_lo
-; GFX1100-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
-; GFX1100-NEXT:    s_setpc_b64 s[30:31]
+; GFX7-SDAG-LABEL: fmul_select_f64_test14_sel_log2val_pos92_neg27:
+; GFX7-SDAG:       ; %bb.0:
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-SDAG-NEXT:    v_not_b32_e32 v4, 26
+; GFX7-SDAG-NEXT:    v_mov_b32_e32 v5, 0x5c
+; GFX7-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX7-SDAG-NEXT:    v_cndmask_b32_e32 v2, v4, v5, vcc
+; GFX7-SDAG-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; GFX7-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-GISEL-LABEL: fmul_select_f64_test14_sel_log2val_pos92_neg27:
+; GFX7-GISEL:       ; %bb.0:
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-GISEL-NEXT:    v_mov_b32_e32 v4, 0x5c
+; GFX7-GISEL-NEXT:    v_not_b32_e32 v5, 26
+; GFX7-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX7-GISEL-NEXT:    v_cndmask_b32_e32 v2, v5, v4, vcc
+; GFX7-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; GFX7-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-SDAG-LABEL: fmul_select_f64_test14_sel_log2val_pos92_neg27:
+; GFX9-SDAG:       ; %bb.0:
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT:    v_not_b32_e32 v4, 26
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v5, 0x5c
+; GFX9-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX9-SDAG-NEXT:    v_cndmask_b32_e32 v2, v4, v5, vcc
+; GFX9-SDAG-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: fmul_select_f64_test14_sel_log2val_pos92_neg27:
+; GFX9-GISEL:       ; %bb.0:
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v4, 0x5c
+; GFX9-GISEL-NEXT:    v_not_b32_e32 v5, 26
+; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v2, v5, v4, vcc
+; GFX9-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-SDAG-LABEL: fmul_select_f64_test14_sel_log2val_pos92_neg27:
+; GFX10-SDAG:       ; %bb.0:
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SDAG-NEXT:    v_mov_b32_e32 v4, 0x5c
+; GFX10-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX10-SDAG-NEXT:    v_cndmask_b32_e32 v2, 0xffffffe5, v4, vcc_lo
+; GFX10-SDAG-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-GISEL-LABEL: fmul_select_f64_test14_sel_log2val_pos92_neg27:
+; GFX10-GISEL:       ; %bb.0:
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-GISEL-NEXT:    v_not_b32_e32 v4, 26
+; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v2, v4, 0x5c, vcc_lo
+; GFX10-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: fmul_select_f64_test14_sel_log2val_pos92_neg27:
+; GFX11-SDAG:       ; %bb.0:
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT:    v_mov_b32_e32 v4, 0x5c
+; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_cndmask_b32_e32 v2, 0xffffffe5, v4, vcc_lo
+; GFX11-SDAG-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: fmul_select_f64_test14_sel_log2val_pos92_neg27:
+; GFX11-GISEL:       ; %bb.0:
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT:    v_not_b32_e32 v4, 26
+; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v2, v4, 0x5c, vcc_lo
+; GFX11-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %bool = icmp eq i32 %bool.arg1, %bool.arg2
   %y = select i1 %bool, double 0x45B0000000000000, double 0x3E40000000000000
   %ldexp = fmul double %x, %y
@@ -1154,44 +1492,83 @@ define double @fmul_select_f64_test14_sel_log2val_pos92_neg27(double %x, i32 %bo
 }
 
 define double @fmul_select_f64_test15_sel_log2val_neg42_neg33(double %x, i32 %bool.arg1, i32 %bool.arg2) {
-; GFX7-LABEL: fmul_select_f64_test15_sel_log2val_neg42_neg33:
-; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_not_b32_e32 v4, 32
-; GFX7-NEXT:    v_not_b32_e32 v5, 41
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
-; GFX7-NEXT:    v_cndmask_b32_e32 v2, v4, v5, vcc
-; GFX7-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
-; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: fmul_select_f64_test15_sel_log2val_neg42_neg33:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_not_b32_e32 v4, 32
-; GFX9-NEXT:    v_not_b32_e32 v5, 41
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, v4, v5, vcc
-; GFX9-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX1030-LABEL: fmul_select_f64_test15_sel_log2val_neg42_neg33:
-; GFX1030:       ; %bb.0:
-; GFX1030-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1030-NEXT:    v_not_b32_e32 v4, 41
-; GFX1030-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX1030-NEXT:    v_cndmask_b32_e32 v2, 0xffffffdf, v4, vcc_lo
-; GFX1030-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
-; GFX1030-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX1100-LABEL: fmul_select_f64_test15_sel_log2val_neg42_neg33:
-; GFX1100:       ; %bb.0:
-; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-NEXT:    v_not_b32_e32 v4, 41
-; GFX1100-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-NEXT:    v_cndmask_b32_e32 v2, 0xffffffdf, v4, vcc_lo
-; GFX1100-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
-; GFX1100-NEXT:    s_setpc_b64 s[30:31]
+; GFX7-SDAG-LABEL: fmul_select_f64_test15_sel_log2val_neg42_neg33:
+; GFX7-SDAG:       ; %bb.0:
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-SDAG-NEXT:    v_not_b32_e32 v4, 32
+; GFX7-SDAG-NEXT:    v_not_b32_e32 v5, 41
+; GFX7-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX7-SDAG-NEXT:    v_cndmask_b32_e32 v2, v4, v5, vcc
+; GFX7-SDAG-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; GFX7-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-GISEL-LABEL: fmul_select_f64_test15_sel_log2val_neg42_neg33:
+; GFX7-GISEL:       ; %bb.0:
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-GISEL-NEXT:    v_not_b32_e32 v4, 41
+; GFX7-GISEL-NEXT:    v_not_b32_e32 v5, 32
+; GFX7-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX7-GISEL-NEXT:    v_cndmask_b32_e32 v2, v5, v4, vcc
+; GFX7-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; GFX7-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-SDAG-LABEL: fmul_select_f64_test15_sel_log2val_neg42_neg33:
+; GFX9-SDAG:       ; %bb.0:
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT:    v_not_b32_e32 v4, 32
+; GFX9-SDAG-NEXT:    v_not_b32_e32 v5, 41
+; GFX9-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX9-SDAG-NEXT:    v_cndmask_b32_e32 v2, v4, v5, vcc
+; GFX9-SDAG-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: fmul_select_f64_test15_sel_log2val_neg42_neg33:
+; GFX9-GISEL:       ; %bb.0:
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:    v_not_b32_e32 v4, 41
+; GFX9-GISEL-NEXT:    v_not_b32_e32 v5, 32
+; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v2, v5, v4, vcc
+; GFX9-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-SDAG-LABEL: fmul_select_f64_test15_sel_log2val_neg42_neg33:
+; GFX10-SDAG:       ; %bb.0:
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SDAG-NEXT:    v_not_b32_e32 v4, 41
+; GFX10-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX10-SDAG-NEXT:    v_cndmask_b32_e32 v2, 0xffffffdf, v4, vcc_lo
+; GFX10-SDAG-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-GISEL-LABEL: fmul_select_f64_test15_sel_log2val_neg42_neg33:
+; GFX10-GISEL:       ; %bb.0:
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-GISEL-NEXT:    v_not_b32_e32 v4, 32
+; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v2, v4, 0xffffffd6, vcc_lo
+; GFX10-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: fmul_select_f64_test15_sel_log2val_neg42_neg33:
+; GFX11-SDAG:       ; %bb.0:
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT:    v_not_b32_e32 v4, 41
+; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_cndmask_b32_e32 v2, 0xffffffdf, v4, vcc_lo
+; GFX11-SDAG-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: fmul_select_f64_test15_sel_log2val_neg42_neg33:
+; GFX11-GISEL:       ; %bb.0:
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT:    v_not_b32_e32 v4, 32
+; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v2, v4, 0xffffffd6, vcc_lo
+; GFX11-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %bool = icmp eq i32 %bool.arg1, %bool.arg2
   %y = select i1 %bool, double 0x3D50000000000000, double 0x3DE0000000000000
   %ldexp = fmul double %x, %y
@@ -1200,40 +1577,82 @@ define double @fmul_select_f64_test15_sel_log2val_neg42_neg33(double %x, i32 %bo
 
 
 define half @fmul_select_f16_test1(half %x, i32 %bool.arg1, i32 %bool.arg2) {
-; GFX7-LABEL: fmul_select_f16_test1:
-; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX7-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX7-NEXT:    v_ldexp_f32_e32 v0, v0, v1
-; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: fmul_select_f16_test1:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
-; GFX9-NEXT:    v_ldexp_f16_e32 v0, v0, v1
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX1030-LABEL: fmul_select_f16_test1:
-; GFX1030:       ; %bb.0:
-; GFX1030-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1030-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX1030-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
-; GFX1030-NEXT:    v_ldexp_f16_e32 v0, v0, v1
-; GFX1030-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX1100-LABEL: fmul_select_f16_test1:
-; GFX1100:       ; %bb.0:
-; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX1100-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1100-NEXT:    v_ldexp_f16_e32 v0, v0, v1
-; GFX1100-NEXT:    s_setpc_b64 s[30:31]
+; GFX7-SDAG-LABEL: fmul_select_f16_test1:
+; GFX7-SDAG:       ; %bb.0:
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-SDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX7-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX7-SDAG-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX7-SDAG-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-SDAG-NEXT:    v_ldexp_f32_e32 v0, v0, v1
+; GFX7-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-GISEL-LABEL: fmul_select_f16_test1:
+; GFX7-GISEL:       ; %bb.0:
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX7-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX7-GISEL-NEXT:    v_ldexp_f32_e32 v0, v0, v1
+; GFX7-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX7-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-SDAG-LABEL: fmul_select_f16_test1:
+; GFX9-SDAG:       ; %bb.0:
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX9-SDAG-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX9-SDAG-NEXT:    v_ldexp_f16_e32 v0, v0, v1
+; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: fmul_select_f16_test1:
+; GFX9-GISEL:       ; %bb.0:
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0xffff8000
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0x7fff
+; GFX9-GISEL-NEXT:    v_med3_i32 v1, v1, v2, v3
+; GFX9-GISEL-NEXT:    v_ldexp_f16_e32 v0, v0, v1
+; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-SDAG-LABEL: fmul_select_f16_test1:
+; GFX10-SDAG:       ; %bb.0:
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX10-SDAG-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
+; GFX10-SDAG-NEXT:    v_ldexp_f16_e32 v0, v0, v1
+; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-GISEL-LABEL: fmul_select_f16_test1:
+; GFX10-GISEL:       ; %bb.0:
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX10-GISEL-NEXT:    v_mov_b32_e32 v2, 0x7fff
+; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
+; GFX10-GISEL-NEXT:    v_med3_i32 v1, 0xffff8000, v1, v2
+; GFX10-GISEL-NEXT:    v_ldexp_f16_e32 v0, v0, v1
+; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: fmul_select_f16_test1:
+; GFX11-SDAG:       ; %bb.0:
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-SDAG-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_ldexp_f16_e32 v0, v0, v1
+; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: fmul_select_f16_test1:
+; GFX11-GISEL:       ; %bb.0:
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-GISEL-NEXT:    v_mov_b32_e32 v2, 0x7fff
+; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_med3_i32 v1, 0xffff8000, v1, v2
+; GFX11-GISEL-NEXT:    v_ldexp_f16_e32 v0, v0, v1
+; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %bool = icmp eq i32 %bool.arg1, %bool.arg2
   %y = select i1 %bool, half 2.000000e+00, half 1.000000e+00
   %ldexp = fmul half %x, %y
@@ -1241,47 +1660,89 @@ define half @fmul_select_f16_test1(half %x, i32 %bool.arg1, i32 %bool.arg2) {
 }
 
 define half @fmul_select_f16_test2(half %x, i32 %bool.arg1, i32 %bool.arg2) {
-; GFX7-LABEL: fmul_select_f16_test2:
-; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX7-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX7-NEXT:    v_ldexp_f32_e32 v0, v0, v1
-; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: fmul_select_f16_test2:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc
-; GFX9-NEXT:    s_movk_i32 s4, 0x8000
-; GFX9-NEXT:    v_mov_b32_e32 v2, 0x7fff
-; GFX9-NEXT:    v_med3_i32 v1, v1, s4, v2
-; GFX9-NEXT:    v_ldexp_f16_e32 v0, v0, v1
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX1030-LABEL: fmul_select_f16_test2:
-; GFX1030:       ; %bb.0:
-; GFX1030-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1030-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX1030-NEXT:    s_movk_i32 s4, 0x8000
-; GFX1030-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc_lo
-; GFX1030-NEXT:    v_med3_i32 v1, v1, s4, 0x7fff
-; GFX1030-NEXT:    v_ldexp_f16_e32 v0, v0, v1
-; GFX1030-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX1100-LABEL: fmul_select_f16_test2:
-; GFX1100:       ; %bb.0:
-; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX1100-NEXT:    s_movk_i32 s0, 0x8000
-; GFX1100-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc_lo
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-NEXT:    v_med3_i32 v1, v1, s0, 0x7fff
-; GFX1100-NEXT:    v_ldexp_f16_e32 v0, v0, v1
-; GFX1100-NEXT:    s_setpc_b64 s[30:31]
+; GFX7-SDAG-LABEL: fmul_select_f16_test2:
+; GFX7-SDAG:       ; %bb.0:
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-SDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX7-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX7-SDAG-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc
+; GFX7-SDAG-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-SDAG-NEXT:    v_ldexp_f32_e32 v0, v0, v1
+; GFX7-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-GISEL-LABEL: fmul_select_f16_test2:
+; GFX7-GISEL:       ; %bb.0:
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX7-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc
+; GFX7-GISEL-NEXT:    v_ldexp_f32_e32 v0, v0, v1
+; GFX7-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX7-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-SDAG-LABEL: fmul_select_f16_test2:
+; GFX9-SDAG:       ; %bb.0:
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX9-SDAG-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc
+; GFX9-SDAG-NEXT:    s_movk_i32 s4, 0x8000
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v2, 0x7fff
+; GFX9-SDAG-NEXT:    v_med3_i32 v1, v1, s4, v2
+; GFX9-SDAG-NEXT:    v_ldexp_f16_e32 v0, v0, v1
+; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: fmul_select_f16_test2:
+; GFX9-GISEL:       ; %bb.0:
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0xffff8000
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0x7fff
+; GFX9-GISEL-NEXT:    v_med3_i32 v1, v1, v2, v3
+; GFX9-GISEL-NEXT:    v_ldexp_f16_e32 v0, v0, v1
+; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-SDAG-LABEL: fmul_select_f16_test2:
+; GFX10-SDAG:       ; %bb.0:
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX10-SDAG-NEXT:    s_movk_i32 s4, 0x8000
+; GFX10-SDAG-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc_lo
+; GFX10-SDAG-NEXT:    v_med3_i32 v1, v1, s4, 0x7fff
+; GFX10-SDAG-NEXT:    v_ldexp_f16_e32 v0, v0, v1
+; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-GISEL-LABEL: fmul_select_f16_test2:
+; GFX10-GISEL:       ; %bb.0:
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX10-GISEL-NEXT:    v_mov_b32_e32 v2, 0x7fff
+; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc_lo
+; GFX10-GISEL-NEXT:    v_med3_i32 v1, 0xffff8000, v1, v2
+; GFX10-GISEL-NEXT:    v_ldexp_f16_e32 v0, v0, v1
+; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: fmul_select_f16_test2:
+; GFX11-SDAG:       ; %bb.0:
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-SDAG-NEXT:    s_movk_i32 s0, 0x8000
+; GFX11-SDAG-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc_lo
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_med3_i32 v1, v1, s0, 0x7fff
+; GFX11-SDAG-NEXT:    v_ldexp_f16_e32 v0, v0, v1
+; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: fmul_select_f16_test2:
+; GFX11-GISEL:       ; %bb.0:
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-GISEL-NEXT:    v_mov_b32_e32 v2, 0x7fff
+; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc_lo
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_med3_i32 v1, 0xffff8000, v1, v2
+; GFX11-GISEL-NEXT:    v_ldexp_f16_e32 v0, v0, v1
+; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %bool = icmp eq i32 %bool.arg1, %bool.arg2
   %y = select i1 %bool, half 5.000000e-01, half 1.000000e+00
   %ldexp = fmul half %x, %y
@@ -1289,59 +1750,126 @@ define half @fmul_select_f16_test2(half %x, i32 %bool.arg1, i32 %bool.arg2) {
 }
 
 define <2 x half> @fmul_select_v2f16_test3(<2 x half> %x, <2 x i32> %bool.arg1, <2 x i32> %bool.arg2) {
-; GFX7-LABEL: fmul_select_v2f16_test3:
-; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
-; GFX7-NEXT:    v_cndmask_b32_e64 v3, 1.0, 2.0, vcc
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v4
-; GFX7-NEXT:    v_cndmask_b32_e64 v2, 1.0, 2.0, vcc
-; GFX7-NEXT:    v_mul_f32_e32 v1, v1, v3
-; GFX7-NEXT:    v_mul_f32_e32 v0, v0, v2
-; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: fmul_select_v2f16_test3:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v5, 0x3c00
-; GFX9-NEXT:    v_mov_b32_e32 v6, 0x4000
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v4
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, v5, v6, vcc
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v3
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v6, vcc
-; GFX9-NEXT:    v_pack_b32_f16 v1, v1, v2
-; GFX9-NEXT:    v_pk_mul_f16 v0, v0, v1
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX1030-LABEL: fmul_select_v2f16_test3:
-; GFX1030:       ; %bb.0:
-; GFX1030-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1030-NEXT:    v_mov_b32_e32 v5, 0x4000
-; GFX1030-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v4
-; GFX1030-NEXT:    v_cndmask_b32_e32 v2, 0x3c00, v5, vcc_lo
-; GFX1030-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v3
-; GFX1030-NEXT:    v_cndmask_b32_e32 v1, 0x3c00, v5, vcc_lo
-; GFX1030-NEXT:    v_pack_b32_f16 v1, v1, v2
-; GFX1030-NEXT:    v_pk_mul_f16 v0, v0, v1
-; GFX1030-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX1100-LABEL: fmul_select_v2f16_test3:
-; GFX1100:       ; %bb.0:
-; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-NEXT:    v_mov_b32_e32 v5, 0x4000
-; GFX1100-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v4
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX1100-NEXT:    v_cndmask_b32_e32 v2, 0x3c00, v5, vcc_lo
-; GFX1100-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v3
-; GFX1100-NEXT:    v_cndmask_b32_e32 v1, 0x3c00, v5, vcc_lo
-; GFX1100-NEXT:    v_pack_b32_f16 v1, v1, v2
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1100-NEXT:    v_pk_mul_f16 v0, v0, v1
-; GFX1100-NEXT:    s_setpc_b64 s[30:31]
+; GFX7-SDAG-LABEL: fmul_select_v2f16_test3:
+; GFX7-SDAG:       ; %bb.0:
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-SDAG-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX7-SDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX7-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX7-SDAG-NEXT:    v_cndmask_b32_e64 v3, 1.0, 2.0, vcc
+; GFX7-SDAG-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GFX7-SDAG-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v4
+; GFX7-SDAG-NEXT:    v_cndmask_b32_e64 v2, 1.0, 2.0, vcc
+; GFX7-SDAG-NEXT:    v_mul_f32_e32 v1, v1, v3
+; GFX7-SDAG-NEXT:    v_mul_f32_e32 v0, v0, v2
+; GFX7-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-GISEL-LABEL: fmul_select_v2f16_test3:
+; GFX7-GISEL:       ; %bb.0:
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-GISEL-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GFX7-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v4
+; GFX7-GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; GFX7-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX7-GISEL-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
+; GFX7-GISEL-NEXT:    v_ldexp_f32_e32 v0, v0, v2
+; GFX7-GISEL-NEXT:    v_ldexp_f32_e32 v1, v1, v3
+; GFX7-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX7-GISEL-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX7-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-SDAG-LABEL: fmul_select_v2f16_test3:
+; GFX9-SDAG:       ; %bb.0:
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v5, 0x3c00
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v6, 0x4000
+; GFX9-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v4
+; GFX9-SDAG-NEXT:    v_cndmask_b32_e32 v2, v5, v6, vcc
+; GFX9-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v3
+; GFX9-SDAG-NEXT:    v_cndmask_b32_e32 v1, v5, v6, vcc
+; GFX9-SDAG-NEXT:    v_pack_b32_f16 v1, v1, v2
+; GFX9-SDAG-NEXT:    v_pk_mul_f16 v0, v0, v1
+; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: fmul_select_v2f16_test3:
+; GFX9-GISEL:       ; %bb.0:
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v3
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v4
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0xffff8000
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v4, 0x7fff
+; GFX9-GISEL-NEXT:    v_med3_i32 v1, v1, v3, v4
+; GFX9-GISEL-NEXT:    v_med3_i32 v2, v2, v3, v4
+; GFX9-GISEL-NEXT:    v_ldexp_f16_e32 v1, v0, v1
+; GFX9-GISEL-NEXT:    v_ldexp_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-GISEL-NEXT:    v_lshl_or_b32 v0, v0, 16, v1
+; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-SDAG-LABEL: fmul_select_v2f16_test3:
+; GFX10-SDAG:       ; %bb.0:
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SDAG-NEXT:    v_mov_b32_e32 v5, 0x4000
+; GFX10-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v4
+; GFX10-SDAG-NEXT:    v_cndmask_b32_e32 v2, 0x3c00, v5, vcc_lo
+; GFX10-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v3
+; GFX10-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0x3c00, v5, vcc_lo
+; GFX10-SDAG-NEXT:    v_pack_b32_f16 v1, v1, v2
+; GFX10-SDAG-NEXT:    v_pk_mul_f16 v0, v0, v1
+; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-GISEL-LABEL: fmul_select_v2f16_test3:
+; GFX10-GISEL:       ; %bb.0:
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v3
+; GFX10-GISEL-NEXT:    v_mov_b32_e32 v3, 0x7fff
+; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
+; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v4
+; GFX10-GISEL-NEXT:    v_med3_i32 v1, 0xffff8000, v1, v3
+; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc_lo
+; GFX10-GISEL-NEXT:    v_ldexp_f16_e32 v1, v0, v1
+; GFX10-GISEL-NEXT:    v_med3_i32 v2, 0xffff8000, v2, v3
+; GFX10-GISEL-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX10-GISEL-NEXT:    v_ldexp_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-GISEL-NEXT:    v_lshl_or_b32 v0, v0, 16, v1
+; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: fmul_select_v2f16_test3:
+; GFX11-SDAG:       ; %bb.0:
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT:    v_mov_b32_e32 v5, 0x4000
+; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v4
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_cndmask_b32_e32 v2, 0x3c00, v5, vcc_lo
+; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v3
+; GFX11-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0x3c00, v5, vcc_lo
+; GFX11-SDAG-NEXT:    v_pack_b32_f16 v1, v1, v2
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_pk_mul_f16 v0, v0, v1
+; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: fmul_select_v2f16_test3:
+; GFX11-GISEL:       ; %bb.0:
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v3
+; GFX11-GISEL-NEXT:    v_mov_b32_e32 v3, 0x7fff
+; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
+; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v4
+; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v4, 16, v0
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-GISEL-NEXT:    v_med3_i32 v1, 0xffff8000, v1, v3
+; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc_lo
+; GFX11-GISEL-NEXT:    v_ldexp_f16_e32 v0, v0, v1
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-GISEL-NEXT:    v_med3_i32 v2, 0xffff8000, v2, v3
+; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_ldexp_f16_e32 v1, v4, v2
+; GFX11-GISEL-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %bool = icmp eq <2 x i32> %bool.arg1, %bool.arg2
   %y = select <2 x i1> %bool, <2 x half> <half 2.000000e+00, half 2.000000e+00>, <2 x half> <half 1.000000e+00, half 1.000000e+00>
   %ldexp = fmul <2 x half> %x, %y
@@ -1349,59 +1877,126 @@ define <2 x half> @fmul_select_v2f16_test3(<2 x half> %x, <2 x i32> %bool.arg1,
 }
 
 define <2 x half> @fmul_select_v2f16_test4(<2 x half> %x, <2 x i32> %bool.arg1, <2 x i32> %bool.arg2) {
-; GFX7-LABEL: fmul_select_v2f16_test4:
-; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
-; GFX7-NEXT:    v_cndmask_b32_e64 v3, 1.0, 0.5, vcc
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v4
-; GFX7-NEXT:    v_cndmask_b32_e64 v2, 1.0, 0.5, vcc
-; GFX7-NEXT:    v_mul_f32_e32 v1, v1, v3
-; GFX7-NEXT:    v_mul_f32_e32 v0, v0, v2
-; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: fmul_select_v2f16_test4:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v5, 0x3c00
-; GFX9-NEXT:    v_mov_b32_e32 v6, 0x3800
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v4
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, v5, v6, vcc
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v3
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v6, vcc
-; GFX9-NEXT:    v_pack_b32_f16 v1, v1, v2
-; GFX9-NEXT:    v_pk_mul_f16 v0, v0, v1
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX1030-LABEL: fmul_select_v2f16_test4:
-; GFX1030:       ; %bb.0:
-; GFX1030-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1030-NEXT:    v_mov_b32_e32 v5, 0x3800
-; GFX1030-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v4
-; GFX1030-NEXT:    v_cndmask_b32_e32 v2, 0x3c00, v5, vcc_lo
-; GFX1030-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v3
-; GFX1030-NEXT:    v_cndmask_b32_e32 v1, 0x3c00, v5, vcc_lo
-; GFX1030-NEXT:    v_pack_b32_f16 v1, v1, v2
-; GFX1030-NEXT:    v_pk_mul_f16 v0, v0, v1
-; GFX1030-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX1100-LABEL: fmul_select_v2f16_test4:
-; GFX1100:       ; %bb.0:
-; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-NEXT:    v_mov_b32_e32 v5, 0x3800
-; GFX1100-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v4
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX1100-NEXT:    v_cndmask_b32_e32 v2, 0x3c00, v5, vcc_lo
-; GFX1100-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v3
-; GFX1100-NEXT:    v_cndmask_b32_e32 v1, 0x3c00, v5, vcc_lo
-; GFX1100-NEXT:    v_pack_b32_f16 v1, v1, v2
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1100-NEXT:    v_pk_mul_f16 v0, v0, v1
-; GFX1100-NEXT:    s_setpc_b64 s[30:31]
+; GFX7-SDAG-LABEL: fmul_select_v2f16_test4:
+; GFX7-SDAG:       ; %bb.0:
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-SDAG-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX7-SDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX7-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX7-SDAG-NEXT:    v_cndmask_b32_e64 v3, 1.0, 0.5, vcc
+; GFX7-SDAG-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GFX7-SDAG-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v4
+; GFX7-SDAG-NEXT:    v_cndmask_b32_e64 v2, 1.0, 0.5, vcc
+; GFX7-SDAG-NEXT:    v_mul_f32_e32 v1, v1, v3
+; GFX7-SDAG-NEXT:    v_mul_f32_e32 v0, v0, v2
+; GFX7-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-GISEL-LABEL: fmul_select_v2f16_test4:
+; GFX7-GISEL:       ; %bb.0:
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-GISEL-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GFX7-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v4
+; GFX7-GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, -1, vcc
+; GFX7-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX7-GISEL-NEXT:    v_cndmask_b32_e64 v3, 0, -1, vcc
+; GFX7-GISEL-NEXT:    v_ldexp_f32_e32 v0, v0, v2
+; GFX7-GISEL-NEXT:    v_ldexp_f32_e32 v1, v1, v3
+; GFX7-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX7-GISEL-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX7-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-SDAG-LABEL: fmul_select_v2f16_test4:
+; GFX9-SDAG:       ; %bb.0:
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v5, 0x3c00
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v6, 0x3800
+; GFX9-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v4
+; GFX9-SDAG-NEXT:    v_cndmask_b32_e32 v2, v5, v6, vcc
+; GFX9-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v3
+; GFX9-SDAG-NEXT:    v_cndmask_b32_e32 v1, v5, v6, vcc
+; GFX9-SDAG-NEXT:    v_pack_b32_f16 v1, v1, v2
+; GFX9-SDAG-NEXT:    v_pk_mul_f16 v0, v0, v1
+; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: fmul_select_v2f16_test4:
+; GFX9-GISEL:       ; %bb.0:
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v3
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc
+; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v4
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, -1, vcc
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0xffff8000
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v4, 0x7fff
+; GFX9-GISEL-NEXT:    v_med3_i32 v1, v1, v3, v4
+; GFX9-GISEL-NEXT:    v_med3_i32 v2, v2, v3, v4
+; GFX9-GISEL-NEXT:    v_ldexp_f16_e32 v1, v0, v1
+; GFX9-GISEL-NEXT:    v_ldexp_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-GISEL-NEXT:    v_lshl_or_b32 v0, v0, 16, v1
+; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-SDAG-LABEL: fmul_select_v2f16_test4:
+; GFX10-SDAG:       ; %bb.0:
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SDAG-NEXT:    v_mov_b32_e32 v5, 0x3800
+; GFX10-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v4
+; GFX10-SDAG-NEXT:    v_cndmask_b32_e32 v2, 0x3c00, v5, vcc_lo
+; GFX10-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v3
+; GFX10-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0x3c00, v5, vcc_lo
+; GFX10-SDAG-NEXT:    v_pack_b32_f16 v1, v1, v2
+; GFX10-SDAG-NEXT:    v_pk_mul_f16 v0, v0, v1
+; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-GISEL-LABEL: fmul_select_v2f16_test4:
+; GFX10-GISEL:       ; %bb.0:
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v3
+; GFX10-GISEL-NEXT:    v_mov_b32_e32 v3, 0x7fff
+; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc_lo
+; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v4
+; GFX10-GISEL-NEXT:    v_med3_i32 v1, 0xffff8000, v1, v3
+; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, -1, vcc_lo
+; GFX10-GISEL-NEXT:    v_ldexp_f16_e32 v1, v0, v1
+; GFX10-GISEL-NEXT:    v_med3_i32 v2, 0xffff8000, v2, v3
+; GFX10-GISEL-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX10-GISEL-NEXT:    v_ldexp_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-GISEL-NEXT:    v_lshl_or_b32 v0, v0, 16, v1
+; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: fmul_select_v2f16_test4:
+; GFX11-SDAG:       ; %bb.0:
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT:    v_mov_b32_e32 v5, 0x3800
+; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v4
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_cndmask_b32_e32 v2, 0x3c00, v5, vcc_lo
+; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v3
+; GFX11-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0x3c00, v5, vcc_lo
+; GFX11-SDAG-NEXT:    v_pack_b32_f16 v1, v1, v2
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_pk_mul_f16 v0, v0, v1
+; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: fmul_select_v2f16_test4:
+; GFX11-GISEL:       ; %bb.0:
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v3
+; GFX11-GISEL-NEXT:    v_mov_b32_e32 v3, 0x7fff
+; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc_lo
+; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v4
+; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v4, 16, v0
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-GISEL-NEXT:    v_med3_i32 v1, 0xffff8000, v1, v3
+; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, -1, vcc_lo
+; GFX11-GISEL-NEXT:    v_ldexp_f16_e32 v0, v0, v1
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-GISEL-NEXT:    v_med3_i32 v2, 0xffff8000, v2, v3
+; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_ldexp_f16_e32 v1, v4, v2
+; GFX11-GISEL-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %bool = icmp eq <2 x i32> %bool.arg1, %bool.arg2
   %y = select <2 x i1> %bool, <2 x half> <half 5.000000e-01, half 5.000000e-01>, <2 x half> <half 1.000000e+00, half 1.000000e+00>
   %ldexp = fmul <2 x half> %x, %y
@@ -1409,15 +2004,25 @@ define <2 x half> @fmul_select_v2f16_test4(<2 x half> %x, <2 x i32> %bool.arg1,
 }
 
 define half @fmul_select_f16_test5(half %x, i32 %bool.arg1, i32 %bool.arg2) {
-; GFX7-LABEL: fmul_select_f16_test5:
-; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX7-NEXT:    v_cndmask_b32_e64 v1, 3, 1, vcc
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX7-NEXT:    v_ldexp_f32_e32 v0, v0, v1
-; GFX7-NEXT:    s_setpc_b64 s[30:31]
+; GFX7-SDAG-LABEL: fmul_select_f16_test5:
+; GFX7-SDAG:       ; %bb.0:
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-SDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX7-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX7-SDAG-NEXT:    v_cndmask_b32_e64 v1, 3, 1, vcc
+; GFX7-SDAG-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-SDAG-NEXT:    v_ldexp_f32_e32 v0, v0, v1
+; GFX7-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-GISEL-LABEL: fmul_select_f16_test5:
+; GFX7-GISEL:       ; %bb.0:
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX7-GISEL-NEXT:    v_cndmask_b32_e64 v1, 3, 1, vcc
+; GFX7-GISEL-NEXT:    v_ldexp_f32_e32 v0, v0, v1
+; GFX7-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX7-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: fmul_select_f16_test5:
 ; GFX9:       ; %bb.0:
@@ -1427,22 +2032,22 @@ define half @fmul_select_f16_test5(half %x, i32 %bool.arg1, i32 %bool.arg2) {
 ; GFX9-NEXT:    v_ldexp_f16_e32 v0, v0, v1
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX1030-LABEL: fmul_select_f16_test5:
-; GFX1030:       ; %bb.0:
-; GFX1030-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1030-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX1030-NEXT:    v_cndmask_b32_e64 v1, 3, 1, vcc_lo
-; GFX1030-NEXT:    v_ldexp_f16_e32 v0, v0, v1
-; GFX1030-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX1100-LABEL: fmul_select_f16_test5:
-; GFX1100:       ; %bb.0:
-; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX1100-NEXT:    v_cndmask_b32_e64 v1, 3, 1, vcc_lo
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1100-NEXT:    v_ldexp_f16_e32 v0, v0, v1
-; GFX1100-NEXT:    s_setpc_b64 s[30:31]
+; GFX10-LABEL: fmul_select_f16_test5:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, 3, 1, vcc_lo
+; GFX10-NEXT:    v_ldexp_f16_e32 v0, v0, v1
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: fmul_select_f16_test5:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-NEXT:    v_cndmask_b32_e64 v1, 3, 1, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_ldexp_f16_e32 v0, v0, v1
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %bool = icmp eq i32 %bool.arg1, %bool.arg2
   %y = select i1 %bool, half 2.000000e+00, half 8.000000e+00
   %ldexp = fmul half %x, %y
@@ -1450,46 +2055,88 @@ define half @fmul_select_f16_test5(half %x, i32 %bool.arg1, i32 %bool.arg2) {
 }
 
 define half @fmul_select_f16_test6(half %x, i32 %bool.arg1, i32 %bool.arg2) {
-; GFX7-LABEL: fmul_select_f16_test6:
-; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT:    v_mov_b32_e32 v3, 0x40400000
-; GFX7-NEXT:    v_mov_b32_e32 v4, 0xc1000000
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX7-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
-; GFX7-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: fmul_select_f16_test6:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v3, 0x4200
-; GFX9-NEXT:    v_mov_b32_e32 v4, 0xc800
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
-; GFX9-NEXT:    v_mul_f16_e32 v0, v0, v1
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX1030-LABEL: fmul_select_f16_test6:
-; GFX1030:       ; %bb.0:
-; GFX1030-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1030-NEXT:    v_mov_b32_e32 v3, 0xc800
-; GFX1030-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX1030-NEXT:    v_cndmask_b32_e32 v1, 0x4200, v3, vcc_lo
-; GFX1030-NEXT:    v_mul_f16_e32 v0, v0, v1
-; GFX1030-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX1100-LABEL: fmul_select_f16_test6:
-; GFX1100:       ; %bb.0:
-; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-NEXT:    v_mov_b32_e32 v3, 0xc800
-; GFX1100-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-NEXT:    v_cndmask_b32_e32 v1, 0x4200, v3, vcc_lo
-; GFX1100-NEXT:    v_mul_f16_e32 v0, v0, v1
-; GFX1100-NEXT:    s_setpc_b64 s[30:31]
+; GFX7-SDAG-LABEL: fmul_select_f16_test6:
+; GFX7-SDAG:       ; %bb.0:
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-SDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX7-SDAG-NEXT:    v_mov_b32_e32 v3, 0x40400000
+; GFX7-SDAG-NEXT:    v_mov_b32_e32 v4, 0xc1000000
+; GFX7-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX7-SDAG-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
+; GFX7-SDAG-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX7-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-GISEL-LABEL: fmul_select_f16_test6:
+; GFX7-GISEL:       ; %bb.0:
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-GISEL-NEXT:    v_mov_b32_e32 v3, 0xc800
+; GFX7-GISEL-NEXT:    v_mov_b32_e32 v4, 0x4200
+; GFX7-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX7-GISEL-NEXT:    v_cndmask_b32_e32 v1, v4, v3, vcc
+; GFX7-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-GISEL-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GFX7-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX7-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX7-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-SDAG-LABEL: fmul_select_f16_test6:
+; GFX9-SDAG:       ; %bb.0:
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v3, 0x4200
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v4, 0xc800
+; GFX9-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX9-SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
+; GFX9-SDAG-NEXT:    v_mul_f16_e32 v0, v0, v1
+; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: fmul_select_f16_test6:
+; GFX9-GISEL:       ; %bb.0:
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0xc800
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v4, 0x4200
+; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v1, v4, v3, vcc
+; GFX9-GISEL-NEXT:    v_mul_f16_e32 v0, v0, v1
+; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-SDAG-LABEL: fmul_select_f16_test6:
+; GFX10-SDAG:       ; %bb.0:
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SDAG-NEXT:    v_mov_b32_e32 v3, 0xc800
+; GFX10-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX10-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0x4200, v3, vcc_lo
+; GFX10-SDAG-NEXT:    v_mul_f16_e32 v0, v0, v1
+; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-GISEL-LABEL: fmul_select_f16_test6:
+; GFX10-GISEL:       ; %bb.0:
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-GISEL-NEXT:    v_mov_b32_e32 v3, 0x4200
+; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v1, v3, 0xc800, vcc_lo
+; GFX10-GISEL-NEXT:    v_mul_f16_e32 v0, v0, v1
+; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: fmul_select_f16_test6:
+; GFX11-SDAG:       ; %bb.0:
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT:    v_mov_b32_e32 v3, 0xc800
+; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0x4200, v3, vcc_lo
+; GFX11-SDAG-NEXT:    v_mul_f16_e32 v0, v0, v1
+; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: fmul_select_f16_test6:
+; GFX11-GISEL:       ; %bb.0:
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT:    v_mov_b32_e32 v3, 0x4200
+; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v1, v3, 0xc800, vcc_lo
+; GFX11-GISEL-NEXT:    v_mul_f16_e32 v0, v0, v1
+; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %bool = icmp eq i32 %bool.arg1, %bool.arg2
   %y = select i1 %bool, half -8.000000e+00, half 3.000000e+00
   %ldexp = fmul half %x, %y
@@ -1497,45 +2144,87 @@ define half @fmul_select_f16_test6(half %x, i32 %bool.arg1, i32 %bool.arg2) {
 }
 
 define half @fmul_select_f16_test7(half %x, i32 %bool.arg1, i32 %bool.arg2) {
-; GFX7-LABEL: fmul_select_f16_test7:
-; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT:    v_mov_b32_e32 v3, 0x41000000
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX7-NEXT:    v_cndmask_b32_e32 v1, -4.0, v3, vcc
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX7-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: fmul_select_f16_test7:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v3, 0xc400
-; GFX9-NEXT:    v_mov_b32_e32 v4, 0x4800
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
-; GFX9-NEXT:    v_mul_f16_e32 v0, v0, v1
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX1030-LABEL: fmul_select_f16_test7:
-; GFX1030:       ; %bb.0:
-; GFX1030-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1030-NEXT:    v_mov_b32_e32 v3, 0x4800
-; GFX1030-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX1030-NEXT:    v_cndmask_b32_e32 v1, 0xc400, v3, vcc_lo
-; GFX1030-NEXT:    v_mul_f16_e32 v0, v0, v1
-; GFX1030-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX1100-LABEL: fmul_select_f16_test7:
-; GFX1100:       ; %bb.0:
-; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-NEXT:    v_mov_b32_e32 v3, 0x4800
-; GFX1100-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-NEXT:    v_cndmask_b32_e32 v1, 0xc400, v3, vcc_lo
-; GFX1100-NEXT:    v_mul_f16_e32 v0, v0, v1
-; GFX1100-NEXT:    s_setpc_b64 s[30:31]
+; GFX7-SDAG-LABEL: fmul_select_f16_test7:
+; GFX7-SDAG:       ; %bb.0:
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-SDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX7-SDAG-NEXT:    v_mov_b32_e32 v3, 0x41000000
+; GFX7-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX7-SDAG-NEXT:    v_cndmask_b32_e32 v1, -4.0, v3, vcc
+; GFX7-SDAG-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-SDAG-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX7-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-GISEL-LABEL: fmul_select_f16_test7:
+; GFX7-GISEL:       ; %bb.0:
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-GISEL-NEXT:    v_mov_b32_e32 v3, 0x4800
+; GFX7-GISEL-NEXT:    v_mov_b32_e32 v4, 0xc400
+; GFX7-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX7-GISEL-NEXT:    v_cndmask_b32_e32 v1, v4, v3, vcc
+; GFX7-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-GISEL-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GFX7-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX7-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX7-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-SDAG-LABEL: fmul_select_f16_test7:
+; GFX9-SDAG:       ; %bb.0:
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v3, 0xc400
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v4, 0x4800
+; GFX9-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX9-SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
+; GFX9-SDAG-NEXT:    v_mul_f16_e32 v0, v0, v1
+; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: fmul_select_f16_test7:
+; GFX9-GISEL:       ; %bb.0:
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0x4800
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v4, 0xc400
+; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v1, v4, v3, vcc
+; GFX9-GISEL-NEXT:    v_mul_f16_e32 v0, v0, v1
+; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-SDAG-LABEL: fmul_select_f16_test7:
+; GFX10-SDAG:       ; %bb.0:
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SDAG-NEXT:    v_mov_b32_e32 v3, 0x4800
+; GFX10-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX10-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0xc400, v3, vcc_lo
+; GFX10-SDAG-NEXT:    v_mul_f16_e32 v0, v0, v1
+; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-GISEL-LABEL: fmul_select_f16_test7:
+; GFX10-GISEL:       ; %bb.0:
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-GISEL-NEXT:    v_mov_b32_e32 v3, 0xc400
+; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v1, v3, 0x4800, vcc_lo
+; GFX10-GISEL-NEXT:    v_mul_f16_e32 v0, v0, v1
+; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: fmul_select_f16_test7:
+; GFX11-SDAG:       ; %bb.0:
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT:    v_mov_b32_e32 v3, 0x4800
+; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0xc400, v3, vcc_lo
+; GFX11-SDAG-NEXT:    v_mul_f16_e32 v0, v0, v1
+; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: fmul_select_f16_test7:
+; GFX11-GISEL:       ; %bb.0:
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT:    v_mov_b32_e32 v3, 0xc400
+; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v1, v3, 0x4800, vcc_lo
+; GFX11-GISEL-NEXT:    v_mul_f16_e32 v0, v0, v1
+; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %bool = icmp eq i32 %bool.arg1, %bool.arg2
   %y = select i1 %bool, half 8.000000e+00, half -4.000000e+00
   %ldexp = fmul half %x, %y
@@ -1543,16 +2232,28 @@ define half @fmul_select_f16_test7(half %x, i32 %bool.arg1, i32 %bool.arg2) {
 }
 
 define half @fmul_select_f16_test8(half %x, i32 %bool.arg1, i32 %bool.arg2) {
-; GFX7-LABEL: fmul_select_f16_test8:
-; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT:    v_bfrev_b32_e32 v3, 1
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX7-NEXT:    v_cndmask_b32_e32 v1, 0, v3, vcc
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX7-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX7-NEXT:    s_setpc_b64 s[30:31]
+; GFX7-SDAG-LABEL: fmul_select_f16_test8:
+; GFX7-SDAG:       ; %bb.0:
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-SDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX7-SDAG-NEXT:    v_bfrev_b32_e32 v3, 1
+; GFX7-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX7-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0, v3, vcc
+; GFX7-SDAG-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-SDAG-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX7-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-GISEL-LABEL: fmul_select_f16_test8:
+; GFX7-GISEL:       ; %bb.0:
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-GISEL-NEXT:    v_mov_b32_e32 v3, 0x8000
+; GFX7-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX7-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v3, vcc
+; GFX7-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-GISEL-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GFX7-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX7-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX7-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: fmul_select_f16_test8:
 ; GFX9:       ; %bb.0:
@@ -1563,22 +2264,22 @@ define half @fmul_select_f16_test8(half %x, i32 %bool.arg1, i32 %bool.arg2) {
 ; GFX9-NEXT:    v_mul_f16_e32 v0, v0, v1
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX1030-LABEL: fmul_select_f16_test8:
-; GFX1030:       ; %bb.0:
-; GFX1030-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1030-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX1030-NEXT:    v_cndmask_b32_e64 v1, 0, 0x8000, vcc_lo
-; GFX1030-NEXT:    v_mul_f16_e32 v0, v0, v1
-; GFX1030-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX1100-LABEL: fmul_select_f16_test8:
-; GFX1100:       ; %bb.0:
-; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX1100-NEXT:    v_cndmask_b32_e64 v1, 0, 0x8000, vcc_lo
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1100-NEXT:    v_mul_f16_e32 v0, v0, v1
-; GFX1100-NEXT:    s_setpc_b64 s[30:31]
+; GFX10-LABEL: fmul_select_f16_test8:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0, 0x8000, vcc_lo
+; GFX10-NEXT:    v_mul_f16_e32 v0, v0, v1
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: fmul_select_f16_test8:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, 0x8000, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_mul_f16_e32 v0, v0, v1
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %bool = icmp eq i32 %bool.arg1, %bool.arg2
   %y = select i1 %bool, half -0.000000e+00, half 0.000000e+00
   %ldexp = fmul half %x, %y
@@ -1586,40 +2287,87 @@ define half @fmul_select_f16_test8(half %x, i32 %bool.arg1, i32 %bool.arg2) {
 }
 
 define half @fmul_select_f16_test9(half %x, i32 %bool.arg1, i32 %bool.arg2) {
-; GFX7-LABEL: fmul_select_f16_test9:
-; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_cvt_f16_f32_e64 v0, -v0
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX7-NEXT:    v_cndmask_b32_e64 v1, 5, 4, vcc
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX7-NEXT:    v_ldexp_f32_e32 v0, v0, v1
-; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: fmul_select_f16_test9:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX9-NEXT:    v_cndmask_b32_e64 v1, 5, 4, vcc
-; GFX9-NEXT:    v_ldexp_f16_e64 v0, -v0, v1
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX1030-LABEL: fmul_select_f16_test9:
-; GFX1030:       ; %bb.0:
-; GFX1030-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1030-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX1030-NEXT:    v_cndmask_b32_e64 v1, 5, 4, vcc_lo
-; GFX1030-NEXT:    v_ldexp_f16_e64 v0, -v0, v1
-; GFX1030-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX1100-LABEL: fmul_select_f16_test9:
-; GFX1100:       ; %bb.0:
-; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX1100-NEXT:    v_cndmask_b32_e64 v1, 5, 4, vcc_lo
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1100-NEXT:    v_ldexp_f16_e64 v0, -v0, v1
-; GFX1100-NEXT:    s_setpc_b64 s[30:31]
+; GFX7-SDAG-LABEL: fmul_select_f16_test9:
+; GFX7-SDAG:       ; %bb.0:
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-SDAG-NEXT:    v_cvt_f16_f32_e64 v0, -v0
+; GFX7-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX7-SDAG-NEXT:    v_cndmask_b32_e64 v1, 5, 4, vcc
+; GFX7-SDAG-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-SDAG-NEXT:    v_ldexp_f32_e32 v0, v0, v1
+; GFX7-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-GISEL-LABEL: fmul_select_f16_test9:
+; GFX7-GISEL:       ; %bb.0:
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-GISEL-NEXT:    v_cvt_f32_f16_e64 v0, -v0
+; GFX7-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX7-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc
+; GFX7-GISEL-NEXT:    v_add_i32_e32 v1, vcc, 5, v1
+; GFX7-GISEL-NEXT:    v_ldexp_f32_e32 v0, v0, v1
+; GFX7-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX7-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-SDAG-LABEL: fmul_select_f16_test9:
+; GFX9-SDAG:       ; %bb.0:
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX9-SDAG-NEXT:    v_cndmask_b32_e64 v1, 5, 4, vcc
+; GFX9-SDAG-NEXT:    v_ldexp_f16_e64 v0, -v0, v1
+; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: fmul_select_f16_test9:
+; GFX9-GISEL:       ; %bb.0:
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc
+; GFX9-GISEL-NEXT:    v_add_u32_e32 v1, 5, v1
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0xffff8000
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0x7fff
+; GFX9-GISEL-NEXT:    v_med3_i32 v1, v1, v2, v3
+; GFX9-GISEL-NEXT:    v_ldexp_f16_e64 v0, -v0, v1
+; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-SDAG-LABEL: fmul_select_f16_test9:
+; GFX10-SDAG:       ; %bb.0:
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX10-SDAG-NEXT:    v_cndmask_b32_e64 v1, 5, 4, vcc_lo
+; GFX10-SDAG-NEXT:    v_ldexp_f16_e64 v0, -v0, v1
+; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-GISEL-LABEL: fmul_select_f16_test9:
+; GFX10-GISEL:       ; %bb.0:
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX10-GISEL-NEXT:    v_mov_b32_e32 v2, 0x7fff
+; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc_lo
+; GFX10-GISEL-NEXT:    v_add_nc_u32_e32 v1, 5, v1
+; GFX10-GISEL-NEXT:    v_med3_i32 v1, 0xffff8000, v1, v2
+; GFX10-GISEL-NEXT:    v_ldexp_f16_e64 v0, -v0, v1
+; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: fmul_select_f16_test9:
+; GFX11-SDAG:       ; %bb.0:
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-SDAG-NEXT:    v_cndmask_b32_e64 v1, 5, 4, vcc_lo
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_ldexp_f16_e64 v0, -v0, v1
+; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: fmul_select_f16_test9:
+; GFX11-GISEL:       ; %bb.0:
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-GISEL-NEXT:    v_mov_b32_e32 v2, 0x7fff
+; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc_lo
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_add_nc_u32_e32 v1, 5, v1
+; GFX11-GISEL-NEXT:    v_med3_i32 v1, 0xffff8000, v1, v2
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_ldexp_f16_e64 v0, -v0, v1
+; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %bool = icmp eq i32 %bool.arg1, %bool.arg2
   %y = select i1 %bool, half -1.600000e+01, half -3.200000e+01
   %ldexp = fmul half %x, %y
@@ -1627,47 +2375,82 @@ define half @fmul_select_f16_test9(half %x, i32 %bool.arg1, i32 %bool.arg2) {
 }
 
 define half @fmul_select_f16_test10_sel_log2val_neg11_pos11(half %x, i32 %bool.arg1, i32 %bool.arg2) {
-; GFX7-LABEL: fmul_select_f16_test10_sel_log2val_neg11_pos11:
-; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX7-NEXT:    v_cndmask_b32_e64 v1, 11, -11, vcc
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX7-NEXT:    v_ldexp_f32_e32 v0, v0, v1
-; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: fmul_select_f16_test10_sel_log2val_neg11_pos11:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX9-NEXT:    v_cndmask_b32_e64 v1, 11, -11, vcc
-; GFX9-NEXT:    s_movk_i32 s4, 0x8000
-; GFX9-NEXT:    v_mov_b32_e32 v2, 0x7fff
-; GFX9-NEXT:    v_med3_i32 v1, v1, s4, v2
-; GFX9-NEXT:    v_ldexp_f16_e32 v0, v0, v1
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX1030-LABEL: fmul_select_f16_test10_sel_log2val_neg11_pos11:
-; GFX1030:       ; %bb.0:
-; GFX1030-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1030-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX1030-NEXT:    s_movk_i32 s4, 0x8000
-; GFX1030-NEXT:    v_cndmask_b32_e64 v1, 11, -11, vcc_lo
-; GFX1030-NEXT:    v_med3_i32 v1, v1, s4, 0x7fff
-; GFX1030-NEXT:    v_ldexp_f16_e32 v0, v0, v1
-; GFX1030-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX1100-LABEL: fmul_select_f16_test10_sel_log2val_neg11_pos11:
-; GFX1100:       ; %bb.0:
-; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX1100-NEXT:    s_movk_i32 s0, 0x8000
-; GFX1100-NEXT:    v_cndmask_b32_e64 v1, 11, -11, vcc_lo
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-NEXT:    v_med3_i32 v1, v1, s0, 0x7fff
-; GFX1100-NEXT:    v_ldexp_f16_e32 v0, v0, v1
-; GFX1100-NEXT:    s_setpc_b64 s[30:31]
+; GFX7-SDAG-LABEL: fmul_select_f16_test10_sel_log2val_neg11_pos11:
+; GFX7-SDAG:       ; %bb.0:
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-SDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX7-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX7-SDAG-NEXT:    v_cndmask_b32_e64 v1, 11, -11, vcc
+; GFX7-SDAG-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-SDAG-NEXT:    v_ldexp_f32_e32 v0, v0, v1
+; GFX7-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-GISEL-LABEL: fmul_select_f16_test10_sel_log2val_neg11_pos11:
+; GFX7-GISEL:       ; %bb.0:
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX7-GISEL-NEXT:    v_cndmask_b32_e64 v1, 11, -11, vcc
+; GFX7-GISEL-NEXT:    v_ldexp_f32_e32 v0, v0, v1
+; GFX7-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX7-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-SDAG-LABEL: fmul_select_f16_test10_sel_log2val_neg11_pos11:
+; GFX9-SDAG:       ; %bb.0:
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX9-SDAG-NEXT:    v_cndmask_b32_e64 v1, 11, -11, vcc
+; GFX9-SDAG-NEXT:    s_movk_i32 s4, 0x8000
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v2, 0x7fff
+; GFX9-SDAG-NEXT:    v_med3_i32 v1, v1, s4, v2
+; GFX9-SDAG-NEXT:    v_ldexp_f16_e32 v0, v0, v1
+; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: fmul_select_f16_test10_sel_log2val_neg11_pos11:
+; GFX9-GISEL:       ; %bb.0:
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v1, 11, -11, vcc
+; GFX9-GISEL-NEXT:    v_ldexp_f16_e32 v0, v0, v1
+; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-SDAG-LABEL: fmul_select_f16_test10_sel_log2val_neg11_pos11:
+; GFX10-SDAG:       ; %bb.0:
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX10-SDAG-NEXT:    s_movk_i32 s4, 0x8000
+; GFX10-SDAG-NEXT:    v_cndmask_b32_e64 v1, 11, -11, vcc_lo
+; GFX10-SDAG-NEXT:    v_med3_i32 v1, v1, s4, 0x7fff
+; GFX10-SDAG-NEXT:    v_ldexp_f16_e32 v0, v0, v1
+; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-GISEL-LABEL: fmul_select_f16_test10_sel_log2val_neg11_pos11:
+; GFX10-GISEL:       ; %bb.0:
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v1, 11, -11, vcc_lo
+; GFX10-GISEL-NEXT:    v_ldexp_f16_e32 v0, v0, v1
+; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: fmul_select_f16_test10_sel_log2val_neg11_pos11:
+; GFX11-SDAG:       ; %bb.0:
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-SDAG-NEXT:    s_movk_i32 s0, 0x8000
+; GFX11-SDAG-NEXT:    v_cndmask_b32_e64 v1, 11, -11, vcc_lo
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_med3_i32 v1, v1, s0, 0x7fff
+; GFX11-SDAG-NEXT:    v_ldexp_f16_e32 v0, v0, v1
+; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: fmul_select_f16_test10_sel_log2val_neg11_pos11:
+; GFX11-GISEL:       ; %bb.0:
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v1, 11, -11, vcc_lo
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_ldexp_f16_e32 v0, v0, v1
+; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %bool = icmp eq i32 %bool.arg1, %bool.arg2
   %y = select i1 %bool, half 0xH1000, half 0xH6800
   %ldexp = fmul half %x, %y
@@ -1675,47 +2458,82 @@ define half @fmul_select_f16_test10_sel_log2val_neg11_pos11(half %x, i32 %bool.a
 }
 
 define half @fmul_select_f16_test11_sel_log2val_pos7_neg14(half %x, i32 %bool.arg1, i32 %bool.arg2) {
-; GFX7-LABEL: fmul_select_f16_test11_sel_log2val_pos7_neg14:
-; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX7-NEXT:    v_cndmask_b32_e64 v1, -14, 7, vcc
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX7-NEXT:    v_ldexp_f32_e32 v0, v0, v1
-; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: fmul_select_f16_test11_sel_log2val_pos7_neg14:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX9-NEXT:    v_cndmask_b32_e64 v1, -14, 7, vcc
-; GFX9-NEXT:    s_movk_i32 s4, 0x8000
-; GFX9-NEXT:    v_mov_b32_e32 v2, 0x7fff
-; GFX9-NEXT:    v_med3_i32 v1, v1, s4, v2
-; GFX9-NEXT:    v_ldexp_f16_e32 v0, v0, v1
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX1030-LABEL: fmul_select_f16_test11_sel_log2val_pos7_neg14:
-; GFX1030:       ; %bb.0:
-; GFX1030-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1030-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX1030-NEXT:    s_movk_i32 s4, 0x8000
-; GFX1030-NEXT:    v_cndmask_b32_e64 v1, -14, 7, vcc_lo
-; GFX1030-NEXT:    v_med3_i32 v1, v1, s4, 0x7fff
-; GFX1030-NEXT:    v_ldexp_f16_e32 v0, v0, v1
-; GFX1030-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX1100-LABEL: fmul_select_f16_test11_sel_log2val_pos7_neg14:
-; GFX1100:       ; %bb.0:
-; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX1100-NEXT:    s_movk_i32 s0, 0x8000
-; GFX1100-NEXT:    v_cndmask_b32_e64 v1, -14, 7, vcc_lo
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-NEXT:    v_med3_i32 v1, v1, s0, 0x7fff
-; GFX1100-NEXT:    v_ldexp_f16_e32 v0, v0, v1
-; GFX1100-NEXT:    s_setpc_b64 s[30:31]
+; GFX7-SDAG-LABEL: fmul_select_f16_test11_sel_log2val_pos7_neg14:
+; GFX7-SDAG:       ; %bb.0:
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-SDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX7-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX7-SDAG-NEXT:    v_cndmask_b32_e64 v1, -14, 7, vcc
+; GFX7-SDAG-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-SDAG-NEXT:    v_ldexp_f32_e32 v0, v0, v1
+; GFX7-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-GISEL-LABEL: fmul_select_f16_test11_sel_log2val_pos7_neg14:
+; GFX7-GISEL:       ; %bb.0:
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX7-GISEL-NEXT:    v_cndmask_b32_e64 v1, -14, 7, vcc
+; GFX7-GISEL-NEXT:    v_ldexp_f32_e32 v0, v0, v1
+; GFX7-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX7-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-SDAG-LABEL: fmul_select_f16_test11_sel_log2val_pos7_neg14:
+; GFX9-SDAG:       ; %bb.0:
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX9-SDAG-NEXT:    v_cndmask_b32_e64 v1, -14, 7, vcc
+; GFX9-SDAG-NEXT:    s_movk_i32 s4, 0x8000
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v2, 0x7fff
+; GFX9-SDAG-NEXT:    v_med3_i32 v1, v1, s4, v2
+; GFX9-SDAG-NEXT:    v_ldexp_f16_e32 v0, v0, v1
+; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: fmul_select_f16_test11_sel_log2val_pos7_neg14:
+; GFX9-GISEL:       ; %bb.0:
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v1, -14, 7, vcc
+; GFX9-GISEL-NEXT:    v_ldexp_f16_e32 v0, v0, v1
+; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-SDAG-LABEL: fmul_select_f16_test11_sel_log2val_pos7_neg14:
+; GFX10-SDAG:       ; %bb.0:
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX10-SDAG-NEXT:    s_movk_i32 s4, 0x8000
+; GFX10-SDAG-NEXT:    v_cndmask_b32_e64 v1, -14, 7, vcc_lo
+; GFX10-SDAG-NEXT:    v_med3_i32 v1, v1, s4, 0x7fff
+; GFX10-SDAG-NEXT:    v_ldexp_f16_e32 v0, v0, v1
+; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-GISEL-LABEL: fmul_select_f16_test11_sel_log2val_pos7_neg14:
+; GFX10-GISEL:       ; %bb.0:
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v1, -14, 7, vcc_lo
+; GFX10-GISEL-NEXT:    v_ldexp_f16_e32 v0, v0, v1
+; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: fmul_select_f16_test11_sel_log2val_pos7_neg14:
+; GFX11-SDAG:       ; %bb.0:
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-SDAG-NEXT:    s_movk_i32 s0, 0x8000
+; GFX11-SDAG-NEXT:    v_cndmask_b32_e64 v1, -14, 7, vcc_lo
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_med3_i32 v1, v1, s0, 0x7fff
+; GFX11-SDAG-NEXT:    v_ldexp_f16_e32 v0, v0, v1
+; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: fmul_select_f16_test11_sel_log2val_pos7_neg14:
+; GFX11-GISEL:       ; %bb.0:
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v1, -14, 7, vcc_lo
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_ldexp_f16_e32 v0, v0, v1
+; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %bool = icmp eq i32 %bool.arg1, %bool.arg2
   %y = select i1 %bool, half 0xH5800, half 0xH0400
   %ldexp = fmul half %x, %y
@@ -1723,72 +2541,114 @@ define half @fmul_select_f16_test11_sel_log2val_pos7_neg14(half %x, i32 %bool.ar
 }
 
 define bfloat @fmul_select_bf16_test1(bfloat %x, i32 %bool.arg1, i32 %bool.arg2) {
-; GFX7-LABEL: fmul_select_bf16_test1:
-; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX7-NEXT:    v_cndmask_b32_e64 v1, 1.0, 2.0, vcc
-; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: fmul_select_bf16_test1:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v3, 0x3f80
-; GFX9-NEXT:    v_mov_b32_e32 v4, 0x4000
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
-; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX9-NEXT:    v_bfe_u32 v1, v0, 16, 1
-; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
-; GFX9-NEXT:    v_add3_u32 v1, v1, v0, s4
-; GFX9-NEXT:    v_or_b32_e32 v2, 0x400000, v0
-; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
-; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX1030-LABEL: fmul_select_bf16_test1:
-; GFX1030:       ; %bb.0:
-; GFX1030-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1030-NEXT:    v_mov_b32_e32 v3, 0x4000
-; GFX1030-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX1030-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX1030-NEXT:    v_cndmask_b32_e32 v1, 0x3f80, v3, vcc_lo
-; GFX1030-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX1030-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX1030-NEXT:    v_bfe_u32 v1, v0, 16, 1
-; GFX1030-NEXT:    v_or_b32_e32 v2, 0x400000, v0
-; GFX1030-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX1030-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
-; GFX1030-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
-; GFX1030-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX1030-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX1100-LABEL: fmul_select_bf16_test1:
-; GFX1100:       ; %bb.0:
-; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-NEXT:    v_dual_mov_b32 v3, 0x4000 :: v_dual_lshlrev_b32 v0, 16, v0
-; GFX1100-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-NEXT:    v_cndmask_b32_e32 v1, 0x3f80, v3, vcc_lo
-; GFX1100-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX1100-NEXT:    v_bfe_u32 v1, v0, 16, 1
-; GFX1100-NEXT:    v_or_b32_e32 v2, 0x400000, v0
-; GFX1100-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
-; GFX1100-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1100-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX1100-NEXT:    s_setpc_b64 s[30:31]
+; GFX7-SDAG-LABEL: fmul_select_bf16_test1:
+; GFX7-SDAG:       ; %bb.0:
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-SDAG-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; GFX7-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX7-SDAG-NEXT:    v_cndmask_b32_e64 v1, 1.0, 2.0, vcc
+; GFX7-SDAG-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-SDAG-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX7-SDAG-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-GISEL-LABEL: fmul_select_bf16_test1:
+; GFX7-GISEL:       ; %bb.0:
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX7-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX7-GISEL-NEXT:    v_ldexp_f32_e32 v0, v0, v1
+; GFX7-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX7-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-SDAG-LABEL: fmul_select_bf16_test1:
+; GFX9-SDAG:       ; %bb.0:
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v3, 0x3f80
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v4, 0x4000
+; GFX9-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX9-SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
+; GFX9-SDAG-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX9-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX9-SDAG-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX9-SDAG-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX9-SDAG-NEXT:    s_movk_i32 s4, 0x7fff
+; GFX9-SDAG-NEXT:    v_add3_u32 v1, v1, v0, s4
+; GFX9-SDAG-NEXT:    v_or_b32_e32 v2, 0x400000, v0
+; GFX9-SDAG-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
+; GFX9-SDAG-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
+; GFX9-SDAG-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: fmul_select_bf16_test1:
+; GFX9-GISEL:       ; %bb.0:
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0xffff8000
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0x7fff
+; GFX9-GISEL-NEXT:    v_med3_i32 v1, v1, v2, v3
+; GFX9-GISEL-NEXT:    v_ldexp_f16_e32 v0, v0, v1
+; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-SDAG-LABEL: fmul_select_bf16_test1:
+; GFX10-SDAG:       ; %bb.0:
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SDAG-NEXT:    v_mov_b32_e32 v3, 0x4000
+; GFX10-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX10-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX10-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0x3f80, v3, vcc_lo
+; GFX10-SDAG-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX10-SDAG-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX10-SDAG-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX10-SDAG-NEXT:    v_or_b32_e32 v2, 0x400000, v0
+; GFX10-SDAG-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX10-SDAG-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
+; GFX10-SDAG-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX10-SDAG-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-GISEL-LABEL: fmul_select_bf16_test1:
+; GFX10-GISEL:       ; %bb.0:
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX10-GISEL-NEXT:    v_mov_b32_e32 v2, 0x7fff
+; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
+; GFX10-GISEL-NEXT:    v_med3_i32 v1, 0xffff8000, v1, v2
+; GFX10-GISEL-NEXT:    v_ldexp_f16_e32 v0, v0, v1
+; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: fmul_select_bf16_test1:
+; GFX11-SDAG:       ; %bb.0:
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT:    v_dual_mov_b32 v3, 0x4000 :: v_dual_lshlrev_b32 v0, 16, v0
+; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0x3f80, v3, vcc_lo
+; GFX11-SDAG-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX11-SDAG-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX11-SDAG-NEXT:    v_or_b32_e32 v2, 0x400000, v0
+; GFX11-SDAG-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
+; GFX11-SDAG-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: fmul_select_bf16_test1:
+; GFX11-GISEL:       ; %bb.0:
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-GISEL-NEXT:    v_mov_b32_e32 v2, 0x7fff
+; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_med3_i32 v1, 0xffff8000, v1, v2
+; GFX11-GISEL-NEXT:    v_ldexp_f16_e32 v0, v0, v1
+; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %bool = icmp eq i32 %bool.arg1, %bool.arg2
   %y = select i1 %bool, bfloat 2.000000e+00, bfloat 1.000000e+00
   %ldexp = fmul bfloat %x, %y
@@ -1796,72 +2656,114 @@ define bfloat @fmul_select_bf16_test1(bfloat %x, i32 %bool.arg1, i32 %bool.arg2)
 }
 
 define bfloat @fmul_select_bf16_test2(bfloat %x, i32 %bool.arg1, i32 %bool.arg2) {
-; GFX7-LABEL: fmul_select_bf16_test2:
-; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX7-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0.5, vcc
-; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: fmul_select_bf16_test2:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v3, 0x3f80
-; GFX9-NEXT:    v_mov_b32_e32 v4, 0x3f00
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
-; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX9-NEXT:    v_bfe_u32 v1, v0, 16, 1
-; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
-; GFX9-NEXT:    v_add3_u32 v1, v1, v0, s4
-; GFX9-NEXT:    v_or_b32_e32 v2, 0x400000, v0
-; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
-; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX1030-LABEL: fmul_select_bf16_test2:
-; GFX1030:       ; %bb.0:
-; GFX1030-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1030-NEXT:    v_mov_b32_e32 v3, 0x3f00
-; GFX1030-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX1030-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX1030-NEXT:    v_cndmask_b32_e32 v1, 0x3f80, v3, vcc_lo
-; GFX1030-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX1030-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX1030-NEXT:    v_bfe_u32 v1, v0, 16, 1
-; GFX1030-NEXT:    v_or_b32_e32 v2, 0x400000, v0
-; GFX1030-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX1030-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
-; GFX1030-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
-; GFX1030-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX1030-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX1100-LABEL: fmul_select_bf16_test2:
-; GFX1100:       ; %bb.0:
-; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-NEXT:    v_dual_mov_b32 v3, 0x3f00 :: v_dual_lshlrev_b32 v0, 16, v0
-; GFX1100-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-NEXT:    v_cndmask_b32_e32 v1, 0x3f80, v3, vcc_lo
-; GFX1100-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX1100-NEXT:    v_bfe_u32 v1, v0, 16, 1
-; GFX1100-NEXT:    v_or_b32_e32 v2, 0x400000, v0
-; GFX1100-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
-; GFX1100-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1100-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX1100-NEXT:    s_setpc_b64 s[30:31]
+; GFX7-SDAG-LABEL: fmul_select_bf16_test2:
+; GFX7-SDAG:       ; %bb.0:
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-SDAG-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; GFX7-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX7-SDAG-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0.5, vcc
+; GFX7-SDAG-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-SDAG-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX7-SDAG-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-GISEL-LABEL: fmul_select_bf16_test2:
+; GFX7-GISEL:       ; %bb.0:
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX7-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc
+; GFX7-GISEL-NEXT:    v_ldexp_f32_e32 v0, v0, v1
+; GFX7-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX7-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-SDAG-LABEL: fmul_select_bf16_test2:
+; GFX9-SDAG:       ; %bb.0:
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v3, 0x3f80
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v4, 0x3f00
+; GFX9-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX9-SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
+; GFX9-SDAG-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX9-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX9-SDAG-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX9-SDAG-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX9-SDAG-NEXT:    s_movk_i32 s4, 0x7fff
+; GFX9-SDAG-NEXT:    v_add3_u32 v1, v1, v0, s4
+; GFX9-SDAG-NEXT:    v_or_b32_e32 v2, 0x400000, v0
+; GFX9-SDAG-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
+; GFX9-SDAG-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
+; GFX9-SDAG-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: fmul_select_bf16_test2:
+; GFX9-GISEL:       ; %bb.0:
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0xffff8000
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0x7fff
+; GFX9-GISEL-NEXT:    v_med3_i32 v1, v1, v2, v3
+; GFX9-GISEL-NEXT:    v_ldexp_f16_e32 v0, v0, v1
+; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-SDAG-LABEL: fmul_select_bf16_test2:
+; GFX10-SDAG:       ; %bb.0:
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SDAG-NEXT:    v_mov_b32_e32 v3, 0x3f00
+; GFX10-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX10-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX10-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0x3f80, v3, vcc_lo
+; GFX10-SDAG-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX10-SDAG-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX10-SDAG-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX10-SDAG-NEXT:    v_or_b32_e32 v2, 0x400000, v0
+; GFX10-SDAG-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX10-SDAG-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
+; GFX10-SDAG-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX10-SDAG-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-GISEL-LABEL: fmul_select_bf16_test2:
+; GFX10-GISEL:       ; %bb.0:
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX10-GISEL-NEXT:    v_mov_b32_e32 v2, 0x7fff
+; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc_lo
+; GFX10-GISEL-NEXT:    v_med3_i32 v1, 0xffff8000, v1, v2
+; GFX10-GISEL-NEXT:    v_ldexp_f16_e32 v0, v0, v1
+; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: fmul_select_bf16_test2:
+; GFX11-SDAG:       ; %bb.0:
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT:    v_dual_mov_b32 v3, 0x3f00 :: v_dual_lshlrev_b32 v0, 16, v0
+; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0x3f80, v3, vcc_lo
+; GFX11-SDAG-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX11-SDAG-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX11-SDAG-NEXT:    v_or_b32_e32 v2, 0x400000, v0
+; GFX11-SDAG-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
+; GFX11-SDAG-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: fmul_select_bf16_test2:
+; GFX11-GISEL:       ; %bb.0:
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-GISEL-NEXT:    v_mov_b32_e32 v2, 0x7fff
+; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc_lo
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_med3_i32 v1, 0xffff8000, v1, v2
+; GFX11-GISEL-NEXT:    v_ldexp_f16_e32 v0, v0, v1
+; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %bool = icmp eq i32 %bool.arg1, %bool.arg2
   %y = select i1 %bool, bfloat 5.000000e-01, bfloat 1.000000e+00
   %ldexp = fmul bfloat %x, %y
@@ -1869,111 +2771,158 @@ define bfloat @fmul_select_bf16_test2(bfloat %x, i32 %bool.arg1, i32 %bool.arg2)
 }
 
 define <2 x bfloat> @fmul_select_v2bf16_test3(<2 x bfloat> %x, <2 x i32> %bool.arg1, <2 x i32> %bool.arg2) {
-; GFX7-LABEL: fmul_select_v2bf16_test3:
-; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v4
-; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v1
-; GFX7-NEXT:    v_cndmask_b32_e64 v2, 1.0, 2.0, vcc
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
-; GFX7-NEXT:    v_cndmask_b32_e64 v3, 1.0, 2.0, vcc
-; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT:    v_mul_f32_e32 v1, v1, v3
-; GFX7-NEXT:    v_mul_f32_e32 v0, v0, v2
-; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: fmul_select_v2bf16_test3:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v5, 0x3f80
-; GFX9-NEXT:    v_mov_b32_e32 v6, 0x4000
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v4
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, v5, v6, vcc
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v3
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v6, vcc
-; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
-; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX9-NEXT:    v_mul_f32_e32 v1, v3, v1
-; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX9-NEXT:    v_bfe_u32 v3, v1, 16, 1
-; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
-; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v2
-; GFX9-NEXT:    v_add3_u32 v3, v3, v1, s4
-; GFX9-NEXT:    v_or_b32_e32 v4, 0x400000, v1
-; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
-; GFX9-NEXT:    v_bfe_u32 v2, v0, 16, 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
-; GFX9-NEXT:    v_add3_u32 v2, v2, v0, s4
-; GFX9-NEXT:    v_or_b32_e32 v3, 0x400000, v0
-; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc
-; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
-; GFX9-NEXT:    v_perm_b32 v0, v0, v1, s4
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX1030-LABEL: fmul_select_v2bf16_test3:
-; GFX1030:       ; %bb.0:
-; GFX1030-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1030-NEXT:    v_mov_b32_e32 v5, 0x4000
-; GFX1030-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v3
-; GFX1030-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
-; GFX1030-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX1030-NEXT:    v_cndmask_b32_e32 v1, 0x3f80, v5, vcc_lo
-; GFX1030-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v4
-; GFX1030-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX1030-NEXT:    v_cndmask_b32_e32 v2, 0x3f80, v5, vcc_lo
-; GFX1030-NEXT:    v_mul_f32_e32 v1, v3, v1
-; GFX1030-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX1030-NEXT:    v_or_b32_e32 v4, 0x400000, v1
-; GFX1030-NEXT:    v_mul_f32_e32 v0, v0, v2
-; GFX1030-NEXT:    v_bfe_u32 v2, v1, 16, 1
-; GFX1030-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX1030-NEXT:    v_bfe_u32 v3, v0, 16, 1
-; GFX1030-NEXT:    v_add3_u32 v2, v2, v1, 0x7fff
-; GFX1030-NEXT:    v_or_b32_e32 v5, 0x400000, v0
-; GFX1030-NEXT:    v_add3_u32 v3, v3, v0, 0x7fff
-; GFX1030-NEXT:    v_cndmask_b32_e32 v1, v2, v4, vcc_lo
-; GFX1030-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX1030-NEXT:    v_cndmask_b32_e32 v0, v3, v5, vcc_lo
-; GFX1030-NEXT:    v_perm_b32 v0, v0, v1, 0x7060302
-; GFX1030-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX1100-LABEL: fmul_select_v2bf16_test3:
-; GFX1100:       ; %bb.0:
-; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-NEXT:    v_mov_b32_e32 v5, 0x4000
-; GFX1100-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v3
-; GFX1100-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
-; GFX1100-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX1100-NEXT:    v_cndmask_b32_e32 v1, 0x3f80, v5, vcc_lo
-; GFX1100-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v4
-; GFX1100-NEXT:    v_cndmask_b32_e32 v2, 0x3f80, v5, vcc_lo
-; GFX1100-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-NEXT:    v_dual_mul_f32 v0, v0, v2 :: v_dual_lshlrev_b32 v1, 16, v1
-; GFX1100-NEXT:    v_or_b32_e32 v5, 0x400000, v0
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1100-NEXT:    v_mul_f32_e32 v1, v3, v1
-; GFX1100-NEXT:    v_bfe_u32 v3, v0, 16, 1
-; GFX1100-NEXT:    v_bfe_u32 v2, v1, 16, 1
-; GFX1100-NEXT:    v_or_b32_e32 v4, 0x400000, v1
-; GFX1100-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX1100-NEXT:    v_add3_u32 v3, v3, v0, 0x7fff
-; GFX1100-NEXT:    v_add3_u32 v2, v2, v1, 0x7fff
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX1100-NEXT:    v_cndmask_b32_e32 v1, v2, v4, vcc_lo
-; GFX1100-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX1100-NEXT:    v_cndmask_b32_e32 v0, v3, v5, vcc_lo
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1100-NEXT:    v_perm_b32 v0, v0, v1, 0x7060302
-; GFX1100-NEXT:    s_setpc_b64 s[30:31]
+; GFX7-SDAG-LABEL: fmul_select_v2bf16_test3:
+; GFX7-SDAG:       ; %bb.0:
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v4
+; GFX7-SDAG-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; GFX7-SDAG-NEXT:    v_mul_f32_e32 v1, 1.0, v1
+; GFX7-SDAG-NEXT:    v_cndmask_b32_e64 v2, 1.0, 2.0, vcc
+; GFX7-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX7-SDAG-NEXT:    v_cndmask_b32_e64 v3, 1.0, 2.0, vcc
+; GFX7-SDAG-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-SDAG-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-SDAG-NEXT:    v_mul_f32_e32 v1, v1, v3
+; GFX7-SDAG-NEXT:    v_mul_f32_e32 v0, v0, v2
+; GFX7-SDAG-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-SDAG-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-GISEL-LABEL: fmul_select_v2bf16_test3:
+; GFX7-GISEL:       ; %bb.0:
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-GISEL-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GFX7-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v4
+; GFX7-GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; GFX7-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX7-GISEL-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
+; GFX7-GISEL-NEXT:    v_ldexp_f32_e32 v0, v0, v2
+; GFX7-GISEL-NEXT:    v_ldexp_f32_e32 v1, v1, v3
+; GFX7-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX7-GISEL-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX7-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-SDAG-LABEL: fmul_select_v2bf16_test3:
+; GFX9-SDAG:       ; %bb.0:
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v5, 0x3f80
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v6, 0x4000
+; GFX9-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v4
+; GFX9-SDAG-NEXT:    v_cndmask_b32_e32 v2, v5, v6, vcc
+; GFX9-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v3
+; GFX9-SDAG-NEXT:    v_cndmask_b32_e32 v1, v5, v6, vcc
+; GFX9-SDAG-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
+; GFX9-SDAG-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX9-SDAG-NEXT:    v_mul_f32_e32 v1, v3, v1
+; GFX9-SDAG-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-SDAG-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX9-SDAG-NEXT:    v_bfe_u32 v3, v1, 16, 1
+; GFX9-SDAG-NEXT:    s_movk_i32 s4, 0x7fff
+; GFX9-SDAG-NEXT:    v_mul_f32_e32 v0, v0, v2
+; GFX9-SDAG-NEXT:    v_add3_u32 v3, v3, v1, s4
+; GFX9-SDAG-NEXT:    v_or_b32_e32 v4, 0x400000, v1
+; GFX9-SDAG-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
+; GFX9-SDAG-NEXT:    v_bfe_u32 v2, v0, 16, 1
+; GFX9-SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
+; GFX9-SDAG-NEXT:    v_add3_u32 v2, v2, v0, s4
+; GFX9-SDAG-NEXT:    v_or_b32_e32 v3, 0x400000, v0
+; GFX9-SDAG-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
+; GFX9-SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc
+; GFX9-SDAG-NEXT:    s_mov_b32 s4, 0x7060302
+; GFX9-SDAG-NEXT:    v_perm_b32 v0, v0, v1, s4
+; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: fmul_select_v2bf16_test3:
+; GFX9-GISEL:       ; %bb.0:
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v3
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0xffff8000
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0x7fff
+; GFX9-GISEL-NEXT:    v_med3_i32 v1, v1, v2, v3
+; GFX9-GISEL-NEXT:    v_ldexp_f16_e32 v0, v0, v1
+; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-SDAG-LABEL: fmul_select_v2bf16_test3:
+; GFX10-SDAG:       ; %bb.0:
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SDAG-NEXT:    v_mov_b32_e32 v5, 0x4000
+; GFX10-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v3
+; GFX10-SDAG-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
+; GFX10-SDAG-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0x3f80, v5, vcc_lo
+; GFX10-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v4
+; GFX10-SDAG-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX10-SDAG-NEXT:    v_cndmask_b32_e32 v2, 0x3f80, v5, vcc_lo
+; GFX10-SDAG-NEXT:    v_mul_f32_e32 v1, v3, v1
+; GFX10-SDAG-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX10-SDAG-NEXT:    v_or_b32_e32 v4, 0x400000, v1
+; GFX10-SDAG-NEXT:    v_mul_f32_e32 v0, v0, v2
+; GFX10-SDAG-NEXT:    v_bfe_u32 v2, v1, 16, 1
+; GFX10-SDAG-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX10-SDAG-NEXT:    v_bfe_u32 v3, v0, 16, 1
+; GFX10-SDAG-NEXT:    v_add3_u32 v2, v2, v1, 0x7fff
+; GFX10-SDAG-NEXT:    v_or_b32_e32 v5, 0x400000, v0
+; GFX10-SDAG-NEXT:    v_add3_u32 v3, v3, v0, 0x7fff
+; GFX10-SDAG-NEXT:    v_cndmask_b32_e32 v1, v2, v4, vcc_lo
+; GFX10-SDAG-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX10-SDAG-NEXT:    v_cndmask_b32_e32 v0, v3, v5, vcc_lo
+; GFX10-SDAG-NEXT:    v_perm_b32 v0, v0, v1, 0x7060302
+; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-GISEL-LABEL: fmul_select_v2bf16_test3:
+; GFX10-GISEL:       ; %bb.0:
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v3
+; GFX10-GISEL-NEXT:    v_mov_b32_e32 v2, 0x7fff
+; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
+; GFX10-GISEL-NEXT:    v_med3_i32 v1, 0xffff8000, v1, v2
+; GFX10-GISEL-NEXT:    v_ldexp_f16_e32 v0, v0, v1
+; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: fmul_select_v2bf16_test3:
+; GFX11-SDAG:       ; %bb.0:
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT:    v_mov_b32_e32 v5, 0x4000
+; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v3
+; GFX11-SDAG-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
+; GFX11-SDAG-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0x3f80, v5, vcc_lo
+; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v4
+; GFX11-SDAG-NEXT:    v_cndmask_b32_e32 v2, 0x3f80, v5, vcc_lo
+; GFX11-SDAG-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_dual_mul_f32 v0, v0, v2 :: v_dual_lshlrev_b32 v1, 16, v1
+; GFX11-SDAG-NEXT:    v_or_b32_e32 v5, 0x400000, v0
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-SDAG-NEXT:    v_mul_f32_e32 v1, v3, v1
+; GFX11-SDAG-NEXT:    v_bfe_u32 v3, v0, 16, 1
+; GFX11-SDAG-NEXT:    v_bfe_u32 v2, v1, 16, 1
+; GFX11-SDAG-NEXT:    v_or_b32_e32 v4, 0x400000, v1
+; GFX11-SDAG-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-SDAG-NEXT:    v_add3_u32 v3, v3, v0, 0x7fff
+; GFX11-SDAG-NEXT:    v_add3_u32 v2, v2, v1, 0x7fff
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-SDAG-NEXT:    v_cndmask_b32_e32 v1, v2, v4, vcc_lo
+; GFX11-SDAG-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-SDAG-NEXT:    v_cndmask_b32_e32 v0, v3, v5, vcc_lo
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_perm_b32 v0, v0, v1, 0x7060302
+; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: fmul_select_v2bf16_test3:
+; GFX11-GISEL:       ; %bb.0:
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v3
+; GFX11-GISEL-NEXT:    v_mov_b32_e32 v2, 0x7fff
+; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_med3_i32 v1, 0xffff8000, v1, v2
+; GFX11-GISEL-NEXT:    v_ldexp_f16_e32 v0, v0, v1
+; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %bool = icmp eq <2 x i32> %bool.arg1, %bool.arg2
   %y = select <2 x i1> %bool, <2 x bfloat> <bfloat 2.000000e+00, bfloat 2.000000e+00>, <2 x bfloat> <bfloat 1.000000e+00, bfloat 1.000000e+00>
   %ldexp = fmul <2 x bfloat> %x, %y
@@ -1981,111 +2930,158 @@ define <2 x bfloat> @fmul_select_v2bf16_test3(<2 x bfloat> %x, <2 x i32> %bool.a
 }
 
 define <2 x bfloat> @fmul_select_v2bf16_test4(<2 x bfloat> %x, <2 x i32> %bool.arg1, <2 x i32> %bool.arg2) {
-; GFX7-LABEL: fmul_select_v2bf16_test4:
-; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v4
-; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v1
-; GFX7-NEXT:    v_cndmask_b32_e64 v2, 1.0, 0.5, vcc
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
-; GFX7-NEXT:    v_cndmask_b32_e64 v3, 1.0, 0.5, vcc
-; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT:    v_mul_f32_e32 v1, v1, v3
-; GFX7-NEXT:    v_mul_f32_e32 v0, v0, v2
-; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: fmul_select_v2bf16_test4:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v5, 0x3f80
-; GFX9-NEXT:    v_mov_b32_e32 v6, 0x3f00
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v4
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, v5, v6, vcc
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v3
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v6, vcc
-; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
-; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX9-NEXT:    v_mul_f32_e32 v1, v3, v1
-; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX9-NEXT:    v_bfe_u32 v3, v1, 16, 1
-; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
-; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v2
-; GFX9-NEXT:    v_add3_u32 v3, v3, v1, s4
-; GFX9-NEXT:    v_or_b32_e32 v4, 0x400000, v1
-; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
-; GFX9-NEXT:    v_bfe_u32 v2, v0, 16, 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
-; GFX9-NEXT:    v_add3_u32 v2, v2, v0, s4
-; GFX9-NEXT:    v_or_b32_e32 v3, 0x400000, v0
-; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc
-; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
-; GFX9-NEXT:    v_perm_b32 v0, v0, v1, s4
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX1030-LABEL: fmul_select_v2bf16_test4:
-; GFX1030:       ; %bb.0:
-; GFX1030-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1030-NEXT:    v_mov_b32_e32 v5, 0x3f00
-; GFX1030-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v3
-; GFX1030-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
-; GFX1030-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX1030-NEXT:    v_cndmask_b32_e32 v1, 0x3f80, v5, vcc_lo
-; GFX1030-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v4
-; GFX1030-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX1030-NEXT:    v_cndmask_b32_e32 v2, 0x3f80, v5, vcc_lo
-; GFX1030-NEXT:    v_mul_f32_e32 v1, v3, v1
-; GFX1030-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX1030-NEXT:    v_or_b32_e32 v4, 0x400000, v1
-; GFX1030-NEXT:    v_mul_f32_e32 v0, v0, v2
-; GFX1030-NEXT:    v_bfe_u32 v2, v1, 16, 1
-; GFX1030-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX1030-NEXT:    v_bfe_u32 v3, v0, 16, 1
-; GFX1030-NEXT:    v_add3_u32 v2, v2, v1, 0x7fff
-; GFX1030-NEXT:    v_or_b32_e32 v5, 0x400000, v0
-; GFX1030-NEXT:    v_add3_u32 v3, v3, v0, 0x7fff
-; GFX1030-NEXT:    v_cndmask_b32_e32 v1, v2, v4, vcc_lo
-; GFX1030-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX1030-NEXT:    v_cndmask_b32_e32 v0, v3, v5, vcc_lo
-; GFX1030-NEXT:    v_perm_b32 v0, v0, v1, 0x7060302
-; GFX1030-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX1100-LABEL: fmul_select_v2bf16_test4:
-; GFX1100:       ; %bb.0:
-; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-NEXT:    v_mov_b32_e32 v5, 0x3f00
-; GFX1100-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v3
-; GFX1100-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
-; GFX1100-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX1100-NEXT:    v_cndmask_b32_e32 v1, 0x3f80, v5, vcc_lo
-; GFX1100-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v4
-; GFX1100-NEXT:    v_cndmask_b32_e32 v2, 0x3f80, v5, vcc_lo
-; GFX1100-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-NEXT:    v_dual_mul_f32 v0, v0, v2 :: v_dual_lshlrev_b32 v1, 16, v1
-; GFX1100-NEXT:    v_or_b32_e32 v5, 0x400000, v0
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1100-NEXT:    v_mul_f32_e32 v1, v3, v1
-; GFX1100-NEXT:    v_bfe_u32 v3, v0, 16, 1
-; GFX1100-NEXT:    v_bfe_u32 v2, v1, 16, 1
-; GFX1100-NEXT:    v_or_b32_e32 v4, 0x400000, v1
-; GFX1100-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX1100-NEXT:    v_add3_u32 v3, v3, v0, 0x7fff
-; GFX1100-NEXT:    v_add3_u32 v2, v2, v1, 0x7fff
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX1100-NEXT:    v_cndmask_b32_e32 v1, v2, v4, vcc_lo
-; GFX1100-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX1100-NEXT:    v_cndmask_b32_e32 v0, v3, v5, vcc_lo
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1100-NEXT:    v_perm_b32 v0, v0, v1, 0x7060302
-; GFX1100-NEXT:    s_setpc_b64 s[30:31]
+; GFX7-SDAG-LABEL: fmul_select_v2bf16_test4:
+; GFX7-SDAG:       ; %bb.0:
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v4
+; GFX7-SDAG-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; GFX7-SDAG-NEXT:    v_mul_f32_e32 v1, 1.0, v1
+; GFX7-SDAG-NEXT:    v_cndmask_b32_e64 v2, 1.0, 0.5, vcc
+; GFX7-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX7-SDAG-NEXT:    v_cndmask_b32_e64 v3, 1.0, 0.5, vcc
+; GFX7-SDAG-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-SDAG-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-SDAG-NEXT:    v_mul_f32_e32 v1, v1, v3
+; GFX7-SDAG-NEXT:    v_mul_f32_e32 v0, v0, v2
+; GFX7-SDAG-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-SDAG-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-GISEL-LABEL: fmul_select_v2bf16_test4:
+; GFX7-GISEL:       ; %bb.0:
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-GISEL-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GFX7-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v4
+; GFX7-GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, -1, vcc
+; GFX7-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX7-GISEL-NEXT:    v_cndmask_b32_e64 v3, 0, -1, vcc
+; GFX7-GISEL-NEXT:    v_ldexp_f32_e32 v0, v0, v2
+; GFX7-GISEL-NEXT:    v_ldexp_f32_e32 v1, v1, v3
+; GFX7-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX7-GISEL-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX7-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-SDAG-LABEL: fmul_select_v2bf16_test4:
+; GFX9-SDAG:       ; %bb.0:
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v5, 0x3f80
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v6, 0x3f00
+; GFX9-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v4
+; GFX9-SDAG-NEXT:    v_cndmask_b32_e32 v2, v5, v6, vcc
+; GFX9-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v3
+; GFX9-SDAG-NEXT:    v_cndmask_b32_e32 v1, v5, v6, vcc
+; GFX9-SDAG-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
+; GFX9-SDAG-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX9-SDAG-NEXT:    v_mul_f32_e32 v1, v3, v1
+; GFX9-SDAG-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-SDAG-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX9-SDAG-NEXT:    v_bfe_u32 v3, v1, 16, 1
+; GFX9-SDAG-NEXT:    s_movk_i32 s4, 0x7fff
+; GFX9-SDAG-NEXT:    v_mul_f32_e32 v0, v0, v2
+; GFX9-SDAG-NEXT:    v_add3_u32 v3, v3, v1, s4
+; GFX9-SDAG-NEXT:    v_or_b32_e32 v4, 0x400000, v1
+; GFX9-SDAG-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
+; GFX9-SDAG-NEXT:    v_bfe_u32 v2, v0, 16, 1
+; GFX9-SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
+; GFX9-SDAG-NEXT:    v_add3_u32 v2, v2, v0, s4
+; GFX9-SDAG-NEXT:    v_or_b32_e32 v3, 0x400000, v0
+; GFX9-SDAG-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
+; GFX9-SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc
+; GFX9-SDAG-NEXT:    s_mov_b32 s4, 0x7060302
+; GFX9-SDAG-NEXT:    v_perm_b32 v0, v0, v1, s4
+; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: fmul_select_v2bf16_test4:
+; GFX9-GISEL:       ; %bb.0:
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v3
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0xffff8000
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0x7fff
+; GFX9-GISEL-NEXT:    v_med3_i32 v1, v1, v2, v3
+; GFX9-GISEL-NEXT:    v_ldexp_f16_e32 v0, v0, v1
+; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-SDAG-LABEL: fmul_select_v2bf16_test4:
+; GFX10-SDAG:       ; %bb.0:
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SDAG-NEXT:    v_mov_b32_e32 v5, 0x3f00
+; GFX10-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v3
+; GFX10-SDAG-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
+; GFX10-SDAG-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0x3f80, v5, vcc_lo
+; GFX10-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v4
+; GFX10-SDAG-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX10-SDAG-NEXT:    v_cndmask_b32_e32 v2, 0x3f80, v5, vcc_lo
+; GFX10-SDAG-NEXT:    v_mul_f32_e32 v1, v3, v1
+; GFX10-SDAG-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX10-SDAG-NEXT:    v_or_b32_e32 v4, 0x400000, v1
+; GFX10-SDAG-NEXT:    v_mul_f32_e32 v0, v0, v2
+; GFX10-SDAG-NEXT:    v_bfe_u32 v2, v1, 16, 1
+; GFX10-SDAG-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX10-SDAG-NEXT:    v_bfe_u32 v3, v0, 16, 1
+; GFX10-SDAG-NEXT:    v_add3_u32 v2, v2, v1, 0x7fff
+; GFX10-SDAG-NEXT:    v_or_b32_e32 v5, 0x400000, v0
+; GFX10-SDAG-NEXT:    v_add3_u32 v3, v3, v0, 0x7fff
+; GFX10-SDAG-NEXT:    v_cndmask_b32_e32 v1, v2, v4, vcc_lo
+; GFX10-SDAG-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX10-SDAG-NEXT:    v_cndmask_b32_e32 v0, v3, v5, vcc_lo
+; GFX10-SDAG-NEXT:    v_perm_b32 v0, v0, v1, 0x7060302
+; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-GISEL-LABEL: fmul_select_v2bf16_test4:
+; GFX10-GISEL:       ; %bb.0:
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v3
+; GFX10-GISEL-NEXT:    v_mov_b32_e32 v2, 0x7fff
+; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc_lo
+; GFX10-GISEL-NEXT:    v_med3_i32 v1, 0xffff8000, v1, v2
+; GFX10-GISEL-NEXT:    v_ldexp_f16_e32 v0, v0, v1
+; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: fmul_select_v2bf16_test4:
+; GFX11-SDAG:       ; %bb.0:
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT:    v_mov_b32_e32 v5, 0x3f00
+; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v3
+; GFX11-SDAG-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
+; GFX11-SDAG-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0x3f80, v5, vcc_lo
+; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v4
+; GFX11-SDAG-NEXT:    v_cndmask_b32_e32 v2, 0x3f80, v5, vcc_lo
+; GFX11-SDAG-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_dual_mul_f32 v0, v0, v2 :: v_dual_lshlrev_b32 v1, 16, v1
+; GFX11-SDAG-NEXT:    v_or_b32_e32 v5, 0x400000, v0
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-SDAG-NEXT:    v_mul_f32_e32 v1, v3, v1
+; GFX11-SDAG-NEXT:    v_bfe_u32 v3, v0, 16, 1
+; GFX11-SDAG-NEXT:    v_bfe_u32 v2, v1, 16, 1
+; GFX11-SDAG-NEXT:    v_or_b32_e32 v4, 0x400000, v1
+; GFX11-SDAG-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-SDAG-NEXT:    v_add3_u32 v3, v3, v0, 0x7fff
+; GFX11-SDAG-NEXT:    v_add3_u32 v2, v2, v1, 0x7fff
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-SDAG-NEXT:    v_cndmask_b32_e32 v1, v2, v4, vcc_lo
+; GFX11-SDAG-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-SDAG-NEXT:    v_cndmask_b32_e32 v0, v3, v5, vcc_lo
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_perm_b32 v0, v0, v1, 0x7060302
+; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: fmul_select_v2bf16_test4:
+; GFX11-GISEL:       ; %bb.0:
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v3
+; GFX11-GISEL-NEXT:    v_mov_b32_e32 v2, 0x7fff
+; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc_lo
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_med3_i32 v1, 0xffff8000, v1, v2
+; GFX11-GISEL-NEXT:    v_ldexp_f16_e32 v0, v0, v1
+; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %bool = icmp eq <2 x i32> %bool.arg1, %bool.arg2
   %y = select <2 x i1> %bool, <2 x bfloat> <bfloat 5.000000e-01, bfloat 5.000000e-01>, <2 x bfloat> <bfloat 1.000000e+00, bfloat 1.000000e+00>
   %ldexp = fmul <2 x bfloat> %x, %y
@@ -2093,73 +3089,108 @@ define <2 x bfloat> @fmul_select_v2bf16_test4(<2 x bfloat> %x, <2 x i32> %bool.a
 }
 
 define bfloat @fmul_select_bf16_test5(bfloat %x, i32 %bool.arg1, i32 %bool.arg2) {
-; GFX7-LABEL: fmul_select_bf16_test5:
-; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT:    v_mov_b32_e32 v3, 0x41000000
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX7-NEXT:    v_cndmask_b32_e64 v1, v3, 2.0, vcc
-; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: fmul_select_bf16_test5:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v3, 0x4100
-; GFX9-NEXT:    v_mov_b32_e32 v4, 0x4000
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
-; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX9-NEXT:    v_bfe_u32 v1, v0, 16, 1
-; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
-; GFX9-NEXT:    v_add3_u32 v1, v1, v0, s4
-; GFX9-NEXT:    v_or_b32_e32 v2, 0x400000, v0
-; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
-; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX1030-LABEL: fmul_select_bf16_test5:
-; GFX1030:       ; %bb.0:
-; GFX1030-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1030-NEXT:    v_mov_b32_e32 v3, 0x4000
-; GFX1030-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX1030-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX1030-NEXT:    v_cndmask_b32_e32 v1, 0x4100, v3, vcc_lo
-; GFX1030-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX1030-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX1030-NEXT:    v_bfe_u32 v1, v0, 16, 1
-; GFX1030-NEXT:    v_or_b32_e32 v2, 0x400000, v0
-; GFX1030-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX1030-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
-; GFX1030-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
-; GFX1030-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX1030-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX1100-LABEL: fmul_select_bf16_test5:
-; GFX1100:       ; %bb.0:
-; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-NEXT:    v_dual_mov_b32 v3, 0x4000 :: v_dual_lshlrev_b32 v0, 16, v0
-; GFX1100-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-NEXT:    v_cndmask_b32_e32 v1, 0x4100, v3, vcc_lo
-; GFX1100-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX1100-NEXT:    v_bfe_u32 v1, v0, 16, 1
-; GFX1100-NEXT:    v_or_b32_e32 v2, 0x400000, v0
-; GFX1100-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
-; GFX1100-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1100-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX1100-NEXT:    s_setpc_b64 s[30:31]
+; GFX7-SDAG-LABEL: fmul_select_bf16_test5:
+; GFX7-SDAG:       ; %bb.0:
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-SDAG-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; GFX7-SDAG-NEXT:    v_mov_b32_e32 v3, 0x41000000
+; GFX7-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX7-SDAG-NEXT:    v_cndmask_b32_e64 v1, v3, 2.0, vcc
+; GFX7-SDAG-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-SDAG-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX7-SDAG-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-GISEL-LABEL: fmul_select_bf16_test5:
+; GFX7-GISEL:       ; %bb.0:
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX7-GISEL-NEXT:    v_cndmask_b32_e64 v1, 3, 1, vcc
+; GFX7-GISEL-NEXT:    v_ldexp_f32_e32 v0, v0, v1
+; GFX7-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX7-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-SDAG-LABEL: fmul_select_bf16_test5:
+; GFX9-SDAG:       ; %bb.0:
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v3, 0x4100
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v4, 0x4000
+; GFX9-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX9-SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
+; GFX9-SDAG-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX9-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX9-SDAG-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX9-SDAG-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX9-SDAG-NEXT:    s_movk_i32 s4, 0x7fff
+; GFX9-SDAG-NEXT:    v_add3_u32 v1, v1, v0, s4
+; GFX9-SDAG-NEXT:    v_or_b32_e32 v2, 0x400000, v0
+; GFX9-SDAG-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
+; GFX9-SDAG-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
+; GFX9-SDAG-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: fmul_select_bf16_test5:
+; GFX9-GISEL:       ; %bb.0:
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v1, 3, 1, vcc
+; GFX9-GISEL-NEXT:    v_ldexp_f16_e32 v0, v0, v1
+; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-SDAG-LABEL: fmul_select_bf16_test5:
+; GFX10-SDAG:       ; %bb.0:
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SDAG-NEXT:    v_mov_b32_e32 v3, 0x4000
+; GFX10-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX10-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX10-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0x4100, v3, vcc_lo
+; GFX10-SDAG-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX10-SDAG-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX10-SDAG-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX10-SDAG-NEXT:    v_or_b32_e32 v2, 0x400000, v0
+; GFX10-SDAG-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX10-SDAG-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
+; GFX10-SDAG-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX10-SDAG-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-GISEL-LABEL: fmul_select_bf16_test5:
+; GFX10-GISEL:       ; %bb.0:
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v1, 3, 1, vcc_lo
+; GFX10-GISEL-NEXT:    v_ldexp_f16_e32 v0, v0, v1
+; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: fmul_select_bf16_test5:
+; GFX11-SDAG:       ; %bb.0:
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT:    v_dual_mov_b32 v3, 0x4000 :: v_dual_lshlrev_b32 v0, 16, v0
+; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0x4100, v3, vcc_lo
+; GFX11-SDAG-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX11-SDAG-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX11-SDAG-NEXT:    v_or_b32_e32 v2, 0x400000, v0
+; GFX11-SDAG-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
+; GFX11-SDAG-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: fmul_select_bf16_test5:
+; GFX11-GISEL:       ; %bb.0:
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v1, 3, 1, vcc_lo
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_ldexp_f16_e32 v0, v0, v1
+; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %bool = icmp eq i32 %bool.arg1, %bool.arg2
   %y = select i1 %bool, bfloat 2.000000e+00, bfloat 8.000000e+00
   %ldexp = fmul bfloat %x, %y
@@ -2167,74 +3198,116 @@ define bfloat @fmul_select_bf16_test5(bfloat %x, i32 %bool.arg1, i32 %bool.arg2)
 }
 
 define bfloat @fmul_select_bf16_test6(bfloat %x, i32 %bool.arg1, i32 %bool.arg2) {
-; GFX7-LABEL: fmul_select_bf16_test6:
-; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT:    v_mov_b32_e32 v3, 0x40400000
-; GFX7-NEXT:    v_mov_b32_e32 v4, 0xc1000000
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX7-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
-; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: fmul_select_bf16_test6:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v3, 0x4040
-; GFX9-NEXT:    v_mov_b32_e32 v4, 0xffffc100
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
-; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX9-NEXT:    v_bfe_u32 v1, v0, 16, 1
-; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
-; GFX9-NEXT:    v_add3_u32 v1, v1, v0, s4
-; GFX9-NEXT:    v_or_b32_e32 v2, 0x400000, v0
-; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
-; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX1030-LABEL: fmul_select_bf16_test6:
-; GFX1030:       ; %bb.0:
-; GFX1030-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1030-NEXT:    v_mov_b32_e32 v3, 0xffffc100
-; GFX1030-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX1030-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX1030-NEXT:    v_cndmask_b32_e32 v1, 0x4040, v3, vcc_lo
-; GFX1030-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX1030-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX1030-NEXT:    v_bfe_u32 v1, v0, 16, 1
-; GFX1030-NEXT:    v_or_b32_e32 v2, 0x400000, v0
-; GFX1030-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX1030-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
-; GFX1030-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
-; GFX1030-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX1030-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX1100-LABEL: fmul_select_bf16_test6:
-; GFX1100:       ; %bb.0:
-; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-NEXT:    v_dual_mov_b32 v3, 0xffffc100 :: v_dual_lshlrev_b32 v0, 16, v0
-; GFX1100-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-NEXT:    v_cndmask_b32_e32 v1, 0x4040, v3, vcc_lo
-; GFX1100-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX1100-NEXT:    v_bfe_u32 v1, v0, 16, 1
-; GFX1100-NEXT:    v_or_b32_e32 v2, 0x400000, v0
-; GFX1100-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
-; GFX1100-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1100-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX1100-NEXT:    s_setpc_b64 s[30:31]
+; GFX7-SDAG-LABEL: fmul_select_bf16_test6:
+; GFX7-SDAG:       ; %bb.0:
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-SDAG-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; GFX7-SDAG-NEXT:    v_mov_b32_e32 v3, 0x40400000
+; GFX7-SDAG-NEXT:    v_mov_b32_e32 v4, 0xc1000000
+; GFX7-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX7-SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
+; GFX7-SDAG-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-SDAG-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX7-SDAG-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-GISEL-LABEL: fmul_select_bf16_test6:
+; GFX7-GISEL:       ; %bb.0:
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-GISEL-NEXT:    v_mov_b32_e32 v3, 0xc100
+; GFX7-GISEL-NEXT:    v_mov_b32_e32 v4, 0x4040
+; GFX7-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX7-GISEL-NEXT:    v_cndmask_b32_e32 v1, v4, v3, vcc
+; GFX7-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-GISEL-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GFX7-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX7-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX7-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-SDAG-LABEL: fmul_select_bf16_test6:
+; GFX9-SDAG:       ; %bb.0:
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v3, 0x4040
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v4, 0xffffc100
+; GFX9-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX9-SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
+; GFX9-SDAG-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX9-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX9-SDAG-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX9-SDAG-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX9-SDAG-NEXT:    s_movk_i32 s4, 0x7fff
+; GFX9-SDAG-NEXT:    v_add3_u32 v1, v1, v0, s4
+; GFX9-SDAG-NEXT:    v_or_b32_e32 v2, 0x400000, v0
+; GFX9-SDAG-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
+; GFX9-SDAG-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
+; GFX9-SDAG-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: fmul_select_bf16_test6:
+; GFX9-GISEL:       ; %bb.0:
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0xc100
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v4, 0x4040
+; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v1, v4, v3, vcc
+; GFX9-GISEL-NEXT:    v_mul_f16_e32 v0, v0, v1
+; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-SDAG-LABEL: fmul_select_bf16_test6:
+; GFX10-SDAG:       ; %bb.0:
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SDAG-NEXT:    v_mov_b32_e32 v3, 0xffffc100
+; GFX10-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX10-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX10-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0x4040, v3, vcc_lo
+; GFX10-SDAG-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX10-SDAG-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX10-SDAG-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX10-SDAG-NEXT:    v_or_b32_e32 v2, 0x400000, v0
+; GFX10-SDAG-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX10-SDAG-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
+; GFX10-SDAG-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX10-SDAG-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-GISEL-LABEL: fmul_select_bf16_test6:
+; GFX10-GISEL:       ; %bb.0:
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-GISEL-NEXT:    v_mov_b32_e32 v3, 0x4040
+; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v1, v3, 0xc100, vcc_lo
+; GFX10-GISEL-NEXT:    v_mul_f16_e32 v0, v0, v1
+; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: fmul_select_bf16_test6:
+; GFX11-SDAG:       ; %bb.0:
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT:    v_dual_mov_b32 v3, 0xffffc100 :: v_dual_lshlrev_b32 v0, 16, v0
+; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0x4040, v3, vcc_lo
+; GFX11-SDAG-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX11-SDAG-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX11-SDAG-NEXT:    v_or_b32_e32 v2, 0x400000, v0
+; GFX11-SDAG-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
+; GFX11-SDAG-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: fmul_select_bf16_test6:
+; GFX11-GISEL:       ; %bb.0:
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT:    v_mov_b32_e32 v3, 0x4040
+; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v1, v3, 0xc100, vcc_lo
+; GFX11-GISEL-NEXT:    v_mul_f16_e32 v0, v0, v1
+; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %bool = icmp eq i32 %bool.arg1, %bool.arg2
   %y = select i1 %bool, bfloat -8.000000e+00, bfloat 3.000000e+00
   %ldexp = fmul bfloat %x, %y
@@ -2242,73 +3315,115 @@ define bfloat @fmul_select_bf16_test6(bfloat %x, i32 %bool.arg1, i32 %bool.arg2)
 }
 
 define bfloat @fmul_select_bf16_test7(bfloat %x, i32 %bool.arg1, i32 %bool.arg2) {
-; GFX7-LABEL: fmul_select_bf16_test7:
-; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT:    v_mov_b32_e32 v3, 0x41000000
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX7-NEXT:    v_cndmask_b32_e32 v1, -4.0, v3, vcc
-; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: fmul_select_bf16_test7:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v3, 0xffffc080
-; GFX9-NEXT:    v_mov_b32_e32 v4, 0x4100
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
-; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX9-NEXT:    v_bfe_u32 v1, v0, 16, 1
-; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
-; GFX9-NEXT:    v_add3_u32 v1, v1, v0, s4
-; GFX9-NEXT:    v_or_b32_e32 v2, 0x400000, v0
-; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
-; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX1030-LABEL: fmul_select_bf16_test7:
-; GFX1030:       ; %bb.0:
-; GFX1030-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1030-NEXT:    v_mov_b32_e32 v3, 0x4100
-; GFX1030-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX1030-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX1030-NEXT:    v_cndmask_b32_e32 v1, 0xffffc080, v3, vcc_lo
-; GFX1030-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX1030-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX1030-NEXT:    v_bfe_u32 v1, v0, 16, 1
-; GFX1030-NEXT:    v_or_b32_e32 v2, 0x400000, v0
-; GFX1030-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX1030-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
-; GFX1030-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
-; GFX1030-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX1030-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX1100-LABEL: fmul_select_bf16_test7:
-; GFX1100:       ; %bb.0:
-; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-NEXT:    v_dual_mov_b32 v3, 0x4100 :: v_dual_lshlrev_b32 v0, 16, v0
-; GFX1100-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-NEXT:    v_cndmask_b32_e32 v1, 0xffffc080, v3, vcc_lo
-; GFX1100-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX1100-NEXT:    v_bfe_u32 v1, v0, 16, 1
-; GFX1100-NEXT:    v_or_b32_e32 v2, 0x400000, v0
-; GFX1100-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
-; GFX1100-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1100-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX1100-NEXT:    s_setpc_b64 s[30:31]
+; GFX7-SDAG-LABEL: fmul_select_bf16_test7:
+; GFX7-SDAG:       ; %bb.0:
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-SDAG-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; GFX7-SDAG-NEXT:    v_mov_b32_e32 v3, 0x41000000
+; GFX7-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX7-SDAG-NEXT:    v_cndmask_b32_e32 v1, -4.0, v3, vcc
+; GFX7-SDAG-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-SDAG-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX7-SDAG-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-GISEL-LABEL: fmul_select_bf16_test7:
+; GFX7-GISEL:       ; %bb.0:
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-GISEL-NEXT:    v_mov_b32_e32 v3, 0x4100
+; GFX7-GISEL-NEXT:    v_mov_b32_e32 v4, 0xc080
+; GFX7-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX7-GISEL-NEXT:    v_cndmask_b32_e32 v1, v4, v3, vcc
+; GFX7-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-GISEL-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GFX7-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX7-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX7-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-SDAG-LABEL: fmul_select_bf16_test7:
+; GFX9-SDAG:       ; %bb.0:
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v3, 0xffffc080
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v4, 0x4100
+; GFX9-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX9-SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
+; GFX9-SDAG-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX9-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX9-SDAG-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX9-SDAG-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX9-SDAG-NEXT:    s_movk_i32 s4, 0x7fff
+; GFX9-SDAG-NEXT:    v_add3_u32 v1, v1, v0, s4
+; GFX9-SDAG-NEXT:    v_or_b32_e32 v2, 0x400000, v0
+; GFX9-SDAG-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
+; GFX9-SDAG-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
+; GFX9-SDAG-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: fmul_select_bf16_test7:
+; GFX9-GISEL:       ; %bb.0:
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0x4100
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v4, 0xc080
+; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v1, v4, v3, vcc
+; GFX9-GISEL-NEXT:    v_mul_f16_e32 v0, v0, v1
+; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-SDAG-LABEL: fmul_select_bf16_test7:
+; GFX10-SDAG:       ; %bb.0:
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SDAG-NEXT:    v_mov_b32_e32 v3, 0x4100
+; GFX10-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX10-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX10-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0xffffc080, v3, vcc_lo
+; GFX10-SDAG-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX10-SDAG-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX10-SDAG-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX10-SDAG-NEXT:    v_or_b32_e32 v2, 0x400000, v0
+; GFX10-SDAG-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX10-SDAG-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
+; GFX10-SDAG-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX10-SDAG-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-GISEL-LABEL: fmul_select_bf16_test7:
+; GFX10-GISEL:       ; %bb.0:
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-GISEL-NEXT:    v_mov_b32_e32 v3, 0xc080
+; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v1, v3, 0x4100, vcc_lo
+; GFX10-GISEL-NEXT:    v_mul_f16_e32 v0, v0, v1
+; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: fmul_select_bf16_test7:
+; GFX11-SDAG:       ; %bb.0:
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT:    v_dual_mov_b32 v3, 0x4100 :: v_dual_lshlrev_b32 v0, 16, v0
+; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0xffffc080, v3, vcc_lo
+; GFX11-SDAG-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX11-SDAG-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX11-SDAG-NEXT:    v_or_b32_e32 v2, 0x400000, v0
+; GFX11-SDAG-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
+; GFX11-SDAG-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: fmul_select_bf16_test7:
+; GFX11-GISEL:       ; %bb.0:
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT:    v_mov_b32_e32 v3, 0xc080
+; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v1, v3, 0x4100, vcc_lo
+; GFX11-GISEL-NEXT:    v_mul_f16_e32 v0, v0, v1
+; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %bool = icmp eq i32 %bool.arg1, %bool.arg2
   %y = select i1 %bool, bfloat 8.000000e+00, bfloat -4.000000e+00
   %ldexp = fmul bfloat %x, %y
@@ -2316,73 +3431,111 @@ define bfloat @fmul_select_bf16_test7(bfloat %x, i32 %bool.arg1, i32 %bool.arg2)
 }
 
 define bfloat @fmul_select_bf16_test8(bfloat %x, i32 %bool.arg1, i32 %bool.arg2) {
-; GFX7-LABEL: fmul_select_bf16_test8:
-; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
-; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 31, v1
-; GFX7-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: fmul_select_bf16_test8:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
-; GFX9-NEXT:    v_mov_b32_e32 v2, 15
-; GFX9-NEXT:    v_lshlrev_b16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX9-NEXT:    v_bfe_u32 v1, v0, 16, 1
-; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
-; GFX9-NEXT:    v_add3_u32 v1, v1, v0, s4
-; GFX9-NEXT:    v_or_b32_e32 v2, 0x400000, v0
-; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
-; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX1030-LABEL: fmul_select_bf16_test8:
-; GFX1030:       ; %bb.0:
-; GFX1030-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1030-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX1030-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX1030-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
-; GFX1030-NEXT:    v_lshlrev_b16 v1, 15, v1
-; GFX1030-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX1030-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX1030-NEXT:    v_bfe_u32 v1, v0, 16, 1
-; GFX1030-NEXT:    v_or_b32_e32 v2, 0x400000, v0
-; GFX1030-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX1030-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
-; GFX1030-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
-; GFX1030-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX1030-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX1100-LABEL: fmul_select_bf16_test8:
-; GFX1100:       ; %bb.0:
-; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX1100-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX1100-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-NEXT:    v_lshlrev_b16 v1, 15, v1
-; GFX1100-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX1100-NEXT:    v_bfe_u32 v1, v0, 16, 1
-; GFX1100-NEXT:    v_or_b32_e32 v2, 0x400000, v0
-; GFX1100-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
-; GFX1100-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1100-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX1100-NEXT:    s_setpc_b64 s[30:31]
+; GFX7-SDAG-LABEL: fmul_select_bf16_test8:
+; GFX7-SDAG:       ; %bb.0:
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX7-SDAG-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; GFX7-SDAG-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX7-SDAG-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-SDAG-NEXT:    v_lshlrev_b32_e32 v1, 31, v1
+; GFX7-SDAG-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX7-SDAG-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-GISEL-LABEL: fmul_select_bf16_test8:
+; GFX7-GISEL:       ; %bb.0:
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-GISEL-NEXT:    v_mov_b32_e32 v3, 0x8000
+; GFX7-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX7-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v3, vcc
+; GFX7-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-GISEL-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GFX7-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX7-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX7-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-SDAG-LABEL: fmul_select_bf16_test8:
+; GFX9-SDAG:       ; %bb.0:
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX9-SDAG-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v2, 15
+; GFX9-SDAG-NEXT:    v_lshlrev_b16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX9-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX9-SDAG-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX9-SDAG-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX9-SDAG-NEXT:    s_movk_i32 s4, 0x7fff
+; GFX9-SDAG-NEXT:    v_add3_u32 v1, v1, v0, s4
+; GFX9-SDAG-NEXT:    v_or_b32_e32 v2, 0x400000, v0
+; GFX9-SDAG-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
+; GFX9-SDAG-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
+; GFX9-SDAG-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: fmul_select_bf16_test8:
+; GFX9-GISEL:       ; %bb.0:
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0x8000
+; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v3, vcc
+; GFX9-GISEL-NEXT:    v_mul_f16_e32 v0, v0, v1
+; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-SDAG-LABEL: fmul_select_bf16_test8:
+; GFX10-SDAG:       ; %bb.0:
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX10-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX10-SDAG-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
+; GFX10-SDAG-NEXT:    v_lshlrev_b16 v1, 15, v1
+; GFX10-SDAG-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX10-SDAG-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX10-SDAG-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX10-SDAG-NEXT:    v_or_b32_e32 v2, 0x400000, v0
+; GFX10-SDAG-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX10-SDAG-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
+; GFX10-SDAG-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX10-SDAG-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-GISEL-LABEL: fmul_select_bf16_test8:
+; GFX10-GISEL:       ; %bb.0:
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 0x8000, vcc_lo
+; GFX10-GISEL-NEXT:    v_mul_f16_e32 v0, v0, v1
+; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: fmul_select_bf16_test8:
+; GFX11-SDAG:       ; %bb.0:
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-SDAG-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_lshlrev_b16 v1, 15, v1
+; GFX11-SDAG-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX11-SDAG-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX11-SDAG-NEXT:    v_or_b32_e32 v2, 0x400000, v0
+; GFX11-SDAG-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
+; GFX11-SDAG-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: fmul_select_bf16_test8:
+; GFX11-GISEL:       ; %bb.0:
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 0x8000, vcc_lo
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_mul_f16_e32 v0, v0, v1
+; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %bool = icmp eq i32 %bool.arg1, %bool.arg2
   %y = select i1 %bool, bfloat -0.000000e+00, bfloat 0.000000e+00
   %ldexp = fmul bfloat %x, %y
@@ -2390,74 +3543,121 @@ define bfloat @fmul_select_bf16_test8(bfloat %x, i32 %bool.arg1, i32 %bool.arg2)
 }
 
 define bfloat @fmul_select_bf16_test9(bfloat %x, i32 %bool.arg1, i32 %bool.arg2) {
-; GFX7-LABEL: fmul_select_bf16_test9:
-; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT:    v_mov_b32_e32 v3, 0xc2000000
-; GFX7-NEXT:    v_mov_b32_e32 v4, 0xc1800000
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX7-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
-; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: fmul_select_bf16_test9:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v3, 0xffffc200
-; GFX9-NEXT:    v_mov_b32_e32 v4, 0xffffc180
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
-; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX9-NEXT:    v_bfe_u32 v1, v0, 16, 1
-; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
-; GFX9-NEXT:    v_add3_u32 v1, v1, v0, s4
-; GFX9-NEXT:    v_or_b32_e32 v2, 0x400000, v0
-; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
-; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX1030-LABEL: fmul_select_bf16_test9:
-; GFX1030:       ; %bb.0:
-; GFX1030-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1030-NEXT:    v_mov_b32_e32 v3, 0xffffc180
-; GFX1030-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX1030-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX1030-NEXT:    v_cndmask_b32_e32 v1, 0xffffc200, v3, vcc_lo
-; GFX1030-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX1030-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX1030-NEXT:    v_bfe_u32 v1, v0, 16, 1
-; GFX1030-NEXT:    v_or_b32_e32 v2, 0x400000, v0
-; GFX1030-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX1030-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
-; GFX1030-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
-; GFX1030-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX1030-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX1100-LABEL: fmul_select_bf16_test9:
-; GFX1100:       ; %bb.0:
-; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-NEXT:    v_dual_mov_b32 v3, 0xffffc180 :: v_dual_lshlrev_b32 v0, 16, v0
-; GFX1100-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-NEXT:    v_cndmask_b32_e32 v1, 0xffffc200, v3, vcc_lo
-; GFX1100-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX1100-NEXT:    v_bfe_u32 v1, v0, 16, 1
-; GFX1100-NEXT:    v_or_b32_e32 v2, 0x400000, v0
-; GFX1100-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
-; GFX1100-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1100-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX1100-NEXT:    s_setpc_b64 s[30:31]
+; GFX7-SDAG-LABEL: fmul_select_bf16_test9:
+; GFX7-SDAG:       ; %bb.0:
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-SDAG-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; GFX7-SDAG-NEXT:    v_mov_b32_e32 v3, 0xc2000000
+; GFX7-SDAG-NEXT:    v_mov_b32_e32 v4, 0xc1800000
+; GFX7-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX7-SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
+; GFX7-SDAG-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-SDAG-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX7-SDAG-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-GISEL-LABEL: fmul_select_bf16_test9:
+; GFX7-GISEL:       ; %bb.0:
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-GISEL-NEXT:    v_cvt_f32_f16_e64 v0, -v0
+; GFX7-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX7-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc
+; GFX7-GISEL-NEXT:    v_add_i32_e32 v1, vcc, 5, v1
+; GFX7-GISEL-NEXT:    v_ldexp_f32_e32 v0, v0, v1
+; GFX7-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX7-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-SDAG-LABEL: fmul_select_bf16_test9:
+; GFX9-SDAG:       ; %bb.0:
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v3, 0xffffc200
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v4, 0xffffc180
+; GFX9-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX9-SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
+; GFX9-SDAG-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX9-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX9-SDAG-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX9-SDAG-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX9-SDAG-NEXT:    s_movk_i32 s4, 0x7fff
+; GFX9-SDAG-NEXT:    v_add3_u32 v1, v1, v0, s4
+; GFX9-SDAG-NEXT:    v_or_b32_e32 v2, 0x400000, v0
+; GFX9-SDAG-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
+; GFX9-SDAG-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
+; GFX9-SDAG-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: fmul_select_bf16_test9:
+; GFX9-GISEL:       ; %bb.0:
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc
+; GFX9-GISEL-NEXT:    v_add_u32_e32 v1, 5, v1
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0xffff8000
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0x7fff
+; GFX9-GISEL-NEXT:    v_med3_i32 v1, v1, v2, v3
+; GFX9-GISEL-NEXT:    v_ldexp_f16_e64 v0, -v0, v1
+; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-SDAG-LABEL: fmul_select_bf16_test9:
+; GFX10-SDAG:       ; %bb.0:
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SDAG-NEXT:    v_mov_b32_e32 v3, 0xffffc180
+; GFX10-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX10-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX10-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0xffffc200, v3, vcc_lo
+; GFX10-SDAG-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX10-SDAG-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX10-SDAG-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX10-SDAG-NEXT:    v_or_b32_e32 v2, 0x400000, v0
+; GFX10-SDAG-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX10-SDAG-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
+; GFX10-SDAG-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX10-SDAG-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-GISEL-LABEL: fmul_select_bf16_test9:
+; GFX10-GISEL:       ; %bb.0:
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX10-GISEL-NEXT:    v_mov_b32_e32 v2, 0x7fff
+; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc_lo
+; GFX10-GISEL-NEXT:    v_add_nc_u32_e32 v1, 5, v1
+; GFX10-GISEL-NEXT:    v_med3_i32 v1, 0xffff8000, v1, v2
+; GFX10-GISEL-NEXT:    v_ldexp_f16_e64 v0, -v0, v1
+; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: fmul_select_bf16_test9:
+; GFX11-SDAG:       ; %bb.0:
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT:    v_dual_mov_b32 v3, 0xffffc180 :: v_dual_lshlrev_b32 v0, 16, v0
+; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0xffffc200, v3, vcc_lo
+; GFX11-SDAG-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX11-SDAG-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX11-SDAG-NEXT:    v_or_b32_e32 v2, 0x400000, v0
+; GFX11-SDAG-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
+; GFX11-SDAG-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: fmul_select_bf16_test9:
+; GFX11-GISEL:       ; %bb.0:
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-GISEL-NEXT:    v_mov_b32_e32 v2, 0x7fff
+; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc_lo
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_add_nc_u32_e32 v1, 5, v1
+; GFX11-GISEL-NEXT:    v_med3_i32 v1, 0xffff8000, v1, v2
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_ldexp_f16_e64 v0, -v0, v1
+; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %bool = icmp eq i32 %bool.arg1, %bool.arg2
   %y = select i1 %bool, bfloat -1.600000e+01, bfloat -3.200000e+01
   %ldexp = fmul bfloat %x, %y
@@ -2465,74 +3665,111 @@ define bfloat @fmul_select_bf16_test9(bfloat %x, i32 %bool.arg1, i32 %bool.arg2)
 }
 
 define bfloat @fmul_select_bf16_test10_sel_log2val_pos65_pos56(bfloat %x, i32 %bool.arg1, i32 %bool.arg2) {
-; GFX7-LABEL: fmul_select_bf16_test10_sel_log2val_pos65_pos56:
-; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT:    v_mov_b32_e32 v3, 0xdb800000
-; GFX7-NEXT:    v_bfrev_b32_e32 v4, 7
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX7-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
-; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: fmul_select_bf16_test10_sel_log2val_pos65_pos56:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v3, 0xffffdb80
-; GFX9-NEXT:    v_mov_b32_e32 v4, 0xffffe000
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
-; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX9-NEXT:    v_bfe_u32 v1, v0, 16, 1
-; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
-; GFX9-NEXT:    v_add3_u32 v1, v1, v0, s4
-; GFX9-NEXT:    v_or_b32_e32 v2, 0x400000, v0
-; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
-; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX1030-LABEL: fmul_select_bf16_test10_sel_log2val_pos65_pos56:
-; GFX1030:       ; %bb.0:
-; GFX1030-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1030-NEXT:    v_mov_b32_e32 v3, 0xffffe000
-; GFX1030-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX1030-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX1030-NEXT:    v_cndmask_b32_e32 v1, 0xffffdb80, v3, vcc_lo
-; GFX1030-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX1030-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX1030-NEXT:    v_bfe_u32 v1, v0, 16, 1
-; GFX1030-NEXT:    v_or_b32_e32 v2, 0x400000, v0
-; GFX1030-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX1030-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
-; GFX1030-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
-; GFX1030-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX1030-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX1100-LABEL: fmul_select_bf16_test10_sel_log2val_pos65_pos56:
-; GFX1100:       ; %bb.0:
-; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-NEXT:    v_dual_mov_b32 v3, 0xffffe000 :: v_dual_lshlrev_b32 v0, 16, v0
-; GFX1100-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-NEXT:    v_cndmask_b32_e32 v1, 0xffffdb80, v3, vcc_lo
-; GFX1100-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX1100-NEXT:    v_bfe_u32 v1, v0, 16, 1
-; GFX1100-NEXT:    v_or_b32_e32 v2, 0x400000, v0
-; GFX1100-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
-; GFX1100-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1100-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX1100-NEXT:    s_setpc_b64 s[30:31]
+; GFX7-SDAG-LABEL: fmul_select_bf16_test10_sel_log2val_pos65_pos56:
+; GFX7-SDAG:       ; %bb.0:
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-SDAG-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; GFX7-SDAG-NEXT:    v_mov_b32_e32 v3, 0xdb800000
+; GFX7-SDAG-NEXT:    v_bfrev_b32_e32 v4, 7
+; GFX7-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX7-SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
+; GFX7-SDAG-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-SDAG-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX7-SDAG-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-GISEL-LABEL: fmul_select_bf16_test10_sel_log2val_pos65_pos56:
+; GFX7-GISEL:       ; %bb.0:
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-GISEL-NEXT:    v_cvt_f32_f16_e64 v0, -v0
+; GFX7-GISEL-NEXT:    v_mov_b32_e32 v3, 0x41
+; GFX7-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX7-GISEL-NEXT:    v_cndmask_b32_e32 v1, 56, v3, vcc
+; GFX7-GISEL-NEXT:    v_ldexp_f32_e32 v0, v0, v1
+; GFX7-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX7-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-SDAG-LABEL: fmul_select_bf16_test10_sel_log2val_pos65_pos56:
+; GFX9-SDAG:       ; %bb.0:
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v3, 0xffffdb80
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v4, 0xffffe000
+; GFX9-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX9-SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
+; GFX9-SDAG-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX9-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX9-SDAG-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX9-SDAG-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX9-SDAG-NEXT:    s_movk_i32 s4, 0x7fff
+; GFX9-SDAG-NEXT:    v_add3_u32 v1, v1, v0, s4
+; GFX9-SDAG-NEXT:    v_or_b32_e32 v2, 0x400000, v0
+; GFX9-SDAG-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
+; GFX9-SDAG-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
+; GFX9-SDAG-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: fmul_select_bf16_test10_sel_log2val_pos65_pos56:
+; GFX9-GISEL:       ; %bb.0:
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0x41
+; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v1, 56, v3, vcc
+; GFX9-GISEL-NEXT:    v_ldexp_f16_e64 v0, -v0, v1
+; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-SDAG-LABEL: fmul_select_bf16_test10_sel_log2val_pos65_pos56:
+; GFX10-SDAG:       ; %bb.0:
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SDAG-NEXT:    v_mov_b32_e32 v3, 0xffffe000
+; GFX10-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX10-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX10-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0xffffdb80, v3, vcc_lo
+; GFX10-SDAG-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX10-SDAG-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX10-SDAG-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX10-SDAG-NEXT:    v_or_b32_e32 v2, 0x400000, v0
+; GFX10-SDAG-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX10-SDAG-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
+; GFX10-SDAG-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX10-SDAG-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-GISEL-LABEL: fmul_select_bf16_test10_sel_log2val_pos65_pos56:
+; GFX10-GISEL:       ; %bb.0:
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v1, 56, 0x41, vcc_lo
+; GFX10-GISEL-NEXT:    v_ldexp_f16_e64 v0, -v0, v1
+; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: fmul_select_bf16_test10_sel_log2val_pos65_pos56:
+; GFX11-SDAG:       ; %bb.0:
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT:    v_dual_mov_b32 v3, 0xffffe000 :: v_dual_lshlrev_b32 v0, 16, v0
+; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0xffffdb80, v3, vcc_lo
+; GFX11-SDAG-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX11-SDAG-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX11-SDAG-NEXT:    v_or_b32_e32 v2, 0x400000, v0
+; GFX11-SDAG-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
+; GFX11-SDAG-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: fmul_select_bf16_test10_sel_log2val_pos65_pos56:
+; GFX11-GISEL:       ; %bb.0:
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v1, 56, 0x41, vcc_lo
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_ldexp_f16_e64 v0, -v0, v1
+; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %bool = icmp eq i32 %bool.arg1, %bool.arg2
   %y = select i1 %bool, bfloat 0xRE000, bfloat 0xRDB80
   %ldexp = fmul bfloat %x, %y
@@ -2540,74 +3777,111 @@ define bfloat @fmul_select_bf16_test10_sel_log2val_pos65_pos56(bfloat %x, i32 %b
 }
 
 define bfloat @fmul_select_bf16_test11_sel_log2val_neg22_pos25(bfloat %x, i32 %bool.arg1, i32 %bool.arg2) {
-; GFX7-LABEL: fmul_select_bf16_test11_sel_log2val_neg22_pos25:
-; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT:    v_bfrev_b32_e32 v3, 50
-; GFX7-NEXT:    v_mov_b32_e32 v4, 0x34800000
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX7-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
-; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: fmul_select_bf16_test11_sel_log2val_neg22_pos25:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v3, 0x4c00
-; GFX9-NEXT:    v_mov_b32_e32 v4, 0x3480
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
-; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX9-NEXT:    v_bfe_u32 v1, v0, 16, 1
-; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
-; GFX9-NEXT:    v_add3_u32 v1, v1, v0, s4
-; GFX9-NEXT:    v_or_b32_e32 v2, 0x400000, v0
-; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
-; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX1030-LABEL: fmul_select_bf16_test11_sel_log2val_neg22_pos25:
-; GFX1030:       ; %bb.0:
-; GFX1030-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1030-NEXT:    v_mov_b32_e32 v3, 0x3480
-; GFX1030-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX1030-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX1030-NEXT:    v_cndmask_b32_e32 v1, 0x4c00, v3, vcc_lo
-; GFX1030-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX1030-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX1030-NEXT:    v_bfe_u32 v1, v0, 16, 1
-; GFX1030-NEXT:    v_or_b32_e32 v2, 0x400000, v0
-; GFX1030-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX1030-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
-; GFX1030-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
-; GFX1030-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX1030-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX1100-LABEL: fmul_select_bf16_test11_sel_log2val_neg22_pos25:
-; GFX1100:       ; %bb.0:
-; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-NEXT:    v_dual_mov_b32 v3, 0x3480 :: v_dual_lshlrev_b32 v0, 16, v0
-; GFX1100-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-NEXT:    v_cndmask_b32_e32 v1, 0x4c00, v3, vcc_lo
-; GFX1100-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX1100-NEXT:    v_bfe_u32 v1, v0, 16, 1
-; GFX1100-NEXT:    v_or_b32_e32 v2, 0x400000, v0
-; GFX1100-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
-; GFX1100-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1100-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX1100-NEXT:    s_setpc_b64 s[30:31]
+; GFX7-SDAG-LABEL: fmul_select_bf16_test11_sel_log2val_neg22_pos25:
+; GFX7-SDAG:       ; %bb.0:
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-SDAG-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; GFX7-SDAG-NEXT:    v_bfrev_b32_e32 v3, 50
+; GFX7-SDAG-NEXT:    v_mov_b32_e32 v4, 0x34800000
+; GFX7-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX7-SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
+; GFX7-SDAG-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-SDAG-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX7-SDAG-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-GISEL-LABEL: fmul_select_bf16_test11_sel_log2val_neg22_pos25:
+; GFX7-GISEL:       ; %bb.0:
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-GISEL-NEXT:    v_not_b32_e32 v3, 21
+; GFX7-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX7-GISEL-NEXT:    v_cndmask_b32_e32 v1, 25, v3, vcc
+; GFX7-GISEL-NEXT:    v_ldexp_f32_e32 v0, v0, v1
+; GFX7-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX7-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-SDAG-LABEL: fmul_select_bf16_test11_sel_log2val_neg22_pos25:
+; GFX9-SDAG:       ; %bb.0:
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v3, 0x4c00
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v4, 0x3480
+; GFX9-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX9-SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
+; GFX9-SDAG-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX9-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX9-SDAG-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX9-SDAG-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX9-SDAG-NEXT:    s_movk_i32 s4, 0x7fff
+; GFX9-SDAG-NEXT:    v_add3_u32 v1, v1, v0, s4
+; GFX9-SDAG-NEXT:    v_or_b32_e32 v2, 0x400000, v0
+; GFX9-SDAG-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
+; GFX9-SDAG-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
+; GFX9-SDAG-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: fmul_select_bf16_test11_sel_log2val_neg22_pos25:
+; GFX9-GISEL:       ; %bb.0:
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:    v_not_b32_e32 v3, 21
+; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v1, 25, v3, vcc
+; GFX9-GISEL-NEXT:    v_ldexp_f16_e32 v0, v0, v1
+; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-SDAG-LABEL: fmul_select_bf16_test11_sel_log2val_neg22_pos25:
+; GFX10-SDAG:       ; %bb.0:
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SDAG-NEXT:    v_mov_b32_e32 v3, 0x3480
+; GFX10-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX10-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX10-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0x4c00, v3, vcc_lo
+; GFX10-SDAG-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX10-SDAG-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX10-SDAG-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX10-SDAG-NEXT:    v_or_b32_e32 v2, 0x400000, v0
+; GFX10-SDAG-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX10-SDAG-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
+; GFX10-SDAG-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX10-SDAG-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-GISEL-LABEL: fmul_select_bf16_test11_sel_log2val_neg22_pos25:
+; GFX10-GISEL:       ; %bb.0:
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v1, 25, 0xffffffea, vcc_lo
+; GFX10-GISEL-NEXT:    v_ldexp_f16_e32 v0, v0, v1
+; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: fmul_select_bf16_test11_sel_log2val_neg22_pos25:
+; GFX11-SDAG:       ; %bb.0:
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT:    v_dual_mov_b32 v3, 0x3480 :: v_dual_lshlrev_b32 v0, 16, v0
+; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0x4c00, v3, vcc_lo
+; GFX11-SDAG-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX11-SDAG-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX11-SDAG-NEXT:    v_or_b32_e32 v2, 0x400000, v0
+; GFX11-SDAG-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
+; GFX11-SDAG-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: fmul_select_bf16_test11_sel_log2val_neg22_pos25:
+; GFX11-GISEL:       ; %bb.0:
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v1, 25, 0xffffffea, vcc_lo
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_ldexp_f16_e32 v0, v0, v1
+; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %bool = icmp eq i32 %bool.arg1, %bool.arg2
   %y = select i1 %bool, bfloat 0xR3480, bfloat 0xR4C00
   %ldexp = fmul bfloat %x, %y
diff --git a/llvm/test/CodeGen/AMDGPU/dummy-regalloc-priority-advisor.mir b/llvm/test/CodeGen/AMDGPU/dummy-regalloc-priority-advisor.mir
new file mode 100644
index 0000000..5c7c076
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/dummy-regalloc-priority-advisor.mir
@@ -0,0 +1,54 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -start-before=greedy,2 -stress-regalloc=4 -stop-after=virtregrewriter,2 -regalloc-enable-priority-advisor=default -o - %s | FileCheck -check-prefixes=CHECK,DEFAULT %s
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -start-before=greedy,2 -stress-regalloc=4 -stop-after=virtregrewriter,2 -regalloc-enable-priority-advisor=dummy -o - %s | FileCheck -check-prefixes=CHECK,DUMMY %s
+
+# Check that the regalloc-enable-priority-advisor=dummy option works
+# and the result is different from the default. Ordinarily %1 would be
+# prioritized higher than %0 due to the register class priority
+
+---
+name:            foo
+tracksRegLiveness: true
+machineFunctionInfo:
+  scratchRSrcReg:  '$sgpr0_sgpr1_sgpr2_sgpr3'
+  frameOffsetReg:  '$sgpr33'
+  stackPtrOffsetReg: '$sgpr32'
+registers:
+  - { id: 0, class: vgpr_32 }
+  - { id: 1, class: vreg_128 }
+  - { id: 2, class: vgpr_32 }
+body:             |
+  bb.0:
+    liveins: $vgpr0, $vgpr1
+
+    ; DEFAULT-LABEL: name: foo
+    ; DEFAULT: liveins: $vgpr0, $vgpr1
+    ; DEFAULT-NEXT: {{  $}}
+    ; DEFAULT-NEXT: SI_SPILL_V128_SAVE $vgpr1_vgpr2_vgpr3_vgpr4, %stack.0, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.0, align 4, addrspace 5)
+    ; DEFAULT-NEXT: SI_SPILL_V32_SAVE $vgpr0, %stack.1, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.1, addrspace 5)
+    ; DEFAULT-NEXT: S_NOP 0, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3
+    ; DEFAULT-NEXT: renamable $vgpr2_vgpr3_vgpr4_vgpr5 = SI_SPILL_V128_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.0, align 4, addrspace 5)
+    ; DEFAULT-NEXT: renamable $vgpr3 = SI_SPILL_V32_RESTORE %stack.1, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.1, addrspace 5)
+    ; DEFAULT-NEXT: renamable $vgpr3 = V_ADD_U32_e32 killed $vgpr2, killed $vgpr3, implicit $exec
+    ; DEFAULT-NEXT: SI_RETURN implicit $vgpr3, implicit $vgpr0, implicit $vgpr1
+    ;
+    ; DUMMY-LABEL: name: foo
+    ; DUMMY: liveins: $vgpr0, $vgpr1
+    ; DUMMY-NEXT: {{  $}}
+    ; DUMMY-NEXT: SI_SPILL_V128_SAVE $vgpr1_vgpr2_vgpr3_vgpr4, %stack.1, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.1, align 4, addrspace 5)
+    ; DUMMY-NEXT: SI_SPILL_V32_SAVE $vgpr0, %stack.0, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.0, addrspace 5)
+    ; DUMMY-NEXT: S_NOP 0, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3
+    ; DUMMY-NEXT: renamable $vgpr2 = SI_SPILL_V32_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.0, addrspace 5)
+    ; DUMMY-NEXT: renamable $vgpr3_vgpr4_vgpr5_vgpr6 = SI_SPILL_V128_RESTORE %stack.1, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.1, align 4, addrspace 5)
+    ; DUMMY-NEXT: renamable $vgpr3 = V_ADD_U32_e32 killed $vgpr3, killed $vgpr2, implicit $exec
+    ; DUMMY-NEXT: SI_RETURN implicit $vgpr3, implicit $vgpr0, implicit $vgpr1
+    undef %1.sub0:vreg_128 = COPY $vgpr1
+    %0:vgpr_32 = COPY $vgpr0
+    S_NOP 0, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3
+    %2:vgpr_32 = V_ADD_U32_e32 %1.sub0, %0, implicit $exec
+    $vgpr3 = COPY %2
+    SI_RETURN implicit $vgpr3, implicit $vgpr0, implicit $vgpr1
+
+...
+
+# CHECK: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/dynamic_stackalloc.ll b/llvm/test/CodeGen/AMDGPU/dynamic_stackalloc.ll
index 73aa87e..9acb3a4 100644
--- a/llvm/test/CodeGen/AMDGPU/dynamic_stackalloc.ll
+++ b/llvm/test/CodeGen/AMDGPU/dynamic_stackalloc.ll
@@ -1,64 +1,829 @@
-; RUN: not llc -mtriple=amdgcn-- -mcpu=tahiti -mattr=+promote-alloca -verify-machineinstrs < %s 2>&1 | FileCheck %s
-; RUN: not llc -mtriple=amdgcn-- -mcpu=tahiti -mattr=-promote-alloca -verify-machineinstrs < %s 2>&1 | FileCheck %s
-; RUN: not llc -mtriple=r600-- -mcpu=cypress < %s 2>&1 | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9-SDAG %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9-GISEL %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck -check-prefix=GFX11-SDAG %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck -check-prefix=GFX11-GISEL %s
 target datalayout = "A5"
 
-; CHECK: in function test_dynamic_stackalloc{{.*}}: unsupported dynamic alloca
-
 define amdgpu_kernel void @test_dynamic_stackalloc_kernel_uniform(i32 %n) {
+; GFX9-SDAG-LABEL: test_dynamic_stackalloc_kernel_uniform:
+; GFX9-SDAG:       ; %bb.0:
+; GFX9-SDAG-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX9-SDAG-NEXT:    s_add_u32 s0, s0, s17
+; GFX9-SDAG-NEXT:    s_addc_u32 s1, s1, 0
+; GFX9-SDAG-NEXT:    s_movk_i32 s32, 0x400
+; GFX9-SDAG-NEXT:    s_mov_b32 s5, s32
+; GFX9-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-SDAG-NEXT:    s_lshl_b32 s4, s4, 2
+; GFX9-SDAG-NEXT:    s_add_i32 s4, s4, 15
+; GFX9-SDAG-NEXT:    s_and_b32 s4, s4, -16
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v0, 0x7b
+; GFX9-SDAG-NEXT:    s_lshl_b32 s4, s4, 6
+; GFX9-SDAG-NEXT:    s_mov_b32 s33, 0
+; GFX9-SDAG-NEXT:    s_add_i32 s32, s5, s4
+; GFX9-SDAG-NEXT:    buffer_store_dword v0, off, s[0:3], s5
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT:    s_endpgm
+;
+; GFX9-GISEL-LABEL: test_dynamic_stackalloc_kernel_uniform:
+; GFX9-GISEL:       ; %bb.0:
+; GFX9-GISEL-NEXT:    s_load_dword s5, s[8:9], 0x0
+; GFX9-GISEL-NEXT:    s_add_u32 s0, s0, s17
+; GFX9-GISEL-NEXT:    s_movk_i32 s32, 0x400
+; GFX9-GISEL-NEXT:    s_addc_u32 s1, s1, 0
+; GFX9-GISEL-NEXT:    s_mov_b32 s4, s32
+; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-GISEL-NEXT:    s_lshl2_add_u32 s5, s5, 15
+; GFX9-GISEL-NEXT:    s_and_b32 s5, s5, -16
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, 0x7b
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, s4
+; GFX9-GISEL-NEXT:    s_lshl_b32 s5, s5, 6
+; GFX9-GISEL-NEXT:    s_mov_b32 s33, 0
+; GFX9-GISEL-NEXT:    s_add_u32 s32, s4, s5
+; GFX9-GISEL-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT:    s_endpgm
+;
+; GFX11-SDAG-LABEL: test_dynamic_stackalloc_kernel_uniform:
+; GFX11-SDAG:       ; %bb.0:
+; GFX11-SDAG-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-SDAG-NEXT:    v_mov_b32_e32 v0, 0x7b
+; GFX11-SDAG-NEXT:    s_mov_b32 s32, 16
+; GFX11-SDAG-NEXT:    s_mov_b32 s33, 0
+; GFX11-SDAG-NEXT:    s_mov_b32 s1, s32
+; GFX11-SDAG-NEXT:    scratch_store_b32 off, v0, s1 dlc
+; GFX11-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT:    s_lshl_b32 s0, s0, 2
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-NEXT:    s_add_i32 s0, s0, 15
+; GFX11-SDAG-NEXT:    s_and_b32 s0, s0, -16
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-NEXT:    s_lshl_b32 s0, s0, 5
+; GFX11-SDAG-NEXT:    s_add_i32 s32, s1, s0
+; GFX11-SDAG-NEXT:    s_endpgm
+;
+; GFX11-GISEL-LABEL: test_dynamic_stackalloc_kernel_uniform:
+; GFX11-GISEL:       ; %bb.0:
+; GFX11-GISEL-NEXT:    s_load_b32 s1, s[4:5], 0x0
+; GFX11-GISEL-NEXT:    v_mov_b32_e32 v0, 0x7b
+; GFX11-GISEL-NEXT:    s_mov_b32 s32, 16
+; GFX11-GISEL-NEXT:    s_mov_b32 s33, 0
+; GFX11-GISEL-NEXT:    s_mov_b32 s0, s32
+; GFX11-GISEL-NEXT:    scratch_store_b32 off, v0, s0 dlc
+; GFX11-GISEL-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT:    s_lshl2_add_u32 s1, s1, 15
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-NEXT:    s_and_b32 s1, s1, -16
+; GFX11-GISEL-NEXT:    s_lshl_b32 s1, s1, 5
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-GISEL-NEXT:    s_add_u32 s32, s0, s1
+; GFX11-GISEL-NEXT:    s_endpgm
   %alloca = alloca i32, i32 %n, addrspace(5)
   store volatile i32 123, ptr addrspace(5) %alloca
   ret void
 }
 
-; CHECK: in function test_dynamic_stackalloc{{.*}}: unsupported dynamic alloca
-
 define amdgpu_kernel void @test_dynamic_stackalloc_kernel_uniform_over_aligned(i32 %n) {
+; GFX9-SDAG-LABEL: test_dynamic_stackalloc_kernel_uniform_over_aligned:
+; GFX9-SDAG:       ; %bb.0:
+; GFX9-SDAG-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX9-SDAG-NEXT:    s_add_u32 s0, s0, s17
+; GFX9-SDAG-NEXT:    s_movk_i32 s32, 0x2000
+; GFX9-SDAG-NEXT:    s_addc_u32 s1, s1, 0
+; GFX9-SDAG-NEXT:    s_add_i32 s5, s32, 0x1fff
+; GFX9-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-SDAG-NEXT:    s_lshl_b32 s4, s4, 2
+; GFX9-SDAG-NEXT:    s_add_i32 s4, s4, 15
+; GFX9-SDAG-NEXT:    s_and_b32 s5, s5, 0xffffe000
+; GFX9-SDAG-NEXT:    s_and_b32 s4, s4, -16
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v0, 10
+; GFX9-SDAG-NEXT:    s_lshl_b32 s4, s4, 6
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v1, s5
+; GFX9-SDAG-NEXT:    s_mov_b32 s33, 0
+; GFX9-SDAG-NEXT:    s_add_i32 s32, s5, s4
+; GFX9-SDAG-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT:    s_endpgm
+;
+; GFX9-GISEL-LABEL: test_dynamic_stackalloc_kernel_uniform_over_aligned:
+; GFX9-GISEL:       ; %bb.0:
+; GFX9-GISEL-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX9-GISEL-NEXT:    s_movk_i32 s32, 0x2000
+; GFX9-GISEL-NEXT:    s_add_u32 s0, s0, s17
+; GFX9-GISEL-NEXT:    s_addc_u32 s1, s1, 0
+; GFX9-GISEL-NEXT:    s_add_u32 s5, s32, 0x1fff
+; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-GISEL-NEXT:    s_lshl2_add_u32 s4, s4, 15
+; GFX9-GISEL-NEXT:    s_and_b32 s5, s5, 0xffffe000
+; GFX9-GISEL-NEXT:    s_and_b32 s4, s4, -16
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, 10
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, s5
+; GFX9-GISEL-NEXT:    s_lshl_b32 s4, s4, 6
+; GFX9-GISEL-NEXT:    s_mov_b32 s33, 0
+; GFX9-GISEL-NEXT:    s_add_u32 s32, s5, s4
+; GFX9-GISEL-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT:    s_endpgm
+;
+; GFX11-SDAG-LABEL: test_dynamic_stackalloc_kernel_uniform_over_aligned:
+; GFX11-SDAG:       ; %bb.0:
+; GFX11-SDAG-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-SDAG-NEXT:    s_movk_i32 s32, 0x80
+; GFX11-SDAG-NEXT:    v_mov_b32_e32 v0, 10
+; GFX11-SDAG-NEXT:    s_add_i32 s1, s32, 0xfff
+; GFX11-SDAG-NEXT:    s_mov_b32 s33, 0
+; GFX11-SDAG-NEXT:    s_and_b32 s1, s1, 0xfffff000
+; GFX11-SDAG-NEXT:    scratch_store_b32 off, v0, s1 dlc
+; GFX11-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT:    s_lshl_b32 s0, s0, 2
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-NEXT:    s_add_i32 s0, s0, 15
+; GFX11-SDAG-NEXT:    s_and_b32 s0, s0, -16
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-NEXT:    s_lshl_b32 s0, s0, 5
+; GFX11-SDAG-NEXT:    s_add_i32 s32, s1, s0
+; GFX11-SDAG-NEXT:    s_endpgm
+;
+; GFX11-GISEL-LABEL: test_dynamic_stackalloc_kernel_uniform_over_aligned:
+; GFX11-GISEL:       ; %bb.0:
+; GFX11-GISEL-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-GISEL-NEXT:    s_movk_i32 s32, 0x80
+; GFX11-GISEL-NEXT:    v_mov_b32_e32 v0, 10
+; GFX11-GISEL-NEXT:    s_add_u32 s1, s32, 0xfff
+; GFX11-GISEL-NEXT:    s_mov_b32 s33, 0
+; GFX11-GISEL-NEXT:    s_and_b32 s1, s1, 0xfffff000
+; GFX11-GISEL-NEXT:    scratch_store_b32 off, v0, s1 dlc
+; GFX11-GISEL-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT:    s_lshl2_add_u32 s0, s0, 15
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-NEXT:    s_and_b32 s0, s0, -16
+; GFX11-GISEL-NEXT:    s_lshl_b32 s0, s0, 5
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-GISEL-NEXT:    s_add_u32 s32, s1, s0
+; GFX11-GISEL-NEXT:    s_endpgm
   %alloca = alloca i32, i32 %n, align 128, addrspace(5)
   store volatile i32 10, ptr addrspace(5) %alloca
   ret void
 }
 
-; CHECK: in function test_dynamic_stackalloc{{.*}}: unsupported dynamic alloca
-
 define amdgpu_kernel void @test_dynamic_stackalloc_kernel_uniform_under_aligned(i32 %n) {
+; GFX9-SDAG-LABEL: test_dynamic_stackalloc_kernel_uniform_under_aligned:
+; GFX9-SDAG:       ; %bb.0:
+; GFX9-SDAG-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX9-SDAG-NEXT:    s_add_u32 s0, s0, s17
+; GFX9-SDAG-NEXT:    s_addc_u32 s1, s1, 0
+; GFX9-SDAG-NEXT:    s_movk_i32 s32, 0x400
+; GFX9-SDAG-NEXT:    s_mov_b32 s5, s32
+; GFX9-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-SDAG-NEXT:    s_lshl_b32 s4, s4, 2
+; GFX9-SDAG-NEXT:    s_add_i32 s4, s4, 15
+; GFX9-SDAG-NEXT:    s_and_b32 s4, s4, -16
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v0, 22
+; GFX9-SDAG-NEXT:    s_lshl_b32 s4, s4, 6
+; GFX9-SDAG-NEXT:    s_mov_b32 s33, 0
+; GFX9-SDAG-NEXT:    s_add_i32 s32, s5, s4
+; GFX9-SDAG-NEXT:    buffer_store_dword v0, off, s[0:3], s5
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT:    s_endpgm
+;
+; GFX9-GISEL-LABEL: test_dynamic_stackalloc_kernel_uniform_under_aligned:
+; GFX9-GISEL:       ; %bb.0:
+; GFX9-GISEL-NEXT:    s_load_dword s5, s[8:9], 0x0
+; GFX9-GISEL-NEXT:    s_add_u32 s0, s0, s17
+; GFX9-GISEL-NEXT:    s_movk_i32 s32, 0x400
+; GFX9-GISEL-NEXT:    s_addc_u32 s1, s1, 0
+; GFX9-GISEL-NEXT:    s_mov_b32 s4, s32
+; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-GISEL-NEXT:    s_lshl2_add_u32 s5, s5, 15
+; GFX9-GISEL-NEXT:    s_and_b32 s5, s5, -16
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, 22
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, s4
+; GFX9-GISEL-NEXT:    s_lshl_b32 s5, s5, 6
+; GFX9-GISEL-NEXT:    s_mov_b32 s33, 0
+; GFX9-GISEL-NEXT:    s_add_u32 s32, s4, s5
+; GFX9-GISEL-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT:    s_endpgm
+;
+; GFX11-SDAG-LABEL: test_dynamic_stackalloc_kernel_uniform_under_aligned:
+; GFX11-SDAG:       ; %bb.0:
+; GFX11-SDAG-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-SDAG-NEXT:    v_mov_b32_e32 v0, 22
+; GFX11-SDAG-NEXT:    s_mov_b32 s32, 16
+; GFX11-SDAG-NEXT:    s_mov_b32 s33, 0
+; GFX11-SDAG-NEXT:    s_mov_b32 s1, s32
+; GFX11-SDAG-NEXT:    scratch_store_b32 off, v0, s1 dlc
+; GFX11-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT:    s_lshl_b32 s0, s0, 2
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-NEXT:    s_add_i32 s0, s0, 15
+; GFX11-SDAG-NEXT:    s_and_b32 s0, s0, -16
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-NEXT:    s_lshl_b32 s0, s0, 5
+; GFX11-SDAG-NEXT:    s_add_i32 s32, s1, s0
+; GFX11-SDAG-NEXT:    s_endpgm
+;
+; GFX11-GISEL-LABEL: test_dynamic_stackalloc_kernel_uniform_under_aligned:
+; GFX11-GISEL:       ; %bb.0:
+; GFX11-GISEL-NEXT:    s_load_b32 s1, s[4:5], 0x0
+; GFX11-GISEL-NEXT:    v_mov_b32_e32 v0, 22
+; GFX11-GISEL-NEXT:    s_mov_b32 s32, 16
+; GFX11-GISEL-NEXT:    s_mov_b32 s33, 0
+; GFX11-GISEL-NEXT:    s_mov_b32 s0, s32
+; GFX11-GISEL-NEXT:    scratch_store_b32 off, v0, s0 dlc
+; GFX11-GISEL-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT:    s_lshl2_add_u32 s1, s1, 15
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-NEXT:    s_and_b32 s1, s1, -16
+; GFX11-GISEL-NEXT:    s_lshl_b32 s1, s1, 5
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-GISEL-NEXT:    s_add_u32 s32, s0, s1
+; GFX11-GISEL-NEXT:    s_endpgm
   %alloca = alloca i32, i32 %n, align 2, addrspace(5)
   store volatile i32 22, ptr addrspace(5) %alloca
   ret void
 }
 
-; CHECK: in function test_dynamic_stackalloc{{.*}}: unsupported dynamic alloca
-
 define amdgpu_kernel void @test_dynamic_stackalloc_kernel_divergent() {
+; GFX9-SDAG-LABEL: test_dynamic_stackalloc_kernel_divergent:
+; GFX9-SDAG:       ; %bb.0:
+; GFX9-SDAG-NEXT:    s_add_u32 s0, s0, s17
+; GFX9-SDAG-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
+; GFX9-SDAG-NEXT:    s_addc_u32 s1, s1, 0
+; GFX9-SDAG-NEXT:    v_and_b32_e32 v0, 0x1ff0, v0
+; GFX9-SDAG-NEXT:    s_mov_b64 s[4:5], exec
+; GFX9-SDAG-NEXT:    s_mov_b32 s6, 0
+; GFX9-SDAG-NEXT:    s_mov_b32 s33, 0
+; GFX9-SDAG-NEXT:    s_movk_i32 s32, 0x400
+; GFX9-SDAG-NEXT:  .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX9-SDAG-NEXT:    s_ff1_i32_b64 s7, s[4:5]
+; GFX9-SDAG-NEXT:    v_readlane_b32 s8, v0, s7
+; GFX9-SDAG-NEXT:    s_bitset0_b64 s[4:5], s7
+; GFX9-SDAG-NEXT:    s_max_u32 s6, s6, s8
+; GFX9-SDAG-NEXT:    s_cmp_lg_u64 s[4:5], 0
+; GFX9-SDAG-NEXT:    s_cbranch_scc1 .LBB3_1
+; GFX9-SDAG-NEXT:  ; %bb.2:
+; GFX9-SDAG-NEXT:    s_mov_b32 s4, s32
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-SDAG-NEXT:    v_lshl_add_u32 v0, s6, 6, v0
+; GFX9-SDAG-NEXT:    v_readfirstlane_b32 s32, v0
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v0, 0x7b
+; GFX9-SDAG-NEXT:    buffer_store_dword v0, off, s[0:3], s4
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT:    s_endpgm
+;
+; GFX9-GISEL-LABEL: test_dynamic_stackalloc_kernel_divergent:
+; GFX9-GISEL:       ; %bb.0:
+; GFX9-GISEL-NEXT:    s_add_u32 s0, s0, s17
+; GFX9-GISEL-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
+; GFX9-GISEL-NEXT:    s_addc_u32 s1, s1, 0
+; GFX9-GISEL-NEXT:    v_and_b32_e32 v0, -16, v0
+; GFX9-GISEL-NEXT:    s_mov_b64 s[4:5], exec
+; GFX9-GISEL-NEXT:    s_mov_b32 s6, 0
+; GFX9-GISEL-NEXT:    s_mov_b32 s33, 0
+; GFX9-GISEL-NEXT:    s_movk_i32 s32, 0x400
+; GFX9-GISEL-NEXT:  .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX9-GISEL-NEXT:    s_ff1_i32_b64 s7, s[4:5]
+; GFX9-GISEL-NEXT:    v_readlane_b32 s8, v0, s7
+; GFX9-GISEL-NEXT:    s_bitset0_b64 s[4:5], s7
+; GFX9-GISEL-NEXT:    s_max_u32 s6, s6, s8
+; GFX9-GISEL-NEXT:    s_cmp_lg_u64 s[4:5], 0
+; GFX9-GISEL-NEXT:    s_cbranch_scc1 .LBB3_1
+; GFX9-GISEL-NEXT:  ; %bb.2:
+; GFX9-GISEL-NEXT:    s_mov_b32 s4, s32
+; GFX9-GISEL-NEXT:    s_lshl_b32 s5, s6, 6
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, 0x7b
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, s4
+; GFX9-GISEL-NEXT:    s_add_u32 s32, s4, s5
+; GFX9-GISEL-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT:    s_endpgm
+;
+; GFX11-SDAG-LABEL: test_dynamic_stackalloc_kernel_divergent:
+; GFX11-SDAG:       ; %bb.0:
+; GFX11-SDAG-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-SDAG-NEXT:    s_mov_b32 s1, exec_lo
+; GFX11-SDAG-NEXT:    s_mov_b32 s0, 0
+; GFX11-SDAG-NEXT:    s_mov_b32 s33, 0
+; GFX11-SDAG-NEXT:    s_mov_b32 s32, 16
+; GFX11-SDAG-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_and_b32_e32 v0, 0x1ff0, v0
+; GFX11-SDAG-NEXT:  .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX11-SDAG-NEXT:    s_ctz_i32_b32 s2, s1
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-NEXT:    v_readlane_b32 s3, v0, s2
+; GFX11-SDAG-NEXT:    s_bitset0_b32 s1, s2
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    s_max_u32 s0, s0, s3
+; GFX11-SDAG-NEXT:    s_cmp_lg_u32 s1, 0
+; GFX11-SDAG-NEXT:    s_cbranch_scc1 .LBB3_1
+; GFX11-SDAG-NEXT:  ; %bb.2:
+; GFX11-SDAG-NEXT:    s_mov_b32 s1, s32
+; GFX11-SDAG-NEXT:    v_mov_b32_e32 v1, 0x7b
+; GFX11-SDAG-NEXT:    v_lshl_add_u32 v0, s0, 5, s1
+; GFX11-SDAG-NEXT:    scratch_store_b32 off, v1, s1 dlc
+; GFX11-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-SDAG-NEXT:    v_readfirstlane_b32 s32, v0
+; GFX11-SDAG-NEXT:    s_endpgm
+;
+; GFX11-GISEL-LABEL: test_dynamic_stackalloc_kernel_divergent:
+; GFX11-GISEL:       ; %bb.0:
+; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-GISEL-NEXT:    s_mov_b32 s1, exec_lo
+; GFX11-GISEL-NEXT:    s_mov_b32 s0, 0
+; GFX11-GISEL-NEXT:    s_mov_b32 s33, 0
+; GFX11-GISEL-NEXT:    s_mov_b32 s32, 16
+; GFX11-GISEL-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, -16, v0
+; GFX11-GISEL-NEXT:  .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX11-GISEL-NEXT:    s_ctz_i32_b32 s2, s1
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-NEXT:    v_readlane_b32 s3, v0, s2
+; GFX11-GISEL-NEXT:    s_bitset0_b32 s1, s2
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    s_max_u32 s0, s0, s3
+; GFX11-GISEL-NEXT:    s_cmp_lg_u32 s1, 0
+; GFX11-GISEL-NEXT:    s_cbranch_scc1 .LBB3_1
+; GFX11-GISEL-NEXT:  ; %bb.2:
+; GFX11-GISEL-NEXT:    v_mov_b32_e32 v0, 0x7b
+; GFX11-GISEL-NEXT:    s_mov_b32 s1, s32
+; GFX11-GISEL-NEXT:    s_lshl_b32 s0, s0, 5
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-GISEL-NEXT:    s_add_u32 s32, s1, s0
+; GFX11-GISEL-NEXT:    scratch_store_b32 off, v0, s1 dlc
+; GFX11-GISEL-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-GISEL-NEXT:    s_endpgm
   %idx = call i32 @llvm.amdgcn.workitem.id.x()
   %alloca = alloca float, i32 %idx, addrspace(5)
   store volatile i32 123, ptr addrspace(5) %alloca
   ret void
 }
 
-; CHECK: in function test_dynamic_stackalloc{{.*}}: unsupported dynamic alloca
-
 define amdgpu_kernel void @test_dynamic_stackalloc_kernel_divergent_over_aligned() {
+; GFX9-SDAG-LABEL: test_dynamic_stackalloc_kernel_divergent_over_aligned:
+; GFX9-SDAG:       ; %bb.0:
+; GFX9-SDAG-NEXT:    s_add_u32 s0, s0, s17
+; GFX9-SDAG-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
+; GFX9-SDAG-NEXT:    s_addc_u32 s1, s1, 0
+; GFX9-SDAG-NEXT:    v_and_b32_e32 v0, 0x1ff0, v0
+; GFX9-SDAG-NEXT:    s_mov_b64 s[4:5], exec
+; GFX9-SDAG-NEXT:    s_mov_b32 s6, 0
+; GFX9-SDAG-NEXT:    s_mov_b32 s33, 0
+; GFX9-SDAG-NEXT:    s_movk_i32 s32, 0x2000
+; GFX9-SDAG-NEXT:  .LBB4_1: ; =>This Inner Loop Header: Depth=1
+; GFX9-SDAG-NEXT:    s_ff1_i32_b64 s7, s[4:5]
+; GFX9-SDAG-NEXT:    v_readlane_b32 s8, v0, s7
+; GFX9-SDAG-NEXT:    s_bitset0_b64 s[4:5], s7
+; GFX9-SDAG-NEXT:    s_max_u32 s6, s6, s8
+; GFX9-SDAG-NEXT:    s_cmp_lg_u64 s[4:5], 0
+; GFX9-SDAG-NEXT:    s_cbranch_scc1 .LBB4_1
+; GFX9-SDAG-NEXT:  ; %bb.2:
+; GFX9-SDAG-NEXT:    s_add_i32 s4, s32, 0x1fff
+; GFX9-SDAG-NEXT:    s_and_b32 s4, s4, 0xffffe000
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-SDAG-NEXT:    v_lshl_add_u32 v1, s6, 6, v0
+; GFX9-SDAG-NEXT:    v_readfirstlane_b32 s32, v1
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v1, 0x1bc
+; GFX9-SDAG-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT:    s_endpgm
+;
+; GFX9-GISEL-LABEL: test_dynamic_stackalloc_kernel_divergent_over_aligned:
+; GFX9-GISEL:       ; %bb.0:
+; GFX9-GISEL-NEXT:    s_add_u32 s0, s0, s17
+; GFX9-GISEL-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
+; GFX9-GISEL-NEXT:    s_addc_u32 s1, s1, 0
+; GFX9-GISEL-NEXT:    v_and_b32_e32 v0, -16, v0
+; GFX9-GISEL-NEXT:    s_mov_b64 s[4:5], exec
+; GFX9-GISEL-NEXT:    s_mov_b32 s6, 0
+; GFX9-GISEL-NEXT:    s_mov_b32 s33, 0
+; GFX9-GISEL-NEXT:    s_movk_i32 s32, 0x2000
+; GFX9-GISEL-NEXT:  .LBB4_1: ; =>This Inner Loop Header: Depth=1
+; GFX9-GISEL-NEXT:    s_ff1_i32_b64 s7, s[4:5]
+; GFX9-GISEL-NEXT:    v_readlane_b32 s8, v0, s7
+; GFX9-GISEL-NEXT:    s_bitset0_b64 s[4:5], s7
+; GFX9-GISEL-NEXT:    s_max_u32 s6, s6, s8
+; GFX9-GISEL-NEXT:    s_cmp_lg_u64 s[4:5], 0
+; GFX9-GISEL-NEXT:    s_cbranch_scc1 .LBB4_1
+; GFX9-GISEL-NEXT:  ; %bb.2:
+; GFX9-GISEL-NEXT:    s_add_u32 s5, s32, 0x1fff
+; GFX9-GISEL-NEXT:    s_and_b32 s5, s5, 0xffffe000
+; GFX9-GISEL-NEXT:    s_lshl_b32 s4, s6, 6
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, 0x1bc
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, s5
+; GFX9-GISEL-NEXT:    s_add_u32 s32, s5, s4
+; GFX9-GISEL-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT:    s_endpgm
+;
+; GFX11-SDAG-LABEL: test_dynamic_stackalloc_kernel_divergent_over_aligned:
+; GFX11-SDAG:       ; %bb.0:
+; GFX11-SDAG-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-SDAG-NEXT:    s_movk_i32 s32, 0x80
+; GFX11-SDAG-NEXT:    s_mov_b32 s2, exec_lo
+; GFX11-SDAG-NEXT:    s_add_i32 s0, s32, 0xfff
+; GFX11-SDAG-NEXT:    s_mov_b32 s1, 0
+; GFX11-SDAG-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
+; GFX11-SDAG-NEXT:    s_and_b32 s0, s0, 0xfffff000
+; GFX11-SDAG-NEXT:    s_mov_b32 s33, 0
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_and_b32_e32 v0, 0x1ff0, v0
+; GFX11-SDAG-NEXT:  .LBB4_1: ; =>This Inner Loop Header: Depth=1
+; GFX11-SDAG-NEXT:    s_ctz_i32_b32 s3, s2
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-NEXT:    v_readlane_b32 s4, v0, s3
+; GFX11-SDAG-NEXT:    s_bitset0_b32 s2, s3
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    s_max_u32 s1, s1, s4
+; GFX11-SDAG-NEXT:    s_cmp_lg_u32 s2, 0
+; GFX11-SDAG-NEXT:    s_cbranch_scc1 .LBB4_1
+; GFX11-SDAG-NEXT:  ; %bb.2:
+; GFX11-SDAG-NEXT:    v_lshl_add_u32 v0, s1, 5, s0
+; GFX11-SDAG-NEXT:    v_mov_b32_e32 v1, 0x1bc
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-SDAG-NEXT:    v_readfirstlane_b32 s32, v0
+; GFX11-SDAG-NEXT:    scratch_store_b32 off, v1, s0 dlc
+; GFX11-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-SDAG-NEXT:    s_endpgm
+;
+; GFX11-GISEL-LABEL: test_dynamic_stackalloc_kernel_divergent_over_aligned:
+; GFX11-GISEL:       ; %bb.0:
+; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-GISEL-NEXT:    s_mov_b32 s1, exec_lo
+; GFX11-GISEL-NEXT:    s_mov_b32 s0, 0
+; GFX11-GISEL-NEXT:    s_mov_b32 s33, 0
+; GFX11-GISEL-NEXT:    s_movk_i32 s32, 0x80
+; GFX11-GISEL-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, -16, v0
+; GFX11-GISEL-NEXT:  .LBB4_1: ; =>This Inner Loop Header: Depth=1
+; GFX11-GISEL-NEXT:    s_ctz_i32_b32 s2, s1
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-NEXT:    v_readlane_b32 s3, v0, s2
+; GFX11-GISEL-NEXT:    s_bitset0_b32 s1, s2
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    s_max_u32 s0, s0, s3
+; GFX11-GISEL-NEXT:    s_cmp_lg_u32 s1, 0
+; GFX11-GISEL-NEXT:    s_cbranch_scc1 .LBB4_1
+; GFX11-GISEL-NEXT:  ; %bb.2:
+; GFX11-GISEL-NEXT:    v_mov_b32_e32 v0, 0x1bc
+; GFX11-GISEL-NEXT:    s_add_u32 s1, s32, 0xfff
+; GFX11-GISEL-NEXT:    s_lshl_b32 s0, s0, 5
+; GFX11-GISEL-NEXT:    s_and_b32 s1, s1, 0xfffff000
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-GISEL-NEXT:    s_add_u32 s32, s1, s0
+; GFX11-GISEL-NEXT:    scratch_store_b32 off, v0, s1 dlc
+; GFX11-GISEL-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-GISEL-NEXT:    s_endpgm
   %idx = call i32 @llvm.amdgcn.workitem.id.x()
   %alloca = alloca i32, i32 %idx, align 128, addrspace(5)
   store volatile i32 444, ptr addrspace(5) %alloca
   ret void
 }
 
-; CHECK: in function test_dynamic_stackalloc{{.*}}: unsupported dynamic alloca
-
 define amdgpu_kernel void @test_dynamic_stackalloc_kernel_divergent_under_aligned() {
+; GFX9-SDAG-LABEL: test_dynamic_stackalloc_kernel_divergent_under_aligned:
+; GFX9-SDAG:       ; %bb.0:
+; GFX9-SDAG-NEXT:    s_add_u32 s0, s0, s17
+; GFX9-SDAG-NEXT:    s_addc_u32 s1, s1, 0
+; GFX9-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
+; GFX9-SDAG-NEXT:    s_mov_b64 s[4:5], exec
+; GFX9-SDAG-NEXT:    s_mov_b32 s6, 0
+; GFX9-SDAG-NEXT:    s_mov_b32 s33, 0
+; GFX9-SDAG-NEXT:    s_movk_i32 s32, 0x400
+; GFX9-SDAG-NEXT:  .LBB5_1: ; =>This Inner Loop Header: Depth=1
+; GFX9-SDAG-NEXT:    s_ff1_i32_b64 s7, s[4:5]
+; GFX9-SDAG-NEXT:    v_readlane_b32 s8, v0, s7
+; GFX9-SDAG-NEXT:    s_bitset0_b64 s[4:5], s7
+; GFX9-SDAG-NEXT:    s_max_u32 s6, s6, s8
+; GFX9-SDAG-NEXT:    s_cmp_lg_u64 s[4:5], 0
+; GFX9-SDAG-NEXT:    s_cbranch_scc1 .LBB5_1
+; GFX9-SDAG-NEXT:  ; %bb.2:
+; GFX9-SDAG-NEXT:    s_mov_b32 s4, s32
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-SDAG-NEXT:    v_lshl_add_u32 v0, s6, 6, v0
+; GFX9-SDAG-NEXT:    v_readfirstlane_b32 s32, v0
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v0, 0x29a
+; GFX9-SDAG-NEXT:    buffer_store_dword v0, off, s[0:3], s4
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT:    s_endpgm
+;
+; GFX9-GISEL-LABEL: test_dynamic_stackalloc_kernel_divergent_under_aligned:
+; GFX9-GISEL:       ; %bb.0:
+; GFX9-GISEL-NEXT:    s_add_u32 s0, s0, s17
+; GFX9-GISEL-NEXT:    v_lshl_add_u32 v0, v0, 4, 15
+; GFX9-GISEL-NEXT:    s_addc_u32 s1, s1, 0
+; GFX9-GISEL-NEXT:    v_and_b32_e32 v0, -16, v0
+; GFX9-GISEL-NEXT:    s_mov_b64 s[4:5], exec
+; GFX9-GISEL-NEXT:    s_mov_b32 s6, 0
+; GFX9-GISEL-NEXT:    s_mov_b32 s33, 0
+; GFX9-GISEL-NEXT:    s_movk_i32 s32, 0x400
+; GFX9-GISEL-NEXT:  .LBB5_1: ; =>This Inner Loop Header: Depth=1
+; GFX9-GISEL-NEXT:    s_ff1_i32_b64 s7, s[4:5]
+; GFX9-GISEL-NEXT:    v_readlane_b32 s8, v0, s7
+; GFX9-GISEL-NEXT:    s_bitset0_b64 s[4:5], s7
+; GFX9-GISEL-NEXT:    s_max_u32 s6, s6, s8
+; GFX9-GISEL-NEXT:    s_cmp_lg_u64 s[4:5], 0
+; GFX9-GISEL-NEXT:    s_cbranch_scc1 .LBB5_1
+; GFX9-GISEL-NEXT:  ; %bb.2:
+; GFX9-GISEL-NEXT:    s_mov_b32 s4, s32
+; GFX9-GISEL-NEXT:    s_lshl_b32 s5, s6, 6
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, 0x29a
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, s4
+; GFX9-GISEL-NEXT:    s_add_u32 s32, s4, s5
+; GFX9-GISEL-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT:    s_endpgm
+;
+; GFX11-SDAG-LABEL: test_dynamic_stackalloc_kernel_divergent_under_aligned:
+; GFX11-SDAG:       ; %bb.0:
+; GFX11-SDAG-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-SDAG-NEXT:    s_mov_b32 s1, exec_lo
+; GFX11-SDAG-NEXT:    s_mov_b32 s0, 0
+; GFX11-SDAG-NEXT:    s_mov_b32 s33, 0
+; GFX11-SDAG-NEXT:    s_mov_b32 s32, 16
+; GFX11-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
+; GFX11-SDAG-NEXT:  .LBB5_1: ; =>This Inner Loop Header: Depth=1
+; GFX11-SDAG-NEXT:    s_ctz_i32_b32 s2, s1
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-NEXT:    v_readlane_b32 s3, v0, s2
+; GFX11-SDAG-NEXT:    s_bitset0_b32 s1, s2
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    s_max_u32 s0, s0, s3
+; GFX11-SDAG-NEXT:    s_cmp_lg_u32 s1, 0
+; GFX11-SDAG-NEXT:    s_cbranch_scc1 .LBB5_1
+; GFX11-SDAG-NEXT:  ; %bb.2:
+; GFX11-SDAG-NEXT:    s_mov_b32 s1, s32
+; GFX11-SDAG-NEXT:    v_mov_b32_e32 v1, 0x29a
+; GFX11-SDAG-NEXT:    v_lshl_add_u32 v0, s0, 5, s1
+; GFX11-SDAG-NEXT:    scratch_store_b32 off, v1, s1 dlc
+; GFX11-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-SDAG-NEXT:    v_readfirstlane_b32 s32, v0
+; GFX11-SDAG-NEXT:    s_endpgm
+;
+; GFX11-GISEL-LABEL: test_dynamic_stackalloc_kernel_divergent_under_aligned:
+; GFX11-GISEL:       ; %bb.0:
+; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-GISEL-NEXT:    s_mov_b32 s1, exec_lo
+; GFX11-GISEL-NEXT:    s_mov_b32 s0, 0
+; GFX11-GISEL-NEXT:    s_mov_b32 s33, 0
+; GFX11-GISEL-NEXT:    s_mov_b32 s32, 16
+; GFX11-GISEL-NEXT:    v_lshl_add_u32 v0, v0, 4, 15
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, -16, v0
+; GFX11-GISEL-NEXT:  .LBB5_1: ; =>This Inner Loop Header: Depth=1
+; GFX11-GISEL-NEXT:    s_ctz_i32_b32 s2, s1
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-NEXT:    v_readlane_b32 s3, v0, s2
+; GFX11-GISEL-NEXT:    s_bitset0_b32 s1, s2
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    s_max_u32 s0, s0, s3
+; GFX11-GISEL-NEXT:    s_cmp_lg_u32 s1, 0
+; GFX11-GISEL-NEXT:    s_cbranch_scc1 .LBB5_1
+; GFX11-GISEL-NEXT:  ; %bb.2:
+; GFX11-GISEL-NEXT:    v_mov_b32_e32 v0, 0x29a
+; GFX11-GISEL-NEXT:    s_mov_b32 s1, s32
+; GFX11-GISEL-NEXT:    s_lshl_b32 s0, s0, 5
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-GISEL-NEXT:    s_add_u32 s32, s1, s0
+; GFX11-GISEL-NEXT:    scratch_store_b32 off, v0, s1 dlc
+; GFX11-GISEL-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-GISEL-NEXT:    s_endpgm
   %idx = call i32 @llvm.amdgcn.workitem.id.x()
   %alloca = alloca i128, i32 %idx, align 2, addrspace(5)
   store volatile i32 666, ptr addrspace(5) %alloca
   ret void
 }
 
-; CHECK: in function test_dynamic_stackalloc{{.*}}: unsupported dynamic alloca
-; CHECK: in function test_dynamic_stackalloc{{.*}}: unsupported dynamic alloca
-; CHECK: in function test_dynamic_stackalloc{{.*}}: unsupported dynamic alloca
-
 define amdgpu_kernel void @test_dynamic_stackalloc_kernel_multiple_allocas(i32 %n, i32 %m) {
+; GFX9-SDAG-LABEL: test_dynamic_stackalloc_kernel_multiple_allocas:
+; GFX9-SDAG:       ; %bb.0: ; %entry
+; GFX9-SDAG-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX9-SDAG-NEXT:    s_add_u32 s0, s0, s17
+; GFX9-SDAG-NEXT:    s_addc_u32 s1, s1, 0
+; GFX9-SDAG-NEXT:    s_mov_b32 s8, 0
+; GFX9-SDAG-NEXT:    s_mov_b32 s33, 0
+; GFX9-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-SDAG-NEXT:    s_cmp_lg_u32 s4, 0
+; GFX9-SDAG-NEXT:    s_movk_i32 s32, 0x2000
+; GFX9-SDAG-NEXT:    s_cbranch_scc1 .LBB6_4
+; GFX9-SDAG-NEXT:  ; %bb.1: ; %bb.0
+; GFX9-SDAG-NEXT:    s_lshl_b32 s5, s5, 2
+; GFX9-SDAG-NEXT:    s_add_i32 s5, s5, 15
+; GFX9-SDAG-NEXT:    s_add_i32 s6, s32, 0xfff
+; GFX9-SDAG-NEXT:    s_and_b32 s5, s5, -16
+; GFX9-SDAG-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
+; GFX9-SDAG-NEXT:    s_and_b32 s9, s6, 0xfffff000
+; GFX9-SDAG-NEXT:    s_lshl_b32 s5, s5, 6
+; GFX9-SDAG-NEXT:    v_and_b32_e32 v0, 0x1ff0, v0
+; GFX9-SDAG-NEXT:    s_mov_b64 s[6:7], exec
+; GFX9-SDAG-NEXT:    s_add_i32 s32, s9, s5
+; GFX9-SDAG-NEXT:  .LBB6_2: ; =>This Inner Loop Header: Depth=1
+; GFX9-SDAG-NEXT:    s_ff1_i32_b64 s5, s[6:7]
+; GFX9-SDAG-NEXT:    v_readlane_b32 s10, v0, s5
+; GFX9-SDAG-NEXT:    s_bitset0_b64 s[6:7], s5
+; GFX9-SDAG-NEXT:    s_max_u32 s8, s8, s10
+; GFX9-SDAG-NEXT:    s_cmp_lg_u64 s[6:7], 0
+; GFX9-SDAG-NEXT:    s_cbranch_scc1 .LBB6_2
+; GFX9-SDAG-NEXT:  ; %bb.3:
+; GFX9-SDAG-NEXT:    s_mov_b32 s5, s32
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v0, s5
+; GFX9-SDAG-NEXT:    v_lshl_add_u32 v0, s8, 6, v0
+; GFX9-SDAG-NEXT:    v_readfirstlane_b32 s32, v0
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v0, 3
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v1, s9
+; GFX9-SDAG-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v0, 4
+; GFX9-SDAG-NEXT:    buffer_store_dword v0, off, s[0:3], s5
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT:  .LBB6_4: ; %bb.1
+; GFX9-SDAG-NEXT:    s_lshl_b32 s4, s4, 2
+; GFX9-SDAG-NEXT:    s_add_i32 s4, s4, 15
+; GFX9-SDAG-NEXT:    s_and_b32 s4, s4, -16
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v0, 1
+; GFX9-SDAG-NEXT:    s_lshl_b32 s4, s4, 6
+; GFX9-SDAG-NEXT:    s_mov_b32 s5, s32
+; GFX9-SDAG-NEXT:    buffer_store_dword v0, off, s[0:3], s33
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v0, 2
+; GFX9-SDAG-NEXT:    s_add_i32 s32, s5, s4
+; GFX9-SDAG-NEXT:    buffer_store_dword v0, off, s[0:3], s5
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT:    s_endpgm
+;
+; GFX9-GISEL-LABEL: test_dynamic_stackalloc_kernel_multiple_allocas:
+; GFX9-GISEL:       ; %bb.0: ; %entry
+; GFX9-GISEL-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX9-GISEL-NEXT:    s_add_u32 s0, s0, s17
+; GFX9-GISEL-NEXT:    s_addc_u32 s1, s1, 0
+; GFX9-GISEL-NEXT:    s_mov_b32 s8, 0
+; GFX9-GISEL-NEXT:    s_mov_b32 s33, 0
+; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-GISEL-NEXT:    s_cmp_lg_u32 s4, 0
+; GFX9-GISEL-NEXT:    s_movk_i32 s32, 0x2000
+; GFX9-GISEL-NEXT:    s_cbranch_scc1 .LBB6_4
+; GFX9-GISEL-NEXT:  ; %bb.1: ; %bb.0
+; GFX9-GISEL-NEXT:    s_lshl2_add_u32 s5, s5, 15
+; GFX9-GISEL-NEXT:    s_and_b32 s5, s5, -16
+; GFX9-GISEL-NEXT:    s_lshl_b32 s6, s5, 6
+; GFX9-GISEL-NEXT:    s_add_u32 s5, s32, 0xfff
+; GFX9-GISEL-NEXT:    s_and_b32 s5, s5, 0xfffff000
+; GFX9-GISEL-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
+; GFX9-GISEL-NEXT:    s_add_u32 s32, s5, s6
+; GFX9-GISEL-NEXT:    v_and_b32_e32 v0, -16, v0
+; GFX9-GISEL-NEXT:    s_mov_b64 s[6:7], exec
+; GFX9-GISEL-NEXT:  .LBB6_2: ; =>This Inner Loop Header: Depth=1
+; GFX9-GISEL-NEXT:    s_ff1_i32_b64 s9, s[6:7]
+; GFX9-GISEL-NEXT:    v_readlane_b32 s10, v0, s9
+; GFX9-GISEL-NEXT:    s_bitset0_b64 s[6:7], s9
+; GFX9-GISEL-NEXT:    s_max_u32 s8, s8, s10
+; GFX9-GISEL-NEXT:    s_cmp_lg_u64 s[6:7], 0
+; GFX9-GISEL-NEXT:    s_cbranch_scc1 .LBB6_2
+; GFX9-GISEL-NEXT:  ; %bb.3:
+; GFX9-GISEL-NEXT:    s_mov_b32 s6, s32
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, 3
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, s5
+; GFX9-GISEL-NEXT:    s_lshl_b32 s7, s8, 6
+; GFX9-GISEL-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, 4
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, s6
+; GFX9-GISEL-NEXT:    s_add_u32 s32, s6, s7
+; GFX9-GISEL-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT:  .LBB6_4: ; %bb.1
+; GFX9-GISEL-NEXT:    s_lshl2_add_u32 s4, s4, 15
+; GFX9-GISEL-NEXT:    s_mov_b32 s5, s32
+; GFX9-GISEL-NEXT:    s_and_b32 s4, s4, -16
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, 1
+; GFX9-GISEL-NEXT:    s_lshl_b32 s4, s4, 6
+; GFX9-GISEL-NEXT:    buffer_store_dword v0, off, s[0:3], s33
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, 2
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, s5
+; GFX9-GISEL-NEXT:    s_add_u32 s32, s5, s4
+; GFX9-GISEL-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT:    s_endpgm
+;
+; GFX11-SDAG-LABEL: test_dynamic_stackalloc_kernel_multiple_allocas:
+; GFX11-SDAG:       ; %bb.0: ; %entry
+; GFX11-SDAG-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-SDAG-NEXT:    s_mov_b32 s2, 0
+; GFX11-SDAG-NEXT:    s_mov_b32 s33, 0
+; GFX11-SDAG-NEXT:    s_movk_i32 s32, 0x80
+; GFX11-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT:    s_cmp_lg_u32 s0, 0
+; GFX11-SDAG-NEXT:    s_cbranch_scc1 .LBB6_4
+; GFX11-SDAG-NEXT:  ; %bb.1: ; %bb.0
+; GFX11-SDAG-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-SDAG-NEXT:    s_lshl_b32 s1, s1, 2
+; GFX11-SDAG-NEXT:    s_add_i32 s3, s32, 0x7ff
+; GFX11-SDAG-NEXT:    s_add_i32 s1, s1, 15
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-NEXT:    s_and_b32 s4, s1, -16
+; GFX11-SDAG-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
+; GFX11-SDAG-NEXT:    s_and_b32 s1, s3, 0xfffff800
+; GFX11-SDAG-NEXT:    s_lshl_b32 s3, s4, 5
+; GFX11-SDAG-NEXT:    s_add_i32 s32, s1, s3
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_and_b32_e32 v0, 0x1ff0, v0
+; GFX11-SDAG-NEXT:    s_mov_b32 s3, exec_lo
+; GFX11-SDAG-NEXT:  .LBB6_2: ; =>This Inner Loop Header: Depth=1
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-SDAG-NEXT:    s_ctz_i32_b32 s4, s3
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-NEXT:    v_readlane_b32 s5, v0, s4
+; GFX11-SDAG-NEXT:    s_bitset0_b32 s3, s4
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    s_max_u32 s2, s2, s5
+; GFX11-SDAG-NEXT:    s_cmp_lg_u32 s3, 0
+; GFX11-SDAG-NEXT:    s_cbranch_scc1 .LBB6_2
+; GFX11-SDAG-NEXT:  ; %bb.3:
+; GFX11-SDAG-NEXT:    s_mov_b32 s3, s32
+; GFX11-SDAG-NEXT:    v_dual_mov_b32 v1, 3 :: v_dual_mov_b32 v2, 4
+; GFX11-SDAG-NEXT:    v_lshl_add_u32 v0, s2, 5, s3
+; GFX11-SDAG-NEXT:    scratch_store_b32 off, v1, s1 dlc
+; GFX11-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-SDAG-NEXT:    scratch_store_b32 off, v2, s3 dlc
+; GFX11-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-SDAG-NEXT:    v_readfirstlane_b32 s32, v0
+; GFX11-SDAG-NEXT:  .LBB6_4: ; %bb.1
+; GFX11-SDAG-NEXT:    s_lshl_b32 s0, s0, 2
+; GFX11-SDAG-NEXT:    v_dual_mov_b32 v0, 1 :: v_dual_mov_b32 v1, 2
+; GFX11-SDAG-NEXT:    s_add_i32 s0, s0, 15
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-SDAG-NEXT:    s_mov_b32 s1, s32
+; GFX11-SDAG-NEXT:    s_and_b32 s0, s0, -16
+; GFX11-SDAG-NEXT:    scratch_store_b32 off, v0, s33 dlc
+; GFX11-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-SDAG-NEXT:    scratch_store_b32 off, v1, s1 dlc
+; GFX11-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-SDAG-NEXT:    s_lshl_b32 s0, s0, 5
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-SDAG-NEXT:    s_add_i32 s32, s1, s0
+; GFX11-SDAG-NEXT:    s_endpgm
+;
+; GFX11-GISEL-LABEL: test_dynamic_stackalloc_kernel_multiple_allocas:
+; GFX11-GISEL:       ; %bb.0: ; %entry
+; GFX11-GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-GISEL-NEXT:    s_mov_b32 s2, 0
+; GFX11-GISEL-NEXT:    s_mov_b32 s33, 0
+; GFX11-GISEL-NEXT:    s_movk_i32 s32, 0x80
+; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT:    s_cmp_lg_u32 s0, 0
+; GFX11-GISEL-NEXT:    s_cbranch_scc1 .LBB6_4
+; GFX11-GISEL-NEXT:  ; %bb.1: ; %bb.0
+; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-GISEL-NEXT:    s_lshl2_add_u32 s1, s1, 15
+; GFX11-GISEL-NEXT:    s_add_u32 s3, s32, 0x7ff
+; GFX11-GISEL-NEXT:    s_and_b32 s1, s1, -16
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    s_lshl_b32 s4, s1, 5
+; GFX11-GISEL-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
+; GFX11-GISEL-NEXT:    s_and_b32 s1, s3, 0xfffff800
+; GFX11-GISEL-NEXT:    s_mov_b32 s3, exec_lo
+; GFX11-GISEL-NEXT:    s_add_u32 s32, s1, s4
+; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, -16, v0
+; GFX11-GISEL-NEXT:  .LBB6_2: ; =>This Inner Loop Header: Depth=1
+; GFX11-GISEL-NEXT:    s_ctz_i32_b32 s4, s3
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-NEXT:    v_readlane_b32 s5, v0, s4
+; GFX11-GISEL-NEXT:    s_bitset0_b32 s3, s4
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    s_max_u32 s2, s2, s5
+; GFX11-GISEL-NEXT:    s_cmp_lg_u32 s3, 0
+; GFX11-GISEL-NEXT:    s_cbranch_scc1 .LBB6_2
+; GFX11-GISEL-NEXT:  ; %bb.3:
+; GFX11-GISEL-NEXT:    v_dual_mov_b32 v0, 3 :: v_dual_mov_b32 v1, 4
+; GFX11-GISEL-NEXT:    s_mov_b32 s3, s32
+; GFX11-GISEL-NEXT:    s_lshl_b32 s2, s2, 5
+; GFX11-GISEL-NEXT:    scratch_store_b32 off, v0, s1 dlc
+; GFX11-GISEL-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-GISEL-NEXT:    scratch_store_b32 off, v1, s3 dlc
+; GFX11-GISEL-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-GISEL-NEXT:    s_add_u32 s32, s3, s2
+; GFX11-GISEL-NEXT:  .LBB6_4: ; %bb.1
+; GFX11-GISEL-NEXT:    s_lshl2_add_u32 s0, s0, 15
+; GFX11-GISEL-NEXT:    v_dual_mov_b32 v0, 1 :: v_dual_mov_b32 v1, 2
+; GFX11-GISEL-NEXT:    s_and_b32 s0, s0, -16
+; GFX11-GISEL-NEXT:    s_mov_b32 s1, s32
+; GFX11-GISEL-NEXT:    s_lshl_b32 s0, s0, 5
+; GFX11-GISEL-NEXT:    scratch_store_b32 off, v0, s33 dlc
+; GFX11-GISEL-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-GISEL-NEXT:    scratch_store_b32 off, v1, s1 dlc
+; GFX11-GISEL-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-GISEL-NEXT:    s_add_u32 s32, s1, s0
+; GFX11-GISEL-NEXT:    s_endpgm
 entry:
   %cond = icmp eq i32 %n, 0
   %alloca1 = alloca i32, i32 8, addrspace(5)
@@ -77,10 +842,206 @@ bb.1:
   ret void
 }
 
-; CHECK: in function test_dynamic_stackalloc{{.*}}: unsupported dynamic alloca
-; CHECK: in function test_dynamic_stackalloc{{.*}}: unsupported dynamic alloca
-
 define amdgpu_kernel void @test_dynamic_stackalloc_kernel_control_flow(i32 %n, i32 %m) {
+; GFX9-SDAG-LABEL: test_dynamic_stackalloc_kernel_control_flow:
+; GFX9-SDAG:       ; %bb.0: ; %entry
+; GFX9-SDAG-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX9-SDAG-NEXT:    s_add_u32 s0, s0, s17
+; GFX9-SDAG-NEXT:    s_addc_u32 s1, s1, 0
+; GFX9-SDAG-NEXT:    s_mov_b32 s33, 0
+; GFX9-SDAG-NEXT:    s_movk_i32 s32, 0x1000
+; GFX9-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-SDAG-NEXT:    s_cmp_lg_u32 s4, 0
+; GFX9-SDAG-NEXT:    s_mov_b32 s4, 0
+; GFX9-SDAG-NEXT:    s_cbranch_scc0 .LBB7_6
+; GFX9-SDAG-NEXT:  ; %bb.1: ; %bb.1
+; GFX9-SDAG-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
+; GFX9-SDAG-NEXT:    v_and_b32_e32 v0, 0x1ff0, v0
+; GFX9-SDAG-NEXT:    s_mov_b64 s[6:7], exec
+; GFX9-SDAG-NEXT:  .LBB7_2: ; =>This Inner Loop Header: Depth=1
+; GFX9-SDAG-NEXT:    s_ff1_i32_b64 s8, s[6:7]
+; GFX9-SDAG-NEXT:    v_readlane_b32 s9, v0, s8
+; GFX9-SDAG-NEXT:    s_bitset0_b64 s[6:7], s8
+; GFX9-SDAG-NEXT:    s_max_u32 s4, s4, s9
+; GFX9-SDAG-NEXT:    s_cmp_lg_u64 s[6:7], 0
+; GFX9-SDAG-NEXT:    s_cbranch_scc1 .LBB7_2
+; GFX9-SDAG-NEXT:  ; %bb.3:
+; GFX9-SDAG-NEXT:    s_mov_b32 s6, s32
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v0, s6
+; GFX9-SDAG-NEXT:    v_lshl_add_u32 v0, s4, 6, v0
+; GFX9-SDAG-NEXT:    v_readfirstlane_b32 s32, v0
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v0, 1
+; GFX9-SDAG-NEXT:    buffer_store_dword v0, off, s[0:3], s6
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT:    s_cbranch_execnz .LBB7_5
+; GFX9-SDAG-NEXT:  .LBB7_4: ; %bb.0
+; GFX9-SDAG-NEXT:    s_lshl_b32 s5, s5, 2
+; GFX9-SDAG-NEXT:    s_add_i32 s4, s32, 0xfff
+; GFX9-SDAG-NEXT:    s_add_i32 s5, s5, 15
+; GFX9-SDAG-NEXT:    s_and_b32 s4, s4, 0xfffff000
+; GFX9-SDAG-NEXT:    s_and_b32 s5, s5, -16
+; GFX9-SDAG-NEXT:    s_lshl_b32 s5, s5, 6
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v0, 2
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v1, s4
+; GFX9-SDAG-NEXT:    s_add_i32 s32, s4, s5
+; GFX9-SDAG-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT:  .LBB7_5: ; %bb.2
+; GFX9-SDAG-NEXT:    s_endpgm
+; GFX9-SDAG-NEXT:  .LBB7_6:
+; GFX9-SDAG-NEXT:    s_branch .LBB7_4
+;
+; GFX9-GISEL-LABEL: test_dynamic_stackalloc_kernel_control_flow:
+; GFX9-GISEL:       ; %bb.0: ; %entry
+; GFX9-GISEL-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX9-GISEL-NEXT:    s_add_u32 s0, s0, s17
+; GFX9-GISEL-NEXT:    s_addc_u32 s1, s1, 0
+; GFX9-GISEL-NEXT:    s_mov_b32 s8, 0
+; GFX9-GISEL-NEXT:    s_mov_b32 s33, 0
+; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-GISEL-NEXT:    s_cmp_lg_u32 s4, 0
+; GFX9-GISEL-NEXT:    s_mov_b32 s4, 1
+; GFX9-GISEL-NEXT:    s_movk_i32 s32, 0x1000
+; GFX9-GISEL-NEXT:    s_cbranch_scc0 .LBB7_4
+; GFX9-GISEL-NEXT:  ; %bb.1: ; %bb.1
+; GFX9-GISEL-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
+; GFX9-GISEL-NEXT:    v_and_b32_e32 v0, -16, v0
+; GFX9-GISEL-NEXT:    s_mov_b64 s[6:7], exec
+; GFX9-GISEL-NEXT:  .LBB7_2: ; =>This Inner Loop Header: Depth=1
+; GFX9-GISEL-NEXT:    s_ff1_i32_b64 s4, s[6:7]
+; GFX9-GISEL-NEXT:    v_readlane_b32 s9, v0, s4
+; GFX9-GISEL-NEXT:    s_bitset0_b64 s[6:7], s4
+; GFX9-GISEL-NEXT:    s_max_u32 s8, s8, s9
+; GFX9-GISEL-NEXT:    s_cmp_lg_u64 s[6:7], 0
+; GFX9-GISEL-NEXT:    s_cbranch_scc1 .LBB7_2
+; GFX9-GISEL-NEXT:  ; %bb.3:
+; GFX9-GISEL-NEXT:    s_mov_b32 s4, s32
+; GFX9-GISEL-NEXT:    s_lshl_b32 s6, s8, 6
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, 1
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, s4
+; GFX9-GISEL-NEXT:    s_add_u32 s32, s4, s6
+; GFX9-GISEL-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT:    s_mov_b32 s4, 0
+; GFX9-GISEL-NEXT:  .LBB7_4: ; %Flow
+; GFX9-GISEL-NEXT:    s_xor_b32 s4, s4, 1
+; GFX9-GISEL-NEXT:    s_and_b32 s4, s4, 1
+; GFX9-GISEL-NEXT:    s_cmp_lg_u32 s4, 0
+; GFX9-GISEL-NEXT:    s_cbranch_scc1 .LBB7_6
+; GFX9-GISEL-NEXT:  ; %bb.5: ; %bb.0
+; GFX9-GISEL-NEXT:    s_lshl2_add_u32 s4, s5, 15
+; GFX9-GISEL-NEXT:    s_add_u32 s5, s32, 0xfff
+; GFX9-GISEL-NEXT:    s_and_b32 s4, s4, -16
+; GFX9-GISEL-NEXT:    s_and_b32 s5, s5, 0xfffff000
+; GFX9-GISEL-NEXT:    s_lshl_b32 s4, s4, 6
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, 2
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, s5
+; GFX9-GISEL-NEXT:    s_add_u32 s32, s5, s4
+; GFX9-GISEL-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT:  .LBB7_6: ; %bb.2
+; GFX9-GISEL-NEXT:    s_endpgm
+;
+; GFX11-SDAG-LABEL: test_dynamic_stackalloc_kernel_control_flow:
+; GFX11-SDAG:       ; %bb.0: ; %entry
+; GFX11-SDAG-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-SDAG-NEXT:    s_mov_b32 s33, 0
+; GFX11-SDAG-NEXT:    s_mov_b32 s32, 64
+; GFX11-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT:    s_cmp_lg_u32 s0, 0
+; GFX11-SDAG-NEXT:    s_mov_b32 s0, 0
+; GFX11-SDAG-NEXT:    s_cbranch_scc0 .LBB7_6
+; GFX11-SDAG-NEXT:  ; %bb.1: ; %bb.1
+; GFX11-SDAG-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-SDAG-NEXT:    s_mov_b32 s2, exec_lo
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
+; GFX11-SDAG-NEXT:    v_and_b32_e32 v0, 0x1ff0, v0
+; GFX11-SDAG-NEXT:  .LBB7_2: ; =>This Inner Loop Header: Depth=1
+; GFX11-SDAG-NEXT:    s_ctz_i32_b32 s3, s2
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-NEXT:    v_readlane_b32 s4, v0, s3
+; GFX11-SDAG-NEXT:    s_bitset0_b32 s2, s3
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    s_max_u32 s0, s0, s4
+; GFX11-SDAG-NEXT:    s_cmp_lg_u32 s2, 0
+; GFX11-SDAG-NEXT:    s_cbranch_scc1 .LBB7_2
+; GFX11-SDAG-NEXT:  ; %bb.3:
+; GFX11-SDAG-NEXT:    s_mov_b32 s2, s32
+; GFX11-SDAG-NEXT:    v_mov_b32_e32 v1, 1
+; GFX11-SDAG-NEXT:    v_lshl_add_u32 v0, s0, 5, s2
+; GFX11-SDAG-NEXT:    scratch_store_b32 off, v1, s2 dlc
+; GFX11-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-SDAG-NEXT:    v_readfirstlane_b32 s32, v0
+; GFX11-SDAG-NEXT:    s_cbranch_execnz .LBB7_5
+; GFX11-SDAG-NEXT:  .LBB7_4: ; %bb.0
+; GFX11-SDAG-NEXT:    s_lshl_b32 s0, s1, 2
+; GFX11-SDAG-NEXT:    v_mov_b32_e32 v0, 2
+; GFX11-SDAG-NEXT:    s_add_i32 s0, s0, 15
+; GFX11-SDAG-NEXT:    s_add_i32 s1, s32, 0x7ff
+; GFX11-SDAG-NEXT:    s_and_b32 s0, s0, -16
+; GFX11-SDAG-NEXT:    s_and_b32 s1, s1, 0xfffff800
+; GFX11-SDAG-NEXT:    s_lshl_b32 s0, s0, 5
+; GFX11-SDAG-NEXT:    scratch_store_b32 off, v0, s1 dlc
+; GFX11-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-SDAG-NEXT:    s_add_i32 s32, s1, s0
+; GFX11-SDAG-NEXT:  .LBB7_5: ; %bb.2
+; GFX11-SDAG-NEXT:    s_endpgm
+; GFX11-SDAG-NEXT:  .LBB7_6:
+; GFX11-SDAG-NEXT:    s_branch .LBB7_4
+;
+; GFX11-GISEL-LABEL: test_dynamic_stackalloc_kernel_control_flow:
+; GFX11-GISEL:       ; %bb.0: ; %entry
+; GFX11-GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-GISEL-NEXT:    s_mov_b32 s2, 0
+; GFX11-GISEL-NEXT:    s_mov_b32 s33, 0
+; GFX11-GISEL-NEXT:    s_mov_b32 s32, 64
+; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT:    s_cmp_lg_u32 s0, 0
+; GFX11-GISEL-NEXT:    s_mov_b32 s0, 1
+; GFX11-GISEL-NEXT:    s_cbranch_scc0 .LBB7_4
+; GFX11-GISEL-NEXT:  ; %bb.1: ; %bb.1
+; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-GISEL-NEXT:    s_mov_b32 s0, exec_lo
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
+; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, -16, v0
+; GFX11-GISEL-NEXT:  .LBB7_2: ; =>This Inner Loop Header: Depth=1
+; GFX11-GISEL-NEXT:    s_ctz_i32_b32 s3, s0
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-NEXT:    v_readlane_b32 s4, v0, s3
+; GFX11-GISEL-NEXT:    s_bitset0_b32 s0, s3
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    s_max_u32 s2, s2, s4
+; GFX11-GISEL-NEXT:    s_cmp_lg_u32 s0, 0
+; GFX11-GISEL-NEXT:    s_cbranch_scc1 .LBB7_2
+; GFX11-GISEL-NEXT:  ; %bb.3:
+; GFX11-GISEL-NEXT:    v_mov_b32_e32 v0, 1
+; GFX11-GISEL-NEXT:    s_mov_b32 s3, s32
+; GFX11-GISEL-NEXT:    s_lshl_b32 s0, s2, 5
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-GISEL-NEXT:    s_add_u32 s32, s3, s0
+; GFX11-GISEL-NEXT:    s_mov_b32 s0, 0
+; GFX11-GISEL-NEXT:    scratch_store_b32 off, v0, s3 dlc
+; GFX11-GISEL-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-GISEL-NEXT:  .LBB7_4: ; %Flow
+; GFX11-GISEL-NEXT:    s_xor_b32 s0, s0, 1
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-NEXT:    s_and_b32 s0, s0, 1
+; GFX11-GISEL-NEXT:    s_cmp_lg_u32 s0, 0
+; GFX11-GISEL-NEXT:    s_cbranch_scc1 .LBB7_6
+; GFX11-GISEL-NEXT:  ; %bb.5: ; %bb.0
+; GFX11-GISEL-NEXT:    s_lshl2_add_u32 s0, s1, 15
+; GFX11-GISEL-NEXT:    v_mov_b32_e32 v0, 2
+; GFX11-GISEL-NEXT:    s_add_u32 s1, s32, 0x7ff
+; GFX11-GISEL-NEXT:    s_and_b32 s0, s0, -16
+; GFX11-GISEL-NEXT:    s_and_b32 s1, s1, 0xfffff800
+; GFX11-GISEL-NEXT:    s_lshl_b32 s0, s0, 5
+; GFX11-GISEL-NEXT:    scratch_store_b32 off, v0, s1 dlc
+; GFX11-GISEL-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-GISEL-NEXT:    s_add_u32 s32, s1, s0
+; GFX11-GISEL-NEXT:  .LBB7_6: ; %bb.2
+; GFX11-GISEL-NEXT:    s_endpgm
 entry:
   %cond = icmp eq i32 %n, 0
   br i1 %cond, label %bb.0, label %bb.1
@@ -97,62 +1058,1113 @@ bb.2:
   ret void
 }
 
-; CHECK: in function test_dynamic_stackalloc{{.*}}: unsupported dynamic alloca
-
 define void @test_dynamic_stackalloc_device_uniform(i32 %n) {
+; GFX9-SDAG-LABEL: test_dynamic_stackalloc_device_uniform:
+; GFX9-SDAG:       ; %bb.0:
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
+; GFX9-SDAG-NEXT:    s_mov_b32 s9, s33
+; GFX9-SDAG-NEXT:    v_and_b32_e32 v0, -16, v0
+; GFX9-SDAG-NEXT:    s_mov_b64 s[4:5], exec
+; GFX9-SDAG-NEXT:    s_mov_b32 s6, 0
+; GFX9-SDAG-NEXT:    s_mov_b32 s33, s32
+; GFX9-SDAG-NEXT:    s_addk_i32 s32, 0x400
+; GFX9-SDAG-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX9-SDAG-NEXT:    s_ff1_i32_b64 s7, s[4:5]
+; GFX9-SDAG-NEXT:    v_readlane_b32 s8, v0, s7
+; GFX9-SDAG-NEXT:    s_bitset0_b64 s[4:5], s7
+; GFX9-SDAG-NEXT:    s_max_u32 s6, s6, s8
+; GFX9-SDAG-NEXT:    s_cmp_lg_u64 s[4:5], 0
+; GFX9-SDAG-NEXT:    s_cbranch_scc1 .LBB8_1
+; GFX9-SDAG-NEXT:  ; %bb.2:
+; GFX9-SDAG-NEXT:    s_mov_b32 s4, s32
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-SDAG-NEXT:    v_lshl_add_u32 v0, s6, 6, v0
+; GFX9-SDAG-NEXT:    v_readfirstlane_b32 s32, v0
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v0, 0x7b
+; GFX9-SDAG-NEXT:    buffer_store_dword v0, off, s[0:3], s4
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT:    s_addk_i32 s32, 0xfc00
+; GFX9-SDAG-NEXT:    s_mov_b32 s33, s9
+; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: test_dynamic_stackalloc_device_uniform:
+; GFX9-GISEL:       ; %bb.0:
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
+; GFX9-GISEL-NEXT:    s_mov_b32 s9, s33
+; GFX9-GISEL-NEXT:    v_and_b32_e32 v0, -16, v0
+; GFX9-GISEL-NEXT:    s_mov_b64 s[4:5], exec
+; GFX9-GISEL-NEXT:    s_mov_b32 s6, 0
+; GFX9-GISEL-NEXT:    s_mov_b32 s33, s32
+; GFX9-GISEL-NEXT:    s_addk_i32 s32, 0x400
+; GFX9-GISEL-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX9-GISEL-NEXT:    s_ff1_i32_b64 s7, s[4:5]
+; GFX9-GISEL-NEXT:    v_readlane_b32 s8, v0, s7
+; GFX9-GISEL-NEXT:    s_bitset0_b64 s[4:5], s7
+; GFX9-GISEL-NEXT:    s_max_u32 s6, s6, s8
+; GFX9-GISEL-NEXT:    s_cmp_lg_u64 s[4:5], 0
+; GFX9-GISEL-NEXT:    s_cbranch_scc1 .LBB8_1
+; GFX9-GISEL-NEXT:  ; %bb.2:
+; GFX9-GISEL-NEXT:    s_mov_b32 s4, s32
+; GFX9-GISEL-NEXT:    s_lshl_b32 s5, s6, 6
+; GFX9-GISEL-NEXT:    s_add_u32 s32, s4, s5
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, 0x7b
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, s4
+; GFX9-GISEL-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT:    s_addk_i32 s32, 0xfc00
+; GFX9-GISEL-NEXT:    s_mov_b32 s33, s9
+; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: test_dynamic_stackalloc_device_uniform:
+; GFX11-SDAG:       ; %bb.0:
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
+; GFX11-SDAG-NEXT:    s_mov_b32 s4, s33
+; GFX11-SDAG-NEXT:    s_mov_b32 s1, exec_lo
+; GFX11-SDAG-NEXT:    s_mov_b32 s0, 0
+; GFX11-SDAG-NEXT:    s_mov_b32 s33, s32
+; GFX11-SDAG-NEXT:    v_and_b32_e32 v0, -16, v0
+; GFX11-SDAG-NEXT:    s_add_i32 s32, s32, 16
+; GFX11-SDAG-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX11-SDAG-NEXT:    s_ctz_i32_b32 s2, s1
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-NEXT:    v_readlane_b32 s3, v0, s2
+; GFX11-SDAG-NEXT:    s_bitset0_b32 s1, s2
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    s_max_u32 s0, s0, s3
+; GFX11-SDAG-NEXT:    s_cmp_lg_u32 s1, 0
+; GFX11-SDAG-NEXT:    s_cbranch_scc1 .LBB8_1
+; GFX11-SDAG-NEXT:  ; %bb.2:
+; GFX11-SDAG-NEXT:    s_mov_b32 s1, s32
+; GFX11-SDAG-NEXT:    v_mov_b32_e32 v1, 0x7b
+; GFX11-SDAG-NEXT:    v_lshl_add_u32 v0, s0, 5, s1
+; GFX11-SDAG-NEXT:    s_mov_b32 s33, s4
+; GFX11-SDAG-NEXT:    scratch_store_b32 off, v1, s1 dlc
+; GFX11-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-SDAG-NEXT:    v_readfirstlane_b32 s32, v0
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    s_add_i32 s32, s32, -16
+; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: test_dynamic_stackalloc_device_uniform:
+; GFX11-GISEL:       ; %bb.0:
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
+; GFX11-GISEL-NEXT:    s_mov_b32 s4, s33
+; GFX11-GISEL-NEXT:    s_mov_b32 s1, exec_lo
+; GFX11-GISEL-NEXT:    s_mov_b32 s0, 0
+; GFX11-GISEL-NEXT:    s_mov_b32 s33, s32
+; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, -16, v0
+; GFX11-GISEL-NEXT:    s_add_i32 s32, s32, 16
+; GFX11-GISEL-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX11-GISEL-NEXT:    s_ctz_i32_b32 s2, s1
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-NEXT:    v_readlane_b32 s3, v0, s2
+; GFX11-GISEL-NEXT:    s_bitset0_b32 s1, s2
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    s_max_u32 s0, s0, s3
+; GFX11-GISEL-NEXT:    s_cmp_lg_u32 s1, 0
+; GFX11-GISEL-NEXT:    s_cbranch_scc1 .LBB8_1
+; GFX11-GISEL-NEXT:  ; %bb.2:
+; GFX11-GISEL-NEXT:    v_mov_b32_e32 v0, 0x7b
+; GFX11-GISEL-NEXT:    s_mov_b32 s1, s32
+; GFX11-GISEL-NEXT:    s_lshl_b32 s0, s0, 5
+; GFX11-GISEL-NEXT:    s_mov_b32 s33, s4
+; GFX11-GISEL-NEXT:    s_add_u32 s32, s1, s0
+; GFX11-GISEL-NEXT:    scratch_store_b32 off, v0, s1 dlc
+; GFX11-GISEL-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-GISEL-NEXT:    s_add_i32 s32, s32, -16
+; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %alloca = alloca i32, i32 %n, addrspace(5)
   store volatile i32 123, ptr addrspace(5) %alloca
   ret void
 }
 
-; CHECK: in function test_dynamic_stackalloc{{.*}}: unsupported dynamic alloca
-
 define void @test_dynamic_stackalloc_device_uniform_over_aligned(i32 %n) {
+; GFX9-SDAG-LABEL: test_dynamic_stackalloc_device_uniform_over_aligned:
+; GFX9-SDAG:       ; %bb.0:
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
+; GFX9-SDAG-NEXT:    s_mov_b32 s9, s33
+; GFX9-SDAG-NEXT:    s_add_i32 s33, s32, 0x1fc0
+; GFX9-SDAG-NEXT:    v_and_b32_e32 v0, -16, v0
+; GFX9-SDAG-NEXT:    s_mov_b64 s[4:5], exec
+; GFX9-SDAG-NEXT:    s_mov_b32 s6, 0
+; GFX9-SDAG-NEXT:    s_and_b32 s33, s33, 0xffffe000
+; GFX9-SDAG-NEXT:    s_addk_i32 s32, 0x4000
+; GFX9-SDAG-NEXT:  .LBB9_1: ; =>This Inner Loop Header: Depth=1
+; GFX9-SDAG-NEXT:    s_ff1_i32_b64 s7, s[4:5]
+; GFX9-SDAG-NEXT:    v_readlane_b32 s8, v0, s7
+; GFX9-SDAG-NEXT:    s_bitset0_b64 s[4:5], s7
+; GFX9-SDAG-NEXT:    s_max_u32 s6, s6, s8
+; GFX9-SDAG-NEXT:    s_cmp_lg_u64 s[4:5], 0
+; GFX9-SDAG-NEXT:    s_cbranch_scc1 .LBB9_1
+; GFX9-SDAG-NEXT:  ; %bb.2:
+; GFX9-SDAG-NEXT:    s_add_i32 s4, s32, 0x1fff
+; GFX9-SDAG-NEXT:    s_and_b32 s4, s4, 0xffffe000
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-SDAG-NEXT:    v_lshl_add_u32 v1, s6, 6, v0
+; GFX9-SDAG-NEXT:    v_readfirstlane_b32 s32, v1
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v1, 10
+; GFX9-SDAG-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT:    s_addk_i32 s32, 0xc000
+; GFX9-SDAG-NEXT:    s_mov_b32 s33, s9
+; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: test_dynamic_stackalloc_device_uniform_over_aligned:
+; GFX9-GISEL:       ; %bb.0:
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
+; GFX9-GISEL-NEXT:    s_mov_b32 s9, s33
+; GFX9-GISEL-NEXT:    s_add_i32 s33, s32, 0x1fc0
+; GFX9-GISEL-NEXT:    v_and_b32_e32 v0, -16, v0
+; GFX9-GISEL-NEXT:    s_mov_b64 s[4:5], exec
+; GFX9-GISEL-NEXT:    s_mov_b32 s6, 0
+; GFX9-GISEL-NEXT:    s_and_b32 s33, s33, 0xffffe000
+; GFX9-GISEL-NEXT:    s_addk_i32 s32, 0x4000
+; GFX9-GISEL-NEXT:  .LBB9_1: ; =>This Inner Loop Header: Depth=1
+; GFX9-GISEL-NEXT:    s_ff1_i32_b64 s7, s[4:5]
+; GFX9-GISEL-NEXT:    v_readlane_b32 s8, v0, s7
+; GFX9-GISEL-NEXT:    s_bitset0_b64 s[4:5], s7
+; GFX9-GISEL-NEXT:    s_max_u32 s6, s6, s8
+; GFX9-GISEL-NEXT:    s_cmp_lg_u64 s[4:5], 0
+; GFX9-GISEL-NEXT:    s_cbranch_scc1 .LBB9_1
+; GFX9-GISEL-NEXT:  ; %bb.2:
+; GFX9-GISEL-NEXT:    s_add_u32 s5, s32, 0x1fff
+; GFX9-GISEL-NEXT:    s_lshl_b32 s4, s6, 6
+; GFX9-GISEL-NEXT:    s_and_b32 s5, s5, 0xffffe000
+; GFX9-GISEL-NEXT:    s_add_u32 s32, s5, s4
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, 10
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, s5
+; GFX9-GISEL-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT:    s_addk_i32 s32, 0xc000
+; GFX9-GISEL-NEXT:    s_mov_b32 s33, s9
+; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: test_dynamic_stackalloc_device_uniform_over_aligned:
+; GFX11-SDAG:       ; %bb.0:
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
+; GFX11-SDAG-NEXT:    s_mov_b32 s4, s33
+; GFX11-SDAG-NEXT:    s_add_i32 s33, s32, 0x7f
+; GFX11-SDAG-NEXT:    s_mov_b32 s1, exec_lo
+; GFX11-SDAG-NEXT:    s_mov_b32 s0, 0
+; GFX11-SDAG-NEXT:    v_and_b32_e32 v0, -16, v0
+; GFX11-SDAG-NEXT:    s_and_b32 s33, s33, 0xffffff80
+; GFX11-SDAG-NEXT:    s_addk_i32 s32, 0x100
+; GFX11-SDAG-NEXT:  .LBB9_1: ; =>This Inner Loop Header: Depth=1
+; GFX11-SDAG-NEXT:    s_ctz_i32_b32 s2, s1
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-NEXT:    v_readlane_b32 s3, v0, s2
+; GFX11-SDAG-NEXT:    s_bitset0_b32 s1, s2
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    s_max_u32 s0, s0, s3
+; GFX11-SDAG-NEXT:    s_cmp_lg_u32 s1, 0
+; GFX11-SDAG-NEXT:    s_cbranch_scc1 .LBB9_1
+; GFX11-SDAG-NEXT:  ; %bb.2:
+; GFX11-SDAG-NEXT:    s_add_i32 s1, s32, 0xfff
+; GFX11-SDAG-NEXT:    v_mov_b32_e32 v1, 10
+; GFX11-SDAG-NEXT:    s_and_b32 s1, s1, 0xfffff000
+; GFX11-SDAG-NEXT:    s_mov_b32 s33, s4
+; GFX11-SDAG-NEXT:    v_lshl_add_u32 v0, s0, 5, s1
+; GFX11-SDAG-NEXT:    scratch_store_b32 off, v1, s1 dlc
+; GFX11-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-SDAG-NEXT:    v_readfirstlane_b32 s32, v0
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    s_addk_i32 s32, 0xff00
+; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: test_dynamic_stackalloc_device_uniform_over_aligned:
+; GFX11-GISEL:       ; %bb.0:
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
+; GFX11-GISEL-NEXT:    s_mov_b32 s4, s33
+; GFX11-GISEL-NEXT:    s_add_i32 s33, s32, 0x7f
+; GFX11-GISEL-NEXT:    s_mov_b32 s1, exec_lo
+; GFX11-GISEL-NEXT:    s_mov_b32 s0, 0
+; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, -16, v0
+; GFX11-GISEL-NEXT:    s_and_b32 s33, s33, 0xffffff80
+; GFX11-GISEL-NEXT:    s_addk_i32 s32, 0x100
+; GFX11-GISEL-NEXT:  .LBB9_1: ; =>This Inner Loop Header: Depth=1
+; GFX11-GISEL-NEXT:    s_ctz_i32_b32 s2, s1
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-NEXT:    v_readlane_b32 s3, v0, s2
+; GFX11-GISEL-NEXT:    s_bitset0_b32 s1, s2
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    s_max_u32 s0, s0, s3
+; GFX11-GISEL-NEXT:    s_cmp_lg_u32 s1, 0
+; GFX11-GISEL-NEXT:    s_cbranch_scc1 .LBB9_1
+; GFX11-GISEL-NEXT:  ; %bb.2:
+; GFX11-GISEL-NEXT:    s_add_u32 s1, s32, 0xfff
+; GFX11-GISEL-NEXT:    v_mov_b32_e32 v0, 10
+; GFX11-GISEL-NEXT:    s_lshl_b32 s0, s0, 5
+; GFX11-GISEL-NEXT:    s_and_b32 s1, s1, 0xfffff000
+; GFX11-GISEL-NEXT:    s_mov_b32 s33, s4
+; GFX11-GISEL-NEXT:    s_add_u32 s32, s1, s0
+; GFX11-GISEL-NEXT:    scratch_store_b32 off, v0, s1 dlc
+; GFX11-GISEL-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-GISEL-NEXT:    s_addk_i32 s32, 0xff00
+; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %alloca = alloca i32, i32 %n, align 128, addrspace(5)
   store volatile i32 10, ptr addrspace(5) %alloca
   ret void
 }
 
-; CHECK: in function test_dynamic_stackalloc{{.*}}: unsupported dynamic alloca
-
 define void @test_dynamic_stackalloc_device_uniform_under_aligned(i32 %n) {
+; GFX9-SDAG-LABEL: test_dynamic_stackalloc_device_uniform_under_aligned:
+; GFX9-SDAG:       ; %bb.0:
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
+; GFX9-SDAG-NEXT:    s_mov_b32 s9, s33
+; GFX9-SDAG-NEXT:    v_and_b32_e32 v0, -16, v0
+; GFX9-SDAG-NEXT:    s_mov_b64 s[4:5], exec
+; GFX9-SDAG-NEXT:    s_mov_b32 s6, 0
+; GFX9-SDAG-NEXT:    s_mov_b32 s33, s32
+; GFX9-SDAG-NEXT:    s_addk_i32 s32, 0x400
+; GFX9-SDAG-NEXT:  .LBB10_1: ; =>This Inner Loop Header: Depth=1
+; GFX9-SDAG-NEXT:    s_ff1_i32_b64 s7, s[4:5]
+; GFX9-SDAG-NEXT:    v_readlane_b32 s8, v0, s7
+; GFX9-SDAG-NEXT:    s_bitset0_b64 s[4:5], s7
+; GFX9-SDAG-NEXT:    s_max_u32 s6, s6, s8
+; GFX9-SDAG-NEXT:    s_cmp_lg_u64 s[4:5], 0
+; GFX9-SDAG-NEXT:    s_cbranch_scc1 .LBB10_1
+; GFX9-SDAG-NEXT:  ; %bb.2:
+; GFX9-SDAG-NEXT:    s_mov_b32 s4, s32
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-SDAG-NEXT:    v_lshl_add_u32 v0, s6, 6, v0
+; GFX9-SDAG-NEXT:    v_readfirstlane_b32 s32, v0
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v0, 22
+; GFX9-SDAG-NEXT:    buffer_store_dword v0, off, s[0:3], s4
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT:    s_addk_i32 s32, 0xfc00
+; GFX9-SDAG-NEXT:    s_mov_b32 s33, s9
+; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: test_dynamic_stackalloc_device_uniform_under_aligned:
+; GFX9-GISEL:       ; %bb.0:
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
+; GFX9-GISEL-NEXT:    s_mov_b32 s9, s33
+; GFX9-GISEL-NEXT:    v_and_b32_e32 v0, -16, v0
+; GFX9-GISEL-NEXT:    s_mov_b64 s[4:5], exec
+; GFX9-GISEL-NEXT:    s_mov_b32 s6, 0
+; GFX9-GISEL-NEXT:    s_mov_b32 s33, s32
+; GFX9-GISEL-NEXT:    s_addk_i32 s32, 0x400
+; GFX9-GISEL-NEXT:  .LBB10_1: ; =>This Inner Loop Header: Depth=1
+; GFX9-GISEL-NEXT:    s_ff1_i32_b64 s7, s[4:5]
+; GFX9-GISEL-NEXT:    v_readlane_b32 s8, v0, s7
+; GFX9-GISEL-NEXT:    s_bitset0_b64 s[4:5], s7
+; GFX9-GISEL-NEXT:    s_max_u32 s6, s6, s8
+; GFX9-GISEL-NEXT:    s_cmp_lg_u64 s[4:5], 0
+; GFX9-GISEL-NEXT:    s_cbranch_scc1 .LBB10_1
+; GFX9-GISEL-NEXT:  ; %bb.2:
+; GFX9-GISEL-NEXT:    s_mov_b32 s4, s32
+; GFX9-GISEL-NEXT:    s_lshl_b32 s5, s6, 6
+; GFX9-GISEL-NEXT:    s_add_u32 s32, s4, s5
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, 22
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, s4
+; GFX9-GISEL-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT:    s_addk_i32 s32, 0xfc00
+; GFX9-GISEL-NEXT:    s_mov_b32 s33, s9
+; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: test_dynamic_stackalloc_device_uniform_under_aligned:
+; GFX11-SDAG:       ; %bb.0:
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
+; GFX11-SDAG-NEXT:    s_mov_b32 s4, s33
+; GFX11-SDAG-NEXT:    s_mov_b32 s1, exec_lo
+; GFX11-SDAG-NEXT:    s_mov_b32 s0, 0
+; GFX11-SDAG-NEXT:    s_mov_b32 s33, s32
+; GFX11-SDAG-NEXT:    v_and_b32_e32 v0, -16, v0
+; GFX11-SDAG-NEXT:    s_add_i32 s32, s32, 16
+; GFX11-SDAG-NEXT:  .LBB10_1: ; =>This Inner Loop Header: Depth=1
+; GFX11-SDAG-NEXT:    s_ctz_i32_b32 s2, s1
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-NEXT:    v_readlane_b32 s3, v0, s2
+; GFX11-SDAG-NEXT:    s_bitset0_b32 s1, s2
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    s_max_u32 s0, s0, s3
+; GFX11-SDAG-NEXT:    s_cmp_lg_u32 s1, 0
+; GFX11-SDAG-NEXT:    s_cbranch_scc1 .LBB10_1
+; GFX11-SDAG-NEXT:  ; %bb.2:
+; GFX11-SDAG-NEXT:    s_mov_b32 s1, s32
+; GFX11-SDAG-NEXT:    v_mov_b32_e32 v1, 22
+; GFX11-SDAG-NEXT:    v_lshl_add_u32 v0, s0, 5, s1
+; GFX11-SDAG-NEXT:    s_mov_b32 s33, s4
+; GFX11-SDAG-NEXT:    scratch_store_b32 off, v1, s1 dlc
+; GFX11-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-SDAG-NEXT:    v_readfirstlane_b32 s32, v0
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    s_add_i32 s32, s32, -16
+; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: test_dynamic_stackalloc_device_uniform_under_aligned:
+; GFX11-GISEL:       ; %bb.0:
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
+; GFX11-GISEL-NEXT:    s_mov_b32 s4, s33
+; GFX11-GISEL-NEXT:    s_mov_b32 s1, exec_lo
+; GFX11-GISEL-NEXT:    s_mov_b32 s0, 0
+; GFX11-GISEL-NEXT:    s_mov_b32 s33, s32
+; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, -16, v0
+; GFX11-GISEL-NEXT:    s_add_i32 s32, s32, 16
+; GFX11-GISEL-NEXT:  .LBB10_1: ; =>This Inner Loop Header: Depth=1
+; GFX11-GISEL-NEXT:    s_ctz_i32_b32 s2, s1
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-NEXT:    v_readlane_b32 s3, v0, s2
+; GFX11-GISEL-NEXT:    s_bitset0_b32 s1, s2
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    s_max_u32 s0, s0, s3
+; GFX11-GISEL-NEXT:    s_cmp_lg_u32 s1, 0
+; GFX11-GISEL-NEXT:    s_cbranch_scc1 .LBB10_1
+; GFX11-GISEL-NEXT:  ; %bb.2:
+; GFX11-GISEL-NEXT:    v_mov_b32_e32 v0, 22
+; GFX11-GISEL-NEXT:    s_mov_b32 s1, s32
+; GFX11-GISEL-NEXT:    s_lshl_b32 s0, s0, 5
+; GFX11-GISEL-NEXT:    s_mov_b32 s33, s4
+; GFX11-GISEL-NEXT:    s_add_u32 s32, s1, s0
+; GFX11-GISEL-NEXT:    scratch_store_b32 off, v0, s1 dlc
+; GFX11-GISEL-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-GISEL-NEXT:    s_add_i32 s32, s32, -16
+; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %alloca = alloca i32, i32 %n, align 2, addrspace(5)
   store volatile i32 22, ptr addrspace(5) %alloca
   ret void
 }
 
-; CHECK: in function test_dynamic_stackalloc{{.*}}: unsupported dynamic alloca
-
 define void @test_dynamic_stackalloc_device_divergent() {
+; GFX9-SDAG-LABEL: test_dynamic_stackalloc_device_divergent:
+; GFX9-SDAG:       ; %bb.0:
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT:    v_and_b32_e32 v0, 0x3ff, v31
+; GFX9-SDAG-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
+; GFX9-SDAG-NEXT:    s_mov_b32 s9, s33
+; GFX9-SDAG-NEXT:    v_and_b32_e32 v0, 0x1ff0, v0
+; GFX9-SDAG-NEXT:    s_mov_b64 s[4:5], exec
+; GFX9-SDAG-NEXT:    s_mov_b32 s6, 0
+; GFX9-SDAG-NEXT:    s_mov_b32 s33, s32
+; GFX9-SDAG-NEXT:    s_addk_i32 s32, 0x400
+; GFX9-SDAG-NEXT:  .LBB11_1: ; =>This Inner Loop Header: Depth=1
+; GFX9-SDAG-NEXT:    s_ff1_i32_b64 s7, s[4:5]
+; GFX9-SDAG-NEXT:    v_readlane_b32 s8, v0, s7
+; GFX9-SDAG-NEXT:    s_bitset0_b64 s[4:5], s7
+; GFX9-SDAG-NEXT:    s_max_u32 s6, s6, s8
+; GFX9-SDAG-NEXT:    s_cmp_lg_u64 s[4:5], 0
+; GFX9-SDAG-NEXT:    s_cbranch_scc1 .LBB11_1
+; GFX9-SDAG-NEXT:  ; %bb.2:
+; GFX9-SDAG-NEXT:    s_mov_b32 s4, s32
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-SDAG-NEXT:    v_lshl_add_u32 v0, s6, 6, v0
+; GFX9-SDAG-NEXT:    v_readfirstlane_b32 s32, v0
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v0, 0x7b
+; GFX9-SDAG-NEXT:    buffer_store_dword v0, off, s[0:3], s4
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT:    s_addk_i32 s32, 0xfc00
+; GFX9-SDAG-NEXT:    s_mov_b32 s33, s9
+; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: test_dynamic_stackalloc_device_divergent:
+; GFX9-GISEL:       ; %bb.0:
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v31
+; GFX9-GISEL-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
+; GFX9-GISEL-NEXT:    s_mov_b32 s9, s33
+; GFX9-GISEL-NEXT:    v_and_b32_e32 v0, -16, v0
+; GFX9-GISEL-NEXT:    s_mov_b64 s[4:5], exec
+; GFX9-GISEL-NEXT:    s_mov_b32 s6, 0
+; GFX9-GISEL-NEXT:    s_mov_b32 s33, s32
+; GFX9-GISEL-NEXT:    s_addk_i32 s32, 0x400
+; GFX9-GISEL-NEXT:  .LBB11_1: ; =>This Inner Loop Header: Depth=1
+; GFX9-GISEL-NEXT:    s_ff1_i32_b64 s7, s[4:5]
+; GFX9-GISEL-NEXT:    v_readlane_b32 s8, v0, s7
+; GFX9-GISEL-NEXT:    s_bitset0_b64 s[4:5], s7
+; GFX9-GISEL-NEXT:    s_max_u32 s6, s6, s8
+; GFX9-GISEL-NEXT:    s_cmp_lg_u64 s[4:5], 0
+; GFX9-GISEL-NEXT:    s_cbranch_scc1 .LBB11_1
+; GFX9-GISEL-NEXT:  ; %bb.2:
+; GFX9-GISEL-NEXT:    s_mov_b32 s4, s32
+; GFX9-GISEL-NEXT:    s_lshl_b32 s5, s6, 6
+; GFX9-GISEL-NEXT:    s_add_u32 s32, s4, s5
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, 0x7b
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, s4
+; GFX9-GISEL-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT:    s_addk_i32 s32, 0xfc00
+; GFX9-GISEL-NEXT:    s_mov_b32 s33, s9
+; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: test_dynamic_stackalloc_device_divergent:
+; GFX11-SDAG:       ; %bb.0:
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT:    v_and_b32_e32 v0, 0x3ff, v31
+; GFX11-SDAG-NEXT:    s_mov_b32 s4, s33
+; GFX11-SDAG-NEXT:    s_mov_b32 s1, exec_lo
+; GFX11-SDAG-NEXT:    s_mov_b32 s0, 0
+; GFX11-SDAG-NEXT:    s_mov_b32 s33, s32
+; GFX11-SDAG-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
+; GFX11-SDAG-NEXT:    s_add_i32 s32, s32, 16
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_and_b32_e32 v0, 0x1ff0, v0
+; GFX11-SDAG-NEXT:  .LBB11_1: ; =>This Inner Loop Header: Depth=1
+; GFX11-SDAG-NEXT:    s_ctz_i32_b32 s2, s1
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-NEXT:    v_readlane_b32 s3, v0, s2
+; GFX11-SDAG-NEXT:    s_bitset0_b32 s1, s2
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    s_max_u32 s0, s0, s3
+; GFX11-SDAG-NEXT:    s_cmp_lg_u32 s1, 0
+; GFX11-SDAG-NEXT:    s_cbranch_scc1 .LBB11_1
+; GFX11-SDAG-NEXT:  ; %bb.2:
+; GFX11-SDAG-NEXT:    s_mov_b32 s1, s32
+; GFX11-SDAG-NEXT:    v_mov_b32_e32 v1, 0x7b
+; GFX11-SDAG-NEXT:    v_lshl_add_u32 v0, s0, 5, s1
+; GFX11-SDAG-NEXT:    s_mov_b32 s33, s4
+; GFX11-SDAG-NEXT:    scratch_store_b32 off, v1, s1 dlc
+; GFX11-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-SDAG-NEXT:    v_readfirstlane_b32 s32, v0
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    s_add_i32 s32, s32, -16
+; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: test_dynamic_stackalloc_device_divergent:
+; GFX11-GISEL:       ; %bb.0:
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v31
+; GFX11-GISEL-NEXT:    s_mov_b32 s4, s33
+; GFX11-GISEL-NEXT:    s_mov_b32 s1, exec_lo
+; GFX11-GISEL-NEXT:    s_mov_b32 s0, 0
+; GFX11-GISEL-NEXT:    s_mov_b32 s33, s32
+; GFX11-GISEL-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
+; GFX11-GISEL-NEXT:    s_add_i32 s32, s32, 16
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, -16, v0
+; GFX11-GISEL-NEXT:  .LBB11_1: ; =>This Inner Loop Header: Depth=1
+; GFX11-GISEL-NEXT:    s_ctz_i32_b32 s2, s1
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-NEXT:    v_readlane_b32 s3, v0, s2
+; GFX11-GISEL-NEXT:    s_bitset0_b32 s1, s2
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    s_max_u32 s0, s0, s3
+; GFX11-GISEL-NEXT:    s_cmp_lg_u32 s1, 0
+; GFX11-GISEL-NEXT:    s_cbranch_scc1 .LBB11_1
+; GFX11-GISEL-NEXT:  ; %bb.2:
+; GFX11-GISEL-NEXT:    v_mov_b32_e32 v0, 0x7b
+; GFX11-GISEL-NEXT:    s_mov_b32 s1, s32
+; GFX11-GISEL-NEXT:    s_lshl_b32 s0, s0, 5
+; GFX11-GISEL-NEXT:    s_mov_b32 s33, s4
+; GFX11-GISEL-NEXT:    s_add_u32 s32, s1, s0
+; GFX11-GISEL-NEXT:    scratch_store_b32 off, v0, s1 dlc
+; GFX11-GISEL-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-GISEL-NEXT:    s_add_i32 s32, s32, -16
+; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %idx = call i32 @llvm.amdgcn.workitem.id.x()
   %alloca = alloca i32, i32 %idx, addrspace(5)
   store volatile i32 123, ptr addrspace(5) %alloca
   ret void
 }
 
-; CHECK: in function test_dynamic_stackalloc{{.*}}: unsupported dynamic alloca
-
 define void @test_dynamic_stackalloc_device_divergent_over_aligned() {
+; GFX9-SDAG-LABEL: test_dynamic_stackalloc_device_divergent_over_aligned:
+; GFX9-SDAG:       ; %bb.0:
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT:    s_mov_b32 s10, s33
+; GFX9-SDAG-NEXT:    s_add_i32 s33, s32, 0x1fc0
+; GFX9-SDAG-NEXT:    s_addk_i32 s32, 0x4000
+; GFX9-SDAG-NEXT:    v_and_b32_e32 v0, 0x3ff, v31
+; GFX9-SDAG-NEXT:    s_add_i32 s4, s32, 0x1fff
+; GFX9-SDAG-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
+; GFX9-SDAG-NEXT:    s_and_b32 s6, s4, 0xffffe000
+; GFX9-SDAG-NEXT:    v_and_b32_e32 v0, 0x1ff0, v0
+; GFX9-SDAG-NEXT:    s_mov_b64 s[4:5], exec
+; GFX9-SDAG-NEXT:    s_mov_b32 s7, 0
+; GFX9-SDAG-NEXT:    s_and_b32 s33, s33, 0xffffe000
+; GFX9-SDAG-NEXT:  .LBB12_1: ; =>This Inner Loop Header: Depth=1
+; GFX9-SDAG-NEXT:    s_ff1_i32_b64 s8, s[4:5]
+; GFX9-SDAG-NEXT:    v_readlane_b32 s9, v0, s8
+; GFX9-SDAG-NEXT:    s_bitset0_b64 s[4:5], s8
+; GFX9-SDAG-NEXT:    s_max_u32 s7, s7, s9
+; GFX9-SDAG-NEXT:    s_cmp_lg_u64 s[4:5], 0
+; GFX9-SDAG-NEXT:    s_cbranch_scc1 .LBB12_1
+; GFX9-SDAG-NEXT:  ; %bb.2:
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v0, s6
+; GFX9-SDAG-NEXT:    v_lshl_add_u32 v1, s7, 6, v0
+; GFX9-SDAG-NEXT:    v_readfirstlane_b32 s32, v1
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v1, 0x1bc
+; GFX9-SDAG-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT:    s_addk_i32 s32, 0xc000
+; GFX9-SDAG-NEXT:    s_mov_b32 s33, s10
+; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: test_dynamic_stackalloc_device_divergent_over_aligned:
+; GFX9-GISEL:       ; %bb.0:
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v31
+; GFX9-GISEL-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
+; GFX9-GISEL-NEXT:    s_mov_b32 s9, s33
+; GFX9-GISEL-NEXT:    s_add_i32 s33, s32, 0x1fc0
+; GFX9-GISEL-NEXT:    v_and_b32_e32 v0, -16, v0
+; GFX9-GISEL-NEXT:    s_mov_b64 s[4:5], exec
+; GFX9-GISEL-NEXT:    s_mov_b32 s6, 0
+; GFX9-GISEL-NEXT:    s_and_b32 s33, s33, 0xffffe000
+; GFX9-GISEL-NEXT:    s_addk_i32 s32, 0x4000
+; GFX9-GISEL-NEXT:  .LBB12_1: ; =>This Inner Loop Header: Depth=1
+; GFX9-GISEL-NEXT:    s_ff1_i32_b64 s7, s[4:5]
+; GFX9-GISEL-NEXT:    v_readlane_b32 s8, v0, s7
+; GFX9-GISEL-NEXT:    s_bitset0_b64 s[4:5], s7
+; GFX9-GISEL-NEXT:    s_max_u32 s6, s6, s8
+; GFX9-GISEL-NEXT:    s_cmp_lg_u64 s[4:5], 0
+; GFX9-GISEL-NEXT:    s_cbranch_scc1 .LBB12_1
+; GFX9-GISEL-NEXT:  ; %bb.2:
+; GFX9-GISEL-NEXT:    s_add_u32 s5, s32, 0x1fff
+; GFX9-GISEL-NEXT:    s_lshl_b32 s4, s6, 6
+; GFX9-GISEL-NEXT:    s_and_b32 s5, s5, 0xffffe000
+; GFX9-GISEL-NEXT:    s_add_u32 s32, s5, s4
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, 0x1bc
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, s5
+; GFX9-GISEL-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT:    s_addk_i32 s32, 0xc000
+; GFX9-GISEL-NEXT:    s_mov_b32 s33, s9
+; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: test_dynamic_stackalloc_device_divergent_over_aligned:
+; GFX11-SDAG:       ; %bb.0:
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT:    v_and_b32_e32 v0, 0x3ff, v31
+; GFX11-SDAG-NEXT:    s_mov_b32 s5, s33
+; GFX11-SDAG-NEXT:    s_add_i32 s33, s32, 0x7f
+; GFX11-SDAG-NEXT:    s_addk_i32 s32, 0x100
+; GFX11-SDAG-NEXT:    s_mov_b32 s2, exec_lo
+; GFX11-SDAG-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
+; GFX11-SDAG-NEXT:    s_add_i32 s0, s32, 0xfff
+; GFX11-SDAG-NEXT:    s_mov_b32 s1, 0
+; GFX11-SDAG-NEXT:    s_and_b32 s0, s0, 0xfffff000
+; GFX11-SDAG-NEXT:    s_and_b32 s33, s33, 0xffffff80
+; GFX11-SDAG-NEXT:    v_and_b32_e32 v0, 0x1ff0, v0
+; GFX11-SDAG-NEXT:  .LBB12_1: ; =>This Inner Loop Header: Depth=1
+; GFX11-SDAG-NEXT:    s_ctz_i32_b32 s3, s2
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-NEXT:    v_readlane_b32 s4, v0, s3
+; GFX11-SDAG-NEXT:    s_bitset0_b32 s2, s3
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    s_max_u32 s1, s1, s4
+; GFX11-SDAG-NEXT:    s_cmp_lg_u32 s2, 0
+; GFX11-SDAG-NEXT:    s_cbranch_scc1 .LBB12_1
+; GFX11-SDAG-NEXT:  ; %bb.2:
+; GFX11-SDAG-NEXT:    v_lshl_add_u32 v0, s1, 5, s0
+; GFX11-SDAG-NEXT:    v_mov_b32_e32 v1, 0x1bc
+; GFX11-SDAG-NEXT:    s_mov_b32 s33, s5
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-SDAG-NEXT:    v_readfirstlane_b32 s32, v0
+; GFX11-SDAG-NEXT:    scratch_store_b32 off, v1, s0 dlc
+; GFX11-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-SDAG-NEXT:    s_addk_i32 s32, 0xff00
+; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: test_dynamic_stackalloc_device_divergent_over_aligned:
+; GFX11-GISEL:       ; %bb.0:
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v31
+; GFX11-GISEL-NEXT:    s_mov_b32 s4, s33
+; GFX11-GISEL-NEXT:    s_add_i32 s33, s32, 0x7f
+; GFX11-GISEL-NEXT:    s_mov_b32 s1, exec_lo
+; GFX11-GISEL-NEXT:    s_mov_b32 s0, 0
+; GFX11-GISEL-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
+; GFX11-GISEL-NEXT:    s_and_b32 s33, s33, 0xffffff80
+; GFX11-GISEL-NEXT:    s_addk_i32 s32, 0x100
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, -16, v0
+; GFX11-GISEL-NEXT:  .LBB12_1: ; =>This Inner Loop Header: Depth=1
+; GFX11-GISEL-NEXT:    s_ctz_i32_b32 s2, s1
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-NEXT:    v_readlane_b32 s3, v0, s2
+; GFX11-GISEL-NEXT:    s_bitset0_b32 s1, s2
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    s_max_u32 s0, s0, s3
+; GFX11-GISEL-NEXT:    s_cmp_lg_u32 s1, 0
+; GFX11-GISEL-NEXT:    s_cbranch_scc1 .LBB12_1
+; GFX11-GISEL-NEXT:  ; %bb.2:
+; GFX11-GISEL-NEXT:    s_add_u32 s1, s32, 0xfff
+; GFX11-GISEL-NEXT:    v_mov_b32_e32 v0, 0x1bc
+; GFX11-GISEL-NEXT:    s_lshl_b32 s0, s0, 5
+; GFX11-GISEL-NEXT:    s_and_b32 s1, s1, 0xfffff000
+; GFX11-GISEL-NEXT:    s_mov_b32 s33, s4
+; GFX11-GISEL-NEXT:    s_add_u32 s32, s1, s0
+; GFX11-GISEL-NEXT:    scratch_store_b32 off, v0, s1 dlc
+; GFX11-GISEL-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-GISEL-NEXT:    s_addk_i32 s32, 0xff00
+; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %idx = call i32 @llvm.amdgcn.workitem.id.x()
   %alloca = alloca i32, i32 %idx, align 128, addrspace(5)
   store volatile i32 444, ptr addrspace(5) %alloca
   ret void
 }
 
-; CHECK: in function test_dynamic_stackalloc{{.*}}: unsupported dynamic alloca
-
 define void @test_dynamic_stackalloc_device_divergent_under_aligned() {
+; GFX9-SDAG-LABEL: test_dynamic_stackalloc_device_divergent_under_aligned:
+; GFX9-SDAG:       ; %bb.0:
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT:    v_and_b32_e32 v0, 0x3ff, v31
+; GFX9-SDAG-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
+; GFX9-SDAG-NEXT:    s_mov_b32 s9, s33
+; GFX9-SDAG-NEXT:    v_and_b32_e32 v0, 0x1ff0, v0
+; GFX9-SDAG-NEXT:    s_mov_b64 s[4:5], exec
+; GFX9-SDAG-NEXT:    s_mov_b32 s6, 0
+; GFX9-SDAG-NEXT:    s_mov_b32 s33, s32
+; GFX9-SDAG-NEXT:    s_addk_i32 s32, 0x400
+; GFX9-SDAG-NEXT:  .LBB13_1: ; =>This Inner Loop Header: Depth=1
+; GFX9-SDAG-NEXT:    s_ff1_i32_b64 s7, s[4:5]
+; GFX9-SDAG-NEXT:    v_readlane_b32 s8, v0, s7
+; GFX9-SDAG-NEXT:    s_bitset0_b64 s[4:5], s7
+; GFX9-SDAG-NEXT:    s_max_u32 s6, s6, s8
+; GFX9-SDAG-NEXT:    s_cmp_lg_u64 s[4:5], 0
+; GFX9-SDAG-NEXT:    s_cbranch_scc1 .LBB13_1
+; GFX9-SDAG-NEXT:  ; %bb.2:
+; GFX9-SDAG-NEXT:    s_mov_b32 s4, s32
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-SDAG-NEXT:    v_lshl_add_u32 v0, s6, 6, v0
+; GFX9-SDAG-NEXT:    v_readfirstlane_b32 s32, v0
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v0, 0x29a
+; GFX9-SDAG-NEXT:    buffer_store_dword v0, off, s[0:3], s4
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT:    s_addk_i32 s32, 0xfc00
+; GFX9-SDAG-NEXT:    s_mov_b32 s33, s9
+; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: test_dynamic_stackalloc_device_divergent_under_aligned:
+; GFX9-GISEL:       ; %bb.0:
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v31
+; GFX9-GISEL-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
+; GFX9-GISEL-NEXT:    s_mov_b32 s9, s33
+; GFX9-GISEL-NEXT:    v_and_b32_e32 v0, -16, v0
+; GFX9-GISEL-NEXT:    s_mov_b64 s[4:5], exec
+; GFX9-GISEL-NEXT:    s_mov_b32 s6, 0
+; GFX9-GISEL-NEXT:    s_mov_b32 s33, s32
+; GFX9-GISEL-NEXT:    s_addk_i32 s32, 0x400
+; GFX9-GISEL-NEXT:  .LBB13_1: ; =>This Inner Loop Header: Depth=1
+; GFX9-GISEL-NEXT:    s_ff1_i32_b64 s7, s[4:5]
+; GFX9-GISEL-NEXT:    v_readlane_b32 s8, v0, s7
+; GFX9-GISEL-NEXT:    s_bitset0_b64 s[4:5], s7
+; GFX9-GISEL-NEXT:    s_max_u32 s6, s6, s8
+; GFX9-GISEL-NEXT:    s_cmp_lg_u64 s[4:5], 0
+; GFX9-GISEL-NEXT:    s_cbranch_scc1 .LBB13_1
+; GFX9-GISEL-NEXT:  ; %bb.2:
+; GFX9-GISEL-NEXT:    s_mov_b32 s4, s32
+; GFX9-GISEL-NEXT:    s_lshl_b32 s5, s6, 6
+; GFX9-GISEL-NEXT:    s_add_u32 s32, s4, s5
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, 0x29a
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, s4
+; GFX9-GISEL-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT:    s_addk_i32 s32, 0xfc00
+; GFX9-GISEL-NEXT:    s_mov_b32 s33, s9
+; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: test_dynamic_stackalloc_device_divergent_under_aligned:
+; GFX11-SDAG:       ; %bb.0:
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT:    v_and_b32_e32 v0, 0x3ff, v31
+; GFX11-SDAG-NEXT:    s_mov_b32 s4, s33
+; GFX11-SDAG-NEXT:    s_mov_b32 s1, exec_lo
+; GFX11-SDAG-NEXT:    s_mov_b32 s0, 0
+; GFX11-SDAG-NEXT:    s_mov_b32 s33, s32
+; GFX11-SDAG-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
+; GFX11-SDAG-NEXT:    s_add_i32 s32, s32, 16
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_and_b32_e32 v0, 0x1ff0, v0
+; GFX11-SDAG-NEXT:  .LBB13_1: ; =>This Inner Loop Header: Depth=1
+; GFX11-SDAG-NEXT:    s_ctz_i32_b32 s2, s1
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-NEXT:    v_readlane_b32 s3, v0, s2
+; GFX11-SDAG-NEXT:    s_bitset0_b32 s1, s2
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    s_max_u32 s0, s0, s3
+; GFX11-SDAG-NEXT:    s_cmp_lg_u32 s1, 0
+; GFX11-SDAG-NEXT:    s_cbranch_scc1 .LBB13_1
+; GFX11-SDAG-NEXT:  ; %bb.2:
+; GFX11-SDAG-NEXT:    s_mov_b32 s1, s32
+; GFX11-SDAG-NEXT:    v_mov_b32_e32 v1, 0x29a
+; GFX11-SDAG-NEXT:    v_lshl_add_u32 v0, s0, 5, s1
+; GFX11-SDAG-NEXT:    s_mov_b32 s33, s4
+; GFX11-SDAG-NEXT:    scratch_store_b32 off, v1, s1 dlc
+; GFX11-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-SDAG-NEXT:    v_readfirstlane_b32 s32, v0
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    s_add_i32 s32, s32, -16
+; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: test_dynamic_stackalloc_device_divergent_under_aligned:
+; GFX11-GISEL:       ; %bb.0:
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v31
+; GFX11-GISEL-NEXT:    s_mov_b32 s4, s33
+; GFX11-GISEL-NEXT:    s_mov_b32 s1, exec_lo
+; GFX11-GISEL-NEXT:    s_mov_b32 s0, 0
+; GFX11-GISEL-NEXT:    s_mov_b32 s33, s32
+; GFX11-GISEL-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
+; GFX11-GISEL-NEXT:    s_add_i32 s32, s32, 16
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, -16, v0
+; GFX11-GISEL-NEXT:  .LBB13_1: ; =>This Inner Loop Header: Depth=1
+; GFX11-GISEL-NEXT:    s_ctz_i32_b32 s2, s1
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-NEXT:    v_readlane_b32 s3, v0, s2
+; GFX11-GISEL-NEXT:    s_bitset0_b32 s1, s2
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    s_max_u32 s0, s0, s3
+; GFX11-GISEL-NEXT:    s_cmp_lg_u32 s1, 0
+; GFX11-GISEL-NEXT:    s_cbranch_scc1 .LBB13_1
+; GFX11-GISEL-NEXT:  ; %bb.2:
+; GFX11-GISEL-NEXT:    v_mov_b32_e32 v0, 0x29a
+; GFX11-GISEL-NEXT:    s_mov_b32 s1, s32
+; GFX11-GISEL-NEXT:    s_lshl_b32 s0, s0, 5
+; GFX11-GISEL-NEXT:    s_mov_b32 s33, s4
+; GFX11-GISEL-NEXT:    s_add_u32 s32, s1, s0
+; GFX11-GISEL-NEXT:    scratch_store_b32 off, v0, s1 dlc
+; GFX11-GISEL-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-GISEL-NEXT:    s_add_i32 s32, s32, -16
+; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %idx = call i32 @llvm.amdgcn.workitem.id.x()
   %alloca = alloca i32, i32 %idx, align 2, addrspace(5)
   store volatile i32 666, ptr addrspace(5) %alloca
   ret void
 }
 
-; CHECK: in function test_dynamic_stackalloc{{.*}}: unsupported dynamic alloca
-; CHECK: in function test_dynamic_stackalloc{{.*}}: unsupported dynamic alloca
-; CHECK: in function test_dynamic_stackalloc{{.*}}: unsupported dynamic alloca
-
 define void @test_dynamic_stackalloc_device_multiple_allocas(i32 %n, i32 %m) {
+; GFX9-SDAG-LABEL: test_dynamic_stackalloc_device_multiple_allocas:
+; GFX9-SDAG:       ; %bb.0: ; %entry
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT:    s_mov_b32 s13, s33
+; GFX9-SDAG-NEXT:    s_add_i32 s33, s32, 0xfc0
+; GFX9-SDAG-NEXT:    s_mov_b32 s8, 0
+; GFX9-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-SDAG-NEXT:    s_and_b32 s33, s33, 0xfffff000
+; GFX9-SDAG-NEXT:    s_addk_i32 s32, 0x3000
+; GFX9-SDAG-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GFX9-SDAG-NEXT:    s_cbranch_execz .LBB14_6
+; GFX9-SDAG-NEXT:  ; %bb.1: ; %bb.0
+; GFX9-SDAG-NEXT:    v_lshl_add_u32 v1, v1, 2, 15
+; GFX9-SDAG-NEXT:    v_and_b32_e32 v1, -16, v1
+; GFX9-SDAG-NEXT:    s_mov_b64 s[6:7], exec
+; GFX9-SDAG-NEXT:    s_mov_b32 s10, 0
+; GFX9-SDAG-NEXT:  .LBB14_2: ; =>This Inner Loop Header: Depth=1
+; GFX9-SDAG-NEXT:    s_ff1_i32_b64 s9, s[6:7]
+; GFX9-SDAG-NEXT:    v_readlane_b32 s11, v1, s9
+; GFX9-SDAG-NEXT:    s_bitset0_b64 s[6:7], s9
+; GFX9-SDAG-NEXT:    s_max_u32 s10, s10, s11
+; GFX9-SDAG-NEXT:    s_cmp_lg_u64 s[6:7], 0
+; GFX9-SDAG-NEXT:    s_cbranch_scc1 .LBB14_2
+; GFX9-SDAG-NEXT:  ; %bb.3:
+; GFX9-SDAG-NEXT:    s_add_i32 s6, s32, 0xfff
+; GFX9-SDAG-NEXT:    s_and_b32 s9, s6, 0xfffff000
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v1, s9
+; GFX9-SDAG-NEXT:    v_lshl_add_u32 v1, s10, 6, v1
+; GFX9-SDAG-NEXT:    v_readfirstlane_b32 s32, v1
+; GFX9-SDAG-NEXT:    v_and_b32_e32 v1, 0x3ff, v31
+; GFX9-SDAG-NEXT:    v_lshl_add_u32 v1, v1, 2, 15
+; GFX9-SDAG-NEXT:    v_and_b32_e32 v1, 0x1ff0, v1
+; GFX9-SDAG-NEXT:    s_mov_b64 s[6:7], exec
+; GFX9-SDAG-NEXT:    s_mov_b32 s10, 0
+; GFX9-SDAG-NEXT:  .LBB14_4: ; =>This Inner Loop Header: Depth=1
+; GFX9-SDAG-NEXT:    s_ff1_i32_b64 s11, s[6:7]
+; GFX9-SDAG-NEXT:    v_readlane_b32 s12, v1, s11
+; GFX9-SDAG-NEXT:    s_bitset0_b64 s[6:7], s11
+; GFX9-SDAG-NEXT:    s_max_u32 s10, s10, s12
+; GFX9-SDAG-NEXT:    s_cmp_lg_u64 s[6:7], 0
+; GFX9-SDAG-NEXT:    s_cbranch_scc1 .LBB14_4
+; GFX9-SDAG-NEXT:  ; %bb.5:
+; GFX9-SDAG-NEXT:    s_mov_b32 s6, s32
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v1, s6
+; GFX9-SDAG-NEXT:    v_lshl_add_u32 v1, s10, 6, v1
+; GFX9-SDAG-NEXT:    v_readfirstlane_b32 s32, v1
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v1, 3
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v2, s9
+; GFX9-SDAG-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v1, 4
+; GFX9-SDAG-NEXT:    buffer_store_dword v1, off, s[0:3], s6
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT:  .LBB14_6: ; %bb.1
+; GFX9-SDAG-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX9-SDAG-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v1, 2
+; GFX9-SDAG-NEXT:    v_and_b32_e32 v0, -16, v0
+; GFX9-SDAG-NEXT:    s_mov_b64 s[4:5], exec
+; GFX9-SDAG-NEXT:  .LBB14_7: ; =>This Inner Loop Header: Depth=1
+; GFX9-SDAG-NEXT:    s_ff1_i32_b64 s6, s[4:5]
+; GFX9-SDAG-NEXT:    v_readlane_b32 s7, v0, s6
+; GFX9-SDAG-NEXT:    s_bitset0_b64 s[4:5], s6
+; GFX9-SDAG-NEXT:    s_max_u32 s8, s8, s7
+; GFX9-SDAG-NEXT:    s_cmp_lg_u64 s[4:5], 0
+; GFX9-SDAG-NEXT:    s_cbranch_scc1 .LBB14_7
+; GFX9-SDAG-NEXT:  ; %bb.8:
+; GFX9-SDAG-NEXT:    s_mov_b32 s4, s32
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-SDAG-NEXT:    v_lshl_add_u32 v0, s8, 6, v0
+; GFX9-SDAG-NEXT:    v_readfirstlane_b32 s32, v0
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v0, 1
+; GFX9-SDAG-NEXT:    buffer_store_dword v0, off, s[0:3], s33
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT:    buffer_store_dword v1, off, s[0:3], s4
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT:    s_addk_i32 s32, 0xd000
+; GFX9-SDAG-NEXT:    s_mov_b32 s33, s13
+; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: test_dynamic_stackalloc_device_multiple_allocas:
+; GFX9-GISEL:       ; %bb.0: ; %entry
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:    s_mov_b32 s13, s33
+; GFX9-GISEL-NEXT:    s_add_i32 s33, s32, 0xfc0
+; GFX9-GISEL-NEXT:    s_mov_b32 s8, 0
+; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-GISEL-NEXT:    s_and_b32 s33, s33, 0xfffff000
+; GFX9-GISEL-NEXT:    s_addk_i32 s32, 0x3000
+; GFX9-GISEL-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GFX9-GISEL-NEXT:    s_cbranch_execz .LBB14_6
+; GFX9-GISEL-NEXT:  ; %bb.1: ; %bb.0
+; GFX9-GISEL-NEXT:    v_lshl_add_u32 v1, v1, 2, 15
+; GFX9-GISEL-NEXT:    v_and_b32_e32 v2, 0x3ff, v31
+; GFX9-GISEL-NEXT:    v_and_b32_e32 v1, -16, v1
+; GFX9-GISEL-NEXT:    s_mov_b64 s[6:7], exec
+; GFX9-GISEL-NEXT:    s_mov_b32 s9, 0
+; GFX9-GISEL-NEXT:  .LBB14_2: ; =>This Inner Loop Header: Depth=1
+; GFX9-GISEL-NEXT:    s_ff1_i32_b64 s10, s[6:7]
+; GFX9-GISEL-NEXT:    v_readlane_b32 s11, v1, s10
+; GFX9-GISEL-NEXT:    s_bitset0_b64 s[6:7], s10
+; GFX9-GISEL-NEXT:    s_max_u32 s9, s9, s11
+; GFX9-GISEL-NEXT:    s_cmp_lg_u64 s[6:7], 0
+; GFX9-GISEL-NEXT:    s_cbranch_scc1 .LBB14_2
+; GFX9-GISEL-NEXT:  ; %bb.3:
+; GFX9-GISEL-NEXT:    s_add_u32 s7, s32, 0xfff
+; GFX9-GISEL-NEXT:    s_lshl_b32 s6, s9, 6
+; GFX9-GISEL-NEXT:    s_and_b32 s9, s7, 0xfffff000
+; GFX9-GISEL-NEXT:    v_lshl_add_u32 v1, v2, 2, 15
+; GFX9-GISEL-NEXT:    s_add_u32 s32, s9, s6
+; GFX9-GISEL-NEXT:    v_and_b32_e32 v1, -16, v1
+; GFX9-GISEL-NEXT:    s_mov_b64 s[6:7], exec
+; GFX9-GISEL-NEXT:    s_mov_b32 s10, 0
+; GFX9-GISEL-NEXT:  .LBB14_4: ; =>This Inner Loop Header: Depth=1
+; GFX9-GISEL-NEXT:    s_ff1_i32_b64 s11, s[6:7]
+; GFX9-GISEL-NEXT:    v_readlane_b32 s12, v1, s11
+; GFX9-GISEL-NEXT:    s_bitset0_b64 s[6:7], s11
+; GFX9-GISEL-NEXT:    s_max_u32 s10, s10, s12
+; GFX9-GISEL-NEXT:    s_cmp_lg_u64 s[6:7], 0
+; GFX9-GISEL-NEXT:    s_cbranch_scc1 .LBB14_4
+; GFX9-GISEL-NEXT:  ; %bb.5:
+; GFX9-GISEL-NEXT:    s_mov_b32 s6, s32
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, 3
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, s9
+; GFX9-GISEL-NEXT:    s_lshl_b32 s7, s10, 6
+; GFX9-GISEL-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, 4
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, s6
+; GFX9-GISEL-NEXT:    s_add_u32 s32, s6, s7
+; GFX9-GISEL-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT:  .LBB14_6: ; %bb.1
+; GFX9-GISEL-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX9-GISEL-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
+; GFX9-GISEL-NEXT:    v_and_b32_e32 v0, -16, v0
+; GFX9-GISEL-NEXT:    s_mov_b64 s[4:5], exec
+; GFX9-GISEL-NEXT:  .LBB14_7: ; =>This Inner Loop Header: Depth=1
+; GFX9-GISEL-NEXT:    s_ff1_i32_b64 s6, s[4:5]
+; GFX9-GISEL-NEXT:    v_readlane_b32 s7, v0, s6
+; GFX9-GISEL-NEXT:    s_bitset0_b64 s[4:5], s6
+; GFX9-GISEL-NEXT:    s_max_u32 s8, s8, s7
+; GFX9-GISEL-NEXT:    s_cmp_lg_u64 s[4:5], 0
+; GFX9-GISEL-NEXT:    s_cbranch_scc1 .LBB14_7
+; GFX9-GISEL-NEXT:  ; %bb.8:
+; GFX9-GISEL-NEXT:    s_mov_b32 s4, s32
+; GFX9-GISEL-NEXT:    s_lshl_b32 s5, s8, 6
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, 1
+; GFX9-GISEL-NEXT:    s_add_u32 s32, s4, s5
+; GFX9-GISEL-NEXT:    buffer_store_dword v0, off, s[0:3], s33
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, 2
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, s4
+; GFX9-GISEL-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT:    s_addk_i32 s32, 0xd000
+; GFX9-GISEL-NEXT:    s_mov_b32 s33, s13
+; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: test_dynamic_stackalloc_device_multiple_allocas:
+; GFX11-SDAG:       ; %bb.0: ; %entry
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT:    s_mov_b32 s7, s33
+; GFX11-SDAG-NEXT:    s_add_i32 s33, s32, 63
+; GFX11-SDAG-NEXT:    s_mov_b32 s0, 0
+; GFX11-SDAG-NEXT:    s_mov_b32 s1, exec_lo
+; GFX11-SDAG-NEXT:    s_and_not1_b32 s33, s33, 63
+; GFX11-SDAG-NEXT:    s_addk_i32 s32, 0xc0
+; GFX11-SDAG-NEXT:    v_cmpx_eq_u32_e32 0, v0
+; GFX11-SDAG-NEXT:    s_cbranch_execz .LBB14_6
+; GFX11-SDAG-NEXT:  ; %bb.1: ; %bb.0
+; GFX11-SDAG-NEXT:    v_lshl_add_u32 v1, v1, 2, 15
+; GFX11-SDAG-NEXT:    s_mov_b32 s2, exec_lo
+; GFX11-SDAG-NEXT:    s_mov_b32 s3, 0
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_and_b32_e32 v1, -16, v1
+; GFX11-SDAG-NEXT:  .LBB14_2: ; =>This Inner Loop Header: Depth=1
+; GFX11-SDAG-NEXT:    s_ctz_i32_b32 s4, s2
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-NEXT:    v_readlane_b32 s5, v1, s4
+; GFX11-SDAG-NEXT:    s_bitset0_b32 s2, s4
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    s_max_u32 s3, s3, s5
+; GFX11-SDAG-NEXT:    s_cmp_lg_u32 s2, 0
+; GFX11-SDAG-NEXT:    s_cbranch_scc1 .LBB14_2
+; GFX11-SDAG-NEXT:  ; %bb.3:
+; GFX11-SDAG-NEXT:    v_and_b32_e32 v1, 0x3ff, v31
+; GFX11-SDAG-NEXT:    s_add_i32 s2, s32, 0x7ff
+; GFX11-SDAG-NEXT:    s_mov_b32 s4, exec_lo
+; GFX11-SDAG-NEXT:    s_and_b32 s2, s2, 0xfffff800
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-SDAG-NEXT:    v_lshl_add_u32 v2, s3, 5, s2
+; GFX11-SDAG-NEXT:    v_lshl_add_u32 v1, v1, 2, 15
+; GFX11-SDAG-NEXT:    s_mov_b32 s3, 0
+; GFX11-SDAG-NEXT:    v_readfirstlane_b32 s32, v2
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-SDAG-NEXT:    v_and_b32_e32 v1, 0x1ff0, v1
+; GFX11-SDAG-NEXT:  .LBB14_4: ; =>This Inner Loop Header: Depth=1
+; GFX11-SDAG-NEXT:    s_ctz_i32_b32 s5, s4
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-NEXT:    v_readlane_b32 s6, v1, s5
+; GFX11-SDAG-NEXT:    s_bitset0_b32 s4, s5
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    s_max_u32 s3, s3, s6
+; GFX11-SDAG-NEXT:    s_cmp_lg_u32 s4, 0
+; GFX11-SDAG-NEXT:    s_cbranch_scc1 .LBB14_4
+; GFX11-SDAG-NEXT:  ; %bb.5:
+; GFX11-SDAG-NEXT:    s_mov_b32 s4, s32
+; GFX11-SDAG-NEXT:    v_dual_mov_b32 v2, 3 :: v_dual_mov_b32 v3, 4
+; GFX11-SDAG-NEXT:    v_lshl_add_u32 v1, s3, 5, s4
+; GFX11-SDAG-NEXT:    scratch_store_b32 off, v2, s2 dlc
+; GFX11-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-SDAG-NEXT:    scratch_store_b32 off, v3, s4 dlc
+; GFX11-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-SDAG-NEXT:    v_readfirstlane_b32 s32, v1
+; GFX11-SDAG-NEXT:  .LBB14_6: ; %bb.1
+; GFX11-SDAG-NEXT:    s_or_b32 exec_lo, exec_lo, s1
+; GFX11-SDAG-NEXT:    v_lshl_add_u32 v1, v0, 2, 15
+; GFX11-SDAG-NEXT:    v_mov_b32_e32 v0, 2
+; GFX11-SDAG-NEXT:    s_mov_b32 s1, exec_lo
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-SDAG-NEXT:    v_and_b32_e32 v1, -16, v1
+; GFX11-SDAG-NEXT:  .LBB14_7: ; =>This Inner Loop Header: Depth=1
+; GFX11-SDAG-NEXT:    s_ctz_i32_b32 s2, s1
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-NEXT:    v_readlane_b32 s3, v1, s2
+; GFX11-SDAG-NEXT:    s_bitset0_b32 s1, s2
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    s_max_u32 s0, s0, s3
+; GFX11-SDAG-NEXT:    s_cmp_lg_u32 s1, 0
+; GFX11-SDAG-NEXT:    s_cbranch_scc1 .LBB14_7
+; GFX11-SDAG-NEXT:  ; %bb.8:
+; GFX11-SDAG-NEXT:    s_mov_b32 s1, s32
+; GFX11-SDAG-NEXT:    v_mov_b32_e32 v2, 1
+; GFX11-SDAG-NEXT:    v_lshl_add_u32 v1, s0, 5, s1
+; GFX11-SDAG-NEXT:    scratch_store_b32 off, v2, s33 dlc
+; GFX11-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-SDAG-NEXT:    scratch_store_b32 off, v0, s1 dlc
+; GFX11-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-SDAG-NEXT:    v_readfirstlane_b32 s32, v1
+; GFX11-SDAG-NEXT:    s_mov_b32 s33, s7
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    s_addk_i32 s32, 0xff40
+; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: test_dynamic_stackalloc_device_multiple_allocas:
+; GFX11-GISEL:       ; %bb.0: ; %entry
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT:    s_mov_b32 s7, s33
+; GFX11-GISEL-NEXT:    s_add_i32 s33, s32, 63
+; GFX11-GISEL-NEXT:    s_mov_b32 s0, 0
+; GFX11-GISEL-NEXT:    s_mov_b32 s1, exec_lo
+; GFX11-GISEL-NEXT:    s_and_not1_b32 s33, s33, 63
+; GFX11-GISEL-NEXT:    s_addk_i32 s32, 0xc0
+; GFX11-GISEL-NEXT:    v_cmpx_eq_u32_e32 0, v0
+; GFX11-GISEL-NEXT:    s_cbranch_execz .LBB14_6
+; GFX11-GISEL-NEXT:  ; %bb.1: ; %bb.0
+; GFX11-GISEL-NEXT:    v_lshl_add_u32 v2, v1, 2, 15
+; GFX11-GISEL-NEXT:    v_and_b32_e32 v1, 0x3ff, v31
+; GFX11-GISEL-NEXT:    s_mov_b32 s3, exec_lo
+; GFX11-GISEL-NEXT:    s_mov_b32 s2, 0
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-GISEL-NEXT:    v_and_b32_e32 v2, -16, v2
+; GFX11-GISEL-NEXT:  .LBB14_2: ; =>This Inner Loop Header: Depth=1
+; GFX11-GISEL-NEXT:    s_ctz_i32_b32 s4, s3
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-NEXT:    v_readlane_b32 s5, v2, s4
+; GFX11-GISEL-NEXT:    s_bitset0_b32 s3, s4
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    s_max_u32 s2, s2, s5
+; GFX11-GISEL-NEXT:    s_cmp_lg_u32 s3, 0
+; GFX11-GISEL-NEXT:    s_cbranch_scc1 .LBB14_2
+; GFX11-GISEL-NEXT:  ; %bb.3:
+; GFX11-GISEL-NEXT:    v_lshl_add_u32 v1, v1, 2, 15
+; GFX11-GISEL-NEXT:    s_lshl_b32 s5, s2, 5
+; GFX11-GISEL-NEXT:    s_add_u32 s2, s32, 0x7ff
+; GFX11-GISEL-NEXT:    s_mov_b32 s4, exec_lo
+; GFX11-GISEL-NEXT:    s_and_b32 s2, s2, 0xfffff800
+; GFX11-GISEL-NEXT:    v_and_b32_e32 v1, -16, v1
+; GFX11-GISEL-NEXT:    s_mov_b32 s3, 0
+; GFX11-GISEL-NEXT:    s_add_u32 s32, s2, s5
+; GFX11-GISEL-NEXT:  .LBB14_4: ; =>This Inner Loop Header: Depth=1
+; GFX11-GISEL-NEXT:    s_ctz_i32_b32 s5, s4
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-NEXT:    v_readlane_b32 s6, v1, s5
+; GFX11-GISEL-NEXT:    s_bitset0_b32 s4, s5
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    s_max_u32 s3, s3, s6
+; GFX11-GISEL-NEXT:    s_cmp_lg_u32 s4, 0
+; GFX11-GISEL-NEXT:    s_cbranch_scc1 .LBB14_4
+; GFX11-GISEL-NEXT:  ; %bb.5:
+; GFX11-GISEL-NEXT:    v_dual_mov_b32 v1, 3 :: v_dual_mov_b32 v2, 4
+; GFX11-GISEL-NEXT:    s_mov_b32 s4, s32
+; GFX11-GISEL-NEXT:    s_lshl_b32 s3, s3, 5
+; GFX11-GISEL-NEXT:    scratch_store_b32 off, v1, s2 dlc
+; GFX11-GISEL-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-GISEL-NEXT:    scratch_store_b32 off, v2, s4 dlc
+; GFX11-GISEL-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-GISEL-NEXT:    s_add_u32 s32, s4, s3
+; GFX11-GISEL-NEXT:  .LBB14_6: ; %bb.1
+; GFX11-GISEL-NEXT:    s_or_b32 exec_lo, exec_lo, s1
+; GFX11-GISEL-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
+; GFX11-GISEL-NEXT:    s_mov_b32 s1, exec_lo
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, -16, v0
+; GFX11-GISEL-NEXT:  .LBB14_7: ; =>This Inner Loop Header: Depth=1
+; GFX11-GISEL-NEXT:    s_ctz_i32_b32 s2, s1
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-NEXT:    v_readlane_b32 s3, v0, s2
+; GFX11-GISEL-NEXT:    s_bitset0_b32 s1, s2
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    s_max_u32 s0, s0, s3
+; GFX11-GISEL-NEXT:    s_cmp_lg_u32 s1, 0
+; GFX11-GISEL-NEXT:    s_cbranch_scc1 .LBB14_7
+; GFX11-GISEL-NEXT:  ; %bb.8:
+; GFX11-GISEL-NEXT:    v_dual_mov_b32 v0, 1 :: v_dual_mov_b32 v1, 2
+; GFX11-GISEL-NEXT:    s_mov_b32 s1, s32
+; GFX11-GISEL-NEXT:    s_lshl_b32 s0, s0, 5
+; GFX11-GISEL-NEXT:    scratch_store_b32 off, v0, s33 dlc
+; GFX11-GISEL-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-GISEL-NEXT:    scratch_store_b32 off, v1, s1 dlc
+; GFX11-GISEL-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-GISEL-NEXT:    s_add_u32 s32, s1, s0
+; GFX11-GISEL-NEXT:    s_mov_b32 s33, s7
+; GFX11-GISEL-NEXT:    s_addk_i32 s32, 0xff40
+; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %cond = icmp eq i32 %n, 0
   %alloca1 = alloca i32, i32 8, addrspace(5)
@@ -171,10 +2183,272 @@ bb.1:
   ret void
 }
 
-; CHECK: in function test_dynamic_stackalloc{{.*}}: unsupported dynamic alloca
-; CHECK: in function test_dynamic_stackalloc{{.*}}: unsupported dynamic alloca
-
 define void @test_dynamic_stackalloc_device_control_flow(i32 %n, i32 %m) {
+; GFX9-SDAG-LABEL: test_dynamic_stackalloc_device_control_flow:
+; GFX9-SDAG:       ; %bb.0: ; %entry
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT:    s_mov_b32 s11, s33
+; GFX9-SDAG-NEXT:    s_add_i32 s33, s32, 0xfc0
+; GFX9-SDAG-NEXT:    s_mov_b32 s8, 0
+; GFX9-SDAG-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
+; GFX9-SDAG-NEXT:    s_and_b32 s33, s33, 0xfffff000
+; GFX9-SDAG-NEXT:    s_addk_i32 s32, 0x2000
+; GFX9-SDAG-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GFX9-SDAG-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
+; GFX9-SDAG-NEXT:    s_cbranch_execz .LBB15_4
+; GFX9-SDAG-NEXT:  ; %bb.1: ; %bb.1
+; GFX9-SDAG-NEXT:    v_lshl_add_u32 v1, v1, 2, 15
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v0, 2
+; GFX9-SDAG-NEXT:    v_and_b32_e32 v1, -16, v1
+; GFX9-SDAG-NEXT:    s_mov_b64 s[6:7], exec
+; GFX9-SDAG-NEXT:  .LBB15_2: ; =>This Inner Loop Header: Depth=1
+; GFX9-SDAG-NEXT:    s_ff1_i32_b64 s9, s[6:7]
+; GFX9-SDAG-NEXT:    v_readlane_b32 s10, v1, s9
+; GFX9-SDAG-NEXT:    s_bitset0_b64 s[6:7], s9
+; GFX9-SDAG-NEXT:    s_max_u32 s8, s8, s10
+; GFX9-SDAG-NEXT:    s_cmp_lg_u64 s[6:7], 0
+; GFX9-SDAG-NEXT:    s_cbranch_scc1 .LBB15_2
+; GFX9-SDAG-NEXT:  ; %bb.3:
+; GFX9-SDAG-NEXT:    s_add_i32 s6, s32, 0xfff
+; GFX9-SDAG-NEXT:    s_and_b32 s6, s6, 0xfffff000
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v1, s6
+; GFX9-SDAG-NEXT:    v_lshl_add_u32 v2, s8, 6, v1
+; GFX9-SDAG-NEXT:    v_readfirstlane_b32 s32, v2
+; GFX9-SDAG-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT:    ; implicit-def: $vgpr31
+; GFX9-SDAG-NEXT:  .LBB15_4: ; %Flow
+; GFX9-SDAG-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GFX9-SDAG-NEXT:    s_cbranch_execz .LBB15_8
+; GFX9-SDAG-NEXT:  ; %bb.5: ; %bb.0
+; GFX9-SDAG-NEXT:    v_and_b32_e32 v0, 0x3ff, v31
+; GFX9-SDAG-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
+; GFX9-SDAG-NEXT:    v_and_b32_e32 v0, 0x1ff0, v0
+; GFX9-SDAG-NEXT:    s_mov_b64 s[6:7], exec
+; GFX9-SDAG-NEXT:    s_mov_b32 s8, 0
+; GFX9-SDAG-NEXT:  .LBB15_6: ; =>This Inner Loop Header: Depth=1
+; GFX9-SDAG-NEXT:    s_ff1_i32_b64 s9, s[6:7]
+; GFX9-SDAG-NEXT:    v_readlane_b32 s10, v0, s9
+; GFX9-SDAG-NEXT:    s_bitset0_b64 s[6:7], s9
+; GFX9-SDAG-NEXT:    s_max_u32 s8, s8, s10
+; GFX9-SDAG-NEXT:    s_cmp_lg_u64 s[6:7], 0
+; GFX9-SDAG-NEXT:    s_cbranch_scc1 .LBB15_6
+; GFX9-SDAG-NEXT:  ; %bb.7:
+; GFX9-SDAG-NEXT:    s_mov_b32 s6, s32
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v0, s6
+; GFX9-SDAG-NEXT:    v_lshl_add_u32 v0, s8, 6, v0
+; GFX9-SDAG-NEXT:    v_readfirstlane_b32 s32, v0
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v0, 1
+; GFX9-SDAG-NEXT:    buffer_store_dword v0, off, s[0:3], s6
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT:  .LBB15_8: ; %bb.2
+; GFX9-SDAG-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX9-SDAG-NEXT:    s_addk_i32 s32, 0xe000
+; GFX9-SDAG-NEXT:    s_mov_b32 s33, s11
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: test_dynamic_stackalloc_device_control_flow:
+; GFX9-GISEL:       ; %bb.0: ; %entry
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:    s_mov_b32 s11, s33
+; GFX9-GISEL-NEXT:    s_add_i32 s33, s32, 0xfc0
+; GFX9-GISEL-NEXT:    s_mov_b32 s8, 0
+; GFX9-GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
+; GFX9-GISEL-NEXT:    s_and_b32 s33, s33, 0xfffff000
+; GFX9-GISEL-NEXT:    s_addk_i32 s32, 0x2000
+; GFX9-GISEL-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GFX9-GISEL-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
+; GFX9-GISEL-NEXT:    s_cbranch_execz .LBB15_4
+; GFX9-GISEL-NEXT:  ; %bb.1: ; %bb.1
+; GFX9-GISEL-NEXT:    v_lshl_add_u32 v0, v1, 2, 15
+; GFX9-GISEL-NEXT:    v_and_b32_e32 v0, -16, v0
+; GFX9-GISEL-NEXT:    s_mov_b64 s[6:7], exec
+; GFX9-GISEL-NEXT:  .LBB15_2: ; =>This Inner Loop Header: Depth=1
+; GFX9-GISEL-NEXT:    s_ff1_i32_b64 s9, s[6:7]
+; GFX9-GISEL-NEXT:    v_readlane_b32 s10, v0, s9
+; GFX9-GISEL-NEXT:    s_bitset0_b64 s[6:7], s9
+; GFX9-GISEL-NEXT:    s_max_u32 s8, s8, s10
+; GFX9-GISEL-NEXT:    s_cmp_lg_u64 s[6:7], 0
+; GFX9-GISEL-NEXT:    s_cbranch_scc1 .LBB15_2
+; GFX9-GISEL-NEXT:  ; %bb.3:
+; GFX9-GISEL-NEXT:    s_add_u32 s7, s32, 0xfff
+; GFX9-GISEL-NEXT:    s_and_b32 s7, s7, 0xfffff000
+; GFX9-GISEL-NEXT:    s_lshl_b32 s6, s8, 6
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, 2
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, s7
+; GFX9-GISEL-NEXT:    s_add_u32 s32, s7, s6
+; GFX9-GISEL-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT:    ; implicit-def: $vgpr31
+; GFX9-GISEL-NEXT:  .LBB15_4: ; %Flow
+; GFX9-GISEL-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GFX9-GISEL-NEXT:    s_cbranch_execz .LBB15_8
+; GFX9-GISEL-NEXT:  ; %bb.5: ; %bb.0
+; GFX9-GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v31
+; GFX9-GISEL-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
+; GFX9-GISEL-NEXT:    v_and_b32_e32 v0, -16, v0
+; GFX9-GISEL-NEXT:    s_mov_b64 s[6:7], exec
+; GFX9-GISEL-NEXT:    s_mov_b32 s8, 0
+; GFX9-GISEL-NEXT:  .LBB15_6: ; =>This Inner Loop Header: Depth=1
+; GFX9-GISEL-NEXT:    s_ff1_i32_b64 s9, s[6:7]
+; GFX9-GISEL-NEXT:    v_readlane_b32 s10, v0, s9
+; GFX9-GISEL-NEXT:    s_bitset0_b64 s[6:7], s9
+; GFX9-GISEL-NEXT:    s_max_u32 s8, s8, s10
+; GFX9-GISEL-NEXT:    s_cmp_lg_u64 s[6:7], 0
+; GFX9-GISEL-NEXT:    s_cbranch_scc1 .LBB15_6
+; GFX9-GISEL-NEXT:  ; %bb.7:
+; GFX9-GISEL-NEXT:    s_mov_b32 s6, s32
+; GFX9-GISEL-NEXT:    s_lshl_b32 s7, s8, 6
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, 1
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, s6
+; GFX9-GISEL-NEXT:    s_add_u32 s32, s6, s7
+; GFX9-GISEL-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT:  .LBB15_8: ; %bb.2
+; GFX9-GISEL-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX9-GISEL-NEXT:    s_addk_i32 s32, 0xe000
+; GFX9-GISEL-NEXT:    s_mov_b32 s33, s11
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: test_dynamic_stackalloc_device_control_flow:
+; GFX11-SDAG:       ; %bb.0: ; %entry
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT:    s_mov_b32 s5, s33
+; GFX11-SDAG-NEXT:    s_add_i32 s33, s32, 63
+; GFX11-SDAG-NEXT:    s_mov_b32 s1, 0
+; GFX11-SDAG-NEXT:    s_mov_b32 s0, exec_lo
+; GFX11-SDAG-NEXT:    s_and_not1_b32 s33, s33, 63
+; GFX11-SDAG-NEXT:    s_addk_i32 s32, 0x80
+; GFX11-SDAG-NEXT:    v_cmpx_ne_u32_e32 0, v0
+; GFX11-SDAG-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-SDAG-NEXT:    s_cbranch_execz .LBB15_4
+; GFX11-SDAG-NEXT:  ; %bb.1: ; %bb.1
+; GFX11-SDAG-NEXT:    v_lshl_add_u32 v1, v1, 2, 15
+; GFX11-SDAG-NEXT:    v_mov_b32_e32 v0, 2
+; GFX11-SDAG-NEXT:    s_mov_b32 s2, exec_lo
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-SDAG-NEXT:    v_and_b32_e32 v1, -16, v1
+; GFX11-SDAG-NEXT:  .LBB15_2: ; =>This Inner Loop Header: Depth=1
+; GFX11-SDAG-NEXT:    s_ctz_i32_b32 s3, s2
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-NEXT:    v_readlane_b32 s4, v1, s3
+; GFX11-SDAG-NEXT:    s_bitset0_b32 s2, s3
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    s_max_u32 s1, s1, s4
+; GFX11-SDAG-NEXT:    s_cmp_lg_u32 s2, 0
+; GFX11-SDAG-NEXT:    s_cbranch_scc1 .LBB15_2
+; GFX11-SDAG-NEXT:  ; %bb.3:
+; GFX11-SDAG-NEXT:    s_add_i32 s2, s32, 0x7ff
+; GFX11-SDAG-NEXT:    ; implicit-def: $vgpr31
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-NEXT:    s_and_b32 s2, s2, 0xfffff800
+; GFX11-SDAG-NEXT:    v_lshl_add_u32 v1, s1, 5, s2
+; GFX11-SDAG-NEXT:    scratch_store_b32 off, v0, s2 dlc
+; GFX11-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-SDAG-NEXT:    v_readfirstlane_b32 s32, v1
+; GFX11-SDAG-NEXT:  .LBB15_4: ; %Flow
+; GFX11-SDAG-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-SDAG-NEXT:    s_cbranch_execz .LBB15_8
+; GFX11-SDAG-NEXT:  ; %bb.5: ; %bb.0
+; GFX11-SDAG-NEXT:    v_and_b32_e32 v0, 0x3ff, v31
+; GFX11-SDAG-NEXT:    s_mov_b32 s2, exec_lo
+; GFX11-SDAG-NEXT:    s_mov_b32 s1, 0
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
+; GFX11-SDAG-NEXT:    v_and_b32_e32 v0, 0x1ff0, v0
+; GFX11-SDAG-NEXT:  .LBB15_6: ; =>This Inner Loop Header: Depth=1
+; GFX11-SDAG-NEXT:    s_ctz_i32_b32 s3, s2
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-NEXT:    v_readlane_b32 s4, v0, s3
+; GFX11-SDAG-NEXT:    s_bitset0_b32 s2, s3
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    s_max_u32 s1, s1, s4
+; GFX11-SDAG-NEXT:    s_cmp_lg_u32 s2, 0
+; GFX11-SDAG-NEXT:    s_cbranch_scc1 .LBB15_6
+; GFX11-SDAG-NEXT:  ; %bb.7:
+; GFX11-SDAG-NEXT:    s_mov_b32 s2, s32
+; GFX11-SDAG-NEXT:    v_mov_b32_e32 v1, 1
+; GFX11-SDAG-NEXT:    v_lshl_add_u32 v0, s1, 5, s2
+; GFX11-SDAG-NEXT:    scratch_store_b32 off, v1, s2 dlc
+; GFX11-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-SDAG-NEXT:    v_readfirstlane_b32 s32, v0
+; GFX11-SDAG-NEXT:  .LBB15_8: ; %bb.2
+; GFX11-SDAG-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    s_addk_i32 s32, 0xff80
+; GFX11-SDAG-NEXT:    s_mov_b32 s33, s5
+; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: test_dynamic_stackalloc_device_control_flow:
+; GFX11-GISEL:       ; %bb.0: ; %entry
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT:    s_mov_b32 s5, s33
+; GFX11-GISEL-NEXT:    s_add_i32 s33, s32, 63
+; GFX11-GISEL-NEXT:    s_mov_b32 s1, 0
+; GFX11-GISEL-NEXT:    s_mov_b32 s0, exec_lo
+; GFX11-GISEL-NEXT:    s_and_not1_b32 s33, s33, 63
+; GFX11-GISEL-NEXT:    s_addk_i32 s32, 0x80
+; GFX11-GISEL-NEXT:    v_cmpx_ne_u32_e32 0, v0
+; GFX11-GISEL-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-GISEL-NEXT:    s_cbranch_execz .LBB15_4
+; GFX11-GISEL-NEXT:  ; %bb.1: ; %bb.1
+; GFX11-GISEL-NEXT:    v_lshl_add_u32 v0, v1, 2, 15
+; GFX11-GISEL-NEXT:    s_mov_b32 s2, exec_lo
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, -16, v0
+; GFX11-GISEL-NEXT:  .LBB15_2: ; =>This Inner Loop Header: Depth=1
+; GFX11-GISEL-NEXT:    s_ctz_i32_b32 s3, s2
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-NEXT:    v_readlane_b32 s4, v0, s3
+; GFX11-GISEL-NEXT:    s_bitset0_b32 s2, s3
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    s_max_u32 s1, s1, s4
+; GFX11-GISEL-NEXT:    s_cmp_lg_u32 s2, 0
+; GFX11-GISEL-NEXT:    s_cbranch_scc1 .LBB15_2
+; GFX11-GISEL-NEXT:  ; %bb.3:
+; GFX11-GISEL-NEXT:    v_mov_b32_e32 v0, 2
+; GFX11-GISEL-NEXT:    s_add_u32 s2, s32, 0x7ff
+; GFX11-GISEL-NEXT:    s_lshl_b32 s1, s1, 5
+; GFX11-GISEL-NEXT:    s_and_b32 s2, s2, 0xfffff800
+; GFX11-GISEL-NEXT:    ; implicit-def: $vgpr31
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-GISEL-NEXT:    s_add_u32 s32, s2, s1
+; GFX11-GISEL-NEXT:    scratch_store_b32 off, v0, s2 dlc
+; GFX11-GISEL-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-GISEL-NEXT:  .LBB15_4: ; %Flow
+; GFX11-GISEL-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-GISEL-NEXT:    s_cbranch_execz .LBB15_8
+; GFX11-GISEL-NEXT:  ; %bb.5: ; %bb.0
+; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v31
+; GFX11-GISEL-NEXT:    s_mov_b32 s2, exec_lo
+; GFX11-GISEL-NEXT:    s_mov_b32 s1, 0
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
+; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, -16, v0
+; GFX11-GISEL-NEXT:  .LBB15_6: ; =>This Inner Loop Header: Depth=1
+; GFX11-GISEL-NEXT:    s_ctz_i32_b32 s3, s2
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-NEXT:    v_readlane_b32 s4, v0, s3
+; GFX11-GISEL-NEXT:    s_bitset0_b32 s2, s3
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    s_max_u32 s1, s1, s4
+; GFX11-GISEL-NEXT:    s_cmp_lg_u32 s2, 0
+; GFX11-GISEL-NEXT:    s_cbranch_scc1 .LBB15_6
+; GFX11-GISEL-NEXT:  ; %bb.7:
+; GFX11-GISEL-NEXT:    v_mov_b32_e32 v0, 1
+; GFX11-GISEL-NEXT:    s_mov_b32 s2, s32
+; GFX11-GISEL-NEXT:    s_lshl_b32 s1, s1, 5
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-GISEL-NEXT:    s_add_u32 s32, s2, s1
+; GFX11-GISEL-NEXT:    scratch_store_b32 off, v0, s2 dlc
+; GFX11-GISEL-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-GISEL-NEXT:  .LBB15_8: ; %bb.2
+; GFX11-GISEL-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-GISEL-NEXT:    s_addk_i32 s32, 0xff80
+; GFX11-GISEL-NEXT:    s_mov_b32 s33, s5
+; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %cond = icmp eq i32 %n, 0
   br i1 %cond, label %bb.0, label %bb.1
@@ -190,3 +2464,257 @@ bb.1:
 bb.2:
   ret void
 }
+
+define void @test_dynamic_stackalloc_device_divergent_non_standard_size_i16(i16 %n) {
+; GFX9-SDAG-LABEL: test_dynamic_stackalloc_device_divergent_non_standard_size_i16:
+; GFX9-SDAG:       ; %bb.0:
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX9-SDAG-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
+; GFX9-SDAG-NEXT:    s_mov_b32 s9, s33
+; GFX9-SDAG-NEXT:    v_and_b32_e32 v0, 0x7fff0, v0
+; GFX9-SDAG-NEXT:    s_mov_b64 s[4:5], exec
+; GFX9-SDAG-NEXT:    s_mov_b32 s6, 0
+; GFX9-SDAG-NEXT:    s_mov_b32 s33, s32
+; GFX9-SDAG-NEXT:    s_addk_i32 s32, 0x400
+; GFX9-SDAG-NEXT:  .LBB16_1: ; =>This Inner Loop Header: Depth=1
+; GFX9-SDAG-NEXT:    s_ff1_i32_b64 s7, s[4:5]
+; GFX9-SDAG-NEXT:    v_readlane_b32 s8, v0, s7
+; GFX9-SDAG-NEXT:    s_bitset0_b64 s[4:5], s7
+; GFX9-SDAG-NEXT:    s_max_u32 s6, s6, s8
+; GFX9-SDAG-NEXT:    s_cmp_lg_u64 s[4:5], 0
+; GFX9-SDAG-NEXT:    s_cbranch_scc1 .LBB16_1
+; GFX9-SDAG-NEXT:  ; %bb.2:
+; GFX9-SDAG-NEXT:    s_mov_b32 s4, s32
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-SDAG-NEXT:    v_lshl_add_u32 v0, s6, 6, v0
+; GFX9-SDAG-NEXT:    v_readfirstlane_b32 s32, v0
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v0, 0x29a
+; GFX9-SDAG-NEXT:    buffer_store_dword v0, off, s[0:3], s4
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT:    s_addk_i32 s32, 0xfc00
+; GFX9-SDAG-NEXT:    s_mov_b32 s33, s9
+; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: test_dynamic_stackalloc_device_divergent_non_standard_size_i16:
+; GFX9-GISEL:       ; %bb.0:
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX9-GISEL-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
+; GFX9-GISEL-NEXT:    s_mov_b32 s9, s33
+; GFX9-GISEL-NEXT:    v_and_b32_e32 v0, -16, v0
+; GFX9-GISEL-NEXT:    s_mov_b64 s[4:5], exec
+; GFX9-GISEL-NEXT:    s_mov_b32 s6, 0
+; GFX9-GISEL-NEXT:    s_mov_b32 s33, s32
+; GFX9-GISEL-NEXT:    s_addk_i32 s32, 0x400
+; GFX9-GISEL-NEXT:  .LBB16_1: ; =>This Inner Loop Header: Depth=1
+; GFX9-GISEL-NEXT:    s_ff1_i32_b64 s7, s[4:5]
+; GFX9-GISEL-NEXT:    v_readlane_b32 s8, v0, s7
+; GFX9-GISEL-NEXT:    s_bitset0_b64 s[4:5], s7
+; GFX9-GISEL-NEXT:    s_max_u32 s6, s6, s8
+; GFX9-GISEL-NEXT:    s_cmp_lg_u64 s[4:5], 0
+; GFX9-GISEL-NEXT:    s_cbranch_scc1 .LBB16_1
+; GFX9-GISEL-NEXT:  ; %bb.2:
+; GFX9-GISEL-NEXT:    s_mov_b32 s4, s32
+; GFX9-GISEL-NEXT:    s_lshl_b32 s5, s6, 6
+; GFX9-GISEL-NEXT:    s_add_u32 s32, s4, s5
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, 0x29a
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, s4
+; GFX9-GISEL-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT:    s_addk_i32 s32, 0xfc00
+; GFX9-GISEL-NEXT:    s_mov_b32 s33, s9
+; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: test_dynamic_stackalloc_device_divergent_non_standard_size_i16:
+; GFX11-SDAG:       ; %bb.0:
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-SDAG-NEXT:    s_mov_b32 s4, s33
+; GFX11-SDAG-NEXT:    s_mov_b32 s1, exec_lo
+; GFX11-SDAG-NEXT:    s_mov_b32 s0, 0
+; GFX11-SDAG-NEXT:    s_mov_b32 s33, s32
+; GFX11-SDAG-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
+; GFX11-SDAG-NEXT:    s_add_i32 s32, s32, 16
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_and_b32_e32 v0, 0x7fff0, v0
+; GFX11-SDAG-NEXT:  .LBB16_1: ; =>This Inner Loop Header: Depth=1
+; GFX11-SDAG-NEXT:    s_ctz_i32_b32 s2, s1
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-NEXT:    v_readlane_b32 s3, v0, s2
+; GFX11-SDAG-NEXT:    s_bitset0_b32 s1, s2
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    s_max_u32 s0, s0, s3
+; GFX11-SDAG-NEXT:    s_cmp_lg_u32 s1, 0
+; GFX11-SDAG-NEXT:    s_cbranch_scc1 .LBB16_1
+; GFX11-SDAG-NEXT:  ; %bb.2:
+; GFX11-SDAG-NEXT:    s_mov_b32 s1, s32
+; GFX11-SDAG-NEXT:    v_mov_b32_e32 v1, 0x29a
+; GFX11-SDAG-NEXT:    v_lshl_add_u32 v0, s0, 5, s1
+; GFX11-SDAG-NEXT:    s_mov_b32 s33, s4
+; GFX11-SDAG-NEXT:    scratch_store_b32 off, v1, s1 dlc
+; GFX11-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-SDAG-NEXT:    v_readfirstlane_b32 s32, v0
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    s_add_i32 s32, s32, -16
+; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: test_dynamic_stackalloc_device_divergent_non_standard_size_i16:
+; GFX11-GISEL:       ; %bb.0:
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-GISEL-NEXT:    s_mov_b32 s4, s33
+; GFX11-GISEL-NEXT:    s_mov_b32 s1, exec_lo
+; GFX11-GISEL-NEXT:    s_mov_b32 s0, 0
+; GFX11-GISEL-NEXT:    s_mov_b32 s33, s32
+; GFX11-GISEL-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
+; GFX11-GISEL-NEXT:    s_add_i32 s32, s32, 16
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, -16, v0
+; GFX11-GISEL-NEXT:  .LBB16_1: ; =>This Inner Loop Header: Depth=1
+; GFX11-GISEL-NEXT:    s_ctz_i32_b32 s2, s1
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-NEXT:    v_readlane_b32 s3, v0, s2
+; GFX11-GISEL-NEXT:    s_bitset0_b32 s1, s2
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    s_max_u32 s0, s0, s3
+; GFX11-GISEL-NEXT:    s_cmp_lg_u32 s1, 0
+; GFX11-GISEL-NEXT:    s_cbranch_scc1 .LBB16_1
+; GFX11-GISEL-NEXT:  ; %bb.2:
+; GFX11-GISEL-NEXT:    v_mov_b32_e32 v0, 0x29a
+; GFX11-GISEL-NEXT:    s_mov_b32 s1, s32
+; GFX11-GISEL-NEXT:    s_lshl_b32 s0, s0, 5
+; GFX11-GISEL-NEXT:    s_mov_b32 s33, s4
+; GFX11-GISEL-NEXT:    s_add_u32 s32, s1, s0
+; GFX11-GISEL-NEXT:    scratch_store_b32 off, v0, s1 dlc
+; GFX11-GISEL-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-GISEL-NEXT:    s_add_i32 s32, s32, -16
+; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %alloca = alloca i32, i16 %n, align 2, addrspace(5)
+  store volatile i32 666, ptr addrspace(5) %alloca
+  ret void
+}
+
+define void @test_dynamic_stackalloc_device_divergent_non_standard_size_i64(i64 %n) {
+; GFX9-SDAG-LABEL: test_dynamic_stackalloc_device_divergent_non_standard_size_i64:
+; GFX9-SDAG:       ; %bb.0:
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
+; GFX9-SDAG-NEXT:    s_mov_b32 s9, s33
+; GFX9-SDAG-NEXT:    v_and_b32_e32 v0, -16, v0
+; GFX9-SDAG-NEXT:    s_mov_b64 s[4:5], exec
+; GFX9-SDAG-NEXT:    s_mov_b32 s6, 0
+; GFX9-SDAG-NEXT:    s_mov_b32 s33, s32
+; GFX9-SDAG-NEXT:    s_addk_i32 s32, 0x400
+; GFX9-SDAG-NEXT:  .LBB17_1: ; =>This Inner Loop Header: Depth=1
+; GFX9-SDAG-NEXT:    s_ff1_i32_b64 s7, s[4:5]
+; GFX9-SDAG-NEXT:    v_readlane_b32 s8, v0, s7
+; GFX9-SDAG-NEXT:    s_bitset0_b64 s[4:5], s7
+; GFX9-SDAG-NEXT:    s_max_u32 s6, s6, s8
+; GFX9-SDAG-NEXT:    s_cmp_lg_u64 s[4:5], 0
+; GFX9-SDAG-NEXT:    s_cbranch_scc1 .LBB17_1
+; GFX9-SDAG-NEXT:  ; %bb.2:
+; GFX9-SDAG-NEXT:    s_mov_b32 s4, s32
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-SDAG-NEXT:    v_lshl_add_u32 v0, s6, 6, v0
+; GFX9-SDAG-NEXT:    v_readfirstlane_b32 s32, v0
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v0, 0x29a
+; GFX9-SDAG-NEXT:    buffer_store_dword v0, off, s[0:3], s4
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT:    s_addk_i32 s32, 0xfc00
+; GFX9-SDAG-NEXT:    s_mov_b32 s33, s9
+; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: test_dynamic_stackalloc_device_divergent_non_standard_size_i64:
+; GFX9-GISEL:       ; %bb.0:
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
+; GFX9-GISEL-NEXT:    s_mov_b32 s9, s33
+; GFX9-GISEL-NEXT:    v_and_b32_e32 v0, -16, v0
+; GFX9-GISEL-NEXT:    s_mov_b64 s[4:5], exec
+; GFX9-GISEL-NEXT:    s_mov_b32 s6, 0
+; GFX9-GISEL-NEXT:    s_mov_b32 s33, s32
+; GFX9-GISEL-NEXT:    s_addk_i32 s32, 0x400
+; GFX9-GISEL-NEXT:  .LBB17_1: ; =>This Inner Loop Header: Depth=1
+; GFX9-GISEL-NEXT:    s_ff1_i32_b64 s7, s[4:5]
+; GFX9-GISEL-NEXT:    v_readlane_b32 s8, v0, s7
+; GFX9-GISEL-NEXT:    s_bitset0_b64 s[4:5], s7
+; GFX9-GISEL-NEXT:    s_max_u32 s6, s6, s8
+; GFX9-GISEL-NEXT:    s_cmp_lg_u64 s[4:5], 0
+; GFX9-GISEL-NEXT:    s_cbranch_scc1 .LBB17_1
+; GFX9-GISEL-NEXT:  ; %bb.2:
+; GFX9-GISEL-NEXT:    s_mov_b32 s4, s32
+; GFX9-GISEL-NEXT:    s_lshl_b32 s5, s6, 6
+; GFX9-GISEL-NEXT:    s_add_u32 s32, s4, s5
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, 0x29a
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, s4
+; GFX9-GISEL-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT:    s_addk_i32 s32, 0xfc00
+; GFX9-GISEL-NEXT:    s_mov_b32 s33, s9
+; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: test_dynamic_stackalloc_device_divergent_non_standard_size_i64:
+; GFX11-SDAG:       ; %bb.0:
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
+; GFX11-SDAG-NEXT:    s_mov_b32 s4, s33
+; GFX11-SDAG-NEXT:    s_mov_b32 s1, exec_lo
+; GFX11-SDAG-NEXT:    s_mov_b32 s0, 0
+; GFX11-SDAG-NEXT:    s_mov_b32 s33, s32
+; GFX11-SDAG-NEXT:    v_and_b32_e32 v0, -16, v0
+; GFX11-SDAG-NEXT:    s_add_i32 s32, s32, 16
+; GFX11-SDAG-NEXT:  .LBB17_1: ; =>This Inner Loop Header: Depth=1
+; GFX11-SDAG-NEXT:    s_ctz_i32_b32 s2, s1
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-NEXT:    v_readlane_b32 s3, v0, s2
+; GFX11-SDAG-NEXT:    s_bitset0_b32 s1, s2
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    s_max_u32 s0, s0, s3
+; GFX11-SDAG-NEXT:    s_cmp_lg_u32 s1, 0
+; GFX11-SDAG-NEXT:    s_cbranch_scc1 .LBB17_1
+; GFX11-SDAG-NEXT:  ; %bb.2:
+; GFX11-SDAG-NEXT:    s_mov_b32 s1, s32
+; GFX11-SDAG-NEXT:    v_mov_b32_e32 v1, 0x29a
+; GFX11-SDAG-NEXT:    v_lshl_add_u32 v0, s0, 5, s1
+; GFX11-SDAG-NEXT:    s_mov_b32 s33, s4
+; GFX11-SDAG-NEXT:    scratch_store_b32 off, v1, s1 dlc
+; GFX11-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-SDAG-NEXT:    v_readfirstlane_b32 s32, v0
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    s_add_i32 s32, s32, -16
+; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: test_dynamic_stackalloc_device_divergent_non_standard_size_i64:
+; GFX11-GISEL:       ; %bb.0:
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
+; GFX11-GISEL-NEXT:    s_mov_b32 s4, s33
+; GFX11-GISEL-NEXT:    s_mov_b32 s1, exec_lo
+; GFX11-GISEL-NEXT:    s_mov_b32 s0, 0
+; GFX11-GISEL-NEXT:    s_mov_b32 s33, s32
+; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, -16, v0
+; GFX11-GISEL-NEXT:    s_add_i32 s32, s32, 16
+; GFX11-GISEL-NEXT:  .LBB17_1: ; =>This Inner Loop Header: Depth=1
+; GFX11-GISEL-NEXT:    s_ctz_i32_b32 s2, s1
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-NEXT:    v_readlane_b32 s3, v0, s2
+; GFX11-GISEL-NEXT:    s_bitset0_b32 s1, s2
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    s_max_u32 s0, s0, s3
+; GFX11-GISEL-NEXT:    s_cmp_lg_u32 s1, 0
+; GFX11-GISEL-NEXT:    s_cbranch_scc1 .LBB17_1
+; GFX11-GISEL-NEXT:  ; %bb.2:
+; GFX11-GISEL-NEXT:    v_mov_b32_e32 v0, 0x29a
+; GFX11-GISEL-NEXT:    s_mov_b32 s1, s32
+; GFX11-GISEL-NEXT:    s_lshl_b32 s0, s0, 5
+; GFX11-GISEL-NEXT:    s_mov_b32 s33, s4
+; GFX11-GISEL-NEXT:    s_add_u32 s32, s1, s0
+; GFX11-GISEL-NEXT:    scratch_store_b32 off, v0, s1 dlc
+; GFX11-GISEL-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-GISEL-NEXT:    s_add_i32 s32, s32, -16
+; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %alloca = alloca i32, i64 %n, align 2, addrspace(5)
+  store volatile i32 666, ptr addrspace(5) %alloca
+  ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/fdiv_flags.f32.ll b/llvm/test/CodeGen/AMDGPU/fdiv_flags.f32.ll
index ebfb5e9..a324ba3 100644
--- a/llvm/test/CodeGen/AMDGPU/fdiv_flags.f32.ll
+++ b/llvm/test/CodeGen/AMDGPU/fdiv_flags.f32.ll
@@ -1625,14 +1625,12 @@ define float @v_recip_sqrt_f32_ulp25_contract(float %x) {
 ; CODEGEN-IEEE-GISEL:       ; %bb.0:
 ; CODEGEN-IEEE-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CODEGEN-IEEE-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
-; CODEGEN-IEEE-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4b800000
 ; CODEGEN-IEEE-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
-; CODEGEN-IEEE-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; CODEGEN-IEEE-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
+; CODEGEN-IEEE-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 24, vcc
+; CODEGEN-IEEE-GISEL-NEXT:    v_ldexp_f32_e32 v0, v0, v1
 ; CODEGEN-IEEE-GISEL-NEXT:    v_rsq_f32_e32 v0, v0
-; CODEGEN-IEEE-GISEL-NEXT:    v_mov_b32_e32 v1, 0x45800000
-; CODEGEN-IEEE-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v1, vcc
-; CODEGEN-IEEE-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
+; CODEGEN-IEEE-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 12, vcc
+; CODEGEN-IEEE-GISEL-NEXT:    v_ldexp_f32_e32 v0, v0, v1
 ; CODEGEN-IEEE-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; IR-IEEE-SDAG-LABEL: v_recip_sqrt_f32_ulp25_contract:
diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll
index 97d642b..5415af0 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll
@@ -5249,6 +5249,114 @@ bb:
   ret void
 }
 
+define amdgpu_gs void @sgpr_base_plus_sgpr_plus_vgpr_plus_negative_imm_offset(ptr addrspace(5) inreg %sgpr_base, i32 inreg %sidx, i32 %vidx) {
+; GFX9-LABEL: sgpr_base_plus_sgpr_plus_vgpr_plus_negative_imm_offset:
+; GFX9:       ; %bb.0: ; %bb
+; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s0, s5
+; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s1, 0
+; GFX9-NEXT:    s_add_i32 s2, s2, s3
+; GFX9-NEXT:    v_add_u32_e32 v0, s2, v0
+; GFX9-NEXT:    v_add_u32_e32 v0, -16, v0
+; GFX9-NEXT:    v_mov_b32_e32 v1, 15
+; GFX9-NEXT:    scratch_store_dword v0, v1, off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_endpgm
+;
+; GFX10-LABEL: sgpr_base_plus_sgpr_plus_vgpr_plus_negative_imm_offset:
+; GFX10:       ; %bb.0: ; %bb
+; GFX10-NEXT:    s_add_u32 s0, s0, s5
+; GFX10-NEXT:    s_addc_u32 s1, s1, 0
+; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
+; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
+; GFX10-NEXT:    v_add3_u32 v0, s2, s3, v0
+; GFX10-NEXT:    v_mov_b32_e32 v1, 15
+; GFX10-NEXT:    scratch_store_dword v0, v1, off offset:-16
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    s_endpgm
+;
+; GFX11-LABEL: sgpr_base_plus_sgpr_plus_vgpr_plus_negative_imm_offset:
+; GFX11:       ; %bb.0: ; %bb
+; GFX11-NEXT:    v_add3_u32 v0, s0, s1, v0
+; GFX11-NEXT:    v_mov_b32_e32 v1, 15
+; GFX11-NEXT:    scratch_store_b32 v0, v1, off offset:-16 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: sgpr_base_plus_sgpr_plus_vgpr_plus_negative_imm_offset:
+; GFX12:       ; %bb.0: ; %bb
+; GFX12-NEXT:    v_mov_b32_e32 v1, 15
+; GFX12-NEXT:    s_add_co_i32 s0, s0, s1
+; GFX12-NEXT:    scratch_store_b32 v0, v1, s0 offset:-16 scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_storecnt 0x0
+; GFX12-NEXT:    s_endpgm
+;
+; GFX9-PAL-LABEL: sgpr_base_plus_sgpr_plus_vgpr_plus_negative_imm_offset:
+; GFX9-PAL:       ; %bb.0: ; %bb
+; GFX9-PAL-NEXT:    s_getpc_b64 s[2:3]
+; GFX9-PAL-NEXT:    s_mov_b32 s2, s8
+; GFX9-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
+; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, 15
+; GFX9-PAL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
+; GFX9-PAL-NEXT:    s_add_u32 flat_scratch_lo, s2, s5
+; GFX9-PAL-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
+; GFX9-PAL-NEXT:    s_add_i32 s0, s0, s1
+; GFX9-PAL-NEXT:    v_add_u32_e32 v0, s0, v0
+; GFX9-PAL-NEXT:    v_add_u32_e32 v0, -16, v0
+; GFX9-PAL-NEXT:    scratch_store_dword v0, v1, off
+; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-PAL-NEXT:    s_endpgm
+;
+; GFX940-LABEL: sgpr_base_plus_sgpr_plus_vgpr_plus_negative_imm_offset:
+; GFX940:       ; %bb.0: ; %bb
+; GFX940-NEXT:    s_add_i32 s0, s0, s1
+; GFX940-NEXT:    v_add_u32_e32 v0, s0, v0
+; GFX940-NEXT:    v_add_u32_e32 v0, -16, v0
+; GFX940-NEXT:    v_mov_b32_e32 v1, 15
+; GFX940-NEXT:    scratch_store_dword v0, v1, off sc0 sc1
+; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    s_endpgm
+;
+; GFX10-PAL-LABEL: sgpr_base_plus_sgpr_plus_vgpr_plus_negative_imm_offset:
+; GFX10-PAL:       ; %bb.0: ; %bb
+; GFX10-PAL-NEXT:    s_getpc_b64 s[2:3]
+; GFX10-PAL-NEXT:    s_mov_b32 s2, s8
+; GFX10-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
+; GFX10-PAL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
+; GFX10-PAL-NEXT:    s_add_u32 s2, s2, s5
+; GFX10-PAL-NEXT:    s_addc_u32 s3, s3, 0
+; GFX10-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
+; GFX10-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
+; GFX10-PAL-NEXT:    v_add3_u32 v0, s0, s1, v0
+; GFX10-PAL-NEXT:    v_mov_b32_e32 v1, 15
+; GFX10-PAL-NEXT:    scratch_store_dword v0, v1, off offset:-16
+; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-PAL-NEXT:    s_endpgm
+;
+; GFX11-PAL-LABEL: sgpr_base_plus_sgpr_plus_vgpr_plus_negative_imm_offset:
+; GFX11-PAL:       ; %bb.0: ; %bb
+; GFX11-PAL-NEXT:    v_add3_u32 v0, s0, s1, v0
+; GFX11-PAL-NEXT:    v_mov_b32_e32 v1, 15
+; GFX11-PAL-NEXT:    scratch_store_b32 v0, v1, off offset:-16 dlc
+; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-PAL-NEXT:    s_endpgm
+;
+; GFX12-PAL-LABEL: sgpr_base_plus_sgpr_plus_vgpr_plus_negative_imm_offset:
+; GFX12-PAL:       ; %bb.0: ; %bb
+; GFX12-PAL-NEXT:    v_mov_b32_e32 v1, 15
+; GFX12-PAL-NEXT:    s_add_co_i32 s0, s0, s1
+; GFX12-PAL-NEXT:    scratch_store_b32 v0, v1, s0 offset:-16 scope:SCOPE_SYS
+; GFX12-PAL-NEXT:    s_wait_storecnt 0x0
+; GFX12-PAL-NEXT:    s_endpgm
+bb:
+  %add1 = add nsw i32 %sidx, %vidx
+  %add2 = add nsw i32 %add1, -16
+  %gep = getelementptr inbounds [16 x i8], ptr addrspace(5) %sgpr_base, i32 0, i32 %add2
+  store volatile i32 15, ptr addrspace(5) %gep, align 4
+  ret void
+}
+
 define amdgpu_gs void @sgpr_base_negative_offset(ptr addrspace(1) %out, ptr addrspace(5) inreg %scevgep) {
 ; GFX9-LABEL: sgpr_base_negative_offset:
 ; GFX9:       ; %bb.0: ; %entry
diff --git a/llvm/test/CodeGen/AMDGPU/fma.f16.ll b/llvm/test/CodeGen/AMDGPU/fma.f16.ll
index 005e401..822d40f7 100644
--- a/llvm/test/CodeGen/AMDGPU/fma.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fma.f16.ll
@@ -5,6 +5,8 @@
 ; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX10,GFX10-GISEL
 ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX11,GFX11-SDAG
 ; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX11,GFX11-GISEL
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX12,GFX12-SDAG
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX12,GFX12-GISEL
 
 declare half @llvm.fma.f16(half, half, half)
 declare half @llvm.maxnum.f16(half, half)
@@ -27,6 +29,16 @@ define half @test_fma(half %x, half %y, half %z) {
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_fma_f16 v0, v0, v1, v2
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: test_fma:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_fma_f16 v0, v0, v1, v2
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
   %r = call half @llvm.fma.f16(half %x, half %y, half %z)
   ret half %r
 }
@@ -50,6 +62,16 @@ define half @test_fmac(half %x, half %y, half %z) {
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_fmac_f16_e32 v0, v1, v2
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: test_fmac:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_fmac_f16_e32 v0, v1, v2
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
   %r = call half @llvm.fma.f16(half %y, half %z, half %x)
   ret half %r
 }
@@ -81,6 +103,16 @@ define half @test_fmaak(half %x, half %y, half %z) {
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_fmaak_f16 v0, v0, v1, 0x4200
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: test_fmaak:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_fmaak_f16 v0, v0, v1, 0x4200
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
   %r = call half @llvm.fma.f16(half %x, half %y, half 0xH4200)
   ret half %r
 }
@@ -112,6 +144,16 @@ define half @test_fmamk(half %x, half %y, half %z) {
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_fmamk_f16 v0, v0, 0x4200, v2
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: test_fmamk:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_fmamk_f16 v0, v0, 0x4200, v2
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
   %r = call half @llvm.fma.f16(half %x, half 0xH4200, half %z)
   ret half %r
 }
@@ -193,6 +235,42 @@ define i32 @test_D139469_f16(half %arg) {
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-LABEL: test_D139469_f16:
+; GFX12-SDAG:       ; %bb.0: ; %bb
+; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT:    v_mov_b32_e32 v1, 0x211e
+; GFX12-SDAG-NEXT:    v_mul_f16_e32 v2, 0x291e, v0
+; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-NEXT:    v_fmac_f16_e32 v1, 0x291e, v0
+; GFX12-SDAG-NEXT:    v_min_num_f16_e32 v0, v2, v1
+; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-NEXT:    v_cmp_gt_f16_e32 vcc_lo, 0, v0
+; GFX12-SDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX12-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-GISEL-LABEL: test_D139469_f16:
+; GFX12-GISEL:       ; %bb.0: ; %bb
+; GFX12-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_expcnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_samplecnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT:    v_mov_b32_e32 v1, 0x211e
+; GFX12-GISEL-NEXT:    v_mul_f16_e32 v2, 0x291e, v0
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-GISEL-NEXT:    v_fmac_f16_e32 v1, 0x291e, v0
+; GFX12-GISEL-NEXT:    v_cmp_gt_f16_e32 vcc_lo, 0, v2
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-GISEL-NEXT:    v_cmp_gt_f16_e64 s0, 0, v1
+; GFX12-GISEL-NEXT:    s_or_b32 s0, vcc_lo, s0
+; GFX12-GISEL-NEXT:    s_wait_alu 0xfffe
+; GFX12-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX12-GISEL-NEXT:    s_setpc_b64 s[30:31]
 bb:
   %i = fmul contract half %arg, 0xH291E
   %i1 = fcmp olt half %i, 0xH0000
@@ -306,6 +384,55 @@ define <2 x i32> @test_D139469_v2f16(<2 x half> %arg) {
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s0
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-LABEL: test_D139469_v2f16:
+; GFX12-SDAG:       ; %bb.0: ; %bb
+; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT:    s_movk_i32 s0, 0x211e
+; GFX12-SDAG-NEXT:    v_pk_mul_f16 v1, 0x291e, v0 op_sel_hi:[0,1]
+; GFX12-SDAG-NEXT:    s_wait_alu 0xfffe
+; GFX12-SDAG-NEXT:    v_pk_fma_f16 v0, 0x291e, v0, s0 op_sel_hi:[0,1,0]
+; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-NEXT:    v_pk_min_num_f16 v0, v1, v0
+; GFX12-SDAG-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX12-SDAG-NEXT:    v_cmp_gt_f16_e32 vcc_lo, 0, v0
+; GFX12-SDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX12-SDAG-NEXT:    v_cmp_gt_f16_e32 vcc_lo, 0, v1
+; GFX12-SDAG-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
+; GFX12-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-GISEL-LABEL: test_D139469_v2f16:
+; GFX12-GISEL:       ; %bb.0: ; %bb
+; GFX12-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_expcnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_samplecnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT:    v_mov_b32_e32 v1, 0x211e211e
+; GFX12-GISEL-NEXT:    v_pk_mul_f16 v2, 0x291e291e, v0
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-GISEL-NEXT:    v_pk_fma_f16 v0, 0x291e291e, v0, v1
+; GFX12-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
+; GFX12-GISEL-NEXT:    v_cmp_gt_f16_e32 vcc_lo, 0, v2
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX12-GISEL-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
+; GFX12-GISEL-NEXT:    v_cmp_gt_f16_e64 s0, 0, v0
+; GFX12-GISEL-NEXT:    v_cmp_gt_f16_e64 s1, 0, v1
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-GISEL-NEXT:    v_cmp_gt_f16_e64 s2, 0, v3
+; GFX12-GISEL-NEXT:    s_or_b32 s0, vcc_lo, s0
+; GFX12-GISEL-NEXT:    s_wait_alu 0xfffe
+; GFX12-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX12-GISEL-NEXT:    s_or_b32 s0, s1, s2
+; GFX12-GISEL-NEXT:    s_wait_alu 0xfffe
+; GFX12-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s0
+; GFX12-GISEL-NEXT:    s_setpc_b64 s[30:31]
 bb:
   %i = fmul contract <2 x half> %arg, <half 0xH291E, half 0xH291E>
   %i1 = fcmp olt <2 x half> %i, <half 0xH0000, half 0xH0000>
diff --git a/llvm/test/CodeGen/AMDGPU/fmax3.ll b/llvm/test/CodeGen/AMDGPU/fmax3.ll
index 4b3f0db..fbcdbed 100644
--- a/llvm/test/CodeGen/AMDGPU/fmax3.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmax3.ll
@@ -3,6 +3,7 @@
 ; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX12 %s
 
 define amdgpu_kernel void @test_fmax3_olt_0_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #0 {
 ; SI-LABEL: test_fmax3_olt_0_f32:
@@ -124,6 +125,36 @@ define amdgpu_kernel void @test_fmax3_olt_0_f32(ptr addrspace(1) %out, ptr addrs
 ; GFX11-NEXT:    v_max3_f32 v0, v0, v1, v2
 ; GFX11-NEXT:    buffer_store_b32 v0, off, s[8:11], 0
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: test_fmax3_olt_0_f32:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_load_b256 s[0:7], s[4:5], 0x24
+; GFX12-NEXT:    s_mov_b32 s10, -1
+; GFX12-NEXT:    s_mov_b32 s11, 0x31016000
+; GFX12-NEXT:    s_mov_b32 s14, s10
+; GFX12-NEXT:    s_mov_b32 s15, s11
+; GFX12-NEXT:    s_mov_b32 s18, s10
+; GFX12-NEXT:    s_mov_b32 s19, s11
+; GFX12-NEXT:    s_mov_b32 s22, s10
+; GFX12-NEXT:    s_mov_b32 s23, s11
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    s_mov_b32 s12, s2
+; GFX12-NEXT:    s_mov_b32 s13, s3
+; GFX12-NEXT:    s_mov_b32 s16, s4
+; GFX12-NEXT:    s_mov_b32 s17, s5
+; GFX12-NEXT:    s_mov_b32 s20, s6
+; GFX12-NEXT:    s_mov_b32 s21, s7
+; GFX12-NEXT:    buffer_load_b32 v0, off, s[12:15], null scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    buffer_load_b32 v1, off, s[16:19], null scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    buffer_load_b32 v2, off, s[20:23], null scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    s_mov_b32 s8, s0
+; GFX12-NEXT:    s_mov_b32 s9, s1
+; GFX12-NEXT:    v_max3_num_f32 v0, v0, v1, v2
+; GFX12-NEXT:    buffer_store_b32 v0, off, s[8:11], null
+; GFX12-NEXT:    s_endpgm
   %a = load volatile  float, ptr addrspace(1) %aptr, align 4
   %b = load volatile float, ptr addrspace(1) %bptr, align 4
   %c = load volatile float, ptr addrspace(1) %cptr, align 4
@@ -254,6 +285,36 @@ define amdgpu_kernel void @test_fmax3_olt_1_f32(ptr addrspace(1) %out, ptr addrs
 ; GFX11-NEXT:    v_max3_f32 v0, v2, v0, v1
 ; GFX11-NEXT:    buffer_store_b32 v0, off, s[8:11], 0
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: test_fmax3_olt_1_f32:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_load_b256 s[0:7], s[4:5], 0x24
+; GFX12-NEXT:    s_mov_b32 s10, -1
+; GFX12-NEXT:    s_mov_b32 s11, 0x31016000
+; GFX12-NEXT:    s_mov_b32 s14, s10
+; GFX12-NEXT:    s_mov_b32 s15, s11
+; GFX12-NEXT:    s_mov_b32 s18, s10
+; GFX12-NEXT:    s_mov_b32 s19, s11
+; GFX12-NEXT:    s_mov_b32 s22, s10
+; GFX12-NEXT:    s_mov_b32 s23, s11
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    s_mov_b32 s12, s2
+; GFX12-NEXT:    s_mov_b32 s13, s3
+; GFX12-NEXT:    s_mov_b32 s16, s4
+; GFX12-NEXT:    s_mov_b32 s17, s5
+; GFX12-NEXT:    s_mov_b32 s20, s6
+; GFX12-NEXT:    s_mov_b32 s21, s7
+; GFX12-NEXT:    buffer_load_b32 v0, off, s[12:15], null scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    buffer_load_b32 v1, off, s[16:19], null scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    buffer_load_b32 v2, off, s[20:23], null scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    s_mov_b32 s8, s0
+; GFX12-NEXT:    s_mov_b32 s9, s1
+; GFX12-NEXT:    v_max3_num_f32 v0, v2, v0, v1
+; GFX12-NEXT:    buffer_store_b32 v0, off, s[8:11], null
+; GFX12-NEXT:    s_endpgm
   %a = load volatile float, ptr addrspace(1) %aptr, align 4
   %b = load volatile float, ptr addrspace(1) %bptr, align 4
   %c = load volatile float, ptr addrspace(1) %cptr, align 4
@@ -391,6 +452,36 @@ define amdgpu_kernel void @test_fmax3_olt_0_f16(ptr addrspace(1) %out, ptr addrs
 ; GFX11-NEXT:    v_max3_f16 v0, v0, v1, v2
 ; GFX11-NEXT:    buffer_store_b16 v0, off, s[8:11], 0
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: test_fmax3_olt_0_f16:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_load_b256 s[0:7], s[4:5], 0x24
+; GFX12-NEXT:    s_mov_b32 s10, -1
+; GFX12-NEXT:    s_mov_b32 s11, 0x31016000
+; GFX12-NEXT:    s_mov_b32 s14, s10
+; GFX12-NEXT:    s_mov_b32 s15, s11
+; GFX12-NEXT:    s_mov_b32 s18, s10
+; GFX12-NEXT:    s_mov_b32 s19, s11
+; GFX12-NEXT:    s_mov_b32 s22, s10
+; GFX12-NEXT:    s_mov_b32 s23, s11
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    s_mov_b32 s12, s2
+; GFX12-NEXT:    s_mov_b32 s13, s3
+; GFX12-NEXT:    s_mov_b32 s16, s4
+; GFX12-NEXT:    s_mov_b32 s17, s5
+; GFX12-NEXT:    s_mov_b32 s20, s6
+; GFX12-NEXT:    s_mov_b32 s21, s7
+; GFX12-NEXT:    buffer_load_u16 v0, off, s[12:15], null scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    buffer_load_u16 v1, off, s[16:19], null scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    buffer_load_u16 v2, off, s[20:23], null scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    s_mov_b32 s8, s0
+; GFX12-NEXT:    s_mov_b32 s9, s1
+; GFX12-NEXT:    v_max3_num_f16 v0, v0, v1, v2
+; GFX12-NEXT:    buffer_store_b16 v0, off, s[8:11], null
+; GFX12-NEXT:    s_endpgm
   %a = load volatile half, ptr addrspace(1) %aptr, align 2
   %b = load volatile half, ptr addrspace(1) %bptr, align 2
   %c = load volatile half, ptr addrspace(1) %cptr, align 2
@@ -529,6 +620,36 @@ define amdgpu_kernel void @test_fmax3_olt_1_f16(ptr addrspace(1) %out, ptr addrs
 ; GFX11-NEXT:    v_max3_f16 v0, v2, v0, v1
 ; GFX11-NEXT:    buffer_store_b16 v0, off, s[8:11], 0
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: test_fmax3_olt_1_f16:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_load_b256 s[0:7], s[4:5], 0x24
+; GFX12-NEXT:    s_mov_b32 s10, -1
+; GFX12-NEXT:    s_mov_b32 s11, 0x31016000
+; GFX12-NEXT:    s_mov_b32 s14, s10
+; GFX12-NEXT:    s_mov_b32 s15, s11
+; GFX12-NEXT:    s_mov_b32 s18, s10
+; GFX12-NEXT:    s_mov_b32 s19, s11
+; GFX12-NEXT:    s_mov_b32 s22, s10
+; GFX12-NEXT:    s_mov_b32 s23, s11
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    s_mov_b32 s12, s2
+; GFX12-NEXT:    s_mov_b32 s13, s3
+; GFX12-NEXT:    s_mov_b32 s16, s4
+; GFX12-NEXT:    s_mov_b32 s17, s5
+; GFX12-NEXT:    s_mov_b32 s20, s6
+; GFX12-NEXT:    s_mov_b32 s21, s7
+; GFX12-NEXT:    buffer_load_u16 v0, off, s[12:15], null scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    buffer_load_u16 v1, off, s[16:19], null scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    buffer_load_u16 v2, off, s[20:23], null scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    s_mov_b32 s8, s0
+; GFX12-NEXT:    s_mov_b32 s9, s1
+; GFX12-NEXT:    v_max3_num_f16 v0, v2, v0, v1
+; GFX12-NEXT:    buffer_store_b16 v0, off, s[8:11], null
+; GFX12-NEXT:    s_endpgm
   %a = load volatile half, ptr addrspace(1) %aptr, align 2
   %b = load volatile half, ptr addrspace(1) %bptr, align 2
   %c = load volatile half, ptr addrspace(1) %cptr, align 2
@@ -594,6 +715,19 @@ define <2 x half> @no_fmax3_v2f16(<2 x half> %a, <2 x half> %b, <2 x half> %c, <
 ; GFX11-NEXT:    v_pk_max_f16 v0, v2, v0
 ; GFX11-NEXT:    v_pk_max_f16 v0, v0, v3
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: no_fmax3_v2f16:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_pk_max_num_f16 v0, v0, v1
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT:    v_pk_max_num_f16 v0, v2, v0
+; GFX12-NEXT:    v_pk_max_num_f16 v0, v0, v3
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %max = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %a, <2 x half> %b)
   %max1 = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %c, <2 x half> %max)
diff --git a/llvm/test/CodeGen/AMDGPU/fmin3.ll b/llvm/test/CodeGen/AMDGPU/fmin3.ll
index 38b712e..269fd52 100644
--- a/llvm/test/CodeGen/AMDGPU/fmin3.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmin3.ll
@@ -3,6 +3,7 @@
 ; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX12 %s
 
 define amdgpu_kernel void @test_fmin3_olt_0_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #0 {
 ; SI-LABEL: test_fmin3_olt_0_f32:
@@ -124,6 +125,36 @@ define amdgpu_kernel void @test_fmin3_olt_0_f32(ptr addrspace(1) %out, ptr addrs
 ; GFX11-NEXT:    v_min3_f32 v0, v0, v1, v2
 ; GFX11-NEXT:    buffer_store_b32 v0, off, s[8:11], 0
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: test_fmin3_olt_0_f32:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_load_b256 s[0:7], s[4:5], 0x24
+; GFX12-NEXT:    s_mov_b32 s10, -1
+; GFX12-NEXT:    s_mov_b32 s11, 0x31016000
+; GFX12-NEXT:    s_mov_b32 s14, s10
+; GFX12-NEXT:    s_mov_b32 s15, s11
+; GFX12-NEXT:    s_mov_b32 s18, s10
+; GFX12-NEXT:    s_mov_b32 s19, s11
+; GFX12-NEXT:    s_mov_b32 s22, s10
+; GFX12-NEXT:    s_mov_b32 s23, s11
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    s_mov_b32 s12, s2
+; GFX12-NEXT:    s_mov_b32 s13, s3
+; GFX12-NEXT:    s_mov_b32 s16, s4
+; GFX12-NEXT:    s_mov_b32 s17, s5
+; GFX12-NEXT:    s_mov_b32 s20, s6
+; GFX12-NEXT:    s_mov_b32 s21, s7
+; GFX12-NEXT:    buffer_load_b32 v0, off, s[12:15], null scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    buffer_load_b32 v1, off, s[16:19], null scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    buffer_load_b32 v2, off, s[20:23], null scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    s_mov_b32 s8, s0
+; GFX12-NEXT:    s_mov_b32 s9, s1
+; GFX12-NEXT:    v_min3_num_f32 v0, v0, v1, v2
+; GFX12-NEXT:    buffer_store_b32 v0, off, s[8:11], null
+; GFX12-NEXT:    s_endpgm
   %a = load volatile float, ptr addrspace(1) %aptr, align 4
   %b = load volatile float, ptr addrspace(1) %bptr, align 4
   %c = load volatile float, ptr addrspace(1) %cptr, align 4
@@ -254,6 +285,36 @@ define amdgpu_kernel void @test_fmin3_olt_1_f32(ptr addrspace(1) %out, ptr addrs
 ; GFX11-NEXT:    v_min3_f32 v0, v2, v0, v1
 ; GFX11-NEXT:    buffer_store_b32 v0, off, s[8:11], 0
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: test_fmin3_olt_1_f32:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_load_b256 s[0:7], s[4:5], 0x24
+; GFX12-NEXT:    s_mov_b32 s10, -1
+; GFX12-NEXT:    s_mov_b32 s11, 0x31016000
+; GFX12-NEXT:    s_mov_b32 s14, s10
+; GFX12-NEXT:    s_mov_b32 s15, s11
+; GFX12-NEXT:    s_mov_b32 s18, s10
+; GFX12-NEXT:    s_mov_b32 s19, s11
+; GFX12-NEXT:    s_mov_b32 s22, s10
+; GFX12-NEXT:    s_mov_b32 s23, s11
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    s_mov_b32 s12, s2
+; GFX12-NEXT:    s_mov_b32 s13, s3
+; GFX12-NEXT:    s_mov_b32 s16, s4
+; GFX12-NEXT:    s_mov_b32 s17, s5
+; GFX12-NEXT:    s_mov_b32 s20, s6
+; GFX12-NEXT:    s_mov_b32 s21, s7
+; GFX12-NEXT:    buffer_load_b32 v0, off, s[12:15], null scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    buffer_load_b32 v1, off, s[16:19], null scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    buffer_load_b32 v2, off, s[20:23], null scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    s_mov_b32 s8, s0
+; GFX12-NEXT:    s_mov_b32 s9, s1
+; GFX12-NEXT:    v_min3_num_f32 v0, v2, v0, v1
+; GFX12-NEXT:    buffer_store_b32 v0, off, s[8:11], null
+; GFX12-NEXT:    s_endpgm
   %a = load volatile float, ptr addrspace(1) %aptr, align 4
   %b = load volatile float, ptr addrspace(1) %bptr, align 4
   %c = load volatile float, ptr addrspace(1) %cptr, align 4
@@ -391,6 +452,36 @@ define amdgpu_kernel void @test_fmin3_olt_0_f16(ptr addrspace(1) %out, ptr addrs
 ; GFX11-NEXT:    v_min3_f16 v0, v0, v1, v2
 ; GFX11-NEXT:    buffer_store_b16 v0, off, s[8:11], 0
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: test_fmin3_olt_0_f16:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_load_b256 s[0:7], s[4:5], 0x24
+; GFX12-NEXT:    s_mov_b32 s10, -1
+; GFX12-NEXT:    s_mov_b32 s11, 0x31016000
+; GFX12-NEXT:    s_mov_b32 s14, s10
+; GFX12-NEXT:    s_mov_b32 s15, s11
+; GFX12-NEXT:    s_mov_b32 s18, s10
+; GFX12-NEXT:    s_mov_b32 s19, s11
+; GFX12-NEXT:    s_mov_b32 s22, s10
+; GFX12-NEXT:    s_mov_b32 s23, s11
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    s_mov_b32 s12, s2
+; GFX12-NEXT:    s_mov_b32 s13, s3
+; GFX12-NEXT:    s_mov_b32 s16, s4
+; GFX12-NEXT:    s_mov_b32 s17, s5
+; GFX12-NEXT:    s_mov_b32 s20, s6
+; GFX12-NEXT:    s_mov_b32 s21, s7
+; GFX12-NEXT:    buffer_load_u16 v0, off, s[12:15], null scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    buffer_load_u16 v1, off, s[16:19], null scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    buffer_load_u16 v2, off, s[20:23], null scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    s_mov_b32 s8, s0
+; GFX12-NEXT:    s_mov_b32 s9, s1
+; GFX12-NEXT:    v_min3_num_f16 v0, v0, v1, v2
+; GFX12-NEXT:    buffer_store_b16 v0, off, s[8:11], null
+; GFX12-NEXT:    s_endpgm
   %a = load volatile half, ptr addrspace(1) %aptr, align 2
   %b = load volatile half, ptr addrspace(1) %bptr, align 2
   %c = load volatile half, ptr addrspace(1) %cptr, align 2
@@ -529,6 +620,36 @@ define amdgpu_kernel void @test_fmin3_olt_1_f16(ptr addrspace(1) %out, ptr addrs
 ; GFX11-NEXT:    v_min3_f16 v0, v2, v0, v1
 ; GFX11-NEXT:    buffer_store_b16 v0, off, s[8:11], 0
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: test_fmin3_olt_1_f16:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_load_b256 s[0:7], s[4:5], 0x24
+; GFX12-NEXT:    s_mov_b32 s10, -1
+; GFX12-NEXT:    s_mov_b32 s11, 0x31016000
+; GFX12-NEXT:    s_mov_b32 s14, s10
+; GFX12-NEXT:    s_mov_b32 s15, s11
+; GFX12-NEXT:    s_mov_b32 s18, s10
+; GFX12-NEXT:    s_mov_b32 s19, s11
+; GFX12-NEXT:    s_mov_b32 s22, s10
+; GFX12-NEXT:    s_mov_b32 s23, s11
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    s_mov_b32 s12, s2
+; GFX12-NEXT:    s_mov_b32 s13, s3
+; GFX12-NEXT:    s_mov_b32 s16, s4
+; GFX12-NEXT:    s_mov_b32 s17, s5
+; GFX12-NEXT:    s_mov_b32 s20, s6
+; GFX12-NEXT:    s_mov_b32 s21, s7
+; GFX12-NEXT:    buffer_load_u16 v0, off, s[12:15], null scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    buffer_load_u16 v1, off, s[16:19], null scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    buffer_load_u16 v2, off, s[20:23], null scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    s_mov_b32 s8, s0
+; GFX12-NEXT:    s_mov_b32 s9, s1
+; GFX12-NEXT:    v_min3_num_f16 v0, v2, v0, v1
+; GFX12-NEXT:    buffer_store_b16 v0, off, s[8:11], null
+; GFX12-NEXT:    s_endpgm
   %a = load volatile half, ptr addrspace(1) %aptr, align 2
   %b = load volatile half, ptr addrspace(1) %bptr, align 2
   %c = load volatile half, ptr addrspace(1) %cptr, align 2
@@ -594,6 +715,19 @@ define <2 x half> @no_fmin3_v2f16(<2 x half> %a, <2 x half> %b, <2 x half> %c, <
 ; GFX11-NEXT:    v_pk_min_f16 v0, v2, v0
 ; GFX11-NEXT:    v_pk_min_f16 v0, v0, v3
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: no_fmin3_v2f16:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_pk_min_num_f16 v0, v0, v1
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT:    v_pk_min_num_f16 v0, v2, v0
+; GFX12-NEXT:    v_pk_min_num_f16 v0, v0, v3
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %min = call <2 x half> @llvm.minnum.v2f16(<2 x half> %a, <2 x half> %b)
   %min1 = call <2 x half> @llvm.minnum.v2f16(<2 x half> %c, <2 x half> %min)
@@ -734,6 +868,39 @@ define amdgpu_kernel void @test_fmin3_olt_0_f64(ptr addrspace(1) %out, ptr addrs
 ; GFX11-NEXT:    v_min_f64 v[0:1], v[0:1], v[2:3]
 ; GFX11-NEXT:    buffer_store_b64 v[0:1], off, s[8:11], 0
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: test_fmin3_olt_0_f64:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_load_b256 s[0:7], s[4:5], 0x24
+; GFX12-NEXT:    s_mov_b32 s10, -1
+; GFX12-NEXT:    s_mov_b32 s11, 0x31016000
+; GFX12-NEXT:    s_mov_b32 s14, s10
+; GFX12-NEXT:    s_mov_b32 s15, s11
+; GFX12-NEXT:    s_mov_b32 s18, s10
+; GFX12-NEXT:    s_mov_b32 s19, s11
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    s_mov_b32 s12, s2
+; GFX12-NEXT:    s_mov_b32 s13, s3
+; GFX12-NEXT:    s_mov_b32 s16, s4
+; GFX12-NEXT:    s_mov_b32 s17, s5
+; GFX12-NEXT:    buffer_load_b64 v[0:1], off, s[12:15], null scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    buffer_load_b64 v[2:3], off, s[16:19], null scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    s_mov_b32 s12, s6
+; GFX12-NEXT:    s_mov_b32 s13, s7
+; GFX12-NEXT:    s_mov_b32 s8, s0
+; GFX12-NEXT:    buffer_load_b64 v[4:5], off, s[12:15], null scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    s_mov_b32 s9, s1
+; GFX12-NEXT:    v_max_num_f64_e32 v[0:1], v[0:1], v[0:1]
+; GFX12-NEXT:    v_max_num_f64_e32 v[2:3], v[2:3], v[2:3]
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-NEXT:    v_min_num_f64_e32 v[0:1], v[0:1], v[2:3]
+; GFX12-NEXT:    v_max_num_f64_e32 v[2:3], v[4:5], v[4:5]
+; GFX12-NEXT:    v_min_num_f64_e32 v[0:1], v[0:1], v[2:3]
+; GFX12-NEXT:    buffer_store_b64 v[0:1], off, s[8:11], null
+; GFX12-NEXT:    s_endpgm
   %a = load volatile double, ptr addrspace(1) %aptr, align 4
   %b = load volatile double, ptr addrspace(1) %bptr, align 4
   %c = load volatile double, ptr addrspace(1) %cptr, align 4
@@ -877,6 +1044,39 @@ define amdgpu_kernel void @test_fmin3_olt_1_f64(ptr addrspace(1) %out, ptr addrs
 ; GFX11-NEXT:    v_min_f64 v[0:1], v[2:3], v[0:1]
 ; GFX11-NEXT:    buffer_store_b64 v[0:1], off, s[8:11], 0
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: test_fmin3_olt_1_f64:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_load_b256 s[0:7], s[4:5], 0x24
+; GFX12-NEXT:    s_mov_b32 s10, -1
+; GFX12-NEXT:    s_mov_b32 s11, 0x31016000
+; GFX12-NEXT:    s_mov_b32 s14, s10
+; GFX12-NEXT:    s_mov_b32 s15, s11
+; GFX12-NEXT:    s_mov_b32 s18, s10
+; GFX12-NEXT:    s_mov_b32 s19, s11
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    s_mov_b32 s12, s2
+; GFX12-NEXT:    s_mov_b32 s13, s3
+; GFX12-NEXT:    s_mov_b32 s16, s4
+; GFX12-NEXT:    s_mov_b32 s17, s5
+; GFX12-NEXT:    buffer_load_b64 v[0:1], off, s[12:15], null scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    buffer_load_b64 v[2:3], off, s[16:19], null scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    s_mov_b32 s12, s6
+; GFX12-NEXT:    s_mov_b32 s13, s7
+; GFX12-NEXT:    s_mov_b32 s8, s0
+; GFX12-NEXT:    buffer_load_b64 v[4:5], off, s[12:15], null scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    s_mov_b32 s9, s1
+; GFX12-NEXT:    v_max_num_f64_e32 v[0:1], v[0:1], v[0:1]
+; GFX12-NEXT:    v_max_num_f64_e32 v[2:3], v[2:3], v[2:3]
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-NEXT:    v_min_num_f64_e32 v[0:1], v[0:1], v[2:3]
+; GFX12-NEXT:    v_max_num_f64_e32 v[2:3], v[4:5], v[4:5]
+; GFX12-NEXT:    v_min_num_f64_e32 v[0:1], v[2:3], v[0:1]
+; GFX12-NEXT:    buffer_store_b64 v[0:1], off, s[8:11], null
+; GFX12-NEXT:    s_endpgm
   %a = load volatile double, ptr addrspace(1) %aptr, align 4
   %b = load volatile double, ptr addrspace(1) %bptr, align 4
   %c = load volatile double, ptr addrspace(1) %cptr, align 4
diff --git a/llvm/test/CodeGen/AMDGPU/fmul-to-ldexp.ll b/llvm/test/CodeGen/AMDGPU/fmul-to-ldexp.ll
index 104e157..9ae60f9 100644
--- a/llvm/test/CodeGen/AMDGPU/fmul-to-ldexp.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmul-to-ldexp.ll
@@ -3307,489 +3307,459 @@ define amdgpu_ps i32 @s_mul_32_f16(half inreg %x, half inreg %y) {
 ; --------------------------------------------------------------------
 
 define float @v_mul_f32_select_64_1(i32 %arg, float %x) {
-; GFX9-SDAG-LABEL: v_mul_f32_select_64_1:
-; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-SDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 6, vcc
-; GFX9-SDAG-NEXT:    v_ldexp_f32 v0, v1, v0
-; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-GISEL-LABEL: v_mul_f32_select_64_1:
-; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0x42800000
-; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v0, 1.0, v2, vcc
-; GFX9-GISEL-NEXT:    v_mul_f32_e32 v0, v1, v0
-; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-LABEL: v_mul_f32_select_64_1:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 6, vcc
+; GFX9-NEXT:    v_ldexp_f32 v0, v1, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-SDAG-LABEL: v_mul_f32_select_64_1:
-; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10-SDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 6, vcc_lo
-; GFX10-SDAG-NEXT:    v_ldexp_f32 v0, v1, v0
-; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX1011-LABEL: v_mul_f32_select_64_1:
+; GFX1011:       ; %bb.0:
+; GFX1011-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1011-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1011-NEXT:    v_cndmask_b32_e64 v0, 0, 6, vcc_lo
+; GFX1011-NEXT:    v_ldexp_f32 v0, v1, v0
+; GFX1011-NEXT:    s_setpc_b64 s[30:31]
+  %cond = icmp eq i32 %arg, 0
+  %select.pow2 = select i1 %cond, float 64.0, float 1.0
+  %mul = fmul float %x, %select.pow2
+  ret float %mul
+}
+
+define float @v_mul_f32_select_1_64(i32 %arg, float %x) {
+; GFX9-LABEL: v_mul_f32_select_1_64:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, 6, 0, vcc
+; GFX9-NEXT:    v_ldexp_f32 v0, v1, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-GISEL-LABEL: v_mul_f32_select_64_1:
-; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, 1.0, 0x42800000, vcc_lo
-; GFX10-GISEL-NEXT:    v_mul_f32_e32 v0, v1, v0
-; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX1011-LABEL: v_mul_f32_select_1_64:
+; GFX1011:       ; %bb.0:
+; GFX1011-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1011-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1011-NEXT:    v_cndmask_b32_e64 v0, 6, 0, vcc_lo
+; GFX1011-NEXT:    v_ldexp_f32 v0, v1, v0
+; GFX1011-NEXT:    s_setpc_b64 s[30:31]
+  %cond = icmp eq i32 %arg, 0
+  %select.pow2 = select i1 %cond, float 1.0, float 64.0
+  %mul = fmul float %x, %select.pow2
+  ret float %mul
+}
+
+define float @v_mul_f32_select_n1_n64(i32 %arg, float %x) {
+; GFX9-LABEL: v_mul_f32_select_n1_n64:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, 6, 0, vcc
+; GFX9-NEXT:    v_ldexp_f32 v0, -v1, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-LABEL: v_mul_f32_select_64_1:
-; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-SDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 6, vcc_lo
-; GFX11-SDAG-NEXT:    v_ldexp_f32 v0, v1, v0
-; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX1011-LABEL: v_mul_f32_select_n1_n64:
+; GFX1011:       ; %bb.0:
+; GFX1011-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1011-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1011-NEXT:    v_cndmask_b32_e64 v0, 6, 0, vcc_lo
+; GFX1011-NEXT:    v_ldexp_f32 v0, -v1, v0
+; GFX1011-NEXT:    s_setpc_b64 s[30:31]
+  %cond = icmp eq i32 %arg, 0
+  %select.pow2 = select i1 %cond, float -1.0, float -64.0
+  %mul = fmul float %x, %select.pow2
+  ret float %mul
+}
+
+define float @v_mul_f32_select_n64_n1(i32 %arg, float %x) {
+; GFX9-LABEL: v_mul_f32_select_n64_n1:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 6, vcc
+; GFX9-NEXT:    v_ldexp_f32 v0, -v1, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-GISEL-LABEL: v_mul_f32_select_64_1:
-; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, 1.0, 0x42800000, vcc_lo
-; GFX11-GISEL-NEXT:    v_mul_f32_e32 v0, v1, v0
-; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX1011-LABEL: v_mul_f32_select_n64_n1:
+; GFX1011:       ; %bb.0:
+; GFX1011-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1011-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1011-NEXT:    v_cndmask_b32_e64 v0, 0, 6, vcc_lo
+; GFX1011-NEXT:    v_ldexp_f32 v0, -v1, v0
+; GFX1011-NEXT:    s_setpc_b64 s[30:31]
   %cond = icmp eq i32 %arg, 0
-  %select.pow2 = select i1 %cond, float 64.0, float 1.0
+  %select.pow2 = select i1 %cond, float -64.0, float -1.0
   %mul = fmul float %x, %select.pow2
   ret float %mul
 }
 
-define float @v_mul_f32_select_1_64(i32 %arg, float %x) {
-; GFX9-SDAG-LABEL: v_mul_f32_select_1_64:
+define float @v_mul_f32_select_128_64(i32 %arg, float %x) {
+; GFX9-SDAG-LABEL: v_mul_f32_select_128_64:
 ; GFX9-SDAG:       ; %bb.0:
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-SDAG-NEXT:    v_cndmask_b32_e64 v0, 6, 0, vcc
+; GFX9-SDAG-NEXT:    v_cndmask_b32_e64 v0, 6, 7, vcc
 ; GFX9-SDAG-NEXT:    v_ldexp_f32 v0, v1, v0
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-GISEL-LABEL: v_mul_f32_select_1_64:
+; GFX9-GISEL-LABEL: v_mul_f32_select_128_64:
 ; GFX9-GISEL:       ; %bb.0:
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0x42800000
 ; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v0, v2, 1.0, vcc
-; GFX9-GISEL-NEXT:    v_mul_f32_e32 v0, v1, v0
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9-GISEL-NEXT:    v_add_u32_e32 v0, 6, v0
+; GFX9-GISEL-NEXT:    v_ldexp_f32 v0, v1, v0
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-SDAG-LABEL: v_mul_f32_select_1_64:
+; GFX10-SDAG-LABEL: v_mul_f32_select_128_64:
 ; GFX10-SDAG:       ; %bb.0:
 ; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10-SDAG-NEXT:    v_cndmask_b32_e64 v0, 6, 0, vcc_lo
+; GFX10-SDAG-NEXT:    v_cndmask_b32_e64 v0, 6, 7, vcc_lo
 ; GFX10-SDAG-NEXT:    v_ldexp_f32 v0, v1, v0
 ; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-GISEL-LABEL: v_mul_f32_select_1_64:
+; GFX10-GISEL-LABEL: v_mul_f32_select_128_64:
 ; GFX10-GISEL:       ; %bb.0:
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0x42800000, 1.0, vcc_lo
-; GFX10-GISEL-NEXT:    v_mul_f32_e32 v0, v1, v0
+; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX10-GISEL-NEXT:    v_add_nc_u32_e32 v0, 6, v0
+; GFX10-GISEL-NEXT:    v_ldexp_f32 v0, v1, v0
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-LABEL: v_mul_f32_select_1_64:
+; GFX11-SDAG-LABEL: v_mul_f32_select_128_64:
 ; GFX11-SDAG:       ; %bb.0:
 ; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-SDAG-NEXT:    v_cndmask_b32_e64 v0, 6, 0, vcc_lo
+; GFX11-SDAG-NEXT:    v_cndmask_b32_e64 v0, 6, 7, vcc_lo
 ; GFX11-SDAG-NEXT:    v_ldexp_f32 v0, v1, v0
 ; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-GISEL-LABEL: v_mul_f32_select_1_64:
+; GFX11-GISEL-LABEL: v_mul_f32_select_128_64:
 ; GFX11-GISEL:       ; %bb.0:
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0x42800000, 1.0, vcc_lo
-; GFX11-GISEL-NEXT:    v_mul_f32_e32 v0, v1, v0
+; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-GISEL-NEXT:    v_add_nc_u32_e32 v0, 6, v0
+; GFX11-GISEL-NEXT:    v_ldexp_f32 v0, v1, v0
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %cond = icmp eq i32 %arg, 0
-  %select.pow2 = select i1 %cond, float 1.0, float 64.0
+  %select.pow2 = select i1 %cond, float 128.0, float 64.0
   %mul = fmul float %x, %select.pow2
   ret float %mul
 }
 
-define float @v_mul_f32_select_n1_n64(i32 %arg, float %x) {
-; GFX9-SDAG-LABEL: v_mul_f32_select_n1_n64:
+define float @v_mul_f32_select_n128_n64(i32 %arg, float %x) {
+; GFX9-SDAG-LABEL: v_mul_f32_select_n128_n64:
 ; GFX9-SDAG:       ; %bb.0:
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-SDAG-NEXT:    v_cndmask_b32_e64 v0, 6, 0, vcc
+; GFX9-SDAG-NEXT:    v_cndmask_b32_e64 v0, 6, 7, vcc
 ; GFX9-SDAG-NEXT:    v_ldexp_f32 v0, -v1, v0
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-GISEL-LABEL: v_mul_f32_select_n1_n64:
+; GFX9-GISEL-LABEL: v_mul_f32_select_n128_n64:
 ; GFX9-GISEL:       ; %bb.0:
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0xc2800000
 ; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v0, v2, -1.0, vcc
-; GFX9-GISEL-NEXT:    v_mul_f32_e32 v0, v1, v0
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9-GISEL-NEXT:    v_add_u32_e32 v0, 6, v0
+; GFX9-GISEL-NEXT:    v_ldexp_f32 v0, -v1, v0
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-SDAG-LABEL: v_mul_f32_select_n1_n64:
+; GFX10-SDAG-LABEL: v_mul_f32_select_n128_n64:
 ; GFX10-SDAG:       ; %bb.0:
 ; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10-SDAG-NEXT:    v_cndmask_b32_e64 v0, 6, 0, vcc_lo
+; GFX10-SDAG-NEXT:    v_cndmask_b32_e64 v0, 6, 7, vcc_lo
 ; GFX10-SDAG-NEXT:    v_ldexp_f32 v0, -v1, v0
 ; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-GISEL-LABEL: v_mul_f32_select_n1_n64:
+; GFX10-GISEL-LABEL: v_mul_f32_select_n128_n64:
 ; GFX10-GISEL:       ; %bb.0:
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0xc2800000, -1.0, vcc_lo
-; GFX10-GISEL-NEXT:    v_mul_f32_e32 v0, v1, v0
+; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX10-GISEL-NEXT:    v_add_nc_u32_e32 v0, 6, v0
+; GFX10-GISEL-NEXT:    v_ldexp_f32 v0, -v1, v0
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-LABEL: v_mul_f32_select_n1_n64:
+; GFX11-SDAG-LABEL: v_mul_f32_select_n128_n64:
 ; GFX11-SDAG:       ; %bb.0:
 ; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-SDAG-NEXT:    v_cndmask_b32_e64 v0, 6, 0, vcc_lo
+; GFX11-SDAG-NEXT:    v_cndmask_b32_e64 v0, 6, 7, vcc_lo
 ; GFX11-SDAG-NEXT:    v_ldexp_f32 v0, -v1, v0
 ; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-GISEL-LABEL: v_mul_f32_select_n1_n64:
+; GFX11-GISEL-LABEL: v_mul_f32_select_n128_n64:
 ; GFX11-GISEL:       ; %bb.0:
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0xc2800000, -1.0, vcc_lo
-; GFX11-GISEL-NEXT:    v_mul_f32_e32 v0, v1, v0
+; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-GISEL-NEXT:    v_add_nc_u32_e32 v0, 6, v0
+; GFX11-GISEL-NEXT:    v_ldexp_f32 v0, -v1, v0
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %cond = icmp eq i32 %arg, 0
-  %select.pow2 = select i1 %cond, float -1.0, float -64.0
+  %select.pow2 = select i1 %cond, float -128.0, float -64.0
   %mul = fmul float %x, %select.pow2
   ret float %mul
 }
 
-define float @v_mul_f32_select_n64_n1(i32 %arg, float %x) {
-; GFX9-SDAG-LABEL: v_mul_f32_select_n64_n1:
+define float @v_mul_f32_select_n128_n16(i32 %arg, float %x) {
+; GFX9-LABEL: v_mul_f32_select_n128_n16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, 4, 7, vcc
+; GFX9-NEXT:    v_ldexp_f32 v0, -v1, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1011-LABEL: v_mul_f32_select_n128_n16:
+; GFX1011:       ; %bb.0:
+; GFX1011-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1011-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1011-NEXT:    v_cndmask_b32_e64 v0, 4, 7, vcc_lo
+; GFX1011-NEXT:    v_ldexp_f32 v0, -v1, v0
+; GFX1011-NEXT:    s_setpc_b64 s[30:31]
+  %cond = icmp eq i32 %arg, 0
+  %select.pow2 = select i1 %cond, float -128.0, float -16.0
+  %mul = fmul float %x, %select.pow2
+  ret float %mul
+}
+
+define float @v_contract_mul_add_f32_select_64_1(i32 %arg, float %x, float %y) {
+; GFX9-SDAG-LABEL: v_contract_mul_add_f32_select_64_1:
 ; GFX9-SDAG:       ; %bb.0:
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v3, 0x42800000
 ; GFX9-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-SDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 6, vcc
-; GFX9-SDAG-NEXT:    v_ldexp_f32 v0, -v1, v0
+; GFX9-SDAG-NEXT:    v_cndmask_b32_e32 v0, 1.0, v3, vcc
+; GFX9-SDAG-NEXT:    v_fma_f32 v0, v1, v0, v2
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-GISEL-LABEL: v_mul_f32_select_n64_n1:
+; GFX9-GISEL-LABEL: v_contract_mul_add_f32_select_64_1:
 ; GFX9-GISEL:       ; %bb.0:
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0xc2800000
 ; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v0, -1.0, v2, vcc
-; GFX9-GISEL-NEXT:    v_mul_f32_e32 v0, v1, v0
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 6, vcc
+; GFX9-GISEL-NEXT:    v_ldexp_f32 v0, v1, v0
+; GFX9-GISEL-NEXT:    v_add_f32_e32 v0, v0, v2
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-SDAG-LABEL: v_mul_f32_select_n64_n1:
+; GFX10-SDAG-LABEL: v_contract_mul_add_f32_select_64_1:
 ; GFX10-SDAG:       ; %bb.0:
 ; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10-SDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 6, vcc_lo
-; GFX10-SDAG-NEXT:    v_ldexp_f32 v0, -v1, v0
+; GFX10-SDAG-NEXT:    v_cndmask_b32_e64 v0, 1.0, 0x42800000, vcc_lo
+; GFX10-SDAG-NEXT:    v_fma_f32 v0, v1, v0, v2
 ; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-GISEL-LABEL: v_mul_f32_select_n64_n1:
+; GFX10-GISEL-LABEL: v_contract_mul_add_f32_select_64_1:
 ; GFX10-GISEL:       ; %bb.0:
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, -1.0, 0xc2800000, vcc_lo
-; GFX10-GISEL-NEXT:    v_mul_f32_e32 v0, v1, v0
+; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 6, vcc_lo
+; GFX10-GISEL-NEXT:    v_ldexp_f32 v0, v1, v0
+; GFX10-GISEL-NEXT:    v_add_f32_e32 v0, v0, v2
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-LABEL: v_mul_f32_select_n64_n1:
+; GFX11-SDAG-LABEL: v_contract_mul_add_f32_select_64_1:
 ; GFX11-SDAG:       ; %bb.0:
 ; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-SDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 6, vcc_lo
-; GFX11-SDAG-NEXT:    v_ldexp_f32 v0, -v1, v0
+; GFX11-SDAG-NEXT:    v_cndmask_b32_e64 v0, 1.0, 0x42800000, vcc_lo
+; GFX11-SDAG-NEXT:    v_fma_f32 v0, v1, v0, v2
 ; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-GISEL-LABEL: v_mul_f32_select_n64_n1:
+; GFX11-GISEL-LABEL: v_contract_mul_add_f32_select_64_1:
 ; GFX11-GISEL:       ; %bb.0:
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, -1.0, 0xc2800000, vcc_lo
-; GFX11-GISEL-NEXT:    v_mul_f32_e32 v0, v1, v0
+; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 6, vcc_lo
+; GFX11-GISEL-NEXT:    v_ldexp_f32 v0, v1, v0
+; GFX11-GISEL-NEXT:    v_add_f32_e32 v0, v0, v2
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %cond = icmp eq i32 %arg, 0
-  %select.pow2 = select i1 %cond, float -64.0, float -1.0
-  %mul = fmul float %x, %select.pow2
-  ret float %mul
+  %select.pow2 = select contract i1 %cond, float 64.0, float 1.0
+  %mul = fmul contract float %x, %select.pow2
+  %fma = fadd contract float %mul, %y
+  ret float %fma
 }
 
-define float @v_mul_f32_select_128_64(i32 %arg, float %x) {
-; GFX9-SDAG-LABEL: v_mul_f32_select_128_64:
+define float @v_contract_mul_add_f32_select_1_64(i32 %arg, float %x, float %y) {
+; GFX9-SDAG-LABEL: v_contract_mul_add_f32_select_1_64:
 ; GFX9-SDAG:       ; %bb.0:
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v3, 0x42800000
 ; GFX9-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-SDAG-NEXT:    v_cndmask_b32_e64 v0, 6, 7, vcc
-; GFX9-SDAG-NEXT:    v_ldexp_f32 v0, v1, v0
+; GFX9-SDAG-NEXT:    v_cndmask_b32_e64 v0, v3, 1.0, vcc
+; GFX9-SDAG-NEXT:    v_fma_f32 v0, v1, v0, v2
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-GISEL-LABEL: v_mul_f32_select_128_64:
+; GFX9-GISEL-LABEL: v_contract_mul_add_f32_select_1_64:
 ; GFX9-GISEL:       ; %bb.0:
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0x43000000
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0x42800000
 ; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
-; GFX9-GISEL-NEXT:    v_mul_f32_e32 v0, v1, v0
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v0, 6, 0, vcc
+; GFX9-GISEL-NEXT:    v_ldexp_f32 v0, v1, v0
+; GFX9-GISEL-NEXT:    v_add_f32_e32 v0, v0, v2
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-SDAG-LABEL: v_mul_f32_select_128_64:
+; GFX10-SDAG-LABEL: v_contract_mul_add_f32_select_1_64:
 ; GFX10-SDAG:       ; %bb.0:
 ; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10-SDAG-NEXT:    v_cndmask_b32_e64 v0, 6, 7, vcc_lo
-; GFX10-SDAG-NEXT:    v_ldexp_f32 v0, v1, v0
+; GFX10-SDAG-NEXT:    v_cndmask_b32_e64 v0, 0x42800000, 1.0, vcc_lo
+; GFX10-SDAG-NEXT:    v_fma_f32 v0, v1, v0, v2
 ; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-GISEL-LABEL: v_mul_f32_select_128_64:
+; GFX10-GISEL-LABEL: v_contract_mul_add_f32_select_1_64:
 ; GFX10-GISEL:       ; %bb.0:
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v2, 0x42800000
 ; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, v2, 0x43000000, vcc_lo
-; GFX10-GISEL-NEXT:    v_mul_f32_e32 v0, v1, v0
+; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, 6, 0, vcc_lo
+; GFX10-GISEL-NEXT:    v_ldexp_f32 v0, v1, v0
+; GFX10-GISEL-NEXT:    v_add_f32_e32 v0, v0, v2
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-LABEL: v_mul_f32_select_128_64:
+; GFX11-SDAG-LABEL: v_contract_mul_add_f32_select_1_64:
 ; GFX11-SDAG:       ; %bb.0:
 ; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-SDAG-NEXT:    v_cndmask_b32_e64 v0, 6, 7, vcc_lo
-; GFX11-SDAG-NEXT:    v_ldexp_f32 v0, v1, v0
+; GFX11-SDAG-NEXT:    v_cndmask_b32_e64 v0, 0x42800000, 1.0, vcc_lo
+; GFX11-SDAG-NEXT:    v_fma_f32 v0, v1, v0, v2
 ; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-GISEL-LABEL: v_mul_f32_select_128_64:
+; GFX11-GISEL-LABEL: v_contract_mul_add_f32_select_1_64:
 ; GFX11-GISEL:       ; %bb.0:
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_mov_b32_e32 v2, 0x42800000
 ; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, v2, 0x43000000, vcc_lo
-; GFX11-GISEL-NEXT:    v_mul_f32_e32 v0, v1, v0
+; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, 6, 0, vcc_lo
+; GFX11-GISEL-NEXT:    v_ldexp_f32 v0, v1, v0
+; GFX11-GISEL-NEXT:    v_add_f32_e32 v0, v0, v2
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %cond = icmp eq i32 %arg, 0
-  %select.pow2 = select i1 %cond, float 128.0, float 64.0
-  %mul = fmul float %x, %select.pow2
-  ret float %mul
+  %select.pow2 = select contract i1 %cond, float 1.0, float 64.0
+  %mul = fmul contract float %x, %select.pow2
+  %fma = fadd contract float %mul, %y
+  ret float %fma
 }
 
-define float @v_mul_f32_select_n128_n64(i32 %arg, float %x) {
-; GFX9-SDAG-LABEL: v_mul_f32_select_n128_n64:
+define float @v_contract_mul_add_f32_select_n64_n1(i32 %arg, float %x, float %y) {
+; GFX9-SDAG-LABEL: v_contract_mul_add_f32_select_n64_n1:
 ; GFX9-SDAG:       ; %bb.0:
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v3, 0xc2800000
 ; GFX9-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-SDAG-NEXT:    v_cndmask_b32_e64 v0, 6, 7, vcc
-; GFX9-SDAG-NEXT:    v_ldexp_f32 v0, -v1, v0
+; GFX9-SDAG-NEXT:    v_cndmask_b32_e32 v0, -1.0, v3, vcc
+; GFX9-SDAG-NEXT:    v_fma_f32 v0, v1, v0, v2
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-GISEL-LABEL: v_mul_f32_select_n128_n64:
+; GFX9-GISEL-LABEL: v_contract_mul_add_f32_select_n64_n1:
 ; GFX9-GISEL:       ; %bb.0:
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0xc3000000
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0xc2800000
 ; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
-; GFX9-GISEL-NEXT:    v_mul_f32_e32 v0, v1, v0
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 6, vcc
+; GFX9-GISEL-NEXT:    v_ldexp_f32 v0, -v1, v0
+; GFX9-GISEL-NEXT:    v_add_f32_e32 v0, v0, v2
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-SDAG-LABEL: v_mul_f32_select_n128_n64:
+; GFX10-SDAG-LABEL: v_contract_mul_add_f32_select_n64_n1:
 ; GFX10-SDAG:       ; %bb.0:
 ; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10-SDAG-NEXT:    v_cndmask_b32_e64 v0, 6, 7, vcc_lo
-; GFX10-SDAG-NEXT:    v_ldexp_f32 v0, -v1, v0
+; GFX10-SDAG-NEXT:    v_cndmask_b32_e64 v0, -1.0, 0xc2800000, vcc_lo
+; GFX10-SDAG-NEXT:    v_fma_f32 v0, v1, v0, v2
 ; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-GISEL-LABEL: v_mul_f32_select_n128_n64:
+; GFX10-GISEL-LABEL: v_contract_mul_add_f32_select_n64_n1:
 ; GFX10-GISEL:       ; %bb.0:
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v2, 0xc2800000
 ; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, v2, 0xc3000000, vcc_lo
-; GFX10-GISEL-NEXT:    v_mul_f32_e32 v0, v1, v0
+; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 6, vcc_lo
+; GFX10-GISEL-NEXT:    v_ldexp_f32 v0, -v1, v0
+; GFX10-GISEL-NEXT:    v_add_f32_e32 v0, v0, v2
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-LABEL: v_mul_f32_select_n128_n64:
+; GFX11-SDAG-LABEL: v_contract_mul_add_f32_select_n64_n1:
 ; GFX11-SDAG:       ; %bb.0:
 ; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-SDAG-NEXT:    v_cndmask_b32_e64 v0, 6, 7, vcc_lo
-; GFX11-SDAG-NEXT:    v_ldexp_f32 v0, -v1, v0
+; GFX11-SDAG-NEXT:    v_cndmask_b32_e64 v0, -1.0, 0xc2800000, vcc_lo
+; GFX11-SDAG-NEXT:    v_fma_f32 v0, v1, v0, v2
 ; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-GISEL-LABEL: v_mul_f32_select_n128_n64:
+; GFX11-GISEL-LABEL: v_contract_mul_add_f32_select_n64_n1:
 ; GFX11-GISEL:       ; %bb.0:
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_mov_b32_e32 v2, 0xc2800000
 ; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, v2, 0xc3000000, vcc_lo
-; GFX11-GISEL-NEXT:    v_mul_f32_e32 v0, v1, v0
+; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 6, vcc_lo
+; GFX11-GISEL-NEXT:    v_ldexp_f32 v0, -v1, v0
+; GFX11-GISEL-NEXT:    v_add_f32_e32 v0, v0, v2
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %cond = icmp eq i32 %arg, 0
-  %select.pow2 = select i1 %cond, float -128.0, float -64.0
-  %mul = fmul float %x, %select.pow2
-  ret float %mul
+  %select.pow2 = select contract i1 %cond, float -64.0, float -1.0
+  %mul = fmul contract float %x, %select.pow2
+  %fma = fadd contract float %mul, %y
+  ret float %fma
 }
 
-define float @v_mul_f32_select_n128_n16(i32 %arg, float %x) {
-; GFX9-SDAG-LABEL: v_mul_f32_select_n128_n16:
+define float @v_contract_mul_add_f32_select_n1_n64(i32 %arg, float %x, float %y) {
+; GFX9-SDAG-LABEL: v_contract_mul_add_f32_select_n1_n64:
 ; GFX9-SDAG:       ; %bb.0:
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v3, 0xc2800000
 ; GFX9-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-SDAG-NEXT:    v_cndmask_b32_e64 v0, 4, 7, vcc
-; GFX9-SDAG-NEXT:    v_ldexp_f32 v0, -v1, v0
+; GFX9-SDAG-NEXT:    v_cndmask_b32_e64 v0, v3, -1.0, vcc
+; GFX9-SDAG-NEXT:    v_fma_f32 v0, v1, v0, v2
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-GISEL-LABEL: v_mul_f32_select_n128_n16:
+; GFX9-GISEL-LABEL: v_contract_mul_add_f32_select_n1_n64:
 ; GFX9-GISEL:       ; %bb.0:
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0xc3000000
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0xc1800000
 ; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
-; GFX9-GISEL-NEXT:    v_mul_f32_e32 v0, v1, v0
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v0, 6, 0, vcc
+; GFX9-GISEL-NEXT:    v_ldexp_f32 v0, -v1, v0
+; GFX9-GISEL-NEXT:    v_add_f32_e32 v0, v0, v2
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-SDAG-LABEL: v_mul_f32_select_n128_n16:
+; GFX10-SDAG-LABEL: v_contract_mul_add_f32_select_n1_n64:
 ; GFX10-SDAG:       ; %bb.0:
 ; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10-SDAG-NEXT:    v_cndmask_b32_e64 v0, 4, 7, vcc_lo
-; GFX10-SDAG-NEXT:    v_ldexp_f32 v0, -v1, v0
+; GFX10-SDAG-NEXT:    v_cndmask_b32_e64 v0, 0xc2800000, -1.0, vcc_lo
+; GFX10-SDAG-NEXT:    v_fma_f32 v0, v1, v0, v2
 ; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-GISEL-LABEL: v_mul_f32_select_n128_n16:
+; GFX10-GISEL-LABEL: v_contract_mul_add_f32_select_n1_n64:
 ; GFX10-GISEL:       ; %bb.0:
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v2, 0xc1800000
 ; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, v2, 0xc3000000, vcc_lo
-; GFX10-GISEL-NEXT:    v_mul_f32_e32 v0, v1, v0
+; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, 6, 0, vcc_lo
+; GFX10-GISEL-NEXT:    v_ldexp_f32 v0, -v1, v0
+; GFX10-GISEL-NEXT:    v_add_f32_e32 v0, v0, v2
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-LABEL: v_mul_f32_select_n128_n16:
+; GFX11-SDAG-LABEL: v_contract_mul_add_f32_select_n1_n64:
 ; GFX11-SDAG:       ; %bb.0:
 ; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-SDAG-NEXT:    v_cndmask_b32_e64 v0, 4, 7, vcc_lo
-; GFX11-SDAG-NEXT:    v_ldexp_f32 v0, -v1, v0
+; GFX11-SDAG-NEXT:    v_cndmask_b32_e64 v0, 0xc2800000, -1.0, vcc_lo
+; GFX11-SDAG-NEXT:    v_fma_f32 v0, v1, v0, v2
 ; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-GISEL-LABEL: v_mul_f32_select_n128_n16:
+; GFX11-GISEL-LABEL: v_contract_mul_add_f32_select_n1_n64:
 ; GFX11-GISEL:       ; %bb.0:
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_mov_b32_e32 v2, 0xc1800000
 ; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, v2, 0xc3000000, vcc_lo
-; GFX11-GISEL-NEXT:    v_mul_f32_e32 v0, v1, v0
+; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, 6, 0, vcc_lo
+; GFX11-GISEL-NEXT:    v_ldexp_f32 v0, -v1, v0
+; GFX11-GISEL-NEXT:    v_add_f32_e32 v0, v0, v2
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %cond = icmp eq i32 %arg, 0
-  %select.pow2 = select i1 %cond, float -128.0, float -16.0
-  %mul = fmul float %x, %select.pow2
-  ret float %mul
-}
-
-define float @v_contract_mul_add_f32_select_64_1(i32 %arg, float %x, float %y) {
-; GFX9-LABEL: v_contract_mul_add_f32_select_64_1:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v3, 0x42800000
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, 1.0, v3, vcc
-; GFX9-NEXT:    v_fma_f32 v0, v1, v0, v2
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX1011-LABEL: v_contract_mul_add_f32_select_64_1:
-; GFX1011:       ; %bb.0:
-; GFX1011-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1011-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1011-NEXT:    v_cndmask_b32_e64 v0, 1.0, 0x42800000, vcc_lo
-; GFX1011-NEXT:    v_fma_f32 v0, v1, v0, v2
-; GFX1011-NEXT:    s_setpc_b64 s[30:31]
-  %cond = icmp eq i32 %arg, 0
-  %select.pow2 = select contract i1 %cond, float 64.0, float 1.0
-  %mul = fmul contract float %x, %select.pow2
-  %fma = fadd contract float %mul, %y
-  ret float %fma
-}
-
-define float @v_contract_mul_add_f32_select_1_64(i32 %arg, float %x, float %y) {
-; GFX9-LABEL: v_contract_mul_add_f32_select_1_64:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v3, 0x42800000
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-NEXT:    v_cndmask_b32_e64 v0, v3, 1.0, vcc
-; GFX9-NEXT:    v_fma_f32 v0, v1, v0, v2
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX1011-LABEL: v_contract_mul_add_f32_select_1_64:
-; GFX1011:       ; %bb.0:
-; GFX1011-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1011-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1011-NEXT:    v_cndmask_b32_e64 v0, 0x42800000, 1.0, vcc_lo
-; GFX1011-NEXT:    v_fma_f32 v0, v1, v0, v2
-; GFX1011-NEXT:    s_setpc_b64 s[30:31]
-  %cond = icmp eq i32 %arg, 0
-  %select.pow2 = select contract i1 %cond, float 1.0, float 64.0
-  %mul = fmul contract float %x, %select.pow2
-  %fma = fadd contract float %mul, %y
-  ret float %fma
-}
-
-define float @v_contract_mul_add_f32_select_n64_n1(i32 %arg, float %x, float %y) {
-; GFX9-LABEL: v_contract_mul_add_f32_select_n64_n1:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v3, 0xc2800000
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, -1.0, v3, vcc
-; GFX9-NEXT:    v_fma_f32 v0, v1, v0, v2
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX1011-LABEL: v_contract_mul_add_f32_select_n64_n1:
-; GFX1011:       ; %bb.0:
-; GFX1011-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1011-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1011-NEXT:    v_cndmask_b32_e64 v0, -1.0, 0xc2800000, vcc_lo
-; GFX1011-NEXT:    v_fma_f32 v0, v1, v0, v2
-; GFX1011-NEXT:    s_setpc_b64 s[30:31]
-  %cond = icmp eq i32 %arg, 0
-  %select.pow2 = select contract i1 %cond, float -64.0, float -1.0
-  %mul = fmul contract float %x, %select.pow2
-  %fma = fadd contract float %mul, %y
-  ret float %fma
-}
-
-define float @v_contract_mul_add_f32_select_n1_n64(i32 %arg, float %x, float %y) {
-; GFX9-LABEL: v_contract_mul_add_f32_select_n1_n64:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v3, 0xc2800000
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-NEXT:    v_cndmask_b32_e64 v0, v3, -1.0, vcc
-; GFX9-NEXT:    v_fma_f32 v0, v1, v0, v2
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX1011-LABEL: v_contract_mul_add_f32_select_n1_n64:
-; GFX1011:       ; %bb.0:
-; GFX1011-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1011-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1011-NEXT:    v_cndmask_b32_e64 v0, 0xc2800000, -1.0, vcc_lo
-; GFX1011-NEXT:    v_fma_f32 v0, v1, v0, v2
-; GFX1011-NEXT:    s_setpc_b64 s[30:31]
-  %cond = icmp eq i32 %arg, 0
   %select.pow2 = select contract i1 %cond, float -1.0, float -64.0
   %mul = fmul contract float %x, %select.pow2
   %fma = fadd contract float %mul, %y
@@ -3810,11 +3780,11 @@ define float @v_contract_mul_add_f32_select_128_64(i32 %arg, float %x, float %y)
 ; GFX9-GISEL-LABEL: v_contract_mul_add_f32_select_128_64:
 ; GFX9-GISEL:       ; %bb.0:
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0x43000000
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v4, 0x42800000
 ; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
-; GFX9-GISEL-NEXT:    v_fma_f32 v0, v1, v0, v2
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9-GISEL-NEXT:    v_add_u32_e32 v0, 6, v0
+; GFX9-GISEL-NEXT:    v_ldexp_f32 v0, v1, v0
+; GFX9-GISEL-NEXT:    v_add_f32_e32 v0, v0, v2
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_contract_mul_add_f32_select_128_64:
@@ -3829,10 +3799,11 @@ define float @v_contract_mul_add_f32_select_128_64(i32 %arg, float %x, float %y)
 ; GFX10-GISEL-LABEL: v_contract_mul_add_f32_select_128_64:
 ; GFX10-GISEL:       ; %bb.0:
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v3, 0x42800000
 ; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, v3, 0x43000000, vcc_lo
-; GFX10-GISEL-NEXT:    v_fma_f32 v0, v1, v0, v2
+; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX10-GISEL-NEXT:    v_add_nc_u32_e32 v0, 6, v0
+; GFX10-GISEL-NEXT:    v_ldexp_f32 v0, v1, v0
+; GFX10-GISEL-NEXT:    v_add_f32_e32 v0, v0, v2
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: v_contract_mul_add_f32_select_128_64:
@@ -3847,10 +3818,11 @@ define float @v_contract_mul_add_f32_select_128_64(i32 %arg, float %x, float %y)
 ; GFX11-GISEL-LABEL: v_contract_mul_add_f32_select_128_64:
 ; GFX11-GISEL:       ; %bb.0:
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_mov_b32_e32 v3, 0x42800000
 ; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, v3, 0x43000000, vcc_lo
-; GFX11-GISEL-NEXT:    v_fma_f32 v0, v1, v0, v2
+; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-GISEL-NEXT:    v_add_nc_u32_e32 v0, 6, v0
+; GFX11-GISEL-NEXT:    v_ldexp_f32 v0, v1, v0
+; GFX11-GISEL-NEXT:    v_add_f32_e32 v0, v0, v2
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %cond = icmp eq i32 %arg, 0
   %select.pow2 = select i1 %cond, float 128.0, float 64.0
@@ -3860,22 +3832,57 @@ define float @v_contract_mul_add_f32_select_128_64(i32 %arg, float %x, float %y)
 }
 
 define float @v_contract_mul_add_f32_select_128_4(i32 %arg, float %x, float %y) {
-; GFX9-LABEL: v_contract_mul_add_f32_select_128_4:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v3, 0x43000000
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, 4.0, v3, vcc
-; GFX9-NEXT:    v_fma_f32 v0, v1, v0, v2
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-SDAG-LABEL: v_contract_mul_add_f32_select_128_4:
+; GFX9-SDAG:       ; %bb.0:
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v3, 0x43000000
+; GFX9-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-SDAG-NEXT:    v_cndmask_b32_e32 v0, 4.0, v3, vcc
+; GFX9-SDAG-NEXT:    v_fma_f32 v0, v1, v0, v2
+; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX1011-LABEL: v_contract_mul_add_f32_select_128_4:
-; GFX1011:       ; %bb.0:
-; GFX1011-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1011-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1011-NEXT:    v_cndmask_b32_e64 v0, 4.0, 0x43000000, vcc_lo
-; GFX1011-NEXT:    v_fma_f32 v0, v1, v0, v2
-; GFX1011-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-GISEL-LABEL: v_contract_mul_add_f32_select_128_4:
+; GFX9-GISEL:       ; %bb.0:
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v0, 2, 7, vcc
+; GFX9-GISEL-NEXT:    v_ldexp_f32 v0, v1, v0
+; GFX9-GISEL-NEXT:    v_add_f32_e32 v0, v0, v2
+; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-SDAG-LABEL: v_contract_mul_add_f32_select_128_4:
+; GFX10-SDAG:       ; %bb.0:
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX10-SDAG-NEXT:    v_cndmask_b32_e64 v0, 4.0, 0x43000000, vcc_lo
+; GFX10-SDAG-NEXT:    v_fma_f32 v0, v1, v0, v2
+; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-GISEL-LABEL: v_contract_mul_add_f32_select_128_4:
+; GFX10-GISEL:       ; %bb.0:
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, 2, 7, vcc_lo
+; GFX10-GISEL-NEXT:    v_ldexp_f32 v0, v1, v0
+; GFX10-GISEL-NEXT:    v_add_f32_e32 v0, v0, v2
+; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: v_contract_mul_add_f32_select_128_4:
+; GFX11-SDAG:       ; %bb.0:
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX11-SDAG-NEXT:    v_cndmask_b32_e64 v0, 4.0, 0x43000000, vcc_lo
+; GFX11-SDAG-NEXT:    v_fma_f32 v0, v1, v0, v2
+; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: v_contract_mul_add_f32_select_128_4:
+; GFX11-GISEL:       ; %bb.0:
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, 2, 7, vcc_lo
+; GFX11-GISEL-NEXT:    v_ldexp_f32 v0, v1, v0
+; GFX11-GISEL-NEXT:    v_add_f32_e32 v0, v0, v2
+; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %cond = icmp eq i32 %arg, 0
   %select.pow2 = select i1 %cond, float 128.0, float 4.0
   %mul = fmul contract float %x, %select.pow2
@@ -3907,143 +3914,102 @@ define float @v_contract_mul_add_f32_select_2_4(i32 %arg, float %x, float %y) {
 }
 
 define float @v_contract_mul_add_f32_select_4_128(i32 %arg, float %x, float %y) {
-; GFX9-LABEL: v_contract_mul_add_f32_select_4_128:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v3, 0x43000000
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-NEXT:    v_cndmask_b32_e64 v0, v3, 4.0, vcc
-; GFX9-NEXT:    v_fma_f32 v0, v1, v0, v2
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX1011-LABEL: v_contract_mul_add_f32_select_4_128:
-; GFX1011:       ; %bb.0:
-; GFX1011-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1011-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1011-NEXT:    v_cndmask_b32_e64 v0, 0x43000000, 4.0, vcc_lo
-; GFX1011-NEXT:    v_fma_f32 v0, v1, v0, v2
-; GFX1011-NEXT:    s_setpc_b64 s[30:31]
-  %cond = icmp eq i32 %arg, 0
-  %select.pow2 = select i1 %cond, float 4.0, float 128.0
-  %mul = fmul contract float %x, %select.pow2
-  %fma = fadd contract float %mul, %y
-  ret float %fma
-}
-
-define double @v_mul_f64_select_64_1(i32 %arg, double %x) {
-; GFX9-SDAG-LABEL: v_mul_f64_select_64_1:
+; GFX9-SDAG-LABEL: v_contract_mul_add_f32_select_4_128:
 ; GFX9-SDAG:       ; %bb.0:
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v3, 0x43000000
 ; GFX9-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-SDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 6, vcc
-; GFX9-SDAG-NEXT:    v_ldexp_f64 v[0:1], v[1:2], v0
+; GFX9-SDAG-NEXT:    v_cndmask_b32_e64 v0, v3, 4.0, vcc
+; GFX9-SDAG-NEXT:    v_fma_f32 v0, v1, v0, v2
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-GISEL-LABEL: v_mul_f64_select_64_1:
+; GFX9-GISEL-LABEL: v_contract_mul_add_f32_select_4_128:
 ; GFX9-GISEL:       ; %bb.0:
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v4, 0x40500000
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v5, 0x3ff00000
 ; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0
-; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v4, v5, v4, vcc
-; GFX9-GISEL-NEXT:    v_mul_f64 v[0:1], v[1:2], v[3:4]
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v0, 7, 2, vcc
+; GFX9-GISEL-NEXT:    v_ldexp_f32 v0, v1, v0
+; GFX9-GISEL-NEXT:    v_add_f32_e32 v0, v0, v2
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-SDAG-LABEL: v_mul_f64_select_64_1:
+; GFX10-SDAG-LABEL: v_contract_mul_add_f32_select_4_128:
 ; GFX10-SDAG:       ; %bb.0:
 ; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10-SDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 6, vcc_lo
-; GFX10-SDAG-NEXT:    v_ldexp_f64 v[0:1], v[1:2], v0
+; GFX10-SDAG-NEXT:    v_cndmask_b32_e64 v0, 0x43000000, 4.0, vcc_lo
+; GFX10-SDAG-NEXT:    v_fma_f32 v0, v1, v0, v2
 ; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-GISEL-LABEL: v_mul_f64_select_64_1:
+; GFX10-GISEL-LABEL: v_contract_mul_add_f32_select_4_128:
 ; GFX10-GISEL:       ; %bb.0:
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v4, 0x3ff00000
 ; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v3, 0
-; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v4, v4, 0x40500000, vcc_lo
-; GFX10-GISEL-NEXT:    v_mul_f64 v[0:1], v[1:2], v[3:4]
+; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, 7, 2, vcc_lo
+; GFX10-GISEL-NEXT:    v_ldexp_f32 v0, v1, v0
+; GFX10-GISEL-NEXT:    v_add_f32_e32 v0, v0, v2
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-LABEL: v_mul_f64_select_64_1:
+; GFX11-SDAG-LABEL: v_contract_mul_add_f32_select_4_128:
 ; GFX11-SDAG:       ; %bb.0:
 ; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-SDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 6, vcc_lo
-; GFX11-SDAG-NEXT:    v_ldexp_f64 v[0:1], v[1:2], v0
+; GFX11-SDAG-NEXT:    v_cndmask_b32_e64 v0, 0x43000000, 4.0, vcc_lo
+; GFX11-SDAG-NEXT:    v_fma_f32 v0, v1, v0, v2
 ; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-GISEL-LABEL: v_mul_f64_select_64_1:
+; GFX11-GISEL-LABEL: v_contract_mul_add_f32_select_4_128:
 ; GFX11-GISEL:       ; %bb.0:
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_dual_mov_b32 v4, 0x3ff00000 :: v_dual_mov_b32 v3, 0
 ; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v4, v4, 0x40500000, vcc_lo
-; GFX11-GISEL-NEXT:    v_mul_f64 v[0:1], v[1:2], v[3:4]
+; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, 7, 2, vcc_lo
+; GFX11-GISEL-NEXT:    v_ldexp_f32 v0, v1, v0
+; GFX11-GISEL-NEXT:    v_add_f32_e32 v0, v0, v2
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %cond = icmp eq i32 %arg, 0
+  %select.pow2 = select i1 %cond, float 4.0, float 128.0
+  %mul = fmul contract float %x, %select.pow2
+  %fma = fadd contract float %mul, %y
+  ret float %fma
+}
+
+define double @v_mul_f64_select_64_1(i32 %arg, double %x) {
+; GFX9-LABEL: v_mul_f64_select_64_1:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 6, vcc
+; GFX9-NEXT:    v_ldexp_f64 v[0:1], v[1:2], v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1011-LABEL: v_mul_f64_select_64_1:
+; GFX1011:       ; %bb.0:
+; GFX1011-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1011-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1011-NEXT:    v_cndmask_b32_e64 v0, 0, 6, vcc_lo
+; GFX1011-NEXT:    v_ldexp_f64 v[0:1], v[1:2], v0
+; GFX1011-NEXT:    s_setpc_b64 s[30:31]
+  %cond = icmp eq i32 %arg, 0
   %select.pow2 = select i1 %cond, double 64.0, double 1.0
   %mul = fmul double %x, %select.pow2
   ret double %mul
 }
 
 define double @v_mul_f64_select_1_64(i32 %arg, double %x) {
-; GFX9-SDAG-LABEL: v_mul_f64_select_1_64:
-; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-SDAG-NEXT:    v_cndmask_b32_e64 v0, 6, 0, vcc
-; GFX9-SDAG-NEXT:    v_ldexp_f64 v[0:1], v[1:2], v0
-; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-GISEL-LABEL: v_mul_f64_select_1_64:
-; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v4, 0x3ff00000
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v5, 0x40500000
-; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0
-; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v4, v5, v4, vcc
-; GFX9-GISEL-NEXT:    v_mul_f64 v[0:1], v[1:2], v[3:4]
-; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-SDAG-LABEL: v_mul_f64_select_1_64:
-; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10-SDAG-NEXT:    v_cndmask_b32_e64 v0, 6, 0, vcc_lo
-; GFX10-SDAG-NEXT:    v_ldexp_f64 v[0:1], v[1:2], v0
-; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-GISEL-LABEL: v_mul_f64_select_1_64:
-; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v4, 0x40500000
-; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v3, 0
-; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v4, v4, 0x3ff00000, vcc_lo
-; GFX10-GISEL-NEXT:    v_mul_f64 v[0:1], v[1:2], v[3:4]
-; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-SDAG-LABEL: v_mul_f64_select_1_64:
-; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-SDAG-NEXT:    v_cndmask_b32_e64 v0, 6, 0, vcc_lo
-; GFX11-SDAG-NEXT:    v_ldexp_f64 v[0:1], v[1:2], v0
-; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-LABEL: v_mul_f64_select_1_64:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, 6, 0, vcc
+; GFX9-NEXT:    v_ldexp_f64 v[0:1], v[1:2], v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-GISEL-LABEL: v_mul_f64_select_1_64:
-; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_dual_mov_b32 v4, 0x40500000 :: v_dual_mov_b32 v3, 0
-; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v4, v4, 0x3ff00000, vcc_lo
-; GFX11-GISEL-NEXT:    v_mul_f64 v[0:1], v[1:2], v[3:4]
-; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX1011-LABEL: v_mul_f64_select_1_64:
+; GFX1011:       ; %bb.0:
+; GFX1011-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1011-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1011-NEXT:    v_cndmask_b32_e64 v0, 6, 0, vcc_lo
+; GFX1011-NEXT:    v_ldexp_f64 v[0:1], v[1:2], v0
+; GFX1011-NEXT:    s_setpc_b64 s[30:31]
   %cond = icmp eq i32 %arg, 0
   %select.pow2 = select i1 %cond, double 1.0, double 64.0
   %mul = fmul double %x, %select.pow2
@@ -4051,59 +4017,21 @@ define double @v_mul_f64_select_1_64(i32 %arg, double %x) {
 }
 
 define double @v_mul_f64_select_n1_n64(i32 %arg, double %x) {
-; GFX9-SDAG-LABEL: v_mul_f64_select_n1_n64:
-; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-SDAG-NEXT:    v_cndmask_b32_e64 v0, 6, 0, vcc
-; GFX9-SDAG-NEXT:    v_ldexp_f64 v[0:1], -v[1:2], v0
-; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-GISEL-LABEL: v_mul_f64_select_n1_n64:
-; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v4, 0xbff00000
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v5, 0xc0500000
-; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0
-; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v4, v5, v4, vcc
-; GFX9-GISEL-NEXT:    v_mul_f64 v[0:1], v[1:2], v[3:4]
-; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-SDAG-LABEL: v_mul_f64_select_n1_n64:
-; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10-SDAG-NEXT:    v_cndmask_b32_e64 v0, 6, 0, vcc_lo
-; GFX10-SDAG-NEXT:    v_ldexp_f64 v[0:1], -v[1:2], v0
-; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-GISEL-LABEL: v_mul_f64_select_n1_n64:
-; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v4, 0xc0500000
-; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v3, 0
-; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v4, v4, 0xbff00000, vcc_lo
-; GFX10-GISEL-NEXT:    v_mul_f64 v[0:1], v[1:2], v[3:4]
-; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-SDAG-LABEL: v_mul_f64_select_n1_n64:
-; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-SDAG-NEXT:    v_cndmask_b32_e64 v0, 6, 0, vcc_lo
-; GFX11-SDAG-NEXT:    v_ldexp_f64 v[0:1], -v[1:2], v0
-; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-LABEL: v_mul_f64_select_n1_n64:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, 6, 0, vcc
+; GFX9-NEXT:    v_ldexp_f64 v[0:1], -v[1:2], v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-GISEL-LABEL: v_mul_f64_select_n1_n64:
-; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_dual_mov_b32 v4, 0xc0500000 :: v_dual_mov_b32 v3, 0
-; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v4, v4, 0xbff00000, vcc_lo
-; GFX11-GISEL-NEXT:    v_mul_f64 v[0:1], v[1:2], v[3:4]
-; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX1011-LABEL: v_mul_f64_select_n1_n64:
+; GFX1011:       ; %bb.0:
+; GFX1011-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1011-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1011-NEXT:    v_cndmask_b32_e64 v0, 6, 0, vcc_lo
+; GFX1011-NEXT:    v_ldexp_f64 v[0:1], -v[1:2], v0
+; GFX1011-NEXT:    s_setpc_b64 s[30:31]
   %cond = icmp eq i32 %arg, 0
   %select.pow2 = select i1 %cond, double -1.0, double -64.0
   %mul = fmul double %x, %select.pow2
@@ -4122,12 +4050,10 @@ define double @v_mul_f64_select_128_64(i32 %arg, double %x) {
 ; GFX9-GISEL-LABEL: v_mul_f64_select_128_64:
 ; GFX9-GISEL:       ; %bb.0:
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v4, 0x40600000
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v5, 0x40500000
 ; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0
-; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v4, v5, v4, vcc
-; GFX9-GISEL-NEXT:    v_mul_f64 v[0:1], v[1:2], v[3:4]
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9-GISEL-NEXT:    v_add_u32_e32 v0, 6, v0
+; GFX9-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[1:2], v0
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_mul_f64_select_128_64:
@@ -4141,11 +4067,10 @@ define double @v_mul_f64_select_128_64(i32 %arg, double %x) {
 ; GFX10-GISEL-LABEL: v_mul_f64_select_128_64:
 ; GFX10-GISEL:       ; %bb.0:
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v4, 0x40500000
 ; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v3, 0
-; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v4, v4, 0x40600000, vcc_lo
-; GFX10-GISEL-NEXT:    v_mul_f64 v[0:1], v[1:2], v[3:4]
+; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX10-GISEL-NEXT:    v_add_nc_u32_e32 v0, 6, v0
+; GFX10-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[1:2], v0
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: v_mul_f64_select_128_64:
@@ -4159,10 +4084,10 @@ define double @v_mul_f64_select_128_64(i32 %arg, double %x) {
 ; GFX11-GISEL-LABEL: v_mul_f64_select_128_64:
 ; GFX11-GISEL:       ; %bb.0:
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_dual_mov_b32 v4, 0x40500000 :: v_dual_mov_b32 v3, 0
 ; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v4, v4, 0x40600000, vcc_lo
-; GFX11-GISEL-NEXT:    v_mul_f64 v[0:1], v[1:2], v[3:4]
+; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-GISEL-NEXT:    v_add_nc_u32_e32 v0, 6, v0
+; GFX11-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[1:2], v0
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %cond = icmp eq i32 %arg, 0
   %select.pow2 = select i1 %cond, double 128.0, double 64.0
@@ -4182,12 +4107,10 @@ define double @v_mul_f64_select_n128_n64(i32 %arg, double %x) {
 ; GFX9-GISEL-LABEL: v_mul_f64_select_n128_n64:
 ; GFX9-GISEL:       ; %bb.0:
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v4, 0xc0600000
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v5, 0xc0500000
 ; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0
-; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v4, v5, v4, vcc
-; GFX9-GISEL-NEXT:    v_mul_f64 v[0:1], v[1:2], v[3:4]
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9-GISEL-NEXT:    v_add_u32_e32 v0, 6, v0
+; GFX9-GISEL-NEXT:    v_ldexp_f64 v[0:1], -v[1:2], v0
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_mul_f64_select_n128_n64:
@@ -4201,11 +4124,10 @@ define double @v_mul_f64_select_n128_n64(i32 %arg, double %x) {
 ; GFX10-GISEL-LABEL: v_mul_f64_select_n128_n64:
 ; GFX10-GISEL:       ; %bb.0:
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v4, 0xc0500000
 ; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v3, 0
-; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v4, v4, 0xc0600000, vcc_lo
-; GFX10-GISEL-NEXT:    v_mul_f64 v[0:1], v[1:2], v[3:4]
+; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX10-GISEL-NEXT:    v_add_nc_u32_e32 v0, 6, v0
+; GFX10-GISEL-NEXT:    v_ldexp_f64 v[0:1], -v[1:2], v0
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: v_mul_f64_select_n128_n64:
@@ -4219,10 +4141,10 @@ define double @v_mul_f64_select_n128_n64(i32 %arg, double %x) {
 ; GFX11-GISEL-LABEL: v_mul_f64_select_n128_n64:
 ; GFX11-GISEL:       ; %bb.0:
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_dual_mov_b32 v4, 0xc0500000 :: v_dual_mov_b32 v3, 0
 ; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v4, v4, 0xc0600000, vcc_lo
-; GFX11-GISEL-NEXT:    v_mul_f64 v[0:1], v[1:2], v[3:4]
+; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-GISEL-NEXT:    v_add_nc_u32_e32 v0, 6, v0
+; GFX11-GISEL-NEXT:    v_ldexp_f64 v[0:1], -v[1:2], v0
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %cond = icmp eq i32 %arg, 0
   %select.pow2 = select i1 %cond, double -128.0, double -64.0
@@ -4231,59 +4153,21 @@ define double @v_mul_f64_select_n128_n64(i32 %arg, double %x) {
 }
 
 define double @v_mul_f64_select_n128_n16(i32 %arg, double %x) {
-; GFX9-SDAG-LABEL: v_mul_f64_select_n128_n16:
-; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-SDAG-NEXT:    v_cndmask_b32_e64 v0, 4, 7, vcc
-; GFX9-SDAG-NEXT:    v_ldexp_f64 v[0:1], -v[1:2], v0
-; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-GISEL-LABEL: v_mul_f64_select_n128_n16:
-; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v4, 0xc0600000
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v5, 0xc0300000
-; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0
-; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v4, v5, v4, vcc
-; GFX9-GISEL-NEXT:    v_mul_f64 v[0:1], v[1:2], v[3:4]
-; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-SDAG-LABEL: v_mul_f64_select_n128_n16:
-; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10-SDAG-NEXT:    v_cndmask_b32_e64 v0, 4, 7, vcc_lo
-; GFX10-SDAG-NEXT:    v_ldexp_f64 v[0:1], -v[1:2], v0
-; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-GISEL-LABEL: v_mul_f64_select_n128_n16:
-; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v4, 0xc0300000
-; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v3, 0
-; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v4, v4, 0xc0600000, vcc_lo
-; GFX10-GISEL-NEXT:    v_mul_f64 v[0:1], v[1:2], v[3:4]
-; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-SDAG-LABEL: v_mul_f64_select_n128_n16:
-; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-SDAG-NEXT:    v_cndmask_b32_e64 v0, 4, 7, vcc_lo
-; GFX11-SDAG-NEXT:    v_ldexp_f64 v[0:1], -v[1:2], v0
-; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-LABEL: v_mul_f64_select_n128_n16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, 4, 7, vcc
+; GFX9-NEXT:    v_ldexp_f64 v[0:1], -v[1:2], v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-GISEL-LABEL: v_mul_f64_select_n128_n16:
-; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_dual_mov_b32 v4, 0xc0300000 :: v_dual_mov_b32 v3, 0
-; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v4, v4, 0xc0600000, vcc_lo
-; GFX11-GISEL-NEXT:    v_mul_f64 v[0:1], v[1:2], v[3:4]
-; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX1011-LABEL: v_mul_f64_select_n128_n16:
+; GFX1011:       ; %bb.0:
+; GFX1011-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1011-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1011-NEXT:    v_cndmask_b32_e64 v0, 4, 7, vcc_lo
+; GFX1011-NEXT:    v_ldexp_f64 v[0:1], -v[1:2], v0
+; GFX1011-NEXT:    s_setpc_b64 s[30:31]
   %cond = icmp eq i32 %arg, 0
   %select.pow2 = select i1 %cond, double -128.0, double -16.0
   %mul = fmul double %x, %select.pow2
@@ -4305,12 +4189,10 @@ define double @v_contract_mul_add_f64_select_64_1(i32 %arg, double %x, double %y
 ; GFX9-GISEL-LABEL: v_contract_mul_add_f64_select_64_1:
 ; GFX9-GISEL:       ; %bb.0:
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v6, 0x40500000
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v7, 0x3ff00000
 ; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v5, 0
-; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v6, v7, v6, vcc
-; GFX9-GISEL-NEXT:    v_fma_f64 v[0:1], v[1:2], v[5:6], v[3:4]
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 6, vcc
+; GFX9-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[1:2], v0
+; GFX9-GISEL-NEXT:    v_add_f64 v[0:1], v[0:1], v[3:4]
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_contract_mul_add_f64_select_64_1:
@@ -4326,11 +4208,10 @@ define double @v_contract_mul_add_f64_select_64_1(i32 %arg, double %x, double %y
 ; GFX10-GISEL-LABEL: v_contract_mul_add_f64_select_64_1:
 ; GFX10-GISEL:       ; %bb.0:
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v6, 0x3ff00000
 ; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v5, 0
-; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v6, v6, 0x40500000, vcc_lo
-; GFX10-GISEL-NEXT:    v_fma_f64 v[0:1], v[1:2], v[5:6], v[3:4]
+; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 6, vcc_lo
+; GFX10-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[1:2], v0
+; GFX10-GISEL-NEXT:    v_add_f64 v[0:1], v[0:1], v[3:4]
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: v_contract_mul_add_f64_select_64_1:
@@ -4345,10 +4226,10 @@ define double @v_contract_mul_add_f64_select_64_1(i32 %arg, double %x, double %y
 ; GFX11-GISEL-LABEL: v_contract_mul_add_f64_select_64_1:
 ; GFX11-GISEL:       ; %bb.0:
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_dual_mov_b32 v6, 0x3ff00000 :: v_dual_mov_b32 v5, 0
 ; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v6, v6, 0x40500000, vcc_lo
-; GFX11-GISEL-NEXT:    v_fma_f64 v[0:1], v[1:2], v[5:6], v[3:4]
+; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 6, vcc_lo
+; GFX11-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[1:2], v0
+; GFX11-GISEL-NEXT:    v_add_f64 v[0:1], v[0:1], v[3:4]
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %cond = icmp eq i32 %arg, 0
   %select.pow2 = select contract i1 %cond, double 64.0, double 1.0
@@ -4372,12 +4253,10 @@ define double @v_contract_mul_add_f64_select_1_64(i32 %arg, double %x, double %y
 ; GFX9-GISEL-LABEL: v_contract_mul_add_f64_select_1_64:
 ; GFX9-GISEL:       ; %bb.0:
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v6, 0x3ff00000
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v7, 0x40500000
 ; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v5, 0
-; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v6, v7, v6, vcc
-; GFX9-GISEL-NEXT:    v_fma_f64 v[0:1], v[1:2], v[5:6], v[3:4]
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v0, 6, 0, vcc
+; GFX9-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[1:2], v0
+; GFX9-GISEL-NEXT:    v_add_f64 v[0:1], v[0:1], v[3:4]
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_contract_mul_add_f64_select_1_64:
@@ -4393,11 +4272,10 @@ define double @v_contract_mul_add_f64_select_1_64(i32 %arg, double %x, double %y
 ; GFX10-GISEL-LABEL: v_contract_mul_add_f64_select_1_64:
 ; GFX10-GISEL:       ; %bb.0:
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v6, 0x40500000
 ; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v5, 0
-; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v6, v6, 0x3ff00000, vcc_lo
-; GFX10-GISEL-NEXT:    v_fma_f64 v[0:1], v[1:2], v[5:6], v[3:4]
+; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, 6, 0, vcc_lo
+; GFX10-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[1:2], v0
+; GFX10-GISEL-NEXT:    v_add_f64 v[0:1], v[0:1], v[3:4]
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: v_contract_mul_add_f64_select_1_64:
@@ -4412,10 +4290,10 @@ define double @v_contract_mul_add_f64_select_1_64(i32 %arg, double %x, double %y
 ; GFX11-GISEL-LABEL: v_contract_mul_add_f64_select_1_64:
 ; GFX11-GISEL:       ; %bb.0:
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_dual_mov_b32 v6, 0x40500000 :: v_dual_mov_b32 v5, 0
 ; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v6, v6, 0x3ff00000, vcc_lo
-; GFX11-GISEL-NEXT:    v_fma_f64 v[0:1], v[1:2], v[5:6], v[3:4]
+; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, 6, 0, vcc_lo
+; GFX11-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[1:2], v0
+; GFX11-GISEL-NEXT:    v_add_f64 v[0:1], v[0:1], v[3:4]
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %cond = icmp eq i32 %arg, 0
   %select.pow2 = select contract i1 %cond, double 1.0, double 64.0
@@ -4439,12 +4317,10 @@ define double @v_contract_mul_add_f64_select_n64_n1(i32 %arg, double %x, double
 ; GFX9-GISEL-LABEL: v_contract_mul_add_f64_select_n64_n1:
 ; GFX9-GISEL:       ; %bb.0:
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v6, 0xc0500000
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v7, 0xbff00000
 ; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v5, 0
-; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v6, v7, v6, vcc
-; GFX9-GISEL-NEXT:    v_fma_f64 v[0:1], v[1:2], v[5:6], v[3:4]
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 6, vcc
+; GFX9-GISEL-NEXT:    v_ldexp_f64 v[0:1], -v[1:2], v0
+; GFX9-GISEL-NEXT:    v_add_f64 v[0:1], v[0:1], v[3:4]
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_contract_mul_add_f64_select_n64_n1:
@@ -4460,11 +4336,10 @@ define double @v_contract_mul_add_f64_select_n64_n1(i32 %arg, double %x, double
 ; GFX10-GISEL-LABEL: v_contract_mul_add_f64_select_n64_n1:
 ; GFX10-GISEL:       ; %bb.0:
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v6, 0xbff00000
 ; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v5, 0
-; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v6, v6, 0xc0500000, vcc_lo
-; GFX10-GISEL-NEXT:    v_fma_f64 v[0:1], v[1:2], v[5:6], v[3:4]
+; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 6, vcc_lo
+; GFX10-GISEL-NEXT:    v_ldexp_f64 v[0:1], -v[1:2], v0
+; GFX10-GISEL-NEXT:    v_add_f64 v[0:1], v[0:1], v[3:4]
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: v_contract_mul_add_f64_select_n64_n1:
@@ -4479,10 +4354,10 @@ define double @v_contract_mul_add_f64_select_n64_n1(i32 %arg, double %x, double
 ; GFX11-GISEL-LABEL: v_contract_mul_add_f64_select_n64_n1:
 ; GFX11-GISEL:       ; %bb.0:
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_dual_mov_b32 v6, 0xbff00000 :: v_dual_mov_b32 v5, 0
 ; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v6, v6, 0xc0500000, vcc_lo
-; GFX11-GISEL-NEXT:    v_fma_f64 v[0:1], v[1:2], v[5:6], v[3:4]
+; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 6, vcc_lo
+; GFX11-GISEL-NEXT:    v_ldexp_f64 v[0:1], -v[1:2], v0
+; GFX11-GISEL-NEXT:    v_add_f64 v[0:1], v[0:1], v[3:4]
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %cond = icmp eq i32 %arg, 0
   %select.pow2 = select contract i1 %cond, double -64.0, double -1.0
@@ -4506,12 +4381,10 @@ define double @v_contract_mul_add_f64_select_n1_n64(i32 %arg, double %x, double
 ; GFX9-GISEL-LABEL: v_contract_mul_add_f64_select_n1_n64:
 ; GFX9-GISEL:       ; %bb.0:
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v6, 0xbff00000
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v7, 0xc0500000
 ; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v5, 0
-; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v6, v7, v6, vcc
-; GFX9-GISEL-NEXT:    v_fma_f64 v[0:1], v[1:2], v[5:6], v[3:4]
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v0, 6, 0, vcc
+; GFX9-GISEL-NEXT:    v_ldexp_f64 v[0:1], -v[1:2], v0
+; GFX9-GISEL-NEXT:    v_add_f64 v[0:1], v[0:1], v[3:4]
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_contract_mul_add_f64_select_n1_n64:
@@ -4527,11 +4400,10 @@ define double @v_contract_mul_add_f64_select_n1_n64(i32 %arg, double %x, double
 ; GFX10-GISEL-LABEL: v_contract_mul_add_f64_select_n1_n64:
 ; GFX10-GISEL:       ; %bb.0:
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v6, 0xc0500000
 ; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v5, 0
-; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v6, v6, 0xbff00000, vcc_lo
-; GFX10-GISEL-NEXT:    v_fma_f64 v[0:1], v[1:2], v[5:6], v[3:4]
+; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, 6, 0, vcc_lo
+; GFX10-GISEL-NEXT:    v_ldexp_f64 v[0:1], -v[1:2], v0
+; GFX10-GISEL-NEXT:    v_add_f64 v[0:1], v[0:1], v[3:4]
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: v_contract_mul_add_f64_select_n1_n64:
@@ -4546,10 +4418,10 @@ define double @v_contract_mul_add_f64_select_n1_n64(i32 %arg, double %x, double
 ; GFX11-GISEL-LABEL: v_contract_mul_add_f64_select_n1_n64:
 ; GFX11-GISEL:       ; %bb.0:
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_dual_mov_b32 v6, 0xc0500000 :: v_dual_mov_b32 v5, 0
 ; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v6, v6, 0xbff00000, vcc_lo
-; GFX11-GISEL-NEXT:    v_fma_f64 v[0:1], v[1:2], v[5:6], v[3:4]
+; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, 6, 0, vcc_lo
+; GFX11-GISEL-NEXT:    v_ldexp_f64 v[0:1], -v[1:2], v0
+; GFX11-GISEL-NEXT:    v_add_f64 v[0:1], v[0:1], v[3:4]
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %cond = icmp eq i32 %arg, 0
   %select.pow2 = select contract i1 %cond, double -1.0, double -64.0
@@ -4573,12 +4445,11 @@ define double @v_contract_mul_add_f64_select_128_64(i32 %arg, double %x, double
 ; GFX9-GISEL-LABEL: v_contract_mul_add_f64_select_128_64:
 ; GFX9-GISEL:       ; %bb.0:
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v6, 0x40600000
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v7, 0x40500000
 ; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v5, 0
-; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v6, v7, v6, vcc
-; GFX9-GISEL-NEXT:    v_fma_f64 v[0:1], v[1:2], v[5:6], v[3:4]
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9-GISEL-NEXT:    v_add_u32_e32 v0, 6, v0
+; GFX9-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[1:2], v0
+; GFX9-GISEL-NEXT:    v_add_f64 v[0:1], v[0:1], v[3:4]
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_contract_mul_add_f64_select_128_64:
@@ -4594,11 +4465,11 @@ define double @v_contract_mul_add_f64_select_128_64(i32 %arg, double %x, double
 ; GFX10-GISEL-LABEL: v_contract_mul_add_f64_select_128_64:
 ; GFX10-GISEL:       ; %bb.0:
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v6, 0x40500000
 ; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v5, 0
-; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v6, v6, 0x40600000, vcc_lo
-; GFX10-GISEL-NEXT:    v_fma_f64 v[0:1], v[1:2], v[5:6], v[3:4]
+; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX10-GISEL-NEXT:    v_add_nc_u32_e32 v0, 6, v0
+; GFX10-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[1:2], v0
+; GFX10-GISEL-NEXT:    v_add_f64 v[0:1], v[0:1], v[3:4]
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: v_contract_mul_add_f64_select_128_64:
@@ -4613,10 +4484,11 @@ define double @v_contract_mul_add_f64_select_128_64(i32 %arg, double %x, double
 ; GFX11-GISEL-LABEL: v_contract_mul_add_f64_select_128_64:
 ; GFX11-GISEL:       ; %bb.0:
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_dual_mov_b32 v6, 0x40500000 :: v_dual_mov_b32 v5, 0
 ; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v6, v6, 0x40600000, vcc_lo
-; GFX11-GISEL-NEXT:    v_fma_f64 v[0:1], v[1:2], v[5:6], v[3:4]
+; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-GISEL-NEXT:    v_add_nc_u32_e32 v0, 6, v0
+; GFX11-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[1:2], v0
+; GFX11-GISEL-NEXT:    v_add_f64 v[0:1], v[0:1], v[3:4]
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %cond = icmp eq i32 %arg, 0
   %select.pow2 = select i1 %cond, double 128.0, double 64.0
@@ -4640,12 +4512,10 @@ define double @v_contract_mul_add_f64_select_128_4(i32 %arg, double %x, double %
 ; GFX9-GISEL-LABEL: v_contract_mul_add_f64_select_128_4:
 ; GFX9-GISEL:       ; %bb.0:
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v6, 0x40600000
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v7, 0x40100000
 ; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v5, 0
-; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v6, v7, v6, vcc
-; GFX9-GISEL-NEXT:    v_fma_f64 v[0:1], v[1:2], v[5:6], v[3:4]
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v0, 2, 7, vcc
+; GFX9-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[1:2], v0
+; GFX9-GISEL-NEXT:    v_add_f64 v[0:1], v[0:1], v[3:4]
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_contract_mul_add_f64_select_128_4:
@@ -4661,11 +4531,10 @@ define double @v_contract_mul_add_f64_select_128_4(i32 %arg, double %x, double %
 ; GFX10-GISEL-LABEL: v_contract_mul_add_f64_select_128_4:
 ; GFX10-GISEL:       ; %bb.0:
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v6, 0x40100000
 ; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v5, 0
-; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v6, v6, 0x40600000, vcc_lo
-; GFX10-GISEL-NEXT:    v_fma_f64 v[0:1], v[1:2], v[5:6], v[3:4]
+; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, 2, 7, vcc_lo
+; GFX10-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[1:2], v0
+; GFX10-GISEL-NEXT:    v_add_f64 v[0:1], v[0:1], v[3:4]
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: v_contract_mul_add_f64_select_128_4:
@@ -4680,10 +4549,10 @@ define double @v_contract_mul_add_f64_select_128_4(i32 %arg, double %x, double %
 ; GFX11-GISEL-LABEL: v_contract_mul_add_f64_select_128_4:
 ; GFX11-GISEL:       ; %bb.0:
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_dual_mov_b32 v6, 0x40100000 :: v_dual_mov_b32 v5, 0
 ; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v6, v6, 0x40600000, vcc_lo
-; GFX11-GISEL-NEXT:    v_fma_f64 v[0:1], v[1:2], v[5:6], v[3:4]
+; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, 2, 7, vcc_lo
+; GFX11-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[1:2], v0
+; GFX11-GISEL-NEXT:    v_add_f64 v[0:1], v[0:1], v[3:4]
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %cond = icmp eq i32 %arg, 0
   %select.pow2 = select i1 %cond, double 128.0, double 4.0
@@ -4706,21 +4575,50 @@ define double @v_contract_mul_add_f64_select_2_4(i32 %arg, double %x, double %y)
 ; GFX9-GISEL-LABEL: v_contract_mul_add_f64_select_2_4:
 ; GFX9-GISEL:       ; %bb.0:
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v6, 0x40100000
 ; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v5, 0
-; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v6, v6, 2.0, vcc
-; GFX9-GISEL-NEXT:    v_fma_f64 v[0:1], v[1:2], v[5:6], v[3:4]
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc
+; GFX9-GISEL-NEXT:    v_add_u32_e32 v0, 2, v0
+; GFX9-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[1:2], v0
+; GFX9-GISEL-NEXT:    v_add_f64 v[0:1], v[0:1], v[3:4]
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX1011-LABEL: v_contract_mul_add_f64_select_2_4:
-; GFX1011:       ; %bb.0:
-; GFX1011-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1011-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1011-NEXT:    v_mov_b32_e32 v5, 0
-; GFX1011-NEXT:    v_cndmask_b32_e64 v6, 0x40100000, 2.0, vcc_lo
-; GFX1011-NEXT:    v_fma_f64 v[0:1], v[1:2], v[5:6], v[3:4]
-; GFX1011-NEXT:    s_setpc_b64 s[30:31]
+; GFX10-SDAG-LABEL: v_contract_mul_add_f64_select_2_4:
+; GFX10-SDAG:       ; %bb.0:
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX10-SDAG-NEXT:    v_mov_b32_e32 v5, 0
+; GFX10-SDAG-NEXT:    v_cndmask_b32_e64 v6, 0x40100000, 2.0, vcc_lo
+; GFX10-SDAG-NEXT:    v_fma_f64 v[0:1], v[1:2], v[5:6], v[3:4]
+; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-GISEL-LABEL: v_contract_mul_add_f64_select_2_4:
+; GFX10-GISEL:       ; %bb.0:
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
+; GFX10-GISEL-NEXT:    v_add_nc_u32_e32 v0, 2, v0
+; GFX10-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[1:2], v0
+; GFX10-GISEL-NEXT:    v_add_f64 v[0:1], v[0:1], v[3:4]
+; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: v_contract_mul_add_f64_select_2_4:
+; GFX11-SDAG:       ; %bb.0:
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX11-SDAG-NEXT:    v_mov_b32_e32 v5, 0
+; GFX11-SDAG-NEXT:    v_cndmask_b32_e64 v6, 0x40100000, 2.0, vcc_lo
+; GFX11-SDAG-NEXT:    v_fma_f64 v[0:1], v[1:2], v[5:6], v[3:4]
+; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: v_contract_mul_add_f64_select_2_4:
+; GFX11-GISEL:       ; %bb.0:
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
+; GFX11-GISEL-NEXT:    v_add_nc_u32_e32 v0, 2, v0
+; GFX11-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[1:2], v0
+; GFX11-GISEL-NEXT:    v_add_f64 v[0:1], v[0:1], v[3:4]
+; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %cond = icmp eq i32 %arg, 0
   %select.pow2 = select i1 %cond, double 2.0, double 4.0
   %mul = fmul contract double %x, %select.pow2
@@ -4743,12 +4641,10 @@ define double @v_contract_mul_add_f64_select_4_128(i32 %arg, double %x, double %
 ; GFX9-GISEL-LABEL: v_contract_mul_add_f64_select_4_128:
 ; GFX9-GISEL:       ; %bb.0:
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v6, 0x40100000
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v7, 0x40600000
 ; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v5, 0
-; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v6, v7, v6, vcc
-; GFX9-GISEL-NEXT:    v_fma_f64 v[0:1], v[1:2], v[5:6], v[3:4]
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v0, 7, 2, vcc
+; GFX9-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[1:2], v0
+; GFX9-GISEL-NEXT:    v_add_f64 v[0:1], v[0:1], v[3:4]
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_contract_mul_add_f64_select_4_128:
@@ -4764,11 +4660,10 @@ define double @v_contract_mul_add_f64_select_4_128(i32 %arg, double %x, double %
 ; GFX10-GISEL-LABEL: v_contract_mul_add_f64_select_4_128:
 ; GFX10-GISEL:       ; %bb.0:
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v6, 0x40600000
 ; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v5, 0
-; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v6, v6, 0x40100000, vcc_lo
-; GFX10-GISEL-NEXT:    v_fma_f64 v[0:1], v[1:2], v[5:6], v[3:4]
+; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, 7, 2, vcc_lo
+; GFX10-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[1:2], v0
+; GFX10-GISEL-NEXT:    v_add_f64 v[0:1], v[0:1], v[3:4]
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: v_contract_mul_add_f64_select_4_128:
@@ -4783,10 +4678,10 @@ define double @v_contract_mul_add_f64_select_4_128(i32 %arg, double %x, double %
 ; GFX11-GISEL-LABEL: v_contract_mul_add_f64_select_4_128:
 ; GFX11-GISEL:       ; %bb.0:
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_dual_mov_b32 v6, 0x40600000 :: v_dual_mov_b32 v5, 0
 ; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v6, v6, 0x40100000, vcc_lo
-; GFX11-GISEL-NEXT:    v_fma_f64 v[0:1], v[1:2], v[5:6], v[3:4]
+; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, 7, 2, vcc_lo
+; GFX11-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[1:2], v0
+; GFX11-GISEL-NEXT:    v_add_f64 v[0:1], v[0:1], v[3:4]
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %cond = icmp eq i32 %arg, 0
   %select.pow2 = select i1 %cond, double 4.0, double 128.0
@@ -4796,57 +4691,21 @@ define double @v_contract_mul_add_f64_select_4_128(i32 %arg, double %x, double %
 }
 
 define half @v_mul_f16_select_64_1(i32 %arg, half %x) {
-; GFX9-SDAG-LABEL: v_mul_f16_select_64_1:
-; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-SDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 6, vcc
-; GFX9-SDAG-NEXT:    v_ldexp_f16_e32 v0, v1, v0
-; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-GISEL-LABEL: v_mul_f16_select_64_1:
-; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0x5400
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3c00
-; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
-; GFX9-GISEL-NEXT:    v_mul_f16_e32 v0, v1, v0
-; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-SDAG-LABEL: v_mul_f16_select_64_1:
-; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10-SDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 6, vcc_lo
-; GFX10-SDAG-NEXT:    v_ldexp_f16_e32 v0, v1, v0
-; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-GISEL-LABEL: v_mul_f16_select_64_1:
-; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v2, 0x3c00
-; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, v2, 0x5400, vcc_lo
-; GFX10-GISEL-NEXT:    v_mul_f16_e32 v0, v1, v0
-; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-SDAG-LABEL: v_mul_f16_select_64_1:
-; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-SDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 6, vcc_lo
-; GFX11-SDAG-NEXT:    v_ldexp_f16_e32 v0, v1, v0
-; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-LABEL: v_mul_f16_select_64_1:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 6, vcc
+; GFX9-NEXT:    v_ldexp_f16_e32 v0, v1, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-GISEL-LABEL: v_mul_f16_select_64_1:
-; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_mov_b32_e32 v2, 0x3c00
-; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, v2, 0x5400, vcc_lo
-; GFX11-GISEL-NEXT:    v_mul_f16_e32 v0, v1, v0
-; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX1011-LABEL: v_mul_f16_select_64_1:
+; GFX1011:       ; %bb.0:
+; GFX1011-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1011-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1011-NEXT:    v_cndmask_b32_e64 v0, 0, 6, vcc_lo
+; GFX1011-NEXT:    v_ldexp_f16_e32 v0, v1, v0
+; GFX1011-NEXT:    s_setpc_b64 s[30:31]
   %cond = icmp eq i32 %arg, 0
   %select.pow2 = select i1 %cond, half 64.0, half 1.0
   %mul = fmul half %x, %select.pow2
@@ -4854,57 +4713,21 @@ define half @v_mul_f16_select_64_1(i32 %arg, half %x) {
 }
 
 define half @v_mul_f16_select_1_64(i32 %arg, half %x) {
-; GFX9-SDAG-LABEL: v_mul_f16_select_1_64:
-; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-SDAG-NEXT:    v_cndmask_b32_e64 v0, 6, 0, vcc
-; GFX9-SDAG-NEXT:    v_ldexp_f16_e32 v0, v1, v0
-; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-GISEL-LABEL: v_mul_f16_select_1_64:
-; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0x3c00
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0x5400
-; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
-; GFX9-GISEL-NEXT:    v_mul_f16_e32 v0, v1, v0
-; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-SDAG-LABEL: v_mul_f16_select_1_64:
-; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10-SDAG-NEXT:    v_cndmask_b32_e64 v0, 6, 0, vcc_lo
-; GFX10-SDAG-NEXT:    v_ldexp_f16_e32 v0, v1, v0
-; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-GISEL-LABEL: v_mul_f16_select_1_64:
-; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v2, 0x5400
-; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, v2, 0x3c00, vcc_lo
-; GFX10-GISEL-NEXT:    v_mul_f16_e32 v0, v1, v0
-; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-SDAG-LABEL: v_mul_f16_select_1_64:
-; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-SDAG-NEXT:    v_cndmask_b32_e64 v0, 6, 0, vcc_lo
-; GFX11-SDAG-NEXT:    v_ldexp_f16_e32 v0, v1, v0
-; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-LABEL: v_mul_f16_select_1_64:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, 6, 0, vcc
+; GFX9-NEXT:    v_ldexp_f16_e32 v0, v1, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-GISEL-LABEL: v_mul_f16_select_1_64:
-; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_mov_b32_e32 v2, 0x5400
-; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, v2, 0x3c00, vcc_lo
-; GFX11-GISEL-NEXT:    v_mul_f16_e32 v0, v1, v0
-; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX1011-LABEL: v_mul_f16_select_1_64:
+; GFX1011:       ; %bb.0:
+; GFX1011-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1011-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1011-NEXT:    v_cndmask_b32_e64 v0, 6, 0, vcc_lo
+; GFX1011-NEXT:    v_ldexp_f16_e32 v0, v1, v0
+; GFX1011-NEXT:    s_setpc_b64 s[30:31]
   %cond = icmp eq i32 %arg, 0
   %select.pow2 = select i1 %cond, half 1.0, half 64.0
   %mul = fmul half %x, %select.pow2
@@ -4912,57 +4735,21 @@ define half @v_mul_f16_select_1_64(i32 %arg, half %x) {
 }
 
 define half @v_mul_f16_select_n1_n64(i32 %arg, half %x) {
-; GFX9-SDAG-LABEL: v_mul_f16_select_n1_n64:
-; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-SDAG-NEXT:    v_cndmask_b32_e64 v0, 6, 0, vcc
-; GFX9-SDAG-NEXT:    v_ldexp_f16_e64 v0, -v1, v0
-; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-GISEL-LABEL: v_mul_f16_select_n1_n64:
-; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0xbc00
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0xd400
-; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
-; GFX9-GISEL-NEXT:    v_mul_f16_e32 v0, v1, v0
-; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-SDAG-LABEL: v_mul_f16_select_n1_n64:
-; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10-SDAG-NEXT:    v_cndmask_b32_e64 v0, 6, 0, vcc_lo
-; GFX10-SDAG-NEXT:    v_ldexp_f16_e64 v0, -v1, v0
-; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-GISEL-LABEL: v_mul_f16_select_n1_n64:
-; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v2, 0xd400
-; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, v2, 0xbc00, vcc_lo
-; GFX10-GISEL-NEXT:    v_mul_f16_e32 v0, v1, v0
-; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-SDAG-LABEL: v_mul_f16_select_n1_n64:
-; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-SDAG-NEXT:    v_cndmask_b32_e64 v0, 6, 0, vcc_lo
-; GFX11-SDAG-NEXT:    v_ldexp_f16_e64 v0, -v1, v0
-; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-LABEL: v_mul_f16_select_n1_n64:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, 6, 0, vcc
+; GFX9-NEXT:    v_ldexp_f16_e64 v0, -v1, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-GISEL-LABEL: v_mul_f16_select_n1_n64:
-; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_mov_b32_e32 v2, 0xd400
-; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, v2, 0xbc00, vcc_lo
-; GFX11-GISEL-NEXT:    v_mul_f16_e32 v0, v1, v0
-; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX1011-LABEL: v_mul_f16_select_n1_n64:
+; GFX1011:       ; %bb.0:
+; GFX1011-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1011-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1011-NEXT:    v_cndmask_b32_e64 v0, 6, 0, vcc_lo
+; GFX1011-NEXT:    v_ldexp_f16_e64 v0, -v1, v0
+; GFX1011-NEXT:    s_setpc_b64 s[30:31]
   %cond = icmp eq i32 %arg, 0
   %select.pow2 = select i1 %cond, half -1.0, half -64.0
   %mul = fmul half %x, %select.pow2
@@ -4981,11 +4768,13 @@ define half @v_mul_f16_select_128_64(i32 %arg, half %x) {
 ; GFX9-GISEL-LABEL: v_mul_f16_select_128_64:
 ; GFX9-GISEL:       ; %bb.0:
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0x5800
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0x5400
 ; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
-; GFX9-GISEL-NEXT:    v_mul_f16_e32 v0, v1, v0
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9-GISEL-NEXT:    v_add_u32_e32 v0, 6, v0
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0xffff8000
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0x7fff
+; GFX9-GISEL-NEXT:    v_med3_i32 v0, v0, v2, v3
+; GFX9-GISEL-NEXT:    v_ldexp_f16_e32 v0, v1, v0
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_mul_f16_select_128_64:
@@ -4999,10 +4788,12 @@ define half @v_mul_f16_select_128_64(i32 %arg, half %x) {
 ; GFX10-GISEL-LABEL: v_mul_f16_select_128_64:
 ; GFX10-GISEL:       ; %bb.0:
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v2, 0x5400
 ; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, v2, 0x5800, vcc_lo
-; GFX10-GISEL-NEXT:    v_mul_f16_e32 v0, v1, v0
+; GFX10-GISEL-NEXT:    v_mov_b32_e32 v2, 0x7fff
+; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX10-GISEL-NEXT:    v_add_nc_u32_e32 v0, 6, v0
+; GFX10-GISEL-NEXT:    v_med3_i32 v0, 0xffff8000, v0, v2
+; GFX10-GISEL-NEXT:    v_ldexp_f16_e32 v0, v1, v0
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: v_mul_f16_select_128_64:
@@ -5016,10 +4807,12 @@ define half @v_mul_f16_select_128_64(i32 %arg, half %x) {
 ; GFX11-GISEL-LABEL: v_mul_f16_select_128_64:
 ; GFX11-GISEL:       ; %bb.0:
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_mov_b32_e32 v2, 0x5400
 ; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, v2, 0x5800, vcc_lo
-; GFX11-GISEL-NEXT:    v_mul_f16_e32 v0, v1, v0
+; GFX11-GISEL-NEXT:    v_mov_b32_e32 v2, 0x7fff
+; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-GISEL-NEXT:    v_add_nc_u32_e32 v0, 6, v0
+; GFX11-GISEL-NEXT:    v_med3_i32 v0, 0xffff8000, v0, v2
+; GFX11-GISEL-NEXT:    v_ldexp_f16_e32 v0, v1, v0
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %cond = icmp eq i32 %arg, 0
   %select.pow2 = select i1 %cond, half 128.0, half 64.0
@@ -5039,11 +4832,13 @@ define half @v_mul_f16_select_n128_n64(i32 %arg, half %x) {
 ; GFX9-GISEL-LABEL: v_mul_f16_select_n128_n64:
 ; GFX9-GISEL:       ; %bb.0:
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0xd800
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0xd400
 ; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
-; GFX9-GISEL-NEXT:    v_mul_f16_e32 v0, v1, v0
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9-GISEL-NEXT:    v_add_u32_e32 v0, 6, v0
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0xffff8000
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0x7fff
+; GFX9-GISEL-NEXT:    v_med3_i32 v0, v0, v2, v3
+; GFX9-GISEL-NEXT:    v_ldexp_f16_e64 v0, -v1, v0
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_mul_f16_select_n128_n64:
@@ -5057,10 +4852,12 @@ define half @v_mul_f16_select_n128_n64(i32 %arg, half %x) {
 ; GFX10-GISEL-LABEL: v_mul_f16_select_n128_n64:
 ; GFX10-GISEL:       ; %bb.0:
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v2, 0xd400
 ; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, v2, 0xd800, vcc_lo
-; GFX10-GISEL-NEXT:    v_mul_f16_e32 v0, v1, v0
+; GFX10-GISEL-NEXT:    v_mov_b32_e32 v2, 0x7fff
+; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX10-GISEL-NEXT:    v_add_nc_u32_e32 v0, 6, v0
+; GFX10-GISEL-NEXT:    v_med3_i32 v0, 0xffff8000, v0, v2
+; GFX10-GISEL-NEXT:    v_ldexp_f16_e64 v0, -v1, v0
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: v_mul_f16_select_n128_n64:
@@ -5074,10 +4871,12 @@ define half @v_mul_f16_select_n128_n64(i32 %arg, half %x) {
 ; GFX11-GISEL-LABEL: v_mul_f16_select_n128_n64:
 ; GFX11-GISEL:       ; %bb.0:
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_mov_b32_e32 v2, 0xd400
 ; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, v2, 0xd800, vcc_lo
-; GFX11-GISEL-NEXT:    v_mul_f16_e32 v0, v1, v0
+; GFX11-GISEL-NEXT:    v_mov_b32_e32 v2, 0x7fff
+; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-GISEL-NEXT:    v_add_nc_u32_e32 v0, 6, v0
+; GFX11-GISEL-NEXT:    v_med3_i32 v0, 0xffff8000, v0, v2
+; GFX11-GISEL-NEXT:    v_ldexp_f16_e64 v0, -v1, v0
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %cond = icmp eq i32 %arg, 0
   %select.pow2 = select i1 %cond, half -128.0, half -64.0
@@ -5086,57 +4885,21 @@ define half @v_mul_f16_select_n128_n64(i32 %arg, half %x) {
 }
 
 define half @v_mul_f16_select_n128_n16(i32 %arg, half %x) {
-; GFX9-SDAG-LABEL: v_mul_f16_select_n128_n16:
-; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-SDAG-NEXT:    v_cndmask_b32_e64 v0, 4, 7, vcc
-; GFX9-SDAG-NEXT:    v_ldexp_f16_e64 v0, -v1, v0
-; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-GISEL-LABEL: v_mul_f16_select_n128_n16:
-; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0xd800
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0xcc00
-; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
-; GFX9-GISEL-NEXT:    v_mul_f16_e32 v0, v1, v0
-; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-SDAG-LABEL: v_mul_f16_select_n128_n16:
-; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10-SDAG-NEXT:    v_cndmask_b32_e64 v0, 4, 7, vcc_lo
-; GFX10-SDAG-NEXT:    v_ldexp_f16_e64 v0, -v1, v0
-; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-GISEL-LABEL: v_mul_f16_select_n128_n16:
-; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v2, 0xcc00
-; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, v2, 0xd800, vcc_lo
-; GFX10-GISEL-NEXT:    v_mul_f16_e32 v0, v1, v0
-; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-SDAG-LABEL: v_mul_f16_select_n128_n16:
-; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-SDAG-NEXT:    v_cndmask_b32_e64 v0, 4, 7, vcc_lo
-; GFX11-SDAG-NEXT:    v_ldexp_f16_e64 v0, -v1, v0
-; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-LABEL: v_mul_f16_select_n128_n16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, 4, 7, vcc
+; GFX9-NEXT:    v_ldexp_f16_e64 v0, -v1, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-GISEL-LABEL: v_mul_f16_select_n128_n16:
-; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_mov_b32_e32 v2, 0xcc00
-; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, v2, 0xd800, vcc_lo
-; GFX11-GISEL-NEXT:    v_mul_f16_e32 v0, v1, v0
-; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX1011-LABEL: v_mul_f16_select_n128_n16:
+; GFX1011:       ; %bb.0:
+; GFX1011-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1011-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1011-NEXT:    v_cndmask_b32_e64 v0, 4, 7, vcc_lo
+; GFX1011-NEXT:    v_ldexp_f16_e64 v0, -v1, v0
+; GFX1011-NEXT:    s_setpc_b64 s[30:31]
   %cond = icmp eq i32 %arg, 0
   %select.pow2 = select i1 %cond, half -128.0, half -16.0
   %mul = fmul half %x, %select.pow2
@@ -5157,11 +4920,10 @@ define half @v_contract_mul_add_f16_select_64_1(i32 %arg, half %x, half %y) {
 ; GFX9-GISEL-LABEL: v_contract_mul_add_f16_select_64_1:
 ; GFX9-GISEL:       ; %bb.0:
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0x5400
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v4, 0x3c00
 ; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
-; GFX9-GISEL-NEXT:    v_fma_f16 v0, v1, v0, v2
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 6, vcc
+; GFX9-GISEL-NEXT:    v_ldexp_f16_e32 v0, v1, v0
+; GFX9-GISEL-NEXT:    v_add_f16_e32 v0, v0, v2
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_contract_mul_add_f16_select_64_1:
@@ -5176,10 +4938,10 @@ define half @v_contract_mul_add_f16_select_64_1(i32 %arg, half %x, half %y) {
 ; GFX10-GISEL-LABEL: v_contract_mul_add_f16_select_64_1:
 ; GFX10-GISEL:       ; %bb.0:
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3c00
 ; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, v3, 0x5400, vcc_lo
-; GFX10-GISEL-NEXT:    v_fma_f16 v0, v1, v0, v2
+; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 6, vcc_lo
+; GFX10-GISEL-NEXT:    v_ldexp_f16_e32 v0, v1, v0
+; GFX10-GISEL-NEXT:    v_add_f16_e32 v0, v0, v2
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: v_contract_mul_add_f16_select_64_1:
@@ -5194,10 +4956,10 @@ define half @v_contract_mul_add_f16_select_64_1(i32 %arg, half %x, half %y) {
 ; GFX11-GISEL-LABEL: v_contract_mul_add_f16_select_64_1:
 ; GFX11-GISEL:       ; %bb.0:
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3c00
 ; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, v3, 0x5400, vcc_lo
-; GFX11-GISEL-NEXT:    v_fma_f16 v0, v1, v0, v2
+; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 6, vcc_lo
+; GFX11-GISEL-NEXT:    v_ldexp_f16_e32 v0, v1, v0
+; GFX11-GISEL-NEXT:    v_add_f16_e32 v0, v0, v2
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %cond = icmp eq i32 %arg, 0
   %select.pow2 = select contract i1 %cond, half 64.0, half 1.0
@@ -5220,11 +4982,10 @@ define half @v_contract_mul_add_f16_select_1_64(i32 %arg, half %x, half %y) {
 ; GFX9-GISEL-LABEL: v_contract_mul_add_f16_select_1_64:
 ; GFX9-GISEL:       ; %bb.0:
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3c00
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v4, 0x5400
 ; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
-; GFX9-GISEL-NEXT:    v_fma_f16 v0, v1, v0, v2
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v0, 6, 0, vcc
+; GFX9-GISEL-NEXT:    v_ldexp_f16_e32 v0, v1, v0
+; GFX9-GISEL-NEXT:    v_add_f16_e32 v0, v0, v2
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_contract_mul_add_f16_select_1_64:
@@ -5239,10 +5000,10 @@ define half @v_contract_mul_add_f16_select_1_64(i32 %arg, half %x, half %y) {
 ; GFX10-GISEL-LABEL: v_contract_mul_add_f16_select_1_64:
 ; GFX10-GISEL:       ; %bb.0:
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v3, 0x5400
 ; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, v3, 0x3c00, vcc_lo
-; GFX10-GISEL-NEXT:    v_fma_f16 v0, v1, v0, v2
+; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, 6, 0, vcc_lo
+; GFX10-GISEL-NEXT:    v_ldexp_f16_e32 v0, v1, v0
+; GFX10-GISEL-NEXT:    v_add_f16_e32 v0, v0, v2
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: v_contract_mul_add_f16_select_1_64:
@@ -5257,10 +5018,10 @@ define half @v_contract_mul_add_f16_select_1_64(i32 %arg, half %x, half %y) {
 ; GFX11-GISEL-LABEL: v_contract_mul_add_f16_select_1_64:
 ; GFX11-GISEL:       ; %bb.0:
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_mov_b32_e32 v3, 0x5400
 ; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, v3, 0x3c00, vcc_lo
-; GFX11-GISEL-NEXT:    v_fma_f16 v0, v1, v0, v2
+; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, 6, 0, vcc_lo
+; GFX11-GISEL-NEXT:    v_ldexp_f16_e32 v0, v1, v0
+; GFX11-GISEL-NEXT:    v_add_f16_e32 v0, v0, v2
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %cond = icmp eq i32 %arg, 0
   %select.pow2 = select contract i1 %cond, half 1.0, half 64.0
@@ -5283,11 +5044,10 @@ define half @v_contract_mul_add_f16_select_n64_n1(i32 %arg, half %x, half %y) {
 ; GFX9-GISEL-LABEL: v_contract_mul_add_f16_select_n64_n1:
 ; GFX9-GISEL:       ; %bb.0:
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0xd400
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v4, 0xbc00
 ; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
-; GFX9-GISEL-NEXT:    v_fma_f16 v0, v1, v0, v2
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 6, vcc
+; GFX9-GISEL-NEXT:    v_ldexp_f16_e64 v0, -v1, v0
+; GFX9-GISEL-NEXT:    v_add_f16_e32 v0, v0, v2
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_contract_mul_add_f16_select_n64_n1:
@@ -5302,10 +5062,10 @@ define half @v_contract_mul_add_f16_select_n64_n1(i32 %arg, half %x, half %y) {
 ; GFX10-GISEL-LABEL: v_contract_mul_add_f16_select_n64_n1:
 ; GFX10-GISEL:       ; %bb.0:
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v3, 0xbc00
 ; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, v3, 0xd400, vcc_lo
-; GFX10-GISEL-NEXT:    v_fma_f16 v0, v1, v0, v2
+; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 6, vcc_lo
+; GFX10-GISEL-NEXT:    v_ldexp_f16_e64 v0, -v1, v0
+; GFX10-GISEL-NEXT:    v_add_f16_e32 v0, v0, v2
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: v_contract_mul_add_f16_select_n64_n1:
@@ -5320,10 +5080,10 @@ define half @v_contract_mul_add_f16_select_n64_n1(i32 %arg, half %x, half %y) {
 ; GFX11-GISEL-LABEL: v_contract_mul_add_f16_select_n64_n1:
 ; GFX11-GISEL:       ; %bb.0:
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_mov_b32_e32 v3, 0xbc00
 ; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, v3, 0xd400, vcc_lo
-; GFX11-GISEL-NEXT:    v_fma_f16 v0, v1, v0, v2
+; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 6, vcc_lo
+; GFX11-GISEL-NEXT:    v_ldexp_f16_e64 v0, -v1, v0
+; GFX11-GISEL-NEXT:    v_add_f16_e32 v0, v0, v2
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %cond = icmp eq i32 %arg, 0
   %select.pow2 = select contract i1 %cond, half -64.0, half -1.0
@@ -5346,11 +5106,10 @@ define half @v_contract_mul_add_f16_select_n1_n64(i32 %arg, half %x, half %y) {
 ; GFX9-GISEL-LABEL: v_contract_mul_add_f16_select_n1_n64:
 ; GFX9-GISEL:       ; %bb.0:
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0xbc00
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v4, 0xd400
 ; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
-; GFX9-GISEL-NEXT:    v_fma_f16 v0, v1, v0, v2
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v0, 6, 0, vcc
+; GFX9-GISEL-NEXT:    v_ldexp_f16_e64 v0, -v1, v0
+; GFX9-GISEL-NEXT:    v_add_f16_e32 v0, v0, v2
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_contract_mul_add_f16_select_n1_n64:
@@ -5365,10 +5124,10 @@ define half @v_contract_mul_add_f16_select_n1_n64(i32 %arg, half %x, half %y) {
 ; GFX10-GISEL-LABEL: v_contract_mul_add_f16_select_n1_n64:
 ; GFX10-GISEL:       ; %bb.0:
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v3, 0xd400
 ; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, v3, 0xbc00, vcc_lo
-; GFX10-GISEL-NEXT:    v_fma_f16 v0, v1, v0, v2
+; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, 6, 0, vcc_lo
+; GFX10-GISEL-NEXT:    v_ldexp_f16_e64 v0, -v1, v0
+; GFX10-GISEL-NEXT:    v_add_f16_e32 v0, v0, v2
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: v_contract_mul_add_f16_select_n1_n64:
@@ -5383,10 +5142,10 @@ define half @v_contract_mul_add_f16_select_n1_n64(i32 %arg, half %x, half %y) {
 ; GFX11-GISEL-LABEL: v_contract_mul_add_f16_select_n1_n64:
 ; GFX11-GISEL:       ; %bb.0:
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_mov_b32_e32 v3, 0xd400
 ; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, v3, 0xbc00, vcc_lo
-; GFX11-GISEL-NEXT:    v_fma_f16 v0, v1, v0, v2
+; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, 6, 0, vcc_lo
+; GFX11-GISEL-NEXT:    v_ldexp_f16_e64 v0, -v1, v0
+; GFX11-GISEL-NEXT:    v_add_f16_e32 v0, v0, v2
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %cond = icmp eq i32 %arg, 0
   %select.pow2 = select contract i1 %cond, half -1.0, half -64.0
@@ -5409,11 +5168,14 @@ define half @v_contract_mul_add_f16_select_128_64(i32 %arg, half %x, half %y) {
 ; GFX9-GISEL-LABEL: v_contract_mul_add_f16_select_128_64:
 ; GFX9-GISEL:       ; %bb.0:
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0x5800
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v4, 0x5400
 ; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
-; GFX9-GISEL-NEXT:    v_fma_f16 v0, v1, v0, v2
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9-GISEL-NEXT:    v_add_u32_e32 v0, 6, v0
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0xffff8000
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v4, 0x7fff
+; GFX9-GISEL-NEXT:    v_med3_i32 v0, v0, v3, v4
+; GFX9-GISEL-NEXT:    v_ldexp_f16_e32 v0, v1, v0
+; GFX9-GISEL-NEXT:    v_add_f16_e32 v0, v0, v2
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_contract_mul_add_f16_select_128_64:
@@ -5428,10 +5190,13 @@ define half @v_contract_mul_add_f16_select_128_64(i32 %arg, half %x, half %y) {
 ; GFX10-GISEL-LABEL: v_contract_mul_add_f16_select_128_64:
 ; GFX10-GISEL:       ; %bb.0:
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v3, 0x5400
 ; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, v3, 0x5800, vcc_lo
-; GFX10-GISEL-NEXT:    v_fma_f16 v0, v1, v0, v2
+; GFX10-GISEL-NEXT:    v_mov_b32_e32 v3, 0x7fff
+; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX10-GISEL-NEXT:    v_add_nc_u32_e32 v0, 6, v0
+; GFX10-GISEL-NEXT:    v_med3_i32 v0, 0xffff8000, v0, v3
+; GFX10-GISEL-NEXT:    v_ldexp_f16_e32 v0, v1, v0
+; GFX10-GISEL-NEXT:    v_add_f16_e32 v0, v0, v2
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: v_contract_mul_add_f16_select_128_64:
@@ -5446,10 +5211,13 @@ define half @v_contract_mul_add_f16_select_128_64(i32 %arg, half %x, half %y) {
 ; GFX11-GISEL-LABEL: v_contract_mul_add_f16_select_128_64:
 ; GFX11-GISEL:       ; %bb.0:
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_mov_b32_e32 v3, 0x5400
 ; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, v3, 0x5800, vcc_lo
-; GFX11-GISEL-NEXT:    v_fma_f16 v0, v1, v0, v2
+; GFX11-GISEL-NEXT:    v_mov_b32_e32 v3, 0x7fff
+; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-GISEL-NEXT:    v_add_nc_u32_e32 v0, 6, v0
+; GFX11-GISEL-NEXT:    v_med3_i32 v0, 0xffff8000, v0, v3
+; GFX11-GISEL-NEXT:    v_ldexp_f16_e32 v0, v1, v0
+; GFX11-GISEL-NEXT:    v_add_f16_e32 v0, v0, v2
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %cond = icmp eq i32 %arg, 0
   %select.pow2 = select i1 %cond, half 128.0, half 64.0
@@ -5472,11 +5240,10 @@ define half @v_contract_mul_add_f16_select_128_4(i32 %arg, half %x, half %y) {
 ; GFX9-GISEL-LABEL: v_contract_mul_add_f16_select_128_4:
 ; GFX9-GISEL:       ; %bb.0:
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0x5800
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v4, 0x4400
 ; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
-; GFX9-GISEL-NEXT:    v_fma_f16 v0, v1, v0, v2
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v0, 2, 7, vcc
+; GFX9-GISEL-NEXT:    v_ldexp_f16_e32 v0, v1, v0
+; GFX9-GISEL-NEXT:    v_add_f16_e32 v0, v0, v2
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_contract_mul_add_f16_select_128_4:
@@ -5491,10 +5258,10 @@ define half @v_contract_mul_add_f16_select_128_4(i32 %arg, half %x, half %y) {
 ; GFX10-GISEL-LABEL: v_contract_mul_add_f16_select_128_4:
 ; GFX10-GISEL:       ; %bb.0:
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v3, 0x4400
 ; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, v3, 0x5800, vcc_lo
-; GFX10-GISEL-NEXT:    v_fma_f16 v0, v1, v0, v2
+; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, 2, 7, vcc_lo
+; GFX10-GISEL-NEXT:    v_ldexp_f16_e32 v0, v1, v0
+; GFX10-GISEL-NEXT:    v_add_f16_e32 v0, v0, v2
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: v_contract_mul_add_f16_select_128_4:
@@ -5509,10 +5276,10 @@ define half @v_contract_mul_add_f16_select_128_4(i32 %arg, half %x, half %y) {
 ; GFX11-GISEL-LABEL: v_contract_mul_add_f16_select_128_4:
 ; GFX11-GISEL:       ; %bb.0:
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_mov_b32_e32 v3, 0x4400
 ; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, v3, 0x5800, vcc_lo
-; GFX11-GISEL-NEXT:    v_fma_f16 v0, v1, v0, v2
+; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, 2, 7, vcc_lo
+; GFX11-GISEL-NEXT:    v_ldexp_f16_e32 v0, v1, v0
+; GFX11-GISEL-NEXT:    v_add_f16_e32 v0, v0, v2
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %cond = icmp eq i32 %arg, 0
   %select.pow2 = select i1 %cond, half 128.0, half 4.0
@@ -5535,11 +5302,14 @@ define half @v_contract_mul_add_f16_select_2_4(i32 %arg, half %x, half %y) {
 ; GFX9-GISEL-LABEL: v_contract_mul_add_f16_select_2_4:
 ; GFX9-GISEL:       ; %bb.0:
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0x4000
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v4, 0x4400
 ; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
-; GFX9-GISEL-NEXT:    v_fma_f16 v0, v1, v0, v2
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc
+; GFX9-GISEL-NEXT:    v_add_u32_e32 v0, 2, v0
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0xffff8000
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v4, 0x7fff
+; GFX9-GISEL-NEXT:    v_med3_i32 v0, v0, v3, v4
+; GFX9-GISEL-NEXT:    v_ldexp_f16_e32 v0, v1, v0
+; GFX9-GISEL-NEXT:    v_add_f16_e32 v0, v0, v2
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_contract_mul_add_f16_select_2_4:
@@ -5554,10 +5324,13 @@ define half @v_contract_mul_add_f16_select_2_4(i32 %arg, half %x, half %y) {
 ; GFX10-GISEL-LABEL: v_contract_mul_add_f16_select_2_4:
 ; GFX10-GISEL:       ; %bb.0:
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v3, 0x4400
 ; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, v3, 0x4000, vcc_lo
-; GFX10-GISEL-NEXT:    v_fma_f16 v0, v1, v0, v2
+; GFX10-GISEL-NEXT:    v_mov_b32_e32 v3, 0x7fff
+; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
+; GFX10-GISEL-NEXT:    v_add_nc_u32_e32 v0, 2, v0
+; GFX10-GISEL-NEXT:    v_med3_i32 v0, 0xffff8000, v0, v3
+; GFX10-GISEL-NEXT:    v_ldexp_f16_e32 v0, v1, v0
+; GFX10-GISEL-NEXT:    v_add_f16_e32 v0, v0, v2
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: v_contract_mul_add_f16_select_2_4:
@@ -5572,10 +5345,13 @@ define half @v_contract_mul_add_f16_select_2_4(i32 %arg, half %x, half %y) {
 ; GFX11-GISEL-LABEL: v_contract_mul_add_f16_select_2_4:
 ; GFX11-GISEL:       ; %bb.0:
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_mov_b32_e32 v3, 0x4400
 ; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, v3, 0x4000, vcc_lo
-; GFX11-GISEL-NEXT:    v_fma_f16 v0, v1, v0, v2
+; GFX11-GISEL-NEXT:    v_mov_b32_e32 v3, 0x7fff
+; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
+; GFX11-GISEL-NEXT:    v_add_nc_u32_e32 v0, 2, v0
+; GFX11-GISEL-NEXT:    v_med3_i32 v0, 0xffff8000, v0, v3
+; GFX11-GISEL-NEXT:    v_ldexp_f16_e32 v0, v1, v0
+; GFX11-GISEL-NEXT:    v_add_f16_e32 v0, v0, v2
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %cond = icmp eq i32 %arg, 0
   %select.pow2 = select i1 %cond, half 2.0, half 4.0
@@ -5598,11 +5374,10 @@ define half @v_contract_mul_add_f16_select_4_128(i32 %arg, half %x, half %y) {
 ; GFX9-GISEL-LABEL: v_contract_mul_add_f16_select_4_128:
 ; GFX9-GISEL:       ; %bb.0:
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0x4400
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v4, 0x5800
 ; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
-; GFX9-GISEL-NEXT:    v_fma_f16 v0, v1, v0, v2
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v0, 7, 2, vcc
+; GFX9-GISEL-NEXT:    v_ldexp_f16_e32 v0, v1, v0
+; GFX9-GISEL-NEXT:    v_add_f16_e32 v0, v0, v2
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_contract_mul_add_f16_select_4_128:
@@ -5617,10 +5392,10 @@ define half @v_contract_mul_add_f16_select_4_128(i32 %arg, half %x, half %y) {
 ; GFX10-GISEL-LABEL: v_contract_mul_add_f16_select_4_128:
 ; GFX10-GISEL:       ; %bb.0:
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v3, 0x5800
 ; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, v3, 0x4400, vcc_lo
-; GFX10-GISEL-NEXT:    v_fma_f16 v0, v1, v0, v2
+; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, 7, 2, vcc_lo
+; GFX10-GISEL-NEXT:    v_ldexp_f16_e32 v0, v1, v0
+; GFX10-GISEL-NEXT:    v_add_f16_e32 v0, v0, v2
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: v_contract_mul_add_f16_select_4_128:
@@ -5635,10 +5410,10 @@ define half @v_contract_mul_add_f16_select_4_128(i32 %arg, half %x, half %y) {
 ; GFX11-GISEL-LABEL: v_contract_mul_add_f16_select_4_128:
 ; GFX11-GISEL:       ; %bb.0:
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_mov_b32_e32 v3, 0x5800
 ; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, v3, 0x4400, vcc_lo
-; GFX11-GISEL-NEXT:    v_fma_f16 v0, v1, v0, v2
+; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, 7, 2, vcc_lo
+; GFX11-GISEL-NEXT:    v_ldexp_f16_e32 v0, v1, v0
+; GFX11-GISEL-NEXT:    v_add_f16_e32 v0, v0, v2
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %cond = icmp eq i32 %arg, 0
   %select.pow2 = select i1 %cond, half 4.0, half 128.0
@@ -5664,15 +5439,13 @@ define <2 x half> @v_mul_v2f16_select_64_1(<2 x i32> %arg, <2 x half> %x) {
 ; GFX9-GISEL-LABEL: v_mul_v2f16_select_64_1:
 ; GFX9-GISEL:       ; %bb.0:
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0x5400
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v4, 0x3c00
 ; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 6, vcc
 ; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
-; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v1, v4, v3, vcc
-; GFX9-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 6, vcc
+; GFX9-GISEL-NEXT:    v_ldexp_f16_e32 v0, v2, v0
+; GFX9-GISEL-NEXT:    v_ldexp_f16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX9-GISEL-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
-; GFX9-GISEL-NEXT:    v_pk_mul_f16 v0, v2, v0
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_mul_v2f16_select_64_1:
@@ -5690,14 +5463,14 @@ define <2 x half> @v_mul_v2f16_select_64_1(<2 x i32> %arg, <2 x half> %x) {
 ; GFX10-GISEL-LABEL: v_mul_v2f16_select_64_1:
 ; GFX10-GISEL:       ; %bb.0:
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3c00
 ; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, v3, 0x5400, vcc_lo
+; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 6, vcc_lo
 ; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
+; GFX10-GISEL-NEXT:    v_ldexp_f16_e32 v0, v2, v0
+; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 6, vcc_lo
 ; GFX10-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v1, v3, 0x5400, vcc_lo
+; GFX10-GISEL-NEXT:    v_ldexp_f16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX10-GISEL-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
-; GFX10-GISEL-NEXT:    v_pk_mul_f16 v0, v2, v0
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: v_mul_v2f16_select_64_1:
@@ -5715,14 +5488,15 @@ define <2 x half> @v_mul_v2f16_select_64_1(<2 x i32> %arg, <2 x half> %x) {
 ; GFX11-GISEL-LABEL: v_mul_v2f16_select_64_1:
 ; GFX11-GISEL:       ; %bb.0:
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3c00
 ; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, v3, 0x5400, vcc_lo
+; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
+; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 6, vcc_lo
 ; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
+; GFX11-GISEL-NEXT:    v_ldexp_f16_e32 v0, v2, v0
+; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 6, vcc_lo
 ; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v1, v3, 0x5400, vcc_lo
+; GFX11-GISEL-NEXT:    v_ldexp_f16_e32 v1, v3, v1
 ; GFX11-GISEL-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
-; GFX11-GISEL-NEXT:    v_pk_mul_f16 v0, v2, v0
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %cond = icmp eq <2 x i32> %arg, zeroinitializer
   %select.pow2 = select <2 x i1> %cond, <2 x half> <half 64.0, half 64.0>, <2 x half> <half 1.0, half 1.0>
@@ -5747,15 +5521,13 @@ define <2 x half> @v_mul_v2f16_select_1_64(<2 x i32> %arg, <2 x half> %x) {
 ; GFX9-GISEL-LABEL: v_mul_v2f16_select_1_64:
 ; GFX9-GISEL:       ; %bb.0:
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3c00
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v4, 0x5400
 ; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v0, 6, 0, vcc
 ; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
-; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v1, v4, v3, vcc
-; GFX9-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v1, 6, 0, vcc
+; GFX9-GISEL-NEXT:    v_ldexp_f16_e32 v0, v2, v0
+; GFX9-GISEL-NEXT:    v_ldexp_f16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX9-GISEL-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
-; GFX9-GISEL-NEXT:    v_pk_mul_f16 v0, v2, v0
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_mul_v2f16_select_1_64:
@@ -5773,14 +5545,14 @@ define <2 x half> @v_mul_v2f16_select_1_64(<2 x i32> %arg, <2 x half> %x) {
 ; GFX10-GISEL-LABEL: v_mul_v2f16_select_1_64:
 ; GFX10-GISEL:       ; %bb.0:
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v3, 0x5400
 ; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, v3, 0x3c00, vcc_lo
+; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, 6, 0, vcc_lo
 ; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
+; GFX10-GISEL-NEXT:    v_ldexp_f16_e32 v0, v2, v0
+; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v1, 6, 0, vcc_lo
 ; GFX10-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v1, v3, 0x3c00, vcc_lo
+; GFX10-GISEL-NEXT:    v_ldexp_f16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX10-GISEL-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
-; GFX10-GISEL-NEXT:    v_pk_mul_f16 v0, v2, v0
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: v_mul_v2f16_select_1_64:
@@ -5798,14 +5570,15 @@ define <2 x half> @v_mul_v2f16_select_1_64(<2 x i32> %arg, <2 x half> %x) {
 ; GFX11-GISEL-LABEL: v_mul_v2f16_select_1_64:
 ; GFX11-GISEL:       ; %bb.0:
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_mov_b32_e32 v3, 0x5400
 ; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, v3, 0x3c00, vcc_lo
+; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
+; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, 6, 0, vcc_lo
 ; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
+; GFX11-GISEL-NEXT:    v_ldexp_f16_e32 v0, v2, v0
+; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v1, 6, 0, vcc_lo
 ; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v1, v3, 0x3c00, vcc_lo
+; GFX11-GISEL-NEXT:    v_ldexp_f16_e32 v1, v3, v1
 ; GFX11-GISEL-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
-; GFX11-GISEL-NEXT:    v_pk_mul_f16 v0, v2, v0
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %cond = icmp eq <2 x i32> %arg, zeroinitializer
   %select.pow2 = select <2 x i1> %cond, <2 x half> <half 1.0, half 1.0>, <2 x half> <half 64.0, half 64.0>
@@ -5830,15 +5603,14 @@ define <2 x half> @v_mul_v2f16_select_n1_n64(<2 x i32> %arg, <2 x half> %x) {
 ; GFX9-GISEL-LABEL: v_mul_v2f16_select_n1_n64:
 ; GFX9-GISEL:       ; %bb.0:
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0xbc00
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v4, 0xd400
 ; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v0, 6, 0, vcc
 ; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
-; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v1, v4, v3, vcc
-; GFX9-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX9-GISEL-NEXT:    v_xor_b32_e32 v2, 0x80008000, v2
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v1, 6, 0, vcc
+; GFX9-GISEL-NEXT:    v_ldexp_f16_e32 v0, v2, v0
+; GFX9-GISEL-NEXT:    v_ldexp_f16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX9-GISEL-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
-; GFX9-GISEL-NEXT:    v_pk_mul_f16 v0, v2, v0
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_mul_v2f16_select_n1_n64:
@@ -5856,14 +5628,15 @@ define <2 x half> @v_mul_v2f16_select_n1_n64(<2 x i32> %arg, <2 x half> %x) {
 ; GFX10-GISEL-LABEL: v_mul_v2f16_select_n1_n64:
 ; GFX10-GISEL:       ; %bb.0:
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v3, 0xd400
 ; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, v3, 0xbc00, vcc_lo
+; GFX10-GISEL-NEXT:    v_xor_b32_e32 v2, 0x80008000, v2
+; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, 6, 0, vcc_lo
 ; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
+; GFX10-GISEL-NEXT:    v_ldexp_f16_e32 v0, v2, v0
+; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v1, 6, 0, vcc_lo
 ; GFX10-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v1, v3, 0xbc00, vcc_lo
+; GFX10-GISEL-NEXT:    v_ldexp_f16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX10-GISEL-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
-; GFX10-GISEL-NEXT:    v_pk_mul_f16 v0, v2, v0
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: v_mul_v2f16_select_n1_n64:
@@ -5881,14 +5654,16 @@ define <2 x half> @v_mul_v2f16_select_n1_n64(<2 x i32> %arg, <2 x half> %x) {
 ; GFX11-GISEL-LABEL: v_mul_v2f16_select_n1_n64:
 ; GFX11-GISEL:       ; %bb.0:
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_mov_b32_e32 v3, 0xd400
 ; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, v3, 0xbc00, vcc_lo
+; GFX11-GISEL-NEXT:    v_xor_b32_e32 v2, 0x80008000, v2
+; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, 6, 0, vcc_lo
 ; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
+; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
+; GFX11-GISEL-NEXT:    v_ldexp_f16_e32 v0, v2, v0
+; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v1, 6, 0, vcc_lo
 ; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v1, v3, 0xbc00, vcc_lo
+; GFX11-GISEL-NEXT:    v_ldexp_f16_e32 v1, v3, v1
 ; GFX11-GISEL-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
-; GFX11-GISEL-NEXT:    v_pk_mul_f16 v0, v2, v0
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %cond = icmp eq <2 x i32> %arg, zeroinitializer
   %select.pow2 = select <2 x i1> %cond, <2 x half> <half -1.0, half -1.0>, <2 x half> <half -64.0, half -64.0>
@@ -5913,15 +5688,19 @@ define <2 x half> @v_mul_v2f16_select_128_64(<2 x i32> %arg, <2 x half> %x) {
 ; GFX9-GISEL-LABEL: v_mul_v2f16_select_128_64:
 ; GFX9-GISEL:       ; %bb.0:
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0x5800
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v4, 0x5400
 ; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
 ; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
-; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v1, v4, v3, vcc
-; GFX9-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX9-GISEL-NEXT:    v_add_u32_e32 v0, 6, v0
+; GFX9-GISEL-NEXT:    v_add_u32_e32 v1, 6, v1
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0xffff8000
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v4, 0x7fff
+; GFX9-GISEL-NEXT:    v_med3_i32 v0, v0, v3, v4
+; GFX9-GISEL-NEXT:    v_med3_i32 v1, v1, v3, v4
+; GFX9-GISEL-NEXT:    v_ldexp_f16_e32 v0, v2, v0
+; GFX9-GISEL-NEXT:    v_ldexp_f16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX9-GISEL-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
-; GFX9-GISEL-NEXT:    v_pk_mul_f16 v0, v2, v0
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_mul_v2f16_select_128_64:
@@ -5939,14 +5718,19 @@ define <2 x half> @v_mul_v2f16_select_128_64(<2 x i32> %arg, <2 x half> %x) {
 ; GFX10-GISEL-LABEL: v_mul_v2f16_select_128_64:
 ; GFX10-GISEL:       ; %bb.0:
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v3, 0x5400
 ; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, v3, 0x5800, vcc_lo
+; GFX10-GISEL-NEXT:    v_mov_b32_e32 v3, 0x7fff
+; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
 ; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
+; GFX10-GISEL-NEXT:    v_add_nc_u32_e32 v0, 6, v0
+; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
+; GFX10-GISEL-NEXT:    v_med3_i32 v0, 0xffff8000, v0, v3
+; GFX10-GISEL-NEXT:    v_add_nc_u32_e32 v1, 6, v1
+; GFX10-GISEL-NEXT:    v_ldexp_f16_e32 v0, v2, v0
+; GFX10-GISEL-NEXT:    v_med3_i32 v1, 0xffff8000, v1, v3
 ; GFX10-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v1, v3, 0x5800, vcc_lo
+; GFX10-GISEL-NEXT:    v_ldexp_f16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX10-GISEL-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
-; GFX10-GISEL-NEXT:    v_pk_mul_f16 v0, v2, v0
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: v_mul_v2f16_select_128_64:
@@ -5964,14 +5748,20 @@ define <2 x half> @v_mul_v2f16_select_128_64(<2 x i32> %arg, <2 x half> %x) {
 ; GFX11-GISEL-LABEL: v_mul_v2f16_select_128_64:
 ; GFX11-GISEL:       ; %bb.0:
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_mov_b32_e32 v3, 0x5400
 ; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, v3, 0x5800, vcc_lo
+; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
+; GFX11-GISEL-NEXT:    v_mov_b32_e32 v3, 0x7fff
+; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
 ; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
+; GFX11-GISEL-NEXT:    v_add_nc_u32_e32 v0, 6, v0
+; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
+; GFX11-GISEL-NEXT:    v_med3_i32 v0, 0xffff8000, v0, v3
+; GFX11-GISEL-NEXT:    v_add_nc_u32_e32 v1, 6, v1
+; GFX11-GISEL-NEXT:    v_ldexp_f16_e32 v0, v2, v0
+; GFX11-GISEL-NEXT:    v_med3_i32 v1, 0xffff8000, v1, v3
 ; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v1, v3, 0x5800, vcc_lo
+; GFX11-GISEL-NEXT:    v_ldexp_f16_e32 v1, v4, v1
 ; GFX11-GISEL-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
-; GFX11-GISEL-NEXT:    v_pk_mul_f16 v0, v2, v0
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %cond = icmp eq <2 x i32> %arg, zeroinitializer
   %select.pow2 = select <2 x i1> %cond, <2 x half> <half 128.0, half 128.0>, <2 x half> <half 64.0, half 64.0>
@@ -5996,15 +5786,20 @@ define <2 x half> @v_mul_v2f16_select_n128_n64(<2 x i32> %arg, <2 x half> %x) {
 ; GFX9-GISEL-LABEL: v_mul_v2f16_select_n128_n64:
 ; GFX9-GISEL:       ; %bb.0:
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0xd800
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v4, 0xd400
 ; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
 ; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
-; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v1, v4, v3, vcc
-; GFX9-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX9-GISEL-NEXT:    v_add_u32_e32 v0, 6, v0
+; GFX9-GISEL-NEXT:    v_add_u32_e32 v1, 6, v1
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0xffff8000
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v4, 0x7fff
+; GFX9-GISEL-NEXT:    v_xor_b32_e32 v2, 0x80008000, v2
+; GFX9-GISEL-NEXT:    v_med3_i32 v0, v0, v3, v4
+; GFX9-GISEL-NEXT:    v_med3_i32 v1, v1, v3, v4
+; GFX9-GISEL-NEXT:    v_ldexp_f16_e32 v0, v2, v0
+; GFX9-GISEL-NEXT:    v_ldexp_f16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX9-GISEL-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
-; GFX9-GISEL-NEXT:    v_pk_mul_f16 v0, v2, v0
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_mul_v2f16_select_n128_n64:
@@ -6022,14 +5817,20 @@ define <2 x half> @v_mul_v2f16_select_n128_n64(<2 x i32> %arg, <2 x half> %x) {
 ; GFX10-GISEL-LABEL: v_mul_v2f16_select_n128_n64:
 ; GFX10-GISEL:       ; %bb.0:
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v3, 0xd400
 ; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, v3, 0xd800, vcc_lo
+; GFX10-GISEL-NEXT:    v_mov_b32_e32 v3, 0x7fff
+; GFX10-GISEL-NEXT:    v_xor_b32_e32 v2, 0x80008000, v2
+; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
 ; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
+; GFX10-GISEL-NEXT:    v_add_nc_u32_e32 v0, 6, v0
+; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
+; GFX10-GISEL-NEXT:    v_med3_i32 v0, 0xffff8000, v0, v3
+; GFX10-GISEL-NEXT:    v_add_nc_u32_e32 v1, 6, v1
+; GFX10-GISEL-NEXT:    v_ldexp_f16_e32 v0, v2, v0
+; GFX10-GISEL-NEXT:    v_med3_i32 v1, 0xffff8000, v1, v3
 ; GFX10-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v1, v3, 0xd800, vcc_lo
+; GFX10-GISEL-NEXT:    v_ldexp_f16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX10-GISEL-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
-; GFX10-GISEL-NEXT:    v_pk_mul_f16 v0, v2, v0
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: v_mul_v2f16_select_n128_n64:
@@ -6047,14 +5848,21 @@ define <2 x half> @v_mul_v2f16_select_n128_n64(<2 x i32> %arg, <2 x half> %x) {
 ; GFX11-GISEL-LABEL: v_mul_v2f16_select_n128_n64:
 ; GFX11-GISEL:       ; %bb.0:
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_mov_b32_e32 v3, 0xd400
 ; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, v3, 0xd800, vcc_lo
+; GFX11-GISEL-NEXT:    v_xor_b32_e32 v2, 0x80008000, v2
+; GFX11-GISEL-NEXT:    v_mov_b32_e32 v3, 0x7fff
+; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
 ; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
+; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
+; GFX11-GISEL-NEXT:    v_add_nc_u32_e32 v0, 6, v0
+; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
+; GFX11-GISEL-NEXT:    v_med3_i32 v0, 0xffff8000, v0, v3
+; GFX11-GISEL-NEXT:    v_add_nc_u32_e32 v1, 6, v1
+; GFX11-GISEL-NEXT:    v_ldexp_f16_e32 v0, v2, v0
+; GFX11-GISEL-NEXT:    v_med3_i32 v1, 0xffff8000, v1, v3
 ; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v1, v3, 0xd800, vcc_lo
+; GFX11-GISEL-NEXT:    v_ldexp_f16_e32 v1, v4, v1
 ; GFX11-GISEL-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
-; GFX11-GISEL-NEXT:    v_pk_mul_f16 v0, v2, v0
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %cond = icmp eq <2 x i32> %arg, zeroinitializer
   %select.pow2 = select <2 x i1> %cond, <2 x half> <half -128.0, half -128.0>, <2 x half> <half -64.0, half -64.0>
@@ -6079,15 +5887,14 @@ define <2 x half> @v_mul_v2f16_select_n128_n16(<2 x i32> %arg, <2 x half> %x) {
 ; GFX9-GISEL-LABEL: v_mul_v2f16_select_n128_n16:
 ; GFX9-GISEL:       ; %bb.0:
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0xd800
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v4, 0xcc00
 ; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v0, 4, 7, vcc
 ; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
-; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v1, v4, v3, vcc
-; GFX9-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX9-GISEL-NEXT:    v_xor_b32_e32 v2, 0x80008000, v2
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v1, 4, 7, vcc
+; GFX9-GISEL-NEXT:    v_ldexp_f16_e32 v0, v2, v0
+; GFX9-GISEL-NEXT:    v_ldexp_f16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX9-GISEL-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
-; GFX9-GISEL-NEXT:    v_pk_mul_f16 v0, v2, v0
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_mul_v2f16_select_n128_n16:
@@ -6105,14 +5912,15 @@ define <2 x half> @v_mul_v2f16_select_n128_n16(<2 x i32> %arg, <2 x half> %x) {
 ; GFX10-GISEL-LABEL: v_mul_v2f16_select_n128_n16:
 ; GFX10-GISEL:       ; %bb.0:
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v3, 0xcc00
 ; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, v3, 0xd800, vcc_lo
+; GFX10-GISEL-NEXT:    v_xor_b32_e32 v2, 0x80008000, v2
+; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, 4, 7, vcc_lo
 ; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
+; GFX10-GISEL-NEXT:    v_ldexp_f16_e32 v0, v2, v0
+; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v1, 4, 7, vcc_lo
 ; GFX10-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v1, v3, 0xd800, vcc_lo
+; GFX10-GISEL-NEXT:    v_ldexp_f16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX10-GISEL-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
-; GFX10-GISEL-NEXT:    v_pk_mul_f16 v0, v2, v0
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: v_mul_v2f16_select_n128_n16:
@@ -6130,14 +5938,16 @@ define <2 x half> @v_mul_v2f16_select_n128_n16(<2 x i32> %arg, <2 x half> %x) {
 ; GFX11-GISEL-LABEL: v_mul_v2f16_select_n128_n16:
 ; GFX11-GISEL:       ; %bb.0:
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_mov_b32_e32 v3, 0xcc00
 ; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, v3, 0xd800, vcc_lo
+; GFX11-GISEL-NEXT:    v_xor_b32_e32 v2, 0x80008000, v2
+; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, 4, 7, vcc_lo
 ; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
+; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
+; GFX11-GISEL-NEXT:    v_ldexp_f16_e32 v0, v2, v0
+; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v1, 4, 7, vcc_lo
 ; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v1, v3, 0xd800, vcc_lo
+; GFX11-GISEL-NEXT:    v_ldexp_f16_e32 v1, v3, v1
 ; GFX11-GISEL-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
-; GFX11-GISEL-NEXT:    v_pk_mul_f16 v0, v2, v0
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %cond = icmp eq <2 x i32> %arg, zeroinitializer
   %select.pow2 = select <2 x i1> %cond, <2 x half> <half -128.0, half -128.0>, <2 x half> <half -16.0, half -16.0>
@@ -6162,15 +5972,14 @@ define <2 x half> @v_contract_mul_add_v2f16_select_64_1(<2 x i32> %arg, <2 x hal
 ; GFX9-GISEL-LABEL: v_contract_mul_add_v2f16_select_64_1:
 ; GFX9-GISEL:       ; %bb.0:
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v4, 0x5400
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v5, 0x3c00
 ; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v0, v5, v4, vcc
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 6, vcc
 ; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
-; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v1, v5, v4, vcc
-; GFX9-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 6, vcc
+; GFX9-GISEL-NEXT:    v_ldexp_f16_e32 v0, v2, v0
+; GFX9-GISEL-NEXT:    v_ldexp_f16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX9-GISEL-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
-; GFX9-GISEL-NEXT:    v_pk_fma_f16 v0, v2, v0, v3
+; GFX9-GISEL-NEXT:    v_pk_add_f16 v0, v0, v3
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_contract_mul_add_v2f16_select_64_1:
@@ -6188,14 +5997,15 @@ define <2 x half> @v_contract_mul_add_v2f16_select_64_1(<2 x i32> %arg, <2 x hal
 ; GFX10-GISEL-LABEL: v_contract_mul_add_v2f16_select_64_1:
 ; GFX10-GISEL:       ; %bb.0:
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v4, 0x3c00
 ; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, v4, 0x5400, vcc_lo
+; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 6, vcc_lo
 ; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
+; GFX10-GISEL-NEXT:    v_ldexp_f16_e32 v0, v2, v0
+; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 6, vcc_lo
 ; GFX10-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v1, v4, 0x5400, vcc_lo
+; GFX10-GISEL-NEXT:    v_ldexp_f16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX10-GISEL-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
-; GFX10-GISEL-NEXT:    v_pk_fma_f16 v0, v2, v0, v3
+; GFX10-GISEL-NEXT:    v_pk_add_f16 v0, v0, v3
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: v_contract_mul_add_v2f16_select_64_1:
@@ -6213,14 +6023,16 @@ define <2 x half> @v_contract_mul_add_v2f16_select_64_1(<2 x i32> %arg, <2 x hal
 ; GFX11-GISEL-LABEL: v_contract_mul_add_v2f16_select_64_1:
 ; GFX11-GISEL:       ; %bb.0:
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_mov_b32_e32 v4, 0x3c00
 ; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, v4, 0x5400, vcc_lo
+; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
+; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 6, vcc_lo
 ; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
+; GFX11-GISEL-NEXT:    v_ldexp_f16_e32 v0, v2, v0
+; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 6, vcc_lo
 ; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v1, v4, 0x5400, vcc_lo
+; GFX11-GISEL-NEXT:    v_ldexp_f16_e32 v1, v4, v1
 ; GFX11-GISEL-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
-; GFX11-GISEL-NEXT:    v_pk_fma_f16 v0, v2, v0, v3
+; GFX11-GISEL-NEXT:    v_pk_add_f16 v0, v0, v3
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %cond = icmp eq <2 x i32> %arg, zeroinitializer
   %select.pow2 = select <2 x i1> %cond, <2 x half> <half 64.0, half 64.0>, <2 x half> <half 1.0, half 1.0>
@@ -6246,15 +6058,14 @@ define <2 x half> @v_contract_mul_add_v2f16_select_1_64(<2 x i32> %arg, <2 x hal
 ; GFX9-GISEL-LABEL: v_contract_mul_add_v2f16_select_1_64:
 ; GFX9-GISEL:       ; %bb.0:
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v4, 0x3c00
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v5, 0x5400
 ; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v0, v5, v4, vcc
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v0, 6, 0, vcc
 ; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
-; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v1, v5, v4, vcc
-; GFX9-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v1, 6, 0, vcc
+; GFX9-GISEL-NEXT:    v_ldexp_f16_e32 v0, v2, v0
+; GFX9-GISEL-NEXT:    v_ldexp_f16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX9-GISEL-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
-; GFX9-GISEL-NEXT:    v_pk_fma_f16 v0, v2, v0, v3
+; GFX9-GISEL-NEXT:    v_pk_add_f16 v0, v0, v3
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_contract_mul_add_v2f16_select_1_64:
@@ -6272,14 +6083,15 @@ define <2 x half> @v_contract_mul_add_v2f16_select_1_64(<2 x i32> %arg, <2 x hal
 ; GFX10-GISEL-LABEL: v_contract_mul_add_v2f16_select_1_64:
 ; GFX10-GISEL:       ; %bb.0:
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v4, 0x5400
 ; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, v4, 0x3c00, vcc_lo
+; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, 6, 0, vcc_lo
 ; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
+; GFX10-GISEL-NEXT:    v_ldexp_f16_e32 v0, v2, v0
+; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v1, 6, 0, vcc_lo
 ; GFX10-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v1, v4, 0x3c00, vcc_lo
+; GFX10-GISEL-NEXT:    v_ldexp_f16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX10-GISEL-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
-; GFX10-GISEL-NEXT:    v_pk_fma_f16 v0, v2, v0, v3
+; GFX10-GISEL-NEXT:    v_pk_add_f16 v0, v0, v3
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: v_contract_mul_add_v2f16_select_1_64:
@@ -6297,14 +6109,16 @@ define <2 x half> @v_contract_mul_add_v2f16_select_1_64(<2 x i32> %arg, <2 x hal
 ; GFX11-GISEL-LABEL: v_contract_mul_add_v2f16_select_1_64:
 ; GFX11-GISEL:       ; %bb.0:
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_mov_b32_e32 v4, 0x5400
 ; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, v4, 0x3c00, vcc_lo
+; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
+; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, 6, 0, vcc_lo
 ; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
+; GFX11-GISEL-NEXT:    v_ldexp_f16_e32 v0, v2, v0
+; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v1, 6, 0, vcc_lo
 ; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v1, v4, 0x3c00, vcc_lo
+; GFX11-GISEL-NEXT:    v_ldexp_f16_e32 v1, v4, v1
 ; GFX11-GISEL-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
-; GFX11-GISEL-NEXT:    v_pk_fma_f16 v0, v2, v0, v3
+; GFX11-GISEL-NEXT:    v_pk_add_f16 v0, v0, v3
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %cond = icmp eq <2 x i32> %arg, zeroinitializer
   %select.pow2 = select <2 x i1> %cond, <2 x half> <half 1.0, half 1.0>, <2 x half> <half 64.0, half 64.0>
@@ -6330,15 +6144,15 @@ define <2 x half> @v_contract_mul_add_v2f16_select_n64_n1(<2 x i32> %arg, <2 x h
 ; GFX9-GISEL-LABEL: v_contract_mul_add_v2f16_select_n64_n1:
 ; GFX9-GISEL:       ; %bb.0:
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v4, 0xd400
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v5, 0xbc00
 ; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v0, v5, v4, vcc
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 6, vcc
 ; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
-; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v1, v5, v4, vcc
-; GFX9-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX9-GISEL-NEXT:    v_xor_b32_e32 v2, 0x80008000, v2
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 6, vcc
+; GFX9-GISEL-NEXT:    v_ldexp_f16_e32 v0, v2, v0
+; GFX9-GISEL-NEXT:    v_ldexp_f16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX9-GISEL-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
-; GFX9-GISEL-NEXT:    v_pk_fma_f16 v0, v2, v0, v3
+; GFX9-GISEL-NEXT:    v_pk_add_f16 v0, v0, v3
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_contract_mul_add_v2f16_select_n64_n1:
@@ -6356,14 +6170,16 @@ define <2 x half> @v_contract_mul_add_v2f16_select_n64_n1(<2 x i32> %arg, <2 x h
 ; GFX10-GISEL-LABEL: v_contract_mul_add_v2f16_select_n64_n1:
 ; GFX10-GISEL:       ; %bb.0:
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v4, 0xbc00
 ; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, v4, 0xd400, vcc_lo
+; GFX10-GISEL-NEXT:    v_xor_b32_e32 v2, 0x80008000, v2
+; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 6, vcc_lo
 ; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
+; GFX10-GISEL-NEXT:    v_ldexp_f16_e32 v0, v2, v0
+; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 6, vcc_lo
 ; GFX10-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v1, v4, 0xd400, vcc_lo
+; GFX10-GISEL-NEXT:    v_ldexp_f16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX10-GISEL-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
-; GFX10-GISEL-NEXT:    v_pk_fma_f16 v0, v2, v0, v3
+; GFX10-GISEL-NEXT:    v_pk_add_f16 v0, v0, v3
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: v_contract_mul_add_v2f16_select_n64_n1:
@@ -6381,14 +6197,17 @@ define <2 x half> @v_contract_mul_add_v2f16_select_n64_n1(<2 x i32> %arg, <2 x h
 ; GFX11-GISEL-LABEL: v_contract_mul_add_v2f16_select_n64_n1:
 ; GFX11-GISEL:       ; %bb.0:
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_mov_b32_e32 v4, 0xbc00
 ; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, v4, 0xd400, vcc_lo
+; GFX11-GISEL-NEXT:    v_xor_b32_e32 v2, 0x80008000, v2
+; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 6, vcc_lo
 ; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
+; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
+; GFX11-GISEL-NEXT:    v_ldexp_f16_e32 v0, v2, v0
+; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 6, vcc_lo
 ; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v1, v4, 0xd400, vcc_lo
+; GFX11-GISEL-NEXT:    v_ldexp_f16_e32 v1, v4, v1
 ; GFX11-GISEL-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
-; GFX11-GISEL-NEXT:    v_pk_fma_f16 v0, v2, v0, v3
+; GFX11-GISEL-NEXT:    v_pk_add_f16 v0, v0, v3
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %cond = icmp eq <2 x i32> %arg, zeroinitializer
   %select.pow2 = select <2 x i1> %cond, <2 x half> <half -64.0, half -64.0>, <2 x half> <half -1.0, half -1.0>
@@ -6414,15 +6233,15 @@ define <2 x half> @v_contract_mul_add_v2f16_select_n1_n64(<2 x i32> %arg, <2 x h
 ; GFX9-GISEL-LABEL: v_contract_mul_add_v2f16_select_n1_n64:
 ; GFX9-GISEL:       ; %bb.0:
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v4, 0xbc00
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v5, 0xd400
 ; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v0, v5, v4, vcc
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v0, 6, 0, vcc
 ; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
-; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v1, v5, v4, vcc
-; GFX9-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX9-GISEL-NEXT:    v_xor_b32_e32 v2, 0x80008000, v2
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v1, 6, 0, vcc
+; GFX9-GISEL-NEXT:    v_ldexp_f16_e32 v0, v2, v0
+; GFX9-GISEL-NEXT:    v_ldexp_f16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX9-GISEL-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
-; GFX9-GISEL-NEXT:    v_pk_fma_f16 v0, v2, v0, v3
+; GFX9-GISEL-NEXT:    v_pk_add_f16 v0, v0, v3
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_contract_mul_add_v2f16_select_n1_n64:
@@ -6440,14 +6259,16 @@ define <2 x half> @v_contract_mul_add_v2f16_select_n1_n64(<2 x i32> %arg, <2 x h
 ; GFX10-GISEL-LABEL: v_contract_mul_add_v2f16_select_n1_n64:
 ; GFX10-GISEL:       ; %bb.0:
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v4, 0xd400
 ; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, v4, 0xbc00, vcc_lo
+; GFX10-GISEL-NEXT:    v_xor_b32_e32 v2, 0x80008000, v2
+; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, 6, 0, vcc_lo
 ; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
+; GFX10-GISEL-NEXT:    v_ldexp_f16_e32 v0, v2, v0
+; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v1, 6, 0, vcc_lo
 ; GFX10-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v1, v4, 0xbc00, vcc_lo
+; GFX10-GISEL-NEXT:    v_ldexp_f16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX10-GISEL-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
-; GFX10-GISEL-NEXT:    v_pk_fma_f16 v0, v2, v0, v3
+; GFX10-GISEL-NEXT:    v_pk_add_f16 v0, v0, v3
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: v_contract_mul_add_v2f16_select_n1_n64:
@@ -6465,14 +6286,17 @@ define <2 x half> @v_contract_mul_add_v2f16_select_n1_n64(<2 x i32> %arg, <2 x h
 ; GFX11-GISEL-LABEL: v_contract_mul_add_v2f16_select_n1_n64:
 ; GFX11-GISEL:       ; %bb.0:
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_mov_b32_e32 v4, 0xd400
 ; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, v4, 0xbc00, vcc_lo
+; GFX11-GISEL-NEXT:    v_xor_b32_e32 v2, 0x80008000, v2
+; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, 6, 0, vcc_lo
 ; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
+; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
+; GFX11-GISEL-NEXT:    v_ldexp_f16_e32 v0, v2, v0
+; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v1, 6, 0, vcc_lo
 ; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v1, v4, 0xbc00, vcc_lo
+; GFX11-GISEL-NEXT:    v_ldexp_f16_e32 v1, v4, v1
 ; GFX11-GISEL-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
-; GFX11-GISEL-NEXT:    v_pk_fma_f16 v0, v2, v0, v3
+; GFX11-GISEL-NEXT:    v_pk_add_f16 v0, v0, v3
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %cond = icmp eq <2 x i32> %arg, zeroinitializer
   %select.pow2 = select <2 x i1> %cond, <2 x half> <half -1.0, half -1.0>, <2 x half> <half -64.0, half -64.0>
@@ -6498,15 +6322,20 @@ define <2 x half> @v_contract_mul_add_v2f16_select_128_64(<2 x i32> %arg, <2 x h
 ; GFX9-GISEL-LABEL: v_contract_mul_add_v2f16_select_128_64:
 ; GFX9-GISEL:       ; %bb.0:
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v4, 0x5800
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v5, 0x5400
 ; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v0, v5, v4, vcc
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
 ; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
-; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v1, v5, v4, vcc
-; GFX9-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX9-GISEL-NEXT:    v_add_u32_e32 v0, 6, v0
+; GFX9-GISEL-NEXT:    v_add_u32_e32 v1, 6, v1
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v4, 0xffff8000
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v5, 0x7fff
+; GFX9-GISEL-NEXT:    v_med3_i32 v0, v0, v4, v5
+; GFX9-GISEL-NEXT:    v_med3_i32 v1, v1, v4, v5
+; GFX9-GISEL-NEXT:    v_ldexp_f16_e32 v0, v2, v0
+; GFX9-GISEL-NEXT:    v_ldexp_f16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX9-GISEL-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
-; GFX9-GISEL-NEXT:    v_pk_fma_f16 v0, v2, v0, v3
+; GFX9-GISEL-NEXT:    v_pk_add_f16 v0, v0, v3
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_contract_mul_add_v2f16_select_128_64:
@@ -6524,14 +6353,20 @@ define <2 x half> @v_contract_mul_add_v2f16_select_128_64(<2 x i32> %arg, <2 x h
 ; GFX10-GISEL-LABEL: v_contract_mul_add_v2f16_select_128_64:
 ; GFX10-GISEL:       ; %bb.0:
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v4, 0x5400
 ; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, v4, 0x5800, vcc_lo
+; GFX10-GISEL-NEXT:    v_mov_b32_e32 v4, 0x7fff
+; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
 ; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
+; GFX10-GISEL-NEXT:    v_add_nc_u32_e32 v0, 6, v0
+; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
+; GFX10-GISEL-NEXT:    v_med3_i32 v0, 0xffff8000, v0, v4
+; GFX10-GISEL-NEXT:    v_add_nc_u32_e32 v1, 6, v1
+; GFX10-GISEL-NEXT:    v_ldexp_f16_e32 v0, v2, v0
+; GFX10-GISEL-NEXT:    v_med3_i32 v1, 0xffff8000, v1, v4
 ; GFX10-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v1, v4, 0x5800, vcc_lo
+; GFX10-GISEL-NEXT:    v_ldexp_f16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX10-GISEL-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
-; GFX10-GISEL-NEXT:    v_pk_fma_f16 v0, v2, v0, v3
+; GFX10-GISEL-NEXT:    v_pk_add_f16 v0, v0, v3
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: v_contract_mul_add_v2f16_select_128_64:
@@ -6549,14 +6384,20 @@ define <2 x half> @v_contract_mul_add_v2f16_select_128_64(<2 x i32> %arg, <2 x h
 ; GFX11-GISEL-LABEL: v_contract_mul_add_v2f16_select_128_64:
 ; GFX11-GISEL:       ; %bb.0:
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_mov_b32_e32 v4, 0x5400
 ; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, v4, 0x5800, vcc_lo
+; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
+; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
 ; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
+; GFX11-GISEL-NEXT:    v_add_nc_u32_e32 v0, 6, v0
+; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
+; GFX11-GISEL-NEXT:    v_dual_mov_b32 v4, 0x7fff :: v_dual_add_nc_u32 v1, 6, v1
+; GFX11-GISEL-NEXT:    v_med3_i32 v0, 0xffff8000, v0, v4
+; GFX11-GISEL-NEXT:    v_med3_i32 v1, 0xffff8000, v1, v4
+; GFX11-GISEL-NEXT:    v_ldexp_f16_e32 v0, v2, v0
+; GFX11-GISEL-NEXT:    v_ldexp_f16_e32 v1, v5, v1
 ; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v1, v4, 0x5800, vcc_lo
 ; GFX11-GISEL-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
-; GFX11-GISEL-NEXT:    v_pk_fma_f16 v0, v2, v0, v3
+; GFX11-GISEL-NEXT:    v_pk_add_f16 v0, v0, v3
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %cond = icmp eq <2 x i32> %arg, zeroinitializer
   %select.pow2 = select <2 x i1> %cond, <2 x half> <half 128.0, half 128.0>, <2 x half> <half 64.0, half 64.0>
@@ -6582,15 +6423,14 @@ define <2 x half> @v_contract_mul_add_v2f16_select_128_4(<2 x i32> %arg, <2 x ha
 ; GFX9-GISEL-LABEL: v_contract_mul_add_v2f16_select_128_4:
 ; GFX9-GISEL:       ; %bb.0:
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v4, 0x5800
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v5, 0x4400
 ; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v0, v5, v4, vcc
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v0, 2, 7, vcc
 ; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
-; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v1, v5, v4, vcc
-; GFX9-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v1, 2, 7, vcc
+; GFX9-GISEL-NEXT:    v_ldexp_f16_e32 v0, v2, v0
+; GFX9-GISEL-NEXT:    v_ldexp_f16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX9-GISEL-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
-; GFX9-GISEL-NEXT:    v_pk_fma_f16 v0, v2, v0, v3
+; GFX9-GISEL-NEXT:    v_pk_add_f16 v0, v0, v3
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_contract_mul_add_v2f16_select_128_4:
@@ -6608,14 +6448,15 @@ define <2 x half> @v_contract_mul_add_v2f16_select_128_4(<2 x i32> %arg, <2 x ha
 ; GFX10-GISEL-LABEL: v_contract_mul_add_v2f16_select_128_4:
 ; GFX10-GISEL:       ; %bb.0:
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v4, 0x4400
 ; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, v4, 0x5800, vcc_lo
+; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, 2, 7, vcc_lo
 ; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
+; GFX10-GISEL-NEXT:    v_ldexp_f16_e32 v0, v2, v0
+; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v1, 2, 7, vcc_lo
 ; GFX10-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v1, v4, 0x5800, vcc_lo
+; GFX10-GISEL-NEXT:    v_ldexp_f16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX10-GISEL-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
-; GFX10-GISEL-NEXT:    v_pk_fma_f16 v0, v2, v0, v3
+; GFX10-GISEL-NEXT:    v_pk_add_f16 v0, v0, v3
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: v_contract_mul_add_v2f16_select_128_4:
@@ -6633,14 +6474,16 @@ define <2 x half> @v_contract_mul_add_v2f16_select_128_4(<2 x i32> %arg, <2 x ha
 ; GFX11-GISEL-LABEL: v_contract_mul_add_v2f16_select_128_4:
 ; GFX11-GISEL:       ; %bb.0:
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_mov_b32_e32 v4, 0x4400
 ; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, v4, 0x5800, vcc_lo
+; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
+; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, 2, 7, vcc_lo
 ; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
+; GFX11-GISEL-NEXT:    v_ldexp_f16_e32 v0, v2, v0
+; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v1, 2, 7, vcc_lo
 ; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v1, v4, 0x5800, vcc_lo
+; GFX11-GISEL-NEXT:    v_ldexp_f16_e32 v1, v4, v1
 ; GFX11-GISEL-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
-; GFX11-GISEL-NEXT:    v_pk_fma_f16 v0, v2, v0, v3
+; GFX11-GISEL-NEXT:    v_pk_add_f16 v0, v0, v3
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %cond = icmp eq <2 x i32> %arg, zeroinitializer
   %select.pow2 = select <2 x i1> %cond, <2 x half> <half 128.0, half 128.0>, <2 x half> <half 4.0, half 4.0>
@@ -6666,15 +6509,20 @@ define <2 x half> @v_contract_mul_add_v2f16_select_2_4(<2 x i32> %arg, <2 x half
 ; GFX9-GISEL-LABEL: v_contract_mul_add_v2f16_select_2_4:
 ; GFX9-GISEL:       ; %bb.0:
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v4, 0x4000
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v5, 0x4400
 ; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v0, v5, v4, vcc
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc
 ; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
-; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v1, v5, v4, vcc
-; GFX9-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc
+; GFX9-GISEL-NEXT:    v_add_u32_e32 v0, 2, v0
+; GFX9-GISEL-NEXT:    v_add_u32_e32 v1, 2, v1
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v4, 0xffff8000
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v5, 0x7fff
+; GFX9-GISEL-NEXT:    v_med3_i32 v0, v0, v4, v5
+; GFX9-GISEL-NEXT:    v_med3_i32 v1, v1, v4, v5
+; GFX9-GISEL-NEXT:    v_ldexp_f16_e32 v0, v2, v0
+; GFX9-GISEL-NEXT:    v_ldexp_f16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX9-GISEL-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
-; GFX9-GISEL-NEXT:    v_pk_fma_f16 v0, v2, v0, v3
+; GFX9-GISEL-NEXT:    v_pk_add_f16 v0, v0, v3
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_contract_mul_add_v2f16_select_2_4:
@@ -6692,14 +6540,20 @@ define <2 x half> @v_contract_mul_add_v2f16_select_2_4(<2 x i32> %arg, <2 x half
 ; GFX10-GISEL-LABEL: v_contract_mul_add_v2f16_select_2_4:
 ; GFX10-GISEL:       ; %bb.0:
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v4, 0x4400
 ; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, v4, 0x4000, vcc_lo
+; GFX10-GISEL-NEXT:    v_mov_b32_e32 v4, 0x7fff
+; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
 ; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
+; GFX10-GISEL-NEXT:    v_add_nc_u32_e32 v0, 2, v0
+; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc_lo
+; GFX10-GISEL-NEXT:    v_med3_i32 v0, 0xffff8000, v0, v4
+; GFX10-GISEL-NEXT:    v_add_nc_u32_e32 v1, 2, v1
+; GFX10-GISEL-NEXT:    v_ldexp_f16_e32 v0, v2, v0
+; GFX10-GISEL-NEXT:    v_med3_i32 v1, 0xffff8000, v1, v4
 ; GFX10-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v1, v4, 0x4000, vcc_lo
+; GFX10-GISEL-NEXT:    v_ldexp_f16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX10-GISEL-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
-; GFX10-GISEL-NEXT:    v_pk_fma_f16 v0, v2, v0, v3
+; GFX10-GISEL-NEXT:    v_pk_add_f16 v0, v0, v3
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: v_contract_mul_add_v2f16_select_2_4:
@@ -6717,14 +6571,20 @@ define <2 x half> @v_contract_mul_add_v2f16_select_2_4(<2 x i32> %arg, <2 x half
 ; GFX11-GISEL-LABEL: v_contract_mul_add_v2f16_select_2_4:
 ; GFX11-GISEL:       ; %bb.0:
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_mov_b32_e32 v4, 0x4400
 ; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, v4, 0x4000, vcc_lo
+; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
+; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
 ; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
+; GFX11-GISEL-NEXT:    v_add_nc_u32_e32 v0, 2, v0
+; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc_lo
+; GFX11-GISEL-NEXT:    v_dual_mov_b32 v4, 0x7fff :: v_dual_add_nc_u32 v1, 2, v1
+; GFX11-GISEL-NEXT:    v_med3_i32 v0, 0xffff8000, v0, v4
+; GFX11-GISEL-NEXT:    v_med3_i32 v1, 0xffff8000, v1, v4
+; GFX11-GISEL-NEXT:    v_ldexp_f16_e32 v0, v2, v0
+; GFX11-GISEL-NEXT:    v_ldexp_f16_e32 v1, v5, v1
 ; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v1, v4, 0x4000, vcc_lo
 ; GFX11-GISEL-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
-; GFX11-GISEL-NEXT:    v_pk_fma_f16 v0, v2, v0, v3
+; GFX11-GISEL-NEXT:    v_pk_add_f16 v0, v0, v3
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %cond = icmp eq <2 x i32> %arg, zeroinitializer
   %select.pow2 = select <2 x i1> %cond, <2 x half> <half 2.0, half 2.0>, <2 x half> <half 4.0, half 4.0>
@@ -6750,15 +6610,14 @@ define <2 x half> @v_contract_mul_add_v2f16_select_4_128(<2 x i32> %arg, <2 x ha
 ; GFX9-GISEL-LABEL: v_contract_mul_add_v2f16_select_4_128:
 ; GFX9-GISEL:       ; %bb.0:
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v4, 0x4400
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v5, 0x5800
 ; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v0, v5, v4, vcc
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v0, 7, 2, vcc
 ; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
-; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v1, v5, v4, vcc
-; GFX9-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v1, 7, 2, vcc
+; GFX9-GISEL-NEXT:    v_ldexp_f16_e32 v0, v2, v0
+; GFX9-GISEL-NEXT:    v_ldexp_f16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX9-GISEL-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
-; GFX9-GISEL-NEXT:    v_pk_fma_f16 v0, v2, v0, v3
+; GFX9-GISEL-NEXT:    v_pk_add_f16 v0, v0, v3
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_contract_mul_add_v2f16_select_4_128:
@@ -6776,14 +6635,15 @@ define <2 x half> @v_contract_mul_add_v2f16_select_4_128(<2 x i32> %arg, <2 x ha
 ; GFX10-GISEL-LABEL: v_contract_mul_add_v2f16_select_4_128:
 ; GFX10-GISEL:       ; %bb.0:
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v4, 0x5800
 ; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, v4, 0x4400, vcc_lo
+; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, 7, 2, vcc_lo
 ; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
+; GFX10-GISEL-NEXT:    v_ldexp_f16_e32 v0, v2, v0
+; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v1, 7, 2, vcc_lo
 ; GFX10-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v1, v4, 0x4400, vcc_lo
+; GFX10-GISEL-NEXT:    v_ldexp_f16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX10-GISEL-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
-; GFX10-GISEL-NEXT:    v_pk_fma_f16 v0, v2, v0, v3
+; GFX10-GISEL-NEXT:    v_pk_add_f16 v0, v0, v3
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: v_contract_mul_add_v2f16_select_4_128:
@@ -6801,14 +6661,16 @@ define <2 x half> @v_contract_mul_add_v2f16_select_4_128(<2 x i32> %arg, <2 x ha
 ; GFX11-GISEL-LABEL: v_contract_mul_add_v2f16_select_4_128:
 ; GFX11-GISEL:       ; %bb.0:
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_mov_b32_e32 v4, 0x5800
 ; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, v4, 0x4400, vcc_lo
+; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
+; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, 7, 2, vcc_lo
 ; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
+; GFX11-GISEL-NEXT:    v_ldexp_f16_e32 v0, v2, v0
+; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v1, 7, 2, vcc_lo
 ; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v1, v4, 0x4400, vcc_lo
+; GFX11-GISEL-NEXT:    v_ldexp_f16_e32 v1, v4, v1
 ; GFX11-GISEL-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
-; GFX11-GISEL-NEXT:    v_pk_fma_f16 v0, v2, v0, v3
+; GFX11-GISEL-NEXT:    v_pk_add_f16 v0, v0, v3
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %cond = icmp eq <2 x i32> %arg, zeroinitializer
   %select.pow2 = select <2 x i1> %cond, <2 x half> <half 4.0, half 4.0>, <2 x half> <half 128.0, half 128.0>
diff --git a/llvm/test/CodeGen/AMDGPU/fract-match.ll b/llvm/test/CodeGen/AMDGPU/fract-match.ll
index f6ee007..80b4d64 100644
--- a/llvm/test/CodeGen/AMDGPU/fract-match.ll
+++ b/llvm/test/CodeGen/AMDGPU/fract-match.ll
@@ -14,6 +14,7 @@
 ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=kaveri < %s | FileCheck -check-prefixes=GCN,GFX7 %s
 ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX8 %s
 ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GCN,GFX11 %s
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GFX12 %s
 
 ; Test patterns to match v_fract_* instructions.
 
@@ -103,6 +104,21 @@ define float @safe_math_fract_f32(float %x, ptr addrspace(1) nocapture writeonly
 ; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc_lo
 ; GFX11-NEXT:    global_store_b32 v[1:2], v4, off
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: safe_math_fract_f32:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_fract_f32_e32 v3, v0
+; GFX12-NEXT:    v_cmp_neq_f32_e64 vcc_lo, 0x7f800000, |v0|
+; GFX12-NEXT:    v_floor_f32_e32 v4, v0
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX12-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc_lo
+; GFX12-NEXT:    global_store_b32 v[1:2], v4, off
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %floor = tail call float @llvm.floor.f32(float %x)
   %sub = fsub float %x, %floor
@@ -181,6 +197,18 @@ define float @safe_math_fract_f32_noinf_check(float %x, ptr addrspace(1) nocaptu
 ; GFX11-NEXT:    v_fract_f32_e32 v0, v0
 ; GFX11-NEXT:    global_store_b32 v[1:2], v3, off
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: safe_math_fract_f32_noinf_check:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_floor_f32_e32 v3, v0
+; GFX12-NEXT:    v_fract_f32_e32 v0, v0
+; GFX12-NEXT:    global_store_b32 v[1:2], v3, off
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %floor = tail call float @llvm.floor.f32(float %x)
   %sub = fsub float %x, %floor
@@ -263,6 +291,22 @@ define float @no_nan_check_math_fract_f32(float %x, ptr addrspace(1) nocapture w
 ; GFX11-NEXT:    v_min_f32_e32 v4, 0x3f7fffff, v4
 ; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0, v4, vcc_lo
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: no_nan_check_math_fract_f32:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_floor_f32_e32 v3, v0
+; GFX12-NEXT:    v_cmp_neq_f32_e64 vcc_lo, 0x7f800000, |v0|
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX12-NEXT:    v_sub_f32_e32 v4, v0, v3
+; GFX12-NEXT:    global_store_b32 v[1:2], v3, off
+; GFX12-NEXT:    v_min_num_f32_e32 v4, 0x3f7fffff, v4
+; GFX12-NEXT:    v_cndmask_b32_e32 v0, 0, v4, vcc_lo
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %floor = tail call float @llvm.floor.f32(float %x)
   %sub = fsub float %x, %floor
@@ -314,6 +358,16 @@ define float @basic_fract_f32_nonans(float nofpclass(nan) %x) {
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_fract_f32_e32 v0, v0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: basic_fract_f32_nonans:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_fract_f32_e32 v0, v0
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %floor = tail call float @llvm.floor.f32(float %x)
   %sub = fsub float %x, %floor
@@ -362,6 +416,19 @@ define float @basic_fract_f32_flags_minnum(float %x) {
 ; GFX11-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; GFX11-NEXT:    v_min_f32_e32 v0, 0x3f7fffff, v0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: basic_fract_f32_flags_minnum:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_floor_f32_e32 v1, v0
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX12-NEXT:    v_min_num_f32_e32 v0, 0x3f7fffff, v0
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %floor = tail call float @llvm.floor.f32(float %x)
   %sub = fsub float %x, %floor
@@ -409,6 +476,16 @@ define float @basic_fract_f32_flags_fsub(float nofpclass(nan) %x) {
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_fract_f32_e32 v0, v0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: basic_fract_f32_flags_fsub:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_fract_f32_e32 v0, v0
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %floor = tail call float @llvm.floor.f32(float %x)
   %sub = fsub nsz float %x, %floor
@@ -467,6 +544,17 @@ define <2 x float> @basic_fract_v2f32_nonans(<2 x float> nofpclass(nan) %x) {
 ; GFX11-NEXT:    v_fract_f32_e32 v0, v0
 ; GFX11-NEXT:    v_fract_f32_e32 v1, v1
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: basic_fract_v2f32_nonans:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_fract_f32_e32 v0, v0
+; GFX12-NEXT:    v_fract_f32_e32 v1, v1
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %floor = tail call <2 x float> @llvm.floor.v2f32(<2 x float> %x)
   %sub = fsub <2 x float> %x, %floor
@@ -540,6 +628,20 @@ define float @basic_fract_f32_multi_use_fsub_nonans(float nofpclass(nan) %x, ptr
 ; GFX11-NEXT:    v_fract_f32_e32 v0, v0
 ; GFX11-NEXT:    global_store_b32 v[1:2], v3, off
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: basic_fract_f32_multi_use_fsub_nonans:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_floor_f32_e32 v3, v0
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT:    v_sub_f32_e32 v3, v0, v3
+; GFX12-NEXT:    v_fract_f32_e32 v0, v0
+; GFX12-NEXT:    global_store_b32 v[1:2], v3, off
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %floor = tail call float @llvm.floor.f32(float %x)
   %sub = fsub float %x, %floor
@@ -588,6 +690,16 @@ define float @nnan_minnum_fract_f32(float %x) {
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_fract_f32_e32 v0, v0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: nnan_minnum_fract_f32:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_fract_f32_e32 v0, v0
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %floor = tail call float @llvm.floor.f32(float %x)
   %sub = fsub float %x, %floor
@@ -638,6 +750,19 @@ define float @nnan_fsub_fract_f32(float %x) {
 ; GFX11-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; GFX11-NEXT:    v_min_f32_e32 v0, 0x3f7fffff, v0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: nnan_fsub_fract_f32:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_floor_f32_e32 v1, v0
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX12-NEXT:    v_min_num_f32_e32 v0, 0x3f7fffff, v0
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %floor = tail call float @llvm.floor.f32(float %x)
   %sub = fsub nnan float %x, %floor
@@ -686,6 +811,19 @@ define float @nnan_floor_fract_f32(float %x) {
 ; GFX11-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; GFX11-NEXT:    v_min_f32_e32 v0, 0x3f7fffff, v0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: nnan_floor_fract_f32:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_floor_f32_e32 v1, v0
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX12-NEXT:    v_min_num_f32_e32 v0, 0x3f7fffff, v0
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %floor = tail call nnan float @llvm.floor.f32(float %x)
   %sub = fsub float %x, %floor
@@ -733,6 +871,16 @@ define float @nnan_src_fract_f32(float nofpclass(nan) %x) {
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_fract_f32_e32 v0, v0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: nnan_src_fract_f32:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_fract_f32_e32 v0, v0
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %floor = tail call float @llvm.floor.f32(float %x)
   %sub = fsub float %x, %floor
@@ -782,6 +930,19 @@ define float @not_fract_f32_wrong_const(float nofpclass(nan) %x) {
 ; GFX11-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; GFX11-NEXT:    v_min_f32_e32 v0, 0x3f7ffffe, v0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: not_fract_f32_wrong_const:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_floor_f32_e32 v1, v0
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX12-NEXT:    v_min_num_f32_e32 v0, 0x3f7ffffe, v0
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %floor = tail call float @llvm.floor.f32(float %x)
   %sub = fsub float %x, %floor
@@ -831,6 +992,19 @@ define float @not_fract_f32_swapped_fsub(float nofpclass(nan) %x) {
 ; GFX11-NEXT:    v_sub_f32_e32 v0, v1, v0
 ; GFX11-NEXT:    v_min_f32_e32 v0, 0x3f7fffff, v0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: not_fract_f32_swapped_fsub:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_floor_f32_e32 v1, v0
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT:    v_sub_f32_e32 v0, v1, v0
+; GFX12-NEXT:    v_min_num_f32_e32 v0, 0x3f7fffff, v0
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %floor = tail call float @llvm.floor.f32(float %x)
   %sub = fsub float %floor, %x
@@ -880,6 +1054,19 @@ define float @not_fract_f32_not_floor(float nofpclass(nan) %x) {
 ; GFX11-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; GFX11-NEXT:    v_min_f32_e32 v0, 0x3f7fffff, v0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: not_fract_f32_not_floor:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_trunc_f32_e32 v1, v0
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX12-NEXT:    v_min_num_f32_e32 v0, 0x3f7fffff, v0
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %floor = tail call float @llvm.trunc.f32(float %x)
   %sub = fsub float %x, %floor
@@ -929,6 +1116,19 @@ define float @not_fract_f32_different_floor(float %x, float %y) {
 ; GFX11-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; GFX11-NEXT:    v_min_f32_e32 v0, 0x3f7fffff, v0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: not_fract_f32_different_floor:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_floor_f32_e32 v1, v1
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX12-NEXT:    v_min_num_f32_e32 v0, 0x3f7fffff, v0
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %floor = tail call float @llvm.floor.f32(float %y)
   %sub = fsub float %x, %floor
@@ -978,6 +1178,19 @@ define float @not_fract_f32_maxnum(float nofpclass(nan) %x) {
 ; GFX11-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; GFX11-NEXT:    v_max_f32_e32 v0, 0x3f7fffff, v0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: not_fract_f32_maxnum:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_floor_f32_e32 v1, v0
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX12-NEXT:    v_max_num_f32_e32 v0, 0x3f7fffff, v0
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %floor = tail call float @llvm.floor.f32(float %x)
   %sub = fsub float %x, %floor
@@ -1000,6 +1213,15 @@ define float @fcmp_uno_check_is_nan_f32(float %x) {
 ; GCN:       ; %bb.0: ; %entry
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: fcmp_uno_check_is_nan_f32:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %floor = tail call float @llvm.floor.f32(float %x)
   %sub = fsub float %x, %floor
@@ -1054,6 +1276,16 @@ define float @select_nan_fract_f32(float %x) {
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_fract_f32_e32 v0, v0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: select_nan_fract_f32:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_fract_f32_e32 v0, v0
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %floor = tail call float @llvm.floor.f32(float %x)
   %sub = fsub float %x, %floor
@@ -1107,6 +1339,16 @@ define float @commuted_select_nan_fract_f32(float %x) {
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_fract_f32_e32 v0, v0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: commuted_select_nan_fract_f32:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_fract_f32_e32 v0, v0
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %floor = tail call float @llvm.floor.f32(float %x)
   %sub = fsub float %x, %floor
@@ -1168,6 +1410,22 @@ define float @wrong_commuted_nan_select_f32(float %x) {
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: wrong_commuted_nan_select_f32:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_floor_f32_e32 v1, v0
+; GFX12-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT:    v_sub_f32_e32 v1, v0, v1
+; GFX12-NEXT:    v_min_num_f32_e32 v1, 0x3f7fffff, v1
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %floor = tail call float @llvm.floor.f32(float %x)
   %sub = fsub float %x, %floor
@@ -1231,6 +1489,16 @@ define half @basic_fract_f16_nonan(half nofpclass(nan) %x) {
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_fract_f16_e32 v0, v0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: basic_fract_f16_nonan:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_fract_f16_e32 v0, v0
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %floor = tail call half @llvm.floor.f16(half %x)
   %sub = fsub half %x, %floor
@@ -1313,6 +1581,20 @@ define <2 x half> @basic_fract_v2f16_nonan(<2 x half> nofpclass(nan) %x) {
 ; GFX11-NEXT:    v_fract_f16_e32 v1, v1
 ; GFX11-NEXT:    v_pack_b32_f16 v0, v0, v1
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: basic_fract_v2f16_nonan:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX12-NEXT:    v_fract_f16_e32 v0, v0
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT:    v_fract_f16_e32 v1, v1
+; GFX12-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %floor = tail call <2 x half> @llvm.floor.v2f16(<2 x half> %x)
   %sub = fsub <2 x half> %x, %floor
@@ -1369,6 +1651,16 @@ define double @basic_fract_f64_nanans(double nofpclass(nan) %x) {
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_fract_f64_e32 v[0:1], v[0:1]
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: basic_fract_f64_nanans:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_fract_f64_e32 v[0:1], v[0:1]
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %floor = tail call double @llvm.floor.f64(double %x)
   %sub = fsub double %x, %floor
@@ -1461,6 +1753,18 @@ define half @safe_math_fract_f16_noinf_check(half %x, ptr addrspace(1) nocapture
 ; GFX11-NEXT:    v_fract_f16_e32 v0, v0
 ; GFX11-NEXT:    global_store_b16 v[1:2], v3, off
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: safe_math_fract_f16_noinf_check:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_floor_f16_e32 v3, v0
+; GFX12-NEXT:    v_fract_f16_e32 v0, v0
+; GFX12-NEXT:    global_store_b16 v[1:2], v3, off
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %floor = tail call half @llvm.floor.f16(half %x)
   %sub = fsub half %x, %floor
@@ -1546,6 +1850,18 @@ define double @safe_math_fract_f64_noinf_check(double %x, ptr addrspace(1) nocap
 ; GFX11-NEXT:    v_fract_f64_e32 v[0:1], v[0:1]
 ; GFX11-NEXT:    global_store_b64 v[2:3], v[4:5], off
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: safe_math_fract_f64_noinf_check:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_floor_f64_e32 v[4:5], v[0:1]
+; GFX12-NEXT:    v_fract_f64_e32 v[0:1], v[0:1]
+; GFX12-NEXT:    global_store_b64 v[2:3], v[4:5], off
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %floor = tail call double @llvm.floor.f64(double %x)
   %sub = fsub double %x, %floor
@@ -1600,6 +1916,16 @@ define float @select_nan_fract_f32_flags_select(float %x) {
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_fract_f32_e32 v0, v0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: select_nan_fract_f32_flags_select:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_fract_f32_e32 v0, v0
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %floor = tail call float @llvm.floor.f32(float %x)
   %sub = fsub float %x, %floor
@@ -1653,6 +1979,16 @@ define float @select_nan_fract_f32_flags_minnum(float %x) {
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_fract_f32_e32 v0, v0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: select_nan_fract_f32_flags_minnum:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_fract_f32_e32 v0, v0
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %floor = tail call float @llvm.floor.f32(float %x)
   %sub = fsub float %x, %floor
@@ -1769,6 +2105,25 @@ define <2 x float> @safe_math_fract_v2f32(<2 x float> %x, ptr addrspace(1) nocap
 ; GFX11-NEXT:    global_store_b64 v[2:3], v[4:5], off
 ; GFX11-NEXT:    v_cndmask_b32_e64 v1, v7, 0, s0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: safe_math_fract_v2f32:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_fract_f32_e32 v6, v0
+; GFX12-NEXT:    v_cmp_class_f32_e64 s0, v0, 0x204
+; GFX12-NEXT:    v_fract_f32_e32 v7, v1
+; GFX12-NEXT:    v_floor_f32_e32 v4, v0
+; GFX12-NEXT:    v_floor_f32_e32 v5, v1
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX12-NEXT:    v_cndmask_b32_e64 v0, v6, 0, s0
+; GFX12-NEXT:    v_cmp_class_f32_e64 s0, v1, 0x204
+; GFX12-NEXT:    global_store_b64 v[2:3], v[4:5], off
+; GFX12-NEXT:    v_cndmask_b32_e64 v1, v7, 0, s0
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %floor = tail call <2 x float> @llvm.floor.v2f32(<2 x float> %x)
   %sub = fsub <2 x float> %x, %floor
@@ -1881,6 +2236,21 @@ define double @safe_math_fract_f64(double %x, ptr addrspace(1) nocapture writeon
 ; GFX11-NEXT:    v_dual_cndmask_b32 v0, 0, v4 :: v_dual_cndmask_b32 v1, 0, v5
 ; GFX11-NEXT:    global_store_b64 v[2:3], v[6:7], off
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: safe_math_fract_f64:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_fract_f64_e32 v[4:5], v[0:1]
+; GFX12-NEXT:    v_cmp_neq_f64_e64 vcc_lo, 0x7ff00000, |v[0:1]|
+; GFX12-NEXT:    v_floor_f64_e32 v[6:7], v[0:1]
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX12-NEXT:    v_dual_cndmask_b32 v0, 0, v4 :: v_dual_cndmask_b32 v1, 0, v5
+; GFX12-NEXT:    global_store_b64 v[2:3], v[6:7], off
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %floor = tail call double @llvm.floor.f64(double %x)
   %sub = fsub double %x, %floor
@@ -2002,6 +2372,21 @@ define half @safe_math_fract_f16(half %x, ptr addrspace(1) nocapture writeonly %
 ; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc_lo
 ; GFX11-NEXT:    global_store_b16 v[1:2], v4, off
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: safe_math_fract_f16:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_fract_f16_e32 v3, v0
+; GFX12-NEXT:    v_cmp_neq_f16_e64 vcc_lo, 0x7c00, |v0|
+; GFX12-NEXT:    v_floor_f16_e32 v4, v0
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX12-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc_lo
+; GFX12-NEXT:    global_store_b16 v[1:2], v4, off
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %floor = tail call half @llvm.floor.f16(half %x)
   %sub = fsub half %x, %floor
@@ -2168,6 +2553,29 @@ define <2 x half> @safe_math_fract_v2f16(<2 x half> %x, ptr addrspace(1) nocaptu
 ; GFX11-NEXT:    global_store_b32 v[1:2], v4, off
 ; GFX11-NEXT:    v_pack_b32_f16 v0, v0, v3
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: safe_math_fract_v2f16:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
+; GFX12-NEXT:    v_fract_f16_e32 v6, v0
+; GFX12-NEXT:    v_floor_f16_e32 v5, v0
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX12-NEXT:    v_fract_f16_e32 v4, v3
+; GFX12-NEXT:    v_cmp_class_f16_e64 s0, v3, 0x204
+; GFX12-NEXT:    v_floor_f16_e32 v7, v3
+; GFX12-NEXT:    v_cndmask_b32_e64 v3, v4, 0, s0
+; GFX12-NEXT:    v_cmp_class_f16_e64 s0, v0, 0x204
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT:    v_pack_b32_f16 v4, v5, v7
+; GFX12-NEXT:    v_cndmask_b32_e64 v0, v6, 0, s0
+; GFX12-NEXT:    global_store_b32 v[1:2], v4, off
+; GFX12-NEXT:    v_pack_b32_f16 v0, v0, v3
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %floor = tail call <2 x half> @llvm.floor.v2f16(<2 x half> %x)
   %sub = fsub <2 x half> %x, %floor
@@ -2311,6 +2719,26 @@ define <2 x double> @safe_math_fract_v2f64(<2 x double> %x, ptr addrspace(1) noc
 ; GFX11-NEXT:    v_cndmask_b32_e64 v3, v13, 0, s1
 ; GFX11-NEXT:    global_store_b128 v[4:5], v[6:9], off
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: safe_math_fract_v2f64:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_fract_f64_e32 v[10:11], v[0:1]
+; GFX12-NEXT:    v_cmp_class_f64_e64 s0, v[0:1], 0x204
+; GFX12-NEXT:    v_fract_f64_e32 v[12:13], v[2:3]
+; GFX12-NEXT:    v_cmp_class_f64_e64 s1, v[2:3], 0x204
+; GFX12-NEXT:    v_floor_f64_e32 v[8:9], v[2:3]
+; GFX12-NEXT:    v_floor_f64_e32 v[6:7], v[0:1]
+; GFX12-NEXT:    v_cndmask_b32_e64 v0, v10, 0, s0
+; GFX12-NEXT:    v_cndmask_b32_e64 v1, v11, 0, s0
+; GFX12-NEXT:    v_cndmask_b32_e64 v2, v12, 0, s1
+; GFX12-NEXT:    v_cndmask_b32_e64 v3, v13, 0, s1
+; GFX12-NEXT:    global_store_b128 v[4:5], v[6:9], off
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %floor = tail call <2 x double> @llvm.floor.v2f64(<2 x double> %x)
   %sub = fsub <2 x double> %x, %floor
diff --git a/llvm/test/CodeGen/AMDGPU/fsqrt.f32.ll b/llvm/test/CodeGen/AMDGPU/fsqrt.f32.ll
index b300181..c1d5b58 100644
--- a/llvm/test/CodeGen/AMDGPU/fsqrt.f32.ll
+++ b/llvm/test/CodeGen/AMDGPU/fsqrt.f32.ll
@@ -2380,14 +2380,12 @@ define float @v_sqrt_f32_ulp2_contractable_rcp(float %x) {
 ; GISEL-IEEE:       ; %bb.0:
 ; GISEL-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-IEEE-NEXT:    v_mov_b32_e32 v1, 0x800000
-; GISEL-IEEE-NEXT:    v_mov_b32_e32 v2, 0x4b800000
 ; GISEL-IEEE-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
-; GISEL-IEEE-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; GISEL-IEEE-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v1, 0, 24, vcc
+; GISEL-IEEE-NEXT:    v_ldexp_f32_e32 v0, v0, v1
 ; GISEL-IEEE-NEXT:    v_rsq_f32_e32 v0, v0
-; GISEL-IEEE-NEXT:    v_mov_b32_e32 v1, 0x45800000
-; GISEL-IEEE-NEXT:    v_cndmask_b32_e32 v1, 1.0, v1, vcc
-; GISEL-IEEE-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v1, 0, 12, vcc
+; GISEL-IEEE-NEXT:    v_ldexp_f32_e32 v0, v0, v1
 ; GISEL-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN-DAZ-LABEL: v_sqrt_f32_ulp2_contractable_rcp:
@@ -2734,20 +2732,18 @@ define <2 x float> @v_sqrt_v2f32_ulp2_contractable_rcp(<2 x float> %x) {
 ; GISEL-IEEE:       ; %bb.0:
 ; GISEL-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-IEEE-NEXT:    v_mov_b32_e32 v2, 0x800000
-; GISEL-IEEE-NEXT:    v_mov_b32_e32 v3, 0x4b800000
 ; GISEL-IEEE-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v2
-; GISEL-IEEE-NEXT:    v_cndmask_b32_e32 v4, 1.0, v3, vcc
+; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v3, 0, 24, vcc
 ; GISEL-IEEE-NEXT:    v_cmp_lt_f32_e64 s[4:5], v1, v2
-; GISEL-IEEE-NEXT:    v_mul_f32_e32 v0, v0, v4
-; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v2, 1.0, v3, s[4:5]
+; GISEL-IEEE-NEXT:    v_ldexp_f32_e32 v0, v0, v3
+; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v2, 0, 24, s[4:5]
 ; GISEL-IEEE-NEXT:    v_rsq_f32_e32 v0, v0
-; GISEL-IEEE-NEXT:    v_mul_f32_e32 v1, v1, v2
+; GISEL-IEEE-NEXT:    v_ldexp_f32_e32 v1, v1, v2
 ; GISEL-IEEE-NEXT:    v_rsq_f32_e32 v1, v1
-; GISEL-IEEE-NEXT:    v_mov_b32_e32 v4, 0x45800000
-; GISEL-IEEE-NEXT:    v_cndmask_b32_e32 v2, 1.0, v4, vcc
-; GISEL-IEEE-NEXT:    v_mul_f32_e32 v0, v0, v2
-; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v2, 1.0, v4, s[4:5]
-; GISEL-IEEE-NEXT:    v_mul_f32_e32 v1, v1, v2
+; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v2, 0, 12, vcc
+; GISEL-IEEE-NEXT:    v_ldexp_f32_e32 v0, v0, v2
+; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v2, 0, 12, s[4:5]
+; GISEL-IEEE-NEXT:    v_ldexp_f32_e32 v1, v1, v2
 ; GISEL-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN-DAZ-LABEL: v_sqrt_v2f32_ulp2_contractable_rcp:
diff --git a/llvm/test/CodeGen/AMDGPU/gfx11-twoaddr-fma.mir b/llvm/test/CodeGen/AMDGPU/gfx11-twoaddr-fma.mir
index cefd240..85c6577 100644
--- a/llvm/test/CodeGen/AMDGPU/gfx11-twoaddr-fma.mir
+++ b/llvm/test/CodeGen/AMDGPU/gfx11-twoaddr-fma.mir
@@ -18,7 +18,7 @@ body:             |
     ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[DEF]].sub1
     ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[DEF]].sub0
     ; GFX11-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1078523331, implicit $exec
-    ; GFX11-NEXT: [[V_FMA_F16_gfx9_e64_:%[0-9]+]]:vgpr_32 = V_FMA_F16_gfx9_e64 0, killed [[COPY1]], 0, [[V_MOV_B32_e32_]], 0, killed [[COPY]], 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX11-NEXT: [[V_FMA_F16_gfx9_fake16_e64_:%[0-9]+]]:vgpr_32 = V_FMA_F16_gfx9_fake16_e64 0, killed [[COPY1]], 0, [[V_MOV_B32_e32_]], 0, killed [[COPY]], 0, 0, 0, implicit $mode, implicit $exec
     %0 = IMPLICIT_DEF
     %1 = COPY %0.sub1
     %2 = COPY %0.sub0
@@ -43,7 +43,7 @@ body:             |
     ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[DEF]].sub1
     ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[DEF]].sub0
     ; GFX11-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1078523331, implicit $exec
-    ; GFX11-NEXT: [[V_FMA_F16_gfx9_e64_:%[0-9]+]]:vgpr_32 = V_FMA_F16_gfx9_e64 0, [[COPY1]], 0, killed [[V_MOV_B32_e32_]], 0, killed [[COPY]], 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX11-NEXT: [[V_FMA_F16_gfx9_fake16_e64_:%[0-9]+]]:vgpr_32 = V_FMA_F16_gfx9_fake16_e64 0, [[COPY1]], 0, killed [[V_MOV_B32_e32_]], 0, killed [[COPY]], 0, 0, 0, implicit $mode, implicit $exec
     %0 = IMPLICIT_DEF
     %1 = COPY %0.sub1
     %2 = COPY %0.sub0
@@ -68,7 +68,7 @@ body:             |
     ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[DEF]].sub0
     ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[DEF]].sub1
     ; GFX11-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1078523331, implicit $exec
-    ; GFX11-NEXT: [[V_FMA_F16_gfx9_e64_:%[0-9]+]]:vgpr_32 = V_FMA_F16_gfx9_e64 0, killed [[COPY]], 0, [[COPY1]], 0, [[V_MOV_B32_e32_]], 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX11-NEXT: [[V_FMA_F16_gfx9_fake16_e64_:%[0-9]+]]:vgpr_32 = V_FMA_F16_gfx9_fake16_e64 0, killed [[COPY]], 0, [[COPY1]], 0, [[V_MOV_B32_e32_]], 0, 0, 0, implicit $mode, implicit $exec
     %0 = IMPLICIT_DEF
     %1 = COPY %0.sub0
     %2 = COPY %0.sub1
@@ -90,7 +90,7 @@ body:             |
     ; GFX11-NEXT: {{  $}}
     ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY killed $vgpr0
     ; GFX11-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 49664, implicit $exec
-    ; GFX11-NEXT: [[V_FMA_F16_gfx9_e64_:%[0-9]+]]:vgpr_32 = V_FMA_F16_gfx9_e64 0, 16384, 0, killed [[COPY]], 0, [[V_MOV_B32_e32_]], 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX11-NEXT: [[V_FMA_F16_gfx9_fake16_e64_:%[0-9]+]]:vgpr_32 = V_FMA_F16_gfx9_fake16_e64 0, 16384, 0, killed [[COPY]], 0, [[V_MOV_B32_e32_]], 0, 0, 0, implicit $mode, implicit $exec
     ; GFX11-NEXT: S_ENDPGM 0
     %0:vgpr_32 = COPY killed $vgpr0
 
diff --git a/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll b/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll
index 157f91c..b2f113f 100644
--- a/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll
@@ -668,37 +668,32 @@ define amdgpu_ps float @global_load_saddr_i8_offset_0xFFFFFFFF(ptr addrspace(1)
 define amdgpu_ps float @global_load_saddr_i8_offset_0x100000000(ptr addrspace(1) inreg %sbase) {
 ; GFX9-LABEL: global_load_saddr_i8_offset_0x100000000:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    v_mov_b32_e32 v1, s3
-; GFX9-NEXT:    v_add_co_u32_e64 v0, vcc, 0, s2
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 1, v1, vcc
-; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off
+; GFX9-NEXT:    s_add_i32 s3, s3, 1
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    global_load_ubyte v0, v0, s[2:3]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    ; return to shader part epilog
 ;
 ; GFX10-LABEL: global_load_saddr_i8_offset_0x100000000:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    v_add_co_u32 v0, s[0:1], 0, s2
-; GFX10-NEXT:    v_add_co_ci_u32_e64 v1, s[0:1], 1, s3, s[0:1]
-; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off
+; GFX10-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-NEXT:    s_add_i32 s3, s3, 1
+; GFX10-NEXT:    global_load_ubyte v0, v0, s[2:3]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    ; return to shader part epilog
 ;
 ; GFX11-LABEL: global_load_saddr_i8_offset_0x100000000:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    v_add_co_u32 v0, s[0:1], 0, s2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_add_co_ci_u32_e64 v1, null, 1, s3, s[0:1]
-; GFX11-NEXT:    global_load_u8 v0, v[0:1], off
+; GFX11-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-NEXT:    s_add_i32 s3, s3, 1
+; GFX11-NEXT:    global_load_u8 v0, v0, s[2:3]
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
 ;
 ; GFX12-SDAG-LABEL: global_load_saddr_i8_offset_0x100000000:
 ; GFX12-SDAG:       ; %bb.0:
-; GFX12-SDAG-NEXT:    s_mov_b32 s0, 0
-; GFX12-SDAG-NEXT:    s_mov_b32 s1, 1
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-SDAG-NEXT:    s_add_nc_u64 s[0:1], s[2:3], s[0:1]
-; GFX12-SDAG-NEXT:    s_load_u8 s0, s[0:1], 0x0
+; GFX12-SDAG-NEXT:    s_add_co_i32 s3, s3, 1
+; GFX12-SDAG-NEXT:    s_load_u8 s0, s[2:3], 0x0
 ; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-SDAG-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-SDAG-NEXT:    ; return to shader part epilog
@@ -934,37 +929,32 @@ define amdgpu_ps float @global_load_saddr_i8_offset_neg0xFFFFFFFF(ptr addrspace(
 define amdgpu_ps float @global_load_saddr_i8_offset_neg0x100000000(ptr addrspace(1) inreg %sbase) {
 ; GFX9-LABEL: global_load_saddr_i8_offset_neg0x100000000:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    v_mov_b32_e32 v1, s3
-; GFX9-NEXT:    v_add_co_u32_e64 v0, vcc, 0, s2
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
-; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off
+; GFX9-NEXT:    s_add_i32 s3, s3, -1
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    global_load_ubyte v0, v0, s[2:3]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    ; return to shader part epilog
 ;
 ; GFX10-LABEL: global_load_saddr_i8_offset_neg0x100000000:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    v_add_co_u32 v0, s[0:1], 0, s2
-; GFX10-NEXT:    v_add_co_ci_u32_e64 v1, s[0:1], -1, s3, s[0:1]
-; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off
+; GFX10-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-NEXT:    s_add_i32 s3, s3, -1
+; GFX10-NEXT:    global_load_ubyte v0, v0, s[2:3]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    ; return to shader part epilog
 ;
 ; GFX11-LABEL: global_load_saddr_i8_offset_neg0x100000000:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    v_add_co_u32 v0, s[0:1], 0, s2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_add_co_ci_u32_e64 v1, null, -1, s3, s[0:1]
-; GFX11-NEXT:    global_load_u8 v0, v[0:1], off
+; GFX11-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-NEXT:    s_add_i32 s3, s3, -1
+; GFX11-NEXT:    global_load_u8 v0, v0, s[2:3]
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
 ;
 ; GFX12-SDAG-LABEL: global_load_saddr_i8_offset_neg0x100000000:
 ; GFX12-SDAG:       ; %bb.0:
-; GFX12-SDAG-NEXT:    s_mov_b32 s0, 0
-; GFX12-SDAG-NEXT:    s_mov_b32 s1, -1
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-SDAG-NEXT:    s_add_nc_u64 s[0:1], s[2:3], s[0:1]
-; GFX12-SDAG-NEXT:    s_load_u8 s0, s[0:1], 0x0
+; GFX12-SDAG-NEXT:    s_add_co_i32 s3, s3, -1
+; GFX12-SDAG-NEXT:    s_load_u8 s0, s[2:3], 0x0
 ; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-SDAG-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-SDAG-NEXT:    ; return to shader part epilog
diff --git a/llvm/test/CodeGen/AMDGPU/inflated-reg-class-snippet-copy-use-after-free.mir b/llvm/test/CodeGen/AMDGPU/inflated-reg-class-snippet-copy-use-after-free.mir
new file mode 100644
index 0000000..503f27e
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/inflated-reg-class-snippet-copy-use-after-free.mir
@@ -0,0 +1,204 @@
+# RUN: not llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -simplify-mir -start-before=greedy,2 -stress-regalloc=4 -stop-before=virtregrewriter,2 -o - -verify-regalloc %s 2> %t.err | FileCheck %s
+# RUN: FileCheck -check-prefix=ERR %s < %t.err
+
+# To allocate the vreg_512_align2, the allocation will attempt to
+# inflate the register class to av_512_align2. This will ultimately
+# not work, and the allocation will fail. There is an unproductive
+# live range split, and we end up with a snippet copy of an
+# unspillable register. Recursive assignment of interfering ranges
+# during last chance recoloring would delete the unspillable snippet
+# live range. Make sure there's no use after free when rolling back
+# the last chance assignment.
+
+# ERR: error: <unknown>:0:0: ran out of registers during register allocation in function 'inflated_reg_class_copy_use_after_free'
+# ERR: error: <unknown>:0:0: ran out of registers during register allocation in function 'inflated_reg_class_copy_use_after_free_lane_subset'
+
+--- |
+  define amdgpu_kernel void @inflated_reg_class_copy_use_after_free() {
+    ret void
+  }
+
+  define amdgpu_kernel void @inflated_reg_class_copy_use_after_free_lane_subset() {
+    ret void
+  }
+
+...
+
+# CHECK-LABEL: name: inflated_reg_class_copy_use_after_free
+# CHECK: S_NOP 0, implicit-def [[ORIG_REG:%[0-9]+]].sub0_sub1_sub2_sub3
+# CHECK-NEXT: SI_SPILL_AV512_SAVE [[ORIG_REG]], %stack.0, $sgpr32, 0, implicit $exec :: (store (s512) into %stack.0, align 4, addrspace 5)
+# CHECK-NEXT: [[RESTORE0:%[0-9]+]]:vreg_512_align2 = SI_SPILL_V512_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s512) from %stack.0, align 4, addrspace 5)
+# CHECK-NEXT: early-clobber [[MFMA0:%[0-9]+]]:vreg_512_align2 = V_MFMA_F32_16X16X1F32_vgprcd_e64 undef %3:vgpr_32, undef %3:vgpr_32, [[RESTORE0]], 0, 0, 0, implicit $mode, implicit $exec, implicit $mode, implicit $exec
+# CHECK-NEXT: undef [[SPLIT0:%[0-9]+]].sub2_sub3:av_512_align2 = COPY [[MFMA0]].sub2_sub3 {
+# CHECK-NEXT: internal [[SPLIT0]].sub0:av_512_align2 = COPY [[MFMA0]].sub0
+# CHECK-NEXT: }
+# CHECK-NEXT: undef [[SPLIT1:%[0-9]+]].sub2_sub3:av_512_align2 = COPY [[SPLIT0]].sub2_sub3 {
+# CHECK-NEXT: internal [[SPLIT1]].sub0:av_512_align2 = COPY [[SPLIT0]].sub0
+# CHECK-NEXT: }
+# CHECK-NEXT: undef [[SPLIT2:%[0-9]+]].sub2_sub3:av_512_align2 = COPY [[SPLIT1]].sub2_sub3 {
+# CHECK-NEXT: internal [[SPLIT2]].sub0:av_512_align2 = COPY [[SPLIT1]].sub0
+# CHECK-NEXT: }
+# CHECK-NEXT: SI_SPILL_AV512_SAVE [[SPLIT2]], %stack.1, $sgpr32, 0, implicit $exec :: (store (s512) into %stack.1, align 4, addrspace 5)
+# CHECK-NEXT: [[RESTORE1:%[0-9]+]]:av_512_align2 = SI_SPILL_AV512_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s512) from %stack.0, align 4, addrspace 5)
+# CHECK-NEXT: undef [[SPLIT3:%[0-9]+]].sub0_sub1:av_512_align2 = COPY [[RESTORE1]].sub0_sub1
+# CHECK-NEXT: [[RESTORE2:%[0-9]+]]:av_512_align2 = SI_SPILL_AV512_RESTORE %stack.1, $sgpr32, 0, implicit $exec :: (load (s512) from %stack.1, align 4, addrspace 5)
+# CHECK-NEXT: undef [[SPLIT3:%[0-9]+]].sub2_sub3:av_512_align2 = COPY [[RESTORE2]].sub2_sub3 {
+# CHECK-NEXT: internal [[SPLIT3]].sub0:av_512_align2 = COPY [[RESTORE2]].sub0
+# CHECK-NEXT: }
+# CHECK-NEXT: undef [[SPLIT4:%[0-9]+]].sub2_sub3:av_512_align2 = COPY [[SPLIT3]].sub2_sub3 {
+# CHECK-NEXT: internal [[SPLIT4]].sub0:av_512_align2 = COPY [[SPLIT3]].sub0
+# CHECK-NEXT: }
+# CHECK-NEXT: [[SPLIT5:%[0-9]+]].sub2:av_512_align2 = COPY [[SPLIT4]].sub3
+# CHECK-NEXT: undef [[SPLIT6:%[0-9]+]].sub0_sub1_sub2:av_512_align2 = COPY [[SPLIT5]].sub0_sub1_sub2
+# CHECK-NEXT: undef [[SPLIT7:%[0-9]+]].sub0_sub1_sub2:av_512_align2 = COPY [[SPLIT6]].sub0_sub1_sub2
+# CHECK-NEXT: undef [[SPLIT8:%[0-9]+]].sub0:av_512_align2 = COPY [[SPLIT4]].sub0 {
+# CHECK-NEXT: internal [[SPLIT8]].sub2:av_512_align2 = COPY [[SPLIT4]].sub2
+# CHECK-NEXT: }
+# CHECK-NEXT: [[SPLIT9:%[0-9]+]].sub3:av_512_align2 = COPY [[SPLIT8]].sub2
+# CHECK-NEXT: undef [[SPLIT10:%[0-9]+]].sub0_sub1_sub2_sub3:av_512_align2 = COPY [[SPLIT9]].sub0_sub1_sub2_sub3
+# CHECK-NEXT: undef [[SPLIT13:%[0-9]+]].sub0_sub1_sub2_sub3:vreg_512_align2 = COPY [[SPLIT10]].sub0_sub1_sub2_sub3
+# CHECK-NEXT: [[MFMA_USE1:%[0-9]+]].sub4:vreg_512_align2 = COPY [[SPLIT8]].sub0
+# CHECK-NEXT: [[MFMA_USE1]].sub5:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec
+# CHECK-NEXT: [[MFMA_USE1]].sub6:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec
+# CHECK-NEXT: [[MFMA_USE1]].sub7:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec
+# CHECK-NEXT: [[MFMA_USE1]].sub8:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec
+# CHECK-NEXT: [[MFMA_USE1]].sub9:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec
+# CHECK-NEXT: [[MFMA_USE1]].sub10:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec
+# CHECK-NEXT: [[MFMA_USE1]].sub11:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec
+# CHECK-NEXT: [[MFMA_USE1]].sub12:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec
+# CHECK-NEXT: [[MFMA_USE1]].sub13:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec
+# CHECK-NEXT: [[MFMA_USE1]].sub14:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec
+# CHECK-NEXT: [[MFMA_USE1]].sub15:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec
+# CHECK-NEXT: [[MFMA_USE1]]:vreg_512_align2 = V_MFMA_F32_16X16X1F32_mac_vgprcd_e64 undef %3:vgpr_32, undef %3:vgpr_32, [[MFMA_USE1]], 0, 0, 0, implicit $mode, implicit $exec
+
+---
+name:            inflated_reg_class_copy_use_after_free
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
+  scratchRSrcReg:  '$sgpr72_sgpr73_sgpr74_sgpr75'
+  stackPtrOffsetReg: '$sgpr32'
+  occupancy:       7
+  vgprForAGPRCopy: '$vgpr255'
+  sgprForEXECCopy: '$sgpr74_sgpr75'
+body:             |
+  bb.0:
+    liveins: $vgpr0, $sgpr4_sgpr5
+
+    %0:vgpr_32 = IMPLICIT_DEF
+    renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed undef renamable $sgpr4_sgpr5, 0, 0 :: (load (s64), addrspace 4)
+    S_NOP 0, implicit-def undef %1.sub12_sub13_sub14_sub15:vreg_512_align2
+    S_NOP 0, implicit-def %1.sub8_sub9_sub10_sub11:vreg_512_align2
+    S_NOP 0, implicit-def %1.sub4_sub5_sub6_sub7:vreg_512_align2
+    S_NOP 0, implicit-def %1.sub0_sub1_sub2_sub3:vreg_512_align2
+    early-clobber %2:vreg_512_align2 = V_MFMA_F32_16X16X1F32_vgprcd_e64 undef %3:vgpr_32, undef %3:vgpr_32, %1, 0, 0, 0, implicit $mode, implicit $exec, implicit $mode, implicit $exec
+    %1.sub2:vreg_512_align2 = COPY %2.sub3
+    %1.sub3:vreg_512_align2 = COPY %2.sub2
+    %1.sub4:vreg_512_align2 = COPY %2.sub0
+    %1.sub5:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec
+    %1.sub6:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec
+    %1.sub7:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec
+    %1.sub8:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec
+    %1.sub9:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec
+    %1.sub10:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec
+    %1.sub11:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec
+    %1.sub12:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec
+    %1.sub13:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec
+    %1.sub14:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec
+    %1.sub15:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec
+    %1:vreg_512_align2 = V_MFMA_F32_16X16X1F32_mac_vgprcd_e64 undef %3:vgpr_32, undef %3:vgpr_32, %1, 0, 0, 0, implicit $mode, implicit $exec
+    GLOBAL_STORE_DWORDX4_SADDR undef %3:vgpr_32, %1.sub12_sub13_sub14_sub15, undef renamable $sgpr0_sgpr1, 96, 0, implicit $exec :: (store (s128), addrspace 1)
+    S_ENDPGM 0
+
+...
+
+# This test is similar to except it is still broken when the use
+# instruction does not read the full set of lanes after one attempted fix.
+
+# CHECK-LABEL: name: inflated_reg_class_copy_use_after_free_lane_subset
+# CHECK: S_NOP 0, implicit-def [[ORIG_REG:%[0-9]+]].sub0_sub1_sub2_sub3
+# CHECK-NEXT: SI_SPILL_AV512_SAVE [[ORIG_REG]], %stack.0, $sgpr32, 0, implicit $exec :: (store (s512) into %stack.0, align 4, addrspace 5)
+# CHECK-NEXT: [[RESTORE_0:%[0-9]+]]:av_512_align2 = SI_SPILL_AV512_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s512) from %stack.0, align 4, addrspace 5)
+# CHECK-NEXT: S_NOP 0, implicit-def early-clobber [[REG1:%[0-9]+]], implicit [[RESTORE_0]].sub0_sub1_sub2_sub3, implicit [[RESTORE_0]].sub4_sub5_sub6_sub7
+# CHECK-NEXT: undef [[SPLIT0:%[0-9]+]].sub2_sub3:av_512_align2 = COPY [[REG1]].sub2_sub3 {
+# CHECK-NEXT: internal [[SPLIT0]].sub0:av_512_align2 = COPY [[REG1]].sub0
+# CHECK-NEXT: }
+# CHECK-NEXT: undef [[SPLIT1:%[0-9]+]].sub2_sub3:av_512_align2 = COPY [[SPLIT0]].sub2_sub3 {
+# CHECK-NEXT: internal [[SPLIT1]].sub0:av_512_align2 = COPY [[SPLIT0]].sub0
+# CHECK-NEXT: }
+# CHECK-NEXT: undef [[SPLIT2:%[0-9]+]].sub2_sub3:av_512_align2 = COPY [[SPLIT1]].sub2_sub3 {
+# CHECK-NEXT: internal [[SPLIT2]].sub0:av_512_align2 = COPY [[SPLIT1]].sub0
+# CHECK-NEXT: }
+# CHECK-NEXT: SI_SPILL_AV512_SAVE [[SPLIT2]], %stack.1, $sgpr32, 0, implicit $exec :: (store (s512) into %stack.1, align 4, addrspace 5)
+# CHECK-NEXT: [[RESTORE_1:%[0-9]+]]:av_512_align2 = SI_SPILL_AV512_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s512) from %stack.0, align 4, addrspace 5)
+# CHECK-NEXT: undef [[SPLIT3:%[0-9]+]].sub0_sub1:av_512_align2 = COPY [[RESTORE_1]].sub0_sub1
+# CHECK-NEXT: [[RESTORE_2:%[0-9]+]]:av_512_align2 = SI_SPILL_AV512_RESTORE %stack.1, $sgpr32, 0, implicit $exec :: (load (s512) from %stack.1, align 4, addrspace 5)
+# CHECK-NEXT: undef [[SPLIT4:%[0-9]+]].sub2_sub3:av_512_align2 = COPY [[RESTORE_2]].sub2_sub3 {
+# CHECK-NEXT: internal [[SPLIT4]].sub0:av_512_align2 = COPY [[RESTORE_2]].sub0
+# CHECK-NEXT: }
+# CHECK-NEXT: undef [[SPLIT5:%[0-9]+]].sub2_sub3:av_512_align2 = COPY [[SPLIT4]].sub2_sub3 {
+# CHECK-NEXT: internal [[SPLIT5]].sub0:av_512_align2 = COPY [[SPLIT4]].sub0
+# CHECK-NEXT: }
+# CHECK-NEXT: [[SPLIT3]].sub2:av_512_align2 = COPY [[SPLIT5]].sub3
+# CHECK-NEXT: undef [[SPLIT6:%[0-9]+]].sub0_sub1_sub2:av_512_align2 = COPY [[SPLIT3]].sub0_sub1_sub2
+# CHECK-NEXT: undef [[SPLIT7:%[0-9]+]].sub0_sub1_sub2:av_512_align2 = COPY [[SPLIT6]].sub0_sub1_sub2
+# CHECK-NEXT: undef [[SPLIT8:%[0-9]+]].sub0:av_512_align2 = COPY [[SPLIT5]].sub0 {
+# CHECK-NEXT: internal [[SPLIT8]].sub2:av_512_align2 = COPY [[SPLIT5]].sub2
+# CHECK-NEXT: }
+# CHECK-NEXT: [[SPLIT7]].sub3:av_512_align2 = COPY [[SPLIT8]].sub2
+# CHECK-NEXT: undef [[SPLIT9:%[0-9]+]].sub0_sub1_sub2_sub3:av_512_align2 = COPY [[SPLIT7]].sub0_sub1_sub2_sub3
+# CHECK-NEXT: undef [[LAST_USE:%[0-9]+]].sub0_sub1_sub2_sub3:vreg_512_align2 = COPY [[SPLIT9]].sub0_sub1_sub2_sub3
+# CHECK-NEXT: [[LAST_USE]].sub4:vreg_512_align2 = COPY [[SPLIT8]].sub0
+# CHECK-NEXT: [[LAST_USE]].sub5:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec
+# CHECK-NEXT: [[LAST_USE]].sub6:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec
+# CHECK-NEXT: [[LAST_USE]].sub7:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec
+# CHECK-NEXT: [[LAST_USE]].sub8:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec
+# CHECK-NEXT: [[LAST_USE]].sub9:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec
+# CHECK-NEXT: [[LAST_USE]].sub10:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec
+# CHECK-NEXT: [[LAST_USE]].sub11:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec
+# CHECK-NEXT: [[LAST_USE]].sub12:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec
+# CHECK-NEXT: [[LAST_USE]].sub13:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec
+# CHECK-NEXT: [[LAST_USE]].sub14:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec
+# CHECK-NEXT: [[LAST_USE]].sub15:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec
+# CHECK-NEXT: S_NOP 0, implicit-def [[LAST_USE]], implicit [[LAST_USE]].sub0_sub1_sub2_sub3, implicit [[LAST_USE]].sub4_sub5_sub6_sub7, implicit [[LAST_USE]].sub8_sub9_sub10_sub11
+
+---
+name:            inflated_reg_class_copy_use_after_free_lane_subset
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
+  scratchRSrcReg:  '$sgpr72_sgpr73_sgpr74_sgpr75'
+  stackPtrOffsetReg: '$sgpr32'
+  occupancy:       7
+  vgprForAGPRCopy: '$vgpr255'
+  sgprForEXECCopy: '$sgpr74_sgpr75'
+body:             |
+  bb.0:
+    liveins: $vgpr0, $sgpr4_sgpr5
+
+    %0:vgpr_32 = IMPLICIT_DEF
+    renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed undef renamable $sgpr4_sgpr5, 0, 0 :: (load (s64), addrspace 4)
+    S_NOP 0, implicit-def undef %1.sub12_sub13_sub14_sub15:vreg_512_align2
+    S_NOP 0, implicit-def %1.sub8_sub9_sub10_sub11:vreg_512_align2
+    S_NOP 0, implicit-def %1.sub4_sub5_sub6_sub7:vreg_512_align2
+    S_NOP 0, implicit-def %1.sub0_sub1_sub2_sub3:vreg_512_align2
+    S_NOP 0, implicit-def early-clobber %2:vreg_512_align2, implicit %1.sub0_sub1_sub2_sub3, implicit %1.sub4_sub5_sub6_sub7
+    %1.sub2:vreg_512_align2 = COPY %2.sub3
+    %1.sub3:vreg_512_align2 = COPY %2.sub2
+    %1.sub4:vreg_512_align2 = COPY %2.sub0
+    %1.sub5:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec
+    %1.sub6:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec
+    %1.sub7:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec
+    %1.sub8:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec
+    %1.sub9:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec
+    %1.sub10:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec
+    %1.sub11:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec
+    %1.sub12:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec
+    %1.sub13:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec
+    %1.sub14:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec
+    %1.sub15:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec
+    S_NOP 0, implicit-def %1:vreg_512_align2, implicit %1.sub0_sub1_sub2_sub3, implicit %1.sub4_sub5_sub6_sub7, implicit %1.sub8_sub9_sub10_sub11
+    GLOBAL_STORE_DWORDX4_SADDR undef %3:vgpr_32, %1.sub12_sub13_sub14_sub15, undef renamable $sgpr0_sgpr1, 96, 0, implicit $exec :: (store (s128), addrspace 1)
+    S_ENDPGM 0
+
+...
diff --git a/llvm/test/CodeGen/AMDGPU/inline-asm.i128.ll b/llvm/test/CodeGen/AMDGPU/inline-asm.i128.ll
index cf9fdbd..2ceaca3 100644
--- a/llvm/test/CodeGen/AMDGPU/inline-asm.i128.ll
+++ b/llvm/test/CodeGen/AMDGPU/inline-asm.i128.ll
@@ -8,16 +8,16 @@
 define amdgpu_kernel void @s_input_output_i128() {
   ; GFX908-LABEL: name: s_input_output_i128
   ; GFX908: bb.0 (%ir-block.0):
-  ; GFX908-NEXT:   INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 7405578 /* regdef:SGPR_128 */, def %12
+  ; GFX908-NEXT:   INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 7471114 /* regdef:SGPR_128 */, def %12
   ; GFX908-NEXT:   [[COPY:%[0-9]+]]:sgpr_128 = COPY %12
-  ; GFX908-NEXT:   INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 7405577 /* reguse:SGPR_128 */, [[COPY]]
+  ; GFX908-NEXT:   INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 7471113 /* reguse:SGPR_128 */, [[COPY]]
   ; GFX908-NEXT:   S_ENDPGM 0
   ;
   ; GFX90A-LABEL: name: s_input_output_i128
   ; GFX90A: bb.0 (%ir-block.0):
-  ; GFX90A-NEXT:   INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 7405578 /* regdef:SGPR_128 */, def %10
+  ; GFX90A-NEXT:   INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 7471114 /* regdef:SGPR_128 */, def %10
   ; GFX90A-NEXT:   [[COPY:%[0-9]+]]:sgpr_128 = COPY %10
-  ; GFX90A-NEXT:   INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 7405577 /* reguse:SGPR_128 */, [[COPY]]
+  ; GFX90A-NEXT:   INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 7471113 /* reguse:SGPR_128 */, [[COPY]]
   ; GFX90A-NEXT:   S_ENDPGM 0
   %val = tail call i128 asm sideeffect "; def $0", "=s"()
   call void asm sideeffect "; use $0", "s"(i128 %val)
diff --git a/llvm/test/CodeGen/AMDGPU/issue121601-combine-concat-vectors-assumes-f16.ll b/llvm/test/CodeGen/AMDGPU/issue121601-combine-concat-vectors-assumes-f16.ll
new file mode 100644
index 0000000..1a87887
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/issue121601-combine-concat-vectors-assumes-f16.ll
@@ -0,0 +1,19 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 < %s | FileCheck %s
+
+define <4 x float> @issue121601(bfloat %fptrunc) {
+; CHECK-LABEL: issue121601:
+; CHECK:       ; %bb.0: ; %bb
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; CHECK-NEXT:    v_mov_b32_e32 v1, v0
+; CHECK-NEXT:    v_mov_b32_e32 v2, 0
+; CHECK-NEXT:    v_mov_b32_e32 v3, 0
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
+bb:
+  %bitcast = bitcast bfloat %fptrunc to <1 x bfloat>
+  %shufflevector = shufflevector <1 x bfloat> %bitcast, <1 x bfloat> zeroinitializer, <2 x i32> zeroinitializer
+  %fpext = fpext <2 x bfloat> %shufflevector to <2 x float>
+  %shufflevector1 = shufflevector <2 x float> %fpext, <2 x float> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  ret <4 x float> %shufflevector1
+}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll
index 3ff759a..867025a 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll
@@ -4,6 +4,7 @@
 ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX9 %s
 ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX10 %s
 ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX11 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX12 %s
 
 define amdgpu_kernel void @cos_f16(ptr addrspace(1) %r, ptr addrspace(1) %a) {
 ; GFX6-LABEL: cos_f16:
@@ -80,6 +81,19 @@ define amdgpu_kernel void @cos_f16(ptr addrspace(1) %r, ptr addrspace(1) %a) {
 ; GFX11-NEXT:    v_cos_f16_e32 v1, v1
 ; GFX11-NEXT:    global_store_b16 v0, v1, s[0:1]
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: cos_f16:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    global_load_u16 v1, v0, s[2:3]
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    v_mul_f16_e32 v1, 0.15915494, v1
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT:    v_cos_f16_e32 v1, v1
+; GFX12-NEXT:    global_store_b16 v0, v1, s[0:1]
+; GFX12-NEXT:    s_endpgm
   %a.val = load half, ptr addrspace(1) %a
   %r.val = call half @llvm.cos.f16(half %a.val)
   store half %r.val, ptr addrspace(1) %r
@@ -188,6 +202,24 @@ define amdgpu_kernel void @cos_v2f16(ptr addrspace(1) %r, ptr addrspace(1) %a) {
 ; GFX11-NEXT:    v_pack_b32_f16 v1, v1, v2
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: cos_v2f16:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    global_load_b32 v1, v0, s[2:3]
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
+; GFX12-NEXT:    v_mul_f16_e32 v1, 0.15915494, v1
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT:    v_mul_f16_e32 v2, 0.15915494, v2
+; GFX12-NEXT:    v_cos_f16_e32 v1, v1
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1)
+; GFX12-NEXT:    v_cos_f16_e32 v2, v2
+; GFX12-NEXT:    v_pack_b32_f16 v1, v1, v2
+; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT:    s_endpgm
   %a.val = load <2 x half>, ptr addrspace(1) %a
   %r.val = call <2 x half> @llvm.cos.v2f16(<2 x half> %a.val)
   store <2 x half> %r.val, ptr addrspace(1) %r
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll b/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll
index ac51580..333d428 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll
@@ -41,10 +41,10 @@ define amdgpu_kernel void @s_exp2_f32(ptr addrspace(1) %out, float %in) {
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v1, vcc
 ; SI-GISEL-NEXT:    v_add_f32_e32 v0, s2, v0
 ; SI-GISEL-NEXT:    v_exp_f32_e32 v0, v0
-; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x1f800000
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v1, vcc
+; SI-GISEL-NEXT:    v_not_b32_e32 v1, 63
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; SI-GISEL-NEXT:    s_mov_b32 s2, -1
-; SI-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT:    v_ldexp_f32_e32 v0, v0, v1
 ; SI-GISEL-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SI-GISEL-NEXT:    s_endpgm
 ;
@@ -78,9 +78,9 @@ define amdgpu_kernel void @s_exp2_f32(ptr addrspace(1) %out, float %in) {
 ; VI-GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v1, vcc
 ; VI-GISEL-NEXT:    v_add_f32_e32 v0, s2, v0
 ; VI-GISEL-NEXT:    v_exp_f32_e32 v0, v0
-; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x1f800000
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v1, vcc
-; VI-GISEL-NEXT:    v_mul_f32_e32 v2, v0, v1
+; VI-GISEL-NEXT:    v_not_b32_e32 v1, 63
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; VI-GISEL-NEXT:    v_ldexp_f32 v2, v0, v1
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-GISEL-NEXT:    flat_store_dword v[0:1], v2
@@ -115,9 +115,9 @@ define amdgpu_kernel void @s_exp2_f32(ptr addrspace(1) %out, float %in) {
 ; GFX900-GISEL-NEXT:    v_add_f32_e32 v0, s0, v0
 ; GFX900-GISEL-NEXT:    v_exp_f32_e32 v0, v0
 ; GFX900-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x1f800000
-; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v1, vcc
-; GFX900-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX900-GISEL-NEXT:    v_not_b32_e32 v1, 63
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX900-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX900-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
@@ -203,7 +203,7 @@ define amdgpu_kernel void @s_exp2_v2f32(ptr addrspace(1) %out, <2 x float> %in)
 ; SI-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x9
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v0, 0xc2fc0000
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x42800000
-; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x1f800000
+; SI-GISEL-NEXT:    v_not_b32_e32 v2, 63
 ; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s6, v0
 ; SI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], s7, v0
@@ -213,10 +213,10 @@ define amdgpu_kernel void @s_exp2_v2f32(ptr addrspace(1) %out, <2 x float> %in)
 ; SI-GISEL-NEXT:    v_add_f32_e32 v0, s7, v0
 ; SI-GISEL-NEXT:    v_exp_f32_e32 v3, v3
 ; SI-GISEL-NEXT:    v_exp_f32_e32 v1, v0
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, 1.0, v2, vcc
-; SI-GISEL-NEXT:    v_cndmask_b32_e64 v2, 1.0, v2, s[0:1]
-; SI-GISEL-NEXT:    v_mul_f32_e32 v0, v3, v0
-; SI-GISEL-NEXT:    v_mul_f32_e32 v1, v1, v2
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v2, vcc
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, v2, s[0:1]
+; SI-GISEL-NEXT:    v_ldexp_f32_e32 v0, v3, v0
+; SI-GISEL-NEXT:    v_ldexp_f32_e32 v1, v1, v2
 ; SI-GISEL-NEXT:    s_mov_b32 s6, -1
 ; SI-GISEL-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-GISEL-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
@@ -252,7 +252,7 @@ define amdgpu_kernel void @s_exp2_v2f32(ptr addrspace(1) %out, <2 x float> %in)
 ; VI-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x24
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v0, 0xc2fc0000
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x42800000
-; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x1f800000
+; VI-GISEL-NEXT:    v_not_b32_e32 v2, 63
 ; VI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s6, v0
 ; VI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], s7, v0
@@ -262,10 +262,10 @@ define amdgpu_kernel void @s_exp2_v2f32(ptr addrspace(1) %out, <2 x float> %in)
 ; VI-GISEL-NEXT:    v_add_f32_e32 v0, s7, v0
 ; VI-GISEL-NEXT:    v_exp_f32_e32 v3, v3
 ; VI-GISEL-NEXT:    v_exp_f32_e32 v1, v0
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v0, 1.0, v2, vcc
-; VI-GISEL-NEXT:    v_cndmask_b32_e64 v2, 1.0, v2, s[0:1]
-; VI-GISEL-NEXT:    v_mul_f32_e32 v0, v3, v0
-; VI-GISEL-NEXT:    v_mul_f32_e32 v1, v1, v2
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v2, vcc
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, v2, s[0:1]
+; VI-GISEL-NEXT:    v_ldexp_f32 v0, v3, v0
+; VI-GISEL-NEXT:    v_ldexp_f32 v1, v1, v2
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v2, s4
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v3, s5
 ; VI-GISEL-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
@@ -300,7 +300,7 @@ define amdgpu_kernel void @s_exp2_v2f32(ptr addrspace(1) %out, <2 x float> %in)
 ; GFX900-GISEL-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x24
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v0, 0xc2fc0000
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x42800000
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0x1f800000
+; GFX900-GISEL-NEXT:    v_not_b32_e32 v2, 63
 ; GFX900-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s10, v0
 ; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], s11, v0
@@ -310,10 +310,10 @@ define amdgpu_kernel void @s_exp2_v2f32(ptr addrspace(1) %out, <2 x float> %in)
 ; GFX900-GISEL-NEXT:    v_add_f32_e32 v0, s11, v0
 ; GFX900-GISEL-NEXT:    v_exp_f32_e32 v3, v3
 ; GFX900-GISEL-NEXT:    v_exp_f32_e32 v1, v0
-; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v0, 1.0, v2, vcc
-; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v2, 1.0, v2, s[0:1]
-; GFX900-GISEL-NEXT:    v_mul_f32_e32 v0, v3, v0
-; GFX900-GISEL-NEXT:    v_mul_f32_e32 v1, v1, v2
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v2, vcc
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, v2, s[0:1]
+; GFX900-GISEL-NEXT:    v_ldexp_f32 v0, v3, v0
+; GFX900-GISEL-NEXT:    v_ldexp_f32 v1, v1, v2
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX900-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[8:9]
 ; GFX900-GISEL-NEXT:    s_endpgm
@@ -421,17 +421,17 @@ define amdgpu_kernel void @s_exp2_v3f32(ptr addrspace(1) %out, <3 x float> %in)
 ; SI-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0xd
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0xc2fc0000
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x42800000
-; SI-GISEL-NEXT:    v_mov_b32_e32 v3, 0x1f800000
+; SI-GISEL-NEXT:    v_not_b32_e32 v3, 63
 ; SI-GISEL-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x9
 ; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s0, v1
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v2, vcc
 ; SI-GISEL-NEXT:    v_add_f32_e32 v0, s0, v0
 ; SI-GISEL-NEXT:    v_exp_f32_e32 v0, v0
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 1.0, v3, vcc
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v3, vcc
 ; SI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s1, v1
 ; SI-GISEL-NEXT:    s_mov_b32 s6, -1
-; SI-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v4
+; SI-GISEL-NEXT:    v_ldexp_f32_e32 v0, v0, v4
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v2, vcc
 ; SI-GISEL-NEXT:    v_add_f32_e32 v4, s1, v4
 ; SI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], s2, v1
@@ -439,11 +439,11 @@ define amdgpu_kernel void @s_exp2_v3f32(ptr addrspace(1) %out, <3 x float> %in)
 ; SI-GISEL-NEXT:    v_exp_f32_e32 v4, v4
 ; SI-GISEL-NEXT:    v_add_f32_e32 v1, s2, v1
 ; SI-GISEL-NEXT:    v_exp_f32_e32 v2, v1
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v3, vcc
-; SI-GISEL-NEXT:    v_mul_f32_e32 v1, v4, v1
-; SI-GISEL-NEXT:    v_cndmask_b32_e64 v3, 1.0, v3, s[0:1]
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v3, vcc
+; SI-GISEL-NEXT:    v_ldexp_f32_e32 v1, v4, v1
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v3, 0, v3, s[0:1]
 ; SI-GISEL-NEXT:    s_mov_b32 s7, 0xf000
-; SI-GISEL-NEXT:    v_mul_f32_e32 v2, v2, v3
+; SI-GISEL-NEXT:    v_ldexp_f32_e32 v2, v2, v3
 ; SI-GISEL-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; SI-GISEL-NEXT:    buffer_store_dword v2, off, s[4:7], 0 offset:8
 ; SI-GISEL-NEXT:    s_endpgm
@@ -487,16 +487,16 @@ define amdgpu_kernel void @s_exp2_v3f32(ptr addrspace(1) %out, <3 x float> %in)
 ; VI-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x34
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0xc2fc0000
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x42800000
-; VI-GISEL-NEXT:    v_mov_b32_e32 v3, 0x1f800000
+; VI-GISEL-NEXT:    v_not_b32_e32 v3, 63
 ; VI-GISEL-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x24
 ; VI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s0, v1
 ; VI-GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v2, vcc
 ; VI-GISEL-NEXT:    v_add_f32_e32 v0, s0, v0
 ; VI-GISEL-NEXT:    v_exp_f32_e32 v0, v0
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 1.0, v3, vcc
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v3, vcc
 ; VI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s1, v1
-; VI-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v4
+; VI-GISEL-NEXT:    v_ldexp_f32 v0, v0, v4
 ; VI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v2, vcc
 ; VI-GISEL-NEXT:    v_add_f32_e32 v4, s1, v4
 ; VI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], s2, v1
@@ -504,10 +504,10 @@ define amdgpu_kernel void @s_exp2_v3f32(ptr addrspace(1) %out, <3 x float> %in)
 ; VI-GISEL-NEXT:    v_add_f32_e32 v1, s2, v1
 ; VI-GISEL-NEXT:    v_exp_f32_e32 v4, v4
 ; VI-GISEL-NEXT:    v_exp_f32_e32 v2, v1
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v3, vcc
-; VI-GISEL-NEXT:    v_cndmask_b32_e64 v3, 1.0, v3, s[0:1]
-; VI-GISEL-NEXT:    v_mul_f32_e32 v1, v4, v1
-; VI-GISEL-NEXT:    v_mul_f32_e32 v2, v2, v3
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v3, vcc
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v3, 0, v3, s[0:1]
+; VI-GISEL-NEXT:    v_ldexp_f32 v1, v4, v1
+; VI-GISEL-NEXT:    v_ldexp_f32 v2, v2, v3
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v3, s4
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v4, s5
 ; VI-GISEL-NEXT:    flat_store_dwordx3 v[3:4], v[0:2]
@@ -551,15 +551,15 @@ define amdgpu_kernel void @s_exp2_v3f32(ptr addrspace(1) %out, <3 x float> %in)
 ; GFX900-GISEL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0xc2fc0000
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0x42800000
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v3, 0x1f800000
+; GFX900-GISEL-NEXT:    v_not_b32_e32 v3, 63
 ; GFX900-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s0, v1
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v2, vcc
 ; GFX900-GISEL-NEXT:    v_add_f32_e32 v0, s0, v0
 ; GFX900-GISEL-NEXT:    v_exp_f32_e32 v0, v0
-; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v4, 1.0, v3, vcc
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v3, vcc
 ; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s1, v1
-; GFX900-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v4
+; GFX900-GISEL-NEXT:    v_ldexp_f32 v0, v0, v4
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v2, vcc
 ; GFX900-GISEL-NEXT:    v_add_f32_e32 v4, s1, v4
 ; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], s2, v1
@@ -567,10 +567,10 @@ define amdgpu_kernel void @s_exp2_v3f32(ptr addrspace(1) %out, <3 x float> %in)
 ; GFX900-GISEL-NEXT:    v_add_f32_e32 v1, s2, v1
 ; GFX900-GISEL-NEXT:    v_exp_f32_e32 v4, v4
 ; GFX900-GISEL-NEXT:    v_exp_f32_e32 v2, v1
-; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v3, vcc
-; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v3, 1.0, v3, s[0:1]
-; GFX900-GISEL-NEXT:    v_mul_f32_e32 v1, v4, v1
-; GFX900-GISEL-NEXT:    v_mul_f32_e32 v2, v2, v3
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v3, vcc
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v3, 0, v3, s[0:1]
+; GFX900-GISEL-NEXT:    v_ldexp_f32 v1, v4, v1
+; GFX900-GISEL-NEXT:    v_ldexp_f32 v2, v2, v3
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX900-GISEL-NEXT:    global_store_dwordx3 v3, v[0:2], s[6:7]
 ; GFX900-GISEL-NEXT:    s_endpgm
@@ -710,7 +710,7 @@ define amdgpu_kernel void @s_exp2_v4f32(ptr addrspace(1) %out, <4 x float> %in)
 ; SI-GISEL-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x9
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0xc2fc0000
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v3, 0x42800000
-; SI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x1f800000
+; SI-GISEL-NEXT:    v_not_b32_e32 v4, 63
 ; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s8, v2
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
@@ -720,22 +720,22 @@ define amdgpu_kernel void @s_exp2_v4f32(ptr addrspace(1) %out, <4 x float> %in)
 ; SI-GISEL-NEXT:    v_exp_f32_e32 v0, v0
 ; SI-GISEL-NEXT:    v_add_f32_e32 v1, s9, v1
 ; SI-GISEL-NEXT:    v_exp_f32_e32 v1, v1
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v5, 1.0, v4, vcc
-; SI-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v5
-; SI-GISEL-NEXT:    v_cndmask_b32_e64 v5, 1.0, v4, s[0:1]
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v5, 0, v4, vcc
+; SI-GISEL-NEXT:    v_ldexp_f32_e32 v0, v0, v5
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, v4, s[0:1]
 ; SI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s10, v2
 ; SI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], s11, v2
-; SI-GISEL-NEXT:    v_mul_f32_e32 v1, v1, v5
+; SI-GISEL-NEXT:    v_ldexp_f32_e32 v1, v1, v5
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v5, 0, v3, vcc
 ; SI-GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, v3, s[0:1]
 ; SI-GISEL-NEXT:    v_add_f32_e32 v5, s10, v5
 ; SI-GISEL-NEXT:    v_add_f32_e32 v2, s11, v2
 ; SI-GISEL-NEXT:    v_exp_f32_e32 v5, v5
 ; SI-GISEL-NEXT:    v_exp_f32_e32 v3, v2
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 1.0, v4, vcc
-; SI-GISEL-NEXT:    v_cndmask_b32_e64 v4, 1.0, v4, s[0:1]
-; SI-GISEL-NEXT:    v_mul_f32_e32 v2, v5, v2
-; SI-GISEL-NEXT:    v_mul_f32_e32 v3, v3, v4
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v4, vcc
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v4, 0, v4, s[0:1]
+; SI-GISEL-NEXT:    v_ldexp_f32_e32 v2, v5, v2
+; SI-GISEL-NEXT:    v_ldexp_f32_e32 v3, v3, v4
 ; SI-GISEL-NEXT:    s_mov_b32 s6, -1
 ; SI-GISEL-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-GISEL-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
@@ -787,7 +787,7 @@ define amdgpu_kernel void @s_exp2_v4f32(ptr addrspace(1) %out, <4 x float> %in)
 ; VI-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0xc2fc0000
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v3, 0x42800000
-; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x1f800000
+; VI-GISEL-NEXT:    v_not_b32_e32 v4, 63
 ; VI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s8, v2
 ; VI-GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
@@ -797,22 +797,22 @@ define amdgpu_kernel void @s_exp2_v4f32(ptr addrspace(1) %out, <4 x float> %in)
 ; VI-GISEL-NEXT:    v_exp_f32_e32 v0, v0
 ; VI-GISEL-NEXT:    v_add_f32_e32 v1, s9, v1
 ; VI-GISEL-NEXT:    v_exp_f32_e32 v1, v1
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v5, 1.0, v4, vcc
-; VI-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v5
-; VI-GISEL-NEXT:    v_cndmask_b32_e64 v5, 1.0, v4, s[0:1]
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v5, 0, v4, vcc
+; VI-GISEL-NEXT:    v_ldexp_f32 v0, v0, v5
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, v4, s[0:1]
 ; VI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s10, v2
 ; VI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], s11, v2
-; VI-GISEL-NEXT:    v_mul_f32_e32 v1, v1, v5
+; VI-GISEL-NEXT:    v_ldexp_f32 v1, v1, v5
 ; VI-GISEL-NEXT:    v_cndmask_b32_e32 v5, 0, v3, vcc
 ; VI-GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, v3, s[0:1]
 ; VI-GISEL-NEXT:    v_add_f32_e32 v5, s10, v5
 ; VI-GISEL-NEXT:    v_add_f32_e32 v2, s11, v2
 ; VI-GISEL-NEXT:    v_exp_f32_e32 v5, v5
 ; VI-GISEL-NEXT:    v_exp_f32_e32 v3, v2
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 1.0, v4, vcc
-; VI-GISEL-NEXT:    v_cndmask_b32_e64 v4, 1.0, v4, s[0:1]
-; VI-GISEL-NEXT:    v_mul_f32_e32 v2, v5, v2
-; VI-GISEL-NEXT:    v_mul_f32_e32 v3, v3, v4
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v4, vcc
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v4, 0, v4, s[0:1]
+; VI-GISEL-NEXT:    v_ldexp_f32 v2, v5, v2
+; VI-GISEL-NEXT:    v_ldexp_f32 v3, v3, v4
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v5, s3
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v4, s2
 ; VI-GISEL-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
@@ -863,7 +863,7 @@ define amdgpu_kernel void @s_exp2_v4f32(ptr addrspace(1) %out, <4 x float> %in)
 ; GFX900-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0xc2fc0000
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v3, 0x42800000
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v4, 0x1f800000
+; GFX900-GISEL-NEXT:    v_not_b32_e32 v4, 63
 ; GFX900-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s8, v2
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
@@ -873,22 +873,22 @@ define amdgpu_kernel void @s_exp2_v4f32(ptr addrspace(1) %out, <4 x float> %in)
 ; GFX900-GISEL-NEXT:    v_exp_f32_e32 v0, v0
 ; GFX900-GISEL-NEXT:    v_add_f32_e32 v1, s9, v1
 ; GFX900-GISEL-NEXT:    v_exp_f32_e32 v1, v1
-; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v5, 1.0, v4, vcc
-; GFX900-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v5
-; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v5, 1.0, v4, s[0:1]
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v5, 0, v4, vcc
+; GFX900-GISEL-NEXT:    v_ldexp_f32 v0, v0, v5
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, v4, s[0:1]
 ; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s10, v2
 ; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], s11, v2
-; GFX900-GISEL-NEXT:    v_mul_f32_e32 v1, v1, v5
+; GFX900-GISEL-NEXT:    v_ldexp_f32 v1, v1, v5
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v5, 0, v3, vcc
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, v3, s[0:1]
 ; GFX900-GISEL-NEXT:    v_add_f32_e32 v5, s10, v5
 ; GFX900-GISEL-NEXT:    v_add_f32_e32 v2, s11, v2
 ; GFX900-GISEL-NEXT:    v_exp_f32_e32 v5, v5
 ; GFX900-GISEL-NEXT:    v_exp_f32_e32 v3, v2
-; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v2, 1.0, v4, vcc
-; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v4, 1.0, v4, s[0:1]
-; GFX900-GISEL-NEXT:    v_mul_f32_e32 v2, v5, v2
-; GFX900-GISEL-NEXT:    v_mul_f32_e32 v3, v3, v4
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v4, vcc
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v4, 0, v4, s[0:1]
+; GFX900-GISEL-NEXT:    v_ldexp_f32 v2, v5, v2
+; GFX900-GISEL-NEXT:    v_ldexp_f32 v3, v3, v4
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX900-GISEL-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3]
 ; GFX900-GISEL-NEXT:    s_endpgm
@@ -1006,19 +1006,19 @@ define float @v_exp2_f32(float %in) {
 ; SI-SDAG-NEXT:    v_ldexp_f32_e32 v0, v0, v1
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GCN-GISEL-LABEL: v_exp2_f32:
-; GCN-GISEL:       ; %bb.0:
-; GCN-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-GISEL-NEXT:    v_mov_b32_e32 v1, 0xc2fc0000
-; GCN-GISEL-NEXT:    v_mov_b32_e32 v2, 0x42800000
-; GCN-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
-; GCN-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
-; GCN-GISEL-NEXT:    v_add_f32_e32 v0, v0, v1
-; GCN-GISEL-NEXT:    v_exp_f32_e32 v0, v0
-; GCN-GISEL-NEXT:    v_mov_b32_e32 v1, 0x1f800000
-; GCN-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v1, vcc
-; GCN-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GCN-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; SI-GISEL-LABEL: v_exp2_f32:
+; SI-GISEL:       ; %bb.0:
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0xc2fc0000
+; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x42800000
+; SI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
+; SI-GISEL-NEXT:    v_add_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT:    v_exp_f32_e32 v0, v0
+; SI-GISEL-NEXT:    v_not_b32_e32 v1, 63
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; SI-GISEL-NEXT:    v_ldexp_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SDAG-LABEL: v_exp2_f32:
 ; VI-SDAG:       ; %bb.0:
@@ -1034,6 +1034,20 @@ define float @v_exp2_f32(float %in) {
 ; VI-SDAG-NEXT:    v_ldexp_f32 v0, v0, v1
 ; VI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
+; VI-GISEL-LABEL: v_exp2_f32:
+; VI-GISEL:       ; %bb.0:
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0xc2fc0000
+; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x42800000
+; VI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
+; VI-GISEL-NEXT:    v_add_f32_e32 v0, v0, v1
+; VI-GISEL-NEXT:    v_exp_f32_e32 v0, v0
+; VI-GISEL-NEXT:    v_not_b32_e32 v1, 63
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; VI-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
+; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX900-SDAG-LABEL: v_exp2_f32:
 ; GFX900-SDAG:       ; %bb.0:
 ; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1048,6 +1062,20 @@ define float @v_exp2_f32(float %in) {
 ; GFX900-SDAG-NEXT:    v_ldexp_f32 v0, v0, v1
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX900-GISEL-LABEL: v_exp2_f32:
+; GFX900-GISEL:       ; %bb.0:
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0xc2fc0000
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0x42800000
+; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
+; GFX900-GISEL-NEXT:    v_add_f32_e32 v0, v0, v1
+; GFX900-GISEL-NEXT:    v_exp_f32_e32 v0, v0
+; GFX900-GISEL-NEXT:    v_not_b32_e32 v1, 63
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX900-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
+; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; R600-LABEL: v_exp2_f32:
 ; R600:       ; %bb.0:
 ; R600-NEXT:    CF_END
@@ -1076,19 +1104,19 @@ define float @v_exp2_fabs_f32(float %in) {
 ; SI-SDAG-NEXT:    v_ldexp_f32_e32 v0, v0, v1
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GCN-GISEL-LABEL: v_exp2_fabs_f32:
-; GCN-GISEL:       ; %bb.0:
-; GCN-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-GISEL-NEXT:    v_mov_b32_e32 v1, 0xc2fc0000
-; GCN-GISEL-NEXT:    v_mov_b32_e32 v2, 0x42800000
-; GCN-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, v1
-; GCN-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
-; GCN-GISEL-NEXT:    v_add_f32_e64 v0, |v0|, v1
-; GCN-GISEL-NEXT:    v_exp_f32_e32 v0, v0
-; GCN-GISEL-NEXT:    v_mov_b32_e32 v1, 0x1f800000
-; GCN-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v1, vcc
-; GCN-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GCN-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; SI-GISEL-LABEL: v_exp2_fabs_f32:
+; SI-GISEL:       ; %bb.0:
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0xc2fc0000
+; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x42800000
+; SI-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, v1
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
+; SI-GISEL-NEXT:    v_add_f32_e64 v0, |v0|, v1
+; SI-GISEL-NEXT:    v_exp_f32_e32 v0, v0
+; SI-GISEL-NEXT:    v_not_b32_e32 v1, 63
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; SI-GISEL-NEXT:    v_ldexp_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SDAG-LABEL: v_exp2_fabs_f32:
 ; VI-SDAG:       ; %bb.0:
@@ -1104,6 +1132,20 @@ define float @v_exp2_fabs_f32(float %in) {
 ; VI-SDAG-NEXT:    v_ldexp_f32 v0, v0, v1
 ; VI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
+; VI-GISEL-LABEL: v_exp2_fabs_f32:
+; VI-GISEL:       ; %bb.0:
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0xc2fc0000
+; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x42800000
+; VI-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, v1
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
+; VI-GISEL-NEXT:    v_add_f32_e64 v0, |v0|, v1
+; VI-GISEL-NEXT:    v_exp_f32_e32 v0, v0
+; VI-GISEL-NEXT:    v_not_b32_e32 v1, 63
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; VI-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
+; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX900-SDAG-LABEL: v_exp2_fabs_f32:
 ; GFX900-SDAG:       ; %bb.0:
 ; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1118,6 +1160,20 @@ define float @v_exp2_fabs_f32(float %in) {
 ; GFX900-SDAG-NEXT:    v_ldexp_f32 v0, v0, v1
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX900-GISEL-LABEL: v_exp2_fabs_f32:
+; GFX900-GISEL:       ; %bb.0:
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0xc2fc0000
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0x42800000
+; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, v1
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
+; GFX900-GISEL-NEXT:    v_add_f32_e64 v0, |v0|, v1
+; GFX900-GISEL-NEXT:    v_exp_f32_e32 v0, v0
+; GFX900-GISEL-NEXT:    v_not_b32_e32 v1, 63
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX900-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
+; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; R600-LABEL: v_exp2_fabs_f32:
 ; R600:       ; %bb.0:
 ; R600-NEXT:    CF_END
@@ -1147,19 +1203,19 @@ define float @v_exp2_fneg_fabs_f32(float %in) {
 ; SI-SDAG-NEXT:    v_ldexp_f32_e32 v0, v0, v1
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GCN-GISEL-LABEL: v_exp2_fneg_fabs_f32:
-; GCN-GISEL:       ; %bb.0:
-; GCN-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-GISEL-NEXT:    v_mov_b32_e32 v1, 0xc2fc0000
-; GCN-GISEL-NEXT:    v_mov_b32_e32 v2, 0x42800000
-; GCN-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, -|v0|, v1
-; GCN-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
-; GCN-GISEL-NEXT:    v_sub_f32_e64 v0, v1, |v0|
-; GCN-GISEL-NEXT:    v_exp_f32_e32 v0, v0
-; GCN-GISEL-NEXT:    v_mov_b32_e32 v1, 0x1f800000
-; GCN-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v1, vcc
-; GCN-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GCN-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; SI-GISEL-LABEL: v_exp2_fneg_fabs_f32:
+; SI-GISEL:       ; %bb.0:
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0xc2fc0000
+; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x42800000
+; SI-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, -|v0|, v1
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
+; SI-GISEL-NEXT:    v_sub_f32_e64 v0, v1, |v0|
+; SI-GISEL-NEXT:    v_exp_f32_e32 v0, v0
+; SI-GISEL-NEXT:    v_not_b32_e32 v1, 63
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; SI-GISEL-NEXT:    v_ldexp_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SDAG-LABEL: v_exp2_fneg_fabs_f32:
 ; VI-SDAG:       ; %bb.0:
@@ -1175,6 +1231,20 @@ define float @v_exp2_fneg_fabs_f32(float %in) {
 ; VI-SDAG-NEXT:    v_ldexp_f32 v0, v0, v1
 ; VI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
+; VI-GISEL-LABEL: v_exp2_fneg_fabs_f32:
+; VI-GISEL:       ; %bb.0:
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0xc2fc0000
+; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x42800000
+; VI-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, -|v0|, v1
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
+; VI-GISEL-NEXT:    v_sub_f32_e64 v0, v1, |v0|
+; VI-GISEL-NEXT:    v_exp_f32_e32 v0, v0
+; VI-GISEL-NEXT:    v_not_b32_e32 v1, 63
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; VI-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
+; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX900-SDAG-LABEL: v_exp2_fneg_fabs_f32:
 ; GFX900-SDAG:       ; %bb.0:
 ; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1189,6 +1259,20 @@ define float @v_exp2_fneg_fabs_f32(float %in) {
 ; GFX900-SDAG-NEXT:    v_ldexp_f32 v0, v0, v1
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX900-GISEL-LABEL: v_exp2_fneg_fabs_f32:
+; GFX900-GISEL:       ; %bb.0:
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0xc2fc0000
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0x42800000
+; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, -|v0|, v1
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
+; GFX900-GISEL-NEXT:    v_sub_f32_e64 v0, v1, |v0|
+; GFX900-GISEL-NEXT:    v_exp_f32_e32 v0, v0
+; GFX900-GISEL-NEXT:    v_not_b32_e32 v1, 63
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX900-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
+; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; R600-LABEL: v_exp2_fneg_fabs_f32:
 ; R600:       ; %bb.0:
 ; R600-NEXT:    CF_END
@@ -1219,19 +1303,19 @@ define float @v_exp2_fneg_f32(float %in) {
 ; SI-SDAG-NEXT:    v_ldexp_f32_e32 v0, v0, v1
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GCN-GISEL-LABEL: v_exp2_fneg_f32:
-; GCN-GISEL:       ; %bb.0:
-; GCN-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-GISEL-NEXT:    v_mov_b32_e32 v1, 0xc2fc0000
-; GCN-GISEL-NEXT:    v_mov_b32_e32 v2, 0x42800000
-; GCN-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, -v0, v1
-; GCN-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
-; GCN-GISEL-NEXT:    v_sub_f32_e32 v0, v1, v0
-; GCN-GISEL-NEXT:    v_exp_f32_e32 v0, v0
-; GCN-GISEL-NEXT:    v_mov_b32_e32 v1, 0x1f800000
-; GCN-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v1, vcc
-; GCN-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GCN-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; SI-GISEL-LABEL: v_exp2_fneg_f32:
+; SI-GISEL:       ; %bb.0:
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0xc2fc0000
+; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x42800000
+; SI-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, -v0, v1
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
+; SI-GISEL-NEXT:    v_sub_f32_e32 v0, v1, v0
+; SI-GISEL-NEXT:    v_exp_f32_e32 v0, v0
+; SI-GISEL-NEXT:    v_not_b32_e32 v1, 63
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; SI-GISEL-NEXT:    v_ldexp_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SDAG-LABEL: v_exp2_fneg_f32:
 ; VI-SDAG:       ; %bb.0:
@@ -1247,6 +1331,20 @@ define float @v_exp2_fneg_f32(float %in) {
 ; VI-SDAG-NEXT:    v_ldexp_f32 v0, v0, v1
 ; VI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
+; VI-GISEL-LABEL: v_exp2_fneg_f32:
+; VI-GISEL:       ; %bb.0:
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0xc2fc0000
+; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x42800000
+; VI-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, -v0, v1
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
+; VI-GISEL-NEXT:    v_sub_f32_e32 v0, v1, v0
+; VI-GISEL-NEXT:    v_exp_f32_e32 v0, v0
+; VI-GISEL-NEXT:    v_not_b32_e32 v1, 63
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; VI-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
+; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX900-SDAG-LABEL: v_exp2_fneg_f32:
 ; GFX900-SDAG:       ; %bb.0:
 ; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1261,6 +1359,20 @@ define float @v_exp2_fneg_f32(float %in) {
 ; GFX900-SDAG-NEXT:    v_ldexp_f32 v0, v0, v1
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX900-GISEL-LABEL: v_exp2_fneg_f32:
+; GFX900-GISEL:       ; %bb.0:
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0xc2fc0000
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0x42800000
+; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, -v0, v1
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
+; GFX900-GISEL-NEXT:    v_sub_f32_e32 v0, v1, v0
+; GFX900-GISEL-NEXT:    v_exp_f32_e32 v0, v0
+; GFX900-GISEL-NEXT:    v_not_b32_e32 v1, 63
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX900-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
+; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; R600-LABEL: v_exp2_fneg_f32:
 ; R600:       ; %bb.0:
 ; R600-NEXT:    CF_END
@@ -1290,19 +1402,19 @@ define float @v_exp2_f32_fast(float %in) {
 ; SI-SDAG-NEXT:    v_ldexp_f32_e32 v0, v0, v1
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GCN-GISEL-LABEL: v_exp2_f32_fast:
-; GCN-GISEL:       ; %bb.0:
-; GCN-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-GISEL-NEXT:    v_mov_b32_e32 v1, 0xc2fc0000
-; GCN-GISEL-NEXT:    v_mov_b32_e32 v2, 0x42800000
-; GCN-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
-; GCN-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
-; GCN-GISEL-NEXT:    v_add_f32_e32 v0, v0, v1
-; GCN-GISEL-NEXT:    v_exp_f32_e32 v0, v0
-; GCN-GISEL-NEXT:    v_mov_b32_e32 v1, 0x1f800000
-; GCN-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v1, vcc
-; GCN-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GCN-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; SI-GISEL-LABEL: v_exp2_f32_fast:
+; SI-GISEL:       ; %bb.0:
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0xc2fc0000
+; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x42800000
+; SI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
+; SI-GISEL-NEXT:    v_add_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT:    v_exp_f32_e32 v0, v0
+; SI-GISEL-NEXT:    v_not_b32_e32 v1, 63
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; SI-GISEL-NEXT:    v_ldexp_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SDAG-LABEL: v_exp2_f32_fast:
 ; VI-SDAG:       ; %bb.0:
@@ -1318,6 +1430,20 @@ define float @v_exp2_f32_fast(float %in) {
 ; VI-SDAG-NEXT:    v_ldexp_f32 v0, v0, v1
 ; VI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
+; VI-GISEL-LABEL: v_exp2_f32_fast:
+; VI-GISEL:       ; %bb.0:
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0xc2fc0000
+; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x42800000
+; VI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
+; VI-GISEL-NEXT:    v_add_f32_e32 v0, v0, v1
+; VI-GISEL-NEXT:    v_exp_f32_e32 v0, v0
+; VI-GISEL-NEXT:    v_not_b32_e32 v1, 63
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; VI-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
+; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX900-SDAG-LABEL: v_exp2_f32_fast:
 ; GFX900-SDAG:       ; %bb.0:
 ; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1332,6 +1458,20 @@ define float @v_exp2_f32_fast(float %in) {
 ; GFX900-SDAG-NEXT:    v_ldexp_f32 v0, v0, v1
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX900-GISEL-LABEL: v_exp2_f32_fast:
+; GFX900-GISEL:       ; %bb.0:
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0xc2fc0000
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0x42800000
+; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
+; GFX900-GISEL-NEXT:    v_add_f32_e32 v0, v0, v1
+; GFX900-GISEL-NEXT:    v_exp_f32_e32 v0, v0
+; GFX900-GISEL-NEXT:    v_not_b32_e32 v1, 63
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX900-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
+; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; R600-LABEL: v_exp2_f32_fast:
 ; R600:       ; %bb.0:
 ; R600-NEXT:    CF_END
@@ -1360,19 +1500,19 @@ define float @v_exp2_f32_unsafe_math_attr(float %in) "unsafe-fp-math"="true" {
 ; SI-SDAG-NEXT:    v_ldexp_f32_e32 v0, v0, v1
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GCN-GISEL-LABEL: v_exp2_f32_unsafe_math_attr:
-; GCN-GISEL:       ; %bb.0:
-; GCN-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-GISEL-NEXT:    v_mov_b32_e32 v1, 0xc2fc0000
-; GCN-GISEL-NEXT:    v_mov_b32_e32 v2, 0x42800000
-; GCN-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
-; GCN-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
-; GCN-GISEL-NEXT:    v_add_f32_e32 v0, v0, v1
-; GCN-GISEL-NEXT:    v_exp_f32_e32 v0, v0
-; GCN-GISEL-NEXT:    v_mov_b32_e32 v1, 0x1f800000
-; GCN-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v1, vcc
-; GCN-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GCN-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; SI-GISEL-LABEL: v_exp2_f32_unsafe_math_attr:
+; SI-GISEL:       ; %bb.0:
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0xc2fc0000
+; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x42800000
+; SI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
+; SI-GISEL-NEXT:    v_add_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT:    v_exp_f32_e32 v0, v0
+; SI-GISEL-NEXT:    v_not_b32_e32 v1, 63
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; SI-GISEL-NEXT:    v_ldexp_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SDAG-LABEL: v_exp2_f32_unsafe_math_attr:
 ; VI-SDAG:       ; %bb.0:
@@ -1388,6 +1528,20 @@ define float @v_exp2_f32_unsafe_math_attr(float %in) "unsafe-fp-math"="true" {
 ; VI-SDAG-NEXT:    v_ldexp_f32 v0, v0, v1
 ; VI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
+; VI-GISEL-LABEL: v_exp2_f32_unsafe_math_attr:
+; VI-GISEL:       ; %bb.0:
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0xc2fc0000
+; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x42800000
+; VI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
+; VI-GISEL-NEXT:    v_add_f32_e32 v0, v0, v1
+; VI-GISEL-NEXT:    v_exp_f32_e32 v0, v0
+; VI-GISEL-NEXT:    v_not_b32_e32 v1, 63
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; VI-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
+; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX900-SDAG-LABEL: v_exp2_f32_unsafe_math_attr:
 ; GFX900-SDAG:       ; %bb.0:
 ; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1402,6 +1556,20 @@ define float @v_exp2_f32_unsafe_math_attr(float %in) "unsafe-fp-math"="true" {
 ; GFX900-SDAG-NEXT:    v_ldexp_f32 v0, v0, v1
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX900-GISEL-LABEL: v_exp2_f32_unsafe_math_attr:
+; GFX900-GISEL:       ; %bb.0:
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0xc2fc0000
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0x42800000
+; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
+; GFX900-GISEL-NEXT:    v_add_f32_e32 v0, v0, v1
+; GFX900-GISEL-NEXT:    v_exp_f32_e32 v0, v0
+; GFX900-GISEL-NEXT:    v_not_b32_e32 v1, 63
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX900-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
+; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; R600-LABEL: v_exp2_f32_unsafe_math_attr:
 ; R600:       ; %bb.0:
 ; R600-NEXT:    CF_END
@@ -1430,19 +1598,19 @@ define float @v_exp2_f32_approx_fn_attr(float %in) "approx-func-fp-math"="true"
 ; SI-SDAG-NEXT:    v_ldexp_f32_e32 v0, v0, v1
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GCN-GISEL-LABEL: v_exp2_f32_approx_fn_attr:
-; GCN-GISEL:       ; %bb.0:
-; GCN-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-GISEL-NEXT:    v_mov_b32_e32 v1, 0xc2fc0000
-; GCN-GISEL-NEXT:    v_mov_b32_e32 v2, 0x42800000
-; GCN-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
-; GCN-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
-; GCN-GISEL-NEXT:    v_add_f32_e32 v0, v0, v1
-; GCN-GISEL-NEXT:    v_exp_f32_e32 v0, v0
-; GCN-GISEL-NEXT:    v_mov_b32_e32 v1, 0x1f800000
-; GCN-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v1, vcc
-; GCN-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GCN-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; SI-GISEL-LABEL: v_exp2_f32_approx_fn_attr:
+; SI-GISEL:       ; %bb.0:
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0xc2fc0000
+; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x42800000
+; SI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
+; SI-GISEL-NEXT:    v_add_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT:    v_exp_f32_e32 v0, v0
+; SI-GISEL-NEXT:    v_not_b32_e32 v1, 63
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; SI-GISEL-NEXT:    v_ldexp_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SDAG-LABEL: v_exp2_f32_approx_fn_attr:
 ; VI-SDAG:       ; %bb.0:
@@ -1458,6 +1626,20 @@ define float @v_exp2_f32_approx_fn_attr(float %in) "approx-func-fp-math"="true"
 ; VI-SDAG-NEXT:    v_ldexp_f32 v0, v0, v1
 ; VI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
+; VI-GISEL-LABEL: v_exp2_f32_approx_fn_attr:
+; VI-GISEL:       ; %bb.0:
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0xc2fc0000
+; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x42800000
+; VI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
+; VI-GISEL-NEXT:    v_add_f32_e32 v0, v0, v1
+; VI-GISEL-NEXT:    v_exp_f32_e32 v0, v0
+; VI-GISEL-NEXT:    v_not_b32_e32 v1, 63
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; VI-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
+; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX900-SDAG-LABEL: v_exp2_f32_approx_fn_attr:
 ; GFX900-SDAG:       ; %bb.0:
 ; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1472,6 +1654,20 @@ define float @v_exp2_f32_approx_fn_attr(float %in) "approx-func-fp-math"="true"
 ; GFX900-SDAG-NEXT:    v_ldexp_f32 v0, v0, v1
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX900-GISEL-LABEL: v_exp2_f32_approx_fn_attr:
+; GFX900-GISEL:       ; %bb.0:
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0xc2fc0000
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0x42800000
+; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
+; GFX900-GISEL-NEXT:    v_add_f32_e32 v0, v0, v1
+; GFX900-GISEL-NEXT:    v_exp_f32_e32 v0, v0
+; GFX900-GISEL-NEXT:    v_not_b32_e32 v1, 63
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX900-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
+; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; R600-LABEL: v_exp2_f32_approx_fn_attr:
 ; R600:       ; %bb.0:
 ; R600-NEXT:    CF_END
@@ -1500,19 +1696,19 @@ define float @v_exp2_f32_ninf(float %in) {
 ; SI-SDAG-NEXT:    v_ldexp_f32_e32 v0, v0, v1
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GCN-GISEL-LABEL: v_exp2_f32_ninf:
-; GCN-GISEL:       ; %bb.0:
-; GCN-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-GISEL-NEXT:    v_mov_b32_e32 v1, 0xc2fc0000
-; GCN-GISEL-NEXT:    v_mov_b32_e32 v2, 0x42800000
-; GCN-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
-; GCN-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
-; GCN-GISEL-NEXT:    v_add_f32_e32 v0, v0, v1
-; GCN-GISEL-NEXT:    v_exp_f32_e32 v0, v0
-; GCN-GISEL-NEXT:    v_mov_b32_e32 v1, 0x1f800000
-; GCN-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v1, vcc
-; GCN-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GCN-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; SI-GISEL-LABEL: v_exp2_f32_ninf:
+; SI-GISEL:       ; %bb.0:
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0xc2fc0000
+; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x42800000
+; SI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
+; SI-GISEL-NEXT:    v_add_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT:    v_exp_f32_e32 v0, v0
+; SI-GISEL-NEXT:    v_not_b32_e32 v1, 63
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; SI-GISEL-NEXT:    v_ldexp_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SDAG-LABEL: v_exp2_f32_ninf:
 ; VI-SDAG:       ; %bb.0:
@@ -1528,6 +1724,20 @@ define float @v_exp2_f32_ninf(float %in) {
 ; VI-SDAG-NEXT:    v_ldexp_f32 v0, v0, v1
 ; VI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
+; VI-GISEL-LABEL: v_exp2_f32_ninf:
+; VI-GISEL:       ; %bb.0:
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0xc2fc0000
+; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x42800000
+; VI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
+; VI-GISEL-NEXT:    v_add_f32_e32 v0, v0, v1
+; VI-GISEL-NEXT:    v_exp_f32_e32 v0, v0
+; VI-GISEL-NEXT:    v_not_b32_e32 v1, 63
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; VI-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
+; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX900-SDAG-LABEL: v_exp2_f32_ninf:
 ; GFX900-SDAG:       ; %bb.0:
 ; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1542,6 +1752,20 @@ define float @v_exp2_f32_ninf(float %in) {
 ; GFX900-SDAG-NEXT:    v_ldexp_f32 v0, v0, v1
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX900-GISEL-LABEL: v_exp2_f32_ninf:
+; GFX900-GISEL:       ; %bb.0:
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0xc2fc0000
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0x42800000
+; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
+; GFX900-GISEL-NEXT:    v_add_f32_e32 v0, v0, v1
+; GFX900-GISEL-NEXT:    v_exp_f32_e32 v0, v0
+; GFX900-GISEL-NEXT:    v_not_b32_e32 v1, 63
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX900-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
+; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; R600-LABEL: v_exp2_f32_ninf:
 ; R600:       ; %bb.0:
 ; R600-NEXT:    CF_END
@@ -1570,19 +1794,19 @@ define float @v_exp2_f32_afn(float %in) {
 ; SI-SDAG-NEXT:    v_ldexp_f32_e32 v0, v0, v1
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GCN-GISEL-LABEL: v_exp2_f32_afn:
-; GCN-GISEL:       ; %bb.0:
-; GCN-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-GISEL-NEXT:    v_mov_b32_e32 v1, 0xc2fc0000
-; GCN-GISEL-NEXT:    v_mov_b32_e32 v2, 0x42800000
-; GCN-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
-; GCN-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
-; GCN-GISEL-NEXT:    v_add_f32_e32 v0, v0, v1
-; GCN-GISEL-NEXT:    v_exp_f32_e32 v0, v0
-; GCN-GISEL-NEXT:    v_mov_b32_e32 v1, 0x1f800000
-; GCN-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v1, vcc
-; GCN-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GCN-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; SI-GISEL-LABEL: v_exp2_f32_afn:
+; SI-GISEL:       ; %bb.0:
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0xc2fc0000
+; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x42800000
+; SI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
+; SI-GISEL-NEXT:    v_add_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT:    v_exp_f32_e32 v0, v0
+; SI-GISEL-NEXT:    v_not_b32_e32 v1, 63
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; SI-GISEL-NEXT:    v_ldexp_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SDAG-LABEL: v_exp2_f32_afn:
 ; VI-SDAG:       ; %bb.0:
@@ -1598,6 +1822,20 @@ define float @v_exp2_f32_afn(float %in) {
 ; VI-SDAG-NEXT:    v_ldexp_f32 v0, v0, v1
 ; VI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
+; VI-GISEL-LABEL: v_exp2_f32_afn:
+; VI-GISEL:       ; %bb.0:
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0xc2fc0000
+; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x42800000
+; VI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
+; VI-GISEL-NEXT:    v_add_f32_e32 v0, v0, v1
+; VI-GISEL-NEXT:    v_exp_f32_e32 v0, v0
+; VI-GISEL-NEXT:    v_not_b32_e32 v1, 63
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; VI-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
+; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX900-SDAG-LABEL: v_exp2_f32_afn:
 ; GFX900-SDAG:       ; %bb.0:
 ; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1612,6 +1850,20 @@ define float @v_exp2_f32_afn(float %in) {
 ; GFX900-SDAG-NEXT:    v_ldexp_f32 v0, v0, v1
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX900-GISEL-LABEL: v_exp2_f32_afn:
+; GFX900-GISEL:       ; %bb.0:
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0xc2fc0000
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0x42800000
+; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
+; GFX900-GISEL-NEXT:    v_add_f32_e32 v0, v0, v1
+; GFX900-GISEL-NEXT:    v_exp_f32_e32 v0, v0
+; GFX900-GISEL-NEXT:    v_not_b32_e32 v1, 63
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX900-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
+; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; R600-LABEL: v_exp2_f32_afn:
 ; R600:       ; %bb.0:
 ; R600-NEXT:    CF_END
@@ -1660,19 +1912,19 @@ define float @v_exp2_f32_afn_dynamic(float %in) #1 {
 ; SI-SDAG-NEXT:    v_ldexp_f32_e32 v0, v0, v1
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GCN-GISEL-LABEL: v_exp2_f32_afn_dynamic:
-; GCN-GISEL:       ; %bb.0:
-; GCN-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-GISEL-NEXT:    v_mov_b32_e32 v1, 0xc2fc0000
-; GCN-GISEL-NEXT:    v_mov_b32_e32 v2, 0x42800000
-; GCN-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
-; GCN-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
-; GCN-GISEL-NEXT:    v_add_f32_e32 v0, v0, v1
-; GCN-GISEL-NEXT:    v_exp_f32_e32 v0, v0
-; GCN-GISEL-NEXT:    v_mov_b32_e32 v1, 0x1f800000
-; GCN-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v1, vcc
-; GCN-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GCN-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; SI-GISEL-LABEL: v_exp2_f32_afn_dynamic:
+; SI-GISEL:       ; %bb.0:
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0xc2fc0000
+; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x42800000
+; SI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
+; SI-GISEL-NEXT:    v_add_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT:    v_exp_f32_e32 v0, v0
+; SI-GISEL-NEXT:    v_not_b32_e32 v1, 63
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; SI-GISEL-NEXT:    v_ldexp_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SDAG-LABEL: v_exp2_f32_afn_dynamic:
 ; VI-SDAG:       ; %bb.0:
@@ -1688,6 +1940,20 @@ define float @v_exp2_f32_afn_dynamic(float %in) #1 {
 ; VI-SDAG-NEXT:    v_ldexp_f32 v0, v0, v1
 ; VI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
+; VI-GISEL-LABEL: v_exp2_f32_afn_dynamic:
+; VI-GISEL:       ; %bb.0:
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0xc2fc0000
+; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x42800000
+; VI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
+; VI-GISEL-NEXT:    v_add_f32_e32 v0, v0, v1
+; VI-GISEL-NEXT:    v_exp_f32_e32 v0, v0
+; VI-GISEL-NEXT:    v_not_b32_e32 v1, 63
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; VI-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
+; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX900-SDAG-LABEL: v_exp2_f32_afn_dynamic:
 ; GFX900-SDAG:       ; %bb.0:
 ; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1702,6 +1968,20 @@ define float @v_exp2_f32_afn_dynamic(float %in) #1 {
 ; GFX900-SDAG-NEXT:    v_ldexp_f32 v0, v0, v1
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX900-GISEL-LABEL: v_exp2_f32_afn_dynamic:
+; GFX900-GISEL:       ; %bb.0:
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0xc2fc0000
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0x42800000
+; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
+; GFX900-GISEL-NEXT:    v_add_f32_e32 v0, v0, v1
+; GFX900-GISEL-NEXT:    v_exp_f32_e32 v0, v0
+; GFX900-GISEL-NEXT:    v_not_b32_e32 v1, 63
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX900-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
+; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; R600-LABEL: v_exp2_f32_afn_dynamic:
 ; R600:       ; %bb.0:
 ; R600-NEXT:    CF_END
@@ -1730,19 +2010,19 @@ define float @v_fabs_exp2_f32_afn(float %in) {
 ; SI-SDAG-NEXT:    v_ldexp_f32_e32 v0, v0, v1
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GCN-GISEL-LABEL: v_fabs_exp2_f32_afn:
-; GCN-GISEL:       ; %bb.0:
-; GCN-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-GISEL-NEXT:    v_mov_b32_e32 v1, 0xc2fc0000
-; GCN-GISEL-NEXT:    v_mov_b32_e32 v2, 0x42800000
-; GCN-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, v1
-; GCN-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
-; GCN-GISEL-NEXT:    v_add_f32_e64 v0, |v0|, v1
-; GCN-GISEL-NEXT:    v_exp_f32_e32 v0, v0
-; GCN-GISEL-NEXT:    v_mov_b32_e32 v1, 0x1f800000
-; GCN-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v1, vcc
-; GCN-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GCN-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; SI-GISEL-LABEL: v_fabs_exp2_f32_afn:
+; SI-GISEL:       ; %bb.0:
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0xc2fc0000
+; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x42800000
+; SI-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, v1
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
+; SI-GISEL-NEXT:    v_add_f32_e64 v0, |v0|, v1
+; SI-GISEL-NEXT:    v_exp_f32_e32 v0, v0
+; SI-GISEL-NEXT:    v_not_b32_e32 v1, 63
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; SI-GISEL-NEXT:    v_ldexp_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SDAG-LABEL: v_fabs_exp2_f32_afn:
 ; VI-SDAG:       ; %bb.0:
@@ -1758,6 +2038,20 @@ define float @v_fabs_exp2_f32_afn(float %in) {
 ; VI-SDAG-NEXT:    v_ldexp_f32 v0, v0, v1
 ; VI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
+; VI-GISEL-LABEL: v_fabs_exp2_f32_afn:
+; VI-GISEL:       ; %bb.0:
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0xc2fc0000
+; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x42800000
+; VI-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, v1
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
+; VI-GISEL-NEXT:    v_add_f32_e64 v0, |v0|, v1
+; VI-GISEL-NEXT:    v_exp_f32_e32 v0, v0
+; VI-GISEL-NEXT:    v_not_b32_e32 v1, 63
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; VI-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
+; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX900-SDAG-LABEL: v_fabs_exp2_f32_afn:
 ; GFX900-SDAG:       ; %bb.0:
 ; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1772,6 +2066,20 @@ define float @v_fabs_exp2_f32_afn(float %in) {
 ; GFX900-SDAG-NEXT:    v_ldexp_f32 v0, v0, v1
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX900-GISEL-LABEL: v_fabs_exp2_f32_afn:
+; GFX900-GISEL:       ; %bb.0:
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0xc2fc0000
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0x42800000
+; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, v1
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
+; GFX900-GISEL-NEXT:    v_add_f32_e64 v0, |v0|, v1
+; GFX900-GISEL-NEXT:    v_exp_f32_e32 v0, v0
+; GFX900-GISEL-NEXT:    v_not_b32_e32 v1, 63
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX900-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
+; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; R600-LABEL: v_fabs_exp2_f32_afn:
 ; R600:       ; %bb.0:
 ; R600-NEXT:    CF_END
@@ -1821,19 +2129,19 @@ define float @v_exp2_f32_nnan(float %in) {
 ; SI-SDAG-NEXT:    v_ldexp_f32_e32 v0, v0, v1
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GCN-GISEL-LABEL: v_exp2_f32_nnan:
-; GCN-GISEL:       ; %bb.0:
-; GCN-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-GISEL-NEXT:    v_mov_b32_e32 v1, 0xc2fc0000
-; GCN-GISEL-NEXT:    v_mov_b32_e32 v2, 0x42800000
-; GCN-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
-; GCN-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
-; GCN-GISEL-NEXT:    v_add_f32_e32 v0, v0, v1
-; GCN-GISEL-NEXT:    v_exp_f32_e32 v0, v0
-; GCN-GISEL-NEXT:    v_mov_b32_e32 v1, 0x1f800000
-; GCN-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v1, vcc
-; GCN-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GCN-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; SI-GISEL-LABEL: v_exp2_f32_nnan:
+; SI-GISEL:       ; %bb.0:
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0xc2fc0000
+; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x42800000
+; SI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
+; SI-GISEL-NEXT:    v_add_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT:    v_exp_f32_e32 v0, v0
+; SI-GISEL-NEXT:    v_not_b32_e32 v1, 63
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; SI-GISEL-NEXT:    v_ldexp_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SDAG-LABEL: v_exp2_f32_nnan:
 ; VI-SDAG:       ; %bb.0:
@@ -1849,6 +2157,20 @@ define float @v_exp2_f32_nnan(float %in) {
 ; VI-SDAG-NEXT:    v_ldexp_f32 v0, v0, v1
 ; VI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
+; VI-GISEL-LABEL: v_exp2_f32_nnan:
+; VI-GISEL:       ; %bb.0:
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0xc2fc0000
+; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x42800000
+; VI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
+; VI-GISEL-NEXT:    v_add_f32_e32 v0, v0, v1
+; VI-GISEL-NEXT:    v_exp_f32_e32 v0, v0
+; VI-GISEL-NEXT:    v_not_b32_e32 v1, 63
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; VI-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
+; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX900-SDAG-LABEL: v_exp2_f32_nnan:
 ; GFX900-SDAG:       ; %bb.0:
 ; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1863,6 +2185,20 @@ define float @v_exp2_f32_nnan(float %in) {
 ; GFX900-SDAG-NEXT:    v_ldexp_f32 v0, v0, v1
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX900-GISEL-LABEL: v_exp2_f32_nnan:
+; GFX900-GISEL:       ; %bb.0:
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0xc2fc0000
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0x42800000
+; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
+; GFX900-GISEL-NEXT:    v_add_f32_e32 v0, v0, v1
+; GFX900-GISEL-NEXT:    v_exp_f32_e32 v0, v0
+; GFX900-GISEL-NEXT:    v_not_b32_e32 v1, 63
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX900-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
+; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; R600-LABEL: v_exp2_f32_nnan:
 ; R600:       ; %bb.0:
 ; R600-NEXT:    CF_END
@@ -1911,19 +2247,19 @@ define float @v_exp2_f32_nnan_dynamic(float %in) #1 {
 ; SI-SDAG-NEXT:    v_ldexp_f32_e32 v0, v0, v1
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GCN-GISEL-LABEL: v_exp2_f32_nnan_dynamic:
-; GCN-GISEL:       ; %bb.0:
-; GCN-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-GISEL-NEXT:    v_mov_b32_e32 v1, 0xc2fc0000
-; GCN-GISEL-NEXT:    v_mov_b32_e32 v2, 0x42800000
-; GCN-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
-; GCN-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
-; GCN-GISEL-NEXT:    v_add_f32_e32 v0, v0, v1
-; GCN-GISEL-NEXT:    v_exp_f32_e32 v0, v0
-; GCN-GISEL-NEXT:    v_mov_b32_e32 v1, 0x1f800000
-; GCN-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v1, vcc
-; GCN-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GCN-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; SI-GISEL-LABEL: v_exp2_f32_nnan_dynamic:
+; SI-GISEL:       ; %bb.0:
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0xc2fc0000
+; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x42800000
+; SI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
+; SI-GISEL-NEXT:    v_add_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT:    v_exp_f32_e32 v0, v0
+; SI-GISEL-NEXT:    v_not_b32_e32 v1, 63
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; SI-GISEL-NEXT:    v_ldexp_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SDAG-LABEL: v_exp2_f32_nnan_dynamic:
 ; VI-SDAG:       ; %bb.0:
@@ -1939,6 +2275,20 @@ define float @v_exp2_f32_nnan_dynamic(float %in) #1 {
 ; VI-SDAG-NEXT:    v_ldexp_f32 v0, v0, v1
 ; VI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
+; VI-GISEL-LABEL: v_exp2_f32_nnan_dynamic:
+; VI-GISEL:       ; %bb.0:
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0xc2fc0000
+; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x42800000
+; VI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
+; VI-GISEL-NEXT:    v_add_f32_e32 v0, v0, v1
+; VI-GISEL-NEXT:    v_exp_f32_e32 v0, v0
+; VI-GISEL-NEXT:    v_not_b32_e32 v1, 63
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; VI-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
+; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX900-SDAG-LABEL: v_exp2_f32_nnan_dynamic:
 ; GFX900-SDAG:       ; %bb.0:
 ; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1953,6 +2303,20 @@ define float @v_exp2_f32_nnan_dynamic(float %in) #1 {
 ; GFX900-SDAG-NEXT:    v_ldexp_f32 v0, v0, v1
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX900-GISEL-LABEL: v_exp2_f32_nnan_dynamic:
+; GFX900-GISEL:       ; %bb.0:
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0xc2fc0000
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0x42800000
+; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
+; GFX900-GISEL-NEXT:    v_add_f32_e32 v0, v0, v1
+; GFX900-GISEL-NEXT:    v_exp_f32_e32 v0, v0
+; GFX900-GISEL-NEXT:    v_not_b32_e32 v1, 63
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX900-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
+; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; R600-LABEL: v_exp2_f32_nnan_dynamic:
 ; R600:       ; %bb.0:
 ; R600-NEXT:    CF_END
@@ -2001,19 +2365,19 @@ define float @v_exp2_f32_ninf_dynamic(float %in) #1 {
 ; SI-SDAG-NEXT:    v_ldexp_f32_e32 v0, v0, v1
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GCN-GISEL-LABEL: v_exp2_f32_ninf_dynamic:
-; GCN-GISEL:       ; %bb.0:
-; GCN-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-GISEL-NEXT:    v_mov_b32_e32 v1, 0xc2fc0000
-; GCN-GISEL-NEXT:    v_mov_b32_e32 v2, 0x42800000
-; GCN-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
-; GCN-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
-; GCN-GISEL-NEXT:    v_add_f32_e32 v0, v0, v1
-; GCN-GISEL-NEXT:    v_exp_f32_e32 v0, v0
-; GCN-GISEL-NEXT:    v_mov_b32_e32 v1, 0x1f800000
-; GCN-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v1, vcc
-; GCN-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GCN-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; SI-GISEL-LABEL: v_exp2_f32_ninf_dynamic:
+; SI-GISEL:       ; %bb.0:
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0xc2fc0000
+; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x42800000
+; SI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
+; SI-GISEL-NEXT:    v_add_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT:    v_exp_f32_e32 v0, v0
+; SI-GISEL-NEXT:    v_not_b32_e32 v1, 63
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; SI-GISEL-NEXT:    v_ldexp_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SDAG-LABEL: v_exp2_f32_ninf_dynamic:
 ; VI-SDAG:       ; %bb.0:
@@ -2029,6 +2393,20 @@ define float @v_exp2_f32_ninf_dynamic(float %in) #1 {
 ; VI-SDAG-NEXT:    v_ldexp_f32 v0, v0, v1
 ; VI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
+; VI-GISEL-LABEL: v_exp2_f32_ninf_dynamic:
+; VI-GISEL:       ; %bb.0:
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0xc2fc0000
+; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x42800000
+; VI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
+; VI-GISEL-NEXT:    v_add_f32_e32 v0, v0, v1
+; VI-GISEL-NEXT:    v_exp_f32_e32 v0, v0
+; VI-GISEL-NEXT:    v_not_b32_e32 v1, 63
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; VI-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
+; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX900-SDAG-LABEL: v_exp2_f32_ninf_dynamic:
 ; GFX900-SDAG:       ; %bb.0:
 ; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2043,6 +2421,20 @@ define float @v_exp2_f32_ninf_dynamic(float %in) #1 {
 ; GFX900-SDAG-NEXT:    v_ldexp_f32 v0, v0, v1
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX900-GISEL-LABEL: v_exp2_f32_ninf_dynamic:
+; GFX900-GISEL:       ; %bb.0:
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0xc2fc0000
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0x42800000
+; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
+; GFX900-GISEL-NEXT:    v_add_f32_e32 v0, v0, v1
+; GFX900-GISEL-NEXT:    v_exp_f32_e32 v0, v0
+; GFX900-GISEL-NEXT:    v_not_b32_e32 v1, 63
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX900-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
+; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; R600-LABEL: v_exp2_f32_ninf_dynamic:
 ; R600:       ; %bb.0:
 ; R600-NEXT:    CF_END
@@ -2071,19 +2463,19 @@ define float @v_exp2_f32_nnan_ninf(float %in) {
 ; SI-SDAG-NEXT:    v_ldexp_f32_e32 v0, v0, v1
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GCN-GISEL-LABEL: v_exp2_f32_nnan_ninf:
-; GCN-GISEL:       ; %bb.0:
-; GCN-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-GISEL-NEXT:    v_mov_b32_e32 v1, 0xc2fc0000
-; GCN-GISEL-NEXT:    v_mov_b32_e32 v2, 0x42800000
-; GCN-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
-; GCN-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
-; GCN-GISEL-NEXT:    v_add_f32_e32 v0, v0, v1
-; GCN-GISEL-NEXT:    v_exp_f32_e32 v0, v0
-; GCN-GISEL-NEXT:    v_mov_b32_e32 v1, 0x1f800000
-; GCN-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v1, vcc
-; GCN-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GCN-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; SI-GISEL-LABEL: v_exp2_f32_nnan_ninf:
+; SI-GISEL:       ; %bb.0:
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0xc2fc0000
+; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x42800000
+; SI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
+; SI-GISEL-NEXT:    v_add_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT:    v_exp_f32_e32 v0, v0
+; SI-GISEL-NEXT:    v_not_b32_e32 v1, 63
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; SI-GISEL-NEXT:    v_ldexp_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SDAG-LABEL: v_exp2_f32_nnan_ninf:
 ; VI-SDAG:       ; %bb.0:
@@ -2099,6 +2491,20 @@ define float @v_exp2_f32_nnan_ninf(float %in) {
 ; VI-SDAG-NEXT:    v_ldexp_f32 v0, v0, v1
 ; VI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
+; VI-GISEL-LABEL: v_exp2_f32_nnan_ninf:
+; VI-GISEL:       ; %bb.0:
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0xc2fc0000
+; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x42800000
+; VI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
+; VI-GISEL-NEXT:    v_add_f32_e32 v0, v0, v1
+; VI-GISEL-NEXT:    v_exp_f32_e32 v0, v0
+; VI-GISEL-NEXT:    v_not_b32_e32 v1, 63
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; VI-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
+; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX900-SDAG-LABEL: v_exp2_f32_nnan_ninf:
 ; GFX900-SDAG:       ; %bb.0:
 ; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2113,6 +2519,20 @@ define float @v_exp2_f32_nnan_ninf(float %in) {
 ; GFX900-SDAG-NEXT:    v_ldexp_f32 v0, v0, v1
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX900-GISEL-LABEL: v_exp2_f32_nnan_ninf:
+; GFX900-GISEL:       ; %bb.0:
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0xc2fc0000
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0x42800000
+; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
+; GFX900-GISEL-NEXT:    v_add_f32_e32 v0, v0, v1
+; GFX900-GISEL-NEXT:    v_exp_f32_e32 v0, v0
+; GFX900-GISEL-NEXT:    v_not_b32_e32 v1, 63
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX900-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
+; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; R600-LABEL: v_exp2_f32_nnan_ninf:
 ; R600:       ; %bb.0:
 ; R600-NEXT:    CF_END
@@ -2161,19 +2581,19 @@ define float @v_exp2_f32_nnan_ninf_dynamic(float %in) #1 {
 ; SI-SDAG-NEXT:    v_ldexp_f32_e32 v0, v0, v1
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GCN-GISEL-LABEL: v_exp2_f32_nnan_ninf_dynamic:
-; GCN-GISEL:       ; %bb.0:
-; GCN-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-GISEL-NEXT:    v_mov_b32_e32 v1, 0xc2fc0000
-; GCN-GISEL-NEXT:    v_mov_b32_e32 v2, 0x42800000
-; GCN-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
-; GCN-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
-; GCN-GISEL-NEXT:    v_add_f32_e32 v0, v0, v1
-; GCN-GISEL-NEXT:    v_exp_f32_e32 v0, v0
-; GCN-GISEL-NEXT:    v_mov_b32_e32 v1, 0x1f800000
-; GCN-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v1, vcc
-; GCN-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GCN-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; SI-GISEL-LABEL: v_exp2_f32_nnan_ninf_dynamic:
+; SI-GISEL:       ; %bb.0:
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0xc2fc0000
+; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x42800000
+; SI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
+; SI-GISEL-NEXT:    v_add_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT:    v_exp_f32_e32 v0, v0
+; SI-GISEL-NEXT:    v_not_b32_e32 v1, 63
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; SI-GISEL-NEXT:    v_ldexp_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SDAG-LABEL: v_exp2_f32_nnan_ninf_dynamic:
 ; VI-SDAG:       ; %bb.0:
@@ -2189,6 +2609,20 @@ define float @v_exp2_f32_nnan_ninf_dynamic(float %in) #1 {
 ; VI-SDAG-NEXT:    v_ldexp_f32 v0, v0, v1
 ; VI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
+; VI-GISEL-LABEL: v_exp2_f32_nnan_ninf_dynamic:
+; VI-GISEL:       ; %bb.0:
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0xc2fc0000
+; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x42800000
+; VI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
+; VI-GISEL-NEXT:    v_add_f32_e32 v0, v0, v1
+; VI-GISEL-NEXT:    v_exp_f32_e32 v0, v0
+; VI-GISEL-NEXT:    v_not_b32_e32 v1, 63
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; VI-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
+; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX900-SDAG-LABEL: v_exp2_f32_nnan_ninf_dynamic:
 ; GFX900-SDAG:       ; %bb.0:
 ; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2203,6 +2637,20 @@ define float @v_exp2_f32_nnan_ninf_dynamic(float %in) #1 {
 ; GFX900-SDAG-NEXT:    v_ldexp_f32 v0, v0, v1
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX900-GISEL-LABEL: v_exp2_f32_nnan_ninf_dynamic:
+; GFX900-GISEL:       ; %bb.0:
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0xc2fc0000
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0x42800000
+; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
+; GFX900-GISEL-NEXT:    v_add_f32_e32 v0, v0, v1
+; GFX900-GISEL-NEXT:    v_exp_f32_e32 v0, v0
+; GFX900-GISEL-NEXT:    v_not_b32_e32 v1, 63
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX900-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
+; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; R600-LABEL: v_exp2_f32_nnan_ninf_dynamic:
 ; R600:       ; %bb.0:
 ; R600-NEXT:    CF_END
@@ -2251,19 +2699,19 @@ define float @v_exp2_f32_dynamic_mode(float %in) #1 {
 ; SI-SDAG-NEXT:    v_ldexp_f32_e32 v0, v0, v1
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GCN-GISEL-LABEL: v_exp2_f32_dynamic_mode:
-; GCN-GISEL:       ; %bb.0:
-; GCN-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-GISEL-NEXT:    v_mov_b32_e32 v1, 0xc2fc0000
-; GCN-GISEL-NEXT:    v_mov_b32_e32 v2, 0x42800000
-; GCN-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
-; GCN-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
-; GCN-GISEL-NEXT:    v_add_f32_e32 v0, v0, v1
-; GCN-GISEL-NEXT:    v_exp_f32_e32 v0, v0
-; GCN-GISEL-NEXT:    v_mov_b32_e32 v1, 0x1f800000
-; GCN-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v1, vcc
-; GCN-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GCN-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; SI-GISEL-LABEL: v_exp2_f32_dynamic_mode:
+; SI-GISEL:       ; %bb.0:
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0xc2fc0000
+; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x42800000
+; SI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
+; SI-GISEL-NEXT:    v_add_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT:    v_exp_f32_e32 v0, v0
+; SI-GISEL-NEXT:    v_not_b32_e32 v1, 63
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; SI-GISEL-NEXT:    v_ldexp_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SDAG-LABEL: v_exp2_f32_dynamic_mode:
 ; VI-SDAG:       ; %bb.0:
@@ -2279,6 +2727,20 @@ define float @v_exp2_f32_dynamic_mode(float %in) #1 {
 ; VI-SDAG-NEXT:    v_ldexp_f32 v0, v0, v1
 ; VI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
+; VI-GISEL-LABEL: v_exp2_f32_dynamic_mode:
+; VI-GISEL:       ; %bb.0:
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0xc2fc0000
+; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x42800000
+; VI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
+; VI-GISEL-NEXT:    v_add_f32_e32 v0, v0, v1
+; VI-GISEL-NEXT:    v_exp_f32_e32 v0, v0
+; VI-GISEL-NEXT:    v_not_b32_e32 v1, 63
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; VI-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
+; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX900-SDAG-LABEL: v_exp2_f32_dynamic_mode:
 ; GFX900-SDAG:       ; %bb.0:
 ; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2293,6 +2755,20 @@ define float @v_exp2_f32_dynamic_mode(float %in) #1 {
 ; GFX900-SDAG-NEXT:    v_ldexp_f32 v0, v0, v1
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX900-GISEL-LABEL: v_exp2_f32_dynamic_mode:
+; GFX900-GISEL:       ; %bb.0:
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0xc2fc0000
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0x42800000
+; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
+; GFX900-GISEL-NEXT:    v_add_f32_e32 v0, v0, v1
+; GFX900-GISEL-NEXT:    v_exp_f32_e32 v0, v0
+; GFX900-GISEL-NEXT:    v_not_b32_e32 v1, 63
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX900-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
+; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; R600-LABEL: v_exp2_f32_dynamic_mode:
 ; R600:       ; %bb.0:
 ; R600-NEXT:    CF_END
@@ -2313,20 +2789,50 @@ define float @v_exp2_f32_undef() {
 ; GCN-SDAG-NEXT:    v_exp_f32_e32 v0, 0x7fc00000
 ; GCN-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GCN-GISEL-LABEL: v_exp2_f32_undef:
-; GCN-GISEL:       ; %bb.0:
-; GCN-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-GISEL-NEXT:    v_mov_b32_e32 v0, 0xc2fc0000
-; GCN-GISEL-NEXT:    v_mov_b32_e32 v1, 0x42800000
-; GCN-GISEL-NEXT:    v_add_f32_e32 v1, s4, v1
-; GCN-GISEL-NEXT:    v_add_f32_e64 v2, s4, 0
-; GCN-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s4, v0
-; GCN-GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
-; GCN-GISEL-NEXT:    v_exp_f32_e32 v0, v0
-; GCN-GISEL-NEXT:    v_mov_b32_e32 v1, 0x1f800000
-; GCN-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v1, vcc
-; GCN-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GCN-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; SI-GISEL-LABEL: v_exp2_f32_undef:
+; SI-GISEL:       ; %bb.0:
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-GISEL-NEXT:    v_mov_b32_e32 v0, 0xc2fc0000
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x42800000
+; SI-GISEL-NEXT:    v_add_f32_e32 v1, s4, v1
+; SI-GISEL-NEXT:    v_add_f32_e64 v2, s4, 0
+; SI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s4, v0
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; SI-GISEL-NEXT:    v_exp_f32_e32 v0, v0
+; SI-GISEL-NEXT:    v_not_b32_e32 v1, 63
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; SI-GISEL-NEXT:    v_ldexp_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; VI-GISEL-LABEL: v_exp2_f32_undef:
+; VI-GISEL:       ; %bb.0:
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-GISEL-NEXT:    v_mov_b32_e32 v0, 0xc2fc0000
+; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x42800000
+; VI-GISEL-NEXT:    v_add_f32_e32 v1, s4, v1
+; VI-GISEL-NEXT:    v_add_f32_e64 v2, s4, 0
+; VI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s4, v0
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; VI-GISEL-NEXT:    v_exp_f32_e32 v0, v0
+; VI-GISEL-NEXT:    v_not_b32_e32 v1, 63
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; VI-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
+; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX900-GISEL-LABEL: v_exp2_f32_undef:
+; GFX900-GISEL:       ; %bb.0:
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v0, 0xc2fc0000
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x42800000
+; GFX900-GISEL-NEXT:    v_add_f32_e32 v1, s4, v1
+; GFX900-GISEL-NEXT:    v_add_f32_e64 v2, s4, 0
+; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s4, v0
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX900-GISEL-NEXT:    v_exp_f32_e32 v0, v0
+; GFX900-GISEL-NEXT:    v_not_b32_e32 v1, 63
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX900-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
+; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_exp2_f32_undef:
 ; R600:       ; %bb.0:
@@ -3359,19 +3865,19 @@ define float @v_exp2_f32_contract(float %in) {
 ; SI-SDAG-NEXT:    v_ldexp_f32_e32 v0, v0, v1
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GCN-GISEL-LABEL: v_exp2_f32_contract:
-; GCN-GISEL:       ; %bb.0:
-; GCN-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-GISEL-NEXT:    v_mov_b32_e32 v1, 0xc2fc0000
-; GCN-GISEL-NEXT:    v_mov_b32_e32 v2, 0x42800000
-; GCN-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
-; GCN-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
-; GCN-GISEL-NEXT:    v_add_f32_e32 v0, v0, v1
-; GCN-GISEL-NEXT:    v_exp_f32_e32 v0, v0
-; GCN-GISEL-NEXT:    v_mov_b32_e32 v1, 0x1f800000
-; GCN-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v1, vcc
-; GCN-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GCN-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; SI-GISEL-LABEL: v_exp2_f32_contract:
+; SI-GISEL:       ; %bb.0:
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0xc2fc0000
+; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x42800000
+; SI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
+; SI-GISEL-NEXT:    v_add_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT:    v_exp_f32_e32 v0, v0
+; SI-GISEL-NEXT:    v_not_b32_e32 v1, 63
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; SI-GISEL-NEXT:    v_ldexp_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SDAG-LABEL: v_exp2_f32_contract:
 ; VI-SDAG:       ; %bb.0:
@@ -3387,6 +3893,20 @@ define float @v_exp2_f32_contract(float %in) {
 ; VI-SDAG-NEXT:    v_ldexp_f32 v0, v0, v1
 ; VI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
+; VI-GISEL-LABEL: v_exp2_f32_contract:
+; VI-GISEL:       ; %bb.0:
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0xc2fc0000
+; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x42800000
+; VI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
+; VI-GISEL-NEXT:    v_add_f32_e32 v0, v0, v1
+; VI-GISEL-NEXT:    v_exp_f32_e32 v0, v0
+; VI-GISEL-NEXT:    v_not_b32_e32 v1, 63
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; VI-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
+; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX900-SDAG-LABEL: v_exp2_f32_contract:
 ; GFX900-SDAG:       ; %bb.0:
 ; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3401,6 +3921,20 @@ define float @v_exp2_f32_contract(float %in) {
 ; GFX900-SDAG-NEXT:    v_ldexp_f32 v0, v0, v1
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX900-GISEL-LABEL: v_exp2_f32_contract:
+; GFX900-GISEL:       ; %bb.0:
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0xc2fc0000
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0x42800000
+; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
+; GFX900-GISEL-NEXT:    v_add_f32_e32 v0, v0, v1
+; GFX900-GISEL-NEXT:    v_exp_f32_e32 v0, v0
+; GFX900-GISEL-NEXT:    v_not_b32_e32 v1, 63
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX900-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
+; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; R600-LABEL: v_exp2_f32_contract:
 ; R600:       ; %bb.0:
 ; R600-NEXT:    CF_END
@@ -3449,19 +3983,19 @@ define float @v_exp2_f32_contract_nnan_ninf(float %in) {
 ; SI-SDAG-NEXT:    v_ldexp_f32_e32 v0, v0, v1
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GCN-GISEL-LABEL: v_exp2_f32_contract_nnan_ninf:
-; GCN-GISEL:       ; %bb.0:
-; GCN-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-GISEL-NEXT:    v_mov_b32_e32 v1, 0xc2fc0000
-; GCN-GISEL-NEXT:    v_mov_b32_e32 v2, 0x42800000
-; GCN-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
-; GCN-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
-; GCN-GISEL-NEXT:    v_add_f32_e32 v0, v0, v1
-; GCN-GISEL-NEXT:    v_exp_f32_e32 v0, v0
-; GCN-GISEL-NEXT:    v_mov_b32_e32 v1, 0x1f800000
-; GCN-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v1, vcc
-; GCN-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GCN-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; SI-GISEL-LABEL: v_exp2_f32_contract_nnan_ninf:
+; SI-GISEL:       ; %bb.0:
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0xc2fc0000
+; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x42800000
+; SI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
+; SI-GISEL-NEXT:    v_add_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT:    v_exp_f32_e32 v0, v0
+; SI-GISEL-NEXT:    v_not_b32_e32 v1, 63
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; SI-GISEL-NEXT:    v_ldexp_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SDAG-LABEL: v_exp2_f32_contract_nnan_ninf:
 ; VI-SDAG:       ; %bb.0:
@@ -3477,6 +4011,20 @@ define float @v_exp2_f32_contract_nnan_ninf(float %in) {
 ; VI-SDAG-NEXT:    v_ldexp_f32 v0, v0, v1
 ; VI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
+; VI-GISEL-LABEL: v_exp2_f32_contract_nnan_ninf:
+; VI-GISEL:       ; %bb.0:
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0xc2fc0000
+; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x42800000
+; VI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
+; VI-GISEL-NEXT:    v_add_f32_e32 v0, v0, v1
+; VI-GISEL-NEXT:    v_exp_f32_e32 v0, v0
+; VI-GISEL-NEXT:    v_not_b32_e32 v1, 63
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; VI-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
+; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX900-SDAG-LABEL: v_exp2_f32_contract_nnan_ninf:
 ; GFX900-SDAG:       ; %bb.0:
 ; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3491,6 +4039,20 @@ define float @v_exp2_f32_contract_nnan_ninf(float %in) {
 ; GFX900-SDAG-NEXT:    v_ldexp_f32 v0, v0, v1
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX900-GISEL-LABEL: v_exp2_f32_contract_nnan_ninf:
+; GFX900-GISEL:       ; %bb.0:
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0xc2fc0000
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0x42800000
+; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
+; GFX900-GISEL-NEXT:    v_add_f32_e32 v0, v0, v1
+; GFX900-GISEL-NEXT:    v_exp_f32_e32 v0, v0
+; GFX900-GISEL-NEXT:    v_not_b32_e32 v1, 63
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX900-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
+; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; R600-LABEL: v_exp2_f32_contract_nnan_ninf:
 ; R600:       ; %bb.0:
 ; R600-NEXT:    CF_END
@@ -3518,3 +4080,5 @@ declare <3 x half> @llvm.exp2.v3f16(<3 x half>) #2
 attributes #0 = { "denormal-fp-math-f32"="ieee,preserve-sign" }
 attributes #1 = { "denormal-fp-math-f32"="dynamic,dynamic" }
 attributes #2 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GCN-GISEL: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.frexp.ll b/llvm/test/CodeGen/AMDGPU/llvm.frexp.ll
index b9fef083..88ef7a93 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.frexp.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.frexp.ll
@@ -3,11 +3,13 @@
 ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=GCN,GFX8,GFX8-SDAG %s
 ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9,GFX9-SDAG %s
 ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GCN,GFX11,GFX11-SDAG %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GCN,GFX12,GFX12-SDAG %s
 
 ; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefixes=GCN,GFX6,GFX6-GISEL %s
 ; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=GCN,GFX8,GFX8-GISEL %s
 ; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9,GFX9-GISEL %s
 ; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GCN,GFX11,GFX11-GISEL %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GCN,GFX12,GFX12-GISEL %s
 
 define { half, i32 } @test_frexp_f16_i32(half %a) {
 ; GFX6-SDAG-LABEL: test_frexp_f16_i32:
@@ -50,6 +52,19 @@ define { half, i32 } @test_frexp_f16_i32(half %a) {
 ; GFX11-NEXT:    v_bfe_i32 v1, v1, 0, 16
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX12-LABEL: test_frexp_f16_i32:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_frexp_exp_i16_f16_e32 v1, v0
+; GFX12-NEXT:    v_frexp_mant_f16_e32 v0, v0
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX12-NEXT:    v_bfe_i32 v1, v1, 0, 16
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX6-GISEL-LABEL: test_frexp_f16_i32:
 ; GFX6-GISEL:       ; %bb.0:
 ; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -96,6 +111,16 @@ define half @test_frexp_f16_i32_only_use_fract(half %a) {
 ; GFX11-NEXT:    v_frexp_mant_f16_e32 v0, v0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX12-LABEL: test_frexp_f16_i32_only_use_fract:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_frexp_mant_f16_e32 v0, v0
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX6-GISEL-LABEL: test_frexp_f16_i32_only_use_fract:
 ; GFX6-GISEL:       ; %bb.0:
 ; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -145,6 +170,18 @@ define i32 @test_frexp_f16_i32_only_use_exp(half %a) {
 ; GFX11-NEXT:    v_bfe_i32 v0, v0, 0, 16
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX12-LABEL: test_frexp_f16_i32_only_use_exp:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_frexp_exp_i16_f16_e32 v0, v0
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT:    v_bfe_i32 v0, v0, 0, 16
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX6-GISEL-LABEL: test_frexp_f16_i32_only_use_exp:
 ; GFX6-GISEL:       ; %bb.0:
 ; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -221,6 +258,25 @@ define { <2 x half>, <2 x i32> } @test_frexp_v2f16_v2i32(<2 x half> %a) {
 ; GFX11-NEXT:    v_bfe_i32 v2, v4, 0, 16
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX12-LABEL: test_frexp_v2f16_v2i32:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX12-NEXT:    v_frexp_mant_f16_e32 v2, v0
+; GFX12-NEXT:    v_frexp_exp_i16_f16_e32 v0, v0
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX12-NEXT:    v_frexp_mant_f16_e32 v3, v1
+; GFX12-NEXT:    v_frexp_exp_i16_f16_e32 v4, v1
+; GFX12-NEXT:    v_bfe_i32 v1, v0, 0, 16
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-NEXT:    v_pack_b32_f16 v0, v2, v3
+; GFX12-NEXT:    v_bfe_i32 v2, v4, 0, 16
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX6-GISEL-LABEL: test_frexp_v2f16_v2i32:
 ; GFX6-GISEL:       ; %bb.0:
 ; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -311,6 +367,20 @@ define <2 x half> @test_frexp_v2f16_v2i32_only_use_fract(<2 x half> %a) {
 ; GFX11-NEXT:    v_pack_b32_f16 v0, v0, v1
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX12-LABEL: test_frexp_v2f16_v2i32_only_use_fract:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX12-NEXT:    v_frexp_mant_f16_e32 v0, v0
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT:    v_frexp_mant_f16_e32 v1, v1
+; GFX12-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX6-GISEL-LABEL: test_frexp_v2f16_v2i32_only_use_fract:
 ; GFX6-GISEL:       ; %bb.0:
 ; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -386,6 +456,22 @@ define <2 x i32> @test_frexp_v2f16_v2i32_only_use_exp(<2 x half> %a) {
 ; GFX11-NEXT:    v_bfe_i32 v1, v1, 0, 16
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX12-LABEL: test_frexp_v2f16_v2i32_only_use_exp:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX12-NEXT:    v_frexp_exp_i16_f16_e32 v0, v0
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT:    v_frexp_exp_i16_f16_e32 v1, v1
+; GFX12-NEXT:    v_bfe_i32 v0, v0, 0, 16
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX12-NEXT:    v_bfe_i32 v1, v1, 0, 16
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX6-GISEL-LABEL: test_frexp_v2f16_v2i32_only_use_exp:
 ; GFX6-GISEL:       ; %bb.0:
 ; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -463,6 +549,19 @@ define { half, i16 } @test_frexp_f16_i16(half %a) {
 ; GFX11-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX12-LABEL: test_frexp_f16_i16:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_frexp_mant_f16_e32 v2, v0
+; GFX12-NEXT:    v_frexp_exp_i16_f16_e32 v1, v0
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX12-NEXT:    v_mov_b32_e32 v0, v2
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX6-GISEL-LABEL: test_frexp_f16_i16:
 ; GFX6-GISEL:       ; %bb.0:
 ; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -509,6 +608,16 @@ define half @test_frexp_f16_i16_only_use_fract(half %a) {
 ; GFX11-NEXT:    v_frexp_mant_f16_e32 v0, v0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX12-LABEL: test_frexp_f16_i16_only_use_fract:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_frexp_mant_f16_e32 v0, v0
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX6-GISEL-LABEL: test_frexp_f16_i16_only_use_fract:
 ; GFX6-GISEL:       ; %bb.0:
 ; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -554,6 +663,16 @@ define i16 @test_frexp_f16_i16_only_use_exp(half %a) {
 ; GFX11-NEXT:    v_frexp_exp_i16_f16_e32 v0, v0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX12-LABEL: test_frexp_f16_i16_only_use_exp:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_frexp_exp_i16_f16_e32 v0, v0
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX6-GISEL-LABEL: test_frexp_f16_i16_only_use_exp:
 ; GFX6-GISEL:       ; %bb.0:
 ; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -623,6 +742,19 @@ define { float, i32 } @test_frexp_f32_i32(float %a) {
 ; GFX11-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX12-LABEL: test_frexp_f32_i32:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_frexp_mant_f32_e32 v2, v0
+; GFX12-NEXT:    v_frexp_exp_i32_f32_e32 v1, v0
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX12-NEXT:    v_mov_b32_e32 v0, v2
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX6-GISEL-LABEL: test_frexp_f32_i32:
 ; GFX6-GISEL:       ; %bb.0:
 ; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -665,6 +797,16 @@ define float @test_frexp_f32_i32_only_use_fract(float %a) {
 ; GFX11-NEXT:    v_frexp_mant_f32_e32 v0, v0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX12-LABEL: test_frexp_f32_i32_only_use_fract:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_frexp_mant_f32_e32 v0, v0
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX6-GISEL-LABEL: test_frexp_f32_i32_only_use_fract:
 ; GFX6-GISEL:       ; %bb.0:
 ; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -706,6 +848,16 @@ define i32 @test_frexp_f32_i32_only_use_exp(float %a) {
 ; GFX11-NEXT:    v_frexp_exp_i32_f32_e32 v0, v0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX12-LABEL: test_frexp_f32_i32_only_use_exp:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_frexp_exp_i32_f32_e32 v0, v0
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX6-GISEL-LABEL: test_frexp_f32_i32_only_use_exp:
 ; GFX6-GISEL:       ; %bb.0:
 ; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -771,6 +923,21 @@ define { <2 x float>, <2 x i32> } @test_frexp_v2f32_v2i32(<2 x float> %a) {
 ; GFX11-NEXT:    v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX12-LABEL: test_frexp_v2f32_v2i32:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_frexp_mant_f32_e32 v4, v0
+; GFX12-NEXT:    v_frexp_mant_f32_e32 v5, v1
+; GFX12-NEXT:    v_frexp_exp_i32_f32_e32 v2, v0
+; GFX12-NEXT:    v_frexp_exp_i32_f32_e32 v3, v1
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX12-NEXT:    v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX6-GISEL-LABEL: test_frexp_v2f32_v2i32:
 ; GFX6-GISEL:       ; %bb.0:
 ; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -846,6 +1013,17 @@ define <2 x float> @test_frexp_v2f32_v2i32_only_use_fract(<2 x float> %a) {
 ; GFX11-NEXT:    v_frexp_mant_f32_e32 v1, v1
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX12-LABEL: test_frexp_v2f32_v2i32_only_use_fract:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_frexp_mant_f32_e32 v0, v0
+; GFX12-NEXT:    v_frexp_mant_f32_e32 v1, v1
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX6-GISEL-LABEL: test_frexp_v2f32_v2i32_only_use_fract:
 ; GFX6-GISEL:       ; %bb.0:
 ; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -896,6 +1074,17 @@ define <2 x i32> @test_frexp_v2f32_v2i32_only_use_exp(<2 x float> %a) {
 ; GFX11-NEXT:    v_frexp_exp_i32_f32_e32 v1, v1
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX12-LABEL: test_frexp_v2f32_v2i32_only_use_exp:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_frexp_exp_i32_f32_e32 v0, v0
+; GFX12-NEXT:    v_frexp_exp_i32_f32_e32 v1, v1
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX6-GISEL-LABEL: test_frexp_v2f32_v2i32_only_use_exp:
 ; GFX6-GISEL:       ; %bb.0:
 ; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -954,6 +1143,19 @@ define { double, i32 } @test_frexp_f64_i32(double %a) {
 ; GFX11-NEXT:    v_dual_mov_b32 v0, v3 :: v_dual_mov_b32 v1, v4
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX12-LABEL: test_frexp_f64_i32:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_frexp_mant_f64_e32 v[3:4], v[0:1]
+; GFX12-NEXT:    v_frexp_exp_i32_f64_e32 v2, v[0:1]
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX12-NEXT:    v_dual_mov_b32 v0, v3 :: v_dual_mov_b32 v1, v4
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX6-GISEL-LABEL: test_frexp_f64_i32:
 ; GFX6-GISEL:       ; %bb.0:
 ; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1000,6 +1202,16 @@ define double @test_frexp_f64_i32_only_use_fract(double %a) {
 ; GFX11-NEXT:    v_frexp_mant_f64_e32 v[0:1], v[0:1]
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX12-LABEL: test_frexp_f64_i32_only_use_fract:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_frexp_mant_f64_e32 v[0:1], v[0:1]
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX6-GISEL-LABEL: test_frexp_f64_i32_only_use_fract:
 ; GFX6-GISEL:       ; %bb.0:
 ; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1044,6 +1256,16 @@ define i32 @test_frexp_f64_i32_only_use_exp(double %a) {
 ; GFX11-NEXT:    v_frexp_exp_i32_f64_e32 v0, v[0:1]
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX12-LABEL: test_frexp_f64_i32_only_use_exp:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_frexp_exp_i32_f64_e32 v0, v[0:1]
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX6-GISEL-LABEL: test_frexp_f64_i32_only_use_exp:
 ; GFX6-GISEL:       ; %bb.0:
 ; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1116,6 +1338,22 @@ define { <2 x double>, <2 x i32> } @test_frexp_v2f64_v2i32(<2 x double> %a) {
 ; GFX11-NEXT:    v_dual_mov_b32 v2, v6 :: v_dual_mov_b32 v3, v7
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX12-LABEL: test_frexp_v2f64_v2i32:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_frexp_mant_f64_e32 v[8:9], v[0:1]
+; GFX12-NEXT:    v_frexp_mant_f64_e32 v[6:7], v[2:3]
+; GFX12-NEXT:    v_frexp_exp_i32_f64_e32 v4, v[0:1]
+; GFX12-NEXT:    v_frexp_exp_i32_f64_e32 v5, v[2:3]
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-NEXT:    v_dual_mov_b32 v0, v8 :: v_dual_mov_b32 v1, v9
+; GFX12-NEXT:    v_dual_mov_b32 v2, v6 :: v_dual_mov_b32 v3, v7
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX6-GISEL-LABEL: test_frexp_v2f64_v2i32:
 ; GFX6-GISEL:       ; %bb.0:
 ; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1174,6 +1412,17 @@ define <2 x double> @test_frexp_v2f64_v2i32_only_use_fract(<2 x double> %a) {
 ; GFX11-NEXT:    v_frexp_mant_f64_e32 v[0:1], v[0:1]
 ; GFX11-NEXT:    v_frexp_mant_f64_e32 v[2:3], v[2:3]
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: test_frexp_v2f64_v2i32_only_use_fract:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_frexp_mant_f64_e32 v[0:1], v[0:1]
+; GFX12-NEXT:    v_frexp_mant_f64_e32 v[2:3], v[2:3]
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
   %result = call { <2 x double>, <2 x i32> } @llvm.frexp.v2f64.v2i32(<2 x double> %a)
   %result.0 = extractvalue { <2 x double>, <2 x i32> } %result, 0
   ret <2 x double> %result.0
@@ -1213,6 +1462,17 @@ define <2 x i32> @test_frexp_v2f64_v2i32_only_use_exp(<2 x double> %a) {
 ; GFX11-NEXT:    v_frexp_exp_i32_f64_e32 v0, v[0:1]
 ; GFX11-NEXT:    v_frexp_exp_i32_f64_e32 v1, v[2:3]
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: test_frexp_v2f64_v2i32_only_use_exp:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_frexp_exp_i32_f64_e32 v0, v[0:1]
+; GFX12-NEXT:    v_frexp_exp_i32_f64_e32 v1, v[2:3]
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
   %result = call { <2 x double>, <2 x i32> } @llvm.frexp.v2f64.v2i32(<2 x double> %a)
   %result.1 = extractvalue { <2 x double>, <2 x i32> } %result, 1
   ret <2 x i32> %result.1
@@ -1235,3 +1495,5 @@ attributes #0 = { nocallback nofree nosync nounwind speculatable willreturn memo
 ; GCN: {{.*}}
 ; GFX11-GISEL: {{.*}}
 ; GFX11-SDAG: {{.*}}
+; GFX12-GISEL: {{.*}}
+; GFX12-SDAG: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.log.ll b/llvm/test/CodeGen/AMDGPU/llvm.log.ll
index 218e41f..b850428 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.log.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.log.ll
@@ -45,16 +45,17 @@ define amdgpu_kernel void @s_log_f32(ptr addrspace(1) %out, float %in) {
 ; SI-GISEL-NEXT:    s_load_dword s0, s[4:5], 0xb
 ; SI-GISEL-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x9
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v0, 0x800000
-; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x4f800000
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3f317217
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x3377d1cf
 ; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s0, v0
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, 1.0, v1, vcc
-; SI-GISEL-NEXT:    v_mul_f32_e32 v0, s0, v0
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 5, v0
+; SI-GISEL-NEXT:    v_ldexp_f32_e32 v0, s0, v0
 ; SI-GISEL-NEXT:    v_log_f32_e32 v0, v0
-; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3f317217
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v3, 0x7f800000
 ; SI-GISEL-NEXT:    s_mov_b32 s6, -1
+; SI-GISEL-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-GISEL-NEXT:    v_mul_f32_e32 v4, 0x3f317217, v0
 ; SI-GISEL-NEXT:    v_fma_f32 v1, v0, v1, -v4
 ; SI-GISEL-NEXT:    v_fma_f32 v1, v0, v2, v1
@@ -64,7 +65,6 @@ define amdgpu_kernel void @s_log_f32(ptr addrspace(1) %out, float %in) {
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x41b17218
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; SI-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
-; SI-GISEL-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-GISEL-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; SI-GISEL-NEXT:    s_endpgm
 ;
@@ -104,25 +104,25 @@ define amdgpu_kernel void @s_log_f32(ptr addrspace(1) %out, float %in) {
 ; VI-GISEL-NEXT:    s_load_dword s0, s[4:5], 0x2c
 ; VI-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v0, 0x800000
-; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x4f800000
+; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x7f800000
 ; VI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s0, v0
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v0, 1.0, v1, vcc
-; VI-GISEL-NEXT:    v_mul_f32_e32 v0, s0, v0
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 5, v0
+; VI-GISEL-NEXT:    v_ldexp_f32 v0, s0, v0
 ; VI-GISEL-NEXT:    v_log_f32_e32 v0, v0
-; VI-GISEL-NEXT:    v_and_b32_e32 v1, 0xfffff000, v0
-; VI-GISEL-NEXT:    v_sub_f32_e32 v2, v0, v1
-; VI-GISEL-NEXT:    v_mul_f32_e32 v3, 0x3805fdf4, v1
+; VI-GISEL-NEXT:    v_and_b32_e32 v2, 0xfffff000, v0
+; VI-GISEL-NEXT:    v_sub_f32_e32 v3, v0, v2
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v4, 0x3805fdf4, v2
+; VI-GISEL-NEXT:    v_mul_f32_e32 v5, 0x3805fdf4, v3
+; VI-GISEL-NEXT:    v_mul_f32_e32 v3, 0x3f317000, v3
+; VI-GISEL-NEXT:    v_add_f32_e32 v4, v4, v5
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v2, 0x3f317000, v2
 ; VI-GISEL-NEXT:    v_add_f32_e32 v3, v3, v4
-; VI-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3f317000, v1
 ; VI-GISEL-NEXT:    v_add_f32_e32 v2, v2, v3
-; VI-GISEL-NEXT:    v_add_f32_e32 v1, v1, v2
-; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x7f800000
-; VI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], |v0|, v2
-; VI-GISEL-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[0:1]
+; VI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], |v0|, v1
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x41b17218
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
 ; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; VI-GISEL-NEXT:    v_sub_f32_e32 v2, v0, v1
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v0, s2
@@ -162,25 +162,25 @@ define amdgpu_kernel void @s_log_f32(ptr addrspace(1) %out, float %in) {
 ; GFX900-GISEL-NEXT:    s_load_dword s0, s[4:5], 0x2c
 ; GFX900-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v0, 0x800000
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x4f800000
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0x3377d1cf
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0x3f317217
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3377d1cf
 ; GFX900-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s0, v0
-; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v0, 1.0, v1, vcc
-; GFX900-GISEL-NEXT:    v_mul_f32_e32 v0, s0, v0
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX900-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 5, v0
+; GFX900-GISEL-NEXT:    v_ldexp_f32 v0, s0, v0
 ; GFX900-GISEL-NEXT:    v_log_f32_e32 v0, v0
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3f317217
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v3, 0x7f800000
-; GFX900-GISEL-NEXT:    v_mul_f32_e32 v4, 0x3f317217, v0
-; GFX900-GISEL-NEXT:    v_fma_f32 v1, v0, v1, -v4
-; GFX900-GISEL-NEXT:    v_fma_f32 v1, v0, v2, v1
-; GFX900-GISEL-NEXT:    v_add_f32_e32 v1, v4, v1
-; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], |v0|, v3
-; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[0:1]
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x41b17218
-; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
-; GFX900-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v4, 0x7f800000
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX900-GISEL-NEXT:    v_mul_f32_e32 v5, 0x3f317217, v0
+; GFX900-GISEL-NEXT:    v_fma_f32 v2, v0, v2, -v5
+; GFX900-GISEL-NEXT:    v_fma_f32 v2, v0, v3, v2
+; GFX900-GISEL-NEXT:    v_add_f32_e32 v2, v5, v2
+; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], |v0|, v4
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0x41b17218
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
+; GFX900-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v2
 ; GFX900-GISEL-NEXT:    global_store_dword v1, v0, s[2:3]
 ; GFX900-GISEL-NEXT:    s_endpgm
 ;
@@ -218,24 +218,26 @@ define amdgpu_kernel void @s_log_f32(ptr addrspace(1) %out, float %in) {
 ; GFX1100-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 s2, 0x800000, s0
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s2
-; GFX1100-GISEL-NEXT:    v_mul_f32_e32 v0, s0, v0
+; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s2
+; GFX1100-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 5, v0
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1100-GISEL-NEXT:    v_ldexp_f32 v0, s0, v0
 ; GFX1100-GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
 ; GFX1100-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; GFX1100-GISEL-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX1100-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3f317217, v0
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v0|
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_fma_f32 v2, 0x3f317217, v0, -v1
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_fmac_f32_e32 v2, 0x3377d1cf, v0
-; GFX1100-GISEL-NEXT:    v_add_f32_e32 v1, v1, v2
-; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, 0x41b17218, s2
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-GISEL-NEXT:    v_dual_cndmask_b32 v0, v0, v1 :: v_dual_mov_b32 v1, 0
-; GFX1100-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v2
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-GISEL-NEXT:    v_dual_add_f32 v1, v1, v2 :: v_dual_mov_b32 v2, 0
+; GFX1100-GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
+; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 0x41b17218, s2
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1100-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; GFX1100-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1100-GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX1100-GISEL-NEXT:    global_store_b32 v2, v0, s[0:1]
 ; GFX1100-GISEL-NEXT:    s_endpgm
 ;
 ; R600-LABEL: s_log_f32:
@@ -358,35 +360,36 @@ define amdgpu_kernel void @s_log_v2f32(ptr addrspace(1) %out, <2 x float> %in) {
 ; SI-GISEL:       ; %bb.0:
 ; SI-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x9
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v0, 0x800000
-; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x4f800000
-; SI-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3f317217
-; SI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x3377d1cf
+; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x3f317217
+; SI-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3377d1cf
+; SI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x7f800000
 ; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s6, v0
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 1.0, v1, vcc
-; SI-GISEL-NEXT:    v_mul_f32_e32 v2, s6, v2
-; SI-GISEL-NEXT:    v_log_f32_e32 v2, v2
-; SI-GISEL-NEXT:    v_mov_b32_e32 v5, 0x7f800000
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; SI-GISEL-NEXT:    v_ldexp_f32_e32 v1, s6, v1
+; SI-GISEL-NEXT:    v_log_f32_e32 v1, v1
 ; SI-GISEL-NEXT:    s_mov_b32 s6, -1
-; SI-GISEL-NEXT:    v_mul_f32_e32 v6, 0x3f317217, v2
-; SI-GISEL-NEXT:    v_fma_f32 v7, v2, v3, -v6
-; SI-GISEL-NEXT:    v_fma_f32 v7, v2, v4, v7
-; SI-GISEL-NEXT:    v_add_f32_e32 v6, v6, v7
-; SI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], |v2|, v5
-; SI-GISEL-NEXT:    v_cndmask_b32_e64 v2, v2, v6, s[0:1]
+; SI-GISEL-NEXT:    v_mul_f32_e32 v5, 0x3f317217, v1
+; SI-GISEL-NEXT:    v_fma_f32 v6, v1, v2, -v5
+; SI-GISEL-NEXT:    v_fma_f32 v6, v1, v3, v6
+; SI-GISEL-NEXT:    v_add_f32_e32 v5, v5, v6
+; SI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], |v1|, v4
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v1, v1, v5, s[0:1]
 ; SI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], s7, v0
-; SI-GISEL-NEXT:    v_cndmask_b32_e64 v0, 1.0, v1, s[0:1]
-; SI-GISEL-NEXT:    v_mul_f32_e32 v0, s7, v0
-; SI-GISEL-NEXT:    v_log_f32_e32 v1, v0
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 5, v0
+; SI-GISEL-NEXT:    v_ldexp_f32_e32 v0, s7, v0
+; SI-GISEL-NEXT:    v_log_f32_e32 v5, v0
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v6, 0x41b17218
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v6, vcc
-; SI-GISEL-NEXT:    v_sub_f32_e32 v0, v2, v0
-; SI-GISEL-NEXT:    v_mul_f32_e32 v2, 0x3f317217, v1
-; SI-GISEL-NEXT:    v_fma_f32 v3, v1, v3, -v2
-; SI-GISEL-NEXT:    v_fma_f32 v3, v1, v4, v3
-; SI-GISEL-NEXT:    v_add_f32_e32 v2, v2, v3
-; SI-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v1|, v5
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
+; SI-GISEL-NEXT:    v_sub_f32_e32 v0, v1, v0
+; SI-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3f317217, v5
+; SI-GISEL-NEXT:    v_fma_f32 v2, v5, v2, -v1
+; SI-GISEL-NEXT:    v_fma_f32 v2, v5, v3, v2
+; SI-GISEL-NEXT:    v_add_f32_e32 v1, v1, v2
+; SI-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v5|, v4
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
 ; SI-GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, v6, s[0:1]
 ; SI-GISEL-NEXT:    v_sub_f32_e32 v1, v1, v2
 ; SI-GISEL-NEXT:    s_mov_b32 s7, 0xf000
@@ -445,42 +448,43 @@ define amdgpu_kernel void @s_log_v2f32(ptr addrspace(1) %out, <2 x float> %in) {
 ; VI-GISEL:       ; %bb.0:
 ; VI-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x24
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v0, 0x800000
-; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x4f800000
-; VI-GISEL-NEXT:    v_mov_b32_e32 v3, 0x7f800000
+; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x7f800000
 ; VI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s6, v0
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 1.0, v1, vcc
-; VI-GISEL-NEXT:    v_mul_f32_e32 v2, s6, v2
-; VI-GISEL-NEXT:    v_log_f32_e32 v2, v2
-; VI-GISEL-NEXT:    v_and_b32_e32 v4, 0xfffff000, v2
-; VI-GISEL-NEXT:    v_sub_f32_e32 v5, v2, v4
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; VI-GISEL-NEXT:    v_ldexp_f32 v1, s6, v1
+; VI-GISEL-NEXT:    v_log_f32_e32 v1, v1
+; VI-GISEL-NEXT:    v_and_b32_e32 v3, 0xfffff000, v1
+; VI-GISEL-NEXT:    v_sub_f32_e32 v4, v1, v3
+; VI-GISEL-NEXT:    v_mul_f32_e32 v5, 0x3805fdf4, v3
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v6, 0x3805fdf4, v4
-; VI-GISEL-NEXT:    v_mul_f32_e32 v7, 0x3805fdf4, v5
-; VI-GISEL-NEXT:    v_mul_f32_e32 v5, 0x3f317000, v5
-; VI-GISEL-NEXT:    v_add_f32_e32 v6, v6, v7
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v4, 0x3f317000, v4
 ; VI-GISEL-NEXT:    v_add_f32_e32 v5, v5, v6
+; VI-GISEL-NEXT:    v_mul_f32_e32 v3, 0x3f317000, v3
 ; VI-GISEL-NEXT:    v_add_f32_e32 v4, v4, v5
-; VI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], |v2|, v3
-; VI-GISEL-NEXT:    v_cndmask_b32_e64 v2, v2, v4, s[0:1]
+; VI-GISEL-NEXT:    v_add_f32_e32 v3, v3, v4
+; VI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], |v1|, v2
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v1, v1, v3, s[0:1]
 ; VI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], s7, v0
-; VI-GISEL-NEXT:    v_cndmask_b32_e64 v0, 1.0, v1, s[0:1]
-; VI-GISEL-NEXT:    v_mul_f32_e32 v0, s7, v0
-; VI-GISEL-NEXT:    v_log_f32_e32 v1, v0
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 5, v0
+; VI-GISEL-NEXT:    v_ldexp_f32 v0, s7, v0
+; VI-GISEL-NEXT:    v_log_f32_e32 v3, v0
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x41b17218
 ; VI-GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v4, vcc
-; VI-GISEL-NEXT:    v_sub_f32_e32 v0, v2, v0
-; VI-GISEL-NEXT:    v_and_b32_e32 v2, 0xfffff000, v1
-; VI-GISEL-NEXT:    v_sub_f32_e32 v5, v1, v2
+; VI-GISEL-NEXT:    v_sub_f32_e32 v0, v1, v0
+; VI-GISEL-NEXT:    v_and_b32_e32 v1, 0xfffff000, v3
+; VI-GISEL-NEXT:    v_sub_f32_e32 v5, v3, v1
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v6, 0x3805fdf4, v5
-; VI-GISEL-NEXT:    v_mul_f32_e32 v7, 0x3805fdf4, v2
+; VI-GISEL-NEXT:    v_mul_f32_e32 v7, 0x3805fdf4, v1
 ; VI-GISEL-NEXT:    v_add_f32_e32 v6, v7, v6
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v5, 0x3f317000, v5
 ; VI-GISEL-NEXT:    v_add_f32_e32 v5, v5, v6
-; VI-GISEL-NEXT:    v_mul_f32_e32 v2, 0x3f317000, v2
-; VI-GISEL-NEXT:    v_add_f32_e32 v2, v2, v5
-; VI-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v1|, v3
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
+; VI-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3f317000, v1
+; VI-GISEL-NEXT:    v_add_f32_e32 v1, v1, v5
+; VI-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v3|, v2
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; VI-GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, v4, s[0:1]
 ; VI-GISEL-NEXT:    v_sub_f32_e32 v1, v1, v2
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v2, s4
@@ -531,37 +535,38 @@ define amdgpu_kernel void @s_log_v2f32(ptr addrspace(1) %out, <2 x float> %in) {
 ; GFX900-GISEL:       ; %bb.0:
 ; GFX900-GISEL-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x24
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v0, 0x800000
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x4f800000
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3f317217
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v4, 0x3377d1cf
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v5, 0x7f800000
 ; GFX900-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s10, v0
-; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v2, 1.0, v1, vcc
-; GFX900-GISEL-NEXT:    v_mul_f32_e32 v2, s10, v2
-; GFX900-GISEL-NEXT:    v_log_f32_e32 v2, v2
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v5, 0x7f800000
-; GFX900-GISEL-NEXT:    v_mul_f32_e32 v6, 0x3f317217, v2
-; GFX900-GISEL-NEXT:    v_fma_f32 v7, v2, v3, -v6
-; GFX900-GISEL-NEXT:    v_fma_f32 v7, v2, v4, v7
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX900-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; GFX900-GISEL-NEXT:    v_ldexp_f32 v1, s10, v1
+; GFX900-GISEL-NEXT:    v_log_f32_e32 v1, v1
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX900-GISEL-NEXT:    v_mul_f32_e32 v6, 0x3f317217, v1
+; GFX900-GISEL-NEXT:    v_fma_f32 v7, v1, v3, -v6
+; GFX900-GISEL-NEXT:    v_fma_f32 v7, v1, v4, v7
 ; GFX900-GISEL-NEXT:    v_add_f32_e32 v6, v6, v7
-; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], |v2|, v5
-; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v2, v2, v6, s[0:1]
+; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], |v1|, v5
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v1, v1, v6, s[0:1]
 ; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], s11, v0
-; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v0, 1.0, v1, s[0:1]
-; GFX900-GISEL-NEXT:    v_mul_f32_e32 v0, s11, v0
-; GFX900-GISEL-NEXT:    v_log_f32_e32 v1, v0
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v6, 0x41b17218
-; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v6, vcc
-; GFX900-GISEL-NEXT:    v_sub_f32_e32 v0, v2, v0
-; GFX900-GISEL-NEXT:    v_mul_f32_e32 v2, 0x3f317217, v1
-; GFX900-GISEL-NEXT:    v_fma_f32 v3, v1, v3, -v2
-; GFX900-GISEL-NEXT:    v_fma_f32 v3, v1, v4, v3
-; GFX900-GISEL-NEXT:    v_add_f32_e32 v2, v2, v3
-; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v1|, v5
-; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
-; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, v6, s[0:1]
-; GFX900-GISEL-NEXT:    v_sub_f32_e32 v1, v1, v2
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; GFX900-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 5, v0
+; GFX900-GISEL-NEXT:    v_ldexp_f32 v0, s11, v0
+; GFX900-GISEL-NEXT:    v_log_f32_e32 v6, v0
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v7, 0x41b17218
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v7, vcc
+; GFX900-GISEL-NEXT:    v_sub_f32_e32 v0, v1, v0
+; GFX900-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3f317217, v6
+; GFX900-GISEL-NEXT:    v_fma_f32 v3, v6, v3, -v1
+; GFX900-GISEL-NEXT:    v_fma_f32 v3, v6, v4, v3
+; GFX900-GISEL-NEXT:    v_add_f32_e32 v1, v1, v3
+; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v6|, v5
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, v6, v1, vcc
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v3, 0, v7, s[0:1]
+; GFX900-GISEL-NEXT:    v_sub_f32_e32 v1, v1, v3
 ; GFX900-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[8:9]
 ; GFX900-GISEL-NEXT:    s_endpgm
 ;
@@ -608,31 +613,37 @@ define amdgpu_kernel void @s_log_v2f32(ptr addrspace(1) %out, <2 x float> %in) {
 ; GFX1100-GISEL:       ; %bb.0:
 ; GFX1100-GISEL-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX1100-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 s4, 0x800000, s2
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 s5, 0x800000, s3
+; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 s4, 0x800000, s2
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s4
-; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s5
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-GISEL-NEXT:    v_dual_mul_f32 v0, s2, v0 :: v_dual_mul_f32 v1, s3, v1
-; GFX1100-GISEL-NEXT:    v_log_f32_e32 v0, v0
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s5
+; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s4
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; GFX1100-GISEL-NEXT:    v_ldexp_f32 v1, s3, v1
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_log_f32_e32 v1, v1
 ; GFX1100-GISEL-NEXT:    s_waitcnt_depctr 0xfff
-; GFX1100-GISEL-NEXT:    v_dual_mul_f32 v2, 0x3f317217, v0 :: v_dual_mul_f32 v3, 0x3f317217, v1
+; GFX1100-GISEL-NEXT:    v_dual_mul_f32 v3, 0x3f317217, v1 :: v_dual_lshlrev_b32 v0, 5, v0
+; GFX1100-GISEL-NEXT:    v_ldexp_f32 v0, s2, v0
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1100-GISEL-NEXT:    v_fma_f32 v5, 0x3f317217, v1, -v3
+; GFX1100-GISEL-NEXT:    v_log_f32_e32 v0, v0
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_3)
+; GFX1100-GISEL-NEXT:    v_fmac_f32_e32 v5, 0x3377d1cf, v1
+; GFX1100-GISEL-NEXT:    s_waitcnt_depctr 0xfff
+; GFX1100-GISEL-NEXT:    v_dual_add_f32 v3, v3, v5 :: v_dual_mul_f32 v2, 0x3f317217, v0
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v0|
+; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, 0x41b17218, s5
 ; GFX1100-GISEL-NEXT:    v_fma_f32 v4, 0x3f317217, v0, -v2
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-GISEL-NEXT:    v_fma_f32 v5, 0x3f317217, v1, -v3
-; GFX1100-GISEL-NEXT:    v_dual_fmac_f32 v4, 0x3377d1cf, v0 :: v_dual_fmac_f32 v5, 0x3377d1cf, v1
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX1100-GISEL-NEXT:    v_dual_add_f32 v2, v2, v4 :: v_dual_add_f32 v3, v3, v5
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-GISEL-NEXT:    v_fmac_f32_e32 v4, 0x3377d1cf, v0
+; GFX1100-GISEL-NEXT:    v_add_f32_e32 v2, v2, v4
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v4, 0, 0x41b17218, s4
-; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, 0x41b17218, s5
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v1|
-; GFX1100-GISEL-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_cndmask_b32 v1, v1, v3
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1100-GISEL-NEXT:    v_dual_cndmask_b32 v1, v1, v3 :: v_dual_mov_b32 v2, 0
 ; GFX1100-GISEL-NEXT:    v_dual_sub_f32 v0, v0, v4 :: v_dual_sub_f32 v1, v1, v5
 ; GFX1100-GISEL-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
 ; GFX1100-GISEL-NEXT:    s_endpgm
@@ -808,49 +819,51 @@ define amdgpu_kernel void @s_log_v3f32(ptr addrspace(1) %out, <3 x float> %in) {
 ; SI-GISEL-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0xd
 ; SI-GISEL-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x9
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
-; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
-; SI-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3f317217
+; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x3f317217
+; SI-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3377d1cf
 ; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s8, v1
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, 1.0, v2, vcc
-; SI-GISEL-NEXT:    v_mul_f32_e32 v0, s8, v0
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 5, v0
+; SI-GISEL-NEXT:    v_ldexp_f32_e32 v0, s8, v0
 ; SI-GISEL-NEXT:    v_log_f32_e32 v0, v0
-; SI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x3377d1cf
-; SI-GISEL-NEXT:    v_mov_b32_e32 v5, 0x7f800000
+; SI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x7f800000
 ; SI-GISEL-NEXT:    s_mov_b32 s6, -1
-; SI-GISEL-NEXT:    v_mul_f32_e32 v6, 0x3f317217, v0
-; SI-GISEL-NEXT:    v_fma_f32 v7, v0, v3, -v6
-; SI-GISEL-NEXT:    v_fma_f32 v7, v0, v4, v7
-; SI-GISEL-NEXT:    v_add_f32_e32 v6, v6, v7
-; SI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], |v0|, v5
-; SI-GISEL-NEXT:    v_cndmask_b32_e64 v0, v0, v6, s[0:1]
+; SI-GISEL-NEXT:    s_mov_b32 s7, 0xf000
+; SI-GISEL-NEXT:    v_mul_f32_e32 v5, 0x3f317217, v0
+; SI-GISEL-NEXT:    v_fma_f32 v6, v0, v2, -v5
+; SI-GISEL-NEXT:    v_fma_f32 v6, v0, v3, v6
+; SI-GISEL-NEXT:    v_add_f32_e32 v5, v5, v6
+; SI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], |v0|, v4
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v0, v0, v5, s[0:1]
 ; SI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], s9, v1
-; SI-GISEL-NEXT:    v_cndmask_b32_e64 v6, 1.0, v2, s[0:1]
-; SI-GISEL-NEXT:    v_mul_f32_e32 v6, s9, v6
-; SI-GISEL-NEXT:    v_log_f32_e32 v6, v6
-; SI-GISEL-NEXT:    v_mov_b32_e32 v7, 0x41b17218
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v8, 0, v7, vcc
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s[0:1]
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 5, v5
+; SI-GISEL-NEXT:    v_ldexp_f32_e32 v5, s9, v5
+; SI-GISEL-NEXT:    v_log_f32_e32 v5, v5
+; SI-GISEL-NEXT:    v_mov_b32_e32 v6, 0x41b17218
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v7, 0, v6, vcc
 ; SI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s10, v1
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; SI-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v8
-; SI-GISEL-NEXT:    v_mul_f32_e32 v8, 0x3f317217, v6
-; SI-GISEL-NEXT:    v_mul_f32_e32 v1, s10, v1
-; SI-GISEL-NEXT:    v_fma_f32 v9, v6, v3, -v8
-; SI-GISEL-NEXT:    v_log_f32_e32 v2, v1
-; SI-GISEL-NEXT:    v_fma_f32 v9, v6, v4, v9
-; SI-GISEL-NEXT:    v_add_f32_e32 v8, v8, v9
-; SI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[2:3], |v6|, v5
-; SI-GISEL-NEXT:    v_cndmask_b32_e64 v1, v6, v8, s[2:3]
-; SI-GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, v7, s[0:1]
-; SI-GISEL-NEXT:    v_sub_f32_e32 v1, v1, v6
-; SI-GISEL-NEXT:    v_mul_f32_e32 v6, 0x3f317217, v2
-; SI-GISEL-NEXT:    v_fma_f32 v3, v2, v3, -v6
-; SI-GISEL-NEXT:    v_fma_f32 v3, v2, v4, v3
-; SI-GISEL-NEXT:    v_add_f32_e32 v3, v6, v3
-; SI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], |v2|, v5
-; SI-GISEL-NEXT:    v_cndmask_b32_e64 v2, v2, v3, s[0:1]
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v3, 0, v7, vcc
-; SI-GISEL-NEXT:    s_mov_b32 s7, 0xf000
+; SI-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v7
+; SI-GISEL-NEXT:    v_mul_f32_e32 v7, 0x3f317217, v5
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; SI-GISEL-NEXT:    v_fma_f32 v8, v5, v2, -v7
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; SI-GISEL-NEXT:    v_fma_f32 v8, v5, v3, v8
+; SI-GISEL-NEXT:    v_ldexp_f32_e32 v1, s10, v1
+; SI-GISEL-NEXT:    v_add_f32_e32 v7, v7, v8
+; SI-GISEL-NEXT:    v_log_f32_e32 v8, v1
+; SI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[2:3], |v5|, v4
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v1, v5, v7, s[2:3]
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, v6, s[0:1]
+; SI-GISEL-NEXT:    v_sub_f32_e32 v1, v1, v5
+; SI-GISEL-NEXT:    v_mul_f32_e32 v5, 0x3f317217, v8
+; SI-GISEL-NEXT:    v_fma_f32 v2, v8, v2, -v5
+; SI-GISEL-NEXT:    v_fma_f32 v2, v8, v3, v2
+; SI-GISEL-NEXT:    v_add_f32_e32 v2, v5, v2
+; SI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], |v8|, v4
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v2, v8, v2, s[0:1]
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v3, 0, v6, vcc
 ; SI-GISEL-NEXT:    v_sub_f32_e32 v2, v2, v3
 ; SI-GISEL-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; SI-GISEL-NEXT:    buffer_store_dword v2, off, s[4:7], 0 offset:8
@@ -927,12 +940,13 @@ define amdgpu_kernel void @s_log_v3f32(ptr addrspace(1) %out, <3 x float> %in) {
 ; VI-GISEL:       ; %bb.0:
 ; VI-GISEL-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x34
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
-; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
+; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x7f800000
 ; VI-GISEL-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x24
 ; VI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s8, v1
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v0, 1.0, v2, vcc
-; VI-GISEL-NEXT:    v_mul_f32_e32 v0, s8, v0
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 5, v0
+; VI-GISEL-NEXT:    v_ldexp_f32 v0, s8, v0
 ; VI-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; VI-GISEL-NEXT:    v_and_b32_e32 v3, 0xfffff000, v0
 ; VI-GISEL-NEXT:    v_sub_f32_e32 v4, v0, v3
@@ -943,45 +957,46 @@ define amdgpu_kernel void @s_log_v3f32(ptr addrspace(1) %out, <3 x float> %in) {
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v3, 0x3f317000, v3
 ; VI-GISEL-NEXT:    v_add_f32_e32 v4, v4, v5
 ; VI-GISEL-NEXT:    v_add_f32_e32 v3, v3, v4
-; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x7f800000
-; VI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], |v0|, v4
+; VI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], |v0|, v2
 ; VI-GISEL-NEXT:    v_cndmask_b32_e64 v0, v0, v3, s[0:1]
 ; VI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], s9, v1
-; VI-GISEL-NEXT:    v_cndmask_b32_e64 v3, 1.0, v2, s[0:1]
-; VI-GISEL-NEXT:    v_mul_f32_e32 v3, s9, v3
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s[0:1]
+; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 5, v3
+; VI-GISEL-NEXT:    v_ldexp_f32 v3, s9, v3
 ; VI-GISEL-NEXT:    v_log_f32_e32 v3, v3
-; VI-GISEL-NEXT:    v_mov_b32_e32 v5, 0x41b17218
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v6, 0, v5, vcc
-; VI-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v6
-; VI-GISEL-NEXT:    v_and_b32_e32 v6, 0xfffff000, v3
+; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x41b17218
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v5, 0, v4, vcc
+; VI-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v5
+; VI-GISEL-NEXT:    v_and_b32_e32 v5, 0xfffff000, v3
+; VI-GISEL-NEXT:    v_sub_f32_e32 v6, v3, v5
 ; VI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s10, v1
-; VI-GISEL-NEXT:    v_sub_f32_e32 v7, v3, v6
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; VI-GISEL-NEXT:    v_mul_f32_e32 v8, 0x3805fdf4, v7
-; VI-GISEL-NEXT:    v_mul_f32_e32 v9, 0x3805fdf4, v6
-; VI-GISEL-NEXT:    v_mul_f32_e32 v1, s10, v1
-; VI-GISEL-NEXT:    v_add_f32_e32 v8, v9, v8
-; VI-GISEL-NEXT:    v_mul_f32_e32 v7, 0x3f317000, v7
-; VI-GISEL-NEXT:    v_log_f32_e32 v2, v1
-; VI-GISEL-NEXT:    v_add_f32_e32 v7, v7, v8
+; VI-GISEL-NEXT:    v_mul_f32_e32 v7, 0x3805fdf4, v6
+; VI-GISEL-NEXT:    v_mul_f32_e32 v8, 0x3805fdf4, v5
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; VI-GISEL-NEXT:    v_add_f32_e32 v7, v8, v7
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v6, 0x3f317000, v6
+; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
 ; VI-GISEL-NEXT:    v_add_f32_e32 v6, v6, v7
-; VI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[2:3], |v3|, v4
-; VI-GISEL-NEXT:    v_cndmask_b32_e64 v1, v3, v6, s[2:3]
-; VI-GISEL-NEXT:    v_cndmask_b32_e64 v3, 0, v5, s[0:1]
+; VI-GISEL-NEXT:    v_mul_f32_e32 v5, 0x3f317000, v5
+; VI-GISEL-NEXT:    v_ldexp_f32 v1, s10, v1
+; VI-GISEL-NEXT:    v_add_f32_e32 v5, v5, v6
+; VI-GISEL-NEXT:    v_log_f32_e32 v6, v1
+; VI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[2:3], |v3|, v2
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v1, v3, v5, s[2:3]
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v3, 0, v4, s[0:1]
 ; VI-GISEL-NEXT:    v_sub_f32_e32 v1, v1, v3
-; VI-GISEL-NEXT:    v_and_b32_e32 v3, 0xfffff000, v2
-; VI-GISEL-NEXT:    v_sub_f32_e32 v6, v2, v3
-; VI-GISEL-NEXT:    v_mul_f32_e32 v7, 0x3805fdf4, v6
+; VI-GISEL-NEXT:    v_and_b32_e32 v3, 0xfffff000, v6
+; VI-GISEL-NEXT:    v_sub_f32_e32 v5, v6, v3
+; VI-GISEL-NEXT:    v_mul_f32_e32 v7, 0x3805fdf4, v5
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v8, 0x3805fdf4, v3
 ; VI-GISEL-NEXT:    v_add_f32_e32 v7, v8, v7
-; VI-GISEL-NEXT:    v_mul_f32_e32 v6, 0x3f317000, v6
-; VI-GISEL-NEXT:    v_add_f32_e32 v6, v6, v7
+; VI-GISEL-NEXT:    v_mul_f32_e32 v5, 0x3f317000, v5
+; VI-GISEL-NEXT:    v_add_f32_e32 v5, v5, v7
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v3, 0x3f317000, v3
-; VI-GISEL-NEXT:    v_add_f32_e32 v3, v3, v6
-; VI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], |v2|, v4
-; VI-GISEL-NEXT:    v_cndmask_b32_e64 v2, v2, v3, s[0:1]
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v3, 0, v5, vcc
+; VI-GISEL-NEXT:    v_add_f32_e32 v3, v3, v5
+; VI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], |v6|, v2
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v2, v6, v3, s[0:1]
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v3, 0, v4, vcc
 ; VI-GISEL-NEXT:    v_sub_f32_e32 v2, v2, v3
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v3, s4
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v4, s5
@@ -1046,49 +1061,51 @@ define amdgpu_kernel void @s_log_v3f32(ptr addrspace(1) %out, <3 x float> %in) {
 ; GFX900-GISEL-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x34
 ; GFX900-GISEL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3f317217
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0x3f317217
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v4, 0x3377d1cf
 ; GFX900-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s8, v1
-; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v0, 1.0, v2, vcc
-; GFX900-GISEL-NEXT:    v_mul_f32_e32 v0, s8, v0
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX900-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 5, v0
+; GFX900-GISEL-NEXT:    v_ldexp_f32 v0, s8, v0
 ; GFX900-GISEL-NEXT:    v_log_f32_e32 v0, v0
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v4, 0x3377d1cf
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v5, 0x7f800000
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX900-GISEL-NEXT:    v_mul_f32_e32 v6, 0x3f317217, v0
-; GFX900-GISEL-NEXT:    v_fma_f32 v7, v0, v3, -v6
+; GFX900-GISEL-NEXT:    v_fma_f32 v7, v0, v2, -v6
 ; GFX900-GISEL-NEXT:    v_fma_f32 v7, v0, v4, v7
 ; GFX900-GISEL-NEXT:    v_add_f32_e32 v6, v6, v7
 ; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], |v0|, v5
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v0, v0, v6, s[0:1]
 ; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], s9, v1
-; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v6, 1.0, v2, s[0:1]
-; GFX900-GISEL-NEXT:    v_mul_f32_e32 v6, s9, v6
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s[0:1]
+; GFX900-GISEL-NEXT:    v_lshlrev_b32_e32 v6, 5, v6
+; GFX900-GISEL-NEXT:    v_ldexp_f32 v6, s9, v6
 ; GFX900-GISEL-NEXT:    v_log_f32_e32 v6, v6
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v7, 0x41b17218
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v8, 0, v7, vcc
 ; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s10, v1
-; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
 ; GFX900-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v8
 ; GFX900-GISEL-NEXT:    v_mul_f32_e32 v8, 0x3f317217, v6
-; GFX900-GISEL-NEXT:    v_mul_f32_e32 v1, s10, v1
-; GFX900-GISEL-NEXT:    v_fma_f32 v9, v6, v3, -v8
-; GFX900-GISEL-NEXT:    v_log_f32_e32 v2, v1
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX900-GISEL-NEXT:    v_fma_f32 v9, v6, v2, -v8
+; GFX900-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
 ; GFX900-GISEL-NEXT:    v_fma_f32 v9, v6, v4, v9
+; GFX900-GISEL-NEXT:    v_ldexp_f32 v1, s10, v1
 ; GFX900-GISEL-NEXT:    v_add_f32_e32 v8, v8, v9
+; GFX900-GISEL-NEXT:    v_log_f32_e32 v9, v1
 ; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e64 s[2:3], |v6|, v5
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v1, v6, v8, s[2:3]
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, v7, s[0:1]
 ; GFX900-GISEL-NEXT:    v_sub_f32_e32 v1, v1, v6
-; GFX900-GISEL-NEXT:    v_mul_f32_e32 v6, 0x3f317217, v2
-; GFX900-GISEL-NEXT:    v_fma_f32 v3, v2, v3, -v6
-; GFX900-GISEL-NEXT:    v_fma_f32 v3, v2, v4, v3
-; GFX900-GISEL-NEXT:    v_add_f32_e32 v3, v6, v3
-; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], |v2|, v5
-; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v2, v2, v3, s[0:1]
-; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v3, 0, v7, vcc
-; GFX900-GISEL-NEXT:    v_sub_f32_e32 v2, v2, v3
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v3, 0
+; GFX900-GISEL-NEXT:    v_mul_f32_e32 v6, 0x3f317217, v9
+; GFX900-GISEL-NEXT:    v_fma_f32 v2, v9, v2, -v6
+; GFX900-GISEL-NEXT:    v_fma_f32 v2, v9, v4, v2
+; GFX900-GISEL-NEXT:    v_add_f32_e32 v2, v6, v2
+; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], |v9|, v5
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v2, v9, v2, s[0:1]
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v7, vcc
+; GFX900-GISEL-NEXT:    v_sub_f32_e32 v2, v2, v4
 ; GFX900-GISEL-NEXT:    global_store_dwordx3 v3, v[0:2], s[6:7]
 ; GFX900-GISEL-NEXT:    s_endpgm
 ;
@@ -1156,49 +1173,55 @@ define amdgpu_kernel void @s_log_v3f32(ptr addrspace(1) %out, <3 x float> %in) {
 ; GFX1100-GISEL:       ; %bb.0:
 ; GFX1100-GISEL-NEXT:    s_load_b128 s[0:3], s[4:5], 0x34
 ; GFX1100-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 s7, 0x800000, s2
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 s3, 0x800000, s0
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 s6, 0x800000, s1
-; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 s7, 0x800000, s2
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s3
-; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s6
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s7
-; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 0x41b17218, s6
+; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s7
+; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s3
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s6
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v9, 0, 0x41b17218, s3
-; GFX1100-GISEL-NEXT:    v_dual_mul_f32 v0, s0, v0 :: v_dual_mul_f32 v1, s1, v1
-; GFX1100-GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 0x41b17218, s6
+; GFX1100-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 5, v2
+; GFX1100-GISEL-NEXT:    v_ldexp_f32 v2, s2, v2
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX1100-GISEL-NEXT:    v_log_f32_e32 v2, v2
+; GFX1100-GISEL-NEXT:    s_waitcnt_depctr 0xfff
+; GFX1100-GISEL-NEXT:    v_dual_mul_f32 v5, 0x3f317217, v2 :: v_dual_lshlrev_b32 v0, 5, v0
+; GFX1100-GISEL-NEXT:    v_ldexp_f32 v0, s0, v0
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1100-GISEL-NEXT:    v_fma_f32 v8, 0x3f317217, v2, -v5
 ; GFX1100-GISEL-NEXT:    v_log_f32_e32 v0, v0
-; GFX1100-GISEL-NEXT:    v_mul_f32_e32 v2, s2, v2
-; GFX1100-GISEL-NEXT:    v_log_f32_e32 v1, v1
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-GISEL-NEXT:    v_fmac_f32_e32 v8, 0x3377d1cf, v2
+; GFX1100-GISEL-NEXT:    v_add_f32_e32 v5, v5, v8
 ; GFX1100-GISEL-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX1100-GISEL-NEXT:    v_mul_f32_e32 v3, 0x3f317217, v0
-; GFX1100-GISEL-NEXT:    v_log_f32_e32 v2, v2
-; GFX1100-GISEL-NEXT:    v_mul_f32_e32 v4, 0x3f317217, v1
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v0|
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_3)
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_fma_f32 v6, 0x3f317217, v0, -v3
+; GFX1100-GISEL-NEXT:    v_dual_fmac_f32 v6, 0x3377d1cf, v0 :: v_dual_lshlrev_b32 v1, 5, v1
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1100-GISEL-NEXT:    v_ldexp_f32 v1, s1, v1
+; GFX1100-GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1100-GISEL-NEXT:    v_add_f32_e32 v3, v3, v6
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX1100-GISEL-NEXT:    v_log_f32_e32 v1, v1
+; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, 0x41b17218, s7
 ; GFX1100-GISEL-NEXT:    s_waitcnt_depctr 0xfff
-; GFX1100-GISEL-NEXT:    v_mul_f32_e32 v5, 0x3f317217, v2
+; GFX1100-GISEL-NEXT:    v_mul_f32_e32 v4, 0x3f317217, v1
 ; GFX1100-GISEL-NEXT:    v_fma_f32 v7, 0x3f317217, v1, -v4
-; GFX1100-GISEL-NEXT:    v_fmac_f32_e32 v6, 0x3377d1cf, v0
-; GFX1100-GISEL-NEXT:    v_fma_f32 v8, 0x3f317217, v2, -v5
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_fmac_f32_e32 v7, 0x3377d1cf, v1
-; GFX1100-GISEL-NEXT:    v_add_f32_e32 v3, v3, v6
-; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, 0x41b17218, s7
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX1100-GISEL-NEXT:    v_add_f32_e32 v4, v4, v7
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc_lo
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v1|
-; GFX1100-GISEL-NEXT:    v_dual_fmac_f32 v8, 0x3377d1cf, v2 :: v_dual_mov_b32 v3, 0
+; GFX1100-GISEL-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc_lo
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v2|
-; GFX1100-GISEL-NEXT:    v_sub_f32_e32 v1, v1, v10
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-GISEL-NEXT:    v_dual_add_f32 v5, v5, v8 :: v_dual_sub_f32 v0, v0, v9
+; GFX1100-GISEL-NEXT:    v_dual_sub_f32 v0, v0, v9 :: v_dual_sub_f32 v1, v1, v10
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc_lo
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_sub_f32_e32 v2, v2, v6
@@ -1433,62 +1456,65 @@ define amdgpu_kernel void @s_log_v4f32(ptr addrspace(1) %out, <4 x float> %in) {
 ; SI-GISEL-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0xd
 ; SI-GISEL-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x9
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x800000
-; SI-GISEL-NEXT:    v_mov_b32_e32 v3, 0x4f800000
-; SI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x3f317217
+; SI-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3f317217
+; SI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x3377d1cf
 ; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s8, v2
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, 1.0, v3, vcc
-; SI-GISEL-NEXT:    v_mul_f32_e32 v0, s8, v0
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 5, v0
+; SI-GISEL-NEXT:    v_ldexp_f32_e32 v0, s8, v0
 ; SI-GISEL-NEXT:    v_log_f32_e32 v0, v0
-; SI-GISEL-NEXT:    v_mov_b32_e32 v5, 0x3377d1cf
-; SI-GISEL-NEXT:    v_mov_b32_e32 v6, 0x7f800000
+; SI-GISEL-NEXT:    v_mov_b32_e32 v5, 0x7f800000
 ; SI-GISEL-NEXT:    s_mov_b32 s6, -1
+; SI-GISEL-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3f317217, v0
-; SI-GISEL-NEXT:    v_fma_f32 v7, v0, v4, -v1
-; SI-GISEL-NEXT:    v_fma_f32 v7, v0, v5, v7
-; SI-GISEL-NEXT:    v_add_f32_e32 v1, v1, v7
-; SI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], |v0|, v6
+; SI-GISEL-NEXT:    v_fma_f32 v6, v0, v3, -v1
+; SI-GISEL-NEXT:    v_fma_f32 v6, v0, v4, v6
+; SI-GISEL-NEXT:    v_add_f32_e32 v1, v1, v6
+; SI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], |v0|, v5
 ; SI-GISEL-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[0:1]
 ; SI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], s9, v2
-; SI-GISEL-NEXT:    v_cndmask_b32_e64 v1, 1.0, v3, s[0:1]
-; SI-GISEL-NEXT:    v_mul_f32_e32 v1, s9, v1
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[0:1]
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; SI-GISEL-NEXT:    v_ldexp_f32_e32 v1, s9, v1
 ; SI-GISEL-NEXT:    v_log_f32_e32 v1, v1
-; SI-GISEL-NEXT:    v_mov_b32_e32 v7, 0x41b17218
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v8, 0, v7, vcc
-; SI-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v8
-; SI-GISEL-NEXT:    v_mul_f32_e32 v8, 0x3f317217, v1
-; SI-GISEL-NEXT:    v_fma_f32 v9, v1, v4, -v8
-; SI-GISEL-NEXT:    v_fma_f32 v9, v1, v5, v9
+; SI-GISEL-NEXT:    v_mov_b32_e32 v6, 0x41b17218
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v7, 0, v6, vcc
+; SI-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v7
+; SI-GISEL-NEXT:    v_mul_f32_e32 v7, 0x3f317217, v1
+; SI-GISEL-NEXT:    v_fma_f32 v8, v1, v3, -v7
+; SI-GISEL-NEXT:    v_fma_f32 v8, v1, v4, v8
 ; SI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s10, v2
-; SI-GISEL-NEXT:    v_add_f32_e32 v8, v8, v9
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v9, 1.0, v3, vcc
-; SI-GISEL-NEXT:    v_mul_f32_e32 v9, s10, v9
-; SI-GISEL-NEXT:    v_log_f32_e32 v9, v9
-; SI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[2:3], |v1|, v6
-; SI-GISEL-NEXT:    v_cndmask_b32_e64 v1, v1, v8, s[2:3]
-; SI-GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, v7, s[0:1]
+; SI-GISEL-NEXT:    v_add_f32_e32 v7, v7, v8
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v8, 5, v8
+; SI-GISEL-NEXT:    v_ldexp_f32_e32 v8, s10, v8
+; SI-GISEL-NEXT:    v_log_f32_e32 v8, v8
+; SI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[2:3], |v1|, v5
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v1, v1, v7, s[2:3]
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, v6, s[0:1]
 ; SI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], s11, v2
-; SI-GISEL-NEXT:    v_cndmask_b32_e64 v2, 1.0, v3, s[0:1]
-; SI-GISEL-NEXT:    v_sub_f32_e32 v1, v1, v8
-; SI-GISEL-NEXT:    v_mul_f32_e32 v8, 0x3f317217, v9
-; SI-GISEL-NEXT:    v_mul_f32_e32 v2, s11, v2
-; SI-GISEL-NEXT:    v_fma_f32 v10, v9, v4, -v8
-; SI-GISEL-NEXT:    v_log_f32_e32 v3, v2
-; SI-GISEL-NEXT:    v_fma_f32 v10, v9, v5, v10
-; SI-GISEL-NEXT:    v_add_f32_e32 v8, v8, v10
-; SI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[2:3], |v9|, v6
-; SI-GISEL-NEXT:    v_cndmask_b32_e64 v2, v9, v8, s[2:3]
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v8, 0, v7, vcc
-; SI-GISEL-NEXT:    v_sub_f32_e32 v2, v2, v8
-; SI-GISEL-NEXT:    v_mul_f32_e32 v8, 0x3f317217, v3
-; SI-GISEL-NEXT:    v_fma_f32 v4, v3, v4, -v8
-; SI-GISEL-NEXT:    v_fma_f32 v4, v3, v5, v4
-; SI-GISEL-NEXT:    v_add_f32_e32 v4, v8, v4
-; SI-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v3|, v6
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
-; SI-GISEL-NEXT:    v_cndmask_b32_e64 v4, 0, v7, s[0:1]
+; SI-GISEL-NEXT:    v_sub_f32_e32 v1, v1, v7
+; SI-GISEL-NEXT:    v_mul_f32_e32 v7, 0x3f317217, v8
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[0:1]
+; SI-GISEL-NEXT:    v_fma_f32 v9, v8, v3, -v7
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 5, v2
+; SI-GISEL-NEXT:    v_fma_f32 v9, v8, v4, v9
+; SI-GISEL-NEXT:    v_ldexp_f32_e32 v2, s11, v2
+; SI-GISEL-NEXT:    v_add_f32_e32 v7, v7, v9
+; SI-GISEL-NEXT:    v_log_f32_e32 v9, v2
+; SI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[2:3], |v8|, v5
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v2, v8, v7, s[2:3]
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v7, 0, v6, vcc
+; SI-GISEL-NEXT:    v_sub_f32_e32 v2, v2, v7
+; SI-GISEL-NEXT:    v_mul_f32_e32 v7, 0x3f317217, v9
+; SI-GISEL-NEXT:    v_fma_f32 v3, v9, v3, -v7
+; SI-GISEL-NEXT:    v_fma_f32 v3, v9, v4, v3
+; SI-GISEL-NEXT:    v_add_f32_e32 v3, v7, v3
+; SI-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v9|, v5
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v3, v9, v3, vcc
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v4, 0, v6, s[0:1]
 ; SI-GISEL-NEXT:    v_sub_f32_e32 v3, v3, v4
-; SI-GISEL-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-GISEL-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
 ; SI-GISEL-NEXT:    s_endpgm
 ;
@@ -1581,12 +1607,13 @@ define amdgpu_kernel void @s_log_v4f32(ptr addrspace(1) %out, <4 x float> %in) {
 ; VI-GISEL:       ; %bb.0:
 ; VI-GISEL-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x34
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x800000
-; VI-GISEL-NEXT:    v_mov_b32_e32 v3, 0x4f800000
+; VI-GISEL-NEXT:    v_mov_b32_e32 v3, 0x7f800000
 ; VI-GISEL-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x24
 ; VI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s8, v2
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v0, 1.0, v3, vcc
-; VI-GISEL-NEXT:    v_mul_f32_e32 v0, s8, v0
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 5, v0
+; VI-GISEL-NEXT:    v_ldexp_f32 v0, s8, v0
 ; VI-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; VI-GISEL-NEXT:    v_and_b32_e32 v1, 0xfffff000, v0
 ; VI-GISEL-NEXT:    v_sub_f32_e32 v4, v0, v1
@@ -1597,62 +1624,64 @@ define amdgpu_kernel void @s_log_v4f32(ptr addrspace(1) %out, <4 x float> %in) {
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3f317000, v1
 ; VI-GISEL-NEXT:    v_add_f32_e32 v4, v4, v5
 ; VI-GISEL-NEXT:    v_add_f32_e32 v1, v1, v4
-; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x7f800000
-; VI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], |v0|, v4
+; VI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], |v0|, v3
 ; VI-GISEL-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[0:1]
 ; VI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], s9, v2
-; VI-GISEL-NEXT:    v_cndmask_b32_e64 v1, 1.0, v3, s[0:1]
-; VI-GISEL-NEXT:    v_mul_f32_e32 v1, s9, v1
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[0:1]
+; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; VI-GISEL-NEXT:    v_ldexp_f32 v1, s9, v1
 ; VI-GISEL-NEXT:    v_log_f32_e32 v1, v1
-; VI-GISEL-NEXT:    v_mov_b32_e32 v5, 0x41b17218
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v6, 0, v5, vcc
-; VI-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v6
-; VI-GISEL-NEXT:    v_and_b32_e32 v6, 0xfffff000, v1
-; VI-GISEL-NEXT:    v_sub_f32_e32 v7, v1, v6
-; VI-GISEL-NEXT:    v_mul_f32_e32 v8, 0x3805fdf4, v7
-; VI-GISEL-NEXT:    v_mul_f32_e32 v9, 0x3805fdf4, v6
-; VI-GISEL-NEXT:    v_add_f32_e32 v8, v9, v8
-; VI-GISEL-NEXT:    v_mul_f32_e32 v7, 0x3f317000, v7
-; VI-GISEL-NEXT:    v_add_f32_e32 v7, v7, v8
+; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x41b17218
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v5, 0, v4, vcc
+; VI-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v5
+; VI-GISEL-NEXT:    v_and_b32_e32 v5, 0xfffff000, v1
+; VI-GISEL-NEXT:    v_sub_f32_e32 v6, v1, v5
+; VI-GISEL-NEXT:    v_mul_f32_e32 v7, 0x3805fdf4, v6
+; VI-GISEL-NEXT:    v_mul_f32_e32 v8, 0x3805fdf4, v5
+; VI-GISEL-NEXT:    v_add_f32_e32 v7, v8, v7
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v6, 0x3f317000, v6
-; VI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s10, v2
 ; VI-GISEL-NEXT:    v_add_f32_e32 v6, v6, v7
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v7, 1.0, v3, vcc
-; VI-GISEL-NEXT:    v_mul_f32_e32 v7, s10, v7
-; VI-GISEL-NEXT:    v_log_f32_e32 v7, v7
-; VI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[2:3], |v1|, v4
-; VI-GISEL-NEXT:    v_cndmask_b32_e64 v1, v1, v6, s[2:3]
-; VI-GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, v5, s[0:1]
-; VI-GISEL-NEXT:    v_sub_f32_e32 v1, v1, v6
-; VI-GISEL-NEXT:    v_and_b32_e32 v6, 0xfffff000, v7
+; VI-GISEL-NEXT:    v_mul_f32_e32 v5, 0x3f317000, v5
+; VI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s10, v2
+; VI-GISEL-NEXT:    v_add_f32_e32 v5, v5, v6
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
+; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v6, 5, v6
+; VI-GISEL-NEXT:    v_ldexp_f32 v6, s10, v6
+; VI-GISEL-NEXT:    v_log_f32_e32 v6, v6
+; VI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[2:3], |v1|, v3
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v1, v1, v5, s[2:3]
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, v4, s[0:1]
+; VI-GISEL-NEXT:    v_sub_f32_e32 v1, v1, v5
+; VI-GISEL-NEXT:    v_and_b32_e32 v5, 0xfffff000, v6
+; VI-GISEL-NEXT:    v_sub_f32_e32 v7, v6, v5
 ; VI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], s11, v2
-; VI-GISEL-NEXT:    v_sub_f32_e32 v8, v7, v6
-; VI-GISEL-NEXT:    v_cndmask_b32_e64 v2, 1.0, v3, s[0:1]
-; VI-GISEL-NEXT:    v_mul_f32_e32 v9, 0x3805fdf4, v8
-; VI-GISEL-NEXT:    v_mul_f32_e32 v10, 0x3805fdf4, v6
-; VI-GISEL-NEXT:    v_mul_f32_e32 v2, s11, v2
-; VI-GISEL-NEXT:    v_add_f32_e32 v9, v10, v9
-; VI-GISEL-NEXT:    v_mul_f32_e32 v8, 0x3f317000, v8
-; VI-GISEL-NEXT:    v_log_f32_e32 v3, v2
-; VI-GISEL-NEXT:    v_add_f32_e32 v8, v8, v9
-; VI-GISEL-NEXT:    v_mul_f32_e32 v6, 0x3f317000, v6
-; VI-GISEL-NEXT:    v_add_f32_e32 v6, v6, v8
-; VI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[2:3], |v7|, v4
-; VI-GISEL-NEXT:    v_cndmask_b32_e64 v2, v7, v6, s[2:3]
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v6, 0, v5, vcc
-; VI-GISEL-NEXT:    v_sub_f32_e32 v2, v2, v6
-; VI-GISEL-NEXT:    v_and_b32_e32 v6, 0xfffff000, v3
-; VI-GISEL-NEXT:    v_sub_f32_e32 v7, v3, v6
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v8, 0x3805fdf4, v7
-; VI-GISEL-NEXT:    v_mul_f32_e32 v9, 0x3805fdf4, v6
+; VI-GISEL-NEXT:    v_mul_f32_e32 v9, 0x3805fdf4, v5
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[0:1]
 ; VI-GISEL-NEXT:    v_add_f32_e32 v8, v9, v8
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v7, 0x3f317000, v7
+; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 5, v2
 ; VI-GISEL-NEXT:    v_add_f32_e32 v7, v7, v8
+; VI-GISEL-NEXT:    v_mul_f32_e32 v5, 0x3f317000, v5
+; VI-GISEL-NEXT:    v_ldexp_f32 v2, s11, v2
+; VI-GISEL-NEXT:    v_add_f32_e32 v5, v5, v7
+; VI-GISEL-NEXT:    v_log_f32_e32 v7, v2
+; VI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[2:3], |v6|, v3
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v2, v6, v5, s[2:3]
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v5, 0, v4, vcc
+; VI-GISEL-NEXT:    v_sub_f32_e32 v2, v2, v5
+; VI-GISEL-NEXT:    v_and_b32_e32 v5, 0xfffff000, v7
+; VI-GISEL-NEXT:    v_sub_f32_e32 v6, v7, v5
+; VI-GISEL-NEXT:    v_mul_f32_e32 v8, 0x3805fdf4, v6
+; VI-GISEL-NEXT:    v_mul_f32_e32 v9, 0x3805fdf4, v5
+; VI-GISEL-NEXT:    v_add_f32_e32 v8, v9, v8
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v6, 0x3f317000, v6
-; VI-GISEL-NEXT:    v_add_f32_e32 v6, v6, v7
-; VI-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v3|, v4
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v3, v3, v6, vcc
-; VI-GISEL-NEXT:    v_cndmask_b32_e64 v4, 0, v5, s[0:1]
+; VI-GISEL-NEXT:    v_add_f32_e32 v6, v6, v8
+; VI-GISEL-NEXT:    v_mul_f32_e32 v5, 0x3f317000, v5
+; VI-GISEL-NEXT:    v_add_f32_e32 v5, v5, v6
+; VI-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v7|, v3
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v3, v7, v5, vcc
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v4, 0, v4, s[0:1]
 ; VI-GISEL-NEXT:    v_sub_f32_e32 v3, v3, v4
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v4, s4
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v5, s5
@@ -1730,61 +1759,64 @@ define amdgpu_kernel void @s_log_v4f32(ptr addrspace(1) %out, <4 x float> %in) {
 ; GFX900-GISEL-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x34
 ; GFX900-GISEL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0x800000
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v3, 0x4f800000
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v4, 0x3f317217
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3f317217
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v5, 0x3377d1cf
 ; GFX900-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s8, v2
-; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v0, 1.0, v3, vcc
-; GFX900-GISEL-NEXT:    v_mul_f32_e32 v0, s8, v0
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX900-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 5, v0
+; GFX900-GISEL-NEXT:    v_ldexp_f32 v0, s8, v0
 ; GFX900-GISEL-NEXT:    v_log_f32_e32 v0, v0
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v5, 0x3377d1cf
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v6, 0x7f800000
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX900-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3f317217, v0
-; GFX900-GISEL-NEXT:    v_fma_f32 v7, v0, v4, -v1
+; GFX900-GISEL-NEXT:    v_fma_f32 v7, v0, v3, -v1
 ; GFX900-GISEL-NEXT:    v_fma_f32 v7, v0, v5, v7
 ; GFX900-GISEL-NEXT:    v_add_f32_e32 v1, v1, v7
 ; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], |v0|, v6
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[0:1]
 ; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], s9, v2
-; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v1, 1.0, v3, s[0:1]
-; GFX900-GISEL-NEXT:    v_mul_f32_e32 v1, s9, v1
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[0:1]
+; GFX900-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; GFX900-GISEL-NEXT:    v_ldexp_f32 v1, s9, v1
 ; GFX900-GISEL-NEXT:    v_log_f32_e32 v1, v1
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v7, 0x41b17218
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v8, 0, v7, vcc
 ; GFX900-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v8
 ; GFX900-GISEL-NEXT:    v_mul_f32_e32 v8, 0x3f317217, v1
-; GFX900-GISEL-NEXT:    v_fma_f32 v9, v1, v4, -v8
+; GFX900-GISEL-NEXT:    v_fma_f32 v9, v1, v3, -v8
 ; GFX900-GISEL-NEXT:    v_fma_f32 v9, v1, v5, v9
 ; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s10, v2
 ; GFX900-GISEL-NEXT:    v_add_f32_e32 v8, v8, v9
-; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v9, 1.0, v3, vcc
-; GFX900-GISEL-NEXT:    v_mul_f32_e32 v9, s10, v9
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
+; GFX900-GISEL-NEXT:    v_lshlrev_b32_e32 v9, 5, v9
+; GFX900-GISEL-NEXT:    v_ldexp_f32 v9, s10, v9
 ; GFX900-GISEL-NEXT:    v_log_f32_e32 v9, v9
 ; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e64 s[2:3], |v1|, v6
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v1, v1, v8, s[2:3]
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, v7, s[0:1]
 ; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], s11, v2
-; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v2, 1.0, v3, s[0:1]
 ; GFX900-GISEL-NEXT:    v_sub_f32_e32 v1, v1, v8
 ; GFX900-GISEL-NEXT:    v_mul_f32_e32 v8, 0x3f317217, v9
-; GFX900-GISEL-NEXT:    v_mul_f32_e32 v2, s11, v2
-; GFX900-GISEL-NEXT:    v_fma_f32 v10, v9, v4, -v8
-; GFX900-GISEL-NEXT:    v_log_f32_e32 v3, v2
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[0:1]
+; GFX900-GISEL-NEXT:    v_fma_f32 v10, v9, v3, -v8
+; GFX900-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 5, v2
 ; GFX900-GISEL-NEXT:    v_fma_f32 v10, v9, v5, v10
+; GFX900-GISEL-NEXT:    v_ldexp_f32 v2, s11, v2
 ; GFX900-GISEL-NEXT:    v_add_f32_e32 v8, v8, v10
+; GFX900-GISEL-NEXT:    v_log_f32_e32 v10, v2
 ; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e64 s[2:3], |v9|, v6
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v2, v9, v8, s[2:3]
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v8, 0, v7, vcc
 ; GFX900-GISEL-NEXT:    v_sub_f32_e32 v2, v2, v8
-; GFX900-GISEL-NEXT:    v_mul_f32_e32 v8, 0x3f317217, v3
-; GFX900-GISEL-NEXT:    v_fma_f32 v4, v3, v4, -v8
-; GFX900-GISEL-NEXT:    v_fma_f32 v4, v3, v5, v4
-; GFX900-GISEL-NEXT:    v_add_f32_e32 v4, v8, v4
-; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v3|, v6
-; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
-; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v4, 0, v7, s[0:1]
-; GFX900-GISEL-NEXT:    v_sub_f32_e32 v3, v3, v4
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v4, 0
+; GFX900-GISEL-NEXT:    v_mul_f32_e32 v8, 0x3f317217, v10
+; GFX900-GISEL-NEXT:    v_fma_f32 v3, v10, v3, -v8
+; GFX900-GISEL-NEXT:    v_fma_f32 v3, v10, v5, v3
+; GFX900-GISEL-NEXT:    v_add_f32_e32 v3, v8, v3
+; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v10|, v6
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v3, v10, v3, vcc
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, v7, s[0:1]
+; GFX900-GISEL-NEXT:    v_sub_f32_e32 v3, v3, v5
 ; GFX900-GISEL-NEXT:    global_store_dwordx4 v4, v[0:3], s[6:7]
 ; GFX900-GISEL-NEXT:    s_endpgm
 ;
@@ -1860,60 +1892,67 @@ define amdgpu_kernel void @s_log_v4f32(ptr addrspace(1) %out, <4 x float> %in) {
 ; GFX1100-GISEL:       ; %bb.0:
 ; GFX1100-GISEL-NEXT:    s_load_b128 s[0:3], s[4:5], 0x34
 ; GFX1100-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 s6, 0x800000, s0
-; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 s7, 0x800000, s1
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 s8, 0x800000, s2
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 s9, 0x800000, s3
+; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 s6, 0x800000, s0
+; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 s7, 0x800000, s1
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s6
-; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s7
+; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s8
+; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s9
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s8
-; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v3, 1.0, 0x4f800000, s9
+; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s6
+; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s7
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v4, 0, 0x41b17218, s6
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX1100-GISEL-NEXT:    v_dual_mul_f32 v0, s0, v0 :: v_dual_mul_f32 v1, s1, v1
-; GFX1100-GISEL-NEXT:    v_dual_mul_f32 v2, s2, v2 :: v_dual_mul_f32 v3, s3, v3
-; GFX1100-GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1100-GISEL-NEXT:    v_log_f32_e32 v0, v0
-; GFX1100-GISEL-NEXT:    v_log_f32_e32 v1, v1
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(TRANS32_DEP_3)
-; GFX1100-GISEL-NEXT:    v_log_f32_e32 v2, v2
-; GFX1100-GISEL-NEXT:    v_log_f32_e32 v3, v3
+; GFX1100-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 5, v2
+; GFX1100-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 5, v3
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v9, 0, 0x41b17218, s7
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v14, 0, 0x41b17218, s8
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v15, 0, 0x41b17218, s9
-; GFX1100-GISEL-NEXT:    v_dual_mul_f32 v5, 0x3f317217, v0 :: v_dual_mul_f32 v6, 0x3f317217, v1
+; GFX1100-GISEL-NEXT:    v_ldexp_f32 v2, s2, v2
+; GFX1100-GISEL-NEXT:    v_ldexp_f32 v3, s3, v3
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1100-GISEL-NEXT:    v_log_f32_e32 v2, v2
+; GFX1100-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 5, v0
+; GFX1100-GISEL-NEXT:    v_log_f32_e32 v3, v3
+; GFX1100-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; GFX1100-GISEL-NEXT:    s_waitcnt_depctr 0xfff
+; GFX1100-GISEL-NEXT:    v_mul_f32_e32 v7, 0x3f317217, v2
+; GFX1100-GISEL-NEXT:    v_ldexp_f32 v0, s0, v0
+; GFX1100-GISEL-NEXT:    v_mul_f32_e32 v8, 0x3f317217, v3
+; GFX1100-GISEL-NEXT:    v_ldexp_f32 v1, s1, v1
+; GFX1100-GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1100-GISEL-NEXT:    v_fma_f32 v12, 0x3f317217, v2, -v7
+; GFX1100-GISEL-NEXT:    v_log_f32_e32 v0, v0
+; GFX1100-GISEL-NEXT:    v_fma_f32 v13, 0x3f317217, v3, -v8
+; GFX1100-GISEL-NEXT:    v_log_f32_e32 v1, v1
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-GISEL-NEXT:    v_dual_fmac_f32 v12, 0x3377d1cf, v2 :: v_dual_fmac_f32 v13, 0x3377d1cf, v3
+; GFX1100-GISEL-NEXT:    v_add_f32_e32 v7, v7, v12
 ; GFX1100-GISEL-NEXT:    s_waitcnt_depctr 0xfff
-; GFX1100-GISEL-NEXT:    v_dual_mul_f32 v7, 0x3f317217, v2 :: v_dual_mul_f32 v8, 0x3f317217, v3
+; GFX1100-GISEL-NEXT:    v_dual_mul_f32 v5, 0x3f317217, v0 :: v_dual_add_f32 v8, v8, v13
+; GFX1100-GISEL-NEXT:    v_mul_f32_e32 v6, 0x3f317217, v1
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v0|
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX1100-GISEL-NEXT:    v_fma_f32 v10, 0x3f317217, v0, -v5
 ; GFX1100-GISEL-NEXT:    v_fma_f32 v11, 0x3f317217, v1, -v6
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX1100-GISEL-NEXT:    v_fma_f32 v12, 0x3f317217, v2, -v7
-; GFX1100-GISEL-NEXT:    v_fma_f32 v13, 0x3f317217, v3, -v8
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_dual_fmac_f32 v10, 0x3377d1cf, v0 :: v_dual_fmac_f32 v11, 0x3377d1cf, v1
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1100-GISEL-NEXT:    v_dual_fmac_f32 v12, 0x3377d1cf, v2 :: v_dual_fmac_f32 v13, 0x3377d1cf, v3
 ; GFX1100-GISEL-NEXT:    v_dual_add_f32 v5, v5, v10 :: v_dual_add_f32 v6, v6, v11
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1100-GISEL-NEXT:    v_dual_add_f32 v7, v7, v12 :: v_dual_add_f32 v8, v8, v13
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v5, vcc_lo
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v1|
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e32 v1, v1, v6, vcc_lo
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v2|
-; GFX1100-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v4
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX1100-GISEL-NEXT:    v_dual_mov_b32 v4, 0 :: v_dual_sub_f32 v1, v1, v9
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX1100-GISEL-NEXT:    v_dual_mov_b32 v5, 0 :: v_dual_sub_f32 v0, v0, v4
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e32 v2, v2, v7, vcc_lo
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v3|
-; GFX1100-GISEL-NEXT:    v_dual_cndmask_b32 v3, v3, v8 :: v_dual_sub_f32 v2, v2, v14
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1100-GISEL-NEXT:    v_cndmask_b32_e32 v3, v3, v8, vcc_lo
+; GFX1100-GISEL-NEXT:    v_dual_sub_f32 v1, v1, v9 :: v_dual_sub_f32 v2, v2, v14
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX1100-GISEL-NEXT:    v_sub_f32_e32 v3, v3, v15
 ; GFX1100-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1100-GISEL-NEXT:    global_store_b128 v4, v[0:3], s[0:1]
+; GFX1100-GISEL-NEXT:    global_store_b128 v5, v[0:3], s[0:1]
 ; GFX1100-GISEL-NEXT:    s_endpgm
 ;
 ; R600-LABEL: s_log_v4f32:
@@ -2126,10 +2165,10 @@ define float @v_log_f32(float %in) {
 ; SI-GISEL:       ; %bb.0:
 ; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
-; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; SI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; SI-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; SI-GISEL-NEXT:    v_ldexp_f32_e32 v0, v0, v1
 ; SI-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3f317217
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3377d1cf
@@ -2175,16 +2214,16 @@ define float @v_log_f32(float %in) {
 ; VI-GISEL:       ; %bb.0:
 ; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
-; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; VI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; VI-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; VI-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
 ; VI-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; VI-GISEL-NEXT:    v_and_b32_e32 v1, 0xfffff000, v0
 ; VI-GISEL-NEXT:    v_sub_f32_e32 v2, v0, v1
-; VI-GISEL-NEXT:    v_mul_f32_e32 v3, 0x3805fdf4, v2
-; VI-GISEL-NEXT:    v_mul_f32_e32 v4, 0x3805fdf4, v1
-; VI-GISEL-NEXT:    v_add_f32_e32 v3, v4, v3
+; VI-GISEL-NEXT:    v_mul_f32_e32 v3, 0x3805fdf4, v1
+; VI-GISEL-NEXT:    v_mul_f32_e32 v4, 0x3805fdf4, v2
+; VI-GISEL-NEXT:    v_add_f32_e32 v3, v3, v4
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v2, 0x3f317000, v2
 ; VI-GISEL-NEXT:    v_add_f32_e32 v2, v2, v3
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3f317000, v1
@@ -2224,10 +2263,10 @@ define float @v_log_f32(float %in) {
 ; GFX900-GISEL:       ; %bb.0:
 ; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
-; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; GFX900-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX900-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; GFX900-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
 ; GFX900-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3f317217
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3377d1cf
@@ -2270,21 +2309,22 @@ define float @v_log_f32(float %in) {
 ; GFX1100-GISEL:       ; %bb.0:
 ; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
-; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo
+; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX1100-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; GFX1100-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
 ; GFX1100-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; GFX1100-GISEL-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX1100-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3f317217, v0
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 s0, 0x7f800000, |v0|
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_fma_f32 v2, 0x3f317217, v0, -v1
-; GFX1100-GISEL-NEXT:    v_fmac_f32_e32 v2, 0x3377d1cf, v0
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-GISEL-NEXT:    v_fmac_f32_e32 v2, 0x3377d1cf, v0
 ; GFX1100-GISEL-NEXT:    v_add_f32_e32 v1, v1, v2
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s0
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 0x41b17218, vcc_lo
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; GFX1100-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -2329,10 +2369,10 @@ define float @v_log_fabs_f32(float %in) {
 ; SI-GISEL:       ; %bb.0:
 ; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
-; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; SI-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, v1
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; SI-GISEL-NEXT:    v_mul_f32_e64 v0, |v0|, v1
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; SI-GISEL-NEXT:    v_ldexp_f32_e64 v0, |v0|, v1
 ; SI-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3f317217
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3377d1cf
@@ -2378,16 +2418,16 @@ define float @v_log_fabs_f32(float %in) {
 ; VI-GISEL:       ; %bb.0:
 ; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
-; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; VI-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, v1
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; VI-GISEL-NEXT:    v_mul_f32_e64 v0, |v0|, v1
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; VI-GISEL-NEXT:    v_ldexp_f32 v0, |v0|, v1
 ; VI-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; VI-GISEL-NEXT:    v_and_b32_e32 v1, 0xfffff000, v0
 ; VI-GISEL-NEXT:    v_sub_f32_e32 v2, v0, v1
-; VI-GISEL-NEXT:    v_mul_f32_e32 v3, 0x3805fdf4, v2
-; VI-GISEL-NEXT:    v_mul_f32_e32 v4, 0x3805fdf4, v1
-; VI-GISEL-NEXT:    v_add_f32_e32 v3, v4, v3
+; VI-GISEL-NEXT:    v_mul_f32_e32 v3, 0x3805fdf4, v1
+; VI-GISEL-NEXT:    v_mul_f32_e32 v4, 0x3805fdf4, v2
+; VI-GISEL-NEXT:    v_add_f32_e32 v3, v3, v4
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v2, 0x3f317000, v2
 ; VI-GISEL-NEXT:    v_add_f32_e32 v2, v2, v3
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3f317000, v1
@@ -2427,10 +2467,10 @@ define float @v_log_fabs_f32(float %in) {
 ; GFX900-GISEL:       ; %bb.0:
 ; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, v1
-; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; GFX900-GISEL-NEXT:    v_mul_f32_e64 v0, |v0|, v1
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX900-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; GFX900-GISEL-NEXT:    v_ldexp_f32 v0, |v0|, v1
 ; GFX900-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3f317217
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3377d1cf
@@ -2475,20 +2515,22 @@ define float @v_log_fabs_f32(float %in) {
 ; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 s0, 0x800000, |v0|
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s0
-; GFX1100-GISEL-NEXT:    v_mul_f32_e64 v0, |v0|, v1
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s0
+; GFX1100-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-GISEL-NEXT:    v_ldexp_f32 v0, |v0|, v1
 ; GFX1100-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; GFX1100-GISEL-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX1100-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3f317217, v0
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v0|
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_fma_f32 v2, 0x3f317217, v0, -v1
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_fmac_f32_e32 v2, 0x3377d1cf, v0
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_add_f32_e32 v1, v1, v2
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 0x41b17218, s0
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; GFX1100-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -2534,10 +2576,10 @@ define float @v_log_fneg_fabs_f32(float %in) {
 ; SI-GISEL:       ; %bb.0:
 ; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
-; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; SI-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, -|v0|, v1
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; SI-GISEL-NEXT:    v_mul_f32_e64 v0, -|v0|, v1
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; SI-GISEL-NEXT:    v_ldexp_f32_e64 v0, -|v0|, v1
 ; SI-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3f317217
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3377d1cf
@@ -2583,16 +2625,16 @@ define float @v_log_fneg_fabs_f32(float %in) {
 ; VI-GISEL:       ; %bb.0:
 ; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
-; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; VI-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, -|v0|, v1
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; VI-GISEL-NEXT:    v_mul_f32_e64 v0, -|v0|, v1
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; VI-GISEL-NEXT:    v_ldexp_f32 v0, -|v0|, v1
 ; VI-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; VI-GISEL-NEXT:    v_and_b32_e32 v1, 0xfffff000, v0
 ; VI-GISEL-NEXT:    v_sub_f32_e32 v2, v0, v1
-; VI-GISEL-NEXT:    v_mul_f32_e32 v3, 0x3805fdf4, v2
-; VI-GISEL-NEXT:    v_mul_f32_e32 v4, 0x3805fdf4, v1
-; VI-GISEL-NEXT:    v_add_f32_e32 v3, v4, v3
+; VI-GISEL-NEXT:    v_mul_f32_e32 v3, 0x3805fdf4, v1
+; VI-GISEL-NEXT:    v_mul_f32_e32 v4, 0x3805fdf4, v2
+; VI-GISEL-NEXT:    v_add_f32_e32 v3, v3, v4
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v2, 0x3f317000, v2
 ; VI-GISEL-NEXT:    v_add_f32_e32 v2, v2, v3
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3f317000, v1
@@ -2632,10 +2674,10 @@ define float @v_log_fneg_fabs_f32(float %in) {
 ; GFX900-GISEL:       ; %bb.0:
 ; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, -|v0|, v1
-; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; GFX900-GISEL-NEXT:    v_mul_f32_e64 v0, -|v0|, v1
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX900-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; GFX900-GISEL-NEXT:    v_ldexp_f32 v0, -|v0|, v1
 ; GFX900-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3f317217
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3377d1cf
@@ -2680,20 +2722,22 @@ define float @v_log_fneg_fabs_f32(float %in) {
 ; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 s0, 0x800000, -|v0|
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s0
-; GFX1100-GISEL-NEXT:    v_mul_f32_e64 v0, -|v0|, v1
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s0
+; GFX1100-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-GISEL-NEXT:    v_ldexp_f32 v0, -|v0|, v1
 ; GFX1100-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; GFX1100-GISEL-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX1100-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3f317217, v0
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v0|
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_fma_f32 v2, 0x3f317217, v0, -v1
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_fmac_f32_e32 v2, 0x3377d1cf, v0
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_add_f32_e32 v1, v1, v2
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 0x41b17218, s0
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; GFX1100-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -2740,10 +2784,10 @@ define float @v_log_fneg_f32(float %in) {
 ; SI-GISEL:       ; %bb.0:
 ; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
-; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; SI-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, -v0, v1
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; SI-GISEL-NEXT:    v_mul_f32_e64 v0, -v0, v1
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; SI-GISEL-NEXT:    v_ldexp_f32_e64 v0, -v0, v1
 ; SI-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3f317217
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3377d1cf
@@ -2789,16 +2833,16 @@ define float @v_log_fneg_f32(float %in) {
 ; VI-GISEL:       ; %bb.0:
 ; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
-; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; VI-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, -v0, v1
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; VI-GISEL-NEXT:    v_mul_f32_e64 v0, -v0, v1
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; VI-GISEL-NEXT:    v_ldexp_f32 v0, -v0, v1
 ; VI-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; VI-GISEL-NEXT:    v_and_b32_e32 v1, 0xfffff000, v0
 ; VI-GISEL-NEXT:    v_sub_f32_e32 v2, v0, v1
-; VI-GISEL-NEXT:    v_mul_f32_e32 v3, 0x3805fdf4, v2
-; VI-GISEL-NEXT:    v_mul_f32_e32 v4, 0x3805fdf4, v1
-; VI-GISEL-NEXT:    v_add_f32_e32 v3, v4, v3
+; VI-GISEL-NEXT:    v_mul_f32_e32 v3, 0x3805fdf4, v1
+; VI-GISEL-NEXT:    v_mul_f32_e32 v4, 0x3805fdf4, v2
+; VI-GISEL-NEXT:    v_add_f32_e32 v3, v3, v4
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v2, 0x3f317000, v2
 ; VI-GISEL-NEXT:    v_add_f32_e32 v2, v2, v3
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3f317000, v1
@@ -2838,10 +2882,10 @@ define float @v_log_fneg_f32(float %in) {
 ; GFX900-GISEL:       ; %bb.0:
 ; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, -v0, v1
-; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; GFX900-GISEL-NEXT:    v_mul_f32_e64 v0, -v0, v1
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX900-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; GFX900-GISEL-NEXT:    v_ldexp_f32 v0, -v0, v1
 ; GFX900-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3f317217
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3377d1cf
@@ -2885,20 +2929,22 @@ define float @v_log_fneg_f32(float %in) {
 ; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 s0, 0x800000, -v0
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s0
-; GFX1100-GISEL-NEXT:    v_mul_f32_e64 v0, -v0, v1
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s0
+; GFX1100-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-GISEL-NEXT:    v_ldexp_f32 v0, -v0, v1
 ; GFX1100-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; GFX1100-GISEL-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX1100-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3f317217, v0
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v0|
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_fma_f32 v2, 0x3f317217, v0, -v1
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_fmac_f32_e32 v2, 0x3377d1cf, v0
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_add_f32_e32 v1, v1, v2
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 0x41b17218, s0
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; GFX1100-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -3304,10 +3350,10 @@ define float @v_log_f32_ninf(float %in) {
 ; SI-GISEL:       ; %bb.0:
 ; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
-; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; SI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; SI-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; SI-GISEL-NEXT:    v_ldexp_f32_e32 v0, v0, v1
 ; SI-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3f317217
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3377d1cf
@@ -3353,16 +3399,16 @@ define float @v_log_f32_ninf(float %in) {
 ; VI-GISEL:       ; %bb.0:
 ; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
-; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; VI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; VI-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; VI-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
 ; VI-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; VI-GISEL-NEXT:    v_and_b32_e32 v1, 0xfffff000, v0
 ; VI-GISEL-NEXT:    v_sub_f32_e32 v2, v0, v1
-; VI-GISEL-NEXT:    v_mul_f32_e32 v3, 0x3805fdf4, v2
-; VI-GISEL-NEXT:    v_mul_f32_e32 v4, 0x3805fdf4, v1
-; VI-GISEL-NEXT:    v_add_f32_e32 v3, v4, v3
+; VI-GISEL-NEXT:    v_mul_f32_e32 v3, 0x3805fdf4, v1
+; VI-GISEL-NEXT:    v_mul_f32_e32 v4, 0x3805fdf4, v2
+; VI-GISEL-NEXT:    v_add_f32_e32 v3, v3, v4
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v2, 0x3f317000, v2
 ; VI-GISEL-NEXT:    v_add_f32_e32 v2, v2, v3
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3f317000, v1
@@ -3402,10 +3448,10 @@ define float @v_log_f32_ninf(float %in) {
 ; GFX900-GISEL:       ; %bb.0:
 ; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
-; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; GFX900-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX900-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; GFX900-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
 ; GFX900-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3f317217
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3377d1cf
@@ -3448,21 +3494,22 @@ define float @v_log_f32_ninf(float %in) {
 ; GFX1100-GISEL:       ; %bb.0:
 ; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
-; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo
+; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX1100-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; GFX1100-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
 ; GFX1100-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; GFX1100-GISEL-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX1100-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3f317217, v0
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 s0, 0x7f800000, |v0|
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_fma_f32 v2, 0x3f317217, v0, -v1
-; GFX1100-GISEL-NEXT:    v_fmac_f32_e32 v2, 0x3377d1cf, v0
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-GISEL-NEXT:    v_fmac_f32_e32 v2, 0x3377d1cf, v0
 ; GFX1100-GISEL-NEXT:    v_add_f32_e32 v1, v1, v2
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s0
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 0x41b17218, vcc_lo
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; GFX1100-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -4038,10 +4085,10 @@ define float @v_log_f32_nnan(float %in) {
 ; SI-GISEL:       ; %bb.0:
 ; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
-; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; SI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; SI-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; SI-GISEL-NEXT:    v_ldexp_f32_e32 v0, v0, v1
 ; SI-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3f317217
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3377d1cf
@@ -4087,16 +4134,16 @@ define float @v_log_f32_nnan(float %in) {
 ; VI-GISEL:       ; %bb.0:
 ; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
-; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; VI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; VI-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; VI-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
 ; VI-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; VI-GISEL-NEXT:    v_and_b32_e32 v1, 0xfffff000, v0
 ; VI-GISEL-NEXT:    v_sub_f32_e32 v2, v0, v1
-; VI-GISEL-NEXT:    v_mul_f32_e32 v3, 0x3805fdf4, v2
-; VI-GISEL-NEXT:    v_mul_f32_e32 v4, 0x3805fdf4, v1
-; VI-GISEL-NEXT:    v_add_f32_e32 v3, v4, v3
+; VI-GISEL-NEXT:    v_mul_f32_e32 v3, 0x3805fdf4, v1
+; VI-GISEL-NEXT:    v_mul_f32_e32 v4, 0x3805fdf4, v2
+; VI-GISEL-NEXT:    v_add_f32_e32 v3, v3, v4
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v2, 0x3f317000, v2
 ; VI-GISEL-NEXT:    v_add_f32_e32 v2, v2, v3
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3f317000, v1
@@ -4136,10 +4183,10 @@ define float @v_log_f32_nnan(float %in) {
 ; GFX900-GISEL:       ; %bb.0:
 ; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
-; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; GFX900-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX900-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; GFX900-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
 ; GFX900-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3f317217
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3377d1cf
@@ -4182,21 +4229,22 @@ define float @v_log_f32_nnan(float %in) {
 ; GFX1100-GISEL:       ; %bb.0:
 ; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
-; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo
+; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX1100-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; GFX1100-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
 ; GFX1100-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; GFX1100-GISEL-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX1100-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3f317217, v0
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 s0, 0x7f800000, |v0|
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_fma_f32 v2, 0x3f317217, v0, -v1
-; GFX1100-GISEL-NEXT:    v_fmac_f32_e32 v2, 0x3377d1cf, v0
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-GISEL-NEXT:    v_fmac_f32_e32 v2, 0x3377d1cf, v0
 ; GFX1100-GISEL-NEXT:    v_add_f32_e32 v1, v1, v2
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s0
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 0x41b17218, vcc_lo
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; GFX1100-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -4381,10 +4429,10 @@ define float @v_log_f32_nnan_dynamic(float %in) #1 {
 ; SI-GISEL:       ; %bb.0:
 ; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
-; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; SI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; SI-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; SI-GISEL-NEXT:    v_ldexp_f32_e32 v0, v0, v1
 ; SI-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3f317217
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3377d1cf
@@ -4430,16 +4478,16 @@ define float @v_log_f32_nnan_dynamic(float %in) #1 {
 ; VI-GISEL:       ; %bb.0:
 ; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
-; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; VI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; VI-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; VI-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
 ; VI-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; VI-GISEL-NEXT:    v_and_b32_e32 v1, 0xfffff000, v0
 ; VI-GISEL-NEXT:    v_sub_f32_e32 v2, v0, v1
-; VI-GISEL-NEXT:    v_mul_f32_e32 v3, 0x3805fdf4, v2
-; VI-GISEL-NEXT:    v_mul_f32_e32 v4, 0x3805fdf4, v1
-; VI-GISEL-NEXT:    v_add_f32_e32 v3, v4, v3
+; VI-GISEL-NEXT:    v_mul_f32_e32 v3, 0x3805fdf4, v1
+; VI-GISEL-NEXT:    v_mul_f32_e32 v4, 0x3805fdf4, v2
+; VI-GISEL-NEXT:    v_add_f32_e32 v3, v3, v4
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v2, 0x3f317000, v2
 ; VI-GISEL-NEXT:    v_add_f32_e32 v2, v2, v3
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3f317000, v1
@@ -4479,10 +4527,10 @@ define float @v_log_f32_nnan_dynamic(float %in) #1 {
 ; GFX900-GISEL:       ; %bb.0:
 ; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
-; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; GFX900-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX900-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; GFX900-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
 ; GFX900-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3f317217
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3377d1cf
@@ -4525,21 +4573,22 @@ define float @v_log_f32_nnan_dynamic(float %in) #1 {
 ; GFX1100-GISEL:       ; %bb.0:
 ; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
-; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo
+; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX1100-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; GFX1100-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
 ; GFX1100-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; GFX1100-GISEL-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX1100-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3f317217, v0
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 s0, 0x7f800000, |v0|
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_fma_f32 v2, 0x3f317217, v0, -v1
-; GFX1100-GISEL-NEXT:    v_fmac_f32_e32 v2, 0x3377d1cf, v0
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-GISEL-NEXT:    v_fmac_f32_e32 v2, 0x3377d1cf, v0
 ; GFX1100-GISEL-NEXT:    v_add_f32_e32 v1, v1, v2
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s0
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 0x41b17218, vcc_lo
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; GFX1100-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -4724,10 +4773,10 @@ define float @v_log_f32_ninf_dynamic(float %in) #1 {
 ; SI-GISEL:       ; %bb.0:
 ; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
-; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; SI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; SI-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; SI-GISEL-NEXT:    v_ldexp_f32_e32 v0, v0, v1
 ; SI-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3f317217
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3377d1cf
@@ -4773,16 +4822,16 @@ define float @v_log_f32_ninf_dynamic(float %in) #1 {
 ; VI-GISEL:       ; %bb.0:
 ; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
-; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; VI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; VI-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; VI-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
 ; VI-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; VI-GISEL-NEXT:    v_and_b32_e32 v1, 0xfffff000, v0
 ; VI-GISEL-NEXT:    v_sub_f32_e32 v2, v0, v1
-; VI-GISEL-NEXT:    v_mul_f32_e32 v3, 0x3805fdf4, v2
-; VI-GISEL-NEXT:    v_mul_f32_e32 v4, 0x3805fdf4, v1
-; VI-GISEL-NEXT:    v_add_f32_e32 v3, v4, v3
+; VI-GISEL-NEXT:    v_mul_f32_e32 v3, 0x3805fdf4, v1
+; VI-GISEL-NEXT:    v_mul_f32_e32 v4, 0x3805fdf4, v2
+; VI-GISEL-NEXT:    v_add_f32_e32 v3, v3, v4
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v2, 0x3f317000, v2
 ; VI-GISEL-NEXT:    v_add_f32_e32 v2, v2, v3
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3f317000, v1
@@ -4822,10 +4871,10 @@ define float @v_log_f32_ninf_dynamic(float %in) #1 {
 ; GFX900-GISEL:       ; %bb.0:
 ; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
-; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; GFX900-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX900-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; GFX900-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
 ; GFX900-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3f317217
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3377d1cf
@@ -4868,21 +4917,22 @@ define float @v_log_f32_ninf_dynamic(float %in) #1 {
 ; GFX1100-GISEL:       ; %bb.0:
 ; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
-; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo
+; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX1100-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; GFX1100-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
 ; GFX1100-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; GFX1100-GISEL-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX1100-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3f317217, v0
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 s0, 0x7f800000, |v0|
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_fma_f32 v2, 0x3f317217, v0, -v1
-; GFX1100-GISEL-NEXT:    v_fmac_f32_e32 v2, 0x3377d1cf, v0
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-GISEL-NEXT:    v_fmac_f32_e32 v2, 0x3377d1cf, v0
 ; GFX1100-GISEL-NEXT:    v_add_f32_e32 v1, v1, v2
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s0
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 0x41b17218, vcc_lo
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; GFX1100-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -4924,10 +4974,10 @@ define float @v_log_f32_nnan_ninf(float %in) {
 ; SI-GISEL:       ; %bb.0:
 ; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
-; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; SI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; SI-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; SI-GISEL-NEXT:    v_ldexp_f32_e32 v0, v0, v1
 ; SI-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3f317217
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3377d1cf
@@ -4967,16 +5017,16 @@ define float @v_log_f32_nnan_ninf(float %in) {
 ; VI-GISEL:       ; %bb.0:
 ; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
-; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; VI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; VI-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; VI-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
 ; VI-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; VI-GISEL-NEXT:    v_and_b32_e32 v1, 0xfffff000, v0
 ; VI-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
-; VI-GISEL-NEXT:    v_mul_f32_e32 v2, 0x3805fdf4, v0
-; VI-GISEL-NEXT:    v_mul_f32_e32 v3, 0x3805fdf4, v1
-; VI-GISEL-NEXT:    v_add_f32_e32 v2, v3, v2
+; VI-GISEL-NEXT:    v_mul_f32_e32 v2, 0x3805fdf4, v1
+; VI-GISEL-NEXT:    v_mul_f32_e32 v3, 0x3805fdf4, v0
+; VI-GISEL-NEXT:    v_add_f32_e32 v2, v2, v3
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v0, 0x3f317000, v0
 ; VI-GISEL-NEXT:    v_add_f32_e32 v0, v0, v2
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3f317000, v1
@@ -5010,10 +5060,10 @@ define float @v_log_f32_nnan_ninf(float %in) {
 ; GFX900-GISEL:       ; %bb.0:
 ; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
-; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; GFX900-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX900-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; GFX900-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
 ; GFX900-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3f317217
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3377d1cf
@@ -5051,18 +5101,20 @@ define float @v_log_f32_nnan_ninf(float %in) {
 ; GFX1100-GISEL:       ; %bb.0:
 ; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
-; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo
+; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX1100-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; GFX1100-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; GFX1100-GISEL-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX1100-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3f317217, v0
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_fma_f32 v2, 0x3f317217, v0, -v1
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_fmac_f32_e32 v2, 0x3377d1cf, v0
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_add_f32_e32 v0, v1, v2
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 0x41b17218, vcc_lo
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; GFX1100-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -5207,10 +5259,10 @@ define float @v_log_f32_nnan_ninf_dynamic(float %in) #1 {
 ; SI-GISEL:       ; %bb.0:
 ; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
-; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; SI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; SI-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; SI-GISEL-NEXT:    v_ldexp_f32_e32 v0, v0, v1
 ; SI-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3f317217
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3377d1cf
@@ -5250,16 +5302,16 @@ define float @v_log_f32_nnan_ninf_dynamic(float %in) #1 {
 ; VI-GISEL:       ; %bb.0:
 ; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
-; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; VI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; VI-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; VI-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
 ; VI-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; VI-GISEL-NEXT:    v_and_b32_e32 v1, 0xfffff000, v0
 ; VI-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
-; VI-GISEL-NEXT:    v_mul_f32_e32 v2, 0x3805fdf4, v0
-; VI-GISEL-NEXT:    v_mul_f32_e32 v3, 0x3805fdf4, v1
-; VI-GISEL-NEXT:    v_add_f32_e32 v2, v3, v2
+; VI-GISEL-NEXT:    v_mul_f32_e32 v2, 0x3805fdf4, v1
+; VI-GISEL-NEXT:    v_mul_f32_e32 v3, 0x3805fdf4, v0
+; VI-GISEL-NEXT:    v_add_f32_e32 v2, v2, v3
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v0, 0x3f317000, v0
 ; VI-GISEL-NEXT:    v_add_f32_e32 v0, v0, v2
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3f317000, v1
@@ -5293,10 +5345,10 @@ define float @v_log_f32_nnan_ninf_dynamic(float %in) #1 {
 ; GFX900-GISEL:       ; %bb.0:
 ; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
-; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; GFX900-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX900-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; GFX900-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
 ; GFX900-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3f317217
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3377d1cf
@@ -5334,18 +5386,20 @@ define float @v_log_f32_nnan_ninf_dynamic(float %in) #1 {
 ; GFX1100-GISEL:       ; %bb.0:
 ; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
-; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo
+; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX1100-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; GFX1100-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; GFX1100-GISEL-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX1100-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3f317217, v0
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_fma_f32 v2, 0x3f317217, v0, -v1
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_fmac_f32_e32 v2, 0x3377d1cf, v0
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_add_f32_e32 v0, v1, v2
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 0x41b17218, vcc_lo
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; GFX1100-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -5419,10 +5473,10 @@ define float @v_log_f32_dynamic_mode(float %in) #1 {
 ; SI-GISEL:       ; %bb.0:
 ; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
-; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; SI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; SI-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; SI-GISEL-NEXT:    v_ldexp_f32_e32 v0, v0, v1
 ; SI-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3f317217
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3377d1cf
@@ -5468,16 +5522,16 @@ define float @v_log_f32_dynamic_mode(float %in) #1 {
 ; VI-GISEL:       ; %bb.0:
 ; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
-; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; VI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; VI-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; VI-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
 ; VI-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; VI-GISEL-NEXT:    v_and_b32_e32 v1, 0xfffff000, v0
 ; VI-GISEL-NEXT:    v_sub_f32_e32 v2, v0, v1
-; VI-GISEL-NEXT:    v_mul_f32_e32 v3, 0x3805fdf4, v2
-; VI-GISEL-NEXT:    v_mul_f32_e32 v4, 0x3805fdf4, v1
-; VI-GISEL-NEXT:    v_add_f32_e32 v3, v4, v3
+; VI-GISEL-NEXT:    v_mul_f32_e32 v3, 0x3805fdf4, v1
+; VI-GISEL-NEXT:    v_mul_f32_e32 v4, 0x3805fdf4, v2
+; VI-GISEL-NEXT:    v_add_f32_e32 v3, v3, v4
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v2, 0x3f317000, v2
 ; VI-GISEL-NEXT:    v_add_f32_e32 v2, v2, v3
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3f317000, v1
@@ -5517,10 +5571,10 @@ define float @v_log_f32_dynamic_mode(float %in) #1 {
 ; GFX900-GISEL:       ; %bb.0:
 ; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
-; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; GFX900-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX900-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; GFX900-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
 ; GFX900-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3f317217
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3377d1cf
@@ -5563,21 +5617,22 @@ define float @v_log_f32_dynamic_mode(float %in) #1 {
 ; GFX1100-GISEL:       ; %bb.0:
 ; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
-; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo
+; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX1100-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; GFX1100-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
 ; GFX1100-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; GFX1100-GISEL-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX1100-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3f317217, v0
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 s0, 0x7f800000, |v0|
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_fma_f32 v2, 0x3f317217, v0, -v1
-; GFX1100-GISEL-NEXT:    v_fmac_f32_e32 v2, 0x3377d1cf, v0
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-GISEL-NEXT:    v_fmac_f32_e32 v2, 0x3377d1cf, v0
 ; GFX1100-GISEL-NEXT:    v_add_f32_e32 v1, v1, v2
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s0
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 0x41b17218, vcc_lo
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; GFX1100-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.log10.ll b/llvm/test/CodeGen/AMDGPU/llvm.log10.ll
index fd50d1b..d09df75 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.log10.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.log10.ll
@@ -45,16 +45,17 @@ define amdgpu_kernel void @s_log10_f32(ptr addrspace(1) %out, float %in) {
 ; SI-GISEL-NEXT:    s_load_dword s0, s[4:5], 0xb
 ; SI-GISEL-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x9
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v0, 0x800000
-; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x4f800000
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3e9a209a
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x3284fbcf
 ; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s0, v0
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, 1.0, v1, vcc
-; SI-GISEL-NEXT:    v_mul_f32_e32 v0, s0, v0
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 5, v0
+; SI-GISEL-NEXT:    v_ldexp_f32_e32 v0, s0, v0
 ; SI-GISEL-NEXT:    v_log_f32_e32 v0, v0
-; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3e9a209a
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v3, 0x7f800000
 ; SI-GISEL-NEXT:    s_mov_b32 s6, -1
+; SI-GISEL-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-GISEL-NEXT:    v_mul_f32_e32 v4, 0x3e9a209a, v0
 ; SI-GISEL-NEXT:    v_fma_f32 v1, v0, v1, -v4
 ; SI-GISEL-NEXT:    v_fma_f32 v1, v0, v2, v1
@@ -64,7 +65,6 @@ define amdgpu_kernel void @s_log10_f32(ptr addrspace(1) %out, float %in) {
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x411a209b
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; SI-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
-; SI-GISEL-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-GISEL-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; SI-GISEL-NEXT:    s_endpgm
 ;
@@ -104,25 +104,25 @@ define amdgpu_kernel void @s_log10_f32(ptr addrspace(1) %out, float %in) {
 ; VI-GISEL-NEXT:    s_load_dword s0, s[4:5], 0x2c
 ; VI-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v0, 0x800000
-; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x4f800000
+; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x7f800000
 ; VI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s0, v0
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v0, 1.0, v1, vcc
-; VI-GISEL-NEXT:    v_mul_f32_e32 v0, s0, v0
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 5, v0
+; VI-GISEL-NEXT:    v_ldexp_f32 v0, s0, v0
 ; VI-GISEL-NEXT:    v_log_f32_e32 v0, v0
-; VI-GISEL-NEXT:    v_and_b32_e32 v1, 0xfffff000, v0
-; VI-GISEL-NEXT:    v_sub_f32_e32 v2, v0, v1
-; VI-GISEL-NEXT:    v_mul_f32_e32 v3, 0x369a84fb, v1
+; VI-GISEL-NEXT:    v_and_b32_e32 v2, 0xfffff000, v0
+; VI-GISEL-NEXT:    v_sub_f32_e32 v3, v0, v2
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v4, 0x369a84fb, v2
+; VI-GISEL-NEXT:    v_mul_f32_e32 v5, 0x369a84fb, v3
+; VI-GISEL-NEXT:    v_mul_f32_e32 v3, 0x3e9a2000, v3
+; VI-GISEL-NEXT:    v_add_f32_e32 v4, v4, v5
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v2, 0x3e9a2000, v2
 ; VI-GISEL-NEXT:    v_add_f32_e32 v3, v3, v4
-; VI-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3e9a2000, v1
 ; VI-GISEL-NEXT:    v_add_f32_e32 v2, v2, v3
-; VI-GISEL-NEXT:    v_add_f32_e32 v1, v1, v2
-; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x7f800000
-; VI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], |v0|, v2
-; VI-GISEL-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[0:1]
+; VI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], |v0|, v1
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x411a209b
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
 ; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; VI-GISEL-NEXT:    v_sub_f32_e32 v2, v0, v1
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v0, s2
@@ -162,25 +162,25 @@ define amdgpu_kernel void @s_log10_f32(ptr addrspace(1) %out, float %in) {
 ; GFX900-GISEL-NEXT:    s_load_dword s0, s[4:5], 0x2c
 ; GFX900-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v0, 0x800000
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x4f800000
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0x3284fbcf
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0x3e9a209a
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3284fbcf
 ; GFX900-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s0, v0
-; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v0, 1.0, v1, vcc
-; GFX900-GISEL-NEXT:    v_mul_f32_e32 v0, s0, v0
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX900-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 5, v0
+; GFX900-GISEL-NEXT:    v_ldexp_f32 v0, s0, v0
 ; GFX900-GISEL-NEXT:    v_log_f32_e32 v0, v0
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3e9a209a
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v3, 0x7f800000
-; GFX900-GISEL-NEXT:    v_mul_f32_e32 v4, 0x3e9a209a, v0
-; GFX900-GISEL-NEXT:    v_fma_f32 v1, v0, v1, -v4
-; GFX900-GISEL-NEXT:    v_fma_f32 v1, v0, v2, v1
-; GFX900-GISEL-NEXT:    v_add_f32_e32 v1, v4, v1
-; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], |v0|, v3
-; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[0:1]
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x411a209b
-; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
-; GFX900-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v4, 0x7f800000
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX900-GISEL-NEXT:    v_mul_f32_e32 v5, 0x3e9a209a, v0
+; GFX900-GISEL-NEXT:    v_fma_f32 v2, v0, v2, -v5
+; GFX900-GISEL-NEXT:    v_fma_f32 v2, v0, v3, v2
+; GFX900-GISEL-NEXT:    v_add_f32_e32 v2, v5, v2
+; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], |v0|, v4
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0x411a209b
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
+; GFX900-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v2
 ; GFX900-GISEL-NEXT:    global_store_dword v1, v0, s[2:3]
 ; GFX900-GISEL-NEXT:    s_endpgm
 ;
@@ -218,24 +218,26 @@ define amdgpu_kernel void @s_log10_f32(ptr addrspace(1) %out, float %in) {
 ; GFX1100-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 s2, 0x800000, s0
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s2
-; GFX1100-GISEL-NEXT:    v_mul_f32_e32 v0, s0, v0
+; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s2
+; GFX1100-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 5, v0
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1100-GISEL-NEXT:    v_ldexp_f32 v0, s0, v0
 ; GFX1100-GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
 ; GFX1100-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; GFX1100-GISEL-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX1100-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3e9a209a, v0
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v0|
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_fma_f32 v2, 0x3e9a209a, v0, -v1
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_fmac_f32_e32 v2, 0x3284fbcf, v0
-; GFX1100-GISEL-NEXT:    v_add_f32_e32 v1, v1, v2
-; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, 0x411a209b, s2
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-GISEL-NEXT:    v_dual_cndmask_b32 v0, v0, v1 :: v_dual_mov_b32 v1, 0
-; GFX1100-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v2
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-GISEL-NEXT:    v_dual_add_f32 v1, v1, v2 :: v_dual_mov_b32 v2, 0
+; GFX1100-GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
+; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 0x411a209b, s2
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1100-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; GFX1100-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1100-GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX1100-GISEL-NEXT:    global_store_b32 v2, v0, s[0:1]
 ; GFX1100-GISEL-NEXT:    s_endpgm
 ;
 ; R600-LABEL: s_log10_f32:
@@ -358,35 +360,36 @@ define amdgpu_kernel void @s_log10_v2f32(ptr addrspace(1) %out, <2 x float> %in)
 ; SI-GISEL:       ; %bb.0:
 ; SI-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x9
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v0, 0x800000
-; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x4f800000
-; SI-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3e9a209a
-; SI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x3284fbcf
+; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x3e9a209a
+; SI-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3284fbcf
+; SI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x7f800000
 ; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s6, v0
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 1.0, v1, vcc
-; SI-GISEL-NEXT:    v_mul_f32_e32 v2, s6, v2
-; SI-GISEL-NEXT:    v_log_f32_e32 v2, v2
-; SI-GISEL-NEXT:    v_mov_b32_e32 v5, 0x7f800000
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; SI-GISEL-NEXT:    v_ldexp_f32_e32 v1, s6, v1
+; SI-GISEL-NEXT:    v_log_f32_e32 v1, v1
 ; SI-GISEL-NEXT:    s_mov_b32 s6, -1
-; SI-GISEL-NEXT:    v_mul_f32_e32 v6, 0x3e9a209a, v2
-; SI-GISEL-NEXT:    v_fma_f32 v7, v2, v3, -v6
-; SI-GISEL-NEXT:    v_fma_f32 v7, v2, v4, v7
-; SI-GISEL-NEXT:    v_add_f32_e32 v6, v6, v7
-; SI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], |v2|, v5
-; SI-GISEL-NEXT:    v_cndmask_b32_e64 v2, v2, v6, s[0:1]
+; SI-GISEL-NEXT:    v_mul_f32_e32 v5, 0x3e9a209a, v1
+; SI-GISEL-NEXT:    v_fma_f32 v6, v1, v2, -v5
+; SI-GISEL-NEXT:    v_fma_f32 v6, v1, v3, v6
+; SI-GISEL-NEXT:    v_add_f32_e32 v5, v5, v6
+; SI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], |v1|, v4
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v1, v1, v5, s[0:1]
 ; SI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], s7, v0
-; SI-GISEL-NEXT:    v_cndmask_b32_e64 v0, 1.0, v1, s[0:1]
-; SI-GISEL-NEXT:    v_mul_f32_e32 v0, s7, v0
-; SI-GISEL-NEXT:    v_log_f32_e32 v1, v0
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 5, v0
+; SI-GISEL-NEXT:    v_ldexp_f32_e32 v0, s7, v0
+; SI-GISEL-NEXT:    v_log_f32_e32 v5, v0
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v6, 0x411a209b
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v6, vcc
-; SI-GISEL-NEXT:    v_sub_f32_e32 v0, v2, v0
-; SI-GISEL-NEXT:    v_mul_f32_e32 v2, 0x3e9a209a, v1
-; SI-GISEL-NEXT:    v_fma_f32 v3, v1, v3, -v2
-; SI-GISEL-NEXT:    v_fma_f32 v3, v1, v4, v3
-; SI-GISEL-NEXT:    v_add_f32_e32 v2, v2, v3
-; SI-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v1|, v5
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
+; SI-GISEL-NEXT:    v_sub_f32_e32 v0, v1, v0
+; SI-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3e9a209a, v5
+; SI-GISEL-NEXT:    v_fma_f32 v2, v5, v2, -v1
+; SI-GISEL-NEXT:    v_fma_f32 v2, v5, v3, v2
+; SI-GISEL-NEXT:    v_add_f32_e32 v1, v1, v2
+; SI-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v5|, v4
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
 ; SI-GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, v6, s[0:1]
 ; SI-GISEL-NEXT:    v_sub_f32_e32 v1, v1, v2
 ; SI-GISEL-NEXT:    s_mov_b32 s7, 0xf000
@@ -445,42 +448,43 @@ define amdgpu_kernel void @s_log10_v2f32(ptr addrspace(1) %out, <2 x float> %in)
 ; VI-GISEL:       ; %bb.0:
 ; VI-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x24
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v0, 0x800000
-; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x4f800000
-; VI-GISEL-NEXT:    v_mov_b32_e32 v3, 0x7f800000
+; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x7f800000
 ; VI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s6, v0
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 1.0, v1, vcc
-; VI-GISEL-NEXT:    v_mul_f32_e32 v2, s6, v2
-; VI-GISEL-NEXT:    v_log_f32_e32 v2, v2
-; VI-GISEL-NEXT:    v_and_b32_e32 v4, 0xfffff000, v2
-; VI-GISEL-NEXT:    v_sub_f32_e32 v5, v2, v4
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; VI-GISEL-NEXT:    v_ldexp_f32 v1, s6, v1
+; VI-GISEL-NEXT:    v_log_f32_e32 v1, v1
+; VI-GISEL-NEXT:    v_and_b32_e32 v3, 0xfffff000, v1
+; VI-GISEL-NEXT:    v_sub_f32_e32 v4, v1, v3
+; VI-GISEL-NEXT:    v_mul_f32_e32 v5, 0x369a84fb, v3
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v6, 0x369a84fb, v4
-; VI-GISEL-NEXT:    v_mul_f32_e32 v7, 0x369a84fb, v5
-; VI-GISEL-NEXT:    v_mul_f32_e32 v5, 0x3e9a2000, v5
-; VI-GISEL-NEXT:    v_add_f32_e32 v6, v6, v7
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v4, 0x3e9a2000, v4
 ; VI-GISEL-NEXT:    v_add_f32_e32 v5, v5, v6
+; VI-GISEL-NEXT:    v_mul_f32_e32 v3, 0x3e9a2000, v3
 ; VI-GISEL-NEXT:    v_add_f32_e32 v4, v4, v5
-; VI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], |v2|, v3
-; VI-GISEL-NEXT:    v_cndmask_b32_e64 v2, v2, v4, s[0:1]
+; VI-GISEL-NEXT:    v_add_f32_e32 v3, v3, v4
+; VI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], |v1|, v2
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v1, v1, v3, s[0:1]
 ; VI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], s7, v0
-; VI-GISEL-NEXT:    v_cndmask_b32_e64 v0, 1.0, v1, s[0:1]
-; VI-GISEL-NEXT:    v_mul_f32_e32 v0, s7, v0
-; VI-GISEL-NEXT:    v_log_f32_e32 v1, v0
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 5, v0
+; VI-GISEL-NEXT:    v_ldexp_f32 v0, s7, v0
+; VI-GISEL-NEXT:    v_log_f32_e32 v3, v0
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x411a209b
 ; VI-GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v4, vcc
-; VI-GISEL-NEXT:    v_sub_f32_e32 v0, v2, v0
-; VI-GISEL-NEXT:    v_and_b32_e32 v2, 0xfffff000, v1
-; VI-GISEL-NEXT:    v_sub_f32_e32 v5, v1, v2
+; VI-GISEL-NEXT:    v_sub_f32_e32 v0, v1, v0
+; VI-GISEL-NEXT:    v_and_b32_e32 v1, 0xfffff000, v3
+; VI-GISEL-NEXT:    v_sub_f32_e32 v5, v3, v1
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v6, 0x369a84fb, v5
-; VI-GISEL-NEXT:    v_mul_f32_e32 v7, 0x369a84fb, v2
+; VI-GISEL-NEXT:    v_mul_f32_e32 v7, 0x369a84fb, v1
 ; VI-GISEL-NEXT:    v_add_f32_e32 v6, v7, v6
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v5, 0x3e9a2000, v5
 ; VI-GISEL-NEXT:    v_add_f32_e32 v5, v5, v6
-; VI-GISEL-NEXT:    v_mul_f32_e32 v2, 0x3e9a2000, v2
-; VI-GISEL-NEXT:    v_add_f32_e32 v2, v2, v5
-; VI-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v1|, v3
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
+; VI-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3e9a2000, v1
+; VI-GISEL-NEXT:    v_add_f32_e32 v1, v1, v5
+; VI-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v3|, v2
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; VI-GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, v4, s[0:1]
 ; VI-GISEL-NEXT:    v_sub_f32_e32 v1, v1, v2
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v2, s4
@@ -531,37 +535,38 @@ define amdgpu_kernel void @s_log10_v2f32(ptr addrspace(1) %out, <2 x float> %in)
 ; GFX900-GISEL:       ; %bb.0:
 ; GFX900-GISEL-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x24
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v0, 0x800000
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x4f800000
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3e9a209a
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v4, 0x3284fbcf
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v5, 0x7f800000
 ; GFX900-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s10, v0
-; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v2, 1.0, v1, vcc
-; GFX900-GISEL-NEXT:    v_mul_f32_e32 v2, s10, v2
-; GFX900-GISEL-NEXT:    v_log_f32_e32 v2, v2
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v5, 0x7f800000
-; GFX900-GISEL-NEXT:    v_mul_f32_e32 v6, 0x3e9a209a, v2
-; GFX900-GISEL-NEXT:    v_fma_f32 v7, v2, v3, -v6
-; GFX900-GISEL-NEXT:    v_fma_f32 v7, v2, v4, v7
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX900-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; GFX900-GISEL-NEXT:    v_ldexp_f32 v1, s10, v1
+; GFX900-GISEL-NEXT:    v_log_f32_e32 v1, v1
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX900-GISEL-NEXT:    v_mul_f32_e32 v6, 0x3e9a209a, v1
+; GFX900-GISEL-NEXT:    v_fma_f32 v7, v1, v3, -v6
+; GFX900-GISEL-NEXT:    v_fma_f32 v7, v1, v4, v7
 ; GFX900-GISEL-NEXT:    v_add_f32_e32 v6, v6, v7
-; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], |v2|, v5
-; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v2, v2, v6, s[0:1]
+; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], |v1|, v5
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v1, v1, v6, s[0:1]
 ; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], s11, v0
-; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v0, 1.0, v1, s[0:1]
-; GFX900-GISEL-NEXT:    v_mul_f32_e32 v0, s11, v0
-; GFX900-GISEL-NEXT:    v_log_f32_e32 v1, v0
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v6, 0x411a209b
-; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v6, vcc
-; GFX900-GISEL-NEXT:    v_sub_f32_e32 v0, v2, v0
-; GFX900-GISEL-NEXT:    v_mul_f32_e32 v2, 0x3e9a209a, v1
-; GFX900-GISEL-NEXT:    v_fma_f32 v3, v1, v3, -v2
-; GFX900-GISEL-NEXT:    v_fma_f32 v3, v1, v4, v3
-; GFX900-GISEL-NEXT:    v_add_f32_e32 v2, v2, v3
-; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v1|, v5
-; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
-; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, v6, s[0:1]
-; GFX900-GISEL-NEXT:    v_sub_f32_e32 v1, v1, v2
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; GFX900-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 5, v0
+; GFX900-GISEL-NEXT:    v_ldexp_f32 v0, s11, v0
+; GFX900-GISEL-NEXT:    v_log_f32_e32 v6, v0
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v7, 0x411a209b
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v7, vcc
+; GFX900-GISEL-NEXT:    v_sub_f32_e32 v0, v1, v0
+; GFX900-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3e9a209a, v6
+; GFX900-GISEL-NEXT:    v_fma_f32 v3, v6, v3, -v1
+; GFX900-GISEL-NEXT:    v_fma_f32 v3, v6, v4, v3
+; GFX900-GISEL-NEXT:    v_add_f32_e32 v1, v1, v3
+; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v6|, v5
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, v6, v1, vcc
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v3, 0, v7, s[0:1]
+; GFX900-GISEL-NEXT:    v_sub_f32_e32 v1, v1, v3
 ; GFX900-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[8:9]
 ; GFX900-GISEL-NEXT:    s_endpgm
 ;
@@ -608,31 +613,37 @@ define amdgpu_kernel void @s_log10_v2f32(ptr addrspace(1) %out, <2 x float> %in)
 ; GFX1100-GISEL:       ; %bb.0:
 ; GFX1100-GISEL-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX1100-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 s4, 0x800000, s2
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 s5, 0x800000, s3
+; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 s4, 0x800000, s2
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s4
-; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s5
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-GISEL-NEXT:    v_dual_mul_f32 v0, s2, v0 :: v_dual_mul_f32 v1, s3, v1
-; GFX1100-GISEL-NEXT:    v_log_f32_e32 v0, v0
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s5
+; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s4
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; GFX1100-GISEL-NEXT:    v_ldexp_f32 v1, s3, v1
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_log_f32_e32 v1, v1
 ; GFX1100-GISEL-NEXT:    s_waitcnt_depctr 0xfff
-; GFX1100-GISEL-NEXT:    v_dual_mul_f32 v2, 0x3e9a209a, v0 :: v_dual_mul_f32 v3, 0x3e9a209a, v1
+; GFX1100-GISEL-NEXT:    v_dual_mul_f32 v3, 0x3e9a209a, v1 :: v_dual_lshlrev_b32 v0, 5, v0
+; GFX1100-GISEL-NEXT:    v_ldexp_f32 v0, s2, v0
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1100-GISEL-NEXT:    v_fma_f32 v5, 0x3e9a209a, v1, -v3
+; GFX1100-GISEL-NEXT:    v_log_f32_e32 v0, v0
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_3)
+; GFX1100-GISEL-NEXT:    v_fmac_f32_e32 v5, 0x3284fbcf, v1
+; GFX1100-GISEL-NEXT:    s_waitcnt_depctr 0xfff
+; GFX1100-GISEL-NEXT:    v_dual_add_f32 v3, v3, v5 :: v_dual_mul_f32 v2, 0x3e9a209a, v0
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v0|
+; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, 0x411a209b, s5
 ; GFX1100-GISEL-NEXT:    v_fma_f32 v4, 0x3e9a209a, v0, -v2
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-GISEL-NEXT:    v_fma_f32 v5, 0x3e9a209a, v1, -v3
-; GFX1100-GISEL-NEXT:    v_dual_fmac_f32 v4, 0x3284fbcf, v0 :: v_dual_fmac_f32 v5, 0x3284fbcf, v1
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX1100-GISEL-NEXT:    v_dual_add_f32 v2, v2, v4 :: v_dual_add_f32 v3, v3, v5
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-GISEL-NEXT:    v_fmac_f32_e32 v4, 0x3284fbcf, v0
+; GFX1100-GISEL-NEXT:    v_add_f32_e32 v2, v2, v4
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v4, 0, 0x411a209b, s4
-; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, 0x411a209b, s5
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v1|
-; GFX1100-GISEL-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_cndmask_b32 v1, v1, v3
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1100-GISEL-NEXT:    v_dual_cndmask_b32 v1, v1, v3 :: v_dual_mov_b32 v2, 0
 ; GFX1100-GISEL-NEXT:    v_dual_sub_f32 v0, v0, v4 :: v_dual_sub_f32 v1, v1, v5
 ; GFX1100-GISEL-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
 ; GFX1100-GISEL-NEXT:    s_endpgm
@@ -808,49 +819,51 @@ define amdgpu_kernel void @s_log10_v3f32(ptr addrspace(1) %out, <3 x float> %in)
 ; SI-GISEL-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0xd
 ; SI-GISEL-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x9
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
-; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
-; SI-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3e9a209a
+; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x3e9a209a
+; SI-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3284fbcf
 ; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s8, v1
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, 1.0, v2, vcc
-; SI-GISEL-NEXT:    v_mul_f32_e32 v0, s8, v0
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 5, v0
+; SI-GISEL-NEXT:    v_ldexp_f32_e32 v0, s8, v0
 ; SI-GISEL-NEXT:    v_log_f32_e32 v0, v0
-; SI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x3284fbcf
-; SI-GISEL-NEXT:    v_mov_b32_e32 v5, 0x7f800000
+; SI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x7f800000
 ; SI-GISEL-NEXT:    s_mov_b32 s6, -1
-; SI-GISEL-NEXT:    v_mul_f32_e32 v6, 0x3e9a209a, v0
-; SI-GISEL-NEXT:    v_fma_f32 v7, v0, v3, -v6
-; SI-GISEL-NEXT:    v_fma_f32 v7, v0, v4, v7
-; SI-GISEL-NEXT:    v_add_f32_e32 v6, v6, v7
-; SI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], |v0|, v5
-; SI-GISEL-NEXT:    v_cndmask_b32_e64 v0, v0, v6, s[0:1]
+; SI-GISEL-NEXT:    s_mov_b32 s7, 0xf000
+; SI-GISEL-NEXT:    v_mul_f32_e32 v5, 0x3e9a209a, v0
+; SI-GISEL-NEXT:    v_fma_f32 v6, v0, v2, -v5
+; SI-GISEL-NEXT:    v_fma_f32 v6, v0, v3, v6
+; SI-GISEL-NEXT:    v_add_f32_e32 v5, v5, v6
+; SI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], |v0|, v4
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v0, v0, v5, s[0:1]
 ; SI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], s9, v1
-; SI-GISEL-NEXT:    v_cndmask_b32_e64 v6, 1.0, v2, s[0:1]
-; SI-GISEL-NEXT:    v_mul_f32_e32 v6, s9, v6
-; SI-GISEL-NEXT:    v_log_f32_e32 v6, v6
-; SI-GISEL-NEXT:    v_mov_b32_e32 v7, 0x411a209b
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v8, 0, v7, vcc
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s[0:1]
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 5, v5
+; SI-GISEL-NEXT:    v_ldexp_f32_e32 v5, s9, v5
+; SI-GISEL-NEXT:    v_log_f32_e32 v5, v5
+; SI-GISEL-NEXT:    v_mov_b32_e32 v6, 0x411a209b
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v7, 0, v6, vcc
 ; SI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s10, v1
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; SI-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v8
-; SI-GISEL-NEXT:    v_mul_f32_e32 v8, 0x3e9a209a, v6
-; SI-GISEL-NEXT:    v_mul_f32_e32 v1, s10, v1
-; SI-GISEL-NEXT:    v_fma_f32 v9, v6, v3, -v8
-; SI-GISEL-NEXT:    v_log_f32_e32 v2, v1
-; SI-GISEL-NEXT:    v_fma_f32 v9, v6, v4, v9
-; SI-GISEL-NEXT:    v_add_f32_e32 v8, v8, v9
-; SI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[2:3], |v6|, v5
-; SI-GISEL-NEXT:    v_cndmask_b32_e64 v1, v6, v8, s[2:3]
-; SI-GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, v7, s[0:1]
-; SI-GISEL-NEXT:    v_sub_f32_e32 v1, v1, v6
-; SI-GISEL-NEXT:    v_mul_f32_e32 v6, 0x3e9a209a, v2
-; SI-GISEL-NEXT:    v_fma_f32 v3, v2, v3, -v6
-; SI-GISEL-NEXT:    v_fma_f32 v3, v2, v4, v3
-; SI-GISEL-NEXT:    v_add_f32_e32 v3, v6, v3
-; SI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], |v2|, v5
-; SI-GISEL-NEXT:    v_cndmask_b32_e64 v2, v2, v3, s[0:1]
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v3, 0, v7, vcc
-; SI-GISEL-NEXT:    s_mov_b32 s7, 0xf000
+; SI-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v7
+; SI-GISEL-NEXT:    v_mul_f32_e32 v7, 0x3e9a209a, v5
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; SI-GISEL-NEXT:    v_fma_f32 v8, v5, v2, -v7
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; SI-GISEL-NEXT:    v_fma_f32 v8, v5, v3, v8
+; SI-GISEL-NEXT:    v_ldexp_f32_e32 v1, s10, v1
+; SI-GISEL-NEXT:    v_add_f32_e32 v7, v7, v8
+; SI-GISEL-NEXT:    v_log_f32_e32 v8, v1
+; SI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[2:3], |v5|, v4
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v1, v5, v7, s[2:3]
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, v6, s[0:1]
+; SI-GISEL-NEXT:    v_sub_f32_e32 v1, v1, v5
+; SI-GISEL-NEXT:    v_mul_f32_e32 v5, 0x3e9a209a, v8
+; SI-GISEL-NEXT:    v_fma_f32 v2, v8, v2, -v5
+; SI-GISEL-NEXT:    v_fma_f32 v2, v8, v3, v2
+; SI-GISEL-NEXT:    v_add_f32_e32 v2, v5, v2
+; SI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], |v8|, v4
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v2, v8, v2, s[0:1]
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v3, 0, v6, vcc
 ; SI-GISEL-NEXT:    v_sub_f32_e32 v2, v2, v3
 ; SI-GISEL-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; SI-GISEL-NEXT:    buffer_store_dword v2, off, s[4:7], 0 offset:8
@@ -927,12 +940,13 @@ define amdgpu_kernel void @s_log10_v3f32(ptr addrspace(1) %out, <3 x float> %in)
 ; VI-GISEL:       ; %bb.0:
 ; VI-GISEL-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x34
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
-; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
+; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x7f800000
 ; VI-GISEL-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x24
 ; VI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s8, v1
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v0, 1.0, v2, vcc
-; VI-GISEL-NEXT:    v_mul_f32_e32 v0, s8, v0
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 5, v0
+; VI-GISEL-NEXT:    v_ldexp_f32 v0, s8, v0
 ; VI-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; VI-GISEL-NEXT:    v_and_b32_e32 v3, 0xfffff000, v0
 ; VI-GISEL-NEXT:    v_sub_f32_e32 v4, v0, v3
@@ -943,45 +957,46 @@ define amdgpu_kernel void @s_log10_v3f32(ptr addrspace(1) %out, <3 x float> %in)
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v3, 0x3e9a2000, v3
 ; VI-GISEL-NEXT:    v_add_f32_e32 v4, v4, v5
 ; VI-GISEL-NEXT:    v_add_f32_e32 v3, v3, v4
-; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x7f800000
-; VI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], |v0|, v4
+; VI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], |v0|, v2
 ; VI-GISEL-NEXT:    v_cndmask_b32_e64 v0, v0, v3, s[0:1]
 ; VI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], s9, v1
-; VI-GISEL-NEXT:    v_cndmask_b32_e64 v3, 1.0, v2, s[0:1]
-; VI-GISEL-NEXT:    v_mul_f32_e32 v3, s9, v3
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s[0:1]
+; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 5, v3
+; VI-GISEL-NEXT:    v_ldexp_f32 v3, s9, v3
 ; VI-GISEL-NEXT:    v_log_f32_e32 v3, v3
-; VI-GISEL-NEXT:    v_mov_b32_e32 v5, 0x411a209b
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v6, 0, v5, vcc
-; VI-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v6
-; VI-GISEL-NEXT:    v_and_b32_e32 v6, 0xfffff000, v3
+; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x411a209b
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v5, 0, v4, vcc
+; VI-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v5
+; VI-GISEL-NEXT:    v_and_b32_e32 v5, 0xfffff000, v3
+; VI-GISEL-NEXT:    v_sub_f32_e32 v6, v3, v5
 ; VI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s10, v1
-; VI-GISEL-NEXT:    v_sub_f32_e32 v7, v3, v6
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; VI-GISEL-NEXT:    v_mul_f32_e32 v8, 0x369a84fb, v7
-; VI-GISEL-NEXT:    v_mul_f32_e32 v9, 0x369a84fb, v6
-; VI-GISEL-NEXT:    v_mul_f32_e32 v1, s10, v1
-; VI-GISEL-NEXT:    v_add_f32_e32 v8, v9, v8
-; VI-GISEL-NEXT:    v_mul_f32_e32 v7, 0x3e9a2000, v7
-; VI-GISEL-NEXT:    v_log_f32_e32 v2, v1
-; VI-GISEL-NEXT:    v_add_f32_e32 v7, v7, v8
+; VI-GISEL-NEXT:    v_mul_f32_e32 v7, 0x369a84fb, v6
+; VI-GISEL-NEXT:    v_mul_f32_e32 v8, 0x369a84fb, v5
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; VI-GISEL-NEXT:    v_add_f32_e32 v7, v8, v7
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v6, 0x3e9a2000, v6
+; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
 ; VI-GISEL-NEXT:    v_add_f32_e32 v6, v6, v7
-; VI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[2:3], |v3|, v4
-; VI-GISEL-NEXT:    v_cndmask_b32_e64 v1, v3, v6, s[2:3]
-; VI-GISEL-NEXT:    v_cndmask_b32_e64 v3, 0, v5, s[0:1]
+; VI-GISEL-NEXT:    v_mul_f32_e32 v5, 0x3e9a2000, v5
+; VI-GISEL-NEXT:    v_ldexp_f32 v1, s10, v1
+; VI-GISEL-NEXT:    v_add_f32_e32 v5, v5, v6
+; VI-GISEL-NEXT:    v_log_f32_e32 v6, v1
+; VI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[2:3], |v3|, v2
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v1, v3, v5, s[2:3]
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v3, 0, v4, s[0:1]
 ; VI-GISEL-NEXT:    v_sub_f32_e32 v1, v1, v3
-; VI-GISEL-NEXT:    v_and_b32_e32 v3, 0xfffff000, v2
-; VI-GISEL-NEXT:    v_sub_f32_e32 v6, v2, v3
-; VI-GISEL-NEXT:    v_mul_f32_e32 v7, 0x369a84fb, v6
+; VI-GISEL-NEXT:    v_and_b32_e32 v3, 0xfffff000, v6
+; VI-GISEL-NEXT:    v_sub_f32_e32 v5, v6, v3
+; VI-GISEL-NEXT:    v_mul_f32_e32 v7, 0x369a84fb, v5
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v8, 0x369a84fb, v3
 ; VI-GISEL-NEXT:    v_add_f32_e32 v7, v8, v7
-; VI-GISEL-NEXT:    v_mul_f32_e32 v6, 0x3e9a2000, v6
-; VI-GISEL-NEXT:    v_add_f32_e32 v6, v6, v7
+; VI-GISEL-NEXT:    v_mul_f32_e32 v5, 0x3e9a2000, v5
+; VI-GISEL-NEXT:    v_add_f32_e32 v5, v5, v7
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v3, 0x3e9a2000, v3
-; VI-GISEL-NEXT:    v_add_f32_e32 v3, v3, v6
-; VI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], |v2|, v4
-; VI-GISEL-NEXT:    v_cndmask_b32_e64 v2, v2, v3, s[0:1]
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v3, 0, v5, vcc
+; VI-GISEL-NEXT:    v_add_f32_e32 v3, v3, v5
+; VI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], |v6|, v2
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v2, v6, v3, s[0:1]
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v3, 0, v4, vcc
 ; VI-GISEL-NEXT:    v_sub_f32_e32 v2, v2, v3
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v3, s4
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v4, s5
@@ -1046,49 +1061,51 @@ define amdgpu_kernel void @s_log10_v3f32(ptr addrspace(1) %out, <3 x float> %in)
 ; GFX900-GISEL-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x34
 ; GFX900-GISEL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3e9a209a
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0x3e9a209a
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v4, 0x3284fbcf
 ; GFX900-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s8, v1
-; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v0, 1.0, v2, vcc
-; GFX900-GISEL-NEXT:    v_mul_f32_e32 v0, s8, v0
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX900-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 5, v0
+; GFX900-GISEL-NEXT:    v_ldexp_f32 v0, s8, v0
 ; GFX900-GISEL-NEXT:    v_log_f32_e32 v0, v0
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v4, 0x3284fbcf
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v5, 0x7f800000
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX900-GISEL-NEXT:    v_mul_f32_e32 v6, 0x3e9a209a, v0
-; GFX900-GISEL-NEXT:    v_fma_f32 v7, v0, v3, -v6
+; GFX900-GISEL-NEXT:    v_fma_f32 v7, v0, v2, -v6
 ; GFX900-GISEL-NEXT:    v_fma_f32 v7, v0, v4, v7
 ; GFX900-GISEL-NEXT:    v_add_f32_e32 v6, v6, v7
 ; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], |v0|, v5
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v0, v0, v6, s[0:1]
 ; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], s9, v1
-; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v6, 1.0, v2, s[0:1]
-; GFX900-GISEL-NEXT:    v_mul_f32_e32 v6, s9, v6
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s[0:1]
+; GFX900-GISEL-NEXT:    v_lshlrev_b32_e32 v6, 5, v6
+; GFX900-GISEL-NEXT:    v_ldexp_f32 v6, s9, v6
 ; GFX900-GISEL-NEXT:    v_log_f32_e32 v6, v6
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v7, 0x411a209b
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v8, 0, v7, vcc
 ; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s10, v1
-; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
 ; GFX900-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v8
 ; GFX900-GISEL-NEXT:    v_mul_f32_e32 v8, 0x3e9a209a, v6
-; GFX900-GISEL-NEXT:    v_mul_f32_e32 v1, s10, v1
-; GFX900-GISEL-NEXT:    v_fma_f32 v9, v6, v3, -v8
-; GFX900-GISEL-NEXT:    v_log_f32_e32 v2, v1
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX900-GISEL-NEXT:    v_fma_f32 v9, v6, v2, -v8
+; GFX900-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
 ; GFX900-GISEL-NEXT:    v_fma_f32 v9, v6, v4, v9
+; GFX900-GISEL-NEXT:    v_ldexp_f32 v1, s10, v1
 ; GFX900-GISEL-NEXT:    v_add_f32_e32 v8, v8, v9
+; GFX900-GISEL-NEXT:    v_log_f32_e32 v9, v1
 ; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e64 s[2:3], |v6|, v5
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v1, v6, v8, s[2:3]
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, v7, s[0:1]
 ; GFX900-GISEL-NEXT:    v_sub_f32_e32 v1, v1, v6
-; GFX900-GISEL-NEXT:    v_mul_f32_e32 v6, 0x3e9a209a, v2
-; GFX900-GISEL-NEXT:    v_fma_f32 v3, v2, v3, -v6
-; GFX900-GISEL-NEXT:    v_fma_f32 v3, v2, v4, v3
-; GFX900-GISEL-NEXT:    v_add_f32_e32 v3, v6, v3
-; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], |v2|, v5
-; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v2, v2, v3, s[0:1]
-; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v3, 0, v7, vcc
-; GFX900-GISEL-NEXT:    v_sub_f32_e32 v2, v2, v3
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v3, 0
+; GFX900-GISEL-NEXT:    v_mul_f32_e32 v6, 0x3e9a209a, v9
+; GFX900-GISEL-NEXT:    v_fma_f32 v2, v9, v2, -v6
+; GFX900-GISEL-NEXT:    v_fma_f32 v2, v9, v4, v2
+; GFX900-GISEL-NEXT:    v_add_f32_e32 v2, v6, v2
+; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], |v9|, v5
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v2, v9, v2, s[0:1]
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v7, vcc
+; GFX900-GISEL-NEXT:    v_sub_f32_e32 v2, v2, v4
 ; GFX900-GISEL-NEXT:    global_store_dwordx3 v3, v[0:2], s[6:7]
 ; GFX900-GISEL-NEXT:    s_endpgm
 ;
@@ -1156,49 +1173,55 @@ define amdgpu_kernel void @s_log10_v3f32(ptr addrspace(1) %out, <3 x float> %in)
 ; GFX1100-GISEL:       ; %bb.0:
 ; GFX1100-GISEL-NEXT:    s_load_b128 s[0:3], s[4:5], 0x34
 ; GFX1100-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 s7, 0x800000, s2
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 s3, 0x800000, s0
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 s6, 0x800000, s1
-; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 s7, 0x800000, s2
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s3
-; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s6
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s7
-; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 0x411a209b, s6
+; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s7
+; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s3
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s6
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v9, 0, 0x411a209b, s3
-; GFX1100-GISEL-NEXT:    v_dual_mul_f32 v0, s0, v0 :: v_dual_mul_f32 v1, s1, v1
-; GFX1100-GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 0x411a209b, s6
+; GFX1100-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 5, v2
+; GFX1100-GISEL-NEXT:    v_ldexp_f32 v2, s2, v2
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX1100-GISEL-NEXT:    v_log_f32_e32 v2, v2
+; GFX1100-GISEL-NEXT:    s_waitcnt_depctr 0xfff
+; GFX1100-GISEL-NEXT:    v_dual_mul_f32 v5, 0x3e9a209a, v2 :: v_dual_lshlrev_b32 v0, 5, v0
+; GFX1100-GISEL-NEXT:    v_ldexp_f32 v0, s0, v0
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1100-GISEL-NEXT:    v_fma_f32 v8, 0x3e9a209a, v2, -v5
 ; GFX1100-GISEL-NEXT:    v_log_f32_e32 v0, v0
-; GFX1100-GISEL-NEXT:    v_mul_f32_e32 v2, s2, v2
-; GFX1100-GISEL-NEXT:    v_log_f32_e32 v1, v1
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-GISEL-NEXT:    v_fmac_f32_e32 v8, 0x3284fbcf, v2
+; GFX1100-GISEL-NEXT:    v_add_f32_e32 v5, v5, v8
 ; GFX1100-GISEL-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX1100-GISEL-NEXT:    v_mul_f32_e32 v3, 0x3e9a209a, v0
-; GFX1100-GISEL-NEXT:    v_log_f32_e32 v2, v2
-; GFX1100-GISEL-NEXT:    v_mul_f32_e32 v4, 0x3e9a209a, v1
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v0|
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_3)
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_fma_f32 v6, 0x3e9a209a, v0, -v3
+; GFX1100-GISEL-NEXT:    v_dual_fmac_f32 v6, 0x3284fbcf, v0 :: v_dual_lshlrev_b32 v1, 5, v1
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1100-GISEL-NEXT:    v_ldexp_f32 v1, s1, v1
+; GFX1100-GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1100-GISEL-NEXT:    v_add_f32_e32 v3, v3, v6
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX1100-GISEL-NEXT:    v_log_f32_e32 v1, v1
+; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, 0x411a209b, s7
 ; GFX1100-GISEL-NEXT:    s_waitcnt_depctr 0xfff
-; GFX1100-GISEL-NEXT:    v_mul_f32_e32 v5, 0x3e9a209a, v2
+; GFX1100-GISEL-NEXT:    v_mul_f32_e32 v4, 0x3e9a209a, v1
 ; GFX1100-GISEL-NEXT:    v_fma_f32 v7, 0x3e9a209a, v1, -v4
-; GFX1100-GISEL-NEXT:    v_fmac_f32_e32 v6, 0x3284fbcf, v0
-; GFX1100-GISEL-NEXT:    v_fma_f32 v8, 0x3e9a209a, v2, -v5
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_fmac_f32_e32 v7, 0x3284fbcf, v1
-; GFX1100-GISEL-NEXT:    v_add_f32_e32 v3, v3, v6
-; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, 0x411a209b, s7
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX1100-GISEL-NEXT:    v_add_f32_e32 v4, v4, v7
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc_lo
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v1|
-; GFX1100-GISEL-NEXT:    v_dual_fmac_f32 v8, 0x3284fbcf, v2 :: v_dual_mov_b32 v3, 0
+; GFX1100-GISEL-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc_lo
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v2|
-; GFX1100-GISEL-NEXT:    v_sub_f32_e32 v1, v1, v10
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-GISEL-NEXT:    v_dual_add_f32 v5, v5, v8 :: v_dual_sub_f32 v0, v0, v9
+; GFX1100-GISEL-NEXT:    v_dual_sub_f32 v0, v0, v9 :: v_dual_sub_f32 v1, v1, v10
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc_lo
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_sub_f32_e32 v2, v2, v6
@@ -1433,62 +1456,65 @@ define amdgpu_kernel void @s_log10_v4f32(ptr addrspace(1) %out, <4 x float> %in)
 ; SI-GISEL-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0xd
 ; SI-GISEL-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x9
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x800000
-; SI-GISEL-NEXT:    v_mov_b32_e32 v3, 0x4f800000
-; SI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x3e9a209a
+; SI-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3e9a209a
+; SI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x3284fbcf
 ; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s8, v2
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, 1.0, v3, vcc
-; SI-GISEL-NEXT:    v_mul_f32_e32 v0, s8, v0
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 5, v0
+; SI-GISEL-NEXT:    v_ldexp_f32_e32 v0, s8, v0
 ; SI-GISEL-NEXT:    v_log_f32_e32 v0, v0
-; SI-GISEL-NEXT:    v_mov_b32_e32 v5, 0x3284fbcf
-; SI-GISEL-NEXT:    v_mov_b32_e32 v6, 0x7f800000
+; SI-GISEL-NEXT:    v_mov_b32_e32 v5, 0x7f800000
 ; SI-GISEL-NEXT:    s_mov_b32 s6, -1
+; SI-GISEL-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3e9a209a, v0
-; SI-GISEL-NEXT:    v_fma_f32 v7, v0, v4, -v1
-; SI-GISEL-NEXT:    v_fma_f32 v7, v0, v5, v7
-; SI-GISEL-NEXT:    v_add_f32_e32 v1, v1, v7
-; SI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], |v0|, v6
+; SI-GISEL-NEXT:    v_fma_f32 v6, v0, v3, -v1
+; SI-GISEL-NEXT:    v_fma_f32 v6, v0, v4, v6
+; SI-GISEL-NEXT:    v_add_f32_e32 v1, v1, v6
+; SI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], |v0|, v5
 ; SI-GISEL-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[0:1]
 ; SI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], s9, v2
-; SI-GISEL-NEXT:    v_cndmask_b32_e64 v1, 1.0, v3, s[0:1]
-; SI-GISEL-NEXT:    v_mul_f32_e32 v1, s9, v1
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[0:1]
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; SI-GISEL-NEXT:    v_ldexp_f32_e32 v1, s9, v1
 ; SI-GISEL-NEXT:    v_log_f32_e32 v1, v1
-; SI-GISEL-NEXT:    v_mov_b32_e32 v7, 0x411a209b
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v8, 0, v7, vcc
-; SI-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v8
-; SI-GISEL-NEXT:    v_mul_f32_e32 v8, 0x3e9a209a, v1
-; SI-GISEL-NEXT:    v_fma_f32 v9, v1, v4, -v8
-; SI-GISEL-NEXT:    v_fma_f32 v9, v1, v5, v9
+; SI-GISEL-NEXT:    v_mov_b32_e32 v6, 0x411a209b
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v7, 0, v6, vcc
+; SI-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v7
+; SI-GISEL-NEXT:    v_mul_f32_e32 v7, 0x3e9a209a, v1
+; SI-GISEL-NEXT:    v_fma_f32 v8, v1, v3, -v7
+; SI-GISEL-NEXT:    v_fma_f32 v8, v1, v4, v8
 ; SI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s10, v2
-; SI-GISEL-NEXT:    v_add_f32_e32 v8, v8, v9
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v9, 1.0, v3, vcc
-; SI-GISEL-NEXT:    v_mul_f32_e32 v9, s10, v9
-; SI-GISEL-NEXT:    v_log_f32_e32 v9, v9
-; SI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[2:3], |v1|, v6
-; SI-GISEL-NEXT:    v_cndmask_b32_e64 v1, v1, v8, s[2:3]
-; SI-GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, v7, s[0:1]
+; SI-GISEL-NEXT:    v_add_f32_e32 v7, v7, v8
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v8, 5, v8
+; SI-GISEL-NEXT:    v_ldexp_f32_e32 v8, s10, v8
+; SI-GISEL-NEXT:    v_log_f32_e32 v8, v8
+; SI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[2:3], |v1|, v5
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v1, v1, v7, s[2:3]
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, v6, s[0:1]
 ; SI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], s11, v2
-; SI-GISEL-NEXT:    v_cndmask_b32_e64 v2, 1.0, v3, s[0:1]
-; SI-GISEL-NEXT:    v_sub_f32_e32 v1, v1, v8
-; SI-GISEL-NEXT:    v_mul_f32_e32 v8, 0x3e9a209a, v9
-; SI-GISEL-NEXT:    v_mul_f32_e32 v2, s11, v2
-; SI-GISEL-NEXT:    v_fma_f32 v10, v9, v4, -v8
-; SI-GISEL-NEXT:    v_log_f32_e32 v3, v2
-; SI-GISEL-NEXT:    v_fma_f32 v10, v9, v5, v10
-; SI-GISEL-NEXT:    v_add_f32_e32 v8, v8, v10
-; SI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[2:3], |v9|, v6
-; SI-GISEL-NEXT:    v_cndmask_b32_e64 v2, v9, v8, s[2:3]
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v8, 0, v7, vcc
-; SI-GISEL-NEXT:    v_sub_f32_e32 v2, v2, v8
-; SI-GISEL-NEXT:    v_mul_f32_e32 v8, 0x3e9a209a, v3
-; SI-GISEL-NEXT:    v_fma_f32 v4, v3, v4, -v8
-; SI-GISEL-NEXT:    v_fma_f32 v4, v3, v5, v4
-; SI-GISEL-NEXT:    v_add_f32_e32 v4, v8, v4
-; SI-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v3|, v6
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
-; SI-GISEL-NEXT:    v_cndmask_b32_e64 v4, 0, v7, s[0:1]
+; SI-GISEL-NEXT:    v_sub_f32_e32 v1, v1, v7
+; SI-GISEL-NEXT:    v_mul_f32_e32 v7, 0x3e9a209a, v8
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[0:1]
+; SI-GISEL-NEXT:    v_fma_f32 v9, v8, v3, -v7
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 5, v2
+; SI-GISEL-NEXT:    v_fma_f32 v9, v8, v4, v9
+; SI-GISEL-NEXT:    v_ldexp_f32_e32 v2, s11, v2
+; SI-GISEL-NEXT:    v_add_f32_e32 v7, v7, v9
+; SI-GISEL-NEXT:    v_log_f32_e32 v9, v2
+; SI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[2:3], |v8|, v5
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v2, v8, v7, s[2:3]
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v7, 0, v6, vcc
+; SI-GISEL-NEXT:    v_sub_f32_e32 v2, v2, v7
+; SI-GISEL-NEXT:    v_mul_f32_e32 v7, 0x3e9a209a, v9
+; SI-GISEL-NEXT:    v_fma_f32 v3, v9, v3, -v7
+; SI-GISEL-NEXT:    v_fma_f32 v3, v9, v4, v3
+; SI-GISEL-NEXT:    v_add_f32_e32 v3, v7, v3
+; SI-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v9|, v5
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v3, v9, v3, vcc
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v4, 0, v6, s[0:1]
 ; SI-GISEL-NEXT:    v_sub_f32_e32 v3, v3, v4
-; SI-GISEL-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-GISEL-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
 ; SI-GISEL-NEXT:    s_endpgm
 ;
@@ -1581,12 +1607,13 @@ define amdgpu_kernel void @s_log10_v4f32(ptr addrspace(1) %out, <4 x float> %in)
 ; VI-GISEL:       ; %bb.0:
 ; VI-GISEL-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x34
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x800000
-; VI-GISEL-NEXT:    v_mov_b32_e32 v3, 0x4f800000
+; VI-GISEL-NEXT:    v_mov_b32_e32 v3, 0x7f800000
 ; VI-GISEL-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x24
 ; VI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s8, v2
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v0, 1.0, v3, vcc
-; VI-GISEL-NEXT:    v_mul_f32_e32 v0, s8, v0
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 5, v0
+; VI-GISEL-NEXT:    v_ldexp_f32 v0, s8, v0
 ; VI-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; VI-GISEL-NEXT:    v_and_b32_e32 v1, 0xfffff000, v0
 ; VI-GISEL-NEXT:    v_sub_f32_e32 v4, v0, v1
@@ -1597,62 +1624,64 @@ define amdgpu_kernel void @s_log10_v4f32(ptr addrspace(1) %out, <4 x float> %in)
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3e9a2000, v1
 ; VI-GISEL-NEXT:    v_add_f32_e32 v4, v4, v5
 ; VI-GISEL-NEXT:    v_add_f32_e32 v1, v1, v4
-; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x7f800000
-; VI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], |v0|, v4
+; VI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], |v0|, v3
 ; VI-GISEL-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[0:1]
 ; VI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], s9, v2
-; VI-GISEL-NEXT:    v_cndmask_b32_e64 v1, 1.0, v3, s[0:1]
-; VI-GISEL-NEXT:    v_mul_f32_e32 v1, s9, v1
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[0:1]
+; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; VI-GISEL-NEXT:    v_ldexp_f32 v1, s9, v1
 ; VI-GISEL-NEXT:    v_log_f32_e32 v1, v1
-; VI-GISEL-NEXT:    v_mov_b32_e32 v5, 0x411a209b
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v6, 0, v5, vcc
-; VI-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v6
-; VI-GISEL-NEXT:    v_and_b32_e32 v6, 0xfffff000, v1
-; VI-GISEL-NEXT:    v_sub_f32_e32 v7, v1, v6
-; VI-GISEL-NEXT:    v_mul_f32_e32 v8, 0x369a84fb, v7
-; VI-GISEL-NEXT:    v_mul_f32_e32 v9, 0x369a84fb, v6
-; VI-GISEL-NEXT:    v_add_f32_e32 v8, v9, v8
-; VI-GISEL-NEXT:    v_mul_f32_e32 v7, 0x3e9a2000, v7
-; VI-GISEL-NEXT:    v_add_f32_e32 v7, v7, v8
+; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x411a209b
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v5, 0, v4, vcc
+; VI-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v5
+; VI-GISEL-NEXT:    v_and_b32_e32 v5, 0xfffff000, v1
+; VI-GISEL-NEXT:    v_sub_f32_e32 v6, v1, v5
+; VI-GISEL-NEXT:    v_mul_f32_e32 v7, 0x369a84fb, v6
+; VI-GISEL-NEXT:    v_mul_f32_e32 v8, 0x369a84fb, v5
+; VI-GISEL-NEXT:    v_add_f32_e32 v7, v8, v7
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v6, 0x3e9a2000, v6
-; VI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s10, v2
 ; VI-GISEL-NEXT:    v_add_f32_e32 v6, v6, v7
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v7, 1.0, v3, vcc
-; VI-GISEL-NEXT:    v_mul_f32_e32 v7, s10, v7
-; VI-GISEL-NEXT:    v_log_f32_e32 v7, v7
-; VI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[2:3], |v1|, v4
-; VI-GISEL-NEXT:    v_cndmask_b32_e64 v1, v1, v6, s[2:3]
-; VI-GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, v5, s[0:1]
-; VI-GISEL-NEXT:    v_sub_f32_e32 v1, v1, v6
-; VI-GISEL-NEXT:    v_and_b32_e32 v6, 0xfffff000, v7
+; VI-GISEL-NEXT:    v_mul_f32_e32 v5, 0x3e9a2000, v5
+; VI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s10, v2
+; VI-GISEL-NEXT:    v_add_f32_e32 v5, v5, v6
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
+; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v6, 5, v6
+; VI-GISEL-NEXT:    v_ldexp_f32 v6, s10, v6
+; VI-GISEL-NEXT:    v_log_f32_e32 v6, v6
+; VI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[2:3], |v1|, v3
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v1, v1, v5, s[2:3]
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, v4, s[0:1]
+; VI-GISEL-NEXT:    v_sub_f32_e32 v1, v1, v5
+; VI-GISEL-NEXT:    v_and_b32_e32 v5, 0xfffff000, v6
+; VI-GISEL-NEXT:    v_sub_f32_e32 v7, v6, v5
 ; VI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], s11, v2
-; VI-GISEL-NEXT:    v_sub_f32_e32 v8, v7, v6
-; VI-GISEL-NEXT:    v_cndmask_b32_e64 v2, 1.0, v3, s[0:1]
-; VI-GISEL-NEXT:    v_mul_f32_e32 v9, 0x369a84fb, v8
-; VI-GISEL-NEXT:    v_mul_f32_e32 v10, 0x369a84fb, v6
-; VI-GISEL-NEXT:    v_mul_f32_e32 v2, s11, v2
-; VI-GISEL-NEXT:    v_add_f32_e32 v9, v10, v9
-; VI-GISEL-NEXT:    v_mul_f32_e32 v8, 0x3e9a2000, v8
-; VI-GISEL-NEXT:    v_log_f32_e32 v3, v2
-; VI-GISEL-NEXT:    v_add_f32_e32 v8, v8, v9
-; VI-GISEL-NEXT:    v_mul_f32_e32 v6, 0x3e9a2000, v6
-; VI-GISEL-NEXT:    v_add_f32_e32 v6, v6, v8
-; VI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[2:3], |v7|, v4
-; VI-GISEL-NEXT:    v_cndmask_b32_e64 v2, v7, v6, s[2:3]
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v6, 0, v5, vcc
-; VI-GISEL-NEXT:    v_sub_f32_e32 v2, v2, v6
-; VI-GISEL-NEXT:    v_and_b32_e32 v6, 0xfffff000, v3
-; VI-GISEL-NEXT:    v_sub_f32_e32 v7, v3, v6
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v8, 0x369a84fb, v7
-; VI-GISEL-NEXT:    v_mul_f32_e32 v9, 0x369a84fb, v6
+; VI-GISEL-NEXT:    v_mul_f32_e32 v9, 0x369a84fb, v5
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[0:1]
 ; VI-GISEL-NEXT:    v_add_f32_e32 v8, v9, v8
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v7, 0x3e9a2000, v7
+; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 5, v2
 ; VI-GISEL-NEXT:    v_add_f32_e32 v7, v7, v8
+; VI-GISEL-NEXT:    v_mul_f32_e32 v5, 0x3e9a2000, v5
+; VI-GISEL-NEXT:    v_ldexp_f32 v2, s11, v2
+; VI-GISEL-NEXT:    v_add_f32_e32 v5, v5, v7
+; VI-GISEL-NEXT:    v_log_f32_e32 v7, v2
+; VI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[2:3], |v6|, v3
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v2, v6, v5, s[2:3]
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v5, 0, v4, vcc
+; VI-GISEL-NEXT:    v_sub_f32_e32 v2, v2, v5
+; VI-GISEL-NEXT:    v_and_b32_e32 v5, 0xfffff000, v7
+; VI-GISEL-NEXT:    v_sub_f32_e32 v6, v7, v5
+; VI-GISEL-NEXT:    v_mul_f32_e32 v8, 0x369a84fb, v6
+; VI-GISEL-NEXT:    v_mul_f32_e32 v9, 0x369a84fb, v5
+; VI-GISEL-NEXT:    v_add_f32_e32 v8, v9, v8
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v6, 0x3e9a2000, v6
-; VI-GISEL-NEXT:    v_add_f32_e32 v6, v6, v7
-; VI-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v3|, v4
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v3, v3, v6, vcc
-; VI-GISEL-NEXT:    v_cndmask_b32_e64 v4, 0, v5, s[0:1]
+; VI-GISEL-NEXT:    v_add_f32_e32 v6, v6, v8
+; VI-GISEL-NEXT:    v_mul_f32_e32 v5, 0x3e9a2000, v5
+; VI-GISEL-NEXT:    v_add_f32_e32 v5, v5, v6
+; VI-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v7|, v3
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v3, v7, v5, vcc
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v4, 0, v4, s[0:1]
 ; VI-GISEL-NEXT:    v_sub_f32_e32 v3, v3, v4
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v4, s4
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v5, s5
@@ -1730,61 +1759,64 @@ define amdgpu_kernel void @s_log10_v4f32(ptr addrspace(1) %out, <4 x float> %in)
 ; GFX900-GISEL-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x34
 ; GFX900-GISEL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0x800000
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v3, 0x4f800000
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v4, 0x3e9a209a
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3e9a209a
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v5, 0x3284fbcf
 ; GFX900-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s8, v2
-; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v0, 1.0, v3, vcc
-; GFX900-GISEL-NEXT:    v_mul_f32_e32 v0, s8, v0
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX900-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 5, v0
+; GFX900-GISEL-NEXT:    v_ldexp_f32 v0, s8, v0
 ; GFX900-GISEL-NEXT:    v_log_f32_e32 v0, v0
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v5, 0x3284fbcf
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v6, 0x7f800000
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX900-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3e9a209a, v0
-; GFX900-GISEL-NEXT:    v_fma_f32 v7, v0, v4, -v1
+; GFX900-GISEL-NEXT:    v_fma_f32 v7, v0, v3, -v1
 ; GFX900-GISEL-NEXT:    v_fma_f32 v7, v0, v5, v7
 ; GFX900-GISEL-NEXT:    v_add_f32_e32 v1, v1, v7
 ; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], |v0|, v6
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[0:1]
 ; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], s9, v2
-; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v1, 1.0, v3, s[0:1]
-; GFX900-GISEL-NEXT:    v_mul_f32_e32 v1, s9, v1
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[0:1]
+; GFX900-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; GFX900-GISEL-NEXT:    v_ldexp_f32 v1, s9, v1
 ; GFX900-GISEL-NEXT:    v_log_f32_e32 v1, v1
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v7, 0x411a209b
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v8, 0, v7, vcc
 ; GFX900-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v8
 ; GFX900-GISEL-NEXT:    v_mul_f32_e32 v8, 0x3e9a209a, v1
-; GFX900-GISEL-NEXT:    v_fma_f32 v9, v1, v4, -v8
+; GFX900-GISEL-NEXT:    v_fma_f32 v9, v1, v3, -v8
 ; GFX900-GISEL-NEXT:    v_fma_f32 v9, v1, v5, v9
 ; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s10, v2
 ; GFX900-GISEL-NEXT:    v_add_f32_e32 v8, v8, v9
-; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v9, 1.0, v3, vcc
-; GFX900-GISEL-NEXT:    v_mul_f32_e32 v9, s10, v9
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
+; GFX900-GISEL-NEXT:    v_lshlrev_b32_e32 v9, 5, v9
+; GFX900-GISEL-NEXT:    v_ldexp_f32 v9, s10, v9
 ; GFX900-GISEL-NEXT:    v_log_f32_e32 v9, v9
 ; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e64 s[2:3], |v1|, v6
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v1, v1, v8, s[2:3]
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, v7, s[0:1]
 ; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], s11, v2
-; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v2, 1.0, v3, s[0:1]
 ; GFX900-GISEL-NEXT:    v_sub_f32_e32 v1, v1, v8
 ; GFX900-GISEL-NEXT:    v_mul_f32_e32 v8, 0x3e9a209a, v9
-; GFX900-GISEL-NEXT:    v_mul_f32_e32 v2, s11, v2
-; GFX900-GISEL-NEXT:    v_fma_f32 v10, v9, v4, -v8
-; GFX900-GISEL-NEXT:    v_log_f32_e32 v3, v2
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[0:1]
+; GFX900-GISEL-NEXT:    v_fma_f32 v10, v9, v3, -v8
+; GFX900-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 5, v2
 ; GFX900-GISEL-NEXT:    v_fma_f32 v10, v9, v5, v10
+; GFX900-GISEL-NEXT:    v_ldexp_f32 v2, s11, v2
 ; GFX900-GISEL-NEXT:    v_add_f32_e32 v8, v8, v10
+; GFX900-GISEL-NEXT:    v_log_f32_e32 v10, v2
 ; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e64 s[2:3], |v9|, v6
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v2, v9, v8, s[2:3]
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v8, 0, v7, vcc
 ; GFX900-GISEL-NEXT:    v_sub_f32_e32 v2, v2, v8
-; GFX900-GISEL-NEXT:    v_mul_f32_e32 v8, 0x3e9a209a, v3
-; GFX900-GISEL-NEXT:    v_fma_f32 v4, v3, v4, -v8
-; GFX900-GISEL-NEXT:    v_fma_f32 v4, v3, v5, v4
-; GFX900-GISEL-NEXT:    v_add_f32_e32 v4, v8, v4
-; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v3|, v6
-; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
-; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v4, 0, v7, s[0:1]
-; GFX900-GISEL-NEXT:    v_sub_f32_e32 v3, v3, v4
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v4, 0
+; GFX900-GISEL-NEXT:    v_mul_f32_e32 v8, 0x3e9a209a, v10
+; GFX900-GISEL-NEXT:    v_fma_f32 v3, v10, v3, -v8
+; GFX900-GISEL-NEXT:    v_fma_f32 v3, v10, v5, v3
+; GFX900-GISEL-NEXT:    v_add_f32_e32 v3, v8, v3
+; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v10|, v6
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v3, v10, v3, vcc
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, v7, s[0:1]
+; GFX900-GISEL-NEXT:    v_sub_f32_e32 v3, v3, v5
 ; GFX900-GISEL-NEXT:    global_store_dwordx4 v4, v[0:3], s[6:7]
 ; GFX900-GISEL-NEXT:    s_endpgm
 ;
@@ -1860,60 +1892,67 @@ define amdgpu_kernel void @s_log10_v4f32(ptr addrspace(1) %out, <4 x float> %in)
 ; GFX1100-GISEL:       ; %bb.0:
 ; GFX1100-GISEL-NEXT:    s_load_b128 s[0:3], s[4:5], 0x34
 ; GFX1100-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 s6, 0x800000, s0
-; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 s7, 0x800000, s1
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 s8, 0x800000, s2
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 s9, 0x800000, s3
+; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 s6, 0x800000, s0
+; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 s7, 0x800000, s1
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s6
-; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s7
+; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s8
+; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s9
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s8
-; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v3, 1.0, 0x4f800000, s9
+; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s6
+; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s7
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v4, 0, 0x411a209b, s6
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX1100-GISEL-NEXT:    v_dual_mul_f32 v0, s0, v0 :: v_dual_mul_f32 v1, s1, v1
-; GFX1100-GISEL-NEXT:    v_dual_mul_f32 v2, s2, v2 :: v_dual_mul_f32 v3, s3, v3
-; GFX1100-GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1100-GISEL-NEXT:    v_log_f32_e32 v0, v0
-; GFX1100-GISEL-NEXT:    v_log_f32_e32 v1, v1
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(TRANS32_DEP_3)
-; GFX1100-GISEL-NEXT:    v_log_f32_e32 v2, v2
-; GFX1100-GISEL-NEXT:    v_log_f32_e32 v3, v3
+; GFX1100-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 5, v2
+; GFX1100-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 5, v3
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v9, 0, 0x411a209b, s7
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v14, 0, 0x411a209b, s8
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v15, 0, 0x411a209b, s9
-; GFX1100-GISEL-NEXT:    v_dual_mul_f32 v5, 0x3e9a209a, v0 :: v_dual_mul_f32 v6, 0x3e9a209a, v1
+; GFX1100-GISEL-NEXT:    v_ldexp_f32 v2, s2, v2
+; GFX1100-GISEL-NEXT:    v_ldexp_f32 v3, s3, v3
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1100-GISEL-NEXT:    v_log_f32_e32 v2, v2
+; GFX1100-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 5, v0
+; GFX1100-GISEL-NEXT:    v_log_f32_e32 v3, v3
+; GFX1100-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; GFX1100-GISEL-NEXT:    s_waitcnt_depctr 0xfff
+; GFX1100-GISEL-NEXT:    v_mul_f32_e32 v7, 0x3e9a209a, v2
+; GFX1100-GISEL-NEXT:    v_ldexp_f32 v0, s0, v0
+; GFX1100-GISEL-NEXT:    v_mul_f32_e32 v8, 0x3e9a209a, v3
+; GFX1100-GISEL-NEXT:    v_ldexp_f32 v1, s1, v1
+; GFX1100-GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1100-GISEL-NEXT:    v_fma_f32 v12, 0x3e9a209a, v2, -v7
+; GFX1100-GISEL-NEXT:    v_log_f32_e32 v0, v0
+; GFX1100-GISEL-NEXT:    v_fma_f32 v13, 0x3e9a209a, v3, -v8
+; GFX1100-GISEL-NEXT:    v_log_f32_e32 v1, v1
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-GISEL-NEXT:    v_dual_fmac_f32 v12, 0x3284fbcf, v2 :: v_dual_fmac_f32 v13, 0x3284fbcf, v3
+; GFX1100-GISEL-NEXT:    v_add_f32_e32 v7, v7, v12
 ; GFX1100-GISEL-NEXT:    s_waitcnt_depctr 0xfff
-; GFX1100-GISEL-NEXT:    v_dual_mul_f32 v7, 0x3e9a209a, v2 :: v_dual_mul_f32 v8, 0x3e9a209a, v3
+; GFX1100-GISEL-NEXT:    v_dual_mul_f32 v5, 0x3e9a209a, v0 :: v_dual_add_f32 v8, v8, v13
+; GFX1100-GISEL-NEXT:    v_mul_f32_e32 v6, 0x3e9a209a, v1
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v0|
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX1100-GISEL-NEXT:    v_fma_f32 v10, 0x3e9a209a, v0, -v5
 ; GFX1100-GISEL-NEXT:    v_fma_f32 v11, 0x3e9a209a, v1, -v6
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX1100-GISEL-NEXT:    v_fma_f32 v12, 0x3e9a209a, v2, -v7
-; GFX1100-GISEL-NEXT:    v_fma_f32 v13, 0x3e9a209a, v3, -v8
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_dual_fmac_f32 v10, 0x3284fbcf, v0 :: v_dual_fmac_f32 v11, 0x3284fbcf, v1
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1100-GISEL-NEXT:    v_dual_fmac_f32 v12, 0x3284fbcf, v2 :: v_dual_fmac_f32 v13, 0x3284fbcf, v3
 ; GFX1100-GISEL-NEXT:    v_dual_add_f32 v5, v5, v10 :: v_dual_add_f32 v6, v6, v11
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1100-GISEL-NEXT:    v_dual_add_f32 v7, v7, v12 :: v_dual_add_f32 v8, v8, v13
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v5, vcc_lo
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v1|
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e32 v1, v1, v6, vcc_lo
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v2|
-; GFX1100-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v4
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX1100-GISEL-NEXT:    v_dual_mov_b32 v4, 0 :: v_dual_sub_f32 v1, v1, v9
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX1100-GISEL-NEXT:    v_dual_mov_b32 v5, 0 :: v_dual_sub_f32 v0, v0, v4
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e32 v2, v2, v7, vcc_lo
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v3|
-; GFX1100-GISEL-NEXT:    v_dual_cndmask_b32 v3, v3, v8 :: v_dual_sub_f32 v2, v2, v14
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1100-GISEL-NEXT:    v_cndmask_b32_e32 v3, v3, v8, vcc_lo
+; GFX1100-GISEL-NEXT:    v_dual_sub_f32 v1, v1, v9 :: v_dual_sub_f32 v2, v2, v14
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX1100-GISEL-NEXT:    v_sub_f32_e32 v3, v3, v15
 ; GFX1100-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1100-GISEL-NEXT:    global_store_b128 v4, v[0:3], s[0:1]
+; GFX1100-GISEL-NEXT:    global_store_b128 v5, v[0:3], s[0:1]
 ; GFX1100-GISEL-NEXT:    s_endpgm
 ;
 ; R600-LABEL: s_log10_v4f32:
@@ -2126,10 +2165,10 @@ define float @v_log10_f32(float %in) {
 ; SI-GISEL:       ; %bb.0:
 ; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
-; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; SI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; SI-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; SI-GISEL-NEXT:    v_ldexp_f32_e32 v0, v0, v1
 ; SI-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3e9a209a
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3284fbcf
@@ -2175,16 +2214,16 @@ define float @v_log10_f32(float %in) {
 ; VI-GISEL:       ; %bb.0:
 ; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
-; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; VI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; VI-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; VI-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
 ; VI-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; VI-GISEL-NEXT:    v_and_b32_e32 v1, 0xfffff000, v0
 ; VI-GISEL-NEXT:    v_sub_f32_e32 v2, v0, v1
-; VI-GISEL-NEXT:    v_mul_f32_e32 v3, 0x369a84fb, v2
-; VI-GISEL-NEXT:    v_mul_f32_e32 v4, 0x369a84fb, v1
-; VI-GISEL-NEXT:    v_add_f32_e32 v3, v4, v3
+; VI-GISEL-NEXT:    v_mul_f32_e32 v3, 0x369a84fb, v1
+; VI-GISEL-NEXT:    v_mul_f32_e32 v4, 0x369a84fb, v2
+; VI-GISEL-NEXT:    v_add_f32_e32 v3, v3, v4
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v2, 0x3e9a2000, v2
 ; VI-GISEL-NEXT:    v_add_f32_e32 v2, v2, v3
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3e9a2000, v1
@@ -2224,10 +2263,10 @@ define float @v_log10_f32(float %in) {
 ; GFX900-GISEL:       ; %bb.0:
 ; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
-; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; GFX900-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX900-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; GFX900-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
 ; GFX900-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3e9a209a
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3284fbcf
@@ -2270,21 +2309,22 @@ define float @v_log10_f32(float %in) {
 ; GFX1100-GISEL:       ; %bb.0:
 ; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
-; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo
+; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX1100-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; GFX1100-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
 ; GFX1100-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; GFX1100-GISEL-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX1100-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3e9a209a, v0
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 s0, 0x7f800000, |v0|
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_fma_f32 v2, 0x3e9a209a, v0, -v1
-; GFX1100-GISEL-NEXT:    v_fmac_f32_e32 v2, 0x3284fbcf, v0
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-GISEL-NEXT:    v_fmac_f32_e32 v2, 0x3284fbcf, v0
 ; GFX1100-GISEL-NEXT:    v_add_f32_e32 v1, v1, v2
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s0
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 0x411a209b, vcc_lo
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; GFX1100-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -2329,10 +2369,10 @@ define float @v_log10_fabs_f32(float %in) {
 ; SI-GISEL:       ; %bb.0:
 ; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
-; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; SI-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, v1
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; SI-GISEL-NEXT:    v_mul_f32_e64 v0, |v0|, v1
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; SI-GISEL-NEXT:    v_ldexp_f32_e64 v0, |v0|, v1
 ; SI-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3e9a209a
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3284fbcf
@@ -2378,16 +2418,16 @@ define float @v_log10_fabs_f32(float %in) {
 ; VI-GISEL:       ; %bb.0:
 ; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
-; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; VI-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, v1
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; VI-GISEL-NEXT:    v_mul_f32_e64 v0, |v0|, v1
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; VI-GISEL-NEXT:    v_ldexp_f32 v0, |v0|, v1
 ; VI-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; VI-GISEL-NEXT:    v_and_b32_e32 v1, 0xfffff000, v0
 ; VI-GISEL-NEXT:    v_sub_f32_e32 v2, v0, v1
-; VI-GISEL-NEXT:    v_mul_f32_e32 v3, 0x369a84fb, v2
-; VI-GISEL-NEXT:    v_mul_f32_e32 v4, 0x369a84fb, v1
-; VI-GISEL-NEXT:    v_add_f32_e32 v3, v4, v3
+; VI-GISEL-NEXT:    v_mul_f32_e32 v3, 0x369a84fb, v1
+; VI-GISEL-NEXT:    v_mul_f32_e32 v4, 0x369a84fb, v2
+; VI-GISEL-NEXT:    v_add_f32_e32 v3, v3, v4
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v2, 0x3e9a2000, v2
 ; VI-GISEL-NEXT:    v_add_f32_e32 v2, v2, v3
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3e9a2000, v1
@@ -2427,10 +2467,10 @@ define float @v_log10_fabs_f32(float %in) {
 ; GFX900-GISEL:       ; %bb.0:
 ; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, v1
-; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; GFX900-GISEL-NEXT:    v_mul_f32_e64 v0, |v0|, v1
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX900-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; GFX900-GISEL-NEXT:    v_ldexp_f32 v0, |v0|, v1
 ; GFX900-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3e9a209a
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3284fbcf
@@ -2475,20 +2515,22 @@ define float @v_log10_fabs_f32(float %in) {
 ; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 s0, 0x800000, |v0|
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s0
-; GFX1100-GISEL-NEXT:    v_mul_f32_e64 v0, |v0|, v1
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s0
+; GFX1100-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-GISEL-NEXT:    v_ldexp_f32 v0, |v0|, v1
 ; GFX1100-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; GFX1100-GISEL-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX1100-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3e9a209a, v0
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v0|
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_fma_f32 v2, 0x3e9a209a, v0, -v1
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_fmac_f32_e32 v2, 0x3284fbcf, v0
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_add_f32_e32 v1, v1, v2
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 0x411a209b, s0
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; GFX1100-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -2534,10 +2576,10 @@ define float @v_log10_fneg_fabs_f32(float %in) {
 ; SI-GISEL:       ; %bb.0:
 ; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
-; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; SI-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, -|v0|, v1
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; SI-GISEL-NEXT:    v_mul_f32_e64 v0, -|v0|, v1
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; SI-GISEL-NEXT:    v_ldexp_f32_e64 v0, -|v0|, v1
 ; SI-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3e9a209a
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3284fbcf
@@ -2583,16 +2625,16 @@ define float @v_log10_fneg_fabs_f32(float %in) {
 ; VI-GISEL:       ; %bb.0:
 ; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
-; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; VI-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, -|v0|, v1
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; VI-GISEL-NEXT:    v_mul_f32_e64 v0, -|v0|, v1
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; VI-GISEL-NEXT:    v_ldexp_f32 v0, -|v0|, v1
 ; VI-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; VI-GISEL-NEXT:    v_and_b32_e32 v1, 0xfffff000, v0
 ; VI-GISEL-NEXT:    v_sub_f32_e32 v2, v0, v1
-; VI-GISEL-NEXT:    v_mul_f32_e32 v3, 0x369a84fb, v2
-; VI-GISEL-NEXT:    v_mul_f32_e32 v4, 0x369a84fb, v1
-; VI-GISEL-NEXT:    v_add_f32_e32 v3, v4, v3
+; VI-GISEL-NEXT:    v_mul_f32_e32 v3, 0x369a84fb, v1
+; VI-GISEL-NEXT:    v_mul_f32_e32 v4, 0x369a84fb, v2
+; VI-GISEL-NEXT:    v_add_f32_e32 v3, v3, v4
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v2, 0x3e9a2000, v2
 ; VI-GISEL-NEXT:    v_add_f32_e32 v2, v2, v3
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3e9a2000, v1
@@ -2632,10 +2674,10 @@ define float @v_log10_fneg_fabs_f32(float %in) {
 ; GFX900-GISEL:       ; %bb.0:
 ; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, -|v0|, v1
-; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; GFX900-GISEL-NEXT:    v_mul_f32_e64 v0, -|v0|, v1
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX900-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; GFX900-GISEL-NEXT:    v_ldexp_f32 v0, -|v0|, v1
 ; GFX900-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3e9a209a
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3284fbcf
@@ -2680,20 +2722,22 @@ define float @v_log10_fneg_fabs_f32(float %in) {
 ; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 s0, 0x800000, -|v0|
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s0
-; GFX1100-GISEL-NEXT:    v_mul_f32_e64 v0, -|v0|, v1
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s0
+; GFX1100-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-GISEL-NEXT:    v_ldexp_f32 v0, -|v0|, v1
 ; GFX1100-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; GFX1100-GISEL-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX1100-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3e9a209a, v0
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v0|
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_fma_f32 v2, 0x3e9a209a, v0, -v1
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_fmac_f32_e32 v2, 0x3284fbcf, v0
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_add_f32_e32 v1, v1, v2
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 0x411a209b, s0
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; GFX1100-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -2740,10 +2784,10 @@ define float @v_log10_fneg_f32(float %in) {
 ; SI-GISEL:       ; %bb.0:
 ; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
-; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; SI-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, -v0, v1
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; SI-GISEL-NEXT:    v_mul_f32_e64 v0, -v0, v1
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; SI-GISEL-NEXT:    v_ldexp_f32_e64 v0, -v0, v1
 ; SI-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3e9a209a
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3284fbcf
@@ -2789,16 +2833,16 @@ define float @v_log10_fneg_f32(float %in) {
 ; VI-GISEL:       ; %bb.0:
 ; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
-; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; VI-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, -v0, v1
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; VI-GISEL-NEXT:    v_mul_f32_e64 v0, -v0, v1
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; VI-GISEL-NEXT:    v_ldexp_f32 v0, -v0, v1
 ; VI-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; VI-GISEL-NEXT:    v_and_b32_e32 v1, 0xfffff000, v0
 ; VI-GISEL-NEXT:    v_sub_f32_e32 v2, v0, v1
-; VI-GISEL-NEXT:    v_mul_f32_e32 v3, 0x369a84fb, v2
-; VI-GISEL-NEXT:    v_mul_f32_e32 v4, 0x369a84fb, v1
-; VI-GISEL-NEXT:    v_add_f32_e32 v3, v4, v3
+; VI-GISEL-NEXT:    v_mul_f32_e32 v3, 0x369a84fb, v1
+; VI-GISEL-NEXT:    v_mul_f32_e32 v4, 0x369a84fb, v2
+; VI-GISEL-NEXT:    v_add_f32_e32 v3, v3, v4
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v2, 0x3e9a2000, v2
 ; VI-GISEL-NEXT:    v_add_f32_e32 v2, v2, v3
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3e9a2000, v1
@@ -2838,10 +2882,10 @@ define float @v_log10_fneg_f32(float %in) {
 ; GFX900-GISEL:       ; %bb.0:
 ; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, -v0, v1
-; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; GFX900-GISEL-NEXT:    v_mul_f32_e64 v0, -v0, v1
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX900-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; GFX900-GISEL-NEXT:    v_ldexp_f32 v0, -v0, v1
 ; GFX900-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3e9a209a
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3284fbcf
@@ -2885,20 +2929,22 @@ define float @v_log10_fneg_f32(float %in) {
 ; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 s0, 0x800000, -v0
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s0
-; GFX1100-GISEL-NEXT:    v_mul_f32_e64 v0, -v0, v1
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s0
+; GFX1100-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-GISEL-NEXT:    v_ldexp_f32 v0, -v0, v1
 ; GFX1100-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; GFX1100-GISEL-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX1100-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3e9a209a, v0
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v0|
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_fma_f32 v2, 0x3e9a209a, v0, -v1
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_fmac_f32_e32 v2, 0x3284fbcf, v0
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_add_f32_e32 v1, v1, v2
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 0x411a209b, s0
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; GFX1100-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -3304,10 +3350,10 @@ define float @v_log10_f32_ninf(float %in) {
 ; SI-GISEL:       ; %bb.0:
 ; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
-; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; SI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; SI-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; SI-GISEL-NEXT:    v_ldexp_f32_e32 v0, v0, v1
 ; SI-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3e9a209a
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3284fbcf
@@ -3353,16 +3399,16 @@ define float @v_log10_f32_ninf(float %in) {
 ; VI-GISEL:       ; %bb.0:
 ; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
-; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; VI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; VI-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; VI-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
 ; VI-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; VI-GISEL-NEXT:    v_and_b32_e32 v1, 0xfffff000, v0
 ; VI-GISEL-NEXT:    v_sub_f32_e32 v2, v0, v1
-; VI-GISEL-NEXT:    v_mul_f32_e32 v3, 0x369a84fb, v2
-; VI-GISEL-NEXT:    v_mul_f32_e32 v4, 0x369a84fb, v1
-; VI-GISEL-NEXT:    v_add_f32_e32 v3, v4, v3
+; VI-GISEL-NEXT:    v_mul_f32_e32 v3, 0x369a84fb, v1
+; VI-GISEL-NEXT:    v_mul_f32_e32 v4, 0x369a84fb, v2
+; VI-GISEL-NEXT:    v_add_f32_e32 v3, v3, v4
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v2, 0x3e9a2000, v2
 ; VI-GISEL-NEXT:    v_add_f32_e32 v2, v2, v3
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3e9a2000, v1
@@ -3402,10 +3448,10 @@ define float @v_log10_f32_ninf(float %in) {
 ; GFX900-GISEL:       ; %bb.0:
 ; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
-; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; GFX900-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX900-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; GFX900-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
 ; GFX900-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3e9a209a
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3284fbcf
@@ -3448,21 +3494,22 @@ define float @v_log10_f32_ninf(float %in) {
 ; GFX1100-GISEL:       ; %bb.0:
 ; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
-; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo
+; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX1100-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; GFX1100-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
 ; GFX1100-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; GFX1100-GISEL-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX1100-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3e9a209a, v0
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 s0, 0x7f800000, |v0|
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_fma_f32 v2, 0x3e9a209a, v0, -v1
-; GFX1100-GISEL-NEXT:    v_fmac_f32_e32 v2, 0x3284fbcf, v0
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-GISEL-NEXT:    v_fmac_f32_e32 v2, 0x3284fbcf, v0
 ; GFX1100-GISEL-NEXT:    v_add_f32_e32 v1, v1, v2
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s0
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 0x411a209b, vcc_lo
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; GFX1100-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -4038,10 +4085,10 @@ define float @v_log10_f32_nnan(float %in) {
 ; SI-GISEL:       ; %bb.0:
 ; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
-; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; SI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; SI-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; SI-GISEL-NEXT:    v_ldexp_f32_e32 v0, v0, v1
 ; SI-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3e9a209a
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3284fbcf
@@ -4087,16 +4134,16 @@ define float @v_log10_f32_nnan(float %in) {
 ; VI-GISEL:       ; %bb.0:
 ; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
-; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; VI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; VI-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; VI-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
 ; VI-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; VI-GISEL-NEXT:    v_and_b32_e32 v1, 0xfffff000, v0
 ; VI-GISEL-NEXT:    v_sub_f32_e32 v2, v0, v1
-; VI-GISEL-NEXT:    v_mul_f32_e32 v3, 0x369a84fb, v2
-; VI-GISEL-NEXT:    v_mul_f32_e32 v4, 0x369a84fb, v1
-; VI-GISEL-NEXT:    v_add_f32_e32 v3, v4, v3
+; VI-GISEL-NEXT:    v_mul_f32_e32 v3, 0x369a84fb, v1
+; VI-GISEL-NEXT:    v_mul_f32_e32 v4, 0x369a84fb, v2
+; VI-GISEL-NEXT:    v_add_f32_e32 v3, v3, v4
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v2, 0x3e9a2000, v2
 ; VI-GISEL-NEXT:    v_add_f32_e32 v2, v2, v3
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3e9a2000, v1
@@ -4136,10 +4183,10 @@ define float @v_log10_f32_nnan(float %in) {
 ; GFX900-GISEL:       ; %bb.0:
 ; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
-; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; GFX900-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX900-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; GFX900-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
 ; GFX900-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3e9a209a
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3284fbcf
@@ -4182,21 +4229,22 @@ define float @v_log10_f32_nnan(float %in) {
 ; GFX1100-GISEL:       ; %bb.0:
 ; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
-; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo
+; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX1100-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; GFX1100-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
 ; GFX1100-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; GFX1100-GISEL-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX1100-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3e9a209a, v0
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 s0, 0x7f800000, |v0|
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_fma_f32 v2, 0x3e9a209a, v0, -v1
-; GFX1100-GISEL-NEXT:    v_fmac_f32_e32 v2, 0x3284fbcf, v0
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-GISEL-NEXT:    v_fmac_f32_e32 v2, 0x3284fbcf, v0
 ; GFX1100-GISEL-NEXT:    v_add_f32_e32 v1, v1, v2
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s0
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 0x411a209b, vcc_lo
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; GFX1100-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -4381,10 +4429,10 @@ define float @v_log10_f32_nnan_dynamic(float %in) #1 {
 ; SI-GISEL:       ; %bb.0:
 ; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
-; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; SI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; SI-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; SI-GISEL-NEXT:    v_ldexp_f32_e32 v0, v0, v1
 ; SI-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3e9a209a
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3284fbcf
@@ -4430,16 +4478,16 @@ define float @v_log10_f32_nnan_dynamic(float %in) #1 {
 ; VI-GISEL:       ; %bb.0:
 ; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
-; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; VI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; VI-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; VI-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
 ; VI-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; VI-GISEL-NEXT:    v_and_b32_e32 v1, 0xfffff000, v0
 ; VI-GISEL-NEXT:    v_sub_f32_e32 v2, v0, v1
-; VI-GISEL-NEXT:    v_mul_f32_e32 v3, 0x369a84fb, v2
-; VI-GISEL-NEXT:    v_mul_f32_e32 v4, 0x369a84fb, v1
-; VI-GISEL-NEXT:    v_add_f32_e32 v3, v4, v3
+; VI-GISEL-NEXT:    v_mul_f32_e32 v3, 0x369a84fb, v1
+; VI-GISEL-NEXT:    v_mul_f32_e32 v4, 0x369a84fb, v2
+; VI-GISEL-NEXT:    v_add_f32_e32 v3, v3, v4
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v2, 0x3e9a2000, v2
 ; VI-GISEL-NEXT:    v_add_f32_e32 v2, v2, v3
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3e9a2000, v1
@@ -4479,10 +4527,10 @@ define float @v_log10_f32_nnan_dynamic(float %in) #1 {
 ; GFX900-GISEL:       ; %bb.0:
 ; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
-; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; GFX900-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX900-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; GFX900-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
 ; GFX900-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3e9a209a
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3284fbcf
@@ -4525,21 +4573,22 @@ define float @v_log10_f32_nnan_dynamic(float %in) #1 {
 ; GFX1100-GISEL:       ; %bb.0:
 ; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
-; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo
+; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX1100-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; GFX1100-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
 ; GFX1100-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; GFX1100-GISEL-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX1100-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3e9a209a, v0
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 s0, 0x7f800000, |v0|
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_fma_f32 v2, 0x3e9a209a, v0, -v1
-; GFX1100-GISEL-NEXT:    v_fmac_f32_e32 v2, 0x3284fbcf, v0
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-GISEL-NEXT:    v_fmac_f32_e32 v2, 0x3284fbcf, v0
 ; GFX1100-GISEL-NEXT:    v_add_f32_e32 v1, v1, v2
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s0
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 0x411a209b, vcc_lo
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; GFX1100-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -4724,10 +4773,10 @@ define float @v_log10_f32_ninf_dynamic(float %in) #1 {
 ; SI-GISEL:       ; %bb.0:
 ; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
-; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; SI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; SI-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; SI-GISEL-NEXT:    v_ldexp_f32_e32 v0, v0, v1
 ; SI-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3e9a209a
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3284fbcf
@@ -4773,16 +4822,16 @@ define float @v_log10_f32_ninf_dynamic(float %in) #1 {
 ; VI-GISEL:       ; %bb.0:
 ; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
-; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; VI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; VI-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; VI-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
 ; VI-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; VI-GISEL-NEXT:    v_and_b32_e32 v1, 0xfffff000, v0
 ; VI-GISEL-NEXT:    v_sub_f32_e32 v2, v0, v1
-; VI-GISEL-NEXT:    v_mul_f32_e32 v3, 0x369a84fb, v2
-; VI-GISEL-NEXT:    v_mul_f32_e32 v4, 0x369a84fb, v1
-; VI-GISEL-NEXT:    v_add_f32_e32 v3, v4, v3
+; VI-GISEL-NEXT:    v_mul_f32_e32 v3, 0x369a84fb, v1
+; VI-GISEL-NEXT:    v_mul_f32_e32 v4, 0x369a84fb, v2
+; VI-GISEL-NEXT:    v_add_f32_e32 v3, v3, v4
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v2, 0x3e9a2000, v2
 ; VI-GISEL-NEXT:    v_add_f32_e32 v2, v2, v3
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3e9a2000, v1
@@ -4822,10 +4871,10 @@ define float @v_log10_f32_ninf_dynamic(float %in) #1 {
 ; GFX900-GISEL:       ; %bb.0:
 ; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
-; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; GFX900-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX900-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; GFX900-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
 ; GFX900-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3e9a209a
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3284fbcf
@@ -4868,21 +4917,22 @@ define float @v_log10_f32_ninf_dynamic(float %in) #1 {
 ; GFX1100-GISEL:       ; %bb.0:
 ; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
-; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo
+; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX1100-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; GFX1100-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
 ; GFX1100-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; GFX1100-GISEL-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX1100-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3e9a209a, v0
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 s0, 0x7f800000, |v0|
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_fma_f32 v2, 0x3e9a209a, v0, -v1
-; GFX1100-GISEL-NEXT:    v_fmac_f32_e32 v2, 0x3284fbcf, v0
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-GISEL-NEXT:    v_fmac_f32_e32 v2, 0x3284fbcf, v0
 ; GFX1100-GISEL-NEXT:    v_add_f32_e32 v1, v1, v2
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s0
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 0x411a209b, vcc_lo
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; GFX1100-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -4924,10 +4974,10 @@ define float @v_log10_f32_nnan_ninf(float %in) {
 ; SI-GISEL:       ; %bb.0:
 ; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
-; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; SI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; SI-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; SI-GISEL-NEXT:    v_ldexp_f32_e32 v0, v0, v1
 ; SI-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3e9a209a
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3284fbcf
@@ -4967,16 +5017,16 @@ define float @v_log10_f32_nnan_ninf(float %in) {
 ; VI-GISEL:       ; %bb.0:
 ; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
-; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; VI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; VI-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; VI-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
 ; VI-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; VI-GISEL-NEXT:    v_and_b32_e32 v1, 0xfffff000, v0
 ; VI-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
-; VI-GISEL-NEXT:    v_mul_f32_e32 v2, 0x369a84fb, v0
-; VI-GISEL-NEXT:    v_mul_f32_e32 v3, 0x369a84fb, v1
-; VI-GISEL-NEXT:    v_add_f32_e32 v2, v3, v2
+; VI-GISEL-NEXT:    v_mul_f32_e32 v2, 0x369a84fb, v1
+; VI-GISEL-NEXT:    v_mul_f32_e32 v3, 0x369a84fb, v0
+; VI-GISEL-NEXT:    v_add_f32_e32 v2, v2, v3
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v0, 0x3e9a2000, v0
 ; VI-GISEL-NEXT:    v_add_f32_e32 v0, v0, v2
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3e9a2000, v1
@@ -5010,10 +5060,10 @@ define float @v_log10_f32_nnan_ninf(float %in) {
 ; GFX900-GISEL:       ; %bb.0:
 ; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
-; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; GFX900-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX900-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; GFX900-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
 ; GFX900-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3e9a209a
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3284fbcf
@@ -5051,18 +5101,20 @@ define float @v_log10_f32_nnan_ninf(float %in) {
 ; GFX1100-GISEL:       ; %bb.0:
 ; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
-; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo
+; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX1100-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; GFX1100-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; GFX1100-GISEL-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX1100-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3e9a209a, v0
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_fma_f32 v2, 0x3e9a209a, v0, -v1
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_fmac_f32_e32 v2, 0x3284fbcf, v0
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_add_f32_e32 v0, v1, v2
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 0x411a209b, vcc_lo
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; GFX1100-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -5207,10 +5259,10 @@ define float @v_log10_f32_nnan_ninf_dynamic(float %in) #1 {
 ; SI-GISEL:       ; %bb.0:
 ; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
-; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; SI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; SI-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; SI-GISEL-NEXT:    v_ldexp_f32_e32 v0, v0, v1
 ; SI-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3e9a209a
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3284fbcf
@@ -5250,16 +5302,16 @@ define float @v_log10_f32_nnan_ninf_dynamic(float %in) #1 {
 ; VI-GISEL:       ; %bb.0:
 ; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
-; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; VI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; VI-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; VI-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
 ; VI-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; VI-GISEL-NEXT:    v_and_b32_e32 v1, 0xfffff000, v0
 ; VI-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
-; VI-GISEL-NEXT:    v_mul_f32_e32 v2, 0x369a84fb, v0
-; VI-GISEL-NEXT:    v_mul_f32_e32 v3, 0x369a84fb, v1
-; VI-GISEL-NEXT:    v_add_f32_e32 v2, v3, v2
+; VI-GISEL-NEXT:    v_mul_f32_e32 v2, 0x369a84fb, v1
+; VI-GISEL-NEXT:    v_mul_f32_e32 v3, 0x369a84fb, v0
+; VI-GISEL-NEXT:    v_add_f32_e32 v2, v2, v3
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v0, 0x3e9a2000, v0
 ; VI-GISEL-NEXT:    v_add_f32_e32 v0, v0, v2
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3e9a2000, v1
@@ -5293,10 +5345,10 @@ define float @v_log10_f32_nnan_ninf_dynamic(float %in) #1 {
 ; GFX900-GISEL:       ; %bb.0:
 ; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
-; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; GFX900-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX900-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; GFX900-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
 ; GFX900-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3e9a209a
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3284fbcf
@@ -5334,18 +5386,20 @@ define float @v_log10_f32_nnan_ninf_dynamic(float %in) #1 {
 ; GFX1100-GISEL:       ; %bb.0:
 ; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
-; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo
+; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX1100-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; GFX1100-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; GFX1100-GISEL-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX1100-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3e9a209a, v0
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_fma_f32 v2, 0x3e9a209a, v0, -v1
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_fmac_f32_e32 v2, 0x3284fbcf, v0
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_add_f32_e32 v0, v1, v2
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 0x411a209b, vcc_lo
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; GFX1100-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -5419,10 +5473,10 @@ define float @v_log10_f32_dynamic_mode(float %in) #1 {
 ; SI-GISEL:       ; %bb.0:
 ; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
-; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; SI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; SI-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; SI-GISEL-NEXT:    v_ldexp_f32_e32 v0, v0, v1
 ; SI-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3e9a209a
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3284fbcf
@@ -5468,16 +5522,16 @@ define float @v_log10_f32_dynamic_mode(float %in) #1 {
 ; VI-GISEL:       ; %bb.0:
 ; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
-; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; VI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; VI-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; VI-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
 ; VI-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; VI-GISEL-NEXT:    v_and_b32_e32 v1, 0xfffff000, v0
 ; VI-GISEL-NEXT:    v_sub_f32_e32 v2, v0, v1
-; VI-GISEL-NEXT:    v_mul_f32_e32 v3, 0x369a84fb, v2
-; VI-GISEL-NEXT:    v_mul_f32_e32 v4, 0x369a84fb, v1
-; VI-GISEL-NEXT:    v_add_f32_e32 v3, v4, v3
+; VI-GISEL-NEXT:    v_mul_f32_e32 v3, 0x369a84fb, v1
+; VI-GISEL-NEXT:    v_mul_f32_e32 v4, 0x369a84fb, v2
+; VI-GISEL-NEXT:    v_add_f32_e32 v3, v3, v4
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v2, 0x3e9a2000, v2
 ; VI-GISEL-NEXT:    v_add_f32_e32 v2, v2, v3
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3e9a2000, v1
@@ -5517,10 +5571,10 @@ define float @v_log10_f32_dynamic_mode(float %in) #1 {
 ; GFX900-GISEL:       ; %bb.0:
 ; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
-; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; GFX900-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX900-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; GFX900-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
 ; GFX900-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3e9a209a
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3284fbcf
@@ -5563,21 +5617,22 @@ define float @v_log10_f32_dynamic_mode(float %in) #1 {
 ; GFX1100-GISEL:       ; %bb.0:
 ; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
-; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo
+; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX1100-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; GFX1100-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
 ; GFX1100-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; GFX1100-GISEL-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX1100-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3e9a209a, v0
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 s0, 0x7f800000, |v0|
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_fma_f32 v2, 0x3e9a209a, v0, -v1
-; GFX1100-GISEL-NEXT:    v_fmac_f32_e32 v2, 0x3284fbcf, v0
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-GISEL-NEXT:    v_fmac_f32_e32 v2, 0x3284fbcf, v0
 ; GFX1100-GISEL-NEXT:    v_add_f32_e32 v1, v1, v2
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s0
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 0x411a209b, vcc_lo
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; GFX1100-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.log2.ll b/llvm/test/CodeGen/AMDGPU/llvm.log2.ll
index 2c5a9f5..8b3b79b 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.log2.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.log2.ll
@@ -36,14 +36,14 @@ define amdgpu_kernel void @s_log2_f32(ptr addrspace(1) %out, float %in) {
 ; SI-GISEL-NEXT:    s_load_dword s2, s[4:5], 0xb
 ; SI-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v0, 0x800000
-; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x4f800000
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x42000000
 ; SI-GISEL-NEXT:    s_mov_b32 s3, 0xf000
 ; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s2, v0
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, 1.0, v1, vcc
-; SI-GISEL-NEXT:    v_mul_f32_e32 v0, s2, v0
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 5, v0
+; SI-GISEL-NEXT:    v_ldexp_f32_e32 v0, s2, v0
 ; SI-GISEL-NEXT:    v_log_f32_e32 v0, v0
-; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x42000000
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; SI-GISEL-NEXT:    s_mov_b32 s2, -1
 ; SI-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
@@ -74,13 +74,13 @@ define amdgpu_kernel void @s_log2_f32(ptr addrspace(1) %out, float %in) {
 ; VI-GISEL-NEXT:    s_load_dword s2, s[4:5], 0x2c
 ; VI-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v0, 0x800000
-; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x4f800000
+; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x42000000
 ; VI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s2, v0
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v0, 1.0, v1, vcc
-; VI-GISEL-NEXT:    v_mul_f32_e32 v0, s2, v0
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 5, v0
+; VI-GISEL-NEXT:    v_ldexp_f32 v0, s2, v0
 ; VI-GISEL-NEXT:    v_log_f32_e32 v0, v0
-; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x42000000
 ; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; VI-GISEL-NEXT:    v_sub_f32_e32 v2, v0, v1
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v0, s0
@@ -108,20 +108,19 @@ define amdgpu_kernel void @s_log2_f32(ptr addrspace(1) %out, float %in) {
 ;
 ; GFX900-GISEL-LABEL: s_log2_f32:
 ; GFX900-GISEL:       ; %bb.0:
-; GFX900-GISEL-NEXT:    s_load_dword s0, s[4:5], 0x2c
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v0, 0x800000
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x4f800000
-; GFX900-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s0, v0
-; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v0, 1.0, v1, vcc
-; GFX900-GISEL-NEXT:    v_mul_f32_e32 v0, s0, v0
-; GFX900-GISEL-NEXT:    v_log_f32_e32 v0, v0
+; GFX900-GISEL-NEXT:    s_load_dword s2, s[4:5], 0x2c
 ; GFX900-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x42000000
-; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
-; GFX900-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v0, 0x800000
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0x42000000
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX900-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s2, v0
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX900-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 5, v0
+; GFX900-GISEL-NEXT:    v_ldexp_f32 v0, s2, v0
+; GFX900-GISEL-NEXT:    v_log_f32_e32 v0, v0
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
+; GFX900-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v2
 ; GFX900-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX900-GISEL-NEXT:    s_endpgm
 ;
@@ -147,20 +146,22 @@ define amdgpu_kernel void @s_log2_f32(ptr addrspace(1) %out, float %in) {
 ;
 ; GFX1100-GISEL-LABEL: s_log2_f32:
 ; GFX1100-GISEL:       ; %bb.0:
-; GFX1100-GISEL-NEXT:    s_load_b32 s0, s[4:5], 0x2c
+; GFX1100-GISEL-NEXT:    s_clause 0x1
+; GFX1100-GISEL-NEXT:    s_load_b32 s2, s[4:5], 0x2c
+; GFX1100-GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1100-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1100-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 s2, 0x800000, s0
+; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 s3, 0x800000, s2
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s2
-; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 0x42000000, s2
-; GFX1100-GISEL-NEXT:    v_mul_f32_e32 v0, s0, v0
-; GFX1100-GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s3
+; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 0x42000000, s3
+; GFX1100-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 5, v0
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-GISEL-NEXT:    v_ldexp_f32 v0, s2, v0
 ; GFX1100-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; GFX1100-GISEL-NEXT:    s_waitcnt_depctr 0xfff
-; GFX1100-GISEL-NEXT:    v_dual_sub_f32 v0, v0, v1 :: v_dual_mov_b32 v1, 0
-; GFX1100-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1100-GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX1100-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX1100-GISEL-NEXT:    global_store_b32 v2, v0, s[0:1]
 ; GFX1100-GISEL-NEXT:    s_endpgm
 ;
 ; R600-LABEL: s_log2_f32:
@@ -242,21 +243,22 @@ define amdgpu_kernel void @s_log2_v2f32(ptr addrspace(1) %out, <2 x float> %in)
 ; SI-GISEL:       ; %bb.0:
 ; SI-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x9
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v0, 0x800000
-; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x4f800000
-; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x42000000
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x42000000
 ; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s6, v0
 ; SI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], s7, v0
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v3, 1.0, v1, vcc
-; SI-GISEL-NEXT:    v_cndmask_b32_e64 v0, 1.0, v1, s[0:1]
-; SI-GISEL-NEXT:    v_mul_f32_e32 v3, s6, v3
-; SI-GISEL-NEXT:    v_mul_f32_e32 v0, s7, v0
-; SI-GISEL-NEXT:    v_log_f32_e32 v3, v3
-; SI-GISEL-NEXT:    v_log_f32_e32 v1, v0
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v2, vcc
-; SI-GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, v2, s[0:1]
-; SI-GISEL-NEXT:    v_sub_f32_e32 v0, v3, v0
-; SI-GISEL-NEXT:    v_sub_f32_e32 v1, v1, v2
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 5, v2
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 5, v0
+; SI-GISEL-NEXT:    v_ldexp_f32_e32 v2, s6, v2
+; SI-GISEL-NEXT:    v_ldexp_f32_e32 v0, s7, v0
+; SI-GISEL-NEXT:    v_log_f32_e32 v2, v2
+; SI-GISEL-NEXT:    v_log_f32_e32 v3, v0
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v1, vcc
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, v1, s[0:1]
+; SI-GISEL-NEXT:    v_sub_f32_e32 v0, v2, v0
+; SI-GISEL-NEXT:    v_sub_f32_e32 v1, v3, v1
 ; SI-GISEL-NEXT:    s_mov_b32 s6, -1
 ; SI-GISEL-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-GISEL-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
@@ -291,21 +293,22 @@ define amdgpu_kernel void @s_log2_v2f32(ptr addrspace(1) %out, <2 x float> %in)
 ; VI-GISEL:       ; %bb.0:
 ; VI-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x24
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v0, 0x800000
-; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x4f800000
-; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x42000000
+; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x42000000
 ; VI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s6, v0
 ; VI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], s7, v0
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v3, 1.0, v1, vcc
-; VI-GISEL-NEXT:    v_cndmask_b32_e64 v0, 1.0, v1, s[0:1]
-; VI-GISEL-NEXT:    v_mul_f32_e32 v3, s6, v3
-; VI-GISEL-NEXT:    v_mul_f32_e32 v0, s7, v0
-; VI-GISEL-NEXT:    v_log_f32_e32 v3, v3
-; VI-GISEL-NEXT:    v_log_f32_e32 v1, v0
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v2, vcc
-; VI-GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, v2, s[0:1]
-; VI-GISEL-NEXT:    v_sub_f32_e32 v0, v3, v0
-; VI-GISEL-NEXT:    v_sub_f32_e32 v1, v1, v2
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 5, v2
+; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 5, v0
+; VI-GISEL-NEXT:    v_ldexp_f32 v2, s6, v2
+; VI-GISEL-NEXT:    v_ldexp_f32 v0, s7, v0
+; VI-GISEL-NEXT:    v_log_f32_e32 v2, v2
+; VI-GISEL-NEXT:    v_log_f32_e32 v3, v0
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v1, vcc
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, v1, s[0:1]
+; VI-GISEL-NEXT:    v_sub_f32_e32 v0, v2, v0
+; VI-GISEL-NEXT:    v_sub_f32_e32 v1, v3, v1
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v2, s4
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v3, s5
 ; VI-GISEL-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
@@ -339,22 +342,23 @@ define amdgpu_kernel void @s_log2_v2f32(ptr addrspace(1) %out, <2 x float> %in)
 ; GFX900-GISEL:       ; %bb.0:
 ; GFX900-GISEL-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x24
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v0, 0x800000
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x4f800000
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0x42000000
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x42000000
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX900-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s10, v0
 ; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], s11, v0
-; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v3, 1.0, v1, vcc
-; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v0, 1.0, v1, s[0:1]
-; GFX900-GISEL-NEXT:    v_mul_f32_e32 v3, s10, v3
-; GFX900-GISEL-NEXT:    v_mul_f32_e32 v0, s11, v0
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; GFX900-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 5, v3
+; GFX900-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 5, v0
+; GFX900-GISEL-NEXT:    v_ldexp_f32 v3, s10, v3
+; GFX900-GISEL-NEXT:    v_ldexp_f32 v0, s11, v0
 ; GFX900-GISEL-NEXT:    v_log_f32_e32 v3, v3
-; GFX900-GISEL-NEXT:    v_log_f32_e32 v1, v0
-; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v2, vcc
-; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, v2, s[0:1]
+; GFX900-GISEL-NEXT:    v_log_f32_e32 v4, v0
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v1, vcc
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, v1, s[0:1]
 ; GFX900-GISEL-NEXT:    v_sub_f32_e32 v0, v3, v0
-; GFX900-GISEL-NEXT:    v_sub_f32_e32 v1, v1, v2
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX900-GISEL-NEXT:    v_sub_f32_e32 v1, v4, v1
 ; GFX900-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[8:9]
 ; GFX900-GISEL-NEXT:    s_endpgm
 ;
@@ -387,23 +391,28 @@ define amdgpu_kernel void @s_log2_v2f32(ptr addrspace(1) %out, <2 x float> %in)
 ; GFX1100-GISEL-LABEL: s_log2_v2f32:
 ; GFX1100-GISEL:       ; %bb.0:
 ; GFX1100-GISEL-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1100-GISEL-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1100-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 s4, 0x800000, s2
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 s5, 0x800000, s3
+; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 s4, 0x800000, s2
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s4
-; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s5
-; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, 0x42000000, s4
+; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s5
+; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s4
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v3, 0, 0x42000000, s5
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-GISEL-NEXT:    v_dual_mul_f32 v0, s2, v0 :: v_dual_mul_f32 v1, s3, v1
-; GFX1100-GISEL-NEXT:    v_log_f32_e32 v0, v0
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, 0x42000000, s4
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; GFX1100-GISEL-NEXT:    v_ldexp_f32 v1, s3, v1
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_log_f32_e32 v1, v1
 ; GFX1100-GISEL-NEXT:    s_waitcnt_depctr 0xfff
-; GFX1100-GISEL-NEXT:    v_dual_sub_f32 v0, v0, v2 :: v_dual_sub_f32 v1, v1, v3
-; GFX1100-GISEL-NEXT:    v_mov_b32_e32 v2, 0
-; GFX1100-GISEL-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
+; GFX1100-GISEL-NEXT:    v_dual_sub_f32 v1, v1, v3 :: v_dual_lshlrev_b32 v0, 5, v0
+; GFX1100-GISEL-NEXT:    v_ldexp_f32 v0, s2, v0
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1100-GISEL-NEXT:    v_log_f32_e32 v0, v0
+; GFX1100-GISEL-NEXT:    s_waitcnt_depctr 0xfff
+; GFX1100-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v2
+; GFX1100-GISEL-NEXT:    global_store_b64 v4, v[0:1], s[0:1]
 ; GFX1100-GISEL-NEXT:    s_endpgm
 ;
 ; R600-LABEL: s_log2_v2f32:
@@ -506,32 +515,34 @@ define amdgpu_kernel void @s_log2_v3f32(ptr addrspace(1) %out, <3 x float> %in)
 ;
 ; SI-GISEL-LABEL: s_log2_v3f32:
 ; SI-GISEL:       ; %bb.0:
-; SI-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0xd
-; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
-; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
-; SI-GISEL-NEXT:    v_mov_b32_e32 v3, 0x42000000
+; SI-GISEL-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0xd
 ; SI-GISEL-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x9
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
+; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x42000000
+; SI-GISEL-NEXT:    s_mov_b32 s6, -1
 ; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s0, v1
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, 1.0, v2, vcc
-; SI-GISEL-NEXT:    v_mul_f32_e32 v0, s0, v0
+; SI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s8, v1
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 5, v0
+; SI-GISEL-NEXT:    v_ldexp_f32_e32 v0, s8, v0
+; SI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], s9, v1
 ; SI-GISEL-NEXT:    v_log_f32_e32 v0, v0
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v3, vcc
-; SI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s1, v1
-; SI-GISEL-NEXT:    s_mov_b32 s6, -1
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s[0:1]
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v2, vcc
+; SI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s10, v1
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 5, v3
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; SI-GISEL-NEXT:    v_ldexp_f32_e32 v3, s9, v3
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; SI-GISEL-NEXT:    v_log_f32_e32 v3, v3
+; SI-GISEL-NEXT:    v_ldexp_f32_e32 v1, s10, v1
 ; SI-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v4
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 1.0, v2, vcc
-; SI-GISEL-NEXT:    v_mul_f32_e32 v4, s1, v4
-; SI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], s2, v1
-; SI-GISEL-NEXT:    v_cndmask_b32_e64 v1, 1.0, v2, s[0:1]
-; SI-GISEL-NEXT:    v_log_f32_e32 v4, v4
-; SI-GISEL-NEXT:    v_mul_f32_e32 v1, s2, v1
-; SI-GISEL-NEXT:    v_log_f32_e32 v2, v1
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v3, vcc
-; SI-GISEL-NEXT:    v_sub_f32_e32 v1, v4, v1
-; SI-GISEL-NEXT:    v_cndmask_b32_e64 v3, 0, v3, s[0:1]
+; SI-GISEL-NEXT:    v_log_f32_e32 v4, v1
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, v2, s[0:1]
+; SI-GISEL-NEXT:    v_sub_f32_e32 v1, v3, v1
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
 ; SI-GISEL-NEXT:    s_mov_b32 s7, 0xf000
-; SI-GISEL-NEXT:    v_sub_f32_e32 v2, v2, v3
+; SI-GISEL-NEXT:    v_sub_f32_e32 v2, v4, v2
 ; SI-GISEL-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; SI-GISEL-NEXT:    buffer_store_dword v2, off, s[4:7], 0 offset:8
 ; SI-GISEL-NEXT:    s_endpgm
@@ -571,32 +582,34 @@ define amdgpu_kernel void @s_log2_v3f32(ptr addrspace(1) %out, <3 x float> %in)
 ;
 ; VI-GISEL-LABEL: s_log2_v3f32:
 ; VI-GISEL:       ; %bb.0:
-; VI-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x34
+; VI-GISEL-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x34
+; VI-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
-; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
-; VI-GISEL-NEXT:    v_mov_b32_e32 v3, 0x42000000
-; VI-GISEL-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x24
+; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x42000000
 ; VI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s0, v1
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v0, 1.0, v2, vcc
-; VI-GISEL-NEXT:    v_mul_f32_e32 v0, s0, v0
+; VI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s8, v1
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 5, v0
+; VI-GISEL-NEXT:    v_ldexp_f32 v0, s8, v0
 ; VI-GISEL-NEXT:    v_log_f32_e32 v0, v0
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v3, vcc
-; VI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s1, v1
+; VI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], s9, v1
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v2, vcc
+; VI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s10, v1
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s[0:1]
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 5, v3
+; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; VI-GISEL-NEXT:    v_ldexp_f32 v3, s9, v3
+; VI-GISEL-NEXT:    v_ldexp_f32 v1, s10, v1
 ; VI-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v4
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 1.0, v2, vcc
-; VI-GISEL-NEXT:    v_mul_f32_e32 v4, s1, v4
-; VI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], s2, v1
-; VI-GISEL-NEXT:    v_cndmask_b32_e64 v1, 1.0, v2, s[0:1]
-; VI-GISEL-NEXT:    v_mul_f32_e32 v1, s2, v1
-; VI-GISEL-NEXT:    v_log_f32_e32 v4, v4
-; VI-GISEL-NEXT:    v_log_f32_e32 v2, v1
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v3, vcc
-; VI-GISEL-NEXT:    v_cndmask_b32_e64 v3, 0, v3, s[0:1]
-; VI-GISEL-NEXT:    v_sub_f32_e32 v1, v4, v1
-; VI-GISEL-NEXT:    v_sub_f32_e32 v2, v2, v3
-; VI-GISEL-NEXT:    v_mov_b32_e32 v3, s4
-; VI-GISEL-NEXT:    v_mov_b32_e32 v4, s5
+; VI-GISEL-NEXT:    v_log_f32_e32 v3, v3
+; VI-GISEL-NEXT:    v_log_f32_e32 v4, v1
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, v2, s[0:1]
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
+; VI-GISEL-NEXT:    v_sub_f32_e32 v1, v3, v1
+; VI-GISEL-NEXT:    v_sub_f32_e32 v2, v4, v2
+; VI-GISEL-NEXT:    v_mov_b32_e32 v4, s3
+; VI-GISEL-NEXT:    v_mov_b32_e32 v3, s2
 ; VI-GISEL-NEXT:    flat_store_dwordx3 v[3:4], v[0:2]
 ; VI-GISEL-NEXT:    s_endpgm
 ;
@@ -637,28 +650,30 @@ define amdgpu_kernel void @s_log2_v3f32(ptr addrspace(1) %out, <3 x float> %in)
 ; GFX900-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x34
 ; GFX900-GISEL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v3, 0x42000000
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0x42000000
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX900-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s0, v1
-; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v0, 1.0, v2, vcc
-; GFX900-GISEL-NEXT:    v_mul_f32_e32 v0, s0, v0
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX900-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 5, v0
+; GFX900-GISEL-NEXT:    v_ldexp_f32 v0, s0, v0
 ; GFX900-GISEL-NEXT:    v_log_f32_e32 v0, v0
-; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v3, vcc
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v2, vcc
 ; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s1, v1
 ; GFX900-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v4
-; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v4, 1.0, v2, vcc
-; GFX900-GISEL-NEXT:    v_mul_f32_e32 v4, s1, v4
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
+; GFX900-GISEL-NEXT:    v_lshlrev_b32_e32 v4, 5, v4
+; GFX900-GISEL-NEXT:    v_ldexp_f32 v4, s1, v4
 ; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], s2, v1
-; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v1, 1.0, v2, s[0:1]
-; GFX900-GISEL-NEXT:    v_mul_f32_e32 v1, s2, v1
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[0:1]
+; GFX900-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; GFX900-GISEL-NEXT:    v_ldexp_f32 v1, s2, v1
 ; GFX900-GISEL-NEXT:    v_log_f32_e32 v4, v4
-; GFX900-GISEL-NEXT:    v_log_f32_e32 v2, v1
-; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v3, vcc
-; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v3, 0, v3, s[0:1]
+; GFX900-GISEL-NEXT:    v_log_f32_e32 v5, v1
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, v2, s[0:1]
 ; GFX900-GISEL-NEXT:    v_sub_f32_e32 v1, v4, v1
-; GFX900-GISEL-NEXT:    v_sub_f32_e32 v2, v2, v3
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v3, 0
+; GFX900-GISEL-NEXT:    v_sub_f32_e32 v2, v5, v2
 ; GFX900-GISEL-NEXT:    global_store_dwordx3 v3, v[0:2], s[6:7]
 ; GFX900-GISEL-NEXT:    s_endpgm
 ;
@@ -702,33 +717,40 @@ define amdgpu_kernel void @s_log2_v3f32(ptr addrspace(1) %out, <3 x float> %in)
 ;
 ; GFX1100-GISEL-LABEL: s_log2_v3f32:
 ; GFX1100-GISEL:       ; %bb.0:
+; GFX1100-GISEL-NEXT:    s_clause 0x1
 ; GFX1100-GISEL-NEXT:    s_load_b128 s[0:3], s[4:5], 0x34
+; GFX1100-GISEL-NEXT:    s_load_b64 s[4:5], s[4:5], 0x24
+; GFX1100-GISEL-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX1100-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 s3, 0x800000, s0
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 s6, 0x800000, s1
+; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 s3, 0x800000, s0
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 s7, 0x800000, s2
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s3
-; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s6
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s7
+; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s6
+; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s3
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s7
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v4, 0, 0x42000000, s6
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v3, 0, 0x42000000, s3
-; GFX1100-GISEL-NEXT:    v_dual_mul_f32 v0, s0, v0 :: v_dual_mul_f32 v1, s1, v1
-; GFX1100-GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1100-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; GFX1100-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 5, v0
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, 0x42000000, s7
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1100-GISEL-NEXT:    v_log_f32_e32 v0, v0
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1100-GISEL-NEXT:    v_ldexp_f32 v1, s1, v1
+; GFX1100-GISEL-NEXT:    v_ldexp_f32 v0, s0, v0
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX1100-GISEL-NEXT:    v_log_f32_e32 v1, v1
-; GFX1100-GISEL-NEXT:    v_mul_f32_e32 v2, s2, v2
+; GFX1100-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 5, v2
+; GFX1100-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; GFX1100-GISEL-NEXT:    s_waitcnt_depctr 0xfff
-; GFX1100-GISEL-NEXT:    v_dual_sub_f32 v0, v0, v3 :: v_dual_mov_b32 v3, 0
 ; GFX1100-GISEL-NEXT:    v_sub_f32_e32 v1, v1, v4
+; GFX1100-GISEL-NEXT:    v_ldexp_f32 v2, s2, v2
+; GFX1100-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v3
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX1100-GISEL-NEXT:    v_log_f32_e32 v2, v2
 ; GFX1100-GISEL-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX1100-GISEL-NEXT:    v_sub_f32_e32 v2, v2, v5
-; GFX1100-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1100-GISEL-NEXT:    global_store_b96 v3, v[0:2], s[0:1]
+; GFX1100-GISEL-NEXT:    global_store_b96 v6, v[0:2], s[4:5]
 ; GFX1100-GISEL-NEXT:    s_endpgm
 ;
 ; R600-LABEL: s_log2_v3f32:
@@ -865,34 +887,37 @@ define amdgpu_kernel void @s_log2_v4f32(ptr addrspace(1) %out, <4 x float> %in)
 ; SI-GISEL-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0xd
 ; SI-GISEL-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x9
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x800000
-; SI-GISEL-NEXT:    v_mov_b32_e32 v3, 0x4f800000
-; SI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x42000000
+; SI-GISEL-NEXT:    v_mov_b32_e32 v3, 0x42000000
+; SI-GISEL-NEXT:    s_mov_b32 s6, -1
 ; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s8, v2
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, 1.0, v3, vcc
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
 ; SI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], s9, v2
-; SI-GISEL-NEXT:    v_mul_f32_e32 v0, s8, v0
-; SI-GISEL-NEXT:    v_cndmask_b32_e64 v1, 1.0, v3, s[0:1]
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 5, v0
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[0:1]
+; SI-GISEL-NEXT:    v_ldexp_f32_e32 v0, s8, v0
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
 ; SI-GISEL-NEXT:    v_log_f32_e32 v0, v0
-; SI-GISEL-NEXT:    v_mul_f32_e32 v1, s9, v1
+; SI-GISEL-NEXT:    v_ldexp_f32_e32 v1, s9, v1
 ; SI-GISEL-NEXT:    v_log_f32_e32 v1, v1
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v5, 0, v4, vcc
-; SI-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v5
-; SI-GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, v4, s[0:1]
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v3, vcc
+; SI-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v4
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v4, 0, v3, s[0:1]
 ; SI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s10, v2
 ; SI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], s11, v2
-; SI-GISEL-NEXT:    v_sub_f32_e32 v1, v1, v5
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v5, 1.0, v3, vcc
-; SI-GISEL-NEXT:    v_cndmask_b32_e64 v2, 1.0, v3, s[0:1]
-; SI-GISEL-NEXT:    v_mul_f32_e32 v5, s10, v5
-; SI-GISEL-NEXT:    v_mul_f32_e32 v2, s11, v2
-; SI-GISEL-NEXT:    v_log_f32_e32 v5, v5
-; SI-GISEL-NEXT:    v_log_f32_e32 v3, v2
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v4, vcc
-; SI-GISEL-NEXT:    v_cndmask_b32_e64 v4, 0, v4, s[0:1]
-; SI-GISEL-NEXT:    v_sub_f32_e32 v2, v5, v2
-; SI-GISEL-NEXT:    v_sub_f32_e32 v3, v3, v4
-; SI-GISEL-NEXT:    s_mov_b32 s6, -1
+; SI-GISEL-NEXT:    v_sub_f32_e32 v1, v1, v4
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[0:1]
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v4, 5, v4
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 5, v2
+; SI-GISEL-NEXT:    v_ldexp_f32_e32 v4, s10, v4
+; SI-GISEL-NEXT:    v_ldexp_f32_e32 v2, s11, v2
+; SI-GISEL-NEXT:    v_log_f32_e32 v4, v4
+; SI-GISEL-NEXT:    v_log_f32_e32 v5, v2
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v3, vcc
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v3, 0, v3, s[0:1]
+; SI-GISEL-NEXT:    v_sub_f32_e32 v2, v4, v2
+; SI-GISEL-NEXT:    v_sub_f32_e32 v3, v5, v3
 ; SI-GISEL-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-GISEL-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
 ; SI-GISEL-NEXT:    s_endpgm
@@ -942,33 +967,36 @@ define amdgpu_kernel void @s_log2_v4f32(ptr addrspace(1) %out, <4 x float> %in)
 ; VI-GISEL-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x34
 ; VI-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x800000
-; VI-GISEL-NEXT:    v_mov_b32_e32 v3, 0x4f800000
-; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x42000000
+; VI-GISEL-NEXT:    v_mov_b32_e32 v3, 0x42000000
 ; VI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s8, v2
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v0, 1.0, v3, vcc
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
 ; VI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], s9, v2
-; VI-GISEL-NEXT:    v_mul_f32_e32 v0, s8, v0
-; VI-GISEL-NEXT:    v_cndmask_b32_e64 v1, 1.0, v3, s[0:1]
+; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 5, v0
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[0:1]
+; VI-GISEL-NEXT:    v_ldexp_f32 v0, s8, v0
+; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
 ; VI-GISEL-NEXT:    v_log_f32_e32 v0, v0
-; VI-GISEL-NEXT:    v_mul_f32_e32 v1, s9, v1
+; VI-GISEL-NEXT:    v_ldexp_f32 v1, s9, v1
 ; VI-GISEL-NEXT:    v_log_f32_e32 v1, v1
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v5, 0, v4, vcc
-; VI-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v5
-; VI-GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, v4, s[0:1]
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v3, vcc
+; VI-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v4
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v4, 0, v3, s[0:1]
 ; VI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s10, v2
 ; VI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], s11, v2
-; VI-GISEL-NEXT:    v_sub_f32_e32 v1, v1, v5
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v5, 1.0, v3, vcc
-; VI-GISEL-NEXT:    v_cndmask_b32_e64 v2, 1.0, v3, s[0:1]
-; VI-GISEL-NEXT:    v_mul_f32_e32 v5, s10, v5
-; VI-GISEL-NEXT:    v_mul_f32_e32 v2, s11, v2
-; VI-GISEL-NEXT:    v_log_f32_e32 v5, v5
-; VI-GISEL-NEXT:    v_log_f32_e32 v3, v2
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v4, vcc
-; VI-GISEL-NEXT:    v_cndmask_b32_e64 v4, 0, v4, s[0:1]
-; VI-GISEL-NEXT:    v_sub_f32_e32 v2, v5, v2
-; VI-GISEL-NEXT:    v_sub_f32_e32 v3, v3, v4
+; VI-GISEL-NEXT:    v_sub_f32_e32 v1, v1, v4
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[0:1]
+; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v4, 5, v4
+; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 5, v2
+; VI-GISEL-NEXT:    v_ldexp_f32 v4, s10, v4
+; VI-GISEL-NEXT:    v_ldexp_f32 v2, s11, v2
+; VI-GISEL-NEXT:    v_log_f32_e32 v4, v4
+; VI-GISEL-NEXT:    v_log_f32_e32 v5, v2
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v3, vcc
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v3, 0, v3, s[0:1]
+; VI-GISEL-NEXT:    v_sub_f32_e32 v2, v4, v2
+; VI-GISEL-NEXT:    v_sub_f32_e32 v3, v5, v3
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v5, s3
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v4, s2
 ; VI-GISEL-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
@@ -1018,34 +1046,37 @@ define amdgpu_kernel void @s_log2_v4f32(ptr addrspace(1) %out, <4 x float> %in)
 ; GFX900-GISEL-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x34
 ; GFX900-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0x800000
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v3, 0x4f800000
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v4, 0x42000000
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v3, 0x42000000
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX900-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s8, v2
-; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v0, 1.0, v3, vcc
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
 ; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], s9, v2
-; GFX900-GISEL-NEXT:    v_mul_f32_e32 v0, s8, v0
-; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v1, 1.0, v3, s[0:1]
+; GFX900-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 5, v0
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[0:1]
+; GFX900-GISEL-NEXT:    v_ldexp_f32 v0, s8, v0
+; GFX900-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
 ; GFX900-GISEL-NEXT:    v_log_f32_e32 v0, v0
-; GFX900-GISEL-NEXT:    v_mul_f32_e32 v1, s9, v1
+; GFX900-GISEL-NEXT:    v_ldexp_f32 v1, s9, v1
 ; GFX900-GISEL-NEXT:    v_log_f32_e32 v1, v1
-; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v5, 0, v4, vcc
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v5, 0, v3, vcc
 ; GFX900-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v5
-; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, v4, s[0:1]
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, v3, s[0:1]
 ; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s10, v2
 ; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], s11, v2
 ; GFX900-GISEL-NEXT:    v_sub_f32_e32 v1, v1, v5
-; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v5, 1.0, v3, vcc
-; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v2, 1.0, v3, s[0:1]
-; GFX900-GISEL-NEXT:    v_mul_f32_e32 v5, s10, v5
-; GFX900-GISEL-NEXT:    v_mul_f32_e32 v2, s11, v2
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[0:1]
+; GFX900-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 5, v5
+; GFX900-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 5, v2
+; GFX900-GISEL-NEXT:    v_ldexp_f32 v5, s10, v5
+; GFX900-GISEL-NEXT:    v_ldexp_f32 v2, s11, v2
 ; GFX900-GISEL-NEXT:    v_log_f32_e32 v5, v5
-; GFX900-GISEL-NEXT:    v_log_f32_e32 v3, v2
-; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v4, vcc
-; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v4, 0, v4, s[0:1]
+; GFX900-GISEL-NEXT:    v_log_f32_e32 v6, v2
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v3, vcc
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v3, 0, v3, s[0:1]
 ; GFX900-GISEL-NEXT:    v_sub_f32_e32 v2, v5, v2
-; GFX900-GISEL-NEXT:    v_sub_f32_e32 v3, v3, v4
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v4, 0
+; GFX900-GISEL-NEXT:    v_sub_f32_e32 v3, v6, v3
 ; GFX900-GISEL-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3]
 ; GFX900-GISEL-NEXT:    s_endpgm
 ;
@@ -1095,39 +1126,46 @@ define amdgpu_kernel void @s_log2_v4f32(ptr addrspace(1) %out, <4 x float> %in)
 ;
 ; GFX1100-GISEL-LABEL: s_log2_v4f32:
 ; GFX1100-GISEL:       ; %bb.0:
+; GFX1100-GISEL-NEXT:    s_clause 0x1
 ; GFX1100-GISEL-NEXT:    s_load_b128 s[0:3], s[4:5], 0x34
+; GFX1100-GISEL-NEXT:    s_load_b64 s[4:5], s[4:5], 0x24
+; GFX1100-GISEL-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX1100-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 s6, 0x800000, s0
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 s7, 0x800000, s1
-; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 s8, 0x800000, s2
+; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 s6, 0x800000, s0
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 s9, 0x800000, s3
+; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 s8, 0x800000, s2
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s6
-; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s7
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s8
-; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v3, 1.0, 0x4f800000, s9
+; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s7
+; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s6
+; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, 0x42000000, s7
+; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s9
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v4, 0, 0x42000000, s6
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX1100-GISEL-NEXT:    v_dual_mul_f32 v0, s0, v0 :: v_dual_mul_f32 v1, s1, v1
-; GFX1100-GISEL-NEXT:    v_dual_mul_f32 v2, s2, v2 :: v_dual_mul_f32 v3, s3, v3
-; GFX1100-GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1100-GISEL-NEXT:    v_log_f32_e32 v0, v0
+; GFX1100-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s8
+; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, 0x42000000, s9
+; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, 0x42000000, s8
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-GISEL-NEXT:    v_ldexp_f32 v1, s1, v1
 ; GFX1100-GISEL-NEXT:    v_log_f32_e32 v1, v1
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1100-GISEL-NEXT:    v_log_f32_e32 v2, v2
+; GFX1100-GISEL-NEXT:    s_waitcnt_depctr 0xfff
+; GFX1100-GISEL-NEXT:    v_dual_sub_f32 v1, v1, v5 :: v_dual_lshlrev_b32 v0, 5, v0
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-GISEL-NEXT:    v_ldexp_f32 v0, s0, v0
+; GFX1100-GISEL-NEXT:    v_log_f32_e32 v0, v0
+; GFX1100-GISEL-NEXT:    s_waitcnt_depctr 0xfff
+; GFX1100-GISEL-NEXT:    v_dual_sub_f32 v0, v0, v4 :: v_dual_lshlrev_b32 v3, 5, v3
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-GISEL-NEXT:    v_ldexp_f32 v3, s3, v3
 ; GFX1100-GISEL-NEXT:    v_log_f32_e32 v3, v3
-; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, 0x42000000, s7
-; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, 0x42000000, s8
-; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, 0x42000000, s9
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(TRANS32_DEP_3) | instid1(VALU_DEP_3)
-; GFX1100-GISEL-NEXT:    v_dual_sub_f32 v0, v0, v4 :: v_dual_sub_f32 v1, v1, v5
 ; GFX1100-GISEL-NEXT:    s_waitcnt_depctr 0xfff
-; GFX1100-GISEL-NEXT:    v_dual_sub_f32 v2, v2, v6 :: v_dual_sub_f32 v3, v3, v7
-; GFX1100-GISEL-NEXT:    v_mov_b32_e32 v4, 0
-; GFX1100-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1100-GISEL-NEXT:    global_store_b128 v4, v[0:3], s[0:1]
+; GFX1100-GISEL-NEXT:    v_dual_sub_f32 v3, v3, v7 :: v_dual_lshlrev_b32 v2, 5, v2
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-GISEL-NEXT:    v_ldexp_f32 v2, s2, v2
+; GFX1100-GISEL-NEXT:    v_log_f32_e32 v2, v2
+; GFX1100-GISEL-NEXT:    s_waitcnt_depctr 0xfff
+; GFX1100-GISEL-NEXT:    v_sub_f32_e32 v2, v2, v6
+; GFX1100-GISEL-NEXT:    global_store_b128 v8, v[0:3], s[4:5]
 ; GFX1100-GISEL-NEXT:    s_endpgm
 ;
 ; R600-LABEL: s_log2_v4f32:
@@ -1243,19 +1281,19 @@ define float @v_log2_f32(float %in) {
 ; SI-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX689-GISEL-LABEL: v_log2_f32:
-; GFX689-GISEL:       ; %bb.0:
-; GFX689-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX689-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
-; GFX689-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
-; GFX689-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
-; GFX689-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; GFX689-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX689-GISEL-NEXT:    v_log_f32_e32 v0, v0
-; GFX689-GISEL-NEXT:    v_mov_b32_e32 v1, 0x42000000
-; GFX689-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
-; GFX689-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
-; GFX689-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; SI-GISEL-LABEL: v_log2_f32:
+; SI-GISEL:       ; %bb.0:
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
+; SI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; SI-GISEL-NEXT:    v_ldexp_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT:    v_log_f32_e32 v0, v0
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x42000000
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; SI-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SDAG-LABEL: v_log2_f32:
 ; VI-SDAG:       ; %bb.0:
@@ -1271,6 +1309,20 @@ define float @v_log2_f32(float %in) {
 ; VI-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; VI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
+; VI-GISEL-LABEL: v_log2_f32:
+; VI-GISEL:       ; %bb.0:
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
+; VI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; VI-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
+; VI-GISEL-NEXT:    v_log_f32_e32 v0, v0
+; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x42000000
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; VI-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX900-SDAG-LABEL: v_log2_f32:
 ; GFX900-SDAG:       ; %bb.0:
 ; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1285,6 +1337,20 @@ define float @v_log2_f32(float %in) {
 ; GFX900-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX900-GISEL-LABEL: v_log2_f32:
+; GFX900-GISEL:       ; %bb.0:
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
+; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX900-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; GFX900-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
+; GFX900-GISEL-NEXT:    v_log_f32_e32 v0, v0
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x42000000
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX900-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX1100-SDAG-LABEL: v_log2_f32:
 ; GFX1100-SDAG:       ; %bb.0:
 ; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1304,10 +1370,12 @@ define float @v_log2_f32(float %in) {
 ; GFX1100-GISEL:       ; %bb.0:
 ; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
-; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1100-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; GFX1100-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX1100-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; GFX1100-GISEL-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX1100-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
@@ -1341,19 +1409,19 @@ define float @v_log2_fabs_f32(float %in) {
 ; SI-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX689-GISEL-LABEL: v_log2_fabs_f32:
-; GFX689-GISEL:       ; %bb.0:
-; GFX689-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX689-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
-; GFX689-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
-; GFX689-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, v1
-; GFX689-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; GFX689-GISEL-NEXT:    v_mul_f32_e64 v0, |v0|, v1
-; GFX689-GISEL-NEXT:    v_log_f32_e32 v0, v0
-; GFX689-GISEL-NEXT:    v_mov_b32_e32 v1, 0x42000000
-; GFX689-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
-; GFX689-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
-; GFX689-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; SI-GISEL-LABEL: v_log2_fabs_f32:
+; SI-GISEL:       ; %bb.0:
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
+; SI-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, v1
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; SI-GISEL-NEXT:    v_ldexp_f32_e64 v0, |v0|, v1
+; SI-GISEL-NEXT:    v_log_f32_e32 v0, v0
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x42000000
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; SI-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SDAG-LABEL: v_log2_fabs_f32:
 ; VI-SDAG:       ; %bb.0:
@@ -1369,6 +1437,20 @@ define float @v_log2_fabs_f32(float %in) {
 ; VI-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; VI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
+; VI-GISEL-LABEL: v_log2_fabs_f32:
+; VI-GISEL:       ; %bb.0:
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
+; VI-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, v1
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; VI-GISEL-NEXT:    v_ldexp_f32 v0, |v0|, v1
+; VI-GISEL-NEXT:    v_log_f32_e32 v0, v0
+; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x42000000
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; VI-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX900-SDAG-LABEL: v_log2_fabs_f32:
 ; GFX900-SDAG:       ; %bb.0:
 ; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1383,6 +1465,20 @@ define float @v_log2_fabs_f32(float %in) {
 ; GFX900-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX900-GISEL-LABEL: v_log2_fabs_f32:
+; GFX900-GISEL:       ; %bb.0:
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
+; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, v1
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX900-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; GFX900-GISEL-NEXT:    v_ldexp_f32 v0, |v0|, v1
+; GFX900-GISEL-NEXT:    v_log_f32_e32 v0, v0
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x42000000
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX900-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX1100-SDAG-LABEL: v_log2_fabs_f32:
 ; GFX1100-SDAG:       ; %bb.0:
 ; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1403,10 +1499,11 @@ define float @v_log2_fabs_f32(float %in) {
 ; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 s0, 0x800000, |v0|
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s0
-; GFX1100-GISEL-NEXT:    v_mul_f32_e64 v0, |v0|, v1
+; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s0
+; GFX1100-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1100-GISEL-NEXT:    v_ldexp_f32 v0, |v0|, v1
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 0x42000000, s0
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX1100-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; GFX1100-GISEL-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX1100-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
@@ -1441,19 +1538,19 @@ define float @v_log2_fneg_fabs_f32(float %in) {
 ; SI-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX689-GISEL-LABEL: v_log2_fneg_fabs_f32:
-; GFX689-GISEL:       ; %bb.0:
-; GFX689-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX689-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
-; GFX689-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
-; GFX689-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, -|v0|, v1
-; GFX689-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; GFX689-GISEL-NEXT:    v_mul_f32_e64 v0, -|v0|, v1
-; GFX689-GISEL-NEXT:    v_log_f32_e32 v0, v0
-; GFX689-GISEL-NEXT:    v_mov_b32_e32 v1, 0x42000000
-; GFX689-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
-; GFX689-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
-; GFX689-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; SI-GISEL-LABEL: v_log2_fneg_fabs_f32:
+; SI-GISEL:       ; %bb.0:
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
+; SI-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, -|v0|, v1
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; SI-GISEL-NEXT:    v_ldexp_f32_e64 v0, -|v0|, v1
+; SI-GISEL-NEXT:    v_log_f32_e32 v0, v0
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x42000000
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; SI-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SDAG-LABEL: v_log2_fneg_fabs_f32:
 ; VI-SDAG:       ; %bb.0:
@@ -1469,6 +1566,20 @@ define float @v_log2_fneg_fabs_f32(float %in) {
 ; VI-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; VI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
+; VI-GISEL-LABEL: v_log2_fneg_fabs_f32:
+; VI-GISEL:       ; %bb.0:
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
+; VI-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, -|v0|, v1
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; VI-GISEL-NEXT:    v_ldexp_f32 v0, -|v0|, v1
+; VI-GISEL-NEXT:    v_log_f32_e32 v0, v0
+; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x42000000
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; VI-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX900-SDAG-LABEL: v_log2_fneg_fabs_f32:
 ; GFX900-SDAG:       ; %bb.0:
 ; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1483,6 +1594,20 @@ define float @v_log2_fneg_fabs_f32(float %in) {
 ; GFX900-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX900-GISEL-LABEL: v_log2_fneg_fabs_f32:
+; GFX900-GISEL:       ; %bb.0:
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
+; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, -|v0|, v1
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX900-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; GFX900-GISEL-NEXT:    v_ldexp_f32 v0, -|v0|, v1
+; GFX900-GISEL-NEXT:    v_log_f32_e32 v0, v0
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x42000000
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX900-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX1100-SDAG-LABEL: v_log2_fneg_fabs_f32:
 ; GFX1100-SDAG:       ; %bb.0:
 ; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1503,10 +1628,11 @@ define float @v_log2_fneg_fabs_f32(float %in) {
 ; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 s0, 0x800000, -|v0|
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s0
-; GFX1100-GISEL-NEXT:    v_mul_f32_e64 v0, -|v0|, v1
+; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s0
+; GFX1100-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1100-GISEL-NEXT:    v_ldexp_f32 v0, -|v0|, v1
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 0x42000000, s0
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX1100-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; GFX1100-GISEL-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX1100-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
@@ -1542,19 +1668,19 @@ define float @v_log2_fneg_f32(float %in) {
 ; SI-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX689-GISEL-LABEL: v_log2_fneg_f32:
-; GFX689-GISEL:       ; %bb.0:
-; GFX689-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX689-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
-; GFX689-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
-; GFX689-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, -v0, v1
-; GFX689-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; GFX689-GISEL-NEXT:    v_mul_f32_e64 v0, -v0, v1
-; GFX689-GISEL-NEXT:    v_log_f32_e32 v0, v0
-; GFX689-GISEL-NEXT:    v_mov_b32_e32 v1, 0x42000000
-; GFX689-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
-; GFX689-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
-; GFX689-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; SI-GISEL-LABEL: v_log2_fneg_f32:
+; SI-GISEL:       ; %bb.0:
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
+; SI-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, -v0, v1
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; SI-GISEL-NEXT:    v_ldexp_f32_e64 v0, -v0, v1
+; SI-GISEL-NEXT:    v_log_f32_e32 v0, v0
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x42000000
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; SI-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SDAG-LABEL: v_log2_fneg_f32:
 ; VI-SDAG:       ; %bb.0:
@@ -1570,6 +1696,20 @@ define float @v_log2_fneg_f32(float %in) {
 ; VI-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; VI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
+; VI-GISEL-LABEL: v_log2_fneg_f32:
+; VI-GISEL:       ; %bb.0:
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
+; VI-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, -v0, v1
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; VI-GISEL-NEXT:    v_ldexp_f32 v0, -v0, v1
+; VI-GISEL-NEXT:    v_log_f32_e32 v0, v0
+; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x42000000
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; VI-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX900-SDAG-LABEL: v_log2_fneg_f32:
 ; GFX900-SDAG:       ; %bb.0:
 ; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1584,6 +1724,20 @@ define float @v_log2_fneg_f32(float %in) {
 ; GFX900-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX900-GISEL-LABEL: v_log2_fneg_f32:
+; GFX900-GISEL:       ; %bb.0:
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
+; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, -v0, v1
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX900-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; GFX900-GISEL-NEXT:    v_ldexp_f32 v0, -v0, v1
+; GFX900-GISEL-NEXT:    v_log_f32_e32 v0, v0
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x42000000
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX900-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX1100-SDAG-LABEL: v_log2_fneg_f32:
 ; GFX1100-SDAG:       ; %bb.0:
 ; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1604,10 +1758,11 @@ define float @v_log2_fneg_f32(float %in) {
 ; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 s0, 0x800000, -v0
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s0
-; GFX1100-GISEL-NEXT:    v_mul_f32_e64 v0, -v0, v1
+; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s0
+; GFX1100-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1100-GISEL-NEXT:    v_ldexp_f32 v0, -v0, v1
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 0x42000000, s0
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX1100-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; GFX1100-GISEL-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX1100-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
@@ -1642,19 +1797,19 @@ define float @v_log2_f32_fast(float %in) {
 ; SI-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX689-GISEL-LABEL: v_log2_f32_fast:
-; GFX689-GISEL:       ; %bb.0:
-; GFX689-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX689-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
-; GFX689-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
-; GFX689-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
-; GFX689-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; GFX689-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX689-GISEL-NEXT:    v_log_f32_e32 v0, v0
-; GFX689-GISEL-NEXT:    v_mov_b32_e32 v1, 0x42000000
-; GFX689-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
-; GFX689-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
-; GFX689-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; SI-GISEL-LABEL: v_log2_f32_fast:
+; SI-GISEL:       ; %bb.0:
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
+; SI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; SI-GISEL-NEXT:    v_ldexp_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT:    v_log_f32_e32 v0, v0
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x42000000
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; SI-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SDAG-LABEL: v_log2_f32_fast:
 ; VI-SDAG:       ; %bb.0:
@@ -1670,6 +1825,20 @@ define float @v_log2_f32_fast(float %in) {
 ; VI-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; VI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
+; VI-GISEL-LABEL: v_log2_f32_fast:
+; VI-GISEL:       ; %bb.0:
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
+; VI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; VI-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
+; VI-GISEL-NEXT:    v_log_f32_e32 v0, v0
+; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x42000000
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; VI-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX900-SDAG-LABEL: v_log2_f32_fast:
 ; GFX900-SDAG:       ; %bb.0:
 ; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1684,6 +1853,20 @@ define float @v_log2_f32_fast(float %in) {
 ; GFX900-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX900-GISEL-LABEL: v_log2_f32_fast:
+; GFX900-GISEL:       ; %bb.0:
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
+; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX900-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; GFX900-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
+; GFX900-GISEL-NEXT:    v_log_f32_e32 v0, v0
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x42000000
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX900-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX1100-SDAG-LABEL: v_log2_f32_fast:
 ; GFX1100-SDAG:       ; %bb.0:
 ; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1703,10 +1886,12 @@ define float @v_log2_f32_fast(float %in) {
 ; GFX1100-GISEL:       ; %bb.0:
 ; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
-; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1100-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; GFX1100-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX1100-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; GFX1100-GISEL-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX1100-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
@@ -1740,19 +1925,19 @@ define float @v_log2_f32_unsafe_math_attr(float %in) "unsafe-fp-math"="true" {
 ; SI-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX689-GISEL-LABEL: v_log2_f32_unsafe_math_attr:
-; GFX689-GISEL:       ; %bb.0:
-; GFX689-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX689-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
-; GFX689-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
-; GFX689-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
-; GFX689-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; GFX689-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX689-GISEL-NEXT:    v_log_f32_e32 v0, v0
-; GFX689-GISEL-NEXT:    v_mov_b32_e32 v1, 0x42000000
-; GFX689-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
-; GFX689-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
-; GFX689-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; SI-GISEL-LABEL: v_log2_f32_unsafe_math_attr:
+; SI-GISEL:       ; %bb.0:
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
+; SI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; SI-GISEL-NEXT:    v_ldexp_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT:    v_log_f32_e32 v0, v0
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x42000000
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; SI-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SDAG-LABEL: v_log2_f32_unsafe_math_attr:
 ; VI-SDAG:       ; %bb.0:
@@ -1768,6 +1953,20 @@ define float @v_log2_f32_unsafe_math_attr(float %in) "unsafe-fp-math"="true" {
 ; VI-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; VI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
+; VI-GISEL-LABEL: v_log2_f32_unsafe_math_attr:
+; VI-GISEL:       ; %bb.0:
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
+; VI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; VI-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
+; VI-GISEL-NEXT:    v_log_f32_e32 v0, v0
+; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x42000000
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; VI-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX900-SDAG-LABEL: v_log2_f32_unsafe_math_attr:
 ; GFX900-SDAG:       ; %bb.0:
 ; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1782,6 +1981,20 @@ define float @v_log2_f32_unsafe_math_attr(float %in) "unsafe-fp-math"="true" {
 ; GFX900-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX900-GISEL-LABEL: v_log2_f32_unsafe_math_attr:
+; GFX900-GISEL:       ; %bb.0:
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
+; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX900-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; GFX900-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
+; GFX900-GISEL-NEXT:    v_log_f32_e32 v0, v0
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x42000000
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX900-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX1100-SDAG-LABEL: v_log2_f32_unsafe_math_attr:
 ; GFX1100-SDAG:       ; %bb.0:
 ; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1801,10 +2014,12 @@ define float @v_log2_f32_unsafe_math_attr(float %in) "unsafe-fp-math"="true" {
 ; GFX1100-GISEL:       ; %bb.0:
 ; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
-; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1100-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; GFX1100-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX1100-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; GFX1100-GISEL-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX1100-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
@@ -1838,19 +2053,19 @@ define float @v_log2_f32_approx_fn_attr(float %in) "approx-func-fp-math"="true"
 ; SI-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX689-GISEL-LABEL: v_log2_f32_approx_fn_attr:
-; GFX689-GISEL:       ; %bb.0:
-; GFX689-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX689-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
-; GFX689-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
-; GFX689-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
-; GFX689-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; GFX689-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX689-GISEL-NEXT:    v_log_f32_e32 v0, v0
-; GFX689-GISEL-NEXT:    v_mov_b32_e32 v1, 0x42000000
-; GFX689-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
-; GFX689-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
-; GFX689-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; SI-GISEL-LABEL: v_log2_f32_approx_fn_attr:
+; SI-GISEL:       ; %bb.0:
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
+; SI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; SI-GISEL-NEXT:    v_ldexp_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT:    v_log_f32_e32 v0, v0
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x42000000
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; SI-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SDAG-LABEL: v_log2_f32_approx_fn_attr:
 ; VI-SDAG:       ; %bb.0:
@@ -1866,6 +2081,20 @@ define float @v_log2_f32_approx_fn_attr(float %in) "approx-func-fp-math"="true"
 ; VI-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; VI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
+; VI-GISEL-LABEL: v_log2_f32_approx_fn_attr:
+; VI-GISEL:       ; %bb.0:
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
+; VI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; VI-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
+; VI-GISEL-NEXT:    v_log_f32_e32 v0, v0
+; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x42000000
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; VI-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX900-SDAG-LABEL: v_log2_f32_approx_fn_attr:
 ; GFX900-SDAG:       ; %bb.0:
 ; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1880,6 +2109,20 @@ define float @v_log2_f32_approx_fn_attr(float %in) "approx-func-fp-math"="true"
 ; GFX900-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX900-GISEL-LABEL: v_log2_f32_approx_fn_attr:
+; GFX900-GISEL:       ; %bb.0:
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
+; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX900-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; GFX900-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
+; GFX900-GISEL-NEXT:    v_log_f32_e32 v0, v0
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x42000000
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX900-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX1100-SDAG-LABEL: v_log2_f32_approx_fn_attr:
 ; GFX1100-SDAG:       ; %bb.0:
 ; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1899,10 +2142,12 @@ define float @v_log2_f32_approx_fn_attr(float %in) "approx-func-fp-math"="true"
 ; GFX1100-GISEL:       ; %bb.0:
 ; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
-; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1100-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; GFX1100-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX1100-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; GFX1100-GISEL-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX1100-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
@@ -1936,19 +2181,19 @@ define float @v_log2_f32_ninf(float %in) {
 ; SI-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX689-GISEL-LABEL: v_log2_f32_ninf:
-; GFX689-GISEL:       ; %bb.0:
-; GFX689-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX689-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
-; GFX689-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
-; GFX689-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
-; GFX689-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; GFX689-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX689-GISEL-NEXT:    v_log_f32_e32 v0, v0
-; GFX689-GISEL-NEXT:    v_mov_b32_e32 v1, 0x42000000
-; GFX689-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
-; GFX689-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
-; GFX689-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; SI-GISEL-LABEL: v_log2_f32_ninf:
+; SI-GISEL:       ; %bb.0:
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
+; SI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; SI-GISEL-NEXT:    v_ldexp_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT:    v_log_f32_e32 v0, v0
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x42000000
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; SI-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SDAG-LABEL: v_log2_f32_ninf:
 ; VI-SDAG:       ; %bb.0:
@@ -1964,6 +2209,20 @@ define float @v_log2_f32_ninf(float %in) {
 ; VI-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; VI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
+; VI-GISEL-LABEL: v_log2_f32_ninf:
+; VI-GISEL:       ; %bb.0:
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
+; VI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; VI-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
+; VI-GISEL-NEXT:    v_log_f32_e32 v0, v0
+; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x42000000
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; VI-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX900-SDAG-LABEL: v_log2_f32_ninf:
 ; GFX900-SDAG:       ; %bb.0:
 ; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1978,6 +2237,20 @@ define float @v_log2_f32_ninf(float %in) {
 ; GFX900-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX900-GISEL-LABEL: v_log2_f32_ninf:
+; GFX900-GISEL:       ; %bb.0:
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
+; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX900-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; GFX900-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
+; GFX900-GISEL-NEXT:    v_log_f32_e32 v0, v0
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x42000000
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX900-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX1100-SDAG-LABEL: v_log2_f32_ninf:
 ; GFX1100-SDAG:       ; %bb.0:
 ; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1997,10 +2270,12 @@ define float @v_log2_f32_ninf(float %in) {
 ; GFX1100-GISEL:       ; %bb.0:
 ; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
-; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1100-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; GFX1100-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX1100-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; GFX1100-GISEL-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX1100-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
@@ -2034,19 +2309,19 @@ define float @v_log2_f32_afn(float %in) {
 ; SI-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX689-GISEL-LABEL: v_log2_f32_afn:
-; GFX689-GISEL:       ; %bb.0:
-; GFX689-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX689-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
-; GFX689-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
-; GFX689-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
-; GFX689-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; GFX689-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX689-GISEL-NEXT:    v_log_f32_e32 v0, v0
-; GFX689-GISEL-NEXT:    v_mov_b32_e32 v1, 0x42000000
-; GFX689-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
-; GFX689-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
-; GFX689-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; SI-GISEL-LABEL: v_log2_f32_afn:
+; SI-GISEL:       ; %bb.0:
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
+; SI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; SI-GISEL-NEXT:    v_ldexp_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT:    v_log_f32_e32 v0, v0
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x42000000
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; SI-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SDAG-LABEL: v_log2_f32_afn:
 ; VI-SDAG:       ; %bb.0:
@@ -2062,6 +2337,20 @@ define float @v_log2_f32_afn(float %in) {
 ; VI-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; VI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
+; VI-GISEL-LABEL: v_log2_f32_afn:
+; VI-GISEL:       ; %bb.0:
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
+; VI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; VI-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
+; VI-GISEL-NEXT:    v_log_f32_e32 v0, v0
+; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x42000000
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; VI-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX900-SDAG-LABEL: v_log2_f32_afn:
 ; GFX900-SDAG:       ; %bb.0:
 ; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2076,6 +2365,20 @@ define float @v_log2_f32_afn(float %in) {
 ; GFX900-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX900-GISEL-LABEL: v_log2_f32_afn:
+; GFX900-GISEL:       ; %bb.0:
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
+; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX900-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; GFX900-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
+; GFX900-GISEL-NEXT:    v_log_f32_e32 v0, v0
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x42000000
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX900-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX1100-SDAG-LABEL: v_log2_f32_afn:
 ; GFX1100-SDAG:       ; %bb.0:
 ; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2095,10 +2398,12 @@ define float @v_log2_f32_afn(float %in) {
 ; GFX1100-GISEL:       ; %bb.0:
 ; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
-; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1100-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; GFX1100-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX1100-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; GFX1100-GISEL-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX1100-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
@@ -2158,19 +2463,19 @@ define float @v_log2_f32_afn_dynamic(float %in) #1 {
 ; SI-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX689-GISEL-LABEL: v_log2_f32_afn_dynamic:
-; GFX689-GISEL:       ; %bb.0:
-; GFX689-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX689-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
-; GFX689-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
-; GFX689-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
-; GFX689-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; GFX689-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX689-GISEL-NEXT:    v_log_f32_e32 v0, v0
-; GFX689-GISEL-NEXT:    v_mov_b32_e32 v1, 0x42000000
-; GFX689-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
-; GFX689-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
-; GFX689-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; SI-GISEL-LABEL: v_log2_f32_afn_dynamic:
+; SI-GISEL:       ; %bb.0:
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
+; SI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; SI-GISEL-NEXT:    v_ldexp_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT:    v_log_f32_e32 v0, v0
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x42000000
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; SI-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SDAG-LABEL: v_log2_f32_afn_dynamic:
 ; VI-SDAG:       ; %bb.0:
@@ -2186,6 +2491,20 @@ define float @v_log2_f32_afn_dynamic(float %in) #1 {
 ; VI-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; VI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
+; VI-GISEL-LABEL: v_log2_f32_afn_dynamic:
+; VI-GISEL:       ; %bb.0:
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
+; VI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; VI-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
+; VI-GISEL-NEXT:    v_log_f32_e32 v0, v0
+; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x42000000
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; VI-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX900-SDAG-LABEL: v_log2_f32_afn_dynamic:
 ; GFX900-SDAG:       ; %bb.0:
 ; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2200,6 +2519,20 @@ define float @v_log2_f32_afn_dynamic(float %in) #1 {
 ; GFX900-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX900-GISEL-LABEL: v_log2_f32_afn_dynamic:
+; GFX900-GISEL:       ; %bb.0:
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
+; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX900-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; GFX900-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
+; GFX900-GISEL-NEXT:    v_log_f32_e32 v0, v0
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x42000000
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX900-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX1100-SDAG-LABEL: v_log2_f32_afn_dynamic:
 ; GFX1100-SDAG:       ; %bb.0:
 ; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2219,10 +2552,12 @@ define float @v_log2_f32_afn_dynamic(float %in) #1 {
 ; GFX1100-GISEL:       ; %bb.0:
 ; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
-; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1100-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; GFX1100-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX1100-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; GFX1100-GISEL-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX1100-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
@@ -2256,19 +2591,19 @@ define float @v_fabs_log2_f32_afn(float %in) {
 ; SI-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX689-GISEL-LABEL: v_fabs_log2_f32_afn:
-; GFX689-GISEL:       ; %bb.0:
-; GFX689-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX689-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
-; GFX689-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
-; GFX689-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, v1
-; GFX689-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; GFX689-GISEL-NEXT:    v_mul_f32_e64 v0, |v0|, v1
-; GFX689-GISEL-NEXT:    v_log_f32_e32 v0, v0
-; GFX689-GISEL-NEXT:    v_mov_b32_e32 v1, 0x42000000
-; GFX689-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
-; GFX689-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
-; GFX689-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; SI-GISEL-LABEL: v_fabs_log2_f32_afn:
+; SI-GISEL:       ; %bb.0:
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
+; SI-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, v1
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; SI-GISEL-NEXT:    v_ldexp_f32_e64 v0, |v0|, v1
+; SI-GISEL-NEXT:    v_log_f32_e32 v0, v0
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x42000000
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; SI-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SDAG-LABEL: v_fabs_log2_f32_afn:
 ; VI-SDAG:       ; %bb.0:
@@ -2284,6 +2619,20 @@ define float @v_fabs_log2_f32_afn(float %in) {
 ; VI-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; VI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
+; VI-GISEL-LABEL: v_fabs_log2_f32_afn:
+; VI-GISEL:       ; %bb.0:
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
+; VI-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, v1
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; VI-GISEL-NEXT:    v_ldexp_f32 v0, |v0|, v1
+; VI-GISEL-NEXT:    v_log_f32_e32 v0, v0
+; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x42000000
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; VI-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX900-SDAG-LABEL: v_fabs_log2_f32_afn:
 ; GFX900-SDAG:       ; %bb.0:
 ; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2298,6 +2647,20 @@ define float @v_fabs_log2_f32_afn(float %in) {
 ; GFX900-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX900-GISEL-LABEL: v_fabs_log2_f32_afn:
+; GFX900-GISEL:       ; %bb.0:
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
+; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, v1
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX900-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; GFX900-GISEL-NEXT:    v_ldexp_f32 v0, |v0|, v1
+; GFX900-GISEL-NEXT:    v_log_f32_e32 v0, v0
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x42000000
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX900-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX1100-SDAG-LABEL: v_fabs_log2_f32_afn:
 ; GFX1100-SDAG:       ; %bb.0:
 ; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2318,10 +2681,11 @@ define float @v_fabs_log2_f32_afn(float %in) {
 ; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 s0, 0x800000, |v0|
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s0
-; GFX1100-GISEL-NEXT:    v_mul_f32_e64 v0, |v0|, v1
+; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s0
+; GFX1100-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1100-GISEL-NEXT:    v_ldexp_f32 v0, |v0|, v1
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 0x42000000, s0
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX1100-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; GFX1100-GISEL-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX1100-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
@@ -2382,19 +2746,19 @@ define float @v_log2_f32_nnan(float %in) {
 ; SI-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX689-GISEL-LABEL: v_log2_f32_nnan:
-; GFX689-GISEL:       ; %bb.0:
-; GFX689-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX689-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
-; GFX689-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
-; GFX689-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
-; GFX689-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; GFX689-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX689-GISEL-NEXT:    v_log_f32_e32 v0, v0
-; GFX689-GISEL-NEXT:    v_mov_b32_e32 v1, 0x42000000
-; GFX689-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
-; GFX689-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
-; GFX689-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; SI-GISEL-LABEL: v_log2_f32_nnan:
+; SI-GISEL:       ; %bb.0:
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
+; SI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; SI-GISEL-NEXT:    v_ldexp_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT:    v_log_f32_e32 v0, v0
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x42000000
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; SI-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SDAG-LABEL: v_log2_f32_nnan:
 ; VI-SDAG:       ; %bb.0:
@@ -2410,6 +2774,20 @@ define float @v_log2_f32_nnan(float %in) {
 ; VI-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; VI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
+; VI-GISEL-LABEL: v_log2_f32_nnan:
+; VI-GISEL:       ; %bb.0:
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
+; VI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; VI-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
+; VI-GISEL-NEXT:    v_log_f32_e32 v0, v0
+; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x42000000
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; VI-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX900-SDAG-LABEL: v_log2_f32_nnan:
 ; GFX900-SDAG:       ; %bb.0:
 ; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2424,6 +2802,20 @@ define float @v_log2_f32_nnan(float %in) {
 ; GFX900-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX900-GISEL-LABEL: v_log2_f32_nnan:
+; GFX900-GISEL:       ; %bb.0:
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
+; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX900-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; GFX900-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
+; GFX900-GISEL-NEXT:    v_log_f32_e32 v0, v0
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x42000000
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX900-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX1100-SDAG-LABEL: v_log2_f32_nnan:
 ; GFX1100-SDAG:       ; %bb.0:
 ; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2443,10 +2835,12 @@ define float @v_log2_f32_nnan(float %in) {
 ; GFX1100-GISEL:       ; %bb.0:
 ; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
-; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1100-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; GFX1100-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX1100-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; GFX1100-GISEL-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX1100-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
@@ -2506,19 +2900,19 @@ define float @v_log2_f32_nnan_dynamic(float %in) #1 {
 ; SI-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX689-GISEL-LABEL: v_log2_f32_nnan_dynamic:
-; GFX689-GISEL:       ; %bb.0:
-; GFX689-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX689-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
-; GFX689-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
-; GFX689-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
-; GFX689-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; GFX689-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX689-GISEL-NEXT:    v_log_f32_e32 v0, v0
-; GFX689-GISEL-NEXT:    v_mov_b32_e32 v1, 0x42000000
-; GFX689-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
-; GFX689-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
-; GFX689-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; SI-GISEL-LABEL: v_log2_f32_nnan_dynamic:
+; SI-GISEL:       ; %bb.0:
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
+; SI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; SI-GISEL-NEXT:    v_ldexp_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT:    v_log_f32_e32 v0, v0
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x42000000
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; SI-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SDAG-LABEL: v_log2_f32_nnan_dynamic:
 ; VI-SDAG:       ; %bb.0:
@@ -2534,6 +2928,20 @@ define float @v_log2_f32_nnan_dynamic(float %in) #1 {
 ; VI-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; VI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
+; VI-GISEL-LABEL: v_log2_f32_nnan_dynamic:
+; VI-GISEL:       ; %bb.0:
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
+; VI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; VI-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
+; VI-GISEL-NEXT:    v_log_f32_e32 v0, v0
+; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x42000000
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; VI-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX900-SDAG-LABEL: v_log2_f32_nnan_dynamic:
 ; GFX900-SDAG:       ; %bb.0:
 ; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2548,6 +2956,20 @@ define float @v_log2_f32_nnan_dynamic(float %in) #1 {
 ; GFX900-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX900-GISEL-LABEL: v_log2_f32_nnan_dynamic:
+; GFX900-GISEL:       ; %bb.0:
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
+; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX900-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; GFX900-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
+; GFX900-GISEL-NEXT:    v_log_f32_e32 v0, v0
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x42000000
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX900-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX1100-SDAG-LABEL: v_log2_f32_nnan_dynamic:
 ; GFX1100-SDAG:       ; %bb.0:
 ; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2567,10 +2989,12 @@ define float @v_log2_f32_nnan_dynamic(float %in) #1 {
 ; GFX1100-GISEL:       ; %bb.0:
 ; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
-; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1100-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; GFX1100-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX1100-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; GFX1100-GISEL-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX1100-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
@@ -2630,19 +3054,19 @@ define float @v_log2_f32_ninf_dynamic(float %in) #1 {
 ; SI-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX689-GISEL-LABEL: v_log2_f32_ninf_dynamic:
-; GFX689-GISEL:       ; %bb.0:
-; GFX689-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX689-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
-; GFX689-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
-; GFX689-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
-; GFX689-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; GFX689-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX689-GISEL-NEXT:    v_log_f32_e32 v0, v0
-; GFX689-GISEL-NEXT:    v_mov_b32_e32 v1, 0x42000000
-; GFX689-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
-; GFX689-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
-; GFX689-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; SI-GISEL-LABEL: v_log2_f32_ninf_dynamic:
+; SI-GISEL:       ; %bb.0:
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
+; SI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; SI-GISEL-NEXT:    v_ldexp_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT:    v_log_f32_e32 v0, v0
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x42000000
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; SI-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SDAG-LABEL: v_log2_f32_ninf_dynamic:
 ; VI-SDAG:       ; %bb.0:
@@ -2658,6 +3082,20 @@ define float @v_log2_f32_ninf_dynamic(float %in) #1 {
 ; VI-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; VI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
+; VI-GISEL-LABEL: v_log2_f32_ninf_dynamic:
+; VI-GISEL:       ; %bb.0:
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
+; VI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; VI-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
+; VI-GISEL-NEXT:    v_log_f32_e32 v0, v0
+; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x42000000
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; VI-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX900-SDAG-LABEL: v_log2_f32_ninf_dynamic:
 ; GFX900-SDAG:       ; %bb.0:
 ; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2672,6 +3110,20 @@ define float @v_log2_f32_ninf_dynamic(float %in) #1 {
 ; GFX900-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX900-GISEL-LABEL: v_log2_f32_ninf_dynamic:
+; GFX900-GISEL:       ; %bb.0:
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
+; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX900-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; GFX900-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
+; GFX900-GISEL-NEXT:    v_log_f32_e32 v0, v0
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x42000000
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX900-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX1100-SDAG-LABEL: v_log2_f32_ninf_dynamic:
 ; GFX1100-SDAG:       ; %bb.0:
 ; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2691,10 +3143,12 @@ define float @v_log2_f32_ninf_dynamic(float %in) #1 {
 ; GFX1100-GISEL:       ; %bb.0:
 ; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
-; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1100-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; GFX1100-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX1100-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; GFX1100-GISEL-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX1100-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
@@ -2728,19 +3182,19 @@ define float @v_log2_f32_nnan_ninf(float %in) {
 ; SI-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX689-GISEL-LABEL: v_log2_f32_nnan_ninf:
-; GFX689-GISEL:       ; %bb.0:
-; GFX689-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX689-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
-; GFX689-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
-; GFX689-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
-; GFX689-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; GFX689-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX689-GISEL-NEXT:    v_log_f32_e32 v0, v0
-; GFX689-GISEL-NEXT:    v_mov_b32_e32 v1, 0x42000000
-; GFX689-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
-; GFX689-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
-; GFX689-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; SI-GISEL-LABEL: v_log2_f32_nnan_ninf:
+; SI-GISEL:       ; %bb.0:
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
+; SI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; SI-GISEL-NEXT:    v_ldexp_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT:    v_log_f32_e32 v0, v0
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x42000000
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; SI-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SDAG-LABEL: v_log2_f32_nnan_ninf:
 ; VI-SDAG:       ; %bb.0:
@@ -2756,6 +3210,20 @@ define float @v_log2_f32_nnan_ninf(float %in) {
 ; VI-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; VI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
+; VI-GISEL-LABEL: v_log2_f32_nnan_ninf:
+; VI-GISEL:       ; %bb.0:
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
+; VI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; VI-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
+; VI-GISEL-NEXT:    v_log_f32_e32 v0, v0
+; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x42000000
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; VI-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX900-SDAG-LABEL: v_log2_f32_nnan_ninf:
 ; GFX900-SDAG:       ; %bb.0:
 ; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2770,6 +3238,20 @@ define float @v_log2_f32_nnan_ninf(float %in) {
 ; GFX900-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX900-GISEL-LABEL: v_log2_f32_nnan_ninf:
+; GFX900-GISEL:       ; %bb.0:
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
+; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX900-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; GFX900-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
+; GFX900-GISEL-NEXT:    v_log_f32_e32 v0, v0
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x42000000
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX900-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX1100-SDAG-LABEL: v_log2_f32_nnan_ninf:
 ; GFX1100-SDAG:       ; %bb.0:
 ; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2789,10 +3271,12 @@ define float @v_log2_f32_nnan_ninf(float %in) {
 ; GFX1100-GISEL:       ; %bb.0:
 ; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
-; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1100-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; GFX1100-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX1100-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; GFX1100-GISEL-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX1100-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
@@ -2852,19 +3336,19 @@ define float @v_log2_f32_nnan_ninf_dynamic(float %in) #1 {
 ; SI-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX689-GISEL-LABEL: v_log2_f32_nnan_ninf_dynamic:
-; GFX689-GISEL:       ; %bb.0:
-; GFX689-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX689-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
-; GFX689-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
-; GFX689-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
-; GFX689-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; GFX689-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX689-GISEL-NEXT:    v_log_f32_e32 v0, v0
-; GFX689-GISEL-NEXT:    v_mov_b32_e32 v1, 0x42000000
-; GFX689-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
-; GFX689-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
-; GFX689-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; SI-GISEL-LABEL: v_log2_f32_nnan_ninf_dynamic:
+; SI-GISEL:       ; %bb.0:
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
+; SI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; SI-GISEL-NEXT:    v_ldexp_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT:    v_log_f32_e32 v0, v0
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x42000000
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; SI-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SDAG-LABEL: v_log2_f32_nnan_ninf_dynamic:
 ; VI-SDAG:       ; %bb.0:
@@ -2880,6 +3364,20 @@ define float @v_log2_f32_nnan_ninf_dynamic(float %in) #1 {
 ; VI-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; VI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
+; VI-GISEL-LABEL: v_log2_f32_nnan_ninf_dynamic:
+; VI-GISEL:       ; %bb.0:
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
+; VI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; VI-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
+; VI-GISEL-NEXT:    v_log_f32_e32 v0, v0
+; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x42000000
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; VI-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX900-SDAG-LABEL: v_log2_f32_nnan_ninf_dynamic:
 ; GFX900-SDAG:       ; %bb.0:
 ; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2894,6 +3392,20 @@ define float @v_log2_f32_nnan_ninf_dynamic(float %in) #1 {
 ; GFX900-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX900-GISEL-LABEL: v_log2_f32_nnan_ninf_dynamic:
+; GFX900-GISEL:       ; %bb.0:
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
+; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX900-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; GFX900-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
+; GFX900-GISEL-NEXT:    v_log_f32_e32 v0, v0
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x42000000
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX900-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX1100-SDAG-LABEL: v_log2_f32_nnan_ninf_dynamic:
 ; GFX1100-SDAG:       ; %bb.0:
 ; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2913,10 +3425,12 @@ define float @v_log2_f32_nnan_ninf_dynamic(float %in) #1 {
 ; GFX1100-GISEL:       ; %bb.0:
 ; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
-; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1100-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; GFX1100-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX1100-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; GFX1100-GISEL-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX1100-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
@@ -2976,19 +3490,19 @@ define float @v_log2_f32_dynamic_mode(float %in) #1 {
 ; SI-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX689-GISEL-LABEL: v_log2_f32_dynamic_mode:
-; GFX689-GISEL:       ; %bb.0:
-; GFX689-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX689-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
-; GFX689-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
-; GFX689-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
-; GFX689-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; GFX689-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX689-GISEL-NEXT:    v_log_f32_e32 v0, v0
-; GFX689-GISEL-NEXT:    v_mov_b32_e32 v1, 0x42000000
-; GFX689-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
-; GFX689-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
-; GFX689-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; SI-GISEL-LABEL: v_log2_f32_dynamic_mode:
+; SI-GISEL:       ; %bb.0:
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
+; SI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; SI-GISEL-NEXT:    v_ldexp_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT:    v_log_f32_e32 v0, v0
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x42000000
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; SI-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SDAG-LABEL: v_log2_f32_dynamic_mode:
 ; VI-SDAG:       ; %bb.0:
@@ -3004,6 +3518,20 @@ define float @v_log2_f32_dynamic_mode(float %in) #1 {
 ; VI-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; VI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
+; VI-GISEL-LABEL: v_log2_f32_dynamic_mode:
+; VI-GISEL:       ; %bb.0:
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
+; VI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; VI-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
+; VI-GISEL-NEXT:    v_log_f32_e32 v0, v0
+; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x42000000
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; VI-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX900-SDAG-LABEL: v_log2_f32_dynamic_mode:
 ; GFX900-SDAG:       ; %bb.0:
 ; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3018,6 +3546,20 @@ define float @v_log2_f32_dynamic_mode(float %in) #1 {
 ; GFX900-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX900-GISEL-LABEL: v_log2_f32_dynamic_mode:
+; GFX900-GISEL:       ; %bb.0:
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
+; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX900-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; GFX900-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
+; GFX900-GISEL-NEXT:    v_log_f32_e32 v0, v0
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x42000000
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX900-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX1100-SDAG-LABEL: v_log2_f32_dynamic_mode:
 ; GFX1100-SDAG:       ; %bb.0:
 ; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3037,10 +3579,12 @@ define float @v_log2_f32_dynamic_mode(float %in) #1 {
 ; GFX1100-GISEL:       ; %bb.0:
 ; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
-; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1100-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
+; GFX1100-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX1100-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; GFX1100-GISEL-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX1100-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.rint.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.rint.f16.ll
index 4de0c54..795ed6d 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.rint.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.rint.f16.ll
@@ -3,6 +3,7 @@
 ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX89,VI %s
 ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX89,GFX9 %s
 ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1200 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX12 %s
 
 declare half @llvm.rint.f16(half %a)
 declare <2 x half> @llvm.rint.v2f16(<2 x half> %a)
@@ -63,6 +64,24 @@ define amdgpu_kernel void @rint_f16(
 ; GFX11-NEXT:    v_rndne_f16_e32 v0, v0
 ; GFX11-NEXT:    buffer_store_b16 v0, off, s[4:7], 0
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: rint_f16:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-NEXT:    s_mov_b32 s6, -1
+; GFX12-NEXT:    s_mov_b32 s7, 0x31016000
+; GFX12-NEXT:    s_mov_b32 s10, s6
+; GFX12-NEXT:    s_mov_b32 s11, s7
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    s_mov_b32 s8, s2
+; GFX12-NEXT:    s_mov_b32 s9, s3
+; GFX12-NEXT:    s_mov_b32 s4, s0
+; GFX12-NEXT:    buffer_load_u16 v0, off, s[8:11], null
+; GFX12-NEXT:    s_mov_b32 s5, s1
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    v_rndne_f16_e32 v0, v0
+; GFX12-NEXT:    buffer_store_b16 v0, off, s[4:7], null
+; GFX12-NEXT:    s_endpgm
     ptr addrspace(1) %r,
     ptr addrspace(1) %a) {
 entry:
@@ -168,6 +187,28 @@ define amdgpu_kernel void @rint_v2f16(
 ; GFX11-NEXT:    v_pack_b32_f16 v0, v0, v1
 ; GFX11-NEXT:    buffer_store_b32 v0, off, s[4:7], 0
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: rint_v2f16:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-NEXT:    s_mov_b32 s6, -1
+; GFX12-NEXT:    s_mov_b32 s7, 0x31016000
+; GFX12-NEXT:    s_mov_b32 s10, s6
+; GFX12-NEXT:    s_mov_b32 s11, s7
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    s_mov_b32 s8, s2
+; GFX12-NEXT:    s_mov_b32 s9, s3
+; GFX12-NEXT:    s_mov_b32 s4, s0
+; GFX12-NEXT:    buffer_load_b32 v0, off, s[8:11], null
+; GFX12-NEXT:    s_mov_b32 s5, s1
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX12-NEXT:    v_rndne_f16_e32 v0, v0
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT:    v_rndne_f16_e32 v1, v1
+; GFX12-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX12-NEXT:    buffer_store_b32 v0, off, s[4:7], null
+; GFX12-NEXT:    s_endpgm
     ptr addrspace(1) %r,
     ptr addrspace(1) %a) {
 entry:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll
index 2bb89fd..6927636 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll
@@ -4,6 +4,7 @@
 ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX9 %s
 ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX10 %s
 ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX11 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX12 %s
 
 define amdgpu_kernel void @sin_f16(ptr addrspace(1) %r, ptr addrspace(1) %a) {
 ; GFX6-LABEL: sin_f16:
@@ -80,6 +81,19 @@ define amdgpu_kernel void @sin_f16(ptr addrspace(1) %r, ptr addrspace(1) %a) {
 ; GFX11-NEXT:    v_sin_f16_e32 v1, v1
 ; GFX11-NEXT:    global_store_b16 v0, v1, s[0:1]
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: sin_f16:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    global_load_u16 v1, v0, s[2:3]
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    v_mul_f16_e32 v1, 0.15915494, v1
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT:    v_sin_f16_e32 v1, v1
+; GFX12-NEXT:    global_store_b16 v0, v1, s[0:1]
+; GFX12-NEXT:    s_endpgm
   %a.val = load half, ptr addrspace(1) %a
   %r.val = call half @llvm.sin.f16(half %a.val)
   store half %r.val, ptr addrspace(1) %r
@@ -188,6 +202,24 @@ define amdgpu_kernel void @sin_v2f16(ptr addrspace(1) %r, ptr addrspace(1) %a) {
 ; GFX11-NEXT:    v_pack_b32_f16 v1, v1, v2
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: sin_v2f16:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    global_load_b32 v1, v0, s[2:3]
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
+; GFX12-NEXT:    v_mul_f16_e32 v1, 0.15915494, v1
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT:    v_mul_f16_e32 v2, 0.15915494, v2
+; GFX12-NEXT:    v_sin_f16_e32 v1, v1
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1)
+; GFX12-NEXT:    v_sin_f16_e32 v2, v2
+; GFX12-NEXT:    v_pack_b32_f16 v1, v1, v2
+; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT:    s_endpgm
   %a.val = load <2 x half>, ptr addrspace(1) %a
   %r.val = call <2 x half> @llvm.sin.v2f16(<2 x half> %a.val)
   store <2 x half> %r.val, ptr addrspace(1) %r
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.trunc.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.trunc.f16.ll
index 47777e3..0d58afd 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.trunc.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.trunc.f16.ll
@@ -2,6 +2,7 @@
 ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefixes=SI %s
 ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=VI %s
 ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1200 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12 %s
 
 declare half @llvm.trunc.f16(half %a)
 declare <2 x half> @llvm.trunc.v2f16(<2 x half> %a)
@@ -62,6 +63,24 @@ define amdgpu_kernel void @trunc_f16(
 ; GFX11-NEXT:    v_trunc_f16_e32 v0, v0
 ; GFX11-NEXT:    buffer_store_b16 v0, off, s[4:7], 0
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: trunc_f16:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-NEXT:    s_mov_b32 s6, -1
+; GFX12-NEXT:    s_mov_b32 s7, 0x31016000
+; GFX12-NEXT:    s_mov_b32 s10, s6
+; GFX12-NEXT:    s_mov_b32 s11, s7
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    s_mov_b32 s8, s2
+; GFX12-NEXT:    s_mov_b32 s9, s3
+; GFX12-NEXT:    s_mov_b32 s4, s0
+; GFX12-NEXT:    buffer_load_u16 v0, off, s[8:11], null
+; GFX12-NEXT:    s_mov_b32 s5, s1
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    v_trunc_f16_e32 v0, v0
+; GFX12-NEXT:    buffer_store_b16 v0, off, s[4:7], null
+; GFX12-NEXT:    s_endpgm
     ptr addrspace(1) %r,
     ptr addrspace(1) %a) {
 entry:
@@ -147,6 +166,28 @@ define amdgpu_kernel void @trunc_v2f16(
 ; GFX11-NEXT:    v_pack_b32_f16 v0, v0, v1
 ; GFX11-NEXT:    buffer_store_b32 v0, off, s[4:7], 0
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: trunc_v2f16:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-NEXT:    s_mov_b32 s6, -1
+; GFX12-NEXT:    s_mov_b32 s7, 0x31016000
+; GFX12-NEXT:    s_mov_b32 s10, s6
+; GFX12-NEXT:    s_mov_b32 s11, s7
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    s_mov_b32 s8, s2
+; GFX12-NEXT:    s_mov_b32 s9, s3
+; GFX12-NEXT:    s_mov_b32 s4, s0
+; GFX12-NEXT:    buffer_load_b32 v0, off, s[8:11], null
+; GFX12-NEXT:    s_mov_b32 s5, s1
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX12-NEXT:    v_trunc_f16_e32 v0, v0
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT:    v_trunc_f16_e32 v1, v1
+; GFX12-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX12-NEXT:    buffer_store_b32 v0, off, s[4:7], null
+; GFX12-NEXT:    s_endpgm
     ptr addrspace(1) %r,
     ptr addrspace(1) %a) {
 entry:
diff --git a/llvm/test/CodeGen/AMDGPU/mad_64_32.ll b/llvm/test/CodeGen/AMDGPU/mad_64_32.ll
index 33007e5..3be17f9 100644
--- a/llvm/test/CodeGen/AMDGPU/mad_64_32.ll
+++ b/llvm/test/CodeGen/AMDGPU/mad_64_32.ll
@@ -1333,5 +1333,668 @@ define i48 @mad_i48_i48(i48 %arg0, i48 %arg1, i48 %arg2) #0 {
   ret i48 %a
 }
 
+define i64 @lshr_mad_i64_1(i64 %arg0, i64 %arg1) #0 {
+; CI-LABEL: lshr_mad_i64_1:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-NEXT:    s_movk_i32 s4, 0xfc19
+; CI-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v1, s4, v[0:1]
+; CI-NEXT:    v_sub_i32_e32 v1, vcc, v3, v1
+; CI-NEXT:    v_mov_b32_e32 v0, v2
+; CI-NEXT:    s_setpc_b64 s[30:31]
+;
+; SI-LABEL: lshr_mad_i64_1:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    s_movk_i32 s4, 0xfc19
+; SI-NEXT:    v_mul_hi_u32 v2, v1, s4
+; SI-NEXT:    v_mul_lo_u32 v3, v1, s4
+; SI-NEXT:    v_sub_i32_e32 v2, vcc, v2, v1
+; SI-NEXT:    v_add_i32_e32 v0, vcc, v3, v0
+; SI-NEXT:    v_addc_u32_e32 v1, vcc, v2, v1, vcc
+; SI-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: lshr_mad_i64_1:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_movk_i32 s4, 0xfc19
+; GFX9-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v1, s4, v[0:1]
+; GFX9-NEXT:    v_sub_u32_e32 v1, v3, v1
+; GFX9-NEXT:    v_mov_b32_e32 v0, v2
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: lshr_mad_i64_1:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_mad_u64_u32 v[2:3], null, 0xfffffc19, v1, v[0:1]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_sub_nc_u32_e32 v1, v3, v1
+; GFX11-NEXT:    v_mov_b32_e32 v0, v2
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: lshr_mad_i64_1:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_mad_co_u64_u32 v[2:3], null, 0xfffffc19, v1, v[0:1]
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT:    v_sub_nc_u32_e32 v1, v3, v1
+; GFX12-NEXT:    v_mov_b32_e32 v0, v2
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %lsh = lshr i64 %arg0, 32
+  %mul = mul i64 %lsh, s0xfffffffffffffc19
+  %mad = add i64 %mul, %arg0
+
+  ret i64 %mad
+}
+
+define i64 @lshr_mad_i64_2(i64 %arg0) #0 {
+; CI-LABEL: lshr_mad_i64_2:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-NEXT:    s_movk_i32 s4, 0xd1
+; CI-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v1, s4, v[0:1]
+; CI-NEXT:    v_sub_i32_e32 v1, vcc, v3, v1
+; CI-NEXT:    v_mov_b32_e32 v0, v2
+; CI-NEXT:    s_setpc_b64 s[30:31]
+;
+; SI-LABEL: lshr_mad_i64_2:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    s_movk_i32 s4, 0xd1
+; SI-NEXT:    v_mul_hi_u32 v2, v1, s4
+; SI-NEXT:    v_mul_lo_u32 v3, v1, s4
+; SI-NEXT:    v_sub_i32_e32 v2, vcc, v2, v1
+; SI-NEXT:    v_add_i32_e32 v0, vcc, v3, v0
+; SI-NEXT:    v_addc_u32_e32 v1, vcc, v2, v1, vcc
+; SI-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: lshr_mad_i64_2:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_movk_i32 s4, 0xd1
+; GFX9-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v1, s4, v[0:1]
+; GFX9-NEXT:    v_sub_u32_e32 v1, v3, v1
+; GFX9-NEXT:    v_mov_b32_e32 v0, v2
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: lshr_mad_i64_2:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_mad_u64_u32 v[2:3], null, 0xd1, v1, v[0:1]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_sub_nc_u32_e32 v1, v3, v1
+; GFX11-NEXT:    v_mov_b32_e32 v0, v2
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: lshr_mad_i64_2:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_mad_co_u64_u32 v[2:3], null, 0xd1, v1, v[0:1]
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT:    v_sub_nc_u32_e32 v1, v3, v1
+; GFX12-NEXT:    v_mov_b32_e32 v0, v2
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %lsh = lshr i64 %arg0, 32
+  %mul = mul i64 %lsh, s0xffffffff000000d1
+  %mad = add i64 %mul, %arg0
+
+  ret i64 %mad
+}
+
+define i64 @lshr_mad_i64_3(i64 %arg0) #0 {
+; CI-LABEL: lshr_mad_i64_3:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-NEXT:    s_movk_i32 s4, 0xfc88
+; CI-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v1, s4, v[0:1]
+; CI-NEXT:    v_sub_i32_e32 v1, vcc, v3, v1
+; CI-NEXT:    v_mov_b32_e32 v0, v2
+; CI-NEXT:    s_setpc_b64 s[30:31]
+;
+; SI-LABEL: lshr_mad_i64_3:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    s_movk_i32 s4, 0xfc88
+; SI-NEXT:    v_mul_hi_u32 v2, v1, s4
+; SI-NEXT:    v_mul_lo_u32 v3, v1, s4
+; SI-NEXT:    v_sub_i32_e32 v2, vcc, v2, v1
+; SI-NEXT:    v_add_i32_e32 v0, vcc, v3, v0
+; SI-NEXT:    v_addc_u32_e32 v1, vcc, v2, v1, vcc
+; SI-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: lshr_mad_i64_3:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_movk_i32 s4, 0xfc88
+; GFX9-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v1, s4, v[0:1]
+; GFX9-NEXT:    v_sub_u32_e32 v1, v3, v1
+; GFX9-NEXT:    v_mov_b32_e32 v0, v2
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: lshr_mad_i64_3:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_mad_u64_u32 v[2:3], null, 0xfffffc88, v1, v[0:1]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_sub_nc_u32_e32 v1, v3, v1
+; GFX11-NEXT:    v_mov_b32_e32 v0, v2
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: lshr_mad_i64_3:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_mad_co_u64_u32 v[2:3], null, 0xfffffc88, v1, v[0:1]
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT:    v_sub_nc_u32_e32 v1, v3, v1
+; GFX12-NEXT:    v_mov_b32_e32 v0, v2
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %lsh = lshr i64 %arg0, 32
+  %mul = mul i64 s0xfffffffffffffc88, %lsh
+  %mad = add i64 %mul, %arg0
+
+  ret i64 %mad
+}
+
+define i64 @lshr_mad_i64_4(i32 %arg0, i64 %arg1) #0 {
+; CI-LABEL: lshr_mad_i64_4:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-NEXT:    v_mul_lo_u32 v3, v2, v0
+; CI-NEXT:    v_mad_u64_u32 v[1:2], s[4:5], v1, v0, 0
+; CI-NEXT:    s_movk_i32 s4, 0xfc88
+; CI-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
+; CI-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v2, s4, v[1:2]
+; CI-NEXT:    v_sub_i32_e32 v1, vcc, v1, v2
+; CI-NEXT:    s_setpc_b64 s[30:31]
+;
+; SI-LABEL: lshr_mad_i64_4:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    v_mul_lo_u32 v2, v2, v0
+; SI-NEXT:    v_mul_hi_u32 v3, v1, v0
+; SI-NEXT:    s_movk_i32 s4, 0xfc88
+; SI-NEXT:    v_mul_lo_u32 v0, v1, v0
+; SI-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
+; SI-NEXT:    v_mul_hi_u32 v3, v2, s4
+; SI-NEXT:    v_mul_lo_u32 v1, v2, s4
+; SI-NEXT:    v_sub_i32_e32 v3, vcc, v3, v2
+; SI-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
+; SI-NEXT:    v_addc_u32_e32 v1, vcc, v3, v2, vcc
+; SI-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: lshr_mad_i64_4:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v1, v0, 0
+; GFX9-NEXT:    v_mov_b32_e32 v6, v5
+; GFX9-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v2, v0, v[6:7]
+; GFX9-NEXT:    v_mov_b32_e32 v5, v2
+; GFX9-NEXT:    s_movk_i32 s4, 0xfc88
+; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v2, s4, v[4:5]
+; GFX9-NEXT:    v_sub_u32_e32 v1, v1, v2
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: lshr_mad_i64_4:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_mad_u64_u32 v[3:4], null, v1, v0, 0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_mov_b32_e32 v1, v4
+; GFX11-NEXT:    v_mad_u64_u32 v[5:6], null, v2, v0, v[1:2]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_mov_b32_e32 v4, v5
+; GFX11-NEXT:    v_mad_u64_u32 v[0:1], null, 0xfffffc88, v5, v[3:4]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_sub_nc_u32_e32 v1, v1, v5
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: lshr_mad_i64_4:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_mad_co_u64_u32 v[3:4], null, v1, v0, 0
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT:    v_mov_b32_e32 v1, v4
+; GFX12-NEXT:    v_mad_co_u64_u32 v[5:6], null, v2, v0, v[1:2]
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT:    v_mov_b32_e32 v4, v5
+; GFX12-NEXT:    v_mad_co_u64_u32 v[0:1], null, 0xfffffc88, v5, v[3:4]
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT:    v_sub_nc_u32_e32 v1, v1, v5
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %ext = zext i32 %arg0 to i64
+  %mul1 = mul i64 %arg1, %ext
+  %lsh = lshr i64 %mul1, 32
+  %mul2 = mul i64 %lsh, s0xfffffffffffffc88
+  %mad = add i64 %mul2, %mul1
+  ret i64 %mad
+}
+
+define i64 @lshr_mad_i64_negative_1(i64 %arg0) #0 {
+; CI-LABEL: lshr_mad_i64_negative_1:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-NEXT:    v_lshrrev_b32_e32 v2, 4, v1
+; CI-NEXT:    s_movk_i32 s4, 0xfc19
+; CI-NEXT:    v_mad_i64_i32 v[0:1], s[4:5], v2, s4, v[0:1]
+; CI-NEXT:    s_setpc_b64 s[30:31]
+;
+; SI-LABEL: lshr_mad_i64_negative_1:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    v_lshrrev_b32_e32 v2, 4, v1
+; SI-NEXT:    s_movk_i32 s4, 0xfc19
+; SI-NEXT:    v_mul_lo_u32 v3, v2, s4
+; SI-NEXT:    v_mul_hi_i32 v2, v2, s4
+; SI-NEXT:    v_add_i32_e32 v0, vcc, v3, v0
+; SI-NEXT:    v_addc_u32_e32 v1, vcc, v2, v1, vcc
+; SI-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: lshr_mad_i64_negative_1:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 4, v1
+; GFX9-NEXT:    s_movk_i32 s4, 0xfc19
+; GFX9-NEXT:    v_mad_i64_i32 v[0:1], s[4:5], v2, s4, v[0:1]
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-LABEL: lshr_mad_i64_negative_1:
+; GFX1100:       ; %bb.0:
+; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-NEXT:    v_lshrrev_b32_e32 v4, 4, v1
+; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-NEXT:    v_mad_i64_i32 v[2:3], null, 0xfffffc19, v4, v[0:1]
+; GFX1100-NEXT:    v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3
+; GFX1100-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1150-LABEL: lshr_mad_i64_negative_1:
+; GFX1150:       ; %bb.0:
+; GFX1150-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1150-NEXT:    v_lshrrev_b32_e32 v2, 4, v1
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1150-NEXT:    v_mad_i64_i32 v[0:1], null, 0xfffffc19, v2, v[0:1]
+; GFX1150-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: lshr_mad_i64_negative_1:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_lshrrev_b32_e32 v2, 4, v1
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT:    v_mad_co_i64_i32 v[0:1], null, 0xfffffc19, v2, v[0:1]
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %lsh = lshr i64 %arg0, 36
+  %mul = mul i64 %lsh, s0xfffffffffffffc19
+  %mad = add i64 %mul, %arg0
+
+  ret i64 %mad
+}
+
+define i64 @lshr_mad_i64_negative_2(i64 %arg0) #0 {
+; CI-LABEL: lshr_mad_i64_negative_2:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-NEXT:    s_movk_i32 s4, 0xd1
+; CI-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v1, s4, v[0:1]
+; CI-NEXT:    v_lshlrev_b32_e32 v0, 8, v1
+; CI-NEXT:    v_sub_i32_e32 v1, vcc, v3, v0
+; CI-NEXT:    v_mov_b32_e32 v0, v2
+; CI-NEXT:    s_setpc_b64 s[30:31]
+;
+; SI-LABEL: lshr_mad_i64_negative_2:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    s_movk_i32 s4, 0xd1
+; SI-NEXT:    v_mul_hi_u32 v2, v1, s4
+; SI-NEXT:    v_mul_lo_u32 v4, v1, s4
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 8, v1
+; SI-NEXT:    v_sub_i32_e32 v2, vcc, v2, v3
+; SI-NEXT:    v_add_i32_e32 v0, vcc, v4, v0
+; SI-NEXT:    v_addc_u32_e32 v1, vcc, v2, v1, vcc
+; SI-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: lshr_mad_i64_negative_2:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_movk_i32 s4, 0xd1
+; GFX9-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v1, s4, v[0:1]
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 8, v1
+; GFX9-NEXT:    v_sub_u32_e32 v1, v3, v0
+; GFX9-NEXT:    v_mov_b32_e32 v0, v2
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: lshr_mad_i64_negative_2:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_mad_u64_u32 v[2:3], null, 0xd1, v1, v[0:1]
+; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 8, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_sub_nc_u32_e32 v1, v3, v0
+; GFX11-NEXT:    v_mov_b32_e32 v0, v2
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: lshr_mad_i64_negative_2:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_mad_co_u64_u32 v[2:3], null, 0xd1, v1, v[0:1]
+; GFX12-NEXT:    v_lshlrev_b32_e32 v0, 8, v1
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-NEXT:    v_sub_nc_u32_e32 v1, v3, v0
+; GFX12-NEXT:    v_mov_b32_e32 v0, v2
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %lsh = lshr i64 %arg0, 32
+  %mul = mul i64 %lsh, s0xffffff00000000d1
+  %mad = add i64 %mul, %arg0
+
+  ret i64 %mad
+}
+
+define i64 @lshr_mad_i64_negative_3(i64 %arg0) #0 {
+; CI-LABEL: lshr_mad_i64_negative_3:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-NEXT:    v_lshr_b64 v[2:3], v[0:1], 22
+; CI-NEXT:    v_and_b32_e32 v2, 0xfffffc00, v2
+; CI-NEXT:    v_sub_i32_e32 v0, vcc, v0, v2
+; CI-NEXT:    v_subb_u32_e32 v1, vcc, v1, v3, vcc
+; CI-NEXT:    v_add_i32_e32 v0, vcc, 1, v0
+; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; CI-NEXT:    s_setpc_b64 s[30:31]
+;
+; SI-LABEL: lshr_mad_i64_negative_3:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    v_lshr_b64 v[2:3], v[0:1], 22
+; SI-NEXT:    v_and_b32_e32 v2, 0xfffffc00, v2
+; SI-NEXT:    v_sub_i32_e32 v0, vcc, v0, v2
+; SI-NEXT:    v_subb_u32_e32 v1, vcc, v1, v3, vcc
+; SI-NEXT:    v_add_i32_e32 v0, vcc, 1, v0
+; SI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; SI-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: lshr_mad_i64_negative_3:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_lshrrev_b64 v[2:3], 22, v[0:1]
+; GFX9-NEXT:    v_and_b32_e32 v2, 0xfffffc00, v2
+; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, v0, v2
+; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v3, vcc
+; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 1, v0
+; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: lshr_mad_i64_negative_3:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_lshrrev_b64 v[2:3], 22, v[0:1]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xfffffc00, v2
+; GFX11-NEXT:    v_sub_co_u32 v0, vcc_lo, v0, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, v0, 1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: lshr_mad_i64_negative_3:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_lshrrev_b64 v[2:3], 22, v[0:1]
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT:    v_and_b32_e32 v2, 0xfffffc00, v2
+; GFX12-NEXT:    v_sub_co_u32 v0, vcc_lo, v0, v2
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX12-NEXT:    v_add_co_u32 v0, vcc_lo, v0, 1
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX12-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %op = add i64 %arg0, 1
+  %lsh = lshr i64 %arg0, 32
+  %mul = mul i64 %lsh, s0xfffffffffffffc00
+  %mad = add i64 %mul, %op
+
+  ret i64 %mad
+}
+
+define i64 @lshr_mad_i64_negative_4(i64 %arg0) #0 {
+; CI-LABEL: lshr_mad_i64_negative_4:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v1, v0, v[0:1]
+; CI-NEXT:    v_mul_lo_u32 v0, v1, v1
+; CI-NEXT:    v_add_i32_e32 v1, vcc, v0, v3
+; CI-NEXT:    v_mov_b32_e32 v0, v2
+; CI-NEXT:    s_setpc_b64 s[30:31]
+;
+; SI-LABEL: lshr_mad_i64_negative_4:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    v_mul_hi_u32 v2, v1, v0
+; SI-NEXT:    v_mul_lo_u32 v3, v1, v1
+; SI-NEXT:    v_mul_lo_u32 v4, v1, v0
+; SI-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
+; SI-NEXT:    v_add_i32_e32 v0, vcc, v4, v0
+; SI-NEXT:    v_addc_u32_e32 v1, vcc, v2, v1, vcc
+; SI-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: lshr_mad_i64_negative_4:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v1, v0, v[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v0, v3
+; GFX9-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v1, v1, v[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v0, v2
+; GFX9-NEXT:    v_mov_b32_e32 v1, v4
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-LABEL: lshr_mad_i64_negative_4:
+; GFX1100:       ; %bb.0:
+; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-NEXT:    v_mad_u64_u32 v[2:3], null, v1, v0, v[0:1]
+; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-NEXT:    v_mov_b32_e32 v0, v3
+; GFX1100-NEXT:    v_mad_u64_u32 v[3:4], null, v1, v1, v[0:1]
+; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1100-NEXT:    v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3
+; GFX1100-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1150-LABEL: lshr_mad_i64_negative_4:
+; GFX1150:       ; %bb.0:
+; GFX1150-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1150-NEXT:    v_mad_u64_u32 v[3:4], null, v1, v0, v[0:1]
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_mov_b32_e32 v0, v4
+; GFX1150-NEXT:    v_mad_u64_u32 v[1:2], null, v1, v1, v[0:1]
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX1150-NEXT:    v_mov_b32_e32 v0, v3
+; GFX1150-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: lshr_mad_i64_negative_4:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_mad_co_u64_u32 v[3:4], null, v1, v0, v[0:1]
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT:    v_mov_b32_e32 v0, v4
+; GFX12-NEXT:    v_mad_co_u64_u32 v[1:2], null, v1, v1, v[0:1]
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX12-NEXT:    v_mov_b32_e32 v0, v3
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %lsh = lshr i64 %arg0, 32
+  %mul = mul i64 %lsh, %arg0
+  %mad = add i64 %mul, %arg0
+
+  ret i64 %mad
+}
+
+define amdgpu_ps i64 @lshr_mad_i64_sgpr(i64 inreg %arg0) #0 {
+; CI-LABEL: lshr_mad_i64_sgpr:
+; CI:       ; %bb.0:
+; CI-NEXT:    v_mov_b32_e32 v0, s0
+; CI-NEXT:    v_mov_b32_e32 v2, 0xffff1c18
+; CI-NEXT:    v_mov_b32_e32 v1, s1
+; CI-NEXT:    v_mad_u64_u32 v[0:1], s[2:3], s1, v2, v[0:1]
+; CI-NEXT:    v_subrev_i32_e32 v1, vcc, s1, v1
+; CI-NEXT:    v_readfirstlane_b32 s0, v0
+; CI-NEXT:    v_readfirstlane_b32 s1, v1
+; CI-NEXT:    ; return to shader part epilog
+;
+; SI-LABEL: lshr_mad_i64_sgpr:
+; SI:       ; %bb.0:
+; SI-NEXT:    v_mov_b32_e32 v0, 0xffff1c18
+; SI-NEXT:    v_mul_hi_u32 v0, s1, v0
+; SI-NEXT:    s_mul_i32 s2, s1, 0xffff1c18
+; SI-NEXT:    v_readfirstlane_b32 s3, v0
+; SI-NEXT:    s_sub_i32 s3, s3, s1
+; SI-NEXT:    s_add_u32 s0, s2, s0
+; SI-NEXT:    s_addc_u32 s1, s3, s1
+; SI-NEXT:    ; return to shader part epilog
+;
+; GFX9-LABEL: lshr_mad_i64_sgpr:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_mul_hi_u32 s2, s1, 0xffff1c18
+; GFX9-NEXT:    s_sub_i32 s2, s2, s1
+; GFX9-NEXT:    s_mul_i32 s3, s1, 0xffff1c18
+; GFX9-NEXT:    s_add_u32 s0, s3, s0
+; GFX9-NEXT:    s_addc_u32 s1, s2, s1
+; GFX9-NEXT:    ; return to shader part epilog
+;
+; GFX11-LABEL: lshr_mad_i64_sgpr:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_mul_hi_u32 s2, s1, 0xffff1c18
+; GFX11-NEXT:    s_mul_i32 s3, s1, 0xffff1c18
+; GFX11-NEXT:    s_sub_i32 s2, s2, s1
+; GFX11-NEXT:    s_add_u32 s0, s3, s0
+; GFX11-NEXT:    s_addc_u32 s1, s2, s1
+; GFX11-NEXT:    ; return to shader part epilog
+;
+; GFX12-LABEL: lshr_mad_i64_sgpr:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_mov_b32 s4, 0xffff1c18
+; GFX12-NEXT:    s_mov_b32 s3, 0
+; GFX12-NEXT:    s_mov_b32 s2, s1
+; GFX12-NEXT:    s_mov_b32 s5, -1
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT:    s_mul_u64 s[2:3], s[2:3], s[4:5]
+; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[2:3], s[0:1]
+; GFX12-NEXT:    ; return to shader part epilog
+  %lsh = lshr i64 %arg0, 32
+  %mul = mul i64 %lsh, s0xffffffffffff1c18
+  %mad = add i64 %mul, %arg0
+
+  ret i64 %mad
+}
+
+define <2 x i64> @lshr_mad_i64_vec(<2 x i64> %arg0) #0 {
+; CI-LABEL: lshr_mad_i64_vec:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-NEXT:    s_mov_b32 s4, 0xffff1c18
+; CI-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v1, s4, v[0:1]
+; CI-NEXT:    s_mov_b32 s4, 0xffff1118
+; CI-NEXT:    v_mad_u64_u32 v[6:7], s[4:5], v3, s4, v[2:3]
+; CI-NEXT:    v_sub_i32_e32 v1, vcc, v5, v1
+; CI-NEXT:    v_sub_i32_e32 v3, vcc, v7, v3
+; CI-NEXT:    v_mov_b32_e32 v0, v4
+; CI-NEXT:    v_mov_b32_e32 v2, v6
+; CI-NEXT:    s_setpc_b64 s[30:31]
+;
+; SI-LABEL: lshr_mad_i64_vec:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    s_mov_b32 s4, 0xffff1118
+; SI-NEXT:    v_mul_lo_u32 v4, v3, s4
+; SI-NEXT:    v_mul_hi_u32 v5, v3, s4
+; SI-NEXT:    s_mov_b32 s4, 0xffff1c18
+; SI-NEXT:    v_mul_hi_u32 v6, v1, s4
+; SI-NEXT:    v_mul_lo_u32 v7, v1, s4
+; SI-NEXT:    v_sub_i32_e32 v5, vcc, v5, v3
+; SI-NEXT:    v_sub_i32_e32 v6, vcc, v6, v1
+; SI-NEXT:    v_add_i32_e32 v0, vcc, v7, v0
+; SI-NEXT:    v_addc_u32_e32 v1, vcc, v6, v1, vcc
+; SI-NEXT:    v_add_i32_e32 v2, vcc, v4, v2
+; SI-NEXT:    v_addc_u32_e32 v3, vcc, v5, v3, vcc
+; SI-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: lshr_mad_i64_vec:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_mov_b32 s4, 0xffff1c18
+; GFX9-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v1, s4, v[0:1]
+; GFX9-NEXT:    s_mov_b32 s4, 0xffff1118
+; GFX9-NEXT:    v_mad_u64_u32 v[6:7], s[4:5], v3, s4, v[2:3]
+; GFX9-NEXT:    v_sub_u32_e32 v1, v5, v1
+; GFX9-NEXT:    v_sub_u32_e32 v3, v7, v3
+; GFX9-NEXT:    v_mov_b32_e32 v0, v4
+; GFX9-NEXT:    v_mov_b32_e32 v2, v6
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: lshr_mad_i64_vec:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_mad_u64_u32 v[4:5], null, 0xffff1c18, v1, v[0:1]
+; GFX11-NEXT:    v_mad_u64_u32 v[6:7], null, 0xffff1118, v3, v[2:3]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_sub_nc_u32_e32 v1, v5, v1
+; GFX11-NEXT:    v_mov_b32_e32 v0, v4
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_sub_nc_u32_e32 v3, v7, v3
+; GFX11-NEXT:    v_mov_b32_e32 v2, v6
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: lshr_mad_i64_vec:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_mad_co_u64_u32 v[4:5], null, 0xffff1c18, v1, v[0:1]
+; GFX12-NEXT:    v_mad_co_u64_u32 v[6:7], null, 0xffff1118, v3, v[2:3]
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-NEXT:    v_sub_nc_u32_e32 v1, v5, v1
+; GFX12-NEXT:    v_mov_b32_e32 v0, v4
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-NEXT:    v_sub_nc_u32_e32 v3, v7, v3
+; GFX12-NEXT:    v_mov_b32_e32 v2, v6
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %lsh = lshr <2 x i64> %arg0, <i64 32, i64 32>
+  %mul = mul <2 x i64> %lsh, <i64 s0xffffffffffff1c18, i64 s0xffffffffffff1118>
+  %mad = add <2 x i64> %mul, %arg0
+
+  ret <2 x i64> %mad
+}
+
 attributes #0 = { nounwind }
 attributes #1 = { nounwind readnone speculatable }
diff --git a/llvm/test/CodeGen/AMDGPU/maximumnum.ll b/llvm/test/CodeGen/AMDGPU/maximumnum.ll
index 5e46fd6..fa15a42 100644
--- a/llvm/test/CodeGen/AMDGPU/maximumnum.ll
+++ b/llvm/test/CodeGen/AMDGPU/maximumnum.ll
@@ -1838,11 +1838,11 @@ define <3 x half> @v_maximumnum_v3f16(<3 x half> %x, <3 x half> %y) {
 ; GFX8-NEXT:    v_max_f16_e32 v2, v2, v2
 ; GFX8-NEXT:    v_max_f16_e32 v0, v0, v0
 ; GFX8-NEXT:    v_max_f16_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT:    v_max_f16_e32 v0, v0, v2
-; GFX8-NEXT:    v_max_f16_e32 v2, v3, v3
+; GFX8-NEXT:    v_max_f16_e32 v3, v3, v3
 ; GFX8-NEXT:    v_max_f16_e32 v1, v1, v1
+; GFX8-NEXT:    v_max_f16_e32 v0, v0, v2
+; GFX8-NEXT:    v_max_f16_e32 v1, v1, v3
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v4
-; GFX8-NEXT:    v_max_f16_e32 v1, v1, v2
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_maximumnum_v3f16:
@@ -1904,8 +1904,8 @@ define <3 x half> @v_maximumnum_v3f16_nnan(<3 x half> %x, <3 x half> %y) {
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_max_f16_sdwa v4, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-NEXT:    v_max_f16_e32 v0, v0, v2
-; GFX8-NEXT:    v_or_b32_e32 v0, v0, v4
 ; GFX8-NEXT:    v_max_f16_e32 v1, v1, v3
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v4
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_maximumnum_v3f16_nnan:
@@ -1947,20 +1947,20 @@ define <4 x half> @v_maximumnum_v4f16(<4 x half> %x, <4 x half> %y) {
 ; GFX8-LABEL: v_maximumnum_v4f16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT:    v_max_f16_sdwa v5, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT:    v_max_f16_e32 v2, v2, v2
-; GFX8-NEXT:    v_max_f16_e32 v0, v0, v0
+; GFX8-NEXT:    v_max_f16_sdwa v4, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT:    v_max_f16_sdwa v5, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-NEXT:    v_max_f16_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT:    v_max_f16_e32 v0, v0, v2
-; GFX8-NEXT:    v_or_b32_e32 v0, v0, v4
-; GFX8-NEXT:    v_max_f16_sdwa v2, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT:    v_max_f16_sdwa v4, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT:    v_max_f16_sdwa v5, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT:    v_max_f16_sdwa v6, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-NEXT:    v_max_f16_e32 v3, v3, v3
 ; GFX8-NEXT:    v_max_f16_e32 v1, v1, v1
-; GFX8-NEXT:    v_max_f16_sdwa v2, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT:    v_max_f16_e32 v2, v2, v2
+; GFX8-NEXT:    v_max_f16_e32 v0, v0, v0
+; GFX8-NEXT:    v_max_f16_sdwa v5, v6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX8-NEXT:    v_max_f16_e32 v1, v1, v3
-; GFX8-NEXT:    v_or_b32_e32 v1, v1, v2
+; GFX8-NEXT:    v_max_f16_e32 v0, v0, v2
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v5
+; GFX8-NEXT:    v_or_b32_e32 v1, v1, v4
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_maximumnum_v4f16:
@@ -2020,12 +2020,12 @@ define <4 x half> @v_maximumnum_v4f16_nnan(<4 x half> %x, <4 x half> %y) {
 ; GFX8-LABEL: v_maximumnum_v4f16_nnan:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_max_f16_sdwa v4, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT:    v_max_f16_e32 v0, v0, v2
-; GFX8-NEXT:    v_max_f16_sdwa v2, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT:    v_max_f16_sdwa v4, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT:    v_max_f16_sdwa v5, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-NEXT:    v_max_f16_e32 v1, v1, v3
-; GFX8-NEXT:    v_or_b32_e32 v0, v0, v4
-; GFX8-NEXT:    v_or_b32_e32 v1, v1, v2
+; GFX8-NEXT:    v_max_f16_e32 v0, v0, v2
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v5
+; GFX8-NEXT:    v_or_b32_e32 v1, v1, v4
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_maximumnum_v4f16_nnan:
@@ -2067,27 +2067,27 @@ define <6 x half> @v_maximumnum_v6f16(<6 x half> %x, <6 x half> %y) {
 ; GFX8-LABEL: v_maximumnum_v6f16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_max_f16_sdwa v6, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT:    v_max_f16_sdwa v7, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT:    v_max_f16_e32 v3, v3, v3
-; GFX8-NEXT:    v_max_f16_e32 v0, v0, v0
+; GFX8-NEXT:    v_max_f16_sdwa v6, v5, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT:    v_max_f16_sdwa v7, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-NEXT:    v_max_f16_sdwa v6, v7, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT:    v_max_f16_e32 v0, v0, v3
-; GFX8-NEXT:    v_or_b32_e32 v0, v0, v6
-; GFX8-NEXT:    v_max_f16_sdwa v3, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT:    v_max_f16_sdwa v6, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT:    v_max_f16_sdwa v7, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT:    v_max_f16_sdwa v8, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT:    v_max_f16_sdwa v7, v8, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT:    v_max_f16_sdwa v8, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT:    v_max_f16_sdwa v9, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT:    v_max_f16_e32 v5, v5, v5
+; GFX8-NEXT:    v_max_f16_e32 v2, v2, v2
 ; GFX8-NEXT:    v_max_f16_e32 v4, v4, v4
 ; GFX8-NEXT:    v_max_f16_e32 v1, v1, v1
-; GFX8-NEXT:    v_max_f16_sdwa v3, v6, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT:    v_max_f16_e32 v3, v3, v3
+; GFX8-NEXT:    v_max_f16_e32 v0, v0, v0
+; GFX8-NEXT:    v_max_f16_sdwa v8, v9, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT:    v_max_f16_e32 v2, v2, v5
 ; GFX8-NEXT:    v_max_f16_e32 v1, v1, v4
-; GFX8-NEXT:    v_or_b32_e32 v1, v1, v3
-; GFX8-NEXT:    v_max_f16_sdwa v3, v5, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT:    v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT:    v_max_f16_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT:    v_max_f16_e32 v4, v5, v5
-; GFX8-NEXT:    v_max_f16_e32 v2, v2, v2
-; GFX8-NEXT:    v_max_f16_e32 v2, v2, v4
-; GFX8-NEXT:    v_or_b32_e32 v2, v2, v3
+; GFX8-NEXT:    v_max_f16_e32 v0, v0, v3
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v8
+; GFX8-NEXT:    v_or_b32_e32 v1, v1, v7
+; GFX8-NEXT:    v_or_b32_e32 v2, v2, v6
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_maximumnum_v6f16:
@@ -2159,34 +2159,34 @@ define <8 x half> @v_maximumnum_v8f16(<8 x half> %x, <8 x half> %y) {
 ; GFX8-LABEL: v_maximumnum_v8f16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_max_f16_sdwa v8, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT:    v_max_f16_sdwa v9, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT:    v_max_f16_e32 v4, v4, v4
-; GFX8-NEXT:    v_max_f16_e32 v0, v0, v0
+; GFX8-NEXT:    v_max_f16_sdwa v8, v7, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT:    v_max_f16_sdwa v9, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-NEXT:    v_max_f16_sdwa v8, v9, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT:    v_max_f16_e32 v0, v0, v4
-; GFX8-NEXT:    v_or_b32_e32 v0, v0, v8
-; GFX8-NEXT:    v_max_f16_sdwa v4, v5, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT:    v_max_f16_sdwa v8, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT:    v_max_f16_sdwa v9, v6, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT:    v_max_f16_sdwa v10, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT:    v_max_f16_sdwa v9, v10, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT:    v_max_f16_sdwa v10, v5, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT:    v_max_f16_sdwa v11, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT:    v_max_f16_sdwa v10, v11, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT:    v_max_f16_sdwa v11, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT:    v_max_f16_sdwa v12, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT:    v_max_f16_e32 v7, v7, v7
+; GFX8-NEXT:    v_max_f16_e32 v3, v3, v3
+; GFX8-NEXT:    v_max_f16_e32 v6, v6, v6
+; GFX8-NEXT:    v_max_f16_e32 v2, v2, v2
 ; GFX8-NEXT:    v_max_f16_e32 v5, v5, v5
 ; GFX8-NEXT:    v_max_f16_e32 v1, v1, v1
-; GFX8-NEXT:    v_max_f16_sdwa v4, v8, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT:    v_max_f16_e32 v4, v4, v4
+; GFX8-NEXT:    v_max_f16_e32 v0, v0, v0
+; GFX8-NEXT:    v_max_f16_sdwa v11, v12, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT:    v_max_f16_e32 v3, v3, v7
+; GFX8-NEXT:    v_max_f16_e32 v2, v2, v6
 ; GFX8-NEXT:    v_max_f16_e32 v1, v1, v5
-; GFX8-NEXT:    v_or_b32_e32 v1, v1, v4
-; GFX8-NEXT:    v_max_f16_sdwa v4, v6, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT:    v_max_f16_sdwa v5, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT:    v_max_f16_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT:    v_max_f16_e32 v5, v6, v6
-; GFX8-NEXT:    v_max_f16_e32 v2, v2, v2
-; GFX8-NEXT:    v_max_f16_e32 v2, v2, v5
-; GFX8-NEXT:    v_or_b32_e32 v2, v2, v4
-; GFX8-NEXT:    v_max_f16_sdwa v4, v7, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT:    v_max_f16_sdwa v5, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT:    v_max_f16_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT:    v_max_f16_e32 v5, v7, v7
-; GFX8-NEXT:    v_max_f16_e32 v3, v3, v3
-; GFX8-NEXT:    v_max_f16_e32 v3, v3, v5
-; GFX8-NEXT:    v_or_b32_e32 v3, v3, v4
+; GFX8-NEXT:    v_max_f16_e32 v0, v0, v4
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v11
+; GFX8-NEXT:    v_or_b32_e32 v1, v1, v10
+; GFX8-NEXT:    v_or_b32_e32 v2, v2, v9
+; GFX8-NEXT:    v_or_b32_e32 v3, v3, v8
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_maximumnum_v8f16:
diff --git a/llvm/test/CodeGen/AMDGPU/minimumnum.ll b/llvm/test/CodeGen/AMDGPU/minimumnum.ll
index 9e0b7da..f5fb85d 100644
--- a/llvm/test/CodeGen/AMDGPU/minimumnum.ll
+++ b/llvm/test/CodeGen/AMDGPU/minimumnum.ll
@@ -1792,11 +1792,11 @@ define <3 x half> @v_minimumnum_v3f16(<3 x half> %x, <3 x half> %y) {
 ; GFX8-NEXT:    v_max_f16_e32 v2, v2, v2
 ; GFX8-NEXT:    v_max_f16_e32 v0, v0, v0
 ; GFX8-NEXT:    v_min_f16_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT:    v_min_f16_e32 v0, v0, v2
-; GFX8-NEXT:    v_max_f16_e32 v2, v3, v3
+; GFX8-NEXT:    v_max_f16_e32 v3, v3, v3
 ; GFX8-NEXT:    v_max_f16_e32 v1, v1, v1
+; GFX8-NEXT:    v_min_f16_e32 v0, v0, v2
+; GFX8-NEXT:    v_min_f16_e32 v1, v1, v3
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v4
-; GFX8-NEXT:    v_min_f16_e32 v1, v1, v2
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_minimumnum_v3f16:
@@ -1858,8 +1858,8 @@ define <3 x half> @v_minimumnum_v3f16_nnan(<3 x half> %x, <3 x half> %y) {
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_min_f16_sdwa v4, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-NEXT:    v_min_f16_e32 v0, v0, v2
-; GFX8-NEXT:    v_or_b32_e32 v0, v0, v4
 ; GFX8-NEXT:    v_min_f16_e32 v1, v1, v3
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v4
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_minimumnum_v3f16_nnan:
@@ -1901,20 +1901,20 @@ define <4 x half> @v_minimumnum_v4f16(<4 x half> %x, <4 x half> %y) {
 ; GFX8-LABEL: v_minimumnum_v4f16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT:    v_max_f16_sdwa v5, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT:    v_max_f16_e32 v2, v2, v2
-; GFX8-NEXT:    v_max_f16_e32 v0, v0, v0
+; GFX8-NEXT:    v_max_f16_sdwa v4, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT:    v_max_f16_sdwa v5, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-NEXT:    v_min_f16_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT:    v_min_f16_e32 v0, v0, v2
-; GFX8-NEXT:    v_or_b32_e32 v0, v0, v4
-; GFX8-NEXT:    v_max_f16_sdwa v2, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT:    v_max_f16_sdwa v4, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT:    v_max_f16_sdwa v5, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT:    v_max_f16_sdwa v6, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-NEXT:    v_max_f16_e32 v3, v3, v3
 ; GFX8-NEXT:    v_max_f16_e32 v1, v1, v1
-; GFX8-NEXT:    v_min_f16_sdwa v2, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT:    v_max_f16_e32 v2, v2, v2
+; GFX8-NEXT:    v_max_f16_e32 v0, v0, v0
+; GFX8-NEXT:    v_min_f16_sdwa v5, v6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX8-NEXT:    v_min_f16_e32 v1, v1, v3
-; GFX8-NEXT:    v_or_b32_e32 v1, v1, v2
+; GFX8-NEXT:    v_min_f16_e32 v0, v0, v2
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v5
+; GFX8-NEXT:    v_or_b32_e32 v1, v1, v4
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_minimumnum_v4f16:
@@ -1974,12 +1974,12 @@ define <4 x half> @v_minimumnum_v4f16_nnan(<4 x half> %x, <4 x half> %y) {
 ; GFX8-LABEL: v_minimumnum_v4f16_nnan:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_min_f16_sdwa v4, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT:    v_min_f16_e32 v0, v0, v2
-; GFX8-NEXT:    v_min_f16_sdwa v2, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT:    v_min_f16_sdwa v4, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT:    v_min_f16_sdwa v5, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-NEXT:    v_min_f16_e32 v1, v1, v3
-; GFX8-NEXT:    v_or_b32_e32 v0, v0, v4
-; GFX8-NEXT:    v_or_b32_e32 v1, v1, v2
+; GFX8-NEXT:    v_min_f16_e32 v0, v0, v2
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v5
+; GFX8-NEXT:    v_or_b32_e32 v1, v1, v4
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_minimumnum_v4f16_nnan:
@@ -2021,27 +2021,27 @@ define <6 x half> @v_minimumnum_v6f16(<6 x half> %x, <6 x half> %y) {
 ; GFX8-LABEL: v_minimumnum_v6f16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_max_f16_sdwa v6, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT:    v_max_f16_sdwa v7, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT:    v_max_f16_e32 v3, v3, v3
-; GFX8-NEXT:    v_max_f16_e32 v0, v0, v0
+; GFX8-NEXT:    v_max_f16_sdwa v6, v5, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT:    v_max_f16_sdwa v7, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-NEXT:    v_min_f16_sdwa v6, v7, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT:    v_min_f16_e32 v0, v0, v3
-; GFX8-NEXT:    v_or_b32_e32 v0, v0, v6
-; GFX8-NEXT:    v_max_f16_sdwa v3, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT:    v_max_f16_sdwa v6, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT:    v_max_f16_sdwa v7, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT:    v_max_f16_sdwa v8, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT:    v_min_f16_sdwa v7, v8, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT:    v_max_f16_sdwa v8, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT:    v_max_f16_sdwa v9, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT:    v_max_f16_e32 v5, v5, v5
+; GFX8-NEXT:    v_max_f16_e32 v2, v2, v2
 ; GFX8-NEXT:    v_max_f16_e32 v4, v4, v4
 ; GFX8-NEXT:    v_max_f16_e32 v1, v1, v1
-; GFX8-NEXT:    v_min_f16_sdwa v3, v6, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT:    v_max_f16_e32 v3, v3, v3
+; GFX8-NEXT:    v_max_f16_e32 v0, v0, v0
+; GFX8-NEXT:    v_min_f16_sdwa v8, v9, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT:    v_min_f16_e32 v2, v2, v5
 ; GFX8-NEXT:    v_min_f16_e32 v1, v1, v4
-; GFX8-NEXT:    v_or_b32_e32 v1, v1, v3
-; GFX8-NEXT:    v_max_f16_sdwa v3, v5, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT:    v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT:    v_min_f16_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT:    v_max_f16_e32 v4, v5, v5
-; GFX8-NEXT:    v_max_f16_e32 v2, v2, v2
-; GFX8-NEXT:    v_min_f16_e32 v2, v2, v4
-; GFX8-NEXT:    v_or_b32_e32 v2, v2, v3
+; GFX8-NEXT:    v_min_f16_e32 v0, v0, v3
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v8
+; GFX8-NEXT:    v_or_b32_e32 v1, v1, v7
+; GFX8-NEXT:    v_or_b32_e32 v2, v2, v6
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_minimumnum_v6f16:
@@ -2113,34 +2113,34 @@ define <8 x half> @v_minimumnum_v8f16(<8 x half> %x, <8 x half> %y) {
 ; GFX8-LABEL: v_minimumnum_v8f16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_max_f16_sdwa v8, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT:    v_max_f16_sdwa v9, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT:    v_max_f16_e32 v4, v4, v4
-; GFX8-NEXT:    v_max_f16_e32 v0, v0, v0
+; GFX8-NEXT:    v_max_f16_sdwa v8, v7, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT:    v_max_f16_sdwa v9, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-NEXT:    v_min_f16_sdwa v8, v9, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT:    v_min_f16_e32 v0, v0, v4
-; GFX8-NEXT:    v_or_b32_e32 v0, v0, v8
-; GFX8-NEXT:    v_max_f16_sdwa v4, v5, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT:    v_max_f16_sdwa v8, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT:    v_max_f16_sdwa v9, v6, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT:    v_max_f16_sdwa v10, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT:    v_min_f16_sdwa v9, v10, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT:    v_max_f16_sdwa v10, v5, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT:    v_max_f16_sdwa v11, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT:    v_min_f16_sdwa v10, v11, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT:    v_max_f16_sdwa v11, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT:    v_max_f16_sdwa v12, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT:    v_max_f16_e32 v7, v7, v7
+; GFX8-NEXT:    v_max_f16_e32 v3, v3, v3
+; GFX8-NEXT:    v_max_f16_e32 v6, v6, v6
+; GFX8-NEXT:    v_max_f16_e32 v2, v2, v2
 ; GFX8-NEXT:    v_max_f16_e32 v5, v5, v5
 ; GFX8-NEXT:    v_max_f16_e32 v1, v1, v1
-; GFX8-NEXT:    v_min_f16_sdwa v4, v8, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT:    v_max_f16_e32 v4, v4, v4
+; GFX8-NEXT:    v_max_f16_e32 v0, v0, v0
+; GFX8-NEXT:    v_min_f16_sdwa v11, v12, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT:    v_min_f16_e32 v3, v3, v7
+; GFX8-NEXT:    v_min_f16_e32 v2, v2, v6
 ; GFX8-NEXT:    v_min_f16_e32 v1, v1, v5
-; GFX8-NEXT:    v_or_b32_e32 v1, v1, v4
-; GFX8-NEXT:    v_max_f16_sdwa v4, v6, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT:    v_max_f16_sdwa v5, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT:    v_min_f16_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT:    v_max_f16_e32 v5, v6, v6
-; GFX8-NEXT:    v_max_f16_e32 v2, v2, v2
-; GFX8-NEXT:    v_min_f16_e32 v2, v2, v5
-; GFX8-NEXT:    v_or_b32_e32 v2, v2, v4
-; GFX8-NEXT:    v_max_f16_sdwa v4, v7, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT:    v_max_f16_sdwa v5, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT:    v_min_f16_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT:    v_max_f16_e32 v5, v7, v7
-; GFX8-NEXT:    v_max_f16_e32 v3, v3, v3
-; GFX8-NEXT:    v_min_f16_e32 v3, v3, v5
-; GFX8-NEXT:    v_or_b32_e32 v3, v3, v4
+; GFX8-NEXT:    v_min_f16_e32 v0, v0, v4
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v11
+; GFX8-NEXT:    v_or_b32_e32 v1, v1, v10
+; GFX8-NEXT:    v_or_b32_e32 v2, v2, v9
+; GFX8-NEXT:    v_or_b32_e32 v3, v3, v8
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_minimumnum_v8f16:
diff --git a/llvm/test/CodeGen/AMDGPU/minmax.ll b/llvm/test/CodeGen/AMDGPU/minmax.ll
index 73f3d4c..774a22f 100644
--- a/llvm/test/CodeGen/AMDGPU/minmax.ll
+++ b/llvm/test/CodeGen/AMDGPU/minmax.ll
@@ -1,6 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX11,SDAG %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX11,GISEL %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX11,SDAG,SDAG-GFX11 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX11,GISEL,GISEL-GFX11 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX12,SDAG,SDAG-GFX12 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX12,GISEL,GISEL-GFX12 %s
 
 define i32 @test_minmax_i32(i32 %a, i32 %b, i32 %c) {
 ; GFX11-LABEL: test_minmax_i32:
@@ -8,6 +10,16 @@ define i32 @test_minmax_i32(i32 %a, i32 %b, i32 %c) {
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_maxmin_i32 v0, v0, v1, v2
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: test_minmax_i32:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_maxmin_i32 v0, v0, v1, v2
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
   %smax = call i32 @llvm.smax.i32(i32 %a, i32 %b)
   %sminmax = call i32 @llvm.smin.i32(i32 %smax, i32 %c)
   ret i32 %sminmax
@@ -45,6 +57,16 @@ define i32 @test_minmax_commuted_i32(i32 %a, i32 %b, i32 %c) {
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_maxmin_i32 v0, v0, v1, v2
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: test_minmax_commuted_i32:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_maxmin_i32 v0, v0, v1, v2
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
   %smax = call i32 @llvm.smax.i32(i32 %a, i32 %b)
   %sminmax = call i32 @llvm.smin.i32(i32 %c, i32 %smax)
   ret i32 %sminmax
@@ -56,6 +78,16 @@ define i32 @test_maxmin_i32(i32 %a, i32 %b, i32 %c) {
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_minmax_i32 v0, v0, v1, v2
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: test_maxmin_i32:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_minmax_i32 v0, v0, v1, v2
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
   %smin = call i32 @llvm.smin.i32(i32 %a, i32 %b)
   %smaxmin = call i32 @llvm.smax.i32(i32 %smin, i32 %c)
   ret i32 %smaxmin
@@ -67,6 +99,16 @@ define i32 @test_maxmin_commuted_i32(i32 %a, i32 %b, i32 %c) {
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_minmax_i32 v0, v0, v1, v2
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: test_maxmin_commuted_i32:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_minmax_i32 v0, v0, v1, v2
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
   %smin = call i32 @llvm.smin.i32(i32 %a, i32 %b)
   %smaxmin = call i32 @llvm.smax.i32(i32 %c, i32 %smin)
   ret i32 %smaxmin
@@ -79,6 +121,17 @@ define void @test_smed3_i32(ptr addrspace(1) %arg, i32 %x, i32 %y, i32 %z) {
 ; GFX11-NEXT:    v_med3_i32 v2, v2, v3, v4
 ; GFX11-NEXT:    global_store_b32 v[0:1], v2, off
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: test_smed3_i32:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_med3_i32 v2, v2, v3, v4
+; GFX12-NEXT:    global_store_b32 v[0:1], v2, off
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
   %tmp0 = call i32 @llvm.smin.i32(i32 %x, i32 %y)
   %tmp1 = call i32 @llvm.smax.i32(i32 %x, i32 %y)
   %tmp2 = call i32 @llvm.smin.i32(i32 %tmp1, i32 %z)
@@ -93,6 +146,16 @@ define i32 @test_minmax_u32(i32 %a, i32 %b, i32 %c) {
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_maxmin_u32 v0, v0, v1, v2
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: test_minmax_u32:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_maxmin_u32 v0, v0, v1, v2
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
   %umax = call i32 @llvm.umax.i32(i32 %a, i32 %b)
   %uminmax = call i32 @llvm.umin.i32(i32 %umax, i32 %c)
   ret i32 %uminmax
@@ -130,6 +193,16 @@ define i32 @test_minmax_commuted_u32(i32 %a, i32 %b, i32 %c) {
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_maxmin_u32 v0, v0, v1, v2
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: test_minmax_commuted_u32:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_maxmin_u32 v0, v0, v1, v2
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
   %umax = call i32 @llvm.umax.i32(i32 %a, i32 %b)
   %uminmax = call i32 @llvm.umin.i32(i32 %c, i32 %umax)
   ret i32 %uminmax
@@ -141,6 +214,16 @@ define i32 @test_maxmin_u32(i32 %a, i32 %b, i32 %c) {
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_minmax_u32 v0, v0, v1, v2
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: test_maxmin_u32:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_minmax_u32 v0, v0, v1, v2
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
   %umin = call i32 @llvm.umin.i32(i32 %a, i32 %b)
   %umaxmin = call i32 @llvm.umax.i32(i32 %umin, i32 %c)
   ret i32 %umaxmin
@@ -152,6 +235,16 @@ define i32 @test_maxmin_commuted_u32(i32 %a, i32 %b, i32 %c) {
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_minmax_u32 v0, v0, v1, v2
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: test_maxmin_commuted_u32:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_minmax_u32 v0, v0, v1, v2
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
   %umin = call i32 @llvm.umin.i32(i32 %a, i32 %b)
   %umaxmin = call i32 @llvm.umax.i32(i32 %c, i32 %umin)
   ret i32 %umaxmin
@@ -164,6 +257,17 @@ define void @test_umed3_i32(ptr addrspace(1) %arg, i32 %x, i32 %y, i32 %z) {
 ; GFX11-NEXT:    v_med3_u32 v2, v2, v3, v4
 ; GFX11-NEXT:    global_store_b32 v[0:1], v2, off
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: test_umed3_i32:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_med3_u32 v2, v2, v3, v4
+; GFX12-NEXT:    global_store_b32 v[0:1], v2, off
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
   %tmp0 = call i32 @llvm.umin.i32(i32 %x, i32 %y)
   %tmp1 = call i32 @llvm.umax.i32(i32 %x, i32 %y)
   %tmp2 = call i32 @llvm.umin.i32(i32 %tmp1, i32 %z)
@@ -173,44 +277,88 @@ define void @test_umed3_i32(ptr addrspace(1) %arg, i32 %x, i32 %y, i32 %z) {
 }
 
 define float @test_minmax_f32_ieee_true(float %a, float %b, float %c) {
-; SDAG-LABEL: test_minmax_f32_ieee_true:
-; SDAG:       ; %bb.0:
-; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_dual_max_f32 v1, v1, v1 :: v_dual_max_f32 v0, v0, v0
-; SDAG-NEXT:    v_max_f32_e32 v2, v2, v2
-; SDAG-NEXT:    v_maxmin_f32 v0, v0, v1, v2
-; SDAG-NEXT:    s_setpc_b64 s[30:31]
+; SDAG-GFX11-LABEL: test_minmax_f32_ieee_true:
+; SDAG-GFX11:       ; %bb.0:
+; SDAG-GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-GFX11-NEXT:    v_dual_max_f32 v1, v1, v1 :: v_dual_max_f32 v0, v0, v0
+; SDAG-GFX11-NEXT:    v_max_f32_e32 v2, v2, v2
+; SDAG-GFX11-NEXT:    v_maxmin_f32 v0, v0, v1, v2
+; SDAG-GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GISEL-LABEL: test_minmax_f32_ieee_true:
-; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    v_dual_max_f32 v0, v0, v0 :: v_dual_max_f32 v1, v1, v1
-; GISEL-NEXT:    v_max_f32_e32 v2, v2, v2
-; GISEL-NEXT:    v_maxmin_f32 v0, v0, v1, v2
-; GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GISEL-GFX11-LABEL: test_minmax_f32_ieee_true:
+; GISEL-GFX11:       ; %bb.0:
+; GISEL-GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-GFX11-NEXT:    v_dual_max_f32 v0, v0, v0 :: v_dual_max_f32 v1, v1, v1
+; GISEL-GFX11-NEXT:    v_max_f32_e32 v2, v2, v2
+; GISEL-GFX11-NEXT:    v_maxmin_f32 v0, v0, v1, v2
+; GISEL-GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; SDAG-GFX12-LABEL: test_minmax_f32_ieee_true:
+; SDAG-GFX12:       ; %bb.0:
+; SDAG-GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; SDAG-GFX12-NEXT:    s_wait_expcnt 0x0
+; SDAG-GFX12-NEXT:    s_wait_samplecnt 0x0
+; SDAG-GFX12-NEXT:    s_wait_bvhcnt 0x0
+; SDAG-GFX12-NEXT:    s_wait_kmcnt 0x0
+; SDAG-GFX12-NEXT:    v_dual_max_num_f32 v1, v1, v1 :: v_dual_max_num_f32 v0, v0, v0
+; SDAG-GFX12-NEXT:    v_max_num_f32_e32 v2, v2, v2
+; SDAG-GFX12-NEXT:    v_maxmin_num_f32 v0, v0, v1, v2
+; SDAG-GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-GFX12-LABEL: test_minmax_f32_ieee_true:
+; GISEL-GFX12:       ; %bb.0:
+; GISEL-GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GISEL-GFX12-NEXT:    s_wait_expcnt 0x0
+; GISEL-GFX12-NEXT:    s_wait_samplecnt 0x0
+; GISEL-GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GISEL-GFX12-NEXT:    s_wait_kmcnt 0x0
+; GISEL-GFX12-NEXT:    v_dual_max_num_f32 v0, v0, v0 :: v_dual_max_num_f32 v1, v1, v1
+; GISEL-GFX12-NEXT:    v_max_num_f32_e32 v2, v2, v2
+; GISEL-GFX12-NEXT:    v_maxmin_num_f32 v0, v0, v1, v2
+; GISEL-GFX12-NEXT:    s_setpc_b64 s[30:31]
   %max = call float @llvm.maxnum.f32(float %a, float %b)
   %minmax = call float @llvm.minnum.f32(float %max, float %c)
   ret float %minmax
 }
 
 define amdgpu_ps void @s_test_minmax_f32_ieee_false(float inreg %a, float inreg %b, float inreg %c, ptr addrspace(1) inreg %out) {
-; SDAG-LABEL: s_test_minmax_f32_ieee_false:
-; SDAG:       ; %bb.0:
-; SDAG-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, 0
-; SDAG-NEXT:    s_mov_b32 s5, s4
-; SDAG-NEXT:    s_mov_b32 s4, s3
-; SDAG-NEXT:    v_maxmin_f32 v0, s0, s1, v0
-; SDAG-NEXT:    global_store_b32 v1, v0, s[4:5]
-; SDAG-NEXT:    s_endpgm
+; SDAG-GFX11-LABEL: s_test_minmax_f32_ieee_false:
+; SDAG-GFX11:       ; %bb.0:
+; SDAG-GFX11-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, 0
+; SDAG-GFX11-NEXT:    s_mov_b32 s5, s4
+; SDAG-GFX11-NEXT:    s_mov_b32 s4, s3
+; SDAG-GFX11-NEXT:    v_maxmin_f32 v0, s0, s1, v0
+; SDAG-GFX11-NEXT:    global_store_b32 v1, v0, s[4:5]
+; SDAG-GFX11-NEXT:    s_endpgm
 ;
-; GISEL-LABEL: s_test_minmax_f32_ieee_false:
-; GISEL:       ; %bb.0:
-; GISEL-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, 0
-; GISEL-NEXT:    s_mov_b32 s6, s3
-; GISEL-NEXT:    s_mov_b32 s7, s4
-; GISEL-NEXT:    v_maxmin_f32 v0, s0, s1, v0
-; GISEL-NEXT:    global_store_b32 v1, v0, s[6:7]
-; GISEL-NEXT:    s_endpgm
+; GISEL-GFX11-LABEL: s_test_minmax_f32_ieee_false:
+; GISEL-GFX11:       ; %bb.0:
+; GISEL-GFX11-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, 0
+; GISEL-GFX11-NEXT:    s_mov_b32 s6, s3
+; GISEL-GFX11-NEXT:    s_mov_b32 s7, s4
+; GISEL-GFX11-NEXT:    v_maxmin_f32 v0, s0, s1, v0
+; GISEL-GFX11-NEXT:    global_store_b32 v1, v0, s[6:7]
+; GISEL-GFX11-NEXT:    s_endpgm
+;
+; SDAG-GFX12-LABEL: s_test_minmax_f32_ieee_false:
+; SDAG-GFX12:       ; %bb.0:
+; SDAG-GFX12-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, 0
+; SDAG-GFX12-NEXT:    s_mov_b32 s5, s4
+; SDAG-GFX12-NEXT:    s_mov_b32 s4, s3
+; SDAG-GFX12-NEXT:    v_maxmin_num_f32 v0, s0, s1, v0
+; SDAG-GFX12-NEXT:    global_store_b32 v1, v0, s[4:5]
+; SDAG-GFX12-NEXT:    s_endpgm
+;
+; GISEL-GFX12-LABEL: s_test_minmax_f32_ieee_false:
+; GISEL-GFX12:       ; %bb.0:
+; GISEL-GFX12-NEXT:    s_max_num_f32 s0, s0, s1
+; GISEL-GFX12-NEXT:    s_mov_b32 s6, s3
+; GISEL-GFX12-NEXT:    s_mov_b32 s7, s4
+; GISEL-GFX12-NEXT:    v_mov_b32_e32 v1, 0
+; GISEL-GFX12-NEXT:    s_min_num_f32 s0, s0, s2
+; GISEL-GFX12-NEXT:    v_mov_b32_e32 v0, s0
+; GISEL-GFX12-NEXT:    global_store_b32 v1, v0, s[6:7]
+; GISEL-GFX12-NEXT:    s_endpgm
   %smax = call float @llvm.maxnum.f32(float %a, float %b)
   %sminmax = call float @llvm.minnum.f32(float %smax, float %c)
   store float %sminmax, ptr addrspace(1) %out
@@ -222,27 +370,56 @@ define amdgpu_ps float @test_minmax_commuted_f32_ieee_false(float %a, float %b,
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    v_maxmin_f32 v0, v0, v1, v2
 ; GFX11-NEXT:    ; return to shader part epilog
+;
+; GFX12-LABEL: test_minmax_commuted_f32_ieee_false:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    v_maxmin_num_f32 v0, v0, v1, v2
+; GFX12-NEXT:    ; return to shader part epilog
   %max = call float @llvm.maxnum.f32(float %a, float %b)
   %minmax = call float @llvm.minnum.f32(float %c, float %max)
   ret float %minmax
 }
 
 define float @test_maxmin_f32_ieee_true(float %a, float %b, float %c) {
-; SDAG-LABEL: test_maxmin_f32_ieee_true:
-; SDAG:       ; %bb.0:
-; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_dual_max_f32 v1, v1, v1 :: v_dual_max_f32 v0, v0, v0
-; SDAG-NEXT:    v_max_f32_e32 v2, v2, v2
-; SDAG-NEXT:    v_minmax_f32 v0, v0, v1, v2
-; SDAG-NEXT:    s_setpc_b64 s[30:31]
+; SDAG-GFX11-LABEL: test_maxmin_f32_ieee_true:
+; SDAG-GFX11:       ; %bb.0:
+; SDAG-GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-GFX11-NEXT:    v_dual_max_f32 v1, v1, v1 :: v_dual_max_f32 v0, v0, v0
+; SDAG-GFX11-NEXT:    v_max_f32_e32 v2, v2, v2
+; SDAG-GFX11-NEXT:    v_minmax_f32 v0, v0, v1, v2
+; SDAG-GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GISEL-LABEL: test_maxmin_f32_ieee_true:
-; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    v_dual_max_f32 v0, v0, v0 :: v_dual_max_f32 v1, v1, v1
-; GISEL-NEXT:    v_max_f32_e32 v2, v2, v2
-; GISEL-NEXT:    v_minmax_f32 v0, v0, v1, v2
-; GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GISEL-GFX11-LABEL: test_maxmin_f32_ieee_true:
+; GISEL-GFX11:       ; %bb.0:
+; GISEL-GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-GFX11-NEXT:    v_dual_max_f32 v0, v0, v0 :: v_dual_max_f32 v1, v1, v1
+; GISEL-GFX11-NEXT:    v_max_f32_e32 v2, v2, v2
+; GISEL-GFX11-NEXT:    v_minmax_f32 v0, v0, v1, v2
+; GISEL-GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; SDAG-GFX12-LABEL: test_maxmin_f32_ieee_true:
+; SDAG-GFX12:       ; %bb.0:
+; SDAG-GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; SDAG-GFX12-NEXT:    s_wait_expcnt 0x0
+; SDAG-GFX12-NEXT:    s_wait_samplecnt 0x0
+; SDAG-GFX12-NEXT:    s_wait_bvhcnt 0x0
+; SDAG-GFX12-NEXT:    s_wait_kmcnt 0x0
+; SDAG-GFX12-NEXT:    v_dual_max_num_f32 v1, v1, v1 :: v_dual_max_num_f32 v0, v0, v0
+; SDAG-GFX12-NEXT:    v_max_num_f32_e32 v2, v2, v2
+; SDAG-GFX12-NEXT:    v_minmax_num_f32 v0, v0, v1, v2
+; SDAG-GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-GFX12-LABEL: test_maxmin_f32_ieee_true:
+; GISEL-GFX12:       ; %bb.0:
+; GISEL-GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GISEL-GFX12-NEXT:    s_wait_expcnt 0x0
+; GISEL-GFX12-NEXT:    s_wait_samplecnt 0x0
+; GISEL-GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GISEL-GFX12-NEXT:    s_wait_kmcnt 0x0
+; GISEL-GFX12-NEXT:    v_dual_max_num_f32 v0, v0, v0 :: v_dual_max_num_f32 v1, v1, v1
+; GISEL-GFX12-NEXT:    v_max_num_f32_e32 v2, v2, v2
+; GISEL-GFX12-NEXT:    v_minmax_num_f32 v0, v0, v1, v2
+; GISEL-GFX12-NEXT:    s_setpc_b64 s[30:31]
   %min = call float @llvm.minnum.f32(float %a, float %b)
   %maxmin = call float @llvm.maxnum.f32(float %min, float %c)
   ret float %maxmin
@@ -253,6 +430,11 @@ define amdgpu_ps float @test_maxmin_commuted_f32_ieee_false(float %a, float %b,
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    v_minmax_f32 v0, v0, v1, v2
 ; GFX11-NEXT:    ; return to shader part epilog
+;
+; GFX12-LABEL: test_maxmin_commuted_f32_ieee_false:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    v_minmax_num_f32 v0, v0, v1, v2
+; GFX12-NEXT:    ; return to shader part epilog
   %min = call float @llvm.minnum.f32(float %a, float %b)
   %maxmin = call float @llvm.maxnum.f32(float %c, float %min)
   ret float %maxmin
@@ -265,6 +447,17 @@ define void @test_med3_f32(ptr addrspace(1) %arg, float %x, float %y, float %z)
 ; GFX11-NEXT:    v_med3_f32 v2, v2, v3, v4
 ; GFX11-NEXT:    global_store_b32 v[0:1], v2, off
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: test_med3_f32:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_med3_num_f32 v2, v2, v3, v4
+; GFX12-NEXT:    global_store_b32 v[0:1], v2, off
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
   %tmp0 = call float @llvm.minnum.f32(float %x, float %y)
   %tmp1 = call float @llvm.maxnum.f32(float %x, float %y)
   %tmp2 = call float @llvm.minnum.f32(float %tmp1, float %z)
@@ -278,29 +471,54 @@ define amdgpu_ps half @test_minmax_f16_ieee_false(half %a, half %b, half %c) {
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    v_maxmin_f16 v0, v0, v1, v2
 ; GFX11-NEXT:    ; return to shader part epilog
+;
+; GFX12-LABEL: test_minmax_f16_ieee_false:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    v_maxmin_num_f16 v0, v0, v1, v2
+; GFX12-NEXT:    ; return to shader part epilog
   %max = call half @llvm.maxnum.f16(half %a, half %b)
   %minmax = call half @llvm.minnum.f16(half %max, half %c)
   ret half %minmax
 }
 
 define amdgpu_ps void @s_test_minmax_f16_ieee_false(half inreg %a, half inreg %b, half inreg %c, ptr addrspace(1) inreg %out) {
-; SDAG-LABEL: s_test_minmax_f16_ieee_false:
-; SDAG:       ; %bb.0:
-; SDAG-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, 0
-; SDAG-NEXT:    s_mov_b32 s5, s4
-; SDAG-NEXT:    s_mov_b32 s4, s3
-; SDAG-NEXT:    v_maxmin_f16 v0, s0, s1, v0
-; SDAG-NEXT:    global_store_b16 v1, v0, s[4:5]
-; SDAG-NEXT:    s_endpgm
+; SDAG-GFX11-LABEL: s_test_minmax_f16_ieee_false:
+; SDAG-GFX11:       ; %bb.0:
+; SDAG-GFX11-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, 0
+; SDAG-GFX11-NEXT:    s_mov_b32 s5, s4
+; SDAG-GFX11-NEXT:    s_mov_b32 s4, s3
+; SDAG-GFX11-NEXT:    v_maxmin_f16 v0, s0, s1, v0
+; SDAG-GFX11-NEXT:    global_store_b16 v1, v0, s[4:5]
+; SDAG-GFX11-NEXT:    s_endpgm
 ;
-; GISEL-LABEL: s_test_minmax_f16_ieee_false:
-; GISEL:       ; %bb.0:
-; GISEL-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, 0
-; GISEL-NEXT:    s_mov_b32 s6, s3
-; GISEL-NEXT:    s_mov_b32 s7, s4
-; GISEL-NEXT:    v_maxmin_f16 v0, s0, s1, v0
-; GISEL-NEXT:    global_store_b16 v1, v0, s[6:7]
-; GISEL-NEXT:    s_endpgm
+; GISEL-GFX11-LABEL: s_test_minmax_f16_ieee_false:
+; GISEL-GFX11:       ; %bb.0:
+; GISEL-GFX11-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, 0
+; GISEL-GFX11-NEXT:    s_mov_b32 s6, s3
+; GISEL-GFX11-NEXT:    s_mov_b32 s7, s4
+; GISEL-GFX11-NEXT:    v_maxmin_f16 v0, s0, s1, v0
+; GISEL-GFX11-NEXT:    global_store_b16 v1, v0, s[6:7]
+; GISEL-GFX11-NEXT:    s_endpgm
+;
+; SDAG-GFX12-LABEL: s_test_minmax_f16_ieee_false:
+; SDAG-GFX12:       ; %bb.0:
+; SDAG-GFX12-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, 0
+; SDAG-GFX12-NEXT:    s_mov_b32 s5, s4
+; SDAG-GFX12-NEXT:    s_mov_b32 s4, s3
+; SDAG-GFX12-NEXT:    v_maxmin_num_f16 v0, s0, s1, v0
+; SDAG-GFX12-NEXT:    global_store_b16 v1, v0, s[4:5]
+; SDAG-GFX12-NEXT:    s_endpgm
+;
+; GISEL-GFX12-LABEL: s_test_minmax_f16_ieee_false:
+; GISEL-GFX12:       ; %bb.0:
+; GISEL-GFX12-NEXT:    s_max_num_f16 s0, s0, s1
+; GISEL-GFX12-NEXT:    s_mov_b32 s6, s3
+; GISEL-GFX12-NEXT:    s_mov_b32 s7, s4
+; GISEL-GFX12-NEXT:    v_mov_b32_e32 v1, 0
+; GISEL-GFX12-NEXT:    s_min_num_f16 s0, s0, s2
+; GISEL-GFX12-NEXT:    v_mov_b32_e32 v0, s0
+; GISEL-GFX12-NEXT:    global_store_b16 v1, v0, s[6:7]
+; GISEL-GFX12-NEXT:    s_endpgm
   %smax = call half @llvm.maxnum.f16(half %a, half %b)
   %sminmax = call half @llvm.minnum.f16(half %smax, half %c)
   store half %sminmax, ptr addrspace(1) %out
@@ -308,23 +526,49 @@ define amdgpu_ps void @s_test_minmax_f16_ieee_false(half inreg %a, half inreg %b
 }
 
 define half @test_minmax_commuted_f16_ieee_true(half %a, half %b, half %c) {
-; SDAG-LABEL: test_minmax_commuted_f16_ieee_true:
-; SDAG:       ; %bb.0:
-; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_max_f16_e32 v1, v1, v1
-; SDAG-NEXT:    v_max_f16_e32 v0, v0, v0
-; SDAG-NEXT:    v_max_f16_e32 v2, v2, v2
-; SDAG-NEXT:    v_maxmin_f16 v0, v0, v1, v2
-; SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: test_minmax_commuted_f16_ieee_true:
-; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    v_max_f16_e32 v0, v0, v0
-; GISEL-NEXT:    v_max_f16_e32 v1, v1, v1
-; GISEL-NEXT:    v_max_f16_e32 v2, v2, v2
-; GISEL-NEXT:    v_maxmin_f16 v0, v0, v1, v2
-; GISEL-NEXT:    s_setpc_b64 s[30:31]
+; SDAG-GFX11-LABEL: test_minmax_commuted_f16_ieee_true:
+; SDAG-GFX11:       ; %bb.0:
+; SDAG-GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-GFX11-NEXT:    v_max_f16_e32 v1, v1, v1
+; SDAG-GFX11-NEXT:    v_max_f16_e32 v0, v0, v0
+; SDAG-GFX11-NEXT:    v_max_f16_e32 v2, v2, v2
+; SDAG-GFX11-NEXT:    v_maxmin_f16 v0, v0, v1, v2
+; SDAG-GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-GFX11-LABEL: test_minmax_commuted_f16_ieee_true:
+; GISEL-GFX11:       ; %bb.0:
+; GISEL-GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-GFX11-NEXT:    v_max_f16_e32 v0, v0, v0
+; GISEL-GFX11-NEXT:    v_max_f16_e32 v1, v1, v1
+; GISEL-GFX11-NEXT:    v_max_f16_e32 v2, v2, v2
+; GISEL-GFX11-NEXT:    v_maxmin_f16 v0, v0, v1, v2
+; GISEL-GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; SDAG-GFX12-LABEL: test_minmax_commuted_f16_ieee_true:
+; SDAG-GFX12:       ; %bb.0:
+; SDAG-GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; SDAG-GFX12-NEXT:    s_wait_expcnt 0x0
+; SDAG-GFX12-NEXT:    s_wait_samplecnt 0x0
+; SDAG-GFX12-NEXT:    s_wait_bvhcnt 0x0
+; SDAG-GFX12-NEXT:    s_wait_kmcnt 0x0
+; SDAG-GFX12-NEXT:    v_max_num_f16_e32 v1, v1, v1
+; SDAG-GFX12-NEXT:    v_max_num_f16_e32 v0, v0, v0
+; SDAG-GFX12-NEXT:    v_max_num_f16_e32 v2, v2, v2
+; SDAG-GFX12-NEXT:    v_maxmin_num_f16 v0, v0, v1, v2
+; SDAG-GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-GFX12-LABEL: test_minmax_commuted_f16_ieee_true:
+; GISEL-GFX12:       ; %bb.0:
+; GISEL-GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GISEL-GFX12-NEXT:    s_wait_expcnt 0x0
+; GISEL-GFX12-NEXT:    s_wait_samplecnt 0x0
+; GISEL-GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GISEL-GFX12-NEXT:    s_wait_kmcnt 0x0
+; GISEL-GFX12-NEXT:    v_max_num_f16_e32 v0, v0, v0
+; GISEL-GFX12-NEXT:    v_max_num_f16_e32 v1, v1, v1
+; GISEL-GFX12-NEXT:    v_max_num_f16_e32 v2, v2, v2
+; GISEL-GFX12-NEXT:    v_maxmin_num_f16 v0, v0, v1, v2
+; GISEL-GFX12-NEXT:    s_setpc_b64 s[30:31]
   %max = call half @llvm.maxnum.f16(half %a, half %b)
   %minmax = call half @llvm.minnum.f16(half %c, half %max)
   ret half %minmax
@@ -335,29 +579,60 @@ define amdgpu_ps half @test_maxmin_f16_ieee_false(half %a, half %b, half %c) {
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    v_minmax_f16 v0, v0, v1, v2
 ; GFX11-NEXT:    ; return to shader part epilog
+;
+; GFX12-LABEL: test_maxmin_f16_ieee_false:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    v_minmax_num_f16 v0, v0, v1, v2
+; GFX12-NEXT:    ; return to shader part epilog
   %min = call half @llvm.minnum.f16(half %a, half %b)
   %maxmin = call half @llvm.maxnum.f16(half %min, half %c)
   ret half %maxmin
 }
 
 define half @test_maxmin_commuted_f16_ieee_true(half %a, half %b, half %c) {
-; SDAG-LABEL: test_maxmin_commuted_f16_ieee_true:
-; SDAG:       ; %bb.0:
-; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_max_f16_e32 v1, v1, v1
-; SDAG-NEXT:    v_max_f16_e32 v0, v0, v0
-; SDAG-NEXT:    v_max_f16_e32 v2, v2, v2
-; SDAG-NEXT:    v_minmax_f16 v0, v0, v1, v2
-; SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: test_maxmin_commuted_f16_ieee_true:
-; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    v_max_f16_e32 v0, v0, v0
-; GISEL-NEXT:    v_max_f16_e32 v1, v1, v1
-; GISEL-NEXT:    v_max_f16_e32 v2, v2, v2
-; GISEL-NEXT:    v_minmax_f16 v0, v0, v1, v2
-; GISEL-NEXT:    s_setpc_b64 s[30:31]
+; SDAG-GFX11-LABEL: test_maxmin_commuted_f16_ieee_true:
+; SDAG-GFX11:       ; %bb.0:
+; SDAG-GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-GFX11-NEXT:    v_max_f16_e32 v1, v1, v1
+; SDAG-GFX11-NEXT:    v_max_f16_e32 v0, v0, v0
+; SDAG-GFX11-NEXT:    v_max_f16_e32 v2, v2, v2
+; SDAG-GFX11-NEXT:    v_minmax_f16 v0, v0, v1, v2
+; SDAG-GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-GFX11-LABEL: test_maxmin_commuted_f16_ieee_true:
+; GISEL-GFX11:       ; %bb.0:
+; GISEL-GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-GFX11-NEXT:    v_max_f16_e32 v0, v0, v0
+; GISEL-GFX11-NEXT:    v_max_f16_e32 v1, v1, v1
+; GISEL-GFX11-NEXT:    v_max_f16_e32 v2, v2, v2
+; GISEL-GFX11-NEXT:    v_minmax_f16 v0, v0, v1, v2
+; GISEL-GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; SDAG-GFX12-LABEL: test_maxmin_commuted_f16_ieee_true:
+; SDAG-GFX12:       ; %bb.0:
+; SDAG-GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; SDAG-GFX12-NEXT:    s_wait_expcnt 0x0
+; SDAG-GFX12-NEXT:    s_wait_samplecnt 0x0
+; SDAG-GFX12-NEXT:    s_wait_bvhcnt 0x0
+; SDAG-GFX12-NEXT:    s_wait_kmcnt 0x0
+; SDAG-GFX12-NEXT:    v_max_num_f16_e32 v1, v1, v1
+; SDAG-GFX12-NEXT:    v_max_num_f16_e32 v0, v0, v0
+; SDAG-GFX12-NEXT:    v_max_num_f16_e32 v2, v2, v2
+; SDAG-GFX12-NEXT:    v_minmax_num_f16 v0, v0, v1, v2
+; SDAG-GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-GFX12-LABEL: test_maxmin_commuted_f16_ieee_true:
+; GISEL-GFX12:       ; %bb.0:
+; GISEL-GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GISEL-GFX12-NEXT:    s_wait_expcnt 0x0
+; GISEL-GFX12-NEXT:    s_wait_samplecnt 0x0
+; GISEL-GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GISEL-GFX12-NEXT:    s_wait_kmcnt 0x0
+; GISEL-GFX12-NEXT:    v_max_num_f16_e32 v0, v0, v0
+; GISEL-GFX12-NEXT:    v_max_num_f16_e32 v1, v1, v1
+; GISEL-GFX12-NEXT:    v_max_num_f16_e32 v2, v2, v2
+; GISEL-GFX12-NEXT:    v_minmax_num_f16 v0, v0, v1, v2
+; GISEL-GFX12-NEXT:    s_setpc_b64 s[30:31]
   %min = call half @llvm.minnum.f16(half %a, half %b)
   %maxmin = call half @llvm.maxnum.f16(half %c, half %min)
   ret half %maxmin
@@ -370,6 +645,17 @@ define void @test_med3_f16(ptr addrspace(1) %arg, half %x, half %y, half %z) #0
 ; GFX11-NEXT:    v_med3_f16 v2, v2, v3, v4
 ; GFX11-NEXT:    global_store_b16 v[0:1], v2, off
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: test_med3_f16:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_med3_num_f16 v2, v2, v3, v4
+; GFX12-NEXT:    global_store_b16 v[0:1], v2, off
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
   %tmp0 = call half @llvm.minnum.f16(half %x, half %y)
   %tmp1 = call half @llvm.maxnum.f16(half %x, half %y)
   %tmp2 = call half @llvm.minnum.f16(half %tmp1, half %z)
diff --git a/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-mov.ll b/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-mov.ll
new file mode 100644
index 0000000..a9b8663
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-mov.ll
@@ -0,0 +1,120 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx942 %s -o - | FileCheck %s --check-prefixes=GFX942
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx908 %s -o - | FileCheck %s --check-prefixes=GFX908
+
+define amdgpu_kernel void @matmul_kernel(i32 %a0, i32 %a1) {
+; GFX942-LABEL: matmul_kernel:
+; GFX942:       ; %bb.0: ; %entry
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-NEXT:    v_mov_b32_e32 v1, 0
+; GFX942-NEXT:    s_mov_b32 s2, 0
+; GFX942-NEXT:    v_accvgpr_write_b32 a0, v1
+; GFX942-NEXT:    s_mov_b32 s3, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_cmp_lg_u32 s0, 0
+; GFX942-NEXT:    s_cselect_b64 s[0:1], -1, 0
+; GFX942-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; GFX942-NEXT:    v_cmp_ne_u32_e64 s[0:1], 1, v0
+; GFX942-NEXT:    s_branch .LBB0_2
+; GFX942-NEXT:  .LBB0_1: ; %bb2
+; GFX942-NEXT:    ; in Loop: Header=BB0_2 Depth=1
+; GFX942-NEXT:    s_or_b32 s4, s3, 1
+; GFX942-NEXT:    s_ashr_i32 s5, s3, 31
+; GFX942-NEXT:    s_mov_b32 s3, s2
+; GFX942-NEXT:    v_mov_b64_e32 v[4:5], s[2:3]
+; GFX942-NEXT:    v_accvgpr_read_b32 v0, a0
+; GFX942-NEXT:    v_mov_b32_e32 v2, v1
+; GFX942-NEXT:    v_mov_b32_e32 v3, v1
+; GFX942-NEXT:    v_accvgpr_write_b32 a0, v0
+; GFX942-NEXT:    v_accvgpr_write_b32 a1, v1
+; GFX942-NEXT:    v_accvgpr_write_b32 a2, v2
+; GFX942-NEXT:    v_accvgpr_write_b32 a3, v3
+; GFX942-NEXT:    s_and_b32 s3, s5, s4
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_mfma_f32_16x16x16_f16 a[0:3], v[4:5], v[4:5], a[0:3]
+; GFX942-NEXT:    s_cbranch_execz .LBB0_4
+; GFX942-NEXT:  .LBB0_2: ; %bb
+; GFX942-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX942-NEXT:    s_and_b64 vcc, exec, s[0:1]
+; GFX942-NEXT:    s_cbranch_vccz .LBB0_1
+; GFX942-NEXT:  ; %bb.3:
+; GFX942-NEXT:    ; implicit-def: $sgpr3
+; GFX942-NEXT:  .LBB0_4: ; %common.ret
+; GFX942-NEXT:    s_endpgm
+;
+; GFX908-LABEL: matmul_kernel:
+; GFX908:       ; %bb.0: ; %entry
+; GFX908-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX908-NEXT:    v_mov_b32_e32 v1, 0
+; GFX908-NEXT:    s_mov_b32 s2, 0
+; GFX908-NEXT:    s_mov_b32 s3, 0
+; GFX908-NEXT:    v_accvgpr_write_b32 a0, v1
+; GFX908-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX908-NEXT:    s_cmp_lg_u32 s0, 0
+; GFX908-NEXT:    s_cselect_b64 s[0:1], -1, 0
+; GFX908-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; GFX908-NEXT:    v_cmp_ne_u32_e64 s[0:1], 1, v0
+; GFX908-NEXT:    s_branch .LBB0_2
+; GFX908-NEXT:  .LBB0_1: ; %bb2
+; GFX908-NEXT:    ; in Loop: Header=BB0_2 Depth=1
+; GFX908-NEXT:    s_or_b32 s4, s3, 1
+; GFX908-NEXT:    s_ashr_i32 s5, s3, 31
+; GFX908-NEXT:    s_mov_b32 s3, s2
+; GFX908-NEXT:    s_nop 3
+; GFX908-NEXT:    v_accvgpr_read_b32 v0, a0
+; GFX908-NEXT:    v_mov_b32_e32 v5, s3
+; GFX908-NEXT:    v_mov_b32_e32 v4, s2
+; GFX908-NEXT:    v_mov_b32_e32 v2, v1
+; GFX908-NEXT:    v_mov_b32_e32 v3, v1
+; GFX908-NEXT:    v_accvgpr_write_b32 a0, v0
+; GFX908-NEXT:    v_accvgpr_write_b32 a1, v1
+; GFX908-NEXT:    v_accvgpr_write_b32 a2, v2
+; GFX908-NEXT:    v_accvgpr_write_b32 a3, v3
+; GFX908-NEXT:    s_and_b32 s3, s5, s4
+; GFX908-NEXT:    v_mfma_f32_16x16x16f16 a[0:3], v[4:5], v[4:5], a[0:3]
+; GFX908-NEXT:    s_cbranch_execz .LBB0_4
+; GFX908-NEXT:  .LBB0_2: ; %bb
+; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX908-NEXT:    s_and_b64 vcc, exec, s[0:1]
+; GFX908-NEXT:    s_cbranch_vccz .LBB0_1
+; GFX908-NEXT:  ; %bb.3:
+; GFX908-NEXT:    ; implicit-def: $sgpr3
+; GFX908-NEXT:  .LBB0_4: ; %common.ret
+; GFX908-NEXT:    s_endpgm
+entry:
+  br label %bb
+
+bb:
+  %i = phi { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } [ %i10, %bb2 ], [ zeroinitializer, %entry ]
+  %i1 = phi i32 [ %i5, %bb2 ], [ 0, %entry ]
+  %c0 = icmp ne i32 %a0, 0
+  br i1 %c0, label %bb2, label %bb11
+
+bb2:
+  %i3 = or i32 %i1, 1
+  %i4 = icmp slt i32 %i1, 0
+  %i5 = select i1 %i4, i32 %i3, i32 0
+  %i6 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %i, 123
+  %i7 = insertelement <4 x float> zeroinitializer, float %i6, i32 0
+  %i8 = call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> zeroinitializer, <4 x half> zeroinitializer, <4 x float> %i7, i32 0, i32 0, i32 0)
+  %i9 = extractelement <4 x float> %i8, i32 0
+  %i10 = insertvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } zeroinitializer, float %i9, 123
+  br label %bb
+
+bb11:
+  %c1 = icmp ne i32 %a1, 0
+  br i1 %c1, label %bb12, label %common.ret
+
+common.ret:
+  ret void
+
+bb12:
+  %i13 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %i, 0
+  %i14 = insertelement <4 x float> zeroinitializer, float %i13, i32 0
+  %i15 = insertelement <4 x float> %i14, float 0.000000e+00, i32 0
+  %i16 = insertelement <4 x float> %i15, float 0.000000e+00, i32 0
+  br label %common.ret
+}
+
+; Function Attrs: convergent nocallback nofree nosync nounwind willreturn memory(none)
+declare <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half>, <4 x half>, <4 x float>, i32 immarg, i32 immarg, i32 immarg)
diff --git a/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-mov.mir b/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-mov.mir
new file mode 100644
index 0000000..5c83170
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-mov.mir
@@ -0,0 +1,235 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+# RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx942 -run-pass si-fold-operands %s -o - | FileCheck %s
+# RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx942 -start-before=si-fold-operands -stop-after=register-coalescer %s -o - | FileCheck %s --check-prefixes=COALESCE
+# RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx908 -start-before=si-fold-operands -stop-after=register-coalescer %s -o - | FileCheck %s --check-prefixes=GFX908-COALESCE
+
+...
+---
+name:            test
+tracksRegLiveness: true
+body:             |
+  ; CHECK-LABEL: name: test
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   successors: %bb.1(0x80000000)
+  ; CHECK-NEXT:   liveins: $sgpr4_sgpr5
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5
+  ; CHECK-NEXT:   [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]](p4), 0, 0 :: (dereferenceable invariant load (s32), align 16, addrspace 4)
+  ; CHECK-NEXT:   S_BITCMP1_B32 killed [[S_LOAD_DWORD_IMM]], 0, implicit-def $scc
+  ; CHECK-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sgpr_32 = S_MOV_B32 0
+  ; CHECK-NEXT:   [[S_CSELECT_B64_:%[0-9]+]]:sreg_64_xexec = S_CSELECT_B64 -1, 0, implicit $scc
+  ; CHECK-NEXT:   [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:agpr_32 = COPY [[V_MOV_B32_e32_]]
+  ; CHECK-NEXT:   [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, [[S_CSELECT_B64_]], implicit $exec
+  ; CHECK-NEXT:   [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_NE_U32_e64 [[V_CNDMASK_B32_e64_]], 1, implicit $exec
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   successors: %bb.2(0x40000000), %bb.3(0x40000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[PHI:%[0-9]+]]:agpr_32 = PHI [[COPY1]], %bb.0, %24, %bb.3
+  ; CHECK-NEXT:   [[PHI1:%[0-9]+]]:sreg_32 = PHI [[S_MOV_B32_]], %bb.0, %11, %bb.3
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[PHI]]
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:agpr_32 = COPY [[PHI]]
+  ; CHECK-NEXT:   [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 -1
+  ; CHECK-NEXT:   $vcc = S_AND_B64 $exec, [[V_CMP_NE_U32_e64_]], implicit-def $scc
+  ; CHECK-NEXT:   S_CBRANCH_VCCNZ %bb.3, implicit $vcc
+  ; CHECK-NEXT:   S_BRANCH %bb.2
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.2:
+  ; CHECK-NEXT:   successors: %bb.3(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[S_OR_B32_:%[0-9]+]]:sreg_32 = S_OR_B32 [[PHI1]], 1, implicit-def dead $scc
+  ; CHECK-NEXT:   [[S_ASHR_I32_:%[0-9]+]]:sreg_32 = S_ASHR_I32 [[PHI1]], 31, implicit-def dead $scc
+  ; CHECK-NEXT:   [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 killed [[S_ASHR_I32_]], killed [[S_OR_B32_]], implicit-def dead $scc
+  ; CHECK-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:vreg_128_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[V_MOV_B32_e32_]], %subreg.sub1, [[V_MOV_B32_e32_]], %subreg.sub2, [[V_MOV_B32_e32_]], %subreg.sub3
+  ; CHECK-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_]], %subreg.sub1
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]]
+  ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:areg_128_align2 = COPY [[REG_SEQUENCE]]
+  ; CHECK-NEXT:   [[V_MFMA_F32_16X16X16F16_e64_:%[0-9]+]]:areg_128_align2 = V_MFMA_F32_16X16X16F16_e64 [[COPY4]], [[COPY4]], killed [[COPY5]], 0, 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[S_MOV_B64_1:%[0-9]+]]:sreg_64 = S_MOV_B64 0
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.3:
+  ; CHECK-NEXT:   successors: %bb.4(0x40000000), %bb.1(0x40000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[PHI2:%[0-9]+]]:sreg_32 = PHI [[DEF]], %bb.1, [[S_AND_B32_]], %bb.2
+  ; CHECK-NEXT:   [[PHI3:%[0-9]+]]:agpr_32 = PHI [[COPY3]], %bb.1, [[V_MFMA_F32_16X16X16F16_e64_]].sub0, %bb.2
+  ; CHECK-NEXT:   [[PHI4:%[0-9]+]]:sreg_64_xexec = PHI [[S_MOV_B64_]], %bb.1, [[S_MOV_B64_1]], %bb.2
+  ; CHECK-NEXT:   [[V_CNDMASK_B32_e64_1:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, [[PHI4]], implicit $exec
+  ; CHECK-NEXT:   [[V_CMP_NE_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_NE_U32_e64 [[V_CNDMASK_B32_e64_1]], 1, implicit $exec
+  ; CHECK-NEXT:   $vcc = S_AND_B64 $exec, [[V_CMP_NE_U32_e64_1]], implicit-def $scc
+  ; CHECK-NEXT:   S_CBRANCH_VCCNZ %bb.1, implicit $vcc
+  ; CHECK-NEXT:   S_BRANCH %bb.4
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.4:
+  ; CHECK-NEXT:   successors: %bb.5(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.5:
+  ; CHECK-NEXT:   S_ENDPGM 0
+  ;
+  ; COALESCE-LABEL: name: test
+  ; COALESCE: bb.0:
+  ; COALESCE-NEXT:   successors: %bb.1(0x80000000)
+  ; COALESCE-NEXT:   liveins: $sgpr4_sgpr5
+  ; COALESCE-NEXT: {{  $}}
+  ; COALESCE-NEXT:   [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5
+  ; COALESCE-NEXT:   [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]](p4), 0, 0 :: (dereferenceable invariant load (s32), align 16, addrspace 4)
+  ; COALESCE-NEXT:   S_BITCMP1_B32 [[S_LOAD_DWORD_IMM]], 0, implicit-def $scc
+  ; COALESCE-NEXT:   undef [[S_MOV_B32_:%[0-9]+]].sub0:sgpr_64 = S_MOV_B32 0
+  ; COALESCE-NEXT:   [[S_CSELECT_B64_:%[0-9]+]]:sreg_64_xexec = S_CSELECT_B64 -1, 0, implicit killed $scc
+  ; COALESCE-NEXT:   undef [[V_MOV_B32_e32_:%[0-9]+]].sub1:vreg_128_align2 = V_MOV_B32_e32 0, implicit $exec
+  ; COALESCE-NEXT:   undef [[V_ACCVGPR_WRITE_B32_e64_:%[0-9]+]].sub0:areg_128_align2 = V_ACCVGPR_WRITE_B32_e64 0, implicit $exec
+  ; COALESCE-NEXT:   [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, [[S_CSELECT_B64_]], implicit $exec
+  ; COALESCE-NEXT:   [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_NE_U32_e64 1, [[V_CNDMASK_B32_e64_]], implicit $exec
+  ; COALESCE-NEXT:   [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+  ; COALESCE-NEXT: {{  $}}
+  ; COALESCE-NEXT: bb.1:
+  ; COALESCE-NEXT:   successors: %bb.2(0x40000000), %bb.3(0x40000000)
+  ; COALESCE-NEXT: {{  $}}
+  ; COALESCE-NEXT:   [[V_MOV_B32_e32_:%[0-9]+]].sub0:vreg_128_align2 = COPY [[V_ACCVGPR_WRITE_B32_e64_]].sub0
+  ; COALESCE-NEXT:   [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 -1
+  ; COALESCE-NEXT:   $vcc = S_AND_B64 $exec, [[V_CMP_NE_U32_e64_]], implicit-def dead $scc
+  ; COALESCE-NEXT:   S_CBRANCH_VCCNZ %bb.3, implicit killed $vcc
+  ; COALESCE-NEXT:   S_BRANCH %bb.2
+  ; COALESCE-NEXT: {{  $}}
+  ; COALESCE-NEXT: bb.2:
+  ; COALESCE-NEXT:   successors: %bb.3(0x80000000)
+  ; COALESCE-NEXT: {{  $}}
+  ; COALESCE-NEXT:   [[S_OR_B32_:%[0-9]+]]:sreg_32 = S_OR_B32 [[S_MOV_B32_1]], 1, implicit-def dead $scc
+  ; COALESCE-NEXT:   [[S_ASHR_I32_:%[0-9]+]]:sreg_32 = S_ASHR_I32 [[S_MOV_B32_1]], 31, implicit-def dead $scc
+  ; COALESCE-NEXT:   [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_AND_B32 [[S_ASHR_I32_]], [[S_OR_B32_]], implicit-def dead $scc
+  ; COALESCE-NEXT:   [[V_MOV_B32_e32_:%[0-9]+]].sub2:vreg_128_align2 = COPY [[V_MOV_B32_e32_]].sub1
+  ; COALESCE-NEXT:   [[V_MOV_B32_e32_:%[0-9]+]].sub3:vreg_128_align2 = COPY [[V_MOV_B32_e32_]].sub1
+  ; COALESCE-NEXT:   [[S_MOV_B32_:%[0-9]+]].sub1:sgpr_64 = COPY [[S_MOV_B32_]].sub0
+  ; COALESCE-NEXT:   [[COPY1:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B32_]]
+  ; COALESCE-NEXT:   [[COPY2:%[0-9]+]]:areg_128_align2 = COPY [[V_MOV_B32_e32_]]
+  ; COALESCE-NEXT:   [[V_ACCVGPR_WRITE_B32_e64_:%[0-9]+]]:areg_128_align2 = V_MFMA_F32_16X16X16F16_e64 [[COPY1]], [[COPY1]], [[COPY2]], 0, 0, 0, implicit $mode, implicit $exec
+  ; COALESCE-NEXT:   [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 0
+  ; COALESCE-NEXT: {{  $}}
+  ; COALESCE-NEXT: bb.3:
+  ; COALESCE-NEXT:   successors: %bb.4(0x40000000), %bb.1(0x40000000)
+  ; COALESCE-NEXT: {{  $}}
+  ; COALESCE-NEXT:   [[V_CNDMASK_B32_e64_1:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, [[S_MOV_B64_]], implicit $exec
+  ; COALESCE-NEXT:   [[V_CMP_NE_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_NE_U32_e64 1, [[V_CNDMASK_B32_e64_1]], implicit $exec
+  ; COALESCE-NEXT:   $vcc = S_AND_B64 $exec, [[V_CMP_NE_U32_e64_1]], implicit-def dead $scc
+  ; COALESCE-NEXT:   S_CBRANCH_VCCNZ %bb.1, implicit killed $vcc
+  ; COALESCE-NEXT:   S_BRANCH %bb.4
+  ; COALESCE-NEXT: {{  $}}
+  ; COALESCE-NEXT: bb.4:
+  ; COALESCE-NEXT:   successors: %bb.5(0x80000000)
+  ; COALESCE-NEXT: {{  $}}
+  ; COALESCE-NEXT: bb.5:
+  ; COALESCE-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX908-COALESCE-LABEL: name: test
+  ; GFX908-COALESCE: bb.0:
+  ; GFX908-COALESCE-NEXT:   successors: %bb.1(0x80000000)
+  ; GFX908-COALESCE-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX908-COALESCE-NEXT: {{  $}}
+  ; GFX908-COALESCE-NEXT:   [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5
+  ; GFX908-COALESCE-NEXT:   [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]](p4), 0, 0 :: (dereferenceable invariant load (s32), align 16, addrspace 4)
+  ; GFX908-COALESCE-NEXT:   S_BITCMP1_B32 [[S_LOAD_DWORD_IMM]], 0, implicit-def $scc
+  ; GFX908-COALESCE-NEXT:   undef [[S_MOV_B32_:%[0-9]+]].sub0:sgpr_64 = S_MOV_B32 0
+  ; GFX908-COALESCE-NEXT:   [[S_CSELECT_B64_:%[0-9]+]]:sreg_64_xexec = S_CSELECT_B64 -1, 0, implicit killed $scc
+  ; GFX908-COALESCE-NEXT:   undef [[V_MOV_B32_e32_:%[0-9]+]].sub1:vreg_128_align2 = V_MOV_B32_e32 0, implicit $exec
+  ; GFX908-COALESCE-NEXT:   undef [[V_ACCVGPR_WRITE_B32_e64_:%[0-9]+]].sub0:areg_128_align2 = V_ACCVGPR_WRITE_B32_e64 0, implicit $exec
+  ; GFX908-COALESCE-NEXT:   [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, [[S_CSELECT_B64_]], implicit $exec
+  ; GFX908-COALESCE-NEXT:   [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_NE_U32_e64 1, [[V_CNDMASK_B32_e64_]], implicit $exec
+  ; GFX908-COALESCE-NEXT:   [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+  ; GFX908-COALESCE-NEXT: {{  $}}
+  ; GFX908-COALESCE-NEXT: bb.1:
+  ; GFX908-COALESCE-NEXT:   successors: %bb.2(0x40000000), %bb.3(0x40000000)
+  ; GFX908-COALESCE-NEXT: {{  $}}
+  ; GFX908-COALESCE-NEXT:   [[V_MOV_B32_e32_:%[0-9]+]].sub0:vreg_128_align2 = COPY [[V_ACCVGPR_WRITE_B32_e64_]].sub0
+  ; GFX908-COALESCE-NEXT:   [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 -1
+  ; GFX908-COALESCE-NEXT:   $vcc = S_AND_B64 $exec, [[V_CMP_NE_U32_e64_]], implicit-def dead $scc
+  ; GFX908-COALESCE-NEXT:   S_CBRANCH_VCCNZ %bb.3, implicit killed $vcc
+  ; GFX908-COALESCE-NEXT:   S_BRANCH %bb.2
+  ; GFX908-COALESCE-NEXT: {{  $}}
+  ; GFX908-COALESCE-NEXT: bb.2:
+  ; GFX908-COALESCE-NEXT:   successors: %bb.3(0x80000000)
+  ; GFX908-COALESCE-NEXT: {{  $}}
+  ; GFX908-COALESCE-NEXT:   [[S_OR_B32_:%[0-9]+]]:sreg_32 = S_OR_B32 [[S_MOV_B32_1]], 1, implicit-def dead $scc
+  ; GFX908-COALESCE-NEXT:   [[S_ASHR_I32_:%[0-9]+]]:sreg_32 = S_ASHR_I32 [[S_MOV_B32_1]], 31, implicit-def dead $scc
+  ; GFX908-COALESCE-NEXT:   [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_AND_B32 [[S_ASHR_I32_]], [[S_OR_B32_]], implicit-def dead $scc
+  ; GFX908-COALESCE-NEXT:   [[V_MOV_B32_e32_:%[0-9]+]].sub2:vreg_128_align2 = COPY [[V_MOV_B32_e32_]].sub1
+  ; GFX908-COALESCE-NEXT:   [[V_MOV_B32_e32_:%[0-9]+]].sub3:vreg_128_align2 = COPY [[V_MOV_B32_e32_]].sub1
+  ; GFX908-COALESCE-NEXT:   [[S_MOV_B32_:%[0-9]+]].sub1:sgpr_64 = COPY [[S_MOV_B32_]].sub0
+  ; GFX908-COALESCE-NEXT:   [[COPY1:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B32_]]
+  ; GFX908-COALESCE-NEXT:   [[COPY2:%[0-9]+]]:areg_128_align2 = COPY [[V_MOV_B32_e32_]]
+  ; GFX908-COALESCE-NEXT:   [[V_ACCVGPR_WRITE_B32_e64_:%[0-9]+]]:areg_128_align2 = V_MFMA_F32_16X16X16F16_e64 [[COPY1]], [[COPY1]], [[COPY2]], 0, 0, 0, implicit $mode, implicit $exec
+  ; GFX908-COALESCE-NEXT:   [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 0
+  ; GFX908-COALESCE-NEXT: {{  $}}
+  ; GFX908-COALESCE-NEXT: bb.3:
+  ; GFX908-COALESCE-NEXT:   successors: %bb.4(0x40000000), %bb.1(0x40000000)
+  ; GFX908-COALESCE-NEXT: {{  $}}
+  ; GFX908-COALESCE-NEXT:   [[V_CNDMASK_B32_e64_1:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, [[S_MOV_B64_]], implicit $exec
+  ; GFX908-COALESCE-NEXT:   [[V_CMP_NE_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_NE_U32_e64 1, [[V_CNDMASK_B32_e64_1]], implicit $exec
+  ; GFX908-COALESCE-NEXT:   $vcc = S_AND_B64 $exec, [[V_CMP_NE_U32_e64_1]], implicit-def dead $scc
+  ; GFX908-COALESCE-NEXT:   S_CBRANCH_VCCNZ %bb.1, implicit killed $vcc
+  ; GFX908-COALESCE-NEXT:   S_BRANCH %bb.4
+  ; GFX908-COALESCE-NEXT: {{  $}}
+  ; GFX908-COALESCE-NEXT: bb.4:
+  ; GFX908-COALESCE-NEXT:   successors: %bb.5(0x80000000)
+  ; GFX908-COALESCE-NEXT: {{  $}}
+  ; GFX908-COALESCE-NEXT: bb.5:
+  ; GFX908-COALESCE-NEXT:   S_ENDPGM 0
+  bb.0:
+    successors: %bb.1
+    liveins: $sgpr4_sgpr5
+
+    %0:sgpr_64(p4) = COPY $sgpr4_sgpr5
+    %1:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %0(p4), 0, 0 :: (dereferenceable invariant load (s32), align 16, addrspace 4)
+    S_BITCMP1_B32 killed %1, 0, implicit-def $scc
+    %2:sgpr_32 = S_MOV_B32 0
+    %3:sreg_64_xexec = S_CSELECT_B64 -1, 0, implicit $scc
+    %4:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %5:sreg_32 = IMPLICIT_DEF
+    %6:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, %3, implicit $exec
+    %7:sreg_64_xexec = V_CMP_NE_U32_e64 %6, 1, implicit $exec
+
+  bb.1:
+    successors: %bb.2, %bb.3
+
+    %8:vgpr_32 = PHI %4, %bb.0, %9, %bb.3
+    %10:sreg_32 = PHI %2, %bb.0, %11, %bb.3
+    %12:agpr_32 = COPY %8
+    %13:sreg_64 = S_MOV_B64 -1
+    $vcc = S_AND_B64 $exec, %7, implicit-def $scc
+    S_CBRANCH_VCCNZ %bb.3, implicit $vcc
+    S_BRANCH %bb.2
+
+  bb.2:
+    successors: %bb.3
+
+    %14:sreg_32 = S_OR_B32 %10, 1, implicit-def dead $scc
+    %15:sreg_32 = S_ASHR_I32 %10, 31, implicit-def dead $scc
+    %16:sreg_32 = S_AND_B32 killed %15, killed %14, implicit-def dead $scc
+    %17:vreg_128_align2 = REG_SEQUENCE %8, %subreg.sub0, %4, %subreg.sub1, %4, %subreg.sub2, %4, %subreg.sub3
+    %18:sreg_64 = REG_SEQUENCE %2, %subreg.sub0, %2, %subreg.sub1
+    %19:vreg_64_align2 = COPY %18
+    %20:areg_128_align2 = COPY %17
+    %21:areg_128_align2 = V_MFMA_F32_16X16X16F16_e64 %19, %19, killed %20, 0, 0, 0, implicit $mode, implicit $exec
+    %22:vgpr_32 = COPY %21.sub0
+    %23:sreg_64 = S_MOV_B64 0
+
+  bb.3:
+    successors: %bb.4, %bb.1
+
+    %11:sreg_32 = PHI %5, %bb.1, %16, %bb.2
+    %24:agpr_32 = PHI %12, %bb.1, %21.sub0, %bb.2
+    %25:sreg_64_xexec = PHI %13, %bb.1, %23, %bb.2
+    %9:vgpr_32 = COPY %24
+    %26:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, %25, implicit $exec
+    %27:sreg_64_xexec = V_CMP_NE_U32_e64 %26, 1, implicit $exec
+    $vcc = S_AND_B64 $exec, %27, implicit-def $scc
+    S_CBRANCH_VCCNZ %bb.1, implicit $vcc
+    S_BRANCH %bb.4
+
+  bb.4:
+    successors: %bb.5
+
+  bb.5:
+    S_ENDPGM 0
+
+...
diff --git a/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-read.mir b/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-read.mir
new file mode 100644
index 0000000..49c0aaf
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-read.mir
@@ -0,0 +1,182 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+# RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx942 -run-pass si-fold-operands %s -o - | FileCheck %s
+# RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx942 -start-before=si-fold-operands -stop-after=register-coalescer %s -o - | FileCheck %s --check-prefixes=COALESCE
+# RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx908 -start-before=si-fold-operands -stop-after=register-coalescer %s -o - | FileCheck %s --check-prefixes=GFX908-COALESCE
+
+...
+---
+name:            test
+tracksRegLiveness: true
+body:             |
+  ; CHECK-LABEL: name: test
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   successors: %bb.2(0x40000000), %bb.1(0x40000000)
+  ; CHECK-NEXT:   liveins: $sgpr4_sgpr5
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5
+  ; CHECK-NEXT:   [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]](p4), 0, 0 :: (dereferenceable invariant load (s32), align 16, addrspace 4)
+  ; CHECK-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sgpr_32 = S_MOV_B32 0
+  ; CHECK-NEXT:   S_BITCMP0_B32 killed [[S_LOAD_DWORD_IMM]], 0, implicit-def $scc
+  ; CHECK-NEXT:   S_CBRANCH_SCC0 %bb.2, implicit $scc
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   successors: %bb.3(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[V_ACCVGPR_WRITE_B32_e64_:%[0-9]+]]:agpr_32 = V_ACCVGPR_WRITE_B32_e64 0, implicit $exec
+  ; CHECK-NEXT:   S_BRANCH %bb.3
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.2:
+  ; CHECK-NEXT:   successors: %bb.3(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[S_MOV_B32_1:%[0-9]+]]:sgpr_32 = S_MOV_B32 0
+  ; CHECK-NEXT:   [[V_ACCVGPR_WRITE_B32_e64_1:%[0-9]+]]:agpr_32 = V_ACCVGPR_WRITE_B32_e64 0, implicit $exec
+  ; CHECK-NEXT:   [[V_ACCVGPR_WRITE_B32_e64_2:%[0-9]+]]:agpr_32 = V_ACCVGPR_WRITE_B32_e64 0, implicit $exec
+  ; CHECK-NEXT:   [[V_ACCVGPR_WRITE_B32_e64_3:%[0-9]+]]:agpr_32 = V_ACCVGPR_WRITE_B32_e64 0, implicit $exec
+  ; CHECK-NEXT:   [[V_ACCVGPR_WRITE_B32_e64_4:%[0-9]+]]:agpr_32 = V_ACCVGPR_WRITE_B32_e64 0, implicit $exec
+  ; CHECK-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:areg_128_align2 = REG_SEQUENCE [[V_ACCVGPR_WRITE_B32_e64_1]], %subreg.sub0, [[V_ACCVGPR_WRITE_B32_e64_2]], %subreg.sub1, [[V_ACCVGPR_WRITE_B32_e64_3]], %subreg.sub2, [[V_ACCVGPR_WRITE_B32_e64_4]], %subreg.sub3
+  ; CHECK-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_1]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]]
+  ; CHECK-NEXT:   [[V_MFMA_F32_16X16X16F16_e64_:%[0-9]+]]:areg_128_align2 = V_MFMA_F32_16X16X16F16_e64 [[COPY1]], [[COPY1]], 0, 0, 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_F32_16X16X16F16_e64_1:%[0-9]+]]:areg_128_align2 = V_MFMA_F32_16X16X16F16_e64 [[COPY1]], [[COPY1]], killed [[V_MFMA_F32_16X16X16F16_e64_]], 0, 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_F32_16X16X16F16_e64_2:%[0-9]+]]:areg_128_align2 = V_MFMA_F32_16X16X16F16_e64 [[COPY1]], [[COPY1]], killed [[V_MFMA_F32_16X16X16F16_e64_1]], 0, 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_F32_16X16X16F16_e64_3:%[0-9]+]]:areg_128_align2 = V_MFMA_F32_16X16X16F16_e64 [[COPY1]], [[COPY1]], killed [[V_MFMA_F32_16X16X16F16_e64_2]], 0, 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MFMA_F32_16X16X16F16_e64_3]].sub0
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.3:
+  ; CHECK-NEXT:   [[PHI:%[0-9]+]]:agpr_32 = PHI [[V_ACCVGPR_WRITE_B32_e64_]], %bb.1, [[V_MFMA_F32_16X16X16F16_e64_3]].sub0, %bb.2
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[PHI]]
+  ; CHECK-NEXT:   [[V_CVT_F16_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F16_F32_e64 0, [[COPY3]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_PACK_B32_F16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_PACK_B32_F16_e64 0, killed [[V_CVT_F16_F32_e64_]], 0, 0, 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+  ; CHECK-NEXT:   [[REG_SEQUENCE2:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[V_PACK_B32_F16_e64_]], %subreg.sub0, killed [[V_MOV_B32_e32_]], %subreg.sub1
+  ; CHECK-NEXT:   [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_]], %subreg.sub1, [[S_MOV_B32_]], %subreg.sub2, [[S_MOV_B32_]], %subreg.sub3
+  ; CHECK-NEXT:   BUFFER_STORE_DWORDX2_OFFSET_exact [[REG_SEQUENCE2]], killed [[REG_SEQUENCE3]], 0, 0, 0, 0, implicit $exec :: (dereferenceable store (s64) into `ptr addrspace(8) null`, align 1, addrspace 8)
+  ; CHECK-NEXT:   S_ENDPGM 0
+  ;
+  ; COALESCE-LABEL: name: test
+  ; COALESCE: bb.0:
+  ; COALESCE-NEXT:   successors: %bb.2(0x40000000), %bb.1(0x40000000)
+  ; COALESCE-NEXT:   liveins: $sgpr4_sgpr5
+  ; COALESCE-NEXT: {{  $}}
+  ; COALESCE-NEXT:   [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5
+  ; COALESCE-NEXT:   [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]](p4), 0, 0 :: (dereferenceable invariant load (s32), align 16, addrspace 4)
+  ; COALESCE-NEXT:   undef [[S_MOV_B32_:%[0-9]+]].sub0:sgpr_128 = S_MOV_B32 0
+  ; COALESCE-NEXT:   S_BITCMP0_B32 [[S_LOAD_DWORD_IMM]], 0, implicit-def $scc
+  ; COALESCE-NEXT:   S_CBRANCH_SCC0 %bb.2, implicit killed $scc
+  ; COALESCE-NEXT: {{  $}}
+  ; COALESCE-NEXT: bb.1:
+  ; COALESCE-NEXT:   successors: %bb.3(0x80000000)
+  ; COALESCE-NEXT: {{  $}}
+  ; COALESCE-NEXT:   undef [[V_ACCVGPR_WRITE_B32_e64_:%[0-9]+]].sub0:areg_128_align2 = V_ACCVGPR_WRITE_B32_e64 0, implicit $exec
+  ; COALESCE-NEXT:   S_BRANCH %bb.3
+  ; COALESCE-NEXT: {{  $}}
+  ; COALESCE-NEXT: bb.2:
+  ; COALESCE-NEXT:   successors: %bb.3(0x80000000)
+  ; COALESCE-NEXT: {{  $}}
+  ; COALESCE-NEXT:   [[S_MOV_B32_:%[0-9]+]].sub1:sgpr_128 = COPY [[S_MOV_B32_]].sub0
+  ; COALESCE-NEXT:   [[COPY1:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B32_]].sub0_sub1
+  ; COALESCE-NEXT:   [[V_MFMA_F32_16X16X16F16_e64_:%[0-9]+]]:areg_128_align2 = V_MFMA_F32_16X16X16F16_e64 [[COPY1]], [[COPY1]], 0, 0, 0, 0, implicit $mode, implicit $exec
+  ; COALESCE-NEXT:   [[V_MFMA_F32_16X16X16F16_e64_1:%[0-9]+]]:areg_128_align2 = V_MFMA_F32_16X16X16F16_e64 [[COPY1]], [[COPY1]], [[V_MFMA_F32_16X16X16F16_e64_]], 0, 0, 0, implicit $mode, implicit $exec
+  ; COALESCE-NEXT:   [[V_MFMA_F32_16X16X16F16_e64_2:%[0-9]+]]:areg_128_align2 = V_MFMA_F32_16X16X16F16_e64 [[COPY1]], [[COPY1]], [[V_MFMA_F32_16X16X16F16_e64_1]], 0, 0, 0, implicit $mode, implicit $exec
+  ; COALESCE-NEXT:   [[V_ACCVGPR_WRITE_B32_e64_:%[0-9]+]]:areg_128_align2 = V_MFMA_F32_16X16X16F16_e64 [[COPY1]], [[COPY1]], [[V_MFMA_F32_16X16X16F16_e64_2]], 0, 0, 0, implicit $mode, implicit $exec
+  ; COALESCE-NEXT: {{  $}}
+  ; COALESCE-NEXT: bb.3:
+  ; COALESCE-NEXT:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_ACCVGPR_WRITE_B32_e64_]].sub0
+  ; COALESCE-NEXT:   [[V_CVT_F16_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 [[COPY2]], implicit $mode, implicit $exec
+  ; COALESCE-NEXT:   undef [[V_PACK_B32_F16_e64_:%[0-9]+]].sub0:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, [[V_CVT_F16_F32_e32_]], 0, 0, 0, 0, implicit $mode, implicit $exec
+  ; COALESCE-NEXT:   [[V_PACK_B32_F16_e64_:%[0-9]+]].sub1:vreg_64_align2 = V_MOV_B32_e32 0, implicit $exec
+  ; COALESCE-NEXT:   [[S_MOV_B32_:%[0-9]+]].sub1:sgpr_128 = COPY [[S_MOV_B32_]].sub0
+  ; COALESCE-NEXT:   [[S_MOV_B32_:%[0-9]+]].sub2:sgpr_128 = COPY [[S_MOV_B32_]].sub0
+  ; COALESCE-NEXT:   [[S_MOV_B32_:%[0-9]+]].sub3:sgpr_128 = COPY [[S_MOV_B32_]].sub0
+  ; COALESCE-NEXT:   BUFFER_STORE_DWORDX2_OFFSET_exact [[V_PACK_B32_F16_e64_]], [[S_MOV_B32_]], 0, 0, 0, 0, implicit $exec :: (dereferenceable store (s64) into `ptr addrspace(8) null`, align 1, addrspace 8)
+  ; COALESCE-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX908-COALESCE-LABEL: name: test
+  ; GFX908-COALESCE: bb.0:
+  ; GFX908-COALESCE-NEXT:   successors: %bb.2(0x40000000), %bb.1(0x40000000)
+  ; GFX908-COALESCE-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX908-COALESCE-NEXT: {{  $}}
+  ; GFX908-COALESCE-NEXT:   [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5
+  ; GFX908-COALESCE-NEXT:   [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]](p4), 0, 0 :: (dereferenceable invariant load (s32), align 16, addrspace 4)
+  ; GFX908-COALESCE-NEXT:   undef [[S_MOV_B32_:%[0-9]+]].sub0:sgpr_128 = S_MOV_B32 0
+  ; GFX908-COALESCE-NEXT:   S_BITCMP0_B32 [[S_LOAD_DWORD_IMM]], 0, implicit-def $scc
+  ; GFX908-COALESCE-NEXT:   S_CBRANCH_SCC0 %bb.2, implicit killed $scc
+  ; GFX908-COALESCE-NEXT: {{  $}}
+  ; GFX908-COALESCE-NEXT: bb.1:
+  ; GFX908-COALESCE-NEXT:   successors: %bb.3(0x80000000)
+  ; GFX908-COALESCE-NEXT: {{  $}}
+  ; GFX908-COALESCE-NEXT:   undef [[V_ACCVGPR_WRITE_B32_e64_:%[0-9]+]].sub0:areg_128_align2 = V_ACCVGPR_WRITE_B32_e64 0, implicit $exec
+  ; GFX908-COALESCE-NEXT:   S_BRANCH %bb.3
+  ; GFX908-COALESCE-NEXT: {{  $}}
+  ; GFX908-COALESCE-NEXT: bb.2:
+  ; GFX908-COALESCE-NEXT:   successors: %bb.3(0x80000000)
+  ; GFX908-COALESCE-NEXT: {{  $}}
+  ; GFX908-COALESCE-NEXT:   undef [[V_ACCVGPR_WRITE_B32_e64_1:%[0-9]+]].sub0:areg_128_align2 = V_ACCVGPR_WRITE_B32_e64 0, implicit $exec
+  ; GFX908-COALESCE-NEXT:   [[V_ACCVGPR_WRITE_B32_e64_1:%[0-9]+]].sub1:areg_128_align2 = COPY [[V_ACCVGPR_WRITE_B32_e64_1]].sub0
+  ; GFX908-COALESCE-NEXT:   [[V_ACCVGPR_WRITE_B32_e64_1:%[0-9]+]].sub2:areg_128_align2 = COPY [[V_ACCVGPR_WRITE_B32_e64_1]].sub0
+  ; GFX908-COALESCE-NEXT:   [[V_ACCVGPR_WRITE_B32_e64_1:%[0-9]+]].sub3:areg_128_align2 = COPY [[V_ACCVGPR_WRITE_B32_e64_1]].sub0
+  ; GFX908-COALESCE-NEXT:   [[S_MOV_B32_:%[0-9]+]].sub1:sgpr_128 = COPY [[S_MOV_B32_]].sub0
+  ; GFX908-COALESCE-NEXT:   [[COPY1:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B32_]].sub0_sub1
+  ; GFX908-COALESCE-NEXT:   [[V_MFMA_F32_16X16X16F16_e64_:%[0-9]+]]:areg_128_align2 = V_MFMA_F32_16X16X16F16_e64 [[COPY1]], [[COPY1]], [[V_ACCVGPR_WRITE_B32_e64_1]], 0, 0, 0, implicit $mode, implicit $exec
+  ; GFX908-COALESCE-NEXT:   [[V_MFMA_F32_16X16X16F16_e64_1:%[0-9]+]]:areg_128_align2 = V_MFMA_F32_16X16X16F16_e64 [[COPY1]], [[COPY1]], [[V_MFMA_F32_16X16X16F16_e64_]], 0, 0, 0, implicit $mode, implicit $exec
+  ; GFX908-COALESCE-NEXT:   [[V_MFMA_F32_16X16X16F16_e64_2:%[0-9]+]]:areg_128_align2 = V_MFMA_F32_16X16X16F16_e64 [[COPY1]], [[COPY1]], [[V_MFMA_F32_16X16X16F16_e64_1]], 0, 0, 0, implicit $mode, implicit $exec
+  ; GFX908-COALESCE-NEXT:   [[V_ACCVGPR_WRITE_B32_e64_:%[0-9]+]]:areg_128_align2 = V_MFMA_F32_16X16X16F16_e64 [[COPY1]], [[COPY1]], [[V_MFMA_F32_16X16X16F16_e64_2]], 0, 0, 0, implicit $mode, implicit $exec
+  ; GFX908-COALESCE-NEXT: {{  $}}
+  ; GFX908-COALESCE-NEXT: bb.3:
+  ; GFX908-COALESCE-NEXT:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_ACCVGPR_WRITE_B32_e64_]].sub0
+  ; GFX908-COALESCE-NEXT:   [[V_CVT_F16_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 [[COPY2]], implicit $mode, implicit $exec
+  ; GFX908-COALESCE-NEXT:   undef [[V_PACK_B32_F16_e64_:%[0-9]+]].sub0:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, [[V_CVT_F16_F32_e32_]], 0, 0, 0, 0, implicit $mode, implicit $exec
+  ; GFX908-COALESCE-NEXT:   [[V_PACK_B32_F16_e64_:%[0-9]+]].sub1:vreg_64_align2 = V_MOV_B32_e32 0, implicit $exec
+  ; GFX908-COALESCE-NEXT:   [[S_MOV_B32_:%[0-9]+]].sub1:sgpr_128 = COPY [[S_MOV_B32_]].sub0
+  ; GFX908-COALESCE-NEXT:   [[S_MOV_B32_:%[0-9]+]].sub2:sgpr_128 = COPY [[S_MOV_B32_]].sub0
+  ; GFX908-COALESCE-NEXT:   [[S_MOV_B32_:%[0-9]+]].sub3:sgpr_128 = COPY [[S_MOV_B32_]].sub0
+  ; GFX908-COALESCE-NEXT:   BUFFER_STORE_DWORDX2_OFFSET_exact [[V_PACK_B32_F16_e64_]], [[S_MOV_B32_]], 0, 0, 0, 0, implicit $exec :: (dereferenceable store (s64) into `ptr addrspace(8) null`, align 1, addrspace 8)
+  ; GFX908-COALESCE-NEXT:   S_ENDPGM 0
+  bb.0:
+    successors: %bb.2, %bb.1
+    liveins: $sgpr4_sgpr5
+
+    %0:sgpr_64(p4) = COPY $sgpr4_sgpr5
+    %1:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %0(p4), 0, 0 :: (dereferenceable invariant load (s32), align 16, addrspace 4)
+    %2:sgpr_32 = S_MOV_B32 0
+    S_BITCMP0_B32 killed %1, 0, implicit-def $scc
+    S_CBRANCH_SCC0 %bb.2, implicit $scc
+
+  bb.1:
+    successors: %bb.3
+
+    %3:sgpr_32 = COPY %2
+    %4:vgpr_32 = COPY %3, implicit $exec
+    S_BRANCH %bb.3
+
+  bb.2:
+    successors: %bb.3
+
+    %5:sgpr_32 = S_MOV_B32 0
+    %6:vgpr_32 = COPY %5
+    %7:agpr_32 = V_ACCVGPR_WRITE_B32_e64 %6, implicit $exec
+    %8:agpr_32 = V_ACCVGPR_WRITE_B32_e64 %6, implicit $exec
+    %9:agpr_32 = V_ACCVGPR_WRITE_B32_e64 %6, implicit $exec
+    %10:agpr_32 = V_ACCVGPR_WRITE_B32_e64 %6, implicit $exec
+    %11:areg_128_align2 = REG_SEQUENCE %7, %subreg.sub0, %8, %subreg.sub1, %9, %subreg.sub2, %10, %subreg.sub3
+    %12:sreg_64 = REG_SEQUENCE %5, %subreg.sub0, %5, %subreg.sub1
+    %13:vreg_64_align2 = COPY %12
+    %14:areg_128_align2 = V_MFMA_F32_16X16X16F16_e64 %13, %13, killed %11, 0, 0, 0, implicit $mode, implicit $exec
+    %15:areg_128_align2 = V_MFMA_F32_16X16X16F16_e64 %13, %13, killed %14, 0, 0, 0, implicit $mode, implicit $exec
+    %16:areg_128_align2 = V_MFMA_F32_16X16X16F16_e64 %13, %13, killed %15, 0, 0, 0, implicit $mode, implicit $exec
+    %17:areg_128_align2 = V_MFMA_F32_16X16X16F16_e64 %13, %13, killed %16, 0, 0, 0, implicit $mode, implicit $exec
+    %18:vgpr_32 = COPY %17.sub0
+    %19:vgpr_32 = COPY %18
+
+  bb.3:
+    %20:vgpr_32 = PHI %4, %bb.1, %19, %bb.2
+    %21:vgpr_32 = nofpexcept V_CVT_F16_F32_e64 0, %20, 0, 0, implicit $mode, implicit $exec
+    %22:vgpr_32 = nofpexcept V_PACK_B32_F16_e64 0, killed %21, 0, %2, 0, 0, implicit $mode, implicit $exec
+    %23:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %24:vreg_64_align2 = REG_SEQUENCE %22, %subreg.sub0, killed %23, %subreg.sub1
+    %25:sgpr_128 = REG_SEQUENCE %2, %subreg.sub0, %2, %subreg.sub1, %2, %subreg.sub2, %2, %subreg.sub3
+    %26:vreg_64_align2 = COPY %24
+    BUFFER_STORE_DWORDX2_OFFSET_exact killed %26, killed %25, %2, 0, 0, 0, implicit $exec :: (dereferenceable store (s64) into `ptr addrspace(8) null`, align 1, addrspace 8)
+    S_ENDPGM 0
+
+...
diff --git a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll
index 98d5f30..a2a0107 100644
--- a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll
+++ b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll
@@ -1372,20 +1372,19 @@ define amdgpu_kernel void @Offset64(ptr addrspace(1)  %buffer) {
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v1, v0
 ; GFX8-NEXT:    v_addc_u32_e32 v4, vcc, 0, v2, vcc
 ; GFX8-NEXT:    s_movk_i32 s0, 0xf000
-; GFX8-NEXT:    v_add_u32_e32 v5, vcc, s0, v3
-; GFX8-NEXT:    v_addc_u32_e32 v6, vcc, 0, v4, vcc
+; GFX8-NEXT:    v_add_u32_e32 v7, vcc, s0, v3
+; GFX8-NEXT:    v_addc_u32_e32 v8, vcc, 0, v4, vcc
 ; GFX8-NEXT:    s_movk_i32 s0, 0xf800
-; GFX8-NEXT:    flat_load_dwordx2 v[7:8], v[3:4]
-; GFX8-NEXT:    flat_load_dwordx2 v[5:6], v[5:6]
+; GFX8-NEXT:    flat_load_dwordx2 v[5:6], v[3:4]
+; GFX8-NEXT:    flat_load_dwordx2 v[7:8], v[7:8]
 ; GFX8-NEXT:    v_add_u32_e32 v9, vcc, s0, v3
 ; GFX8-NEXT:    v_addc_u32_e32 v10, vcc, 0, v4, vcc
 ; GFX8-NEXT:    flat_load_dwordx2 v[9:10], v[9:10]
-; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0, v3
-; GFX8-NEXT:    v_addc_u32_e32 v4, vcc, 1, v4, vcc
+; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 1, v4
 ; GFX8-NEXT:    flat_load_dwordx2 v[3:4], v[3:4]
 ; GFX8-NEXT:    s_waitcnt vmcnt(2)
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v5, v7
-; GFX8-NEXT:    v_addc_u32_e32 v5, vcc, v6, v8, vcc
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v7, v5
+; GFX8-NEXT:    v_addc_u32_e32 v5, vcc, v8, v6, vcc
 ; GFX8-NEXT:    s_waitcnt vmcnt(1)
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v9, v0
 ; GFX8-NEXT:    v_addc_u32_e32 v5, vcc, v10, v5, vcc
@@ -1416,32 +1415,32 @@ define amdgpu_kernel void @Offset64(ptr addrspace(1)  %buffer) {
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 7, v0
-; GFX9-NEXT:    v_and_b32_e32 v12, 0xffff8000, v1
+; GFX9-NEXT:    v_and_b32_e32 v10, 0xffff8000, v1
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s35
-; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s34, v12
+; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s34, v10
 ; GFX9-NEXT:    v_mov_b32_e32 v3, 3
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v2, v0
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, 0, v0
-; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, 1, v1, vcc
-; GFX9-NEXT:    global_load_dwordx2 v[2:3], v[0:1], off
-; GFX9-NEXT:    global_load_dwordx2 v[6:7], v[4:5], off offset:-4096
 ; GFX9-NEXT:    s_movk_i32 s0, 0xf000
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s0, v0
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX9-NEXT:    global_load_dwordx2 v[8:9], v[4:5], off
-; GFX9-NEXT:    global_load_dwordx2 v[10:11], v[0:1], off offset:2048
+; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s0, v0
+; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v1, vcc
+; GFX9-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off
+; GFX9-NEXT:    global_load_dwordx2 v[6:7], v[0:1], off
+; GFX9-NEXT:    global_load_dwordx2 v[8:9], v[2:3], off offset:2048
+; GFX9-NEXT:    v_add_u32_e32 v1, 1, v1
+; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
 ; GFX9-NEXT:    s_waitcnt vmcnt(2)
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v6, v2
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v7, v3, vcc
+; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v4, v6
+; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v5, v7, vcc
+; GFX9-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v8, v2
+; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v9, v3, vcc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v10, v0
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v11, v1, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v8, v0
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v9, v1, vcc
-; GFX9-NEXT:    global_store_dwordx2 v12, v[0:1], s[34:35]
+; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
+; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
+; GFX9-NEXT:    global_store_dwordx2 v10, v[0:1], s[34:35]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: Offset64:
@@ -1477,8 +1476,7 @@ define amdgpu_kernel void @Offset64(ptr addrspace(1)  %buffer) {
 ; GFX10-NEXT:    s_clause 0x1
 ; GFX10-NEXT:    global_load_dwordx2 v[4:5], v[0:1], off
 ; GFX10-NEXT:    global_load_dwordx2 v[6:7], v[2:3], off offset:-2048
-; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, 0, v0
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 1, v1, vcc_lo
+; GFX10-NEXT:    v_add_nc_u32_e32 v1, 1, v1
 ; GFX10-NEXT:    s_clause 0x1
 ; GFX10-NEXT:    global_load_dwordx2 v[8:9], v[2:3], off
 ; GFX10-NEXT:    global_load_dwordx2 v[10:11], v[0:1], off
@@ -1517,25 +1515,25 @@ define amdgpu_kernel void @Offset64(ptr addrspace(1)  %buffer) {
 ; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, v1, v0
 ; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v2, vcc_lo
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_add_co_u32 v4, vcc_lo, v0, 0
-; GFX11-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, 1, v1, vcc_lo
-; GFX11-NEXT:    global_load_b64 v[2:3], v[0:1], off
-; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, 0xfffff000, v0
-; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX11-NEXT:    v_add_co_u32 v2, vcc_lo, 0xfffff000, v0
+; GFX11-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo
 ; GFX11-NEXT:    s_clause 0x2
-; GFX11-NEXT:    global_load_b64 v[6:7], v[4:5], off offset:-4096
-; GFX11-NEXT:    global_load_b64 v[4:5], v[4:5], off
-; GFX11-NEXT:    global_load_b64 v[0:1], v[0:1], off offset:2048
+; GFX11-NEXT:    global_load_b64 v[4:5], v[2:3], off
+; GFX11-NEXT:    global_load_b64 v[6:7], v[0:1], off
+; GFX11-NEXT:    global_load_b64 v[2:3], v[2:3], off offset:2048
+; GFX11-NEXT:    v_add_nc_u32_e32 v1, 1, v1
+; GFX11-NEXT:    global_load_b64 v[0:1], v[0:1], off
 ; GFX11-NEXT:    s_waitcnt vmcnt(2)
-; GFX11-NEXT:    v_add_co_u32 v2, vcc_lo, v6, v2
-; GFX11-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, v7, v3, vcc_lo
+; GFX11-NEXT:    v_add_co_u32 v4, vcc_lo, v4, v6
+; GFX11-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, v5, v7, vcc_lo
+; GFX11-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_add_co_u32 v2, vcc_lo, v2, v4
+; GFX11-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, v3, v5, vcc_lo
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
 ; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, v4, v0
-; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v5, v1, vcc_lo
 ; GFX11-NEXT:    global_store_b64 v8, v[0:1], s[34:35]
 ; GFX11-NEXT:    s_endpgm
 entry:
@@ -2408,18 +2406,17 @@ define hidden amdgpu_kernel void @negativeoffset(ptr addrspace(1) nocapture %buf
 ; GFX8-NEXT:    v_mov_b32_e32 v3, 3
 ; GFX8-NEXT:    v_addc_u32_e32 v2, vcc, 0, v2, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v1, v0
-; GFX8-NEXT:    v_addc_u32_e32 v6, vcc, 0, v2, vcc
+; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v1, v0
+; GFX8-NEXT:    v_addc_u32_e32 v0, vcc, 0, v2, vcc
 ; GFX8-NEXT:    s_movk_i32 s0, 0x800
-; GFX8-NEXT:    v_add_u32_e32 v3, vcc, s0, v0
-; GFX8-NEXT:    v_addc_u32_e32 v4, vcc, -1, v6, vcc
-; GFX8-NEXT:    v_add_u32_e32 v5, vcc, 0, v0
-; GFX8-NEXT:    v_addc_u32_e32 v6, vcc, -1, v6, vcc
-; GFX8-NEXT:    flat_load_dwordx2 v[3:4], v[3:4]
+; GFX8-NEXT:    v_add_u32_e32 v5, vcc, s0, v3
+; GFX8-NEXT:    v_addc_u32_e32 v6, vcc, -1, v0, vcc
+; GFX8-NEXT:    v_add_u32_e32 v4, vcc, -1, v0
 ; GFX8-NEXT:    flat_load_dwordx2 v[5:6], v[5:6]
+; GFX8-NEXT:    flat_load_dwordx2 v[3:4], v[3:4]
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v5, v3
-; GFX8-NEXT:    v_addc_u32_e32 v4, vcc, v6, v4, vcc
+; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v3, v5
+; GFX8-NEXT:    v_addc_u32_e32 v4, vcc, v4, v6, vcc
 ; GFX8-NEXT:    flat_store_dwordx2 v[1:2], v[3:4]
 ; GFX8-NEXT:    s_endpgm
 ;
@@ -2450,14 +2447,13 @@ define hidden amdgpu_kernel void @negativeoffset(ptr addrspace(1) nocapture %buf
 ; GFX9-NEXT:    v_mov_b32_e32 v3, 3
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v0
-; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v1, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0x1000, v2
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v3, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, 0, v2
-; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, -1, v3, vcc
-; GFX9-NEXT:    global_load_dwordx2 v[4:5], v[0:1], off offset:-2048
-; GFX9-NEXT:    global_load_dwordx2 v[6:7], v[2:3], off
+; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v2, v0
+; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, 0x1000, v0
+; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, -1, v1, vcc
+; GFX9-NEXT:    v_add_u32_e32 v1, -1, v1
+; GFX9-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off offset:-2048
+; GFX9-NEXT:    global_load_dwordx2 v[6:7], v[0:1], off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v6, v4
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v7, v5, vcc
@@ -2490,15 +2486,14 @@ define hidden amdgpu_kernel void @negativeoffset(ptr addrspace(1) nocapture %buf
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX10-NEXT:    v_add_co_u32 v1, s0, s34, v8
 ; GFX10-NEXT:    v_add_co_ci_u32_e64 v2, s0, s35, 0, s0
-; GFX10-NEXT:    v_add_co_u32 v3, vcc_lo, v1, v0
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v4, vcc_lo, 0, v2, vcc_lo
-; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, 0x800, v3
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, -1, v4, vcc_lo
-; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, 0, v3
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, -1, v4, vcc_lo
+; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v1, v0
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v2, vcc_lo
+; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, 0x800, v0
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, -1, v1, vcc_lo
+; GFX10-NEXT:    v_add_nc_u32_e32 v1, -1, v1
 ; GFX10-NEXT:    s_clause 0x1
-; GFX10-NEXT:    global_load_dwordx2 v[4:5], v[0:1], off
-; GFX10-NEXT:    global_load_dwordx2 v[6:7], v[2:3], off
+; GFX10-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off
+; GFX10-NEXT:    global_load_dwordx2 v[6:7], v[0:1], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v6, v4
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v7, v5, vcc_lo
@@ -2525,19 +2520,18 @@ define hidden amdgpu_kernel void @negativeoffset(ptr addrspace(1) nocapture %buf
 ; GFX11-NEXT:    v_add_co_u32 v1, s0, s34, v4
 ; GFX11-NEXT:    v_add_co_ci_u32_e64 v2, null, s35, 0, s0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_add_co_u32 v3, vcc_lo, v1, v0
-; GFX11-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, 0, v2, vcc_lo
+; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, v1, v0
+; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v2, vcc_lo
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, 0x1000, v3
-; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, -1, v5, vcc_lo
-; GFX11-NEXT:    v_add_co_u32 v2, vcc_lo, 0, v3
-; GFX11-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, -1, v5, vcc_lo
+; GFX11-NEXT:    v_add_co_u32 v2, vcc_lo, 0x1000, v0
+; GFX11-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, -1, v1, vcc_lo
+; GFX11-NEXT:    v_add_nc_u32_e32 v1, -1, v1
 ; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    global_load_b64 v[0:1], v[0:1], off offset:-2048
-; GFX11-NEXT:    global_load_b64 v[2:3], v[2:3], off
+; GFX11-NEXT:    global_load_b64 v[2:3], v[2:3], off offset:-2048
+; GFX11-NEXT:    global_load_b64 v[0:1], v[0:1], off
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, v2, v0
-; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v3, v1, vcc_lo
+; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
+; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
 ; GFX11-NEXT:    global_store_b64 v4, v[0:1], s[34:35]
 ; GFX11-NEXT:    s_endpgm
 entry:
diff --git a/llvm/test/CodeGen/AMDGPU/pseudo-scalar-transcendental.ll b/llvm/test/CodeGen/AMDGPU/pseudo-scalar-transcendental.ll
index ba428df..a439f8d 100644
--- a/llvm/test/CodeGen/AMDGPU/pseudo-scalar-transcendental.ll
+++ b/llvm/test/CodeGen/AMDGPU/pseudo-scalar-transcendental.ll
@@ -3,32 +3,17 @@
 ; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL %s
 
 define amdgpu_cs float @v_s_exp_f32(float inreg %src) {
-; GFX12-SDAG-LABEL: v_s_exp_f32:
-; GFX12-SDAG:       ; %bb.0:
-; GFX12-SDAG-NEXT:    s_cmp_lt_f32 s0, 0xc2fc0000
-; GFX12-SDAG-NEXT:    s_cselect_b32 s1, 0x42800000, 0
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_2)
-; GFX12-SDAG-NEXT:    s_add_f32 s0, s0, s1
-; GFX12-SDAG-NEXT:    s_cselect_b32 s1, 0xffffffc0, 0
-; GFX12-SDAG-NEXT:    v_s_exp_f32 s0, s0
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(TRANS32_DEP_1)
-; GFX12-SDAG-NEXT:    v_ldexp_f32 v0, s0, s1
-; GFX12-SDAG-NEXT:    ; return to shader part epilog
-;
-; GFX12-GISEL-LABEL: v_s_exp_f32:
-; GFX12-GISEL:       ; %bb.0:
-; GFX12-GISEL-NEXT:    s_cmp_lt_f32 s0, 0xc2fc0000
-; GFX12-GISEL-NEXT:    s_cselect_b32 s1, 0x42800000, 0
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_2)
-; GFX12-GISEL-NEXT:    s_add_f32 s0, s0, s1
-; GFX12-GISEL-NEXT:    s_cselect_b32 s1, 0x1f800000, 1.0
-; GFX12-GISEL-NEXT:    v_s_exp_f32 s0, s0
-; GFX12-GISEL-NEXT:    s_wait_alu 0xfffe
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(TRANS32_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_2)
-; GFX12-GISEL-NEXT:    s_mul_f32 s0, s0, s1
-; GFX12-GISEL-NEXT:    s_wait_alu 0xfffe
-; GFX12-GISEL-NEXT:    v_mov_b32_e32 v0, s0
-; GFX12-GISEL-NEXT:    ; return to shader part epilog
+; GFX12-LABEL: v_s_exp_f32:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_cmp_lt_f32 s0, 0xc2fc0000
+; GFX12-NEXT:    s_cselect_b32 s1, 0x42800000, 0
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_2)
+; GFX12-NEXT:    s_add_f32 s0, s0, s1
+; GFX12-NEXT:    s_cselect_b32 s1, 0xffffffc0, 0
+; GFX12-NEXT:    v_s_exp_f32 s0, s0
+; GFX12-NEXT:    s_delay_alu instid0(TRANS32_DEP_1)
+; GFX12-NEXT:    v_ldexp_f32 v0, s0, s1
+; GFX12-NEXT:    ; return to shader part epilog
   %result = call float @llvm.exp2.f32(float %src)
   ret float %result
 }
@@ -88,16 +73,16 @@ define amdgpu_cs float @v_s_log_f32(float inreg %src) {
 ; GFX12-GISEL-LABEL: v_s_log_f32:
 ; GFX12-GISEL:       ; %bb.0:
 ; GFX12-GISEL-NEXT:    s_cmp_lt_f32 s0, 0x800000
-; GFX12-GISEL-NEXT:    s_cselect_b32 s1, 0x4f800000, 1.0
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_2)
-; GFX12-GISEL-NEXT:    s_mul_f32 s0, s0, s1
-; GFX12-GISEL-NEXT:    s_cselect_b32 s1, 0x42000000, 0
-; GFX12-GISEL-NEXT:    v_s_log_f32 s0, s0
-; GFX12-GISEL-NEXT:    s_wait_alu 0xfffe
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(TRANS32_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_2)
-; GFX12-GISEL-NEXT:    s_sub_f32 s0, s0, s1
+; GFX12-GISEL-NEXT:    s_cselect_b32 s1, 1, 0
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX12-GISEL-NEXT:    s_lshl_b32 s2, s1, 5
+; GFX12-GISEL-NEXT:    s_cmp_lg_u32 s1, 0
+; GFX12-GISEL-NEXT:    v_ldexp_f32 v0, s0, s2
+; GFX12-GISEL-NEXT:    s_cselect_b32 s0, 0x42000000, 0
+; GFX12-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; GFX12-GISEL-NEXT:    s_wait_alu 0xfffe
-; GFX12-GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(TRANS32_DEP_1)
+; GFX12-GISEL-NEXT:    v_subrev_f32_e32 v0, s0, v0
 ; GFX12-GISEL-NEXT:    ; return to shader part epilog
   %result = call float @llvm.log2.f32(float %src)
   ret float %result
@@ -322,19 +307,18 @@ define amdgpu_cs float @srcmods_abs_f32(float inreg %src) {
 ;
 ; GFX12-GISEL-LABEL: srcmods_abs_f32:
 ; GFX12-GISEL:       ; %bb.0:
-; GFX12-GISEL-NEXT:    s_bitset0_b32 s0, 31
+; GFX12-GISEL-NEXT:    s_and_b32 s1, s0, 0x7fffffff
 ; GFX12-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX12-GISEL-NEXT:    s_cmp_lt_f32 s0, 0x800000
-; GFX12-GISEL-NEXT:    s_cselect_b32 s1, 0x4f800000, 1.0
-; GFX12-GISEL-NEXT:    s_mul_f32 s0, s0, s1
-; GFX12-GISEL-NEXT:    s_cselect_b32 s1, 0x42000000, 0
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_2) | instskip(SKIP_1) | instid1(TRANS32_DEP_1)
-; GFX12-GISEL-NEXT:    v_s_log_f32 s0, s0
-; GFX12-GISEL-NEXT:    s_wait_alu 0xfffe
-; GFX12-GISEL-NEXT:    s_sub_f32 s0, s0, s1
+; GFX12-GISEL-NEXT:    s_cmp_lt_f32 s1, 0x800000
+; GFX12-GISEL-NEXT:    s_cselect_b32 s1, 1, 0
+; GFX12-GISEL-NEXT:    s_lshl_b32 s2, s1, 5
+; GFX12-GISEL-NEXT:    s_cmp_lg_u32 s1, 0
+; GFX12-GISEL-NEXT:    v_ldexp_f32 v0, |s0|, s2
+; GFX12-GISEL-NEXT:    s_cselect_b32 s0, 0x42000000, 0
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(TRANS32_DEP_1)
+; GFX12-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; GFX12-GISEL-NEXT:    s_wait_alu 0xfffe
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_2)
-; GFX12-GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-GISEL-NEXT:    v_subrev_f32_e32 v0, s0, v0
 ; GFX12-GISEL-NEXT:    ; return to shader part epilog
   %abs = call float @llvm.fabs.f32(float %src)
   %result = call float @llvm.log2.f32(float %abs)
@@ -362,19 +346,18 @@ define amdgpu_cs float @srcmods_neg_f32(float inreg %src) {
 ;
 ; GFX12-GISEL-LABEL: srcmods_neg_f32:
 ; GFX12-GISEL:       ; %bb.0:
-; GFX12-GISEL-NEXT:    s_xor_b32 s0, s0, 0x80000000
+; GFX12-GISEL-NEXT:    s_xor_b32 s1, s0, 0x80000000
 ; GFX12-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX12-GISEL-NEXT:    s_cmp_lt_f32 s0, 0x800000
-; GFX12-GISEL-NEXT:    s_cselect_b32 s1, 0x4f800000, 1.0
-; GFX12-GISEL-NEXT:    s_mul_f32 s0, s0, s1
-; GFX12-GISEL-NEXT:    s_cselect_b32 s1, 0x42000000, 0
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_2) | instskip(SKIP_1) | instid1(TRANS32_DEP_1)
-; GFX12-GISEL-NEXT:    v_s_log_f32 s0, s0
-; GFX12-GISEL-NEXT:    s_wait_alu 0xfffe
-; GFX12-GISEL-NEXT:    s_sub_f32 s0, s0, s1
+; GFX12-GISEL-NEXT:    s_cmp_lt_f32 s1, 0x800000
+; GFX12-GISEL-NEXT:    s_cselect_b32 s1, 1, 0
+; GFX12-GISEL-NEXT:    s_lshl_b32 s2, s1, 5
+; GFX12-GISEL-NEXT:    s_cmp_lg_u32 s1, 0
+; GFX12-GISEL-NEXT:    v_ldexp_f32 v0, -s0, s2
+; GFX12-GISEL-NEXT:    s_cselect_b32 s0, 0x42000000, 0
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(TRANS32_DEP_1)
+; GFX12-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; GFX12-GISEL-NEXT:    s_wait_alu 0xfffe
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_2)
-; GFX12-GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-GISEL-NEXT:    v_subrev_f32_e32 v0, s0, v0
 ; GFX12-GISEL-NEXT:    ; return to shader part epilog
   %neg = fneg float %src
   %result = call float @llvm.log2.f32(float %neg)
diff --git a/llvm/test/CodeGen/AMDGPU/remat-physreg-copy-subreg-extract-already-live-at-def-issue120970.mir b/llvm/test/CodeGen/AMDGPU/remat-physreg-copy-subreg-extract-already-live-at-def-issue120970.mir
new file mode 100644
index 0000000..3879f6d
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/remat-physreg-copy-subreg-extract-already-live-at-def-issue120970.mir
@@ -0,0 +1,85 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -run-pass=register-coalescer -o - %s | FileCheck %s
+
+# This used to assert due to trying to rematerialize V_MOV_B64_PSEUDO
+# at copy to $vgpr1. This would assert since this would clobber the
+# live value in $vgpr0.
+
+---
+name: rematerialize_physreg_sub_def_already_live_at_def_assert
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $vgpr0
+
+    ; CHECK-LABEL: name: rematerialize_physreg_sub_def_already_live_at_def_assert
+    ; CHECK: liveins: $vgpr0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 1, implicit $exec
+    ; CHECK-NEXT: $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+    ; CHECK-NEXT: $vgpr1 = COPY [[V_MOV_B]].sub1
+    ; CHECK-NEXT: SI_RETURN implicit $vgpr0, implicit killed $vgpr1
+    %0:vreg_64 = V_MOV_B64_PSEUDO 1, implicit $exec
+    %1:vgpr_32 = COPY %0.sub1
+    $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+    $vgpr1 = COPY %1
+    SI_RETURN implicit $vgpr0, implicit killed $vgpr1
+...
+
+# Same as previous, except with an IMPLICIT_DEF
+---
+name: rematerialize_physreg_sub_def_already_live_at_def_assert_implicit_def
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $vgpr0
+
+    ; CHECK-LABEL: name: rematerialize_physreg_sub_def_already_live_at_def_assert_implicit_def
+    ; CHECK: liveins: $vgpr0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[DEF:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
+    ; CHECK-NEXT: $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+    ; CHECK-NEXT: $vgpr1 = COPY [[DEF]].sub1
+    ; CHECK-NEXT: SI_RETURN implicit $vgpr0, implicit killed $vgpr1
+    %0:vreg_64 = IMPLICIT_DEF
+    %1:vgpr_32 = COPY %0.sub1
+    $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+    $vgpr1 = COPY %1
+    SI_RETURN implicit $vgpr0, implicit killed $vgpr1
+...
+
+---
+name: rematerialize_physreg_sub_def_no_live_sub_def_0
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $vgpr0
+
+    ; CHECK-LABEL: name: rematerialize_physreg_sub_def_no_live_sub_def_0
+    ; CHECK: liveins: $vgpr0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: dead $vgpr0_vgpr1 = V_MOV_B64_PSEUDO 1, implicit $exec, implicit-def $vgpr1
+    ; CHECK-NEXT: SI_RETURN implicit killed $vgpr1
+    %0:vreg_64 = V_MOV_B64_PSEUDO 1, implicit $exec
+    %1:vgpr_32 = COPY %0.sub1
+    $vgpr1 = COPY %1
+    SI_RETURN implicit killed $vgpr1
+...
+
+---
+name: rematerialize_physreg_sub_def_no_live_sub_def_1
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $vgpr0
+
+    ; CHECK-LABEL: name: rematerialize_physreg_sub_def_no_live_sub_def_1
+    ; CHECK: liveins: $vgpr0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: dead $vgpr1_vgpr2 = V_MOV_B64_PSEUDO 1, implicit $exec, implicit-def $vgpr1
+    ; CHECK-NEXT: SI_RETURN implicit killed $vgpr1
+    %0:vreg_64 = V_MOV_B64_PSEUDO 1, implicit $exec
+    %1:vgpr_32 = COPY %0.sub0
+    $vgpr1 = COPY %1
+    SI_RETURN implicit killed $vgpr1
+...
diff --git a/llvm/test/CodeGen/AMDGPU/sdiv64.ll b/llvm/test/CodeGen/AMDGPU/sdiv64.ll
index 3e8768c..96dd627 100644
--- a/llvm/test/CodeGen/AMDGPU/sdiv64.ll
+++ b/llvm/test/CodeGen/AMDGPU/sdiv64.ll
@@ -1065,100 +1065,37 @@ define amdgpu_kernel void @s_test_sdiv24_48(ptr addrspace(1) %out, i48 %x, i48 %
 ; GCN-NEXT:    s_endpgm
 ;
 ; GCN-IR-LABEL: s_test_sdiv24_48:
-; GCN-IR:       ; %bb.0: ; %_udiv-special-cases
-; GCN-IR-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0xb
-; GCN-IR-NEXT:    s_mov_b32 s15, 0
-; GCN-IR-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-IR-NEXT:    s_sext_i32_i16 s1, s1
-; GCN-IR-NEXT:    s_ashr_i64 s[0:1], s[0:1], 24
-; GCN-IR-NEXT:    s_sext_i32_i16 s3, s3
-; GCN-IR-NEXT:    s_lshl_b64 s[0:1], s[0:1], 16
-; GCN-IR-NEXT:    s_ashr_i64 s[2:3], s[2:3], 24
-; GCN-IR-NEXT:    s_ashr_i64 s[6:7], s[0:1], 16
-; GCN-IR-NEXT:    s_ashr_i32 s0, s1, 31
-; GCN-IR-NEXT:    s_lshl_b64 s[2:3], s[2:3], 16
-; GCN-IR-NEXT:    s_mov_b32 s1, s0
-; GCN-IR-NEXT:    s_ashr_i64 s[8:9], s[2:3], 16
-; GCN-IR-NEXT:    s_ashr_i32 s2, s3, 31
-; GCN-IR-NEXT:    s_xor_b64 s[6:7], s[6:7], s[0:1]
-; GCN-IR-NEXT:    s_mov_b32 s3, s2
-; GCN-IR-NEXT:    s_sub_u32 s12, s6, s0
-; GCN-IR-NEXT:    s_subb_u32 s13, s7, s0
-; GCN-IR-NEXT:    s_xor_b64 s[6:7], s[8:9], s[2:3]
-; GCN-IR-NEXT:    s_sub_u32 s6, s6, s2
-; GCN-IR-NEXT:    s_subb_u32 s7, s7, s2
-; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[8:9], s[6:7], 0
-; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[10:11], s[12:13], 0
-; GCN-IR-NEXT:    s_flbit_i32_b64 s14, s[6:7]
-; GCN-IR-NEXT:    s_or_b64 s[10:11], s[8:9], s[10:11]
-; GCN-IR-NEXT:    s_flbit_i32_b64 s20, s[12:13]
-; GCN-IR-NEXT:    s_sub_u32 s16, s14, s20
-; GCN-IR-NEXT:    s_subb_u32 s17, 0, 0
-; GCN-IR-NEXT:    v_cmp_gt_u64_e64 s[18:19], s[16:17], 63
-; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[22:23], s[16:17], 63
-; GCN-IR-NEXT:    s_or_b64 s[18:19], s[10:11], s[18:19]
-; GCN-IR-NEXT:    s_and_b64 s[10:11], s[18:19], exec
-; GCN-IR-NEXT:    s_cselect_b32 s11, 0, s13
-; GCN-IR-NEXT:    s_cselect_b32 s10, 0, s12
-; GCN-IR-NEXT:    s_or_b64 s[18:19], s[18:19], s[22:23]
-; GCN-IR-NEXT:    s_mov_b64 s[8:9], 0
-; GCN-IR-NEXT:    s_andn2_b64 vcc, exec, s[18:19]
-; GCN-IR-NEXT:    s_cbranch_vccz .LBB9_5
-; GCN-IR-NEXT:  ; %bb.1: ; %udiv-bb1
-; GCN-IR-NEXT:    s_add_u32 s18, s16, 1
-; GCN-IR-NEXT:    s_addc_u32 s19, s17, 0
-; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[10:11], s[18:19], 0
-; GCN-IR-NEXT:    s_sub_i32 s16, 63, s16
-; GCN-IR-NEXT:    s_andn2_b64 vcc, exec, s[10:11]
-; GCN-IR-NEXT:    s_lshl_b64 s[10:11], s[12:13], s16
-; GCN-IR-NEXT:    s_cbranch_vccz .LBB9_4
-; GCN-IR-NEXT:  ; %bb.2: ; %udiv-preheader
-; GCN-IR-NEXT:    s_lshr_b64 s[16:17], s[12:13], s18
-; GCN-IR-NEXT:    s_add_u32 s18, s6, -1
-; GCN-IR-NEXT:    s_addc_u32 s19, s7, -1
-; GCN-IR-NEXT:    s_not_b64 s[8:9], s[14:15]
-; GCN-IR-NEXT:    s_add_u32 s12, s8, s20
-; GCN-IR-NEXT:    s_addc_u32 s13, s9, 0
-; GCN-IR-NEXT:    s_mov_b64 s[14:15], 0
-; GCN-IR-NEXT:    s_mov_b32 s9, 0
-; GCN-IR-NEXT:  .LBB9_3: ; %udiv-do-while
-; GCN-IR-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GCN-IR-NEXT:    s_lshl_b64 s[16:17], s[16:17], 1
-; GCN-IR-NEXT:    s_lshr_b32 s8, s11, 31
-; GCN-IR-NEXT:    s_lshl_b64 s[10:11], s[10:11], 1
-; GCN-IR-NEXT:    s_or_b64 s[16:17], s[16:17], s[8:9]
-; GCN-IR-NEXT:    s_or_b64 s[10:11], s[14:15], s[10:11]
-; GCN-IR-NEXT:    s_sub_u32 s8, s18, s16
-; GCN-IR-NEXT:    s_subb_u32 s8, s19, s17
-; GCN-IR-NEXT:    s_ashr_i32 s14, s8, 31
-; GCN-IR-NEXT:    s_mov_b32 s15, s14
-; GCN-IR-NEXT:    s_and_b32 s8, s14, 1
-; GCN-IR-NEXT:    s_and_b64 s[14:15], s[14:15], s[6:7]
-; GCN-IR-NEXT:    s_sub_u32 s16, s16, s14
-; GCN-IR-NEXT:    s_subb_u32 s17, s17, s15
-; GCN-IR-NEXT:    s_add_u32 s12, s12, 1
-; GCN-IR-NEXT:    s_addc_u32 s13, s13, 0
-; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[20:21], s[12:13], 0
-; GCN-IR-NEXT:    s_mov_b64 s[14:15], s[8:9]
-; GCN-IR-NEXT:    s_and_b64 vcc, exec, s[20:21]
-; GCN-IR-NEXT:    s_cbranch_vccz .LBB9_3
-; GCN-IR-NEXT:  .LBB9_4: ; %Flow4
-; GCN-IR-NEXT:    s_lshl_b64 s[6:7], s[10:11], 1
-; GCN-IR-NEXT:    s_or_b64 s[10:11], s[8:9], s[6:7]
-; GCN-IR-NEXT:  .LBB9_5: ; %udiv-end
-; GCN-IR-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x9
-; GCN-IR-NEXT:    s_xor_b64 s[0:1], s[2:3], s[0:1]
-; GCN-IR-NEXT:    s_xor_b64 s[2:3], s[10:11], s[0:1]
-; GCN-IR-NEXT:    s_sub_u32 s0, s2, s0
-; GCN-IR-NEXT:    s_subb_u32 s1, s3, s1
+; GCN-IR:       ; %bb.0:
+; GCN-IR-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-IR-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xd
 ; GCN-IR-NEXT:    s_mov_b32 s7, 0xf000
 ; GCN-IR-NEXT:    s_mov_b32 s6, -1
-; GCN-IR-NEXT:    v_mov_b32_e32 v0, s1
 ; GCN-IR-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-IR-NEXT:    buffer_store_short v0, off, s[4:7], 0 offset:4
-; GCN-IR-NEXT:    s_waitcnt expcnt(0)
-; GCN-IR-NEXT:    v_mov_b32_e32 v0, s0
+; GCN-IR-NEXT:    s_mov_b32 s5, s1
+; GCN-IR-NEXT:    s_sext_i32_i16 s1, s9
+; GCN-IR-NEXT:    v_mov_b32_e32 v0, s8
+; GCN-IR-NEXT:    v_alignbit_b32 v0, s1, v0, 24
+; GCN-IR-NEXT:    v_cvt_f32_i32_e32 v1, v0
+; GCN-IR-NEXT:    s_mov_b32 s4, s0
+; GCN-IR-NEXT:    s_sext_i32_i16 s0, s3
+; GCN-IR-NEXT:    v_mov_b32_e32 v2, s2
+; GCN-IR-NEXT:    v_alignbit_b32 v2, s0, v2, 24
+; GCN-IR-NEXT:    v_cvt_f32_i32_e32 v3, v2
+; GCN-IR-NEXT:    v_rcp_iflag_f32_e32 v4, v1
+; GCN-IR-NEXT:    v_xor_b32_e32 v0, v2, v0
+; GCN-IR-NEXT:    v_ashrrev_i32_e32 v0, 30, v0
+; GCN-IR-NEXT:    v_or_b32_e32 v0, 1, v0
+; GCN-IR-NEXT:    v_mul_f32_e32 v2, v3, v4
+; GCN-IR-NEXT:    v_trunc_f32_e32 v2, v2
+; GCN-IR-NEXT:    v_mad_f32 v3, -v2, v1, v3
+; GCN-IR-NEXT:    v_cvt_i32_f32_e32 v2, v2
+; GCN-IR-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, |v1|
+; GCN-IR-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
+; GCN-IR-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
+; GCN-IR-NEXT:    v_bfe_i32 v0, v0, 0, 24
+; GCN-IR-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
 ; GCN-IR-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GCN-IR-NEXT:    buffer_store_short v1, off, s[4:7], 0 offset:4
 ; GCN-IR-NEXT:    s_endpgm
   %1 = ashr i48 %x, 24
   %2 = ashr i48 %y, 24
diff --git a/llvm/test/CodeGen/AMDGPU/smed3.ll b/llvm/test/CodeGen/AMDGPU/smed3.ll
index e0d0ddc..ddf6297 100644
--- a/llvm/test/CodeGen/AMDGPU/smed3.ll
+++ b/llvm/test/CodeGen/AMDGPU/smed3.ll
@@ -1,6 +1,8 @@
 ; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
 ; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global,-real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11-FAKE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global,+real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11-TRUE16 %s
 
 declare i32 @llvm.amdgcn.workitem.id.x() #0
 
@@ -98,6 +100,8 @@ declare i64 @llvm.smin.i64(i64, i64)
 ; VI: v_max_i16_e32 [[MAX:v[0-9]]], 12, {{v[0-9]}}
 ; VI: v_min_i16_e32 {{v[0-9]}}, 17, [[MAX]]
 ; GFX9: v_med3_i16 v{{[0-9]+}}, v{{[0-9]+}}, 12, 17
+; GFX11-TRUE16: v_med3_i16 v{{[0-9]+}}.l, v{{[0-9]+}}.l, 12, 17
+; GFX11-FAKE16: v_med3_i16 v{{[0-9]+}}, v{{[0-9]+}}, 12, 17
 define amdgpu_kernel void @v_test_smed3_r_i_i_i16(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr i16, ptr addrspace(1) %aptr, i32 %tid
@@ -686,6 +690,8 @@ bb:
 ; VI: v_max_i16
 
 ; GFX9: v_med3_i16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; GFX11-TRUE16: v_med3_i16 v{{[0-9]+}}.l, v{{[0-9]+}}.l, v{{[0-9]+}}.h, v{{[0-9]+}}.l
+; GFX11-FAKE16: v_med3_i16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
 define amdgpu_kernel void @v_test_smed3_i16_pat_0(ptr addrspace(1) %arg, ptr addrspace(1) %out, ptr addrspace(1) %a.ptr) #1 {
 bb:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -707,6 +713,8 @@ bb:
 
 ; GCN-LABEL: {{^}}v_test_smed3_i16_pat_1:
 ; GFX9: v_med3_i16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; GFX11-TRUE16: v_med3_i16 v{{[0-9]+}}.l, v{{[0-9]+}}.l, v{{[0-9]+}}.h, v{{[0-9]+}}.l
+; GFX11-FAKE16: v_med3_i16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
 
 define amdgpu_kernel void @v_test_smed3_i16_pat_1(ptr addrspace(1) %arg, ptr addrspace(1) %out, ptr addrspace(1) %a.ptr) #1 {
 bb:
diff --git a/llvm/test/CodeGen/AMDGPU/srem64.ll b/llvm/test/CodeGen/AMDGPU/srem64.ll
index cb8f82d..23364e8 100644
--- a/llvm/test/CodeGen/AMDGPU/srem64.ll
+++ b/llvm/test/CodeGen/AMDGPU/srem64.ll
@@ -1188,109 +1188,39 @@ define amdgpu_kernel void @s_test_srem24_48(ptr addrspace(1) %out, i48 %x, i48 %
 ; GCN-NEXT:    s_endpgm
 ;
 ; GCN-IR-LABEL: s_test_srem24_48:
-; GCN-IR:       ; %bb.0: ; %_udiv-special-cases
-; GCN-IR-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0xb
-; GCN-IR-NEXT:    s_mov_b32 s13, 0
+; GCN-IR:       ; %bb.0:
+; GCN-IR-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-IR-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
+; GCN-IR-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-IR-NEXT:    s_mov_b32 s6, -1
 ; GCN-IR-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-IR-NEXT:    s_sext_i32_i16 s1, s1
 ; GCN-IR-NEXT:    s_sext_i32_i16 s3, s3
-; GCN-IR-NEXT:    s_ashr_i64 s[0:1], s[0:1], 24
-; GCN-IR-NEXT:    s_ashr_i64 s[2:3], s[2:3], 24
-; GCN-IR-NEXT:    s_lshl_b64 s[0:1], s[0:1], 16
-; GCN-IR-NEXT:    s_lshl_b64 s[6:7], s[2:3], 16
-; GCN-IR-NEXT:    s_ashr_i64 s[2:3], s[0:1], 16
-; GCN-IR-NEXT:    s_ashr_i32 s0, s1, 31
-; GCN-IR-NEXT:    s_mov_b32 s1, s0
-; GCN-IR-NEXT:    s_ashr_i64 s[8:9], s[6:7], 16
-; GCN-IR-NEXT:    s_xor_b64 s[2:3], s[2:3], s[0:1]
-; GCN-IR-NEXT:    s_sub_u32 s2, s2, s0
-; GCN-IR-NEXT:    s_subb_u32 s3, s3, s0
-; GCN-IR-NEXT:    s_ashr_i32 s10, s7, 31
-; GCN-IR-NEXT:    s_mov_b32 s11, s10
-; GCN-IR-NEXT:    s_xor_b64 s[6:7], s[8:9], s[10:11]
-; GCN-IR-NEXT:    s_sub_u32 s6, s6, s10
-; GCN-IR-NEXT:    s_subb_u32 s7, s7, s10
-; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[8:9], s[6:7], 0
-; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[10:11], s[2:3], 0
-; GCN-IR-NEXT:    s_flbit_i32_b64 s12, s[6:7]
-; GCN-IR-NEXT:    s_or_b64 s[10:11], s[8:9], s[10:11]
-; GCN-IR-NEXT:    s_flbit_i32_b64 s20, s[2:3]
-; GCN-IR-NEXT:    s_sub_u32 s14, s12, s20
-; GCN-IR-NEXT:    s_subb_u32 s15, 0, 0
-; GCN-IR-NEXT:    v_cmp_gt_u64_e64 s[16:17], s[14:15], 63
-; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[18:19], s[14:15], 63
-; GCN-IR-NEXT:    s_or_b64 s[16:17], s[10:11], s[16:17]
-; GCN-IR-NEXT:    s_and_b64 s[10:11], s[16:17], exec
-; GCN-IR-NEXT:    s_cselect_b32 s11, 0, s3
-; GCN-IR-NEXT:    s_cselect_b32 s10, 0, s2
-; GCN-IR-NEXT:    s_or_b64 s[16:17], s[16:17], s[18:19]
-; GCN-IR-NEXT:    s_mov_b64 s[8:9], 0
-; GCN-IR-NEXT:    s_andn2_b64 vcc, exec, s[16:17]
-; GCN-IR-NEXT:    s_cbranch_vccz .LBB9_5
-; GCN-IR-NEXT:  ; %bb.1: ; %udiv-bb1
-; GCN-IR-NEXT:    s_add_u32 s16, s14, 1
-; GCN-IR-NEXT:    s_addc_u32 s17, s15, 0
-; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[10:11], s[16:17], 0
-; GCN-IR-NEXT:    s_sub_i32 s14, 63, s14
-; GCN-IR-NEXT:    s_andn2_b64 vcc, exec, s[10:11]
-; GCN-IR-NEXT:    s_lshl_b64 s[10:11], s[2:3], s14
-; GCN-IR-NEXT:    s_cbranch_vccz .LBB9_4
-; GCN-IR-NEXT:  ; %bb.2: ; %udiv-preheader
-; GCN-IR-NEXT:    s_lshr_b64 s[14:15], s[2:3], s16
-; GCN-IR-NEXT:    s_add_u32 s18, s6, -1
-; GCN-IR-NEXT:    s_addc_u32 s19, s7, -1
-; GCN-IR-NEXT:    s_not_b64 s[8:9], s[12:13]
-; GCN-IR-NEXT:    s_add_u32 s12, s8, s20
-; GCN-IR-NEXT:    s_addc_u32 s13, s9, 0
-; GCN-IR-NEXT:    s_mov_b64 s[16:17], 0
-; GCN-IR-NEXT:    s_mov_b32 s9, 0
-; GCN-IR-NEXT:  .LBB9_3: ; %udiv-do-while
-; GCN-IR-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GCN-IR-NEXT:    s_lshl_b64 s[14:15], s[14:15], 1
-; GCN-IR-NEXT:    s_lshr_b32 s8, s11, 31
-; GCN-IR-NEXT:    s_lshl_b64 s[10:11], s[10:11], 1
-; GCN-IR-NEXT:    s_or_b64 s[14:15], s[14:15], s[8:9]
-; GCN-IR-NEXT:    s_or_b64 s[10:11], s[16:17], s[10:11]
-; GCN-IR-NEXT:    s_sub_u32 s8, s18, s14
-; GCN-IR-NEXT:    s_subb_u32 s8, s19, s15
-; GCN-IR-NEXT:    s_ashr_i32 s16, s8, 31
-; GCN-IR-NEXT:    s_mov_b32 s17, s16
-; GCN-IR-NEXT:    s_and_b32 s8, s16, 1
-; GCN-IR-NEXT:    s_and_b64 s[16:17], s[16:17], s[6:7]
-; GCN-IR-NEXT:    s_sub_u32 s14, s14, s16
-; GCN-IR-NEXT:    s_subb_u32 s15, s15, s17
-; GCN-IR-NEXT:    s_add_u32 s12, s12, 1
-; GCN-IR-NEXT:    s_addc_u32 s13, s13, 0
-; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[20:21], s[12:13], 0
-; GCN-IR-NEXT:    s_mov_b64 s[16:17], s[8:9]
-; GCN-IR-NEXT:    s_and_b64 vcc, exec, s[20:21]
-; GCN-IR-NEXT:    s_cbranch_vccz .LBB9_3
-; GCN-IR-NEXT:  .LBB9_4: ; %Flow4
-; GCN-IR-NEXT:    s_lshl_b64 s[10:11], s[10:11], 1
-; GCN-IR-NEXT:    s_or_b64 s[10:11], s[8:9], s[10:11]
-; GCN-IR-NEXT:  .LBB9_5: ; %udiv-end
-; GCN-IR-NEXT:    v_mov_b32_e32 v0, s10
-; GCN-IR-NEXT:    v_mul_hi_u32 v0, s6, v0
-; GCN-IR-NEXT:    s_load_dwordx2 s[12:13], s[4:5], 0x9
-; GCN-IR-NEXT:    s_mul_i32 s4, s6, s11
-; GCN-IR-NEXT:    v_mov_b32_e32 v2, s3
-; GCN-IR-NEXT:    v_add_i32_e32 v0, vcc, s4, v0
-; GCN-IR-NEXT:    s_mul_i32 s4, s7, s10
-; GCN-IR-NEXT:    v_add_i32_e32 v0, vcc, s4, v0
-; GCN-IR-NEXT:    s_mul_i32 s4, s6, s10
-; GCN-IR-NEXT:    v_mov_b32_e32 v1, s4
-; GCN-IR-NEXT:    v_sub_i32_e32 v1, vcc, s2, v1
-; GCN-IR-NEXT:    v_subb_u32_e32 v0, vcc, v2, v0, vcc
-; GCN-IR-NEXT:    v_xor_b32_e32 v1, s0, v1
-; GCN-IR-NEXT:    v_xor_b32_e32 v0, s1, v0
-; GCN-IR-NEXT:    v_mov_b32_e32 v2, s1
-; GCN-IR-NEXT:    v_subrev_i32_e32 v1, vcc, s0, v1
-; GCN-IR-NEXT:    s_mov_b32 s15, 0xf000
-; GCN-IR-NEXT:    s_mov_b32 s14, -1
-; GCN-IR-NEXT:    v_subb_u32_e32 v0, vcc, v0, v2, vcc
-; GCN-IR-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-IR-NEXT:    buffer_store_short v0, off, s[12:15], 0 offset:4
-; GCN-IR-NEXT:    buffer_store_dword v1, off, s[12:15], 0
+; GCN-IR-NEXT:    s_sext_i32_i16 s5, s5
+; GCN-IR-NEXT:    v_mov_b32_e32 v0, s4
+; GCN-IR-NEXT:    v_alignbit_b32 v0, s5, v0, 24
+; GCN-IR-NEXT:    v_cvt_f32_i32_e32 v1, v0
+; GCN-IR-NEXT:    v_mov_b32_e32 v2, s2
+; GCN-IR-NEXT:    v_alignbit_b32 v2, s3, v2, 24
+; GCN-IR-NEXT:    v_cvt_f32_i32_e32 v3, v2
+; GCN-IR-NEXT:    v_rcp_iflag_f32_e32 v4, v1
+; GCN-IR-NEXT:    v_xor_b32_e32 v5, v2, v0
+; GCN-IR-NEXT:    v_ashrrev_i32_e32 v5, 30, v5
+; GCN-IR-NEXT:    v_or_b32_e32 v5, 1, v5
+; GCN-IR-NEXT:    v_mul_f32_e32 v4, v3, v4
+; GCN-IR-NEXT:    v_trunc_f32_e32 v4, v4
+; GCN-IR-NEXT:    v_mad_f32 v3, -v4, v1, v3
+; GCN-IR-NEXT:    v_cvt_i32_f32_e32 v4, v4
+; GCN-IR-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, |v1|
+; GCN-IR-NEXT:    v_cndmask_b32_e32 v1, 0, v5, vcc
+; GCN-IR-NEXT:    s_mov_b32 s4, s0
+; GCN-IR-NEXT:    v_add_i32_e32 v1, vcc, v1, v4
+; GCN-IR-NEXT:    v_mul_lo_u32 v0, v1, v0
+; GCN-IR-NEXT:    s_mov_b32 s5, s1
+; GCN-IR-NEXT:    v_subrev_i32_e32 v0, vcc, v0, v2
+; GCN-IR-NEXT:    v_bfe_i32 v0, v0, 0, 24
+; GCN-IR-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
+; GCN-IR-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GCN-IR-NEXT:    buffer_store_short v1, off, s[4:7], 0 offset:4
 ; GCN-IR-NEXT:    s_endpgm
   %1 = ashr i48 %x, 24
   %2 = ashr i48 %y, 24
diff --git a/llvm/test/CodeGen/AMDGPU/sub64-low-32-bits-known-zero.ll b/llvm/test/CodeGen/AMDGPU/sub64-low-32-bits-known-zero.ll
new file mode 100644
index 0000000..f52f116
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/sub64-low-32-bits-known-zero.ll
@@ -0,0 +1,193 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s
+
+; Reduce a 64-bit sub by a constant if we know the low 32-bits are all
+; zero.
+
+; sub i64:x, K if computeTrailingZeros(K) >= 32
+; => build_pair (sub x.hi, K.hi), x.lo
+
+define amdgpu_ps i64 @s_sub_i64_const_low_bits_known0_0(i64 inreg %reg) {
+; GFX9-LABEL: s_sub_i64_const_low_bits_known0_0:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_add_i32 s1, s1, 0xfffc0000
+; GFX9-NEXT:    ; return to shader part epilog
+  %sub = sub i64 %reg, 1125899906842624 ; (1 << 50)
+  ret i64 %sub
+}
+
+define amdgpu_ps i64 @s_sub_i64_const_low_bits_known0_1(i64 inreg %reg) {
+; GFX9-LABEL: s_sub_i64_const_low_bits_known0_1:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_add_i32 s1, s1, -1
+; GFX9-NEXT:    ; return to shader part epilog
+  %sub = sub i64 %reg, 4294967296 ; (1 << 32)
+  ret i64 %sub
+}
+
+define amdgpu_ps i64 @s_sub_i64_const_low_bits_known0_2(i64 inreg %reg) {
+; GFX9-LABEL: s_sub_i64_const_low_bits_known0_2:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_add_i32 s1, s1, -2
+; GFX9-NEXT:    ; return to shader part epilog
+  %sub = sub i64 %reg, 8589934592 ; (1 << 33)
+  ret i64 %sub
+}
+
+define amdgpu_ps i64 @s_sub_i64_const_low_bits_known0_3(i64 inreg %reg) {
+; GFX9-LABEL: s_sub_i64_const_low_bits_known0_3:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_add_i32 s1, s1, 0x80000000
+; GFX9-NEXT:    ; return to shader part epilog
+  %sub = sub i64 %reg, -9223372036854775808 ; (1 << 63)
+  ret i64 %sub
+}
+
+define amdgpu_ps i64 @s_sub_i64_const_low_bits_known0_4(i64 inreg %reg) {
+; GFX9-LABEL: s_sub_i64_const_low_bits_known0_4:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_add_i32 s1, s1, 1
+; GFX9-NEXT:    ; return to shader part epilog
+  %sub = sub i64 %reg, -4294967296 ; 0xffffffff00000000
+  ret i64 %sub
+}
+
+define i64 @v_sub_i64_const_low_bits_known0_0(i64 %reg) {
+; GFX9-LABEL: v_sub_i64_const_low_bits_known0_0:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_add_u32_e32 v1, 0xfffc0000, v1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+  %sub = sub i64 %reg, 1125899906842624 ; (1 << 50)
+  ret i64 %sub
+}
+
+define i64 @v_sub_i64_const_low_bits_known0_1(i64 %reg) {
+; GFX9-LABEL: v_sub_i64_const_low_bits_known0_1:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_add_u32_e32 v1, -1, v1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+  %sub = sub i64 %reg, 4294967296 ; (1 << 32)
+  ret i64 %sub
+}
+
+define i64 @v_sub_i64_const_low_bits_known0_2(i64 %reg) {
+; GFX9-LABEL: v_sub_i64_const_low_bits_known0_2:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_add_u32_e32 v1, -2, v1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+  %sub = sub i64 %reg, 8589934592 ; (1 << 33)
+  ret i64 %sub
+}
+
+define i64 @v_sub_i64_const_low_bits_known0_3(i64 %reg) {
+; GFX9-LABEL: v_sub_i64_const_low_bits_known0_3:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_add_u32_e32 v1, 0x80000000, v1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+  %sub = sub i64 %reg, -9223372036854775808 ; (1 << 63)
+  ret i64 %sub
+}
+
+define i64 @v_sub_i64_const_low_bits_known0_4(i64 %reg) {
+; GFX9-LABEL: v_sub_i64_const_low_bits_known0_4:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_add_u32_e32 v1, 1, v1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+  %sub = sub i64 %reg, -4294967296 ; 0xffffffff00000000
+  ret i64 %sub
+}
+
+define amdgpu_ps i64 @s_sub_i64_const_high_bits_known0_0(i64 inreg %reg) {
+; GFX9-LABEL: s_sub_i64_const_high_bits_known0_0:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_add_u32 s0, s0, 1
+; GFX9-NEXT:    s_addc_u32 s1, s1, -1
+; GFX9-NEXT:    ; return to shader part epilog
+  %sub = sub i64 %reg, 4294967295 ; (1 << 31)
+  ret i64 %sub
+}
+
+define i64 @v_sub_i64_const_high_bits_known0_0(i64 %reg) {
+; GFX9-LABEL: v_sub_i64_const_high_bits_known0_0:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 1, v0
+; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+  %sub = sub i64 %reg, 4294967295 ; (1 << 31)
+  ret i64 %sub
+}
+
+define <2 x i64> @v_sub_v2i64_splat_const_low_bits_known0_0(<2 x i64> %reg) {
+; GFX9-LABEL: v_sub_v2i64_splat_const_low_bits_known0_0:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_add_u32_e32 v1, -1, v1
+; GFX9-NEXT:    v_add_u32_e32 v3, -1, v3
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+  %sub = sub <2 x i64> %reg, <i64 4294967296, i64 4294967296> ; (1 << 32)
+  ret <2 x i64> %sub
+}
+
+define <2 x i64> @v_sub_v2i64_nonsplat_const_low_bits_known0_0(<2 x i64> %reg) {
+; GFX9-LABEL: v_sub_v2i64_nonsplat_const_low_bits_known0_0:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_add_u32_e32 v1, -1, v1
+; GFX9-NEXT:    v_add_u32_e32 v3, -2, v3
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+  %sub = sub <2 x i64> %reg, <i64 4294967296, i64 8589934592> ; (1 << 32), (1 << 33)
+  ret <2 x i64> %sub
+}
+
+define amdgpu_ps <2 x i64> @s_sub_v2i64_splat_const_low_bits_known0_0(<2 x i64> inreg %reg) {
+; GFX9-LABEL: s_sub_v2i64_splat_const_low_bits_known0_0:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_add_i32 s1, s1, -1
+; GFX9-NEXT:    s_add_i32 s3, s3, -1
+; GFX9-NEXT:    ; return to shader part epilog
+  %sub = sub <2 x i64> %reg, <i64 4294967296, i64 4294967296> ; (1 << 32)
+  ret <2 x i64> %sub
+}
+
+define amdgpu_ps <2 x i64> @s_sub_v2i64_nonsplat_const_low_bits_known0_0(<2 x i64> inreg %reg) {
+; GFX9-LABEL: s_sub_v2i64_nonsplat_const_low_bits_known0_0:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_add_i32 s1, s1, -1
+; GFX9-NEXT:    s_add_i32 s3, s3, -2
+; GFX9-NEXT:    ; return to shader part epilog
+  %sub = sub <2 x i64> %reg, <i64 4294967296, i64 8589934592> ; (1 << 32), (1 << 33)
+  ret <2 x i64> %sub
+}
+
+; We could reduce this to use a 32-bit sub if we use computeKnownBits
+define i64 @v_sub_i64_variable_high_bits_known0_0(i64 %reg, i32 %offset.hi32) {
+; GFX9-LABEL: v_sub_i64_variable_high_bits_known0_0:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_subrev_co_u32_e32 v0, vcc, 0, v0
+; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v2, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+  %zext.offset.hi32 = zext i32 %offset.hi32 to i64
+  %in.high.bits = shl i64 %zext.offset.hi32, 32
+  %sub = sub i64 %reg, %in.high.bits
+  ret i64 %sub
+}
+
+; We could reduce this to use a 32-bit sub if we use computeKnownBits
+define amdgpu_ps i64 @s_sub_i64_variable_high_bits_known0_0(i64 inreg %reg, i32 inreg %offset.hi32) {
+; GFX9-LABEL: s_sub_i64_variable_high_bits_known0_0:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_sub_u32 s0, s0, 0
+; GFX9-NEXT:    s_subb_u32 s1, s1, s2
+; GFX9-NEXT:    ; return to shader part epilog
+  %zext.offset.hi32 = zext i32 %offset.hi32 to i64
+  %in.high.bits = shl i64 %zext.offset.hi32, 32
+  %sub = sub i64 %reg, %in.high.bits
+  ret i64 %sub
+}
diff --git a/llvm/test/CodeGen/AMDGPU/swdev502267-use-after-free-last-chance-recoloring-alloc-succeeds.mir b/llvm/test/CodeGen/AMDGPU/swdev502267-use-after-free-last-chance-recoloring-alloc-succeeds.mir
new file mode 100644
index 0000000..8315708
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/swdev502267-use-after-free-last-chance-recoloring-alloc-succeeds.mir
@@ -0,0 +1,94 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -stress-regalloc=4 -verify-regalloc -start-before=greedy,2 -stop-after=virtregrewriter,2 -o - %s | FileCheck %s
+
+# This testcase hit a situation where greedy would hit a use after
+# free during last chance recoloring. This case successfully allocates
+# after, but is extremely sensitive to the exact allocation ordering.
+
+---
+name:            swdev502267_use_after_free_last_chance_recoloring_alloc_succeeds
+tracksRegLiveness: true
+stack:
+  - { id: 0, size: 16 }
+machineFunctionInfo:
+  scratchRSrcReg:  '$sgpr0_sgpr1_sgpr2_sgpr3'
+  frameOffsetReg:  '$sgpr33'
+  stackPtrOffsetReg: '$sgpr32'
+body:             |
+  ; CHECK-LABEL: name: swdev502267_use_after_free_last_chance_recoloring_alloc_succeeds
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   successors: %bb.2(0x40000000), %bb.1(0x40000000)
+  ; CHECK-NEXT:   liveins: $vgpr0_vgpr1_vgpr2_vgpr3
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   renamable $vgpr4_vgpr5 = IMPLICIT_DEF
+  ; CHECK-NEXT:   renamable $vgpr6_vgpr7_vgpr8_vgpr9 = GLOBAL_LOAD_DWORDX4 killed renamable $vgpr4_vgpr5, 0, 0, implicit $exec :: (volatile load (s128), addrspace 1)
+  ; CHECK-NEXT:   renamable $vgpr4 = V_FMA_F32_e64 0, $vgpr6, 0, $vgpr6, 0, $vgpr3, 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   renamable $vgpr5 = V_FMA_F32_e64 0, $vgpr7, 0, $vgpr7, 0, $vgpr2, 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   SI_SPILL_AV128_SAVE $vgpr6_vgpr7_vgpr8_vgpr9, %stack.2, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.2, align 4, addrspace 5)
+  ; CHECK-NEXT:   renamable $vgpr6 = V_FMA_F32_e64 0, killed $vgpr8, 0, $vgpr8, 0, $vgpr1, 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   renamable $vgpr7 = IMPLICIT_DEF
+  ; CHECK-NEXT:   S_CBRANCH_EXECZ %bb.2, implicit $exec
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   successors: %bb.2(0x80000000)
+  ; CHECK-NEXT:   liveins: $vgpr0_vgpr1_vgpr2_vgpr3:0x00000000000000FF, $vgpr4_vgpr5_vgpr6_vgpr7:0x00000000000000FF
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   SI_SPILL_AV128_SAVE killed $vgpr0_vgpr1_vgpr2_vgpr3, %stack.1, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.1, align 4, addrspace 5)
+  ; CHECK-NEXT:   renamable $vgpr0_vgpr1_vgpr2_vgpr3 = SI_SPILL_V128_RESTORE %stack.2, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.2, align 4, addrspace 5)
+  ; CHECK-NEXT:   renamable $vgpr0 = V_FMA_F32_e64 0, $vgpr6, 0, $vgpr6, 0, killed $vgpr2, 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   SI_SPILL_V128_SAVE $vgpr4_vgpr5_vgpr6_vgpr7, %stack.4, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.4, align 4, addrspace 5)
+  ; CHECK-NEXT:   renamable $vgpr0 = V_TRUNC_F32_e32 killed $vgpr0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   SI_SPILL_V32_SAVE killed $vgpr0, %stack.3, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.3, addrspace 5)
+  ; CHECK-NEXT:   renamable $vgpr0 = IMPLICIT_DEF
+  ; CHECK-NEXT:   renamable $vgpr2_vgpr3_vgpr4_vgpr5 = SI_SPILL_V128_RESTORE %stack.2, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.2, align 4, addrspace 5)
+  ; CHECK-NEXT:   renamable $vgpr5 = nofpexcept V_DIV_FIXUP_F32_e64 0, killed $vgpr0, 0, killed $vgpr7, 0, killed $vgpr5, 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   renamable $vgpr0 = SI_SPILL_V32_RESTORE %stack.3, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.3, addrspace 5)
+  ; CHECK-NEXT:   renamable $vgpr9 = COPY killed renamable $vgpr5
+  ; CHECK-NEXT:   renamable $vgpr4_vgpr5_vgpr6_vgpr7 = SI_SPILL_V128_RESTORE %stack.4, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.4, align 4, addrspace 5)
+  ; CHECK-NEXT:   renamable $vgpr2_vgpr3_vgpr4_vgpr5 = SI_SPILL_V128_RESTORE %stack.2, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.2, align 4, addrspace 5)
+  ; CHECK-NEXT:   renamable $vgpr8 = nofpexcept V_FMA_F32_e64 1, killed $vgpr0, 0, killed $vgpr6, 0, killed $vgpr4, 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   renamable $vgpr2_vgpr3 = COPY killed renamable $vgpr8_vgpr9
+  ; CHECK-NEXT:   renamable $vgpr0 = IMPLICIT_DEF
+  ; CHECK-NEXT:   renamable $vgpr4_vgpr5_vgpr6_vgpr7 = SI_SPILL_V128_RESTORE %stack.4, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.4, align 4, addrspace 5)
+  ; CHECK-NEXT:   renamable $vgpr6_vgpr7_vgpr8_vgpr9 = SI_SPILL_V128_RESTORE %stack.2, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.2, align 4, addrspace 5)
+  ; CHECK-NEXT:   renamable $vgpr0 = nofpexcept V_DIV_FIXUP_F32_e64 0, killed $vgpr0, 0, killed $vgpr4, 0, killed $vgpr6, 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   renamable $vgpr4_vgpr5_vgpr6_vgpr7 = SI_SPILL_V128_RESTORE %stack.4, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.4, align 4, addrspace 5)
+  ; CHECK-NEXT:   renamable $vgpr6_vgpr7_vgpr8_vgpr9 = SI_SPILL_V128_RESTORE %stack.2, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.2, align 4, addrspace 5)
+  ; CHECK-NEXT:   dead renamable $vgpr1 = V_FMA_F32_e64 0, killed $vgpr5, 0, $vgpr5, 0, killed $vgpr7, 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   dead renamable $vgpr4_vgpr5_vgpr6_vgpr7 = SCRATCH_LOAD_DWORDX4_SADDR %stack.0, 0, 0, implicit $exec, implicit $flat_scr :: (volatile load (s128), addrspace 5)
+  ; CHECK-NEXT:   renamable $vgpr4_vgpr5 = IMPLICIT_DEF
+  ; CHECK-NEXT:   GLOBAL_STORE_DWORDX4 killed renamable $vgpr4_vgpr5, renamable $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec :: (volatile store (s128), addrspace 1)
+  ; CHECK-NEXT:   renamable $vgpr0_vgpr1_vgpr2_vgpr3 = SI_SPILL_AV128_RESTORE %stack.1, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.1, align 4, addrspace 5)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.2:
+  ; CHECK-NEXT:   liveins: $vgpr0_vgpr1_vgpr2_vgpr3:0x00000000000000FF
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   SI_RETURN implicit $vgpr0_vgpr1_vgpr2_vgpr3
+  bb.0:
+    liveins: $vgpr0_vgpr1_vgpr2_vgpr3
+
+    %0:vreg_128_align2 = COPY $vgpr0_vgpr1_vgpr2_vgpr3
+    %1:vreg_64_align2 = IMPLICIT_DEF
+    %2:vgpr_32 = IMPLICIT_DEF
+    %3:vreg_128_align2 = GLOBAL_LOAD_DWORDX4 %1, 0, 0, implicit $exec :: (volatile load (s128), addrspace 1)
+    undef %4.sub0:vreg_128_align2 = V_FMA_F32_e64 0, %3.sub0, 0, %3.sub0, 0, %0.sub3, 0, 0, implicit $mode, implicit $exec
+    %4.sub1:vreg_128_align2 = V_FMA_F32_e64 0, %3.sub1, 0, %3.sub1, 0, %0.sub2, 0, 0, implicit $mode, implicit $exec
+    %4.sub2:vreg_128_align2 = V_FMA_F32_e64 0, %3.sub2, 0, %3.sub2, 0, %0.sub1, 0, 0, implicit $mode, implicit $exec
+    %4.sub3:vreg_128_align2 = IMPLICIT_DEF
+    S_CBRANCH_EXECZ %bb.2, implicit $exec
+
+  bb.1:
+    %5:vgpr_32 = V_FMA_F32_e64 0, %4.sub2, 0, %4.sub2, 0, %3.sub2, 0, 0, implicit $mode, implicit $exec
+    %6:vgpr_32 = V_TRUNC_F32_e32 %5, implicit $mode, implicit $exec
+    undef %7.sub3:vreg_128_align2 = nofpexcept V_DIV_FIXUP_F32_e64 0, %2, 0, %4.sub3, 0, %3.sub3, 0, 0, implicit $mode, implicit $exec
+    %7.sub2:vreg_128_align2 = nofpexcept V_FMA_F32_e64 1, %6, 0, %4.sub2, 0, %3.sub2, 0, 0, implicit $mode, implicit $exec
+    %7.sub0:vreg_128_align2 = nofpexcept V_DIV_FIXUP_F32_e64 0, %2, 0, %4.sub0, 0, %3.sub0, 0, 0, implicit $mode, implicit $exec
+    %8:vgpr_32 = V_FMA_F32_e64 0, %4.sub1, 0, %4.sub1, 0, %3.sub1, 0, 0, implicit $mode, implicit $exec
+    %9:vreg_128_align2 = SCRATCH_LOAD_DWORDX4_SADDR %stack.0, 0, 0, implicit $exec, implicit $flat_scr :: (volatile load (s128), addrspace 5)
+    GLOBAL_STORE_DWORDX4 %1, %7, 0, 0, implicit $exec :: (volatile store (s128), addrspace 1)
+
+  bb.2:
+    $vgpr0_vgpr1_vgpr2_vgpr3 = COPY %0
+    SI_RETURN implicit $vgpr0_vgpr1_vgpr2_vgpr3
+
+...
diff --git a/llvm/test/CodeGen/AMDGPU/swdev503538-move-to-valu-stack-srd-physreg.ll b/llvm/test/CodeGen/AMDGPU/swdev503538-move-to-valu-stack-srd-physreg.ll
new file mode 100644
index 0000000..f0b3d33
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/swdev503538-move-to-valu-stack-srd-physreg.ll
@@ -0,0 +1,23 @@
+; RUN: not llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -verify-machineinstrs=0 -O0 2> %t.err < %s | FileCheck %s
+; RUN: FileCheck -check-prefix=ERR %s < %t.err
+
+; FIXME: This error will be fixed by supporting arbitrary divergent
+; dynamic allocas by performing a wave umax of the size.
+
+; ERR: error: <unknown>:0:0: in function move_to_valu_assert_srd_is_physreg_swdev503538 i32 (ptr addrspace(1)): illegal VGPR to SGPR copy
+
+; CHECK: ; illegal copy v0 to s32
+
+define i32 @move_to_valu_assert_srd_is_physreg_swdev503538(ptr addrspace(1) %ptr) {
+entry:
+  %idx = load i32, ptr addrspace(1) %ptr, align 4
+  %zero = extractelement <4 x i32> zeroinitializer, i32 %idx
+  %alloca = alloca [2048 x i8], i32 %zero, align 8, addrspace(5)
+  %ld = load i32, ptr addrspace(5) %alloca, align 8
+  call void @llvm.memset.p5.i32(ptr addrspace(5) %alloca, i8 0, i32 2048, i1 false)
+  ret i32 %ld
+}
+
+declare void @llvm.memset.p5.i32(ptr addrspace(5) nocapture writeonly, i8, i32, i1 immarg) #0
+
+attributes #0 = { nocallback nofree nounwind willreturn memory(argmem: write) }
diff --git a/llvm/test/CodeGen/AMDGPU/umed3.ll b/llvm/test/CodeGen/AMDGPU/umed3.ll
index 557d023..4726e81 100644
--- a/llvm/test/CodeGen/AMDGPU/umed3.ll
+++ b/llvm/test/CodeGen/AMDGPU/umed3.ll
@@ -1,6 +1,8 @@
 ; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN  -check-prefix=SI %s
 ; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global,-real-true16 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX11-FAKE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global,+real-true16 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX11-TRUE16 %s
 
 declare i32 @llvm.amdgcn.workitem.id.x() #0
 
@@ -84,6 +86,8 @@ define amdgpu_kernel void @v_test_umed3_r_i_i_i64(ptr addrspace(1) %out, ptr add
 ; VI: v_max_u16_e32 [[MAX:v[0-9]]], 12, {{v[0-9]}}
 ; VI: v_min_u16_e32 {{v[0-9]}}, 17, [[MAX]]
 ; GFX9: v_med3_u16 v{{[0-9]+}}, v{{[0-9]+}}, 12, 17
+; GFX11-TRUE16: v_med3_u16 v{{[0-9]+}}.l, v{{[0-9]+}}.l, 12, 17
+; GFX11-FAKE16: v_med3_u16 v{{[0-9]+}}, v{{[0-9]+}}, 12, 17
 define amdgpu_kernel void @v_test_umed3_r_i_i_i16(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr i16, ptr addrspace(1) %aptr, i32 %tid
@@ -707,6 +711,8 @@ bb:
 ; VI: v_max_u16
 
 ; GFX9: v_med3_u16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; GFX11-TRUE16: v_med3_u16 v{{[0-9]+}}.l, v{{[0-9]+}}.l, v{{[0-9]+}}.h, v{{[0-9]+}}.l
+; GFX11-FAKE16: v_med3_u16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
 define amdgpu_kernel void @v_test_umed3_i16_pat_0(ptr addrspace(1) %arg, ptr addrspace(1) %out, ptr addrspace(1) %a.ptr) #1 {
 bb:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -728,6 +734,8 @@ bb:
 
 ; GCN-LABEL: {{^}}v_test_umed3_i16_pat_1:
 ; GFX9: v_med3_u16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; GFX11-TRUE16: v_med3_u16 v{{[0-9]+}}.l, v{{[0-9]+}}.l, v{{[0-9]+}}.h, v{{[0-9]+}}.l
+; GFX11-FAKE16: v_med3_u16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
 define amdgpu_kernel void @v_test_umed3_i16_pat_1(ptr addrspace(1) %arg, ptr addrspace(1) %out, ptr addrspace(1) %a.ptr) #1 {
 bb:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
diff --git a/llvm/test/CodeGen/AMDGPU/waitcnt-meta-instructions.mir b/llvm/test/CodeGen/AMDGPU/waitcnt-meta-instructions.mir
index ad4ad6d..b663acb 100644
--- a/llvm/test/CodeGen/AMDGPU/waitcnt-meta-instructions.mir
+++ b/llvm/test/CodeGen/AMDGPU/waitcnt-meta-instructions.mir
@@ -67,7 +67,7 @@ body:             |
     ; GCN-NEXT: {{  $}}
     ; GCN-NEXT: S_WAITCNT 0
     ; GCN-NEXT: $vgpr0 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec
-    ; GCN-NEXT: CFI_INSTRUCTION offset $vgpr0_lo16, 16
+    ; GCN-NEXT: CFI_INSTRUCTION offset $vgpr0, 16
     $vgpr0 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec
     CFI_INSTRUCTION offset $vgpr0, 16
 
diff --git a/llvm/test/CodeGen/ARM/sink-store-pre-load-dependency.mir b/llvm/test/CodeGen/ARM/sink-store-pre-load-dependency.mir
new file mode 100644
index 0000000..92c983e
--- /dev/null
+++ b/llvm/test/CodeGen/ARM/sink-store-pre-load-dependency.mir
@@ -0,0 +1,41 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -o - %s -mtriple=armv7-- -run-pass=machine-sink | FileCheck %s
+
+name: sink-store-load-dep
+tracksRegLiveness: true
+stack:
+  - { id: 0, type: default, size: 8, alignment: 8 }
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: sink-store-load-dep
+    ; CHECK:       bb.0:
+    ; CHECK:         [[LDRi12_:%[0-9]+]]:gpr = LDRi12 %stack.0, 0, 14 /* CC::al */, $noreg :: (load (s32))
+    ; CHECK-NEXT:    [[MOVi:%[0-9]+]]:gpr = MOVi 55296, 14 /* CC::al */, $noreg, $noreg
+    ; CHECK-NEXT:    [[ADDri1:%[0-9]+]]:gpr = ADDri [[LDRi12_:%[0-9]+]], 0, 14 /* CC::al */, $noreg, $noreg
+    ; CHECK-NEXT:    [[LDRH:%[0-9]+]]:gpr = LDRH killed [[ADDri1:%[0-9]+]], $noreg, 0, 14 /* CC::al */, $noreg :: (load (s16))
+    ; CHECK-NEXT:    [[MOVi1:%[0-9]+]]:gpr = MOVi 0, 14 /* CC::al */, $noreg, $noreg
+    ; CHECK-NEXT:    early-clobber %5:gpr = STRH_PRE [[MOVi:%[0-9]+]], [[LDRi12_:%[0-9]+]], [[MOVi1:%[0-9]+]], 0, 14 /* CC::al */, $noreg
+    ; CHECK-NEXT:    [[SUBri:%.*]]:gpr = SUBri killed [[LDRi12_:%[0-9]+]], 0, 14 /* CC::al */, $noreg, $noreg
+    ; CHECK:       bb.2:
+    ; CHECK-NEXT:    [[MOVi2:%[0-9]+]]:gpr = MOVi [[LDRH:%[0-9]+]], 14 /* CC::al */, $noreg, $noreg
+    %0:gpr = LDRi12 %stack.0, 0, 14, $noreg :: (load (s32))
+    %1:gpr = MOVi 55296, 14, $noreg, $noreg
+    %2:gpr = ADDri %0:gpr, 0, 14, $noreg, $noreg
+    %3:gpr = LDRH killed %2:gpr, $noreg, 0, 14, $noreg :: (load (s16))
+    %4:gpr = MOVi 0, 14, $noreg, $noreg
+    early-clobber %5:gpr = STRH_PRE %1:gpr, %0:gpr, %4:gpr, 0, 14, $noreg
+    %6:gpr = SUBri killed %0:gpr, 0, 14, $noreg, $noreg
+    CMPri %6:gpr, 0, 14, $noreg, implicit-def $cpsr
+    Bcc %bb.2, 3, $cpsr
+    B %bb.1
+
+  bb.1:
+    %8:gpr = MOVi 0, 14, $noreg, $noreg
+    $r0 = COPY %8:gpr
+    BX_RET 14, $noreg, implicit $r0
+
+  bb.2:
+    %9:gpr = MOVi %3:gpr, 14, $noreg, $noreg
+    $r0 = COPY %9:gpr
+    BX_RET 14, $noreg, implicit $r0
+...
diff --git a/llvm/test/CodeGen/AVR/branch-relaxation-long-backward.ll b/llvm/test/CodeGen/AVR/branch-relaxation-long-backward.ll
new file mode 100644
index 0000000..7c915e1
--- /dev/null
+++ b/llvm/test/CodeGen/AVR/branch-relaxation-long-backward.ll
@@ -0,0 +1,2081 @@
+; RUN: llc < %s -mtriple=avr -mcpu=attiny85 -filetype=obj -o - | llvm-objdump --mcpu=attiny85 -dr --no-show-raw-insn --no-leading-addr - | FileCheck --check-prefix=ATTINY85 %s
+; RUN: not llc < %s -mtriple=avr -mcpu=avr25 -filetype=obj -o - 2>&1 | FileCheck --check-prefix=AVR25 %s
+; RUN: llc < %s -mtriple=avr -mcpu=avr3 -filetype=obj -o - | llvm-objdump --mcpu=avr3 -dr --no-show-raw-insn --no-leading-addr - | FileCheck --check-prefix=AVR3 %s
+
+; ATTINY85: <main>:
+; ATTINY85-NEXT: andi r24, 0x1
+; ATTINY85: cpi r24, 0x0
+; ATTINY85-NEXT: breq .+2
+; ATTINY85-NEXT: rjmp .+4086
+; ATTINY85: ldi r24, 0x3
+; ATTINY85-NEXT: ret
+
+; AVR25: error: out of range branch target (expected an integer in the range -4096 to 4095)
+
+; AVR3: <main>:
+; AVR3-NEXT: andi r24, 0x1
+; AVR3: cpi r24, 0x0
+; AVR3-NEXT: breq .+4
+; AVR3-NEXT: jmp 0x0
+; AVR3-NEXT: R_AVR_CALL .text+0x2
+; AVR3: ldi r24, 0x3
+; AVR3-NEXT: ret
+
+define i8 @main(i1 %a) {
+entry-block:
+  br label %hello
+hello:
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  br i1 %a, label %hello, label %finished
+finished:
+  ret i8 3
+}
diff --git a/llvm/test/CodeGen/AVR/branch-relaxation-long-forward.ll b/llvm/test/CodeGen/AVR/branch-relaxation-long-forward.ll
new file mode 100644
index 0000000..24ddb36
--- /dev/null
+++ b/llvm/test/CodeGen/AVR/branch-relaxation-long-forward.ll
@@ -0,0 +1,2081 @@
+; RUN: llc < %s -mtriple=avr -mcpu=attiny85 -filetype=obj -o - | llvm-objdump --mcpu=attiny85 -dr --no-show-raw-insn --no-leading-addr - | FileCheck --check-prefix=ATTINY85 %s
+; RUN: not llc < %s -mtriple=avr -mcpu=avr25 -filetype=obj -o - 2>&1 | FileCheck --check-prefix=AVR25 %s
+; RUN: llc < %s -mtriple=avr -mcpu=avr3 -filetype=obj -o - | llvm-objdump --mcpu=avr3 -dr --no-show-raw-insn --no-leading-addr - | FileCheck --check-prefix=AVR3 %s
+
+; ATTINY85: <main>:
+; ATTINY85-NEXT: andi r24, 0x1
+; ATTINY85-NEXT: cpi r24, 0x0
+; ATTINY85-NEXT: brne .+2
+; ATTINY85-NEXT: rjmp .-4092
+; ATTINY85: ldi r24, 0x3
+; ATTINY85-NEXT: ret
+
+; AVR25: error: out of range branch target (expected an integer in the range -4096 to 4095)
+
+; AVR3: <main>:
+; AVR3-NEXT: andi r24, 0x1
+; AVR3-NEXT: cpi r24, 0x0
+; AVR3-NEXT: brne .+4
+; AVR3-NEXT: jmp 0x0
+; AVR3-NEXT: R_AVR_CALL .text+0x100e
+; AVR3: ldi r24, 0x3
+; AVR3-NEXT: ret
+
+define i8 @main(i1 %a) {
+entry-block:
+  br i1 %a, label %hello, label %finished
+hello:
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  br label %finished
+finished:
+  ret i8 3
+}
diff --git a/llvm/test/CodeGen/AVR/branch-relaxation-long.ll b/llvm/test/CodeGen/AVR/branch-relaxation-long.ll
deleted file mode 100644
index cd7a804..0000000
--- a/llvm/test/CodeGen/AVR/branch-relaxation-long.ll
+++ /dev/null
@@ -1,4162 +0,0 @@
-; RUN: llc < %s -mtriple=avr -mattr=avr3 | FileCheck %s
-; RUN: llc < %s -mtriple=avr -mattr=avr2 | FileCheck --check-prefix=AVR2 %s
-
-; CHECK-LABEL: relax_to_jmp:
-; CHECK: cpi     r{{[0-9]+}}, 0
-; CHECK: brne    [[BB1:.LBB[0-9]+_[0-9]+]]
-; CHECK: jmp     [[BB2:.LBB[0-9]+_[0-9]+]]
-; CHECK: [[BB1]]:
-; CHECK: nop
-; CHECK: [[BB2]]:
-
-;; A `RJMP` is generated instead of expected `JMP` for AVR2,
-;; and it is up to the linker to report 'out of range' or
-;; 'exceed flash maximum size'.
-; AVR2-LABEL: relax_to_jmp:
-; AVR2: cpi     r{{[0-9]+}}, 0
-; AVR2: brne    [[BB1:.LBB[0-9]+_[0-9]+]]
-; AVR2: rjmp    [[BB2:.LBB[0-9]+_[0-9]+]]
-; AVR2: [[BB1]]:
-; AVR2: nop
-; AVR2: [[BB2]]:
-
-define i8 @relax_to_jmp(i1 %a) {
-entry-block:
-  br i1 %a, label %hello, label %finished
-hello:
-  ; with >4 kB of instructions (2050 NOPs), this requires a long jump (jmp),
-  ; versus a relative one (rjmp).
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  br label %finished
-finished:
-  ret i8 3
-}
-
-; CHECK-LABEL: relax_to_jmp_backwards:
-; CHECK: [[BB1:.LBB[0-9]+_[0-9]+]]
-; CHECK: nop
-; CHECK: cpi     r{{[0-9]+}}, 0
-; CHECK: breq    [[BB2:.LBB[0-9]+_[0-9]+]]
-; CHECK: jmp     [[BB1]]
-; CHECK: [[BB2]]:
-
-;; A `RJMP` is generated instead of expected `JMP` for AVR2,
-;; and it is up to the linker to report 'out of range' or
-;; 'exceed flash maximum size'.
-; AVR2-LABEL: relax_to_jmp_backwards:
-; AVR2: [[BB1:.LBB[0-9]+_[0-9]+]]
-; AVR2: nop
-; AVR2: cpi     r{{[0-9]+}}, 0
-; AVR2: breq    [[BB2:.LBB[0-9]+_[0-9]+]]
-; AVR2: rjmp    [[BB1]]
-; AVR2: [[BB2]]:
-
-define i8 @relax_to_jmp_backwards(i1 %a) {
-entry-block:
-  br label %hello
-hello:
-  ; with >4 kB of instructions (2050 NOPs), this requires a long jump (jmp),
-  ; versus a relative one (rjmp).
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  call void asm sideeffect "nop", ""()
-  br i1 %a, label %hello, label %finished
-finished:
-  ret i8 3
-}
diff --git a/llvm/test/CodeGen/DirectX/BufferLoad-sm61.ll b/llvm/test/CodeGen/DirectX/BufferLoad-sm61.ll
new file mode 100644
index 0000000..501f151
--- /dev/null
+++ b/llvm/test/CodeGen/DirectX/BufferLoad-sm61.ll
@@ -0,0 +1,60 @@
+; RUN: opt -S -dxil-op-lower %s | FileCheck %s
+; Before SM6.2 ByteAddressBuffer and StructuredBuffer lower to bufferLoad.
+
+target triple = "dxil-pc-shadermodel6.1-compute"
+
+; CHECK-LABEL: define void @loadf32_struct
+define void @loadf32_struct(i32 %index) {
+  %buffer = call target("dx.RawBuffer", float, 0, 0, 0)
+      @llvm.dx.resource.handlefrombinding.tdx.RawBuffer_f32_0_0_0(
+          i32 0, i32 0, i32 1, i32 0, i1 false)
+
+  ; CHECK: [[DATA:%.*]] = call %dx.types.ResRet.f32 @dx.op.bufferLoad.f32(i32 68, %dx.types.Handle %{{.*}}, i32 %index, i32 0)
+  %load = call {float, i1}
+      @llvm.dx.resource.load.rawbuffer.f32.tdx.RawBuffer_f32_0_0_0t(
+          target("dx.RawBuffer", float, 0, 0, 0) %buffer,
+          i32 %index,
+          i32 0)
+
+  ret void
+}
+
+; CHECK-LABEL: define void @loadv4f32_byte
+define void @loadv4f32_byte(i32 %offset) {
+  %buffer = call target("dx.RawBuffer", i8, 0, 0, 0)
+      @llvm.dx.resource.handlefrombinding.tdx.RawBuffer_i8_0_0_0(
+          i32 0, i32 0, i32 1, i32 0, i1 false)
+
+  ; CHECK: [[DATA:%.*]] = call %dx.types.ResRet.f32 @dx.op.bufferLoad.f32(i32 68, %dx.types.Handle %{{.*}}, i32 %offset, i32 0)
+  %load = call {<4 x float>, i1}
+      @llvm.dx.resource.load.rawbuffer.f32.tdx.RawBuffer_i8_0_0_0t(
+          target("dx.RawBuffer", i8, 0, 0, 0) %buffer,
+          i32 %offset,
+          i32 0)
+
+  ret void
+}
+
+; CHECK-LABEL: define void @loadnested
+define void @loadnested(i32 %index) {
+  %buffer = call
+      target("dx.RawBuffer", {i32, {<4 x float>, <3 x half>}}, 0, 0, 0)
+      @llvm.dx.resource.handlefrombinding(i32 0, i32 0, i32 1, i32 0, i1 false)
+
+  ; CHECK: [[DATAI32:%.*]] = call %dx.types.ResRet.i32 @dx.op.bufferLoad.i32(i32 68, %dx.types.Handle %{{.*}}, i32 %index, i32 0)
+  %loadi32 = call {i32, i1} @llvm.dx.resource.load.rawbuffer.i32(
+      target("dx.RawBuffer", {i32, {<4 x float>, <3 x half>}}, 0, 0, 0) %buffer,
+      i32 %index, i32 0)
+
+  ; CHECK: [[DATAF32:%.*]] = call %dx.types.ResRet.f32 @dx.op.bufferLoad.f32(i32 68, %dx.types.Handle %{{.*}}, i32 %index, i32 4)
+  %loadf32 = call {<4 x float>, i1} @llvm.dx.resource.load.rawbuffer.v4f32(
+      target("dx.RawBuffer", {i32, {<4 x float>, <3 x half>}}, 0, 0, 0) %buffer,
+      i32 %index, i32 4)
+
+  ; CHECK: [[DATAF16:%.*]] = call %dx.types.ResRet.f16 @dx.op.bufferLoad.f16(i32 68, %dx.types.Handle %{{.*}}, i32 %index, i32 20)
+  %loadf16 = call {<3 x half>, i1} @llvm.dx.resource.load.rawbuffer.v3f16(
+      target("dx.RawBuffer", {i32, {<4 x float>, <3 x half>}}, 0, 0, 0) %buffer,
+      i32 %index, i32 20)
+
+  ret void
+}
diff --git a/llvm/test/CodeGen/DirectX/BufferLoad.ll b/llvm/test/CodeGen/DirectX/BufferLoad.ll
index 7f1291b..86e2217 100644
--- a/llvm/test/CodeGen/DirectX/BufferLoad.ll
+++ b/llvm/test/CodeGen/DirectX/BufferLoad.ll
@@ -17,8 +17,9 @@ define void @loadv4f32() {
   ; CHECK-NOT: %dx.resource.casthandle
 
   ; CHECK: [[DATA0:%.*]] = call %dx.types.ResRet.f32 @dx.op.bufferLoad.f32(i32 68, %dx.types.Handle [[HANDLE]], i32 0, i32 undef)
-  %data0 = call <4 x float> @llvm.dx.resource.load.typedbuffer(
+  %load0 = call {<4 x float>, i1} @llvm.dx.resource.load.typedbuffer(
       target("dx.TypedBuffer", <4 x float>, 0, 0, 0) %buffer, i32 0)
+  %data0 = extractvalue {<4 x float>, i1} %load0, 0
 
   ; The extract order depends on the users, so don't enforce that here.
   ; CHECK-DAG: [[VAL0_0:%.*]] = extractvalue %dx.types.ResRet.f32 [[DATA0]], 0
@@ -34,8 +35,9 @@ define void @loadv4f32() {
   call void @scalar_user(float %data0_2)
 
   ; CHECK: [[DATA4:%.*]] = call %dx.types.ResRet.f32 @dx.op.bufferLoad.f32(i32 68, %dx.types.Handle [[HANDLE]], i32 4, i32 undef)
-  %data4 = call <4 x float> @llvm.dx.resource.load.typedbuffer(
+  %load4 = call {<4 x float>, i1} @llvm.dx.resource.load.typedbuffer(
       target("dx.TypedBuffer", <4 x float>, 0, 0, 0) %buffer, i32 4)
+  %data4 = extractvalue {<4 x float>, i1} %load4, 0
 
   ; CHECK: extractvalue %dx.types.ResRet.f32 [[DATA4]], 0
   ; CHECK: extractvalue %dx.types.ResRet.f32 [[DATA4]], 1
@@ -48,8 +50,9 @@ define void @loadv4f32() {
   call void @vector_user(<4 x float> %data4)
 
   ; CHECK: [[DATA12:%.*]] = call %dx.types.ResRet.f32 @dx.op.bufferLoad.f32(i32 68, %dx.types.Handle [[HANDLE]], i32 12, i32 undef)
-  %data12 = call <4 x float> @llvm.dx.resource.load.typedbuffer(
+  %load12 = call {<4 x float>, i1} @llvm.dx.resource.load.typedbuffer(
       target("dx.TypedBuffer", <4 x float>, 0, 0, 0) %buffer, i32 12)
+  %data12 = extractvalue {<4 x float>, i1} %load12, 0
 
   ; CHECK: [[DATA12_3:%.*]] = extractvalue %dx.types.ResRet.f32 [[DATA12]], 3
   %data12_3 = extractelement <4 x float> %data12, i32 3
@@ -70,8 +73,9 @@ define void @index_dynamic(i32 %bufindex, i32 %elemindex) {
           i32 0, i32 0, i32 1, i32 0, i1 false)
 
   ; CHECK: [[LOAD:%.*]] = call %dx.types.ResRet.f32 @dx.op.bufferLoad.f32(i32 68, %dx.types.Handle [[HANDLE]], i32 %bufindex, i32 undef)
-  %load = call <4 x float> @llvm.dx.resource.load.typedbuffer(
+  %load = call {<4 x float>, i1} @llvm.dx.resource.load.typedbuffer(
       target("dx.TypedBuffer", <4 x float>, 0, 0, 0) %buffer, i32 %bufindex)
+  %data = extractvalue {<4 x float>, i1} %load, 0
 
   ; CHECK: [[ALLOCA:%.*]] = alloca [4 x float]
   ; CHECK: [[V0:%.*]] = extractvalue %dx.types.ResRet.f32 [[LOAD]], 0
@@ -89,10 +93,10 @@ define void @index_dynamic(i32 %bufindex, i32 %elemindex) {
   ;
   ; CHECK: [[PTR:%.*]] = getelementptr inbounds [4 x float], ptr [[ALLOCA]], i32 0, i32 %elemindex
   ; CHECK: [[X:%.*]] = load float, ptr [[PTR]]
-  %data = extractelement <4 x float> %load, i32 %elemindex
+  %x = extractelement <4 x float> %data, i32 %elemindex
 
   ; CHECK: call void @scalar_user(float [[X]])
-  call void @scalar_user(float %data)
+  call void @scalar_user(float %x)
 
   ret void
 }
@@ -105,8 +109,9 @@ define void @loadf32() {
           i32 0, i32 0, i32 1, i32 0, i1 false)
 
   ; CHECK: [[DATA0:%.*]] = call %dx.types.ResRet.f32 @dx.op.bufferLoad.f32(i32 68, %dx.types.Handle [[HANDLE]], i32 0, i32 undef)
-  %data0 = call float @llvm.dx.resource.load.typedbuffer(
+  %load0 = call {float, i1} @llvm.dx.resource.load.typedbuffer(
       target("dx.TypedBuffer", float, 0, 0, 0) %buffer, i32 0)
+  %data0 = extractvalue {float, i1} %load0, 0
 
   ; CHECK: [[VAL0:%.*]] = extractvalue %dx.types.ResRet.f32 [[DATA0]], 0
   ; CHECK: call void @scalar_user(float [[VAL0]])
@@ -123,7 +128,7 @@ define void @loadv2f32() {
           i32 0, i32 0, i32 1, i32 0, i1 false)
 
   ; CHECK: [[DATA0:%.*]] = call %dx.types.ResRet.f32 @dx.op.bufferLoad.f32(i32 68, %dx.types.Handle [[HANDLE]], i32 0, i32 undef)
-  %data0 = call <2 x float> @llvm.dx.resource.load.typedbuffer(
+  %data0 = call {<2 x float>, i1} @llvm.dx.resource.load.typedbuffer(
       target("dx.TypedBuffer", <2 x float>, 0, 0, 0) %buffer, i32 0)
 
   ret void
@@ -137,7 +142,7 @@ define void @loadv4f32_checkbit() {
           i32 0, i32 0, i32 1, i32 0, i1 false)
 
   ; CHECK: [[DATA0:%.*]] = call %dx.types.ResRet.f32 @dx.op.bufferLoad.f32(i32 68, %dx.types.Handle [[HANDLE]], i32 0, i32 undef)
-  %data0 = call {<4 x float>, i1} @llvm.dx.resource.loadchecked.typedbuffer.f32(
+  %data0 = call {<4 x float>, i1} @llvm.dx.resource.load.typedbuffer.f32(
       target("dx.TypedBuffer", <4 x float>, 0, 0, 0) %buffer, i32 0)
 
   ; CHECK: [[STATUS:%.*]] = extractvalue %dx.types.ResRet.f32 [[DATA0]], 4
@@ -158,7 +163,7 @@ define void @loadv4i32() {
           i32 0, i32 0, i32 1, i32 0, i1 false)
 
   ; CHECK: [[DATA0:%.*]] = call %dx.types.ResRet.i32 @dx.op.bufferLoad.i32(i32 68, %dx.types.Handle [[HANDLE]], i32 0, i32 undef)
-  %data0 = call <4 x i32> @llvm.dx.resource.load.typedbuffer(
+  %data0 = call {<4 x i32>, i1} @llvm.dx.resource.load.typedbuffer(
       target("dx.TypedBuffer", <4 x i32>, 0, 0, 0) %buffer, i32 0)
 
   ret void
@@ -172,7 +177,7 @@ define void @loadv4f16() {
           i32 0, i32 0, i32 1, i32 0, i1 false)
 
   ; CHECK: [[DATA0:%.*]] = call %dx.types.ResRet.f16 @dx.op.bufferLoad.f16(i32 68, %dx.types.Handle [[HANDLE]], i32 0, i32 undef)
-  %data0 = call <4 x half> @llvm.dx.resource.load.typedbuffer(
+  %data0 = call {<4 x half>, i1} @llvm.dx.resource.load.typedbuffer(
       target("dx.TypedBuffer", <4 x half>, 0, 0, 0) %buffer, i32 0)
 
   ret void
@@ -186,7 +191,7 @@ define void @loadv4i16() {
           i32 0, i32 0, i32 1, i32 0, i1 false)
 
   ; CHECK: [[DATA0:%.*]] = call %dx.types.ResRet.i16 @dx.op.bufferLoad.i16(i32 68, %dx.types.Handle [[HANDLE]], i32 0, i32 undef)
-  %data0 = call <4 x i16> @llvm.dx.resource.load.typedbuffer(
+  %data0 = call {<4 x i16>, i1} @llvm.dx.resource.load.typedbuffer(
       target("dx.TypedBuffer", <4 x i16>, 0, 0, 0) %buffer, i32 0)
 
   ret void
diff --git a/llvm/test/CodeGen/DirectX/RawBufferLoad-error64.ll b/llvm/test/CodeGen/DirectX/RawBufferLoad-error64.ll
new file mode 100644
index 0000000..b8a6649
--- /dev/null
+++ b/llvm/test/CodeGen/DirectX/RawBufferLoad-error64.ll
@@ -0,0 +1,24 @@
+; We use llc for this test so that we don't abort after the first error.
+; RUN: not llc %s -o /dev/null 2>&1 | FileCheck %s
+
+target triple = "dxil-pc-shadermodel6.2-compute"
+
+declare void @v4f64_user(<4 x double>)
+
+; Can't load 64 bit types directly until SM6.3 (byteaddressbuf.Load<int64_t4>)
+; CHECK: error:
+; CHECK-SAME: in function loadv4f64_byte
+; CHECK-SAME: Cannot create RawBufferLoad operation: Invalid overload type
+define void @loadv4f64_byte(i32 %offset) "hlsl.export" {
+  %buffer = call target("dx.RawBuffer", i8, 0, 0, 0)
+      @llvm.dx.resource.handlefrombinding.tdx.RawBuffer_i8_0_0_0(
+          i32 0, i32 0, i32 1, i32 0, i1 false)
+
+  %load = call {<4 x double>, i1} @llvm.dx.resource.load.rawbuffer.v4i64(
+      target("dx.RawBuffer", i8, 0, 0, 0) %buffer, i32 %offset, i32 0)
+  %data = extractvalue {<4 x double>, i1} %load, 0
+
+  call void @v4f64_user(<4 x double> %data)
+
+  ret void
+}
diff --git a/llvm/test/CodeGen/DirectX/RawBufferLoad.ll b/llvm/test/CodeGen/DirectX/RawBufferLoad.ll
new file mode 100644
index 0000000..586b9c4
--- /dev/null
+++ b/llvm/test/CodeGen/DirectX/RawBufferLoad.ll
@@ -0,0 +1,232 @@
+; RUN: opt -S -dxil-op-lower %s | FileCheck %s
+
+target triple = "dxil-pc-shadermodel6.6-compute"
+
+declare void @f32_user(float)
+declare void @v4f32_user(<4 x float>)
+declare void @i32_user(i32)
+declare void @v4i32_user(<4 x i32>)
+declare void @v3f16_user(<3 x half>)
+declare void @v4f64_user(<4 x double>)
+
+; CHECK-LABEL: define void @loadf32_struct
+define void @loadf32_struct(i32 %index) {
+  %buffer = call target("dx.RawBuffer", float, 0, 0, 0)
+      @llvm.dx.resource.handlefrombinding.tdx.RawBuffer_f32_0_0_0(
+          i32 0, i32 0, i32 1, i32 0, i1 false)
+
+  ; CHECK: [[DATA:%.*]] = call %dx.types.ResRet.f32 @dx.op.rawBufferLoad.f32(i32 139, %dx.types.Handle %{{.*}}, i32 %index, i32 0, i8 1, i32 4)
+  %load = call {float, i1}
+      @llvm.dx.resource.load.rawbuffer.f32.tdx.RawBuffer_f32_0_0_0t(
+          target("dx.RawBuffer", float, 0, 0, 0) %buffer,
+          i32 %index,
+          i32 0)
+  %data = extractvalue {float, i1} %load, 0
+
+  ; CHECK: [[VAL:%.*]] = extractvalue %dx.types.ResRet.f32 [[DATA]], 0
+  ; CHECK: call void @f32_user(float [[VAL]])
+  call void @f32_user(float %data)
+
+  ret void
+}
+
+; CHECK-LABEL: define void @loadf32_byte
+define void @loadf32_byte(i32 %offset) {
+  %buffer = call target("dx.RawBuffer", i8, 0, 0, 0)
+      @llvm.dx.resource.handlefrombinding.tdx.RawBuffer_i8_0_0_0(
+          i32 0, i32 0, i32 1, i32 0, i1 false)
+
+  ; CHECK: [[DATA:%.*]] = call %dx.types.ResRet.f32 @dx.op.rawBufferLoad.f32(i32 139, %dx.types.Handle %{{.*}}, i32 %offset, i32 0, i8 1, i32 4)
+  %load = call {float, i1}
+      @llvm.dx.resource.load.rawbuffer.f32.tdx.RawBuffer_i8_0_0_0t(
+          target("dx.RawBuffer", i8, 0, 0, 0) %buffer,
+          i32 %offset,
+          i32 0)
+  %data = extractvalue {float, i1} %load, 0
+
+  ; CHECK: [[VAL:%.*]] = extractvalue %dx.types.ResRet.f32 [[DATA]], 0
+  ; CHECK: call void @f32_user(float [[VAL]])
+  call void @f32_user(float %data)
+
+  ret void
+}
+
+; CHECK-LABEL: define void @loadv4f32_struct
+define void @loadv4f32_struct(i32 %index) {
+  %buffer = call target("dx.RawBuffer", <4 x float>, 0, 0, 0)
+      @llvm.dx.resource.handlefrombinding.tdx.RawBuffer_v4f32_0_0_0(
+          i32 0, i32 0, i32 1, i32 0, i1 false)
+
+  ; CHECK: [[DATA:%.*]] = call %dx.types.ResRet.f32 @dx.op.rawBufferLoad.f32(i32 139, %dx.types.Handle %{{.*}}, i32 %index, i32 0, i8 15, i32 4)
+  %load = call {<4 x float>, i1}
+      @llvm.dx.resource.load.rawbuffer.f32.tdx.RawBuffer_v4f32_0_0_0t(
+          target("dx.RawBuffer", <4 x float>, 0, 0, 0) %buffer,
+          i32 %index,
+          i32 0)
+  %data = extractvalue {<4 x float>, i1} %load, 0
+
+  ; CHECK: extractvalue %dx.types.ResRet.f32 [[DATA]], 0
+  ; CHECK: extractvalue %dx.types.ResRet.f32 [[DATA]], 1
+  ; CHECK: extractvalue %dx.types.ResRet.f32 [[DATA]], 2
+  ; CHECK: extractvalue %dx.types.ResRet.f32 [[DATA]], 3
+  ; CHECK: insertelement <4 x float> undef
+  ; CHECK: insertelement <4 x float>
+  ; CHECK: insertelement <4 x float>
+  ; CHECK: insertelement <4 x float>
+  ; CHECK: call void @v4f32_user(<4 x float>
+  call void @v4f32_user(<4 x float> %data)
+
+  ret void
+}
+
+; CHECK-LABEL: define void @loadv4f32_byte
+define void @loadv4f32_byte(i32 %offset) {
+  %buffer = call target("dx.RawBuffer", i8, 0, 0, 0)
+      @llvm.dx.resource.handlefrombinding.tdx.RawBuffer_i8_0_0_0(
+          i32 0, i32 0, i32 1, i32 0, i1 false)
+
+  ; CHECK: [[DATA:%.*]] = call %dx.types.ResRet.f32 @dx.op.rawBufferLoad.f32(i32 139, %dx.types.Handle %{{.*}}, i32 %offset, i32 0, i8 15, i32 4)
+  %load = call {<4 x float>, i1}
+      @llvm.dx.resource.load.rawbuffer.f32.tdx.RawBuffer_i8_0_0_0t(
+          target("dx.RawBuffer", i8, 0, 0, 0) %buffer,
+          i32 %offset,
+          i32 0)
+  %data = extractvalue {<4 x float>, i1} %load, 0
+
+  ; CHECK: extractvalue %dx.types.ResRet.f32 [[DATA]], 0
+  ; CHECK: extractvalue %dx.types.ResRet.f32 [[DATA]], 1
+  ; CHECK: extractvalue %dx.types.ResRet.f32 [[DATA]], 2
+  ; CHECK: extractvalue %dx.types.ResRet.f32 [[DATA]], 3
+  ; CHECK: insertelement <4 x float> undef
+  ; CHECK: insertelement <4 x float>
+  ; CHECK: insertelement <4 x float>
+  ; CHECK: insertelement <4 x float>
+  ; CHECK: call void @v4f32_user(<4 x float>
+  call void @v4f32_user(<4 x float> %data)
+
+  ret void
+}
+
+; CHECK-LABEL: define void @loadelements
+define void @loadelements(i32 %index) {
+  %buffer = call target("dx.RawBuffer", {<4 x float>, <4 x i32>}, 0, 0, 0)
+      @llvm.dx.resource.handlefrombinding.tdx.RawBuffer_sl_v4f32v4i32s_0_0_0(
+          i32 0, i32 0, i32 1, i32 0, i1 false)
+
+  ; CHECK: [[DATAF32:%.*]] = call %dx.types.ResRet.f32 @dx.op.rawBufferLoad.f32(i32 139, %dx.types.Handle %{{.*}}, i32 %index, i32 0, i8 15, i32 4)
+  %loadf32 = call {<4 x float>, i1}
+      @llvm.dx.resource.load.rawbuffer.v4f32(
+          target("dx.RawBuffer", {<4 x float>, <4 x i32>}, 0, 0, 0) %buffer,
+          i32 %index,
+          i32 0)
+  %dataf32 = extractvalue {<4 x float>, i1} %loadf32, 0
+
+  ; CHECK: extractvalue %dx.types.ResRet.f32 [[DATAF32]], 0
+  ; CHECK: extractvalue %dx.types.ResRet.f32 [[DATAF32]], 1
+  ; CHECK: extractvalue %dx.types.ResRet.f32 [[DATAF32]], 2
+  ; CHECK: extractvalue %dx.types.ResRet.f32 [[DATAF32]], 3
+  ; CHECK: insertelement <4 x float> undef
+  ; CHECK: insertelement <4 x float>
+  ; CHECK: insertelement <4 x float>
+  ; CHECK: insertelement <4 x float>
+  ; CHECK: call void @v4f32_user(<4 x float>
+  call void @v4f32_user(<4 x float> %dataf32)
+
+  ; CHECK: [[DATAI32:%.*]] = call %dx.types.ResRet.i32 @dx.op.rawBufferLoad.i32(i32 139, %dx.types.Handle %{{.*}}, i32 %index, i32 1, i8 15, i32 4)
+  %loadi32 = call {<4 x i32>, i1}
+      @llvm.dx.resource.load.rawbuffer.v4i32(
+          target("dx.RawBuffer", {<4 x float>, <4 x i32>}, 0, 0, 0) %buffer,
+          i32 %index,
+          i32 1)
+  %datai32 = extractvalue {<4 x i32>, i1} %loadi32, 0
+
+  ; CHECK: extractvalue %dx.types.ResRet.i32 [[DATAI32]], 0
+  ; CHECK: extractvalue %dx.types.ResRet.i32 [[DATAI32]], 1
+  ; CHECK: extractvalue %dx.types.ResRet.i32 [[DATAI32]], 2
+  ; CHECK: extractvalue %dx.types.ResRet.i32 [[DATAI32]], 3
+  ; CHECK: insertelement <4 x i32> undef
+  ; CHECK: insertelement <4 x i32>
+  ; CHECK: insertelement <4 x i32>
+  ; CHECK: insertelement <4 x i32>
+  ; CHECK: call void @v4i32_user(<4 x i32>
+  call void @v4i32_user(<4 x i32> %datai32)
+
+  ret void
+}
+
+; CHECK-LABEL: define void @loadnested
+define void @loadnested(i32 %index) {
+  %buffer = call
+      target("dx.RawBuffer", {i32, {<4 x float>, <3 x half>}}, 0, 0, 0)
+      @llvm.dx.resource.handlefrombinding(i32 0, i32 0, i32 1, i32 0, i1 false)
+
+  ; CHECK: [[DATAI32:%.*]] = call %dx.types.ResRet.i32 @dx.op.rawBufferLoad.i32(i32 139, %dx.types.Handle %{{.*}}, i32 %index, i32 0, i8 1, i32 4)
+  %loadi32 = call {i32, i1} @llvm.dx.resource.load.rawbuffer.i32(
+      target("dx.RawBuffer", {i32, {<4 x float>, <3 x half>}}, 0, 0, 0) %buffer,
+      i32 %index, i32 0)
+  %datai32 = extractvalue {i32, i1} %loadi32, 0
+
+  ; CHECK: [[VALI32:%.*]] = extractvalue %dx.types.ResRet.i32 [[DATAI32]], 0
+  ; CHECK: call void @i32_user(i32 [[VALI32]])
+  call void @i32_user(i32 %datai32)
+
+  ; CHECK: [[DATAF32:%.*]] = call %dx.types.ResRet.f32 @dx.op.rawBufferLoad.f32(i32 139, %dx.types.Handle %{{.*}}, i32 %index, i32 4, i8 15, i32 4)
+  %loadf32 = call {<4 x float>, i1} @llvm.dx.resource.load.rawbuffer.v4f32(
+      target("dx.RawBuffer", {i32, {<4 x float>, <3 x half>}}, 0, 0, 0) %buffer,
+      i32 %index, i32 4)
+  %dataf32 = extractvalue {<4 x float>, i1} %loadf32, 0
+
+  ; CHECK: extractvalue %dx.types.ResRet.f32 [[DATAF32]], 0
+  ; CHECK: extractvalue %dx.types.ResRet.f32 [[DATAF32]], 1
+  ; CHECK: extractvalue %dx.types.ResRet.f32 [[DATAF32]], 2
+  ; CHECK: extractvalue %dx.types.ResRet.f32 [[DATAF32]], 3
+  ; CHECK: insertelement <4 x float> undef
+  ; CHECK: insertelement <4 x float>
+  ; CHECK: insertelement <4 x float>
+  ; CHECK: insertelement <4 x float>
+  ; CHECK: call void @v4f32_user(<4 x float>
+  call void @v4f32_user(<4 x float> %dataf32)
+
+  ; CHECK: [[DATAF16:%.*]] = call %dx.types.ResRet.f16 @dx.op.rawBufferLoad.f16(i32 139, %dx.types.Handle %{{.*}}, i32 %index, i32 20, i8 7, i32 2)
+  %loadf16 = call {<3 x half>, i1} @llvm.dx.resource.load.rawbuffer.v3f16(
+      target("dx.RawBuffer", {i32, {<4 x float>, <3 x half>}}, 0, 0, 0) %buffer,
+      i32 %index, i32 20)
+  %dataf16 = extractvalue {<3 x half>, i1} %loadf16, 0
+
+  ; CHECK: extractvalue %dx.types.ResRet.f16 [[DATAF16]], 0
+  ; CHECK: extractvalue %dx.types.ResRet.f16 [[DATAF16]], 1
+  ; CHECK: extractvalue %dx.types.ResRet.f16 [[DATAF16]], 2
+  ; CHECK: insertelement <3 x half> undef
+  ; CHECK: insertelement <3 x half>
+  ; CHECK: insertelement <3 x half>
+  ; CHECK: call void @v3f16_user(<3 x half>
+  call void @v3f16_user(<3 x half> %dataf16)
+
+  ret void
+}
+
+; byteaddressbuf.Load<int64_t4>
+; CHECK-LABEL: define void @loadv4f64_byte
+define void @loadv4f64_byte(i32 %offset) {
+  %buffer = call target("dx.RawBuffer", i8, 0, 0, 0)
+      @llvm.dx.resource.handlefrombinding.tdx.RawBuffer_i8_0_0_0(
+          i32 0, i32 0, i32 1, i32 0, i1 false)
+
+  ; CHECK: [[DATA:%.*]] = call %dx.types.ResRet.f64 @dx.op.rawBufferLoad.f64(i32 139, %dx.types.Handle %{{.*}}, i32 %offset, i32 0, i8 15, i32 8)
+  %load = call {<4 x double>, i1} @llvm.dx.resource.load.rawbuffer.v4i64(
+      target("dx.RawBuffer", i8, 0, 0, 0) %buffer, i32 %offset, i32 0)
+  %data = extractvalue {<4 x double>, i1} %load, 0
+
+  ; CHECK: extractvalue %dx.types.ResRet.f64 [[DATA]], 0
+  ; CHECK: extractvalue %dx.types.ResRet.f64 [[DATA]], 1
+  ; CHECK: extractvalue %dx.types.ResRet.f64 [[DATA]], 2
+  ; CHECK: extractvalue %dx.types.ResRet.f64 [[DATA]], 3
+  ; CHECK: insertelement <4 x double> undef
+  ; CHECK: insertelement <4 x double>
+  ; CHECK: insertelement <4 x double>
+  ; CHECK: insertelement <4 x double>
+  ; CHECK: call void @v4f64_user(<4 x double>
+  call void @v4f64_user(<4 x double> %data)
+
+  ret void
+}
diff --git a/llvm/test/CodeGen/DirectX/ResourceAccess/load_typedbuffer.ll b/llvm/test/CodeGen/DirectX/ResourceAccess/load_typedbuffer.ll
index 9b7e7fd..8769e6e 100644
--- a/llvm/test/CodeGen/DirectX/ResourceAccess/load_typedbuffer.ll
+++ b/llvm/test/CodeGen/DirectX/ResourceAccess/load_typedbuffer.ll
@@ -15,17 +15,19 @@ define void @load_float4(i32 %index, i32 %elemindex) {
   %ptr = call ptr @llvm.dx.resource.getpointer(
       target("dx.TypedBuffer", <4 x float>, 1, 0, 0) %buffer, i32 %index)
 
-  ; CHECK: %[[VALUE:.*]] = call <4 x float> @llvm.dx.resource.load.typedbuffer.v4f32.tdx.TypedBuffer_v4f32_1_0_0t(target("dx.TypedBuffer", <4 x float>, 1, 0, 0) %buffer, i32 %index)
+  ; CHECK: %[[VALUE:.*]] = call { <4 x float>, i1 } @llvm.dx.resource.load.typedbuffer.v4f32.tdx.TypedBuffer_v4f32_1_0_0t(target("dx.TypedBuffer", <4 x float>, 1, 0, 0) %buffer, i32 %index)
   %vec_data = load <4 x float>, ptr %ptr
   call void @use_float4(<4 x float> %vec_data)
 
-  ; CHECK: %[[VALUE:.*]] = call <4 x float> @llvm.dx.resource.load.typedbuffer.v4f32.tdx.TypedBuffer_v4f32_1_0_0t(target("dx.TypedBuffer", <4 x float>, 1, 0, 0) %buffer, i32 %index)
+  ; CHECK: %[[LOAD:.*]] = call { <4 x float>, i1 } @llvm.dx.resource.load.typedbuffer.v4f32.tdx.TypedBuffer_v4f32_1_0_0t(target("dx.TypedBuffer", <4 x float>, 1, 0, 0) %buffer, i32 %index)
+  ; CHECK: %[[VALUE:.*]] = extractvalue { <4 x float>, i1 } %[[LOAD]], 0
   ; CHECK: extractelement <4 x float> %[[VALUE]], i32 1
   %y_ptr = getelementptr inbounds <4 x float>, ptr %ptr, i32 0, i32 1
   %y_data = load float, ptr %y_ptr
   call void @use_float(float %y_data)
 
-  ; CHECK: %[[VALUE:.*]] = call <4 x float> @llvm.dx.resource.load.typedbuffer.v4f32.tdx.TypedBuffer_v4f32_1_0_0t(target("dx.TypedBuffer", <4 x float>, 1, 0, 0) %buffer, i32 %index)
+  ; CHECK: %[[LOAD:.*]] = call { <4 x float>, i1 } @llvm.dx.resource.load.typedbuffer.v4f32.tdx.TypedBuffer_v4f32_1_0_0t(target("dx.TypedBuffer", <4 x float>, 1, 0, 0) %buffer, i32 %index)
+  ; CHECK: %[[VALUE:.*]] = extractvalue { <4 x float>, i1 } %[[LOAD]], 0
   ; CHECK: extractelement <4 x float> %[[VALUE]], i32 %elemindex
   %dynamic = getelementptr inbounds <4 x float>, ptr %ptr, i32 0, i32 %elemindex
   %dyndata = load float, ptr %dynamic
diff --git a/llvm/test/CodeGen/DirectX/ResourceAccess/store_typedbuffer.ll b/llvm/test/CodeGen/DirectX/ResourceAccess/store_typedbuffer.ll
index 1760640..0b7882a 100644
--- a/llvm/test/CodeGen/DirectX/ResourceAccess/store_typedbuffer.ll
+++ b/llvm/test/CodeGen/DirectX/ResourceAccess/store_typedbuffer.ll
@@ -18,21 +18,24 @@ define void @store_float4(<4 x float> %data, i32 %index, i32 %elemindex) {
 
   ; Store just the .x component
   %scalar = extractelement <4 x float> %data, i32 0
-  ; CHECK: %[[LOAD:.*]] = call <4 x float> @llvm.dx.resource.load.typedbuffer.v4f32.tdx.TypedBuffer_v4f32_1_0_0t(target("dx.TypedBuffer", <4 x float>, 1, 0, 0) %buffer, i32 %index)
-  ; CHECK: %[[INSERT:.*]] = insertelement <4 x float> %[[LOAD]], float %scalar, i32 0
+  ; CHECK: %[[LOAD:.*]] = call { <4 x float>, i1 } @llvm.dx.resource.load.typedbuffer.v4f32.tdx.TypedBuffer_v4f32_1_0_0t(target("dx.TypedBuffer", <4 x float>, 1, 0, 0) %buffer, i32 %index)
+  ; CHECK: %[[VEC:.*]] = extractvalue { <4 x float>, i1 } %[[LOAD]], 0
+  ; CHECK: %[[INSERT:.*]] = insertelement <4 x float> %[[VEC]], float %scalar, i32 0
   ; CHECK: call void @llvm.dx.resource.store.typedbuffer.tdx.TypedBuffer_v4f32_1_0_0t.v4f32(target("dx.TypedBuffer", <4 x float>, 1, 0, 0) %buffer, i32 %index, <4 x float> %[[INSERT]])
   store float %scalar, ptr %ptr
 
   ; Store just the .y component
-  ; CHECK: %[[LOAD:.*]] = call <4 x float> @llvm.dx.resource.load.typedbuffer.v4f32.tdx.TypedBuffer_v4f32_1_0_0t(target("dx.TypedBuffer", <4 x float>, 1, 0, 0) %buffer, i32 %index)
-  ; CHECK: %[[INSERT:.*]] = insertelement <4 x float> %[[LOAD]], float %scalar, i32 1
+  ; CHECK: %[[LOAD:.*]] = call { <4 x float>, i1 } @llvm.dx.resource.load.typedbuffer.v4f32.tdx.TypedBuffer_v4f32_1_0_0t(target("dx.TypedBuffer", <4 x float>, 1, 0, 0) %buffer, i32 %index)
+  ; CHECK: %[[VEC:.*]] = extractvalue { <4 x float>, i1 } %[[LOAD]], 0
+  ; CHECK: %[[INSERT:.*]] = insertelement <4 x float> %[[VEC]], float %scalar, i32 1
   ; CHECK: call void @llvm.dx.resource.store.typedbuffer.tdx.TypedBuffer_v4f32_1_0_0t.v4f32(target("dx.TypedBuffer", <4 x float>, 1, 0, 0) %buffer, i32 %index, <4 x float> %[[INSERT]])
   %y_ptr = getelementptr inbounds i8, ptr %ptr, i32 4
   store float %scalar, ptr %y_ptr
 
   ; Store to one of the elements dynamically
-  ; CHECK: %[[LOAD:.*]] = call <4 x float> @llvm.dx.resource.load.typedbuffer.v4f32.tdx.TypedBuffer_v4f32_1_0_0t(target("dx.TypedBuffer", <4 x float>, 1, 0, 0) %buffer, i32 %index)
-  ; CHECK: %[[INSERT:.*]] = insertelement <4 x float> %[[LOAD]], float %scalar, i32 %elemindex
+  ; CHECK: %[[LOAD:.*]] = call { <4 x float>, i1 } @llvm.dx.resource.load.typedbuffer.v4f32.tdx.TypedBuffer_v4f32_1_0_0t(target("dx.TypedBuffer", <4 x float>, 1, 0, 0) %buffer, i32 %index)
+  ; CHECK: %[[VEC:.*]] = extractvalue { <4 x float>, i1 } %[[LOAD]], 0
+  ; CHECK: %[[INSERT:.*]] = insertelement <4 x float> %[[VEC]], float %scalar, i32 %elemindex
   ; CHECK: call void @llvm.dx.resource.store.typedbuffer.tdx.TypedBuffer_v4f32_1_0_0t.v4f32(target("dx.TypedBuffer", <4 x float>, 1, 0, 0) %buffer, i32 %index, <4 x float> %[[INSERT]])
   %dynamic = getelementptr inbounds <4 x float>, ptr %ptr, i32 0, i32 %elemindex
   store float %scalar, ptr %dynamic
@@ -56,14 +59,16 @@ define void @store_half4(<4 x half> %data, i32 %index) {
 
   ; Store just the .x component
   %scalar = extractelement <4 x half> %data, i32 0
-  ; CHECK: %[[LOAD:.*]] = call <4 x half> @llvm.dx.resource.load.typedbuffer.v4f16.tdx.TypedBuffer_v4f16_1_0_0t(target("dx.TypedBuffer", <4 x half>, 1, 0, 0) %buffer, i32 %index)
-  ; CHECK: %[[INSERT:.*]] = insertelement <4 x half> %[[LOAD]], half %scalar, i32 0
+  ; CHECK: %[[LOAD:.*]] = call { <4 x half>, i1 } @llvm.dx.resource.load.typedbuffer.v4f16.tdx.TypedBuffer_v4f16_1_0_0t(target("dx.TypedBuffer", <4 x half>, 1, 0, 0) %buffer, i32 %index)
+  ; CHECK: %[[VEC:.*]] = extractvalue { <4 x half>, i1 } %[[LOAD]], 0
+  ; CHECK: %[[INSERT:.*]] = insertelement <4 x half> %[[VEC]], half %scalar, i32 0
   ; CHECK: call void @llvm.dx.resource.store.typedbuffer.tdx.TypedBuffer_v4f16_1_0_0t.v4f16(target("dx.TypedBuffer", <4 x half>, 1, 0, 0) %buffer, i32 %index, <4 x half> %[[INSERT]])
   store half %scalar, ptr %ptr
 
   ; Store just the .y component
-  ; CHECK: %[[LOAD:.*]] = call <4 x half> @llvm.dx.resource.load.typedbuffer.v4f16.tdx.TypedBuffer_v4f16_1_0_0t(target("dx.TypedBuffer", <4 x half>, 1, 0, 0) %buffer, i32 %index)
-  ; CHECK: %[[INSERT:.*]] = insertelement <4 x half> %[[LOAD]], half %scalar, i32 1
+  ; CHECK: %[[LOAD:.*]] = call { <4 x half>, i1 } @llvm.dx.resource.load.typedbuffer.v4f16.tdx.TypedBuffer_v4f16_1_0_0t(target("dx.TypedBuffer", <4 x half>, 1, 0, 0) %buffer, i32 %index)
+  ; CHECK: %[[VEC:.*]] = extractvalue { <4 x half>, i1 } %[[LOAD]], 0
+  ; CHECK: %[[INSERT:.*]] = insertelement <4 x half> %[[VEC]], half %scalar, i32 1
   ; CHECK: call void @llvm.dx.resource.store.typedbuffer.tdx.TypedBuffer_v4f16_1_0_0t.v4f16(target("dx.TypedBuffer", <4 x half>, 1, 0, 0) %buffer, i32 %index, <4 x half> %[[INSERT]])
   %y_ptr = getelementptr inbounds i8, ptr %ptr, i32 2
   store half %scalar, ptr %y_ptr
@@ -87,14 +92,16 @@ define void @store_double2(<2 x double> %data, i32 %index) {
 
   ; Store just the .x component
   %scalar = extractelement <2 x double> %data, i32 0
-  ; CHECK: %[[LOAD:.*]] = call <2 x double> @llvm.dx.resource.load.typedbuffer.v2f64.tdx.TypedBuffer_v2f64_1_0_0t(target("dx.TypedBuffer", <2 x double>, 1, 0, 0) %buffer, i32 %index)
-  ; CHECK: %[[INSERT:.*]] = insertelement <2 x double> %[[LOAD]], double %scalar, i32 0
+  ; CHECK: %[[LOAD:.*]] = call { <2 x double>, i1 } @llvm.dx.resource.load.typedbuffer.v2f64.tdx.TypedBuffer_v2f64_1_0_0t(target("dx.TypedBuffer", <2 x double>, 1, 0, 0) %buffer, i32 %index)
+  ; CHECK: %[[VEC:.*]] = extractvalue { <2 x double>, i1 } %[[LOAD]], 0
+  ; CHECK: %[[INSERT:.*]] = insertelement <2 x double> %[[VEC]], double %scalar, i32 0
   ; CHECK: call void @llvm.dx.resource.store.typedbuffer.tdx.TypedBuffer_v2f64_1_0_0t.v2f64(target("dx.TypedBuffer", <2 x double>, 1, 0, 0) %buffer, i32 %index, <2 x double> %[[INSERT]])
   store double %scalar, ptr %ptr
 
   ; Store just the .y component
-  ; CHECK: %[[LOAD:.*]] = call <2 x double> @llvm.dx.resource.load.typedbuffer.v2f64.tdx.TypedBuffer_v2f64_1_0_0t(target("dx.TypedBuffer", <2 x double>, 1, 0, 0) %buffer, i32 %index)
-  ; CHECK: %[[INSERT:.*]] = insertelement <2 x double> %[[LOAD]], double %scalar, i32 1
+  ; CHECK: %[[LOAD:.*]] = call { <2 x double>, i1 } @llvm.dx.resource.load.typedbuffer.v2f64.tdx.TypedBuffer_v2f64_1_0_0t(target("dx.TypedBuffer", <2 x double>, 1, 0, 0) %buffer, i32 %index)
+  ; CHECK: %[[VEC:.*]] = extractvalue { <2 x double>, i1 } %[[LOAD]], 0
+  ; CHECK: %[[INSERT:.*]] = insertelement <2 x double> %[[VEC]], double %scalar, i32 1
   ; CHECK: call void @llvm.dx.resource.store.typedbuffer.tdx.TypedBuffer_v2f64_1_0_0t.v2f64(target("dx.TypedBuffer", <2 x double>, 1, 0, 0) %buffer, i32 %index, <2 x double> %[[INSERT]])
   %y_ptr = getelementptr inbounds i8, ptr %ptr, i32 8
   store double %scalar, ptr %y_ptr
diff --git a/llvm/test/CodeGen/DirectX/ResourceGlobalElimination.ll b/llvm/test/CodeGen/DirectX/ResourceGlobalElimination.ll
index c837b36..cd21adc 100644
--- a/llvm/test/CodeGen/DirectX/ResourceGlobalElimination.ll
+++ b/llvm/test/CodeGen/DirectX/ResourceGlobalElimination.ll
@@ -29,18 +29,20 @@ entry:
   %0 = call i32 @llvm.dx.flattened.thread.id.in.group()
   ; CHECK-NOT: load {{.*}} ptr @In
   %1 = load target("dx.TypedBuffer", <4 x float>, 1, 0, 0), ptr @In, align 4
-  ; CSE: call noundef <4 x float> @llvm.dx.resource.load.typedbuffer.v4f32.tdx.TypedBuffer_v4f32_1_0_0t
-  %2 = call noundef <4 x float> @llvm.dx.resource.load.typedbuffer.v4f32.tdx.TypedBuffer_v4f32_1_0_0t(target("dx.TypedBuffer", <4 x float>, 1, 0, 0) %1, i32 %0)
+  ; CSE: call noundef { <4 x float>, i1 } @llvm.dx.resource.load.typedbuffer.v4f32.tdx.TypedBuffer_v4f32_1_0_0t
+  %load = call noundef {<4 x float>, i1} @llvm.dx.resource.load.typedbuffer.v4f32.tdx.TypedBuffer_v4f32_1_0_0t(target("dx.TypedBuffer", <4 x float>, 1, 0, 0) %1, i32 %0)
+  %2 = extractvalue {<4 x float>, i1} %load, 0
   ; CHECK-NOT: load {{.*}} ptr @In
   %3 = load target("dx.TypedBuffer", <4 x float>, 1, 0, 0), ptr @In, align 4
-  %4 = call noundef <4 x float> @llvm.dx.resource.load.typedbuffer.v4f32.tdx.TypedBuffer_v4f32_1_0_0t(target("dx.TypedBuffer", <4 x float>, 1, 0, 0) %3, i32 %0)
+  %load2 = call noundef {<4 x float>, i1} @llvm.dx.resource.load.typedbuffer.v4f32.tdx.TypedBuffer_v4f32_1_0_0t(target("dx.TypedBuffer", <4 x float>, 1, 0, 0) %3, i32 %0)
+  %4 = extractvalue {<4 x float>, i1} %load2, 0
   %add.i = fadd <4 x float> %2, %4
   call void @llvm.dx.resource.store.typedbuffer.tdx.TypedBuffer_v4f32_1_0_0t.v4f32(target("dx.TypedBuffer", <4 x float>, 1, 0, 0) %Out_h.i, i32 %0, <4 x float> %add.i)
   ; CHECK: ret void
   ret void
 }
 
-; CSE-DAG: declare <4 x float> @llvm.dx.resource.load.typedbuffer.v4f32.tdx.TypedBuffer_v4f32_1_0_0t(target("dx.TypedBuffer", <4 x float>, 1, 0, 0), i32) [[ROAttr:#[0-9]+]]
+; CSE-DAG: declare { <4 x float>, i1 } @llvm.dx.resource.load.typedbuffer.v4f32.tdx.TypedBuffer_v4f32_1_0_0t(target("dx.TypedBuffer", <4 x float>, 1, 0, 0), i32) [[ROAttr:#[0-9]+]]
 ; CSE-DAG: declare void @llvm.dx.resource.store.typedbuffer.tdx.TypedBuffer_v4f32_1_0_0t.v4f32(target("dx.TypedBuffer", <4 x float>, 1, 0, 0), i32, <4 x float>) [[WOAttr:#[0-9]+]]
 
 attributes #0 = { convergent noinline norecurse "frame-pointer"="all" "hlsl.numthreads"="8,1,1" "hlsl.shader"="compute" "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
diff --git a/llvm/test/CodeGen/DirectX/ShaderFlags/typed-uav-load-additional-formats.ll b/llvm/test/CodeGen/DirectX/ShaderFlags/typed-uav-load-additional-formats.ll
index 2622335..060d54f 100644
--- a/llvm/test/CodeGen/DirectX/ShaderFlags/typed-uav-load-additional-formats.ll
+++ b/llvm/test/CodeGen/DirectX/ShaderFlags/typed-uav-load-additional-formats.ll
@@ -17,8 +17,9 @@ target triple = "dxil-pc-shadermodel6.7-library"
 define <4 x float> @multicomponent() #0 {
   %res = call target("dx.TypedBuffer", <4 x float>, 1, 0, 0)
       @llvm.dx.resource.handlefrombinding(i32 0, i32 0, i32 1, i32 0, i1 false)
-  %val = call <4 x float> @llvm.dx.resource.load.typedbuffer(
+  %load = call {<4 x float>, i1} @llvm.dx.resource.load.typedbuffer(
       target("dx.TypedBuffer", <4 x float>, 1, 0, 0) %res, i32 0)
+  %val = extractvalue {<4 x float>, i1} %load, 0
   ret <4 x float> %val
 }
 
@@ -26,8 +27,9 @@ define <4 x float> @multicomponent() #0 {
 define float @onecomponent() #0 {
   %res = call target("dx.TypedBuffer", float, 1, 0, 0)
       @llvm.dx.resource.handlefrombinding(i32 0, i32 0, i32 1, i32 0, i1 false)
-  %val = call float @llvm.dx.resource.load.typedbuffer(
+  %load = call {float, i1} @llvm.dx.resource.load.typedbuffer(
       target("dx.TypedBuffer", float, 1, 0, 0) %res, i32 0)
+  %val = extractvalue {float, i1} %load, 0
   ret float %val
 }
 
diff --git a/llvm/test/CodeGen/Hexagon/loopIdiom.ll b/llvm/test/CodeGen/Hexagon/loopIdiom.ll
new file mode 100644
index 0000000..9c3df67
--- /dev/null
+++ b/llvm/test/CodeGen/Hexagon/loopIdiom.ll
@@ -0,0 +1,75 @@
+; RUN: opt -debug -S -march=hexagon -O2  < %s | FileCheck %s
+; REQUIRES: asserts
+; CHECK: define dso_local void @complexMultAccum
+target triple = "hexagon"
+
+; Function Attrs: noinline nounwind
+define dso_local void @complexMultAccum(i32 noundef %n) #0 {
+entry:
+  %n.addr = alloca i32, align 4
+  %run_c_code = alloca i8, align 1
+  %run_asm_code = alloca i8, align 1
+  %iOutter = alloca i32, align 4
+  %iOutter1 = alloca i32, align 4
+  store i32 %n, ptr %n.addr, align 4
+  store i8 1, ptr %run_c_code, align 1
+  store i8 0, ptr %run_asm_code, align 1
+  %0 = load i8, ptr %run_c_code, align 1
+  %tobool = icmp ne i8 %0, 0
+  br i1 %tobool, label %if.then, label %if.end
+
+if.then:                                          ; preds = %entry
+  store i32 0, ptr %iOutter, align 4
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc, %if.then
+  %1 = load i32, ptr %iOutter, align 4
+  %cmp = icmp slt i32 %1, 2
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body
+  %2 = load i32, ptr %iOutter, align 4
+  %inc = add nsw i32 %2, 1
+  store i32 %inc, ptr %iOutter, align 4
+  br label %for.cond, !llvm.loop !3
+
+for.end:                                          ; preds = %for.cond
+  store i32 0, ptr %iOutter1, align 4
+  br label %for.cond2
+
+for.cond2:                                        ; preds = %for.inc5, %for.end
+  %3 = load i32, ptr %iOutter1, align 4
+  %cmp3 = icmp slt i32 %3, 2
+  br i1 %cmp3, label %for.body4, label %for.end7
+
+for.body4:                                        ; preds = %for.cond2
+  br label %for.inc5
+
+for.inc5:                                         ; preds = %for.body4
+  %4 = load i32, ptr %iOutter1, align 4
+  %inc6 = add nsw i32 %4, 1
+  store i32 %inc6, ptr %iOutter1, align 4
+  br label %for.cond2, !llvm.loop !5
+
+for.end7:                                         ; preds = %for.cond2
+  br label %if.end
+
+if.end:                                           ; preds = %for.end7, %entry
+  ret void
+}
+
+attributes #0 = { noinline nounwind "approx-func-fp-math"="true" "frame-pointer"="all" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="hexagonv79" "target-features"="+v79,-long-calls" "unsafe-fp-math"="true" }
+
+!llvm.module.flags = !{!0, !1}
+!llvm.ident = !{!2}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{i32 7, !"frame-pointer", i32 2}
+!2 = !{!"LLVM Clang"}
+!3 = distinct !{!3, !4}
+!4 = !{!"llvm.loop.mustprogress"}
+!5 = distinct !{!5, !4}
+
diff --git a/llvm/test/CodeGen/LoongArch/machinelicm-address-pseudos.ll b/llvm/test/CodeGen/LoongArch/machinelicm-address-pseudos.ll
index fc0c7ad..e0a93e3 100644
--- a/llvm/test/CodeGen/LoongArch/machinelicm-address-pseudos.ll
+++ b/llvm/test/CodeGen/LoongArch/machinelicm-address-pseudos.ll
@@ -315,9 +315,9 @@ define void @test_la_tls_le(i32 signext %n) {
 ; LA32-LABEL: test_la_tls_le:
 ; LA32:       # %bb.0: # %entry
 ; LA32-NEXT:    move $a1, $zero
-; LA32-NEXT:    lu12i.w $a2, %le_hi20(le)
-; LA32-NEXT:    ori $a2, $a2, %le_lo12(le)
-; LA32-NEXT:    add.w $a2, $a2, $tp
+; LA32-NEXT:    lu12i.w $a2, %le_hi20_r(le)
+; LA32-NEXT:    add.w $a2, $a2, $tp, %le_add_r(le)
+; LA32-NEXT:    addi.w $a2, $a2, %le_lo12_r(le)
 ; LA32-NEXT:    .p2align 4, , 16
 ; LA32-NEXT:  .LBB4_1: # %loop
 ; LA32-NEXT:    # =>This Inner Loop Header: Depth=1
@@ -330,12 +330,13 @@ define void @test_la_tls_le(i32 signext %n) {
 ; LA64-LABEL: test_la_tls_le:
 ; LA64:       # %bb.0: # %entry
 ; LA64-NEXT:    move $a1, $zero
-; LA64-NEXT:    lu12i.w $a2, %le_hi20(le)
-; LA64-NEXT:    ori $a2, $a2, %le_lo12(le)
+; LA64-NEXT:    lu12i.w $a2, %le_hi20_r(le)
+; LA64-NEXT:    add.d $a2, $a2, $tp, %le_add_r(le)
+; LA64-NEXT:    addi.d $a2, $a2, %le_lo12_r(le)
 ; LA64-NEXT:    .p2align 4, , 16
 ; LA64-NEXT:  .LBB4_1: # %loop
 ; LA64-NEXT:    # =>This Inner Loop Header: Depth=1
-; LA64-NEXT:    ldx.w $zero, $a2, $tp
+; LA64-NEXT:    ld.w $zero, $a2, 0
 ; LA64-NEXT:    addi.w $a1, $a1, 1
 ; LA64-NEXT:    blt $a1, $a0, .LBB4_1
 ; LA64-NEXT:  # %bb.2: # %ret
diff --git a/llvm/test/CodeGen/LoongArch/mir-target-flags.ll b/llvm/test/CodeGen/LoongArch/mir-target-flags.ll
index f530e3e..3bc8a8d 100644
--- a/llvm/test/CodeGen/LoongArch/mir-target-flags.ll
+++ b/llvm/test/CodeGen/LoongArch/mir-target-flags.ll
@@ -28,8 +28,9 @@ define void @caller() nounwind {
 ; CHECK-NEXT: target-flags(loongarch-got-pc-lo) @t_ld
 ; CHECK:      target-flags(loongarch-ie-pc-hi) @t_ie
 ; CHECK-NEXT: target-flags(loongarch-ie-pc-lo) @t_ie
-; CHECK:      target-flags(loongarch-le-hi) @t_le
-; CHECK-NEXT: target-flags(loongarch-le-lo) @t_le
+; CHECK:      target-flags(loongarch-le-hi-r) @t_le
+; CHECK-NEXT: target-flags(loongarch-le-add-r) @t_le
+; CHECK-NEXT: target-flags(loongarch-le-lo-r) @t_le
 ; CHECK:      target-flags(loongarch-call-plt) @callee1
 ; CHECK:      target-flags(loongarch-call) @callee2
   %a = load volatile i32, ptr @g_e
diff --git a/llvm/test/CodeGen/LoongArch/psabi-restricted-scheduling.ll b/llvm/test/CodeGen/LoongArch/psabi-restricted-scheduling.ll
index c7de3dc..3390f7f 100644
--- a/llvm/test/CodeGen/LoongArch/psabi-restricted-scheduling.ll
+++ b/llvm/test/CodeGen/LoongArch/psabi-restricted-scheduling.ll
@@ -1,4 +1,3 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
 ; RUN: llc --mtriple=loongarch64 -mattr=+d --code-model=medium --relocation-model=pic --post-RA-scheduler=0 < %s \
 ; RUN:     | FileCheck %s --check-prefix=MEDIUM_NO_SCH
 ; RUN: llc --mtriple=loongarch64 -mattr=+d --code-model=medium --relocation-model=pic --post-RA-scheduler=1 < %s \
@@ -7,6 +6,14 @@
 ; RUN:     | FileCheck %s --check-prefix=LARGE_NO_SCH
 ; RUN: llc --mtriple=loongarch64 -mattr=+d --code-model=large --relocation-model=pic --post-RA-scheduler=1 < %s \
 ; RUN:     | FileCheck %s --check-prefix=LARGE_SCH
+; RUN: llc --mtriple=loongarch64 -mattr=+d --code-model=medium --relocation-model=pic --enable-tlsdesc \
+; RUN:     --post-RA-scheduler=0 < %s | FileCheck %s --check-prefix=MEDIUMDESC_NO_SCH
+; RUN: llc --mtriple=loongarch64 -mattr=+d --code-model=medium --relocation-model=pic --enable-tlsdesc \
+; RUN:     --post-RA-scheduler=1 < %s | FileCheck %s --check-prefix=MEDIUMDESC_SCH
+; RUN: llc --mtriple=loongarch64 -mattr=+d --code-model=large --relocation-model=pic --enable-tlsdesc \
+; RUN:     --post-RA-scheduler=0 < %s | FileCheck %s --check-prefix=LARGEDESC_NO_SCH
+; RUN: llc --mtriple=loongarch64 -mattr=+d --code-model=large --relocation-model=pic --enable-tlsdesc \
+; RUN:     --post-RA-scheduler=1 < %s | FileCheck %s --check-prefix=LARGEDESC_SCH
 
 @g = dso_local global i64 zeroinitializer, align 4
 @G = global i64 zeroinitializer, align 4
@@ -194,3 +201,69 @@ define void @foo() nounwind {
   %v_ie = load volatile i64, ptr @ie
   ret void
 }
+
+define void @baz() nounwind {
+; MEDIUMDESC_NO_SCH-LABEL: baz:
+; MEDIUMDESC_NO_SCH:       # %bb.0:
+; MEDIUMDESC_NO_SCH-NEXT:    addi.d $sp, $sp, -16
+; MEDIUMDESC_NO_SCH-NEXT:    st.d $ra, $sp, 8 # 8-byte Folded Spill
+; MEDIUMDESC_NO_SCH-NEXT:    pcalau12i $a0, %desc_pc_hi20(gd)
+; MEDIUMDESC_NO_SCH-NEXT:    addi.d $a0, $a0, %desc_pc_lo12(gd)
+; MEDIUMDESC_NO_SCH-NEXT:    ld.d $ra, $a0, %desc_ld(gd)
+; MEDIUMDESC_NO_SCH-NEXT:    jirl $ra, $ra, %desc_call(gd)
+; MEDIUMDESC_NO_SCH-NEXT:    add.d $a0, $a0, $tp
+; MEDIUMDESC_NO_SCH-NEXT:    ld.d $zero, $a0, 0
+; MEDIUMDESC_NO_SCH-NEXT:    ld.d $ra, $sp, 8 # 8-byte Folded Reload
+; MEDIUMDESC_NO_SCH-NEXT:    addi.d $sp, $sp, 16
+; MEDIUMDESC_NO_SCH-NEXT:    ret
+;
+; MEDIUMDESC_SCH-LABEL: baz:
+; MEDIUMDESC_SCH:       # %bb.0:
+; MEDIUMDESC_SCH-NEXT:    addi.d $sp, $sp, -16
+; MEDIUMDESC_SCH-NEXT:    st.d $ra, $sp, 8 # 8-byte Folded Spill
+; MEDIUMDESC_SCH-NEXT:    pcalau12i $a0, %desc_pc_hi20(gd)
+; MEDIUMDESC_SCH-NEXT:    addi.d $a0, $a0, %desc_pc_lo12(gd)
+; MEDIUMDESC_SCH-NEXT:    ld.d $ra, $a0, %desc_ld(gd)
+; MEDIUMDESC_SCH-NEXT:    jirl $ra, $ra, %desc_call(gd)
+; MEDIUMDESC_SCH-NEXT:    add.d $a0, $a0, $tp
+; MEDIUMDESC_SCH-NEXT:    ld.d $zero, $a0, 0
+; MEDIUMDESC_SCH-NEXT:    ld.d $ra, $sp, 8 # 8-byte Folded Reload
+; MEDIUMDESC_SCH-NEXT:    addi.d $sp, $sp, 16
+; MEDIUMDESC_SCH-NEXT:    ret
+;
+; LARGEDESC_NO_SCH-LABEL: baz:
+; LARGEDESC_NO_SCH:       # %bb.0:
+; LARGEDESC_NO_SCH-NEXT:    addi.d $sp, $sp, -16
+; LARGEDESC_NO_SCH-NEXT:    st.d $ra, $sp, 8 # 8-byte Folded Spill
+; LARGEDESC_NO_SCH-NEXT:    pcalau12i $a0, %desc_pc_hi20(gd)
+; LARGEDESC_NO_SCH-NEXT:    addi.d $a1, $zero, %desc_pc_lo12(gd)
+; LARGEDESC_NO_SCH-NEXT:    lu32i.d $a1, %desc64_pc_lo20(gd)
+; LARGEDESC_NO_SCH-NEXT:    lu52i.d $a1, $a1, %desc64_pc_hi12(gd)
+; LARGEDESC_NO_SCH-NEXT:    add.d $a0, $a0, $a1
+; LARGEDESC_NO_SCH-NEXT:    ld.d $ra, $a0, %desc_ld(gd)
+; LARGEDESC_NO_SCH-NEXT:    jirl $ra, $ra, %desc_call(gd)
+; LARGEDESC_NO_SCH-NEXT:    add.d $a0, $a0, $tp
+; LARGEDESC_NO_SCH-NEXT:    ld.d $zero, $a0, 0
+; LARGEDESC_NO_SCH-NEXT:    ld.d $ra, $sp, 8 # 8-byte Folded Reload
+; LARGEDESC_NO_SCH-NEXT:    addi.d $sp, $sp, 16
+; LARGEDESC_NO_SCH-NEXT:    ret
+;
+; LARGEDESC_SCH-LABEL: baz:
+; LARGEDESC_SCH:       # %bb.0:
+; LARGEDESC_SCH-NEXT:    addi.d $sp, $sp, -16
+; LARGEDESC_SCH-NEXT:    st.d $ra, $sp, 8 # 8-byte Folded Spill
+; LARGEDESC_SCH-NEXT:    pcalau12i $a0, %desc_pc_hi20(gd)
+; LARGEDESC_SCH-NEXT:    addi.d $a1, $zero, %desc_pc_lo12(gd)
+; LARGEDESC_SCH-NEXT:    lu32i.d $a1, %desc64_pc_lo20(gd)
+; LARGEDESC_SCH-NEXT:    lu52i.d $a1, $a1, %desc64_pc_hi12(gd)
+; LARGEDESC_SCH-NEXT:    add.d $a0, $a0, $a1
+; LARGEDESC_SCH-NEXT:    ld.d $ra, $a0, %desc_ld(gd)
+; LARGEDESC_SCH-NEXT:    jirl $ra, $ra, %desc_call(gd)
+; LARGEDESC_SCH-NEXT:    add.d $a0, $a0, $tp
+; LARGEDESC_SCH-NEXT:    ld.d $zero, $a0, 0
+; LARGEDESC_SCH-NEXT:    ld.d $ra, $sp, 8 # 8-byte Folded Reload
+; LARGEDESC_SCH-NEXT:    addi.d $sp, $sp, 16
+; LARGEDESC_SCH-NEXT:    ret
+  %v_gd = load volatile i64, ptr @gd
+  ret void
+}
diff --git a/llvm/test/CodeGen/LoongArch/tls-models.ll b/llvm/test/CodeGen/LoongArch/tls-models.ll
index dbd7bf6..e3a8ace 100644
--- a/llvm/test/CodeGen/LoongArch/tls-models.ll
+++ b/llvm/test/CodeGen/LoongArch/tls-models.ll
@@ -330,16 +330,16 @@ entry:
 define ptr @f4() nounwind {
 ; LA32PIC-LABEL: f4:
 ; LA32PIC:       # %bb.0: # %entry
-; LA32PIC-NEXT:    lu12i.w $a0, %le_hi20(le)
-; LA32PIC-NEXT:    ori $a0, $a0, %le_lo12(le)
-; LA32PIC-NEXT:    add.w $a0, $a0, $tp
+; LA32PIC-NEXT:    lu12i.w $a0, %le_hi20_r(le)
+; LA32PIC-NEXT:    add.w $a0, $a0, $tp, %le_add_r(le)
+; LA32PIC-NEXT:    addi.w $a0, $a0, %le_lo12_r(le)
 ; LA32PIC-NEXT:    ret
 ;
 ; LA64PIC-LABEL: f4:
 ; LA64PIC:       # %bb.0: # %entry
-; LA64PIC-NEXT:    lu12i.w $a0, %le_hi20(le)
-; LA64PIC-NEXT:    ori $a0, $a0, %le_lo12(le)
-; LA64PIC-NEXT:    add.d $a0, $a0, $tp
+; LA64PIC-NEXT:    lu12i.w $a0, %le_hi20_r(le)
+; LA64PIC-NEXT:    add.d $a0, $a0, $tp, %le_add_r(le)
+; LA64PIC-NEXT:    addi.d $a0, $a0, %le_lo12_r(le)
 ; LA64PIC-NEXT:    ret
 ;
 ; LA64LARGEPIC-LABEL: f4:
@@ -353,16 +353,16 @@ define ptr @f4() nounwind {
 ;
 ; LA32NOPIC-LABEL: f4:
 ; LA32NOPIC:       # %bb.0: # %entry
-; LA32NOPIC-NEXT:    lu12i.w $a0, %le_hi20(le)
-; LA32NOPIC-NEXT:    ori $a0, $a0, %le_lo12(le)
-; LA32NOPIC-NEXT:    add.w $a0, $a0, $tp
+; LA32NOPIC-NEXT:    lu12i.w $a0, %le_hi20_r(le)
+; LA32NOPIC-NEXT:    add.w $a0, $a0, $tp, %le_add_r(le)
+; LA32NOPIC-NEXT:    addi.w $a0, $a0, %le_lo12_r(le)
 ; LA32NOPIC-NEXT:    ret
 ;
 ; LA64NOPIC-LABEL: f4:
 ; LA64NOPIC:       # %bb.0: # %entry
-; LA64NOPIC-NEXT:    lu12i.w $a0, %le_hi20(le)
-; LA64NOPIC-NEXT:    ori $a0, $a0, %le_lo12(le)
-; LA64NOPIC-NEXT:    add.d $a0, $a0, $tp
+; LA64NOPIC-NEXT:    lu12i.w $a0, %le_hi20_r(le)
+; LA64NOPIC-NEXT:    add.d $a0, $a0, $tp, %le_add_r(le)
+; LA64NOPIC-NEXT:    addi.d $a0, $a0, %le_lo12_r(le)
 ; LA64NOPIC-NEXT:    ret
 ;
 ; LA64LARGENOPIC-LABEL: f4:
@@ -376,16 +376,16 @@ define ptr @f4() nounwind {
 ;
 ; LA32DESC-LABEL: f4:
 ; LA32DESC:       # %bb.0: # %entry
-; LA32DESC-NEXT:    lu12i.w $a0, %le_hi20(le)
-; LA32DESC-NEXT:    ori $a0, $a0, %le_lo12(le)
-; LA32DESC-NEXT:    add.w $a0, $a0, $tp
+; LA32DESC-NEXT:    lu12i.w $a0, %le_hi20_r(le)
+; LA32DESC-NEXT:    add.w $a0, $a0, $tp, %le_add_r(le)
+; LA32DESC-NEXT:    addi.w $a0, $a0, %le_lo12_r(le)
 ; LA32DESC-NEXT:    ret
 ;
 ; LA64DESC-LABEL: f4:
 ; LA64DESC:       # %bb.0: # %entry
-; LA64DESC-NEXT:    lu12i.w $a0, %le_hi20(le)
-; LA64DESC-NEXT:    ori $a0, $a0, %le_lo12(le)
-; LA64DESC-NEXT:    add.d $a0, $a0, $tp
+; LA64DESC-NEXT:    lu12i.w $a0, %le_hi20_r(le)
+; LA64DESC-NEXT:    add.d $a0, $a0, $tp, %le_add_r(le)
+; LA64DESC-NEXT:    addi.d $a0, $a0, %le_lo12_r(le)
 ; LA64DESC-NEXT:    ret
 ;
 ; DESC64-LABEL: f4:
diff --git a/llvm/test/CodeGen/MIR/AArch64/return-address-signing.mir b/llvm/test/CodeGen/MIR/AArch64/return-address-signing.mir
index d2b063a..b2abff7 100644
--- a/llvm/test/CodeGen/MIR/AArch64/return-address-signing.mir
+++ b/llvm/test/CodeGen/MIR/AArch64/return-address-signing.mir
@@ -25,8 +25,8 @@ alignment:       4
 tracksRegLiveness: true
 frameInfo:
   maxCallFrameSize: 0
-#CHECK:    frame-setup PACIASP implicit-def $lr, implicit $lr, implicit $sp
 #CHECK:    frame-setup CFI_INSTRUCTION negate_ra_sign_state
+#CHECK:    frame-setup PACIASP implicit-def $lr, implicit $lr, implicit $sp
 #CHECK:    frame-destroy AUTIASP implicit-def $lr, implicit $lr, implicit $sp
 body:             |
   bb.0.entry:
@@ -42,8 +42,8 @@ tracksRegLiveness: true
 frameInfo:
   maxCallFrameSize: 0
 #CHECK:    frame-setup EMITBKEY
-#CHECK:    frame-setup PACIBSP implicit-def $lr, implicit $lr, implicit $sp
 #CHECK:    frame-setup CFI_INSTRUCTION negate_ra_sign_state
+#CHECK:    frame-setup PACIBSP implicit-def $lr, implicit $lr, implicit $sp
 #CHECK:    frame-destroy AUTIBSP implicit-def $lr, implicit $lr, implicit $sp
 body:             |
   bb.0.entry:
@@ -59,8 +59,8 @@ tracksRegLiveness: true
 frameInfo:
   maxCallFrameSize: 0
 #CHECK:    frame-setup PACM
-#CHECK:    frame-setup PACIASP implicit-def $lr, implicit $lr, implicit $sp, pre-instr-symbol <mcsymbol >
 #CHECK:    frame-setup CFI_INSTRUCTION negate_ra_sign_state_with_pc
+#CHECK:    frame-setup PACIASP implicit-def $lr, implicit $lr, implicit $sp, pre-instr-symbol <mcsymbol >
 #CHECK:    frame-destroy PACM
 #CHECK:    frame-destroy AUTIASP implicit-def $lr, implicit $lr, implicit $sp
 body:             |
diff --git a/llvm/test/CodeGen/NVPTX/b52037.ll b/llvm/test/CodeGen/NVPTX/b52037.ll
index 5d1c390..b6317df 100644
--- a/llvm/test/CodeGen/NVPTX/b52037.ll
+++ b/llvm/test/CodeGen/NVPTX/b52037.ll
@@ -39,7 +39,7 @@ declare %int3 @hoge(i32, i32, i32) local_unnamed_addr
 
 declare i64 @foo() local_unnamed_addr
 
-define void @barney(ptr nocapture readonly %arg) local_unnamed_addr {
+define ptx_kernel void @barney(ptr nocapture readonly %arg) local_unnamed_addr {
 bb:
   tail call void asm sideeffect "// KEEP", ""() #1
   %tmp = alloca %struct.zot, align 16
@@ -210,9 +210,6 @@ bb14:                                             ; preds = %bb49.i.lr.ph, %bb49
 attributes #0 = { argmemonly mustprogress nofree nounwind willreturn }
 attributes #1 = { nounwind }
 
-!nvvm.annotations = !{!0}
-
-!0 = !{ptr @barney, !"kernel", i32 1}
 !1 = !{!2, !11, i64 64}
 !2 = !{!"_ZTSN7cuneibs22neiblist_iterator_coreE", !3, i64 0, !3, i64 8, !6, i64 16, !8, i64 32, !9, i64 44, !10, i64 48, !11, i64 64, !9, i64 72, !4, i64 76, !9, i64 80}
 !3 = !{!"any pointer", !4, i64 0}
diff --git a/llvm/test/CodeGen/NVPTX/bf16x2-instructions.ll b/llvm/test/CodeGen/NVPTX/bf16x2-instructions.ll
index 03cdeb9..8be3a66 100644
--- a/llvm/test/CodeGen/NVPTX/bf16x2-instructions.ll
+++ b/llvm/test/CodeGen/NVPTX/bf16x2-instructions.ll
@@ -182,8 +182,8 @@ define <2 x bfloat> @test_fneg(<2 x bfloat> %a) #0 {
 ; CHECK-NEXT:    .reg .b32 %r<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u32 %r1, [test_fneg_param_0];
-; CHECK-NEXT:    xor.b32 %r2, %r1, -2147450880;
+; CHECK-NEXT:    ld.param.b32 %r1, [test_fneg_param_0];
+; CHECK-NEXT:    neg.bf16x2 %r2, %r1;
 ; CHECK-NEXT:    st.param.b32 [func_retval0], %r2;
 ; CHECK-NEXT:    ret;
   %r = fneg <2 x bfloat> %a
@@ -532,8 +532,8 @@ define <2 x bfloat> @test_fabs(<2 x bfloat> %a) #0 {
 ; CHECK-NEXT:    .reg .b32 %r<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u32 %r1, [test_fabs_param_0];
-; CHECK-NEXT:    and.b32 %r2, %r1, 2147450879;
+; CHECK-NEXT:    ld.param.b32 %r1, [test_fabs_param_0];
+; CHECK-NEXT:    abs.bf16x2 %r2, %r1;
 ; CHECK-NEXT:    st.param.b32 [func_retval0], %r2;
 ; CHECK-NEXT:    ret;
   %r = call <2 x bfloat> @llvm.fabs.f16(<2 x bfloat> %a)
diff --git a/llvm/test/CodeGen/NVPTX/bug21465.ll b/llvm/test/CodeGen/NVPTX/bug21465.ll
index 9b1f104..76300e3 100644
--- a/llvm/test/CodeGen/NVPTX/bug21465.ll
+++ b/llvm/test/CodeGen/NVPTX/bug21465.ll
@@ -8,7 +8,7 @@ target triple = "nvptx64-unknown-unknown"
 %struct.S = type { i32, i32 }
 
 ; Function Attrs: nounwind
-define void @_Z11TakesStruct1SPi(ptr byval(%struct.S) nocapture readonly %input, ptr nocapture %output) #0 {
+define ptx_kernel void @_Z11TakesStruct1SPi(ptr byval(%struct.S) nocapture readonly %input, ptr nocapture %output) #0 {
 entry:
 ; CHECK-LABEL: @_Z11TakesStruct1SPi
 ; PTX-LABEL: .visible .entry _Z11TakesStruct1SPi(
@@ -23,7 +23,3 @@ entry:
 }
 
 attributes #0 = { nounwind "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
-!nvvm.annotations = !{!0}
-
-!0 = !{ptr @_Z11TakesStruct1SPi, !"kernel", i32 1}
diff --git a/llvm/test/CodeGen/NVPTX/bug22322.ll b/llvm/test/CodeGen/NVPTX/bug22322.ll
index e3656fd..ace3166 100644
--- a/llvm/test/CodeGen/NVPTX/bug22322.ll
+++ b/llvm/test/CodeGen/NVPTX/bug22322.ll
@@ -8,7 +8,7 @@ target triple = "nvptx64-nvidia-cuda"
 
 ; Function Attrs: nounwind
 ; CHECK-LABEL: some_kernel
-define void @some_kernel(ptr nocapture %dst) #0 {
+define ptx_kernel void @some_kernel(ptr nocapture %dst) #0 {
 _ZL11compute_vecRK6float3jb.exit:
   %ret_vec.sroa.8.i = alloca float, align 4
   %0 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x()
@@ -55,8 +55,5 @@ attributes #0 = { nounwind "less-precise-fpmad"="false" "frame-pointer"="all" "n
 attributes #1 = { nounwind readnone }
 attributes #2 = { nounwind }
 
-!nvvm.annotations = !{!0}
 !llvm.ident = !{!1}
-
-!0 = !{ptr @some_kernel, !"kernel", i32 1}
 !1 = !{!"clang version 3.5.1 (tags/RELEASE_351/final)"}
diff --git a/llvm/test/CodeGen/NVPTX/bug26185.ll b/llvm/test/CodeGen/NVPTX/bug26185.ll
index 00c97fb..193df7f 100644
--- a/llvm/test/CodeGen/NVPTX/bug26185.ll
+++ b/llvm/test/CodeGen/NVPTX/bug26185.ll
@@ -8,7 +8,7 @@ target datalayout = "e-i64:64-v16:16-v32:32-n16:32:64"
 target triple = "nvptx64-unknown-unknown"
 
 ; CHECK-LABEL: ex_zext
-define void @ex_zext(ptr noalias readonly %data, ptr %res) {
+define ptx_kernel void @ex_zext(ptr noalias readonly %data, ptr %res) {
 entry:
 ; CHECK: ld.global.nc.u8
   %val = load i8, ptr %data
@@ -19,7 +19,7 @@ entry:
 }
 
 ; CHECK-LABEL: ex_sext
-define void @ex_sext(ptr noalias readonly %data, ptr %res) {
+define ptx_kernel void @ex_sext(ptr noalias readonly %data, ptr %res) {
 entry:
 ; CHECK: ld.global.nc.u8
   %val = load i8, ptr %data
@@ -30,7 +30,7 @@ entry:
 }
 
 ; CHECK-LABEL: ex_zext_v2
-define void @ex_zext_v2(ptr noalias readonly %data, ptr %res) {
+define ptx_kernel void @ex_zext_v2(ptr noalias readonly %data, ptr %res) {
 entry:
 ; CHECK: ld.global.nc.v2.u8
   %val = load <2 x i8>, ptr %data
@@ -41,7 +41,7 @@ entry:
 }
 
 ; CHECK-LABEL: ex_sext_v2
-define void @ex_sext_v2(ptr noalias readonly %data, ptr %res) {
+define ptx_kernel void @ex_sext_v2(ptr noalias readonly %data, ptr %res) {
 entry:
 ; CHECK: ld.global.nc.v2.u8
   %val = load <2 x i8>, ptr %data
@@ -51,8 +51,3 @@ entry:
   ret void
 }
 
-!nvvm.annotations = !{!0,!1,!2,!3}
-!0 = !{ptr @ex_zext, !"kernel", i32 1}
-!1 = !{ptr @ex_sext, !"kernel", i32 1}
-!2 = !{ptr @ex_zext_v2, !"kernel", i32 1}
-!3 = !{ptr @ex_sext_v2, !"kernel", i32 1}
diff --git a/llvm/test/CodeGen/NVPTX/call-with-alloca-buffer.ll b/llvm/test/CodeGen/NVPTX/call-with-alloca-buffer.ll
index 19f4ef8..1c9d271 100644
--- a/llvm/test/CodeGen/NVPTX/call-with-alloca-buffer.ll
+++ b/llvm/test/CodeGen/NVPTX/call-with-alloca-buffer.ll
@@ -16,7 +16,7 @@
 ;  }
 
 ; CHECK: .visible .entry kernel_func
-define void @kernel_func(ptr %a) {
+define ptx_kernel void @kernel_func(ptr %a) {
 entry:
   %buf = alloca [16 x i8], align 4
 
@@ -56,7 +56,3 @@ entry:
 }
 
 declare void @callee(ptr, ptr)
-
-!nvvm.annotations = !{!0}
-
-!0 = !{ptr @kernel_func, !"kernel", i32 1}
diff --git a/llvm/test/CodeGen/NVPTX/cluster-dim.ll b/llvm/test/CodeGen/NVPTX/cluster-dim.ll
index c9258ad..9275c89 100644
--- a/llvm/test/CodeGen/NVPTX/cluster-dim.ll
+++ b/llvm/test/CodeGen/NVPTX/cluster-dim.ll
@@ -3,7 +3,7 @@
 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_90 | FileCheck -check-prefixes=CHECK90 %s
 ; RUN: %if ptxas-12.0 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_90 | %ptxas-verify -arch=sm_90 %}
 
-define void @kernel_func_clusterxyz() {
+define ptx_kernel void @kernel_func_clusterxyz() {
 ; CHECK80-LABEL: kernel_func_clusterxyz(
 ; CHECK80:       {
 ; CHECK80-EMPTY:
@@ -23,7 +23,6 @@ define void @kernel_func_clusterxyz() {
 }
 
 
-!nvvm.annotations = !{!1, !2}
+!nvvm.annotations = !{!1}
 
-!1 = !{ptr @kernel_func_clusterxyz, !"kernel", i32 1}
-!2 = !{ptr @kernel_func_clusterxyz, !"cluster_dim_x", i32 3, !"cluster_dim_y", i32 5, !"cluster_dim_z", i32 7}
+!1 = !{ptr @kernel_func_clusterxyz, !"cluster_dim_x", i32 3, !"cluster_dim_y", i32 5, !"cluster_dim_z", i32 7}
diff --git a/llvm/test/CodeGen/NVPTX/disjoint-or-addr.ll b/llvm/test/CodeGen/NVPTX/disjoint-or-addr.ll
new file mode 100644
index 0000000..1b1bb91
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/disjoint-or-addr.ll
@@ -0,0 +1,25 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_35 -verify-machineinstrs | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_35 | %ptxas-verify %}
+target triple = "nvptx64-nvidia-cuda"
+
+@a = external global ptr align 16
+
+define i32  @test_disjoint_or_addr(i16 %a) {
+; CHECK-LABEL: test_disjoint_or_addr(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %r<2>;
+; CHECK-NEXT:    .reg .b64 %rd<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    mov.u64 %rd1, a;
+; CHECK-NEXT:    cvta.global.u64 %rd2, %rd1;
+; CHECK-NEXT:    ld.u32 %r1, [%rd2+8];
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT:    ret;
+  %a1 = ptrtoint ptr @a to i64
+  %a2 = or disjoint i64 %a1, 8
+  %a3 = inttoptr i64 %a2 to ptr
+  %v = load i32, ptr %a3
+  ret i32 %v
+}
diff --git a/llvm/test/CodeGen/NVPTX/fabs-fneg-free.ll b/llvm/test/CodeGen/NVPTX/fabs-fneg-free.ll
new file mode 100644
index 0000000..9031f33
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/fabs-fneg-free.ll
@@ -0,0 +1,34 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_35 -verify-machineinstrs | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_35 | %ptxas-verify %}
+target triple = "nvptx64-nvidia-cuda"
+
+define float @fabs_free(i32 %in) {
+; CHECK-LABEL: fabs_free(
+; CHECK:       {
+; CHECK-NEXT:    .reg .f32 %f<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.f32 %f1, [fabs_free_param_0];
+; CHECK-NEXT:    abs.f32 %f2, %f1;
+; CHECK-NEXT:    st.param.f32 [func_retval0], %f2;
+; CHECK-NEXT:    ret;
+  %b = bitcast i32 %in to float
+  %f = call float @llvm.fabs.f32(float %b)
+  ret float %f
+}
+
+define float @fneg_free(i32 %in) {
+; CHECK-LABEL: fneg_free(
+; CHECK:       {
+; CHECK-NEXT:    .reg .f32 %f<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.f32 %f1, [fneg_free_param_0];
+; CHECK-NEXT:    neg.f32 %f2, %f1;
+; CHECK-NEXT:    st.param.f32 [func_retval0], %f2;
+; CHECK-NEXT:    ret;
+  %b = bitcast i32 %in to float
+  %f = fneg float %b
+  ret float %f
+}
diff --git a/llvm/test/CodeGen/NVPTX/generic-to-nvvm.ll b/llvm/test/CodeGen/NVPTX/generic-to-nvvm.ll
index 43e4dfc..2b66311 100644
--- a/llvm/test/CodeGen/NVPTX/generic-to-nvvm.ll
+++ b/llvm/test/CodeGen/NVPTX/generic-to-nvvm.ll
@@ -12,7 +12,7 @@ target triple = "nvptx-nvidia-cuda"
 @myconst = internal constant i32 420, align 4
 
 
-define void @foo(ptr %a, ptr %b) {
+define ptx_kernel void @foo(ptr %a, ptr %b) {
 ; Expect one load -- @myconst isn't loaded from, because we know its value
 ; statically.
 ; CHECK: ld.global.u32
@@ -24,7 +24,3 @@ define void @foo(ptr %a, ptr %b) {
   store i32 %ld2, ptr %b
   ret void
 }
-
-
-!nvvm.annotations = !{!0}
-!0 = !{ptr @foo, !"kernel", i32 1}
diff --git a/llvm/test/CodeGen/NVPTX/i1-array-global.ll b/llvm/test/CodeGen/NVPTX/i1-array-global.ll
index ff3848b..20b376f 100644
--- a/llvm/test/CodeGen/NVPTX/i1-array-global.ll
+++ b/llvm/test/CodeGen/NVPTX/i1-array-global.ll
@@ -7,13 +7,9 @@ target triple = "nvptx-nvidia-cuda"
 @global_cst = private constant [6 x i1] [i1 true, i1 false, i1 true, i1 false, i1 true, i1 false]
 
 ; CHECK: .global .align 1 .b8 global_cst[6] = {1, 0, 1, 0, 1}
-define void @kernel(i32 %i, ptr %out) {
+define ptx_kernel void @kernel(i32 %i, ptr %out) {
   %5 = getelementptr inbounds i1, ptr @global_cst, i32 %i
   %6 = load i1, ptr %5, align 1
   store i1 %6, ptr %out, align 1
   ret void
 }
-
-!nvvm.annotations = !{!0}
-!0 = !{ptr @kernel, !"kernel", i32 1}
-
diff --git a/llvm/test/CodeGen/NVPTX/i1-ext-load.ll b/llvm/test/CodeGen/NVPTX/i1-ext-load.ll
index 83f8f80..f5f1dd9 100644
--- a/llvm/test/CodeGen/NVPTX/i1-ext-load.ll
+++ b/llvm/test/CodeGen/NVPTX/i1-ext-load.ll
@@ -5,7 +5,7 @@
 
 target triple = "nvptx-nvidia-cuda"
 
-define void @foo(ptr noalias readonly %ptr, ptr noalias %retval) {
+define ptx_kernel void @foo(ptr noalias readonly %ptr, ptr noalias %retval) {
 ; CHECK-LABEL: foo(
 ; CHECK:    .reg .b16 %rs<2>;
 ; CHECK:    .reg .b32 %r<4>;
@@ -28,7 +28,3 @@ define void @foo(ptr noalias readonly %ptr, ptr noalias %retval) {
   store i32 %and, ptr %retval
   ret void
 }
-
-!nvvm.annotations = !{!0}
-
-!0 = !{ptr @foo, !"kernel", i32 1}
diff --git a/llvm/test/CodeGen/NVPTX/i1-global.ll b/llvm/test/CodeGen/NVPTX/i1-global.ll
index 17af1fa..60d2ccd 100644
--- a/llvm/test/CodeGen/NVPTX/i1-global.ll
+++ b/llvm/test/CodeGen/NVPTX/i1-global.ll
@@ -8,13 +8,9 @@ target triple = "nvptx-nvidia-cuda"
 @mypred = addrspace(1) global i1 true, align 1
 
 
-define void @foo(i1 %p, ptr %out) {
+define ptx_kernel void @foo(i1 %p, ptr %out) {
   %ld = load i1, ptr addrspace(1) @mypred
   %val = zext i1 %ld to i32
   store i32 %val, ptr %out
   ret void
 }
-
-
-!nvvm.annotations = !{!0}
-!0 = !{ptr @foo, !"kernel", i32 1}
diff --git a/llvm/test/CodeGen/NVPTX/i1-param.ll b/llvm/test/CodeGen/NVPTX/i1-param.ll
index 3c74ee6..14d417b 100644
--- a/llvm/test/CodeGen/NVPTX/i1-param.ll
+++ b/llvm/test/CodeGen/NVPTX/i1-param.ll
@@ -9,12 +9,8 @@ target triple = "nvptx-nvidia-cuda"
 ; CHECK: .entry foo
 ; CHECK:   .param .u8 foo_param_0
 ; CHECK:   .param .u64 .ptr .align 1 foo_param_1
-define void @foo(i1 %p, ptr %out) {
+define ptx_kernel void @foo(i1 %p, ptr %out) {
   %val = zext i1 %p to i32
   store i32 %val, ptr %out
   ret void
 }
-
-
-!nvvm.annotations = !{!0}
-!0 = !{ptr @foo, !"kernel", i32 1}
diff --git a/llvm/test/CodeGen/NVPTX/intr-range.ll b/llvm/test/CodeGen/NVPTX/intr-range.ll
index 2f3e08a..86776ab 100644
--- a/llvm/test/CodeGen/NVPTX/intr-range.ll
+++ b/llvm/test/CodeGen/NVPTX/intr-range.ll
@@ -1,8 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-attributes --version 5
 ; RUN: opt < %s -S -mtriple=nvptx-nvidia-cuda -mcpu=sm_20 -passes=nvvm-intr-range | FileCheck %s
 
-define i32 @test_maxntid() {
-; CHECK-LABEL: define i32 @test_maxntid(
+define ptx_kernel i32 @test_maxntid() {
+; CHECK-LABEL: define ptx_kernel i32 @test_maxntid(
 ; CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = call range(i32 0, 96) i32 @llvm.nvvm.read.ptx.sreg.tid.x()
 ; CHECK-NEXT:    [[TMP3:%.*]] = call range(i32 0, 96) i32 @llvm.nvvm.read.ptx.sreg.tid.y()
@@ -31,8 +31,8 @@ define i32 @test_maxntid() {
   ret i32 %11
 }
 
-define i32 @test_reqntid() {
-; CHECK-LABEL: define i32 @test_reqntid(
+define ptx_kernel i32 @test_reqntid() {
+; CHECK-LABEL: define ptx_kernel i32 @test_reqntid(
 ; CHECK-SAME: ) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = call range(i32 0, 20) i32 @llvm.nvvm.read.ptx.sreg.tid.x()
 ; CHECK-NEXT:    [[TMP5:%.*]] = call range(i32 0, 20) i32 @llvm.nvvm.read.ptx.sreg.tid.y()
@@ -64,8 +64,8 @@ define i32 @test_reqntid() {
 ;; A case like this could occur if a function with the sreg intrinsic was
 ;; inlined into a kernel where the tid metadata is present, ensure the range is
 ;; updated.
-define i32 @test_inlined() {
-; CHECK-LABEL: define i32 @test_inlined(
+define ptx_kernel i32 @test_inlined() {
+; CHECK-LABEL: define ptx_kernel i32 @test_inlined(
 ; CHECK-SAME: ) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = call range(i32 0, 4) i32 @llvm.nvvm.read.ptx.sreg.tid.x()
 ; CHECK-NEXT:    ret i32 [[TMP1]]
@@ -83,6 +83,6 @@ declare i32 @llvm.nvvm.read.ptx.sreg.ntid.y()
 declare i32 @llvm.nvvm.read.ptx.sreg.ntid.z()
 
 !nvvm.annotations = !{!0, !1, !2}
-!0 = !{ptr @test_maxntid, !"kernel", i32 1, !"maxntidx", i32 32, !"maxntidz", i32 3}
-!1 = !{ptr @test_reqntid, !"kernel", i32 1, !"reqntidx", i32 20}
-!2 = !{ptr @test_inlined, !"kernel", i32 1, !"maxntidx", i32 4}
+!0 = !{ptr @test_maxntid, !"maxntidx", i32 32, !"maxntidz", i32 3}
+!1 = !{ptr @test_reqntid, !"reqntidx", i32 20}
+!2 = !{ptr @test_inlined, !"maxntidx", i32 4}
diff --git a/llvm/test/CodeGen/NVPTX/kernel-param-align.ll b/llvm/test/CodeGen/NVPTX/kernel-param-align.ll
index 93d428d..2889d2d 100644
--- a/llvm/test/CodeGen/NVPTX/kernel-param-align.ll
+++ b/llvm/test/CodeGen/NVPTX/kernel-param-align.ll
@@ -10,7 +10,7 @@
 ; CHECK: .param .u64 .ptr .shared .align 8  func_align_param_3
 ; CHECK: .param .u64 .ptr .const  .align 16 func_align_param_4
 ; CHECK: .param .u64 .ptr .local  .align 32 func_align_param_5
-define void @func_align(ptr nocapture readonly align 1 %input,
+define ptx_kernel void @func_align(ptr nocapture readonly align 1 %input,
                         ptr nocapture align 2 %out,
                         ptr addrspace(1) align 4 %global,
                         ptr addrspace(3) align 8 %shared,
@@ -27,7 +27,7 @@ entry:
 ; CHECK: .param .u64 .ptr .shared .align 1 func_noalign_param_3
 ; CHECK: .param .u64 .ptr .const  .align 1 func_noalign_param_4
 ; CHECK: .param .u64 .ptr .local  .align 1 func_noalign_param_5
-define void @func_noalign(ptr nocapture readonly %input,
+define ptx_kernel void @func_noalign(ptr nocapture readonly %input,
                           ptr nocapture %out,
                           ptr addrspace(1) %global,
                           ptr addrspace(3) %shared,
@@ -36,7 +36,3 @@ define void @func_noalign(ptr nocapture readonly %input,
 entry:
   ret void
 }
-
-!nvvm.annotations = !{!0, !1}
-!0 = !{ptr @func_align, !"kernel", i32 1}
-!1 = !{ptr @func_noalign, !"kernel", i32 1}
diff --git a/llvm/test/CodeGen/NVPTX/load-with-non-coherent-cache.ll b/llvm/test/CodeGen/NVPTX/load-with-non-coherent-cache.ll
index bdaeccd5..dc1917f 100644
--- a/llvm/test/CodeGen/NVPTX/load-with-non-coherent-cache.ll
+++ b/llvm/test/CodeGen/NVPTX/load-with-non-coherent-cache.ll
@@ -10,7 +10,7 @@ target triple = "nvptx64-unknown-unknown"
 ; SM20: ld.global.f32
 ; SM35-LABEL: .visible .entry foo1(
 ; SM35: ld.global.nc.f32
-define void @foo1(ptr noalias readonly %from, ptr %to) {
+define ptx_kernel void @foo1(ptr noalias readonly %from, ptr %to) {
   %1 = load float, ptr %from
   store float %1, ptr %to
   ret void
@@ -20,7 +20,7 @@ define void @foo1(ptr noalias readonly %from, ptr %to) {
 ; SM20: ld.global.f64
 ; SM35-LABEL: .visible .entry foo2(
 ; SM35: ld.global.nc.f64
-define void @foo2(ptr noalias readonly %from, ptr %to) {
+define ptx_kernel void @foo2(ptr noalias readonly %from, ptr %to) {
   %1 = load double, ptr %from
   store double %1, ptr %to
   ret void
@@ -30,7 +30,7 @@ define void @foo2(ptr noalias readonly %from, ptr %to) {
 ; SM20: ld.global.u16
 ; SM35-LABEL: .visible .entry foo3(
 ; SM35: ld.global.nc.u16
-define void @foo3(ptr noalias readonly %from, ptr %to) {
+define ptx_kernel void @foo3(ptr noalias readonly %from, ptr %to) {
   %1 = load i16, ptr %from
   store i16 %1, ptr %to
   ret void
@@ -40,7 +40,7 @@ define void @foo3(ptr noalias readonly %from, ptr %to) {
 ; SM20: ld.global.u32
 ; SM35-LABEL: .visible .entry foo4(
 ; SM35: ld.global.nc.u32
-define void @foo4(ptr noalias readonly %from, ptr %to) {
+define ptx_kernel void @foo4(ptr noalias readonly %from, ptr %to) {
   %1 = load i32, ptr %from
   store i32 %1, ptr %to
   ret void
@@ -50,7 +50,7 @@ define void @foo4(ptr noalias readonly %from, ptr %to) {
 ; SM20: ld.global.u64
 ; SM35-LABEL: .visible .entry foo5(
 ; SM35: ld.global.nc.u64
-define void @foo5(ptr noalias readonly %from, ptr %to) {
+define ptx_kernel void @foo5(ptr noalias readonly %from, ptr %to) {
   %1 = load i64, ptr %from
   store i64 %1, ptr %to
   ret void
@@ -63,7 +63,7 @@ define void @foo5(ptr noalias readonly %from, ptr %to) {
 ; SM35-LABEL: .visible .entry foo6(
 ; SM35: ld.global.nc.u64
 ; SM35: ld.global.nc.u64
-define void @foo6(ptr noalias readonly %from, ptr %to) {
+define ptx_kernel void @foo6(ptr noalias readonly %from, ptr %to) {
   %1 = load i128, ptr %from
   store i128 %1, ptr %to
   ret void
@@ -73,7 +73,7 @@ define void @foo6(ptr noalias readonly %from, ptr %to) {
 ; SM20: ld.global.v2.u8
 ; SM35-LABEL: .visible .entry foo7(
 ; SM35: ld.global.nc.v2.u8
-define void @foo7(ptr noalias readonly %from, ptr %to) {
+define ptx_kernel void @foo7(ptr noalias readonly %from, ptr %to) {
   %1 = load <2 x i8>, ptr %from
   store <2 x i8> %1, ptr %to
   ret void
@@ -83,7 +83,7 @@ define void @foo7(ptr noalias readonly %from, ptr %to) {
 ; SM20: ld.global.u32
 ; SM35-LABEL: .visible .entry foo8(
 ; SM35: ld.global.nc.u32
-define void @foo8(ptr noalias readonly %from, ptr %to) {
+define ptx_kernel void @foo8(ptr noalias readonly %from, ptr %to) {
   %1 = load <2 x i16>, ptr %from
   store <2 x i16> %1, ptr %to
   ret void
@@ -93,7 +93,7 @@ define void @foo8(ptr noalias readonly %from, ptr %to) {
 ; SM20: ld.global.v2.u32
 ; SM35-LABEL: .visible .entry foo9(
 ; SM35: ld.global.nc.v2.u32
-define void @foo9(ptr noalias readonly %from, ptr %to) {
+define ptx_kernel void @foo9(ptr noalias readonly %from, ptr %to) {
   %1 = load <2 x i32>, ptr %from
   store <2 x i32> %1, ptr %to
   ret void
@@ -103,7 +103,7 @@ define void @foo9(ptr noalias readonly %from, ptr %to) {
 ; SM20: ld.global.v2.u64
 ; SM35-LABEL: .visible .entry foo10(
 ; SM35: ld.global.nc.v2.u64
-define void @foo10(ptr noalias readonly %from, ptr %to) {
+define ptx_kernel void @foo10(ptr noalias readonly %from, ptr %to) {
   %1 = load <2 x i64>, ptr %from
   store <2 x i64> %1, ptr %to
   ret void
@@ -113,7 +113,7 @@ define void @foo10(ptr noalias readonly %from, ptr %to) {
 ; SM20: ld.global.v2.f32
 ; SM35-LABEL: .visible .entry foo11(
 ; SM35: ld.global.nc.v2.f32
-define void @foo11(ptr noalias readonly %from, ptr %to) {
+define ptx_kernel void @foo11(ptr noalias readonly %from, ptr %to) {
   %1 = load <2 x float>, ptr %from
   store <2 x float> %1, ptr %to
   ret void
@@ -123,7 +123,7 @@ define void @foo11(ptr noalias readonly %from, ptr %to) {
 ; SM20: ld.global.v2.f64
 ; SM35-LABEL: .visible .entry foo12(
 ; SM35: ld.global.nc.v2.f64
-define void @foo12(ptr noalias readonly %from, ptr %to) {
+define ptx_kernel void @foo12(ptr noalias readonly %from, ptr %to) {
   %1 = load <2 x double>, ptr %from
   store <2 x double> %1, ptr %to
   ret void
@@ -133,7 +133,7 @@ define void @foo12(ptr noalias readonly %from, ptr %to) {
 ; SM20: ld.global.u32
 ; SM35-LABEL: .visible .entry foo13(
 ; SM35: ld.global.nc.u32
-define void @foo13(ptr noalias readonly %from, ptr %to) {
+define ptx_kernel void @foo13(ptr noalias readonly %from, ptr %to) {
   %1 = load <4 x i8>, ptr %from
   store <4 x i8> %1, ptr %to
   ret void
@@ -143,7 +143,7 @@ define void @foo13(ptr noalias readonly %from, ptr %to) {
 ; SM20: ld.global.v4.u16
 ; SM35-LABEL: .visible .entry foo14(
 ; SM35: ld.global.nc.v4.u16
-define void @foo14(ptr noalias readonly %from, ptr %to) {
+define ptx_kernel void @foo14(ptr noalias readonly %from, ptr %to) {
   %1 = load <4 x i16>, ptr %from
   store <4 x i16> %1, ptr %to
   ret void
@@ -153,7 +153,7 @@ define void @foo14(ptr noalias readonly %from, ptr %to) {
 ; SM20: ld.global.v4.u32
 ; SM35-LABEL: .visible .entry foo15(
 ; SM35: ld.global.nc.v4.u32
-define void @foo15(ptr noalias readonly %from, ptr %to) {
+define ptx_kernel void @foo15(ptr noalias readonly %from, ptr %to) {
   %1 = load <4 x i32>, ptr %from
   store <4 x i32> %1, ptr %to
   ret void
@@ -163,7 +163,7 @@ define void @foo15(ptr noalias readonly %from, ptr %to) {
 ; SM20: ld.global.v4.f32
 ; SM35-LABEL: .visible .entry foo16(
 ; SM35: ld.global.nc.v4.f32
-define void @foo16(ptr noalias readonly %from, ptr %to) {
+define ptx_kernel void @foo16(ptr noalias readonly %from, ptr %to) {
   %1 = load <4 x float>, ptr %from
   store <4 x float> %1, ptr %to
   ret void
@@ -175,7 +175,7 @@ define void @foo16(ptr noalias readonly %from, ptr %to) {
 ; SM35-LABEL: .visible .entry foo17(
 ; SM35: ld.global.nc.v2.f64
 ; SM35: ld.global.nc.v2.f64
-define void @foo17(ptr noalias readonly %from, ptr %to) {
+define ptx_kernel void @foo17(ptr noalias readonly %from, ptr %to) {
   %1 = load <4 x double>, ptr %from
   store <4 x double> %1, ptr %to
   ret void
@@ -185,7 +185,7 @@ define void @foo17(ptr noalias readonly %from, ptr %to) {
 ; SM20: ld.global.u64
 ; SM35-LABEL: .visible .entry foo18(
 ; SM35: ld.global.nc.u64
-define void @foo18(ptr noalias readonly %from, ptr %to) {
+define ptx_kernel void @foo18(ptr noalias readonly %from, ptr %to) {
   %1 = load ptr, ptr %from
   store ptr %1, ptr %to
   ret void
@@ -196,7 +196,7 @@ define void @foo18(ptr noalias readonly %from, ptr %to) {
 ; SM20: ld.global.f32
 ; SM35-LABEL: .visible .entry foo19(
 ; SM35: ld.global.nc.f32
-define void @foo19(ptr noalias readonly %from, ptr %to, i32 %n) {
+define ptx_kernel void @foo19(ptr noalias readonly %from, ptr %to, i32 %n) {
 entry:
   br label %loop
 
@@ -243,24 +243,3 @@ define void @notkernel2(ptr addrspace(1) noalias readonly %from, ptr %to) {
   store float %1, ptr %to
   ret void
 }
-
-!nvvm.annotations = !{!1 ,!2 ,!3 ,!4 ,!5 ,!6, !7 ,!8 ,!9 ,!10 ,!11 ,!12, !13, !14, !15, !16, !17, !18, !19}
-!1 = !{ptr @foo1, !"kernel", i32 1}
-!2 = !{ptr @foo2, !"kernel", i32 1}
-!3 = !{ptr @foo3, !"kernel", i32 1}
-!4 = !{ptr @foo4, !"kernel", i32 1}
-!5 = !{ptr @foo5, !"kernel", i32 1}
-!6 = !{ptr @foo6, !"kernel", i32 1}
-!7 = !{ptr @foo7, !"kernel", i32 1}
-!8 = !{ptr @foo8, !"kernel", i32 1}
-!9 = !{ptr @foo9, !"kernel", i32 1}
-!10 = !{ptr @foo10, !"kernel", i32 1}
-!11 = !{ptr @foo11, !"kernel", i32 1}
-!12 = !{ptr @foo12, !"kernel", i32 1}
-!13 = !{ptr @foo13, !"kernel", i32 1}
-!14 = !{ptr @foo14, !"kernel", i32 1}
-!15 = !{ptr @foo15, !"kernel", i32 1}
-!16 = !{ptr @foo16, !"kernel", i32 1}
-!17 = !{ptr @foo17, !"kernel", i32 1}
-!18 = !{ptr @foo18, !"kernel", i32 1}
-!19 = !{ptr @foo19, !"kernel", i32 1}
diff --git a/llvm/test/CodeGen/NVPTX/local-stack-frame.ll b/llvm/test/CodeGen/NVPTX/local-stack-frame.ll
index e42f230..f21ff97 100644
--- a/llvm/test/CodeGen/NVPTX/local-stack-frame.ll
+++ b/llvm/test/CodeGen/NVPTX/local-stack-frame.ll
@@ -29,7 +29,7 @@ define void @foo(i32 %a) {
 ; PTX64:        ld.param.u32     %r{{[0-9]+}}, [foo2_param_0];
 ; PTX64:        add.u64          %rd[[SP_REG:[0-9]+]], %SPL, 0;
 ; PTX64:        st.local.u32  [%rd[[SP_REG]]], %r{{[0-9]+}};
-define void @foo2(i32 %a) {
+define ptx_kernel void @foo2(i32 %a) {
   %local = alloca i32, align 4
   store i32 %a, ptr %local
   call void @bar(ptr %local)
@@ -38,8 +38,6 @@ define void @foo2(i32 %a) {
 
 declare void @bar(ptr %a)
 
-!nvvm.annotations = !{!0}
-!0 = !{ptr @foo2, !"kernel", i32 1}
 
 ; PTX32:        mov.u32          %SPL, __local_depot{{[0-9]+}};
 ; PTX32-NOT:    cvta.local.u32   %SP, %SPL;
diff --git a/llvm/test/CodeGen/NVPTX/lower-alloca.ll b/llvm/test/CodeGen/NVPTX/lower-alloca.ll
index 8f2d551..530b48b 100644
--- a/llvm/test/CodeGen/NVPTX/lower-alloca.ll
+++ b/llvm/test/CodeGen/NVPTX/lower-alloca.ll
@@ -6,7 +6,7 @@
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
 target triple = "nvptx64-unknown-unknown"
 
-define void @kernel() {
+define ptx_kernel void @kernel() {
 ; LABEL: @lower_alloca
 ; PTX-LABEL: .visible .entry kernel(
   %A = alloca i32
@@ -37,7 +37,5 @@ define void @alloca_in_explicit_local_as() {
 declare void @callee(ptr)
 declare void @callee_addrspace5(ptr addrspace(5))
 
-!nvvm.annotations = !{!0}
 !nvvm.annotations = !{!1}
-!0 = !{ptr @kernel, !"kernel", i32 1}
 !1 = !{ptr @alloca_in_explicit_local_as, !"alloca_in_explicit_local_as", i32 1}
diff --git a/llvm/test/CodeGen/NVPTX/lower-args-gridconstant.ll b/llvm/test/CodeGen/NVPTX/lower-args-gridconstant.ll
index 9cfe919..208d4f0 100644
--- a/llvm/test/CodeGen/NVPTX/lower-args-gridconstant.ll
+++ b/llvm/test/CodeGen/NVPTX/lower-args-gridconstant.ll
@@ -29,7 +29,7 @@ define dso_local noundef i32 @non_kernel_function(ptr nocapture noundef readonly
 ; PTX-NEXT:    .reg .pred %p<2>;
 ; PTX-NEXT:    .reg .b16 %rs<3>;
 ; PTX-NEXT:    .reg .b32 %r<11>;
-; PTX-NEXT:    .reg .b64 %rd<10>;
+; PTX-NEXT:    .reg .b64 %rd<9>;
 ; PTX-EMPTY:
 ; PTX-NEXT:  // %bb.0: // %entry
 ; PTX-NEXT:    mov.u64 %SPL, __local_depot0;
@@ -38,23 +38,22 @@ define dso_local noundef i32 @non_kernel_function(ptr nocapture noundef readonly
 ; PTX-NEXT:    and.b16 %rs2, %rs1, 1;
 ; PTX-NEXT:    setp.eq.b16 %p1, %rs2, 1;
 ; PTX-NEXT:    ld.param.s32 %rd1, [non_kernel_function_param_2];
-; PTX-NEXT:    add.u64 %rd2, %SP, 0;
-; PTX-NEXT:    or.b64 %rd3, %rd2, 8;
-; PTX-NEXT:    ld.param.u64 %rd4, [non_kernel_function_param_0+8];
-; PTX-NEXT:    st.u64 [%rd3], %rd4;
-; PTX-NEXT:    ld.param.u64 %rd5, [non_kernel_function_param_0];
-; PTX-NEXT:    st.u64 [%SP], %rd5;
-; PTX-NEXT:    mov.u64 %rd6, gi;
-; PTX-NEXT:    cvta.global.u64 %rd7, %rd6;
-; PTX-NEXT:    selp.b64 %rd8, %rd2, %rd7, %p1;
-; PTX-NEXT:    add.s64 %rd9, %rd8, %rd1;
-; PTX-NEXT:    ld.u8 %r1, [%rd9];
-; PTX-NEXT:    ld.u8 %r2, [%rd9+1];
+; PTX-NEXT:    ld.param.u64 %rd2, [non_kernel_function_param_0+8];
+; PTX-NEXT:    st.u64 [%SP+8], %rd2;
+; PTX-NEXT:    ld.param.u64 %rd3, [non_kernel_function_param_0];
+; PTX-NEXT:    st.u64 [%SP], %rd3;
+; PTX-NEXT:    mov.u64 %rd4, gi;
+; PTX-NEXT:    cvta.global.u64 %rd5, %rd4;
+; PTX-NEXT:    add.u64 %rd6, %SP, 0;
+; PTX-NEXT:    selp.b64 %rd7, %rd6, %rd5, %p1;
+; PTX-NEXT:    add.s64 %rd8, %rd7, %rd1;
+; PTX-NEXT:    ld.u8 %r1, [%rd8];
+; PTX-NEXT:    ld.u8 %r2, [%rd8+1];
 ; PTX-NEXT:    shl.b32 %r3, %r2, 8;
 ; PTX-NEXT:    or.b32 %r4, %r3, %r1;
-; PTX-NEXT:    ld.u8 %r5, [%rd9+2];
+; PTX-NEXT:    ld.u8 %r5, [%rd8+2];
 ; PTX-NEXT:    shl.b32 %r6, %r5, 16;
-; PTX-NEXT:    ld.u8 %r7, [%rd9+3];
+; PTX-NEXT:    ld.u8 %r7, [%rd8+3];
 ; PTX-NEXT:    shl.b32 %r8, %r7, 24;
 ; PTX-NEXT:    or.b32 %r9, %r8, %r6;
 ; PTX-NEXT:    or.b32 %r10, %r9, %r4;
@@ -68,7 +67,7 @@ entry:
   ret i32 %0, !dbg !23
 }
 
-define void @grid_const_int(ptr byval(i32) align 4 %input1, i32 %input2, ptr %out, i32 %n) {
+define ptx_kernel void @grid_const_int(ptr byval(i32) align 4 %input1, i32 %input2, ptr %out, i32 %n) {
 ; PTX-LABEL: grid_const_int(
 ; PTX:       {
 ; PTX-NEXT:    .reg .b32 %r<4>;
@@ -82,7 +81,7 @@ define void @grid_const_int(ptr byval(i32) align 4 %input1, i32 %input2, ptr %ou
 ; PTX-NEXT:    add.s32 %r3, %r2, %r1;
 ; PTX-NEXT:    st.global.u32 [%rd2], %r3;
 ; PTX-NEXT:    ret;
-; OPT-LABEL: define void @grid_const_int(
+; OPT-LABEL: define ptx_kernel void @grid_const_int(
 ; OPT-SAME: ptr byval(i32) align 4 [[INPUT1:%.*]], i32 [[INPUT2:%.*]], ptr [[OUT:%.*]], i32 [[N:%.*]]) #[[ATTR0]] {
 ; OPT-NEXT:    [[OUT2:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1)
 ; OPT-NEXT:    [[OUT3:%.*]] = addrspacecast ptr addrspace(1) [[OUT2]] to ptr
@@ -91,6 +90,7 @@ define void @grid_const_int(ptr byval(i32) align 4 %input1, i32 %input2, ptr %ou
 ; OPT-NEXT:    [[ADD:%.*]] = add i32 [[TMP]], [[INPUT2]]
 ; OPT-NEXT:    store i32 [[ADD]], ptr [[OUT3]], align 4
 ; OPT-NEXT:    ret void
+;
   %tmp = load i32, ptr %input1, align 4
   %add = add i32 %tmp, %input2
   store i32 %add, ptr %out
@@ -99,7 +99,7 @@ define void @grid_const_int(ptr byval(i32) align 4 %input1, i32 %input2, ptr %ou
 
 %struct.s = type { i32, i32 }
 
-define void @grid_const_struct(ptr byval(%struct.s) align 4 %input, ptr %out){
+define ptx_kernel void @grid_const_struct(ptr byval(%struct.s) align 4 %input, ptr %out){
 ; PTX-LABEL: grid_const_struct(
 ; PTX:       {
 ; PTX-NEXT:    .reg .b32 %r<4>;
@@ -113,7 +113,7 @@ define void @grid_const_struct(ptr byval(%struct.s) align 4 %input, ptr %out){
 ; PTX-NEXT:    add.s32 %r3, %r1, %r2;
 ; PTX-NEXT:    st.global.u32 [%rd2], %r3;
 ; PTX-NEXT:    ret;
-; OPT-LABEL: define void @grid_const_struct(
+; OPT-LABEL: define ptx_kernel void @grid_const_struct(
 ; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT:%.*]], ptr [[OUT:%.*]]) #[[ATTR0]] {
 ; OPT-NEXT:    [[OUT4:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1)
 ; OPT-NEXT:    [[OUT5:%.*]] = addrspacecast ptr addrspace(1) [[OUT4]] to ptr
@@ -125,6 +125,7 @@ define void @grid_const_struct(ptr byval(%struct.s) align 4 %input, ptr %out){
 ; OPT-NEXT:    [[ADD:%.*]] = add i32 [[TMP1]], [[TMP2]]
 ; OPT-NEXT:    store i32 [[ADD]], ptr [[OUT5]], align 4
 ; OPT-NEXT:    ret void
+;
   %gep1 = getelementptr inbounds %struct.s, ptr %input, i32 0, i32 0
   %gep2 = getelementptr inbounds %struct.s, ptr %input, i32 0, i32 1
   %int1 = load i32, ptr %gep1
@@ -134,7 +135,7 @@ define void @grid_const_struct(ptr byval(%struct.s) align 4 %input, ptr %out){
   ret void
 }
 
-define void @grid_const_escape(ptr byval(%struct.s) align 4 %input) {
+define ptx_kernel void @grid_const_escape(ptr byval(%struct.s) align 4 %input) {
 ; PTX-LABEL: grid_const_escape(
 ; PTX:       {
 ; PTX-NEXT:    .reg .b32 %r<3>;
@@ -159,17 +160,18 @@ define void @grid_const_escape(ptr byval(%struct.s) align 4 %input) {
 ; PTX-NEXT:    ld.param.b32 %r1, [retval0];
 ; PTX-NEXT:    } // callseq 0
 ; PTX-NEXT:    ret;
-; OPT-LABEL: define void @grid_const_escape(
+; OPT-LABEL: define ptx_kernel void @grid_const_escape(
 ; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT:%.*]]) #[[ATTR0]] {
 ; OPT-NEXT:    [[INPUT_PARAM:%.*]] = addrspacecast ptr [[INPUT]] to ptr addrspace(101)
 ; OPT-NEXT:    [[INPUT_PARAM_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT_PARAM]])
 ; OPT-NEXT:    [[CALL:%.*]] = call i32 @escape(ptr [[INPUT_PARAM_GEN]])
 ; OPT-NEXT:    ret void
+;
   %call = call i32 @escape(ptr %input)
   ret void
 }
 
-define void @multiple_grid_const_escape(ptr byval(%struct.s) align 4 %input, i32 %a, ptr byval(i32) align 4 %b) {
+define ptx_kernel void @multiple_grid_const_escape(ptr byval(%struct.s) align 4 %input, i32 %a, ptr byval(i32) align 4 %b) {
 ; PTX-LABEL: multiple_grid_const_escape(
 ; PTX:       {
 ; PTX-NEXT:    .local .align 4 .b8 __local_depot4[4];
@@ -212,7 +214,7 @@ define void @multiple_grid_const_escape(ptr byval(%struct.s) align 4 %input, i32
 ; PTX-NEXT:    ld.param.b32 %r2, [retval0];
 ; PTX-NEXT:    } // callseq 1
 ; PTX-NEXT:    ret;
-; OPT-LABEL: define void @multiple_grid_const_escape(
+; OPT-LABEL: define ptx_kernel void @multiple_grid_const_escape(
 ; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT:%.*]], i32 [[A:%.*]], ptr byval(i32) align 4 [[B:%.*]]) #[[ATTR0]] {
 ; OPT-NEXT:    [[B_PARAM:%.*]] = addrspacecast ptr [[B]] to ptr addrspace(101)
 ; OPT-NEXT:    [[B_PARAM_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[B_PARAM]])
@@ -222,13 +224,14 @@ define void @multiple_grid_const_escape(ptr byval(%struct.s) align 4 %input, i32
 ; OPT-NEXT:    store i32 [[A]], ptr [[A_ADDR]], align 4
 ; OPT-NEXT:    [[CALL:%.*]] = call i32 @escape3(ptr [[INPUT_PARAM_GEN]], ptr [[A_ADDR]], ptr [[B_PARAM_GEN]])
 ; OPT-NEXT:    ret void
+;
   %a.addr = alloca i32, align 4
   store i32 %a, ptr %a.addr, align 4
   %call = call i32 @escape3(ptr %input, ptr %a.addr, ptr %b)
   ret void
 }
 
-define void @grid_const_memory_escape(ptr byval(%struct.s) align 4 %input, ptr %addr) {
+define ptx_kernel void @grid_const_memory_escape(ptr byval(%struct.s) align 4 %input, ptr %addr) {
 ; PTX-LABEL: grid_const_memory_escape(
 ; PTX:       {
 ; PTX-NEXT:    .reg .b64 %rd<6>;
@@ -241,7 +244,7 @@ define void @grid_const_memory_escape(ptr byval(%struct.s) align 4 %input, ptr %
 ; PTX-NEXT:    cvta.param.u64 %rd5, %rd4;
 ; PTX-NEXT:    st.global.u64 [%rd3], %rd5;
 ; PTX-NEXT:    ret;
-; OPT-LABEL: define void @grid_const_memory_escape(
+; OPT-LABEL: define ptx_kernel void @grid_const_memory_escape(
 ; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT:%.*]], ptr [[ADDR:%.*]]) #[[ATTR0]] {
 ; OPT-NEXT:    [[ADDR4:%.*]] = addrspacecast ptr [[ADDR]] to ptr addrspace(1)
 ; OPT-NEXT:    [[ADDR5:%.*]] = addrspacecast ptr addrspace(1) [[ADDR4]] to ptr
@@ -249,11 +252,12 @@ define void @grid_const_memory_escape(ptr byval(%struct.s) align 4 %input, ptr %
 ; OPT-NEXT:    [[INPUT1:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT_PARAM]])
 ; OPT-NEXT:    store ptr [[INPUT1]], ptr [[ADDR5]], align 8
 ; OPT-NEXT:    ret void
+;
   store ptr %input, ptr %addr, align 8
   ret void
 }
 
-define void @grid_const_inlineasm_escape(ptr byval(%struct.s) align 4 %input, ptr %result) {
+define ptx_kernel void @grid_const_inlineasm_escape(ptr byval(%struct.s) align 4 %input, ptr %result) {
 ; PTX-LABEL: grid_const_inlineasm_escape(
 ; PTX:       {
 ; PTX-NEXT:    .reg .b64 %rd<8>;
@@ -271,7 +275,7 @@ define void @grid_const_inlineasm_escape(ptr byval(%struct.s) align 4 %input, pt
 ; PTX-NEXT:    st.global.u64 [%rd6], %rd1;
 ; PTX-NEXT:    ret;
 ; PTX-NOT      .local
-; OPT-LABEL: define void @grid_const_inlineasm_escape(
+; OPT-LABEL: define ptx_kernel void @grid_const_inlineasm_escape(
 ; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT:%.*]], ptr [[RESULT:%.*]]) #[[ATTR0]] {
 ; OPT-NEXT:    [[RESULT4:%.*]] = addrspacecast ptr [[RESULT]] to ptr addrspace(1)
 ; OPT-NEXT:    [[RESULT5:%.*]] = addrspacecast ptr addrspace(1) [[RESULT4]] to ptr
@@ -282,6 +286,7 @@ define void @grid_const_inlineasm_escape(ptr byval(%struct.s) align 4 %input, pt
 ; OPT-NEXT:    [[TMP2:%.*]] = call i64 asm "add.s64 $0, $1, $2
 ; OPT-NEXT:    store i64 [[TMP2]], ptr [[RESULT5]], align 8
 ; OPT-NEXT:    ret void
+;
   %tmpptr1 = getelementptr inbounds %struct.s, ptr %input, i32 0, i32 0
   %tmpptr2 = getelementptr inbounds %struct.s, ptr %input, i32 0, i32 1
   %1 = call i64 asm "add.s64 $0, $1, $2;", "=l,l,l"(ptr %tmpptr1, ptr %tmpptr2) #1
@@ -289,7 +294,7 @@ define void @grid_const_inlineasm_escape(ptr byval(%struct.s) align 4 %input, pt
   ret void
 }
 
-define void @grid_const_partial_escape(ptr byval(i32) %input, ptr %output) {
+define ptx_kernel void @grid_const_partial_escape(ptr byval(i32) %input, ptr %output) {
 ; PTX-LABEL: grid_const_partial_escape(
 ; PTX:       {
 ; PTX-NEXT:    .reg .b32 %r<5>;
@@ -319,7 +324,7 @@ define void @grid_const_partial_escape(ptr byval(i32) %input, ptr %output) {
 ; PTX-NEXT:    ld.param.b32 %r3, [retval0];
 ; PTX-NEXT:    } // callseq 2
 ; PTX-NEXT:    ret;
-; OPT-LABEL: define void @grid_const_partial_escape(
+; OPT-LABEL: define ptx_kernel void @grid_const_partial_escape(
 ; OPT-SAME: ptr byval(i32) [[INPUT:%.*]], ptr [[OUTPUT:%.*]]) #[[ATTR0]] {
 ; OPT-NEXT:    [[OUTPUT4:%.*]] = addrspacecast ptr [[OUTPUT]] to ptr addrspace(1)
 ; OPT-NEXT:    [[OUTPUT5:%.*]] = addrspacecast ptr addrspace(1) [[OUTPUT4]] to ptr
@@ -330,6 +335,7 @@ define void @grid_const_partial_escape(ptr byval(i32) %input, ptr %output) {
 ; OPT-NEXT:    store i32 [[TWICE]], ptr [[OUTPUT5]], align 4
 ; OPT-NEXT:    [[CALL:%.*]] = call i32 @escape(ptr [[INPUT1_GEN]])
 ; OPT-NEXT:    ret void
+;
   %val = load i32, ptr %input
   %twice = add i32 %val, %val
   store i32 %twice, ptr %output
@@ -337,7 +343,7 @@ define void @grid_const_partial_escape(ptr byval(i32) %input, ptr %output) {
   ret void
 }
 
-define i32 @grid_const_partial_escapemem(ptr byval(%struct.s) %input, ptr %output) {
+define ptx_kernel i32 @grid_const_partial_escapemem(ptr byval(%struct.s) %input, ptr %output) {
 ; PTX-LABEL: grid_const_partial_escapemem(
 ; PTX:       {
 ; PTX-NEXT:    .reg .b32 %r<6>;
@@ -369,7 +375,7 @@ define i32 @grid_const_partial_escapemem(ptr byval(%struct.s) %input, ptr %outpu
 ; PTX-NEXT:    } // callseq 3
 ; PTX-NEXT:    st.param.b32 [func_retval0], %r3;
 ; PTX-NEXT:    ret;
-; OPT-LABEL: define i32 @grid_const_partial_escapemem(
+; OPT-LABEL: define ptx_kernel i32 @grid_const_partial_escapemem(
 ; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) [[INPUT:%.*]], ptr [[OUTPUT:%.*]]) #[[ATTR0]] {
 ; OPT-NEXT:    [[OUTPUT4:%.*]] = addrspacecast ptr [[OUTPUT]] to ptr addrspace(1)
 ; OPT-NEXT:    [[OUTPUT5:%.*]] = addrspacecast ptr addrspace(1) [[OUTPUT4]] to ptr
@@ -383,6 +389,7 @@ define i32 @grid_const_partial_escapemem(ptr byval(%struct.s) %input, ptr %outpu
 ; OPT-NEXT:    [[ADD:%.*]] = add i32 [[VAL1]], [[VAL2]]
 ; OPT-NEXT:    [[CALL2:%.*]] = call i32 @escape(ptr [[PTR1]])
 ; OPT-NEXT:    ret i32 [[ADD]]
+;
   %ptr1 = getelementptr inbounds %struct.s, ptr %input, i32 0, i32 0
   %val1 = load i32, ptr %ptr1
   %ptr2 = getelementptr inbounds %struct.s, ptr %input, i32 0, i32 1
@@ -393,7 +400,7 @@ define i32 @grid_const_partial_escapemem(ptr byval(%struct.s) %input, ptr %outpu
   ret i32 %add
 }
 
-define void @grid_const_phi(ptr byval(%struct.s) align 4 %input1, ptr %inout) {
+define ptx_kernel void @grid_const_phi(ptr byval(%struct.s) align 4 %input1, ptr %inout) {
 ; PTX-LABEL: grid_const_phi(
 ; PTX:       {
 ; PTX-NEXT:    .reg .pred %p<2>;
@@ -415,7 +422,7 @@ define void @grid_const_phi(ptr byval(%struct.s) align 4 %input1, ptr %inout) {
 ; PTX-NEXT:    ld.u32 %r2, [%rd8];
 ; PTX-NEXT:    st.global.u32 [%rd1], %r2;
 ; PTX-NEXT:    ret;
-; OPT-LABEL: define void @grid_const_phi(
+; OPT-LABEL: define ptx_kernel void @grid_const_phi(
 ; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT1:%.*]], ptr [[INOUT:%.*]]) #[[ATTR0]] {
 ; OPT-NEXT:    [[INOUT1:%.*]] = addrspacecast ptr [[INOUT]] to ptr addrspace(1)
 ; OPT-NEXT:    [[INOUT2:%.*]] = addrspacecast ptr addrspace(1) [[INOUT1]] to ptr
@@ -435,6 +442,7 @@ define void @grid_const_phi(ptr byval(%struct.s) align 4 %input1, ptr %inout) {
 ; OPT-NEXT:    [[VALLOADED:%.*]] = load i32, ptr [[PTRNEW]], align 4
 ; OPT-NEXT:    store i32 [[VALLOADED]], ptr [[INOUT2]], align 4
 ; OPT-NEXT:    ret void
+;
 
   %val = load i32, ptr %inout
   %less = icmp slt i32 %val, 0
@@ -453,7 +461,7 @@ merge:
 }
 
 ; NOTE: %input2 is *not* grid_constant
-define void @grid_const_phi_ngc(ptr byval(%struct.s) align 4 %input1, ptr byval(%struct.s) %input2, ptr %inout) {
+define ptx_kernel void @grid_const_phi_ngc(ptr byval(%struct.s) align 4 %input1, ptr byval(%struct.s) %input2, ptr %inout) {
 ; PTX-LABEL: grid_const_phi_ngc(
 ; PTX:       {
 ; PTX-NEXT:    .reg .pred %p<2>;
@@ -478,7 +486,7 @@ define void @grid_const_phi_ngc(ptr byval(%struct.s) align 4 %input1, ptr byval(
 ; PTX-NEXT:    ld.u32 %r2, [%rd11];
 ; PTX-NEXT:    st.global.u32 [%rd1], %r2;
 ; PTX-NEXT:    ret;
-; OPT-LABEL: define void @grid_const_phi_ngc(
+; OPT-LABEL: define ptx_kernel void @grid_const_phi_ngc(
 ; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT1:%.*]], ptr byval([[STRUCT_S]]) [[INPUT2:%.*]], ptr [[INOUT:%.*]]) #[[ATTR0]] {
 ; OPT-NEXT:    [[INOUT1:%.*]] = addrspacecast ptr [[INOUT]] to ptr addrspace(1)
 ; OPT-NEXT:    [[INOUT2:%.*]] = addrspacecast ptr addrspace(1) [[INOUT1]] to ptr
@@ -500,6 +508,7 @@ define void @grid_const_phi_ngc(ptr byval(%struct.s) align 4 %input1, ptr byval(
 ; OPT-NEXT:    [[VALLOADED:%.*]] = load i32, ptr [[PTRNEW]], align 4
 ; OPT-NEXT:    store i32 [[VALLOADED]], ptr [[INOUT2]], align 4
 ; OPT-NEXT:    ret void
+;
   %val = load i32, ptr %inout
   %less = icmp slt i32 %val, 0
   br i1 %less, label %first, label %second
@@ -517,7 +526,7 @@ merge:
 }
 
 ; NOTE: %input2 is *not* grid_constant
-define void @grid_const_select(ptr byval(i32) align 4 %input1, ptr byval(i32) %input2, ptr %inout) {
+define ptx_kernel void @grid_const_select(ptr byval(i32) align 4 %input1, ptr byval(i32) %input2, ptr %inout) {
 ; PTX-LABEL: grid_const_select(
 ; PTX:       {
 ; PTX-NEXT:    .reg .pred %p<2>;
@@ -539,7 +548,7 @@ define void @grid_const_select(ptr byval(i32) align 4 %input1, ptr byval(i32) %i
 ; PTX-NEXT:    ld.u32 %r2, [%rd9];
 ; PTX-NEXT:    st.global.u32 [%rd3], %r2;
 ; PTX-NEXT:    ret;
-; OPT-LABEL: define void @grid_const_select(
+; OPT-LABEL: define ptx_kernel void @grid_const_select(
 ; OPT-SAME: ptr byval(i32) align 4 [[INPUT1:%.*]], ptr byval(i32) [[INPUT2:%.*]], ptr [[INOUT:%.*]]) #[[ATTR0]] {
 ; OPT-NEXT:    [[INOUT1:%.*]] = addrspacecast ptr [[INOUT]] to ptr addrspace(1)
 ; OPT-NEXT:    [[INOUT2:%.*]] = addrspacecast ptr addrspace(1) [[INOUT1]] to ptr
@@ -553,6 +562,7 @@ define void @grid_const_select(ptr byval(i32) align 4 %input1, ptr byval(i32) %i
 ; OPT-NEXT:    [[VALLOADED:%.*]] = load i32, ptr [[PTRNEW]], align 4
 ; OPT-NEXT:    store i32 [[VALLOADED]], ptr [[INOUT2]], align 4
 ; OPT-NEXT:    ret void
+;
   %val = load i32, ptr %inout
   %less = icmp slt i32 %val, 0
   %ptrnew = select i1 %less, ptr %input1, ptr %input2
@@ -561,7 +571,7 @@ define void @grid_const_select(ptr byval(i32) align 4 %input1, ptr byval(i32) %i
   ret void
 }
 
-define i32 @grid_const_ptrtoint(ptr byval(i32) %input) {
+define ptx_kernel i32 @grid_const_ptrtoint(ptr byval(i32) %input) {
 ; PTX-LABEL: grid_const_ptrtoint(
 ; PTX:       {
 ; PTX-NEXT:    .reg .b32 %r<4>;
@@ -576,7 +586,7 @@ define i32 @grid_const_ptrtoint(ptr byval(i32) %input) {
 ; PTX-NEXT:    add.s32 %r3, %r1, %r2;
 ; PTX-NEXT:    st.param.b32 [func_retval0], %r3;
 ; PTX-NEXT:    ret;
-; OPT-LABEL: define i32 @grid_const_ptrtoint(
+; OPT-LABEL: define ptx_kernel i32 @grid_const_ptrtoint(
 ; OPT-SAME: ptr byval(i32) align 4 [[INPUT:%.*]]) #[[ATTR0]] {
 ; OPT-NEXT:    [[INPUT2:%.*]] = addrspacecast ptr [[INPUT]] to ptr addrspace(101)
 ; OPT-NEXT:    [[INPUT3:%.*]] = load i32, ptr addrspace(101) [[INPUT2]], align 4
@@ -584,6 +594,7 @@ define i32 @grid_const_ptrtoint(ptr byval(i32) %input) {
 ; OPT-NEXT:    [[PTRVAL:%.*]] = ptrtoint ptr [[INPUT1]] to i32
 ; OPT-NEXT:    [[KEEPALIVE:%.*]] = add i32 [[INPUT3]], [[PTRVAL]]
 ; OPT-NEXT:    ret i32 [[KEEPALIVE]]
+;
   %val = load i32, ptr %input
   %ptrval = ptrtoint ptr %input to i32
   %keepalive = add i32 %val, %ptrval
@@ -598,40 +609,40 @@ declare dso_local ptr @escape3(ptr, ptr, ptr) local_unnamed_addr
 
 !nvvm.annotations = !{!0, !1, !2, !3, !4, !5, !6, !7, !8, !9, !10, !11, !12, !13, !14, !15, !16, !17, !18, !19, !20, !21, !22, !23}
 
-!0 = !{ptr @grid_const_int, !"kernel", i32 1, !"grid_constant", !1}
+!0 = !{ptr @grid_const_int, !"grid_constant", !1}
 !1 = !{i32 1}
 
-!2 = !{ptr @grid_const_struct, !"kernel", i32 1, !"grid_constant", !3}
+!2 = !{ptr @grid_const_struct, !"grid_constant", !3}
 !3 = !{i32 1}
 
-!4 = !{ptr @grid_const_escape, !"kernel", i32 1, !"grid_constant", !5}
+!4 = !{ptr @grid_const_escape, !"grid_constant", !5}
 !5 = !{i32 1}
 
-!6 = !{ptr @multiple_grid_const_escape, !"kernel", i32 1, !"grid_constant", !7}
+!6 = !{ptr @multiple_grid_const_escape, !"grid_constant", !7}
 !7 = !{i32 1, i32 3}
 
-!8 = !{ptr @grid_const_memory_escape, !"kernel", i32 1, !"grid_constant", !9}
+!8 = !{ptr @grid_const_memory_escape, !"grid_constant", !9}
 !9 = !{i32 1}
 
-!10 = !{ptr @grid_const_inlineasm_escape, !"kernel", i32 1, !"grid_constant", !11}
+!10 = !{ptr @grid_const_inlineasm_escape, !"grid_constant", !11}
 !11 = !{i32 1}
 
-!12 = !{ptr @grid_const_partial_escape, !"kernel", i32 1, !"grid_constant", !13}
+!12 = !{ptr @grid_const_partial_escape, !"grid_constant", !13}
 !13 = !{i32 1}
 
-!14 = !{ptr @grid_const_partial_escapemem, !"kernel", i32 1, !"grid_constant", !15}
+!14 = !{ptr @grid_const_partial_escapemem, !"grid_constant", !15}
 !15 = !{i32 1}
 
-!16 = !{ptr @grid_const_phi, !"kernel", i32 1, !"grid_constant", !17}
+!16 = !{ptr @grid_const_phi, !"grid_constant", !17}
 !17 = !{i32 1}
 
-!18 = !{ptr @grid_const_phi_ngc, !"kernel", i32 1, !"grid_constant", !19}
+!18 = !{ptr @grid_const_phi_ngc, !"grid_constant", !19}
 !19 = !{i32 1}
 
-!20 = !{ptr @grid_const_select, !"kernel", i32 1, !"grid_constant", !21}
+!20 = !{ptr @grid_const_select, !"grid_constant", !21}
 !21 = !{i32 1}
 
-!22 = !{ptr @grid_const_ptrtoint, !"kernel", i32 1, !"grid_constant", !23}
+!22 = !{ptr @grid_const_ptrtoint, !"grid_constant", !23}
 !23 = !{i32 1}
 
 
diff --git a/llvm/test/CodeGen/NVPTX/lower-args.ll b/llvm/test/CodeGen/NVPTX/lower-args.ll
index eba4f27..269bba7 100644
--- a/llvm/test/CodeGen/NVPTX/lower-args.ll
+++ b/llvm/test/CodeGen/NVPTX/lower-args.ll
@@ -65,7 +65,7 @@ define void @load_padding(ptr nocapture readonly byval(%class.padded) %arg) {
 }
 
 ; COMMON-LABEL: ptr_generic
-define void @ptr_generic(ptr %out, ptr %in) {
+define ptx_kernel void @ptr_generic(ptr %out, ptr %in) {
 ; IRC:  %in3 = addrspacecast ptr %in to ptr addrspace(1)
 ; IRC:  %in4 = addrspacecast ptr addrspace(1) %in3 to ptr
 ; IRC:  %out1 = addrspacecast ptr %out to ptr addrspace(1)
@@ -87,7 +87,7 @@ define void @ptr_generic(ptr %out, ptr %in) {
 }
 
 ; COMMON-LABEL: ptr_nongeneric
-define void @ptr_nongeneric(ptr addrspace(1) %out, ptr addrspace(4) %in) {
+define ptx_kernel void @ptr_nongeneric(ptr addrspace(1) %out, ptr addrspace(4) %in) {
 ; IR-NOT: addrspacecast
 ; PTX-NOT: cvta.to.global
 ; PTX:  ld.const.u32
@@ -98,7 +98,7 @@ define void @ptr_nongeneric(ptr addrspace(1) %out, ptr addrspace(4) %in) {
 }
 
 ; COMMON-LABEL: ptr_as_int
- define void @ptr_as_int(i64 noundef %i, i32 noundef %v) {
+ define ptx_kernel void @ptr_as_int(i64 noundef %i, i32 noundef %v) {
 ; IR:   [[P:%.*]] = inttoptr i64 %i to ptr
 ; IRC:  [[P1:%.*]] = addrspacecast ptr [[P]] to ptr addrspace(1)
 ; IRC:  addrspacecast ptr addrspace(1) [[P1]] to ptr
@@ -121,7 +121,7 @@ define void @ptr_nongeneric(ptr addrspace(1) %out, ptr addrspace(4) %in) {
 %struct.S = type { i64 }
 
 ; COMMON-LABEL: ptr_as_int_aggr
-define void @ptr_as_int_aggr(ptr nocapture noundef readonly byval(%struct.S) align 8 %s, i32 noundef %v) {
+define ptx_kernel void @ptr_as_int_aggr(ptr nocapture noundef readonly byval(%struct.S) align 8 %s, i32 noundef %v) {
 ; IR:   [[S:%.*]] = addrspacecast ptr %s to ptr addrspace(101)
 ; IR:   [[I:%.*]] = load i64, ptr addrspace(101) [[S]], align 8
 ; IR:   [[P0:%.*]] = inttoptr i64 [[I]] to ptr
@@ -146,8 +146,3 @@ define void @ptr_as_int_aggr(ptr nocapture noundef readonly byval(%struct.S) ali
 
 ; Function Attrs: convergent nounwind
 declare dso_local ptr @escape(ptr) local_unnamed_addr
-!nvvm.annotations = !{!0, !1, !2, !3}
-!0 = !{ptr @ptr_generic, !"kernel", i32 1}
-!1 = !{ptr @ptr_nongeneric, !"kernel", i32 1}
-!2 = !{ptr @ptr_as_int, !"kernel", i32 1}
-!3 = !{ptr @ptr_as_int_aggr, !"kernel", i32 1}
diff --git a/llvm/test/CodeGen/NVPTX/lower-byval-args.ll b/llvm/test/CodeGen/NVPTX/lower-byval-args.ll
index 5c52626..26102722 100644
--- a/llvm/test/CodeGen/NVPTX/lower-byval-args.ll
+++ b/llvm/test/CodeGen/NVPTX/lower-byval-args.ll
@@ -24,8 +24,8 @@ declare void @llvm.memmove.p0.p0.i64(ptr nocapture writeonly, ptr nocapture read
 declare void @llvm.memset.p0.i64(ptr nocapture writeonly, i8, i64, i1 immarg) #2
 
 ; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite)
-define dso_local void @read_only(ptr nocapture noundef writeonly %out, ptr nocapture noundef readonly byval(%struct.S) align 4 %s) local_unnamed_addr #0 {
-; SM_60-LABEL: define dso_local void @read_only(
+define dso_local ptx_kernel void @read_only(ptr nocapture noundef writeonly %out, ptr nocapture noundef readonly byval(%struct.S) align 4 %s) local_unnamed_addr #0 {
+; SM_60-LABEL: define dso_local ptx_kernel void @read_only(
 ; SM_60-SAME: ptr nocapture noundef writeonly [[OUT:%.*]], ptr nocapture noundef readonly byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
 ; SM_60-NEXT:  [[ENTRY:.*:]]
 ; SM_60-NEXT:    [[S3:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
@@ -35,7 +35,7 @@ define dso_local void @read_only(ptr nocapture noundef writeonly %out, ptr nocap
 ; SM_60-NEXT:    store i32 [[I]], ptr [[OUT2]], align 4
 ; SM_60-NEXT:    ret void
 ;
-; SM_70-LABEL: define dso_local void @read_only(
+; SM_70-LABEL: define dso_local ptx_kernel void @read_only(
 ; SM_70-SAME: ptr nocapture noundef writeonly [[OUT:%.*]], ptr nocapture noundef readonly byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
 ; SM_70-NEXT:  [[ENTRY:.*:]]
 ; SM_70-NEXT:    [[S3:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
@@ -45,7 +45,7 @@ define dso_local void @read_only(ptr nocapture noundef writeonly %out, ptr nocap
 ; SM_70-NEXT:    store i32 [[I]], ptr [[OUT2]], align 4
 ; SM_70-NEXT:    ret void
 ;
-; COPY-LABEL: define dso_local void @read_only(
+; COPY-LABEL: define dso_local ptx_kernel void @read_only(
 ; COPY-SAME: ptr nocapture noundef writeonly [[OUT:%.*]], ptr nocapture noundef readonly byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
 ; COPY-NEXT:  [[ENTRY:.*:]]
 ; COPY-NEXT:    [[S1:%.*]] = alloca [[STRUCT_S]], align 4
@@ -62,8 +62,8 @@ entry:
 }
 
 ; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite)
-define dso_local void @read_only_gep(ptr nocapture noundef writeonly %out, ptr nocapture noundef readonly byval(%struct.S) align 4 %s) local_unnamed_addr #0 {
-; SM_60-LABEL: define dso_local void @read_only_gep(
+define dso_local ptx_kernel void @read_only_gep(ptr nocapture noundef writeonly %out, ptr nocapture noundef readonly byval(%struct.S) align 4 %s) local_unnamed_addr #0 {
+; SM_60-LABEL: define dso_local ptx_kernel void @read_only_gep(
 ; SM_60-SAME: ptr nocapture noundef writeonly [[OUT:%.*]], ptr nocapture noundef readonly byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
 ; SM_60-NEXT:  [[ENTRY:.*:]]
 ; SM_60-NEXT:    [[S3:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
@@ -74,7 +74,7 @@ define dso_local void @read_only_gep(ptr nocapture noundef writeonly %out, ptr n
 ; SM_60-NEXT:    store i32 [[I]], ptr [[OUT2]], align 4
 ; SM_60-NEXT:    ret void
 ;
-; SM_70-LABEL: define dso_local void @read_only_gep(
+; SM_70-LABEL: define dso_local ptx_kernel void @read_only_gep(
 ; SM_70-SAME: ptr nocapture noundef writeonly [[OUT:%.*]], ptr nocapture noundef readonly byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
 ; SM_70-NEXT:  [[ENTRY:.*:]]
 ; SM_70-NEXT:    [[S3:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
@@ -85,7 +85,7 @@ define dso_local void @read_only_gep(ptr nocapture noundef writeonly %out, ptr n
 ; SM_70-NEXT:    store i32 [[I]], ptr [[OUT2]], align 4
 ; SM_70-NEXT:    ret void
 ;
-; COPY-LABEL: define dso_local void @read_only_gep(
+; COPY-LABEL: define dso_local ptx_kernel void @read_only_gep(
 ; COPY-SAME: ptr nocapture noundef writeonly [[OUT:%.*]], ptr nocapture noundef readonly byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
 ; COPY-NEXT:  [[ENTRY:.*:]]
 ; COPY-NEXT:    [[S1:%.*]] = alloca [[STRUCT_S]], align 4
@@ -104,8 +104,8 @@ entry:
 }
 
 ; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite)
-define dso_local void @read_only_gep_asc(ptr nocapture noundef writeonly %out, ptr nocapture noundef readonly byval(%struct.S) align 4 %s) local_unnamed_addr #0 {
-; SM_60-LABEL: define dso_local void @read_only_gep_asc(
+define dso_local ptx_kernel void @read_only_gep_asc(ptr nocapture noundef writeonly %out, ptr nocapture noundef readonly byval(%struct.S) align 4 %s) local_unnamed_addr #0 {
+; SM_60-LABEL: define dso_local ptx_kernel void @read_only_gep_asc(
 ; SM_60-SAME: ptr nocapture noundef writeonly [[OUT:%.*]], ptr nocapture noundef readonly byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
 ; SM_60-NEXT:  [[ENTRY:.*:]]
 ; SM_60-NEXT:    [[S3:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
@@ -116,7 +116,7 @@ define dso_local void @read_only_gep_asc(ptr nocapture noundef writeonly %out, p
 ; SM_60-NEXT:    store i32 [[I]], ptr [[OUT2]], align 4
 ; SM_60-NEXT:    ret void
 ;
-; SM_70-LABEL: define dso_local void @read_only_gep_asc(
+; SM_70-LABEL: define dso_local ptx_kernel void @read_only_gep_asc(
 ; SM_70-SAME: ptr nocapture noundef writeonly [[OUT:%.*]], ptr nocapture noundef readonly byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
 ; SM_70-NEXT:  [[ENTRY:.*:]]
 ; SM_70-NEXT:    [[S3:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
@@ -127,7 +127,7 @@ define dso_local void @read_only_gep_asc(ptr nocapture noundef writeonly %out, p
 ; SM_70-NEXT:    store i32 [[I]], ptr [[OUT2]], align 4
 ; SM_70-NEXT:    ret void
 ;
-; COPY-LABEL: define dso_local void @read_only_gep_asc(
+; COPY-LABEL: define dso_local ptx_kernel void @read_only_gep_asc(
 ; COPY-SAME: ptr nocapture noundef writeonly [[OUT:%.*]], ptr nocapture noundef readonly byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
 ; COPY-NEXT:  [[ENTRY:.*:]]
 ; COPY-NEXT:    [[S1:%.*]] = alloca [[STRUCT_S]], align 4
@@ -148,8 +148,8 @@ entry:
 }
 
 ; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite)
-define dso_local void @read_only_gep_asc0(ptr nocapture noundef writeonly %out, ptr nocapture noundef readonly byval(%struct.S) align 4 %s) local_unnamed_addr #0 {
-; SM_60-LABEL: define dso_local void @read_only_gep_asc0(
+define dso_local ptx_kernel void @read_only_gep_asc0(ptr nocapture noundef writeonly %out, ptr nocapture noundef readonly byval(%struct.S) align 4 %s) local_unnamed_addr #0 {
+; SM_60-LABEL: define dso_local ptx_kernel void @read_only_gep_asc0(
 ; SM_60-SAME: ptr nocapture noundef writeonly [[OUT:%.*]], ptr nocapture noundef readonly byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
 ; SM_60-NEXT:  [[ENTRY:.*:]]
 ; SM_60-NEXT:    [[S3:%.*]] = alloca [[STRUCT_S]], align 4
@@ -164,7 +164,7 @@ define dso_local void @read_only_gep_asc0(ptr nocapture noundef writeonly %out,
 ; SM_60-NEXT:    store i32 [[I]], ptr [[OUT2]], align 4
 ; SM_60-NEXT:    ret void
 ;
-; SM_70-LABEL: define dso_local void @read_only_gep_asc0(
+; SM_70-LABEL: define dso_local ptx_kernel void @read_only_gep_asc0(
 ; SM_70-SAME: ptr nocapture noundef writeonly [[OUT:%.*]], ptr nocapture noundef readonly byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
 ; SM_70-NEXT:  [[ENTRY:.*:]]
 ; SM_70-NEXT:    [[S3:%.*]] = alloca [[STRUCT_S]], align 4
@@ -179,7 +179,7 @@ define dso_local void @read_only_gep_asc0(ptr nocapture noundef writeonly %out,
 ; SM_70-NEXT:    store i32 [[I]], ptr [[OUT2]], align 4
 ; SM_70-NEXT:    ret void
 ;
-; COPY-LABEL: define dso_local void @read_only_gep_asc0(
+; COPY-LABEL: define dso_local ptx_kernel void @read_only_gep_asc0(
 ; COPY-SAME: ptr nocapture noundef writeonly [[OUT:%.*]], ptr nocapture noundef readonly byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
 ; COPY-NEXT:  [[ENTRY:.*:]]
 ; COPY-NEXT:    [[S1:%.*]] = alloca [[STRUCT_S]], align 4
@@ -202,8 +202,8 @@ entry:
 }
 
 ; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite)
-define dso_local void @escape_ptr(ptr nocapture noundef readnone %out, ptr noundef byval(%struct.S) align 4 %s) local_unnamed_addr #0 {
-; SM_60-LABEL: define dso_local void @escape_ptr(
+define dso_local ptx_kernel void @escape_ptr(ptr nocapture noundef readnone %out, ptr noundef byval(%struct.S) align 4 %s) local_unnamed_addr #0 {
+; SM_60-LABEL: define dso_local ptx_kernel void @escape_ptr(
 ; SM_60-SAME: ptr nocapture noundef readnone [[OUT:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
 ; SM_60-NEXT:  [[ENTRY:.*:]]
 ; SM_60-NEXT:    [[S3:%.*]] = alloca [[STRUCT_S]], align 4
@@ -214,7 +214,7 @@ define dso_local void @escape_ptr(ptr nocapture noundef readnone %out, ptr nound
 ; SM_60-NEXT:    call void @_Z6escapePv(ptr noundef nonnull [[S3]])
 ; SM_60-NEXT:    ret void
 ;
-; SM_70-LABEL: define dso_local void @escape_ptr(
+; SM_70-LABEL: define dso_local ptx_kernel void @escape_ptr(
 ; SM_70-SAME: ptr nocapture noundef readnone [[OUT:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
 ; SM_70-NEXT:  [[ENTRY:.*:]]
 ; SM_70-NEXT:    [[S3:%.*]] = alloca [[STRUCT_S]], align 4
@@ -225,7 +225,7 @@ define dso_local void @escape_ptr(ptr nocapture noundef readnone %out, ptr nound
 ; SM_70-NEXT:    call void @_Z6escapePv(ptr noundef nonnull [[S3]])
 ; SM_70-NEXT:    ret void
 ;
-; COPY-LABEL: define dso_local void @escape_ptr(
+; COPY-LABEL: define dso_local ptx_kernel void @escape_ptr(
 ; COPY-SAME: ptr nocapture noundef readnone [[OUT:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
 ; COPY-NEXT:  [[ENTRY:.*:]]
 ; COPY-NEXT:    [[S1:%.*]] = alloca [[STRUCT_S]], align 4
@@ -240,8 +240,8 @@ entry:
 }
 
 ; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite)
-define dso_local void @escape_ptr_gep(ptr nocapture noundef readnone %out, ptr noundef byval(%struct.S) align 4 %s) local_unnamed_addr #0 {
-; SM_60-LABEL: define dso_local void @escape_ptr_gep(
+define dso_local ptx_kernel void @escape_ptr_gep(ptr nocapture noundef readnone %out, ptr noundef byval(%struct.S) align 4 %s) local_unnamed_addr #0 {
+; SM_60-LABEL: define dso_local ptx_kernel void @escape_ptr_gep(
 ; SM_60-SAME: ptr nocapture noundef readnone [[OUT:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
 ; SM_60-NEXT:  [[ENTRY:.*:]]
 ; SM_60-NEXT:    [[S3:%.*]] = alloca [[STRUCT_S]], align 4
@@ -253,7 +253,7 @@ define dso_local void @escape_ptr_gep(ptr nocapture noundef readnone %out, ptr n
 ; SM_60-NEXT:    call void @_Z6escapePv(ptr noundef nonnull [[B]])
 ; SM_60-NEXT:    ret void
 ;
-; SM_70-LABEL: define dso_local void @escape_ptr_gep(
+; SM_70-LABEL: define dso_local ptx_kernel void @escape_ptr_gep(
 ; SM_70-SAME: ptr nocapture noundef readnone [[OUT:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
 ; SM_70-NEXT:  [[ENTRY:.*:]]
 ; SM_70-NEXT:    [[S3:%.*]] = alloca [[STRUCT_S]], align 4
@@ -265,7 +265,7 @@ define dso_local void @escape_ptr_gep(ptr nocapture noundef readnone %out, ptr n
 ; SM_70-NEXT:    call void @_Z6escapePv(ptr noundef nonnull [[B]])
 ; SM_70-NEXT:    ret void
 ;
-; COPY-LABEL: define dso_local void @escape_ptr_gep(
+; COPY-LABEL: define dso_local ptx_kernel void @escape_ptr_gep(
 ; COPY-SAME: ptr nocapture noundef readnone [[OUT:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
 ; COPY-NEXT:  [[ENTRY:.*:]]
 ; COPY-NEXT:    [[S1:%.*]] = alloca [[STRUCT_S]], align 4
@@ -282,8 +282,8 @@ entry:
 }
 
 ; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite)
-define dso_local void @escape_ptr_store(ptr nocapture noundef writeonly %out, ptr noundef byval(%struct.S) align 4 %s) local_unnamed_addr #0 {
-; SM_60-LABEL: define dso_local void @escape_ptr_store(
+define dso_local ptx_kernel void @escape_ptr_store(ptr nocapture noundef writeonly %out, ptr noundef byval(%struct.S) align 4 %s) local_unnamed_addr #0 {
+; SM_60-LABEL: define dso_local ptx_kernel void @escape_ptr_store(
 ; SM_60-SAME: ptr nocapture noundef writeonly [[OUT:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
 ; SM_60-NEXT:  [[ENTRY:.*:]]
 ; SM_60-NEXT:    [[S3:%.*]] = alloca [[STRUCT_S]], align 4
@@ -294,7 +294,7 @@ define dso_local void @escape_ptr_store(ptr nocapture noundef writeonly %out, pt
 ; SM_60-NEXT:    store ptr [[S3]], ptr [[OUT2]], align 8
 ; SM_60-NEXT:    ret void
 ;
-; SM_70-LABEL: define dso_local void @escape_ptr_store(
+; SM_70-LABEL: define dso_local ptx_kernel void @escape_ptr_store(
 ; SM_70-SAME: ptr nocapture noundef writeonly [[OUT:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
 ; SM_70-NEXT:  [[ENTRY:.*:]]
 ; SM_70-NEXT:    [[S3:%.*]] = alloca [[STRUCT_S]], align 4
@@ -305,7 +305,7 @@ define dso_local void @escape_ptr_store(ptr nocapture noundef writeonly %out, pt
 ; SM_70-NEXT:    store ptr [[S3]], ptr [[OUT2]], align 8
 ; SM_70-NEXT:    ret void
 ;
-; COPY-LABEL: define dso_local void @escape_ptr_store(
+; COPY-LABEL: define dso_local ptx_kernel void @escape_ptr_store(
 ; COPY-SAME: ptr nocapture noundef writeonly [[OUT:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
 ; COPY-NEXT:  [[ENTRY:.*:]]
 ; COPY-NEXT:    [[S1:%.*]] = alloca [[STRUCT_S]], align 4
@@ -320,8 +320,8 @@ entry:
 }
 
 ; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite)
-define dso_local void @escape_ptr_gep_store(ptr nocapture noundef writeonly %out, ptr noundef byval(%struct.S) align 4 %s) local_unnamed_addr #0 {
-; SM_60-LABEL: define dso_local void @escape_ptr_gep_store(
+define dso_local ptx_kernel void @escape_ptr_gep_store(ptr nocapture noundef writeonly %out, ptr noundef byval(%struct.S) align 4 %s) local_unnamed_addr #0 {
+; SM_60-LABEL: define dso_local ptx_kernel void @escape_ptr_gep_store(
 ; SM_60-SAME: ptr nocapture noundef writeonly [[OUT:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
 ; SM_60-NEXT:  [[ENTRY:.*:]]
 ; SM_60-NEXT:    [[S3:%.*]] = alloca [[STRUCT_S]], align 4
@@ -333,7 +333,7 @@ define dso_local void @escape_ptr_gep_store(ptr nocapture noundef writeonly %out
 ; SM_60-NEXT:    store ptr [[B]], ptr [[OUT2]], align 8
 ; SM_60-NEXT:    ret void
 ;
-; SM_70-LABEL: define dso_local void @escape_ptr_gep_store(
+; SM_70-LABEL: define dso_local ptx_kernel void @escape_ptr_gep_store(
 ; SM_70-SAME: ptr nocapture noundef writeonly [[OUT:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
 ; SM_70-NEXT:  [[ENTRY:.*:]]
 ; SM_70-NEXT:    [[S3:%.*]] = alloca [[STRUCT_S]], align 4
@@ -345,7 +345,7 @@ define dso_local void @escape_ptr_gep_store(ptr nocapture noundef writeonly %out
 ; SM_70-NEXT:    store ptr [[B]], ptr [[OUT2]], align 8
 ; SM_70-NEXT:    ret void
 ;
-; COPY-LABEL: define dso_local void @escape_ptr_gep_store(
+; COPY-LABEL: define dso_local ptx_kernel void @escape_ptr_gep_store(
 ; COPY-SAME: ptr nocapture noundef writeonly [[OUT:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
 ; COPY-NEXT:  [[ENTRY:.*:]]
 ; COPY-NEXT:    [[S1:%.*]] = alloca [[STRUCT_S]], align 4
@@ -362,8 +362,8 @@ entry:
 }
 
 ; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite)
-define dso_local void @escape_ptrtoint(ptr nocapture noundef writeonly %out, ptr noundef byval(%struct.S) align 4 %s) local_unnamed_addr #0 {
-; SM_60-LABEL: define dso_local void @escape_ptrtoint(
+define dso_local ptx_kernel void @escape_ptrtoint(ptr nocapture noundef writeonly %out, ptr noundef byval(%struct.S) align 4 %s) local_unnamed_addr #0 {
+; SM_60-LABEL: define dso_local ptx_kernel void @escape_ptrtoint(
 ; SM_60-SAME: ptr nocapture noundef writeonly [[OUT:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
 ; SM_60-NEXT:  [[ENTRY:.*:]]
 ; SM_60-NEXT:    [[S3:%.*]] = alloca [[STRUCT_S]], align 4
@@ -375,7 +375,7 @@ define dso_local void @escape_ptrtoint(ptr nocapture noundef writeonly %out, ptr
 ; SM_60-NEXT:    store i64 [[I]], ptr [[OUT2]], align 8
 ; SM_60-NEXT:    ret void
 ;
-; SM_70-LABEL: define dso_local void @escape_ptrtoint(
+; SM_70-LABEL: define dso_local ptx_kernel void @escape_ptrtoint(
 ; SM_70-SAME: ptr nocapture noundef writeonly [[OUT:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
 ; SM_70-NEXT:  [[ENTRY:.*:]]
 ; SM_70-NEXT:    [[S3:%.*]] = alloca [[STRUCT_S]], align 4
@@ -387,7 +387,7 @@ define dso_local void @escape_ptrtoint(ptr nocapture noundef writeonly %out, ptr
 ; SM_70-NEXT:    store i64 [[I]], ptr [[OUT2]], align 8
 ; SM_70-NEXT:    ret void
 ;
-; COPY-LABEL: define dso_local void @escape_ptrtoint(
+; COPY-LABEL: define dso_local ptx_kernel void @escape_ptrtoint(
 ; COPY-SAME: ptr nocapture noundef writeonly [[OUT:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
 ; COPY-NEXT:  [[ENTRY:.*:]]
 ; COPY-NEXT:    [[S1:%.*]] = alloca [[STRUCT_S]], align 4
@@ -404,8 +404,8 @@ entry:
 }
 
 ; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite)
-define dso_local void @memcpy_from_param(ptr nocapture noundef writeonly %out, ptr nocapture noundef readonly byval(%struct.S) align 4 %s) local_unnamed_addr #0 {
-; SM_60-LABEL: define dso_local void @memcpy_from_param(
+define dso_local ptx_kernel void @memcpy_from_param(ptr nocapture noundef writeonly %out, ptr nocapture noundef readonly byval(%struct.S) align 4 %s) local_unnamed_addr #0 {
+; SM_60-LABEL: define dso_local ptx_kernel void @memcpy_from_param(
 ; SM_60-SAME: ptr nocapture noundef writeonly [[OUT:%.*]], ptr nocapture noundef readonly byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
 ; SM_60-NEXT:  [[ENTRY:.*:]]
 ; SM_60-NEXT:    [[S3:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
@@ -414,7 +414,7 @@ define dso_local void @memcpy_from_param(ptr nocapture noundef writeonly %out, p
 ; SM_60-NEXT:    call void @llvm.memcpy.p0.p101.i64(ptr [[OUT2]], ptr addrspace(101) [[S3]], i64 16, i1 true)
 ; SM_60-NEXT:    ret void
 ;
-; SM_70-LABEL: define dso_local void @memcpy_from_param(
+; SM_70-LABEL: define dso_local ptx_kernel void @memcpy_from_param(
 ; SM_70-SAME: ptr nocapture noundef writeonly [[OUT:%.*]], ptr nocapture noundef readonly byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
 ; SM_70-NEXT:  [[ENTRY:.*:]]
 ; SM_70-NEXT:    [[S3:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
@@ -423,7 +423,7 @@ define dso_local void @memcpy_from_param(ptr nocapture noundef writeonly %out, p
 ; SM_70-NEXT:    call void @llvm.memcpy.p0.p101.i64(ptr [[OUT2]], ptr addrspace(101) [[S3]], i64 16, i1 true)
 ; SM_70-NEXT:    ret void
 ;
-; COPY-LABEL: define dso_local void @memcpy_from_param(
+; COPY-LABEL: define dso_local ptx_kernel void @memcpy_from_param(
 ; COPY-SAME: ptr nocapture noundef writeonly [[OUT:%.*]], ptr nocapture noundef readonly byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
 ; COPY-NEXT:  [[ENTRY:.*:]]
 ; COPY-NEXT:    [[S1:%.*]] = alloca [[STRUCT_S]], align 4
@@ -438,8 +438,8 @@ entry:
 }
 
 ; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite)
-define dso_local void @memcpy_from_param_noalign (ptr nocapture noundef writeonly %out, ptr nocapture noundef readonly byval(%struct.S) %s) local_unnamed_addr #0 {
-; SM_60-LABEL: define dso_local void @memcpy_from_param_noalign(
+define dso_local ptx_kernel void @memcpy_from_param_noalign (ptr nocapture noundef writeonly %out, ptr nocapture noundef readonly byval(%struct.S) %s) local_unnamed_addr #0 {
+; SM_60-LABEL: define dso_local ptx_kernel void @memcpy_from_param_noalign(
 ; SM_60-SAME: ptr nocapture noundef writeonly [[OUT:%.*]], ptr nocapture noundef readonly byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
 ; SM_60-NEXT:  [[ENTRY:.*:]]
 ; SM_60-NEXT:    [[S3:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
@@ -448,7 +448,7 @@ define dso_local void @memcpy_from_param_noalign (ptr nocapture noundef writeonl
 ; SM_60-NEXT:    call void @llvm.memcpy.p0.p101.i64(ptr [[OUT2]], ptr addrspace(101) [[S3]], i64 16, i1 true)
 ; SM_60-NEXT:    ret void
 ;
-; SM_70-LABEL: define dso_local void @memcpy_from_param_noalign(
+; SM_70-LABEL: define dso_local ptx_kernel void @memcpy_from_param_noalign(
 ; SM_70-SAME: ptr nocapture noundef writeonly [[OUT:%.*]], ptr nocapture noundef readonly byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
 ; SM_70-NEXT:  [[ENTRY:.*:]]
 ; SM_70-NEXT:    [[S3:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
@@ -457,7 +457,7 @@ define dso_local void @memcpy_from_param_noalign (ptr nocapture noundef writeonl
 ; SM_70-NEXT:    call void @llvm.memcpy.p0.p101.i64(ptr [[OUT2]], ptr addrspace(101) [[S3]], i64 16, i1 true)
 ; SM_70-NEXT:    ret void
 ;
-; COPY-LABEL: define dso_local void @memcpy_from_param_noalign(
+; COPY-LABEL: define dso_local ptx_kernel void @memcpy_from_param_noalign(
 ; COPY-SAME: ptr nocapture noundef writeonly [[OUT:%.*]], ptr nocapture noundef readonly byval([[STRUCT_S:%.*]]) [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
 ; COPY-NEXT:  [[ENTRY:.*:]]
 ; COPY-NEXT:    [[S1:%.*]] = alloca [[STRUCT_S]], align 8
@@ -472,8 +472,8 @@ entry:
 }
 
 ; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite)
-define dso_local void @memcpy_to_param(ptr nocapture noundef readonly %in, ptr nocapture noundef readnone byval(%struct.S) align 4 %s) local_unnamed_addr #0 {
-; SM_60-LABEL: define dso_local void @memcpy_to_param(
+define dso_local ptx_kernel void @memcpy_to_param(ptr nocapture noundef readonly %in, ptr nocapture noundef readnone byval(%struct.S) align 4 %s) local_unnamed_addr #0 {
+; SM_60-LABEL: define dso_local ptx_kernel void @memcpy_to_param(
 ; SM_60-SAME: ptr nocapture noundef readonly [[IN:%.*]], ptr nocapture noundef readnone byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
 ; SM_60-NEXT:  [[ENTRY:.*:]]
 ; SM_60-NEXT:    [[S3:%.*]] = alloca [[STRUCT_S]], align 4
@@ -484,7 +484,7 @@ define dso_local void @memcpy_to_param(ptr nocapture noundef readonly %in, ptr n
 ; SM_60-NEXT:    tail call void @llvm.memcpy.p0.p0.i64(ptr [[S3]], ptr [[IN2]], i64 16, i1 true)
 ; SM_60-NEXT:    ret void
 ;
-; SM_70-LABEL: define dso_local void @memcpy_to_param(
+; SM_70-LABEL: define dso_local ptx_kernel void @memcpy_to_param(
 ; SM_70-SAME: ptr nocapture noundef readonly [[IN:%.*]], ptr nocapture noundef readnone byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
 ; SM_70-NEXT:  [[ENTRY:.*:]]
 ; SM_70-NEXT:    [[S3:%.*]] = alloca [[STRUCT_S]], align 4
@@ -495,7 +495,7 @@ define dso_local void @memcpy_to_param(ptr nocapture noundef readonly %in, ptr n
 ; SM_70-NEXT:    tail call void @llvm.memcpy.p0.p0.i64(ptr [[S3]], ptr [[IN2]], i64 16, i1 true)
 ; SM_70-NEXT:    ret void
 ;
-; COPY-LABEL: define dso_local void @memcpy_to_param(
+; COPY-LABEL: define dso_local ptx_kernel void @memcpy_to_param(
 ; COPY-SAME: ptr nocapture noundef readonly [[IN:%.*]], ptr nocapture noundef readnone byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
 ; COPY-NEXT:  [[ENTRY:.*:]]
 ; COPY-NEXT:    [[S1:%.*]] = alloca [[STRUCT_S]], align 4
@@ -510,8 +510,8 @@ entry:
 }
 
 ; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite)
-define dso_local void @copy_on_store(ptr nocapture noundef readonly %in, ptr nocapture noundef byval(%struct.S) align 4 %s, i1 noundef zeroext %b) local_unnamed_addr #0 {
-; SM_60-LABEL: define dso_local void @copy_on_store(
+define dso_local ptx_kernel void @copy_on_store(ptr nocapture noundef readonly %in, ptr nocapture noundef byval(%struct.S) align 4 %s, i1 noundef zeroext %b) local_unnamed_addr #0 {
+; SM_60-LABEL: define dso_local ptx_kernel void @copy_on_store(
 ; SM_60-SAME: ptr nocapture noundef readonly [[IN:%.*]], ptr nocapture noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]], i1 noundef zeroext [[B:%.*]]) local_unnamed_addr #[[ATTR0]] {
 ; SM_60-NEXT:  [[BB:.*:]]
 ; SM_60-NEXT:    [[S3:%.*]] = alloca [[STRUCT_S]], align 4
@@ -523,7 +523,7 @@ define dso_local void @copy_on_store(ptr nocapture noundef readonly %in, ptr noc
 ; SM_60-NEXT:    store i32 [[I]], ptr [[S3]], align 4
 ; SM_60-NEXT:    ret void
 ;
-; SM_70-LABEL: define dso_local void @copy_on_store(
+; SM_70-LABEL: define dso_local ptx_kernel void @copy_on_store(
 ; SM_70-SAME: ptr nocapture noundef readonly [[IN:%.*]], ptr nocapture noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]], i1 noundef zeroext [[B:%.*]]) local_unnamed_addr #[[ATTR0]] {
 ; SM_70-NEXT:  [[BB:.*:]]
 ; SM_70-NEXT:    [[S3:%.*]] = alloca [[STRUCT_S]], align 4
@@ -535,7 +535,7 @@ define dso_local void @copy_on_store(ptr nocapture noundef readonly %in, ptr noc
 ; SM_70-NEXT:    store i32 [[I]], ptr [[S3]], align 4
 ; SM_70-NEXT:    ret void
 ;
-; COPY-LABEL: define dso_local void @copy_on_store(
+; COPY-LABEL: define dso_local ptx_kernel void @copy_on_store(
 ; COPY-SAME: ptr nocapture noundef readonly [[IN:%.*]], ptr nocapture noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]], i1 noundef zeroext [[B:%.*]]) local_unnamed_addr #[[ATTR0]] {
 ; COPY-NEXT:  [[BB:.*:]]
 ; COPY-NEXT:    [[S1:%.*]] = alloca [[STRUCT_S]], align 4
@@ -551,8 +551,8 @@ bb:
   ret void
 }
 
-define void @test_select(ptr byval(i32) align 4 %input1, ptr byval(i32) %input2, ptr %out, i1 %cond) {
-; SM_60-LABEL: define void @test_select(
+define ptx_kernel void @test_select(ptr byval(i32) align 4 %input1, ptr byval(i32) %input2, ptr %out, i1 %cond) {
+; SM_60-LABEL: define ptx_kernel void @test_select(
 ; SM_60-SAME: ptr byval(i32) align 4 [[INPUT1:%.*]], ptr byval(i32) [[INPUT2:%.*]], ptr [[OUT:%.*]], i1 [[COND:%.*]]) #[[ATTR3:[0-9]+]] {
 ; SM_60-NEXT:  [[BB:.*:]]
 ; SM_60-NEXT:    [[OUT7:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1)
@@ -568,7 +568,7 @@ define void @test_select(ptr byval(i32) align 4 %input1, ptr byval(i32) %input2,
 ; SM_60-NEXT:    store i32 [[VALLOADED]], ptr [[OUT8]], align 4
 ; SM_60-NEXT:    ret void
 ;
-; SM_70-LABEL: define void @test_select(
+; SM_70-LABEL: define ptx_kernel void @test_select(
 ; SM_70-SAME: ptr byval(i32) align 4 [[INPUT1:%.*]], ptr byval(i32) [[INPUT2:%.*]], ptr [[OUT:%.*]], i1 [[COND:%.*]]) #[[ATTR3:[0-9]+]] {
 ; SM_70-NEXT:  [[BB:.*:]]
 ; SM_70-NEXT:    [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1)
@@ -582,7 +582,7 @@ define void @test_select(ptr byval(i32) align 4 %input1, ptr byval(i32) %input2,
 ; SM_70-NEXT:    store i32 [[VALLOADED]], ptr [[OUT2]], align 4
 ; SM_70-NEXT:    ret void
 ;
-; COPY-LABEL: define void @test_select(
+; COPY-LABEL: define ptx_kernel void @test_select(
 ; COPY-SAME: ptr byval(i32) align 4 [[INPUT1:%.*]], ptr byval(i32) [[INPUT2:%.*]], ptr [[OUT:%.*]], i1 [[COND:%.*]]) #[[ATTR3:[0-9]+]] {
 ; COPY-NEXT:  [[BB:.*:]]
 ; COPY-NEXT:    [[INPUT23:%.*]] = alloca i32, align 4
@@ -603,8 +603,8 @@ bb:
   ret void
 }
 
-define void @test_select_write(ptr byval(i32) align 4 %input1, ptr byval(i32) %input2, ptr %out, i1 %cond) {
-; SM_60-LABEL: define void @test_select_write(
+define ptx_kernel void @test_select_write(ptr byval(i32) align 4 %input1, ptr byval(i32) %input2, ptr %out, i1 %cond) {
+; SM_60-LABEL: define ptx_kernel void @test_select_write(
 ; SM_60-SAME: ptr byval(i32) align 4 [[INPUT1:%.*]], ptr byval(i32) [[INPUT2:%.*]], ptr [[OUT:%.*]], i1 [[COND:%.*]]) #[[ATTR3]] {
 ; SM_60-NEXT:  [[BB:.*:]]
 ; SM_60-NEXT:    [[OUT5:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1)
@@ -619,7 +619,7 @@ define void @test_select_write(ptr byval(i32) align 4 %input1, ptr byval(i32) %i
 ; SM_60-NEXT:    store i32 1, ptr [[PTRNEW]], align 4
 ; SM_60-NEXT:    ret void
 ;
-; SM_70-LABEL: define void @test_select_write(
+; SM_70-LABEL: define ptx_kernel void @test_select_write(
 ; SM_70-SAME: ptr byval(i32) align 4 [[INPUT1:%.*]], ptr byval(i32) [[INPUT2:%.*]], ptr [[OUT:%.*]], i1 [[COND:%.*]]) #[[ATTR3]] {
 ; SM_70-NEXT:  [[BB:.*:]]
 ; SM_70-NEXT:    [[OUT5:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1)
@@ -634,7 +634,7 @@ define void @test_select_write(ptr byval(i32) align 4 %input1, ptr byval(i32) %i
 ; SM_70-NEXT:    store i32 1, ptr [[PTRNEW]], align 4
 ; SM_70-NEXT:    ret void
 ;
-; COPY-LABEL: define void @test_select_write(
+; COPY-LABEL: define ptx_kernel void @test_select_write(
 ; COPY-SAME: ptr byval(i32) align 4 [[INPUT1:%.*]], ptr byval(i32) [[INPUT2:%.*]], ptr [[OUT:%.*]], i1 [[COND:%.*]]) #[[ATTR3]] {
 ; COPY-NEXT:  [[BB:.*:]]
 ; COPY-NEXT:    [[INPUT23:%.*]] = alloca i32, align 4
@@ -653,8 +653,8 @@ bb:
   ret void
 }
 
-define void @test_phi(ptr byval(%struct.S) align 4 %input1, ptr byval(%struct.S) %input2, ptr %inout, i1 %cond) {
-; SM_60-LABEL: define void @test_phi(
+define ptx_kernel void @test_phi(ptr byval(%struct.S) align 4 %input1, ptr byval(%struct.S) %input2, ptr %inout, i1 %cond) {
+; SM_60-LABEL: define ptx_kernel void @test_phi(
 ; SM_60-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT1:%.*]], ptr byval([[STRUCT_S]]) [[INPUT2:%.*]], ptr [[INOUT:%.*]], i1 [[COND:%.*]]) #[[ATTR3]] {
 ; SM_60-NEXT:  [[BB:.*:]]
 ; SM_60-NEXT:    [[INOUT7:%.*]] = addrspacecast ptr [[INOUT]] to ptr addrspace(1)
@@ -678,7 +678,7 @@ define void @test_phi(ptr byval(%struct.S) align 4 %input1, ptr byval(%struct.S)
 ; SM_60-NEXT:    store i32 [[VALLOADED]], ptr [[INOUT8]], align 4
 ; SM_60-NEXT:    ret void
 ;
-; SM_70-LABEL: define void @test_phi(
+; SM_70-LABEL: define ptx_kernel void @test_phi(
 ; SM_70-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT1:%.*]], ptr byval([[STRUCT_S]]) [[INPUT2:%.*]], ptr [[INOUT:%.*]], i1 [[COND:%.*]]) #[[ATTR3]] {
 ; SM_70-NEXT:  [[BB:.*:]]
 ; SM_70-NEXT:    [[INOUT1:%.*]] = addrspacecast ptr [[INOUT]] to ptr addrspace(1)
@@ -700,7 +700,7 @@ define void @test_phi(ptr byval(%struct.S) align 4 %input1, ptr byval(%struct.S)
 ; SM_70-NEXT:    store i32 [[VALLOADED]], ptr [[INOUT2]], align 4
 ; SM_70-NEXT:    ret void
 ;
-; COPY-LABEL: define void @test_phi(
+; COPY-LABEL: define ptx_kernel void @test_phi(
 ; COPY-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT1:%.*]], ptr byval([[STRUCT_S]]) [[INPUT2:%.*]], ptr [[INOUT:%.*]], i1 [[COND:%.*]]) #[[ATTR3]] {
 ; COPY-NEXT:  [[BB:.*:]]
 ; COPY-NEXT:    [[INPUT23:%.*]] = alloca [[STRUCT_S]], align 8
@@ -740,8 +740,8 @@ merge:                                            ; preds = %second, %first
   ret void
 }
 
-define void @test_phi_write(ptr byval(%struct.S) align 4 %input1, ptr byval(%struct.S) %input2, i1 %cond) {
-; COMMON-LABEL: define void @test_phi_write(
+define ptx_kernel void @test_phi_write(ptr byval(%struct.S) align 4 %input1, ptr byval(%struct.S) %input2, i1 %cond) {
+; COMMON-LABEL: define ptx_kernel void @test_phi_write(
 ; COMMON-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT1:%.*]], ptr byval([[STRUCT_S]]) [[INPUT2:%.*]], i1 [[COND:%.*]]) #[[ATTR3:[0-9]+]] {
 ; COMMON-NEXT:  [[BB:.*:]]
 ; COMMON-NEXT:    [[INPUT24:%.*]] = alloca [[STRUCT_S]], align 8
@@ -784,29 +784,11 @@ attributes #1 = { nocallback nofree nounwind willreturn memory(argmem: readwrite
 attributes #2 = { nocallback nofree nounwind willreturn memory(argmem: write) }
 
 !llvm.module.flags = !{!0, !1, !2, !3}
-!nvvm.annotations = !{!4, !5, !6, !7, !8, !9, !10, !11, !12, !13, !14, !15, !16, !17, !18, !19, !23}
 !llvm.ident = !{!20, !21}
 
 !0 = !{i32 2, !"SDK Version", [2 x i32] [i32 11, i32 8]}
 !1 = !{i32 1, !"wchar_size", i32 4}
 !2 = !{i32 4, !"nvvm-reflect-ftz", i32 0}
 !3 = !{i32 7, !"frame-pointer", i32 2}
-!4 = !{ptr @read_only, !"kernel", i32 1}
-!5 = !{ptr @escape_ptr, !"kernel", i32 1}
-!6 = !{ptr @escape_ptr_gep, !"kernel", i32 1}
-!7 = !{ptr @escape_ptr_store, !"kernel", i32 1}
-!8 = !{ptr @escape_ptr_gep_store, !"kernel", i32 1}
-!9 = !{ptr @escape_ptrtoint, !"kernel", i32 1}
-!10 = !{ptr @memcpy_from_param, !"kernel", i32 1}
-!11 = !{ptr @memcpy_to_param, !"kernel", i32 1}
-!12 = !{ptr @copy_on_store, !"kernel", i32 1}
-!13 = !{ptr @read_only_gep, !"kernel", i32 1}
-!14 = !{ptr @read_only_gep_asc, !"kernel", i32 1}
-!15 = !{ptr @read_only_gep_asc0, !"kernel", i32 1}
-!16 = !{ptr @test_select, !"kernel", i32 1}
-!17 = !{ptr @test_phi, !"kernel", i32 1}
-!18 = !{ptr @test_phi_write, !"kernel", i32 1}
-!19 = !{ptr @test_select_write, !"kernel", i32 1}
 !20 = !{!"clang version 20.0.0git"}
 !21 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"}
-!23 = !{ptr @memcpy_from_param_noalign, !"kernel", i32 1}
diff --git a/llvm/test/CodeGen/NVPTX/lower-ctor-dtor.ll b/llvm/test/CodeGen/NVPTX/lower-ctor-dtor.ll
index f8b3b4b..4ee1ca3 100644
--- a/llvm/test/CodeGen/NVPTX/lower-ctor-dtor.ll
+++ b/llvm/test/CodeGen/NVPTX/lower-ctor-dtor.ll
@@ -43,7 +43,7 @@ define internal void @bar() {
   ret void
 }
 
-; CHECK-LABEL: define weak_odr void @"nvptx$device$init"() {
+; CHECK-LABEL: define weak_odr ptx_kernel void @"nvptx$device$init"() {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[BEGIN:%.*]] = load ptr addrspace(1), ptr addrspace(1) @__init_array_start, align 8
 ; CHECK-NEXT:    [[STOP:%.*]] = load ptr addrspace(1), ptr addrspace(1) @__init_array_end, align 8
@@ -60,7 +60,7 @@ define internal void @bar() {
 ; CHECK-NEXT:    ret void
 ;
 ;
-; CHECK-LABEL: define weak_odr void @"nvptx$device$fini"() {
+; CHECK-LABEL: define weak_odr ptx_kernel void @"nvptx$device$fini"() {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[BEGIN:%.*]] = load ptr addrspace(1), ptr addrspace(1) @__fini_array_start, align 8
 ; CHECK-NEXT:    [[STOP:%.*]] = load ptr addrspace(1), ptr addrspace(1) @__fini_array_end, align 8
@@ -82,12 +82,10 @@ define internal void @bar() {
 ; CHECK:       while.end:
 ; CHECK-NEXT:    ret void
 
-; CHECK: [[META0:![0-9]+]] = !{ptr @"nvptx$device$init", !"kernel", i32 1}
 ; CHECK: [[META1:![0-9]+]] = !{ptr @"nvptx$device$init", !"maxntidx", i32 1}
 ; CHECK: [[META2:![0-9]+]] = !{ptr @"nvptx$device$init", !"maxntidy", i32 1}
 ; CHECK: [[META3:![0-9]+]] = !{ptr @"nvptx$device$init", !"maxntidz", i32 1}
 ; CHECK: [[META4:![0-9]+]] = !{ptr @"nvptx$device$init", !"maxclusterrank", i32 1}
-; CHECK: [[META5:![0-9]+]] = !{ptr @"nvptx$device$fini", !"kernel", i32 1}
 ; CHECK: [[META6:![0-9]+]] = !{ptr @"nvptx$device$fini", !"maxntidx", i32 1}
 ; CHECK: [[META7:![0-9]+]] = !{ptr @"nvptx$device$fini", !"maxntidy", i32 1}
 ; CHECK: [[META8:![0-9]+]] = !{ptr @"nvptx$device$fini", !"maxntidz", i32 1}
diff --git a/llvm/test/CodeGen/NVPTX/lower-kernel-ptr-arg.ll b/llvm/test/CodeGen/NVPTX/lower-kernel-ptr-arg.ll
index 9ec690a..2e64c25 100644
--- a/llvm/test/CodeGen/NVPTX/lower-kernel-ptr-arg.ll
+++ b/llvm/test/CodeGen/NVPTX/lower-kernel-ptr-arg.ll
@@ -6,7 +6,7 @@ target triple = "nvptx64-nvidia-cuda"
 
 ; Verify that both %input and %output are converted to global pointers and then
 ; addrspacecast'ed back to the original type.
-define void @kernel(ptr %input, ptr %output) {
+define ptx_kernel void @kernel(ptr %input, ptr %output) {
 ; CHECK-LABEL: .visible .entry kernel(
 ; CHECK: cvta.to.global.u64
 ; CHECK: cvta.to.global.u64
@@ -17,7 +17,7 @@ define void @kernel(ptr %input, ptr %output) {
   ret void
 }
 
-define void @kernel2(ptr addrspace(1) %input, ptr addrspace(1) %output) {
+define ptx_kernel void @kernel2(ptr addrspace(1) %input, ptr addrspace(1) %output) {
 ; CHECK-LABEL: .visible .entry kernel2(
 ; CHECK-NOT: cvta.to.global.u64
   %1 = load float, ptr addrspace(1) %input, align 4
@@ -29,7 +29,7 @@ define void @kernel2(ptr addrspace(1) %input, ptr addrspace(1) %output) {
 
 %struct.S = type { ptr, ptr }
 
-define void @ptr_in_byval_kernel(ptr byval(%struct.S) %input, ptr %output) {
+define ptx_kernel void @ptr_in_byval_kernel(ptr byval(%struct.S) %input, ptr %output) {
 ; CHECK-LABEL: .visible .entry ptr_in_byval_kernel(
 ; CHECK: ld.param.u64 	%[[optr:rd.*]], [ptr_in_byval_kernel_param_1]
 ; CHECK: cvta.to.global.u64 %[[optr_g:.*]], %[[optr]];
@@ -60,7 +60,3 @@ define void @ptr_in_byval_func(ptr byval(%struct.S) %input, ptr %output) {
   ret void
 }
 
-!nvvm.annotations = !{!0, !1, !2}
-!0 = !{ptr @kernel, !"kernel", i32 1}
-!1 = !{ptr @kernel2, !"kernel", i32 1}
-!2 = !{ptr @ptr_in_byval_kernel, !"kernel", i32 1}
diff --git a/llvm/test/CodeGen/NVPTX/maxclusterrank.ll b/llvm/test/CodeGen/NVPTX/maxclusterrank.ll
index 3389e09..c445c34 100644
--- a/llvm/test/CodeGen/NVPTX/maxclusterrank.ll
+++ b/llvm/test/CodeGen/NVPTX/maxclusterrank.ll
@@ -11,16 +11,15 @@ target triple = "nvptx64-unknown-unknown"
 
 ; Make sure that for SM version prior to 90 `.maxclusterrank` directive is
 ; sielently ignored.
-define dso_local void @_Z18TestMaxClusterRankv() {
+define dso_local ptx_kernel void @_Z18TestMaxClusterRankv() {
 entry:
   %a = alloca i32, align 4
   store volatile i32 1, ptr %a, align 4
   ret void
 }
 
-!nvvm.annotations = !{!0, !1, !2, !3}
+!nvvm.annotations = !{!1, !2, !3}
 
-!0 = !{ptr @_Z18TestMaxClusterRankv, !"kernel", i32 1}
 !1 = !{ptr @_Z18TestMaxClusterRankv, !"maxntidx", i32 128}
 !2 = !{ptr @_Z18TestMaxClusterRankv, !"minctasm", i32 2}
 !3 = !{ptr @_Z18TestMaxClusterRankv, !"maxclusterrank", i32 8}
diff --git a/llvm/test/CodeGen/NVPTX/noduplicate-syncthreads.ll b/llvm/test/CodeGen/NVPTX/noduplicate-syncthreads.ll
index 2bc6d4c..2a0c5ab 100644
--- a/llvm/test/CodeGen/NVPTX/noduplicate-syncthreads.ll
+++ b/llvm/test/CodeGen/NVPTX/noduplicate-syncthreads.ll
@@ -66,7 +66,4 @@ if.end17:                                         ; preds = %if.else13, %if.then
 }
 
 ; Function Attrs: noduplicate nounwind
-declare void @llvm.nvvm.barrier0() #2
-
-!0 = !{ptr @foo, !"kernel", i32 1}
-!1 = !{null, !"align", i32 8}
+declare void @llvm.nvvm.barrier0() #2
+\ No newline at end of file
diff --git a/llvm/test/CodeGen/NVPTX/noreturn.ll b/llvm/test/CodeGen/NVPTX/noreturn.ll
index 2161d70..6c11d0a 100644
--- a/llvm/test/CodeGen/NVPTX/noreturn.ll
+++ b/llvm/test/CodeGen/NVPTX/noreturn.ll
@@ -27,7 +27,7 @@ define void @true_noreturn0() #0 {
 ; CHECK: .entry ignore_kernel_noreturn()
 ; CHECK-NOT: .noreturn
 
-define void @ignore_kernel_noreturn() #0 {
+define ptx_kernel void @ignore_kernel_noreturn() #0 {
   unreachable
 }
 
@@ -35,7 +35,7 @@ define void @ignore_kernel_noreturn() #0 {
 ; CHECK: prototype_{{[0-9]+}} : .callprototype ()_ (.param .b32 _) .noreturn;
 ; CHECK: prototype_{{[0-9]+}} : .callprototype (.param .b32 _) _ (.param .b32 _);
 
-define void @callprototype_noreturn(i32) {
+define ptx_kernel void @callprototype_noreturn(i32) {
   %fn = load ptr, ptr addrspace(1) @function_pointer
   call void %fn(i32 %0) #0
   %non_void = bitcast ptr %fn to ptr
@@ -44,8 +44,3 @@ define void @callprototype_noreturn(i32) {
 }
 
 attributes #0 = { noreturn }
-
-!nvvm.annotations = !{!0, !1}
-
-!0 = !{ptr @ignore_kernel_noreturn, !"kernel", i32 1}
-!1 = !{ptr @callprototype_noreturn, !"kernel", i32 1}
diff --git a/llvm/test/CodeGen/NVPTX/nvcl-param-align.ll b/llvm/test/CodeGen/NVPTX/nvcl-param-align.ll
index 48162ea..9a78d31 100644
--- a/llvm/test/CodeGen/NVPTX/nvcl-param-align.ll
+++ b/llvm/test/CodeGen/NVPTX/nvcl-param-align.ll
@@ -3,7 +3,7 @@
 
 target triple = "nvptx-unknown-nvcl"
 
-define void @foo(i64 %img, i64 %sampler, ptr align 32 %v1, ptr %v2) {
+define ptx_kernel void @foo(i64 %img, i64 %sampler, ptr align 32 %v1, ptr %v2) {
 ; The parameter alignment is determined by the align attribute (default 1).
 ; CHECK-LABEL: .entry foo(
 ; CHECK: .param .u64 .ptr .align 32 foo_param_2
@@ -11,7 +11,6 @@ define void @foo(i64 %img, i64 %sampler, ptr align 32 %v1, ptr %v2) {
   ret void
 }
 
-!nvvm.annotations = !{!1, !2, !3}
-!1 = !{ptr @foo, !"kernel", i32 1}
+!nvvm.annotations = !{!2, !3}
 !2 = !{ptr @foo, !"rdoimage", i32 0}
 !3 = !{ptr @foo, !"sampler", i32 1}
diff --git a/llvm/test/CodeGen/NVPTX/nvvm-reflect-arch.ll b/llvm/test/CodeGen/NVPTX/nvvm-reflect-arch.ll
index ac5875c..83cb3cd 100644
--- a/llvm/test/CodeGen/NVPTX/nvvm-reflect-arch.ll
+++ b/llvm/test/CodeGen/NVPTX/nvvm-reflect-arch.ll
@@ -1,9 +1,9 @@
 ; Libdevice in recent CUDA versions relies on __CUDA_ARCH reflecting GPU type.
 ; Verify that __nvvm_reflect() is replaced with an appropriate value.
 ;
-; RUN: opt %s -S -passes='default<O2>' -mtriple=nvptx64 -mcpu=sm_20 \
+; RUN: opt %s -S -passes='nvvm-reflect' -mtriple=nvptx64 -mcpu=sm_20 \
 ; RUN:   | FileCheck %s --check-prefixes=COMMON,SM20
-; RUN: opt %s -S -passes='default<O2>' -mtriple=nvptx64 -mcpu=sm_35 \
+; RUN: opt %s -S -passes='nvvm-reflect' -mtriple=nvptx64 -mcpu=sm_35 \
 ; RUN:   | FileCheck %s --check-prefixes=COMMON,SM35
 
 @"$str" = private addrspace(1) constant [12 x i8] c"__CUDA_ARCH\00"
diff --git a/llvm/test/CodeGen/NVPTX/nvvm-reflect-ocl.ll b/llvm/test/CodeGen/NVPTX/nvvm-reflect-ocl.ll
index 9d38321..bf8d6e2 100644
--- a/llvm/test/CodeGen/NVPTX/nvvm-reflect-ocl.ll
+++ b/llvm/test/CodeGen/NVPTX/nvvm-reflect-ocl.ll
@@ -1,8 +1,8 @@
 ; Verify that __nvvm_reflect_ocl() is replaced with an appropriate value
 ;
-; RUN: opt %s -S -passes='default<O2>' -mtriple=nvptx64 -mcpu=sm_20 \
+; RUN: opt %s -S -passes='nvvm-reflect' -mtriple=nvptx64 -mcpu=sm_20 \
 ; RUN:   | FileCheck %s --check-prefixes=COMMON,SM20
-; RUN: opt %s -S -passes='default<O2>' -mtriple=nvptx64 -mcpu=sm_35 \
+; RUN: opt %s -S -passes='nvvm-reflect' -mtriple=nvptx64 -mcpu=sm_35 \
 ; RUN:   | FileCheck %s --check-prefixes=COMMON,SM35
 
 @"$str" = private addrspace(4) constant [12 x i8] c"__CUDA_ARCH\00"
diff --git a/llvm/test/CodeGen/NVPTX/nvvm-reflect-opaque.ll b/llvm/test/CodeGen/NVPTX/nvvm-reflect-opaque.ll
index 46ab79d..19c74df 100644
--- a/llvm/test/CodeGen/NVPTX/nvvm-reflect-opaque.ll
+++ b/llvm/test/CodeGen/NVPTX/nvvm-reflect-opaque.ll
@@ -3,12 +3,12 @@
 
 ; RUN: cat %s > %t.noftz
 ; RUN: echo '!0 = !{i32 4, !"nvvm-reflect-ftz", i32 0}' >> %t.noftz
-; RUN: opt %t.noftz -S -mtriple=nvptx-nvidia-cuda -passes='default<O2>' \
+; RUN: opt %t.noftz -S -mtriple=nvptx-nvidia-cuda -passes='nvvm-reflect,simplifycfg' \
 ; RUN:   | FileCheck %s --check-prefix=USE_FTZ_0 --check-prefix=CHECK
 
 ; RUN: cat %s > %t.ftz
 ; RUN: echo '!0 = !{i32 4, !"nvvm-reflect-ftz", i32 1}' >> %t.ftz
-; RUN: opt %t.ftz -S -mtriple=nvptx-nvidia-cuda -passes='default<O2>' \
+; RUN: opt %t.ftz -S -mtriple=nvptx-nvidia-cuda -passes='nvvm-reflect,simplifycfg' \
 ; RUN:   | FileCheck %s --check-prefix=USE_FTZ_1 --check-prefix=CHECK
 
 @str = private unnamed_addr addrspace(4) constant [11 x i8] c"__CUDA_FTZ\00"
@@ -43,7 +43,7 @@ exit:
 
 declare i32 @llvm.nvvm.reflect(ptr)
 
-; CHECK-LABEL: define noundef i32 @intrinsic
+; CHECK-LABEL: define i32 @intrinsic
 define i32 @intrinsic() {
 ; CHECK-NOT: call i32 @llvm.nvvm.reflect
 ; USE_FTZ_0: ret i32 0
diff --git a/llvm/test/CodeGen/NVPTX/nvvm-reflect.ll b/llvm/test/CodeGen/NVPTX/nvvm-reflect.ll
index 2ed9f7c..244b44f 100644
--- a/llvm/test/CodeGen/NVPTX/nvvm-reflect.ll
+++ b/llvm/test/CodeGen/NVPTX/nvvm-reflect.ll
@@ -3,12 +3,12 @@
 
 ; RUN: cat %s > %t.noftz
 ; RUN: echo '!0 = !{i32 4, !"nvvm-reflect-ftz", i32 0}' >> %t.noftz
-; RUN: opt %t.noftz -S -mtriple=nvptx-nvidia-cuda -passes='default<O2>' \
+; RUN: opt %t.noftz -S -mtriple=nvptx-nvidia-cuda -passes='nvvm-reflect,simplifycfg' \
 ; RUN:   | FileCheck %s --check-prefix=USE_FTZ_0 --check-prefix=CHECK
 
 ; RUN: cat %s > %t.ftz
 ; RUN: echo '!0 = !{i32 4, !"nvvm-reflect-ftz", i32 1}' >> %t.ftz
-; RUN: opt %t.ftz -S -mtriple=nvptx-nvidia-cuda -passes='default<O2>' \
+; RUN: opt %t.ftz -S -mtriple=nvptx-nvidia-cuda -passes='nvvm-reflect,simplifycfg' \
 ; RUN:   | FileCheck %s --check-prefix=USE_FTZ_1 --check-prefix=CHECK
 
 @str = private unnamed_addr addrspace(4) constant [11 x i8] c"__CUDA_FTZ\00"
@@ -43,7 +43,8 @@ exit:
 
 declare i32 @llvm.nvvm.reflect(ptr)
 
-; CHECK-LABEL: define noundef i32 @intrinsic
+; CHECK-LABEL: define i32 @intrinsic
+
 define i32 @intrinsic() {
 ; CHECK-NOT: call i32 @llvm.nvvm.reflect
 ; USE_FTZ_0: ret i32 0
diff --git a/llvm/test/CodeGen/NVPTX/refl1.ll b/llvm/test/CodeGen/NVPTX/refl1.ll
index 34db3bb..99b83f4 100644
--- a/llvm/test/CodeGen/NVPTX/refl1.ll
+++ b/llvm/test/CodeGen/NVPTX/refl1.ll
@@ -5,7 +5,7 @@ target triple = "nvptx-nvidia-cuda"
 
 ; Function Attrs: nounwind
 ; CHECK: .entry foo
-define void @foo(ptr nocapture %a) #0 {
+define ptx_kernel void @foo(ptr nocapture %a) #0 {
   %val = load float, ptr %a
   %tan = tail call fastcc float @__nv_fast_tanf(float %val)
   store float %tan, ptr %a
@@ -34,7 +34,3 @@ entry:
 attributes #0 = { nounwind }
 attributes #1 = { nounwind readnone }
 attributes #2 = { alwaysinline inlinehint nounwind readnone }
-
-!nvvm.annotations = !{!0}
-
-!0 = !{ptr @foo, !"kernel", i32 1}
diff --git a/llvm/test/CodeGen/NVPTX/reg-copy.ll b/llvm/test/CodeGen/NVPTX/reg-copy.ll
index f66ef19..20396c4 100644
--- a/llvm/test/CodeGen/NVPTX/reg-copy.ll
+++ b/llvm/test/CodeGen/NVPTX/reg-copy.ll
@@ -4,7 +4,7 @@
 target datalayout = "e-i64:64-v16:16-v32:32-n16:32:64"
 target triple = "nvptx64-unknown-unknown"
 
-define void @PR24303(ptr %f) {
+define ptx_kernel void @PR24303(ptr %f) {
 ; CHECK-LABEL: .visible .entry PR24303(
 ; Do not use mov.f or mov.u to convert between float and int.
 ; CHECK-NOT: mov.{{f|u}}{{32|64}} %f{{[0-9]+}}, %r{{[0-9]+}}
@@ -217,7 +217,3 @@ _ZN12cuda_builtinmlIfEENS_7complexIT_EERKS3_S5_.exit: ; preds = %if.then.93.i, %
 }
 
 declare float @llvm.nvvm.fabs.f(float)
-
-!nvvm.annotations = !{!0}
-
-!0 = !{ptr @PR24303, !"kernel", i32 1}
diff --git a/llvm/test/CodeGen/NVPTX/simple-call.ll b/llvm/test/CodeGen/NVPTX/simple-call.ll
index 3580604..991ae04 100644
--- a/llvm/test/CodeGen/NVPTX/simple-call.ll
+++ b/llvm/test/CodeGen/NVPTX/simple-call.ll
@@ -10,7 +10,7 @@ define float @device_func(float %a) noinline {
 }
 
 ; CHECK: .entry kernel_func
-define void @kernel_func(ptr %a) {
+define ptx_kernel void @kernel_func(ptr %a) {
   %val = load float, ptr %a
 ; CHECK: call.uni (retval0),
 ; CHECK: device_func,
@@ -18,9 +18,3 @@ define void @kernel_func(ptr %a) {
   store float %mul, ptr %a
   ret void
 }
-
-
-
-!nvvm.annotations = !{!1}
-
-!1 = !{ptr @kernel_func, !"kernel", i32 1}
diff --git a/llvm/test/CodeGen/NVPTX/surf-read-cuda.ll b/llvm/test/CodeGen/NVPTX/surf-read-cuda.ll
index 504dcde..7a7904a 100644
--- a/llvm/test/CodeGen/NVPTX/surf-read-cuda.ll
+++ b/llvm/test/CodeGen/NVPTX/surf-read-cuda.ll
@@ -10,7 +10,7 @@ declare i32 @llvm.nvvm.suld.1d.i32.trap(i64, i32)
 declare i64 @llvm.nvvm.texsurf.handle.internal.p1(ptr addrspace(1))
 
 
-define void @foo(i64 %img, ptr %red, i32 %idx) {
+define ptx_kernel void @foo(i64 %img, ptr %red, i32 %idx) {
 ; CHECK-LABEL: foo(
 ; CHECK:       {
 ; CHECK-NEXT:    .reg .b32 %r<3>;
@@ -34,7 +34,7 @@ define void @foo(i64 %img, ptr %red, i32 %idx) {
 
 @surf0 = internal addrspace(1) global i64 0, align 8
 
-define void @bar(ptr %red, i32 %idx) {
+define ptx_kernel void @bar(ptr %red, i32 %idx) {
 ; CHECK-LABEL: bar(
 ; CHECK:       {
 ; CHECK-NEXT:    .reg .b32 %r<3>;
@@ -56,11 +56,5 @@ define void @bar(ptr %red, i32 %idx) {
   ret void
 }
 
-
-
-
-!nvvm.annotations = !{!1, !2, !3}
-!1 = !{ptr @foo, !"kernel", i32 1}
-!2 = !{ptr @bar, !"kernel", i32 1}
-!3 = !{ptr addrspace(1) @surf0, !"surface", i32 1}
-
+!nvvm.annotations = !{!1}
+!1 = !{ptr addrspace(1) @surf0, !"surface", i32 1}
diff --git a/llvm/test/CodeGen/NVPTX/surf-read.ll b/llvm/test/CodeGen/NVPTX/surf-read.ll
index e0cebd6..cd11b56 100644
--- a/llvm/test/CodeGen/NVPTX/surf-read.ll
+++ b/llvm/test/CodeGen/NVPTX/surf-read.ll
@@ -6,7 +6,7 @@ target triple = "nvptx64-unknown-nvcl"
 declare i32 @llvm.nvvm.suld.1d.i32.trap(i64, i32)
 
 ; CHECK: .entry foo
-define void @foo(i64 %img, ptr %red, i32 %idx) {
+define ptx_kernel void @foo(i64 %img, ptr %red, i32 %idx) {
 ; CHECK: suld.b.1d.b32.trap {%r[[RED:[0-9]+]]}, [foo_param_0, {%r{{[0-9]+}}}]
   %val = tail call i32 @llvm.nvvm.suld.1d.i32.trap(i64 %img, i32 %idx)
 ; CHECK: cvt.rn.f32.s32 %f[[REDF:[0-9]+]], %r[[RED]]
@@ -16,6 +16,5 @@ define void @foo(i64 %img, ptr %red, i32 %idx) {
   ret void
 }
 
-!nvvm.annotations = !{!1, !2}
-!1 = !{ptr @foo, !"kernel", i32 1}
-!2 = !{ptr @foo, !"rdwrimage", i32 0}
+!nvvm.annotations = !{!1}
+!1 = !{ptr @foo, !"rdwrimage", i32 0}
diff --git a/llvm/test/CodeGen/NVPTX/surf-tex.py b/llvm/test/CodeGen/NVPTX/surf-tex.py
index 9607a58..90d6766 100644
--- a/llvm/test/CodeGen/NVPTX/surf-tex.py
+++ b/llvm/test/CodeGen/NVPTX/surf-tex.py
@@ -224,11 +224,6 @@ def get_ptx_surface(target):
 def get_surface_metadata(target, fun_ty, fun_name, has_surface_param):
     metadata = []
 
-    md_kernel = '!{{{fun_ty} @{fun_name}, !"kernel", i32 1}}'.format(
-        fun_ty=fun_ty, fun_name=fun_name
-    )
-    metadata.append(md_kernel)
-
     if target == "cuda":
         # When a parameter is lowered as a .surfref, it still has the
         # corresponding ld.param.u64, which is illegal. Do not emit the
@@ -263,14 +258,14 @@ def gen_suld_tests(target, global_surf):
   ; CHECK-LABEL: .entry ${test_name}_param
   ; CHECK: ${instruction} ${reg_ret}, [${reg_surf}, ${reg_access}]
   ;
-  define void @${test_name}_param(i64 %s, ${retty}* %ret, ${access}) {
+  define ptx_kernel void @${test_name}_param(i64 %s, ${retty}* %ret, ${access}) {
     %val = tail call ${retty} @${intrinsic}(i64 %s, ${access})
     store ${retty} %val, ${retty}* %ret
     ret void
   }
   ; CHECK-LABEL: .entry ${test_name}_global
   ; CHECK: ${instruction} ${reg_ret}, [${global_surf}, ${reg_access}]
-  define void @${test_name}_global(${retty}* %ret, ${access}) {
+  define ptx_kernel void @${test_name}_global(${retty}* %ret, ${access}) {
     %gs = tail call i64 @llvm.nvvm.texsurf.handle.internal.p1i64(i64 addrspace(1)* @${global_surf})
     %val = tail call ${retty} @${intrinsic}(i64 %gs, ${access})
     store ${retty} %val, ${retty}* %ret
@@ -356,13 +351,13 @@ def gen_sust_tests(target, global_surf):
   ; CHECK-LABEL: .entry ${test_name}_param
   ; CHECK: ${instruction} [${reg_surf}, ${reg_access}], ${reg_value}
   ;
-  define void @${test_name}_param(i64 %s, ${value}, ${access}) {
+  define ptx_kernel void @${test_name}_param(i64 %s, ${value}, ${access}) {
     tail call void @${intrinsic}(i64 %s, ${access}, ${value})
     ret void
   }
   ; CHECK-LABEL: .entry ${test_name}_global
   ; CHECK: ${instruction} [${global_surf}, ${reg_access}], ${reg_value}
-  define void @${test_name}_global(${value}, ${access}) {
+  define ptx_kernel void @${test_name}_global(${value}, ${access}) {
     %gs = tail call i64 @llvm.nvvm.texsurf.handle.internal.p1i64(i64 addrspace(1)* @${global_surf})
     tail call void @${intrinsic}(i64 %gs, ${access}, ${value})
     ret void
@@ -420,19 +415,13 @@ def gen_sust_tests(target, global_surf):
         generated_items.append((params["intrinsic"], params["instruction"]))
 
         fun_name = test_name + "_param"
-        fun_ty = "void (i64, {value_ty}, {access_ty})*".format(
-            value_ty=get_llvm_value_type(vec, ctype),
-            access_ty=get_llvm_surface_access_type(geom),
-        )
+        fun_ty = "ptr"
         generated_metadata += get_surface_metadata(
             target, fun_ty, fun_name, has_surface_param=True
         )
 
         fun_name = test_name + "_global"
-        fun_ty = "void ({value_ty}, {access_ty})*".format(
-            value_ty=get_llvm_value_type(vec, ctype),
-            access_ty=get_llvm_surface_access_type(geom),
-        )
+        fun_ty = "ptr"
         generated_metadata += get_surface_metadata(
             target, fun_ty, fun_name, has_surface_param=False
         )
@@ -559,11 +548,6 @@ def get_ptx_global_sampler(target, global_sampler):
 def get_texture_metadata(target, fun_ty, fun_name, has_texture_params):
     metadata = []
 
-    md_kernel = '!{{{fun_ty} @{fun_name}, !"kernel", i32 1}}'.format(
-        fun_ty=fun_ty, fun_name=fun_name
-    )
-    metadata.append(md_kernel)
-
     if target == "cuda":
         # When a parameter is lowered as a .texref, it still has the
         # corresponding ld.param.u64, which is illegal. Do not emit the
@@ -615,14 +599,14 @@ def gen_tex_tests(target, global_tex, global_sampler):
 
   ; CHECK-LABEL: .entry ${test_name}_param
   ; CHECK: ${instruction} ${ptx_ret}, [${ptx_tex}, ${ptx_access}]
-  define void @${test_name}_param(i64 %tex, ${sampler} ${retty}* %ret, ${access}) {
+  define ptx_kernel void @${test_name}_param(i64 %tex, ${sampler} ${retty}* %ret, ${access}) {
     %val = tail call ${retty} @${intrinsic}(i64 %tex, ${sampler} ${access})
     store ${retty} %val, ${retty}* %ret
     ret void
   }
   ; CHECK-LABEL: .entry ${test_name}_global
   ; CHECK: ${instruction} ${ptx_ret}, [${global_tex}, ${ptx_global_sampler} ${ptx_access}]
-  define void @${test_name}_global(${retty}* %ret, ${access}) {
+  define ptx_kernel void @${test_name}_global(${retty}* %ret, ${access}) {
     %gt = tail call i64 @llvm.nvvm.texsurf.handle.internal.p1i64(i64 addrspace(1)* @${global_tex})
     ${get_sampler_handle}
     %val = tail call ${retty} @${intrinsic}(i64 %gt, ${sampler} ${access})
@@ -799,14 +783,14 @@ def gen_tld4_tests(target, global_tex, global_sampler):
 
   ; CHECK-LABEL: .entry ${test_name}_param
   ; CHECK: ${instruction} ${ptx_ret}, [${ptx_tex}, ${ptx_access}]
-  define void @${test_name}_param(i64 %tex, ${sampler} ${retty}* %ret, ${access}) {
+  define ptx_kernel void @${test_name}_param(i64 %tex, ${sampler} ${retty}* %ret, ${access}) {
     %val = tail call ${retty} @${intrinsic}(i64 %tex, ${sampler} ${access})
     store ${retty} %val, ${retty}* %ret
     ret void
   }
   ; CHECK-LABEL: .entry ${test_name}_global
   ; CHECK: ${instruction} ${ptx_ret}, [${global_tex}, ${ptx_global_sampler} ${ptx_access}]
-  define void @${test_name}_global(${retty}* %ret, ${access}) {
+  define ptx_kernel void @${test_name}_global(${retty}* %ret, ${access}) {
     %gt = tail call i64 @llvm.nvvm.texsurf.handle.internal.p1i64(i64 addrspace(1)* @${global_tex})
     ${get_sampler_handle}
     %val = tail call ${retty} @${intrinsic}(i64 %gt, ${sampler} ${access})
diff --git a/llvm/test/CodeGen/NVPTX/surf-write-cuda.ll b/llvm/test/CodeGen/NVPTX/surf-write-cuda.ll
index 881ea45..5dc44cb 100644
--- a/llvm/test/CodeGen/NVPTX/surf-write-cuda.ll
+++ b/llvm/test/CodeGen/NVPTX/surf-write-cuda.ll
@@ -10,7 +10,7 @@ declare void @llvm.nvvm.sust.b.1d.i32.trap(i64, i32, i32)
 declare i64 @llvm.nvvm.texsurf.handle.internal.p1(ptr addrspace(1))
 
 
-define void @foo(i64 %img, i32 %val, i32 %idx) {
+define ptx_kernel void @foo(i64 %img, i32 %val, i32 %idx) {
 ; CHECK-LABEL: foo(
 ; CHECK:       {
 ; CHECK-NEXT:    .reg .b32 %r<3>;
@@ -30,7 +30,7 @@ define void @foo(i64 %img, i32 %val, i32 %idx) {
 @surf0 = internal addrspace(1) global i64 0, align 8
 
 
-define void @bar(i32 %val, i32 %idx) {
+define ptx_kernel void @bar(i32 %val, i32 %idx) {
 ; CHECK-LABEL: bar(
 ; CHECK:       {
 ; CHECK-NEXT:    .reg .b32 %r<3>;
@@ -47,8 +47,6 @@ define void @bar(i32 %val, i32 %idx) {
 }
 
 
-!nvvm.annotations = !{!1, !2, !3}
-!1 = !{ptr @foo, !"kernel", i32 1}
-!2 = !{ptr @bar, !"kernel", i32 1}
-!3 = !{ptr addrspace(1) @surf0, !"surface", i32 1}
+!nvvm.annotations = !{!1}
+!1 = !{ptr addrspace(1) @surf0, !"surface", i32 1}
 
diff --git a/llvm/test/CodeGen/NVPTX/surf-write.ll b/llvm/test/CodeGen/NVPTX/surf-write.ll
index 258bb6d..0e1f0cc 100644
--- a/llvm/test/CodeGen/NVPTX/surf-write.ll
+++ b/llvm/test/CodeGen/NVPTX/surf-write.ll
@@ -6,12 +6,11 @@ target triple = "nvptx-unknown-nvcl"
 declare void @llvm.nvvm.sust.b.1d.i32.trap(i64, i32, i32)
 
 ; CHECK: .entry foo
-define void @foo(i64 %img, i32 %val, i32 %idx) {
+define ptx_kernel void @foo(i64 %img, i32 %val, i32 %idx) {
 ; CHECK: sust.b.1d.b32.trap [foo_param_0, {%r{{[0-9]+}}}], {%r{{[0-9]+}}}
   tail call void @llvm.nvvm.sust.b.1d.i32.trap(i64 %img, i32 %idx, i32 %val)
   ret void
 }
 
-!nvvm.annotations = !{!1, !2}
-!1 = !{ptr @foo, !"kernel", i32 1}
-!2 = !{ptr @foo, !"wroimage", i32 0}
+!nvvm.annotations = !{!1}
+!1 = !{ptr @foo, !"wroimage", i32 0}
diff --git a/llvm/test/CodeGen/NVPTX/tex-read-cuda.ll b/llvm/test/CodeGen/NVPTX/tex-read-cuda.ll
index ba556d2d..61837bd 100644
--- a/llvm/test/CodeGen/NVPTX/tex-read-cuda.ll
+++ b/llvm/test/CodeGen/NVPTX/tex-read-cuda.ll
@@ -10,7 +10,7 @@ target triple = "nvptx-unknown-cuda"
 declare { float, float, float, float } @llvm.nvvm.tex.unified.1d.v4f32.s32(i64, i32)
 declare i64 @llvm.nvvm.texsurf.handle.internal.p1(ptr addrspace(1))
 
-define void @foo(i64 %img, ptr %red, i32 %idx) {
+define ptx_kernel void @foo(i64 %img, ptr %red, i32 %idx) {
 ; CHECK-LABEL: foo(
 ; CHECK:       {
 ; CHECK-NEXT:    .reg .b32 %r<2>;
@@ -34,7 +34,7 @@ define void @foo(i64 %img, ptr %red, i32 %idx) {
 
 @tex0 = internal addrspace(1) global i64 0, align 8
 
-define void @bar(ptr %red, i32 %idx) {
+define ptx_kernel void @bar(ptr %red, i32 %idx) {
 ; CHECK-LABEL: bar(
 ; CHECK:       {
 ; CHECK-NEXT:    .reg .b32 %r<2>;
@@ -57,7 +57,7 @@ define void @bar(ptr %red, i32 %idx) {
 
 declare float @texfunc(i64)
 
-define void @baz(ptr %red, i32 %idx) {
+define ptx_kernel void @baz(ptr %red, i32 %idx) {
 ; CHECK-LABEL: baz(
 ; CHECK:       {
 ; CHECK-NEXT:    .reg .b32 %r<2>;
@@ -93,8 +93,5 @@ define void @baz(ptr %red, i32 %idx) {
   ret void
 }
 
-!nvvm.annotations = !{!1, !2, !3, !4}
-!1 = !{ptr @foo, !"kernel", i32 1}
-!2 = !{ptr @bar, !"kernel", i32 1}
-!3 = !{ptr addrspace(1) @tex0, !"texture", i32 1}
-!4 = !{ptr @baz, !"kernel", i32 1}
+!nvvm.annotations = !{!1}
+!1 = !{ptr addrspace(1) @tex0, !"texture", i32 1}
diff --git a/llvm/test/CodeGen/NVPTX/tex-read.ll b/llvm/test/CodeGen/NVPTX/tex-read.ll
index d11aea4..d74c89f5 100644
--- a/llvm/test/CodeGen/NVPTX/tex-read.ll
+++ b/llvm/test/CodeGen/NVPTX/tex-read.ll
@@ -6,7 +6,7 @@ target triple = "nvptx64-unknown-nvcl"
 declare { float, float, float, float } @llvm.nvvm.tex.1d.v4f32.s32(i64, i64, i32)
 
 ; CHECK: .entry foo
-define void @foo(i64 %img, i64 %sampler, ptr %red, i32 %idx) {
+define ptx_kernel void @foo(i64 %img, i64 %sampler, ptr %red, i32 %idx) {
 ; CHECK: tex.1d.v4.f32.s32 {%f[[RED:[0-9]+]], %f[[GREEN:[0-9]+]], %f[[BLUE:[0-9]+]], %f[[ALPHA:[0-9]+]]}, [foo_param_0, foo_param_1, {%r{{[0-9]+}}}]
   %val = tail call { float, float, float, float } @llvm.nvvm.tex.1d.v4f32.s32(i64 %img, i64 %sampler, i32 %idx)
   %ret = extractvalue { float, float, float, float } %val, 0
@@ -15,7 +15,6 @@ define void @foo(i64 %img, i64 %sampler, ptr %red, i32 %idx) {
   ret void
 }
 
-!nvvm.annotations = !{!1, !2, !3}
-!1 = !{ptr @foo, !"kernel", i32 1}
+!nvvm.annotations = !{!2, !3}
 !2 = !{ptr @foo, !"rdoimage", i32 0}
 !3 = !{ptr @foo, !"sampler", i32 1}
diff --git a/llvm/test/CodeGen/NVPTX/unreachable.ll b/llvm/test/CodeGen/NVPTX/unreachable.ll
index 286f358..80cf938 100644
--- a/llvm/test/CodeGen/NVPTX/unreachable.ll
+++ b/llvm/test/CodeGen/NVPTX/unreachable.ll
@@ -21,7 +21,7 @@ target triple = "nvptx-unknown-cuda"
 declare void @throw() #0
 declare void @llvm.trap() #0
 
-define void @kernel_func() {
+define ptx_kernel void @kernel_func() {
 ; NO-TRAP-UNREACHABLE-LABEL: kernel_func(
 ; NO-TRAP-UNREACHABLE:       {
 ; NO-TRAP-UNREACHABLE-EMPTY:
@@ -102,6 +102,3 @@ define void @kernel_func_2() {
 }
 
 attributes #0 = { noreturn }
-
-!nvvm.annotations = !{!1}
-!1 = !{ptr @kernel_func, !"kernel", i32 1}
diff --git a/llvm/test/CodeGen/NVPTX/variadics-backend.ll b/llvm/test/CodeGen/NVPTX/variadics-backend.ll
index cb54812..f7ed690 100644
--- a/llvm/test/CodeGen/NVPTX/variadics-backend.ll
+++ b/llvm/test/CodeGen/NVPTX/variadics-backend.ll
@@ -153,7 +153,7 @@ define dso_local i32 @variadics2(i32 noundef %first, ...) {
 ; CHECK-PTX-NEXT:    .reg .b64 %SPL;
 ; CHECK-PTX-NEXT:    .reg .b16 %rs<6>;
 ; CHECK-PTX-NEXT:    .reg .b32 %r<7>;
-; CHECK-PTX-NEXT:    .reg .b64 %rd<11>;
+; CHECK-PTX-NEXT:    .reg .b64 %rd<7>;
 ; CHECK-PTX-EMPTY:
 ; CHECK-PTX-NEXT:  // %bb.0: // %entry
 ; CHECK-PTX-NEXT:    mov.u64 %SPL, __local_depot2;
@@ -163,24 +163,20 @@ define dso_local i32 @variadics2(i32 noundef %first, ...) {
 ; CHECK-PTX-NEXT:    add.s64 %rd2, %rd1, 7;
 ; CHECK-PTX-NEXT:    and.b64 %rd3, %rd2, -8;
 ; CHECK-PTX-NEXT:    ld.u32 %r2, [%rd3];
-; CHECK-PTX-NEXT:    or.b64 %rd4, %rd3, 4;
-; CHECK-PTX-NEXT:    ld.s8 %r3, [%rd4];
-; CHECK-PTX-NEXT:    or.b64 %rd5, %rd3, 5;
-; CHECK-PTX-NEXT:    or.b64 %rd6, %rd3, 7;
-; CHECK-PTX-NEXT:    ld.u8 %rs1, [%rd6];
+; CHECK-PTX-NEXT:    ld.s8 %r3, [%rd3+4];
+; CHECK-PTX-NEXT:    ld.u8 %rs1, [%rd3+7];
 ; CHECK-PTX-NEXT:    st.u8 [%SP+2], %rs1;
-; CHECK-PTX-NEXT:    ld.u8 %rs2, [%rd5];
-; CHECK-PTX-NEXT:    or.b64 %rd7, %rd3, 6;
-; CHECK-PTX-NEXT:    ld.u8 %rs3, [%rd7];
+; CHECK-PTX-NEXT:    ld.u8 %rs2, [%rd3+5];
+; CHECK-PTX-NEXT:    ld.u8 %rs3, [%rd3+6];
 ; CHECK-PTX-NEXT:    shl.b16 %rs4, %rs3, 8;
 ; CHECK-PTX-NEXT:    or.b16 %rs5, %rs4, %rs2;
 ; CHECK-PTX-NEXT:    st.u16 [%SP], %rs5;
-; CHECK-PTX-NEXT:    ld.u64 %rd8, [%rd3+8];
+; CHECK-PTX-NEXT:    ld.u64 %rd4, [%rd3+8];
 ; CHECK-PTX-NEXT:    add.s32 %r4, %r1, %r2;
 ; CHECK-PTX-NEXT:    add.s32 %r5, %r4, %r3;
-; CHECK-PTX-NEXT:    cvt.u64.u32 %rd9, %r5;
-; CHECK-PTX-NEXT:    add.s64 %rd10, %rd9, %rd8;
-; CHECK-PTX-NEXT:    cvt.u32.u64 %r6, %rd10;
+; CHECK-PTX-NEXT:    cvt.u64.u32 %rd5, %r5;
+; CHECK-PTX-NEXT:    add.s64 %rd6, %rd5, %rd4;
+; CHECK-PTX-NEXT:    cvt.u32.u64 %r6, %rd6;
 ; CHECK-PTX-NEXT:    st.param.b32 [func_retval0], %r6;
 ; CHECK-PTX-NEXT:    ret;
 entry:
@@ -219,7 +215,7 @@ define dso_local i32 @bar() {
 ; CHECK-PTX-NEXT:    .reg .b64 %SPL;
 ; CHECK-PTX-NEXT:    .reg .b16 %rs<10>;
 ; CHECK-PTX-NEXT:    .reg .b32 %r<4>;
-; CHECK-PTX-NEXT:    .reg .b64 %rd<8>;
+; CHECK-PTX-NEXT:    .reg .b64 %rd<7>;
 ; CHECK-PTX-EMPTY:
 ; CHECK-PTX-NEXT:  // %bb.0: // %entry
 ; CHECK-PTX-NEXT:    mov.u64 %SPL, __local_depot3;
@@ -240,17 +236,16 @@ define dso_local i32 @bar() {
 ; CHECK-PTX-NEXT:    st.u16 [%SP], %rs8;
 ; CHECK-PTX-NEXT:    mov.b32 %r1, 1;
 ; CHECK-PTX-NEXT:    st.u32 [%SP+8], %r1;
-; CHECK-PTX-NEXT:    add.u64 %rd5, %SP, 8;
-; CHECK-PTX-NEXT:    or.b64 %rd6, %rd5, 4;
 ; CHECK-PTX-NEXT:    mov.b16 %rs9, 1;
-; CHECK-PTX-NEXT:    st.u8 [%rd6], %rs9;
-; CHECK-PTX-NEXT:    mov.b64 %rd7, 1;
-; CHECK-PTX-NEXT:    st.u64 [%SP+16], %rd7;
+; CHECK-PTX-NEXT:    st.u8 [%SP+12], %rs9;
+; CHECK-PTX-NEXT:    mov.b64 %rd5, 1;
+; CHECK-PTX-NEXT:    st.u64 [%SP+16], %rd5;
+; CHECK-PTX-NEXT:    add.u64 %rd6, %SP, 8;
 ; CHECK-PTX-NEXT:    { // callseq 1, 0
 ; CHECK-PTX-NEXT:    .param .b32 param0;
 ; CHECK-PTX-NEXT:    st.param.b32 [param0], 1;
 ; CHECK-PTX-NEXT:    .param .b64 param1;
-; CHECK-PTX-NEXT:    st.param.b64 [param1], %rd5;
+; CHECK-PTX-NEXT:    st.param.b64 [param1], %rd6;
 ; CHECK-PTX-NEXT:    .param .b32 retval0;
 ; CHECK-PTX-NEXT:    call.uni (retval0),
 ; CHECK-PTX-NEXT:    variadics2,
diff --git a/llvm/test/CodeGen/PowerPC/global-merge-aix-zero-size-struct.ll b/llvm/test/CodeGen/PowerPC/global-merge-aix-zero-size-struct.ll
new file mode 100644
index 0000000..ec6fd7e
--- /dev/null
+++ b/llvm/test/CodeGen/PowerPC/global-merge-aix-zero-size-struct.ll
@@ -0,0 +1,20 @@
+; RUN: llc -verify-machineinstrs -mtriple powerpc64-ibm-aix-xcoff -mcpu=pwr7 < %s | FileCheck %s
+
+; RUN: llc -verify-machineinstrs -mtriple powerpc64-ibm-aix-xcoff -mcpu=pwr7 --filetype=obj -o %t.o < %s
+; RUN: llvm-objdump --syms %t.o | FileCheck %s --check-prefix=OBJ
+
+%struct.anon = type {}
+
+@a = internal constant %struct.anon zeroinitializer, align 1
+@b = internal constant [6 x i8] c"hello\00", align 1
+
+; CHECK:      	.csect L.._MergedGlobals[RO],2
+; CHECK-NEXT: 	.lglobl	a                               # @_MergedGlobals
+; CHECK-NEXT: 	.lglobl	b
+; CHECK-NEXT: a:
+; CHECK-NEXT: b:
+; CHECK-NEXT: 	.string	"hello"
+
+; OBJ:      0000000000000000 l       .text	0000000000000006 L.._MergedGlobals
+; OBJ-NEXT: 0000000000000000 l       .text (csect: L.._MergedGlobals) 	0000000000000000 a
+; OBJ-NEXT: 0000000000000000 l       .text (csect: L.._MergedGlobals) 	0000000000000000 b
diff --git a/llvm/test/CodeGen/PowerPC/lsr-postinc-pos.ll b/llvm/test/CodeGen/PowerPC/lsr-postinc-pos.ll
index 83763f5..2c5ad94 100644
--- a/llvm/test/CodeGen/PowerPC/lsr-postinc-pos.ll
+++ b/llvm/test/CodeGen/PowerPC/lsr-postinc-pos.ll
@@ -1,4 +1,4 @@
-; RUN: llc -verify-machineinstrs < %s -print-lsr-output 2>&1 | FileCheck %s
+; RUN: llc -verify-machineinstrs < %s -stop-after=loop-reduce | FileCheck %s
 
 ; The icmp is a post-inc use, and the increment is in %bb11, but the
 ; scevgep needs to be inserted in %bb so that it is dominated by %t.
diff --git a/llvm/test/CodeGen/PowerPC/memcmp.ll b/llvm/test/CodeGen/PowerPC/memcmp.ll
index 0634534..39f9269 100644
--- a/llvm/test/CodeGen/PowerPC/memcmp.ll
+++ b/llvm/test/CodeGen/PowerPC/memcmp.ll
@@ -6,13 +6,12 @@ define signext i32 @memcmp8(ptr nocapture readonly %buffer1, ptr nocapture reado
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    ldbrx 3, 0, 3
 ; CHECK-NEXT:    ldbrx 4, 0, 4
-; CHECK-NEXT:    subc 5, 4, 3
-; CHECK-NEXT:    subfe 5, 4, 4
-; CHECK-NEXT:    subc 4, 3, 4
-; CHECK-NEXT:    subfe 3, 3, 3
-; CHECK-NEXT:    neg 5, 5
+; CHECK-NEXT:    cmpld 3, 4
+; CHECK-NEXT:    subc 3, 4, 3
+; CHECK-NEXT:    subfe 3, 4, 4
+; CHECK-NEXT:    li 4, -1
 ; CHECK-NEXT:    neg 3, 3
-; CHECK-NEXT:    sub 3, 5, 3
+; CHECK-NEXT:    isellt 3, 4, 3
 ; CHECK-NEXT:    extsw 3, 3
 ; CHECK-NEXT:    blr
   %call = tail call signext i32 @memcmp(ptr %buffer1, ptr %buffer2, i64 8)
@@ -24,12 +23,11 @@ define signext i32 @memcmp4(ptr nocapture readonly %buffer1, ptr nocapture reado
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    lwbrx 3, 0, 3
 ; CHECK-NEXT:    lwbrx 4, 0, 4
+; CHECK-NEXT:    cmplw 3, 4
 ; CHECK-NEXT:    sub 5, 4, 3
-; CHECK-NEXT:    sub 3, 3, 4
+; CHECK-NEXT:    li 3, -1
 ; CHECK-NEXT:    rldicl 5, 5, 1, 63
-; CHECK-NEXT:    rldicl 3, 3, 1, 63
-; CHECK-NEXT:    sub 3, 5, 3
-; CHECK-NEXT:    extsw 3, 3
+; CHECK-NEXT:    isellt 3, 3, 5
 ; CHECK-NEXT:    blr
   %call = tail call signext i32 @memcmp(ptr %buffer1, ptr %buffer2, i64 4)
   ret i32 %call
diff --git a/llvm/test/CodeGen/PowerPC/memcmpIR.ll b/llvm/test/CodeGen/PowerPC/memcmpIR.ll
index 0a8bec7..b57d2b5 100644
--- a/llvm/test/CodeGen/PowerPC/memcmpIR.ll
+++ b/llvm/test/CodeGen/PowerPC/memcmpIR.ll
@@ -59,22 +59,14 @@ define signext i32 @test2(ptr nocapture readonly %buffer1, ptr nocapture readonl
   ; CHECK-NEXT: [[LOAD2:%[0-9]+]] = load i32, ptr
   ; CHECK-NEXT: [[BSWAP1:%[0-9]+]] = call i32 @llvm.bswap.i32(i32 [[LOAD1]])
   ; CHECK-NEXT: [[BSWAP2:%[0-9]+]] = call i32 @llvm.bswap.i32(i32 [[LOAD2]])
-  ; CHECK-NEXT: [[CMP1:%[0-9]+]] = icmp ugt i32 [[BSWAP1]], [[BSWAP2]]
-  ; CHECK-NEXT: [[CMP2:%[0-9]+]] = icmp ult i32 [[BSWAP1]], [[BSWAP2]]
-  ; CHECK-NEXT: [[Z1:%[0-9]+]] = zext i1 [[CMP1]] to i32
-  ; CHECK-NEXT: [[Z2:%[0-9]+]] = zext i1 [[CMP2]] to i32
-  ; CHECK-NEXT: [[SUB:%[0-9]+]] = sub i32 [[Z1]], [[Z2]]
-  ; CHECK-NEXT: ret i32 [[SUB]]
+  ; CHECK-NEXT: [[UCMP:%[0-9]+]] = call i32 @llvm.ucmp.i32.i32(i32 [[BSWAP1]], i32 [[BSWAP2]])
+  ; CHECK-NEXT: ret i32 [[UCMP]]
 
   ; CHECK-BE-LABEL: @test2(
   ; CHECK-BE: [[LOAD1:%[0-9]+]] = load i32, ptr
   ; CHECK-BE-NEXT: [[LOAD2:%[0-9]+]] = load i32, ptr
-  ; CHECK-BE-NEXT: [[CMP1:%[0-9]+]] = icmp ugt i32 [[LOAD1]], [[LOAD2]]
-  ; CHECK-BE-NEXT: [[CMP2:%[0-9]+]] = icmp ult i32 [[LOAD1]], [[LOAD2]]
-  ; CHECK-BE-NEXT: [[Z1:%[0-9]+]] = zext i1 [[CMP1]] to i32
-  ; CHECK-BE-NEXT: [[Z2:%[0-9]+]] = zext i1 [[CMP2]] to i32
-  ; CHECK-BE-NEXT: [[SUB:%[0-9]+]] = sub i32 [[Z1]], [[Z2]]
-  ; CHECK-BE-NEXT: ret i32 [[SUB]]
+  ; CHECK-BE-NEXT: [[UCMP:%[0-9]+]] = call i32 @llvm.ucmp.i32.i32(i32 [[LOAD1]], i32 [[LOAD2]])
+  ; CHECK-BE-NEXT: ret i32 [[UCMP]]
 
 entry:
   %call = tail call signext i32 @memcmp(ptr %buffer1, ptr %buffer2, i64 4)
diff --git a/llvm/test/CodeGen/PowerPC/ppc32-pic-large.ll b/llvm/test/CodeGen/PowerPC/ppc32-pic-large.ll
index 025a5ad..2f0b929 100644
--- a/llvm/test/CodeGen/PowerPC/ppc32-pic-large.ll
+++ b/llvm/test/CodeGen/PowerPC/ppc32-pic-large.ll
@@ -13,6 +13,8 @@ $bar1 = comdat any
 @bar2 = global i32 0, align 4, comdat($bar1)
 
 declare i32 @call_foo(i32, ...)
+declare i32 @call_strictfp() strictfp
+declare void @llvm.memset.p0.i64(ptr nocapture writeonly, i8, i64, i1 immarg)
 
 define i32 @foo() {
 entry:
@@ -21,6 +23,23 @@ entry:
   ret i32 %0
 }
 
+define i32 @foo1() strictfp {
+entry:
+  %call = call i32 (i32, ...) @call_foo(i32 0)
+  ret i32 %call
+}
+
+define i32 @foo1_strictfp() strictfp {
+entry:
+  %call = call i32 () @call_strictfp()
+  ret i32 %call
+}
+
+define void @foo2(ptr %a) {
+  call void @llvm.memset.p0.i64(ptr align 1 %a, i8 1, i64 1000, i1 false)
+  ret void
+}
+
 define i32 @load() {
 entry:
   %0 = load i32, ptr @bar1
@@ -49,6 +68,31 @@ entry:
 ; LARGE-SECUREPLT:   addi 30, 30, .LTOC-.L0$pb@l
 ; LARGE-SECUREPLT:   bl call_foo@PLT+32768
 
+; LARGE-SECUREPLT-LABEL: foo1:
+; LARGE-SECUREPLT:       .L1$pb:
+; LARGE-SECUREPLT-NEXT:    crxor 6, 6, 6
+; LARGE-SECUREPLT-NEXT:    mflr 30
+; LARGE-SECUREPLT-NEXT:    addis 30, 30, .LTOC-.L1$pb@ha
+; LARGE-SECUREPLT-NEXT:    addi 30, 30, .LTOC-.L1$pb@l
+; LARGE-SECUREPLT-NEXT:    li 3, 0
+; LARGE-SECUREPLT-NEXT:    bl call_foo@PLT+32768
+
+; LARGE-SECUREPLT-LABEL: foo1_strictfp:
+; LARGE-SECUREPLT:       .L2$pb:
+; LARGE-SECUREPLT-NEXT:    mflr 30
+; LARGE-SECUREPLT-NEXT:    addis 30, 30, .LTOC-.L2$pb@ha
+; LARGE-SECUREPLT-NEXT:    addi 30, 30, .LTOC-.L2$pb@l
+; LARGE-SECUREPLT-NEXT:    bl call_strictfp@PLT+32768
+
+; LARGE-SECUREPLT-LABEL: foo2:
+; LARGE-SECUREPLT:       .L3$pb:
+; LARGE-SECUREPLT:         mflr 30
+; LARGE-SECUREPLT-NEXT:    addis 30, 30, .LTOC-.L3$pb@ha
+; LARGE-SECUREPLT-NEXT:    addi 30, 30, .LTOC-.L3$pb@l
+; LARGE-SECUREPLT:         bl memset@PLT+32768
+
+; LARGE-SECUREPLT-LABEEL: load:
+
 ; LARGE:      .section .bss.bar1,"awG",@nobits,bar1,comdat
 ; LARGE:      bar1:
 ; LARGE:      .section .bss.bar2,"awG",@nobits,bar1,comdat
diff --git a/llvm/test/CodeGen/PowerPC/scalar_vector_test_5.ll b/llvm/test/CodeGen/PowerPC/scalar_vector_test_5.ll
index b6799c8..f62f70c 100644
--- a/llvm/test/CodeGen/PowerPC/scalar_vector_test_5.ll
+++ b/llvm/test/CodeGen/PowerPC/scalar_vector_test_5.ll
@@ -11,24 +11,35 @@
 define i8 @scalar_to_vector_half(ptr nocapture readonly %ad) {
 ; P9LE-LABEL: scalar_to_vector_half:
 ; P9LE:       # %bb.0: # %entry
-; P9LE-NEXT:    lhz r3, 0(r3)
+; P9LE-NEXT:    lxsihzx v2, 0, r3
+; P9LE-NEXT:    li r3, 0
+; P9LE-NEXT:    vsplth v2, v2, 3
+; P9LE-NEXT:    vextubrx r3, r3, v2
 ; P9LE-NEXT:    blr
 ;
 ; P9BE-LABEL: scalar_to_vector_half:
 ; P9BE:       # %bb.0: # %entry
-; P9BE-NEXT:    lhz r3, 0(r3)
-; P9BE-NEXT:    srwi r3, r3, 24
+; P9BE-NEXT:    lxsihzx v2, 0, r3
+; P9BE-NEXT:    li r3, 0
+; P9BE-NEXT:    vsplth v2, v2, 3
+; P9BE-NEXT:    vextublx r3, r3, v2
 ; P9BE-NEXT:    blr
 ;
 ; P8LE-LABEL: scalar_to_vector_half:
 ; P8LE:       # %bb.0: # %entry
 ; P8LE-NEXT:    lhz r3, 0(r3)
+; P8LE-NEXT:    mtfprd f0, r3
+; P8LE-NEXT:    mffprd r3, f0
+; P8LE-NEXT:    clrldi r3, r3, 56
 ; P8LE-NEXT:    blr
 ;
 ; P8BE-LABEL: scalar_to_vector_half:
 ; P8BE:       # %bb.0: # %entry
 ; P8BE-NEXT:    lhz r3, 0(r3)
-; P8BE-NEXT:    srwi r3, r3, 24
+; P8BE-NEXT:    sldi r3, r3, 48
+; P8BE-NEXT:    mtfprd f0, r3
+; P8BE-NEXT:    mffprd r3, f0
+; P8BE-NEXT:    rldicl r3, r3, 8, 56
 ; P8BE-NEXT:    blr
 entry:
     %0 = load <2 x i8>, ptr %ad, align 1
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/add-imm.ll b/llvm/test/CodeGen/RISCV/GlobalISel/add-imm.ll
index ff56ab1..0fd23a7 100644
--- a/llvm/test/CodeGen/RISCV/GlobalISel/add-imm.ll
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/add-imm.ll
@@ -14,7 +14,7 @@ define i32 @add_positive_low_bound_reject(i32 %a) nounwind {
 ;
 ; RV64I-LABEL: add_positive_low_bound_reject:
 ; RV64I:       # %bb.0:
-; RV64I-NEXT:    addi a0, a0, 2047
+; RV64I-NEXT:    addiw a0, a0, 2047
 ; RV64I-NEXT:    ret
   %1 = add i32 %a, 2047
   ret i32 %1
@@ -30,7 +30,7 @@ define i32 @add_positive_low_bound_accept(i32 %a) nounwind {
 ; RV64I-LABEL: add_positive_low_bound_accept:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    addi a0, a0, 2047
-; RV64I-NEXT:    addi a0, a0, 1
+; RV64I-NEXT:    addiw a0, a0, 1
 ; RV64I-NEXT:    ret
   %1 = add i32 %a, 2048
   ret i32 %1
@@ -46,7 +46,7 @@ define i32 @add_positive_high_bound_accept(i32 %a) nounwind {
 ; RV64I-LABEL: add_positive_high_bound_accept:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    addi a0, a0, 2047
-; RV64I-NEXT:    addi a0, a0, 2047
+; RV64I-NEXT:    addiw a0, a0, 2047
 ; RV64I-NEXT:    ret
   %1 = add i32 %a, 4094
   ret i32 %1
@@ -63,8 +63,8 @@ define i32 @add_positive_high_bound_reject(i32 %a) nounwind {
 ; RV64I-LABEL: add_positive_high_bound_reject:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    lui a1, 1
-; RV64I-NEXT:    addiw a1, a1, -1
-; RV64I-NEXT:    add a0, a0, a1
+; RV64I-NEXT:    addi a1, a1, -1
+; RV64I-NEXT:    addw a0, a0, a1
 ; RV64I-NEXT:    ret
   %1 = add i32 %a, 4095
   ret i32 %1
@@ -78,7 +78,7 @@ define i32 @add_negative_high_bound_reject(i32 %a) nounwind {
 ;
 ; RV64I-LABEL: add_negative_high_bound_reject:
 ; RV64I:       # %bb.0:
-; RV64I-NEXT:    addi a0, a0, -2048
+; RV64I-NEXT:    addiw a0, a0, -2048
 ; RV64I-NEXT:    ret
   %1 = add i32 %a, -2048
   ret i32 %1
@@ -94,7 +94,7 @@ define i32 @add_negative_high_bound_accept(i32 %a) nounwind {
 ; RV64I-LABEL: add_negative_high_bound_accept:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    addi a0, a0, -2048
-; RV64I-NEXT:    addi a0, a0, -1
+; RV64I-NEXT:    addiw a0, a0, -1
 ; RV64I-NEXT:    ret
   %1 = add i32 %a, -2049
   ret i32 %1
@@ -110,7 +110,7 @@ define i32 @add_negative_low_bound_accept(i32 %a) nounwind {
 ; RV64I-LABEL: add_negative_low_bound_accept:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    addi a0, a0, -2048
-; RV64I-NEXT:    addi a0, a0, -2048
+; RV64I-NEXT:    addiw a0, a0, -2048
 ; RV64I-NEXT:    ret
   %1 = add i32 %a, -4096
   ret i32 %1
@@ -127,8 +127,8 @@ define i32 @add_negative_low_bound_reject(i32 %a) nounwind {
 ; RV64I-LABEL: add_negative_low_bound_reject:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    lui a1, 1048575
-; RV64I-NEXT:    addiw a1, a1, -1
-; RV64I-NEXT:    add a0, a0, a1
+; RV64I-NEXT:    addi a1, a1, -1
+; RV64I-NEXT:    addw a0, a0, a1
 ; RV64I-NEXT:    ret
   %1 = add i32 %a, -4097
   ret i32 %1
@@ -144,7 +144,7 @@ define i32 @add32_accept(i32 %a) nounwind {
 ; RV64I-LABEL: add32_accept:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    addi a0, a0, 2047
-; RV64I-NEXT:    addi a0, a0, 952
+; RV64I-NEXT:    addiw a0, a0, 952
 ; RV64I-NEXT:    ret
   %1 = add i32 %a, 2999
   ret i32 %1
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/alu-roundtrip.ll b/llvm/test/CodeGen/RISCV/GlobalISel/alu-roundtrip.ll
index ee41499..f1c0fcc 100644
--- a/llvm/test/CodeGen/RISCV/GlobalISel/alu-roundtrip.ll
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/alu-roundtrip.ll
@@ -37,7 +37,7 @@ define i32 @add_i8_signext_i32(i8 %a, i8 %b) {
 ; RV64IM-NEXT:    slli a1, a1, 56
 ; RV64IM-NEXT:    srai a0, a0, 56
 ; RV64IM-NEXT:    srai a1, a1, 56
-; RV64IM-NEXT:    add a0, a0, a1
+; RV64IM-NEXT:    addw a0, a0, a1
 ; RV64IM-NEXT:    ret
 entry:
   %0 = sext i8 %a to i32
@@ -58,7 +58,7 @@ define i32 @add_i8_zeroext_i32(i8 %a, i8 %b) {
 ; RV64IM:       # %bb.0: # %entry
 ; RV64IM-NEXT:    andi a0, a0, 255
 ; RV64IM-NEXT:    andi a1, a1, 255
-; RV64IM-NEXT:    add a0, a0, a1
+; RV64IM-NEXT:    addw a0, a0, a1
 ; RV64IM-NEXT:    ret
 entry:
   %0 = zext i8 %a to i32
@@ -78,7 +78,7 @@ define i32 @add_i32(i32 %a, i32 %b) {
 ;
 ; RV64IM-LABEL: add_i32:
 ; RV64IM:       # %bb.0: # %entry
-; RV64IM-NEXT:    add a0, a0, a1
+; RV64IM-NEXT:    addw a0, a0, a1
 ; RV64IM-NEXT:    ret
 entry:
   %0 = add i32 %a, %b
@@ -93,7 +93,7 @@ define i32 @addi_i32(i32 %a) {
 ;
 ; RV64IM-LABEL: addi_i32:
 ; RV64IM:       # %bb.0: # %entry
-; RV64IM-NEXT:    addi a0, a0, 1234
+; RV64IM-NEXT:    addiw a0, a0, 1234
 ; RV64IM-NEXT:    ret
 entry:
   %0 = add i32 %a, 1234
@@ -108,7 +108,7 @@ define i32 @sub_i32(i32 %a, i32 %b) {
 ;
 ; RV64IM-LABEL: sub_i32:
 ; RV64IM:       # %bb.0: # %entry
-; RV64IM-NEXT:    sub a0, a0, a1
+; RV64IM-NEXT:    subw a0, a0, a1
 ; RV64IM-NEXT:    ret
 entry:
   %0 = sub i32 %a, %b
@@ -123,7 +123,7 @@ define i32 @subi_i32(i32 %a) {
 ;
 ; RV64IM-LABEL: subi_i32:
 ; RV64IM:       # %bb.0: # %entry
-; RV64IM-NEXT:    addi a0, a0, -1234
+; RV64IM-NEXT:    addiw a0, a0, -1234
 ; RV64IM-NEXT:    ret
 entry:
   %0 = sub i32 %a, 1234
@@ -138,7 +138,7 @@ define i32 @neg_i32(i32 %a) {
 ;
 ; RV64IM-LABEL: neg_i32:
 ; RV64IM:       # %bb.0: # %entry
-; RV64IM-NEXT:    neg a0, a0
+; RV64IM-NEXT:    negw a0, a0
 ; RV64IM-NEXT:    ret
 entry:
   %0 = sub i32 0, %a
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/combine-neg-abs.ll b/llvm/test/CodeGen/RISCV/GlobalISel/combine-neg-abs.ll
new file mode 100644
index 0000000..3a55189
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/combine-neg-abs.ll
@@ -0,0 +1,453 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=riscv32 -global-isel -verify-machineinstrs < %s \
+; RUN:   | FileCheck %s --check-prefixes=RV32I
+; RUN: llc -mtriple=riscv32 -global-isel -mattr=+zbb -verify-machineinstrs < %s \
+; RUN:   | FileCheck %s --check-prefixes=RV32ZBB
+; RUN: llc -mtriple=riscv64 -global-isel -verify-machineinstrs < %s \
+; RUN:   | FileCheck %s --check-prefixes=RV64I
+; RUN: llc -mtriple=riscv64 -global-isel -mattr=+zbb -verify-machineinstrs < %s \
+; RUN:   | FileCheck %s --check-prefixes=RV64ZBB
+
+define i32 @expanded_neg_abs32(i32 %x) {
+; RV32I-LABEL: expanded_neg_abs32:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    neg a1, a0
+; RV32I-NEXT:    blt a0, a1, .LBB0_2
+; RV32I-NEXT:  # %bb.1:
+; RV32I-NEXT:    mv a1, a0
+; RV32I-NEXT:  .LBB0_2:
+; RV32I-NEXT:    neg a0, a1
+; RV32I-NEXT:    ret
+;
+; RV32ZBB-LABEL: expanded_neg_abs32:
+; RV32ZBB:       # %bb.0:
+; RV32ZBB-NEXT:    neg a1, a0
+; RV32ZBB-NEXT:    min a0, a0, a1
+; RV32ZBB-NEXT:    ret
+;
+; RV64I-LABEL: expanded_neg_abs32:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    negw a1, a0
+; RV64I-NEXT:    sext.w a2, a0
+; RV64I-NEXT:    blt a2, a1, .LBB0_2
+; RV64I-NEXT:  # %bb.1:
+; RV64I-NEXT:    mv a1, a0
+; RV64I-NEXT:  .LBB0_2:
+; RV64I-NEXT:    negw a0, a1
+; RV64I-NEXT:    ret
+;
+; RV64ZBB-LABEL: expanded_neg_abs32:
+; RV64ZBB:       # %bb.0:
+; RV64ZBB-NEXT:    negw a1, a0
+; RV64ZBB-NEXT:    sext.w a0, a0
+; RV64ZBB-NEXT:    max a0, a1, a0
+; RV64ZBB-NEXT:    negw a0, a0
+; RV64ZBB-NEXT:    ret
+  %n = sub i32 0, %x
+  %t = call i32 @llvm.smax.i32(i32 %n, i32 %x)
+  %r = sub i32 0, %t
+  ret i32 %r
+}
+
+define i32 @expanded_neg_abs32_unsigned(i32 %x) {
+; RV32I-LABEL: expanded_neg_abs32_unsigned:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    neg a1, a0
+; RV32I-NEXT:    bltu a0, a1, .LBB1_2
+; RV32I-NEXT:  # %bb.1:
+; RV32I-NEXT:    mv a1, a0
+; RV32I-NEXT:  .LBB1_2:
+; RV32I-NEXT:    neg a0, a1
+; RV32I-NEXT:    ret
+;
+; RV32ZBB-LABEL: expanded_neg_abs32_unsigned:
+; RV32ZBB:       # %bb.0:
+; RV32ZBB-NEXT:    neg a1, a0
+; RV32ZBB-NEXT:    minu a0, a0, a1
+; RV32ZBB-NEXT:    ret
+;
+; RV64I-LABEL: expanded_neg_abs32_unsigned:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    negw a1, a0
+; RV64I-NEXT:    sext.w a2, a0
+; RV64I-NEXT:    bltu a2, a1, .LBB1_2
+; RV64I-NEXT:  # %bb.1:
+; RV64I-NEXT:    mv a1, a0
+; RV64I-NEXT:  .LBB1_2:
+; RV64I-NEXT:    negw a0, a1
+; RV64I-NEXT:    ret
+;
+; RV64ZBB-LABEL: expanded_neg_abs32_unsigned:
+; RV64ZBB:       # %bb.0:
+; RV64ZBB-NEXT:    negw a1, a0
+; RV64ZBB-NEXT:    sext.w a0, a0
+; RV64ZBB-NEXT:    maxu a0, a1, a0
+; RV64ZBB-NEXT:    negw a0, a0
+; RV64ZBB-NEXT:    ret
+  %n = sub i32 0, %x
+  %t = call i32 @llvm.umax.i32(i32 %n, i32 %x)
+  %r = sub i32 0, %t
+  ret i32 %r
+}
+
+define i64 @expanded_neg_abs64(i64 %x) {
+; RV32I-LABEL: expanded_neg_abs64:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    snez a2, a0
+; RV32I-NEXT:    neg a3, a1
+; RV32I-NEXT:    sub a2, a3, a2
+; RV32I-NEXT:    neg a3, a0
+; RV32I-NEXT:    beq a2, a1, .LBB2_2
+; RV32I-NEXT:  # %bb.1:
+; RV32I-NEXT:    slt a4, a1, a2
+; RV32I-NEXT:    beqz a4, .LBB2_3
+; RV32I-NEXT:    j .LBB2_4
+; RV32I-NEXT:  .LBB2_2:
+; RV32I-NEXT:    sltu a4, a0, a3
+; RV32I-NEXT:    bnez a4, .LBB2_4
+; RV32I-NEXT:  .LBB2_3:
+; RV32I-NEXT:    mv a3, a0
+; RV32I-NEXT:    mv a2, a1
+; RV32I-NEXT:  .LBB2_4:
+; RV32I-NEXT:    neg a0, a3
+; RV32I-NEXT:    snez a1, a3
+; RV32I-NEXT:    neg a2, a2
+; RV32I-NEXT:    sub a1, a2, a1
+; RV32I-NEXT:    ret
+;
+; RV32ZBB-LABEL: expanded_neg_abs64:
+; RV32ZBB:       # %bb.0:
+; RV32ZBB-NEXT:    snez a2, a0
+; RV32ZBB-NEXT:    neg a3, a1
+; RV32ZBB-NEXT:    sub a2, a3, a2
+; RV32ZBB-NEXT:    neg a3, a0
+; RV32ZBB-NEXT:    beq a2, a1, .LBB2_2
+; RV32ZBB-NEXT:  # %bb.1:
+; RV32ZBB-NEXT:    slt a4, a1, a2
+; RV32ZBB-NEXT:    beqz a4, .LBB2_3
+; RV32ZBB-NEXT:    j .LBB2_4
+; RV32ZBB-NEXT:  .LBB2_2:
+; RV32ZBB-NEXT:    sltu a4, a0, a3
+; RV32ZBB-NEXT:    bnez a4, .LBB2_4
+; RV32ZBB-NEXT:  .LBB2_3:
+; RV32ZBB-NEXT:    mv a3, a0
+; RV32ZBB-NEXT:    mv a2, a1
+; RV32ZBB-NEXT:  .LBB2_4:
+; RV32ZBB-NEXT:    neg a0, a3
+; RV32ZBB-NEXT:    snez a1, a3
+; RV32ZBB-NEXT:    neg a2, a2
+; RV32ZBB-NEXT:    sub a1, a2, a1
+; RV32ZBB-NEXT:    ret
+;
+; RV64I-LABEL: expanded_neg_abs64:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    neg a1, a0
+; RV64I-NEXT:    blt a0, a1, .LBB2_2
+; RV64I-NEXT:  # %bb.1:
+; RV64I-NEXT:    mv a1, a0
+; RV64I-NEXT:  .LBB2_2:
+; RV64I-NEXT:    neg a0, a1
+; RV64I-NEXT:    ret
+;
+; RV64ZBB-LABEL: expanded_neg_abs64:
+; RV64ZBB:       # %bb.0:
+; RV64ZBB-NEXT:    neg a1, a0
+; RV64ZBB-NEXT:    min a0, a0, a1
+; RV64ZBB-NEXT:    ret
+  %n = sub i64 0, %x
+  %t = call i64 @llvm.smax.i64(i64 %n, i64 %x)
+  %r = sub i64 0, %t
+  ret i64 %r
+}
+
+define i64 @expanded_neg_abs64_unsigned(i64 %x) {
+; RV32I-LABEL: expanded_neg_abs64_unsigned:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    snez a2, a0
+; RV32I-NEXT:    neg a3, a1
+; RV32I-NEXT:    sub a2, a3, a2
+; RV32I-NEXT:    neg a3, a0
+; RV32I-NEXT:    beq a2, a1, .LBB3_2
+; RV32I-NEXT:  # %bb.1:
+; RV32I-NEXT:    sltu a4, a1, a2
+; RV32I-NEXT:    beqz a4, .LBB3_3
+; RV32I-NEXT:    j .LBB3_4
+; RV32I-NEXT:  .LBB3_2:
+; RV32I-NEXT:    sltu a4, a0, a3
+; RV32I-NEXT:    bnez a4, .LBB3_4
+; RV32I-NEXT:  .LBB3_3:
+; RV32I-NEXT:    mv a3, a0
+; RV32I-NEXT:    mv a2, a1
+; RV32I-NEXT:  .LBB3_4:
+; RV32I-NEXT:    neg a0, a3
+; RV32I-NEXT:    snez a1, a3
+; RV32I-NEXT:    neg a2, a2
+; RV32I-NEXT:    sub a1, a2, a1
+; RV32I-NEXT:    ret
+;
+; RV32ZBB-LABEL: expanded_neg_abs64_unsigned:
+; RV32ZBB:       # %bb.0:
+; RV32ZBB-NEXT:    snez a2, a0
+; RV32ZBB-NEXT:    neg a3, a1
+; RV32ZBB-NEXT:    sub a2, a3, a2
+; RV32ZBB-NEXT:    neg a3, a0
+; RV32ZBB-NEXT:    beq a2, a1, .LBB3_2
+; RV32ZBB-NEXT:  # %bb.1:
+; RV32ZBB-NEXT:    sltu a4, a1, a2
+; RV32ZBB-NEXT:    beqz a4, .LBB3_3
+; RV32ZBB-NEXT:    j .LBB3_4
+; RV32ZBB-NEXT:  .LBB3_2:
+; RV32ZBB-NEXT:    sltu a4, a0, a3
+; RV32ZBB-NEXT:    bnez a4, .LBB3_4
+; RV32ZBB-NEXT:  .LBB3_3:
+; RV32ZBB-NEXT:    mv a3, a0
+; RV32ZBB-NEXT:    mv a2, a1
+; RV32ZBB-NEXT:  .LBB3_4:
+; RV32ZBB-NEXT:    neg a0, a3
+; RV32ZBB-NEXT:    snez a1, a3
+; RV32ZBB-NEXT:    neg a2, a2
+; RV32ZBB-NEXT:    sub a1, a2, a1
+; RV32ZBB-NEXT:    ret
+;
+; RV64I-LABEL: expanded_neg_abs64_unsigned:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    neg a1, a0
+; RV64I-NEXT:    bltu a0, a1, .LBB3_2
+; RV64I-NEXT:  # %bb.1:
+; RV64I-NEXT:    mv a1, a0
+; RV64I-NEXT:  .LBB3_2:
+; RV64I-NEXT:    neg a0, a1
+; RV64I-NEXT:    ret
+;
+; RV64ZBB-LABEL: expanded_neg_abs64_unsigned:
+; RV64ZBB:       # %bb.0:
+; RV64ZBB-NEXT:    neg a1, a0
+; RV64ZBB-NEXT:    minu a0, a0, a1
+; RV64ZBB-NEXT:    ret
+  %n = sub i64 0, %x
+  %t = call i64 @llvm.umax.i64(i64 %n, i64 %x)
+  %r = sub i64 0, %t
+  ret i64 %r
+}
+
+define i32 @expanded_neg_inv_abs32(i32 %x) {
+; RV32I-LABEL: expanded_neg_inv_abs32:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    neg a1, a0
+; RV32I-NEXT:    blt a1, a0, .LBB4_2
+; RV32I-NEXT:  # %bb.1:
+; RV32I-NEXT:    mv a1, a0
+; RV32I-NEXT:  .LBB4_2:
+; RV32I-NEXT:    neg a0, a1
+; RV32I-NEXT:    ret
+;
+; RV32ZBB-LABEL: expanded_neg_inv_abs32:
+; RV32ZBB:       # %bb.0:
+; RV32ZBB-NEXT:    neg a1, a0
+; RV32ZBB-NEXT:    max a0, a0, a1
+; RV32ZBB-NEXT:    ret
+;
+; RV64I-LABEL: expanded_neg_inv_abs32:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    negw a1, a0
+; RV64I-NEXT:    sext.w a2, a0
+; RV64I-NEXT:    blt a1, a2, .LBB4_2
+; RV64I-NEXT:  # %bb.1:
+; RV64I-NEXT:    mv a1, a0
+; RV64I-NEXT:  .LBB4_2:
+; RV64I-NEXT:    negw a0, a1
+; RV64I-NEXT:    ret
+;
+; RV64ZBB-LABEL: expanded_neg_inv_abs32:
+; RV64ZBB:       # %bb.0:
+; RV64ZBB-NEXT:    negw a1, a0
+; RV64ZBB-NEXT:    sext.w a0, a0
+; RV64ZBB-NEXT:    min a0, a1, a0
+; RV64ZBB-NEXT:    negw a0, a0
+; RV64ZBB-NEXT:    ret
+  %n = sub i32 0, %x
+  %t = call i32 @llvm.smin.i32(i32 %n, i32 %x)
+  %r = sub i32 0, %t
+  ret i32 %r
+}
+
+define i32 @expanded_neg_inv_abs32_unsigned(i32 %x) {
+; RV32I-LABEL: expanded_neg_inv_abs32_unsigned:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    neg a1, a0
+; RV32I-NEXT:    bltu a1, a0, .LBB5_2
+; RV32I-NEXT:  # %bb.1:
+; RV32I-NEXT:    mv a1, a0
+; RV32I-NEXT:  .LBB5_2:
+; RV32I-NEXT:    neg a0, a1
+; RV32I-NEXT:    ret
+;
+; RV32ZBB-LABEL: expanded_neg_inv_abs32_unsigned:
+; RV32ZBB:       # %bb.0:
+; RV32ZBB-NEXT:    neg a1, a0
+; RV32ZBB-NEXT:    maxu a0, a0, a1
+; RV32ZBB-NEXT:    ret
+;
+; RV64I-LABEL: expanded_neg_inv_abs32_unsigned:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    negw a1, a0
+; RV64I-NEXT:    sext.w a2, a0
+; RV64I-NEXT:    bltu a1, a2, .LBB5_2
+; RV64I-NEXT:  # %bb.1:
+; RV64I-NEXT:    mv a1, a0
+; RV64I-NEXT:  .LBB5_2:
+; RV64I-NEXT:    negw a0, a1
+; RV64I-NEXT:    ret
+;
+; RV64ZBB-LABEL: expanded_neg_inv_abs32_unsigned:
+; RV64ZBB:       # %bb.0:
+; RV64ZBB-NEXT:    negw a1, a0
+; RV64ZBB-NEXT:    sext.w a0, a0
+; RV64ZBB-NEXT:    minu a0, a1, a0
+; RV64ZBB-NEXT:    negw a0, a0
+; RV64ZBB-NEXT:    ret
+  %n = sub i32 0, %x
+  %t = call i32 @llvm.umin.i32(i32 %n, i32 %x)
+  %r = sub i32 0, %t
+  ret i32 %r
+}
+
+define i64 @expanded_neg_inv_abs64(i64 %x) {
+; RV32I-LABEL: expanded_neg_inv_abs64:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    snez a2, a0
+; RV32I-NEXT:    neg a3, a1
+; RV32I-NEXT:    sub a2, a3, a2
+; RV32I-NEXT:    neg a3, a0
+; RV32I-NEXT:    beq a2, a1, .LBB6_2
+; RV32I-NEXT:  # %bb.1:
+; RV32I-NEXT:    slt a4, a2, a1
+; RV32I-NEXT:    beqz a4, .LBB6_3
+; RV32I-NEXT:    j .LBB6_4
+; RV32I-NEXT:  .LBB6_2:
+; RV32I-NEXT:    sltu a4, a3, a0
+; RV32I-NEXT:    bnez a4, .LBB6_4
+; RV32I-NEXT:  .LBB6_3:
+; RV32I-NEXT:    mv a3, a0
+; RV32I-NEXT:    mv a2, a1
+; RV32I-NEXT:  .LBB6_4:
+; RV32I-NEXT:    neg a0, a3
+; RV32I-NEXT:    snez a1, a3
+; RV32I-NEXT:    neg a2, a2
+; RV32I-NEXT:    sub a1, a2, a1
+; RV32I-NEXT:    ret
+;
+; RV32ZBB-LABEL: expanded_neg_inv_abs64:
+; RV32ZBB:       # %bb.0:
+; RV32ZBB-NEXT:    snez a2, a0
+; RV32ZBB-NEXT:    neg a3, a1
+; RV32ZBB-NEXT:    sub a2, a3, a2
+; RV32ZBB-NEXT:    neg a3, a0
+; RV32ZBB-NEXT:    beq a2, a1, .LBB6_2
+; RV32ZBB-NEXT:  # %bb.1:
+; RV32ZBB-NEXT:    slt a4, a2, a1
+; RV32ZBB-NEXT:    beqz a4, .LBB6_3
+; RV32ZBB-NEXT:    j .LBB6_4
+; RV32ZBB-NEXT:  .LBB6_2:
+; RV32ZBB-NEXT:    sltu a4, a3, a0
+; RV32ZBB-NEXT:    bnez a4, .LBB6_4
+; RV32ZBB-NEXT:  .LBB6_3:
+; RV32ZBB-NEXT:    mv a3, a0
+; RV32ZBB-NEXT:    mv a2, a1
+; RV32ZBB-NEXT:  .LBB6_4:
+; RV32ZBB-NEXT:    neg a0, a3
+; RV32ZBB-NEXT:    snez a1, a3
+; RV32ZBB-NEXT:    neg a2, a2
+; RV32ZBB-NEXT:    sub a1, a2, a1
+; RV32ZBB-NEXT:    ret
+;
+; RV64I-LABEL: expanded_neg_inv_abs64:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    neg a1, a0
+; RV64I-NEXT:    blt a1, a0, .LBB6_2
+; RV64I-NEXT:  # %bb.1:
+; RV64I-NEXT:    mv a1, a0
+; RV64I-NEXT:  .LBB6_2:
+; RV64I-NEXT:    neg a0, a1
+; RV64I-NEXT:    ret
+;
+; RV64ZBB-LABEL: expanded_neg_inv_abs64:
+; RV64ZBB:       # %bb.0:
+; RV64ZBB-NEXT:    neg a1, a0
+; RV64ZBB-NEXT:    max a0, a0, a1
+; RV64ZBB-NEXT:    ret
+  %n = sub i64 0, %x
+  %t = call i64 @llvm.smin.i64(i64 %n, i64 %x)
+  %r = sub i64 0, %t
+  ret i64 %r
+}
+
+define i64 @expanded_neg_inv_abs64_unsigned(i64 %x) {
+; RV32I-LABEL: expanded_neg_inv_abs64_unsigned:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    snez a2, a0
+; RV32I-NEXT:    neg a3, a1
+; RV32I-NEXT:    sub a2, a3, a2
+; RV32I-NEXT:    neg a3, a0
+; RV32I-NEXT:    beq a2, a1, .LBB7_2
+; RV32I-NEXT:  # %bb.1:
+; RV32I-NEXT:    sltu a4, a2, a1
+; RV32I-NEXT:    beqz a4, .LBB7_3
+; RV32I-NEXT:    j .LBB7_4
+; RV32I-NEXT:  .LBB7_2:
+; RV32I-NEXT:    sltu a4, a3, a0
+; RV32I-NEXT:    bnez a4, .LBB7_4
+; RV32I-NEXT:  .LBB7_3:
+; RV32I-NEXT:    mv a3, a0
+; RV32I-NEXT:    mv a2, a1
+; RV32I-NEXT:  .LBB7_4:
+; RV32I-NEXT:    neg a0, a3
+; RV32I-NEXT:    snez a1, a3
+; RV32I-NEXT:    neg a2, a2
+; RV32I-NEXT:    sub a1, a2, a1
+; RV32I-NEXT:    ret
+;
+; RV32ZBB-LABEL: expanded_neg_inv_abs64_unsigned:
+; RV32ZBB:       # %bb.0:
+; RV32ZBB-NEXT:    snez a2, a0
+; RV32ZBB-NEXT:    neg a3, a1
+; RV32ZBB-NEXT:    sub a2, a3, a2
+; RV32ZBB-NEXT:    neg a3, a0
+; RV32ZBB-NEXT:    beq a2, a1, .LBB7_2
+; RV32ZBB-NEXT:  # %bb.1:
+; RV32ZBB-NEXT:    sltu a4, a2, a1
+; RV32ZBB-NEXT:    beqz a4, .LBB7_3
+; RV32ZBB-NEXT:    j .LBB7_4
+; RV32ZBB-NEXT:  .LBB7_2:
+; RV32ZBB-NEXT:    sltu a4, a3, a0
+; RV32ZBB-NEXT:    bnez a4, .LBB7_4
+; RV32ZBB-NEXT:  .LBB7_3:
+; RV32ZBB-NEXT:    mv a3, a0
+; RV32ZBB-NEXT:    mv a2, a1
+; RV32ZBB-NEXT:  .LBB7_4:
+; RV32ZBB-NEXT:    neg a0, a3
+; RV32ZBB-NEXT:    snez a1, a3
+; RV32ZBB-NEXT:    neg a2, a2
+; RV32ZBB-NEXT:    sub a1, a2, a1
+; RV32ZBB-NEXT:    ret
+;
+; RV64I-LABEL: expanded_neg_inv_abs64_unsigned:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    neg a1, a0
+; RV64I-NEXT:    bltu a1, a0, .LBB7_2
+; RV64I-NEXT:  # %bb.1:
+; RV64I-NEXT:    mv a1, a0
+; RV64I-NEXT:  .LBB7_2:
+; RV64I-NEXT:    neg a0, a1
+; RV64I-NEXT:    ret
+;
+; RV64ZBB-LABEL: expanded_neg_inv_abs64_unsigned:
+; RV64ZBB:       # %bb.0:
+; RV64ZBB-NEXT:    neg a1, a0
+; RV64ZBB-NEXT:    maxu a0, a0, a1
+; RV64ZBB-NEXT:    ret
+  %n = sub i64 0, %x
+  %t = call i64 @llvm.umin.i64(i64 %n, i64 %x)
+  %r = sub i64 0, %t
+  ret i64 %r
+}
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/combine.ll b/llvm/test/CodeGen/RISCV/GlobalISel/combine.ll
index 9c7fd68..360e84d 100644
--- a/llvm/test/CodeGen/RISCV/GlobalISel/combine.ll
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/combine.ll
@@ -21,6 +21,7 @@ define i32 @constant_to_rhs(i32 %x) {
 ; RV64-O0-NEXT:    mv a1, a0
 ; RV64-O0-NEXT:    li a0, 1
 ; RV64-O0-NEXT:    add a0, a0, a1
+; RV64-O0-NEXT:    sext.w a0, a0
 ; RV64-O0-NEXT:    ret
 ;
 ; RV32-OPT-LABEL: constant_to_rhs:
@@ -30,7 +31,7 @@ define i32 @constant_to_rhs(i32 %x) {
 ;
 ; RV64-OPT-LABEL: constant_to_rhs:
 ; RV64-OPT:       # %bb.0:
-; RV64-OPT-NEXT:    addi a0, a0, 1
+; RV64-OPT-NEXT:    addiw a0, a0, 1
 ; RV64-OPT-NEXT:    ret
   %a = add i32 1, %x
   ret i32 %a
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/double-zfa.ll b/llvm/test/CodeGen/RISCV/GlobalISel/double-zfa.ll
index 385156b..4878699 100644
--- a/llvm/test/CodeGen/RISCV/GlobalISel/double-zfa.ll
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/double-zfa.ll
@@ -1,9 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-
 ; RUN: llc -mtriple=riscv32 -mattr=+zfa,d -global-isel < %s \
-; RUN: | FileCheck %s
+; RUN: | FileCheck %s --check-prefixes=CHECK,RV32IDZFA
 ; RUN: llc -mtriple=riscv64 -mattr=+zfa,d -global-isel < %s \
-; RUN: | FileCheck %s
+; RUN: | FileCheck %s --check-prefixes=CHECK,RV64DZFA
 
 
 define double @fceil(double %a) {
@@ -86,3 +85,32 @@ define double @fminimum(double %a, double %b) {
   %c = call double @llvm.minimum.f64(double %a, double %b)
   ret double %c
 }
+
+define i64 @fmvh_x_d(double %fa) {
+; RV32IDZFA-LABEL: fmvh_x_d:
+; RV32IDZFA:       # %bb.0:
+; RV32IDZFA-NEXT:    fmv.x.w a0, fa0
+; RV32IDZFA-NEXT:    fmvh.x.d a1, fa0
+; RV32IDZFA-NEXT:    ret
+;
+; RV64DZFA-LABEL: fmvh_x_d:
+; RV64DZFA:       # %bb.0:
+; RV64DZFA-NEXT:    fmv.x.d a0, fa0
+; RV64DZFA-NEXT:    ret
+  %i = bitcast double %fa to i64
+  ret i64 %i
+}
+
+define double @fmvp_d_x(i64 %a) {
+; RV32IDZFA-LABEL: fmvp_d_x:
+; RV32IDZFA:       # %bb.0:
+; RV32IDZFA-NEXT:    fmvp.d.x fa0, a0, a1
+; RV32IDZFA-NEXT:    ret
+;
+; RV64DZFA-LABEL: fmvp_d_x:
+; RV64DZFA:       # %bb.0:
+; RV64DZFA-NEXT:    fmv.d.x fa0, a0
+; RV64DZFA-NEXT:    ret
+  %or = bitcast i64 %a to double
+  ret double %or
+}
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/freeze.ll b/llvm/test/CodeGen/RISCV/GlobalISel/freeze.ll
index 72f0ab1..234f338 100644
--- a/llvm/test/CodeGen/RISCV/GlobalISel/freeze.ll
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/freeze.ll
@@ -96,12 +96,19 @@ define ptr @freeze_ptr(ptr %x) {
 %struct.T = type { i32, i32 }
 
 define i32 @freeze_struct(ptr %p) {
-; CHECK-LABEL: freeze_struct:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    lw a1, 0(a0)
-; CHECK-NEXT:    lw a0, 4(a0)
-; CHECK-NEXT:    add a0, a1, a0
-; CHECK-NEXT:    ret
+; RV32-LABEL: freeze_struct:
+; RV32:       # %bb.0:
+; RV32-NEXT:    lw a1, 0(a0)
+; RV32-NEXT:    lw a0, 4(a0)
+; RV32-NEXT:    add a0, a1, a0
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: freeze_struct:
+; RV64:       # %bb.0:
+; RV64-NEXT:    lw a1, 0(a0)
+; RV64-NEXT:    lw a0, 4(a0)
+; RV64-NEXT:    addw a0, a1, a0
+; RV64-NEXT:    ret
   %s = load %struct.T, ptr %p
   %y1 = freeze %struct.T %s
   %v1 = extractvalue %struct.T %y1, 0
@@ -111,12 +118,19 @@ define i32 @freeze_struct(ptr %p) {
 }
 
 define i32 @freeze_anonstruct(ptr %p) {
-; CHECK-LABEL: freeze_anonstruct:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    lw a1, 0(a0)
-; CHECK-NEXT:    lw a0, 4(a0)
-; CHECK-NEXT:    add a0, a1, a0
-; CHECK-NEXT:    ret
+; RV32-LABEL: freeze_anonstruct:
+; RV32:       # %bb.0:
+; RV32-NEXT:    lw a1, 0(a0)
+; RV32-NEXT:    lw a0, 4(a0)
+; RV32-NEXT:    add a0, a1, a0
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: freeze_anonstruct:
+; RV64:       # %bb.0:
+; RV64-NEXT:    lw a1, 0(a0)
+; RV64-NEXT:    lw a0, 4(a0)
+; RV64-NEXT:    addw a0, a1, a0
+; RV64-NEXT:    ret
   %s = load {i32, i32}, ptr %p
   %y1 = freeze {i32, i32} %s
   %v1 = extractvalue {i32, i32} %y1, 0
@@ -141,7 +155,7 @@ define i32 @freeze_anonstruct2(ptr %p) {
 ; RV64-NEXT:    lw a0, 0(a0)
 ; RV64-NEXT:    slli a1, a1, 48
 ; RV64-NEXT:    srli a1, a1, 48
-; RV64-NEXT:    add a0, a0, a1
+; RV64-NEXT:    addw a0, a0, a1
 ; RV64-NEXT:    ret
   %s = load {i32, i16}, ptr %p
   %y1 = freeze {i32, i16} %s
@@ -168,7 +182,7 @@ define i32 @freeze_anonstruct2_sext(ptr %p) {
 ; RV64-NEXT:    lw a0, 0(a0)
 ; RV64-NEXT:    slli a1, a1, 48
 ; RV64-NEXT:    srai a1, a1, 48
-; RV64-NEXT:    add a0, a0, a1
+; RV64-NEXT:    addw a0, a0, a1
 ; RV64-NEXT:    ret
   %s = load {i32, i16}, ptr %p
   %y1 = freeze {i32, i16} %s
@@ -180,12 +194,19 @@ define i32 @freeze_anonstruct2_sext(ptr %p) {
 }
 
 define i32 @freeze_array(ptr %p) nounwind {
-; CHECK-LABEL: freeze_array:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    lw a1, 0(a0)
-; CHECK-NEXT:    lw a0, 4(a0)
-; CHECK-NEXT:    add a0, a1, a0
-; CHECK-NEXT:    ret
+; RV32-LABEL: freeze_array:
+; RV32:       # %bb.0:
+; RV32-NEXT:    lw a1, 0(a0)
+; RV32-NEXT:    lw a0, 4(a0)
+; RV32-NEXT:    add a0, a1, a0
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: freeze_array:
+; RV64:       # %bb.0:
+; RV64-NEXT:    lw a1, 0(a0)
+; RV64-NEXT:    lw a0, 4(a0)
+; RV64-NEXT:    addw a0, a1, a0
+; RV64-NEXT:    ret
   %s = load [2 x i32], ptr %p
   %y1 = freeze [2 x i32] %s
   %v1 = extractvalue [2 x i32] %y1, 0
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/iabs.ll b/llvm/test/CodeGen/RISCV/GlobalISel/iabs.ll
index 1156edf..31a78d4 100644
--- a/llvm/test/CodeGen/RISCV/GlobalISel/iabs.ll
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/iabs.ll
@@ -98,7 +98,7 @@ define i32 @abs32(i32 %x) {
 ; RV64I-LABEL: abs32:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    sraiw a1, a0, 31
-; RV64I-NEXT:    add a0, a0, a1
+; RV64I-NEXT:    addw a0, a0, a1
 ; RV64I-NEXT:    xor a0, a0, a1
 ; RV64I-NEXT:    ret
 ;
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer-info-validation.mir b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer-info-validation.mir
index a27e2b8..dbc1384 100644
--- a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer-info-validation.mir
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer-info-validation.mir
@@ -23,7 +23,7 @@
 # DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected
 #
 # DEBUG-NEXT: G_SUB (opcode [[SUB_OPC:[0-9]+]]): 1 type index, 0 imm indices
-# DEBUG-NEXT: .. opcode [[SUB_OPC]] is aliased to [[ADD_OPC]]
+# DEBUG-NEXT: .. opcode {{[0-9]+}} is aliased to {{[0-9]+}}
 # DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected
 # DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected
 #
@@ -59,7 +59,6 @@
 # DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected
 #
 # DEBUG-NEXT: G_AND (opcode {{[0-9]+}}): 1 type index, 0 imm indices
-# DEBUG-NEXT: .. opcode {{[0-9]+}} is aliased to {{[0-9]+}}
 # DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected
 # DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected
 #
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-abs-rv64.mir b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-abs-rv64.mir
index 22ce8a0..78a2227b 100644
--- a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-abs-rv64.mir
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-abs-rv64.mir
@@ -86,9 +86,10 @@ body:             |
     ; RV64I-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 31
     ; RV64I-NEXT: [[ASHR:%[0-9]+]]:_(s64) = G_ASHR [[ASSERT_SEXT]], [[C]](s64)
     ; RV64I-NEXT: [[ADD:%[0-9]+]]:_(s64) = G_ADD [[ASSERT_SEXT]], [[ASHR]]
-    ; RV64I-NEXT: [[XOR:%[0-9]+]]:_(s64) = G_XOR [[ADD]], [[ASHR]]
-    ; RV64I-NEXT: [[SEXT_INREG:%[0-9]+]]:_(s64) = G_SEXT_INREG [[XOR]], 32
-    ; RV64I-NEXT: $x10 = COPY [[SEXT_INREG]](s64)
+    ; RV64I-NEXT: [[SEXT_INREG:%[0-9]+]]:_(s64) = G_SEXT_INREG [[ADD]], 32
+    ; RV64I-NEXT: [[XOR:%[0-9]+]]:_(s64) = G_XOR [[SEXT_INREG]], [[ASHR]]
+    ; RV64I-NEXT: [[SEXT_INREG1:%[0-9]+]]:_(s64) = G_SEXT_INREG [[XOR]], 32
+    ; RV64I-NEXT: $x10 = COPY [[SEXT_INREG1]](s64)
     ; RV64I-NEXT: PseudoRET implicit $x10
     ;
     ; RV64ZBB-LABEL: name: abs_i32
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-add-rv64.mir b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-add-rv64.mir
index 48b65a1..8f2b9f3 100644
--- a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-add-rv64.mir
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-add-rv64.mir
@@ -69,7 +69,8 @@ body:             |
     ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $x10
     ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x11
     ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(s64) = G_ADD [[COPY]], [[COPY1]]
-    ; CHECK-NEXT: $x10 = COPY [[ADD]](s64)
+    ; CHECK-NEXT: [[SEXT_INREG:%[0-9]+]]:_(s64) = G_SEXT_INREG [[ADD]], 32
+    ; CHECK-NEXT: $x10 = COPY [[SEXT_INREG]](s64)
     ; CHECK-NEXT: PseudoRET implicit $x10
     %0:_(s64) = COPY $x10
     %1:_(s64) = COPY $x11
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-addo-subo-rv64.mir b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-addo-subo-rv64.mir
index f2ec709..eed1aac 100644
--- a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-addo-subo-rv64.mir
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-addo-subo-rv64.mir
@@ -339,7 +339,7 @@ body:             |
     ; CHECK-NEXT: [[SEXT_INREG:%[0-9]+]]:_(s64) = G_SEXT_INREG [[ADD]], 32
     ; CHECK-NEXT: [[SEXT_INREG1:%[0-9]+]]:_(s64) = G_SEXT_INREG [[COPY1]], 32
     ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s64) = G_ICMP intpred(ult), [[SEXT_INREG]](s64), [[SEXT_INREG1]]
-    ; CHECK-NEXT: $x10 = COPY [[ADD]](s64)
+    ; CHECK-NEXT: $x10 = COPY [[SEXT_INREG]](s64)
     ; CHECK-NEXT: $x11 = COPY [[ICMP]](s64)
     ; CHECK-NEXT: PseudoRET implicit $x10, implicit $x11
     %2:_(s64) = COPY $x10
@@ -454,10 +454,11 @@ body:             |
     ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x10
     ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x11
     ; CHECK-NEXT: [[SUB:%[0-9]+]]:_(s64) = G_SUB [[COPY]], [[COPY1]]
-    ; CHECK-NEXT: [[SEXT_INREG:%[0-9]+]]:_(s64) = G_SEXT_INREG [[COPY]], 32
-    ; CHECK-NEXT: [[SEXT_INREG1:%[0-9]+]]:_(s64) = G_SEXT_INREG [[COPY1]], 32
-    ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s64) = G_ICMP intpred(ult), [[SEXT_INREG]](s64), [[SEXT_INREG1]]
-    ; CHECK-NEXT: $x10 = COPY [[SUB]](s64)
+    ; CHECK-NEXT: [[SEXT_INREG:%[0-9]+]]:_(s64) = G_SEXT_INREG [[SUB]], 32
+    ; CHECK-NEXT: [[SEXT_INREG1:%[0-9]+]]:_(s64) = G_SEXT_INREG [[COPY]], 32
+    ; CHECK-NEXT: [[SEXT_INREG2:%[0-9]+]]:_(s64) = G_SEXT_INREG [[COPY1]], 32
+    ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s64) = G_ICMP intpred(ult), [[SEXT_INREG1]](s64), [[SEXT_INREG2]]
+    ; CHECK-NEXT: $x10 = COPY [[SEXT_INREG]](s64)
     ; CHECK-NEXT: $x11 = COPY [[ICMP]](s64)
     ; CHECK-NEXT: PseudoRET implicit $x10, implicit $x11
     %2:_(s64) = COPY $x10
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-const-rv64.mir b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-const-rv64.mir
index 57fc513..e28572d 100644
--- a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-const-rv64.mir
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-const-rv64.mir
@@ -145,7 +145,8 @@ body:             |
     ; CHECK: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 -64769
     ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x10
     ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(s64) = G_ADD [[COPY]], [[C]]
-    ; CHECK-NEXT: $x10 = COPY [[ADD]](s64)
+    ; CHECK-NEXT: [[SEXT_INREG:%[0-9]+]]:_(s64) = G_SEXT_INREG [[ADD]], 32
+    ; CHECK-NEXT: $x10 = COPY [[SEXT_INREG]](s64)
     ; CHECK-NEXT: PseudoRET implicit $x10
     %0:_(s32) = G_CONSTANT i32 -64769
     %1:_(s64) = COPY $x10
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-ctlz-rv64.mir b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-ctlz-rv64.mir
index 6cc5477..62d7313 100644
--- a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-ctlz-rv64.mir
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-ctlz-rv64.mir
@@ -59,7 +59,8 @@ body:             |
     ; RV64ZBB-NEXT: [[CLZW:%[0-9]+]]:_(s64) = G_CLZW [[AND]]
     ; RV64ZBB-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 24
     ; RV64ZBB-NEXT: [[SUB:%[0-9]+]]:_(s64) = G_SUB [[CLZW]], [[C1]]
-    ; RV64ZBB-NEXT: $x10 = COPY [[SUB]](s64)
+    ; RV64ZBB-NEXT: [[SEXT_INREG:%[0-9]+]]:_(s64) = G_SEXT_INREG [[SUB]], 32
+    ; RV64ZBB-NEXT: $x10 = COPY [[SEXT_INREG]](s64)
     ; RV64ZBB-NEXT: PseudoRET implicit $x10
     %1:_(s64) = COPY $x10
     %0:_(s8) = G_TRUNC %1(s64)
@@ -129,7 +130,8 @@ body:             |
     ; RV64ZBB-NEXT: [[CLZW:%[0-9]+]]:_(s64) = G_CLZW [[AND]]
     ; RV64ZBB-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 16
     ; RV64ZBB-NEXT: [[SUB:%[0-9]+]]:_(s64) = G_SUB [[CLZW]], [[C1]]
-    ; RV64ZBB-NEXT: $x10 = COPY [[SUB]](s64)
+    ; RV64ZBB-NEXT: [[SEXT_INREG:%[0-9]+]]:_(s64) = G_SEXT_INREG [[SUB]], 32
+    ; RV64ZBB-NEXT: $x10 = COPY [[SEXT_INREG]](s64)
     ; RV64ZBB-NEXT: PseudoRET implicit $x10
     %1:_(s64) = COPY $x10
     %0:_(s16) = G_TRUNC %1(s64)
@@ -175,16 +177,19 @@ body:             |
     ; RV64I-NEXT: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 1431655765
     ; RV64I-NEXT: [[AND6:%[0-9]+]]:_(s64) = G_AND [[LSHR5]], [[C6]]
     ; RV64I-NEXT: [[SUB:%[0-9]+]]:_(s64) = G_SUB [[OR4]], [[AND6]]
-    ; RV64I-NEXT: [[AND7:%[0-9]+]]:_(s64) = G_AND [[SUB]], [[C1]]
+    ; RV64I-NEXT: [[SEXT_INREG:%[0-9]+]]:_(s64) = G_SEXT_INREG [[SUB]], 32
+    ; RV64I-NEXT: [[AND7:%[0-9]+]]:_(s64) = G_AND [[SEXT_INREG]], [[C1]]
     ; RV64I-NEXT: [[LSHR6:%[0-9]+]]:_(s64) = G_LSHR [[AND7]], [[C2]](s64)
     ; RV64I-NEXT: [[C7:%[0-9]+]]:_(s64) = G_CONSTANT i64 858993459
     ; RV64I-NEXT: [[AND8:%[0-9]+]]:_(s64) = G_AND [[LSHR6]], [[C7]]
-    ; RV64I-NEXT: [[AND9:%[0-9]+]]:_(s64) = G_AND [[SUB]], [[C7]]
+    ; RV64I-NEXT: [[AND9:%[0-9]+]]:_(s64) = G_AND [[SEXT_INREG]], [[C7]]
     ; RV64I-NEXT: [[ADD:%[0-9]+]]:_(s64) = G_ADD [[AND8]], [[AND9]]
-    ; RV64I-NEXT: [[LSHR7:%[0-9]+]]:_(s64) = G_LSHR [[ADD]], [[C3]](s64)
-    ; RV64I-NEXT: [[ADD1:%[0-9]+]]:_(s64) = G_ADD [[LSHR7]], [[ADD]]
+    ; RV64I-NEXT: [[SEXT_INREG1:%[0-9]+]]:_(s64) = G_SEXT_INREG [[ADD]], 32
+    ; RV64I-NEXT: [[LSHR7:%[0-9]+]]:_(s64) = G_LSHR [[SEXT_INREG1]], [[C3]](s64)
+    ; RV64I-NEXT: [[ADD1:%[0-9]+]]:_(s64) = G_ADD [[LSHR7]], [[SEXT_INREG1]]
+    ; RV64I-NEXT: [[SEXT_INREG2:%[0-9]+]]:_(s64) = G_SEXT_INREG [[ADD1]], 32
     ; RV64I-NEXT: [[C8:%[0-9]+]]:_(s64) = G_CONSTANT i64 252645135
-    ; RV64I-NEXT: [[AND10:%[0-9]+]]:_(s64) = G_AND [[ADD1]], [[C8]]
+    ; RV64I-NEXT: [[AND10:%[0-9]+]]:_(s64) = G_AND [[SEXT_INREG2]], [[C8]]
     ; RV64I-NEXT: [[C9:%[0-9]+]]:_(s64) = G_CONSTANT i64 16843009
     ; RV64I-NEXT: [[C10:%[0-9]+]]:_(s64) = G_CONSTANT i64 24
     ; RV64I-NEXT: [[MUL:%[0-9]+]]:_(s64) = G_MUL [[AND10]], [[C9]]
@@ -192,7 +197,8 @@ body:             |
     ; RV64I-NEXT: [[LSHR8:%[0-9]+]]:_(s64) = G_LSHR [[AND11]], [[C10]](s64)
     ; RV64I-NEXT: [[C11:%[0-9]+]]:_(s64) = G_CONSTANT i64 32
     ; RV64I-NEXT: [[SUB1:%[0-9]+]]:_(s64) = G_SUB [[C11]], [[LSHR8]]
-    ; RV64I-NEXT: $x10 = COPY [[SUB1]](s64)
+    ; RV64I-NEXT: [[SEXT_INREG3:%[0-9]+]]:_(s64) = G_SEXT_INREG [[SUB1]], 32
+    ; RV64I-NEXT: $x10 = COPY [[SEXT_INREG3]](s64)
     ; RV64I-NEXT: PseudoRET implicit $x10
     ;
     ; RV64ZBB-LABEL: name: ctlz_i32
@@ -328,7 +334,8 @@ body:             |
     ; RV64ZBB-NEXT: [[CLZW:%[0-9]+]]:_(s64) = G_CLZW [[AND]]
     ; RV64ZBB-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 24
     ; RV64ZBB-NEXT: [[SUB:%[0-9]+]]:_(s64) = G_SUB [[CLZW]], [[C1]]
-    ; RV64ZBB-NEXT: $x10 = COPY [[SUB]](s64)
+    ; RV64ZBB-NEXT: [[SEXT_INREG:%[0-9]+]]:_(s64) = G_SEXT_INREG [[SUB]], 32
+    ; RV64ZBB-NEXT: $x10 = COPY [[SEXT_INREG]](s64)
     ; RV64ZBB-NEXT: PseudoRET implicit $x10
     %1:_(s64) = COPY $x10
     %0:_(s8) = G_TRUNC %1(s64)
@@ -398,7 +405,8 @@ body:             |
     ; RV64ZBB-NEXT: [[CLZW:%[0-9]+]]:_(s64) = G_CLZW [[AND]]
     ; RV64ZBB-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 16
     ; RV64ZBB-NEXT: [[SUB:%[0-9]+]]:_(s64) = G_SUB [[CLZW]], [[C1]]
-    ; RV64ZBB-NEXT: $x10 = COPY [[SUB]](s64)
+    ; RV64ZBB-NEXT: [[SEXT_INREG:%[0-9]+]]:_(s64) = G_SEXT_INREG [[SUB]], 32
+    ; RV64ZBB-NEXT: $x10 = COPY [[SEXT_INREG]](s64)
     ; RV64ZBB-NEXT: PseudoRET implicit $x10
     %1:_(s64) = COPY $x10
     %0:_(s16) = G_TRUNC %1(s64)
@@ -444,16 +452,19 @@ body:             |
     ; RV64I-NEXT: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 1431655765
     ; RV64I-NEXT: [[AND6:%[0-9]+]]:_(s64) = G_AND [[LSHR5]], [[C6]]
     ; RV64I-NEXT: [[SUB:%[0-9]+]]:_(s64) = G_SUB [[OR4]], [[AND6]]
-    ; RV64I-NEXT: [[AND7:%[0-9]+]]:_(s64) = G_AND [[SUB]], [[C1]]
+    ; RV64I-NEXT: [[SEXT_INREG:%[0-9]+]]:_(s64) = G_SEXT_INREG [[SUB]], 32
+    ; RV64I-NEXT: [[AND7:%[0-9]+]]:_(s64) = G_AND [[SEXT_INREG]], [[C1]]
     ; RV64I-NEXT: [[LSHR6:%[0-9]+]]:_(s64) = G_LSHR [[AND7]], [[C2]](s64)
     ; RV64I-NEXT: [[C7:%[0-9]+]]:_(s64) = G_CONSTANT i64 858993459
     ; RV64I-NEXT: [[AND8:%[0-9]+]]:_(s64) = G_AND [[LSHR6]], [[C7]]
-    ; RV64I-NEXT: [[AND9:%[0-9]+]]:_(s64) = G_AND [[SUB]], [[C7]]
+    ; RV64I-NEXT: [[AND9:%[0-9]+]]:_(s64) = G_AND [[SEXT_INREG]], [[C7]]
     ; RV64I-NEXT: [[ADD:%[0-9]+]]:_(s64) = G_ADD [[AND8]], [[AND9]]
-    ; RV64I-NEXT: [[LSHR7:%[0-9]+]]:_(s64) = G_LSHR [[ADD]], [[C3]](s64)
-    ; RV64I-NEXT: [[ADD1:%[0-9]+]]:_(s64) = G_ADD [[LSHR7]], [[ADD]]
+    ; RV64I-NEXT: [[SEXT_INREG1:%[0-9]+]]:_(s64) = G_SEXT_INREG [[ADD]], 32
+    ; RV64I-NEXT: [[LSHR7:%[0-9]+]]:_(s64) = G_LSHR [[SEXT_INREG1]], [[C3]](s64)
+    ; RV64I-NEXT: [[ADD1:%[0-9]+]]:_(s64) = G_ADD [[LSHR7]], [[SEXT_INREG1]]
+    ; RV64I-NEXT: [[SEXT_INREG2:%[0-9]+]]:_(s64) = G_SEXT_INREG [[ADD1]], 32
     ; RV64I-NEXT: [[C8:%[0-9]+]]:_(s64) = G_CONSTANT i64 252645135
-    ; RV64I-NEXT: [[AND10:%[0-9]+]]:_(s64) = G_AND [[ADD1]], [[C8]]
+    ; RV64I-NEXT: [[AND10:%[0-9]+]]:_(s64) = G_AND [[SEXT_INREG2]], [[C8]]
     ; RV64I-NEXT: [[C9:%[0-9]+]]:_(s64) = G_CONSTANT i64 16843009
     ; RV64I-NEXT: [[C10:%[0-9]+]]:_(s64) = G_CONSTANT i64 24
     ; RV64I-NEXT: [[MUL:%[0-9]+]]:_(s64) = G_MUL [[AND10]], [[C9]]
@@ -461,7 +472,8 @@ body:             |
     ; RV64I-NEXT: [[LSHR8:%[0-9]+]]:_(s64) = G_LSHR [[AND11]], [[C10]](s64)
     ; RV64I-NEXT: [[C11:%[0-9]+]]:_(s64) = G_CONSTANT i64 32
     ; RV64I-NEXT: [[SUB1:%[0-9]+]]:_(s64) = G_SUB [[C11]], [[LSHR8]]
-    ; RV64I-NEXT: $x10 = COPY [[SUB1]](s64)
+    ; RV64I-NEXT: [[SEXT_INREG3:%[0-9]+]]:_(s64) = G_SEXT_INREG [[SUB1]], 32
+    ; RV64I-NEXT: $x10 = COPY [[SEXT_INREG3]](s64)
     ; RV64I-NEXT: PseudoRET implicit $x10
     ;
     ; RV64ZBB-LABEL: name: ctlz_zero_undef_i32
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-ctpop-rv64.mir b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-ctpop-rv64.mir
index 1493514..c61c46d 100644
--- a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-ctpop-rv64.mir
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-ctpop-rv64.mir
@@ -129,18 +129,21 @@ body:             |
     ; RV64I-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 1431655765
     ; RV64I-NEXT: [[AND1:%[0-9]+]]:_(s64) = G_AND [[LSHR]], [[C2]]
     ; RV64I-NEXT: [[SUB:%[0-9]+]]:_(s64) = G_SUB [[COPY]], [[AND1]]
+    ; RV64I-NEXT: [[SEXT_INREG:%[0-9]+]]:_(s64) = G_SEXT_INREG [[SUB]], 32
     ; RV64I-NEXT: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
-    ; RV64I-NEXT: [[AND2:%[0-9]+]]:_(s64) = G_AND [[SUB]], [[C1]]
+    ; RV64I-NEXT: [[AND2:%[0-9]+]]:_(s64) = G_AND [[SEXT_INREG]], [[C1]]
     ; RV64I-NEXT: [[LSHR1:%[0-9]+]]:_(s64) = G_LSHR [[AND2]], [[C3]](s64)
     ; RV64I-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 858993459
     ; RV64I-NEXT: [[AND3:%[0-9]+]]:_(s64) = G_AND [[LSHR1]], [[C4]]
-    ; RV64I-NEXT: [[AND4:%[0-9]+]]:_(s64) = G_AND [[SUB]], [[C4]]
+    ; RV64I-NEXT: [[AND4:%[0-9]+]]:_(s64) = G_AND [[SEXT_INREG]], [[C4]]
     ; RV64I-NEXT: [[ADD:%[0-9]+]]:_(s64) = G_ADD [[AND3]], [[AND4]]
+    ; RV64I-NEXT: [[SEXT_INREG1:%[0-9]+]]:_(s64) = G_SEXT_INREG [[ADD]], 32
     ; RV64I-NEXT: [[C5:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
-    ; RV64I-NEXT: [[LSHR2:%[0-9]+]]:_(s64) = G_LSHR [[ADD]], [[C5]](s64)
-    ; RV64I-NEXT: [[ADD1:%[0-9]+]]:_(s64) = G_ADD [[LSHR2]], [[ADD]]
+    ; RV64I-NEXT: [[LSHR2:%[0-9]+]]:_(s64) = G_LSHR [[SEXT_INREG1]], [[C5]](s64)
+    ; RV64I-NEXT: [[ADD1:%[0-9]+]]:_(s64) = G_ADD [[LSHR2]], [[SEXT_INREG1]]
+    ; RV64I-NEXT: [[SEXT_INREG2:%[0-9]+]]:_(s64) = G_SEXT_INREG [[ADD1]], 32
     ; RV64I-NEXT: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 252645135
-    ; RV64I-NEXT: [[AND5:%[0-9]+]]:_(s64) = G_AND [[ADD1]], [[C6]]
+    ; RV64I-NEXT: [[AND5:%[0-9]+]]:_(s64) = G_AND [[SEXT_INREG2]], [[C6]]
     ; RV64I-NEXT: [[C7:%[0-9]+]]:_(s64) = G_CONSTANT i64 16843009
     ; RV64I-NEXT: [[C8:%[0-9]+]]:_(s64) = G_CONSTANT i64 24
     ; RV64I-NEXT: [[MUL:%[0-9]+]]:_(s64) = G_MUL [[AND5]], [[C7]]
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-cttz-rv64.mir b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-cttz-rv64.mir
index 252e792..87155bb 100644
--- a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-cttz-rv64.mir
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-cttz-rv64.mir
@@ -131,7 +131,8 @@ body:             |
     ; RV64I-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 -1
     ; RV64I-NEXT: [[XOR:%[0-9]+]]:_(s64) = G_XOR [[COPY]], [[C]]
     ; RV64I-NEXT: [[ADD:%[0-9]+]]:_(s64) = G_ADD [[COPY]], [[C]]
-    ; RV64I-NEXT: [[AND:%[0-9]+]]:_(s64) = G_AND [[XOR]], [[ADD]]
+    ; RV64I-NEXT: [[SEXT_INREG:%[0-9]+]]:_(s64) = G_SEXT_INREG [[ADD]], 32
+    ; RV64I-NEXT: [[AND:%[0-9]+]]:_(s64) = G_AND [[XOR]], [[SEXT_INREG]]
     ; RV64I-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
     ; RV64I-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 4294967295
     ; RV64I-NEXT: [[AND1:%[0-9]+]]:_(s64) = G_AND [[AND]], [[C2]]
@@ -139,18 +140,21 @@ body:             |
     ; RV64I-NEXT: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 1431655765
     ; RV64I-NEXT: [[AND2:%[0-9]+]]:_(s64) = G_AND [[LSHR]], [[C3]]
     ; RV64I-NEXT: [[SUB:%[0-9]+]]:_(s64) = G_SUB [[AND]], [[AND2]]
+    ; RV64I-NEXT: [[SEXT_INREG1:%[0-9]+]]:_(s64) = G_SEXT_INREG [[SUB]], 32
     ; RV64I-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
-    ; RV64I-NEXT: [[AND3:%[0-9]+]]:_(s64) = G_AND [[SUB]], [[C2]]
+    ; RV64I-NEXT: [[AND3:%[0-9]+]]:_(s64) = G_AND [[SEXT_INREG1]], [[C2]]
     ; RV64I-NEXT: [[LSHR1:%[0-9]+]]:_(s64) = G_LSHR [[AND3]], [[C4]](s64)
     ; RV64I-NEXT: [[C5:%[0-9]+]]:_(s64) = G_CONSTANT i64 858993459
     ; RV64I-NEXT: [[AND4:%[0-9]+]]:_(s64) = G_AND [[LSHR1]], [[C5]]
-    ; RV64I-NEXT: [[AND5:%[0-9]+]]:_(s64) = G_AND [[SUB]], [[C5]]
+    ; RV64I-NEXT: [[AND5:%[0-9]+]]:_(s64) = G_AND [[SEXT_INREG1]], [[C5]]
     ; RV64I-NEXT: [[ADD1:%[0-9]+]]:_(s64) = G_ADD [[AND4]], [[AND5]]
+    ; RV64I-NEXT: [[SEXT_INREG2:%[0-9]+]]:_(s64) = G_SEXT_INREG [[ADD1]], 32
     ; RV64I-NEXT: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
-    ; RV64I-NEXT: [[LSHR2:%[0-9]+]]:_(s64) = G_LSHR [[ADD1]], [[C6]](s64)
-    ; RV64I-NEXT: [[ADD2:%[0-9]+]]:_(s64) = G_ADD [[LSHR2]], [[ADD1]]
+    ; RV64I-NEXT: [[LSHR2:%[0-9]+]]:_(s64) = G_LSHR [[SEXT_INREG2]], [[C6]](s64)
+    ; RV64I-NEXT: [[ADD2:%[0-9]+]]:_(s64) = G_ADD [[LSHR2]], [[SEXT_INREG2]]
+    ; RV64I-NEXT: [[SEXT_INREG3:%[0-9]+]]:_(s64) = G_SEXT_INREG [[ADD2]], 32
     ; RV64I-NEXT: [[C7:%[0-9]+]]:_(s64) = G_CONSTANT i64 252645135
-    ; RV64I-NEXT: [[AND6:%[0-9]+]]:_(s64) = G_AND [[ADD2]], [[C7]]
+    ; RV64I-NEXT: [[AND6:%[0-9]+]]:_(s64) = G_AND [[SEXT_INREG3]], [[C7]]
     ; RV64I-NEXT: [[C8:%[0-9]+]]:_(s64) = G_CONSTANT i64 16843009
     ; RV64I-NEXT: [[C9:%[0-9]+]]:_(s64) = G_CONSTANT i64 24
     ; RV64I-NEXT: [[MUL:%[0-9]+]]:_(s64) = G_MUL [[AND6]], [[C8]]
@@ -351,7 +355,8 @@ body:             |
     ; RV64I-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 -1
     ; RV64I-NEXT: [[XOR:%[0-9]+]]:_(s64) = G_XOR [[COPY]], [[C]]
     ; RV64I-NEXT: [[ADD:%[0-9]+]]:_(s64) = G_ADD [[COPY]], [[C]]
-    ; RV64I-NEXT: [[AND:%[0-9]+]]:_(s64) = G_AND [[XOR]], [[ADD]]
+    ; RV64I-NEXT: [[SEXT_INREG:%[0-9]+]]:_(s64) = G_SEXT_INREG [[ADD]], 32
+    ; RV64I-NEXT: [[AND:%[0-9]+]]:_(s64) = G_AND [[XOR]], [[SEXT_INREG]]
     ; RV64I-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
     ; RV64I-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 4294967295
     ; RV64I-NEXT: [[AND1:%[0-9]+]]:_(s64) = G_AND [[AND]], [[C2]]
@@ -359,18 +364,21 @@ body:             |
     ; RV64I-NEXT: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 1431655765
     ; RV64I-NEXT: [[AND2:%[0-9]+]]:_(s64) = G_AND [[LSHR]], [[C3]]
     ; RV64I-NEXT: [[SUB:%[0-9]+]]:_(s64) = G_SUB [[AND]], [[AND2]]
+    ; RV64I-NEXT: [[SEXT_INREG1:%[0-9]+]]:_(s64) = G_SEXT_INREG [[SUB]], 32
     ; RV64I-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
-    ; RV64I-NEXT: [[AND3:%[0-9]+]]:_(s64) = G_AND [[SUB]], [[C2]]
+    ; RV64I-NEXT: [[AND3:%[0-9]+]]:_(s64) = G_AND [[SEXT_INREG1]], [[C2]]
     ; RV64I-NEXT: [[LSHR1:%[0-9]+]]:_(s64) = G_LSHR [[AND3]], [[C4]](s64)
     ; RV64I-NEXT: [[C5:%[0-9]+]]:_(s64) = G_CONSTANT i64 858993459
     ; RV64I-NEXT: [[AND4:%[0-9]+]]:_(s64) = G_AND [[LSHR1]], [[C5]]
-    ; RV64I-NEXT: [[AND5:%[0-9]+]]:_(s64) = G_AND [[SUB]], [[C5]]
+    ; RV64I-NEXT: [[AND5:%[0-9]+]]:_(s64) = G_AND [[SEXT_INREG1]], [[C5]]
     ; RV64I-NEXT: [[ADD1:%[0-9]+]]:_(s64) = G_ADD [[AND4]], [[AND5]]
+    ; RV64I-NEXT: [[SEXT_INREG2:%[0-9]+]]:_(s64) = G_SEXT_INREG [[ADD1]], 32
     ; RV64I-NEXT: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
-    ; RV64I-NEXT: [[LSHR2:%[0-9]+]]:_(s64) = G_LSHR [[ADD1]], [[C6]](s64)
-    ; RV64I-NEXT: [[ADD2:%[0-9]+]]:_(s64) = G_ADD [[LSHR2]], [[ADD1]]
+    ; RV64I-NEXT: [[LSHR2:%[0-9]+]]:_(s64) = G_LSHR [[SEXT_INREG2]], [[C6]](s64)
+    ; RV64I-NEXT: [[ADD2:%[0-9]+]]:_(s64) = G_ADD [[LSHR2]], [[SEXT_INREG2]]
+    ; RV64I-NEXT: [[SEXT_INREG3:%[0-9]+]]:_(s64) = G_SEXT_INREG [[ADD2]], 32
     ; RV64I-NEXT: [[C7:%[0-9]+]]:_(s64) = G_CONSTANT i64 252645135
-    ; RV64I-NEXT: [[AND6:%[0-9]+]]:_(s64) = G_AND [[ADD2]], [[C7]]
+    ; RV64I-NEXT: [[AND6:%[0-9]+]]:_(s64) = G_AND [[SEXT_INREG3]], [[C7]]
     ; RV64I-NEXT: [[C8:%[0-9]+]]:_(s64) = G_CONSTANT i64 16843009
     ; RV64I-NEXT: [[C9:%[0-9]+]]:_(s64) = G_CONSTANT i64 24
     ; RV64I-NEXT: [[MUL:%[0-9]+]]:_(s64) = G_MUL [[AND6]], [[C8]]
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-ext-rv64.mir b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-ext-rv64.mir
index f3bc1ce..aff7d4d 100644
--- a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-ext-rv64.mir
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-ext-rv64.mir
@@ -30,8 +30,9 @@ body:             |
     ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $x10
     ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x11
     ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(s64) = G_ADD [[COPY]], [[COPY1]]
+    ; CHECK-NEXT: [[SEXT_INREG:%[0-9]+]]:_(s64) = G_SEXT_INREG [[ADD]], 32
     ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 4294967295
-    ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s64) = G_AND [[ADD]], [[C]]
+    ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s64) = G_AND [[SEXT_INREG]], [[C]]
     ; CHECK-NEXT: $x10 = COPY [[AND]](s64)
     ; CHECK-NEXT: PseudoRET implicit $x10
     %0:_(s64) = COPY $x10
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-rotate-rv64.mir b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-rotate-rv64.mir
index 4689a7d..776f5f5 100644
--- a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-rotate-rv64.mir
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-rotate-rv64.mir
@@ -88,9 +88,10 @@ body:             |
     ; RV64I-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
     ; RV64I-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 31
     ; RV64I-NEXT: [[SUB:%[0-9]+]]:_(s64) = G_SUB [[C]], [[COPY1]]
+    ; RV64I-NEXT: [[SEXT_INREG:%[0-9]+]]:_(s64) = G_SEXT_INREG [[SUB]], 32
     ; RV64I-NEXT: [[AND:%[0-9]+]]:_(s64) = G_AND [[COPY1]], [[C1]]
     ; RV64I-NEXT: [[SLLW:%[0-9]+]]:_(s64) = G_SLLW [[COPY]], [[AND]]
-    ; RV64I-NEXT: [[AND1:%[0-9]+]]:_(s64) = G_AND [[SUB]], [[C1]]
+    ; RV64I-NEXT: [[AND1:%[0-9]+]]:_(s64) = G_AND [[SEXT_INREG]], [[C1]]
     ; RV64I-NEXT: [[SRLW:%[0-9]+]]:_(s64) = G_SRLW [[COPY]], [[AND1]]
     ; RV64I-NEXT: [[OR:%[0-9]+]]:_(s64) = G_OR [[SLLW]], [[SRLW]]
     ; RV64I-NEXT: $x10 = COPY [[OR]](s64)
@@ -233,9 +234,10 @@ body:             |
     ; RV64I-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
     ; RV64I-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 31
     ; RV64I-NEXT: [[SUB:%[0-9]+]]:_(s64) = G_SUB [[C]], [[COPY1]]
+    ; RV64I-NEXT: [[SEXT_INREG:%[0-9]+]]:_(s64) = G_SEXT_INREG [[SUB]], 32
     ; RV64I-NEXT: [[AND:%[0-9]+]]:_(s64) = G_AND [[COPY1]], [[C1]]
     ; RV64I-NEXT: [[SRLW:%[0-9]+]]:_(s64) = G_SRLW [[COPY]], [[AND]]
-    ; RV64I-NEXT: [[AND1:%[0-9]+]]:_(s64) = G_AND [[SUB]], [[C1]]
+    ; RV64I-NEXT: [[AND1:%[0-9]+]]:_(s64) = G_AND [[SEXT_INREG]], [[C1]]
     ; RV64I-NEXT: [[SLLW:%[0-9]+]]:_(s64) = G_SLLW [[COPY]], [[AND1]]
     ; RV64I-NEXT: [[OR:%[0-9]+]]:_(s64) = G_OR [[SRLW]], [[SLLW]]
     ; RV64I-NEXT: $x10 = COPY [[OR]](s64)
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-sat-rv64.mir b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-sat-rv64.mir
index bf8c8d6..d162bfc 100644
--- a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-sat-rv64.mir
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-sat-rv64.mir
@@ -16,8 +16,8 @@ body:             |
     ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x10
     ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x11
     ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(s64) = G_ADD [[COPY]], [[COPY1]]
-    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[ADD]](s64)
     ; CHECK-NEXT: [[SEXT_INREG:%[0-9]+]]:_(s64) = G_SEXT_INREG [[ADD]], 32
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[SEXT_INREG]](s64)
     ; CHECK-NEXT: [[SEXT_INREG1:%[0-9]+]]:_(s64) = G_SEXT_INREG [[COPY1]], 32
     ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s64) = G_ICMP intpred(ult), [[SEXT_INREG]](s64), [[SEXT_INREG1]]
     ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY [[TRUNC]](s32)
@@ -97,7 +97,8 @@ body:             |
     ; CHECK-NEXT: [[ASHR:%[0-9]+]]:_(s64) = G_ASHR [[COPY3]], [[C]](s64)
     ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 -2147483648
     ; CHECK-NEXT: [[ADD1:%[0-9]+]]:_(s64) = G_ADD [[ASHR]], [[C1]]
-    ; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:_(s32) = G_TRUNC [[ADD1]](s64)
+    ; CHECK-NEXT: [[SEXT_INREG3:%[0-9]+]]:_(s64) = G_SEXT_INREG [[ADD1]], 32
+    ; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:_(s32) = G_TRUNC [[SEXT_INREG3]](s64)
     ; CHECK-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP]](s64), [[TRUNC1]], [[COPY2]]
     ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[SELECT]](s32)
     ; CHECK-NEXT: $x10 = COPY [[ANYEXT]](s64)
@@ -173,10 +174,11 @@ body:             |
     ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x10
     ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x11
     ; CHECK-NEXT: [[SUB:%[0-9]+]]:_(s64) = G_SUB [[COPY]], [[COPY1]]
-    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[SUB]](s64)
-    ; CHECK-NEXT: [[SEXT_INREG:%[0-9]+]]:_(s64) = G_SEXT_INREG [[COPY]], 32
-    ; CHECK-NEXT: [[SEXT_INREG1:%[0-9]+]]:_(s64) = G_SEXT_INREG [[COPY1]], 32
-    ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s64) = G_ICMP intpred(ult), [[SEXT_INREG]](s64), [[SEXT_INREG1]]
+    ; CHECK-NEXT: [[SEXT_INREG:%[0-9]+]]:_(s64) = G_SEXT_INREG [[SUB]], 32
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[SEXT_INREG]](s64)
+    ; CHECK-NEXT: [[SEXT_INREG1:%[0-9]+]]:_(s64) = G_SEXT_INREG [[COPY]], 32
+    ; CHECK-NEXT: [[SEXT_INREG2:%[0-9]+]]:_(s64) = G_SEXT_INREG [[COPY1]], 32
+    ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s64) = G_ICMP intpred(ult), [[SEXT_INREG1]](s64), [[SEXT_INREG2]]
     ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
     ; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:_(s32) = G_TRUNC [[C]](s64)
     ; CHECK-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP]](s64), [[TRUNC1]], [[TRUNC]]
@@ -250,7 +252,8 @@ body:             |
     ; CHECK-NEXT: [[ASHR:%[0-9]+]]:_(s64) = G_ASHR [[COPY3]], [[C]](s64)
     ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 -2147483648
     ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(s64) = G_ADD [[ASHR]], [[C1]]
-    ; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:_(s32) = G_TRUNC [[ADD]](s64)
+    ; CHECK-NEXT: [[SEXT_INREG3:%[0-9]+]]:_(s64) = G_SEXT_INREG [[ADD]], 32
+    ; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:_(s32) = G_TRUNC [[SEXT_INREG3]](s64)
     ; CHECK-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP]](s64), [[TRUNC1]], [[COPY2]]
     ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[SELECT]](s32)
     ; CHECK-NEXT: $x10 = COPY [[ANYEXT]](s64)
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-sub-rv64.mir b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-sub-rv64.mir
index da3ab9e..7ab07ee 100644
--- a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-sub-rv64.mir
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-sub-rv64.mir
@@ -69,7 +69,8 @@ body:             |
     ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $x10
     ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x11
     ; CHECK-NEXT: [[SUB:%[0-9]+]]:_(s64) = G_SUB [[COPY]], [[COPY1]]
-    ; CHECK-NEXT: $x10 = COPY [[SUB]](s64)
+    ; CHECK-NEXT: [[SEXT_INREG:%[0-9]+]]:_(s64) = G_SEXT_INREG [[SUB]], 32
+    ; CHECK-NEXT: $x10 = COPY [[SEXT_INREG]](s64)
     ; CHECK-NEXT: PseudoRET implicit $x10
     %0:_(s64) = COPY $x10
     %1:_(s64) = COPY $x11
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/rv64zbb.ll b/llvm/test/CodeGen/RISCV/GlobalISel/rv64zbb.ll
index 0b876fe..9df319e 100644
--- a/llvm/test/CodeGen/RISCV/GlobalISel/rv64zbb.ll
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/rv64zbb.ll
@@ -18,7 +18,7 @@ define signext i32 @ctlz_i32(i32 signext %a) nounwind {
 ; RV64I-NEXT:    srliw a1, a0, 1
 ; RV64I-NEXT:    lui a2, 349525
 ; RV64I-NEXT:    or a0, a0, a1
-; RV64I-NEXT:    addiw a1, a2, 1365
+; RV64I-NEXT:    addi a1, a2, 1365
 ; RV64I-NEXT:    srliw a2, a0, 2
 ; RV64I-NEXT:    or a0, a0, a2
 ; RV64I-NEXT:    srliw a2, a0, 4
@@ -30,15 +30,15 @@ define signext i32 @ctlz_i32(i32 signext %a) nounwind {
 ; RV64I-NEXT:    srliw a2, a0, 1
 ; RV64I-NEXT:    and a1, a2, a1
 ; RV64I-NEXT:    lui a2, 209715
-; RV64I-NEXT:    addiw a2, a2, 819
+; RV64I-NEXT:    addi a2, a2, 819
 ; RV64I-NEXT:    sub a0, a0, a1
 ; RV64I-NEXT:    srliw a1, a0, 2
 ; RV64I-NEXT:    and a0, a0, a2
 ; RV64I-NEXT:    and a1, a1, a2
 ; RV64I-NEXT:    lui a2, 61681
-; RV64I-NEXT:    add a0, a1, a0
+; RV64I-NEXT:    addw a0, a1, a0
 ; RV64I-NEXT:    srli a1, a0, 4
-; RV64I-NEXT:    add a0, a1, a0
+; RV64I-NEXT:    addw a0, a1, a0
 ; RV64I-NEXT:    lui a1, 4112
 ; RV64I-NEXT:    addiw a2, a2, -241
 ; RV64I-NEXT:    and a0, a0, a2
@@ -75,7 +75,7 @@ define signext i32 @log2_i32(i32 signext %a) nounwind {
 ; RV64I-NEXT:    srliw a1, a0, 1
 ; RV64I-NEXT:    lui a2, 349525
 ; RV64I-NEXT:    or a0, a0, a1
-; RV64I-NEXT:    addiw a1, a2, 1365
+; RV64I-NEXT:    addi a1, a2, 1365
 ; RV64I-NEXT:    srliw a2, a0, 2
 ; RV64I-NEXT:    or a0, a0, a2
 ; RV64I-NEXT:    srliw a2, a0, 4
@@ -87,15 +87,15 @@ define signext i32 @log2_i32(i32 signext %a) nounwind {
 ; RV64I-NEXT:    srliw a2, a0, 1
 ; RV64I-NEXT:    and a1, a2, a1
 ; RV64I-NEXT:    lui a2, 209715
-; RV64I-NEXT:    addiw a2, a2, 819
+; RV64I-NEXT:    addi a2, a2, 819
 ; RV64I-NEXT:    sub a0, a0, a1
 ; RV64I-NEXT:    srliw a1, a0, 2
 ; RV64I-NEXT:    and a0, a0, a2
 ; RV64I-NEXT:    and a1, a1, a2
 ; RV64I-NEXT:    lui a2, 61681
-; RV64I-NEXT:    add a0, a1, a0
+; RV64I-NEXT:    addw a0, a1, a0
 ; RV64I-NEXT:    srli a1, a0, 4
-; RV64I-NEXT:    add a0, a1, a0
+; RV64I-NEXT:    addw a0, a1, a0
 ; RV64I-NEXT:    lui a1, 4112
 ; RV64I-NEXT:    addiw a2, a2, -241
 ; RV64I-NEXT:    and a0, a0, a2
@@ -133,15 +133,14 @@ define signext i32 @log2_ceil_i32(i32 signext %a) nounwind {
 ; RV64I-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s0, 0(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    li s0, 32
-; RV64I-NEXT:    addi a0, a0, -1
-; RV64I-NEXT:    sext.w a2, a0
+; RV64I-NEXT:    addiw a0, a0, -1
 ; RV64I-NEXT:    li a1, 32
-; RV64I-NEXT:    beqz a2, .LBB2_2
+; RV64I-NEXT:    beqz a0, .LBB2_2
 ; RV64I-NEXT:  # %bb.1: # %cond.false
 ; RV64I-NEXT:    srliw a1, a0, 1
 ; RV64I-NEXT:    lui a2, 349525
 ; RV64I-NEXT:    or a0, a0, a1
-; RV64I-NEXT:    addiw a1, a2, 1365
+; RV64I-NEXT:    addi a1, a2, 1365
 ; RV64I-NEXT:    srliw a2, a0, 2
 ; RV64I-NEXT:    or a0, a0, a2
 ; RV64I-NEXT:    srliw a2, a0, 4
@@ -153,15 +152,15 @@ define signext i32 @log2_ceil_i32(i32 signext %a) nounwind {
 ; RV64I-NEXT:    srliw a2, a0, 1
 ; RV64I-NEXT:    and a1, a2, a1
 ; RV64I-NEXT:    lui a2, 209715
-; RV64I-NEXT:    addiw a2, a2, 819
+; RV64I-NEXT:    addi a2, a2, 819
 ; RV64I-NEXT:    sub a0, a0, a1
 ; RV64I-NEXT:    srliw a1, a0, 2
 ; RV64I-NEXT:    and a0, a0, a2
 ; RV64I-NEXT:    and a1, a1, a2
 ; RV64I-NEXT:    lui a2, 61681
-; RV64I-NEXT:    add a0, a1, a0
+; RV64I-NEXT:    addw a0, a1, a0
 ; RV64I-NEXT:    srli a1, a0, 4
-; RV64I-NEXT:    add a0, a1, a0
+; RV64I-NEXT:    addw a0, a1, a0
 ; RV64I-NEXT:    lui a1, 4112
 ; RV64I-NEXT:    addiw a2, a2, -241
 ; RV64I-NEXT:    and a0, a0, a2
@@ -200,7 +199,7 @@ define signext i32 @findLastSet_i32(i32 signext %a) nounwind {
 ; RV64I-NEXT:    srliw a0, a0, 1
 ; RV64I-NEXT:    lui a1, 349525
 ; RV64I-NEXT:    or a0, s0, a0
-; RV64I-NEXT:    addiw a1, a1, 1365
+; RV64I-NEXT:    addi a1, a1, 1365
 ; RV64I-NEXT:    srliw a2, a0, 2
 ; RV64I-NEXT:    or a0, a0, a2
 ; RV64I-NEXT:    srliw a2, a0, 4
@@ -212,15 +211,15 @@ define signext i32 @findLastSet_i32(i32 signext %a) nounwind {
 ; RV64I-NEXT:    srliw a2, a0, 1
 ; RV64I-NEXT:    and a1, a2, a1
 ; RV64I-NEXT:    lui a2, 209715
-; RV64I-NEXT:    addiw a2, a2, 819
+; RV64I-NEXT:    addi a2, a2, 819
 ; RV64I-NEXT:    sub a0, a0, a1
 ; RV64I-NEXT:    srliw a1, a0, 2
 ; RV64I-NEXT:    and a0, a0, a2
 ; RV64I-NEXT:    and a1, a1, a2
 ; RV64I-NEXT:    lui a2, 61681
-; RV64I-NEXT:    add a0, a1, a0
+; RV64I-NEXT:    addw a0, a1, a0
 ; RV64I-NEXT:    srli a1, a0, 4
-; RV64I-NEXT:    add a0, a1, a0
+; RV64I-NEXT:    addw a0, a1, a0
 ; RV64I-NEXT:    lui a1, 4112
 ; RV64I-NEXT:    addiw a2, a2, -241
 ; RV64I-NEXT:    and a0, a0, a2
@@ -271,7 +270,7 @@ define i32 @ctlz_lshr_i32(i32 signext %a) {
 ; RV64I-NEXT:    srliw a0, a0, 2
 ; RV64I-NEXT:    lui a2, 349525
 ; RV64I-NEXT:    or a0, a1, a0
-; RV64I-NEXT:    addiw a1, a2, 1365
+; RV64I-NEXT:    addi a1, a2, 1365
 ; RV64I-NEXT:    srli a2, a0, 2
 ; RV64I-NEXT:    or a0, a0, a2
 ; RV64I-NEXT:    srli a2, a0, 4
@@ -283,15 +282,15 @@ define i32 @ctlz_lshr_i32(i32 signext %a) {
 ; RV64I-NEXT:    srliw a2, a0, 1
 ; RV64I-NEXT:    and a1, a2, a1
 ; RV64I-NEXT:    lui a2, 209715
-; RV64I-NEXT:    addiw a2, a2, 819
+; RV64I-NEXT:    addi a2, a2, 819
 ; RV64I-NEXT:    sub a0, a0, a1
 ; RV64I-NEXT:    srliw a1, a0, 2
 ; RV64I-NEXT:    and a0, a0, a2
 ; RV64I-NEXT:    and a1, a1, a2
 ; RV64I-NEXT:    lui a2, 61681
-; RV64I-NEXT:    add a0, a1, a0
+; RV64I-NEXT:    addw a0, a1, a0
 ; RV64I-NEXT:    srli a1, a0, 4
-; RV64I-NEXT:    add a0, a1, a0
+; RV64I-NEXT:    addw a0, a1, a0
 ; RV64I-NEXT:    lui a1, 4112
 ; RV64I-NEXT:    addiw a2, a2, -241
 ; RV64I-NEXT:    and a0, a0, a2
@@ -299,7 +298,7 @@ define i32 @ctlz_lshr_i32(i32 signext %a) {
 ; RV64I-NEXT:    call __muldi3
 ; RV64I-NEXT:    srliw a0, a0, 24
 ; RV64I-NEXT:    li a1, 32
-; RV64I-NEXT:    sub a0, a1, a0
+; RV64I-NEXT:    subw a0, a1, a0
 ; RV64I-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    .cfi_restore ra
 ; RV64I-NEXT:    addi sp, sp, 16
@@ -408,19 +407,19 @@ define signext i32 @cttz_i32(i32 signext %a) nounwind {
 ; RV64I-NEXT:    addi a0, a0, -1
 ; RV64I-NEXT:    lui a2, 349525
 ; RV64I-NEXT:    and a0, a1, a0
-; RV64I-NEXT:    addiw a1, a2, 1365
+; RV64I-NEXT:    addi a1, a2, 1365
 ; RV64I-NEXT:    srliw a2, a0, 1
 ; RV64I-NEXT:    and a1, a2, a1
 ; RV64I-NEXT:    lui a2, 209715
-; RV64I-NEXT:    addiw a2, a2, 819
+; RV64I-NEXT:    addi a2, a2, 819
 ; RV64I-NEXT:    sub a0, a0, a1
 ; RV64I-NEXT:    srliw a1, a0, 2
 ; RV64I-NEXT:    and a0, a0, a2
 ; RV64I-NEXT:    and a1, a1, a2
 ; RV64I-NEXT:    lui a2, 61681
-; RV64I-NEXT:    add a0, a1, a0
+; RV64I-NEXT:    addw a0, a1, a0
 ; RV64I-NEXT:    srli a1, a0, 4
-; RV64I-NEXT:    add a0, a1, a0
+; RV64I-NEXT:    addw a0, a1, a0
 ; RV64I-NEXT:    lui a1, 4112
 ; RV64I-NEXT:    addiw a2, a2, -241
 ; RV64I-NEXT:    and a0, a0, a2
@@ -451,19 +450,19 @@ define signext i32 @cttz_zero_undef_i32(i32 signext %a) nounwind {
 ; RV64I-NEXT:    addi a0, a0, -1
 ; RV64I-NEXT:    lui a2, 349525
 ; RV64I-NEXT:    and a0, a1, a0
-; RV64I-NEXT:    addiw a1, a2, 1365
+; RV64I-NEXT:    addi a1, a2, 1365
 ; RV64I-NEXT:    srliw a2, a0, 1
 ; RV64I-NEXT:    and a1, a2, a1
 ; RV64I-NEXT:    lui a2, 209715
-; RV64I-NEXT:    addiw a2, a2, 819
+; RV64I-NEXT:    addi a2, a2, 819
 ; RV64I-NEXT:    sub a0, a0, a1
 ; RV64I-NEXT:    srliw a1, a0, 2
 ; RV64I-NEXT:    and a0, a0, a2
 ; RV64I-NEXT:    and a1, a1, a2
 ; RV64I-NEXT:    lui a2, 61681
-; RV64I-NEXT:    add a0, a1, a0
+; RV64I-NEXT:    addw a0, a1, a0
 ; RV64I-NEXT:    srli a1, a0, 4
-; RV64I-NEXT:    add a0, a1, a0
+; RV64I-NEXT:    addw a0, a1, a0
 ; RV64I-NEXT:    lui a1, 4112
 ; RV64I-NEXT:    addiw a2, a2, -241
 ; RV64I-NEXT:    and a0, a0, a2
@@ -493,19 +492,19 @@ define signext i32 @findFirstSet_i32(i32 signext %a) nounwind {
 ; RV64I-NEXT:    addi a1, s0, -1
 ; RV64I-NEXT:    lui a2, 349525
 ; RV64I-NEXT:    and a0, a0, a1
-; RV64I-NEXT:    addiw a1, a2, 1365
+; RV64I-NEXT:    addi a1, a2, 1365
 ; RV64I-NEXT:    srliw a2, a0, 1
 ; RV64I-NEXT:    and a1, a2, a1
 ; RV64I-NEXT:    lui a2, 209715
-; RV64I-NEXT:    addiw a2, a2, 819
+; RV64I-NEXT:    addi a2, a2, 819
 ; RV64I-NEXT:    sub a0, a0, a1
 ; RV64I-NEXT:    srliw a1, a0, 2
 ; RV64I-NEXT:    and a0, a0, a2
 ; RV64I-NEXT:    and a1, a1, a2
 ; RV64I-NEXT:    lui a2, 61681
-; RV64I-NEXT:    add a0, a1, a0
+; RV64I-NEXT:    addw a0, a1, a0
 ; RV64I-NEXT:    srli a1, a0, 4
-; RV64I-NEXT:    add a0, a1, a0
+; RV64I-NEXT:    addw a0, a1, a0
 ; RV64I-NEXT:    lui a1, 4112
 ; RV64I-NEXT:    addiw a2, a2, -241
 ; RV64I-NEXT:    and a0, a0, a2
@@ -549,19 +548,19 @@ define signext i32 @ffs_i32(i32 signext %a) nounwind {
 ; RV64I-NEXT:    addi a1, s0, -1
 ; RV64I-NEXT:    lui a2, 349525
 ; RV64I-NEXT:    and a0, a0, a1
-; RV64I-NEXT:    addiw a1, a2, 1365
+; RV64I-NEXT:    addi a1, a2, 1365
 ; RV64I-NEXT:    srliw a2, a0, 1
 ; RV64I-NEXT:    and a1, a2, a1
 ; RV64I-NEXT:    lui a2, 209715
-; RV64I-NEXT:    addiw a2, a2, 819
+; RV64I-NEXT:    addi a2, a2, 819
 ; RV64I-NEXT:    sub a0, a0, a1
 ; RV64I-NEXT:    srliw a1, a0, 2
 ; RV64I-NEXT:    and a0, a0, a2
 ; RV64I-NEXT:    and a1, a1, a2
 ; RV64I-NEXT:    lui a2, 61681
-; RV64I-NEXT:    add a0, a1, a0
+; RV64I-NEXT:    addw a0, a1, a0
 ; RV64I-NEXT:    srli a1, a0, 4
-; RV64I-NEXT:    add a0, a1, a0
+; RV64I-NEXT:    addw a0, a1, a0
 ; RV64I-NEXT:    lui a1, 4112
 ; RV64I-NEXT:    addiw a2, a2, -241
 ; RV64I-NEXT:    and a0, a0, a2
@@ -669,18 +668,18 @@ define signext i32 @ctpop_i32(i32 signext %a) nounwind {
 ; RV64I-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    srliw a1, a0, 1
 ; RV64I-NEXT:    lui a2, 349525
-; RV64I-NEXT:    addiw a2, a2, 1365
+; RV64I-NEXT:    addi a2, a2, 1365
 ; RV64I-NEXT:    and a1, a1, a2
 ; RV64I-NEXT:    lui a2, 209715
-; RV64I-NEXT:    addiw a2, a2, 819
+; RV64I-NEXT:    addi a2, a2, 819
 ; RV64I-NEXT:    sub a0, a0, a1
 ; RV64I-NEXT:    srliw a1, a0, 2
 ; RV64I-NEXT:    and a0, a0, a2
 ; RV64I-NEXT:    and a1, a1, a2
 ; RV64I-NEXT:    lui a2, 61681
-; RV64I-NEXT:    add a0, a1, a0
+; RV64I-NEXT:    addw a0, a1, a0
 ; RV64I-NEXT:    srli a1, a0, 4
-; RV64I-NEXT:    add a0, a1, a0
+; RV64I-NEXT:    addw a0, a1, a0
 ; RV64I-NEXT:    lui a1, 4112
 ; RV64I-NEXT:    addiw a2, a2, -241
 ; RV64I-NEXT:    and a0, a0, a2
@@ -706,18 +705,18 @@ define i1 @ctpop_i32_ult_two(i32 signext %a) nounwind {
 ; RV64I-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    srliw a1, a0, 1
 ; RV64I-NEXT:    lui a2, 349525
-; RV64I-NEXT:    addiw a2, a2, 1365
+; RV64I-NEXT:    addi a2, a2, 1365
 ; RV64I-NEXT:    and a1, a1, a2
 ; RV64I-NEXT:    lui a2, 209715
-; RV64I-NEXT:    addiw a2, a2, 819
+; RV64I-NEXT:    addi a2, a2, 819
 ; RV64I-NEXT:    sub a0, a0, a1
 ; RV64I-NEXT:    srliw a1, a0, 2
 ; RV64I-NEXT:    and a0, a0, a2
 ; RV64I-NEXT:    and a1, a1, a2
 ; RV64I-NEXT:    lui a2, 61681
-; RV64I-NEXT:    add a0, a1, a0
+; RV64I-NEXT:    addw a0, a1, a0
 ; RV64I-NEXT:    srli a1, a0, 4
-; RV64I-NEXT:    add a0, a1, a0
+; RV64I-NEXT:    addw a0, a1, a0
 ; RV64I-NEXT:    lui a1, 4112
 ; RV64I-NEXT:    addiw a2, a2, -241
 ; RV64I-NEXT:    and a0, a0, a2
@@ -746,19 +745,19 @@ define signext i32 @ctpop_i32_load(ptr %p) nounwind {
 ; RV64I-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    lwu a0, 0(a0)
 ; RV64I-NEXT:    lui a1, 349525
-; RV64I-NEXT:    addiw a1, a1, 1365
+; RV64I-NEXT:    addi a1, a1, 1365
 ; RV64I-NEXT:    srli a2, a0, 1
 ; RV64I-NEXT:    and a1, a2, a1
 ; RV64I-NEXT:    lui a2, 209715
-; RV64I-NEXT:    addiw a2, a2, 819
+; RV64I-NEXT:    addi a2, a2, 819
 ; RV64I-NEXT:    sub a0, a0, a1
 ; RV64I-NEXT:    srliw a1, a0, 2
 ; RV64I-NEXT:    and a0, a0, a2
 ; RV64I-NEXT:    and a1, a1, a2
 ; RV64I-NEXT:    lui a2, 61681
-; RV64I-NEXT:    add a0, a1, a0
+; RV64I-NEXT:    addw a0, a1, a0
 ; RV64I-NEXT:    srli a1, a0, 4
-; RV64I-NEXT:    add a0, a1, a0
+; RV64I-NEXT:    addw a0, a1, a0
 ; RV64I-NEXT:    lui a1, 4112
 ; RV64I-NEXT:    addiw a2, a2, -241
 ; RV64I-NEXT:    and a0, a0, a2
@@ -1057,7 +1056,7 @@ define i32 @abs_i32(i32 %x) {
 ; RV64I-LABEL: abs_i32:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    sraiw a1, a0, 31
-; RV64I-NEXT:    add a0, a0, a1
+; RV64I-NEXT:    addw a0, a0, a1
 ; RV64I-NEXT:    xor a0, a0, a1
 ; RV64I-NEXT:    ret
 ;
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/scmp.ll b/llvm/test/CodeGen/RISCV/GlobalISel/scmp.ll
index 4346e04..daeb2e6 100644
--- a/llvm/test/CodeGen/RISCV/GlobalISel/scmp.ll
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/scmp.ll
@@ -97,7 +97,7 @@ define i32 @scmp.32.32(i32 %x, i32 %y) nounwind {
 ; RV64I-NEXT:    sext.w a1, a1
 ; RV64I-NEXT:    slt a2, a1, a0
 ; RV64I-NEXT:    slt a0, a0, a1
-; RV64I-NEXT:    sub a0, a2, a0
+; RV64I-NEXT:    subw a0, a2, a0
 ; RV64I-NEXT:    ret
   %1 = call i32 @llvm.scmp(i32 %x, i32 %y)
   ret i32 %1
@@ -122,7 +122,7 @@ define i32 @scmp.32.64(i64 %x, i64 %y) nounwind {
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    slt a2, a1, a0
 ; RV64I-NEXT:    slt a0, a0, a1
-; RV64I-NEXT:    sub a0, a2, a0
+; RV64I-NEXT:    subw a0, a2, a0
 ; RV64I-NEXT:    ret
   %1 = call i32 @llvm.scmp(i64 %x, i64 %y)
   ret i32 %1
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/ucmp.ll b/llvm/test/CodeGen/RISCV/GlobalISel/ucmp.ll
index 9784c58..463883b 100644
--- a/llvm/test/CodeGen/RISCV/GlobalISel/ucmp.ll
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/ucmp.ll
@@ -97,7 +97,7 @@ define i32 @ucmp.32.32(i32 %x, i32 %y) nounwind {
 ; RV64I-NEXT:    sext.w a1, a1
 ; RV64I-NEXT:    sltu a2, a1, a0
 ; RV64I-NEXT:    sltu a0, a0, a1
-; RV64I-NEXT:    sub a0, a2, a0
+; RV64I-NEXT:    subw a0, a2, a0
 ; RV64I-NEXT:    ret
   %1 = call i32 @llvm.ucmp(i32 %x, i32 %y)
   ret i32 %1
@@ -115,7 +115,7 @@ define i32 @ucmp.32.32_sext(i32 signext %x, i32 signext %y) nounwind {
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    sltu a2, a1, a0
 ; RV64I-NEXT:    sltu a0, a0, a1
-; RV64I-NEXT:    sub a0, a2, a0
+; RV64I-NEXT:    subw a0, a2, a0
 ; RV64I-NEXT:    ret
   %1 = call i32 @llvm.ucmp(i32 %x, i32 %y)
   ret i32 %1
@@ -135,7 +135,7 @@ define i32 @ucmp.32.32_zext(i32 zeroext %x, i32 zeroext %y) nounwind {
 ; RV64I-NEXT:    sext.w a1, a1
 ; RV64I-NEXT:    sltu a2, a1, a0
 ; RV64I-NEXT:    sltu a0, a0, a1
-; RV64I-NEXT:    sub a0, a2, a0
+; RV64I-NEXT:    subw a0, a2, a0
 ; RV64I-NEXT:    ret
   %1 = call i32 @llvm.ucmp(i32 %x, i32 %y)
   ret i32 %1
@@ -160,7 +160,7 @@ define i32 @ucmp.32.64(i64 %x, i64 %y) nounwind {
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    sltu a2, a1, a0
 ; RV64I-NEXT:    sltu a0, a0, a1
-; RV64I-NEXT:    sub a0, a2, a0
+; RV64I-NEXT:    subw a0, a2, a0
 ; RV64I-NEXT:    ret
   %1 = call i32 @llvm.ucmp(i64 %x, i64 %y)
   ret i32 %1
diff --git a/llvm/test/CodeGen/RISCV/add_sext_shl_constant.ll b/llvm/test/CodeGen/RISCV/add_sext_shl_constant.ll
index 47b6c07..d7f62ae 100644
--- a/llvm/test/CodeGen/RISCV/add_sext_shl_constant.ll
+++ b/llvm/test/CodeGen/RISCV/add_sext_shl_constant.ll
@@ -1,17 +1,28 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
-; RUN: llc -mtriple=riscv64 < %s | FileCheck -check-prefix=RV64 %s
+; RUN: llc -mtriple=riscv64 < %s | FileCheck -check-prefixes=RV64,NO-ZBA %s
+; RUN: llc -mtriple=riscv64 -mattr=+zba < %s | FileCheck -check-prefixes=RV64,ZBA %s
 
 define void @add_sext_shl_moreOneUse_add(ptr %array1, i32 %a, i32 %b) {
-; RV64-LABEL: add_sext_shl_moreOneUse_add:
-; RV64:       # %bb.0: # %entry
-; RV64-NEXT:    addi a3, a1, 5
-; RV64-NEXT:    sext.w a1, a1
-; RV64-NEXT:    slli a1, a1, 2
-; RV64-NEXT:    add a0, a1, a0
-; RV64-NEXT:    sw a2, 20(a0)
-; RV64-NEXT:    sw a2, 24(a0)
-; RV64-NEXT:    sw a3, 140(a0)
-; RV64-NEXT:    ret
+; NO-ZBA-LABEL: add_sext_shl_moreOneUse_add:
+; NO-ZBA:       # %bb.0: # %entry
+; NO-ZBA-NEXT:    addi a3, a1, 5
+; NO-ZBA-NEXT:    sext.w a1, a1
+; NO-ZBA-NEXT:    slli a1, a1, 2
+; NO-ZBA-NEXT:    add a0, a1, a0
+; NO-ZBA-NEXT:    sw a2, 20(a0)
+; NO-ZBA-NEXT:    sw a2, 24(a0)
+; NO-ZBA-NEXT:    sw a3, 140(a0)
+; NO-ZBA-NEXT:    ret
+;
+; ZBA-LABEL: add_sext_shl_moreOneUse_add:
+; ZBA:       # %bb.0: # %entry
+; ZBA-NEXT:    addi a3, a1, 5
+; ZBA-NEXT:    sext.w a1, a1
+; ZBA-NEXT:    sh2add a0, a1, a0
+; ZBA-NEXT:    sw a2, 20(a0)
+; ZBA-NEXT:    sw a2, 24(a0)
+; ZBA-NEXT:    sw a3, 140(a0)
+; ZBA-NEXT:    ret
 entry:
   %add = add nsw i32 %a, 5
   %idxprom = sext i32 %add to i64
@@ -29,19 +40,32 @@ entry:
 }
 
 define void @add_sext_shl_moreOneUse_addexceedsign12(ptr %array1, i32 %a, i32 %b) {
-; RV64-LABEL: add_sext_shl_moreOneUse_addexceedsign12:
-; RV64:       # %bb.0: # %entry
-; RV64-NEXT:    addi a3, a1, 2047
-; RV64-NEXT:    lui a4, 2
-; RV64-NEXT:    sext.w a1, a1
-; RV64-NEXT:    addi a3, a3, 1
-; RV64-NEXT:    slli a1, a1, 2
-; RV64-NEXT:    add a0, a0, a4
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    sw a2, 0(a0)
-; RV64-NEXT:    sw a3, 4(a0)
-; RV64-NEXT:    sw a2, 120(a0)
-; RV64-NEXT:    ret
+; NO-ZBA-LABEL: add_sext_shl_moreOneUse_addexceedsign12:
+; NO-ZBA:       # %bb.0: # %entry
+; NO-ZBA-NEXT:    addi a3, a1, 2047
+; NO-ZBA-NEXT:    lui a4, 2
+; NO-ZBA-NEXT:    sext.w a1, a1
+; NO-ZBA-NEXT:    addi a3, a3, 1
+; NO-ZBA-NEXT:    slli a1, a1, 2
+; NO-ZBA-NEXT:    add a0, a0, a4
+; NO-ZBA-NEXT:    add a0, a0, a1
+; NO-ZBA-NEXT:    sw a2, 0(a0)
+; NO-ZBA-NEXT:    sw a3, 4(a0)
+; NO-ZBA-NEXT:    sw a2, 120(a0)
+; NO-ZBA-NEXT:    ret
+;
+; ZBA-LABEL: add_sext_shl_moreOneUse_addexceedsign12:
+; ZBA:       # %bb.0: # %entry
+; ZBA-NEXT:    addi a3, a1, 2047
+; ZBA-NEXT:    lui a4, 2
+; ZBA-NEXT:    sext.w a1, a1
+; ZBA-NEXT:    addi a3, a3, 1
+; ZBA-NEXT:    sh2add a0, a1, a0
+; ZBA-NEXT:    add a0, a0, a4
+; ZBA-NEXT:    sw a2, 0(a0)
+; ZBA-NEXT:    sw a3, 4(a0)
+; ZBA-NEXT:    sw a2, 120(a0)
+; ZBA-NEXT:    ret
 entry:
   %add = add nsw i32 %a, 2048
   %idxprom = sext i32 %add to i64
@@ -57,16 +81,26 @@ entry:
 }
 
 define void @add_sext_shl_moreOneUse_sext(ptr %array1, i32 %a, i32 %b) {
-; RV64-LABEL: add_sext_shl_moreOneUse_sext:
-; RV64:       # %bb.0: # %entry
-; RV64-NEXT:    sext.w a1, a1
-; RV64-NEXT:    addi a3, a1, 5
-; RV64-NEXT:    slli a1, a1, 2
-; RV64-NEXT:    add a0, a1, a0
-; RV64-NEXT:    sw a2, 20(a0)
-; RV64-NEXT:    sw a2, 24(a0)
-; RV64-NEXT:    sd a3, 140(a0)
-; RV64-NEXT:    ret
+; NO-ZBA-LABEL: add_sext_shl_moreOneUse_sext:
+; NO-ZBA:       # %bb.0: # %entry
+; NO-ZBA-NEXT:    sext.w a1, a1
+; NO-ZBA-NEXT:    addi a3, a1, 5
+; NO-ZBA-NEXT:    slli a1, a1, 2
+; NO-ZBA-NEXT:    add a0, a1, a0
+; NO-ZBA-NEXT:    sw a2, 20(a0)
+; NO-ZBA-NEXT:    sw a2, 24(a0)
+; NO-ZBA-NEXT:    sd a3, 140(a0)
+; NO-ZBA-NEXT:    ret
+;
+; ZBA-LABEL: add_sext_shl_moreOneUse_sext:
+; ZBA:       # %bb.0: # %entry
+; ZBA-NEXT:    sext.w a1, a1
+; ZBA-NEXT:    addi a3, a1, 5
+; ZBA-NEXT:    sh2add a0, a1, a0
+; ZBA-NEXT:    sw a2, 20(a0)
+; ZBA-NEXT:    sw a2, 24(a0)
+; ZBA-NEXT:    sd a3, 140(a0)
+; ZBA-NEXT:    ret
 entry:
   %add = add nsw i32 %a, 5
   %idxprom = sext i32 %add to i64
@@ -85,20 +119,34 @@ entry:
 
 ; test of jumpping, find add's operand has one more use can simplified
 define void @add_sext_shl_moreOneUse_add_inSelect(ptr %array1, i32 signext  %a, i32 %b, i32 signext %x) {
-; RV64-LABEL: add_sext_shl_moreOneUse_add_inSelect:
-; RV64:       # %bb.0: # %entry
-; RV64-NEXT:    addi a4, a1, 5
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    bgtz a3, .LBB3_2
-; RV64-NEXT:  # %bb.1: # %entry
-; RV64-NEXT:    mv a5, a2
-; RV64-NEXT:  .LBB3_2: # %entry
-; RV64-NEXT:    slli a1, a1, 2
-; RV64-NEXT:    add a0, a1, a0
-; RV64-NEXT:    sw a5, 20(a0)
-; RV64-NEXT:    sw a5, 24(a0)
-; RV64-NEXT:    sw a4, 140(a0)
-; RV64-NEXT:    ret
+; NO-ZBA-LABEL: add_sext_shl_moreOneUse_add_inSelect:
+; NO-ZBA:       # %bb.0: # %entry
+; NO-ZBA-NEXT:    addi a4, a1, 5
+; NO-ZBA-NEXT:    mv a5, a4
+; NO-ZBA-NEXT:    bgtz a3, .LBB3_2
+; NO-ZBA-NEXT:  # %bb.1: # %entry
+; NO-ZBA-NEXT:    mv a5, a2
+; NO-ZBA-NEXT:  .LBB3_2: # %entry
+; NO-ZBA-NEXT:    slli a1, a1, 2
+; NO-ZBA-NEXT:    add a0, a1, a0
+; NO-ZBA-NEXT:    sw a5, 20(a0)
+; NO-ZBA-NEXT:    sw a5, 24(a0)
+; NO-ZBA-NEXT:    sw a4, 140(a0)
+; NO-ZBA-NEXT:    ret
+;
+; ZBA-LABEL: add_sext_shl_moreOneUse_add_inSelect:
+; ZBA:       # %bb.0: # %entry
+; ZBA-NEXT:    addi a4, a1, 5
+; ZBA-NEXT:    mv a5, a4
+; ZBA-NEXT:    bgtz a3, .LBB3_2
+; ZBA-NEXT:  # %bb.1: # %entry
+; ZBA-NEXT:    mv a5, a2
+; ZBA-NEXT:  .LBB3_2: # %entry
+; ZBA-NEXT:    sh2add a0, a1, a0
+; ZBA-NEXT:    sw a5, 20(a0)
+; ZBA-NEXT:    sw a5, 24(a0)
+; ZBA-NEXT:    sw a4, 140(a0)
+; ZBA-NEXT:    ret
 entry:
   %add = add nsw i32 %a, 5
   %cmp = icmp sgt i32 %x, 0
@@ -118,23 +166,40 @@ entry:
 }
 
 define void @add_sext_shl_moreOneUse_add_inSelect_addexceedsign12(ptr %array1, i32 signext  %a, i32 %b, i32 signext %x) {
-; RV64-LABEL: add_sext_shl_moreOneUse_add_inSelect_addexceedsign12:
-; RV64:       # %bb.0: # %entry
-; RV64-NEXT:    addi a4, a1, 2047
-; RV64-NEXT:    lui a5, 2
-; RV64-NEXT:    slli a6, a1, 2
-; RV64-NEXT:    addi a1, a4, 1
-; RV64-NEXT:    add a0, a0, a6
-; RV64-NEXT:    add a0, a0, a5
-; RV64-NEXT:    mv a4, a1
-; RV64-NEXT:    bgtz a3, .LBB4_2
-; RV64-NEXT:  # %bb.1: # %entry
-; RV64-NEXT:    mv a4, a2
-; RV64-NEXT:  .LBB4_2: # %entry
-; RV64-NEXT:    sw a4, 0(a0)
-; RV64-NEXT:    sw a4, 4(a0)
-; RV64-NEXT:    sw a1, 120(a0)
-; RV64-NEXT:    ret
+; NO-ZBA-LABEL: add_sext_shl_moreOneUse_add_inSelect_addexceedsign12:
+; NO-ZBA:       # %bb.0: # %entry
+; NO-ZBA-NEXT:    addi a4, a1, 2047
+; NO-ZBA-NEXT:    lui a5, 2
+; NO-ZBA-NEXT:    slli a6, a1, 2
+; NO-ZBA-NEXT:    addi a1, a4, 1
+; NO-ZBA-NEXT:    add a0, a0, a6
+; NO-ZBA-NEXT:    add a0, a0, a5
+; NO-ZBA-NEXT:    mv a4, a1
+; NO-ZBA-NEXT:    bgtz a3, .LBB4_2
+; NO-ZBA-NEXT:  # %bb.1: # %entry
+; NO-ZBA-NEXT:    mv a4, a2
+; NO-ZBA-NEXT:  .LBB4_2: # %entry
+; NO-ZBA-NEXT:    sw a4, 0(a0)
+; NO-ZBA-NEXT:    sw a4, 4(a0)
+; NO-ZBA-NEXT:    sw a1, 120(a0)
+; NO-ZBA-NEXT:    ret
+;
+; ZBA-LABEL: add_sext_shl_moreOneUse_add_inSelect_addexceedsign12:
+; ZBA:       # %bb.0: # %entry
+; ZBA-NEXT:    addi a4, a1, 2047
+; ZBA-NEXT:    lui a5, 2
+; ZBA-NEXT:    addi a4, a4, 1
+; ZBA-NEXT:    sh2add a0, a1, a0
+; ZBA-NEXT:    add a0, a0, a5
+; ZBA-NEXT:    mv a1, a4
+; ZBA-NEXT:    bgtz a3, .LBB4_2
+; ZBA-NEXT:  # %bb.1: # %entry
+; ZBA-NEXT:    mv a1, a2
+; ZBA-NEXT:  .LBB4_2: # %entry
+; ZBA-NEXT:    sw a1, 0(a0)
+; ZBA-NEXT:    sw a1, 4(a0)
+; ZBA-NEXT:    sw a4, 120(a0)
+; ZBA-NEXT:    ret
 entry:
   %add = add nsw i32 %a, 2048
   %cmp = icmp sgt i32 %x, 0
@@ -152,20 +217,34 @@ entry:
 }
 
 define void @add_shl_moreOneUse_inSelect(ptr %array1, i64 %a, i64 %b, i64 %x) {
-; RV64-LABEL: add_shl_moreOneUse_inSelect:
-; RV64:       # %bb.0: # %entry
-; RV64-NEXT:    addi a4, a1, 5
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    bgtz a3, .LBB5_2
-; RV64-NEXT:  # %bb.1: # %entry
-; RV64-NEXT:    mv a5, a2
-; RV64-NEXT:  .LBB5_2: # %entry
-; RV64-NEXT:    slli a1, a1, 3
-; RV64-NEXT:    add a0, a1, a0
-; RV64-NEXT:    sd a5, 40(a0)
-; RV64-NEXT:    sd a5, 48(a0)
-; RV64-NEXT:    sd a4, 280(a0)
-; RV64-NEXT:    ret
+; NO-ZBA-LABEL: add_shl_moreOneUse_inSelect:
+; NO-ZBA:       # %bb.0: # %entry
+; NO-ZBA-NEXT:    addi a4, a1, 5
+; NO-ZBA-NEXT:    mv a5, a4
+; NO-ZBA-NEXT:    bgtz a3, .LBB5_2
+; NO-ZBA-NEXT:  # %bb.1: # %entry
+; NO-ZBA-NEXT:    mv a5, a2
+; NO-ZBA-NEXT:  .LBB5_2: # %entry
+; NO-ZBA-NEXT:    slli a1, a1, 3
+; NO-ZBA-NEXT:    add a0, a1, a0
+; NO-ZBA-NEXT:    sd a5, 40(a0)
+; NO-ZBA-NEXT:    sd a5, 48(a0)
+; NO-ZBA-NEXT:    sd a4, 280(a0)
+; NO-ZBA-NEXT:    ret
+;
+; ZBA-LABEL: add_shl_moreOneUse_inSelect:
+; ZBA:       # %bb.0: # %entry
+; ZBA-NEXT:    addi a4, a1, 5
+; ZBA-NEXT:    mv a5, a4
+; ZBA-NEXT:    bgtz a3, .LBB5_2
+; ZBA-NEXT:  # %bb.1: # %entry
+; ZBA-NEXT:    mv a5, a2
+; ZBA-NEXT:  .LBB5_2: # %entry
+; ZBA-NEXT:    sh3add a0, a1, a0
+; ZBA-NEXT:    sd a5, 40(a0)
+; ZBA-NEXT:    sd a5, 48(a0)
+; ZBA-NEXT:    sd a4, 280(a0)
+; ZBA-NEXT:    ret
 entry:
   %add = add nsw i64 %a, 5
   %cmp = icmp sgt i64 %x, 0
@@ -180,3 +259,103 @@ entry:
   store i64 %add, ptr %arrayidx6
   ret void
 }
+
+define i64 @add_shl_moreOneUse_sh1add(i64 %x) {
+; NO-ZBA-LABEL: add_shl_moreOneUse_sh1add:
+; NO-ZBA:       # %bb.0:
+; NO-ZBA-NEXT:    ori a1, a0, 1
+; NO-ZBA-NEXT:    slli a0, a0, 1
+; NO-ZBA-NEXT:    ori a0, a0, 2
+; NO-ZBA-NEXT:    add a0, a0, a1
+; NO-ZBA-NEXT:    ret
+;
+; ZBA-LABEL: add_shl_moreOneUse_sh1add:
+; ZBA:       # %bb.0:
+; ZBA-NEXT:    ori a0, a0, 1
+; ZBA-NEXT:    sh1add a0, a0, a0
+; ZBA-NEXT:    ret
+  %or = or i64 %x, 1
+  %mul = shl i64 %or, 1
+  %add = add i64 %mul, %or
+  ret i64 %add
+}
+
+define i64 @add_shl_moreOneUse_sh2add(i64 %x) {
+; NO-ZBA-LABEL: add_shl_moreOneUse_sh2add:
+; NO-ZBA:       # %bb.0:
+; NO-ZBA-NEXT:    ori a1, a0, 1
+; NO-ZBA-NEXT:    slli a0, a0, 2
+; NO-ZBA-NEXT:    ori a0, a0, 4
+; NO-ZBA-NEXT:    add a0, a0, a1
+; NO-ZBA-NEXT:    ret
+;
+; ZBA-LABEL: add_shl_moreOneUse_sh2add:
+; ZBA:       # %bb.0:
+; ZBA-NEXT:    ori a0, a0, 1
+; ZBA-NEXT:    sh2add a0, a0, a0
+; ZBA-NEXT:    ret
+  %or = or i64 %x, 1
+  %mul = shl i64 %or, 2
+  %add = add i64 %mul, %or
+  ret i64 %add
+}
+
+define i64 @add_shl_moreOneUse_sh3add(i64 %x) {
+; NO-ZBA-LABEL: add_shl_moreOneUse_sh3add:
+; NO-ZBA:       # %bb.0:
+; NO-ZBA-NEXT:    ori a1, a0, 1
+; NO-ZBA-NEXT:    slli a0, a0, 3
+; NO-ZBA-NEXT:    ori a0, a0, 8
+; NO-ZBA-NEXT:    add a0, a0, a1
+; NO-ZBA-NEXT:    ret
+;
+; ZBA-LABEL: add_shl_moreOneUse_sh3add:
+; ZBA:       # %bb.0:
+; ZBA-NEXT:    ori a0, a0, 1
+; ZBA-NEXT:    sh3add a0, a0, a0
+; ZBA-NEXT:    ret
+  %or = or i64 %x, 1
+  %mul = shl i64 %or, 3
+  %add = add i64 %mul, %or
+  ret i64 %add
+}
+
+;; Covers a case which previously crashed (pr119527)
+define i64 @add_shl_sext(i32 %1) {
+; RV64-LABEL: add_shl_sext:
+; RV64:       # %bb.0:
+; RV64-NEXT:    addi a1, a0, 3
+; RV64-NEXT:    sllw a0, a1, a0
+; RV64-NEXT:    ret
+  %3 = add i32 %1, 3
+  %4 = shl i32 %3, %1
+  %5 = sext i32 %4 to i64
+  ret i64 %5
+}
+
+define i64 @add_shl_moreOneUse_sh4add(i64 %x) {
+; RV64-LABEL: add_shl_moreOneUse_sh4add:
+; RV64:       # %bb.0:
+; RV64-NEXT:    ori a1, a0, 1
+; RV64-NEXT:    slli a0, a0, 4
+; RV64-NEXT:    ori a0, a0, 16
+; RV64-NEXT:    add a0, a0, a1
+; RV64-NEXT:    ret
+  %or = or i64 %x, 1
+  %mul = shl i64 %or, 4
+  %add = add i64 %mul, %or
+  ret i64 %add
+}
+
+define i64 @add_shl_rhs_constant(i64 %x, i64 %y) {
+; RV64-LABEL: add_shl_rhs_constant:
+; RV64:       # %bb.0:
+; RV64-NEXT:    add a0, a0, a1
+; RV64-NEXT:    slli a0, a0, 3
+; RV64-NEXT:    ret
+  %a = add i64 %x, 1
+  %b = add i64 %y, %a
+  %c = shl i64 %b, 3
+  %d = add i64 %c, -8
+  ret i64 %d
+}
diff --git a/llvm/test/CodeGen/RISCV/add_shl_constant.ll b/llvm/test/CodeGen/RISCV/add_shl_constant.ll
index 71b6186..a4da9e2 100644
--- a/llvm/test/CodeGen/RISCV/add_shl_constant.ll
+++ b/llvm/test/CodeGen/RISCV/add_shl_constant.ll
@@ -1,13 +1,20 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -mtriple=riscv32  < %s | FileCheck -check-prefix=RV32 %s
+; RUN: llc -mtriple=riscv32  < %s | FileCheck -check-prefixes=RV32,NO-ZBA %s
+; RUN: llc -mtriple=riscv32 -mattr=+zba  < %s | FileCheck -check-prefixes=RV32,ZBA %s
 
 define i32 @add_shl_oneUse(i32 %x, i32 %y) nounwind {
-; RV32-LABEL: add_shl_oneUse:
-; RV32:       # %bb.0:
-; RV32-NEXT:    slli a0, a0, 3
-; RV32-NEXT:    add a0, a0, a1
-; RV32-NEXT:    addi a0, a0, 984
-; RV32-NEXT:    ret
+; NO-ZBA-LABEL: add_shl_oneUse:
+; NO-ZBA:       # %bb.0:
+; NO-ZBA-NEXT:    slli a0, a0, 3
+; NO-ZBA-NEXT:    add a0, a0, a1
+; NO-ZBA-NEXT:    addi a0, a0, 984
+; NO-ZBA-NEXT:    ret
+;
+; ZBA-LABEL: add_shl_oneUse:
+; ZBA:       # %bb.0:
+; ZBA-NEXT:    addi a0, a0, 123
+; ZBA-NEXT:    sh3add a0, a0, a1
+; ZBA-NEXT:    ret
   %add.0 = add i32 %x, 123
   %shl = shl i32 %add.0, 3
   %add.1 = add i32 %shl, %y
@@ -15,15 +22,24 @@ define i32 @add_shl_oneUse(i32 %x, i32 %y) nounwind {
 }
 
 define void @add_shl_moreOneUse_inStore(ptr %array1, i32 %a, i32 %b)  {
-; RV32-LABEL: add_shl_moreOneUse_inStore:
-; RV32:       # %bb.0: # %entry
-; RV32-NEXT:    addi a3, a1, 5
-; RV32-NEXT:    slli a1, a1, 2
-; RV32-NEXT:    add a0, a0, a1
-; RV32-NEXT:    sw a2, 20(a0)
-; RV32-NEXT:    sw a2, 24(a0)
-; RV32-NEXT:    sw a3, 140(a0)
-; RV32-NEXT:    ret
+; NO-ZBA-LABEL: add_shl_moreOneUse_inStore:
+; NO-ZBA:       # %bb.0: # %entry
+; NO-ZBA-NEXT:    addi a3, a1, 5
+; NO-ZBA-NEXT:    slli a1, a1, 2
+; NO-ZBA-NEXT:    add a0, a0, a1
+; NO-ZBA-NEXT:    sw a2, 20(a0)
+; NO-ZBA-NEXT:    sw a2, 24(a0)
+; NO-ZBA-NEXT:    sw a3, 140(a0)
+; NO-ZBA-NEXT:    ret
+;
+; ZBA-LABEL: add_shl_moreOneUse_inStore:
+; ZBA:       # %bb.0: # %entry
+; ZBA-NEXT:    addi a3, a1, 5
+; ZBA-NEXT:    sh2add a0, a1, a0
+; ZBA-NEXT:    sw a2, 20(a0)
+; ZBA-NEXT:    sw a2, 24(a0)
+; ZBA-NEXT:    sw a3, 140(a0)
+; ZBA-NEXT:    ret
 entry:
   %add = add nsw i32 %a, 5
   %arrayidx = getelementptr inbounds i32, ptr %array1, i32 %add
@@ -37,18 +53,30 @@ entry:
 }
 
 define void @add_shl_moreOneUse_inStore_addexceedsign12(ptr %array1, i32 %a, i32 %b)  {
-; RV32-LABEL: add_shl_moreOneUse_inStore_addexceedsign12:
-; RV32:       # %bb.0: # %entry
-; RV32-NEXT:    addi a3, a1, 2047
-; RV32-NEXT:    lui a4, 2
-; RV32-NEXT:    slli a1, a1, 2
-; RV32-NEXT:    addi a3, a3, 1
-; RV32-NEXT:    add a0, a0, a1
-; RV32-NEXT:    add a0, a0, a4
-; RV32-NEXT:    sw a2, 0(a0)
-; RV32-NEXT:    sw a3, 4(a0)
-; RV32-NEXT:    sw a2, 120(a0)
-; RV32-NEXT:    ret
+; NO-ZBA-LABEL: add_shl_moreOneUse_inStore_addexceedsign12:
+; NO-ZBA:       # %bb.0: # %entry
+; NO-ZBA-NEXT:    addi a3, a1, 2047
+; NO-ZBA-NEXT:    lui a4, 2
+; NO-ZBA-NEXT:    slli a1, a1, 2
+; NO-ZBA-NEXT:    addi a3, a3, 1
+; NO-ZBA-NEXT:    add a0, a0, a1
+; NO-ZBA-NEXT:    add a0, a0, a4
+; NO-ZBA-NEXT:    sw a2, 0(a0)
+; NO-ZBA-NEXT:    sw a3, 4(a0)
+; NO-ZBA-NEXT:    sw a2, 120(a0)
+; NO-ZBA-NEXT:    ret
+;
+; ZBA-LABEL: add_shl_moreOneUse_inStore_addexceedsign12:
+; ZBA:       # %bb.0: # %entry
+; ZBA-NEXT:    addi a3, a1, 2047
+; ZBA-NEXT:    lui a4, 2
+; ZBA-NEXT:    sh2add a0, a1, a0
+; ZBA-NEXT:    addi a3, a3, 1
+; ZBA-NEXT:    add a0, a0, a4
+; ZBA-NEXT:    sw a2, 0(a0)
+; ZBA-NEXT:    sw a3, 4(a0)
+; ZBA-NEXT:    sw a2, 120(a0)
+; ZBA-NEXT:    ret
 entry:
   %add = add nsw i32 %a, 2048
   %arrayidx = getelementptr inbounds i32, ptr %array1, i32 %add
@@ -62,20 +90,34 @@ entry:
 }
 
 define void @add_shl_moreOneUse_inSelect(ptr %array1, i32 %a, i32 %b, i32 %x) {
-; RV32-LABEL: add_shl_moreOneUse_inSelect:
-; RV32:       # %bb.0: # %entry
-; RV32-NEXT:    addi a4, a1, 5
-; RV32-NEXT:    mv a5, a4
-; RV32-NEXT:    bgtz a3, .LBB3_2
-; RV32-NEXT:  # %bb.1: # %entry
-; RV32-NEXT:    mv a5, a2
-; RV32-NEXT:  .LBB3_2: # %entry
-; RV32-NEXT:    slli a1, a1, 2
-; RV32-NEXT:    add a0, a0, a1
-; RV32-NEXT:    sw a5, 20(a0)
-; RV32-NEXT:    sw a5, 24(a0)
-; RV32-NEXT:    sw a4, 140(a0)
-; RV32-NEXT:    ret
+; NO-ZBA-LABEL: add_shl_moreOneUse_inSelect:
+; NO-ZBA:       # %bb.0: # %entry
+; NO-ZBA-NEXT:    addi a4, a1, 5
+; NO-ZBA-NEXT:    mv a5, a4
+; NO-ZBA-NEXT:    bgtz a3, .LBB3_2
+; NO-ZBA-NEXT:  # %bb.1: # %entry
+; NO-ZBA-NEXT:    mv a5, a2
+; NO-ZBA-NEXT:  .LBB3_2: # %entry
+; NO-ZBA-NEXT:    slli a1, a1, 2
+; NO-ZBA-NEXT:    add a0, a0, a1
+; NO-ZBA-NEXT:    sw a5, 20(a0)
+; NO-ZBA-NEXT:    sw a5, 24(a0)
+; NO-ZBA-NEXT:    sw a4, 140(a0)
+; NO-ZBA-NEXT:    ret
+;
+; ZBA-LABEL: add_shl_moreOneUse_inSelect:
+; ZBA:       # %bb.0: # %entry
+; ZBA-NEXT:    addi a4, a1, 5
+; ZBA-NEXT:    mv a5, a4
+; ZBA-NEXT:    bgtz a3, .LBB3_2
+; ZBA-NEXT:  # %bb.1: # %entry
+; ZBA-NEXT:    mv a5, a2
+; ZBA-NEXT:  .LBB3_2: # %entry
+; ZBA-NEXT:    sh2add a0, a1, a0
+; ZBA-NEXT:    sw a5, 20(a0)
+; ZBA-NEXT:    sw a5, 24(a0)
+; ZBA-NEXT:    sw a4, 140(a0)
+; ZBA-NEXT:    ret
 entry:
   %add = add nsw i32 %a, 5
   %cmp = icmp sgt i32 %x, 0
@@ -91,23 +133,40 @@ entry:
 }
 
 define void @add_shl_moreOneUse_inSelect_addexceedsign12(ptr %array1, i32 %a, i32 %b, i32 %x) {
-; RV32-LABEL: add_shl_moreOneUse_inSelect_addexceedsign12:
-; RV32:       # %bb.0: # %entry
-; RV32-NEXT:    addi a4, a1, 2047
-; RV32-NEXT:    addi a4, a4, 1
-; RV32-NEXT:    mv a5, a4
-; RV32-NEXT:    bgtz a3, .LBB4_2
-; RV32-NEXT:  # %bb.1: # %entry
-; RV32-NEXT:    mv a5, a2
-; RV32-NEXT:  .LBB4_2: # %entry
-; RV32-NEXT:    lui a2, 2
-; RV32-NEXT:    slli a1, a1, 2
-; RV32-NEXT:    add a0, a0, a1
-; RV32-NEXT:    add a0, a0, a2
-; RV32-NEXT:    sw a5, 0(a0)
-; RV32-NEXT:    sw a5, 4(a0)
-; RV32-NEXT:    sw a4, 120(a0)
-; RV32-NEXT:    ret
+; NO-ZBA-LABEL: add_shl_moreOneUse_inSelect_addexceedsign12:
+; NO-ZBA:       # %bb.0: # %entry
+; NO-ZBA-NEXT:    addi a4, a1, 2047
+; NO-ZBA-NEXT:    addi a4, a4, 1
+; NO-ZBA-NEXT:    mv a5, a4
+; NO-ZBA-NEXT:    bgtz a3, .LBB4_2
+; NO-ZBA-NEXT:  # %bb.1: # %entry
+; NO-ZBA-NEXT:    mv a5, a2
+; NO-ZBA-NEXT:  .LBB4_2: # %entry
+; NO-ZBA-NEXT:    lui a2, 2
+; NO-ZBA-NEXT:    slli a1, a1, 2
+; NO-ZBA-NEXT:    add a0, a0, a1
+; NO-ZBA-NEXT:    add a0, a0, a2
+; NO-ZBA-NEXT:    sw a5, 0(a0)
+; NO-ZBA-NEXT:    sw a5, 4(a0)
+; NO-ZBA-NEXT:    sw a4, 120(a0)
+; NO-ZBA-NEXT:    ret
+;
+; ZBA-LABEL: add_shl_moreOneUse_inSelect_addexceedsign12:
+; ZBA:       # %bb.0: # %entry
+; ZBA-NEXT:    addi a4, a1, 2047
+; ZBA-NEXT:    addi a4, a4, 1
+; ZBA-NEXT:    mv a5, a4
+; ZBA-NEXT:    bgtz a3, .LBB4_2
+; ZBA-NEXT:  # %bb.1: # %entry
+; ZBA-NEXT:    mv a5, a2
+; ZBA-NEXT:  .LBB4_2: # %entry
+; ZBA-NEXT:    lui a2, 2
+; ZBA-NEXT:    sh2add a0, a1, a0
+; ZBA-NEXT:    add a0, a0, a2
+; ZBA-NEXT:    sw a5, 0(a0)
+; ZBA-NEXT:    sw a5, 4(a0)
+; ZBA-NEXT:    sw a4, 120(a0)
+; ZBA-NEXT:    ret
 entry:
   %add = add nsw i32 %a, 2048
   %cmp = icmp sgt i32 %x, 0
@@ -121,3 +180,77 @@ entry:
   store i32 %add, ptr %arrayidx6
   ret void
 }
+
+define i32 @add_shl_moreOneUse_sh1add(i32 %x) {
+; NO-ZBA-LABEL: add_shl_moreOneUse_sh1add:
+; NO-ZBA:       # %bb.0:
+; NO-ZBA-NEXT:    ori a1, a0, 1
+; NO-ZBA-NEXT:    slli a0, a0, 1
+; NO-ZBA-NEXT:    ori a0, a0, 2
+; NO-ZBA-NEXT:    add a0, a0, a1
+; NO-ZBA-NEXT:    ret
+;
+; ZBA-LABEL: add_shl_moreOneUse_sh1add:
+; ZBA:       # %bb.0:
+; ZBA-NEXT:    ori a0, a0, 1
+; ZBA-NEXT:    sh1add a0, a0, a0
+; ZBA-NEXT:    ret
+  %or = or i32 %x, 1
+  %mul = shl i32 %or, 1
+  %add = add i32 %mul, %or
+  ret i32 %add
+}
+
+define i32 @add_shl_moreOneUse_sh2add(i32 %x) {
+; NO-ZBA-LABEL: add_shl_moreOneUse_sh2add:
+; NO-ZBA:       # %bb.0:
+; NO-ZBA-NEXT:    ori a1, a0, 1
+; NO-ZBA-NEXT:    slli a0, a0, 2
+; NO-ZBA-NEXT:    ori a0, a0, 4
+; NO-ZBA-NEXT:    add a0, a0, a1
+; NO-ZBA-NEXT:    ret
+;
+; ZBA-LABEL: add_shl_moreOneUse_sh2add:
+; ZBA:       # %bb.0:
+; ZBA-NEXT:    ori a0, a0, 1
+; ZBA-NEXT:    sh2add a0, a0, a0
+; ZBA-NEXT:    ret
+  %or = or i32 %x, 1
+  %mul = shl i32 %or, 2
+  %add = add i32 %mul, %or
+  ret i32 %add
+}
+
+define i32 @add_shl_moreOneUse_sh3add(i32 %x) {
+; NO-ZBA-LABEL: add_shl_moreOneUse_sh3add:
+; NO-ZBA:       # %bb.0:
+; NO-ZBA-NEXT:    ori a1, a0, 1
+; NO-ZBA-NEXT:    slli a0, a0, 3
+; NO-ZBA-NEXT:    ori a0, a0, 8
+; NO-ZBA-NEXT:    add a0, a0, a1
+; NO-ZBA-NEXT:    ret
+;
+; ZBA-LABEL: add_shl_moreOneUse_sh3add:
+; ZBA:       # %bb.0:
+; ZBA-NEXT:    ori a0, a0, 1
+; ZBA-NEXT:    sh3add a0, a0, a0
+; ZBA-NEXT:    ret
+  %or = or i32 %x, 1
+  %mul = shl i32 %or, 3
+  %add = add i32 %mul, %or
+  ret i32 %add
+}
+
+define i32 @add_shl_moreOneUse_sh4add(i32 %x) {
+; RV32-LABEL: add_shl_moreOneUse_sh4add:
+; RV32:       # %bb.0:
+; RV32-NEXT:    ori a1, a0, 1
+; RV32-NEXT:    slli a0, a0, 4
+; RV32-NEXT:    ori a0, a0, 16
+; RV32-NEXT:    add a0, a0, a1
+; RV32-NEXT:    ret
+  %or = or i32 %x, 1
+  %mul = shl i32 %or, 4
+  %add = add i32 %mul, %or
+  ret i32 %add
+}
diff --git a/llvm/test/CodeGen/RISCV/attributes.ll b/llvm/test/CodeGen/RISCV/attributes.ll
index f63bc94..c0fcc6f 100644
--- a/llvm/test/CodeGen/RISCV/attributes.ll
+++ b/llvm/test/CodeGen/RISCV/attributes.ll
@@ -82,6 +82,9 @@
 ; RUN: llc -mtriple=riscv32 -mattr=+xtheadsync %s -o - | FileCheck --check-prefix=RV32XTHEADSYNC %s
 ; RUN: llc -mtriple=riscv32 -mattr=+xwchc %s -o - | FileCheck --check-prefix=RV32XWCHC %s
 ; RUN: llc -mtriple=riscv32 -mattr=+experimental-xqcia %s -o - | FileCheck --check-prefix=RV32XQCIA %s
+; RUN: llc -mtriple=riscv32 -mattr=+experimental-xqciac %s -o - | FileCheck --check-prefix=RV32XQCIAC %s
+; RUN: llc -mtriple=riscv32 -mattr=+experimental-xqcicli %s -o - | FileCheck --check-prefix=RV32XQCICLI %s
+; RUN: llc -mtriple=riscv32 -mattr=+experimental-xqcicm %s -o - | FileCheck --check-prefix=RV32XQCICM %s
 ; RUN: llc -mtriple=riscv32 -mattr=+experimental-xqcics %s -o - | FileCheck --check-prefix=RV32XQCICS %s
 ; RUN: llc -mtriple=riscv32 -mattr=+experimental-xqcicsr %s -o - | FileCheck --check-prefix=RV32XQCICSR %s
 ; RUN: llc -mtriple=riscv32 -mattr=+experimental-xqcilsm %s -o - | FileCheck --check-prefix=RV32XQCILSM %s
@@ -294,6 +297,8 @@
 ; RUN: llc -mtriple=riscv64 -mattr=+supm %s -o - | FileCheck --check-prefix=RV64SUPM %s
 ; RUN: llc -mtriple=riscv64 -mattr=+experimental-smctr  %s -o - | FileCheck --check-prefix=RV64SMCTR %s
 ; RUN: llc -mtriple=riscv64 -mattr=+experimental-ssctr  %s -o - | FileCheck --check-prefix=RV64SSCTR %s
+; RUN: llc -mtriple=riscv64 -mattr=+experimental-sdext  %s -o - | FileCheck --check-prefix=RV64SDEXT %s
+; RUN: llc -mtriple=riscv64 -mattr=+experimental-sdtrig  %s -o - | FileCheck --check-prefix=RV64SDTRIG %s
 
 ; Tests for profile features.
 ; RUN: llc -mtriple=riscv32 -mattr=+rvi20u32 %s -o - | FileCheck --check-prefix=RVI20U32 %s
@@ -391,6 +396,9 @@
 ; RV32XTHEADSYNC: .attribute 5, "rv32i2p1_xtheadsync1p0"
 ; RV32XWCHC: .attribute 5, "rv32i2p1_xwchc2p2"
 ; RV32XQCIA: .attribute 5, "rv32i2p1_xqcia0p2"
+; RV32XQCIAC: .attribute 5, "rv32i2p1_zca1p0_xqciac0p2"
+; RV32XQCICLI: .attribute 5, "rv32i2p1_xqcicli0p2"
+; RV32XQCICM: .attribute 5, "rv32i2p1_zca1p0_xqcicm0p2"
 ; RV32XQCICS: .attribute 5, "rv32i2p1_xqcics0p2"
 ; RV32XQCICSR: .attribute 5, "rv32i2p1_xqcicsr0p2"
 ; RV32XQCILSM: .attribute 5, "rv32i2p1_xqcilsm0p2"
@@ -601,6 +609,8 @@
 ; RV64SUPM: .attribute 5, "rv64i2p1_supm1p0"
 ; RV64SMCTR: .attribute 5, "rv64i2p1_smctr1p0_sscsrind1p0"
 ; RV64SSCTR: .attribute 5, "rv64i2p1_sscsrind1p0_ssctr1p0"
+; RV64SDEXT: .attribute 5, "rv64i2p1_sdext1p0"
+; RV64SDTRIG: .attribute 5, "rv64i2p1_sdtrig1p0"
 
 ; RVI20U32: .attribute 5, "rv32i2p1"
 ; RVI20U64: .attribute 5, "rv64i2p1"
diff --git a/llvm/test/CodeGen/RISCV/features-info.ll b/llvm/test/CodeGen/RISCV/features-info.ll
index 99db90c..70fbda4 100644
--- a/llvm/test/CodeGen/RISCV/features-info.ll
+++ b/llvm/test/CodeGen/RISCV/features-info.ll
@@ -15,6 +15,8 @@
 ; CHECK:   e                                - 'E' (Embedded Instruction Set with 16 GPRs).
 ; CHECK:   experimental                     - Experimental intrinsics.
 ; CHECK:   experimental-rvm23u32            - RISC-V experimental-rvm23u32 profile.
+; CHECK:   experimental-sdext               - 'Sdext' (External debugger).
+; CHECK:   experimental-sdtrig              - 'Sdtrig' (Debugger triggers).
 ; CHECK:   experimental-smctr               - 'Smctr' (Control Transfer Records Machine Level).
 ; CHECK:   experimental-ssctr               - 'Ssctr' (Control Transfer Records Supervisor Level).
 ; CHECK:   experimental-svukte              - 'Svukte' (Address-Independent Latency of User-Mode Faults to Supervisor Addresses).
diff --git a/llvm/test/CodeGen/RISCV/kcfi-isel-mir.ll b/llvm/test/CodeGen/RISCV/kcfi-isel-mir.ll
index 4c47b5f..2c428cf 100644
--- a/llvm/test/CodeGen/RISCV/kcfi-isel-mir.ll
+++ b/llvm/test/CodeGen/RISCV/kcfi-isel-mir.ll
@@ -20,7 +20,7 @@ define void @f2(ptr noundef %x) #0 {
   ; CHECK-NEXT:   liveins: $x10
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:gprtc = COPY $x10
-  ; CHECK-NEXT:   PseudoTAILIndirect [[COPY]], implicit $x2, cfi-type 12345678
+  ; CHECK-NEXT:   PseudoTAILIndirect [[COPY]], csr_ilp32_lp64, implicit $x2, cfi-type 12345678
   tail call void %x() [ "kcfi"(i32 12345678) ]
   ret void
 }
diff --git a/llvm/test/CodeGen/RISCV/kcfi-mir.ll b/llvm/test/CodeGen/RISCV/kcfi-mir.ll
index f9f383a..0c0d39a 100644
--- a/llvm/test/CodeGen/RISCV/kcfi-mir.ll
+++ b/llvm/test/CodeGen/RISCV/kcfi-mir.ll
@@ -30,7 +30,7 @@ define void @f2(ptr noundef %x) #0 {
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   BUNDLE implicit-def $x6, implicit-def $x6_w, implicit-def $x6_h, implicit-def $x7, implicit-def $x7_w, implicit-def $x7_h, implicit-def $x28, implicit-def $x28_w, implicit-def $x28_h, implicit-def $x29, implicit-def $x29_w, implicit-def $x29_h, implicit-def $x30, implicit-def $x30_w, implicit-def $x30_h, implicit-def $x31, implicit-def $x31_w, implicit-def $x31_h, implicit killed $x10, implicit $x2 {
   ; CHECK-NEXT:     KCFI_CHECK $x10, 12345678, implicit-def $x6, implicit-def $x7, implicit-def $x28, implicit-def $x29, implicit-def $x30, implicit-def $x31
-  ; CHECK-NEXT:     PseudoTAILIndirect killed $x10, implicit $x2
+  ; CHECK-NEXT:     PseudoTAILIndirect killed $x10, csr_ilp32_lp64, implicit $x2
   ; CHECK-NEXT:   }
   tail call void %x() [ "kcfi"(i32 12345678) ]
   ret void
diff --git a/llvm/test/CodeGen/RISCV/memcmp-optsize.ll b/llvm/test/CodeGen/RISCV/memcmp-optsize.ll
index d529ae6..f9086ba9 100644
--- a/llvm/test/CodeGen/RISCV/memcmp-optsize.ll
+++ b/llvm/test/CodeGen/RISCV/memcmp-optsize.ll
@@ -2449,82 +2449,72 @@ define i32 @memcmp_size_3(ptr %s1, ptr %s2) nounwind optsize {
 ;
 ; CHECK-UNALIGNED-RV32-ZBB-LABEL: memcmp_size_3:
 ; CHECK-UNALIGNED-RV32-ZBB:       # %bb.0: # %entry
-; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lh a2, 0(a0)
-; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lh a3, 0(a1)
-; CHECK-UNALIGNED-RV32-ZBB-NEXT:    rev8 a2, a2
-; CHECK-UNALIGNED-RV32-ZBB-NEXT:    rev8 a3, a3
-; CHECK-UNALIGNED-RV32-ZBB-NEXT:    srli a2, a2, 16
-; CHECK-UNALIGNED-RV32-ZBB-NEXT:    srli a3, a3, 16
-; CHECK-UNALIGNED-RV32-ZBB-NEXT:    bne a2, a3, .LBB24_2
-; CHECK-UNALIGNED-RV32-ZBB-NEXT:  # %bb.1: # %loadbb1
-; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lbu a0, 2(a0)
-; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lbu a1, 2(a1)
-; CHECK-UNALIGNED-RV32-ZBB-NEXT:    sub a0, a0, a1
-; CHECK-UNALIGNED-RV32-ZBB-NEXT:    ret
-; CHECK-UNALIGNED-RV32-ZBB-NEXT:  .LBB24_2: # %res_block
-; CHECK-UNALIGNED-RV32-ZBB-NEXT:    sltu a0, a2, a3
-; CHECK-UNALIGNED-RV32-ZBB-NEXT:    neg a0, a0
-; CHECK-UNALIGNED-RV32-ZBB-NEXT:    ori a0, a0, 1
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lbu a2, 2(a0)
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lhu a0, 0(a0)
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lbu a3, 2(a1)
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lhu a1, 0(a1)
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    slli a2, a2, 16
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    or a0, a0, a2
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    slli a3, a3, 16
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    or a1, a1, a3
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    rev8 a0, a0
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    rev8 a1, a1
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    sltu a2, a0, a1
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    sltu a0, a1, a0
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    sub a0, a0, a2
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    ret
 ;
 ; CHECK-UNALIGNED-RV64-ZBB-LABEL: memcmp_size_3:
 ; CHECK-UNALIGNED-RV64-ZBB:       # %bb.0: # %entry
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lh a2, 0(a0)
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lh a3, 0(a1)
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a2, a2
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a3, a3
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    srli a2, a2, 48
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    srli a3, a3, 48
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    bne a2, a3, .LBB24_2
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:  # %bb.1: # %loadbb1
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lbu a0, 2(a0)
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lbu a1, 2(a1)
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    sub a0, a0, a1
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ret
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:  .LBB24_2: # %res_block
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    sltu a0, a2, a3
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    neg a0, a0
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ori a0, a0, 1
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lbu a2, 2(a0)
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lhu a0, 0(a0)
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lbu a3, 2(a1)
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lhu a1, 0(a1)
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    slli a2, a2, 16
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    or a0, a0, a2
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    slli a3, a3, 16
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    or a1, a1, a3
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a0, a0
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a1, a1
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    srli a0, a0, 32
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    srli a1, a1, 32
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    sltu a2, a0, a1
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    sltu a0, a1, a0
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    sub a0, a0, a2
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ret
 ;
 ; CHECK-UNALIGNED-RV32-ZBKB-LABEL: memcmp_size_3:
 ; CHECK-UNALIGNED-RV32-ZBKB:       # %bb.0: # %entry
-; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lh a2, 0(a0)
-; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lh a3, 0(a1)
-; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    rev8 a2, a2
-; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    rev8 a3, a3
-; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    srli a2, a2, 16
-; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    srli a3, a3, 16
-; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    bne a2, a3, .LBB24_2
-; CHECK-UNALIGNED-RV32-ZBKB-NEXT:  # %bb.1: # %loadbb1
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lhu a2, 0(a0)
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lbu a0, 2(a0)
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lhu a3, 0(a1)
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lbu a1, 2(a1)
-; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    sub a0, a0, a1
-; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    ret
-; CHECK-UNALIGNED-RV32-ZBKB-NEXT:  .LBB24_2: # %res_block
-; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    sltu a0, a2, a3
-; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    neg a0, a0
-; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    ori a0, a0, 1
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    pack a0, a2, a0
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    pack a1, a3, a1
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    rev8 a0, a0
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    rev8 a1, a1
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    sltu a2, a0, a1
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    sltu a0, a1, a0
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    sub a0, a0, a2
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    ret
 ;
 ; CHECK-UNALIGNED-RV64-ZBKB-LABEL: memcmp_size_3:
 ; CHECK-UNALIGNED-RV64-ZBKB:       # %bb.0: # %entry
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    lh a2, 0(a0)
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    lh a3, 0(a1)
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a2, a2
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a3, a3
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    srli a2, a2, 48
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    srli a3, a3, 48
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    bne a2, a3, .LBB24_2
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:  # %bb.1: # %loadbb1
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    lbu a0, 2(a0)
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    lbu a1, 2(a1)
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    sub a0, a0, a1
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ret
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:  .LBB24_2: # %res_block
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    sltu a0, a2, a3
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    neg a0, a0
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ori a0, a0, 1
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    lbu a2, 2(a0)
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    lhu a0, 0(a0)
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    lbu a3, 2(a1)
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    lhu a1, 0(a1)
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    slli a2, a2, 16
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    or a0, a0, a2
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    slli a3, a3, 16
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    or a1, a1, a3
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a0, a0
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a1, a1
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    srli a0, a0, 32
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    srli a1, a1, 32
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    sltu a2, a0, a1
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    sltu a0, a1, a0
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    sub a0, a0, a2
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ret
 ;
 ; CHECK-UNALIGNED-RV32-V-LABEL: memcmp_size_3:
@@ -2658,9 +2648,9 @@ define i32 @memcmp_size_4(ptr %s1, ptr %s2) nounwind optsize {
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw a1, 0(a1)
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    rev8 a0, a0
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    rev8 a1, a1
-; CHECK-UNALIGNED-RV32-ZBB-NEXT:    sltu a2, a1, a0
-; CHECK-UNALIGNED-RV32-ZBB-NEXT:    sltu a0, a0, a1
-; CHECK-UNALIGNED-RV32-ZBB-NEXT:    sub a0, a2, a0
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    sltu a2, a0, a1
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    sltu a0, a1, a0
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    sub a0, a0, a2
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    ret
 ;
 ; CHECK-UNALIGNED-RV64-ZBB-LABEL: memcmp_size_4:
@@ -2671,9 +2661,9 @@ define i32 @memcmp_size_4(ptr %s1, ptr %s2) nounwind optsize {
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a1, a1
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    srli a0, a0, 32
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    srli a1, a1, 32
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    sltu a2, a1, a0
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    sltu a0, a0, a1
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    sub a0, a2, a0
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    sltu a2, a0, a1
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    sltu a0, a1, a0
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    sub a0, a0, a2
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ret
 ;
 ; CHECK-UNALIGNED-RV32-ZBKB-LABEL: memcmp_size_4:
@@ -2682,9 +2672,9 @@ define i32 @memcmp_size_4(ptr %s1, ptr %s2) nounwind optsize {
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw a1, 0(a1)
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    rev8 a0, a0
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    rev8 a1, a1
-; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    sltu a2, a1, a0
-; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    sltu a0, a0, a1
-; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    sub a0, a2, a0
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    sltu a2, a0, a1
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    sltu a0, a1, a0
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    sub a0, a0, a2
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    ret
 ;
 ; CHECK-UNALIGNED-RV64-ZBKB-LABEL: memcmp_size_4:
@@ -2695,9 +2685,9 @@ define i32 @memcmp_size_4(ptr %s1, ptr %s2) nounwind optsize {
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a1, a1
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    srli a0, a0, 32
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    srli a1, a1, 32
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    sltu a2, a1, a0
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    sltu a0, a0, a1
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    sub a0, a2, a0
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    sltu a2, a0, a1
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    sltu a0, a1, a0
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    sub a0, a0, a2
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ret
 ;
 ; CHECK-UNALIGNED-RV32-V-LABEL: memcmp_size_4:
@@ -2845,22 +2835,19 @@ define i32 @memcmp_size_5(ptr %s1, ptr %s2) nounwind optsize {
 ;
 ; CHECK-UNALIGNED-RV64-ZBB-LABEL: memcmp_size_5:
 ; CHECK-UNALIGNED-RV64-ZBB:       # %bb.0: # %entry
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lw a2, 0(a0)
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lw a3, 0(a1)
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a2, a2
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a3, a3
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    srli a2, a2, 32
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    srli a3, a3, 32
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    bne a2, a3, .LBB26_2
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:  # %bb.1: # %loadbb1
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lbu a0, 4(a0)
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lbu a1, 4(a1)
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    sub a0, a0, a1
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ret
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:  .LBB26_2: # %res_block
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    sltu a0, a2, a3
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    neg a0, a0
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ori a0, a0, 1
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lbu a2, 4(a0)
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lwu a0, 0(a0)
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lbu a3, 4(a1)
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lwu a1, 0(a1)
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    slli a2, a2, 32
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    or a0, a0, a2
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    slli a3, a3, 32
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    or a1, a1, a3
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a0, a0
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a1, a1
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    sltu a2, a0, a1
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    sltu a0, a1, a0
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    sub a0, a0, a2
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ret
 ;
 ; CHECK-UNALIGNED-RV32-ZBKB-LABEL: memcmp_size_5:
@@ -2883,22 +2870,17 @@ define i32 @memcmp_size_5(ptr %s1, ptr %s2) nounwind optsize {
 ;
 ; CHECK-UNALIGNED-RV64-ZBKB-LABEL: memcmp_size_5:
 ; CHECK-UNALIGNED-RV64-ZBKB:       # %bb.0: # %entry
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    lw a2, 0(a0)
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    lw a3, 0(a1)
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a2, a2
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a3, a3
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    srli a2, a2, 32
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    srli a3, a3, 32
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    bne a2, a3, .LBB26_2
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:  # %bb.1: # %loadbb1
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    lwu a2, 0(a0)
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    lbu a0, 4(a0)
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    lwu a3, 0(a1)
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    lbu a1, 4(a1)
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    sub a0, a0, a1
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ret
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:  .LBB26_2: # %res_block
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    sltu a0, a2, a3
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    neg a0, a0
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ori a0, a0, 1
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    pack a0, a2, a0
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    pack a1, a3, a1
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a0, a0
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a1, a1
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    sltu a2, a0, a1
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    sltu a0, a1, a0
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    sub a0, a0, a2
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ret
 ;
 ; CHECK-UNALIGNED-RV32-V-LABEL: memcmp_size_5:
@@ -3052,28 +3034,19 @@ define i32 @memcmp_size_6(ptr %s1, ptr %s2) nounwind optsize {
 ;
 ; CHECK-UNALIGNED-RV64-ZBB-LABEL: memcmp_size_6:
 ; CHECK-UNALIGNED-RV64-ZBB:       # %bb.0: # %entry
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lw a2, 0(a0)
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lw a3, 0(a1)
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a2, a2
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a3, a3
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    srli a2, a2, 32
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    srli a3, a3, 32
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    bne a2, a3, .LBB27_3
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:  # %bb.1: # %loadbb1
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lh a0, 4(a0)
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lh a1, 4(a1)
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a2, a0
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a3, a1
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    srli a2, a2, 48
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    srli a3, a3, 48
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    bne a2, a3, .LBB27_3
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:  # %bb.2:
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    li a0, 0
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ret
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:  .LBB27_3: # %res_block
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    sltu a0, a2, a3
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    neg a0, a0
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ori a0, a0, 1
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lhu a2, 4(a0)
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lwu a0, 0(a0)
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lhu a3, 4(a1)
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lwu a1, 0(a1)
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    slli a2, a2, 32
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    or a0, a0, a2
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    slli a3, a3, 32
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    or a1, a1, a3
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a0, a0
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a1, a1
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    sltu a2, a0, a1
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    sltu a0, a1, a0
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    sub a0, a0, a2
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ret
 ;
 ; CHECK-UNALIGNED-RV32-ZBKB-LABEL: memcmp_size_6:
@@ -3102,28 +3075,17 @@ define i32 @memcmp_size_6(ptr %s1, ptr %s2) nounwind optsize {
 ;
 ; CHECK-UNALIGNED-RV64-ZBKB-LABEL: memcmp_size_6:
 ; CHECK-UNALIGNED-RV64-ZBKB:       # %bb.0: # %entry
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    lw a2, 0(a0)
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    lw a3, 0(a1)
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a2, a2
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a3, a3
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    srli a2, a2, 32
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    srli a3, a3, 32
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    bne a2, a3, .LBB27_3
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:  # %bb.1: # %loadbb1
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    lh a0, 4(a0)
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    lh a1, 4(a1)
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a2, a0
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a3, a1
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    srli a2, a2, 48
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    srli a3, a3, 48
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    bne a2, a3, .LBB27_3
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:  # %bb.2:
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    li a0, 0
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ret
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:  .LBB27_3: # %res_block
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    sltu a0, a2, a3
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    neg a0, a0
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ori a0, a0, 1
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    lwu a2, 0(a0)
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    lhu a0, 4(a0)
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    lwu a3, 0(a1)
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    lhu a1, 4(a1)
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    pack a0, a2, a0
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    pack a1, a3, a1
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a0, a0
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a1, a1
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    sltu a2, a0, a1
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    sltu a0, a1, a0
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    sub a0, a0, a2
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ret
 ;
 ; CHECK-UNALIGNED-RV32-V-LABEL: memcmp_size_6:
@@ -3500,9 +3462,9 @@ define i32 @memcmp_size_8(ptr %s1, ptr %s2) nounwind optsize {
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ld a1, 0(a1)
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a0, a0
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a1, a1
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    sltu a2, a1, a0
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    sltu a0, a0, a1
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    sub a0, a2, a0
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    sltu a2, a0, a1
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    sltu a0, a1, a0
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    sub a0, a0, a2
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ret
 ;
 ; CHECK-UNALIGNED-RV32-ZBKB-LABEL: memcmp_size_8:
@@ -3533,9 +3495,9 @@ define i32 @memcmp_size_8(ptr %s1, ptr %s2) nounwind optsize {
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ld a1, 0(a1)
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a0, a0
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a1, a1
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    sltu a2, a1, a0
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    sltu a0, a0, a1
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    sub a0, a2, a0
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    sltu a2, a0, a1
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    sltu a0, a1, a0
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    sub a0, a0, a2
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ret
 ;
 ; CHECK-UNALIGNED-RV32-V-LABEL: memcmp_size_8:
diff --git a/llvm/test/CodeGen/RISCV/memcmp.ll b/llvm/test/CodeGen/RISCV/memcmp.ll
index 860c3a9..f029029 100644
--- a/llvm/test/CodeGen/RISCV/memcmp.ll
+++ b/llvm/test/CodeGen/RISCV/memcmp.ll
@@ -2710,6 +2710,216 @@ entry:
   ret i1 %ret
 }
 
+define i1 @bcmp_le_zero(ptr %s1, ptr %s2) nounwind {
+; CHECK-ALIGNED-RV32-LABEL: bcmp_le_zero:
+; CHECK-ALIGNED-RV32:       # %bb.0: # %entry
+; CHECK-ALIGNED-RV32-NEXT:    addi sp, sp, -16
+; CHECK-ALIGNED-RV32-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; CHECK-ALIGNED-RV32-NEXT:    li a2, 4
+; CHECK-ALIGNED-RV32-NEXT:    call bcmp
+; CHECK-ALIGNED-RV32-NEXT:    slti a0, a0, 1
+; CHECK-ALIGNED-RV32-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; CHECK-ALIGNED-RV32-NEXT:    addi sp, sp, 16
+; CHECK-ALIGNED-RV32-NEXT:    ret
+;
+; CHECK-ALIGNED-RV64-LABEL: bcmp_le_zero:
+; CHECK-ALIGNED-RV64:       # %bb.0: # %entry
+; CHECK-ALIGNED-RV64-NEXT:    addi sp, sp, -16
+; CHECK-ALIGNED-RV64-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
+; CHECK-ALIGNED-RV64-NEXT:    li a2, 4
+; CHECK-ALIGNED-RV64-NEXT:    call bcmp
+; CHECK-ALIGNED-RV64-NEXT:    slti a0, a0, 1
+; CHECK-ALIGNED-RV64-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
+; CHECK-ALIGNED-RV64-NEXT:    addi sp, sp, 16
+; CHECK-ALIGNED-RV64-NEXT:    ret
+;
+; CHECK-ALIGNED-RV32-ZBB-LABEL: bcmp_le_zero:
+; CHECK-ALIGNED-RV32-ZBB:       # %bb.0: # %entry
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    addi sp, sp, -16
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    li a2, 4
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    call bcmp
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    slti a0, a0, 1
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    addi sp, sp, 16
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    ret
+;
+; CHECK-ALIGNED-RV64-ZBB-LABEL: bcmp_le_zero:
+; CHECK-ALIGNED-RV64-ZBB:       # %bb.0: # %entry
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    addi sp, sp, -16
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    li a2, 4
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    call bcmp
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    slti a0, a0, 1
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    addi sp, sp, 16
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    ret
+;
+; CHECK-ALIGNED-RV32-ZBKB-LABEL: bcmp_le_zero:
+; CHECK-ALIGNED-RV32-ZBKB:       # %bb.0: # %entry
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    addi sp, sp, -16
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    li a2, 4
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    call bcmp
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    slti a0, a0, 1
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    addi sp, sp, 16
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    ret
+;
+; CHECK-ALIGNED-RV64-ZBKB-LABEL: bcmp_le_zero:
+; CHECK-ALIGNED-RV64-ZBKB:       # %bb.0: # %entry
+; CHECK-ALIGNED-RV64-ZBKB-NEXT:    addi sp, sp, -16
+; CHECK-ALIGNED-RV64-ZBKB-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
+; CHECK-ALIGNED-RV64-ZBKB-NEXT:    li a2, 4
+; CHECK-ALIGNED-RV64-ZBKB-NEXT:    call bcmp
+; CHECK-ALIGNED-RV64-ZBKB-NEXT:    slti a0, a0, 1
+; CHECK-ALIGNED-RV64-ZBKB-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
+; CHECK-ALIGNED-RV64-ZBKB-NEXT:    addi sp, sp, 16
+; CHECK-ALIGNED-RV64-ZBKB-NEXT:    ret
+;
+; CHECK-ALIGNED-RV32-V-LABEL: bcmp_le_zero:
+; CHECK-ALIGNED-RV32-V:       # %bb.0: # %entry
+; CHECK-ALIGNED-RV32-V-NEXT:    addi sp, sp, -16
+; CHECK-ALIGNED-RV32-V-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; CHECK-ALIGNED-RV32-V-NEXT:    li a2, 4
+; CHECK-ALIGNED-RV32-V-NEXT:    call bcmp
+; CHECK-ALIGNED-RV32-V-NEXT:    slti a0, a0, 1
+; CHECK-ALIGNED-RV32-V-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; CHECK-ALIGNED-RV32-V-NEXT:    addi sp, sp, 16
+; CHECK-ALIGNED-RV32-V-NEXT:    ret
+;
+; CHECK-ALIGNED-RV64-V-LABEL: bcmp_le_zero:
+; CHECK-ALIGNED-RV64-V:       # %bb.0: # %entry
+; CHECK-ALIGNED-RV64-V-NEXT:    addi sp, sp, -16
+; CHECK-ALIGNED-RV64-V-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
+; CHECK-ALIGNED-RV64-V-NEXT:    li a2, 4
+; CHECK-ALIGNED-RV64-V-NEXT:    call bcmp
+; CHECK-ALIGNED-RV64-V-NEXT:    slti a0, a0, 1
+; CHECK-ALIGNED-RV64-V-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
+; CHECK-ALIGNED-RV64-V-NEXT:    addi sp, sp, 16
+; CHECK-ALIGNED-RV64-V-NEXT:    ret
+;
+; CHECK-UNALIGNED-LABEL: bcmp_le_zero:
+; CHECK-UNALIGNED:       # %bb.0: # %entry
+; CHECK-UNALIGNED-NEXT:    lw a0, 0(a0)
+; CHECK-UNALIGNED-NEXT:    lw a1, 0(a1)
+; CHECK-UNALIGNED-NEXT:    xor a0, a0, a1
+; CHECK-UNALIGNED-NEXT:    snez a0, a0
+; CHECK-UNALIGNED-NEXT:    slti a0, a0, 1
+; CHECK-UNALIGNED-NEXT:    ret
+entry:
+  %bcmp = call signext i32 @bcmp(ptr %s1, ptr %s2, iXLen 4)
+  %ret = icmp slt i32 %bcmp, 1
+  ret i1 %ret
+}
+
+define i1 @bcmp_ge_zero(ptr %s1, ptr %s2) nounwind {
+; CHECK-ALIGNED-RV32-LABEL: bcmp_ge_zero:
+; CHECK-ALIGNED-RV32:       # %bb.0: # %entry
+; CHECK-ALIGNED-RV32-NEXT:    addi sp, sp, -16
+; CHECK-ALIGNED-RV32-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; CHECK-ALIGNED-RV32-NEXT:    li a2, 4
+; CHECK-ALIGNED-RV32-NEXT:    call bcmp
+; CHECK-ALIGNED-RV32-NEXT:    slti a0, a0, 0
+; CHECK-ALIGNED-RV32-NEXT:    xori a0, a0, 1
+; CHECK-ALIGNED-RV32-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; CHECK-ALIGNED-RV32-NEXT:    addi sp, sp, 16
+; CHECK-ALIGNED-RV32-NEXT:    ret
+;
+; CHECK-ALIGNED-RV64-LABEL: bcmp_ge_zero:
+; CHECK-ALIGNED-RV64:       # %bb.0: # %entry
+; CHECK-ALIGNED-RV64-NEXT:    addi sp, sp, -16
+; CHECK-ALIGNED-RV64-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
+; CHECK-ALIGNED-RV64-NEXT:    li a2, 4
+; CHECK-ALIGNED-RV64-NEXT:    call bcmp
+; CHECK-ALIGNED-RV64-NEXT:    slti a0, a0, 0
+; CHECK-ALIGNED-RV64-NEXT:    xori a0, a0, 1
+; CHECK-ALIGNED-RV64-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
+; CHECK-ALIGNED-RV64-NEXT:    addi sp, sp, 16
+; CHECK-ALIGNED-RV64-NEXT:    ret
+;
+; CHECK-ALIGNED-RV32-ZBB-LABEL: bcmp_ge_zero:
+; CHECK-ALIGNED-RV32-ZBB:       # %bb.0: # %entry
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    addi sp, sp, -16
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    li a2, 4
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    call bcmp
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    slti a0, a0, 0
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    xori a0, a0, 1
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    addi sp, sp, 16
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    ret
+;
+; CHECK-ALIGNED-RV64-ZBB-LABEL: bcmp_ge_zero:
+; CHECK-ALIGNED-RV64-ZBB:       # %bb.0: # %entry
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    addi sp, sp, -16
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    li a2, 4
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    call bcmp
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    slti a0, a0, 0
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    xori a0, a0, 1
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    addi sp, sp, 16
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    ret
+;
+; CHECK-ALIGNED-RV32-ZBKB-LABEL: bcmp_ge_zero:
+; CHECK-ALIGNED-RV32-ZBKB:       # %bb.0: # %entry
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    addi sp, sp, -16
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    li a2, 4
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    call bcmp
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    slti a0, a0, 0
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    xori a0, a0, 1
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    addi sp, sp, 16
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    ret
+;
+; CHECK-ALIGNED-RV64-ZBKB-LABEL: bcmp_ge_zero:
+; CHECK-ALIGNED-RV64-ZBKB:       # %bb.0: # %entry
+; CHECK-ALIGNED-RV64-ZBKB-NEXT:    addi sp, sp, -16
+; CHECK-ALIGNED-RV64-ZBKB-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
+; CHECK-ALIGNED-RV64-ZBKB-NEXT:    li a2, 4
+; CHECK-ALIGNED-RV64-ZBKB-NEXT:    call bcmp
+; CHECK-ALIGNED-RV64-ZBKB-NEXT:    slti a0, a0, 0
+; CHECK-ALIGNED-RV64-ZBKB-NEXT:    xori a0, a0, 1
+; CHECK-ALIGNED-RV64-ZBKB-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
+; CHECK-ALIGNED-RV64-ZBKB-NEXT:    addi sp, sp, 16
+; CHECK-ALIGNED-RV64-ZBKB-NEXT:    ret
+;
+; CHECK-ALIGNED-RV32-V-LABEL: bcmp_ge_zero:
+; CHECK-ALIGNED-RV32-V:       # %bb.0: # %entry
+; CHECK-ALIGNED-RV32-V-NEXT:    addi sp, sp, -16
+; CHECK-ALIGNED-RV32-V-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; CHECK-ALIGNED-RV32-V-NEXT:    li a2, 4
+; CHECK-ALIGNED-RV32-V-NEXT:    call bcmp
+; CHECK-ALIGNED-RV32-V-NEXT:    slti a0, a0, 0
+; CHECK-ALIGNED-RV32-V-NEXT:    xori a0, a0, 1
+; CHECK-ALIGNED-RV32-V-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; CHECK-ALIGNED-RV32-V-NEXT:    addi sp, sp, 16
+; CHECK-ALIGNED-RV32-V-NEXT:    ret
+;
+; CHECK-ALIGNED-RV64-V-LABEL: bcmp_ge_zero:
+; CHECK-ALIGNED-RV64-V:       # %bb.0: # %entry
+; CHECK-ALIGNED-RV64-V-NEXT:    addi sp, sp, -16
+; CHECK-ALIGNED-RV64-V-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
+; CHECK-ALIGNED-RV64-V-NEXT:    li a2, 4
+; CHECK-ALIGNED-RV64-V-NEXT:    call bcmp
+; CHECK-ALIGNED-RV64-V-NEXT:    slti a0, a0, 0
+; CHECK-ALIGNED-RV64-V-NEXT:    xori a0, a0, 1
+; CHECK-ALIGNED-RV64-V-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
+; CHECK-ALIGNED-RV64-V-NEXT:    addi sp, sp, 16
+; CHECK-ALIGNED-RV64-V-NEXT:    ret
+;
+; CHECK-UNALIGNED-LABEL: bcmp_ge_zero:
+; CHECK-UNALIGNED:       # %bb.0: # %entry
+; CHECK-UNALIGNED-NEXT:    li a0, 1
+; CHECK-UNALIGNED-NEXT:    ret
+entry:
+  %bcmp = call signext i32 @bcmp(ptr %s1, ptr %s2, iXLen 4)
+  %ret = icmp sgt i32 %bcmp, -1
+  ret i1 %ret
+}
+
 define i32 @memcmp_size_0(ptr %s1, ptr %s2) nounwind {
 ; CHECK-LABEL: memcmp_size_0:
 ; CHECK:       # %bb.0: # %entry
@@ -3145,82 +3355,72 @@ define i32 @memcmp_size_3(ptr %s1, ptr %s2) nounwind {
 ;
 ; CHECK-UNALIGNED-RV32-ZBB-LABEL: memcmp_size_3:
 ; CHECK-UNALIGNED-RV32-ZBB:       # %bb.0: # %entry
-; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lh a2, 0(a0)
-; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lh a3, 0(a1)
-; CHECK-UNALIGNED-RV32-ZBB-NEXT:    rev8 a2, a2
-; CHECK-UNALIGNED-RV32-ZBB-NEXT:    rev8 a3, a3
-; CHECK-UNALIGNED-RV32-ZBB-NEXT:    srli a2, a2, 16
-; CHECK-UNALIGNED-RV32-ZBB-NEXT:    srli a3, a3, 16
-; CHECK-UNALIGNED-RV32-ZBB-NEXT:    bne a2, a3, .LBB24_2
-; CHECK-UNALIGNED-RV32-ZBB-NEXT:  # %bb.1: # %loadbb1
-; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lbu a0, 2(a0)
-; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lbu a1, 2(a1)
-; CHECK-UNALIGNED-RV32-ZBB-NEXT:    sub a0, a0, a1
-; CHECK-UNALIGNED-RV32-ZBB-NEXT:    ret
-; CHECK-UNALIGNED-RV32-ZBB-NEXT:  .LBB24_2: # %res_block
-; CHECK-UNALIGNED-RV32-ZBB-NEXT:    sltu a0, a2, a3
-; CHECK-UNALIGNED-RV32-ZBB-NEXT:    neg a0, a0
-; CHECK-UNALIGNED-RV32-ZBB-NEXT:    ori a0, a0, 1
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lbu a2, 2(a0)
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lhu a0, 0(a0)
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lbu a3, 2(a1)
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lhu a1, 0(a1)
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    slli a2, a2, 16
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    or a0, a0, a2
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    slli a3, a3, 16
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    or a1, a1, a3
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    rev8 a0, a0
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    rev8 a1, a1
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    sltu a2, a0, a1
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    sltu a0, a1, a0
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    sub a0, a0, a2
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    ret
 ;
 ; CHECK-UNALIGNED-RV64-ZBB-LABEL: memcmp_size_3:
 ; CHECK-UNALIGNED-RV64-ZBB:       # %bb.0: # %entry
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lh a2, 0(a0)
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lh a3, 0(a1)
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a2, a2
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a3, a3
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    srli a2, a2, 48
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    srli a3, a3, 48
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    bne a2, a3, .LBB24_2
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:  # %bb.1: # %loadbb1
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lbu a0, 2(a0)
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lbu a1, 2(a1)
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    sub a0, a0, a1
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ret
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:  .LBB24_2: # %res_block
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    sltu a0, a2, a3
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    neg a0, a0
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ori a0, a0, 1
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lbu a2, 2(a0)
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lhu a0, 0(a0)
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lbu a3, 2(a1)
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lhu a1, 0(a1)
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    slli a2, a2, 16
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    or a0, a0, a2
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    slli a3, a3, 16
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    or a1, a1, a3
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a0, a0
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a1, a1
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    srli a0, a0, 32
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    srli a1, a1, 32
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    sltu a2, a0, a1
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    sltu a0, a1, a0
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    sub a0, a0, a2
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ret
 ;
 ; CHECK-UNALIGNED-RV32-ZBKB-LABEL: memcmp_size_3:
 ; CHECK-UNALIGNED-RV32-ZBKB:       # %bb.0: # %entry
-; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lh a2, 0(a0)
-; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lh a3, 0(a1)
-; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    rev8 a2, a2
-; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    rev8 a3, a3
-; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    srli a2, a2, 16
-; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    srli a3, a3, 16
-; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    bne a2, a3, .LBB24_2
-; CHECK-UNALIGNED-RV32-ZBKB-NEXT:  # %bb.1: # %loadbb1
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lhu a2, 0(a0)
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lbu a0, 2(a0)
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lhu a3, 0(a1)
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lbu a1, 2(a1)
-; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    sub a0, a0, a1
-; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    ret
-; CHECK-UNALIGNED-RV32-ZBKB-NEXT:  .LBB24_2: # %res_block
-; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    sltu a0, a2, a3
-; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    neg a0, a0
-; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    ori a0, a0, 1
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    pack a0, a2, a0
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    pack a1, a3, a1
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    rev8 a0, a0
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    rev8 a1, a1
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    sltu a2, a0, a1
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    sltu a0, a1, a0
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    sub a0, a0, a2
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    ret
 ;
 ; CHECK-UNALIGNED-RV64-ZBKB-LABEL: memcmp_size_3:
 ; CHECK-UNALIGNED-RV64-ZBKB:       # %bb.0: # %entry
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    lh a2, 0(a0)
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    lh a3, 0(a1)
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a2, a2
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a3, a3
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    srli a2, a2, 48
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    srli a3, a3, 48
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    bne a2, a3, .LBB24_2
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:  # %bb.1: # %loadbb1
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    lbu a0, 2(a0)
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    lbu a1, 2(a1)
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    sub a0, a0, a1
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ret
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:  .LBB24_2: # %res_block
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    sltu a0, a2, a3
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    neg a0, a0
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ori a0, a0, 1
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    lbu a2, 2(a0)
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    lhu a0, 0(a0)
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    lbu a3, 2(a1)
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    lhu a1, 0(a1)
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    slli a2, a2, 16
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    or a0, a0, a2
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    slli a3, a3, 16
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    or a1, a1, a3
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a0, a0
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a1, a1
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    srli a0, a0, 32
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    srli a1, a1, 32
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    sltu a2, a0, a1
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    sltu a0, a1, a0
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    sub a0, a0, a2
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ret
 ;
 ; CHECK-UNALIGNED-RV32-V-LABEL: memcmp_size_3:
@@ -3354,9 +3554,9 @@ define i32 @memcmp_size_4(ptr %s1, ptr %s2) nounwind {
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw a1, 0(a1)
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    rev8 a0, a0
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    rev8 a1, a1
-; CHECK-UNALIGNED-RV32-ZBB-NEXT:    sltu a2, a1, a0
-; CHECK-UNALIGNED-RV32-ZBB-NEXT:    sltu a0, a0, a1
-; CHECK-UNALIGNED-RV32-ZBB-NEXT:    sub a0, a2, a0
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    sltu a2, a0, a1
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    sltu a0, a1, a0
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    sub a0, a0, a2
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    ret
 ;
 ; CHECK-UNALIGNED-RV64-ZBB-LABEL: memcmp_size_4:
@@ -3367,9 +3567,9 @@ define i32 @memcmp_size_4(ptr %s1, ptr %s2) nounwind {
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a1, a1
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    srli a0, a0, 32
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    srli a1, a1, 32
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    sltu a2, a1, a0
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    sltu a0, a0, a1
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    sub a0, a2, a0
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    sltu a2, a0, a1
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    sltu a0, a1, a0
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    sub a0, a0, a2
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ret
 ;
 ; CHECK-UNALIGNED-RV32-ZBKB-LABEL: memcmp_size_4:
@@ -3378,9 +3578,9 @@ define i32 @memcmp_size_4(ptr %s1, ptr %s2) nounwind {
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw a1, 0(a1)
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    rev8 a0, a0
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    rev8 a1, a1
-; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    sltu a2, a1, a0
-; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    sltu a0, a0, a1
-; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    sub a0, a2, a0
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    sltu a2, a0, a1
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    sltu a0, a1, a0
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    sub a0, a0, a2
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    ret
 ;
 ; CHECK-UNALIGNED-RV64-ZBKB-LABEL: memcmp_size_4:
@@ -3391,9 +3591,9 @@ define i32 @memcmp_size_4(ptr %s1, ptr %s2) nounwind {
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a1, a1
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    srli a0, a0, 32
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    srli a1, a1, 32
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    sltu a2, a1, a0
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    sltu a0, a0, a1
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    sub a0, a2, a0
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    sltu a2, a0, a1
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    sltu a0, a1, a0
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    sub a0, a0, a2
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ret
 ;
 ; CHECK-UNALIGNED-RV32-V-LABEL: memcmp_size_4:
@@ -3527,13 +3727,13 @@ define i32 @memcmp_size_5(ptr %s1, ptr %s2) nounwind {
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw a3, 0(a1)
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    rev8 a2, a2
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    rev8 a3, a3
-; CHECK-UNALIGNED-RV32-ZBB-NEXT:    bne a2, a3, .LBB26_2
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    bne a2, a3, .LBB28_2
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:  # %bb.1: # %loadbb1
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lbu a0, 4(a0)
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lbu a1, 4(a1)
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    sub a0, a0, a1
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    ret
-; CHECK-UNALIGNED-RV32-ZBB-NEXT:  .LBB26_2: # %res_block
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:  .LBB28_2: # %res_block
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    sltu a0, a2, a3
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    neg a0, a0
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    ori a0, a0, 1
@@ -3541,22 +3741,19 @@ define i32 @memcmp_size_5(ptr %s1, ptr %s2) nounwind {
 ;
 ; CHECK-UNALIGNED-RV64-ZBB-LABEL: memcmp_size_5:
 ; CHECK-UNALIGNED-RV64-ZBB:       # %bb.0: # %entry
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lw a2, 0(a0)
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lw a3, 0(a1)
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a2, a2
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a3, a3
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    srli a2, a2, 32
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    srli a3, a3, 32
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    bne a2, a3, .LBB26_2
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:  # %bb.1: # %loadbb1
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lbu a0, 4(a0)
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lbu a1, 4(a1)
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    sub a0, a0, a1
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ret
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:  .LBB26_2: # %res_block
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    sltu a0, a2, a3
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    neg a0, a0
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ori a0, a0, 1
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lbu a2, 4(a0)
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lwu a0, 0(a0)
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lbu a3, 4(a1)
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lwu a1, 0(a1)
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    slli a2, a2, 32
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    or a0, a0, a2
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    slli a3, a3, 32
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    or a1, a1, a3
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a0, a0
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a1, a1
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    sltu a2, a0, a1
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    sltu a0, a1, a0
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    sub a0, a0, a2
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ret
 ;
 ; CHECK-UNALIGNED-RV32-ZBKB-LABEL: memcmp_size_5:
@@ -3565,13 +3762,13 @@ define i32 @memcmp_size_5(ptr %s1, ptr %s2) nounwind {
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw a3, 0(a1)
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    rev8 a2, a2
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    rev8 a3, a3
-; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    bne a2, a3, .LBB26_2
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    bne a2, a3, .LBB28_2
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:  # %bb.1: # %loadbb1
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lbu a0, 4(a0)
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lbu a1, 4(a1)
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    sub a0, a0, a1
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    ret
-; CHECK-UNALIGNED-RV32-ZBKB-NEXT:  .LBB26_2: # %res_block
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:  .LBB28_2: # %res_block
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    sltu a0, a2, a3
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    neg a0, a0
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    ori a0, a0, 1
@@ -3579,22 +3776,17 @@ define i32 @memcmp_size_5(ptr %s1, ptr %s2) nounwind {
 ;
 ; CHECK-UNALIGNED-RV64-ZBKB-LABEL: memcmp_size_5:
 ; CHECK-UNALIGNED-RV64-ZBKB:       # %bb.0: # %entry
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    lw a2, 0(a0)
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    lw a3, 0(a1)
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a2, a2
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a3, a3
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    srli a2, a2, 32
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    srli a3, a3, 32
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    bne a2, a3, .LBB26_2
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:  # %bb.1: # %loadbb1
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    lwu a2, 0(a0)
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    lbu a0, 4(a0)
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    lwu a3, 0(a1)
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    lbu a1, 4(a1)
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    sub a0, a0, a1
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ret
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:  .LBB26_2: # %res_block
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    sltu a0, a2, a3
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    neg a0, a0
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ori a0, a0, 1
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    pack a0, a2, a0
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    pack a1, a3, a1
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a0, a0
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a1, a1
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    sltu a2, a0, a1
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    sltu a0, a1, a0
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    sub a0, a0, a2
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ret
 ;
 ; CHECK-UNALIGNED-RV32-V-LABEL: memcmp_size_5:
@@ -3728,7 +3920,7 @@ define i32 @memcmp_size_6(ptr %s1, ptr %s2) nounwind {
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw a3, 0(a1)
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    rev8 a2, a2
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    rev8 a3, a3
-; CHECK-UNALIGNED-RV32-ZBB-NEXT:    bne a2, a3, .LBB27_3
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    bne a2, a3, .LBB29_3
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:  # %bb.1: # %loadbb1
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lh a0, 4(a0)
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lh a1, 4(a1)
@@ -3736,11 +3928,11 @@ define i32 @memcmp_size_6(ptr %s1, ptr %s2) nounwind {
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    rev8 a3, a1
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    srli a2, a2, 16
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    srli a3, a3, 16
-; CHECK-UNALIGNED-RV32-ZBB-NEXT:    bne a2, a3, .LBB27_3
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    bne a2, a3, .LBB29_3
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:  # %bb.2:
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    li a0, 0
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    ret
-; CHECK-UNALIGNED-RV32-ZBB-NEXT:  .LBB27_3: # %res_block
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:  .LBB29_3: # %res_block
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    sltu a0, a2, a3
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    neg a0, a0
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    ori a0, a0, 1
@@ -3748,28 +3940,19 @@ define i32 @memcmp_size_6(ptr %s1, ptr %s2) nounwind {
 ;
 ; CHECK-UNALIGNED-RV64-ZBB-LABEL: memcmp_size_6:
 ; CHECK-UNALIGNED-RV64-ZBB:       # %bb.0: # %entry
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lw a2, 0(a0)
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lw a3, 0(a1)
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a2, a2
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a3, a3
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    srli a2, a2, 32
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    srli a3, a3, 32
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    bne a2, a3, .LBB27_3
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:  # %bb.1: # %loadbb1
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lh a0, 4(a0)
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lh a1, 4(a1)
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a2, a0
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a3, a1
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    srli a2, a2, 48
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    srli a3, a3, 48
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    bne a2, a3, .LBB27_3
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:  # %bb.2:
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    li a0, 0
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ret
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:  .LBB27_3: # %res_block
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    sltu a0, a2, a3
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    neg a0, a0
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ori a0, a0, 1
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lhu a2, 4(a0)
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lwu a0, 0(a0)
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lhu a3, 4(a1)
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lwu a1, 0(a1)
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    slli a2, a2, 32
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    or a0, a0, a2
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    slli a3, a3, 32
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    or a1, a1, a3
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a0, a0
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a1, a1
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    sltu a2, a0, a1
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    sltu a0, a1, a0
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    sub a0, a0, a2
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ret
 ;
 ; CHECK-UNALIGNED-RV32-ZBKB-LABEL: memcmp_size_6:
@@ -3778,7 +3961,7 @@ define i32 @memcmp_size_6(ptr %s1, ptr %s2) nounwind {
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw a3, 0(a1)
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    rev8 a2, a2
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    rev8 a3, a3
-; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    bne a2, a3, .LBB27_3
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    bne a2, a3, .LBB29_3
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:  # %bb.1: # %loadbb1
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lh a0, 4(a0)
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lh a1, 4(a1)
@@ -3786,11 +3969,11 @@ define i32 @memcmp_size_6(ptr %s1, ptr %s2) nounwind {
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    rev8 a3, a1
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    srli a2, a2, 16
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    srli a3, a3, 16
-; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    bne a2, a3, .LBB27_3
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    bne a2, a3, .LBB29_3
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:  # %bb.2:
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    li a0, 0
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    ret
-; CHECK-UNALIGNED-RV32-ZBKB-NEXT:  .LBB27_3: # %res_block
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:  .LBB29_3: # %res_block
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    sltu a0, a2, a3
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    neg a0, a0
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    ori a0, a0, 1
@@ -3798,28 +3981,17 @@ define i32 @memcmp_size_6(ptr %s1, ptr %s2) nounwind {
 ;
 ; CHECK-UNALIGNED-RV64-ZBKB-LABEL: memcmp_size_6:
 ; CHECK-UNALIGNED-RV64-ZBKB:       # %bb.0: # %entry
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    lw a2, 0(a0)
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    lw a3, 0(a1)
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a2, a2
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a3, a3
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    srli a2, a2, 32
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    srli a3, a3, 32
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    bne a2, a3, .LBB27_3
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:  # %bb.1: # %loadbb1
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    lh a0, 4(a0)
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    lh a1, 4(a1)
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a2, a0
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a3, a1
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    srli a2, a2, 48
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    srli a3, a3, 48
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    bne a2, a3, .LBB27_3
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:  # %bb.2:
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    li a0, 0
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ret
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:  .LBB27_3: # %res_block
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    sltu a0, a2, a3
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    neg a0, a0
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ori a0, a0, 1
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    lwu a2, 0(a0)
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    lhu a0, 4(a0)
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    lwu a3, 0(a1)
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    lhu a1, 4(a1)
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    pack a0, a2, a0
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    pack a1, a3, a1
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a0, a0
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a1, a1
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    sltu a2, a0, a1
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    sltu a0, a1, a0
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    sub a0, a0, a2
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ret
 ;
 ; CHECK-UNALIGNED-RV32-V-LABEL: memcmp_size_6:
@@ -3953,17 +4125,17 @@ define i32 @memcmp_size_7(ptr %s1, ptr %s2) nounwind {
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw a3, 0(a1)
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    rev8 a2, a2
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    rev8 a3, a3
-; CHECK-UNALIGNED-RV32-ZBB-NEXT:    bne a2, a3, .LBB28_3
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    bne a2, a3, .LBB30_3
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:  # %bb.1: # %loadbb1
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw a0, 3(a0)
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw a1, 3(a1)
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    rev8 a2, a0
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    rev8 a3, a1
-; CHECK-UNALIGNED-RV32-ZBB-NEXT:    bne a2, a3, .LBB28_3
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    bne a2, a3, .LBB30_3
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:  # %bb.2:
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    li a0, 0
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    ret
-; CHECK-UNALIGNED-RV32-ZBB-NEXT:  .LBB28_3: # %res_block
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:  .LBB30_3: # %res_block
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    sltu a0, a2, a3
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    neg a0, a0
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    ori a0, a0, 1
@@ -3977,7 +4149,7 @@ define i32 @memcmp_size_7(ptr %s1, ptr %s2) nounwind {
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a3, a3
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    srli a2, a2, 32
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    srli a3, a3, 32
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    bne a2, a3, .LBB28_3
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    bne a2, a3, .LBB30_3
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:  # %bb.1: # %loadbb1
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lw a0, 3(a0)
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lw a1, 3(a1)
@@ -3985,11 +4157,11 @@ define i32 @memcmp_size_7(ptr %s1, ptr %s2) nounwind {
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a3, a1
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    srli a2, a2, 32
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    srli a3, a3, 32
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    bne a2, a3, .LBB28_3
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    bne a2, a3, .LBB30_3
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:  # %bb.2:
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    li a0, 0
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ret
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:  .LBB28_3: # %res_block
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:  .LBB30_3: # %res_block
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    sltu a0, a2, a3
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    neg a0, a0
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ori a0, a0, 1
@@ -4001,17 +4173,17 @@ define i32 @memcmp_size_7(ptr %s1, ptr %s2) nounwind {
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw a3, 0(a1)
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    rev8 a2, a2
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    rev8 a3, a3
-; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    bne a2, a3, .LBB28_3
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    bne a2, a3, .LBB30_3
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:  # %bb.1: # %loadbb1
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw a0, 3(a0)
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw a1, 3(a1)
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    rev8 a2, a0
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    rev8 a3, a1
-; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    bne a2, a3, .LBB28_3
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    bne a2, a3, .LBB30_3
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:  # %bb.2:
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    li a0, 0
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    ret
-; CHECK-UNALIGNED-RV32-ZBKB-NEXT:  .LBB28_3: # %res_block
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:  .LBB30_3: # %res_block
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    sltu a0, a2, a3
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    neg a0, a0
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    ori a0, a0, 1
@@ -4025,7 +4197,7 @@ define i32 @memcmp_size_7(ptr %s1, ptr %s2) nounwind {
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a3, a3
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    srli a2, a2, 32
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    srli a3, a3, 32
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    bne a2, a3, .LBB28_3
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    bne a2, a3, .LBB30_3
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:  # %bb.1: # %loadbb1
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    lw a0, 3(a0)
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    lw a1, 3(a1)
@@ -4033,11 +4205,11 @@ define i32 @memcmp_size_7(ptr %s1, ptr %s2) nounwind {
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a3, a1
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    srli a2, a2, 32
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    srli a3, a3, 32
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    bne a2, a3, .LBB28_3
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    bne a2, a3, .LBB30_3
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:  # %bb.2:
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    li a0, 0
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ret
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:  .LBB28_3: # %res_block
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:  .LBB30_3: # %res_block
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    sltu a0, a2, a3
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    neg a0, a0
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ori a0, a0, 1
@@ -4174,17 +4346,17 @@ define i32 @memcmp_size_8(ptr %s1, ptr %s2) nounwind {
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw a3, 0(a1)
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    rev8 a2, a2
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    rev8 a3, a3
-; CHECK-UNALIGNED-RV32-ZBB-NEXT:    bne a2, a3, .LBB29_3
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    bne a2, a3, .LBB31_3
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:  # %bb.1: # %loadbb1
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw a0, 4(a0)
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw a1, 4(a1)
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    rev8 a2, a0
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    rev8 a3, a1
-; CHECK-UNALIGNED-RV32-ZBB-NEXT:    bne a2, a3, .LBB29_3
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    bne a2, a3, .LBB31_3
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:  # %bb.2:
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    li a0, 0
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    ret
-; CHECK-UNALIGNED-RV32-ZBB-NEXT:  .LBB29_3: # %res_block
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:  .LBB31_3: # %res_block
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    sltu a0, a2, a3
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    neg a0, a0
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    ori a0, a0, 1
@@ -4196,9 +4368,9 @@ define i32 @memcmp_size_8(ptr %s1, ptr %s2) nounwind {
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ld a1, 0(a1)
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a0, a0
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a1, a1
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    sltu a2, a1, a0
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    sltu a0, a0, a1
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    sub a0, a2, a0
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    sltu a2, a0, a1
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    sltu a0, a1, a0
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    sub a0, a0, a2
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ret
 ;
 ; CHECK-UNALIGNED-RV32-ZBKB-LABEL: memcmp_size_8:
@@ -4207,17 +4379,17 @@ define i32 @memcmp_size_8(ptr %s1, ptr %s2) nounwind {
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw a3, 0(a1)
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    rev8 a2, a2
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    rev8 a3, a3
-; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    bne a2, a3, .LBB29_3
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    bne a2, a3, .LBB31_3
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:  # %bb.1: # %loadbb1
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw a0, 4(a0)
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw a1, 4(a1)
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    rev8 a2, a0
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    rev8 a3, a1
-; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    bne a2, a3, .LBB29_3
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    bne a2, a3, .LBB31_3
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:  # %bb.2:
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    li a0, 0
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    ret
-; CHECK-UNALIGNED-RV32-ZBKB-NEXT:  .LBB29_3: # %res_block
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:  .LBB31_3: # %res_block
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    sltu a0, a2, a3
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    neg a0, a0
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    ori a0, a0, 1
@@ -4229,9 +4401,9 @@ define i32 @memcmp_size_8(ptr %s1, ptr %s2) nounwind {
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ld a1, 0(a1)
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a0, a0
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a1, a1
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    sltu a2, a1, a0
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    sltu a0, a0, a1
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    sub a0, a2, a0
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    sltu a2, a0, a1
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    sltu a0, a1, a0
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    sub a0, a0, a2
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ret
 ;
 ; CHECK-UNALIGNED-RV32-V-LABEL: memcmp_size_8:
@@ -4365,29 +4537,29 @@ define i32 @memcmp_size_15(ptr %s1, ptr %s2) nounwind {
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw a3, 0(a1)
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    rev8 a2, a2
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    rev8 a3, a3
-; CHECK-UNALIGNED-RV32-ZBB-NEXT:    bne a2, a3, .LBB30_5
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    bne a2, a3, .LBB32_5
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:  # %bb.1: # %loadbb1
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw a2, 4(a0)
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw a3, 4(a1)
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    rev8 a2, a2
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    rev8 a3, a3
-; CHECK-UNALIGNED-RV32-ZBB-NEXT:    bne a2, a3, .LBB30_5
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    bne a2, a3, .LBB32_5
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:  # %bb.2: # %loadbb2
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw a2, 8(a0)
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw a3, 8(a1)
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    rev8 a2, a2
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    rev8 a3, a3
-; CHECK-UNALIGNED-RV32-ZBB-NEXT:    bne a2, a3, .LBB30_5
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    bne a2, a3, .LBB32_5
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:  # %bb.3: # %loadbb3
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw a0, 11(a0)
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw a1, 11(a1)
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    rev8 a2, a0
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    rev8 a3, a1
-; CHECK-UNALIGNED-RV32-ZBB-NEXT:    bne a2, a3, .LBB30_5
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    bne a2, a3, .LBB32_5
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:  # %bb.4:
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    li a0, 0
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    ret
-; CHECK-UNALIGNED-RV32-ZBB-NEXT:  .LBB30_5: # %res_block
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:  .LBB32_5: # %res_block
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    sltu a0, a2, a3
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    neg a0, a0
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    ori a0, a0, 1
@@ -4399,17 +4571,17 @@ define i32 @memcmp_size_15(ptr %s1, ptr %s2) nounwind {
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ld a3, 0(a1)
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a2, a2
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a3, a3
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    bne a2, a3, .LBB30_3
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    bne a2, a3, .LBB32_3
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:  # %bb.1: # %loadbb1
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ld a0, 7(a0)
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ld a1, 7(a1)
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a2, a0
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a3, a1
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    bne a2, a3, .LBB30_3
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    bne a2, a3, .LBB32_3
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:  # %bb.2:
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    li a0, 0
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ret
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:  .LBB30_3: # %res_block
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:  .LBB32_3: # %res_block
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    sltu a0, a2, a3
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    neg a0, a0
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ori a0, a0, 1
@@ -4421,29 +4593,29 @@ define i32 @memcmp_size_15(ptr %s1, ptr %s2) nounwind {
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw a3, 0(a1)
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    rev8 a2, a2
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    rev8 a3, a3
-; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    bne a2, a3, .LBB30_5
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    bne a2, a3, .LBB32_5
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:  # %bb.1: # %loadbb1
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw a2, 4(a0)
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw a3, 4(a1)
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    rev8 a2, a2
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    rev8 a3, a3
-; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    bne a2, a3, .LBB30_5
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    bne a2, a3, .LBB32_5
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:  # %bb.2: # %loadbb2
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw a2, 8(a0)
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw a3, 8(a1)
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    rev8 a2, a2
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    rev8 a3, a3
-; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    bne a2, a3, .LBB30_5
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    bne a2, a3, .LBB32_5
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:  # %bb.3: # %loadbb3
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw a0, 11(a0)
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw a1, 11(a1)
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    rev8 a2, a0
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    rev8 a3, a1
-; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    bne a2, a3, .LBB30_5
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    bne a2, a3, .LBB32_5
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:  # %bb.4:
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    li a0, 0
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    ret
-; CHECK-UNALIGNED-RV32-ZBKB-NEXT:  .LBB30_5: # %res_block
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:  .LBB32_5: # %res_block
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    sltu a0, a2, a3
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    neg a0, a0
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    ori a0, a0, 1
@@ -4455,17 +4627,17 @@ define i32 @memcmp_size_15(ptr %s1, ptr %s2) nounwind {
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ld a3, 0(a1)
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a2, a2
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a3, a3
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    bne a2, a3, .LBB30_3
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    bne a2, a3, .LBB32_3
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:  # %bb.1: # %loadbb1
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ld a0, 7(a0)
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ld a1, 7(a1)
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a2, a0
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a3, a1
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    bne a2, a3, .LBB30_3
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    bne a2, a3, .LBB32_3
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:  # %bb.2:
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    li a0, 0
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ret
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:  .LBB30_3: # %res_block
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:  .LBB32_3: # %res_block
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    sltu a0, a2, a3
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    neg a0, a0
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ori a0, a0, 1
@@ -4602,29 +4774,29 @@ define i32 @memcmp_size_16(ptr %s1, ptr %s2) nounwind {
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw a3, 0(a1)
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    rev8 a2, a2
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    rev8 a3, a3
-; CHECK-UNALIGNED-RV32-ZBB-NEXT:    bne a2, a3, .LBB31_5
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    bne a2, a3, .LBB33_5
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:  # %bb.1: # %loadbb1
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw a2, 4(a0)
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw a3, 4(a1)
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    rev8 a2, a2
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    rev8 a3, a3
-; CHECK-UNALIGNED-RV32-ZBB-NEXT:    bne a2, a3, .LBB31_5
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    bne a2, a3, .LBB33_5
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:  # %bb.2: # %loadbb2
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw a2, 8(a0)
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw a3, 8(a1)
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    rev8 a2, a2
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    rev8 a3, a3
-; CHECK-UNALIGNED-RV32-ZBB-NEXT:    bne a2, a3, .LBB31_5
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    bne a2, a3, .LBB33_5
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:  # %bb.3: # %loadbb3
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw a0, 12(a0)
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw a1, 12(a1)
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    rev8 a2, a0
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    rev8 a3, a1
-; CHECK-UNALIGNED-RV32-ZBB-NEXT:    bne a2, a3, .LBB31_5
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    bne a2, a3, .LBB33_5
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:  # %bb.4:
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    li a0, 0
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    ret
-; CHECK-UNALIGNED-RV32-ZBB-NEXT:  .LBB31_5: # %res_block
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:  .LBB33_5: # %res_block
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    sltu a0, a2, a3
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    neg a0, a0
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    ori a0, a0, 1
@@ -4636,17 +4808,17 @@ define i32 @memcmp_size_16(ptr %s1, ptr %s2) nounwind {
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ld a3, 0(a1)
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a2, a2
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a3, a3
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    bne a2, a3, .LBB31_3
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    bne a2, a3, .LBB33_3
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:  # %bb.1: # %loadbb1
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ld a0, 8(a0)
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ld a1, 8(a1)
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a2, a0
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a3, a1
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    bne a2, a3, .LBB31_3
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    bne a2, a3, .LBB33_3
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:  # %bb.2:
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    li a0, 0
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ret
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:  .LBB31_3: # %res_block
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:  .LBB33_3: # %res_block
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    sltu a0, a2, a3
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    neg a0, a0
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ori a0, a0, 1
@@ -4658,29 +4830,29 @@ define i32 @memcmp_size_16(ptr %s1, ptr %s2) nounwind {
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw a3, 0(a1)
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    rev8 a2, a2
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    rev8 a3, a3
-; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    bne a2, a3, .LBB31_5
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    bne a2, a3, .LBB33_5
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:  # %bb.1: # %loadbb1
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw a2, 4(a0)
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw a3, 4(a1)
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    rev8 a2, a2
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    rev8 a3, a3
-; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    bne a2, a3, .LBB31_5
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    bne a2, a3, .LBB33_5
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:  # %bb.2: # %loadbb2
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw a2, 8(a0)
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw a3, 8(a1)
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    rev8 a2, a2
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    rev8 a3, a3
-; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    bne a2, a3, .LBB31_5
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    bne a2, a3, .LBB33_5
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:  # %bb.3: # %loadbb3
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw a0, 12(a0)
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw a1, 12(a1)
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    rev8 a2, a0
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    rev8 a3, a1
-; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    bne a2, a3, .LBB31_5
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    bne a2, a3, .LBB33_5
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:  # %bb.4:
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    li a0, 0
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    ret
-; CHECK-UNALIGNED-RV32-ZBKB-NEXT:  .LBB31_5: # %res_block
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:  .LBB33_5: # %res_block
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    sltu a0, a2, a3
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    neg a0, a0
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    ori a0, a0, 1
@@ -4692,17 +4864,17 @@ define i32 @memcmp_size_16(ptr %s1, ptr %s2) nounwind {
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ld a3, 0(a1)
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a2, a2
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a3, a3
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    bne a2, a3, .LBB31_3
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    bne a2, a3, .LBB33_3
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:  # %bb.1: # %loadbb1
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ld a0, 8(a0)
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ld a1, 8(a1)
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a2, a0
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a3, a1
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    bne a2, a3, .LBB31_3
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    bne a2, a3, .LBB33_3
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:  # %bb.2:
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    li a0, 0
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ret
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:  .LBB31_3: # %res_block
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:  .LBB33_3: # %res_block
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    sltu a0, a2, a3
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    neg a0, a0
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ori a0, a0, 1
@@ -4839,53 +5011,53 @@ define i32 @memcmp_size_31(ptr %s1, ptr %s2) nounwind {
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw a3, 0(a1)
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    rev8 a2, a2
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    rev8 a3, a3
-; CHECK-UNALIGNED-RV32-ZBB-NEXT:    bne a2, a3, .LBB32_9
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    bne a2, a3, .LBB34_9
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:  # %bb.1: # %loadbb1
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw a2, 4(a0)
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw a3, 4(a1)
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    rev8 a2, a2
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    rev8 a3, a3
-; CHECK-UNALIGNED-RV32-ZBB-NEXT:    bne a2, a3, .LBB32_9
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    bne a2, a3, .LBB34_9
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:  # %bb.2: # %loadbb2
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw a2, 8(a0)
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw a3, 8(a1)
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    rev8 a2, a2
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    rev8 a3, a3
-; CHECK-UNALIGNED-RV32-ZBB-NEXT:    bne a2, a3, .LBB32_9
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    bne a2, a3, .LBB34_9
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:  # %bb.3: # %loadbb3
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw a2, 12(a0)
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw a3, 12(a1)
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    rev8 a2, a2
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    rev8 a3, a3
-; CHECK-UNALIGNED-RV32-ZBB-NEXT:    bne a2, a3, .LBB32_9
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    bne a2, a3, .LBB34_9
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:  # %bb.4: # %loadbb4
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw a2, 16(a0)
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw a3, 16(a1)
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    rev8 a2, a2
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    rev8 a3, a3
-; CHECK-UNALIGNED-RV32-ZBB-NEXT:    bne a2, a3, .LBB32_9
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    bne a2, a3, .LBB34_9
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:  # %bb.5: # %loadbb5
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw a2, 20(a0)
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw a3, 20(a1)
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    rev8 a2, a2
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    rev8 a3, a3
-; CHECK-UNALIGNED-RV32-ZBB-NEXT:    bne a2, a3, .LBB32_9
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    bne a2, a3, .LBB34_9
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:  # %bb.6: # %loadbb6
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw a2, 24(a0)
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw a3, 24(a1)
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    rev8 a2, a2
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    rev8 a3, a3
-; CHECK-UNALIGNED-RV32-ZBB-NEXT:    bne a2, a3, .LBB32_9
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    bne a2, a3, .LBB34_9
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:  # %bb.7: # %loadbb7
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw a0, 27(a0)
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw a1, 27(a1)
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    rev8 a2, a0
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    rev8 a3, a1
-; CHECK-UNALIGNED-RV32-ZBB-NEXT:    bne a2, a3, .LBB32_9
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    bne a2, a3, .LBB34_9
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:  # %bb.8:
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    li a0, 0
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    ret
-; CHECK-UNALIGNED-RV32-ZBB-NEXT:  .LBB32_9: # %res_block
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:  .LBB34_9: # %res_block
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    sltu a0, a2, a3
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    neg a0, a0
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    ori a0, a0, 1
@@ -4897,29 +5069,29 @@ define i32 @memcmp_size_31(ptr %s1, ptr %s2) nounwind {
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ld a3, 0(a1)
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a2, a2
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a3, a3
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    bne a2, a3, .LBB32_5
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    bne a2, a3, .LBB34_5
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:  # %bb.1: # %loadbb1
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ld a2, 8(a0)
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ld a3, 8(a1)
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a2, a2
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a3, a3
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    bne a2, a3, .LBB32_5
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    bne a2, a3, .LBB34_5
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:  # %bb.2: # %loadbb2
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ld a2, 16(a0)
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ld a3, 16(a1)
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a2, a2
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a3, a3
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    bne a2, a3, .LBB32_5
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    bne a2, a3, .LBB34_5
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:  # %bb.3: # %loadbb3
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ld a0, 23(a0)
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ld a1, 23(a1)
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a2, a0
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a3, a1
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    bne a2, a3, .LBB32_5
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    bne a2, a3, .LBB34_5
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:  # %bb.4:
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    li a0, 0
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ret
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:  .LBB32_5: # %res_block
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:  .LBB34_5: # %res_block
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    sltu a0, a2, a3
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    neg a0, a0
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ori a0, a0, 1
@@ -4931,53 +5103,53 @@ define i32 @memcmp_size_31(ptr %s1, ptr %s2) nounwind {
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw a3, 0(a1)
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    rev8 a2, a2
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    rev8 a3, a3
-; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    bne a2, a3, .LBB32_9
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    bne a2, a3, .LBB34_9
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:  # %bb.1: # %loadbb1
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw a2, 4(a0)
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw a3, 4(a1)
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    rev8 a2, a2
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    rev8 a3, a3
-; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    bne a2, a3, .LBB32_9
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    bne a2, a3, .LBB34_9
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:  # %bb.2: # %loadbb2
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw a2, 8(a0)
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw a3, 8(a1)
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    rev8 a2, a2
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    rev8 a3, a3
-; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    bne a2, a3, .LBB32_9
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    bne a2, a3, .LBB34_9
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:  # %bb.3: # %loadbb3
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw a2, 12(a0)
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw a3, 12(a1)
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    rev8 a2, a2
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    rev8 a3, a3
-; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    bne a2, a3, .LBB32_9
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    bne a2, a3, .LBB34_9
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:  # %bb.4: # %loadbb4
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw a2, 16(a0)
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw a3, 16(a1)
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    rev8 a2, a2
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    rev8 a3, a3
-; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    bne a2, a3, .LBB32_9
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    bne a2, a3, .LBB34_9
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:  # %bb.5: # %loadbb5
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw a2, 20(a0)
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw a3, 20(a1)
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    rev8 a2, a2
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    rev8 a3, a3
-; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    bne a2, a3, .LBB32_9
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    bne a2, a3, .LBB34_9
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:  # %bb.6: # %loadbb6
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw a2, 24(a0)
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw a3, 24(a1)
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    rev8 a2, a2
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    rev8 a3, a3
-; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    bne a2, a3, .LBB32_9
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    bne a2, a3, .LBB34_9
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:  # %bb.7: # %loadbb7
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw a0, 27(a0)
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw a1, 27(a1)
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    rev8 a2, a0
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    rev8 a3, a1
-; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    bne a2, a3, .LBB32_9
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    bne a2, a3, .LBB34_9
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:  # %bb.8:
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    li a0, 0
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    ret
-; CHECK-UNALIGNED-RV32-ZBKB-NEXT:  .LBB32_9: # %res_block
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:  .LBB34_9: # %res_block
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    sltu a0, a2, a3
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    neg a0, a0
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    ori a0, a0, 1
@@ -4989,29 +5161,29 @@ define i32 @memcmp_size_31(ptr %s1, ptr %s2) nounwind {
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ld a3, 0(a1)
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a2, a2
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a3, a3
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    bne a2, a3, .LBB32_5
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    bne a2, a3, .LBB34_5
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:  # %bb.1: # %loadbb1
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ld a2, 8(a0)
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ld a3, 8(a1)
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a2, a2
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a3, a3
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    bne a2, a3, .LBB32_5
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    bne a2, a3, .LBB34_5
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:  # %bb.2: # %loadbb2
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ld a2, 16(a0)
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ld a3, 16(a1)
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a2, a2
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a3, a3
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    bne a2, a3, .LBB32_5
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    bne a2, a3, .LBB34_5
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:  # %bb.3: # %loadbb3
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ld a0, 23(a0)
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ld a1, 23(a1)
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a2, a0
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a3, a1
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    bne a2, a3, .LBB32_5
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    bne a2, a3, .LBB34_5
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:  # %bb.4:
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    li a0, 0
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ret
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:  .LBB32_5: # %res_block
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:  .LBB34_5: # %res_block
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    sltu a0, a2, a3
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    neg a0, a0
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ori a0, a0, 1
@@ -5148,53 +5320,53 @@ define i32 @memcmp_size_32(ptr %s1, ptr %s2) nounwind {
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw a3, 0(a1)
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    rev8 a2, a2
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    rev8 a3, a3
-; CHECK-UNALIGNED-RV32-ZBB-NEXT:    bne a2, a3, .LBB33_9
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    bne a2, a3, .LBB35_9
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:  # %bb.1: # %loadbb1
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw a2, 4(a0)
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw a3, 4(a1)
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    rev8 a2, a2
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    rev8 a3, a3
-; CHECK-UNALIGNED-RV32-ZBB-NEXT:    bne a2, a3, .LBB33_9
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    bne a2, a3, .LBB35_9
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:  # %bb.2: # %loadbb2
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw a2, 8(a0)
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw a3, 8(a1)
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    rev8 a2, a2
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    rev8 a3, a3
-; CHECK-UNALIGNED-RV32-ZBB-NEXT:    bne a2, a3, .LBB33_9
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    bne a2, a3, .LBB35_9
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:  # %bb.3: # %loadbb3
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw a2, 12(a0)
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw a3, 12(a1)
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    rev8 a2, a2
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    rev8 a3, a3
-; CHECK-UNALIGNED-RV32-ZBB-NEXT:    bne a2, a3, .LBB33_9
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    bne a2, a3, .LBB35_9
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:  # %bb.4: # %loadbb4
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw a2, 16(a0)
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw a3, 16(a1)
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    rev8 a2, a2
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    rev8 a3, a3
-; CHECK-UNALIGNED-RV32-ZBB-NEXT:    bne a2, a3, .LBB33_9
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    bne a2, a3, .LBB35_9
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:  # %bb.5: # %loadbb5
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw a2, 20(a0)
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw a3, 20(a1)
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    rev8 a2, a2
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    rev8 a3, a3
-; CHECK-UNALIGNED-RV32-ZBB-NEXT:    bne a2, a3, .LBB33_9
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    bne a2, a3, .LBB35_9
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:  # %bb.6: # %loadbb6
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw a2, 24(a0)
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw a3, 24(a1)
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    rev8 a2, a2
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    rev8 a3, a3
-; CHECK-UNALIGNED-RV32-ZBB-NEXT:    bne a2, a3, .LBB33_9
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    bne a2, a3, .LBB35_9
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:  # %bb.7: # %loadbb7
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw a0, 28(a0)
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw a1, 28(a1)
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    rev8 a2, a0
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    rev8 a3, a1
-; CHECK-UNALIGNED-RV32-ZBB-NEXT:    bne a2, a3, .LBB33_9
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    bne a2, a3, .LBB35_9
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:  # %bb.8:
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    li a0, 0
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    ret
-; CHECK-UNALIGNED-RV32-ZBB-NEXT:  .LBB33_9: # %res_block
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:  .LBB35_9: # %res_block
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    sltu a0, a2, a3
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    neg a0, a0
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    ori a0, a0, 1
@@ -5206,29 +5378,29 @@ define i32 @memcmp_size_32(ptr %s1, ptr %s2) nounwind {
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ld a3, 0(a1)
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a2, a2
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a3, a3
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    bne a2, a3, .LBB33_5
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    bne a2, a3, .LBB35_5
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:  # %bb.1: # %loadbb1
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ld a2, 8(a0)
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ld a3, 8(a1)
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a2, a2
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a3, a3
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    bne a2, a3, .LBB33_5
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    bne a2, a3, .LBB35_5
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:  # %bb.2: # %loadbb2
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ld a2, 16(a0)
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ld a3, 16(a1)
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a2, a2
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a3, a3
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    bne a2, a3, .LBB33_5
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    bne a2, a3, .LBB35_5
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:  # %bb.3: # %loadbb3
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ld a0, 24(a0)
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ld a1, 24(a1)
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a2, a0
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a3, a1
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    bne a2, a3, .LBB33_5
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    bne a2, a3, .LBB35_5
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:  # %bb.4:
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    li a0, 0
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ret
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:  .LBB33_5: # %res_block
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:  .LBB35_5: # %res_block
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    sltu a0, a2, a3
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    neg a0, a0
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ori a0, a0, 1
@@ -5240,53 +5412,53 @@ define i32 @memcmp_size_32(ptr %s1, ptr %s2) nounwind {
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw a3, 0(a1)
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    rev8 a2, a2
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    rev8 a3, a3
-; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    bne a2, a3, .LBB33_9
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    bne a2, a3, .LBB35_9
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:  # %bb.1: # %loadbb1
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw a2, 4(a0)
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw a3, 4(a1)
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    rev8 a2, a2
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    rev8 a3, a3
-; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    bne a2, a3, .LBB33_9
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    bne a2, a3, .LBB35_9
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:  # %bb.2: # %loadbb2
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw a2, 8(a0)
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw a3, 8(a1)
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    rev8 a2, a2
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    rev8 a3, a3
-; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    bne a2, a3, .LBB33_9
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    bne a2, a3, .LBB35_9
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:  # %bb.3: # %loadbb3
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw a2, 12(a0)
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw a3, 12(a1)
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    rev8 a2, a2
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    rev8 a3, a3
-; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    bne a2, a3, .LBB33_9
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    bne a2, a3, .LBB35_9
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:  # %bb.4: # %loadbb4
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw a2, 16(a0)
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw a3, 16(a1)
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    rev8 a2, a2
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    rev8 a3, a3
-; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    bne a2, a3, .LBB33_9
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    bne a2, a3, .LBB35_9
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:  # %bb.5: # %loadbb5
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw a2, 20(a0)
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw a3, 20(a1)
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    rev8 a2, a2
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    rev8 a3, a3
-; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    bne a2, a3, .LBB33_9
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    bne a2, a3, .LBB35_9
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:  # %bb.6: # %loadbb6
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw a2, 24(a0)
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw a3, 24(a1)
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    rev8 a2, a2
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    rev8 a3, a3
-; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    bne a2, a3, .LBB33_9
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    bne a2, a3, .LBB35_9
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:  # %bb.7: # %loadbb7
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw a0, 28(a0)
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw a1, 28(a1)
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    rev8 a2, a0
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    rev8 a3, a1
-; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    bne a2, a3, .LBB33_9
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    bne a2, a3, .LBB35_9
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:  # %bb.8:
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    li a0, 0
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    ret
-; CHECK-UNALIGNED-RV32-ZBKB-NEXT:  .LBB33_9: # %res_block
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:  .LBB35_9: # %res_block
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    sltu a0, a2, a3
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    neg a0, a0
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    ori a0, a0, 1
@@ -5298,29 +5470,29 @@ define i32 @memcmp_size_32(ptr %s1, ptr %s2) nounwind {
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ld a3, 0(a1)
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a2, a2
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a3, a3
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    bne a2, a3, .LBB33_5
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    bne a2, a3, .LBB35_5
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:  # %bb.1: # %loadbb1
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ld a2, 8(a0)
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ld a3, 8(a1)
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a2, a2
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a3, a3
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    bne a2, a3, .LBB33_5
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    bne a2, a3, .LBB35_5
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:  # %bb.2: # %loadbb2
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ld a2, 16(a0)
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ld a3, 16(a1)
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a2, a2
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a3, a3
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    bne a2, a3, .LBB33_5
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    bne a2, a3, .LBB35_5
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:  # %bb.3: # %loadbb3
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ld a0, 24(a0)
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ld a1, 24(a1)
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a2, a0
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a3, a1
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    bne a2, a3, .LBB33_5
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    bne a2, a3, .LBB35_5
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:  # %bb.4:
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    li a0, 0
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ret
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:  .LBB33_5: # %res_block
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:  .LBB35_5: # %res_block
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    sltu a0, a2, a3
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    neg a0, a0
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ori a0, a0, 1
@@ -5417,53 +5589,53 @@ define i32 @memcmp_size_63(ptr %s1, ptr %s2) nounwind {
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ld a3, 0(a1)
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a2, a2
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a3, a3
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    bne a2, a3, .LBB34_9
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    bne a2, a3, .LBB36_9
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:  # %bb.1: # %loadbb1
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ld a2, 8(a0)
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ld a3, 8(a1)
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a2, a2
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a3, a3
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    bne a2, a3, .LBB34_9
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    bne a2, a3, .LBB36_9
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:  # %bb.2: # %loadbb2
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ld a2, 16(a0)
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ld a3, 16(a1)
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a2, a2
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a3, a3
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    bne a2, a3, .LBB34_9
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    bne a2, a3, .LBB36_9
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:  # %bb.3: # %loadbb3
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ld a2, 24(a0)
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ld a3, 24(a1)
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a2, a2
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a3, a3
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    bne a2, a3, .LBB34_9
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    bne a2, a3, .LBB36_9
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:  # %bb.4: # %loadbb4
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ld a2, 32(a0)
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ld a3, 32(a1)
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a2, a2
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a3, a3
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    bne a2, a3, .LBB34_9
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    bne a2, a3, .LBB36_9
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:  # %bb.5: # %loadbb5
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ld a2, 40(a0)
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ld a3, 40(a1)
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a2, a2
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a3, a3
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    bne a2, a3, .LBB34_9
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    bne a2, a3, .LBB36_9
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:  # %bb.6: # %loadbb6
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ld a2, 48(a0)
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ld a3, 48(a1)
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a2, a2
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a3, a3
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    bne a2, a3, .LBB34_9
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    bne a2, a3, .LBB36_9
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:  # %bb.7: # %loadbb7
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ld a0, 55(a0)
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ld a1, 55(a1)
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a2, a0
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a3, a1
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    bne a2, a3, .LBB34_9
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    bne a2, a3, .LBB36_9
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:  # %bb.8:
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    li a0, 0
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ret
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:  .LBB34_9: # %res_block
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:  .LBB36_9: # %res_block
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    sltu a0, a2, a3
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    neg a0, a0
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ori a0, a0, 1
@@ -5475,53 +5647,53 @@ define i32 @memcmp_size_63(ptr %s1, ptr %s2) nounwind {
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ld a3, 0(a1)
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a2, a2
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a3, a3
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    bne a2, a3, .LBB34_9
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    bne a2, a3, .LBB36_9
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:  # %bb.1: # %loadbb1
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ld a2, 8(a0)
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ld a3, 8(a1)
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a2, a2
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a3, a3
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    bne a2, a3, .LBB34_9
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    bne a2, a3, .LBB36_9
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:  # %bb.2: # %loadbb2
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ld a2, 16(a0)
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ld a3, 16(a1)
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a2, a2
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a3, a3
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    bne a2, a3, .LBB34_9
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    bne a2, a3, .LBB36_9
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:  # %bb.3: # %loadbb3
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ld a2, 24(a0)
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ld a3, 24(a1)
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a2, a2
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a3, a3
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    bne a2, a3, .LBB34_9
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    bne a2, a3, .LBB36_9
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:  # %bb.4: # %loadbb4
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ld a2, 32(a0)
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ld a3, 32(a1)
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a2, a2
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a3, a3
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    bne a2, a3, .LBB34_9
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    bne a2, a3, .LBB36_9
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:  # %bb.5: # %loadbb5
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ld a2, 40(a0)
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ld a3, 40(a1)
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a2, a2
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a3, a3
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    bne a2, a3, .LBB34_9
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    bne a2, a3, .LBB36_9
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:  # %bb.6: # %loadbb6
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ld a2, 48(a0)
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ld a3, 48(a1)
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a2, a2
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a3, a3
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    bne a2, a3, .LBB34_9
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    bne a2, a3, .LBB36_9
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:  # %bb.7: # %loadbb7
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ld a0, 55(a0)
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ld a1, 55(a1)
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a2, a0
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a3, a1
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    bne a2, a3, .LBB34_9
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    bne a2, a3, .LBB36_9
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:  # %bb.8:
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    li a0, 0
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ret
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:  .LBB34_9: # %res_block
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:  .LBB36_9: # %res_block
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    sltu a0, a2, a3
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    neg a0, a0
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ori a0, a0, 1
@@ -5608,53 +5780,53 @@ define i32 @memcmp_size_64(ptr %s1, ptr %s2) nounwind {
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ld a3, 0(a1)
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a2, a2
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a3, a3
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    bne a2, a3, .LBB35_9
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    bne a2, a3, .LBB37_9
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:  # %bb.1: # %loadbb1
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ld a2, 8(a0)
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ld a3, 8(a1)
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a2, a2
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a3, a3
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    bne a2, a3, .LBB35_9
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    bne a2, a3, .LBB37_9
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:  # %bb.2: # %loadbb2
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ld a2, 16(a0)
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ld a3, 16(a1)
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a2, a2
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a3, a3
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    bne a2, a3, .LBB35_9
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    bne a2, a3, .LBB37_9
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:  # %bb.3: # %loadbb3
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ld a2, 24(a0)
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ld a3, 24(a1)
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a2, a2
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a3, a3
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    bne a2, a3, .LBB35_9
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    bne a2, a3, .LBB37_9
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:  # %bb.4: # %loadbb4
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ld a2, 32(a0)
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ld a3, 32(a1)
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a2, a2
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a3, a3
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    bne a2, a3, .LBB35_9
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    bne a2, a3, .LBB37_9
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:  # %bb.5: # %loadbb5
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ld a2, 40(a0)
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ld a3, 40(a1)
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a2, a2
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a3, a3
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    bne a2, a3, .LBB35_9
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    bne a2, a3, .LBB37_9
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:  # %bb.6: # %loadbb6
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ld a2, 48(a0)
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ld a3, 48(a1)
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a2, a2
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a3, a3
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    bne a2, a3, .LBB35_9
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    bne a2, a3, .LBB37_9
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:  # %bb.7: # %loadbb7
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ld a0, 56(a0)
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ld a1, 56(a1)
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a2, a0
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a3, a1
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    bne a2, a3, .LBB35_9
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    bne a2, a3, .LBB37_9
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:  # %bb.8:
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    li a0, 0
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ret
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:  .LBB35_9: # %res_block
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:  .LBB37_9: # %res_block
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    sltu a0, a2, a3
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    neg a0, a0
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ori a0, a0, 1
@@ -5666,53 +5838,53 @@ define i32 @memcmp_size_64(ptr %s1, ptr %s2) nounwind {
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ld a3, 0(a1)
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a2, a2
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a3, a3
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    bne a2, a3, .LBB35_9
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    bne a2, a3, .LBB37_9
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:  # %bb.1: # %loadbb1
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ld a2, 8(a0)
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ld a3, 8(a1)
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a2, a2
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a3, a3
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    bne a2, a3, .LBB35_9
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    bne a2, a3, .LBB37_9
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:  # %bb.2: # %loadbb2
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ld a2, 16(a0)
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ld a3, 16(a1)
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a2, a2
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a3, a3
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    bne a2, a3, .LBB35_9
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    bne a2, a3, .LBB37_9
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:  # %bb.3: # %loadbb3
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ld a2, 24(a0)
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ld a3, 24(a1)
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a2, a2
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a3, a3
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    bne a2, a3, .LBB35_9
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    bne a2, a3, .LBB37_9
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:  # %bb.4: # %loadbb4
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ld a2, 32(a0)
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ld a3, 32(a1)
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a2, a2
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a3, a3
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    bne a2, a3, .LBB35_9
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    bne a2, a3, .LBB37_9
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:  # %bb.5: # %loadbb5
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ld a2, 40(a0)
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ld a3, 40(a1)
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a2, a2
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a3, a3
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    bne a2, a3, .LBB35_9
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    bne a2, a3, .LBB37_9
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:  # %bb.6: # %loadbb6
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ld a2, 48(a0)
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ld a3, 48(a1)
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a2, a2
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a3, a3
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    bne a2, a3, .LBB35_9
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    bne a2, a3, .LBB37_9
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:  # %bb.7: # %loadbb7
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ld a0, 56(a0)
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ld a1, 56(a1)
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a2, a0
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a3, a1
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    bne a2, a3, .LBB35_9
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    bne a2, a3, .LBB37_9
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:  # %bb.8:
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    li a0, 0
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ret
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:  .LBB35_9: # %res_block
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:  .LBB37_9: # %res_block
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    sltu a0, a2, a3
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    neg a0, a0
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ori a0, a0, 1
@@ -6374,5 +6546,381 @@ entry:
   %ret = icmp sgt i32 %memcmp, 0
   ret i1 %ret
 }
+
+define i1 @memcmp_le_zero(ptr %s1, ptr %s2) nounwind {
+; CHECK-ALIGNED-RV32-LABEL: memcmp_le_zero:
+; CHECK-ALIGNED-RV32:       # %bb.0: # %entry
+; CHECK-ALIGNED-RV32-NEXT:    addi sp, sp, -16
+; CHECK-ALIGNED-RV32-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; CHECK-ALIGNED-RV32-NEXT:    li a2, 4
+; CHECK-ALIGNED-RV32-NEXT:    call memcmp
+; CHECK-ALIGNED-RV32-NEXT:    slti a0, a0, 1
+; CHECK-ALIGNED-RV32-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; CHECK-ALIGNED-RV32-NEXT:    addi sp, sp, 16
+; CHECK-ALIGNED-RV32-NEXT:    ret
+;
+; CHECK-ALIGNED-RV64-LABEL: memcmp_le_zero:
+; CHECK-ALIGNED-RV64:       # %bb.0: # %entry
+; CHECK-ALIGNED-RV64-NEXT:    addi sp, sp, -16
+; CHECK-ALIGNED-RV64-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
+; CHECK-ALIGNED-RV64-NEXT:    li a2, 4
+; CHECK-ALIGNED-RV64-NEXT:    call memcmp
+; CHECK-ALIGNED-RV64-NEXT:    slti a0, a0, 1
+; CHECK-ALIGNED-RV64-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
+; CHECK-ALIGNED-RV64-NEXT:    addi sp, sp, 16
+; CHECK-ALIGNED-RV64-NEXT:    ret
+;
+; CHECK-ALIGNED-RV32-ZBB-LABEL: memcmp_le_zero:
+; CHECK-ALIGNED-RV32-ZBB:       # %bb.0: # %entry
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    addi sp, sp, -16
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    li a2, 4
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    call memcmp
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    slti a0, a0, 1
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    addi sp, sp, 16
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    ret
+;
+; CHECK-ALIGNED-RV64-ZBB-LABEL: memcmp_le_zero:
+; CHECK-ALIGNED-RV64-ZBB:       # %bb.0: # %entry
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    addi sp, sp, -16
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    li a2, 4
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    call memcmp
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    slti a0, a0, 1
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    addi sp, sp, 16
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    ret
+;
+; CHECK-ALIGNED-RV32-ZBKB-LABEL: memcmp_le_zero:
+; CHECK-ALIGNED-RV32-ZBKB:       # %bb.0: # %entry
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    addi sp, sp, -16
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    li a2, 4
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    call memcmp
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    slti a0, a0, 1
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    addi sp, sp, 16
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    ret
+;
+; CHECK-ALIGNED-RV64-ZBKB-LABEL: memcmp_le_zero:
+; CHECK-ALIGNED-RV64-ZBKB:       # %bb.0: # %entry
+; CHECK-ALIGNED-RV64-ZBKB-NEXT:    addi sp, sp, -16
+; CHECK-ALIGNED-RV64-ZBKB-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
+; CHECK-ALIGNED-RV64-ZBKB-NEXT:    li a2, 4
+; CHECK-ALIGNED-RV64-ZBKB-NEXT:    call memcmp
+; CHECK-ALIGNED-RV64-ZBKB-NEXT:    slti a0, a0, 1
+; CHECK-ALIGNED-RV64-ZBKB-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
+; CHECK-ALIGNED-RV64-ZBKB-NEXT:    addi sp, sp, 16
+; CHECK-ALIGNED-RV64-ZBKB-NEXT:    ret
+;
+; CHECK-ALIGNED-RV32-V-LABEL: memcmp_le_zero:
+; CHECK-ALIGNED-RV32-V:       # %bb.0: # %entry
+; CHECK-ALIGNED-RV32-V-NEXT:    addi sp, sp, -16
+; CHECK-ALIGNED-RV32-V-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; CHECK-ALIGNED-RV32-V-NEXT:    li a2, 4
+; CHECK-ALIGNED-RV32-V-NEXT:    call memcmp
+; CHECK-ALIGNED-RV32-V-NEXT:    slti a0, a0, 1
+; CHECK-ALIGNED-RV32-V-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; CHECK-ALIGNED-RV32-V-NEXT:    addi sp, sp, 16
+; CHECK-ALIGNED-RV32-V-NEXT:    ret
+;
+; CHECK-ALIGNED-RV64-V-LABEL: memcmp_le_zero:
+; CHECK-ALIGNED-RV64-V:       # %bb.0: # %entry
+; CHECK-ALIGNED-RV64-V-NEXT:    addi sp, sp, -16
+; CHECK-ALIGNED-RV64-V-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
+; CHECK-ALIGNED-RV64-V-NEXT:    li a2, 4
+; CHECK-ALIGNED-RV64-V-NEXT:    call memcmp
+; CHECK-ALIGNED-RV64-V-NEXT:    slti a0, a0, 1
+; CHECK-ALIGNED-RV64-V-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
+; CHECK-ALIGNED-RV64-V-NEXT:    addi sp, sp, 16
+; CHECK-ALIGNED-RV64-V-NEXT:    ret
+;
+; CHECK-UNALIGNED-RV32-LABEL: memcmp_le_zero:
+; CHECK-UNALIGNED-RV32:       # %bb.0: # %entry
+; CHECK-UNALIGNED-RV32-NEXT:    addi sp, sp, -16
+; CHECK-UNALIGNED-RV32-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; CHECK-UNALIGNED-RV32-NEXT:    li a2, 4
+; CHECK-UNALIGNED-RV32-NEXT:    call memcmp
+; CHECK-UNALIGNED-RV32-NEXT:    slti a0, a0, 1
+; CHECK-UNALIGNED-RV32-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; CHECK-UNALIGNED-RV32-NEXT:    addi sp, sp, 16
+; CHECK-UNALIGNED-RV32-NEXT:    ret
+;
+; CHECK-UNALIGNED-RV64-LABEL: memcmp_le_zero:
+; CHECK-UNALIGNED-RV64:       # %bb.0: # %entry
+; CHECK-UNALIGNED-RV64-NEXT:    addi sp, sp, -16
+; CHECK-UNALIGNED-RV64-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
+; CHECK-UNALIGNED-RV64-NEXT:    li a2, 4
+; CHECK-UNALIGNED-RV64-NEXT:    call memcmp
+; CHECK-UNALIGNED-RV64-NEXT:    slti a0, a0, 1
+; CHECK-UNALIGNED-RV64-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
+; CHECK-UNALIGNED-RV64-NEXT:    addi sp, sp, 16
+; CHECK-UNALIGNED-RV64-NEXT:    ret
+;
+; CHECK-UNALIGNED-RV32-ZBB-LABEL: memcmp_le_zero:
+; CHECK-UNALIGNED-RV32-ZBB:       # %bb.0: # %entry
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw a0, 0(a0)
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw a1, 0(a1)
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    rev8 a0, a0
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    rev8 a1, a1
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    sltu a0, a1, a0
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    xori a0, a0, 1
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    ret
+;
+; CHECK-UNALIGNED-RV64-ZBB-LABEL: memcmp_le_zero:
+; CHECK-UNALIGNED-RV64-ZBB:       # %bb.0: # %entry
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lw a0, 0(a0)
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lw a1, 0(a1)
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a0, a0
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a1, a1
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    srli a0, a0, 32
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    srli a1, a1, 32
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    sltu a0, a1, a0
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    xori a0, a0, 1
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ret
+;
+; CHECK-UNALIGNED-RV32-ZBKB-LABEL: memcmp_le_zero:
+; CHECK-UNALIGNED-RV32-ZBKB:       # %bb.0: # %entry
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw a0, 0(a0)
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw a1, 0(a1)
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    rev8 a0, a0
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    rev8 a1, a1
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    sltu a0, a1, a0
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    xori a0, a0, 1
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    ret
+;
+; CHECK-UNALIGNED-RV64-ZBKB-LABEL: memcmp_le_zero:
+; CHECK-UNALIGNED-RV64-ZBKB:       # %bb.0: # %entry
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    lw a0, 0(a0)
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    lw a1, 0(a1)
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a0, a0
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a1, a1
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    srli a0, a0, 32
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    srli a1, a1, 32
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    sltu a0, a1, a0
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    xori a0, a0, 1
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ret
+;
+; CHECK-UNALIGNED-RV32-V-LABEL: memcmp_le_zero:
+; CHECK-UNALIGNED-RV32-V:       # %bb.0: # %entry
+; CHECK-UNALIGNED-RV32-V-NEXT:    addi sp, sp, -16
+; CHECK-UNALIGNED-RV32-V-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; CHECK-UNALIGNED-RV32-V-NEXT:    li a2, 4
+; CHECK-UNALIGNED-RV32-V-NEXT:    call memcmp
+; CHECK-UNALIGNED-RV32-V-NEXT:    slti a0, a0, 1
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; CHECK-UNALIGNED-RV32-V-NEXT:    addi sp, sp, 16
+; CHECK-UNALIGNED-RV32-V-NEXT:    ret
+;
+; CHECK-UNALIGNED-RV64-V-LABEL: memcmp_le_zero:
+; CHECK-UNALIGNED-RV64-V:       # %bb.0: # %entry
+; CHECK-UNALIGNED-RV64-V-NEXT:    addi sp, sp, -16
+; CHECK-UNALIGNED-RV64-V-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
+; CHECK-UNALIGNED-RV64-V-NEXT:    li a2, 4
+; CHECK-UNALIGNED-RV64-V-NEXT:    call memcmp
+; CHECK-UNALIGNED-RV64-V-NEXT:    slti a0, a0, 1
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
+; CHECK-UNALIGNED-RV64-V-NEXT:    addi sp, sp, 16
+; CHECK-UNALIGNED-RV64-V-NEXT:    ret
+entry:
+  %memcmp = call signext i32 @memcmp(ptr %s1, ptr %s2, iXLen 4)
+  %ret = icmp slt i32 %memcmp, 1
+  ret i1 %ret
+}
+
+define i1 @memcmp_ge_zero(ptr %s1, ptr %s2) nounwind {
+; CHECK-ALIGNED-RV32-LABEL: memcmp_ge_zero:
+; CHECK-ALIGNED-RV32:       # %bb.0: # %entry
+; CHECK-ALIGNED-RV32-NEXT:    addi sp, sp, -16
+; CHECK-ALIGNED-RV32-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; CHECK-ALIGNED-RV32-NEXT:    li a2, 4
+; CHECK-ALIGNED-RV32-NEXT:    call memcmp
+; CHECK-ALIGNED-RV32-NEXT:    slti a0, a0, 0
+; CHECK-ALIGNED-RV32-NEXT:    xori a0, a0, 1
+; CHECK-ALIGNED-RV32-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; CHECK-ALIGNED-RV32-NEXT:    addi sp, sp, 16
+; CHECK-ALIGNED-RV32-NEXT:    ret
+;
+; CHECK-ALIGNED-RV64-LABEL: memcmp_ge_zero:
+; CHECK-ALIGNED-RV64:       # %bb.0: # %entry
+; CHECK-ALIGNED-RV64-NEXT:    addi sp, sp, -16
+; CHECK-ALIGNED-RV64-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
+; CHECK-ALIGNED-RV64-NEXT:    li a2, 4
+; CHECK-ALIGNED-RV64-NEXT:    call memcmp
+; CHECK-ALIGNED-RV64-NEXT:    slti a0, a0, 0
+; CHECK-ALIGNED-RV64-NEXT:    xori a0, a0, 1
+; CHECK-ALIGNED-RV64-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
+; CHECK-ALIGNED-RV64-NEXT:    addi sp, sp, 16
+; CHECK-ALIGNED-RV64-NEXT:    ret
+;
+; CHECK-ALIGNED-RV32-ZBB-LABEL: memcmp_ge_zero:
+; CHECK-ALIGNED-RV32-ZBB:       # %bb.0: # %entry
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    addi sp, sp, -16
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    li a2, 4
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    call memcmp
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    slti a0, a0, 0
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    xori a0, a0, 1
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    addi sp, sp, 16
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    ret
+;
+; CHECK-ALIGNED-RV64-ZBB-LABEL: memcmp_ge_zero:
+; CHECK-ALIGNED-RV64-ZBB:       # %bb.0: # %entry
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    addi sp, sp, -16
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    li a2, 4
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    call memcmp
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    slti a0, a0, 0
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    xori a0, a0, 1
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    addi sp, sp, 16
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    ret
+;
+; CHECK-ALIGNED-RV32-ZBKB-LABEL: memcmp_ge_zero:
+; CHECK-ALIGNED-RV32-ZBKB:       # %bb.0: # %entry
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    addi sp, sp, -16
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    li a2, 4
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    call memcmp
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    slti a0, a0, 0
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    xori a0, a0, 1
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    addi sp, sp, 16
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    ret
+;
+; CHECK-ALIGNED-RV64-ZBKB-LABEL: memcmp_ge_zero:
+; CHECK-ALIGNED-RV64-ZBKB:       # %bb.0: # %entry
+; CHECK-ALIGNED-RV64-ZBKB-NEXT:    addi sp, sp, -16
+; CHECK-ALIGNED-RV64-ZBKB-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
+; CHECK-ALIGNED-RV64-ZBKB-NEXT:    li a2, 4
+; CHECK-ALIGNED-RV64-ZBKB-NEXT:    call memcmp
+; CHECK-ALIGNED-RV64-ZBKB-NEXT:    slti a0, a0, 0
+; CHECK-ALIGNED-RV64-ZBKB-NEXT:    xori a0, a0, 1
+; CHECK-ALIGNED-RV64-ZBKB-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
+; CHECK-ALIGNED-RV64-ZBKB-NEXT:    addi sp, sp, 16
+; CHECK-ALIGNED-RV64-ZBKB-NEXT:    ret
+;
+; CHECK-ALIGNED-RV32-V-LABEL: memcmp_ge_zero:
+; CHECK-ALIGNED-RV32-V:       # %bb.0: # %entry
+; CHECK-ALIGNED-RV32-V-NEXT:    addi sp, sp, -16
+; CHECK-ALIGNED-RV32-V-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; CHECK-ALIGNED-RV32-V-NEXT:    li a2, 4
+; CHECK-ALIGNED-RV32-V-NEXT:    call memcmp
+; CHECK-ALIGNED-RV32-V-NEXT:    slti a0, a0, 0
+; CHECK-ALIGNED-RV32-V-NEXT:    xori a0, a0, 1
+; CHECK-ALIGNED-RV32-V-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; CHECK-ALIGNED-RV32-V-NEXT:    addi sp, sp, 16
+; CHECK-ALIGNED-RV32-V-NEXT:    ret
+;
+; CHECK-ALIGNED-RV64-V-LABEL: memcmp_ge_zero:
+; CHECK-ALIGNED-RV64-V:       # %bb.0: # %entry
+; CHECK-ALIGNED-RV64-V-NEXT:    addi sp, sp, -16
+; CHECK-ALIGNED-RV64-V-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
+; CHECK-ALIGNED-RV64-V-NEXT:    li a2, 4
+; CHECK-ALIGNED-RV64-V-NEXT:    call memcmp
+; CHECK-ALIGNED-RV64-V-NEXT:    slti a0, a0, 0
+; CHECK-ALIGNED-RV64-V-NEXT:    xori a0, a0, 1
+; CHECK-ALIGNED-RV64-V-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
+; CHECK-ALIGNED-RV64-V-NEXT:    addi sp, sp, 16
+; CHECK-ALIGNED-RV64-V-NEXT:    ret
+;
+; CHECK-UNALIGNED-RV32-LABEL: memcmp_ge_zero:
+; CHECK-UNALIGNED-RV32:       # %bb.0: # %entry
+; CHECK-UNALIGNED-RV32-NEXT:    addi sp, sp, -16
+; CHECK-UNALIGNED-RV32-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; CHECK-UNALIGNED-RV32-NEXT:    li a2, 4
+; CHECK-UNALIGNED-RV32-NEXT:    call memcmp
+; CHECK-UNALIGNED-RV32-NEXT:    slti a0, a0, 0
+; CHECK-UNALIGNED-RV32-NEXT:    xori a0, a0, 1
+; CHECK-UNALIGNED-RV32-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; CHECK-UNALIGNED-RV32-NEXT:    addi sp, sp, 16
+; CHECK-UNALIGNED-RV32-NEXT:    ret
+;
+; CHECK-UNALIGNED-RV64-LABEL: memcmp_ge_zero:
+; CHECK-UNALIGNED-RV64:       # %bb.0: # %entry
+; CHECK-UNALIGNED-RV64-NEXT:    addi sp, sp, -16
+; CHECK-UNALIGNED-RV64-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
+; CHECK-UNALIGNED-RV64-NEXT:    li a2, 4
+; CHECK-UNALIGNED-RV64-NEXT:    call memcmp
+; CHECK-UNALIGNED-RV64-NEXT:    slti a0, a0, 0
+; CHECK-UNALIGNED-RV64-NEXT:    xori a0, a0, 1
+; CHECK-UNALIGNED-RV64-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
+; CHECK-UNALIGNED-RV64-NEXT:    addi sp, sp, 16
+; CHECK-UNALIGNED-RV64-NEXT:    ret
+;
+; CHECK-UNALIGNED-RV32-ZBB-LABEL: memcmp_ge_zero:
+; CHECK-UNALIGNED-RV32-ZBB:       # %bb.0: # %entry
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw a0, 0(a0)
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw a1, 0(a1)
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    rev8 a0, a0
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    rev8 a1, a1
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    sltu a0, a0, a1
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    xori a0, a0, 1
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    ret
+;
+; CHECK-UNALIGNED-RV64-ZBB-LABEL: memcmp_ge_zero:
+; CHECK-UNALIGNED-RV64-ZBB:       # %bb.0: # %entry
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lw a0, 0(a0)
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lw a1, 0(a1)
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a0, a0
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a1, a1
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    srli a0, a0, 32
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    srli a1, a1, 32
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    sltu a0, a0, a1
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    xori a0, a0, 1
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ret
+;
+; CHECK-UNALIGNED-RV32-ZBKB-LABEL: memcmp_ge_zero:
+; CHECK-UNALIGNED-RV32-ZBKB:       # %bb.0: # %entry
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw a0, 0(a0)
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw a1, 0(a1)
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    rev8 a0, a0
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    rev8 a1, a1
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    sltu a0, a0, a1
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    xori a0, a0, 1
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    ret
+;
+; CHECK-UNALIGNED-RV64-ZBKB-LABEL: memcmp_ge_zero:
+; CHECK-UNALIGNED-RV64-ZBKB:       # %bb.0: # %entry
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    lw a0, 0(a0)
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    lw a1, 0(a1)
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a0, a0
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a1, a1
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    srli a0, a0, 32
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    srli a1, a1, 32
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    sltu a0, a0, a1
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    xori a0, a0, 1
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ret
+;
+; CHECK-UNALIGNED-RV32-V-LABEL: memcmp_ge_zero:
+; CHECK-UNALIGNED-RV32-V:       # %bb.0: # %entry
+; CHECK-UNALIGNED-RV32-V-NEXT:    addi sp, sp, -16
+; CHECK-UNALIGNED-RV32-V-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; CHECK-UNALIGNED-RV32-V-NEXT:    li a2, 4
+; CHECK-UNALIGNED-RV32-V-NEXT:    call memcmp
+; CHECK-UNALIGNED-RV32-V-NEXT:    slti a0, a0, 0
+; CHECK-UNALIGNED-RV32-V-NEXT:    xori a0, a0, 1
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; CHECK-UNALIGNED-RV32-V-NEXT:    addi sp, sp, 16
+; CHECK-UNALIGNED-RV32-V-NEXT:    ret
+;
+; CHECK-UNALIGNED-RV64-V-LABEL: memcmp_ge_zero:
+; CHECK-UNALIGNED-RV64-V:       # %bb.0: # %entry
+; CHECK-UNALIGNED-RV64-V-NEXT:    addi sp, sp, -16
+; CHECK-UNALIGNED-RV64-V-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
+; CHECK-UNALIGNED-RV64-V-NEXT:    li a2, 4
+; CHECK-UNALIGNED-RV64-V-NEXT:    call memcmp
+; CHECK-UNALIGNED-RV64-V-NEXT:    slti a0, a0, 0
+; CHECK-UNALIGNED-RV64-V-NEXT:    xori a0, a0, 1
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
+; CHECK-UNALIGNED-RV64-V-NEXT:    addi sp, sp, 16
+; CHECK-UNALIGNED-RV64-V-NEXT:    ret
+entry:
+  %memcmp = call signext i32 @memcmp(ptr %s1, ptr %s2, iXLen 4)
+  %ret = icmp sgt i32 %memcmp, -1
+  ret i1 %ret
+}
 ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
 ; CHECK-ALIGNED: {{.*}}
diff --git a/llvm/test/CodeGen/RISCV/neg-abs.ll b/llvm/test/CodeGen/RISCV/neg-abs.ll
index 7d6a6d7..fe19a4fa 100644
--- a/llvm/test/CodeGen/RISCV/neg-abs.ll
+++ b/llvm/test/CodeGen/RISCV/neg-abs.ll
@@ -258,3 +258,447 @@ define i64 @neg_abs64_multiuse(i64 %x, ptr %y) {
   %neg = sub nsw i64 0, %abs
   ret i64 %neg
 }
+
+define i32 @expanded_neg_abs32(i32 %x) {
+; RV32I-LABEL: expanded_neg_abs32:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    neg a1, a0
+; RV32I-NEXT:    blt a0, a1, .LBB6_2
+; RV32I-NEXT:  # %bb.1:
+; RV32I-NEXT:    mv a1, a0
+; RV32I-NEXT:  .LBB6_2:
+; RV32I-NEXT:    neg a0, a1
+; RV32I-NEXT:    ret
+;
+; RV32ZBB-LABEL: expanded_neg_abs32:
+; RV32ZBB:       # %bb.0:
+; RV32ZBB-NEXT:    neg a1, a0
+; RV32ZBB-NEXT:    min a0, a0, a1
+; RV32ZBB-NEXT:    ret
+;
+; RV64I-LABEL: expanded_neg_abs32:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    sext.w a1, a0
+; RV64I-NEXT:    negw a0, a0
+; RV64I-NEXT:    blt a1, a0, .LBB6_2
+; RV64I-NEXT:  # %bb.1:
+; RV64I-NEXT:    mv a0, a1
+; RV64I-NEXT:  .LBB6_2:
+; RV64I-NEXT:    negw a0, a0
+; RV64I-NEXT:    ret
+;
+; RV64ZBB-LABEL: expanded_neg_abs32:
+; RV64ZBB:       # %bb.0:
+; RV64ZBB-NEXT:    sext.w a1, a0
+; RV64ZBB-NEXT:    negw a0, a0
+; RV64ZBB-NEXT:    max a0, a0, a1
+; RV64ZBB-NEXT:    negw a0, a0
+; RV64ZBB-NEXT:    ret
+  %n = sub i32 0, %x
+  %t = call i32 @llvm.smax.i32(i32 %n, i32 %x)
+  %r = sub i32 0, %t
+  ret i32 %r
+}
+
+define i32 @expanded_neg_abs32_unsigned(i32 %x) {
+; RV32I-LABEL: expanded_neg_abs32_unsigned:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    neg a1, a0
+; RV32I-NEXT:    bltu a0, a1, .LBB7_2
+; RV32I-NEXT:  # %bb.1:
+; RV32I-NEXT:    mv a1, a0
+; RV32I-NEXT:  .LBB7_2:
+; RV32I-NEXT:    neg a0, a1
+; RV32I-NEXT:    ret
+;
+; RV32ZBB-LABEL: expanded_neg_abs32_unsigned:
+; RV32ZBB:       # %bb.0:
+; RV32ZBB-NEXT:    neg a1, a0
+; RV32ZBB-NEXT:    minu a0, a0, a1
+; RV32ZBB-NEXT:    ret
+;
+; RV64I-LABEL: expanded_neg_abs32_unsigned:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    sext.w a1, a0
+; RV64I-NEXT:    negw a0, a0
+; RV64I-NEXT:    bltu a1, a0, .LBB7_2
+; RV64I-NEXT:  # %bb.1:
+; RV64I-NEXT:    mv a0, a1
+; RV64I-NEXT:  .LBB7_2:
+; RV64I-NEXT:    negw a0, a0
+; RV64I-NEXT:    ret
+;
+; RV64ZBB-LABEL: expanded_neg_abs32_unsigned:
+; RV64ZBB:       # %bb.0:
+; RV64ZBB-NEXT:    sext.w a1, a0
+; RV64ZBB-NEXT:    negw a0, a0
+; RV64ZBB-NEXT:    maxu a0, a0, a1
+; RV64ZBB-NEXT:    negw a0, a0
+; RV64ZBB-NEXT:    ret
+  %n = sub i32 0, %x
+  %t = call i32 @llvm.umax.i32(i32 %n, i32 %x)
+  %r = sub i32 0, %t
+  ret i32 %r
+}
+
+define i64 @expanded_neg_abs64(i64 %x) {
+; RV32I-LABEL: expanded_neg_abs64:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    snez a2, a0
+; RV32I-NEXT:    neg a3, a1
+; RV32I-NEXT:    sub a2, a3, a2
+; RV32I-NEXT:    neg a3, a0
+; RV32I-NEXT:    beq a2, a1, .LBB8_2
+; RV32I-NEXT:  # %bb.1:
+; RV32I-NEXT:    slt a4, a1, a2
+; RV32I-NEXT:    beqz a4, .LBB8_3
+; RV32I-NEXT:    j .LBB8_4
+; RV32I-NEXT:  .LBB8_2:
+; RV32I-NEXT:    sltu a4, a0, a3
+; RV32I-NEXT:    bnez a4, .LBB8_4
+; RV32I-NEXT:  .LBB8_3:
+; RV32I-NEXT:    mv a2, a1
+; RV32I-NEXT:    mv a3, a0
+; RV32I-NEXT:  .LBB8_4:
+; RV32I-NEXT:    snez a0, a3
+; RV32I-NEXT:    add a0, a2, a0
+; RV32I-NEXT:    neg a1, a0
+; RV32I-NEXT:    neg a0, a3
+; RV32I-NEXT:    ret
+;
+; RV32ZBB-LABEL: expanded_neg_abs64:
+; RV32ZBB:       # %bb.0:
+; RV32ZBB-NEXT:    snez a2, a0
+; RV32ZBB-NEXT:    neg a3, a1
+; RV32ZBB-NEXT:    sub a2, a3, a2
+; RV32ZBB-NEXT:    neg a3, a0
+; RV32ZBB-NEXT:    beq a2, a1, .LBB8_2
+; RV32ZBB-NEXT:  # %bb.1:
+; RV32ZBB-NEXT:    slt a4, a1, a2
+; RV32ZBB-NEXT:    beqz a4, .LBB8_3
+; RV32ZBB-NEXT:    j .LBB8_4
+; RV32ZBB-NEXT:  .LBB8_2:
+; RV32ZBB-NEXT:    sltu a4, a0, a3
+; RV32ZBB-NEXT:    bnez a4, .LBB8_4
+; RV32ZBB-NEXT:  .LBB8_3:
+; RV32ZBB-NEXT:    mv a2, a1
+; RV32ZBB-NEXT:    mv a3, a0
+; RV32ZBB-NEXT:  .LBB8_4:
+; RV32ZBB-NEXT:    snez a0, a3
+; RV32ZBB-NEXT:    add a0, a2, a0
+; RV32ZBB-NEXT:    neg a1, a0
+; RV32ZBB-NEXT:    neg a0, a3
+; RV32ZBB-NEXT:    ret
+;
+; RV64I-LABEL: expanded_neg_abs64:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    neg a1, a0
+; RV64I-NEXT:    blt a0, a1, .LBB8_2
+; RV64I-NEXT:  # %bb.1:
+; RV64I-NEXT:    mv a1, a0
+; RV64I-NEXT:  .LBB8_2:
+; RV64I-NEXT:    neg a0, a1
+; RV64I-NEXT:    ret
+;
+; RV64ZBB-LABEL: expanded_neg_abs64:
+; RV64ZBB:       # %bb.0:
+; RV64ZBB-NEXT:    neg a1, a0
+; RV64ZBB-NEXT:    min a0, a0, a1
+; RV64ZBB-NEXT:    ret
+  %n = sub i64 0, %x
+  %t = call i64 @llvm.smax.i64(i64 %n, i64 %x)
+  %r = sub i64 0, %t
+  ret i64 %r
+}
+
+define i64 @expanded_neg_abs64_unsigned(i64 %x) {
+; RV32I-LABEL: expanded_neg_abs64_unsigned:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    snez a2, a0
+; RV32I-NEXT:    neg a3, a1
+; RV32I-NEXT:    sub a2, a3, a2
+; RV32I-NEXT:    neg a3, a0
+; RV32I-NEXT:    beq a2, a1, .LBB9_2
+; RV32I-NEXT:  # %bb.1:
+; RV32I-NEXT:    sltu a4, a1, a2
+; RV32I-NEXT:    beqz a4, .LBB9_3
+; RV32I-NEXT:    j .LBB9_4
+; RV32I-NEXT:  .LBB9_2:
+; RV32I-NEXT:    sltu a4, a0, a3
+; RV32I-NEXT:    bnez a4, .LBB9_4
+; RV32I-NEXT:  .LBB9_3:
+; RV32I-NEXT:    mv a2, a1
+; RV32I-NEXT:    mv a3, a0
+; RV32I-NEXT:  .LBB9_4:
+; RV32I-NEXT:    snez a0, a3
+; RV32I-NEXT:    add a0, a2, a0
+; RV32I-NEXT:    neg a1, a0
+; RV32I-NEXT:    neg a0, a3
+; RV32I-NEXT:    ret
+;
+; RV32ZBB-LABEL: expanded_neg_abs64_unsigned:
+; RV32ZBB:       # %bb.0:
+; RV32ZBB-NEXT:    snez a2, a0
+; RV32ZBB-NEXT:    neg a3, a1
+; RV32ZBB-NEXT:    sub a2, a3, a2
+; RV32ZBB-NEXT:    neg a3, a0
+; RV32ZBB-NEXT:    beq a2, a1, .LBB9_2
+; RV32ZBB-NEXT:  # %bb.1:
+; RV32ZBB-NEXT:    sltu a4, a1, a2
+; RV32ZBB-NEXT:    beqz a4, .LBB9_3
+; RV32ZBB-NEXT:    j .LBB9_4
+; RV32ZBB-NEXT:  .LBB9_2:
+; RV32ZBB-NEXT:    sltu a4, a0, a3
+; RV32ZBB-NEXT:    bnez a4, .LBB9_4
+; RV32ZBB-NEXT:  .LBB9_3:
+; RV32ZBB-NEXT:    mv a2, a1
+; RV32ZBB-NEXT:    mv a3, a0
+; RV32ZBB-NEXT:  .LBB9_4:
+; RV32ZBB-NEXT:    snez a0, a3
+; RV32ZBB-NEXT:    add a0, a2, a0
+; RV32ZBB-NEXT:    neg a1, a0
+; RV32ZBB-NEXT:    neg a0, a3
+; RV32ZBB-NEXT:    ret
+;
+; RV64I-LABEL: expanded_neg_abs64_unsigned:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    neg a1, a0
+; RV64I-NEXT:    bltu a0, a1, .LBB9_2
+; RV64I-NEXT:  # %bb.1:
+; RV64I-NEXT:    mv a1, a0
+; RV64I-NEXT:  .LBB9_2:
+; RV64I-NEXT:    neg a0, a1
+; RV64I-NEXT:    ret
+;
+; RV64ZBB-LABEL: expanded_neg_abs64_unsigned:
+; RV64ZBB:       # %bb.0:
+; RV64ZBB-NEXT:    neg a1, a0
+; RV64ZBB-NEXT:    minu a0, a0, a1
+; RV64ZBB-NEXT:    ret
+  %n = sub i64 0, %x
+  %t = call i64 @llvm.umax.i64(i64 %n, i64 %x)
+  %r = sub i64 0, %t
+  ret i64 %r
+}
+
+define i32 @expanded_neg_inv_abs32(i32 %x) {
+; RV32I-LABEL: expanded_neg_inv_abs32:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    neg a1, a0
+; RV32I-NEXT:    blt a1, a0, .LBB10_2
+; RV32I-NEXT:  # %bb.1:
+; RV32I-NEXT:    mv a1, a0
+; RV32I-NEXT:  .LBB10_2:
+; RV32I-NEXT:    neg a0, a1
+; RV32I-NEXT:    ret
+;
+; RV32ZBB-LABEL: expanded_neg_inv_abs32:
+; RV32ZBB:       # %bb.0:
+; RV32ZBB-NEXT:    neg a1, a0
+; RV32ZBB-NEXT:    max a0, a0, a1
+; RV32ZBB-NEXT:    ret
+;
+; RV64I-LABEL: expanded_neg_inv_abs32:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    sext.w a1, a0
+; RV64I-NEXT:    negw a0, a0
+; RV64I-NEXT:    blt a0, a1, .LBB10_2
+; RV64I-NEXT:  # %bb.1:
+; RV64I-NEXT:    mv a0, a1
+; RV64I-NEXT:  .LBB10_2:
+; RV64I-NEXT:    negw a0, a0
+; RV64I-NEXT:    ret
+;
+; RV64ZBB-LABEL: expanded_neg_inv_abs32:
+; RV64ZBB:       # %bb.0:
+; RV64ZBB-NEXT:    sext.w a1, a0
+; RV64ZBB-NEXT:    negw a0, a0
+; RV64ZBB-NEXT:    min a0, a0, a1
+; RV64ZBB-NEXT:    negw a0, a0
+; RV64ZBB-NEXT:    ret
+  %n = sub i32 0, %x
+  %t = call i32 @llvm.smin.i32(i32 %n, i32 %x)
+  %r = sub i32 0, %t
+  ret i32 %r
+}
+
+define i32 @expanded_neg_inv_abs32_unsigned(i32 %x) {
+; RV32I-LABEL: expanded_neg_inv_abs32_unsigned:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    neg a1, a0
+; RV32I-NEXT:    bltu a1, a0, .LBB11_2
+; RV32I-NEXT:  # %bb.1:
+; RV32I-NEXT:    mv a1, a0
+; RV32I-NEXT:  .LBB11_2:
+; RV32I-NEXT:    neg a0, a1
+; RV32I-NEXT:    ret
+;
+; RV32ZBB-LABEL: expanded_neg_inv_abs32_unsigned:
+; RV32ZBB:       # %bb.0:
+; RV32ZBB-NEXT:    neg a1, a0
+; RV32ZBB-NEXT:    maxu a0, a0, a1
+; RV32ZBB-NEXT:    ret
+;
+; RV64I-LABEL: expanded_neg_inv_abs32_unsigned:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    sext.w a1, a0
+; RV64I-NEXT:    negw a0, a0
+; RV64I-NEXT:    bltu a0, a1, .LBB11_2
+; RV64I-NEXT:  # %bb.1:
+; RV64I-NEXT:    mv a0, a1
+; RV64I-NEXT:  .LBB11_2:
+; RV64I-NEXT:    negw a0, a0
+; RV64I-NEXT:    ret
+;
+; RV64ZBB-LABEL: expanded_neg_inv_abs32_unsigned:
+; RV64ZBB:       # %bb.0:
+; RV64ZBB-NEXT:    sext.w a1, a0
+; RV64ZBB-NEXT:    negw a0, a0
+; RV64ZBB-NEXT:    minu a0, a0, a1
+; RV64ZBB-NEXT:    negw a0, a0
+; RV64ZBB-NEXT:    ret
+  %n = sub i32 0, %x
+  %t = call i32 @llvm.umin.i32(i32 %n, i32 %x)
+  %r = sub i32 0, %t
+  ret i32 %r
+}
+
+define i64 @expanded_neg_inv_abs64(i64 %x) {
+; RV32I-LABEL: expanded_neg_inv_abs64:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    snez a2, a0
+; RV32I-NEXT:    neg a3, a1
+; RV32I-NEXT:    sub a2, a3, a2
+; RV32I-NEXT:    neg a3, a0
+; RV32I-NEXT:    beq a2, a1, .LBB12_2
+; RV32I-NEXT:  # %bb.1:
+; RV32I-NEXT:    slt a4, a2, a1
+; RV32I-NEXT:    beqz a4, .LBB12_3
+; RV32I-NEXT:    j .LBB12_4
+; RV32I-NEXT:  .LBB12_2:
+; RV32I-NEXT:    sltu a4, a3, a0
+; RV32I-NEXT:    bnez a4, .LBB12_4
+; RV32I-NEXT:  .LBB12_3:
+; RV32I-NEXT:    mv a2, a1
+; RV32I-NEXT:    mv a3, a0
+; RV32I-NEXT:  .LBB12_4:
+; RV32I-NEXT:    snez a0, a3
+; RV32I-NEXT:    add a0, a2, a0
+; RV32I-NEXT:    neg a1, a0
+; RV32I-NEXT:    neg a0, a3
+; RV32I-NEXT:    ret
+;
+; RV32ZBB-LABEL: expanded_neg_inv_abs64:
+; RV32ZBB:       # %bb.0:
+; RV32ZBB-NEXT:    snez a2, a0
+; RV32ZBB-NEXT:    neg a3, a1
+; RV32ZBB-NEXT:    sub a2, a3, a2
+; RV32ZBB-NEXT:    neg a3, a0
+; RV32ZBB-NEXT:    beq a2, a1, .LBB12_2
+; RV32ZBB-NEXT:  # %bb.1:
+; RV32ZBB-NEXT:    slt a4, a2, a1
+; RV32ZBB-NEXT:    beqz a4, .LBB12_3
+; RV32ZBB-NEXT:    j .LBB12_4
+; RV32ZBB-NEXT:  .LBB12_2:
+; RV32ZBB-NEXT:    sltu a4, a3, a0
+; RV32ZBB-NEXT:    bnez a4, .LBB12_4
+; RV32ZBB-NEXT:  .LBB12_3:
+; RV32ZBB-NEXT:    mv a2, a1
+; RV32ZBB-NEXT:    mv a3, a0
+; RV32ZBB-NEXT:  .LBB12_4:
+; RV32ZBB-NEXT:    snez a0, a3
+; RV32ZBB-NEXT:    add a0, a2, a0
+; RV32ZBB-NEXT:    neg a1, a0
+; RV32ZBB-NEXT:    neg a0, a3
+; RV32ZBB-NEXT:    ret
+;
+; RV64I-LABEL: expanded_neg_inv_abs64:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    neg a1, a0
+; RV64I-NEXT:    blt a1, a0, .LBB12_2
+; RV64I-NEXT:  # %bb.1:
+; RV64I-NEXT:    mv a1, a0
+; RV64I-NEXT:  .LBB12_2:
+; RV64I-NEXT:    neg a0, a1
+; RV64I-NEXT:    ret
+;
+; RV64ZBB-LABEL: expanded_neg_inv_abs64:
+; RV64ZBB:       # %bb.0:
+; RV64ZBB-NEXT:    neg a1, a0
+; RV64ZBB-NEXT:    max a0, a0, a1
+; RV64ZBB-NEXT:    ret
+  %n = sub i64 0, %x
+  %t = call i64 @llvm.smin.i64(i64 %n, i64 %x)
+  %r = sub i64 0, %t
+  ret i64 %r
+}
+
+define i64 @expanded_neg_inv_abs64_unsigned(i64 %x) {
+; RV32I-LABEL: expanded_neg_inv_abs64_unsigned:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    snez a2, a0
+; RV32I-NEXT:    neg a3, a1
+; RV32I-NEXT:    sub a2, a3, a2
+; RV32I-NEXT:    neg a3, a0
+; RV32I-NEXT:    beq a2, a1, .LBB13_2
+; RV32I-NEXT:  # %bb.1:
+; RV32I-NEXT:    sltu a4, a2, a1
+; RV32I-NEXT:    beqz a4, .LBB13_3
+; RV32I-NEXT:    j .LBB13_4
+; RV32I-NEXT:  .LBB13_2:
+; RV32I-NEXT:    sltu a4, a3, a0
+; RV32I-NEXT:    bnez a4, .LBB13_4
+; RV32I-NEXT:  .LBB13_3:
+; RV32I-NEXT:    mv a2, a1
+; RV32I-NEXT:    mv a3, a0
+; RV32I-NEXT:  .LBB13_4:
+; RV32I-NEXT:    snez a0, a3
+; RV32I-NEXT:    add a0, a2, a0
+; RV32I-NEXT:    neg a1, a0
+; RV32I-NEXT:    neg a0, a3
+; RV32I-NEXT:    ret
+;
+; RV32ZBB-LABEL: expanded_neg_inv_abs64_unsigned:
+; RV32ZBB:       # %bb.0:
+; RV32ZBB-NEXT:    snez a2, a0
+; RV32ZBB-NEXT:    neg a3, a1
+; RV32ZBB-NEXT:    sub a2, a3, a2
+; RV32ZBB-NEXT:    neg a3, a0
+; RV32ZBB-NEXT:    beq a2, a1, .LBB13_2
+; RV32ZBB-NEXT:  # %bb.1:
+; RV32ZBB-NEXT:    sltu a4, a2, a1
+; RV32ZBB-NEXT:    beqz a4, .LBB13_3
+; RV32ZBB-NEXT:    j .LBB13_4
+; RV32ZBB-NEXT:  .LBB13_2:
+; RV32ZBB-NEXT:    sltu a4, a3, a0
+; RV32ZBB-NEXT:    bnez a4, .LBB13_4
+; RV32ZBB-NEXT:  .LBB13_3:
+; RV32ZBB-NEXT:    mv a2, a1
+; RV32ZBB-NEXT:    mv a3, a0
+; RV32ZBB-NEXT:  .LBB13_4:
+; RV32ZBB-NEXT:    snez a0, a3
+; RV32ZBB-NEXT:    add a0, a2, a0
+; RV32ZBB-NEXT:    neg a1, a0
+; RV32ZBB-NEXT:    neg a0, a3
+; RV32ZBB-NEXT:    ret
+;
+; RV64I-LABEL: expanded_neg_inv_abs64_unsigned:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    neg a1, a0
+; RV64I-NEXT:    bltu a1, a0, .LBB13_2
+; RV64I-NEXT:  # %bb.1:
+; RV64I-NEXT:    mv a1, a0
+; RV64I-NEXT:  .LBB13_2:
+; RV64I-NEXT:    neg a0, a1
+; RV64I-NEXT:    ret
+;
+; RV64ZBB-LABEL: expanded_neg_inv_abs64_unsigned:
+; RV64ZBB:       # %bb.0:
+; RV64ZBB-NEXT:    neg a1, a0
+; RV64ZBB-NEXT:    maxu a0, a0, a1
+; RV64ZBB-NEXT:    ret
+  %n = sub i64 0, %x
+  %t = call i64 @llvm.umin.i64(i64 %n, i64 %x)
+  %r = sub i64 0, %t
+  ret i64 %r
+}
diff --git a/llvm/test/CodeGen/RISCV/rv32xtheadba.ll b/llvm/test/CodeGen/RISCV/rv32xtheadba.ll
index 332e497..44ab0e1 100644
--- a/llvm/test/CodeGen/RISCV/rv32xtheadba.ll
+++ b/llvm/test/CodeGen/RISCV/rv32xtheadba.ll
@@ -1,8 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --extra_scrub
 ; RUN: llc -mtriple=riscv32 -mattr=+m -verify-machineinstrs < %s \
-; RUN:   | FileCheck %s -check-prefixes=RV32I
+; RUN:   | FileCheck %s -check-prefixes=CHECK,RV32I
 ; RUN: llc -mtriple=riscv32 -mattr=+m,+xtheadba -verify-machineinstrs < %s \
-; RUN:   | FileCheck %s -check-prefixes=RV32XTHEADBA
+; RUN:   | FileCheck %s -check-prefixes=CHECK,RV32XTHEADBA
 
 define signext i16 @th_addsl_1(i64 %0, ptr %1) {
 ; RV32I-LABEL: th_addsl_1:
@@ -324,3 +324,563 @@ define i32 @mul288(i32 %a) {
   %c = mul i32 %a, 288
   ret i32 %c
 }
+
+define i32 @mul258(i32 %a) {
+; RV32I-LABEL: mul258:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    li a1, 258
+; RV32I-NEXT:    mul a0, a0, a1
+; RV32I-NEXT:    ret
+;
+; RV32XTHEADBA-LABEL: mul258:
+; RV32XTHEADBA:       # %bb.0:
+; RV32XTHEADBA-NEXT:    slli a1, a0, 8
+; RV32XTHEADBA-NEXT:    th.addsl a0, a1, a0, 1
+; RV32XTHEADBA-NEXT:    ret
+  %c = mul i32 %a, 258
+  ret i32 %c
+}
+
+define i32 @mul260(i32 %a) {
+; RV32I-LABEL: mul260:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    li a1, 260
+; RV32I-NEXT:    mul a0, a0, a1
+; RV32I-NEXT:    ret
+;
+; RV32XTHEADBA-LABEL: mul260:
+; RV32XTHEADBA:       # %bb.0:
+; RV32XTHEADBA-NEXT:    slli a1, a0, 8
+; RV32XTHEADBA-NEXT:    th.addsl a0, a1, a0, 2
+; RV32XTHEADBA-NEXT:    ret
+  %c = mul i32 %a, 260
+  ret i32 %c
+}
+
+define i32 @mul264(i32 %a) {
+; RV32I-LABEL: mul264:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    li a1, 264
+; RV32I-NEXT:    mul a0, a0, a1
+; RV32I-NEXT:    ret
+;
+; RV32XTHEADBA-LABEL: mul264:
+; RV32XTHEADBA:       # %bb.0:
+; RV32XTHEADBA-NEXT:    slli a1, a0, 8
+; RV32XTHEADBA-NEXT:    th.addsl a0, a1, a0, 3
+; RV32XTHEADBA-NEXT:    ret
+  %c = mul i32 %a, 264
+  ret i32 %c
+}
+
+define i32 @mul11(i32 %a) {
+; RV32I-LABEL: mul11:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    li a1, 11
+; RV32I-NEXT:    mul a0, a0, a1
+; RV32I-NEXT:    ret
+;
+; RV32XTHEADBA-LABEL: mul11:
+; RV32XTHEADBA:       # %bb.0:
+; RV32XTHEADBA-NEXT:    th.addsl a1, a0, a0, 2
+; RV32XTHEADBA-NEXT:    th.addsl a0, a0, a1, 1
+; RV32XTHEADBA-NEXT:    ret
+  %c = mul i32 %a, 11
+  ret i32 %c
+}
+
+define i32 @mul19(i32 %a) {
+; RV32I-LABEL: mul19:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    li a1, 19
+; RV32I-NEXT:    mul a0, a0, a1
+; RV32I-NEXT:    ret
+;
+; RV32XTHEADBA-LABEL: mul19:
+; RV32XTHEADBA:       # %bb.0:
+; RV32XTHEADBA-NEXT:    th.addsl a1, a0, a0, 3
+; RV32XTHEADBA-NEXT:    th.addsl a0, a0, a1, 1
+; RV32XTHEADBA-NEXT:    ret
+  %c = mul i32 %a, 19
+  ret i32 %c
+}
+
+define i32 @mul13(i32 %a) {
+; RV32I-LABEL: mul13:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    li a1, 13
+; RV32I-NEXT:    mul a0, a0, a1
+; RV32I-NEXT:    ret
+;
+; RV32XTHEADBA-LABEL: mul13:
+; RV32XTHEADBA:       # %bb.0:
+; RV32XTHEADBA-NEXT:    th.addsl a1, a0, a0, 1
+; RV32XTHEADBA-NEXT:    th.addsl a0, a0, a1, 2
+; RV32XTHEADBA-NEXT:    ret
+  %c = mul i32 %a, 13
+  ret i32 %c
+}
+
+define i32 @mul21(i32 %a) {
+; RV32I-LABEL: mul21:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    li a1, 21
+; RV32I-NEXT:    mul a0, a0, a1
+; RV32I-NEXT:    ret
+;
+; RV32XTHEADBA-LABEL: mul21:
+; RV32XTHEADBA:       # %bb.0:
+; RV32XTHEADBA-NEXT:    th.addsl a1, a0, a0, 2
+; RV32XTHEADBA-NEXT:    th.addsl a0, a0, a1, 2
+; RV32XTHEADBA-NEXT:    ret
+  %c = mul i32 %a, 21
+  ret i32 %c
+}
+
+define i32 @mul37(i32 %a) {
+; RV32I-LABEL: mul37:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    li a1, 37
+; RV32I-NEXT:    mul a0, a0, a1
+; RV32I-NEXT:    ret
+;
+; RV32XTHEADBA-LABEL: mul37:
+; RV32XTHEADBA:       # %bb.0:
+; RV32XTHEADBA-NEXT:    th.addsl a1, a0, a0, 3
+; RV32XTHEADBA-NEXT:    th.addsl a0, a0, a1, 2
+; RV32XTHEADBA-NEXT:    ret
+  %c = mul i32 %a, 37
+  ret i32 %c
+}
+
+define i32 @mul25(i32 %a) {
+; RV32I-LABEL: mul25:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    li a1, 25
+; RV32I-NEXT:    mul a0, a0, a1
+; RV32I-NEXT:    ret
+;
+; RV32XTHEADBA-LABEL: mul25:
+; RV32XTHEADBA:       # %bb.0:
+; RV32XTHEADBA-NEXT:    th.addsl a0, a0, a0, 2
+; RV32XTHEADBA-NEXT:    th.addsl a0, a0, a0, 2
+; RV32XTHEADBA-NEXT:    ret
+  %c = mul i32 %a, 25
+  ret i32 %c
+}
+
+define i32 @mul41(i32 %a) {
+; RV32I-LABEL: mul41:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    li a1, 41
+; RV32I-NEXT:    mul a0, a0, a1
+; RV32I-NEXT:    ret
+;
+; RV32XTHEADBA-LABEL: mul41:
+; RV32XTHEADBA:       # %bb.0:
+; RV32XTHEADBA-NEXT:    th.addsl a1, a0, a0, 2
+; RV32XTHEADBA-NEXT:    th.addsl a0, a0, a1, 3
+; RV32XTHEADBA-NEXT:    ret
+  %c = mul i32 %a, 41
+  ret i32 %c
+}
+
+define i32 @mul73(i32 %a) {
+; RV32I-LABEL: mul73:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    li a1, 73
+; RV32I-NEXT:    mul a0, a0, a1
+; RV32I-NEXT:    ret
+;
+; RV32XTHEADBA-LABEL: mul73:
+; RV32XTHEADBA:       # %bb.0:
+; RV32XTHEADBA-NEXT:    th.addsl a1, a0, a0, 3
+; RV32XTHEADBA-NEXT:    th.addsl a0, a0, a1, 3
+; RV32XTHEADBA-NEXT:    ret
+  %c = mul i32 %a, 73
+  ret i32 %c
+}
+
+define i32 @mul27(i32 %a) {
+; RV32I-LABEL: mul27:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    li a1, 27
+; RV32I-NEXT:    mul a0, a0, a1
+; RV32I-NEXT:    ret
+;
+; RV32XTHEADBA-LABEL: mul27:
+; RV32XTHEADBA:       # %bb.0:
+; RV32XTHEADBA-NEXT:    th.addsl a0, a0, a0, 1
+; RV32XTHEADBA-NEXT:    th.addsl a0, a0, a0, 3
+; RV32XTHEADBA-NEXT:    ret
+  %c = mul i32 %a, 27
+  ret i32 %c
+}
+
+define i32 @mul45(i32 %a) {
+; RV32I-LABEL: mul45:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    li a1, 45
+; RV32I-NEXT:    mul a0, a0, a1
+; RV32I-NEXT:    ret
+;
+; RV32XTHEADBA-LABEL: mul45:
+; RV32XTHEADBA:       # %bb.0:
+; RV32XTHEADBA-NEXT:    th.addsl a0, a0, a0, 2
+; RV32XTHEADBA-NEXT:    th.addsl a0, a0, a0, 3
+; RV32XTHEADBA-NEXT:    ret
+  %c = mul i32 %a, 45
+  ret i32 %c
+}
+
+define i32 @mul81(i32 %a) {
+; RV32I-LABEL: mul81:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    li a1, 81
+; RV32I-NEXT:    mul a0, a0, a1
+; RV32I-NEXT:    ret
+;
+; RV32XTHEADBA-LABEL: mul81:
+; RV32XTHEADBA:       # %bb.0:
+; RV32XTHEADBA-NEXT:    th.addsl a0, a0, a0, 3
+; RV32XTHEADBA-NEXT:    th.addsl a0, a0, a0, 3
+; RV32XTHEADBA-NEXT:    ret
+  %c = mul i32 %a, 81
+  ret i32 %c
+}
+
+define i32 @mul4098(i32 %a) {
+; RV32I-LABEL: mul4098:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    slli a1, a0, 1
+; RV32I-NEXT:    slli a0, a0, 12
+; RV32I-NEXT:    add a0, a0, a1
+; RV32I-NEXT:    ret
+;
+; RV32XTHEADBA-LABEL: mul4098:
+; RV32XTHEADBA:       # %bb.0:
+; RV32XTHEADBA-NEXT:    slli a1, a0, 12
+; RV32XTHEADBA-NEXT:    th.addsl a0, a1, a0, 1
+; RV32XTHEADBA-NEXT:    ret
+  %c = mul i32 %a, 4098
+  ret i32 %c
+}
+
+define i32 @mul4100(i32 %a) {
+; RV32I-LABEL: mul4100:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    slli a1, a0, 2
+; RV32I-NEXT:    slli a0, a0, 12
+; RV32I-NEXT:    add a0, a0, a1
+; RV32I-NEXT:    ret
+;
+; RV32XTHEADBA-LABEL: mul4100:
+; RV32XTHEADBA:       # %bb.0:
+; RV32XTHEADBA-NEXT:    slli a1, a0, 12
+; RV32XTHEADBA-NEXT:    th.addsl a0, a1, a0, 2
+; RV32XTHEADBA-NEXT:    ret
+  %c = mul i32 %a, 4100
+  ret i32 %c
+}
+
+define i32 @mul4104(i32 %a) {
+; RV32I-LABEL: mul4104:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    slli a1, a0, 3
+; RV32I-NEXT:    slli a0, a0, 12
+; RV32I-NEXT:    add a0, a0, a1
+; RV32I-NEXT:    ret
+;
+; RV32XTHEADBA-LABEL: mul4104:
+; RV32XTHEADBA:       # %bb.0:
+; RV32XTHEADBA-NEXT:    slli a1, a0, 12
+; RV32XTHEADBA-NEXT:    th.addsl a0, a1, a0, 3
+; RV32XTHEADBA-NEXT:    ret
+  %c = mul i32 %a, 4104
+  ret i32 %c
+}
+
+define i32 @add4104(i32 %a) {
+; RV32I-LABEL: add4104:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    lui a1, 1
+; RV32I-NEXT:    addi a1, a1, 8
+; RV32I-NEXT:    add a0, a0, a1
+; RV32I-NEXT:    ret
+;
+; RV32XTHEADBA-LABEL: add4104:
+; RV32XTHEADBA:       # %bb.0:
+; RV32XTHEADBA-NEXT:    li a1, 1026
+; RV32XTHEADBA-NEXT:    th.addsl a0, a0, a1, 2
+; RV32XTHEADBA-NEXT:    ret
+  %c = add i32 %a, 4104
+  ret i32 %c
+}
+
+define i32 @add8208(i32 %a) {
+; RV32I-LABEL: add8208:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    lui a1, 2
+; RV32I-NEXT:    addi a1, a1, 16
+; RV32I-NEXT:    add a0, a0, a1
+; RV32I-NEXT:    ret
+;
+; RV32XTHEADBA-LABEL: add8208:
+; RV32XTHEADBA:       # %bb.0:
+; RV32XTHEADBA-NEXT:    li a1, 1026
+; RV32XTHEADBA-NEXT:    th.addsl a0, a0, a1, 3
+; RV32XTHEADBA-NEXT:    ret
+  %c = add i32 %a, 8208
+  ret i32 %c
+}
+
+define i32 @add8192(i32 %a) {
+; CHECK-LABEL: add8192:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    lui a1, 2
+; CHECK-NEXT:    add a0, a0, a1
+; CHECK-NEXT:    ret
+  %c = add i32 %a, 8192
+  ret i32 %c
+}
+
+define i32 @addshl_5_6(i32 %a, i32 %b) {
+; CHECK-LABEL: addshl_5_6:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    slli a0, a0, 5
+; CHECK-NEXT:    slli a1, a1, 6
+; CHECK-NEXT:    add a0, a0, a1
+; CHECK-NEXT:    ret
+  %c = shl i32 %a, 5
+  %d = shl i32 %b, 6
+  %e = add i32 %c, %d
+  ret i32 %e
+}
+
+define i32 @addshl_5_7(i32 %a, i32 %b) {
+; CHECK-LABEL: addshl_5_7:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    slli a0, a0, 5
+; CHECK-NEXT:    slli a1, a1, 7
+; CHECK-NEXT:    add a0, a0, a1
+; CHECK-NEXT:    ret
+  %c = shl i32 %a, 5
+  %d = shl i32 %b, 7
+  %e = add i32 %c, %d
+  ret i32 %e
+}
+
+define i32 @addshl_5_8(i32 %a, i32 %b) {
+; CHECK-LABEL: addshl_5_8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    slli a0, a0, 5
+; CHECK-NEXT:    slli a1, a1, 8
+; CHECK-NEXT:    add a0, a0, a1
+; CHECK-NEXT:    ret
+  %c = shl i32 %a, 5
+  %d = shl i32 %b, 8
+  %e = add i32 %c, %d
+  ret i32 %e
+}
+
+define i32 @srli_1_sh2add(ptr %0, i32 %1) {
+; RV32I-LABEL: srli_1_sh2add:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    slli a1, a1, 1
+; RV32I-NEXT:    andi a1, a1, -4
+; RV32I-NEXT:    add a0, a0, a1
+; RV32I-NEXT:    lw a0, 0(a0)
+; RV32I-NEXT:    ret
+;
+; RV32XTHEADBA-LABEL: srli_1_sh2add:
+; RV32XTHEADBA:       # %bb.0:
+; RV32XTHEADBA-NEXT:    srli a1, a1, 1
+; RV32XTHEADBA-NEXT:    th.addsl a0, a0, a1, 2
+; RV32XTHEADBA-NEXT:    lw a0, 0(a0)
+; RV32XTHEADBA-NEXT:    ret
+  %3 = lshr i32 %1, 1
+  %4 = getelementptr inbounds i32, ptr %0, i32 %3
+  %5 = load i32, ptr %4, align 4
+  ret i32 %5
+}
+
+define i64 @srli_2_sh3add(ptr %0, i32 %1) {
+; RV32I-LABEL: srli_2_sh3add:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    slli a1, a1, 1
+; RV32I-NEXT:    andi a1, a1, -8
+; RV32I-NEXT:    add a1, a0, a1
+; RV32I-NEXT:    lw a0, 0(a1)
+; RV32I-NEXT:    lw a1, 4(a1)
+; RV32I-NEXT:    ret
+;
+; RV32XTHEADBA-LABEL: srli_2_sh3add:
+; RV32XTHEADBA:       # %bb.0:
+; RV32XTHEADBA-NEXT:    srli a1, a1, 2
+; RV32XTHEADBA-NEXT:    th.addsl a1, a0, a1, 3
+; RV32XTHEADBA-NEXT:    lw a0, 0(a1)
+; RV32XTHEADBA-NEXT:    lw a1, 4(a1)
+; RV32XTHEADBA-NEXT:    ret
+  %3 = lshr i32 %1, 2
+  %4 = getelementptr inbounds i64, ptr %0, i32 %3
+  %5 = load i64, ptr %4, align 8
+  ret i64 %5
+}
+
+define signext i16 @srli_2_sh1add(ptr %0, i32 %1) {
+; RV32I-LABEL: srli_2_sh1add:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    srli a1, a1, 1
+; RV32I-NEXT:    andi a1, a1, -2
+; RV32I-NEXT:    add a0, a0, a1
+; RV32I-NEXT:    lh a0, 0(a0)
+; RV32I-NEXT:    ret
+;
+; RV32XTHEADBA-LABEL: srli_2_sh1add:
+; RV32XTHEADBA:       # %bb.0:
+; RV32XTHEADBA-NEXT:    srli a1, a1, 2
+; RV32XTHEADBA-NEXT:    th.addsl a0, a0, a1, 1
+; RV32XTHEADBA-NEXT:    lh a0, 0(a0)
+; RV32XTHEADBA-NEXT:    ret
+  %3 = lshr i32 %1, 2
+  %4 = getelementptr inbounds i16, ptr %0, i32 %3
+  %5 = load i16, ptr %4, align 2
+  ret i16 %5
+}
+
+define i32 @srli_3_sh2add(ptr %0, i32 %1) {
+; RV32I-LABEL: srli_3_sh2add:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    srli a1, a1, 1
+; RV32I-NEXT:    andi a1, a1, -4
+; RV32I-NEXT:    add a0, a0, a1
+; RV32I-NEXT:    lw a0, 0(a0)
+; RV32I-NEXT:    ret
+;
+; RV32XTHEADBA-LABEL: srli_3_sh2add:
+; RV32XTHEADBA:       # %bb.0:
+; RV32XTHEADBA-NEXT:    srli a1, a1, 3
+; RV32XTHEADBA-NEXT:    th.addsl a0, a0, a1, 2
+; RV32XTHEADBA-NEXT:    lw a0, 0(a0)
+; RV32XTHEADBA-NEXT:    ret
+  %3 = lshr i32 %1, 3
+  %4 = getelementptr inbounds i32, ptr %0, i32 %3
+  %5 = load i32, ptr %4, align 4
+  ret i32 %5
+}
+
+define i64 @srli_4_sh3add(ptr %0, i32 %1) {
+; RV32I-LABEL: srli_4_sh3add:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    srli a1, a1, 1
+; RV32I-NEXT:    andi a1, a1, -8
+; RV32I-NEXT:    add a1, a0, a1
+; RV32I-NEXT:    lw a0, 0(a1)
+; RV32I-NEXT:    lw a1, 4(a1)
+; RV32I-NEXT:    ret
+;
+; RV32XTHEADBA-LABEL: srli_4_sh3add:
+; RV32XTHEADBA:       # %bb.0:
+; RV32XTHEADBA-NEXT:    srli a1, a1, 4
+; RV32XTHEADBA-NEXT:    th.addsl a1, a0, a1, 3
+; RV32XTHEADBA-NEXT:    lw a0, 0(a1)
+; RV32XTHEADBA-NEXT:    lw a1, 4(a1)
+; RV32XTHEADBA-NEXT:    ret
+  %3 = lshr i32 %1, 4
+  %4 = getelementptr inbounds i64, ptr %0, i32 %3
+  %5 = load i64, ptr %4, align 8
+  ret i64 %5
+}
+
+define i32 @mul_neg1(i32 %a) {
+; CHECK-LABEL: mul_neg1:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    neg a0, a0
+; CHECK-NEXT:    ret
+  %c = mul i32 %a, -1
+  ret i32 %c
+}
+
+define i32 @mul_neg2(i32 %a) {
+; CHECK-LABEL: mul_neg2:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    slli a0, a0, 1
+; CHECK-NEXT:    neg a0, a0
+; CHECK-NEXT:    ret
+  %c = mul i32 %a, -2
+  ret i32 %c
+}
+
+define i32 @mul_neg3(i32 %a) {
+; RV32I-LABEL: mul_neg3:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    slli a1, a0, 1
+; RV32I-NEXT:    neg a0, a0
+; RV32I-NEXT:    sub a0, a0, a1
+; RV32I-NEXT:    ret
+;
+; RV32XTHEADBA-LABEL: mul_neg3:
+; RV32XTHEADBA:       # %bb.0:
+; RV32XTHEADBA-NEXT:    th.addsl a0, a0, a0, 1
+; RV32XTHEADBA-NEXT:    neg a0, a0
+; RV32XTHEADBA-NEXT:    ret
+  %c = mul i32 %a, -3
+  ret i32 %c
+}
+
+define i32 @mul_neg4(i32 %a) {
+; CHECK-LABEL: mul_neg4:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    slli a0, a0, 2
+; CHECK-NEXT:    neg a0, a0
+; CHECK-NEXT:    ret
+  %c = mul i32 %a, -4
+  ret i32 %c
+}
+
+define i32 @mul_neg5(i32 %a) {
+; RV32I-LABEL: mul_neg5:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    slli a1, a0, 2
+; RV32I-NEXT:    neg a0, a0
+; RV32I-NEXT:    sub a0, a0, a1
+; RV32I-NEXT:    ret
+;
+; RV32XTHEADBA-LABEL: mul_neg5:
+; RV32XTHEADBA:       # %bb.0:
+; RV32XTHEADBA-NEXT:    th.addsl a0, a0, a0, 2
+; RV32XTHEADBA-NEXT:    neg a0, a0
+; RV32XTHEADBA-NEXT:    ret
+  %c = mul i32 %a, -5
+  ret i32 %c
+}
+
+define i32 @mul_neg6(i32 %a) {
+; CHECK-LABEL: mul_neg6:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a1, -6
+; CHECK-NEXT:    mul a0, a0, a1
+; CHECK-NEXT:    ret
+  %c = mul i32 %a, -6
+  ret i32 %c
+}
+
+define i32 @mul_neg7(i32 %a) {
+; CHECK-LABEL: mul_neg7:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    slli a1, a0, 3
+; CHECK-NEXT:    sub a0, a0, a1
+; CHECK-NEXT:    ret
+  %c = mul i32 %a, -7
+  ret i32 %c
+}
+
+define i32 @mul_neg8(i32 %a) {
+; CHECK-LABEL: mul_neg8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    slli a0, a0, 3
+; CHECK-NEXT:    neg a0, a0
+; CHECK-NEXT:    ret
+  %c = mul i32 %a, -8
+  ret i32 %c
+}
diff --git a/llvm/test/CodeGen/RISCV/rv64xtheadba.ll b/llvm/test/CodeGen/RISCV/rv64xtheadba.ll
index 2d44ffb..2272c17b 100644
--- a/llvm/test/CodeGen/RISCV/rv64xtheadba.ll
+++ b/llvm/test/CodeGen/RISCV/rv64xtheadba.ll
@@ -1,8 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --extra_scrub
 ; RUN: llc -mtriple=riscv64 -mattr=+m -verify-machineinstrs < %s \
-; RUN:   | FileCheck %s -check-prefixes=RV64I
+; RUN:   | FileCheck %s -check-prefixes=CHECK,RV64I
 ; RUN: llc -mtriple=riscv64 -mattr=+m,+xtheadba -verify-machineinstrs < %s \
-; RUN:   | FileCheck %s -check-prefixes=RV64XTHEADBA
+; RUN:   | FileCheck %s -check-prefixes=CHECK,RV64XTHEADBA
 
 define signext i16 @th_addsl_1(i64 %0, ptr %1) {
 ; RV64I-LABEL: th_addsl_1:
@@ -109,6 +109,25 @@ define i64 @addmul6(i64 %a, i64 %b) {
   ret i64 %d
 }
 
+define i64 @disjointormul6(i64 %a, i64 %b) {
+; RV64I-LABEL: disjointormul6:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    slli a2, a0, 1
+; RV64I-NEXT:    slli a0, a0, 3
+; RV64I-NEXT:    sub a0, a0, a2
+; RV64I-NEXT:    or a0, a0, a1
+; RV64I-NEXT:    ret
+;
+; RV64XTHEADBA-LABEL: disjointormul6:
+; RV64XTHEADBA:       # %bb.0:
+; RV64XTHEADBA-NEXT:    th.addsl a0, a0, a0, 1
+; RV64XTHEADBA-NEXT:    th.addsl a0, a1, a0, 1
+; RV64XTHEADBA-NEXT:    ret
+  %c = mul i64 %a, 6
+  %d = or disjoint i64 %c, %b
+  ret i64 %d
+}
+
 define i64 @addmul10(i64 %a, i64 %b) {
 ; RV64I-LABEL: addmul10:
 ; RV64I:       # %bb.0:
@@ -182,6 +201,18 @@ define i64 @addmul20(i64 %a, i64 %b) {
   ret i64 %d
 }
 
+define i64 @addmul22(i64 %a, i64 %b) {
+; CHECK-LABEL: addmul22:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a2, 22
+; CHECK-NEXT:    mul a0, a0, a2
+; CHECK-NEXT:    add a0, a0, a1
+; CHECK-NEXT:    ret
+  %c = mul i64 %a, 22
+  %d = add i64 %c, %b
+  ret i64 %d
+}
+
 define i64 @addmul24(i64 %a, i64 %b) {
 ; RV64I-LABEL: addmul24:
 ; RV64I:       # %bb.0:
@@ -255,6 +286,461 @@ define i64 @addmul72(i64 %a, i64 %b) {
   ret i64 %d
 }
 
+define i64 @mul50(i64 %a) {
+; RV64I-LABEL: mul50:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    li a1, 50
+; RV64I-NEXT:    mul a0, a0, a1
+; RV64I-NEXT:    ret
+;
+; RV64XTHEADBA-LABEL: mul50:
+; RV64XTHEADBA:       # %bb.0:
+; RV64XTHEADBA-NEXT:    th.addsl a0, a0, a0, 2
+; RV64XTHEADBA-NEXT:    th.addsl a0, a0, a0, 2
+; RV64XTHEADBA-NEXT:    slli a0, a0, 1
+; RV64XTHEADBA-NEXT:    ret
+  %c = mul i64 %a, 50
+  ret i64 %c
+}
+
+define i64 @addmul50(i64 %a, i64 %b) {
+; RV64I-LABEL: addmul50:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    li a2, 50
+; RV64I-NEXT:    mul a0, a0, a2
+; RV64I-NEXT:    add a0, a0, a1
+; RV64I-NEXT:    ret
+;
+; RV64XTHEADBA-LABEL: addmul50:
+; RV64XTHEADBA:       # %bb.0:
+; RV64XTHEADBA-NEXT:    th.addsl a0, a0, a0, 2
+; RV64XTHEADBA-NEXT:    th.addsl a0, a0, a0, 2
+; RV64XTHEADBA-NEXT:    th.addsl a0, a1, a0, 1
+; RV64XTHEADBA-NEXT:    ret
+  %c = mul i64 %a, 50
+  %d = add i64 %c, %b
+  ret i64 %d
+}
+
+define i64 @mul100(i64 %a) {
+; RV64I-LABEL: mul100:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    li a1, 100
+; RV64I-NEXT:    mul a0, a0, a1
+; RV64I-NEXT:    ret
+;
+; RV64XTHEADBA-LABEL: mul100:
+; RV64XTHEADBA:       # %bb.0:
+; RV64XTHEADBA-NEXT:    th.addsl a0, a0, a0, 2
+; RV64XTHEADBA-NEXT:    th.addsl a0, a0, a0, 2
+; RV64XTHEADBA-NEXT:    slli a0, a0, 2
+; RV64XTHEADBA-NEXT:    ret
+  %c = mul i64 %a, 100
+  ret i64 %c
+}
+
+define i64 @addmul100(i64 %a, i64 %b) {
+; RV64I-LABEL: addmul100:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    li a2, 100
+; RV64I-NEXT:    mul a0, a0, a2
+; RV64I-NEXT:    add a0, a0, a1
+; RV64I-NEXT:    ret
+;
+; RV64XTHEADBA-LABEL: addmul100:
+; RV64XTHEADBA:       # %bb.0:
+; RV64XTHEADBA-NEXT:    th.addsl a0, a0, a0, 2
+; RV64XTHEADBA-NEXT:    th.addsl a0, a0, a0, 2
+; RV64XTHEADBA-NEXT:    th.addsl a0, a1, a0, 2
+; RV64XTHEADBA-NEXT:    ret
+  %c = mul i64 %a, 100
+  %d = add i64 %c, %b
+  ret i64 %d
+}
+
+define i64 @mul162(i64 %a) {
+; RV64I-LABEL: mul162:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    li a1, 162
+; RV64I-NEXT:    mul a0, a0, a1
+; RV64I-NEXT:    ret
+;
+; RV64XTHEADBA-LABEL: mul162:
+; RV64XTHEADBA:       # %bb.0:
+; RV64XTHEADBA-NEXT:    th.addsl a0, a0, a0, 3
+; RV64XTHEADBA-NEXT:    th.addsl a0, a0, a0, 3
+; RV64XTHEADBA-NEXT:    slli a0, a0, 1
+; RV64XTHEADBA-NEXT:    ret
+  %c = mul i64 %a, 162
+  ret i64 %c
+}
+
+define i64 @addmul162(i64 %a, i64 %b) {
+; RV64I-LABEL: addmul162:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    li a2, 162
+; RV64I-NEXT:    mul a0, a0, a2
+; RV64I-NEXT:    add a0, a0, a1
+; RV64I-NEXT:    ret
+;
+; RV64XTHEADBA-LABEL: addmul162:
+; RV64XTHEADBA:       # %bb.0:
+; RV64XTHEADBA-NEXT:    th.addsl a0, a0, a0, 3
+; RV64XTHEADBA-NEXT:    th.addsl a0, a0, a0, 3
+; RV64XTHEADBA-NEXT:    th.addsl a0, a1, a0, 1
+; RV64XTHEADBA-NEXT:    ret
+  %c = mul i64 %a, 162
+  %d = add i64 %c, %b
+  ret i64 %d
+}
+
+define i64 @mul180(i64 %a) {
+; RV64I-LABEL: mul180:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    li a1, 180
+; RV64I-NEXT:    mul a0, a0, a1
+; RV64I-NEXT:    ret
+;
+; RV64XTHEADBA-LABEL: mul180:
+; RV64XTHEADBA:       # %bb.0:
+; RV64XTHEADBA-NEXT:    th.addsl a0, a0, a0, 2
+; RV64XTHEADBA-NEXT:    th.addsl a0, a0, a0, 3
+; RV64XTHEADBA-NEXT:    slli a0, a0, 2
+; RV64XTHEADBA-NEXT:    ret
+  %c = mul i64 %a, 180
+  ret i64 %c
+}
+
+define i64 @addmul180(i64 %a, i64 %b) {
+; RV64I-LABEL: addmul180:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    li a2, 180
+; RV64I-NEXT:    mul a0, a0, a2
+; RV64I-NEXT:    add a0, a0, a1
+; RV64I-NEXT:    ret
+;
+; RV64XTHEADBA-LABEL: addmul180:
+; RV64XTHEADBA:       # %bb.0:
+; RV64XTHEADBA-NEXT:    th.addsl a0, a0, a0, 2
+; RV64XTHEADBA-NEXT:    th.addsl a0, a0, a0, 3
+; RV64XTHEADBA-NEXT:    th.addsl a0, a1, a0, 2
+; RV64XTHEADBA-NEXT:    ret
+  %c = mul i64 %a, 180
+  %d = add i64 %c, %b
+  ret i64 %d
+}
+
+define i64 @add255mul180(i64 %a) {
+; RV64I-LABEL: add255mul180:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    li a1, 180
+; RV64I-NEXT:    mul a0, a0, a1
+; RV64I-NEXT:    addi a0, a0, 255
+; RV64I-NEXT:    ret
+;
+; RV64XTHEADBA-LABEL: add255mul180:
+; RV64XTHEADBA:       # %bb.0:
+; RV64XTHEADBA-NEXT:    th.addsl a0, a0, a0, 2
+; RV64XTHEADBA-NEXT:    th.addsl a0, a0, a0, 3
+; RV64XTHEADBA-NEXT:    slli a0, a0, 2
+; RV64XTHEADBA-NEXT:    addi a0, a0, 255
+; RV64XTHEADBA-NEXT:    ret
+  %c = mul i64 %a, 180
+  %d = add i64 %c, 255
+  ret i64 %d
+}
+
+define i64 @mul200(i64 %a) {
+; RV64I-LABEL: mul200:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    li a1, 200
+; RV64I-NEXT:    mul a0, a0, a1
+; RV64I-NEXT:    ret
+;
+; RV64XTHEADBA-LABEL: mul200:
+; RV64XTHEADBA:       # %bb.0:
+; RV64XTHEADBA-NEXT:    th.addsl a0, a0, a0, 2
+; RV64XTHEADBA-NEXT:    th.addsl a0, a0, a0, 2
+; RV64XTHEADBA-NEXT:    slli a0, a0, 3
+; RV64XTHEADBA-NEXT:    ret
+  %c = mul i64 %a, 200
+  ret i64 %c
+}
+
+define i64 @addmul200(i64 %a, i64 %b) {
+; RV64I-LABEL: addmul200:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    li a2, 200
+; RV64I-NEXT:    mul a0, a0, a2
+; RV64I-NEXT:    add a0, a0, a1
+; RV64I-NEXT:    ret
+;
+; RV64XTHEADBA-LABEL: addmul200:
+; RV64XTHEADBA:       # %bb.0:
+; RV64XTHEADBA-NEXT:    th.addsl a0, a0, a0, 2
+; RV64XTHEADBA-NEXT:    th.addsl a0, a0, a0, 2
+; RV64XTHEADBA-NEXT:    th.addsl a0, a1, a0, 3
+; RV64XTHEADBA-NEXT:    ret
+  %c = mul i64 %a, 200
+  %d = add i64 %c, %b
+  ret i64 %d
+}
+
+define i64 @addmul4096(i64 %a, i64 %b) {
+; CHECK-LABEL: addmul4096:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    slli a0, a0, 12
+; CHECK-NEXT:    add a0, a0, a1
+; CHECK-NEXT:    ret
+  %c = mul i64 %a, 4096
+  %d = add i64 %c, %b
+  ret i64 %d
+}
+
+define i64 @addmul4230(i64 %a, i64 %b) {
+; CHECK-LABEL: addmul4230:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    lui a2, 1
+; CHECK-NEXT:    addiw a2, a2, 134
+; CHECK-NEXT:    mul a0, a0, a2
+; CHECK-NEXT:    add a0, a0, a1
+; CHECK-NEXT:    ret
+  %c = mul i64 %a, 4230
+  %d = add i64 %c, %b
+  ret i64 %d
+}
+
+define i64 @mul96(i64 %a) {
+; RV64I-LABEL: mul96:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    slli a1, a0, 5
+; RV64I-NEXT:    slli a0, a0, 7
+; RV64I-NEXT:    sub a0, a0, a1
+; RV64I-NEXT:    ret
+;
+; RV64XTHEADBA-LABEL: mul96:
+; RV64XTHEADBA:       # %bb.0:
+; RV64XTHEADBA-NEXT:    th.addsl a0, a0, a0, 1
+; RV64XTHEADBA-NEXT:    slli a0, a0, 5
+; RV64XTHEADBA-NEXT:    ret
+  %c = mul i64 %a, 96
+  ret i64 %c
+}
+
+define i64 @mul119(i64 %a) {
+; RV64I-LABEL: mul119:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    li a1, 119
+; RV64I-NEXT:    mul a0, a0, a1
+; RV64I-NEXT:    ret
+;
+; RV64XTHEADBA-LABEL: mul119:
+; RV64XTHEADBA:       # %bb.0:
+; RV64XTHEADBA-NEXT:    th.addsl a1, a0, a0, 3
+; RV64XTHEADBA-NEXT:    slli a0, a0, 7
+; RV64XTHEADBA-NEXT:    sub a0, a0, a1
+; RV64XTHEADBA-NEXT:    ret
+  %c = mul i64 %a, 119
+  ret i64 %c
+}
+
+define i64 @mul123(i64 %a) {
+; RV64I-LABEL: mul123:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    li a1, 123
+; RV64I-NEXT:    mul a0, a0, a1
+; RV64I-NEXT:    ret
+;
+; RV64XTHEADBA-LABEL: mul123:
+; RV64XTHEADBA:       # %bb.0:
+; RV64XTHEADBA-NEXT:    th.addsl a1, a0, a0, 2
+; RV64XTHEADBA-NEXT:    slli a0, a0, 7
+; RV64XTHEADBA-NEXT:    sub a0, a0, a1
+; RV64XTHEADBA-NEXT:    ret
+  %c = mul i64 %a, 123
+  ret i64 %c
+}
+
+define i64 @mul125(i64 %a) {
+; RV64I-LABEL: mul125:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    li a1, 125
+; RV64I-NEXT:    mul a0, a0, a1
+; RV64I-NEXT:    ret
+;
+; RV64XTHEADBA-LABEL: mul125:
+; RV64XTHEADBA:       # %bb.0:
+; RV64XTHEADBA-NEXT:    th.addsl a1, a0, a0, 1
+; RV64XTHEADBA-NEXT:    slli a0, a0, 7
+; RV64XTHEADBA-NEXT:    sub a0, a0, a1
+; RV64XTHEADBA-NEXT:    ret
+  %c = mul i64 %a, 125
+  ret i64 %c
+}
+
+define i64 @mul131(i64 %a) {
+; RV64I-LABEL: mul131:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    li a1, 131
+; RV64I-NEXT:    mul a0, a0, a1
+; RV64I-NEXT:    ret
+;
+; RV64XTHEADBA-LABEL: mul131:
+; RV64XTHEADBA:       # %bb.0:
+; RV64XTHEADBA-NEXT:    th.addsl a1, a0, a0, 1
+; RV64XTHEADBA-NEXT:    slli a0, a0, 7
+; RV64XTHEADBA-NEXT:    add a0, a0, a1
+; RV64XTHEADBA-NEXT:    ret
+  %c = mul i64 %a, 131
+  ret i64 %c
+}
+
+define i64 @mul133(i64 %a) {
+; RV64I-LABEL: mul133:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    li a1, 133
+; RV64I-NEXT:    mul a0, a0, a1
+; RV64I-NEXT:    ret
+;
+; RV64XTHEADBA-LABEL: mul133:
+; RV64XTHEADBA:       # %bb.0:
+; RV64XTHEADBA-NEXT:    th.addsl a1, a0, a0, 2
+; RV64XTHEADBA-NEXT:    slli a0, a0, 7
+; RV64XTHEADBA-NEXT:    add a0, a0, a1
+; RV64XTHEADBA-NEXT:    ret
+  %c = mul i64 %a, 133
+  ret i64 %c
+}
+
+define i64 @mul137(i64 %a) {
+; RV64I-LABEL: mul137:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    li a1, 137
+; RV64I-NEXT:    mul a0, a0, a1
+; RV64I-NEXT:    ret
+;
+; RV64XTHEADBA-LABEL: mul137:
+; RV64XTHEADBA:       # %bb.0:
+; RV64XTHEADBA-NEXT:    th.addsl a1, a0, a0, 3
+; RV64XTHEADBA-NEXT:    slli a0, a0, 7
+; RV64XTHEADBA-NEXT:    add a0, a0, a1
+; RV64XTHEADBA-NEXT:    ret
+  %c = mul i64 %a, 137
+  ret i64 %c
+}
+
+define i64 @mul160(i64 %a) {
+; RV64I-LABEL: mul160:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    li a1, 160
+; RV64I-NEXT:    mul a0, a0, a1
+; RV64I-NEXT:    ret
+;
+; RV64XTHEADBA-LABEL: mul160:
+; RV64XTHEADBA:       # %bb.0:
+; RV64XTHEADBA-NEXT:    th.addsl a0, a0, a0, 2
+; RV64XTHEADBA-NEXT:    slli a0, a0, 5
+; RV64XTHEADBA-NEXT:    ret
+  %c = mul i64 %a, 160
+  ret i64 %c
+}
+
+define i64 @mul288(i64 %a) {
+; RV64I-LABEL: mul288:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    li a1, 288
+; RV64I-NEXT:    mul a0, a0, a1
+; RV64I-NEXT:    ret
+;
+; RV64XTHEADBA-LABEL: mul288:
+; RV64XTHEADBA:       # %bb.0:
+; RV64XTHEADBA-NEXT:    th.addsl a0, a0, a0, 3
+; RV64XTHEADBA-NEXT:    slli a0, a0, 5
+; RV64XTHEADBA-NEXT:    ret
+  %c = mul i64 %a, 288
+  ret i64 %c
+}
+
+define i64 @sh1add_imm(i64 %0) {
+; CHECK-LABEL: sh1add_imm:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    slli a0, a0, 1
+; CHECK-NEXT:    addi a0, a0, 5
+; CHECK-NEXT:    ret
+  %a = shl i64 %0, 1
+  %b = add i64 %a, 5
+  ret i64 %b
+}
+
+define i64 @sh2add_imm(i64 %0) {
+; CHECK-LABEL: sh2add_imm:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    slli a0, a0, 2
+; CHECK-NEXT:    addi a0, a0, -6
+; CHECK-NEXT:    ret
+  %a = shl i64 %0, 2
+  %b = add i64 %a, -6
+  ret i64 %b
+}
+
+define i64 @sh3add_imm(i64 %0) {
+; CHECK-LABEL: sh3add_imm:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    slli a0, a0, 3
+; CHECK-NEXT:    addi a0, a0, 7
+; CHECK-NEXT:    ret
+  %a = shl i64 %0, 3
+  %b = add i64 %a, 7
+  ret i64 %b
+}
+
+define i64 @mul258(i64 %a) {
+; RV64I-LABEL: mul258:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    li a1, 258
+; RV64I-NEXT:    mul a0, a0, a1
+; RV64I-NEXT:    ret
+;
+; RV64XTHEADBA-LABEL: mul258:
+; RV64XTHEADBA:       # %bb.0:
+; RV64XTHEADBA-NEXT:    slli a1, a0, 8
+; RV64XTHEADBA-NEXT:    th.addsl a0, a1, a0, 1
+; RV64XTHEADBA-NEXT:    ret
+  %c = mul i64 %a, 258
+  ret i64 %c
+}
+
+define i64 @mul260(i64 %a) {
+; RV64I-LABEL: mul260:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    li a1, 260
+; RV64I-NEXT:    mul a0, a0, a1
+; RV64I-NEXT:    ret
+;
+; RV64XTHEADBA-LABEL: mul260:
+; RV64XTHEADBA:       # %bb.0:
+; RV64XTHEADBA-NEXT:    slli a1, a0, 8
+; RV64XTHEADBA-NEXT:    th.addsl a0, a1, a0, 2
+; RV64XTHEADBA-NEXT:    ret
+  %c = mul i64 %a, 260
+  ret i64 %c
+}
+
+define i64 @mul264(i64 %a) {
+; RV64I-LABEL: mul264:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    li a1, 264
+; RV64I-NEXT:    mul a0, a0, a1
+; RV64I-NEXT:    ret
+;
+; RV64XTHEADBA-LABEL: mul264:
+; RV64XTHEADBA:       # %bb.0:
+; RV64XTHEADBA-NEXT:    slli a1, a0, 8
+; RV64XTHEADBA-NEXT:    th.addsl a0, a1, a0, 3
+; RV64XTHEADBA-NEXT:    ret
+  %c = mul i64 %a, 264
+  ret i64 %c
+}
 define i64 @mul11(i64 %a) {
 ; RV64I-LABEL: mul11:
 ; RV64I:       # %bb.0:
@@ -431,86 +917,1148 @@ define i64 @mul81(i64 %a) {
   ret i64 %c
 }
 
+define i64 @mul4098(i64 %a) {
+; RV64I-LABEL: mul4098:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    slli a1, a0, 1
+; RV64I-NEXT:    slli a0, a0, 12
+; RV64I-NEXT:    add a0, a0, a1
+; RV64I-NEXT:    ret
+;
+; RV64XTHEADBA-LABEL: mul4098:
+; RV64XTHEADBA:       # %bb.0:
+; RV64XTHEADBA-NEXT:    slli a1, a0, 12
+; RV64XTHEADBA-NEXT:    th.addsl a0, a1, a0, 1
+; RV64XTHEADBA-NEXT:    ret
+  %c = mul i64 %a, 4098
+  ret i64 %c
+}
 
-define i64 @mul96(i64 %a) {
-; RV64I-LABEL: mul96:
+define i64 @mul4100(i64 %a) {
+; RV64I-LABEL: mul4100:
 ; RV64I:       # %bb.0:
-; RV64I-NEXT:    slli a1, a0, 5
-; RV64I-NEXT:    slli a0, a0, 7
-; RV64I-NEXT:    sub a0, a0, a1
+; RV64I-NEXT:    slli a1, a0, 2
+; RV64I-NEXT:    slli a0, a0, 12
+; RV64I-NEXT:    add a0, a0, a1
 ; RV64I-NEXT:    ret
 ;
-; RV64XTHEADBA-LABEL: mul96:
+; RV64XTHEADBA-LABEL: mul4100:
 ; RV64XTHEADBA:       # %bb.0:
-; RV64XTHEADBA-NEXT:    th.addsl a0, a0, a0, 1
-; RV64XTHEADBA-NEXT:    slli a0, a0, 5
+; RV64XTHEADBA-NEXT:    slli a1, a0, 12
+; RV64XTHEADBA-NEXT:    th.addsl a0, a1, a0, 2
 ; RV64XTHEADBA-NEXT:    ret
-  %c = mul i64 %a, 96
+  %c = mul i64 %a, 4100
   ret i64 %c
 }
 
-define i64 @mul137(i64 %a) {
-; RV64I-LABEL: mul137:
+define i64 @mul4104(i64 %a) {
+; RV64I-LABEL: mul4104:
 ; RV64I:       # %bb.0:
-; RV64I-NEXT:    li a1, 137
-; RV64I-NEXT:    mul a0, a0, a1
+; RV64I-NEXT:    slli a1, a0, 3
+; RV64I-NEXT:    slli a0, a0, 12
+; RV64I-NEXT:    add a0, a0, a1
 ; RV64I-NEXT:    ret
 ;
-; RV64XTHEADBA-LABEL: mul137:
+; RV64XTHEADBA-LABEL: mul4104:
 ; RV64XTHEADBA:       # %bb.0:
-; RV64XTHEADBA-NEXT:    th.addsl a1, a0, a0, 3
-; RV64XTHEADBA-NEXT:    slli a0, a0, 7
-; RV64XTHEADBA-NEXT:    add a0, a0, a1
+; RV64XTHEADBA-NEXT:    slli a1, a0, 12
+; RV64XTHEADBA-NEXT:    th.addsl a0, a1, a0, 3
 ; RV64XTHEADBA-NEXT:    ret
-  %c = mul i64 %a, 137
+  %c = mul i64 %a, 4104
   ret i64 %c
 }
 
-define i64 @mul160(i64 %a) {
-; RV64I-LABEL: mul160:
+define signext i32 @mulw192(i32 signext %a) {
+; RV64I-LABEL: mulw192:
 ; RV64I:       # %bb.0:
-; RV64I-NEXT:    li a1, 160
-; RV64I-NEXT:    mul a0, a0, a1
+; RV64I-NEXT:    slli a1, a0, 6
+; RV64I-NEXT:    slli a0, a0, 8
+; RV64I-NEXT:    subw a0, a0, a1
 ; RV64I-NEXT:    ret
 ;
-; RV64XTHEADBA-LABEL: mul160:
+; RV64XTHEADBA-LABEL: mulw192:
+; RV64XTHEADBA:       # %bb.0:
+; RV64XTHEADBA-NEXT:    th.addsl a0, a0, a0, 1
+; RV64XTHEADBA-NEXT:    slliw a0, a0, 6
+; RV64XTHEADBA-NEXT:    ret
+  %c = mul i32 %a, 192
+  ret i32 %c
+}
+
+define signext i32 @mulw320(i32 signext %a) {
+; RV64I-LABEL: mulw320:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    li a1, 320
+; RV64I-NEXT:    mulw a0, a0, a1
+; RV64I-NEXT:    ret
+;
+; RV64XTHEADBA-LABEL: mulw320:
 ; RV64XTHEADBA:       # %bb.0:
 ; RV64XTHEADBA-NEXT:    th.addsl a0, a0, a0, 2
-; RV64XTHEADBA-NEXT:    slli a0, a0, 5
+; RV64XTHEADBA-NEXT:    slliw a0, a0, 6
 ; RV64XTHEADBA-NEXT:    ret
-  %c = mul i64 %a, 160
+  %c = mul i32 %a, 320
+  ret i32 %c
+}
+
+define signext i32 @mulw576(i32 signext %a) {
+; RV64I-LABEL: mulw576:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    li a1, 576
+; RV64I-NEXT:    mulw a0, a0, a1
+; RV64I-NEXT:    ret
+;
+; RV64XTHEADBA-LABEL: mulw576:
+; RV64XTHEADBA:       # %bb.0:
+; RV64XTHEADBA-NEXT:    th.addsl a0, a0, a0, 3
+; RV64XTHEADBA-NEXT:    slliw a0, a0, 6
+; RV64XTHEADBA-NEXT:    ret
+  %c = mul i32 %a, 576
+  ret i32 %c
+}
+
+define i64 @add4104(i64 %a) {
+; RV64I-LABEL: add4104:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    lui a1, 1
+; RV64I-NEXT:    addiw a1, a1, 8
+; RV64I-NEXT:    add a0, a0, a1
+; RV64I-NEXT:    ret
+;
+; RV64XTHEADBA-LABEL: add4104:
+; RV64XTHEADBA:       # %bb.0:
+; RV64XTHEADBA-NEXT:    li a1, 1026
+; RV64XTHEADBA-NEXT:    th.addsl a0, a0, a1, 2
+; RV64XTHEADBA-NEXT:    ret
+  %c = add i64 %a, 4104
   ret i64 %c
 }
 
-define i64 @mul200(i64 %a) {
-; RV64I-LABEL: mul200:
+define i64 @add4104_2(i64 %a) {
+; RV64I-LABEL: add4104_2:
 ; RV64I:       # %bb.0:
-; RV64I-NEXT:    li a1, 200
-; RV64I-NEXT:    mul a0, a0, a1
+; RV64I-NEXT:    lui a1, 1
+; RV64I-NEXT:    addiw a1, a1, 8
+; RV64I-NEXT:    or a0, a0, a1
 ; RV64I-NEXT:    ret
 ;
-; RV64XTHEADBA-LABEL: mul200:
+; RV64XTHEADBA-LABEL: add4104_2:
 ; RV64XTHEADBA:       # %bb.0:
-; RV64XTHEADBA-NEXT:    th.addsl a0, a0, a0, 2
-; RV64XTHEADBA-NEXT:    th.addsl a0, a0, a0, 2
-; RV64XTHEADBA-NEXT:    slli a0, a0, 3
+; RV64XTHEADBA-NEXT:    li a1, 1026
+; RV64XTHEADBA-NEXT:    th.addsl a0, a0, a1, 2
 ; RV64XTHEADBA-NEXT:    ret
-  %c = mul i64 %a, 200
+  %c = or disjoint i64 %a, 4104
   ret i64 %c
 }
 
-define i64 @mul288(i64 %a) {
-; RV64I-LABEL: mul288:
+define i64 @add8208(i64 %a) {
+; RV64I-LABEL: add8208:
 ; RV64I:       # %bb.0:
-; RV64I-NEXT:    li a1, 288
-; RV64I-NEXT:    mul a0, a0, a1
+; RV64I-NEXT:    lui a1, 2
+; RV64I-NEXT:    addiw a1, a1, 16
+; RV64I-NEXT:    add a0, a0, a1
 ; RV64I-NEXT:    ret
 ;
-; RV64XTHEADBA-LABEL: mul288:
+; RV64XTHEADBA-LABEL: add8208:
 ; RV64XTHEADBA:       # %bb.0:
-; RV64XTHEADBA-NEXT:    th.addsl a0, a0, a0, 3
-; RV64XTHEADBA-NEXT:    slli a0, a0, 5
+; RV64XTHEADBA-NEXT:    li a1, 1026
+; RV64XTHEADBA-NEXT:    th.addsl a0, a0, a1, 3
 ; RV64XTHEADBA-NEXT:    ret
-  %c = mul i64 %a, 288
+  %c = add i64 %a, 8208
+  ret i64 %c
+}
+
+; Make sure we prefer LUI for the 8192 instead of using sh3add.
+define signext i32 @add8192_i32(i32 signext %a) {
+; CHECK-LABEL: add8192_i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    lui a1, 2
+; CHECK-NEXT:    addw a0, a0, a1
+; CHECK-NEXT:    ret
+  %c = add i32 %a, 8192
+  ret i32 %c
+}
+
+; Make sure we prefer LUI for the 8192 instead of using sh3add.
+define i64 @add8192(i64 %a) {
+; CHECK-LABEL: add8192:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    lui a1, 2
+; CHECK-NEXT:    add a0, a0, a1
+; CHECK-NEXT:    ret
+  %c = add i64 %a, 8192
+  ret i64 %c
+}
+
+define signext i32 @addshl32_5_6(i32 signext %a, i32 signext %b) {
+; CHECK-LABEL: addshl32_5_6:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    slli a0, a0, 5
+; CHECK-NEXT:    slli a1, a1, 6
+; CHECK-NEXT:    addw a0, a0, a1
+; CHECK-NEXT:    ret
+  %c = shl i32 %a, 5
+  %d = shl i32 %b, 6
+  %e = add i32 %c, %d
+  ret i32 %e
+}
+
+define i64 @addshl64_5_6(i64 %a, i64 %b) {
+; CHECK-LABEL: addshl64_5_6:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    slli a0, a0, 5
+; CHECK-NEXT:    slli a1, a1, 6
+; CHECK-NEXT:    add a0, a0, a1
+; CHECK-NEXT:    ret
+  %c = shl i64 %a, 5
+  %d = shl i64 %b, 6
+  %e = add i64 %c, %d
+  ret i64 %e
+}
+
+define signext i32 @addshl32_5_7(i32 signext %a, i32 signext %b) {
+; CHECK-LABEL: addshl32_5_7:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    slli a0, a0, 5
+; CHECK-NEXT:    slli a1, a1, 7
+; CHECK-NEXT:    addw a0, a0, a1
+; CHECK-NEXT:    ret
+  %c = shl i32 %a, 5
+  %d = shl i32 %b, 7
+  %e = add i32 %c, %d
+  ret i32 %e
+}
+
+define i64 @addshl64_5_7(i64 %a, i64 %b) {
+; CHECK-LABEL: addshl64_5_7:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    slli a0, a0, 5
+; CHECK-NEXT:    slli a1, a1, 7
+; CHECK-NEXT:    add a0, a0, a1
+; CHECK-NEXT:    ret
+  %c = shl i64 %a, 5
+  %d = shl i64 %b, 7
+  %e = add i64 %c, %d
+  ret i64 %e
+}
+
+define signext i32 @addshl32_5_8(i32 signext %a, i32 signext %b) {
+; CHECK-LABEL: addshl32_5_8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    slli a0, a0, 5
+; CHECK-NEXT:    slli a1, a1, 8
+; CHECK-NEXT:    addw a0, a0, a1
+; CHECK-NEXT:    ret
+  %c = shl i32 %a, 5
+  %d = shl i32 %b, 8
+  %e = add i32 %c, %d
+  ret i32 %e
+}
+
+define i64 @addshl64_5_8(i64 %a, i64 %b) {
+; CHECK-LABEL: addshl64_5_8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    slli a0, a0, 5
+; CHECK-NEXT:    slli a1, a1, 8
+; CHECK-NEXT:    add a0, a0, a1
+; CHECK-NEXT:    ret
+  %c = shl i64 %a, 5
+  %d = shl i64 %b, 8
+  %e = add i64 %c, %d
+  ret i64 %e
+}
+
+define i64 @sh6_sh3_add1(i64 noundef %x, i64 noundef %y, i64 noundef %z) {
+; RV64I-LABEL: sh6_sh3_add1:
+; RV64I:       # %bb.0: # %entry
+; RV64I-NEXT:    slli a2, a2, 3
+; RV64I-NEXT:    slli a1, a1, 6
+; RV64I-NEXT:    add a1, a1, a2
+; RV64I-NEXT:    add a0, a1, a0
+; RV64I-NEXT:    ret
+;
+; RV64XTHEADBA-LABEL: sh6_sh3_add1:
+; RV64XTHEADBA:       # %bb.0: # %entry
+; RV64XTHEADBA-NEXT:    slli a1, a1, 6
+; RV64XTHEADBA-NEXT:    th.addsl a1, a1, a2, 3
+; RV64XTHEADBA-NEXT:    add a0, a1, a0
+; RV64XTHEADBA-NEXT:    ret
+entry:
+  %shl = shl i64 %z, 3
+  %shl1 = shl i64 %y, 6
+  %add = add nsw i64 %shl1, %shl
+  %add2 = add nsw i64 %add, %x
+  ret i64 %add2
+}
+
+define i64 @sh6_sh3_add2(i64 noundef %x, i64 noundef %y, i64 noundef %z) {
+; RV64I-LABEL: sh6_sh3_add2:
+; RV64I:       # %bb.0: # %entry
+; RV64I-NEXT:    slli a2, a2, 3
+; RV64I-NEXT:    slli a1, a1, 6
+; RV64I-NEXT:    add a0, a1, a0
+; RV64I-NEXT:    add a0, a0, a2
+; RV64I-NEXT:    ret
+;
+; RV64XTHEADBA-LABEL: sh6_sh3_add2:
+; RV64XTHEADBA:       # %bb.0: # %entry
+; RV64XTHEADBA-NEXT:    slli a1, a1, 6
+; RV64XTHEADBA-NEXT:    add a0, a1, a0
+; RV64XTHEADBA-NEXT:    th.addsl a0, a0, a2, 3
+; RV64XTHEADBA-NEXT:    ret
+entry:
+  %shl = shl i64 %z, 3
+  %shl1 = shl i64 %y, 6
+  %add = add nsw i64 %shl1, %x
+  %add2 = add nsw i64 %add, %shl
+  ret i64 %add2
+}
+
+define i64 @sh6_sh3_add3(i64 noundef %x, i64 noundef %y, i64 noundef %z) {
+; RV64I-LABEL: sh6_sh3_add3:
+; RV64I:       # %bb.0: # %entry
+; RV64I-NEXT:    slli a2, a2, 3
+; RV64I-NEXT:    slli a1, a1, 6
+; RV64I-NEXT:    add a1, a1, a2
+; RV64I-NEXT:    add a0, a0, a1
+; RV64I-NEXT:    ret
+;
+; RV64XTHEADBA-LABEL: sh6_sh3_add3:
+; RV64XTHEADBA:       # %bb.0: # %entry
+; RV64XTHEADBA-NEXT:    slli a1, a1, 6
+; RV64XTHEADBA-NEXT:    th.addsl a1, a1, a2, 3
+; RV64XTHEADBA-NEXT:    add a0, a0, a1
+; RV64XTHEADBA-NEXT:    ret
+entry:
+  %shl = shl i64 %z, 3
+  %shl1 = shl i64 %y, 6
+  %add = add nsw i64 %shl1, %shl
+  %add2 = add nsw i64 %x, %add
+  ret i64 %add2
+}
+
+define i64 @sh6_sh3_add4(i64 noundef %x, i64 noundef %y, i64 noundef %z) {
+; RV64I-LABEL: sh6_sh3_add4:
+; RV64I:       # %bb.0: # %entry
+; RV64I-NEXT:    slli a2, a2, 3
+; RV64I-NEXT:    slli a1, a1, 6
+; RV64I-NEXT:    add a0, a0, a2
+; RV64I-NEXT:    add a0, a0, a1
+; RV64I-NEXT:    ret
+;
+; RV64XTHEADBA-LABEL: sh6_sh3_add4:
+; RV64XTHEADBA:       # %bb.0: # %entry
+; RV64XTHEADBA-NEXT:    slli a1, a1, 6
+; RV64XTHEADBA-NEXT:    th.addsl a0, a0, a2, 3
+; RV64XTHEADBA-NEXT:    add a0, a0, a1
+; RV64XTHEADBA-NEXT:    ret
+entry:
+  %shl = shl i64 %z, 3
+  %shl1 = shl i64 %y, 6
+  %add = add nsw i64 %x, %shl
+  %add2 = add nsw i64 %add, %shl1
+  ret i64 %add2
+}
+
+define signext i16 @srliw_1_sh1add(ptr %0, i32 signext %1) {
+; CHECK-LABEL: srliw_1_sh1add:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    srliw a1, a1, 1
+; CHECK-NEXT:    slli a1, a1, 1
+; CHECK-NEXT:    add a0, a0, a1
+; CHECK-NEXT:    lh a0, 0(a0)
+; CHECK-NEXT:    ret
+  %3 = lshr i32 %1, 1
+  %4 = zext i32 %3 to i64
+  %5 = getelementptr inbounds i16, ptr %0, i64 %4
+  %6 = load i16, ptr %5, align 2
+  ret i16 %6
+}
+
+define signext i32 @srliw_2_sh2add(ptr %0, i32 signext %1) {
+; CHECK-LABEL: srliw_2_sh2add:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    srliw a1, a1, 2
+; CHECK-NEXT:    slli a1, a1, 2
+; CHECK-NEXT:    add a0, a0, a1
+; CHECK-NEXT:    lw a0, 0(a0)
+; CHECK-NEXT:    ret
+  %3 = lshr i32 %1, 2
+  %4 = zext i32 %3 to i64
+  %5 = getelementptr inbounds i32, ptr %0, i64 %4
+  %6 = load i32, ptr %5, align 4
+  ret i32 %6
+}
+
+define i64 @srliw_3_sh3add(ptr %0, i32 signext %1) {
+; CHECK-LABEL: srliw_3_sh3add:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    srliw a1, a1, 3
+; CHECK-NEXT:    slli a1, a1, 3
+; CHECK-NEXT:    add a0, a0, a1
+; CHECK-NEXT:    ld a0, 0(a0)
+; CHECK-NEXT:    ret
+  %3 = lshr i32 %1, 3
+  %4 = zext i32 %3 to i64
+  %5 = getelementptr inbounds i64, ptr %0, i64 %4
+  %6 = load i64, ptr %5, align 8
+  ret i64 %6
+}
+
+define signext i32 @srliw_1_sh2add(ptr %0, i32 signext %1) {
+; RV64I-LABEL: srliw_1_sh2add:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    srliw a1, a1, 1
+; RV64I-NEXT:    slli a1, a1, 2
+; RV64I-NEXT:    add a0, a0, a1
+; RV64I-NEXT:    lw a0, 0(a0)
+; RV64I-NEXT:    ret
+;
+; RV64XTHEADBA-LABEL: srliw_1_sh2add:
+; RV64XTHEADBA:       # %bb.0:
+; RV64XTHEADBA-NEXT:    srliw a1, a1, 1
+; RV64XTHEADBA-NEXT:    th.addsl a0, a0, a1, 2
+; RV64XTHEADBA-NEXT:    lw a0, 0(a0)
+; RV64XTHEADBA-NEXT:    ret
+  %3 = lshr i32 %1, 1
+  %4 = zext i32 %3 to i64
+  %5 = getelementptr inbounds i32, ptr %0, i64 %4
+  %6 = load i32, ptr %5, align 4
+  ret i32 %6
+}
+
+define i64 @srliw_1_sh3add(ptr %0, i32 signext %1) {
+; RV64I-LABEL: srliw_1_sh3add:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    srliw a1, a1, 1
+; RV64I-NEXT:    slli a1, a1, 3
+; RV64I-NEXT:    add a0, a0, a1
+; RV64I-NEXT:    ld a0, 0(a0)
+; RV64I-NEXT:    ret
+;
+; RV64XTHEADBA-LABEL: srliw_1_sh3add:
+; RV64XTHEADBA:       # %bb.0:
+; RV64XTHEADBA-NEXT:    srliw a1, a1, 1
+; RV64XTHEADBA-NEXT:    th.addsl a0, a0, a1, 3
+; RV64XTHEADBA-NEXT:    ld a0, 0(a0)
+; RV64XTHEADBA-NEXT:    ret
+  %3 = lshr i32 %1, 1
+  %4 = zext i32 %3 to i64
+  %5 = getelementptr inbounds i64, ptr %0, i64 %4
+  %6 = load i64, ptr %5, align 8
+  ret i64 %6
+}
+
+define i64 @srliw_2_sh3add(ptr %0, i32 signext %1) {
+; RV64I-LABEL: srliw_2_sh3add:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    srliw a1, a1, 2
+; RV64I-NEXT:    slli a1, a1, 3
+; RV64I-NEXT:    add a0, a0, a1
+; RV64I-NEXT:    ld a0, 0(a0)
+; RV64I-NEXT:    ret
+;
+; RV64XTHEADBA-LABEL: srliw_2_sh3add:
+; RV64XTHEADBA:       # %bb.0:
+; RV64XTHEADBA-NEXT:    srliw a1, a1, 2
+; RV64XTHEADBA-NEXT:    th.addsl a0, a0, a1, 3
+; RV64XTHEADBA-NEXT:    ld a0, 0(a0)
+; RV64XTHEADBA-NEXT:    ret
+  %3 = lshr i32 %1, 2
+  %4 = zext i32 %3 to i64
+  %5 = getelementptr inbounds i64, ptr %0, i64 %4
+  %6 = load i64, ptr %5, align 8
+  ret i64 %6
+}
+
+define signext i16 @srliw_2_sh1add(ptr %0, i32 signext %1) {
+; RV64I-LABEL: srliw_2_sh1add:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    srliw a1, a1, 2
+; RV64I-NEXT:    slli a1, a1, 1
+; RV64I-NEXT:    add a0, a0, a1
+; RV64I-NEXT:    lh a0, 0(a0)
+; RV64I-NEXT:    ret
+;
+; RV64XTHEADBA-LABEL: srliw_2_sh1add:
+; RV64XTHEADBA:       # %bb.0:
+; RV64XTHEADBA-NEXT:    srliw a1, a1, 2
+; RV64XTHEADBA-NEXT:    th.addsl a0, a0, a1, 1
+; RV64XTHEADBA-NEXT:    lh a0, 0(a0)
+; RV64XTHEADBA-NEXT:    ret
+  %3 = lshr i32 %1, 2
+  %4 = zext i32 %3 to i64
+  %5 = getelementptr inbounds i16, ptr %0, i64 %4
+  %6 = load i16, ptr %5, align 2
+  ret i16 %6
+}
+
+
+define signext i32 @srliw_3_sh2add(ptr %0, i32 signext %1) {
+; RV64I-LABEL: srliw_3_sh2add:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    srliw a1, a1, 3
+; RV64I-NEXT:    slli a1, a1, 2
+; RV64I-NEXT:    add a0, a0, a1
+; RV64I-NEXT:    lw a0, 0(a0)
+; RV64I-NEXT:    ret
+;
+; RV64XTHEADBA-LABEL: srliw_3_sh2add:
+; RV64XTHEADBA:       # %bb.0:
+; RV64XTHEADBA-NEXT:    srliw a1, a1, 3
+; RV64XTHEADBA-NEXT:    th.addsl a0, a0, a1, 2
+; RV64XTHEADBA-NEXT:    lw a0, 0(a0)
+; RV64XTHEADBA-NEXT:    ret
+  %3 = lshr i32 %1, 3
+  %4 = zext i32 %3 to i64
+  %5 = getelementptr inbounds i32, ptr %0, i64 %4
+  %6 = load i32, ptr %5, align 4
+  ret i32 %6
+}
+
+define i64 @srliw_4_sh3add(ptr %0, i32 signext %1) {
+; RV64I-LABEL: srliw_4_sh3add:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    srliw a1, a1, 4
+; RV64I-NEXT:    slli a1, a1, 3
+; RV64I-NEXT:    add a0, a0, a1
+; RV64I-NEXT:    ld a0, 0(a0)
+; RV64I-NEXT:    ret
+;
+; RV64XTHEADBA-LABEL: srliw_4_sh3add:
+; RV64XTHEADBA:       # %bb.0:
+; RV64XTHEADBA-NEXT:    srliw a1, a1, 4
+; RV64XTHEADBA-NEXT:    th.addsl a0, a0, a1, 3
+; RV64XTHEADBA-NEXT:    ld a0, 0(a0)
+; RV64XTHEADBA-NEXT:    ret
+  %3 = lshr i32 %1, 4
+  %4 = zext i32 %3 to i64
+  %5 = getelementptr inbounds i64, ptr %0, i64 %4
+  %6 = load i64, ptr %5, align 8
+  ret i64 %6
+}
+
+define signext i32 @srli_1_sh2add(ptr %0, i64 %1) {
+; RV64I-LABEL: srli_1_sh2add:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    slli a1, a1, 1
+; RV64I-NEXT:    andi a1, a1, -4
+; RV64I-NEXT:    add a0, a0, a1
+; RV64I-NEXT:    lw a0, 0(a0)
+; RV64I-NEXT:    ret
+;
+; RV64XTHEADBA-LABEL: srli_1_sh2add:
+; RV64XTHEADBA:       # %bb.0:
+; RV64XTHEADBA-NEXT:    srli a1, a1, 1
+; RV64XTHEADBA-NEXT:    th.addsl a0, a0, a1, 2
+; RV64XTHEADBA-NEXT:    lw a0, 0(a0)
+; RV64XTHEADBA-NEXT:    ret
+  %3 = lshr i64 %1, 1
+  %4 = getelementptr inbounds i32, ptr %0, i64 %3
+  %5 = load i32, ptr %4, align 4
+  ret i32 %5
+}
+
+define i64 @srli_2_sh3add(ptr %0, i64 %1) {
+; RV64I-LABEL: srli_2_sh3add:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    slli a1, a1, 1
+; RV64I-NEXT:    andi a1, a1, -8
+; RV64I-NEXT:    add a0, a0, a1
+; RV64I-NEXT:    ld a0, 0(a0)
+; RV64I-NEXT:    ret
+;
+; RV64XTHEADBA-LABEL: srli_2_sh3add:
+; RV64XTHEADBA:       # %bb.0:
+; RV64XTHEADBA-NEXT:    srli a1, a1, 2
+; RV64XTHEADBA-NEXT:    th.addsl a0, a0, a1, 3
+; RV64XTHEADBA-NEXT:    ld a0, 0(a0)
+; RV64XTHEADBA-NEXT:    ret
+  %3 = lshr i64 %1, 2
+  %4 = getelementptr inbounds i64, ptr %0, i64 %3
+  %5 = load i64, ptr %4, align 8
+  ret i64 %5
+}
+
+define signext i16 @srli_2_sh1add(ptr %0, i64 %1) {
+; RV64I-LABEL: srli_2_sh1add:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    srli a1, a1, 1
+; RV64I-NEXT:    andi a1, a1, -2
+; RV64I-NEXT:    add a0, a0, a1
+; RV64I-NEXT:    lh a0, 0(a0)
+; RV64I-NEXT:    ret
+;
+; RV64XTHEADBA-LABEL: srli_2_sh1add:
+; RV64XTHEADBA:       # %bb.0:
+; RV64XTHEADBA-NEXT:    srli a1, a1, 2
+; RV64XTHEADBA-NEXT:    th.addsl a0, a0, a1, 1
+; RV64XTHEADBA-NEXT:    lh a0, 0(a0)
+; RV64XTHEADBA-NEXT:    ret
+  %3 = lshr i64 %1, 2
+  %4 = getelementptr inbounds i16, ptr %0, i64 %3
+  %5 = load i16, ptr %4, align 2
+  ret i16 %5
+}
+
+define signext i32 @srli_3_sh2add(ptr %0, i64 %1) {
+; RV64I-LABEL: srli_3_sh2add:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    srli a1, a1, 1
+; RV64I-NEXT:    andi a1, a1, -4
+; RV64I-NEXT:    add a0, a0, a1
+; RV64I-NEXT:    lw a0, 0(a0)
+; RV64I-NEXT:    ret
+;
+; RV64XTHEADBA-LABEL: srli_3_sh2add:
+; RV64XTHEADBA:       # %bb.0:
+; RV64XTHEADBA-NEXT:    srli a1, a1, 3
+; RV64XTHEADBA-NEXT:    th.addsl a0, a0, a1, 2
+; RV64XTHEADBA-NEXT:    lw a0, 0(a0)
+; RV64XTHEADBA-NEXT:    ret
+  %3 = lshr i64 %1, 3
+  %4 = getelementptr inbounds i32, ptr %0, i64 %3
+  %5 = load i32, ptr %4, align 4
+  ret i32 %5
+}
+
+define i64 @srli_4_sh3add(ptr %0, i64 %1) {
+; RV64I-LABEL: srli_4_sh3add:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    srli a1, a1, 1
+; RV64I-NEXT:    andi a1, a1, -8
+; RV64I-NEXT:    add a0, a0, a1
+; RV64I-NEXT:    ld a0, 0(a0)
+; RV64I-NEXT:    ret
+;
+; RV64XTHEADBA-LABEL: srli_4_sh3add:
+; RV64XTHEADBA:       # %bb.0:
+; RV64XTHEADBA-NEXT:    srli a1, a1, 4
+; RV64XTHEADBA-NEXT:    th.addsl a0, a0, a1, 3
+; RV64XTHEADBA-NEXT:    ld a0, 0(a0)
+; RV64XTHEADBA-NEXT:    ret
+  %3 = lshr i64 %1, 4
+  %4 = getelementptr inbounds i64, ptr %0, i64 %3
+  %5 = load i64, ptr %4, align 8
+  ret i64 %5
+}
+
+define i8 @array_index_sh1_sh0(ptr %p, i64 %idx1, i64 %idx2) {
+; RV64I-LABEL: array_index_sh1_sh0:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    slli a1, a1, 1
+; RV64I-NEXT:    add a0, a0, a2
+; RV64I-NEXT:    add a0, a0, a1
+; RV64I-NEXT:    lbu a0, 0(a0)
+; RV64I-NEXT:    ret
+;
+; RV64XTHEADBA-LABEL: array_index_sh1_sh0:
+; RV64XTHEADBA:       # %bb.0:
+; RV64XTHEADBA-NEXT:    th.addsl a0, a0, a1, 1
+; RV64XTHEADBA-NEXT:    add a0, a0, a2
+; RV64XTHEADBA-NEXT:    lbu a0, 0(a0)
+; RV64XTHEADBA-NEXT:    ret
+  %a = getelementptr inbounds [2 x i8], ptr %p, i64 %idx1, i64 %idx2
+  %b = load i8, ptr %a, align 1
+  ret i8 %b
+}
+
+define i16 @array_index_sh1_sh1(ptr %p, i64 %idx1, i64 %idx2) {
+; RV64I-LABEL: array_index_sh1_sh1:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    slli a1, a1, 2
+; RV64I-NEXT:    add a0, a0, a1
+; RV64I-NEXT:    slli a2, a2, 1
+; RV64I-NEXT:    add a0, a0, a2
+; RV64I-NEXT:    lh a0, 0(a0)
+; RV64I-NEXT:    ret
+;
+; RV64XTHEADBA-LABEL: array_index_sh1_sh1:
+; RV64XTHEADBA:       # %bb.0:
+; RV64XTHEADBA-NEXT:    th.addsl a0, a0, a1, 2
+; RV64XTHEADBA-NEXT:    th.addsl a0, a0, a2, 1
+; RV64XTHEADBA-NEXT:    lh a0, 0(a0)
+; RV64XTHEADBA-NEXT:    ret
+  %a = getelementptr inbounds [2 x i16], ptr %p, i64 %idx1, i64 %idx2
+  %b = load i16, ptr %a, align 2
+  ret i16 %b
+}
+
+define i32 @array_index_sh1_sh2(ptr %p, i64 %idx1, i64 %idx2) {
+; RV64I-LABEL: array_index_sh1_sh2:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    slli a1, a1, 3
+; RV64I-NEXT:    add a0, a0, a1
+; RV64I-NEXT:    slli a2, a2, 2
+; RV64I-NEXT:    add a0, a0, a2
+; RV64I-NEXT:    lw a0, 0(a0)
+; RV64I-NEXT:    ret
+;
+; RV64XTHEADBA-LABEL: array_index_sh1_sh2:
+; RV64XTHEADBA:       # %bb.0:
+; RV64XTHEADBA-NEXT:    th.addsl a0, a0, a1, 3
+; RV64XTHEADBA-NEXT:    th.addsl a0, a0, a2, 2
+; RV64XTHEADBA-NEXT:    lw a0, 0(a0)
+; RV64XTHEADBA-NEXT:    ret
+  %a = getelementptr inbounds [2 x i32], ptr %p, i64 %idx1, i64 %idx2
+  %b = load i32, ptr %a, align 4
+  ret i32 %b
+}
+
+define i64 @array_index_sh1_sh3(ptr %p, i64 %idx1, i64 %idx2) {
+; RV64I-LABEL: array_index_sh1_sh3:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    slli a1, a1, 4
+; RV64I-NEXT:    add a0, a0, a1
+; RV64I-NEXT:    slli a2, a2, 3
+; RV64I-NEXT:    add a0, a0, a2
+; RV64I-NEXT:    ld a0, 0(a0)
+; RV64I-NEXT:    ret
+;
+; RV64XTHEADBA-LABEL: array_index_sh1_sh3:
+; RV64XTHEADBA:       # %bb.0:
+; RV64XTHEADBA-NEXT:    slli a1, a1, 4
+; RV64XTHEADBA-NEXT:    add a0, a0, a1
+; RV64XTHEADBA-NEXT:    th.addsl a0, a0, a2, 3
+; RV64XTHEADBA-NEXT:    ld a0, 0(a0)
+; RV64XTHEADBA-NEXT:    ret
+  %a = getelementptr inbounds [2 x i64], ptr %p, i64 %idx1, i64 %idx2
+  %b = load i64, ptr %a, align 8
+  ret i64 %b
+}
+
+define i8 @array_index_sh2_sh0(ptr %p, i64 %idx1, i64 %idx2) {
+; RV64I-LABEL: array_index_sh2_sh0:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    slli a1, a1, 2
+; RV64I-NEXT:    add a0, a0, a2
+; RV64I-NEXT:    add a0, a0, a1
+; RV64I-NEXT:    lbu a0, 0(a0)
+; RV64I-NEXT:    ret
+;
+; RV64XTHEADBA-LABEL: array_index_sh2_sh0:
+; RV64XTHEADBA:       # %bb.0:
+; RV64XTHEADBA-NEXT:    th.addsl a0, a0, a1, 2
+; RV64XTHEADBA-NEXT:    add a0, a0, a2
+; RV64XTHEADBA-NEXT:    lbu a0, 0(a0)
+; RV64XTHEADBA-NEXT:    ret
+  %a = getelementptr inbounds [4 x i8], ptr %p, i64 %idx1, i64 %idx2
+  %b = load i8, ptr %a, align 1
+  ret i8 %b
+}
+
+define i16 @array_index_sh2_sh1(ptr %p, i64 %idx1, i64 %idx2) {
+; RV64I-LABEL: array_index_sh2_sh1:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    slli a1, a1, 3
+; RV64I-NEXT:    add a0, a0, a1
+; RV64I-NEXT:    slli a2, a2, 1
+; RV64I-NEXT:    add a0, a0, a2
+; RV64I-NEXT:    lh a0, 0(a0)
+; RV64I-NEXT:    ret
+;
+; RV64XTHEADBA-LABEL: array_index_sh2_sh1:
+; RV64XTHEADBA:       # %bb.0:
+; RV64XTHEADBA-NEXT:    th.addsl a0, a0, a1, 3
+; RV64XTHEADBA-NEXT:    th.addsl a0, a0, a2, 1
+; RV64XTHEADBA-NEXT:    lh a0, 0(a0)
+; RV64XTHEADBA-NEXT:    ret
+  %a = getelementptr inbounds [4 x i16], ptr %p, i64 %idx1, i64 %idx2
+  %b = load i16, ptr %a, align 2
+  ret i16 %b
+}
+
+define i32 @array_index_sh2_sh2(ptr %p, i64 %idx1, i64 %idx2) {
+; RV64I-LABEL: array_index_sh2_sh2:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    slli a1, a1, 4
+; RV64I-NEXT:    add a0, a0, a1
+; RV64I-NEXT:    slli a2, a2, 2
+; RV64I-NEXT:    add a0, a0, a2
+; RV64I-NEXT:    lw a0, 0(a0)
+; RV64I-NEXT:    ret
+;
+; RV64XTHEADBA-LABEL: array_index_sh2_sh2:
+; RV64XTHEADBA:       # %bb.0:
+; RV64XTHEADBA-NEXT:    slli a1, a1, 4
+; RV64XTHEADBA-NEXT:    add a0, a0, a1
+; RV64XTHEADBA-NEXT:    th.addsl a0, a0, a2, 2
+; RV64XTHEADBA-NEXT:    lw a0, 0(a0)
+; RV64XTHEADBA-NEXT:    ret
+  %a = getelementptr inbounds [4 x i32], ptr %p, i64 %idx1, i64 %idx2
+  %b = load i32, ptr %a, align 4
+  ret i32 %b
+}
+
+define i64 @array_index_sh2_sh3(ptr %p, i64 %idx1, i64 %idx2) {
+; RV64I-LABEL: array_index_sh2_sh3:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    slli a1, a1, 5
+; RV64I-NEXT:    add a0, a0, a1
+; RV64I-NEXT:    slli a2, a2, 3
+; RV64I-NEXT:    add a0, a0, a2
+; RV64I-NEXT:    ld a0, 0(a0)
+; RV64I-NEXT:    ret
+;
+; RV64XTHEADBA-LABEL: array_index_sh2_sh3:
+; RV64XTHEADBA:       # %bb.0:
+; RV64XTHEADBA-NEXT:    slli a1, a1, 5
+; RV64XTHEADBA-NEXT:    add a0, a0, a1
+; RV64XTHEADBA-NEXT:    th.addsl a0, a0, a2, 3
+; RV64XTHEADBA-NEXT:    ld a0, 0(a0)
+; RV64XTHEADBA-NEXT:    ret
+  %a = getelementptr inbounds [4 x i64], ptr %p, i64 %idx1, i64 %idx2
+  %b = load i64, ptr %a, align 8
+  ret i64 %b
+}
+
+define i8 @array_index_sh3_sh0(ptr %p, i64 %idx1, i64 %idx2) {
+; RV64I-LABEL: array_index_sh3_sh0:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    slli a1, a1, 3
+; RV64I-NEXT:    add a0, a0, a2
+; RV64I-NEXT:    add a0, a0, a1
+; RV64I-NEXT:    lbu a0, 0(a0)
+; RV64I-NEXT:    ret
+;
+; RV64XTHEADBA-LABEL: array_index_sh3_sh0:
+; RV64XTHEADBA:       # %bb.0:
+; RV64XTHEADBA-NEXT:    th.addsl a0, a0, a1, 3
+; RV64XTHEADBA-NEXT:    add a0, a0, a2
+; RV64XTHEADBA-NEXT:    lbu a0, 0(a0)
+; RV64XTHEADBA-NEXT:    ret
+  %a = getelementptr inbounds [8 x i8], ptr %p, i64 %idx1, i64 %idx2
+  %b = load i8, ptr %a, align 1
+  ret i8 %b
+}
+
+define i16 @array_index_sh3_sh1(ptr %p, i64 %idx1, i64 %idx2) {
+; RV64I-LABEL: array_index_sh3_sh1:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    slli a1, a1, 4
+; RV64I-NEXT:    add a0, a0, a1
+; RV64I-NEXT:    slli a2, a2, 1
+; RV64I-NEXT:    add a0, a0, a2
+; RV64I-NEXT:    lh a0, 0(a0)
+; RV64I-NEXT:    ret
+;
+; RV64XTHEADBA-LABEL: array_index_sh3_sh1:
+; RV64XTHEADBA:       # %bb.0:
+; RV64XTHEADBA-NEXT:    slli a1, a1, 4
+; RV64XTHEADBA-NEXT:    add a0, a0, a1
+; RV64XTHEADBA-NEXT:    th.addsl a0, a0, a2, 1
+; RV64XTHEADBA-NEXT:    lh a0, 0(a0)
+; RV64XTHEADBA-NEXT:    ret
+  %a = getelementptr inbounds [8 x i16], ptr %p, i64 %idx1, i64 %idx2
+  %b = load i16, ptr %a, align 2
+  ret i16 %b
+}
+
+define i32 @array_index_sh3_sh2(ptr %p, i64 %idx1, i64 %idx2) {
+; RV64I-LABEL: array_index_sh3_sh2:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    slli a1, a1, 5
+; RV64I-NEXT:    add a0, a0, a1
+; RV64I-NEXT:    slli a2, a2, 2
+; RV64I-NEXT:    add a0, a0, a2
+; RV64I-NEXT:    lw a0, 0(a0)
+; RV64I-NEXT:    ret
+;
+; RV64XTHEADBA-LABEL: array_index_sh3_sh2:
+; RV64XTHEADBA:       # %bb.0:
+; RV64XTHEADBA-NEXT:    slli a1, a1, 5
+; RV64XTHEADBA-NEXT:    add a0, a0, a1
+; RV64XTHEADBA-NEXT:    th.addsl a0, a0, a2, 2
+; RV64XTHEADBA-NEXT:    lw a0, 0(a0)
+; RV64XTHEADBA-NEXT:    ret
+  %a = getelementptr inbounds [8 x i32], ptr %p, i64 %idx1, i64 %idx2
+  %b = load i32, ptr %a, align 4
+  ret i32 %b
+}
+
+define i64 @array_index_sh3_sh3(ptr %p, i64 %idx1, i64 %idx2) {
+; RV64I-LABEL: array_index_sh3_sh3:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    slli a1, a1, 6
+; RV64I-NEXT:    add a0, a0, a1
+; RV64I-NEXT:    slli a2, a2, 3
+; RV64I-NEXT:    add a0, a0, a2
+; RV64I-NEXT:    ld a0, 0(a0)
+; RV64I-NEXT:    ret
+;
+; RV64XTHEADBA-LABEL: array_index_sh3_sh3:
+; RV64XTHEADBA:       # %bb.0:
+; RV64XTHEADBA-NEXT:    slli a1, a1, 6
+; RV64XTHEADBA-NEXT:    add a0, a0, a1
+; RV64XTHEADBA-NEXT:    th.addsl a0, a0, a2, 3
+; RV64XTHEADBA-NEXT:    ld a0, 0(a0)
+; RV64XTHEADBA-NEXT:    ret
+  %a = getelementptr inbounds [8 x i64], ptr %p, i64 %idx1, i64 %idx2
+  %b = load i64, ptr %a, align 8
+  ret i64 %b
+}
+
+; Similar to above, but with a lshr on one of the indices. This requires
+; special handling during isel to form a shift pair.
+define i64 @array_index_lshr_sh3_sh3(ptr %p, i64 %idx1, i64 %idx2) {
+; RV64I-LABEL: array_index_lshr_sh3_sh3:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    srli a1, a1, 58
+; RV64I-NEXT:    slli a2, a2, 3
+; RV64I-NEXT:    slli a1, a1, 6
+; RV64I-NEXT:    add a0, a0, a2
+; RV64I-NEXT:    add a0, a0, a1
+; RV64I-NEXT:    ld a0, 0(a0)
+; RV64I-NEXT:    ret
+;
+; RV64XTHEADBA-LABEL: array_index_lshr_sh3_sh3:
+; RV64XTHEADBA:       # %bb.0:
+; RV64XTHEADBA-NEXT:    srli a1, a1, 58
+; RV64XTHEADBA-NEXT:    slli a1, a1, 6
+; RV64XTHEADBA-NEXT:    add a0, a0, a1
+; RV64XTHEADBA-NEXT:    th.addsl a0, a0, a2, 3
+; RV64XTHEADBA-NEXT:    ld a0, 0(a0)
+; RV64XTHEADBA-NEXT:    ret
+  %shr = lshr i64 %idx1, 58
+  %a = getelementptr inbounds [8 x i64], ptr %p, i64 %shr, i64 %idx2
+  %b = load i64, ptr %a, align 8
+  ret i64 %b
+}
+
+define i8 @array_index_sh4_sh0(ptr %p, i64 %idx1, i64 %idx2) {
+; CHECK-LABEL: array_index_sh4_sh0:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    slli a1, a1, 4
+; CHECK-NEXT:    add a0, a0, a2
+; CHECK-NEXT:    add a0, a0, a1
+; CHECK-NEXT:    lbu a0, 0(a0)
+; CHECK-NEXT:    ret
+  %a = getelementptr inbounds [16 x i8], ptr %p, i64 %idx1, i64 %idx2
+  %b = load i8, ptr %a, align 1
+  ret i8 %b
+}
+
+define i16 @array_index_sh4_sh1(ptr %p, i64 %idx1, i64 %idx2) {
+; RV64I-LABEL: array_index_sh4_sh1:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    slli a1, a1, 5
+; RV64I-NEXT:    add a0, a0, a1
+; RV64I-NEXT:    slli a2, a2, 1
+; RV64I-NEXT:    add a0, a0, a2
+; RV64I-NEXT:    lh a0, 0(a0)
+; RV64I-NEXT:    ret
+;
+; RV64XTHEADBA-LABEL: array_index_sh4_sh1:
+; RV64XTHEADBA:       # %bb.0:
+; RV64XTHEADBA-NEXT:    slli a1, a1, 5
+; RV64XTHEADBA-NEXT:    add a0, a0, a1
+; RV64XTHEADBA-NEXT:    th.addsl a0, a0, a2, 1
+; RV64XTHEADBA-NEXT:    lh a0, 0(a0)
+; RV64XTHEADBA-NEXT:    ret
+  %a = getelementptr inbounds [16 x i16], ptr %p, i64 %idx1, i64 %idx2
+  %b = load i16, ptr %a, align 2
+  ret i16 %b
+}
+
+define i32 @array_index_sh4_sh2(ptr %p, i64 %idx1, i64 %idx2) {
+; RV64I-LABEL: array_index_sh4_sh2:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    slli a1, a1, 6
+; RV64I-NEXT:    add a0, a0, a1
+; RV64I-NEXT:    slli a2, a2, 2
+; RV64I-NEXT:    add a0, a0, a2
+; RV64I-NEXT:    lw a0, 0(a0)
+; RV64I-NEXT:    ret
+;
+; RV64XTHEADBA-LABEL: array_index_sh4_sh2:
+; RV64XTHEADBA:       # %bb.0:
+; RV64XTHEADBA-NEXT:    slli a1, a1, 6
+; RV64XTHEADBA-NEXT:    add a0, a0, a1
+; RV64XTHEADBA-NEXT:    th.addsl a0, a0, a2, 2
+; RV64XTHEADBA-NEXT:    lw a0, 0(a0)
+; RV64XTHEADBA-NEXT:    ret
+  %a = getelementptr inbounds [16 x i32], ptr %p, i64 %idx1, i64 %idx2
+  %b = load i32, ptr %a, align 4
+  ret i32 %b
+}
+
+define i64 @array_index_sh4_sh3(ptr %p, i64 %idx1, i64 %idx2) {
+; RV64I-LABEL: array_index_sh4_sh3:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    slli a1, a1, 7
+; RV64I-NEXT:    add a0, a0, a1
+; RV64I-NEXT:    slli a2, a2, 3
+; RV64I-NEXT:    add a0, a0, a2
+; RV64I-NEXT:    ld a0, 0(a0)
+; RV64I-NEXT:    ret
+;
+; RV64XTHEADBA-LABEL: array_index_sh4_sh3:
+; RV64XTHEADBA:       # %bb.0:
+; RV64XTHEADBA-NEXT:    slli a1, a1, 7
+; RV64XTHEADBA-NEXT:    add a0, a0, a1
+; RV64XTHEADBA-NEXT:    th.addsl a0, a0, a2, 3
+; RV64XTHEADBA-NEXT:    ld a0, 0(a0)
+; RV64XTHEADBA-NEXT:    ret
+  %a = getelementptr inbounds [16 x i64], ptr %p, i64 %idx1, i64 %idx2
+  %b = load i64, ptr %a, align 8
+  ret i64 %b
+}
+
+define i64 @mul_neg1(i64 %a) {
+; CHECK-LABEL: mul_neg1:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    neg a0, a0
+; CHECK-NEXT:    ret
+  %c = mul i64 %a, -1
+  ret i64 %c
+}
+
+define i64 @mul_neg2(i64 %a) {
+; CHECK-LABEL: mul_neg2:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    slli a0, a0, 1
+; CHECK-NEXT:    neg a0, a0
+; CHECK-NEXT:    ret
+  %c = mul i64 %a, -2
+  ret i64 %c
+}
+
+define i64 @mul_neg3(i64 %a) {
+; RV64I-LABEL: mul_neg3:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    slli a1, a0, 1
+; RV64I-NEXT:    neg a0, a0
+; RV64I-NEXT:    sub a0, a0, a1
+; RV64I-NEXT:    ret
+;
+; RV64XTHEADBA-LABEL: mul_neg3:
+; RV64XTHEADBA:       # %bb.0:
+; RV64XTHEADBA-NEXT:    th.addsl a0, a0, a0, 1
+; RV64XTHEADBA-NEXT:    neg a0, a0
+; RV64XTHEADBA-NEXT:    ret
+  %c = mul i64 %a, -3
+  ret i64 %c
+}
+
+define i64 @mul_neg4(i64 %a) {
+; CHECK-LABEL: mul_neg4:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    slli a0, a0, 2
+; CHECK-NEXT:    neg a0, a0
+; CHECK-NEXT:    ret
+  %c = mul i64 %a, -4
+  ret i64 %c
+}
+
+define i64 @mul_neg5(i64 %a) {
+; RV64I-LABEL: mul_neg5:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    slli a1, a0, 2
+; RV64I-NEXT:    neg a0, a0
+; RV64I-NEXT:    sub a0, a0, a1
+; RV64I-NEXT:    ret
+;
+; RV64XTHEADBA-LABEL: mul_neg5:
+; RV64XTHEADBA:       # %bb.0:
+; RV64XTHEADBA-NEXT:    th.addsl a0, a0, a0, 2
+; RV64XTHEADBA-NEXT:    neg a0, a0
+; RV64XTHEADBA-NEXT:    ret
+  %c = mul i64 %a, -5
   ret i64 %c
 }
+
+define i64 @mul_neg6(i64 %a) {
+; CHECK-LABEL: mul_neg6:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a1, -6
+; CHECK-NEXT:    mul a0, a0, a1
+; CHECK-NEXT:    ret
+  %c = mul i64 %a, -6
+  ret i64 %c
+}
+
+define i64 @mul_neg7(i64 %a) {
+; CHECK-LABEL: mul_neg7:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    slli a1, a0, 3
+; CHECK-NEXT:    sub a0, a0, a1
+; CHECK-NEXT:    ret
+  %c = mul i64 %a, -7
+  ret i64 %c
+}
+
+define i64 @mul_neg8(i64 %a) {
+; CHECK-LABEL: mul_neg8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    slli a0, a0, 3
+; CHECK-NEXT:    neg a0, a0
+; CHECK-NEXT:    ret
+  %c = mul i64 %a, -8
+  ret i64 %c
+}
+
+define ptr @srai_srli_sh3add(ptr %0, i64 %1) nounwind {
+; RV64I-LABEL: srai_srli_sh3add:
+; RV64I:       # %bb.0: # %entry
+; RV64I-NEXT:    srai a1, a1, 32
+; RV64I-NEXT:    srli a1, a1, 6
+; RV64I-NEXT:    slli a1, a1, 3
+; RV64I-NEXT:    add a0, a0, a1
+; RV64I-NEXT:    ret
+;
+; RV64XTHEADBA-LABEL: srai_srli_sh3add:
+; RV64XTHEADBA:       # %bb.0: # %entry
+; RV64XTHEADBA-NEXT:    srai a1, a1, 32
+; RV64XTHEADBA-NEXT:    srli a1, a1, 6
+; RV64XTHEADBA-NEXT:    th.addsl a0, a0, a1, 3
+; RV64XTHEADBA-NEXT:    ret
+entry:
+  %2 = ashr i64 %1, 32
+  %3 = lshr i64 %2, 6
+  %4 = getelementptr i64, ptr %0, i64 %3
+  ret ptr %4
+}
+
+define ptr @srai_srli_slli(ptr %0, i64 %1) nounwind {
+; CHECK-LABEL: srai_srli_slli:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    srai a1, a1, 32
+; CHECK-NEXT:    srli a1, a1, 6
+; CHECK-NEXT:    slli a1, a1, 4
+; CHECK-NEXT:    add a0, a0, a1
+; CHECK-NEXT:    ret
+entry:
+  %2 = ashr i64 %1, 32
+  %3 = lshr i64 %2, 6
+  %4 = getelementptr i128, ptr %0, i64 %3
+  ret ptr %4
+}
+
+; Negative to make sure the peephole added for srai_srli_slli and
+; srai_srli_sh3add doesn't break this.
+define i64 @srai_andi(i64 %x) nounwind {
+; CHECK-LABEL: srai_andi:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    srai a0, a0, 8
+; CHECK-NEXT:    andi a0, a0, -8
+; CHECK-NEXT:    ret
+entry:
+  %y = ashr i64 %x, 8
+  %z = and i64 %y, -8
+  ret i64 %z
+}
+
+; Negative to make sure the peephole added for srai_srli_slli and
+; srai_srli_sh3add doesn't break this.
+define i64 @srai_lui_and(i64 %x) nounwind {
+; CHECK-LABEL: srai_lui_and:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    srai a0, a0, 8
+; CHECK-NEXT:    lui a1, 1048574
+; CHECK-NEXT:    and a0, a0, a1
+; CHECK-NEXT:    ret
+entry:
+  %y = ashr i64 %x, 8
+  %z = and i64 %y, -8192
+  ret i64 %z
+}
diff --git a/llvm/test/CodeGen/RISCV/rvv/bitreverse-vp.ll b/llvm/test/CodeGen/RISCV/rvv/bitreverse-vp.ll
index 10d2492..4d34621 100644
--- a/llvm/test/CodeGen/RISCV/rvv/bitreverse-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/bitreverse-vp.ll
@@ -1445,10 +1445,9 @@ define <vscale x 1 x i64> @vp_bitreverse_nxv1i64(<vscale x 1 x i64> %va, <vscale
 ; RV32-NEXT:    addi a6, sp, 8
 ; RV32-NEXT:    sw a4, 8(sp)
 ; RV32-NEXT:    sw zero, 12(sp)
-; RV32-NEXT:    vsetvli a4, zero, e64, m1, ta, ma
+; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
 ; RV32-NEXT:    vlse64.v v9, (a6), zero
 ; RV32-NEXT:    lui a4, 61681
-; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
 ; RV32-NEXT:    vsll.vx v10, v8, a3, v0.t
 ; RV32-NEXT:    addi a5, a5, -256
 ; RV32-NEXT:    vand.vx v11, v8, a5, v0.t
@@ -1595,9 +1594,7 @@ define <vscale x 1 x i64> @vp_bitreverse_nxv1i64_unmasked(<vscale x 1 x i64> %va
 ; RV32-NEXT:    vand.vx v13, v8, a1
 ; RV32-NEXT:    vand.vx v12, v12, a1
 ; RV32-NEXT:    vor.vv v11, v12, v11
-; RV32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
 ; RV32-NEXT:    vlse64.v v12, (a6), zero
-; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
 ; RV32-NEXT:    vsll.vx v13, v13, a4
 ; RV32-NEXT:    vor.vv v10, v10, v13
 ; RV32-NEXT:    vsrl.vi v13, v8, 8
@@ -1730,10 +1727,9 @@ define <vscale x 2 x i64> @vp_bitreverse_nxv2i64(<vscale x 2 x i64> %va, <vscale
 ; RV32-NEXT:    addi a6, sp, 8
 ; RV32-NEXT:    sw a4, 8(sp)
 ; RV32-NEXT:    sw zero, 12(sp)
-; RV32-NEXT:    vsetvli a4, zero, e64, m2, ta, ma
+; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
 ; RV32-NEXT:    vlse64.v v10, (a6), zero
 ; RV32-NEXT:    lui a4, 61681
-; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
 ; RV32-NEXT:    vsll.vx v12, v8, a3, v0.t
 ; RV32-NEXT:    addi a5, a5, -256
 ; RV32-NEXT:    vand.vx v14, v8, a5, v0.t
@@ -1880,9 +1876,7 @@ define <vscale x 2 x i64> @vp_bitreverse_nxv2i64_unmasked(<vscale x 2 x i64> %va
 ; RV32-NEXT:    vand.vx v18, v8, a1
 ; RV32-NEXT:    vand.vx v16, v16, a1
 ; RV32-NEXT:    vor.vv v10, v16, v10
-; RV32-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
 ; RV32-NEXT:    vlse64.v v16, (a6), zero
-; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
 ; RV32-NEXT:    vsll.vx v18, v18, a4
 ; RV32-NEXT:    vor.vv v12, v12, v18
 ; RV32-NEXT:    vsrl.vi v18, v8, 8
@@ -2015,10 +2009,9 @@ define <vscale x 4 x i64> @vp_bitreverse_nxv4i64(<vscale x 4 x i64> %va, <vscale
 ; RV32-NEXT:    addi a6, sp, 8
 ; RV32-NEXT:    sw a4, 8(sp)
 ; RV32-NEXT:    sw zero, 12(sp)
-; RV32-NEXT:    vsetvli a4, zero, e64, m4, ta, ma
+; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
 ; RV32-NEXT:    vlse64.v v12, (a6), zero
 ; RV32-NEXT:    lui a4, 61681
-; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
 ; RV32-NEXT:    vsll.vx v16, v8, a3, v0.t
 ; RV32-NEXT:    addi a5, a5, -256
 ; RV32-NEXT:    vand.vx v20, v8, a5, v0.t
@@ -2165,9 +2158,7 @@ define <vscale x 4 x i64> @vp_bitreverse_nxv4i64_unmasked(<vscale x 4 x i64> %va
 ; RV32-NEXT:    vand.vx v28, v8, a1
 ; RV32-NEXT:    vand.vx v24, v24, a1
 ; RV32-NEXT:    vor.vv v12, v24, v12
-; RV32-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
 ; RV32-NEXT:    vlse64.v v24, (a6), zero
-; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
 ; RV32-NEXT:    vsll.vx v28, v28, a4
 ; RV32-NEXT:    vor.vv v16, v16, v28
 ; RV32-NEXT:    vsrl.vi v28, v8, 8
@@ -2315,7 +2306,6 @@ define <vscale x 7 x i64> @vp_bitreverse_nxv7i64(<vscale x 7 x i64> %va, <vscale
 ; RV32-NEXT:    add a3, sp, a3
 ; RV32-NEXT:    addi a3, a3, 16
 ; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
-; RV32-NEXT:    vsetvli a3, zero, e64, m8, ta, ma
 ; RV32-NEXT:    vlse64.v v16, (a5), zero
 ; RV32-NEXT:    csrr a3, vlenb
 ; RV32-NEXT:    slli a3, a3, 3
@@ -2323,7 +2313,6 @@ define <vscale x 7 x i64> @vp_bitreverse_nxv7i64(<vscale x 7 x i64> %va, <vscale
 ; RV32-NEXT:    addi a3, a3, 16
 ; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
 ; RV32-NEXT:    lui a3, 4080
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vand.vx v24, v8, a3, v0.t
 ; RV32-NEXT:    vsll.vi v24, v24, 24, v0.t
 ; RV32-NEXT:    addi a5, sp, 16
@@ -2528,9 +2517,7 @@ define <vscale x 7 x i64> @vp_bitreverse_nxv7i64_unmasked(<vscale x 7 x i64> %va
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
 ; RV32-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
-; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
 ; RV32-NEXT:    vlse64.v v24, (a6), zero
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vsrl.vi v16, v8, 24
 ; RV32-NEXT:    vand.vx v16, v16, a5
 ; RV32-NEXT:    vsrl.vi v0, v8, 8
@@ -2704,7 +2691,6 @@ define <vscale x 8 x i64> @vp_bitreverse_nxv8i64(<vscale x 8 x i64> %va, <vscale
 ; RV32-NEXT:    add a3, sp, a3
 ; RV32-NEXT:    addi a3, a3, 16
 ; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
-; RV32-NEXT:    vsetvli a3, zero, e64, m8, ta, ma
 ; RV32-NEXT:    vlse64.v v16, (a5), zero
 ; RV32-NEXT:    csrr a3, vlenb
 ; RV32-NEXT:    slli a3, a3, 3
@@ -2712,7 +2698,6 @@ define <vscale x 8 x i64> @vp_bitreverse_nxv8i64(<vscale x 8 x i64> %va, <vscale
 ; RV32-NEXT:    addi a3, a3, 16
 ; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
 ; RV32-NEXT:    lui a3, 4080
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vand.vx v24, v8, a3, v0.t
 ; RV32-NEXT:    vsll.vi v24, v24, 24, v0.t
 ; RV32-NEXT:    addi a5, sp, 16
@@ -2917,9 +2902,7 @@ define <vscale x 8 x i64> @vp_bitreverse_nxv8i64_unmasked(<vscale x 8 x i64> %va
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
 ; RV32-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
-; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
 ; RV32-NEXT:    vlse64.v v24, (a6), zero
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vsrl.vi v16, v8, 24
 ; RV32-NEXT:    vand.vx v16, v16, a5
 ; RV32-NEXT:    vsrl.vi v0, v8, 8
diff --git a/llvm/test/CodeGen/RISCV/rvv/bswap-vp.ll b/llvm/test/CodeGen/RISCV/rvv/bswap-vp.ll
index 0dc1d0c..0c58cca 100644
--- a/llvm/test/CodeGen/RISCV/rvv/bswap-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/bswap-vp.ll
@@ -523,11 +523,9 @@ define <vscale x 1 x i64> @vp_bswap_nxv1i64(<vscale x 1 x i64> %va, <vscale x 1
 ; RV32-NEXT:    sw zero, 12(sp)
 ; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
 ; RV32-NEXT:    vsll.vx v9, v8, a2, v0.t
-; RV32-NEXT:    addi a1, a3, -256
-; RV32-NEXT:    vand.vx v10, v8, a1, v0.t
-; RV32-NEXT:    vsetvli a3, zero, e64, m1, ta, ma
+; RV32-NEXT:    addi a0, a3, -256
+; RV32-NEXT:    vand.vx v10, v8, a0, v0.t
 ; RV32-NEXT:    vlse64.v v11, (a6), zero
-; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
 ; RV32-NEXT:    vsll.vx v10, v10, a4, v0.t
 ; RV32-NEXT:    vor.vv v9, v9, v10, v0.t
 ; RV32-NEXT:    vand.vx v10, v8, a5, v0.t
@@ -538,7 +536,7 @@ define <vscale x 1 x i64> @vp_bswap_nxv1i64(<vscale x 1 x i64> %va, <vscale x 1
 ; RV32-NEXT:    vor.vv v9, v9, v10, v0.t
 ; RV32-NEXT:    vsrl.vx v10, v8, a2, v0.t
 ; RV32-NEXT:    vsrl.vx v12, v8, a4, v0.t
-; RV32-NEXT:    vand.vx v12, v12, a1, v0.t
+; RV32-NEXT:    vand.vx v12, v12, a0, v0.t
 ; RV32-NEXT:    vor.vv v10, v12, v10, v0.t
 ; RV32-NEXT:    vsrl.vi v12, v8, 24, v0.t
 ; RV32-NEXT:    vand.vx v12, v12, a5, v0.t
@@ -609,15 +607,13 @@ define <vscale x 1 x i64> @vp_bswap_nxv1i64_unmasked(<vscale x 1 x i64> %va, i32
 ; RV32-NEXT:    sw a1, 8(sp)
 ; RV32-NEXT:    sw zero, 12(sp)
 ; RV32-NEXT:    vsll.vx v10, v8, a2
-; RV32-NEXT:    addi a1, a3, -256
+; RV32-NEXT:    addi a0, a3, -256
 ; RV32-NEXT:    vsrl.vx v11, v8, a2
 ; RV32-NEXT:    vsrl.vx v12, v8, a4
-; RV32-NEXT:    vand.vx v13, v8, a1
-; RV32-NEXT:    vand.vx v12, v12, a1
+; RV32-NEXT:    vand.vx v13, v8, a0
+; RV32-NEXT:    vand.vx v12, v12, a0
 ; RV32-NEXT:    vor.vv v11, v12, v11
-; RV32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
 ; RV32-NEXT:    vlse64.v v12, (a6), zero
-; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
 ; RV32-NEXT:    vsll.vx v13, v13, a4
 ; RV32-NEXT:    vor.vv v10, v10, v13
 ; RV32-NEXT:    vsrl.vi v13, v8, 8
@@ -695,11 +691,9 @@ define <vscale x 2 x i64> @vp_bswap_nxv2i64(<vscale x 2 x i64> %va, <vscale x 2
 ; RV32-NEXT:    sw zero, 12(sp)
 ; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
 ; RV32-NEXT:    vsll.vx v10, v8, a2, v0.t
-; RV32-NEXT:    addi a1, a3, -256
-; RV32-NEXT:    vand.vx v12, v8, a1, v0.t
-; RV32-NEXT:    vsetvli a3, zero, e64, m2, ta, ma
+; RV32-NEXT:    addi a0, a3, -256
+; RV32-NEXT:    vand.vx v12, v8, a0, v0.t
 ; RV32-NEXT:    vlse64.v v14, (a6), zero
-; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
 ; RV32-NEXT:    vsll.vx v12, v12, a4, v0.t
 ; RV32-NEXT:    vor.vv v10, v10, v12, v0.t
 ; RV32-NEXT:    vand.vx v12, v8, a5, v0.t
@@ -710,7 +704,7 @@ define <vscale x 2 x i64> @vp_bswap_nxv2i64(<vscale x 2 x i64> %va, <vscale x 2
 ; RV32-NEXT:    vor.vv v10, v10, v12, v0.t
 ; RV32-NEXT:    vsrl.vx v12, v8, a2, v0.t
 ; RV32-NEXT:    vsrl.vx v16, v8, a4, v0.t
-; RV32-NEXT:    vand.vx v16, v16, a1, v0.t
+; RV32-NEXT:    vand.vx v16, v16, a0, v0.t
 ; RV32-NEXT:    vor.vv v12, v16, v12, v0.t
 ; RV32-NEXT:    vsrl.vi v16, v8, 24, v0.t
 ; RV32-NEXT:    vand.vx v16, v16, a5, v0.t
@@ -781,15 +775,13 @@ define <vscale x 2 x i64> @vp_bswap_nxv2i64_unmasked(<vscale x 2 x i64> %va, i32
 ; RV32-NEXT:    sw a1, 8(sp)
 ; RV32-NEXT:    sw zero, 12(sp)
 ; RV32-NEXT:    vsll.vx v12, v8, a2
-; RV32-NEXT:    addi a1, a3, -256
+; RV32-NEXT:    addi a0, a3, -256
 ; RV32-NEXT:    vsrl.vx v14, v8, a2
 ; RV32-NEXT:    vsrl.vx v16, v8, a4
-; RV32-NEXT:    vand.vx v18, v8, a1
-; RV32-NEXT:    vand.vx v16, v16, a1
+; RV32-NEXT:    vand.vx v18, v8, a0
+; RV32-NEXT:    vand.vx v16, v16, a0
 ; RV32-NEXT:    vor.vv v14, v16, v14
-; RV32-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
 ; RV32-NEXT:    vlse64.v v16, (a6), zero
-; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
 ; RV32-NEXT:    vsll.vx v18, v18, a4
 ; RV32-NEXT:    vor.vv v12, v12, v18
 ; RV32-NEXT:    vsrl.vi v18, v8, 8
@@ -867,11 +859,9 @@ define <vscale x 4 x i64> @vp_bswap_nxv4i64(<vscale x 4 x i64> %va, <vscale x 4
 ; RV32-NEXT:    sw zero, 12(sp)
 ; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
 ; RV32-NEXT:    vsll.vx v16, v8, a2, v0.t
-; RV32-NEXT:    addi a1, a3, -256
-; RV32-NEXT:    vand.vx v20, v8, a1, v0.t
-; RV32-NEXT:    vsetvli a3, zero, e64, m4, ta, ma
+; RV32-NEXT:    addi a0, a3, -256
+; RV32-NEXT:    vand.vx v20, v8, a0, v0.t
 ; RV32-NEXT:    vlse64.v v12, (a6), zero
-; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
 ; RV32-NEXT:    vsll.vx v20, v20, a4, v0.t
 ; RV32-NEXT:    vor.vv v16, v16, v20, v0.t
 ; RV32-NEXT:    vand.vx v20, v8, a5, v0.t
@@ -882,7 +872,7 @@ define <vscale x 4 x i64> @vp_bswap_nxv4i64(<vscale x 4 x i64> %va, <vscale x 4
 ; RV32-NEXT:    vor.vv v16, v16, v20, v0.t
 ; RV32-NEXT:    vsrl.vx v20, v8, a2, v0.t
 ; RV32-NEXT:    vsrl.vx v24, v8, a4, v0.t
-; RV32-NEXT:    vand.vx v24, v24, a1, v0.t
+; RV32-NEXT:    vand.vx v24, v24, a0, v0.t
 ; RV32-NEXT:    vor.vv v20, v24, v20, v0.t
 ; RV32-NEXT:    vsrl.vi v24, v8, 24, v0.t
 ; RV32-NEXT:    vand.vx v24, v24, a5, v0.t
@@ -953,15 +943,13 @@ define <vscale x 4 x i64> @vp_bswap_nxv4i64_unmasked(<vscale x 4 x i64> %va, i32
 ; RV32-NEXT:    sw a1, 8(sp)
 ; RV32-NEXT:    sw zero, 12(sp)
 ; RV32-NEXT:    vsll.vx v16, v8, a2
-; RV32-NEXT:    addi a1, a3, -256
+; RV32-NEXT:    addi a0, a3, -256
 ; RV32-NEXT:    vsrl.vx v20, v8, a2
 ; RV32-NEXT:    vsrl.vx v24, v8, a4
-; RV32-NEXT:    vand.vx v28, v8, a1
-; RV32-NEXT:    vand.vx v24, v24, a1
+; RV32-NEXT:    vand.vx v28, v8, a0
+; RV32-NEXT:    vand.vx v24, v24, a0
 ; RV32-NEXT:    vor.vv v20, v24, v20
-; RV32-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
 ; RV32-NEXT:    vlse64.v v24, (a6), zero
-; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
 ; RV32-NEXT:    vsll.vx v28, v28, a4
 ; RV32-NEXT:    vor.vv v16, v16, v28
 ; RV32-NEXT:    vsrl.vi v28, v8, 8
@@ -1043,51 +1031,49 @@ define <vscale x 7 x i64> @vp_bswap_nxv7i64(<vscale x 7 x i64> %va, <vscale x 7
 ; RV32-NEXT:    sw zero, 12(sp)
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vsll.vx v16, v8, a2, v0.t
-; RV32-NEXT:    addi a1, a3, -256
-; RV32-NEXT:    vand.vx v24, v8, a1, v0.t
+; RV32-NEXT:    addi a0, a3, -256
+; RV32-NEXT:    vand.vx v24, v8, a0, v0.t
 ; RV32-NEXT:    vsll.vx v24, v24, a4, v0.t
 ; RV32-NEXT:    vor.vv v16, v16, v24, v0.t
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    slli a1, a1, 4
+; RV32-NEXT:    add a1, sp, a1
+; RV32-NEXT:    addi a1, a1, 16
+; RV32-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; RV32-NEXT:    vlse64.v v16, (a5), zero
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    slli a1, a1, 3
+; RV32-NEXT:    add a1, sp, a1
+; RV32-NEXT:    addi a1, a1, 16
+; RV32-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; RV32-NEXT:    lui a1, 4080
+; RV32-NEXT:    vand.vx v24, v8, a1, v0.t
+; RV32-NEXT:    vsll.vi v24, v24, 24, v0.t
+; RV32-NEXT:    addi a3, sp, 16
+; RV32-NEXT:    vs8r.v v24, (a3) # Unknown-size Folded Spill
+; RV32-NEXT:    vand.vv v24, v8, v16, v0.t
+; RV32-NEXT:    vsll.vi v16, v24, 8, v0.t
+; RV32-NEXT:    vl8r.v v24, (a3) # Unknown-size Folded Reload
+; RV32-NEXT:    vor.vv v16, v24, v16, v0.t
 ; RV32-NEXT:    csrr a3, vlenb
 ; RV32-NEXT:    slli a3, a3, 4
 ; RV32-NEXT:    add a3, sp, a3
 ; RV32-NEXT:    addi a3, a3, 16
-; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
-; RV32-NEXT:    vsetvli a3, zero, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a5), zero
+; RV32-NEXT:    vl8r.v v24, (a3) # Unknown-size Folded Reload
+; RV32-NEXT:    vor.vv v16, v24, v16, v0.t
 ; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 3
+; RV32-NEXT:    slli a3, a3, 4
 ; RV32-NEXT:    add a3, sp, a3
 ; RV32-NEXT:    addi a3, a3, 16
 ; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
-; RV32-NEXT:    lui a3, 4080
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vx v24, v8, a3, v0.t
-; RV32-NEXT:    vsll.vi v24, v24, 24, v0.t
-; RV32-NEXT:    addi a0, sp, 16
-; RV32-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
-; RV32-NEXT:    vand.vv v24, v8, v16, v0.t
-; RV32-NEXT:    vsll.vi v16, v24, 8, v0.t
-; RV32-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vor.vv v16, v24, v16, v0.t
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 4
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 16
-; RV32-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vor.vv v16, v24, v16, v0.t
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 4
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 16
-; RV32-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
 ; RV32-NEXT:    vsrl.vx v16, v8, a2, v0.t
 ; RV32-NEXT:    vsrl.vx v24, v8, a4, v0.t
-; RV32-NEXT:    vand.vx v24, v24, a1, v0.t
+; RV32-NEXT:    vand.vx v24, v24, a0, v0.t
 ; RV32-NEXT:    vor.vv v16, v24, v16, v0.t
 ; RV32-NEXT:    addi a0, sp, 16
 ; RV32-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
 ; RV32-NEXT:    vsrl.vi v24, v8, 24, v0.t
-; RV32-NEXT:    vand.vx v24, v24, a3, v0.t
+; RV32-NEXT:    vand.vx v24, v24, a1, v0.t
 ; RV32-NEXT:    vsrl.vi v8, v8, 8, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    slli a0, a0, 3
@@ -1193,24 +1179,22 @@ define <vscale x 7 x i64> @vp_bswap_nxv7i64_unmasked(<vscale x 7 x i64> %va, i32
 ; RV32-NEXT:    sw zero, 12(sp)
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vsll.vx v24, v8, a2
-; RV32-NEXT:    addi a1, a3, -256
+; RV32-NEXT:    addi a0, a3, -256
 ; RV32-NEXT:    vsrl.vx v16, v8, a2
 ; RV32-NEXT:    vsrl.vx v0, v8, a4
-; RV32-NEXT:    vand.vx v0, v0, a1
+; RV32-NEXT:    vand.vx v0, v0, a0
 ; RV32-NEXT:    vor.vv v16, v0, v16
-; RV32-NEXT:    csrr a2, vlenb
-; RV32-NEXT:    slli a2, a2, 3
-; RV32-NEXT:    add a2, sp, a2
-; RV32-NEXT:    addi a2, a2, 16
-; RV32-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
-; RV32-NEXT:    vand.vx v0, v8, a1
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    slli a1, a1, 3
+; RV32-NEXT:    add a1, sp, a1
+; RV32-NEXT:    addi a1, a1, 16
+; RV32-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; RV32-NEXT:    vand.vx v0, v8, a0
 ; RV32-NEXT:    vsll.vx v0, v0, a4
 ; RV32-NEXT:    vor.vv v16, v24, v0
-; RV32-NEXT:    addi a1, sp, 16
-; RV32-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
-; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
+; RV32-NEXT:    addi a0, sp, 16
+; RV32-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
 ; RV32-NEXT:    vlse64.v v0, (a6), zero
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vsrl.vi v16, v8, 24
 ; RV32-NEXT:    vand.vx v16, v16, a5
 ; RV32-NEXT:    vsrl.vi v24, v8, 8
@@ -1221,7 +1205,6 @@ define <vscale x 7 x i64> @vp_bswap_nxv7i64_unmasked(<vscale x 7 x i64> %va, i32
 ; RV32-NEXT:    vsll.vi v8, v8, 24
 ; RV32-NEXT:    vsll.vi v24, v24, 8
 ; RV32-NEXT:    vor.vv v8, v8, v24
-; RV32-NEXT:    addi a0, sp, 16
 ; RV32-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    vor.vv v8, v24, v8
 ; RV32-NEXT:    csrr a0, vlenb
@@ -1318,51 +1301,49 @@ define <vscale x 8 x i64> @vp_bswap_nxv8i64(<vscale x 8 x i64> %va, <vscale x 8
 ; RV32-NEXT:    sw zero, 12(sp)
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vsll.vx v16, v8, a2, v0.t
-; RV32-NEXT:    addi a1, a3, -256
-; RV32-NEXT:    vand.vx v24, v8, a1, v0.t
+; RV32-NEXT:    addi a0, a3, -256
+; RV32-NEXT:    vand.vx v24, v8, a0, v0.t
 ; RV32-NEXT:    vsll.vx v24, v24, a4, v0.t
 ; RV32-NEXT:    vor.vv v16, v16, v24, v0.t
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    slli a1, a1, 4
+; RV32-NEXT:    add a1, sp, a1
+; RV32-NEXT:    addi a1, a1, 16
+; RV32-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; RV32-NEXT:    vlse64.v v16, (a5), zero
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    slli a1, a1, 3
+; RV32-NEXT:    add a1, sp, a1
+; RV32-NEXT:    addi a1, a1, 16
+; RV32-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; RV32-NEXT:    lui a1, 4080
+; RV32-NEXT:    vand.vx v24, v8, a1, v0.t
+; RV32-NEXT:    vsll.vi v24, v24, 24, v0.t
+; RV32-NEXT:    addi a3, sp, 16
+; RV32-NEXT:    vs8r.v v24, (a3) # Unknown-size Folded Spill
+; RV32-NEXT:    vand.vv v24, v8, v16, v0.t
+; RV32-NEXT:    vsll.vi v16, v24, 8, v0.t
+; RV32-NEXT:    vl8r.v v24, (a3) # Unknown-size Folded Reload
+; RV32-NEXT:    vor.vv v16, v24, v16, v0.t
 ; RV32-NEXT:    csrr a3, vlenb
 ; RV32-NEXT:    slli a3, a3, 4
 ; RV32-NEXT:    add a3, sp, a3
 ; RV32-NEXT:    addi a3, a3, 16
-; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
-; RV32-NEXT:    vsetvli a3, zero, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a5), zero
+; RV32-NEXT:    vl8r.v v24, (a3) # Unknown-size Folded Reload
+; RV32-NEXT:    vor.vv v16, v24, v16, v0.t
 ; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 3
+; RV32-NEXT:    slli a3, a3, 4
 ; RV32-NEXT:    add a3, sp, a3
 ; RV32-NEXT:    addi a3, a3, 16
 ; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
-; RV32-NEXT:    lui a3, 4080
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vx v24, v8, a3, v0.t
-; RV32-NEXT:    vsll.vi v24, v24, 24, v0.t
-; RV32-NEXT:    addi a0, sp, 16
-; RV32-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
-; RV32-NEXT:    vand.vv v24, v8, v16, v0.t
-; RV32-NEXT:    vsll.vi v16, v24, 8, v0.t
-; RV32-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vor.vv v16, v24, v16, v0.t
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 4
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 16
-; RV32-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vor.vv v16, v24, v16, v0.t
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 4
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 16
-; RV32-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
 ; RV32-NEXT:    vsrl.vx v16, v8, a2, v0.t
 ; RV32-NEXT:    vsrl.vx v24, v8, a4, v0.t
-; RV32-NEXT:    vand.vx v24, v24, a1, v0.t
+; RV32-NEXT:    vand.vx v24, v24, a0, v0.t
 ; RV32-NEXT:    vor.vv v16, v24, v16, v0.t
 ; RV32-NEXT:    addi a0, sp, 16
 ; RV32-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
 ; RV32-NEXT:    vsrl.vi v24, v8, 24, v0.t
-; RV32-NEXT:    vand.vx v24, v24, a3, v0.t
+; RV32-NEXT:    vand.vx v24, v24, a1, v0.t
 ; RV32-NEXT:    vsrl.vi v8, v8, 8, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    slli a0, a0, 3
@@ -1468,24 +1449,22 @@ define <vscale x 8 x i64> @vp_bswap_nxv8i64_unmasked(<vscale x 8 x i64> %va, i32
 ; RV32-NEXT:    sw zero, 12(sp)
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vsll.vx v24, v8, a2
-; RV32-NEXT:    addi a1, a3, -256
+; RV32-NEXT:    addi a0, a3, -256
 ; RV32-NEXT:    vsrl.vx v16, v8, a2
 ; RV32-NEXT:    vsrl.vx v0, v8, a4
-; RV32-NEXT:    vand.vx v0, v0, a1
+; RV32-NEXT:    vand.vx v0, v0, a0
 ; RV32-NEXT:    vor.vv v16, v0, v16
-; RV32-NEXT:    csrr a2, vlenb
-; RV32-NEXT:    slli a2, a2, 3
-; RV32-NEXT:    add a2, sp, a2
-; RV32-NEXT:    addi a2, a2, 16
-; RV32-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
-; RV32-NEXT:    vand.vx v0, v8, a1
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    slli a1, a1, 3
+; RV32-NEXT:    add a1, sp, a1
+; RV32-NEXT:    addi a1, a1, 16
+; RV32-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; RV32-NEXT:    vand.vx v0, v8, a0
 ; RV32-NEXT:    vsll.vx v0, v0, a4
 ; RV32-NEXT:    vor.vv v16, v24, v0
-; RV32-NEXT:    addi a1, sp, 16
-; RV32-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
-; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
+; RV32-NEXT:    addi a0, sp, 16
+; RV32-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
 ; RV32-NEXT:    vlse64.v v0, (a6), zero
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vsrl.vi v16, v8, 24
 ; RV32-NEXT:    vand.vx v16, v16, a5
 ; RV32-NEXT:    vsrl.vi v24, v8, 8
@@ -1496,7 +1475,6 @@ define <vscale x 8 x i64> @vp_bswap_nxv8i64_unmasked(<vscale x 8 x i64> %va, i32
 ; RV32-NEXT:    vsll.vi v8, v8, 24
 ; RV32-NEXT:    vsll.vi v24, v24, 8
 ; RV32-NEXT:    vor.vv v8, v8, v24
-; RV32-NEXT:    addi a0, sp, 16
 ; RV32-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    vor.vv v8, v24, v8
 ; RV32-NEXT:    csrr a0, vlenb
@@ -1716,11 +1694,9 @@ define <vscale x 1 x i48> @vp_bswap_nxv1i48(<vscale x 1 x i48> %va, <vscale x 1
 ; RV32-NEXT:    sw zero, 12(sp)
 ; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
 ; RV32-NEXT:    vsll.vx v9, v8, a2, v0.t
-; RV32-NEXT:    addi a1, a3, -256
-; RV32-NEXT:    vand.vx v10, v8, a1, v0.t
-; RV32-NEXT:    vsetvli a3, zero, e64, m1, ta, ma
+; RV32-NEXT:    addi a0, a3, -256
+; RV32-NEXT:    vand.vx v10, v8, a0, v0.t
 ; RV32-NEXT:    vlse64.v v11, (a6), zero
-; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
 ; RV32-NEXT:    vsll.vx v10, v10, a4, v0.t
 ; RV32-NEXT:    vor.vv v9, v9, v10, v0.t
 ; RV32-NEXT:    vand.vx v10, v8, a5, v0.t
@@ -1731,7 +1707,7 @@ define <vscale x 1 x i48> @vp_bswap_nxv1i48(<vscale x 1 x i48> %va, <vscale x 1
 ; RV32-NEXT:    vor.vv v9, v9, v10, v0.t
 ; RV32-NEXT:    vsrl.vx v10, v8, a2, v0.t
 ; RV32-NEXT:    vsrl.vx v12, v8, a4, v0.t
-; RV32-NEXT:    vand.vx v12, v12, a1, v0.t
+; RV32-NEXT:    vand.vx v12, v12, a0, v0.t
 ; RV32-NEXT:    vor.vv v10, v12, v10, v0.t
 ; RV32-NEXT:    vsrl.vi v12, v8, 24, v0.t
 ; RV32-NEXT:    vand.vx v12, v12, a5, v0.t
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-neg-abs.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-neg-abs.ll
new file mode 100644
index 0000000..6f1efb6
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-neg-abs.ll
@@ -0,0 +1,54 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=riscv64 -mattr=+v -verify-machineinstrs < %s | FileCheck %s
+
+define <2 x i64> @expanded_fixed_neg_abs64(<2 x i64> %x) {
+; CHECK-LABEL: expanded_fixed_neg_abs64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; CHECK-NEXT:    vrsub.vi v9, v8, 0
+; CHECK-NEXT:    vmin.vv v8, v8, v9
+; CHECK-NEXT:    ret
+  %t = sub <2 x i64> <i64 0, i64 0>, %x
+  %t1 = call <2 x i64> @llvm.smax.v2i64(<2 x i64> %t, <2 x i64> %x)
+  %t2 = sub <2 x i64> <i64 0, i64 0>, %t1
+  ret <2 x i64> %t2
+}
+
+define <2 x i64> @expanded_fixed_neg_abs64_unsigned(<2 x i64> %x) {
+; CHECK-LABEL: expanded_fixed_neg_abs64_unsigned:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; CHECK-NEXT:    vrsub.vi v9, v8, 0
+; CHECK-NEXT:    vminu.vv v8, v8, v9
+; CHECK-NEXT:    ret
+  %t = sub <2 x i64> <i64 0, i64 0>, %x
+  %t1 = call <2 x i64> @llvm.umax.v2i64(<2 x i64> %t, <2 x i64> %x)
+  %t2 = sub <2 x i64> <i64 0, i64 0>, %t1
+  ret <2 x i64> %t2
+}
+
+define <2 x i64> @expanded_fixed_neg_inv_abs64(<2 x i64> %x) {
+; CHECK-LABEL: expanded_fixed_neg_inv_abs64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; CHECK-NEXT:    vrsub.vi v9, v8, 0
+; CHECK-NEXT:    vmax.vv v8, v8, v9
+; CHECK-NEXT:    ret
+  %t = sub <2 x i64> <i64 0, i64 0>, %x
+  %t1 = call <2 x i64> @llvm.smin.v2i64(<2 x i64> %t, <2 x i64> %x)
+  %t2 = sub <2 x i64> <i64 0, i64 0>, %t1
+  ret <2 x i64> %t2
+}
+
+define <2 x i64> @expanded_fixed_neg_inv_abs64_unsigned(<2 x i64> %x) {
+; CHECK-LABEL: expanded_fixed_neg_inv_abs64_unsigned:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; CHECK-NEXT:    vrsub.vi v9, v8, 0
+; CHECK-NEXT:    vmaxu.vv v8, v8, v9
+; CHECK-NEXT:    ret
+  %t = sub <2 x i64> <i64 0, i64 0>, %x
+  %t1 = call <2 x i64> @llvm.umin.v2i64(<2 x i64> %t, <2 x i64> %x)
+  %t2 = sub <2 x i64> <i64 0, i64 0>, %t1
+  ret <2 x i64> %t2
+}
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp.ll
index 15793ea..66952ca 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp.ll
@@ -1254,12 +1254,10 @@ define void @copysign_neg_trunc_v4f16_v4f32(ptr %x, ptr %y) {
 define void @copysign_neg_trunc_v3f16_v3f32(ptr %x, ptr %y) {
 ; ZVFH-LABEL: copysign_neg_trunc_v3f16_v3f32:
 ; ZVFH:       # %bb.0:
-; ZVFH-NEXT:    vsetivli zero, 3, e32, m1, ta, ma
+; ZVFH-NEXT:    vsetivli zero, 3, e16, mf2, ta, ma
 ; ZVFH-NEXT:    vle32.v v8, (a1)
 ; ZVFH-NEXT:    vle16.v v9, (a0)
-; ZVFH-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
 ; ZVFH-NEXT:    vfncvt.f.f.w v10, v8
-; ZVFH-NEXT:    vsetivli zero, 3, e16, mf2, ta, ma
 ; ZVFH-NEXT:    vfsgnjn.vv v8, v9, v10
 ; ZVFH-NEXT:    vse16.v v8, (a0)
 ; ZVFH-NEXT:    ret
@@ -1272,9 +1270,7 @@ define void @copysign_neg_trunc_v3f16_v3f32(ptr %x, ptr %y) {
 ; ZVFHMIN-NEXT:    lui a1, 8
 ; ZVFHMIN-NEXT:    addi a2, a1, -1
 ; ZVFHMIN-NEXT:    vand.vx v8, v8, a2
-; ZVFHMIN-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v10, v9
-; ZVFHMIN-NEXT:    vsetivli zero, 3, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vxor.vx v9, v10, a1
 ; ZVFHMIN-NEXT:    vand.vx v9, v9, a1
 ; ZVFHMIN-NEXT:    vor.vv v8, v8, v9
@@ -4013,9 +4009,10 @@ define void @trunc_v6f16(ptr %x) {
 ; ZVFH-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
 ; ZVFH-NEXT:    vfabs.v v9, v8
 ; ZVFH-NEXT:    vmflt.vf v0, v9, fa5
+; ZVFH-NEXT:    vsetivli zero, 6, e16, m1, ta, ma
 ; ZVFH-NEXT:    vfcvt.rtz.x.f.v v9, v8, v0.t
 ; ZVFH-NEXT:    vfcvt.f.x.v v9, v9, v0.t
-; ZVFH-NEXT:    vsetivli zero, 6, e16, m1, ta, mu
+; ZVFH-NEXT:    vsetvli zero, zero, e16, m1, ta, mu
 ; ZVFH-NEXT:    vfsgnj.vv v8, v9, v8, v0.t
 ; ZVFH-NEXT:    vse16.v v8, (a0)
 ; ZVFH-NEXT:    ret
@@ -4197,10 +4194,11 @@ define void @ceil_v6f16(ptr %x) {
 ; ZVFH-NEXT:    vfabs.v v9, v8
 ; ZVFH-NEXT:    vmflt.vf v0, v9, fa5
 ; ZVFH-NEXT:    fsrmi a1, 3
+; ZVFH-NEXT:    vsetivli zero, 6, e16, m1, ta, ma
 ; ZVFH-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; ZVFH-NEXT:    fsrm a1
 ; ZVFH-NEXT:    vfcvt.f.x.v v9, v9, v0.t
-; ZVFH-NEXT:    vsetivli zero, 6, e16, m1, ta, mu
+; ZVFH-NEXT:    vsetvli zero, zero, e16, m1, ta, mu
 ; ZVFH-NEXT:    vfsgnj.vv v8, v9, v8, v0.t
 ; ZVFH-NEXT:    vse16.v v8, (a0)
 ; ZVFH-NEXT:    ret
@@ -4388,10 +4386,11 @@ define void @floor_v6f16(ptr %x) {
 ; ZVFH-NEXT:    vfabs.v v9, v8
 ; ZVFH-NEXT:    vmflt.vf v0, v9, fa5
 ; ZVFH-NEXT:    fsrmi a1, 2
+; ZVFH-NEXT:    vsetivli zero, 6, e16, m1, ta, ma
 ; ZVFH-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; ZVFH-NEXT:    fsrm a1
 ; ZVFH-NEXT:    vfcvt.f.x.v v9, v9, v0.t
-; ZVFH-NEXT:    vsetivli zero, 6, e16, m1, ta, mu
+; ZVFH-NEXT:    vsetvli zero, zero, e16, m1, ta, mu
 ; ZVFH-NEXT:    vfsgnj.vv v8, v9, v8, v0.t
 ; ZVFH-NEXT:    vse16.v v8, (a0)
 ; ZVFH-NEXT:    ret
@@ -4579,10 +4578,11 @@ define void @round_v6f16(ptr %x) {
 ; ZVFH-NEXT:    vfabs.v v9, v8
 ; ZVFH-NEXT:    vmflt.vf v0, v9, fa5
 ; ZVFH-NEXT:    fsrmi a1, 4
+; ZVFH-NEXT:    vsetivli zero, 6, e16, m1, ta, ma
 ; ZVFH-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; ZVFH-NEXT:    fsrm a1
 ; ZVFH-NEXT:    vfcvt.f.x.v v9, v9, v0.t
-; ZVFH-NEXT:    vsetivli zero, 6, e16, m1, ta, mu
+; ZVFH-NEXT:    vsetvli zero, zero, e16, m1, ta, mu
 ; ZVFH-NEXT:    vfsgnj.vv v8, v9, v8, v0.t
 ; ZVFH-NEXT:    vse16.v v8, (a0)
 ; ZVFH-NEXT:    ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int.ll
index 59c7feb..80e462c 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int.ll
@@ -1142,9 +1142,7 @@ define void @mulhu_v6i16(ptr %x) {
 ; CHECK-NEXT:    vle16.v v8, (a0)
 ; CHECK-NEXT:    lui a1, %hi(.LCPI67_0)
 ; CHECK-NEXT:    addi a1, a1, %lo(.LCPI67_0)
-; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
 ; CHECK-NEXT:    vle16.v v9, (a1)
-; CHECK-NEXT:    vsetivli zero, 6, e16, m1, ta, ma
 ; CHECK-NEXT:    vdivu.vv v8, v8, v9
 ; CHECK-NEXT:    vse16.v v8, (a0)
 ; CHECK-NEXT:    ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-formation.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-formation.ll
index 4f0f5dd..bf8baaf 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-formation.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-formation.ll
@@ -530,7 +530,7 @@ define i32 @reduce_and_16xi32_prefix5(ptr %p) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 5, e32, m2, ta, ma
 ; CHECK-NEXT:    vle32.v v8, (a0)
-; CHECK-NEXT:    vsetivli zero, 5, e32, m1, ta, ma
+; CHECK-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; CHECK-NEXT:    vmv.v.i v10, -1
 ; CHECK-NEXT:    vsetivli zero, 5, e32, m2, ta, ma
 ; CHECK-NEXT:    vredand.vs v8, v8, v10
@@ -725,7 +725,7 @@ define i32 @reduce_umin_16xi32_prefix5(ptr %p) {
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    vsetivli zero, 5, e32, m2, ta, ma
 ; RV32-NEXT:    vle32.v v8, (a0)
-; RV32-NEXT:    vsetivli zero, 5, e32, m1, ta, ma
+; RV32-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; RV32-NEXT:    vmv.v.i v10, -1
 ; RV32-NEXT:    vsetivli zero, 5, e32, m2, ta, ma
 ; RV32-NEXT:    vredminu.vs v8, v8, v10
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-exact-vlen.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-exact-vlen.ll
index 4e06d00..bb05eb5 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-exact-vlen.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-exact-vlen.ll
@@ -168,11 +168,12 @@ define <4 x i64> @m2_splat_into_slide_two_source_v2_lo(<4 x i64> %v1, <4 x i64>
 define <4 x i64> @m2_splat_into_slide_two_source(<4 x i64> %v1, <4 x i64> %v2) vscale_range(2,2) {
 ; CHECK-LABEL: m2_splat_into_slide_two_source:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; CHECK-NEXT:    vslidedown.vi v13, v10, 1
-; CHECK-NEXT:    vslideup.vi v13, v11, 1
+; CHECK-NEXT:    vsetivli zero, 1, e8, mf8, ta, ma
+; CHECK-NEXT:    vmv.v.i v0, 12
+; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, mu
 ; CHECK-NEXT:    vrgather.vi v12, v8, 0
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vslideup.vi v12, v10, 1, v0.t
+; CHECK-NEXT:    vmv.v.v v8, v12
 ; CHECK-NEXT:    ret
   %res = shufflevector <4 x i64> %v1, <4 x i64> %v2, <4 x i32> <i32 0, i32 0, i32 5, i32 6>
   ret <4 x i64> %res
@@ -182,17 +183,18 @@ define void @shuffle1(ptr %explicit_0, ptr %explicit_1) vscale_range(2,2) {
 ; CHECK-LABEL: shuffle1:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    addi a0, a0, 252
-; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; CHECK-NEXT:    vmv.v.i v8, 0
 ; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT:    vid.v v10
+; CHECK-NEXT:    vid.v v8
 ; CHECK-NEXT:    vsetivli zero, 3, e32, m1, ta, ma
-; CHECK-NEXT:    vle32.v v11, (a0)
-; CHECK-NEXT:    vmv.v.i v0, 5
-; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, mu
-; CHECK-NEXT:    vsrl.vi v10, v10, 1
-; CHECK-NEXT:    vadd.vi v10, v10, 1
-; CHECK-NEXT:    vrgather.vv v9, v11, v10, v0.t
+; CHECK-NEXT:    vle32.v v9, (a0)
+; CHECK-NEXT:    li a0, 175
+; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-NEXT:    vsrl.vi v8, v8, 1
+; CHECK-NEXT:    vmv.s.x v0, a0
+; CHECK-NEXT:    vadd.vi v8, v8, 1
+; CHECK-NEXT:    vrgather.vv v11, v9, v8
+; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; CHECK-NEXT:    vmerge.vim v8, v10, 0, v0
 ; CHECK-NEXT:    addi a0, a1, 672
 ; CHECK-NEXT:    vs2r.v v8, (a0)
 ; CHECK-NEXT:    ret
@@ -209,15 +211,15 @@ define void @shuffle1(ptr %explicit_0, ptr %explicit_1) vscale_range(2,2) {
 define <16 x float> @shuffle2(<4 x float> %a) vscale_range(2,2) {
 ; CHECK-LABEL: shuffle2:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-NEXT:    vid.v v9
+; CHECK-NEXT:    li a0, -97
+; CHECK-NEXT:    vadd.vv v9, v9, v9
+; CHECK-NEXT:    vrsub.vi v9, v9, 4
+; CHECK-NEXT:    vmv.s.x v0, a0
+; CHECK-NEXT:    vrgather.vv v13, v8, v9
 ; CHECK-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; CHECK-NEXT:    vmv1r.v v12, v8
-; CHECK-NEXT:    vmv.v.i v8, 0
-; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, mu
-; CHECK-NEXT:    vid.v v13
-; CHECK-NEXT:    vadd.vv v13, v13, v13
-; CHECK-NEXT:    vmv.v.i v0, 6
-; CHECK-NEXT:    vrsub.vi v13, v13, 4
-; CHECK-NEXT:    vrgather.vv v9, v12, v13, v0.t
+; CHECK-NEXT:    vmerge.vim v8, v12, 0, v0
 ; CHECK-NEXT:    ret
   %b = extractelement <4 x float> %a, i32 2
   %c = insertelement <16 x float> <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float undef, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00>, float %b, i32 5
@@ -229,15 +231,16 @@ define <16 x float> @shuffle2(<4 x float> %a) vscale_range(2,2) {
 define i64 @extract_any_extend_vector_inreg_v16i64(<16 x i64> %a0, i32 %a1) vscale_range(2,2) {
 ; RV32-LABEL: extract_any_extend_vector_inreg_v16i64:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    li a1, 16
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, mu
 ; RV32-NEXT:    vmv.v.i v16, 0
-; RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, mu
-; RV32-NEXT:    vmv.v.i v0, 1
+; RV32-NEXT:    vmv.s.x v0, a1
 ; RV32-NEXT:    li a1, 32
-; RV32-NEXT:    vrgather.vi v18, v15, 1, v0.t
-; RV32-NEXT:    vsetivli zero, 1, e64, m8, ta, ma
+; RV32-NEXT:    vrgather.vi v16, v8, 15, v0.t
+; RV32-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
 ; RV32-NEXT:    vslidedown.vx v8, v16, a0
 ; RV32-NEXT:    vmv.x.s a0, v8
+; RV32-NEXT:    vsetivli zero, 1, e64, m8, ta, ma
 ; RV32-NEXT:    vsrl.vx v8, v8, a1
 ; RV32-NEXT:    vmv.x.s a1, v8
 ; RV32-NEXT:    ret
@@ -255,14 +258,13 @@ define i64 @extract_any_extend_vector_inreg_v16i64(<16 x i64> %a0, i32 %a1) vsca
 ; RV64-NEXT:    addi s0, sp, 256
 ; RV64-NEXT:    .cfi_def_cfa s0, 0
 ; RV64-NEXT:    andi sp, sp, -128
-; RV64-NEXT:    vsetivli zero, 1, e8, mf8, ta, ma
-; RV64-NEXT:    vmv.v.i v0, 1
+; RV64-NEXT:    li a1, -17
 ; RV64-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV64-NEXT:    vmv.v.i v16, 0
-; RV64-NEXT:    vsetivli zero, 2, e64, m1, ta, mu
-; RV64-NEXT:    vrgather.vi v18, v15, 1, v0.t
+; RV64-NEXT:    vmv.s.x v0, a1
+; RV64-NEXT:    vrgather.vi v16, v8, 15
+; RV64-NEXT:    vmerge.vim v8, v16, 0, v0
 ; RV64-NEXT:    mv s2, sp
-; RV64-NEXT:    vs8r.v v16, (s2)
+; RV64-NEXT:    vs8r.v v8, (s2)
 ; RV64-NEXT:    andi a0, a0, 15
 ; RV64-NEXT:    li a1, 8
 ; RV64-NEXT:    call __muldi3
@@ -288,16 +290,21 @@ define i64 @extract_any_extend_vector_inreg_v16i64(<16 x i64> %a0, i32 %a1) vsca
 define <4 x double> @shuffles_add(<4 x double> %0, <4 x double> %1) vscale_range(2,2) {
 ; CHECK-LABEL: shuffles_add:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, mu
-; CHECK-NEXT:    vmv1r.v v13, v10
-; CHECK-NEXT:    vslideup.vi v13, v11, 1
-; CHECK-NEXT:    vmv1r.v v8, v9
-; CHECK-NEXT:    vmv.v.i v0, 1
-; CHECK-NEXT:    vrgather.vi v12, v9, 0
-; CHECK-NEXT:    vmv1r.v v9, v11
-; CHECK-NEXT:    vrgather.vi v9, v10, 1, v0.t
 ; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; CHECK-NEXT:    vfadd.vv v8, v12, v8
+; CHECK-NEXT:    vrgather.vi v12, v8, 2
+; CHECK-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
+; CHECK-NEXT:    vid.v v14
+; CHECK-NEXT:    vmv.v.i v0, 12
+; CHECK-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
+; CHECK-NEXT:    vrgather.vi v16, v8, 3
+; CHECK-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
+; CHECK-NEXT:    vadd.vv v8, v14, v14
+; CHECK-NEXT:    vadd.vi v9, v8, -4
+; CHECK-NEXT:    vadd.vi v8, v8, -3
+; CHECK-NEXT:    vsetvli zero, zero, e64, m2, ta, mu
+; CHECK-NEXT:    vrgatherei16.vv v12, v10, v9, v0.t
+; CHECK-NEXT:    vrgatherei16.vv v16, v10, v8, v0.t
+; CHECK-NEXT:    vfadd.vv v8, v12, v16
 ; CHECK-NEXT:    ret
   %3 = shufflevector <4 x double> %0, <4 x double> %1, <4 x i32> <i32 undef, i32 2, i32 4, i32 6>
   %4 = shufflevector <4 x double> %0, <4 x double> %1, <4 x i32> <i32 undef, i32 3, i32 5, i32 7>
@@ -305,3 +312,91 @@ define <4 x double> @shuffles_add(<4 x double> %0, <4 x double> %1) vscale_range
   ret <4 x double> %5
 }
 
+define <16 x i32> @m4_square_num_of_shuffles_in_chunks(<16 x i32> %0) vscale_range(2,2) {
+; CHECK-LABEL: m4_square_num_of_shuffles_in_chunks:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    lui a0, %hi(.LCPI17_0)
+; CHECK-NEXT:    addi a0, a0, %lo(.LCPI17_0)
+; CHECK-NEXT:    vl1r.v v12, (a0)
+; CHECK-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
+; CHECK-NEXT:    vsext.vf2 v16, v12
+; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
+; CHECK-NEXT:    vrgatherei16.vv v12, v8, v16
+; CHECK-NEXT:    vmv.v.v v8, v12
+; CHECK-NEXT:    ret
+entry:
+  %1 = shufflevector <16 x i32> %0, <16 x i32> poison, <16 x i32> <i32 0, i32 5, i32 8, i32 12, i32 1, i32 4, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
+  ret <16 x i32> %1
+}
+
+define <16 x i32> @m4_linear_num_of_shuffles_in_chunks(<16 x i32> %0) vscale_range(2,2) {
+; CHECK-LABEL: m4_linear_num_of_shuffles_in_chunks:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    lui a0, %hi(.LCPI18_0)
+; CHECK-NEXT:    addi a0, a0, %lo(.LCPI18_0)
+; CHECK-NEXT:    vl2re16.v v16, (a0)
+; CHECK-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
+; CHECK-NEXT:    vrgatherei16.vv v12, v8, v16
+; CHECK-NEXT:    vmv.v.v v8, v12
+; CHECK-NEXT:    ret
+entry:
+  %1 = shufflevector <16 x i32> %0, <16 x i32> poison, <16 x i32> <i32 poison, i32 poison, i32 8, i32 12, i32 poison, i32 poison, i32 poison, i32 poison, i32 2, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 11, i32 poison>
+  ret <16 x i32> %1
+}
+
+define i64 @multi_chunks_shuffle(<32 x i32> %0) vscale_range(8,8) {
+; RV32-LABEL: multi_chunks_shuffle:
+; RV32:       # %bb.0: # %entry
+; RV32-NEXT:    vsetivli zero, 16, e32, m1, ta, ma
+; RV32-NEXT:    vmv.v.i v10, 0
+; RV32-NEXT:    li a0, 32
+; RV32-NEXT:    li a1, 63
+; RV32-NEXT:    vwsubu.vx v12, v10, a0
+; RV32-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
+; RV32-NEXT:    vmv.v.x v10, a0
+; RV32-NEXT:    lui a0, 61681
+; RV32-NEXT:    addi a0, a0, -241
+; RV32-NEXT:    vand.vx v12, v12, a1
+; RV32-NEXT:    vand.vx v10, v10, a1
+; RV32-NEXT:    vsrl.vv v12, v8, v12
+; RV32-NEXT:    vsll.vv v8, v8, v10
+; RV32-NEXT:    vmv.s.x v0, a0
+; RV32-NEXT:    vor.vv v8, v8, v12
+; RV32-NEXT:    vsetvli a0, zero, e32, m2, ta, ma
+; RV32-NEXT:    vmv.v.i v10, 0
+; RV32-NEXT:    vmerge.vvm v8, v10, v8, v0
+; RV32-NEXT:    vrgather.vi v10, v8, 2
+; RV32-NEXT:    vor.vv v8, v8, v10
+; RV32-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
+; RV32-NEXT:    vslidedown.vi v8, v8, 1
+; RV32-NEXT:    vmv.x.s a0, v8
+; RV32-NEXT:    srai a1, a0, 31
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: multi_chunks_shuffle:
+; RV64:       # %bb.0: # %entry
+; RV64-NEXT:    li a0, 32
+; RV64-NEXT:    vsetivli zero, 16, e64, m2, ta, ma
+; RV64-NEXT:    vsrl.vx v10, v8, a0
+; RV64-NEXT:    vsll.vx v8, v8, a0
+; RV64-NEXT:    lui a0, 61681
+; RV64-NEXT:    addi a0, a0, -241
+; RV64-NEXT:    vor.vv v8, v8, v10
+; RV64-NEXT:    vmv.s.x v0, a0
+; RV64-NEXT:    vsetvli a0, zero, e32, m2, ta, ma
+; RV64-NEXT:    vmv.v.i v10, 0
+; RV64-NEXT:    vmerge.vvm v8, v10, v8, v0
+; RV64-NEXT:    vrgather.vi v10, v8, 2
+; RV64-NEXT:    vor.vv v8, v8, v10
+; RV64-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
+; RV64-NEXT:    vslidedown.vi v8, v8, 1
+; RV64-NEXT:    vmv.x.s a0, v8
+; RV64-NEXT:    ret
+entry:
+  %1 = shufflevector <32 x i32> %0, <32 x i32> zeroinitializer, <32 x i32> <i32 1, i32 0, i32 3, i32 2, i32 37, i32 36, i32 39, i32 38, i32 9, i32 8, i32 11, i32 10, i32 45, i32 44, i32 47, i32 46, i32 17, i32 16, i32 19, i32 18, i32 53, i32 52, i32 55, i32 54, i32 25, i32 24, i32 27, i32 26, i32 61, i32 60, i32 63, i32 62>
+  %2 = shufflevector <32 x i32> zeroinitializer, <32 x i32> %1, <32 x i32> <i32 3, i32 34, i32 33, i32 0, i32 7, i32 38, i32 37, i32 4, i32 11, i32 42, i32 41, i32 8, i32 15, i32 46, i32 45, i32 12, i32 19, i32 50, i32 49, i32 16, i32 23, i32 54, i32 53, i32 20, i32 27, i32 58, i32 57, i32 24, i32 31, i32 62, i32 61, i32 28>
+  %3 = or <32 x i32> %1, %2
+  %4 = extractelement <32 x i32> %3, i64 1
+  %conv199 = sext i32 %4 to i64
+  ret i64 %conv199
+}
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vselect-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vselect-vp.ll
index 5d407ca..05254e6 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vselect-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vselect-vp.ll
@@ -473,6 +473,7 @@ define <32 x i64> @select_evl_v32i64(<32 x i1> %a, <32 x i64> %b, <32 x i64> %c)
 ; CHECK-NEXT:    addi a1, a0, 128
 ; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; CHECK-NEXT:    vle64.v v8, (a0)
+; CHECK-NEXT:    vsetivli zero, 1, e64, m8, ta, ma
 ; CHECK-NEXT:    vle64.v v16, (a1)
 ; CHECK-NEXT:    addi a0, sp, 16
 ; CHECK-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
diff --git a/llvm/test/CodeGen/RISCV/rvv/fmaximum-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fmaximum-vp.ll
index 7649d60..33fe73a 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fmaximum-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fmaximum-vp.ll
@@ -582,14 +582,14 @@ define <vscale x 1 x half> @vfmax_vv_nxv1f16(<vscale x 1 x half> %va, <vscale x
 ;
 ; ZVFHMIN-LABEL: vfmax_vv_nxv1f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vmv1r.v v10, v0
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v11, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmfeq.vv v0, v11, v11, v0.t
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v9
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmerge.vvm v9, v11, v8, v0
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v10
 ; ZVFHMIN-NEXT:    vmfeq.vv v0, v8, v8, v0.t
@@ -616,13 +616,13 @@ define <vscale x 1 x half> @vfmax_vv_nxv1f16_unmasked(<vscale x 1 x half> %va, <
 ;
 ; ZVFHMIN-LABEL: vfmax_vv_nxv1f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmfeq.vv v0, v10, v10
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v9
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmerge.vvm v9, v10, v8, v0
 ; ZVFHMIN-NEXT:    vmfeq.vv v0, v8, v8
 ; ZVFHMIN-NEXT:    vmerge.vvm v8, v8, v10, v0
@@ -652,14 +652,14 @@ define <vscale x 2 x half> @vfmax_vv_nxv2f16(<vscale x 2 x half> %va, <vscale x
 ;
 ; ZVFHMIN-LABEL: vfmax_vv_nxv2f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmv1r.v v10, v0
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v11, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; ZVFHMIN-NEXT:    vmfeq.vv v0, v11, v11, v0.t
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v9
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; ZVFHMIN-NEXT:    vmerge.vvm v9, v11, v8, v0
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v10
 ; ZVFHMIN-NEXT:    vmfeq.vv v0, v8, v8, v0.t
@@ -686,13 +686,13 @@ define <vscale x 2 x half> @vfmax_vv_nxv2f16_unmasked(<vscale x 2 x half> %va, <
 ;
 ; ZVFHMIN-LABEL: vfmax_vv_nxv2f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; ZVFHMIN-NEXT:    vmfeq.vv v0, v10, v10
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v9
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; ZVFHMIN-NEXT:    vmerge.vvm v9, v10, v8, v0
 ; ZVFHMIN-NEXT:    vmfeq.vv v0, v8, v8
 ; ZVFHMIN-NEXT:    vmerge.vvm v8, v8, v10, v0
@@ -722,15 +722,15 @@ define <vscale x 4 x half> @vfmax_vv_nxv4f16(<vscale x 4 x half> %va, <vscale x
 ;
 ; ZVFHMIN-LABEL: vfmax_vv_nxv4f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vmv1r.v v10, v0
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vmfeq.vv v8, v12, v12, v0.t
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v14, v9
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vmerge.vvm v16, v12, v14, v0
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v10
 ; ZVFHMIN-NEXT:    vmfeq.vv v8, v14, v14, v0.t
@@ -758,13 +758,13 @@ define <vscale x 4 x half> @vfmax_vv_nxv4f16_unmasked(<vscale x 4 x half> %va, <
 ;
 ; ZVFHMIN-LABEL: vfmax_vv_nxv4f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vmfeq.vv v0, v10, v10
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v9
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vmerge.vvm v8, v10, v12, v0
 ; ZVFHMIN-NEXT:    vmfeq.vv v0, v12, v12
 ; ZVFHMIN-NEXT:    vmerge.vvm v10, v12, v10, v0
@@ -796,15 +796,15 @@ define <vscale x 8 x half> @vfmax_vv_nxv8f16(<vscale x 8 x half> %va, <vscale x
 ;
 ; ZVFHMIN-LABEL: vfmax_vv_nxv8f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vmv1r.v v12, v0
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmfeq.vv v8, v16, v16, v0.t
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v20, v10
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmerge.vvm v24, v16, v20, v0
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v12
 ; ZVFHMIN-NEXT:    vmfeq.vv v8, v20, v20, v0.t
@@ -832,13 +832,13 @@ define <vscale x 8 x half> @vfmax_vv_nxv8f16_unmasked(<vscale x 8 x half> %va, <
 ;
 ; ZVFHMIN-LABEL: vfmax_vv_nxv8f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmfeq.vv v0, v12, v12
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmerge.vvm v8, v12, v16, v0
 ; ZVFHMIN-NEXT:    vmfeq.vv v0, v16, v16
 ; ZVFHMIN-NEXT:    vmerge.vvm v12, v16, v12, v0
@@ -876,15 +876,15 @@ define <vscale x 16 x half> @vfmax_vv_nxv16f16(<vscale x 16 x half> %va, <vscale
 ; ZVFHMIN-NEXT:    slli a1, a1, 3
 ; ZVFHMIN-NEXT:    sub sp, sp, a1
 ; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmv1r.v v7, v0
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vmfeq.vv v8, v24, v24, v0.t
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vmerge.vvm v8, v24, v16, v0
 ; ZVFHMIN-NEXT:    addi a0, sp, 16
 ; ZVFHMIN-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
@@ -927,10 +927,10 @@ define <vscale x 16 x half> @vfmax_vv_nxv16f16_unmasked(<vscale x 16 x half> %va
 ; ZVFHMIN-NEXT:    slli a1, a1, 3
 ; ZVFHMIN-NEXT:    sub sp, sp, a1
 ; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vmfeq.vv v0, v16, v16
 ; ZVFHMIN-NEXT:    vmfeq.vv v7, v24, v24
 ; ZVFHMIN-NEXT:    vmerge.vvm v8, v16, v24, v0
@@ -995,64 +995,62 @@ define <vscale x 32 x half> @vfmax_vv_nxv32f16(<vscale x 32 x half> %va, <vscale
 ; ZVFHMIN-NEXT:    add a1, a2, a1
 ; ZVFHMIN-NEXT:    sub sp, sp, a1
 ; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x21, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 33 * vlenb
-; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
 ; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    li a3, 24
-; ZVFHMIN-NEXT:    mul a1, a1, a3
+; ZVFHMIN-NEXT:    li a2, 25
+; ZVFHMIN-NEXT:    mul a1, a1, a2
 ; ZVFHMIN-NEXT:    add a1, sp, a1
 ; ZVFHMIN-NEXT:    addi a1, a1, 16
-; ZVFHMIN-NEXT:    vs8r.v v24, (a1) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vsetvli a1, zero, e8, mf2, ta, ma
+; ZVFHMIN-NEXT:    vmv8r.v v16, v8
+; ZVFHMIN-NEXT:    csrr a2, vlenb
 ; ZVFHMIN-NEXT:    slli a1, a2, 1
 ; ZVFHMIN-NEXT:    srli a2, a2, 2
 ; ZVFHMIN-NEXT:    sub a3, a0, a1
 ; ZVFHMIN-NEXT:    csrr a4, vlenb
-; ZVFHMIN-NEXT:    slli a4, a4, 5
+; ZVFHMIN-NEXT:    li a5, 24
+; ZVFHMIN-NEXT:    mul a4, a4, a5
 ; ZVFHMIN-NEXT:    add a4, sp, a4
 ; ZVFHMIN-NEXT:    addi a4, a4, 16
 ; ZVFHMIN-NEXT:    vs1r.v v0, (a4) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT:    vslidedown.vx v12, v0, a2
+; ZVFHMIN-NEXT:    vslidedown.vx v0, v0, a2
+; ZVFHMIN-NEXT:    addi a2, sp, 16
+; ZVFHMIN-NEXT:    vs1r.v v0, (a2) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    sltu a2, a0, a3
 ; ZVFHMIN-NEXT:    addi a2, a2, -1
 ; ZVFHMIN-NEXT:    and a2, a2, a3
-; ZVFHMIN-NEXT:    vmv1r.v v0, v12
-; ZVFHMIN-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vmfeq.vv v13, v24, v24, v0.t
-; ZVFHMIN-NEXT:    vmv8r.v v0, v16
 ; ZVFHMIN-NEXT:    csrr a3, vlenb
 ; ZVFHMIN-NEXT:    slli a3, a3, 4
 ; ZVFHMIN-NEXT:    add a3, sp, a3
 ; ZVFHMIN-NEXT:    addi a3, a3, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v4
-; ZVFHMIN-NEXT:    vmv1r.v v0, v13
-; ZVFHMIN-NEXT:    csrr a3, vlenb
-; ZVFHMIN-NEXT:    li a4, 24
-; ZVFHMIN-NEXT:    mul a3, a3, a4
-; ZVFHMIN-NEXT:    add a3, sp, a3
-; ZVFHMIN-NEXT:    addi a3, a3, 16
-; ZVFHMIN-NEXT:    vl8r.v v24, (a3) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vmerge.vvm v24, v24, v16, v0
+; ZVFHMIN-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vmfeq.vv v12, v24, v24, v0.t
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a2, a2, 3
+; ZVFHMIN-NEXT:    li a3, 25
+; ZVFHMIN-NEXT:    mul a2, a2, a3
 ; ZVFHMIN-NEXT:    add a2, sp, a2
 ; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vs8r.v v24, (a2) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vl8r.v v0, (a2) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v4
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v12
-; ZVFHMIN-NEXT:    vmfeq.vv v13, v16, v16, v0.t
-; ZVFHMIN-NEXT:    vmv1r.v v0, v13
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vmerge.vvm v8, v24, v16, v0
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    li a3, 24
-; ZVFHMIN-NEXT:    mul a2, a2, a3
+; ZVFHMIN-NEXT:    slli a2, a2, 3
 ; ZVFHMIN-NEXT:    add a2, sp, a2
 ; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vl8r.v v24, (a2) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vmerge.vvm v16, v16, v24, v0
+; ZVFHMIN-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    addi a2, sp, 16
+; ZVFHMIN-NEXT:    vl1r.v v8, (a2) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vmv1r.v v0, v8
+; ZVFHMIN-NEXT:    vmfeq.vv v12, v16, v16, v0.t
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v12
+; ZVFHMIN-NEXT:    vmerge.vvm v16, v16, v24, v0
+; ZVFHMIN-NEXT:    vmv1r.v v0, v8
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
 ; ZVFHMIN-NEXT:    slli a2, a2, 3
 ; ZVFHMIN-NEXT:    add a2, sp, a2
@@ -1070,35 +1068,43 @@ define <vscale x 32 x half> @vfmax_vv_nxv32f16(<vscale x 32 x half> %va, <vscale
 ; ZVFHMIN-NEXT:  # %bb.1:
 ; ZVFHMIN-NEXT:    mv a0, a1
 ; ZVFHMIN-NEXT:  .LBB22_2:
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
-; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 5
-; ZVFHMIN-NEXT:    add a1, sp, a1
-; ZVFHMIN-NEXT:    addi a1, a1, 16
-; ZVFHMIN-NEXT:    vl1r.v v0, (a1) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vmfeq.vv v24, v16, v16, v0.t
-; ZVFHMIN-NEXT:    vmv8r.v v8, v16
-; ZVFHMIN-NEXT:    addi a1, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    csrr a1, vlenb
 ; ZVFHMIN-NEXT:    slli a1, a1, 4
 ; ZVFHMIN-NEXT:    add a1, sp, a1
 ; ZVFHMIN-NEXT:    addi a1, a1, 16
-; ZVFHMIN-NEXT:    vl8r.v v0, (a1) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v0
-; ZVFHMIN-NEXT:    vmv1r.v v0, v24
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vmerge.vvm v24, v8, v16, v0
+; ZVFHMIN-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v16
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
 ; ZVFHMIN-NEXT:    li a1, 24
 ; ZVFHMIN-NEXT:    mul a0, a0, a1
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
+; ZVFHMIN-NEXT:    vl1r.v v0, (a0) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vmfeq.vv v8, v24, v24, v0.t
+; ZVFHMIN-NEXT:    addi a0, sp, 16
 ; ZVFHMIN-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 5
+; ZVFHMIN-NEXT:    li a1, 25
+; ZVFHMIN-NEXT:    mul a0, a0, a1
+; ZVFHMIN-NEXT:    add a0, sp, a0
+; ZVFHMIN-NEXT:    addi a0, a0, 16
+; ZVFHMIN-NEXT:    vl8r.v v0, (a0) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v0
+; ZVFHMIN-NEXT:    vmv1r.v v0, v8
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vmerge.vvm v24, v24, v16, v0
+; ZVFHMIN-NEXT:    csrr a0, vlenb
+; ZVFHMIN-NEXT:    li a1, 25
+; ZVFHMIN-NEXT:    mul a0, a0, a1
+; ZVFHMIN-NEXT:    add a0, sp, a0
+; ZVFHMIN-NEXT:    addi a0, a0, 16
+; ZVFHMIN-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    csrr a0, vlenb
+; ZVFHMIN-NEXT:    li a1, 24
+; ZVFHMIN-NEXT:    mul a0, a0, a1
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
 ; ZVFHMIN-NEXT:    vl1r.v v9, (a0) # Unknown-size Folded Reload
@@ -1110,7 +1116,7 @@ define <vscale x 32 x half> @vfmax_vv_nxv32f16(<vscale x 32 x half> %va, <vscale
 ; ZVFHMIN-NEXT:    vmerge.vvm v16, v16, v24, v0
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v9
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    li a1, 24
+; ZVFHMIN-NEXT:    li a1, 25
 ; ZVFHMIN-NEXT:    mul a0, a0, a1
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
@@ -1152,68 +1158,61 @@ define <vscale x 32 x half> @vfmax_vv_nxv32f16_unmasked(<vscale x 32 x half> %va
 ; ZVFHMIN-NEXT:    addi sp, sp, -16
 ; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 16
 ; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 5
+; ZVFHMIN-NEXT:    li a2, 25
+; ZVFHMIN-NEXT:    mul a1, a1, a2
 ; ZVFHMIN-NEXT:    sub sp, sp, a1
-; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
-; ZVFHMIN-NEXT:    csrr a2, vlenb
+; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x19, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 25 * vlenb
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e8, m4, ta, ma
-; ZVFHMIN-NEXT:    vmset.m v7
-; ZVFHMIN-NEXT:    addi a1, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
-; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    li a3, 24
-; ZVFHMIN-NEXT:    mul a1, a1, a3
-; ZVFHMIN-NEXT:    add a1, sp, a1
-; ZVFHMIN-NEXT:    addi a1, a1, 16
-; ZVFHMIN-NEXT:    vs8r.v v24, (a1) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vmv8r.v v0, v8
+; ZVFHMIN-NEXT:    csrr a2, vlenb
+; ZVFHMIN-NEXT:    vmset.m v24
 ; ZVFHMIN-NEXT:    slli a1, a2, 1
 ; ZVFHMIN-NEXT:    srli a2, a2, 2
 ; ZVFHMIN-NEXT:    sub a3, a0, a1
 ; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT:    vslidedown.vx v12, v7, a2
+; ZVFHMIN-NEXT:    vslidedown.vx v8, v24, a2
+; ZVFHMIN-NEXT:    addi a2, sp, 16
+; ZVFHMIN-NEXT:    vs1r.v v8, (a2) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    sltu a2, a0, a3
 ; ZVFHMIN-NEXT:    addi a2, a2, -1
 ; ZVFHMIN-NEXT:    and a2, a2, a3
-; ZVFHMIN-NEXT:    vmv1r.v v0, v12
-; ZVFHMIN-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vmfeq.vv v13, v24, v24, v0.t
-; ZVFHMIN-NEXT:    vmv8r.v v0, v16
 ; ZVFHMIN-NEXT:    csrr a3, vlenb
-; ZVFHMIN-NEXT:    slli a3, a3, 4
 ; ZVFHMIN-NEXT:    add a3, sp, a3
 ; ZVFHMIN-NEXT:    addi a3, a3, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v4
-; ZVFHMIN-NEXT:    vmv1r.v v0, v13
-; ZVFHMIN-NEXT:    csrr a3, vlenb
-; ZVFHMIN-NEXT:    li a4, 24
-; ZVFHMIN-NEXT:    mul a3, a3, a4
-; ZVFHMIN-NEXT:    add a3, sp, a3
-; ZVFHMIN-NEXT:    addi a3, a3, 16
-; ZVFHMIN-NEXT:    vl8r.v v24, (a3) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vmerge.vvm v24, v24, v16, v0
+; ZVFHMIN-NEXT:    vs8r.v v0, (a3) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v4
+; ZVFHMIN-NEXT:    vmv1r.v v0, v8
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vmfeq.vv v12, v24, v24, v0.t
+; ZVFHMIN-NEXT:    vmv8r.v v0, v16
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a2, a2, 3
+; ZVFHMIN-NEXT:    slli a3, a2, 4
+; ZVFHMIN-NEXT:    add a2, a3, a2
 ; ZVFHMIN-NEXT:    add a2, sp, a2
 ; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vs8r.v v24, (a2) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v4
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v12
-; ZVFHMIN-NEXT:    vmfeq.vv v13, v16, v16, v0.t
-; ZVFHMIN-NEXT:    vmv1r.v v0, v13
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vmerge.vvm v8, v24, v16, v0
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    li a3, 24
-; ZVFHMIN-NEXT:    mul a2, a2, a3
+; ZVFHMIN-NEXT:    slli a3, a2, 3
+; ZVFHMIN-NEXT:    add a2, a3, a2
 ; ZVFHMIN-NEXT:    add a2, sp, a2
 ; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vl8r.v v24, (a2) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vmerge.vvm v16, v16, v24, v0
+; ZVFHMIN-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    addi a2, sp, 16
+; ZVFHMIN-NEXT:    vl1r.v v13, (a2) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vmv1r.v v0, v13
+; ZVFHMIN-NEXT:    vmfeq.vv v12, v16, v16, v0.t
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v12
+; ZVFHMIN-NEXT:    vmerge.vvm v16, v16, v24, v0
+; ZVFHMIN-NEXT:    vmv1r.v v0, v13
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a2, a2, 3
+; ZVFHMIN-NEXT:    slli a3, a2, 3
+; ZVFHMIN-NEXT:    add a2, a3, a2
 ; ZVFHMIN-NEXT:    add a2, sp, a2
 ; ZVFHMIN-NEXT:    addi a2, a2, 16
 ; ZVFHMIN-NEXT:    vl8r.v v24, (a2) # Unknown-size Folded Reload
@@ -1221,7 +1220,8 @@ define <vscale x 32 x half> @vfmax_vv_nxv32f16_unmasked(<vscale x 32 x half> %va
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v16
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a2, a2, 3
+; ZVFHMIN-NEXT:    slli a3, a2, 3
+; ZVFHMIN-NEXT:    add a2, a3, a2
 ; ZVFHMIN-NEXT:    add a2, sp, a2
 ; ZVFHMIN-NEXT:    addi a2, a2, 16
 ; ZVFHMIN-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
@@ -1229,43 +1229,49 @@ define <vscale x 32 x half> @vfmax_vv_nxv32f16_unmasked(<vscale x 32 x half> %va
 ; ZVFHMIN-NEXT:  # %bb.1:
 ; ZVFHMIN-NEXT:    mv a0, a1
 ; ZVFHMIN-NEXT:  .LBB23_2:
-; ZVFHMIN-NEXT:    addi a1, sp, 16
-; ZVFHMIN-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v16
 ; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 4
 ; ZVFHMIN-NEXT:    add a1, sp, a1
 ; ZVFHMIN-NEXT:    addi a1, a1, 16
-; ZVFHMIN-NEXT:    vl8r.v v0, (a1) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v16
+; ZVFHMIN-NEXT:    csrr a0, vlenb
+; ZVFHMIN-NEXT:    slli a1, a0, 4
+; ZVFHMIN-NEXT:    add a0, a1, a0
+; ZVFHMIN-NEXT:    add a0, sp, a0
+; ZVFHMIN-NEXT:    addi a0, a0, 16
+; ZVFHMIN-NEXT:    vl8r.v v0, (a0) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v0
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vmfeq.vv v0, v8, v8
 ; ZVFHMIN-NEXT:    vmfeq.vv v7, v16, v16
 ; ZVFHMIN-NEXT:    vmerge.vvm v24, v8, v16, v0
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    li a1, 24
-; ZVFHMIN-NEXT:    mul a0, a0, a1
+; ZVFHMIN-NEXT:    slli a1, a0, 4
+; ZVFHMIN-NEXT:    add a0, a1, a0
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
 ; ZVFHMIN-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v7
 ; ZVFHMIN-NEXT:    vmerge.vvm v16, v16, v8, v0
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    li a1, 24
-; ZVFHMIN-NEXT:    mul a0, a0, a1
+; ZVFHMIN-NEXT:    slli a1, a0, 4
+; ZVFHMIN-NEXT:    add a0, a1, a0
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
 ; ZVFHMIN-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vfmax.vv v16, v16, v24
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 3
+; ZVFHMIN-NEXT:    slli a1, a0, 3
+; ZVFHMIN-NEXT:    add a0, a1, a0
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
 ; ZVFHMIN-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 5
+; ZVFHMIN-NEXT:    li a1, 25
+; ZVFHMIN-NEXT:    mul a0, a0, a1
 ; ZVFHMIN-NEXT:    add sp, sp, a0
 ; ZVFHMIN-NEXT:    .cfi_def_cfa sp, 16
 ; ZVFHMIN-NEXT:    addi sp, sp, 16
diff --git a/llvm/test/CodeGen/RISCV/rvv/fminimum-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fminimum-vp.ll
index 8e448fc..c65712e 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fminimum-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fminimum-vp.ll
@@ -582,14 +582,14 @@ define <vscale x 1 x half> @vfmin_vv_nxv1f16(<vscale x 1 x half> %va, <vscale x
 ;
 ; ZVFHMIN-LABEL: vfmin_vv_nxv1f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vmv1r.v v10, v0
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v11, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmfeq.vv v0, v11, v11, v0.t
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v9
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmerge.vvm v9, v11, v8, v0
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v10
 ; ZVFHMIN-NEXT:    vmfeq.vv v0, v8, v8, v0.t
@@ -616,13 +616,13 @@ define <vscale x 1 x half> @vfmin_vv_nxv1f16_unmasked(<vscale x 1 x half> %va, <
 ;
 ; ZVFHMIN-LABEL: vfmin_vv_nxv1f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmfeq.vv v0, v10, v10
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v9
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmerge.vvm v9, v10, v8, v0
 ; ZVFHMIN-NEXT:    vmfeq.vv v0, v8, v8
 ; ZVFHMIN-NEXT:    vmerge.vvm v8, v8, v10, v0
@@ -652,14 +652,14 @@ define <vscale x 2 x half> @vfmin_vv_nxv2f16(<vscale x 2 x half> %va, <vscale x
 ;
 ; ZVFHMIN-LABEL: vfmin_vv_nxv2f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmv1r.v v10, v0
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v11, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; ZVFHMIN-NEXT:    vmfeq.vv v0, v11, v11, v0.t
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v9
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; ZVFHMIN-NEXT:    vmerge.vvm v9, v11, v8, v0
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v10
 ; ZVFHMIN-NEXT:    vmfeq.vv v0, v8, v8, v0.t
@@ -686,13 +686,13 @@ define <vscale x 2 x half> @vfmin_vv_nxv2f16_unmasked(<vscale x 2 x half> %va, <
 ;
 ; ZVFHMIN-LABEL: vfmin_vv_nxv2f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; ZVFHMIN-NEXT:    vmfeq.vv v0, v10, v10
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v9
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; ZVFHMIN-NEXT:    vmerge.vvm v9, v10, v8, v0
 ; ZVFHMIN-NEXT:    vmfeq.vv v0, v8, v8
 ; ZVFHMIN-NEXT:    vmerge.vvm v8, v8, v10, v0
@@ -722,15 +722,15 @@ define <vscale x 4 x half> @vfmin_vv_nxv4f16(<vscale x 4 x half> %va, <vscale x
 ;
 ; ZVFHMIN-LABEL: vfmin_vv_nxv4f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vmv1r.v v10, v0
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vmfeq.vv v8, v12, v12, v0.t
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v14, v9
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vmerge.vvm v16, v12, v14, v0
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v10
 ; ZVFHMIN-NEXT:    vmfeq.vv v8, v14, v14, v0.t
@@ -758,13 +758,13 @@ define <vscale x 4 x half> @vfmin_vv_nxv4f16_unmasked(<vscale x 4 x half> %va, <
 ;
 ; ZVFHMIN-LABEL: vfmin_vv_nxv4f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vmfeq.vv v0, v10, v10
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v9
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vmerge.vvm v8, v10, v12, v0
 ; ZVFHMIN-NEXT:    vmfeq.vv v0, v12, v12
 ; ZVFHMIN-NEXT:    vmerge.vvm v10, v12, v10, v0
@@ -796,15 +796,15 @@ define <vscale x 8 x half> @vfmin_vv_nxv8f16(<vscale x 8 x half> %va, <vscale x
 ;
 ; ZVFHMIN-LABEL: vfmin_vv_nxv8f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vmv1r.v v12, v0
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmfeq.vv v8, v16, v16, v0.t
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v20, v10
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmerge.vvm v24, v16, v20, v0
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v12
 ; ZVFHMIN-NEXT:    vmfeq.vv v8, v20, v20, v0.t
@@ -832,13 +832,13 @@ define <vscale x 8 x half> @vfmin_vv_nxv8f16_unmasked(<vscale x 8 x half> %va, <
 ;
 ; ZVFHMIN-LABEL: vfmin_vv_nxv8f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmfeq.vv v0, v12, v12
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmerge.vvm v8, v12, v16, v0
 ; ZVFHMIN-NEXT:    vmfeq.vv v0, v16, v16
 ; ZVFHMIN-NEXT:    vmerge.vvm v12, v16, v12, v0
@@ -876,15 +876,15 @@ define <vscale x 16 x half> @vfmin_vv_nxv16f16(<vscale x 16 x half> %va, <vscale
 ; ZVFHMIN-NEXT:    slli a1, a1, 3
 ; ZVFHMIN-NEXT:    sub sp, sp, a1
 ; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmv1r.v v7, v0
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vmfeq.vv v8, v24, v24, v0.t
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vmerge.vvm v8, v24, v16, v0
 ; ZVFHMIN-NEXT:    addi a0, sp, 16
 ; ZVFHMIN-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
@@ -927,10 +927,10 @@ define <vscale x 16 x half> @vfmin_vv_nxv16f16_unmasked(<vscale x 16 x half> %va
 ; ZVFHMIN-NEXT:    slli a1, a1, 3
 ; ZVFHMIN-NEXT:    sub sp, sp, a1
 ; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vmfeq.vv v0, v16, v16
 ; ZVFHMIN-NEXT:    vmfeq.vv v7, v24, v24
 ; ZVFHMIN-NEXT:    vmerge.vvm v8, v16, v24, v0
@@ -995,64 +995,62 @@ define <vscale x 32 x half> @vfmin_vv_nxv32f16(<vscale x 32 x half> %va, <vscale
 ; ZVFHMIN-NEXT:    add a1, a2, a1
 ; ZVFHMIN-NEXT:    sub sp, sp, a1
 ; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x21, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 33 * vlenb
-; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
 ; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    li a3, 24
-; ZVFHMIN-NEXT:    mul a1, a1, a3
+; ZVFHMIN-NEXT:    li a2, 25
+; ZVFHMIN-NEXT:    mul a1, a1, a2
 ; ZVFHMIN-NEXT:    add a1, sp, a1
 ; ZVFHMIN-NEXT:    addi a1, a1, 16
-; ZVFHMIN-NEXT:    vs8r.v v24, (a1) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vsetvli a1, zero, e8, mf2, ta, ma
+; ZVFHMIN-NEXT:    vmv8r.v v16, v8
+; ZVFHMIN-NEXT:    csrr a2, vlenb
 ; ZVFHMIN-NEXT:    slli a1, a2, 1
 ; ZVFHMIN-NEXT:    srli a2, a2, 2
 ; ZVFHMIN-NEXT:    sub a3, a0, a1
 ; ZVFHMIN-NEXT:    csrr a4, vlenb
-; ZVFHMIN-NEXT:    slli a4, a4, 5
+; ZVFHMIN-NEXT:    li a5, 24
+; ZVFHMIN-NEXT:    mul a4, a4, a5
 ; ZVFHMIN-NEXT:    add a4, sp, a4
 ; ZVFHMIN-NEXT:    addi a4, a4, 16
 ; ZVFHMIN-NEXT:    vs1r.v v0, (a4) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT:    vslidedown.vx v12, v0, a2
+; ZVFHMIN-NEXT:    vslidedown.vx v0, v0, a2
+; ZVFHMIN-NEXT:    addi a2, sp, 16
+; ZVFHMIN-NEXT:    vs1r.v v0, (a2) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    sltu a2, a0, a3
 ; ZVFHMIN-NEXT:    addi a2, a2, -1
 ; ZVFHMIN-NEXT:    and a2, a2, a3
-; ZVFHMIN-NEXT:    vmv1r.v v0, v12
-; ZVFHMIN-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vmfeq.vv v13, v24, v24, v0.t
-; ZVFHMIN-NEXT:    vmv8r.v v0, v16
 ; ZVFHMIN-NEXT:    csrr a3, vlenb
 ; ZVFHMIN-NEXT:    slli a3, a3, 4
 ; ZVFHMIN-NEXT:    add a3, sp, a3
 ; ZVFHMIN-NEXT:    addi a3, a3, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v4
-; ZVFHMIN-NEXT:    vmv1r.v v0, v13
-; ZVFHMIN-NEXT:    csrr a3, vlenb
-; ZVFHMIN-NEXT:    li a4, 24
-; ZVFHMIN-NEXT:    mul a3, a3, a4
-; ZVFHMIN-NEXT:    add a3, sp, a3
-; ZVFHMIN-NEXT:    addi a3, a3, 16
-; ZVFHMIN-NEXT:    vl8r.v v24, (a3) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vmerge.vvm v24, v24, v16, v0
+; ZVFHMIN-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vmfeq.vv v12, v24, v24, v0.t
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a2, a2, 3
+; ZVFHMIN-NEXT:    li a3, 25
+; ZVFHMIN-NEXT:    mul a2, a2, a3
 ; ZVFHMIN-NEXT:    add a2, sp, a2
 ; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vs8r.v v24, (a2) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vl8r.v v0, (a2) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v4
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v12
-; ZVFHMIN-NEXT:    vmfeq.vv v13, v16, v16, v0.t
-; ZVFHMIN-NEXT:    vmv1r.v v0, v13
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vmerge.vvm v8, v24, v16, v0
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    li a3, 24
-; ZVFHMIN-NEXT:    mul a2, a2, a3
+; ZVFHMIN-NEXT:    slli a2, a2, 3
 ; ZVFHMIN-NEXT:    add a2, sp, a2
 ; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vl8r.v v24, (a2) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vmerge.vvm v16, v16, v24, v0
+; ZVFHMIN-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    addi a2, sp, 16
+; ZVFHMIN-NEXT:    vl1r.v v8, (a2) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vmv1r.v v0, v8
+; ZVFHMIN-NEXT:    vmfeq.vv v12, v16, v16, v0.t
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v12
+; ZVFHMIN-NEXT:    vmerge.vvm v16, v16, v24, v0
+; ZVFHMIN-NEXT:    vmv1r.v v0, v8
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
 ; ZVFHMIN-NEXT:    slli a2, a2, 3
 ; ZVFHMIN-NEXT:    add a2, sp, a2
@@ -1070,35 +1068,43 @@ define <vscale x 32 x half> @vfmin_vv_nxv32f16(<vscale x 32 x half> %va, <vscale
 ; ZVFHMIN-NEXT:  # %bb.1:
 ; ZVFHMIN-NEXT:    mv a0, a1
 ; ZVFHMIN-NEXT:  .LBB22_2:
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
-; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 5
-; ZVFHMIN-NEXT:    add a1, sp, a1
-; ZVFHMIN-NEXT:    addi a1, a1, 16
-; ZVFHMIN-NEXT:    vl1r.v v0, (a1) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vmfeq.vv v24, v16, v16, v0.t
-; ZVFHMIN-NEXT:    vmv8r.v v8, v16
-; ZVFHMIN-NEXT:    addi a1, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    csrr a1, vlenb
 ; ZVFHMIN-NEXT:    slli a1, a1, 4
 ; ZVFHMIN-NEXT:    add a1, sp, a1
 ; ZVFHMIN-NEXT:    addi a1, a1, 16
-; ZVFHMIN-NEXT:    vl8r.v v0, (a1) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v0
-; ZVFHMIN-NEXT:    vmv1r.v v0, v24
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vmerge.vvm v24, v8, v16, v0
+; ZVFHMIN-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v16
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
 ; ZVFHMIN-NEXT:    li a1, 24
 ; ZVFHMIN-NEXT:    mul a0, a0, a1
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
+; ZVFHMIN-NEXT:    vl1r.v v0, (a0) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vmfeq.vv v8, v24, v24, v0.t
+; ZVFHMIN-NEXT:    addi a0, sp, 16
 ; ZVFHMIN-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 5
+; ZVFHMIN-NEXT:    li a1, 25
+; ZVFHMIN-NEXT:    mul a0, a0, a1
+; ZVFHMIN-NEXT:    add a0, sp, a0
+; ZVFHMIN-NEXT:    addi a0, a0, 16
+; ZVFHMIN-NEXT:    vl8r.v v0, (a0) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v0
+; ZVFHMIN-NEXT:    vmv1r.v v0, v8
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vmerge.vvm v24, v24, v16, v0
+; ZVFHMIN-NEXT:    csrr a0, vlenb
+; ZVFHMIN-NEXT:    li a1, 25
+; ZVFHMIN-NEXT:    mul a0, a0, a1
+; ZVFHMIN-NEXT:    add a0, sp, a0
+; ZVFHMIN-NEXT:    addi a0, a0, 16
+; ZVFHMIN-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    csrr a0, vlenb
+; ZVFHMIN-NEXT:    li a1, 24
+; ZVFHMIN-NEXT:    mul a0, a0, a1
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
 ; ZVFHMIN-NEXT:    vl1r.v v9, (a0) # Unknown-size Folded Reload
@@ -1110,7 +1116,7 @@ define <vscale x 32 x half> @vfmin_vv_nxv32f16(<vscale x 32 x half> %va, <vscale
 ; ZVFHMIN-NEXT:    vmerge.vvm v16, v16, v24, v0
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v9
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    li a1, 24
+; ZVFHMIN-NEXT:    li a1, 25
 ; ZVFHMIN-NEXT:    mul a0, a0, a1
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
@@ -1152,68 +1158,61 @@ define <vscale x 32 x half> @vfmin_vv_nxv32f16_unmasked(<vscale x 32 x half> %va
 ; ZVFHMIN-NEXT:    addi sp, sp, -16
 ; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 16
 ; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 5
+; ZVFHMIN-NEXT:    li a2, 25
+; ZVFHMIN-NEXT:    mul a1, a1, a2
 ; ZVFHMIN-NEXT:    sub sp, sp, a1
-; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
-; ZVFHMIN-NEXT:    csrr a2, vlenb
+; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x19, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 25 * vlenb
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e8, m4, ta, ma
-; ZVFHMIN-NEXT:    vmset.m v7
-; ZVFHMIN-NEXT:    addi a1, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
-; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    li a3, 24
-; ZVFHMIN-NEXT:    mul a1, a1, a3
-; ZVFHMIN-NEXT:    add a1, sp, a1
-; ZVFHMIN-NEXT:    addi a1, a1, 16
-; ZVFHMIN-NEXT:    vs8r.v v24, (a1) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vmv8r.v v0, v8
+; ZVFHMIN-NEXT:    csrr a2, vlenb
+; ZVFHMIN-NEXT:    vmset.m v24
 ; ZVFHMIN-NEXT:    slli a1, a2, 1
 ; ZVFHMIN-NEXT:    srli a2, a2, 2
 ; ZVFHMIN-NEXT:    sub a3, a0, a1
 ; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT:    vslidedown.vx v12, v7, a2
+; ZVFHMIN-NEXT:    vslidedown.vx v8, v24, a2
+; ZVFHMIN-NEXT:    addi a2, sp, 16
+; ZVFHMIN-NEXT:    vs1r.v v8, (a2) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    sltu a2, a0, a3
 ; ZVFHMIN-NEXT:    addi a2, a2, -1
 ; ZVFHMIN-NEXT:    and a2, a2, a3
-; ZVFHMIN-NEXT:    vmv1r.v v0, v12
-; ZVFHMIN-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vmfeq.vv v13, v24, v24, v0.t
-; ZVFHMIN-NEXT:    vmv8r.v v0, v16
 ; ZVFHMIN-NEXT:    csrr a3, vlenb
-; ZVFHMIN-NEXT:    slli a3, a3, 4
 ; ZVFHMIN-NEXT:    add a3, sp, a3
 ; ZVFHMIN-NEXT:    addi a3, a3, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v4
-; ZVFHMIN-NEXT:    vmv1r.v v0, v13
-; ZVFHMIN-NEXT:    csrr a3, vlenb
-; ZVFHMIN-NEXT:    li a4, 24
-; ZVFHMIN-NEXT:    mul a3, a3, a4
-; ZVFHMIN-NEXT:    add a3, sp, a3
-; ZVFHMIN-NEXT:    addi a3, a3, 16
-; ZVFHMIN-NEXT:    vl8r.v v24, (a3) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vmerge.vvm v24, v24, v16, v0
+; ZVFHMIN-NEXT:    vs8r.v v0, (a3) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v4
+; ZVFHMIN-NEXT:    vmv1r.v v0, v8
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vmfeq.vv v12, v24, v24, v0.t
+; ZVFHMIN-NEXT:    vmv8r.v v0, v16
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a2, a2, 3
+; ZVFHMIN-NEXT:    slli a3, a2, 4
+; ZVFHMIN-NEXT:    add a2, a3, a2
 ; ZVFHMIN-NEXT:    add a2, sp, a2
 ; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vs8r.v v24, (a2) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v4
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v12
-; ZVFHMIN-NEXT:    vmfeq.vv v13, v16, v16, v0.t
-; ZVFHMIN-NEXT:    vmv1r.v v0, v13
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vmerge.vvm v8, v24, v16, v0
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    li a3, 24
-; ZVFHMIN-NEXT:    mul a2, a2, a3
+; ZVFHMIN-NEXT:    slli a3, a2, 3
+; ZVFHMIN-NEXT:    add a2, a3, a2
 ; ZVFHMIN-NEXT:    add a2, sp, a2
 ; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vl8r.v v24, (a2) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vmerge.vvm v16, v16, v24, v0
+; ZVFHMIN-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    addi a2, sp, 16
+; ZVFHMIN-NEXT:    vl1r.v v13, (a2) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vmv1r.v v0, v13
+; ZVFHMIN-NEXT:    vmfeq.vv v12, v16, v16, v0.t
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v12
+; ZVFHMIN-NEXT:    vmerge.vvm v16, v16, v24, v0
+; ZVFHMIN-NEXT:    vmv1r.v v0, v13
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a2, a2, 3
+; ZVFHMIN-NEXT:    slli a3, a2, 3
+; ZVFHMIN-NEXT:    add a2, a3, a2
 ; ZVFHMIN-NEXT:    add a2, sp, a2
 ; ZVFHMIN-NEXT:    addi a2, a2, 16
 ; ZVFHMIN-NEXT:    vl8r.v v24, (a2) # Unknown-size Folded Reload
@@ -1221,7 +1220,8 @@ define <vscale x 32 x half> @vfmin_vv_nxv32f16_unmasked(<vscale x 32 x half> %va
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v16
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a2, a2, 3
+; ZVFHMIN-NEXT:    slli a3, a2, 3
+; ZVFHMIN-NEXT:    add a2, a3, a2
 ; ZVFHMIN-NEXT:    add a2, sp, a2
 ; ZVFHMIN-NEXT:    addi a2, a2, 16
 ; ZVFHMIN-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
@@ -1229,43 +1229,49 @@ define <vscale x 32 x half> @vfmin_vv_nxv32f16_unmasked(<vscale x 32 x half> %va
 ; ZVFHMIN-NEXT:  # %bb.1:
 ; ZVFHMIN-NEXT:    mv a0, a1
 ; ZVFHMIN-NEXT:  .LBB23_2:
-; ZVFHMIN-NEXT:    addi a1, sp, 16
-; ZVFHMIN-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v16
 ; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 4
 ; ZVFHMIN-NEXT:    add a1, sp, a1
 ; ZVFHMIN-NEXT:    addi a1, a1, 16
-; ZVFHMIN-NEXT:    vl8r.v v0, (a1) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v16
+; ZVFHMIN-NEXT:    csrr a0, vlenb
+; ZVFHMIN-NEXT:    slli a1, a0, 4
+; ZVFHMIN-NEXT:    add a0, a1, a0
+; ZVFHMIN-NEXT:    add a0, sp, a0
+; ZVFHMIN-NEXT:    addi a0, a0, 16
+; ZVFHMIN-NEXT:    vl8r.v v0, (a0) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v0
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vmfeq.vv v0, v8, v8
 ; ZVFHMIN-NEXT:    vmfeq.vv v7, v16, v16
 ; ZVFHMIN-NEXT:    vmerge.vvm v24, v8, v16, v0
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    li a1, 24
-; ZVFHMIN-NEXT:    mul a0, a0, a1
+; ZVFHMIN-NEXT:    slli a1, a0, 4
+; ZVFHMIN-NEXT:    add a0, a1, a0
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
 ; ZVFHMIN-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v7
 ; ZVFHMIN-NEXT:    vmerge.vvm v16, v16, v8, v0
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    li a1, 24
-; ZVFHMIN-NEXT:    mul a0, a0, a1
+; ZVFHMIN-NEXT:    slli a1, a0, 4
+; ZVFHMIN-NEXT:    add a0, a1, a0
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
 ; ZVFHMIN-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vfmin.vv v16, v16, v24
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 3
+; ZVFHMIN-NEXT:    slli a1, a0, 3
+; ZVFHMIN-NEXT:    add a0, a1, a0
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
 ; ZVFHMIN-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 5
+; ZVFHMIN-NEXT:    li a1, 25
+; ZVFHMIN-NEXT:    mul a0, a0, a1
 ; ZVFHMIN-NEXT:    add sp, sp, a0
 ; ZVFHMIN-NEXT:    .cfi_def_cfa sp, 16
 ; ZVFHMIN-NEXT:    addi sp, sp, 16
diff --git a/llvm/test/CodeGen/RISCV/rvv/fold-binary-reduce.ll b/llvm/test/CodeGen/RISCV/rvv/fold-binary-reduce.ll
index 2fda344..6787c8c 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fold-binary-reduce.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fold-binary-reduce.ll
@@ -18,7 +18,7 @@ entry:
 define i64 @reduce_add2(<4 x i64> %v) {
 ; CHECK-LABEL: reduce_add2:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetivli zero, 4, e64, m1, ta, ma
+; CHECK-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; CHECK-NEXT:    vmv.v.i v10, 8
 ; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
 ; CHECK-NEXT:    vredsum.vs v8, v8, v10
diff --git a/llvm/test/CodeGen/RISCV/rvv/setcc-fp-vp.ll b/llvm/test/CodeGen/RISCV/rvv/setcc-fp-vp.ll
index 70b5384..06f4876 100644
--- a/llvm/test/CodeGen/RISCV/rvv/setcc-fp-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/setcc-fp-vp.ll
@@ -1658,10 +1658,10 @@ define <vscale x 1 x i1> @fcmp_oeq_vv_nxv1f16(<vscale x 1 x half> %va, <vscale x
 ;
 ; ZVFHMIN-LABEL: fcmp_oeq_vv_nxv1f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmfeq.vv v0, v9, v10, v0.t
 ; ZVFHMIN-NEXT:    ret
   %v = call <vscale x 1 x i1> @llvm.vp.fcmp.nxv1f16(<vscale x 1 x half> %va, <vscale x 1 x half> %vb, metadata !"oeq", <vscale x 1 x i1> %m, i32 %evl)
@@ -1678,11 +1678,11 @@ define <vscale x 1 x i1> @fcmp_oeq_vf_nxv1f16(<vscale x 1 x half> %va, half %b,
 ; ZVFHMIN-LABEL: fcmp_oeq_vf_nxv1f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v9, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v9
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmfeq.vv v0, v10, v8, v0.t
 ; ZVFHMIN-NEXT:    ret
   %elt.head = insertelement <vscale x 1 x half> poison, half %b, i32 0
@@ -1701,11 +1701,11 @@ define <vscale x 1 x i1> @fcmp_oeq_vf_swap_nxv1f16(<vscale x 1 x half> %va, half
 ; ZVFHMIN-LABEL: fcmp_oeq_vf_swap_nxv1f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v9, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v9
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmfeq.vv v0, v8, v10, v0.t
 ; ZVFHMIN-NEXT:    ret
   %elt.head = insertelement <vscale x 1 x half> poison, half %b, i32 0
@@ -1723,10 +1723,10 @@ define <vscale x 1 x i1> @fcmp_ogt_vv_nxv1f16(<vscale x 1 x half> %va, <vscale x
 ;
 ; ZVFHMIN-LABEL: fcmp_ogt_vv_nxv1f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v9
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmflt.vv v0, v8, v10, v0.t
 ; ZVFHMIN-NEXT:    ret
   %v = call <vscale x 1 x i1> @llvm.vp.fcmp.nxv1f16(<vscale x 1 x half> %va, <vscale x 1 x half> %vb, metadata !"ogt", <vscale x 1 x i1> %m, i32 %evl)
@@ -1743,11 +1743,11 @@ define <vscale x 1 x i1> @fcmp_ogt_vf_nxv1f16(<vscale x 1 x half> %va, half %b,
 ; ZVFHMIN-LABEL: fcmp_ogt_vf_nxv1f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v9, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v9
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmflt.vv v0, v8, v10, v0.t
 ; ZVFHMIN-NEXT:    ret
   %elt.head = insertelement <vscale x 1 x half> poison, half %b, i32 0
@@ -1766,11 +1766,11 @@ define <vscale x 1 x i1> @fcmp_ogt_vf_swap_nxv1f16(<vscale x 1 x half> %va, half
 ; ZVFHMIN-LABEL: fcmp_ogt_vf_swap_nxv1f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v9, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v9
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmflt.vv v0, v10, v8, v0.t
 ; ZVFHMIN-NEXT:    ret
   %elt.head = insertelement <vscale x 1 x half> poison, half %b, i32 0
@@ -1788,10 +1788,10 @@ define <vscale x 1 x i1> @fcmp_oge_vv_nxv1f16(<vscale x 1 x half> %va, <vscale x
 ;
 ; ZVFHMIN-LABEL: fcmp_oge_vv_nxv1f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v9
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmfle.vv v0, v8, v10, v0.t
 ; ZVFHMIN-NEXT:    ret
   %v = call <vscale x 1 x i1> @llvm.vp.fcmp.nxv1f16(<vscale x 1 x half> %va, <vscale x 1 x half> %vb, metadata !"oge", <vscale x 1 x i1> %m, i32 %evl)
@@ -1808,11 +1808,11 @@ define <vscale x 1 x i1> @fcmp_oge_vf_nxv1f16(<vscale x 1 x half> %va, half %b,
 ; ZVFHMIN-LABEL: fcmp_oge_vf_nxv1f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v9, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v9
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmfle.vv v0, v8, v10, v0.t
 ; ZVFHMIN-NEXT:    ret
   %elt.head = insertelement <vscale x 1 x half> poison, half %b, i32 0
@@ -1831,11 +1831,11 @@ define <vscale x 1 x i1> @fcmp_oge_vf_swap_nxv1f16(<vscale x 1 x half> %va, half
 ; ZVFHMIN-LABEL: fcmp_oge_vf_swap_nxv1f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v9, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v9
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmfle.vv v0, v10, v8, v0.t
 ; ZVFHMIN-NEXT:    ret
   %elt.head = insertelement <vscale x 1 x half> poison, half %b, i32 0
@@ -1853,10 +1853,10 @@ define <vscale x 1 x i1> @fcmp_olt_vv_nxv1f16(<vscale x 1 x half> %va, <vscale x
 ;
 ; ZVFHMIN-LABEL: fcmp_olt_vv_nxv1f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmflt.vv v0, v9, v10, v0.t
 ; ZVFHMIN-NEXT:    ret
   %v = call <vscale x 1 x i1> @llvm.vp.fcmp.nxv1f16(<vscale x 1 x half> %va, <vscale x 1 x half> %vb, metadata !"olt", <vscale x 1 x i1> %m, i32 %evl)
@@ -1873,11 +1873,11 @@ define <vscale x 1 x i1> @fcmp_olt_vf_nxv1f16(<vscale x 1 x half> %va, half %b,
 ; ZVFHMIN-LABEL: fcmp_olt_vf_nxv1f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v9, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v9
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmflt.vv v0, v10, v8, v0.t
 ; ZVFHMIN-NEXT:    ret
   %elt.head = insertelement <vscale x 1 x half> poison, half %b, i32 0
@@ -1896,11 +1896,11 @@ define <vscale x 1 x i1> @fcmp_olt_vf_swap_nxv1f16(<vscale x 1 x half> %va, half
 ; ZVFHMIN-LABEL: fcmp_olt_vf_swap_nxv1f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v9, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v9
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmflt.vv v0, v8, v10, v0.t
 ; ZVFHMIN-NEXT:    ret
   %elt.head = insertelement <vscale x 1 x half> poison, half %b, i32 0
@@ -1918,10 +1918,10 @@ define <vscale x 1 x i1> @fcmp_ole_vv_nxv1f16(<vscale x 1 x half> %va, <vscale x
 ;
 ; ZVFHMIN-LABEL: fcmp_ole_vv_nxv1f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmfle.vv v0, v9, v10, v0.t
 ; ZVFHMIN-NEXT:    ret
   %v = call <vscale x 1 x i1> @llvm.vp.fcmp.nxv1f16(<vscale x 1 x half> %va, <vscale x 1 x half> %vb, metadata !"ole", <vscale x 1 x i1> %m, i32 %evl)
@@ -1938,11 +1938,11 @@ define <vscale x 1 x i1> @fcmp_ole_vf_nxv1f16(<vscale x 1 x half> %va, half %b,
 ; ZVFHMIN-LABEL: fcmp_ole_vf_nxv1f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v9, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v9
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmfle.vv v0, v10, v8, v0.t
 ; ZVFHMIN-NEXT:    ret
   %elt.head = insertelement <vscale x 1 x half> poison, half %b, i32 0
@@ -1961,11 +1961,11 @@ define <vscale x 1 x i1> @fcmp_ole_vf_swap_nxv1f16(<vscale x 1 x half> %va, half
 ; ZVFHMIN-LABEL: fcmp_ole_vf_swap_nxv1f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v9, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v9
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmfle.vv v0, v8, v10, v0.t
 ; ZVFHMIN-NEXT:    ret
   %elt.head = insertelement <vscale x 1 x half> poison, half %b, i32 0
@@ -1985,10 +1985,10 @@ define <vscale x 1 x i1> @fcmp_one_vv_nxv1f16(<vscale x 1 x half> %va, <vscale x
 ;
 ; ZVFHMIN-LABEL: fcmp_one_vv_nxv1f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmflt.vv v8, v9, v10, v0.t
 ; ZVFHMIN-NEXT:    vmflt.vv v9, v10, v9, v0.t
 ; ZVFHMIN-NEXT:    vmor.mm v0, v9, v8
@@ -2009,11 +2009,11 @@ define <vscale x 1 x i1> @fcmp_one_vf_nxv1f16(<vscale x 1 x half> %va, half %b,
 ; ZVFHMIN-LABEL: fcmp_one_vf_nxv1f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
 ; ZVFHMIN-NEXT:    vmv.v.x v8, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmflt.vv v8, v9, v10, v0.t
 ; ZVFHMIN-NEXT:    vmflt.vv v9, v10, v9, v0.t
 ; ZVFHMIN-NEXT:    vmor.mm v0, v9, v8
@@ -2036,11 +2036,11 @@ define <vscale x 1 x i1> @fcmp_one_vf_swap_nxv1f16(<vscale x 1 x half> %va, half
 ; ZVFHMIN-LABEL: fcmp_one_vf_swap_nxv1f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
 ; ZVFHMIN-NEXT:    vmv.v.x v8, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmflt.vv v8, v10, v9, v0.t
 ; ZVFHMIN-NEXT:    vmflt.vv v9, v9, v10, v0.t
 ; ZVFHMIN-NEXT:    vmor.mm v0, v9, v8
@@ -2062,10 +2062,10 @@ define <vscale x 1 x i1> @fcmp_ord_vv_nxv1f16(<vscale x 1 x half> %va, <vscale x
 ;
 ; ZVFHMIN-LABEL: fcmp_ord_vv_nxv1f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmfeq.vv v8, v10, v10, v0.t
 ; ZVFHMIN-NEXT:    vmfeq.vv v9, v9, v9, v0.t
 ; ZVFHMIN-NEXT:    vmand.mm v0, v9, v8
@@ -2088,14 +2088,14 @@ define <vscale x 1 x i1> @fcmp_ord_vf_nxv1f16(<vscale x 1 x half> %va, half %b,
 ; ZVFHMIN-LABEL: fcmp_ord_vf_nxv1f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
 ; ZVFHMIN-NEXT:    vmv.v.x v8, a1
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmfeq.vv v9, v9, v9, v0.t
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmfeq.vv v8, v10, v10, v0.t
 ; ZVFHMIN-NEXT:    vmand.mm v0, v9, v8
 ; ZVFHMIN-NEXT:    ret
@@ -2119,14 +2119,14 @@ define <vscale x 1 x i1> @fcmp_ord_vf_swap_nxv1f16(<vscale x 1 x half> %va, half
 ; ZVFHMIN-LABEL: fcmp_ord_vf_swap_nxv1f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
 ; ZVFHMIN-NEXT:    vmv.v.x v8, a1
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmfeq.vv v9, v9, v9, v0.t
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmfeq.vv v8, v10, v10, v0.t
 ; ZVFHMIN-NEXT:    vmand.mm v0, v8, v9
 ; ZVFHMIN-NEXT:    ret
@@ -2147,10 +2147,10 @@ define <vscale x 1 x i1> @fcmp_ueq_vv_nxv1f16(<vscale x 1 x half> %va, <vscale x
 ;
 ; ZVFHMIN-LABEL: fcmp_ueq_vv_nxv1f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmflt.vv v8, v9, v10, v0.t
 ; ZVFHMIN-NEXT:    vmflt.vv v9, v10, v9, v0.t
 ; ZVFHMIN-NEXT:    vmnor.mm v0, v9, v8
@@ -2171,11 +2171,11 @@ define <vscale x 1 x i1> @fcmp_ueq_vf_nxv1f16(<vscale x 1 x half> %va, half %b,
 ; ZVFHMIN-LABEL: fcmp_ueq_vf_nxv1f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
 ; ZVFHMIN-NEXT:    vmv.v.x v8, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmflt.vv v8, v9, v10, v0.t
 ; ZVFHMIN-NEXT:    vmflt.vv v9, v10, v9, v0.t
 ; ZVFHMIN-NEXT:    vmnor.mm v0, v9, v8
@@ -2198,11 +2198,11 @@ define <vscale x 1 x i1> @fcmp_ueq_vf_swap_nxv1f16(<vscale x 1 x half> %va, half
 ; ZVFHMIN-LABEL: fcmp_ueq_vf_swap_nxv1f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
 ; ZVFHMIN-NEXT:    vmv.v.x v8, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmflt.vv v8, v10, v9, v0.t
 ; ZVFHMIN-NEXT:    vmflt.vv v9, v9, v10, v0.t
 ; ZVFHMIN-NEXT:    vmnor.mm v0, v9, v8
@@ -2223,10 +2223,10 @@ define <vscale x 1 x i1> @fcmp_ugt_vv_nxv1f16(<vscale x 1 x half> %va, <vscale x
 ;
 ; ZVFHMIN-LABEL: fcmp_ugt_vv_nxv1f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmfle.vv v8, v9, v10, v0.t
 ; ZVFHMIN-NEXT:    vmnot.m v0, v8
 ; ZVFHMIN-NEXT:    ret
@@ -2245,11 +2245,11 @@ define <vscale x 1 x i1> @fcmp_ugt_vf_nxv1f16(<vscale x 1 x half> %va, half %b,
 ; ZVFHMIN-LABEL: fcmp_ugt_vf_nxv1f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v9, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v9
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmfle.vv v8, v10, v8, v0.t
 ; ZVFHMIN-NEXT:    vmnot.m v0, v8
 ; ZVFHMIN-NEXT:    ret
@@ -2270,11 +2270,11 @@ define <vscale x 1 x i1> @fcmp_ugt_vf_swap_nxv1f16(<vscale x 1 x half> %va, half
 ; ZVFHMIN-LABEL: fcmp_ugt_vf_swap_nxv1f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v9, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v9
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmfle.vv v8, v8, v10, v0.t
 ; ZVFHMIN-NEXT:    vmnot.m v0, v8
 ; ZVFHMIN-NEXT:    ret
@@ -2294,10 +2294,10 @@ define <vscale x 1 x i1> @fcmp_uge_vv_nxv1f16(<vscale x 1 x half> %va, <vscale x
 ;
 ; ZVFHMIN-LABEL: fcmp_uge_vv_nxv1f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmflt.vv v8, v9, v10, v0.t
 ; ZVFHMIN-NEXT:    vmnot.m v0, v8
 ; ZVFHMIN-NEXT:    ret
@@ -2316,11 +2316,11 @@ define <vscale x 1 x i1> @fcmp_uge_vf_nxv1f16(<vscale x 1 x half> %va, half %b,
 ; ZVFHMIN-LABEL: fcmp_uge_vf_nxv1f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v9, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v9
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmflt.vv v8, v10, v8, v0.t
 ; ZVFHMIN-NEXT:    vmnot.m v0, v8
 ; ZVFHMIN-NEXT:    ret
@@ -2341,11 +2341,11 @@ define <vscale x 1 x i1> @fcmp_uge_vf_swap_nxv1f16(<vscale x 1 x half> %va, half
 ; ZVFHMIN-LABEL: fcmp_uge_vf_swap_nxv1f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v9, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v9
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmflt.vv v8, v8, v10, v0.t
 ; ZVFHMIN-NEXT:    vmnot.m v0, v8
 ; ZVFHMIN-NEXT:    ret
@@ -2365,10 +2365,10 @@ define <vscale x 1 x i1> @fcmp_ult_vv_nxv1f16(<vscale x 1 x half> %va, <vscale x
 ;
 ; ZVFHMIN-LABEL: fcmp_ult_vv_nxv1f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v9
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmfle.vv v8, v8, v10, v0.t
 ; ZVFHMIN-NEXT:    vmnot.m v0, v8
 ; ZVFHMIN-NEXT:    ret
@@ -2387,11 +2387,11 @@ define <vscale x 1 x i1> @fcmp_ult_vf_nxv1f16(<vscale x 1 x half> %va, half %b,
 ; ZVFHMIN-LABEL: fcmp_ult_vf_nxv1f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v9, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v9
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmfle.vv v8, v8, v10, v0.t
 ; ZVFHMIN-NEXT:    vmnot.m v0, v8
 ; ZVFHMIN-NEXT:    ret
@@ -2412,11 +2412,11 @@ define <vscale x 1 x i1> @fcmp_ult_vf_swap_nxv1f16(<vscale x 1 x half> %va, half
 ; ZVFHMIN-LABEL: fcmp_ult_vf_swap_nxv1f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v9, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v9
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmfle.vv v8, v10, v8, v0.t
 ; ZVFHMIN-NEXT:    vmnot.m v0, v8
 ; ZVFHMIN-NEXT:    ret
@@ -2436,10 +2436,10 @@ define <vscale x 1 x i1> @fcmp_ule_vv_nxv1f16(<vscale x 1 x half> %va, <vscale x
 ;
 ; ZVFHMIN-LABEL: fcmp_ule_vv_nxv1f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v9
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmflt.vv v8, v8, v10, v0.t
 ; ZVFHMIN-NEXT:    vmnot.m v0, v8
 ; ZVFHMIN-NEXT:    ret
@@ -2458,11 +2458,11 @@ define <vscale x 1 x i1> @fcmp_ule_vf_nxv1f16(<vscale x 1 x half> %va, half %b,
 ; ZVFHMIN-LABEL: fcmp_ule_vf_nxv1f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v9, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v9
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmflt.vv v8, v8, v10, v0.t
 ; ZVFHMIN-NEXT:    vmnot.m v0, v8
 ; ZVFHMIN-NEXT:    ret
@@ -2483,11 +2483,11 @@ define <vscale x 1 x i1> @fcmp_ule_vf_swap_nxv1f16(<vscale x 1 x half> %va, half
 ; ZVFHMIN-LABEL: fcmp_ule_vf_swap_nxv1f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v9, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v9
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmflt.vv v8, v10, v8, v0.t
 ; ZVFHMIN-NEXT:    vmnot.m v0, v8
 ; ZVFHMIN-NEXT:    ret
@@ -2506,10 +2506,10 @@ define <vscale x 1 x i1> @fcmp_une_vv_nxv1f16(<vscale x 1 x half> %va, <vscale x
 ;
 ; ZVFHMIN-LABEL: fcmp_une_vv_nxv1f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmfne.vv v0, v9, v10, v0.t
 ; ZVFHMIN-NEXT:    ret
   %v = call <vscale x 1 x i1> @llvm.vp.fcmp.nxv1f16(<vscale x 1 x half> %va, <vscale x 1 x half> %vb, metadata !"une", <vscale x 1 x i1> %m, i32 %evl)
@@ -2526,11 +2526,11 @@ define <vscale x 1 x i1> @fcmp_une_vf_nxv1f16(<vscale x 1 x half> %va, half %b,
 ; ZVFHMIN-LABEL: fcmp_une_vf_nxv1f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v9, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v9
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmfne.vv v0, v10, v8, v0.t
 ; ZVFHMIN-NEXT:    ret
   %elt.head = insertelement <vscale x 1 x half> poison, half %b, i32 0
@@ -2549,11 +2549,11 @@ define <vscale x 1 x i1> @fcmp_une_vf_swap_nxv1f16(<vscale x 1 x half> %va, half
 ; ZVFHMIN-LABEL: fcmp_une_vf_swap_nxv1f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v9, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v9
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmfne.vv v0, v8, v10, v0.t
 ; ZVFHMIN-NEXT:    ret
   %elt.head = insertelement <vscale x 1 x half> poison, half %b, i32 0
@@ -2573,10 +2573,10 @@ define <vscale x 1 x i1> @fcmp_uno_vv_nxv1f16(<vscale x 1 x half> %va, <vscale x
 ;
 ; ZVFHMIN-LABEL: fcmp_uno_vv_nxv1f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmfne.vv v8, v10, v10, v0.t
 ; ZVFHMIN-NEXT:    vmfne.vv v9, v9, v9, v0.t
 ; ZVFHMIN-NEXT:    vmor.mm v0, v9, v8
@@ -2599,14 +2599,14 @@ define <vscale x 1 x i1> @fcmp_uno_vf_nxv1f16(<vscale x 1 x half> %va, half %b,
 ; ZVFHMIN-LABEL: fcmp_uno_vf_nxv1f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
 ; ZVFHMIN-NEXT:    vmv.v.x v8, a1
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmfne.vv v9, v9, v9, v0.t
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmfne.vv v8, v10, v10, v0.t
 ; ZVFHMIN-NEXT:    vmor.mm v0, v9, v8
 ; ZVFHMIN-NEXT:    ret
@@ -2630,14 +2630,14 @@ define <vscale x 1 x i1> @fcmp_uno_vf_swap_nxv1f16(<vscale x 1 x half> %va, half
 ; ZVFHMIN-LABEL: fcmp_uno_vf_swap_nxv1f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
 ; ZVFHMIN-NEXT:    vmv.v.x v8, a1
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmfne.vv v9, v9, v9, v0.t
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmfne.vv v8, v10, v10, v0.t
 ; ZVFHMIN-NEXT:    vmor.mm v0, v8, v9
 ; ZVFHMIN-NEXT:    ret
@@ -2658,10 +2658,10 @@ define <vscale x 3 x i1> @fcmp_oeq_vv_nxv3f16(<vscale x 3 x half> %va, <vscale x
 ;
 ; ZVFHMIN-LABEL: fcmp_oeq_vv_nxv3f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vmfeq.vv v8, v12, v10, v0.t
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v8
 ; ZVFHMIN-NEXT:    ret
@@ -2681,10 +2681,10 @@ define <vscale x 8 x i1> @fcmp_oeq_vv_nxv8f16(<vscale x 8 x half> %va, <vscale x
 ;
 ; ZVFHMIN-LABEL: fcmp_oeq_vv_nxv8f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v10
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmfeq.vv v8, v16, v12, v0.t
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v8
 ; ZVFHMIN-NEXT:    ret
@@ -2703,11 +2703,11 @@ define <vscale x 8 x i1> @fcmp_oeq_vf_nxv8f16(<vscale x 8 x half> %va, half %b,
 ; ZVFHMIN-LABEL: fcmp_oeq_vf_nxv8f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v10, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmfeq.vv v8, v12, v16, v0.t
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v8
 ; ZVFHMIN-NEXT:    ret
@@ -2728,11 +2728,11 @@ define <vscale x 8 x i1> @fcmp_oeq_vf_swap_nxv8f16(<vscale x 8 x half> %va, half
 ; ZVFHMIN-LABEL: fcmp_oeq_vf_swap_nxv8f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v10, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmfeq.vv v8, v16, v12, v0.t
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v8
 ; ZVFHMIN-NEXT:    ret
@@ -2752,10 +2752,10 @@ define <vscale x 8 x i1> @fcmp_ogt_vv_nxv8f16(<vscale x 8 x half> %va, <vscale x
 ;
 ; ZVFHMIN-LABEL: fcmp_ogt_vv_nxv8f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmflt.vv v8, v16, v12, v0.t
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v8
 ; ZVFHMIN-NEXT:    ret
@@ -2774,11 +2774,11 @@ define <vscale x 8 x i1> @fcmp_ogt_vf_nxv8f16(<vscale x 8 x half> %va, half %b,
 ; ZVFHMIN-LABEL: fcmp_ogt_vf_nxv8f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v10, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmflt.vv v8, v16, v12, v0.t
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v8
 ; ZVFHMIN-NEXT:    ret
@@ -2799,11 +2799,11 @@ define <vscale x 8 x i1> @fcmp_ogt_vf_swap_nxv8f16(<vscale x 8 x half> %va, half
 ; ZVFHMIN-LABEL: fcmp_ogt_vf_swap_nxv8f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v10, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmflt.vv v8, v12, v16, v0.t
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v8
 ; ZVFHMIN-NEXT:    ret
@@ -2823,10 +2823,10 @@ define <vscale x 8 x i1> @fcmp_oge_vv_nxv8f16(<vscale x 8 x half> %va, <vscale x
 ;
 ; ZVFHMIN-LABEL: fcmp_oge_vv_nxv8f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmfle.vv v8, v16, v12, v0.t
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v8
 ; ZVFHMIN-NEXT:    ret
@@ -2845,11 +2845,11 @@ define <vscale x 8 x i1> @fcmp_oge_vf_nxv8f16(<vscale x 8 x half> %va, half %b,
 ; ZVFHMIN-LABEL: fcmp_oge_vf_nxv8f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v10, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmfle.vv v8, v16, v12, v0.t
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v8
 ; ZVFHMIN-NEXT:    ret
@@ -2870,11 +2870,11 @@ define <vscale x 8 x i1> @fcmp_oge_vf_swap_nxv8f16(<vscale x 8 x half> %va, half
 ; ZVFHMIN-LABEL: fcmp_oge_vf_swap_nxv8f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v10, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmfle.vv v8, v12, v16, v0.t
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v8
 ; ZVFHMIN-NEXT:    ret
@@ -2894,10 +2894,10 @@ define <vscale x 8 x i1> @fcmp_olt_vv_nxv8f16(<vscale x 8 x half> %va, <vscale x
 ;
 ; ZVFHMIN-LABEL: fcmp_olt_vv_nxv8f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v10
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmflt.vv v8, v16, v12, v0.t
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v8
 ; ZVFHMIN-NEXT:    ret
@@ -2916,11 +2916,11 @@ define <vscale x 8 x i1> @fcmp_olt_vf_nxv8f16(<vscale x 8 x half> %va, half %b,
 ; ZVFHMIN-LABEL: fcmp_olt_vf_nxv8f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v10, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmflt.vv v8, v12, v16, v0.t
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v8
 ; ZVFHMIN-NEXT:    ret
@@ -2941,11 +2941,11 @@ define <vscale x 8 x i1> @fcmp_olt_vf_swap_nxv8f16(<vscale x 8 x half> %va, half
 ; ZVFHMIN-LABEL: fcmp_olt_vf_swap_nxv8f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v10, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmflt.vv v8, v16, v12, v0.t
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v8
 ; ZVFHMIN-NEXT:    ret
@@ -2965,10 +2965,10 @@ define <vscale x 8 x i1> @fcmp_ole_vv_nxv8f16(<vscale x 8 x half> %va, <vscale x
 ;
 ; ZVFHMIN-LABEL: fcmp_ole_vv_nxv8f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v10
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmfle.vv v8, v16, v12, v0.t
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v8
 ; ZVFHMIN-NEXT:    ret
@@ -2987,11 +2987,11 @@ define <vscale x 8 x i1> @fcmp_ole_vf_nxv8f16(<vscale x 8 x half> %va, half %b,
 ; ZVFHMIN-LABEL: fcmp_ole_vf_nxv8f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v10, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmfle.vv v8, v12, v16, v0.t
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v8
 ; ZVFHMIN-NEXT:    ret
@@ -3012,11 +3012,11 @@ define <vscale x 8 x i1> @fcmp_ole_vf_swap_nxv8f16(<vscale x 8 x half> %va, half
 ; ZVFHMIN-LABEL: fcmp_ole_vf_swap_nxv8f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v10, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmfle.vv v8, v16, v12, v0.t
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v8
 ; ZVFHMIN-NEXT:    ret
@@ -3037,10 +3037,10 @@ define <vscale x 8 x i1> @fcmp_one_vv_nxv8f16(<vscale x 8 x half> %va, <vscale x
 ;
 ; ZVFHMIN-LABEL: fcmp_one_vv_nxv8f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v10
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmflt.vv v8, v16, v12, v0.t
 ; ZVFHMIN-NEXT:    vmflt.vv v9, v12, v16, v0.t
 ; ZVFHMIN-NEXT:    vmor.mm v0, v9, v8
@@ -3061,11 +3061,11 @@ define <vscale x 8 x i1> @fcmp_one_vf_nxv8f16(<vscale x 8 x half> %va, half %b,
 ; ZVFHMIN-LABEL: fcmp_one_vf_nxv8f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
 ; ZVFHMIN-NEXT:    vmv.v.x v8, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmflt.vv v8, v12, v16, v0.t
 ; ZVFHMIN-NEXT:    vmflt.vv v9, v16, v12, v0.t
 ; ZVFHMIN-NEXT:    vmor.mm v0, v9, v8
@@ -3088,11 +3088,11 @@ define <vscale x 8 x i1> @fcmp_one_vf_swap_nxv8f16(<vscale x 8 x half> %va, half
 ; ZVFHMIN-LABEL: fcmp_one_vf_swap_nxv8f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
 ; ZVFHMIN-NEXT:    vmv.v.x v8, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmflt.vv v8, v16, v12, v0.t
 ; ZVFHMIN-NEXT:    vmflt.vv v9, v12, v16, v0.t
 ; ZVFHMIN-NEXT:    vmor.mm v0, v9, v8
@@ -3114,13 +3114,13 @@ define <vscale x 8 x i1> @fcmp_ord_vv_nxv8f16(<vscale x 8 x half> %va, <vscale x
 ;
 ; ZVFHMIN-LABEL: fcmp_ord_vv_nxv8f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v10
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmfeq.vv v10, v12, v12, v0.t
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmfeq.vv v8, v12, v12, v0.t
 ; ZVFHMIN-NEXT:    vmand.mm v0, v8, v10
 ; ZVFHMIN-NEXT:    ret
@@ -3142,14 +3142,14 @@ define <vscale x 8 x i1> @fcmp_ord_vf_nxv8f16(<vscale x 8 x half> %va, half %b,
 ; ZVFHMIN-LABEL: fcmp_ord_vf_nxv8f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
 ; ZVFHMIN-NEXT:    vmv.v.x v8, a1
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmfeq.vv v10, v12, v12, v0.t
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmfeq.vv v8, v12, v12, v0.t
 ; ZVFHMIN-NEXT:    vmand.mm v0, v10, v8
 ; ZVFHMIN-NEXT:    ret
@@ -3173,14 +3173,14 @@ define <vscale x 8 x i1> @fcmp_ord_vf_swap_nxv8f16(<vscale x 8 x half> %va, half
 ; ZVFHMIN-LABEL: fcmp_ord_vf_swap_nxv8f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
 ; ZVFHMIN-NEXT:    vmv.v.x v8, a1
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmfeq.vv v10, v12, v12, v0.t
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmfeq.vv v8, v12, v12, v0.t
 ; ZVFHMIN-NEXT:    vmand.mm v0, v8, v10
 ; ZVFHMIN-NEXT:    ret
@@ -3201,10 +3201,10 @@ define <vscale x 8 x i1> @fcmp_ueq_vv_nxv8f16(<vscale x 8 x half> %va, <vscale x
 ;
 ; ZVFHMIN-LABEL: fcmp_ueq_vv_nxv8f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v10
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmflt.vv v8, v16, v12, v0.t
 ; ZVFHMIN-NEXT:    vmflt.vv v9, v12, v16, v0.t
 ; ZVFHMIN-NEXT:    vmnor.mm v0, v9, v8
@@ -3225,11 +3225,11 @@ define <vscale x 8 x i1> @fcmp_ueq_vf_nxv8f16(<vscale x 8 x half> %va, half %b,
 ; ZVFHMIN-LABEL: fcmp_ueq_vf_nxv8f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
 ; ZVFHMIN-NEXT:    vmv.v.x v8, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmflt.vv v8, v12, v16, v0.t
 ; ZVFHMIN-NEXT:    vmflt.vv v9, v16, v12, v0.t
 ; ZVFHMIN-NEXT:    vmnor.mm v0, v9, v8
@@ -3252,11 +3252,11 @@ define <vscale x 8 x i1> @fcmp_ueq_vf_swap_nxv8f16(<vscale x 8 x half> %va, half
 ; ZVFHMIN-LABEL: fcmp_ueq_vf_swap_nxv8f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
 ; ZVFHMIN-NEXT:    vmv.v.x v8, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmflt.vv v8, v16, v12, v0.t
 ; ZVFHMIN-NEXT:    vmflt.vv v9, v12, v16, v0.t
 ; ZVFHMIN-NEXT:    vmnor.mm v0, v9, v8
@@ -3277,10 +3277,10 @@ define <vscale x 8 x i1> @fcmp_ugt_vv_nxv8f16(<vscale x 8 x half> %va, <vscale x
 ;
 ; ZVFHMIN-LABEL: fcmp_ugt_vv_nxv8f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v10
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmfle.vv v8, v16, v12, v0.t
 ; ZVFHMIN-NEXT:    vmnot.m v0, v8
 ; ZVFHMIN-NEXT:    ret
@@ -3299,11 +3299,11 @@ define <vscale x 8 x i1> @fcmp_ugt_vf_nxv8f16(<vscale x 8 x half> %va, half %b,
 ; ZVFHMIN-LABEL: fcmp_ugt_vf_nxv8f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v10, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmfle.vv v8, v12, v16, v0.t
 ; ZVFHMIN-NEXT:    vmnot.m v0, v8
 ; ZVFHMIN-NEXT:    ret
@@ -3324,11 +3324,11 @@ define <vscale x 8 x i1> @fcmp_ugt_vf_swap_nxv8f16(<vscale x 8 x half> %va, half
 ; ZVFHMIN-LABEL: fcmp_ugt_vf_swap_nxv8f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v10, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmfle.vv v8, v16, v12, v0.t
 ; ZVFHMIN-NEXT:    vmnot.m v0, v8
 ; ZVFHMIN-NEXT:    ret
@@ -3348,10 +3348,10 @@ define <vscale x 8 x i1> @fcmp_uge_vv_nxv8f16(<vscale x 8 x half> %va, <vscale x
 ;
 ; ZVFHMIN-LABEL: fcmp_uge_vv_nxv8f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v10
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmflt.vv v8, v16, v12, v0.t
 ; ZVFHMIN-NEXT:    vmnot.m v0, v8
 ; ZVFHMIN-NEXT:    ret
@@ -3370,11 +3370,11 @@ define <vscale x 8 x i1> @fcmp_uge_vf_nxv8f16(<vscale x 8 x half> %va, half %b,
 ; ZVFHMIN-LABEL: fcmp_uge_vf_nxv8f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v10, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmflt.vv v8, v12, v16, v0.t
 ; ZVFHMIN-NEXT:    vmnot.m v0, v8
 ; ZVFHMIN-NEXT:    ret
@@ -3395,11 +3395,11 @@ define <vscale x 8 x i1> @fcmp_uge_vf_swap_nxv8f16(<vscale x 8 x half> %va, half
 ; ZVFHMIN-LABEL: fcmp_uge_vf_swap_nxv8f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v10, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmflt.vv v8, v16, v12, v0.t
 ; ZVFHMIN-NEXT:    vmnot.m v0, v8
 ; ZVFHMIN-NEXT:    ret
@@ -3419,10 +3419,10 @@ define <vscale x 8 x i1> @fcmp_ult_vv_nxv8f16(<vscale x 8 x half> %va, <vscale x
 ;
 ; ZVFHMIN-LABEL: fcmp_ult_vv_nxv8f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmfle.vv v8, v16, v12, v0.t
 ; ZVFHMIN-NEXT:    vmnot.m v0, v8
 ; ZVFHMIN-NEXT:    ret
@@ -3441,11 +3441,11 @@ define <vscale x 8 x i1> @fcmp_ult_vf_nxv8f16(<vscale x 8 x half> %va, half %b,
 ; ZVFHMIN-LABEL: fcmp_ult_vf_nxv8f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v10, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmfle.vv v8, v16, v12, v0.t
 ; ZVFHMIN-NEXT:    vmnot.m v0, v8
 ; ZVFHMIN-NEXT:    ret
@@ -3466,11 +3466,11 @@ define <vscale x 8 x i1> @fcmp_ult_vf_swap_nxv8f16(<vscale x 8 x half> %va, half
 ; ZVFHMIN-LABEL: fcmp_ult_vf_swap_nxv8f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v10, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmfle.vv v8, v12, v16, v0.t
 ; ZVFHMIN-NEXT:    vmnot.m v0, v8
 ; ZVFHMIN-NEXT:    ret
@@ -3490,10 +3490,10 @@ define <vscale x 8 x i1> @fcmp_ule_vv_nxv8f16(<vscale x 8 x half> %va, <vscale x
 ;
 ; ZVFHMIN-LABEL: fcmp_ule_vv_nxv8f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmflt.vv v8, v16, v12, v0.t
 ; ZVFHMIN-NEXT:    vmnot.m v0, v8
 ; ZVFHMIN-NEXT:    ret
@@ -3512,11 +3512,11 @@ define <vscale x 8 x i1> @fcmp_ule_vf_nxv8f16(<vscale x 8 x half> %va, half %b,
 ; ZVFHMIN-LABEL: fcmp_ule_vf_nxv8f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v10, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmflt.vv v8, v16, v12, v0.t
 ; ZVFHMIN-NEXT:    vmnot.m v0, v8
 ; ZVFHMIN-NEXT:    ret
@@ -3537,11 +3537,11 @@ define <vscale x 8 x i1> @fcmp_ule_vf_swap_nxv8f16(<vscale x 8 x half> %va, half
 ; ZVFHMIN-LABEL: fcmp_ule_vf_swap_nxv8f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v10, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmflt.vv v8, v12, v16, v0.t
 ; ZVFHMIN-NEXT:    vmnot.m v0, v8
 ; ZVFHMIN-NEXT:    ret
@@ -3561,10 +3561,10 @@ define <vscale x 8 x i1> @fcmp_une_vv_nxv8f16(<vscale x 8 x half> %va, <vscale x
 ;
 ; ZVFHMIN-LABEL: fcmp_une_vv_nxv8f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v10
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmfne.vv v8, v16, v12, v0.t
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v8
 ; ZVFHMIN-NEXT:    ret
@@ -3583,11 +3583,11 @@ define <vscale x 8 x i1> @fcmp_une_vf_nxv8f16(<vscale x 8 x half> %va, half %b,
 ; ZVFHMIN-LABEL: fcmp_une_vf_nxv8f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v10, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmfne.vv v8, v12, v16, v0.t
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v8
 ; ZVFHMIN-NEXT:    ret
@@ -3608,11 +3608,11 @@ define <vscale x 8 x i1> @fcmp_une_vf_swap_nxv8f16(<vscale x 8 x half> %va, half
 ; ZVFHMIN-LABEL: fcmp_une_vf_swap_nxv8f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v10, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmfne.vv v8, v16, v12, v0.t
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v8
 ; ZVFHMIN-NEXT:    ret
@@ -3633,13 +3633,13 @@ define <vscale x 8 x i1> @fcmp_uno_vv_nxv8f16(<vscale x 8 x half> %va, <vscale x
 ;
 ; ZVFHMIN-LABEL: fcmp_uno_vv_nxv8f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v10
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmfne.vv v10, v12, v12, v0.t
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmfne.vv v8, v12, v12, v0.t
 ; ZVFHMIN-NEXT:    vmor.mm v0, v8, v10
 ; ZVFHMIN-NEXT:    ret
@@ -3661,14 +3661,14 @@ define <vscale x 8 x i1> @fcmp_uno_vf_nxv8f16(<vscale x 8 x half> %va, half %b,
 ; ZVFHMIN-LABEL: fcmp_uno_vf_nxv8f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
 ; ZVFHMIN-NEXT:    vmv.v.x v8, a1
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmfne.vv v10, v12, v12, v0.t
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmfne.vv v8, v12, v12, v0.t
 ; ZVFHMIN-NEXT:    vmor.mm v0, v10, v8
 ; ZVFHMIN-NEXT:    ret
@@ -3692,14 +3692,14 @@ define <vscale x 8 x i1> @fcmp_uno_vf_swap_nxv8f16(<vscale x 8 x half> %va, half
 ; ZVFHMIN-LABEL: fcmp_uno_vf_swap_nxv8f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
 ; ZVFHMIN-NEXT:    vmv.v.x v8, a1
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmfne.vv v10, v12, v12, v0.t
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmfne.vv v8, v12, v12, v0.t
 ; ZVFHMIN-NEXT:    vmor.mm v0, v8, v10
 ; ZVFHMIN-NEXT:    ret
@@ -3829,14 +3829,14 @@ define <vscale x 64 x i1> @fcmp_oeq_vv_nxv64f16(<vscale x 64 x half> %va, <vscal
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
 ; ZVFHMIN-NEXT:    vs8r.v v0, (a0) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a7, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v4
 ; ZVFHMIN-NEXT:    addi a0, sp, 16
 ; ZVFHMIN-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    vmv8r.v v8, v16
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli t0, a0, 4
-; ZVFHMIN-NEXT:    add a0, t0, a0
+; ZVFHMIN-NEXT:    slli a7, a0, 4
+; ZVFHMIN-NEXT:    add a0, a7, a0
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
 ; ZVFHMIN-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
@@ -3844,7 +3844,7 @@ define <vscale x 64 x i1> @fcmp_oeq_vv_nxv64f16(<vscale x 64 x half> %va, <vscal
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v24
 ; ZVFHMIN-NEXT:    addi a0, sp, 16
 ; ZVFHMIN-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vsetvli zero, a7, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vmfeq.vv v6, v16, v8, v0.t
 ; ZVFHMIN-NEXT:    bltu a6, a4, .LBB171_2
 ; ZVFHMIN-NEXT:  # %bb.1:
@@ -3857,16 +3857,16 @@ define <vscale x 64 x i1> @fcmp_oeq_vv_nxv64f16(<vscale x 64 x half> %va, <vscal
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
 ; ZVFHMIN-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a6, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v16
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a7, a0, 4
-; ZVFHMIN-NEXT:    add a0, a7, a0
+; ZVFHMIN-NEXT:    slli a6, a0, 4
+; ZVFHMIN-NEXT:    add a0, a6, a0
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
 ; ZVFHMIN-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v16
-; ZVFHMIN-NEXT:    vsetvli zero, a6, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vmfeq.vv v5, v24, v8, v0.t
 ; ZVFHMIN-NEXT:    add a0, a3, a3
 ; ZVFHMIN-NEXT:    bltu a2, a5, .LBB171_4
@@ -3881,6 +3881,9 @@ define <vscale x 64 x i1> @fcmp_oeq_vv_nxv64f16(<vscale x 64 x half> %va, <vscal
 ; ZVFHMIN-NEXT:    vl1r.v v7, (a6) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli a6, zero, e8, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vslidedown.vx v0, v7, a3
+; ZVFHMIN-NEXT:    sltu a6, a2, a5
+; ZVFHMIN-NEXT:    addi a6, a6, -1
+; ZVFHMIN-NEXT:    and a5, a6, a5
 ; ZVFHMIN-NEXT:    csrr a6, vlenb
 ; ZVFHMIN-NEXT:    mv a7, a6
 ; ZVFHMIN-NEXT:    slli a6, a6, 3
@@ -3890,31 +3893,28 @@ define <vscale x 64 x i1> @fcmp_oeq_vv_nxv64f16(<vscale x 64 x half> %va, <vscal
 ; ZVFHMIN-NEXT:    add a6, sp, a6
 ; ZVFHMIN-NEXT:    addi a6, a6, 16
 ; ZVFHMIN-NEXT:    vl8r.v v16, (a6) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vsetvli a6, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a5, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v20
-; ZVFHMIN-NEXT:    csrr a6, vlenb
-; ZVFHMIN-NEXT:    slli a7, a6, 4
-; ZVFHMIN-NEXT:    add a6, a7, a6
-; ZVFHMIN-NEXT:    add a6, sp, a6
-; ZVFHMIN-NEXT:    addi a6, a6, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a6) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    csrr a6, vlenb
-; ZVFHMIN-NEXT:    slli a7, a6, 5
-; ZVFHMIN-NEXT:    add a6, a7, a6
-; ZVFHMIN-NEXT:    add a6, sp, a6
-; ZVFHMIN-NEXT:    addi a6, a6, 16
-; ZVFHMIN-NEXT:    vl8r.v v24, (a6) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    csrr a5, vlenb
+; ZVFHMIN-NEXT:    slli a6, a5, 4
+; ZVFHMIN-NEXT:    add a5, a6, a5
+; ZVFHMIN-NEXT:    add a5, sp, a5
+; ZVFHMIN-NEXT:    addi a5, a5, 16
+; ZVFHMIN-NEXT:    vs8r.v v8, (a5) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    csrr a5, vlenb
+; ZVFHMIN-NEXT:    slli a6, a5, 5
+; ZVFHMIN-NEXT:    add a5, a6, a5
+; ZVFHMIN-NEXT:    add a5, sp, a5
+; ZVFHMIN-NEXT:    addi a5, a5, 16
+; ZVFHMIN-NEXT:    vl8r.v v24, (a5) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v28
-; ZVFHMIN-NEXT:    sltu a6, a2, a5
-; ZVFHMIN-NEXT:    addi a6, a6, -1
-; ZVFHMIN-NEXT:    and a5, a6, a5
-; ZVFHMIN-NEXT:    csrr a6, vlenb
-; ZVFHMIN-NEXT:    slli a7, a6, 4
-; ZVFHMIN-NEXT:    add a6, a7, a6
-; ZVFHMIN-NEXT:    add a6, sp, a6
-; ZVFHMIN-NEXT:    addi a6, a6, 16
-; ZVFHMIN-NEXT:    vl8r.v v24, (a6) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vsetvli zero, a5, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    csrr a5, vlenb
+; ZVFHMIN-NEXT:    slli a6, a5, 4
+; ZVFHMIN-NEXT:    add a5, a6, a5
+; ZVFHMIN-NEXT:    add a5, sp, a5
+; ZVFHMIN-NEXT:    addi a5, a5, 16
+; ZVFHMIN-NEXT:    vl8r.v v24, (a5) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vmfeq.vv v4, v24, v8, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vslideup.vx v5, v6, a3
@@ -3923,16 +3923,16 @@ define <vscale x 64 x i1> @fcmp_oeq_vv_nxv64f16(<vscale x 64 x half> %va, <vscal
 ; ZVFHMIN-NEXT:    mv a2, a4
 ; ZVFHMIN-NEXT:  .LBB171_6:
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v7
-; ZVFHMIN-NEXT:    vsetvli a4, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v16
-; ZVFHMIN-NEXT:    csrr a4, vlenb
-; ZVFHMIN-NEXT:    slli a5, a4, 5
-; ZVFHMIN-NEXT:    add a4, a5, a4
-; ZVFHMIN-NEXT:    add a4, sp, a4
-; ZVFHMIN-NEXT:    addi a4, a4, 16
-; ZVFHMIN-NEXT:    vl8r.v v8, (a4) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    csrr a2, vlenb
+; ZVFHMIN-NEXT:    slli a4, a2, 5
+; ZVFHMIN-NEXT:    add a2, a4, a2
+; ZVFHMIN-NEXT:    add a2, sp, a2
+; ZVFHMIN-NEXT:    addi a2, a2, 16
+; ZVFHMIN-NEXT:    vl8r.v v8, (a2) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vmfeq.vv v8, v24, v16, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vslideup.vx v8, v4, a3
diff --git a/llvm/test/CodeGen/RISCV/rvv/setcc-int-vp.ll b/llvm/test/CodeGen/RISCV/rvv/setcc-int-vp.ll
index 61cc754..9c733b1 100644
--- a/llvm/test/CodeGen/RISCV/rvv/setcc-int-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/setcc-int-vp.ll
@@ -2367,9 +2367,8 @@ define <vscale x 1 x i1> @icmp_eq_vx_nxv1i64(<vscale x 1 x i64> %va, i64 %b, <vs
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
-; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vmseq.vv v0, v8, v9, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -2394,9 +2393,8 @@ define <vscale x 1 x i1> @icmp_eq_vx_swap_nxv1i64(<vscale x 1 x i64> %va, i64 %b
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
-; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vmseq.vv v0, v9, v8, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -2451,9 +2449,8 @@ define <vscale x 1 x i1> @icmp_ne_vx_nxv1i64(<vscale x 1 x i64> %va, i64 %b, <vs
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
-; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vmsne.vv v0, v8, v9, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -2478,9 +2475,8 @@ define <vscale x 1 x i1> @icmp_ne_vx_swap_nxv1i64(<vscale x 1 x i64> %va, i64 %b
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
-; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vmsne.vv v0, v9, v8, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -2535,9 +2531,8 @@ define <vscale x 1 x i1> @icmp_ugt_vx_nxv1i64(<vscale x 1 x i64> %va, i64 %b, <v
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
-; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vmsltu.vv v0, v9, v8, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -2562,9 +2557,8 @@ define <vscale x 1 x i1> @icmp_ugt_vx_swap_nxv1i64(<vscale x 1 x i64> %va, i64 %
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
-; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vmsltu.vv v0, v8, v9, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -2619,9 +2613,8 @@ define <vscale x 1 x i1> @icmp_uge_vx_nxv1i64(<vscale x 1 x i64> %va, i64 %b, <v
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
-; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vmsleu.vv v0, v9, v8, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -2647,9 +2640,8 @@ define <vscale x 1 x i1> @icmp_uge_vx_swap_nxv1i64(<vscale x 1 x i64> %va, i64 %
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
-; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vmsleu.vv v0, v8, v9, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -2704,9 +2696,8 @@ define <vscale x 1 x i1> @icmp_ult_vx_nxv1i64(<vscale x 1 x i64> %va, i64 %b, <v
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
-; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vmsltu.vv v0, v8, v9, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -2731,9 +2722,8 @@ define <vscale x 1 x i1> @icmp_ult_vx_swap_nxv1i64(<vscale x 1 x i64> %va, i64 %
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
-; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vmsltu.vv v0, v9, v8, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -2788,9 +2778,8 @@ define <vscale x 1 x i1> @icmp_sgt_vx_nxv1i64(<vscale x 1 x i64> %va, i64 %b, <v
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
-; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vmslt.vv v0, v9, v8, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -2815,9 +2804,8 @@ define <vscale x 1 x i1> @icmp_sgt_vx_swap_nxv1i64(<vscale x 1 x i64> %va, i64 %
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
-; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vmslt.vv v0, v8, v9, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -2872,9 +2860,8 @@ define <vscale x 1 x i1> @icmp_sge_vx_nxv1i64(<vscale x 1 x i64> %va, i64 %b, <v
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
-; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vmsle.vv v0, v9, v8, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -2900,9 +2887,8 @@ define <vscale x 1 x i1> @icmp_sge_vx_swap_nxv1i64(<vscale x 1 x i64> %va, i64 %
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
-; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vmsle.vv v0, v8, v9, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -2957,9 +2943,8 @@ define <vscale x 1 x i1> @icmp_slt_vx_nxv1i64(<vscale x 1 x i64> %va, i64 %b, <v
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
-; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vmslt.vv v0, v8, v9, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -2984,9 +2969,8 @@ define <vscale x 1 x i1> @icmp_slt_vx_swap_nxv1i64(<vscale x 1 x i64> %va, i64 %
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
-; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vmslt.vv v0, v9, v8, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -3041,9 +3025,8 @@ define <vscale x 1 x i1> @icmp_sle_vx_nxv1i64(<vscale x 1 x i64> %va, i64 %b, <v
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
-; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vmsle.vv v0, v8, v9, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -3068,9 +3051,8 @@ define <vscale x 1 x i1> @icmp_sle_vx_swap_nxv1i64(<vscale x 1 x i64> %va, i64 %
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
-; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vmsle.vv v0, v9, v8, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -3129,9 +3111,8 @@ define <vscale x 8 x i1> @icmp_eq_vx_nxv8i64(<vscale x 8 x i64> %va, i64 %b, <vs
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v24, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v24, (a0), zero
 ; RV32-NEXT:    vmseq.vv v16, v8, v24, v0.t
 ; RV32-NEXT:    vmv1r.v v0, v16
 ; RV32-NEXT:    addi sp, sp, 16
@@ -3158,9 +3139,8 @@ define <vscale x 8 x i1> @icmp_eq_vx_swap_nxv8i64(<vscale x 8 x i64> %va, i64 %b
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v24, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v24, (a0), zero
 ; RV32-NEXT:    vmseq.vv v16, v24, v8, v0.t
 ; RV32-NEXT:    vmv1r.v v0, v16
 ; RV32-NEXT:    addi sp, sp, 16
@@ -3220,9 +3200,8 @@ define <vscale x 8 x i1> @icmp_ne_vx_nxv8i64(<vscale x 8 x i64> %va, i64 %b, <vs
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v24, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v24, (a0), zero
 ; RV32-NEXT:    vmsne.vv v16, v8, v24, v0.t
 ; RV32-NEXT:    vmv1r.v v0, v16
 ; RV32-NEXT:    addi sp, sp, 16
@@ -3249,9 +3228,8 @@ define <vscale x 8 x i1> @icmp_ne_vx_swap_nxv8i64(<vscale x 8 x i64> %va, i64 %b
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v24, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v24, (a0), zero
 ; RV32-NEXT:    vmsne.vv v16, v24, v8, v0.t
 ; RV32-NEXT:    vmv1r.v v0, v16
 ; RV32-NEXT:    addi sp, sp, 16
@@ -3311,9 +3289,8 @@ define <vscale x 8 x i1> @icmp_ugt_vx_nxv8i64(<vscale x 8 x i64> %va, i64 %b, <v
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v24, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v24, (a0), zero
 ; RV32-NEXT:    vmsltu.vv v16, v24, v8, v0.t
 ; RV32-NEXT:    vmv1r.v v0, v16
 ; RV32-NEXT:    addi sp, sp, 16
@@ -3340,9 +3317,8 @@ define <vscale x 8 x i1> @icmp_ugt_vx_swap_nxv8i64(<vscale x 8 x i64> %va, i64 %
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v24, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v24, (a0), zero
 ; RV32-NEXT:    vmsltu.vv v16, v8, v24, v0.t
 ; RV32-NEXT:    vmv1r.v v0, v16
 ; RV32-NEXT:    addi sp, sp, 16
@@ -3402,9 +3378,8 @@ define <vscale x 8 x i1> @icmp_uge_vx_nxv8i64(<vscale x 8 x i64> %va, i64 %b, <v
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v24, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v24, (a0), zero
 ; RV32-NEXT:    vmsleu.vv v16, v24, v8, v0.t
 ; RV32-NEXT:    vmv1r.v v0, v16
 ; RV32-NEXT:    addi sp, sp, 16
@@ -3432,9 +3407,8 @@ define <vscale x 8 x i1> @icmp_uge_vx_swap_nxv8i64(<vscale x 8 x i64> %va, i64 %
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v24, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v24, (a0), zero
 ; RV32-NEXT:    vmsleu.vv v16, v8, v24, v0.t
 ; RV32-NEXT:    vmv1r.v v0, v16
 ; RV32-NEXT:    addi sp, sp, 16
@@ -3494,9 +3468,8 @@ define <vscale x 8 x i1> @icmp_ult_vx_nxv8i64(<vscale x 8 x i64> %va, i64 %b, <v
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v24, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v24, (a0), zero
 ; RV32-NEXT:    vmsltu.vv v16, v8, v24, v0.t
 ; RV32-NEXT:    vmv1r.v v0, v16
 ; RV32-NEXT:    addi sp, sp, 16
@@ -3523,9 +3496,8 @@ define <vscale x 8 x i1> @icmp_ult_vx_swap_nxv8i64(<vscale x 8 x i64> %va, i64 %
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v24, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v24, (a0), zero
 ; RV32-NEXT:    vmsltu.vv v16, v24, v8, v0.t
 ; RV32-NEXT:    vmv1r.v v0, v16
 ; RV32-NEXT:    addi sp, sp, 16
@@ -3585,9 +3557,8 @@ define <vscale x 8 x i1> @icmp_sgt_vx_nxv8i64(<vscale x 8 x i64> %va, i64 %b, <v
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v24, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v24, (a0), zero
 ; RV32-NEXT:    vmslt.vv v16, v24, v8, v0.t
 ; RV32-NEXT:    vmv1r.v v0, v16
 ; RV32-NEXT:    addi sp, sp, 16
@@ -3614,9 +3585,8 @@ define <vscale x 8 x i1> @icmp_sgt_vx_swap_nxv8i64(<vscale x 8 x i64> %va, i64 %
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v24, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v24, (a0), zero
 ; RV32-NEXT:    vmslt.vv v16, v8, v24, v0.t
 ; RV32-NEXT:    vmv1r.v v0, v16
 ; RV32-NEXT:    addi sp, sp, 16
@@ -3676,9 +3646,8 @@ define <vscale x 8 x i1> @icmp_sge_vx_nxv8i64(<vscale x 8 x i64> %va, i64 %b, <v
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v24, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v24, (a0), zero
 ; RV32-NEXT:    vmsle.vv v16, v24, v8, v0.t
 ; RV32-NEXT:    vmv1r.v v0, v16
 ; RV32-NEXT:    addi sp, sp, 16
@@ -3706,9 +3675,8 @@ define <vscale x 8 x i1> @icmp_sge_vx_swap_nxv8i64(<vscale x 8 x i64> %va, i64 %
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v24, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v24, (a0), zero
 ; RV32-NEXT:    vmsle.vv v16, v8, v24, v0.t
 ; RV32-NEXT:    vmv1r.v v0, v16
 ; RV32-NEXT:    addi sp, sp, 16
@@ -3768,9 +3736,8 @@ define <vscale x 8 x i1> @icmp_slt_vx_nxv8i64(<vscale x 8 x i64> %va, i64 %b, <v
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v24, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v24, (a0), zero
 ; RV32-NEXT:    vmslt.vv v16, v8, v24, v0.t
 ; RV32-NEXT:    vmv1r.v v0, v16
 ; RV32-NEXT:    addi sp, sp, 16
@@ -3797,9 +3764,8 @@ define <vscale x 8 x i1> @icmp_slt_vx_swap_nxv8i64(<vscale x 8 x i64> %va, i64 %
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v24, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v24, (a0), zero
 ; RV32-NEXT:    vmslt.vv v16, v24, v8, v0.t
 ; RV32-NEXT:    vmv1r.v v0, v16
 ; RV32-NEXT:    addi sp, sp, 16
@@ -3859,9 +3825,8 @@ define <vscale x 8 x i1> @icmp_sle_vx_nxv8i64(<vscale x 8 x i64> %va, i64 %b, <v
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v24, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v24, (a0), zero
 ; RV32-NEXT:    vmsle.vv v16, v8, v24, v0.t
 ; RV32-NEXT:    vmv1r.v v0, v16
 ; RV32-NEXT:    addi sp, sp, 16
@@ -3888,9 +3853,8 @@ define <vscale x 8 x i1> @icmp_sle_vx_swap_nxv8i64(<vscale x 8 x i64> %va, i64 %
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v24, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v24, (a0), zero
 ; RV32-NEXT:    vmsle.vv v16, v24, v8, v0.t
 ; RV32-NEXT:    vmv1r.v v0, v16
 ; RV32-NEXT:    addi sp, sp, 16
diff --git a/llvm/test/CodeGen/RISCV/rvv/vadd-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vadd-vp.ll
index fee6799..77f3cf3 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vadd-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vadd-vp.ll
@@ -1487,9 +1487,8 @@ define <vscale x 1 x i64> @vadd_vx_nxv1i64(<vscale x 1 x i64> %va, i64 %b, <vsca
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
-; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vadd.vv v8, v8, v9, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1514,9 +1513,8 @@ define <vscale x 1 x i64> @vadd_vx_nxv1i64_unmasked(<vscale x 1 x i64> %va, i64
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
-; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vadd.vv v8, v8, v9
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1583,9 +1581,8 @@ define <vscale x 2 x i64> @vadd_vx_nxv2i64(<vscale x 2 x i64> %va, i64 %b, <vsca
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
-; RV32-NEXT:    vlse64.v v10, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m2, ta, ma
+; RV32-NEXT:    vlse64.v v10, (a0), zero
 ; RV32-NEXT:    vadd.vv v8, v8, v10, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1610,9 +1607,8 @@ define <vscale x 2 x i64> @vadd_vx_nxv2i64_unmasked(<vscale x 2 x i64> %va, i64
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
-; RV32-NEXT:    vlse64.v v10, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m2, ta, ma
+; RV32-NEXT:    vlse64.v v10, (a0), zero
 ; RV32-NEXT:    vadd.vv v8, v8, v10
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1679,9 +1675,8 @@ define <vscale x 4 x i64> @vadd_vx_nxv4i64(<vscale x 4 x i64> %va, i64 %b, <vsca
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
-; RV32-NEXT:    vlse64.v v12, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m4, ta, ma
+; RV32-NEXT:    vlse64.v v12, (a0), zero
 ; RV32-NEXT:    vadd.vv v8, v8, v12, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1706,9 +1701,8 @@ define <vscale x 4 x i64> @vadd_vx_nxv4i64_unmasked(<vscale x 4 x i64> %va, i64
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
-; RV32-NEXT:    vlse64.v v12, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m4, ta, ma
+; RV32-NEXT:    vlse64.v v12, (a0), zero
 ; RV32-NEXT:    vadd.vv v8, v8, v12
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1775,9 +1769,8 @@ define <vscale x 8 x i64> @vadd_vx_nxv8i64(<vscale x 8 x i64> %va, i64 %b, <vsca
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a0), zero
 ; RV32-NEXT:    vadd.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1802,9 +1795,8 @@ define <vscale x 8 x i64> @vadd_vx_nxv8i64_unmasked(<vscale x 8 x i64> %va, i64
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a0), zero
 ; RV32-NEXT:    vadd.vv v8, v8, v16
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
diff --git a/llvm/test/CodeGen/RISCV/rvv/vand-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vand-vp.ll
index b0c5a72..4866bb0 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vand-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vand-vp.ll
@@ -1314,9 +1314,8 @@ define <vscale x 1 x i64> @vand_vx_nxv1i64(<vscale x 1 x i64> %va, i64 %b, <vsca
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
-; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vand.vv v8, v8, v9, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1341,9 +1340,8 @@ define <vscale x 1 x i64> @vand_vx_nxv1i64_unmasked(<vscale x 1 x i64> %va, i64
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
-; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vand.vv v8, v8, v9
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1410,9 +1408,8 @@ define <vscale x 2 x i64> @vand_vx_nxv2i64(<vscale x 2 x i64> %va, i64 %b, <vsca
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
-; RV32-NEXT:    vlse64.v v10, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m2, ta, ma
+; RV32-NEXT:    vlse64.v v10, (a0), zero
 ; RV32-NEXT:    vand.vv v8, v8, v10, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1437,9 +1434,8 @@ define <vscale x 2 x i64> @vand_vx_nxv2i64_unmasked(<vscale x 2 x i64> %va, i64
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
-; RV32-NEXT:    vlse64.v v10, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m2, ta, ma
+; RV32-NEXT:    vlse64.v v10, (a0), zero
 ; RV32-NEXT:    vand.vv v8, v8, v10
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1506,9 +1502,8 @@ define <vscale x 4 x i64> @vand_vx_nxv4i64(<vscale x 4 x i64> %va, i64 %b, <vsca
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
-; RV32-NEXT:    vlse64.v v12, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m4, ta, ma
+; RV32-NEXT:    vlse64.v v12, (a0), zero
 ; RV32-NEXT:    vand.vv v8, v8, v12, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1533,9 +1528,8 @@ define <vscale x 4 x i64> @vand_vx_nxv4i64_unmasked(<vscale x 4 x i64> %va, i64
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
-; RV32-NEXT:    vlse64.v v12, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m4, ta, ma
+; RV32-NEXT:    vlse64.v v12, (a0), zero
 ; RV32-NEXT:    vand.vv v8, v8, v12
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1602,9 +1596,8 @@ define <vscale x 8 x i64> @vand_vx_nxv8i64(<vscale x 8 x i64> %va, i64 %b, <vsca
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a0), zero
 ; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1629,9 +1622,8 @@ define <vscale x 8 x i64> @vand_vx_nxv8i64_unmasked(<vscale x 8 x i64> %va, i64
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a0), zero
 ; RV32-NEXT:    vand.vv v8, v8, v16
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
diff --git a/llvm/test/CodeGen/RISCV/rvv/vandn-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vandn-vp.ll
index 32992301..763b290 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vandn-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vandn-vp.ll
@@ -1115,9 +1115,8 @@ define <vscale x 1 x i64> @vandn_vx_vp_nxv1i64(i64 %a, <vscale x 1 x i64> %b, <v
 ; CHECK-RV32-NEXT:    sw a0, 8(sp)
 ; CHECK-RV32-NEXT:    sw a1, 12(sp)
 ; CHECK-RV32-NEXT:    addi a0, sp, 8
-; CHECK-RV32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
-; CHECK-RV32-NEXT:    vlse64.v v9, (a0), zero
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
+; CHECK-RV32-NEXT:    vlse64.v v9, (a0), zero
 ; CHECK-RV32-NEXT:    vand.vv v8, v8, v9, v0.t
 ; CHECK-RV32-NEXT:    addi sp, sp, 16
 ; CHECK-RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1139,9 +1138,8 @@ define <vscale x 1 x i64> @vandn_vx_vp_nxv1i64(i64 %a, <vscale x 1 x i64> %b, <v
 ; CHECK-ZVKB32-NEXT:    sw a0, 8(sp)
 ; CHECK-ZVKB32-NEXT:    sw a1, 12(sp)
 ; CHECK-ZVKB32-NEXT:    addi a0, sp, 8
-; CHECK-ZVKB32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
-; CHECK-ZVKB32-NEXT:    vlse64.v v9, (a0), zero
 ; CHECK-ZVKB32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
+; CHECK-ZVKB32-NEXT:    vlse64.v v9, (a0), zero
 ; CHECK-ZVKB32-NEXT:    vand.vv v8, v8, v9, v0.t
 ; CHECK-ZVKB32-NEXT:    addi sp, sp, 16
 ; CHECK-ZVKB32-NEXT:    .cfi_def_cfa_offset 0
@@ -1208,9 +1206,8 @@ define <vscale x 2 x i64> @vandn_vx_vp_nxv2i64(i64 %a, <vscale x 2 x i64> %b, <v
 ; CHECK-RV32-NEXT:    sw a0, 8(sp)
 ; CHECK-RV32-NEXT:    sw a1, 12(sp)
 ; CHECK-RV32-NEXT:    addi a0, sp, 8
-; CHECK-RV32-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
-; CHECK-RV32-NEXT:    vlse64.v v10, (a0), zero
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e64, m2, ta, ma
+; CHECK-RV32-NEXT:    vlse64.v v10, (a0), zero
 ; CHECK-RV32-NEXT:    vand.vv v8, v8, v10, v0.t
 ; CHECK-RV32-NEXT:    addi sp, sp, 16
 ; CHECK-RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1232,9 +1229,8 @@ define <vscale x 2 x i64> @vandn_vx_vp_nxv2i64(i64 %a, <vscale x 2 x i64> %b, <v
 ; CHECK-ZVKB32-NEXT:    sw a0, 8(sp)
 ; CHECK-ZVKB32-NEXT:    sw a1, 12(sp)
 ; CHECK-ZVKB32-NEXT:    addi a0, sp, 8
-; CHECK-ZVKB32-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
-; CHECK-ZVKB32-NEXT:    vlse64.v v10, (a0), zero
 ; CHECK-ZVKB32-NEXT:    vsetvli zero, a2, e64, m2, ta, ma
+; CHECK-ZVKB32-NEXT:    vlse64.v v10, (a0), zero
 ; CHECK-ZVKB32-NEXT:    vand.vv v8, v8, v10, v0.t
 ; CHECK-ZVKB32-NEXT:    addi sp, sp, 16
 ; CHECK-ZVKB32-NEXT:    .cfi_def_cfa_offset 0
@@ -1301,9 +1297,8 @@ define <vscale x 4 x i64> @vandn_vx_vp_nxv4i64(i64 %a, <vscale x 4 x i64> %b, <v
 ; CHECK-RV32-NEXT:    sw a0, 8(sp)
 ; CHECK-RV32-NEXT:    sw a1, 12(sp)
 ; CHECK-RV32-NEXT:    addi a0, sp, 8
-; CHECK-RV32-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
-; CHECK-RV32-NEXT:    vlse64.v v12, (a0), zero
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e64, m4, ta, ma
+; CHECK-RV32-NEXT:    vlse64.v v12, (a0), zero
 ; CHECK-RV32-NEXT:    vand.vv v8, v8, v12, v0.t
 ; CHECK-RV32-NEXT:    addi sp, sp, 16
 ; CHECK-RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1325,9 +1320,8 @@ define <vscale x 4 x i64> @vandn_vx_vp_nxv4i64(i64 %a, <vscale x 4 x i64> %b, <v
 ; CHECK-ZVKB32-NEXT:    sw a0, 8(sp)
 ; CHECK-ZVKB32-NEXT:    sw a1, 12(sp)
 ; CHECK-ZVKB32-NEXT:    addi a0, sp, 8
-; CHECK-ZVKB32-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
-; CHECK-ZVKB32-NEXT:    vlse64.v v12, (a0), zero
 ; CHECK-ZVKB32-NEXT:    vsetvli zero, a2, e64, m4, ta, ma
+; CHECK-ZVKB32-NEXT:    vlse64.v v12, (a0), zero
 ; CHECK-ZVKB32-NEXT:    vand.vv v8, v8, v12, v0.t
 ; CHECK-ZVKB32-NEXT:    addi sp, sp, 16
 ; CHECK-ZVKB32-NEXT:    .cfi_def_cfa_offset 0
@@ -1394,9 +1388,8 @@ define <vscale x 8 x i64> @vandn_vx_vp_nxv8i64(i64 %a, <vscale x 8 x i64> %b, <v
 ; CHECK-RV32-NEXT:    sw a0, 8(sp)
 ; CHECK-RV32-NEXT:    sw a1, 12(sp)
 ; CHECK-RV32-NEXT:    addi a0, sp, 8
-; CHECK-RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
-; CHECK-RV32-NEXT:    vlse64.v v16, (a0), zero
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; CHECK-RV32-NEXT:    vlse64.v v16, (a0), zero
 ; CHECK-RV32-NEXT:    vand.vv v8, v8, v16, v0.t
 ; CHECK-RV32-NEXT:    addi sp, sp, 16
 ; CHECK-RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1418,9 +1411,8 @@ define <vscale x 8 x i64> @vandn_vx_vp_nxv8i64(i64 %a, <vscale x 8 x i64> %b, <v
 ; CHECK-ZVKB32-NEXT:    sw a0, 8(sp)
 ; CHECK-ZVKB32-NEXT:    sw a1, 12(sp)
 ; CHECK-ZVKB32-NEXT:    addi a0, sp, 8
-; CHECK-ZVKB32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
-; CHECK-ZVKB32-NEXT:    vlse64.v v16, (a0), zero
 ; CHECK-ZVKB32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; CHECK-ZVKB32-NEXT:    vlse64.v v16, (a0), zero
 ; CHECK-ZVKB32-NEXT:    vand.vv v8, v8, v16, v0.t
 ; CHECK-ZVKB32-NEXT:    addi sp, sp, 16
 ; CHECK-ZVKB32-NEXT:    .cfi_def_cfa_offset 0
diff --git a/llvm/test/CodeGen/RISCV/rvv/vdiv-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vdiv-vp.ll
index 2814be2..03e4e1f 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vdiv-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vdiv-vp.ll
@@ -893,9 +893,8 @@ define <vscale x 1 x i64> @vdiv_vx_nxv1i64(<vscale x 1 x i64> %va, i64 %b, <vsca
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
-; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vdiv.vv v8, v8, v9, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -920,9 +919,8 @@ define <vscale x 1 x i64> @vdiv_vx_nxv1i64_unmasked(<vscale x 1 x i64> %va, i64
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
-; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vdiv.vv v8, v8, v9
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -969,9 +967,8 @@ define <vscale x 2 x i64> @vdiv_vx_nxv2i64(<vscale x 2 x i64> %va, i64 %b, <vsca
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
-; RV32-NEXT:    vlse64.v v10, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m2, ta, ma
+; RV32-NEXT:    vlse64.v v10, (a0), zero
 ; RV32-NEXT:    vdiv.vv v8, v8, v10, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -996,9 +993,8 @@ define <vscale x 2 x i64> @vdiv_vx_nxv2i64_unmasked(<vscale x 2 x i64> %va, i64
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
-; RV32-NEXT:    vlse64.v v10, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m2, ta, ma
+; RV32-NEXT:    vlse64.v v10, (a0), zero
 ; RV32-NEXT:    vdiv.vv v8, v8, v10
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1045,9 +1041,8 @@ define <vscale x 4 x i64> @vdiv_vx_nxv4i64(<vscale x 4 x i64> %va, i64 %b, <vsca
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
-; RV32-NEXT:    vlse64.v v12, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m4, ta, ma
+; RV32-NEXT:    vlse64.v v12, (a0), zero
 ; RV32-NEXT:    vdiv.vv v8, v8, v12, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1072,9 +1067,8 @@ define <vscale x 4 x i64> @vdiv_vx_nxv4i64_unmasked(<vscale x 4 x i64> %va, i64
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
-; RV32-NEXT:    vlse64.v v12, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m4, ta, ma
+; RV32-NEXT:    vlse64.v v12, (a0), zero
 ; RV32-NEXT:    vdiv.vv v8, v8, v12
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1121,9 +1115,8 @@ define <vscale x 8 x i64> @vdiv_vx_nxv8i64(<vscale x 8 x i64> %va, i64 %b, <vsca
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a0), zero
 ; RV32-NEXT:    vdiv.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1148,9 +1141,8 @@ define <vscale x 8 x i64> @vdiv_vx_nxv8i64_unmasked(<vscale x 8 x i64> %va, i64
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a0), zero
 ; RV32-NEXT:    vdiv.vv v8, v8, v16
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
diff --git a/llvm/test/CodeGen/RISCV/rvv/vdivu-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vdivu-vp.ll
index 3e913d4..2f35f91 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vdivu-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vdivu-vp.ll
@@ -892,9 +892,8 @@ define <vscale x 1 x i64> @vdivu_vx_nxv1i64(<vscale x 1 x i64> %va, i64 %b, <vsc
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
-; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vdivu.vv v8, v8, v9, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -919,9 +918,8 @@ define <vscale x 1 x i64> @vdivu_vx_nxv1i64_unmasked(<vscale x 1 x i64> %va, i64
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
-; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vdivu.vv v8, v8, v9
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -968,9 +966,8 @@ define <vscale x 2 x i64> @vdivu_vx_nxv2i64(<vscale x 2 x i64> %va, i64 %b, <vsc
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
-; RV32-NEXT:    vlse64.v v10, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m2, ta, ma
+; RV32-NEXT:    vlse64.v v10, (a0), zero
 ; RV32-NEXT:    vdivu.vv v8, v8, v10, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -995,9 +992,8 @@ define <vscale x 2 x i64> @vdivu_vx_nxv2i64_unmasked(<vscale x 2 x i64> %va, i64
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
-; RV32-NEXT:    vlse64.v v10, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m2, ta, ma
+; RV32-NEXT:    vlse64.v v10, (a0), zero
 ; RV32-NEXT:    vdivu.vv v8, v8, v10
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1044,9 +1040,8 @@ define <vscale x 4 x i64> @vdivu_vx_nxv4i64(<vscale x 4 x i64> %va, i64 %b, <vsc
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
-; RV32-NEXT:    vlse64.v v12, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m4, ta, ma
+; RV32-NEXT:    vlse64.v v12, (a0), zero
 ; RV32-NEXT:    vdivu.vv v8, v8, v12, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1071,9 +1066,8 @@ define <vscale x 4 x i64> @vdivu_vx_nxv4i64_unmasked(<vscale x 4 x i64> %va, i64
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
-; RV32-NEXT:    vlse64.v v12, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m4, ta, ma
+; RV32-NEXT:    vlse64.v v12, (a0), zero
 ; RV32-NEXT:    vdivu.vv v8, v8, v12
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1120,9 +1114,8 @@ define <vscale x 8 x i64> @vdivu_vx_nxv8i64(<vscale x 8 x i64> %va, i64 %b, <vsc
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a0), zero
 ; RV32-NEXT:    vdivu.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1147,9 +1140,8 @@ define <vscale x 8 x i64> @vdivu_vx_nxv8i64_unmasked(<vscale x 8 x i64> %va, i64
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a0), zero
 ; RV32-NEXT:    vdivu.vv v8, v8, v16
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfadd-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfadd-vp.ll
index 87bc9f2..31359c3 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfadd-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfadd-vp.ll
@@ -679,10 +679,10 @@ define <vscale x 1 x half> @vfadd_vv_nxv1f16(<vscale x 1 x half> %va, <vscale x
 ;
 ; ZVFHMIN-LABEL: vfadd_vv_nxv1f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfadd.vv v9, v9, v10, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v9
@@ -700,10 +700,10 @@ define <vscale x 1 x half> @vfadd_vv_nxv1f16_unmasked(<vscale x 1 x half> %va, <
 ;
 ; ZVFHMIN-LABEL: vfadd_vv_nxv1f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfadd.vv v9, v9, v10
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v9
@@ -722,11 +722,11 @@ define <vscale x 1 x half> @vfadd_vf_nxv1f16(<vscale x 1 x half> %va, half %b, <
 ; ZVFHMIN-LABEL: vfadd_vf_nxv1f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v9, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v9
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfadd.vv v9, v10, v8, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v9
@@ -747,11 +747,11 @@ define <vscale x 1 x half> @vfadd_vf_nxv1f16_commute(<vscale x 1 x half> %va, ha
 ; ZVFHMIN-LABEL: vfadd_vf_nxv1f16_commute:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v9, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v9
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfadd.vv v9, v8, v10, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v9
@@ -772,11 +772,11 @@ define <vscale x 1 x half> @vfadd_vf_nxv1f16_unmasked(<vscale x 1 x half> %va, h
 ; ZVFHMIN-LABEL: vfadd_vf_nxv1f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v9, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v9
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfadd.vv v9, v10, v8
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v9
@@ -797,11 +797,11 @@ define <vscale x 1 x half> @vfadd_vf_nxv1f16_unmasked_commute(<vscale x 1 x half
 ; ZVFHMIN-LABEL: vfadd_vf_nxv1f16_unmasked_commute:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v9, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v9
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfadd.vv v9, v8, v10
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v9
@@ -823,10 +823,10 @@ define <vscale x 2 x half> @vfadd_vv_nxv2f16(<vscale x 2 x half> %va, <vscale x
 ;
 ; ZVFHMIN-LABEL: vfadd_vv_nxv2f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfadd.vv v9, v9, v10, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v9
@@ -844,10 +844,10 @@ define <vscale x 2 x half> @vfadd_vv_nxv2f16_unmasked(<vscale x 2 x half> %va, <
 ;
 ; ZVFHMIN-LABEL: vfadd_vv_nxv2f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfadd.vv v9, v9, v10
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v9
@@ -866,11 +866,11 @@ define <vscale x 2 x half> @vfadd_vf_nxv2f16(<vscale x 2 x half> %va, half %b, <
 ; ZVFHMIN-LABEL: vfadd_vf_nxv2f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v9, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v9
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfadd.vv v9, v10, v8, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v9
@@ -891,11 +891,11 @@ define <vscale x 2 x half> @vfadd_vf_nxv2f16_unmasked(<vscale x 2 x half> %va, h
 ; ZVFHMIN-LABEL: vfadd_vf_nxv2f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v9, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v9
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfadd.vv v9, v10, v8
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v9
@@ -917,10 +917,10 @@ define <vscale x 4 x half> @vfadd_vv_nxv4f16(<vscale x 4 x half> %va, <vscale x
 ;
 ; ZVFHMIN-LABEL: vfadd_vv_nxv4f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfadd.vv v10, v12, v10, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v10
@@ -938,10 +938,10 @@ define <vscale x 4 x half> @vfadd_vv_nxv4f16_unmasked(<vscale x 4 x half> %va, <
 ;
 ; ZVFHMIN-LABEL: vfadd_vv_nxv4f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfadd.vv v10, v12, v10
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v10
@@ -960,11 +960,11 @@ define <vscale x 4 x half> @vfadd_vf_nxv4f16(<vscale x 4 x half> %va, half %b, <
 ; ZVFHMIN-LABEL: vfadd_vf_nxv4f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m1, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v9, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v9
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfadd.vv v10, v10, v12, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v10
@@ -985,11 +985,11 @@ define <vscale x 4 x half> @vfadd_vf_nxv4f16_unmasked(<vscale x 4 x half> %va, h
 ; ZVFHMIN-LABEL: vfadd_vf_nxv4f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m1, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v9, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v9
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfadd.vv v10, v10, v12
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v10
@@ -1011,10 +1011,10 @@ define <vscale x 8 x half> @vfadd_vv_nxv8f16(<vscale x 8 x half> %va, <vscale x
 ;
 ; ZVFHMIN-LABEL: vfadd_vv_nxv8f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v10
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfadd.vv v12, v16, v12, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v12
@@ -1032,10 +1032,10 @@ define <vscale x 8 x half> @vfadd_vv_nxv8f16_unmasked(<vscale x 8 x half> %va, <
 ;
 ; ZVFHMIN-LABEL: vfadd_vv_nxv8f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v10
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfadd.vv v12, v16, v12
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v12
@@ -1054,11 +1054,11 @@ define <vscale x 8 x half> @vfadd_vf_nxv8f16(<vscale x 8 x half> %va, half %b, <
 ; ZVFHMIN-LABEL: vfadd_vf_nxv8f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v10, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfadd.vv v12, v12, v16, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v12
@@ -1079,11 +1079,11 @@ define <vscale x 8 x half> @vfadd_vf_nxv8f16_unmasked(<vscale x 8 x half> %va, h
 ; ZVFHMIN-LABEL: vfadd_vf_nxv8f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v10, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfadd.vv v12, v12, v16
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v12
@@ -1105,10 +1105,10 @@ define <vscale x 16 x half> @vfadd_vv_nxv16f16(<vscale x 16 x half> %va, <vscale
 ;
 ; ZVFHMIN-LABEL: vfadd_vv_nxv16f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfadd.vv v16, v24, v16, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
@@ -1126,10 +1126,10 @@ define <vscale x 16 x half> @vfadd_vv_nxv16f16_unmasked(<vscale x 16 x half> %va
 ;
 ; ZVFHMIN-LABEL: vfadd_vv_nxv16f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfadd.vv v16, v24, v16
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
@@ -1148,11 +1148,11 @@ define <vscale x 16 x half> @vfadd_vf_nxv16f16(<vscale x 16 x half> %va, half %b
 ; ZVFHMIN-LABEL: vfadd_vf_nxv16f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v12, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfadd.vv v16, v16, v24, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
@@ -1173,11 +1173,11 @@ define <vscale x 16 x half> @vfadd_vf_nxv16f16_unmasked(<vscale x 16 x half> %va
 ; ZVFHMIN-LABEL: vfadd_vf_nxv16f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v12, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfadd.vv v16, v16, v24
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
@@ -1205,23 +1205,22 @@ define <vscale x 32 x half> @vfadd_vv_nxv32f16(<vscale x 32 x half> %va, <vscale
 ; ZVFHMIN-NEXT:    slli a1, a1, 3
 ; ZVFHMIN-NEXT:    sub sp, sp, a1
 ; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli a1, zero, e8, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmv1r.v v7, v0
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    addi a1, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20
 ; ZVFHMIN-NEXT:    slli a1, a2, 1
 ; ZVFHMIN-NEXT:    srli a2, a2, 2
 ; ZVFHMIN-NEXT:    sub a3, a0, a1
-; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vslidedown.vx v0, v0, a2
 ; ZVFHMIN-NEXT:    sltu a2, a0, a3
 ; ZVFHMIN-NEXT:    addi a2, a2, -1
 ; ZVFHMIN-NEXT:    and a2, a2, a3
-; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    addi a3, sp, 16
+; ZVFHMIN-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
-; ZVFHMIN-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfadd.vv v16, v16, v24, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v16
@@ -1231,10 +1230,11 @@ define <vscale x 32 x half> @vfadd_vv_nxv32f16(<vscale x 32 x half> %va, <vscale
 ; ZVFHMIN-NEXT:  .LBB48_2:
 ; ZVFHMIN-NEXT:    addi a1, sp, 16
 ; ZVFHMIN-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v24
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v8
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v7
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfadd.vv v16, v24, v16, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
@@ -1266,22 +1266,21 @@ define <vscale x 32 x half> @vfadd_vv_nxv32f16_unmasked(<vscale x 32 x half> %va
 ; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e8, m4, ta, ma
-; ZVFHMIN-NEXT:    vmset.m v7
-; ZVFHMIN-NEXT:    addi a1, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20
+; ZVFHMIN-NEXT:    vmset.m v24
 ; ZVFHMIN-NEXT:    slli a1, a2, 1
 ; ZVFHMIN-NEXT:    srli a2, a2, 2
 ; ZVFHMIN-NEXT:    sub a3, a0, a1
 ; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT:    vslidedown.vx v0, v7, a2
+; ZVFHMIN-NEXT:    vslidedown.vx v0, v24, a2
 ; ZVFHMIN-NEXT:    sltu a2, a0, a3
 ; ZVFHMIN-NEXT:    addi a2, a2, -1
 ; ZVFHMIN-NEXT:    and a2, a2, a3
-; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    addi a3, sp, 16
+; ZVFHMIN-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
-; ZVFHMIN-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfadd.vv v16, v16, v24, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v16
@@ -1291,9 +1290,10 @@ define <vscale x 32 x half> @vfadd_vv_nxv32f16_unmasked(<vscale x 32 x half> %va
 ; ZVFHMIN-NEXT:  .LBB49_2:
 ; ZVFHMIN-NEXT:    addi a1, sp, 16
 ; ZVFHMIN-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v24
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfadd.vv v16, v24, v16
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
@@ -1324,14 +1324,10 @@ define <vscale x 32 x half> @vfadd_vf_nxv32f16(<vscale x 32 x half> %va, half %b
 ; ZVFHMIN-NEXT:    add a1, a2, a1
 ; ZVFHMIN-NEXT:    sub sp, sp, a1
 ; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x11, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 17 * vlenb
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vmv8r.v v16, v8
+; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m8, ta, ma
+; ZVFHMIN-NEXT:    vmv8r.v v24, v8
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    addi a3, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v20
-; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m8, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v16, a1
 ; ZVFHMIN-NEXT:    csrr a1, vlenb
 ; ZVFHMIN-NEXT:    slli a3, a1, 3
@@ -1352,15 +1348,18 @@ define <vscale x 32 x half> @vfadd_vf_nxv32f16(<vscale x 32 x half> %va, half %b
 ; ZVFHMIN-NEXT:    sltu a2, a0, a3
 ; ZVFHMIN-NEXT:    addi a2, a2, -1
 ; ZVFHMIN-NEXT:    and a2, a2, a3
-; ZVFHMIN-NEXT:    csrr a3, vlenb
-; ZVFHMIN-NEXT:    slli a4, a3, 3
-; ZVFHMIN-NEXT:    add a3, a4, a3
-; ZVFHMIN-NEXT:    add a3, sp, a3
-; ZVFHMIN-NEXT:    addi a3, a3, 16
-; ZVFHMIN-NEXT:    vl8r.v v24, (a3) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    addi a3, sp, 16
+; ZVFHMIN-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v28
+; ZVFHMIN-NEXT:    csrr a2, vlenb
+; ZVFHMIN-NEXT:    slli a3, a2, 3
+; ZVFHMIN-NEXT:    add a2, a3, a2
+; ZVFHMIN-NEXT:    add a2, sp, a2
+; ZVFHMIN-NEXT:    addi a2, a2, 16
+; ZVFHMIN-NEXT:    vl8r.v v24, (a2) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v28
-; ZVFHMIN-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfadd.vv v16, v8, v16, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v16
@@ -1370,20 +1369,21 @@ define <vscale x 32 x half> @vfadd_vf_nxv32f16(<vscale x 32 x half> %va, half %b
 ; ZVFHMIN-NEXT:  .LBB50_2:
 ; ZVFHMIN-NEXT:    addi a1, sp, 16
 ; ZVFHMIN-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v24
-; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a2, a1, 3
-; ZVFHMIN-NEXT:    add a1, a2, a1
-; ZVFHMIN-NEXT:    add a1, sp, a1
-; ZVFHMIN-NEXT:    addi a1, a1, 16
-; ZVFHMIN-NEXT:    vl8r.v v0, (a1) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    csrr a0, vlenb
+; ZVFHMIN-NEXT:    slli a1, a0, 3
+; ZVFHMIN-NEXT:    add a0, a1, a0
+; ZVFHMIN-NEXT:    add a0, sp, a0
+; ZVFHMIN-NEXT:    addi a0, a0, 16
+; ZVFHMIN-NEXT:    vl8r.v v0, (a0) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v0
-; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 3
-; ZVFHMIN-NEXT:    add a1, sp, a1
-; ZVFHMIN-NEXT:    addi a1, a1, 16
-; ZVFHMIN-NEXT:    vl1r.v v0, (a1) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    csrr a0, vlenb
+; ZVFHMIN-NEXT:    slli a0, a0, 3
+; ZVFHMIN-NEXT:    add a0, sp, a0
+; ZVFHMIN-NEXT:    addi a0, a0, 16
+; ZVFHMIN-NEXT:    vl1r.v v0, (a0) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfadd.vv v16, v16, v24, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
@@ -1416,16 +1416,10 @@ define <vscale x 32 x half> @vfadd_vf_nxv32f16_unmasked(<vscale x 32 x half> %va
 ; ZVFHMIN-NEXT:    slli a1, a1, 4
 ; ZVFHMIN-NEXT:    sub sp, sp, a1
 ; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e8, m4, ta, ma
-; ZVFHMIN-NEXT:    vmv8r.v v16, v8
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    vmset.m v7
-; ZVFHMIN-NEXT:    addi a3, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v20
 ; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m8, ta, ma
+; ZVFHMIN-NEXT:    vmset.m v24
 ; ZVFHMIN-NEXT:    vmv.v.x v16, a1
 ; ZVFHMIN-NEXT:    csrr a1, vlenb
 ; ZVFHMIN-NEXT:    slli a1, a1, 3
@@ -1436,18 +1430,22 @@ define <vscale x 32 x half> @vfadd_vf_nxv32f16_unmasked(<vscale x 32 x half> %va
 ; ZVFHMIN-NEXT:    srli a2, a2, 2
 ; ZVFHMIN-NEXT:    sub a3, a0, a1
 ; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT:    vslidedown.vx v0, v7, a2
+; ZVFHMIN-NEXT:    vslidedown.vx v0, v24, a2
 ; ZVFHMIN-NEXT:    sltu a2, a0, a3
 ; ZVFHMIN-NEXT:    addi a2, a2, -1
 ; ZVFHMIN-NEXT:    and a2, a2, a3
-; ZVFHMIN-NEXT:    csrr a3, vlenb
-; ZVFHMIN-NEXT:    slli a3, a3, 3
-; ZVFHMIN-NEXT:    add a3, sp, a3
-; ZVFHMIN-NEXT:    addi a3, a3, 16
-; ZVFHMIN-NEXT:    vl8r.v v24, (a3) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vmv8r.v v16, v8
+; ZVFHMIN-NEXT:    addi a3, sp, 16
+; ZVFHMIN-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v20
+; ZVFHMIN-NEXT:    csrr a2, vlenb
+; ZVFHMIN-NEXT:    slli a2, a2, 3
+; ZVFHMIN-NEXT:    add a2, sp, a2
+; ZVFHMIN-NEXT:    addi a2, a2, 16
+; ZVFHMIN-NEXT:    vl8r.v v24, (a2) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v28
-; ZVFHMIN-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfadd.vv v16, v8, v16, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v16
@@ -1457,14 +1455,15 @@ define <vscale x 32 x half> @vfadd_vf_nxv32f16_unmasked(<vscale x 32 x half> %va
 ; ZVFHMIN-NEXT:  .LBB51_2:
 ; ZVFHMIN-NEXT:    addi a1, sp, 16
 ; ZVFHMIN-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v24
-; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 3
-; ZVFHMIN-NEXT:    add a1, sp, a1
-; ZVFHMIN-NEXT:    addi a1, a1, 16
-; ZVFHMIN-NEXT:    vl8r.v v0, (a1) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    csrr a0, vlenb
+; ZVFHMIN-NEXT:    slli a0, a0, 3
+; ZVFHMIN-NEXT:    add a0, sp, a0
+; ZVFHMIN-NEXT:    addi a0, a0, 16
+; ZVFHMIN-NEXT:    vl8r.v v0, (a0) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v0
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfadd.vv v16, v16, v24
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfdiv-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfdiv-vp.ll
index 061af45..2205769 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfdiv-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfdiv-vp.ll
@@ -641,10 +641,10 @@ define <vscale x 1 x half> @vfdiv_vv_nxv1f16(<vscale x 1 x half> %va, <vscale x
 ;
 ; ZVFHMIN-LABEL: vfdiv_vv_nxv1f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfdiv.vv v9, v9, v10, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v9
@@ -662,10 +662,10 @@ define <vscale x 1 x half> @vfdiv_vv_nxv1f16_unmasked(<vscale x 1 x half> %va, <
 ;
 ; ZVFHMIN-LABEL: vfdiv_vv_nxv1f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfdiv.vv v9, v9, v10
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v9
@@ -684,11 +684,11 @@ define <vscale x 1 x half> @vfdiv_vf_nxv1f16(<vscale x 1 x half> %va, half %b, <
 ; ZVFHMIN-LABEL: vfdiv_vf_nxv1f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v9, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v9
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfdiv.vv v9, v10, v8, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v9
@@ -709,11 +709,11 @@ define <vscale x 1 x half> @vfdiv_vf_nxv1f16_unmasked(<vscale x 1 x half> %va, h
 ; ZVFHMIN-LABEL: vfdiv_vf_nxv1f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v9, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v9
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfdiv.vv v9, v10, v8
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v9
@@ -735,10 +735,10 @@ define <vscale x 2 x half> @vfdiv_vv_nxv2f16(<vscale x 2 x half> %va, <vscale x
 ;
 ; ZVFHMIN-LABEL: vfdiv_vv_nxv2f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfdiv.vv v9, v9, v10, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v9
@@ -756,10 +756,10 @@ define <vscale x 2 x half> @vfdiv_vv_nxv2f16_unmasked(<vscale x 2 x half> %va, <
 ;
 ; ZVFHMIN-LABEL: vfdiv_vv_nxv2f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfdiv.vv v9, v9, v10
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v9
@@ -778,11 +778,11 @@ define <vscale x 2 x half> @vfdiv_vf_nxv2f16(<vscale x 2 x half> %va, half %b, <
 ; ZVFHMIN-LABEL: vfdiv_vf_nxv2f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v9, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v9
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfdiv.vv v9, v10, v8, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v9
@@ -803,11 +803,11 @@ define <vscale x 2 x half> @vfdiv_vf_nxv2f16_unmasked(<vscale x 2 x half> %va, h
 ; ZVFHMIN-LABEL: vfdiv_vf_nxv2f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v9, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v9
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfdiv.vv v9, v10, v8
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v9
@@ -829,10 +829,10 @@ define <vscale x 4 x half> @vfdiv_vv_nxv4f16(<vscale x 4 x half> %va, <vscale x
 ;
 ; ZVFHMIN-LABEL: vfdiv_vv_nxv4f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfdiv.vv v10, v12, v10, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v10
@@ -850,10 +850,10 @@ define <vscale x 4 x half> @vfdiv_vv_nxv4f16_unmasked(<vscale x 4 x half> %va, <
 ;
 ; ZVFHMIN-LABEL: vfdiv_vv_nxv4f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfdiv.vv v10, v12, v10
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v10
@@ -872,11 +872,11 @@ define <vscale x 4 x half> @vfdiv_vf_nxv4f16(<vscale x 4 x half> %va, half %b, <
 ; ZVFHMIN-LABEL: vfdiv_vf_nxv4f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m1, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v9, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v9
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfdiv.vv v10, v10, v12, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v10
@@ -897,11 +897,11 @@ define <vscale x 4 x half> @vfdiv_vf_nxv4f16_unmasked(<vscale x 4 x half> %va, h
 ; ZVFHMIN-LABEL: vfdiv_vf_nxv4f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m1, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v9, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v9
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfdiv.vv v10, v10, v12
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v10
@@ -923,10 +923,10 @@ define <vscale x 8 x half> @vfdiv_vv_nxv8f16(<vscale x 8 x half> %va, <vscale x
 ;
 ; ZVFHMIN-LABEL: vfdiv_vv_nxv8f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v10
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfdiv.vv v12, v16, v12, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v12
@@ -944,10 +944,10 @@ define <vscale x 8 x half> @vfdiv_vv_nxv8f16_unmasked(<vscale x 8 x half> %va, <
 ;
 ; ZVFHMIN-LABEL: vfdiv_vv_nxv8f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v10
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfdiv.vv v12, v16, v12
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v12
@@ -966,11 +966,11 @@ define <vscale x 8 x half> @vfdiv_vf_nxv8f16(<vscale x 8 x half> %va, half %b, <
 ; ZVFHMIN-LABEL: vfdiv_vf_nxv8f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v10, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfdiv.vv v12, v12, v16, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v12
@@ -991,11 +991,11 @@ define <vscale x 8 x half> @vfdiv_vf_nxv8f16_unmasked(<vscale x 8 x half> %va, h
 ; ZVFHMIN-LABEL: vfdiv_vf_nxv8f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v10, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfdiv.vv v12, v12, v16
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v12
@@ -1017,10 +1017,10 @@ define <vscale x 16 x half> @vfdiv_vv_nxv16f16(<vscale x 16 x half> %va, <vscale
 ;
 ; ZVFHMIN-LABEL: vfdiv_vv_nxv16f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfdiv.vv v16, v24, v16, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
@@ -1038,10 +1038,10 @@ define <vscale x 16 x half> @vfdiv_vv_nxv16f16_unmasked(<vscale x 16 x half> %va
 ;
 ; ZVFHMIN-LABEL: vfdiv_vv_nxv16f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfdiv.vv v16, v24, v16
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
@@ -1060,11 +1060,11 @@ define <vscale x 16 x half> @vfdiv_vf_nxv16f16(<vscale x 16 x half> %va, half %b
 ; ZVFHMIN-LABEL: vfdiv_vf_nxv16f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v12, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfdiv.vv v16, v16, v24, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
@@ -1085,11 +1085,11 @@ define <vscale x 16 x half> @vfdiv_vf_nxv16f16_unmasked(<vscale x 16 x half> %va
 ; ZVFHMIN-LABEL: vfdiv_vf_nxv16f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v12, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfdiv.vv v16, v16, v24
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
@@ -1117,23 +1117,22 @@ define <vscale x 32 x half> @vfdiv_vv_nxv32f16(<vscale x 32 x half> %va, <vscale
 ; ZVFHMIN-NEXT:    slli a1, a1, 3
 ; ZVFHMIN-NEXT:    sub sp, sp, a1
 ; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli a1, zero, e8, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmv1r.v v7, v0
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    addi a1, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20
 ; ZVFHMIN-NEXT:    slli a1, a2, 1
 ; ZVFHMIN-NEXT:    srli a2, a2, 2
 ; ZVFHMIN-NEXT:    sub a3, a0, a1
-; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vslidedown.vx v0, v0, a2
 ; ZVFHMIN-NEXT:    sltu a2, a0, a3
 ; ZVFHMIN-NEXT:    addi a2, a2, -1
 ; ZVFHMIN-NEXT:    and a2, a2, a3
-; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    addi a3, sp, 16
+; ZVFHMIN-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
-; ZVFHMIN-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfdiv.vv v16, v16, v24, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v16
@@ -1143,10 +1142,11 @@ define <vscale x 32 x half> @vfdiv_vv_nxv32f16(<vscale x 32 x half> %va, <vscale
 ; ZVFHMIN-NEXT:  .LBB44_2:
 ; ZVFHMIN-NEXT:    addi a1, sp, 16
 ; ZVFHMIN-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v24
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v8
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v7
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfdiv.vv v16, v24, v16, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
@@ -1178,22 +1178,21 @@ define <vscale x 32 x half> @vfdiv_vv_nxv32f16_unmasked(<vscale x 32 x half> %va
 ; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e8, m4, ta, ma
-; ZVFHMIN-NEXT:    vmset.m v7
-; ZVFHMIN-NEXT:    addi a1, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20
+; ZVFHMIN-NEXT:    vmset.m v24
 ; ZVFHMIN-NEXT:    slli a1, a2, 1
 ; ZVFHMIN-NEXT:    srli a2, a2, 2
 ; ZVFHMIN-NEXT:    sub a3, a0, a1
 ; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT:    vslidedown.vx v0, v7, a2
+; ZVFHMIN-NEXT:    vslidedown.vx v0, v24, a2
 ; ZVFHMIN-NEXT:    sltu a2, a0, a3
 ; ZVFHMIN-NEXT:    addi a2, a2, -1
 ; ZVFHMIN-NEXT:    and a2, a2, a3
-; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    addi a3, sp, 16
+; ZVFHMIN-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
-; ZVFHMIN-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfdiv.vv v16, v16, v24, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v16
@@ -1203,9 +1202,10 @@ define <vscale x 32 x half> @vfdiv_vv_nxv32f16_unmasked(<vscale x 32 x half> %va
 ; ZVFHMIN-NEXT:  .LBB45_2:
 ; ZVFHMIN-NEXT:    addi a1, sp, 16
 ; ZVFHMIN-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v24
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfdiv.vv v16, v24, v16
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
@@ -1236,14 +1236,10 @@ define <vscale x 32 x half> @vfdiv_vf_nxv32f16(<vscale x 32 x half> %va, half %b
 ; ZVFHMIN-NEXT:    add a1, a2, a1
 ; ZVFHMIN-NEXT:    sub sp, sp, a1
 ; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x11, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 17 * vlenb
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vmv8r.v v16, v8
+; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m8, ta, ma
+; ZVFHMIN-NEXT:    vmv8r.v v24, v8
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    addi a3, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v20
-; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m8, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v16, a1
 ; ZVFHMIN-NEXT:    csrr a1, vlenb
 ; ZVFHMIN-NEXT:    slli a3, a1, 3
@@ -1264,15 +1260,18 @@ define <vscale x 32 x half> @vfdiv_vf_nxv32f16(<vscale x 32 x half> %va, half %b
 ; ZVFHMIN-NEXT:    sltu a2, a0, a3
 ; ZVFHMIN-NEXT:    addi a2, a2, -1
 ; ZVFHMIN-NEXT:    and a2, a2, a3
-; ZVFHMIN-NEXT:    csrr a3, vlenb
-; ZVFHMIN-NEXT:    slli a4, a3, 3
-; ZVFHMIN-NEXT:    add a3, a4, a3
-; ZVFHMIN-NEXT:    add a3, sp, a3
-; ZVFHMIN-NEXT:    addi a3, a3, 16
-; ZVFHMIN-NEXT:    vl8r.v v24, (a3) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    addi a3, sp, 16
+; ZVFHMIN-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v28
+; ZVFHMIN-NEXT:    csrr a2, vlenb
+; ZVFHMIN-NEXT:    slli a3, a2, 3
+; ZVFHMIN-NEXT:    add a2, a3, a2
+; ZVFHMIN-NEXT:    add a2, sp, a2
+; ZVFHMIN-NEXT:    addi a2, a2, 16
+; ZVFHMIN-NEXT:    vl8r.v v24, (a2) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v28
-; ZVFHMIN-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfdiv.vv v16, v8, v16, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v16
@@ -1282,20 +1281,21 @@ define <vscale x 32 x half> @vfdiv_vf_nxv32f16(<vscale x 32 x half> %va, half %b
 ; ZVFHMIN-NEXT:  .LBB46_2:
 ; ZVFHMIN-NEXT:    addi a1, sp, 16
 ; ZVFHMIN-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v24
-; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a2, a1, 3
-; ZVFHMIN-NEXT:    add a1, a2, a1
-; ZVFHMIN-NEXT:    add a1, sp, a1
-; ZVFHMIN-NEXT:    addi a1, a1, 16
-; ZVFHMIN-NEXT:    vl8r.v v0, (a1) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    csrr a0, vlenb
+; ZVFHMIN-NEXT:    slli a1, a0, 3
+; ZVFHMIN-NEXT:    add a0, a1, a0
+; ZVFHMIN-NEXT:    add a0, sp, a0
+; ZVFHMIN-NEXT:    addi a0, a0, 16
+; ZVFHMIN-NEXT:    vl8r.v v0, (a0) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v0
-; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 3
-; ZVFHMIN-NEXT:    add a1, sp, a1
-; ZVFHMIN-NEXT:    addi a1, a1, 16
-; ZVFHMIN-NEXT:    vl1r.v v0, (a1) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    csrr a0, vlenb
+; ZVFHMIN-NEXT:    slli a0, a0, 3
+; ZVFHMIN-NEXT:    add a0, sp, a0
+; ZVFHMIN-NEXT:    addi a0, a0, 16
+; ZVFHMIN-NEXT:    vl1r.v v0, (a0) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfdiv.vv v16, v16, v24, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
@@ -1328,16 +1328,10 @@ define <vscale x 32 x half> @vfdiv_vf_nxv32f16_unmasked(<vscale x 32 x half> %va
 ; ZVFHMIN-NEXT:    slli a1, a1, 4
 ; ZVFHMIN-NEXT:    sub sp, sp, a1
 ; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e8, m4, ta, ma
-; ZVFHMIN-NEXT:    vmv8r.v v16, v8
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    vmset.m v7
-; ZVFHMIN-NEXT:    addi a3, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v20
 ; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m8, ta, ma
+; ZVFHMIN-NEXT:    vmset.m v24
 ; ZVFHMIN-NEXT:    vmv.v.x v16, a1
 ; ZVFHMIN-NEXT:    csrr a1, vlenb
 ; ZVFHMIN-NEXT:    slli a1, a1, 3
@@ -1348,18 +1342,22 @@ define <vscale x 32 x half> @vfdiv_vf_nxv32f16_unmasked(<vscale x 32 x half> %va
 ; ZVFHMIN-NEXT:    srli a2, a2, 2
 ; ZVFHMIN-NEXT:    sub a3, a0, a1
 ; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT:    vslidedown.vx v0, v7, a2
+; ZVFHMIN-NEXT:    vslidedown.vx v0, v24, a2
 ; ZVFHMIN-NEXT:    sltu a2, a0, a3
 ; ZVFHMIN-NEXT:    addi a2, a2, -1
 ; ZVFHMIN-NEXT:    and a2, a2, a3
-; ZVFHMIN-NEXT:    csrr a3, vlenb
-; ZVFHMIN-NEXT:    slli a3, a3, 3
-; ZVFHMIN-NEXT:    add a3, sp, a3
-; ZVFHMIN-NEXT:    addi a3, a3, 16
-; ZVFHMIN-NEXT:    vl8r.v v24, (a3) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vmv8r.v v16, v8
+; ZVFHMIN-NEXT:    addi a3, sp, 16
+; ZVFHMIN-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v20
+; ZVFHMIN-NEXT:    csrr a2, vlenb
+; ZVFHMIN-NEXT:    slli a2, a2, 3
+; ZVFHMIN-NEXT:    add a2, sp, a2
+; ZVFHMIN-NEXT:    addi a2, a2, 16
+; ZVFHMIN-NEXT:    vl8r.v v24, (a2) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v28
-; ZVFHMIN-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfdiv.vv v16, v8, v16, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v16
@@ -1369,14 +1367,15 @@ define <vscale x 32 x half> @vfdiv_vf_nxv32f16_unmasked(<vscale x 32 x half> %va
 ; ZVFHMIN-NEXT:  .LBB47_2:
 ; ZVFHMIN-NEXT:    addi a1, sp, 16
 ; ZVFHMIN-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v24
-; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 3
-; ZVFHMIN-NEXT:    add a1, sp, a1
-; ZVFHMIN-NEXT:    addi a1, a1, 16
-; ZVFHMIN-NEXT:    vl8r.v v0, (a1) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    csrr a0, vlenb
+; ZVFHMIN-NEXT:    slli a0, a0, 3
+; ZVFHMIN-NEXT:    add a0, sp, a0
+; ZVFHMIN-NEXT:    addi a0, a0, 16
+; ZVFHMIN-NEXT:    vl8r.v v0, (a0) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v0
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfdiv.vv v16, v16, v24
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfmax-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfmax-vp.ll
index 02d6229..5d998c4 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfmax-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfmax-vp.ll
@@ -290,10 +290,10 @@ define <vscale x 1 x half> @vfmax_vv_nxv1f16(<vscale x 1 x half> %va, <vscale x
 ;
 ; ZVFHMIN-LABEL: vfmax_vv_nxv1f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfmax.vv v9, v9, v10, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v9
@@ -311,10 +311,10 @@ define <vscale x 1 x half> @vfmax_vv_nxv1f16_unmasked(<vscale x 1 x half> %va, <
 ;
 ; ZVFHMIN-LABEL: vfmax_vv_nxv1f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfmax.vv v9, v9, v10
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v9
@@ -334,10 +334,10 @@ define <vscale x 2 x half> @vfmax_vv_nxv2f16(<vscale x 2 x half> %va, <vscale x
 ;
 ; ZVFHMIN-LABEL: vfmax_vv_nxv2f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfmax.vv v9, v9, v10, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v9
@@ -355,10 +355,10 @@ define <vscale x 2 x half> @vfmax_vv_nxv2f16_unmasked(<vscale x 2 x half> %va, <
 ;
 ; ZVFHMIN-LABEL: vfmax_vv_nxv2f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfmax.vv v9, v9, v10
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v9
@@ -378,10 +378,10 @@ define <vscale x 4 x half> @vfmax_vv_nxv4f16(<vscale x 4 x half> %va, <vscale x
 ;
 ; ZVFHMIN-LABEL: vfmax_vv_nxv4f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfmax.vv v10, v12, v10, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v10
@@ -399,10 +399,10 @@ define <vscale x 4 x half> @vfmax_vv_nxv4f16_unmasked(<vscale x 4 x half> %va, <
 ;
 ; ZVFHMIN-LABEL: vfmax_vv_nxv4f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfmax.vv v10, v12, v10
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v10
@@ -422,10 +422,10 @@ define <vscale x 8 x half> @vfmax_vv_nxv8f16(<vscale x 8 x half> %va, <vscale x
 ;
 ; ZVFHMIN-LABEL: vfmax_vv_nxv8f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v10
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfmax.vv v12, v16, v12, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v12
@@ -443,10 +443,10 @@ define <vscale x 8 x half> @vfmax_vv_nxv8f16_unmasked(<vscale x 8 x half> %va, <
 ;
 ; ZVFHMIN-LABEL: vfmax_vv_nxv8f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v10
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfmax.vv v12, v16, v12
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v12
@@ -466,10 +466,10 @@ define <vscale x 16 x half> @vfmax_vv_nxv16f16(<vscale x 16 x half> %va, <vscale
 ;
 ; ZVFHMIN-LABEL: vfmax_vv_nxv16f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfmax.vv v16, v24, v16, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
@@ -487,10 +487,10 @@ define <vscale x 16 x half> @vfmax_vv_nxv16f16_unmasked(<vscale x 16 x half> %va
 ;
 ; ZVFHMIN-LABEL: vfmax_vv_nxv16f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfmax.vv v16, v24, v16
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
@@ -516,23 +516,22 @@ define <vscale x 32 x half> @vfmax_vv_nxv32f16(<vscale x 32 x half> %va, <vscale
 ; ZVFHMIN-NEXT:    slli a1, a1, 3
 ; ZVFHMIN-NEXT:    sub sp, sp, a1
 ; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli a1, zero, e8, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmv1r.v v7, v0
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    addi a1, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20
 ; ZVFHMIN-NEXT:    slli a1, a2, 1
 ; ZVFHMIN-NEXT:    srli a2, a2, 2
 ; ZVFHMIN-NEXT:    sub a3, a0, a1
-; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vslidedown.vx v0, v0, a2
 ; ZVFHMIN-NEXT:    sltu a2, a0, a3
 ; ZVFHMIN-NEXT:    addi a2, a2, -1
 ; ZVFHMIN-NEXT:    and a2, a2, a3
-; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    addi a3, sp, 16
+; ZVFHMIN-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
-; ZVFHMIN-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfmax.vv v16, v16, v24, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v16
@@ -542,10 +541,11 @@ define <vscale x 32 x half> @vfmax_vv_nxv32f16(<vscale x 32 x half> %va, <vscale
 ; ZVFHMIN-NEXT:  .LBB22_2:
 ; ZVFHMIN-NEXT:    addi a1, sp, 16
 ; ZVFHMIN-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v24
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v8
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v7
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfmax.vv v16, v24, v16, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
@@ -577,22 +577,21 @@ define <vscale x 32 x half> @vfmax_vv_nxv32f16_unmasked(<vscale x 32 x half> %va
 ; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e8, m4, ta, ma
-; ZVFHMIN-NEXT:    vmset.m v7
-; ZVFHMIN-NEXT:    addi a1, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20
+; ZVFHMIN-NEXT:    vmset.m v24
 ; ZVFHMIN-NEXT:    slli a1, a2, 1
 ; ZVFHMIN-NEXT:    srli a2, a2, 2
 ; ZVFHMIN-NEXT:    sub a3, a0, a1
 ; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT:    vslidedown.vx v0, v7, a2
+; ZVFHMIN-NEXT:    vslidedown.vx v0, v24, a2
 ; ZVFHMIN-NEXT:    sltu a2, a0, a3
 ; ZVFHMIN-NEXT:    addi a2, a2, -1
 ; ZVFHMIN-NEXT:    and a2, a2, a3
-; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    addi a3, sp, 16
+; ZVFHMIN-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
-; ZVFHMIN-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfmax.vv v16, v16, v24, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v16
@@ -602,9 +601,10 @@ define <vscale x 32 x half> @vfmax_vv_nxv32f16_unmasked(<vscale x 32 x half> %va
 ; ZVFHMIN-NEXT:  .LBB23_2:
 ; ZVFHMIN-NEXT:    addi a1, sp, 16
 ; ZVFHMIN-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v24
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfmax.vv v16, v24, v16
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfmin-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfmin-vp.ll
index f7f8029..48a4c13 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfmin-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfmin-vp.ll
@@ -290,10 +290,10 @@ define <vscale x 1 x half> @vfmin_vv_nxv1f16(<vscale x 1 x half> %va, <vscale x
 ;
 ; ZVFHMIN-LABEL: vfmin_vv_nxv1f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfmin.vv v9, v9, v10, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v9
@@ -311,10 +311,10 @@ define <vscale x 1 x half> @vfmin_vv_nxv1f16_unmasked(<vscale x 1 x half> %va, <
 ;
 ; ZVFHMIN-LABEL: vfmin_vv_nxv1f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfmin.vv v9, v9, v10
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v9
@@ -334,10 +334,10 @@ define <vscale x 2 x half> @vfmin_vv_nxv2f16(<vscale x 2 x half> %va, <vscale x
 ;
 ; ZVFHMIN-LABEL: vfmin_vv_nxv2f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfmin.vv v9, v9, v10, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v9
@@ -355,10 +355,10 @@ define <vscale x 2 x half> @vfmin_vv_nxv2f16_unmasked(<vscale x 2 x half> %va, <
 ;
 ; ZVFHMIN-LABEL: vfmin_vv_nxv2f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfmin.vv v9, v9, v10
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v9
@@ -378,10 +378,10 @@ define <vscale x 4 x half> @vfmin_vv_nxv4f16(<vscale x 4 x half> %va, <vscale x
 ;
 ; ZVFHMIN-LABEL: vfmin_vv_nxv4f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfmin.vv v10, v12, v10, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v10
@@ -399,10 +399,10 @@ define <vscale x 4 x half> @vfmin_vv_nxv4f16_unmasked(<vscale x 4 x half> %va, <
 ;
 ; ZVFHMIN-LABEL: vfmin_vv_nxv4f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfmin.vv v10, v12, v10
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v10
@@ -422,10 +422,10 @@ define <vscale x 8 x half> @vfmin_vv_nxv8f16(<vscale x 8 x half> %va, <vscale x
 ;
 ; ZVFHMIN-LABEL: vfmin_vv_nxv8f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v10
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfmin.vv v12, v16, v12, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v12
@@ -443,10 +443,10 @@ define <vscale x 8 x half> @vfmin_vv_nxv8f16_unmasked(<vscale x 8 x half> %va, <
 ;
 ; ZVFHMIN-LABEL: vfmin_vv_nxv8f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v10
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfmin.vv v12, v16, v12
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v12
@@ -466,10 +466,10 @@ define <vscale x 16 x half> @vfmin_vv_nxv16f16(<vscale x 16 x half> %va, <vscale
 ;
 ; ZVFHMIN-LABEL: vfmin_vv_nxv16f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfmin.vv v16, v24, v16, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
@@ -487,10 +487,10 @@ define <vscale x 16 x half> @vfmin_vv_nxv16f16_unmasked(<vscale x 16 x half> %va
 ;
 ; ZVFHMIN-LABEL: vfmin_vv_nxv16f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfmin.vv v16, v24, v16
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
@@ -516,23 +516,22 @@ define <vscale x 32 x half> @vfmin_vv_nxv32f16(<vscale x 32 x half> %va, <vscale
 ; ZVFHMIN-NEXT:    slli a1, a1, 3
 ; ZVFHMIN-NEXT:    sub sp, sp, a1
 ; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli a1, zero, e8, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmv1r.v v7, v0
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    addi a1, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20
 ; ZVFHMIN-NEXT:    slli a1, a2, 1
 ; ZVFHMIN-NEXT:    srli a2, a2, 2
 ; ZVFHMIN-NEXT:    sub a3, a0, a1
-; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vslidedown.vx v0, v0, a2
 ; ZVFHMIN-NEXT:    sltu a2, a0, a3
 ; ZVFHMIN-NEXT:    addi a2, a2, -1
 ; ZVFHMIN-NEXT:    and a2, a2, a3
-; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    addi a3, sp, 16
+; ZVFHMIN-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
-; ZVFHMIN-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfmin.vv v16, v16, v24, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v16
@@ -542,10 +541,11 @@ define <vscale x 32 x half> @vfmin_vv_nxv32f16(<vscale x 32 x half> %va, <vscale
 ; ZVFHMIN-NEXT:  .LBB22_2:
 ; ZVFHMIN-NEXT:    addi a1, sp, 16
 ; ZVFHMIN-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v24
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v8
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v7
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfmin.vv v16, v24, v16, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
@@ -577,22 +577,21 @@ define <vscale x 32 x half> @vfmin_vv_nxv32f16_unmasked(<vscale x 32 x half> %va
 ; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e8, m4, ta, ma
-; ZVFHMIN-NEXT:    vmset.m v7
-; ZVFHMIN-NEXT:    addi a1, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20
+; ZVFHMIN-NEXT:    vmset.m v24
 ; ZVFHMIN-NEXT:    slli a1, a2, 1
 ; ZVFHMIN-NEXT:    srli a2, a2, 2
 ; ZVFHMIN-NEXT:    sub a3, a0, a1
 ; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT:    vslidedown.vx v0, v7, a2
+; ZVFHMIN-NEXT:    vslidedown.vx v0, v24, a2
 ; ZVFHMIN-NEXT:    sltu a2, a0, a3
 ; ZVFHMIN-NEXT:    addi a2, a2, -1
 ; ZVFHMIN-NEXT:    and a2, a2, a3
-; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    addi a3, sp, 16
+; ZVFHMIN-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
-; ZVFHMIN-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfmin.vv v16, v16, v24, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v16
@@ -602,9 +601,10 @@ define <vscale x 32 x half> @vfmin_vv_nxv32f16_unmasked(<vscale x 32 x half> %va
 ; ZVFHMIN-NEXT:  .LBB23_2:
 ; ZVFHMIN-NEXT:    addi a1, sp, 16
 ; ZVFHMIN-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v24
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfmin.vv v16, v24, v16
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfmul-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfmul-vp.ll
index 7e552304..06f74dd 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfmul-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfmul-vp.ll
@@ -19,10 +19,10 @@ define <vscale x 1 x half> @vfmul_vv_nxv1f16(<vscale x 1 x half> %va, <vscale x
 ;
 ; ZVFHMIN-LABEL: vfmul_vv_nxv1f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfmul.vv v9, v9, v10, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v9
@@ -40,10 +40,10 @@ define <vscale x 1 x half> @vfmul_vv_nxv1f16_unmasked(<vscale x 1 x half> %va, <
 ;
 ; ZVFHMIN-LABEL: vfmul_vv_nxv1f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfmul.vv v9, v9, v10
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v9
@@ -62,11 +62,11 @@ define <vscale x 1 x half> @vfmul_vf_nxv1f16(<vscale x 1 x half> %va, half %b, <
 ; ZVFHMIN-LABEL: vfmul_vf_nxv1f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v9, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v9
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfmul.vv v9, v10, v8, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v9
@@ -87,11 +87,11 @@ define <vscale x 1 x half> @vfmul_vf_nxv1f16_unmasked(<vscale x 1 x half> %va, h
 ; ZVFHMIN-LABEL: vfmul_vf_nxv1f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v9, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v9
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfmul.vv v9, v10, v8
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v9
@@ -113,10 +113,10 @@ define <vscale x 2 x half> @vfmul_vv_nxv2f16(<vscale x 2 x half> %va, <vscale x
 ;
 ; ZVFHMIN-LABEL: vfmul_vv_nxv2f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfmul.vv v9, v9, v10, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v9
@@ -134,10 +134,10 @@ define <vscale x 2 x half> @vfmul_vv_nxv2f16_unmasked(<vscale x 2 x half> %va, <
 ;
 ; ZVFHMIN-LABEL: vfmul_vv_nxv2f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfmul.vv v9, v9, v10
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v9
@@ -156,11 +156,11 @@ define <vscale x 2 x half> @vfmul_vf_nxv2f16(<vscale x 2 x half> %va, half %b, <
 ; ZVFHMIN-LABEL: vfmul_vf_nxv2f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v9, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v9
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfmul.vv v9, v10, v8, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v9
@@ -181,11 +181,11 @@ define <vscale x 2 x half> @vfmul_vf_nxv2f16_unmasked(<vscale x 2 x half> %va, h
 ; ZVFHMIN-LABEL: vfmul_vf_nxv2f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v9, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v9
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfmul.vv v9, v10, v8
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v9
@@ -207,10 +207,10 @@ define <vscale x 4 x half> @vfmul_vv_nxv4f16(<vscale x 4 x half> %va, <vscale x
 ;
 ; ZVFHMIN-LABEL: vfmul_vv_nxv4f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfmul.vv v10, v12, v10, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v10
@@ -228,10 +228,10 @@ define <vscale x 4 x half> @vfmul_vv_nxv4f16_unmasked(<vscale x 4 x half> %va, <
 ;
 ; ZVFHMIN-LABEL: vfmul_vv_nxv4f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfmul.vv v10, v12, v10
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v10
@@ -250,11 +250,11 @@ define <vscale x 4 x half> @vfmul_vf_nxv4f16(<vscale x 4 x half> %va, half %b, <
 ; ZVFHMIN-LABEL: vfmul_vf_nxv4f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m1, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v9, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v9
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfmul.vv v10, v10, v12, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v10
@@ -275,11 +275,11 @@ define <vscale x 4 x half> @vfmul_vf_nxv4f16_unmasked(<vscale x 4 x half> %va, h
 ; ZVFHMIN-LABEL: vfmul_vf_nxv4f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m1, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v9, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v9
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfmul.vv v10, v10, v12
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v10
@@ -301,10 +301,10 @@ define <vscale x 8 x half> @vfmul_vv_nxv8f16(<vscale x 8 x half> %va, <vscale x
 ;
 ; ZVFHMIN-LABEL: vfmul_vv_nxv8f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v10
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfmul.vv v12, v16, v12, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v12
@@ -322,10 +322,10 @@ define <vscale x 8 x half> @vfmul_vv_nxv8f16_unmasked(<vscale x 8 x half> %va, <
 ;
 ; ZVFHMIN-LABEL: vfmul_vv_nxv8f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v10
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfmul.vv v12, v16, v12
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v12
@@ -344,11 +344,11 @@ define <vscale x 8 x half> @vfmul_vf_nxv8f16(<vscale x 8 x half> %va, half %b, <
 ; ZVFHMIN-LABEL: vfmul_vf_nxv8f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v10, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfmul.vv v12, v12, v16, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v12
@@ -369,11 +369,11 @@ define <vscale x 8 x half> @vfmul_vf_nxv8f16_unmasked(<vscale x 8 x half> %va, h
 ; ZVFHMIN-LABEL: vfmul_vf_nxv8f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v10, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfmul.vv v12, v12, v16
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v12
@@ -395,10 +395,10 @@ define <vscale x 16 x half> @vfmul_vv_nxv16f16(<vscale x 16 x half> %va, <vscale
 ;
 ; ZVFHMIN-LABEL: vfmul_vv_nxv16f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfmul.vv v16, v24, v16, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
@@ -416,10 +416,10 @@ define <vscale x 16 x half> @vfmul_vv_nxv16f16_unmasked(<vscale x 16 x half> %va
 ;
 ; ZVFHMIN-LABEL: vfmul_vv_nxv16f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfmul.vv v16, v24, v16
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
@@ -438,11 +438,11 @@ define <vscale x 16 x half> @vfmul_vf_nxv16f16(<vscale x 16 x half> %va, half %b
 ; ZVFHMIN-LABEL: vfmul_vf_nxv16f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v12, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfmul.vv v16, v16, v24, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
@@ -463,11 +463,11 @@ define <vscale x 16 x half> @vfmul_vf_nxv16f16_unmasked(<vscale x 16 x half> %va
 ; ZVFHMIN-LABEL: vfmul_vf_nxv16f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v12, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfmul.vv v16, v16, v24
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
@@ -495,23 +495,22 @@ define <vscale x 32 x half> @vfmul_vv_nxv32f16(<vscale x 32 x half> %va, <vscale
 ; ZVFHMIN-NEXT:    slli a1, a1, 3
 ; ZVFHMIN-NEXT:    sub sp, sp, a1
 ; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli a1, zero, e8, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmv1r.v v7, v0
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    addi a1, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20
 ; ZVFHMIN-NEXT:    slli a1, a2, 1
 ; ZVFHMIN-NEXT:    srli a2, a2, 2
 ; ZVFHMIN-NEXT:    sub a3, a0, a1
-; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vslidedown.vx v0, v0, a2
 ; ZVFHMIN-NEXT:    sltu a2, a0, a3
 ; ZVFHMIN-NEXT:    addi a2, a2, -1
 ; ZVFHMIN-NEXT:    and a2, a2, a3
-; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    addi a3, sp, 16
+; ZVFHMIN-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
-; ZVFHMIN-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfmul.vv v16, v16, v24, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v16
@@ -521,10 +520,11 @@ define <vscale x 32 x half> @vfmul_vv_nxv32f16(<vscale x 32 x half> %va, <vscale
 ; ZVFHMIN-NEXT:  .LBB20_2:
 ; ZVFHMIN-NEXT:    addi a1, sp, 16
 ; ZVFHMIN-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v24
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v8
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v7
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfmul.vv v16, v24, v16, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
@@ -556,22 +556,21 @@ define <vscale x 32 x half> @vfmul_vv_nxv32f16_unmasked(<vscale x 32 x half> %va
 ; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e8, m4, ta, ma
-; ZVFHMIN-NEXT:    vmset.m v7
-; ZVFHMIN-NEXT:    addi a1, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20
+; ZVFHMIN-NEXT:    vmset.m v24
 ; ZVFHMIN-NEXT:    slli a1, a2, 1
 ; ZVFHMIN-NEXT:    srli a2, a2, 2
 ; ZVFHMIN-NEXT:    sub a3, a0, a1
 ; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT:    vslidedown.vx v0, v7, a2
+; ZVFHMIN-NEXT:    vslidedown.vx v0, v24, a2
 ; ZVFHMIN-NEXT:    sltu a2, a0, a3
 ; ZVFHMIN-NEXT:    addi a2, a2, -1
 ; ZVFHMIN-NEXT:    and a2, a2, a3
-; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    addi a3, sp, 16
+; ZVFHMIN-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
-; ZVFHMIN-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfmul.vv v16, v16, v24, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v16
@@ -581,9 +580,10 @@ define <vscale x 32 x half> @vfmul_vv_nxv32f16_unmasked(<vscale x 32 x half> %va
 ; ZVFHMIN-NEXT:  .LBB21_2:
 ; ZVFHMIN-NEXT:    addi a1, sp, 16
 ; ZVFHMIN-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v24
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfmul.vv v16, v24, v16
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
@@ -614,14 +614,10 @@ define <vscale x 32 x half> @vfmul_vf_nxv32f16(<vscale x 32 x half> %va, half %b
 ; ZVFHMIN-NEXT:    add a1, a2, a1
 ; ZVFHMIN-NEXT:    sub sp, sp, a1
 ; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x11, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 17 * vlenb
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vmv8r.v v16, v8
+; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m8, ta, ma
+; ZVFHMIN-NEXT:    vmv8r.v v24, v8
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    addi a3, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v20
-; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m8, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v16, a1
 ; ZVFHMIN-NEXT:    csrr a1, vlenb
 ; ZVFHMIN-NEXT:    slli a3, a1, 3
@@ -642,15 +638,18 @@ define <vscale x 32 x half> @vfmul_vf_nxv32f16(<vscale x 32 x half> %va, half %b
 ; ZVFHMIN-NEXT:    sltu a2, a0, a3
 ; ZVFHMIN-NEXT:    addi a2, a2, -1
 ; ZVFHMIN-NEXT:    and a2, a2, a3
-; ZVFHMIN-NEXT:    csrr a3, vlenb
-; ZVFHMIN-NEXT:    slli a4, a3, 3
-; ZVFHMIN-NEXT:    add a3, a4, a3
-; ZVFHMIN-NEXT:    add a3, sp, a3
-; ZVFHMIN-NEXT:    addi a3, a3, 16
-; ZVFHMIN-NEXT:    vl8r.v v24, (a3) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    addi a3, sp, 16
+; ZVFHMIN-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v28
+; ZVFHMIN-NEXT:    csrr a2, vlenb
+; ZVFHMIN-NEXT:    slli a3, a2, 3
+; ZVFHMIN-NEXT:    add a2, a3, a2
+; ZVFHMIN-NEXT:    add a2, sp, a2
+; ZVFHMIN-NEXT:    addi a2, a2, 16
+; ZVFHMIN-NEXT:    vl8r.v v24, (a2) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v28
-; ZVFHMIN-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfmul.vv v16, v8, v16, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v16
@@ -660,20 +659,21 @@ define <vscale x 32 x half> @vfmul_vf_nxv32f16(<vscale x 32 x half> %va, half %b
 ; ZVFHMIN-NEXT:  .LBB22_2:
 ; ZVFHMIN-NEXT:    addi a1, sp, 16
 ; ZVFHMIN-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v24
-; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a2, a1, 3
-; ZVFHMIN-NEXT:    add a1, a2, a1
-; ZVFHMIN-NEXT:    add a1, sp, a1
-; ZVFHMIN-NEXT:    addi a1, a1, 16
-; ZVFHMIN-NEXT:    vl8r.v v0, (a1) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    csrr a0, vlenb
+; ZVFHMIN-NEXT:    slli a1, a0, 3
+; ZVFHMIN-NEXT:    add a0, a1, a0
+; ZVFHMIN-NEXT:    add a0, sp, a0
+; ZVFHMIN-NEXT:    addi a0, a0, 16
+; ZVFHMIN-NEXT:    vl8r.v v0, (a0) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v0
-; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 3
-; ZVFHMIN-NEXT:    add a1, sp, a1
-; ZVFHMIN-NEXT:    addi a1, a1, 16
-; ZVFHMIN-NEXT:    vl1r.v v0, (a1) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    csrr a0, vlenb
+; ZVFHMIN-NEXT:    slli a0, a0, 3
+; ZVFHMIN-NEXT:    add a0, sp, a0
+; ZVFHMIN-NEXT:    addi a0, a0, 16
+; ZVFHMIN-NEXT:    vl1r.v v0, (a0) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfmul.vv v16, v16, v24, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
@@ -706,16 +706,10 @@ define <vscale x 32 x half> @vfmul_vf_nxv32f16_unmasked(<vscale x 32 x half> %va
 ; ZVFHMIN-NEXT:    slli a1, a1, 4
 ; ZVFHMIN-NEXT:    sub sp, sp, a1
 ; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e8, m4, ta, ma
-; ZVFHMIN-NEXT:    vmv8r.v v16, v8
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    vmset.m v7
-; ZVFHMIN-NEXT:    addi a3, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v20
 ; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m8, ta, ma
+; ZVFHMIN-NEXT:    vmset.m v24
 ; ZVFHMIN-NEXT:    vmv.v.x v16, a1
 ; ZVFHMIN-NEXT:    csrr a1, vlenb
 ; ZVFHMIN-NEXT:    slli a1, a1, 3
@@ -726,18 +720,22 @@ define <vscale x 32 x half> @vfmul_vf_nxv32f16_unmasked(<vscale x 32 x half> %va
 ; ZVFHMIN-NEXT:    srli a2, a2, 2
 ; ZVFHMIN-NEXT:    sub a3, a0, a1
 ; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT:    vslidedown.vx v0, v7, a2
+; ZVFHMIN-NEXT:    vslidedown.vx v0, v24, a2
 ; ZVFHMIN-NEXT:    sltu a2, a0, a3
 ; ZVFHMIN-NEXT:    addi a2, a2, -1
 ; ZVFHMIN-NEXT:    and a2, a2, a3
-; ZVFHMIN-NEXT:    csrr a3, vlenb
-; ZVFHMIN-NEXT:    slli a3, a3, 3
-; ZVFHMIN-NEXT:    add a3, sp, a3
-; ZVFHMIN-NEXT:    addi a3, a3, 16
-; ZVFHMIN-NEXT:    vl8r.v v24, (a3) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vmv8r.v v16, v8
+; ZVFHMIN-NEXT:    addi a3, sp, 16
+; ZVFHMIN-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v20
+; ZVFHMIN-NEXT:    csrr a2, vlenb
+; ZVFHMIN-NEXT:    slli a2, a2, 3
+; ZVFHMIN-NEXT:    add a2, sp, a2
+; ZVFHMIN-NEXT:    addi a2, a2, 16
+; ZVFHMIN-NEXT:    vl8r.v v24, (a2) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v28
-; ZVFHMIN-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfmul.vv v16, v8, v16, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v16
@@ -747,14 +745,15 @@ define <vscale x 32 x half> @vfmul_vf_nxv32f16_unmasked(<vscale x 32 x half> %va
 ; ZVFHMIN-NEXT:  .LBB23_2:
 ; ZVFHMIN-NEXT:    addi a1, sp, 16
 ; ZVFHMIN-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v24
-; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 3
-; ZVFHMIN-NEXT:    add a1, sp, a1
-; ZVFHMIN-NEXT:    addi a1, a1, 16
-; ZVFHMIN-NEXT:    vl8r.v v0, (a1) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    csrr a0, vlenb
+; ZVFHMIN-NEXT:    slli a0, a0, 3
+; ZVFHMIN-NEXT:    add a0, sp, a0
+; ZVFHMIN-NEXT:    addi a0, a0, 16
+; ZVFHMIN-NEXT:    vl8r.v v0, (a0) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v0
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfmul.vv v16, v16, v24
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfptosi-vp-mask.ll b/llvm/test/CodeGen/RISCV/rvv/vfptosi-vp-mask.ll
index b7f2133..575d50d 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfptosi-vp-mask.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfptosi-vp-mask.ll
@@ -42,9 +42,9 @@ define <vscale x 2 x i1> @vfptosi_nxv2i1_nxv2f16(<vscale x 2 x half> %va, <vscal
 ;
 ; ZVFHMIN-LABEL: vfptosi_nxv2i1_nxv2f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfcvt.rtz.x.f.v v8, v9, v0.t
 ; ZVFHMIN-NEXT:    vmsne.vi v0, v8, 0, v0.t
 ; ZVFHMIN-NEXT:    ret
@@ -62,9 +62,9 @@ define <vscale x 2 x i1> @vfptosi_nxv2i1_nxv2f16_unmasked(<vscale x 2 x half> %v
 ;
 ; ZVFHMIN-LABEL: vfptosi_nxv2i1_nxv2f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfcvt.rtz.x.f.v v8, v9
 ; ZVFHMIN-NEXT:    vmsne.vi v0, v8, 0
 ; ZVFHMIN-NEXT:    ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfptosi-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfptosi-vp.ll
index d990c74..e33ab98 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfptosi-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfptosi-vp.ll
@@ -130,9 +130,8 @@ define <vscale x 2 x i7> @vfptosi_v4i7_v4f16(<vscale x 2 x half> %va, <vscale x
 ;
 ; ZVFHMIN-LABEL: vfptosi_v4i7_v4f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
 ; ZVFHMIN-NEXT:    vfncvt.rtz.x.f.w v8, v9, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e8, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vnsrl.wi v8, v8, 0, v0.t
@@ -153,9 +152,8 @@ define <vscale x 2 x i8> @vfptosi_nxv2i8_nxv2f16(<vscale x 2 x half> %va, <vscal
 ;
 ; ZVFHMIN-LABEL: vfptosi_nxv2i8_nxv2f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
 ; ZVFHMIN-NEXT:    vfncvt.rtz.x.f.w v8, v9, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e8, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vnsrl.wi v8, v8, 0, v0.t
@@ -174,9 +172,8 @@ define <vscale x 2 x i8> @vfptosi_nxv2i8_nxv2f16_unmasked(<vscale x 2 x half> %v
 ;
 ; ZVFHMIN-LABEL: vfptosi_nxv2i8_nxv2f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
 ; ZVFHMIN-NEXT:    vfncvt.rtz.x.f.w v8, v9
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e8, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vnsrl.wi v8, v8, 0
@@ -196,9 +193,8 @@ define <vscale x 2 x i16> @vfptosi_nxv2i16_nxv2f16(<vscale x 2 x half> %va, <vsc
 ;
 ; ZVFHMIN-LABEL: vfptosi_nxv2i16_nxv2f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
 ; ZVFHMIN-NEXT:    vfncvt.rtz.x.f.w v8, v9, v0.t
 ; ZVFHMIN-NEXT:    ret
   %v = call <vscale x 2 x i16> @llvm.vp.fptosi.nxv2i16.nxv2f16(<vscale x 2 x half> %va, <vscale x 2 x i1> %m, i32 %evl)
@@ -214,9 +210,8 @@ define <vscale x 2 x i16> @vfptosi_nxv2i16_nxv2f16_unmasked(<vscale x 2 x half>
 ;
 ; ZVFHMIN-LABEL: vfptosi_nxv2i16_nxv2f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
 ; ZVFHMIN-NEXT:    vfncvt.rtz.x.f.w v8, v9
 ; ZVFHMIN-NEXT:    ret
   %v = call <vscale x 2 x i16> @llvm.vp.fptosi.nxv2i16.nxv2f16(<vscale x 2 x half> %va, <vscale x 2 x i1> splat (i1 true), i32 %evl)
@@ -235,9 +230,9 @@ define <vscale x 2 x i32> @vfptosi_nxv2i32_nxv2f16(<vscale x 2 x half> %va, <vsc
 ;
 ; ZVFHMIN-LABEL: vfptosi_nxv2i32_nxv2f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfcvt.rtz.x.f.v v8, v9, v0.t
 ; ZVFHMIN-NEXT:    ret
   %v = call <vscale x 2 x i32> @llvm.vp.fptosi.nxv2i32.nxv2f16(<vscale x 2 x half> %va, <vscale x 2 x i1> %m, i32 %evl)
@@ -254,9 +249,9 @@ define <vscale x 2 x i32> @vfptosi_nxv2i32_nxv2f16_unmasked(<vscale x 2 x half>
 ;
 ; ZVFHMIN-LABEL: vfptosi_nxv2i32_nxv2f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfcvt.rtz.x.f.v v8, v9
 ; ZVFHMIN-NEXT:    ret
   %v = call <vscale x 2 x i32> @llvm.vp.fptosi.nxv2i32.nxv2f16(<vscale x 2 x half> %va, <vscale x 2 x i1> splat (i1 true), i32 %evl)
@@ -276,9 +271,9 @@ define <vscale x 2 x i64> @vfptosi_nxv2i64_nxv2f16(<vscale x 2 x half> %va, <vsc
 ;
 ; ZVFHMIN-LABEL: vfptosi_nxv2i64_nxv2f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.rtz.x.f.v v8, v10, v0.t
 ; ZVFHMIN-NEXT:    ret
   %v = call <vscale x 2 x i64> @llvm.vp.fptosi.nxv2i64.nxv2f16(<vscale x 2 x half> %va, <vscale x 2 x i1> %m, i32 %evl)
@@ -286,21 +281,13 @@ define <vscale x 2 x i64> @vfptosi_nxv2i64_nxv2f16(<vscale x 2 x half> %va, <vsc
 }
 
 define <vscale x 2 x i64> @vfptosi_nxv2i64_nxv2f16_unmasked(<vscale x 2 x half> %va, i32 zeroext %evl) {
-; ZVFH-LABEL: vfptosi_nxv2i64_nxv2f16_unmasked:
-; ZVFH:       # %bb.0:
-; ZVFH-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
-; ZVFH-NEXT:    vfwcvt.f.f.v v10, v8
-; ZVFH-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
-; ZVFH-NEXT:    vfwcvt.rtz.x.f.v v8, v10
-; ZVFH-NEXT:    ret
-;
-; ZVFHMIN-LABEL: vfptosi_nxv2i64_nxv2f16_unmasked:
-; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.rtz.x.f.v v8, v10
-; ZVFHMIN-NEXT:    ret
+; CHECK-LABEL: vfptosi_nxv2i64_nxv2f16_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
+; CHECK-NEXT:    vfwcvt.f.f.v v10, v8
+; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
+; CHECK-NEXT:    vfwcvt.rtz.x.f.v v8, v10
+; CHECK-NEXT:    ret
   %v = call <vscale x 2 x i64> @llvm.vp.fptosi.nxv2i64.nxv2f16(<vscale x 2 x half> %va, <vscale x 2 x i1> splat (i1 true), i32 %evl)
   ret <vscale x 2 x i64> %v
 }
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfptoui-vp-mask.ll b/llvm/test/CodeGen/RISCV/rvv/vfptoui-vp-mask.ll
index 8ac5992..e1d0ad4 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfptoui-vp-mask.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfptoui-vp-mask.ll
@@ -42,9 +42,9 @@ define <vscale x 2 x i1> @vfptoui_nxv2i1_nxv2f16(<vscale x 2 x half> %va, <vscal
 ;
 ; ZVFHMIN-LABEL: vfptoui_nxv2i1_nxv2f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfcvt.rtz.xu.f.v v8, v9, v0.t
 ; ZVFHMIN-NEXT:    vmsne.vi v0, v8, 0, v0.t
 ; ZVFHMIN-NEXT:    ret
@@ -62,9 +62,9 @@ define <vscale x 2 x i1> @vfptoui_nxv2i1_nxv2f16_unmasked(<vscale x 2 x half> %v
 ;
 ; ZVFHMIN-LABEL: vfptoui_nxv2i1_nxv2f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfcvt.rtz.xu.f.v v8, v9
 ; ZVFHMIN-NEXT:    vmsne.vi v0, v8, 0
 ; ZVFHMIN-NEXT:    ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfptoui-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfptoui-vp.ll
index 3b24a64..86222ec 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfptoui-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfptoui-vp.ll
@@ -130,9 +130,8 @@ define <vscale x 2 x i7> @vfptoui_v4i7_v4f16(<vscale x 2 x half> %va, <vscale x
 ;
 ; ZVFHMIN-LABEL: vfptoui_v4i7_v4f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
 ; ZVFHMIN-NEXT:    vfncvt.rtz.x.f.w v8, v9, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e8, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vnsrl.wi v8, v8, 0, v0.t
@@ -153,9 +152,8 @@ define <vscale x 2 x i8> @vfptoui_nxv2i8_nxv2f16(<vscale x 2 x half> %va, <vscal
 ;
 ; ZVFHMIN-LABEL: vfptoui_nxv2i8_nxv2f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
 ; ZVFHMIN-NEXT:    vfncvt.rtz.xu.f.w v8, v9, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e8, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vnsrl.wi v8, v8, 0, v0.t
@@ -174,9 +172,8 @@ define <vscale x 2 x i8> @vfptoui_nxv2i8_nxv2f16_unmasked(<vscale x 2 x half> %v
 ;
 ; ZVFHMIN-LABEL: vfptoui_nxv2i8_nxv2f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
 ; ZVFHMIN-NEXT:    vfncvt.rtz.xu.f.w v8, v9
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e8, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vnsrl.wi v8, v8, 0
@@ -196,9 +193,8 @@ define <vscale x 2 x i16> @vfptoui_nxv2i16_nxv2f16(<vscale x 2 x half> %va, <vsc
 ;
 ; ZVFHMIN-LABEL: vfptoui_nxv2i16_nxv2f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
 ; ZVFHMIN-NEXT:    vfncvt.rtz.xu.f.w v8, v9, v0.t
 ; ZVFHMIN-NEXT:    ret
   %v = call <vscale x 2 x i16> @llvm.vp.fptoui.nxv2i16.nxv2f16(<vscale x 2 x half> %va, <vscale x 2 x i1> %m, i32 %evl)
@@ -214,9 +210,8 @@ define <vscale x 2 x i16> @vfptoui_nxv2i16_nxv2f16_unmasked(<vscale x 2 x half>
 ;
 ; ZVFHMIN-LABEL: vfptoui_nxv2i16_nxv2f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
 ; ZVFHMIN-NEXT:    vfncvt.rtz.xu.f.w v8, v9
 ; ZVFHMIN-NEXT:    ret
   %v = call <vscale x 2 x i16> @llvm.vp.fptoui.nxv2i16.nxv2f16(<vscale x 2 x half> %va, <vscale x 2 x i1> splat (i1 true), i32 %evl)
@@ -235,9 +230,9 @@ define <vscale x 2 x i32> @vfptoui_nxv2i32_nxv2f16(<vscale x 2 x half> %va, <vsc
 ;
 ; ZVFHMIN-LABEL: vfptoui_nxv2i32_nxv2f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfcvt.rtz.xu.f.v v8, v9, v0.t
 ; ZVFHMIN-NEXT:    ret
   %v = call <vscale x 2 x i32> @llvm.vp.fptoui.nxv2i32.nxv2f16(<vscale x 2 x half> %va, <vscale x 2 x i1> %m, i32 %evl)
@@ -254,9 +249,9 @@ define <vscale x 2 x i32> @vfptoui_nxv2i32_nxv2f16_unmasked(<vscale x 2 x half>
 ;
 ; ZVFHMIN-LABEL: vfptoui_nxv2i32_nxv2f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfcvt.rtz.xu.f.v v8, v9
 ; ZVFHMIN-NEXT:    ret
   %v = call <vscale x 2 x i32> @llvm.vp.fptoui.nxv2i32.nxv2f16(<vscale x 2 x half> %va, <vscale x 2 x i1> splat (i1 true), i32 %evl)
@@ -276,9 +271,9 @@ define <vscale x 2 x i64> @vfptoui_nxv2i64_nxv2f16(<vscale x 2 x half> %va, <vsc
 ;
 ; ZVFHMIN-LABEL: vfptoui_nxv2i64_nxv2f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.rtz.xu.f.v v8, v10, v0.t
 ; ZVFHMIN-NEXT:    ret
   %v = call <vscale x 2 x i64> @llvm.vp.fptoui.nxv2i64.nxv2f16(<vscale x 2 x half> %va, <vscale x 2 x i1> %m, i32 %evl)
@@ -286,21 +281,13 @@ define <vscale x 2 x i64> @vfptoui_nxv2i64_nxv2f16(<vscale x 2 x half> %va, <vsc
 }
 
 define <vscale x 2 x i64> @vfptoui_nxv2i64_nxv2f16_unmasked(<vscale x 2 x half> %va, i32 zeroext %evl) {
-; ZVFH-LABEL: vfptoui_nxv2i64_nxv2f16_unmasked:
-; ZVFH:       # %bb.0:
-; ZVFH-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
-; ZVFH-NEXT:    vfwcvt.f.f.v v10, v8
-; ZVFH-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
-; ZVFH-NEXT:    vfwcvt.rtz.xu.f.v v8, v10
-; ZVFH-NEXT:    ret
-;
-; ZVFHMIN-LABEL: vfptoui_nxv2i64_nxv2f16_unmasked:
-; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.rtz.xu.f.v v8, v10
-; ZVFHMIN-NEXT:    ret
+; CHECK-LABEL: vfptoui_nxv2i64_nxv2f16_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
+; CHECK-NEXT:    vfwcvt.f.f.v v10, v8
+; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
+; CHECK-NEXT:    vfwcvt.rtz.xu.f.v v8, v10
+; CHECK-NEXT:    ret
   %v = call <vscale x 2 x i64> @llvm.vp.fptoui.nxv2i64.nxv2f16(<vscale x 2 x half> %va, <vscale x 2 x i1> splat (i1 true), i32 %evl)
   ret <vscale x 2 x i64> %v
 }
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfsqrt-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfsqrt-vp.ll
index 8e57be1..e94d0a6 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfsqrt-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfsqrt-vp.ll
@@ -242,9 +242,9 @@ define <vscale x 1 x half> @vfsqrt_vv_nxv1f16(<vscale x 1 x half> %va, <vscale x
 ;
 ; ZVFHMIN-LABEL: vfsqrt_vv_nxv1f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfsqrt.v v9, v9, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v9
@@ -262,9 +262,9 @@ define <vscale x 1 x half> @vfsqrt_vv_nxv1f16_unmasked(<vscale x 1 x half> %va,
 ;
 ; ZVFHMIN-LABEL: vfsqrt_vv_nxv1f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfsqrt.v v9, v9
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v9
@@ -284,9 +284,9 @@ define <vscale x 2 x half> @vfsqrt_vv_nxv2f16(<vscale x 2 x half> %va, <vscale x
 ;
 ; ZVFHMIN-LABEL: vfsqrt_vv_nxv2f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfsqrt.v v9, v9, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v9
@@ -304,9 +304,9 @@ define <vscale x 2 x half> @vfsqrt_vv_nxv2f16_unmasked(<vscale x 2 x half> %va,
 ;
 ; ZVFHMIN-LABEL: vfsqrt_vv_nxv2f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfsqrt.v v9, v9
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v9
@@ -326,9 +326,9 @@ define <vscale x 4 x half> @vfsqrt_vv_nxv4f16(<vscale x 4 x half> %va, <vscale x
 ;
 ; ZVFHMIN-LABEL: vfsqrt_vv_nxv4f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfsqrt.v v10, v10, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v10
@@ -346,9 +346,9 @@ define <vscale x 4 x half> @vfsqrt_vv_nxv4f16_unmasked(<vscale x 4 x half> %va,
 ;
 ; ZVFHMIN-LABEL: vfsqrt_vv_nxv4f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfsqrt.v v10, v10
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v10
@@ -368,9 +368,9 @@ define <vscale x 8 x half> @vfsqrt_vv_nxv8f16(<vscale x 8 x half> %va, <vscale x
 ;
 ; ZVFHMIN-LABEL: vfsqrt_vv_nxv8f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfsqrt.v v12, v12, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v12
@@ -388,9 +388,9 @@ define <vscale x 8 x half> @vfsqrt_vv_nxv8f16_unmasked(<vscale x 8 x half> %va,
 ;
 ; ZVFHMIN-LABEL: vfsqrt_vv_nxv8f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfsqrt.v v12, v12
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v12
@@ -410,9 +410,9 @@ define <vscale x 16 x half> @vfsqrt_vv_nxv16f16(<vscale x 16 x half> %va, <vscal
 ;
 ; ZVFHMIN-LABEL: vfsqrt_vv_nxv16f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfsqrt.v v16, v16, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
@@ -430,9 +430,9 @@ define <vscale x 16 x half> @vfsqrt_vv_nxv16f16_unmasked(<vscale x 16 x half> %v
 ;
 ; ZVFHMIN-LABEL: vfsqrt_vv_nxv16f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfsqrt.v v16, v16
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
@@ -458,13 +458,13 @@ define <vscale x 32 x half> @vfsqrt_vv_nxv32f16(<vscale x 32 x half> %va, <vscal
 ; ZVFHMIN-NEXT:    slli a1, a2, 1
 ; ZVFHMIN-NEXT:    srli a2, a2, 2
 ; ZVFHMIN-NEXT:    sub a3, a0, a1
+; ZVFHMIN-NEXT:    sltu a4, a0, a3
+; ZVFHMIN-NEXT:    addi a4, a4, -1
+; ZVFHMIN-NEXT:    and a3, a4, a3
 ; ZVFHMIN-NEXT:    vslidedown.vx v0, v0, a2
-; ZVFHMIN-NEXT:    sltu a2, a0, a3
-; ZVFHMIN-NEXT:    addi a2, a2, -1
-; ZVFHMIN-NEXT:    and a2, a2, a3
-; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a3, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
-; ZVFHMIN-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfsqrt.v v24, v24, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v24
@@ -472,9 +472,10 @@ define <vscale x 32 x half> @vfsqrt_vv_nxv32f16(<vscale x 32 x half> %va, <vscal
 ; ZVFHMIN-NEXT:  # %bb.1:
 ; ZVFHMIN-NEXT:    mv a0, a1
 ; ZVFHMIN-NEXT:  .LBB22_2:
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v8
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v16
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfsqrt.v v16, v24, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
@@ -498,14 +499,14 @@ define <vscale x 32 x half> @vfsqrt_vv_nxv32f16_unmasked(<vscale x 32 x half> %v
 ; ZVFHMIN-NEXT:    slli a1, a2, 1
 ; ZVFHMIN-NEXT:    srli a2, a2, 2
 ; ZVFHMIN-NEXT:    sub a3, a0, a1
+; ZVFHMIN-NEXT:    sltu a4, a0, a3
+; ZVFHMIN-NEXT:    addi a4, a4, -1
+; ZVFHMIN-NEXT:    and a3, a4, a3
 ; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vslidedown.vx v0, v16, a2
-; ZVFHMIN-NEXT:    sltu a2, a0, a3
-; ZVFHMIN-NEXT:    addi a2, a2, -1
-; ZVFHMIN-NEXT:    and a2, a2, a3
-; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a3, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
-; ZVFHMIN-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfsqrt.v v16, v16, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v16
@@ -513,8 +514,9 @@ define <vscale x 32 x half> @vfsqrt_vv_nxv32f16_unmasked(<vscale x 32 x half> %v
 ; ZVFHMIN-NEXT:  # %bb.1:
 ; ZVFHMIN-NEXT:    mv a0, a1
 ; ZVFHMIN-NEXT:  .LBB23_2:
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfsqrt.v v16, v16
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfsub-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfsub-vp.ll
index d034f65..56ed560 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfsub-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfsub-vp.ll
@@ -641,10 +641,10 @@ define <vscale x 1 x half> @vfsub_vv_nxv1f16(<vscale x 1 x half> %va, <vscale x
 ;
 ; ZVFHMIN-LABEL: vfsub_vv_nxv1f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfsub.vv v9, v9, v10, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v9
@@ -662,10 +662,10 @@ define <vscale x 1 x half> @vfsub_vv_nxv1f16_unmasked(<vscale x 1 x half> %va, <
 ;
 ; ZVFHMIN-LABEL: vfsub_vv_nxv1f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfsub.vv v9, v9, v10
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v9
@@ -684,11 +684,11 @@ define <vscale x 1 x half> @vfsub_vf_nxv1f16(<vscale x 1 x half> %va, half %b, <
 ; ZVFHMIN-LABEL: vfsub_vf_nxv1f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v9, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v9
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfsub.vv v9, v10, v8, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v9
@@ -709,11 +709,11 @@ define <vscale x 1 x half> @vfsub_vf_nxv1f16_unmasked(<vscale x 1 x half> %va, h
 ; ZVFHMIN-LABEL: vfsub_vf_nxv1f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v9, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v9
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfsub.vv v9, v10, v8
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v9
@@ -735,10 +735,10 @@ define <vscale x 2 x half> @vfsub_vv_nxv2f16(<vscale x 2 x half> %va, <vscale x
 ;
 ; ZVFHMIN-LABEL: vfsub_vv_nxv2f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfsub.vv v9, v9, v10, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v9
@@ -756,10 +756,10 @@ define <vscale x 2 x half> @vfsub_vv_nxv2f16_unmasked(<vscale x 2 x half> %va, <
 ;
 ; ZVFHMIN-LABEL: vfsub_vv_nxv2f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfsub.vv v9, v9, v10
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v9
@@ -778,11 +778,11 @@ define <vscale x 2 x half> @vfsub_vf_nxv2f16(<vscale x 2 x half> %va, half %b, <
 ; ZVFHMIN-LABEL: vfsub_vf_nxv2f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v9, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v9
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfsub.vv v9, v10, v8, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v9
@@ -803,11 +803,11 @@ define <vscale x 2 x half> @vfsub_vf_nxv2f16_unmasked(<vscale x 2 x half> %va, h
 ; ZVFHMIN-LABEL: vfsub_vf_nxv2f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v9, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v9
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfsub.vv v9, v10, v8
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v9
@@ -829,10 +829,10 @@ define <vscale x 4 x half> @vfsub_vv_nxv4f16(<vscale x 4 x half> %va, <vscale x
 ;
 ; ZVFHMIN-LABEL: vfsub_vv_nxv4f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfsub.vv v10, v12, v10, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v10
@@ -850,10 +850,10 @@ define <vscale x 4 x half> @vfsub_vv_nxv4f16_unmasked(<vscale x 4 x half> %va, <
 ;
 ; ZVFHMIN-LABEL: vfsub_vv_nxv4f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfsub.vv v10, v12, v10
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v10
@@ -872,11 +872,11 @@ define <vscale x 4 x half> @vfsub_vf_nxv4f16(<vscale x 4 x half> %va, half %b, <
 ; ZVFHMIN-LABEL: vfsub_vf_nxv4f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m1, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v9, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v9
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfsub.vv v10, v10, v12, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v10
@@ -897,11 +897,11 @@ define <vscale x 4 x half> @vfsub_vf_nxv4f16_unmasked(<vscale x 4 x half> %va, h
 ; ZVFHMIN-LABEL: vfsub_vf_nxv4f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m1, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v9, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v9
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfsub.vv v10, v10, v12
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v10
@@ -923,10 +923,10 @@ define <vscale x 8 x half> @vfsub_vv_nxv8f16(<vscale x 8 x half> %va, <vscale x
 ;
 ; ZVFHMIN-LABEL: vfsub_vv_nxv8f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v10
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfsub.vv v12, v16, v12, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v12
@@ -944,10 +944,10 @@ define <vscale x 8 x half> @vfsub_vv_nxv8f16_unmasked(<vscale x 8 x half> %va, <
 ;
 ; ZVFHMIN-LABEL: vfsub_vv_nxv8f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v10
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfsub.vv v12, v16, v12
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v12
@@ -966,11 +966,11 @@ define <vscale x 8 x half> @vfsub_vf_nxv8f16(<vscale x 8 x half> %va, half %b, <
 ; ZVFHMIN-LABEL: vfsub_vf_nxv8f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v10, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfsub.vv v12, v12, v16, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v12
@@ -991,11 +991,11 @@ define <vscale x 8 x half> @vfsub_vf_nxv8f16_unmasked(<vscale x 8 x half> %va, h
 ; ZVFHMIN-LABEL: vfsub_vf_nxv8f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v10, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfsub.vv v12, v12, v16
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v12
@@ -1017,10 +1017,10 @@ define <vscale x 16 x half> @vfsub_vv_nxv16f16(<vscale x 16 x half> %va, <vscale
 ;
 ; ZVFHMIN-LABEL: vfsub_vv_nxv16f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfsub.vv v16, v24, v16, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
@@ -1038,10 +1038,10 @@ define <vscale x 16 x half> @vfsub_vv_nxv16f16_unmasked(<vscale x 16 x half> %va
 ;
 ; ZVFHMIN-LABEL: vfsub_vv_nxv16f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfsub.vv v16, v24, v16
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
@@ -1060,11 +1060,11 @@ define <vscale x 16 x half> @vfsub_vf_nxv16f16(<vscale x 16 x half> %va, half %b
 ; ZVFHMIN-LABEL: vfsub_vf_nxv16f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v12, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfsub.vv v16, v16, v24, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
@@ -1085,11 +1085,11 @@ define <vscale x 16 x half> @vfsub_vf_nxv16f16_unmasked(<vscale x 16 x half> %va
 ; ZVFHMIN-LABEL: vfsub_vf_nxv16f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v12, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfsub.vv v16, v16, v24
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
@@ -1117,23 +1117,22 @@ define <vscale x 32 x half> @vfsub_vv_nxv32f16(<vscale x 32 x half> %va, <vscale
 ; ZVFHMIN-NEXT:    slli a1, a1, 3
 ; ZVFHMIN-NEXT:    sub sp, sp, a1
 ; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli a1, zero, e8, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmv1r.v v7, v0
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    addi a1, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20
 ; ZVFHMIN-NEXT:    slli a1, a2, 1
 ; ZVFHMIN-NEXT:    srli a2, a2, 2
 ; ZVFHMIN-NEXT:    sub a3, a0, a1
-; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vslidedown.vx v0, v0, a2
 ; ZVFHMIN-NEXT:    sltu a2, a0, a3
 ; ZVFHMIN-NEXT:    addi a2, a2, -1
 ; ZVFHMIN-NEXT:    and a2, a2, a3
-; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    addi a3, sp, 16
+; ZVFHMIN-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
-; ZVFHMIN-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfsub.vv v16, v16, v24, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v16
@@ -1143,10 +1142,11 @@ define <vscale x 32 x half> @vfsub_vv_nxv32f16(<vscale x 32 x half> %va, <vscale
 ; ZVFHMIN-NEXT:  .LBB44_2:
 ; ZVFHMIN-NEXT:    addi a1, sp, 16
 ; ZVFHMIN-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v24
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v8
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v7
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfsub.vv v16, v24, v16, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
@@ -1178,22 +1178,21 @@ define <vscale x 32 x half> @vfsub_vv_nxv32f16_unmasked(<vscale x 32 x half> %va
 ; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e8, m4, ta, ma
-; ZVFHMIN-NEXT:    vmset.m v7
-; ZVFHMIN-NEXT:    addi a1, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20
+; ZVFHMIN-NEXT:    vmset.m v24
 ; ZVFHMIN-NEXT:    slli a1, a2, 1
 ; ZVFHMIN-NEXT:    srli a2, a2, 2
 ; ZVFHMIN-NEXT:    sub a3, a0, a1
 ; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT:    vslidedown.vx v0, v7, a2
+; ZVFHMIN-NEXT:    vslidedown.vx v0, v24, a2
 ; ZVFHMIN-NEXT:    sltu a2, a0, a3
 ; ZVFHMIN-NEXT:    addi a2, a2, -1
 ; ZVFHMIN-NEXT:    and a2, a2, a3
-; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    addi a3, sp, 16
+; ZVFHMIN-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
-; ZVFHMIN-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfsub.vv v16, v16, v24, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v16
@@ -1203,9 +1202,10 @@ define <vscale x 32 x half> @vfsub_vv_nxv32f16_unmasked(<vscale x 32 x half> %va
 ; ZVFHMIN-NEXT:  .LBB45_2:
 ; ZVFHMIN-NEXT:    addi a1, sp, 16
 ; ZVFHMIN-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v24
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfsub.vv v16, v24, v16
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
@@ -1236,14 +1236,10 @@ define <vscale x 32 x half> @vfsub_vf_nxv32f16(<vscale x 32 x half> %va, half %b
 ; ZVFHMIN-NEXT:    add a1, a2, a1
 ; ZVFHMIN-NEXT:    sub sp, sp, a1
 ; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x11, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 17 * vlenb
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vmv8r.v v16, v8
+; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m8, ta, ma
+; ZVFHMIN-NEXT:    vmv8r.v v24, v8
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    addi a3, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v20
-; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m8, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v16, a1
 ; ZVFHMIN-NEXT:    csrr a1, vlenb
 ; ZVFHMIN-NEXT:    slli a3, a1, 3
@@ -1264,15 +1260,18 @@ define <vscale x 32 x half> @vfsub_vf_nxv32f16(<vscale x 32 x half> %va, half %b
 ; ZVFHMIN-NEXT:    sltu a2, a0, a3
 ; ZVFHMIN-NEXT:    addi a2, a2, -1
 ; ZVFHMIN-NEXT:    and a2, a2, a3
-; ZVFHMIN-NEXT:    csrr a3, vlenb
-; ZVFHMIN-NEXT:    slli a4, a3, 3
-; ZVFHMIN-NEXT:    add a3, a4, a3
-; ZVFHMIN-NEXT:    add a3, sp, a3
-; ZVFHMIN-NEXT:    addi a3, a3, 16
-; ZVFHMIN-NEXT:    vl8r.v v24, (a3) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    addi a3, sp, 16
+; ZVFHMIN-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v28
+; ZVFHMIN-NEXT:    csrr a2, vlenb
+; ZVFHMIN-NEXT:    slli a3, a2, 3
+; ZVFHMIN-NEXT:    add a2, a3, a2
+; ZVFHMIN-NEXT:    add a2, sp, a2
+; ZVFHMIN-NEXT:    addi a2, a2, 16
+; ZVFHMIN-NEXT:    vl8r.v v24, (a2) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v28
-; ZVFHMIN-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfsub.vv v16, v8, v16, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v16
@@ -1282,20 +1281,21 @@ define <vscale x 32 x half> @vfsub_vf_nxv32f16(<vscale x 32 x half> %va, half %b
 ; ZVFHMIN-NEXT:  .LBB46_2:
 ; ZVFHMIN-NEXT:    addi a1, sp, 16
 ; ZVFHMIN-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v24
-; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a2, a1, 3
-; ZVFHMIN-NEXT:    add a1, a2, a1
-; ZVFHMIN-NEXT:    add a1, sp, a1
-; ZVFHMIN-NEXT:    addi a1, a1, 16
-; ZVFHMIN-NEXT:    vl8r.v v0, (a1) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    csrr a0, vlenb
+; ZVFHMIN-NEXT:    slli a1, a0, 3
+; ZVFHMIN-NEXT:    add a0, a1, a0
+; ZVFHMIN-NEXT:    add a0, sp, a0
+; ZVFHMIN-NEXT:    addi a0, a0, 16
+; ZVFHMIN-NEXT:    vl8r.v v0, (a0) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v0
-; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 3
-; ZVFHMIN-NEXT:    add a1, sp, a1
-; ZVFHMIN-NEXT:    addi a1, a1, 16
-; ZVFHMIN-NEXT:    vl1r.v v0, (a1) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    csrr a0, vlenb
+; ZVFHMIN-NEXT:    slli a0, a0, 3
+; ZVFHMIN-NEXT:    add a0, sp, a0
+; ZVFHMIN-NEXT:    addi a0, a0, 16
+; ZVFHMIN-NEXT:    vl1r.v v0, (a0) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfsub.vv v16, v16, v24, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
@@ -1328,16 +1328,10 @@ define <vscale x 32 x half> @vfsub_vf_nxv32f16_unmasked(<vscale x 32 x half> %va
 ; ZVFHMIN-NEXT:    slli a1, a1, 4
 ; ZVFHMIN-NEXT:    sub sp, sp, a1
 ; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e8, m4, ta, ma
-; ZVFHMIN-NEXT:    vmv8r.v v16, v8
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    vmset.m v7
-; ZVFHMIN-NEXT:    addi a3, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v20
 ; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m8, ta, ma
+; ZVFHMIN-NEXT:    vmset.m v24
 ; ZVFHMIN-NEXT:    vmv.v.x v16, a1
 ; ZVFHMIN-NEXT:    csrr a1, vlenb
 ; ZVFHMIN-NEXT:    slli a1, a1, 3
@@ -1348,18 +1342,22 @@ define <vscale x 32 x half> @vfsub_vf_nxv32f16_unmasked(<vscale x 32 x half> %va
 ; ZVFHMIN-NEXT:    srli a2, a2, 2
 ; ZVFHMIN-NEXT:    sub a3, a0, a1
 ; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT:    vslidedown.vx v0, v7, a2
+; ZVFHMIN-NEXT:    vslidedown.vx v0, v24, a2
 ; ZVFHMIN-NEXT:    sltu a2, a0, a3
 ; ZVFHMIN-NEXT:    addi a2, a2, -1
 ; ZVFHMIN-NEXT:    and a2, a2, a3
-; ZVFHMIN-NEXT:    csrr a3, vlenb
-; ZVFHMIN-NEXT:    slli a3, a3, 3
-; ZVFHMIN-NEXT:    add a3, sp, a3
-; ZVFHMIN-NEXT:    addi a3, a3, 16
-; ZVFHMIN-NEXT:    vl8r.v v24, (a3) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vmv8r.v v16, v8
+; ZVFHMIN-NEXT:    addi a3, sp, 16
+; ZVFHMIN-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v20
+; ZVFHMIN-NEXT:    csrr a2, vlenb
+; ZVFHMIN-NEXT:    slli a2, a2, 3
+; ZVFHMIN-NEXT:    add a2, sp, a2
+; ZVFHMIN-NEXT:    addi a2, a2, 16
+; ZVFHMIN-NEXT:    vl8r.v v24, (a2) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v28
-; ZVFHMIN-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfsub.vv v16, v8, v16, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v16
@@ -1369,14 +1367,15 @@ define <vscale x 32 x half> @vfsub_vf_nxv32f16_unmasked(<vscale x 32 x half> %va
 ; ZVFHMIN-NEXT:  .LBB47_2:
 ; ZVFHMIN-NEXT:    addi a1, sp, 16
 ; ZVFHMIN-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v24
-; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 3
-; ZVFHMIN-NEXT:    add a1, sp, a1
-; ZVFHMIN-NEXT:    addi a1, a1, 16
-; ZVFHMIN-NEXT:    vl8r.v v0, (a1) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    csrr a0, vlenb
+; ZVFHMIN-NEXT:    slli a0, a0, 3
+; ZVFHMIN-NEXT:    add a0, sp, a0
+; ZVFHMIN-NEXT:    addi a0, a0, 16
+; ZVFHMIN-NEXT:    vl8r.v v0, (a0) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v0
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfsub.vv v16, v16, v24
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfwmacc-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfwmacc-vp.ll
index 6cd3884..a1d548e 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfwmacc-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfwmacc-vp.ll
@@ -143,9 +143,8 @@ define <vscale x 1 x float> @vfmacc_vf_nxv1f32(<vscale x 1 x half> %va, half %b,
 ; ZVFHMIN-LABEL: vfmacc_vf_nxv1f32:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, mf4, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v10, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vmv.v.x v10, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v11, v8, v0.t
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v10, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
@@ -170,9 +169,8 @@ define <vscale x 1 x float> @vfmacc_vf_nxv1f32_commute(<vscale x 1 x half> %va,
 ; ZVFHMIN-LABEL: vfmacc_vf_nxv1f32_commute:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, mf4, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v11, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vmv.v.x v11, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8, v0.t
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v11, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
@@ -198,9 +196,8 @@ define <vscale x 1 x float> @vfmacc_vf_nxv1f32_unmasked(<vscale x 1 x half> %va,
 ; ZVFHMIN-LABEL: vfmacc_vf_nxv1f32_unmasked:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, mf4, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v10, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vmv.v.x v10, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v11, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v10
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
@@ -225,9 +222,8 @@ define <vscale x 1 x float> @vfmacc_vf_nxv1f32_tu(<vscale x 1 x half> %va, half
 ; ZVFHMIN-LABEL: vfmacc_vf_nxv1f32_tu:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, mf4, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v10, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vmv.v.x v10, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v11, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v10
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, tu, mu
@@ -254,9 +250,8 @@ define <vscale x 1 x float> @vfmacc_vf_nxv1f32_commute_tu(<vscale x 1 x half> %v
 ; ZVFHMIN-LABEL: vfmacc_vf_nxv1f32_commute_tu:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, mf4, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v10, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vmv.v.x v10, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v11, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v10
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, tu, mu
@@ -283,9 +278,8 @@ define <vscale x 1 x float> @vfmacc_vf_nxv1f32_unmasked_tu(<vscale x 1 x half> %
 ; ZVFHMIN-LABEL: vfmacc_vf_nxv1f32_unmasked_tu:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, mf4, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v10, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vmv.v.x v10, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v11, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v10
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, tu, ma
@@ -362,9 +356,8 @@ define <vscale x 2 x float> @vfmacc_vf_nxv2f32(<vscale x 2 x half> %va, half %b,
 ; ZVFHMIN-LABEL: vfmacc_vf_nxv2f32:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, mf2, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v10, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
+; ZVFHMIN-NEXT:    vmv.v.x v10, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v11, v8, v0.t
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v10, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
@@ -389,9 +382,8 @@ define <vscale x 2 x float> @vfmacc_vf_nxv2f32_unmasked(<vscale x 2 x half> %va,
 ; ZVFHMIN-LABEL: vfmacc_vf_nxv2f32_unmasked:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, mf2, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v10, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
+; ZVFHMIN-NEXT:    vmv.v.x v10, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v11, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v10
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
@@ -468,9 +460,8 @@ define <vscale x 4 x float> @vfmacc_vf_nxv4f32(<vscale x 4 x half> %va, half %b,
 ; ZVFHMIN-LABEL: vfmacc_vf_nxv4f32:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v12, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
+; ZVFHMIN-NEXT:    vmv.v.x v12, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v14, v8, v0.t
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v12, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
@@ -495,9 +486,8 @@ define <vscale x 4 x float> @vfmacc_vf_nxv4f32_unmasked(<vscale x 4 x half> %va,
 ; ZVFHMIN-LABEL: vfmacc_vf_nxv4f32_unmasked:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v12, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
+; ZVFHMIN-NEXT:    vmv.v.x v12, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v14, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v12
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
@@ -574,9 +564,8 @@ define <vscale x 8 x float> @vfmacc_vf_nxv8f32(<vscale x 8 x half> %va, half %b,
 ; ZVFHMIN-LABEL: vfmacc_vf_nxv8f32:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m2, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v16, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vmv.v.x v16, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v20, v8, v0.t
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v16, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
@@ -601,9 +590,8 @@ define <vscale x 8 x float> @vfmacc_vf_nxv8f32_unmasked(<vscale x 8 x half> %va,
 ; ZVFHMIN-LABEL: vfmacc_vf_nxv8f32_unmasked:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m2, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v16, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vmv.v.x v16, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v20, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v16
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
@@ -696,9 +684,8 @@ define <vscale x 16 x float> @vfmacc_vf_nxv16f32(<vscale x 16 x half> %va, half
 ; ZVFHMIN-LABEL: vfmacc_vf_nxv16f32:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v4, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vmv.v.x v4, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v8, v0.t
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v4, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
@@ -723,9 +710,8 @@ define <vscale x 16 x float> @vfmacc_vf_nxv16f32_unmasked(<vscale x 16 x half> %
 ; ZVFHMIN-LABEL: vfmacc_vf_nxv16f32_unmasked:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v24, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vmv.v.x v24, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v0, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v24
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfwmsac-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfwmsac-vp.ll
index c92a79e..94b8007 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfwmsac-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfwmsac-vp.ll
@@ -120,9 +120,8 @@ define <vscale x 1 x float> @vmfsac_vf_nxv1f32(<vscale x 1 x half> %a, half %b,
 ; ZVFHMIN-LABEL: vmfsac_vf_nxv1f32:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, mf4, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v10, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vmv.v.x v10, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v11, v8, v0.t
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v10, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
@@ -148,9 +147,8 @@ define <vscale x 1 x float> @vmfsac_vf_nxv1f32_commute(<vscale x 1 x half> %a, h
 ; ZVFHMIN-LABEL: vmfsac_vf_nxv1f32_commute:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, mf4, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v11, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vmv.v.x v11, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8, v0.t
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v11, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
@@ -177,9 +175,8 @@ define <vscale x 1 x float> @vmfsac_vf_nxv1f32_unmasked(<vscale x 1 x half> %a,
 ; ZVFHMIN-LABEL: vmfsac_vf_nxv1f32_unmasked:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, mf4, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v10, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vmv.v.x v10, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v11, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v10
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
@@ -255,9 +252,8 @@ define <vscale x 2 x float> @vmfsac_vf_nxv2f32(<vscale x 2 x half> %a, half %b,
 ; ZVFHMIN-LABEL: vmfsac_vf_nxv2f32:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, mf2, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v10, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
+; ZVFHMIN-NEXT:    vmv.v.x v10, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v11, v8, v0.t
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v10, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
@@ -283,9 +279,8 @@ define <vscale x 2 x float> @vmfsac_vf_nxv2f32_commute(<vscale x 2 x half> %a, h
 ; ZVFHMIN-LABEL: vmfsac_vf_nxv2f32_commute:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, mf2, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v11, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
+; ZVFHMIN-NEXT:    vmv.v.x v11, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8, v0.t
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v11, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
@@ -312,9 +307,8 @@ define <vscale x 2 x float> @vmfsac_vf_nxv2f32_unmasked(<vscale x 2 x half> %a,
 ; ZVFHMIN-LABEL: vmfsac_vf_nxv2f32_unmasked:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, mf2, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v10, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
+; ZVFHMIN-NEXT:    vmv.v.x v10, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v11, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v10
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
@@ -392,9 +386,8 @@ define <vscale x 4 x float> @vmfsac_vf_nxv4f32(<vscale x 4 x half> %a, half %b,
 ; ZVFHMIN-LABEL: vmfsac_vf_nxv4f32:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v12, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
+; ZVFHMIN-NEXT:    vmv.v.x v12, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v14, v8, v0.t
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v12, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
@@ -420,9 +413,8 @@ define <vscale x 4 x float> @vmfsac_vf_nxv4f32_commute(<vscale x 4 x half> %a, h
 ; ZVFHMIN-LABEL: vmfsac_vf_nxv4f32_commute:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v9, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
+; ZVFHMIN-NEXT:    vmv.v.x v9, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8, v0.t
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v14, v9, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
@@ -449,9 +441,8 @@ define <vscale x 4 x float> @vmfsac_vf_nxv4f32_unmasked(<vscale x 4 x half> %a,
 ; ZVFHMIN-LABEL: vmfsac_vf_nxv4f32_unmasked:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v12, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
+; ZVFHMIN-NEXT:    vmv.v.x v12, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v14, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v12
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
@@ -529,9 +520,8 @@ define <vscale x 8 x float> @vmfsac_vf_nxv8f32(<vscale x 8 x half> %a, half %b,
 ; ZVFHMIN-LABEL: vmfsac_vf_nxv8f32:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m2, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v16, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vmv.v.x v16, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v20, v8, v0.t
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v16, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
@@ -557,9 +547,8 @@ define <vscale x 8 x float> @vmfsac_vf_nxv8f32_commute(<vscale x 8 x half> %a, h
 ; ZVFHMIN-LABEL: vmfsac_vf_nxv8f32_commute:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m2, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v10, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vmv.v.x v10, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8, v0.t
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v20, v10, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
@@ -586,9 +575,8 @@ define <vscale x 8 x float> @vmfsac_vf_nxv8f32_unmasked(<vscale x 8 x half> %a,
 ; ZVFHMIN-LABEL: vmfsac_vf_nxv8f32_unmasked:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m2, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v16, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vmv.v.x v16, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v20, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v16
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfwnmacc-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfwnmacc-vp.ll
index 0a0bc66..ea45706 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfwnmacc-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfwnmacc-vp.ll
@@ -71,9 +71,8 @@ define <vscale x 1 x float> @vfnmacc_vf_nxv1f32(<vscale x 1 x half> %a, half %b,
 ; ZVFHMIN-LABEL: vfnmacc_vf_nxv1f32:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, mf4, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v10, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vmv.v.x v10, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v11, v8, v0.t
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v10, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
@@ -101,9 +100,8 @@ define <vscale x 1 x float> @vfnmacc_vf_nxv1f32_commute(<vscale x 1 x half> %a,
 ; ZVFHMIN-LABEL: vfnmacc_vf_nxv1f32_commute:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, mf4, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v11, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vmv.v.x v11, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8, v0.t
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v11, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
@@ -131,9 +129,8 @@ define <vscale x 1 x float> @vfnmacc_vf_nxv1f32_unmasked(<vscale x 1 x half> %a,
 ; ZVFHMIN-LABEL: vfnmacc_vf_nxv1f32_unmasked:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, mf4, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v10, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vmv.v.x v10, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v11, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v10
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
@@ -212,9 +209,8 @@ define <vscale x 2 x float> @vfnmacc_vf_nxv2f32(<vscale x 2 x half> %a, half %b,
 ; ZVFHMIN-LABEL: vfnmacc_vf_nxv2f32:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, mf2, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v10, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
+; ZVFHMIN-NEXT:    vmv.v.x v10, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v11, v8, v0.t
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v10, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
@@ -242,9 +238,8 @@ define <vscale x 2 x float> @vfnmacc_vf_nxv2f32_commute(<vscale x 2 x half> %a,
 ; ZVFHMIN-LABEL: vfnmacc_vf_nxv2f32_commute:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, mf2, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v11, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
+; ZVFHMIN-NEXT:    vmv.v.x v11, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8, v0.t
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v11, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
@@ -272,9 +267,8 @@ define <vscale x 2 x float> @vfnmacc_vf_nxv2f32_unmasked(<vscale x 2 x half> %a,
 ; ZVFHMIN-LABEL: vfnmacc_vf_nxv2f32_unmasked:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, mf2, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v10, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
+; ZVFHMIN-NEXT:    vmv.v.x v10, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v11, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v10
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
@@ -355,9 +349,8 @@ define <vscale x 4 x float> @vfnmacc_vf_nxv4f32(<vscale x 4 x half> %a, half %b,
 ; ZVFHMIN-LABEL: vfnmacc_vf_nxv4f32:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v12, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
+; ZVFHMIN-NEXT:    vmv.v.x v12, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v14, v8, v0.t
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v12, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
@@ -385,9 +378,8 @@ define <vscale x 4 x float> @vfnmacc_vf_nxv4f32_commute(<vscale x 4 x half> %a,
 ; ZVFHMIN-LABEL: vfnmacc_vf_nxv4f32_commute:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v9, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
+; ZVFHMIN-NEXT:    vmv.v.x v9, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8, v0.t
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v14, v9, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
@@ -415,9 +407,8 @@ define <vscale x 4 x float> @vfnmacc_vf_nxv4f32_unmasked(<vscale x 4 x half> %a,
 ; ZVFHMIN-LABEL: vfnmacc_vf_nxv4f32_unmasked:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v12, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
+; ZVFHMIN-NEXT:    vmv.v.x v12, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v14, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v12
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
@@ -498,9 +489,8 @@ define <vscale x 8 x float> @vfnmacc_vf_nxv8f32(<vscale x 8 x half> %a, half %b,
 ; ZVFHMIN-LABEL: vfnmacc_vf_nxv8f32:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m2, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v16, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vmv.v.x v16, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v20, v8, v0.t
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v16, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
@@ -528,9 +518,8 @@ define <vscale x 8 x float> @vfnmacc_vf_nxv8f32_commute(<vscale x 8 x half> %a,
 ; ZVFHMIN-LABEL: vfnmacc_vf_nxv8f32_commute:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m2, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v10, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vmv.v.x v10, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8, v0.t
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v20, v10, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
@@ -558,9 +547,8 @@ define <vscale x 8 x float> @vfnmacc_vf_nxv8f32_unmasked(<vscale x 8 x half> %a,
 ; ZVFHMIN-LABEL: vfnmacc_vf_nxv8f32_unmasked:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m2, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v16, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vmv.v.x v16, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v20, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v16
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
@@ -657,9 +645,8 @@ define <vscale x 16 x float> @vfnmacc_vf_nxv16f32(<vscale x 16 x half> %a, half
 ; ZVFHMIN-LABEL: vfnmacc_vf_nxv16f32:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v4, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vmv.v.x v4, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v8, v0.t
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v4, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
@@ -687,9 +674,8 @@ define <vscale x 16 x float> @vfnmacc_vf_nxv16f32_commute(<vscale x 16 x half> %
 ; ZVFHMIN-LABEL: vfnmacc_vf_nxv16f32_commute:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v4, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vmv.v.x v4, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v8, v0.t
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v4, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
@@ -717,9 +703,8 @@ define <vscale x 16 x float> @vfnmacc_vf_nxv16f32_unmasked(<vscale x 16 x half>
 ; ZVFHMIN-LABEL: vfnmacc_vf_nxv16f32_unmasked:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v24, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vmv.v.x v24, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v0, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v24
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfwnmsac-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfwnmsac-vp.ll
index b5f7ef3..4956da5 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfwnmsac-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfwnmsac-vp.ll
@@ -69,9 +69,8 @@ define <vscale x 1 x float> @vfnmsac_vf_nxv1f32(<vscale x 1 x half> %a, half %b,
 ; ZVFHMIN-LABEL: vfnmsac_vf_nxv1f32:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, mf4, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v10, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vmv.v.x v10, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v11, v8, v0.t
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v10, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
@@ -97,9 +96,8 @@ define <vscale x 1 x float> @vfnmsac_vf_nxv1f32_commute(<vscale x 1 x half> %a,
 ; ZVFHMIN-LABEL: vfnmsac_vf_nxv1f32_commute:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, mf4, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v11, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vmv.v.x v11, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8, v0.t
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v11, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
@@ -126,9 +124,8 @@ define <vscale x 1 x float> @vfnmsac_vf_nxv1f32_unmasked(<vscale x 1 x half> %a,
 ; ZVFHMIN-LABEL: vfnmsac_vf_nxv1f32_unmasked:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, mf4, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v10, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vmv.v.x v10, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v11, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v10
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
@@ -204,9 +201,8 @@ define <vscale x 2 x float> @vfnmsac_vf_nxv2f32(<vscale x 2 x half> %a, half %b,
 ; ZVFHMIN-LABEL: vfnmsac_vf_nxv2f32:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, mf2, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v10, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
+; ZVFHMIN-NEXT:    vmv.v.x v10, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v11, v8, v0.t
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v10, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
@@ -232,9 +228,8 @@ define <vscale x 2 x float> @vfnmsac_vf_nxv2f32_commute(<vscale x 2 x half> %a,
 ; ZVFHMIN-LABEL: vfnmsac_vf_nxv2f32_commute:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, mf2, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v11, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
+; ZVFHMIN-NEXT:    vmv.v.x v11, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8, v0.t
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v11, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
@@ -261,9 +256,8 @@ define <vscale x 2 x float> @vfnmsac_vf_nxv2f32_unmasked(<vscale x 2 x half> %a,
 ; ZVFHMIN-LABEL: vfnmsac_vf_nxv2f32_unmasked:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, mf2, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v10, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
+; ZVFHMIN-NEXT:    vmv.v.x v10, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v11, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v10
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
@@ -341,9 +335,8 @@ define <vscale x 4 x float> @vfnmsac_vf_nxv4f32(<vscale x 4 x half> %a, half %b,
 ; ZVFHMIN-LABEL: vfnmsac_vf_nxv4f32:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v12, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
+; ZVFHMIN-NEXT:    vmv.v.x v12, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v14, v8, v0.t
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v12, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
@@ -369,9 +362,8 @@ define <vscale x 4 x float> @vfnmsac_vf_nxv4f32_commute(<vscale x 4 x half> %a,
 ; ZVFHMIN-LABEL: vfnmsac_vf_nxv4f32_commute:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v9, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
+; ZVFHMIN-NEXT:    vmv.v.x v9, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8, v0.t
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v14, v9, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
@@ -398,9 +390,8 @@ define <vscale x 4 x float> @vfnmsac_vf_nxv4f32_unmasked(<vscale x 4 x half> %a,
 ; ZVFHMIN-LABEL: vfnmsac_vf_nxv4f32_unmasked:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v12, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
+; ZVFHMIN-NEXT:    vmv.v.x v12, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v14, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v12
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
@@ -478,9 +469,8 @@ define <vscale x 8 x float> @vfnmsac_vf_nxv8f32(<vscale x 8 x half> %a, half %b,
 ; ZVFHMIN-LABEL: vfnmsac_vf_nxv8f32:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m2, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v16, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vmv.v.x v16, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v20, v8, v0.t
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v16, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
@@ -506,9 +496,8 @@ define <vscale x 8 x float> @vfnmsac_vf_nxv8f32_commute(<vscale x 8 x half> %a,
 ; ZVFHMIN-LABEL: vfnmsac_vf_nxv8f32_commute:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m2, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v10, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vmv.v.x v10, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8, v0.t
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v20, v10, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
@@ -535,9 +524,8 @@ define <vscale x 8 x float> @vfnmsac_vf_nxv8f32_unmasked(<vscale x 8 x half> %a,
 ; ZVFHMIN-LABEL: vfnmsac_vf_nxv8f32_unmasked:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m2, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v16, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vmv.v.x v16, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v20, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v16
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
@@ -631,9 +619,8 @@ define <vscale x 16 x float> @vfnmsac_vf_nxv16f32(<vscale x 16 x half> %a, half
 ; ZVFHMIN-LABEL: vfnmsac_vf_nxv16f32:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v4, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vmv.v.x v4, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v8, v0.t
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v4, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
@@ -659,9 +646,8 @@ define <vscale x 16 x float> @vfnmsac_vf_nxv16f32_commute(<vscale x 16 x half> %
 ; ZVFHMIN-LABEL: vfnmsac_vf_nxv16f32_commute:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v4, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vmv.v.x v4, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v8, v0.t
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v4, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
@@ -688,9 +674,8 @@ define <vscale x 16 x float> @vfnmsac_vf_nxv16f32_unmasked(<vscale x 16 x half>
 ; ZVFHMIN-LABEL: vfnmsac_vf_nxv16f32_unmasked:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v24, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vmv.v.x v24, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v0, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v24
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/vl-opt-op-info.mir b/llvm/test/CodeGen/RISCV/rvv/vl-opt-op-info.mir
index b304769b..c4a3834 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vl-opt-op-info.mir
+++ b/llvm/test/CodeGen/RISCV/rvv/vl-opt-op-info.mir
@@ -513,6 +513,36 @@ body: |
     %y:vr = PseudoVNSRL_WV_MF2 $noreg, $noreg, %x, 1, 3 /* e8 */, 0
 ...
 ---
+name: vfnop_vs2
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: vfnop_vs2
+    ; CHECK: %x:vr = PseudoVADD_VV_M1 $noreg, $noreg, $noreg, 1, 4 /* e16 */, 0 /* tu, mu */
+    ; CHECK-NEXT: early-clobber %y:vr = PseudoVFNCVT_X_F_W_MF2 $noreg, %x, 0, 1, 3 /* e8 */, 0 /* tu, mu */
+    %x:vr = PseudoVADD_VV_M1 $noreg, $noreg, $noreg, -1, 4 /* e16 */, 0
+    early-clobber %y:vr = PseudoVFNCVT_X_F_W_MF2 $noreg, %x, 0, 1, 3 /* e8 */, 0
+...
+---
+name: vfnop_vs2_incompatible_eew
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: vfnop_vs2_incompatible_eew
+    ; CHECK: %x:vr = PseudoVADD_VV_M1 $noreg, $noreg, $noreg, -1, 3 /* e8 */, 0 /* tu, mu */
+    ; CHECK-NEXT: early-clobber %y:vr = PseudoVFNCVT_X_F_W_MF2 $noreg, %x, 0, 1, 4 /* e16 */, 0 /* tu, mu */
+    %x:vr = PseudoVADD_VV_M1 $noreg, $noreg, $noreg, -1, 3 /* e8 */, 0
+    early-clobber %y:vr = PseudoVFNCVT_X_F_W_MF2 $noreg, %x, 0, 1, 4 /* e16 */, 0
+...
+---
+name: vfnop_vs2_incompatible_emul
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: vfnop_vs2_incompatible_emul
+    ; CHECK: %x:vr = PseudoVADD_VV_MF2 $noreg, $noreg, $noreg, -1, 4 /* e16 */, 0 /* tu, mu */
+    ; CHECK-NEXT: early-clobber %y:vr = PseudoVFNCVT_X_F_W_MF2 $noreg, %x, 0, 1, 3 /* e8 */, 0 /* tu, mu */
+    %x:vr = PseudoVADD_VV_MF2 $noreg, $noreg, $noreg, -1, 4 /* e16 */, 0
+    early-clobber %y:vr = PseudoVFNCVT_X_F_W_MF2 $noreg, %x, 0, 1, 3 /* e8 */, 0
+...
+---
 name: vseN_v
 body: |
   bb.0:
@@ -543,6 +573,86 @@ body: |
     PseudoVSE8_V_MF2 %x, $noreg, 1, 3 /* e8 */
 ...
 ---
+name: vsm_v
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: vsm_v
+    ; CHECK: %x:vr = PseudoVMAND_MM_B8 $noreg, $noreg, 1, 0 /* e8 */
+    ; CHECK-NEXT: PseudoVSM_V_B8 %x, $noreg, 1, 0 /* e8 */
+    %x:vr = PseudoVMAND_MM_B8 $noreg, $noreg, -1, 0
+    PseudoVSM_V_B8 %x, $noreg, 1, 0
+...
+---
+name: vsm_v_incompatible_emul
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: vsm_v_incompatible_emul
+    ; CHECK: %x:vr = PseudoVMAND_MM_B8 $noreg, $noreg, -1, 0 /* e8 */
+    ; CHECK-NEXT: PseudoVSM_V_B16 %x, $noreg, 1, 0 /* e8 */
+    %x:vr = PseudoVMAND_MM_B8 $noreg, $noreg, -1, 0
+    PseudoVSM_V_B16 %x, $noreg, 1, 0
+...
+---
+name: vleN_v
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: vleN_v
+    ; CHECK: %x:vr = PseudoVLE8_V_M1 $noreg, $noreg, 1, 3 /* e8 */, 0 /* tu, mu */
+    ; CHECK-NEXT: %y:vr = PseudoVADD_VV_M1 $noreg, %x, $noreg, 1, 3 /* e8 */, 0 /* tu, mu */
+    %x:vr = PseudoVLE8_V_M1 $noreg, $noreg, -1, 3 /* e8 */, 0
+    %y:vr = PseudoVADD_VV_M1 $noreg, %x, $noreg, 1, 3 /* e8 */, 0
+...
+---
+name: vleN_v_incompatible_eew
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: vleN_v_incompatible_eew
+    ; CHECK: %x:vr = PseudoVLE8_V_M1 $noreg, $noreg, -1, 3 /* e8 */, 0 /* tu, mu */
+    ; CHECK-NEXT: %y:vr = PseudoVADD_VV_M1 $noreg, %x, $noreg, 1, 4 /* e16 */, 0 /* tu, mu */
+    %x:vr = PseudoVLE8_V_M1 $noreg, $noreg, -1, 3 /* e8 */, 0
+    %y:vr = PseudoVADD_VV_M1 $noreg, %x, $noreg, 1, 4 /* e16 */, 0
+...
+---
+name: vleN_v_incompatible_emul
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: vleN_v_incompatible_emul
+    ; CHECK: %x:vr = PseudoVLE8_V_M1 $noreg, $noreg, -1, 3 /* e8 */, 0 /* tu, mu */
+    ; CHECK-NEXT: %x:vr = PseudoVADD_VV_MF2 $noreg, %x, $noreg, 1, 3 /* e8 */, 0 /* tu, mu */
+    %x:vr = PseudoVLE8_V_M1 $noreg, $noreg, -1, 3 /* e8 */, 0
+    %x:vr = PseudoVADD_VV_MF2 $noreg, %x, $noreg, 1, 3 /* e8 */, 0
+...
+---
+name: vlm_v
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: vlm_v
+    ; CHECK: %x:vr = PseudoVLM_V_B8 $noreg, $noreg, 1, 0 /* e8 */, 0 /* tu, mu */
+    ; CHECK-NEXT: %y:vr = PseudoVMAND_MM_B8 $noreg, %x, 1, 0 /* e8 */
+    %x:vr = PseudoVLM_V_B8 $noreg, $noreg, -1, 0, 0
+    %y:vr = PseudoVMAND_MM_B8 $noreg, %x, 1, 0
+...
+---
+name: vlm_v_incompatible_eew
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: vlm_v_incompatible_eew
+    ; CHECK: %x:vr = PseudoVLM_V_B8 $noreg, $noreg, -1, 0 /* e8 */, 0 /* tu, mu */
+    ; CHECK-NEXT: %y:vr = PseudoVADD_VV_M1 $noreg, $noreg, %x, 1, 4 /* e16 */, 0 /* tu, mu */
+    %x:vr = PseudoVLM_V_B8 $noreg, $noreg, -1, 0, 0
+    %y:vr = PseudoVADD_VV_M1 $noreg, $noreg, %x, 1, 4 /* e16 */, 0
+...
+---
+name: vlm_v_incompatible_emul
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: vlm_v_incompatible_emul
+    ; CHECK: %x:vr = PseudoVLM_V_B8 $noreg, $noreg, -1, 0 /* e8 */, 0 /* tu, mu */
+    ; CHECK-NEXT: %y:vr = PseudoVMAND_MM_B16 $noreg, %x, 1, 0 /* e8 */
+    %x:vr = PseudoVLM_V_B8 $noreg, $noreg, -1, 0, 0
+    %y:vr = PseudoVMAND_MM_B16 $noreg, %x, 1, 0
+...
+---
 name: vsseN_v
 body: |
   bb.0:
@@ -675,6 +785,56 @@ body: |
     %y:vr = PseudoVLUXEI8_V_MF2_M1 $noreg, $noreg, %x, 1, 4 /* e16 */, 0
 ...
 ---
+name: vluxeiN_v_idx_incompatible_eew
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: vluxeiN_v_idx_incompatible_eew
+    ; CHECK: %x:vr = PseudoVADD_VV_M1 $noreg, $noreg, $noreg, -1, 4 /* e16 */, 0 /* tu, mu */
+    ; CHECK-NEXT: %y:vr = PseudoVLUXEI8_V_M1_M1 $noreg, $noreg, %x, 1, 3 /* e8 */, 0 /* tu, mu */
+    %x:vr = PseudoVADD_VV_M1 $noreg, $noreg, $noreg, -1, 4 /* e16 */, 0
+    %y:vr = PseudoVLUXEI8_V_M1_M1 $noreg, $noreg, %x, 1, 3 /* e8 */, 0
+...
+---
+name: vluxeiN_v_idx_incompatible_emul
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: vluxeiN_v_idx_incompatible_emul
+    ; CHECK: %x:vr = PseudoVADD_VV_M1 $noreg, $noreg, $noreg, -1, 3 /* e8 */, 0 /* tu, mu */
+    ; CHECK-NEXT: %y:vr = PseudoVLUXEI8_V_MF2_MF2 $noreg, $noreg, %x, 1, 3 /* e8 */, 0 /* tu, mu */
+    %x:vr = PseudoVADD_VV_M1 $noreg, $noreg, $noreg, -1, 3 /* e8 */, 0
+    %y:vr = PseudoVLUXEI8_V_MF2_MF2 $noreg, $noreg, %x, 1, 3 /* e8 */, 0
+...
+---
+name: vluxeiN_v_vd
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: vluxeiN_v_vd
+    ; CHECK: %x:vr = PseudoVLUXEI8_V_M1_M1 $noreg, $noreg, $noreg, 1, 3 /* e8 */, 0 /* tu, mu */
+    ; CHECK-NEXT: %y:vr = PseudoVADD_VV_M1 $noreg, %x, $noreg, 1, 3 /* e8 */, 0 /* tu, mu */
+    %x:vr = PseudoVLUXEI8_V_M1_M1 $noreg, $noreg, $noreg, -1, 3 /* e8 */, 0
+    %y:vr = PseudoVADD_VV_M1 $noreg, %x, $noreg, 1, 3 /* e8 */, 0
+...
+---
+name: vluxeiN_v_vd_incompatible_eew
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: vluxeiN_v_vd_incompatible_eew
+    ; CHECK: %x:vr = PseudoVLUXEI8_V_M1_M1 $noreg, $noreg, $noreg, -1, 3 /* e8 */, 0 /* tu, mu */
+    ; CHECK-NEXT: %y:vr = PseudoVADD_VV_M1 $noreg, %x, $noreg, 1, 4 /* e16 */, 0 /* tu, mu */
+    %x:vr = PseudoVLUXEI8_V_M1_M1 $noreg, $noreg, $noreg, -1, 3 /* e8 */, 0
+    %y:vr = PseudoVADD_VV_M1 $noreg, %x, $noreg, 1, 4 /* e16 */, 0
+...
+---
+name: vluxeiN_vd_incompatible_emul
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: vluxeiN_vd_incompatible_emul
+    ; CHECK: %x:vr = PseudoVLUXEI8_V_M1_M1 $noreg, $noreg, $noreg, -1, 3 /* e8 */, 0 /* tu, mu */
+    ; CHECK-NEXT: %y:vr = PseudoVADD_VV_MF2 $noreg, %x, $noreg, 1, 3 /* e8 */, 0 /* tu, mu */
+    %x:vr = PseudoVLUXEI8_V_M1_M1 $noreg, $noreg, $noreg, -1, 3 /* e8 */, 0
+    %y:vr = PseudoVADD_VV_MF2 $noreg, %x, $noreg, 1, 3 /* e8 */, 0
+...
+---
 name: vmop_mm
 body: |
   bb.0:
@@ -1064,3 +1224,116 @@ body: |
     %x:vr = PseudoVMAND_MM_B1 $noreg, $noreg, -1, 0
     %y:vr = PseudoVIOTA_M_MF2 $noreg, %x, 1, 3 /* e8 */, 0
 ...
+name: vred_vs2
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: vred_vs2
+    ; CHECK: %x:vr = PseudoVADD_VV_M1 $noreg, $noreg, $noreg, 1, 3 /* e8 */, 0 /* tu, mu */
+    ; CHECK-NEXT: %y:vr = PseudoVREDAND_VS_M1_E8 $noreg, %x, $noreg, 1, 3 /* e8 */, 0 /* tu, mu */
+    %x:vr = PseudoVADD_VV_M1 $noreg, $noreg, $noreg, -1, 3 /* e8 */, 0
+    %y:vr = PseudoVREDAND_VS_M1_E8 $noreg, %x, $noreg, 1, 3 /* e8 */, 0
+...
+---
+name: vred_vs1
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: vred_vs1
+    ; CHECK: %x:vr = PseudoVADD_VV_M1 $noreg, $noreg, $noreg, 1, 3 /* e8 */, 0 /* tu, mu */
+    ; CHECK-NEXT: %y:vr = PseudoVREDAND_VS_M1_E8 $noreg, $noreg, %x, 1, 3 /* e8 */, 0 /* tu, mu */
+    %x:vr = PseudoVADD_VV_M1 $noreg, $noreg, $noreg, -1, 3 /* e8 */, 0
+    %y:vr = PseudoVREDAND_VS_M1_E8 $noreg, $noreg, %x, 1, 3 /* e8 */, 0
+...
+---
+name: vred_vs1_vs2
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: vred_vs1_vs2
+    ; CHECK: %x:vr = PseudoVADD_VV_M1 $noreg, $noreg, $noreg, 1, 3 /* e8 */, 0 /* tu, mu */
+    ; CHECK-NEXT: %y:vr = PseudoVREDAND_VS_M1_E8 $noreg, %x, %x, 1, 3 /* e8 */, 0 /* tu, mu */
+    %x:vr = PseudoVADD_VV_M1 $noreg, $noreg, $noreg, -1, 3 /* e8 */, 0
+    %y:vr = PseudoVREDAND_VS_M1_E8 $noreg, %x, %x, 1, 3 /* e8 */, 0
+...
+---
+name: vred_vs1_vs2_incompatible_eew
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: vred_vs1_vs2_incompatible_eew
+    ; CHECK: %x:vr = PseudoVADD_VV_M1 $noreg, $noreg, $noreg, -1, 3 /* e8 */, 0 /* tu, mu */
+    ; CHECK-NEXT: %y:vr = PseudoVREDAND_VS_M1_E8 $noreg, %x, %x, 1, 4 /* e16 */, 0 /* tu, mu */
+    %x:vr = PseudoVADD_VV_M1 $noreg, $noreg, $noreg, -1, 3 /* e8 */, 0
+    %y:vr = PseudoVREDAND_VS_M1_E8 $noreg, %x, %x, 1, 4 /* e16 */, 0
+...
+---
+name: vred_vs1_vs2_incompatible_emul
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: vred_vs1_vs2_incompatible_emul
+    ; CHECK: %x:vr = PseudoVADD_VV_M1 $noreg, $noreg, $noreg, -1, 3 /* e8 */, 0 /* tu, mu */
+    ; CHECK-NEXT: %y:vr = PseudoVREDAND_VS_MF2_E8 $noreg, %x, %x, 1, 3 /* e8 */, 0 /* tu, mu */
+    %x:vr = PseudoVADD_VV_M1 $noreg, $noreg, $noreg, -1, 3 /* e8 */, 0
+    %y:vr = PseudoVREDAND_VS_MF2_E8 $noreg, %x, %x, 1, 3 /* e8 */, 0
+...
+---
+name: vred_other_user_is_vl0
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: vred_other_user_is_vl0
+    ; CHECK: %x:vr = PseudoVADD_VV_M1 $noreg, $noreg, $noreg, 1, 3 /* e8 */, 0 /* tu, mu */
+    ; CHECK-NEXT: %y:vr = PseudoVREDSUM_VS_M1_E8 $noreg, $noreg, %x, 1, 3 /* e8 */, 0 /* tu, mu */
+    ; CHECK-NEXT: %z:vr = PseudoVADD_VV_M1 $noreg, %x, $noreg, 0, 3 /* e8 */, 0 /* tu, mu */
+    %x:vr = PseudoVADD_VV_M1 $noreg, $noreg, $noreg, -1, 3 /* e8 */, 0
+    %y:vr = PseudoVREDSUM_VS_M1_E8 $noreg, $noreg, %x, 1, 3 /* e8 */, 0
+    %z:vr = PseudoVADD_VV_M1 $noreg, %x, $noreg, 0, 3 /* e8 */, 0
+...
+---
+name: vred_both_vl0
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: vred_both_vl0
+    ; CHECK: %x:vr = PseudoVADD_VV_M1 $noreg, $noreg, $noreg, 1, 3 /* e8 */, 0 /* tu, mu */
+    ; CHECK-NEXT: %y:vr = PseudoVREDSUM_VS_M1_E8 $noreg, $noreg, %x, 0, 3 /* e8 */, 0 /* tu, mu */
+    ; CHECK-NEXT: %z:vr = PseudoVADD_VV_M1 $noreg, %x, $noreg, 0, 3 /* e8 */, 0 /* tu, mu */
+    %x:vr = PseudoVADD_VV_M1 $noreg, $noreg, $noreg, -1, 3 /* e8 */, 0
+    %y:vr = PseudoVREDSUM_VS_M1_E8 $noreg, $noreg, %x, 0, 3 /* e8 */, 0
+    %z:vr = PseudoVADD_VV_M1 $noreg, %x, $noreg, 0, 3 /* e8 */, 0
+...
+---
+name: vred_vl0_and_vlreg
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: vred_vl0_and_vlreg
+    ; CHECK: %vl:gprnox0 = COPY $x1
+    ; CHECK-NEXT: %x:vr = PseudoVADD_VV_M1 $noreg, $noreg, $noreg, 1, 3 /* e8 */, 0 /* tu, mu */
+    ; CHECK-NEXT: %y:vr = PseudoVREDSUM_VS_M1_E8 $noreg, $noreg, %x, %vl, 3 /* e8 */, 0 /* tu, mu */
+    ; CHECK-NEXT: %z:vr = PseudoVADD_VV_M1 $noreg, %x, $noreg, 0, 3 /* e8 */, 0 /* tu, mu */
+    %vl:gprnox0 = COPY $x1
+    %x:vr = PseudoVADD_VV_M1 $noreg, $noreg, $noreg, -1, 3 /* e8 */, 0
+    %y:vr = PseudoVREDSUM_VS_M1_E8 $noreg, $noreg, %x, %vl, 3 /* e8 */, 0
+    %z:vr = PseudoVADD_VV_M1 $noreg, %x, $noreg, 0, 3 /* e8 */, 0
+...
+---
+name: vred_vlreg_and_vl0
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: vred_vlreg_and_vl0
+    ; CHECK: %vl:gprnox0 = COPY $x1
+    ; CHECK-NEXT: %x:vr = PseudoVADD_VV_M1 $noreg, $noreg, $noreg, -1, 3 /* e8 */, 0 /* tu, mu */
+    ; CHECK-NEXT: %y:vr = PseudoVREDSUM_VS_M1_E8 $noreg, $noreg, %x, 0, 3 /* e8 */, 0 /* tu, mu */
+    ; CHECK-NEXT: %z:vr = PseudoVADD_VV_M1 $noreg, %x, $noreg, %vl, 3 /* e8 */, 0 /* tu, mu */
+    %vl:gprnox0 = COPY $x1
+    %x:vr = PseudoVADD_VV_M1 $noreg, $noreg, $noreg, -1, 3 /* e8 */, 0
+    %y:vr = PseudoVREDSUM_VS_M1_E8 $noreg, $noreg, %x, 0, 3 /* e8 */, 0
+    %z:vr = PseudoVADD_VV_M1 $noreg, %x, $noreg, %vl, 3 /* e8 */, 0
+...
+---
+name: vred_other_user_is_vl2
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: vred_other_user_is_vl2
+    ; CHECK: %x:vr = PseudoVADD_VV_M1 $noreg, $noreg, $noreg, 2, 3 /* e8 */, 0 /* tu, mu */
+    ; CHECK-NEXT: %y:vr = PseudoVREDSUM_VS_M1_E8 $noreg, $noreg, %x, 1, 3 /* e8 */, 0 /* tu, mu */
+    ; CHECK-NEXT: %z:vr = PseudoVADD_VV_M1 $noreg, %x, $noreg, 2, 3 /* e8 */, 0 /* tu, mu */
+    %x:vr = PseudoVADD_VV_M1 $noreg, $noreg, $noreg, -1, 3 /* e8 */, 0
+    %y:vr = PseudoVREDSUM_VS_M1_E8 $noreg, $noreg, %x, 1, 3 /* e8 */, 0
+    %z:vr = PseudoVADD_VV_M1 $noreg, %x, $noreg, 2, 3 /* e8 */, 0
+...
diff --git a/llvm/test/CodeGen/RISCV/rvv/vl-opt.mir b/llvm/test/CodeGen/RISCV/rvv/vl-opt.mir
index 3f966b0..0a366f4 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vl-opt.mir
+++ b/llvm/test/CodeGen/RISCV/rvv/vl-opt.mir
@@ -110,4 +110,24 @@ body: |
     %y:vr = PseudoVADD_VV_M1 $noreg, %x, $noreg, 1, 3 /* e8 */, 0
     %z:vr = PseudoVADD_VV_M1 $noreg, %x, $noreg, -1, 3 /* e8 */, 0
 ...
+---
+name: vfcvt_x_f_v_nofpexcept
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: vfcvt_x_f_v_nofpexcept
+    ; CHECK: %x:vr = nofpexcept PseudoVFCVT_X_F_V_M1 $noreg, $noreg, 0, 1, 3 /* e8 */, 0 /* tu, mu */
+    ; CHECK-NEXT: %y:vr = PseudoVADD_VV_M1 $noreg, %x, $noreg, 1, 3 /* e8 */, 0 /* tu, mu */
+    %x:vr = nofpexcept PseudoVFCVT_X_F_V_M1 $noreg, $noreg, 0, -1, 3 /* e32 */, 0
+    %y:vr = PseudoVADD_VV_M1 $noreg, %x, $noreg, 1, 3 /* e8 */, 0
+...
+---
+name: vfcvt_x_f_v_fpexcept
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: vfcvt_x_f_v_fpexcept
+    ; CHECK: %x:vr = PseudoVFCVT_X_F_V_M1 $noreg, $noreg, 0, -1, 3 /* e8 */, 0 /* tu, mu */
+    ; CHECK-NEXT: %y:vr = PseudoVADD_VV_M1 $noreg, %x, $noreg, 1, 3 /* e8 */, 0 /* tu, mu */
+    %x:vr = PseudoVFCVT_X_F_V_M1 $noreg, $noreg, 0, -1, 3 /* e32 */, 0
+    %y:vr = PseudoVADD_VV_M1 $noreg, %x, $noreg, 1, 3 /* e8 */, 0
+...
 
diff --git a/llvm/test/CodeGen/RISCV/rvv/vlopt-volatile-ld.mir b/llvm/test/CodeGen/RISCV/rvv/vlopt-volatile-ld.mir
new file mode 100644
index 0000000..e8f7957
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/vlopt-volatile-ld.mir
@@ -0,0 +1,13 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+# RUN: llc %s -o - -mtriple=riscv64 -mattr=+v -run-pass=riscv-vl-optimizer -verify-machineinstrs | FileCheck %s
+
+---
+name: vleN_v_volatile
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: vleN_v
+    ; CHECK: %x:vr = PseudoVLE8_V_M1 $noreg, $noreg, -1, 3 /* e8 */, 0 /* tu, mu */ :: (volatile load (<vscale x 1 x s64>))
+    ; CHECK-NEXT: %y:vr = PseudoVADD_VV_M1 $noreg, %x, $noreg, 1, 3 /* e8 */, 0 /* tu, mu */
+    %x:vr = PseudoVLE8_V_M1 $noreg, $noreg, -1, 3 /* e8 */, 0 :: (volatile load (<vscale x 1 x s64>))
+    %y:vr = PseudoVADD_VV_M1 $noreg, %x, $noreg, 1, 3 /* e8 */, 0
+...
diff --git a/llvm/test/CodeGen/RISCV/rvv/vmacc-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vmacc-vp.ll
index 333117c..c334e70 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vmacc-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vmacc-vp.ll
@@ -1654,9 +1654,9 @@ define <vscale x 1 x i64> @vmacc_vx_nxv1i64(<vscale x 1 x i64> %a, i64 %b, <vsca
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
+; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
 ; RV32-NEXT:    vlse64.v v10, (a0), zero
-; RV32-NEXT:    vsetvli zero, a2, e64, m1, tu, mu
+; RV32-NEXT:    vsetvli zero, zero, e64, m1, tu, mu
 ; RV32-NEXT:    vmacc.vv v9, v8, v10, v0.t
 ; RV32-NEXT:    vmv1r.v v8, v9
 ; RV32-NEXT:    addi sp, sp, 16
@@ -1685,9 +1685,9 @@ define <vscale x 1 x i64> @vmacc_vx_nxv1i64_unmasked(<vscale x 1 x i64> %a, i64
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
+; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
 ; RV32-NEXT:    vlse64.v v10, (a0), zero
-; RV32-NEXT:    vsetvli zero, a2, e64, m1, tu, ma
+; RV32-NEXT:    vsetvli zero, zero, e64, m1, tu, ma
 ; RV32-NEXT:    vmacc.vv v9, v8, v10
 ; RV32-NEXT:    vmv1r.v v8, v9
 ; RV32-NEXT:    addi sp, sp, 16
@@ -1729,9 +1729,8 @@ define <vscale x 1 x i64> @vmacc_vx_nxv1i64_ta(<vscale x 1 x i64> %a, i64 %b, <v
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
-; RV32-NEXT:    vlse64.v v10, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, mu
+; RV32-NEXT:    vlse64.v v10, (a0), zero
 ; RV32-NEXT:    vmacc.vv v9, v8, v10, v0.t
 ; RV32-NEXT:    vmv.v.v v8, v9
 ; RV32-NEXT:    addi sp, sp, 16
@@ -1791,9 +1790,9 @@ define <vscale x 2 x i64> @vmacc_vx_nxv2i64(<vscale x 2 x i64> %a, i64 %b, <vsca
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
+; RV32-NEXT:    vsetvli zero, a2, e64, m2, ta, ma
 ; RV32-NEXT:    vlse64.v v12, (a0), zero
-; RV32-NEXT:    vsetvli zero, a2, e64, m2, tu, mu
+; RV32-NEXT:    vsetvli zero, zero, e64, m2, tu, mu
 ; RV32-NEXT:    vmacc.vv v10, v8, v12, v0.t
 ; RV32-NEXT:    vmv2r.v v8, v10
 ; RV32-NEXT:    addi sp, sp, 16
@@ -1822,9 +1821,9 @@ define <vscale x 2 x i64> @vmacc_vx_nxv2i64_unmasked(<vscale x 2 x i64> %a, i64
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
+; RV32-NEXT:    vsetvli zero, a2, e64, m2, ta, ma
 ; RV32-NEXT:    vlse64.v v12, (a0), zero
-; RV32-NEXT:    vsetvli zero, a2, e64, m2, tu, ma
+; RV32-NEXT:    vsetvli zero, zero, e64, m2, tu, ma
 ; RV32-NEXT:    vmacc.vv v10, v8, v12
 ; RV32-NEXT:    vmv2r.v v8, v10
 ; RV32-NEXT:    addi sp, sp, 16
@@ -1866,9 +1865,8 @@ define <vscale x 2 x i64> @vmacc_vx_nxv2i64_ta(<vscale x 2 x i64> %a, i64 %b, <v
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
-; RV32-NEXT:    vlse64.v v12, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m2, ta, mu
+; RV32-NEXT:    vlse64.v v12, (a0), zero
 ; RV32-NEXT:    vmacc.vv v10, v8, v12, v0.t
 ; RV32-NEXT:    vmv.v.v v8, v10
 ; RV32-NEXT:    addi sp, sp, 16
@@ -1928,9 +1926,9 @@ define <vscale x 4 x i64> @vmacc_vx_nxv4i64(<vscale x 4 x i64> %a, i64 %b, <vsca
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
+; RV32-NEXT:    vsetvli zero, a2, e64, m4, ta, ma
 ; RV32-NEXT:    vlse64.v v16, (a0), zero
-; RV32-NEXT:    vsetvli zero, a2, e64, m4, tu, mu
+; RV32-NEXT:    vsetvli zero, zero, e64, m4, tu, mu
 ; RV32-NEXT:    vmacc.vv v12, v8, v16, v0.t
 ; RV32-NEXT:    vmv4r.v v8, v12
 ; RV32-NEXT:    addi sp, sp, 16
@@ -1959,9 +1957,9 @@ define <vscale x 4 x i64> @vmacc_vx_nxv4i64_unmasked(<vscale x 4 x i64> %a, i64
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
+; RV32-NEXT:    vsetvli zero, a2, e64, m4, ta, ma
 ; RV32-NEXT:    vlse64.v v16, (a0), zero
-; RV32-NEXT:    vsetvli zero, a2, e64, m4, tu, ma
+; RV32-NEXT:    vsetvli zero, zero, e64, m4, tu, ma
 ; RV32-NEXT:    vmacc.vv v12, v8, v16
 ; RV32-NEXT:    vmv4r.v v8, v12
 ; RV32-NEXT:    addi sp, sp, 16
@@ -2003,9 +2001,8 @@ define <vscale x 4 x i64> @vmacc_vx_nxv4i64_ta(<vscale x 4 x i64> %a, i64 %b, <v
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m4, ta, mu
+; RV32-NEXT:    vlse64.v v16, (a0), zero
 ; RV32-NEXT:    vmacc.vv v12, v8, v16, v0.t
 ; RV32-NEXT:    vmv.v.v v8, v12
 ; RV32-NEXT:    addi sp, sp, 16
@@ -2067,9 +2064,9 @@ define <vscale x 8 x i64> @vmacc_vx_nxv8i64(<vscale x 8 x i64> %a, i64 %b, <vsca
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
+; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
 ; RV32-NEXT:    vlse64.v v24, (a0), zero
-; RV32-NEXT:    vsetvli zero, a2, e64, m8, tu, mu
+; RV32-NEXT:    vsetvli zero, zero, e64, m8, tu, mu
 ; RV32-NEXT:    vmacc.vv v16, v8, v24, v0.t
 ; RV32-NEXT:    vmv8r.v v8, v16
 ; RV32-NEXT:    addi sp, sp, 16
@@ -2098,9 +2095,9 @@ define <vscale x 8 x i64> @vmacc_vx_nxv8i64_unmasked(<vscale x 8 x i64> %a, i64
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
+; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
 ; RV32-NEXT:    vlse64.v v24, (a0), zero
-; RV32-NEXT:    vsetvli zero, a2, e64, m8, tu, ma
+; RV32-NEXT:    vsetvli zero, zero, e64, m8, tu, ma
 ; RV32-NEXT:    vmacc.vv v16, v8, v24
 ; RV32-NEXT:    vmv8r.v v8, v16
 ; RV32-NEXT:    addi sp, sp, 16
@@ -2143,9 +2140,8 @@ define <vscale x 8 x i64> @vmacc_vx_nxv8i64_ta(<vscale x 8 x i64> %a, i64 %b, <v
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v24, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, mu
+; RV32-NEXT:    vlse64.v v24, (a0), zero
 ; RV32-NEXT:    vmacc.vv v16, v8, v24, v0.t
 ; RV32-NEXT:    vmv.v.v v8, v16
 ; RV32-NEXT:    addi sp, sp, 16
diff --git a/llvm/test/CodeGen/RISCV/rvv/vmax-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vmax-vp.ll
index 7818e99..3df0763 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vmax-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vmax-vp.ll
@@ -1124,9 +1124,8 @@ define <vscale x 1 x i64> @vmax_vx_nxv1i64(<vscale x 1 x i64> %va, i64 %b, <vsca
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
-; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vmax.vv v8, v8, v9, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1151,9 +1150,8 @@ define <vscale x 1 x i64> @vmax_vx_nxv1i64_unmasked(<vscale x 1 x i64> %va, i64
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
-; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vmax.vv v8, v8, v9
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1200,9 +1198,8 @@ define <vscale x 2 x i64> @vmax_vx_nxv2i64(<vscale x 2 x i64> %va, i64 %b, <vsca
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
-; RV32-NEXT:    vlse64.v v10, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m2, ta, ma
+; RV32-NEXT:    vlse64.v v10, (a0), zero
 ; RV32-NEXT:    vmax.vv v8, v8, v10, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1227,9 +1224,8 @@ define <vscale x 2 x i64> @vmax_vx_nxv2i64_unmasked(<vscale x 2 x i64> %va, i64
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
-; RV32-NEXT:    vlse64.v v10, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m2, ta, ma
+; RV32-NEXT:    vlse64.v v10, (a0), zero
 ; RV32-NEXT:    vmax.vv v8, v8, v10
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1276,9 +1272,8 @@ define <vscale x 4 x i64> @vmax_vx_nxv4i64(<vscale x 4 x i64> %va, i64 %b, <vsca
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
-; RV32-NEXT:    vlse64.v v12, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m4, ta, ma
+; RV32-NEXT:    vlse64.v v12, (a0), zero
 ; RV32-NEXT:    vmax.vv v8, v8, v12, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1303,9 +1298,8 @@ define <vscale x 4 x i64> @vmax_vx_nxv4i64_unmasked(<vscale x 4 x i64> %va, i64
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
-; RV32-NEXT:    vlse64.v v12, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m4, ta, ma
+; RV32-NEXT:    vlse64.v v12, (a0), zero
 ; RV32-NEXT:    vmax.vv v8, v8, v12
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1352,9 +1346,8 @@ define <vscale x 8 x i64> @vmax_vx_nxv8i64(<vscale x 8 x i64> %va, i64 %b, <vsca
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a0), zero
 ; RV32-NEXT:    vmax.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1379,9 +1372,8 @@ define <vscale x 8 x i64> @vmax_vx_nxv8i64_unmasked(<vscale x 8 x i64> %va, i64
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a0), zero
 ; RV32-NEXT:    vmax.vv v8, v8, v16
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
diff --git a/llvm/test/CodeGen/RISCV/rvv/vmaxu-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vmaxu-vp.ll
index 674b0b8..8147d46 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vmaxu-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vmaxu-vp.ll
@@ -1123,9 +1123,8 @@ define <vscale x 1 x i64> @vmaxu_vx_nxv1i64(<vscale x 1 x i64> %va, i64 %b, <vsc
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
-; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vmaxu.vv v8, v8, v9, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1150,9 +1149,8 @@ define <vscale x 1 x i64> @vmaxu_vx_nxv1i64_unmasked(<vscale x 1 x i64> %va, i64
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
-; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vmaxu.vv v8, v8, v9
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1199,9 +1197,8 @@ define <vscale x 2 x i64> @vmaxu_vx_nxv2i64(<vscale x 2 x i64> %va, i64 %b, <vsc
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
-; RV32-NEXT:    vlse64.v v10, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m2, ta, ma
+; RV32-NEXT:    vlse64.v v10, (a0), zero
 ; RV32-NEXT:    vmaxu.vv v8, v8, v10, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1226,9 +1223,8 @@ define <vscale x 2 x i64> @vmaxu_vx_nxv2i64_unmasked(<vscale x 2 x i64> %va, i64
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
-; RV32-NEXT:    vlse64.v v10, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m2, ta, ma
+; RV32-NEXT:    vlse64.v v10, (a0), zero
 ; RV32-NEXT:    vmaxu.vv v8, v8, v10
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1275,9 +1271,8 @@ define <vscale x 4 x i64> @vmaxu_vx_nxv4i64(<vscale x 4 x i64> %va, i64 %b, <vsc
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
-; RV32-NEXT:    vlse64.v v12, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m4, ta, ma
+; RV32-NEXT:    vlse64.v v12, (a0), zero
 ; RV32-NEXT:    vmaxu.vv v8, v8, v12, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1302,9 +1297,8 @@ define <vscale x 4 x i64> @vmaxu_vx_nxv4i64_unmasked(<vscale x 4 x i64> %va, i64
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
-; RV32-NEXT:    vlse64.v v12, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m4, ta, ma
+; RV32-NEXT:    vlse64.v v12, (a0), zero
 ; RV32-NEXT:    vmaxu.vv v8, v8, v12
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1351,9 +1345,8 @@ define <vscale x 8 x i64> @vmaxu_vx_nxv8i64(<vscale x 8 x i64> %va, i64 %b, <vsc
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a0), zero
 ; RV32-NEXT:    vmaxu.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1378,9 +1371,8 @@ define <vscale x 8 x i64> @vmaxu_vx_nxv8i64_unmasked(<vscale x 8 x i64> %va, i64
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a0), zero
 ; RV32-NEXT:    vmaxu.vv v8, v8, v16
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
diff --git a/llvm/test/CodeGen/RISCV/rvv/vmin-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vmin-vp.ll
index 79631cd..614bd4cb 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vmin-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vmin-vp.ll
@@ -1124,9 +1124,8 @@ define <vscale x 1 x i64> @vmin_vx_nxv1i64(<vscale x 1 x i64> %va, i64 %b, <vsca
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
-; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vmin.vv v8, v8, v9, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1151,9 +1150,8 @@ define <vscale x 1 x i64> @vmin_vx_nxv1i64_unmasked(<vscale x 1 x i64> %va, i64
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
-; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vmin.vv v8, v8, v9
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1200,9 +1198,8 @@ define <vscale x 2 x i64> @vmin_vx_nxv2i64(<vscale x 2 x i64> %va, i64 %b, <vsca
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
-; RV32-NEXT:    vlse64.v v10, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m2, ta, ma
+; RV32-NEXT:    vlse64.v v10, (a0), zero
 ; RV32-NEXT:    vmin.vv v8, v8, v10, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1227,9 +1224,8 @@ define <vscale x 2 x i64> @vmin_vx_nxv2i64_unmasked(<vscale x 2 x i64> %va, i64
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
-; RV32-NEXT:    vlse64.v v10, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m2, ta, ma
+; RV32-NEXT:    vlse64.v v10, (a0), zero
 ; RV32-NEXT:    vmin.vv v8, v8, v10
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1276,9 +1272,8 @@ define <vscale x 4 x i64> @vmin_vx_nxv4i64(<vscale x 4 x i64> %va, i64 %b, <vsca
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
-; RV32-NEXT:    vlse64.v v12, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m4, ta, ma
+; RV32-NEXT:    vlse64.v v12, (a0), zero
 ; RV32-NEXT:    vmin.vv v8, v8, v12, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1303,9 +1298,8 @@ define <vscale x 4 x i64> @vmin_vx_nxv4i64_unmasked(<vscale x 4 x i64> %va, i64
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
-; RV32-NEXT:    vlse64.v v12, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m4, ta, ma
+; RV32-NEXT:    vlse64.v v12, (a0), zero
 ; RV32-NEXT:    vmin.vv v8, v8, v12
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1352,9 +1346,8 @@ define <vscale x 8 x i64> @vmin_vx_nxv8i64(<vscale x 8 x i64> %va, i64 %b, <vsca
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a0), zero
 ; RV32-NEXT:    vmin.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1379,9 +1372,8 @@ define <vscale x 8 x i64> @vmin_vx_nxv8i64_unmasked(<vscale x 8 x i64> %va, i64
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a0), zero
 ; RV32-NEXT:    vmin.vv v8, v8, v16
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
diff --git a/llvm/test/CodeGen/RISCV/rvv/vminu-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vminu-vp.ll
index bc93b62..21160553a 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vminu-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vminu-vp.ll
@@ -1123,9 +1123,8 @@ define <vscale x 1 x i64> @vminu_vx_nxv1i64(<vscale x 1 x i64> %va, i64 %b, <vsc
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
-; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vminu.vv v8, v8, v9, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1150,9 +1149,8 @@ define <vscale x 1 x i64> @vminu_vx_nxv1i64_unmasked(<vscale x 1 x i64> %va, i64
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
-; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vminu.vv v8, v8, v9
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1199,9 +1197,8 @@ define <vscale x 2 x i64> @vminu_vx_nxv2i64(<vscale x 2 x i64> %va, i64 %b, <vsc
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
-; RV32-NEXT:    vlse64.v v10, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m2, ta, ma
+; RV32-NEXT:    vlse64.v v10, (a0), zero
 ; RV32-NEXT:    vminu.vv v8, v8, v10, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1226,9 +1223,8 @@ define <vscale x 2 x i64> @vminu_vx_nxv2i64_unmasked(<vscale x 2 x i64> %va, i64
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
-; RV32-NEXT:    vlse64.v v10, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m2, ta, ma
+; RV32-NEXT:    vlse64.v v10, (a0), zero
 ; RV32-NEXT:    vminu.vv v8, v8, v10
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1275,9 +1271,8 @@ define <vscale x 4 x i64> @vminu_vx_nxv4i64(<vscale x 4 x i64> %va, i64 %b, <vsc
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
-; RV32-NEXT:    vlse64.v v12, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m4, ta, ma
+; RV32-NEXT:    vlse64.v v12, (a0), zero
 ; RV32-NEXT:    vminu.vv v8, v8, v12, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1302,9 +1297,8 @@ define <vscale x 4 x i64> @vminu_vx_nxv4i64_unmasked(<vscale x 4 x i64> %va, i64
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
-; RV32-NEXT:    vlse64.v v12, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m4, ta, ma
+; RV32-NEXT:    vlse64.v v12, (a0), zero
 ; RV32-NEXT:    vminu.vv v8, v8, v12
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1351,9 +1345,8 @@ define <vscale x 8 x i64> @vminu_vx_nxv8i64(<vscale x 8 x i64> %va, i64 %b, <vsc
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a0), zero
 ; RV32-NEXT:    vminu.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1378,9 +1371,8 @@ define <vscale x 8 x i64> @vminu_vx_nxv8i64_unmasked(<vscale x 8 x i64> %va, i64
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a0), zero
 ; RV32-NEXT:    vminu.vv v8, v8, v16
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
diff --git a/llvm/test/CodeGen/RISCV/rvv/vmul-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vmul-vp.ll
index b63098b..f0907e4 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vmul-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vmul-vp.ll
@@ -934,9 +934,8 @@ define <vscale x 1 x i64> @vmul_vx_nxv1i64(<vscale x 1 x i64> %va, i64 %b, <vsca
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
-; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vmul.vv v8, v8, v9, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -961,9 +960,8 @@ define <vscale x 1 x i64> @vmul_vx_nxv1i64_unmasked(<vscale x 1 x i64> %va, i64
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
-; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vmul.vv v8, v8, v9
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1010,9 +1008,8 @@ define <vscale x 2 x i64> @vmul_vx_nxv2i64(<vscale x 2 x i64> %va, i64 %b, <vsca
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
-; RV32-NEXT:    vlse64.v v10, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m2, ta, ma
+; RV32-NEXT:    vlse64.v v10, (a0), zero
 ; RV32-NEXT:    vmul.vv v8, v8, v10, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1037,9 +1034,8 @@ define <vscale x 2 x i64> @vmul_vx_nxv2i64_unmasked(<vscale x 2 x i64> %va, i64
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
-; RV32-NEXT:    vlse64.v v10, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m2, ta, ma
+; RV32-NEXT:    vlse64.v v10, (a0), zero
 ; RV32-NEXT:    vmul.vv v8, v8, v10
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1086,9 +1082,8 @@ define <vscale x 4 x i64> @vmul_vx_nxv4i64(<vscale x 4 x i64> %va, i64 %b, <vsca
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
-; RV32-NEXT:    vlse64.v v12, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m4, ta, ma
+; RV32-NEXT:    vlse64.v v12, (a0), zero
 ; RV32-NEXT:    vmul.vv v8, v8, v12, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1113,9 +1108,8 @@ define <vscale x 4 x i64> @vmul_vx_nxv4i64_unmasked(<vscale x 4 x i64> %va, i64
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
-; RV32-NEXT:    vlse64.v v12, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m4, ta, ma
+; RV32-NEXT:    vlse64.v v12, (a0), zero
 ; RV32-NEXT:    vmul.vv v8, v8, v12
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1162,9 +1156,8 @@ define <vscale x 8 x i64> @vmul_vx_nxv8i64(<vscale x 8 x i64> %va, i64 %b, <vsca
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a0), zero
 ; RV32-NEXT:    vmul.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1189,9 +1182,8 @@ define <vscale x 8 x i64> @vmul_vx_nxv8i64_unmasked(<vscale x 8 x i64> %va, i64
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a0), zero
 ; RV32-NEXT:    vmul.vv v8, v8, v16
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
diff --git a/llvm/test/CodeGen/RISCV/rvv/vnmsac-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vnmsac-vp.ll
index 2e0daa6..3484d28 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vnmsac-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vnmsac-vp.ll
@@ -1654,9 +1654,9 @@ define <vscale x 1 x i64> @vnmsac_vx_nxv1i64(<vscale x 1 x i64> %a, i64 %b, <vsc
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
+; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
 ; RV32-NEXT:    vlse64.v v10, (a0), zero
-; RV32-NEXT:    vsetvli zero, a2, e64, m1, tu, mu
+; RV32-NEXT:    vsetvli zero, zero, e64, m1, tu, mu
 ; RV32-NEXT:    vnmsac.vv v9, v8, v10, v0.t
 ; RV32-NEXT:    vmv1r.v v8, v9
 ; RV32-NEXT:    addi sp, sp, 16
@@ -1685,9 +1685,9 @@ define <vscale x 1 x i64> @vnmsac_vx_nxv1i64_unmasked(<vscale x 1 x i64> %a, i64
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
+; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
 ; RV32-NEXT:    vlse64.v v10, (a0), zero
-; RV32-NEXT:    vsetvli zero, a2, e64, m1, tu, ma
+; RV32-NEXT:    vsetvli zero, zero, e64, m1, tu, ma
 ; RV32-NEXT:    vnmsac.vv v9, v8, v10
 ; RV32-NEXT:    vmv1r.v v8, v9
 ; RV32-NEXT:    addi sp, sp, 16
@@ -1729,9 +1729,8 @@ define <vscale x 1 x i64> @vnmsac_vx_nxv1i64_ta(<vscale x 1 x i64> %a, i64 %b, <
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
-; RV32-NEXT:    vlse64.v v10, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, mu
+; RV32-NEXT:    vlse64.v v10, (a0), zero
 ; RV32-NEXT:    vnmsac.vv v9, v8, v10, v0.t
 ; RV32-NEXT:    vmv.v.v v8, v9
 ; RV32-NEXT:    addi sp, sp, 16
@@ -1791,9 +1790,9 @@ define <vscale x 2 x i64> @vnmsac_vx_nxv2i64(<vscale x 2 x i64> %a, i64 %b, <vsc
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
+; RV32-NEXT:    vsetvli zero, a2, e64, m2, ta, ma
 ; RV32-NEXT:    vlse64.v v12, (a0), zero
-; RV32-NEXT:    vsetvli zero, a2, e64, m2, tu, mu
+; RV32-NEXT:    vsetvli zero, zero, e64, m2, tu, mu
 ; RV32-NEXT:    vnmsac.vv v10, v8, v12, v0.t
 ; RV32-NEXT:    vmv2r.v v8, v10
 ; RV32-NEXT:    addi sp, sp, 16
@@ -1822,9 +1821,9 @@ define <vscale x 2 x i64> @vnmsac_vx_nxv2i64_unmasked(<vscale x 2 x i64> %a, i64
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
+; RV32-NEXT:    vsetvli zero, a2, e64, m2, ta, ma
 ; RV32-NEXT:    vlse64.v v12, (a0), zero
-; RV32-NEXT:    vsetvli zero, a2, e64, m2, tu, ma
+; RV32-NEXT:    vsetvli zero, zero, e64, m2, tu, ma
 ; RV32-NEXT:    vnmsac.vv v10, v8, v12
 ; RV32-NEXT:    vmv2r.v v8, v10
 ; RV32-NEXT:    addi sp, sp, 16
@@ -1866,9 +1865,8 @@ define <vscale x 2 x i64> @vnmsac_vx_nxv2i64_ta(<vscale x 2 x i64> %a, i64 %b, <
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
-; RV32-NEXT:    vlse64.v v12, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m2, ta, mu
+; RV32-NEXT:    vlse64.v v12, (a0), zero
 ; RV32-NEXT:    vnmsac.vv v10, v8, v12, v0.t
 ; RV32-NEXT:    vmv.v.v v8, v10
 ; RV32-NEXT:    addi sp, sp, 16
@@ -1928,9 +1926,9 @@ define <vscale x 4 x i64> @vnmsac_vx_nxv4i64(<vscale x 4 x i64> %a, i64 %b, <vsc
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
+; RV32-NEXT:    vsetvli zero, a2, e64, m4, ta, ma
 ; RV32-NEXT:    vlse64.v v16, (a0), zero
-; RV32-NEXT:    vsetvli zero, a2, e64, m4, tu, mu
+; RV32-NEXT:    vsetvli zero, zero, e64, m4, tu, mu
 ; RV32-NEXT:    vnmsac.vv v12, v8, v16, v0.t
 ; RV32-NEXT:    vmv4r.v v8, v12
 ; RV32-NEXT:    addi sp, sp, 16
@@ -1959,9 +1957,9 @@ define <vscale x 4 x i64> @vnmsac_vx_nxv4i64_unmasked(<vscale x 4 x i64> %a, i64
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
+; RV32-NEXT:    vsetvli zero, a2, e64, m4, ta, ma
 ; RV32-NEXT:    vlse64.v v16, (a0), zero
-; RV32-NEXT:    vsetvli zero, a2, e64, m4, tu, ma
+; RV32-NEXT:    vsetvli zero, zero, e64, m4, tu, ma
 ; RV32-NEXT:    vnmsac.vv v12, v8, v16
 ; RV32-NEXT:    vmv4r.v v8, v12
 ; RV32-NEXT:    addi sp, sp, 16
@@ -2003,9 +2001,8 @@ define <vscale x 4 x i64> @vnmsac_vx_nxv4i64_ta(<vscale x 4 x i64> %a, i64 %b, <
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m4, ta, mu
+; RV32-NEXT:    vlse64.v v16, (a0), zero
 ; RV32-NEXT:    vnmsac.vv v12, v8, v16, v0.t
 ; RV32-NEXT:    vmv.v.v v8, v12
 ; RV32-NEXT:    addi sp, sp, 16
@@ -2067,9 +2064,9 @@ define <vscale x 8 x i64> @vnmsac_vx_nxv8i64(<vscale x 8 x i64> %a, i64 %b, <vsc
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
+; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
 ; RV32-NEXT:    vlse64.v v24, (a0), zero
-; RV32-NEXT:    vsetvli zero, a2, e64, m8, tu, mu
+; RV32-NEXT:    vsetvli zero, zero, e64, m8, tu, mu
 ; RV32-NEXT:    vnmsac.vv v16, v8, v24, v0.t
 ; RV32-NEXT:    vmv8r.v v8, v16
 ; RV32-NEXT:    addi sp, sp, 16
@@ -2098,9 +2095,9 @@ define <vscale x 8 x i64> @vnmsac_vx_nxv8i64_unmasked(<vscale x 8 x i64> %a, i64
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
+; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
 ; RV32-NEXT:    vlse64.v v24, (a0), zero
-; RV32-NEXT:    vsetvli zero, a2, e64, m8, tu, ma
+; RV32-NEXT:    vsetvli zero, zero, e64, m8, tu, ma
 ; RV32-NEXT:    vnmsac.vv v16, v8, v24
 ; RV32-NEXT:    vmv8r.v v8, v16
 ; RV32-NEXT:    addi sp, sp, 16
@@ -2143,9 +2140,8 @@ define <vscale x 8 x i64> @vnmsac_vx_nxv8i64_ta(<vscale x 8 x i64> %a, i64 %b, <
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v24, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, mu
+; RV32-NEXT:    vlse64.v v24, (a0), zero
 ; RV32-NEXT:    vnmsac.vv v16, v8, v24, v0.t
 ; RV32-NEXT:    vmv.v.v v8, v16
 ; RV32-NEXT:    addi sp, sp, 16
diff --git a/llvm/test/CodeGen/RISCV/rvv/vor-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vor-vp.ll
index ef281c5..e864d71 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vor-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vor-vp.ll
@@ -1326,9 +1326,8 @@ define <vscale x 1 x i64> @vor_vx_nxv1i64(<vscale x 1 x i64> %va, i64 %b, <vscal
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
-; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vor.vv v8, v8, v9, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1353,9 +1352,8 @@ define <vscale x 1 x i64> @vor_vx_nxv1i64_unmasked(<vscale x 1 x i64> %va, i64 %
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
-; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vor.vv v8, v8, v9
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1422,9 +1420,8 @@ define <vscale x 2 x i64> @vor_vx_nxv2i64(<vscale x 2 x i64> %va, i64 %b, <vscal
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
-; RV32-NEXT:    vlse64.v v10, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m2, ta, ma
+; RV32-NEXT:    vlse64.v v10, (a0), zero
 ; RV32-NEXT:    vor.vv v8, v8, v10, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1449,9 +1446,8 @@ define <vscale x 2 x i64> @vor_vx_nxv2i64_unmasked(<vscale x 2 x i64> %va, i64 %
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
-; RV32-NEXT:    vlse64.v v10, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m2, ta, ma
+; RV32-NEXT:    vlse64.v v10, (a0), zero
 ; RV32-NEXT:    vor.vv v8, v8, v10
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1518,9 +1514,8 @@ define <vscale x 4 x i64> @vor_vx_nxv4i64(<vscale x 4 x i64> %va, i64 %b, <vscal
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
-; RV32-NEXT:    vlse64.v v12, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m4, ta, ma
+; RV32-NEXT:    vlse64.v v12, (a0), zero
 ; RV32-NEXT:    vor.vv v8, v8, v12, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1545,9 +1540,8 @@ define <vscale x 4 x i64> @vor_vx_nxv4i64_unmasked(<vscale x 4 x i64> %va, i64 %
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
-; RV32-NEXT:    vlse64.v v12, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m4, ta, ma
+; RV32-NEXT:    vlse64.v v12, (a0), zero
 ; RV32-NEXT:    vor.vv v8, v8, v12
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1614,9 +1608,8 @@ define <vscale x 8 x i64> @vor_vx_nxv8i64(<vscale x 8 x i64> %va, i64 %b, <vscal
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a0), zero
 ; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1641,9 +1634,8 @@ define <vscale x 8 x i64> @vor_vx_nxv8i64_unmasked(<vscale x 8 x i64> %va, i64 %
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a0), zero
 ; RV32-NEXT:    vor.vv v8, v8, v16
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
diff --git a/llvm/test/CodeGen/RISCV/rvv/vreductions-mask.ll b/llvm/test/CodeGen/RISCV/rvv/vreductions-mask.ll
index d99fd03..ce9d6c5 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vreductions-mask.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vreductions-mask.ll
@@ -785,8 +785,7 @@ define zeroext i1 @vreduce_and_nxv128i1(<vscale x 128 x i1> %v) {
 ; CHECK-LABEL: vreduce_and_nxv128i1:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e8, m8, ta, ma
-; CHECK-NEXT:    vmand.mm v8, v0, v8
-; CHECK-NEXT:    vmnot.m v8, v8
+; CHECK-NEXT:    vmnand.mm v8, v0, v8
 ; CHECK-NEXT:    vcpop.m a0, v8
 ; CHECK-NEXT:    seqz a0, a0
 ; CHECK-NEXT:    ret
@@ -814,8 +813,7 @@ define zeroext i1 @vreduce_smax_nxv128i1(<vscale x 128 x i1> %v) {
 ; CHECK-LABEL: vreduce_smax_nxv128i1:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e8, m8, ta, ma
-; CHECK-NEXT:    vmand.mm v8, v0, v8
-; CHECK-NEXT:    vmnot.m v8, v8
+; CHECK-NEXT:    vmnand.mm v8, v0, v8
 ; CHECK-NEXT:    vcpop.m a0, v8
 ; CHECK-NEXT:    seqz a0, a0
 ; CHECK-NEXT:    ret
@@ -829,8 +827,7 @@ define zeroext i1 @vreduce_umin_nxv128i1(<vscale x 128 x i1> %v) {
 ; CHECK-LABEL: vreduce_umin_nxv128i1:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e8, m8, ta, ma
-; CHECK-NEXT:    vmand.mm v8, v0, v8
-; CHECK-NEXT:    vmnot.m v8, v8
+; CHECK-NEXT:    vmnand.mm v8, v0, v8
 ; CHECK-NEXT:    vcpop.m a0, v8
 ; CHECK-NEXT:    seqz a0, a0
 ; CHECK-NEXT:    ret
@@ -892,8 +889,7 @@ define zeroext i1 @vreduce_and_nxv256i1(<vscale x 256 x i1> %v) {
 ; CHECK-NEXT:    vsetvli a0, zero, e8, m8, ta, ma
 ; CHECK-NEXT:    vmand.mm v8, v8, v10
 ; CHECK-NEXT:    vmand.mm v9, v0, v9
-; CHECK-NEXT:    vmand.mm v8, v9, v8
-; CHECK-NEXT:    vmnot.m v8, v8
+; CHECK-NEXT:    vmnand.mm v8, v9, v8
 ; CHECK-NEXT:    vcpop.m a0, v8
 ; CHECK-NEXT:    seqz a0, a0
 ; CHECK-NEXT:    ret
@@ -925,8 +921,7 @@ define zeroext i1 @vreduce_smax_nxv256i1(<vscale x 256 x i1> %v) {
 ; CHECK-NEXT:    vsetvli a0, zero, e8, m8, ta, ma
 ; CHECK-NEXT:    vmand.mm v8, v8, v10
 ; CHECK-NEXT:    vmand.mm v9, v0, v9
-; CHECK-NEXT:    vmand.mm v8, v9, v8
-; CHECK-NEXT:    vmnot.m v8, v8
+; CHECK-NEXT:    vmnand.mm v8, v9, v8
 ; CHECK-NEXT:    vcpop.m a0, v8
 ; CHECK-NEXT:    seqz a0, a0
 ; CHECK-NEXT:    ret
@@ -942,8 +937,7 @@ define zeroext i1 @vreduce_umin_nxv256i1(<vscale x 256 x i1> %v) {
 ; CHECK-NEXT:    vsetvli a0, zero, e8, m8, ta, ma
 ; CHECK-NEXT:    vmand.mm v8, v8, v10
 ; CHECK-NEXT:    vmand.mm v9, v0, v9
-; CHECK-NEXT:    vmand.mm v8, v9, v8
-; CHECK-NEXT:    vmnot.m v8, v8
+; CHECK-NEXT:    vmnand.mm v8, v9, v8
 ; CHECK-NEXT:    vcpop.m a0, v8
 ; CHECK-NEXT:    seqz a0, a0
 ; CHECK-NEXT:    ret
@@ -1019,8 +1013,7 @@ define zeroext i1 @vreduce_and_nxv512i1(<vscale x 512 x i1> %v) {
 ; CHECK-NEXT:    vmand.mm v11, v0, v11
 ; CHECK-NEXT:    vmand.mm v8, v8, v10
 ; CHECK-NEXT:    vmand.mm v9, v11, v9
-; CHECK-NEXT:    vmand.mm v8, v9, v8
-; CHECK-NEXT:    vmnot.m v8, v8
+; CHECK-NEXT:    vmnand.mm v8, v9, v8
 ; CHECK-NEXT:    vcpop.m a0, v8
 ; CHECK-NEXT:    seqz a0, a0
 ; CHECK-NEXT:    ret
@@ -1060,8 +1053,7 @@ define zeroext i1 @vreduce_smax_nxv512i1(<vscale x 512 x i1> %v) {
 ; CHECK-NEXT:    vmand.mm v11, v0, v11
 ; CHECK-NEXT:    vmand.mm v8, v8, v10
 ; CHECK-NEXT:    vmand.mm v9, v11, v9
-; CHECK-NEXT:    vmand.mm v8, v9, v8
-; CHECK-NEXT:    vmnot.m v8, v8
+; CHECK-NEXT:    vmnand.mm v8, v9, v8
 ; CHECK-NEXT:    vcpop.m a0, v8
 ; CHECK-NEXT:    seqz a0, a0
 ; CHECK-NEXT:    ret
@@ -1081,8 +1073,7 @@ define zeroext i1 @vreduce_umin_nxv512i1(<vscale x 512 x i1> %v) {
 ; CHECK-NEXT:    vmand.mm v11, v0, v11
 ; CHECK-NEXT:    vmand.mm v8, v8, v10
 ; CHECK-NEXT:    vmand.mm v9, v11, v9
-; CHECK-NEXT:    vmand.mm v8, v9, v8
-; CHECK-NEXT:    vmnot.m v8, v8
+; CHECK-NEXT:    vmnand.mm v8, v9, v8
 ; CHECK-NEXT:    vcpop.m a0, v8
 ; CHECK-NEXT:    seqz a0, a0
 ; CHECK-NEXT:    ret
@@ -1186,8 +1177,7 @@ define zeroext i1 @vreduce_and_nxv1024i1(<vscale x 1024 x i1> %v) {
 ; CHECK-NEXT:    vmand.mm v11, v15, v11
 ; CHECK-NEXT:    vmand.mm v8, v8, v10
 ; CHECK-NEXT:    vmand.mm v9, v11, v9
-; CHECK-NEXT:    vmand.mm v8, v9, v8
-; CHECK-NEXT:    vmnot.m v8, v8
+; CHECK-NEXT:    vmnand.mm v8, v9, v8
 ; CHECK-NEXT:    vcpop.m a0, v8
 ; CHECK-NEXT:    seqz a0, a0
 ; CHECK-NEXT:    ret
@@ -1243,8 +1233,7 @@ define zeroext i1 @vreduce_smax_nxv1024i1(<vscale x 1024 x i1> %v) {
 ; CHECK-NEXT:    vmand.mm v11, v15, v11
 ; CHECK-NEXT:    vmand.mm v8, v8, v10
 ; CHECK-NEXT:    vmand.mm v9, v11, v9
-; CHECK-NEXT:    vmand.mm v8, v9, v8
-; CHECK-NEXT:    vmnot.m v8, v8
+; CHECK-NEXT:    vmnand.mm v8, v9, v8
 ; CHECK-NEXT:    vcpop.m a0, v8
 ; CHECK-NEXT:    seqz a0, a0
 ; CHECK-NEXT:    ret
@@ -1272,8 +1261,7 @@ define zeroext i1 @vreduce_umin_nxv1024i1(<vscale x 1024 x i1> %v) {
 ; CHECK-NEXT:    vmand.mm v11, v15, v11
 ; CHECK-NEXT:    vmand.mm v8, v8, v10
 ; CHECK-NEXT:    vmand.mm v9, v11, v9
-; CHECK-NEXT:    vmand.mm v8, v9, v8
-; CHECK-NEXT:    vmnot.m v8, v8
+; CHECK-NEXT:    vmnand.mm v8, v9, v8
 ; CHECK-NEXT:    vcpop.m a0, v8
 ; CHECK-NEXT:    seqz a0, a0
 ; CHECK-NEXT:    ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/vrem-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vrem-vp.ll
index 3273274..66ba269 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vrem-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vrem-vp.ll
@@ -893,9 +893,8 @@ define <vscale x 1 x i64> @vrem_vx_nxv1i64(<vscale x 1 x i64> %va, i64 %b, <vsca
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
-; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vrem.vv v8, v8, v9, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -920,9 +919,8 @@ define <vscale x 1 x i64> @vrem_vx_nxv1i64_unmasked(<vscale x 1 x i64> %va, i64
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
-; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vrem.vv v8, v8, v9
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -969,9 +967,8 @@ define <vscale x 2 x i64> @vrem_vx_nxv2i64(<vscale x 2 x i64> %va, i64 %b, <vsca
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
-; RV32-NEXT:    vlse64.v v10, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m2, ta, ma
+; RV32-NEXT:    vlse64.v v10, (a0), zero
 ; RV32-NEXT:    vrem.vv v8, v8, v10, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -996,9 +993,8 @@ define <vscale x 2 x i64> @vrem_vx_nxv2i64_unmasked(<vscale x 2 x i64> %va, i64
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
-; RV32-NEXT:    vlse64.v v10, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m2, ta, ma
+; RV32-NEXT:    vlse64.v v10, (a0), zero
 ; RV32-NEXT:    vrem.vv v8, v8, v10
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1045,9 +1041,8 @@ define <vscale x 4 x i64> @vrem_vx_nxv4i64(<vscale x 4 x i64> %va, i64 %b, <vsca
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
-; RV32-NEXT:    vlse64.v v12, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m4, ta, ma
+; RV32-NEXT:    vlse64.v v12, (a0), zero
 ; RV32-NEXT:    vrem.vv v8, v8, v12, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1072,9 +1067,8 @@ define <vscale x 4 x i64> @vrem_vx_nxv4i64_unmasked(<vscale x 4 x i64> %va, i64
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
-; RV32-NEXT:    vlse64.v v12, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m4, ta, ma
+; RV32-NEXT:    vlse64.v v12, (a0), zero
 ; RV32-NEXT:    vrem.vv v8, v8, v12
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1121,9 +1115,8 @@ define <vscale x 8 x i64> @vrem_vx_nxv8i64(<vscale x 8 x i64> %va, i64 %b, <vsca
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a0), zero
 ; RV32-NEXT:    vrem.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1148,9 +1141,8 @@ define <vscale x 8 x i64> @vrem_vx_nxv8i64_unmasked(<vscale x 8 x i64> %va, i64
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a0), zero
 ; RV32-NEXT:    vrem.vv v8, v8, v16
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
diff --git a/llvm/test/CodeGen/RISCV/rvv/vremu-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vremu-vp.ll
index 6b588d0..4608661 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vremu-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vremu-vp.ll
@@ -892,9 +892,8 @@ define <vscale x 1 x i64> @vremu_vx_nxv1i64(<vscale x 1 x i64> %va, i64 %b, <vsc
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
-; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vremu.vv v8, v8, v9, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -919,9 +918,8 @@ define <vscale x 1 x i64> @vremu_vx_nxv1i64_unmasked(<vscale x 1 x i64> %va, i64
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
-; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vremu.vv v8, v8, v9
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -968,9 +966,8 @@ define <vscale x 2 x i64> @vremu_vx_nxv2i64(<vscale x 2 x i64> %va, i64 %b, <vsc
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
-; RV32-NEXT:    vlse64.v v10, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m2, ta, ma
+; RV32-NEXT:    vlse64.v v10, (a0), zero
 ; RV32-NEXT:    vremu.vv v8, v8, v10, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -995,9 +992,8 @@ define <vscale x 2 x i64> @vremu_vx_nxv2i64_unmasked(<vscale x 2 x i64> %va, i64
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
-; RV32-NEXT:    vlse64.v v10, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m2, ta, ma
+; RV32-NEXT:    vlse64.v v10, (a0), zero
 ; RV32-NEXT:    vremu.vv v8, v8, v10
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1044,9 +1040,8 @@ define <vscale x 4 x i64> @vremu_vx_nxv4i64(<vscale x 4 x i64> %va, i64 %b, <vsc
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
-; RV32-NEXT:    vlse64.v v12, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m4, ta, ma
+; RV32-NEXT:    vlse64.v v12, (a0), zero
 ; RV32-NEXT:    vremu.vv v8, v8, v12, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1071,9 +1066,8 @@ define <vscale x 4 x i64> @vremu_vx_nxv4i64_unmasked(<vscale x 4 x i64> %va, i64
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
-; RV32-NEXT:    vlse64.v v12, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m4, ta, ma
+; RV32-NEXT:    vlse64.v v12, (a0), zero
 ; RV32-NEXT:    vremu.vv v8, v8, v12
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1120,9 +1114,8 @@ define <vscale x 8 x i64> @vremu_vx_nxv8i64(<vscale x 8 x i64> %va, i64 %b, <vsc
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a0), zero
 ; RV32-NEXT:    vremu.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1147,9 +1140,8 @@ define <vscale x 8 x i64> @vremu_vx_nxv8i64_unmasked(<vscale x 8 x i64> %va, i64
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a0), zero
 ; RV32-NEXT:    vremu.vv v8, v8, v16
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
diff --git a/llvm/test/CodeGen/RISCV/rvv/vrsub-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vrsub-vp.ll
index 0f38e94..c41139c 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vrsub-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vrsub-vp.ll
@@ -842,9 +842,8 @@ define <vscale x 1 x i64> @vrsub_vx_nxv1i64(<vscale x 1 x i64> %va, i64 %b, <vsc
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
-; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vsub.vv v8, v9, v8, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -869,9 +868,8 @@ define <vscale x 1 x i64> @vrsub_vx_nxv1i64_unmasked(<vscale x 1 x i64> %va, i64
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
-; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vsub.vv v8, v9, v8
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -918,9 +916,8 @@ define <vscale x 2 x i64> @vrsub_vx_nxv2i64(<vscale x 2 x i64> %va, i64 %b, <vsc
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
-; RV32-NEXT:    vlse64.v v10, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m2, ta, ma
+; RV32-NEXT:    vlse64.v v10, (a0), zero
 ; RV32-NEXT:    vsub.vv v8, v10, v8, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -945,9 +942,8 @@ define <vscale x 2 x i64> @vrsub_vx_nxv2i64_unmasked(<vscale x 2 x i64> %va, i64
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
-; RV32-NEXT:    vlse64.v v10, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m2, ta, ma
+; RV32-NEXT:    vlse64.v v10, (a0), zero
 ; RV32-NEXT:    vsub.vv v8, v10, v8
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -994,9 +990,8 @@ define <vscale x 4 x i64> @vrsub_vx_nxv4i64(<vscale x 4 x i64> %va, i64 %b, <vsc
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
-; RV32-NEXT:    vlse64.v v12, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m4, ta, ma
+; RV32-NEXT:    vlse64.v v12, (a0), zero
 ; RV32-NEXT:    vsub.vv v8, v12, v8, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1021,9 +1016,8 @@ define <vscale x 4 x i64> @vrsub_vx_nxv4i64_unmasked(<vscale x 4 x i64> %va, i64
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
-; RV32-NEXT:    vlse64.v v12, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m4, ta, ma
+; RV32-NEXT:    vlse64.v v12, (a0), zero
 ; RV32-NEXT:    vsub.vv v8, v12, v8
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1070,9 +1064,8 @@ define <vscale x 8 x i64> @vrsub_vx_nxv8i64(<vscale x 8 x i64> %va, i64 %b, <vsc
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a0), zero
 ; RV32-NEXT:    vsub.vv v8, v16, v8, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1097,9 +1090,8 @@ define <vscale x 8 x i64> @vrsub_vx_nxv8i64_unmasked(<vscale x 8 x i64> %va, i64
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a0), zero
 ; RV32-NEXT:    vsub.vv v8, v16, v8
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
diff --git a/llvm/test/CodeGen/RISCV/rvv/vsadd-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vsadd-vp.ll
index 575d041..e471f4b 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vsadd-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vsadd-vp.ll
@@ -1425,9 +1425,8 @@ define <vscale x 1 x i64> @vsadd_vx_nxv1i64(<vscale x 1 x i64> %va, i64 %b, <vsc
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
-; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vsadd.vv v8, v8, v9, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1452,9 +1451,8 @@ define <vscale x 1 x i64> @vsadd_vx_nxv1i64_unmasked(<vscale x 1 x i64> %va, i64
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
-; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vsadd.vv v8, v8, v9
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1521,9 +1519,8 @@ define <vscale x 2 x i64> @vsadd_vx_nxv2i64(<vscale x 2 x i64> %va, i64 %b, <vsc
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
-; RV32-NEXT:    vlse64.v v10, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m2, ta, ma
+; RV32-NEXT:    vlse64.v v10, (a0), zero
 ; RV32-NEXT:    vsadd.vv v8, v8, v10, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1548,9 +1545,8 @@ define <vscale x 2 x i64> @vsadd_vx_nxv2i64_unmasked(<vscale x 2 x i64> %va, i64
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
-; RV32-NEXT:    vlse64.v v10, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m2, ta, ma
+; RV32-NEXT:    vlse64.v v10, (a0), zero
 ; RV32-NEXT:    vsadd.vv v8, v8, v10
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1617,9 +1613,8 @@ define <vscale x 4 x i64> @vsadd_vx_nxv4i64(<vscale x 4 x i64> %va, i64 %b, <vsc
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
-; RV32-NEXT:    vlse64.v v12, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m4, ta, ma
+; RV32-NEXT:    vlse64.v v12, (a0), zero
 ; RV32-NEXT:    vsadd.vv v8, v8, v12, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1644,9 +1639,8 @@ define <vscale x 4 x i64> @vsadd_vx_nxv4i64_unmasked(<vscale x 4 x i64> %va, i64
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
-; RV32-NEXT:    vlse64.v v12, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m4, ta, ma
+; RV32-NEXT:    vlse64.v v12, (a0), zero
 ; RV32-NEXT:    vsadd.vv v8, v8, v12
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1713,9 +1707,8 @@ define <vscale x 8 x i64> @vsadd_vx_nxv8i64(<vscale x 8 x i64> %va, i64 %b, <vsc
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a0), zero
 ; RV32-NEXT:    vsadd.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1740,9 +1733,8 @@ define <vscale x 8 x i64> @vsadd_vx_nxv8i64_unmasked(<vscale x 8 x i64> %va, i64
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a0), zero
 ; RV32-NEXT:    vsadd.vv v8, v8, v16
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
diff --git a/llvm/test/CodeGen/RISCV/rvv/vsaddu-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vsaddu-vp.ll
index c9ed72b..f76a2b4 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vsaddu-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vsaddu-vp.ll
@@ -1424,9 +1424,8 @@ define <vscale x 1 x i64> @vsaddu_vx_nxv1i64(<vscale x 1 x i64> %va, i64 %b, <vs
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
-; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vsaddu.vv v8, v8, v9, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1451,9 +1450,8 @@ define <vscale x 1 x i64> @vsaddu_vx_nxv1i64_unmasked(<vscale x 1 x i64> %va, i6
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
-; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vsaddu.vv v8, v8, v9
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1520,9 +1518,8 @@ define <vscale x 2 x i64> @vsaddu_vx_nxv2i64(<vscale x 2 x i64> %va, i64 %b, <vs
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
-; RV32-NEXT:    vlse64.v v10, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m2, ta, ma
+; RV32-NEXT:    vlse64.v v10, (a0), zero
 ; RV32-NEXT:    vsaddu.vv v8, v8, v10, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1547,9 +1544,8 @@ define <vscale x 2 x i64> @vsaddu_vx_nxv2i64_unmasked(<vscale x 2 x i64> %va, i6
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
-; RV32-NEXT:    vlse64.v v10, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m2, ta, ma
+; RV32-NEXT:    vlse64.v v10, (a0), zero
 ; RV32-NEXT:    vsaddu.vv v8, v8, v10
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1616,9 +1612,8 @@ define <vscale x 4 x i64> @vsaddu_vx_nxv4i64(<vscale x 4 x i64> %va, i64 %b, <vs
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
-; RV32-NEXT:    vlse64.v v12, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m4, ta, ma
+; RV32-NEXT:    vlse64.v v12, (a0), zero
 ; RV32-NEXT:    vsaddu.vv v8, v8, v12, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1643,9 +1638,8 @@ define <vscale x 4 x i64> @vsaddu_vx_nxv4i64_unmasked(<vscale x 4 x i64> %va, i6
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
-; RV32-NEXT:    vlse64.v v12, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m4, ta, ma
+; RV32-NEXT:    vlse64.v v12, (a0), zero
 ; RV32-NEXT:    vsaddu.vv v8, v8, v12
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1712,9 +1706,8 @@ define <vscale x 8 x i64> @vsaddu_vx_nxv8i64(<vscale x 8 x i64> %va, i64 %b, <vs
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a0), zero
 ; RV32-NEXT:    vsaddu.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1739,9 +1732,8 @@ define <vscale x 8 x i64> @vsaddu_vx_nxv8i64_unmasked(<vscale x 8 x i64> %va, i6
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a0), zero
 ; RV32-NEXT:    vsaddu.vv v8, v8, v16
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
diff --git a/llvm/test/CodeGen/RISCV/rvv/vsitofp-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vsitofp-vp.ll
index 001f744..c041a16 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vsitofp-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vsitofp-vp.ll
@@ -131,10 +131,9 @@ declare <vscale x 2 x half> @llvm.vp.sitofp.nxv2f16.nxv2i7(<vscale x 2 x i7>, <v
 define <vscale x 2 x half> @vsitofp_nxv2f16_nxv2i7(<vscale x 2 x i7> %va, <vscale x 2 x i1> %m, i32 zeroext %evl) {
 ; ZVFH-LABEL: vsitofp_nxv2f16_nxv2i7:
 ; ZVFH:       # %bb.0:
-; ZVFH-NEXT:    vsetvli a1, zero, e8, mf4, ta, ma
+; ZVFH-NEXT:    vsetvli zero, a0, e8, mf4, ta, ma
 ; ZVFH-NEXT:    vadd.vv v8, v8, v8
 ; ZVFH-NEXT:    vsra.vi v9, v8, 1
-; ZVFH-NEXT:    vsetvli zero, a0, e8, mf4, ta, ma
 ; ZVFH-NEXT:    vfwcvt.f.x.v v8, v9, v0.t
 ; ZVFH-NEXT:    ret
 ;
diff --git a/llvm/test/CodeGen/RISCV/rvv/vssub-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vssub-vp.ll
index c0da928..ebf8d5e 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vssub-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vssub-vp.ll
@@ -1468,9 +1468,8 @@ define <vscale x 1 x i64> @vssub_vx_nxv1i64(<vscale x 1 x i64> %va, i64 %b, <vsc
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
-; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vssub.vv v8, v8, v9, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1495,9 +1494,8 @@ define <vscale x 1 x i64> @vssub_vx_nxv1i64_unmasked(<vscale x 1 x i64> %va, i64
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
-; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vssub.vv v8, v8, v9
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1566,9 +1564,8 @@ define <vscale x 2 x i64> @vssub_vx_nxv2i64(<vscale x 2 x i64> %va, i64 %b, <vsc
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
-; RV32-NEXT:    vlse64.v v10, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m2, ta, ma
+; RV32-NEXT:    vlse64.v v10, (a0), zero
 ; RV32-NEXT:    vssub.vv v8, v8, v10, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1593,9 +1590,8 @@ define <vscale x 2 x i64> @vssub_vx_nxv2i64_unmasked(<vscale x 2 x i64> %va, i64
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
-; RV32-NEXT:    vlse64.v v10, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m2, ta, ma
+; RV32-NEXT:    vlse64.v v10, (a0), zero
 ; RV32-NEXT:    vssub.vv v8, v8, v10
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1664,9 +1660,8 @@ define <vscale x 4 x i64> @vssub_vx_nxv4i64(<vscale x 4 x i64> %va, i64 %b, <vsc
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
-; RV32-NEXT:    vlse64.v v12, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m4, ta, ma
+; RV32-NEXT:    vlse64.v v12, (a0), zero
 ; RV32-NEXT:    vssub.vv v8, v8, v12, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1691,9 +1686,8 @@ define <vscale x 4 x i64> @vssub_vx_nxv4i64_unmasked(<vscale x 4 x i64> %va, i64
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
-; RV32-NEXT:    vlse64.v v12, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m4, ta, ma
+; RV32-NEXT:    vlse64.v v12, (a0), zero
 ; RV32-NEXT:    vssub.vv v8, v8, v12
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1762,9 +1756,8 @@ define <vscale x 8 x i64> @vssub_vx_nxv8i64(<vscale x 8 x i64> %va, i64 %b, <vsc
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a0), zero
 ; RV32-NEXT:    vssub.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1789,9 +1782,8 @@ define <vscale x 8 x i64> @vssub_vx_nxv8i64_unmasked(<vscale x 8 x i64> %va, i64
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a0), zero
 ; RV32-NEXT:    vssub.vv v8, v8, v16
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
diff --git a/llvm/test/CodeGen/RISCV/rvv/vssubu-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vssubu-vp.ll
index b602f11..d54901c 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vssubu-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vssubu-vp.ll
@@ -1466,9 +1466,8 @@ define <vscale x 1 x i64> @vssubu_vx_nxv1i64(<vscale x 1 x i64> %va, i64 %b, <vs
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
-; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vssubu.vv v8, v8, v9, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1493,9 +1492,8 @@ define <vscale x 1 x i64> @vssubu_vx_nxv1i64_unmasked(<vscale x 1 x i64> %va, i6
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
-; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vssubu.vv v8, v8, v9
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1564,9 +1562,8 @@ define <vscale x 2 x i64> @vssubu_vx_nxv2i64(<vscale x 2 x i64> %va, i64 %b, <vs
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
-; RV32-NEXT:    vlse64.v v10, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m2, ta, ma
+; RV32-NEXT:    vlse64.v v10, (a0), zero
 ; RV32-NEXT:    vssubu.vv v8, v8, v10, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1591,9 +1588,8 @@ define <vscale x 2 x i64> @vssubu_vx_nxv2i64_unmasked(<vscale x 2 x i64> %va, i6
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
-; RV32-NEXT:    vlse64.v v10, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m2, ta, ma
+; RV32-NEXT:    vlse64.v v10, (a0), zero
 ; RV32-NEXT:    vssubu.vv v8, v8, v10
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1662,9 +1658,8 @@ define <vscale x 4 x i64> @vssubu_vx_nxv4i64(<vscale x 4 x i64> %va, i64 %b, <vs
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
-; RV32-NEXT:    vlse64.v v12, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m4, ta, ma
+; RV32-NEXT:    vlse64.v v12, (a0), zero
 ; RV32-NEXT:    vssubu.vv v8, v8, v12, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1689,9 +1684,8 @@ define <vscale x 4 x i64> @vssubu_vx_nxv4i64_unmasked(<vscale x 4 x i64> %va, i6
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
-; RV32-NEXT:    vlse64.v v12, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m4, ta, ma
+; RV32-NEXT:    vlse64.v v12, (a0), zero
 ; RV32-NEXT:    vssubu.vv v8, v8, v12
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1760,9 +1754,8 @@ define <vscale x 8 x i64> @vssubu_vx_nxv8i64(<vscale x 8 x i64> %va, i64 %b, <vs
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a0), zero
 ; RV32-NEXT:    vssubu.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1787,9 +1780,8 @@ define <vscale x 8 x i64> @vssubu_vx_nxv8i64_unmasked(<vscale x 8 x i64> %va, i6
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a0), zero
 ; RV32-NEXT:    vssubu.vv v8, v8, v16
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
diff --git a/llvm/test/CodeGen/RISCV/rvv/vsub-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vsub-vp.ll
index 65ba791..e28da6b 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vsub-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vsub-vp.ll
@@ -922,9 +922,8 @@ define <vscale x 1 x i64> @vsub_vx_nxv1i64(<vscale x 1 x i64> %va, i64 %b, <vsca
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
-; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -949,9 +948,8 @@ define <vscale x 1 x i64> @vsub_vx_nxv1i64_unmasked(<vscale x 1 x i64> %va, i64
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
-; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vsub.vv v8, v8, v9
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -998,9 +996,8 @@ define <vscale x 2 x i64> @vsub_vx_nxv2i64(<vscale x 2 x i64> %va, i64 %b, <vsca
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
-; RV32-NEXT:    vlse64.v v10, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m2, ta, ma
+; RV32-NEXT:    vlse64.v v10, (a0), zero
 ; RV32-NEXT:    vsub.vv v8, v8, v10, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1025,9 +1022,8 @@ define <vscale x 2 x i64> @vsub_vx_nxv2i64_unmasked(<vscale x 2 x i64> %va, i64
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
-; RV32-NEXT:    vlse64.v v10, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m2, ta, ma
+; RV32-NEXT:    vlse64.v v10, (a0), zero
 ; RV32-NEXT:    vsub.vv v8, v8, v10
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1074,9 +1070,8 @@ define <vscale x 4 x i64> @vsub_vx_nxv4i64(<vscale x 4 x i64> %va, i64 %b, <vsca
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
-; RV32-NEXT:    vlse64.v v12, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m4, ta, ma
+; RV32-NEXT:    vlse64.v v12, (a0), zero
 ; RV32-NEXT:    vsub.vv v8, v8, v12, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1101,9 +1096,8 @@ define <vscale x 4 x i64> @vsub_vx_nxv4i64_unmasked(<vscale x 4 x i64> %va, i64
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
-; RV32-NEXT:    vlse64.v v12, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m4, ta, ma
+; RV32-NEXT:    vlse64.v v12, (a0), zero
 ; RV32-NEXT:    vsub.vv v8, v8, v12
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1150,9 +1144,8 @@ define <vscale x 8 x i64> @vsub_vx_nxv8i64(<vscale x 8 x i64> %va, i64 %b, <vsca
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a0), zero
 ; RV32-NEXT:    vsub.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1177,9 +1170,8 @@ define <vscale x 8 x i64> @vsub_vx_nxv8i64_unmasked(<vscale x 8 x i64> %va, i64
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a0), zero
 ; RV32-NEXT:    vsub.vv v8, v8, v16
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
diff --git a/llvm/test/CodeGen/RISCV/rvv/vuitofp-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vuitofp-vp.ll
index 06d8519..3d27a1e 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vuitofp-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vuitofp-vp.ll
@@ -124,9 +124,8 @@ define <vscale x 2 x half> @vuitofp_nxv2f16_nxv2i7(<vscale x 2 x i7> %va, <vscal
 ; ZVFH-LABEL: vuitofp_nxv2f16_nxv2i7:
 ; ZVFH:       # %bb.0:
 ; ZVFH-NEXT:    li a1, 127
-; ZVFH-NEXT:    vsetvli a2, zero, e8, mf4, ta, ma
-; ZVFH-NEXT:    vand.vx v9, v8, a1
 ; ZVFH-NEXT:    vsetvli zero, a0, e8, mf4, ta, ma
+; ZVFH-NEXT:    vand.vx v9, v8, a1
 ; ZVFH-NEXT:    vfwcvt.f.xu.v v8, v9, v0.t
 ; ZVFH-NEXT:    ret
 ;
diff --git a/llvm/test/CodeGen/RISCV/rvv/vxor-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vxor-vp.ll
index f3dd7ec..1694a7a 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vxor-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vxor-vp.ll
@@ -1694,9 +1694,8 @@ define <vscale x 1 x i64> @vxor_vx_nxv1i64(<vscale x 1 x i64> %va, i64 %b, <vsca
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
-; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vxor.vv v8, v8, v9, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1721,9 +1720,8 @@ define <vscale x 1 x i64> @vxor_vx_nxv1i64_unmasked(<vscale x 1 x i64> %va, i64
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
-; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vxor.vv v8, v8, v9
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1810,9 +1808,8 @@ define <vscale x 2 x i64> @vxor_vx_nxv2i64(<vscale x 2 x i64> %va, i64 %b, <vsca
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
-; RV32-NEXT:    vlse64.v v10, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m2, ta, ma
+; RV32-NEXT:    vlse64.v v10, (a0), zero
 ; RV32-NEXT:    vxor.vv v8, v8, v10, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1837,9 +1834,8 @@ define <vscale x 2 x i64> @vxor_vx_nxv2i64_unmasked(<vscale x 2 x i64> %va, i64
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
-; RV32-NEXT:    vlse64.v v10, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m2, ta, ma
+; RV32-NEXT:    vlse64.v v10, (a0), zero
 ; RV32-NEXT:    vxor.vv v8, v8, v10
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1926,9 +1922,8 @@ define <vscale x 4 x i64> @vxor_vx_nxv4i64(<vscale x 4 x i64> %va, i64 %b, <vsca
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
-; RV32-NEXT:    vlse64.v v12, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m4, ta, ma
+; RV32-NEXT:    vlse64.v v12, (a0), zero
 ; RV32-NEXT:    vxor.vv v8, v8, v12, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1953,9 +1948,8 @@ define <vscale x 4 x i64> @vxor_vx_nxv4i64_unmasked(<vscale x 4 x i64> %va, i64
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
-; RV32-NEXT:    vlse64.v v12, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m4, ta, ma
+; RV32-NEXT:    vlse64.v v12, (a0), zero
 ; RV32-NEXT:    vxor.vv v8, v8, v12
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -2042,9 +2036,8 @@ define <vscale x 8 x i64> @vxor_vx_nxv8i64(<vscale x 8 x i64> %va, i64 %b, <vsca
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a0), zero
 ; RV32-NEXT:    vxor.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -2069,9 +2062,8 @@ define <vscale x 8 x i64> @vxor_vx_nxv8i64_unmasked(<vscale x 8 x i64> %va, i64
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a0), zero
 ; RV32-NEXT:    vxor.vv v8, v8, v16
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
diff --git a/llvm/test/CodeGen/RISCV/xcvmem-heuristic.ll b/llvm/test/CodeGen/RISCV/xcvmem-heuristic.ll
new file mode 100644
index 0000000..c8832bf
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/xcvmem-heuristic.ll
@@ -0,0 +1,34 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -O3 -mtriple=riscv32 -mattr=+m,+xcvmem -verify-machineinstrs < %s \
+; RUN:   | FileCheck %s --check-prefixes=CHECK
+
+define i32 @test_heuristic(ptr %b, i32 %e, i1 %0) {
+; CHECK-LABEL: test_heuristic:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    add a3, a0, a1
+; CHECK-NEXT:    andi a2, a2, 1
+; CHECK-NEXT:  .LBB0_1: # %loop
+; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    cv.lbu a1, (a3), 1
+; CHECK-NEXT:    addi a0, a0, 1
+; CHECK-NEXT:    beqz a2, .LBB0_1
+; CHECK-NEXT:  # %bb.2: # %exit
+; CHECK-NEXT:    mv a0, a1
+; CHECK-NEXT:    ret
+entry:
+  %1 = getelementptr i8, ptr %b, i32 %e
+  br label %loop
+
+loop:                                             ; preds = %loop, %entry
+  %2 = phi ptr [ %b, %entry ], [ %7, %loop ]
+  %3 = phi ptr [ %1, %entry ], [ %8, %loop ]
+  %4 = load i8, ptr %2, align 1
+  %5 = load i8, ptr %3, align 1
+  %6 = zext i8 %5 to i32
+  %7 = getelementptr i8, ptr %2, i32 1
+  %8 = getelementptr i8, ptr %3, i32 1
+  br i1 %0, label %exit, label %loop
+
+exit:                                             ; preds = %loop
+  ret i32 %6
+}
diff --git a/llvm/test/CodeGen/SPIRV/AtomicCompareExchange.ll b/llvm/test/CodeGen/SPIRV/AtomicCompareExchange.ll
index f8207c56..5ce4a19 100644
--- a/llvm/test/CodeGen/SPIRV/AtomicCompareExchange.ll
+++ b/llvm/test/CodeGen/SPIRV/AtomicCompareExchange.ll
@@ -1,6 +1,6 @@
 ; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV
 
-; CHECK-SPIRV:      %[[#Int:]] = OpTypeInt 32 0
+; CHECK-SPIRV-DAG:      %[[#Int:]] = OpTypeInt 32 0
 ; CHECK-SPIRV-DAG:  %[[#MemScope_CrossDevice:]] = OpConstant %[[#Int]] 0
 ; CHECK-SPIRV-DAG:  %[[#MemSemEqual_SeqCst:]] = OpConstant %[[#Int]] 16
 ; CHECK-SPIRV-DAG:  %[[#MemSemUnequal_Acquire:]] = OpConstant %[[#Int]] 2
diff --git a/llvm/test/CodeGen/SPIRV/event-zero-const.ll b/llvm/test/CodeGen/SPIRV/event-zero-const.ll
index f3f20a0..523d2ad 100644
--- a/llvm/test/CodeGen/SPIRV/event-zero-const.ll
+++ b/llvm/test/CodeGen/SPIRV/event-zero-const.ll
@@ -4,10 +4,10 @@
 ; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s
 ; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown %s -o - -filetype=obj | spirv-val %}
 
-; CHECK: %[[#LongTy:]] = OpTypeInt 64 0
-; CHECK: %[[#EventTy:]] = OpTypeEvent
-; CHECK: %[[#LongNull:]] = OpConstantNull %[[#LongTy]]
-; CHECK: %[[#EventNull:]] = OpConstantNull %[[#EventTy]]
+; CHECK-DAG: %[[#LongTy:]] = OpTypeInt 64 0
+; CHECK-DAG: %[[#EventTy:]] = OpTypeEvent
+; CHECK-DAG: %[[#LongNull:]] = OpConstantNull %[[#LongTy]]
+; CHECK-DAG: %[[#EventNull:]] = OpConstantNull %[[#EventTy]]
 ; CHECK: OpFunction
 ; CHECK: OpINotEqual %[[#]] %[[#]] %[[#LongNull]]
 ; CHECK: OpGroupAsyncCopy %[[#EventTy]] %[[#]] %[[#]] %[[#]] %[[#]] %[[#]] %[[#EventNull]]
diff --git a/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_function_pointers/fp-simple-hierarchy.ll b/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_function_pointers/fp-simple-hierarchy.ll
index 368c5d4..80309e9 100644
--- a/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_function_pointers/fp-simple-hierarchy.ll
+++ b/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_function_pointers/fp-simple-hierarchy.ll
@@ -1,16 +1,17 @@
-; RUN: llc -O0 -mtriple=spirv32-unknown-unknown --spirv-ext=+SPV_INTEL_function_pointers %s -o - | FileCheck %s
+; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv32-unknown-unknown --spirv-ext=+SPV_INTEL_function_pointers %s -o - | FileCheck %s
 ; TODO: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %}
 
-; TODO: This test currently fails with LLVM_ENABLE_EXPENSIVE_CHECKS enabled
-; XFAIL: expensive_checks
-
 ; CHECK-DAG: OpName %[[I9:.*]] "_ZN13BaseIncrement9incrementEPi"
 ; CHECK-DAG: OpName %[[I29:.*]] "_ZN12IncrementBy29incrementEPi"
 ; CHECK-DAG: OpName %[[I49:.*]] "_ZN12IncrementBy49incrementEPi"
 ; CHECK-DAG: OpName %[[I89:.*]] "_ZN12IncrementBy89incrementEPi"
+; CHECK-DAG: OpName %[[Foo:.*]] "foo"
 
 ; CHECK-DAG: %[[TyVoid:.*]] = OpTypeVoid
-; CHECK-DAG: %[[TyArr:.*]] = OpTypeArray
+; CHECK-DAG: %[[TyInt32:.*]] = OpTypeInt 32 0
+; CHECK-DAG: %[[TyInt8:.*]] = OpTypeInt 8 0
+; CHECK-DAG: %[[Const8:.*]] = OpConstant %[[TyInt32]] 8
+; CHECK-DAG: %[[TyArr:.*]] = OpTypeArray %[[TyInt8]] %[[Const8]]
 ; CHECK-DAG: %[[TyStruct1:.*]] = OpTypeStruct %[[TyArr]]
 ; CHECK-DAG: %[[TyStruct2:.*]] = OpTypeStruct %[[TyStruct1]]
 ; CHECK-DAG: %[[TyPtrStruct2:.*]] = OpTypePointer Generic %[[TyStruct2]]
@@ -18,16 +19,21 @@
 ; CHECK-DAG: %[[TyPtrFun:.*]] = OpTypePointer Generic %[[TyFun]]
 ; CHECK-DAG: %[[TyPtrPtrFun:.*]] = OpTypePointer Generic %[[TyPtrFun]]
 
-; CHECK: %[[I9]] = OpFunction
-; CHECK: %[[I29]] = OpFunction
-; CHECK: %[[I49]] = OpFunction
-; CHECK: %[[I89]] = OpFunction
+; CHECK-DAG: %[[I9]] = OpFunction
+; CHECK-DAG: %[[I29]] = OpFunction
+; CHECK-DAG: %[[I49]] = OpFunction
+; CHECK-DAG: %[[I89]] = OpFunction
+
+; CHECK: %[[Foo]] = OpFunction
+; CHECK-4: OpFunctionParameter
 
 ; CHECK: %[[Arg1:.*]] = OpPhi %[[TyPtrStruct2]]
 ; CHECK: %[[VTbl:.*]] = OpBitcast %[[TyPtrPtrFun]] %[[#]]
 ; CHECK: %[[FP:.*]] = OpLoad %[[TyPtrFun]] %[[VTbl]]
 ; CHECK: %[[#]] = OpFunctionPointerCallINTEL %[[TyVoid]] %[[FP]] %[[Arg1]] %[[#]]
 
+; CHECK-NO: OpFunction
+
 %"cls::id" = type { %"cls::detail::array" }
 %"cls::detail::array" = type { [1 x i64] }
 %struct.obj_storage_t = type { %"struct.aligned_storage<BaseIncrement, IncrementBy2, IncrementBy4, IncrementBy8>::type" }
diff --git a/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_function_pointers/fp_const.ll b/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_function_pointers/fp_const.ll
index 75ad382..b96da63 100644
--- a/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_function_pointers/fp_const.ll
+++ b/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_function_pointers/fp_const.ll
@@ -1,9 +1,6 @@
-; RUN: llc -O0 -mtriple=spirv32-unknown-unknown --spirv-ext=+SPV_INTEL_function_pointers %s -o - | FileCheck %s
+; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv32-unknown-unknown --spirv-ext=+SPV_INTEL_function_pointers %s -o - | FileCheck %s
 ; TODO: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %}
 
-; TODO: This test currently fails with LLVM_ENABLE_EXPENSIVE_CHECKS enabled
-; XFAIL: expensive_checks
-
 ; CHECK-DAG: OpCapability FunctionPointersINTEL
 ; CHECK-DAG: OpCapability Int64
 ; CHECK: OpExtension "SPV_INTEL_function_pointers"
diff --git a/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_function_pointers/fun-ptr-addrcast.ll b/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_function_pointers/fun-ptr-addrcast.ll
index d38de21..8edecc1 100644
--- a/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_function_pointers/fun-ptr-addrcast.ll
+++ b/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_function_pointers/fun-ptr-addrcast.ll
@@ -2,15 +2,9 @@
 ; pointers/PtrCast-null-in-OpSpecConstantOp.ll (that is OpSpecConstantOp with ptr-cast operation) correctly
 ; work also for function pointers.
 
-; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - --spirv-ext=+SPV_INTEL_function_pointers | FileCheck %s
+; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv32-unknown-unknown %s -o - --spirv-ext=+SPV_INTEL_function_pointers | FileCheck %s
 ; TODO: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %}
 
-; TODO: This test currently fails with LLVM_ENABLE_EXPENSIVE_CHECKS enabled
-; XFAIL: expensive_checks
-
-; Running with -verify-machineinstrs would lead to "Reading virtual register without a def"
-; error, because OpConstantFunctionPointerINTEL forward-refers to a function definition.
-
 ; CHECK-COUNT-3: %[[#]] = OpSpecConstantOp %[[#]] 121 %[[#]]
 ; CHECK-COUNT-3: OpPtrCastToGeneric
 
diff --git a/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_inline_assembly/inline_asm.ll b/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_inline_assembly/inline_asm.ll
index e006651..91286d5 100644
--- a/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_inline_assembly/inline_asm.ll
+++ b/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_inline_assembly/inline_asm.ll
@@ -31,20 +31,20 @@
 ; CHECK-DAG: %[[#Const123:]] = OpConstant %[[#Int32Ty]] 123
 ; CHECK-DAG: %[[#Const42:]] = OpConstant %[[#DoubleTy:]] 42
 
-; CHECK: %[[#Dialect:]] = OpAsmTargetINTEL "spirv64-unknown-unknown"
+; CHECK-DAG: %[[#Dialect:]] = OpAsmTargetINTEL "spirv64-unknown-unknown"
 ; CHECK-NO: OpAsmTargetINTEL
 
-; CHECK: %[[#Asm1:]] = OpAsmINTEL %[[#VoidTy]] %[[#Fun1Ty]] %[[#Dialect]] "" ""
-; CHECK: %[[#Asm2:]] = OpAsmINTEL %[[#VoidTy]] %[[#Fun1Ty]] %[[#Dialect]] "nop" ""
-; CHECK: %[[#Asm3:]] = OpAsmINTEL %[[#VoidTy]] %[[#Fun1Ty]] %[[#Dialect]] "" "~{cc},~{memory}"
-; CHECK: %[[#Asm4:]] = OpAsmINTEL %[[#Int32Ty]] %[[#Fun2Ty:]] %[[#Dialect]] "clobber_out $0" "=&r"
-; CHECK: %[[#Asm5:]] = OpAsmINTEL %[[#Int32Ty]] %[[#Fun3Ty]] %[[#Dialect]] "icmd $0 $1" "=r,r"
-; CHECK: %[[#Asm6:]] = OpAsmINTEL %[[#FloatTy]] %[[#Fun4Ty]] %[[#Dialect]] "fcmd $0 $1" "=r,r"
-; CHECK: %[[#Asm7:]] = OpAsmINTEL %[[#HalfTy]] %[[#Fun5Ty]] %[[#Dialect]] "fcmdext $0 $1 $2" "=r,r,r"
-; CHECK: %[[#Asm8:]] = OpAsmINTEL %[[#Int8Ty]] %[[#Fun6Ty]] %[[#Dialect]] "cmdext $0 $3 $1 $2" "=r,r,r,r"
-; CHECK: %[[#Asm9:]] = OpAsmINTEL %[[#Int64Ty]] %[[#Fun7Ty]] %[[#Dialect]] "icmdext $0 $3 $1 $2" "=r,r,r,r"
-; CHECK: %[[#Asm10:]] = OpAsmINTEL %[[#VoidTy]] %[[#Fun8Ty]] %[[#Dialect]] "constcmd $0 $1" "r,r"
-; CHECK: %[[#Asm11:]] = OpAsmINTEL %[[#VoidTy]] %[[#Fun8Ty]] %[[#Dialect]] "constcmd $0 $1" "i,i"
+; CHECK-DAG: %[[#Asm1:]] = OpAsmINTEL %[[#VoidTy]] %[[#Fun1Ty]] %[[#Dialect]] "" ""
+; CHECK-DAG: %[[#Asm2:]] = OpAsmINTEL %[[#VoidTy]] %[[#Fun1Ty]] %[[#Dialect]] "nop" ""
+; CHECK-DAG: %[[#Asm3:]] = OpAsmINTEL %[[#VoidTy]] %[[#Fun1Ty]] %[[#Dialect]] "" "~{cc},~{memory}"
+; CHECK-DAG: %[[#Asm4:]] = OpAsmINTEL %[[#Int32Ty]] %[[#Fun2Ty:]] %[[#Dialect]] "clobber_out $0" "=&r"
+; CHECK-DAG: %[[#Asm5:]] = OpAsmINTEL %[[#Int32Ty]] %[[#Fun3Ty]] %[[#Dialect]] "icmd $0 $1" "=r,r"
+; CHECK-DAG: %[[#Asm6:]] = OpAsmINTEL %[[#FloatTy]] %[[#Fun4Ty]] %[[#Dialect]] "fcmd $0 $1" "=r,r"
+; CHECK-DAG: %[[#Asm7:]] = OpAsmINTEL %[[#HalfTy]] %[[#Fun5Ty]] %[[#Dialect]] "fcmdext $0 $1 $2" "=r,r,r"
+; CHECK-DAG: %[[#Asm8:]] = OpAsmINTEL %[[#Int8Ty]] %[[#Fun6Ty]] %[[#Dialect]] "cmdext $0 $3 $1 $2" "=r,r,r,r"
+; CHECK-DAG: %[[#Asm9:]] = OpAsmINTEL %[[#Int64Ty]] %[[#Fun7Ty]] %[[#Dialect]] "icmdext $0 $3 $1 $2" "=r,r,r,r"
+; CHECK-DAG: %[[#Asm10:]] = OpAsmINTEL %[[#VoidTy]] %[[#Fun8Ty]] %[[#Dialect]] "constcmd $0 $1" "r,r"
+; CHECK-DAG: %[[#Asm11:]] = OpAsmINTEL %[[#VoidTy]] %[[#Fun8Ty]] %[[#Dialect]] "constcmd $0 $1" "i,i"
 ; CHECK-NO: OpAsmINTEL
 
 ; CHECK: OpFunction
diff --git a/llvm/test/CodeGen/SPIRV/extensions/SPV_KHR_shader_clock/shader_clock.ll b/llvm/test/CodeGen/SPIRV/extensions/SPV_KHR_shader_clock/shader_clock.ll
index 8ecd0a2..bd07ba1 100644
--- a/llvm/test/CodeGen/SPIRV/extensions/SPV_KHR_shader_clock/shader_clock.ll
+++ b/llvm/test/CodeGen/SPIRV/extensions/SPV_KHR_shader_clock/shader_clock.ll
@@ -6,9 +6,11 @@
 
 ; CHECK: OpCapability ShaderClockKHR
 ; CHECK: OpExtension "SPV_KHR_shader_clock"
-; CHECK-DAG: [[uint:%[a-z0-9_]+]] = OpTypeInt 32
+; CHECK-DAG: [[uint:%[a-z0-9_]+]] = OpTypeInt 32 0
 ; CHECK-DAG: [[ulong:%[a-z0-9_]+]] = OpTypeInt 64
 ; CHECK-DAG: [[v2uint:%[a-z0-9_]+]] = OpTypeVector [[uint]] 2
+; CHECK-DAG: OpConstant [[uint]] 8
+; CHECK-DAG: OpConstant [[uint]] 16
 ; CHECK-DAG: [[uint_1:%[a-z0-9_]+]] = OpConstant [[uint]] 1
 ; CHECK-DAG: [[uint_2:%[a-z0-9_]+]] = OpConstant [[uint]] 2
 ; CHECK-DAG: [[uint_3:%[a-z0-9_]+]] = OpConstant [[uint]] 3
diff --git a/llvm/test/CodeGen/SPIRV/global-var-name-linkage.ll b/llvm/test/CodeGen/SPIRV/global-var-name-linkage.ll
new file mode 100644
index 0000000..4501819
--- /dev/null
+++ b/llvm/test/CodeGen/SPIRV/global-var-name-linkage.ll
@@ -0,0 +1,59 @@
+; Check names and decoration of global variables.
+
+; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %}
+
+; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown %s -o - -filetype=obj | spirv-val %}
+
+; CHECK-DAG: OpName %[[#id18:]] "G1"
+; CHECK-DAG: OpName %[[#id22:]] "g1"
+; CHECK-DAG: OpName %[[#id23:]] "g2"
+; CHECK-DAG: OpName %[[#id27:]] "g4"
+; CHECK-DAG: OpName %[[#id30:]] "c1"
+; CHECK-DAG: OpName %[[#id31:]] "n_t"
+; CHECK-DAG: OpName %[[#id32:]] "w"
+; CHECK-DAG: OpName %[[#id34:]] "a.b"
+; CHECK-DAG: OpName %[[#id35:]] "e"
+; CHECK-DAG: OpName %[[#id36:]] "y.z"
+; CHECK-DAG: OpName %[[#id38:]] "x"
+
+; CHECK-NOT: OpDecorate %[[#id18]] LinkageAttributes
+; CHECK-DAG: OpDecorate %[[#id18]] Constant
+; CHECK-DAG: OpDecorate %[[#id22]] Alignment 4
+; CHECK-DAG: OpDecorate %[[#id22]] LinkageAttributes "g1" Export
+; CHECK-DAG: OpDecorate %[[#id23]] Alignment 4
+; CHECK-DAG: OpDecorate %[[#id27]] Alignment 4
+; CHECK-DAG: OpDecorate %[[#id27]] LinkageAttributes "g4" Export
+; CHECK-DAG: OpDecorate %[[#id30]] Constant
+; CHECK-DAG: OpDecorate %[[#id30]] Alignment 4
+; CHECK-DAG: OpDecorate %[[#id30]] LinkageAttributes "c1" Export
+; CHECK-DAG: OpDecorate %[[#id31]] Constant
+; CHECK-DAG: OpDecorate %[[#id31]] LinkageAttributes "n_t" Import
+; CHECK-DAG: OpDecorate %[[#id32]] Constant
+; CHECK-DAG: OpDecorate %[[#id32]] Alignment 4
+; CHECK-DAG: OpDecorate %[[#id32]] LinkageAttributes "w" Export
+; CHECK-DAG: OpDecorate %[[#id34]] Constant
+; CHECK-DAG: OpDecorate %[[#id34]] Alignment 4
+; CHECK-DAG: OpDecorate %[[#id35]] LinkageAttributes "e" Import
+; CHECK-DAG: OpDecorate %[[#id36]] Alignment 4
+; CHECK-DAG: OpDecorate %[[#id38]] Constant
+; CHECK-DAG: OpDecorate %[[#id38]] Alignment 4
+
+%"class.sycl::_V1::nd_item" = type { i8 }
+
+@G1 = private unnamed_addr addrspace(1) constant %"class.sycl::_V1::nd_item" poison, align 1
+@g1 = addrspace(1) global i32 1, align 4
+@g2 = internal addrspace(1) global i32 2, align 4
+@g4 = common addrspace(1) global i32 0, align 4
+@c1 = addrspace(2) constant [2 x i32] [i32 0, i32 1], align 4
+@n_t = external addrspace(2) constant [256 x i32]
+@w = addrspace(1) constant i32 0, align 4
+@a.b = internal addrspace(2) constant [2 x i32] [i32 2, i32 3], align 4
+@e = external addrspace(1) global i32
+@y.z = internal addrspace(1) global i32 0, align 4
+@x = internal addrspace(2) constant float 1.000000e+00, align 4
+
+define internal spir_func void @foo(ptr addrspace(4) align 1 %arg) {
+  ret void
+}
diff --git a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/SV_GroupID.ll b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/SV_GroupID.ll
new file mode 100644
index 0000000..92947f7
--- /dev/null
+++ b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/SV_GroupID.ll
@@ -0,0 +1,52 @@
+; RUN: llc -O0 -verify-machineinstrs -mtriple=spirv-vulkan-unknown %s -o - | FileCheck %s
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv-vulkan-unknown %s -o - -filetype=obj | spirv-val %}
+
+; CHECK-DAG:        %[[#int:]] = OpTypeInt 32 0
+; CHECK-DAG:        %[[#v3int:]] = OpTypeVector %[[#int]] 3
+; CHECK-DAG:        %[[#ptr_Input_v3int:]] = OpTypePointer Input %[[#v3int]]
+; CHECK-DAG:        %[[#tempvar:]] = OpUndef %[[#v3int]]
+; CHECK-DAG:        %[[#WorkgroupId:]] = OpVariable %[[#ptr_Input_v3int]] Input
+
+; CHECK-DAG:        OpEntryPoint GLCompute {{.*}} %[[#WorkgroupId]]
+; CHECK-DAG:        OpName %[[#WorkgroupId]] "__spirv_BuiltInWorkgroupId"
+; CHECK-DAG:        OpDecorate %[[#WorkgroupId]] LinkageAttributes "__spirv_BuiltInWorkgroupId" Import
+; CHECK-DAG:        OpDecorate %[[#WorkgroupId]] BuiltIn WorkgroupId
+
+target triple = "spirv-unknown-vulkan-library"
+
+declare void @group_id_user(<3 x i32>)
+
+; Function Attrs: convergent noinline norecurse
+define void @main() #1 {
+entry:
+
+; CHECK:        %[[#load:]] = OpLoad %[[#v3int]] %[[#WorkgroupId]]
+; CHECK:        %[[#load0:]] = OpCompositeExtract %[[#int]] %[[#load]] 0
+  %1 = call i32 @llvm.spv.group.id(i32 0)
+
+; CHECK:        %[[#tempvar:]] = OpCompositeInsert %[[#v3int]] %[[#load0]] %[[#tempvar]]
+  %2 = insertelement <3 x i32> poison, i32 %1, i64 0
+
+; CHECK:        %[[#load:]] = OpLoad %[[#v3int]] %[[#WorkgroupId]]
+; CHECK:        %[[#load1:]] = OpCompositeExtract %[[#int]] %[[#load]] 1
+  %3 = call i32 @llvm.spv.group.id(i32 1)
+
+; CHECK:        %[[#tempvar:]] = OpCompositeInsert %[[#v3int]] %[[#load1]] %[[#tempvar]] 1
+  %4 = insertelement <3 x i32> %2, i32 %3, i64 1
+
+; CHECK:        %[[#load:]] = OpLoad %[[#v3int]] %[[#WorkgroupId]]
+; CHECK:        %[[#load2:]] = OpCompositeExtract %[[#int]] %[[#load]] 2
+  %5 = call i32 @llvm.spv.group.id(i32 2)
+
+; CHECK:        %[[#tempvar:]] = OpCompositeInsert %[[#v3int]] %[[#load2]] %[[#tempvar]] 2
+  %6 = insertelement <3 x i32> %4, i32 %5, i64 2
+
+  call spir_func void @group_id_user(<3 x i32> %6)
+  ret void
+}
+
+; Function Attrs: nounwind willreturn memory(none)
+declare i32 @llvm.spv.group.id(i32) #3
+
+attributes #1 = { convergent noinline norecurse "hlsl.numthreads"="1,1,1" "hlsl.shader"="compute" "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
+attributes #3 = { nounwind willreturn memory(none) }
diff --git a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/cross.ll b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/cross.ll
index 2e0eb8c..b1625c0 100644
--- a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/cross.ll
+++ b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/cross.ll
@@ -15,7 +15,7 @@ entry:
   ; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#vec3_float_16]]
   ; CHECK: %[[#arg1:]] = OpFunctionParameter %[[#vec3_float_16]]
   ; CHECK: %[[#]] = OpExtInst %[[#vec3_float_16]] %[[#op_ext_glsl]] Cross %[[#arg0]] %[[#arg1]]
-  %hlsl.cross = call <3 x half> @llvm.spv.cross.v4f16(<3 x half> %a, <3 x half> %b)
+  %hlsl.cross = call <3 x half> @llvm.spv.cross.v3f16(<3 x half> %a, <3 x half> %b)
   ret <3 x half> %hlsl.cross
 }
 
@@ -25,9 +25,9 @@ entry:
   ; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#vec3_float_32]]
   ; CHECK: %[[#arg1:]] = OpFunctionParameter %[[#vec3_float_32]]
   ; CHECK: %[[#]] = OpExtInst %[[#vec3_float_32]] %[[#op_ext_glsl]] Cross %[[#arg0]] %[[#arg1]]
-  %hlsl.cross = call <3 x float> @llvm.spv.cross.v4f32(<3 x float> %a, <3 x float> %b)
+  %hlsl.cross = call <3 x float> @llvm.spv.cross.v3f32(<3 x float> %a, <3 x float> %b)
   ret <3 x float> %hlsl.cross
 }
 
-declare <3 x half> @llvm.spv.cross.v4f16(<3 x half>, <3 x half>)
-declare <3 x float> @llvm.spv.cross.v4f32(<3 x float>, <3 x float>)
+declare <3 x half> @llvm.spv.cross.v3f16(<3 x half>, <3 x half>)
+declare <3 x float> @llvm.spv.cross.v3f32(<3 x float>, <3 x float>)
diff --git a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/distance.ll b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/distance.ll
new file mode 100644
index 0000000..85a24a01
--- /dev/null
+++ b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/distance.ll
@@ -0,0 +1,33 @@
+; RUN: llc -O0 -mtriple=spirv-unknown-unknown %s -o - | FileCheck %s
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv-unknown-unknown %s -o - -filetype=obj | spirv-val %}
+
+; Make sure SPIRV operation function calls for distance are lowered correctly.
+
+; CHECK-DAG: %[[#op_ext_glsl:]] = OpExtInstImport "GLSL.std.450"
+; CHECK-DAG: %[[#float_16:]] = OpTypeFloat 16
+; CHECK-DAG: %[[#vec4_float_16:]] = OpTypeVector %[[#float_16]] 4
+; CHECK-DAG: %[[#float_32:]] = OpTypeFloat 32
+; CHECK-DAG: %[[#vec4_float_32:]] = OpTypeVector %[[#float_32]] 4
+
+define noundef half @distance_half4(<4 x half> noundef %a, <4 x half> noundef %b) {
+entry:
+  ; CHECK: %[[#]] = OpFunction %[[#float_16]] None %[[#]]
+  ; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#vec4_float_16]]
+  ; CHECK: %[[#arg1:]] = OpFunctionParameter %[[#vec4_float_16]]
+  ; CHECK: %[[#]] = OpExtInst %[[#float_16]] %[[#op_ext_glsl]] Distance %[[#arg0]] %[[#arg1]]
+  %spv.distance = call half @llvm.spv.distance.f16(<4 x half> %a, <4 x half> %b)
+  ret half %spv.distance
+}
+
+define noundef float @distance_float4(<4 x float> noundef %a, <4 x float> noundef %b) {
+entry:
+  ; CHECK: %[[#]] = OpFunction %[[#float_32]] None %[[#]]
+  ; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#vec4_float_32]]
+  ; CHECK: %[[#arg1:]] = OpFunctionParameter %[[#vec4_float_32]]
+  ; CHECK: %[[#]] = OpExtInst %[[#float_32]] %[[#op_ext_glsl]] Distance %[[#arg0]] %[[#arg1]]
+  %spv.distance = call float @llvm.spv.distance.f32(<4 x float> %a, <4 x float> %b)
+  ret float %spv.distance
+}
+
+declare half @llvm.spv.distance.f16(<4 x half>, <4 x half>)
+declare float @llvm.spv.distance.f32(<4 x float>, <4 x float>)
diff --git a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/length.ll b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/length.ll
index b4a9d8e0..1ac862b 100644
--- a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/length.ll
+++ b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/length.ll
@@ -11,19 +11,21 @@
 
 define noundef half @length_half4(<4 x half> noundef %a) {
 entry:
-  ; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#]]
+  ; CHECK: %[[#]] = OpFunction %[[#float_16]] None %[[#]]
+  ; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#vec4_float_16]]
   ; CHECK: %[[#]] = OpExtInst %[[#float_16]] %[[#op_ext_glsl]] Length %[[#arg0]]
-  %hlsl.length = call half @llvm.spv.length.v4f16(<4 x half> %a)
+  %hlsl.length = call half @llvm.spv.length.f16(<4 x half> %a)
   ret half %hlsl.length
 }
 
 define noundef float @length_float4(<4 x float> noundef %a) {
 entry:
-  ; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#]]
+  ; CHECK: %[[#]] = OpFunction %[[#float_32]] None %[[#]]
+  ; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#vec4_float_32]]
   ; CHECK: %[[#]] = OpExtInst %[[#float_32]] %[[#op_ext_glsl]] Length %[[#arg0]]
-  %hlsl.length = call float @llvm.spv.length.v4f32(<4 x float> %a)
+  %hlsl.length = call float @llvm.spv.length.f32(<4 x float> %a)
   ret float %hlsl.length
 }
 
-declare half @llvm.spv.length.v4f16(<4 x half>)
-declare float @llvm.spv.length.v4f32(<4 x float>)
+declare half @llvm.spv.length.f16(<4 x half>)
+declare float @llvm.spv.length.f32(<4 x float>)
diff --git a/llvm/test/CodeGen/SPIRV/iaddcarry-builtin.ll b/llvm/test/CodeGen/SPIRV/iaddcarry-builtin.ll
index 8f14eba2..49aaa45 100644
--- a/llvm/test/CodeGen/SPIRV/iaddcarry-builtin.ll
+++ b/llvm/test/CodeGen/SPIRV/iaddcarry-builtin.ll
@@ -25,9 +25,7 @@
 ; CHECK-SPIRV-DAG:                    [[v4uint:%[a-z0-9_]+]] = OpTypeVector [[uint]] 4
 ; CHECK-SPIRV-DAG:                 [[vecstruct:%[a-z0-9_]+]] = OpTypeStruct [[v4uint]] [[v4uint]]
 ; CHECK-SPIRV-DAG:   [[_ptr_Function_vecstruct:%[a-z0-9_]+]] = OpTypePointer Function [[vecstruct]]
-; CHECK-SPIRV-DAG:               [[struct_anon:%[a-z0-9_.]+]] = OpTypeStruct [[uint]] [[uint]]
-; CHECK-SPIRV-DAG: [[_ptr_Function_struct_anon:%[a-z0-9_]+]] = OpTypePointer Function [[struct_anon]]
-; CHECK-SPIRV-DAG:  [[_ptr_Generic_struct_anon:%[a-z0-9_]+]] = OpTypePointer Generic [[struct_anon]]
+; CHECK-SPIRV-DAG:    [[_ptr_Generic_i32struct:%[a-z0-9_]+]] = OpTypePointer Generic [[i32struct]]
 
 define spir_func void @test_builtin_iaddcarrycc(i8 %a, i8 %b) {
   entry:
@@ -116,9 +114,9 @@ define spir_func void @test_builtin_iaddcarry_anon(i32 %a, i32 %b) {
 ; CHECK-SPIRV:        [[a_4:%[a-z0-9_]+]] = OpFunctionParameter [[uint]]
 ; CHECK-SPIRV:        [[b_4:%[a-z0-9_]+]] = OpFunctionParameter [[uint]]
 ; CHECK-SPIRV:    [[entry_4:%[a-z0-9_]+]] = OpLabel
-; CHECK-SPIRV:     [[var_59:%[a-z0-9_]+]] = OpVariable [[_ptr_Function_struct_anon]] Function
-; CHECK-SPIRV:     [[var_61:%[a-z0-9_]+]] = OpPtrCastToGeneric [[_ptr_Generic_struct_anon]] [[var_59]]
-; CHECK-SPIRV:     [[var_62:%[a-z0-9_]+]] = OpIAddCarry [[struct_anon]] [[a_4]] [[b_4]]
+; CHECK-SPIRV:     [[var_59:%[a-z0-9_]+]] = OpVariable [[_ptr_Function_i32struct]] Function
+; CHECK-SPIRV:     [[var_61:%[a-z0-9_]+]] = OpPtrCastToGeneric [[_ptr_Generic_i32struct]] [[var_59]]
+; CHECK-SPIRV:     [[var_62:%[a-z0-9_]+]] = OpIAddCarry [[i32struct]] [[a_4]] [[b_4]]
 ; CHECK-SPIRV:                              OpStore [[var_61]] [[var_62]]
 
 declare void @_Z17__spirv_IAddCarryIiiE4anonIT_T0_ES1_S2_(ptr addrspace(4) sret(%struct.anon) align 4, i32, i32)
diff --git a/llvm/test/CodeGen/SPIRV/image-unoptimized.ll b/llvm/test/CodeGen/SPIRV/image-unoptimized.ll
index 0ce9c73..d7d5b1d 100644
--- a/llvm/test/CodeGen/SPIRV/image-unoptimized.ll
+++ b/llvm/test/CodeGen/SPIRV/image-unoptimized.ll
@@ -1,7 +1,7 @@
 ; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s
 
-; CHECK:     %[[#TypeImage:]] = OpTypeImage
-; CHECK:     %[[#TypeSampler:]] = OpTypeSampler
+; CHECK-DAG: %[[#TypeImage:]] = OpTypeImage
+; CHECK-DAG: %[[#TypeSampler:]] = OpTypeSampler
 ; CHECK-DAG: %[[#TypeImagePtr:]] = OpTypePointer {{.*}} %[[#TypeImage]]
 ; CHECK-DAG: %[[#TypeSamplerPtr:]] = OpTypePointer {{.*}} %[[#TypeSampler]]
 
diff --git a/llvm/test/CodeGen/SPIRV/isubborrow-builtin.ll b/llvm/test/CodeGen/SPIRV/isubborrow-builtin.ll
index 08b4d2a..ca842d2f 100644
--- a/llvm/test/CodeGen/SPIRV/isubborrow-builtin.ll
+++ b/llvm/test/CodeGen/SPIRV/isubborrow-builtin.ll
@@ -23,9 +23,7 @@
 ; CHECK-SPIRV-DAG:                    [[v4uint:%[a-z0-9_]+]] = OpTypeVector [[uint]] 4
 ; CHECK-SPIRV-DAG:                 [[vecstruct:%[a-z0-9_]+]] = OpTypeStruct [[v4uint]] [[v4uint]]
 ; CHECK-SPIRV-DAG:   [[_ptr_Function_vecstruct:%[a-z0-9_]+]] = OpTypePointer Function [[vecstruct]]
-; CHECK-SPIRV-DAG:               [[struct_anon:%[a-z0-9_.]+]] = OpTypeStruct [[uint]] [[uint]]
-; CHECK-SPIRV-DAG: [[_ptr_Function_struct_anon:%[a-z0-9_]+]] = OpTypePointer Function [[struct_anon]]
-; CHECK-SPIRV-DAG:  [[_ptr_Generic_struct_anon:%[a-z0-9_]+]] = OpTypePointer Generic [[struct_anon]]
+; CHECK-SPIRV-DAG:    [[_ptr_Generic_i32struct:%[a-z0-9_]+]] = OpTypePointer Generic [[i32struct]]
 
 define spir_func void @test_builtin_isubborrowcc(i8 %a, i8 %b) {
   entry:
@@ -114,9 +112,9 @@ define spir_func void @test_builtin_isubborrow_anon(i32 %a, i32 %b) {
 ; CHECK-SPIRV:        [[a_4:%[a-z0-9_]+]] = OpFunctionParameter [[uint]]
 ; CHECK-SPIRV:        [[b_4:%[a-z0-9_]+]] = OpFunctionParameter [[uint]]
 ; CHECK-SPIRV:    [[entry_4:%[a-z0-9_]+]] = OpLabel
-; CHECK-SPIRV:     [[var_59:%[a-z0-9_]+]] = OpVariable [[_ptr_Function_struct_anon]] Function
-; CHECK-SPIRV:     [[var_61:%[a-z0-9_]+]] = OpPtrCastToGeneric [[_ptr_Generic_struct_anon]] [[var_59]]
-; CHECK-SPIRV:     [[var_62:%[a-z0-9_]+]] = OpISubBorrow [[struct_anon]] [[a_4]] [[b_4]]
+; CHECK-SPIRV:     [[var_59:%[a-z0-9_]+]] = OpVariable [[_ptr_Function_i32struct]] Function
+; CHECK-SPIRV:     [[var_61:%[a-z0-9_]+]] = OpPtrCastToGeneric [[_ptr_Generic_i32struct]] [[var_59]]
+; CHECK-SPIRV:     [[var_62:%[a-z0-9_]+]] = OpISubBorrow [[i32struct]] [[a_4]] [[b_4]]
 ; CHECK-SPIRV:                              OpStore [[var_61]] [[var_62]]
 
 declare void @_Z18__spirv_ISubBorrowIiiE4anonIT_T0_ES1_S2_(ptr addrspace(4) sret(%struct.anon) align 4, i32, i32)
diff --git a/llvm/test/CodeGen/SPIRV/keep-tracked-const.ll b/llvm/test/CodeGen/SPIRV/keep-tracked-const.ll
index 0dc8623..61d06fe 100644
--- a/llvm/test/CodeGen/SPIRV/keep-tracked-const.ll
+++ b/llvm/test/CodeGen/SPIRV/keep-tracked-const.ll
@@ -3,9 +3,9 @@
 ; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV
 ; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %}
 
-; CHECK-SPIRV: %[[#Int:]] = OpTypeInt 32 0
-; CHECK-SPIRV: %[[#C0:]] = OpConstant %[[#Int]] 0
-; CHECK-SPIRV: %[[#C1:]] = OpConstant %[[#Int]] 1
+; CHECK-SPIRV-DAG: %[[#Int:]] = OpTypeInt 32 0
+; CHECK-SPIRV-DAG: %[[#C0:]] = OpConstant %[[#Int]] 0
+; CHECK-SPIRV-DAG: %[[#C1:]] = OpConstant %[[#Int]] 1
 ; CHECK-SPIRV: OpSelect %[[#Int]] %[[#]] %[[#C1]] %[[#C0]]
 
 
diff --git a/llvm/test/CodeGen/SPIRV/llvm-intrinsics/fshl.ll b/llvm/test/CodeGen/SPIRV/llvm-intrinsics/fshl.ll
index 2d5b309..25b5304 100644
--- a/llvm/test/CodeGen/SPIRV/llvm-intrinsics/fshl.ll
+++ b/llvm/test/CodeGen/SPIRV/llvm-intrinsics/fshl.ll
@@ -1,21 +1,21 @@
 ; RUN: llc -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV
 
-; CHECK-SPIRV:     OpName %[[#NAME_FSHL_FUNC_32:]] "spirv.llvm_fshl_i32"
-; CHECK-SPIRV:     OpName %[[#NAME_FSHL_FUNC_16:]] "spirv.llvm_fshl_i16"
-; CHECK-SPIRV:     OpName %[[#NAME_FSHL_FUNC_VEC_INT_16:]] "spirv.llvm_fshl_v2i16"
-; CHECK-SPIRV:     %[[#TYPE_INT_32:]] = OpTypeInt 32 0
-; CHECK-SPIRV:     %[[#TYPE_ORIG_FUNC_32:]] = OpTypeFunction %[[#TYPE_INT_32]] %[[#TYPE_INT_32]] %[[#TYPE_INT_32]]
-; CHECK-SPIRV:     %[[#TYPE_INT_16:]] = OpTypeInt 16 0
-; CHECK-SPIRV:     %[[#TYPE_ORIG_FUNC_16:]] = OpTypeFunction %[[#TYPE_INT_16]] %[[#TYPE_INT_16]] %[[#TYPE_INT_16]]
-; CHECK-SPIRV:     %[[#TYPE_VEC_INT_16:]] = OpTypeVector %[[#TYPE_INT_16]] 2
-; CHECK-SPIRV:     %[[#TYPE_ORIG_FUNC_VEC_INT_16:]] = OpTypeFunction %[[#TYPE_VEC_INT_16]] %[[#TYPE_VEC_INT_16]] %[[#TYPE_VEC_INT_16]]
-; CHECK-SPIRV:     %[[#TYPE_FSHL_FUNC_32:]] = OpTypeFunction %[[#TYPE_INT_32]] %[[#TYPE_INT_32]] %[[#TYPE_INT_32]] %[[#TYPE_INT_32]]
-; CHECK-SPIRV:     %[[#TYPE_FSHL_FUNC_16:]] = OpTypeFunction %[[#TYPE_INT_16]] %[[#TYPE_INT_16]] %[[#TYPE_INT_16]] %[[#TYPE_INT_16]]
-; CHECK-SPIRV:     %[[#TYPE_FSHL_FUNC_VEC_INT_16:]] = OpTypeFunction %[[#TYPE_VEC_INT_16]] %[[#TYPE_VEC_INT_16]] %[[#TYPE_VEC_INT_16]] %[[#TYPE_VEC_INT_16]]
-; CHECK-SPIRV-DAG: %[[#CONST_ROTATE_32:]] = OpConstant %[[#TYPE_INT_32]] 8
-; CHECK-SPIRV-DAG: %[[#CONST_ROTATE_16:]] = OpConstant %[[#TYPE_INT_16]] 8
-; CHECK-SPIRV:     %[[#CONST_ROTATE_VEC_INT_16:]] = OpConstantComposite %[[#TYPE_VEC_INT_16]] %[[#CONST_ROTATE_16]] %[[#CONST_ROTATE_16]]
-; CHECK-SPIRV-DAG: %[[#CONST_TYPE_SIZE_32:]] = OpConstant %[[#TYPE_INT_32]] 32
+; CHECK-SPIRV-DAG:     OpName %[[#NAME_FSHL_FUNC_32:]] "spirv.llvm_fshl_i32"
+; CHECK-SPIRV-DAG:     OpName %[[#NAME_FSHL_FUNC_16:]] "spirv.llvm_fshl_i16"
+; CHECK-SPIRV-DAG:     OpName %[[#NAME_FSHL_FUNC_VEC_INT_16:]] "spirv.llvm_fshl_v2i16"
+; CHECK-SPIRV-DAG:     %[[#TYPE_INT_32:]] = OpTypeInt 32 0
+; CHECK-SPIRV-DAG:     %[[#TYPE_ORIG_FUNC_32:]] = OpTypeFunction %[[#TYPE_INT_32]] %[[#TYPE_INT_32]] %[[#TYPE_INT_32]]
+; CHECK-SPIRV-DAG:     %[[#TYPE_INT_16:]] = OpTypeInt 16 0
+; CHECK-SPIRV-DAG:     %[[#TYPE_ORIG_FUNC_16:]] = OpTypeFunction %[[#TYPE_INT_16]] %[[#TYPE_INT_16]] %[[#TYPE_INT_16]]
+; CHECK-SPIRV-DAG:     %[[#TYPE_VEC_INT_16:]] = OpTypeVector %[[#TYPE_INT_16]] 2
+; CHECK-SPIRV-DAG:     %[[#TYPE_ORIG_FUNC_VEC_INT_16:]] = OpTypeFunction %[[#TYPE_VEC_INT_16]] %[[#TYPE_VEC_INT_16]] %[[#TYPE_VEC_INT_16]]
+; CHECK-SPIRV-DAG:     %[[#TYPE_FSHL_FUNC_32:]] = OpTypeFunction %[[#TYPE_INT_32]] %[[#TYPE_INT_32]] %[[#TYPE_INT_32]] %[[#TYPE_INT_32]]
+; CHECK-SPIRV-DAG:     %[[#TYPE_FSHL_FUNC_16:]] = OpTypeFunction %[[#TYPE_INT_16]] %[[#TYPE_INT_16]] %[[#TYPE_INT_16]] %[[#TYPE_INT_16]]
+; CHECK-SPIRV-DAG:     %[[#TYPE_FSHL_FUNC_VEC_INT_16:]] = OpTypeFunction %[[#TYPE_VEC_INT_16]] %[[#TYPE_VEC_INT_16]] %[[#TYPE_VEC_INT_16]] %[[#TYPE_VEC_INT_16]]
+; CHECK-SPIRV-DAG:     %[[#CONST_ROTATE_32:]] = OpConstant %[[#TYPE_INT_32]] 8
+; CHECK-SPIRV-DAG:     %[[#CONST_ROTATE_16:]] = OpConstant %[[#TYPE_INT_16]] 8
+; CHECK-SPIRV-DAG:     %[[#CONST_ROTATE_VEC_INT_16:]] = OpConstantComposite %[[#TYPE_VEC_INT_16]] %[[#CONST_ROTATE_16]] %[[#CONST_ROTATE_16]]
+; CHECK-SPIRV-DAG:     %[[#CONST_TYPE_SIZE_32:]] = OpConstant %[[#TYPE_INT_32]] 32
 
 ; CHECK-SPIRV: %[[#]] = OpFunction %[[#TYPE_INT_32]] {{.*}} %[[#TYPE_ORIG_FUNC_32]]
 ; CHECK-SPIRV: %[[#X:]] = OpFunctionParameter %[[#TYPE_INT_32]]
diff --git a/llvm/test/CodeGen/SPIRV/llvm-intrinsics/fshr.ll b/llvm/test/CodeGen/SPIRV/llvm-intrinsics/fshr.ll
index 4cf5ca5..55fb2d9 100644
--- a/llvm/test/CodeGen/SPIRV/llvm-intrinsics/fshr.ll
+++ b/llvm/test/CodeGen/SPIRV/llvm-intrinsics/fshr.ll
@@ -1,20 +1,20 @@
 ; RUN: llc -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV
 
-; CHECK-SPIRV:     OpName %[[#NAME_FSHR_FUNC_32:]] "spirv.llvm_fshr_i32"
-; CHECK-SPIRV:     OpName %[[#NAME_FSHR_FUNC_16:]] "spirv.llvm_fshr_i16"
-; CHECK-SPIRV:     OpName %[[#NAME_FSHR_FUNC_VEC_INT_16:]] "spirv.llvm_fshr_v2i16"
-; CHECK-SPIRV:     %[[#TYPE_INT_32:]] = OpTypeInt 32 0
-; CHECK-SPIRV:     %[[#TYPE_ORIG_FUNC_32:]] = OpTypeFunction %[[#TYPE_INT_32]] %[[#TYPE_INT_32]] %[[#TYPE_INT_32]]
-; CHECK-SPIRV:     %[[#TYPE_INT_16:]] = OpTypeInt 16 0
-; CHECK-SPIRV:     %[[#TYPE_ORIG_FUNC_16:]] = OpTypeFunction %[[#TYPE_INT_16]] %[[#TYPE_INT_16]] %[[#TYPE_INT_16]]
-; CHECK-SPIRV:     %[[#TYPE_VEC_INT_16:]] = OpTypeVector %[[#TYPE_INT_16]] 2
-; CHECK-SPIRV:     %[[#TYPE_ORIG_FUNC_VEC_INT_16:]] = OpTypeFunction %[[#TYPE_VEC_INT_16]] %[[#TYPE_VEC_INT_16]] %[[#TYPE_VEC_INT_16]]
-; CHECK-SPIRV:     %[[#TYPE_FSHR_FUNC_32:]] = OpTypeFunction %[[#TYPE_INT_32]] %[[#TYPE_INT_32]] %[[#TYPE_INT_32]] %[[#TYPE_INT_32]]
-; CHECK-SPIRV:     %[[#TYPE_FSHR_FUNC_16:]] = OpTypeFunction %[[#TYPE_INT_16]] %[[#TYPE_INT_16]] %[[#TYPE_INT_16]] %[[#TYPE_INT_16]]
-; CHECK-SPIRV:     %[[#TYPE_FSHR_FUNC_VEC_INT_16:]] = OpTypeFunction %[[#TYPE_VEC_INT_16]] %[[#TYPE_VEC_INT_16]] %[[#TYPE_VEC_INT_16]] %[[#TYPE_VEC_INT_16]]
+; CHECK-SPIRV-DAG: OpName %[[#NAME_FSHR_FUNC_32:]] "spirv.llvm_fshr_i32"
+; CHECK-SPIRV-DAG: OpName %[[#NAME_FSHR_FUNC_16:]] "spirv.llvm_fshr_i16"
+; CHECK-SPIRV-DAG: OpName %[[#NAME_FSHR_FUNC_VEC_INT_16:]] "spirv.llvm_fshr_v2i16"
+; CHECK-SPIRV-DAG: %[[#TYPE_INT_32:]] = OpTypeInt 32 0
+; CHECK-SPIRV-DAG: %[[#TYPE_ORIG_FUNC_32:]] = OpTypeFunction %[[#TYPE_INT_32]] %[[#TYPE_INT_32]] %[[#TYPE_INT_32]]
+; CHECK-SPIRV-DAG: %[[#TYPE_INT_16:]] = OpTypeInt 16 0
+; CHECK-SPIRV-DAG: %[[#TYPE_ORIG_FUNC_16:]] = OpTypeFunction %[[#TYPE_INT_16]] %[[#TYPE_INT_16]] %[[#TYPE_INT_16]]
+; CHECK-SPIRV-DAG: %[[#TYPE_VEC_INT_16:]] = OpTypeVector %[[#TYPE_INT_16]] 2
+; CHECK-SPIRV-DAG: %[[#TYPE_ORIG_FUNC_VEC_INT_16:]] = OpTypeFunction %[[#TYPE_VEC_INT_16]] %[[#TYPE_VEC_INT_16]] %[[#TYPE_VEC_INT_16]]
+; CHECK-SPIRV-DAG: %[[#TYPE_FSHR_FUNC_32:]] = OpTypeFunction %[[#TYPE_INT_32]] %[[#TYPE_INT_32]] %[[#TYPE_INT_32]] %[[#TYPE_INT_32]]
+; CHECK-SPIRV-DAG: %[[#TYPE_FSHR_FUNC_16:]] = OpTypeFunction %[[#TYPE_INT_16]] %[[#TYPE_INT_16]] %[[#TYPE_INT_16]] %[[#TYPE_INT_16]]
+; CHECK-SPIRV-DAG: %[[#TYPE_FSHR_FUNC_VEC_INT_16:]] = OpTypeFunction %[[#TYPE_VEC_INT_16]] %[[#TYPE_VEC_INT_16]] %[[#TYPE_VEC_INT_16]] %[[#TYPE_VEC_INT_16]]
 ; CHECK-SPIRV-DAG: %[[#CONST_ROTATE_32:]] = OpConstant %[[#TYPE_INT_32]] 8
 ; CHECK-SPIRV-DAG: %[[#CONST_ROTATE_16:]] = OpConstant %[[#TYPE_INT_16]] 8
-; CHECK-SPIRV:     %[[#CONST_ROTATE_VEC_INT_16:]] = OpConstantComposite %[[#TYPE_VEC_INT_16]] %[[#CONST_ROTATE_16]] %[[#CONST_ROTATE_16]]
+; CHECK-SPIRV-DAG: %[[#CONST_ROTATE_VEC_INT_16:]] = OpConstantComposite %[[#TYPE_VEC_INT_16]] %[[#CONST_ROTATE_16]] %[[#CONST_ROTATE_16]]
 ; CHECK-SPIRV-DAG: %[[#CONST_TYPE_SIZE_32:]] = OpConstant %[[#TYPE_INT_32]] 32
 
 ; CHECK-SPIRV: %[[#]] = OpFunction %[[#TYPE_INT_32]] {{.*}} %[[#TYPE_ORIG_FUNC_32]]
diff --git a/llvm/test/CodeGen/SPIRV/llvm-intrinsics/memset.ll b/llvm/test/CodeGen/SPIRV/llvm-intrinsics/memset.ll
index e7a9869..d5e70ae 100644
--- a/llvm/test/CodeGen/SPIRV/llvm-intrinsics/memset.ll
+++ b/llvm/test/CodeGen/SPIRV/llvm-intrinsics/memset.ll
@@ -12,17 +12,17 @@
 ; CHECK-DAG: %[[#Int8Ptr:]] = OpTypePointer Generic %[[#Int8]]
 
 ; CHECK-DAG: %[[#Const4:]] = OpConstant %[[#Int32]] 4
-; CHECK: %[[#Int8x4:]] = OpTypeArray %[[#Int8]] %[[#Const4]]
+; CHECK-DAG: %[[#Int8x4:]] = OpTypeArray %[[#Int8]] %[[#Const4]]
 
 ; CHECK-DAG: %[[#Const12:]] = OpConstant %[[#Int32]] 12
-; CHECK: %[[#Int8x12:]] = OpTypeArray %[[#Int8]] %[[#Const12]]
+; CHECK-DAG: %[[#Int8x12:]] = OpTypeArray %[[#Int8]] %[[#Const12]]
 
 ; CHECK-DAG: %[[#Const21:]] = OpConstant %[[#Int8]] 21
 ; CHECK-DAG: %[[#False:]] = OpConstantFalse %[[#]]
 ; CHECK-DAG: %[[#ConstComp:]] = OpConstantComposite %[[#Int8x4]] %[[#Const21]] %[[#Const21]] %[[#Const21]] %[[#Const21]]
 ; CHECK-DAG: %[[#ConstNull:]] = OpConstantNull %[[#Int8x12]]
-; CHECK: %[[#VarComp:]] = OpVariable %[[#]] UniformConstant %[[#ConstComp]]
-; CHECK: %[[#VarNull:]] = OpVariable %[[#]] UniformConstant %[[#ConstNull]]
+; CHECK-DAG: %[[#VarComp:]] = OpVariable %[[#]] UniformConstant %[[#ConstComp]]
+; CHECK-DAG: %[[#VarNull:]] = OpVariable %[[#]] UniformConstant %[[#ConstNull]]
 
 ; CHECK-DAG: %[[#Int8PtrConst:]] = OpTypePointer UniformConstant %[[#Int8]]
 ; CHECK: OpCopyMemorySized %[[#Target:]] %[[#Source:]] %[[#Const12]] Aligned 4
diff --git a/llvm/test/CodeGen/SPIRV/logical-access-chain.ll b/llvm/test/CodeGen/SPIRV/logical-access-chain.ll
index 39f6d33..d56678e 100644
--- a/llvm/test/CodeGen/SPIRV/logical-access-chain.ll
+++ b/llvm/test/CodeGen/SPIRV/logical-access-chain.ll
@@ -1,10 +1,10 @@
 ; RUN: llc -O0 -mtriple=spirv-unknown-unknown %s -o - | FileCheck %s
 
-; CHECK:      [[uint:%[0-9]+]] = OpTypeInt 32 0
-; CHECK:     [[uint2:%[0-9]+]] = OpTypeVector [[uint]] 2
-; CHECK:    [[uint_1:%[0-9]+]] = OpConstant [[uint]] 1
-; CHECK:  [[ptr_uint:%[0-9]+]] = OpTypePointer Function [[uint]]
-; CHECK: [[ptr_uint2:%[0-9]+]] = OpTypePointer Function [[uint2]]
+; CHECK-DAG:      [[uint:%[0-9]+]] = OpTypeInt 32 0
+; CHECK-DAG:     [[uint2:%[0-9]+]] = OpTypeVector [[uint]] 2
+; CHECK-DAG:    [[uint_1:%[0-9]+]] = OpConstant [[uint]] 1
+; CHECK-DAG:  [[ptr_uint:%[0-9]+]] = OpTypePointer Function [[uint]]
+; CHECK-DAG: [[ptr_uint2:%[0-9]+]] = OpTypePointer Function [[uint2]]
 
 define void @main() #1 {
 entry:
diff --git a/llvm/test/CodeGen/SPIRV/opencl/degrees.ll b/llvm/test/CodeGen/SPIRV/opencl/degrees.ll
index 88f9783..b8d4f52 100644
--- a/llvm/test/CodeGen/SPIRV/opencl/degrees.ll
+++ b/llvm/test/CodeGen/SPIRV/opencl/degrees.ll
@@ -3,7 +3,7 @@
 ; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %}
 ; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown %s -o - -filetype=obj | spirv-val %}
 
-; CHECK-DAG: %[[#op_ext_glsl:]] = OpExtInstImport "OpenCL.std"
+; CHECK-DAG: %[[#op_ext_ocl:]] = OpExtInstImport "OpenCL.std"
 
 ; CHECK-DAG: %[[#float_32:]] = OpTypeFloat 32
 ; CHECK-DAG: %[[#float_16:]] = OpTypeFloat 16
@@ -20,7 +20,7 @@ declare <4 x half> @llvm.spv.degrees.v4f16(<4 x half>)
 define noundef float @degrees_float(float noundef %a) {
 entry:
 ; CHECK: %[[#float_32_arg:]] = OpFunctionParameter %[[#float_32]]
-; CHECK: %[[#]] = OpExtInst %[[#float_32]] %[[#op_ext_glsl]] degrees %[[#float_32_arg]]
+; CHECK: %[[#]] = OpExtInst %[[#float_32]] %[[#op_ext_ocl]] degrees %[[#float_32_arg]]
   %elt.degrees = call float @llvm.spv.degrees.f32(float %a)
   ret float %elt.degrees
 }
@@ -28,7 +28,7 @@ entry:
 define noundef half @degrees_half(half noundef %a) {
 entry:
 ; CHECK: %[[#float_16_arg:]] = OpFunctionParameter %[[#float_16]]
-; CHECK: %[[#]] = OpExtInst %[[#float_16]] %[[#op_ext_glsl]] degrees %[[#float_16_arg]]
+; CHECK: %[[#]] = OpExtInst %[[#float_16]] %[[#op_ext_ocl]] degrees %[[#float_16_arg]]
   %elt.degrees = call half @llvm.spv.degrees.f16(half %a)
   ret half %elt.degrees
 }
@@ -36,7 +36,7 @@ entry:
 define noundef <4 x float> @degrees_float_vector(<4 x float> noundef %a) {
 entry:
 ; CHECK: %[[#vec4_float_32_arg:]] = OpFunctionParameter %[[#vec4_float_32]]
-; CHECK: %[[#]] = OpExtInst %[[#vec4_float_32]] %[[#op_ext_glsl]] degrees %[[#vec4_float_32_arg]]
+; CHECK: %[[#]] = OpExtInst %[[#vec4_float_32]] %[[#op_ext_ocl]] degrees %[[#vec4_float_32_arg]]
   %elt.degrees = call <4 x float> @llvm.spv.degrees.v4f32(<4 x float> %a)
   ret <4 x float> %elt.degrees
 }
@@ -44,7 +44,7 @@ entry:
 define noundef <4 x half> @degrees_half_vector(<4 x half> noundef %a) {
 entry:
 ; CHECK: %[[#vec4_float_16_arg:]] = OpFunctionParameter %[[#vec4_float_16]]
-; CHECK: %[[#]] = OpExtInst %[[#vec4_float_16]] %[[#op_ext_glsl]] degrees %[[#vec4_float_16_arg]]
+; CHECK: %[[#]] = OpExtInst %[[#vec4_float_16]] %[[#op_ext_ocl]] degrees %[[#vec4_float_16_arg]]
   %elt.degrees = call <4 x half> @llvm.spv.degrees.v4f16(<4 x half> %a)
   ret <4 x half> %elt.degrees
 }
diff --git a/llvm/test/CodeGen/SPIRV/opencl/distance.ll b/llvm/test/CodeGen/SPIRV/opencl/distance.ll
new file mode 100644
index 0000000..ac18804
--- /dev/null
+++ b/llvm/test/CodeGen/SPIRV/opencl/distance.ll
@@ -0,0 +1,34 @@
+; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s
+; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %}
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown %s -o - -filetype=obj | spirv-val %}
+
+; CHECK-DAG: %[[#op_ext_cl:]] = OpExtInstImport "OpenCL.std"
+
+; CHECK-DAG: %[[#float_16:]] = OpTypeFloat 16
+; CHECK-DAG: %[[#vec4_float_16:]] = OpTypeVector %[[#float_16]] 4
+; CHECK-DAG: %[[#float_32:]] = OpTypeFloat 32
+; CHECK-DAG: %[[#vec4_float_32:]] = OpTypeVector %[[#float_32]] 4
+
+define noundef half @distance_half4(<4 x half> noundef %a, <4 x half> noundef %b) {
+entry:
+  ; CHECK: %[[#]] = OpFunction %[[#float_16]] None %[[#]]
+  ; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#vec4_float_16]]
+  ; CHECK: %[[#arg1:]] = OpFunctionParameter %[[#vec4_float_16]]
+  ; CHECK: %[[#]] = OpExtInst %[[#float_16]] %[[#op_ext_cl]] distance %[[#arg0]] %[[#arg1]]
+  %spv.distance = call half @llvm.spv.distance.f16(<4 x half> %a, <4 x half> %b)
+  ret half %spv.distance
+}
+
+define noundef float @distance_float4(<4 x float> noundef %a, <4 x float> noundef %b) {
+entry:
+  ; CHECK: %[[#]] = OpFunction %[[#float_32]] None %[[#]]
+  ; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#vec4_float_32]]
+  ; CHECK: %[[#arg1:]] = OpFunctionParameter %[[#vec4_float_32]]
+  ; CHECK: %[[#]] = OpExtInst %[[#float_32]] %[[#op_ext_cl]] distance %[[#arg0]] %[[#arg1]]
+  %spv.distance = call float @llvm.spv.distance.f32(<4 x float> %a, <4 x float> %b)
+  ret float %spv.distance
+}
+
+declare half @llvm.spv.distance.f16(<4 x half>, <4 x half>)
+declare float @llvm.spv.distance.f32(<4 x float>, <4 x float>)
diff --git a/llvm/test/CodeGen/SPIRV/opencl/radians.ll b/llvm/test/CodeGen/SPIRV/opencl/radians.ll
index f7bb8d5..5b4f26a 100644
--- a/llvm/test/CodeGen/SPIRV/opencl/radians.ll
+++ b/llvm/test/CodeGen/SPIRV/opencl/radians.ll
@@ -3,7 +3,7 @@
 ; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %}
 ; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown %s -o - -filetype=obj | spirv-val %}
 
-; CHECK-DAG: %[[#op_ext_glsl:]] = OpExtInstImport "OpenCL.std"
+; CHECK-DAG: %[[#op_ext_ocl:]] = OpExtInstImport "OpenCL.std"
 
 ; CHECK-DAG: %[[#float_32:]] = OpTypeFloat 32
 ; CHECK-DAG: %[[#float_16:]] = OpTypeFloat 16
@@ -20,7 +20,7 @@ declare <4 x half> @llvm.spv.radians.v4f16(<4 x half>)
 define noundef float @radians_float(float noundef %a) {
 entry:
 ; CHECK: %[[#float_32_arg:]] = OpFunctionParameter %[[#float_32]]
-; CHECK: %[[#]] = OpExtInst %[[#float_32]] %[[#op_ext_glsl]] radians %[[#float_32_arg]]
+; CHECK: %[[#]] = OpExtInst %[[#float_32]] %[[#op_ext_ocl]] radians %[[#float_32_arg]]
   %elt.radians = call float @llvm.spv.radians.f32(float %a)
   ret float %elt.radians
 }
@@ -28,7 +28,7 @@ entry:
 define noundef half @radians_half(half noundef %a) {
 entry:
 ; CHECK: %[[#float_16_arg:]] = OpFunctionParameter %[[#float_16]]
-; CHECK: %[[#]] = OpExtInst %[[#float_16]] %[[#op_ext_glsl]] radians %[[#float_16_arg]]
+; CHECK: %[[#]] = OpExtInst %[[#float_16]] %[[#op_ext_ocl]] radians %[[#float_16_arg]]
   %elt.radians = call half @llvm.spv.radians.f16(half %a)
   ret half %elt.radians
 }
@@ -36,7 +36,7 @@ entry:
 define noundef <4 x float> @radians_float_vector(<4 x float> noundef %a) {
 entry:
 ; CHECK: %[[#vec4_float_32_arg:]] = OpFunctionParameter %[[#vec4_float_32]]
-; CHECK: %[[#]] = OpExtInst %[[#vec4_float_32]] %[[#op_ext_glsl]] radians %[[#vec4_float_32_arg]]
+; CHECK: %[[#]] = OpExtInst %[[#vec4_float_32]] %[[#op_ext_ocl]] radians %[[#vec4_float_32_arg]]
   %elt.radians = call <4 x float> @llvm.spv.radians.v4f32(<4 x float> %a)
   ret <4 x float> %elt.radians
 }
@@ -44,7 +44,7 @@ entry:
 define noundef <4 x half> @radians_half_vector(<4 x half> noundef %a) {
 entry:
 ; CHECK: %[[#vec4_float_16_arg:]] = OpFunctionParameter %[[#vec4_float_16]]
-; CHECK: %[[#]] = OpExtInst %[[#vec4_float_16]] %[[#op_ext_glsl]] radians %[[#vec4_float_16_arg]]
+; CHECK: %[[#]] = OpExtInst %[[#vec4_float_16]] %[[#op_ext_ocl]] radians %[[#vec4_float_16_arg]]
   %elt.radians = call <4 x half> @llvm.spv.radians.v4f16(<4 x half> %a)
   ret <4 x half> %elt.radians
 }
diff --git a/llvm/test/CodeGen/SPIRV/pointers/PtrCast-null-in-OpSpecConstantOp.ll b/llvm/test/CodeGen/SPIRV/pointers/PtrCast-null-in-OpSpecConstantOp.ll
index 99e2c3e..dee16da 100644
--- a/llvm/test/CodeGen/SPIRV/pointers/PtrCast-null-in-OpSpecConstantOp.ll
+++ b/llvm/test/CodeGen/SPIRV/pointers/PtrCast-null-in-OpSpecConstantOp.ll
@@ -5,10 +5,8 @@
 ; CHECK-DAG: %[[Struct:.*]] = OpTypeStruct %[[Array]]
 ; CHECK-DAG: %[[Zero:.*]] = OpTypeInt 64 0
 ; CHECK-DAG: %[[Null:.*]] = OpConstantNull %[[Zero]]
-; CHECK-DAG: %[[R1:.*]] = OpConstantComposite %[[Array]] %[[Null]]
-; CHECK-DAG: %[[#]] = OpConstantComposite %[[Struct]] %[[R1]]
-; CHECK-DAG: %[[R2:.*]] = OpConstantComposite %[[Array]] %[[Null]]
-; CHECK-DAG: %[[#]] = OpConstantComposite %[[Struct]] %[[R2]]
+; CHECK-DAG: %[[R:.*]] = OpConstantComposite %[[Array]] %[[Null]]
+; CHECK-DAG: %[[#]] = OpConstantComposite %[[Struct]] %[[R]]
 
 @G1 = addrspace(1) constant { [1 x ptr addrspace(4)] } { [1 x ptr addrspace(4)] [ptr addrspace(4) addrspacecast (ptr null to ptr addrspace(4))] }
 @G2 = addrspace(1) constant { [1 x ptr addrspace(4)] } { [1 x ptr addrspace(4)] [ptr addrspace(4) addrspacecast (ptr addrspace(1) null to ptr addrspace(4))] }
diff --git a/llvm/test/CodeGen/SPIRV/pointers/struct-opaque-pointers.ll b/llvm/test/CodeGen/SPIRV/pointers/struct-opaque-pointers.ll
index 03ecf5e..59a2423 100644
--- a/llvm/test/CodeGen/SPIRV/pointers/struct-opaque-pointers.ll
+++ b/llvm/test/CodeGen/SPIRV/pointers/struct-opaque-pointers.ll
@@ -1,12 +1,12 @@
 ; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s
 ; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown %s -o - -filetype=obj | spirv-val %}
 
-; CHECK: %[[TyInt64:.*]] = OpTypeInt 64 0
-; CHECK: %[[TyInt64Ptr:.*]] = OpTypePointer {{[a-zA-Z]+}} %[[TyInt64]]
-; CHECK: %[[TyStruct:.*]] = OpTypeStruct %[[TyInt64Ptr]] %[[TyInt64Ptr]]
-; CHECK: %[[ConstStruct:.*]] = OpConstantComposite %[[TyStruct]] %[[ConstField:.*]] %[[ConstField]]
-; CHECK: %[[TyStructPtr:.*]] = OpTypePointer {{[a-zA-Z]+}} %[[TyStruct]]
-; CHECK: OpVariable %[[TyStructPtr]] {{[a-zA-Z]+}} %[[ConstStruct]]
+; CHECK-DAG: %[[TyInt64:.*]] = OpTypeInt 64 0
+; CHECK-DAG: %[[TyInt64Ptr:.*]] = OpTypePointer {{[a-zA-Z]+}} %[[TyInt64]]
+; CHECK-DAG: %[[TyStruct:.*]] = OpTypeStruct %[[TyInt64Ptr]] %[[TyInt64Ptr]]
+; CHECK-DAG: %[[ConstStruct:.*]] = OpConstantComposite %[[TyStruct]] %[[ConstField:.*]] %[[ConstField]]
+; CHECK-DAG: %[[TyStructPtr:.*]] = OpTypePointer {{[a-zA-Z]+}} %[[TyStruct]]
+; CHECK-DAG: OpVariable %[[TyStructPtr]] {{[a-zA-Z]+}} %[[ConstStruct]]
 
 @a = addrspace(1) constant i64 42
 @struct = addrspace(1) global {ptr addrspace(1), ptr addrspace(1)} { ptr addrspace(1) @a, ptr addrspace(1) @a }
diff --git a/llvm/test/CodeGen/SPIRV/transcoding/SampledImage.ll b/llvm/test/CodeGen/SPIRV/transcoding/SampledImage.ll
index e4c7bdb..8a90e40 100644
--- a/llvm/test/CodeGen/SPIRV/transcoding/SampledImage.ll
+++ b/llvm/test/CodeGen/SPIRV/transcoding/SampledImage.ll
@@ -24,12 +24,10 @@
 ; CHECK-SPIRV: OpName %[[#sample_kernel_float:]] "sample_kernel_float"
 ; CHECK-SPIRV: OpName %[[#sample_kernel_int:]] "sample_kernel_int"
 
-; CHECK-SPIRV:     %[[#TypeSampler:]] = OpTypeSampler
+; CHECK-SPIRV-DAG: %[[#TypeSampler:]] = OpTypeSampler
 ; CHECK-SPIRV-DAG: %[[#SampledImageTy:]] = OpTypeSampledImage
 ; CHECK-SPIRV-DAG: %[[#ConstSampler1:]] = OpConstantSampler %[[#TypeSampler]] None 0 Linear
 ; CHECK-SPIRV-DAG: %[[#ConstSampler2:]] = OpConstantSampler %[[#TypeSampler]] Repeat 0 Nearest
-; CHECK-SPIRV-DAG: %[[#ConstSampler3:]] = OpConstantSampler %[[#TypeSampler]] None 0 Linear
-; CHECK-SPIRV-DAG: %[[#ConstSampler4:]] = OpConstantSampler %[[#TypeSampler]] Repeat 0 Nearest
 
 ; CHECK-SPIRV: %[[#sample_kernel_float]] = OpFunction %{{.*}}
 ; CHECK-SPIRV: %[[#InputImage:]] = OpFunctionParameter %{{.*}}
@@ -65,13 +63,13 @@ declare spir_func target("spirv.Sampler") @__translate_sampler_initializer(i32)
 ; CHECK-SPIRV: %[[#InputImage:]] = OpFunctionParameter %{{.*}}
 ; CHECK-SPIRV: %[[#argSampl:]] = OpFunctionParameter %[[#TypeSampler]]
 
-; CHECK-SPIRV: %[[#SampledImage4:]] = OpSampledImage %[[#SampledImageTy]] %[[#InputImage]] %[[#ConstSampler3]]
+; CHECK-SPIRV: %[[#SampledImage4:]] = OpSampledImage %[[#SampledImageTy]] %[[#InputImage]] %[[#ConstSampler1]]
 ; CHECK-SPIRV: %[[#]] = OpImageSampleExplicitLod %[[#]] %[[#SampledImage4]]
 
 ; CHECK-SPIRV: %[[#SampledImage5:]] = OpSampledImage %[[#SampledImageTy]] %[[#InputImage]] %[[#argSampl]]
 ; CHECK-SPIRV: %[[#]] = OpImageSampleExplicitLod %[[#]] %[[#SampledImage5]]
 
-; CHECK-SPIRV: %[[#SampledImage6:]] = OpSampledImage %[[#SampledImageTy]] %[[#InputImage]] %[[#ConstSampler4]]
+; CHECK-SPIRV: %[[#SampledImage6:]] = OpSampledImage %[[#SampledImageTy]] %[[#InputImage]] %[[#ConstSampler2]]
 ; CHECK-SPIRV: %[[#]] = OpImageSampleExplicitLod %[[#]] %[[#SampledImage6]]
 
 define dso_local spir_kernel void @sample_kernel_int(target("spirv.Image", void, 1, 0, 0, 0, 0, 0, 0) %input, <2 x float> noundef %coords, <4 x i32> addrspace(1)* nocapture noundef writeonly %results, target("spirv.Sampler") %argSampl) local_unnamed_addr {
diff --git a/llvm/test/CodeGen/SPIRV/transcoding/cl-types.ll b/llvm/test/CodeGen/SPIRV/transcoding/cl-types.ll
index 8b326e2..55f1125 100644
--- a/llvm/test/CodeGen/SPIRV/transcoding/cl-types.ll
+++ b/llvm/test/CodeGen/SPIRV/transcoding/cl-types.ll
@@ -39,7 +39,7 @@
 ; CHECK-SPIRV-DAG: %[[#SAMP:]] = OpTypeSampler
 ; CHECK-SPIRV-DAG: %[[#SAMPIMG:]] = OpTypeSampledImage %[[#IMG2D_RD]]
 
-; CHECK-SPIRV:     %[[#SAMP_CONST:]] = OpConstantSampler %[[#SAMP]] None 0 Linear
+; CHECK-SPIRV-DAG: %[[#SAMP_CONST:]] = OpConstantSampler %[[#SAMP]] None 0 Linear
 
 ; CHECK-SPIRV: %[[#]] = OpFunctionParameter %[[#PIPE_RD]]
 ; CHECK-SPIRV: %[[#]] = OpFunctionParameter %[[#PIPE_WR]]
diff --git a/llvm/test/CodeGen/SPIRV/transcoding/spirv-private-array-initialization.ll b/llvm/test/CodeGen/SPIRV/transcoding/spirv-private-array-initialization.ll
index 74dbaab..5810d9c 100644
--- a/llvm/test/CodeGen/SPIRV/transcoding/spirv-private-array-initialization.ll
+++ b/llvm/test/CodeGen/SPIRV/transcoding/spirv-private-array-initialization.ll
@@ -14,19 +14,19 @@
 ; CHECK-SPIRV-DAG:    %[[#twelve:]] = OpConstant %[[#i32]] 12
 ; CHECK-SPIRV-DAG:    %[[#const_i32x3_ptr:]] = OpTypePointer UniformConstant %[[#i32x3]]
 
-; CHECK-SPIRV:        %[[#test_arr2:]] = OpVariable %[[#const_i32x3_ptr]] UniformConstant %[[#test_arr_init]]
-; CHECK-SPIRV:        %[[#test_arr:]] = OpVariable %[[#const_i32x3_ptr]] UniformConstant %[[#test_arr_init]]
+; CHECK-SPIRV-DAG:    %[[#test_arr1:]] = OpVariable %[[#const_i32x3_ptr]] UniformConstant %[[#test_arr_init]]
+; CHECK-SPIRV-DAG:    %[[#test_arr2:]] = OpVariable %[[#const_i32x3_ptr]] UniformConstant %[[#test_arr_init]]
 
 ; CHECK-SPIRV-DAG:    %[[#i32x3_ptr:]] = OpTypePointer Function %[[#i32x3]]
 
-; CHECK-SPIRV:        %[[#arr:]] = OpVariable %[[#i32x3_ptr]] Function
+; CHECK-SPIRV:        %[[#arr1:]] = OpVariable %[[#i32x3_ptr]] Function
 ; CHECK-SPIRV:        %[[#arr2:]] = OpVariable %[[#i32x3_ptr]] Function
 
-; CHECK-SPIRV-32:     OpCopyMemorySized %[[#arr]] %[[#test_arr]] %[[#twelve]] Aligned 4
+; CHECK-SPIRV-32:     OpCopyMemorySized %[[#arr1]] %[[#test_arr1]] %[[#twelve]] Aligned 4
 ; CHECK-SPIRV-32:     OpCopyMemorySized %[[#arr2]] %[[#test_arr2]] %[[#twelve]] Aligned 4
 
 ; CHECK-SPIRV-64:     %[[#twelvezext1:]] = OpUConvert %[[#i64:]] %[[#twelve:]]
-; CHECK-SPIRV-64:     OpCopyMemorySized %[[#arr]] %[[#test_arr]] %[[#twelvezext1]] Aligned 4
+; CHECK-SPIRV-64:     OpCopyMemorySized %[[#arr1]] %[[#test_arr1]] %[[#twelvezext1]] Aligned 4
 ; CHECK-SPIRV-64:     %[[#twelvezext2:]] = OpUConvert %[[#i64:]] %[[#twelve:]]
 ; CHECK-SPIRV-64:     OpCopyMemorySized %[[#arr2]] %[[#test_arr2]] %[[#twelvezext2]] Aligned 4
 
diff --git a/llvm/test/CodeGen/SPIRV/transcoding/sub_group_non_uniform_arithmetic.ll b/llvm/test/CodeGen/SPIRV/transcoding/sub_group_non_uniform_arithmetic.ll
index adf73fe..62b09f6 100644
--- a/llvm/test/CodeGen/SPIRV/transcoding/sub_group_non_uniform_arithmetic.ll
+++ b/llvm/test/CodeGen/SPIRV/transcoding/sub_group_non_uniform_arithmetic.ll
@@ -329,12 +329,12 @@
 ; CHECK-SPIRV-DAG: %[[#double:]] = OpTypeFloat 64
 
 ; CHECK-SPIRV-DAG: %[[#false:]] = OpConstantFalse %[[#bool]]
+; CHECK-SPIRV-DAG: %[[#int_32:]] = OpConstant %[[#int]] 32
 ; CHECK-SPIRV-DAG: %[[#ScopeSubgroup:]] = OpConstant %[[#int]] 3
 ; CHECK-SPIRV-DAG: %[[#char_0:]] = OpConstant %[[#char]] 0
 ; CHECK-SPIRV-DAG: %[[#char_10:]] = OpConstant %[[#char]] 10
 ; CHECK-SPIRV-DAG: %[[#short_0:]] = OpConstant %[[#short]] 0
 ; CHECK-SPIRV-DAG: %[[#int_0:]] = OpConstant %[[#int]] 0
-; CHECK-SPIRV-DAG: %[[#int_32:]] = OpConstant %[[#int]] 32
 ; CHECK-SPIRV-DAG: %[[#long_0:]] = OpConstantNull %[[#long]]
 ; CHECK-SPIRV-DAG: %[[#half_0:]] = OpConstant %[[#half]] 0
 ; CHECK-SPIRV-DAG: %[[#float_0:]] = OpConstant %[[#float]] 0
diff --git a/llvm/test/CodeGen/SPIRV/unnamed-global.ll b/llvm/test/CodeGen/SPIRV/unnamed-global.ll
index f72334b..90bac50 100644
--- a/llvm/test/CodeGen/SPIRV/unnamed-global.ll
+++ b/llvm/test/CodeGen/SPIRV/unnamed-global.ll
@@ -4,10 +4,10 @@
 ; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s
 ; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown %s -o - -filetype=obj | spirv-val %}
 
-; CHECK: %[[TyInt:.*]] = OpTypeInt 8 0
-; CHECK: %[[ConstInt:.*]] = OpConstant %[[TyInt]] 123
-; CHECK: %[[TyPtr:.*]] = OpTypePointer {{[a-zA-Z]+}} %[[TyInt]]
-; CHECK: %[[VarId:.*]] = OpVariable %[[TyPtr]] {{[a-zA-Z]+}} %[[ConstInt]]
+; CHECK-DAG: %[[TyInt:.*]] = OpTypeInt 8 0
+; CHECK-DAG: %[[ConstInt:.*]] = OpConstant %[[TyInt]] 123
+; CHECK-DAG: %[[TyPtr:.*]] = OpTypePointer {{[a-zA-Z]+}} %[[TyInt]]
+; CHECK-DAG: %[[VarId:.*]] = OpVariable %[[TyPtr]] {{[a-zA-Z]+}} %[[ConstInt]]
 
 @0 = addrspace(1) global i8 123
 
diff --git a/llvm/test/CodeGen/Thumb2/bf16-instructions.ll b/llvm/test/CodeGen/Thumb2/bf16-instructions.ll
index 5de7afc..786e355 100644
--- a/llvm/test/CodeGen/Thumb2/bf16-instructions.ll
+++ b/llvm/test/CodeGen/Thumb2/bf16-instructions.ll
@@ -1,6 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
 ; RUN: llc < %s -mtriple thumbv8.1m.main-none-eabi | FileCheck %s --check-prefixes=CHECK,CHECK-NOFP
-; RUN: llc < %s -mtriple thumbv8.1m.main-none-eabihf -mattr=+fullfp16 | FileCheck %s --check-prefixes=CHECK,CHECK-FP
+; RUN: llc < %s -mtriple thumbv8.1m.main-none-eabihf -mattr=+fullfp16 | FileCheck %s --check-prefixes=CHECK,CHECK-FP,CHECK-FPNO64
+; RUN: llc < %s -mtriple thumbv8.1m.main-none-eabihf -mattr=+fullfp16,+fp64 | FileCheck %s --check-prefixes=CHECK,CHECK-FP,CHECK-FP64
 
 define bfloat @test_fadd(bfloat %a, bfloat %b) {
 ; CHECK-NOFP-LABEL: test_fadd:
@@ -259,9 +260,8 @@ define void @test_truncstore64(double %a, ptr %b) {
 ; CHECK-FP-NEXT:    .save {r4, lr}
 ; CHECK-FP-NEXT:    push {r4, lr}
 ; CHECK-FP-NEXT:    mov r4, r0
-; CHECK-FP-NEXT:    vmov r0, r1, d0
-; CHECK-FP-NEXT:    bl __aeabi_d2f
-; CHECK-FP-NEXT:    lsrs r0, r0, #16
+; CHECK-FP-NEXT:    bl __truncdfbf2
+; CHECK-FP-NEXT:    vmov r0, s0
 ; CHECK-FP-NEXT:    strh r0, [r4]
 ; CHECK-FP-NEXT:    pop {r4, pc}
   %r = fptrunc double %a to bfloat
@@ -312,15 +312,23 @@ define double @test_loadext64(ptr %a) {
 ; CHECK-NOFP-NEXT:    bl __aeabi_f2d
 ; CHECK-NOFP-NEXT:    pop {r7, pc}
 ;
-; CHECK-FP-LABEL: test_loadext64:
-; CHECK-FP:       @ %bb.0:
-; CHECK-FP-NEXT:    .save {r7, lr}
-; CHECK-FP-NEXT:    push {r7, lr}
-; CHECK-FP-NEXT:    ldrh r0, [r0]
-; CHECK-FP-NEXT:    lsls r0, r0, #16
-; CHECK-FP-NEXT:    bl __aeabi_f2d
-; CHECK-FP-NEXT:    vmov d0, r0, r1
-; CHECK-FP-NEXT:    pop {r7, pc}
+; CHECK-FPNO64-LABEL: test_loadext64:
+; CHECK-FPNO64:       @ %bb.0:
+; CHECK-FPNO64-NEXT:    .save {r7, lr}
+; CHECK-FPNO64-NEXT:    push {r7, lr}
+; CHECK-FPNO64-NEXT:    ldrh r0, [r0]
+; CHECK-FPNO64-NEXT:    lsls r0, r0, #16
+; CHECK-FPNO64-NEXT:    bl __aeabi_f2d
+; CHECK-FPNO64-NEXT:    vmov d0, r0, r1
+; CHECK-FPNO64-NEXT:    pop {r7, pc}
+;
+; CHECK-FP64-LABEL: test_loadext64:
+; CHECK-FP64:       @ %bb.0:
+; CHECK-FP64-NEXT:    ldrh r0, [r0]
+; CHECK-FP64-NEXT:    lsls r0, r0, #16
+; CHECK-FP64-NEXT:    vmov s0, r0
+; CHECK-FP64-NEXT:    vcvt.f64.f32 d0, s0
+; CHECK-FP64-NEXT:    bx lr
   %r = load bfloat, ptr %a
   %d = fpext bfloat %r to double
   ret double %d
@@ -1374,10 +1382,7 @@ define bfloat @test_fptrunc_double(double %a) {
 ; CHECK-FP:       @ %bb.0:
 ; CHECK-FP-NEXT:    .save {r7, lr}
 ; CHECK-FP-NEXT:    push {r7, lr}
-; CHECK-FP-NEXT:    vmov r0, r1, d0
-; CHECK-FP-NEXT:    bl __aeabi_d2f
-; CHECK-FP-NEXT:    lsrs r0, r0, #16
-; CHECK-FP-NEXT:    vmov.f16 s0, r0
+; CHECK-FP-NEXT:    bl __truncdfbf2
 ; CHECK-FP-NEXT:    vmov.f16 r0, s0
 ; CHECK-FP-NEXT:    vmov s0, r0
 ; CHECK-FP-NEXT:    pop {r7, pc}
@@ -1410,15 +1415,23 @@ define double @test_fpext_double(bfloat %a) {
 ; CHECK-NOFP-NEXT:    bl __aeabi_f2d
 ; CHECK-NOFP-NEXT:    pop {r7, pc}
 ;
-; CHECK-FP-LABEL: test_fpext_double:
-; CHECK-FP:       @ %bb.0:
-; CHECK-FP-NEXT:    .save {r7, lr}
-; CHECK-FP-NEXT:    push {r7, lr}
-; CHECK-FP-NEXT:    vmov r0, s0
-; CHECK-FP-NEXT:    lsls r0, r0, #16
-; CHECK-FP-NEXT:    bl __aeabi_f2d
-; CHECK-FP-NEXT:    vmov d0, r0, r1
-; CHECK-FP-NEXT:    pop {r7, pc}
+; CHECK-FPNO64-LABEL: test_fpext_double:
+; CHECK-FPNO64:       @ %bb.0:
+; CHECK-FPNO64-NEXT:    .save {r7, lr}
+; CHECK-FPNO64-NEXT:    push {r7, lr}
+; CHECK-FPNO64-NEXT:    vmov r0, s0
+; CHECK-FPNO64-NEXT:    lsls r0, r0, #16
+; CHECK-FPNO64-NEXT:    bl __aeabi_f2d
+; CHECK-FPNO64-NEXT:    vmov d0, r0, r1
+; CHECK-FPNO64-NEXT:    pop {r7, pc}
+;
+; CHECK-FP64-LABEL: test_fpext_double:
+; CHECK-FP64:       @ %bb.0:
+; CHECK-FP64-NEXT:    vmov r0, s0
+; CHECK-FP64-NEXT:    lsls r0, r0, #16
+; CHECK-FP64-NEXT:    vmov s0, r0
+; CHECK-FP64-NEXT:    vcvt.f64.f32 d0, s0
+; CHECK-FP64-NEXT:    bx lr
   %r = fpext bfloat %a to double
   ret double %r
 }
diff --git a/llvm/test/CodeGen/Thumb2/mve-gather-increment.ll b/llvm/test/CodeGen/Thumb2/mve-gather-increment.ll
index 93cab25..e63c625 100644
--- a/llvm/test/CodeGen/Thumb2/mve-gather-increment.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-gather-increment.ll
@@ -14,6 +14,19 @@ define arm_aapcs_vfpcc <4 x i32> @gather_inc_mini_4i32(ptr noalias nocapture rea
   ret <4 x i32> %wide.masked.gather
 }
 
+define arm_aapcs_vfpcc <4 x i32> @gather_inc_mini_4i32_i8(i32* noalias nocapture readonly %data, i32* noalias nocapture %dst, <4 x i32> %offs) {
+; CHECK-LABEL: gather_inc_mini_4i32_i8:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    movs r1, #16
+; CHECK-NEXT:    vadd.i32 q1, q0, r1
+; CHECK-NEXT:    vldrw.u32 q0, [r0, q1]
+; CHECK-NEXT:    bx lr
+  %1 = add <4 x i32> %offs, <i32 16, i32 16, i32 16, i32 16>
+  %2 = getelementptr inbounds i8, i32* %data, <4 x i32> %1
+  %wide.masked.gather = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %2, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
+  ret <4 x i32> %wide.masked.gather
+}
+
 define arm_aapcs_vfpcc <4 x i32> @gather_inc_minipred_4i32(ptr noalias nocapture readonly %data, ptr noalias nocapture %dst, <4 x i32> %offs) {
 ; CHECK-LABEL: gather_inc_minipred_4i32:
 ; CHECK:       @ %bb.0:
@@ -207,20 +220,20 @@ define arm_aapcs_vfpcc <16 x i8> @gather_inc_minipred_16i8(ptr noalias nocapture
 define arm_aapcs_vfpcc void @gather_pre_inc(ptr noalias nocapture readonly %data, ptr noalias nocapture %dst, i32 %n.vec) {
 ; CHECK-LABEL: gather_pre_inc:
 ; CHECK:       @ %bb.0: @ %vector.ph
-; CHECK-NEXT:    adr r3, .LCPI6_0
+; CHECK-NEXT:    adr r3, .LCPI7_0
 ; CHECK-NEXT:    vldrw.u32 q0, [r3]
 ; CHECK-NEXT:    vadd.i32 q0, q0, r0
-; CHECK-NEXT:  .LBB6_1: @ %vector.body
+; CHECK-NEXT:  .LBB7_1: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vldrw.u32 q1, [q0, #96]!
 ; CHECK-NEXT:    subs r2, #4
 ; CHECK-NEXT:    vstrb.8 q1, [r1], #16
-; CHECK-NEXT:    bne .LBB6_1
+; CHECK-NEXT:    bne .LBB7_1
 ; CHECK-NEXT:  @ %bb.2: @ %end
 ; CHECK-NEXT:    bx lr
 ; CHECK-NEXT:    .p2align 4
 ; CHECK-NEXT:  @ %bb.3:
-; CHECK-NEXT:  .LCPI6_0:
+; CHECK-NEXT:  .LCPI7_0:
 ; CHECK-NEXT:    .long 4294967224 @ 0xffffffb8
 ; CHECK-NEXT:    .long 4294967248 @ 0xffffffd0
 ; CHECK-NEXT:    .long 4294967272 @ 0xffffffe8
@@ -246,23 +259,65 @@ end:
   ret void;
 }
 
+define arm_aapcs_vfpcc void @gather_pre_inc_i8(ptr noalias nocapture readonly %data, ptr noalias nocapture %dst, i32 %n.vec) {
+; CHECK-LABEL: gather_pre_inc_i8:
+; CHECK:       @ %bb.0: @ %vector.ph
+; CHECK-NEXT:    adr r3, .LCPI8_0
+; CHECK-NEXT:    vldrw.u32 q0, [r3]
+; CHECK-NEXT:    vadd.i32 q0, q0, r0
+; CHECK-NEXT:  .LBB8_1: @ %vector.body
+; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    vldrw.u32 q1, [q0, #24]!
+; CHECK-NEXT:    subs r2, #4
+; CHECK-NEXT:    vstrb.8 q1, [r1], #16
+; CHECK-NEXT:    bne .LBB8_1
+; CHECK-NEXT:  @ %bb.2: @ %end
+; CHECK-NEXT:    bx lr
+; CHECK-NEXT:    .p2align 4
+; CHECK-NEXT:  @ %bb.3:
+; CHECK-NEXT:  .LCPI8_0:
+; CHECK-NEXT:    .long 4294967278 @ 0xffffffee
+; CHECK-NEXT:    .long 4294967284 @ 0xfffffff4
+; CHECK-NEXT:    .long 4294967290 @ 0xfffffffa
+; CHECK-NEXT:    .long 0 @ 0x0
+vector.ph:                                        ; preds = %for.body.preheader
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %vec.ind = phi <4 x i32> [ <i32 0, i32 2, i32 4, i32 6>, %vector.ph ], [ %vec.ind.next, %vector.body ]
+  %0 = mul <4 x i32> %vec.ind, <i32 3, i32 3, i32 3, i32 3>
+  %1 = add <4 x i32> %0, <i32 6, i32 6, i32 6, i32 6>
+  %2 = getelementptr inbounds i8, ptr %data, <4 x i32> %1
+  %wide.masked.gather = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %2, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
+  %3 = getelementptr inbounds i32, ptr %dst, i32 %index
+  store <4 x i32> %wide.masked.gather, ptr %3, align 4
+  %index.next = add i32 %index, 4
+  %vec.ind.next = add <4 x i32> %vec.ind, <i32 8, i32 8, i32 8, i32 8>
+  %4 = icmp eq i32 %index.next, %n.vec
+  br i1 %4, label %end, label %vector.body
+
+end:
+  ret void;
+}
+
 define arm_aapcs_vfpcc void @gather_post_inc(ptr noalias nocapture readonly %data, ptr noalias nocapture %dst, i32 %n.vec43) {
 ; CHECK-LABEL: gather_post_inc:
 ; CHECK:       @ %bb.0: @ %vector.ph41
-; CHECK-NEXT:    adr r3, .LCPI7_0
+; CHECK-NEXT:    adr r3, .LCPI9_0
 ; CHECK-NEXT:    vldrw.u32 q0, [r3]
 ; CHECK-NEXT:    vadd.i32 q0, q0, r0
-; CHECK-NEXT:  .LBB7_1: @ %vector.body39
+; CHECK-NEXT:  .LBB9_1: @ %vector.body39
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vldrw.u32 q1, [q0, #96]!
 ; CHECK-NEXT:    subs r2, #4
 ; CHECK-NEXT:    vstrb.8 q1, [r1], #16
-; CHECK-NEXT:    bne .LBB7_1
+; CHECK-NEXT:    bne .LBB9_1
 ; CHECK-NEXT:  @ %bb.2: @ %end
 ; CHECK-NEXT:    bx lr
 ; CHECK-NEXT:    .p2align 4
 ; CHECK-NEXT:  @ %bb.3:
-; CHECK-NEXT:  .LCPI7_0:
+; CHECK-NEXT:  .LCPI9_0:
 ; CHECK-NEXT:    .long 4294967200 @ 0xffffffa0
 ; CHECK-NEXT:    .long 4294967224 @ 0xffffffb8
 ; CHECK-NEXT:    .long 4294967248 @ 0xffffffd0
@@ -293,38 +348,38 @@ define arm_aapcs_vfpcc void @gather_inc_v4i32_simple(ptr noalias nocapture reado
 ; CHECK-NEXT:    cmp r2, #1
 ; CHECK-NEXT:    it lt
 ; CHECK-NEXT:    bxlt lr
-; CHECK-NEXT:  .LBB8_1: @ %vector.ph.preheader
+; CHECK-NEXT:  .LBB10_1: @ %vector.ph.preheader
 ; CHECK-NEXT:    .save {r4, lr}
 ; CHECK-NEXT:    push {r4, lr}
 ; CHECK-NEXT:    bic r12, r2, #3
 ; CHECK-NEXT:    movs r3, #1
 ; CHECK-NEXT:    sub.w lr, r12, #4
 ; CHECK-NEXT:    add.w r4, r3, lr, lsr #2
-; CHECK-NEXT:    adr r3, .LCPI8_0
+; CHECK-NEXT:    adr r3, .LCPI10_0
 ; CHECK-NEXT:    vldrw.u32 q0, [r3]
 ; CHECK-NEXT:    vadd.i32 q0, q0, r0
-; CHECK-NEXT:  .LBB8_2: @ %vector.ph
+; CHECK-NEXT:  .LBB10_2: @ %vector.ph
 ; CHECK-NEXT:    @ =>This Loop Header: Depth=1
-; CHECK-NEXT:    @ Child Loop BB8_3 Depth 2
+; CHECK-NEXT:    @ Child Loop BB10_3 Depth 2
 ; CHECK-NEXT:    dls lr, r4
 ; CHECK-NEXT:    mov r0, r1
 ; CHECK-NEXT:    vmov q1, q0
-; CHECK-NEXT:  .LBB8_3: @ %vector.body
-; CHECK-NEXT:    @ Parent Loop BB8_2 Depth=1
+; CHECK-NEXT:  .LBB10_3: @ %vector.body
+; CHECK-NEXT:    @ Parent Loop BB10_2 Depth=1
 ; CHECK-NEXT:    @ => This Inner Loop Header: Depth=2
 ; CHECK-NEXT:    vldrw.u32 q2, [q1, #16]!
 ; CHECK-NEXT:    vstrb.8 q2, [r0], #16
-; CHECK-NEXT:    le lr, .LBB8_3
+; CHECK-NEXT:    le lr, .LBB10_3
 ; CHECK-NEXT:  @ %bb.4: @ %middle.block
-; CHECK-NEXT:    @ in Loop: Header=BB8_2 Depth=1
+; CHECK-NEXT:    @ in Loop: Header=BB10_2 Depth=1
 ; CHECK-NEXT:    cmp r12, r2
-; CHECK-NEXT:    bne .LBB8_2
+; CHECK-NEXT:    bne .LBB10_2
 ; CHECK-NEXT:  @ %bb.5:
 ; CHECK-NEXT:    pop.w {r4, lr}
 ; CHECK-NEXT:    bx lr
 ; CHECK-NEXT:    .p2align 4
 ; CHECK-NEXT:  @ %bb.6:
-; CHECK-NEXT:  .LCPI8_0:
+; CHECK-NEXT:  .LCPI10_0:
 ; CHECK-NEXT:    .long 4294967280 @ 0xfffffff0
 ; CHECK-NEXT:    .long 4294967284 @ 0xfffffff4
 ; CHECK-NEXT:    .long 4294967288 @ 0xfffffff8
@@ -363,7 +418,7 @@ define arm_aapcs_vfpcc void @gather_inc_v4i32_complex(ptr noalias nocapture read
 ; CHECK-NEXT:    cmp r2, #1
 ; CHECK-NEXT:    it lt
 ; CHECK-NEXT:    bxlt lr
-; CHECK-NEXT:  .LBB9_1: @ %vector.ph.preheader
+; CHECK-NEXT:  .LBB11_1: @ %vector.ph.preheader
 ; CHECK-NEXT:    .save {r4, r5, r7, lr}
 ; CHECK-NEXT:    push {r4, r5, r7, lr}
 ; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
@@ -371,26 +426,26 @@ define arm_aapcs_vfpcc void @gather_inc_v4i32_complex(ptr noalias nocapture read
 ; CHECK-NEXT:    bic r12, r2, #3
 ; CHECK-NEXT:    movs r3, #1
 ; CHECK-NEXT:    sub.w lr, r12, #4
-; CHECK-NEXT:    adr r4, .LCPI9_1
-; CHECK-NEXT:    adr r5, .LCPI9_2
+; CHECK-NEXT:    adr r4, .LCPI11_1
+; CHECK-NEXT:    adr r5, .LCPI11_2
 ; CHECK-NEXT:    vldrw.u32 q1, [r4]
 ; CHECK-NEXT:    add.w r3, r3, lr, lsr #2
-; CHECK-NEXT:    adr.w lr, .LCPI9_0
+; CHECK-NEXT:    adr.w lr, .LCPI11_0
 ; CHECK-NEXT:    vldrw.u32 q0, [r5]
 ; CHECK-NEXT:    vldrw.u32 q2, [lr]
 ; CHECK-NEXT:    vadd.i32 q1, q1, r0
 ; CHECK-NEXT:    vadd.i32 q0, q0, r0
 ; CHECK-NEXT:    vadd.i32 q2, q2, r0
-; CHECK-NEXT:  .LBB9_2: @ %vector.ph
+; CHECK-NEXT:  .LBB11_2: @ %vector.ph
 ; CHECK-NEXT:    @ =>This Loop Header: Depth=1
-; CHECK-NEXT:    @ Child Loop BB9_3 Depth 2
+; CHECK-NEXT:    @ Child Loop BB11_3 Depth 2
 ; CHECK-NEXT:    dls lr, r3
 ; CHECK-NEXT:    mov r0, r1
 ; CHECK-NEXT:    vmov q3, q1
 ; CHECK-NEXT:    vmov q4, q0
 ; CHECK-NEXT:    vmov q5, q2
-; CHECK-NEXT:  .LBB9_3: @ %vector.body
-; CHECK-NEXT:    @ Parent Loop BB9_2 Depth=1
+; CHECK-NEXT:  .LBB11_3: @ %vector.body
+; CHECK-NEXT:    @ Parent Loop BB11_2 Depth=1
 ; CHECK-NEXT:    @ => This Inner Loop Header: Depth=2
 ; CHECK-NEXT:    vldrw.u32 q6, [q5, #48]!
 ; CHECK-NEXT:    vldrw.u32 q7, [q3, #48]!
@@ -398,28 +453,28 @@ define arm_aapcs_vfpcc void @gather_inc_v4i32_complex(ptr noalias nocapture read
 ; CHECK-NEXT:    vldrw.u32 q7, [q4, #48]!
 ; CHECK-NEXT:    vadd.i32 q6, q6, q7
 ; CHECK-NEXT:    vstrb.8 q6, [r0], #16
-; CHECK-NEXT:    le lr, .LBB9_3
+; CHECK-NEXT:    le lr, .LBB11_3
 ; CHECK-NEXT:  @ %bb.4: @ %middle.block
-; CHECK-NEXT:    @ in Loop: Header=BB9_2 Depth=1
+; CHECK-NEXT:    @ in Loop: Header=BB11_2 Depth=1
 ; CHECK-NEXT:    cmp r12, r2
-; CHECK-NEXT:    bne .LBB9_2
+; CHECK-NEXT:    bne .LBB11_2
 ; CHECK-NEXT:  @ %bb.5:
 ; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT:    pop.w {r4, r5, r7, lr}
 ; CHECK-NEXT:    bx lr
 ; CHECK-NEXT:    .p2align 4
 ; CHECK-NEXT:  @ %bb.6:
-; CHECK-NEXT:  .LCPI9_0:
+; CHECK-NEXT:  .LCPI11_0:
 ; CHECK-NEXT:    .long 4294967248 @ 0xffffffd0
 ; CHECK-NEXT:    .long 4294967260 @ 0xffffffdc
 ; CHECK-NEXT:    .long 4294967272 @ 0xffffffe8
 ; CHECK-NEXT:    .long 4294967284 @ 0xfffffff4
-; CHECK-NEXT:  .LCPI9_1:
+; CHECK-NEXT:  .LCPI11_1:
 ; CHECK-NEXT:    .long 4294967252 @ 0xffffffd4
 ; CHECK-NEXT:    .long 4294967264 @ 0xffffffe0
 ; CHECK-NEXT:    .long 4294967276 @ 0xffffffec
 ; CHECK-NEXT:    .long 4294967288 @ 0xfffffff8
-; CHECK-NEXT:  .LCPI9_2:
+; CHECK-NEXT:  .LCPI11_2:
 ; CHECK-NEXT:    .long 4294967256 @ 0xffffffd8
 ; CHECK-NEXT:    .long 4294967268 @ 0xffffffe4
 ; CHECK-NEXT:    .long 4294967280 @ 0xfffffff0
@@ -467,38 +522,38 @@ define arm_aapcs_vfpcc void @gather_inc_v4i32_large(ptr noalias nocapture readon
 ; CHECK-NEXT:    cmp r2, #1
 ; CHECK-NEXT:    it lt
 ; CHECK-NEXT:    bxlt lr
-; CHECK-NEXT:  .LBB10_1: @ %vector.ph.preheader
+; CHECK-NEXT:  .LBB12_1: @ %vector.ph.preheader
 ; CHECK-NEXT:    .save {r4, lr}
 ; CHECK-NEXT:    push {r4, lr}
 ; CHECK-NEXT:    bic r12, r2, #3
 ; CHECK-NEXT:    movs r3, #1
 ; CHECK-NEXT:    sub.w lr, r12, #4
 ; CHECK-NEXT:    add.w r4, r3, lr, lsr #2
-; CHECK-NEXT:    adr r3, .LCPI10_0
+; CHECK-NEXT:    adr r3, .LCPI12_0
 ; CHECK-NEXT:    vldrw.u32 q0, [r3]
 ; CHECK-NEXT:    vadd.i32 q0, q0, r0
-; CHECK-NEXT:  .LBB10_2: @ %vector.ph
+; CHECK-NEXT:  .LBB12_2: @ %vector.ph
 ; CHECK-NEXT:    @ =>This Loop Header: Depth=1
-; CHECK-NEXT:    @ Child Loop BB10_3 Depth 2
+; CHECK-NEXT:    @ Child Loop BB12_3 Depth 2
 ; CHECK-NEXT:    dls lr, r4
 ; CHECK-NEXT:    mov r0, r1
 ; CHECK-NEXT:    vmov q1, q0
-; CHECK-NEXT:  .LBB10_3: @ %vector.body
-; CHECK-NEXT:    @ Parent Loop BB10_2 Depth=1
+; CHECK-NEXT:  .LBB12_3: @ %vector.body
+; CHECK-NEXT:    @ Parent Loop BB12_2 Depth=1
 ; CHECK-NEXT:    @ => This Inner Loop Header: Depth=2
 ; CHECK-NEXT:    vldrw.u32 q2, [q1, #508]!
 ; CHECK-NEXT:    vstrb.8 q2, [r0], #16
-; CHECK-NEXT:    le lr, .LBB10_3
+; CHECK-NEXT:    le lr, .LBB12_3
 ; CHECK-NEXT:  @ %bb.4: @ %middle.block
-; CHECK-NEXT:    @ in Loop: Header=BB10_2 Depth=1
+; CHECK-NEXT:    @ in Loop: Header=BB12_2 Depth=1
 ; CHECK-NEXT:    cmp r12, r2
-; CHECK-NEXT:    bne .LBB10_2
+; CHECK-NEXT:    bne .LBB12_2
 ; CHECK-NEXT:  @ %bb.5:
 ; CHECK-NEXT:    pop.w {r4, lr}
 ; CHECK-NEXT:    bx lr
 ; CHECK-NEXT:    .p2align 4
 ; CHECK-NEXT:  @ %bb.6:
-; CHECK-NEXT:  .LCPI10_0:
+; CHECK-NEXT:  .LCPI12_0:
 ; CHECK-NEXT:    .long 4294966788 @ 0xfffffe04
 ; CHECK-NEXT:    .long 4294966792 @ 0xfffffe08
 ; CHECK-NEXT:    .long 4294966796 @ 0xfffffe0c
@@ -543,7 +598,7 @@ define arm_aapcs_vfpcc void @gather_inc_v8i16_simple(ptr noalias nocapture reado
 ; CHECK-NEXT:    sub sp, #28
 ; CHECK-NEXT:    cmp r2, #1
 ; CHECK-NEXT:    strd r1, r2, [sp, #4] @ 8-byte Folded Spill
-; CHECK-NEXT:    blt .LBB11_5
+; CHECK-NEXT:    blt .LBB13_5
 ; CHECK-NEXT:  @ %bb.1: @ %vector.ph.preheader
 ; CHECK-NEXT:    ldr r1, [sp, #8] @ 4-byte Reload
 ; CHECK-NEXT:    movs r6, #1
@@ -553,16 +608,16 @@ define arm_aapcs_vfpcc void @gather_inc_v8i16_simple(ptr noalias nocapture reado
 ; CHECK-NEXT:    str r1, [sp] @ 4-byte Spill
 ; CHECK-NEXT:    sub.w r3, r1, #8
 ; CHECK-NEXT:    add.w r8, r6, r3, lsr #3
-; CHECK-NEXT:    adr r3, .LCPI11_0
+; CHECK-NEXT:    adr r3, .LCPI13_0
 ; CHECK-NEXT:    vldrw.u32 q0, [r3]
-; CHECK-NEXT:  .LBB11_2: @ %vector.ph
+; CHECK-NEXT:  .LBB13_2: @ %vector.ph
 ; CHECK-NEXT:    @ =>This Loop Header: Depth=1
-; CHECK-NEXT:    @ Child Loop BB11_3 Depth 2
+; CHECK-NEXT:    @ Child Loop BB13_3 Depth 2
 ; CHECK-NEXT:    dls lr, r8
 ; CHECK-NEXT:    vmov q1, q0
 ; CHECK-NEXT:    ldr r6, [sp, #4] @ 4-byte Reload
-; CHECK-NEXT:  .LBB11_3: @ %vector.body
-; CHECK-NEXT:    @ Parent Loop BB11_2 Depth=1
+; CHECK-NEXT:  .LBB13_3: @ %vector.body
+; CHECK-NEXT:    @ Parent Loop BB13_2 Depth=1
 ; CHECK-NEXT:    @ => This Inner Loop Header: Depth=2
 ; CHECK-NEXT:    vstrw.32 q1, [r2]
 ; CHECK-NEXT:    mov r12, r2
@@ -595,19 +650,19 @@ define arm_aapcs_vfpcc void @gather_inc_v8i16_simple(ptr noalias nocapture reado
 ; CHECK-NEXT:    vmov.16 q2[6], r7
 ; CHECK-NEXT:    vmov.16 q2[7], r5
 ; CHECK-NEXT:    vstrb.8 q2, [r6], #16
-; CHECK-NEXT:    le lr, .LBB11_3
+; CHECK-NEXT:    le lr, .LBB13_3
 ; CHECK-NEXT:  @ %bb.4: @ %middle.block
-; CHECK-NEXT:    @ in Loop: Header=BB11_2 Depth=1
+; CHECK-NEXT:    @ in Loop: Header=BB13_2 Depth=1
 ; CHECK-NEXT:    ldr r1, [sp, #8] @ 4-byte Reload
 ; CHECK-NEXT:    ldr r3, [sp] @ 4-byte Reload
 ; CHECK-NEXT:    cmp r3, r1
-; CHECK-NEXT:    bne .LBB11_2
-; CHECK-NEXT:  .LBB11_5: @ %for.cond.cleanup
+; CHECK-NEXT:    bne .LBB13_2
+; CHECK-NEXT:  .LBB13_5: @ %for.cond.cleanup
 ; CHECK-NEXT:    add sp, #28
 ; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
 ; CHECK-NEXT:    .p2align 4
 ; CHECK-NEXT:  @ %bb.6:
-; CHECK-NEXT:  .LCPI11_0:
+; CHECK-NEXT:  .LCPI13_0:
 ; CHECK-NEXT:    .short 0 @ 0x0
 ; CHECK-NEXT:    .short 1 @ 0x1
 ; CHECK-NEXT:    .short 2 @ 0x2
@@ -660,10 +715,10 @@ define arm_aapcs_vfpcc void @gather_inc_v8i16_complex(ptr noalias nocapture read
 ; CHECK-NEXT:    sub sp, #136
 ; CHECK-NEXT:    cmp r2, #1
 ; CHECK-NEXT:    strd r1, r2, [sp, #64] @ 8-byte Folded Spill
-; CHECK-NEXT:    blt.w .LBB12_5
+; CHECK-NEXT:    blt.w .LBB14_5
 ; CHECK-NEXT:  @ %bb.1: @ %vector.ph.preheader
 ; CHECK-NEXT:    ldr r1, [sp, #68] @ 4-byte Reload
-; CHECK-NEXT:    adr r3, .LCPI12_2
+; CHECK-NEXT:    adr r3, .LCPI14_2
 ; CHECK-NEXT:    vldrw.u32 q0, [r3]
 ; CHECK-NEXT:    movs r2, #1
 ; CHECK-NEXT:    bic r1, r1, #7
@@ -673,17 +728,17 @@ define arm_aapcs_vfpcc void @gather_inc_v8i16_complex(ptr noalias nocapture read
 ; CHECK-NEXT:    vmov.i16 q2, #0x18
 ; CHECK-NEXT:    add.w r1, r2, r1, lsr #3
 ; CHECK-NEXT:    str r1, [sp, #60] @ 4-byte Spill
-; CHECK-NEXT:    adr r1, .LCPI12_0
-; CHECK-NEXT:    adr r2, .LCPI12_1
+; CHECK-NEXT:    adr r1, .LCPI14_0
+; CHECK-NEXT:    adr r2, .LCPI14_1
 ; CHECK-NEXT:    vldrw.u32 q0, [r1]
 ; CHECK-NEXT:    vstrw.32 q2, [sp, #72] @ 16-byte Spill
 ; CHECK-NEXT:    vstrw.32 q0, [sp, #24] @ 16-byte Spill
 ; CHECK-NEXT:    vldrw.u32 q0, [r2]
 ; CHECK-NEXT:    add r2, sp, #120
 ; CHECK-NEXT:    vstrw.32 q0, [sp, #8] @ 16-byte Spill
-; CHECK-NEXT:  .LBB12_2: @ %vector.ph
+; CHECK-NEXT:  .LBB14_2: @ %vector.ph
 ; CHECK-NEXT:    @ =>This Loop Header: Depth=1
-; CHECK-NEXT:    @ Child Loop BB12_3 Depth 2
+; CHECK-NEXT:    @ Child Loop BB14_3 Depth 2
 ; CHECK-NEXT:    ldr r1, [sp, #60] @ 4-byte Reload
 ; CHECK-NEXT:    add.w r10, sp, #104
 ; CHECK-NEXT:    dls lr, r1
@@ -691,8 +746,8 @@ define arm_aapcs_vfpcc void @gather_inc_v8i16_complex(ptr noalias nocapture read
 ; CHECK-NEXT:    vldrw.u32 q4, [sp, #24] @ 16-byte Reload
 ; CHECK-NEXT:    vldrw.u32 q5, [sp, #40] @ 16-byte Reload
 ; CHECK-NEXT:    vldrw.u32 q6, [sp, #8] @ 16-byte Reload
-; CHECK-NEXT:  .LBB12_3: @ %vector.body
-; CHECK-NEXT:    @ Parent Loop BB12_2 Depth=1
+; CHECK-NEXT:  .LBB14_3: @ %vector.body
+; CHECK-NEXT:    @ Parent Loop BB14_2 Depth=1
 ; CHECK-NEXT:    @ => This Inner Loop Header: Depth=2
 ; CHECK-NEXT:    vstrw.32 q5, [r2]
 ; CHECK-NEXT:    mov r8, r2
@@ -786,21 +841,21 @@ define arm_aapcs_vfpcc void @gather_inc_v8i16_complex(ptr noalias nocapture read
 ; CHECK-NEXT:    vadd.i16 q0, q3, q1
 ; CHECK-NEXT:    vadd.i16 q0, q0, q7
 ; CHECK-NEXT:    vstrb.8 q0, [r7], #16
-; CHECK-NEXT:    le lr, .LBB12_3
+; CHECK-NEXT:    le lr, .LBB14_3
 ; CHECK-NEXT:  @ %bb.4: @ %middle.block
-; CHECK-NEXT:    @ in Loop: Header=BB12_2 Depth=1
+; CHECK-NEXT:    @ in Loop: Header=BB14_2 Depth=1
 ; CHECK-NEXT:    ldr r1, [sp, #4] @ 4-byte Reload
 ; CHECK-NEXT:    ldr r3, [sp, #68] @ 4-byte Reload
 ; CHECK-NEXT:    cmp r1, r3
-; CHECK-NEXT:    bne.w .LBB12_2
-; CHECK-NEXT:  .LBB12_5: @ %for.cond.cleanup
+; CHECK-NEXT:    bne.w .LBB14_2
+; CHECK-NEXT:  .LBB14_5: @ %for.cond.cleanup
 ; CHECK-NEXT:    add sp, #136
 ; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT:    add sp, #4
 ; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
 ; CHECK-NEXT:    .p2align 4
 ; CHECK-NEXT:  @ %bb.6:
-; CHECK-NEXT:  .LCPI12_0:
+; CHECK-NEXT:  .LCPI14_0:
 ; CHECK-NEXT:    .short 1 @ 0x1
 ; CHECK-NEXT:    .short 4 @ 0x4
 ; CHECK-NEXT:    .short 7 @ 0x7
@@ -809,7 +864,7 @@ define arm_aapcs_vfpcc void @gather_inc_v8i16_complex(ptr noalias nocapture read
 ; CHECK-NEXT:    .short 16 @ 0x10
 ; CHECK-NEXT:    .short 19 @ 0x13
 ; CHECK-NEXT:    .short 22 @ 0x16
-; CHECK-NEXT:  .LCPI12_1:
+; CHECK-NEXT:  .LCPI14_1:
 ; CHECK-NEXT:    .short 0 @ 0x0
 ; CHECK-NEXT:    .short 3 @ 0x3
 ; CHECK-NEXT:    .short 6 @ 0x6
@@ -818,7 +873,7 @@ define arm_aapcs_vfpcc void @gather_inc_v8i16_complex(ptr noalias nocapture read
 ; CHECK-NEXT:    .short 15 @ 0xf
 ; CHECK-NEXT:    .short 18 @ 0x12
 ; CHECK-NEXT:    .short 21 @ 0x15
-; CHECK-NEXT:  .LCPI12_2:
+; CHECK-NEXT:  .LCPI14_2:
 ; CHECK-NEXT:    .short 2 @ 0x2
 ; CHECK-NEXT:    .short 5 @ 0x5
 ; CHECK-NEXT:    .short 8 @ 0x8
@@ -880,21 +935,21 @@ define arm_aapcs_vfpcc void @gather_inc_v16i8_complex(ptr noalias nocapture read
 ; CHECK-NEXT:    sub sp, #312
 ; CHECK-NEXT:    cmp r2, #1
 ; CHECK-NEXT:    str r1, [sp, #116] @ 4-byte Spill
-; CHECK-NEXT:    blt.w .LBB13_5
+; CHECK-NEXT:    blt.w .LBB15_5
 ; CHECK-NEXT:  @ %bb.1: @ %vector.ph.preheader
-; CHECK-NEXT:    adr r1, .LCPI13_0
-; CHECK-NEXT:    adr r6, .LCPI13_8
+; CHECK-NEXT:    adr r1, .LCPI15_0
+; CHECK-NEXT:    adr r6, .LCPI15_8
 ; CHECK-NEXT:    vldrw.u32 q0, [r1]
-; CHECK-NEXT:    adr r1, .LCPI13_1
-; CHECK-NEXT:    adr r7, .LCPI13_7
-; CHECK-NEXT:    adr r3, .LCPI13_6
+; CHECK-NEXT:    adr r1, .LCPI15_1
+; CHECK-NEXT:    adr r7, .LCPI15_7
+; CHECK-NEXT:    adr r3, .LCPI15_6
 ; CHECK-NEXT:    vstrw.32 q0, [sp, #96] @ 16-byte Spill
 ; CHECK-NEXT:    vldrw.u32 q0, [r1]
-; CHECK-NEXT:    adr r1, .LCPI13_5
+; CHECK-NEXT:    adr r1, .LCPI15_5
 ; CHECK-NEXT:    bic r10, r2, #7
 ; CHECK-NEXT:    vstrw.32 q0, [sp, #80] @ 16-byte Spill
 ; CHECK-NEXT:    vldrw.u32 q0, [r6]
-; CHECK-NEXT:    adr r6, .LCPI13_9
+; CHECK-NEXT:    adr r6, .LCPI15_9
 ; CHECK-NEXT:    vmov.i32 q2, #0x30
 ; CHECK-NEXT:    vstrw.32 q0, [sp, #64] @ 16-byte Spill
 ; CHECK-NEXT:    vldrw.u32 q0, [r7]
@@ -905,22 +960,22 @@ define arm_aapcs_vfpcc void @gather_inc_v16i8_complex(ptr noalias nocapture read
 ; CHECK-NEXT:    vstrw.32 q0, [sp, #16] @ 16-byte Spill
 ; CHECK-NEXT:    vldrw.u32 q0, [r3]
 ; CHECK-NEXT:    vstrw.32 q0, [sp] @ 16-byte Spill
-; CHECK-NEXT:  .LBB13_2: @ %vector.ph
+; CHECK-NEXT:  .LBB15_2: @ %vector.ph
 ; CHECK-NEXT:    @ =>This Loop Header: Depth=1
-; CHECK-NEXT:    @ Child Loop BB13_3 Depth 2
-; CHECK-NEXT:    adr r1, .LCPI13_3
+; CHECK-NEXT:    @ Child Loop BB15_3 Depth 2
+; CHECK-NEXT:    adr r1, .LCPI15_3
 ; CHECK-NEXT:    vldrw.u32 q6, [sp, #16] @ 16-byte Reload
 ; CHECK-NEXT:    vldrw.u32 q0, [r1]
-; CHECK-NEXT:    adr r1, .LCPI13_4
+; CHECK-NEXT:    adr r1, .LCPI15_4
 ; CHECK-NEXT:    vldrw.u32 q5, [r1]
-; CHECK-NEXT:    adr r1, .LCPI13_2
+; CHECK-NEXT:    adr r1, .LCPI15_2
 ; CHECK-NEXT:    vldrw.u32 q3, [r1]
-; CHECK-NEXT:    adr r1, .LCPI13_10
+; CHECK-NEXT:    adr r1, .LCPI15_10
 ; CHECK-NEXT:    vstrw.32 q6, [sp, #280] @ 16-byte Spill
 ; CHECK-NEXT:    vldrw.u32 q6, [sp, #32] @ 16-byte Reload
 ; CHECK-NEXT:    vstrw.32 q3, [sp, #296] @ 16-byte Spill
 ; CHECK-NEXT:    vldrw.u32 q3, [r1]
-; CHECK-NEXT:    adr r1, .LCPI13_11
+; CHECK-NEXT:    adr r1, .LCPI15_11
 ; CHECK-NEXT:    ldr.w r8, [sp, #116] @ 4-byte Reload
 ; CHECK-NEXT:    vstrw.32 q3, [sp, #248] @ 16-byte Spill
 ; CHECK-NEXT:    vldrw.u32 q3, [sp, #80] @ 16-byte Reload
@@ -935,8 +990,8 @@ define arm_aapcs_vfpcc void @gather_inc_v16i8_complex(ptr noalias nocapture read
 ; CHECK-NEXT:    mov r11, r10
 ; CHECK-NEXT:    vstrw.32 q6, [sp, #232] @ 16-byte Spill
 ; CHECK-NEXT:    vstrw.32 q3, [sp, #184] @ 16-byte Spill
-; CHECK-NEXT:  .LBB13_3: @ %vector.body
-; CHECK-NEXT:    @ Parent Loop BB13_2 Depth=1
+; CHECK-NEXT:  .LBB15_3: @ %vector.body
+; CHECK-NEXT:    @ Parent Loop BB15_2 Depth=1
 ; CHECK-NEXT:    @ => This Inner Loop Header: Depth=2
 ; CHECK-NEXT:    vadd.i32 q4, q1, r0
 ; CHECK-NEXT:    vstrw.32 q7, [sp, #136] @ 16-byte Spill
@@ -1114,74 +1169,74 @@ define arm_aapcs_vfpcc void @gather_inc_v16i8_complex(ptr noalias nocapture read
 ; CHECK-NEXT:    vldrw.u32 q0, [sp, #168] @ 16-byte Reload
 ; CHECK-NEXT:    vadd.i32 q7, q7, q2
 ; CHECK-NEXT:    vadd.i32 q0, q0, q2
-; CHECK-NEXT:    bne.w .LBB13_3
+; CHECK-NEXT:    bne.w .LBB15_3
 ; CHECK-NEXT:  @ %bb.4: @ %middle.block
-; CHECK-NEXT:    @ in Loop: Header=BB13_2 Depth=1
+; CHECK-NEXT:    @ in Loop: Header=BB15_2 Depth=1
 ; CHECK-NEXT:    cmp r10, r2
-; CHECK-NEXT:    bne.w .LBB13_2
-; CHECK-NEXT:  .LBB13_5: @ %for.cond.cleanup
+; CHECK-NEXT:    bne.w .LBB15_2
+; CHECK-NEXT:  .LBB15_5: @ %for.cond.cleanup
 ; CHECK-NEXT:    add sp, #312
 ; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT:    add sp, #4
 ; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
 ; CHECK-NEXT:    .p2align 4
 ; CHECK-NEXT:  @ %bb.6:
-; CHECK-NEXT:  .LCPI13_0:
+; CHECK-NEXT:  .LCPI15_0:
 ; CHECK-NEXT:    .long 38 @ 0x26
 ; CHECK-NEXT:    .long 41 @ 0x29
 ; CHECK-NEXT:    .long 44 @ 0x2c
 ; CHECK-NEXT:    .long 47 @ 0x2f
-; CHECK-NEXT:  .LCPI13_1:
+; CHECK-NEXT:  .LCPI15_1:
 ; CHECK-NEXT:    .long 14 @ 0xe
 ; CHECK-NEXT:    .long 17 @ 0x11
 ; CHECK-NEXT:    .long 20 @ 0x14
 ; CHECK-NEXT:    .long 23 @ 0x17
-; CHECK-NEXT:  .LCPI13_2:
+; CHECK-NEXT:  .LCPI15_2:
 ; CHECK-NEXT:    .long 24 @ 0x18
 ; CHECK-NEXT:    .long 27 @ 0x1b
 ; CHECK-NEXT:    .long 30 @ 0x1e
 ; CHECK-NEXT:    .long 33 @ 0x21
-; CHECK-NEXT:  .LCPI13_3:
+; CHECK-NEXT:  .LCPI15_3:
 ; CHECK-NEXT:    .long 1 @ 0x1
 ; CHECK-NEXT:    .long 4 @ 0x4
 ; CHECK-NEXT:    .long 7 @ 0x7
 ; CHECK-NEXT:    .long 10 @ 0xa
-; CHECK-NEXT:  .LCPI13_4:
+; CHECK-NEXT:  .LCPI15_4:
 ; CHECK-NEXT:    .long 36 @ 0x24
 ; CHECK-NEXT:    .long 39 @ 0x27
 ; CHECK-NEXT:    .long 42 @ 0x2a
 ; CHECK-NEXT:    .long 45 @ 0x2d
-; CHECK-NEXT:  .LCPI13_5:
+; CHECK-NEXT:  .LCPI15_5:
 ; CHECK-NEXT:    .long 25 @ 0x19
 ; CHECK-NEXT:    .long 28 @ 0x1c
 ; CHECK-NEXT:    .long 31 @ 0x1f
 ; CHECK-NEXT:    .long 34 @ 0x22
-; CHECK-NEXT:  .LCPI13_6:
+; CHECK-NEXT:  .LCPI15_6:
 ; CHECK-NEXT:    .long 13 @ 0xd
 ; CHECK-NEXT:    .long 16 @ 0x10
 ; CHECK-NEXT:    .long 19 @ 0x13
 ; CHECK-NEXT:    .long 22 @ 0x16
-; CHECK-NEXT:  .LCPI13_7:
+; CHECK-NEXT:  .LCPI15_7:
 ; CHECK-NEXT:    .long 2 @ 0x2
 ; CHECK-NEXT:    .long 5 @ 0x5
 ; CHECK-NEXT:    .long 8 @ 0x8
 ; CHECK-NEXT:    .long 11 @ 0xb
-; CHECK-NEXT:  .LCPI13_8:
+; CHECK-NEXT:  .LCPI15_8:
 ; CHECK-NEXT:    .long 26 @ 0x1a
 ; CHECK-NEXT:    .long 29 @ 0x1d
 ; CHECK-NEXT:    .long 32 @ 0x20
 ; CHECK-NEXT:    .long 35 @ 0x23
-; CHECK-NEXT:  .LCPI13_9:
+; CHECK-NEXT:  .LCPI15_9:
 ; CHECK-NEXT:    .long 37 @ 0x25
 ; CHECK-NEXT:    .long 40 @ 0x28
 ; CHECK-NEXT:    .long 43 @ 0x2b
 ; CHECK-NEXT:    .long 46 @ 0x2e
-; CHECK-NEXT:  .LCPI13_10:
+; CHECK-NEXT:  .LCPI15_10:
 ; CHECK-NEXT:    .long 12 @ 0xc
 ; CHECK-NEXT:    .long 15 @ 0xf
 ; CHECK-NEXT:    .long 18 @ 0x12
 ; CHECK-NEXT:    .long 21 @ 0x15
-; CHECK-NEXT:  .LCPI13_11:
+; CHECK-NEXT:  .LCPI15_11:
 ; CHECK-NEXT:    .long 0 @ 0x0
 ; CHECK-NEXT:    .long 3 @ 0x3
 ; CHECK-NEXT:    .long 6 @ 0x6
@@ -1238,14 +1293,14 @@ define arm_aapcs_vfpcc void @gather_inc_v16i8_simple(ptr noalias nocapture reado
 ; CHECK-NEXT:    sub sp, #64
 ; CHECK-NEXT:    cmp r2, #1
 ; CHECK-NEXT:    strd r1, r2, [sp, #56] @ 8-byte Folded Spill
-; CHECK-NEXT:    blt.w .LBB14_5
+; CHECK-NEXT:    blt.w .LBB16_5
 ; CHECK-NEXT:  @ %bb.1: @ %vector.ph.preheader
-; CHECK-NEXT:    adr r5, .LCPI14_3
-; CHECK-NEXT:    adr r7, .LCPI14_1
+; CHECK-NEXT:    adr r5, .LCPI16_3
+; CHECK-NEXT:    adr r7, .LCPI16_1
 ; CHECK-NEXT:    vldrw.u32 q0, [r5]
 ; CHECK-NEXT:    ldr r1, [sp, #60] @ 4-byte Reload
-; CHECK-NEXT:    adr r3, .LCPI14_0
-; CHECK-NEXT:    adr r6, .LCPI14_2
+; CHECK-NEXT:    adr r3, .LCPI16_0
+; CHECK-NEXT:    adr r6, .LCPI16_2
 ; CHECK-NEXT:    vstrw.32 q0, [sp, #32] @ 16-byte Spill
 ; CHECK-NEXT:    vldrw.u32 q0, [r7]
 ; CHECK-NEXT:    bic r9, r1, #7
@@ -1255,16 +1310,16 @@ define arm_aapcs_vfpcc void @gather_inc_v16i8_simple(ptr noalias nocapture reado
 ; CHECK-NEXT:    mov.w lr, #16
 ; CHECK-NEXT:    str.w r9, [sp, #52] @ 4-byte Spill
 ; CHECK-NEXT:    vstrw.32 q0, [sp] @ 16-byte Spill
-; CHECK-NEXT:  .LBB14_2: @ %vector.ph
+; CHECK-NEXT:  .LBB16_2: @ %vector.ph
 ; CHECK-NEXT:    @ =>This Loop Header: Depth=1
-; CHECK-NEXT:    @ Child Loop BB14_3 Depth 2
+; CHECK-NEXT:    @ Child Loop BB16_3 Depth 2
 ; CHECK-NEXT:    ldr.w r8, [sp, #56] @ 4-byte Reload
 ; CHECK-NEXT:    vldrw.u32 q5, [sp] @ 16-byte Reload
 ; CHECK-NEXT:    vldrw.u32 q0, [sp, #16] @ 16-byte Reload
 ; CHECK-NEXT:    vldrw.u32 q7, [sp, #32] @ 16-byte Reload
 ; CHECK-NEXT:    vmov q4, q3
-; CHECK-NEXT:  .LBB14_3: @ %vector.body
-; CHECK-NEXT:    @ Parent Loop BB14_2 Depth=1
+; CHECK-NEXT:  .LBB16_3: @ %vector.body
+; CHECK-NEXT:    @ Parent Loop BB16_2 Depth=1
 ; CHECK-NEXT:    @ => This Inner Loop Header: Depth=2
 ; CHECK-NEXT:    vadd.i32 q1, q5, r0
 ; CHECK-NEXT:    vadd.i32 q2, q4, r0
@@ -1318,36 +1373,36 @@ define arm_aapcs_vfpcc void @gather_inc_v16i8_simple(ptr noalias nocapture reado
 ; CHECK-NEXT:    vmov.8 q0[15], r12
 ; CHECK-NEXT:    vstrb.8 q0, [r8], #16
 ; CHECK-NEXT:    vmov q0, q6
-; CHECK-NEXT:    bne .LBB14_3
+; CHECK-NEXT:    bne .LBB16_3
 ; CHECK-NEXT:  @ %bb.4: @ %middle.block
-; CHECK-NEXT:    @ in Loop: Header=BB14_2 Depth=1
+; CHECK-NEXT:    @ in Loop: Header=BB16_2 Depth=1
 ; CHECK-NEXT:    ldr r1, [sp, #60] @ 4-byte Reload
 ; CHECK-NEXT:    ldr.w r9, [sp, #52] @ 4-byte Reload
 ; CHECK-NEXT:    cmp r9, r1
-; CHECK-NEXT:    bne .LBB14_2
-; CHECK-NEXT:  .LBB14_5: @ %for.cond.cleanup
+; CHECK-NEXT:    bne .LBB16_2
+; CHECK-NEXT:  .LBB16_5: @ %for.cond.cleanup
 ; CHECK-NEXT:    add sp, #64
 ; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT:    add sp, #4
 ; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
 ; CHECK-NEXT:    .p2align 4
 ; CHECK-NEXT:  @ %bb.6:
-; CHECK-NEXT:  .LCPI14_0:
+; CHECK-NEXT:  .LCPI16_0:
 ; CHECK-NEXT:    .long 0 @ 0x0
 ; CHECK-NEXT:    .long 1 @ 0x1
 ; CHECK-NEXT:    .long 2 @ 0x2
 ; CHECK-NEXT:    .long 3 @ 0x3
-; CHECK-NEXT:  .LCPI14_1:
+; CHECK-NEXT:  .LCPI16_1:
 ; CHECK-NEXT:    .long 8 @ 0x8
 ; CHECK-NEXT:    .long 9 @ 0x9
 ; CHECK-NEXT:    .long 10 @ 0xa
 ; CHECK-NEXT:    .long 11 @ 0xb
-; CHECK-NEXT:  .LCPI14_2:
+; CHECK-NEXT:  .LCPI16_2:
 ; CHECK-NEXT:    .long 4 @ 0x4
 ; CHECK-NEXT:    .long 5 @ 0x5
 ; CHECK-NEXT:    .long 6 @ 0x6
 ; CHECK-NEXT:    .long 7 @ 0x7
-; CHECK-NEXT:  .LCPI14_3:
+; CHECK-NEXT:  .LCPI16_3:
 ; CHECK-NEXT:    .long 12 @ 0xc
 ; CHECK-NEXT:    .long 13 @ 0xd
 ; CHECK-NEXT:    .long 14 @ 0xe
@@ -1390,21 +1445,21 @@ define void @shl(ptr nocapture %x, ptr noalias nocapture readonly %y, i32 %n) {
 ; CHECK-NEXT:    cmp r2, #1
 ; CHECK-NEXT:    it lt
 ; CHECK-NEXT:    poplt {r7, pc}
-; CHECK-NEXT:  .LBB15_1: @ %vector.ph
-; CHECK-NEXT:    adr r3, .LCPI15_0
+; CHECK-NEXT:  .LBB17_1: @ %vector.ph
+; CHECK-NEXT:    adr r3, .LCPI17_0
 ; CHECK-NEXT:    vldrw.u32 q0, [r3]
 ; CHECK-NEXT:    vadd.i32 q0, q0, r1
 ; CHECK-NEXT:    dlstp.32 lr, r2
-; CHECK-NEXT:  .LBB15_2: @ %vector.body
+; CHECK-NEXT:  .LBB17_2: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vldrw.u32 q1, [q0, #64]!
 ; CHECK-NEXT:    vstrw.32 q1, [r0], #16
-; CHECK-NEXT:    letp lr, .LBB15_2
+; CHECK-NEXT:    letp lr, .LBB17_2
 ; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
 ; CHECK-NEXT:    pop {r7, pc}
 ; CHECK-NEXT:    .p2align 4
 ; CHECK-NEXT:  @ %bb.4:
-; CHECK-NEXT:  .LCPI15_0:
+; CHECK-NEXT:  .LCPI17_0:
 ; CHECK-NEXT:    .long 4294967232 @ 0xffffffc0
 ; CHECK-NEXT:    .long 4294967248 @ 0xffffffd0
 ; CHECK-NEXT:    .long 4294967264 @ 0xffffffe0
@@ -1444,12 +1499,12 @@ define void @shlor(ptr nocapture %x, ptr noalias nocapture readonly %y, i32 %n)
 ; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13}
 ; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13}
 ; CHECK-NEXT:    cmp r2, #1
-; CHECK-NEXT:    blt .LBB16_3
+; CHECK-NEXT:    blt .LBB18_3
 ; CHECK-NEXT:  @ %bb.1: @ %vector.ph
-; CHECK-NEXT:    adr.w lr, .LCPI16_0
-; CHECK-NEXT:    adr r4, .LCPI16_1
-; CHECK-NEXT:    adr r5, .LCPI16_2
-; CHECK-NEXT:    adr r6, .LCPI16_3
+; CHECK-NEXT:    adr.w lr, .LCPI18_0
+; CHECK-NEXT:    adr r4, .LCPI18_1
+; CHECK-NEXT:    adr r5, .LCPI18_2
+; CHECK-NEXT:    adr r6, .LCPI18_3
 ; CHECK-NEXT:    vldrw.u32 q0, [r6]
 ; CHECK-NEXT:    vldrw.u32 q1, [r5]
 ; CHECK-NEXT:    vldrw.u32 q2, [r4]
@@ -1459,7 +1514,7 @@ define void @shlor(ptr nocapture %x, ptr noalias nocapture readonly %y, i32 %n)
 ; CHECK-NEXT:    vadd.i32 q2, q2, r1
 ; CHECK-NEXT:    vadd.i32 q3, q3, r1
 ; CHECK-NEXT:    dlstp.32 lr, r2
-; CHECK-NEXT:  .LBB16_2: @ %vector.body
+; CHECK-NEXT:  .LBB18_2: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vldrw.u32 q4, [q3, #128]!
 ; CHECK-NEXT:    vldrw.u32 q5, [q2, #128]!
@@ -1469,28 +1524,28 @@ define void @shlor(ptr nocapture %x, ptr noalias nocapture readonly %y, i32 %n)
 ; CHECK-NEXT:    vadd.i32 q4, q4, q5
 ; CHECK-NEXT:    vadd.i32 q4, q4, q6
 ; CHECK-NEXT:    vstrw.32 q4, [r0], #16
-; CHECK-NEXT:    letp lr, .LBB16_2
-; CHECK-NEXT:  .LBB16_3: @ %for.cond.cleanup
+; CHECK-NEXT:    letp lr, .LBB18_2
+; CHECK-NEXT:  .LBB18_3: @ %for.cond.cleanup
 ; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13}
 ; CHECK-NEXT:    pop {r4, r5, r6, pc}
 ; CHECK-NEXT:    .p2align 4
 ; CHECK-NEXT:  @ %bb.4:
-; CHECK-NEXT:  .LCPI16_0:
+; CHECK-NEXT:  .LCPI18_0:
 ; CHECK-NEXT:    .long 4294967168 @ 0xffffff80
 ; CHECK-NEXT:    .long 4294967200 @ 0xffffffa0
 ; CHECK-NEXT:    .long 4294967232 @ 0xffffffc0
 ; CHECK-NEXT:    .long 4294967264 @ 0xffffffe0
-; CHECK-NEXT:  .LCPI16_1:
+; CHECK-NEXT:  .LCPI18_1:
 ; CHECK-NEXT:    .long 4294967176 @ 0xffffff88
 ; CHECK-NEXT:    .long 4294967208 @ 0xffffffa8
 ; CHECK-NEXT:    .long 4294967240 @ 0xffffffc8
 ; CHECK-NEXT:    .long 4294967272 @ 0xffffffe8
-; CHECK-NEXT:  .LCPI16_2:
+; CHECK-NEXT:  .LCPI18_2:
 ; CHECK-NEXT:    .long 4294967184 @ 0xffffff90
 ; CHECK-NEXT:    .long 4294967216 @ 0xffffffb0
 ; CHECK-NEXT:    .long 4294967248 @ 0xffffffd0
 ; CHECK-NEXT:    .long 4294967280 @ 0xfffffff0
-; CHECK-NEXT:  .LCPI16_3:
+; CHECK-NEXT:  .LCPI18_3:
 ; CHECK-NEXT:    .long 4294967192 @ 0xffffff98
 ; CHECK-NEXT:    .long 4294967224 @ 0xffffffb8
 ; CHECK-NEXT:    .long 4294967256 @ 0xffffffd8
diff --git a/llvm/test/CodeGen/X86/2011-06-03-x87chain.ll b/llvm/test/CodeGen/X86/2011-06-03-x87chain.ll
index 67fd59e..ed3dcad 100644
--- a/llvm/test/CodeGen/X86/2011-06-03-x87chain.ll
+++ b/llvm/test/CodeGen/X86/2011-06-03-x87chain.ll
@@ -30,9 +30,9 @@ entry:
   ret float %conv
 }
 
-define void @PR17495() {
+define void @PR17495(i1 %arg) {
 entry:
-  br i1 undef, label %while.end, label %while.body
+  br i1 %arg, label %while.end, label %while.body
 
 while.body:                                       ; preds = %while.body, %entry
   %x.1.copyload = load i24, ptr undef, align 1
diff --git a/llvm/test/CodeGen/X86/2020_12_02_decrementing_loop.ll b/llvm/test/CodeGen/X86/2020_12_02_decrementing_loop.ll
index 22bf458..49de509 100644
--- a/llvm/test/CodeGen/X86/2020_12_02_decrementing_loop.ll
+++ b/llvm/test/CodeGen/X86/2020_12_02_decrementing_loop.ll
@@ -165,7 +165,7 @@ failure:                                          ; preds = %backedge
   unreachable
 }
 
-define void @test_04() {
+define void @test_04(i32 %arg) {
 ; CHECK-LABEL: test_04:
 ; CHECK:       ## %bb.0: ## %bb
 ; CHECK-NEXT:    ud2
@@ -175,7 +175,7 @@ bb:
 bb1:                                              ; preds = %bb10, %bb
   %tmp = phi i64 [ 1, %bb ], [ %tmp2, %bb10 ]
   %tmp2 = add nuw nsw i64 %tmp, 1
-  br i1 undef, label %bb21, label %bb7
+  br i1 poison, label %bb21, label %bb7
 
 bb7:                                              ; preds = %bb1
   %tmp8 = add nsw i64 %tmp, -1
@@ -187,7 +187,7 @@ bb10:                                             ; preds = %bb16
   br label %bb1
 
 bb11:                                             ; preds = %bb16, %bb7
-  switch i32 undef, label %bb19 [
+  switch i32 %arg, label %bb19 [
     i32 0, label %bb17
     i32 1, label %bb16
     i32 2, label %bb15
@@ -205,7 +205,7 @@ bb15:                                             ; preds = %bb11
   unreachable
 
 bb16:                                             ; preds = %bb11
-  br i1 undef, label %bb10, label %bb11
+  br i1 poison, label %bb10, label %bb11
 
 bb17:                                             ; preds = %bb11
   unreachable
diff --git a/llvm/test/CodeGen/X86/AMX/amx-combine-undef.ll b/llvm/test/CodeGen/X86/AMX/amx-combine-undef.ll
index 86874b1..faa119c 100644
--- a/llvm/test/CodeGen/X86/AMX/amx-combine-undef.ll
+++ b/llvm/test/CodeGen/X86/AMX/amx-combine-undef.ll
@@ -5,13 +5,13 @@ define void @undef_2phi(ptr%buf) {
 ; CHECK-LABEL: @undef_2phi(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = call x86_amx @llvm.x86.tilezero.internal(i16 8, i16 32)
-; CHECK-NEXT:    br i1 undef, label [[L1:%.*]], label [[L2:%.*]]
+; CHECK-NEXT:    br i1 poison, label [[L1:%.*]], label [[L2:%.*]]
 ; CHECK:       l1:
 ; CHECK-NEXT:    [[T1:%.*]] = call x86_amx @llvm.x86.tilezero.internal(i16 8, i16 32)
-; CHECK-NEXT:    br i1 undef, label [[L2]], label [[L3:%.*]]
+; CHECK-NEXT:    br i1 poison, label [[L2]], label [[L3:%.*]]
 ; CHECK:       l2:
 ; CHECK-NEXT:    [[TMP1:%.*]] = phi x86_amx [ [[TMP0]], [[ENTRY:%.*]] ], [ [[T1]], [[L1]] ]
-; CHECK-NEXT:    br i1 undef, label [[L3]], label [[EXIT:%.*]]
+; CHECK-NEXT:    br i1 poison, label [[L3]], label [[EXIT:%.*]]
 ; CHECK:       l3:
 ; CHECK-NEXT:    [[TMP2:%.*]] = phi x86_amx [ [[TMP1]], [[L2]] ], [ [[T1]], [[L1]] ]
 ; CHECK-NEXT:    call void @llvm.x86.tilestored64.internal(i16 8, i16 32, ptr [[BUF:%.*]], i64 1024, x86_amx [[TMP2]])
@@ -20,16 +20,16 @@ define void @undef_2phi(ptr%buf) {
 ; CHECK-NEXT:    ret void
 ;
 entry:
-  br i1 undef, label %l1, label %l2
+  br i1 poison, label %l1, label %l2
 
 l1:
   %t1 = call x86_amx @llvm.x86.tilezero.internal(i16 8, i16 32)
   %t2 = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %t1)
-  br i1 undef, label %l2, label %l3
+  br i1 poison, label %l2, label %l3
 
 l2:
-  %t3 = phi <256 x i32> [ undef, %entry ], [ %t2, %l1 ]
-  br i1 undef, label %l3, label %exit
+  %t3 = phi <256 x i32> [ poison, %entry ], [ %t2, %l1 ]
+  br i1 poison, label %l3, label %exit
 
 l3:
   %t4 = phi <256 x i32> [ %t3, %l2], [ %t2, %l1 ]
@@ -45,10 +45,10 @@ define void @foo_undef(ptr%buf) {
 ; CHECK-LABEL: @foo_undef(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = call x86_amx @llvm.x86.tilezero.internal(i16 8, i16 32)
-; CHECK-NEXT:    br i1 undef, label [[L1:%.*]], label [[L2:%.*]]
+; CHECK-NEXT:    br i1 poison, label [[L1:%.*]], label [[L2:%.*]]
 ; CHECK:       l1:
 ; CHECK-NEXT:    [[T1:%.*]] = call x86_amx @llvm.x86.tilezero.internal(i16 8, i16 32)
-; CHECK-NEXT:    br i1 undef, label [[L2]], label [[EXIT:%.*]]
+; CHECK-NEXT:    br i1 poison, label [[L2]], label [[EXIT:%.*]]
 ; CHECK:       l2:
 ; CHECK-NEXT:    [[TMP1:%.*]] = phi x86_amx [ [[TMP0]], [[ENTRY:%.*]] ], [ [[T1]], [[L1]] ]
 ; CHECK-NEXT:    call void @llvm.x86.tilestored64.internal(i16 8, i16 32, ptr [[BUF:%.*]], i64 1024, x86_amx [[TMP1]])
@@ -57,15 +57,15 @@ define void @foo_undef(ptr%buf) {
 ; CHECK-NEXT:    ret void
 ;
 entry:
-  br i1 undef, label %l1, label %l2
+  br i1 poison, label %l1, label %l2
 
 l1:
   %t1 = call x86_amx @llvm.x86.tilezero.internal(i16 8, i16 32)
   %t2 = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %t1)
-  br i1 undef, label %l2, label %exit
+  br i1 poison, label %l2, label %exit
 
 l2:
-  %t3 = phi <256 x i32> [ undef, %entry ], [ %t2, %l1 ]
+  %t3 = phi <256 x i32> [ poison, %entry ], [ %t2, %l1 ]
   %t4 = call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %t3)
   call void @llvm.x86.tilestored64.internal(i16 8, i16 32, ptr %buf, i64 1024, x86_amx %t4)
   br label %exit
@@ -78,10 +78,10 @@ define void @foo_zero(ptr%buf) {
 ; CHECK-LABEL: @foo_zero(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = call x86_amx @llvm.x86.tilezero.internal(i16 8, i16 32)
-; CHECK-NEXT:    br i1 undef, label [[L1:%.*]], label [[L2:%.*]]
+; CHECK-NEXT:    br i1 poison, label [[L1:%.*]], label [[L2:%.*]]
 ; CHECK:       l1:
 ; CHECK-NEXT:    [[T1:%.*]] = call x86_amx @llvm.x86.tilezero.internal(i16 8, i16 32)
-; CHECK-NEXT:    br i1 undef, label [[L2]], label [[EXIT:%.*]]
+; CHECK-NEXT:    br i1 poison, label [[L2]], label [[EXIT:%.*]]
 ; CHECK:       l2:
 ; CHECK-NEXT:    [[TMP1:%.*]] = phi x86_amx [ [[TMP0]], [[ENTRY:%.*]] ], [ [[T1]], [[L1]] ]
 ; CHECK-NEXT:    call void @llvm.x86.tilestored64.internal(i16 8, i16 32, ptr [[BUF:%.*]], i64 1024, x86_amx [[TMP1]])
@@ -90,12 +90,12 @@ define void @foo_zero(ptr%buf) {
 ; CHECK-NEXT:    ret void
 ;
 entry:
-  br i1 undef, label %l1, label %l2
+  br i1 poison, label %l1, label %l2
 
 l1:
   %t1 = call x86_amx @llvm.x86.tilezero.internal(i16 8, i16 32)
   %t2 = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %t1)
-  br i1 undef, label %l2, label %exit
+  br i1 poison, label %l2, label %exit
 
 l2:
   %t3 = phi <256 x i32> [ zeroinitializer, %entry ], [ %t2, %l1 ]
@@ -112,14 +112,14 @@ define void @foo_vrow(ptr%buf, i16 %row) {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = alloca <256 x i32>, align 64
 ; CHECK-NEXT:    [[TMP1:%.*]] = alloca <256 x i32>, align 64
-; CHECK-NEXT:    br i1 undef, label [[L1:%.*]], label [[L2:%.*]]
+; CHECK-NEXT:    br i1 poison, label [[L1:%.*]], label [[L2:%.*]]
 ; CHECK:       l1:
 ; CHECK-NEXT:    [[T1:%.*]] = call x86_amx @llvm.x86.tilezero.internal(i16 [[ROW:%.*]], i16 32)
 ; CHECK-NEXT:    call void @llvm.x86.tilestored64.internal(i16 [[ROW]], i16 32, ptr [[TMP1]], i64 32, x86_amx [[T1]])
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <256 x i32>, ptr [[TMP1]], align 1024
-; CHECK-NEXT:    br i1 undef, label [[L2]], label [[EXIT:%.*]]
+; CHECK-NEXT:    br i1 poison, label [[L2]], label [[EXIT:%.*]]
 ; CHECK:       l2:
-; CHECK-NEXT:    [[T3:%.*]] = phi <256 x i32> [ undef, [[ENTRY:%.*]] ], [ [[TMP3]], [[L1]] ]
+; CHECK-NEXT:    [[T3:%.*]] = phi <256 x i32> [ poison, [[ENTRY:%.*]] ], [ [[TMP3]], [[L1]] ]
 ; CHECK-NEXT:    store <256 x i32> [[T3]], ptr [[TMP0]], align 1024
 ; CHECK-NEXT:    [[TMP5:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[ROW]], i16 32, ptr [[TMP0]], i64 32)
 ; CHECK-NEXT:    call void @llvm.x86.tilestored64.internal(i16 [[ROW]], i16 32, ptr [[BUF:%.*]], i64 1024, x86_amx [[TMP5]])
@@ -128,15 +128,15 @@ define void @foo_vrow(ptr%buf, i16 %row) {
 ; CHECK-NEXT:    ret void
 ;
 entry:
-  br i1 undef, label %l1, label %l2
+  br i1 poison, label %l1, label %l2
 
 l1:
   %t1 = call x86_amx @llvm.x86.tilezero.internal(i16 %row, i16 32)
   %t2 = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %t1)
-  br i1 undef, label %l2, label %exit
+  br i1 poison, label %l2, label %exit
 
 l2:
-  %t3 = phi <256 x i32> [ undef, %entry ], [ %t2, %l1 ]
+  %t3 = phi <256 x i32> [ poison, %entry ], [ %t2, %l1 ]
   %t4 = call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %t3)
   call void @llvm.x86.tilestored64.internal(i16 %row, i16 32, ptr %buf, i64 1024, x86_amx %t4)
   br label %exit
@@ -150,13 +150,13 @@ define void @foo_vcol(ptr%buf, i16 %col) {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = alloca <256 x i32>, align 64
 ; CHECK-NEXT:    [[TMP1:%.*]] = alloca <256 x i32>, align 64
-; CHECK-NEXT:    br i1 undef, label [[L1:%.*]], label [[L2:%.*]]
+; CHECK-NEXT:    br i1 poison, label [[L1:%.*]], label [[L2:%.*]]
 ; CHECK:       l1:
 ; CHECK-NEXT:    [[T1:%.*]] = call x86_amx @llvm.x86.tilezero.internal(i16 8, i16 [[COL:%.*]])
 ; CHECK-NEXT:    [[TMP3:%.*]] = sext i16 [[COL]] to i64
 ; CHECK-NEXT:    call void @llvm.x86.tilestored64.internal(i16 8, i16 [[COL]], ptr [[TMP1]], i64 [[TMP3]], x86_amx [[T1]])
 ; CHECK-NEXT:    [[TMP4:%.*]] = load <256 x i32>, ptr [[TMP1]], align 1024
-; CHECK-NEXT:    br i1 undef, label [[L2]], label [[EXIT:%.*]]
+; CHECK-NEXT:    br i1 poison, label [[L2]], label [[EXIT:%.*]]
 ; CHECK:       l2:
 ; CHECK-NEXT:    [[T3:%.*]] = phi <256 x i32> [ zeroinitializer, [[ENTRY:%.*]] ], [ [[TMP4]], [[L1]] ]
 ; CHECK-NEXT:    store <256 x i32> [[T3]], ptr [[TMP0]], align 1024
@@ -168,12 +168,12 @@ define void @foo_vcol(ptr%buf, i16 %col) {
 ; CHECK-NEXT:    ret void
 ;
 entry:
-  br i1 undef, label %l1, label %l2
+  br i1 poison, label %l1, label %l2
 
 l1:
   %t1 = call x86_amx @llvm.x86.tilezero.internal(i16 8, i16 %col)
   %t2 = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %t1)
-  br i1 undef, label %l2, label %exit
+  br i1 poison, label %l2, label %exit
 
 l2:
   %t3 = phi <256 x i32> [ zeroinitializer, %entry ], [ %t2, %l1 ]
@@ -189,29 +189,29 @@ define void @noshape(ptr%buf) {
 ; CHECK-LABEL: @noshape(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = alloca <256 x i32>, align 64
-; CHECK-NEXT:    br i1 undef, label [[L1:%.*]], label [[L2:%.*]]
+; CHECK-NEXT:    br i1 poison, label [[L1:%.*]], label [[L2:%.*]]
 ; CHECK:       l1:
 ; CHECK-NEXT:    [[T1:%.*]] = call x86_amx @llvm.x86.tilezero.internal(i16 8, i16 32)
 ; CHECK-NEXT:    call void @llvm.x86.tilestored64.internal(i16 8, i16 32, ptr [[TMP0]], i64 32, x86_amx [[T1]])
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <256 x i32>, ptr [[TMP0]], align 1024
-; CHECK-NEXT:    br i1 undef, label [[L2]], label [[EXIT:%.*]]
+; CHECK-NEXT:    br i1 poison, label [[L2]], label [[EXIT:%.*]]
 ; CHECK:       l2:
-; CHECK-NEXT:    [[T3:%.*]] = phi <256 x i32> [ undef, [[ENTRY:%.*]] ], [ [[TMP2]], [[L1]] ]
+; CHECK-NEXT:    [[T3:%.*]] = phi <256 x i32> [ poison, [[ENTRY:%.*]] ], [ [[TMP2]], [[L1]] ]
 ; CHECK-NEXT:    store <256 x i32> [[T3]], ptr [[BUF:%.*]], align 1024
 ; CHECK-NEXT:    br label [[EXIT]]
 ; CHECK:       exit:
 ; CHECK-NEXT:    ret void
 ;
 entry:
-  br i1 undef, label %l1, label %l2
+  br i1 poison, label %l1, label %l2
 
 l1:
   %t1 = call x86_amx @llvm.x86.tilezero.internal(i16 8, i16 32)
   %t2 = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %t1)
-  br i1 undef, label %l2, label %exit
+  br i1 poison, label %l2, label %exit
 
 l2:
-  %t3 = phi <256 x i32> [ undef, %entry ], [ %t2, %l1 ]
+  %t3 = phi <256 x i32> [ poison, %entry ], [ %t2, %l1 ]
   %t4 = call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %t3)
   %t5 = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %t4)
   store <256 x i32> %t5, ptr %buf
@@ -225,14 +225,14 @@ define void @noshape2(ptr%buf) {
 ; CHECK-LABEL: @noshape2(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = alloca <256 x i32>, align 64
-; CHECK-NEXT:    br i1 undef, label [[L1:%.*]], label [[L2:%.*]]
+; CHECK-NEXT:    br i1 poison, label [[L1:%.*]], label [[L2:%.*]]
 ; CHECK:       l1:
 ; CHECK-NEXT:    [[T1:%.*]] = call x86_amx @llvm.x86.tilezero.internal(i16 8, i16 32)
 ; CHECK-NEXT:    call void @llvm.x86.tilestored64.internal(i16 8, i16 32, ptr [[TMP0]], i64 32, x86_amx [[T1]])
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <256 x i32>, ptr [[TMP0]], align 1024
-; CHECK-NEXT:    br i1 undef, label [[L2]], label [[EXIT:%.*]]
+; CHECK-NEXT:    br i1 poison, label [[L2]], label [[EXIT:%.*]]
 ; CHECK:       l2:
-; CHECK-NEXT:    [[T3:%.*]] = phi <256 x i32> [ undef, [[ENTRY:%.*]] ], [ [[TMP2]], [[L1]] ]
+; CHECK-NEXT:    [[T3:%.*]] = phi <256 x i32> [ poison, [[ENTRY:%.*]] ], [ [[TMP2]], [[L1]] ]
 ; CHECK-NEXT:    [[T6:%.*]] = call <256 x i32> @llvm.abs.v256i32(<256 x i32> [[T3]], i1 true)
 ; CHECK-NEXT:    store <256 x i32> [[T6]], ptr [[BUF:%.*]], align 1024
 ; CHECK-NEXT:    br label [[EXIT]]
@@ -240,15 +240,15 @@ define void @noshape2(ptr%buf) {
 ; CHECK-NEXT:    ret void
 ;
 entry:
-  br i1 undef, label %l1, label %l2
+  br i1 poison, label %l1, label %l2
 
 l1:
   %t1 = call x86_amx @llvm.x86.tilezero.internal(i16 8, i16 32)
   %t2 = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %t1)
-  br i1 undef, label %l2, label %exit
+  br i1 poison, label %l2, label %exit
 
 l2:
-  %t3 = phi <256 x i32> [ undef, %entry ], [ %t2, %l1 ]
+  %t3 = phi <256 x i32> [ poison, %entry ], [ %t2, %l1 ]
   %t4 = call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %t3)
   %t5 = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %t4)
   %t6 = call <256 x i32> @llvm.abs.v256i32(<256 x i32> %t5, i1 1)
diff --git a/llvm/test/CodeGen/X86/AMX/lat-combine-amx-bitcast.ll b/llvm/test/CodeGen/X86/AMX/lat-combine-amx-bitcast.ll
index b2eb5fd..b70668f 100644
--- a/llvm/test/CodeGen/X86/AMX/lat-combine-amx-bitcast.ll
+++ b/llvm/test/CodeGen/X86/AMX/lat-combine-amx-bitcast.ll
@@ -18,14 +18,14 @@ wrapper_entry:
 
 ; Cases where amxcast can be combined across bb
 ; %5 and %6 is combined together since %goodphi's incoming is phi or amxcast
-define void @combine_amx_cast_and_phi() {
+define void @combine_amx_cast_and_phi(i1 %arg) {
 ; CHECK-LABEL: @combine_amx_cast_and_phi(
 ; CHECK-NEXT:  wrapper_entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = alloca <560 x i8>, align 64
 ; CHECK-NEXT:    [[TMP1:%.*]] = alloca <616 x i8>, align 64
 ; CHECK-NEXT:    [[TMP2:%.*]] = alloca <110 x i32>, align 64
 ; CHECK-NEXT:    [[TMP3:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 11, i16 40, ptr undef, i64 undef)
-; CHECK-NEXT:    br i1 undef, label [[FOR_COND_CLEANUP_I_I:%.*]], label [[FOR_BODY_I_LR_PH_I:%.*]]
+; CHECK-NEXT:    br i1 [[ARG:%.*]], label [[FOR_COND_CLEANUP_I_I:%.*]], label [[FOR_BODY_I_LR_PH_I:%.*]]
 ; CHECK:       for.body.i.lr.ph.i:
 ; CHECK-NEXT:    store <110 x i32> undef, ptr [[TMP2]], align 512
 ; CHECK-NEXT:    [[TMP5:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 11, i16 40, ptr [[TMP2]], i64 40)
@@ -43,7 +43,7 @@ define void @combine_amx_cast_and_phi() {
 wrapper_entry:
   %0 = call x86_amx @llvm.x86.tileloadd64.internal(i16 11, i16 40, ptr undef, i64 undef)
   %tmp = call <110 x i32> @llvm.x86.cast.tile.to.vector.v110i32(x86_amx %0)
-  br i1 undef, label %for.cond.cleanup.i.i, label %for.body.i.lr.ph.i
+  br i1 %arg, label %for.cond.cleanup.i.i, label %for.body.i.lr.ph.i
 
 for.body.i.lr.ph.i:                               ; preds = %wrapper_entry
   %1 = call x86_amx @llvm.x86.cast.vector.to.tile.v110i32(<110 x i32> undef)
@@ -62,7 +62,7 @@ for.cond.cleanup.i.i:                             ; preds = %for.body.i.lr.ph.i,
 
 ; Cases where amxcast can't be combined across bb
 ; %5 and %6 is not combined together since %evilphi's incoming is not phi or amxcast
-define void @fail_to_combine_amx_cast_and_phi(<110 x i32> %tmp) {
+define void @fail_to_combine_amx_cast_and_phi(<110 x i32> %tmp, i1 %arg) {
 ; CHECK-LABEL: @fail_to_combine_amx_cast_and_phi(
 ; CHECK-NEXT:  wrapper_entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = alloca <110 x i32>, align 64
@@ -71,7 +71,7 @@ define void @fail_to_combine_amx_cast_and_phi(<110 x i32> %tmp) {
 ; CHECK-NEXT:    [[TMP3:%.*]] = alloca <616 x i8>, align 64
 ; CHECK-NEXT:    [[TMP4:%.*]] = alloca <110 x i32>, align 64
 ; CHECK-NEXT:    [[TMP5:%.*]] = add <110 x i32> [[TMP:%.*]], [[TMP]]
-; CHECK-NEXT:    br i1 undef, label [[FOR_COND_CLEANUP_I_I:%.*]], label [[FOR_BODY_I_LR_PH_I:%.*]]
+; CHECK-NEXT:    br i1 [[ARG:%.*]], label [[FOR_COND_CLEANUP_I_I:%.*]], label [[FOR_BODY_I_LR_PH_I:%.*]]
 ; CHECK:       for.body.i.lr.ph.i:
 ; CHECK-NEXT:    store <110 x i32> undef, ptr [[TMP4]], align 512
 ; CHECK-NEXT:    [[TMP7:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 11, i16 40, ptr [[TMP4]], i64 40)
@@ -92,7 +92,7 @@ define void @fail_to_combine_amx_cast_and_phi(<110 x i32> %tmp) {
 ;
 wrapper_entry:
   %0 = add <110 x i32> %tmp, %tmp
-  br i1 undef, label %for.cond.cleanup.i.i, label %for.body.i.lr.ph.i
+  br i1 %arg, label %for.cond.cleanup.i.i, label %for.body.i.lr.ph.i
 
 for.body.i.lr.ph.i:                               ; preds = %wrapper_entry
   %1 = call x86_amx @llvm.x86.cast.vector.to.tile.v110i32(<110 x i32> undef)
@@ -111,7 +111,7 @@ for.cond.cleanup.i.i:                             ; preds = %for.body.i.lr.ph.i,
 
 ; Cases where amxcast can't be combined across bb
 ; %5 and %6 is not combined together since %evilphi's user aka %evilphi2 is not inside phi web.
-define void @fail_to_combine_amx_cast_and_phi2() {
+define void @fail_to_combine_amx_cast_and_phi2(i1 %arg) {
 ; CHECK-LABEL: @fail_to_combine_amx_cast_and_phi2(
 ; CHECK-NEXT:  wrapper_entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = alloca <110 x i32>, align 64
@@ -123,7 +123,7 @@ define void @fail_to_combine_amx_cast_and_phi2() {
 ; CHECK-NEXT:    [[TMP6:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 11, i16 40, ptr undef, i64 undef)
 ; CHECK-NEXT:    call void @llvm.x86.tilestored64.internal(i16 11, i16 40, ptr [[TMP5]], i64 40, x86_amx [[TMP6]])
 ; CHECK-NEXT:    [[TMP8:%.*]] = load <110 x i32>, ptr [[TMP5]], align 512
-; CHECK-NEXT:    br i1 undef, label [[FOR_COND_CLEANUP_I_I:%.*]], label [[FOR_BODY_I_LR_PH_I:%.*]]
+; CHECK-NEXT:    br i1 [[ARG:%.*]], label [[FOR_COND_CLEANUP_I_I:%.*]], label [[FOR_BODY_I_LR_PH_I:%.*]]
 ; CHECK:       for.body.i.lr.ph.i:
 ; CHECK-NEXT:    store <110 x i32> undef, ptr [[TMP4]], align 512
 ; CHECK-NEXT:    [[TMP10:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 11, i16 40, ptr [[TMP4]], i64 40)
@@ -134,13 +134,13 @@ define void @fail_to_combine_amx_cast_and_phi2() {
 ; CHECK-NEXT:    [[TMP15:%.*]] = call x86_amx @llvm.x86.tdpbssd.internal(i16 11, i16 40, i16 56, x86_amx [[TMP10]], x86_amx [[TMP12]], x86_amx [[TMP14]])
 ; CHECK-NEXT:    call void @llvm.x86.tilestored64.internal(i16 11, i16 40, ptr [[TMP1]], i64 40, x86_amx [[TMP15]])
 ; CHECK-NEXT:    [[TMP17:%.*]] = load <110 x i32>, ptr [[TMP1]], align 512
-; CHECK-NEXT:    br i1 undef, label [[FOR_COND_CLEANUP_I_I]], label [[EXIT:%.*]]
+; CHECK-NEXT:    br i1 [[ARG]], label [[FOR_COND_CLEANUP_I_I]], label [[EXIT:%.*]]
 ; CHECK:       for.cond.cleanup.i.i:
 ; CHECK-NEXT:    [[GOODPHI:%.*]] = phi <110 x i32> [ [[TMP8]], [[WRAPPER_ENTRY:%.*]] ], [ [[TMP17]], [[FOR_BODY_I_LR_PH_I]] ]
 ; CHECK-NEXT:    store <110 x i32> [[GOODPHI]], ptr [[TMP0]], align 512
 ; CHECK-NEXT:    [[TMP19:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 11, i16 40, ptr [[TMP0]], i64 40)
 ; CHECK-NEXT:    call void @llvm.x86.tilestored64.internal(i16 11, i16 40, ptr undef, i64 undef, x86_amx [[TMP19]])
-; CHECK-NEXT:    br i1 undef, label [[EXIT]], label [[FOR_BODY_I_LR_PH_I]]
+; CHECK-NEXT:    br i1 [[ARG]], label [[EXIT]], label [[FOR_BODY_I_LR_PH_I]]
 ; CHECK:       exit:
 ; CHECK-NEXT:    [[EVILPHI2:%.*]] = phi <110 x i32> [ [[GOODPHI]], [[FOR_COND_CLEANUP_I_I]] ], [ [[TMP17]], [[FOR_BODY_I_LR_PH_I]] ]
 ; CHECK-NEXT:    store <110 x i32> [[EVILPHI2]], ptr undef, align 512
@@ -149,7 +149,7 @@ define void @fail_to_combine_amx_cast_and_phi2() {
 wrapper_entry:
   %0 = call x86_amx @llvm.x86.tileloadd64.internal(i16 11, i16 40, ptr undef, i64 undef)
   %tmp = call <110 x i32> @llvm.x86.cast.tile.to.vector.v110i32(x86_amx %0)
-  br i1 undef, label %for.cond.cleanup.i.i, label %for.body.i.lr.ph.i
+  br i1 %arg, label %for.cond.cleanup.i.i, label %for.body.i.lr.ph.i
 
 for.body.i.lr.ph.i:                               ; preds = %wrapper_entry
   %1 = call x86_amx @llvm.x86.cast.vector.to.tile.v110i32(<110 x i32> undef)
@@ -157,27 +157,27 @@ for.body.i.lr.ph.i:                               ; preds = %wrapper_entry
   %3 = call x86_amx @llvm.x86.cast.vector.to.tile.v560i8(<560 x i8> undef)
   %4 = call x86_amx @llvm.x86.tdpbssd.internal(i16 11, i16 40, i16 56, x86_amx %1, x86_amx %2, x86_amx %3)
   %5 = call <110 x i32> @llvm.x86.cast.tile.to.vector.v110i32(x86_amx %4)
-  br i1 undef, label %for.cond.cleanup.i.i, label %exit
+  br i1 %arg, label %for.cond.cleanup.i.i, label %exit
 
 for.cond.cleanup.i.i:                             ; preds = %for.body.i.lr.ph.i, %wrapper_entry
   %goodphi = phi <110 x i32> [ %tmp, %wrapper_entry ], [ %5, %for.body.i.lr.ph.i ]
   %6 = call x86_amx @llvm.x86.cast.vector.to.tile.v110i32(<110 x i32> %goodphi)
   call void @llvm.x86.tilestored64.internal(i16 11, i16 40, ptr undef, i64 undef, x86_amx %6)
-  br i1 undef, label %exit, label %for.body.i.lr.ph.i
+  br i1 %arg, label %exit, label %for.body.i.lr.ph.i
 exit:
   %evilphi2 = phi <110 x i32> [ %goodphi, %for.cond.cleanup.i.i ], [ %5, %for.body.i.lr.ph.i ]
   store <110 x i32> %evilphi2, ptr undef, align 512
   ret void
 }
 
-define void @fail_to_combine_amx_cast_and_phi_due_to_const_value() {
+define void @fail_to_combine_amx_cast_and_phi_due_to_const_value(i1 %arg) {
 ; CHECK-LABEL: @fail_to_combine_amx_cast_and_phi_due_to_const_value(
 ; CHECK-NEXT:  wrapper_entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = alloca <560 x i8>, align 64
 ; CHECK-NEXT:    [[TMP1:%.*]] = alloca <616 x i8>, align 64
 ; CHECK-NEXT:    [[TMP2:%.*]] = alloca <110 x i32>, align 64
 ; CHECK-NEXT:    [[TMP3:%.*]] = call x86_amx @llvm.x86.tilezero.internal(i16 11, i16 40)
-; CHECK-NEXT:    br i1 undef, label [[FOR_COND_CLEANUP_I_I:%.*]], label [[FOR_BODY_I_LR_PH_I:%.*]]
+; CHECK-NEXT:    br i1 [[ARG:%.*]], label [[FOR_COND_CLEANUP_I_I:%.*]], label [[FOR_BODY_I_LR_PH_I:%.*]]
 ; CHECK:       for.body.i.lr.ph.i:
 ; CHECK-NEXT:    store <110 x i32> undef, ptr [[TMP2]], align 512
 ; CHECK-NEXT:    [[TMP5:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 11, i16 40, ptr [[TMP2]], i64 40)
@@ -193,7 +193,7 @@ define void @fail_to_combine_amx_cast_and_phi_due_to_const_value() {
 ; CHECK-NEXT:    ret void
 ;
 wrapper_entry:
-  br i1 undef, label %for.cond.cleanup.i.i, label %for.body.i.lr.ph.i
+  br i1 %arg, label %for.cond.cleanup.i.i, label %for.body.i.lr.ph.i
 
 for.body.i.lr.ph.i:                               ; preds = %wrapper_entry
   %0 = call x86_amx @llvm.x86.cast.vector.to.tile.v110i32(<110 x i32> undef)
@@ -213,14 +213,14 @@ for.cond.cleanup.i.i:                             ; preds = %for.body.i.lr.ph.i,
 ; Cases where amxcast can be combined across bb
 ; When optimizeAMXCastFromPhi process %6 and %goodphi, %goodphi2 is outside the phi-web, so the optimization stop
 ; When optimizeAMXCastFromPhi process %7 and %goodphi2, the optimization continue.
-define void @combine_amx_cast_and_multiple_phi() {
+define void @combine_amx_cast_and_multiple_phi(i1 %arg) {
 ; CHECK-LABEL: @combine_amx_cast_and_multiple_phi(
 ; CHECK-NEXT:  wrapper_entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = alloca <560 x i8>, align 64
 ; CHECK-NEXT:    [[TMP1:%.*]] = alloca <616 x i8>, align 64
 ; CHECK-NEXT:    [[TMP2:%.*]] = alloca <110 x i32>, align 64
 ; CHECK-NEXT:    [[TMP3:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 11, i16 40, ptr undef, i64 undef)
-; CHECK-NEXT:    br i1 undef, label [[FOR_COND_CLEANUP_I_I:%.*]], label [[FOR_BODY_I_LR_PH_I:%.*]]
+; CHECK-NEXT:    br i1 [[ARG:%.*]], label [[FOR_COND_CLEANUP_I_I:%.*]], label [[FOR_BODY_I_LR_PH_I:%.*]]
 ; CHECK:       for.body.i.lr.ph.i:
 ; CHECK-NEXT:    store <110 x i32> undef, ptr [[TMP2]], align 512
 ; CHECK-NEXT:    [[TMP5:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 11, i16 40, ptr [[TMP2]], i64 40)
@@ -229,11 +229,11 @@ define void @combine_amx_cast_and_multiple_phi() {
 ; CHECK-NEXT:    store <560 x i8> undef, ptr [[TMP0]], align 1024
 ; CHECK-NEXT:    [[TMP9:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 14, i16 40, ptr [[TMP0]], i64 40)
 ; CHECK-NEXT:    [[TMP10:%.*]] = call x86_amx @llvm.x86.tdpbssd.internal(i16 11, i16 40, i16 56, x86_amx [[TMP5]], x86_amx [[TMP7]], x86_amx [[TMP9]])
-; CHECK-NEXT:    br i1 undef, label [[FOR_COND_CLEANUP_I_I]], label [[EXIT:%.*]]
+; CHECK-NEXT:    br i1 [[ARG]], label [[FOR_COND_CLEANUP_I_I]], label [[EXIT:%.*]]
 ; CHECK:       for.cond.cleanup.i.i:
 ; CHECK-NEXT:    [[TMP11:%.*]] = phi x86_amx [ [[TMP3]], [[WRAPPER_ENTRY:%.*]] ], [ [[TMP10]], [[FOR_BODY_I_LR_PH_I]] ]
 ; CHECK-NEXT:    call void @llvm.x86.tilestored64.internal(i16 11, i16 40, ptr undef, i64 undef, x86_amx [[TMP11]])
-; CHECK-NEXT:    br i1 undef, label [[EXIT]], label [[FOR_BODY_I_LR_PH_I]]
+; CHECK-NEXT:    br i1 [[ARG]], label [[EXIT]], label [[FOR_BODY_I_LR_PH_I]]
 ; CHECK:       exit:
 ; CHECK-NEXT:    [[TMP12:%.*]] = phi x86_amx [ [[TMP11]], [[FOR_COND_CLEANUP_I_I]] ], [ [[TMP10]], [[FOR_BODY_I_LR_PH_I]] ]
 ; CHECK-NEXT:    call void @llvm.x86.tilestored64.internal(i16 11, i16 40, ptr undef, i64 undef, x86_amx [[TMP12]])
@@ -242,7 +242,7 @@ define void @combine_amx_cast_and_multiple_phi() {
 wrapper_entry:
   %0 = call x86_amx @llvm.x86.tileloadd64.internal(i16 11, i16 40, ptr undef, i64 undef)
   %tmp = call <110 x i32> @llvm.x86.cast.tile.to.vector.v110i32(x86_amx %0)
-  br i1 undef, label %for.cond.cleanup.i.i, label %for.body.i.lr.ph.i
+  br i1 %arg, label %for.cond.cleanup.i.i, label %for.body.i.lr.ph.i
 
 for.body.i.lr.ph.i:                               ; preds = %wrapper_entry
   %1 = call x86_amx @llvm.x86.cast.vector.to.tile.v110i32(<110 x i32> undef)
@@ -250,13 +250,13 @@ for.body.i.lr.ph.i:                               ; preds = %wrapper_entry
   %3 = call x86_amx @llvm.x86.cast.vector.to.tile.v560i8(<560 x i8> undef)
   %4 = call x86_amx @llvm.x86.tdpbssd.internal(i16 11, i16 40, i16 56, x86_amx %1, x86_amx %2, x86_amx %3)
   %5 = call <110 x i32> @llvm.x86.cast.tile.to.vector.v110i32(x86_amx %4)
-  br i1 undef, label %for.cond.cleanup.i.i, label %exit
+  br i1 %arg, label %for.cond.cleanup.i.i, label %exit
 
 for.cond.cleanup.i.i:                             ; preds = %for.body.i.lr.ph.i, %wrapper_entry
   %goodphi = phi <110 x i32> [ %tmp, %wrapper_entry ], [ %5, %for.body.i.lr.ph.i ]
   %6 = call x86_amx @llvm.x86.cast.vector.to.tile.v110i32(<110 x i32> %goodphi)
   call void @llvm.x86.tilestored64.internal(i16 11, i16 40, ptr undef, i64 undef, x86_amx %6)
-  br i1 undef, label %exit, label %for.body.i.lr.ph.i
+  br i1 %arg, label %exit, label %for.body.i.lr.ph.i
 exit:
   %evilphi2 = phi <110 x i32> [ %goodphi, %for.cond.cleanup.i.i ], [ %5, %for.body.i.lr.ph.i ]
   %7 = call x86_amx @llvm.x86.cast.vector.to.tile.v110i32(<110 x i32> %evilphi2)
@@ -265,7 +265,7 @@ exit:
 }
 
 ; Currently we are not able to delete DeadPHICycle, later we will handle with them
-define void @combine_amx_cast_and_phi_in_a_circle() {
+define void @combine_amx_cast_and_phi_in_a_circle(i1 %arg) {
 ; CHECK-LABEL: @combine_amx_cast_and_phi_in_a_circle(
 ; CHECK-NEXT:  wrapper_entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = alloca <110 x i32>, align 64
@@ -284,7 +284,7 @@ define void @combine_amx_cast_and_phi_in_a_circle() {
 ; CHECK-NEXT:    [[TMP11:%.*]] = call x86_amx @llvm.x86.tdpbssd.internal(i16 11, i16 40, i16 56, x86_amx [[TMP6]], x86_amx [[TMP8]], x86_amx [[TMP10]])
 ; CHECK-NEXT:    call void @llvm.x86.tilestored64.internal(i16 11, i16 40, ptr [[TMP0]], i64 40, x86_amx [[TMP11]])
 ; CHECK-NEXT:    [[TMP13:%.*]] = load <110 x i32>, ptr [[TMP0]], align 512
-; CHECK-NEXT:    br i1 undef, label [[BB2:%.*]], label [[BB3:%.*]]
+; CHECK-NEXT:    br i1 [[ARG:%.*]], label [[BB2:%.*]], label [[BB3:%.*]]
 ; CHECK:       bb2:
 ; CHECK-NEXT:    [[TMP14:%.*]] = phi x86_amx [ [[TMP15:%.*]], [[BB3]] ], [ [[TMP11]], [[BB1]] ]
 ; CHECK-NEXT:    [[GOODPHI:%.*]] = phi <110 x i32> [ [[EVILPHI2:%.*]], [[BB3]] ], [ [[TMP13]], [[BB1]] ]
@@ -294,7 +294,7 @@ define void @combine_amx_cast_and_phi_in_a_circle() {
 ; CHECK-NEXT:    [[TMP15]] = phi x86_amx [ [[TMP14]], [[BB2]] ], [ [[TMP11]], [[BB1]] ]
 ; CHECK-NEXT:    [[EVILPHI2]] = phi <110 x i32> [ [[GOODPHI]], [[BB2]] ], [ [[TMP13]], [[BB1]] ]
 ; CHECK-NEXT:    call void @llvm.x86.tilestored64.internal(i16 11, i16 40, ptr undef, i64 undef, x86_amx [[TMP15]])
-; CHECK-NEXT:    br i1 undef, label [[BB2]], label [[EXIT:%.*]]
+; CHECK-NEXT:    br i1 [[ARG]], label [[BB2]], label [[EXIT:%.*]]
 ; CHECK:       exit:
 ; CHECK-NEXT:    call void @llvm.x86.tilestored64.internal(i16 11, i16 40, ptr undef, i64 undef, x86_amx [[TMP15]])
 ; CHECK-NEXT:    ret void
@@ -310,7 +310,7 @@ bb1:                               ; preds = %wrapper_entry
   %3 = call x86_amx @llvm.x86.cast.vector.to.tile.v560i8(<560 x i8> undef)
   %4 = call x86_amx @llvm.x86.tdpbssd.internal(i16 11, i16 40, i16 56, x86_amx %1, x86_amx %2, x86_amx %3)
   %5 = call <110 x i32> @llvm.x86.cast.tile.to.vector.v110i32(x86_amx %4)
-  br i1 undef, label %bb2, label %bb3
+  br i1 %arg, label %bb2, label %bb3
 
 bb2:                             ; preds = %bb1, %wrapper_entry
   %goodphi = phi <110 x i32> [ %evilphi2, %bb3], [ %5, %bb1 ]
@@ -321,19 +321,19 @@ bb3:
   %evilphi2 = phi <110 x i32> [ %goodphi, %bb2 ], [ %5, %bb1 ]
   %7 = call x86_amx @llvm.x86.cast.vector.to.tile.v110i32(<110 x i32> %evilphi2)
   call void @llvm.x86.tilestored64.internal(i16 11, i16 40, ptr undef, i64 undef, x86_amx %7)
-  br i1 undef, label %bb2, label %exit
+  br i1 %arg, label %bb2, label %exit
 exit:
   %8 = call x86_amx @llvm.x86.cast.vector.to.tile.v110i32(<110 x i32> %evilphi2)
   call void @llvm.x86.tilestored64.internal(i16 11, i16 40, ptr undef, i64 undef, x86_amx %8)
   ret void
 }
 
-define void @eliminate_unused_phi_and_cast() {
+define void @eliminate_unused_phi_and_cast(i1 %arg) {
 ; CHECK-LABEL: @eliminate_unused_phi_and_cast(
 ; CHECK-NEXT:  wrapper_entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = alloca <560 x i8>, align 64
 ; CHECK-NEXT:    [[TMP1:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 11, i16 40, ptr undef, i64 undef)
-; CHECK-NEXT:    br i1 undef, label [[FOR_COND_CLEANUP_I_I:%.*]], label [[FOR_BODY_I_LR_PH_I:%.*]]
+; CHECK-NEXT:    br i1 [[ARG:%.*]], label [[FOR_COND_CLEANUP_I_I:%.*]], label [[FOR_BODY_I_LR_PH_I:%.*]]
 ; CHECK:       for.body.i.lr.ph.i:
 ; CHECK-NEXT:    [[TMP2:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 11, i16 56, ptr undef, i64 undef)
 ; CHECK-NEXT:    [[TMP3:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 14, i16 40, ptr undef, i64 undef)
@@ -349,7 +349,7 @@ define void @eliminate_unused_phi_and_cast() {
 wrapper_entry:
   %0 = call x86_amx @llvm.x86.tileloadd64.internal(i16 11, i16 40, ptr undef, i64 undef)
   %tmp = call <110 x i32> @llvm.x86.cast.tile.to.vector.v110i32(x86_amx %0)
-  br i1 undef, label %for.cond.cleanup.i.i, label %for.body.i.lr.ph.i
+  br i1 %arg, label %for.cond.cleanup.i.i, label %for.body.i.lr.ph.i
 
 for.body.i.lr.ph.i:                               ; preds = %wrapper_entry
   %1 = call x86_amx @llvm.x86.tileloadd64.internal(i16 11, i16 56, ptr undef, i64 undef)
diff --git a/llvm/test/CodeGen/X86/AMX/lat-transform-amx-bitcast.ll b/llvm/test/CodeGen/X86/AMX/lat-transform-amx-bitcast.ll
index 391727d54..3a5b4245 100644
--- a/llvm/test/CodeGen/X86/AMX/lat-transform-amx-bitcast.ll
+++ b/llvm/test/CodeGen/X86/AMX/lat-transform-amx-bitcast.ll
@@ -317,16 +317,16 @@ define dso_local void @__tile_stored(ptr %0, i64 %1, ptr nocapture readonly byva
   ret void
 }
 
-define void @dead_code(ptr%buf) {
+define void @dead_code(ptr%buf, i1 %arg) {
 ; CHECK-LABEL: @dead_code(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = alloca <256 x i32>, align 64
-; CHECK-NEXT:    br i1 undef, label [[L1:%.*]], label [[L2:%.*]]
+; CHECK-NEXT:    br i1 [[ARG:%.*]], label [[L1:%.*]], label [[L2:%.*]]
 ; CHECK:       l1:
 ; CHECK-NEXT:    [[T1:%.*]] = call x86_amx @llvm.x86.tilezero.internal(i16 8, i16 32)
 ; CHECK-NEXT:    call void @llvm.x86.tilestored64.internal(i16 8, i16 32, ptr [[TMP0]], i64 32, x86_amx [[T1]])
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <256 x i32>, ptr [[TMP0]], align 1024
-; CHECK-NEXT:    br i1 undef, label [[L2]], label [[EXIT:%.*]]
+; CHECK-NEXT:    br i1 [[ARG]], label [[L2]], label [[EXIT:%.*]]
 ; CHECK:       l2:
 ; CHECK-NEXT:    [[T3:%.*]] = phi <256 x i32> [ undef, [[ENTRY:%.*]] ], [ [[TMP1]], [[L1]] ]
 ; CHECK-NEXT:    store <256 x i32> [[T3]], ptr [[BUF:%.*]], align 1024
@@ -335,12 +335,12 @@ define void @dead_code(ptr%buf) {
 ; CHECK-NEXT:    ret void
 ;
 entry:
-  br i1 undef, label %l1, label %l2
+  br i1 %arg, label %l1, label %l2
 
 l1:
   %t1 = call x86_amx @llvm.x86.tilezero.internal(i16 8, i16 32)
   %t2 = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %t1)
-  br i1 undef, label %l2, label %exit
+  br i1 %arg, label %l2, label %exit
 
 l2:
   %t3 = phi <256 x i32> [ undef, %entry ], [ %t2, %l1 ]
diff --git a/llvm/test/CodeGen/X86/StackColoring.ll b/llvm/test/CodeGen/X86/StackColoring.ll
index 389d024..db3e7dc 100644
--- a/llvm/test/CodeGen/X86/StackColoring.ll
+++ b/llvm/test/CodeGen/X86/StackColoring.ll
@@ -135,7 +135,7 @@ entry:
   %t3 = call i32 @foo(i32 %in, ptr %a3)
   %t4 = call i32 @foo(i32 %in, ptr %a3)
   call void @llvm.lifetime.end.p0(i64 -1, ptr %a3)
-  br i1 undef, label %bb2, label %bb3
+  br i1 poison, label %bb2, label %bb3
 bb2:
   call void @llvm.lifetime.start.p0(i64 -1, ptr %a4)
   %t11 = call i32 @foo(i32 %in, ptr %a4)
diff --git a/llvm/test/CodeGen/X86/asm-label.ll b/llvm/test/CodeGen/X86/asm-label.ll
index 05c37db..2d3e7b6 100644
--- a/llvm/test/CodeGen/X86/asm-label.ll
+++ b/llvm/test/CodeGen/X86/asm-label.ll
@@ -12,15 +12,15 @@
 ; SAVETEMP:         jne {{.*}} <.LBB0_1>
 ; SAVETEMP-LABEL: <.LBB0_1>:
 
-define void @foo()  {
+define void @foo(i1 %arg, i32 %arg2)  {
 entry:
-  br i1 undef, label %land.lhs.true, label %if.end11
+  br i1 %arg, label %land.lhs.true, label %if.end11
 
 land.lhs.true:                                    ; preds = %entry
-  br i1 undef, label %if.then, label %if.end11
+  br i1 %arg, label %if.then, label %if.end11
 
 if.then:                                          ; preds = %land.lhs.true
-  br i1 undef, label %if.then9, label %if.end
+  br i1 %arg, label %if.then9, label %if.end
 
 if.then9:                                         ; preds = %if.then
   br label %cleanup
@@ -29,7 +29,7 @@ if.end:                                           ; preds = %if.then
   br label %cleanup
 
 cleanup:                                          ; preds = %if.end, %if.then9
-  switch i32 undef, label %default [
+  switch i32 %arg2, label %default [
     i32 0, label %cleanup.cont
     i32 1, label %if.end11
   ]
diff --git a/llvm/test/CodeGen/X86/avx-select.ll b/llvm/test/CodeGen/X86/avx-select.ll
index 7a33daf..1b688c8 100644
--- a/llvm/test/CodeGen/X86/avx-select.ll
+++ b/llvm/test/CodeGen/X86/avx-select.ll
@@ -84,7 +84,7 @@ head:
   %isneg = icmp slt <4 x i32> %v3, zeroinitializer
   %or0 = select <4 x i1> %isneg, <4 x i32> <i32 26146, i32 -1257, i32 -2, i32 -3052>, <4 x i32> <i32 -24947, i32 7802, i32 29242, i32 15858>
   %or1 = shufflevector <4 x i32> %or0, <4 x i32> <i32 29361, i32 -16094, i32 -3080, i32 -26286>, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-  br i1 undef, label %exit, label %head
+  br i1 poison, label %exit, label %head
 
 exit:
   store <8 x i32> %or1, ptr addrspace(1) undef, align 32
diff --git a/llvm/test/CodeGen/X86/avx10_2_512fptosi_satcvtds.ll b/llvm/test/CodeGen/X86/avx10_2_512fptosi_satcvtds.ll
new file mode 100644
index 0000000..d7ad7b0
--- /dev/null
+++ b/llvm/test/CodeGen/X86/avx10_2_512fptosi_satcvtds.ll
@@ -0,0 +1,85 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-linux -mattr=+avx10.2-512 | FileCheck %s --check-prefixes=CHECK,X86
+; RUN: llc < %s -mtriple=x86_64-linux -mattr=+avx10.2-512 | FileCheck %s --check-prefixes=CHECK,X64
+
+; VCVTTPD2DQS
+define <8 x i32> @test_signed_v8i32_v8f64(<8 x double> %f) nounwind {
+; CHECK-LABEL: test_signed_v8i32_v8f64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vcvttpd2dqs %zmm0, %ymm0
+; CHECK-NEXT:    ret{{[l|q]}}
+  %x = call  <8 x i32> @llvm.fptosi.sat.v8i32.v8f64(<8 x double> %f)
+  ret <8 x i32> %x
+}
+
+; VCVTTPD2QQS
+define <8 x i64> @test_signed_v8i64_v8f64(<8 x double> %f) nounwind {
+; CHECK-LABEL: test_signed_v8i64_v8f64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vcvttpd2qqs %zmm0, %zmm0
+; CHECK-NEXT:    ret{{[l|q]}}
+  %x = call <8 x i64> @llvm.fptosi.sat.v8i64.v8f64(<8 x double> %f)
+  ret <8 x i64> %x
+}
+
+; VCVTTPD2UDQS
+define <8 x i32> @test_unsigned_v8i32_v8f64(<8 x double> %f) nounwind {
+; CHECK-LABEL: test_unsigned_v8i32_v8f64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vcvttpd2udqs %zmm0, %ymm0
+; CHECK-NEXT:    ret{{[l|q]}}
+  %x = call  <8 x i32> @llvm.fptoui.sat.v8i32.v8f64(<8 x double> %f)
+ ret <8 x i32> %x
+}
+
+; VCVTTPD2UQQS
+define <8 x i64> @test_unsigned_v8i64_v8f64(<8 x double> %f) nounwind {
+; CHECK-LABEL: test_unsigned_v8i64_v8f64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vcvttpd2uqqs %zmm0, %zmm0
+; CHECK-NEXT:    ret{{[l|q]}}
+  %x = call  <8 x i64> @llvm.fptoui.sat.v8i64.v8f64(<8 x double> %f)
+  ret <8 x i64> %x
+}
+
+; VCVTTPS2DQS
+define <16 x i32> @test_signed_v16i32_v16f32(<16 x float> %f) nounwind {
+; CHECK-LABEL: test_signed_v16i32_v16f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vcvttps2dqs %zmm0, %zmm0
+; CHECK-NEXT:    ret{{[l|q]}}
+  %x = call  <16 x i32> @llvm.fptosi.sat.v16i32.v16f32(<16 x float> %f)
+  ret <16 x i32> %x
+}
+
+; VCVTTPS2UDQS
+define <16 x i32> @test_unsigned_v16i32_v16f32(<16 x float> %f) nounwind {
+; CHECK-LABEL: test_unsigned_v16i32_v16f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vcvttps2udqs %zmm0, %zmm0
+; CHECK-NEXT:    ret{{[l|q]}}
+  %x = call  <16 x i32> @llvm.fptoui.sat.v16i32.v16f32(<16 x float> %f)
+  ret <16 x i32> %x
+}
+; VCVTTPS2QQS
+define <8 x i64> @test_signed_v8i64_v8f32(<8 x float> %f) nounwind {
+; CHECK-LABEL: test_signed_v8i64_v8f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vcvttps2qqs %ymm0, %zmm0
+; CHECK-NEXT:    ret{{[l|q]}}
+  %x = call  <8 x i64> @llvm.fptosi.sat.v8i64.v8f32(<8 x float> %f)
+  ret <8 x i64> %x
+}
+
+; VCVTTPS2UQQS
+define <8 x i64> @test_unsigned_v8i64_v8f32(<8 x float> %f) nounwind {
+; CHECK-LABEL: test_unsigned_v8i64_v8f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vcvttps2uqqs %ymm0, %zmm0
+; CHECK-NEXT:    ret{{[l|q]}}
+  %x = call  <8 x i64> @llvm.fptoui.sat.v8i64.v8f32(<8 x float> %f)
+  ret <8 x i64> %x
+}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; X64: {{.*}}
+; X86: {{.*}}
diff --git a/llvm/test/CodeGen/X86/avx10_2fptosi_satcvtds.ll b/llvm/test/CodeGen/X86/avx10_2fptosi_satcvtds.ll
index 494e4bc..a2f167e 100644
--- a/llvm/test/CodeGen/X86/avx10_2fptosi_satcvtds.ll
+++ b/llvm/test/CodeGen/X86/avx10_2fptosi_satcvtds.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=i686-linux -mattr=+avx10.2-256 | FileCheck %s --check-prefix=X86
-; RUN: llc < %s -mtriple=x86_64-linux -mattr=+avx10.2-256 | FileCheck %s --check-prefix=X64
+; RUN: llc < %s -mtriple=i686-linux -mattr=+avx10.2-256 | FileCheck %s --check-prefixes=CHECK,X86
+; RUN: llc < %s -mtriple=x86_64-linux -mattr=+avx10.2-256 | FileCheck %s --check-prefixes=CHECK,X64
 
 ;
 ; 32-bit float to signed integer
@@ -112,3 +112,157 @@ define i64 @test_signed_i64_f64(double %f) nounwind {
     %x = call i64 @llvm.fptosi.sat.i64.f64(double %f)
     ret i64 %x
 }
+
+; VCVTTPD2DQS
+define <2 x i32> @test_signed_v2i32_v2f64(<2 x double> %d) nounwind {
+; CHECK-LABEL: test_signed_v2i32_v2f64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vcvttpd2dqs %xmm0, %xmm0
+; CHECK-NEXT:    ret{{[l|q]}}
+  %x = call  <2 x i32> @llvm.fptosi.sat.v2i32.v2f64(<2 x double> %d)
+  ret <2 x i32> %x
+}
+
+define <4 x i32> @test_signed_v4i32_v4f64(<4 x double> %f) nounwind {
+; CHECK-LABEL: test_signed_v4i32_v4f64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vcvttpd2dqs %ymm0, %xmm0
+; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    ret{{[l|q]}}
+  %x = call  <4 x i32> @llvm.fptosi.sat.v4i32.v4f64(<4 x double> %f)
+  ret <4 x i32> %x
+}
+
+; VCVTTPD2QQS
+define <2 x i64> @test_signed_v2i64_v2f64(<2 x double> %f) nounwind {
+; CHECK-LABEL: test_signed_v2i64_v2f64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vcvttpd2qqs %xmm0, %xmm0
+; CHECK-NEXT:    ret{{[l|q]}}
+    %x = call <2 x i64> @llvm.fptosi.sat.v2i64.v2f64(<2 x double> %f)
+    ret <2 x i64> %x
+}
+
+define <4 x i64> @test_signed_v4i64_v4f64(<4 x double> %f) nounwind {
+; CHECK-LABEL: test_signed_v4i64_v4f64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vcvttpd2qqs %ymm0, %ymm0
+; CHECK-NEXT:    ret{{[l|q]}}
+  %x = call <4 x i64> @llvm.fptosi.sat.v4i64.v4f64(<4 x double> %f)
+  ret <4 x i64> %x
+}
+
+; VCVTTPD2UDQS
+define <2 x i32> @test_unsigned_v2i32_v2f64(<2 x double> %d) nounwind {
+; CHECK-LABEL: test_unsigned_v2i32_v2f64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vcvttpd2udqs %xmm0, %xmm0
+; CHECK-NEXT:    ret{{[l|q]}}
+  %x = call  <2 x i32> @llvm.fptoui.sat.v2i32.v2f64(<2 x double> %d)
+  ret <2 x i32> %x
+}
+
+define <4 x i32> @test_unsigned_v4i32_v4f64(<4 x double> %f) nounwind {
+; CHECK-LABEL: test_unsigned_v4i32_v4f64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vcvttpd2udqs %ymm0, %xmm0
+; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    ret{{[l|q]}}
+  %x = call  <4 x i32> @llvm.fptoui.sat.v4i32.v4f64(<4 x double> %f)
+  ret <4 x i32> %x
+}
+
+; VCVTTPD2UQQS
+define <2 x i64> @test_unsigned_v2i64_v2f64(<2 x double> %f) nounwind {
+; CHECK-LABEL: test_unsigned_v2i64_v2f64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vcvttpd2uqqs %xmm0, %xmm0
+; CHECK-NEXT:    ret{{[l|q]}}
+  %x = call  <2 x i64> @llvm.fptoui.sat.v2i64.v2f64(<2 x double> %f)
+  ret <2 x i64> %x
+}
+
+define <4 x i64> @test_unsigned_v4i64_v4f64(<4 x double> %f) nounwind {
+; CHECK-LABEL: test_unsigned_v4i64_v4f64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vcvttpd2uqqs %ymm0, %ymm0
+; CHECK-NEXT:    ret{{[l|q]}}
+  %x = call  <4 x i64> @llvm.fptoui.sat.v4i64.v4f64(<4 x double> %f)
+  ret <4 x i64> %x
+}
+
+; VCVTTPS2DQS
+define <4 x i32> @test_signed_v4i32_v4f32(<4 x float> %f) nounwind {
+; CHECK-LABEL: test_signed_v4i32_v4f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vcvttps2dqs %xmm0, %xmm0
+; CHECK-NEXT:    ret{{[l|q]}}
+  %x = call  <4 x i32> @llvm.fptosi.sat.v4i32.v4f32(<4 x float> %f)
+  ret <4 x i32> %x
+}
+
+define <8 x i32> @test_signed_v8i32_v8f32(<8 x float> %f) nounwind {
+; CHECK-LABEL: test_signed_v8i32_v8f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vcvttps2dqs %ymm0, %ymm0
+; CHECK-NEXT:    ret{{[l|q]}}
+  %x = call  <8 x i32> @llvm.fptosi.sat.v8i32.v8f32(<8 x float> %f)
+  ret <8 x i32> %x
+}
+
+; VCVTTPS2UDQS
+define <4 x i32> @test_unsigned_v4i32_v4f32(<4 x float> %f) nounwind {
+; CHECK-LABEL: test_unsigned_v4i32_v4f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vcvttps2udqs %xmm0, %xmm0
+; CHECK-NEXT:    ret{{[l|q]}}
+  %x = call  <4 x i32> @llvm.fptoui.sat.v4i32.v4f32(<4 x float> %f)
+  ret <4 x i32> %x
+}
+
+define <8 x i32> @test_unsigned_v8i32_v8f32(<8 x float> %f) nounwind {
+; CHECK-LABEL: test_unsigned_v8i32_v8f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vcvttps2udqs %ymm0, %ymm0
+; CHECK-NEXT:    ret{{[l|q]}}
+  %x = call  <8 x i32> @llvm.fptoui.sat.v8i32.v8f32(<8 x float> %f)
+  ret <8 x i32> %x
+}
+
+; VCVTTPS2QQS
+define <2 x i64> @test_signed_v2i64_v2f32(<2 x float> %f) nounwind {
+; CHECK-LABEL: test_signed_v2i64_v2f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vcvttps2qqs %xmm0, %xmm0
+; CHECK-NEXT:    ret{{[l|q]}}
+  %x = call  <2 x i64> @llvm.fptosi.sat.v2i64.v2f32(<2 x float> %f)
+  ret <2 x i64> %x
+}
+
+define <4 x i64> @test_signed_v4i64_v4f32(<4 x float> %f) nounwind {
+; CHECK-LABEL: test_signed_v4i64_v4f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vcvttps2qqs %xmm0, %ymm0
+; CHECK-NEXT:    ret{{[l|q]}}
+  %x = call  <4 x i64> @llvm.fptosi.sat.v4i64.v4f32(<4 x float> %f)
+  ret <4 x i64> %x
+}
+
+; VCVTTPS2UQQS
+define <2 x i64> @test_unsigned_v2i64_v2f32(<2 x float> %f) nounwind {
+; CHECK-LABEL: test_unsigned_v2i64_v2f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vcvttps2uqqs %xmm0, %xmm0
+; CHECK-NEXT:    ret{{[l|q]}}
+  %x = call  <2 x i64> @llvm.fptoui.sat.v2i64.v2f32(<2 x float> %f)
+  ret <2 x i64> %x
+}
+
+define <4 x i64> @test_unsigned_v4i64_v4f32(<4 x float> %f) nounwind {
+; CHECK-LABEL: test_unsigned_v4i64_v4f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vcvttps2uqqs %xmm0, %ymm0
+; CHECK-NEXT:    ret{{[l|q]}}
+  %x = call  <4 x i64> @llvm.fptoui.sat.v4i64.v4f32(<4 x float> %f)
+  ret <4 x i64> %x
+}
diff --git a/llvm/test/CodeGen/X86/avx512-i1test.ll b/llvm/test/CodeGen/X86/avx512-i1test.ll
index 3cd7331..d8683df 100644
--- a/llvm/test/CodeGen/X86/avx512-i1test.ll
+++ b/llvm/test/CodeGen/X86/avx512-i1test.ll
@@ -21,20 +21,20 @@ define void @func() {
 ; CHECK-NEXT:    testb %al, %al
 ; CHECK-NEXT:    jmp .LBB0_2
 bb1:
-  br i1 undef, label %L_10, label %L_10
+  br i1 poison, label %L_10, label %L_10
 
 L_10:                                             ; preds = %bb1, %bb1
-  br i1 undef, label %L_30, label %bb56
+  br i1 poison, label %L_30, label %bb56
 
 bb56:                                             ; preds = %L_10
   br label %bb33
 
 bb33:                                             ; preds = %bb51, %bb56
   %r111 = load i64, ptr undef, align 8
-  br i1 undef, label %bb51, label %bb35
+  br i1 poison, label %bb51, label %bb35
 
 bb35:                                             ; preds = %bb33
-  br i1 undef, label %L_19, label %bb37
+  br i1 poison, label %L_19, label %bb37
 
 bb37:                                             ; preds = %bb35
   %r128 = and i64 %r111, 576460752303423488
@@ -43,7 +43,7 @@ bb37:                                             ; preds = %bb35
 
 L_19:                                             ; preds = %bb37, %bb35
   %"$V_S25.0" = phi i1 [ %phitmp, %bb37 ], [ true, %bb35 ]
-  br i1 undef, label %bb51, label %bb42
+  br i1 poison, label %bb51, label %bb42
 
 bb42:                                             ; preds = %L_19
   %r136 = select i1 %"$V_S25.0", ptr undef, ptr undef
diff --git a/llvm/test/CodeGen/X86/avx512fp16-fminimum-fmaximum.ll b/llvm/test/CodeGen/X86/avx512fp16-fminimum-fmaximum.ll
index 55b86ca..9db57fe 100644
--- a/llvm/test/CodeGen/X86/avx512fp16-fminimum-fmaximum.ll
+++ b/llvm/test/CodeGen/X86/avx512fp16-fminimum-fmaximum.ll
@@ -5,6 +5,10 @@ declare half @llvm.minimum.f16(half, half)
 declare half @llvm.maximum.f16(half, half)
 declare <8 x half> @llvm.minimum.v8f16(<8 x half>, <8 x half>)
 declare <8 x half> @llvm.maximum.v8f16(<8 x half>, <8 x half>)
+declare <16 x half> @llvm.minimum.v16f16(<16 x half>, <16 x half>)
+declare <16 x half> @llvm.maximum.v16f16(<16 x half>, <16 x half>)
+declare <32 x half> @llvm.minimum.v32f16(<32 x half>, <32 x half>)
+declare <32 x half> @llvm.maximum.v32f16(<32 x half>, <32 x half>)
 
 define half @test_fminimum(half %x, half %y) {
 ; CHECK-LABEL: test_fminimum:
@@ -25,20 +29,10 @@ define half @test_fminimum(half %x, half %y) {
   ret half %z
 }
 
-define <8 x half> @test_fminimum_scalarize(<8 x half> %x, <8 x half> %y) "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" {
-; CHECK-LABEL: test_fminimum_scalarize:
+define <8 x half> @test_fminimum_v8f16(<8 x half> %x, <8 x half> %y) "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" {
+; CHECK-LABEL: test_fminimum_v8f16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vcmpltph %xmm1, %xmm0, %k1
-; CHECK-NEXT:    vpblendmw %xmm0, %xmm1, %xmm2 {%k1}
-; CHECK-NEXT:    vpbroadcastw {{.*#+}} xmm3 = [32768,32768,32768,32768,32768,32768,32768,32768]
-; CHECK-NEXT:    vpcmpeqw %xmm3, %xmm0, %k1
-; CHECK-NEXT:    vpblendmw %xmm0, %xmm2, %xmm0 {%k1}
-; CHECK-NEXT:    vpcmpeqw %xmm3, %xmm1, %k1
-; CHECK-NEXT:    vmovdqu16 %xmm1, %xmm0 {%k1}
-; CHECK-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vcmpeqph %xmm1, %xmm2, %k1
-; CHECK-NEXT:    vmovdqu16 %xmm0, %xmm2 {%k1}
-; CHECK-NEXT:    vmovdqa %xmm2, %xmm0
+; CHECK-NEXT:    vminph %xmm1, %xmm0, %xmm0
 ; CHECK-NEXT:    retq
   %r = call <8 x half> @llvm.minimum.v8f16(<8 x half> %x, <8 x half> %y)
   ret <8 x half> %r
@@ -113,19 +107,10 @@ define half @test_fmaximum(half %x, half %y) {
   ret half %r
 }
 
-define <8 x half> @test_fmaximum_scalarize(<8 x half> %x, <8 x half> %y) "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" {
-; CHECK-LABEL: test_fmaximum_scalarize:
+define <8 x half> @test_fmaximum_v8f16(<8 x half> %x, <8 x half> %y) "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" {
+; CHECK-LABEL: test_fmaximum_v8f16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vcmpltph %xmm0, %xmm1, %k1
-; CHECK-NEXT:    vpblendmw %xmm0, %xmm1, %xmm2 {%k1}
-; CHECK-NEXT:    vptestnmw %xmm0, %xmm0, %k1
-; CHECK-NEXT:    vpblendmw %xmm0, %xmm2, %xmm0 {%k1}
-; CHECK-NEXT:    vptestnmw %xmm1, %xmm1, %k1
-; CHECK-NEXT:    vmovdqu16 %xmm1, %xmm0 {%k1}
-; CHECK-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vcmpeqph %xmm1, %xmm2, %k1
-; CHECK-NEXT:    vmovdqu16 %xmm0, %xmm2 {%k1}
-; CHECK-NEXT:    vmovdqa %xmm2, %xmm0
+; CHECK-NEXT:    vmaxph %xmm1, %xmm0, %xmm0
 ; CHECK-NEXT:    retq
   %r = call <8 x half> @llvm.maximum.v8f16(<8 x half> %x, <8 x half> %y)
   ret <8 x half> %r
@@ -186,3 +171,50 @@ define half @test_fmaximum_combine_cmps(half %x, half %y) {
   %2 = tail call half @llvm.maximum.f16(half %x, half %1)
   ret half %2
 }
+
+define <16 x half> @test_fminimum_v16f16(<16 x half> %x, <16 x half> %y) "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" {
+; CHECK-LABEL: test_fminimum_v16f16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vminph %ymm1, %ymm0, %ymm0
+; CHECK-NEXT:    retq
+  %r = call <16 x half> @llvm.minimum.v16f16(<16 x half> %x, <16 x half> %y)
+  ret <16 x half> %r
+}
+
+define <16 x half> @test_fmaximum_v16f16_nans(<16 x half> %x, <16 x half> %y) "no-signed-zeros-fp-math"="true" {
+; CHECK-LABEL: test_fmaximum_v16f16_nans:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmaxph %ymm1, %ymm0, %ymm1
+; CHECK-NEXT:    vcmpunordph %ymm0, %ymm0, %k1
+; CHECK-NEXT:    vmovdqu16 %ymm0, %ymm1 {%k1}
+; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
+; CHECK-NEXT:    retq
+  %r = call <16 x half> @llvm.maximum.v16f16(<16 x half> %x, <16 x half> %y)
+  ret <16 x half> %r
+}
+
+define <32 x half> @test_fminimum_v32f16_szero(<32 x half> %x, <32 x half> %y) "no-nans-fp-math"="true" {
+; CHECK-LABEL: test_fminimum_v32f16_szero:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpmovw2m %zmm0, %k1
+; CHECK-NEXT:    vpblendmw %zmm0, %zmm1, %zmm2 {%k1}
+; CHECK-NEXT:    vmovdqu16 %zmm1, %zmm0 {%k1}
+; CHECK-NEXT:    vminph %zmm2, %zmm0, %zmm0
+; CHECK-NEXT:    retq
+  %r = call <32 x half> @llvm.minimum.v32f16(<32 x half> %x, <32 x half> %y)
+  ret <32 x half> %r
+}
+
+define <32 x half> @test_fmaximum_v32f16_nans_szero(<32 x half> %x, <32 x half> %y) {
+; CHECK-LABEL: test_fmaximum_v32f16_nans_szero:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpmovw2m %zmm0, %k1
+; CHECK-NEXT:    vpblendmw %zmm1, %zmm0, %zmm2 {%k1}
+; CHECK-NEXT:    vmovdqu16 %zmm0, %zmm1 {%k1}
+; CHECK-NEXT:    vmaxph %zmm2, %zmm1, %zmm0
+; CHECK-NEXT:    vcmpunordph %zmm1, %zmm1, %k1
+; CHECK-NEXT:    vmovdqu16 %zmm1, %zmm0 {%k1}
+; CHECK-NEXT:    retq
+  %r = call <32 x half> @llvm.maximum.v32f16(<32 x half> %x, <32 x half> %y)
+  ret <32 x half> %r
+}
diff --git a/llvm/test/CodeGen/X86/block-placement.ll b/llvm/test/CodeGen/X86/block-placement.ll
index 6752934..1369131 100644
--- a/llvm/test/CodeGen/X86/block-placement.ll
+++ b/llvm/test/CodeGen/X86/block-placement.ll
@@ -312,7 +312,7 @@ exit:
   ret i32 %sum
 }
 
-define void @unnatural_cfg1() {
+define void @unnatural_cfg1(i1 %arg) {
 ; Test that we can handle a loop with an inner unnatural loop at the end of
 ; a function. This is a gross CFG reduced out of the single source GCC.
 ; CHECK-LABEL: unnatural_cfg1
@@ -327,7 +327,7 @@ loop.header:
   br label %loop.body1
 
 loop.body1:
-  br i1 undef, label %loop.body3, label %loop.body2
+  br i1 %arg, label %loop.body3, label %loop.body2
 
 loop.body2:
   %ptr = load ptr, ptr undef, align 4
@@ -341,14 +341,14 @@ loop.body3:
   br i1 %comp, label %loop.body4, label %loop.body5
 
 loop.body4:
-  br i1 undef, label %loop.header, label %loop.body5
+  br i1 %arg, label %loop.header, label %loop.body5
 
 loop.body5:
   %ptr2 = load ptr, ptr undef, align 4
   br label %loop.body3
 }
 
-define void @unnatural_cfg2(ptr %p0, i32 %a0) {
+define void @unnatural_cfg2(ptr %p0, i32 %a0, i1 %arg) {
 ; Test that we can handle a loop with a nested natural loop *and* an unnatural
 ; loop. This was reduced from a crash on block placement when run over
 ; single-source GCC.
@@ -372,10 +372,10 @@ loop.header:
 
 loop.body1:
   %val0 = load ptr, ptr undef, align 4
-  br i1 undef, label %loop.body2, label %loop.inner1.begin
+  br i1 %arg, label %loop.body2, label %loop.inner1.begin
 
 loop.body2:
-  br i1 undef, label %loop.body4, label %loop.body3
+  br i1 %arg, label %loop.body4, label %loop.body3
 
 loop.body3:
   %ptr1 = getelementptr inbounds i32, ptr %val0, i32 0
@@ -467,7 +467,7 @@ exit:
   ret i32 %merge
 }
 
-define void @fpcmp_unanalyzable_branch(i1 %cond, double %a0) {
+define void @fpcmp_unanalyzable_branch(i1 %cond, double %a0, i1 %arg) {
 ; This function's CFG contains an once-unanalyzable branch (une on floating
 ; points). As now it becomes analyzable, we should get best layout in which each
 ; edge in 'entry' -> 'entry.if.then_crit_edge' -> 'if.then' -> 'if.end' is
@@ -493,7 +493,7 @@ entry.if.then_crit_edge:
   br label %if.then
 
 lor.lhs.false:
-  br i1 undef, label %if.end, label %exit
+  br i1 %arg, label %if.end, label %exit
 
 exit:
   %cmp.i = fcmp une double 0.000000e+00, %a0
@@ -516,7 +516,7 @@ declare i32 @f()
 declare i32 @g()
 declare i32 @h(i32 %x)
 
-define i32 @test_global_cfg_break_profitability() {
+define i32 @test_global_cfg_break_profitability(i1 %arg) {
 ; Check that our metrics for the profitability of a CFG break are global rather
 ; than local. A successor may be very hot, but if the current block isn't, it
 ; doesn't matter. Within this test the 'then' block is slightly warmer than the
@@ -530,7 +530,7 @@ define i32 @test_global_cfg_break_profitability() {
 ; CHECK: ret
 
 entry:
-  br i1 undef, label %then, label %else, !prof !2
+  br i1 %arg, label %then, label %else, !prof !2
 
 then:
   %then.result = call i32 @f()
@@ -600,7 +600,7 @@ cleanup:
   unreachable
 }
 
-define void @test_unnatural_cfg_backwards_inner_loop() {
+define void @test_unnatural_cfg_backwards_inner_loop(i1 %arg) {
 ; Test that when we encounter an unnatural CFG structure after having formed
 ; a chain for an inner loop which happened to be laid out backwards we don't
 ; attempt to merge onto the wrong end of the inner loop just because we find it
@@ -612,7 +612,7 @@ define void @test_unnatural_cfg_backwards_inner_loop() {
 ; CHECK: %loop3
 
 entry:
-  br i1 undef, label %loop2a, label %body
+  br i1 %arg, label %loop2a, label %body
 
 body:
   br label %loop2a
@@ -692,7 +692,7 @@ exit:
   ret void
 }
 
-define void @unanalyzable_branch_to_free_block(float %x) {
+define void @unanalyzable_branch_to_free_block(float %x, i1 %arg) {
 ; Ensure that we can handle unanalyzable branches where the destination block
 ; gets selected as the best free block in the CFG.
 ;
@@ -704,7 +704,7 @@ define void @unanalyzable_branch_to_free_block(float %x) {
 ; CHECK: %exit
 
 entry:
-  br i1 undef, label %a, label %b
+  br i1 %arg, label %a, label %b
 
 a:
   call i32 @f()
diff --git a/llvm/test/CodeGen/X86/clobber_frame_ptr.ll b/llvm/test/CodeGen/X86/clobber_frame_ptr.ll
index f6b3883..e7ffc47 100644
--- a/llvm/test/CodeGen/X86/clobber_frame_ptr.ll
+++ b/llvm/test/CodeGen/X86/clobber_frame_ptr.ll
@@ -173,7 +173,7 @@ define ghccc void @test5() {
 ; CHECK-NEXT:    .cfi_def_cfa %rsp, 8
 ; CHECK-NEXT:    jmp tail@PLT # TAILCALL
 entry:
-  br i1 undef, label %then, label %else
+  br i1 poison, label %then, label %else
 
 then:
   store i64 0, ptr undef
@@ -186,4 +186,3 @@ else:
 exit:
   ret void
 }
-
diff --git a/llvm/test/CodeGen/X86/codegen-prepare-addrmode-sext.ll b/llvm/test/CodeGen/X86/codegen-prepare-addrmode-sext.ll
index c611e89..f3070cd 100644
--- a/llvm/test/CodeGen/X86/codegen-prepare-addrmode-sext.ll
+++ b/llvm/test/CodeGen/X86/codegen-prepare-addrmode-sext.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
 ; RUN: opt -S -passes='require<profile-summary>,function(codegenprepare)' %s -o - | FileCheck %s
 ; This file tests the different cases what are involved when codegen prepare
 ; tries to get sign/zero extension out of the way of addressing mode.
@@ -9,14 +10,17 @@ target triple = "x86_64-apple-macosx"
 
 
 ; Check that we correctly promote both operands of the promotable add.
-; CHECK-LABEL: @twoArgsPromotion
-; CHECK: [[ARG1SEXT:%[a-zA-Z_0-9-]+]] = sext i32 %arg1 to i64
-; CHECK: [[ARG2SEXT:%[a-zA-Z_0-9-]+]] = sext i32 %arg2 to i64
-; CHECK: [[PROMOTED:%[a-zA-Z_0-9-]+]] = add nsw i64 [[ARG1SEXT]], [[ARG2SEXT]]
-; CHECK: inttoptr i64 [[PROMOTED]] to ptr
-; CHECK: ret
 define i8 @twoArgsPromotion(i32 %arg1, i32 %arg2) {
-  %add = add nsw i32 %arg1, %arg2 
+; CHECK-LABEL: define i8 @twoArgsPromotion(
+; CHECK-SAME: i32 [[ARG1:%.*]], i32 [[ARG2:%.*]]) {
+; CHECK-NEXT:    [[PROMOTED:%.*]] = sext i32 [[ARG1]] to i64
+; CHECK-NEXT:    [[PROMOTED2:%.*]] = sext i32 [[ARG2]] to i64
+; CHECK-NEXT:    [[ADD:%.*]] = add nsw i64 [[PROMOTED]], [[PROMOTED2]]
+; CHECK-NEXT:    [[BASE:%.*]] = inttoptr i64 [[ADD]] to ptr
+; CHECK-NEXT:    [[RES:%.*]] = load i8, ptr [[BASE]], align 1
+; CHECK-NEXT:    ret i8 [[RES]]
+;
+  %add = add nsw i32 %arg1, %arg2
   %sextadd = sext i32 %add to i64
   %base = inttoptr i64 %sextadd to ptr
   %res = load i8, ptr %base
@@ -28,11 +32,16 @@ define i8 @twoArgsPromotion(i32 %arg1, i32 %arg2) {
 ; Otherwise, we will increase the number of instruction executed.
 ; (This is a heuristic of course, because the new sext could have been
 ; merged with something else.)
-; CHECK-LABEL: @twoArgsNoPromotion
-; CHECK: add nsw i32 %arg1, %arg2
-; CHECK: ret
 define i8 @twoArgsNoPromotion(i32 %arg1, i32 %arg2, ptr %base) {
-  %add = add nsw i32 %arg1, %arg2 
+; CHECK-LABEL: define i8 @twoArgsNoPromotion(
+; CHECK-SAME: i32 [[ARG1:%.*]], i32 [[ARG2:%.*]], ptr [[BASE:%.*]]) {
+; CHECK-NEXT:    [[ADD:%.*]] = add nsw i32 [[ARG1]], [[ARG2]]
+; CHECK-NEXT:    [[SEXTADD:%.*]] = sext i32 [[ADD]] to i64
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[BASE]], i64 [[SEXTADD]]
+; CHECK-NEXT:    [[RES:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT:    ret i8 [[RES]]
+;
+  %add = add nsw i32 %arg1, %arg2
   %sextadd = sext i32 %add to i64
   %arrayidx = getelementptr inbounds i8, ptr %base, i64 %sextadd
   %res = load i8, ptr %arrayidx
@@ -41,11 +50,16 @@ define i8 @twoArgsNoPromotion(i32 %arg1, i32 %arg2, ptr %base) {
 
 ; Check that we do not promote when the related instruction does not have
 ; the nsw flag.
-; CHECK-LABEL: @noPromotion
-; CHECK-NOT: add i64
-; CHECK: ret
 define i8 @noPromotion(i32 %arg1, i32 %arg2, ptr %base) {
-  %add = add i32 %arg1, %arg2 
+; CHECK-LABEL: define i8 @noPromotion(
+; CHECK-SAME: i32 [[ARG1:%.*]], i32 [[ARG2:%.*]], ptr [[BASE:%.*]]) {
+; CHECK-NEXT:    [[ADD:%.*]] = add i32 [[ARG1]], [[ARG2]]
+; CHECK-NEXT:    [[SEXTADD:%.*]] = sext i32 [[ADD]] to i64
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[BASE]], i64 [[SEXTADD]]
+; CHECK-NEXT:    [[RES:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT:    ret i8 [[RES]]
+;
+  %add = add i32 %arg1, %arg2
   %sextadd = sext i32 %add to i64
   %arrayidx = getelementptr inbounds i8, ptr %base, i64 %sextadd
   %res = load i8, ptr %arrayidx
@@ -53,13 +67,16 @@ define i8 @noPromotion(i32 %arg1, i32 %arg2, ptr %base) {
 }
 
 ; Check that we correctly promote constant arguments.
-; CHECK-LABEL: @oneArgPromotion
-; CHECK: [[ARG1SEXT:%[a-zA-Z_0-9-]+]] = sext i32 %arg1 to i64
-; CHECK: [[PROMOTED:%[a-zA-Z_0-9-]+]] = add nsw i64 [[ARG1SEXT]], 1
-; CHECK: getelementptr inbounds i8, ptr %base, i64 [[PROMOTED]]
-; CHECK: ret
 define i8 @oneArgPromotion(i32 %arg1, ptr %base) {
-  %add = add nsw i32 %arg1, 1 
+; CHECK-LABEL: define i8 @oneArgPromotion(
+; CHECK-SAME: i32 [[ARG1:%.*]], ptr [[BASE:%.*]]) {
+; CHECK-NEXT:    [[PROMOTED:%.*]] = sext i32 [[ARG1]] to i64
+; CHECK-NEXT:    [[ADD:%.*]] = add nsw i64 [[PROMOTED]], 1
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[BASE]], i64 [[ADD]]
+; CHECK-NEXT:    [[RES:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT:    ret i8 [[RES]]
+;
+  %add = add nsw i32 %arg1, 1
   %sextadd = sext i32 %add to i64
   %arrayidx = getelementptr inbounds i8, ptr %base, i64 %sextadd
   %res = load i8, ptr %arrayidx
@@ -67,14 +84,17 @@ define i8 @oneArgPromotion(i32 %arg1, ptr %base) {
 }
 
 ; Check that we are able to merge a sign extension with a zero extension.
-; CHECK-LABEL: @oneArgPromotionZExt
-; CHECK: [[ARG1ZEXT:%[a-zA-Z_0-9-]+]] = zext i8 %arg1 to i64
-; CHECK: [[PROMOTED:%[a-zA-Z_0-9-]+]] = add nsw i64 [[ARG1ZEXT]], 1
-; CHECK: getelementptr inbounds i8, ptr %base, i64 [[PROMOTED]]
-; CHECK: ret
 define i8 @oneArgPromotionZExt(i8 %arg1, ptr %base) {
+; CHECK-LABEL: define i8 @oneArgPromotionZExt(
+; CHECK-SAME: i8 [[ARG1:%.*]], ptr [[BASE:%.*]]) {
+; CHECK-NEXT:    [[PROMOTED2:%.*]] = zext i8 [[ARG1]] to i64
+; CHECK-NEXT:    [[ADD:%.*]] = add nsw i64 [[PROMOTED2]], 1
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[BASE]], i64 [[ADD]]
+; CHECK-NEXT:    [[RES:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT:    ret i8 [[RES]]
+;
   %zext = zext i8 %arg1 to i32
-  %add = add nsw i32 %zext, 1 
+  %add = add nsw i32 %zext, 1
   %sextadd = sext i32 %add to i64
   %arrayidx = getelementptr inbounds i8, ptr %base, i64 %sextadd
   %res = load i8, ptr %arrayidx
@@ -88,11 +108,14 @@ define i8 @oneArgPromotionZExt(i8 %arg1, ptr %base) {
 ; more thing in the addressing mode. Therefore the modification is
 ; rolled back.
 ; Still, this test case exercises the desired code path.
-; CHECK-LABEL: @oneArgPromotionCstZExt
-; CHECK: [[PROMOTED:%[a-zA-Z_0-9-]+]] = add nsw i64 0, 1
-; CHECK: getelementptr inbounds i8, ptr %base, i64 [[PROMOTED]]
-; CHECK: ret
 define i8 @oneArgPromotionCstZExt(ptr %base) {
+; CHECK-LABEL: define i8 @oneArgPromotionCstZExt(
+; CHECK-SAME: ptr [[BASE:%.*]]) {
+; CHECK-NEXT:    [[ADD:%.*]] = add nsw i64 0, 1
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[BASE]], i64 [[ADD]]
+; CHECK-NEXT:    [[RES:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT:    ret i8 [[RES]]
+;
   %cst = zext i16 undef to i32
   %add = add nsw i32 %cst, 1
   %sextadd = sext i32 %add to i64
@@ -103,15 +126,18 @@ define i8 @oneArgPromotionCstZExt(ptr %base) {
 
 ; Check that we do not promote truncate when we cannot determine the
 ; bits that are dropped.
-; CHECK-LABEL: @oneArgPromotionBlockTrunc1
-; CHECK: [[ARG1TRUNC:%[a-zA-Z_0-9-]+]] = trunc i32 %arg1 to i8
-; CHECK: [[ARG1SEXT:%[a-zA-Z_0-9-]+]] = sext i8 [[ARG1TRUNC]] to i64
-; CHECK: [[PROMOTED:%[a-zA-Z_0-9-]+]] = add nsw i64 [[ARG1SEXT]], 1
-; CHECK: getelementptr inbounds i8, ptr %base, i64 [[PROMOTED]]
-; CHECK: ret
 define i8 @oneArgPromotionBlockTrunc1(i32 %arg1, ptr %base) {
+; CHECK-LABEL: define i8 @oneArgPromotionBlockTrunc1(
+; CHECK-SAME: i32 [[ARG1:%.*]], ptr [[BASE:%.*]]) {
+; CHECK-NEXT:    [[TRUNC:%.*]] = trunc i32 [[ARG1]] to i8
+; CHECK-NEXT:    [[PROMOTED:%.*]] = sext i8 [[TRUNC]] to i64
+; CHECK-NEXT:    [[ADD:%.*]] = add nsw i64 [[PROMOTED]], 1
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[BASE]], i64 [[ADD]]
+; CHECK-NEXT:    [[RES:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT:    ret i8 [[RES]]
+;
   %trunc = trunc i32 %arg1 to i8
-  %add = add nsw i8 %trunc, 1 
+  %add = add nsw i8 %trunc, 1
   %sextadd = sext i8 %add to i64
   %arrayidx = getelementptr inbounds i8, ptr %base, i64 %sextadd
   %res = load i8, ptr %arrayidx
@@ -120,17 +146,20 @@ define i8 @oneArgPromotionBlockTrunc1(i32 %arg1, ptr %base) {
 
 ; Check that we do not promote truncate when we cannot determine all the
 ; bits that are dropped.
-; CHECK-LABEL: @oneArgPromotionBlockTrunc2
-; CHECK: [[ARG1SEXT:%[a-zA-Z_0-9-]+]] = sext i16 %arg1 to i32
-; CHECK: [[ARG1TRUNC:%[a-zA-Z_0-9-]+]] = trunc i32 [[ARG1SEXT]] to i8
-; CHECK: [[ARG1SEXT64:%[a-zA-Z_0-9-]+]] = sext i8 [[ARG1TRUNC]] to i64
-; CHECK: [[PROMOTED:%[a-zA-Z_0-9-]+]] = add nsw i64 [[ARG1SEXT64]], 1
-; CHECK: getelementptr inbounds i8, ptr %base, i64 [[PROMOTED]]
-; CHECK: ret
 define i8 @oneArgPromotionBlockTrunc2(i16 %arg1, ptr %base) {
+; CHECK-LABEL: define i8 @oneArgPromotionBlockTrunc2(
+; CHECK-SAME: i16 [[ARG1:%.*]], ptr [[BASE:%.*]]) {
+; CHECK-NEXT:    [[SEXTARG1:%.*]] = sext i16 [[ARG1]] to i32
+; CHECK-NEXT:    [[TRUNC:%.*]] = trunc i32 [[SEXTARG1]] to i8
+; CHECK-NEXT:    [[PROMOTED:%.*]] = sext i8 [[TRUNC]] to i64
+; CHECK-NEXT:    [[ADD:%.*]] = add nsw i64 [[PROMOTED]], 1
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[BASE]], i64 [[ADD]]
+; CHECK-NEXT:    [[RES:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT:    ret i8 [[RES]]
+;
   %sextarg1 = sext i16 %arg1 to i32
   %trunc = trunc i32 %sextarg1 to i8
-  %add = add nsw i8 %trunc, 1 
+  %add = add nsw i8 %trunc, 1
   %sextadd = sext i8 %add to i64
   %arrayidx = getelementptr inbounds i8, ptr %base, i64 %sextadd
   %res = load i8, ptr %arrayidx
@@ -139,15 +168,18 @@ define i8 @oneArgPromotionBlockTrunc2(i16 %arg1, ptr %base) {
 
 ; Check that we are able to promote truncate when we know all the bits
 ; that are dropped.
-; CHECK-LABEL: @oneArgPromotionPassTruncKeepSExt
-; CHECK: [[ARG1SEXT:%[a-zA-Z_0-9-]+]] = sext i1 %arg1 to i64
-; CHECK: [[PROMOTED:%[a-zA-Z_0-9-]+]] = add nsw i64 [[ARG1SEXT]], 1
-; CHECK: getelementptr inbounds i8, ptr %base, i64 [[PROMOTED]]
-; CHECK: ret
 define i8 @oneArgPromotionPassTruncKeepSExt(i1 %arg1, ptr %base) {
+; CHECK-LABEL: define i8 @oneArgPromotionPassTruncKeepSExt(
+; CHECK-SAME: i1 [[ARG1:%.*]], ptr [[BASE:%.*]]) {
+; CHECK-NEXT:    [[PROMOTED:%.*]] = sext i1 [[ARG1]] to i64
+; CHECK-NEXT:    [[ADD:%.*]] = add nsw i64 [[PROMOTED]], 1
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[BASE]], i64 [[ADD]]
+; CHECK-NEXT:    [[RES:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT:    ret i8 [[RES]]
+;
   %sextarg1 = sext i1 %arg1 to i32
   %trunc = trunc i32 %sextarg1 to i8
-  %add = add nsw i8 %trunc, 1 
+  %add = add nsw i8 %trunc, 1
   %sextadd = sext i8 %add to i64
   %arrayidx = getelementptr inbounds i8, ptr %base, i64 %sextadd
   %res = load i8, ptr %arrayidx
@@ -156,17 +188,19 @@ define i8 @oneArgPromotionPassTruncKeepSExt(i1 %arg1, ptr %base) {
 
 ; On X86 truncate are free. Check that we are able to promote the add
 ; to be used as addressing mode and that we insert a truncate for the other
-; use. 
-; CHECK-LABEL: @oneArgPromotionTruncInsert
-; CHECK: [[ARG1SEXT:%[a-zA-Z_0-9-]+]] = sext i8 %arg1 to i64
-; CHECK: [[PROMOTED:%[a-zA-Z_0-9-]+]] = add nsw i64 [[ARG1SEXT]], 1
-; CHECK: [[TRUNC:%[a-zA-Z_0-9-]+]] = trunc i64 [[PROMOTED]] to i8
-; CHECK: [[GEP:%[a-zA-Z_0-9-]+]] = getelementptr inbounds i8, ptr %base, i64 [[PROMOTED]]
-; CHECK: [[LOAD:%[a-zA-Z_0-9-]+]] = load i8, ptr [[GEP]]
-; CHECK: add i8 [[LOAD]], [[TRUNC]]
-; CHECK: ret
+; use.
 define i8 @oneArgPromotionTruncInsert(i8 %arg1, ptr %base) {
-  %add = add nsw i8 %arg1, 1 
+; CHECK-LABEL: define i8 @oneArgPromotionTruncInsert(
+; CHECK-SAME: i8 [[ARG1:%.*]], ptr [[BASE:%.*]]) {
+; CHECK-NEXT:    [[PROMOTED2:%.*]] = sext i8 [[ARG1]] to i64
+; CHECK-NEXT:    [[ADD:%.*]] = add nsw i64 [[PROMOTED2]], 1
+; CHECK-NEXT:    [[PROMOTED:%.*]] = trunc i64 [[ADD]] to i8
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[BASE]], i64 [[ADD]]
+; CHECK-NEXT:    [[RES:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT:    [[FINALRES:%.*]] = add i8 [[RES]], [[PROMOTED]]
+; CHECK-NEXT:    ret i8 [[FINALRES]]
+;
+  %add = add nsw i8 %arg1, 1
   %sextadd = sext i8 %add to i64
   %arrayidx = getelementptr inbounds i8, ptr %base, i64 %sextadd
   %res = load i8, ptr %arrayidx
@@ -175,15 +209,20 @@ define i8 @oneArgPromotionTruncInsert(i8 %arg1, ptr %base) {
 }
 
 ; Cannot sext from a larger type than the promoted type.
-; CHECK-LABEL: @oneArgPromotionLargerType
-; CHECK: [[ARG1TRUNC:%[a-zA-Z_0-9-]+]] = trunc i128 %arg1 to i8
-; CHECK: [[ARG1SEXT64:%[a-zA-Z_0-9-]+]] = sext i8 [[ARG1TRUNC]] to i64
-; CHECK: [[PROMOTED:%[a-zA-Z_0-9-]+]] = add nsw i64 [[ARG1SEXT64]], 1
-; CHECK: getelementptr inbounds i8, ptr %base, i64 [[PROMOTED]]
-; CHECK: ret
 define i8 @oneArgPromotionLargerType(i128 %arg1, ptr %base) {
+; CHECK-LABEL: define i8 @oneArgPromotionLargerType(
+; CHECK-SAME: i128 [[ARG1:%.*]], ptr [[BASE:%.*]]) {
+; CHECK-NEXT:    [[TRUNC:%.*]] = trunc i128 [[ARG1]] to i8
+; CHECK-NEXT:    [[PROMOTED2:%.*]] = sext i8 [[TRUNC]] to i64
+; CHECK-NEXT:    [[ADD:%.*]] = add nsw i64 [[PROMOTED2]], 1
+; CHECK-NEXT:    [[PROMOTED:%.*]] = trunc i64 [[ADD]] to i8
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[BASE]], i64 [[ADD]]
+; CHECK-NEXT:    [[RES:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT:    [[FINALRES:%.*]] = add i8 [[RES]], [[PROMOTED]]
+; CHECK-NEXT:    ret i8 [[FINALRES]]
+;
   %trunc = trunc i128 %arg1 to i8
-  %add = add nsw i8 %trunc, 1 
+  %add = add nsw i8 %trunc, 1
   %sextadd = sext i8 %add to i64
   %arrayidx = getelementptr inbounds i8, ptr %base, i64 %sextadd
   %res = load i8, ptr %arrayidx
@@ -194,18 +233,20 @@ define i8 @oneArgPromotionLargerType(i128 %arg1, ptr %base) {
 ; Use same inserted trunc
 ; On X86 truncate are free. Check that we are able to promote the add
 ; to be used as addressing mode and that we insert a truncate for
-; *all* the other uses. 
-; CHECK-LABEL: @oneArgPromotionTruncInsertSeveralUse
-; CHECK: [[ARG1SEXT:%[a-zA-Z_0-9-]+]] = sext i8 %arg1 to i64
-; CHECK: [[PROMOTED:%[a-zA-Z_0-9-]+]] = add nsw i64 [[ARG1SEXT]], 1
-; CHECK: [[TRUNC:%[a-zA-Z_0-9-]+]] = trunc i64 [[PROMOTED]] to i8
-; CHECK: [[GEP:%[a-zA-Z_0-9-]+]] = getelementptr inbounds i8, ptr %base, i64 [[PROMOTED]]
-; CHECK: [[LOAD:%[a-zA-Z_0-9-]+]] = load i8, ptr [[GEP]]
-; CHECK: [[ADDRES:%[a-zA-Z_0-9-]+]] = add i8 [[LOAD]], [[TRUNC]]
-; CHECK: add i8 [[ADDRES]], [[TRUNC]]
-; CHECK: ret
+; *all* the other uses.
 define i8 @oneArgPromotionTruncInsertSeveralUse(i8 %arg1, ptr %base) {
-  %add = add nsw i8 %arg1, 1 
+; CHECK-LABEL: define i8 @oneArgPromotionTruncInsertSeveralUse(
+; CHECK-SAME: i8 [[ARG1:%.*]], ptr [[BASE:%.*]]) {
+; CHECK-NEXT:    [[PROMOTED2:%.*]] = sext i8 [[ARG1]] to i64
+; CHECK-NEXT:    [[ADD:%.*]] = add nsw i64 [[PROMOTED2]], 1
+; CHECK-NEXT:    [[PROMOTED:%.*]] = trunc i64 [[ADD]] to i8
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[BASE]], i64 [[ADD]]
+; CHECK-NEXT:    [[RES:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT:    [[ALMOSTFINALRES:%.*]] = add i8 [[RES]], [[PROMOTED]]
+; CHECK-NEXT:    [[FINALRES:%.*]] = add i8 [[ALMOSTFINALRES]], [[PROMOTED]]
+; CHECK-NEXT:    ret i8 [[FINALRES]]
+;
+  %add = add nsw i8 %arg1, 1
   %sextadd = sext i8 %add to i64
   %arrayidx = getelementptr inbounds i8, ptr %base, i64 %sextadd
   %res = load i8, ptr %arrayidx
@@ -216,16 +257,18 @@ define i8 @oneArgPromotionTruncInsertSeveralUse(i8 %arg1, ptr %base) {
 
 ; Check that the promoted instruction is used for all uses of the original
 ; sign extension.
-; CHECK-LABEL: @oneArgPromotionSExtSeveralUse
-; CHECK: [[ARG1SEXT:%[a-zA-Z_0-9-]+]] = sext i8 %arg1 to i64
-; CHECK: [[PROMOTED:%[a-zA-Z_0-9-]+]] = add nsw i64 [[ARG1SEXT]], 1
-; CHECK: [[GEP:%[a-zA-Z_0-9-]+]] = getelementptr inbounds i8, ptr %base, i64 [[PROMOTED]]
-; CHECK: [[LOAD:%[a-zA-Z_0-9-]+]] = load i8, ptr [[GEP]]
-; CHECK: [[ADDRES:%[a-zA-Z_0-9-]+]] = zext i8 [[LOAD]] to i64
-; CHECK: add i64 [[ADDRES]], [[PROMOTED]]
-; CHECK: ret
 define i64 @oneArgPromotionSExtSeveralUse(i8 %arg1, ptr %base) {
-  %add = add nsw i8 %arg1, 1 
+; CHECK-LABEL: define i64 @oneArgPromotionSExtSeveralUse(
+; CHECK-SAME: i8 [[ARG1:%.*]], ptr [[BASE:%.*]]) {
+; CHECK-NEXT:    [[PROMOTED:%.*]] = sext i8 [[ARG1]] to i64
+; CHECK-NEXT:    [[ADD:%.*]] = add nsw i64 [[PROMOTED]], 1
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[BASE]], i64 [[ADD]]
+; CHECK-NEXT:    [[RES:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT:    [[ALMOSTFINALRES:%.*]] = zext i8 [[RES]] to i64
+; CHECK-NEXT:    [[FINALRES:%.*]] = add i64 [[ALMOSTFINALRES]], [[ADD]]
+; CHECK-NEXT:    ret i64 [[FINALRES]]
+;
+  %add = add nsw i8 %arg1, 1
   %sextadd = sext i8 %add to i64
   %arrayidx = getelementptr inbounds i8, ptr %base, i64 %sextadd
   %res = load i8, ptr %arrayidx
@@ -249,16 +292,19 @@ define i64 @oneArgPromotionSExtSeveralUse(i8 %arg1, ptr %base) {
 ; - Setting the operands of the promoted instruction with the promoted values.
 ; - Moving instruction around (mainly sext when promoting instruction).
 ; Each type of those promotions has to be undo at least once during this
-; specific test. 
-; CHECK-LABEL: @twoArgsPromotionNest
-; CHECK: [[ORIG:%[a-zA-Z_0-9-]+]] = add nsw i32 %arg1, %arg2
-; CHECK: [[ADD:%[a-zA-Z_0-9-]+]] = add nsw i32 [[ORIG]], [[ORIG]]
-; CHECK: [[SEXT:%[a-zA-Z_0-9-]+]] = sext i32 [[ADD]] to i64
-; CHECK: getelementptr inbounds i8, ptr %base, i64 [[SEXT]]
-; CHECK: ret
+; specific test.
 define i8 @twoArgsPromotionNest(i32 %arg1, i32 %arg2, ptr %base) {
+; CHECK-LABEL: define i8 @twoArgsPromotionNest(
+; CHECK-SAME: i32 [[ARG1:%.*]], i32 [[ARG2:%.*]], ptr [[BASE:%.*]]) {
+; CHECK-NEXT:    [[PROMOTABLEADD1:%.*]] = add nsw i32 [[ARG1]], [[ARG2]]
+; CHECK-NEXT:    [[PROMOTABLEADD2:%.*]] = add nsw i32 [[PROMOTABLEADD1]], [[PROMOTABLEADD1]]
+; CHECK-NEXT:    [[SEXTADD:%.*]] = sext i32 [[PROMOTABLEADD2]] to i64
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[BASE]], i64 [[SEXTADD]]
+; CHECK-NEXT:    [[RES:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT:    ret i8 [[RES]]
+;
   %promotableadd1 = add nsw i32 %arg1, %arg2
-  %promotableadd2 = add nsw i32 %promotableadd1, %promotableadd1 
+  %promotableadd2 = add nsw i32 %promotableadd1, %promotableadd1
   %sextadd = sext i32 %promotableadd2 to i64
   %arrayidx = getelementptr inbounds i8, ptr %base, i64 %sextadd
   %res = load i8, ptr %arrayidx
@@ -270,18 +316,21 @@ define i8 @twoArgsPromotionNest(i32 %arg1, i32 %arg2, ptr %base) {
 ; The matcher first promotes the add, removes the trunc and promotes
 ; the sext of arg1.
 ; Then, the matcher cannot use an addressing mode r + r + r, thus it
-; rolls back. 
-; CHECK-LABEL: @twoArgsNoPromotionRemove
-; CHECK: [[SEXTARG1:%[a-zA-Z_0-9-]+]] = sext i1 %arg1 to i32
-; CHECK: [[TRUNC:%[a-zA-Z_0-9-]+]] = trunc i32 [[SEXTARG1]] to i8
-; CHECK: [[ADD:%[a-zA-Z_0-9-]+]] = add nsw i8 [[TRUNC]], %arg2
-; CHECK: [[SEXT:%[a-zA-Z_0-9-]+]] = sext i8 [[ADD]] to i64
-; CHECK: getelementptr inbounds i8, ptr %base, i64 [[SEXT]]
-; CHECK: ret
+; rolls back.
 define i8 @twoArgsNoPromotionRemove(i1 %arg1, i8 %arg2, ptr %base) {
+; CHECK-LABEL: define i8 @twoArgsNoPromotionRemove(
+; CHECK-SAME: i1 [[ARG1:%.*]], i8 [[ARG2:%.*]], ptr [[BASE:%.*]]) {
+; CHECK-NEXT:    [[SEXTARG1:%.*]] = sext i1 [[ARG1]] to i32
+; CHECK-NEXT:    [[TRUNC:%.*]] = trunc i32 [[SEXTARG1]] to i8
+; CHECK-NEXT:    [[ADD:%.*]] = add nsw i8 [[TRUNC]], [[ARG2]]
+; CHECK-NEXT:    [[SEXTADD:%.*]] = sext i8 [[ADD]] to i64
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[BASE]], i64 [[SEXTADD]]
+; CHECK-NEXT:    [[RES:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT:    ret i8 [[RES]]
+;
   %sextarg1 = sext i1 %arg1 to i32
   %trunc = trunc i32 %sextarg1 to i8
-  %add = add nsw i8 %trunc, %arg2 
+  %add = add nsw i8 %trunc, %arg2
   %sextadd = sext i8 %add to i64
   %arrayidx = getelementptr inbounds i8, ptr %base, i64 %sextadd
   %res = load i8, ptr %arrayidx
@@ -301,29 +350,40 @@ define i8 @twoArgsNoPromotionRemove(i1 %arg1, i8 %arg2, ptr %base) {
 ; Check that we did not promote anything in the final matching.
 ;
 ; <rdar://problem/16020230>
-; CHECK-LABEL: @checkProfitability
-; CHECK-NOT: {{%[a-zA-Z_0-9-]+}} = sext i32 %arg1 to i64
-; CHECK-NOT: {{%[a-zA-Z_0-9-]+}} = sext i32 %arg2 to i64
-; CHECK: [[SHL:%[a-zA-Z_0-9-]+]] = shl nsw i32 %arg1, 1
-; CHECK: [[ADD:%[a-zA-Z_0-9-]+]] = add nsw i32 [[SHL]], %arg2
-; CHECK: [[SEXTADD:%[a-zA-Z_0-9-]+]] = sext i32 [[ADD]] to i64
 ; BB then
-; CHECK: [[BASE1:%[a-zA-Z_0-9-]+]] = inttoptr i64 [[SEXTADD]] to ptr
-; CHECK: [[FULL1:%[a-zA-Z_0-9-]+]] = getelementptr i8, ptr [[BASE1]], i64 48
-; CHECK: load i32, ptr [[FULL1]]
 ; BB else
-; CHECK: [[BASE2:%[a-zA-Z_0-9-]+]] = inttoptr i64 [[SEXTADD]] to ptr
-; CHECK: [[FULL2:%[a-zA-Z_0-9-]+]] = getelementptr i8, ptr [[BASE2]], i64 48
-; CHECK: load i32, ptr [[FULL2]]
-; CHECK: ret
 define i32 @checkProfitability(i32 %arg1, i32 %arg2, i1 %test) {
+; CHECK-LABEL: define i32 @checkProfitability(
+; CHECK-SAME: i32 [[ARG1:%.*]], i32 [[ARG2:%.*]], i1 [[TEST:%.*]]) {
+; CHECK-NEXT:    [[SHL:%.*]] = shl nsw i32 [[ARG1]], 1
+; CHECK-NEXT:    [[ADD1:%.*]] = add nsw i32 [[SHL]], [[ARG2]]
+; CHECK-NEXT:    [[SEXTIDX1:%.*]] = sext i32 [[ADD1]] to i64
+; CHECK-NEXT:    br i1 [[TEST]], label %[[THEN:.*]], label %[[ELSE:.*]]
+; CHECK:       [[THEN]]:
+; CHECK-NEXT:    [[SUNKADDR:%.*]] = inttoptr i64 [[SEXTIDX1]] to ptr
+; CHECK-NEXT:    [[SUNKADDR13:%.*]] = getelementptr i8, ptr [[SUNKADDR]], i64 48
+; CHECK-NEXT:    [[RES1:%.*]] = load i32, ptr [[SUNKADDR13]], align 4
+; CHECK-NEXT:    br label %[[END:.*]]
+; CHECK:       [[ELSE]]:
+; CHECK-NEXT:    [[SUNKADDR17:%.*]] = inttoptr i64 [[SEXTIDX1]] to ptr
+; CHECK-NEXT:    [[SUNKADDR18:%.*]] = getelementptr i8, ptr [[SUNKADDR17]], i64 48
+; CHECK-NEXT:    [[RES2:%.*]] = load i32, ptr [[SUNKADDR18]], align 4
+; CHECK-NEXT:    br label %[[END]]
+; CHECK:       [[END]]:
+; CHECK-NEXT:    [[TMP:%.*]] = phi i32 [ [[RES1]], %[[THEN]] ], [ [[RES2]], %[[ELSE]] ]
+; CHECK-NEXT:    [[TMP1:%.*]] = trunc i64 [[SEXTIDX1]] to i32
+; CHECK-NEXT:    [[RES:%.*]] = add i32 [[TMP]], [[TMP1]]
+; CHECK-NEXT:    [[ADDR:%.*]] = inttoptr i32 [[RES]] to ptr
+; CHECK-NEXT:    [[FINAL:%.*]] = load i32, ptr [[ADDR]], align 4
+; CHECK-NEXT:    ret i32 [[FINAL]]
+;
   %shl = shl nsw i32 %arg1, 1
   %add1 = add nsw i32 %shl, %arg2
   %sextidx1 = sext i32 %add1 to i64
   %tmpptr = inttoptr i64 %sextidx1 to ptr
   %arrayidx1 = getelementptr i32, ptr %tmpptr, i64 12
   br i1 %test, label %then, label %else
-then: 
+then:
   %res1 = load i32, ptr %arrayidx1
   br label %end
 else:
@@ -346,15 +406,47 @@ end:
 ; We used to crash on this function because we did not return the right
 ; promoted instruction for %conv.i.
 ; Make sure we generate the right code now.
-; CHECK-LABEL: @fn3
 ; %conv.i is used twice and only one of its use is being promoted.
 ; Use it at the starting point for the matching.
-; CHECK: %conv.i = zext i16 [[PLAIN_OPND:%[.a-zA-Z_0-9-]+]] to i32
-; CHECK-NEXT: [[PROMOTED_CONV:%[.a-zA-Z_0-9-]+]] = zext i16 [[PLAIN_OPND]] to i64
-; CHECK-NEXT: [[ADD:%[a-zA-Z_0-9-]+]] = getelementptr i8, ptr %P, i64 [[PROMOTED_CONV]]
-; CHECK-NEXT: [[ADDR:%[a-zA-Z_0-9-]+]] = getelementptr i8, ptr [[ADD]], i64 7
-; CHECK-NEXT: load i8, ptr [[ADDR]], align 1
 define signext i16 @fn3(ptr nocapture readonly %P) {
+; CHECK-LABEL: define signext i16 @fn3(
+; CHECK-SAME: ptr nocapture readonly [[P:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    br label %[[WHILE_BODY_I_I:.*]]
+; CHECK:       [[WHILE_BODY_I_I]]:
+; CHECK-NEXT:    [[SRC_ADDR_0_I_I:%.*]] = phi i16 [ 0, %[[ENTRY]] ], [ [[INC_I_I:%.*]], %[[WHILE_BODY_I_I]] ]
+; CHECK-NEXT:    [[INC_I_I]] = add i16 [[SRC_ADDR_0_I_I]], 1
+; CHECK-NEXT:    [[IDXPROM_I_I:%.*]] = sext i16 [[SRC_ADDR_0_I_I]] to i64
+; CHECK-NEXT:    [[SUNKADDR:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[IDXPROM_I_I]]
+; CHECK-NEXT:    [[SUNKADDR2:%.*]] = getelementptr inbounds i8, ptr [[SUNKADDR]], i64 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load i8, ptr [[SUNKADDR2]], align 1
+; CHECK-NEXT:    [[CONV2_I_I:%.*]] = zext i8 [[TMP1]] to i32
+; CHECK-NEXT:    [[AND_I_I:%.*]] = and i32 [[CONV2_I_I]], 15
+; CHECK-NEXT:    store i32 [[AND_I_I]], ptr @a, align 4
+; CHECK-NEXT:    [[TOBOOL_I_I:%.*]] = icmp eq i32 [[AND_I_I]], 0
+; CHECK-NEXT:    br i1 [[TOBOOL_I_I]], label %[[WHILE_BODY_I_I]], label %[[FN1_EXIT_I:.*]]
+; CHECK:       [[FN1_EXIT_I]]:
+; CHECK-NEXT:    [[CONV_I:%.*]] = zext i16 [[INC_I_I]] to i32
+; CHECK-NEXT:    [[PROMOTED4:%.*]] = zext i16 [[INC_I_I]] to i64
+; CHECK-NEXT:    [[SUNKADDR5:%.*]] = getelementptr i8, ptr [[P]], i64 [[PROMOTED4]]
+; CHECK-NEXT:    [[SUNKADDR6:%.*]] = getelementptr i8, ptr [[SUNKADDR5]], i64 7
+; CHECK-NEXT:    [[TMP2:%.*]] = load i8, ptr [[SUNKADDR6]], align 1
+; CHECK-NEXT:    [[CONV2_I:%.*]] = sext i8 [[TMP2]] to i16
+; CHECK-NEXT:    store i16 [[CONV2_I]], ptr @b, align 2
+; CHECK-NEXT:    [[SUB4_I:%.*]] = sub nsw i32 0, [[CONV_I]]
+; CHECK-NEXT:    [[CONV5_I:%.*]] = zext i16 [[CONV2_I]] to i32
+; CHECK-NEXT:    [[CMP_I:%.*]] = icmp sgt i32 [[CONV5_I]], [[SUB4_I]]
+; CHECK-NEXT:    br i1 [[CMP_I]], label %[[IF_THEN_I:.*]], label %[[FN2_EXIT:.*]]
+; CHECK:       [[IF_THEN_I]]:
+; CHECK-NEXT:    [[END_I:%.*]] = getelementptr inbounds [[STRUCT_DNS_PACKET:%.*]], ptr [[P]], i64 0, i32 1
+; CHECK-NEXT:    [[TMP3:%.*]] = load i32, ptr [[END_I]], align 4
+; CHECK-NEXT:    [[SUB7_I:%.*]] = add i32 [[TMP3]], 65535
+; CHECK-NEXT:    [[CONV8_I:%.*]] = trunc i32 [[SUB7_I]] to i16
+; CHECK-NEXT:    br label %[[FN2_EXIT]]
+; CHECK:       [[FN2_EXIT]]:
+; CHECK-NEXT:    [[RETVAL_0_I:%.*]] = phi i16 [ [[CONV8_I]], %[[IF_THEN_I]] ], [ undef, %[[FN1_EXIT_I]] ]
+; CHECK-NEXT:    ret i16 [[RETVAL_0_I]]
+;
 entry:
   %tmp = getelementptr inbounds %struct.dns_packet, ptr %P, i64 0, i32 2
   br label %while.body.i.i
@@ -399,13 +491,16 @@ fn2.exit:                                         ; preds = %if.then.i, %fn1.exi
 
 ; Check that we do not promote an extension if the non-wrapping flag does not
 ; match the kind of the extension.
-; CHECK-LABEL: @noPromotionFlag
-; CHECK: [[ADD:%[a-zA-Z_0-9-]+]] = add nsw i32 %arg1, %arg2
-; CHECK: [[PROMOTED:%[a-zA-Z_0-9-]+]] = zext i32 [[ADD]] to i64
-; CHECK: inttoptr i64 [[PROMOTED]] to ptr
-; CHECK: ret
 define i8 @noPromotionFlag(i32 %arg1, i32 %arg2) {
-  %add = add nsw i32 %arg1, %arg2 
+; CHECK-LABEL: define i8 @noPromotionFlag(
+; CHECK-SAME: i32 [[ARG1:%.*]], i32 [[ARG2:%.*]]) {
+; CHECK-NEXT:    [[ADD:%.*]] = add nsw i32 [[ARG1]], [[ARG2]]
+; CHECK-NEXT:    [[ZEXTADD:%.*]] = zext i32 [[ADD]] to i64
+; CHECK-NEXT:    [[BASE:%.*]] = inttoptr i64 [[ZEXTADD]] to ptr
+; CHECK-NEXT:    [[RES:%.*]] = load i8, ptr [[BASE]], align 1
+; CHECK-NEXT:    ret i8 [[RES]]
+;
+  %add = add nsw i32 %arg1, %arg2
   %zextadd = zext i32 %add to i64
   %base = inttoptr i64 %zextadd to ptr
   %res = load i8, ptr %base
@@ -413,14 +508,17 @@ define i8 @noPromotionFlag(i32 %arg1, i32 %arg2) {
 }
 
 ; Check that we correctly promote both operands of the promotable add with zext.
-; CHECK-LABEL: @twoArgsPromotionZExt
-; CHECK: [[ARG1ZEXT:%[a-zA-Z_0-9-]+]] = zext i32 %arg1 to i64
-; CHECK: [[ARG2ZEXT:%[a-zA-Z_0-9-]+]] = zext i32 %arg2 to i64
-; CHECK: [[PROMOTED:%[a-zA-Z_0-9-]+]] = add nuw i64 [[ARG1ZEXT]], [[ARG2ZEXT]]
-; CHECK: inttoptr i64 [[PROMOTED]] to ptr
-; CHECK: ret
 define i8 @twoArgsPromotionZExt(i32 %arg1, i32 %arg2) {
-  %add = add nuw i32 %arg1, %arg2 
+; CHECK-LABEL: define i8 @twoArgsPromotionZExt(
+; CHECK-SAME: i32 [[ARG1:%.*]], i32 [[ARG2:%.*]]) {
+; CHECK-NEXT:    [[PROMOTED:%.*]] = zext i32 [[ARG1]] to i64
+; CHECK-NEXT:    [[PROMOTED2:%.*]] = zext i32 [[ARG2]] to i64
+; CHECK-NEXT:    [[ADD:%.*]] = add nuw i64 [[PROMOTED]], [[PROMOTED2]]
+; CHECK-NEXT:    [[BASE:%.*]] = inttoptr i64 [[ADD]] to ptr
+; CHECK-NEXT:    [[RES:%.*]] = load i8, ptr [[BASE]], align 1
+; CHECK-NEXT:    ret i8 [[RES]]
+;
+  %add = add nuw i32 %arg1, %arg2
   %zextadd = zext i32 %add to i64
   %base = inttoptr i64 %zextadd to ptr
   %res = load i8, ptr %base
@@ -428,13 +526,16 @@ define i8 @twoArgsPromotionZExt(i32 %arg1, i32 %arg2) {
 }
 
 ; Check that we correctly promote constant arguments.
-; CHECK-LABEL: @oneArgPromotionNegativeCstZExt
-; CHECK: [[ARG1ZEXT:%[a-zA-Z_0-9-]+]] = zext i8 %arg1 to i64
-; CHECK: [[PROMOTED:%[a-zA-Z_0-9-]+]] = add nuw i64 [[ARG1ZEXT]], 255
-; CHECK: getelementptr inbounds i8, ptr %base, i64 [[PROMOTED]]
-; CHECK: ret
 define i8 @oneArgPromotionNegativeCstZExt(i8 %arg1, ptr %base) {
-  %add = add nuw i8 %arg1, -1 
+; CHECK-LABEL: define i8 @oneArgPromotionNegativeCstZExt(
+; CHECK-SAME: i8 [[ARG1:%.*]], ptr [[BASE:%.*]]) {
+; CHECK-NEXT:    [[PROMOTED:%.*]] = zext i8 [[ARG1]] to i64
+; CHECK-NEXT:    [[ADD:%.*]] = add nuw i64 [[PROMOTED]], 255
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[BASE]], i64 [[ADD]]
+; CHECK-NEXT:    [[RES:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT:    ret i8 [[RES]]
+;
+  %add = add nuw i8 %arg1, -1
   %zextadd = zext i8 %add to i64
   %arrayidx = getelementptr inbounds i8, ptr %base, i64 %zextadd
   %res = load i8, ptr %arrayidx
@@ -442,14 +543,17 @@ define i8 @oneArgPromotionNegativeCstZExt(i8 %arg1, ptr %base) {
 }
 
 ; Check that we are able to merge two zero extensions.
-; CHECK-LABEL: @oneArgPromotionZExtZExt
-; CHECK: [[ARG1ZEXT:%[a-zA-Z_0-9-]+]] = zext i8 %arg1 to i64
-; CHECK: [[PROMOTED:%[a-zA-Z_0-9-]+]] = add nuw i64 [[ARG1ZEXT]], 1
-; CHECK: getelementptr inbounds i8, ptr %base, i64 [[PROMOTED]]
-; CHECK: ret
 define i8 @oneArgPromotionZExtZExt(i8 %arg1, ptr %base) {
+; CHECK-LABEL: define i8 @oneArgPromotionZExtZExt(
+; CHECK-SAME: i8 [[ARG1:%.*]], ptr [[BASE:%.*]]) {
+; CHECK-NEXT:    [[PROMOTED2:%.*]] = zext i8 [[ARG1]] to i64
+; CHECK-NEXT:    [[ADD:%.*]] = add nuw i64 [[PROMOTED2]], 1
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[BASE]], i64 [[ADD]]
+; CHECK-NEXT:    [[RES:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT:    ret i8 [[RES]]
+;
   %zext = zext i8 %arg1 to i32
-  %add = add nuw i32 %zext, 1 
+  %add = add nuw i32 %zext, 1
   %zextadd = zext i32 %add to i64
   %arrayidx = getelementptr inbounds i8, ptr %base, i64 %zextadd
   %res = load i8, ptr %arrayidx
@@ -458,17 +562,20 @@ define i8 @oneArgPromotionZExtZExt(i8 %arg1, ptr %base) {
 
 ; Check that we do not promote truncate when the dropped bits
 ; are of a different kind.
-; CHECK-LABEL: @oneArgPromotionBlockTruncZExt
-; CHECK: [[ARG1SEXT:%[a-zA-Z_0-9-]+]] = sext i1 %arg1 to i32
-; CHECK: [[ARG1TRUNC:%[a-zA-Z_0-9-]+]] = trunc i32 [[ARG1SEXT]] to i8
-; CHECK: [[ARG1ZEXT:%[a-zA-Z_0-9-]+]] = zext i8 [[ARG1TRUNC]] to i64
-; CHECK: [[PROMOTED:%[a-zA-Z_0-9-]+]] = add nuw i64 [[ARG1ZEXT]], 1
-; CHECK: getelementptr inbounds i8, ptr %base, i64 [[PROMOTED]]
-; CHECK: ret
 define i8 @oneArgPromotionBlockTruncZExt(i1 %arg1, ptr %base) {
+; CHECK-LABEL: define i8 @oneArgPromotionBlockTruncZExt(
+; CHECK-SAME: i1 [[ARG1:%.*]], ptr [[BASE:%.*]]) {
+; CHECK-NEXT:    [[SEXTARG1:%.*]] = sext i1 [[ARG1]] to i32
+; CHECK-NEXT:    [[TRUNC:%.*]] = trunc i32 [[SEXTARG1]] to i8
+; CHECK-NEXT:    [[PROMOTED:%.*]] = zext i8 [[TRUNC]] to i64
+; CHECK-NEXT:    [[ADD:%.*]] = add nuw i64 [[PROMOTED]], 1
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[BASE]], i64 [[ADD]]
+; CHECK-NEXT:    [[RES:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT:    ret i8 [[RES]]
+;
   %sextarg1 = sext i1 %arg1 to i32
   %trunc = trunc i32 %sextarg1 to i8
-  %add = add nuw i8 %trunc, 1 
+  %add = add nuw i8 %trunc, 1
   %zextadd = zext i8 %add to i64
   %arrayidx = getelementptr inbounds i8, ptr %base, i64 %zextadd
   %res = load i8, ptr %arrayidx
@@ -477,15 +584,18 @@ define i8 @oneArgPromotionBlockTruncZExt(i1 %arg1, ptr %base) {
 
 ; Check that we are able to promote truncate when we know all the bits
 ; that are dropped.
-; CHECK-LABEL: @oneArgPromotionPassTruncZExt
-; CHECK: [[ARG1ZEXT:%[a-zA-Z_0-9-]+]] = zext i1 %arg1 to i64
-; CHECK: [[PROMOTED:%[a-zA-Z_0-9-]+]] = add nuw i64 [[ARG1ZEXT]], 1
-; CHECK: getelementptr inbounds i8, ptr %base, i64 [[PROMOTED]]
-; CHECK: ret
 define i8 @oneArgPromotionPassTruncZExt(i1 %arg1, ptr %base) {
+; CHECK-LABEL: define i8 @oneArgPromotionPassTruncZExt(
+; CHECK-SAME: i1 [[ARG1:%.*]], ptr [[BASE:%.*]]) {
+; CHECK-NEXT:    [[PROMOTED2:%.*]] = zext i1 [[ARG1]] to i64
+; CHECK-NEXT:    [[ADD:%.*]] = add nuw i64 [[PROMOTED2]], 1
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[BASE]], i64 [[ADD]]
+; CHECK-NEXT:    [[RES:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT:    ret i8 [[RES]]
+;
   %sextarg1 = zext i1 %arg1 to i32
   %trunc = trunc i32 %sextarg1 to i8
-  %add = add nuw i8 %trunc, 1 
+  %add = add nuw i8 %trunc, 1
   %zextadd = zext i8 %add to i64
   %arrayidx = getelementptr inbounds i8, ptr %base, i64 %zextadd
   %res = load i8, ptr %arrayidx
@@ -493,15 +603,18 @@ define i8 @oneArgPromotionPassTruncZExt(i1 %arg1, ptr %base) {
 }
 
 ; Check that we do not promote sext with zext.
-; CHECK-LABEL: @oneArgPromotionBlockSExtZExt
-; CHECK: [[ARG1SEXT:%[a-zA-Z_0-9-]+]] = sext i1 %arg1 to i8
-; CHECK: [[ARG1ZEXT:%[a-zA-Z_0-9-]+]] = zext i8 [[ARG1SEXT]] to i64
-; CHECK: [[PROMOTED:%[a-zA-Z_0-9-]+]] = add nuw i64 [[ARG1ZEXT]], 1
-; CHECK: getelementptr inbounds i8, ptr %base, i64 [[PROMOTED]]
-; CHECK: ret
 define i8 @oneArgPromotionBlockSExtZExt(i1 %arg1, ptr %base) {
+; CHECK-LABEL: define i8 @oneArgPromotionBlockSExtZExt(
+; CHECK-SAME: i1 [[ARG1:%.*]], ptr [[BASE:%.*]]) {
+; CHECK-NEXT:    [[SEXTARG1:%.*]] = sext i1 [[ARG1]] to i8
+; CHECK-NEXT:    [[PROMOTED:%.*]] = zext i8 [[SEXTARG1]] to i64
+; CHECK-NEXT:    [[ADD:%.*]] = add nuw i64 [[PROMOTED]], 1
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[BASE]], i64 [[ADD]]
+; CHECK-NEXT:    [[RES:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT:    ret i8 [[RES]]
+;
   %sextarg1 = sext i1 %arg1 to i8
-  %add = add nuw i8 %sextarg1, 1 
+  %add = add nuw i8 %sextarg1, 1
   %zextadd = zext i8 %add to i64
   %arrayidx = getelementptr inbounds i8, ptr %base, i64 %zextadd
   %res = load i8, ptr %arrayidx
diff --git a/llvm/test/CodeGen/X86/codegen-prepare-replacephi.mir b/llvm/test/CodeGen/X86/codegen-prepare-replacephi.mir
index aceb344..13f3f3a 100644
--- a/llvm/test/CodeGen/X86/codegen-prepare-replacephi.mir
+++ b/llvm/test/CodeGen/X86/codegen-prepare-replacephi.mir
@@ -6,7 +6,7 @@
 # "Replacement PHI node is already replaced."
 
 --- |
-  define void @f1() {
+  define void @f1(i1 %arg) {
   entry:
     %arrayidx = getelementptr inbounds [2 x i16], ptr undef, i16 0, i16 2
     br label %for.cond
@@ -30,10 +30,10 @@
     %5 = phi ptr [ %4, %for.body ], [ %5, %if.then5 ], [ undef, %for.cond2 ]
     %6 = phi ptr [ %3, %for.body ], [ %6, %if.then5 ], [ undef, %for.cond2 ]
     %7 = phi ptr [ %2, %for.body ], [ %6, %if.then5 ], [ undef, %for.cond2 ]
-    br i1 undef, label %for.cond2, label %if.then5
+    br i1 %arg, label %for.cond2, label %if.then5
 
   if.then5:
-    br i1 undef, label %cleanup, label %for.cond2
+    br i1 %arg, label %cleanup, label %for.cond2
 
   cleanup:
     br i1 true, label %for.cond, label %for.body
diff --git a/llvm/test/CodeGen/X86/codegen-prepare-replacephi2.mir b/llvm/test/CodeGen/X86/codegen-prepare-replacephi2.mir
index 6159aa8a..e93e04b 100644
--- a/llvm/test/CodeGen/X86/codegen-prepare-replacephi2.mir
+++ b/llvm/test/CodeGen/X86/codegen-prepare-replacephi2.mir
@@ -7,7 +7,7 @@
 
 --- |
 
-  define void @f1() {
+  define void @f1(i1 %arg) {
   entry:
     %arrayidx = getelementptr inbounds [2 x i16], ptr undef, i16 0, i16 2
     br label %for.cond
@@ -24,7 +24,7 @@
     %2 = phi ptr [ %1, %for.cond ], [ %12, %cleanup ]
     %3 = phi ptr [ %0, %for.cond ], [ %11, %cleanup ]
     %4 = phi ptr [ %0, %for.cond ], [ %10, %cleanup ]
-    br i1 undef, label %for.cond2.preheader, label %if.then
+    br i1 %arg, label %for.cond2.preheader, label %if.then
 
   for.cond2.preheader:
     br label %for.cond2
@@ -37,7 +37,7 @@
     %5 = phi ptr [ %8, %for.inc ], [ %4, %for.cond2.preheader ]
     %6 = phi ptr [ %9, %for.inc ], [ %3, %for.cond2.preheader ]
     %7 = phi ptr [ %9, %for.inc ], [ %2, %for.cond2.preheader ]
-    br i1 undef, label %for.inc, label %if.then5
+    br i1 %arg, label %for.inc, label %if.then5
 
   if.then5:
     br i1 true, label %cleanup.loopexit, label %if.end
diff --git a/llvm/test/CodeGen/X86/combine-concatvectors.ll b/llvm/test/CodeGen/X86/combine-concatvectors.ll
index 230afd1..7237b02 100644
--- a/llvm/test/CodeGen/X86/combine-concatvectors.ll
+++ b/llvm/test/CodeGen/X86/combine-concatvectors.ll
@@ -72,7 +72,7 @@ alloca_0:
   br label %loop.4942
 
 loop.4942:                                        ; preds = %loop.4942, %alloca_0
-  br i1 undef, label %loop.4942, label %ifmerge.1298
+  br i1 poison, label %loop.4942, label %ifmerge.1298
 
 ifmerge.1298:                                     ; preds = %loop.4942
   %gepload4638 = load float, ptr getelementptr inbounds ([49216 x i8], ptr @qa_, i64 0, i64 28324), align 4
diff --git a/llvm/test/CodeGen/X86/crash.ll b/llvm/test/CodeGen/X86/crash.ll
index 16e3bb6..2f49a60 100644
--- a/llvm/test/CodeGen/X86/crash.ll
+++ b/llvm/test/CodeGen/X86/crash.ll
@@ -115,9 +115,9 @@ do.body92:                                        ; preds = %if.then66
 ; Crash during XOR optimization.
 ; <rdar://problem/7869290>
 
-define void @test7() nounwind ssp {
+define void @test7(i1 %arg) nounwind ssp {
 entry:
-  br i1 undef, label %bb14, label %bb67
+  br i1 %arg, label %bb14, label %bb67
 
 bb14:
   %tmp0 = trunc i16 undef to i1
@@ -157,14 +157,14 @@ entry:
 ; shift of and.
 %struct.S0 = type { i8, [2 x i8], i8 }
 
-define void @func_59(i32 %p_63) noreturn nounwind {
+define void @func_59(i32 %p_63, i1 %arg) noreturn nounwind {
 entry:
   br label %for.body
 
 for.body:                                         ; preds = %for.inc44, %entry
   %p_63.addr.1 = phi i32 [ %p_63, %entry ], [ 0, %for.inc44 ]
   %l_74.0 = phi i32 [ 0, %entry ], [ %add46, %for.inc44 ]
-  br i1 undef, label %for.inc44, label %bb.nph81
+  br i1 %arg, label %for.inc44, label %bb.nph81
 
 bb.nph81:                                         ; preds = %for.body
   %tmp98 = add i32 %p_63.addr.1, 0
@@ -237,7 +237,7 @@ declare i64 @llvm.objectsize.i64.p0(ptr, i1) nounwind readnone
 %t20 = type { i32, i32 }
 %t21 = type { ptr }
 
-define void @_ZNK4llvm17MipsFrameLowering12emitPrologueERNS_15MachineFunctionE() ssp align 2 {
+define void @_ZNK4llvm17MipsFrameLowering12emitPrologueERNS_15MachineFunctionE(i1 %arg) ssp align 2 {
 bb:
   %tmp = load ptr, ptr undef, align 4
   %tmp3 = getelementptr inbounds %t9, ptr %tmp, i32 0, i32 0, i32 0, i32 0, i32 1
@@ -246,7 +246,7 @@ bb:
 bb4:                                              ; preds = %bb37, %bb
   %tmp5 = phi i96 [ undef, %bb ], [ %tmp38, %bb37 ]
   %tmp6 = phi i96 [ undef, %bb ], [ %tmp39, %bb37 ]
-  br i1 undef, label %bb34, label %bb7
+  br i1 %arg, label %bb34, label %bb7
 
 bb7:                                              ; preds = %bb4
   %tmp8 = load i32, ptr undef, align 4
@@ -292,7 +292,7 @@ bb33:                                             ; preds = %bb29
   unreachable
 
 bb34:                                             ; preds = %bb4
-  br i1 undef, label %bb36, label %bb35
+  br i1 %arg, label %bb36, label %bb35
 
 bb35:                                             ; preds = %bb34
   store ptr null, ptr %tmp3, align 4
@@ -319,7 +319,7 @@ declare void @llvm.lifetime.end.p0(i64, ptr nocapture) nounwind
 
 ; PR10463
 ; Spilling a virtual register with <undef> uses.
-define void @autogen_239_1000() {
+define void @autogen_239_1000(i1 %arg) {
 BB:
     %Shuff = shufflevector <8 x double> undef, <8 x double> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 undef, i32 undef>
     br label %CF
@@ -327,14 +327,14 @@ BB:
 CF:
     %B16 = frem <8 x double> zeroinitializer, %Shuff
     %E19 = extractelement <8 x double> %Shuff, i32 5
-    br i1 undef, label %CF, label %CF75
+    br i1 %arg, label %CF, label %CF75
 
 CF75:
-    br i1 undef, label %CF75, label %CF76
+    br i1 %arg, label %CF75, label %CF76
 
 CF76:
     store double %E19, ptr undef
-    br i1 undef, label %CF76, label %CF77
+    br i1 %arg, label %CF76, label %CF77
 
 CF77:
     %B55 = fmul <8 x double> %B16, undef
@@ -396,24 +396,24 @@ if.end:
 ; InstrEmitter::EmitSubregNode() may steal virtual registers from already
 ; emitted blocks when isCoalescableExtInstr points out the opportunity.
 ; Make sure kill flags are cleared on the newly global virtual register.
-define i64 @ov_read(ptr %vf, ptr nocapture %buffer, i32 %length, i32 %bigendianp, i32 %word, i32 %sgned, ptr %bitstream) nounwind uwtable ssp {
+define i64 @ov_read(ptr %vf, ptr nocapture %buffer, i32 %length, i32 %bigendianp, i32 %word, i32 %sgned, ptr %bitstream, i1 %arg) nounwind uwtable ssp {
 entry:
-  br i1 undef, label %return, label %while.body.preheader
+  br i1 %arg, label %return, label %while.body.preheader
 
 while.body.preheader:                             ; preds = %entry
-  br i1 undef, label %if.then3, label %if.end7
+  br i1 %arg, label %if.then3, label %if.end7
 
 if.then3:                                         ; preds = %while.body.preheader
   %0 = load i32, ptr undef, align 4
-  br i1 undef, label %land.lhs.true.i255, label %if.end7
+  br i1 %arg, label %land.lhs.true.i255, label %if.end7
 
 land.lhs.true.i255:                               ; preds = %if.then3
-  br i1 undef, label %if.then.i256, label %if.end7
+  br i1 %arg, label %if.then.i256, label %if.end7
 
 if.then.i256:                                     ; preds = %land.lhs.true.i255
   %sub.i = sub i32 0, %0
   %conv = sext i32 %sub.i to i64
-  br i1 undef, label %if.end7, label %while.end
+  br i1 %arg, label %if.end7, label %while.end
 
 if.end7:                                          ; preds = %if.then.i256, %land.lhs.true.i255, %if.then3, %while.body.preheader
   unreachable
@@ -486,12 +486,12 @@ declare void @fn3(...)
 ; When coalescing %1 and %2, the IMPLICIT_DEF instruction should be
 ; erased along with its value number.
 ;
-define void @rdar12474033() nounwind ssp {
+define void @rdar12474033(i1 %arg, i32 %arg2, i32 %arg3, i32 %arg4) nounwind ssp {
 bb:
-  br i1 undef, label %bb21, label %bb1
+  br i1 %arg, label %bb21, label %bb1
 
 bb1:                                              ; preds = %bb
-  switch i32 undef, label %bb10 [
+  switch i32 %arg2, label %bb10 [
     i32 4, label %bb2
     i32 1, label %bb9
     i32 5, label %bb3
@@ -503,7 +503,7 @@ bb2:                                              ; preds = %bb1
   unreachable
 
 bb3:                                              ; preds = %bb1, %bb1
-  br i1 undef, label %bb4, label %bb5
+  br i1 %arg, label %bb4, label %bb5
 
 bb4:                                              ; preds = %bb3
   unreachable
@@ -521,7 +521,7 @@ bb9:                                              ; preds = %bb1, %bb1
 bb10:                                             ; preds = %bb5, %bb1
   %tmp11 = phi i128 [ undef, %bb1 ], [ %tmp6, %bb5 ]
   %tmp12 = phi i128 [ 0, %bb1 ], [ %tmp8, %bb5 ]
-  switch i32 undef, label %bb21 [
+  switch i32 %arg3, label %bb21 [
     i32 2, label %bb18
     i32 3, label %bb13
     i32 5, label %bb16
@@ -530,7 +530,7 @@ bb10:                                             ; preds = %bb5, %bb1
   ]
 
 bb13:                                             ; preds = %bb10
-  br i1 undef, label %bb15, label %bb14
+  br i1 %arg, label %bb15, label %bb14
 
 bb14:                                             ; preds = %bb13
   br label %bb21
@@ -554,7 +554,7 @@ bb21:                                             ; preds = %bb18, %bb14, %bb10,
   %tmp23 = phi <4 x float> [ undef, %bb ], [ undef, %bb10 ], [ undef, %bb14 ], [ %tmp19, %bb18 ]
   store <4 x float> %tmp23, ptr undef, align 16
   store <4 x float> %tmp22, ptr undef, align 16
-  switch i32 undef, label %bb29 [
+  switch i32 %arg4, label %bb29 [
     i32 5, label %bb27
     i32 1, label %bb24
     i32 2, label %bb25
diff --git a/llvm/test/CodeGen/X86/domain-reassignment-test.ll b/llvm/test/CodeGen/X86/domain-reassignment-test.ll
index af7aca6..77c1ef2 100644
--- a/llvm/test/CodeGen/X86/domain-reassignment-test.ll
+++ b/llvm/test/CodeGen/X86/domain-reassignment-test.ll
@@ -3,7 +3,7 @@
 
 ; Check that the X86 domain reassignment pass doesn't introduce an illegal
 ; test instruction. See PR37396
-define void @japi1_foo2_34617() {
+define void @japi1_foo2_34617(i1 %arg) {
 pass2:
   br label %if5
 
@@ -27,7 +27,7 @@ if5:
   %tmp120 = and i1 %tmp118, %tmp119
   %tmp121 = zext i1 %tmp120 to i8
   %tmp122 = and i8 %b.055, %tmp121
-  br i1 undef, label %L174, label %if5
+  br i1 %arg, label %L174, label %if5
 
 L188:
   unreachable
diff --git a/llvm/test/CodeGen/X86/fast-isel-cmp-branch.ll b/llvm/test/CodeGen/X86/fast-isel-cmp-branch.ll
index 8d8d4fa..4a5cddb 100644
--- a/llvm/test/CodeGen/X86/fast-isel-cmp-branch.ll
+++ b/llvm/test/CodeGen/X86/fast-isel-cmp-branch.ll
@@ -5,9 +5,9 @@
 ; The machine verifier will catch and complain about this case.
 ; CHECK-LABEL: baz
 ; CHECK: retq
-define void @baz() {
+define void @baz(i1 %arg) {
 entry:
-  br i1 undef, label %exit, label %exit
+  br i1 %arg, label %exit, label %exit
 
 exit:
   ret void
diff --git a/llvm/test/CodeGen/X86/fminimum-fmaximum.ll b/llvm/test/CodeGen/X86/fminimum-fmaximum.ll
index c6da0c5..1dcce53 100644
--- a/llvm/test/CodeGen/X86/fminimum-fmaximum.ll
+++ b/llvm/test/CodeGen/X86/fminimum-fmaximum.ll
@@ -3,6 +3,7 @@
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx      | FileCheck %s --check-prefixes=AVX,AVX1
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f  | FileCheck %s --check-prefixes=AVX,AVX512,AVX512F
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq | FileCheck %s --check-prefixes=AVX,AVX512,AVX512DQ
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx10.2-256 | FileCheck %s --check-prefixes=AVX10_2
 ; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx        | FileCheck %s --check-prefixes=X86
 
 declare float @llvm.maximum.f32(float, float)
@@ -73,6 +74,11 @@ define float @test_fmaximum(float %x, float %y) nounwind {
 ; AVX512-NEXT:    vmovss %xmm1, %xmm0, %xmm0 {%k1}
 ; AVX512-NEXT:    retq
 ;
+; AVX10_2-LABEL: test_fmaximum:
+; AVX10_2:       # %bb.0:
+; AVX10_2-NEXT:    vminmaxss $1, %xmm1, %xmm0
+; AVX10_2-NEXT:    retq
+;
 ; X86-LABEL: test_fmaximum:
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %eax
@@ -110,6 +116,11 @@ define <4 x float> @test_fmaximum_scalarize(<4 x float> %x, <4 x float> %y) "no-
 ; AVX-NEXT:    vmaxps %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    retq
 ;
+; AVX10_2-LABEL: test_fmaximum_scalarize:
+; AVX10_2:       # %bb.0:
+; AVX10_2-NEXT:    vminmaxps $1, %xmm1, %xmm0, %xmm0
+; AVX10_2-NEXT:    retq
+;
 ; X86-LABEL: test_fmaximum_scalarize:
 ; X86:       # %bb.0:
 ; X86-NEXT:    vmaxps %xmm1, %xmm0, %xmm0
@@ -129,6 +140,11 @@ define float @test_fmaximum_nan0(float %x, float %y) {
 ; AVX-NEXT:    vmovss {{.*#+}} xmm0 = [NaN,0.0E+0,0.0E+0,0.0E+0]
 ; AVX-NEXT:    retq
 ;
+; AVX10_2-LABEL: test_fmaximum_nan0:
+; AVX10_2:       # %bb.0:
+; AVX10_2-NEXT:    vmovss {{.*#+}} xmm0 = [NaN,0.0E+0,0.0E+0,0.0E+0]
+; AVX10_2-NEXT:    retq
+;
 ; X86-LABEL: test_fmaximum_nan0:
 ; X86:       # %bb.0:
 ; X86-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}
@@ -148,6 +164,11 @@ define float @test_fmaximum_nan1(float %x, float %y) {
 ; AVX-NEXT:    vmovss {{.*#+}} xmm0 = [NaN,0.0E+0,0.0E+0,0.0E+0]
 ; AVX-NEXT:    retq
 ;
+; AVX10_2-LABEL: test_fmaximum_nan1:
+; AVX10_2:       # %bb.0:
+; AVX10_2-NEXT:    vmovss {{.*#+}} xmm0 = [NaN,0.0E+0,0.0E+0,0.0E+0]
+; AVX10_2-NEXT:    retq
+;
 ; X86-LABEL: test_fmaximum_nan1:
 ; X86:       # %bb.0:
 ; X86-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}
@@ -215,6 +236,13 @@ define float @test_fmaximum_nnan(float %x, float %y) nounwind {
 ; AVX512DQ-NEXT:    vmaxss %xmm1, %xmm0, %xmm0
 ; AVX512DQ-NEXT:    retq
 ;
+; AVX10_2-LABEL: test_fmaximum_nnan:
+; AVX10_2:       # %bb.0:
+; AVX10_2-NEXT:    vaddss %xmm1, %xmm0, %xmm2
+; AVX10_2-NEXT:    vsubss %xmm1, %xmm0, %xmm0
+; AVX10_2-NEXT:    vminmaxss $1, %xmm0, %xmm2
+; AVX10_2-NEXT:    retq
+;
 ; X86-LABEL: test_fmaximum_nnan:
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %eax
@@ -272,6 +300,12 @@ define double @test_fmaximum_zero0(double %x, double %y) nounwind {
 ; AVX512-NEXT:    vmovsd %xmm1, %xmm0, %xmm0 {%k1}
 ; AVX512-NEXT:    retq
 ;
+; AVX10_2-LABEL: test_fmaximum_zero0:
+; AVX10_2:       # %bb.0:
+; AVX10_2-NEXT:    vxorpd %xmm0, %xmm0, %xmm0
+; AVX10_2-NEXT:    vminmaxsd $1, %xmm0, %xmm1
+; AVX10_2-NEXT:    retq
+;
 ; X86-LABEL: test_fmaximum_zero0:
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %ebp
@@ -323,6 +357,12 @@ define double @test_fmaximum_zero1(double %x, double %y) nounwind {
 ; AVX512-NEXT:    vmovapd %xmm1, %xmm0
 ; AVX512-NEXT:    retq
 ;
+; AVX10_2-LABEL: test_fmaximum_zero1:
+; AVX10_2:       # %bb.0:
+; AVX10_2-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
+; AVX10_2-NEXT:    vminmaxsd $1, %xmm1, %xmm0
+; AVX10_2-NEXT:    retq
+;
 ; X86-LABEL: test_fmaximum_zero1:
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %ebp
@@ -354,6 +394,11 @@ define double @test_fmaximum_zero2(double %x, double %y) {
 ; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
 ; AVX-NEXT:    retq
 ;
+; AVX10_2-LABEL: test_fmaximum_zero2:
+; AVX10_2:       # %bb.0:
+; AVX10_2-NEXT:    vxorps %xmm0, %xmm0, %xmm0
+; AVX10_2-NEXT:    retq
+;
 ; X86-LABEL: test_fmaximum_zero2:
 ; X86:       # %bb.0:
 ; X86-NEXT:    fldz
@@ -390,6 +435,11 @@ define float @test_fmaximum_nsz(float %x, float %y) "no-signed-zeros-fp-math"="t
 ; AVX512-NEXT:    vmovaps %xmm1, %xmm0
 ; AVX512-NEXT:    retq
 ;
+; AVX10_2-LABEL: test_fmaximum_nsz:
+; AVX10_2:       # %bb.0:
+; AVX10_2-NEXT:    vminmaxss $1, %xmm1, %xmm0
+; AVX10_2-NEXT:    retq
+;
 ; X86-LABEL: test_fmaximum_nsz:
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %eax
@@ -474,6 +524,12 @@ define float @test_fmaximum_combine_cmps(float %x, float %y) nounwind {
 ; AVX512DQ-NEXT:    vmaxss %xmm2, %xmm0, %xmm0
 ; AVX512DQ-NEXT:    retq
 ;
+; AVX10_2-LABEL: test_fmaximum_combine_cmps:
+; AVX10_2:       # %bb.0:
+; AVX10_2-NEXT:    vdivss %xmm0, %xmm1, %xmm1
+; AVX10_2-NEXT:    vminmaxss $1, %xmm1, %xmm0
+; AVX10_2-NEXT:    retq
+;
 ; X86-LABEL: test_fmaximum_combine_cmps:
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %eax
@@ -562,6 +618,11 @@ define float @test_fminimum(float %x, float %y) nounwind {
 ; AVX512-NEXT:    vmovaps %xmm1, %xmm0
 ; AVX512-NEXT:    retq
 ;
+; AVX10_2-LABEL: test_fminimum:
+; AVX10_2:       # %bb.0:
+; AVX10_2-NEXT:    vminmaxss $0, %xmm1, %xmm0
+; AVX10_2-NEXT:    retq
+;
 ; X86-LABEL: test_fminimum:
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %eax
@@ -599,6 +660,11 @@ define <2 x double> @test_fminimum_scalarize(<2 x double> %x, <2 x double> %y) "
 ; AVX-NEXT:    vminpd %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    retq
 ;
+; AVX10_2-LABEL: test_fminimum_scalarize:
+; AVX10_2:       # %bb.0:
+; AVX10_2-NEXT:    vminmaxpd $0, %xmm1, %xmm0, %xmm0
+; AVX10_2-NEXT:    retq
+;
 ; X86-LABEL: test_fminimum_scalarize:
 ; X86:       # %bb.0:
 ; X86-NEXT:    vminpd %xmm1, %xmm0, %xmm0
@@ -618,6 +684,11 @@ define float @test_fminimum_nan0(float %x, float %y) {
 ; AVX-NEXT:    vmovss {{.*#+}} xmm0 = [NaN,0.0E+0,0.0E+0,0.0E+0]
 ; AVX-NEXT:    retq
 ;
+; AVX10_2-LABEL: test_fminimum_nan0:
+; AVX10_2:       # %bb.0:
+; AVX10_2-NEXT:    vmovss {{.*#+}} xmm0 = [NaN,0.0E+0,0.0E+0,0.0E+0]
+; AVX10_2-NEXT:    retq
+;
 ; X86-LABEL: test_fminimum_nan0:
 ; X86:       # %bb.0:
 ; X86-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}
@@ -637,6 +708,11 @@ define float @test_fminimum_nan1(float %x, float %y) {
 ; AVX-NEXT:    vmovss {{.*#+}} xmm0 = [NaN,0.0E+0,0.0E+0,0.0E+0]
 ; AVX-NEXT:    retq
 ;
+; AVX10_2-LABEL: test_fminimum_nan1:
+; AVX10_2:       # %bb.0:
+; AVX10_2-NEXT:    vmovss {{.*#+}} xmm0 = [NaN,0.0E+0,0.0E+0,0.0E+0]
+; AVX10_2-NEXT:    retq
+;
 ; X86-LABEL: test_fminimum_nan1:
 ; X86:       # %bb.0:
 ; X86-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}
@@ -695,6 +771,11 @@ define double @test_fminimum_nnan(double %x, double %y) "no-nans-fp-math"="true"
 ; AVX512DQ-NEXT:    vminsd %xmm2, %xmm1, %xmm0
 ; AVX512DQ-NEXT:    retq
 ;
+; AVX10_2-LABEL: test_fminimum_nnan:
+; AVX10_2:       # %bb.0:
+; AVX10_2-NEXT:    vminmaxsd $0, %xmm1, %xmm0
+; AVX10_2-NEXT:    retq
+;
 ; X86-LABEL: test_fminimum_nnan:
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %ebp
@@ -749,6 +830,11 @@ define double @test_fminimum_zero0(double %x, double %y) nounwind {
 ; AVX512-NEXT:    vmovsd %xmm1, %xmm0, %xmm0 {%k1}
 ; AVX512-NEXT:    retq
 ;
+; AVX10_2-LABEL: test_fminimum_zero0:
+; AVX10_2:       # %bb.0:
+; AVX10_2-NEXT:    vminmaxsd $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; AVX10_2-NEXT:    retq
+;
 ; X86-LABEL: test_fminimum_zero0:
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %ebp
@@ -796,6 +882,11 @@ define double @test_fminimum_zero1(double %x, double %y) nounwind {
 ; AVX512-NEXT:    vmovapd %xmm1, %xmm0
 ; AVX512-NEXT:    retq
 ;
+; AVX10_2-LABEL: test_fminimum_zero1:
+; AVX10_2:       # %bb.0:
+; AVX10_2-NEXT:    vminmaxsd $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; AVX10_2-NEXT:    retq
+;
 ; X86-LABEL: test_fminimum_zero1:
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %ebp
@@ -826,6 +917,11 @@ define double @test_fminimum_zero2(double %x, double %y) {
 ; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = [-0.0E+0,0.0E+0]
 ; AVX-NEXT:    retq
 ;
+; AVX10_2-LABEL: test_fminimum_zero2:
+; AVX10_2:       # %bb.0:
+; AVX10_2-NEXT:    vmovsd {{.*#+}} xmm0 = [-0.0E+0,0.0E+0]
+; AVX10_2-NEXT:    retq
+;
 ; X86-LABEL: test_fminimum_zero2:
 ; X86:       # %bb.0:
 ; X86-NEXT:    fldz
@@ -863,6 +959,11 @@ define float @test_fminimum_nsz(float %x, float %y) nounwind {
 ; AVX512-NEXT:    vmovaps %xmm1, %xmm0
 ; AVX512-NEXT:    retq
 ;
+; AVX10_2-LABEL: test_fminimum_nsz:
+; AVX10_2:       # %bb.0:
+; AVX10_2-NEXT:    vminmaxss $0, %xmm1, %xmm0
+; AVX10_2-NEXT:    retq
+;
 ; X86-LABEL: test_fminimum_nsz:
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %eax
@@ -948,6 +1049,12 @@ define float @test_fminimum_combine_cmps(float %x, float %y) nounwind {
 ; AVX512DQ-NEXT:    vminss %xmm2, %xmm0, %xmm0
 ; AVX512DQ-NEXT:    retq
 ;
+; AVX10_2-LABEL: test_fminimum_combine_cmps:
+; AVX10_2:       # %bb.0:
+; AVX10_2-NEXT:    vdivss %xmm0, %xmm1, %xmm1
+; AVX10_2-NEXT:    vminmaxss $0, %xmm1, %xmm0
+; AVX10_2-NEXT:    retq
+;
 ; X86-LABEL: test_fminimum_combine_cmps:
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %eax
@@ -1009,6 +1116,11 @@ define <2 x double> @test_fminimum_vector(<2 x double> %x, <2 x double> %y) {
 ; AVX-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
 ; AVX-NEXT:    retq
 ;
+; AVX10_2-LABEL: test_fminimum_vector:
+; AVX10_2:       # %bb.0:
+; AVX10_2-NEXT:    vminmaxpd $0, %xmm1, %xmm0, %xmm0
+; AVX10_2-NEXT:    retq
+;
 ; X86-LABEL: test_fminimum_vector:
 ; X86:       # %bb.0:
 ; X86-NEXT:    vblendvpd %xmm0, %xmm0, %xmm1, %xmm2
@@ -1032,6 +1144,11 @@ define <4 x float> @test_fmaximum_vector(<4 x float> %x, <4 x float> %y) "no-nan
 ; AVX-NEXT:    vmaxps %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    retq
 ;
+; AVX10_2-LABEL: test_fmaximum_vector:
+; AVX10_2:       # %bb.0:
+; AVX10_2-NEXT:    vminmaxps $1, %xmm1, %xmm0, %xmm0
+; AVX10_2-NEXT:    retq
+;
 ; X86-LABEL: test_fmaximum_vector:
 ; X86:       # %bb.0:
 ; X86-NEXT:    vmaxps %xmm1, %xmm0, %xmm0
@@ -1054,6 +1171,12 @@ define <2 x double> @test_fminimum_vector_zero(<2 x double> %x) {
 ; AVX-NEXT:    vminpd %xmm0, %xmm1, %xmm0
 ; AVX-NEXT:    retq
 ;
+; AVX10_2-LABEL: test_fminimum_vector_zero:
+; AVX10_2:       # %bb.0:
+; AVX10_2-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
+; AVX10_2-NEXT:    vminmaxpd $0, %xmm1, %xmm0, %xmm0
+; AVX10_2-NEXT:    retq
+;
 ; X86-LABEL: test_fminimum_vector_zero:
 ; X86:       # %bb.0:
 ; X86-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
@@ -1077,6 +1200,11 @@ define <4 x float> @test_fmaximum_vector_signed_zero(<4 x float> %x) {
 ; AVX-NEXT:    vmaxps %xmm0, %xmm1, %xmm0
 ; AVX-NEXT:    retq
 ;
+; AVX10_2-LABEL: test_fmaximum_vector_signed_zero:
+; AVX10_2:       # %bb.0:
+; AVX10_2-NEXT:    vminmaxps $1, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
+; AVX10_2-NEXT:    retq
+;
 ; X86-LABEL: test_fmaximum_vector_signed_zero:
 ; X86:       # %bb.0:
 ; X86-NEXT:    vbroadcastss {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
@@ -1102,6 +1230,13 @@ define <2 x double> @test_fminimum_vector_partially_zero(<2 x double> %x) {
 ; AVX-NEXT:    vminpd %xmm0, %xmm1, %xmm0
 ; AVX-NEXT:    retq
 ;
+; AVX10_2-LABEL: test_fminimum_vector_partially_zero:
+; AVX10_2:       # %bb.0:
+; AVX10_2-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
+; AVX10_2-NEXT:    vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
+; AVX10_2-NEXT:    vminmaxpd $0, %xmm1, %xmm0, %xmm0
+; AVX10_2-NEXT:    retq
+;
 ; X86-LABEL: test_fminimum_vector_partially_zero:
 ; X86:       # %bb.0:
 ; X86-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
@@ -1149,6 +1284,13 @@ define <2 x double> @test_fminimum_vector_different_zeros(<2 x double> %x) {
 ; AVX-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
 ; AVX-NEXT:    retq
 ;
+; AVX10_2-LABEL: test_fminimum_vector_different_zeros:
+; AVX10_2:       # %bb.0:
+; AVX10_2-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
+; AVX10_2-NEXT:    vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
+; AVX10_2-NEXT:    vminmaxpd $0, %xmm1, %xmm0, %xmm0
+; AVX10_2-NEXT:    retq
+;
 ; X86-LABEL: test_fminimum_vector_different_zeros:
 ; X86:       # %bb.0:
 ; X86-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
@@ -1177,6 +1319,11 @@ define <4 x float> @test_fmaximum_vector_non_zero(<4 x float> %x) {
 ; AVX-NEXT:    vmaxps %xmm0, %xmm1, %xmm0
 ; AVX-NEXT:    retq
 ;
+; AVX10_2-LABEL: test_fmaximum_vector_non_zero:
+; AVX10_2:       # %bb.0:
+; AVX10_2-NEXT:    vminmaxps $1, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX10_2-NEXT:    retq
+;
 ; X86-LABEL: test_fmaximum_vector_non_zero:
 ; X86:       # %bb.0:
 ; X86-NEXT:    vmovaps {{.*#+}} xmm1 = [5.0E+0,4.0E+0,3.0E+0,2.0E+0]
@@ -1206,6 +1353,13 @@ define <2 x double> @test_fminimum_vector_nan(<2 x double> %x) {
 ; AVX-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; AVX-NEXT:    retq
 ;
+; AVX10_2-LABEL: test_fminimum_vector_nan:
+; AVX10_2:       # %bb.0:
+; AVX10_2-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
+; AVX10_2-NEXT:    vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
+; AVX10_2-NEXT:    vminmaxpd $0, %xmm1, %xmm0, %xmm0
+; AVX10_2-NEXT:    retq
+;
 ; X86-LABEL: test_fminimum_vector_nan:
 ; X86:       # %bb.0:
 ; X86-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
@@ -1232,6 +1386,12 @@ define <2 x double> @test_fminimum_vector_zero_first(<2 x double> %x) {
 ; AVX-NEXT:    vminpd %xmm0, %xmm1, %xmm0
 ; AVX-NEXT:    retq
 ;
+; AVX10_2-LABEL: test_fminimum_vector_zero_first:
+; AVX10_2:       # %bb.0:
+; AVX10_2-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
+; AVX10_2-NEXT:    vminmaxpd $0, %xmm1, %xmm0, %xmm0
+; AVX10_2-NEXT:    retq
+;
 ; X86-LABEL: test_fminimum_vector_zero_first:
 ; X86:       # %bb.0:
 ; X86-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
@@ -1260,6 +1420,11 @@ define <2 x double> @test_fminimum_vector_signed_zero(<2 x double> %x) {
 ; AVX-NEXT:    vblendvpd %xmm1, %xmm0, %xmm2, %xmm0
 ; AVX-NEXT:    retq
 ;
+; AVX10_2-LABEL: test_fminimum_vector_signed_zero:
+; AVX10_2:       # %bb.0:
+; AVX10_2-NEXT:    vminmaxpd $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0
+; AVX10_2-NEXT:    retq
+;
 ; X86-LABEL: test_fminimum_vector_signed_zero:
 ; X86:       # %bb.0:
 ; X86-NEXT:    vcmpunordpd %xmm0, %xmm0, %xmm1
@@ -1284,6 +1449,11 @@ define <4 x float> @test_fmaximum_vector_signed_zero_first(<4 x float> %x) {
 ; AVX-NEXT:    vmaxps %xmm0, %xmm1, %xmm0
 ; AVX-NEXT:    retq
 ;
+; AVX10_2-LABEL: test_fmaximum_vector_signed_zero_first:
+; AVX10_2:       # %bb.0:
+; AVX10_2-NEXT:    vminmaxps $1, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
+; AVX10_2-NEXT:    retq
+;
 ; X86-LABEL: test_fmaximum_vector_signed_zero_first:
 ; X86:       # %bb.0:
 ; X86-NEXT:    vbroadcastss {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
@@ -1314,6 +1484,12 @@ define <4 x float> @test_fmaximum_vector_zero(<4 x float> %x) {
 ; AVX-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
 ; AVX-NEXT:    retq
 ;
+; AVX10_2-LABEL: test_fmaximum_vector_zero:
+; AVX10_2:       # %bb.0:
+; AVX10_2-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
+; AVX10_2-NEXT:    vminmaxps $1, %xmm1, %xmm0, %xmm0
+; AVX10_2-NEXT:    retq
+;
 ; X86-LABEL: test_fmaximum_vector_zero:
 ; X86:       # %bb.0:
 ; X86-NEXT:    vxorps %xmm1, %xmm1, %xmm1
@@ -1369,6 +1545,12 @@ define <4 x float> @test_fmaximum_v4f32_splat(<4 x float> %x, float %y) {
 ; AVX512-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
 ; AVX512-NEXT:    retq
 ;
+; AVX10_2-LABEL: test_fmaximum_v4f32_splat:
+; AVX10_2:       # %bb.0:
+; AVX10_2-NEXT:    vbroadcastss %xmm1, %xmm1
+; AVX10_2-NEXT:    vminmaxps $1, %xmm1, %xmm0, %xmm0
+; AVX10_2-NEXT:    retq
+;
 ; X86-LABEL: test_fmaximum_v4f32_splat:
 ; X86:       # %bb.0:
 ; X86-NEXT:    vbroadcastss {{[0-9]+}}(%esp), %xmm1
@@ -1803,6 +1985,11 @@ define <4 x half> @test_fmaximum_v4f16(<4 x half> %x, <4 x half> %y) nounwind {
 ; AVX512-NEXT:    popq %rbp
 ; AVX512-NEXT:    retq
 ;
+; AVX10_2-LABEL: test_fmaximum_v4f16:
+; AVX10_2:       # %bb.0:
+; AVX10_2-NEXT:    vminmaxph $1, %xmm1, %xmm0, %xmm0
+; AVX10_2-NEXT:    retq
+;
 ; X86-LABEL: test_fmaximum_v4f16:
 ; X86:       # %bb.0:
 ; X86-NEXT:    subl $164, %esp
@@ -2330,6 +2517,11 @@ define <4 x bfloat> @test_fmaximum_v4bf16(<4 x bfloat> %x, <4 x bfloat> %y) {
 ; AVX512-NEXT:    .cfi_def_cfa_offset 8
 ; AVX512-NEXT:    retq
 ;
+; AVX10_2-LABEL: test_fmaximum_v4bf16:
+; AVX10_2:       # %bb.0:
+; AVX10_2-NEXT:    vminmaxnepbf16 $1, %xmm1, %xmm0, %xmm0
+; AVX10_2-NEXT:    retq
+;
 ; X86-LABEL: test_fmaximum_v4bf16:
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %ebp
diff --git a/llvm/test/CodeGen/X86/fminimumnum-fmaximumnum.ll b/llvm/test/CodeGen/X86/fminimumnum-fmaximumnum.ll
new file mode 100644
index 0000000..2e9e8e6
--- /dev/null
+++ b/llvm/test/CodeGen/X86/fminimumnum-fmaximumnum.ll
@@ -0,0 +1,2765 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2     | FileCheck %s --check-prefixes=SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx      | FileCheck %s --check-prefixes=AVX,AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f  | FileCheck %s --check-prefixes=AVX,AVX512,AVX512F
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq | FileCheck %s --check-prefixes=AVX,AVX512,AVX512DQ
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx10.2-256 | FileCheck %s --check-prefixes=AVX10_2
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx        | FileCheck %s --check-prefixes=X86
+
+declare float @llvm.maximumnum.f32(float, float)
+declare double @llvm.maximumnum.f64(double, double)
+declare float @llvm.minimumnum.f32(float, float)
+declare double @llvm.minimumnum.f64(double, double)
+declare <2 x double> @llvm.minimumnum.v2f64(<2 x double>, <2 x double>)
+declare <4 x float> @llvm.maximumnum.v4f32(<4 x float>, <4 x float>)
+declare <4 x half> @llvm.maximumnum.v4f16(<4 x half>, <4 x half>)
+declare <4 x bfloat> @llvm.maximumnum.v4bf16(<4 x bfloat>, <4 x bfloat>)
+
+;
+; fmaximumnum
+;
+
+define float @test_fmaximumnum(float %x, float %y) nounwind {
+; SSE2-LABEL: test_fmaximumnum:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    movd %xmm0, %eax
+; SSE2-NEXT:    testl %eax, %eax
+; SSE2-NEXT:    movdqa %xmm0, %xmm3
+; SSE2-NEXT:    js .LBB0_2
+; SSE2-NEXT:  # %bb.1:
+; SSE2-NEXT:    movdqa %xmm1, %xmm3
+; SSE2-NEXT:  .LBB0_2:
+; SSE2-NEXT:    movdqa %xmm3, %xmm0
+; SSE2-NEXT:    cmpordss %xmm3, %xmm0
+; SSE2-NEXT:    movaps %xmm0, %xmm4
+; SSE2-NEXT:    andps %xmm3, %xmm4
+; SSE2-NEXT:    js .LBB0_4
+; SSE2-NEXT:  # %bb.3:
+; SSE2-NEXT:    movdqa %xmm2, %xmm1
+; SSE2-NEXT:  .LBB0_4:
+; SSE2-NEXT:    maxss %xmm1, %xmm3
+; SSE2-NEXT:    andnps %xmm3, %xmm0
+; SSE2-NEXT:    orps %xmm4, %xmm0
+; SSE2-NEXT:    retq
+;
+; AVX1-LABEL: test_fmaximumnum:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vmovd %xmm0, %eax
+; AVX1-NEXT:    testl %eax, %eax
+; AVX1-NEXT:    js .LBB0_1
+; AVX1-NEXT:  # %bb.2:
+; AVX1-NEXT:    vmovdqa %xmm0, %xmm2
+; AVX1-NEXT:    jmp .LBB0_3
+; AVX1-NEXT:  .LBB0_1:
+; AVX1-NEXT:    vmovdqa %xmm1, %xmm2
+; AVX1-NEXT:    vmovdqa %xmm0, %xmm1
+; AVX1-NEXT:  .LBB0_3:
+; AVX1-NEXT:    vmaxss %xmm2, %xmm1, %xmm0
+; AVX1-NEXT:    vcmpordss %xmm1, %xmm1, %xmm2
+; AVX1-NEXT:    vblendvps %xmm2, %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX512-LABEL: test_fmaximumnum:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vmovd %xmm0, %eax
+; AVX512-NEXT:    testl %eax, %eax
+; AVX512-NEXT:    sets %al
+; AVX512-NEXT:    kmovw %eax, %k1
+; AVX512-NEXT:    vmovdqa %xmm0, %xmm2
+; AVX512-NEXT:    vmovss %xmm1, %xmm2, %xmm2 {%k1}
+; AVX512-NEXT:    vmovss %xmm0, %xmm1, %xmm1 {%k1}
+; AVX512-NEXT:    vmaxss %xmm2, %xmm1, %xmm0
+; AVX512-NEXT:    vcmpordss %xmm1, %xmm1, %k1
+; AVX512-NEXT:    vmovss %xmm1, %xmm0, %xmm0 {%k1}
+; AVX512-NEXT:    retq
+;
+; AVX10_2-LABEL: test_fmaximumnum:
+; AVX10_2:       # %bb.0:
+; AVX10_2-NEXT:    vminmaxss $17, %xmm1, %xmm0
+; AVX10_2-NEXT:    retq
+;
+; X86-LABEL: test_fmaximumnum:
+; X86:       # %bb.0:
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-NEXT:    vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; X86-NEXT:    vmovd %xmm2, %eax
+; X86-NEXT:    testl %eax, %eax
+; X86-NEXT:    js .LBB0_1
+; X86-NEXT:  # %bb.2:
+; X86-NEXT:    vmovdqa %xmm2, %xmm1
+; X86-NEXT:    jmp .LBB0_3
+; X86-NEXT:  .LBB0_1:
+; X86-NEXT:    vmovdqa %xmm0, %xmm1
+; X86-NEXT:    vmovdqa %xmm2, %xmm0
+; X86-NEXT:  .LBB0_3:
+; X86-NEXT:    vmaxss %xmm1, %xmm0, %xmm1
+; X86-NEXT:    vcmpordss %xmm0, %xmm0, %xmm2
+; X86-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; X86-NEXT:    vmovss %xmm0, (%esp)
+; X86-NEXT:    flds (%esp)
+; X86-NEXT:    popl %eax
+; X86-NEXT:    retl
+  %1 = tail call float @llvm.maximumnum.f32(float %x, float %y)
+  ret float %1
+}
+
+define <4 x float> @test_fmaximumnum_scalarize(<4 x float> %x, <4 x float> %y) "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" {
+; SSE2-LABEL: test_fmaximumnum_scalarize:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    maxps %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; AVX-LABEL: test_fmaximumnum_scalarize:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vmaxps %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
+;
+; AVX10_2-LABEL: test_fmaximumnum_scalarize:
+; AVX10_2:       # %bb.0:
+; AVX10_2-NEXT:    vminmaxps $17, %xmm1, %xmm0, %xmm0
+; AVX10_2-NEXT:    retq
+;
+; X86-LABEL: test_fmaximumnum_scalarize:
+; X86:       # %bb.0:
+; X86-NEXT:    vmaxps %xmm1, %xmm0, %xmm0
+; X86-NEXT:    retl
+  %r = call <4 x float> @llvm.maximumnum.v4f32(<4 x float> %x, <4 x float> %y)
+  ret <4 x float> %r
+}
+
+define float @test_fmaximumnum_nan0(float %x, float %y) {
+; SSE2-LABEL: test_fmaximumnum_nan0:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movaps %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; AVX-LABEL: test_fmaximumnum_nan0:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vmovaps %xmm1, %xmm0
+; AVX-NEXT:    retq
+;
+; AVX10_2-LABEL: test_fmaximumnum_nan0:
+; AVX10_2:       # %bb.0:
+; AVX10_2-NEXT:    vmovaps %xmm1, %xmm0
+; AVX10_2-NEXT:    retq
+;
+; X86-LABEL: test_fmaximumnum_nan0:
+; X86:       # %bb.0:
+; X86-NEXT:    flds {{[0-9]+}}(%esp)
+; X86-NEXT:    retl
+  %1 = tail call float @llvm.maximumnum.f32(float 0x7fff000000000000, float %y)
+  ret float %1
+}
+
+define float @test_fmaximumnum_nan1(float %x, float %y) {
+; SSE2-LABEL: test_fmaximumnum_nan1:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    retq
+;
+; AVX-LABEL: test_fmaximumnum_nan1:
+; AVX:       # %bb.0:
+; AVX-NEXT:    retq
+;
+; AVX10_2-LABEL: test_fmaximumnum_nan1:
+; AVX10_2:       # %bb.0:
+; AVX10_2-NEXT:    retq
+;
+; X86-LABEL: test_fmaximumnum_nan1:
+; X86:       # %bb.0:
+; X86-NEXT:    flds {{[0-9]+}}(%esp)
+; X86-NEXT:    retl
+  %1 = tail call float @llvm.maximumnum.f32(float %x, float 0x7fff000000000000)
+  ret float %1
+}
+
+define float @test_fmaximumnum_nnan(float %x, float %y) nounwind {
+; SSE2-LABEL: test_fmaximumnum_nnan:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movaps %xmm0, %xmm2
+; SSE2-NEXT:    addss %xmm1, %xmm2
+; SSE2-NEXT:    subss %xmm1, %xmm0
+; SSE2-NEXT:    movd %xmm2, %eax
+; SSE2-NEXT:    testl %eax, %eax
+; SSE2-NEXT:    js .LBB4_1
+; SSE2-NEXT:  # %bb.2:
+; SSE2-NEXT:    maxss %xmm2, %xmm0
+; SSE2-NEXT:    retq
+; SSE2-NEXT:  .LBB4_1:
+; SSE2-NEXT:    movaps %xmm0, %xmm1
+; SSE2-NEXT:    movaps %xmm2, %xmm0
+; SSE2-NEXT:    maxss %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; AVX1-LABEL: test_fmaximumnum_nnan:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vaddss %xmm1, %xmm0, %xmm2
+; AVX1-NEXT:    vsubss %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vmovd %xmm2, %eax
+; AVX1-NEXT:    testl %eax, %eax
+; AVX1-NEXT:    js .LBB4_1
+; AVX1-NEXT:  # %bb.2:
+; AVX1-NEXT:    vmaxss %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    retq
+; AVX1-NEXT:  .LBB4_1:
+; AVX1-NEXT:    vmovaps %xmm0, %xmm1
+; AVX1-NEXT:    vmaxss %xmm1, %xmm2, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX512F-LABEL: test_fmaximumnum_nnan:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vaddss %xmm1, %xmm0, %xmm2
+; AVX512F-NEXT:    vsubss %xmm1, %xmm0, %xmm0
+; AVX512F-NEXT:    vmovd %xmm2, %eax
+; AVX512F-NEXT:    testl %eax, %eax
+; AVX512F-NEXT:    sets %al
+; AVX512F-NEXT:    kmovw %eax, %k1
+; AVX512F-NEXT:    vmovaps %xmm2, %xmm1
+; AVX512F-NEXT:    vmovss %xmm0, %xmm1, %xmm1 {%k1}
+; AVX512F-NEXT:    vmovss %xmm2, %xmm0, %xmm0 {%k1}
+; AVX512F-NEXT:    vmaxss %xmm1, %xmm0, %xmm0
+; AVX512F-NEXT:    retq
+;
+; AVX512DQ-LABEL: test_fmaximumnum_nnan:
+; AVX512DQ:       # %bb.0:
+; AVX512DQ-NEXT:    vaddss %xmm1, %xmm0, %xmm2
+; AVX512DQ-NEXT:    vsubss %xmm1, %xmm0, %xmm0
+; AVX512DQ-NEXT:    vfpclassss $3, %xmm0, %k0 # k0 = isQuietNaN(xmm0) | isPositiveZero(xmm0)
+; AVX512DQ-NEXT:    kmovw %k0, %k1
+; AVX512DQ-NEXT:    vmovaps %xmm2, %xmm1
+; AVX512DQ-NEXT:    vmovss %xmm0, %xmm1, %xmm1 {%k1}
+; AVX512DQ-NEXT:    vmovss %xmm2, %xmm0, %xmm0 {%k1}
+; AVX512DQ-NEXT:    vmaxss %xmm1, %xmm0, %xmm0
+; AVX512DQ-NEXT:    retq
+;
+; AVX10_2-LABEL: test_fmaximumnum_nnan:
+; AVX10_2:       # %bb.0:
+; AVX10_2-NEXT:    vaddss %xmm1, %xmm0, %xmm2
+; AVX10_2-NEXT:    vsubss %xmm1, %xmm0, %xmm0
+; AVX10_2-NEXT:    vminmaxss $17, %xmm0, %xmm2
+; AVX10_2-NEXT:    retq
+;
+; X86-LABEL: test_fmaximumnum_nnan:
+; X86:       # %bb.0:
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; X86-NEXT:    vaddss %xmm0, %xmm2, %xmm1
+; X86-NEXT:    vsubss %xmm0, %xmm2, %xmm0
+; X86-NEXT:    vmovd %xmm1, %eax
+; X86-NEXT:    testl %eax, %eax
+; X86-NEXT:    js .LBB4_1
+; X86-NEXT:  # %bb.2:
+; X86-NEXT:    vmovaps %xmm1, %xmm2
+; X86-NEXT:    jmp .LBB4_3
+; X86-NEXT:  .LBB4_1:
+; X86-NEXT:    vmovaps %xmm0, %xmm2
+; X86-NEXT:    vmovaps %xmm1, %xmm0
+; X86-NEXT:  .LBB4_3:
+; X86-NEXT:    vmaxss %xmm2, %xmm0, %xmm0
+; X86-NEXT:    vmovss %xmm0, (%esp)
+; X86-NEXT:    flds (%esp)
+; X86-NEXT:    popl %eax
+; X86-NEXT:    retl
+  %1 = fadd nnan float %x, %y
+  %2 = fsub nnan float %x, %y
+  %3 = tail call float @llvm.maximumnum.f32(float %1, float %2)
+  ret float %3
+}
+
+define double @test_fmaximumnum_zero0(double %x, double %y) nounwind {
+; SSE2-LABEL: test_fmaximumnum_zero0:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movapd %xmm1, %xmm0
+; SSE2-NEXT:    cmpordsd %xmm1, %xmm0
+; SSE2-NEXT:    movapd %xmm0, %xmm2
+; SSE2-NEXT:    andpd %xmm1, %xmm2
+; SSE2-NEXT:    xorpd %xmm3, %xmm3
+; SSE2-NEXT:    maxsd %xmm3, %xmm1
+; SSE2-NEXT:    andnpd %xmm1, %xmm0
+; SSE2-NEXT:    orpd %xmm2, %xmm0
+; SSE2-NEXT:    retq
+;
+; AVX1-LABEL: test_fmaximumnum_zero0:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vxorpd %xmm0, %xmm0, %xmm0
+; AVX1-NEXT:    vmaxsd %xmm0, %xmm1, %xmm0
+; AVX1-NEXT:    vcmpordsd %xmm1, %xmm1, %xmm2
+; AVX1-NEXT:    vblendvpd %xmm2, %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX512-LABEL: test_fmaximumnum_zero0:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vxorpd %xmm0, %xmm0, %xmm0
+; AVX512-NEXT:    vmaxsd %xmm0, %xmm1, %xmm0
+; AVX512-NEXT:    vcmpordsd %xmm1, %xmm1, %k1
+; AVX512-NEXT:    vmovsd %xmm1, %xmm0, %xmm0 {%k1}
+; AVX512-NEXT:    retq
+;
+; AVX10_2-LABEL: test_fmaximumnum_zero0:
+; AVX10_2:       # %bb.0:
+; AVX10_2-NEXT:    vxorpd %xmm0, %xmm0, %xmm0
+; AVX10_2-NEXT:    vminmaxsd $17, %xmm0, %xmm1
+; AVX10_2-NEXT:    retq
+;
+; X86-LABEL: test_fmaximumnum_zero0:
+; X86:       # %bb.0:
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    movl %esp, %ebp
+; X86-NEXT:    andl $-8, %esp
+; X86-NEXT:    subl $8, %esp
+; X86-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
+; X86-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
+; X86-NEXT:    vmaxsd %xmm1, %xmm0, %xmm1
+; X86-NEXT:    vcmpordsd %xmm0, %xmm0, %xmm2
+; X86-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; X86-NEXT:    vmovlpd %xmm0, (%esp)
+; X86-NEXT:    fldl (%esp)
+; X86-NEXT:    movl %ebp, %esp
+; X86-NEXT:    popl %ebp
+; X86-NEXT:    retl
+  %1 = tail call double @llvm.maximumnum.f64(double 0.0, double %y)
+  ret double %1
+}
+
+define double @test_fmaximumnum_zero1(double %x, double %y) nounwind {
+; SSE2-LABEL: test_fmaximumnum_zero1:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movapd %xmm0, %xmm1
+; SSE2-NEXT:    cmpordsd %xmm0, %xmm1
+; SSE2-NEXT:    movapd %xmm1, %xmm2
+; SSE2-NEXT:    andpd %xmm0, %xmm2
+; SSE2-NEXT:    xorpd %xmm3, %xmm3
+; SSE2-NEXT:    maxsd %xmm3, %xmm0
+; SSE2-NEXT:    andnpd %xmm0, %xmm1
+; SSE2-NEXT:    orpd %xmm2, %xmm1
+; SSE2-NEXT:    movapd %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; AVX1-LABEL: test_fmaximumnum_zero1:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vmaxsd %xmm1, %xmm0, %xmm1
+; AVX1-NEXT:    vcmpordsd %xmm0, %xmm0, %xmm2
+; AVX1-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX512-LABEL: test_fmaximumnum_zero1:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
+; AVX512-NEXT:    vmaxsd %xmm1, %xmm0, %xmm1
+; AVX512-NEXT:    vcmpordsd %xmm0, %xmm0, %k1
+; AVX512-NEXT:    vmovsd %xmm0, %xmm1, %xmm1 {%k1}
+; AVX512-NEXT:    vmovapd %xmm1, %xmm0
+; AVX512-NEXT:    retq
+;
+; AVX10_2-LABEL: test_fmaximumnum_zero1:
+; AVX10_2:       # %bb.0:
+; AVX10_2-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
+; AVX10_2-NEXT:    vminmaxsd $17, %xmm1, %xmm0
+; AVX10_2-NEXT:    retq
+;
+; X86-LABEL: test_fmaximumnum_zero1:
+; X86:       # %bb.0:
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    movl %esp, %ebp
+; X86-NEXT:    andl $-8, %esp
+; X86-NEXT:    subl $8, %esp
+; X86-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
+; X86-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
+; X86-NEXT:    vmaxsd %xmm1, %xmm0, %xmm1
+; X86-NEXT:    vcmpordsd %xmm0, %xmm0, %xmm2
+; X86-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; X86-NEXT:    vmovlpd %xmm0, (%esp)
+; X86-NEXT:    fldl (%esp)
+; X86-NEXT:    movl %ebp, %esp
+; X86-NEXT:    popl %ebp
+; X86-NEXT:    retl
+  %1 = tail call double @llvm.maximumnum.f64(double %x, double 0.0)
+  ret double %1
+}
+
+define double @test_fmaximumnum_zero2(double %x, double %y) {
+; SSE2-LABEL: test_fmaximumnum_zero2:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    xorps %xmm0, %xmm0
+; SSE2-NEXT:    retq
+;
+; AVX-LABEL: test_fmaximumnum_zero2:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
+; AVX-NEXT:    retq
+;
+; AVX10_2-LABEL: test_fmaximumnum_zero2:
+; AVX10_2:       # %bb.0:
+; AVX10_2-NEXT:    vxorps %xmm0, %xmm0, %xmm0
+; AVX10_2-NEXT:    retq
+;
+; X86-LABEL: test_fmaximumnum_zero2:
+; X86:       # %bb.0:
+; X86-NEXT:    fldz
+; X86-NEXT:    retl
+  %1 = tail call double @llvm.maximumnum.f64(double 0.0, double -0.0)
+  ret double %1
+}
+
+define float @test_fmaximumnum_nsz(float %x, float %y) "no-signed-zeros-fp-math"="true" nounwind {
+; SSE2-LABEL: test_fmaximumnum_nsz:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movaps %xmm0, %xmm2
+; SSE2-NEXT:    cmpordss %xmm0, %xmm2
+; SSE2-NEXT:    movaps %xmm2, %xmm3
+; SSE2-NEXT:    andps %xmm0, %xmm3
+; SSE2-NEXT:    maxss %xmm1, %xmm0
+; SSE2-NEXT:    andnps %xmm0, %xmm2
+; SSE2-NEXT:    orps %xmm3, %xmm2
+; SSE2-NEXT:    movaps %xmm2, %xmm0
+; SSE2-NEXT:    retq
+;
+; AVX1-LABEL: test_fmaximumnum_nsz:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vmaxss %xmm1, %xmm0, %xmm1
+; AVX1-NEXT:    vcmpordss %xmm0, %xmm0, %xmm2
+; AVX1-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX512-LABEL: test_fmaximumnum_nsz:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vmaxss %xmm1, %xmm0, %xmm1
+; AVX512-NEXT:    vcmpordss %xmm0, %xmm0, %k1
+; AVX512-NEXT:    vmovss %xmm0, %xmm1, %xmm1 {%k1}
+; AVX512-NEXT:    vmovaps %xmm1, %xmm0
+; AVX512-NEXT:    retq
+;
+; AVX10_2-LABEL: test_fmaximumnum_nsz:
+; AVX10_2:       # %bb.0:
+; AVX10_2-NEXT:    vminmaxss $17, %xmm1, %xmm0
+; AVX10_2-NEXT:    retq
+;
+; X86-LABEL: test_fmaximumnum_nsz:
+; X86:       # %bb.0:
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-NEXT:    vcmpordss %xmm0, %xmm0, %xmm1
+; X86-NEXT:    vmaxss {{[0-9]+}}(%esp), %xmm0, %xmm2
+; X86-NEXT:    vblendvps %xmm1, %xmm0, %xmm2, %xmm0
+; X86-NEXT:    vmovss %xmm0, (%esp)
+; X86-NEXT:    flds (%esp)
+; X86-NEXT:    popl %eax
+; X86-NEXT:    retl
+  %1 = tail call float @llvm.maximumnum.f32(float %x, float %y)
+  ret float %1
+}
+
+define float @test_fmaximumnum_combine_cmps(float %x, float %y) nounwind {
+; SSE2-LABEL: test_fmaximumnum_combine_cmps:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    divss %xmm0, %xmm1
+; SSE2-NEXT:    movd %xmm0, %eax
+; SSE2-NEXT:    testl %eax, %eax
+; SSE2-NEXT:    movaps %xmm0, %xmm3
+; SSE2-NEXT:    js .LBB9_2
+; SSE2-NEXT:  # %bb.1:
+; SSE2-NEXT:    movaps %xmm1, %xmm3
+; SSE2-NEXT:  .LBB9_2:
+; SSE2-NEXT:    movaps %xmm3, %xmm2
+; SSE2-NEXT:    cmpordss %xmm3, %xmm2
+; SSE2-NEXT:    movaps %xmm2, %xmm4
+; SSE2-NEXT:    andps %xmm3, %xmm4
+; SSE2-NEXT:    js .LBB9_4
+; SSE2-NEXT:  # %bb.3:
+; SSE2-NEXT:    movaps %xmm0, %xmm1
+; SSE2-NEXT:  .LBB9_4:
+; SSE2-NEXT:    maxss %xmm1, %xmm3
+; SSE2-NEXT:    andnps %xmm3, %xmm2
+; SSE2-NEXT:    orps %xmm4, %xmm2
+; SSE2-NEXT:    movaps %xmm2, %xmm0
+; SSE2-NEXT:    retq
+;
+; AVX1-LABEL: test_fmaximumnum_combine_cmps:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vdivss %xmm0, %xmm1, %xmm1
+; AVX1-NEXT:    vmovd %xmm0, %eax
+; AVX1-NEXT:    testl %eax, %eax
+; AVX1-NEXT:    js .LBB9_1
+; AVX1-NEXT:  # %bb.2:
+; AVX1-NEXT:    vmovaps %xmm0, %xmm2
+; AVX1-NEXT:    jmp .LBB9_3
+; AVX1-NEXT:  .LBB9_1:
+; AVX1-NEXT:    vmovaps %xmm1, %xmm2
+; AVX1-NEXT:    vmovaps %xmm0, %xmm1
+; AVX1-NEXT:  .LBB9_3:
+; AVX1-NEXT:    vmaxss %xmm2, %xmm1, %xmm0
+; AVX1-NEXT:    vcmpordss %xmm1, %xmm1, %xmm2
+; AVX1-NEXT:    vblendvps %xmm2, %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX512F-LABEL: test_fmaximumnum_combine_cmps:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vdivss %xmm0, %xmm1, %xmm1
+; AVX512F-NEXT:    vmovd %xmm0, %eax
+; AVX512F-NEXT:    testl %eax, %eax
+; AVX512F-NEXT:    sets %al
+; AVX512F-NEXT:    kmovw %eax, %k1
+; AVX512F-NEXT:    vmovaps %xmm0, %xmm2
+; AVX512F-NEXT:    vmovss %xmm1, %xmm2, %xmm2 {%k1}
+; AVX512F-NEXT:    vmovss %xmm0, %xmm1, %xmm1 {%k1}
+; AVX512F-NEXT:    vmaxss %xmm2, %xmm1, %xmm0
+; AVX512F-NEXT:    vcmpordss %xmm1, %xmm1, %k1
+; AVX512F-NEXT:    vmovss %xmm1, %xmm0, %xmm0 {%k1}
+; AVX512F-NEXT:    retq
+;
+; AVX512DQ-LABEL: test_fmaximumnum_combine_cmps:
+; AVX512DQ:       # %bb.0:
+; AVX512DQ-NEXT:    vdivss %xmm0, %xmm1, %xmm1
+; AVX512DQ-NEXT:    vfpclassss $3, %xmm0, %k0 # k0 = isQuietNaN(xmm0) | isPositiveZero(xmm0)
+; AVX512DQ-NEXT:    kmovw %k0, %k1
+; AVX512DQ-NEXT:    vmovaps %xmm1, %xmm2
+; AVX512DQ-NEXT:    vmovss %xmm0, %xmm2, %xmm2 {%k1}
+; AVX512DQ-NEXT:    vmovss %xmm1, %xmm0, %xmm0 {%k1}
+; AVX512DQ-NEXT:    vmaxss %xmm2, %xmm0, %xmm0
+; AVX512DQ-NEXT:    retq
+;
+; AVX10_2-LABEL: test_fmaximumnum_combine_cmps:
+; AVX10_2:       # %bb.0:
+; AVX10_2-NEXT:    vdivss %xmm0, %xmm1, %xmm1
+; AVX10_2-NEXT:    vminmaxss $17, %xmm1, %xmm0
+; AVX10_2-NEXT:    retq
+;
+; X86-LABEL: test_fmaximumnum_combine_cmps:
+; X86:       # %bb.0:
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X86-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-NEXT:    vdivss %xmm1, %xmm0, %xmm0
+; X86-NEXT:    vmovd %xmm1, %eax
+; X86-NEXT:    testl %eax, %eax
+; X86-NEXT:    js .LBB9_1
+; X86-NEXT:  # %bb.2:
+; X86-NEXT:    vmovaps %xmm1, %xmm2
+; X86-NEXT:    jmp .LBB9_3
+; X86-NEXT:  .LBB9_1:
+; X86-NEXT:    vmovaps %xmm0, %xmm2
+; X86-NEXT:    vmovaps %xmm1, %xmm0
+; X86-NEXT:  .LBB9_3:
+; X86-NEXT:    vmaxss %xmm2, %xmm0, %xmm1
+; X86-NEXT:    vcmpordss %xmm0, %xmm0, %xmm2
+; X86-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; X86-NEXT:    vmovss %xmm0, (%esp)
+; X86-NEXT:    flds (%esp)
+; X86-NEXT:    popl %eax
+; X86-NEXT:    retl
+  %1 = fdiv nnan float %y, %x
+  %2 = tail call float @llvm.maximumnum.f32(float %x, float %1)
+  ret float %2
+}
+
+;
+; fminimumnum
+;
+
+define float @test_fminimumnum(float %x, float %y) nounwind {
+; SSE2-LABEL: test_fminimumnum:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movd %xmm0, %eax
+; SSE2-NEXT:    testl %eax, %eax
+; SSE2-NEXT:    movdqa %xmm1, %xmm3
+; SSE2-NEXT:    js .LBB10_2
+; SSE2-NEXT:  # %bb.1:
+; SSE2-NEXT:    movdqa %xmm0, %xmm3
+; SSE2-NEXT:  .LBB10_2:
+; SSE2-NEXT:    movdqa %xmm3, %xmm2
+; SSE2-NEXT:    cmpordss %xmm3, %xmm2
+; SSE2-NEXT:    movaps %xmm2, %xmm4
+; SSE2-NEXT:    andps %xmm3, %xmm4
+; SSE2-NEXT:    js .LBB10_4
+; SSE2-NEXT:  # %bb.3:
+; SSE2-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NEXT:  .LBB10_4:
+; SSE2-NEXT:    minss %xmm0, %xmm3
+; SSE2-NEXT:    andnps %xmm3, %xmm2
+; SSE2-NEXT:    orps %xmm4, %xmm2
+; SSE2-NEXT:    movaps %xmm2, %xmm0
+; SSE2-NEXT:    retq
+;
+; AVX1-LABEL: test_fminimumnum:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vmovd %xmm0, %eax
+; AVX1-NEXT:    testl %eax, %eax
+; AVX1-NEXT:    js .LBB10_1
+; AVX1-NEXT:  # %bb.2:
+; AVX1-NEXT:    vmovdqa %xmm1, %xmm2
+; AVX1-NEXT:    jmp .LBB10_3
+; AVX1-NEXT:  .LBB10_1:
+; AVX1-NEXT:    vmovdqa %xmm0, %xmm2
+; AVX1-NEXT:    vmovdqa %xmm1, %xmm0
+; AVX1-NEXT:  .LBB10_3:
+; AVX1-NEXT:    vminss %xmm2, %xmm0, %xmm1
+; AVX1-NEXT:    vcmpordss %xmm0, %xmm0, %xmm2
+; AVX1-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX512-LABEL: test_fminimumnum:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vmovd %xmm0, %eax
+; AVX512-NEXT:    testl %eax, %eax
+; AVX512-NEXT:    sets %al
+; AVX512-NEXT:    kmovw %eax, %k1
+; AVX512-NEXT:    vmovaps %xmm1, %xmm2
+; AVX512-NEXT:    vmovss %xmm0, %xmm2, %xmm2 {%k1}
+; AVX512-NEXT:    vmovss %xmm1, %xmm0, %xmm0 {%k1}
+; AVX512-NEXT:    vminss %xmm2, %xmm0, %xmm1
+; AVX512-NEXT:    vcmpordss %xmm0, %xmm0, %k1
+; AVX512-NEXT:    vmovss %xmm0, %xmm1, %xmm1 {%k1}
+; AVX512-NEXT:    vmovaps %xmm1, %xmm0
+; AVX512-NEXT:    retq
+;
+; AVX10_2-LABEL: test_fminimumnum:
+; AVX10_2:       # %bb.0:
+; AVX10_2-NEXT:    vminmaxss $16, %xmm1, %xmm0
+; AVX10_2-NEXT:    retq
+;
+; X86-LABEL: test_fminimumnum:
+; X86:       # %bb.0:
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X86-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-NEXT:    vmovd %xmm0, %eax
+; X86-NEXT:    testl %eax, %eax
+; X86-NEXT:    js .LBB10_1
+; X86-NEXT:  # %bb.2:
+; X86-NEXT:    vmovdqa %xmm1, %xmm2
+; X86-NEXT:    jmp .LBB10_3
+; X86-NEXT:  .LBB10_1:
+; X86-NEXT:    vmovdqa %xmm0, %xmm2
+; X86-NEXT:    vmovdqa %xmm1, %xmm0
+; X86-NEXT:  .LBB10_3:
+; X86-NEXT:    vminss %xmm2, %xmm0, %xmm1
+; X86-NEXT:    vcmpordss %xmm0, %xmm0, %xmm2
+; X86-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; X86-NEXT:    vmovss %xmm0, (%esp)
+; X86-NEXT:    flds (%esp)
+; X86-NEXT:    popl %eax
+; X86-NEXT:    retl
+  %1 = tail call float @llvm.minimumnum.f32(float %x, float %y)
+  ret float %1
+}
+
+define <2 x double> @test_fminimumnum_scalarize(<2 x double> %x, <2 x double> %y) "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" {
+; SSE2-LABEL: test_fminimumnum_scalarize:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    minpd %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; AVX-LABEL: test_fminimumnum_scalarize:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vminpd %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
+;
+; AVX10_2-LABEL: test_fminimumnum_scalarize:
+; AVX10_2:       # %bb.0:
+; AVX10_2-NEXT:    vminmaxpd $16, %xmm1, %xmm0, %xmm0
+; AVX10_2-NEXT:    retq
+;
+; X86-LABEL: test_fminimumnum_scalarize:
+; X86:       # %bb.0:
+; X86-NEXT:    vminpd %xmm1, %xmm0, %xmm0
+; X86-NEXT:    retl
+  %r = call <2 x double> @llvm.minimumnum.v2f64(<2 x double> %x, <2 x double> %y)
+  ret <2 x double> %r
+}
+
+define float @test_fminimumnum_nan0(float %x, float %y) {
+; SSE2-LABEL: test_fminimumnum_nan0:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movaps %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; AVX-LABEL: test_fminimumnum_nan0:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vmovaps %xmm1, %xmm0
+; AVX-NEXT:    retq
+;
+; AVX10_2-LABEL: test_fminimumnum_nan0:
+; AVX10_2:       # %bb.0:
+; AVX10_2-NEXT:    vmovaps %xmm1, %xmm0
+; AVX10_2-NEXT:    retq
+;
+; X86-LABEL: test_fminimumnum_nan0:
+; X86:       # %bb.0:
+; X86-NEXT:    flds {{[0-9]+}}(%esp)
+; X86-NEXT:    retl
+  %1 = tail call float @llvm.minimumnum.f32(float 0x7fff000000000000, float %y)
+  ret float %1
+}
+
+define float @test_fminimumnum_nan1(float %x, float %y) {
+; SSE2-LABEL: test_fminimumnum_nan1:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    retq
+;
+; AVX-LABEL: test_fminimumnum_nan1:
+; AVX:       # %bb.0:
+; AVX-NEXT:    retq
+;
+; AVX10_2-LABEL: test_fminimumnum_nan1:
+; AVX10_2:       # %bb.0:
+; AVX10_2-NEXT:    retq
+;
+; X86-LABEL: test_fminimumnum_nan1:
+; X86:       # %bb.0:
+; X86-NEXT:    flds {{[0-9]+}}(%esp)
+; X86-NEXT:    retl
+  %1 = tail call float @llvm.minimumnum.f32(float %x, float 0x7fff000000000000)
+  ret float %1
+}
+
+define double @test_fminimumnum_nnan(double %x, double %y) "no-nans-fp-math"="true" nounwind {
+; SSE2-LABEL: test_fminimumnum_nnan:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movq %xmm0, %rax
+; SSE2-NEXT:    testq %rax, %rax
+; SSE2-NEXT:    js .LBB14_1
+; SSE2-NEXT:  # %bb.2:
+; SSE2-NEXT:    minsd %xmm1, %xmm0
+; SSE2-NEXT:    retq
+; SSE2-NEXT:  .LBB14_1:
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    movapd %xmm1, %xmm0
+; SSE2-NEXT:    minsd %xmm2, %xmm0
+; SSE2-NEXT:    retq
+;
+; AVX1-LABEL: test_fminimumnum_nnan:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vmovq %xmm0, %rax
+; AVX1-NEXT:    testq %rax, %rax
+; AVX1-NEXT:    js .LBB14_1
+; AVX1-NEXT:  # %bb.2:
+; AVX1-NEXT:    vminsd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    retq
+; AVX1-NEXT:  .LBB14_1:
+; AVX1-NEXT:    vmovdqa %xmm0, %xmm2
+; AVX1-NEXT:    vminsd %xmm2, %xmm1, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX512F-LABEL: test_fminimumnum_nnan:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vmovq %xmm0, %rax
+; AVX512F-NEXT:    testq %rax, %rax
+; AVX512F-NEXT:    sets %al
+; AVX512F-NEXT:    kmovw %eax, %k1
+; AVX512F-NEXT:    vmovapd %xmm1, %xmm2
+; AVX512F-NEXT:    vmovsd %xmm0, %xmm2, %xmm2 {%k1}
+; AVX512F-NEXT:    vmovsd %xmm1, %xmm0, %xmm0 {%k1}
+; AVX512F-NEXT:    vminsd %xmm2, %xmm0, %xmm0
+; AVX512F-NEXT:    retq
+;
+; AVX512DQ-LABEL: test_fminimumnum_nnan:
+; AVX512DQ:       # %bb.0:
+; AVX512DQ-NEXT:    vfpclasssd $5, %xmm1, %k0 # k0 = isQuietNaN(xmm1) | isNegativeZero(xmm1)
+; AVX512DQ-NEXT:    kmovw %k0, %k1
+; AVX512DQ-NEXT:    vmovapd %xmm0, %xmm2
+; AVX512DQ-NEXT:    vmovsd %xmm1, %xmm2, %xmm2 {%k1}
+; AVX512DQ-NEXT:    vmovsd %xmm0, %xmm1, %xmm1 {%k1}
+; AVX512DQ-NEXT:    vminsd %xmm2, %xmm1, %xmm0
+; AVX512DQ-NEXT:    retq
+;
+; AVX10_2-LABEL: test_fminimumnum_nnan:
+; AVX10_2:       # %bb.0:
+; AVX10_2-NEXT:    vminmaxsd $16, %xmm1, %xmm0
+; AVX10_2-NEXT:    retq
+;
+; X86-LABEL: test_fminimumnum_nnan:
+; X86:       # %bb.0:
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    movl %esp, %ebp
+; X86-NEXT:    andl $-8, %esp
+; X86-NEXT:    subl $8, %esp
+; X86-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
+; X86-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
+; X86-NEXT:    vextractps $1, %xmm0, %eax
+; X86-NEXT:    testl %eax, %eax
+; X86-NEXT:    js .LBB14_1
+; X86-NEXT:  # %bb.2:
+; X86-NEXT:    vmovapd %xmm1, %xmm2
+; X86-NEXT:    jmp .LBB14_3
+; X86-NEXT:  .LBB14_1:
+; X86-NEXT:    vmovapd %xmm0, %xmm2
+; X86-NEXT:    vmovapd %xmm1, %xmm0
+; X86-NEXT:  .LBB14_3:
+; X86-NEXT:    vminsd %xmm2, %xmm0, %xmm0
+; X86-NEXT:    vmovsd %xmm0, (%esp)
+; X86-NEXT:    fldl (%esp)
+; X86-NEXT:    movl %ebp, %esp
+; X86-NEXT:    popl %ebp
+; X86-NEXT:    retl
+  %1 = tail call double @llvm.minimumnum.f64(double %x, double %y)
+  ret double %1
+}
+
+define double @test_fminimumnum_zero0(double %x, double %y) nounwind {
+; SSE2-LABEL: test_fminimumnum_zero0:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movapd %xmm1, %xmm0
+; SSE2-NEXT:    cmpordsd %xmm1, %xmm0
+; SSE2-NEXT:    movapd %xmm0, %xmm2
+; SSE2-NEXT:    andpd %xmm1, %xmm2
+; SSE2-NEXT:    minsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE2-NEXT:    andnpd %xmm1, %xmm0
+; SSE2-NEXT:    orpd %xmm2, %xmm0
+; SSE2-NEXT:    retq
+;
+; AVX1-LABEL: test_fminimumnum_zero0:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vcmpordsd %xmm1, %xmm1, %xmm0
+; AVX1-NEXT:    vminsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2
+; AVX1-NEXT:    vblendvpd %xmm0, %xmm1, %xmm2, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX512-LABEL: test_fminimumnum_zero0:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vcmpordsd %xmm1, %xmm1, %k1
+; AVX512-NEXT:    vminsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm0
+; AVX512-NEXT:    vmovsd %xmm1, %xmm0, %xmm0 {%k1}
+; AVX512-NEXT:    retq
+;
+; AVX10_2-LABEL: test_fminimumnum_zero0:
+; AVX10_2:       # %bb.0:
+; AVX10_2-NEXT:    vminmaxsd $16, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; AVX10_2-NEXT:    retq
+;
+; X86-LABEL: test_fminimumnum_zero0:
+; X86:       # %bb.0:
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    movl %esp, %ebp
+; X86-NEXT:    andl $-8, %esp
+; X86-NEXT:    subl $8, %esp
+; X86-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
+; X86-NEXT:    vcmpordsd %xmm0, %xmm0, %xmm1
+; X86-NEXT:    vminsd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm2
+; X86-NEXT:    vblendvpd %xmm1, %xmm0, %xmm2, %xmm0
+; X86-NEXT:    vmovlpd %xmm0, (%esp)
+; X86-NEXT:    fldl (%esp)
+; X86-NEXT:    movl %ebp, %esp
+; X86-NEXT:    popl %ebp
+; X86-NEXT:    retl
+  %1 = tail call double @llvm.minimumnum.f64(double -0.0, double %y)
+  ret double %1
+}
+
+define double @test_fminimumnum_zero1(double %x, double %y) nounwind {
+; SSE2-LABEL: test_fminimumnum_zero1:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movapd %xmm0, %xmm1
+; SSE2-NEXT:    cmpordsd %xmm0, %xmm1
+; SSE2-NEXT:    movapd %xmm1, %xmm2
+; SSE2-NEXT:    andpd %xmm0, %xmm2
+; SSE2-NEXT:    minsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT:    andnpd %xmm0, %xmm1
+; SSE2-NEXT:    orpd %xmm2, %xmm1
+; SSE2-NEXT:    movapd %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; AVX1-LABEL: test_fminimumnum_zero1:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vcmpordsd %xmm0, %xmm0, %xmm1
+; AVX1-NEXT:    vminsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
+; AVX1-NEXT:    vblendvpd %xmm1, %xmm0, %xmm2, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX512-LABEL: test_fminimumnum_zero1:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vcmpordsd %xmm0, %xmm0, %k1
+; AVX512-NEXT:    vminsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
+; AVX512-NEXT:    vmovsd %xmm0, %xmm1, %xmm1 {%k1}
+; AVX512-NEXT:    vmovapd %xmm1, %xmm0
+; AVX512-NEXT:    retq
+;
+; AVX10_2-LABEL: test_fminimumnum_zero1:
+; AVX10_2:       # %bb.0:
+; AVX10_2-NEXT:    vminmaxsd $16, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; AVX10_2-NEXT:    retq
+;
+; X86-LABEL: test_fminimumnum_zero1:
+; X86:       # %bb.0:
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    movl %esp, %ebp
+; X86-NEXT:    andl $-8, %esp
+; X86-NEXT:    subl $8, %esp
+; X86-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
+; X86-NEXT:    vcmpordsd %xmm0, %xmm0, %xmm1
+; X86-NEXT:    vminsd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm2
+; X86-NEXT:    vblendvpd %xmm1, %xmm0, %xmm2, %xmm0
+; X86-NEXT:    vmovlpd %xmm0, (%esp)
+; X86-NEXT:    fldl (%esp)
+; X86-NEXT:    movl %ebp, %esp
+; X86-NEXT:    popl %ebp
+; X86-NEXT:    retl
+  %1 = tail call double @llvm.minimumnum.f64(double %x, double -0.0)
+  ret double %1
+}
+
+define double @test_fminimumnum_zero2(double %x, double %y) {
+; SSE2-LABEL: test_fminimumnum_zero2:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movsd {{.*#+}} xmm0 = [-0.0E+0,0.0E+0]
+; SSE2-NEXT:    retq
+;
+; AVX-LABEL: test_fminimumnum_zero2:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = [-0.0E+0,0.0E+0]
+; AVX-NEXT:    retq
+;
+; AVX10_2-LABEL: test_fminimumnum_zero2:
+; AVX10_2:       # %bb.0:
+; AVX10_2-NEXT:    vmovsd {{.*#+}} xmm0 = [-0.0E+0,0.0E+0]
+; AVX10_2-NEXT:    retq
+;
+; X86-LABEL: test_fminimumnum_zero2:
+; X86:       # %bb.0:
+; X86-NEXT:    fldz
+; X86-NEXT:    fchs
+; X86-NEXT:    retl
+  %1 = tail call double @llvm.minimumnum.f64(double -0.0, double 0.0)
+  ret double %1
+}
+
+define float @test_fminimumnum_nsz(float %x, float %y) nounwind {
+; SSE2-LABEL: test_fminimumnum_nsz:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movaps %xmm0, %xmm2
+; SSE2-NEXT:    cmpordss %xmm0, %xmm2
+; SSE2-NEXT:    movaps %xmm2, %xmm3
+; SSE2-NEXT:    andps %xmm0, %xmm3
+; SSE2-NEXT:    minss %xmm1, %xmm0
+; SSE2-NEXT:    andnps %xmm0, %xmm2
+; SSE2-NEXT:    orps %xmm3, %xmm2
+; SSE2-NEXT:    movaps %xmm2, %xmm0
+; SSE2-NEXT:    retq
+;
+; AVX1-LABEL: test_fminimumnum_nsz:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vminss %xmm1, %xmm0, %xmm1
+; AVX1-NEXT:    vcmpordss %xmm0, %xmm0, %xmm2
+; AVX1-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX512-LABEL: test_fminimumnum_nsz:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vminss %xmm1, %xmm0, %xmm1
+; AVX512-NEXT:    vcmpordss %xmm0, %xmm0, %k1
+; AVX512-NEXT:    vmovss %xmm0, %xmm1, %xmm1 {%k1}
+; AVX512-NEXT:    vmovaps %xmm1, %xmm0
+; AVX512-NEXT:    retq
+;
+; AVX10_2-LABEL: test_fminimumnum_nsz:
+; AVX10_2:       # %bb.0:
+; AVX10_2-NEXT:    vminmaxss $16, %xmm1, %xmm0
+; AVX10_2-NEXT:    retq
+;
+; X86-LABEL: test_fminimumnum_nsz:
+; X86:       # %bb.0:
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-NEXT:    vcmpordss %xmm0, %xmm0, %xmm1
+; X86-NEXT:    vminss {{[0-9]+}}(%esp), %xmm0, %xmm2
+; X86-NEXT:    vblendvps %xmm1, %xmm0, %xmm2, %xmm0
+; X86-NEXT:    vmovss %xmm0, (%esp)
+; X86-NEXT:    flds (%esp)
+; X86-NEXT:    popl %eax
+; X86-NEXT:    retl
+  %1 = tail call nsz float @llvm.minimumnum.f32(float %x, float %y)
+  ret float %1
+}
+
+define float @test_fminimumnum_combine_cmps(float %x, float %y) nounwind {
+; SSE2-LABEL: test_fminimumnum_combine_cmps:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    divss %xmm0, %xmm1
+; SSE2-NEXT:    movd %xmm0, %eax
+; SSE2-NEXT:    testl %eax, %eax
+; SSE2-NEXT:    movaps %xmm1, %xmm3
+; SSE2-NEXT:    js .LBB19_2
+; SSE2-NEXT:  # %bb.1:
+; SSE2-NEXT:    movaps %xmm0, %xmm3
+; SSE2-NEXT:  .LBB19_2:
+; SSE2-NEXT:    movaps %xmm3, %xmm2
+; SSE2-NEXT:    cmpordss %xmm3, %xmm2
+; SSE2-NEXT:    movaps %xmm2, %xmm4
+; SSE2-NEXT:    andps %xmm3, %xmm4
+; SSE2-NEXT:    js .LBB19_4
+; SSE2-NEXT:  # %bb.3:
+; SSE2-NEXT:    movaps %xmm1, %xmm0
+; SSE2-NEXT:  .LBB19_4:
+; SSE2-NEXT:    minss %xmm0, %xmm3
+; SSE2-NEXT:    andnps %xmm3, %xmm2
+; SSE2-NEXT:    orps %xmm4, %xmm2
+; SSE2-NEXT:    movaps %xmm2, %xmm0
+; SSE2-NEXT:    retq
+;
+; AVX1-LABEL: test_fminimumnum_combine_cmps:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vdivss %xmm0, %xmm1, %xmm2
+; AVX1-NEXT:    vmovd %xmm0, %eax
+; AVX1-NEXT:    testl %eax, %eax
+; AVX1-NEXT:    js .LBB19_1
+; AVX1-NEXT:  # %bb.2:
+; AVX1-NEXT:    vmovaps %xmm2, %xmm1
+; AVX1-NEXT:    jmp .LBB19_3
+; AVX1-NEXT:  .LBB19_1:
+; AVX1-NEXT:    vmovaps %xmm0, %xmm1
+; AVX1-NEXT:    vmovaps %xmm2, %xmm0
+; AVX1-NEXT:  .LBB19_3:
+; AVX1-NEXT:    vminss %xmm1, %xmm0, %xmm1
+; AVX1-NEXT:    vcmpordss %xmm0, %xmm0, %xmm2
+; AVX1-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX512F-LABEL: test_fminimumnum_combine_cmps:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vdivss %xmm0, %xmm1, %xmm1
+; AVX512F-NEXT:    vmovd %xmm0, %eax
+; AVX512F-NEXT:    testl %eax, %eax
+; AVX512F-NEXT:    sets %al
+; AVX512F-NEXT:    kmovw %eax, %k1
+; AVX512F-NEXT:    vmovaps %xmm1, %xmm2
+; AVX512F-NEXT:    vmovss %xmm0, %xmm2, %xmm2 {%k1}
+; AVX512F-NEXT:    vmovss %xmm1, %xmm0, %xmm0 {%k1}
+; AVX512F-NEXT:    vminss %xmm2, %xmm0, %xmm1
+; AVX512F-NEXT:    vcmpordss %xmm0, %xmm0, %k1
+; AVX512F-NEXT:    vmovss %xmm0, %xmm1, %xmm1 {%k1}
+; AVX512F-NEXT:    vmovaps %xmm1, %xmm0
+; AVX512F-NEXT:    retq
+;
+; AVX512DQ-LABEL: test_fminimumnum_combine_cmps:
+; AVX512DQ:       # %bb.0:
+; AVX512DQ-NEXT:    vdivss %xmm0, %xmm1, %xmm1
+; AVX512DQ-NEXT:    vfpclassss $5, %xmm0, %k0 # k0 = isQuietNaN(xmm0) | isNegativeZero(xmm0)
+; AVX512DQ-NEXT:    kmovw %k0, %k1
+; AVX512DQ-NEXT:    vmovaps %xmm1, %xmm2
+; AVX512DQ-NEXT:    vmovss %xmm0, %xmm2, %xmm2 {%k1}
+; AVX512DQ-NEXT:    vmovss %xmm1, %xmm0, %xmm0 {%k1}
+; AVX512DQ-NEXT:    vminss %xmm2, %xmm0, %xmm0
+; AVX512DQ-NEXT:    retq
+;
+; AVX10_2-LABEL: test_fminimumnum_combine_cmps:
+; AVX10_2:       # %bb.0:
+; AVX10_2-NEXT:    vdivss %xmm0, %xmm1, %xmm1
+; AVX10_2-NEXT:    vminmaxss $16, %xmm1, %xmm0
+; AVX10_2-NEXT:    retq
+;
+; X86-LABEL: test_fminimumnum_combine_cmps:
+; X86:       # %bb.0:
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X86-NEXT:    vdivss %xmm0, %xmm1, %xmm2
+; X86-NEXT:    vmovd %xmm0, %eax
+; X86-NEXT:    testl %eax, %eax
+; X86-NEXT:    js .LBB19_1
+; X86-NEXT:  # %bb.2:
+; X86-NEXT:    vmovaps %xmm2, %xmm1
+; X86-NEXT:    jmp .LBB19_3
+; X86-NEXT:  .LBB19_1:
+; X86-NEXT:    vmovaps %xmm0, %xmm1
+; X86-NEXT:    vmovaps %xmm2, %xmm0
+; X86-NEXT:  .LBB19_3:
+; X86-NEXT:    vminss %xmm1, %xmm0, %xmm1
+; X86-NEXT:    vcmpordss %xmm0, %xmm0, %xmm2
+; X86-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; X86-NEXT:    vmovss %xmm0, (%esp)
+; X86-NEXT:    flds (%esp)
+; X86-NEXT:    popl %eax
+; X86-NEXT:    retl
+  %1 = fdiv nnan float %y, %x
+  %2 = tail call float @llvm.minimumnum.f32(float %x, float %1)
+  ret float %2
+}
+
+define <2 x double> @test_fminimumnum_vector(<2 x double> %x, <2 x double> %y) {
+; SSE2-LABEL: test_fminimumnum_vector:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movaps %xmm0, %xmm2
+; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[1,1],xmm0[3,3]
+; SSE2-NEXT:    pxor %xmm3, %xmm3
+; SSE2-NEXT:    pcmpgtd %xmm2, %xmm3
+; SSE2-NEXT:    movdqa %xmm3, %xmm2
+; SSE2-NEXT:    pandn %xmm1, %xmm2
+; SSE2-NEXT:    movdqa %xmm3, %xmm4
+; SSE2-NEXT:    pandn %xmm0, %xmm4
+; SSE2-NEXT:    pand %xmm3, %xmm0
+; SSE2-NEXT:    por %xmm2, %xmm0
+; SSE2-NEXT:    pand %xmm1, %xmm3
+; SSE2-NEXT:    por %xmm4, %xmm3
+; SSE2-NEXT:    movdqa %xmm3, %xmm1
+; SSE2-NEXT:    minpd %xmm0, %xmm1
+; SSE2-NEXT:    movdqa %xmm3, %xmm0
+; SSE2-NEXT:    cmpordpd %xmm3, %xmm0
+; SSE2-NEXT:    andpd %xmm0, %xmm3
+; SSE2-NEXT:    andnpd %xmm1, %xmm0
+; SSE2-NEXT:    orpd %xmm3, %xmm0
+; SSE2-NEXT:    retq
+;
+; AVX-LABEL: test_fminimumnum_vector:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vblendvpd %xmm0, %xmm0, %xmm1, %xmm2
+; AVX-NEXT:    vblendvpd %xmm0, %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vminpd %xmm2, %xmm0, %xmm1
+; AVX-NEXT:    vcmpordpd %xmm0, %xmm0, %xmm2
+; AVX-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    retq
+;
+; AVX10_2-LABEL: test_fminimumnum_vector:
+; AVX10_2:       # %bb.0:
+; AVX10_2-NEXT:    vminmaxpd $16, %xmm1, %xmm0, %xmm0
+; AVX10_2-NEXT:    retq
+;
+; X86-LABEL: test_fminimumnum_vector:
+; X86:       # %bb.0:
+; X86-NEXT:    vblendvpd %xmm0, %xmm0, %xmm1, %xmm2
+; X86-NEXT:    vblendvpd %xmm0, %xmm1, %xmm0, %xmm0
+; X86-NEXT:    vminpd %xmm2, %xmm0, %xmm1
+; X86-NEXT:    vcmpordpd %xmm0, %xmm0, %xmm2
+; X86-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; X86-NEXT:    retl
+  %r = call <2 x double> @llvm.minimumnum.v2f64(<2 x double> %x, <2 x double> %y)
+  ret <2 x double> %r
+}
+
+define <4 x float> @test_fmaximumnum_vector(<4 x float> %x, <4 x float> %y) "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" {
+; SSE2-LABEL: test_fmaximumnum_vector:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    maxps %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; AVX-LABEL: test_fmaximumnum_vector:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vmaxps %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
+;
+; AVX10_2-LABEL: test_fmaximumnum_vector:
+; AVX10_2:       # %bb.0:
+; AVX10_2-NEXT:    vminmaxps $17, %xmm1, %xmm0, %xmm0
+; AVX10_2-NEXT:    retq
+;
+; X86-LABEL: test_fmaximumnum_vector:
+; X86:       # %bb.0:
+; X86-NEXT:    vmaxps %xmm1, %xmm0, %xmm0
+; X86-NEXT:    retl
+  %r = call <4 x float> @llvm.maximumnum.v4f32(<4 x float> %x, <4 x float> %y)
+  ret <4 x float> %r
+}
+
+define <2 x double> @test_fminimumnum_vector_zero(<2 x double> %x) {
+; SSE2-LABEL: test_fminimumnum_vector_zero:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    xorpd %xmm1, %xmm1
+; SSE2-NEXT:    minpd %xmm0, %xmm1
+; SSE2-NEXT:    movapd %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; AVX-LABEL: test_fminimumnum_vector_zero:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
+; AVX-NEXT:    vminpd %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    retq
+;
+; AVX10_2-LABEL: test_fminimumnum_vector_zero:
+; AVX10_2:       # %bb.0:
+; AVX10_2-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
+; AVX10_2-NEXT:    vminmaxpd $16, %xmm1, %xmm0, %xmm0
+; AVX10_2-NEXT:    retq
+;
+; X86-LABEL: test_fminimumnum_vector_zero:
+; X86:       # %bb.0:
+; X86-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
+; X86-NEXT:    vminpd %xmm0, %xmm1, %xmm0
+; X86-NEXT:    retl
+  %r = call <2 x double> @llvm.minimumnum.v2f64(<2 x double> %x, <2 x double> <double 0., double 0.>)
+  ret <2 x double> %r
+}
+
+define <4 x float> @test_fmaximumnum_vector_signed_zero(<4 x float> %x) {
+; SSE2-LABEL: test_fmaximumnum_vector_signed_zero:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movaps {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
+; SSE2-NEXT:    maxps %xmm0, %xmm1
+; SSE2-NEXT:    movaps %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; AVX-LABEL: test_fmaximumnum_vector_signed_zero:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vbroadcastss {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
+; AVX-NEXT:    vmaxps %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    retq
+;
+; AVX10_2-LABEL: test_fmaximumnum_vector_signed_zero:
+; AVX10_2:       # %bb.0:
+; AVX10_2-NEXT:    vminmaxps $17, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
+; AVX10_2-NEXT:    retq
+;
+; X86-LABEL: test_fmaximumnum_vector_signed_zero:
+; X86:       # %bb.0:
+; X86-NEXT:    vbroadcastss {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
+; X86-NEXT:    vmaxps %xmm0, %xmm1, %xmm0
+; X86-NEXT:    retl
+  %r = call <4 x float> @llvm.maximumnum.v4f32(<4 x float> %x, <4 x float> <float -0., float -0., float -0., float -0.>)
+  ret <4 x float> %r
+}
+
+define <2 x double> @test_fminimumnum_vector_partially_zero(<2 x double> %x) {
+; SSE2-LABEL: test_fminimumnum_vector_partially_zero:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    xorpd %xmm1, %xmm1
+; SSE2-NEXT:    movhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
+; SSE2-NEXT:    minpd %xmm0, %xmm1
+; SSE2-NEXT:    movapd %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; AVX-LABEL: test_fminimumnum_vector_partially_zero:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
+; AVX-NEXT:    vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
+; AVX-NEXT:    vminpd %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    retq
+;
+; AVX10_2-LABEL: test_fminimumnum_vector_partially_zero:
+; AVX10_2:       # %bb.0:
+; AVX10_2-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
+; AVX10_2-NEXT:    vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
+; AVX10_2-NEXT:    vminmaxpd $16, %xmm1, %xmm0, %xmm0
+; AVX10_2-NEXT:    retq
+;
+; X86-LABEL: test_fminimumnum_vector_partially_zero:
+; X86:       # %bb.0:
+; X86-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
+; X86-NEXT:    vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
+; X86-NEXT:    vminpd %xmm0, %xmm1, %xmm0
+; X86-NEXT:    retl
+  %r = call <2 x double> @llvm.minimumnum.v2f64(<2 x double> %x, <2 x double> <double 0., double 5.>)
+  ret <2 x double> %r
+}
+
+define <2 x double> @test_fminimumnum_vector_different_zeros(<2 x double> %x) {
+; SSE2-LABEL: test_fminimumnum_vector_different_zeros:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movaps %xmm0, %xmm1
+; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[3,3]
+; SSE2-NEXT:    xorps %xmm2, %xmm2
+; SSE2-NEXT:    pxor %xmm3, %xmm3
+; SSE2-NEXT:    pcmpgtd %xmm1, %xmm3
+; SSE2-NEXT:    movhps {{.*#+}} xmm2 = xmm2[0,1],mem[0,1]
+; SSE2-NEXT:    movdqa %xmm3, %xmm1
+; SSE2-NEXT:    pandn %xmm2, %xmm1
+; SSE2-NEXT:    movaps %xmm0, %xmm4
+; SSE2-NEXT:    andps %xmm3, %xmm4
+; SSE2-NEXT:    orps %xmm1, %xmm4
+; SSE2-NEXT:    pand %xmm0, %xmm2
+; SSE2-NEXT:    pandn %xmm0, %xmm3
+; SSE2-NEXT:    por %xmm2, %xmm3
+; SSE2-NEXT:    movdqa %xmm3, %xmm1
+; SSE2-NEXT:    minpd %xmm4, %xmm1
+; SSE2-NEXT:    movdqa %xmm3, %xmm0
+; SSE2-NEXT:    cmpordpd %xmm3, %xmm0
+; SSE2-NEXT:    andpd %xmm0, %xmm3
+; SSE2-NEXT:    andnpd %xmm1, %xmm0
+; SSE2-NEXT:    orpd %xmm3, %xmm0
+; SSE2-NEXT:    retq
+;
+; AVX-LABEL: test_fminimumnum_vector_different_zeros:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
+; AVX-NEXT:    vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
+; AVX-NEXT:    vblendvpd %xmm0, %xmm0, %xmm1, %xmm2
+; AVX-NEXT:    vblendvpd %xmm0, %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vminpd %xmm2, %xmm0, %xmm1
+; AVX-NEXT:    vcmpordpd %xmm0, %xmm0, %xmm2
+; AVX-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    retq
+;
+; AVX10_2-LABEL: test_fminimumnum_vector_different_zeros:
+; AVX10_2:       # %bb.0:
+; AVX10_2-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
+; AVX10_2-NEXT:    vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
+; AVX10_2-NEXT:    vminmaxpd $16, %xmm1, %xmm0, %xmm0
+; AVX10_2-NEXT:    retq
+;
+; X86-LABEL: test_fminimumnum_vector_different_zeros:
+; X86:       # %bb.0:
+; X86-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
+; X86-NEXT:    vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
+; X86-NEXT:    vblendvpd %xmm0, %xmm0, %xmm1, %xmm2
+; X86-NEXT:    vblendvpd %xmm0, %xmm1, %xmm0, %xmm0
+; X86-NEXT:    vminpd %xmm2, %xmm0, %xmm1
+; X86-NEXT:    vcmpordpd %xmm0, %xmm0, %xmm2
+; X86-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; X86-NEXT:    retl
+  %r = call <2 x double> @llvm.minimumnum.v2f64(<2 x double> %x, <2 x double> <double 0., double -0.>)
+  ret <2 x double> %r
+}
+
+define <4 x float> @test_fmaximumnum_vector_non_zero(<4 x float> %x) {
+; SSE2-LABEL: test_fmaximumnum_vector_non_zero:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movaps {{.*#+}} xmm1 = [5.0E+0,4.0E+0,3.0E+0,2.0E+0]
+; SSE2-NEXT:    maxps %xmm0, %xmm1
+; SSE2-NEXT:    movaps %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; AVX-LABEL: test_fmaximumnum_vector_non_zero:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vmovaps {{.*#+}} xmm1 = [5.0E+0,4.0E+0,3.0E+0,2.0E+0]
+; AVX-NEXT:    vmaxps %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    retq
+;
+; AVX10_2-LABEL: test_fmaximumnum_vector_non_zero:
+; AVX10_2:       # %bb.0:
+; AVX10_2-NEXT:    vminmaxps $17, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX10_2-NEXT:    retq
+;
+; X86-LABEL: test_fmaximumnum_vector_non_zero:
+; X86:       # %bb.0:
+; X86-NEXT:    vmovaps {{.*#+}} xmm1 = [5.0E+0,4.0E+0,3.0E+0,2.0E+0]
+; X86-NEXT:    vmaxps %xmm0, %xmm1, %xmm0
+; X86-NEXT:    retl
+  %r = call <4 x float> @llvm.maximumnum.v4f32(<4 x float> %x, <4 x float> <float 5., float 4., float 3., float 2.>)
+  ret <4 x float> %r
+}
+
+define <2 x double> @test_fminimumnum_vector_nan(<2 x double> %x) {
+; SSE2-LABEL: test_fminimumnum_vector_nan:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    xorpd %xmm2, %xmm2
+; SSE2-NEXT:    xorpd %xmm1, %xmm1
+; SSE2-NEXT:    movhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
+; SSE2-NEXT:    minpd %xmm0, %xmm1
+; SSE2-NEXT:    movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
+; SSE2-NEXT:    movapd %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; AVX-LABEL: test_fminimumnum_vector_nan:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
+; AVX-NEXT:    vmovhpd {{.*#+}} xmm2 = xmm1[0],mem[0]
+; AVX-NEXT:    vminpd %xmm0, %xmm2, %xmm0
+; AVX-NEXT:    vblendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; AVX-NEXT:    retq
+;
+; AVX10_2-LABEL: test_fminimumnum_vector_nan:
+; AVX10_2:       # %bb.0:
+; AVX10_2-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
+; AVX10_2-NEXT:    vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
+; AVX10_2-NEXT:    vminmaxpd $16, %xmm1, %xmm0, %xmm0
+; AVX10_2-NEXT:    retq
+;
+; X86-LABEL: test_fminimumnum_vector_nan:
+; X86:       # %bb.0:
+; X86-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
+; X86-NEXT:    vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
+; X86-NEXT:    vminpd %xmm0, %xmm1, %xmm0
+; X86-NEXT:    vcmpordpd %xmm1, %xmm1, %xmm2
+; X86-NEXT:    vblendvpd %xmm2, %xmm1, %xmm0, %xmm0
+; X86-NEXT:    retl
+  %r = call <2 x double> @llvm.minimumnum.v2f64(<2 x double> %x, <2 x double> <double 0., double 0x7fff000000000000>)
+  ret <2 x double> %r
+}
+
+define <2 x double> @test_fminimumnum_vector_zero_first(<2 x double> %x) {
+; SSE2-LABEL: test_fminimumnum_vector_zero_first:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    xorpd %xmm1, %xmm1
+; SSE2-NEXT:    minpd %xmm0, %xmm1
+; SSE2-NEXT:    movapd %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; AVX-LABEL: test_fminimumnum_vector_zero_first:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
+; AVX-NEXT:    vminpd %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    retq
+;
+; AVX10_2-LABEL: test_fminimumnum_vector_zero_first:
+; AVX10_2:       # %bb.0:
+; AVX10_2-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
+; AVX10_2-NEXT:    vminmaxpd $16, %xmm1, %xmm0, %xmm0
+; AVX10_2-NEXT:    retq
+;
+; X86-LABEL: test_fminimumnum_vector_zero_first:
+; X86:       # %bb.0:
+; X86-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
+; X86-NEXT:    vminpd %xmm0, %xmm1, %xmm0
+; X86-NEXT:    retl
+  %r = call <2 x double> @llvm.minimumnum.v2f64(<2 x double> <double 0., double 0.>, <2 x double> %x)
+  ret <2 x double> %r
+}
+
+define <2 x double> @test_fminimumnum_vector_signed_zero(<2 x double> %x) {
+; SSE2-LABEL: test_fminimumnum_vector_signed_zero:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movapd %xmm0, %xmm1
+; SSE2-NEXT:    minpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE2-NEXT:    movapd %xmm0, %xmm2
+; SSE2-NEXT:    cmpordpd %xmm0, %xmm2
+; SSE2-NEXT:    andpd %xmm2, %xmm0
+; SSE2-NEXT:    andnpd %xmm1, %xmm2
+; SSE2-NEXT:    orpd %xmm2, %xmm0
+; SSE2-NEXT:    retq
+;
+; AVX-LABEL: test_fminimumnum_vector_signed_zero:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vcmpordpd %xmm0, %xmm0, %xmm1
+; AVX-NEXT:    vminpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
+; AVX-NEXT:    vblendvpd %xmm1, %xmm0, %xmm2, %xmm0
+; AVX-NEXT:    retq
+;
+; AVX10_2-LABEL: test_fminimumnum_vector_signed_zero:
+; AVX10_2:       # %bb.0:
+; AVX10_2-NEXT:    vminmaxpd $16, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0
+; AVX10_2-NEXT:    retq
+;
+; X86-LABEL: test_fminimumnum_vector_signed_zero:
+; X86:       # %bb.0:
+; X86-NEXT:    vcmpordpd %xmm0, %xmm0, %xmm1
+; X86-NEXT:    vminpd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm2
+; X86-NEXT:    vblendvpd %xmm1, %xmm0, %xmm2, %xmm0
+; X86-NEXT:    retl
+  %r = call <2 x double> @llvm.minimumnum.v2f64(<2 x double> %x, <2 x double> <double -0., double -0.>)
+  ret <2 x double> %r
+}
+
+define <4 x float> @test_fmaximumnum_vector_signed_zero_first(<4 x float> %x) {
+; SSE2-LABEL: test_fmaximumnum_vector_signed_zero_first:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movaps {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
+; SSE2-NEXT:    maxps %xmm0, %xmm1
+; SSE2-NEXT:    movaps %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; AVX-LABEL: test_fmaximumnum_vector_signed_zero_first:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vbroadcastss {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
+; AVX-NEXT:    vmaxps %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    retq
+;
+; AVX10_2-LABEL: test_fmaximumnum_vector_signed_zero_first:
+; AVX10_2:       # %bb.0:
+; AVX10_2-NEXT:    vminmaxps $17, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
+; AVX10_2-NEXT:    retq
+;
+; X86-LABEL: test_fmaximumnum_vector_signed_zero_first:
+; X86:       # %bb.0:
+; X86-NEXT:    vbroadcastss {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
+; X86-NEXT:    vmaxps %xmm0, %xmm1, %xmm0
+; X86-NEXT:    retl
+  %r = call <4 x float> @llvm.maximumnum.v4f32(<4 x float> <float -0., float -0., float -0., float -0.>, <4 x float> %x)
+  ret <4 x float> %r
+}
+
+define <4 x float> @test_fmaximumnum_vector_zero(<4 x float> %x) {
+; SSE2-LABEL: test_fmaximumnum_vector_zero:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    xorps %xmm1, %xmm1
+; SSE2-NEXT:    movaps %xmm0, %xmm2
+; SSE2-NEXT:    maxps %xmm1, %xmm2
+; SSE2-NEXT:    movaps %xmm0, %xmm1
+; SSE2-NEXT:    cmpordps %xmm0, %xmm1
+; SSE2-NEXT:    andps %xmm1, %xmm0
+; SSE2-NEXT:    andnps %xmm2, %xmm1
+; SSE2-NEXT:    orps %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; AVX-LABEL: test_fmaximumnum_vector_zero:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; AVX-NEXT:    vmaxps %xmm1, %xmm0, %xmm1
+; AVX-NEXT:    vcmpordps %xmm0, %xmm0, %xmm2
+; AVX-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    retq
+;
+; AVX10_2-LABEL: test_fmaximumnum_vector_zero:
+; AVX10_2:       # %bb.0:
+; AVX10_2-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
+; AVX10_2-NEXT:    vminmaxps $17, %xmm1, %xmm0, %xmm0
+; AVX10_2-NEXT:    retq
+;
+; X86-LABEL: test_fmaximumnum_vector_zero:
+; X86:       # %bb.0:
+; X86-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; X86-NEXT:    vmaxps %xmm1, %xmm0, %xmm1
+; X86-NEXT:    vcmpordps %xmm0, %xmm0, %xmm2
+; X86-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; X86-NEXT:    retl
+  %r = call <4 x float> @llvm.maximumnum.v4f32(<4 x float> %x, <4 x float> <float 0., float 0., float 0., float 0.>)
+  ret <4 x float> %r
+}
+
+; PR77805: Check that signed zeroes are handled correctly in this case (FIXME)
+define <4 x float> @test_fmaximumnum_v4f32_splat(<4 x float> %x, float %y) {
+; SSE2-LABEL: test_fmaximumnum_v4f32_splat:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0,0,0]
+; SSE2-NEXT:    pxor %xmm2, %xmm2
+; SSE2-NEXT:    pcmpgtd %xmm0, %xmm2
+; SSE2-NEXT:    movdqa %xmm2, %xmm3
+; SSE2-NEXT:    pandn %xmm0, %xmm3
+; SSE2-NEXT:    movaps %xmm1, %xmm4
+; SSE2-NEXT:    andps %xmm2, %xmm4
+; SSE2-NEXT:    orps %xmm3, %xmm4
+; SSE2-NEXT:    pand %xmm2, %xmm0
+; SSE2-NEXT:    andnps %xmm1, %xmm2
+; SSE2-NEXT:    por %xmm2, %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    maxps %xmm4, %xmm1
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    cmpordps %xmm0, %xmm2
+; SSE2-NEXT:    andps %xmm2, %xmm0
+; SSE2-NEXT:    andnps %xmm1, %xmm2
+; SSE2-NEXT:    orps %xmm2, %xmm0
+; SSE2-NEXT:    retq
+;
+; AVX1-LABEL: test_fmaximumnum_v4f32_splat:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[0,0,0,0]
+; AVX1-NEXT:    vblendvps %xmm0, %xmm1, %xmm0, %xmm2
+; AVX1-NEXT:    vblendvps %xmm0, %xmm0, %xmm1, %xmm0
+; AVX1-NEXT:    vmaxps %xmm2, %xmm0, %xmm1
+; AVX1-NEXT:    vcmpordps %xmm0, %xmm0, %xmm2
+; AVX1-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX512-LABEL: test_fmaximumnum_v4f32_splat:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vbroadcastss %xmm1, %xmm1
+; AVX512-NEXT:    vblendvps %xmm0, %xmm1, %xmm0, %xmm2
+; AVX512-NEXT:    vblendvps %xmm0, %xmm0, %xmm1, %xmm0
+; AVX512-NEXT:    vmaxps %xmm2, %xmm0, %xmm1
+; AVX512-NEXT:    vcmpordps %xmm0, %xmm0, %xmm2
+; AVX512-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; AVX512-NEXT:    retq
+;
+; AVX10_2-LABEL: test_fmaximumnum_v4f32_splat:
+; AVX10_2:       # %bb.0:
+; AVX10_2-NEXT:    vbroadcastss %xmm1, %xmm1
+; AVX10_2-NEXT:    vminmaxps $17, %xmm1, %xmm0, %xmm0
+; AVX10_2-NEXT:    retq
+;
+; X86-LABEL: test_fmaximumnum_v4f32_splat:
+; X86:       # %bb.0:
+; X86-NEXT:    vbroadcastss {{[0-9]+}}(%esp), %xmm1
+; X86-NEXT:    vblendvps %xmm0, %xmm1, %xmm0, %xmm2
+; X86-NEXT:    vblendvps %xmm0, %xmm0, %xmm1, %xmm0
+; X86-NEXT:    vmaxps %xmm2, %xmm0, %xmm1
+; X86-NEXT:    vcmpordps %xmm0, %xmm0, %xmm2
+; X86-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; X86-NEXT:    retl
+  %splatinsert = insertelement <4 x float> poison, float %y, i64 0
+  %vec = shufflevector <4 x float> %splatinsert, <4 x float> poison, <4 x i32> zeroinitializer
+  %r = call <4 x float> @llvm.maximumnum.v4f32(<4 x float> %x, <4 x float> %vec) readnone
+  ret <4 x float> %r
+}
+
+define <4 x half> @test_fmaximumnum_v4f16(<4 x half> %x, <4 x half> %y) nounwind {
+; SSE2-LABEL: test_fmaximumnum_v4f16:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    subq $104, %rsp
+; SSE2-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT:    psrld $16, %xmm0
+; SSE2-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT:    movdqa %xmm1, (%rsp) # 16-byte Spill
+; SSE2-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NEXT:    psrld $16, %xmm0
+; SSE2-NEXT:    callq __extendhfsf2@PLT
+; SSE2-NEXT:    movd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
+; SSE2-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT:    callq __extendhfsf2@PLT
+; SSE2-NEXT:    movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 4-byte Folded Reload
+; SSE2-NEXT:    # xmm4 = mem[0],zero,zero,zero
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    movd %xmm0, %eax
+; SSE2-NEXT:    testl %eax, %eax
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    js .LBB33_2
+; SSE2-NEXT:  # %bb.1:
+; SSE2-NEXT:    movdqa %xmm4, %xmm2
+; SSE2-NEXT:  .LBB33_2:
+; SSE2-NEXT:    movdqa %xmm2, %xmm0
+; SSE2-NEXT:    cmpordss %xmm2, %xmm0
+; SSE2-NEXT:    movaps %xmm0, %xmm3
+; SSE2-NEXT:    andps %xmm2, %xmm3
+; SSE2-NEXT:    js .LBB33_4
+; SSE2-NEXT:  # %bb.3:
+; SSE2-NEXT:    movdqa %xmm1, %xmm4
+; SSE2-NEXT:  .LBB33_4:
+; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1,1,1]
+; SSE2-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT:    movaps (%rsp), %xmm1 # 16-byte Reload
+; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1,1,1]
+; SSE2-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT:    maxss %xmm4, %xmm2
+; SSE2-NEXT:    andnps %xmm2, %xmm0
+; SSE2-NEXT:    orps %xmm3, %xmm0
+; SSE2-NEXT:    callq __truncsfhf2@PLT
+; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT:    movaps (%rsp), %xmm0 # 16-byte Reload
+; SSE2-NEXT:    callq __extendhfsf2@PLT
+; SSE2-NEXT:    movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; SSE2-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT:    callq __extendhfsf2@PLT
+; SSE2-NEXT:    movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 4-byte Folded Reload
+; SSE2-NEXT:    # xmm4 = mem[0],zero,zero,zero
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    movd %xmm0, %eax
+; SSE2-NEXT:    testl %eax, %eax
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    js .LBB33_6
+; SSE2-NEXT:  # %bb.5:
+; SSE2-NEXT:    movdqa %xmm4, %xmm2
+; SSE2-NEXT:  .LBB33_6:
+; SSE2-NEXT:    movdqa %xmm2, %xmm0
+; SSE2-NEXT:    cmpordss %xmm2, %xmm0
+; SSE2-NEXT:    movaps %xmm0, %xmm3
+; SSE2-NEXT:    andps %xmm2, %xmm3
+; SSE2-NEXT:    js .LBB33_8
+; SSE2-NEXT:  # %bb.7:
+; SSE2-NEXT:    movdqa %xmm1, %xmm4
+; SSE2-NEXT:  .LBB33_8:
+; SSE2-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; SSE2-NEXT:    psrlq $48, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT:    movdqa (%rsp), %xmm1 # 16-byte Reload
+; SSE2-NEXT:    psrlq $48, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, (%rsp) # 16-byte Spill
+; SSE2-NEXT:    maxss %xmm4, %xmm2
+; SSE2-NEXT:    andnps %xmm2, %xmm0
+; SSE2-NEXT:    orps %xmm3, %xmm0
+; SSE2-NEXT:    callq __truncsfhf2@PLT
+; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT:    callq __extendhfsf2@PLT
+; SSE2-NEXT:    movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; SSE2-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT:    callq __extendhfsf2@PLT
+; SSE2-NEXT:    movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 4-byte Folded Reload
+; SSE2-NEXT:    # xmm4 = mem[0],zero,zero,zero
+; SSE2-NEXT:    movd %xmm0, %eax
+; SSE2-NEXT:    testl %eax, %eax
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    js .LBB33_10
+; SSE2-NEXT:  # %bb.9:
+; SSE2-NEXT:    movdqa %xmm4, %xmm2
+; SSE2-NEXT:  .LBB33_10:
+; SSE2-NEXT:    movdqa %xmm2, %xmm1
+; SSE2-NEXT:    cmpordss %xmm2, %xmm1
+; SSE2-NEXT:    movaps %xmm1, %xmm3
+; SSE2-NEXT:    andps %xmm2, %xmm3
+; SSE2-NEXT:    js .LBB33_12
+; SSE2-NEXT:  # %bb.11:
+; SSE2-NEXT:    movdqa %xmm0, %xmm4
+; SSE2-NEXT:  .LBB33_12:
+; SSE2-NEXT:    maxss %xmm4, %xmm2
+; SSE2-NEXT:    andnps %xmm2, %xmm1
+; SSE2-NEXT:    orps %xmm3, %xmm1
+; SSE2-NEXT:    movaps %xmm1, %xmm0
+; SSE2-NEXT:    callq __truncsfhf2@PLT
+; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT:    movaps (%rsp), %xmm0 # 16-byte Reload
+; SSE2-NEXT:    callq __extendhfsf2@PLT
+; SSE2-NEXT:    movss %xmm0, (%rsp) # 4-byte Spill
+; SSE2-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT:    callq __extendhfsf2@PLT
+; SSE2-NEXT:    movd (%rsp), %xmm4 # 4-byte Folded Reload
+; SSE2-NEXT:    # xmm4 = mem[0],zero,zero,zero
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    movd %xmm0, %eax
+; SSE2-NEXT:    testl %eax, %eax
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    js .LBB33_14
+; SSE2-NEXT:  # %bb.13:
+; SSE2-NEXT:    movdqa %xmm4, %xmm2
+; SSE2-NEXT:  .LBB33_14:
+; SSE2-NEXT:    movdqa %xmm2, %xmm0
+; SSE2-NEXT:    cmpordss %xmm2, %xmm0
+; SSE2-NEXT:    movaps %xmm0, %xmm3
+; SSE2-NEXT:    andps %xmm2, %xmm3
+; SSE2-NEXT:    js .LBB33_16
+; SSE2-NEXT:  # %bb.15:
+; SSE2-NEXT:    movdqa %xmm1, %xmm4
+; SSE2-NEXT:  .LBB33_16:
+; SSE2-NEXT:    maxss %xmm4, %xmm2
+; SSE2-NEXT:    andnps %xmm2, %xmm0
+; SSE2-NEXT:    orps %xmm3, %xmm0
+; SSE2-NEXT:    callq __truncsfhf2@PLT
+; SSE2-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; SSE2-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT:    punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; SSE2-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT:    addq $104, %rsp
+; SSE2-NEXT:    retq
+;
+; AVX1-LABEL: test_fmaximumnum_v4f16:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    subq $120, %rsp
+; AVX1-NEXT:    vmovaps %xmm0, %xmm2
+; AVX1-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; AVX1-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
+; AVX1-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-NEXT:    vpsrld $16, %xmm2, %xmm0
+; AVX1-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-NEXT:    vmovaps %xmm1, (%rsp) # 16-byte Spill
+; AVX1-NEXT:    vpsrld $16, %xmm1, %xmm0
+; AVX1-NEXT:    callq __extendhfsf2@PLT
+; AVX1-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX1-NEXT:    callq __extendhfsf2@PLT
+; AVX1-NEXT:    vmovd %xmm0, %eax
+; AVX1-NEXT:    testl %eax, %eax
+; AVX1-NEXT:    js .LBB33_1
+; AVX1-NEXT:  # %bb.2:
+; AVX1-NEXT:    vmovdqa %xmm0, %xmm1
+; AVX1-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
+; AVX1-NEXT:    jmp .LBB33_3
+; AVX1-NEXT:  .LBB33_1:
+; AVX1-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX1-NEXT:    vmovdqa %xmm0, %xmm2
+; AVX1-NEXT:  .LBB33_3:
+; AVX1-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX1-NEXT:    vpsrlq $48, %xmm0, %xmm0
+; AVX1-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-NEXT:    vmovdqa (%rsp), %xmm0 # 16-byte Reload
+; AVX1-NEXT:    vpsrlq $48, %xmm0, %xmm0
+; AVX1-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-NEXT:    vmaxss %xmm1, %xmm2, %xmm0
+; AVX1-NEXT:    vcmpordss %xmm2, %xmm2, %xmm1
+; AVX1-NEXT:    vblendvps %xmm1, %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    callq __truncsfhf2@PLT
+; AVX1-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
+; AVX1-NEXT:    callq __extendhfsf2@PLT
+; AVX1-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
+; AVX1-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX1-NEXT:    callq __extendhfsf2@PLT
+; AVX1-NEXT:    vmovd %xmm0, %eax
+; AVX1-NEXT:    testl %eax, %eax
+; AVX1-NEXT:    js .LBB33_4
+; AVX1-NEXT:  # %bb.5:
+; AVX1-NEXT:    vmovdqa %xmm0, %xmm1
+; AVX1-NEXT:    vmovdqa (%rsp), %xmm2 # 16-byte Reload
+; AVX1-NEXT:    jmp .LBB33_6
+; AVX1-NEXT:  .LBB33_4:
+; AVX1-NEXT:    vmovdqa (%rsp), %xmm1 # 16-byte Reload
+; AVX1-NEXT:    vmovdqa %xmm0, %xmm2
+; AVX1-NEXT:  .LBB33_6:
+; AVX1-NEXT:    vmaxss %xmm1, %xmm2, %xmm0
+; AVX1-NEXT:    vcmpordss %xmm2, %xmm2, %xmm1
+; AVX1-NEXT:    vblendvps %xmm1, %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    callq __truncsfhf2@PLT
+; AVX1-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX1-NEXT:    callq __extendhfsf2@PLT
+; AVX1-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
+; AVX1-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX1-NEXT:    callq __extendhfsf2@PLT
+; AVX1-NEXT:    vmovd %xmm0, %eax
+; AVX1-NEXT:    testl %eax, %eax
+; AVX1-NEXT:    js .LBB33_7
+; AVX1-NEXT:  # %bb.8:
+; AVX1-NEXT:    vmovdqa %xmm0, %xmm1
+; AVX1-NEXT:    vmovdqa (%rsp), %xmm2 # 16-byte Reload
+; AVX1-NEXT:    jmp .LBB33_9
+; AVX1-NEXT:  .LBB33_7:
+; AVX1-NEXT:    vmovdqa (%rsp), %xmm1 # 16-byte Reload
+; AVX1-NEXT:    vmovdqa %xmm0, %xmm2
+; AVX1-NEXT:  .LBB33_9:
+; AVX1-NEXT:    vmaxss %xmm1, %xmm2, %xmm0
+; AVX1-NEXT:    vcmpordss %xmm2, %xmm2, %xmm1
+; AVX1-NEXT:    vblendvps %xmm1, %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    callq __truncsfhf2@PLT
+; AVX1-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
+; AVX1-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX1-NEXT:    callq __extendhfsf2@PLT
+; AVX1-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX1-NEXT:    callq __extendhfsf2@PLT
+; AVX1-NEXT:    vmovd %xmm0, %eax
+; AVX1-NEXT:    testl %eax, %eax
+; AVX1-NEXT:    js .LBB33_10
+; AVX1-NEXT:  # %bb.11:
+; AVX1-NEXT:    vmovdqa %xmm0, %xmm1
+; AVX1-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
+; AVX1-NEXT:    jmp .LBB33_12
+; AVX1-NEXT:  .LBB33_10:
+; AVX1-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX1-NEXT:    vmovdqa %xmm0, %xmm2
+; AVX1-NEXT:  .LBB33_12:
+; AVX1-NEXT:    vmaxss %xmm1, %xmm2, %xmm0
+; AVX1-NEXT:    vcmpordss %xmm2, %xmm2, %xmm1
+; AVX1-NEXT:    vblendvps %xmm1, %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    callq __truncsfhf2@PLT
+; AVX1-NEXT:    vmovdqa (%rsp), %xmm1 # 16-byte Reload
+; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX1-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX1-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
+; AVX1-NEXT:    # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
+; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],zero,zero
+; AVX1-NEXT:    addq $120, %rsp
+; AVX1-NEXT:    retq
+;
+; AVX512-LABEL: test_fmaximumnum_v4f16:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    subq $88, %rsp
+; AVX512-NEXT:    vmovdqa %xmm1, %xmm4
+; AVX512-NEXT:    vmovdqa %xmm0, %xmm6
+; AVX512-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512-NEXT:    vcvtph2ps %xmm0, %xmm0
+; AVX512-NEXT:    vucomiss %xmm0, %xmm0
+; AVX512-NEXT:    setp %al
+; AVX512-NEXT:    kmovw %eax, %k1
+; AVX512-NEXT:    vpsrldq {{.*#+}} xmm2 = xmm6[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512-NEXT:    vcvtph2ps %xmm2, %xmm2
+; AVX512-NEXT:    vucomiss %xmm2, %xmm2
+; AVX512-NEXT:    setp %al
+; AVX512-NEXT:    kmovw %eax, %k2
+; AVX512-NEXT:    vmovss %xmm0, %xmm2, %xmm2 {%k2}
+; AVX512-NEXT:    vcvtps2ph $4, %xmm2, %xmm1
+; AVX512-NEXT:    vmovaps %xmm1, (%rsp) # 16-byte Spill
+; AVX512-NEXT:    vcvtph2ps %xmm1, %xmm2
+; AVX512-NEXT:    vmovss %xmm2, %xmm0, %xmm0 {%k1}
+; AVX512-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
+; AVX512-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-NEXT:    vcvtph2ps %xmm0, %xmm0
+; AVX512-NEXT:    vucomiss %xmm0, %xmm2
+; AVX512-NEXT:    seta %al
+; AVX512-NEXT:    kmovw %eax, %k1
+; AVX512-NEXT:    vmovss %xmm2, %xmm0, %xmm0 {%k1}
+; AVX512-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
+; AVX512-NEXT:    vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
+; AVX512-NEXT:    vcvtph2ps %xmm0, %xmm0
+; AVX512-NEXT:    movzwl {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %eax
+; AVX512-NEXT:    vmovd %eax, %xmm2
+; AVX512-NEXT:    vcvtph2ps %xmm2, %xmm9
+; AVX512-NEXT:    vmulss %xmm0, %xmm9, %xmm0
+; AVX512-NEXT:    vxorps %xmm10, %xmm10, %xmm10
+; AVX512-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm10[1,2,3]
+; AVX512-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
+; AVX512-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-NEXT:    vmovd %xmm0, %eax
+; AVX512-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm0
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm2 = xmm4[3,3,3,3]
+; AVX512-NEXT:    vcvtph2ps %xmm2, %xmm2
+; AVX512-NEXT:    vucomiss %xmm2, %xmm2
+; AVX512-NEXT:    setp %al
+; AVX512-NEXT:    kmovw %eax, %k1
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm3 = xmm6[3,3,3,3]
+; AVX512-NEXT:    vcvtph2ps %xmm3, %xmm3
+; AVX512-NEXT:    vucomiss %xmm3, %xmm3
+; AVX512-NEXT:    setp %al
+; AVX512-NEXT:    kmovw %eax, %k2
+; AVX512-NEXT:    vmovss %xmm2, %xmm3, %xmm3 {%k2}
+; AVX512-NEXT:    vcvtps2ph $4, %xmm3, %xmm1
+; AVX512-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-NEXT:    vcvtph2ps %xmm1, %xmm3
+; AVX512-NEXT:    vmovss %xmm3, %xmm2, %xmm2 {%k1}
+; AVX512-NEXT:    vcvtps2ph $4, %xmm2, %xmm1
+; AVX512-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-NEXT:    vcvtph2ps %xmm1, %xmm2
+; AVX512-NEXT:    vucomiss %xmm2, %xmm3
+; AVX512-NEXT:    seta %al
+; AVX512-NEXT:    kmovw %eax, %k1
+; AVX512-NEXT:    vmovss %xmm3, %xmm2, %xmm2 {%k1}
+; AVX512-NEXT:    vcvtps2ph $4, %xmm2, %xmm2
+; AVX512-NEXT:    vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
+; AVX512-NEXT:    vcvtph2ps %xmm2, %xmm2
+; AVX512-NEXT:    vmulss %xmm2, %xmm9, %xmm2
+; AVX512-NEXT:    vblendps {{.*#+}} xmm2 = xmm2[0],xmm10[1,2,3]
+; AVX512-NEXT:    vcvtps2ph $4, %xmm2, %xmm1
+; AVX512-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-NEXT:    vmovd %xmm1, %eax
+; AVX512-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm2
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
+; AVX512-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm4[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512-NEXT:    vcvtph2ps %xmm0, %xmm0
+; AVX512-NEXT:    vucomiss %xmm0, %xmm0
+; AVX512-NEXT:    setp %al
+; AVX512-NEXT:    kmovw %eax, %k1
+; AVX512-NEXT:    vpsrldq {{.*#+}} xmm3 = xmm6[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512-NEXT:    vcvtph2ps %xmm3, %xmm3
+; AVX512-NEXT:    vucomiss %xmm3, %xmm3
+; AVX512-NEXT:    setp %al
+; AVX512-NEXT:    kmovw %eax, %k2
+; AVX512-NEXT:    vmovss %xmm0, %xmm3, %xmm3 {%k2}
+; AVX512-NEXT:    vcvtps2ph $4, %xmm3, %xmm1
+; AVX512-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-NEXT:    vcvtph2ps %xmm1, %xmm5
+; AVX512-NEXT:    vmovss %xmm5, %xmm0, %xmm0 {%k1}
+; AVX512-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
+; AVX512-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-NEXT:    vcvtph2ps %xmm0, %xmm3
+; AVX512-NEXT:    vucomiss %xmm3, %xmm5
+; AVX512-NEXT:    seta %al
+; AVX512-NEXT:    kmovw %eax, %k1
+; AVX512-NEXT:    vmovss %xmm5, %xmm3, %xmm3 {%k1}
+; AVX512-NEXT:    vshufpd {{.*#+}} xmm0 = xmm4[1,0]
+; AVX512-NEXT:    vcvtph2ps %xmm0, %xmm0
+; AVX512-NEXT:    vucomiss %xmm0, %xmm0
+; AVX512-NEXT:    setp %al
+; AVX512-NEXT:    kmovw %eax, %k1
+; AVX512-NEXT:    vshufpd {{.*#+}} xmm5 = xmm6[1,0]
+; AVX512-NEXT:    vcvtph2ps %xmm5, %xmm5
+; AVX512-NEXT:    vucomiss %xmm5, %xmm5
+; AVX512-NEXT:    setp %al
+; AVX512-NEXT:    kmovw %eax, %k2
+; AVX512-NEXT:    vmovss %xmm0, %xmm5, %xmm5 {%k2}
+; AVX512-NEXT:    vcvtps2ph $4, %xmm5, %xmm15
+; AVX512-NEXT:    vcvtph2ps %xmm15, %xmm5
+; AVX512-NEXT:    vmovss %xmm5, %xmm0, %xmm0 {%k1}
+; AVX512-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
+; AVX512-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-NEXT:    vcvtph2ps %xmm0, %xmm0
+; AVX512-NEXT:    vucomiss %xmm0, %xmm5
+; AVX512-NEXT:    seta %al
+; AVX512-NEXT:    kmovw %eax, %k1
+; AVX512-NEXT:    vmovss %xmm5, %xmm0, %xmm0 {%k1}
+; AVX512-NEXT:    vcvtps2ph $4, %xmm3, %xmm3
+; AVX512-NEXT:    vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero
+; AVX512-NEXT:    vcvtph2ps %xmm3, %xmm3
+; AVX512-NEXT:    vmulss %xmm3, %xmm9, %xmm3
+; AVX512-NEXT:    vblendps {{.*#+}} xmm3 = xmm3[0],xmm10[1,2,3]
+; AVX512-NEXT:    vcvtps2ph $4, %xmm3, %xmm1
+; AVX512-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-NEXT:    vmovd %xmm1, %eax
+; AVX512-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
+; AVX512-NEXT:    vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
+; AVX512-NEXT:    vcvtph2ps %xmm0, %xmm0
+; AVX512-NEXT:    vmulss %xmm0, %xmm9, %xmm0
+; AVX512-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm10[1,2,3]
+; AVX512-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
+; AVX512-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-NEXT:    vmovd %xmm0, %ecx
+; AVX512-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm0
+; AVX512-NEXT:    vpinsrw $0, %ecx, %xmm0, %xmm3
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3]
+; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; AVX512-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm4[3,3,3,3,4,5,6,7]
+; AVX512-NEXT:    vcvtph2ps %xmm0, %xmm0
+; AVX512-NEXT:    vucomiss %xmm0, %xmm0
+; AVX512-NEXT:    setp %al
+; AVX512-NEXT:    kmovw %eax, %k1
+; AVX512-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm6[3,3,3,3,4,5,6,7]
+; AVX512-NEXT:    vcvtph2ps %xmm2, %xmm2
+; AVX512-NEXT:    vucomiss %xmm2, %xmm2
+; AVX512-NEXT:    setp %al
+; AVX512-NEXT:    kmovw %eax, %k2
+; AVX512-NEXT:    vmovss %xmm0, %xmm2, %xmm2 {%k2}
+; AVX512-NEXT:    vcvtps2ph $4, %xmm2, %xmm11
+; AVX512-NEXT:    vcvtph2ps %xmm11, %xmm3
+; AVX512-NEXT:    vmovss %xmm3, %xmm0, %xmm0 {%k1}
+; AVX512-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
+; AVX512-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-NEXT:    vcvtph2ps %xmm0, %xmm2
+; AVX512-NEXT:    vucomiss %xmm2, %xmm3
+; AVX512-NEXT:    seta %al
+; AVX512-NEXT:    kmovw %eax, %k1
+; AVX512-NEXT:    vmovss %xmm3, %xmm2, %xmm2 {%k1}
+; AVX512-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm4[1,1,3,3]
+; AVX512-NEXT:    vcvtph2ps %xmm0, %xmm0
+; AVX512-NEXT:    vucomiss %xmm0, %xmm0
+; AVX512-NEXT:    setp %al
+; AVX512-NEXT:    kmovw %eax, %k1
+; AVX512-NEXT:    vmovshdup {{.*#+}} xmm3 = xmm6[1,1,3,3]
+; AVX512-NEXT:    vcvtph2ps %xmm3, %xmm3
+; AVX512-NEXT:    vucomiss %xmm3, %xmm3
+; AVX512-NEXT:    setp %al
+; AVX512-NEXT:    kmovw %eax, %k2
+; AVX512-NEXT:    vmovss %xmm0, %xmm3, %xmm3 {%k2}
+; AVX512-NEXT:    vcvtps2ph $4, %xmm3, %xmm7
+; AVX512-NEXT:    vcvtph2ps %xmm7, %xmm3
+; AVX512-NEXT:    vmovss %xmm3, %xmm0, %xmm0 {%k1}
+; AVX512-NEXT:    vcvtps2ph $4, %xmm0, %xmm12
+; AVX512-NEXT:    vcvtph2ps %xmm12, %xmm0
+; AVX512-NEXT:    vucomiss %xmm0, %xmm3
+; AVX512-NEXT:    seta %al
+; AVX512-NEXT:    kmovw %eax, %k1
+; AVX512-NEXT:    vmovss %xmm3, %xmm0, %xmm0 {%k1}
+; AVX512-NEXT:    vcvtps2ph $4, %xmm2, %xmm2
+; AVX512-NEXT:    vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
+; AVX512-NEXT:    vcvtph2ps %xmm2, %xmm2
+; AVX512-NEXT:    vmulss %xmm2, %xmm9, %xmm2
+; AVX512-NEXT:    vblendps {{.*#+}} xmm2 = xmm2[0],xmm10[1,2,3]
+; AVX512-NEXT:    vcvtps2ph $4, %xmm2, %xmm14
+; AVX512-NEXT:    vmovd %xmm14, %eax
+; AVX512-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
+; AVX512-NEXT:    vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
+; AVX512-NEXT:    vcvtph2ps %xmm0, %xmm0
+; AVX512-NEXT:    vmulss %xmm0, %xmm9, %xmm0
+; AVX512-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm10[1,2,3]
+; AVX512-NEXT:    vcvtps2ph $4, %xmm0, %xmm13
+; AVX512-NEXT:    vmovd %xmm13, %ecx
+; AVX512-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm0
+; AVX512-NEXT:    vpinsrw $0, %ecx, %xmm0, %xmm2
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
+; AVX512-NEXT:    vcvtph2ps %xmm4, %xmm0
+; AVX512-NEXT:    vucomiss %xmm0, %xmm0
+; AVX512-NEXT:    setp %al
+; AVX512-NEXT:    kmovw %eax, %k1
+; AVX512-NEXT:    vcvtph2ps %xmm6, %xmm2
+; AVX512-NEXT:    vucomiss %xmm2, %xmm2
+; AVX512-NEXT:    setp %al
+; AVX512-NEXT:    kmovw %eax, %k2
+; AVX512-NEXT:    vmovss %xmm0, %xmm2, %xmm2 {%k2}
+; AVX512-NEXT:    vcvtps2ph $4, %xmm2, %xmm3
+; AVX512-NEXT:    vcvtph2ps %xmm3, %xmm1
+; AVX512-NEXT:    vmovss %xmm1, %xmm0, %xmm0 {%k1}
+; AVX512-NEXT:    vcvtps2ph $4, %xmm0, %xmm8
+; AVX512-NEXT:    vcvtph2ps %xmm8, %xmm2
+; AVX512-NEXT:    vucomiss %xmm2, %xmm1
+; AVX512-NEXT:    seta %al
+; AVX512-NEXT:    kmovw %eax, %k1
+; AVX512-NEXT:    vmovss %xmm1, %xmm2, %xmm2 {%k1}
+; AVX512-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm4[1,1,1,1,4,5,6,7]
+; AVX512-NEXT:    vcvtph2ps %xmm1, %xmm1
+; AVX512-NEXT:    vucomiss %xmm1, %xmm1
+; AVX512-NEXT:    setp %al
+; AVX512-NEXT:    kmovw %eax, %k1
+; AVX512-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm6[1,1,1,1,4,5,6,7]
+; AVX512-NEXT:    vcvtph2ps %xmm4, %xmm4
+; AVX512-NEXT:    vucomiss %xmm4, %xmm4
+; AVX512-NEXT:    setp %al
+; AVX512-NEXT:    kmovw %eax, %k2
+; AVX512-NEXT:    vmovss %xmm1, %xmm4, %xmm4 {%k2}
+; AVX512-NEXT:    vcvtps2ph $4, %xmm4, %xmm4
+; AVX512-NEXT:    vcvtph2ps %xmm4, %xmm6
+; AVX512-NEXT:    vmovss %xmm6, %xmm1, %xmm1 {%k1}
+; AVX512-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
+; AVX512-NEXT:    vcvtph2ps %xmm1, %xmm0
+; AVX512-NEXT:    vucomiss %xmm0, %xmm6
+; AVX512-NEXT:    seta %al
+; AVX512-NEXT:    kmovw %eax, %k1
+; AVX512-NEXT:    vmovss %xmm6, %xmm0, %xmm0 {%k1}
+; AVX512-NEXT:    vcvtps2ph $4, %xmm2, %xmm2
+; AVX512-NEXT:    vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
+; AVX512-NEXT:    vcvtph2ps %xmm2, %xmm2
+; AVX512-NEXT:    vmulss %xmm2, %xmm9, %xmm2
+; AVX512-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
+; AVX512-NEXT:    vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
+; AVX512-NEXT:    vcvtph2ps %xmm0, %xmm0
+; AVX512-NEXT:    vmulss %xmm0, %xmm9, %xmm0
+; AVX512-NEXT:    vblendps {{.*#+}} xmm2 = xmm2[0],xmm10[1,2,3]
+; AVX512-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm10[1,2,3]
+; AVX512-NEXT:    vcvtps2ph $4, %xmm2, %xmm9
+; AVX512-NEXT:    vmovd %xmm9, %eax
+; AVX512-NEXT:    vcvtps2ph $4, %xmm0, %xmm10
+; AVX512-NEXT:    vmovd %xmm10, %ecx
+; AVX512-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm0
+; AVX512-NEXT:    vpinsrw $0, %ecx, %xmm0, %xmm2
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1]
+; AVX512-NEXT:    vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm6 # 16-byte Folded Reload
+; AVX512-NEXT:    # xmm6 = xmm0[0],mem[0]
+; AVX512-NEXT:    vmovdqa (%rsp), %xmm0 # 16-byte Reload
+; AVX512-NEXT:    vmovd %xmm0, %eax
+; AVX512-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX512-NEXT:    vmovd %xmm0, %ecx
+; AVX512-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm0
+; AVX512-NEXT:    vpinsrw $0, %ecx, %xmm0, %xmm2
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
+; AVX512-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
+; AVX512-NEXT:    vmovd %xmm2, %eax
+; AVX512-NEXT:    vmovd %xmm15, %ecx
+; AVX512-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm2
+; AVX512-NEXT:    vpinsrw $0, %ecx, %xmm0, %xmm5
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3]
+; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
+; AVX512-NEXT:    vmovd %xmm11, %eax
+; AVX512-NEXT:    vmovd %xmm7, %ecx
+; AVX512-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm2
+; AVX512-NEXT:    vpinsrw $0, %ecx, %xmm0, %xmm5
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3]
+; AVX512-NEXT:    vmovd %xmm3, %eax
+; AVX512-NEXT:    vmovd %xmm4, %ecx
+; AVX512-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm3
+; AVX512-NEXT:    vpinsrw $0, %ecx, %xmm0, %xmm4
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
+; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
+; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm0[0]
+; AVX512-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; AVX512-NEXT:    vpcmpeqw %xmm0, %xmm2, %xmm3
+; AVX512-NEXT:    vpblendvb %xmm3, %xmm2, %xmm6, %xmm2
+; AVX512-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
+; AVX512-NEXT:    vmovd %xmm3, %eax
+; AVX512-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
+; AVX512-NEXT:    vmovd %xmm3, %ecx
+; AVX512-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm3
+; AVX512-NEXT:    vpinsrw $0, %ecx, %xmm0, %xmm4
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
+; AVX512-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
+; AVX512-NEXT:    vmovd %xmm4, %eax
+; AVX512-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
+; AVX512-NEXT:    vmovd %xmm4, %ecx
+; AVX512-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm4
+; AVX512-NEXT:    vpinsrw $0, %ecx, %xmm0, %xmm5
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
+; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
+; AVX512-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
+; AVX512-NEXT:    vmovd %xmm4, %eax
+; AVX512-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm4
+; AVX512-NEXT:    vmovd %xmm12, %eax
+; AVX512-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm5
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
+; AVX512-NEXT:    vmovd %xmm8, %eax
+; AVX512-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm5
+; AVX512-NEXT:    vmovd %xmm1, %eax
+; AVX512-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm1
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3]
+; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
+; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
+; AVX512-NEXT:    vpcmpeqw %xmm0, %xmm1, %xmm0
+; AVX512-NEXT:    vpblendvb %xmm0, %xmm1, %xmm2, %xmm0
+; AVX512-NEXT:    vcvtph2ps %xmm10, %xmm1
+; AVX512-NEXT:    xorl %eax, %eax
+; AVX512-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512-NEXT:    vucomiss %xmm2, %xmm1
+; AVX512-NEXT:    movl $65535, %ecx # imm = 0xFFFF
+; AVX512-NEXT:    movl $0, %edx
+; AVX512-NEXT:    cmovel %ecx, %edx
+; AVX512-NEXT:    vcvtph2ps %xmm9, %xmm1
+; AVX512-NEXT:    vucomiss %xmm2, %xmm1
+; AVX512-NEXT:    movl $0, %esi
+; AVX512-NEXT:    cmovel %ecx, %esi
+; AVX512-NEXT:    vcvtph2ps %xmm13, %xmm1
+; AVX512-NEXT:    vucomiss %xmm2, %xmm1
+; AVX512-NEXT:    movl $0, %edi
+; AVX512-NEXT:    cmovel %ecx, %edi
+; AVX512-NEXT:    vcvtph2ps %xmm14, %xmm1
+; AVX512-NEXT:    vucomiss %xmm2, %xmm1
+; AVX512-NEXT:    movl $0, %r8d
+; AVX512-NEXT:    cmovel %ecx, %r8d
+; AVX512-NEXT:    vcvtph2ps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
+; AVX512-NEXT:    vucomiss %xmm2, %xmm1
+; AVX512-NEXT:    movl $0, %r9d
+; AVX512-NEXT:    cmovel %ecx, %r9d
+; AVX512-NEXT:    vcvtph2ps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
+; AVX512-NEXT:    vucomiss %xmm2, %xmm1
+; AVX512-NEXT:    movl $0, %r10d
+; AVX512-NEXT:    cmovel %ecx, %r10d
+; AVX512-NEXT:    vcvtph2ps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
+; AVX512-NEXT:    vucomiss %xmm2, %xmm1
+; AVX512-NEXT:    movl $0, %r11d
+; AVX512-NEXT:    cmovel %ecx, %r11d
+; AVX512-NEXT:    vcvtph2ps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
+; AVX512-NEXT:    vucomiss %xmm2, %xmm1
+; AVX512-NEXT:    vmovd %esi, %xmm1
+; AVX512-NEXT:    vpinsrw $1, %edx, %xmm1, %xmm1
+; AVX512-NEXT:    vpinsrw $2, %edi, %xmm1, %xmm1
+; AVX512-NEXT:    vpinsrw $3, %r8d, %xmm1, %xmm1
+; AVX512-NEXT:    vpinsrw $4, %r9d, %xmm1, %xmm1
+; AVX512-NEXT:    vpinsrw $5, %r10d, %xmm1, %xmm1
+; AVX512-NEXT:    vpinsrw $6, %r11d, %xmm1, %xmm1
+; AVX512-NEXT:    cmovel %ecx, %eax
+; AVX512-NEXT:    vpinsrw $7, %eax, %xmm1, %xmm1
+; AVX512-NEXT:    vpblendvb %xmm1, %xmm0, %xmm6, %xmm0
+; AVX512-NEXT:    addq $88, %rsp
+; AVX512-NEXT:    retq
+;
+; AVX10_2-LABEL: test_fmaximumnum_v4f16:
+; AVX10_2:       # %bb.0:
+; AVX10_2-NEXT:    vminmaxph $17, %xmm1, %xmm0, %xmm0
+; AVX10_2-NEXT:    retq
+;
+; X86-LABEL: test_fmaximumnum_v4f16:
+; X86:       # %bb.0:
+; X86-NEXT:    subl $164, %esp
+; X86-NEXT:    vmovdqa %xmm0, %xmm2
+; X86-NEXT:    vmovdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
+; X86-NEXT:    vpsrlq $48, %xmm0, %xmm0
+; X86-NEXT:    vmovdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
+; X86-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm2[1,1,3,3]
+; X86-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
+; X86-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
+; X86-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
+; X86-NEXT:    vpsrlq $48, %xmm1, %xmm0
+; X86-NEXT:    vmovdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
+; X86-NEXT:    vpsrld $16, %xmm2, %xmm0
+; X86-NEXT:    vmovdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
+; X86-NEXT:    vpsrld $16, %xmm1, %xmm0
+; X86-NEXT:    vmovdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
+; X86-NEXT:    vpextrw $0, %xmm1, (%esp)
+; X86-NEXT:    calll __extendhfsf2
+; X86-NEXT:    fstpt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Spill
+; X86-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
+; X86-NEXT:    vpextrw $0, %xmm0, (%esp)
+; X86-NEXT:    calll __extendhfsf2
+; X86-NEXT:    fstpt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Spill
+; X86-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
+; X86-NEXT:    vpextrw $0, %xmm0, (%esp)
+; X86-NEXT:    calll __extendhfsf2
+; X86-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
+; X86-NEXT:    vpextrw $0, %xmm0, (%esp)
+; X86-NEXT:    fstps {{[0-9]+}}(%esp)
+; X86-NEXT:    fldt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Reload
+; X86-NEXT:    fstps {{[0-9]+}}(%esp)
+; X86-NEXT:    vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; X86-NEXT:    vmovd %xmm2, %eax
+; X86-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-NEXT:    testl %eax, %eax
+; X86-NEXT:    js .LBB33_1
+; X86-NEXT:  # %bb.2:
+; X86-NEXT:    vmovdqa %xmm2, %xmm1
+; X86-NEXT:    jmp .LBB33_3
+; X86-NEXT:  .LBB33_1:
+; X86-NEXT:    vmovdqa %xmm0, %xmm1
+; X86-NEXT:    vmovdqa %xmm2, %xmm0
+; X86-NEXT:  .LBB33_3:
+; X86-NEXT:    vmaxss %xmm1, %xmm0, %xmm1
+; X86-NEXT:    vcmpordss %xmm0, %xmm0, %xmm2
+; X86-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; X86-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
+; X86-NEXT:    calll __extendhfsf2
+; X86-NEXT:    vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
+; X86-NEXT:    vmovss %xmm0, (%esp)
+; X86-NEXT:    fstps {{[0-9]+}}(%esp)
+; X86-NEXT:    fldt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Reload
+; X86-NEXT:    fstps {{[0-9]+}}(%esp)
+; X86-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X86-NEXT:    vmovd %xmm1, %eax
+; X86-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-NEXT:    testl %eax, %eax
+; X86-NEXT:    js .LBB33_4
+; X86-NEXT:  # %bb.5:
+; X86-NEXT:    vmovdqa %xmm1, %xmm2
+; X86-NEXT:    jmp .LBB33_6
+; X86-NEXT:  .LBB33_4:
+; X86-NEXT:    vmovdqa %xmm0, %xmm2
+; X86-NEXT:    vmovdqa %xmm1, %xmm0
+; X86-NEXT:  .LBB33_6:
+; X86-NEXT:    vmaxss %xmm2, %xmm0, %xmm1
+; X86-NEXT:    vcmpordss %xmm0, %xmm0, %xmm2
+; X86-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; X86-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
+; X86-NEXT:    calll __truncsfhf2
+; X86-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
+; X86-NEXT:    vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
+; X86-NEXT:    vmovss %xmm0, (%esp)
+; X86-NEXT:    calll __truncsfhf2
+; X86-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
+; X86-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
+; X86-NEXT:    vpextrw $0, %xmm0, (%esp)
+; X86-NEXT:    calll __extendhfsf2
+; X86-NEXT:    fstpt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Spill
+; X86-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
+; X86-NEXT:    vpextrw $0, %xmm0, (%esp)
+; X86-NEXT:    calll __extendhfsf2
+; X86-NEXT:    fstpt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Spill
+; X86-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
+; X86-NEXT:    vpextrw $0, %xmm0, (%esp)
+; X86-NEXT:    calll __extendhfsf2
+; X86-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
+; X86-NEXT:    vpextrw $0, %xmm0, (%esp)
+; X86-NEXT:    fstps {{[0-9]+}}(%esp)
+; X86-NEXT:    fldt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Reload
+; X86-NEXT:    fstps {{[0-9]+}}(%esp)
+; X86-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X86-NEXT:    vmovd %xmm1, %eax
+; X86-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-NEXT:    testl %eax, %eax
+; X86-NEXT:    js .LBB33_7
+; X86-NEXT:  # %bb.8:
+; X86-NEXT:    vmovdqa %xmm1, %xmm2
+; X86-NEXT:    jmp .LBB33_9
+; X86-NEXT:  .LBB33_7:
+; X86-NEXT:    vmovdqa %xmm0, %xmm2
+; X86-NEXT:    vmovdqa %xmm1, %xmm0
+; X86-NEXT:  .LBB33_9:
+; X86-NEXT:    vmaxss %xmm2, %xmm0, %xmm1
+; X86-NEXT:    vcmpordss %xmm0, %xmm0, %xmm2
+; X86-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; X86-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
+; X86-NEXT:    calll __extendhfsf2
+; X86-NEXT:    vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
+; X86-NEXT:    vmovss %xmm0, (%esp)
+; X86-NEXT:    fstps {{[0-9]+}}(%esp)
+; X86-NEXT:    fldt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Reload
+; X86-NEXT:    fstps {{[0-9]+}}(%esp)
+; X86-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X86-NEXT:    vmovd %xmm1, %eax
+; X86-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-NEXT:    testl %eax, %eax
+; X86-NEXT:    js .LBB33_10
+; X86-NEXT:  # %bb.11:
+; X86-NEXT:    vmovdqa %xmm1, %xmm2
+; X86-NEXT:    jmp .LBB33_12
+; X86-NEXT:  .LBB33_10:
+; X86-NEXT:    vmovdqa %xmm0, %xmm2
+; X86-NEXT:    vmovdqa %xmm1, %xmm0
+; X86-NEXT:  .LBB33_12:
+; X86-NEXT:    vmaxss %xmm2, %xmm0, %xmm1
+; X86-NEXT:    vcmpordss %xmm0, %xmm0, %xmm2
+; X86-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; X86-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
+; X86-NEXT:    calll __truncsfhf2
+; X86-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
+; X86-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
+; X86-NEXT:    vmovd %xmm0, (%esp)
+; X86-NEXT:    calll __truncsfhf2
+; X86-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload
+; X86-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; X86-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload
+; X86-NEXT:    vpunpcklwd {{[-0-9]+}}(%e{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
+; X86-NEXT:    # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
+; X86-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],zero,zero
+; X86-NEXT:    addl $164, %esp
+; X86-NEXT:    retl
+  %r = call <4 x half> @llvm.maximumnum.v4f16(<4 x half> %x, <4 x half> %y)
+  ret <4 x half> %r
+}
+
+define <4 x bfloat> @test_fmaximumnum_v4bf16(<4 x bfloat> %x, <4 x bfloat> %y) nounwind {
+; SSE2-LABEL: test_fmaximumnum_v4bf16:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pushq %rbp
+; SSE2-NEXT:    pushq %r15
+; SSE2-NEXT:    pushq %r14
+; SSE2-NEXT:    pushq %rbx
+; SSE2-NEXT:    subq $56, %rsp
+; SSE2-NEXT:    movdqa %xmm1, %xmm4
+; SSE2-NEXT:    movdqa %xmm0, %xmm5
+; SSE2-NEXT:    pextrw $0, %xmm1, %r14d
+; SSE2-NEXT:    pextrw $0, %xmm0, %r15d
+; SSE2-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NEXT:    psrld $16, %xmm0
+; SSE2-NEXT:    pextrw $0, %xmm0, %eax
+; SSE2-NEXT:    movdqa %xmm5, %xmm0
+; SSE2-NEXT:    psrld $16, %xmm0
+; SSE2-NEXT:    pextrw $0, %xmm0, %ecx
+; SSE2-NEXT:    shll $16, %ecx
+; SSE2-NEXT:    movd %ecx, %xmm3
+; SSE2-NEXT:    shll $16, %eax
+; SSE2-NEXT:    movd %eax, %xmm2
+; SSE2-NEXT:    testl %ecx, %ecx
+; SSE2-NEXT:    movdqa %xmm3, %xmm1
+; SSE2-NEXT:    js .LBB34_2
+; SSE2-NEXT:  # %bb.1:
+; SSE2-NEXT:    movdqa %xmm2, %xmm1
+; SSE2-NEXT:  .LBB34_2:
+; SSE2-NEXT:    movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT:    shufps {{.*#+}} xmm4 = xmm4[1,1,1,1]
+; SSE2-NEXT:    movdqa %xmm5, (%rsp) # 16-byte Spill
+; SSE2-NEXT:    shufps {{.*#+}} xmm5 = xmm5[1,1,1,1]
+; SSE2-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NEXT:    cmpordss %xmm1, %xmm0
+; SSE2-NEXT:    movaps %xmm0, %xmm6
+; SSE2-NEXT:    andps %xmm1, %xmm6
+; SSE2-NEXT:    js .LBB34_4
+; SSE2-NEXT:  # %bb.3:
+; SSE2-NEXT:    movdqa %xmm3, %xmm2
+; SSE2-NEXT:  .LBB34_4:
+; SSE2-NEXT:    pextrw $0, %xmm4, %ebp
+; SSE2-NEXT:    pextrw $0, %xmm5, %ebx
+; SSE2-NEXT:    maxss %xmm2, %xmm1
+; SSE2-NEXT:    andnps %xmm1, %xmm0
+; SSE2-NEXT:    orps %xmm6, %xmm0
+; SSE2-NEXT:    callq __truncsfbf2@PLT
+; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT:    shll $16, %r15d
+; SSE2-NEXT:    movd %r15d, %xmm3
+; SSE2-NEXT:    shll $16, %r14d
+; SSE2-NEXT:    movd %r14d, %xmm2
+; SSE2-NEXT:    testl %r15d, %r15d
+; SSE2-NEXT:    movdqa %xmm3, %xmm1
+; SSE2-NEXT:    js .LBB34_6
+; SSE2-NEXT:  # %bb.5:
+; SSE2-NEXT:    movdqa %xmm2, %xmm1
+; SSE2-NEXT:  .LBB34_6:
+; SSE2-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
+; SSE2-NEXT:    psrlq $48, %xmm5
+; SSE2-NEXT:    movdqa (%rsp), %xmm6 # 16-byte Reload
+; SSE2-NEXT:    psrlq $48, %xmm6
+; SSE2-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NEXT:    cmpordss %xmm1, %xmm0
+; SSE2-NEXT:    movaps %xmm0, %xmm4
+; SSE2-NEXT:    andps %xmm1, %xmm4
+; SSE2-NEXT:    js .LBB34_8
+; SSE2-NEXT:  # %bb.7:
+; SSE2-NEXT:    movdqa %xmm3, %xmm2
+; SSE2-NEXT:  .LBB34_8:
+; SSE2-NEXT:    pextrw $0, %xmm5, %r15d
+; SSE2-NEXT:    pextrw $0, %xmm6, %r14d
+; SSE2-NEXT:    maxss %xmm2, %xmm1
+; SSE2-NEXT:    andnps %xmm1, %xmm0
+; SSE2-NEXT:    orps %xmm4, %xmm0
+; SSE2-NEXT:    callq __truncsfbf2@PLT
+; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT:    shll $16, %ebx
+; SSE2-NEXT:    movd %ebx, %xmm1
+; SSE2-NEXT:    shll $16, %ebp
+; SSE2-NEXT:    movd %ebp, %xmm3
+; SSE2-NEXT:    testl %ebx, %ebx
+; SSE2-NEXT:    movdqa %xmm1, %xmm2
+; SSE2-NEXT:    js .LBB34_10
+; SSE2-NEXT:  # %bb.9:
+; SSE2-NEXT:    movdqa %xmm3, %xmm2
+; SSE2-NEXT:  .LBB34_10:
+; SSE2-NEXT:    movdqa %xmm2, %xmm0
+; SSE2-NEXT:    cmpordss %xmm2, %xmm0
+; SSE2-NEXT:    movaps %xmm0, %xmm4
+; SSE2-NEXT:    andps %xmm2, %xmm4
+; SSE2-NEXT:    js .LBB34_12
+; SSE2-NEXT:  # %bb.11:
+; SSE2-NEXT:    movdqa %xmm1, %xmm3
+; SSE2-NEXT:  .LBB34_12:
+; SSE2-NEXT:    maxss %xmm3, %xmm2
+; SSE2-NEXT:    andnps %xmm2, %xmm0
+; SSE2-NEXT:    orps %xmm4, %xmm0
+; SSE2-NEXT:    callq __truncsfbf2@PLT
+; SSE2-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
+; SSE2-NEXT:    shll $16, %r14d
+; SSE2-NEXT:    movd %r14d, %xmm1
+; SSE2-NEXT:    shll $16, %r15d
+; SSE2-NEXT:    movd %r15d, %xmm3
+; SSE2-NEXT:    testl %r14d, %r14d
+; SSE2-NEXT:    movdqa %xmm1, %xmm2
+; SSE2-NEXT:    js .LBB34_14
+; SSE2-NEXT:  # %bb.13:
+; SSE2-NEXT:    movdqa %xmm3, %xmm2
+; SSE2-NEXT:  .LBB34_14:
+; SSE2-NEXT:    movdqa %xmm2, %xmm0
+; SSE2-NEXT:    cmpordss %xmm2, %xmm0
+; SSE2-NEXT:    movaps %xmm0, %xmm4
+; SSE2-NEXT:    andps %xmm2, %xmm4
+; SSE2-NEXT:    js .LBB34_16
+; SSE2-NEXT:  # %bb.15:
+; SSE2-NEXT:    movdqa %xmm1, %xmm3
+; SSE2-NEXT:  .LBB34_16:
+; SSE2-NEXT:    maxss %xmm3, %xmm2
+; SSE2-NEXT:    andnps %xmm2, %xmm0
+; SSE2-NEXT:    orps %xmm4, %xmm0
+; SSE2-NEXT:    callq __truncsfbf2@PLT
+; SSE2-NEXT:    movdqa (%rsp), %xmm1 # 16-byte Reload
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; SSE2-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT:    punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; SSE2-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT:    addq $56, %rsp
+; SSE2-NEXT:    popq %rbx
+; SSE2-NEXT:    popq %r14
+; SSE2-NEXT:    popq %r15
+; SSE2-NEXT:    popq %rbp
+; SSE2-NEXT:    retq
+;
+; AVX1-LABEL: test_fmaximumnum_v4bf16:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    pushq %rbp
+; AVX1-NEXT:    pushq %r15
+; AVX1-NEXT:    pushq %r14
+; AVX1-NEXT:    pushq %r13
+; AVX1-NEXT:    pushq %r12
+; AVX1-NEXT:    pushq %rbx
+; AVX1-NEXT:    subq $56, %rsp
+; AVX1-NEXT:    vpsrlq $48, %xmm0, %xmm2
+; AVX1-NEXT:    vpsrlq $48, %xmm1, %xmm3
+; AVX1-NEXT:    vmovshdup {{.*#+}} xmm4 = xmm0[1,1,3,3]
+; AVX1-NEXT:    vpextrw $0, %xmm4, %ebx
+; AVX1-NEXT:    vmovshdup {{.*#+}} xmm4 = xmm1[1,1,3,3]
+; AVX1-NEXT:    vpextrw $0, %xmm4, %ebp
+; AVX1-NEXT:    vpextrw $0, %xmm0, %r12d
+; AVX1-NEXT:    vpextrw $0, %xmm1, %r13d
+; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm0
+; AVX1-NEXT:    vpextrw $0, %xmm0, %eax
+; AVX1-NEXT:    vpsrld $16, %xmm1, %xmm0
+; AVX1-NEXT:    vpextrw $0, %xmm0, %ecx
+; AVX1-NEXT:    shll $16, %ecx
+; AVX1-NEXT:    vmovd %ecx, %xmm0
+; AVX1-NEXT:    shll $16, %eax
+; AVX1-NEXT:    vmovd %eax, %xmm4
+; AVX1-NEXT:    js .LBB34_1
+; AVX1-NEXT:  # %bb.2:
+; AVX1-NEXT:    vmovdqa %xmm4, %xmm1
+; AVX1-NEXT:    jmp .LBB34_3
+; AVX1-NEXT:  .LBB34_1:
+; AVX1-NEXT:    vmovdqa %xmm0, %xmm1
+; AVX1-NEXT:    vmovdqa %xmm4, %xmm0
+; AVX1-NEXT:  .LBB34_3:
+; AVX1-NEXT:    vpextrw $0, %xmm2, %r14d
+; AVX1-NEXT:    vpextrw $0, %xmm3, %r15d
+; AVX1-NEXT:    vmaxss %xmm1, %xmm0, %xmm1
+; AVX1-NEXT:    vcmpordss %xmm0, %xmm0, %xmm2
+; AVX1-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; AVX1-NEXT:    callq __truncsfbf2@PLT
+; AVX1-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-NEXT:    shll $16, %r13d
+; AVX1-NEXT:    vmovd %r13d, %xmm0
+; AVX1-NEXT:    shll $16, %r12d
+; AVX1-NEXT:    vmovd %r12d, %xmm2
+; AVX1-NEXT:    js .LBB34_4
+; AVX1-NEXT:  # %bb.5:
+; AVX1-NEXT:    vmovdqa %xmm2, %xmm1
+; AVX1-NEXT:    jmp .LBB34_6
+; AVX1-NEXT:  .LBB34_4:
+; AVX1-NEXT:    vmovdqa %xmm0, %xmm1
+; AVX1-NEXT:    vmovdqa %xmm2, %xmm0
+; AVX1-NEXT:  .LBB34_6:
+; AVX1-NEXT:    vmaxss %xmm1, %xmm0, %xmm1
+; AVX1-NEXT:    vcmpordss %xmm0, %xmm0, %xmm2
+; AVX1-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; AVX1-NEXT:    callq __truncsfbf2@PLT
+; AVX1-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-NEXT:    shll $16, %ebp
+; AVX1-NEXT:    vmovd %ebp, %xmm0
+; AVX1-NEXT:    shll $16, %ebx
+; AVX1-NEXT:    vmovd %ebx, %xmm2
+; AVX1-NEXT:    js .LBB34_7
+; AVX1-NEXT:  # %bb.8:
+; AVX1-NEXT:    vmovdqa %xmm2, %xmm1
+; AVX1-NEXT:    jmp .LBB34_9
+; AVX1-NEXT:  .LBB34_7:
+; AVX1-NEXT:    vmovdqa %xmm0, %xmm1
+; AVX1-NEXT:    vmovdqa %xmm2, %xmm0
+; AVX1-NEXT:  .LBB34_9:
+; AVX1-NEXT:    vmaxss %xmm1, %xmm0, %xmm1
+; AVX1-NEXT:    vcmpordss %xmm0, %xmm0, %xmm2
+; AVX1-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; AVX1-NEXT:    callq __truncsfbf2@PLT
+; AVX1-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
+; AVX1-NEXT:    shll $16, %r15d
+; AVX1-NEXT:    vmovd %r15d, %xmm0
+; AVX1-NEXT:    shll $16, %r14d
+; AVX1-NEXT:    vmovd %r14d, %xmm2
+; AVX1-NEXT:    js .LBB34_10
+; AVX1-NEXT:  # %bb.11:
+; AVX1-NEXT:    vmovdqa %xmm2, %xmm1
+; AVX1-NEXT:    jmp .LBB34_12
+; AVX1-NEXT:  .LBB34_10:
+; AVX1-NEXT:    vmovdqa %xmm0, %xmm1
+; AVX1-NEXT:    vmovdqa %xmm2, %xmm0
+; AVX1-NEXT:  .LBB34_12:
+; AVX1-NEXT:    vmaxss %xmm1, %xmm0, %xmm1
+; AVX1-NEXT:    vcmpordss %xmm0, %xmm0, %xmm2
+; AVX1-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; AVX1-NEXT:    callq __truncsfbf2@PLT
+; AVX1-NEXT:    vmovdqa (%rsp), %xmm1 # 16-byte Reload
+; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX1-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX1-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
+; AVX1-NEXT:    # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
+; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],zero,zero
+; AVX1-NEXT:    addq $56, %rsp
+; AVX1-NEXT:    popq %rbx
+; AVX1-NEXT:    popq %r12
+; AVX1-NEXT:    popq %r13
+; AVX1-NEXT:    popq %r14
+; AVX1-NEXT:    popq %r15
+; AVX1-NEXT:    popq %rbp
+; AVX1-NEXT:    retq
+;
+; AVX512-LABEL: test_fmaximumnum_v4bf16:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    pushq %rbp
+; AVX512-NEXT:    pushq %r15
+; AVX512-NEXT:    pushq %r14
+; AVX512-NEXT:    pushq %r13
+; AVX512-NEXT:    pushq %r12
+; AVX512-NEXT:    pushq %rbx
+; AVX512-NEXT:    pushq %rax
+; AVX512-NEXT:    vmovq %xmm1, %r13
+; AVX512-NEXT:    movq %r13, %rbx
+; AVX512-NEXT:    shrq $32, %rbx
+; AVX512-NEXT:    vmovq %xmm0, %rbp
+; AVX512-NEXT:    movq %rbp, %r14
+; AVX512-NEXT:    shrq $32, %r14
+; AVX512-NEXT:    movq %r13, %r15
+; AVX512-NEXT:    shrq $48, %r15
+; AVX512-NEXT:    movq %rbp, %r12
+; AVX512-NEXT:    shrq $48, %r12
+; AVX512-NEXT:    movl %ebp, %eax
+; AVX512-NEXT:    andl $-65536, %eax # imm = 0xFFFF0000
+; AVX512-NEXT:    sets %cl
+; AVX512-NEXT:    kmovw %ecx, %k1
+; AVX512-NEXT:    movl %r13d, %ecx
+; AVX512-NEXT:    andl $-65536, %ecx # imm = 0xFFFF0000
+; AVX512-NEXT:    vmovd %ecx, %xmm1
+; AVX512-NEXT:    vmovd %eax, %xmm0
+; AVX512-NEXT:    vmovdqa %xmm0, %xmm2
+; AVX512-NEXT:    vmovss %xmm1, %xmm2, %xmm2 {%k1}
+; AVX512-NEXT:    vmovss %xmm0, %xmm1, %xmm1 {%k1}
+; AVX512-NEXT:    vmaxss %xmm2, %xmm1, %xmm0
+; AVX512-NEXT:    vcmpordss %xmm1, %xmm1, %k1
+; AVX512-NEXT:    vmovss %xmm1, %xmm0, %xmm0 {%k1}
+; AVX512-NEXT:    callq __truncsfbf2@PLT
+; AVX512-NEXT:    vpextrw $0, %xmm0, {{[0-9]+}}(%rsp)
+; AVX512-NEXT:    shll $16, %ebp
+; AVX512-NEXT:    sets %al
+; AVX512-NEXT:    kmovw %eax, %k1
+; AVX512-NEXT:    shll $16, %r13d
+; AVX512-NEXT:    vmovd %r13d, %xmm1
+; AVX512-NEXT:    vmovd %ebp, %xmm0
+; AVX512-NEXT:    vmovdqa %xmm0, %xmm2
+; AVX512-NEXT:    vmovss %xmm1, %xmm2, %xmm2 {%k1}
+; AVX512-NEXT:    vmovss %xmm0, %xmm1, %xmm1 {%k1}
+; AVX512-NEXT:    vmaxss %xmm2, %xmm1, %xmm0
+; AVX512-NEXT:    vcmpordss %xmm1, %xmm1, %k1
+; AVX512-NEXT:    vmovss %xmm1, %xmm0, %xmm0 {%k1}
+; AVX512-NEXT:    callq __truncsfbf2@PLT
+; AVX512-NEXT:    vpextrw $0, %xmm0, (%rsp)
+; AVX512-NEXT:    shll $16, %r12d
+; AVX512-NEXT:    sets %al
+; AVX512-NEXT:    kmovw %eax, %k1
+; AVX512-NEXT:    shll $16, %r15d
+; AVX512-NEXT:    vmovd %r15d, %xmm1
+; AVX512-NEXT:    vmovd %r12d, %xmm0
+; AVX512-NEXT:    vmovdqa %xmm0, %xmm2
+; AVX512-NEXT:    vmovss %xmm1, %xmm2, %xmm2 {%k1}
+; AVX512-NEXT:    vmovss %xmm0, %xmm1, %xmm1 {%k1}
+; AVX512-NEXT:    vmaxss %xmm2, %xmm1, %xmm0
+; AVX512-NEXT:    vcmpordss %xmm1, %xmm1, %k1
+; AVX512-NEXT:    vmovss %xmm1, %xmm0, %xmm0 {%k1}
+; AVX512-NEXT:    callq __truncsfbf2@PLT
+; AVX512-NEXT:    vpextrw $0, %xmm0, {{[0-9]+}}(%rsp)
+; AVX512-NEXT:    shll $16, %r14d
+; AVX512-NEXT:    sets %al
+; AVX512-NEXT:    kmovw %eax, %k1
+; AVX512-NEXT:    shll $16, %ebx
+; AVX512-NEXT:    vmovd %ebx, %xmm1
+; AVX512-NEXT:    vmovd %r14d, %xmm0
+; AVX512-NEXT:    vmovdqa %xmm0, %xmm2
+; AVX512-NEXT:    vmovss %xmm1, %xmm2, %xmm2 {%k1}
+; AVX512-NEXT:    vmovss %xmm0, %xmm1, %xmm1 {%k1}
+; AVX512-NEXT:    vmaxss %xmm2, %xmm1, %xmm0
+; AVX512-NEXT:    vcmpordss %xmm1, %xmm1, %k1
+; AVX512-NEXT:    vmovss %xmm1, %xmm0, %xmm0 {%k1}
+; AVX512-NEXT:    callq __truncsfbf2@PLT
+; AVX512-NEXT:    vpextrw $0, %xmm0, {{[0-9]+}}(%rsp)
+; AVX512-NEXT:    vmovaps (%rsp), %xmm0
+; AVX512-NEXT:    addq $8, %rsp
+; AVX512-NEXT:    popq %rbx
+; AVX512-NEXT:    popq %r12
+; AVX512-NEXT:    popq %r13
+; AVX512-NEXT:    popq %r14
+; AVX512-NEXT:    popq %r15
+; AVX512-NEXT:    popq %rbp
+; AVX512-NEXT:    retq
+;
+; AVX10_2-LABEL: test_fmaximumnum_v4bf16:
+; AVX10_2:       # %bb.0:
+; AVX10_2-NEXT:    vminmaxnepbf16 $17, %xmm1, %xmm0, %xmm0
+; AVX10_2-NEXT:    retq
+;
+; X86-LABEL: test_fmaximumnum_v4bf16:
+; X86:       # %bb.0:
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    pushl %ebx
+; X86-NEXT:    pushl %edi
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    subl $68, %esp
+; X86-NEXT:    vpsrlq $48, %xmm0, %xmm2
+; X86-NEXT:    vpsrlq $48, %xmm1, %xmm3
+; X86-NEXT:    vmovshdup {{.*#+}} xmm4 = xmm0[1,1,3,3]
+; X86-NEXT:    vpextrw $0, %xmm4, %esi
+; X86-NEXT:    vmovshdup {{.*#+}} xmm4 = xmm1[1,1,3,3]
+; X86-NEXT:    vpextrw $0, %xmm4, %ebx
+; X86-NEXT:    vpextrw $0, %xmm0, %eax
+; X86-NEXT:    vpextrw $0, %xmm1, %ecx
+; X86-NEXT:    vpsrld $16, %xmm0, %xmm0
+; X86-NEXT:    vpextrw $0, %xmm0, %edx
+; X86-NEXT:    vpsrld $16, %xmm1, %xmm0
+; X86-NEXT:    vpextrw $0, %xmm0, %edi
+; X86-NEXT:    shll $16, %edi
+; X86-NEXT:    vmovd %edi, %xmm0
+; X86-NEXT:    shll $16, %edx
+; X86-NEXT:    vmovd %edx, %xmm4
+; X86-NEXT:    js .LBB34_1
+; X86-NEXT:  # %bb.2:
+; X86-NEXT:    vmovdqa %xmm4, %xmm1
+; X86-NEXT:    jmp .LBB34_3
+; X86-NEXT:  .LBB34_1:
+; X86-NEXT:    vmovdqa %xmm0, %xmm1
+; X86-NEXT:    vmovdqa %xmm4, %xmm0
+; X86-NEXT:  .LBB34_3:
+; X86-NEXT:    vpextrw $0, %xmm2, %edi
+; X86-NEXT:    vpextrw $0, %xmm3, %ebp
+; X86-NEXT:    vmaxss %xmm1, %xmm0, %xmm1
+; X86-NEXT:    vcmpordss %xmm0, %xmm0, %xmm2
+; X86-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; X86-NEXT:    vmovss %xmm0, (%esp)
+; X86-NEXT:    shll $16, %ecx
+; X86-NEXT:    vmovd %ecx, %xmm0
+; X86-NEXT:    shll $16, %eax
+; X86-NEXT:    vmovd %eax, %xmm2
+; X86-NEXT:    js .LBB34_4
+; X86-NEXT:  # %bb.5:
+; X86-NEXT:    vmovdqa %xmm2, %xmm1
+; X86-NEXT:    jmp .LBB34_6
+; X86-NEXT:  .LBB34_4:
+; X86-NEXT:    vmovdqa %xmm0, %xmm1
+; X86-NEXT:    vmovdqa %xmm2, %xmm0
+; X86-NEXT:  .LBB34_6:
+; X86-NEXT:    vmaxss %xmm1, %xmm0, %xmm1
+; X86-NEXT:    vcmpordss %xmm0, %xmm0, %xmm2
+; X86-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; X86-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
+; X86-NEXT:    calll __truncsfbf2
+; X86-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
+; X86-NEXT:    vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
+; X86-NEXT:    vmovss %xmm0, (%esp)
+; X86-NEXT:    shll $16, %ebx
+; X86-NEXT:    vmovd %ebx, %xmm0
+; X86-NEXT:    shll $16, %esi
+; X86-NEXT:    vmovd %esi, %xmm2
+; X86-NEXT:    js .LBB34_7
+; X86-NEXT:  # %bb.8:
+; X86-NEXT:    vmovdqa %xmm2, %xmm1
+; X86-NEXT:    jmp .LBB34_9
+; X86-NEXT:  .LBB34_7:
+; X86-NEXT:    vmovdqa %xmm0, %xmm1
+; X86-NEXT:    vmovdqa %xmm2, %xmm0
+; X86-NEXT:  .LBB34_9:
+; X86-NEXT:    vmaxss %xmm1, %xmm0, %xmm1
+; X86-NEXT:    vcmpordss %xmm0, %xmm0, %xmm2
+; X86-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; X86-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
+; X86-NEXT:    calll __truncsfbf2
+; X86-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
+; X86-NEXT:    vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
+; X86-NEXT:    vmovss %xmm0, (%esp)
+; X86-NEXT:    shll $16, %ebp
+; X86-NEXT:    vmovd %ebp, %xmm0
+; X86-NEXT:    shll $16, %edi
+; X86-NEXT:    vmovd %edi, %xmm2
+; X86-NEXT:    js .LBB34_10
+; X86-NEXT:  # %bb.11:
+; X86-NEXT:    vmovdqa %xmm2, %xmm1
+; X86-NEXT:    jmp .LBB34_12
+; X86-NEXT:  .LBB34_10:
+; X86-NEXT:    vmovdqa %xmm0, %xmm1
+; X86-NEXT:    vmovdqa %xmm2, %xmm0
+; X86-NEXT:  .LBB34_12:
+; X86-NEXT:    vmaxss %xmm1, %xmm0, %xmm1
+; X86-NEXT:    vcmpordss %xmm0, %xmm0, %xmm2
+; X86-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; X86-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
+; X86-NEXT:    calll __truncsfbf2
+; X86-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
+; X86-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
+; X86-NEXT:    vmovd %xmm0, (%esp)
+; X86-NEXT:    calll __truncsfbf2
+; X86-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload
+; X86-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; X86-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload
+; X86-NEXT:    vpunpcklwd {{[-0-9]+}}(%e{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
+; X86-NEXT:    # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
+; X86-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],zero,zero
+; X86-NEXT:    addl $68, %esp
+; X86-NEXT:    popl %esi
+; X86-NEXT:    popl %edi
+; X86-NEXT:    popl %ebx
+; X86-NEXT:    popl %ebp
+; X86-NEXT:    retl
+  %r = call <4 x bfloat> @llvm.maximumnum.v4bf16(<4 x bfloat> %x, <4 x bfloat> %y)
+  ret <4 x bfloat> %r
+}
diff --git a/llvm/test/CodeGen/X86/fold-vector-shuffle-crash.ll b/llvm/test/CodeGen/X86/fold-vector-shuffle-crash.ll
index 9543238..55d9ea9 100644
--- a/llvm/test/CodeGen/X86/fold-vector-shuffle-crash.ll
+++ b/llvm/test/CodeGen/X86/fold-vector-shuffle-crash.ll
@@ -1,6 +1,6 @@
 ; RUN: llc < %s -mtriple=x86_64-unknown -mcpu=corei7
 
-define void @autogen_SD13708(i32) {
+define void @autogen_SD13708(i32, i1 %arg) {
 BB:
  %Shuff7 = shufflevector <8 x i32> zeroinitializer, <8 x i32> zeroinitializer, <8 x i32> <i32 8, i32 10, i32 12, i32 14, i32 undef, i32 2, i32 4, i32 undef>
  br label %CF
@@ -8,11 +8,11 @@ BB:
 CF:
  %Tr = trunc <8 x i64> zeroinitializer to <8 x i32>
  %Shuff20 = shufflevector <8 x i32> %Shuff7, <8 x i32> %Tr, <8 x i32> <i32 13, i32 15, i32 1, i32 3, i32 5, i32 7, i32 undef, i32 11>
- br i1 undef, label %CF, label %CF247
+ br i1 %arg, label %CF, label %CF247
 
 CF247:
  %I171 = insertelement <8 x i32> %Shuff20, i32 %0, i32 0
- br i1 undef, label %CF, label %CF247
+ br i1 %arg, label %CF, label %CF247
 }
 
 define void @autogen_SD13800(ptr, ptr, ptr, i32, i64, i8) {
diff --git a/llvm/test/CodeGen/X86/hoist-spill.ll b/llvm/test/CodeGen/X86/hoist-spill.ll
index d11b666..b51609c 100644
--- a/llvm/test/CodeGen/X86/hoist-spill.ll
+++ b/llvm/test/CodeGen/X86/hoist-spill.ll
@@ -14,7 +14,7 @@ target triple = "x86_64-unknown-linux-gnu"
 @d = external global ptr, align 8
 
 ; Function Attrs: norecurse noreturn nounwind uwtable
-define void @fn1(i32 %p1, i32 %p2, i64 %p3) {
+define void @fn1(i32 %p1, i32 %p2, i64 %p3, i1 %arg) {
 entry:
   %tmp = load ptr, ptr @d, align 8
   %tmp1 = load ptr, ptr @a, align 8
@@ -54,10 +54,10 @@ for.cond4.preheader:                              ; preds = %for.body, %for.cond
   br i1 %cmp528, label %for.inc14, label %for.body6.preheader
 
 for.body6.preheader:                              ; preds = %for.cond4.preheader
-  br i1 undef, label %for.body6, label %min.iters.checked
+  br i1 %arg, label %for.body6, label %min.iters.checked
 
 min.iters.checked:                                ; preds = %for.body6.preheader
-  br i1 undef, label %for.body6, label %vector.memcheck
+  br i1 %arg, label %for.body6, label %vector.memcheck
 
 vector.memcheck:                                  ; preds = %min.iters.checked
   %bound1 = icmp ule ptr undef, %scevgep41
@@ -85,10 +85,10 @@ vector.body:                                      ; preds = %vector.body, %vecto
   %tmp16 = getelementptr inbounds i32, ptr %tmp1, i64 %offset.idx.1
   store <4 x i32> %wide.load.1, ptr %tmp16, align 4
   %index.next.3 = add i64 %index, 32
-  br i1 undef, label %middle.block, label %vector.body
+  br i1 %arg, label %middle.block, label %vector.body
 
 middle.block:                                     ; preds = %vector.body, %vector.body.preheader.split
-  br i1 undef, label %for.inc14, label %for.body6
+  br i1 %arg, label %for.inc14, label %for.body6
 
 for.body.preheader:                               ; preds = %for.cond
   br label %for.body
@@ -98,7 +98,7 @@ for.body:                                         ; preds = %for.body, %for.body
   %add = add nsw i32 %k.127, 1
   %tmp18 = load i32, ptr undef, align 4
   store i32 %tmp18, ptr @b, align 4
-  br i1 undef, label %for.body, label %for.cond4.preheader
+  br i1 %arg, label %for.body, label %for.cond4.preheader
 
 for.body6:                                        ; preds = %for.body6, %middle.block, %vector.memcheck, %min.iters.checked, %for.body6.preheader
   %indvars.iv32 = phi i64 [ undef, %for.body6 ], [ %tmp12, %vector.memcheck ], [ %tmp12, %min.iters.checked ], [ %tmp12, %for.body6.preheader ], [ undef, %middle.block ]
diff --git a/llvm/test/CodeGen/X86/implicit-null-checks.mir b/llvm/test/CodeGen/X86/implicit-null-checks.mir
index 0077906..c98019c 100644
--- a/llvm/test/CodeGen/X86/implicit-null-checks.mir
+++ b/llvm/test/CodeGen/X86/implicit-null-checks.mir
@@ -5,15 +5,15 @@
   target triple = "x86_64-apple-macosx"
 
   ;; Positive test
-  define i32 @imp_null_check_with_bitwise_op_0(ptr %x, i32 %val) {
+  define i32 @imp_null_check_with_bitwise_op_0(ptr %x, i32 %val, i1 %arg) {
   entry:
-    br i1 undef, label %is_null, label %not_null, !make.implicit !0
+    br i1 %arg, label %is_null, label %not_null, !make.implicit !0
 
   is_null:
     ret i32 42
 
   not_null:
-    br i1 undef, label %ret_100, label %ret_200
+    br i1 %arg, label %ret_100, label %ret_200
 
   ret_100:
     ret i32 100
@@ -24,15 +24,15 @@
 
   ;; Negative test.  The regalloc is such that we cannot hoist the
   ;; instruction materializing 2200000 into $eax
-  define i32 @imp_null_check_with_bitwise_op_1(ptr %x, i32 %val, ptr %ptr) {
+  define i32 @imp_null_check_with_bitwise_op_1(ptr %x, i32 %val, ptr %ptr, i1 %arg) {
   entry:
-    br i1 undef, label %is_null, label %not_null, !make.implicit !0
+    br i1 %arg, label %is_null, label %not_null, !make.implicit !0
 
   is_null:
     ret i32 undef
 
   not_null:
-    br i1 undef, label %ret_100, label %ret_200
+    br i1 %arg, label %ret_100, label %ret_200
 
   ret_100:
     ret i32 100
@@ -43,15 +43,15 @@
 
   ;; Negative test: IR is identical to
   ;; @imp_null_check_with_bitwise_op_0 but MIR differs.
-  define i32 @imp_null_check_with_bitwise_op_2(ptr %x, i32 %val) {
+  define i32 @imp_null_check_with_bitwise_op_2(ptr %x, i32 %val, i1 %arg) {
   entry:
-    br i1 undef, label %is_null, label %not_null, !make.implicit !0
+    br i1 %arg, label %is_null, label %not_null, !make.implicit !0
 
   is_null:
     ret i32 42
 
   not_null:
-    br i1 undef, label %ret_100, label %ret_200
+    br i1 %arg, label %ret_100, label %ret_200
 
   ret_100:
     ret i32 100
@@ -62,15 +62,15 @@
 
   ;; Negative test: IR is identical to
   ;; @imp_null_check_with_bitwise_op_0 but MIR differs.
-  define i32 @imp_null_check_with_bitwise_op_3(ptr %x, i32 %val) {
+  define i32 @imp_null_check_with_bitwise_op_3(ptr %x, i32 %val, i1 %arg) {
   entry:
-    br i1 undef, label %is_null, label %not_null, !make.implicit !0
+    br i1 %arg, label %is_null, label %not_null, !make.implicit !0
 
   is_null:
     ret i32 42
 
   not_null:
-    br i1 undef, label %ret_100, label %ret_200
+    br i1 %arg, label %ret_100, label %ret_200
 
   ret_100:
     ret i32 100
@@ -80,15 +80,15 @@
   }
 
   ;; Positive test
-  define i32 @imp_null_check_with_bitwise_op_4(ptr %x, i32 %val) {
+  define i32 @imp_null_check_with_bitwise_op_4(ptr %x, i32 %val, i1 %arg) {
   entry:
-    br i1 undef, label %is_null, label %not_null, !make.implicit !0
+    br i1 %arg, label %is_null, label %not_null, !make.implicit !0
 
   is_null:
     ret i32 42
 
   not_null:
-    br i1 undef, label %ret_100, label %ret_200
+    br i1 %arg, label %ret_100, label %ret_200
 
   ret_100:
     ret i32 100
diff --git a/llvm/test/CodeGen/X86/interval-update-remat.ll b/llvm/test/CodeGen/X86/interval-update-remat.ll
index 44d3db3..91fde2b 100644
--- a/llvm/test/CodeGen/X86/interval-update-remat.ll
+++ b/llvm/test/CodeGen/X86/interval-update-remat.ll
@@ -17,13 +17,13 @@ target triple = "i386-unknown-linux-gnu"
 @f = external global i16, align 2
 @.str = external unnamed_addr constant [12 x i8], align 1
 
-define void @fn1() {
+define void @fn1(i1 %arg) {
 entry:
   %tmp = load i64, ptr @b, align 8
   %or = or i64 0, 3299921317
   %and = and i64 %or, %tmp
   %tmp1 = load i32, ptr @d, align 4
-  br i1 undef, label %lor.rhs, label %lor.end
+  br i1 %arg, label %lor.rhs, label %lor.end
 
 lor.rhs:                                          ; preds = %entry
   %tobool3 = icmp ne i8 undef, 0
@@ -32,7 +32,7 @@ lor.rhs:                                          ; preds = %entry
 lor.end:                                          ; preds = %lor.rhs, %entry
   %lor.ext = zext i1 undef to i32
   %tmp2 = load i64, ptr @e, align 8
-  br i1 undef, label %lor.rhs5, label %lor.end7
+  br i1 %arg, label %lor.rhs5, label %lor.end7
 
 lor.rhs5:                                         ; preds = %lor.end
   br label %lor.end7
diff --git a/llvm/test/CodeGen/X86/jump_sign.ll b/llvm/test/CodeGen/X86/jump_sign.ll
index 9eaa654..6dc0427 100644
--- a/llvm/test/CodeGen/X86/jump_sign.ll
+++ b/llvm/test/CodeGen/X86/jump_sign.ll
@@ -249,16 +249,16 @@ define void @func_o() nounwind uwtable {
 ; CHECK-NEXT:  .LBB12_7: # %if.else.i97
 entry:
   %0 = load i16, ptr undef, align 2
-  br i1 undef, label %if.then.i, label %if.end.i
+  br i1 poison, label %if.then.i, label %if.end.i
 
 if.then.i:                                        ; preds = %entry
   unreachable
 
 if.end.i:                                         ; preds = %entry
-  br i1 undef, label %sw.bb, label %sw.default
+  br i1 poison, label %sw.bb, label %sw.default
 
 sw.bb:                                            ; preds = %if.end.i
-  br i1 undef, label %if.then44, label %if.end29
+  br i1 poison, label %if.then44, label %if.end29
 
 if.end29:                                         ; preds = %sw.bb
   %1 = urem i16 %0, 10
@@ -267,7 +267,7 @@ if.end29:                                         ; preds = %sw.bb
   br i1 %cmp25, label %if.then44, label %sw.default
 
 sw.default:                                       ; preds = %if.end29, %if.end.i
-  br i1 undef, label %if.then.i96, label %if.else.i97
+  br i1 poison, label %if.then.i96, label %if.else.i97
 
 if.then.i96:                                      ; preds = %sw.default
   unreachable
@@ -277,7 +277,7 @@ if.else.i97:                                      ; preds = %sw.default
 
 if.then44:                                        ; preds = %if.end29, %sw.bb
   %aModeRefSel.1.ph = phi i16 [ %., %if.end29 ], [ 3, %sw.bb ]
-  br i1 undef, label %if.then.i103, label %if.else.i104
+  br i1 poison, label %if.then.i103, label %if.else.i104
 
 if.then.i103:                                     ; preds = %if.then44
   unreachable
@@ -420,4 +420,3 @@ if.end:
 }
 
 !1 = !{!"branch_weights", i32 2, i32 1}
-
diff --git a/llvm/test/CodeGen/X86/loop-strength-reduce-crash.ll b/llvm/test/CodeGen/X86/loop-strength-reduce-crash.ll
index a004333..9cd7551 100644
--- a/llvm/test/CodeGen/X86/loop-strength-reduce-crash.ll
+++ b/llvm/test/CodeGen/X86/loop-strength-reduce-crash.ll
@@ -7,7 +7,7 @@
 target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-apple-macosx10.12.0"
 
-define void @foo() {
+define void @foo(i1 %arg) {
 entry:
   br label %for
 
@@ -17,7 +17,7 @@ for:
   store i32 %next, ptr undef, align 4
   %add = add i64 %0, 9223372036854775807
   %inc = add nsw i32 %next, 1
-  br i1 undef, label %exit, label %for
+  br i1 %arg, label %exit, label %for
 
 exit:
   store i64 %add, ptr undef
diff --git a/llvm/test/CodeGen/X86/lsr-crash-empty-uses.ll b/llvm/test/CodeGen/X86/lsr-crash-empty-uses.ll
index 552999f..cf43441 100644
--- a/llvm/test/CodeGen/X86/lsr-crash-empty-uses.ll
+++ b/llvm/test/CodeGen/X86/lsr-crash-empty-uses.ll
@@ -3,7 +3,7 @@ target datalayout = "e-m:e-p:32:32-i64:64-n32-S128"
 target triple = "x86_64-unknown-linux-gnu"
 
 ; CHECK-LABEL: @hoge
-define void @hoge() {
+define void @hoge(i1 %arg) {
 bb:
   %tmp = sext i32 undef to i64
   %tmp3 = sub nsw i64 0, %tmp
@@ -21,7 +21,7 @@ bb7:                                              ; preds = %bb7, %bb4
   br i1 true, label %bb11, label %bb7
 
 bb11:                                             ; preds = %bb7
-  br i1 undef, label %bb20, label %bb12
+  br i1 %arg, label %bb20, label %bb12
 
 bb12:                                             ; preds = %bb11
   br label %bb13
diff --git a/llvm/test/CodeGen/X86/lsr-delayed-fold.ll b/llvm/test/CodeGen/X86/lsr-delayed-fold.ll
index efa9331..a35015d 100644
--- a/llvm/test/CodeGen/X86/lsr-delayed-fold.ll
+++ b/llvm/test/CodeGen/X86/lsr-delayed-fold.ll
@@ -30,7 +30,7 @@ bb24:                                             ; preds = %bb21, %bb11
 ; ScalarEvolution should be able to correctly expand the crazy addrec here.
 ; PR6914
 
-define void @int323() nounwind {
+define void @int323(i1 %arg) nounwind {
 entry:
   br label %for.cond
 
@@ -38,7 +38,7 @@ for.cond:                                         ; preds = %lbl_264, %for.inc,
   %g_263.tmp.1 = phi i8 [ undef, %entry ], [ %g_263.tmp.1, %for.cond ]
   %p_95.addr.0 = phi i8 [ 0, %entry ], [ %add, %for.cond ]
   %add = add i8 %p_95.addr.0, 1                   ; <i8> [#uses=1]
-  br i1 undef, label %for.cond, label %lbl_264
+  br i1 %arg, label %for.cond, label %lbl_264
 
 lbl_264:                                          ; preds = %if.end, %lbl_264.preheader
   %g_263.tmp.0 = phi i8 [ %g_263.tmp.1, %for.cond ] ; <i8> [#uses=1]
@@ -56,13 +56,13 @@ lbl_264:                                          ; preds = %if.end, %lbl_264.pr
 
 %struct.Bu = type { i32, i32, i32 }
 
-define void @_Z3fooP2Bui(ptr nocapture %bu) {
+define void @_Z3fooP2Bui(ptr nocapture %bu, i1 %arg) {
 entry:
   br label %for.body
 
 for.body:                                         ; preds = %for.inc131, %entry
   %indvar = phi i64 [ %indvar.next, %for.inc131 ], [ 0, %entry ] ; <i64> [#uses=3]
-  br i1 undef, label %for.inc131, label %lor.lhs.false
+  br i1 %arg, label %for.inc131, label %lor.lhs.false
 
 lor.lhs.false:                                    ; preds = %for.body
   %tmp15 = add i64 %indvar, 1                     ; <i64> [#uses=1]
@@ -123,11 +123,11 @@ for.body123:                                      ; preds = %for.body123, %lor.l
   %add129 = add i32 %mul, %j.03                   ; <i32> [#uses=1]
   tail call void undef(i32 %add129)
   %inc = add nsw i32 %j.03, 1                     ; <i32> [#uses=1]
-  br i1 undef, label %for.inc131, label %for.body123
+  br i1 %arg, label %for.inc131, label %for.body123
 
 for.inc131:                                       ; preds = %for.body123, %for.body
   %indvar.next = add i64 %indvar, 1               ; <i64> [#uses=1]
-  br i1 undef, label %for.end134, label %for.body
+  br i1 %arg, label %for.end134, label %for.body
 
 for.end134:                                       ; preds = %for.inc131
   ret void
@@ -138,14 +138,14 @@ for.end134:                                       ; preds = %for.inc131
 ; require insert point adjustment.
 ; PR7306
 
-define fastcc i32 @GetOptimum() nounwind {
+define fastcc i32 @GetOptimum(i1 %arg) nounwind {
 bb:
   br label %bb1
 
 bb1:                                              ; preds = %bb1, %bb
   %t = phi i32 [ 0, %bb ], [ %t2, %bb1 ]      ; <i32> [#uses=1]
   %t2 = add i32 %t, undef                     ; <i32> [#uses=3]
-  br i1 undef, label %bb1, label %bb3
+  br i1 %arg, label %bb1, label %bb3
 
 bb3:                                              ; preds = %bb1
   %t4 = add i32 undef, -1                       ; <i32> [#uses=1]
@@ -155,13 +155,13 @@ bb5:                                              ; preds = %bb16, %bb3
   %t6 = phi i32 [ %t17, %bb16 ], [ 0, %bb3 ]  ; <i32> [#uses=3]
   %t7 = add i32 undef, %t6                    ; <i32> [#uses=2]
   %t8 = add i32 %t4, %t6                    ; <i32> [#uses=1]
-  br i1 undef, label %bb9, label %bb10
+  br i1 %arg, label %bb9, label %bb10
 
 bb9:                                              ; preds = %bb5
   br label %bb10
 
 bb10:                                             ; preds = %bb9, %bb5
-  br i1 undef, label %bb11, label %bb16
+  br i1 %arg, label %bb11, label %bb16
 
 bb11:                                             ; preds = %bb10
   %t12 = icmp ugt i32 %t7, %t2              ; <i1> [#uses=1]
diff --git a/llvm/test/CodeGen/X86/machine-trace-metrics-crash.ll b/llvm/test/CodeGen/X86/machine-trace-metrics-crash.ll
index 5828f06..41eae3c 100644
--- a/llvm/test/CodeGen/X86/machine-trace-metrics-crash.ll
+++ b/llvm/test/CodeGen/X86/machine-trace-metrics-crash.ll
@@ -52,7 +52,7 @@ define void @PR24199(i32 %a0) {
 entry:
   %i = alloca %struct.A, align 8
   %tobool = icmp ne i32 %a0, 0
-  br i1 undef, label %if.end, label %if.then
+  br i1 poison, label %if.end, label %if.then
 
 if.then:
   br label %if.end
@@ -96,5 +96,3 @@ declare void @llvm.dbg.value(metadata, i64, metadata, metadata)
 !4 = !DIExpression()
 !5 = !DILocalVariable(name: "this", arg: 1, scope: !3, flags: DIFlagArtificial | DIFlagObjectPointer)
 !6 = !DILocation(line: 0, scope: !3)
-
-
diff --git a/llvm/test/CodeGen/X86/memcmp-more-load-pairs-x32.ll b/llvm/test/CodeGen/X86/memcmp-more-load-pairs-x32.ll
index ee5fd78..62935f7 100644
--- a/llvm/test/CodeGen/X86/memcmp-more-load-pairs-x32.ll
+++ b/llvm/test/CodeGen/X86/memcmp-more-load-pairs-x32.ll
@@ -193,13 +193,13 @@ define i32 @length4(ptr %X, ptr %Y) nounwind {
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl (%ecx), %ecx
-; X86-NEXT:    movl (%eax), %edx
+; X86-NEXT:    movl (%eax), %eax
 ; X86-NEXT:    bswapl %ecx
-; X86-NEXT:    bswapl %edx
-; X86-NEXT:    xorl %eax, %eax
-; X86-NEXT:    cmpl %edx, %ecx
+; X86-NEXT:    bswapl %eax
+; X86-NEXT:    cmpl %eax, %ecx
 ; X86-NEXT:    seta %al
-; X86-NEXT:    sbbl $0, %eax
+; X86-NEXT:    sbbb $0, %al
+; X86-NEXT:    movsbl %al, %eax
 ; X86-NEXT:    retl
   %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 4) nounwind
   ret i32 %m
diff --git a/llvm/test/CodeGen/X86/memcmp-more-load-pairs.ll b/llvm/test/CodeGen/X86/memcmp-more-load-pairs.ll
index a46f9ed..9bbd335 100644
--- a/llvm/test/CodeGen/X86/memcmp-more-load-pairs.ll
+++ b/llvm/test/CodeGen/X86/memcmp-more-load-pairs.ll
@@ -179,14 +179,14 @@ define i1 @length3_eq(ptr %X, ptr %Y) nounwind {
 define i32 @length4(ptr %X, ptr %Y) nounwind {
 ; X64-LABEL: length4:
 ; X64:       # %bb.0:
-; X64-NEXT:    movl (%rdi), %ecx
-; X64-NEXT:    movl (%rsi), %edx
+; X64-NEXT:    movl (%rdi), %eax
+; X64-NEXT:    movl (%rsi), %ecx
+; X64-NEXT:    bswapl %eax
 ; X64-NEXT:    bswapl %ecx
-; X64-NEXT:    bswapl %edx
-; X64-NEXT:    xorl %eax, %eax
-; X64-NEXT:    cmpl %edx, %ecx
+; X64-NEXT:    cmpl %ecx, %eax
 ; X64-NEXT:    seta %al
-; X64-NEXT:    sbbl $0, %eax
+; X64-NEXT:    sbbb $0, %al
+; X64-NEXT:    movsbl %al, %eax
 ; X64-NEXT:    retq
   %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 4) nounwind
   ret i32 %m
@@ -391,14 +391,14 @@ define i1 @length7_lt(ptr %X, ptr %Y) nounwind {
 define i32 @length8(ptr %X, ptr %Y) nounwind {
 ; X64-LABEL: length8:
 ; X64:       # %bb.0:
-; X64-NEXT:    movq (%rdi), %rcx
-; X64-NEXT:    movq (%rsi), %rdx
+; X64-NEXT:    movq (%rdi), %rax
+; X64-NEXT:    movq (%rsi), %rcx
+; X64-NEXT:    bswapq %rax
 ; X64-NEXT:    bswapq %rcx
-; X64-NEXT:    bswapq %rdx
-; X64-NEXT:    xorl %eax, %eax
-; X64-NEXT:    cmpq %rdx, %rcx
+; X64-NEXT:    cmpq %rcx, %rax
 ; X64-NEXT:    seta %al
-; X64-NEXT:    sbbl $0, %eax
+; X64-NEXT:    sbbb $0, %al
+; X64-NEXT:    movsbl %al, %eax
 ; X64-NEXT:    retq
   %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 8) nounwind
   ret i32 %m
diff --git a/llvm/test/CodeGen/X86/memcmp-optsize-x32.ll b/llvm/test/CodeGen/X86/memcmp-optsize-x32.ll
index 4a9643c..3a16ab6 100644
--- a/llvm/test/CodeGen/X86/memcmp-optsize-x32.ll
+++ b/llvm/test/CodeGen/X86/memcmp-optsize-x32.ll
@@ -122,13 +122,13 @@ define i32 @length4(ptr %X, ptr %Y) nounwind optsize {
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl (%ecx), %ecx
-; X86-NEXT:    movl (%eax), %edx
+; X86-NEXT:    movl (%eax), %eax
 ; X86-NEXT:    bswapl %ecx
-; X86-NEXT:    bswapl %edx
-; X86-NEXT:    xorl %eax, %eax
-; X86-NEXT:    cmpl %edx, %ecx
+; X86-NEXT:    bswapl %eax
+; X86-NEXT:    cmpl %eax, %ecx
 ; X86-NEXT:    seta %al
-; X86-NEXT:    sbbl $0, %eax
+; X86-NEXT:    sbbb $0, %al
+; X86-NEXT:    movsbl %al, %eax
 ; X86-NEXT:    retl
   %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 4) nounwind
   ret i32 %m
diff --git a/llvm/test/CodeGen/X86/memcmp-optsize.ll b/llvm/test/CodeGen/X86/memcmp-optsize.ll
index 4e27301..0f817b2 100644
--- a/llvm/test/CodeGen/X86/memcmp-optsize.ll
+++ b/llvm/test/CodeGen/X86/memcmp-optsize.ll
@@ -107,14 +107,14 @@ define i1 @length3_eq(ptr %X, ptr %Y) nounwind optsize {
 define i32 @length4(ptr %X, ptr %Y) nounwind optsize {
 ; X64-LABEL: length4:
 ; X64:       # %bb.0:
-; X64-NEXT:    movl (%rdi), %ecx
-; X64-NEXT:    movl (%rsi), %edx
+; X64-NEXT:    movl (%rdi), %eax
+; X64-NEXT:    movl (%rsi), %ecx
+; X64-NEXT:    bswapl %eax
 ; X64-NEXT:    bswapl %ecx
-; X64-NEXT:    bswapl %edx
-; X64-NEXT:    xorl %eax, %eax
-; X64-NEXT:    cmpl %edx, %ecx
+; X64-NEXT:    cmpl %ecx, %eax
 ; X64-NEXT:    seta %al
-; X64-NEXT:    sbbl $0, %eax
+; X64-NEXT:    sbbb $0, %al
+; X64-NEXT:    movsbl %al, %eax
 ; X64-NEXT:    retq
   %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 4) nounwind
   ret i32 %m
@@ -186,14 +186,14 @@ define i1 @length5_eq(ptr %X, ptr %Y) nounwind optsize {
 define i32 @length8(ptr %X, ptr %Y) nounwind optsize {
 ; X64-LABEL: length8:
 ; X64:       # %bb.0:
-; X64-NEXT:    movq (%rdi), %rcx
-; X64-NEXT:    movq (%rsi), %rdx
+; X64-NEXT:    movq (%rdi), %rax
+; X64-NEXT:    movq (%rsi), %rcx
+; X64-NEXT:    bswapq %rax
 ; X64-NEXT:    bswapq %rcx
-; X64-NEXT:    bswapq %rdx
-; X64-NEXT:    xorl %eax, %eax
-; X64-NEXT:    cmpq %rdx, %rcx
+; X64-NEXT:    cmpq %rcx, %rax
 ; X64-NEXT:    seta %al
-; X64-NEXT:    sbbl $0, %eax
+; X64-NEXT:    sbbb $0, %al
+; X64-NEXT:    movsbl %al, %eax
 ; X64-NEXT:    retq
   %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 8) nounwind
   ret i32 %m
diff --git a/llvm/test/CodeGen/X86/memcmp-pgso-x32.ll b/llvm/test/CodeGen/X86/memcmp-pgso-x32.ll
index bdb50f5..35fd373 100644
--- a/llvm/test/CodeGen/X86/memcmp-pgso-x32.ll
+++ b/llvm/test/CodeGen/X86/memcmp-pgso-x32.ll
@@ -122,13 +122,13 @@ define i32 @length4(ptr %X, ptr %Y) nounwind !prof !14 {
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl (%ecx), %ecx
-; X86-NEXT:    movl (%eax), %edx
+; X86-NEXT:    movl (%eax), %eax
 ; X86-NEXT:    bswapl %ecx
-; X86-NEXT:    bswapl %edx
-; X86-NEXT:    xorl %eax, %eax
-; X86-NEXT:    cmpl %edx, %ecx
+; X86-NEXT:    bswapl %eax
+; X86-NEXT:    cmpl %eax, %ecx
 ; X86-NEXT:    seta %al
-; X86-NEXT:    sbbl $0, %eax
+; X86-NEXT:    sbbb $0, %al
+; X86-NEXT:    movsbl %al, %eax
 ; X86-NEXT:    retl
   %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 4) nounwind
   ret i32 %m
diff --git a/llvm/test/CodeGen/X86/memcmp-pgso.ll b/llvm/test/CodeGen/X86/memcmp-pgso.ll
index 9347e54..f638852 100644
--- a/llvm/test/CodeGen/X86/memcmp-pgso.ll
+++ b/llvm/test/CodeGen/X86/memcmp-pgso.ll
@@ -107,14 +107,14 @@ define i1 @length3_eq(ptr %X, ptr %Y) nounwind !prof !14 {
 define i32 @length4(ptr %X, ptr %Y) nounwind !prof !14 {
 ; X64-LABEL: length4:
 ; X64:       # %bb.0:
-; X64-NEXT:    movl (%rdi), %ecx
-; X64-NEXT:    movl (%rsi), %edx
+; X64-NEXT:    movl (%rdi), %eax
+; X64-NEXT:    movl (%rsi), %ecx
+; X64-NEXT:    bswapl %eax
 ; X64-NEXT:    bswapl %ecx
-; X64-NEXT:    bswapl %edx
-; X64-NEXT:    xorl %eax, %eax
-; X64-NEXT:    cmpl %edx, %ecx
+; X64-NEXT:    cmpl %ecx, %eax
 ; X64-NEXT:    seta %al
-; X64-NEXT:    sbbl $0, %eax
+; X64-NEXT:    sbbb $0, %al
+; X64-NEXT:    movsbl %al, %eax
 ; X64-NEXT:    retq
   %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 4) nounwind
   ret i32 %m
@@ -186,14 +186,14 @@ define i1 @length5_eq(ptr %X, ptr %Y) nounwind !prof !14 {
 define i32 @length8(ptr %X, ptr %Y) nounwind !prof !14 {
 ; X64-LABEL: length8:
 ; X64:       # %bb.0:
-; X64-NEXT:    movq (%rdi), %rcx
-; X64-NEXT:    movq (%rsi), %rdx
+; X64-NEXT:    movq (%rdi), %rax
+; X64-NEXT:    movq (%rsi), %rcx
+; X64-NEXT:    bswapq %rax
 ; X64-NEXT:    bswapq %rcx
-; X64-NEXT:    bswapq %rdx
-; X64-NEXT:    xorl %eax, %eax
-; X64-NEXT:    cmpq %rdx, %rcx
+; X64-NEXT:    cmpq %rcx, %rax
 ; X64-NEXT:    seta %al
-; X64-NEXT:    sbbl $0, %eax
+; X64-NEXT:    sbbb $0, %al
+; X64-NEXT:    movsbl %al, %eax
 ; X64-NEXT:    retq
   %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 8) nounwind
   ret i32 %m
diff --git a/llvm/test/CodeGen/X86/memcmp-x32.ll b/llvm/test/CodeGen/X86/memcmp-x32.ll
index ad9f2a3..4a3f5a6 100644
--- a/llvm/test/CodeGen/X86/memcmp-x32.ll
+++ b/llvm/test/CodeGen/X86/memcmp-x32.ll
@@ -221,13 +221,13 @@ define i32 @length4(ptr %X, ptr %Y) nounwind {
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl (%ecx), %ecx
-; X86-NEXT:    movl (%eax), %edx
+; X86-NEXT:    movl (%eax), %eax
 ; X86-NEXT:    bswapl %ecx
-; X86-NEXT:    bswapl %edx
-; X86-NEXT:    xorl %eax, %eax
-; X86-NEXT:    cmpl %edx, %ecx
+; X86-NEXT:    bswapl %eax
+; X86-NEXT:    cmpl %eax, %ecx
 ; X86-NEXT:    seta %al
-; X86-NEXT:    sbbl $0, %eax
+; X86-NEXT:    sbbb $0, %al
+; X86-NEXT:    movsbl %al, %eax
 ; X86-NEXT:    retl
   %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 4) nounwind
   ret i32 %m
diff --git a/llvm/test/CodeGen/X86/memcmp.ll b/llvm/test/CodeGen/X86/memcmp.ll
index 8fe1a58..bb089e5 100644
--- a/llvm/test/CodeGen/X86/memcmp.ll
+++ b/llvm/test/CodeGen/X86/memcmp.ll
@@ -205,14 +205,14 @@ define i1 @length3_eq(ptr %X, ptr %Y) nounwind {
 define i32 @length4(ptr %X, ptr %Y) nounwind {
 ; X64-LABEL: length4:
 ; X64:       # %bb.0:
-; X64-NEXT:    movl (%rdi), %ecx
-; X64-NEXT:    movl (%rsi), %edx
+; X64-NEXT:    movl (%rdi), %eax
+; X64-NEXT:    movl (%rsi), %ecx
+; X64-NEXT:    bswapl %eax
 ; X64-NEXT:    bswapl %ecx
-; X64-NEXT:    bswapl %edx
-; X64-NEXT:    xorl %eax, %eax
-; X64-NEXT:    cmpl %edx, %ecx
+; X64-NEXT:    cmpl %ecx, %eax
 ; X64-NEXT:    seta %al
-; X64-NEXT:    sbbl $0, %eax
+; X64-NEXT:    sbbb $0, %al
+; X64-NEXT:    movsbl %al, %eax
 ; X64-NEXT:    retq
   %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 4) nounwind
   ret i32 %m
@@ -260,6 +260,36 @@ define i1 @length4_gt(ptr %X, ptr %Y) nounwind {
   ret i1 %c
 }
 
+define i1 @length4_le(ptr %X, ptr %Y) nounwind {
+; X64-LABEL: length4_le:
+; X64:       # %bb.0:
+; X64-NEXT:    movl (%rdi), %eax
+; X64-NEXT:    movl (%rsi), %ecx
+; X64-NEXT:    bswapl %eax
+; X64-NEXT:    bswapl %ecx
+; X64-NEXT:    cmpl %ecx, %eax
+; X64-NEXT:    setbe %al
+; X64-NEXT:    retq
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 4) nounwind
+  %c = icmp slt i32 %m, 1
+  ret i1 %c
+}
+
+define i1 @length4_ge(ptr %X, ptr %Y) nounwind {
+; X64-LABEL: length4_ge:
+; X64:       # %bb.0:
+; X64-NEXT:    movl (%rdi), %eax
+; X64-NEXT:    movl (%rsi), %ecx
+; X64-NEXT:    bswapl %eax
+; X64-NEXT:    bswapl %ecx
+; X64-NEXT:    cmpl %ecx, %eax
+; X64-NEXT:    setae %al
+; X64-NEXT:    retq
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 4) nounwind
+  %c = icmp sgt i32 %m, -1
+  ret i1 %c
+}
+
 define i1 @length4_eq_const(ptr %X) nounwind {
 ; X64-LABEL: length4_eq_const:
 ; X64:       # %bb.0:
@@ -279,13 +309,13 @@ define i32 @length5(ptr %X, ptr %Y) nounwind {
 ; X64-NEXT:    bswapl %ecx
 ; X64-NEXT:    bswapl %edx
 ; X64-NEXT:    cmpl %edx, %ecx
-; X64-NEXT:    jne .LBB18_3
+; X64-NEXT:    jne .LBB20_3
 ; X64-NEXT:  # %bb.1: # %loadbb1
 ; X64-NEXT:    movzbl 4(%rdi), %eax
 ; X64-NEXT:    movzbl 4(%rsi), %ecx
 ; X64-NEXT:    subl %ecx, %eax
 ; X64-NEXT:    retq
-; X64-NEXT:  .LBB18_3: # %res_block
+; X64-NEXT:  .LBB20_3: # %res_block
 ; X64-NEXT:    xorl %eax, %eax
 ; X64-NEXT:    cmpl %edx, %ecx
 ; X64-NEXT:    sbbl %eax, %eax
@@ -319,7 +349,7 @@ define i1 @length5_lt(ptr %X, ptr %Y) nounwind {
 ; X64-NEXT:    bswapl %ecx
 ; X64-NEXT:    bswapl %edx
 ; X64-NEXT:    cmpl %edx, %ecx
-; X64-NEXT:    jne .LBB20_3
+; X64-NEXT:    jne .LBB22_3
 ; X64-NEXT:  # %bb.1: # %loadbb1
 ; X64-NEXT:    movzbl 4(%rdi), %eax
 ; X64-NEXT:    movzbl 4(%rsi), %ecx
@@ -327,7 +357,7 @@ define i1 @length5_lt(ptr %X, ptr %Y) nounwind {
 ; X64-NEXT:    shrl $31, %eax
 ; X64-NEXT:    # kill: def $al killed $al killed $eax
 ; X64-NEXT:    retq
-; X64-NEXT:  .LBB20_3: # %res_block
+; X64-NEXT:  .LBB22_3: # %res_block
 ; X64-NEXT:    xorl %eax, %eax
 ; X64-NEXT:    cmpl %edx, %ecx
 ; X64-NEXT:    sbbl %eax, %eax
@@ -348,7 +378,7 @@ define i32 @length7(ptr %X, ptr %Y) nounwind {
 ; X64-NEXT:    bswapl %ecx
 ; X64-NEXT:    bswapl %edx
 ; X64-NEXT:    cmpl %edx, %ecx
-; X64-NEXT:    jne .LBB21_2
+; X64-NEXT:    jne .LBB23_2
 ; X64-NEXT:  # %bb.1: # %loadbb1
 ; X64-NEXT:    movl 3(%rdi), %ecx
 ; X64-NEXT:    movl 3(%rsi), %edx
@@ -356,13 +386,13 @@ define i32 @length7(ptr %X, ptr %Y) nounwind {
 ; X64-NEXT:    bswapl %edx
 ; X64-NEXT:    xorl %eax, %eax
 ; X64-NEXT:    cmpl %edx, %ecx
-; X64-NEXT:    je .LBB21_3
-; X64-NEXT:  .LBB21_2: # %res_block
+; X64-NEXT:    je .LBB23_3
+; X64-NEXT:  .LBB23_2: # %res_block
 ; X64-NEXT:    xorl %eax, %eax
 ; X64-NEXT:    cmpl %edx, %ecx
 ; X64-NEXT:    sbbl %eax, %eax
 ; X64-NEXT:    orl $1, %eax
-; X64-NEXT:  .LBB21_3: # %endblock
+; X64-NEXT:  .LBB23_3: # %endblock
 ; X64-NEXT:    retq
   %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 7) nounwind
   ret i32 %m
@@ -376,7 +406,7 @@ define i1 @length7_lt(ptr %X, ptr %Y) nounwind {
 ; X64-NEXT:    bswapl %ecx
 ; X64-NEXT:    bswapl %edx
 ; X64-NEXT:    cmpl %edx, %ecx
-; X64-NEXT:    jne .LBB22_2
+; X64-NEXT:    jne .LBB24_2
 ; X64-NEXT:  # %bb.1: # %loadbb1
 ; X64-NEXT:    movl 3(%rdi), %ecx
 ; X64-NEXT:    movl 3(%rsi), %edx
@@ -384,13 +414,13 @@ define i1 @length7_lt(ptr %X, ptr %Y) nounwind {
 ; X64-NEXT:    bswapl %edx
 ; X64-NEXT:    xorl %eax, %eax
 ; X64-NEXT:    cmpl %edx, %ecx
-; X64-NEXT:    je .LBB22_3
-; X64-NEXT:  .LBB22_2: # %res_block
+; X64-NEXT:    je .LBB24_3
+; X64-NEXT:  .LBB24_2: # %res_block
 ; X64-NEXT:    xorl %eax, %eax
 ; X64-NEXT:    cmpl %edx, %ecx
 ; X64-NEXT:    sbbl %eax, %eax
 ; X64-NEXT:    orl $1, %eax
-; X64-NEXT:  .LBB22_3: # %endblock
+; X64-NEXT:  .LBB24_3: # %endblock
 ; X64-NEXT:    shrl $31, %eax
 ; X64-NEXT:    # kill: def $al killed $al killed $eax
 ; X64-NEXT:    retq
@@ -417,14 +447,14 @@ define i1 @length7_eq(ptr %X, ptr %Y) nounwind {
 define i32 @length8(ptr %X, ptr %Y) nounwind {
 ; X64-LABEL: length8:
 ; X64:       # %bb.0:
-; X64-NEXT:    movq (%rdi), %rcx
-; X64-NEXT:    movq (%rsi), %rdx
+; X64-NEXT:    movq (%rdi), %rax
+; X64-NEXT:    movq (%rsi), %rcx
+; X64-NEXT:    bswapq %rax
 ; X64-NEXT:    bswapq %rcx
-; X64-NEXT:    bswapq %rdx
-; X64-NEXT:    xorl %eax, %eax
-; X64-NEXT:    cmpq %rdx, %rcx
+; X64-NEXT:    cmpq %rcx, %rax
 ; X64-NEXT:    seta %al
-; X64-NEXT:    sbbl $0, %eax
+; X64-NEXT:    sbbb $0, %al
+; X64-NEXT:    movsbl %al, %eax
 ; X64-NEXT:    retq
   %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 8) nounwind
   ret i32 %m
@@ -524,7 +554,7 @@ define i32 @length12(ptr %X, ptr %Y) nounwind {
 ; X64-NEXT:    bswapq %rcx
 ; X64-NEXT:    bswapq %rdx
 ; X64-NEXT:    cmpq %rdx, %rcx
-; X64-NEXT:    jne .LBB31_2
+; X64-NEXT:    jne .LBB33_2
 ; X64-NEXT:  # %bb.1: # %loadbb1
 ; X64-NEXT:    movl 8(%rdi), %ecx
 ; X64-NEXT:    movl 8(%rsi), %edx
@@ -532,13 +562,13 @@ define i32 @length12(ptr %X, ptr %Y) nounwind {
 ; X64-NEXT:    bswapl %edx
 ; X64-NEXT:    xorl %eax, %eax
 ; X64-NEXT:    cmpq %rdx, %rcx
-; X64-NEXT:    je .LBB31_3
-; X64-NEXT:  .LBB31_2: # %res_block
+; X64-NEXT:    je .LBB33_3
+; X64-NEXT:  .LBB33_2: # %res_block
 ; X64-NEXT:    xorl %eax, %eax
 ; X64-NEXT:    cmpq %rdx, %rcx
 ; X64-NEXT:    sbbl %eax, %eax
 ; X64-NEXT:    orl $1, %eax
-; X64-NEXT:  .LBB31_3: # %endblock
+; X64-NEXT:  .LBB33_3: # %endblock
 ; X64-NEXT:    retq
   %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 12) nounwind
   ret i32 %m
@@ -582,7 +612,7 @@ define i32 @length15(ptr %X, ptr %Y) nounwind {
 ; X64-NEXT:    bswapq %rcx
 ; X64-NEXT:    bswapq %rdx
 ; X64-NEXT:    cmpq %rdx, %rcx
-; X64-NEXT:    jne .LBB34_2
+; X64-NEXT:    jne .LBB36_2
 ; X64-NEXT:  # %bb.1: # %loadbb1
 ; X64-NEXT:    movq 7(%rdi), %rcx
 ; X64-NEXT:    movq 7(%rsi), %rdx
@@ -590,13 +620,13 @@ define i32 @length15(ptr %X, ptr %Y) nounwind {
 ; X64-NEXT:    bswapq %rdx
 ; X64-NEXT:    xorl %eax, %eax
 ; X64-NEXT:    cmpq %rdx, %rcx
-; X64-NEXT:    je .LBB34_3
-; X64-NEXT:  .LBB34_2: # %res_block
+; X64-NEXT:    je .LBB36_3
+; X64-NEXT:  .LBB36_2: # %res_block
 ; X64-NEXT:    xorl %eax, %eax
 ; X64-NEXT:    cmpq %rdx, %rcx
 ; X64-NEXT:    sbbl %eax, %eax
 ; X64-NEXT:    orl $1, %eax
-; X64-NEXT:  .LBB34_3: # %endblock
+; X64-NEXT:  .LBB36_3: # %endblock
 ; X64-NEXT:    retq
   %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 15) nounwind
   ret i32 %m
@@ -610,7 +640,7 @@ define i1 @length15_lt(ptr %X, ptr %Y) nounwind {
 ; X64-NEXT:    bswapq %rcx
 ; X64-NEXT:    bswapq %rdx
 ; X64-NEXT:    cmpq %rdx, %rcx
-; X64-NEXT:    jne .LBB35_2
+; X64-NEXT:    jne .LBB37_2
 ; X64-NEXT:  # %bb.1: # %loadbb1
 ; X64-NEXT:    movq 7(%rdi), %rcx
 ; X64-NEXT:    movq 7(%rsi), %rdx
@@ -618,13 +648,13 @@ define i1 @length15_lt(ptr %X, ptr %Y) nounwind {
 ; X64-NEXT:    bswapq %rdx
 ; X64-NEXT:    xorl %eax, %eax
 ; X64-NEXT:    cmpq %rdx, %rcx
-; X64-NEXT:    je .LBB35_3
-; X64-NEXT:  .LBB35_2: # %res_block
+; X64-NEXT:    je .LBB37_3
+; X64-NEXT:  .LBB37_2: # %res_block
 ; X64-NEXT:    xorl %eax, %eax
 ; X64-NEXT:    cmpq %rdx, %rcx
 ; X64-NEXT:    sbbl %eax, %eax
 ; X64-NEXT:    orl $1, %eax
-; X64-NEXT:  .LBB35_3: # %endblock
+; X64-NEXT:  .LBB37_3: # %endblock
 ; X64-NEXT:    shrl $31, %eax
 ; X64-NEXT:    # kill: def $al killed $al killed $eax
 ; X64-NEXT:    retq
@@ -640,20 +670,20 @@ define i32 @length15_const(ptr %X, ptr %Y) nounwind {
 ; X64-NEXT:    movq (%rdi), %rdx
 ; X64-NEXT:    bswapq %rdx
 ; X64-NEXT:    cmpq %rcx, %rdx
-; X64-NEXT:    jne .LBB36_2
+; X64-NEXT:    jne .LBB38_2
 ; X64-NEXT:  # %bb.1: # %loadbb1
 ; X64-NEXT:    movabsq $4051322327650219061, %rcx # imm = 0x3839303132333435
 ; X64-NEXT:    movq 7(%rdi), %rdx
 ; X64-NEXT:    bswapq %rdx
 ; X64-NEXT:    xorl %eax, %eax
 ; X64-NEXT:    cmpq %rcx, %rdx
-; X64-NEXT:    je .LBB36_3
-; X64-NEXT:  .LBB36_2: # %res_block
+; X64-NEXT:    je .LBB38_3
+; X64-NEXT:  .LBB38_2: # %res_block
 ; X64-NEXT:    xorl %eax, %eax
 ; X64-NEXT:    cmpq %rcx, %rdx
 ; X64-NEXT:    sbbl %eax, %eax
 ; X64-NEXT:    orl $1, %eax
-; X64-NEXT:  .LBB36_3: # %endblock
+; X64-NEXT:  .LBB38_3: # %endblock
 ; X64-NEXT:    retq
   %m = tail call i32 @memcmp(ptr %X, ptr getelementptr inbounds ([513 x i8], ptr @.str, i32 0, i32 1), i64 15) nounwind
   ret i32 %m
@@ -681,20 +711,20 @@ define i1 @length15_gt_const(ptr %X, ptr %Y) nounwind {
 ; X64-NEXT:    movq (%rdi), %rcx
 ; X64-NEXT:    bswapq %rcx
 ; X64-NEXT:    cmpq %rax, %rcx
-; X64-NEXT:    jne .LBB38_2
+; X64-NEXT:    jne .LBB40_2
 ; X64-NEXT:  # %bb.1: # %loadbb1
 ; X64-NEXT:    movabsq $4051322327650219061, %rax # imm = 0x3839303132333435
 ; X64-NEXT:    movq 7(%rdi), %rcx
 ; X64-NEXT:    bswapq %rcx
 ; X64-NEXT:    xorl %edx, %edx
 ; X64-NEXT:    cmpq %rax, %rcx
-; X64-NEXT:    je .LBB38_3
-; X64-NEXT:  .LBB38_2: # %res_block
+; X64-NEXT:    je .LBB40_3
+; X64-NEXT:  .LBB40_2: # %res_block
 ; X64-NEXT:    xorl %edx, %edx
 ; X64-NEXT:    cmpq %rax, %rcx
 ; X64-NEXT:    sbbl %edx, %edx
 ; X64-NEXT:    orl $1, %edx
-; X64-NEXT:  .LBB38_3: # %endblock
+; X64-NEXT:  .LBB40_3: # %endblock
 ; X64-NEXT:    testl %edx, %edx
 ; X64-NEXT:    setg %al
 ; X64-NEXT:    retq
@@ -713,7 +743,7 @@ define i32 @length16(ptr %X, ptr %Y) nounwind {
 ; X64-NEXT:    bswapq %rcx
 ; X64-NEXT:    bswapq %rdx
 ; X64-NEXT:    cmpq %rdx, %rcx
-; X64-NEXT:    jne .LBB39_2
+; X64-NEXT:    jne .LBB41_2
 ; X64-NEXT:  # %bb.1: # %loadbb1
 ; X64-NEXT:    movq 8(%rdi), %rcx
 ; X64-NEXT:    movq 8(%rsi), %rdx
@@ -721,13 +751,13 @@ define i32 @length16(ptr %X, ptr %Y) nounwind {
 ; X64-NEXT:    bswapq %rdx
 ; X64-NEXT:    xorl %eax, %eax
 ; X64-NEXT:    cmpq %rdx, %rcx
-; X64-NEXT:    je .LBB39_3
-; X64-NEXT:  .LBB39_2: # %res_block
+; X64-NEXT:    je .LBB41_3
+; X64-NEXT:  .LBB41_2: # %res_block
 ; X64-NEXT:    xorl %eax, %eax
 ; X64-NEXT:    cmpq %rdx, %rcx
 ; X64-NEXT:    sbbl %eax, %eax
 ; X64-NEXT:    orl $1, %eax
-; X64-NEXT:  .LBB39_3: # %endblock
+; X64-NEXT:  .LBB41_3: # %endblock
 ; X64-NEXT:    retq
   %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 16) nounwind
   ret i32 %m
@@ -783,7 +813,7 @@ define i1 @length16_lt(ptr %x, ptr %y) nounwind {
 ; X64-NEXT:    bswapq %rcx
 ; X64-NEXT:    bswapq %rdx
 ; X64-NEXT:    cmpq %rdx, %rcx
-; X64-NEXT:    jne .LBB41_2
+; X64-NEXT:    jne .LBB43_2
 ; X64-NEXT:  # %bb.1: # %loadbb1
 ; X64-NEXT:    movq 8(%rdi), %rcx
 ; X64-NEXT:    movq 8(%rsi), %rdx
@@ -791,13 +821,13 @@ define i1 @length16_lt(ptr %x, ptr %y) nounwind {
 ; X64-NEXT:    bswapq %rdx
 ; X64-NEXT:    xorl %eax, %eax
 ; X64-NEXT:    cmpq %rdx, %rcx
-; X64-NEXT:    je .LBB41_3
-; X64-NEXT:  .LBB41_2: # %res_block
+; X64-NEXT:    je .LBB43_3
+; X64-NEXT:  .LBB43_2: # %res_block
 ; X64-NEXT:    xorl %eax, %eax
 ; X64-NEXT:    cmpq %rdx, %rcx
 ; X64-NEXT:    sbbl %eax, %eax
 ; X64-NEXT:    orl $1, %eax
-; X64-NEXT:  .LBB41_3: # %endblock
+; X64-NEXT:  .LBB43_3: # %endblock
 ; X64-NEXT:    shrl $31, %eax
 ; X64-NEXT:    # kill: def $al killed $al killed $eax
 ; X64-NEXT:    retq
@@ -814,7 +844,7 @@ define i1 @length16_gt(ptr %x, ptr %y) nounwind {
 ; X64-NEXT:    bswapq %rax
 ; X64-NEXT:    bswapq %rcx
 ; X64-NEXT:    cmpq %rcx, %rax
-; X64-NEXT:    jne .LBB42_2
+; X64-NEXT:    jne .LBB44_2
 ; X64-NEXT:  # %bb.1: # %loadbb1
 ; X64-NEXT:    movq 8(%rdi), %rax
 ; X64-NEXT:    movq 8(%rsi), %rcx
@@ -822,13 +852,13 @@ define i1 @length16_gt(ptr %x, ptr %y) nounwind {
 ; X64-NEXT:    bswapq %rcx
 ; X64-NEXT:    xorl %edx, %edx
 ; X64-NEXT:    cmpq %rcx, %rax
-; X64-NEXT:    je .LBB42_3
-; X64-NEXT:  .LBB42_2: # %res_block
+; X64-NEXT:    je .LBB44_3
+; X64-NEXT:  .LBB44_2: # %res_block
 ; X64-NEXT:    xorl %edx, %edx
 ; X64-NEXT:    cmpq %rcx, %rax
 ; X64-NEXT:    sbbl %edx, %edx
 ; X64-NEXT:    orl $1, %edx
-; X64-NEXT:  .LBB42_3: # %endblock
+; X64-NEXT:  .LBB44_3: # %endblock
 ; X64-NEXT:    testl %edx, %edx
 ; X64-NEXT:    setg %al
 ; X64-NEXT:    retq
diff --git a/llvm/test/CodeGen/X86/merge-vector-stores-scale-idx-crash.ll b/llvm/test/CodeGen/X86/merge-vector-stores-scale-idx-crash.ll
index a10fbc1..3dba5eb 100644
--- a/llvm/test/CodeGen/X86/merge-vector-stores-scale-idx-crash.ll
+++ b/llvm/test/CodeGen/X86/merge-vector-stores-scale-idx-crash.ll
@@ -21,10 +21,10 @@ define void @testfn(ptr nocapture %p) {
 
 ; CHECK-LABEL: testfn_scalar
 ; CHECK: retq
-define void @testfn_scalar(ptr nocapture %j) local_unnamed_addr #0 align 2 {
+define void @testfn_scalar(ptr nocapture %j, i1 %arg) local_unnamed_addr #0 align 2 {
 entry:
   %0 = bitcast i64 undef to <2 x float>
-  br i1 undef, label %if.end, label %if.then
+  br i1 %arg, label %if.end, label %if.then
 
 if.then:                                          ; preds = %entry
   unreachable
diff --git a/llvm/test/CodeGen/X86/mingw-refptr.ll b/llvm/test/CodeGen/X86/mingw-refptr.ll
index 73f1a98..82a90ab 100644
--- a/llvm/test/CodeGen/X86/mingw-refptr.ll
+++ b/llvm/test/CodeGen/X86/mingw-refptr.ll
@@ -1,4 +1,5 @@
 ; RUN: llc < %s -mtriple=x86_64-w64-mingw32 | FileCheck %s -check-prefix=CHECK-X64
+; RUN: llc < %s -mtriple=x86_64-pc-cygwin | FileCheck %s -check-prefix=CHECK-X64
 ; RUN: llc < %s -mtriple=i686-w64-mingw32 | FileCheck %s -check-prefix=CHECK-X86
 ; RUN: llc < %s -mtriple=i686-w64-mingw32-none-elf | FileCheck %s -check-prefix=CHECK-X86-ELF
 
diff --git a/llvm/test/CodeGen/X86/misched-crash.ll b/llvm/test/CodeGen/X86/misched-crash.ll
index 98818d9..a421faba 100644
--- a/llvm/test/CodeGen/X86/misched-crash.ll
+++ b/llvm/test/CodeGen/X86/misched-crash.ll
@@ -4,7 +4,7 @@ target triple = "x86_64-apple-macosx10"
 
 ; This function contains a cmp instruction with two users.
 ; Hoisting the last use requires trimming the EFLAGS live range to the second.
-define void @rdar13353090(ptr %plane, i64 %_x1, i64 %_x2) {
+define void @rdar13353090(ptr %plane, i64 %_x1, i64 %_x2, i1 %arg) {
 entry:
   %cmp = icmp ult i64 %_x1, %_x2
   %cond = select i1 %cmp, i64 %_x1, i64 %_x2
@@ -33,7 +33,7 @@ for.body34.i:                                     ; preds = %for.inc39.i, %if.th
 
 for.inc39.i:                                      ; preds = %for.body34.i
   %inc41.i = add i64 %index.178.i, 1
-  br i1 undef, label %return, label %for.body34.i
+  br i1 %arg, label %return, label %for.body34.i
 
 return:                                           ; preds = %for.inc39.i, %for.body34.i, %land.lhs.true21, %entry
   ret void
diff --git a/llvm/test/CodeGen/X86/pr10475.ll b/llvm/test/CodeGen/X86/pr10475.ll
index 4dd5aab..4275dc2 100644
--- a/llvm/test/CodeGen/X86/pr10475.ll
+++ b/llvm/test/CodeGen/X86/pr10475.ll
@@ -2,19 +2,19 @@
 
 ; No check in a crash test
 
-define void @autogen_262380_1000() {
+define void @autogen_262380_1000(i1 %arg) {
 BB:
   br label %CF79
 
 CF79:                                             ; preds = %CF79, %BB
-  br i1 undef, label %CF79, label %CF84.critedge.critedge
+  br i1 %arg, label %CF79, label %CF84.critedge.critedge
 
 CF84.critedge.critedge:                           ; preds = %CF79
   %L35 = load <8 x i32>, ptr undef
   br label %CF85
 
 CF85:                                             ; preds = %CF85, %CF84.critedge.critedge
-  br i1 undef, label %CF85, label %CF86
+  br i1 %arg, label %CF85, label %CF86
 
 CF86:                                             ; preds = %CF86, %CF85
   %B61 = sub <8 x i32> %L35, zeroinitializer
@@ -23,7 +23,7 @@ CF86:                                             ; preds = %CF86, %CF85
   br i1 %E73, label %CF86, label %CF87
 
 CF87:                                             ; preds = %CF87, %CF86
-  br i1 undef, label %CF87, label %CF88
+  br i1 %arg, label %CF87, label %CF88
 
 CF88:                                             ; preds = %CF87
   ret void
diff --git a/llvm/test/CodeGen/X86/pr107423.ll b/llvm/test/CodeGen/X86/pr107423.ll
new file mode 100644
index 0000000..d5119d4
--- /dev/null
+++ b/llvm/test/CodeGen/X86/pr107423.ll
@@ -0,0 +1,74 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=sandybridge | FileCheck %s
+
+define void @PR107423(<64 x i8> %arg, ptr %p0) {
+; CHECK-LABEL: PR107423:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; CHECK-NEXT:    vpsllw $8, %xmm2, %xmm2
+; CHECK-NEXT:    vpsllw $8, %xmm1, %xmm3
+; CHECK-NEXT:    vpaddb %xmm2, %xmm3, %xmm3
+; CHECK-NEXT:    vpaddb %xmm0, %xmm2, %xmm2
+; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm4
+; CHECK-NEXT:    vpaddb %xmm1, %xmm4, %xmm1
+; CHECK-NEXT:    vpaddb %xmm4, %xmm0, %xmm4
+; CHECK-NEXT:    vpsllw $8, %xmm4, %xmm4
+; CHECK-NEXT:    vpaddb %xmm3, %xmm4, %xmm3
+; CHECK-NEXT:    vpsllw $8, %xmm1, %xmm1
+; CHECK-NEXT:    vpaddb %xmm2, %xmm1, %xmm1
+; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vpor %xmm3, %xmm2, %xmm2
+; CHECK-NEXT:    vpsllw $8, %xmm0, %xmm0
+; CHECK-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
+; CHECK-NEXT:    vmovdqu %xmm0, 16(%rdi)
+; CHECK-NEXT:    vmovdqu %xmm2, (%rdi)
+; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    retq
+  %i3 = bitcast <64 x i8> %arg to <32 x i16>
+  %i4 = shufflevector <32 x i16> %i3, <32 x i16> poison, <8 x i32> <i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+  %i5 = shl <8 x i16> %i4, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
+  %i6 = bitcast <8 x i16> %i5 to <16 x i8>
+  %i7 = shufflevector <64 x i8> %arg, <64 x i8> poison, <64 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47>
+  %i8 = shufflevector <64 x i8> %arg, <64 x i8> poison, <64 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+  %i9 = shufflevector <64 x i8> %i7, <64 x i8> %i8, <64 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+  %i10 = shufflevector <16 x i8> %i6, <16 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+  %i11 = shufflevector <64 x i8> %i10, <64 x i8> %i9, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127>
+  %i12 = bitcast <64 x i8> %i11 to <32 x i16>
+  %i13 = shl <32 x i16> %i12, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
+  %i14 = bitcast <32 x i16> %i13 to <64 x i8>
+  %i15 = shufflevector <64 x i8> %i14, <64 x i8> poison, <16 x i32> <i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+  %i16 = shufflevector <64 x i8> %i11, <64 x i8> poison, <64 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47>
+  %i17 = shufflevector <16 x i8> %i6, <16 x i8> poison, <64 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+  %i18 = shufflevector <64 x i8> %i16, <64 x i8> %i17, <64 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+  %i19 = shufflevector <16 x i8> %i15, <16 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+  %i20 = shufflevector <64 x i8> %i19, <64 x i8> %i18, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127>
+  %i21 = add <64 x i8> %i20, %i11
+  %i22 = bitcast <64 x i8> %i21 to <32 x i16>
+  %i23 = shl <32 x i16> %i22, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
+  %i24 = bitcast <32 x i16> %i23 to <64 x i8>
+  %i25 = shufflevector <64 x i8> %i24, <64 x i8> poison, <16 x i32> <i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+  %i26 = bitcast <32 x i16> %i23 to <64 x i8>
+  %i28 = shufflevector <64 x i8> %i26, <64 x i8> poison, <16 x i32> <i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47>
+  %i32 = shufflevector <64 x i8> %i21, <64 x i8> poison, <64 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+  %i33 = shufflevector <16 x i8> %i25, <16 x i8> poison, <64 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+  %i34 = shufflevector <64 x i8> %i32, <64 x i8> %i33, <64 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+  %i35 = shufflevector <16 x i8> %i28, <16 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+  %i36 = shufflevector <64 x i8> %i35, <64 x i8> %i34, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127>
+  %i37 = add <64 x i8> %i36, %i21
+  %i38 = bitcast <64 x i8> %i37 to <32 x i16>
+  %i39 = shufflevector <32 x i16> %i38, <32 x i16> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %i40 = shl <8 x i16> %i39, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
+  %i41 = bitcast <8 x i16> %i40 to <16 x i8>
+  %i42 = shufflevector <16 x i8> %i41, <16 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+  %i43 = shufflevector <64 x i8> %i42, <64 x i8> %i37, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127>
+  %i44 = bitcast <64 x i8> %i43 to <32 x i16>
+  %i45 = shufflevector <32 x i16> %i44, <32 x i16> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %i46 = shl <8 x i16> %i45, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
+  %i47 = bitcast <8 x i16> %i46 to <16 x i8>
+  %i48 = shufflevector <16 x i8> %i47, <16 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+  %i49 = shufflevector <64 x i8> %i43, <64 x i8> %i48, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79>
+  %i50 = shufflevector <64 x i8> %i37, <64 x i8> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+  %i51 = add <32 x i8> %i49, %i50
+  store <32 x i8> %i51, ptr %p0, align 1
+  ret void
+}
diff --git a/llvm/test/CodeGen/X86/pr11998.ll b/llvm/test/CodeGen/X86/pr11998.ll
index caaf271..4b93c20 100644
--- a/llvm/test/CodeGen/X86/pr11998.ll
+++ b/llvm/test/CodeGen/X86/pr11998.ll
@@ -1,13 +1,13 @@
 ; RUN: llc < %s -mcpu=corei7-avx -mtriple=x86_64-- -mattr=+avx
 
-define void @autogen_51367_5000(i8) {
+define void @autogen_51367_5000(i8, i1 %arg) {
 BB:
   %B = srem i8 55, %0
   %B9 = shl i8 %B, %B
   br label %CF
 
 CF:                                               ; preds = %CF, %BB
-  br i1 undef, label %CF, label %CF403
+  br i1 %arg, label %CF, label %CF403
 
 CF403:                                            ; preds = %CF403, %CF
   %S44 = icmp eq i8 %B9, %0
diff --git a/llvm/test/CodeGen/X86/pr32108.ll b/llvm/test/CodeGen/X86/pr32108.ll
index 32f8a76..a50b9a6 100644
--- a/llvm/test/CodeGen/X86/pr32108.ll
+++ b/llvm/test/CodeGen/X86/pr32108.ll
@@ -13,7 +13,7 @@ BB:
   br label %CF243
 
 CF243:                                            ; preds = %CF243, %BB
-  br i1 undef, label %CF243, label %CF257
+  br i1 poison, label %CF243, label %CF257
 
 CF257:                                            ; preds = %CF243
   %Shuff144 = shufflevector <4 x i1> undef, <4 x i1> %Cmp45, <4 x i32> <i32 undef, i32 undef, i32 5, i32 undef>
diff --git a/llvm/test/CodeGen/X86/pr50254.ll b/llvm/test/CodeGen/X86/pr50254.ll
index 01d261a..95b7ae5 100644
--- a/llvm/test/CodeGen/X86/pr50254.ll
+++ b/llvm/test/CodeGen/X86/pr50254.ll
@@ -37,7 +37,7 @@ entry:
   br label %for.body
 
 for.body:                                         ; preds = %entry
-  br i1 undef, label %for.end, label %for.body.1
+  br i1 poison, label %for.end, label %for.body.1
 
 for.end:                                          ; preds = %for.body
   store i16 %xor1, ptr @d.e, align 4
diff --git a/llvm/test/CodeGen/X86/pr57673.ll b/llvm/test/CodeGen/X86/pr57673.ll
index 4ca8ae9..779978b 100644
--- a/llvm/test/CodeGen/X86/pr57673.ll
+++ b/llvm/test/CodeGen/X86/pr57673.ll
@@ -100,7 +100,7 @@ bb_entry:
   br label %bb_8
 
 bb_8:                                             ; preds = %bb_last, %bb_entry
-  br i1 undef, label %bb_last, label %bb_mid
+  br i1 poison, label %bb_last, label %bb_mid
 
 bb_mid:                                           ; preds = %bb_8
   %i4 = getelementptr inbounds %t10, ptr %i1, i64 0, i32 1, i64 32
diff --git a/llvm/test/CodeGen/X86/ragreedy-hoist-spill.ll b/llvm/test/CodeGen/X86/ragreedy-hoist-spill.ll
index beb42f5..47e5079 100644
--- a/llvm/test/CodeGen/X86/ragreedy-hoist-spill.ll
+++ b/llvm/test/CodeGen/X86/ragreedy-hoist-spill.ll
@@ -331,13 +331,13 @@ if.end:
   ]
 
 if.then4:
-  br i1 undef, label %SyTime.exit, label %if.then.i
+  br i1 poison, label %SyTime.exit, label %if.then.i
 
 if.then.i:
   unreachable
 
 SyTime.exit:
-  br i1 undef, label %SyTime.exit2681, label %if.then.i2673
+  br i1 poison, label %SyTime.exit2681, label %if.then.i2673
 
 if.then.i2673:
   unreachable
@@ -349,7 +349,7 @@ land.lhs.true14:
   unreachable
 
 if.end25:
-  br i1 undef, label %SyTime.exit2720, label %if.then.i2712
+  br i1 poison, label %SyTime.exit2720, label %if.then.i2712
 
 if.then.i2712:
   unreachable
@@ -406,7 +406,7 @@ do.end:
   %mul167 = shl i32 %rep.6, 2
   %rep.8 = select i1 %cmp164, i32 %mul167, i32 %rep.6
   %..ch.19 = select i1 false, i32 2, i32 0
-  br i1 undef, label %while.body200, label %while.end1465
+  br i1 poison, label %while.body200, label %while.end1465
 
 while.body200:
   %dec3386.in = phi i32 [ %dec3386, %while.cond197.backedge ], [ %rep.8, %do.end ]
@@ -444,7 +444,7 @@ while.cond1037.preheader:
   br i1 %cmp10393273, label %if.end1070, label %land.rhs1041
 
 while.cond635.preheader:
-  br i1 undef, label %for.body643.us, label %while.cond661
+  br i1 poison, label %for.body643.us, label %while.cond661
 
 for.body643.us:
   br label %for.body643.us
@@ -488,7 +488,7 @@ land.rhs485:
   br i1 %isascii.i.i27763151, label %cond.true.i.i2780, label %cond.false.i.i2782
 
 cond.true.i.i2780:
-  br i1 undef, label %land.lhs.true490, label %lor.rhs500
+  br i1 poison, label %land.lhs.true490, label %lor.rhs500
 
 cond.false.i.i2782:
   unreachable
@@ -499,10 +499,10 @@ land.lhs.true490:
 lor.rhs500:
   ; Make sure spill is hoisted to a cold preheader in outside loop.
   %call3.i.i2792 = call i32 @__maskrune(i32 undef, i64 256)
-  br i1 undef, label %land.lhs.true504, label %do.body479.backedge
+  br i1 poison, label %land.lhs.true504, label %do.body479.backedge
 
 land.lhs.true504:
-  br i1 undef, label %do.body479.backedge, label %if.end517
+  br i1 poison, label %do.body479.backedge, label %if.end517
 
 do.body479.backedge:
   %incdec.ptr480 = getelementptr i8, ptr %incdec.ptr4803316, i64 1
@@ -531,10 +531,10 @@ for.cond534:
   br i1 %cmp536, label %for.cond542.preheader, label %for.cond534
 
 for.cond542.preheader:
-  br i1 undef, label %for.body545, label %for.end552
+  br i1 poison, label %for.body545, label %for.end552
 
 for.body545:
-  br i1 undef, label %for.end552, label %for.body545
+  br i1 poison, label %for.end552, label %for.body545
 
 for.end552:
   %s.2.lcssa = phi ptr [ undef, %for.cond542.preheader ], [ %q.4, %for.body545 ]
@@ -554,7 +554,7 @@ while.cond864:
   br label %while.cond864
 
 sw.bb956:
-  br i1 undef, label %if.then959, label %while.cond197.backedge
+  br i1 poison, label %if.then959, label %while.cond197.backedge
 
 if.then959:
   br label %while.cond962
@@ -600,7 +600,7 @@ while.end1465:
   ]
 
 for.cond1480.preheader:
-  br i1 undef, label %for.body1606.lr.ph, label %for.end1609
+  br i1 poison, label %for.body1606.lr.ph, label %for.end1609
 
 if.then1477:
   %p.1.lcssa3539 = phi ptr [ null, %while.end1465 ], [ null, %while.end1465 ], [ null, %while.end1465 ], [ null, %while.end1465 ], [ %line, %while.body200 ]
@@ -614,7 +614,7 @@ for.body1606.lr.ph:
   br label %for.end1609
 
 for.end1609:
-  br i1 undef, label %for.cond1659.preheader, label %land.lhs.true1614
+  br i1 poison, label %for.cond1659.preheader, label %land.lhs.true1614
 
 land.lhs.true1614:
   br label %for.cond1659.preheader
@@ -631,13 +631,13 @@ while.body1703.lr.ph:
   unreachable
 
 while.cond1683.preheader:
-  br i1 undef, label %while.body1691, label %while.end1693
+  br i1 poison, label %while.body1691, label %while.end1693
 
 while.body1679:
   %oldc.43406 = phi i32 [ %inc, %syEchoch.exit3070 ], [ %oldc.1.lcssa, %for.body1664.lr.ph ]
   %3 = load ptr, ptr %echo.i3101, align 8, !tbaa !6
   %call.i3062 = call i32 @fileno(ptr %3)
-  br i1 undef, label %if.then.i3069, label %syEchoch.exit3070
+  br i1 poison, label %if.then.i3069, label %syEchoch.exit3070
 
 if.then.i3069:
   br label %syEchoch.exit3070
diff --git a/llvm/test/CodeGen/X86/shift-combine.ll b/llvm/test/CodeGen/X86/shift-combine.ll
index c9edd3f..cd3d481 100644
--- a/llvm/test/CodeGen/X86/shift-combine.ll
+++ b/llvm/test/CodeGen/X86/shift-combine.ll
@@ -408,7 +408,7 @@ define dso_local void @PR42880(i32 %t0) {
   %x = ptrtoint ptr %add.ptr.i94 to i32
   %sub2 = sub i32 %x, 0
   %div = sdiv exact i32 %sub2, 24
-  br i1 undef, label %if, label %then
+  br i1 poison, label %if, label %then
 
 then:
   %t1 = xor i32 %div, -1
diff --git a/llvm/test/CodeGen/X86/shuffle-combine-crash.ll b/llvm/test/CodeGen/X86/shuffle-combine-crash.ll
index e10e3dd..962b833 100644
--- a/llvm/test/CodeGen/X86/shuffle-combine-crash.ll
+++ b/llvm/test/CodeGen/X86/shuffle-combine-crash.ll
@@ -28,7 +28,7 @@ define void @sample_test() {
 ; CHECK-NEXT:    movd %xmm0, (%rax)
 ; CHECK-NEXT:  .LBB0_2:
 ; CHECK-NEXT:    retq
-  br i1 undef, label %5, label %1
+  br i1 poison, label %5, label %1
 
 ; <label>:1                                       ; preds = %0
   %2 = load <4 x i8>, ptr undef
@@ -40,4 +40,3 @@ define void @sample_test() {
 ; <label>:5                                       ; preds = %1, %0
   ret void
 }
-
diff --git a/llvm/test/CodeGen/X86/stackmap.ll b/llvm/test/CodeGen/X86/stackmap.ll
index 33180a7..72406aa 100644
--- a/llvm/test/CodeGen/X86/stackmap.ll
+++ b/llvm/test/CodeGen/X86/stackmap.ll
@@ -379,23 +379,23 @@ entry:
 ; CHECK-NEXT:   .short 6
 ; CHECK-NEXT:   .short 0
 ; CHECK-NEXT:   .long
-define void @spillSubReg(i64 %arg) #0 {
+define void @spillSubReg(i64 %arg, i1 %arg2) #0 {
 bb:
-  br i1 undef, label %bb1, label %bb2
+  br i1 %arg2, label %bb1, label %bb2
 
 bb1:
   unreachable
 
 bb2:
   %tmp = load i64, ptr inttoptr (i64 140685446136880 to ptr)
-  br i1 undef, label %bb16, label %bb17
+  br i1 %arg2, label %bb16, label %bb17
 
 bb16:
   unreachable
 
 bb17:
   %tmp32 = trunc i64 %tmp to i32
-  br i1 undef, label %bb60, label %bb61
+  br i1 %arg2, label %bb60, label %bb61
 
 bb60:
   tail call void asm sideeffect "nop", "~{ax},~{bx},~{cx},~{dx},~{bp},~{si},~{di},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"() nounwind
diff --git a/llvm/test/CodeGen/X86/swifterror.ll b/llvm/test/CodeGen/X86/swifterror.ll
index 1489b02..77b1ac0 100644
--- a/llvm/test/CodeGen/X86/swifterror.ll
+++ b/llvm/test/CodeGen/X86/swifterror.ll
@@ -1014,7 +1014,7 @@ define void @swifterror_isel(ptr) {
 ; CHECK-i386-NEXT:    retl
 entry:
   %swifterror = alloca swifterror ptr, align 8
-  br i1 undef, label %5, label %1
+  br i1 poison, label %5, label %1
 
   %2 = phi i16 [ %4, %1 ], [ undef, %entry ]
   %3 = call i1 undef(i16 %2, ptr swiftself %0, ptr nocapture swifterror %swifterror)
diff --git a/llvm/test/CodeGen/X86/switch.ll b/llvm/test/CodeGen/X86/switch.ll
index 629ba48..c75819c 100644
--- a/llvm/test/CodeGen/X86/switch.ll
+++ b/llvm/test/CodeGen/X86/switch.ll
@@ -2563,7 +2563,7 @@ define i32 @pr27135(i32 %i) {
 ; NOOPT-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
 ; NOOPT-NEXT:    retq
 entry:
-  br i1 undef, label %sw, label %end
+  br i1 poison, label %sw, label %end
 sw:
   switch i32 %i, label %end [
     i32 99,  label %sw.bb
diff --git a/llvm/test/CodeGen/X86/tail-merge-unreachable.ll b/llvm/test/CodeGen/X86/tail-merge-unreachable.ll
index ce5613f..9afdabd 100644
--- a/llvm/test/CodeGen/X86/tail-merge-unreachable.ll
+++ b/llvm/test/CodeGen/X86/tail-merge-unreachable.ll
@@ -1,8 +1,8 @@
 ; RUN: llc -mtriple=x86_64-linux-gnu %s -o - -verify-machineinstrs | FileCheck %s
 
-define i32 @tail_merge_unreachable(i32 %i) {
+define i32 @tail_merge_unreachable(i32 %i, i1 %arg) {
 entry:
-  br i1 undef, label %sw, label %end
+  br i1 %arg, label %sw, label %end
 sw:
   switch i32 %i, label %end [
     i32 99,  label %sw.bb
diff --git a/llvm/test/CodeGen/X86/uint_to_half.ll b/llvm/test/CodeGen/X86/uint_to_half.ll
new file mode 100644
index 0000000..b62a07e
--- /dev/null
+++ b/llvm/test/CodeGen/X86/uint_to_half.ll
@@ -0,0 +1,198 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx,+f16c | FileCheck %s -check-prefixes=AVX1
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+f16c | FileCheck %s -check-prefixes=AVX2
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl | FileCheck %s -check-prefixes=AVX512
+
+define <8 x half> @test_uitofp_v8i32_v8f16(<8 x i32> %a) {
+; AVX1-LABEL: test_uitofp_v8i32_v8f16:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm1
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT:    vpsrld $16, %xmm2, %xmm2
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX1-NEXT:    vcvtdq2ps %ymm1, %ymm1
+; AVX1-NEXT:    vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
+; AVX1-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; AVX1-NEXT:    vcvtdq2ps %ymm0, %ymm0
+; AVX1-NEXT:    vaddps %ymm0, %ymm1, %ymm0
+; AVX1-NEXT:    vcvtps2ph $4, %ymm0, %xmm0
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_uitofp_v8i32_v8f16:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [1258291200,1258291200,1258291200,1258291200,1258291200,1258291200,1258291200,1258291200]
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]
+; AVX2-NEXT:    vpsrld $16, %ymm0, %ymm0
+; AVX2-NEXT:    vpbroadcastd {{.*#+}} ymm2 = [1392508928,1392508928,1392508928,1392508928,1392508928,1392508928,1392508928,1392508928]
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7],ymm0[8],ymm2[9],ymm0[10],ymm2[11],ymm0[12],ymm2[13],ymm0[14],ymm2[15]
+; AVX2-NEXT:    vbroadcastss {{.*#+}} ymm2 = [5.49764202E+11,5.49764202E+11,5.49764202E+11,5.49764202E+11,5.49764202E+11,5.49764202E+11,5.49764202E+11,5.49764202E+11]
+; AVX2-NEXT:    vsubps %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vaddps %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    vcvtps2ph $4, %ymm0, %xmm0
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_uitofp_v8i32_v8f16:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vcvtudq2ps %ymm0, %ymm0
+; AVX512-NEXT:    vcvtps2ph $4, %ymm0, %xmm0
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  %vec = uitofp <8 x i32> %a to <8 x half>
+  ret <8 x half> %vec
+}
+
+define <8 x half> @test_strict_uitofp_v8i32_v8f16(<8 x i32> %a) {
+; AVX1-LABEL: test_strict_uitofp_v8i32_v8f16:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm1
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT:    vpsrld $16, %xmm2, %xmm2
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX1-NEXT:    vcvtdq2ps %ymm1, %ymm1
+; AVX1-NEXT:    vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
+; AVX1-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; AVX1-NEXT:    vcvtdq2ps %ymm0, %ymm0
+; AVX1-NEXT:    vaddps %ymm0, %ymm1, %ymm0
+; AVX1-NEXT:    vcvtps2ph $4, %ymm0, %xmm0
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_strict_uitofp_v8i32_v8f16:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [1258291200,1258291200,1258291200,1258291200,1258291200,1258291200,1258291200,1258291200]
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]
+; AVX2-NEXT:    vpsrld $16, %ymm0, %ymm0
+; AVX2-NEXT:    vpbroadcastd {{.*#+}} ymm2 = [1392508928,1392508928,1392508928,1392508928,1392508928,1392508928,1392508928,1392508928]
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7],ymm0[8],ymm2[9],ymm0[10],ymm2[11],ymm0[12],ymm2[13],ymm0[14],ymm2[15]
+; AVX2-NEXT:    vbroadcastss {{.*#+}} ymm2 = [5.49764202E+11,5.49764202E+11,5.49764202E+11,5.49764202E+11,5.49764202E+11,5.49764202E+11,5.49764202E+11,5.49764202E+11]
+; AVX2-NEXT:    vsubps %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vaddps %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    vcvtps2ph $4, %ymm0, %xmm0
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_strict_uitofp_v8i32_v8f16:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vcvtudq2ps %ymm0, %ymm0
+; AVX512-NEXT:    vcvtps2ph $4, %ymm0, %xmm0
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  %vec = tail call <8 x half> @llvm.experimental.constrained.uitofp.f16.i32(<8 x i32> %a, metadata !"round.dynamic", metadata !"fpexcept.strict")
+  ret <8 x half> %vec
+}
+
+define <16 x half> @test_uitofp_v16i32_v16f16(<16 x i32> %a) {
+; AVX1-LABEL: test_uitofp_v16i32_v16f16:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT:    vpsrld $16, %xmm3, %xmm3
+; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; AVX1-NEXT:    vcvtdq2ps %ymm2, %ymm2
+; AVX1-NEXT:    vbroadcastss {{.*#+}} ymm3 = [6.5536E+4,6.5536E+4,6.5536E+4,6.5536E+4,6.5536E+4,6.5536E+4,6.5536E+4,6.5536E+4]
+; AVX1-NEXT:    vmulps %ymm3, %ymm2, %ymm2
+; AVX1-NEXT:    vbroadcastss {{.*#+}} ymm4 = [65535,65535,65535,65535,65535,65535,65535,65535]
+; AVX1-NEXT:    vandps %ymm4, %ymm0, %ymm0
+; AVX1-NEXT:    vcvtdq2ps %ymm0, %ymm0
+; AVX1-NEXT:    vaddps %ymm0, %ymm2, %ymm0
+; AVX1-NEXT:    vcvtps2ph $4, %ymm0, %xmm0
+; AVX1-NEXT:    vpsrld $16, %xmm1, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm5
+; AVX1-NEXT:    vpsrld $16, %xmm5, %xmm5
+; AVX1-NEXT:    vinsertf128 $1, %xmm5, %ymm2, %ymm2
+; AVX1-NEXT:    vcvtdq2ps %ymm2, %ymm2
+; AVX1-NEXT:    vmulps %ymm3, %ymm2, %ymm2
+; AVX1-NEXT:    vandps %ymm4, %ymm1, %ymm1
+; AVX1-NEXT:    vcvtdq2ps %ymm1, %ymm1
+; AVX1-NEXT:    vaddps %ymm1, %ymm2, %ymm1
+; AVX1-NEXT:    vcvtps2ph $4, %ymm1, %xmm1
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_uitofp_v16i32_v16f16:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpbroadcastd {{.*#+}} ymm2 = [1258291200,1258291200,1258291200,1258291200,1258291200,1258291200,1258291200,1258291200]
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm3 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7],ymm0[8],ymm2[9],ymm0[10],ymm2[11],ymm0[12],ymm2[13],ymm0[14],ymm2[15]
+; AVX2-NEXT:    vpsrld $16, %ymm0, %ymm0
+; AVX2-NEXT:    vpbroadcastd {{.*#+}} ymm4 = [1392508928,1392508928,1392508928,1392508928,1392508928,1392508928,1392508928,1392508928]
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm4[1],ymm0[2],ymm4[3],ymm0[4],ymm4[5],ymm0[6],ymm4[7],ymm0[8],ymm4[9],ymm0[10],ymm4[11],ymm0[12],ymm4[13],ymm0[14],ymm4[15]
+; AVX2-NEXT:    vbroadcastss {{.*#+}} ymm5 = [5.49764202E+11,5.49764202E+11,5.49764202E+11,5.49764202E+11,5.49764202E+11,5.49764202E+11,5.49764202E+11,5.49764202E+11]
+; AVX2-NEXT:    vsubps %ymm5, %ymm0, %ymm0
+; AVX2-NEXT:    vaddps %ymm0, %ymm3, %ymm0
+; AVX2-NEXT:    vcvtps2ph $4, %ymm0, %xmm0
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm2 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7],ymm1[8],ymm2[9],ymm1[10],ymm2[11],ymm1[12],ymm2[13],ymm1[14],ymm2[15]
+; AVX2-NEXT:    vpsrld $16, %ymm1, %ymm1
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm1 = ymm1[0],ymm4[1],ymm1[2],ymm4[3],ymm1[4],ymm4[5],ymm1[6],ymm4[7],ymm1[8],ymm4[9],ymm1[10],ymm4[11],ymm1[12],ymm4[13],ymm1[14],ymm4[15]
+; AVX2-NEXT:    vsubps %ymm5, %ymm1, %ymm1
+; AVX2-NEXT:    vaddps %ymm1, %ymm2, %ymm1
+; AVX2-NEXT:    vcvtps2ph $4, %ymm1, %xmm1
+; AVX2-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_uitofp_v16i32_v16f16:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vcvtudq2ps %zmm0, %zmm0
+; AVX512-NEXT:    vcvtps2ph $4, %zmm0, %ymm0
+; AVX512-NEXT:    retq
+  %vec = uitofp <16 x i32> %a to <16 x half>
+  ret <16 x half> %vec
+}
+
+define <16 x half> @test_strict_uitofp_v16i32_v16f16(<16 x i32> %a) {
+; AVX1-LABEL: test_strict_uitofp_v16i32_v16f16:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT:    vpsrld $16, %xmm3, %xmm3
+; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; AVX1-NEXT:    vcvtdq2ps %ymm2, %ymm2
+; AVX1-NEXT:    vbroadcastss {{.*#+}} ymm3 = [6.5536E+4,6.5536E+4,6.5536E+4,6.5536E+4,6.5536E+4,6.5536E+4,6.5536E+4,6.5536E+4]
+; AVX1-NEXT:    vmulps %ymm3, %ymm2, %ymm2
+; AVX1-NEXT:    vbroadcastss {{.*#+}} ymm4 = [65535,65535,65535,65535,65535,65535,65535,65535]
+; AVX1-NEXT:    vandps %ymm4, %ymm0, %ymm0
+; AVX1-NEXT:    vcvtdq2ps %ymm0, %ymm0
+; AVX1-NEXT:    vaddps %ymm0, %ymm2, %ymm0
+; AVX1-NEXT:    vcvtps2ph $4, %ymm0, %xmm0
+; AVX1-NEXT:    vpsrld $16, %xmm1, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm5
+; AVX1-NEXT:    vpsrld $16, %xmm5, %xmm5
+; AVX1-NEXT:    vinsertf128 $1, %xmm5, %ymm2, %ymm2
+; AVX1-NEXT:    vcvtdq2ps %ymm2, %ymm2
+; AVX1-NEXT:    vmulps %ymm3, %ymm2, %ymm2
+; AVX1-NEXT:    vandps %ymm4, %ymm1, %ymm1
+; AVX1-NEXT:    vcvtdq2ps %ymm1, %ymm1
+; AVX1-NEXT:    vaddps %ymm1, %ymm2, %ymm1
+; AVX1-NEXT:    vcvtps2ph $4, %ymm1, %xmm1
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_strict_uitofp_v16i32_v16f16:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpbroadcastd {{.*#+}} ymm2 = [1258291200,1258291200,1258291200,1258291200,1258291200,1258291200,1258291200,1258291200]
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm3 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7],ymm0[8],ymm2[9],ymm0[10],ymm2[11],ymm0[12],ymm2[13],ymm0[14],ymm2[15]
+; AVX2-NEXT:    vpsrld $16, %ymm0, %ymm0
+; AVX2-NEXT:    vpbroadcastd {{.*#+}} ymm4 = [1392508928,1392508928,1392508928,1392508928,1392508928,1392508928,1392508928,1392508928]
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm4[1],ymm0[2],ymm4[3],ymm0[4],ymm4[5],ymm0[6],ymm4[7],ymm0[8],ymm4[9],ymm0[10],ymm4[11],ymm0[12],ymm4[13],ymm0[14],ymm4[15]
+; AVX2-NEXT:    vbroadcastss {{.*#+}} ymm5 = [5.49764202E+11,5.49764202E+11,5.49764202E+11,5.49764202E+11,5.49764202E+11,5.49764202E+11,5.49764202E+11,5.49764202E+11]
+; AVX2-NEXT:    vsubps %ymm5, %ymm0, %ymm0
+; AVX2-NEXT:    vaddps %ymm0, %ymm3, %ymm0
+; AVX2-NEXT:    vcvtps2ph $4, %ymm0, %xmm0
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm2 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7],ymm1[8],ymm2[9],ymm1[10],ymm2[11],ymm1[12],ymm2[13],ymm1[14],ymm2[15]
+; AVX2-NEXT:    vpsrld $16, %ymm1, %ymm1
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm1 = ymm1[0],ymm4[1],ymm1[2],ymm4[3],ymm1[4],ymm4[5],ymm1[6],ymm4[7],ymm1[8],ymm4[9],ymm1[10],ymm4[11],ymm1[12],ymm4[13],ymm1[14],ymm4[15]
+; AVX2-NEXT:    vsubps %ymm5, %ymm1, %ymm1
+; AVX2-NEXT:    vaddps %ymm1, %ymm2, %ymm1
+; AVX2-NEXT:    vcvtps2ph $4, %ymm1, %xmm1
+; AVX2-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_strict_uitofp_v16i32_v16f16:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vcvtudq2ps %zmm0, %zmm0
+; AVX512-NEXT:    vcvtps2ph $4, %zmm0, %ymm0
+; AVX512-NEXT:    retq
+  %vec = tail call <16 x half> @llvm.experimental.constrained.uitofp.f16.i32(<16 x i32> %a, metadata !"round.dynamic", metadata !"fpexcept.strict")
+  ret <16 x half> %vec
+}
diff --git a/llvm/test/CodeGen/X86/unreachable-loop-sinking.ll b/llvm/test/CodeGen/X86/unreachable-loop-sinking.ll
index d784425..b09e202 100644
--- a/llvm/test/CodeGen/X86/unreachable-loop-sinking.ll
+++ b/llvm/test/CodeGen/X86/unreachable-loop-sinking.ll
@@ -7,9 +7,9 @@
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"
 target triple = "x86_64-unknown-linux-gnu"
 
-define double @fn1(ptr %arg, i64 %arg1) {
+define double @fn1(ptr %arg, i64 %arg1, i1 %arg2) {
 Entry:
-  br i1 undef, label %Body, label %Exit
+  br i1 %arg2, label %Body, label %Exit
 
 Exit:                                             ; preds = %Brancher7, %Entry
   ret double undef
diff --git a/llvm/test/CodeGen/X86/update-terminator.mir b/llvm/test/CodeGen/X86/update-terminator.mir
index d26f7975..ff5df9a 100644
--- a/llvm/test/CodeGen/X86/update-terminator.mir
+++ b/llvm/test/CodeGen/X86/update-terminator.mir
@@ -10,14 +10,14 @@
   declare void @dummy3()
 
   ; Function Attrs: nounwind
-  define void @f2() {
-    br i1 undef, label %bb1, label %bb3
+  define void @f2(i1 %arg) {
+    br i1 %arg, label %bb1, label %bb3
 
   bb1:
     call void @dummy1()
     call void @dummy1()
     call void @dummy1()
-    br i1 undef, label %bb2, label %bb2
+    br i1 %arg, label %bb2, label %bb2
 
   bb2:
     call void @dummy2()
diff --git a/llvm/test/CodeGen/X86/vec_saddo.ll b/llvm/test/CodeGen/X86/vec_saddo.ll
index 460c5fe..78dd2cf 100644
--- a/llvm/test/CodeGen/X86/vec_saddo.ll
+++ b/llvm/test/CodeGen/X86/vec_saddo.ll
@@ -517,7 +517,7 @@ define <16 x i32> @saddo_v16i32(<16 x i32> %a0, <16 x i32> %a1, ptr %p2) nounwin
 ; AVX512-NEXT:    vpaddd %zmm1, %zmm0, %zmm1
 ; AVX512-NEXT:    vpcmpgtd %zmm1, %zmm0, %k1
 ; AVX512-NEXT:    kxorw %k1, %k0, %k1
-; AVX512-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512-NEXT:    vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
 ; AVX512-NEXT:    vmovdqa64 %zmm1, (%rdi)
 ; AVX512-NEXT:    retq
   %t = call {<16 x i32>, <16 x i1>} @llvm.sadd.with.overflow.v16i32(<16 x i32> %a0, <16 x i32> %a1)
@@ -647,7 +647,7 @@ define <16 x i32> @saddo_v16i8(<16 x i8> %a0, <16 x i8> %a1, ptr %p2) nounwind {
 ; AVX512-NEXT:    vpaddsb %xmm1, %xmm0, %xmm2
 ; AVX512-NEXT:    vpaddb %xmm1, %xmm0, %xmm1
 ; AVX512-NEXT:    vpcmpneqb %xmm2, %xmm1, %k1
-; AVX512-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512-NEXT:    vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
 ; AVX512-NEXT:    vmovdqa %xmm1, (%rdi)
 ; AVX512-NEXT:    retq
   %t = call {<16 x i8>, <16 x i1>} @llvm.sadd.with.overflow.v16i8(<16 x i8> %a0, <16 x i8> %a1)
@@ -993,7 +993,7 @@ define <4 x i32> @saddo_v4i24(<4 x i24> %a0, <4 x i24> %a1, ptr %p2) nounwind {
 ; AVX512-NEXT:    vpslld $8, %xmm1, %xmm0
 ; AVX512-NEXT:    vpsrad $8, %xmm0, %xmm0
 ; AVX512-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
-; AVX512-NEXT:    vpternlogq $15, %xmm0, %xmm0, %xmm0
+; AVX512-NEXT:    vpternlogq {{.*#+}} xmm0 = ~xmm0
 ; AVX512-NEXT:    vpextrd $3, %xmm1, %eax
 ; AVX512-NEXT:    movw %ax, 9(%rdi)
 ; AVX512-NEXT:    vpextrd $2, %xmm1, %ecx
diff --git a/llvm/test/CodeGen/X86/vec_ssubo.ll b/llvm/test/CodeGen/X86/vec_ssubo.ll
index d06993d..746c09e 100644
--- a/llvm/test/CodeGen/X86/vec_ssubo.ll
+++ b/llvm/test/CodeGen/X86/vec_ssubo.ll
@@ -522,7 +522,7 @@ define <16 x i32> @ssubo_v16i32(<16 x i32> %a0, <16 x i32> %a1, ptr %p2) nounwin
 ; AVX512-NEXT:    vpsubd %zmm1, %zmm0, %zmm1
 ; AVX512-NEXT:    vpcmpgtd %zmm1, %zmm0, %k1
 ; AVX512-NEXT:    kxorw %k1, %k0, %k1
-; AVX512-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512-NEXT:    vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
 ; AVX512-NEXT:    vmovdqa64 %zmm1, (%rdi)
 ; AVX512-NEXT:    retq
   %t = call {<16 x i32>, <16 x i1>} @llvm.ssub.with.overflow.v16i32(<16 x i32> %a0, <16 x i32> %a1)
@@ -652,7 +652,7 @@ define <16 x i32> @ssubo_v16i8(<16 x i8> %a0, <16 x i8> %a1, ptr %p2) nounwind {
 ; AVX512-NEXT:    vpsubsb %xmm1, %xmm0, %xmm2
 ; AVX512-NEXT:    vpsubb %xmm1, %xmm0, %xmm1
 ; AVX512-NEXT:    vpcmpneqb %xmm2, %xmm1, %k1
-; AVX512-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512-NEXT:    vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
 ; AVX512-NEXT:    vmovdqa %xmm1, (%rdi)
 ; AVX512-NEXT:    retq
   %t = call {<16 x i8>, <16 x i1>} @llvm.ssub.with.overflow.v16i8(<16 x i8> %a0, <16 x i8> %a1)
@@ -1010,7 +1010,7 @@ define <4 x i32> @ssubo_v4i24(<4 x i24> %a0, <4 x i24> %a1, ptr %p2) nounwind {
 ; AVX512-NEXT:    vpslld $8, %xmm1, %xmm0
 ; AVX512-NEXT:    vpsrad $8, %xmm0, %xmm0
 ; AVX512-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
-; AVX512-NEXT:    vpternlogq $15, %xmm0, %xmm0, %xmm0
+; AVX512-NEXT:    vpternlogq {{.*#+}} xmm0 = ~xmm0
 ; AVX512-NEXT:    vpextrd $3, %xmm1, %eax
 ; AVX512-NEXT:    movw %ax, 9(%rdi)
 ; AVX512-NEXT:    vpextrd $2, %xmm1, %ecx
diff --git a/llvm/test/CodeGen/X86/vec_uaddo.ll b/llvm/test/CodeGen/X86/vec_uaddo.ll
index bac1180..be7888c 100644
--- a/llvm/test/CodeGen/X86/vec_uaddo.ll
+++ b/llvm/test/CodeGen/X86/vec_uaddo.ll
@@ -604,7 +604,7 @@ define <16 x i32> @uaddo_v16i32(<16 x i32> %a0, <16 x i32> %a1, ptr %p2) nounwin
 ; AVX512:       # %bb.0:
 ; AVX512-NEXT:    vpaddd %zmm1, %zmm0, %zmm1
 ; AVX512-NEXT:    vpcmpltud %zmm0, %zmm1, %k1
-; AVX512-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512-NEXT:    vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
 ; AVX512-NEXT:    vmovdqa64 %zmm1, (%rdi)
 ; AVX512-NEXT:    retq
   %t = call {<16 x i32>, <16 x i1>} @llvm.uadd.with.overflow.v16i32(<16 x i32> %a0, <16 x i32> %a1)
@@ -730,7 +730,7 @@ define <16 x i32> @uaddo_v16i8(<16 x i8> %a0, <16 x i8> %a1, ptr %p2) nounwind {
 ; AVX512:       # %bb.0:
 ; AVX512-NEXT:    vpaddb %xmm1, %xmm0, %xmm1
 ; AVX512-NEXT:    vpcmpltub %xmm0, %xmm1, %k1
-; AVX512-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512-NEXT:    vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
 ; AVX512-NEXT:    vmovdqa %xmm1, (%rdi)
 ; AVX512-NEXT:    retq
   %t = call {<16 x i8>, <16 x i1>} @llvm.uadd.with.overflow.v16i8(<16 x i8> %a0, <16 x i8> %a1)
@@ -1046,7 +1046,7 @@ define <4 x i32> @uaddo_v4i24(<4 x i24> %a0, <4 x i24> %a1, ptr %p2) nounwind {
 ; AVX512-NEXT:    vpaddd %xmm1, %xmm0, %xmm1
 ; AVX512-NEXT:    vpand %xmm2, %xmm1, %xmm0
 ; AVX512-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
-; AVX512-NEXT:    vpternlogq $15, %xmm0, %xmm0, %xmm0
+; AVX512-NEXT:    vpternlogq {{.*#+}} xmm0 = ~xmm0
 ; AVX512-NEXT:    vpextrd $3, %xmm1, %eax
 ; AVX512-NEXT:    movw %ax, 9(%rdi)
 ; AVX512-NEXT:    vpextrd $2, %xmm1, %ecx
diff --git a/llvm/test/CodeGen/X86/vec_usubo.ll b/llvm/test/CodeGen/X86/vec_usubo.ll
index ab75ada..ceb1ad1 100644
--- a/llvm/test/CodeGen/X86/vec_usubo.ll
+++ b/llvm/test/CodeGen/X86/vec_usubo.ll
@@ -647,7 +647,7 @@ define <16 x i32> @usubo_v16i32(<16 x i32> %a0, <16 x i32> %a1, ptr %p2) nounwin
 ; AVX512:       # %bb.0:
 ; AVX512-NEXT:    vpsubd %zmm1, %zmm0, %zmm1
 ; AVX512-NEXT:    vpcmpnleud %zmm0, %zmm1, %k1
-; AVX512-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512-NEXT:    vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
 ; AVX512-NEXT:    vmovdqa64 %zmm1, (%rdi)
 ; AVX512-NEXT:    retq
   %t = call {<16 x i32>, <16 x i1>} @llvm.usub.with.overflow.v16i32(<16 x i32> %a0, <16 x i32> %a1)
@@ -773,7 +773,7 @@ define <16 x i32> @usubo_v16i8(<16 x i8> %a0, <16 x i8> %a1, ptr %p2) nounwind {
 ; AVX512:       # %bb.0:
 ; AVX512-NEXT:    vpsubb %xmm1, %xmm0, %xmm1
 ; AVX512-NEXT:    vpcmpnleub %xmm0, %xmm1, %k1
-; AVX512-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512-NEXT:    vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
 ; AVX512-NEXT:    vmovdqa %xmm1, (%rdi)
 ; AVX512-NEXT:    retq
   %t = call {<16 x i8>, <16 x i1>} @llvm.usub.with.overflow.v16i8(<16 x i8> %a0, <16 x i8> %a1)
@@ -1093,7 +1093,7 @@ define <4 x i32> @usubo_v4i24(<4 x i24> %a0, <4 x i24> %a1, ptr %p2) nounwind {
 ; AVX512-NEXT:    vpsubd %xmm1, %xmm0, %xmm1
 ; AVX512-NEXT:    vpand %xmm2, %xmm1, %xmm0
 ; AVX512-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
-; AVX512-NEXT:    vpternlogq $15, %xmm0, %xmm0, %xmm0
+; AVX512-NEXT:    vpternlogq {{.*#+}} xmm0 = ~xmm0
 ; AVX512-NEXT:    vpextrd $3, %xmm1, %eax
 ; AVX512-NEXT:    movw %ax, 9(%rdi)
 ; AVX512-NEXT:    vpextrd $2, %xmm1, %ecx
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll
index 4f42d5c..15e287d 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll
@@ -4129,6 +4129,62 @@ define <32 x i8> @shuffle_v32i8_56_zz_zz_zz_57_zz_zz_zz_58_zz_zz_zz__zz_59_zz_zz
   ret <32 x i8> %shuffle
 }
 
+; PR121823
+define <32 x i8> @shuffle_v32i8_01_09_00_03_11_02_05_13_04_07_15_06_17_25_16_19_27_18_21_29_20_23_31_22_zz_zz_zz_zz_zz_zz_zz_zz(<32 x i8> %a)  {
+; AVX1-LABEL: shuffle_v32i8_01_09_00_03_11_02_05_13_04_07_15_06_17_25_16_19_27_18_21_29_20_23_31_22_zz_zz_zz_zz_zz_zz_zz_zz:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[1,9,0,3]
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[1,9,0,3,11,2,5,13,4,7,15,6],zero,zero,zero,zero
+; AVX1-NEXT:    vpor %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[11,2,5,13,4,7,15,6],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v32i8_01_09_00_03_11_02_05_13_04_07_15_06_17_25_16_19_27_18_21_29_20_23_31_22_zz_zz_zz_zz_zz_zz_zz_zz:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[1,9,0,3,11,2,5,13,4,7,15,6,u,u,u,u,17,25,16,19,27,18,21,29,20,23,31,22,u,u,u,u]
+; AVX2-NEXT:    vpmovsxbd {{.*#+}} ymm1 = [0,1,2,4,5,6,0,0]
+; AVX2-NEXT:    vpermd %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
+; AVX2-NEXT:    retq
+;
+; AVX512VLBW-LABEL: shuffle_v32i8_01_09_00_03_11_02_05_13_04_07_15_06_17_25_16_19_27_18_21_29_20_23_31_22_zz_zz_zz_zz_zz_zz_zz_zz:
+; AVX512VLBW:       # %bb.0:
+; AVX512VLBW-NEXT:    vpshufb {{.*#+}} ymm1 = ymm0[1,9,0,3,11,2,5,13,4,7,15,6,u,u,u,u,17,25,16,19,27,18,21,29,20,23,31,22,u,u,u,u]
+; AVX512VLBW-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512VLBW-NEXT:    vpmovsxbd {{.*#+}} ymm0 = [0,1,2,4,5,6,14,15]
+; AVX512VLBW-NEXT:    vpermi2d %ymm2, %ymm1, %ymm0
+; AVX512VLBW-NEXT:    retq
+;
+; AVX512VLVBMI-LABEL: shuffle_v32i8_01_09_00_03_11_02_05_13_04_07_15_06_17_25_16_19_27_18_21_29_20_23_31_22_zz_zz_zz_zz_zz_zz_zz_zz:
+; AVX512VLVBMI:       # %bb.0:
+; AVX512VLVBMI-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX512VLVBMI-NEXT:    vmovdqa {{.*#+}} ymm2 = [1,9,0,3,11,2,5,13,4,7,15,6,17,25,16,19,27,18,21,29,20,23,31,22,56,57,58,59,60,61,62,63]
+; AVX512VLVBMI-NEXT:    vpermt2b %ymm1, %ymm2, %ymm0
+; AVX512VLVBMI-NEXT:    retq
+;
+; XOPAVX1-LABEL: shuffle_v32i8_01_09_00_03_11_02_05_13_04_07_15_06_17_25_16_19_27_18_21_29_20_23_31_22_zz_zz_zz_zz_zz_zz_zz_zz:
+; XOPAVX1:       # %bb.0:
+; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; XOPAVX1-NEXT:    vpperm {{.*#+}} xmm0 = xmm0[1,9,0,3,11,2,5,13,4,7,15,6],xmm1[1,9,0,3]
+; XOPAVX1-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[11,2,5,13,4,7,15,6],zero,zero,zero,zero,zero,zero,zero,zero
+; XOPAVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; XOPAVX1-NEXT:    retq
+;
+; XOPAVX2-LABEL: shuffle_v32i8_01_09_00_03_11_02_05_13_04_07_15_06_17_25_16_19_27_18_21_29_20_23_31_22_zz_zz_zz_zz_zz_zz_zz_zz:
+; XOPAVX2:       # %bb.0:
+; XOPAVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[1,9,0,3,11,2,5,13,4,7,15,6,u,u,u,u,17,25,16,19,27,18,21,29,20,23,31,22,u,u,u,u]
+; XOPAVX2-NEXT:    vpmovsxbd {{.*#+}} ymm1 = [0,1,2,4,5,6,0,0]
+; XOPAVX2-NEXT:    vpermd %ymm0, %ymm1, %ymm0
+; XOPAVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; XOPAVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
+; XOPAVX2-NEXT:    retq
+  %r = shufflevector <32 x i8> %a, <32 x i8> zeroinitializer, <32 x i32> <i32 1, i32 9, i32 0, i32 3, i32 11, i32 2, i32 5, i32 13, i32 4, i32 7, i32 15, i32 6, i32 17, i32 25, i32 16, i32 19, i32 27, i32 18, i32 21, i32 29, i32 20, i32 23, i32 31, i32 22, i32 32, i32 32, i32 32, i32 32, i32 48, i32 48, i32 48, i32 48>
+  ret <32 x i8> %r
+}
+
 define <32 x i8> @shuffle_v32i8_47_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_63_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30(<32 x i8> %a, <32 x i8> %b) {
 ; AVX1-LABEL: shuffle_v32i8_47_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_63_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30:
 ; AVX1:       # %bb.0:
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bwvl.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bwvl.ll
index f0b70ae..4125d78 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bwvl.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bwvl.ll
@@ -190,7 +190,7 @@ define i64 @PR55050() {
 entry:
   %i275 = call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> undef, <16 x i8> zeroinitializer)
   %i277 = call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> undef, <16 x i8> zeroinitializer)
-  br i1 undef, label %exit, label %if
+  br i1 poison, label %exit, label %if
 
 if:
   %i298 = bitcast <2 x i64> %i275 to <4 x i32>
diff --git a/llvm/test/CodeGen/X86/x86-shrink-wrapping.ll b/llvm/test/CodeGen/X86/x86-shrink-wrapping.ll
index fe7459e..928f29b 100644
--- a/llvm/test/CodeGen/X86/x86-shrink-wrapping.ll
+++ b/llvm/test/CodeGen/X86/x86-shrink-wrapping.ll
@@ -868,7 +868,7 @@ define void @infiniteloop() {
 ; DISABLE-NEXT:    popq %rbp
 ; DISABLE-NEXT:    retq
 entry:
-  br i1 undef, label %if.then, label %if.end
+  br i1 poison, label %if.then, label %if.end
 
 if.then:
   %ptr = alloca i32, i32 4
@@ -983,7 +983,7 @@ define void @infiniteloop2() {
 ; DISABLE-NEXT:    popq %rbp
 ; DISABLE-NEXT:    retq
 entry:
-  br i1 undef, label %if.then, label %if.end
+  br i1 poison, label %if.then, label %if.end
 
 if.then:
   %ptr = alloca i32, i32 4
@@ -994,7 +994,7 @@ for.body:                                         ; preds = %for.body, %entry
   %call = tail call i32 asm "movl $$1, $0", "=r,~{ebx}"()
   %add = add nsw i32 %call, %sum.03
   store i32 %add, ptr %ptr
-  br i1 undef, label %body1, label %body2
+  br i1 poison, label %body1, label %body2
 
 body1:
   tail call void asm sideeffect "nop", "~{ebx}"()
@@ -1074,10 +1074,10 @@ define void @infiniteloop3() {
 ; DISABLE-NEXT:  LBB12_7: ## %end
 ; DISABLE-NEXT:    retq
 entry:
-  br i1 undef, label %loop2a, label %body
+  br i1 poison, label %loop2a, label %body
 
 body:                                             ; preds = %entry
-  br i1 undef, label %loop2a, label %end
+  br i1 poison, label %loop2a, label %end
 
 loop1:                                            ; preds = %loop2a, %loop2b
   %var.phi = phi ptr [ %next.phi, %loop2b ], [ %var, %loop2a ]
diff --git a/llvm/test/DebugInfo/NVPTX/debug-addr-class.ll b/llvm/test/DebugInfo/NVPTX/debug-addr-class.ll
index 26ad597..82301e4 100644
--- a/llvm/test/DebugInfo/NVPTX/debug-addr-class.ll
+++ b/llvm/test/DebugInfo/NVPTX/debug-addr-class.ll
@@ -4,7 +4,7 @@
 @GLOBAL = addrspace(1) externally_initialized global i32 0, align 4, !dbg !0
 @SHARED = addrspace(3) externally_initialized global i32 undef, align 4, !dbg !6
 
-define void @test(float, ptr, ptr, i32) !dbg !17 {
+define ptx_kernel void @test(float, ptr, ptr, i32) !dbg !17 {
   %5 = alloca float, align 4
   %6 = alloca ptr, align 8
   %7 = alloca ptr, align 8
@@ -38,7 +38,6 @@ define void @test(float, ptr, ptr, i32) !dbg !17 {
 declare void @llvm.dbg.declare(metadata, metadata, metadata)
 
 !llvm.dbg.cu = !{!2}
-!nvvm.annotations = !{!10}
 !llvm.module.flags = !{!11, !12, !13, !14, !15}
 !llvm.ident = !{!16}
 
@@ -52,7 +51,6 @@ declare void @llvm.dbg.declare(metadata, metadata, metadata)
 !7 = distinct !DIGlobalVariable(name: "SHARED", scope: !2, file: !8, line: 4, type: !9, isLocal: false, isDefinition: true)
 !8 = !DIFile(filename: "test.cu", directory: "/tmp")
 !9 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
-!10 = !{ptr @test, !"kernel", i32 1}
 !11 = !{i32 2, !"Dwarf Version", i32 2}
 !12 = !{i32 2, !"Debug Info Version", i32 3}
 !13 = !{i32 1, !"wchar_size", i32 4}
diff --git a/llvm/test/DebugInfo/NVPTX/debug-info.ll b/llvm/test/DebugInfo/NVPTX/debug-info.ll
index 55c81ca..c926229 100644
--- a/llvm/test/DebugInfo/NVPTX/debug-info.ll
+++ b/llvm/test/DebugInfo/NVPTX/debug-info.ll
@@ -59,7 +59,7 @@
 ; CHECK: }
 
 ; Function Attrs: nounwind
-define void @_Z5saxpyifPfS_(i32 %n, float %a, ptr nocapture readonly %x, ptr nocapture %y) local_unnamed_addr #0 !dbg !566 {
+define ptx_kernel void @_Z5saxpyifPfS_(i32 %n, float %a, ptr nocapture readonly %x, ptr nocapture %y) local_unnamed_addr #0 !dbg !566 {
 entry:
   call void @llvm.dbg.value(metadata i32 %n, metadata !570, metadata !DIExpression()), !dbg !575
   call void @llvm.dbg.value(metadata float %a, metadata !571, metadata !DIExpression()), !dbg !576
@@ -8496,7 +8496,6 @@ attributes #2 = { nounwind readnone speculatable }
 attributes #3 = { nounwind }
 
 !llvm.dbg.cu = !{!0}
-!nvvm.annotations = !{!555, !556, !557, !556, !558, !558, !558, !558, !559, !559, !558}
 !llvm.module.flags = !{!560, !561, !562, !563}
 !llvm.ident = !{!564}
 !nvvm.internalize.after.link = !{}
@@ -9057,11 +9056,6 @@ attributes #3 = { nounwind }
 !552 = !DISubprogram(name: "tgammaf", linkageName: "_ZL7tgammaff", scope: !444, file: !444, line: 1592, type: !13, isLocal: true, isDefinition: false, flags: DIFlagPrototyped, isOptimized: true)
 !553 = !DIImportedEntity(tag: DW_TAG_imported_declaration, scope: !5, entity: !554, file: !445, line: 459)
 !554 = !DISubprogram(name: "truncf", linkageName: "_ZL6truncff", scope: !462, file: !462, line: 662, type: !13, isLocal: true, isDefinition: false, flags: DIFlagPrototyped, isOptimized: true)
-!555 = !{ptr @_Z5saxpyifPfS_, !"kernel", i32 1}
-!556 = !{null, !"align", i32 8}
-!557 = !{null, !"align", i32 8, !"align", i32 65544, !"align", i32 131080}
-!558 = !{null, !"align", i32 16}
-!559 = !{null, !"align", i32 16, !"align", i32 65552, !"align", i32 131088}
 !560 = !{i32 2, !"Dwarf Version", i32 2}
 !561 = !{i32 2, !"Debug Info Version", i32 3}
 !562 = !{i32 1, !"wchar_size", i32 4}
diff --git a/llvm/test/DebugInfo/X86/dwarf5-debug-names-addr-tu-to-non-tu.ll b/llvm/test/DebugInfo/X86/dwarf5-debug-names-addr-tu-to-non-tu.ll
new file mode 100644
index 0000000..a836b2a
--- /dev/null
+++ b/llvm/test/DebugInfo/X86/dwarf5-debug-names-addr-tu-to-non-tu.ll
@@ -0,0 +1,83 @@
+; RUN: llc -filetype=obj -O0 -generate-type-units -mtriple=x86_64-unknown-linux-gnu < %s \
+; RUN:     | llvm-dwarfdump -debug-info -debug-names - \
+; RUN:     | FileCheck %s
+
+;; Test that an entry in the debug names table gets created for a top level DIE when the creation of TU fails.
+
+;; clang++ -O0 main.cpp -gdwarf-5 -fdebug-types-section -gpubnames -S -emit-llvm -glldb -o main.ll
+;; int foo;
+;; namespace {
+;; struct t1 {};
+;; } // namespace
+;; template <int *> struct t2 {
+;;   t1 v1;
+;; };
+;; struct t3 {
+;;   t2<&foo> v1;
+;; };
+;; t3 v1;
+
+; CHECK: [[OFFSET:0x[0-9a-f]*]]:   DW_TAG_structure_type
+; CHECK: [[OFFSET1:0x[0-9a-f]*]]:   DW_TAG_structure_type
+
+; CHECK:        Bucket 0 [
+; CHECK-NEXT:    Name 1 {
+; CHECK-NEXT:      Hash: {{.+}}
+; CHECK-NEXT:      String: {{.+}} "t3"
+; CHECK-NEXT:      Entry @ {{.+}} {
+; CHECK-NEXT:        Abbrev: 0x1
+; CHECK-NEXT:        Tag: DW_TAG_structure_type
+; CHECK-NEXT:        DW_IDX_die_offset: [[OFFSET]]
+; CHECK-NEXT:        DW_IDX_parent: <parent not indexed>
+
+; CHECK:        Name 5 {
+; CHECK-NEXT:      Hash: {{.+}}
+; CHECK-NEXT:      String: {{.+}} "t2<&foo>"
+; CHECK-NEXT:      Entry @ 0xe1 {
+; CHECK-NEXT:        Abbrev: 0x1
+; CHECK-NEXT:        Tag: DW_TAG_structure_type
+; CHECK-NEXT:        DW_IDX_die_offset: [[OFFSET1]]
+; CHECK-NEXT:        DW_IDX_parent: <parent not indexed>
+
+; ModuleID = 'main.cpp'
+source_filename = "main.cpp"
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+%struct.t3 = type { i8 }
+
+@foo = dso_local global i32 0, align 4, !dbg !0
+@v1 = dso_local global %struct.t3 zeroinitializer, align 1, !dbg !5
+
+!llvm.dbg.cu = !{!2}
+!llvm.module.flags = !{!20, !21, !22, !23, !24, !25, !26}
+!llvm.ident = !{!27}
+
+!0 = !DIGlobalVariableExpression(var: !1, expr: !DIExpression())
+!1 = distinct !DIGlobalVariable(name: "foo", scope: !2, file: !3, line: 1, type: !19, isLocal: false, isDefinition: true)
+!2 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: !3, producer: "clang version 20.0.0git (git@github.com:llvm/llvm-project.git ba373096e8ac83a7136fc44bc4e71a7bc53417a6)", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, globals: !4, splitDebugInlining: false, sysroot: "/")
+!3 = !DIFile(filename: "main.cpp", directory: "/StructuredType", checksumkind: CSK_MD5, checksum: "f91f8d905197b1c0309da9526bc4776e")
+!4 = !{!0, !5}
+!5 = !DIGlobalVariableExpression(var: !6, expr: !DIExpression())
+!6 = distinct !DIGlobalVariable(name: "v1", scope: !2, file: !3, line: 11, type: !7, isLocal: false, isDefinition: true)
+!7 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "t3", file: !3, line: 8, size: 8, flags: DIFlagTypePassByValue, elements: !8, identifier: "_ZTS2t3")
+!8 = !{!9}
+!9 = !DIDerivedType(tag: DW_TAG_member, name: "v1", scope: !7, file: !3, line: 9, baseType: !10, size: 8)
+!10 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "t2<&foo>", file: !3, line: 5, size: 8, flags: DIFlagTypePassByValue, elements: !11, templateParams: !16, identifier: "_ZTS2t2IXadL_Z3fooEEE")
+!11 = !{!12}
+!12 = !DIDerivedType(tag: DW_TAG_member, name: "v1", scope: !10, file: !3, line: 6, baseType: !13, size: 8)
+!13 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "t1", scope: !14, file: !3, line: 3, size: 8, flags: DIFlagTypePassByValue, elements: !15)
+!14 = !DINamespace(scope: null)
+!15 = !{}
+!16 = !{!17}
+!17 = !DITemplateValueParameter(type: !18, value: ptr @foo)
+!18 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !19, size: 64)
+!19 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
+!20 = !{i32 7, !"Dwarf Version", i32 5}
+!21 = !{i32 2, !"Debug Info Version", i32 3}
+!22 = !{i32 1, !"wchar_size", i32 4}
+!23 = !{i32 8, !"PIC Level", i32 2}
+!24 = !{i32 7, !"PIE Level", i32 2}
+!25 = !{i32 7, !"uwtable", i32 2}
+!26 = !{i32 7, !"frame-pointer", i32 2}
+!27 = !{!"clang version 20.0.0git (git@github.com:llvm/llvm-project.git ba373096e8ac83a7136fc44bc4e71a7bc53417a6)"}
diff --git a/llvm/test/ExecutionEngine/JITLink/AArch32/ELF_data_alignment.s b/llvm/test/ExecutionEngine/JITLink/AArch32/ELF_data_alignment.s
index b4f6e04..9296f04 100644
--- a/llvm/test/ExecutionEngine/JITLink/AArch32/ELF_data_alignment.s
+++ b/llvm/test/ExecutionEngine/JITLink/AArch32/ELF_data_alignment.s
@@ -1,16 +1,18 @@
 # REQUIRES: asserts
 # RUN: llvm-mc -triple=armv7-linux-gnueabi -arm-add-build-attributes -filetype=obj -o %t_armv7.o %s
 # RUN: llvm-objdump -s --section=.rodata %t_armv7.o | FileCheck --check-prefix=CHECK-OBJ %s
-# RUN: llvm-jitlink -noexec -slab-address 0x76ff0000 -slab-allocate 10Kb \
-# RUN:              -slab-page-size 4096 %t_armv7.o -debug-only=jitlink 2>&1 \
+# RUN: llvm-jitlink -num-threads=0 -debug-only=jitlink -noexec \
+# RUN:              -slab-address 0x76ff0000 -slab-allocate 10Kb \
+# RUN:              -slab-page-size 4096 %t_armv7.o 2>&1 \
 # RUN:              | FileCheck --check-prefix=CHECK-LG %s
 # RUN: llvm-jitlink -noexec -slab-address 0x76ff0000 -slab-allocate 10Kb \
 # RUN:              -slab-page-size 4096 %t_armv7.o -check %s
 
 # RUN: llvm-mc -triple=thumbv7-linux-gnueabi -arm-add-build-attributes -filetype=obj -o %t_thumbv7.o %s
 # RUN: llvm-objdump -s --section=.rodata %t_thumbv7.o | FileCheck --check-prefix=CHECK-OBJ %s
-# RUN: llvm-jitlink -noexec -slab-address 0x76ff0000 -slab-allocate 10Kb \
-# RUN:              -slab-page-size 4096 %t_thumbv7.o -debug-only=jitlink 2>&1 \
+# RUN: llvm-jitlink -num-threads=0 -debug-only=jitlink -noexec \
+# RUN:              -slab-address 0x76ff0000 -slab-allocate 10Kb \
+# RUN:              -slab-page-size 4096 %t_thumbv7.o 2>&1 \
 # RUN:              | FileCheck --check-prefix=CHECK-LG %s
 # RUN: llvm-jitlink -noexec -slab-address 0x76ff0000 -slab-allocate 10Kb \
 # RUN:              -slab-page-size 4096 %t_thumbv7.o -check %s
diff --git a/llvm/test/ExecutionEngine/JITLink/AArch64/ELF_ehframe.s b/llvm/test/ExecutionEngine/JITLink/AArch64/ELF_ehframe.s
index 151a041..b25ffee 100644
--- a/llvm/test/ExecutionEngine/JITLink/AArch64/ELF_ehframe.s
+++ b/llvm/test/ExecutionEngine/JITLink/AArch64/ELF_ehframe.s
@@ -1,7 +1,7 @@
 # REQUIRES: asserts
 # RUN: llvm-mc -triple=aarch64-linux-gnu -filetype=obj -o %t %s
-# RUN: llvm-jitlink -noexec -phony-externals -debug-only=jitlink %t 2>&1 | \
-# RUN:   FileCheck %s
+# RUN: llvm-jitlink -num-threads=0 -debug-only=jitlink -noexec \
+# RUN:              -phony-externals %t 2>&1 | FileCheck %s
 #
 # Check that splitting of eh-frame sections works.
 #
diff --git a/llvm/test/ExecutionEngine/JITLink/AArch64/MachO_compact_unwind.s b/llvm/test/ExecutionEngine/JITLink/AArch64/MachO_compact_unwind.s
index 20534d5..b2adb85 100644
--- a/llvm/test/ExecutionEngine/JITLink/AArch64/MachO_compact_unwind.s
+++ b/llvm/test/ExecutionEngine/JITLink/AArch64/MachO_compact_unwind.s
@@ -1,6 +1,7 @@
 # REQUIRES: asserts
 # RUN: llvm-mc -triple=arm64-apple-ios -filetype=obj -o %t %s
-# RUN: llvm-jitlink -noexec -debug-only=jitlink %t 2>&1 | FileCheck %s
+# RUN: llvm-jitlink -num-threads=0 -debug-only=jitlink -noexec %t 2>&1 \
+# RUN:              | FileCheck %s
 #
 # Check that splitting of compact-unwind sections works.
 #
diff --git a/llvm/test/ExecutionEngine/JITLink/AArch64/MachO_ehframe.s b/llvm/test/ExecutionEngine/JITLink/AArch64/MachO_ehframe.s
index 8d43b0f..4e84518 100644
--- a/llvm/test/ExecutionEngine/JITLink/AArch64/MachO_ehframe.s
+++ b/llvm/test/ExecutionEngine/JITLink/AArch64/MachO_ehframe.s
@@ -1,7 +1,7 @@
 # REQUIRES: asserts
 # RUN: llvm-mc -triple=arm64-apple-darwin11 -filetype=obj -o %t %s
-# RUN: llvm-jitlink -noexec -phony-externals -debug-only=jitlink %t 2>&1 | \
-# RUN:   FileCheck %s
+# RUN: llvm-jitlink -num-threads=0 -debug-only=jitlink -noexec \
+# RUN:              -phony-externals %t 2>&1 | FileCheck %s
 #
 # Check that splitting of eh-frame sections works.
 #
diff --git a/llvm/test/ExecutionEngine/JITLink/LoongArch/ELF_loongarch32_relocations.s b/llvm/test/ExecutionEngine/JITLink/LoongArch/ELF_loongarch32_relocations.s
index 23f6acc3..da9f998 100644
--- a/llvm/test/ExecutionEngine/JITLink/LoongArch/ELF_loongarch32_relocations.s
+++ b/llvm/test/ExecutionEngine/JITLink/LoongArch/ELF_loongarch32_relocations.s
@@ -103,6 +103,30 @@ test_gotoffset12_external:
     ld.w $a0, $a0, %got_pc_lo12(external_data)
     .size test_gotoffset12_external, .-test_gotoffset12_external
 
+## Check R_LARCH_B16 relocation for compare and branch instructions.
+
+# jitlink-check: decode_operand(test_br16, 2)[17:0] = \
+# jitlink-check:   (test_br16_target - test_br16)[17:0]
+    .globl test_br16, test_br16_target
+    .p2align 2
+test_br16:
+    beq $t1, $t2, %b16(test_br16_target)
+    .skip (1 << 16)
+test_br16_target:
+    .size test_br16, .-test_br16
+
+## Check R_LARCH_B21 relocation for compare and branch instructions.
+
+# jitlink-check: decode_operand(test_br21, 1)[22:0] = \
+# jitlink-check:   (test_br21_target - test_br21)[22:0]
+    .globl test_br21, test_br21_target
+    .p2align 2
+test_br21:
+    beqz $t1, %b21(test_br21_target)
+    .skip (1 << 21)
+test_br21_target:
+    .size test_br21, .-test_br21
+
 
     .globl named_data
     .p2align 4
diff --git a/llvm/test/ExecutionEngine/JITLink/LoongArch/ELF_loongarch64_ehframe.s b/llvm/test/ExecutionEngine/JITLink/LoongArch/ELF_loongarch64_ehframe.s
index cc54585..806cdcf 100644
--- a/llvm/test/ExecutionEngine/JITLink/LoongArch/ELF_loongarch64_ehframe.s
+++ b/llvm/test/ExecutionEngine/JITLink/LoongArch/ELF_loongarch64_ehframe.s
@@ -1,7 +1,7 @@
 # REQUIRES: asserts
-# RUN: llvm-mc --triple=loongarch64-linux-gnu --filetype=obj -o %t %s
-# RUN: llvm-jitlink --noexec --phony-externals --debug-only=jitlink %t 2>&1 | \
-# RUN:   FileCheck %s
+# RUN: llvm-mc -triple=loongarch64-linux-gnu -filetype=obj -o %t %s
+# RUN: llvm-jitlink -num-threads=0 -debug-only=jitlink -noexec \
+# RUN:              -phony-externals %t 2>&1 | FileCheck %s
 
 ## Check that splitting of eh-frame sections works.
 
diff --git a/llvm/test/ExecutionEngine/JITLink/LoongArch/ELF_loongarch64_relocations.s b/llvm/test/ExecutionEngine/JITLink/LoongArch/ELF_loongarch64_relocations.s
index f07ac54..a390d1b 100644
--- a/llvm/test/ExecutionEngine/JITLink/LoongArch/ELF_loongarch64_relocations.s
+++ b/llvm/test/ExecutionEngine/JITLink/LoongArch/ELF_loongarch64_relocations.s
@@ -116,7 +116,6 @@ test_gotoffset12_external:
     ld.d $a0, $a0, %got_pc_lo12(external_data)
     .size test_gotoffset12_external, .-test_gotoffset12_external
 
-
 ## Check R_LARCH_CALL36 relocation of a local function call.
 
 # jitlink-check: decode_operand(local_func_call36, 1)[19:0] = \
@@ -130,6 +129,31 @@ local_func_call36:
     jirl $ra, $ra, 0
     .size local_func_call36, .-local_func_call36
 
+## Check R_LARCH_B16 relocation for compare and branch instructions.
+
+# jitlink-check: decode_operand(test_br16, 2)[17:0] = \
+# jitlink-check:   (test_br16_target - test_br16)[17:0]
+    .globl test_br16, test_br16_target
+    .p2align 2
+test_br16:
+    beq $t1, $t2, %b16(test_br16_target)
+    .skip (1 << 16)
+test_br16_target:
+    .size test_br16, .-test_br16
+
+## Check R_LARCH_B21 relocation for compare and branch instructions.
+
+# jitlink-check: decode_operand(test_br21, 1)[22:0] = \
+# jitlink-check:   (test_br21_target - test_br21)[22:0]
+    .globl test_br21, test_br21_target
+    .p2align 2
+test_br21:
+    beqz $t1, %b21(test_br21_target)
+    .skip (1 << 21)
+test_br21_target:
+    .size test_br21, .-test_br21
+
+
     .globl named_data
     .p2align 4
     .type named_data,@object
diff --git a/llvm/test/ExecutionEngine/JITLink/RISCV/ELF_relax_call.s b/llvm/test/ExecutionEngine/JITLink/RISCV/ELF_relax_call.s
index 480fbb8..2b5c9e3 100644
--- a/llvm/test/ExecutionEngine/JITLink/RISCV/ELF_relax_call.s
+++ b/llvm/test/ExecutionEngine/JITLink/RISCV/ELF_relax_call.s
@@ -1,15 +1,15 @@
 # REQUIRES: asserts
 # RUN: llvm-mc -triple=riscv32 -mattr=+relax -filetype=obj -o %t.rv32 %s
-# RUN: llvm-jitlink -noexec \
+# RUN: llvm-jitlink -num-threads=0 -debug-only=jitlink -noexec \
 # RUN:     -slab-allocate 100Kb -slab-address 0x1000 -slab-page-size 4096 \
-# RUN:     -debug-only=jitlink -check %s %t.rv32 \
-# RUN:    2>&1 | FileCheck %s
+# RUN:     -check %s %t.rv32 2>&1 \
+# RUN:     | FileCheck %s
 
 # RUN: llvm-mc -triple=riscv64 -mattr=+relax -filetype=obj -o %t.rv64 %s
-# RUN: llvm-jitlink -noexec \
+# RUN: llvm-jitlink -num-threads=0 -debug-only=jitlink -noexec \
 # RUN:     -slab-allocate 100Kb -slab-address 0x1000 -slab-page-size 4096 \
-# RUN:     -debug-only=jitlink -check %s %t.rv64 \
-# RUN:    2>&1 | FileCheck %s
+# RUN:     -check %s %t.rv64 2>&1 \
+# RUN:     | FileCheck %s
 
         .text
 
diff --git a/llvm/test/ExecutionEngine/JITLink/RISCV/ELF_relax_call_rvc.s b/llvm/test/ExecutionEngine/JITLink/RISCV/ELF_relax_call_rvc.s
index e8a2928..3bbfd55 100644
--- a/llvm/test/ExecutionEngine/JITLink/RISCV/ELF_relax_call_rvc.s
+++ b/llvm/test/ExecutionEngine/JITLink/RISCV/ELF_relax_call_rvc.s
@@ -1,43 +1,43 @@
 # REQUIRES: asserts
 # RUN: llvm-mc -triple=riscv32 -mattr=+relax,+c -filetype=obj -o %t.rv32 %s
-# RUN: llvm-jitlink -noexec \
+# RUN: llvm-jitlink -num-threads=0 -debug-only=jitlink -noexec \
 # RUN:     -slab-allocate 100Kb -slab-address 0x1000 -slab-page-size 4096 \
-# RUN:     -debug-only=jitlink -check %s %t.rv32 \
-# RUN:    2>&1 | FileCheck %s
-# RUN: llvm-jitlink -noexec \
+# RUN:     -check %s %t.rv32 2>&1 \
+# RUN:     | FileCheck %s
+# RUN: llvm-jitlink -num-threads=0 -debug-only=jitlink -noexec \
 # RUN:     -slab-allocate 100Kb -slab-address 0x1000 -slab-page-size 4096 \
-# RUN:     -debug-only=jitlink -check %s -check-name=jitlink-check-rv32 %t.rv32 \
-# RUN:     2>&1 | FileCheck -check-prefix=CHECK-RV32 %s
+# RUN:     -check %s -check-name=jitlink-check-rv32 %t.rv32 2>&1 \
+# RUN:     | FileCheck -check-prefix=CHECK-RV32 %s
 
 # RUN: llvm-mc -triple=riscv64 -mattr=+relax,+c -filetype=obj -o %t.rv64 %s
-# RUN: llvm-jitlink -noexec \
+# RUN: llvm-jitlink -num-threads=0 -debug-only=jitlink -noexec \
 # RUN:     -slab-allocate 100Kb -slab-address 0x1000 -slab-page-size 4096 \
-# RUN:     -debug-only=jitlink -check %s %t.rv64 \
-# RUN:     2>&1 | FileCheck %s
-# RUN: llvm-jitlink -noexec \
+# RUN:     -check %s %t.rv64 2>&1 \
+# RUN:     | FileCheck %s
+# RUN: llvm-jitlink -num-threads=0 -debug-only=jitlink -noexec \
 # RUN:     -slab-allocate 100Kb -slab-address 0x1000 -slab-page-size 4096 \
-# RUN:     -debug-only=jitlink -check %s -check-name=jitlink-check-rv64 %t.rv64 \
-# RUN:     2>&1 | FileCheck -check-prefix=CHECK-RV64 %s
+# RUN:     -check %s -check-name=jitlink-check-rv64 %t.rv64 2>&1 \
+# RUN:     | FileCheck -check-prefix=CHECK-RV64 %s
 
 # RUN: llvm-mc -triple=riscv32 -mattr=+relax,+zca -filetype=obj -o %t.rv32zca %s
-# RUN: llvm-jitlink -noexec \
+# RUN: llvm-jitlink -num-threads=0 -debug-only=jitlink -noexec \
 # RUN:     -slab-allocate 100Kb -slab-address 0x1000 -slab-page-size 4096 \
-# RUN:     -debug-only=jitlink -check %s %t.rv32zca \
-# RUN:    2>&1 | FileCheck %s
-# RUN: llvm-jitlink -noexec \
+# RUN:     -check %s %t.rv32zca 2>&1 \
+# RUN:     | FileCheck %s
+# RUN: llvm-jitlink -num-threads=0 -debug-only=jitlink -noexec \
 # RUN:     -slab-allocate 100Kb -slab-address 0x1000 -slab-page-size 4096 \
-# RUN:     -debug-only=jitlink -check %s -check-name=jitlink-check-rv32 %t.rv32zca \
-# RUN:     2>&1 | FileCheck -check-prefix=CHECK-RV32 %s
+# RUN:     -check %s -check-name=jitlink-check-rv32 %t.rv32zca 2>&1 \
+# RUN:     | FileCheck -check-prefix=CHECK-RV32 %s
 
 # RUN: llvm-mc -triple=riscv64 -mattr=+relax,+c -filetype=obj -o %t.rv64 %s
-# RUN: llvm-jitlink -noexec \
+# RUN: llvm-jitlink -num-threads=0 -debug-only=jitlink -noexec \
 # RUN:     -slab-allocate 100Kb -slab-address 0x1000 -slab-page-size 4096 \
-# RUN:     -debug-only=jitlink -check %s %t.rv64 \
-# RUN:     2>&1 | FileCheck %s
-# RUN: llvm-jitlink -noexec \
+# RUN:     -check %s %t.rv64 2>&1 \
+# RUN:     | FileCheck %s
+# RUN: llvm-jitlink -num-threads=0 -debug-only=jitlink -noexec \
 # RUN:     -slab-allocate 100Kb -slab-address 0x1000 -slab-page-size 4096 \
-# RUN:     -debug-only=jitlink -check %s -check-name=jitlink-check-rv64 %t.rv64 \
-# RUN:     2>&1 | FileCheck -check-prefix=CHECK-RV64 %s
+# RUN:     -check %s -check-name=jitlink-check-rv64 %t.rv64 2>&1 \
+# RUN:     | FileCheck -check-prefix=CHECK-RV64 %s
 
         .text
 
diff --git a/llvm/test/ExecutionEngine/JITLink/RISCV/riscv_reloc_add.s b/llvm/test/ExecutionEngine/JITLink/RISCV/ELF_reloc_add.s
index 13689b6..01f9e7e 100644
--- a/llvm/test/ExecutionEngine/JITLink/RISCV/riscv_reloc_add.s
+++ b/llvm/test/ExecutionEngine/JITLink/RISCV/ELF_reloc_add.s
@@ -1,6 +1,8 @@
 # RUN: rm -rf %t && mkdir -p %t
-# RUN: llvm-mc -triple=riscv64 -filetype=obj -o %t/riscv64_reloc_add.o %s
-# RUN: llvm-mc -triple=riscv32 -filetype=obj -o %t/riscv32_reloc_add.o %s
+# RUN: llvm-mc -triple=riscv64 -mattr=+relax -filetype=obj \
+# RUN:     -o %t/riscv64_reloc_add.o %s
+# RUN: llvm-mc -triple=riscv32 -mattr=+relax -filetype=obj \
+# RUN:     -o %t/riscv32_reloc_add.o %s
 # RUN: llvm-jitlink -noexec -check %s %t/riscv64_reloc_add.o \
 # RUN:     -slab-allocate=1Mb -slab-address=0x1000 -slab-page-size=0x1000
 # RUN: llvm-jitlink -noexec -check %s %t/riscv32_reloc_add.o \
diff --git a/llvm/test/ExecutionEngine/JITLink/RISCV/anonymous_symbol.s b/llvm/test/ExecutionEngine/JITLink/RISCV/anonymous_symbol.s
index e7114e4..a1badfd 100644
--- a/llvm/test/ExecutionEngine/JITLink/RISCV/anonymous_symbol.s
+++ b/llvm/test/ExecutionEngine/JITLink/RISCV/anonymous_symbol.s
@@ -1,6 +1,7 @@
 # REQUIRES: asserts
 # RUN: llvm-mc -triple=riscv64 -filetype=obj -o %t %s
-# RUN: llvm-jitlink -debug-only=jitlink -noexec %t 2>&1 | FileCheck %s
+# RUN: llvm-jitlink -debug-only=jitlink -num-threads=0 -noexec %t 2>&1 \
+# RUN:              | FileCheck %s
 #
 # Because of the exist of cfi directive, sections like eh_frame section will be emitted
 # in llvm's object code emission phase. Anonymous symbols will also be emitted to indicate
diff --git a/llvm/test/ExecutionEngine/JITLink/ppc64/ELF_ppc64_ehframe.s b/llvm/test/ExecutionEngine/JITLink/ppc64/ELF_ppc64_ehframe.s
index 9e9b340..75f09ff 100644
--- a/llvm/test/ExecutionEngine/JITLink/ppc64/ELF_ppc64_ehframe.s
+++ b/llvm/test/ExecutionEngine/JITLink/ppc64/ELF_ppc64_ehframe.s
@@ -1,10 +1,12 @@
 # REQUIRES: asserts
 # RUN: llvm-mc -triple=powerpc64le-unknown-linux-gnu -filetype=obj -o %t %s
-# RUN: llvm-jitlink -noexec -phony-externals -debug-only=jitlink %t 2>&1 | \
-# RUN:   FileCheck %s
+# RUN: llvm-jitlink -num-threads=0 -debug-only=jitlink -noexec -phony-externals \
+# RUN:              %t 2>&1 \
+# RUN:              | FileCheck %s
 # RUN: llvm-mc -triple=powerpc64-unknown-linux-gnu -filetype=obj -o %t %s
-# RUN: llvm-jitlink -noexec -phony-externals -debug-only=jitlink %t 2>&1 | \
-# RUN:   FileCheck %s
+# RUN: llvm-jitlink -num-threads=0 -debug-only=jitlink -noexec -phony-externals \
+# RUN:              %t 2>&1 \
+# RUN:              | FileCheck %s
 #
 # Check that splitting of eh-frame sections works.
 #
diff --git a/llvm/test/ExecutionEngine/JITLink/ppc64/external_weak.s b/llvm/test/ExecutionEngine/JITLink/ppc64/external_weak.s
index 0bc9090..7021a27 100644
--- a/llvm/test/ExecutionEngine/JITLink/ppc64/external_weak.s
+++ b/llvm/test/ExecutionEngine/JITLink/ppc64/external_weak.s
@@ -4,8 +4,9 @@
 # RUN:   %t/external_weak.o %S/Inputs/external_weak.s
 # RUN: llvm-mc -triple=powerpc64le-unknown-linux-gnu -filetype=obj -o \
 # RUN:   %t/external_weak_main.o %S/Inputs/external_weak_main.s
-# RUN: llvm-jitlink -noexec -debug-only=jitlink %t/external_weak.o \
-# RUN:   %t/external_weak_main.o 2>&1 | FileCheck %s
+# RUN: llvm-jitlink -num-threads=0 -debug-only=jitlink -noexec \
+# RUN:              %t/external_weak.o %t/external_weak_main.o 2>&1 \
+# RUN:              | FileCheck %s
 # CHECK: Created ELFLinkGraphBuilder for "{{.*}}external_weak_main.o"
 # CHECK: Creating defined graph symbol for ELF symbol "foo"
 # CHECK: External symbols:
diff --git a/llvm/test/ExecutionEngine/JITLink/x86-64/COFF_abs.s b/llvm/test/ExecutionEngine/JITLink/x86-64/COFF_abs.s
index 830a2e0..d69dbbd 100644
--- a/llvm/test/ExecutionEngine/JITLink/x86-64/COFF_abs.s
+++ b/llvm/test/ExecutionEngine/JITLink/x86-64/COFF_abs.s
@@ -1,6 +1,7 @@
 # REQUIRES: asserts
 # RUN: llvm-mc -filetype=obj -triple=x86_64-windows-msvc %s -o %t
-# RUN: llvm-jitlink --debug-only=jitlink -noexec %t 2>&1 | FileCheck %s
+# RUN: llvm-jitlink -num-threads=0 -debug-only=jitlink -noexec %t 2>&1 \
+# RUN:              | FileCheck %s
 #
 # Check absolute symbol is created with a correct value.
 #
diff --git a/llvm/test/ExecutionEngine/JITLink/x86-64/COFF_comdat_any.test b/llvm/test/ExecutionEngine/JITLink/x86-64/COFF_comdat_any.test
index 10f1182..b117674 100644
--- a/llvm/test/ExecutionEngine/JITLink/x86-64/COFF_comdat_any.test
+++ b/llvm/test/ExecutionEngine/JITLink/x86-64/COFF_comdat_any.test
@@ -1,7 +1,8 @@
 # REQUIRES: asserts
 # RUN: yaml2obj %s -o %t
-# RUN: llvm-jitlink -noexec --debug-only=jitlink -noexec %t 2>&1 | FileCheck %s
-# 
+# RUN: llvm-jitlink -num-threads=0 -debug-only=jitlink -noexec %t 2>&1 \
+# RUN:              | FileCheck %s
+#
 # Check a weak symbol is created for a COMDAT symbol with IMAGE_COMDAT_SELECT_ANY selection type.
 #
 # CHECK: Creating graph symbols...
diff --git a/llvm/test/ExecutionEngine/JITLink/x86-64/COFF_comdat_associative.test b/llvm/test/ExecutionEngine/JITLink/x86-64/COFF_comdat_associative.test
index 7dfb4c7..8915d04 100644
--- a/llvm/test/ExecutionEngine/JITLink/x86-64/COFF_comdat_associative.test
+++ b/llvm/test/ExecutionEngine/JITLink/x86-64/COFF_comdat_associative.test
@@ -1,16 +1,19 @@
 # REQUIRES: asserts
 # RUN: yaml2obj %s -o %t
-# RUN: llvm-jitlink -noexec --debug-only=jitlink -noexec %t 2>&1
-# 
+# RUN: llvm-jitlink -num-threads=0 -debug-only=jitlink -noexec %t 2>&1 \
+# RUN:              | FileCheck %s
+#
 # Check COMDAT associative symbol is emitted as local symbol.
 #
-# CHECK: Creating graph symbols...
-# CHECK:      2: Creating defined graph symbol for COFF symbol ".text" in .text (index: 2)
-# CHECK-NEXT:   0x0 (block + 0x00000000): size: 0x00000001, linkage: strong, scope: local, dead  -   <anonymous symbol>
-# CHECK-NEXT: 4: Exporting COMDAT graph symbol for COFF symbol "func" in section 2
-# CHECK-NEXT:   0x0 (block + 0x00000000): size: 0x00000001, linkage: weak, scope: default, dead  -   func
-# CHECK-NEXT: 5: Creating defined graph symbol for COFF symbol ".xdata" in .xdata (index: 3)
-# CHECK-NEXT:   0x0 (block + 0x00000000): size: 0x00000000, linkage: strong, scope: local, dead  -   .xdata
+# CHECK:       Creating graph symbols...
+# CHECK:         0: Creating defined graph symbol for COFF symbol ".text" in .text (index: 1)
+# CHECK-NEXT:      0x0 (block + 0x00000000): size: 0x00000000, linkage: strong, scope: local, dead  -   .text
+# CHECK-NEXT:    4: Exporting COMDAT graph symbol for COFF symbol "func" in section 2
+# CHECK-NEXT:      0x0 (block + 0x00000000): size: 0x00000000, linkage: weak, scope: default, dead  -   func
+# CHECK-NEXT:    4: Creating defined graph symbol for COFF symbol "func" in .text (index: 2)
+# CHECK-NEXT:      0x0 (block + 0x00000000): size: 0x00000000, linkage: weak, scope: default, dead  -   func
+# CHECK-NEXT:    5: Creating defined graph symbol for COFF symbol ".xdata" in .xdata (index: 3)
+# CHECK-NEXT:      0x0 (block + 0x00000000): size: 0x00000000, linkage: strong, scope: local, dead  -   .xdata
 
 --- !COFF
 header:
diff --git a/llvm/test/ExecutionEngine/JITLink/x86-64/COFF_comdat_exact_match.test b/llvm/test/ExecutionEngine/JITLink/x86-64/COFF_comdat_exact_match.test
index f757271..76a0ac4 100644
--- a/llvm/test/ExecutionEngine/JITLink/x86-64/COFF_comdat_exact_match.test
+++ b/llvm/test/ExecutionEngine/JITLink/x86-64/COFF_comdat_exact_match.test
@@ -1,7 +1,8 @@
 # REQUIRES: asserts
 # RUN: yaml2obj %s -o %t
-# RUN: llvm-jitlink -noexec --debug-only=jitlink -noexec %t 2>&1 | FileCheck %s
-# 
+# RUN: llvm-jitlink -num-threads=0 -debug-only=jitlink -noexec %t 2>&1 \
+# RUN:              | FileCheck %s
+#
 # Check a weak symbol is created for a COMDAT symbol with IMAGE_COMDAT_SELECT_EXACT_MATCH selection type.
 # Doesn't check the content validation.
 #
diff --git a/llvm/test/ExecutionEngine/JITLink/x86-64/COFF_comdat_intervene.test b/llvm/test/ExecutionEngine/JITLink/x86-64/COFF_comdat_intervene.test
index 11a1825..79f4b15 100644
--- a/llvm/test/ExecutionEngine/JITLink/x86-64/COFF_comdat_intervene.test
+++ b/llvm/test/ExecutionEngine/JITLink/x86-64/COFF_comdat_intervene.test
@@ -1,7 +1,8 @@
 # REQUIRES: asserts
 # RUN: yaml2obj %s -o %t
-# RUN: llvm-jitlink -noexec --debug-only=jitlink -noexec %t 2>&1 | FileCheck %s
-# 
+# RUN: llvm-jitlink -num-threads=0 -debug-only=jitlink -noexec %t 2>&1 \
+# RUN:              | FileCheck %s
+#
 # Check a comdat export is done correctly even if second symbol of comdat sequences appear out of order
 #
 # CHECK: Creating graph symbols...
diff --git a/llvm/test/ExecutionEngine/JITLink/x86-64/COFF_comdat_largest.test b/llvm/test/ExecutionEngine/JITLink/x86-64/COFF_comdat_largest.test
index 86d809d..dc05297 100644
--- a/llvm/test/ExecutionEngine/JITLink/x86-64/COFF_comdat_largest.test
+++ b/llvm/test/ExecutionEngine/JITLink/x86-64/COFF_comdat_largest.test
@@ -1,7 +1,8 @@
 # REQUIRES: asserts
 # RUN: yaml2obj %s -o %t
-# RUN: llvm-jitlink -noexec --debug-only=jitlink -noexec %t 2>&1
-# 
+# RUN: llvm-jitlink -num-threads=0 -debug-only=jitlink -noexec %t 2>&1 \
+# RUN:              | FileCheck %s
+#
 # Check jitlink handles largest selection type as plain weak symbol.
 #
 # CHECK: Creating graph symbols...
diff --git a/llvm/test/ExecutionEngine/JITLink/x86-64/COFF_comdat_noduplicate.test b/llvm/test/ExecutionEngine/JITLink/x86-64/COFF_comdat_noduplicate.test
index 53b2c81..0c5313e 100644
--- a/llvm/test/ExecutionEngine/JITLink/x86-64/COFF_comdat_noduplicate.test
+++ b/llvm/test/ExecutionEngine/JITLink/x86-64/COFF_comdat_noduplicate.test
@@ -1,7 +1,8 @@
 # REQUIRES: asserts
 # RUN: yaml2obj %s -o %t
-# RUN: llvm-jitlink -noexec --debug-only=jitlink -noexec %t 2>&1 | FileCheck %s
-# 
+# RUN: llvm-jitlink -num-threads=0 -debug-only=jitlink -noexec -noexec %t 2>&1 \
+# RUN:              | FileCheck %s
+#
 # Check a strong symbol is created for a COMDAT symbol with IMAGE_COMDAT_SELECT_NODUPLICATES selection type.
 #
 # CHECK: Creating graph symbols...
diff --git a/llvm/test/ExecutionEngine/JITLink/x86-64/COFF_comdat_offset.test b/llvm/test/ExecutionEngine/JITLink/x86-64/COFF_comdat_offset.test
index 97467fd..6cd8ff9 100644
--- a/llvm/test/ExecutionEngine/JITLink/x86-64/COFF_comdat_offset.test
+++ b/llvm/test/ExecutionEngine/JITLink/x86-64/COFF_comdat_offset.test
@@ -1,7 +1,8 @@
 # REQUIRES: asserts
 # RUN: yaml2obj %s -o %t
-# RUN: llvm-jitlink -noexec --debug-only=jitlink -noexec %t 2>&1 | FileCheck %s
-# 
+# RUN: llvm-jitlink -num-threads=0 -debug-only=jitlink -noexec %t 2>&1 \
+# RUN:              | FileCheck %s
+#
 # Check a COMDAT symbol with an offset is handled correctly.
 #
 # CHECK: Creating graph symbols...
diff --git a/llvm/test/ExecutionEngine/JITLink/x86-64/COFF_comdat_same_size.test b/llvm/test/ExecutionEngine/JITLink/x86-64/COFF_comdat_same_size.test
index ef0f84a..e1d955f 100644
--- a/llvm/test/ExecutionEngine/JITLink/x86-64/COFF_comdat_same_size.test
+++ b/llvm/test/ExecutionEngine/JITLink/x86-64/COFF_comdat_same_size.test
@@ -1,7 +1,8 @@
 # REQUIRES: asserts
 # RUN: yaml2obj %s -o %t
-# RUN: llvm-jitlink -noexec --debug-only=jitlink -noexec %t 2>&1 | FileCheck %s
-# 
+# RUN: llvm-jitlink -num-threads=0 -debug-only=jitlink -noexec %t 2>&1 \
+# RUN:              | FileCheck %s
+#
 # Check a weak symbol is created for a COMDAT symbol with IMAGE_COMDAT_SELECT_SAME_SIZE selection type.
 # Doesn't check the size validation.
 #
diff --git a/llvm/test/ExecutionEngine/JITLink/x86-64/COFF_comdat_weak.s b/llvm/test/ExecutionEngine/JITLink/x86-64/COFF_comdat_weak.s
index 79ac75f..8fa8ba0 100644
--- a/llvm/test/ExecutionEngine/JITLink/x86-64/COFF_comdat_weak.s
+++ b/llvm/test/ExecutionEngine/JITLink/x86-64/COFF_comdat_weak.s
@@ -1,6 +1,7 @@
 # REQUIRES: asserts
 # RUN: llvm-mc -filetype=obj -triple=x86_64-windows-msvc %s -o %t
-# RUN: llvm-jitlink --debug-only=jitlink -noexec %t 2>&1 | FileCheck %s
+# RUN: llvm-jitlink -num-threads=0 -debug-only=jitlink -noexec %t 2>&1 \
+# RUN:              | FileCheck %s
 #
 # Check a COMDAT any symbol is exported as a weak symbol.
 #
diff --git a/llvm/test/ExecutionEngine/JITLink/x86-64/COFF_comdat_weak_plus_strong.s b/llvm/test/ExecutionEngine/JITLink/x86-64/COFF_comdat_weak_plus_strong.s
index 2754855..01aac02 100644
--- a/llvm/test/ExecutionEngine/JITLink/x86-64/COFF_comdat_weak_plus_strong.s
+++ b/llvm/test/ExecutionEngine/JITLink/x86-64/COFF_comdat_weak_plus_strong.s
@@ -8,7 +8,7 @@
 #
 # RUN: not llvm-jitlink -noexec %t/COFF_main.o %t/COFF_weak_1.o %t/COFF_strong.o \
 # RUN:                  -slab-allocate 64Kb -slab-address 0xfff00000 \
-# RUN:                  -slab-page-size 4096 -show-graph 2>&1 | FileCheck %s
+# RUN:                  -slab-page-size 4096 -show-graphs=".*" 2>&1 | FileCheck %s
 #
 # Check that a combination of comdat any definition and strong definition
 # generate duplicate definition error.
diff --git a/llvm/test/ExecutionEngine/JITLink/x86-64/COFF_common_symbol.s b/llvm/test/ExecutionEngine/JITLink/x86-64/COFF_common_symbol.s
index 2d4ad30..2788a9b 100644
--- a/llvm/test/ExecutionEngine/JITLink/x86-64/COFF_common_symbol.s
+++ b/llvm/test/ExecutionEngine/JITLink/x86-64/COFF_common_symbol.s
@@ -1,6 +1,7 @@
 # REQUIRES: asserts
 # RUN: llvm-mc -filetype=obj -triple=x86_64-windows-msvc %s -o %t
-# RUN: llvm-jitlink --debug-only=jitlink -noexec %t 2>&1 | FileCheck %s
+# RUN: llvm-jitlink -num-threads=0 -debug-only=jitlink -noexec %t 2>&1 \
+# RUN:              | FileCheck %s
 #
 # Check a common symbol is created.
 #
diff --git a/llvm/test/ExecutionEngine/JITLink/x86-64/COFF_duplicate_externals.test b/llvm/test/ExecutionEngine/JITLink/x86-64/COFF_duplicate_externals.test
index e929c01..ebce795 100644
--- a/llvm/test/ExecutionEngine/JITLink/x86-64/COFF_duplicate_externals.test
+++ b/llvm/test/ExecutionEngine/JITLink/x86-64/COFF_duplicate_externals.test
@@ -1,10 +1,10 @@
 # REQUIRES: asserts
 # RUN: yaml2obj %s -o %t
-# RUN: llvm-jitlink -noexec -abs __ImageBase=0xfff00000 \
-# RUN: --debug-only=jitlink \
-# RUN: -slab-allocate 100Kb -slab-address 0xfff00000 -slab-page-size 4096 \
-# RUN: %t 2>&1 | FileCheck %s
-# 
+# RUN: llvm-jitlink -num-threads=0 -debug-only=jitlink -noexec \
+# RUN:              -abs __ImageBase=0xfff00000 -slab-allocate 100Kb \
+# RUN:              -slab-address 0xfff00000 -slab-page-size 4096 %t 2>&1 \
+# RUN:              | FileCheck %s
+#
 # Check duplicate undefined external symbols are handled correctly.
 #
 # CHECK: Creating graph symbols...
diff --git a/llvm/test/ExecutionEngine/JITLink/x86-64/COFF_file_debug.s b/llvm/test/ExecutionEngine/JITLink/x86-64/COFF_file_debug.s
index 3980f81..ac1ef2d 100644
--- a/llvm/test/ExecutionEngine/JITLink/x86-64/COFF_file_debug.s
+++ b/llvm/test/ExecutionEngine/JITLink/x86-64/COFF_file_debug.s
@@ -1,6 +1,8 @@
 # REQUIRES: asserts
 # RUN: llvm-mc -filetype=obj -triple=x86_64-windows-msvc %s -o %t
-# RUN: llvm-jitlink -abs func=0xcafef00d --debug-only=jitlink -noexec %t 2>&1 | FileCheck %s
+# RUN: llvm-jitlink -num-threads=0 -debug-only=jitlink -noexec \
+# RUN:              -abs func=0xcafef00d %t 2>&1 \
+# RUN:              | FileCheck %s
 #
 # Check a file debug symbol is skipped.
 #
diff --git a/llvm/test/ExecutionEngine/JITLink/x86-64/COFF_static_var.s b/llvm/test/ExecutionEngine/JITLink/x86-64/COFF_static_var.s
index 5275c7d..dce0c1e 100644
--- a/llvm/test/ExecutionEngine/JITLink/x86-64/COFF_static_var.s
+++ b/llvm/test/ExecutionEngine/JITLink/x86-64/COFF_static_var.s
@@ -1,6 +1,8 @@
 # REQUIRES: asserts
 # RUN: llvm-mc -filetype=obj -triple=x86_64-windows-msvc %s -o %t
-# RUN: llvm-jitlink -abs var=0xcafef00d --debug-only=jitlink -noexec %t 2>&1 | FileCheck %s
+# RUN: llvm-jitlink -num-threads=0 -debug-only=jitlink -noexec \
+# RUN:              -abs var=0xcafef00d  %t 2>&1 \
+# RUN:              | FileCheck %s
 #
 # Check a local symbol is created for a static variable.
 #
diff --git a/llvm/test/ExecutionEngine/JITLink/x86-64/COFF_weak_external.s b/llvm/test/ExecutionEngine/JITLink/x86-64/COFF_weak_external.s
index c750d75..d49d561 100644
--- a/llvm/test/ExecutionEngine/JITLink/x86-64/COFF_weak_external.s
+++ b/llvm/test/ExecutionEngine/JITLink/x86-64/COFF_weak_external.s
@@ -1,6 +1,8 @@
 # REQUIRES: asserts
 # RUN: llvm-mc -filetype=obj -triple=x86_64-windows-msvc %s -o %t
-# RUN: llvm-jitlink -abs var=0xcafef00d --debug-only=jitlink -noexec %t 2>&1 | FileCheck %s
+# RUN: llvm-jitlink -num-threads=0 -debug-only=jitlink -noexec \
+# RUN:              -abs var=0xcafef00d %t 2>&1 | \
+# RUN:              FileCheck %s
 #
 # Check a default symbol is aliased as a weak external symbol.
 #
diff --git a/llvm/test/ExecutionEngine/JITLink/x86-64/ELF_debug_section_lifetime_is_NoAlloc.yaml b/llvm/test/ExecutionEngine/JITLink/x86-64/ELF_debug_section_lifetime_is_NoAlloc.yaml
index 0afcda4..09dda47 100644
--- a/llvm/test/ExecutionEngine/JITLink/x86-64/ELF_debug_section_lifetime_is_NoAlloc.yaml
+++ b/llvm/test/ExecutionEngine/JITLink/x86-64/ELF_debug_section_lifetime_is_NoAlloc.yaml
@@ -1,6 +1,7 @@
 # REQUIRES: asserts
 # RUN: yaml2obj -o %t.o %s
-# RUN: llvm-jitlink -debug-only=jitlink -noexec %t.o 2>&1 | FileCheck %s
+# RUN: llvm-jitlink -num-threads=0 -debug-only=jitlink -noexec %t.o 2>&1 \
+# RUN:              | FileCheck %s
 #
 # Check that debug sections get NoAlloc lifetimes.
 #
diff --git a/llvm/test/ExecutionEngine/JITLink/x86-64/ELF_ehframe_basic.s b/llvm/test/ExecutionEngine/JITLink/x86-64/ELF_ehframe_basic.s
index c01ced5..9339f07 100644
--- a/llvm/test/ExecutionEngine/JITLink/x86-64/ELF_ehframe_basic.s
+++ b/llvm/test/ExecutionEngine/JITLink/x86-64/ELF_ehframe_basic.s
@@ -2,8 +2,9 @@
 # UNSUPPORTED: system-windows
 # RUN: llvm-mc -triple=x86_64-unknown-linux -position-independent \
 # RUN:     -filetype=obj -o %t %s
-# RUN: llvm-jitlink -debug-only=jitlink -abs bar=0x01 \
-# RUN:     -abs _ZTIi=0x02 -noexec %t 2>&1 | FileCheck %s
+# RUN: llvm-jitlink -num-threads=0 -debug-only=jitlink -noexec \
+# RUN:              -abs bar=0x01 -abs _ZTIi=0x02 %t 2>&1 \
+# RUN:              | FileCheck %s
 #
 # FIXME: This test should run on windows. Investigate spurious
 # 'note: command had no output on stdout or stderr' errors, then re-enable.
diff --git a/llvm/test/ExecutionEngine/JITLink/x86-64/ELF_ehframe_large_static_personality_encodings.s b/llvm/test/ExecutionEngine/JITLink/x86-64/ELF_ehframe_large_static_personality_encodings.s
index 64990b5..98fc5f4 100644
--- a/llvm/test/ExecutionEngine/JITLink/x86-64/ELF_ehframe_large_static_personality_encodings.s
+++ b/llvm/test/ExecutionEngine/JITLink/x86-64/ELF_ehframe_large_static_personality_encodings.s
@@ -2,8 +2,9 @@
 # UNSUPPORTED: system-windows
 # RUN: llvm-mc -triple=x86_64-pc-linux-gnu -large-code-model \
 # RUN:   -filetype=obj -o %t %s
-# RUN: llvm-jitlink -debug-only=jitlink -noexec -phony-externals %t 2>&1 | \
-# RUN:   FileCheck %s
+# RUN: llvm-jitlink -num-threads=0 -debug-only=jitlink -noexec \
+# RUN:              -phony-externals %t 2>&1 \
+# RUN:              | FileCheck %s
 #
 # Check handling of pointer encodings for personality functions when compiling
 # with `-mcmodel=large -static`.
diff --git a/llvm/test/ExecutionEngine/JITLink/x86-64/LocalDependencyPropagation.s b/llvm/test/ExecutionEngine/JITLink/x86-64/LocalDependencyPropagation.s
index 139ef14..83d71cd 100644
--- a/llvm/test/ExecutionEngine/JITLink/x86-64/LocalDependencyPropagation.s
+++ b/llvm/test/ExecutionEngine/JITLink/x86-64/LocalDependencyPropagation.s
@@ -1,14 +1,15 @@
 # REQUIRES: asserts
 # RUN: llvm-mc -triple=x86_64-apple-macosx10.9 -filetype=obj -o %t %s
-# RUN: llvm-jitlink -debug-only=orc -noexec -abs _external_func=0x1 \
-# RUN:   -entry=_foo %t 2>&1 | FileCheck %s
+# RUN: llvm-jitlink -num-threads=0 -debug-only=orc -noexec \
+# RUN:              -abs _external_func=0x1 -entry=_foo %t 2>&1 \
+# RUN:              | FileCheck %s
 #
 # Check that simplification eliminates dependencies on symbols in this unit,
 # and correctly propagates dependencies on symbols outside the unit (including
 # via locally scoped symbols). In this test _baz depends on _foo indirectly via
 # the local symbol _bar. Initially we expect _baz to depend on _foo, and _foo
 # on _external_func, after simplification we expect both to depend on
-# _external_func only.	
+# _external_func only.
 
 # CHECK: In main emitting {{.*}}_foo{{.*}}
 # CHECK-NEXT: Initial dependencies:
diff --git a/llvm/test/ExecutionEngine/JITLink/x86-64/MachO-check-dwarf-filename.s b/llvm/test/ExecutionEngine/JITLink/x86-64/MachO-check-dwarf-filename.s
new file mode 100644
index 0000000..81ea18f
--- /dev/null
+++ b/llvm/test/ExecutionEngine/JITLink/x86-64/MachO-check-dwarf-filename.s
@@ -0,0 +1,318 @@
+# RUN: llvm-mc -triple=x86_64-apple-macosx10.9 -filetype=obj -o %t.o %s
+# RUN: llvm-jitlink -num-threads=0 -debug-only=orc -noexec -debugger-support \
+# RUN:              %t.o 2>&1 \
+# RUN:              | FileCheck %s
+#
+# REQUIRES: asserts && system-darwin
+#
+# Test that source file names can be indentified from DWARF line tables.
+
+# CHECK: Using FileName = "check-dwarf-filename.c" from DWARF line table
+
+	.section	__TEXT,__text,regular,pure_instructions
+	.build_version macos, 15, 0	sdk_version 15, 0
+	.globl	_main                           ## -- Begin function main
+	.p2align	4, 0x90
+_main:                                  ## @main
+Lfunc_begin0:
+	.file	0 "/Users/lhames/Projects/scratch" "check-dwarf-filename.c" md5 0x331a6c7ae0cfcd2896eca60ac6f5703e
+	.loc	0 1 0                           ## check-dwarf-filename.c:1:0
+	.cfi_startproc
+## %bb.0:
+	##DEBUG_VALUE: main:argc <- $edi
+	##DEBUG_VALUE: main:argv <- $rsi
+	pushq	%rbp
+	.cfi_def_cfa_offset 16
+	.cfi_offset %rbp, -16
+	movq	%rsp, %rbp
+	.cfi_def_cfa_register %rbp
+Ltmp0:
+	.loc	0 2 3 prologue_end              ## check-dwarf-filename.c:2:3
+	xorl	%eax, %eax
+	.loc	0 2 3 epilogue_begin is_stmt 0  ## check-dwarf-filename.c:2:3
+	popq	%rbp
+	retq
+Ltmp1:
+Lfunc_end0:
+	.cfi_endproc
+                                        ## -- End function
+	.section	__DWARF,__debug_abbrev,regular,debug
+Lsection_abbrev:
+	.byte	1                               ## Abbreviation Code
+	.byte	17                              ## DW_TAG_compile_unit
+	.byte	1                               ## DW_CHILDREN_yes
+	.byte	37                              ## DW_AT_producer
+	.byte	37                              ## DW_FORM_strx1
+	.byte	19                              ## DW_AT_language
+	.byte	5                               ## DW_FORM_data2
+	.byte	3                               ## DW_AT_name
+	.byte	37                              ## DW_FORM_strx1
+	.ascii	"\202|"                         ## DW_AT_LLVM_sysroot
+	.byte	37                              ## DW_FORM_strx1
+	.ascii	"\357\177"                      ## DW_AT_APPLE_sdk
+	.byte	37                              ## DW_FORM_strx1
+	.byte	114                             ## DW_AT_str_offsets_base
+	.byte	23                              ## DW_FORM_sec_offset
+	.byte	16                              ## DW_AT_stmt_list
+	.byte	23                              ## DW_FORM_sec_offset
+	.byte	27                              ## DW_AT_comp_dir
+	.byte	37                              ## DW_FORM_strx1
+	.ascii	"\341\177"                      ## DW_AT_APPLE_optimized
+	.byte	25                              ## DW_FORM_flag_present
+	.byte	17                              ## DW_AT_low_pc
+	.byte	27                              ## DW_FORM_addrx
+	.byte	18                              ## DW_AT_high_pc
+	.byte	6                               ## DW_FORM_data4
+	.byte	115                             ## DW_AT_addr_base
+	.byte	23                              ## DW_FORM_sec_offset
+	.byte	0                               ## EOM(1)
+	.byte	0                               ## EOM(2)
+	.byte	2                               ## Abbreviation Code
+	.byte	46                              ## DW_TAG_subprogram
+	.byte	1                               ## DW_CHILDREN_yes
+	.byte	17                              ## DW_AT_low_pc
+	.byte	27                              ## DW_FORM_addrx
+	.byte	18                              ## DW_AT_high_pc
+	.byte	6                               ## DW_FORM_data4
+	.byte	64                              ## DW_AT_frame_base
+	.byte	24                              ## DW_FORM_exprloc
+	.byte	122                             ## DW_AT_call_all_calls
+	.byte	25                              ## DW_FORM_flag_present
+	.byte	3                               ## DW_AT_name
+	.byte	37                              ## DW_FORM_strx1
+	.byte	58                              ## DW_AT_decl_file
+	.byte	11                              ## DW_FORM_data1
+	.byte	59                              ## DW_AT_decl_line
+	.byte	11                              ## DW_FORM_data1
+	.byte	39                              ## DW_AT_prototyped
+	.byte	25                              ## DW_FORM_flag_present
+	.byte	73                              ## DW_AT_type
+	.byte	19                              ## DW_FORM_ref4
+	.byte	63                              ## DW_AT_external
+	.byte	25                              ## DW_FORM_flag_present
+	.ascii	"\341\177"                      ## DW_AT_APPLE_optimized
+	.byte	25                              ## DW_FORM_flag_present
+	.byte	0                               ## EOM(1)
+	.byte	0                               ## EOM(2)
+	.byte	3                               ## Abbreviation Code
+	.byte	5                               ## DW_TAG_formal_parameter
+	.byte	0                               ## DW_CHILDREN_no
+	.byte	2                               ## DW_AT_location
+	.byte	24                              ## DW_FORM_exprloc
+	.byte	3                               ## DW_AT_name
+	.byte	37                              ## DW_FORM_strx1
+	.byte	58                              ## DW_AT_decl_file
+	.byte	11                              ## DW_FORM_data1
+	.byte	59                              ## DW_AT_decl_line
+	.byte	11                              ## DW_FORM_data1
+	.byte	73                              ## DW_AT_type
+	.byte	19                              ## DW_FORM_ref4
+	.byte	0                               ## EOM(1)
+	.byte	0                               ## EOM(2)
+	.byte	4                               ## Abbreviation Code
+	.byte	36                              ## DW_TAG_base_type
+	.byte	0                               ## DW_CHILDREN_no
+	.byte	3                               ## DW_AT_name
+	.byte	37                              ## DW_FORM_strx1
+	.byte	62                              ## DW_AT_encoding
+	.byte	11                              ## DW_FORM_data1
+	.byte	11                              ## DW_AT_byte_size
+	.byte	11                              ## DW_FORM_data1
+	.byte	0                               ## EOM(1)
+	.byte	0                               ## EOM(2)
+	.byte	5                               ## Abbreviation Code
+	.byte	15                              ## DW_TAG_pointer_type
+	.byte	0                               ## DW_CHILDREN_no
+	.byte	73                              ## DW_AT_type
+	.byte	19                              ## DW_FORM_ref4
+	.byte	0                               ## EOM(1)
+	.byte	0                               ## EOM(2)
+	.byte	0                               ## EOM(3)
+	.section	__DWARF,__debug_info,regular,debug
+Lsection_info:
+Lcu_begin0:
+.set Lset0, Ldebug_info_end0-Ldebug_info_start0 ## Length of Unit
+	.long	Lset0
+Ldebug_info_start0:
+	.short	5                               ## DWARF version number
+	.byte	1                               ## DWARF Unit Type
+	.byte	8                               ## Address Size (in bytes)
+.set Lset1, Lsection_abbrev-Lsection_abbrev ## Offset Into Abbrev. Section
+	.long	Lset1
+	.byte	1                               ## Abbrev [1] 0xc:0x50 DW_TAG_compile_unit
+	.byte	0                               ## DW_AT_producer
+	.short	29                              ## DW_AT_language
+	.byte	1                               ## DW_AT_name
+	.byte	2                               ## DW_AT_LLVM_sysroot
+	.byte	3                               ## DW_AT_APPLE_sdk
+.set Lset2, Lstr_offsets_base0-Lsection_str_off ## DW_AT_str_offsets_base
+	.long	Lset2
+.set Lset3, Lline_table_start0-Lsection_line ## DW_AT_stmt_list
+	.long	Lset3
+	.byte	4                               ## DW_AT_comp_dir
+                                        ## DW_AT_APPLE_optimized
+	.byte	0                               ## DW_AT_low_pc
+.set Lset4, Lfunc_end0-Lfunc_begin0     ## DW_AT_high_pc
+	.long	Lset4
+.set Lset5, Laddr_table_base0-Lsection_info0 ## DW_AT_addr_base
+	.long	Lset5
+	.byte	2                               ## Abbrev [2] 0x25:0x24 DW_TAG_subprogram
+	.byte	0                               ## DW_AT_low_pc
+.set Lset6, Lfunc_end0-Lfunc_begin0     ## DW_AT_high_pc
+	.long	Lset6
+	.byte	1                               ## DW_AT_frame_base
+	.byte	86
+                                        ## DW_AT_call_all_calls
+	.byte	5                               ## DW_AT_name
+	.byte	0                               ## DW_AT_decl_file
+	.byte	1                               ## DW_AT_decl_line
+                                        ## DW_AT_prototyped
+	.long	73                              ## DW_AT_type
+                                        ## DW_AT_external
+                                        ## DW_AT_APPLE_optimized
+	.byte	3                               ## Abbrev [3] 0x34:0xa DW_TAG_formal_parameter
+	.byte	1                               ## DW_AT_location
+	.byte	85
+	.byte	7                               ## DW_AT_name
+	.byte	0                               ## DW_AT_decl_file
+	.byte	1                               ## DW_AT_decl_line
+	.long	73                              ## DW_AT_type
+	.byte	3                               ## Abbrev [3] 0x3e:0xa DW_TAG_formal_parameter
+	.byte	1                               ## DW_AT_location
+	.byte	84
+	.byte	8                               ## DW_AT_name
+	.byte	0                               ## DW_AT_decl_file
+	.byte	1                               ## DW_AT_decl_line
+	.long	77                              ## DW_AT_type
+	.byte	0                               ## End Of Children Mark
+	.byte	4                               ## Abbrev [4] 0x49:0x4 DW_TAG_base_type
+	.byte	6                               ## DW_AT_name
+	.byte	5                               ## DW_AT_encoding
+	.byte	4                               ## DW_AT_byte_size
+	.byte	5                               ## Abbrev [5] 0x4d:0x5 DW_TAG_pointer_type
+	.long	82                              ## DW_AT_type
+	.byte	5                               ## Abbrev [5] 0x52:0x5 DW_TAG_pointer_type
+	.long	87                              ## DW_AT_type
+	.byte	4                               ## Abbrev [4] 0x57:0x4 DW_TAG_base_type
+	.byte	9                               ## DW_AT_name
+	.byte	6                               ## DW_AT_encoding
+	.byte	1                               ## DW_AT_byte_size
+	.byte	0                               ## End Of Children Mark
+Ldebug_info_end0:
+	.section	__DWARF,__debug_str_offs,regular,debug
+Lsection_str_off:
+	.long	44                              ## Length of String Offsets Set
+	.short	5
+	.short	0
+Lstr_offsets_base0:
+	.section	__DWARF,__debug_str,regular,debug
+Linfo_string:
+	.asciz	"Apple clang version 16.0.0 (clang-1600.0.26.3)" ## string offset=0
+	.asciz	"check-dwarf-filename.c"        ## string offset=47
+	.asciz	"/Library/Developer/CommandLineTools/SDKs/MacOSX15.0.sdk" ## string offset=70
+	.asciz	"MacOSX15.0.sdk"                ## string offset=126
+	.asciz	"/Users/lhames/Projects/scratch" ## string offset=141
+	.asciz	"main"                          ## string offset=172
+	.asciz	"int"                           ## string offset=177
+	.asciz	"argc"                          ## string offset=181
+	.asciz	"argv"                          ## string offset=186
+	.asciz	"char"                          ## string offset=191
+	.section	__DWARF,__debug_str_offs,regular,debug
+	.long	0
+	.long	47
+	.long	70
+	.long	126
+	.long	141
+	.long	172
+	.long	177
+	.long	181
+	.long	186
+	.long	191
+	.section	__DWARF,__debug_addr,regular,debug
+Lsection_info0:
+.set Lset7, Ldebug_addr_end0-Ldebug_addr_start0 ## Length of contribution
+	.long	Lset7
+Ldebug_addr_start0:
+	.short	5                               ## DWARF version number
+	.byte	8                               ## Address size
+	.byte	0                               ## Segment selector size
+Laddr_table_base0:
+	.quad	Lfunc_begin0
+Ldebug_addr_end0:
+	.section	__DWARF,__debug_names,regular,debug
+Ldebug_names_begin:
+.set Lset8, Lnames_end0-Lnames_start0   ## Header: unit length
+	.long	Lset8
+Lnames_start0:
+	.short	5                               ## Header: version
+	.short	0                               ## Header: padding
+	.long	1                               ## Header: compilation unit count
+	.long	0                               ## Header: local type unit count
+	.long	0                               ## Header: foreign type unit count
+	.long	3                               ## Header: bucket count
+	.long	3                               ## Header: name count
+.set Lset9, Lnames_abbrev_end0-Lnames_abbrev_start0 ## Header: abbreviation table size
+	.long	Lset9
+	.long	8                               ## Header: augmentation string size
+	.ascii	"LLVM0700"                      ## Header: augmentation string
+.set Lset10, Lcu_begin0-Lsection_info   ## Compilation unit 0
+	.long	Lset10
+	.long	0                               ## Bucket 0
+	.long	1                               ## Bucket 1
+	.long	2                               ## Bucket 2
+	.long	2090499946                      ## Hash in Bucket 1
+	.long	193495088                       ## Hash in Bucket 2
+	.long	2090147939                      ## Hash in Bucket 2
+	.long	172                             ## String in Bucket 1: main
+	.long	177                             ## String in Bucket 2: int
+	.long	191                             ## String in Bucket 2: char
+.set Lset11, Lnames0-Lnames_entries0    ## Offset in Bucket 1
+	.long	Lset11
+.set Lset12, Lnames1-Lnames_entries0    ## Offset in Bucket 2
+	.long	Lset12
+.set Lset13, Lnames2-Lnames_entries0    ## Offset in Bucket 2
+	.long	Lset13
+Lnames_abbrev_start0:
+	.ascii	"\230."                         ## Abbrev code
+	.byte	46                              ## DW_TAG_subprogram
+	.byte	3                               ## DW_IDX_die_offset
+	.byte	19                              ## DW_FORM_ref4
+	.byte	4                               ## DW_IDX_parent
+	.byte	25                              ## DW_FORM_flag_present
+	.byte	0                               ## End of abbrev
+	.byte	0                               ## End of abbrev
+	.ascii	"\230$"                         ## Abbrev code
+	.byte	36                              ## DW_TAG_base_type
+	.byte	3                               ## DW_IDX_die_offset
+	.byte	19                              ## DW_FORM_ref4
+	.byte	4                               ## DW_IDX_parent
+	.byte	25                              ## DW_FORM_flag_present
+	.byte	0                               ## End of abbrev
+	.byte	0                               ## End of abbrev
+	.byte	0                               ## End of abbrev list
+Lnames_abbrev_end0:
+Lnames_entries0:
+Lnames0:
+L1:
+	.ascii	"\230."                         ## Abbreviation code
+	.long	37                              ## DW_IDX_die_offset
+	.byte	0                               ## DW_IDX_parent
+                                        ## End of list: main
+Lnames1:
+L0:
+	.ascii	"\230$"                         ## Abbreviation code
+	.long	73                              ## DW_IDX_die_offset
+	.byte	0                               ## DW_IDX_parent
+                                        ## End of list: int
+Lnames2:
+L2:
+	.ascii	"\230$"                         ## Abbreviation code
+	.long	87                              ## DW_IDX_die_offset
+	.byte	0                               ## DW_IDX_parent
+                                        ## End of list: char
+	.p2align	2, 0x0
+Lnames_end0:
+.subsections_via_symbols
+	.section	__DWARF,__debug_line,regular,debug
+Lsection_line:
+Lline_table_start0:
diff --git a/llvm/test/ExecutionEngine/JITLink/x86-64/MachO_compact_unwind.s b/llvm/test/ExecutionEngine/JITLink/x86-64/MachO_compact_unwind.s
index e5783141..3852207 100644
--- a/llvm/test/ExecutionEngine/JITLink/x86-64/MachO_compact_unwind.s
+++ b/llvm/test/ExecutionEngine/JITLink/x86-64/MachO_compact_unwind.s
@@ -1,6 +1,7 @@
 # REQUIRES: asserts
 # RUN: llvm-mc -triple=x86_64-apple-darwin11 -filetype=obj -o %t %s
-# RUN: llvm-jitlink -noexec -debug-only=jitlink %t 2>&1 | FileCheck %s
+# RUN: llvm-jitlink -num-threads=0 -debug-only=jitlink -noexec %t 2>&1 \
+# RUN:              | FileCheck %s
 #
 # Check that splitting of compact-unwind sections works.
 #
diff --git a/llvm/test/ExecutionEngine/JITLink/x86-64/MachO_cstring_section_alignment.s b/llvm/test/ExecutionEngine/JITLink/x86-64/MachO_cstring_section_alignment.s
index 5a8cef5..3859a35 100644
--- a/llvm/test/ExecutionEngine/JITLink/x86-64/MachO_cstring_section_alignment.s
+++ b/llvm/test/ExecutionEngine/JITLink/x86-64/MachO_cstring_section_alignment.s
@@ -1,6 +1,7 @@
 # REQUIRES: asserts
 # RUN: llvm-mc -triple=x86_64-apple-macos10.9 -filetype=obj -o %t %s
-# RUN: llvm-jitlink -debug-only=jitlink -noexec %t 2>&1 | FileCheck %s
+# RUN: llvm-jitlink -num-threads=0 -debug-only=jitlink -noexec %t 2>&1 \
+# RUN:              | FileCheck %s
 #
 # Verify that PC-begin candidate symbols have been sorted correctly when adding
 # PC-begin edges for FDEs. In this test both _main and _X are at address zero,
diff --git a/llvm/test/ExecutionEngine/JITLink/x86-64/MachO_cstring_section_splitting.s b/llvm/test/ExecutionEngine/JITLink/x86-64/MachO_cstring_section_splitting.s
index a5baf56..0d68a10 100644
--- a/llvm/test/ExecutionEngine/JITLink/x86-64/MachO_cstring_section_splitting.s
+++ b/llvm/test/ExecutionEngine/JITLink/x86-64/MachO_cstring_section_splitting.s
@@ -1,7 +1,8 @@
 # REQUIRES: asserts
 # RUN: llvm-mc -triple=x86_64-apple-macosx10.9 -filetype=obj -o %t %s
-# RUN: llvm-jitlink -debug-only=jitlink -noexec -entry hook %t 2>&1 | \
-# RUN:   FileCheck %s
+# RUN: llvm-jitlink -num-threads=0 -debug-only=jitlink -noexec \
+# RUN:              -entry hook %t 2>&1 \
+# RUN:              | FileCheck %s
 #
 # Verify that we split C string literals on null-terminators, rather than on
 # symbol boundaries. We expect four dead-stripped symbols: l_str.0, l_str.2,
diff --git a/llvm/test/ExecutionEngine/JITLink/x86-64/MachO_non_subsections_via_symbols.s b/llvm/test/ExecutionEngine/JITLink/x86-64/MachO_non_subsections_via_symbols.s
index e1adb3b..66fcb47 100644
--- a/llvm/test/ExecutionEngine/JITLink/x86-64/MachO_non_subsections_via_symbols.s
+++ b/llvm/test/ExecutionEngine/JITLink/x86-64/MachO_non_subsections_via_symbols.s
@@ -4,7 +4,8 @@
 #
 # REQUIRES: asserts
 # RUN: llvm-mc -triple=x86_64-apple-macosx10.9 -filetype=obj -o %t %s
-# RUN: llvm-jitlink -debug-only=jitlink -noexec %t 2>&1 | FileCheck %s
+# RUN: llvm-jitlink -num-threads=0 -debug-only=jitlink -noexec %t 2>&1 \
+# RUN:              | FileCheck %s
 
 # CHECK:        Creating graph symbols...
 # CHECK:          Graphifying regular section __DATA,__data...
diff --git a/llvm/test/Instrumentation/HWAddressSanitizer/zero-ptr.ll b/llvm/test/Instrumentation/HWAddressSanitizer/zero-ptr.ll
new file mode 100644
index 0000000..a201174
--- /dev/null
+++ b/llvm/test/Instrumentation/HWAddressSanitizer/zero-ptr.ll
@@ -0,0 +1,35 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2
+; RUN: opt < %s -passes=hwasan -S | FileCheck %s
+; RUN: opt < %s -passes=hwasan -hwasan-recover=0 -hwasan-mapping-offset=0 -S | FileCheck %s --check-prefixes=ABORT-ZERO-BASED-SHADOW
+
+; This shows that HWASan will emit a memaccess check when dereferencing a null
+; pointer.
+; The output is used as the source for llvm/test/CodeGen/AArch64/hwasan-zero-ptr.ll.
+
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64--linux-android10000"
+
+define void @test_store_to_zeroptr() sanitize_hwaddress {
+; CHECK-LABEL: define void @test_store_to_zeroptr
+; CHECK-SAME: () #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[DOTHWASAN_SHADOW:%.*]] = call ptr asm "", "=r,0"(ptr @__hwasan_shadow)
+; CHECK-NEXT:    [[B:%.*]] = inttoptr i64 0 to ptr
+; CHECK-NEXT:    call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[DOTHWASAN_SHADOW]], ptr [[B]], i32 19)
+; CHECK-NEXT:    store i64 42, ptr [[B]], align 8
+; CHECK-NEXT:    ret void
+;
+; ABORT-ZERO-BASED-SHADOW-LABEL: define void @test_store_to_zeroptr
+; ABORT-ZERO-BASED-SHADOW-SAME: () #[[ATTR0:[0-9]+]] {
+; ABORT-ZERO-BASED-SHADOW-NEXT:  entry:
+; ABORT-ZERO-BASED-SHADOW-NEXT:    [[DOTHWASAN_SHADOW:%.*]] = call ptr asm "", "=r,0"(ptr null)
+; ABORT-ZERO-BASED-SHADOW-NEXT:    [[B:%.*]] = inttoptr i64 0 to ptr
+; ABORT-ZERO-BASED-SHADOW-NEXT:    call void @llvm.hwasan.check.memaccess.shortgranules.fixedshadow(ptr [[B]], i32 19, i64 0)
+; ABORT-ZERO-BASED-SHADOW-NEXT:    store i64 42, ptr [[B]], align 8
+; ABORT-ZERO-BASED-SHADOW-NEXT:    ret void
+;
+entry:
+  %b = inttoptr i64 0 to i64*
+  store i64 42, ptr %b
+  ret void
+}
diff --git a/llvm/test/Instrumentation/TypeSanitizer/access-with-offset.ll b/llvm/test/Instrumentation/TypeSanitizer/access-with-offset.ll
index 78f3816..56cf3f5 100644
--- a/llvm/test/Instrumentation/TypeSanitizer/access-with-offset.ll
+++ b/llvm/test/Instrumentation/TypeSanitizer/access-with-offset.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals
-; RUN: opt -passes='tysan-module,tysan' -S %s | FileCheck %s
+; RUN: opt -passes='tysan' -S %s | FileCheck %s
 
 ;.
 ; CHECK: @llvm.global_ctors = appending global [1 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 0, ptr @tysan.module_ctor, ptr null }]
diff --git a/llvm/test/Instrumentation/TypeSanitizer/alloca-only.ll b/llvm/test/Instrumentation/TypeSanitizer/alloca-only.ll
index 1aa47cac..117cd1a 100644
--- a/llvm/test/Instrumentation/TypeSanitizer/alloca-only.ll
+++ b/llvm/test/Instrumentation/TypeSanitizer/alloca-only.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals
 ; Test basic type sanitizer instrumentation.
 ;
-; RUN: opt -passes='tysan-module,tysan' -S %s | FileCheck %s
+; RUN: opt -passes='tysan' -S %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
diff --git a/llvm/test/Instrumentation/TypeSanitizer/alloca.ll b/llvm/test/Instrumentation/TypeSanitizer/alloca.ll
index 94098bd..ea5adf6 100644
--- a/llvm/test/Instrumentation/TypeSanitizer/alloca.ll
+++ b/llvm/test/Instrumentation/TypeSanitizer/alloca.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; Test basic type sanitizer instrumentation.
 ;
-; RUN: opt -passes='tysan-module,tysan' -S %s | FileCheck %s
+; RUN: opt -passes='tysan' -S %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
diff --git a/llvm/test/Instrumentation/TypeSanitizer/anon.ll b/llvm/test/Instrumentation/TypeSanitizer/anon.ll
index ce4f0c1b..37de1b7 100644
--- a/llvm/test/Instrumentation/TypeSanitizer/anon.ll
+++ b/llvm/test/Instrumentation/TypeSanitizer/anon.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals
 ; Test basic type sanitizer instrumentation.
 ;
-; RUN: opt -passes='tysan-module,tysan' -S %s | FileCheck %s
+; RUN: opt -passes='tysan' -S %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
diff --git a/llvm/test/Instrumentation/TypeSanitizer/basic-nosan.ll b/llvm/test/Instrumentation/TypeSanitizer/basic-nosan.ll
index 9b9522f..8ddc573 100644
--- a/llvm/test/Instrumentation/TypeSanitizer/basic-nosan.ll
+++ b/llvm/test/Instrumentation/TypeSanitizer/basic-nosan.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals --include-generated-funcs
 ; Test basic type sanitizer instrumentation.
-; RUN: opt -passes='tysan-module,tysan' -S %s | FileCheck %s
+; RUN: opt -passes='tysan' -S %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
diff --git a/llvm/test/Instrumentation/TypeSanitizer/basic.ll b/llvm/test/Instrumentation/TypeSanitizer/basic.ll
index 8873a40..704c188 100644
--- a/llvm/test/Instrumentation/TypeSanitizer/basic.ll
+++ b/llvm/test/Instrumentation/TypeSanitizer/basic.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals
 ; Test basic type sanitizer instrumentation.
 ;
-; RUN: opt -passes='tysan-module,tysan' -S %s | FileCheck %s
+; RUN: opt -passes='tysan' -S %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
diff --git a/llvm/test/Instrumentation/TypeSanitizer/byval.ll b/llvm/test/Instrumentation/TypeSanitizer/byval.ll
index 23ed1b0..6ae343d 100644
--- a/llvm/test/Instrumentation/TypeSanitizer/byval.ll
+++ b/llvm/test/Instrumentation/TypeSanitizer/byval.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals --include-generated-funcs
 ; Test basic type sanitizer instrumentation.
-; RUN: opt -passes='tysan-module,tysan' -S %s | FileCheck %s
+; RUN: opt -passes='tysan' -S %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
diff --git a/llvm/test/Instrumentation/TypeSanitizer/globals.ll b/llvm/test/Instrumentation/TypeSanitizer/globals.ll
index 1f57c2a..a73599e 100644
--- a/llvm/test/Instrumentation/TypeSanitizer/globals.ll
+++ b/llvm/test/Instrumentation/TypeSanitizer/globals.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature --check-globals --include-generated-funcs
-; RUN: opt -passes='tysan-module,tysan' -S %s | FileCheck %s
+; RUN: opt -passes='tysan' -S %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
diff --git a/llvm/test/Instrumentation/TypeSanitizer/invalid-metadata.ll b/llvm/test/Instrumentation/TypeSanitizer/invalid-metadata.ll
index e7de62e..0c99c0f 100644
--- a/llvm/test/Instrumentation/TypeSanitizer/invalid-metadata.ll
+++ b/llvm/test/Instrumentation/TypeSanitizer/invalid-metadata.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature --check-globals --include-generated-funcs
-; RUN: opt -passes='tysan-module,tysan' -S %s | FileCheck %s
+; RUN: opt -passes='tysan' -S %s | FileCheck %s
 
 !llvm.tysan.globals = !{!0}
 
diff --git a/llvm/test/Instrumentation/TypeSanitizer/memintrinsics.ll b/llvm/test/Instrumentation/TypeSanitizer/memintrinsics.ll
index 26f7c18..65a30bd 100644
--- a/llvm/test/Instrumentation/TypeSanitizer/memintrinsics.ll
+++ b/llvm/test/Instrumentation/TypeSanitizer/memintrinsics.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; Test basic type sanitizer instrumentation.
 ;
-; RUN: opt -passes='tysan-module,tysan' -S %s | FileCheck %s
+; RUN: opt -passes='tysan' -S %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
diff --git a/llvm/test/Instrumentation/TypeSanitizer/nosanitize.ll b/llvm/test/Instrumentation/TypeSanitizer/nosanitize.ll
index 7b07a42..c7c153e 100644
--- a/llvm/test/Instrumentation/TypeSanitizer/nosanitize.ll
+++ b/llvm/test/Instrumentation/TypeSanitizer/nosanitize.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals
 ; Test basic type sanitizer instrumentation.
 ;
-; RUN: opt -passes='tysan-module,tysan' -S %s | FileCheck %s
+; RUN: opt -passes='tysan' -S %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
diff --git a/llvm/test/Instrumentation/TypeSanitizer/sanitize-no-tbaa.ll b/llvm/test/Instrumentation/TypeSanitizer/sanitize-no-tbaa.ll
index 3cb7b83..060f031 100644
--- a/llvm/test/Instrumentation/TypeSanitizer/sanitize-no-tbaa.ll
+++ b/llvm/test/Instrumentation/TypeSanitizer/sanitize-no-tbaa.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; Test basic type sanitizer instrumentation.
 ;
-; RUN: opt -passes='tysan-module,tysan' -S %s | FileCheck %s
+; RUN: opt -passes='tysan' -S %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
diff --git a/llvm/test/Instrumentation/TypeSanitizer/swifterror.ll b/llvm/test/Instrumentation/TypeSanitizer/swifterror.ll
index 5711fb4..dc83a02 100644
--- a/llvm/test/Instrumentation/TypeSanitizer/swifterror.ll
+++ b/llvm/test/Instrumentation/TypeSanitizer/swifterror.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; Test basic type sanitizer instrumentation.
 ;
-; RUN: opt -passes='tysan-module,tysan' -S %s | FileCheck %s
+; RUN: opt -passes='tysan' -S %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
diff --git a/llvm/test/Linker/Inputs/libdevice-with-wrong-dl.ll b/llvm/test/Linker/Inputs/libdevice-with-wrong-dl.ll
new file mode 100644
index 0000000..bb000ef
--- /dev/null
+++ b/llvm/test/Linker/Inputs/libdevice-with-wrong-dl.ll
@@ -0,0 +1,2 @@
+target triple = "nvptx64-nvidia-gpulibs"
+target datalayout = "e-i64:64-i128:128-v32:32-n16:32:64"
diff --git a/llvm/test/Linker/cuda-libdevice.ll b/llvm/test/Linker/cuda-libdevice.ll
index 484e8339..87136b1 100644
--- a/llvm/test/Linker/cuda-libdevice.ll
+++ b/llvm/test/Linker/cuda-libdevice.ll
@@ -4,8 +4,8 @@
 ; RUN: llvm-as %p/Inputs/libdevice-cuda-9.ll -o %t/libdevice.compute_35.10.bc
 ; RUN: llvm-as %p/Inputs/libdevice-cuda-10.ll -o %t/libdevice.10.bc
 ; RUN: llvm-as %p/Inputs/libdevice-cuda-11.ll -o %t/libdevice.11.10.bc
-; RUN: llvm-as %p/Inputs/libdevice-cuda-9.ll -o %t/correct-libdevice-wrong-filename.bc
 ; RUN: llvm-as %p/Inputs/not-a-libdevice.ll -o %t/libdevice-with-wrong-info.bc
+; RUN: llvm-as %p/Inputs/libdevice-with-wrong-dl.ll -o %t/libdevice-with-wrong-dl.bc
 
 ; No warnings expected when we link with libdevice variants
 ; RUN: llvm-link %t/main.bc %t/libdevice.compute_35.10.bc -S 2>&1 \
@@ -15,12 +15,12 @@
 ; RUN: llvm-link %t/main.bc %t/libdevice.11.10.bc -S 2>&1 \
 ; RUN:  | FileCheck --check-prefixes COMMON,NOWARN %s
 
-; But make sure we still issue warnings if we see unexpected filename, or
-; unexpected triple or datalayout within a libdevice filename.
-; RUN: llvm-link %t/main.bc %t/correct-libdevice-wrong-filename.bc -S 2>&1 \
-; RUN:  | FileCheck --check-prefixes COMMON,WARN-TRIPLE %s
+; But make sure we still issue warnings if we see unexpected triple or
+; datalayout within a libdevice module.
 ; RUN: llvm-link %t/main.bc %t/libdevice-with-wrong-info.bc -S 2>&1 \
 ; RUN:  | FileCheck --check-prefixes COMMON,WARN-TRIPLE,WARN-DL %s
+; RUN: llvm-link %t/main.bc %t/libdevice-with-wrong-dl.bc -S 2>&1 \
+; RUN:  | FileCheck --check-prefixes COMMON,NOWARN,WARN-DL %s
 
 
 target triple = "nvptx64-nvidia-cuda"
diff --git a/llvm/test/MC/AArch64/armv9.6a-rme-gpc3.s b/llvm/test/MC/AArch64/armv9.6a-rme-gpc3.s
index baf05f1..093101b 100644
--- a/llvm/test/MC/AArch64/armv9.6a-rme-gpc3.s
+++ b/llvm/test/MC/AArch64/armv9.6a-rme-gpc3.s
@@ -2,10 +2,18 @@
 // RUN: llvm-mc -triple aarch64 -show-encoding %s  | FileCheck %s
 .func:
   apas x0
+  apas x1
+  apas x2
+  apas x17
+  apas x30
   mrs x3, GPCBW_EL3
   msr GPCBW_EL3, x4
 
 # CHECK:      .func:
-# CHECK-NEXT: 	apas	x0                              // encoding: [0x1f,0x70,0x0e,0xd5]
+# CHECK-NEXT:	apas    x0                              // encoding: [0x00,0x70,0x0e,0xd5]
+# CHECK-NEXT:	apas    x1                              // encoding: [0x01,0x70,0x0e,0xd5]
+# CHECK-NEXT:	apas    x2                              // encoding: [0x02,0x70,0x0e,0xd5]
+# CHECK-NEXT:	apas    x17                             // encoding: [0x11,0x70,0x0e,0xd5]
+# CHECK-NEXT:	apas    x30                             // encoding: [0x1e,0x70,0x0e,0xd5]
 # CHECK-NEXT: 	mrs	x3, GPCBW_EL3                   // encoding: [0xa3,0x21,0x3e,0xd5]
 # CHECK-NEXT: 	msr	GPCBW_EL3, x4                   // encoding: [0xa4,0x21,0x1e,0xd5]
diff --git a/llvm/test/MC/AMDGPU/gfx1030_err.s b/llvm/test/MC/AMDGPU/gfx1030_err.s
index 87a0987..a0565dc 100644
--- a/llvm/test/MC/AMDGPU/gfx1030_err.s
+++ b/llvm/test/MC/AMDGPU/gfx1030_err.s
@@ -573,3 +573,9 @@ v_dot8_u32_u4 v0, v1, v2, v3 op_sel:[1,1] op_sel_hi:[1,0]
 
 v_dot8_u32_u4 v0, v1, v2, v3 op_sel:[1,1] op_sel_hi:[1,1]
 // GFX10: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+image_bvh_intersect_ray v[4:7], v[9:19], null
+// GFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+image_bvh64_intersect_ray v[4:7], v[9:20], null
+// GFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
diff --git a/llvm/test/MC/AMDGPU/gfx10_asm_mimg_err.s b/llvm/test/MC/AMDGPU/gfx10_asm_mimg_err.s
index bd61ad3..f6ea86e 100644
--- a/llvm/test/MC/AMDGPU/gfx10_asm_mimg_err.s
+++ b/llvm/test/MC/AMDGPU/gfx10_asm_mimg_err.s
@@ -359,3 +359,130 @@ image_sample_c_d_cl_o v[0:3], v[0:7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_
 
 image_load v[0:1], v0, s[0:7] dmask:0x9 dim:1 D
 // NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid dim value
+
+// null is not allowed as SRSRC or SSAMP
+image_atomic_add v1, v[10:11], null dmask:0x1 dim:SQ_RSRC_IMG_2D
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+image_atomic_and v1, v[10:11], null dmask:0x1 dim:SQ_RSRC_IMG_2D
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+image_atomic_cmpswap v[0:1], v[10:11], null dmask:0x3 dim:SQ_RSRC_IMG_2D
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+image_atomic_dec v1, v[10:11], null dmask:0x1 dim:SQ_RSRC_IMG_2D
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+image_atomic_fcmpswap v[1:2], v[2:3], null dmask:0x3 dim:SQ_RSRC_IMG_2D
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+image_atomic_fmax v1, v[10:11], null dmask:0x1 dim:SQ_RSRC_IMG_2D
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+image_atomic_fmin v1, v[10:11], null dmask:0x1 dim:SQ_RSRC_IMG_2D
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+image_atomic_inc v1, v[10:11], null dmask:0x1 dim:SQ_RSRC_IMG_2D
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+image_atomic_or v1, v[10:11], null dmask:0x1 dim:SQ_RSRC_IMG_2D
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+image_atomic_smax v1, v[10:11], null dmask:0x1 dim:SQ_RSRC_IMG_2D
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+image_atomic_smin v1, v[10:11], null dmask:0x1 dim:SQ_RSRC_IMG_2D
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+image_atomic_sub v1, v[10:11], null dmask:0x1 dim:SQ_RSRC_IMG_2D
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+image_atomic_swap v1, v[10:11], null dmask:0x1 dim:SQ_RSRC_IMG_2D
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+image_atomic_umax v1, v[10:11], null dmask:0x1 dim:SQ_RSRC_IMG_2D
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+image_atomic_umin v1, v[10:11], null dmask:0x1 dim:SQ_RSRC_IMG_2D
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+image_atomic_xor v1, v[10:11], null dmask:0x1 dim:SQ_RSRC_IMG_2D
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+image_gather4 v[64:67], v32, null, s[4:11], dmask:0x1 dim:SQ_RSRC_IMG_1D
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+image_gather4 v[64:67], v32, s[4:11], null dmask:0x1 dim:SQ_RSRC_IMG_1D
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+image_gather4_b v[64:67], v[32:33], null, s[100:103] dmask:0x1 dim:SQ_RSRC_IMG_1D
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+image_gather4_b v[64:67], v[32:33], s[4:11], null dmask:0x1 dim:SQ_RSRC_IMG_1D
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+image_gather4_c v[64:67], v[32:33], null, s[100:103] dmask:0x1 dim:SQ_RSRC_IMG_1D
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+image_gather4_c v[64:67], v[32:33], s[4:11], null dmask:0x1 dim:SQ_RSRC_IMG_1D
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+image_gather4h v[64:67], v32, null, s[100:103] dmask:0x1 dim:SQ_RSRC_IMG_1D
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+image_gather4h v[64:67], v32, s[4:11], null dmask:0x1 dim:SQ_RSRC_IMG_1D
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+image_gather4_l v[64:67], v[32:33], null, s[100:103] dmask:0x1 dim:SQ_RSRC_IMG_1D
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+image_gather4_l v[64:67], v[32:33], s[4:11], null dmask:0x1 dim:SQ_RSRC_IMG_1D
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+image_gather4_o v[64:67], v[32:33], null, s[100:103] dmask:0x1 dim:SQ_RSRC_IMG_1D
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+image_gather4_o v[64:67], v[32:33], s[4:11], null dmask:0x1 dim:SQ_RSRC_IMG_1D
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+image_load v[4:7], v0, null dmask:0xf dim:SQ_RSRC_IMG_1D
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+image_store v[0:3], v[254:255], null dmask:0xf dim:SQ_RSRC_IMG_2D
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+image_sample v[5:6], v1, null, s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_1D
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+image_sample v[5:6], v1, s[8:15], null dmask:0x3 dim:SQ_RSRC_IMG_1D
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+image_sample_b v[5:6], v[1:2], null, s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_1D
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+image_sample_b v[5:6], v[1:2], s[8:15], null dmask:0x3 dim:SQ_RSRC_IMG_1D
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+image_sample_c v[5:6], v[1:2], null, s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_1D
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+image_sample_c v[5:6], v[1:2], s[8:15], null dmask:0x3 dim:SQ_RSRC_IMG_1D
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+image_sample_d v[5:6], v[1:3], null, s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_1D
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+image_sample_d v[5:6], v[1:3], s[8:15], null dmask:0x3 dim:SQ_RSRC_IMG_1D
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+image_sample_l v[5:6], v[1:2], null, s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_1D
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+image_sample_l v[5:6], v[1:2], s[8:15], null dmask:0x3 dim:SQ_RSRC_IMG_1D
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+image_sample_o v[5:6], v[1:2], null, s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_1D
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+image_sample_o v[5:6], v[1:2], s[8:15], null dmask:0x3 dim:SQ_RSRC_IMG_1D
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
diff --git a/llvm/test/MC/AMDGPU/gfx10_asm_mtbuf_err.s b/llvm/test/MC/AMDGPU/gfx10_asm_mtbuf_err.s
new file mode 100644
index 0000000..5eb2e9c
--- /dev/null
+++ b/llvm/test/MC/AMDGPU/gfx10_asm_mtbuf_err.s
@@ -0,0 +1,49 @@
+// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1010 %s 2>&1 | FileCheck --check-prefixes=NOGFX10 --implicit-check-not=error: %s
+
+tbuffer_load_format_d16_x v3, v0, null, s1 offen offset:4095
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+tbuffer_load_format_d16_xy v[3:4], v0, null, s1 offen offset:4095
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+tbuffer_load_format_d16_xyz v[3:5], v0, null, s1 offen offset:4095
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+tbuffer_load_format_d16_xyzw v[3:6], v0, null, s1 offen offset:4095
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+tbuffer_load_format_x v3, v0, null, s1 offen offset:4095
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+tbuffer_load_format_xy v[3:4], v0, null, s1 offen offset:4095
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+tbuffer_load_format_xyz v[3:5], v0, null, s1 offen offset:4095
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+tbuffer_load_format_xyzw v[3:6], v0, null, s1 offen offset:4095
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+tbuffer_store_format_d16_x v3, v0, null, s1 offen offset:4095
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+tbuffer_store_format_d16_xy v[3:4], v0, null, s1 offen offset:4095
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+tbuffer_store_format_d16_xyz v[3:5], v0, null, s1 offen offset:4095
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+tbuffer_store_format_d16_xyzw v[3:6], v0, null, s1 offen offset:4095
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+tbuffer_store_format_x v3, v0, null, s1 offen offset:4095
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+tbuffer_store_format_xy v[3:4], v0, null, s1 offen offset:4095
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+tbuffer_store_format_xyz v[3:5], v0, null, s1 offen offset:4095
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+tbuffer_store_format_xyzw v[3:6], v0, null, s1 offen offset:4095
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
diff --git a/llvm/test/MC/AMDGPU/gfx10_asm_mubuf_err.s b/llvm/test/MC/AMDGPU/gfx10_asm_mubuf_err.s
new file mode 100644
index 0000000..bd7acfe
--- /dev/null
+++ b/llvm/test/MC/AMDGPU/gfx10_asm_mubuf_err.s
@@ -0,0 +1,160 @@
+// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1010 %s 2>&1 | FileCheck --check-prefixes=NOGFX10 --implicit-check-not=error: %s
+
+buffer_atomic_add v5, v0, null, s3 idxen
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_atomic_add_x2 v[5:6], v0, null, s3 idxen
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_atomic_and v5, v0, null, s3 idxen
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_atomic_and_x2 v[5:6], v0, null, s3 idxen
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_atomic_cmpswap v[5:6], v0, null, s3 idxen
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_atomic_cmpswap_x2 v[5:8], v0, null, s3 idxen
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_atomic_dec v5, v0, null, s3 idxen
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_atomic_dec_x2 v[5:6], v0, null, s3 idxen
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_atomic_inc v5, v0, null, s3 idxen
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_atomic_inc_x2 v[5:6], v0, null, s3 idxen
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_atomic_or v5, v0, null, s3 idxen
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_atomic_or_x2 v[5:6], v0, null, s3 idxen
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_atomic_smax v5, v0, null, s3 idxen
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_atomic_smax_x2 v[5:6], v0, null, s3 idxen
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_atomic_smin v5, v0, null, s3 idxen
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_atomic_smin_x2 v[5:6], v0, null, s3 idxen
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_atomic_sub v5, v0, null, s3 idxen
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_atomic_sub_x2 v[5:6], v0, null, s3 idxen
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_atomic_swap v5, v0, null, s3 idxen
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_atomic_swap_x2 v[5:6], v0, null, s3 idxen
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_atomic_umax v5, v0, null, s3 idxen
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_atomic_umax_x2 v[5:6], v0, null, s3 idxen
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_atomic_umin v5, v0, null, s3 idxen
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_atomic_umin_x2 v[5:6], v0, null, s3 idxen
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_atomic_xor v5, v0, null, s3 idxen
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_atomic_xor_x2 v[5:6], v0, null, s3 idxen
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_load_format_d16_x v3, v0, null, s1 offen offset:4095
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_load_format_x v3, v0, null, s1 offen offset:4095
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_load_format_xy v[3:4], v0, null, s1 offen offset:4095
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_load_format_xyz v[3:5], v0, null, s1 offen offset:4095
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_load_format_xyzw v[3:6], v0, null, s1 offen offset:4095
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_load_dword v5, v0, null, s3 offen offset:4095
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_load_dwordx2 v[5:6], v0, null, s3 offen offset:4095
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_load_dwordx3 v[5:7], v0, null, s3 offen offset:4095
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_load_dwordx4 v[5:8], v0, null, s3 offen offset:4095
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_load_sbyte v5, v0, null, s3 idxen offset:4095
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_load_sshort v5, v0, null, s3 idxen offset:4095
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_load_ubyte v5, v0, null, s3 idxen offset:4095
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_load_ushort v5, v0, null, s3 idxen offset:4095
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_store_byte v1, v0, null, s4 idxen offset:4095
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_store_dword v1, v0, null, s4 idxen offset:4095
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_store_dwordx2 v[1:2], v0, null, s4 idxen offset:4095
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_store_dwordx3 v[1:3], v0, null, s4 idxen offset:4095
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_store_dwordx4 v[1:4], v0, null, s4 idxen offset:4095
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_store_format_d16_hi_x v1, v0, null, s1 offen offset:4095
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_store_format_d16_x v1, v0, null, s1 offen offset:4095
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_store_format_d16_xy v1, v0, null, s1 offen offset:4095
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_store_format_d16_xyz v[1:2], v0, null, s1 offen offset:4095
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_store_format_d16_xyzw v[1:3], v0, null, s1 offen offset:4095
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_store_format_x v1, v0, null, s1 offen offset:4095
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_store_format_xy v[1:2], v0, null, s1 offen offset:4095
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_store_format_xyz v[1:3], v0, null, s1 offen offset:4095
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_store_format_xyzw v[1:4], v0, null, s1 offen offset:4095
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
diff --git a/llvm/test/MC/AMDGPU/gfx10_asm_smem.s b/llvm/test/MC/AMDGPU/gfx10_asm_smem.s
index b582de8..683a019 100644
--- a/llvm/test/MC/AMDGPU/gfx10_asm_smem.s
+++ b/llvm/test/MC/AMDGPU/gfx10_asm_smem.s
@@ -281,6 +281,22 @@ s_load_dwordx16 s[20:35], s[2:3], 0x1234 glc dlc
 s_load_dwordx16 s[20:35], s[2:3], s0 offset:0x12345 glc dlc
 // GFX10: encoding: [0x01,0x45,0x11,0xf4,0x45,0x23,0x01,0x00]
 
+// null as dst
+s_load_dword null, s[2:3], s0
+// GFX10: encoding: [0x41,0x1f,0x00,0xf4,0x00,0x00,0x00,0x00]
+
+s_load_dwordx2 null, s[2:3], s0
+// GFX10: encoding: [0x41,0x1f,0x04,0xf4,0x00,0x00,0x00,0x00]
+
+s_load_dwordx4 null, s[2:3], s0
+// GFX10: encoding: [0x41,0x1f,0x08,0xf4,0x00,0x00,0x00,0x00]
+
+s_load_dwordx8 null, s[2:3], s0
+// GFX10: encoding: [0x41,0x1f,0x0c,0xf4,0x00,0x00,0x00,0x00]
+
+s_load_dwordx16 null, s[2:3], s0
+// GFX10: encoding: [0x41,0x1f,0x10,0xf4,0x00,0x00,0x00,0x00]
+
 s_buffer_load_dword s5, s[4:7], s0
 // GFX10: encoding: [0x42,0x01,0x20,0xf4,0x00,0x00,0x00,0x00]
 
diff --git a/llvm/test/MC/AMDGPU/gfx10_asm_smem_err.s b/llvm/test/MC/AMDGPU/gfx10_asm_smem_err.s
new file mode 100644
index 0000000..74c283c
--- /dev/null
+++ b/llvm/test/MC/AMDGPU/gfx10_asm_smem_err.s
@@ -0,0 +1,88 @@
+// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1010 %s 2>&1 | FileCheck --check-prefixes=NOGFX10 --implicit-check-not=error: %s
+
+s_buffer_atomic_add s4, null, s101
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+s_buffer_atomic_add_x2 s[4:5], null, s101
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+s_buffer_atomic_and s4, null, s101
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+s_buffer_atomic_cmpswap s[4:5], null, s101
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+s_buffer_atomic_cmpswap_x2 s[4:7], null, s101
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+s_buffer_atomic_dec s4, null, s101
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+s_buffer_atomic_dec_x2 s[4:5], null, s101
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+s_buffer_atomic_inc s4, null, s101
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+s_buffer_atomic_inc_x2 s[4:5], null, s101
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+s_buffer_atomic_or s4, null, s101
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+s_buffer_atomic_or_x2 s[4:5], null, s101
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+s_buffer_atomic_smax s4, null, s101
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+s_buffer_atomic_smax_x2 s[4:5], null, s101
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+s_buffer_atomic_smin s4, null, s101
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+s_buffer_atomic_smin_x2 s[4:5], null, s101
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+s_buffer_atomic_sub s4, null, s101
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+s_buffer_atomic_sub_x2 s[4:5], null, s101
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+s_buffer_atomic_swap s4, null, s101
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+s_buffer_atomic_umax s4, null, s101
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+s_buffer_atomic_umax_x2 s[4:5], null, s101
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+s_buffer_atomic_umin s4, null, s101
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+s_buffer_atomic_umin_x2 s[4:5], null, s101
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+s_buffer_load_dword s4, null, s101
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+s_buffer_load_dwordx2 s[4:5], null, s101
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+s_buffer_load_dwordx4 s[4:7], null, s101
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+s_buffer_load_dwordx8 s[4:11], null, s101
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+s_buffer_load_dwordx16 s[4:19], null, s101
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+s_buffer_store_dword s4, null, s101
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+s_atc_probe_buffer 7, null, s2
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_mimg_err.s b/llvm/test/MC/AMDGPU/gfx11_asm_mimg_err.s
index 9bf72a1..2586198 100644
--- a/llvm/test/MC/AMDGPU/gfx11_asm_mimg_err.s
+++ b/llvm/test/MC/AMDGPU/gfx11_asm_mimg_err.s
@@ -400,3 +400,126 @@ image_store_pck v1, v[2:3], s[12:19] dmask:0x1 unorm
 image_store_mip_pck v1, v[2:3], s[12:19] dmask:0x0 unorm
 // NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
 
+// null is not allowed as SRSRC or SSAMP
+image_atomic_add v1, v[10:11], null dmask:0x1 dim:SQ_RSRC_IMG_2D
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+image_atomic_and v1, v[10:11], null dmask:0x1 dim:SQ_RSRC_IMG_2D
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+image_atomic_cmpswap v[0:1], v[10:11], null dmask:0x3 dim:SQ_RSRC_IMG_2D
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+image_atomic_dec v1, v[10:11], null dmask:0x1 dim:SQ_RSRC_IMG_2D
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+image_atomic_inc v1, v[10:11], null dmask:0x1 dim:SQ_RSRC_IMG_2D
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+image_atomic_or v1, v[10:11], null dmask:0x1 dim:SQ_RSRC_IMG_2D
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+image_atomic_smax v1, v[10:11], null dmask:0x1 dim:SQ_RSRC_IMG_2D
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+image_atomic_smin v1, v[10:11], null dmask:0x1 dim:SQ_RSRC_IMG_2D
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+image_atomic_sub v1, v[10:11], null dmask:0x1 dim:SQ_RSRC_IMG_2D
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+image_atomic_swap v1, v[10:11], null dmask:0x1 dim:SQ_RSRC_IMG_2D
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+image_atomic_umax v1, v[10:11], null dmask:0x1 dim:SQ_RSRC_IMG_2D
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+image_atomic_umin v1, v[10:11], null dmask:0x1 dim:SQ_RSRC_IMG_2D
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+image_atomic_xor v1, v[10:11], null dmask:0x1 dim:SQ_RSRC_IMG_2D
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+image_gather4 v[64:67], v32, null, s[4:11], dmask:0x1 dim:SQ_RSRC_IMG_1D
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+image_gather4 v[64:67], v32, s[4:11], null dmask:0x1 dim:SQ_RSRC_IMG_1D
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+image_gather4_b v[64:67], v[32:33], null, s[100:103] dmask:0x1 dim:SQ_RSRC_IMG_1D
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+image_gather4_b v[64:67], v[32:33], s[4:11], null dmask:0x1 dim:SQ_RSRC_IMG_1D
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+image_gather4_c v[64:67], v[32:33], null, s[100:103] dmask:0x1 dim:SQ_RSRC_IMG_1D
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+image_gather4_c v[64:67], v[32:33], s[4:11], null dmask:0x1 dim:SQ_RSRC_IMG_1D
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+image_gather4h v[64:67], v32, null, s[100:103] dmask:0x1 dim:SQ_RSRC_IMG_1D
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+image_gather4h v[64:67], v32, s[4:11], null dmask:0x1 dim:SQ_RSRC_IMG_1D
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+image_gather4_l v[64:67], v[32:33], null, s[100:103] dmask:0x1 dim:SQ_RSRC_IMG_1D
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+image_gather4_l v[64:67], v[32:33], s[4:11], null dmask:0x1 dim:SQ_RSRC_IMG_1D
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+image_gather4_o v[64:67], v[32:33], null, s[100:103] dmask:0x1 dim:SQ_RSRC_IMG_1D
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+image_gather4_o v[64:67], v[32:33], s[4:11], null dmask:0x1 dim:SQ_RSRC_IMG_1D
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+image_load v[4:7], v0, null dmask:0xf dim:SQ_RSRC_IMG_1D
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+image_store v[0:3], v[254:255], null dmask:0xf dim:SQ_RSRC_IMG_2D
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+image_sample v[5:6], v1, null, s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_1D
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+image_sample v[5:6], v1, s[8:15], null dmask:0x3 dim:SQ_RSRC_IMG_1D
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+image_sample_b v[5:6], v[1:2], null, s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_1D
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+image_sample_b v[5:6], v[1:2], s[8:15], null dmask:0x3 dim:SQ_RSRC_IMG_1D
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+image_sample_c v[5:6], v[1:2], null, s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_1D
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+image_sample_c v[5:6], v[1:2], s[8:15], null dmask:0x3 dim:SQ_RSRC_IMG_1D
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+image_sample_d v[5:6], v[1:3], null, s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_1D
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+image_sample_d v[5:6], v[1:3], s[8:15], null dmask:0x3 dim:SQ_RSRC_IMG_1D
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+image_sample_l v[5:6], v[1:2], null, s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_1D
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+image_sample_l v[5:6], v[1:2], s[8:15], null dmask:0x3 dim:SQ_RSRC_IMG_1D
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+image_sample_o v[5:6], v[1:2], null, s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_1D
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+image_sample_o v[5:6], v[1:2], s[8:15], null dmask:0x3 dim:SQ_RSRC_IMG_1D
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+image_bvh_intersect_ray v[4:7], v[9:19], null
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+image_bvh64_intersect_ray v[4:7], v[9:20], null
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_mtbuf_err.s b/llvm/test/MC/AMDGPU/gfx11_asm_mtbuf_err.s
new file mode 100644
index 0000000..3b69835
--- /dev/null
+++ b/llvm/test/MC/AMDGPU/gfx11_asm_mtbuf_err.s
@@ -0,0 +1,49 @@
+// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 %s 2>&1 | FileCheck --check-prefixes=NOGFX11 --implicit-check-not=error: %s
+
+tbuffer_load_format_d16_x v3, v0, null, s1 offen offset:4095
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+tbuffer_load_format_d16_xy v[3:4], v0, null, s1 offen offset:4095
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+tbuffer_load_format_d16_xyz v[3:5], v0, null, s1 offen offset:4095
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+tbuffer_load_format_d16_xyzw v[3:6], v0, null, s1 offen offset:4095
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+tbuffer_load_format_x v3, v0, null, s1 offen offset:4095
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+tbuffer_load_format_xy v[3:4], v0, null, s1 offen offset:4095
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+tbuffer_load_format_xyz v[3:5], v0, null, s1 offen offset:4095
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+tbuffer_load_format_xyzw v[3:6], v0, null, s1 offen offset:4095
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+tbuffer_store_format_d16_x v3, v0, null, s1 offen offset:4095
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+tbuffer_store_format_d16_xy v[3:4], v0, null, s1 offen offset:4095
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+tbuffer_store_format_d16_xyz v[3:5], v0, null, s1 offen offset:4095
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+tbuffer_store_format_d16_xyzw v[3:6], v0, null, s1 offen offset:4095
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+tbuffer_store_format_x v3, v0, null, s1 offen offset:4095
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+tbuffer_store_format_xy v[3:4], v0, null, s1 offen offset:4095
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+tbuffer_store_format_xyz v[3:5], v0, null, s1 offen offset:4095
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+tbuffer_store_format_xyzw v[3:6], v0, null, s1 offen offset:4095
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_mubuf_err.s b/llvm/test/MC/AMDGPU/gfx11_asm_mubuf_err.s
new file mode 100644
index 0000000..d3d74467
--- /dev/null
+++ b/llvm/test/MC/AMDGPU/gfx11_asm_mubuf_err.s
@@ -0,0 +1,229 @@
+// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 %s 2>&1 | FileCheck --check-prefixes=NOGFX11 --implicit-check-not=error: %s
+
+buffer_atomic_add_f32 v5, v0, null, s3 idxen
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_atomic_add_u32 v5, v0, null, s3 idxen
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_atomic_add_u64 v[5:6], v0, null, s3 idxen
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_atomic_and_b32 v5, v0, null, s3 idxen
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_atomic_and_b64 v[5:6], v0, null, s3 idxen
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_atomic_cmpswap_b32 v[5:6], v0, null, s3 idxen
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_atomic_cmpswap_b64 v[5:8], v0, null, s3 idxen
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_atomic_cmpswap_f32 v[5:6], v0, null, s3 idxen
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_atomic_csub_u32 v5, v0, null, s3 idxen
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_atomic_dec_u32 v5, v0, null, s3 idxen
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_atomic_dec_u64 v[5:6], v0, null, s3 idxen
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_atomic_inc_u32 v5, v0, null, s3 idxen
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_atomic_inc_u64 v[5:6], v0, null, s3 idxen
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_atomic_max_f32 v5, v0, null, s3 idxen
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_atomic_max_i32 v5, v0, null, s3 idxen
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_atomic_max_i64 v[5:6], v0, null, s3 idxen
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_atomic_max_u32 v5, v0, null, s3 idxen
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_atomic_max_u64 v[5:6], v0, null, s3 idxen
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_atomic_min_f32 v5, v0, null, s3 idxen
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_atomic_min_i32 v5, v0, null, s3 idxen
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_atomic_min_i64 v[5:6], v0, null, s3 idxen
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_atomic_min_u32 v5, v0, null, s3 idxen
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_atomic_min_u64 v[5:6], v0, null, s3 idxen
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_atomic_or_b32 v5, v0, null, s3 idxen
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_atomic_or_b64 v[5:6], v0, null, s3 idxen
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_atomic_sub_u32 v5, v0, null, s3 idxen
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_atomic_sub_u64 v[5:6], v0, null, s3 idxen
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_atomic_swap_b32 v5, v0, null, s3 idxen
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_atomic_swap_b64 v[5:6], v0, null, s3 idxen
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_atomic_xor_b32 v5, v0, null, s3 idxen
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_atomic_xor_b64 v[5:6], v0, null, s3 idxen
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_load_b128 v[5:8], v0, null, s3 offen offset:4095
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_load_b32 v5, v0, null, s3 offen offset:4095
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_store_b64 v[1:2], v0, null, s4 idxen offset:4095
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_store_b96 v[1:3], v0, null, s4 idxen offset:4095
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_load_d16_b16 v5, v0, null, s3 offen offset:4095
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_load_d16_format_x v3, v0, null, s1 offen offset:4095
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_load_d16_format_xy v3, v0, null, s1 offen offset:4095
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_load_d16_format_xyz v[3:4], v0, null, s1 offen offset:4095
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_load_d16_format_xyzw v[3:4], v0, null, s1 offen offset:4095
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_load_d16_hi_b16 v3, v0, null, s1 offen offset:4095
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_load_d16_hi_format_x v3, v0, null, s1 offen offset:4095
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_load_d16_hi_i8 v3, v0, null, s1 offen offset:4095
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_load_d16_hi_u8 v3, v0, null, s1 offen offset:4095
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_load_d16_i8 v3, v0, null, s1 offen offset:4095
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_load_d16_u8 v3, v0, null, s1 offen offset:4095
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_load_format_x v3, v0, null, s1 offen offset:4095
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_load_format_xy v[3:4], v0, null, s1 offen offset:4095
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_load_format_xyz v[3:5], v0, null, s1 offen offset:4095
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_load_format_xyzw v[3:6], v0, null, s1 offen offset:4095
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_load_i16 v3, v0, null, s1 offen offset:4095
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_load_i8 v3, v0, null, s1 offen offset:4095
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_load_lds_b32 v3, v0, null, s1 offen offset:4095
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_load_lds_format_x v3, v0, null, s1 offen offset:4095
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_load_lds_i16 v3, v0, null, s1 offen offset:4095
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_load_lds_i8 v3, v0, null, s1 offen offset:4095
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_load_lds_u16 v3, v0, null, s1 offen offset:4095
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_load_lds_u8 v3, v0, null, s1 offen offset:4095
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_load_u16 v3, v0, null, s1 offen offset:4095
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_load_u8 v3, v0, null, s1 offen offset:4095
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_store_b16 v3, v0, null, s1 offen offset:4095
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_store_b32 v3, v0, null, s1 offen offset:4095
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_store_b64 v[3:4], v0, null, s1 offen offset:4095
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_store_b8 v3, v0, null, s1 offen offset:4095
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_store_b96 v[3:5], v0, null, s1 offen offset:4095
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_store_d16_format_x v3, v0, null, s1 offen offset:4095
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_store_d16_format_xy v3, v0, null, s1 offen offset:4095
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_store_d16_format_xyz v[3:4], v0, null, s1 offen offset:4095
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_store_d16_format_xyzw v[3:4], v0, null, s1 offen offset:4095
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_store_d16_hi_b16 v3, v0, null, s1 offen offset:4095
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_store_d16_hi_b8 v3, v0, null, s1 offen offset:4095
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_store_d16_hi_format_x v3, v0, null, s1 offen offset:4095
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_store_format_x v1, v0, null, s1 offen offset:4095
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_store_format_xy v[1:2], v0, null, s1 offen offset:4095
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_store_format_xyz v[1:3], v0, null, s1 offen offset:4095
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_store_format_xyzw v[1:4], v0, null, s1 offen offset:4095
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_smem.s b/llvm/test/MC/AMDGPU/gfx11_asm_smem.s
index 1d6b947..e071c67 100644
--- a/llvm/test/MC/AMDGPU/gfx11_asm_smem.s
+++ b/llvm/test/MC/AMDGPU/gfx11_asm_smem.s
@@ -239,6 +239,22 @@ s_load_b512 s[20:35], s[2:3], s0 glc dlc
 s_load_b512 s[20:35], s[2:3], 0x1234 glc dlc
 // GFX11: encoding: [0x01,0x65,0x10,0xf4,0x34,0x12,0x00,0xf8]
 
+// null as dst
+s_load_b32 null, s[2:3], s0
+// GFX11: encoding: [0x01,0x1f,0x00,0xf4,0x00,0x00,0x00,0x00]
+
+s_load_b64 null, s[2:3], s0
+// GFX11: encoding: [0x01,0x1f,0x04,0xf4,0x00,0x00,0x00,0x00]
+
+s_load_b128 null, s[2:3], s0
+// GFX11: encoding: [0x01,0x1f,0x08,0xf4,0x00,0x00,0x00,0x00]
+
+s_load_b256 null, s[2:3], s0
+// GFX11: encoding: [0x01,0x1f,0x0c,0xf4,0x00,0x00,0x00,0x00]
+
+s_load_b512 null, s[2:3], s0
+// GFX11: encoding: [0x01,0x1f,0x10,0xf4,0x00,0x00,0x00,0x00]
+
 s_buffer_load_b32 s5, s[4:7], s0
 // GFX11: encoding: [0x42,0x01,0x20,0xf4,0x00,0x00,0x00,0x00]
 
diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_smem_err.s b/llvm/test/MC/AMDGPU/gfx11_asm_smem_err.s
new file mode 100644
index 0000000..7dd6ded
--- /dev/null
+++ b/llvm/test/MC/AMDGPU/gfx11_asm_smem_err.s
@@ -0,0 +1,34 @@
+// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 %s 2>&1 | FileCheck --check-prefixes=NOGFX11 --implicit-check-not=error: %s
+
+s_buffer_load_b32 s4, null, s101
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+s_buffer_load_b64 s4, null, s101
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+s_buffer_load_b128 s4, null, s101
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+s_buffer_load_b256 s4, null, s101
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+s_buffer_load_b512 s4, null, s101
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+s_buffer_load_dword s4, null, s101
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+s_buffer_load_dwordx2 s[4:5], null, s101
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+s_buffer_load_dwordx4 s[4:7], null, s101
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+s_buffer_load_dwordx8 s[4:11], null, s101
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+s_buffer_load_dwordx16 s[4:19], null, s101
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+s_atc_probe_buffer 7, null, s2
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop1.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop1.s
index 4e4dc66..1aefd1f 100644
--- a/llvm/test/MC/AMDGPU/gfx11_asm_vop1.s
+++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop1.s
@@ -269,50 +269,65 @@ v_clz_i32_u32 v5, src_scc
 v_clz_i32_u32 v255, 0xaf123456
 // GFX11: v_clz_i32_u32_e32 v255, 0xaf123456      ; encoding: [0xff,0x72,0xfe,0x7f,0x56,0x34,0x12,0xaf]
 
-v_cos_f16 v5, v1
-// GFX11: v_cos_f16_e32 v5, v1                    ; encoding: [0x01,0xc3,0x0a,0x7e]
+v_cos_f16 v5.l, v1.l
+// GFX11: v_cos_f16_e32 v5.l, v1.l                ; encoding: [0x01,0xc3,0x0a,0x7e]
 
-v_cos_f16 v5, v127
-// GFX11: v_cos_f16_e32 v5, v127                  ; encoding: [0x7f,0xc3,0x0a,0x7e]
+v_cos_f16 v5.l, v127.l
+// GFX11: v_cos_f16_e32 v5.l, v127.l              ; encoding: [0x7f,0xc3,0x0a,0x7e]
 
-v_cos_f16 v5, s1
-// GFX11: v_cos_f16_e32 v5, s1                    ; encoding: [0x01,0xc2,0x0a,0x7e]
+v_cos_f16 v5.l, s1
+// GFX11: v_cos_f16_e32 v5.l, s1                  ; encoding: [0x01,0xc2,0x0a,0x7e]
 
-v_cos_f16 v5, s105
-// GFX11: v_cos_f16_e32 v5, s105                  ; encoding: [0x69,0xc2,0x0a,0x7e]
+v_cos_f16 v5.l, s105
+// GFX11: v_cos_f16_e32 v5.l, s105                ; encoding: [0x69,0xc2,0x0a,0x7e]
 
-v_cos_f16 v5, vcc_lo
-// GFX11: v_cos_f16_e32 v5, vcc_lo                ; encoding: [0x6a,0xc2,0x0a,0x7e]
+v_cos_f16 v5.l, vcc_lo
+// GFX11: v_cos_f16_e32 v5.l, vcc_lo              ; encoding: [0x6a,0xc2,0x0a,0x7e]
 
-v_cos_f16 v5, vcc_hi
-// GFX11: v_cos_f16_e32 v5, vcc_hi                ; encoding: [0x6b,0xc2,0x0a,0x7e]
+v_cos_f16 v5.l, vcc_hi
+// GFX11: v_cos_f16_e32 v5.l, vcc_hi              ; encoding: [0x6b,0xc2,0x0a,0x7e]
 
-v_cos_f16 v5, ttmp15
-// GFX11: v_cos_f16_e32 v5, ttmp15                ; encoding: [0x7b,0xc2,0x0a,0x7e]
+v_cos_f16 v5.l, ttmp15
+// GFX11: v_cos_f16_e32 v5.l, ttmp15              ; encoding: [0x7b,0xc2,0x0a,0x7e]
 
-v_cos_f16 v5, m0
-// GFX11: v_cos_f16_e32 v5, m0                    ; encoding: [0x7d,0xc2,0x0a,0x7e]
+v_cos_f16 v5.l, m0
+// GFX11: v_cos_f16_e32 v5.l, m0                  ; encoding: [0x7d,0xc2,0x0a,0x7e]
 
-v_cos_f16 v5, exec_lo
-// GFX11: v_cos_f16_e32 v5, exec_lo               ; encoding: [0x7e,0xc2,0x0a,0x7e]
+v_cos_f16 v5.l, exec_lo
+// GFX11: v_cos_f16_e32 v5.l, exec_lo             ; encoding: [0x7e,0xc2,0x0a,0x7e]
 
-v_cos_f16 v5, exec_hi
-// GFX11: v_cos_f16_e32 v5, exec_hi               ; encoding: [0x7f,0xc2,0x0a,0x7e]
+v_cos_f16 v5.l, exec_hi
+// GFX11: v_cos_f16_e32 v5.l, exec_hi             ; encoding: [0x7f,0xc2,0x0a,0x7e]
 
-v_cos_f16 v5, null
-// GFX11: v_cos_f16_e32 v5, null                  ; encoding: [0x7c,0xc2,0x0a,0x7e]
+v_cos_f16 v5.l, null
+// GFX11: v_cos_f16_e32 v5.l, null                ; encoding: [0x7c,0xc2,0x0a,0x7e]
 
-v_cos_f16 v5, -1
-// GFX11: v_cos_f16_e32 v5, -1                    ; encoding: [0xc1,0xc2,0x0a,0x7e]
+v_cos_f16 v5.l, -1
+// GFX11: v_cos_f16_e32 v5.l, -1                  ; encoding: [0xc1,0xc2,0x0a,0x7e]
 
-v_cos_f16 v5, 0.5
-// GFX11: v_cos_f16_e32 v5, 0.5                   ; encoding: [0xf0,0xc2,0x0a,0x7e]
+v_cos_f16 v5.l, 0.5
+// GFX11: v_cos_f16_e32 v5.l, 0.5                 ; encoding: [0xf0,0xc2,0x0a,0x7e]
 
-v_cos_f16 v5, src_scc
-// GFX11: v_cos_f16_e32 v5, src_scc               ; encoding: [0xfd,0xc2,0x0a,0x7e]
+v_cos_f16 v5.l, src_scc
+// GFX11: v_cos_f16_e32 v5.l, src_scc             ; encoding: [0xfd,0xc2,0x0a,0x7e]
 
-v_cos_f16 v127, 0xfe0b
-// GFX11: v_cos_f16_e32 v127, 0xfe0b              ; encoding: [0xff,0xc2,0xfe,0x7e,0x0b,0xfe,0x00,0x00]
+v_cos_f16 v127.l, 0xfe0b
+// GFX11: v_cos_f16_e32 v127.l, 0xfe0b            ; encoding: [0xff,0xc2,0xfe,0x7e,0x0b,0xfe,0x00,0x00]
+
+v_cos_f16 v5.l, v1.h
+// GFX11: v_cos_f16_e32 v5.l, v1.h                ; encoding: [0x81,0xc3,0x0a,0x7e]
+
+v_cos_f16 v5.l, v127.h
+// GFX11: v_cos_f16_e32 v5.l, v127.h              ; encoding: [0xff,0xc3,0x0a,0x7e]
+
+v_cos_f16 v127.l, 0.5
+// GFX11: v_cos_f16_e32 v127.l, 0.5               ; encoding: [0xf0,0xc2,0xfe,0x7e]
+
+v_cos_f16 v5.h, src_scc
+// GFX11: v_cos_f16_e32 v5.h, src_scc             ; encoding: [0xfd,0xc2,0x0a,0x7f]
+
+v_cos_f16 v127.h, 0xfe0b
+// GFX11: v_cos_f16_e32 v127.h, 0xfe0b            ; encoding: [0xff,0xc2,0xfe,0x7f,0x0b,0xfe,0x00,0x00]
 
 v_cos_f32 v5, v1
 // GFX11: v_cos_f32_e32 v5, v1                    ; encoding: [0x01,0x6d,0x0a,0x7e]
@@ -1271,11 +1286,11 @@ v_cvt_i32_f64 v5, src_scc
 v_cvt_i32_f64 v255, 0xaf123456
 // GFX11: v_cvt_i32_f64_e32 v255, 0xaf123456      ; encoding: [0xff,0x06,0xfe,0x7f,0x56,0x34,0x12,0xaf]
 
-v_cvt_i32_i16 v5, v1
-// GFX11: v_cvt_i32_i16_e32 v5, v1                ; encoding: [0x01,0xd5,0x0a,0x7e]
+v_cvt_i32_i16 v5, v1.l
+// GFX11: v_cvt_i32_i16_e32 v5, v1.l              ; encoding: [0x01,0xd5,0x0a,0x7e]
 
-v_cvt_i32_i16 v5, v127
-// GFX11: v_cvt_i32_i16_e32 v5, v127              ; encoding: [0x7f,0xd5,0x0a,0x7e]
+v_cvt_i32_i16 v5, v127.l
+// GFX11: v_cvt_i32_i16_e32 v5, v127.l            ; encoding: [0x7f,0xd5,0x0a,0x7e]
 
 v_cvt_i32_i16 v5, s1
 // GFX11: v_cvt_i32_i16_e32 v5, s1                ; encoding: [0x01,0xd4,0x0a,0x7e]
@@ -1316,6 +1331,12 @@ v_cvt_i32_i16 v5, src_scc
 v_cvt_i32_i16 v255, 0xfe0b
 // GFX11: v_cvt_i32_i16_e32 v255, 0xfe0b          ; encoding: [0xff,0xd4,0xfe,0x7f,0x0b,0xfe,0x00,0x00]
 
+v_cvt_i32_i16 v5, v1.h
+// GFX11: v_cvt_i32_i16_e32 v5, v1.h              ; encoding: [0x81,0xd5,0x0a,0x7e]
+
+v_cvt_i32_i16 v5, v127.h
+// GFX11: v_cvt_i32_i16_e32 v5, v127.h            ; encoding: [0xff,0xd5,0x0a,0x7e]
+
 v_cvt_nearest_i32_f32 v5, v1
 // GFX11: v_cvt_nearest_i32_f32_e32 v5, v1        ; encoding: [0x01,0x19,0x0a,0x7e]
 
@@ -1685,11 +1706,11 @@ v_cvt_u32_f64 v5, src_scc
 v_cvt_u32_f64 v255, 0xaf123456
 // GFX11: v_cvt_u32_f64_e32 v255, 0xaf123456      ; encoding: [0xff,0x2a,0xfe,0x7f,0x56,0x34,0x12,0xaf]
 
-v_cvt_u32_u16 v5, v1
-// GFX11: v_cvt_u32_u16_e32 v5, v1                ; encoding: [0x01,0xd7,0x0a,0x7e]
+v_cvt_u32_u16 v5, v1.l
+// GFX11: v_cvt_u32_u16_e32 v5, v1.l              ; encoding: [0x01,0xd7,0x0a,0x7e]
 
-v_cvt_u32_u16 v5, v127
-// GFX11: v_cvt_u32_u16_e32 v5, v127              ; encoding: [0x7f,0xd7,0x0a,0x7e]
+v_cvt_u32_u16 v5, v127.l
+// GFX11: v_cvt_u32_u16_e32 v5, v127.l            ; encoding: [0x7f,0xd7,0x0a,0x7e]
 
 v_cvt_u32_u16 v5, s1
 // GFX11: v_cvt_u32_u16_e32 v5, s1                ; encoding: [0x01,0xd6,0x0a,0x7e]
@@ -1730,6 +1751,12 @@ v_cvt_u32_u16 v5, src_scc
 v_cvt_u32_u16 v255, 0xfe0b
 // GFX11: v_cvt_u32_u16_e32 v255, 0xfe0b          ; encoding: [0xff,0xd6,0xfe,0x7f,0x0b,0xfe,0x00,0x00]
 
+v_cvt_u32_u16 v5, v1.h
+// GFX11: v_cvt_u32_u16_e32 v5, v1.h              ; encoding: [0x81,0xd7,0x0a,0x7e]
+
+v_cvt_u32_u16 v5, v127.h
+// GFX11: v_cvt_u32_u16_e32 v5, v127.h            ; encoding: [0xff,0xd7,0x0a,0x7e]
+
 v_exp_f16 v5.l, v1.l
 // GFX11: v_exp_f16_e32 v5.l, v1.l                ; encoding: [0x01,0xb1,0x0a,0x7e]
 
@@ -2093,50 +2120,65 @@ v_floor_f64 v[5:6], src_scc
 v_floor_f64 v[254:255], 0xaf123456
 // GFX11: v_floor_f64_e32 v[254:255], 0xaf123456  ; encoding: [0xff,0x34,0xfc,0x7f,0x56,0x34,0x12,0xaf]
 
-v_fract_f16 v5, v1
-// GFX11: v_fract_f16_e32 v5, v1                  ; encoding: [0x01,0xbf,0x0a,0x7e]
+v_fract_f16 v5.l, v1.l
+// GFX11: v_fract_f16_e32 v5.l, v1.l              ; encoding: [0x01,0xbf,0x0a,0x7e]
+
+v_fract_f16 v5.l, v127.l
+// GFX11: v_fract_f16_e32 v5.l, v127.l            ; encoding: [0x7f,0xbf,0x0a,0x7e]
+
+v_fract_f16 v5.l, s1
+// GFX11: v_fract_f16_e32 v5.l, s1                ; encoding: [0x01,0xbe,0x0a,0x7e]
 
-v_fract_f16 v5, v127
-// GFX11: v_fract_f16_e32 v5, v127                ; encoding: [0x7f,0xbf,0x0a,0x7e]
+v_fract_f16 v5.l, s105
+// GFX11: v_fract_f16_e32 v5.l, s105              ; encoding: [0x69,0xbe,0x0a,0x7e]
 
-v_fract_f16 v5, s1
-// GFX11: v_fract_f16_e32 v5, s1                  ; encoding: [0x01,0xbe,0x0a,0x7e]
+v_fract_f16 v5.l, vcc_lo
+// GFX11: v_fract_f16_e32 v5.l, vcc_lo            ; encoding: [0x6a,0xbe,0x0a,0x7e]
 
-v_fract_f16 v5, s105
-// GFX11: v_fract_f16_e32 v5, s105                ; encoding: [0x69,0xbe,0x0a,0x7e]
+v_fract_f16 v5.l, vcc_hi
+// GFX11: v_fract_f16_e32 v5.l, vcc_hi            ; encoding: [0x6b,0xbe,0x0a,0x7e]
 
-v_fract_f16 v5, vcc_lo
-// GFX11: v_fract_f16_e32 v5, vcc_lo              ; encoding: [0x6a,0xbe,0x0a,0x7e]
+v_fract_f16 v5.l, ttmp15
+// GFX11: v_fract_f16_e32 v5.l, ttmp15            ; encoding: [0x7b,0xbe,0x0a,0x7e]
 
-v_fract_f16 v5, vcc_hi
-// GFX11: v_fract_f16_e32 v5, vcc_hi              ; encoding: [0x6b,0xbe,0x0a,0x7e]
+v_fract_f16 v5.l, m0
+// GFX11: v_fract_f16_e32 v5.l, m0                ; encoding: [0x7d,0xbe,0x0a,0x7e]
 
-v_fract_f16 v5, ttmp15
-// GFX11: v_fract_f16_e32 v5, ttmp15              ; encoding: [0x7b,0xbe,0x0a,0x7e]
+v_fract_f16 v5.l, exec_lo
+// GFX11: v_fract_f16_e32 v5.l, exec_lo           ; encoding: [0x7e,0xbe,0x0a,0x7e]
 
-v_fract_f16 v5, m0
-// GFX11: v_fract_f16_e32 v5, m0                  ; encoding: [0x7d,0xbe,0x0a,0x7e]
+v_fract_f16 v5.l, exec_hi
+// GFX11: v_fract_f16_e32 v5.l, exec_hi           ; encoding: [0x7f,0xbe,0x0a,0x7e]
 
-v_fract_f16 v5, exec_lo
-// GFX11: v_fract_f16_e32 v5, exec_lo             ; encoding: [0x7e,0xbe,0x0a,0x7e]
+v_fract_f16 v5.l, null
+// GFX11: v_fract_f16_e32 v5.l, null              ; encoding: [0x7c,0xbe,0x0a,0x7e]
 
-v_fract_f16 v5, exec_hi
-// GFX11: v_fract_f16_e32 v5, exec_hi             ; encoding: [0x7f,0xbe,0x0a,0x7e]
+v_fract_f16 v5.l, -1
+// GFX11: v_fract_f16_e32 v5.l, -1                ; encoding: [0xc1,0xbe,0x0a,0x7e]
 
-v_fract_f16 v5, null
-// GFX11: v_fract_f16_e32 v5, null                ; encoding: [0x7c,0xbe,0x0a,0x7e]
+v_fract_f16 v5.l, 0.5
+// GFX11: v_fract_f16_e32 v5.l, 0.5               ; encoding: [0xf0,0xbe,0x0a,0x7e]
 
-v_fract_f16 v5, -1
-// GFX11: v_fract_f16_e32 v5, -1                  ; encoding: [0xc1,0xbe,0x0a,0x7e]
+v_fract_f16 v5.l, src_scc
+// GFX11: v_fract_f16_e32 v5.l, src_scc           ; encoding: [0xfd,0xbe,0x0a,0x7e]
 
-v_fract_f16 v5, 0.5
-// GFX11: v_fract_f16_e32 v5, 0.5                 ; encoding: [0xf0,0xbe,0x0a,0x7e]
+v_fract_f16 v127.l, 0xfe0b
+// GFX11: v_fract_f16_e32 v127.l, 0xfe0b          ; encoding: [0xff,0xbe,0xfe,0x7e,0x0b,0xfe,0x00,0x00]
 
-v_fract_f16 v5, src_scc
-// GFX11: v_fract_f16_e32 v5, src_scc             ; encoding: [0xfd,0xbe,0x0a,0x7e]
+v_fract_f16 v5.l, v1.h
+// GFX11: v_fract_f16_e32 v5.l, v1.h              ; encoding: [0x81,0xbf,0x0a,0x7e]
 
-v_fract_f16 v127, 0xfe0b
-// GFX11: v_fract_f16_e32 v127, 0xfe0b            ; encoding: [0xff,0xbe,0xfe,0x7e,0x0b,0xfe,0x00,0x00]
+v_fract_f16 v5.l, v127.h
+// GFX11: v_fract_f16_e32 v5.l, v127.h            ; encoding: [0xff,0xbf,0x0a,0x7e]
+
+v_fract_f16 v127.l, 0.5
+// GFX11: v_fract_f16_e32 v127.l, 0.5             ; encoding: [0xf0,0xbe,0xfe,0x7e]
+
+v_fract_f16 v5.h, src_scc
+// GFX11: v_fract_f16_e32 v5.h, src_scc           ; encoding: [0xfd,0xbe,0x0a,0x7f]
+
+v_fract_f16 v127.h, 0xfe0b
+// GFX11: v_fract_f16_e32 v127.h, 0xfe0b          ; encoding: [0xff,0xbe,0xfe,0x7f,0x0b,0xfe,0x00,0x00]
 
 v_fract_f32 v5, v1
 // GFX11: v_fract_f32_e32 v5, v1                  ; encoding: [0x01,0x41,0x0a,0x7e]
@@ -2351,50 +2393,65 @@ v_frexp_exp_i32_f64 v5, src_scc
 v_frexp_exp_i32_f64 v255, 0xaf123456
 // GFX11: v_frexp_exp_i32_f64_e32 v255, 0xaf123456 ; encoding: [0xff,0x78,0xfe,0x7f,0x56,0x34,0x12,0xaf]
 
-v_frexp_mant_f16 v5, v1
-// GFX11: v_frexp_mant_f16_e32 v5, v1             ; encoding: [0x01,0xb3,0x0a,0x7e]
+v_frexp_mant_f16 v5.l, v1.l
+// GFX11: v_frexp_mant_f16_e32 v5.l, v1.l         ; encoding: [0x01,0xb3,0x0a,0x7e]
+
+v_frexp_mant_f16 v5.l, v127.l
+// GFX11: v_frexp_mant_f16_e32 v5.l, v127.l       ; encoding: [0x7f,0xb3,0x0a,0x7e]
+
+v_frexp_mant_f16 v5.l, s1
+// GFX11: v_frexp_mant_f16_e32 v5.l, s1           ; encoding: [0x01,0xb2,0x0a,0x7e]
 
-v_frexp_mant_f16 v5, v127
-// GFX11: v_frexp_mant_f16_e32 v5, v127           ; encoding: [0x7f,0xb3,0x0a,0x7e]
+v_frexp_mant_f16 v5.l, s105
+// GFX11: v_frexp_mant_f16_e32 v5.l, s105         ; encoding: [0x69,0xb2,0x0a,0x7e]
 
-v_frexp_mant_f16 v5, s1
-// GFX11: v_frexp_mant_f16_e32 v5, s1             ; encoding: [0x01,0xb2,0x0a,0x7e]
+v_frexp_mant_f16 v5.l, vcc_lo
+// GFX11: v_frexp_mant_f16_e32 v5.l, vcc_lo       ; encoding: [0x6a,0xb2,0x0a,0x7e]
 
-v_frexp_mant_f16 v5, s105
-// GFX11: v_frexp_mant_f16_e32 v5, s105           ; encoding: [0x69,0xb2,0x0a,0x7e]
+v_frexp_mant_f16 v5.l, vcc_hi
+// GFX11: v_frexp_mant_f16_e32 v5.l, vcc_hi       ; encoding: [0x6b,0xb2,0x0a,0x7e]
 
-v_frexp_mant_f16 v5, vcc_lo
-// GFX11: v_frexp_mant_f16_e32 v5, vcc_lo         ; encoding: [0x6a,0xb2,0x0a,0x7e]
+v_frexp_mant_f16 v5.l, ttmp15
+// GFX11: v_frexp_mant_f16_e32 v5.l, ttmp15       ; encoding: [0x7b,0xb2,0x0a,0x7e]
 
-v_frexp_mant_f16 v5, vcc_hi
-// GFX11: v_frexp_mant_f16_e32 v5, vcc_hi         ; encoding: [0x6b,0xb2,0x0a,0x7e]
+v_frexp_mant_f16 v5.l, m0
+// GFX11: v_frexp_mant_f16_e32 v5.l, m0           ; encoding: [0x7d,0xb2,0x0a,0x7e]
 
-v_frexp_mant_f16 v5, ttmp15
-// GFX11: v_frexp_mant_f16_e32 v5, ttmp15         ; encoding: [0x7b,0xb2,0x0a,0x7e]
+v_frexp_mant_f16 v5.l, exec_lo
+// GFX11: v_frexp_mant_f16_e32 v5.l, exec_lo      ; encoding: [0x7e,0xb2,0x0a,0x7e]
 
-v_frexp_mant_f16 v5, m0
-// GFX11: v_frexp_mant_f16_e32 v5, m0             ; encoding: [0x7d,0xb2,0x0a,0x7e]
+v_frexp_mant_f16 v5.l, exec_hi
+// GFX11: v_frexp_mant_f16_e32 v5.l, exec_hi      ; encoding: [0x7f,0xb2,0x0a,0x7e]
 
-v_frexp_mant_f16 v5, exec_lo
-// GFX11: v_frexp_mant_f16_e32 v5, exec_lo        ; encoding: [0x7e,0xb2,0x0a,0x7e]
+v_frexp_mant_f16 v5.l, null
+// GFX11: v_frexp_mant_f16_e32 v5.l, null         ; encoding: [0x7c,0xb2,0x0a,0x7e]
 
-v_frexp_mant_f16 v5, exec_hi
-// GFX11: v_frexp_mant_f16_e32 v5, exec_hi        ; encoding: [0x7f,0xb2,0x0a,0x7e]
+v_frexp_mant_f16 v5.l, -1
+// GFX11: v_frexp_mant_f16_e32 v5.l, -1           ; encoding: [0xc1,0xb2,0x0a,0x7e]
 
-v_frexp_mant_f16 v5, null
-// GFX11: v_frexp_mant_f16_e32 v5, null           ; encoding: [0x7c,0xb2,0x0a,0x7e]
+v_frexp_mant_f16 v5.l, 0.5
+// GFX11: v_frexp_mant_f16_e32 v5.l, 0.5          ; encoding: [0xf0,0xb2,0x0a,0x7e]
 
-v_frexp_mant_f16 v5, -1
-// GFX11: v_frexp_mant_f16_e32 v5, -1             ; encoding: [0xc1,0xb2,0x0a,0x7e]
+v_frexp_mant_f16 v5.l, src_scc
+// GFX11: v_frexp_mant_f16_e32 v5.l, src_scc      ; encoding: [0xfd,0xb2,0x0a,0x7e]
 
-v_frexp_mant_f16 v5, 0.5
-// GFX11: v_frexp_mant_f16_e32 v5, 0.5            ; encoding: [0xf0,0xb2,0x0a,0x7e]
+v_frexp_mant_f16 v127.l, 0xfe0b
+// GFX11: v_frexp_mant_f16_e32 v127.l, 0xfe0b     ; encoding: [0xff,0xb2,0xfe,0x7e,0x0b,0xfe,0x00,0x00]
 
-v_frexp_mant_f16 v5, src_scc
-// GFX11: v_frexp_mant_f16_e32 v5, src_scc        ; encoding: [0xfd,0xb2,0x0a,0x7e]
+v_frexp_mant_f16 v5.l, v1.h
+// GFX11: v_frexp_mant_f16_e32 v5.l, v1.h         ; encoding: [0x81,0xb3,0x0a,0x7e]
 
-v_frexp_mant_f16 v127, 0xfe0b
-// GFX11: v_frexp_mant_f16_e32 v127, 0xfe0b       ; encoding: [0xff,0xb2,0xfe,0x7e,0x0b,0xfe,0x00,0x00]
+v_frexp_mant_f16 v5.l, v127.h
+// GFX11: v_frexp_mant_f16_e32 v5.l, v127.h       ; encoding: [0xff,0xb3,0x0a,0x7e]
+
+v_frexp_mant_f16 v127.l, 0.5
+// GFX11: v_frexp_mant_f16_e32 v127.l, 0.5        ; encoding: [0xf0,0xb2,0xfe,0x7e]
+
+v_frexp_mant_f16 v5.h, src_scc
+// GFX11: v_frexp_mant_f16_e32 v5.h, src_scc      ; encoding: [0xfd,0xb2,0x0a,0x7f]
+
+v_frexp_mant_f16 v127.h, 0xfe0b
+// GFX11: v_frexp_mant_f16_e32 v127.h, 0xfe0b     ; encoding: [0xff,0xb2,0xfe,0x7f,0x0b,0xfe,0x00,0x00]
 
 v_frexp_mant_f32 v5, v1
 // GFX11: v_frexp_mant_f32_e32 v5, v1             ; encoding: [0x01,0x81,0x0a,0x7e]
@@ -2684,50 +2741,65 @@ v_movrelsd_b32 v255, v255
 v_nop
 // GFX11: v_nop                                   ; encoding: [0x00,0x00,0x00,0x7e]
 
-v_not_b16 v5, v1
-// GFX11: v_not_b16_e32 v5, v1                    ; encoding: [0x01,0xd3,0x0a,0x7e]
+v_not_b16 v5.l, v1.l
+// GFX11: v_not_b16_e32 v5.l, v1.l                ; encoding: [0x01,0xd3,0x0a,0x7e]
+
+v_not_b16 v5.l, v127.l
+// GFX11: v_not_b16_e32 v5.l, v127.l              ; encoding: [0x7f,0xd3,0x0a,0x7e]
+
+v_not_b16 v5.l, s1
+// GFX11: v_not_b16_e32 v5.l, s1                  ; encoding: [0x01,0xd2,0x0a,0x7e]
 
-v_not_b16 v5, v127
-// GFX11: v_not_b16_e32 v5, v127                  ; encoding: [0x7f,0xd3,0x0a,0x7e]
+v_not_b16 v5.l, s105
+// GFX11: v_not_b16_e32 v5.l, s105                ; encoding: [0x69,0xd2,0x0a,0x7e]
 
-v_not_b16 v5, s1
-// GFX11: v_not_b16_e32 v5, s1                    ; encoding: [0x01,0xd2,0x0a,0x7e]
+v_not_b16 v5.l, vcc_lo
+// GFX11: v_not_b16_e32 v5.l, vcc_lo              ; encoding: [0x6a,0xd2,0x0a,0x7e]
 
-v_not_b16 v5, s105
-// GFX11: v_not_b16_e32 v5, s105                  ; encoding: [0x69,0xd2,0x0a,0x7e]
+v_not_b16 v5.l, vcc_hi
+// GFX11: v_not_b16_e32 v5.l, vcc_hi              ; encoding: [0x6b,0xd2,0x0a,0x7e]
 
-v_not_b16 v5, vcc_lo
-// GFX11: v_not_b16_e32 v5, vcc_lo                ; encoding: [0x6a,0xd2,0x0a,0x7e]
+v_not_b16 v5.l, ttmp15
+// GFX11: v_not_b16_e32 v5.l, ttmp15              ; encoding: [0x7b,0xd2,0x0a,0x7e]
 
-v_not_b16 v5, vcc_hi
-// GFX11: v_not_b16_e32 v5, vcc_hi                ; encoding: [0x6b,0xd2,0x0a,0x7e]
+v_not_b16 v5.l, m0
+// GFX11: v_not_b16_e32 v5.l, m0                  ; encoding: [0x7d,0xd2,0x0a,0x7e]
 
-v_not_b16 v5, ttmp15
-// GFX11: v_not_b16_e32 v5, ttmp15                ; encoding: [0x7b,0xd2,0x0a,0x7e]
+v_not_b16 v5.l, exec_lo
+// GFX11: v_not_b16_e32 v5.l, exec_lo             ; encoding: [0x7e,0xd2,0x0a,0x7e]
 
-v_not_b16 v5, m0
-// GFX11: v_not_b16_e32 v5, m0                    ; encoding: [0x7d,0xd2,0x0a,0x7e]
+v_not_b16 v5.l, exec_hi
+// GFX11: v_not_b16_e32 v5.l, exec_hi             ; encoding: [0x7f,0xd2,0x0a,0x7e]
 
-v_not_b16 v5, exec_lo
-// GFX11: v_not_b16_e32 v5, exec_lo               ; encoding: [0x7e,0xd2,0x0a,0x7e]
+v_not_b16 v5.l, null
+// GFX11: v_not_b16_e32 v5.l, null                ; encoding: [0x7c,0xd2,0x0a,0x7e]
 
-v_not_b16 v5, exec_hi
-// GFX11: v_not_b16_e32 v5, exec_hi               ; encoding: [0x7f,0xd2,0x0a,0x7e]
+v_not_b16 v5.l, -1
+// GFX11: v_not_b16_e32 v5.l, -1                  ; encoding: [0xc1,0xd2,0x0a,0x7e]
 
-v_not_b16 v5, null
-// GFX11: v_not_b16_e32 v5, null                  ; encoding: [0x7c,0xd2,0x0a,0x7e]
+v_not_b16 v5.l, 0.5
+// GFX11: v_not_b16_e32 v5.l, 0.5                 ; encoding: [0xf0,0xd2,0x0a,0x7e]
 
-v_not_b16 v5, -1
-// GFX11: v_not_b16_e32 v5, -1                    ; encoding: [0xc1,0xd2,0x0a,0x7e]
+v_not_b16 v5.l, src_scc
+// GFX11: v_not_b16_e32 v5.l, src_scc             ; encoding: [0xfd,0xd2,0x0a,0x7e]
 
-v_not_b16 v5, 0.5
-// GFX11: v_not_b16_e32 v5, 0.5                   ; encoding: [0xf0,0xd2,0x0a,0x7e]
+v_not_b16 v127.l, 0xfe0b
+// GFX11: v_not_b16_e32 v127.l, 0xfe0b            ; encoding: [0xff,0xd2,0xfe,0x7e,0x0b,0xfe,0x00,0x00]
 
-v_not_b16 v5, src_scc
-// GFX11: v_not_b16_e32 v5, src_scc               ; encoding: [0xfd,0xd2,0x0a,0x7e]
+v_not_b16 v5.l, v1.h
+// GFX11: v_not_b16_e32 v5.l, v1.h                ; encoding: [0x81,0xd3,0x0a,0x7e]
 
-v_not_b16 v127, 0xfe0b
-// GFX11: v_not_b16_e32 v127, 0xfe0b              ; encoding: [0xff,0xd2,0xfe,0x7e,0x0b,0xfe,0x00,0x00]
+v_not_b16 v5.l, v127.h
+// GFX11: v_not_b16_e32 v5.l, v127.h              ; encoding: [0xff,0xd3,0x0a,0x7e]
+
+v_not_b16 v127.l, 0.5
+// GFX11: v_not_b16_e32 v127.l, 0.5               ; encoding: [0xf0,0xd2,0xfe,0x7e]
+
+v_not_b16 v5.h, src_scc
+// GFX11: v_not_b16_e32 v5.h, src_scc             ; encoding: [0xfd,0xd2,0x0a,0x7f]
+
+v_not_b16 v127.h, 0xfe0b
+// GFX11: v_not_b16_e32 v127.h, 0xfe0b            ; encoding: [0xff,0xd2,0xfe,0x7f,0x0b,0xfe,0x00,0x00]
 
 v_not_b32 v5, v1
 // GFX11: v_not_b32_e32 v5, v1                    ; encoding: [0x01,0x6f,0x0a,0x7e]
@@ -2978,50 +3050,65 @@ v_readfirstlane_b32 ttmp15, v1
 v_readfirstlane_b32 null, v255
 // GFX11: v_readfirstlane_b32 null, v255          ; encoding: [0xff,0x05,0xf8,0x7e]
 
-v_rndne_f16 v5, v1
-// GFX11: v_rndne_f16_e32 v5, v1                  ; encoding: [0x01,0xbd,0x0a,0x7e]
+v_rndne_f16 v5.l, v1.l
+// GFX11: v_rndne_f16_e32 v5.l, v1.l              ; encoding: [0x01,0xbd,0x0a,0x7e]
+
+v_rndne_f16 v5.l, v127.l
+// GFX11: v_rndne_f16_e32 v5.l, v127.l            ; encoding: [0x7f,0xbd,0x0a,0x7e]
+
+v_rndne_f16 v5.l, s1
+// GFX11: v_rndne_f16_e32 v5.l, s1                ; encoding: [0x01,0xbc,0x0a,0x7e]
+
+v_rndne_f16 v5.l, s105
+// GFX11: v_rndne_f16_e32 v5.l, s105              ; encoding: [0x69,0xbc,0x0a,0x7e]
 
-v_rndne_f16 v5, v127
-// GFX11: v_rndne_f16_e32 v5, v127                ; encoding: [0x7f,0xbd,0x0a,0x7e]
+v_rndne_f16 v5.l, vcc_lo
+// GFX11: v_rndne_f16_e32 v5.l, vcc_lo            ; encoding: [0x6a,0xbc,0x0a,0x7e]
 
-v_rndne_f16 v5, s1
-// GFX11: v_rndne_f16_e32 v5, s1                  ; encoding: [0x01,0xbc,0x0a,0x7e]
+v_rndne_f16 v5.l, vcc_hi
+// GFX11: v_rndne_f16_e32 v5.l, vcc_hi            ; encoding: [0x6b,0xbc,0x0a,0x7e]
 
-v_rndne_f16 v5, s105
-// GFX11: v_rndne_f16_e32 v5, s105                ; encoding: [0x69,0xbc,0x0a,0x7e]
+v_rndne_f16 v5.l, ttmp15
+// GFX11: v_rndne_f16_e32 v5.l, ttmp15            ; encoding: [0x7b,0xbc,0x0a,0x7e]
 
-v_rndne_f16 v5, vcc_lo
-// GFX11: v_rndne_f16_e32 v5, vcc_lo              ; encoding: [0x6a,0xbc,0x0a,0x7e]
+v_rndne_f16 v5.l, m0
+// GFX11: v_rndne_f16_e32 v5.l, m0                ; encoding: [0x7d,0xbc,0x0a,0x7e]
 
-v_rndne_f16 v5, vcc_hi
-// GFX11: v_rndne_f16_e32 v5, vcc_hi              ; encoding: [0x6b,0xbc,0x0a,0x7e]
+v_rndne_f16 v5.l, exec_lo
+// GFX11: v_rndne_f16_e32 v5.l, exec_lo           ; encoding: [0x7e,0xbc,0x0a,0x7e]
 
-v_rndne_f16 v5, ttmp15
-// GFX11: v_rndne_f16_e32 v5, ttmp15              ; encoding: [0x7b,0xbc,0x0a,0x7e]
+v_rndne_f16 v5.l, exec_hi
+// GFX11: v_rndne_f16_e32 v5.l, exec_hi           ; encoding: [0x7f,0xbc,0x0a,0x7e]
 
-v_rndne_f16 v5, m0
-// GFX11: v_rndne_f16_e32 v5, m0                  ; encoding: [0x7d,0xbc,0x0a,0x7e]
+v_rndne_f16 v5.l, null
+// GFX11: v_rndne_f16_e32 v5.l, null              ; encoding: [0x7c,0xbc,0x0a,0x7e]
 
-v_rndne_f16 v5, exec_lo
-// GFX11: v_rndne_f16_e32 v5, exec_lo             ; encoding: [0x7e,0xbc,0x0a,0x7e]
+v_rndne_f16 v5.l, -1
+// GFX11: v_rndne_f16_e32 v5.l, -1                ; encoding: [0xc1,0xbc,0x0a,0x7e]
 
-v_rndne_f16 v5, exec_hi
-// GFX11: v_rndne_f16_e32 v5, exec_hi             ; encoding: [0x7f,0xbc,0x0a,0x7e]
+v_rndne_f16 v5.l, 0.5
+// GFX11: v_rndne_f16_e32 v5.l, 0.5               ; encoding: [0xf0,0xbc,0x0a,0x7e]
 
-v_rndne_f16 v5, null
-// GFX11: v_rndne_f16_e32 v5, null                ; encoding: [0x7c,0xbc,0x0a,0x7e]
+v_rndne_f16 v5.l, src_scc
+// GFX11: v_rndne_f16_e32 v5.l, src_scc           ; encoding: [0xfd,0xbc,0x0a,0x7e]
 
-v_rndne_f16 v5, -1
-// GFX11: v_rndne_f16_e32 v5, -1                  ; encoding: [0xc1,0xbc,0x0a,0x7e]
+v_rndne_f16 v127.l, 0xfe0b
+// GFX11: v_rndne_f16_e32 v127.l, 0xfe0b          ; encoding: [0xff,0xbc,0xfe,0x7e,0x0b,0xfe,0x00,0x00]
 
-v_rndne_f16 v5, 0.5
-// GFX11: v_rndne_f16_e32 v5, 0.5                 ; encoding: [0xf0,0xbc,0x0a,0x7e]
+v_rndne_f16 v5.l, v1.h
+// GFX11: v_rndne_f16_e32 v5.l, v1.h              ; encoding: [0x81,0xbd,0x0a,0x7e]
 
-v_rndne_f16 v5, src_scc
-// GFX11: v_rndne_f16_e32 v5, src_scc             ; encoding: [0xfd,0xbc,0x0a,0x7e]
+v_rndne_f16 v5.l, v127.h
+// GFX11: v_rndne_f16_e32 v5.l, v127.h            ; encoding: [0xff,0xbd,0x0a,0x7e]
 
-v_rndne_f16 v127, 0xfe0b
-// GFX11: v_rndne_f16_e32 v127, 0xfe0b            ; encoding: [0xff,0xbc,0xfe,0x7e,0x0b,0xfe,0x00,0x00]
+v_rndne_f16 v127.l, 0.5
+// GFX11: v_rndne_f16_e32 v127.l, 0.5             ; encoding: [0xf0,0xbc,0xfe,0x7e]
+
+v_rndne_f16 v5.h, src_scc
+// GFX11: v_rndne_f16_e32 v5.h, src_scc           ; encoding: [0xfd,0xbc,0x0a,0x7f]
+
+v_rndne_f16 v127.h, 0xfe0b
+// GFX11: v_rndne_f16_e32 v127.h, 0xfe0b          ; encoding: [0xff,0xbc,0xfe,0x7f,0x0b,0xfe,0x00,0x00]
 
 v_rndne_f32 v5, v1
 // GFX11: v_rndne_f32_e32 v5, v1                  ; encoding: [0x01,0x47,0x0a,0x7e]
@@ -3236,95 +3323,119 @@ v_rsq_f64 v[5:6], src_scc
 v_rsq_f64 v[254:255], 0xaf123456
 // GFX11: v_rsq_f64_e32 v[254:255], 0xaf123456    ; encoding: [0xff,0x62,0xfc,0x7f,0x56,0x34,0x12,0xaf]
 
-v_sat_pk_u8_i16 v5, v1
-// GFX11: v_sat_pk_u8_i16_e32 v5, v1              ; encoding: [0x01,0xc5,0x0a,0x7e]
+v_sat_pk_u8_i16 v5.l, v1
+// GFX11: v_sat_pk_u8_i16_e32 v5.l, v1            ; encoding: [0x01,0xc5,0x0a,0x7e]
+
+v_sat_pk_u8_i16 v5.l, v255
+// GFX11: v_sat_pk_u8_i16_e32 v5.l, v255          ; encoding: [0xff,0xc5,0x0a,0x7e]
+
+v_sat_pk_u8_i16 v5.l, s1
+// GFX11: v_sat_pk_u8_i16_e32 v5.l, s1            ; encoding: [0x01,0xc4,0x0a,0x7e]
+
+v_sat_pk_u8_i16 v5.l, s105
+// GFX11: v_sat_pk_u8_i16_e32 v5.l, s105          ; encoding: [0x69,0xc4,0x0a,0x7e]
 
-v_sat_pk_u8_i16 v5, v255
-// GFX11: v_sat_pk_u8_i16_e32 v5, v255            ; encoding: [0xff,0xc5,0x0a,0x7e]
+v_sat_pk_u8_i16 v5.l, vcc_lo
+// GFX11: v_sat_pk_u8_i16_e32 v5.l, vcc_lo        ; encoding: [0x6a,0xc4,0x0a,0x7e]
 
-v_sat_pk_u8_i16 v5, s1
-// GFX11: v_sat_pk_u8_i16_e32 v5, s1              ; encoding: [0x01,0xc4,0x0a,0x7e]
+v_sat_pk_u8_i16 v5.l, vcc_hi
+// GFX11: v_sat_pk_u8_i16_e32 v5.l, vcc_hi        ; encoding: [0x6b,0xc4,0x0a,0x7e]
 
-v_sat_pk_u8_i16 v5, s105
-// GFX11: v_sat_pk_u8_i16_e32 v5, s105            ; encoding: [0x69,0xc4,0x0a,0x7e]
+v_sat_pk_u8_i16 v5.l, ttmp15
+// GFX11: v_sat_pk_u8_i16_e32 v5.l, ttmp15        ; encoding: [0x7b,0xc4,0x0a,0x7e]
 
-v_sat_pk_u8_i16 v5, vcc_lo
-// GFX11: v_sat_pk_u8_i16_e32 v5, vcc_lo          ; encoding: [0x6a,0xc4,0x0a,0x7e]
+v_sat_pk_u8_i16 v5.l, m0
+// GFX11: v_sat_pk_u8_i16_e32 v5.l, m0            ; encoding: [0x7d,0xc4,0x0a,0x7e]
 
-v_sat_pk_u8_i16 v5, vcc_hi
-// GFX11: v_sat_pk_u8_i16_e32 v5, vcc_hi          ; encoding: [0x6b,0xc4,0x0a,0x7e]
+v_sat_pk_u8_i16 v5.l, exec_lo
+// GFX11: v_sat_pk_u8_i16_e32 v5.l, exec_lo       ; encoding: [0x7e,0xc4,0x0a,0x7e]
 
-v_sat_pk_u8_i16 v5, ttmp15
-// GFX11: v_sat_pk_u8_i16_e32 v5, ttmp15          ; encoding: [0x7b,0xc4,0x0a,0x7e]
+v_sat_pk_u8_i16 v5.l, exec_hi
+// GFX11: v_sat_pk_u8_i16_e32 v5.l, exec_hi       ; encoding: [0x7f,0xc4,0x0a,0x7e]
 
-v_sat_pk_u8_i16 v5, m0
-// GFX11: v_sat_pk_u8_i16_e32 v5, m0              ; encoding: [0x7d,0xc4,0x0a,0x7e]
+v_sat_pk_u8_i16 v5.l, null
+// GFX11: v_sat_pk_u8_i16_e32 v5.l, null          ; encoding: [0x7c,0xc4,0x0a,0x7e]
 
-v_sat_pk_u8_i16 v5, exec_lo
-// GFX11: v_sat_pk_u8_i16_e32 v5, exec_lo         ; encoding: [0x7e,0xc4,0x0a,0x7e]
+v_sat_pk_u8_i16 v5.l, -1
+// GFX11: v_sat_pk_u8_i16_e32 v5.l, -1            ; encoding: [0xc1,0xc4,0x0a,0x7e]
 
-v_sat_pk_u8_i16 v5, exec_hi
-// GFX11: v_sat_pk_u8_i16_e32 v5, exec_hi         ; encoding: [0x7f,0xc4,0x0a,0x7e]
+v_sat_pk_u8_i16 v5.l, 0.5
+// GFX11: v_sat_pk_u8_i16_e32 v5.l, 0.5           ; encoding: [0xf0,0xc4,0x0a,0x7e]
 
-v_sat_pk_u8_i16 v5, null
-// GFX11: v_sat_pk_u8_i16_e32 v5, null            ; encoding: [0x7c,0xc4,0x0a,0x7e]
+v_sat_pk_u8_i16 v5.l, src_scc
+// GFX11: v_sat_pk_u8_i16_e32 v5.l, src_scc       ; encoding: [0xfd,0xc4,0x0a,0x7e]
 
-v_sat_pk_u8_i16 v5, -1
-// GFX11: v_sat_pk_u8_i16_e32 v5, -1              ; encoding: [0xc1,0xc4,0x0a,0x7e]
+v_sat_pk_u8_i16 v127.l, 0xfe0b
+// GFX11: v_sat_pk_u8_i16_e32 v127.l, 0xfe0b      ; encoding: [0xff,0xc4,0xfe,0x7e,0x0b,0xfe,0x00,0x00]
 
-v_sat_pk_u8_i16 v5, 0.5
-// GFX11: v_sat_pk_u8_i16_e32 v5, 0.5             ; encoding: [0xf0,0xc4,0x0a,0x7e]
+v_sat_pk_u8_i16 v127.l, 0.5
+// GFX11: v_sat_pk_u8_i16_e32 v127.l, 0.5         ; encoding: [0xf0,0xc4,0xfe,0x7e]
 
-v_sat_pk_u8_i16 v5, src_scc
-// GFX11: v_sat_pk_u8_i16_e32 v5, src_scc         ; encoding: [0xfd,0xc4,0x0a,0x7e]
+v_sat_pk_u8_i16 v5.h, src_scc
+// GFX11: v_sat_pk_u8_i16_e32 v5.h, src_scc       ; encoding: [0xfd,0xc4,0x0a,0x7f]
 
-v_sat_pk_u8_i16 v127, 0xfe0b
-// GFX11: v_sat_pk_u8_i16_e32 v127, 0xfe0b        ; encoding: [0xff,0xc4,0xfe,0x7e,0x0b,0xfe,0x00,0x00]
+v_sat_pk_u8_i16 v127.h, 0xfe0b
+// GFX11: v_sat_pk_u8_i16_e32 v127.h, 0xfe0b      ; encoding: [0xff,0xc4,0xfe,0x7f,0x0b,0xfe,0x00,0x00]
 
-v_sin_f16 v5, v1
-// GFX11: v_sin_f16_e32 v5, v1                    ; encoding: [0x01,0xc1,0x0a,0x7e]
+v_sin_f16 v5.l, v1.l
+// GFX11: v_sin_f16_e32 v5.l, v1.l                ; encoding: [0x01,0xc1,0x0a,0x7e]
 
-v_sin_f16 v5, v127
-// GFX11: v_sin_f16_e32 v5, v127                  ; encoding: [0x7f,0xc1,0x0a,0x7e]
+v_sin_f16 v5.l, v127.l
+// GFX11: v_sin_f16_e32 v5.l, v127.l              ; encoding: [0x7f,0xc1,0x0a,0x7e]
 
-v_sin_f16 v5, s1
-// GFX11: v_sin_f16_e32 v5, s1                    ; encoding: [0x01,0xc0,0x0a,0x7e]
+v_sin_f16 v5.l, s1
+// GFX11: v_sin_f16_e32 v5.l, s1                  ; encoding: [0x01,0xc0,0x0a,0x7e]
 
-v_sin_f16 v5, s105
-// GFX11: v_sin_f16_e32 v5, s105                  ; encoding: [0x69,0xc0,0x0a,0x7e]
+v_sin_f16 v5.l, s105
+// GFX11: v_sin_f16_e32 v5.l, s105                ; encoding: [0x69,0xc0,0x0a,0x7e]
 
-v_sin_f16 v5, vcc_lo
-// GFX11: v_sin_f16_e32 v5, vcc_lo                ; encoding: [0x6a,0xc0,0x0a,0x7e]
+v_sin_f16 v5.l, vcc_lo
+// GFX11: v_sin_f16_e32 v5.l, vcc_lo              ; encoding: [0x6a,0xc0,0x0a,0x7e]
 
-v_sin_f16 v5, vcc_hi
-// GFX11: v_sin_f16_e32 v5, vcc_hi                ; encoding: [0x6b,0xc0,0x0a,0x7e]
+v_sin_f16 v5.l, vcc_hi
+// GFX11: v_sin_f16_e32 v5.l, vcc_hi              ; encoding: [0x6b,0xc0,0x0a,0x7e]
 
-v_sin_f16 v5, ttmp15
-// GFX11: v_sin_f16_e32 v5, ttmp15                ; encoding: [0x7b,0xc0,0x0a,0x7e]
+v_sin_f16 v5.l, ttmp15
+// GFX11: v_sin_f16_e32 v5.l, ttmp15              ; encoding: [0x7b,0xc0,0x0a,0x7e]
 
-v_sin_f16 v5, m0
-// GFX11: v_sin_f16_e32 v5, m0                    ; encoding: [0x7d,0xc0,0x0a,0x7e]
+v_sin_f16 v5.l, m0
+// GFX11: v_sin_f16_e32 v5.l, m0                  ; encoding: [0x7d,0xc0,0x0a,0x7e]
 
-v_sin_f16 v5, exec_lo
-// GFX11: v_sin_f16_e32 v5, exec_lo               ; encoding: [0x7e,0xc0,0x0a,0x7e]
+v_sin_f16 v5.l, exec_lo
+// GFX11: v_sin_f16_e32 v5.l, exec_lo             ; encoding: [0x7e,0xc0,0x0a,0x7e]
 
-v_sin_f16 v5, exec_hi
-// GFX11: v_sin_f16_e32 v5, exec_hi               ; encoding: [0x7f,0xc0,0x0a,0x7e]
+v_sin_f16 v5.l, exec_hi
+// GFX11: v_sin_f16_e32 v5.l, exec_hi             ; encoding: [0x7f,0xc0,0x0a,0x7e]
 
-v_sin_f16 v5, null
-// GFX11: v_sin_f16_e32 v5, null                  ; encoding: [0x7c,0xc0,0x0a,0x7e]
+v_sin_f16 v5.l, null
+// GFX11: v_sin_f16_e32 v5.l, null                ; encoding: [0x7c,0xc0,0x0a,0x7e]
 
-v_sin_f16 v5, -1
-// GFX11: v_sin_f16_e32 v5, -1                    ; encoding: [0xc1,0xc0,0x0a,0x7e]
+v_sin_f16 v5.l, -1
+// GFX11: v_sin_f16_e32 v5.l, -1                  ; encoding: [0xc1,0xc0,0x0a,0x7e]
 
-v_sin_f16 v5, 0.5
-// GFX11: v_sin_f16_e32 v5, 0.5                   ; encoding: [0xf0,0xc0,0x0a,0x7e]
+v_sin_f16 v5.l, 0.5
+// GFX11: v_sin_f16_e32 v5.l, 0.5                 ; encoding: [0xf0,0xc0,0x0a,0x7e]
 
-v_sin_f16 v5, src_scc
-// GFX11: v_sin_f16_e32 v5, src_scc               ; encoding: [0xfd,0xc0,0x0a,0x7e]
+v_sin_f16 v5.l, src_scc
+// GFX11: v_sin_f16_e32 v5.l, src_scc             ; encoding: [0xfd,0xc0,0x0a,0x7e]
 
-v_sin_f16 v127, 0xfe0b
-// GFX11: v_sin_f16_e32 v127, 0xfe0b              ; encoding: [0xff,0xc0,0xfe,0x7e,0x0b,0xfe,0x00,0x00]
+v_sin_f16 v127.l, 0xfe0b
+// GFX11: v_sin_f16_e32 v127.l, 0xfe0b            ; encoding: [0xff,0xc0,0xfe,0x7e,0x0b,0xfe,0x00,0x00]
+
+v_sin_f16 v5.l, v1.h
+// GFX11: v_sin_f16_e32 v5.l, v1.h                ; encoding: [0x81,0xc1,0x0a,0x7e]
+
+v_sin_f16 v5.l, v127.h
+// GFX11: v_sin_f16_e32 v5.l, v127.h              ; encoding: [0xff,0xc1,0x0a,0x7e]
+
+v_sin_f16 v127.l, 0.5
+// GFX11: v_sin_f16_e32 v127.l, 0.5               ; encoding: [0xf0,0xc0,0xfe,0x7e]
+
+v_sin_f16 v5.h, src_scc
+// GFX11: v_sin_f16_e32 v5.h, src_scc             ; encoding: [0xfd,0xc0,0x0a,0x7f]
+
+v_sin_f16 v127.h, 0xfe0b
+// GFX11: v_sin_f16_e32 v127.h, 0xfe0b            ; encoding: [0xff,0xc0,0xfe,0x7f,0x0b,0xfe,0x00,0x00]
 
 v_sin_f32 v5, v1
 // GFX11: v_sin_f32_e32 v5, v1                    ; encoding: [0x01,0x6b,0x0a,0x7e]
@@ -3524,50 +3635,65 @@ v_swaprel_b32 v5, v1
 v_swaprel_b32 v255, v255
 // GFX11: v_swaprel_b32 v255, v255                ; encoding: [0xff,0xd1,0xfe,0x7f]
 
-v_trunc_f16 v5, v1
-// GFX11: v_trunc_f16_e32 v5, v1                  ; encoding: [0x01,0xbb,0x0a,0x7e]
+v_trunc_f16 v5.l, v1.l
+// GFX11: v_trunc_f16_e32 v5.l, v1.l              ; encoding: [0x01,0xbb,0x0a,0x7e]
+
+v_trunc_f16 v5.l, v127.l
+// GFX11: v_trunc_f16_e32 v5.l, v127.l            ; encoding: [0x7f,0xbb,0x0a,0x7e]
+
+v_trunc_f16 v5.l, s1
+// GFX11: v_trunc_f16_e32 v5.l, s1                ; encoding: [0x01,0xba,0x0a,0x7e]
+
+v_trunc_f16 v5.l, s105
+// GFX11: v_trunc_f16_e32 v5.l, s105              ; encoding: [0x69,0xba,0x0a,0x7e]
+
+v_trunc_f16 v5.l, vcc_lo
+// GFX11: v_trunc_f16_e32 v5.l, vcc_lo            ; encoding: [0x6a,0xba,0x0a,0x7e]
+
+v_trunc_f16 v5.l, vcc_hi
+// GFX11: v_trunc_f16_e32 v5.l, vcc_hi            ; encoding: [0x6b,0xba,0x0a,0x7e]
 
-v_trunc_f16 v5, v127
-// GFX11: v_trunc_f16_e32 v5, v127                ; encoding: [0x7f,0xbb,0x0a,0x7e]
+v_trunc_f16 v5.l, ttmp15
+// GFX11: v_trunc_f16_e32 v5.l, ttmp15            ; encoding: [0x7b,0xba,0x0a,0x7e]
 
-v_trunc_f16 v5, s1
-// GFX11: v_trunc_f16_e32 v5, s1                  ; encoding: [0x01,0xba,0x0a,0x7e]
+v_trunc_f16 v5.l, m0
+// GFX11: v_trunc_f16_e32 v5.l, m0                ; encoding: [0x7d,0xba,0x0a,0x7e]
 
-v_trunc_f16 v5, s105
-// GFX11: v_trunc_f16_e32 v5, s105                ; encoding: [0x69,0xba,0x0a,0x7e]
+v_trunc_f16 v5.l, exec_lo
+// GFX11: v_trunc_f16_e32 v5.l, exec_lo           ; encoding: [0x7e,0xba,0x0a,0x7e]
 
-v_trunc_f16 v5, vcc_lo
-// GFX11: v_trunc_f16_e32 v5, vcc_lo              ; encoding: [0x6a,0xba,0x0a,0x7e]
+v_trunc_f16 v5.l, exec_hi
+// GFX11: v_trunc_f16_e32 v5.l, exec_hi           ; encoding: [0x7f,0xba,0x0a,0x7e]
 
-v_trunc_f16 v5, vcc_hi
-// GFX11: v_trunc_f16_e32 v5, vcc_hi              ; encoding: [0x6b,0xba,0x0a,0x7e]
+v_trunc_f16 v5.l, null
+// GFX11: v_trunc_f16_e32 v5.l, null              ; encoding: [0x7c,0xba,0x0a,0x7e]
 
-v_trunc_f16 v5, ttmp15
-// GFX11: v_trunc_f16_e32 v5, ttmp15              ; encoding: [0x7b,0xba,0x0a,0x7e]
+v_trunc_f16 v5.l, -1
+// GFX11: v_trunc_f16_e32 v5.l, -1                ; encoding: [0xc1,0xba,0x0a,0x7e]
 
-v_trunc_f16 v5, m0
-// GFX11: v_trunc_f16_e32 v5, m0                  ; encoding: [0x7d,0xba,0x0a,0x7e]
+v_trunc_f16 v5.l, 0.5
+// GFX11: v_trunc_f16_e32 v5.l, 0.5               ; encoding: [0xf0,0xba,0x0a,0x7e]
 
-v_trunc_f16 v5, exec_lo
-// GFX11: v_trunc_f16_e32 v5, exec_lo             ; encoding: [0x7e,0xba,0x0a,0x7e]
+v_trunc_f16 v5.l, src_scc
+// GFX11: v_trunc_f16_e32 v5.l, src_scc           ; encoding: [0xfd,0xba,0x0a,0x7e]
 
-v_trunc_f16 v5, exec_hi
-// GFX11: v_trunc_f16_e32 v5, exec_hi             ; encoding: [0x7f,0xba,0x0a,0x7e]
+v_trunc_f16 v127.l, 0xfe0b
+// GFX11: v_trunc_f16_e32 v127.l, 0xfe0b          ; encoding: [0xff,0xba,0xfe,0x7e,0x0b,0xfe,0x00,0x00]
 
-v_trunc_f16 v5, null
-// GFX11: v_trunc_f16_e32 v5, null                ; encoding: [0x7c,0xba,0x0a,0x7e]
+v_trunc_f16 v5.l, v1.h
+// GFX11: v_trunc_f16_e32 v5.l, v1.h              ; encoding: [0x81,0xbb,0x0a,0x7e]
 
-v_trunc_f16 v5, -1
-// GFX11: v_trunc_f16_e32 v5, -1                  ; encoding: [0xc1,0xba,0x0a,0x7e]
+v_trunc_f16 v5.l, v127.h
+// GFX11: v_trunc_f16_e32 v5.l, v127.h            ; encoding: [0xff,0xbb,0x0a,0x7e]
 
-v_trunc_f16 v5, 0.5
-// GFX11: v_trunc_f16_e32 v5, 0.5                 ; encoding: [0xf0,0xba,0x0a,0x7e]
+v_trunc_f16 v127.l, 0.5
+// GFX11: v_trunc_f16_e32 v127.l, 0.5             ; encoding: [0xf0,0xba,0xfe,0x7e]
 
-v_trunc_f16 v5, src_scc
-// GFX11: v_trunc_f16_e32 v5, src_scc             ; encoding: [0xfd,0xba,0x0a,0x7e]
+v_trunc_f16 v5.h, src_scc
+// GFX11: v_trunc_f16_e32 v5.h, src_scc           ; encoding: [0xfd,0xba,0x0a,0x7f]
 
-v_trunc_f16 v127, 0xfe0b
-// GFX11: v_trunc_f16_e32 v127, 0xfe0b            ; encoding: [0xff,0xba,0xfe,0x7e,0x0b,0xfe,0x00,0x00]
+v_trunc_f16 v127.h, 0xfe0b
+// GFX11: v_trunc_f16_e32 v127.h, 0xfe0b          ; encoding: [0xff,0xba,0xfe,0x7f,0x0b,0xfe,0x00,0x00]
 
 v_trunc_f32 v5, v1
 // GFX11: v_trunc_f32_e32 v5, v1                  ; encoding: [0x01,0x43,0x0a,0x7e]
diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop1_dpp16.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop1_dpp16.s
index 98e4b29..2bdb9ec 100644
--- a/llvm/test/MC/AMDGPU/gfx11_asm_vop1_dpp16.s
+++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop1_dpp16.s
@@ -212,47 +212,56 @@ v_clz_i32_u32 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
 v_clz_i32_u32 v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
 // GFX11: v_clz_i32_u32_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0x72,0xfe,0x7f,0xff,0x6f,0x05,0x30]
 
-v_cos_f16 v5, v1 quad_perm:[3,2,1,0]
-// GFX11: v_cos_f16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+v_cos_f16 v5.l, v1.l quad_perm:[3,2,1,0]
+// GFX11: v_cos_f16_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x1b,0x00,0xff]
 
-v_cos_f16 v5, v1 quad_perm:[0,1,2,3]
-// GFX11: v_cos_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+v_cos_f16 v5.l, v1.l quad_perm:[0,1,2,3]
+// GFX11: v_cos_f16_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0xe4,0x00,0xff]
 
-v_cos_f16 v5, v1 row_mirror
-// GFX11: v_cos_f16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x40,0x01,0xff]
+v_cos_f16 v5.l, v1.l row_mirror
+// GFX11: v_cos_f16_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x40,0x01,0xff]
 
-v_cos_f16 v5, v1 row_half_mirror
-// GFX11: v_cos_f16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x41,0x01,0xff]
+v_cos_f16 v5.l, v1.l row_half_mirror
+// GFX11: v_cos_f16_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x41,0x01,0xff]
 
-v_cos_f16 v5, v1 row_shl:1
-// GFX11: v_cos_f16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x01,0x01,0xff]
+v_cos_f16 v5.l, v1.l row_shl:1
+// GFX11: v_cos_f16_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x01,0x01,0xff]
 
-v_cos_f16 v5, v1 row_shl:15
-// GFX11: v_cos_f16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+v_cos_f16 v5.l, v1.l row_shl:15
+// GFX11: v_cos_f16_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x0f,0x01,0xff]
 
-v_cos_f16 v5, v1 row_shr:1
-// GFX11: v_cos_f16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x11,0x01,0xff]
+v_cos_f16 v5.l, v1.l row_shr:1
+// GFX11: v_cos_f16_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x11,0x01,0xff]
 
-v_cos_f16 v5, v1 row_shr:15
-// GFX11: v_cos_f16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+v_cos_f16 v5.l, v1.l row_shr:15
+// GFX11: v_cos_f16_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x1f,0x01,0xff]
 
-v_cos_f16 v5, v1 row_ror:1
-// GFX11: v_cos_f16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x21,0x01,0xff]
+v_cos_f16 v5.l, v1.l row_ror:1
+// GFX11: v_cos_f16_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x21,0x01,0xff]
 
-v_cos_f16 v5, v1 row_ror:15
-// GFX11: v_cos_f16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+v_cos_f16 v5.l, v1.l row_ror:15
+// GFX11: v_cos_f16_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x2f,0x01,0xff]
 
-v_cos_f16 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
-// GFX11: v_cos_f16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x50,0x01,0xff]
+v_cos_f16 v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf
+// GFX11: v_cos_f16_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x50,0x01,0xff]
 
-v_cos_f16 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1
-// GFX11: v_cos_f16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+v_cos_f16 v5.l, v1.l row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX11: v_cos_f16_dpp v5.l, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x5f,0x01,0x01]
 
-v_cos_f16 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
-// GFX11: v_cos_f16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x60,0x09,0x13]
+v_cos_f16 v5.l, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1
+// GFX11: v_cos_f16_dpp v5.l, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x60,0x09,0x13]
 
-v_cos_f16 v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
-// GFX11: v_cos_f16_dpp v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xc2,0xfe,0x7e,0x7f,0x6f,0x35,0x30]
+v_cos_f16 v127.l, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1
+// GFX11: v_cos_f16_dpp v127.l, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xc2,0xfe,0x7e,0x7f,0x6f,0x35,0x30]
+
+v_cos_f16 v127.l, v127.l row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX11: v_cos_f16_dpp v127.l, v127.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xc2,0xfe,0x7e,0x7f,0x5f,0x01,0x01]
+
+v_cos_f16 v5.h, v1.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX11: v_cos_f16_dpp v5.h, v1.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xc2,0x0a,0x7f,0x81,0x60,0x09,0x13]
+
+v_cos_f16 v127.h, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX11: v_cos_f16_dpp v127.h, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xc2,0xfe,0x7f,0xff,0x6f,0x35,0x30]
 
 v_cos_f32 v5, v1 quad_perm:[3,2,1,0]
 // GFX11: v_cos_f32_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x6c,0x0a,0x7e,0x01,0x1b,0x00,0xff]
@@ -926,47 +935,56 @@ v_cvt_i32_f32 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
 v_cvt_i32_f32 v255, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
 // GFX11: v_cvt_i32_f32_dpp v255, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0x10,0xfe,0x7f,0xff,0x6f,0x35,0x30]
 
-v_cvt_i32_i16 v5, v1 quad_perm:[3,2,1,0]
-// GFX11: v_cvt_i32_i16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+v_cvt_i32_i16 v5, v1.l quad_perm:[3,2,1,0]
+// GFX11: v_cvt_i32_i16_dpp v5, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+
+v_cvt_i32_i16 v5, v1.l quad_perm:[0,1,2,3]
+// GFX11: v_cvt_i32_i16_dpp v5, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0xe4,0x00,0xff]
 
-v_cvt_i32_i16 v5, v1 quad_perm:[0,1,2,3]
-// GFX11: v_cvt_i32_i16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+v_cvt_i32_i16 v5, v1.l row_mirror
+// GFX11: v_cvt_i32_i16_dpp v5, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x40,0x01,0xff]
 
-v_cvt_i32_i16 v5, v1 row_mirror
-// GFX11: v_cvt_i32_i16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x40,0x01,0xff]
+v_cvt_i32_i16 v5, v1.l row_half_mirror
+// GFX11: v_cvt_i32_i16_dpp v5, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x41,0x01,0xff]
 
-v_cvt_i32_i16 v5, v1 row_half_mirror
-// GFX11: v_cvt_i32_i16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x41,0x01,0xff]
+v_cvt_i32_i16 v5, v1.l row_shl:1
+// GFX11: v_cvt_i32_i16_dpp v5, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x01,0x01,0xff]
 
-v_cvt_i32_i16 v5, v1 row_shl:1
-// GFX11: v_cvt_i32_i16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x01,0x01,0xff]
+v_cvt_i32_i16 v5, v1.l row_shl:15
+// GFX11: v_cvt_i32_i16_dpp v5, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x0f,0x01,0xff]
 
-v_cvt_i32_i16 v5, v1 row_shl:15
-// GFX11: v_cvt_i32_i16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+v_cvt_i32_i16 v5, v1.l row_shr:1
+// GFX11: v_cvt_i32_i16_dpp v5, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x11,0x01,0xff]
 
-v_cvt_i32_i16 v5, v1 row_shr:1
-// GFX11: v_cvt_i32_i16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x11,0x01,0xff]
+v_cvt_i32_i16 v5, v1.l row_shr:15
+// GFX11: v_cvt_i32_i16_dpp v5, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x1f,0x01,0xff]
 
-v_cvt_i32_i16 v5, v1 row_shr:15
-// GFX11: v_cvt_i32_i16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+v_cvt_i32_i16 v5, v1.l row_ror:1
+// GFX11: v_cvt_i32_i16_dpp v5, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x21,0x01,0xff]
 
-v_cvt_i32_i16 v5, v1 row_ror:1
-// GFX11: v_cvt_i32_i16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x21,0x01,0xff]
+v_cvt_i32_i16 v5, v1.l row_ror:15
+// GFX11: v_cvt_i32_i16_dpp v5, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x2f,0x01,0xff]
 
-v_cvt_i32_i16 v5, v1 row_ror:15
-// GFX11: v_cvt_i32_i16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+v_cvt_i32_i16 v5, v1.l row_share:0 row_mask:0xf bank_mask:0xf
+// GFX11: v_cvt_i32_i16_dpp v5, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x50,0x01,0xff]
 
-v_cvt_i32_i16 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
-// GFX11: v_cvt_i32_i16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x50,0x01,0xff]
+v_cvt_i32_i16 v5, v1.l row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX11: v_cvt_i32_i16_dpp v5, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x5f,0x01,0x01]
 
-v_cvt_i32_i16 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1
-// GFX11: v_cvt_i32_i16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+v_cvt_i32_i16 v5, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1
+// GFX11: v_cvt_i32_i16_dpp v5, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x60,0x09,0x13]
 
-v_cvt_i32_i16 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
-// GFX11: v_cvt_i32_i16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x60,0x09,0x13]
+v_cvt_i32_i16 v255, v127.l row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1
+// GFX11: v_cvt_i32_i16_dpp v255, v127.l row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xd4,0xfe,0x7f,0x7f,0x6f,0x05,0x30]
 
-v_cvt_i32_i16 v255, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
-// GFX11: v_cvt_i32_i16_dpp v255, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xd4,0xfe,0x7f,0x7f,0x6f,0x05,0x30]
+v_cvt_i32_i16 v5, v127.l row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX11: v_cvt_i32_i16_dpp v5, v127.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xd4,0x0a,0x7e,0x7f,0x5f,0x01,0x01]
+
+v_cvt_i32_i16 v5, v1.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX11: v_cvt_i32_i16_dpp v5, v1.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xd4,0x0a,0x7e,0x81,0x60,0x09,0x13]
+
+v_cvt_i32_i16 v255, v127.h row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX11: v_cvt_i32_i16_dpp v255, v127.h row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xd4,0xfe,0x7f,0xff,0x6f,0x05,0x30]
 
 v_cvt_nearest_i32_f32 v5, v1 quad_perm:[3,2,1,0]
 // GFX11: v_cvt_nearest_i32_f32_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x18,0x0a,0x7e,0x01,0x1b,0x00,0xff]
@@ -1262,47 +1280,56 @@ v_cvt_u32_f32 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
 v_cvt_u32_f32 v255, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
 // GFX11: v_cvt_u32_f32_dpp v255, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0x0e,0xfe,0x7f,0xff,0x6f,0x35,0x30]
 
-v_cvt_u32_u16 v5, v1 quad_perm:[3,2,1,0]
-// GFX11: v_cvt_u32_u16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+v_cvt_u32_u16 v5, v1.l quad_perm:[3,2,1,0]
+// GFX11: v_cvt_u32_u16_dpp v5, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+
+v_cvt_u32_u16 v5, v1.l quad_perm:[0,1,2,3]
+// GFX11: v_cvt_u32_u16_dpp v5, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+
+v_cvt_u32_u16 v5, v1.l row_mirror
+// GFX11: v_cvt_u32_u16_dpp v5, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x40,0x01,0xff]
 
-v_cvt_u32_u16 v5, v1 quad_perm:[0,1,2,3]
-// GFX11: v_cvt_u32_u16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+v_cvt_u32_u16 v5, v1.l row_half_mirror
+// GFX11: v_cvt_u32_u16_dpp v5, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x41,0x01,0xff]
 
-v_cvt_u32_u16 v5, v1 row_mirror
-// GFX11: v_cvt_u32_u16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x40,0x01,0xff]
+v_cvt_u32_u16 v5, v1.l row_shl:1
+// GFX11: v_cvt_u32_u16_dpp v5, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x01,0x01,0xff]
 
-v_cvt_u32_u16 v5, v1 row_half_mirror
-// GFX11: v_cvt_u32_u16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x41,0x01,0xff]
+v_cvt_u32_u16 v5, v1.l row_shl:15
+// GFX11: v_cvt_u32_u16_dpp v5, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x0f,0x01,0xff]
 
-v_cvt_u32_u16 v5, v1 row_shl:1
-// GFX11: v_cvt_u32_u16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x01,0x01,0xff]
+v_cvt_u32_u16 v5, v1.l row_shr:1
+// GFX11: v_cvt_u32_u16_dpp v5, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x11,0x01,0xff]
 
-v_cvt_u32_u16 v5, v1 row_shl:15
-// GFX11: v_cvt_u32_u16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+v_cvt_u32_u16 v5, v1.l row_shr:15
+// GFX11: v_cvt_u32_u16_dpp v5, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x1f,0x01,0xff]
 
-v_cvt_u32_u16 v5, v1 row_shr:1
-// GFX11: v_cvt_u32_u16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x11,0x01,0xff]
+v_cvt_u32_u16 v5, v1.l row_ror:1
+// GFX11: v_cvt_u32_u16_dpp v5, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x21,0x01,0xff]
 
-v_cvt_u32_u16 v5, v1 row_shr:15
-// GFX11: v_cvt_u32_u16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+v_cvt_u32_u16 v5, v1.l row_ror:15
+// GFX11: v_cvt_u32_u16_dpp v5, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x2f,0x01,0xff]
 
-v_cvt_u32_u16 v5, v1 row_ror:1
-// GFX11: v_cvt_u32_u16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x21,0x01,0xff]
+v_cvt_u32_u16 v5, v1.l row_share:0 row_mask:0xf bank_mask:0xf
+// GFX11: v_cvt_u32_u16_dpp v5, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x50,0x01,0xff]
 
-v_cvt_u32_u16 v5, v1 row_ror:15
-// GFX11: v_cvt_u32_u16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+v_cvt_u32_u16 v5, v1.l row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX11: v_cvt_u32_u16_dpp v5, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x5f,0x01,0x01]
 
-v_cvt_u32_u16 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
-// GFX11: v_cvt_u32_u16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x50,0x01,0xff]
+v_cvt_u32_u16 v5, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1
+// GFX11: v_cvt_u32_u16_dpp v5, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x60,0x09,0x13]
 
-v_cvt_u32_u16 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1
-// GFX11: v_cvt_u32_u16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+v_cvt_u32_u16 v255, v127.l row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1
+// GFX11: v_cvt_u32_u16_dpp v255, v127.l row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xd6,0xfe,0x7f,0x7f,0x6f,0x05,0x30]
 
-v_cvt_u32_u16 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
-// GFX11: v_cvt_u32_u16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x60,0x09,0x13]
+v_cvt_u32_u16 v5, v127.l row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX11: v_cvt_u32_u16_dpp v5, v127.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xd6,0x0a,0x7e,0x7f,0x5f,0x01,0x01]
 
-v_cvt_u32_u16 v255, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
-// GFX11: v_cvt_u32_u16_dpp v255, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xd6,0xfe,0x7f,0x7f,0x6f,0x05,0x30]
+v_cvt_u32_u16 v5, v1.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX11: v_cvt_u32_u16_dpp v5, v1.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xd6,0x0a,0x7e,0x81,0x60,0x09,0x13]
+
+v_cvt_u32_u16 v255, v127.h row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX11: v_cvt_u32_u16_dpp v255, v127.h row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xd6,0xfe,0x7f,0xff,0x6f,0x05,0x30]
 
 v_exp_f16 v5.l, v1.l quad_perm:[3,2,1,0]
 // GFX11: v_exp_f16_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb0,0x0a,0x7e,0x01,0x1b,0x00,0xff]
@@ -1598,47 +1625,56 @@ v_floor_f32 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
 v_floor_f32 v255, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
 // GFX11: v_floor_f32_dpp v255, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0x48,0xfe,0x7f,0xff,0x6f,0x35,0x30]
 
-v_fract_f16 v5, v1 quad_perm:[3,2,1,0]
-// GFX11: v_fract_f16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+v_fract_f16 v5.l, v1.l quad_perm:[3,2,1,0]
+// GFX11: v_fract_f16_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+
+v_fract_f16 v5.l, v1.l quad_perm:[0,1,2,3]
+// GFX11: v_fract_f16_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+
+v_fract_f16 v5.l, v1.l row_mirror
+// GFX11: v_fract_f16_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x40,0x01,0xff]
+
+v_fract_f16 v5.l, v1.l row_half_mirror
+// GFX11: v_fract_f16_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x41,0x01,0xff]
 
-v_fract_f16 v5, v1 quad_perm:[0,1,2,3]
-// GFX11: v_fract_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+v_fract_f16 v5.l, v1.l row_shl:1
+// GFX11: v_fract_f16_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x01,0x01,0xff]
 
-v_fract_f16 v5, v1 row_mirror
-// GFX11: v_fract_f16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x40,0x01,0xff]
+v_fract_f16 v5.l, v1.l row_shl:15
+// GFX11: v_fract_f16_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x0f,0x01,0xff]
 
-v_fract_f16 v5, v1 row_half_mirror
-// GFX11: v_fract_f16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x41,0x01,0xff]
+v_fract_f16 v5.l, v1.l row_shr:1
+// GFX11: v_fract_f16_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x11,0x01,0xff]
 
-v_fract_f16 v5, v1 row_shl:1
-// GFX11: v_fract_f16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x01,0x01,0xff]
+v_fract_f16 v5.l, v1.l row_shr:15
+// GFX11: v_fract_f16_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x1f,0x01,0xff]
 
-v_fract_f16 v5, v1 row_shl:15
-// GFX11: v_fract_f16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+v_fract_f16 v5.l, v1.l row_ror:1
+// GFX11: v_fract_f16_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x21,0x01,0xff]
 
-v_fract_f16 v5, v1 row_shr:1
-// GFX11: v_fract_f16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x11,0x01,0xff]
+v_fract_f16 v5.l, v1.l row_ror:15
+// GFX11: v_fract_f16_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x2f,0x01,0xff]
 
-v_fract_f16 v5, v1 row_shr:15
-// GFX11: v_fract_f16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+v_fract_f16 v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf
+// GFX11: v_fract_f16_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x50,0x01,0xff]
 
-v_fract_f16 v5, v1 row_ror:1
-// GFX11: v_fract_f16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x21,0x01,0xff]
+v_fract_f16 v5.l, v1.l row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX11: v_fract_f16_dpp v5.l, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x5f,0x01,0x01]
 
-v_fract_f16 v5, v1 row_ror:15
-// GFX11: v_fract_f16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+v_fract_f16 v5.l, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1
+// GFX11: v_fract_f16_dpp v5.l, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x60,0x09,0x13]
 
-v_fract_f16 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
-// GFX11: v_fract_f16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x50,0x01,0xff]
+v_fract_f16 v127.l, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1
+// GFX11: v_fract_f16_dpp v127.l, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xbe,0xfe,0x7e,0x7f,0x6f,0x35,0x30]
 
-v_fract_f16 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1
-// GFX11: v_fract_f16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+v_fract_f16 v127.l, v127.l row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX11: v_fract_f16_dpp v127.l, v127.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xbe,0xfe,0x7e,0x7f,0x5f,0x01,0x01]
 
-v_fract_f16 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
-// GFX11: v_fract_f16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x60,0x09,0x13]
+v_fract_f16 v5.h, v1.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX11: v_fract_f16_dpp v5.h, v1.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xbe,0x0a,0x7f,0x81,0x60,0x09,0x13]
 
-v_fract_f16 v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
-// GFX11: v_fract_f16_dpp v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xbe,0xfe,0x7e,0x7f,0x6f,0x35,0x30]
+v_fract_f16 v127.h, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX11: v_fract_f16_dpp v127.h, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xbe,0xfe,0x7f,0xff,0x6f,0x35,0x30]
 
 v_fract_f32 v5, v1 quad_perm:[3,2,1,0]
 // GFX11: v_fract_f32_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x40,0x0a,0x7e,0x01,0x1b,0x00,0xff]
@@ -1766,47 +1802,56 @@ v_frexp_exp_i32_f32 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 f
 v_frexp_exp_i32_f32 v255, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
 // GFX11: v_frexp_exp_i32_f32_dpp v255, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0x7e,0xfe,0x7f,0xff,0x6f,0x35,0x30]
 
-v_frexp_mant_f16 v5, v1 quad_perm:[3,2,1,0]
-// GFX11: v_frexp_mant_f16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+v_frexp_mant_f16 v5.l, v1.l quad_perm:[3,2,1,0]
+// GFX11: v_frexp_mant_f16_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+
+v_frexp_mant_f16 v5.l, v1.l quad_perm:[0,1,2,3]
+// GFX11: v_frexp_mant_f16_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+
+v_frexp_mant_f16 v5.l, v1.l row_mirror
+// GFX11: v_frexp_mant_f16_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x40,0x01,0xff]
+
+v_frexp_mant_f16 v5.l, v1.l row_half_mirror
+// GFX11: v_frexp_mant_f16_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x41,0x01,0xff]
 
-v_frexp_mant_f16 v5, v1 quad_perm:[0,1,2,3]
-// GFX11: v_frexp_mant_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+v_frexp_mant_f16 v5.l, v1.l row_shl:1
+// GFX11: v_frexp_mant_f16_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x01,0x01,0xff]
 
-v_frexp_mant_f16 v5, v1 row_mirror
-// GFX11: v_frexp_mant_f16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x40,0x01,0xff]
+v_frexp_mant_f16 v5.l, v1.l row_shl:15
+// GFX11: v_frexp_mant_f16_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x0f,0x01,0xff]
 
-v_frexp_mant_f16 v5, v1 row_half_mirror
-// GFX11: v_frexp_mant_f16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x41,0x01,0xff]
+v_frexp_mant_f16 v5.l, v1.l row_shr:1
+// GFX11: v_frexp_mant_f16_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x11,0x01,0xff]
 
-v_frexp_mant_f16 v5, v1 row_shl:1
-// GFX11: v_frexp_mant_f16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x01,0x01,0xff]
+v_frexp_mant_f16 v5.l, v1.l row_shr:15
+// GFX11: v_frexp_mant_f16_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x1f,0x01,0xff]
 
-v_frexp_mant_f16 v5, v1 row_shl:15
-// GFX11: v_frexp_mant_f16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+v_frexp_mant_f16 v5.l, v1.l row_ror:1
+// GFX11: v_frexp_mant_f16_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x21,0x01,0xff]
 
-v_frexp_mant_f16 v5, v1 row_shr:1
-// GFX11: v_frexp_mant_f16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x11,0x01,0xff]
+v_frexp_mant_f16 v5.l, v1.l row_ror:15
+// GFX11: v_frexp_mant_f16_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x2f,0x01,0xff]
 
-v_frexp_mant_f16 v5, v1 row_shr:15
-// GFX11: v_frexp_mant_f16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+v_frexp_mant_f16 v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf
+// GFX11: v_frexp_mant_f16_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x50,0x01,0xff]
 
-v_frexp_mant_f16 v5, v1 row_ror:1
-// GFX11: v_frexp_mant_f16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x21,0x01,0xff]
+v_frexp_mant_f16 v5.l, v1.l row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX11: v_frexp_mant_f16_dpp v5.l, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x5f,0x01,0x01]
 
-v_frexp_mant_f16 v5, v1 row_ror:15
-// GFX11: v_frexp_mant_f16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+v_frexp_mant_f16 v5.l, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1
+// GFX11: v_frexp_mant_f16_dpp v5.l, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x60,0x09,0x13]
 
-v_frexp_mant_f16 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
-// GFX11: v_frexp_mant_f16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x50,0x01,0xff]
+v_frexp_mant_f16 v127.l, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1
+// GFX11: v_frexp_mant_f16_dpp v127.l, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xb2,0xfe,0x7e,0x7f,0x6f,0x35,0x30]
 
-v_frexp_mant_f16 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1
-// GFX11: v_frexp_mant_f16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+v_frexp_mant_f16 v127.l, v127.l row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX11: v_frexp_mant_f16_dpp v127.l, v127.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xb2,0xfe,0x7e,0x7f,0x5f,0x01,0x01]
 
-v_frexp_mant_f16 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
-// GFX11: v_frexp_mant_f16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x60,0x09,0x13]
+v_frexp_mant_f16 v5.h, v1.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX11: v_frexp_mant_f16_dpp v5.h, v1.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xb2,0x0a,0x7f,0x81,0x60,0x09,0x13]
 
-v_frexp_mant_f16 v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
-// GFX11: v_frexp_mant_f16_dpp v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xb2,0xfe,0x7e,0x7f,0x6f,0x35,0x30]
+v_frexp_mant_f16 v127.h, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX11: v_frexp_mant_f16_dpp v127.h, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xb2,0xfe,0x7f,0xff,0x6f,0x35,0x30]
 
 v_frexp_mant_f32 v5, v1 quad_perm:[3,2,1,0]
 // GFX11: v_frexp_mant_f32_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x80,0x0a,0x7e,0x01,0x1b,0x00,0xff]
@@ -2144,47 +2189,56 @@ v_movrelsd_b32 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
 v_movrelsd_b32 v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
 // GFX11: v_movrelsd_b32_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0x88,0xfe,0x7f,0xff,0x6f,0x05,0x30]
 
-v_not_b16 v5, v1 quad_perm:[3,2,1,0]
-// GFX11: v_not_b16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+v_not_b16 v5.l, v1.l quad_perm:[3,2,1,0]
+// GFX11: v_not_b16_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x1b,0x00,0xff]
 
-v_not_b16 v5, v1 quad_perm:[0,1,2,3]
-// GFX11: v_not_b16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+v_not_b16 v5.l, v1.l quad_perm:[0,1,2,3]
+// GFX11: v_not_b16_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0xe4,0x00,0xff]
 
-v_not_b16 v5, v1 row_mirror
-// GFX11: v_not_b16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x40,0x01,0xff]
+v_not_b16 v5.l, v1.l row_mirror
+// GFX11: v_not_b16_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x40,0x01,0xff]
 
-v_not_b16 v5, v1 row_half_mirror
-// GFX11: v_not_b16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x41,0x01,0xff]
+v_not_b16 v5.l, v1.l row_half_mirror
+// GFX11: v_not_b16_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x41,0x01,0xff]
 
-v_not_b16 v5, v1 row_shl:1
-// GFX11: v_not_b16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x01,0x01,0xff]
+v_not_b16 v5.l, v1.l row_shl:1
+// GFX11: v_not_b16_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x01,0x01,0xff]
 
-v_not_b16 v5, v1 row_shl:15
-// GFX11: v_not_b16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+v_not_b16 v5.l, v1.l row_shl:15
+// GFX11: v_not_b16_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x0f,0x01,0xff]
 
-v_not_b16 v5, v1 row_shr:1
-// GFX11: v_not_b16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x11,0x01,0xff]
+v_not_b16 v5.l, v1.l row_shr:1
+// GFX11: v_not_b16_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x11,0x01,0xff]
 
-v_not_b16 v5, v1 row_shr:15
-// GFX11: v_not_b16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+v_not_b16 v5.l, v1.l row_shr:15
+// GFX11: v_not_b16_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x1f,0x01,0xff]
 
-v_not_b16 v5, v1 row_ror:1
-// GFX11: v_not_b16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x21,0x01,0xff]
+v_not_b16 v5.l, v1.l row_ror:1
+// GFX11: v_not_b16_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x21,0x01,0xff]
 
-v_not_b16 v5, v1 row_ror:15
-// GFX11: v_not_b16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+v_not_b16 v5.l, v1.l row_ror:15
+// GFX11: v_not_b16_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x2f,0x01,0xff]
 
-v_not_b16 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
-// GFX11: v_not_b16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x50,0x01,0xff]
+v_not_b16 v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf
+// GFX11: v_not_b16_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x50,0x01,0xff]
 
-v_not_b16 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1
-// GFX11: v_not_b16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+v_not_b16 v5.l, v1.l row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX11: v_not_b16_dpp v5.l, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x5f,0x01,0x01]
 
-v_not_b16 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
-// GFX11: v_not_b16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x60,0x09,0x13]
+v_not_b16 v5.l, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1
+// GFX11: v_not_b16_dpp v5.l, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x60,0x09,0x13]
 
-v_not_b16 v127, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
-// GFX11: v_not_b16_dpp v127, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xd2,0xfe,0x7e,0x7f,0x6f,0x05,0x30]
+v_not_b16 v127.l, v127.l row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1
+// GFX11: v_not_b16_dpp v127.l, v127.l row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xd2,0xfe,0x7e,0x7f,0x6f,0x05,0x30]
+
+v_not_b16 v127.l, v127.l row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX11: v_not_b16_dpp v127.l, v127.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xd2,0xfe,0x7e,0x7f,0x5f,0x01,0x01]
+
+v_not_b16 v5.h, v1.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX11: v_not_b16_dpp v5.h, v1.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xd2,0x0a,0x7f,0x81,0x60,0x09,0x13]
+
+v_not_b16 v127.h, v127.h row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX11: v_not_b16_dpp v127.h, v127.h row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xd2,0xfe,0x7f,0xff,0x6f,0x05,0x30]
 
 v_not_b32 v5, v1 quad_perm:[3,2,1,0]
 // GFX11: v_not_b32_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x6e,0x0a,0x7e,0x01,0x1b,0x00,0xff]
@@ -2354,47 +2408,56 @@ v_rcp_iflag_f32 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
 v_rcp_iflag_f32 v255, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
 // GFX11: v_rcp_iflag_f32_dpp v255, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0x56,0xfe,0x7f,0xff,0x6f,0x35,0x30]
 
-v_rndne_f16 v5, v1 quad_perm:[3,2,1,0]
-// GFX11: v_rndne_f16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+v_rndne_f16 v5.l, v1.l quad_perm:[3,2,1,0]
+// GFX11: v_rndne_f16_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+
+v_rndne_f16 v5.l, v1.l quad_perm:[0,1,2,3]
+// GFX11: v_rndne_f16_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0xe4,0x00,0xff]
 
-v_rndne_f16 v5, v1 quad_perm:[0,1,2,3]
-// GFX11: v_rndne_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+v_rndne_f16 v5.l, v1.l row_mirror
+// GFX11: v_rndne_f16_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x40,0x01,0xff]
 
-v_rndne_f16 v5, v1 row_mirror
-// GFX11: v_rndne_f16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x40,0x01,0xff]
+v_rndne_f16 v5.l, v1.l row_half_mirror
+// GFX11: v_rndne_f16_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x41,0x01,0xff]
 
-v_rndne_f16 v5, v1 row_half_mirror
-// GFX11: v_rndne_f16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x41,0x01,0xff]
+v_rndne_f16 v5.l, v1.l row_shl:1
+// GFX11: v_rndne_f16_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x01,0x01,0xff]
 
-v_rndne_f16 v5, v1 row_shl:1
-// GFX11: v_rndne_f16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x01,0x01,0xff]
+v_rndne_f16 v5.l, v1.l row_shl:15
+// GFX11: v_rndne_f16_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x0f,0x01,0xff]
 
-v_rndne_f16 v5, v1 row_shl:15
-// GFX11: v_rndne_f16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+v_rndne_f16 v5.l, v1.l row_shr:1
+// GFX11: v_rndne_f16_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x11,0x01,0xff]
 
-v_rndne_f16 v5, v1 row_shr:1
-// GFX11: v_rndne_f16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x11,0x01,0xff]
+v_rndne_f16 v5.l, v1.l row_shr:15
+// GFX11: v_rndne_f16_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x1f,0x01,0xff]
 
-v_rndne_f16 v5, v1 row_shr:15
-// GFX11: v_rndne_f16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+v_rndne_f16 v5.l, v1.l row_ror:1
+// GFX11: v_rndne_f16_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x21,0x01,0xff]
 
-v_rndne_f16 v5, v1 row_ror:1
-// GFX11: v_rndne_f16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x21,0x01,0xff]
+v_rndne_f16 v5.l, v1.l row_ror:15
+// GFX11: v_rndne_f16_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x2f,0x01,0xff]
 
-v_rndne_f16 v5, v1 row_ror:15
-// GFX11: v_rndne_f16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+v_rndne_f16 v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf
+// GFX11: v_rndne_f16_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x50,0x01,0xff]
 
-v_rndne_f16 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
-// GFX11: v_rndne_f16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x50,0x01,0xff]
+v_rndne_f16 v5.l, v1.l row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX11: v_rndne_f16_dpp v5.l, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x5f,0x01,0x01]
 
-v_rndne_f16 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1
-// GFX11: v_rndne_f16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+v_rndne_f16 v5.l, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1
+// GFX11: v_rndne_f16_dpp v5.l, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x60,0x09,0x13]
 
-v_rndne_f16 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
-// GFX11: v_rndne_f16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x60,0x09,0x13]
+v_rndne_f16 v127.l, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1
+// GFX11: v_rndne_f16_dpp v127.l, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xbc,0xfe,0x7e,0x7f,0x6f,0x35,0x30]
 
-v_rndne_f16 v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
-// GFX11: v_rndne_f16_dpp v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xbc,0xfe,0x7e,0x7f,0x6f,0x35,0x30]
+v_rndne_f16 v127.l, v127.l row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX11: v_rndne_f16_dpp v127.l, v127.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xbc,0xfe,0x7e,0x7f,0x5f,0x01,0x01]
+
+v_rndne_f16 v5.h, v1.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX11: v_rndne_f16_dpp v5.h, v1.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xbc,0x0a,0x7f,0x81,0x60,0x09,0x13]
+
+v_rndne_f16 v127.h, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX11: v_rndne_f16_dpp v127.h, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xbc,0xfe,0x7f,0xff,0x6f,0x35,0x30]
 
 v_rndne_f32 v5, v1 quad_perm:[3,2,1,0]
 // GFX11: v_rndne_f32_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x46,0x0a,0x7e,0x01,0x1b,0x00,0xff]
@@ -2522,89 +2585,107 @@ v_rsq_f32 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
 v_rsq_f32 v255, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
 // GFX11: v_rsq_f32_dpp v255, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0x5c,0xfe,0x7f,0xff,0x6f,0x35,0x30]
 
-v_sat_pk_u8_i16 v5, v1 quad_perm:[3,2,1,0]
-// GFX11: v_sat_pk_u8_i16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+v_sat_pk_u8_i16 v5.l, v1 quad_perm:[3,2,1,0]
+// GFX11: v_sat_pk_u8_i16_dpp v5.l, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+
+v_sat_pk_u8_i16 v5.l, v1 quad_perm:[0,1,2,3]
+// GFX11: v_sat_pk_u8_i16_dpp v5.l, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+
+v_sat_pk_u8_i16 v5.l, v1 row_mirror
+// GFX11: v_sat_pk_u8_i16_dpp v5.l, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x40,0x01,0xff]
 
-v_sat_pk_u8_i16 v5, v1 quad_perm:[0,1,2,3]
-// GFX11: v_sat_pk_u8_i16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+v_sat_pk_u8_i16 v5.l, v1 row_half_mirror
+// GFX11: v_sat_pk_u8_i16_dpp v5.l, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x41,0x01,0xff]
 
-v_sat_pk_u8_i16 v5, v1 row_mirror
-// GFX11: v_sat_pk_u8_i16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x40,0x01,0xff]
+v_sat_pk_u8_i16 v5.l, v1 row_shl:1
+// GFX11: v_sat_pk_u8_i16_dpp v5.l, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x01,0x01,0xff]
 
-v_sat_pk_u8_i16 v5, v1 row_half_mirror
-// GFX11: v_sat_pk_u8_i16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x41,0x01,0xff]
+v_sat_pk_u8_i16 v5.l, v1 row_shl:15
+// GFX11: v_sat_pk_u8_i16_dpp v5.l, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x0f,0x01,0xff]
 
-v_sat_pk_u8_i16 v5, v1 row_shl:1
-// GFX11: v_sat_pk_u8_i16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x01,0x01,0xff]
+v_sat_pk_u8_i16 v5.l, v1 row_shr:1
+// GFX11: v_sat_pk_u8_i16_dpp v5.l, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x11,0x01,0xff]
 
-v_sat_pk_u8_i16 v5, v1 row_shl:15
-// GFX11: v_sat_pk_u8_i16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+v_sat_pk_u8_i16 v5.l, v1 row_shr:15
+// GFX11: v_sat_pk_u8_i16_dpp v5.l, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x1f,0x01,0xff]
 
-v_sat_pk_u8_i16 v5, v1 row_shr:1
-// GFX11: v_sat_pk_u8_i16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x11,0x01,0xff]
+v_sat_pk_u8_i16 v5.l, v1 row_ror:1
+// GFX11: v_sat_pk_u8_i16_dpp v5.l, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x21,0x01,0xff]
 
-v_sat_pk_u8_i16 v5, v1 row_shr:15
-// GFX11: v_sat_pk_u8_i16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+v_sat_pk_u8_i16 v5.l, v1 row_ror:15
+// GFX11: v_sat_pk_u8_i16_dpp v5.l, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x2f,0x01,0xff]
 
-v_sat_pk_u8_i16 v5, v1 row_ror:1
-// GFX11: v_sat_pk_u8_i16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x21,0x01,0xff]
+v_sat_pk_u8_i16 v5.l, v1 row_share:0 row_mask:0xf bank_mask:0xf
+// GFX11: v_sat_pk_u8_i16_dpp v5.l, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x50,0x01,0xff]
 
-v_sat_pk_u8_i16 v5, v1 row_ror:15
-// GFX11: v_sat_pk_u8_i16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+v_sat_pk_u8_i16 v5.l, v1 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX11: v_sat_pk_u8_i16_dpp v5.l, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x5f,0x01,0x01]
 
-v_sat_pk_u8_i16 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
-// GFX11: v_sat_pk_u8_i16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x50,0x01,0xff]
+v_sat_pk_u8_i16 v5.l, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1
+// GFX11: v_sat_pk_u8_i16_dpp v5.l, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x60,0x09,0x13]
 
-v_sat_pk_u8_i16 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1
-// GFX11: v_sat_pk_u8_i16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+v_sat_pk_u8_i16 v127.l, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1
+// GFX11: v_sat_pk_u8_i16_dpp v127.l, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xc4,0xfe,0x7e,0xff,0x6f,0x05,0x30]
 
-v_sat_pk_u8_i16 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
-// GFX11: v_sat_pk_u8_i16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x60,0x09,0x13]
+v_sat_pk_u8_i16 v127.l, v1 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX11: v_sat_pk_u8_i16_dpp v127.l, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xc4,0xfe,0x7e,0x01,0x5f,0x01,0x01]
 
-v_sat_pk_u8_i16 v127, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
-// GFX11: v_sat_pk_u8_i16_dpp v127, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xc4,0xfe,0x7e,0xff,0x6f,0x05,0x30]
+v_sat_pk_u8_i16 v5.h, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX11: v_sat_pk_u8_i16_dpp v5.h, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xc4,0x0a,0x7f,0x01,0x60,0x09,0x13]
 
-v_sin_f16 v5, v1 quad_perm:[3,2,1,0]
-// GFX11: v_sin_f16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+v_sat_pk_u8_i16 v127.h, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX11: v_sat_pk_u8_i16_dpp v127.h, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xc4,0xfe,0x7f,0xff,0x6f,0x05,0x30]
 
-v_sin_f16 v5, v1 quad_perm:[0,1,2,3]
-// GFX11: v_sin_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+v_sin_f16 v5.l, v1.l quad_perm:[3,2,1,0]
+// GFX11: v_sin_f16_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x1b,0x00,0xff]
 
-v_sin_f16 v5, v1 row_mirror
-// GFX11: v_sin_f16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x40,0x01,0xff]
+v_sin_f16 v5.l, v1.l quad_perm:[0,1,2,3]
+// GFX11: v_sin_f16_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0xe4,0x00,0xff]
 
-v_sin_f16 v5, v1 row_half_mirror
-// GFX11: v_sin_f16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x41,0x01,0xff]
+v_sin_f16 v5.l, v1.l row_mirror
+// GFX11: v_sin_f16_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x40,0x01,0xff]
 
-v_sin_f16 v5, v1 row_shl:1
-// GFX11: v_sin_f16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x01,0x01,0xff]
+v_sin_f16 v5.l, v1.l row_half_mirror
+// GFX11: v_sin_f16_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x41,0x01,0xff]
 
-v_sin_f16 v5, v1 row_shl:15
-// GFX11: v_sin_f16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+v_sin_f16 v5.l, v1.l row_shl:1
+// GFX11: v_sin_f16_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x01,0x01,0xff]
 
-v_sin_f16 v5, v1 row_shr:1
-// GFX11: v_sin_f16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x11,0x01,0xff]
+v_sin_f16 v5.l, v1.l row_shl:15
+// GFX11: v_sin_f16_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x0f,0x01,0xff]
 
-v_sin_f16 v5, v1 row_shr:15
-// GFX11: v_sin_f16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+v_sin_f16 v5.l, v1.l row_shr:1
+// GFX11: v_sin_f16_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x11,0x01,0xff]
 
-v_sin_f16 v5, v1 row_ror:1
-// GFX11: v_sin_f16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x21,0x01,0xff]
+v_sin_f16 v5.l, v1.l row_shr:15
+// GFX11: v_sin_f16_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x1f,0x01,0xff]
 
-v_sin_f16 v5, v1 row_ror:15
-// GFX11: v_sin_f16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+v_sin_f16 v5.l, v1.l row_ror:1
+// GFX11: v_sin_f16_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x21,0x01,0xff]
 
-v_sin_f16 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
-// GFX11: v_sin_f16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x50,0x01,0xff]
+v_sin_f16 v5.l, v1.l row_ror:15
+// GFX11: v_sin_f16_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x2f,0x01,0xff]
 
-v_sin_f16 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1
-// GFX11: v_sin_f16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+v_sin_f16 v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf
+// GFX11: v_sin_f16_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x50,0x01,0xff]
 
-v_sin_f16 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
-// GFX11: v_sin_f16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x60,0x09,0x13]
+v_sin_f16 v5.l, v1.l row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX11: v_sin_f16_dpp v5.l, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x5f,0x01,0x01]
 
-v_sin_f16 v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
-// GFX11: v_sin_f16_dpp v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xc0,0xfe,0x7e,0x7f,0x6f,0x35,0x30]
+v_sin_f16 v5.l, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1
+// GFX11: v_sin_f16_dpp v5.l, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x60,0x09,0x13]
+
+v_sin_f16 v127.l, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1
+// GFX11: v_sin_f16_dpp v127.l, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xc0,0xfe,0x7e,0x7f,0x6f,0x35,0x30]
+
+v_sin_f16 v127.l, v127.l row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX11: v_sin_f16_dpp v127.l, v127.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xc0,0xfe,0x7e,0x7f,0x5f,0x01,0x01]
+
+v_sin_f16 v5.h, v1.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX11: v_sin_f16_dpp v5.h, v1.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xc0,0x0a,0x7f,0x81,0x60,0x09,0x13]
+
+v_sin_f16 v127.h, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX11: v_sin_f16_dpp v127.h, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xc0,0xfe,0x7f,0xff,0x6f,0x35,0x30]
 
 v_sin_f32 v5, v1 quad_perm:[3,2,1,0]
 // GFX11: v_sin_f32_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x6a,0x0a,0x7e,0x01,0x1b,0x00,0xff]
@@ -2732,47 +2813,56 @@ v_sqrt_f32 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
 v_sqrt_f32 v255, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
 // GFX11: v_sqrt_f32_dpp v255, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0x66,0xfe,0x7f,0xff,0x6f,0x35,0x30]
 
-v_trunc_f16 v5, v1 quad_perm:[3,2,1,0]
-// GFX11: v_trunc_f16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+v_trunc_f16 v5.l, v1.l quad_perm:[3,2,1,0]
+// GFX11: v_trunc_f16_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+
+v_trunc_f16 v5.l, v1.l quad_perm:[0,1,2,3]
+// GFX11: v_trunc_f16_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+
+v_trunc_f16 v5.l, v1.l row_mirror
+// GFX11: v_trunc_f16_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x40,0x01,0xff]
+
+v_trunc_f16 v5.l, v1.l row_half_mirror
+// GFX11: v_trunc_f16_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x41,0x01,0xff]
 
-v_trunc_f16 v5, v1 quad_perm:[0,1,2,3]
-// GFX11: v_trunc_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+v_trunc_f16 v5.l, v1.l row_shl:1
+// GFX11: v_trunc_f16_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x01,0x01,0xff]
 
-v_trunc_f16 v5, v1 row_mirror
-// GFX11: v_trunc_f16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x40,0x01,0xff]
+v_trunc_f16 v5.l, v1.l row_shl:15
+// GFX11: v_trunc_f16_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x0f,0x01,0xff]
 
-v_trunc_f16 v5, v1 row_half_mirror
-// GFX11: v_trunc_f16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x41,0x01,0xff]
+v_trunc_f16 v5.l, v1.l row_shr:1
+// GFX11: v_trunc_f16_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x11,0x01,0xff]
 
-v_trunc_f16 v5, v1 row_shl:1
-// GFX11: v_trunc_f16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x01,0x01,0xff]
+v_trunc_f16 v5.l, v1.l row_shr:15
+// GFX11: v_trunc_f16_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x1f,0x01,0xff]
 
-v_trunc_f16 v5, v1 row_shl:15
-// GFX11: v_trunc_f16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+v_trunc_f16 v5.l, v1.l row_ror:1
+// GFX11: v_trunc_f16_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x21,0x01,0xff]
 
-v_trunc_f16 v5, v1 row_shr:1
-// GFX11: v_trunc_f16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x11,0x01,0xff]
+v_trunc_f16 v5.l, v1.l row_ror:15
+// GFX11: v_trunc_f16_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x2f,0x01,0xff]
 
-v_trunc_f16 v5, v1 row_shr:15
-// GFX11: v_trunc_f16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+v_trunc_f16 v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf
+// GFX11: v_trunc_f16_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x50,0x01,0xff]
 
-v_trunc_f16 v5, v1 row_ror:1
-// GFX11: v_trunc_f16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x21,0x01,0xff]
+v_trunc_f16 v5.l, v1.l row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX11: v_trunc_f16_dpp v5.l, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x5f,0x01,0x01]
 
-v_trunc_f16 v5, v1 row_ror:15
-// GFX11: v_trunc_f16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+v_trunc_f16 v5.l, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1
+// GFX11: v_trunc_f16_dpp v5.l, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x60,0x09,0x13]
 
-v_trunc_f16 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
-// GFX11: v_trunc_f16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x50,0x01,0xff]
+v_trunc_f16 v127.l, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1
+// GFX11: v_trunc_f16_dpp v127.l, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xba,0xfe,0x7e,0x7f,0x6f,0x35,0x30]
 
-v_trunc_f16 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1
-// GFX11: v_trunc_f16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+v_trunc_f16 v127.l, v127.l row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX11: v_trunc_f16_dpp v127.l, v127.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xba,0xfe,0x7e,0x7f,0x5f,0x01,0x01]
 
-v_trunc_f16 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
-// GFX11: v_trunc_f16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x60,0x09,0x13]
+v_trunc_f16 v5.h, v1.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX11: v_trunc_f16_dpp v5.h, v1.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xba,0x0a,0x7f,0x81,0x60,0x09,0x13]
 
-v_trunc_f16 v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
-// GFX11: v_trunc_f16_dpp v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xba,0xfe,0x7e,0x7f,0x6f,0x35,0x30]
+v_trunc_f16 v127.h, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX11: v_trunc_f16_dpp v127.h, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xba,0xfe,0x7f,0xff,0x6f,0x35,0x30]
 
 v_trunc_f32 v5, v1 quad_perm:[3,2,1,0]
 // GFX11: v_trunc_f32_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x42,0x0a,0x7e,0x01,0x1b,0x00,0xff]
diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop1_dpp8.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop1_dpp8.s
index ab4606a..ba0c349 100644
--- a/llvm/test/MC/AMDGPU/gfx11_asm_vop1_dpp8.s
+++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop1_dpp8.s
@@ -50,14 +50,23 @@ v_clz_i32_u32 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
 v_clz_i32_u32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
 // GFX11: v_clz_i32_u32_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0x72,0xfe,0x7f,0xff,0x00,0x00,0x00]
 
-v_cos_f16 v5, v1 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_cos_f16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xc2,0x0a,0x7e,0x01,0x77,0x39,0x05]
+v_cos_f16 v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_cos_f16_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xc2,0x0a,0x7e,0x01,0x77,0x39,0x05]
 
-v_cos_f16 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
-// GFX11: v_cos_f16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xc2,0x0a,0x7e,0x01,0x77,0x39,0x05]
+v_cos_f16 v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX11: v_cos_f16_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xc2,0x0a,0x7e,0x01,0x77,0x39,0x05]
 
-v_cos_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0
-// GFX11: v_cos_f16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xc2,0xfe,0x7e,0x7f,0x00,0x00,0x00]
+v_cos_f16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0]
+// GFX11: v_cos_f16_dpp v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xc2,0xfe,0x7e,0x7f,0x00,0x00,0x00]
+
+v_cos_f16 v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_cos_f16_dpp v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xc2,0xfe,0x7e,0x7f,0x77,0x39,0x05]
+
+v_cos_f16 v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX11: v_cos_f16_dpp v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xc2,0x0a,0x7f,0x81,0x77,0x39,0x05]
+
+v_cos_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX11: v_cos_f16_dpp v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xc2,0xfe,0x7f,0xff,0x00,0x00,0x00]
 
 v_cos_f32 v5, v1 dpp8:[7,6,5,4,3,2,1,0]
 // GFX11: v_cos_f32_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x6c,0x0a,0x7e,0x01,0x77,0x39,0x05]
@@ -218,14 +227,23 @@ v_cvt_i32_f32 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
 v_cvt_i32_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
 // GFX11: v_cvt_i32_f32_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0x10,0xfe,0x7f,0xff,0x00,0x00,0x00]
 
-v_cvt_i32_i16 v5, v1 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_cvt_i32_i16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xd4,0x0a,0x7e,0x01,0x77,0x39,0x05]
+v_cvt_i32_i16 v5, v1.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_cvt_i32_i16_dpp v5, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xd4,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_cvt_i32_i16 v5, v1.l dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX11: v_cvt_i32_i16_dpp v5, v1.l dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xd4,0x0a,0x7e,0x01,0x77,0x39,0x05]
 
-v_cvt_i32_i16 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
-// GFX11: v_cvt_i32_i16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xd4,0x0a,0x7e,0x01,0x77,0x39,0x05]
+v_cvt_i32_i16 v255, v127.l dpp8:[0,0,0,0,0,0,0,0]
+// GFX11: v_cvt_i32_i16_dpp v255, v127.l dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xd4,0xfe,0x7f,0x7f,0x00,0x00,0x00]
 
-v_cvt_i32_i16 v255, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0
-// GFX11: v_cvt_i32_i16_dpp v255, v127 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xd4,0xfe,0x7f,0x7f,0x00,0x00,0x00]
+v_cvt_i32_i16 v5, v127.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_cvt_i32_i16_dpp v5, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xd4,0x0a,0x7e,0x7f,0x77,0x39,0x05]
+
+v_cvt_i32_i16 v5, v1.h dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX11: v_cvt_i32_i16_dpp v5, v1.h dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xd4,0x0a,0x7e,0x81,0x77,0x39,0x05]
+
+v_cvt_i32_i16 v255, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX11: v_cvt_i32_i16_dpp v255, v127.h dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xd4,0xfe,0x7f,0xff,0x00,0x00,0x00]
 
 v_cvt_nearest_i32_f32 v5, v1 dpp8:[7,6,5,4,3,2,1,0]
 // GFX11: v_cvt_nearest_i32_f32_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x18,0x0a,0x7e,0x01,0x77,0x39,0x05]
@@ -299,14 +317,23 @@ v_cvt_u32_f32 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
 v_cvt_u32_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
 // GFX11: v_cvt_u32_f32_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0x0e,0xfe,0x7f,0xff,0x00,0x00,0x00]
 
-v_cvt_u32_u16 v5, v1 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_cvt_u32_u16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xd6,0x0a,0x7e,0x01,0x77,0x39,0x05]
+v_cvt_u32_u16 v5, v1.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_cvt_u32_u16_dpp v5, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xd6,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_cvt_u32_u16 v5, v1.l dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX11: v_cvt_u32_u16_dpp v5, v1.l dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xd6,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_cvt_u32_u16 v255, v127.l dpp8:[0,0,0,0,0,0,0,0]
+// GFX11: v_cvt_u32_u16_dpp v255, v127.l dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xd6,0xfe,0x7f,0x7f,0x00,0x00,0x00]
 
-v_cvt_u32_u16 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
-// GFX11: v_cvt_u32_u16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xd6,0x0a,0x7e,0x01,0x77,0x39,0x05]
+v_cvt_u32_u16 v5, v127.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_cvt_u32_u16_dpp v5, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xd6,0x0a,0x7e,0x7f,0x77,0x39,0x05]
 
-v_cvt_u32_u16 v255, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0
-// GFX11: v_cvt_u32_u16_dpp v255, v127 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xd6,0xfe,0x7f,0x7f,0x00,0x00,0x00]
+v_cvt_u32_u16 v5, v1.h dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX11: v_cvt_u32_u16_dpp v5, v1.h dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xd6,0x0a,0x7e,0x81,0x77,0x39,0x05]
+
+v_cvt_u32_u16 v255, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX11: v_cvt_u32_u16_dpp v255, v127.h dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xd6,0xfe,0x7f,0xff,0x00,0x00,0x00]
 
 v_exp_f16 v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0]
 // GFX11: v_exp_f16_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xb0,0x0a,0x7e,0x01,0x77,0x39,0x05]
@@ -377,14 +404,23 @@ v_floor_f32 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
 v_floor_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
 // GFX11: v_floor_f32_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0x48,0xfe,0x7f,0xff,0x00,0x00,0x00]
 
-v_fract_f16 v5, v1 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_fract_f16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xbe,0x0a,0x7e,0x01,0x77,0x39,0x05]
+v_fract_f16 v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_fract_f16_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xbe,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_fract_f16 v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX11: v_fract_f16_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xbe,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_fract_f16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0]
+// GFX11: v_fract_f16_dpp v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xbe,0xfe,0x7e,0x7f,0x00,0x00,0x00]
+
+v_fract_f16 v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_fract_f16_dpp v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xbe,0xfe,0x7e,0x7f,0x77,0x39,0x05]
 
-v_fract_f16 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
-// GFX11: v_fract_f16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xbe,0x0a,0x7e,0x01,0x77,0x39,0x05]
+v_fract_f16 v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX11: v_fract_f16_dpp v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xbe,0x0a,0x7f,0x81,0x77,0x39,0x05]
 
-v_fract_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0
-// GFX11: v_fract_f16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xbe,0xfe,0x7e,0x7f,0x00,0x00,0x00]
+v_fract_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX11: v_fract_f16_dpp v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xbe,0xfe,0x7f,0xff,0x00,0x00,0x00]
 
 v_fract_f32 v5, v1 dpp8:[7,6,5,4,3,2,1,0]
 // GFX11: v_fract_f32_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x40,0x0a,0x7e,0x01,0x77,0x39,0x05]
@@ -416,14 +452,23 @@ v_frexp_exp_i32_f32 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
 v_frexp_exp_i32_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
 // GFX11: v_frexp_exp_i32_f32_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0x7e,0xfe,0x7f,0xff,0x00,0x00,0x00]
 
-v_frexp_mant_f16 v5, v1 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_frexp_mant_f16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xb2,0x0a,0x7e,0x01,0x77,0x39,0x05]
+v_frexp_mant_f16 v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_frexp_mant_f16_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xb2,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_frexp_mant_f16 v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX11: v_frexp_mant_f16_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xb2,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_frexp_mant_f16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0]
+// GFX11: v_frexp_mant_f16_dpp v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xb2,0xfe,0x7e,0x7f,0x00,0x00,0x00]
+
+v_frexp_mant_f16 v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_frexp_mant_f16_dpp v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xb2,0xfe,0x7e,0x7f,0x77,0x39,0x05]
 
-v_frexp_mant_f16 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
-// GFX11: v_frexp_mant_f16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xb2,0x0a,0x7e,0x01,0x77,0x39,0x05]
+v_frexp_mant_f16 v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX11: v_frexp_mant_f16_dpp v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xb2,0x0a,0x7f,0x81,0x77,0x39,0x05]
 
-v_frexp_mant_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0
-// GFX11: v_frexp_mant_f16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xb2,0xfe,0x7e,0x7f,0x00,0x00,0x00]
+v_frexp_mant_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX11: v_frexp_mant_f16_dpp v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xb2,0xfe,0x7f,0xff,0x00,0x00,0x00]
 
 v_frexp_mant_f32 v5, v1 dpp8:[7,6,5,4,3,2,1,0]
 // GFX11: v_frexp_mant_f32_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x80,0x0a,0x7e,0x01,0x77,0x39,0x05]
@@ -500,14 +545,23 @@ v_movrelsd_b32 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
 v_movrelsd_b32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
 // GFX11: v_movrelsd_b32_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0x88,0xfe,0x7f,0xff,0x00,0x00,0x00]
 
-v_not_b16 v5, v1 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_not_b16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xd2,0x0a,0x7e,0x01,0x77,0x39,0x05]
+v_not_b16 v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_not_b16_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xd2,0x0a,0x7e,0x01,0x77,0x39,0x05]
 
-v_not_b16 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
-// GFX11: v_not_b16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xd2,0x0a,0x7e,0x01,0x77,0x39,0x05]
+v_not_b16 v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX11: v_not_b16_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xd2,0x0a,0x7e,0x01,0x77,0x39,0x05]
 
-v_not_b16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0
-// GFX11: v_not_b16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xd2,0xfe,0x7e,0x7f,0x00,0x00,0x00]
+v_not_b16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0]
+// GFX11: v_not_b16_dpp v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xd2,0xfe,0x7e,0x7f,0x00,0x00,0x00]
+
+v_not_b16 v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_not_b16_dpp v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xd2,0xfe,0x7e,0x7f,0x77,0x39,0x05]
+
+v_not_b16 v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX11: v_not_b16_dpp v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xd2,0x0a,0x7f,0x81,0x77,0x39,0x05]
+
+v_not_b16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX11: v_not_b16_dpp v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xd2,0xfe,0x7f,0xff,0x00,0x00,0x00]
 
 v_not_b32 v5, v1 dpp8:[7,6,5,4,3,2,1,0]
 // GFX11: v_not_b32_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x6e,0x0a,0x7e,0x01,0x77,0x39,0x05]
@@ -548,14 +602,23 @@ v_rcp_iflag_f32 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
 v_rcp_iflag_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
 // GFX11: v_rcp_iflag_f32_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0x56,0xfe,0x7f,0xff,0x00,0x00,0x00]
 
-v_rndne_f16 v5, v1 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_rndne_f16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xbc,0x0a,0x7e,0x01,0x77,0x39,0x05]
+v_rndne_f16 v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_rndne_f16_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xbc,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_rndne_f16 v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX11: v_rndne_f16_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xbc,0x0a,0x7e,0x01,0x77,0x39,0x05]
 
-v_rndne_f16 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
-// GFX11: v_rndne_f16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xbc,0x0a,0x7e,0x01,0x77,0x39,0x05]
+v_rndne_f16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0]
+// GFX11: v_rndne_f16_dpp v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xbc,0xfe,0x7e,0x7f,0x00,0x00,0x00]
 
-v_rndne_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0
-// GFX11: v_rndne_f16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xbc,0xfe,0x7e,0x7f,0x00,0x00,0x00]
+v_rndne_f16 v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_rndne_f16_dpp v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xbc,0xfe,0x7e,0x7f,0x77,0x39,0x05]
+
+v_rndne_f16 v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX11: v_rndne_f16_dpp v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xbc,0x0a,0x7f,0x81,0x77,0x39,0x05]
+
+v_rndne_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX11: v_rndne_f16_dpp v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xbc,0xfe,0x7f,0xff,0x00,0x00,0x00]
 
 v_rndne_f32 v5, v1 dpp8:[7,6,5,4,3,2,1,0]
 // GFX11: v_rndne_f32_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x46,0x0a,0x7e,0x01,0x77,0x39,0x05]
@@ -587,23 +650,41 @@ v_rsq_f32 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
 v_rsq_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
 // GFX11: v_rsq_f32_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0x5c,0xfe,0x7f,0xff,0x00,0x00,0x00]
 
-v_sat_pk_u8_i16 v5, v1 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_sat_pk_u8_i16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xc4,0x0a,0x7e,0x01,0x77,0x39,0x05]
+v_sat_pk_u8_i16 v5.l, v1 dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_sat_pk_u8_i16_dpp v5.l, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xc4,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_sat_pk_u8_i16 v5.l, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX11: v_sat_pk_u8_i16_dpp v5.l, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xc4,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_sat_pk_u8_i16 v127.l, v255 dpp8:[0,0,0,0,0,0,0,0]
+// GFX11: v_sat_pk_u8_i16_dpp v127.l, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xc4,0xfe,0x7e,0xff,0x00,0x00,0x00]
 
-v_sat_pk_u8_i16 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
-// GFX11: v_sat_pk_u8_i16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xc4,0x0a,0x7e,0x01,0x77,0x39,0x05]
+v_sat_pk_u8_i16 v127.l, v1 dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_sat_pk_u8_i16_dpp v127.l, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xc4,0xfe,0x7e,0x01,0x77,0x39,0x05]
 
-v_sat_pk_u8_i16 v127, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
-// GFX11: v_sat_pk_u8_i16_dpp v127, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xc4,0xfe,0x7e,0xff,0x00,0x00,0x00]
+v_sat_pk_u8_i16 v5.h, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX11: v_sat_pk_u8_i16_dpp v5.h, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xc4,0x0a,0x7f,0x01,0x77,0x39,0x05]
 
-v_sin_f16 v5, v1 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_sin_f16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xc0,0x0a,0x7e,0x01,0x77,0x39,0x05]
+v_sat_pk_u8_i16 v127.h, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX11: v_sat_pk_u8_i16_dpp v127.h, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xc4,0xfe,0x7f,0xff,0x00,0x00,0x00]
 
-v_sin_f16 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
-// GFX11: v_sin_f16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xc0,0x0a,0x7e,0x01,0x77,0x39,0x05]
+v_sin_f16 v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_sin_f16_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xc0,0x0a,0x7e,0x01,0x77,0x39,0x05]
 
-v_sin_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0
-// GFX11: v_sin_f16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xc0,0xfe,0x7e,0x7f,0x00,0x00,0x00]
+v_sin_f16 v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX11: v_sin_f16_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xc0,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_sin_f16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0]
+// GFX11: v_sin_f16_dpp v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xc0,0xfe,0x7e,0x7f,0x00,0x00,0x00]
+
+v_sin_f16 v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_sin_f16_dpp v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xc0,0xfe,0x7e,0x7f,0x77,0x39,0x05]
+
+v_sin_f16 v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX11: v_sin_f16_dpp v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xc0,0x0a,0x7f,0x81,0x77,0x39,0x05]
+
+v_sin_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX11: v_sin_f16_dpp v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xc0,0xfe,0x7f,0xff,0x00,0x00,0x00]
 
 v_sin_f32 v5, v1 dpp8:[7,6,5,4,3,2,1,0]
 // GFX11: v_sin_f32_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x6a,0x0a,0x7e,0x01,0x77,0x39,0x05]
@@ -635,14 +716,23 @@ v_sqrt_f32 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
 v_sqrt_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
 // GFX11: v_sqrt_f32_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0x66,0xfe,0x7f,0xff,0x00,0x00,0x00]
 
-v_trunc_f16 v5, v1 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_trunc_f16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xba,0x0a,0x7e,0x01,0x77,0x39,0x05]
+v_trunc_f16 v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_trunc_f16_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xba,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_trunc_f16 v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX11: v_trunc_f16_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xba,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_trunc_f16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0]
+// GFX11: v_trunc_f16_dpp v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xba,0xfe,0x7e,0x7f,0x00,0x00,0x00]
+
+v_trunc_f16 v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_trunc_f16_dpp v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xba,0xfe,0x7e,0x7f,0x77,0x39,0x05]
 
-v_trunc_f16 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
-// GFX11: v_trunc_f16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xba,0x0a,0x7e,0x01,0x77,0x39,0x05]
+v_trunc_f16 v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX11: v_trunc_f16_dpp v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xba,0x0a,0x7f,0x81,0x77,0x39,0x05]
 
-v_trunc_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0
-// GFX11: v_trunc_f16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xba,0xfe,0x7e,0x7f,0x00,0x00,0x00]
+v_trunc_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX11: v_trunc_f16_dpp v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xba,0xfe,0x7f,0xff,0x00,0x00,0x00]
 
 v_trunc_f32 v5, v1 dpp8:[7,6,5,4,3,2,1,0]
 // GFX11: v_trunc_f32_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x42,0x0a,0x7e,0x01,0x77,0x39,0x05]
diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop1_t16_err.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop1_t16_err.s
index 4ae9134..dea33dc 100644
--- a/llvm/test/MC/AMDGPU/gfx11_asm_vop1_t16_err.s
+++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop1_t16_err.s
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_mc_test_checks.py UTC_ARGS: --version 5
+// NOTE: Assertions have been autogenerated by utils/update_mc_test_checks.py UTC_ARGS: --version 5
 // RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+real-true16,+wavefrontsize32 -show-encoding %s 2>&1 | FileCheck --check-prefix=GFX11 --implicit-check-not=error: %s
 // RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+real-true16,+wavefrontsize64 -show-encoding %s 2>&1 | FileCheck --check-prefix=GFX11 --implicit-check-not=error: %s
 
@@ -47,6 +47,12 @@ v_ceil_f16_e32 v5.l, v199.l quad_perm:[3,2,1,0]
 v_cos_f16_e32 v128, 0xfe0b
 // GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode
 
+v_cos_f16_e32 v128.h, 0xfe0b
+// GFX11: :[[@LINE-1]]:15: error: invalid operand for instruction
+
+v_cos_f16_e32 v128.l, 0xfe0b
+// GFX11: :[[@LINE-1]]:15: error: invalid operand for instruction
+
 v_cos_f16_e32 v255, v1
 // GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode
 
@@ -56,6 +62,24 @@ v_cos_f16_e32 v255, v1 dpp8:[7,6,5,4,3,2,1,0]
 v_cos_f16_e32 v255, v1 quad_perm:[3,2,1,0]
 // GFX11: :[[@LINE-1]]:24: error: invalid operand for instruction
 
+v_cos_f16_e32 v255.h, v1.h
+// GFX11: :[[@LINE-1]]:15: error: invalid operand for instruction
+
+v_cos_f16_e32 v255.h, v1.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: :[[@LINE-1]]:15: error: invalid operand for instruction
+
+v_cos_f16_e32 v255.h, v1.h quad_perm:[3,2,1,0]
+// GFX11: :[[@LINE-1]]:15: error: invalid operand for instruction
+
+v_cos_f16_e32 v255.l, v1.l
+// GFX11: :[[@LINE-1]]:15: error: invalid operand for instruction
+
+v_cos_f16_e32 v255.l, v1.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: :[[@LINE-1]]:15: error: invalid operand for instruction
+
+v_cos_f16_e32 v255.l, v1.l quad_perm:[3,2,1,0]
+// GFX11: :[[@LINE-1]]:15: error: invalid operand for instruction
+
 v_cos_f16_e32 v5, v199
 // GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode
 
@@ -65,6 +89,24 @@ v_cos_f16_e32 v5, v199 dpp8:[7,6,5,4,3,2,1,0]
 v_cos_f16_e32 v5, v199 quad_perm:[3,2,1,0]
 // GFX11: :[[@LINE-1]]:24: error: invalid operand for instruction
 
+v_cos_f16_e32 v5.h, v199.h
+// GFX11: :[[@LINE-1]]:21: error: invalid operand for instruction
+
+v_cos_f16_e32 v5.h, v199.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: :[[@LINE-1]]:21: error: invalid operand for instruction
+
+v_cos_f16_e32 v5.h, v199.h quad_perm:[3,2,1,0]
+// GFX11: :[[@LINE-1]]:21: error: invalid operand for instruction
+
+v_cos_f16_e32 v5.l, v199.l
+// GFX11: :[[@LINE-1]]:21: error: invalid operand for instruction
+
+v_cos_f16_e32 v5.l, v199.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: :[[@LINE-1]]:21: error: invalid operand for instruction
+
+v_cos_f16_e32 v5.l, v199.l quad_perm:[3,2,1,0]
+// GFX11: :[[@LINE-1]]:21: error: invalid operand for instruction
+
 v_cvt_f16_f32_e32 v128, 0xaf123456
 // GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode
 
@@ -236,6 +278,24 @@ v_cvt_i32_i16_e32 v5, v199 dpp8:[7,6,5,4,3,2,1,0]
 v_cvt_i32_i16_e32 v5, v199 quad_perm:[3,2,1,0]
 // GFX11: :[[@LINE-1]]:23: error: invalid operand for instruction
 
+v_cvt_i32_i16_e32 v5.h, v199.h
+// GFX11: :[[@LINE-1]]:19: error: invalid operand for instruction
+
+v_cvt_i32_i16_e32 v5.h, v199.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: :[[@LINE-1]]:19: error: invalid operand for instruction
+
+v_cvt_i32_i16_e32 v5.h, v199.h quad_perm:[3,2,1,0]
+// GFX11: :[[@LINE-1]]:19: error: invalid operand for instruction
+
+v_cvt_i32_i16_e32 v5.l, v199.l
+// GFX11: :[[@LINE-1]]:19: error: invalid operand for instruction
+
+v_cvt_i32_i16_e32 v5.l, v199.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: :[[@LINE-1]]:19: error: invalid operand for instruction
+
+v_cvt_i32_i16_e32 v5.l, v199.l quad_perm:[3,2,1,0]
+// GFX11: :[[@LINE-1]]:19: error: invalid operand for instruction
+
 v_cvt_norm_i16_f16_e32 v128.h, 0xfe0b
 // GFX11: :[[@LINE-1]]:24: error: invalid operand for instruction
 
@@ -371,6 +431,24 @@ v_cvt_u32_u16_e32 v5, v199 dpp8:[7,6,5,4,3,2,1,0]
 v_cvt_u32_u16_e32 v5, v199 quad_perm:[3,2,1,0]
 // GFX11: :[[@LINE-1]]:23: error: invalid operand for instruction
 
+v_cvt_u32_u16_e32 v5.h, v199.h
+// GFX11: :[[@LINE-1]]:19: error: invalid operand for instruction
+
+v_cvt_u32_u16_e32 v5.h, v199.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: :[[@LINE-1]]:19: error: invalid operand for instruction
+
+v_cvt_u32_u16_e32 v5.h, v199.h quad_perm:[3,2,1,0]
+// GFX11: :[[@LINE-1]]:19: error: invalid operand for instruction
+
+v_cvt_u32_u16_e32 v5.l, v199.l
+// GFX11: :[[@LINE-1]]:19: error: invalid operand for instruction
+
+v_cvt_u32_u16_e32 v5.l, v199.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: :[[@LINE-1]]:19: error: invalid operand for instruction
+
+v_cvt_u32_u16_e32 v5.l, v199.l quad_perm:[3,2,1,0]
+// GFX11: :[[@LINE-1]]:19: error: invalid operand for instruction
+
 v_exp_f16_e32 v128.h, 0xfe0b
 // GFX11: :[[@LINE-1]]:15: error: invalid operand for instruction
 
@@ -458,6 +536,12 @@ v_floor_f16_e32 v5.l, v199.l quad_perm:[3,2,1,0]
 v_fract_f16_e32 v128, 0xfe0b
 // GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode
 
+v_fract_f16_e32 v128.h, 0xfe0b
+// GFX11: :[[@LINE-1]]:17: error: invalid operand for instruction
+
+v_fract_f16_e32 v128.l, 0xfe0b
+// GFX11: :[[@LINE-1]]:17: error: invalid operand for instruction
+
 v_fract_f16_e32 v255, v1
 // GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode
 
@@ -467,6 +551,24 @@ v_fract_f16_e32 v255, v1 dpp8:[7,6,5,4,3,2,1,0]
 v_fract_f16_e32 v255, v1 quad_perm:[3,2,1,0]
 // GFX11: :[[@LINE-1]]:26: error: invalid operand for instruction
 
+v_fract_f16_e32 v255.h, v1.h
+// GFX11: :[[@LINE-1]]:17: error: invalid operand for instruction
+
+v_fract_f16_e32 v255.h, v1.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: :[[@LINE-1]]:17: error: invalid operand for instruction
+
+v_fract_f16_e32 v255.h, v1.h quad_perm:[3,2,1,0]
+// GFX11: :[[@LINE-1]]:17: error: invalid operand for instruction
+
+v_fract_f16_e32 v255.l, v1.l
+// GFX11: :[[@LINE-1]]:17: error: invalid operand for instruction
+
+v_fract_f16_e32 v255.l, v1.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: :[[@LINE-1]]:17: error: invalid operand for instruction
+
+v_fract_f16_e32 v255.l, v1.l quad_perm:[3,2,1,0]
+// GFX11: :[[@LINE-1]]:17: error: invalid operand for instruction
+
 v_fract_f16_e32 v5, v199
 // GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode
 
@@ -476,6 +578,24 @@ v_fract_f16_e32 v5, v199 dpp8:[7,6,5,4,3,2,1,0]
 v_fract_f16_e32 v5, v199 quad_perm:[3,2,1,0]
 // GFX11: :[[@LINE-1]]:26: error: invalid operand for instruction
 
+v_fract_f16_e32 v5.h, v199.h
+// GFX11: :[[@LINE-1]]:23: error: invalid operand for instruction
+
+v_fract_f16_e32 v5.h, v199.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: :[[@LINE-1]]:23: error: invalid operand for instruction
+
+v_fract_f16_e32 v5.h, v199.h quad_perm:[3,2,1,0]
+// GFX11: :[[@LINE-1]]:23: error: invalid operand for instruction
+
+v_fract_f16_e32 v5.l, v199.l
+// GFX11: :[[@LINE-1]]:23: error: invalid operand for instruction
+
+v_fract_f16_e32 v5.l, v199.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: :[[@LINE-1]]:23: error: invalid operand for instruction
+
+v_fract_f16_e32 v5.l, v199.l quad_perm:[3,2,1,0]
+// GFX11: :[[@LINE-1]]:23: error: invalid operand for instruction
+
 v_frexp_exp_i16_f16_e32 v128.h, 0xfe0b
 // GFX11: :[[@LINE-1]]:25: error: invalid operand for instruction
 
@@ -521,6 +641,12 @@ v_frexp_exp_i16_f16_e32 v5.l, v199.l quad_perm:[3,2,1,0]
 v_frexp_mant_f16_e32 v128, 0xfe0b
 // GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode
 
+v_frexp_mant_f16_e32 v128.h, 0xfe0b
+// GFX11: :[[@LINE-1]]:22: error: invalid operand for instruction
+
+v_frexp_mant_f16_e32 v128.l, 0xfe0b
+// GFX11: :[[@LINE-1]]:22: error: invalid operand for instruction
+
 v_frexp_mant_f16_e32 v255, v1
 // GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode
 
@@ -530,6 +656,24 @@ v_frexp_mant_f16_e32 v255, v1 dpp8:[7,6,5,4,3,2,1,0]
 v_frexp_mant_f16_e32 v255, v1 quad_perm:[3,2,1,0]
 // GFX11: :[[@LINE-1]]:31: error: invalid operand for instruction
 
+v_frexp_mant_f16_e32 v255.h, v1.h
+// GFX11: :[[@LINE-1]]:22: error: invalid operand for instruction
+
+v_frexp_mant_f16_e32 v255.h, v1.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: :[[@LINE-1]]:22: error: invalid operand for instruction
+
+v_frexp_mant_f16_e32 v255.h, v1.h quad_perm:[3,2,1,0]
+// GFX11: :[[@LINE-1]]:22: error: invalid operand for instruction
+
+v_frexp_mant_f16_e32 v255.l, v1.l
+// GFX11: :[[@LINE-1]]:22: error: invalid operand for instruction
+
+v_frexp_mant_f16_e32 v255.l, v1.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: :[[@LINE-1]]:22: error: invalid operand for instruction
+
+v_frexp_mant_f16_e32 v255.l, v1.l quad_perm:[3,2,1,0]
+// GFX11: :[[@LINE-1]]:22: error: invalid operand for instruction
+
 v_frexp_mant_f16_e32 v5, v199
 // GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode
 
@@ -539,6 +683,24 @@ v_frexp_mant_f16_e32 v5, v199 dpp8:[7,6,5,4,3,2,1,0]
 v_frexp_mant_f16_e32 v5, v199 quad_perm:[3,2,1,0]
 // GFX11: :[[@LINE-1]]:31: error: invalid operand for instruction
 
+v_frexp_mant_f16_e32 v5.h, v199.h
+// GFX11: :[[@LINE-1]]:28: error: invalid operand for instruction
+
+v_frexp_mant_f16_e32 v5.h, v199.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: :[[@LINE-1]]:28: error: invalid operand for instruction
+
+v_frexp_mant_f16_e32 v5.h, v199.h quad_perm:[3,2,1,0]
+// GFX11: :[[@LINE-1]]:28: error: invalid operand for instruction
+
+v_frexp_mant_f16_e32 v5.l, v199.l
+// GFX11: :[[@LINE-1]]:28: error: invalid operand for instruction
+
+v_frexp_mant_f16_e32 v5.l, v199.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: :[[@LINE-1]]:28: error: invalid operand for instruction
+
+v_frexp_mant_f16_e32 v5.l, v199.l quad_perm:[3,2,1,0]
+// GFX11: :[[@LINE-1]]:28: error: invalid operand for instruction
+
 v_log_f16_e32 v128.h, 0xfe0b
 // GFX11: :[[@LINE-1]]:15: error: invalid operand for instruction
 
@@ -584,6 +746,12 @@ v_log_f16_e32 v5.l, v199.l quad_perm:[3,2,1,0]
 v_not_b16_e32 v128, 0xfe0b
 // GFX11: :[[@LINE-1]]:15: error: invalid operand for instruction
 
+v_not_b16_e32 v128.h, 0xfe0b
+// GFX11: :[[@LINE-1]]:15: error: invalid operand for instruction
+
+v_not_b16_e32 v128.l, 0xfe0b
+// GFX11: :[[@LINE-1]]:15: error: invalid operand for instruction
+
 v_not_b16_e32 v255, v1
 // GFX11: :[[@LINE-1]]:15: error: invalid operand for instruction
 
@@ -593,6 +761,24 @@ v_not_b16_e32 v255, v1 dpp8:[7,6,5,4,3,2,1,0]
 v_not_b16_e32 v255, v1 quad_perm:[3,2,1,0]
 // GFX11: :[[@LINE-1]]:15: error: invalid operand for instruction
 
+v_not_b16_e32 v255.h, v1.h
+// GFX11: :[[@LINE-1]]:15: error: invalid operand for instruction
+
+v_not_b16_e32 v255.h, v1.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: :[[@LINE-1]]:15: error: invalid operand for instruction
+
+v_not_b16_e32 v255.h, v1.h quad_perm:[3,2,1,0]
+// GFX11: :[[@LINE-1]]:15: error: invalid operand for instruction
+
+v_not_b16_e32 v255.l, v1.l
+// GFX11: :[[@LINE-1]]:15: error: invalid operand for instruction
+
+v_not_b16_e32 v255.l, v1.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: :[[@LINE-1]]:15: error: invalid operand for instruction
+
+v_not_b16_e32 v255.l, v1.l quad_perm:[3,2,1,0]
+// GFX11: :[[@LINE-1]]:15: error: invalid operand for instruction
+
 v_not_b16_e32 v5, v199
 // GFX11: :[[@LINE-1]]:19: error: invalid operand for instruction
 
@@ -602,6 +788,24 @@ v_not_b16_e32 v5, v199 dpp8:[7,6,5,4,3,2,1,0]
 v_not_b16_e32 v5, v199 quad_perm:[3,2,1,0]
 // GFX11: :[[@LINE-1]]:19: error: invalid operand for instruction
 
+v_not_b16_e32 v5.h, v199.h
+// GFX11: :[[@LINE-1]]:21: error: invalid operand for instruction
+
+v_not_b16_e32 v5.h, v199.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: :[[@LINE-1]]:21: error: invalid operand for instruction
+
+v_not_b16_e32 v5.h, v199.h quad_perm:[3,2,1,0]
+// GFX11: :[[@LINE-1]]:21: error: invalid operand for instruction
+
+v_not_b16_e32 v5.l, v199.l
+// GFX11: :[[@LINE-1]]:21: error: invalid operand for instruction
+
+v_not_b16_e32 v5.l, v199.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: :[[@LINE-1]]:21: error: invalid operand for instruction
+
+v_not_b16_e32 v5.l, v199.l quad_perm:[3,2,1,0]
+// GFX11: :[[@LINE-1]]:21: error: invalid operand for instruction
+
 v_rcp_f16_e32 v128.h, 0xfe0b
 // GFX11: :[[@LINE-1]]:15: error: invalid operand for instruction
 
@@ -647,6 +851,12 @@ v_rcp_f16_e32 v5.l, v199.l quad_perm:[3,2,1,0]
 v_rndne_f16_e32 v128, 0xfe0b
 // GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode
 
+v_rndne_f16_e32 v128.h, 0xfe0b
+// GFX11: :[[@LINE-1]]:17: error: invalid operand for instruction
+
+v_rndne_f16_e32 v128.l, 0xfe0b
+// GFX11: :[[@LINE-1]]:17: error: invalid operand for instruction
+
 v_rndne_f16_e32 v255, v1
 // GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode
 
@@ -656,6 +866,24 @@ v_rndne_f16_e32 v255, v1 dpp8:[7,6,5,4,3,2,1,0]
 v_rndne_f16_e32 v255, v1 quad_perm:[3,2,1,0]
 // GFX11: :[[@LINE-1]]:26: error: invalid operand for instruction
 
+v_rndne_f16_e32 v255.h, v1.h
+// GFX11: :[[@LINE-1]]:17: error: invalid operand for instruction
+
+v_rndne_f16_e32 v255.h, v1.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: :[[@LINE-1]]:17: error: invalid operand for instruction
+
+v_rndne_f16_e32 v255.h, v1.h quad_perm:[3,2,1,0]
+// GFX11: :[[@LINE-1]]:17: error: invalid operand for instruction
+
+v_rndne_f16_e32 v255.l, v1.l
+// GFX11: :[[@LINE-1]]:17: error: invalid operand for instruction
+
+v_rndne_f16_e32 v255.l, v1.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: :[[@LINE-1]]:17: error: invalid operand for instruction
+
+v_rndne_f16_e32 v255.l, v1.l quad_perm:[3,2,1,0]
+// GFX11: :[[@LINE-1]]:17: error: invalid operand for instruction
+
 v_rndne_f16_e32 v5, v199
 // GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode
 
@@ -665,6 +893,24 @@ v_rndne_f16_e32 v5, v199 dpp8:[7,6,5,4,3,2,1,0]
 v_rndne_f16_e32 v5, v199 quad_perm:[3,2,1,0]
 // GFX11: :[[@LINE-1]]:26: error: invalid operand for instruction
 
+v_rndne_f16_e32 v5.h, v199.h
+// GFX11: :[[@LINE-1]]:23: error: invalid operand for instruction
+
+v_rndne_f16_e32 v5.h, v199.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: :[[@LINE-1]]:23: error: invalid operand for instruction
+
+v_rndne_f16_e32 v5.h, v199.h quad_perm:[3,2,1,0]
+// GFX11: :[[@LINE-1]]:23: error: invalid operand for instruction
+
+v_rndne_f16_e32 v5.l, v199.l
+// GFX11: :[[@LINE-1]]:23: error: invalid operand for instruction
+
+v_rndne_f16_e32 v5.l, v199.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: :[[@LINE-1]]:23: error: invalid operand for instruction
+
+v_rndne_f16_e32 v5.l, v199.l quad_perm:[3,2,1,0]
+// GFX11: :[[@LINE-1]]:23: error: invalid operand for instruction
+
 v_rsq_f16_e32 v128.h, 0xfe0b
 // GFX11: :[[@LINE-1]]:15: error: invalid operand for instruction
 
@@ -716,9 +962,33 @@ v_sat_pk_u8_i16_e32 v199, v5 dpp8:[7,6,5,4,3,2,1,0]
 v_sat_pk_u8_i16_e32 v199, v5 quad_perm:[3,2,1,0]
 // GFX11: :[[@LINE-1]]:30: error: invalid operand for instruction
 
+v_sat_pk_u8_i16_e32 v199.h, v5.h
+// GFX11: :[[@LINE-1]]:21: error: invalid operand for instruction
+
+v_sat_pk_u8_i16_e32 v199.h, v5.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: :[[@LINE-1]]:21: error: invalid operand for instruction
+
+v_sat_pk_u8_i16_e32 v199.h, v5.h quad_perm:[3,2,1,0]
+// GFX11: :[[@LINE-1]]:21: error: invalid operand for instruction
+
+v_sat_pk_u8_i16_e32 v199.l, v5.l
+// GFX11: :[[@LINE-1]]:21: error: invalid operand for instruction
+
+v_sat_pk_u8_i16_e32 v199.l, v5.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: :[[@LINE-1]]:21: error: invalid operand for instruction
+
+v_sat_pk_u8_i16_e32 v199.l, v5.l quad_perm:[3,2,1,0]
+// GFX11: :[[@LINE-1]]:21: error: invalid operand for instruction
+
 v_sin_f16_e32 v128, 0xfe0b
 // GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode
 
+v_sin_f16_e32 v128.h, 0xfe0b
+// GFX11: :[[@LINE-1]]:15: error: invalid operand for instruction
+
+v_sin_f16_e32 v128.l, 0xfe0b
+// GFX11: :[[@LINE-1]]:15: error: invalid operand for instruction
+
 v_sin_f16_e32 v255, v1
 // GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode
 
@@ -728,6 +998,24 @@ v_sin_f16_e32 v255, v1 dpp8:[7,6,5,4,3,2,1,0]
 v_sin_f16_e32 v255, v1 quad_perm:[3,2,1,0]
 // GFX11: :[[@LINE-1]]:24: error: invalid operand for instruction
 
+v_sin_f16_e32 v255.h, v1.h
+// GFX11: :[[@LINE-1]]:15: error: invalid operand for instruction
+
+v_sin_f16_e32 v255.h, v1.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: :[[@LINE-1]]:15: error: invalid operand for instruction
+
+v_sin_f16_e32 v255.h, v1.h quad_perm:[3,2,1,0]
+// GFX11: :[[@LINE-1]]:15: error: invalid operand for instruction
+
+v_sin_f16_e32 v255.l, v1.l
+// GFX11: :[[@LINE-1]]:15: error: invalid operand for instruction
+
+v_sin_f16_e32 v255.l, v1.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: :[[@LINE-1]]:15: error: invalid operand for instruction
+
+v_sin_f16_e32 v255.l, v1.l quad_perm:[3,2,1,0]
+// GFX11: :[[@LINE-1]]:15: error: invalid operand for instruction
+
 v_sin_f16_e32 v5, v199
 // GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode
 
@@ -737,6 +1025,24 @@ v_sin_f16_e32 v5, v199 dpp8:[7,6,5,4,3,2,1,0]
 v_sin_f16_e32 v5, v199 quad_perm:[3,2,1,0]
 // GFX11: :[[@LINE-1]]:24: error: invalid operand for instruction
 
+v_sin_f16_e32 v5.h, v199.h
+// GFX11: :[[@LINE-1]]:21: error: invalid operand for instruction
+
+v_sin_f16_e32 v5.h, v199.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: :[[@LINE-1]]:21: error: invalid operand for instruction
+
+v_sin_f16_e32 v5.h, v199.h quad_perm:[3,2,1,0]
+// GFX11: :[[@LINE-1]]:21: error: invalid operand for instruction
+
+v_sin_f16_e32 v5.l, v199.l
+// GFX11: :[[@LINE-1]]:21: error: invalid operand for instruction
+
+v_sin_f16_e32 v5.l, v199.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: :[[@LINE-1]]:21: error: invalid operand for instruction
+
+v_sin_f16_e32 v5.l, v199.l quad_perm:[3,2,1,0]
+// GFX11: :[[@LINE-1]]:21: error: invalid operand for instruction
+
 v_sqrt_f16_e32 v128.h, 0xfe0b
 // GFX11: :[[@LINE-1]]:16: error: invalid operand for instruction
 
@@ -794,6 +1100,12 @@ v_swap_b16_e32 v128.l, v0.l
 v_trunc_f16_e32 v128, 0xfe0b
 // GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode
 
+v_trunc_f16_e32 v128.h, 0xfe0b
+// GFX11: :[[@LINE-1]]:17: error: invalid operand for instruction
+
+v_trunc_f16_e32 v128.l, 0xfe0b
+// GFX11: :[[@LINE-1]]:17: error: invalid operand for instruction
+
 v_trunc_f16_e32 v255, v1
 // GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode
 
@@ -803,6 +1115,24 @@ v_trunc_f16_e32 v255, v1 dpp8:[7,6,5,4,3,2,1,0]
 v_trunc_f16_e32 v255, v1 quad_perm:[3,2,1,0]
 // GFX11: :[[@LINE-1]]:26: error: invalid operand for instruction
 
+v_trunc_f16_e32 v255.h, v1.h
+// GFX11: :[[@LINE-1]]:17: error: invalid operand for instruction
+
+v_trunc_f16_e32 v255.h, v1.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: :[[@LINE-1]]:17: error: invalid operand for instruction
+
+v_trunc_f16_e32 v255.h, v1.h quad_perm:[3,2,1,0]
+// GFX11: :[[@LINE-1]]:17: error: invalid operand for instruction
+
+v_trunc_f16_e32 v255.l, v1.l
+// GFX11: :[[@LINE-1]]:17: error: invalid operand for instruction
+
+v_trunc_f16_e32 v255.l, v1.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: :[[@LINE-1]]:17: error: invalid operand for instruction
+
+v_trunc_f16_e32 v255.l, v1.l quad_perm:[3,2,1,0]
+// GFX11: :[[@LINE-1]]:17: error: invalid operand for instruction
+
 v_trunc_f16_e32 v5, v199
 // GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode
 
@@ -811,3 +1141,21 @@ v_trunc_f16_e32 v5, v199 dpp8:[7,6,5,4,3,2,1,0]
 
 v_trunc_f16_e32 v5, v199 quad_perm:[3,2,1,0]
 // GFX11: :[[@LINE-1]]:26: error: invalid operand for instruction
+
+v_trunc_f16_e32 v5.h, v199.h
+// GFX11: :[[@LINE-1]]:23: error: invalid operand for instruction
+
+v_trunc_f16_e32 v5.h, v199.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: :[[@LINE-1]]:23: error: invalid operand for instruction
+
+v_trunc_f16_e32 v5.h, v199.h quad_perm:[3,2,1,0]
+// GFX11: :[[@LINE-1]]:23: error: invalid operand for instruction
+
+v_trunc_f16_e32 v5.l, v199.l
+// GFX11: :[[@LINE-1]]:23: error: invalid operand for instruction
+
+v_trunc_f16_e32 v5.l, v199.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: :[[@LINE-1]]:23: error: invalid operand for instruction
+
+v_trunc_f16_e32 v5.l, v199.l quad_perm:[3,2,1,0]
+// GFX11: :[[@LINE-1]]:23: error: invalid operand for instruction
diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop1_t16_promote.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop1_t16_promote.s
index 1d44172..5cb81c6 100644
--- a/llvm/test/MC/AMDGPU/gfx11_asm_vop1_t16_promote.s
+++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop1_t16_promote.s
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_mc_test_checks.py UTC_ARGS: --version 5
+// NOTE: Assertions have been autogenerated by utils/update_mc_test_checks.py UTC_ARGS: --version 5
 // RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,+real-true16 -show-encoding %s | FileCheck --check-prefix=GFX11 --implicit-check-not=_e32 %s
 // RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64,+real-true16 -show-encoding %s | FileCheck --check-prefix=GFX11 --implicit-check-not=_e32 %s
 
@@ -68,71 +68,137 @@ v_ceil_f16 v5, v199 dpp8:[7,6,5,4,3,2,1,0]
 v_ceil_f16 v5, v199 quad_perm:[3,2,1,0]
 // GFX11: v_ceil_f16_e64_dpp v5, v199 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff]
 
-v_cos_f16 v128, 0xfe0b
-// GFX11: v_cos_f16_e64 v128, 0xfe0b              ; encoding: [0x80,0x00,0xe1,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00]
+v_cos_f16 v128.h, 0xfe0b
+// GFX11: v_cos_f16_e64 v128.h, 0xfe0b op_sel:[0,1] ; encoding: [0x80,0x40,0xe1,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00]
 
-v_cos_f16 v255, -1
-// GFX11: v_cos_f16_e64 v255, -1                  ; encoding: [0xff,0x00,0xe1,0xd5,0xc1,0x00,0x00,0x00]
+v_cos_f16 v128.l, 0xfe0b
+// GFX11: v_cos_f16_e64 v128.l, 0xfe0b            ; encoding: [0x80,0x00,0xe1,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00]
 
-v_cos_f16 v255, 0.5
-// GFX11: v_cos_f16_e64 v255, 0.5                 ; encoding: [0xff,0x00,0xe1,0xd5,0xf0,0x00,0x00,0x00]
+v_cos_f16 v255.h, -1
+// GFX11: v_cos_f16_e64 v255.h, -1 op_sel:[0,1]   ; encoding: [0xff,0x40,0xe1,0xd5,0xc1,0x00,0x00,0x00]
 
-v_cos_f16 v255, exec_hi
-// GFX11: v_cos_f16_e64 v255, exec_hi             ; encoding: [0xff,0x00,0xe1,0xd5,0x7f,0x00,0x00,0x00]
+v_cos_f16 v255.h, 0.5
+// GFX11: v_cos_f16_e64 v255.h, 0.5 op_sel:[0,1]  ; encoding: [0xff,0x40,0xe1,0xd5,0xf0,0x00,0x00,0x00]
 
-v_cos_f16 v255, exec_lo
-// GFX11: v_cos_f16_e64 v255, exec_lo             ; encoding: [0xff,0x00,0xe1,0xd5,0x7e,0x00,0x00,0x00]
+v_cos_f16 v255.h, exec_hi
+// GFX11: v_cos_f16_e64 v255.h, exec_hi op_sel:[0,1] ; encoding: [0xff,0x40,0xe1,0xd5,0x7f,0x00,0x00,0x00]
 
-v_cos_f16 v255, m0
-// GFX11: v_cos_f16_e64 v255, m0                  ; encoding: [0xff,0x00,0xe1,0xd5,0x7d,0x00,0x00,0x00]
+v_cos_f16 v255.h, exec_lo
+// GFX11: v_cos_f16_e64 v255.h, exec_lo op_sel:[0,1] ; encoding: [0xff,0x40,0xe1,0xd5,0x7e,0x00,0x00,0x00]
 
-v_cos_f16 v255, null
-// GFX11: v_cos_f16_e64 v255, null                ; encoding: [0xff,0x00,0xe1,0xd5,0x7c,0x00,0x00,0x00]
+v_cos_f16 v255.h, m0
+// GFX11: v_cos_f16_e64 v255.h, m0 op_sel:[0,1]   ; encoding: [0xff,0x40,0xe1,0xd5,0x7d,0x00,0x00,0x00]
 
-v_cos_f16 v255, s1
-// GFX11: v_cos_f16_e64 v255, s1                  ; encoding: [0xff,0x00,0xe1,0xd5,0x01,0x00,0x00,0x00]
+v_cos_f16 v255.h, null
+// GFX11: v_cos_f16_e64 v255.h, null op_sel:[0,1] ; encoding: [0xff,0x40,0xe1,0xd5,0x7c,0x00,0x00,0x00]
 
-v_cos_f16 v255, s105
-// GFX11: v_cos_f16_e64 v255, s105                ; encoding: [0xff,0x00,0xe1,0xd5,0x69,0x00,0x00,0x00]
+v_cos_f16 v255.h, s1
+// GFX11: v_cos_f16_e64 v255.h, s1 op_sel:[0,1]   ; encoding: [0xff,0x40,0xe1,0xd5,0x01,0x00,0x00,0x00]
 
-v_cos_f16 v255, src_scc
-// GFX11: v_cos_f16_e64 v255, src_scc             ; encoding: [0xff,0x00,0xe1,0xd5,0xfd,0x00,0x00,0x00]
+v_cos_f16 v255.h, s105
+// GFX11: v_cos_f16_e64 v255.h, s105 op_sel:[0,1] ; encoding: [0xff,0x40,0xe1,0xd5,0x69,0x00,0x00,0x00]
 
-v_cos_f16 v255, ttmp15
-// GFX11: v_cos_f16_e64 v255, ttmp15              ; encoding: [0xff,0x00,0xe1,0xd5,0x7b,0x00,0x00,0x00]
+v_cos_f16 v255.h, src_scc
+// GFX11: v_cos_f16_e64 v255.h, src_scc op_sel:[0,1] ; encoding: [0xff,0x40,0xe1,0xd5,0xfd,0x00,0x00,0x00]
 
-v_cos_f16 v255, v1
-// GFX11: v_cos_f16_e64 v255, v1                  ; encoding: [0xff,0x00,0xe1,0xd5,0x01,0x01,0x00,0x00]
+v_cos_f16 v255.h, ttmp15
+// GFX11: v_cos_f16_e64 v255.h, ttmp15 op_sel:[0,1] ; encoding: [0xff,0x40,0xe1,0xd5,0x7b,0x00,0x00,0x00]
 
-v_cos_f16 v255, v1 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_cos_f16_e64_dpp v255, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x00,0xe1,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+v_cos_f16 v255.h, v1.h
+// GFX11: v_cos_f16_e64 v255.h, v1.h op_sel:[1,1] ; encoding: [0xff,0x48,0xe1,0xd5,0x01,0x01,0x00,0x00]
 
-v_cos_f16 v255, v1 quad_perm:[3,2,1,0]
-// GFX11: v_cos_f16_e64_dpp v255, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
+v_cos_f16 v255.h, v1.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_cos_f16_e64_dpp v255.h, v1.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x48,0xe1,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
 
-v_cos_f16 v255, v127
-// GFX11: v_cos_f16_e64 v255, v127                ; encoding: [0xff,0x00,0xe1,0xd5,0x7f,0x01,0x00,0x00]
+v_cos_f16 v255.h, v1.h quad_perm:[3,2,1,0]
+// GFX11: v_cos_f16_e64_dpp v255.h, v1.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x48,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
 
-v_cos_f16 v255, v127 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_cos_f16_e64_dpp v255, v127 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x00,0xe1,0xd5,0xe9,0x00,0x00,0x00,0x7f,0x77,0x39,0x05]
+v_cos_f16 v255.h, v127.h
+// GFX11: v_cos_f16_e64 v255.h, v127.h op_sel:[1,1] ; encoding: [0xff,0x48,0xe1,0xd5,0x7f,0x01,0x00,0x00]
 
-v_cos_f16 v255, v127 quad_perm:[3,2,1,0]
-// GFX11: v_cos_f16_e64_dpp v255, v127 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x7f,0x1b,0x00,0xff]
+v_cos_f16 v255.h, v127.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_cos_f16_e64_dpp v255.h, v127.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x48,0xe1,0xd5,0xe9,0x00,0x00,0x00,0x7f,0x77,0x39,0x05]
 
-v_cos_f16 v255, vcc_hi
-// GFX11: v_cos_f16_e64 v255, vcc_hi              ; encoding: [0xff,0x00,0xe1,0xd5,0x6b,0x00,0x00,0x00]
+v_cos_f16 v255.h, v127.h quad_perm:[3,2,1,0]
+// GFX11: v_cos_f16_e64_dpp v255.h, v127.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x48,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x7f,0x1b,0x00,0xff]
 
-v_cos_f16 v255, vcc_lo
-// GFX11: v_cos_f16_e64 v255, vcc_lo              ; encoding: [0xff,0x00,0xe1,0xd5,0x6a,0x00,0x00,0x00]
+v_cos_f16 v255.h, vcc_hi
+// GFX11: v_cos_f16_e64 v255.h, vcc_hi op_sel:[0,1] ; encoding: [0xff,0x40,0xe1,0xd5,0x6b,0x00,0x00,0x00]
 
-v_cos_f16 v5, v199
-// GFX11: v_cos_f16_e64 v5, v199                  ; encoding: [0x05,0x00,0xe1,0xd5,0xc7,0x01,0x00,0x00]
+v_cos_f16 v255.h, vcc_lo
+// GFX11: v_cos_f16_e64 v255.h, vcc_lo op_sel:[0,1] ; encoding: [0xff,0x40,0xe1,0xd5,0x6a,0x00,0x00,0x00]
 
-v_cos_f16 v5, v199 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_cos_f16_e64_dpp v5, v199 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe1,0xd5,0xe9,0x00,0x00,0x00,0xc7,0x77,0x39,0x05]
+v_cos_f16 v255.l, -1
+// GFX11: v_cos_f16_e64 v255.l, -1                ; encoding: [0xff,0x00,0xe1,0xd5,0xc1,0x00,0x00,0x00]
 
-v_cos_f16 v5, v199 quad_perm:[3,2,1,0]
-// GFX11: v_cos_f16_e64_dpp v5, v199 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff]
+v_cos_f16 v255.l, 0.5
+// GFX11: v_cos_f16_e64 v255.l, 0.5               ; encoding: [0xff,0x00,0xe1,0xd5,0xf0,0x00,0x00,0x00]
+
+v_cos_f16 v255.l, exec_hi
+// GFX11: v_cos_f16_e64 v255.l, exec_hi           ; encoding: [0xff,0x00,0xe1,0xd5,0x7f,0x00,0x00,0x00]
+
+v_cos_f16 v255.l, exec_lo
+// GFX11: v_cos_f16_e64 v255.l, exec_lo           ; encoding: [0xff,0x00,0xe1,0xd5,0x7e,0x00,0x00,0x00]
+
+v_cos_f16 v255.l, m0
+// GFX11: v_cos_f16_e64 v255.l, m0                ; encoding: [0xff,0x00,0xe1,0xd5,0x7d,0x00,0x00,0x00]
+
+v_cos_f16 v255.l, null
+// GFX11: v_cos_f16_e64 v255.l, null              ; encoding: [0xff,0x00,0xe1,0xd5,0x7c,0x00,0x00,0x00]
+
+v_cos_f16 v255.l, s1
+// GFX11: v_cos_f16_e64 v255.l, s1                ; encoding: [0xff,0x00,0xe1,0xd5,0x01,0x00,0x00,0x00]
+
+v_cos_f16 v255.l, s105
+// GFX11: v_cos_f16_e64 v255.l, s105              ; encoding: [0xff,0x00,0xe1,0xd5,0x69,0x00,0x00,0x00]
+
+v_cos_f16 v255.l, src_scc
+// GFX11: v_cos_f16_e64 v255.l, src_scc           ; encoding: [0xff,0x00,0xe1,0xd5,0xfd,0x00,0x00,0x00]
+
+v_cos_f16 v255.l, ttmp15
+// GFX11: v_cos_f16_e64 v255.l, ttmp15            ; encoding: [0xff,0x00,0xe1,0xd5,0x7b,0x00,0x00,0x00]
+
+v_cos_f16 v255.l, v1.l
+// GFX11: v_cos_f16_e64 v255.l, v1.l              ; encoding: [0xff,0x00,0xe1,0xd5,0x01,0x01,0x00,0x00]
+
+v_cos_f16 v255.l, v1.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_cos_f16_e64_dpp v255.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x00,0xe1,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+
+v_cos_f16 v255.l, v1.l quad_perm:[3,2,1,0]
+// GFX11: v_cos_f16_e64_dpp v255.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
+
+v_cos_f16 v255.l, v127.l
+// GFX11: v_cos_f16_e64 v255.l, v127.l            ; encoding: [0xff,0x00,0xe1,0xd5,0x7f,0x01,0x00,0x00]
+
+v_cos_f16 v255.l, v127.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_cos_f16_e64_dpp v255.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x00,0xe1,0xd5,0xe9,0x00,0x00,0x00,0x7f,0x77,0x39,0x05]
+
+v_cos_f16 v255.l, v127.l quad_perm:[3,2,1,0]
+// GFX11: v_cos_f16_e64_dpp v255.l, v127.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x7f,0x1b,0x00,0xff]
+
+v_cos_f16 v255.l, vcc_hi
+// GFX11: v_cos_f16_e64 v255.l, vcc_hi            ; encoding: [0xff,0x00,0xe1,0xd5,0x6b,0x00,0x00,0x00]
+
+v_cos_f16 v255.l, vcc_lo
+// GFX11: v_cos_f16_e64 v255.l, vcc_lo            ; encoding: [0xff,0x00,0xe1,0xd5,0x6a,0x00,0x00,0x00]
+
+v_cos_f16 v5.h, v199.h
+// GFX11: v_cos_f16_e64 v5.h, v199.h op_sel:[1,1] ; encoding: [0x05,0x48,0xe1,0xd5,0xc7,0x01,0x00,0x00]
+
+v_cos_f16 v5.h, v199.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_cos_f16_e64_dpp v5.h, v199.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xe1,0xd5,0xe9,0x00,0x00,0x00,0xc7,0x77,0x39,0x05]
+
+v_cos_f16 v5.h, v199.h quad_perm:[3,2,1,0]
+// GFX11: v_cos_f16_e64_dpp v5.h, v199.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x48,0xe1,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff]
+
+v_cos_f16 v5.l, v199.l
+// GFX11: v_cos_f16_e64 v5.l, v199.l              ; encoding: [0x05,0x00,0xe1,0xd5,0xc7,0x01,0x00,0x00]
+
+v_cos_f16 v5.l, v199.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_cos_f16_e64_dpp v5.l, v199.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe1,0xd5,0xe9,0x00,0x00,0x00,0xc7,0x77,0x39,0x05]
+
+v_cos_f16 v5.l, v199.l quad_perm:[3,2,1,0]
+// GFX11: v_cos_f16_e64_dpp v5.l, v199.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff]
 
 v_cvt_f16_f32 v128.h, 0xaf123456
 // GFX11: v_cvt_f16_f32_e64 v128.h, 0xaf123456 op_sel:[0,1] ; encoding: [0x80,0x40,0x8a,0xd5,0xff,0x00,0x00,0x00,0x56,0x34,0x12,0xaf]
@@ -662,14 +728,23 @@ v_cvt_i16_f16 v5.l, v199.l dpp8:[7,6,5,4,3,2,1,0]
 v_cvt_i16_f16 v5.l, v199.l quad_perm:[3,2,1,0]
 // GFX11: v_cvt_i16_f16_e64_dpp v5.l, v199.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd3,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff]
 
-v_cvt_i32_i16 v5, v199
-// GFX11: v_cvt_i32_i16_e64 v5, v199              ; encoding: [0x05,0x00,0xea,0xd5,0xc7,0x01,0x00,0x00]
+v_cvt_i32_i16 v5, v199.h
+// GFX11: v_cvt_i32_i16_e64 v5, v199.h op_sel:[1,0] ; encoding: [0x05,0x08,0xea,0xd5,0xc7,0x01,0x00,0x00]
+
+v_cvt_i32_i16 v5, v199.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_cvt_i32_i16_e64_dpp v5, v199.h op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0xea,0xd5,0xe9,0x00,0x00,0x00,0xc7,0x77,0x39,0x05]
+
+v_cvt_i32_i16 v5, v199.h quad_perm:[3,2,1,0]
+// GFX11: v_cvt_i32_i16_e64_dpp v5, v199.h op_sel:[1,0] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x08,0xea,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff]
 
-v_cvt_i32_i16 v5, v199 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_cvt_i32_i16_e64_dpp v5, v199 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xea,0xd5,0xe9,0x00,0x00,0x00,0xc7,0x77,0x39,0x05]
+v_cvt_i32_i16 v5, v199.l
+// GFX11: v_cvt_i32_i16_e64 v5, v199.l            ; encoding: [0x05,0x00,0xea,0xd5,0xc7,0x01,0x00,0x00]
 
-v_cvt_i32_i16 v5, v199 quad_perm:[3,2,1,0]
-// GFX11: v_cvt_i32_i16_e64_dpp v5, v199 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff]
+v_cvt_i32_i16 v5, v199.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_cvt_i32_i16_e64_dpp v5, v199.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xea,0xd5,0xe9,0x00,0x00,0x00,0xc7,0x77,0x39,0x05]
+
+v_cvt_i32_i16 v5, v199.l quad_perm:[3,2,1,0]
+// GFX11: v_cvt_i32_i16_e64_dpp v5, v199.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff]
 
 v_cvt_norm_i16_f16 v128.h, 0xfe0b
 // GFX11: v_cvt_norm_i16_f16_e64 v128.h, 0xfe0b op_sel:[0,1] ; encoding: [0x80,0x40,0xe3,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00]
@@ -1067,14 +1142,23 @@ v_cvt_u16_f16 v5.l, v199.l dpp8:[7,6,5,4,3,2,1,0]
 v_cvt_u16_f16 v5.l, v199.l quad_perm:[3,2,1,0]
 // GFX11: v_cvt_u16_f16_e64_dpp v5.l, v199.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd2,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff]
 
-v_cvt_u32_u16 v5, v199
-// GFX11: v_cvt_u32_u16_e64 v5, v199              ; encoding: [0x05,0x00,0xeb,0xd5,0xc7,0x01,0x00,0x00]
+v_cvt_u32_u16 v5, v199.h
+// GFX11: v_cvt_u32_u16_e64 v5, v199.h op_sel:[1,0] ; encoding: [0x05,0x08,0xeb,0xd5,0xc7,0x01,0x00,0x00]
+
+v_cvt_u32_u16 v5, v199.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_cvt_u32_u16_e64_dpp v5, v199.h op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0xeb,0xd5,0xe9,0x00,0x00,0x00,0xc7,0x77,0x39,0x05]
 
-v_cvt_u32_u16 v5, v199 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_cvt_u32_u16_e64_dpp v5, v199 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xeb,0xd5,0xe9,0x00,0x00,0x00,0xc7,0x77,0x39,0x05]
+v_cvt_u32_u16 v5, v199.h quad_perm:[3,2,1,0]
+// GFX11: v_cvt_u32_u16_e64_dpp v5, v199.h op_sel:[1,0] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x08,0xeb,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff]
 
-v_cvt_u32_u16 v5, v199 quad_perm:[3,2,1,0]
-// GFX11: v_cvt_u32_u16_e64_dpp v5, v199 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff]
+v_cvt_u32_u16 v5, v199.l
+// GFX11: v_cvt_u32_u16_e64 v5, v199.l            ; encoding: [0x05,0x00,0xeb,0xd5,0xc7,0x01,0x00,0x00]
+
+v_cvt_u32_u16 v5, v199.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_cvt_u32_u16_e64_dpp v5, v199.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xeb,0xd5,0xe9,0x00,0x00,0x00,0xc7,0x77,0x39,0x05]
+
+v_cvt_u32_u16 v5, v199.l quad_perm:[3,2,1,0]
+// GFX11: v_cvt_u32_u16_e64_dpp v5, v199.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff]
 
 v_exp_f16 v128, 0xfe0b
 // GFX11: v_exp_f16_e64 v128, 0xfe0b              ; encoding: [0x80,0x00,0xd8,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00]
@@ -1208,71 +1292,137 @@ v_floor_f16 v5, v199 dpp8:[7,6,5,4,3,2,1,0]
 v_floor_f16 v5, v199 quad_perm:[3,2,1,0]
 // GFX11: v_floor_f16_e64_dpp v5, v199 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff]
 
-v_fract_f16 v128, 0xfe0b
-// GFX11: v_fract_f16_e64 v128, 0xfe0b            ; encoding: [0x80,0x00,0xdf,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00]
+v_fract_f16 v128.h, 0xfe0b
+// GFX11: v_fract_f16_e64 v128.h, 0xfe0b op_sel:[0,1] ; encoding: [0x80,0x40,0xdf,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00]
+
+v_fract_f16 v128.l, 0xfe0b
+// GFX11: v_fract_f16_e64 v128.l, 0xfe0b          ; encoding: [0x80,0x00,0xdf,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00]
+
+v_fract_f16 v255.h, -1
+// GFX11: v_fract_f16_e64 v255.h, -1 op_sel:[0,1] ; encoding: [0xff,0x40,0xdf,0xd5,0xc1,0x00,0x00,0x00]
+
+v_fract_f16 v255.h, 0.5
+// GFX11: v_fract_f16_e64 v255.h, 0.5 op_sel:[0,1] ; encoding: [0xff,0x40,0xdf,0xd5,0xf0,0x00,0x00,0x00]
+
+v_fract_f16 v255.h, exec_hi
+// GFX11: v_fract_f16_e64 v255.h, exec_hi op_sel:[0,1] ; encoding: [0xff,0x40,0xdf,0xd5,0x7f,0x00,0x00,0x00]
+
+v_fract_f16 v255.h, exec_lo
+// GFX11: v_fract_f16_e64 v255.h, exec_lo op_sel:[0,1] ; encoding: [0xff,0x40,0xdf,0xd5,0x7e,0x00,0x00,0x00]
+
+v_fract_f16 v255.h, m0
+// GFX11: v_fract_f16_e64 v255.h, m0 op_sel:[0,1] ; encoding: [0xff,0x40,0xdf,0xd5,0x7d,0x00,0x00,0x00]
+
+v_fract_f16 v255.h, null
+// GFX11: v_fract_f16_e64 v255.h, null op_sel:[0,1] ; encoding: [0xff,0x40,0xdf,0xd5,0x7c,0x00,0x00,0x00]
+
+v_fract_f16 v255.h, s1
+// GFX11: v_fract_f16_e64 v255.h, s1 op_sel:[0,1] ; encoding: [0xff,0x40,0xdf,0xd5,0x01,0x00,0x00,0x00]
+
+v_fract_f16 v255.h, s105
+// GFX11: v_fract_f16_e64 v255.h, s105 op_sel:[0,1] ; encoding: [0xff,0x40,0xdf,0xd5,0x69,0x00,0x00,0x00]
+
+v_fract_f16 v255.h, src_scc
+// GFX11: v_fract_f16_e64 v255.h, src_scc op_sel:[0,1] ; encoding: [0xff,0x40,0xdf,0xd5,0xfd,0x00,0x00,0x00]
+
+v_fract_f16 v255.h, ttmp15
+// GFX11: v_fract_f16_e64 v255.h, ttmp15 op_sel:[0,1] ; encoding: [0xff,0x40,0xdf,0xd5,0x7b,0x00,0x00,0x00]
+
+v_fract_f16 v255.h, v1.h
+// GFX11: v_fract_f16_e64 v255.h, v1.h op_sel:[1,1] ; encoding: [0xff,0x48,0xdf,0xd5,0x01,0x01,0x00,0x00]
+
+v_fract_f16 v255.h, v1.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_fract_f16_e64_dpp v255.h, v1.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x48,0xdf,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
 
-v_fract_f16 v255, -1
-// GFX11: v_fract_f16_e64 v255, -1                ; encoding: [0xff,0x00,0xdf,0xd5,0xc1,0x00,0x00,0x00]
+v_fract_f16 v255.h, v1.h quad_perm:[3,2,1,0]
+// GFX11: v_fract_f16_e64_dpp v255.h, v1.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x48,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
 
-v_fract_f16 v255, 0.5
-// GFX11: v_fract_f16_e64 v255, 0.5               ; encoding: [0xff,0x00,0xdf,0xd5,0xf0,0x00,0x00,0x00]
+v_fract_f16 v255.h, v127.h
+// GFX11: v_fract_f16_e64 v255.h, v127.h op_sel:[1,1] ; encoding: [0xff,0x48,0xdf,0xd5,0x7f,0x01,0x00,0x00]
 
-v_fract_f16 v255, exec_hi
-// GFX11: v_fract_f16_e64 v255, exec_hi           ; encoding: [0xff,0x00,0xdf,0xd5,0x7f,0x00,0x00,0x00]
+v_fract_f16 v255.h, v127.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_fract_f16_e64_dpp v255.h, v127.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x48,0xdf,0xd5,0xe9,0x00,0x00,0x00,0x7f,0x77,0x39,0x05]
 
-v_fract_f16 v255, exec_lo
-// GFX11: v_fract_f16_e64 v255, exec_lo           ; encoding: [0xff,0x00,0xdf,0xd5,0x7e,0x00,0x00,0x00]
+v_fract_f16 v255.h, v127.h quad_perm:[3,2,1,0]
+// GFX11: v_fract_f16_e64_dpp v255.h, v127.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x48,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x7f,0x1b,0x00,0xff]
 
-v_fract_f16 v255, m0
-// GFX11: v_fract_f16_e64 v255, m0                ; encoding: [0xff,0x00,0xdf,0xd5,0x7d,0x00,0x00,0x00]
+v_fract_f16 v255.h, vcc_hi
+// GFX11: v_fract_f16_e64 v255.h, vcc_hi op_sel:[0,1] ; encoding: [0xff,0x40,0xdf,0xd5,0x6b,0x00,0x00,0x00]
 
-v_fract_f16 v255, null
-// GFX11: v_fract_f16_e64 v255, null              ; encoding: [0xff,0x00,0xdf,0xd5,0x7c,0x00,0x00,0x00]
+v_fract_f16 v255.h, vcc_lo
+// GFX11: v_fract_f16_e64 v255.h, vcc_lo op_sel:[0,1] ; encoding: [0xff,0x40,0xdf,0xd5,0x6a,0x00,0x00,0x00]
 
-v_fract_f16 v255, s1
-// GFX11: v_fract_f16_e64 v255, s1                ; encoding: [0xff,0x00,0xdf,0xd5,0x01,0x00,0x00,0x00]
+v_fract_f16 v255.l, -1
+// GFX11: v_fract_f16_e64 v255.l, -1              ; encoding: [0xff,0x00,0xdf,0xd5,0xc1,0x00,0x00,0x00]
 
-v_fract_f16 v255, s105
-// GFX11: v_fract_f16_e64 v255, s105              ; encoding: [0xff,0x00,0xdf,0xd5,0x69,0x00,0x00,0x00]
+v_fract_f16 v255.l, 0.5
+// GFX11: v_fract_f16_e64 v255.l, 0.5             ; encoding: [0xff,0x00,0xdf,0xd5,0xf0,0x00,0x00,0x00]
 
-v_fract_f16 v255, src_scc
-// GFX11: v_fract_f16_e64 v255, src_scc           ; encoding: [0xff,0x00,0xdf,0xd5,0xfd,0x00,0x00,0x00]
+v_fract_f16 v255.l, exec_hi
+// GFX11: v_fract_f16_e64 v255.l, exec_hi         ; encoding: [0xff,0x00,0xdf,0xd5,0x7f,0x00,0x00,0x00]
 
-v_fract_f16 v255, ttmp15
-// GFX11: v_fract_f16_e64 v255, ttmp15            ; encoding: [0xff,0x00,0xdf,0xd5,0x7b,0x00,0x00,0x00]
+v_fract_f16 v255.l, exec_lo
+// GFX11: v_fract_f16_e64 v255.l, exec_lo         ; encoding: [0xff,0x00,0xdf,0xd5,0x7e,0x00,0x00,0x00]
 
-v_fract_f16 v255, v1
-// GFX11: v_fract_f16_e64 v255, v1                ; encoding: [0xff,0x00,0xdf,0xd5,0x01,0x01,0x00,0x00]
+v_fract_f16 v255.l, m0
+// GFX11: v_fract_f16_e64 v255.l, m0              ; encoding: [0xff,0x00,0xdf,0xd5,0x7d,0x00,0x00,0x00]
 
-v_fract_f16 v255, v1 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_fract_f16_e64_dpp v255, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x00,0xdf,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+v_fract_f16 v255.l, null
+// GFX11: v_fract_f16_e64 v255.l, null            ; encoding: [0xff,0x00,0xdf,0xd5,0x7c,0x00,0x00,0x00]
 
-v_fract_f16 v255, v1 quad_perm:[3,2,1,0]
-// GFX11: v_fract_f16_e64_dpp v255, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
+v_fract_f16 v255.l, s1
+// GFX11: v_fract_f16_e64 v255.l, s1              ; encoding: [0xff,0x00,0xdf,0xd5,0x01,0x00,0x00,0x00]
 
-v_fract_f16 v255, v127
-// GFX11: v_fract_f16_e64 v255, v127              ; encoding: [0xff,0x00,0xdf,0xd5,0x7f,0x01,0x00,0x00]
+v_fract_f16 v255.l, s105
+// GFX11: v_fract_f16_e64 v255.l, s105            ; encoding: [0xff,0x00,0xdf,0xd5,0x69,0x00,0x00,0x00]
 
-v_fract_f16 v255, v127 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_fract_f16_e64_dpp v255, v127 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x00,0xdf,0xd5,0xe9,0x00,0x00,0x00,0x7f,0x77,0x39,0x05]
+v_fract_f16 v255.l, src_scc
+// GFX11: v_fract_f16_e64 v255.l, src_scc         ; encoding: [0xff,0x00,0xdf,0xd5,0xfd,0x00,0x00,0x00]
 
-v_fract_f16 v255, v127 quad_perm:[3,2,1,0]
-// GFX11: v_fract_f16_e64_dpp v255, v127 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x7f,0x1b,0x00,0xff]
+v_fract_f16 v255.l, ttmp15
+// GFX11: v_fract_f16_e64 v255.l, ttmp15          ; encoding: [0xff,0x00,0xdf,0xd5,0x7b,0x00,0x00,0x00]
 
-v_fract_f16 v255, vcc_hi
-// GFX11: v_fract_f16_e64 v255, vcc_hi            ; encoding: [0xff,0x00,0xdf,0xd5,0x6b,0x00,0x00,0x00]
+v_fract_f16 v255.l, v1.l
+// GFX11: v_fract_f16_e64 v255.l, v1.l            ; encoding: [0xff,0x00,0xdf,0xd5,0x01,0x01,0x00,0x00]
 
-v_fract_f16 v255, vcc_lo
-// GFX11: v_fract_f16_e64 v255, vcc_lo            ; encoding: [0xff,0x00,0xdf,0xd5,0x6a,0x00,0x00,0x00]
+v_fract_f16 v255.l, v1.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_fract_f16_e64_dpp v255.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x00,0xdf,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
 
-v_fract_f16 v5, v199
-// GFX11: v_fract_f16_e64 v5, v199                ; encoding: [0x05,0x00,0xdf,0xd5,0xc7,0x01,0x00,0x00]
+v_fract_f16 v255.l, v1.l quad_perm:[3,2,1,0]
+// GFX11: v_fract_f16_e64_dpp v255.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
 
-v_fract_f16 v5, v199 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_fract_f16_e64_dpp v5, v199 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdf,0xd5,0xe9,0x00,0x00,0x00,0xc7,0x77,0x39,0x05]
+v_fract_f16 v255.l, v127.l
+// GFX11: v_fract_f16_e64 v255.l, v127.l          ; encoding: [0xff,0x00,0xdf,0xd5,0x7f,0x01,0x00,0x00]
 
-v_fract_f16 v5, v199 quad_perm:[3,2,1,0]
-// GFX11: v_fract_f16_e64_dpp v5, v199 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff]
+v_fract_f16 v255.l, v127.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_fract_f16_e64_dpp v255.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x00,0xdf,0xd5,0xe9,0x00,0x00,0x00,0x7f,0x77,0x39,0x05]
+
+v_fract_f16 v255.l, v127.l quad_perm:[3,2,1,0]
+// GFX11: v_fract_f16_e64_dpp v255.l, v127.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x7f,0x1b,0x00,0xff]
+
+v_fract_f16 v255.l, vcc_hi
+// GFX11: v_fract_f16_e64 v255.l, vcc_hi          ; encoding: [0xff,0x00,0xdf,0xd5,0x6b,0x00,0x00,0x00]
+
+v_fract_f16 v255.l, vcc_lo
+// GFX11: v_fract_f16_e64 v255.l, vcc_lo          ; encoding: [0xff,0x00,0xdf,0xd5,0x6a,0x00,0x00,0x00]
+
+v_fract_f16 v5.h, v199.h
+// GFX11: v_fract_f16_e64 v5.h, v199.h op_sel:[1,1] ; encoding: [0x05,0x48,0xdf,0xd5,0xc7,0x01,0x00,0x00]
+
+v_fract_f16 v5.h, v199.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_fract_f16_e64_dpp v5.h, v199.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xdf,0xd5,0xe9,0x00,0x00,0x00,0xc7,0x77,0x39,0x05]
+
+v_fract_f16 v5.h, v199.h quad_perm:[3,2,1,0]
+// GFX11: v_fract_f16_e64_dpp v5.h, v199.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x48,0xdf,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff]
+
+v_fract_f16 v5.l, v199.l
+// GFX11: v_fract_f16_e64 v5.l, v199.l            ; encoding: [0x05,0x00,0xdf,0xd5,0xc7,0x01,0x00,0x00]
+
+v_fract_f16 v5.l, v199.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_fract_f16_e64_dpp v5.l, v199.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdf,0xd5,0xe9,0x00,0x00,0x00,0xc7,0x77,0x39,0x05]
+
+v_fract_f16 v5.l, v199.l quad_perm:[3,2,1,0]
+// GFX11: v_fract_f16_e64_dpp v5.l, v199.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff]
 
 v_frexp_exp_i16_f16 v128.h, 0xfe0b
 // GFX11: v_frexp_exp_i16_f16_e64 v128.h, 0xfe0b op_sel:[0,1] ; encoding: [0x80,0x40,0xda,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00]
@@ -1406,71 +1556,137 @@ v_frexp_exp_i16_f16 v5.l, v199.l dpp8:[7,6,5,4,3,2,1,0]
 v_frexp_exp_i16_f16 v5.l, v199.l quad_perm:[3,2,1,0]
 // GFX11: v_frexp_exp_i16_f16_e64_dpp v5.l, v199.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xda,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff]
 
-v_frexp_mant_f16 v128, 0xfe0b
-// GFX11: v_frexp_mant_f16_e64 v128, 0xfe0b       ; encoding: [0x80,0x00,0xd9,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00]
+v_frexp_mant_f16 v128.h, 0xfe0b
+// GFX11: v_frexp_mant_f16_e64 v128.h, 0xfe0b op_sel:[0,1] ; encoding: [0x80,0x40,0xd9,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00]
+
+v_frexp_mant_f16 v128.l, 0xfe0b
+// GFX11: v_frexp_mant_f16_e64 v128.l, 0xfe0b     ; encoding: [0x80,0x00,0xd9,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00]
+
+v_frexp_mant_f16 v255.h, -1
+// GFX11: v_frexp_mant_f16_e64 v255.h, -1 op_sel:[0,1] ; encoding: [0xff,0x40,0xd9,0xd5,0xc1,0x00,0x00,0x00]
+
+v_frexp_mant_f16 v255.h, 0.5
+// GFX11: v_frexp_mant_f16_e64 v255.h, 0.5 op_sel:[0,1] ; encoding: [0xff,0x40,0xd9,0xd5,0xf0,0x00,0x00,0x00]
+
+v_frexp_mant_f16 v255.h, exec_hi
+// GFX11: v_frexp_mant_f16_e64 v255.h, exec_hi op_sel:[0,1] ; encoding: [0xff,0x40,0xd9,0xd5,0x7f,0x00,0x00,0x00]
+
+v_frexp_mant_f16 v255.h, exec_lo
+// GFX11: v_frexp_mant_f16_e64 v255.h, exec_lo op_sel:[0,1] ; encoding: [0xff,0x40,0xd9,0xd5,0x7e,0x00,0x00,0x00]
+
+v_frexp_mant_f16 v255.h, m0
+// GFX11: v_frexp_mant_f16_e64 v255.h, m0 op_sel:[0,1] ; encoding: [0xff,0x40,0xd9,0xd5,0x7d,0x00,0x00,0x00]
+
+v_frexp_mant_f16 v255.h, null
+// GFX11: v_frexp_mant_f16_e64 v255.h, null op_sel:[0,1] ; encoding: [0xff,0x40,0xd9,0xd5,0x7c,0x00,0x00,0x00]
+
+v_frexp_mant_f16 v255.h, s1
+// GFX11: v_frexp_mant_f16_e64 v255.h, s1 op_sel:[0,1] ; encoding: [0xff,0x40,0xd9,0xd5,0x01,0x00,0x00,0x00]
+
+v_frexp_mant_f16 v255.h, s105
+// GFX11: v_frexp_mant_f16_e64 v255.h, s105 op_sel:[0,1] ; encoding: [0xff,0x40,0xd9,0xd5,0x69,0x00,0x00,0x00]
+
+v_frexp_mant_f16 v255.h, src_scc
+// GFX11: v_frexp_mant_f16_e64 v255.h, src_scc op_sel:[0,1] ; encoding: [0xff,0x40,0xd9,0xd5,0xfd,0x00,0x00,0x00]
+
+v_frexp_mant_f16 v255.h, ttmp15
+// GFX11: v_frexp_mant_f16_e64 v255.h, ttmp15 op_sel:[0,1] ; encoding: [0xff,0x40,0xd9,0xd5,0x7b,0x00,0x00,0x00]
 
-v_frexp_mant_f16 v255, -1
-// GFX11: v_frexp_mant_f16_e64 v255, -1           ; encoding: [0xff,0x00,0xd9,0xd5,0xc1,0x00,0x00,0x00]
+v_frexp_mant_f16 v255.h, v1.h
+// GFX11: v_frexp_mant_f16_e64 v255.h, v1.h op_sel:[1,1] ; encoding: [0xff,0x48,0xd9,0xd5,0x01,0x01,0x00,0x00]
 
-v_frexp_mant_f16 v255, 0.5
-// GFX11: v_frexp_mant_f16_e64 v255, 0.5          ; encoding: [0xff,0x00,0xd9,0xd5,0xf0,0x00,0x00,0x00]
+v_frexp_mant_f16 v255.h, v1.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_frexp_mant_f16_e64_dpp v255.h, v1.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x48,0xd9,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
 
-v_frexp_mant_f16 v255, exec_hi
-// GFX11: v_frexp_mant_f16_e64 v255, exec_hi      ; encoding: [0xff,0x00,0xd9,0xd5,0x7f,0x00,0x00,0x00]
+v_frexp_mant_f16 v255.h, v1.h quad_perm:[3,2,1,0]
+// GFX11: v_frexp_mant_f16_e64_dpp v255.h, v1.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x48,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
 
-v_frexp_mant_f16 v255, exec_lo
-// GFX11: v_frexp_mant_f16_e64 v255, exec_lo      ; encoding: [0xff,0x00,0xd9,0xd5,0x7e,0x00,0x00,0x00]
+v_frexp_mant_f16 v255.h, v127.h
+// GFX11: v_frexp_mant_f16_e64 v255.h, v127.h op_sel:[1,1] ; encoding: [0xff,0x48,0xd9,0xd5,0x7f,0x01,0x00,0x00]
 
-v_frexp_mant_f16 v255, m0
-// GFX11: v_frexp_mant_f16_e64 v255, m0           ; encoding: [0xff,0x00,0xd9,0xd5,0x7d,0x00,0x00,0x00]
+v_frexp_mant_f16 v255.h, v127.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_frexp_mant_f16_e64_dpp v255.h, v127.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x48,0xd9,0xd5,0xe9,0x00,0x00,0x00,0x7f,0x77,0x39,0x05]
 
-v_frexp_mant_f16 v255, null
-// GFX11: v_frexp_mant_f16_e64 v255, null         ; encoding: [0xff,0x00,0xd9,0xd5,0x7c,0x00,0x00,0x00]
+v_frexp_mant_f16 v255.h, v127.h quad_perm:[3,2,1,0]
+// GFX11: v_frexp_mant_f16_e64_dpp v255.h, v127.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x48,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x7f,0x1b,0x00,0xff]
 
-v_frexp_mant_f16 v255, s1
-// GFX11: v_frexp_mant_f16_e64 v255, s1           ; encoding: [0xff,0x00,0xd9,0xd5,0x01,0x00,0x00,0x00]
+v_frexp_mant_f16 v255.h, vcc_hi
+// GFX11: v_frexp_mant_f16_e64 v255.h, vcc_hi op_sel:[0,1] ; encoding: [0xff,0x40,0xd9,0xd5,0x6b,0x00,0x00,0x00]
 
-v_frexp_mant_f16 v255, s105
-// GFX11: v_frexp_mant_f16_e64 v255, s105         ; encoding: [0xff,0x00,0xd9,0xd5,0x69,0x00,0x00,0x00]
+v_frexp_mant_f16 v255.h, vcc_lo
+// GFX11: v_frexp_mant_f16_e64 v255.h, vcc_lo op_sel:[0,1] ; encoding: [0xff,0x40,0xd9,0xd5,0x6a,0x00,0x00,0x00]
 
-v_frexp_mant_f16 v255, src_scc
-// GFX11: v_frexp_mant_f16_e64 v255, src_scc      ; encoding: [0xff,0x00,0xd9,0xd5,0xfd,0x00,0x00,0x00]
+v_frexp_mant_f16 v255.l, -1
+// GFX11: v_frexp_mant_f16_e64 v255.l, -1         ; encoding: [0xff,0x00,0xd9,0xd5,0xc1,0x00,0x00,0x00]
 
-v_frexp_mant_f16 v255, ttmp15
-// GFX11: v_frexp_mant_f16_e64 v255, ttmp15       ; encoding: [0xff,0x00,0xd9,0xd5,0x7b,0x00,0x00,0x00]
+v_frexp_mant_f16 v255.l, 0.5
+// GFX11: v_frexp_mant_f16_e64 v255.l, 0.5        ; encoding: [0xff,0x00,0xd9,0xd5,0xf0,0x00,0x00,0x00]
 
-v_frexp_mant_f16 v255, v1
-// GFX11: v_frexp_mant_f16_e64 v255, v1           ; encoding: [0xff,0x00,0xd9,0xd5,0x01,0x01,0x00,0x00]
+v_frexp_mant_f16 v255.l, exec_hi
+// GFX11: v_frexp_mant_f16_e64 v255.l, exec_hi    ; encoding: [0xff,0x00,0xd9,0xd5,0x7f,0x00,0x00,0x00]
 
-v_frexp_mant_f16 v255, v1 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_frexp_mant_f16_e64_dpp v255, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x00,0xd9,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+v_frexp_mant_f16 v255.l, exec_lo
+// GFX11: v_frexp_mant_f16_e64 v255.l, exec_lo    ; encoding: [0xff,0x00,0xd9,0xd5,0x7e,0x00,0x00,0x00]
 
-v_frexp_mant_f16 v255, v1 quad_perm:[3,2,1,0]
-// GFX11: v_frexp_mant_f16_e64_dpp v255, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
+v_frexp_mant_f16 v255.l, m0
+// GFX11: v_frexp_mant_f16_e64 v255.l, m0         ; encoding: [0xff,0x00,0xd9,0xd5,0x7d,0x00,0x00,0x00]
 
-v_frexp_mant_f16 v255, v127
-// GFX11: v_frexp_mant_f16_e64 v255, v127         ; encoding: [0xff,0x00,0xd9,0xd5,0x7f,0x01,0x00,0x00]
+v_frexp_mant_f16 v255.l, null
+// GFX11: v_frexp_mant_f16_e64 v255.l, null       ; encoding: [0xff,0x00,0xd9,0xd5,0x7c,0x00,0x00,0x00]
 
-v_frexp_mant_f16 v255, v127 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_frexp_mant_f16_e64_dpp v255, v127 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x00,0xd9,0xd5,0xe9,0x00,0x00,0x00,0x7f,0x77,0x39,0x05]
+v_frexp_mant_f16 v255.l, s1
+// GFX11: v_frexp_mant_f16_e64 v255.l, s1         ; encoding: [0xff,0x00,0xd9,0xd5,0x01,0x00,0x00,0x00]
 
-v_frexp_mant_f16 v255, v127 quad_perm:[3,2,1,0]
-// GFX11: v_frexp_mant_f16_e64_dpp v255, v127 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x7f,0x1b,0x00,0xff]
+v_frexp_mant_f16 v255.l, s105
+// GFX11: v_frexp_mant_f16_e64 v255.l, s105       ; encoding: [0xff,0x00,0xd9,0xd5,0x69,0x00,0x00,0x00]
 
-v_frexp_mant_f16 v255, vcc_hi
-// GFX11: v_frexp_mant_f16_e64 v255, vcc_hi       ; encoding: [0xff,0x00,0xd9,0xd5,0x6b,0x00,0x00,0x00]
+v_frexp_mant_f16 v255.l, src_scc
+// GFX11: v_frexp_mant_f16_e64 v255.l, src_scc    ; encoding: [0xff,0x00,0xd9,0xd5,0xfd,0x00,0x00,0x00]
 
-v_frexp_mant_f16 v255, vcc_lo
-// GFX11: v_frexp_mant_f16_e64 v255, vcc_lo       ; encoding: [0xff,0x00,0xd9,0xd5,0x6a,0x00,0x00,0x00]
+v_frexp_mant_f16 v255.l, ttmp15
+// GFX11: v_frexp_mant_f16_e64 v255.l, ttmp15     ; encoding: [0xff,0x00,0xd9,0xd5,0x7b,0x00,0x00,0x00]
 
-v_frexp_mant_f16 v5, v199
-// GFX11: v_frexp_mant_f16_e64 v5, v199           ; encoding: [0x05,0x00,0xd9,0xd5,0xc7,0x01,0x00,0x00]
+v_frexp_mant_f16 v255.l, v1.l
+// GFX11: v_frexp_mant_f16_e64 v255.l, v1.l       ; encoding: [0xff,0x00,0xd9,0xd5,0x01,0x01,0x00,0x00]
 
-v_frexp_mant_f16 v5, v199 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_frexp_mant_f16_e64_dpp v5, v199 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xd9,0xd5,0xe9,0x00,0x00,0x00,0xc7,0x77,0x39,0x05]
+v_frexp_mant_f16 v255.l, v1.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_frexp_mant_f16_e64_dpp v255.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x00,0xd9,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
 
-v_frexp_mant_f16 v5, v199 quad_perm:[3,2,1,0]
-// GFX11: v_frexp_mant_f16_e64_dpp v5, v199 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff]
+v_frexp_mant_f16 v255.l, v1.l quad_perm:[3,2,1,0]
+// GFX11: v_frexp_mant_f16_e64_dpp v255.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
+
+v_frexp_mant_f16 v255.l, v127.l
+// GFX11: v_frexp_mant_f16_e64 v255.l, v127.l     ; encoding: [0xff,0x00,0xd9,0xd5,0x7f,0x01,0x00,0x00]
+
+v_frexp_mant_f16 v255.l, v127.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_frexp_mant_f16_e64_dpp v255.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x00,0xd9,0xd5,0xe9,0x00,0x00,0x00,0x7f,0x77,0x39,0x05]
+
+v_frexp_mant_f16 v255.l, v127.l quad_perm:[3,2,1,0]
+// GFX11: v_frexp_mant_f16_e64_dpp v255.l, v127.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x7f,0x1b,0x00,0xff]
+
+v_frexp_mant_f16 v255.l, vcc_hi
+// GFX11: v_frexp_mant_f16_e64 v255.l, vcc_hi     ; encoding: [0xff,0x00,0xd9,0xd5,0x6b,0x00,0x00,0x00]
+
+v_frexp_mant_f16 v255.l, vcc_lo
+// GFX11: v_frexp_mant_f16_e64 v255.l, vcc_lo     ; encoding: [0xff,0x00,0xd9,0xd5,0x6a,0x00,0x00,0x00]
+
+v_frexp_mant_f16 v5.h, v199.h
+// GFX11: v_frexp_mant_f16_e64 v5.h, v199.h op_sel:[1,1] ; encoding: [0x05,0x48,0xd9,0xd5,0xc7,0x01,0x00,0x00]
+
+v_frexp_mant_f16 v5.h, v199.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_frexp_mant_f16_e64_dpp v5.h, v199.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xd9,0xd5,0xe9,0x00,0x00,0x00,0xc7,0x77,0x39,0x05]
+
+v_frexp_mant_f16 v5.h, v199.h quad_perm:[3,2,1,0]
+// GFX11: v_frexp_mant_f16_e64_dpp v5.h, v199.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x48,0xd9,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff]
+
+v_frexp_mant_f16 v5.l, v199.l
+// GFX11: v_frexp_mant_f16_e64 v5.l, v199.l       ; encoding: [0x05,0x00,0xd9,0xd5,0xc7,0x01,0x00,0x00]
+
+v_frexp_mant_f16 v5.l, v199.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_frexp_mant_f16_e64_dpp v5.l, v199.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xd9,0xd5,0xe9,0x00,0x00,0x00,0xc7,0x77,0x39,0x05]
+
+v_frexp_mant_f16 v5.l, v199.l quad_perm:[3,2,1,0]
+// GFX11: v_frexp_mant_f16_e64_dpp v5.l, v199.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff]
 
 v_log_f16 v128, 0xfe0b
 // GFX11: v_log_f16_e64 v128, 0xfe0b              ; encoding: [0x80,0x00,0xd7,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00]
@@ -1538,71 +1754,137 @@ v_log_f16 v5, v199 dpp8:[7,6,5,4,3,2,1,0]
 v_log_f16 v5, v199 quad_perm:[3,2,1,0]
 // GFX11: v_log_f16_e64_dpp v5, v199 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd7,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff]
 
-v_not_b16 v128, 0xfe0b
-// GFX11: v_not_b16_e64 v128, 0xfe0b              ; encoding: [0x80,0x00,0xe9,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00]
+v_not_b16 v128.h, 0xfe0b
+// GFX11: v_not_b16_e64 v128.h, 0xfe0b op_sel:[0,1] ; encoding: [0x80,0x40,0xe9,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00]
+
+v_not_b16 v128.l, 0xfe0b
+// GFX11: v_not_b16_e64 v128.l, 0xfe0b            ; encoding: [0x80,0x00,0xe9,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00]
+
+v_not_b16 v255.h, -1
+// GFX11: v_not_b16_e64 v255.h, -1 op_sel:[0,1]   ; encoding: [0xff,0x40,0xe9,0xd5,0xc1,0x00,0x00,0x00]
+
+v_not_b16 v255.h, 0.5
+// GFX11: v_not_b16_e64 v255.h, 0.5 op_sel:[0,1]  ; encoding: [0xff,0x40,0xe9,0xd5,0xf0,0x00,0x00,0x00]
+
+v_not_b16 v255.h, exec_hi
+// GFX11: v_not_b16_e64 v255.h, exec_hi op_sel:[0,1] ; encoding: [0xff,0x40,0xe9,0xd5,0x7f,0x00,0x00,0x00]
+
+v_not_b16 v255.h, exec_lo
+// GFX11: v_not_b16_e64 v255.h, exec_lo op_sel:[0,1] ; encoding: [0xff,0x40,0xe9,0xd5,0x7e,0x00,0x00,0x00]
+
+v_not_b16 v255.h, m0
+// GFX11: v_not_b16_e64 v255.h, m0 op_sel:[0,1]   ; encoding: [0xff,0x40,0xe9,0xd5,0x7d,0x00,0x00,0x00]
+
+v_not_b16 v255.h, null
+// GFX11: v_not_b16_e64 v255.h, null op_sel:[0,1] ; encoding: [0xff,0x40,0xe9,0xd5,0x7c,0x00,0x00,0x00]
+
+v_not_b16 v255.h, s1
+// GFX11: v_not_b16_e64 v255.h, s1 op_sel:[0,1]   ; encoding: [0xff,0x40,0xe9,0xd5,0x01,0x00,0x00,0x00]
+
+v_not_b16 v255.h, s105
+// GFX11: v_not_b16_e64 v255.h, s105 op_sel:[0,1] ; encoding: [0xff,0x40,0xe9,0xd5,0x69,0x00,0x00,0x00]
 
-v_not_b16 v255, -1
-// GFX11: v_not_b16_e64 v255, -1                  ; encoding: [0xff,0x00,0xe9,0xd5,0xc1,0x00,0x00,0x00]
+v_not_b16 v255.h, src_scc
+// GFX11: v_not_b16_e64 v255.h, src_scc op_sel:[0,1] ; encoding: [0xff,0x40,0xe9,0xd5,0xfd,0x00,0x00,0x00]
 
-v_not_b16 v255, 0.5
-// GFX11: v_not_b16_e64 v255, 0.5                 ; encoding: [0xff,0x00,0xe9,0xd5,0xf0,0x00,0x00,0x00]
+v_not_b16 v255.h, ttmp15
+// GFX11: v_not_b16_e64 v255.h, ttmp15 op_sel:[0,1] ; encoding: [0xff,0x40,0xe9,0xd5,0x7b,0x00,0x00,0x00]
 
-v_not_b16 v255, exec_hi
-// GFX11: v_not_b16_e64 v255, exec_hi             ; encoding: [0xff,0x00,0xe9,0xd5,0x7f,0x00,0x00,0x00]
+v_not_b16 v255.h, v1.h
+// GFX11: v_not_b16_e64 v255.h, v1.h op_sel:[1,1] ; encoding: [0xff,0x48,0xe9,0xd5,0x01,0x01,0x00,0x00]
 
-v_not_b16 v255, exec_lo
-// GFX11: v_not_b16_e64 v255, exec_lo             ; encoding: [0xff,0x00,0xe9,0xd5,0x7e,0x00,0x00,0x00]
+v_not_b16 v255.h, v1.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_not_b16_e64_dpp v255.h, v1.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x48,0xe9,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
 
-v_not_b16 v255, m0
-// GFX11: v_not_b16_e64 v255, m0                  ; encoding: [0xff,0x00,0xe9,0xd5,0x7d,0x00,0x00,0x00]
+v_not_b16 v255.h, v1.h quad_perm:[3,2,1,0]
+// GFX11: v_not_b16_e64_dpp v255.h, v1.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x48,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
 
-v_not_b16 v255, null
-// GFX11: v_not_b16_e64 v255, null                ; encoding: [0xff,0x00,0xe9,0xd5,0x7c,0x00,0x00,0x00]
+v_not_b16 v255.h, v127.h
+// GFX11: v_not_b16_e64 v255.h, v127.h op_sel:[1,1] ; encoding: [0xff,0x48,0xe9,0xd5,0x7f,0x01,0x00,0x00]
 
-v_not_b16 v255, s1
-// GFX11: v_not_b16_e64 v255, s1                  ; encoding: [0xff,0x00,0xe9,0xd5,0x01,0x00,0x00,0x00]
+v_not_b16 v255.h, v127.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_not_b16_e64_dpp v255.h, v127.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x48,0xe9,0xd5,0xe9,0x00,0x00,0x00,0x7f,0x77,0x39,0x05]
 
-v_not_b16 v255, s105
-// GFX11: v_not_b16_e64 v255, s105                ; encoding: [0xff,0x00,0xe9,0xd5,0x69,0x00,0x00,0x00]
+v_not_b16 v255.h, v127.h quad_perm:[3,2,1,0]
+// GFX11: v_not_b16_e64_dpp v255.h, v127.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x48,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x7f,0x1b,0x00,0xff]
 
-v_not_b16 v255, src_scc
-// GFX11: v_not_b16_e64 v255, src_scc             ; encoding: [0xff,0x00,0xe9,0xd5,0xfd,0x00,0x00,0x00]
+v_not_b16 v255.h, vcc_hi
+// GFX11: v_not_b16_e64 v255.h, vcc_hi op_sel:[0,1] ; encoding: [0xff,0x40,0xe9,0xd5,0x6b,0x00,0x00,0x00]
 
-v_not_b16 v255, ttmp15
-// GFX11: v_not_b16_e64 v255, ttmp15              ; encoding: [0xff,0x00,0xe9,0xd5,0x7b,0x00,0x00,0x00]
+v_not_b16 v255.h, vcc_lo
+// GFX11: v_not_b16_e64 v255.h, vcc_lo op_sel:[0,1] ; encoding: [0xff,0x40,0xe9,0xd5,0x6a,0x00,0x00,0x00]
 
-v_not_b16 v255, v1
-// GFX11: v_not_b16_e64 v255, v1                  ; encoding: [0xff,0x00,0xe9,0xd5,0x01,0x01,0x00,0x00]
+v_not_b16 v255.l, -1
+// GFX11: v_not_b16_e64 v255.l, -1                ; encoding: [0xff,0x00,0xe9,0xd5,0xc1,0x00,0x00,0x00]
 
-v_not_b16 v255, v1 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_not_b16_e64_dpp v255, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x00,0xe9,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+v_not_b16 v255.l, 0.5
+// GFX11: v_not_b16_e64 v255.l, 0.5               ; encoding: [0xff,0x00,0xe9,0xd5,0xf0,0x00,0x00,0x00]
 
-v_not_b16 v255, v1 quad_perm:[3,2,1,0]
-// GFX11: v_not_b16_e64_dpp v255, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
+v_not_b16 v255.l, exec_hi
+// GFX11: v_not_b16_e64 v255.l, exec_hi           ; encoding: [0xff,0x00,0xe9,0xd5,0x7f,0x00,0x00,0x00]
 
-v_not_b16 v255, v127
-// GFX11: v_not_b16_e64 v255, v127                ; encoding: [0xff,0x00,0xe9,0xd5,0x7f,0x01,0x00,0x00]
+v_not_b16 v255.l, exec_lo
+// GFX11: v_not_b16_e64 v255.l, exec_lo           ; encoding: [0xff,0x00,0xe9,0xd5,0x7e,0x00,0x00,0x00]
 
-v_not_b16 v255, v127 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_not_b16_e64_dpp v255, v127 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x00,0xe9,0xd5,0xe9,0x00,0x00,0x00,0x7f,0x77,0x39,0x05]
+v_not_b16 v255.l, m0
+// GFX11: v_not_b16_e64 v255.l, m0                ; encoding: [0xff,0x00,0xe9,0xd5,0x7d,0x00,0x00,0x00]
 
-v_not_b16 v255, v127 quad_perm:[3,2,1,0]
-// GFX11: v_not_b16_e64_dpp v255, v127 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x7f,0x1b,0x00,0xff]
+v_not_b16 v255.l, null
+// GFX11: v_not_b16_e64 v255.l, null              ; encoding: [0xff,0x00,0xe9,0xd5,0x7c,0x00,0x00,0x00]
 
-v_not_b16 v255, vcc_hi
-// GFX11: v_not_b16_e64 v255, vcc_hi              ; encoding: [0xff,0x00,0xe9,0xd5,0x6b,0x00,0x00,0x00]
+v_not_b16 v255.l, s1
+// GFX11: v_not_b16_e64 v255.l, s1                ; encoding: [0xff,0x00,0xe9,0xd5,0x01,0x00,0x00,0x00]
 
-v_not_b16 v255, vcc_lo
-// GFX11: v_not_b16_e64 v255, vcc_lo              ; encoding: [0xff,0x00,0xe9,0xd5,0x6a,0x00,0x00,0x00]
+v_not_b16 v255.l, s105
+// GFX11: v_not_b16_e64 v255.l, s105              ; encoding: [0xff,0x00,0xe9,0xd5,0x69,0x00,0x00,0x00]
 
-v_not_b16 v5, v199
-// GFX11: v_not_b16_e64 v5, v199                  ; encoding: [0x05,0x00,0xe9,0xd5,0xc7,0x01,0x00,0x00]
+v_not_b16 v255.l, src_scc
+// GFX11: v_not_b16_e64 v255.l, src_scc           ; encoding: [0xff,0x00,0xe9,0xd5,0xfd,0x00,0x00,0x00]
 
-v_not_b16 v5, v199 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_not_b16_e64_dpp v5, v199 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe9,0xd5,0xe9,0x00,0x00,0x00,0xc7,0x77,0x39,0x05]
+v_not_b16 v255.l, ttmp15
+// GFX11: v_not_b16_e64 v255.l, ttmp15            ; encoding: [0xff,0x00,0xe9,0xd5,0x7b,0x00,0x00,0x00]
 
-v_not_b16 v5, v199 quad_perm:[3,2,1,0]
-// GFX11: v_not_b16_e64_dpp v5, v199 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff]
+v_not_b16 v255.l, v1.l
+// GFX11: v_not_b16_e64 v255.l, v1.l              ; encoding: [0xff,0x00,0xe9,0xd5,0x01,0x01,0x00,0x00]
+
+v_not_b16 v255.l, v1.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_not_b16_e64_dpp v255.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x00,0xe9,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+
+v_not_b16 v255.l, v1.l quad_perm:[3,2,1,0]
+// GFX11: v_not_b16_e64_dpp v255.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
+
+v_not_b16 v255.l, v127.l
+// GFX11: v_not_b16_e64 v255.l, v127.l            ; encoding: [0xff,0x00,0xe9,0xd5,0x7f,0x01,0x00,0x00]
+
+v_not_b16 v255.l, v127.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_not_b16_e64_dpp v255.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x00,0xe9,0xd5,0xe9,0x00,0x00,0x00,0x7f,0x77,0x39,0x05]
+
+v_not_b16 v255.l, v127.l quad_perm:[3,2,1,0]
+// GFX11: v_not_b16_e64_dpp v255.l, v127.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x7f,0x1b,0x00,0xff]
+
+v_not_b16 v255.l, vcc_hi
+// GFX11: v_not_b16_e64 v255.l, vcc_hi            ; encoding: [0xff,0x00,0xe9,0xd5,0x6b,0x00,0x00,0x00]
+
+v_not_b16 v255.l, vcc_lo
+// GFX11: v_not_b16_e64 v255.l, vcc_lo            ; encoding: [0xff,0x00,0xe9,0xd5,0x6a,0x00,0x00,0x00]
+
+v_not_b16 v5.h, v199.h
+// GFX11: v_not_b16_e64 v5.h, v199.h op_sel:[1,1] ; encoding: [0x05,0x48,0xe9,0xd5,0xc7,0x01,0x00,0x00]
+
+v_not_b16 v5.h, v199.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_not_b16_e64_dpp v5.h, v199.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xe9,0xd5,0xe9,0x00,0x00,0x00,0xc7,0x77,0x39,0x05]
+
+v_not_b16 v5.h, v199.h quad_perm:[3,2,1,0]
+// GFX11: v_not_b16_e64_dpp v5.h, v199.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x48,0xe9,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff]
+
+v_not_b16 v5.l, v199.l
+// GFX11: v_not_b16_e64 v5.l, v199.l              ; encoding: [0x05,0x00,0xe9,0xd5,0xc7,0x01,0x00,0x00]
+
+v_not_b16 v5.l, v199.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_not_b16_e64_dpp v5.l, v199.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe9,0xd5,0xe9,0x00,0x00,0x00,0xc7,0x77,0x39,0x05]
+
+v_not_b16 v5.l, v199.l quad_perm:[3,2,1,0]
+// GFX11: v_not_b16_e64_dpp v5.l, v199.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff]
 
 v_rcp_f16 v128, 0xfe0b
 // GFX11: v_rcp_f16_e64 v128, 0xfe0b              ; encoding: [0x80,0x00,0xd4,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00]
@@ -1670,71 +1952,137 @@ v_rcp_f16 v5, v199 dpp8:[7,6,5,4,3,2,1,0]
 v_rcp_f16 v5, v199 quad_perm:[3,2,1,0]
 // GFX11: v_rcp_f16_e64_dpp v5, v199 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd4,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff]
 
-v_rndne_f16 v128, 0xfe0b
-// GFX11: v_rndne_f16_e64 v128, 0xfe0b            ; encoding: [0x80,0x00,0xde,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00]
+v_rndne_f16 v128.h, 0xfe0b
+// GFX11: v_rndne_f16_e64 v128.h, 0xfe0b op_sel:[0,1] ; encoding: [0x80,0x40,0xde,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00]
+
+v_rndne_f16 v128.l, 0xfe0b
+// GFX11: v_rndne_f16_e64 v128.l, 0xfe0b          ; encoding: [0x80,0x00,0xde,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00]
+
+v_rndne_f16 v255.h, -1
+// GFX11: v_rndne_f16_e64 v255.h, -1 op_sel:[0,1] ; encoding: [0xff,0x40,0xde,0xd5,0xc1,0x00,0x00,0x00]
+
+v_rndne_f16 v255.h, 0.5
+// GFX11: v_rndne_f16_e64 v255.h, 0.5 op_sel:[0,1] ; encoding: [0xff,0x40,0xde,0xd5,0xf0,0x00,0x00,0x00]
+
+v_rndne_f16 v255.h, exec_hi
+// GFX11: v_rndne_f16_e64 v255.h, exec_hi op_sel:[0,1] ; encoding: [0xff,0x40,0xde,0xd5,0x7f,0x00,0x00,0x00]
+
+v_rndne_f16 v255.h, exec_lo
+// GFX11: v_rndne_f16_e64 v255.h, exec_lo op_sel:[0,1] ; encoding: [0xff,0x40,0xde,0xd5,0x7e,0x00,0x00,0x00]
+
+v_rndne_f16 v255.h, m0
+// GFX11: v_rndne_f16_e64 v255.h, m0 op_sel:[0,1] ; encoding: [0xff,0x40,0xde,0xd5,0x7d,0x00,0x00,0x00]
+
+v_rndne_f16 v255.h, null
+// GFX11: v_rndne_f16_e64 v255.h, null op_sel:[0,1] ; encoding: [0xff,0x40,0xde,0xd5,0x7c,0x00,0x00,0x00]
+
+v_rndne_f16 v255.h, s1
+// GFX11: v_rndne_f16_e64 v255.h, s1 op_sel:[0,1] ; encoding: [0xff,0x40,0xde,0xd5,0x01,0x00,0x00,0x00]
 
-v_rndne_f16 v255, -1
-// GFX11: v_rndne_f16_e64 v255, -1                ; encoding: [0xff,0x00,0xde,0xd5,0xc1,0x00,0x00,0x00]
+v_rndne_f16 v255.h, s105
+// GFX11: v_rndne_f16_e64 v255.h, s105 op_sel:[0,1] ; encoding: [0xff,0x40,0xde,0xd5,0x69,0x00,0x00,0x00]
 
-v_rndne_f16 v255, 0.5
-// GFX11: v_rndne_f16_e64 v255, 0.5               ; encoding: [0xff,0x00,0xde,0xd5,0xf0,0x00,0x00,0x00]
+v_rndne_f16 v255.h, src_scc
+// GFX11: v_rndne_f16_e64 v255.h, src_scc op_sel:[0,1] ; encoding: [0xff,0x40,0xde,0xd5,0xfd,0x00,0x00,0x00]
 
-v_rndne_f16 v255, exec_hi
-// GFX11: v_rndne_f16_e64 v255, exec_hi           ; encoding: [0xff,0x00,0xde,0xd5,0x7f,0x00,0x00,0x00]
+v_rndne_f16 v255.h, ttmp15
+// GFX11: v_rndne_f16_e64 v255.h, ttmp15 op_sel:[0,1] ; encoding: [0xff,0x40,0xde,0xd5,0x7b,0x00,0x00,0x00]
 
-v_rndne_f16 v255, exec_lo
-// GFX11: v_rndne_f16_e64 v255, exec_lo           ; encoding: [0xff,0x00,0xde,0xd5,0x7e,0x00,0x00,0x00]
+v_rndne_f16 v255.h, v1.h
+// GFX11: v_rndne_f16_e64 v255.h, v1.h op_sel:[1,1] ; encoding: [0xff,0x48,0xde,0xd5,0x01,0x01,0x00,0x00]
 
-v_rndne_f16 v255, m0
-// GFX11: v_rndne_f16_e64 v255, m0                ; encoding: [0xff,0x00,0xde,0xd5,0x7d,0x00,0x00,0x00]
+v_rndne_f16 v255.h, v1.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_rndne_f16_e64_dpp v255.h, v1.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x48,0xde,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
 
-v_rndne_f16 v255, null
-// GFX11: v_rndne_f16_e64 v255, null              ; encoding: [0xff,0x00,0xde,0xd5,0x7c,0x00,0x00,0x00]
+v_rndne_f16 v255.h, v1.h quad_perm:[3,2,1,0]
+// GFX11: v_rndne_f16_e64_dpp v255.h, v1.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x48,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
 
-v_rndne_f16 v255, s1
-// GFX11: v_rndne_f16_e64 v255, s1                ; encoding: [0xff,0x00,0xde,0xd5,0x01,0x00,0x00,0x00]
+v_rndne_f16 v255.h, v127.h
+// GFX11: v_rndne_f16_e64 v255.h, v127.h op_sel:[1,1] ; encoding: [0xff,0x48,0xde,0xd5,0x7f,0x01,0x00,0x00]
 
-v_rndne_f16 v255, s105
-// GFX11: v_rndne_f16_e64 v255, s105              ; encoding: [0xff,0x00,0xde,0xd5,0x69,0x00,0x00,0x00]
+v_rndne_f16 v255.h, v127.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_rndne_f16_e64_dpp v255.h, v127.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x48,0xde,0xd5,0xe9,0x00,0x00,0x00,0x7f,0x77,0x39,0x05]
 
-v_rndne_f16 v255, src_scc
-// GFX11: v_rndne_f16_e64 v255, src_scc           ; encoding: [0xff,0x00,0xde,0xd5,0xfd,0x00,0x00,0x00]
+v_rndne_f16 v255.h, v127.h quad_perm:[3,2,1,0]
+// GFX11: v_rndne_f16_e64_dpp v255.h, v127.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x48,0xde,0xd5,0xfa,0x00,0x00,0x00,0x7f,0x1b,0x00,0xff]
 
-v_rndne_f16 v255, ttmp15
-// GFX11: v_rndne_f16_e64 v255, ttmp15            ; encoding: [0xff,0x00,0xde,0xd5,0x7b,0x00,0x00,0x00]
+v_rndne_f16 v255.h, vcc_hi
+// GFX11: v_rndne_f16_e64 v255.h, vcc_hi op_sel:[0,1] ; encoding: [0xff,0x40,0xde,0xd5,0x6b,0x00,0x00,0x00]
 
-v_rndne_f16 v255, v1
-// GFX11: v_rndne_f16_e64 v255, v1                ; encoding: [0xff,0x00,0xde,0xd5,0x01,0x01,0x00,0x00]
+v_rndne_f16 v255.h, vcc_lo
+// GFX11: v_rndne_f16_e64 v255.h, vcc_lo op_sel:[0,1] ; encoding: [0xff,0x40,0xde,0xd5,0x6a,0x00,0x00,0x00]
 
-v_rndne_f16 v255, v1 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_rndne_f16_e64_dpp v255, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x00,0xde,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+v_rndne_f16 v255.l, -1
+// GFX11: v_rndne_f16_e64 v255.l, -1              ; encoding: [0xff,0x00,0xde,0xd5,0xc1,0x00,0x00,0x00]
 
-v_rndne_f16 v255, v1 quad_perm:[3,2,1,0]
-// GFX11: v_rndne_f16_e64_dpp v255, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
+v_rndne_f16 v255.l, 0.5
+// GFX11: v_rndne_f16_e64 v255.l, 0.5             ; encoding: [0xff,0x00,0xde,0xd5,0xf0,0x00,0x00,0x00]
 
-v_rndne_f16 v255, v127
-// GFX11: v_rndne_f16_e64 v255, v127              ; encoding: [0xff,0x00,0xde,0xd5,0x7f,0x01,0x00,0x00]
+v_rndne_f16 v255.l, exec_hi
+// GFX11: v_rndne_f16_e64 v255.l, exec_hi         ; encoding: [0xff,0x00,0xde,0xd5,0x7f,0x00,0x00,0x00]
 
-v_rndne_f16 v255, v127 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_rndne_f16_e64_dpp v255, v127 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x00,0xde,0xd5,0xe9,0x00,0x00,0x00,0x7f,0x77,0x39,0x05]
+v_rndne_f16 v255.l, exec_lo
+// GFX11: v_rndne_f16_e64 v255.l, exec_lo         ; encoding: [0xff,0x00,0xde,0xd5,0x7e,0x00,0x00,0x00]
 
-v_rndne_f16 v255, v127 quad_perm:[3,2,1,0]
-// GFX11: v_rndne_f16_e64_dpp v255, v127 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x7f,0x1b,0x00,0xff]
+v_rndne_f16 v255.l, m0
+// GFX11: v_rndne_f16_e64 v255.l, m0              ; encoding: [0xff,0x00,0xde,0xd5,0x7d,0x00,0x00,0x00]
 
-v_rndne_f16 v255, vcc_hi
-// GFX11: v_rndne_f16_e64 v255, vcc_hi            ; encoding: [0xff,0x00,0xde,0xd5,0x6b,0x00,0x00,0x00]
+v_rndne_f16 v255.l, null
+// GFX11: v_rndne_f16_e64 v255.l, null            ; encoding: [0xff,0x00,0xde,0xd5,0x7c,0x00,0x00,0x00]
 
-v_rndne_f16 v255, vcc_lo
-// GFX11: v_rndne_f16_e64 v255, vcc_lo            ; encoding: [0xff,0x00,0xde,0xd5,0x6a,0x00,0x00,0x00]
+v_rndne_f16 v255.l, s1
+// GFX11: v_rndne_f16_e64 v255.l, s1              ; encoding: [0xff,0x00,0xde,0xd5,0x01,0x00,0x00,0x00]
 
-v_rndne_f16 v5, v199
-// GFX11: v_rndne_f16_e64 v5, v199                ; encoding: [0x05,0x00,0xde,0xd5,0xc7,0x01,0x00,0x00]
+v_rndne_f16 v255.l, s105
+// GFX11: v_rndne_f16_e64 v255.l, s105            ; encoding: [0xff,0x00,0xde,0xd5,0x69,0x00,0x00,0x00]
 
-v_rndne_f16 v5, v199 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_rndne_f16_e64_dpp v5, v199 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xde,0xd5,0xe9,0x00,0x00,0x00,0xc7,0x77,0x39,0x05]
+v_rndne_f16 v255.l, src_scc
+// GFX11: v_rndne_f16_e64 v255.l, src_scc         ; encoding: [0xff,0x00,0xde,0xd5,0xfd,0x00,0x00,0x00]
 
-v_rndne_f16 v5, v199 quad_perm:[3,2,1,0]
-// GFX11: v_rndne_f16_e64_dpp v5, v199 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff]
+v_rndne_f16 v255.l, ttmp15
+// GFX11: v_rndne_f16_e64 v255.l, ttmp15          ; encoding: [0xff,0x00,0xde,0xd5,0x7b,0x00,0x00,0x00]
+
+v_rndne_f16 v255.l, v1.l
+// GFX11: v_rndne_f16_e64 v255.l, v1.l            ; encoding: [0xff,0x00,0xde,0xd5,0x01,0x01,0x00,0x00]
+
+v_rndne_f16 v255.l, v1.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_rndne_f16_e64_dpp v255.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x00,0xde,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+
+v_rndne_f16 v255.l, v1.l quad_perm:[3,2,1,0]
+// GFX11: v_rndne_f16_e64_dpp v255.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
+
+v_rndne_f16 v255.l, v127.l
+// GFX11: v_rndne_f16_e64 v255.l, v127.l          ; encoding: [0xff,0x00,0xde,0xd5,0x7f,0x01,0x00,0x00]
+
+v_rndne_f16 v255.l, v127.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_rndne_f16_e64_dpp v255.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x00,0xde,0xd5,0xe9,0x00,0x00,0x00,0x7f,0x77,0x39,0x05]
+
+v_rndne_f16 v255.l, v127.l quad_perm:[3,2,1,0]
+// GFX11: v_rndne_f16_e64_dpp v255.l, v127.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x7f,0x1b,0x00,0xff]
+
+v_rndne_f16 v255.l, vcc_hi
+// GFX11: v_rndne_f16_e64 v255.l, vcc_hi          ; encoding: [0xff,0x00,0xde,0xd5,0x6b,0x00,0x00,0x00]
+
+v_rndne_f16 v255.l, vcc_lo
+// GFX11: v_rndne_f16_e64 v255.l, vcc_lo          ; encoding: [0xff,0x00,0xde,0xd5,0x6a,0x00,0x00,0x00]
+
+v_rndne_f16 v5.h, v199.h
+// GFX11: v_rndne_f16_e64 v5.h, v199.h op_sel:[1,1] ; encoding: [0x05,0x48,0xde,0xd5,0xc7,0x01,0x00,0x00]
+
+v_rndne_f16 v5.h, v199.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_rndne_f16_e64_dpp v5.h, v199.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xde,0xd5,0xe9,0x00,0x00,0x00,0xc7,0x77,0x39,0x05]
+
+v_rndne_f16 v5.h, v199.h quad_perm:[3,2,1,0]
+// GFX11: v_rndne_f16_e64_dpp v5.h, v199.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x48,0xde,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff]
+
+v_rndne_f16 v5.l, v199.l
+// GFX11: v_rndne_f16_e64 v5.l, v199.l            ; encoding: [0x05,0x00,0xde,0xd5,0xc7,0x01,0x00,0x00]
+
+v_rndne_f16 v5.l, v199.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_rndne_f16_e64_dpp v5.l, v199.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xde,0xd5,0xe9,0x00,0x00,0x00,0xc7,0x77,0x39,0x05]
+
+v_rndne_f16 v5.l, v199.l quad_perm:[3,2,1,0]
+// GFX11: v_rndne_f16_e64_dpp v5.l, v199.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff]
 
 v_rsq_f16 v128, 0xfe0b
 // GFX11: v_rsq_f16_e64 v128, 0xfe0b              ; encoding: [0x80,0x00,0xd6,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00]
@@ -1802,80 +2150,155 @@ v_rsq_f16 v5, v199 dpp8:[7,6,5,4,3,2,1,0]
 v_rsq_f16 v5, v199 quad_perm:[3,2,1,0]
 // GFX11: v_rsq_f16_e64_dpp v5, v199 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd6,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff]
 
-v_sat_pk_u8_i16 v199, v5
-// GFX11: v_sat_pk_u8_i16_e64 v199, v5            ; encoding: [0xc7,0x00,0xe2,0xd5,0x05,0x01,0x00,0x00]
+v_sat_pk_u8_i16 v199.h, v5
+// GFX11: v_sat_pk_u8_i16_e64 v199.h, v5 op_sel:[0,1] ; encoding: [0xc7,0x40,0xe2,0xd5,0x05,0x01,0x00,0x00]
+
+v_sat_pk_u8_i16 v199.h, v5 dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_sat_pk_u8_i16_e64_dpp v199.h, v5 op_sel:[0,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xc7,0x40,0xe2,0xd5,0xe9,0x00,0x00,0x00,0x05,0x77,0x39,0x05]
+
+v_sat_pk_u8_i16 v199.h, v5 quad_perm:[3,2,1,0]
+// GFX11: v_sat_pk_u8_i16_e64_dpp v199.h, v5 op_sel:[0,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xc7,0x40,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x05,0x1b,0x00,0xff]
+
+v_sat_pk_u8_i16 v199.l, v5
+// GFX11: v_sat_pk_u8_i16_e64 v199.l, v5          ; encoding: [0xc7,0x00,0xe2,0xd5,0x05,0x01,0x00,0x00]
+
+v_sat_pk_u8_i16 v199.l, v5 dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_sat_pk_u8_i16_e64_dpp v199.l, v5 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xc7,0x00,0xe2,0xd5,0xe9,0x00,0x00,0x00,0x05,0x77,0x39,0x05]
+
+v_sat_pk_u8_i16 v199.l, v5 quad_perm:[3,2,1,0]
+// GFX11: v_sat_pk_u8_i16_e64_dpp v199.l, v5 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xc7,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x05,0x1b,0x00,0xff]
+
+v_sin_f16 v128.h, 0xfe0b
+// GFX11: v_sin_f16_e64 v128.h, 0xfe0b op_sel:[0,1] ; encoding: [0x80,0x40,0xe0,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00]
 
-v_sat_pk_u8_i16 v199, v5 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_sat_pk_u8_i16_e64_dpp v199, v5 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xc7,0x00,0xe2,0xd5,0xe9,0x00,0x00,0x00,0x05,0x77,0x39,0x05]
+v_sin_f16 v128.l, 0xfe0b
+// GFX11: v_sin_f16_e64 v128.l, 0xfe0b            ; encoding: [0x80,0x00,0xe0,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00]
 
-v_sat_pk_u8_i16 v199, v5 quad_perm:[3,2,1,0]
-// GFX11: v_sat_pk_u8_i16_e64_dpp v199, v5 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xc7,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x05,0x1b,0x00,0xff]
+v_sin_f16 v255.h, -1
+// GFX11: v_sin_f16_e64 v255.h, -1 op_sel:[0,1]   ; encoding: [0xff,0x40,0xe0,0xd5,0xc1,0x00,0x00,0x00]
 
-v_sin_f16 v128, 0xfe0b
-// GFX11: v_sin_f16_e64 v128, 0xfe0b              ; encoding: [0x80,0x00,0xe0,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00]
+v_sin_f16 v255.h, 0.5
+// GFX11: v_sin_f16_e64 v255.h, 0.5 op_sel:[0,1]  ; encoding: [0xff,0x40,0xe0,0xd5,0xf0,0x00,0x00,0x00]
 
-v_sin_f16 v255, -1
-// GFX11: v_sin_f16_e64 v255, -1                  ; encoding: [0xff,0x00,0xe0,0xd5,0xc1,0x00,0x00,0x00]
+v_sin_f16 v255.h, exec_hi
+// GFX11: v_sin_f16_e64 v255.h, exec_hi op_sel:[0,1] ; encoding: [0xff,0x40,0xe0,0xd5,0x7f,0x00,0x00,0x00]
 
-v_sin_f16 v255, 0.5
-// GFX11: v_sin_f16_e64 v255, 0.5                 ; encoding: [0xff,0x00,0xe0,0xd5,0xf0,0x00,0x00,0x00]
+v_sin_f16 v255.h, exec_lo
+// GFX11: v_sin_f16_e64 v255.h, exec_lo op_sel:[0,1] ; encoding: [0xff,0x40,0xe0,0xd5,0x7e,0x00,0x00,0x00]
 
-v_sin_f16 v255, exec_hi
-// GFX11: v_sin_f16_e64 v255, exec_hi             ; encoding: [0xff,0x00,0xe0,0xd5,0x7f,0x00,0x00,0x00]
+v_sin_f16 v255.h, m0
+// GFX11: v_sin_f16_e64 v255.h, m0 op_sel:[0,1]   ; encoding: [0xff,0x40,0xe0,0xd5,0x7d,0x00,0x00,0x00]
 
-v_sin_f16 v255, exec_lo
-// GFX11: v_sin_f16_e64 v255, exec_lo             ; encoding: [0xff,0x00,0xe0,0xd5,0x7e,0x00,0x00,0x00]
+v_sin_f16 v255.h, null
+// GFX11: v_sin_f16_e64 v255.h, null op_sel:[0,1] ; encoding: [0xff,0x40,0xe0,0xd5,0x7c,0x00,0x00,0x00]
 
-v_sin_f16 v255, m0
-// GFX11: v_sin_f16_e64 v255, m0                  ; encoding: [0xff,0x00,0xe0,0xd5,0x7d,0x00,0x00,0x00]
+v_sin_f16 v255.h, s1
+// GFX11: v_sin_f16_e64 v255.h, s1 op_sel:[0,1]   ; encoding: [0xff,0x40,0xe0,0xd5,0x01,0x00,0x00,0x00]
 
-v_sin_f16 v255, null
-// GFX11: v_sin_f16_e64 v255, null                ; encoding: [0xff,0x00,0xe0,0xd5,0x7c,0x00,0x00,0x00]
+v_sin_f16 v255.h, s105
+// GFX11: v_sin_f16_e64 v255.h, s105 op_sel:[0,1] ; encoding: [0xff,0x40,0xe0,0xd5,0x69,0x00,0x00,0x00]
 
-v_sin_f16 v255, s1
-// GFX11: v_sin_f16_e64 v255, s1                  ; encoding: [0xff,0x00,0xe0,0xd5,0x01,0x00,0x00,0x00]
+v_sin_f16 v255.h, src_scc
+// GFX11: v_sin_f16_e64 v255.h, src_scc op_sel:[0,1] ; encoding: [0xff,0x40,0xe0,0xd5,0xfd,0x00,0x00,0x00]
 
-v_sin_f16 v255, s105
-// GFX11: v_sin_f16_e64 v255, s105                ; encoding: [0xff,0x00,0xe0,0xd5,0x69,0x00,0x00,0x00]
+v_sin_f16 v255.h, ttmp15
+// GFX11: v_sin_f16_e64 v255.h, ttmp15 op_sel:[0,1] ; encoding: [0xff,0x40,0xe0,0xd5,0x7b,0x00,0x00,0x00]
 
-v_sin_f16 v255, src_scc
-// GFX11: v_sin_f16_e64 v255, src_scc             ; encoding: [0xff,0x00,0xe0,0xd5,0xfd,0x00,0x00,0x00]
+v_sin_f16 v255.h, v1.h
+// GFX11: v_sin_f16_e64 v255.h, v1.h op_sel:[1,1] ; encoding: [0xff,0x48,0xe0,0xd5,0x01,0x01,0x00,0x00]
 
-v_sin_f16 v255, ttmp15
-// GFX11: v_sin_f16_e64 v255, ttmp15              ; encoding: [0xff,0x00,0xe0,0xd5,0x7b,0x00,0x00,0x00]
+v_sin_f16 v255.h, v1.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_sin_f16_e64_dpp v255.h, v1.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x48,0xe0,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
 
-v_sin_f16 v255, v1
-// GFX11: v_sin_f16_e64 v255, v1                  ; encoding: [0xff,0x00,0xe0,0xd5,0x01,0x01,0x00,0x00]
+v_sin_f16 v255.h, v1.h quad_perm:[3,2,1,0]
+// GFX11: v_sin_f16_e64_dpp v255.h, v1.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x48,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
 
-v_sin_f16 v255, v1 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_sin_f16_e64_dpp v255, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x00,0xe0,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+v_sin_f16 v255.h, v127.h
+// GFX11: v_sin_f16_e64 v255.h, v127.h op_sel:[1,1] ; encoding: [0xff,0x48,0xe0,0xd5,0x7f,0x01,0x00,0x00]
 
-v_sin_f16 v255, v1 quad_perm:[3,2,1,0]
-// GFX11: v_sin_f16_e64_dpp v255, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
+v_sin_f16 v255.h, v127.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_sin_f16_e64_dpp v255.h, v127.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x48,0xe0,0xd5,0xe9,0x00,0x00,0x00,0x7f,0x77,0x39,0x05]
 
-v_sin_f16 v255, v127
-// GFX11: v_sin_f16_e64 v255, v127                ; encoding: [0xff,0x00,0xe0,0xd5,0x7f,0x01,0x00,0x00]
+v_sin_f16 v255.h, v127.h quad_perm:[3,2,1,0]
+// GFX11: v_sin_f16_e64_dpp v255.h, v127.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x48,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x7f,0x1b,0x00,0xff]
 
-v_sin_f16 v255, v127 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_sin_f16_e64_dpp v255, v127 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x00,0xe0,0xd5,0xe9,0x00,0x00,0x00,0x7f,0x77,0x39,0x05]
+v_sin_f16 v255.h, vcc_hi
+// GFX11: v_sin_f16_e64 v255.h, vcc_hi op_sel:[0,1] ; encoding: [0xff,0x40,0xe0,0xd5,0x6b,0x00,0x00,0x00]
 
-v_sin_f16 v255, v127 quad_perm:[3,2,1,0]
-// GFX11: v_sin_f16_e64_dpp v255, v127 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x7f,0x1b,0x00,0xff]
+v_sin_f16 v255.h, vcc_lo
+// GFX11: v_sin_f16_e64 v255.h, vcc_lo op_sel:[0,1] ; encoding: [0xff,0x40,0xe0,0xd5,0x6a,0x00,0x00,0x00]
 
-v_sin_f16 v255, vcc_hi
-// GFX11: v_sin_f16_e64 v255, vcc_hi              ; encoding: [0xff,0x00,0xe0,0xd5,0x6b,0x00,0x00,0x00]
+v_sin_f16 v255.l, -1
+// GFX11: v_sin_f16_e64 v255.l, -1                ; encoding: [0xff,0x00,0xe0,0xd5,0xc1,0x00,0x00,0x00]
 
-v_sin_f16 v255, vcc_lo
-// GFX11: v_sin_f16_e64 v255, vcc_lo              ; encoding: [0xff,0x00,0xe0,0xd5,0x6a,0x00,0x00,0x00]
+v_sin_f16 v255.l, 0.5
+// GFX11: v_sin_f16_e64 v255.l, 0.5               ; encoding: [0xff,0x00,0xe0,0xd5,0xf0,0x00,0x00,0x00]
 
-v_sin_f16 v5, v199
-// GFX11: v_sin_f16_e64 v5, v199                  ; encoding: [0x05,0x00,0xe0,0xd5,0xc7,0x01,0x00,0x00]
+v_sin_f16 v255.l, exec_hi
+// GFX11: v_sin_f16_e64 v255.l, exec_hi           ; encoding: [0xff,0x00,0xe0,0xd5,0x7f,0x00,0x00,0x00]
 
-v_sin_f16 v5, v199 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_sin_f16_e64_dpp v5, v199 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe0,0xd5,0xe9,0x00,0x00,0x00,0xc7,0x77,0x39,0x05]
+v_sin_f16 v255.l, exec_lo
+// GFX11: v_sin_f16_e64 v255.l, exec_lo           ; encoding: [0xff,0x00,0xe0,0xd5,0x7e,0x00,0x00,0x00]
 
-v_sin_f16 v5, v199 quad_perm:[3,2,1,0]
-// GFX11: v_sin_f16_e64_dpp v5, v199 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff]
+v_sin_f16 v255.l, m0
+// GFX11: v_sin_f16_e64 v255.l, m0                ; encoding: [0xff,0x00,0xe0,0xd5,0x7d,0x00,0x00,0x00]
+
+v_sin_f16 v255.l, null
+// GFX11: v_sin_f16_e64 v255.l, null              ; encoding: [0xff,0x00,0xe0,0xd5,0x7c,0x00,0x00,0x00]
+
+v_sin_f16 v255.l, s1
+// GFX11: v_sin_f16_e64 v255.l, s1                ; encoding: [0xff,0x00,0xe0,0xd5,0x01,0x00,0x00,0x00]
+
+v_sin_f16 v255.l, s105
+// GFX11: v_sin_f16_e64 v255.l, s105              ; encoding: [0xff,0x00,0xe0,0xd5,0x69,0x00,0x00,0x00]
+
+v_sin_f16 v255.l, src_scc
+// GFX11: v_sin_f16_e64 v255.l, src_scc           ; encoding: [0xff,0x00,0xe0,0xd5,0xfd,0x00,0x00,0x00]
+
+v_sin_f16 v255.l, ttmp15
+// GFX11: v_sin_f16_e64 v255.l, ttmp15            ; encoding: [0xff,0x00,0xe0,0xd5,0x7b,0x00,0x00,0x00]
+
+v_sin_f16 v255.l, v1.l
+// GFX11: v_sin_f16_e64 v255.l, v1.l              ; encoding: [0xff,0x00,0xe0,0xd5,0x01,0x01,0x00,0x00]
+
+v_sin_f16 v255.l, v1.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_sin_f16_e64_dpp v255.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x00,0xe0,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+
+v_sin_f16 v255.l, v1.l quad_perm:[3,2,1,0]
+// GFX11: v_sin_f16_e64_dpp v255.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
+
+v_sin_f16 v255.l, v127.l
+// GFX11: v_sin_f16_e64 v255.l, v127.l            ; encoding: [0xff,0x00,0xe0,0xd5,0x7f,0x01,0x00,0x00]
+
+v_sin_f16 v255.l, v127.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_sin_f16_e64_dpp v255.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x00,0xe0,0xd5,0xe9,0x00,0x00,0x00,0x7f,0x77,0x39,0x05]
+
+v_sin_f16 v255.l, v127.l quad_perm:[3,2,1,0]
+// GFX11: v_sin_f16_e64_dpp v255.l, v127.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x7f,0x1b,0x00,0xff]
+
+v_sin_f16 v255.l, vcc_hi
+// GFX11: v_sin_f16_e64 v255.l, vcc_hi            ; encoding: [0xff,0x00,0xe0,0xd5,0x6b,0x00,0x00,0x00]
+
+v_sin_f16 v255.l, vcc_lo
+// GFX11: v_sin_f16_e64 v255.l, vcc_lo            ; encoding: [0xff,0x00,0xe0,0xd5,0x6a,0x00,0x00,0x00]
+
+v_sin_f16 v5.h, v199.h
+// GFX11: v_sin_f16_e64 v5.h, v199.h op_sel:[1,1] ; encoding: [0x05,0x48,0xe0,0xd5,0xc7,0x01,0x00,0x00]
+
+v_sin_f16 v5.h, v199.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_sin_f16_e64_dpp v5.h, v199.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xe0,0xd5,0xe9,0x00,0x00,0x00,0xc7,0x77,0x39,0x05]
+
+v_sin_f16 v5.h, v199.h quad_perm:[3,2,1,0]
+// GFX11: v_sin_f16_e64_dpp v5.h, v199.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x48,0xe0,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff]
+
+v_sin_f16 v5.l, v199.l
+// GFX11: v_sin_f16_e64 v5.l, v199.l              ; encoding: [0x05,0x00,0xe0,0xd5,0xc7,0x01,0x00,0x00]
+
+v_sin_f16 v5.l, v199.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_sin_f16_e64_dpp v5.l, v199.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe0,0xd5,0xe9,0x00,0x00,0x00,0xc7,0x77,0x39,0x05]
+
+v_sin_f16 v5.l, v199.l quad_perm:[3,2,1,0]
+// GFX11: v_sin_f16_e64_dpp v5.l, v199.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff]
 
 v_sqrt_f16 v128, 0xfe0b
 // GFX11: v_sqrt_f16_e64 v128, 0xfe0b             ; encoding: [0x80,0x00,0xd5,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00]
@@ -1943,69 +2366,134 @@ v_sqrt_f16 v5, v199 dpp8:[7,6,5,4,3,2,1,0]
 v_sqrt_f16 v5, v199 quad_perm:[3,2,1,0]
 // GFX11: v_sqrt_f16_e64_dpp v5, v199 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd5,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff]
 
-v_trunc_f16 v128, 0xfe0b
-// GFX11: v_trunc_f16_e64 v128, 0xfe0b            ; encoding: [0x80,0x00,0xdd,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00]
+v_trunc_f16 v128.h, 0xfe0b
+// GFX11: v_trunc_f16_e64 v128.h, 0xfe0b op_sel:[0,1] ; encoding: [0x80,0x40,0xdd,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00]
+
+v_trunc_f16 v128.l, 0xfe0b
+// GFX11: v_trunc_f16_e64 v128.l, 0xfe0b          ; encoding: [0x80,0x00,0xdd,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00]
+
+v_trunc_f16 v255.h, -1
+// GFX11: v_trunc_f16_e64 v255.h, -1 op_sel:[0,1] ; encoding: [0xff,0x40,0xdd,0xd5,0xc1,0x00,0x00,0x00]
+
+v_trunc_f16 v255.h, 0.5
+// GFX11: v_trunc_f16_e64 v255.h, 0.5 op_sel:[0,1] ; encoding: [0xff,0x40,0xdd,0xd5,0xf0,0x00,0x00,0x00]
+
+v_trunc_f16 v255.h, exec_hi
+// GFX11: v_trunc_f16_e64 v255.h, exec_hi op_sel:[0,1] ; encoding: [0xff,0x40,0xdd,0xd5,0x7f,0x00,0x00,0x00]
+
+v_trunc_f16 v255.h, exec_lo
+// GFX11: v_trunc_f16_e64 v255.h, exec_lo op_sel:[0,1] ; encoding: [0xff,0x40,0xdd,0xd5,0x7e,0x00,0x00,0x00]
+
+v_trunc_f16 v255.h, m0
+// GFX11: v_trunc_f16_e64 v255.h, m0 op_sel:[0,1] ; encoding: [0xff,0x40,0xdd,0xd5,0x7d,0x00,0x00,0x00]
+
+v_trunc_f16 v255.h, null
+// GFX11: v_trunc_f16_e64 v255.h, null op_sel:[0,1] ; encoding: [0xff,0x40,0xdd,0xd5,0x7c,0x00,0x00,0x00]
+
+v_trunc_f16 v255.h, s1
+// GFX11: v_trunc_f16_e64 v255.h, s1 op_sel:[0,1] ; encoding: [0xff,0x40,0xdd,0xd5,0x01,0x00,0x00,0x00]
+
+v_trunc_f16 v255.h, s105
+// GFX11: v_trunc_f16_e64 v255.h, s105 op_sel:[0,1] ; encoding: [0xff,0x40,0xdd,0xd5,0x69,0x00,0x00,0x00]
+
+v_trunc_f16 v255.h, src_scc
+// GFX11: v_trunc_f16_e64 v255.h, src_scc op_sel:[0,1] ; encoding: [0xff,0x40,0xdd,0xd5,0xfd,0x00,0x00,0x00]
+
+v_trunc_f16 v255.h, ttmp15
+// GFX11: v_trunc_f16_e64 v255.h, ttmp15 op_sel:[0,1] ; encoding: [0xff,0x40,0xdd,0xd5,0x7b,0x00,0x00,0x00]
+
+v_trunc_f16 v255.h, v1.h
+// GFX11: v_trunc_f16_e64 v255.h, v1.h op_sel:[1,1] ; encoding: [0xff,0x48,0xdd,0xd5,0x01,0x01,0x00,0x00]
+
+v_trunc_f16 v255.h, v1.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_trunc_f16_e64_dpp v255.h, v1.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x48,0xdd,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+
+v_trunc_f16 v255.h, v1.h quad_perm:[3,2,1,0]
+// GFX11: v_trunc_f16_e64_dpp v255.h, v1.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x48,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
+
+v_trunc_f16 v255.h, v127.h
+// GFX11: v_trunc_f16_e64 v255.h, v127.h op_sel:[1,1] ; encoding: [0xff,0x48,0xdd,0xd5,0x7f,0x01,0x00,0x00]
+
+v_trunc_f16 v255.h, v127.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_trunc_f16_e64_dpp v255.h, v127.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x48,0xdd,0xd5,0xe9,0x00,0x00,0x00,0x7f,0x77,0x39,0x05]
+
+v_trunc_f16 v255.h, v127.h quad_perm:[3,2,1,0]
+// GFX11: v_trunc_f16_e64_dpp v255.h, v127.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x48,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x7f,0x1b,0x00,0xff]
+
+v_trunc_f16 v255.h, vcc_hi
+// GFX11: v_trunc_f16_e64 v255.h, vcc_hi op_sel:[0,1] ; encoding: [0xff,0x40,0xdd,0xd5,0x6b,0x00,0x00,0x00]
+
+v_trunc_f16 v255.h, vcc_lo
+// GFX11: v_trunc_f16_e64 v255.h, vcc_lo op_sel:[0,1] ; encoding: [0xff,0x40,0xdd,0xd5,0x6a,0x00,0x00,0x00]
+
+v_trunc_f16 v255.l, -1
+// GFX11: v_trunc_f16_e64 v255.l, -1              ; encoding: [0xff,0x00,0xdd,0xd5,0xc1,0x00,0x00,0x00]
+
+v_trunc_f16 v255.l, 0.5
+// GFX11: v_trunc_f16_e64 v255.l, 0.5             ; encoding: [0xff,0x00,0xdd,0xd5,0xf0,0x00,0x00,0x00]
 
-v_trunc_f16 v255, -1
-// GFX11: v_trunc_f16_e64 v255, -1                ; encoding: [0xff,0x00,0xdd,0xd5,0xc1,0x00,0x00,0x00]
+v_trunc_f16 v255.l, exec_hi
+// GFX11: v_trunc_f16_e64 v255.l, exec_hi         ; encoding: [0xff,0x00,0xdd,0xd5,0x7f,0x00,0x00,0x00]
 
-v_trunc_f16 v255, 0.5
-// GFX11: v_trunc_f16_e64 v255, 0.5               ; encoding: [0xff,0x00,0xdd,0xd5,0xf0,0x00,0x00,0x00]
+v_trunc_f16 v255.l, exec_lo
+// GFX11: v_trunc_f16_e64 v255.l, exec_lo         ; encoding: [0xff,0x00,0xdd,0xd5,0x7e,0x00,0x00,0x00]
 
-v_trunc_f16 v255, exec_hi
-// GFX11: v_trunc_f16_e64 v255, exec_hi           ; encoding: [0xff,0x00,0xdd,0xd5,0x7f,0x00,0x00,0x00]
+v_trunc_f16 v255.l, m0
+// GFX11: v_trunc_f16_e64 v255.l, m0              ; encoding: [0xff,0x00,0xdd,0xd5,0x7d,0x00,0x00,0x00]
 
-v_trunc_f16 v255, exec_lo
-// GFX11: v_trunc_f16_e64 v255, exec_lo           ; encoding: [0xff,0x00,0xdd,0xd5,0x7e,0x00,0x00,0x00]
+v_trunc_f16 v255.l, null
+// GFX11: v_trunc_f16_e64 v255.l, null            ; encoding: [0xff,0x00,0xdd,0xd5,0x7c,0x00,0x00,0x00]
 
-v_trunc_f16 v255, m0
-// GFX11: v_trunc_f16_e64 v255, m0                ; encoding: [0xff,0x00,0xdd,0xd5,0x7d,0x00,0x00,0x00]
+v_trunc_f16 v255.l, s1
+// GFX11: v_trunc_f16_e64 v255.l, s1              ; encoding: [0xff,0x00,0xdd,0xd5,0x01,0x00,0x00,0x00]
 
-v_trunc_f16 v255, null
-// GFX11: v_trunc_f16_e64 v255, null              ; encoding: [0xff,0x00,0xdd,0xd5,0x7c,0x00,0x00,0x00]
+v_trunc_f16 v255.l, s105
+// GFX11: v_trunc_f16_e64 v255.l, s105            ; encoding: [0xff,0x00,0xdd,0xd5,0x69,0x00,0x00,0x00]
 
-v_trunc_f16 v255, s1
-// GFX11: v_trunc_f16_e64 v255, s1                ; encoding: [0xff,0x00,0xdd,0xd5,0x01,0x00,0x00,0x00]
+v_trunc_f16 v255.l, src_scc
+// GFX11: v_trunc_f16_e64 v255.l, src_scc         ; encoding: [0xff,0x00,0xdd,0xd5,0xfd,0x00,0x00,0x00]
 
-v_trunc_f16 v255, s105
-// GFX11: v_trunc_f16_e64 v255, s105              ; encoding: [0xff,0x00,0xdd,0xd5,0x69,0x00,0x00,0x00]
+v_trunc_f16 v255.l, ttmp15
+// GFX11: v_trunc_f16_e64 v255.l, ttmp15          ; encoding: [0xff,0x00,0xdd,0xd5,0x7b,0x00,0x00,0x00]
 
-v_trunc_f16 v255, src_scc
-// GFX11: v_trunc_f16_e64 v255, src_scc           ; encoding: [0xff,0x00,0xdd,0xd5,0xfd,0x00,0x00,0x00]
+v_trunc_f16 v255.l, v1.l
+// GFX11: v_trunc_f16_e64 v255.l, v1.l            ; encoding: [0xff,0x00,0xdd,0xd5,0x01,0x01,0x00,0x00]
 
-v_trunc_f16 v255, ttmp15
-// GFX11: v_trunc_f16_e64 v255, ttmp15            ; encoding: [0xff,0x00,0xdd,0xd5,0x7b,0x00,0x00,0x00]
+v_trunc_f16 v255.l, v1.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_trunc_f16_e64_dpp v255.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x00,0xdd,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
 
-v_trunc_f16 v255, v1
-// GFX11: v_trunc_f16_e64 v255, v1                ; encoding: [0xff,0x00,0xdd,0xd5,0x01,0x01,0x00,0x00]
+v_trunc_f16 v255.l, v1.l quad_perm:[3,2,1,0]
+// GFX11: v_trunc_f16_e64_dpp v255.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
 
-v_trunc_f16 v255, v1 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_trunc_f16_e64_dpp v255, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x00,0xdd,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+v_trunc_f16 v255.l, v127.l
+// GFX11: v_trunc_f16_e64 v255.l, v127.l          ; encoding: [0xff,0x00,0xdd,0xd5,0x7f,0x01,0x00,0x00]
 
-v_trunc_f16 v255, v1 quad_perm:[3,2,1,0]
-// GFX11: v_trunc_f16_e64_dpp v255, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
+v_trunc_f16 v255.l, v127.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_trunc_f16_e64_dpp v255.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x00,0xdd,0xd5,0xe9,0x00,0x00,0x00,0x7f,0x77,0x39,0x05]
 
-v_trunc_f16 v255, v127
-// GFX11: v_trunc_f16_e64 v255, v127              ; encoding: [0xff,0x00,0xdd,0xd5,0x7f,0x01,0x00,0x00]
+v_trunc_f16 v255.l, v127.l quad_perm:[3,2,1,0]
+// GFX11: v_trunc_f16_e64_dpp v255.l, v127.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x7f,0x1b,0x00,0xff]
 
-v_trunc_f16 v255, v127 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_trunc_f16_e64_dpp v255, v127 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x00,0xdd,0xd5,0xe9,0x00,0x00,0x00,0x7f,0x77,0x39,0x05]
+v_trunc_f16 v255.l, vcc_hi
+// GFX11: v_trunc_f16_e64 v255.l, vcc_hi          ; encoding: [0xff,0x00,0xdd,0xd5,0x6b,0x00,0x00,0x00]
 
-v_trunc_f16 v255, v127 quad_perm:[3,2,1,0]
-// GFX11: v_trunc_f16_e64_dpp v255, v127 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x7f,0x1b,0x00,0xff]
+v_trunc_f16 v255.l, vcc_lo
+// GFX11: v_trunc_f16_e64 v255.l, vcc_lo          ; encoding: [0xff,0x00,0xdd,0xd5,0x6a,0x00,0x00,0x00]
 
-v_trunc_f16 v255, vcc_hi
-// GFX11: v_trunc_f16_e64 v255, vcc_hi            ; encoding: [0xff,0x00,0xdd,0xd5,0x6b,0x00,0x00,0x00]
+v_trunc_f16 v5.h, v199.h
+// GFX11: v_trunc_f16_e64 v5.h, v199.h op_sel:[1,1] ; encoding: [0x05,0x48,0xdd,0xd5,0xc7,0x01,0x00,0x00]
 
-v_trunc_f16 v255, vcc_lo
-// GFX11: v_trunc_f16_e64 v255, vcc_lo            ; encoding: [0xff,0x00,0xdd,0xd5,0x6a,0x00,0x00,0x00]
+v_trunc_f16 v5.h, v199.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_trunc_f16_e64_dpp v5.h, v199.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xdd,0xd5,0xe9,0x00,0x00,0x00,0xc7,0x77,0x39,0x05]
 
-v_trunc_f16 v5, v199
-// GFX11: v_trunc_f16_e64 v5, v199                ; encoding: [0x05,0x00,0xdd,0xd5,0xc7,0x01,0x00,0x00]
+v_trunc_f16 v5.h, v199.h quad_perm:[3,2,1,0]
+// GFX11: v_trunc_f16_e64_dpp v5.h, v199.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x48,0xdd,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff]
 
-v_trunc_f16 v5, v199 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_trunc_f16_e64_dpp v5, v199 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdd,0xd5,0xe9,0x00,0x00,0x00,0xc7,0x77,0x39,0x05]
+v_trunc_f16 v5.l, v199.l
+// GFX11: v_trunc_f16_e64 v5.l, v199.l            ; encoding: [0x05,0x00,0xdd,0xd5,0xc7,0x01,0x00,0x00]
 
+v_trunc_f16 v5.l, v199.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_trunc_f16_e64_dpp v5.l, v199.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdd,0xd5,0xe9,0x00,0x00,0x00,0xc7,0x77,0x39,0x05]
 
-v_trunc_f16 v5, v199 quad_perm:[3,2,1,0]
-// GFX11: v_trunc_f16_e64_dpp v5, v199 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff]
+v_trunc_f16 v5.l, v199.l quad_perm:[3,2,1,0]
+// GFX11: v_trunc_f16_e64_dpp v5.l, v199.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff]
diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop3.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop3.s
index 628edf74..6bc92bc 100644
--- a/llvm/test/MC/AMDGPU/gfx11_asm_vop3.s
+++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop3.s
@@ -2231,53 +2231,77 @@ v_fma_dx9_zero_f32 v5, -src_scc, |vcc_lo|, -1 mul:4
 v_fma_dx9_zero_f32 v255, -|0xaf123456|, -|vcc_hi|, null clamp div:2
 // GFX11: v_fma_dx9_zero_f32 v255, -|0xaf123456|, -|vcc_hi|, null clamp div:2 ; encoding: [0xff,0x83,0x09,0xd6,0xff,0xd6,0xf0,0x79,0x56,0x34,0x12,0xaf]
 
-v_fma_f16 v5, v1, v2, s3
-// GFX11: v_fma_f16 v5, v1, v2, s3                ; encoding: [0x05,0x00,0x48,0xd6,0x01,0x05,0x0e,0x00]
+v_fma_f16 v5.l, v1.l, v2.l, s3
+// GFX11: v_fma_f16 v5.l, v1.l, v2.l, s3          ; encoding: [0x05,0x00,0x48,0xd6,0x01,0x05,0x0e,0x00]
 
-v_fma_f16 v5, v255, s2, s105
-// GFX11: v_fma_f16 v5, v255, s2, s105            ; encoding: [0x05,0x00,0x48,0xd6,0xff,0x05,0xa4,0x01]
+v_fma_f16 v5.l, v255.l, s2, s105
+// GFX11: v_fma_f16 v5.l, v255.l, s2, s105        ; encoding: [0x05,0x00,0x48,0xd6,0xff,0x05,0xa4,0x01]
 
-v_fma_f16 v5, s1, v255, exec_hi
-// GFX11: v_fma_f16 v5, s1, v255, exec_hi         ; encoding: [0x05,0x00,0x48,0xd6,0x01,0xfe,0xff,0x01]
+v_fma_f16 v5.l, s1, v255.l, exec_hi
+// GFX11: v_fma_f16 v5.l, s1, v255.l, exec_hi     ; encoding: [0x05,0x00,0x48,0xd6,0x01,0xfe,0xff,0x01]
 
-v_fma_f16 v5, s105, s105, exec_lo
-// GFX11: v_fma_f16 v5, s105, s105, exec_lo       ; encoding: [0x05,0x00,0x48,0xd6,0x69,0xd2,0xf8,0x01]
+v_fma_f16 v5.l, s105, s105, exec_lo
+// GFX11: v_fma_f16 v5.l, s105, s105, exec_lo     ; encoding: [0x05,0x00,0x48,0xd6,0x69,0xd2,0xf8,0x01]
 
-v_fma_f16 v5, vcc_lo, ttmp15, v3
-// GFX11: v_fma_f16 v5, vcc_lo, ttmp15, v3        ; encoding: [0x05,0x00,0x48,0xd6,0x6a,0xf6,0x0c,0x04]
+v_fma_f16 v5.l, vcc_lo, ttmp15, v3.l
+// GFX11: v_fma_f16 v5.l, vcc_lo, ttmp15, v3.l    ; encoding: [0x05,0x00,0x48,0xd6,0x6a,0xf6,0x0c,0x04]
 
-v_fma_f16 v5, vcc_hi, 0xfe0b, v255
-// GFX11: v_fma_f16 v5, vcc_hi, 0xfe0b, v255      ; encoding: [0x05,0x00,0x48,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
+v_fma_f16 v5.l, vcc_hi, 0xfe0b, v255.l
+// GFX11: v_fma_f16 v5.l, vcc_hi, 0xfe0b, v255.l  ; encoding: [0x05,0x00,0x48,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
 
-v_fma_f16 v5, -|ttmp15|, -|src_scc|, -|ttmp15|
-// GFX11: v_fma_f16 v5, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x48,0xd6,0x7b,0xfa,0xed,0xe1]
+v_fma_f16 v5.l, -|ttmp15|, -|src_scc|, -|ttmp15|
+// GFX11: v_fma_f16 v5.l, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x48,0xd6,0x7b,0xfa,0xed,0xe1]
 
-v_fma_f16 v5, m0, 0.5, m0
-// GFX11: v_fma_f16 v5, m0, 0.5, m0               ; encoding: [0x05,0x00,0x48,0xd6,0x7d,0xe0,0xf5,0x01]
+v_fma_f16 v5.l, m0, 0.5, m0
+// GFX11: v_fma_f16 v5.l, m0, 0.5, m0             ; encoding: [0x05,0x00,0x48,0xd6,0x7d,0xe0,0xf5,0x01]
 
-v_fma_f16 v5, |exec_lo|, -1, vcc_hi
-// GFX11: v_fma_f16 v5, |exec_lo|, -1, vcc_hi     ; encoding: [0x05,0x01,0x48,0xd6,0x7e,0x82,0xad,0x01]
+v_fma_f16 v5.l, |exec_lo|, -1, vcc_hi
+// GFX11: v_fma_f16 v5.l, |exec_lo|, -1, vcc_hi   ; encoding: [0x05,0x01,0x48,0xd6,0x7e,0x82,0xad,0x01]
 
-v_fma_f16 v5, -|exec_hi|, null, -|vcc_lo| op_sel:[1,1,1,1]
-// GFX11: v_fma_f16 v5, -|exec_hi|, null, -|vcc_lo| op_sel:[1,1,1,1] ; encoding: [0x05,0x7d,0x48,0xd6,0x7f,0xf8,0xa8,0xa1]
+v_fma_f16 v5.h, -|exec_hi|, null, -|vcc_lo| op_sel:[1,1,1,1]
+// GFX11: v_fma_f16 v5.h, -|exec_hi|, null, -|vcc_lo| op_sel:[1,1,1,1] ; encoding: [0x05,0x7d,0x48,0xd6,0x7f,0xf8,0xa8,0xa1]
 
-v_fma_f16 v5, null, exec_lo, -|0xfe0b| op_sel:[0,0,0,0]
-// GFX11: v_fma_f16 v5, null, exec_lo, -|0xfe0b|  ; encoding: [0x05,0x04,0x48,0xd6,0x7c,0xfc,0xfc,0x83,0x0b,0xfe,0x00,0x00]
+v_fma_f16 v5.l, null, exec_lo, -|0xfe0b|
+// GFX11: v_fma_f16 v5.l, null, exec_lo, -|0xfe0b| ; encoding: [0x05,0x04,0x48,0xd6,0x7c,0xfc,0xfc,0x83,0x0b,0xfe,0x00,0x00]
 
-v_fma_f16 v5, -1, -|exec_hi|, -|src_scc| op_sel:[1,0,0,0]
-// GFX11: v_fma_f16 v5, -1, -|exec_hi|, -|src_scc| op_sel:[1,0,0,0] ; encoding: [0x05,0x0e,0x48,0xd6,0xc1,0xfe,0xf4,0xc3]
+v_fma_f16 v5.l, -1, -|exec_hi|, -|src_scc| op_sel:[1,0,0,0]
+// GFX11: v_fma_f16 v5.l, -1, -|exec_hi|, -|src_scc| op_sel:[1,0,0,0] ; encoding: [0x05,0x0e,0x48,0xd6,0xc1,0xfe,0xf4,0xc3]
 
-v_fma_f16 v5, 0.5, -m0, 0.5 op_sel:[0,1,0,0]
-// GFX11: v_fma_f16 v5, 0.5, -m0, 0.5 op_sel:[0,1,0,0] ; encoding: [0x05,0x10,0x48,0xd6,0xf0,0xfa,0xc0,0x43]
+v_fma_f16 v5.l, 0.5, -m0, 0.5 op_sel:[0,1,0,0]
+// GFX11: v_fma_f16 v5.l, 0.5, -m0, 0.5 op_sel:[0,1,0,0] ; encoding: [0x05,0x10,0x48,0xd6,0xf0,0xfa,0xc0,0x43]
 
-v_fma_f16 v5, -src_scc, |vcc_lo|, -1 op_sel:[0,0,1,0]
-// GFX11: v_fma_f16 v5, -src_scc, |vcc_lo|, -1 op_sel:[0,0,1,0] ; encoding: [0x05,0x22,0x48,0xd6,0xfd,0xd4,0x04,0x23]
+v_fma_f16 v5.l, -src_scc, |vcc_lo|, -1 op_sel:[0,0,1,0]
+// GFX11: v_fma_f16 v5.l, -src_scc, |vcc_lo|, -1 op_sel:[0,0,1,0] ; encoding: [0x05,0x22,0x48,0xd6,0xfd,0xd4,0x04,0x23]
 
-v_fma_f16 v255, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp
-// GFX11: v_fma_f16 v255, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp ; encoding: [0xff,0xc3,0x48,0xd6,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00]
+v_fma_f16 v255.h, -|0xfe0b|, -|vcc_hi|, null clamp
+// GFX11: v_fma_f16 v255.h, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp ; encoding: [0xff,0xc3,0x48,0xd6,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00]
 
-v_fma_f16 v255, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp div:2
-// GFX11: v_fma_f16 v255, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp div:2 ; encoding: [0xff,0xc3,0x48,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00]
+v_fma_f16 v255.h, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp div:2
+// GFX11: v_fma_f16 v255.h, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp div:2 ; encoding: [0xff,0xc3,0x48,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00]
+
+v_fma_f16 v5.l, v255.h, s2, s105
+// GFX11: v_fma_f16 v5.l, v255.h, s2, s105 op_sel:[1,0,0,0] ; encoding: [0x05,0x08,0x48,0xd6,0xff,0x05,0xa4,0x01]
+
+v_fma_f16 v5.l, s1, v255.h, exec_hi
+// GFX11: v_fma_f16 v5.l, s1, v255.h, exec_hi op_sel:[0,1,0,0] ; encoding: [0x05,0x10,0x48,0xd6,0x01,0xfe,0xff,0x01]
+
+v_fma_f16 v5.l, vcc_hi, 0xfe0b, v255.h
+// GFX11: v_fma_f16 v5.l, vcc_hi, 0xfe0b, v255.h op_sel:[0,0,1,0] ; encoding: [0x05,0x20,0x48,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
+
+v_fma_f16 v5.l, -|exec_hi|, null, -|vcc_lo|
+// GFX11: v_fma_f16 v5.l, -|exec_hi|, null, -|vcc_lo| ; encoding: [0x05,0x05,0x48,0xd6,0x7f,0xf8,0xa8,0xa1]
+
+v_fma_f16 v5.l, -1, -|exec_hi|, -|src_scc|
+// GFX11: v_fma_f16 v5.l, -1, -|exec_hi|, -|src_scc| ; encoding: [0x05,0x06,0x48,0xd6,0xc1,0xfe,0xf4,0xc3]
+
+v_fma_f16 v5.l, 0.5, -m0, 0.5
+// GFX11: v_fma_f16 v5.l, 0.5, -m0, 0.5           ; encoding: [0x05,0x00,0x48,0xd6,0xf0,0xfa,0xc0,0x43]
+
+v_fma_f16 v5.l, -src_scc, |vcc_lo|, -1
+// GFX11: v_fma_f16 v5.l, -src_scc, |vcc_lo|, -1  ; encoding: [0x05,0x02,0x48,0xd6,0xfd,0xd4,0x04,0x23]
+
+v_fma_f16 v255.l, -|0xfe0b|, -|vcc_hi|, null clamp div:2
+// GFX11: v_fma_f16 v255.l, -|0xfe0b|, -|vcc_hi|, null clamp div:2 ; encoding: [0xff,0x83,0x48,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00]
 
 v_fma_f32 v5, v1, v2, s3
 // GFX11: v_fma_f32 v5, v1, v2, s3                ; encoding: [0x05,0x00,0x13,0xd6,0x01,0x05,0x0e,0x00]
@@ -3722,50 +3746,62 @@ v_max_u16 v5.l, v255.l, v255.h
 v_max_u16 v255.h, 0xfe0b, vcc_hi
 // GFX11: v_max_u16 v255.h, 0xfe0b, vcc_hi op_sel:[0,0,1] ; encoding: [0xff,0x40,0x09,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
 
-v_maxmin_f16 v5, v1, v2, s3
-// GFX11: v_maxmin_f16 v5, v1, v2, s3             ; encoding: [0x05,0x00,0x60,0xd6,0x01,0x05,0x0e,0x00]
+v_maxmin_f16 v5.l, v1.l, v2.l, s3
+// GFX11: v_maxmin_f16 v5.l, v1.l, v2.l, s3       ; encoding: [0x05,0x00,0x60,0xd6,0x01,0x05,0x0e,0x00]
 
-v_maxmin_f16 v5, v255, s2, s105
-// GFX11: v_maxmin_f16 v5, v255, s2, s105         ; encoding: [0x05,0x00,0x60,0xd6,0xff,0x05,0xa4,0x01]
+v_maxmin_f16 v5.l, v255.l, s2, s105
+// GFX11: v_maxmin_f16 v5.l, v255.l, s2, s105     ; encoding: [0x05,0x00,0x60,0xd6,0xff,0x05,0xa4,0x01]
 
-v_maxmin_f16 v5, s1, v255, exec_hi
-// GFX11: v_maxmin_f16 v5, s1, v255, exec_hi      ; encoding: [0x05,0x00,0x60,0xd6,0x01,0xfe,0xff,0x01]
+v_maxmin_f16 v5.l, s1, v255.l, exec_hi
+// GFX11: v_maxmin_f16 v5.l, s1, v255.l, exec_hi  ; encoding: [0x05,0x00,0x60,0xd6,0x01,0xfe,0xff,0x01]
 
-v_maxmin_f16 v5, s105, s105, exec_lo
-// GFX11: v_maxmin_f16 v5, s105, s105, exec_lo    ; encoding: [0x05,0x00,0x60,0xd6,0x69,0xd2,0xf8,0x01]
+v_maxmin_f16 v5.l, s105, s105, exec_lo
+// GFX11: v_maxmin_f16 v5.l, s105, s105, exec_lo  ; encoding: [0x05,0x00,0x60,0xd6,0x69,0xd2,0xf8,0x01]
 
-v_maxmin_f16 v5, vcc_lo, ttmp15, v3
-// GFX11: v_maxmin_f16 v5, vcc_lo, ttmp15, v3     ; encoding: [0x05,0x00,0x60,0xd6,0x6a,0xf6,0x0c,0x04]
+v_maxmin_f16 v5.l, vcc_lo, ttmp15, v3.l
+// GFX11: v_maxmin_f16 v5.l, vcc_lo, ttmp15, v3.l ; encoding: [0x05,0x00,0x60,0xd6,0x6a,0xf6,0x0c,0x04]
 
-v_maxmin_f16 v5, vcc_hi, 0xfe0b, v255
-// GFX11: v_maxmin_f16 v5, vcc_hi, 0xfe0b, v255   ; encoding: [0x05,0x00,0x60,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
+v_maxmin_f16 v5.l, vcc_hi, 0xfe0b, v255.l
+// GFX11: v_maxmin_f16 v5.l, vcc_hi, 0xfe0b, v255.l ; encoding: [0x05,0x00,0x60,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
 
-v_maxmin_f16 v5, -|ttmp15|, -|src_scc|, -|ttmp15|
-// GFX11: v_maxmin_f16 v5, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x60,0xd6,0x7b,0xfa,0xed,0xe1]
+v_maxmin_f16 v5.l, -|ttmp15|, -|src_scc|, -|ttmp15|
+// GFX11: v_maxmin_f16 v5.l, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x60,0xd6,0x7b,0xfa,0xed,0xe1]
 
-v_maxmin_f16 v5, m0, 0.5, m0
-// GFX11: v_maxmin_f16 v5, m0, 0.5, m0            ; encoding: [0x05,0x00,0x60,0xd6,0x7d,0xe0,0xf5,0x01]
+v_maxmin_f16 v5.l, m0, 0.5, m0
+// GFX11: v_maxmin_f16 v5.l, m0, 0.5, m0          ; encoding: [0x05,0x00,0x60,0xd6,0x7d,0xe0,0xf5,0x01]
 
-v_maxmin_f16 v5, |exec_lo|, -1, vcc_hi
-// GFX11: v_maxmin_f16 v5, |exec_lo|, -1, vcc_hi  ; encoding: [0x05,0x01,0x60,0xd6,0x7e,0x82,0xad,0x01]
+v_maxmin_f16 v5.l, |exec_lo|, -1, vcc_hi
+// GFX11: v_maxmin_f16 v5.l, |exec_lo|, -1, vcc_hi ; encoding: [0x05,0x01,0x60,0xd6,0x7e,0x82,0xad,0x01]
 
-v_maxmin_f16 v5, -|exec_hi|, null, -|vcc_lo|
-// GFX11: v_maxmin_f16 v5, -|exec_hi|, null, -|vcc_lo| ; encoding: [0x05,0x05,0x60,0xd6,0x7f,0xf8,0xa8,0xa1]
+v_maxmin_f16 v5.l, -|exec_hi|, null, -|vcc_lo|
+// GFX11: v_maxmin_f16 v5.l, -|exec_hi|, null, -|vcc_lo| ; encoding: [0x05,0x05,0x60,0xd6,0x7f,0xf8,0xa8,0xa1]
 
-v_maxmin_f16 v5, null, exec_lo, -|0xfe0b|
-// GFX11: v_maxmin_f16 v5, null, exec_lo, -|0xfe0b| ; encoding: [0x05,0x04,0x60,0xd6,0x7c,0xfc,0xfc,0x83,0x0b,0xfe,0x00,0x00]
+v_maxmin_f16 v5.l, null, exec_lo, -|0xfe0b|
+// GFX11: v_maxmin_f16 v5.l, null, exec_lo, -|0xfe0b| ; encoding: [0x05,0x04,0x60,0xd6,0x7c,0xfc,0xfc,0x83,0x0b,0xfe,0x00,0x00]
 
-v_maxmin_f16 v5, -1, -|exec_hi|, -|src_scc|
-// GFX11: v_maxmin_f16 v5, -1, -|exec_hi|, -|src_scc| ; encoding: [0x05,0x06,0x60,0xd6,0xc1,0xfe,0xf4,0xc3]
+v_maxmin_f16 v5.l, -1, -|exec_hi|, -|src_scc|
+// GFX11: v_maxmin_f16 v5.l, -1, -|exec_hi|, -|src_scc| ; encoding: [0x05,0x06,0x60,0xd6,0xc1,0xfe,0xf4,0xc3]
 
-v_maxmin_f16 v5, 0.5, -m0, 0.5 mul:2
-// GFX11: v_maxmin_f16 v5, 0.5, -m0, 0.5 mul:2    ; encoding: [0x05,0x00,0x60,0xd6,0xf0,0xfa,0xc0,0x4b]
+v_maxmin_f16 v5.l, 0.5, -m0, 0.5 mul:2
+// GFX11: v_maxmin_f16 v5.l, 0.5, -m0, 0.5 mul:2  ; encoding: [0x05,0x00,0x60,0xd6,0xf0,0xfa,0xc0,0x4b]
 
-v_maxmin_f16 v5, -src_scc, |vcc_lo|, -1 mul:4
-// GFX11: v_maxmin_f16 v5, -src_scc, |vcc_lo|, -1 mul:4 ; encoding: [0x05,0x02,0x60,0xd6,0xfd,0xd4,0x04,0x33]
+v_maxmin_f16 v5.l, -src_scc, |vcc_lo|, -1 mul:4
+// GFX11: v_maxmin_f16 v5.l, -src_scc, |vcc_lo|, -1 mul:4 ; encoding: [0x05,0x02,0x60,0xd6,0xfd,0xd4,0x04,0x33]
 
-v_maxmin_f16 v255, -|0xfe0b|, -|vcc_hi|, null clamp div:2
-// GFX11: v_maxmin_f16 v255, -|0xfe0b|, -|vcc_hi|, null clamp div:2 ; encoding: [0xff,0x83,0x60,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00]
+v_maxmin_f16 v255.l, -|0xfe0b|, -|vcc_hi|, null clamp div:2
+// GFX11: v_maxmin_f16 v255.l, -|0xfe0b|, -|vcc_hi|, null clamp div:2 ; encoding: [0xff,0x83,0x60,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00]
+
+v_maxmin_f16 v5.l, v255.h, s2, s105
+// GFX11: v_maxmin_f16 v5.l, v255.h, s2, s105 op_sel:[1,0,0,0] ; encoding: [0x05,0x08,0x60,0xd6,0xff,0x05,0xa4,0x01]
+
+v_maxmin_f16 v5.l, s1, v255.h, exec_hi
+// GFX11: v_maxmin_f16 v5.l, s1, v255.h, exec_hi op_sel:[0,1,0,0] ; encoding: [0x05,0x10,0x60,0xd6,0x01,0xfe,0xff,0x01]
+
+v_maxmin_f16 v5.l, vcc_hi, 0xfe0b, v255.h
+// GFX11: v_maxmin_f16 v5.l, vcc_hi, 0xfe0b, v255.h op_sel:[0,0,1,0] ; encoding: [0x05,0x20,0x60,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
+
+v_maxmin_f16 v255.h, -|0xfe0b|, -|vcc_hi|, null clamp div:2
+// GFX11: v_maxmin_f16 v255.h, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp div:2 ; encoding: [0xff,0xc3,0x60,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00]
 
 v_maxmin_f32 v5, v1, v2, s3
 // GFX11: v_maxmin_f32 v5, v1, v2, s3             ; encoding: [0x05,0x00,0x5e,0xd6,0x01,0x05,0x0e,0x00]
@@ -4799,50 +4835,62 @@ v_min_u16 v5.l, v255.l, v255.h
 v_min_u16 v255.h, 0xfe0b, vcc_hi
 // GFX11: v_min_u16 v255.h, 0xfe0b, vcc_hi op_sel:[0,0,1] ; encoding: [0xff,0x40,0x0b,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
 
-v_minmax_f16 v5, v1, v2, s3
-// GFX11: v_minmax_f16 v5, v1, v2, s3             ; encoding: [0x05,0x00,0x61,0xd6,0x01,0x05,0x0e,0x00]
+v_minmax_f16 v5.l, v1.l, v2.l, s3
+// GFX11: v_minmax_f16 v5.l, v1.l, v2.l, s3       ; encoding: [0x05,0x00,0x61,0xd6,0x01,0x05,0x0e,0x00]
+
+v_minmax_f16 v5.l, v255.l, s2, s105
+// GFX11: v_minmax_f16 v5.l, v255.l, s2, s105     ; encoding: [0x05,0x00,0x61,0xd6,0xff,0x05,0xa4,0x01]
+
+v_minmax_f16 v5.l, s1, v255.l, exec_hi
+// GFX11: v_minmax_f16 v5.l, s1, v255.l, exec_hi  ; encoding: [0x05,0x00,0x61,0xd6,0x01,0xfe,0xff,0x01]
+
+v_minmax_f16 v5.l, s105, s105, exec_lo
+// GFX11: v_minmax_f16 v5.l, s105, s105, exec_lo  ; encoding: [0x05,0x00,0x61,0xd6,0x69,0xd2,0xf8,0x01]
+
+v_minmax_f16 v5.l, vcc_lo, ttmp15, v3.l
+// GFX11: v_minmax_f16 v5.l, vcc_lo, ttmp15, v3.l ; encoding: [0x05,0x00,0x61,0xd6,0x6a,0xf6,0x0c,0x04]
 
-v_minmax_f16 v5, v255, s2, s105
-// GFX11: v_minmax_f16 v5, v255, s2, s105         ; encoding: [0x05,0x00,0x61,0xd6,0xff,0x05,0xa4,0x01]
+v_minmax_f16 v5.l, vcc_hi, 0xfe0b, v255.l
+// GFX11: v_minmax_f16 v5.l, vcc_hi, 0xfe0b, v255.l ; encoding: [0x05,0x00,0x61,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
 
-v_minmax_f16 v5, s1, v255, exec_hi
-// GFX11: v_minmax_f16 v5, s1, v255, exec_hi      ; encoding: [0x05,0x00,0x61,0xd6,0x01,0xfe,0xff,0x01]
+v_minmax_f16 v5.l, -|ttmp15|, -|src_scc|, -|ttmp15|
+// GFX11: v_minmax_f16 v5.l, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x61,0xd6,0x7b,0xfa,0xed,0xe1]
 
-v_minmax_f16 v5, s105, s105, exec_lo
-// GFX11: v_minmax_f16 v5, s105, s105, exec_lo    ; encoding: [0x05,0x00,0x61,0xd6,0x69,0xd2,0xf8,0x01]
+v_minmax_f16 v5.l, m0, 0.5, m0
+// GFX11: v_minmax_f16 v5.l, m0, 0.5, m0          ; encoding: [0x05,0x00,0x61,0xd6,0x7d,0xe0,0xf5,0x01]
 
-v_minmax_f16 v5, vcc_lo, ttmp15, v3
-// GFX11: v_minmax_f16 v5, vcc_lo, ttmp15, v3     ; encoding: [0x05,0x00,0x61,0xd6,0x6a,0xf6,0x0c,0x04]
+v_minmax_f16 v5.l, |exec_lo|, -1, vcc_hi
+// GFX11: v_minmax_f16 v5.l, |exec_lo|, -1, vcc_hi ; encoding: [0x05,0x01,0x61,0xd6,0x7e,0x82,0xad,0x01]
 
-v_minmax_f16 v5, vcc_hi, 0xfe0b, v255
-// GFX11: v_minmax_f16 v5, vcc_hi, 0xfe0b, v255   ; encoding: [0x05,0x00,0x61,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
+v_minmax_f16 v5.l, -|exec_hi|, null, -|vcc_lo|
+// GFX11: v_minmax_f16 v5.l, -|exec_hi|, null, -|vcc_lo| ; encoding: [0x05,0x05,0x61,0xd6,0x7f,0xf8,0xa8,0xa1]
 
-v_minmax_f16 v5, -|ttmp15|, -|src_scc|, -|ttmp15|
-// GFX11: v_minmax_f16 v5, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x61,0xd6,0x7b,0xfa,0xed,0xe1]
+v_minmax_f16 v5.l, null, exec_lo, -|0xfe0b|
+// GFX11: v_minmax_f16 v5.l, null, exec_lo, -|0xfe0b| ; encoding: [0x05,0x04,0x61,0xd6,0x7c,0xfc,0xfc,0x83,0x0b,0xfe,0x00,0x00]
 
-v_minmax_f16 v5, m0, 0.5, m0
-// GFX11: v_minmax_f16 v5, m0, 0.5, m0            ; encoding: [0x05,0x00,0x61,0xd6,0x7d,0xe0,0xf5,0x01]
+v_minmax_f16 v5.l, -1, -|exec_hi|, -|src_scc|
+// GFX11: v_minmax_f16 v5.l, -1, -|exec_hi|, -|src_scc| ; encoding: [0x05,0x06,0x61,0xd6,0xc1,0xfe,0xf4,0xc3]
 
-v_minmax_f16 v5, |exec_lo|, -1, vcc_hi
-// GFX11: v_minmax_f16 v5, |exec_lo|, -1, vcc_hi  ; encoding: [0x05,0x01,0x61,0xd6,0x7e,0x82,0xad,0x01]
+v_minmax_f16 v5.l, 0.5, -m0, 0.5 mul:2
+// GFX11: v_minmax_f16 v5.l, 0.5, -m0, 0.5 mul:2  ; encoding: [0x05,0x00,0x61,0xd6,0xf0,0xfa,0xc0,0x4b]
 
-v_minmax_f16 v5, -|exec_hi|, null, -|vcc_lo|
-// GFX11: v_minmax_f16 v5, -|exec_hi|, null, -|vcc_lo| ; encoding: [0x05,0x05,0x61,0xd6,0x7f,0xf8,0xa8,0xa1]
+v_minmax_f16 v5.l, -src_scc, |vcc_lo|, -1 mul:4
+// GFX11: v_minmax_f16 v5.l, -src_scc, |vcc_lo|, -1 mul:4 ; encoding: [0x05,0x02,0x61,0xd6,0xfd,0xd4,0x04,0x33]
 
-v_minmax_f16 v5, null, exec_lo, -|0xfe0b|
-// GFX11: v_minmax_f16 v5, null, exec_lo, -|0xfe0b| ; encoding: [0x05,0x04,0x61,0xd6,0x7c,0xfc,0xfc,0x83,0x0b,0xfe,0x00,0x00]
+v_minmax_f16 v255.l, -|0xfe0b|, -|vcc_hi|, null clamp div:2
+// GFX11: v_minmax_f16 v255.l, -|0xfe0b|, -|vcc_hi|, null clamp div:2 ; encoding: [0xff,0x83,0x61,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00]
 
-v_minmax_f16 v5, -1, -|exec_hi|, -|src_scc|
-// GFX11: v_minmax_f16 v5, -1, -|exec_hi|, -|src_scc| ; encoding: [0x05,0x06,0x61,0xd6,0xc1,0xfe,0xf4,0xc3]
+v_minmax_f16 v5.l, v255.h, s2, s105
+// GFX11: v_minmax_f16 v5.l, v255.h, s2, s105 op_sel:[1,0,0,0] ; encoding: [0x05,0x08,0x61,0xd6,0xff,0x05,0xa4,0x01]
 
-v_minmax_f16 v5, 0.5, -m0, 0.5 mul:2
-// GFX11: v_minmax_f16 v5, 0.5, -m0, 0.5 mul:2    ; encoding: [0x05,0x00,0x61,0xd6,0xf0,0xfa,0xc0,0x4b]
+v_minmax_f16 v5.l, s1, v255.h, exec_hi
+// GFX11: v_minmax_f16 v5.l, s1, v255.h, exec_hi op_sel:[0,1,0,0] ; encoding: [0x05,0x10,0x61,0xd6,0x01,0xfe,0xff,0x01]
 
-v_minmax_f16 v5, -src_scc, |vcc_lo|, -1 mul:4
-// GFX11: v_minmax_f16 v5, -src_scc, |vcc_lo|, -1 mul:4 ; encoding: [0x05,0x02,0x61,0xd6,0xfd,0xd4,0x04,0x33]
+v_minmax_f16 v5.l, vcc_hi, 0xfe0b, v255.h
+// GFX11: v_minmax_f16 v5.l, vcc_hi, 0xfe0b, v255.h op_sel:[0,0,1,0] ; encoding: [0x05,0x20,0x61,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
 
-v_minmax_f16 v255, -|0xfe0b|, -|vcc_hi|, null clamp div:2
-// GFX11: v_minmax_f16 v255, -|0xfe0b|, -|vcc_hi|, null clamp div:2 ; encoding: [0xff,0x83,0x61,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00]
+v_minmax_f16 v255.h, -|0xfe0b|, -|vcc_hi|, null clamp div:2
+// GFX11: v_minmax_f16 v255.h, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp div:2 ; encoding: [0xff,0xc3,0x61,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00]
 
 v_minmax_f32 v5, v1, v2, s3
 // GFX11: v_minmax_f32 v5, v1, v2, s3             ; encoding: [0x05,0x00,0x5f,0xd6,0x01,0x05,0x0e,0x00]
diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp16.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp16.s
index acbdcfc..5fa1334 100644
--- a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp16.s
+++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp16.s
@@ -1508,47 +1508,83 @@ v_div_fixup_f16_e64_dpp v5.l, -|v1.l|, -|v2.h|, 0.5 row_xmask:0 row_mask:0x1 ban
 v_div_fixup_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
 // GFX11: v_div_fixup_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0xc7,0x54,0xd6,0xfa,0xfe,0xf7,0xe3,0xff,0x6f,0x05,0x30]
 
-v_fma_f16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0]
-// GFX11: v_fma_f16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+v_fma_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf
+// GFX11: v_fma_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
 
-v_fma_f16_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3]
-// GFX11: v_fma_f16_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
+v_fma_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf
+// GFX11: v_fma_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
 
-v_fma_f16_e64_dpp v5, v1, v2, v3 row_mirror
-// GFX11: v_fma_f16_e64_dpp v5, v1, v2, v3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff]
+v_fma_f16_e64_dpp v5.l, v1.l, v2.l, v3.l row_mirror row_mask:0xf bank_mask:0xf
+// GFX11: v_fma_f16_e64_dpp v5.l, v1.l, v2.l, v3.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff]
 
-v_fma_f16_e64_dpp v5, v1, v2, v255 row_half_mirror
-// GFX11: v_fma_f16_e64_dpp v5, v1, v2, v255 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff]
+v_fma_f16_e64_dpp v5.l, v1.l, v2.l, v255.l row_half_mirror row_mask:0xf bank_mask:0xf
+// GFX11: v_fma_f16_e64_dpp v5.l, v1.l, v2.l, v255.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff]
 
-v_fma_f16_e64_dpp v5, v1, v2, s105 row_shl:1
-// GFX11: v_fma_f16_e64_dpp v5, v1, v2, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff]
+v_fma_f16_e64_dpp v5.l, v1.l, v2.l, s105 row_shl:1 row_mask:0xf bank_mask:0xf
+// GFX11: v_fma_f16_e64_dpp v5.l, v1.l, v2.l, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff]
 
-v_fma_f16_e64_dpp v5, v1, v2, vcc_hi row_shl:15
-// GFX11: v_fma_f16_e64_dpp v5, v1, v2, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff]
+v_fma_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi row_shl:15
+// GFX11: v_fma_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff]
 
-v_fma_f16_e64_dpp v5, v1, v2, vcc_lo row_shr:1
-// GFX11: v_fma_f16_e64_dpp v5, v1, v2, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff]
+v_fma_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo row_shr:1
+// GFX11: v_fma_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff]
 
-v_fma_f16_e64_dpp v5, |v1|, v2, -ttmp15 row_shr:15
-// GFX11: v_fma_f16_e64_dpp v5, |v1|, v2, -ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x48,0xd6,0xfa,0x04,0xee,0x81,0x01,0x1f,0x01,0xff]
+v_fma_f16_e64_dpp v5.l, |v1.l|, v2.l, -ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf
+// GFX11: v_fma_f16_e64_dpp v5.l, |v1.l|, v2.l, -ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x48,0xd6,0xfa,0x04,0xee,0x81,0x01,0x1f,0x01,0xff]
 
-v_fma_f16_e64_dpp v5, v1, -|v2|, exec_hi row_ror:1
-// GFX11: v_fma_f16_e64_dpp v5, v1, -|v2|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x48,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff]
+v_fma_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf
+// GFX11: v_fma_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x48,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff]
 
-v_fma_f16_e64_dpp v5, -v1, v2, |exec_lo| row_ror:15
-// GFX11: v_fma_f16_e64_dpp v5, -v1, v2, |exec_lo| row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x04,0x48,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff]
+v_fma_f16_e64_dpp v5.l, -v1.l, v2.l, |exec_lo| row_ror:15 row_mask:0xf bank_mask:0xf
+// GFX11: v_fma_f16_e64_dpp v5.l, -v1.l, v2.l, |exec_lo| row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x04,0x48,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff]
 
-v_fma_f16_e64_dpp v5, -|v1|, -|v2|, null row_share:0 row_mask:0xf bank_mask:0xf
-// GFX11: v_fma_f16_e64_dpp v5, -|v1|, -|v2|, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x03,0x48,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff]
+v_fma_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, null row_share:0 row_mask:0xf bank_mask:0xf
+// GFX11: v_fma_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x03,0x48,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff]
 
-v_fma_f16_e64_dpp v5, -|v1|, v2, -|-1| row_share:15 row_mask:0x0 bank_mask:0x1
-// GFX11: v_fma_f16_e64_dpp v5, -|v1|, v2, -|-1| row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x05,0x48,0xd6,0xfa,0x04,0x06,0xa3,0x01,0x5f,0x01,0x01]
+v_fma_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|-1| row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX11: v_fma_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|-1| row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x05,0x48,0xd6,0xfa,0x04,0x06,0xa3,0x01,0x5f,0x01,0x01]
 
-v_fma_f16_e64_dpp v5, v1, -|v2|, -|0.5| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
-// GFX11: v_fma_f16_e64_dpp v5, v1, -|v2|, -|0.5| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x06,0x48,0xd6,0xfa,0x04,0xc2,0xc3,0x01,0x60,0x09,0x13]
+v_fma_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1
+// GFX11: v_fma_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x06,0x48,0xd6,0xfa,0x04,0xc2,0xc3,0x01,0x60,0x09,0x13]
 
-v_fma_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
-// GFX11: v_fma_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x87,0x48,0xd6,0xfa,0xfe,0xf7,0xe3,0xff,0x6f,0x05,0x30]
+v_fma_f16_e64_dpp v255.l, -|v255.l|, -|v255.l|, -|src_scc| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1
+// GFX11: v_fma_f16_e64_dpp v255.l, -|v255.l|, -|v255.l|, -|src_scc| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x87,0x48,0xd6,0xfa,0xfe,0xf7,0xe3,0xff,0x6f,0x05,0x30]
+
+v_fma_f16_e64_dpp v5.h, v1.h, v2.h, v3.h quad_perm:[3,2,1,0]
+// GFX11: v_fma_f16_e64_dpp v5.h, v1.h, v2.h, v3.h op_sel:[1,1,1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x78,0x48,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+
+v_fma_f16_e64_dpp v5.l, v1.l, v2.l, v255.h quad_perm:[0,1,2,3]
+// GFX11: v_fma_f16_e64_dpp v5.l, v1.l, v2.l, v255.h op_sel:[0,0,1,0] quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x20,0x48,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff]
+
+v_fma_f16_e64_dpp v5.l, v1.l, v2.l, s3 row_mirror
+// GFX11: v_fma_f16_e64_dpp v5.l, v1.l, v2.l, s3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x04,0x0e,0x00,0x01,0x40,0x01,0xff]
+
+v_fma_f16_e64_dpp v5.l, v1.l, v2.l, s105 row_half_mirror
+// GFX11: v_fma_f16_e64_dpp v5.l, v1.l, v2.l, s105 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x41,0x01,0xff]
+
+v_fma_f16_e64_dpp v5.l, v1.l, v2.l, ttmp15 row_shl:1
+// GFX11: v_fma_f16_e64_dpp v5.l, v1.l, v2.l, ttmp15 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x04,0xee,0x01,0x01,0x01,0x01,0xff]
+
+v_fma_f16_e64_dpp v5.l, v1.l, v2.l, -|m0| row_shr:15
+// GFX11: v_fma_f16_e64_dpp v5.l, v1.l, v2.l, -|m0| row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x04,0x48,0xd6,0xfa,0x04,0xf6,0x81,0x01,0x1f,0x01,0xff]
+
+v_fma_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|exec_hi| row_ror:1
+// GFX11: v_fma_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|exec_hi| row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x05,0x48,0xd6,0xfa,0x04,0xfe,0xa1,0x01,0x21,0x01,0xff]
+
+v_fma_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|exec_lo| row_ror:15
+// GFX11: v_fma_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|exec_lo| row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x06,0x48,0xd6,0xfa,0x04,0xfa,0xc1,0x01,0x2f,0x01,0xff]
+
+v_fma_f16_e64_dpp v5.l, |v1.l|, -v2.l, null row_share:0 row_mask:0xf bank_mask:0xf
+// GFX11: v_fma_f16_e64_dpp v5.l, |v1.l|, -v2.l, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x48,0xd6,0xfa,0x04,0xf2,0x41,0x01,0x50,0x01,0xff]
+
+v_fma_f16_e64_dpp v5.l, -v1.h, |v2.l|, -1 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX11: v_fma_f16_e64_dpp v5.l, -v1.h, |v2.l|, -1 op_sel:[1,0,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x0a,0x48,0xd6,0xfa,0x04,0x06,0x23,0x01,0x5f,0x01,0x01]
+
+v_fma_f16_e64_dpp v5.l, -|v1.l|, -|v2.h|, 0.5 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX11: v_fma_f16_e64_dpp v5.l, -|v1.l|, -|v2.h|, 0.5 op_sel:[0,1,0,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x13,0x48,0xd6,0xfa,0x04,0xc2,0x63,0x01,0x60,0x09,0x13]
+
+v_fma_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX11: v_fma_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0xc7,0x48,0xd6,0xfa,0xfe,0xf7,0xe3,0xff,0x6f,0x05,0x30]
 
 v_fma_f32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0]
 // GFX11: v_fma_f32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x13,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
@@ -2624,47 +2660,92 @@ v_max_u16_e64_dpp v5.l, v1.l, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_
 v_max_u16_e64_dpp v255.h, v255.l, v255.l row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
 // GFX11: v_max_u16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x40,0x09,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30]
 
-v_maxmin_f16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0]
-// GFX11: v_maxmin_f16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x60,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[3,2,1,0]
+// GFX11: v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x60,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+
+v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf
+// GFX11: v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x60,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
+
+v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, v3.l row_mirror row_mask:0xf bank_mask:0xf
+// GFX11: v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, v3.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x60,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff]
+
+v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, v255.l row_half_mirror row_mask:0xf bank_mask:0xf
+// GFX11: v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, v255.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x60,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff]
+
+v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, s105 row_shl:1 row_mask:0xf bank_mask:0xf
+// GFX11: v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x60,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff]
+
+v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi row_shl:15
+// GFX11: v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x60,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff]
+
+v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo row_shr:1
+// GFX11: v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x60,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff]
+
+v_maxmin_f16_e64_dpp v5.l, |v1.l|, v2.l, -ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf
+// GFX11: v_maxmin_f16_e64_dpp v5.l, |v1.l|, v2.l, -ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x60,0xd6,0xfa,0x04,0xee,0x81,0x01,0x1f,0x01,0xff]
+
+v_maxmin_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf
+// GFX11: v_maxmin_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x60,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff]
+
+v_maxmin_f16_e64_dpp v5.l, -v1.l, v2.l, |exec_lo| row_ror:15 row_mask:0xf bank_mask:0xf
+// GFX11: v_maxmin_f16_e64_dpp v5.l, -v1.l, v2.l, |exec_lo| row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x04,0x60,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff]
 
-v_maxmin_f16_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3]
-// GFX11: v_maxmin_f16_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x60,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
+v_maxmin_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, null row_share:0 row_mask:0xf bank_mask:0xf
+// GFX11: v_maxmin_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x03,0x60,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff]
 
-v_maxmin_f16_e64_dpp v5, v1, v2, v3 row_mirror
-// GFX11: v_maxmin_f16_e64_dpp v5, v1, v2, v3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x60,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff]
+v_maxmin_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|-1| mul:2 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX11: v_maxmin_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|-1| mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x05,0x60,0xd6,0xfa,0x04,0x06,0xab,0x01,0x5f,0x01,0x01]
 
-v_maxmin_f16_e64_dpp v5, v1, v2, v255 row_half_mirror
-// GFX11: v_maxmin_f16_e64_dpp v5, v1, v2, v255 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x60,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff]
+v_maxmin_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1
+// GFX11: v_maxmin_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x06,0x60,0xd6,0xfa,0x04,0xc2,0xd3,0x01,0x60,0x09,0x13]
 
-v_maxmin_f16_e64_dpp v5, v1, v2, s105 row_shl:1
-// GFX11: v_maxmin_f16_e64_dpp v5, v1, v2, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x60,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff]
+v_maxmin_f16_e64_dpp v255.l, -|v255.l|, -|v255.l|, -|src_scc| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX11: v_maxmin_f16_e64_dpp v255.l, -|v255.l|, -|v255.l|, -|src_scc| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x87,0x60,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x05,0x30]
 
-v_maxmin_f16_e64_dpp v5, v1, v2, vcc_hi row_shl:15
-// GFX11: v_maxmin_f16_e64_dpp v5, v1, v2, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x60,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff]
+v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, v255.l quad_perm:[0,1,2,3]
+// GFX11: v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, v255.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x60,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff]
 
-v_maxmin_f16_e64_dpp v5, v1, v2, vcc_lo row_shr:1
-// GFX11: v_maxmin_f16_e64_dpp v5, v1, v2, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x60,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff]
+v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, s3 row_mirror
+// GFX11: v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, s3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x60,0xd6,0xfa,0x04,0x0e,0x00,0x01,0x40,0x01,0xff]
 
-v_maxmin_f16_e64_dpp v5, |v1|, v2, -ttmp15 row_shr:15
-// GFX11: v_maxmin_f16_e64_dpp v5, |v1|, v2, -ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x60,0xd6,0xfa,0x04,0xee,0x81,0x01,0x1f,0x01,0xff]
+v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, s105 row_half_mirror
+// GFX11: v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, s105 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x60,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x41,0x01,0xff]
 
-v_maxmin_f16_e64_dpp v5, v1, -|v2|, exec_hi row_ror:1
-// GFX11: v_maxmin_f16_e64_dpp v5, v1, -|v2|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x60,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff]
+v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, ttmp15 row_shl:1
+// GFX11: v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, ttmp15 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x60,0xd6,0xfa,0x04,0xee,0x01,0x01,0x01,0x01,0xff]
 
-v_maxmin_f16_e64_dpp v5, -v1, v2, |exec_lo| row_ror:15
-// GFX11: v_maxmin_f16_e64_dpp v5, -v1, v2, |exec_lo| row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x04,0x60,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff]
+v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, -|m0| row_shr:15
+// GFX11: v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, -|m0| row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x04,0x60,0xd6,0xfa,0x04,0xf6,0x81,0x01,0x1f,0x01,0xff]
 
-v_maxmin_f16_e64_dpp v5, -|v1|, -|v2|, null row_share:0 row_mask:0xf bank_mask:0xf
-// GFX11: v_maxmin_f16_e64_dpp v5, -|v1|, -|v2|, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x03,0x60,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff]
+v_maxmin_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|exec_hi| row_ror:1
+// GFX11: v_maxmin_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|exec_hi| row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x05,0x60,0xd6,0xfa,0x04,0xfe,0xa1,0x01,0x21,0x01,0xff]
 
-v_maxmin_f16_e64_dpp v5, -|v1|, v2, -|-1| mul:2 row_share:15 row_mask:0x0 bank_mask:0x1
-// GFX11: v_maxmin_f16_e64_dpp v5, -|v1|, v2, -|-1| mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x05,0x60,0xd6,0xfa,0x04,0x06,0xab,0x01,0x5f,0x01,0x01]
+v_maxmin_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|exec_lo| row_ror:15
+// GFX11: v_maxmin_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|exec_lo| row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x06,0x60,0xd6,0xfa,0x04,0xfa,0xc1,0x01,0x2f,0x01,0xff]
 
-v_maxmin_f16_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
-// GFX11: v_maxmin_f16_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x06,0x60,0xd6,0xfa,0x04,0xc2,0xd3,0x01,0x60,0x09,0x13]
+v_maxmin_f16_e64_dpp v5.l, |v1.l|, -v2.l, null row_share:0 row_mask:0xf bank_mask:0xf
+// GFX11: v_maxmin_f16_e64_dpp v5.l, |v1.l|, -v2.l, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x60,0xd6,0xfa,0x04,0xf2,0x41,0x01,0x50,0x01,0xff]
 
-v_maxmin_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
-// GFX11: v_maxmin_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x87,0x60,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x05,0x30]
+v_maxmin_f16_e64_dpp v5.l, -v1.l, |v2.l|, -1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX11: v_maxmin_f16_e64_dpp v5.l, -v1.l, |v2.l|, -1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x02,0x60,0xd6,0xfa,0x04,0x06,0x2b,0x01,0x5f,0x01,0x01]
+
+v_maxmin_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, 0.5 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX11: v_maxmin_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, 0.5 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x03,0x60,0xd6,0xfa,0x04,0xc2,0x73,0x01,0x60,0x09,0x13]
+
+v_maxmin_f16_e64_dpp v5.h, v1.h, v2.h, v3.h quad_perm:[3,2,1,0]
+// GFX11: v_maxmin_f16_e64_dpp v5.h, v1.h, v2.h, v3.h op_sel:[1,1,1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x78,0x60,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+
+v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, v255.h quad_perm:[0,1,2,3]
+// GFX11: v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, v255.h op_sel:[0,0,1,0] quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x20,0x60,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff]
+
+v_maxmin_f16_e64_dpp v5.l, -v1.h, |v2.l|, -1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX11: v_maxmin_f16_e64_dpp v5.l, -v1.h, |v2.l|, -1 op_sel:[1,0,0,0] mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x0a,0x60,0xd6,0xfa,0x04,0x06,0x2b,0x01,0x5f,0x01,0x01]
+
+v_maxmin_f16_e64_dpp v5.l, -|v1.l|, -|v2.h|, 0.5 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX11: v_maxmin_f16_e64_dpp v5.l, -|v1.l|, -|v2.h|, 0.5 op_sel:[0,1,0,0] mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x13,0x60,0xd6,0xfa,0x04,0xc2,0x73,0x01,0x60,0x09,0x13]
+
+v_maxmin_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX11: v_maxmin_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0xc7,0x60,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x05,0x30]
 
 v_maxmin_f32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0]
 // GFX11: v_maxmin_f32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5e,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
@@ -3668,47 +3749,92 @@ v_min_u16_e64_dpp v5.l, v1.l, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_
 v_min_u16_e64_dpp v255.h, v255.l, v255.l row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
 // GFX11: v_min_u16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x40,0x0b,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30]
 
-v_minmax_f16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0]
-// GFX11: v_minmax_f16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x61,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[3,2,1,0]
+// GFX11: v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x61,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+
+v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf
+// GFX11: v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x61,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
+
+v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, v3.l row_mirror row_mask:0xf bank_mask:0xf
+// GFX11: v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, v3.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x61,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff]
+
+v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, v255.l row_half_mirror row_mask:0xf bank_mask:0xf
+// GFX11: v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, v255.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x61,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff]
+
+v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, s105 row_shl:1 row_mask:0xf bank_mask:0xf
+// GFX11: v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x61,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff]
+
+v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi row_shl:15
+// GFX11: v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x61,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff]
+
+v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo row_shr:1
+// GFX11: v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x61,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff]
+
+v_minmax_f16_e64_dpp v5.l, |v1.l|, v2.l, -ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf
+// GFX11: v_minmax_f16_e64_dpp v5.l, |v1.l|, v2.l, -ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x61,0xd6,0xfa,0x04,0xee,0x81,0x01,0x1f,0x01,0xff]
+
+v_minmax_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf
+// GFX11: v_minmax_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x61,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff]
+
+v_minmax_f16_e64_dpp v5.l, -v1.l, v2.l, |exec_lo| row_ror:15 row_mask:0xf bank_mask:0xf
+// GFX11: v_minmax_f16_e64_dpp v5.l, -v1.l, v2.l, |exec_lo| row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x04,0x61,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff]
+
+v_minmax_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, null row_share:0 row_mask:0xf bank_mask:0xf
+// GFX11: v_minmax_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x03,0x61,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff]
+
+v_minmax_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|-1| mul:2 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX11: v_minmax_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|-1| mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x05,0x61,0xd6,0xfa,0x04,0x06,0xab,0x01,0x5f,0x01,0x01]
+
+v_minmax_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1
+// GFX11: v_minmax_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x06,0x61,0xd6,0xfa,0x04,0xc2,0xd3,0x01,0x60,0x09,0x13]
+
+v_minmax_f16_e64_dpp v255.l, -|v255.l|, -|v255.l|, -|src_scc| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX11: v_minmax_f16_e64_dpp v255.l, -|v255.l|, -|v255.l|, -|src_scc| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x87,0x61,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x05,0x30]
+
+v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, v255.l quad_perm:[0,1,2,3]
+// GFX11: v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, v255.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x61,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff]
+
+v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, s3 row_mirror
+// GFX11: v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, s3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x61,0xd6,0xfa,0x04,0x0e,0x00,0x01,0x40,0x01,0xff]
 
-v_minmax_f16_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3]
-// GFX11: v_minmax_f16_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x61,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
+v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, s105 row_half_mirror
+// GFX11: v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, s105 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x61,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x41,0x01,0xff]
 
-v_minmax_f16_e64_dpp v5, v1, v2, v3 row_mirror
-// GFX11: v_minmax_f16_e64_dpp v5, v1, v2, v3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x61,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff]
+v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, ttmp15 row_shl:1
+// GFX11: v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, ttmp15 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x61,0xd6,0xfa,0x04,0xee,0x01,0x01,0x01,0x01,0xff]
 
-v_minmax_f16_e64_dpp v5, v1, v2, v255 row_half_mirror
-// GFX11: v_minmax_f16_e64_dpp v5, v1, v2, v255 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x61,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff]
+v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, -|m0| row_shr:15
+// GFX11: v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, -|m0| row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x04,0x61,0xd6,0xfa,0x04,0xf6,0x81,0x01,0x1f,0x01,0xff]
 
-v_minmax_f16_e64_dpp v5, v1, v2, s105 row_shl:1
-// GFX11: v_minmax_f16_e64_dpp v5, v1, v2, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x61,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff]
+v_minmax_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|exec_hi| row_ror:1
+// GFX11: v_minmax_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|exec_hi| row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x05,0x61,0xd6,0xfa,0x04,0xfe,0xa1,0x01,0x21,0x01,0xff]
 
-v_minmax_f16_e64_dpp v5, v1, v2, vcc_hi row_shl:15
-// GFX11: v_minmax_f16_e64_dpp v5, v1, v2, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x61,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff]
+v_minmax_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|exec_lo| row_ror:15
+// GFX11: v_minmax_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|exec_lo| row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x06,0x61,0xd6,0xfa,0x04,0xfa,0xc1,0x01,0x2f,0x01,0xff]
 
-v_minmax_f16_e64_dpp v5, v1, v2, vcc_lo row_shr:1
-// GFX11: v_minmax_f16_e64_dpp v5, v1, v2, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x61,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff]
+v_minmax_f16_e64_dpp v5.l, |v1.l|, -v2.l, null row_share:0 row_mask:0xf bank_mask:0xf
+// GFX11: v_minmax_f16_e64_dpp v5.l, |v1.l|, -v2.l, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x61,0xd6,0xfa,0x04,0xf2,0x41,0x01,0x50,0x01,0xff]
 
-v_minmax_f16_e64_dpp v5, |v1|, v2, -ttmp15 row_shr:15
-// GFX11: v_minmax_f16_e64_dpp v5, |v1|, v2, -ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x61,0xd6,0xfa,0x04,0xee,0x81,0x01,0x1f,0x01,0xff]
+v_minmax_f16_e64_dpp v5.l, -v1.l, |v2.l|, -1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX11: v_minmax_f16_e64_dpp v5.l, -v1.l, |v2.l|, -1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x02,0x61,0xd6,0xfa,0x04,0x06,0x2b,0x01,0x5f,0x01,0x01]
 
-v_minmax_f16_e64_dpp v5, v1, -|v2|, exec_hi row_ror:1
-// GFX11: v_minmax_f16_e64_dpp v5, v1, -|v2|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x61,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff]
+v_minmax_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, 0.5 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX11: v_minmax_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, 0.5 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x03,0x61,0xd6,0xfa,0x04,0xc2,0x73,0x01,0x60,0x09,0x13]
 
-v_minmax_f16_e64_dpp v5, -v1, v2, |exec_lo| row_ror:15
-// GFX11: v_minmax_f16_e64_dpp v5, -v1, v2, |exec_lo| row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x04,0x61,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff]
+v_minmax_f16_e64_dpp v5.h, v1.h, v2.h, v3.h quad_perm:[3,2,1,0]
+// GFX11: v_minmax_f16_e64_dpp v5.h, v1.h, v2.h, v3.h op_sel:[1,1,1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x78,0x61,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
 
-v_minmax_f16_e64_dpp v5, -|v1|, -|v2|, null row_share:0 row_mask:0xf bank_mask:0xf
-// GFX11: v_minmax_f16_e64_dpp v5, -|v1|, -|v2|, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x03,0x61,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff]
+v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, v255.h quad_perm:[0,1,2,3]
+// GFX11: v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, v255.h op_sel:[0,0,1,0] quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x20,0x61,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff]
 
-v_minmax_f16_e64_dpp v5, -|v1|, v2, -|-1| mul:2 row_share:15 row_mask:0x0 bank_mask:0x1
-// GFX11: v_minmax_f16_e64_dpp v5, -|v1|, v2, -|-1| mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x05,0x61,0xd6,0xfa,0x04,0x06,0xab,0x01,0x5f,0x01,0x01]
+v_minmax_f16_e64_dpp v5.l, -v1.h, |v2.l|, -1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX11: v_minmax_f16_e64_dpp v5.l, -v1.h, |v2.l|, -1 op_sel:[1,0,0,0] mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x0a,0x61,0xd6,0xfa,0x04,0x06,0x2b,0x01,0x5f,0x01,0x01]
 
-v_minmax_f16_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
-// GFX11: v_minmax_f16_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x06,0x61,0xd6,0xfa,0x04,0xc2,0xd3,0x01,0x60,0x09,0x13]
+v_minmax_f16_e64_dpp v5.l, -|v1.l|, -|v2.h|, 0.5 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX11: v_minmax_f16_e64_dpp v5.l, -|v1.l|, -|v2.h|, 0.5 op_sel:[0,1,0,0] mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x13,0x61,0xd6,0xfa,0x04,0xc2,0x73,0x01,0x60,0x09,0x13]
 
-v_minmax_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
-// GFX11: v_minmax_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x87,0x61,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x05,0x30]
+v_minmax_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX11: v_minmax_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0xc7,0x61,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x05,0x30]
 
 v_minmax_f32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0]
 // GFX11: v_minmax_f32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5f,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
@@ -4833,20 +4959,20 @@ v_div_fixup_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| op_sel:[0,0,1,0] row_xmask:0
 v_div_fixup_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1
 // GFX11: v_div_fixup_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc7,0x54,0xd6,0xfa,0xfe,0xf7,0xe3,0xff,0x6f,0x0d,0x30]
 
-v_fma_f16_e64_dpp v5, -v1, v2, |exec_lo| op_sel:[1,1,1,1] row_ror:15 row_mask:0xf bank_mask:0xf
-// GFX11: v_fma_f16_e64_dpp v5, -v1, v2, |exec_lo| op_sel:[1,1,1,1] row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x7c,0x48,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff]
+v_fma_f16_e64_dpp v5.h, -v1.h, v2.h, |exec_lo| op_sel:[1,1,1,1] row_ror:15 row_mask:0xf bank_mask:0xf
+// GFX11: v_fma_f16_e64_dpp v5.h, -v1.h, v2.h, |exec_lo| op_sel:[1,1,1,1] row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x7c,0x48,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff]
 
-v_fma_f16_e64_dpp v5, -|v1|, -|v2|, null op_sel:[1,0,0,0] row_share:0 row_mask:0xf bank_mask:0xf
-// GFX11: v_fma_f16_e64_dpp v5, -|v1|, -|v2|, null op_sel:[1,0,0,0] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x0b,0x48,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff]
+v_fma_f16_e64_dpp v5.l, -|v1.h|, -|v2.l|, null op_sel:[1,0,0,0] row_share:0 row_mask:0xf bank_mask:0xf
+// GFX11: v_fma_f16_e64_dpp v5.l, -|v1.h|, -|v2.l|, null op_sel:[1,0,0,0] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x0b,0x48,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff]
 
-v_fma_f16_e64_dpp v5, -|v1|, v2, -|-1| op_sel:[0,1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1
-// GFX11: v_fma_f16_e64_dpp v5, -|v1|, v2, -|-1| op_sel:[0,1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x15,0x48,0xd6,0xfa,0x04,0x06,0xa3,0x01,0x5f,0x01,0x01]
+v_fma_f16_e64_dpp v5.l, -|v1.l|, v2.h, -|-1| op_sel:[0,1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX11: v_fma_f16_e64_dpp v5.l, -|v1.l|, v2.h, -|-1| op_sel:[0,1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x15,0x48,0xd6,0xfa,0x04,0x06,0xa3,0x01,0x5f,0x01,0x01]
 
-v_fma_f16_e64_dpp v5, v1, -|v2|, -|0.5| op_sel:[0,0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3
-// GFX11: v_fma_f16_e64_dpp v5, v1, -|v2|, -|0.5| op_sel:[0,0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x26,0x48,0xd6,0xfa,0x04,0xc2,0xc3,0x01,0x60,0x01,0x13]
+v_fma_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| op_sel:[0,0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3
+// GFX11: v_fma_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| op_sel:[0,0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x26,0x48,0xd6,0xfa,0x04,0xc2,0xc3,0x01,0x60,0x01,0x13]
 
-v_fma_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1
-// GFX11: v_fma_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc7,0x48,0xd6,0xfa,0xfe,0xf7,0xe3,0xff,0x6f,0x0d,0x30]
+v_fma_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1
+// GFX11: v_fma_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc7,0x48,0xd6,0xfa,0xfe,0xf7,0xe3,0xff,0x6f,0x0d,0x30]
 
 v_mad_i16_e64_dpp v5.h, v1.h, v2.h, exec_hi op_sel:[1,1,1,1] row_ror:15 row_mask:0xf bank_mask:0xf
 // GFX11: v_mad_i16_e64_dpp v5.h, v1.h, v2.h, exec_hi op_sel:[1,1,1,1] row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x78,0x53,0xd6,0xfa,0x04,0xfe,0x01,0x01,0x2f,0x01,0xff]
diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp16_from_vop1.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp16_from_vop1.s
index f38ff6a..c26834c 100644
--- a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp16_from_vop1.s
+++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp16_from_vop1.s
@@ -220,47 +220,56 @@ v_clz_i32_u32_e64_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1
 v_clz_i32_u32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
 // GFX11: v_clz_i32_u32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x00,0xb9,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x05,0x30]
 
-v_cos_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0]
-// GFX11: v_cos_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
+v_cos_f16_e64_dpp v5.l, v1.l quad_perm:[3,2,1,0]
+// GFX11: v_cos_f16_e64_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
 
-v_cos_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3]
-// GFX11: v_cos_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
+v_cos_f16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3]
+// GFX11: v_cos_f16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
 
-v_cos_f16_e64_dpp v5, v1 row_mirror
-// GFX11: v_cos_f16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
+v_cos_f16_e64_dpp v5.l, v1.l row_mirror
+// GFX11: v_cos_f16_e64_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
 
-v_cos_f16_e64_dpp v5, v1 row_half_mirror
-// GFX11: v_cos_f16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff]
+v_cos_f16_e64_dpp v5.l, v1.l row_half_mirror
+// GFX11: v_cos_f16_e64_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff]
 
-v_cos_f16_e64_dpp v5, v1 row_shl:1
-// GFX11: v_cos_f16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff]
+v_cos_f16_e64_dpp v5.l, v1.l row_shl:1
+// GFX11: v_cos_f16_e64_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff]
 
-v_cos_f16_e64_dpp v5, v1 row_shl:15
-// GFX11: v_cos_f16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff]
+v_cos_f16_e64_dpp v5.l, v1.l row_shl:15
+// GFX11: v_cos_f16_e64_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff]
 
-v_cos_f16_e64_dpp v5, v1 row_shr:1
-// GFX11: v_cos_f16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff]
+v_cos_f16_e64_dpp v5.l, v1.l row_shr:1
+// GFX11: v_cos_f16_e64_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff]
 
-v_cos_f16_e64_dpp v5, v1 row_shr:15
-// GFX11: v_cos_f16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]
+v_cos_f16_e64_dpp v5.l, v1.l row_shr:15
+// GFX11: v_cos_f16_e64_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]
 
-v_cos_f16_e64_dpp v5, v1 row_ror:1
-// GFX11: v_cos_f16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff]
+v_cos_f16_e64_dpp v5.l, v1.l row_ror:1
+// GFX11: v_cos_f16_e64_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff]
 
-v_cos_f16_e64_dpp v5, v1 row_ror:15
-// GFX11: v_cos_f16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff]
+v_cos_f16_e64_dpp v5.l, v1.l row_ror:15
+// GFX11: v_cos_f16_e64_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff]
 
-v_cos_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
-// GFX11: v_cos_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
+v_cos_f16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf
+// GFX11: v_cos_f16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
 
-v_cos_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1
-// GFX11: v_cos_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
+v_cos_f16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX11: v_cos_f16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
 
-v_cos_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
-// GFX11: v_cos_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13]
+v_cos_f16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX11: v_cos_f16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13]
 
-v_cos_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
-// GFX11: v_cos_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xe1,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30]
+v_cos_f16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX11: v_cos_f16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xe1,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30]
+
+v_cos_f16_e64_dpp v5.h, v1.h mul:2 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX11: [0x05,0x48,0xe1,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
+
+v_cos_f16_e64_dpp v5.l, v1.h mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX11: [0x05,0x08,0xe1,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13]
+
+v_cos_f16_e64_dpp v255.h, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX11: [0xff,0xc1,0xe1,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30]
 
 v_cos_f32_e64_dpp v5, v1 quad_perm:[3,2,1,0]
 // GFX11: v_cos_f32_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xb6,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
@@ -967,47 +976,50 @@ v_cvt_i32_f32_e64_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1
 v_cvt_i32_f32_e64_dpp v255, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
 // GFX11: v_cvt_i32_f32_e64_dpp v255, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0x88,0xd5,0xfa,0x00,0x00,0x20,0xff,0x6f,0x05,0x30]
 
-v_cvt_i32_i16_e64_dpp v5, v1 quad_perm:[3,2,1,0]
-// GFX11: v_cvt_i32_i16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
+v_cvt_i32_i16_e64_dpp v5, v1.l quad_perm:[3,2,1,0]
+// GFX11: v_cvt_i32_i16_e64_dpp v5, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
 
-v_cvt_i32_i16_e64_dpp v5, v1 quad_perm:[0,1,2,3]
-// GFX11: v_cvt_i32_i16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
+v_cvt_i32_i16_e64_dpp v5, v1.l quad_perm:[0,1,2,3]
+// GFX11: v_cvt_i32_i16_e64_dpp v5, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
 
-v_cvt_i32_i16_e64_dpp v5, v1 row_mirror
-// GFX11: v_cvt_i32_i16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
+v_cvt_i32_i16_e64_dpp v5, v1.l row_mirror
+// GFX11: v_cvt_i32_i16_e64_dpp v5, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
 
-v_cvt_i32_i16_e64_dpp v5, v1 row_half_mirror
-// GFX11: v_cvt_i32_i16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff]
+v_cvt_i32_i16_e64_dpp v5, v1.l row_half_mirror
+// GFX11: v_cvt_i32_i16_e64_dpp v5, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff]
 
-v_cvt_i32_i16_e64_dpp v5, v1 row_shl:1
-// GFX11: v_cvt_i32_i16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff]
+v_cvt_i32_i16_e64_dpp v5, v1.l row_shl:1
+// GFX11: v_cvt_i32_i16_e64_dpp v5, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff]
 
-v_cvt_i32_i16_e64_dpp v5, v1 row_shl:15
-// GFX11: v_cvt_i32_i16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff]
+v_cvt_i32_i16_e64_dpp v5, v1.l row_shl:15
+// GFX11: v_cvt_i32_i16_e64_dpp v5, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff]
 
-v_cvt_i32_i16_e64_dpp v5, v1 row_shr:1
-// GFX11: v_cvt_i32_i16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff]
+v_cvt_i32_i16_e64_dpp v5, v1.l row_shr:1
+// GFX11: v_cvt_i32_i16_e64_dpp v5, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff]
 
-v_cvt_i32_i16_e64_dpp v5, v1 row_shr:15
-// GFX11: v_cvt_i32_i16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]
+v_cvt_i32_i16_e64_dpp v5, v1.l row_shr:15
+// GFX11: v_cvt_i32_i16_e64_dpp v5, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]
 
-v_cvt_i32_i16_e64_dpp v5, v1 row_ror:1
-// GFX11: v_cvt_i32_i16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff]
+v_cvt_i32_i16_e64_dpp v5, v1.l row_ror:1
+// GFX11: v_cvt_i32_i16_e64_dpp v5, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff]
 
-v_cvt_i32_i16_e64_dpp v5, v1 row_ror:15
-// GFX11: v_cvt_i32_i16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff]
+v_cvt_i32_i16_e64_dpp v5, v1.l row_ror:15
+// GFX11: v_cvt_i32_i16_e64_dpp v5, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff]
 
-v_cvt_i32_i16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
-// GFX11: v_cvt_i32_i16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
+v_cvt_i32_i16_e64_dpp v5, v1.l row_share:0 row_mask:0xf bank_mask:0xf
+// GFX11: v_cvt_i32_i16_e64_dpp v5, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
 
-v_cvt_i32_i16_e64_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1
-// GFX11: v_cvt_i32_i16_e64_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x5f,0x01,0x01]
+v_cvt_i32_i16_e64_dpp v5, v1.l row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX11: v_cvt_i32_i16_e64_dpp v5, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x5f,0x01,0x01]
 
-v_cvt_i32_i16_e64_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
-// GFX11: v_cvt_i32_i16_e64_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x60,0x09,0x13]
+v_cvt_i32_i16_e64_dpp v5, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX11: v_cvt_i32_i16_e64_dpp v5, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x60,0x09,0x13]
 
-v_cvt_i32_i16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
-// GFX11: v_cvt_i32_i16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x05,0x30]
+v_cvt_i32_i16_e64_dpp v255, v255.l row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX11: v_cvt_i32_i16_e64_dpp v255, v255.l row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x05,0x30]
+
+v_cvt_i32_i16_e64_dpp v255, v255.h row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX11: [0xff,0x08,0xea,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x05,0x30]
 
 v_cvt_nearest_i32_f32_e64_dpp v5, v1 quad_perm:[3,2,1,0]
 // GFX11: v_cvt_nearest_i32_f32_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x8c,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
@@ -1330,47 +1342,50 @@ v_cvt_u32_f32_e64_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1
 v_cvt_u32_f32_e64_dpp v255, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
 // GFX11: v_cvt_u32_f32_e64_dpp v255, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0x87,0xd5,0xfa,0x00,0x00,0x20,0xff,0x6f,0x05,0x30]
 
-v_cvt_u32_u16_e64_dpp v5, v1 quad_perm:[3,2,1,0]
-// GFX11: v_cvt_u32_u16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
+v_cvt_u32_u16_e64_dpp v5, v1.l quad_perm:[3,2,1,0]
+// GFX11: v_cvt_u32_u16_e64_dpp v5, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
+
+v_cvt_u32_u16_e64_dpp v5, v1.l quad_perm:[0,1,2,3]
+// GFX11: v_cvt_u32_u16_e64_dpp v5, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
 
-v_cvt_u32_u16_e64_dpp v5, v1 quad_perm:[0,1,2,3]
-// GFX11: v_cvt_u32_u16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
+v_cvt_u32_u16_e64_dpp v5, v1.l row_mirror
+// GFX11: v_cvt_u32_u16_e64_dpp v5, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
 
-v_cvt_u32_u16_e64_dpp v5, v1 row_mirror
-// GFX11: v_cvt_u32_u16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
+v_cvt_u32_u16_e64_dpp v5, v1.l row_half_mirror
+// GFX11: v_cvt_u32_u16_e64_dpp v5, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff]
 
-v_cvt_u32_u16_e64_dpp v5, v1 row_half_mirror
-// GFX11: v_cvt_u32_u16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff]
+v_cvt_u32_u16_e64_dpp v5, v1.l row_shl:1
+// GFX11: v_cvt_u32_u16_e64_dpp v5, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff]
 
-v_cvt_u32_u16_e64_dpp v5, v1 row_shl:1
-// GFX11: v_cvt_u32_u16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff]
+v_cvt_u32_u16_e64_dpp v5, v1.l row_shl:15
+// GFX11: v_cvt_u32_u16_e64_dpp v5, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff]
 
-v_cvt_u32_u16_e64_dpp v5, v1 row_shl:15
-// GFX11: v_cvt_u32_u16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff]
+v_cvt_u32_u16_e64_dpp v5, v1.l row_shr:1
+// GFX11: v_cvt_u32_u16_e64_dpp v5, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff]
 
-v_cvt_u32_u16_e64_dpp v5, v1 row_shr:1
-// GFX11: v_cvt_u32_u16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff]
+v_cvt_u32_u16_e64_dpp v5, v1.l row_shr:15
+// GFX11: v_cvt_u32_u16_e64_dpp v5, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]
 
-v_cvt_u32_u16_e64_dpp v5, v1 row_shr:15
-// GFX11: v_cvt_u32_u16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]
+v_cvt_u32_u16_e64_dpp v5, v1.l row_ror:1
+// GFX11: v_cvt_u32_u16_e64_dpp v5, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff]
 
-v_cvt_u32_u16_e64_dpp v5, v1 row_ror:1
-// GFX11: v_cvt_u32_u16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff]
+v_cvt_u32_u16_e64_dpp v5, v1.l row_ror:15
+// GFX11: v_cvt_u32_u16_e64_dpp v5, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff]
 
-v_cvt_u32_u16_e64_dpp v5, v1 row_ror:15
-// GFX11: v_cvt_u32_u16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff]
+v_cvt_u32_u16_e64_dpp v5, v1.l row_share:0 row_mask:0xf bank_mask:0xf
+// GFX11: v_cvt_u32_u16_e64_dpp v5, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
 
-v_cvt_u32_u16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
-// GFX11: v_cvt_u32_u16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
+v_cvt_u32_u16_e64_dpp v5, v1.l row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX11: v_cvt_u32_u16_e64_dpp v5, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x5f,0x01,0x01]
 
-v_cvt_u32_u16_e64_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1
-// GFX11: v_cvt_u32_u16_e64_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x5f,0x01,0x01]
+v_cvt_u32_u16_e64_dpp v5, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX11: v_cvt_u32_u16_e64_dpp v5, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x60,0x09,0x13]
 
-v_cvt_u32_u16_e64_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
-// GFX11: v_cvt_u32_u16_e64_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x60,0x09,0x13]
+v_cvt_u32_u16_e64_dpp v255, v255.l row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX11: v_cvt_u32_u16_e64_dpp v255, v255.l row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x05,0x30]
 
-v_cvt_u32_u16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
-// GFX11: v_cvt_u32_u16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x05,0x30]
+v_cvt_u32_u16_e64_dpp v255, v255.h row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX11: [0xff,0x08,0xeb,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x05,0x30]
 
 v_exp_f16_e64_dpp v5.l, v1.l quad_perm:[3,2,1,0]
 // GFX11: v_exp_f16_e64_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd8,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
@@ -1684,47 +1699,56 @@ v_floor_f32_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ct
 v_floor_f32_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
 // GFX11: v_floor_f32_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xa4,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30]
 
-v_fract_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0]
-// GFX11: v_fract_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
+v_fract_f16_e64_dpp v5.l, v1.l quad_perm:[3,2,1,0]
+// GFX11: v_fract_f16_e64_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
+
+v_fract_f16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3]
+// GFX11: v_fract_f16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
 
-v_fract_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3]
-// GFX11: v_fract_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
+v_fract_f16_e64_dpp v5.l, v1.l row_mirror
+// GFX11: v_fract_f16_e64_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
 
-v_fract_f16_e64_dpp v5, v1 row_mirror
-// GFX11: v_fract_f16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
+v_fract_f16_e64_dpp v5.l, v1.l row_half_mirror
+// GFX11: v_fract_f16_e64_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff]
 
-v_fract_f16_e64_dpp v5, v1 row_half_mirror
-// GFX11: v_fract_f16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff]
+v_fract_f16_e64_dpp v5.l, v1.l row_shl:1
+// GFX11: v_fract_f16_e64_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff]
 
-v_fract_f16_e64_dpp v5, v1 row_shl:1
-// GFX11: v_fract_f16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff]
+v_fract_f16_e64_dpp v5.l, v1.l row_shl:15
+// GFX11: v_fract_f16_e64_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff]
 
-v_fract_f16_e64_dpp v5, v1 row_shl:15
-// GFX11: v_fract_f16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff]
+v_fract_f16_e64_dpp v5.l, v1.l row_shr:1
+// GFX11: v_fract_f16_e64_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff]
 
-v_fract_f16_e64_dpp v5, v1 row_shr:1
-// GFX11: v_fract_f16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff]
+v_fract_f16_e64_dpp v5.l, v1.l row_shr:15
+// GFX11: v_fract_f16_e64_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]
 
-v_fract_f16_e64_dpp v5, v1 row_shr:15
-// GFX11: v_fract_f16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]
+v_fract_f16_e64_dpp v5.l, v1.l row_ror:1
+// GFX11: v_fract_f16_e64_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff]
 
-v_fract_f16_e64_dpp v5, v1 row_ror:1
-// GFX11: v_fract_f16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff]
+v_fract_f16_e64_dpp v5.l, v1.l row_ror:15
+// GFX11: v_fract_f16_e64_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff]
 
-v_fract_f16_e64_dpp v5, v1 row_ror:15
-// GFX11: v_fract_f16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff]
+v_fract_f16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf
+// GFX11: v_fract_f16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
 
-v_fract_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
-// GFX11: v_fract_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
+v_fract_f16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX11: v_fract_f16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
 
-v_fract_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1
-// GFX11: v_fract_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
+v_fract_f16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX11: v_fract_f16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13]
 
-v_fract_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
-// GFX11: v_fract_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13]
+v_fract_f16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX11: v_fract_f16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xdf,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30]
 
-v_fract_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
-// GFX11: v_fract_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xdf,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30]
+v_fract_f16_e64_dpp v5.h, v1.h mul:2 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX11: [0x05,0x48,0xdf,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
+
+v_fract_f16_e64_dpp v5.l, v1.h mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX11: [0x05,0x08,0xdf,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13]
+
+v_fract_f16_e64_dpp v255.h, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX11: [0xff,0xc1,0xdf,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30]
 
 v_fract_f32_e64_dpp v5, v1 quad_perm:[3,2,1,0]
 // GFX11: v_fract_f32_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xa0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
@@ -1861,47 +1885,56 @@ v_frexp_exp_i32_f32_e64_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_
 v_frexp_exp_i32_f32_e64_dpp v255, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
 // GFX11: v_frexp_exp_i32_f32_e64_dpp v255, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x01,0xbf,0xd5,0xfa,0x00,0x00,0x20,0xff,0x6f,0x05,0x30]
 
-v_frexp_mant_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0]
-// GFX11: v_frexp_mant_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
+v_frexp_mant_f16_e64_dpp v5.l, v1.l quad_perm:[3,2,1,0]
+// GFX11: v_frexp_mant_f16_e64_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
+
+v_frexp_mant_f16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3]
+// GFX11: v_frexp_mant_f16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
 
-v_frexp_mant_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3]
-// GFX11: v_frexp_mant_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
+v_frexp_mant_f16_e64_dpp v5.l, v1.l row_mirror
+// GFX11: v_frexp_mant_f16_e64_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
 
-v_frexp_mant_f16_e64_dpp v5, v1 row_mirror
-// GFX11: v_frexp_mant_f16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
+v_frexp_mant_f16_e64_dpp v5.l, v1.l row_half_mirror
+// GFX11: v_frexp_mant_f16_e64_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff]
 
-v_frexp_mant_f16_e64_dpp v5, v1 row_half_mirror
-// GFX11: v_frexp_mant_f16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff]
+v_frexp_mant_f16_e64_dpp v5.l, v1.l row_shl:1
+// GFX11: v_frexp_mant_f16_e64_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff]
 
-v_frexp_mant_f16_e64_dpp v5, v1 row_shl:1
-// GFX11: v_frexp_mant_f16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff]
+v_frexp_mant_f16_e64_dpp v5.l, v1.l row_shl:15
+// GFX11: v_frexp_mant_f16_e64_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff]
 
-v_frexp_mant_f16_e64_dpp v5, v1 row_shl:15
-// GFX11: v_frexp_mant_f16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff]
+v_frexp_mant_f16_e64_dpp v5.l, v1.l row_shr:1
+// GFX11: v_frexp_mant_f16_e64_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff]
 
-v_frexp_mant_f16_e64_dpp v5, v1 row_shr:1
-// GFX11: v_frexp_mant_f16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff]
+v_frexp_mant_f16_e64_dpp v5.l, v1.l row_shr:15
+// GFX11: v_frexp_mant_f16_e64_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]
 
-v_frexp_mant_f16_e64_dpp v5, v1 row_shr:15
-// GFX11: v_frexp_mant_f16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]
+v_frexp_mant_f16_e64_dpp v5.l, v1.l row_ror:1
+// GFX11: v_frexp_mant_f16_e64_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff]
 
-v_frexp_mant_f16_e64_dpp v5, v1 row_ror:1
-// GFX11: v_frexp_mant_f16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff]
+v_frexp_mant_f16_e64_dpp v5.l, v1.l row_ror:15
+// GFX11: v_frexp_mant_f16_e64_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff]
 
-v_frexp_mant_f16_e64_dpp v5, v1 row_ror:15
-// GFX11: v_frexp_mant_f16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff]
+v_frexp_mant_f16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf
+// GFX11: v_frexp_mant_f16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
 
-v_frexp_mant_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
-// GFX11: v_frexp_mant_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
+v_frexp_mant_f16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX11: v_frexp_mant_f16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
 
-v_frexp_mant_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1
-// GFX11: v_frexp_mant_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
+v_frexp_mant_f16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX11: v_frexp_mant_f16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13]
 
-v_frexp_mant_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
-// GFX11: v_frexp_mant_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13]
+v_frexp_mant_f16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX11: v_frexp_mant_f16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xd9,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30]
 
-v_frexp_mant_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
-// GFX11: v_frexp_mant_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xd9,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30]
+v_frexp_mant_f16_e64_dpp v5.h, v1.h mul:2 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX11: [0x05,0x48,0xd9,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
+
+v_frexp_mant_f16_e64_dpp v5.l, v1.h mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX11: [0x05,0x08,0xd9,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13]
+
+v_frexp_mant_f16_e64_dpp v255.h, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX11: [0xff,0xc1,0xd9,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30]
 
 v_frexp_mant_f32_e64_dpp v5, v1 quad_perm:[3,2,1,0]
 // GFX11: v_frexp_mant_f32_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xc0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
@@ -2248,47 +2281,56 @@ v_movrelsd_b32_e64_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:
 v_movrelsd_b32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
 // GFX11: v_movrelsd_b32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x00,0xc4,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x05,0x30]
 
-v_not_b16_e64_dpp v5, v1 quad_perm:[3,2,1,0]
-// GFX11: v_not_b16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
+v_not_b16_e64_dpp v5.l, v1.l quad_perm:[3,2,1,0]
+// GFX11: v_not_b16_e64_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
+
+v_not_b16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3]
+// GFX11: v_not_b16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
 
-v_not_b16_e64_dpp v5, v1 quad_perm:[0,1,2,3]
-// GFX11: v_not_b16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
+v_not_b16_e64_dpp v5.l, v1.l row_mirror
+// GFX11: v_not_b16_e64_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
 
-v_not_b16_e64_dpp v5, v1 row_mirror
-// GFX11: v_not_b16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
+v_not_b16_e64_dpp v5.l, v1.l row_half_mirror
+// GFX11: v_not_b16_e64_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff]
 
-v_not_b16_e64_dpp v5, v1 row_half_mirror
-// GFX11: v_not_b16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff]
+v_not_b16_e64_dpp v5.l, v1.l row_shl:1
+// GFX11: v_not_b16_e64_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff]
 
-v_not_b16_e64_dpp v5, v1 row_shl:1
-// GFX11: v_not_b16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff]
+v_not_b16_e64_dpp v5.l, v1.l row_shl:15
+// GFX11: v_not_b16_e64_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff]
 
-v_not_b16_e64_dpp v5, v1 row_shl:15
-// GFX11: v_not_b16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff]
+v_not_b16_e64_dpp v5.l, v1.l row_shr:1
+// GFX11: v_not_b16_e64_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff]
 
-v_not_b16_e64_dpp v5, v1 row_shr:1
-// GFX11: v_not_b16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff]
+v_not_b16_e64_dpp v5.l, v1.l row_shr:15
+// GFX11: v_not_b16_e64_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]
 
-v_not_b16_e64_dpp v5, v1 row_shr:15
-// GFX11: v_not_b16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]
+v_not_b16_e64_dpp v5.l, v1.l row_ror:1
+// GFX11: v_not_b16_e64_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff]
 
-v_not_b16_e64_dpp v5, v1 row_ror:1
-// GFX11: v_not_b16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff]
+v_not_b16_e64_dpp v5.l, v1.l row_ror:15
+// GFX11: v_not_b16_e64_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff]
 
-v_not_b16_e64_dpp v5, v1 row_ror:15
-// GFX11: v_not_b16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff]
+v_not_b16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf
+// GFX11: v_not_b16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
 
-v_not_b16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
-// GFX11: v_not_b16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
+v_not_b16_e64_dpp v5.l, v1.l row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX11: v_not_b16_e64_dpp v5.l, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x5f,0x01,0x01]
 
-v_not_b16_e64_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1
-// GFX11: v_not_b16_e64_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x5f,0x01,0x01]
+v_not_b16_e64_dpp v5.l, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX11: v_not_b16_e64_dpp v5.l, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x60,0x09,0x13]
 
-v_not_b16_e64_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
-// GFX11: v_not_b16_e64_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x60,0x09,0x13]
+v_not_b16_e64_dpp v255.l, v255.l row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX11: v_not_b16_e64_dpp v255.l, v255.l row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x05,0x30]
 
-v_not_b16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
-// GFX11: v_not_b16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x05,0x30]
+v_not_b16_e64_dpp v5.h, v1.h row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX11: [0x05,0x48,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x5f,0x01,0x01]
+
+v_not_b16_e64_dpp v5.l, v1.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX11: [0x05,0x08,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x60,0x09,0x13]
+
+v_not_b16_e64_dpp v255.h, v255.l row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX11: [0xff,0x40,0xe9,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x05,0x30]
 
 v_not_b32_e64_dpp v5, v1 quad_perm:[3,2,1,0]
 // GFX11: v_not_b32_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xb7,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
@@ -2467,47 +2509,56 @@ v_rcp_iflag_f32_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 boun
 v_rcp_iflag_f32_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
 // GFX11: v_rcp_iflag_f32_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xab,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30]
 
-v_rndne_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0]
-// GFX11: v_rndne_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
+v_rndne_f16_e64_dpp v5.l, v1.l quad_perm:[3,2,1,0]
+// GFX11: v_rndne_f16_e64_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
+
+v_rndne_f16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3]
+// GFX11: v_rndne_f16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
 
-v_rndne_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3]
-// GFX11: v_rndne_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
+v_rndne_f16_e64_dpp v5.l, v1.l row_mirror
+// GFX11: v_rndne_f16_e64_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
 
-v_rndne_f16_e64_dpp v5, v1 row_mirror
-// GFX11: v_rndne_f16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
+v_rndne_f16_e64_dpp v5.l, v1.l row_half_mirror
+// GFX11: v_rndne_f16_e64_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff]
 
-v_rndne_f16_e64_dpp v5, v1 row_half_mirror
-// GFX11: v_rndne_f16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff]
+v_rndne_f16_e64_dpp v5.l, v1.l row_shl:1
+// GFX11: v_rndne_f16_e64_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff]
 
-v_rndne_f16_e64_dpp v5, v1 row_shl:1
-// GFX11: v_rndne_f16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff]
+v_rndne_f16_e64_dpp v5.l, v1.l row_shl:15
+// GFX11: v_rndne_f16_e64_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff]
 
-v_rndne_f16_e64_dpp v5, v1 row_shl:15
-// GFX11: v_rndne_f16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff]
+v_rndne_f16_e64_dpp v5.l, v1.l row_shr:1
+// GFX11: v_rndne_f16_e64_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff]
 
-v_rndne_f16_e64_dpp v5, v1 row_shr:1
-// GFX11: v_rndne_f16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff]
+v_rndne_f16_e64_dpp v5.l, v1.l row_shr:15
+// GFX11: v_rndne_f16_e64_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]
 
-v_rndne_f16_e64_dpp v5, v1 row_shr:15
-// GFX11: v_rndne_f16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]
+v_rndne_f16_e64_dpp v5.l, v1.l row_ror:1
+// GFX11: v_rndne_f16_e64_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff]
 
-v_rndne_f16_e64_dpp v5, v1 row_ror:1
-// GFX11: v_rndne_f16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff]
+v_rndne_f16_e64_dpp v5.l, v1.l row_ror:15
+// GFX11: v_rndne_f16_e64_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff]
 
-v_rndne_f16_e64_dpp v5, v1 row_ror:15
-// GFX11: v_rndne_f16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff]
+v_rndne_f16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf
+// GFX11: v_rndne_f16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
 
-v_rndne_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
-// GFX11: v_rndne_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
+v_rndne_f16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX11: v_rndne_f16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
 
-v_rndne_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1
-// GFX11: v_rndne_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
+v_rndne_f16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX11: v_rndne_f16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13]
 
-v_rndne_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
-// GFX11: v_rndne_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13]
+v_rndne_f16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX11: v_rndne_f16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xde,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30]
 
-v_rndne_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
-// GFX11: v_rndne_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xde,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30]
+v_rndne_f16_e64_dpp v5.h, v1.h mul:2 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX11: [0x05,0x48,0xde,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
+
+v_rndne_f16_e64_dpp v5.l, v1.h mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX11: [0x05,0x08,0xde,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13]
+
+v_rndne_f16_e64_dpp v255.h, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX11: [0xff,0xc1,0xde,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30]
 
 v_rndne_f32_e64_dpp v5, v1 quad_perm:[3,2,1,0]
 // GFX11: v_rndne_f32_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xa3,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
@@ -2644,89 +2695,101 @@ v_rsq_f32_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl
 v_rsq_f32_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
 // GFX11: v_rsq_f32_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xae,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30]
 
-v_sat_pk_u8_i16_e64_dpp v5, v1 quad_perm:[3,2,1,0]
-// GFX11: v_sat_pk_u8_i16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
+v_sat_pk_u8_i16_e64_dpp v5.l, v1 quad_perm:[3,2,1,0]
+// GFX11: v_sat_pk_u8_i16_e64_dpp v5.l, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
+
+v_sat_pk_u8_i16_e64_dpp v5.l, v1 quad_perm:[0,1,2,3]
+// GFX11: v_sat_pk_u8_i16_e64_dpp v5.l, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
 
-v_sat_pk_u8_i16_e64_dpp v5, v1 quad_perm:[0,1,2,3]
-// GFX11: v_sat_pk_u8_i16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
+v_sat_pk_u8_i16_e64_dpp v5.l, v1 row_mirror
+// GFX11: v_sat_pk_u8_i16_e64_dpp v5.l, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
 
-v_sat_pk_u8_i16_e64_dpp v5, v1 row_mirror
-// GFX11: v_sat_pk_u8_i16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
+v_sat_pk_u8_i16_e64_dpp v5.l, v1 row_half_mirror
+// GFX11: v_sat_pk_u8_i16_e64_dpp v5.l, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff]
 
-v_sat_pk_u8_i16_e64_dpp v5, v1 row_half_mirror
-// GFX11: v_sat_pk_u8_i16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff]
+v_sat_pk_u8_i16_e64_dpp v5.l, v1 row_shl:1
+// GFX11: v_sat_pk_u8_i16_e64_dpp v5.l, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff]
 
-v_sat_pk_u8_i16_e64_dpp v5, v1 row_shl:1
-// GFX11: v_sat_pk_u8_i16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff]
+v_sat_pk_u8_i16_e64_dpp v5.l, v1 row_shl:15
+// GFX11: v_sat_pk_u8_i16_e64_dpp v5.l, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff]
 
-v_sat_pk_u8_i16_e64_dpp v5, v1 row_shl:15
-// GFX11: v_sat_pk_u8_i16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff]
+v_sat_pk_u8_i16_e64_dpp v5.l, v1 row_shr:1
+// GFX11: v_sat_pk_u8_i16_e64_dpp v5.l, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff]
 
-v_sat_pk_u8_i16_e64_dpp v5, v1 row_shr:1
-// GFX11: v_sat_pk_u8_i16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff]
+v_sat_pk_u8_i16_e64_dpp v5.l, v1 row_shr:15
+// GFX11: v_sat_pk_u8_i16_e64_dpp v5.l, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]
 
-v_sat_pk_u8_i16_e64_dpp v5, v1 row_shr:15
-// GFX11: v_sat_pk_u8_i16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]
+v_sat_pk_u8_i16_e64_dpp v5.l, v1 row_ror:1
+// GFX11: v_sat_pk_u8_i16_e64_dpp v5.l, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff]
 
-v_sat_pk_u8_i16_e64_dpp v5, v1 row_ror:1
-// GFX11: v_sat_pk_u8_i16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff]
+v_sat_pk_u8_i16_e64_dpp v5.l, v1 row_ror:15
+// GFX11: v_sat_pk_u8_i16_e64_dpp v5.l, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff]
 
-v_sat_pk_u8_i16_e64_dpp v5, v1 row_ror:15
-// GFX11: v_sat_pk_u8_i16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff]
+v_sat_pk_u8_i16_e64_dpp v5.l, v1 row_share:0 row_mask:0xf bank_mask:0xf
+// GFX11: v_sat_pk_u8_i16_e64_dpp v5.l, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
 
-v_sat_pk_u8_i16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
-// GFX11: v_sat_pk_u8_i16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
+v_sat_pk_u8_i16_e64_dpp v5.l, v1 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX11: v_sat_pk_u8_i16_e64_dpp v5.l, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x5f,0x01,0x01]
 
-v_sat_pk_u8_i16_e64_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1
-// GFX11: v_sat_pk_u8_i16_e64_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x5f,0x01,0x01]
+v_sat_pk_u8_i16_e64_dpp v5.l, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX11: v_sat_pk_u8_i16_e64_dpp v5.l, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x60,0x09,0x13]
 
-v_sat_pk_u8_i16_e64_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
-// GFX11: v_sat_pk_u8_i16_e64_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x60,0x09,0x13]
+v_sat_pk_u8_i16_e64_dpp v255.l, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX11: v_sat_pk_u8_i16_e64_dpp v255.l, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x05,0x30]
 
-v_sat_pk_u8_i16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
-// GFX11: v_sat_pk_u8_i16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x05,0x30]
+v_sat_pk_u8_i16_e64_dpp v255.h, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX11: [0xff,0x40,0xe2,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x05,0x30]
 
-v_sin_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0]
-// GFX11: v_sin_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
+v_sin_f16_e64_dpp v5.l, v1.l quad_perm:[3,2,1,0]
+// GFX11: v_sin_f16_e64_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
 
-v_sin_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3]
-// GFX11: v_sin_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
+v_sin_f16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3]
+// GFX11: v_sin_f16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
 
-v_sin_f16_e64_dpp v5, v1 row_mirror
-// GFX11: v_sin_f16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
+v_sin_f16_e64_dpp v5.l, v1.l row_mirror
+// GFX11: v_sin_f16_e64_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
 
-v_sin_f16_e64_dpp v5, v1 row_half_mirror
-// GFX11: v_sin_f16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff]
+v_sin_f16_e64_dpp v5.l, v1.l row_half_mirror
+// GFX11: v_sin_f16_e64_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff]
 
-v_sin_f16_e64_dpp v5, v1 row_shl:1
-// GFX11: v_sin_f16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff]
+v_sin_f16_e64_dpp v5.l, v1.l row_shl:1
+// GFX11: v_sin_f16_e64_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff]
 
-v_sin_f16_e64_dpp v5, v1 row_shl:15
-// GFX11: v_sin_f16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff]
+v_sin_f16_e64_dpp v5.l, v1.l row_shl:15
+// GFX11: v_sin_f16_e64_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff]
 
-v_sin_f16_e64_dpp v5, v1 row_shr:1
-// GFX11: v_sin_f16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff]
+v_sin_f16_e64_dpp v5.l, v1.l row_shr:1
+// GFX11: v_sin_f16_e64_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff]
 
-v_sin_f16_e64_dpp v5, v1 row_shr:15
-// GFX11: v_sin_f16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]
+v_sin_f16_e64_dpp v5.l, v1.l row_shr:15
+// GFX11: v_sin_f16_e64_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]
 
-v_sin_f16_e64_dpp v5, v1 row_ror:1
-// GFX11: v_sin_f16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff]
+v_sin_f16_e64_dpp v5.l, v1.l row_ror:1
+// GFX11: v_sin_f16_e64_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff]
 
-v_sin_f16_e64_dpp v5, v1 row_ror:15
-// GFX11: v_sin_f16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff]
+v_sin_f16_e64_dpp v5.l, v1.l row_ror:15
+// GFX11: v_sin_f16_e64_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff]
 
-v_sin_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
-// GFX11: v_sin_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
+v_sin_f16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf
+// GFX11: v_sin_f16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
 
-v_sin_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1
-// GFX11: v_sin_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
+v_sin_f16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX11: v_sin_f16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
 
-v_sin_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
-// GFX11: v_sin_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13]
+v_sin_f16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX11: v_sin_f16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13]
 
-v_sin_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
-// GFX11: v_sin_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xe0,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30]
+v_sin_f16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX11: v_sin_f16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xe0,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30]
+
+v_sin_f16_e64_dpp v5.h, v1.h mul:2 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX11: [0x05,0x48,0xe0,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
+
+v_sin_f16_e64_dpp v5.l, v1.h mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX11: [0x05,0x08,0xe0,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13]
+
+v_sin_f16_e64_dpp v255.h, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX11: [0xff,0xc1,0xe0,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30]
 
 v_sin_f32_e64_dpp v5, v1 quad_perm:[3,2,1,0]
 // GFX11: v_sin_f32_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xb5,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
@@ -2863,47 +2926,56 @@ v_sqrt_f32_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctr
 v_sqrt_f32_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
 // GFX11: v_sqrt_f32_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xb3,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30]
 
-v_trunc_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0]
-// GFX11: v_trunc_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
+v_trunc_f16_e64_dpp v5.l, v1.l quad_perm:[3,2,1,0]
+// GFX11: v_trunc_f16_e64_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
+
+v_trunc_f16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3]
+// GFX11: v_trunc_f16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
+
+v_trunc_f16_e64_dpp v5.l, v1.l row_mirror
+// GFX11: v_trunc_f16_e64_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
+
+v_trunc_f16_e64_dpp v5.l, v1.l row_half_mirror
+// GFX11: v_trunc_f16_e64_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff]
 
-v_trunc_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3]
-// GFX11: v_trunc_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
+v_trunc_f16_e64_dpp v5.l, v1.l row_shl:1
+// GFX11: v_trunc_f16_e64_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff]
 
-v_trunc_f16_e64_dpp v5, v1 row_mirror
-// GFX11: v_trunc_f16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
+v_trunc_f16_e64_dpp v5.l, v1.l row_shl:15
+// GFX11: v_trunc_f16_e64_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff]
 
-v_trunc_f16_e64_dpp v5, v1 row_half_mirror
-// GFX11: v_trunc_f16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff]
+v_trunc_f16_e64_dpp v5.l, v1.l row_shr:1
+// GFX11: v_trunc_f16_e64_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff]
 
-v_trunc_f16_e64_dpp v5, v1 row_shl:1
-// GFX11: v_trunc_f16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff]
+v_trunc_f16_e64_dpp v5.l, v1.l row_shr:15
+// GFX11: v_trunc_f16_e64_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]
 
-v_trunc_f16_e64_dpp v5, v1 row_shl:15
-// GFX11: v_trunc_f16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff]
+v_trunc_f16_e64_dpp v5.l, v1.l row_ror:1
+// GFX11: v_trunc_f16_e64_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff]
 
-v_trunc_f16_e64_dpp v5, v1 row_shr:1
-// GFX11: v_trunc_f16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff]
+v_trunc_f16_e64_dpp v5.l, v1.l row_ror:15
+// GFX11: v_trunc_f16_e64_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff]
 
-v_trunc_f16_e64_dpp v5, v1 row_shr:15
-// GFX11: v_trunc_f16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]
+v_trunc_f16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf
+// GFX11: v_trunc_f16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
 
-v_trunc_f16_e64_dpp v5, v1 row_ror:1
-// GFX11: v_trunc_f16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff]
+v_trunc_f16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX11: v_trunc_f16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
 
-v_trunc_f16_e64_dpp v5, v1 row_ror:15
-// GFX11: v_trunc_f16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff]
+v_trunc_f16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX11: v_trunc_f16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13]
 
-v_trunc_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
-// GFX11: v_trunc_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
+v_trunc_f16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX11: v_trunc_f16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xdd,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30]
 
-v_trunc_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1
-// GFX11: v_trunc_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
+v_trunc_f16_e64_dpp v5.h, v1.h mul:2 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX11: [0x05,0x48,0xdd,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
 
-v_trunc_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
-// GFX11: v_trunc_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13]
+v_trunc_f16_e64_dpp v5.l, v1.h mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX11: [0x05,0x08,0xdd,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13]
 
-v_trunc_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
-// GFX11: v_trunc_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xdd,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30]
+v_trunc_f16_e64_dpp v255.h, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX11: [0xff,0xc1,0xdd,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30]
 
 v_trunc_f32_e64_dpp v5, v1 quad_perm:[3,2,1,0]
 // GFX11: v_trunc_f32_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xa1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp8.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp8.s
index cef5017..2fc0206 100644
--- a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp8.s
+++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp8.s
@@ -814,41 +814,74 @@ v_div_fixup_f16_e64_dpp v5.l, -|v1.l|, -|v2.h|, 0.5 dpp8:[7,6,5,4,3,2,1,0] fi:1
 v_div_fixup_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0
 // GFX11: v_div_fixup_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0xc7,0x54,0xd6,0xe9,0xfe,0xf7,0xe3,0xff,0x00,0x00,0x00]
 
-v_fma_f16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_fma_f16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x48,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+v_fma_f16_e64_dpp v5.l, v1.l, v2.l, v3.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_fma_f16_e64_dpp v5.l, v1.l, v2.l, v3.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x48,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
 
-v_fma_f16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_fma_f16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x48,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
+v_fma_f16_e64_dpp v5.l, v1.l, v2.l, v255.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_fma_f16_e64_dpp v5.l, v1.l, v2.l, v255.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x48,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
 
-v_fma_f16_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_fma_f16_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x48,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05]
+v_fma_f16_e64_dpp v5.l, v1.l, v2.l, s105 dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_fma_f16_e64_dpp v5.l, v1.l, v2.l, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x48,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05]
 
-v_fma_f16_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_fma_f16_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x48,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05]
+v_fma_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_fma_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x48,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05]
 
-v_fma_f16_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_fma_f16_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x48,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05]
+v_fma_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_fma_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x48,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05]
 
-v_fma_f16_e64_dpp v5, |v1|, v2, -ttmp15 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_fma_f16_e64_dpp v5, |v1|, v2, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x48,0xd6,0xe9,0x04,0xee,0x81,0x01,0x77,0x39,0x05]
+v_fma_f16_e64_dpp v5.l, |v1.l|, v2.l, -ttmp15 dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_fma_f16_e64_dpp v5.l, |v1.l|, v2.l, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x48,0xd6,0xe9,0x04,0xee,0x81,0x01,0x77,0x39,0x05]
 
-v_fma_f16_e64_dpp v5, v1, -|v2|, exec_hi dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_fma_f16_e64_dpp v5, v1, -|v2|, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x48,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05]
+v_fma_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_fma_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x48,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05]
 
-v_fma_f16_e64_dpp v5, -v1, v2, |exec_lo| dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_fma_f16_e64_dpp v5, -v1, v2, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x04,0x48,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05]
+v_fma_f16_e64_dpp v5.l, -v1.l, v2.l, |exec_lo| dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_fma_f16_e64_dpp v5.l, -v1.l, v2.l, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x04,0x48,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05]
 
-v_fma_f16_e64_dpp v5, -|v1|, -|v2|, null dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_fma_f16_e64_dpp v5, -|v1|, -|v2|, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x03,0x48,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05]
+v_fma_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, null dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_fma_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x03,0x48,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05]
 
-v_fma_f16_e64_dpp v5, -|v1|, v2, -|-1| dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_fma_f16_e64_dpp v5, -|v1|, v2, -|-1| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x05,0x48,0xd6,0xe9,0x04,0x06,0xa3,0x01,0x77,0x39,0x05]
+v_fma_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|-1| dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_fma_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|-1| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x05,0x48,0xd6,0xe9,0x04,0x06,0xa3,0x01,0x77,0x39,0x05]
 
-v_fma_f16_e64_dpp v5, v1, -|v2|, -|0.5| dpp8:[7,6,5,4,3,2,1,0] fi:1
-// GFX11: v_fma_f16_e64_dpp v5, v1, -|v2|, -|0.5| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x06,0x48,0xd6,0xea,0x04,0xc2,0xc3,0x01,0x77,0x39,0x05]
+v_fma_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX11: v_fma_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x06,0x48,0xd6,0xea,0x04,0xc2,0xc3,0x01,0x77,0x39,0x05]
 
-v_fma_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0
-// GFX11: v_fma_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x87,0x48,0xd6,0xe9,0xfe,0xf7,0xe3,0xff,0x00,0x00,0x00]
+v_fma_f16_e64_dpp v255.l, -|v255.l|, -|v255.l|, -|src_scc| clamp dpp8:[0,0,0,0,0,0,0,0]
+// GFX11: v_fma_f16_e64_dpp v255.l, -|v255.l|, -|v255.l|, -|src_scc| clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x87,0x48,0xd6,0xe9,0xfe,0xf7,0xe3,0xff,0x00,0x00,0x00]
+
+v_fma_f16_e64_dpp v5.h, v1.h, v2.h, v3.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_fma_f16_e64_dpp v5.h, v1.h, v2.h, v3.h op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x78,0x48,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+
+v_fma_f16_e64_dpp v5.l, v1.l, v2.l, v255.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_fma_f16_e64_dpp v5.l, v1.l, v2.l, v255.h op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x20,0x48,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
+
+v_fma_f16_e64_dpp v5.l, v1.l, v2.l, s3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_fma_f16_e64_dpp v5.l, v1.l, v2.l, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x48,0xd6,0xe9,0x04,0x0e,0x00,0x01,0x77,0x39,0x05]
+
+v_fma_f16_e64_dpp v5.l, v1.l, v2.l, ttmp15 dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_fma_f16_e64_dpp v5.l, v1.l, v2.l, ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x48,0xd6,0xe9,0x04,0xee,0x01,0x01,0x77,0x39,0x05]
+
+v_fma_f16_e64_dpp v5.l, v1.l, v2.l, -|m0| dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_fma_f16_e64_dpp v5.l, v1.l, v2.l, -|m0| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x04,0x48,0xd6,0xe9,0x04,0xf6,0x81,0x01,0x77,0x39,0x05]
+
+v_fma_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|exec_hi| dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_fma_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|exec_hi| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x05,0x48,0xd6,0xe9,0x04,0xfe,0xa1,0x01,0x77,0x39,0x05]
+
+v_fma_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|exec_lo| dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_fma_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|exec_lo| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x06,0x48,0xd6,0xe9,0x04,0xfa,0xc1,0x01,0x77,0x39,0x05]
+
+v_fma_f16_e64_dpp v5.l, |v1.l|, -v2.l, null dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_fma_f16_e64_dpp v5.l, |v1.l|, -v2.l, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x48,0xd6,0xe9,0x04,0xf2,0x41,0x01,0x77,0x39,0x05]
+
+v_fma_f16_e64_dpp v5.l, -v1.h, |v2.l|, -1 dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_fma_f16_e64_dpp v5.l, -v1.h, |v2.l|, -1 op_sel:[1,0,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x0a,0x48,0xd6,0xe9,0x04,0x06,0x23,0x01,0x77,0x39,0x05]
+
+v_fma_f16_e64_dpp v5.l, -|v1.l|, -|v2.h|, 0.5 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX11: v_fma_f16_e64_dpp v5.l, -|v1.l|, -|v2.h|, 0.5 op_sel:[0,1,0,0] dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x13,0x48,0xd6,0xea,0x04,0xc2,0x63,0x01,0x77,0x39,0x05]
+
+v_fma_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX11: v_fma_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0xc7,0x48,0xd6,0xe9,0xfe,0xf7,0xe3,0xff,0x00,0x00,0x00]
 
 v_fma_f32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0]
 // GFX11: v_fma_f32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x13,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
@@ -1627,41 +1660,80 @@ v_max_u16_e64_dpp v5.l, v1.l, v2.h dpp8:[7,6,5,4,3,2,1,0] fi:1
 v_max_u16_e64_dpp v255.h, v255.l, v255.l dpp8:[0,0,0,0,0,0,0,0] fi:0
 // GFX11: v_max_u16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x40,0x09,0xd7,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
 
-v_maxmin_f16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_maxmin_f16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x60,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, v3.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, v3.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x60,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+
+v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, v255.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, v255.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x60,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
+
+v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, s105 dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x60,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05]
+
+v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x60,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05]
+
+v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x60,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05]
+
+v_maxmin_f16_e64_dpp v5.l, |v1.l|, v2.l, -ttmp15 dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_maxmin_f16_e64_dpp v5.l, |v1.l|, v2.l, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x60,0xd6,0xe9,0x04,0xee,0x81,0x01,0x77,0x39,0x05]
+
+v_maxmin_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_maxmin_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x60,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05]
+
+v_maxmin_f16_e64_dpp v5.l, -v1.l, v2.l, |exec_lo| dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_maxmin_f16_e64_dpp v5.l, -v1.l, v2.l, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x04,0x60,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05]
+
+v_maxmin_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, null dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_maxmin_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x03,0x60,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05]
 
-v_maxmin_f16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_maxmin_f16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x60,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
+v_maxmin_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|-1| mul:2 dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_maxmin_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|-1| mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x05,0x60,0xd6,0xe9,0x04,0x06,0xab,0x01,0x77,0x39,0x05]
 
-v_maxmin_f16_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_maxmin_f16_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x60,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05]
+v_maxmin_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX11: v_maxmin_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x06,0x60,0xd6,0xea,0x04,0xc2,0xd3,0x01,0x77,0x39,0x05]
 
-v_maxmin_f16_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_maxmin_f16_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x60,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05]
+v_maxmin_f16_e64_dpp v255.l, -|v255.l|, -|v255.l|, -|src_scc| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX11: v_maxmin_f16_e64_dpp v255.l, -|v255.l|, -|v255.l|, -|src_scc| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x87,0x60,0xd6,0xe9,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00]
 
-v_maxmin_f16_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_maxmin_f16_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x60,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05]
+v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, s3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x60,0xd6,0xe9,0x04,0x0e,0x00,0x01,0x77,0x39,0x05]
 
-v_maxmin_f16_e64_dpp v5, |v1|, v2, -ttmp15 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_maxmin_f16_e64_dpp v5, |v1|, v2, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x60,0xd6,0xe9,0x04,0xee,0x81,0x01,0x77,0x39,0x05]
+v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, ttmp15 dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x60,0xd6,0xe9,0x04,0xee,0x01,0x01,0x77,0x39,0x05]
 
-v_maxmin_f16_e64_dpp v5, v1, -|v2|, exec_hi dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_maxmin_f16_e64_dpp v5, v1, -|v2|, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x60,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05]
+v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, -|m0| dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, -|m0| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x04,0x60,0xd6,0xe9,0x04,0xf6,0x81,0x01,0x77,0x39,0x05]
 
-v_maxmin_f16_e64_dpp v5, -v1, v2, |exec_lo| dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_maxmin_f16_e64_dpp v5, -v1, v2, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x04,0x60,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05]
+v_maxmin_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|exec_hi| dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_maxmin_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|exec_hi| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x05,0x60,0xd6,0xe9,0x04,0xfe,0xa1,0x01,0x77,0x39,0x05]
 
-v_maxmin_f16_e64_dpp v5, -|v1|, -|v2|, null dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_maxmin_f16_e64_dpp v5, -|v1|, -|v2|, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x03,0x60,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05]
+v_maxmin_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|exec_lo| dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_maxmin_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|exec_lo| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x06,0x60,0xd6,0xe9,0x04,0xfa,0xc1,0x01,0x77,0x39,0x05]
 
-v_maxmin_f16_e64_dpp v5, -|v1|, v2, -|-1| mul:2 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_maxmin_f16_e64_dpp v5, -|v1|, v2, -|-1| mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x05,0x60,0xd6,0xe9,0x04,0x06,0xab,0x01,0x77,0x39,0x05]
+v_maxmin_f16_e64_dpp v5.l, |v1.l|, -v2.l, null dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_maxmin_f16_e64_dpp v5.l, |v1.l|, -v2.l, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x60,0xd6,0xe9,0x04,0xf2,0x41,0x01,0x77,0x39,0x05]
 
-v_maxmin_f16_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1
-// GFX11: v_maxmin_f16_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x06,0x60,0xd6,0xea,0x04,0xc2,0xd3,0x01,0x77,0x39,0x05]
+v_maxmin_f16_e64_dpp v5.l, -v1.l, |v2.l|, -1 mul:2 dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_maxmin_f16_e64_dpp v5.l, -v1.l, |v2.l|, -1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x60,0xd6,0xe9,0x04,0x06,0x2b,0x01,0x77,0x39,0x05]
 
-v_maxmin_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0
-// GFX11: v_maxmin_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x87,0x60,0xd6,0xe9,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00]
+v_maxmin_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, 0.5 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX11: v_maxmin_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, 0.5 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x03,0x60,0xd6,0xea,0x04,0xc2,0x73,0x01,0x77,0x39,0x05]
+
+v_maxmin_f16_e64_dpp v5.h, v1.h, v2.h, v3.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_maxmin_f16_e64_dpp v5.h, v1.h, v2.h, v3.h op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x78,0x60,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+
+v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, v255.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, v255.h op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x20,0x60,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
+
+v_maxmin_f16_e64_dpp v5.l, -v1.h, |v2.l|, -1 mul:2 dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_maxmin_f16_e64_dpp v5.l, -v1.h, |v2.l|, -1 op_sel:[1,0,0,0] mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x0a,0x60,0xd6,0xe9,0x04,0x06,0x2b,0x01,0x77,0x39,0x05]
+
+v_maxmin_f16_e64_dpp v5.l, -|v1.l|, -|v2.h|, 0.5 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX11: v_maxmin_f16_e64_dpp v5.l, -|v1.l|, -|v2.h|, 0.5 op_sel:[0,1,0,0] mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x13,0x60,0xd6,0xea,0x04,0xc2,0x73,0x01,0x77,0x39,0x05]
+
+v_maxmin_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX11: v_maxmin_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0xc7,0x60,0xd6,0xe9,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00]
 
 v_maxmin_f32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0]
 // GFX11: v_maxmin_f32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5e,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
@@ -2401,41 +2473,80 @@ v_min_u16_e64_dpp v5.l, v1.l, v2.h dpp8:[7,6,5,4,3,2,1,0] fi:1
 v_min_u16_e64_dpp v255.h, v255.l, v255.l dpp8:[0,0,0,0,0,0,0,0] fi:0
 // GFX11: v_min_u16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x40,0x0b,0xd7,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
 
-v_minmax_f16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_minmax_f16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x61,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, v3.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, v3.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x61,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+
+v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, v255.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, v255.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x61,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
+
+v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, s105 dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x61,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05]
+
+v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x61,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05]
+
+v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x61,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05]
+
+v_minmax_f16_e64_dpp v5.l, |v1.l|, v2.l, -ttmp15 dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_minmax_f16_e64_dpp v5.l, |v1.l|, v2.l, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x61,0xd6,0xe9,0x04,0xee,0x81,0x01,0x77,0x39,0x05]
+
+v_minmax_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_minmax_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x61,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05]
+
+v_minmax_f16_e64_dpp v5.l, -v1.l, v2.l, |exec_lo| dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_minmax_f16_e64_dpp v5.l, -v1.l, v2.l, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x04,0x61,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05]
+
+v_minmax_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, null dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_minmax_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x03,0x61,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05]
+
+v_minmax_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|-1| mul:2 dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_minmax_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|-1| mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x05,0x61,0xd6,0xe9,0x04,0x06,0xab,0x01,0x77,0x39,0x05]
+
+v_minmax_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX11: v_minmax_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x06,0x61,0xd6,0xea,0x04,0xc2,0xd3,0x01,0x77,0x39,0x05]
+
+v_minmax_f16_e64_dpp v255.l, -|v255.l|, -|v255.l|, -|src_scc| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX11: v_minmax_f16_e64_dpp v255.l, -|v255.l|, -|v255.l|, -|src_scc| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x87,0x61,0xd6,0xe9,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00]
+
+v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, s3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x61,0xd6,0xe9,0x04,0x0e,0x00,0x01,0x77,0x39,0x05]
+
+v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, ttmp15 dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x61,0xd6,0xe9,0x04,0xee,0x01,0x01,0x77,0x39,0x05]
 
-v_minmax_f16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_minmax_f16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x61,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
+v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, -|m0| dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, -|m0| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x04,0x61,0xd6,0xe9,0x04,0xf6,0x81,0x01,0x77,0x39,0x05]
 
-v_minmax_f16_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_minmax_f16_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x61,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05]
+v_minmax_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|exec_hi| dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_minmax_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|exec_hi| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x05,0x61,0xd6,0xe9,0x04,0xfe,0xa1,0x01,0x77,0x39,0x05]
 
-v_minmax_f16_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_minmax_f16_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x61,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05]
+v_minmax_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|exec_lo| dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_minmax_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|exec_lo| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x06,0x61,0xd6,0xe9,0x04,0xfa,0xc1,0x01,0x77,0x39,0x05]
 
-v_minmax_f16_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_minmax_f16_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x61,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05]
+v_minmax_f16_e64_dpp v5.l, |v1.l|, -v2.l, null dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_minmax_f16_e64_dpp v5.l, |v1.l|, -v2.l, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x61,0xd6,0xe9,0x04,0xf2,0x41,0x01,0x77,0x39,0x05]
 
-v_minmax_f16_e64_dpp v5, |v1|, v2, -ttmp15 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_minmax_f16_e64_dpp v5, |v1|, v2, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x61,0xd6,0xe9,0x04,0xee,0x81,0x01,0x77,0x39,0x05]
+v_minmax_f16_e64_dpp v5.l, -v1.l, |v2.l|, -1 mul:2 dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_minmax_f16_e64_dpp v5.l, -v1.l, |v2.l|, -1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x61,0xd6,0xe9,0x04,0x06,0x2b,0x01,0x77,0x39,0x05]
 
-v_minmax_f16_e64_dpp v5, v1, -|v2|, exec_hi dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_minmax_f16_e64_dpp v5, v1, -|v2|, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x61,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05]
+v_minmax_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, 0.5 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX11: v_minmax_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, 0.5 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x03,0x61,0xd6,0xea,0x04,0xc2,0x73,0x01,0x77,0x39,0x05]
 
-v_minmax_f16_e64_dpp v5, -v1, v2, |exec_lo| dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_minmax_f16_e64_dpp v5, -v1, v2, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x04,0x61,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05]
+v_minmax_f16_e64_dpp v5.h, v1.h, v2.h, v3.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_minmax_f16_e64_dpp v5.h, v1.h, v2.h, v3.h op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x78,0x61,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
 
-v_minmax_f16_e64_dpp v5, -|v1|, -|v2|, null dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_minmax_f16_e64_dpp v5, -|v1|, -|v2|, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x03,0x61,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05]
+v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, v255.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, v255.h op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x20,0x61,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
 
-v_minmax_f16_e64_dpp v5, -|v1|, v2, -|-1| mul:2 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_minmax_f16_e64_dpp v5, -|v1|, v2, -|-1| mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x05,0x61,0xd6,0xe9,0x04,0x06,0xab,0x01,0x77,0x39,0x05]
+v_minmax_f16_e64_dpp v5.l, -v1.h, |v2.l|, -1 mul:2 dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_minmax_f16_e64_dpp v5.l, -v1.h, |v2.l|, -1 op_sel:[1,0,0,0] mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x0a,0x61,0xd6,0xe9,0x04,0x06,0x2b,0x01,0x77,0x39,0x05]
 
-v_minmax_f16_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1
-// GFX11: v_minmax_f16_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x06,0x61,0xd6,0xea,0x04,0xc2,0xd3,0x01,0x77,0x39,0x05]
+v_minmax_f16_e64_dpp v5.l, -|v1.l|, -|v2.h|, 0.5 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX11: v_minmax_f16_e64_dpp v5.l, -|v1.l|, -|v2.h|, 0.5 op_sel:[0,1,0,0] mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x13,0x61,0xd6,0xea,0x04,0xc2,0x73,0x01,0x77,0x39,0x05]
 
-v_minmax_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0
-// GFX11: v_minmax_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x87,0x61,0xd6,0xe9,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00]
+v_minmax_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX11: v_minmax_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0xc7,0x61,0xd6,0xe9,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00]
 
 v_minmax_f32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0]
 // GFX11: v_minmax_f32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5f,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
@@ -3130,20 +3241,20 @@ v_div_fixup_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| op_sel:[0,0,1,0] dpp8:[7,6,5
 v_div_fixup_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1
 // GFX11: v_div_fixup_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc7,0x54,0xd6,0xea,0xfe,0xf7,0xe3,0xff,0x00,0x00,0x00]
 
-v_fma_f16_e64_dpp v5, -v1, v2, |exec_lo| op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_fma_f16_e64_dpp v5, -v1, v2, |exec_lo| op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x7c,0x48,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05]
+v_fma_f16_e64_dpp v5.h, -v1.h, v2.h, |exec_lo| op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_fma_f16_e64_dpp v5.h, -v1.h, v2.h, |exec_lo| op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x7c,0x48,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05]
 
-v_fma_f16_e64_dpp v5, -|v1|, -|v2|, null op_sel:[1,0,0,0] dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_fma_f16_e64_dpp v5, -|v1|, -|v2|, null op_sel:[1,0,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x0b,0x48,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05]
+v_fma_f16_e64_dpp v5.l, -|v1.h|, -|v2.l|, null op_sel:[1,0,0,0] dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_fma_f16_e64_dpp v5.l, -|v1.h|, -|v2.l|, null op_sel:[1,0,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x0b,0x48,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05]
 
-v_fma_f16_e64_dpp v5, -|v1|, v2, -|-1| op_sel:[0,1,0,0] dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_fma_f16_e64_dpp v5, -|v1|, v2, -|-1| op_sel:[0,1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x15,0x48,0xd6,0xe9,0x04,0x06,0xa3,0x01,0x77,0x39,0x05]
+v_fma_f16_e64_dpp v5.l, -|v1.l|, v2.h, -|-1| op_sel:[0,1,0,0] dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_fma_f16_e64_dpp v5.l, -|v1.l|, v2.h, -|-1| op_sel:[0,1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x15,0x48,0xd6,0xe9,0x04,0x06,0xa3,0x01,0x77,0x39,0x05]
 
-v_fma_f16_e64_dpp v5, v1, -|v2|, -|0.5| op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_fma_f16_e64_dpp v5, v1, -|v2|, -|0.5| op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x26,0x48,0xd6,0xe9,0x04,0xc2,0xc3,0x01,0x77,0x39,0x05]
+v_fma_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_fma_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x26,0x48,0xd6,0xe9,0x04,0xc2,0xc3,0x01,0x77,0x39,0x05]
 
-v_fma_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1
-// GFX11: v_fma_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc7,0x48,0xd6,0xea,0xfe,0xf7,0xe3,0xff,0x00,0x00,0x00]
+v_fma_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1
+// GFX11: v_fma_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc7,0x48,0xd6,0xea,0xfe,0xf7,0xe3,0xff,0x00,0x00,0x00]
 
 v_mad_i16_e64_dpp v5.h, v1.h, v2.h, exec_hi op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0]
 // GFX11: v_mad_i16_e64_dpp v5.h, v1.h, v2.h, exec_hi op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x78,0x53,0xd6,0xe9,0x04,0xfe,0x01,0x01,0x77,0x39,0x05]
diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp8_from_vop1.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp8_from_vop1.s
index 9540788..259be1d 100644
--- a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp8_from_vop1.s
+++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp8_from_vop1.s
@@ -61,17 +61,26 @@ v_clz_i32_u32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
 v_clz_i32_u32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
 // GFX11: v_clz_i32_u32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x00,0xb9,0xd5,0xe9,0x00,0x00,0x00,0xff,0x00,0x00,0x00]
 
-v_cos_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_cos_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe1,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+v_cos_f16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_cos_f16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe1,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
 
-v_cos_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_cos_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe1,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
+v_cos_f16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_cos_f16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe1,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
 
-v_cos_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1
-// GFX11: v_cos_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xe1,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
+v_cos_f16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX11: v_cos_f16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xe1,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
 
-v_cos_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0
-// GFX11: v_cos_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xe1,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
+v_cos_f16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX11: v_cos_f16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xe1,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
+
+v_cos_f16_e64_dpp v5.h, v1.h mul:2 dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: [0x05,0x48,0xe1,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
+
+v_cos_f16_e64_dpp v5.l, v1.h mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX11: [0x05,0x08,0xe1,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
+
+v_cos_f16_e64_dpp v255.h, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX11: [0xff,0xc1,0xe1,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
 
 v_cos_f32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0]
 // GFX11: v_cos_f32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xb6,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
@@ -283,14 +292,17 @@ v_cvt_i32_f32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
 v_cvt_i32_f32_e64_dpp v255, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0
 // GFX11: v_cvt_i32_f32_e64_dpp v255, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0x88,0xd5,0xe9,0x00,0x00,0x20,0xff,0x00,0x00,0x00]
 
-v_cvt_i32_i16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_cvt_i32_i16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xea,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+v_cvt_i32_i16_e64_dpp v5, v1.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_cvt_i32_i16_e64_dpp v5, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xea,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
 
-v_cvt_i32_i16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
-// GFX11: v_cvt_i32_i16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xea,0xd5,0xea,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+v_cvt_i32_i16_e64_dpp v5, v1.l dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX11: v_cvt_i32_i16_e64_dpp v5, v1.l dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xea,0xd5,0xea,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
 
-v_cvt_i32_i16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
-// GFX11: v_cvt_i32_i16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x00,0xea,0xd5,0xe9,0x00,0x00,0x00,0xff,0x00,0x00,0x00]
+v_cvt_i32_i16_e64_dpp v255, v255.l dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX11: v_cvt_i32_i16_e64_dpp v255, v255.l dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x00,0xea,0xd5,0xe9,0x00,0x00,0x00,0xff,0x00,0x00,0x00]
+
+v_cvt_i32_i16_e64_dpp v255, v255.h dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX11: [0xff,0x08,0xea,0xd5,0xe9,0x00,0x00,0x00,0xff,0x00,0x00,0x00]
 
 v_cvt_nearest_i32_f32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0]
 // GFX11: v_cvt_nearest_i32_f32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x8c,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
@@ -385,14 +397,17 @@ v_cvt_u32_f32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
 v_cvt_u32_f32_e64_dpp v255, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0
 // GFX11: v_cvt_u32_f32_e64_dpp v255, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0x87,0xd5,0xe9,0x00,0x00,0x20,0xff,0x00,0x00,0x00]
 
-v_cvt_u32_u16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_cvt_u32_u16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xeb,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+v_cvt_u32_u16_e64_dpp v5, v1.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_cvt_u32_u16_e64_dpp v5, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xeb,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+
+v_cvt_u32_u16_e64_dpp v5, v1.l dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX11: v_cvt_u32_u16_e64_dpp v5, v1.l dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xeb,0xd5,0xea,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
 
-v_cvt_u32_u16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
-// GFX11: v_cvt_u32_u16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xeb,0xd5,0xea,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+v_cvt_u32_u16_e64_dpp v255, v255.l dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX11: v_cvt_u32_u16_e64_dpp v255, v255.l dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x00,0xeb,0xd5,0xe9,0x00,0x00,0x00,0xff,0x00,0x00,0x00]
 
-v_cvt_u32_u16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
-// GFX11: v_cvt_u32_u16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x00,0xeb,0xd5,0xe9,0x00,0x00,0x00,0xff,0x00,0x00,0x00]
+v_cvt_u32_u16_e64_dpp v255, v255.h dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX11: [0xff,0x08,0xeb,0xd5,0xe9,0x00,0x00,0x00,0xff,0x00,0x00,0x00]
 
 v_exp_f16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0]
 // GFX11: v_exp_f16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xd8,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
@@ -487,17 +502,26 @@ v_floor_f32_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1
 v_floor_f32_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0
 // GFX11: v_floor_f32_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xa4,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
 
-v_fract_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_fract_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdf,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+v_fract_f16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_fract_f16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdf,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+
+v_fract_f16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_fract_f16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdf,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
 
-v_fract_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_fract_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdf,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
+v_fract_f16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX11: v_fract_f16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xdf,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
 
-v_fract_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1
-// GFX11: v_fract_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xdf,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
+v_fract_f16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX11: v_fract_f16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xdf,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
 
-v_fract_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0
-// GFX11: v_fract_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xdf,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
+v_fract_f16_e64_dpp v5.h, v1.h mul:2 dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: [0x05,0x48,0xdf,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
+
+v_fract_f16_e64_dpp v5.l, v1.h mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX11: [0x05,0x08,0xdf,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
+
+v_fract_f16_e64_dpp v255.h, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX11: [0xff,0xc1,0xdf,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
 
 v_fract_f32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0]
 // GFX11: v_fract_f32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xa0,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
@@ -538,17 +562,26 @@ v_frexp_exp_i32_f32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
 v_frexp_exp_i32_f32_e64_dpp v255, -|v255| dpp8:[0,0,0,0,0,0,0,0] fi:0
 // GFX11: v_frexp_exp_i32_f32_e64_dpp v255, -|v255| dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x01,0xbf,0xd5,0xe9,0x00,0x00,0x20,0xff,0x00,0x00,0x00]
 
-v_frexp_mant_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_frexp_mant_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xd9,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+v_frexp_mant_f16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_frexp_mant_f16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xd9,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+
+v_frexp_mant_f16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_frexp_mant_f16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xd9,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
 
-v_frexp_mant_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_frexp_mant_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xd9,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
+v_frexp_mant_f16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX11: v_frexp_mant_f16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xd9,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
 
-v_frexp_mant_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1
-// GFX11: v_frexp_mant_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xd9,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
+v_frexp_mant_f16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX11: v_frexp_mant_f16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xd9,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
 
-v_frexp_mant_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0
-// GFX11: v_frexp_mant_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xd9,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
+v_frexp_mant_f16_e64_dpp v5.h, v1.h mul:2 dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: [0x05,0x48,0xd9,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
+
+v_frexp_mant_f16_e64_dpp v5.l, v1.h mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX11: [0x05,0x08,0xd9,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
+
+v_frexp_mant_f16_e64_dpp v255.h, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX11: [0xff,0xc1,0xd9,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
 
 v_frexp_mant_f32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0]
 // GFX11: v_frexp_mant_f32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xc0,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
@@ -640,14 +673,23 @@ v_movrelsd_b32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
 v_movrelsd_b32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
 // GFX11: v_movrelsd_b32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x00,0xc4,0xd5,0xe9,0x00,0x00,0x00,0xff,0x00,0x00,0x00]
 
-v_not_b16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_not_b16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe9,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+v_not_b16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_not_b16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe9,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+
+v_not_b16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX11: v_not_b16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xe9,0xd5,0xea,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
 
-v_not_b16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
-// GFX11: v_not_b16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xe9,0xd5,0xea,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+v_not_b16_e64_dpp v255.l, v255.l dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX11: v_not_b16_e64_dpp v255.l, v255.l dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x00,0xe9,0xd5,0xe9,0x00,0x00,0x00,0xff,0x00,0x00,0x00]
 
-v_not_b16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
-// GFX11: v_not_b16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x00,0xe9,0xd5,0xe9,0x00,0x00,0x00,0xff,0x00,0x00,0x00]
+v_not_b16_e64_dpp v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: [0x05,0x48,0xe9,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+
+v_not_b16_e64_dpp v5.l, v1.h dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX11: [0x05,0x08,0xe9,0xd5,0xea,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+
+v_not_b16_e64_dpp v255.h, v255.l dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX11: [0xff,0x40,0xe9,0xd5,0xe9,0x00,0x00,0x00,0xff,0x00,0x00,0x00]
 
 v_not_b32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0]
 // GFX11: v_not_b32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xb7,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
@@ -703,17 +745,26 @@ v_rcp_iflag_f32_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1
 v_rcp_iflag_f32_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0
 // GFX11: v_rcp_iflag_f32_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xab,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
 
-v_rndne_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_rndne_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xde,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+v_rndne_f16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_rndne_f16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xde,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+
+v_rndne_f16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_rndne_f16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xde,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
 
-v_rndne_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_rndne_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xde,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
+v_rndne_f16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX11: v_rndne_f16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xde,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
 
-v_rndne_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1
-// GFX11: v_rndne_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xde,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
+v_rndne_f16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX11: v_rndne_f16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xde,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
 
-v_rndne_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0
-// GFX11: v_rndne_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xde,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
+v_rndne_f16_e64_dpp v5.h, v1.h mul:2 dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: [0x05,0x48,0xde,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
+
+v_rndne_f16_e64_dpp v5.l, v1.h mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX11: [0x05,0x08,0xde,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
+
+v_rndne_f16_e64_dpp v255.h, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX11: [0xff,0xc1,0xde,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
 
 v_rndne_f32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0]
 // GFX11: v_rndne_f32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xa3,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
@@ -760,26 +811,38 @@ v_rsq_f32_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1
 v_rsq_f32_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0
 // GFX11: v_rsq_f32_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xae,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
 
-v_sat_pk_u8_i16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_sat_pk_u8_i16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe2,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+v_sat_pk_u8_i16_e64_dpp v5.l, v1 dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_sat_pk_u8_i16_e64_dpp v5.l, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe2,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+
+v_sat_pk_u8_i16_e64_dpp v5.l, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX11: v_sat_pk_u8_i16_e64_dpp v5.l, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xe2,0xd5,0xea,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
 
-v_sat_pk_u8_i16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
-// GFX11: v_sat_pk_u8_i16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xe2,0xd5,0xea,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+v_sat_pk_u8_i16_e64_dpp v255.l, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX11: v_sat_pk_u8_i16_e64_dpp v255.l, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x00,0xe2,0xd5,0xe9,0x00,0x00,0x00,0xff,0x00,0x00,0x00]
 
-v_sat_pk_u8_i16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
-// GFX11: v_sat_pk_u8_i16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x00,0xe2,0xd5,0xe9,0x00,0x00,0x00,0xff,0x00,0x00,0x00]
+v_sat_pk_u8_i16_e64_dpp v255.h, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX11: [0xff,0x40,0xe2,0xd5,0xe9,0x00,0x00,0x00,0xff,0x00,0x00,0x00]
 
-v_sin_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_sin_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe0,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+v_sin_f16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_sin_f16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe0,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
 
-v_sin_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_sin_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe0,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
+v_sin_f16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_sin_f16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe0,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
 
-v_sin_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1
-// GFX11: v_sin_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xe0,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
+v_sin_f16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX11: v_sin_f16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xe0,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
 
-v_sin_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0
-// GFX11: v_sin_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xe0,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
+v_sin_f16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX11: v_sin_f16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xe0,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
+
+v_sin_f16_e64_dpp v5.h, v1.h mul:2 dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: [0x05,0x48,0xe0,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
+
+v_sin_f16_e64_dpp v5.l, v1.h mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX11: [0x05,0x08,0xe0,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
+
+v_sin_f16_e64_dpp v255.h, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX11: [0xff,0xc1,0xe0,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
 
 v_sin_f32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0]
 // GFX11: v_sin_f32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xb5,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
@@ -826,17 +889,26 @@ v_sqrt_f32_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1
 v_sqrt_f32_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0
 // GFX11: v_sqrt_f32_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xb3,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
 
-v_trunc_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_trunc_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdd,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+v_trunc_f16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_trunc_f16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdd,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+
+v_trunc_f16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_trunc_f16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdd,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
+
+v_trunc_f16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX11: v_trunc_f16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xdd,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
+
+v_trunc_f16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX11: v_trunc_f16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xdd,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
 
-v_trunc_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_trunc_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdd,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
+v_trunc_f16_e64_dpp v5.h, v1.h mul:2 dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: [0x05,0x48,0xdd,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
 
-v_trunc_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1
-// GFX11: v_trunc_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xdd,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
+v_trunc_f16_e64_dpp v5.l, v1.h mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX11: [0x05,0x08,0xdd,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
 
-v_trunc_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0
-// GFX11: v_trunc_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xdd,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
+v_trunc_f16_e64_dpp v255.h, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX11: [0xff,0xc1,0xdd,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
 
 v_trunc_f32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0]
 // GFX11: v_trunc_f32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xa1,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_from_vop1.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_from_vop1.s
index 3850f02..379cf062 100644
--- a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_from_vop1.s
+++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_from_vop1.s
@@ -262,50 +262,59 @@ v_clz_i32_u32_e64 v5, src_scc
 v_clz_i32_u32_e64 v255, 0xaf123456
 // GFX11: v_clz_i32_u32_e64 v255, 0xaf123456      ; encoding: [0xff,0x00,0xb9,0xd5,0xff,0x00,0x00,0x00,0x56,0x34,0x12,0xaf]
 
-v_cos_f16_e64 v5, v1
-// GFX11: v_cos_f16_e64 v5, v1                    ; encoding: [0x05,0x00,0xe1,0xd5,0x01,0x01,0x00,0x00]
+v_cos_f16_e64 v5.l, v1.l
+// GFX11: v_cos_f16_e64 v5.l, v1.l                ; encoding: [0x05,0x00,0xe1,0xd5,0x01,0x01,0x00,0x00]
 
-v_cos_f16_e64 v5, v255
-// GFX11: v_cos_f16_e64 v5, v255                  ; encoding: [0x05,0x00,0xe1,0xd5,0xff,0x01,0x00,0x00]
+v_cos_f16_e64 v5.l, v255.l
+// GFX11: v_cos_f16_e64 v5.l, v255.l              ; encoding: [0x05,0x00,0xe1,0xd5,0xff,0x01,0x00,0x00]
 
-v_cos_f16_e64 v5, s1
-// GFX11: v_cos_f16_e64 v5, s1                    ; encoding: [0x05,0x00,0xe1,0xd5,0x01,0x00,0x00,0x00]
+v_cos_f16_e64 v5.l, s1
+// GFX11: v_cos_f16_e64 v5.l, s1                  ; encoding: [0x05,0x00,0xe1,0xd5,0x01,0x00,0x00,0x00]
 
-v_cos_f16_e64 v5, s105
-// GFX11: v_cos_f16_e64 v5, s105                  ; encoding: [0x05,0x00,0xe1,0xd5,0x69,0x00,0x00,0x00]
+v_cos_f16_e64 v5.l, s105
+// GFX11: v_cos_f16_e64 v5.l, s105                ; encoding: [0x05,0x00,0xe1,0xd5,0x69,0x00,0x00,0x00]
 
-v_cos_f16_e64 v5, vcc_lo
-// GFX11: v_cos_f16_e64 v5, vcc_lo                ; encoding: [0x05,0x00,0xe1,0xd5,0x6a,0x00,0x00,0x00]
+v_cos_f16_e64 v5.l, vcc_lo
+// GFX11: v_cos_f16_e64 v5.l, vcc_lo              ; encoding: [0x05,0x00,0xe1,0xd5,0x6a,0x00,0x00,0x00]
 
-v_cos_f16_e64 v5, vcc_hi
-// GFX11: v_cos_f16_e64 v5, vcc_hi                ; encoding: [0x05,0x00,0xe1,0xd5,0x6b,0x00,0x00,0x00]
+v_cos_f16_e64 v5.l, vcc_hi
+// GFX11: v_cos_f16_e64 v5.l, vcc_hi              ; encoding: [0x05,0x00,0xe1,0xd5,0x6b,0x00,0x00,0x00]
 
-v_cos_f16_e64 v5, ttmp15
-// GFX11: v_cos_f16_e64 v5, ttmp15                ; encoding: [0x05,0x00,0xe1,0xd5,0x7b,0x00,0x00,0x00]
+v_cos_f16_e64 v5.l, ttmp15
+// GFX11: v_cos_f16_e64 v5.l, ttmp15              ; encoding: [0x05,0x00,0xe1,0xd5,0x7b,0x00,0x00,0x00]
 
-v_cos_f16_e64 v5, m0
-// GFX11: v_cos_f16_e64 v5, m0                    ; encoding: [0x05,0x00,0xe1,0xd5,0x7d,0x00,0x00,0x00]
+v_cos_f16_e64 v5.l, m0
+// GFX11: v_cos_f16_e64 v5.l, m0                  ; encoding: [0x05,0x00,0xe1,0xd5,0x7d,0x00,0x00,0x00]
 
-v_cos_f16_e64 v5, exec_lo
-// GFX11: v_cos_f16_e64 v5, exec_lo               ; encoding: [0x05,0x00,0xe1,0xd5,0x7e,0x00,0x00,0x00]
+v_cos_f16_e64 v5.l, exec_lo
+// GFX11: v_cos_f16_e64 v5.l, exec_lo             ; encoding: [0x05,0x00,0xe1,0xd5,0x7e,0x00,0x00,0x00]
 
-v_cos_f16_e64 v5, exec_hi
-// GFX11: v_cos_f16_e64 v5, exec_hi               ; encoding: [0x05,0x00,0xe1,0xd5,0x7f,0x00,0x00,0x00]
+v_cos_f16_e64 v5.l, exec_hi
+// GFX11: v_cos_f16_e64 v5.l, exec_hi             ; encoding: [0x05,0x00,0xe1,0xd5,0x7f,0x00,0x00,0x00]
 
-v_cos_f16_e64 v5, null
-// GFX11: v_cos_f16_e64 v5, null                  ; encoding: [0x05,0x00,0xe1,0xd5,0x7c,0x00,0x00,0x00]
+v_cos_f16_e64 v5.l, null
+// GFX11: v_cos_f16_e64 v5.l, null                ; encoding: [0x05,0x00,0xe1,0xd5,0x7c,0x00,0x00,0x00]
 
-v_cos_f16_e64 v5, -1
-// GFX11: v_cos_f16_e64 v5, -1                    ; encoding: [0x05,0x00,0xe1,0xd5,0xc1,0x00,0x00,0x00]
+v_cos_f16_e64 v5.l, -1
+// GFX11: v_cos_f16_e64 v5.l, -1                  ; encoding: [0x05,0x00,0xe1,0xd5,0xc1,0x00,0x00,0x00]
 
-v_cos_f16_e64 v5, 0.5 mul:2
-// GFX11: v_cos_f16_e64 v5, 0.5 mul:2             ; encoding: [0x05,0x00,0xe1,0xd5,0xf0,0x00,0x00,0x08]
+v_cos_f16_e64 v5.l, 0.5 mul:2
+// GFX11: v_cos_f16_e64 v5.l, 0.5 mul:2           ; encoding: [0x05,0x00,0xe1,0xd5,0xf0,0x00,0x00,0x08]
 
-v_cos_f16_e64 v5, src_scc mul:4
-// GFX11: v_cos_f16_e64 v5, src_scc mul:4         ; encoding: [0x05,0x00,0xe1,0xd5,0xfd,0x00,0x00,0x10]
+v_cos_f16_e64 v5.l, src_scc mul:4
+// GFX11: v_cos_f16_e64 v5.l, src_scc mul:4       ; encoding: [0x05,0x00,0xe1,0xd5,0xfd,0x00,0x00,0x10]
 
-v_cos_f16_e64 v255, -|0xfe0b| clamp div:2
-// GFX11: v_cos_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xe1,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00]
+v_cos_f16_e64 v255.l, -|0xfe0b| clamp div:2
+// GFX11: v_cos_f16_e64 v255.l, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xe1,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00]
+
+v_cos_f16_e64 v5.h, v1.h
+// GFX11: [0x05,0x48,0xe1,0xd5,0x01,0x01,0x00,0x00]
+
+v_cos_f16_e64 v5.l, v255.h
+// GFX11: [0x05,0x08,0xe1,0xd5,0xff,0x01,0x00,0x00]
+
+v_cos_f16_e64 v255.h, -|0xfe0b| clamp div:2
+// GFX11: [0xff,0xc1,0xe1,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00]
 
 v_cos_f32_e64 v5, v1
 // GFX11: v_cos_f32_e64 v5, v1                    ; encoding: [0x05,0x00,0xb6,0xd5,0x01,0x01,0x00,0x00]
@@ -1267,11 +1276,11 @@ v_cvt_i32_f64_e64 v5, -|src_scc|
 v_cvt_i32_f64_e64 v255, 0xaf123456 clamp
 // GFX11: v_cvt_i32_f64_e64 v255, 0xaf123456 clamp ; encoding: [0xff,0x80,0x83,0xd5,0xff,0x00,0x00,0x00,0x56,0x34,0x12,0xaf]
 
-v_cvt_i32_i16_e64 v5, v1
-// GFX11: v_cvt_i32_i16_e64 v5, v1                ; encoding: [0x05,0x00,0xea,0xd5,0x01,0x01,0x00,0x00]
+v_cvt_i32_i16_e64 v5, v1.l
+// GFX11: v_cvt_i32_i16_e64 v5, v1.l              ; encoding: [0x05,0x00,0xea,0xd5,0x01,0x01,0x00,0x00]
 
-v_cvt_i32_i16_e64 v5, v255
-// GFX11: v_cvt_i32_i16_e64 v5, v255              ; encoding: [0x05,0x00,0xea,0xd5,0xff,0x01,0x00,0x00]
+v_cvt_i32_i16_e64 v5, v255.l
+// GFX11: v_cvt_i32_i16_e64 v5, v255.l            ; encoding: [0x05,0x00,0xea,0xd5,0xff,0x01,0x00,0x00]
 
 v_cvt_i32_i16_e64 v5, s1
 // GFX11: v_cvt_i32_i16_e64 v5, s1                ; encoding: [0x05,0x00,0xea,0xd5,0x01,0x00,0x00,0x00]
@@ -1312,6 +1321,9 @@ v_cvt_i32_i16_e64 v5, src_scc
 v_cvt_i32_i16_e64 v255, 0xfe0b
 // GFX11: v_cvt_i32_i16_e64 v255, 0xfe0b          ; encoding: [0xff,0x00,0xea,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00]
 
+v_cvt_i32_i16_e64 v5, v255.h
+// GFX11: [0x05,0x08,0xea,0xd5,0xff,0x01,0x00,0x00]
+
 v_cvt_nearest_i32_f32_e64 v5, v1
 // GFX11: v_cvt_nearest_i32_f32_e64 v5, v1        ; encoding: [0x05,0x00,0x8c,0xd5,0x01,0x01,0x00,0x00]
 
@@ -1690,11 +1702,11 @@ v_cvt_u32_f64_e64 v5, -|src_scc|
 v_cvt_u32_f64_e64 v255, 0xaf123456 clamp
 // GFX11: v_cvt_u32_f64_e64 v255, 0xaf123456 clamp ; encoding: [0xff,0x80,0x95,0xd5,0xff,0x00,0x00,0x00,0x56,0x34,0x12,0xaf]
 
-v_cvt_u32_u16_e64 v5, v1
-// GFX11: v_cvt_u32_u16_e64 v5, v1                ; encoding: [0x05,0x00,0xeb,0xd5,0x01,0x01,0x00,0x00]
+v_cvt_u32_u16_e64 v5, v1.l
+// GFX11: v_cvt_u32_u16_e64 v5, v1.l              ; encoding: [0x05,0x00,0xeb,0xd5,0x01,0x01,0x00,0x00]
 
-v_cvt_u32_u16_e64 v5, v255
-// GFX11: v_cvt_u32_u16_e64 v5, v255              ; encoding: [0x05,0x00,0xeb,0xd5,0xff,0x01,0x00,0x00]
+v_cvt_u32_u16_e64 v5, v255.l
+// GFX11: v_cvt_u32_u16_e64 v5, v255.l            ; encoding: [0x05,0x00,0xeb,0xd5,0xff,0x01,0x00,0x00]
 
 v_cvt_u32_u16_e64 v5, s1
 // GFX11: v_cvt_u32_u16_e64 v5, s1                ; encoding: [0x05,0x00,0xeb,0xd5,0x01,0x00,0x00,0x00]
@@ -1735,6 +1747,9 @@ v_cvt_u32_u16_e64 v5, src_scc
 v_cvt_u32_u16_e64 v255, 0xfe0b
 // GFX11: v_cvt_u32_u16_e64 v255, 0xfe0b          ; encoding: [0xff,0x00,0xeb,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00]
 
+v_cvt_u32_u16_e64 v5, v255.h
+// GFX11: [0x05,0x08,0xeb,0xd5,0xff,0x01,0x00,0x00]
+
 v_exp_f16_e64 v5, v1
 // GFX11: v_exp_f16_e64 v5, v1                    ; encoding: [0x05,0x00,0xd8,0xd5,0x01,0x01,0x00,0x00]
 
@@ -2086,50 +2101,59 @@ v_floor_f64_e64 v[5:6], -|src_scc| mul:4
 v_floor_f64_e64 v[254:255], 0xaf123456 clamp div:2
 // GFX11: v_floor_f64_e64 v[254:255], 0xaf123456 clamp div:2 ; encoding: [0xfe,0x80,0x9a,0xd5,0xff,0x00,0x00,0x18,0x56,0x34,0x12,0xaf]
 
-v_fract_f16_e64 v5, v1
-// GFX11: v_fract_f16_e64 v5, v1                  ; encoding: [0x05,0x00,0xdf,0xd5,0x01,0x01,0x00,0x00]
+v_fract_f16_e64 v5.l, v1.l
+// GFX11: v_fract_f16_e64 v5.l, v1.l              ; encoding: [0x05,0x00,0xdf,0xd5,0x01,0x01,0x00,0x00]
+
+v_fract_f16_e64 v5.l, v255.l
+// GFX11: v_fract_f16_e64 v5.l, v255.l            ; encoding: [0x05,0x00,0xdf,0xd5,0xff,0x01,0x00,0x00]
 
-v_fract_f16_e64 v5, v255
-// GFX11: v_fract_f16_e64 v5, v255                ; encoding: [0x05,0x00,0xdf,0xd5,0xff,0x01,0x00,0x00]
+v_fract_f16_e64 v5.l, s1
+// GFX11: v_fract_f16_e64 v5.l, s1                ; encoding: [0x05,0x00,0xdf,0xd5,0x01,0x00,0x00,0x00]
 
-v_fract_f16_e64 v5, s1
-// GFX11: v_fract_f16_e64 v5, s1                  ; encoding: [0x05,0x00,0xdf,0xd5,0x01,0x00,0x00,0x00]
+v_fract_f16_e64 v5.l, s105
+// GFX11: v_fract_f16_e64 v5.l, s105              ; encoding: [0x05,0x00,0xdf,0xd5,0x69,0x00,0x00,0x00]
 
-v_fract_f16_e64 v5, s105
-// GFX11: v_fract_f16_e64 v5, s105                ; encoding: [0x05,0x00,0xdf,0xd5,0x69,0x00,0x00,0x00]
+v_fract_f16_e64 v5.l, vcc_lo
+// GFX11: v_fract_f16_e64 v5.l, vcc_lo            ; encoding: [0x05,0x00,0xdf,0xd5,0x6a,0x00,0x00,0x00]
 
-v_fract_f16_e64 v5, vcc_lo
-// GFX11: v_fract_f16_e64 v5, vcc_lo              ; encoding: [0x05,0x00,0xdf,0xd5,0x6a,0x00,0x00,0x00]
+v_fract_f16_e64 v5.l, vcc_hi
+// GFX11: v_fract_f16_e64 v5.l, vcc_hi            ; encoding: [0x05,0x00,0xdf,0xd5,0x6b,0x00,0x00,0x00]
 
-v_fract_f16_e64 v5, vcc_hi
-// GFX11: v_fract_f16_e64 v5, vcc_hi              ; encoding: [0x05,0x00,0xdf,0xd5,0x6b,0x00,0x00,0x00]
+v_fract_f16_e64 v5.l, ttmp15
+// GFX11: v_fract_f16_e64 v5.l, ttmp15            ; encoding: [0x05,0x00,0xdf,0xd5,0x7b,0x00,0x00,0x00]
 
-v_fract_f16_e64 v5, ttmp15
-// GFX11: v_fract_f16_e64 v5, ttmp15              ; encoding: [0x05,0x00,0xdf,0xd5,0x7b,0x00,0x00,0x00]
+v_fract_f16_e64 v5.l, m0
+// GFX11: v_fract_f16_e64 v5.l, m0                ; encoding: [0x05,0x00,0xdf,0xd5,0x7d,0x00,0x00,0x00]
 
-v_fract_f16_e64 v5, m0
-// GFX11: v_fract_f16_e64 v5, m0                  ; encoding: [0x05,0x00,0xdf,0xd5,0x7d,0x00,0x00,0x00]
+v_fract_f16_e64 v5.l, exec_lo
+// GFX11: v_fract_f16_e64 v5.l, exec_lo           ; encoding: [0x05,0x00,0xdf,0xd5,0x7e,0x00,0x00,0x00]
 
-v_fract_f16_e64 v5, exec_lo
-// GFX11: v_fract_f16_e64 v5, exec_lo             ; encoding: [0x05,0x00,0xdf,0xd5,0x7e,0x00,0x00,0x00]
+v_fract_f16_e64 v5.l, exec_hi
+// GFX11: v_fract_f16_e64 v5.l, exec_hi           ; encoding: [0x05,0x00,0xdf,0xd5,0x7f,0x00,0x00,0x00]
 
-v_fract_f16_e64 v5, exec_hi
-// GFX11: v_fract_f16_e64 v5, exec_hi             ; encoding: [0x05,0x00,0xdf,0xd5,0x7f,0x00,0x00,0x00]
+v_fract_f16_e64 v5.l, null
+// GFX11: v_fract_f16_e64 v5.l, null              ; encoding: [0x05,0x00,0xdf,0xd5,0x7c,0x00,0x00,0x00]
 
-v_fract_f16_e64 v5, null
-// GFX11: v_fract_f16_e64 v5, null                ; encoding: [0x05,0x00,0xdf,0xd5,0x7c,0x00,0x00,0x00]
+v_fract_f16_e64 v5.l, -1
+// GFX11: v_fract_f16_e64 v5.l, -1                ; encoding: [0x05,0x00,0xdf,0xd5,0xc1,0x00,0x00,0x00]
 
-v_fract_f16_e64 v5, -1
-// GFX11: v_fract_f16_e64 v5, -1                  ; encoding: [0x05,0x00,0xdf,0xd5,0xc1,0x00,0x00,0x00]
+v_fract_f16_e64 v5.l, 0.5 mul:2
+// GFX11: v_fract_f16_e64 v5.l, 0.5 mul:2         ; encoding: [0x05,0x00,0xdf,0xd5,0xf0,0x00,0x00,0x08]
 
-v_fract_f16_e64 v5, 0.5 mul:2
-// GFX11: v_fract_f16_e64 v5, 0.5 mul:2           ; encoding: [0x05,0x00,0xdf,0xd5,0xf0,0x00,0x00,0x08]
+v_fract_f16_e64 v5.l, src_scc mul:4
+// GFX11: v_fract_f16_e64 v5.l, src_scc mul:4     ; encoding: [0x05,0x00,0xdf,0xd5,0xfd,0x00,0x00,0x10]
 
-v_fract_f16_e64 v5, src_scc mul:4
-// GFX11: v_fract_f16_e64 v5, src_scc mul:4       ; encoding: [0x05,0x00,0xdf,0xd5,0xfd,0x00,0x00,0x10]
+v_fract_f16_e64 v255.l, -|0xfe0b| clamp div:2
+// GFX11: v_fract_f16_e64 v255.l, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xdf,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00]
 
-v_fract_f16_e64 v255, -|0xfe0b| clamp div:2
-// GFX11: v_fract_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xdf,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00]
+v_fract_f16_e64 v5.h, v1.h
+// GFX11: [0x05,0x48,0xdf,0xd5,0x01,0x01,0x00,0x00]
+
+v_fract_f16_e64 v5.l, v255.h
+// GFX11: [0x05,0x08,0xdf,0xd5,0xff,0x01,0x00,0x00]
+
+v_fract_f16_e64 v255.h, -|0xfe0b| clamp div:2
+// GFX11: [0xff,0xc1,0xdf,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00]
 
 v_fract_f32_e64 v5, v1
 // GFX11: v_fract_f32_e64 v5, v1                  ; encoding: [0x05,0x00,0xa0,0xd5,0x01,0x01,0x00,0x00]
@@ -2347,50 +2371,59 @@ v_frexp_exp_i32_f64_e64 v5, -|src_scc|
 v_frexp_exp_i32_f64_e64 v255, 0xaf123456
 // GFX11: v_frexp_exp_i32_f64_e64 v255, 0xaf123456 ; encoding: [0xff,0x00,0xbc,0xd5,0xff,0x00,0x00,0x00,0x56,0x34,0x12,0xaf]
 
-v_frexp_mant_f16_e64 v5, v1
-// GFX11: v_frexp_mant_f16_e64 v5, v1             ; encoding: [0x05,0x00,0xd9,0xd5,0x01,0x01,0x00,0x00]
+v_frexp_mant_f16_e64 v5.l, v1.l
+// GFX11: v_frexp_mant_f16_e64 v5.l, v1.l         ; encoding: [0x05,0x00,0xd9,0xd5,0x01,0x01,0x00,0x00]
+
+v_frexp_mant_f16_e64 v5.l, v255.l
+// GFX11: v_frexp_mant_f16_e64 v5.l, v255.l       ; encoding: [0x05,0x00,0xd9,0xd5,0xff,0x01,0x00,0x00]
 
-v_frexp_mant_f16_e64 v5, v255
-// GFX11: v_frexp_mant_f16_e64 v5, v255           ; encoding: [0x05,0x00,0xd9,0xd5,0xff,0x01,0x00,0x00]
+v_frexp_mant_f16_e64 v5.l, s1
+// GFX11: v_frexp_mant_f16_e64 v5.l, s1           ; encoding: [0x05,0x00,0xd9,0xd5,0x01,0x00,0x00,0x00]
 
-v_frexp_mant_f16_e64 v5, s1
-// GFX11: v_frexp_mant_f16_e64 v5, s1             ; encoding: [0x05,0x00,0xd9,0xd5,0x01,0x00,0x00,0x00]
+v_frexp_mant_f16_e64 v5.l, s105
+// GFX11: v_frexp_mant_f16_e64 v5.l, s105         ; encoding: [0x05,0x00,0xd9,0xd5,0x69,0x00,0x00,0x00]
 
-v_frexp_mant_f16_e64 v5, s105
-// GFX11: v_frexp_mant_f16_e64 v5, s105           ; encoding: [0x05,0x00,0xd9,0xd5,0x69,0x00,0x00,0x00]
+v_frexp_mant_f16_e64 v5.l, vcc_lo
+// GFX11: v_frexp_mant_f16_e64 v5.l, vcc_lo       ; encoding: [0x05,0x00,0xd9,0xd5,0x6a,0x00,0x00,0x00]
 
-v_frexp_mant_f16_e64 v5, vcc_lo
-// GFX11: v_frexp_mant_f16_e64 v5, vcc_lo         ; encoding: [0x05,0x00,0xd9,0xd5,0x6a,0x00,0x00,0x00]
+v_frexp_mant_f16_e64 v5.l, vcc_hi
+// GFX11: v_frexp_mant_f16_e64 v5.l, vcc_hi       ; encoding: [0x05,0x00,0xd9,0xd5,0x6b,0x00,0x00,0x00]
 
-v_frexp_mant_f16_e64 v5, vcc_hi
-// GFX11: v_frexp_mant_f16_e64 v5, vcc_hi         ; encoding: [0x05,0x00,0xd9,0xd5,0x6b,0x00,0x00,0x00]
+v_frexp_mant_f16_e64 v5.l, ttmp15
+// GFX11: v_frexp_mant_f16_e64 v5.l, ttmp15       ; encoding: [0x05,0x00,0xd9,0xd5,0x7b,0x00,0x00,0x00]
 
-v_frexp_mant_f16_e64 v5, ttmp15
-// GFX11: v_frexp_mant_f16_e64 v5, ttmp15         ; encoding: [0x05,0x00,0xd9,0xd5,0x7b,0x00,0x00,0x00]
+v_frexp_mant_f16_e64 v5.l, m0
+// GFX11: v_frexp_mant_f16_e64 v5.l, m0           ; encoding: [0x05,0x00,0xd9,0xd5,0x7d,0x00,0x00,0x00]
 
-v_frexp_mant_f16_e64 v5, m0
-// GFX11: v_frexp_mant_f16_e64 v5, m0             ; encoding: [0x05,0x00,0xd9,0xd5,0x7d,0x00,0x00,0x00]
+v_frexp_mant_f16_e64 v5.l, exec_lo
+// GFX11: v_frexp_mant_f16_e64 v5.l, exec_lo      ; encoding: [0x05,0x00,0xd9,0xd5,0x7e,0x00,0x00,0x00]
 
-v_frexp_mant_f16_e64 v5, exec_lo
-// GFX11: v_frexp_mant_f16_e64 v5, exec_lo        ; encoding: [0x05,0x00,0xd9,0xd5,0x7e,0x00,0x00,0x00]
+v_frexp_mant_f16_e64 v5.l, exec_hi
+// GFX11: v_frexp_mant_f16_e64 v5.l, exec_hi      ; encoding: [0x05,0x00,0xd9,0xd5,0x7f,0x00,0x00,0x00]
 
-v_frexp_mant_f16_e64 v5, exec_hi
-// GFX11: v_frexp_mant_f16_e64 v5, exec_hi        ; encoding: [0x05,0x00,0xd9,0xd5,0x7f,0x00,0x00,0x00]
+v_frexp_mant_f16_e64 v5.l, null
+// GFX11: v_frexp_mant_f16_e64 v5.l, null         ; encoding: [0x05,0x00,0xd9,0xd5,0x7c,0x00,0x00,0x00]
 
-v_frexp_mant_f16_e64 v5, null
-// GFX11: v_frexp_mant_f16_e64 v5, null           ; encoding: [0x05,0x00,0xd9,0xd5,0x7c,0x00,0x00,0x00]
+v_frexp_mant_f16_e64 v5.l, -1
+// GFX11: v_frexp_mant_f16_e64 v5.l, -1           ; encoding: [0x05,0x00,0xd9,0xd5,0xc1,0x00,0x00,0x00]
 
-v_frexp_mant_f16_e64 v5, -1
-// GFX11: v_frexp_mant_f16_e64 v5, -1             ; encoding: [0x05,0x00,0xd9,0xd5,0xc1,0x00,0x00,0x00]
+v_frexp_mant_f16_e64 v5.l, 0.5 mul:2
+// GFX11: v_frexp_mant_f16_e64 v5.l, 0.5 mul:2    ; encoding: [0x05,0x00,0xd9,0xd5,0xf0,0x00,0x00,0x08]
 
-v_frexp_mant_f16_e64 v5, 0.5 mul:2
-// GFX11: v_frexp_mant_f16_e64 v5, 0.5 mul:2      ; encoding: [0x05,0x00,0xd9,0xd5,0xf0,0x00,0x00,0x08]
+v_frexp_mant_f16_e64 v5.l, src_scc mul:4
+// GFX11: v_frexp_mant_f16_e64 v5.l, src_scc mul:4 ; encoding: [0x05,0x00,0xd9,0xd5,0xfd,0x00,0x00,0x10]
 
-v_frexp_mant_f16_e64 v5, src_scc mul:4
-// GFX11: v_frexp_mant_f16_e64 v5, src_scc mul:4  ; encoding: [0x05,0x00,0xd9,0xd5,0xfd,0x00,0x00,0x10]
+v_frexp_mant_f16_e64 v255.l, -|0xfe0b| clamp div:2
+// GFX11: v_frexp_mant_f16_e64 v255.l, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xd9,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00]
 
-v_frexp_mant_f16_e64 v255, -|0xfe0b| clamp div:2
-// GFX11: v_frexp_mant_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xd9,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00]
+v_frexp_mant_f16_e64 v5.h, v1.h
+// GFX11: [0x05,0x48,0xd9,0xd5,0x01,0x01,0x00,0x00]
+
+v_frexp_mant_f16_e64 v5.l, v255.h
+// GFX11: [0x05,0x08,0xd9,0xd5,0xff,0x01,0x00,0x00]
+
+v_frexp_mant_f16_e64 v255.h, -|0xfe0b| clamp div:2
+// GFX11: [0xff,0xc1,0xd9,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00]
 
 v_frexp_mant_f32_e64 v5, v1
 // GFX11: v_frexp_mant_f32_e64 v5, v1             ; encoding: [0x05,0x00,0xc0,0xd5,0x01,0x01,0x00,0x00]
@@ -2674,50 +2707,59 @@ v_movrelsd_b32_e64 v255, v255
 v_nop_e64
 // GFX11: v_nop                                   ; encoding: [0x00,0x00,0x80,0xd5,0x00,0x00,0x00,0x00]
 
-v_not_b16_e64 v5, v1
-// GFX11: v_not_b16_e64 v5, v1                    ; encoding: [0x05,0x00,0xe9,0xd5,0x01,0x01,0x00,0x00]
+v_not_b16_e64 v5.l, v1.l
+// GFX11: v_not_b16_e64 v5.l, v1.l                ; encoding: [0x05,0x00,0xe9,0xd5,0x01,0x01,0x00,0x00]
+
+v_not_b16_e64 v5.l, v255.l
+// GFX11: v_not_b16_e64 v5.l, v255.l              ; encoding: [0x05,0x00,0xe9,0xd5,0xff,0x01,0x00,0x00]
 
-v_not_b16_e64 v5, v255
-// GFX11: v_not_b16_e64 v5, v255                  ; encoding: [0x05,0x00,0xe9,0xd5,0xff,0x01,0x00,0x00]
+v_not_b16_e64 v5.l, s1
+// GFX11: v_not_b16_e64 v5.l, s1                  ; encoding: [0x05,0x00,0xe9,0xd5,0x01,0x00,0x00,0x00]
 
-v_not_b16_e64 v5, s1
-// GFX11: v_not_b16_e64 v5, s1                    ; encoding: [0x05,0x00,0xe9,0xd5,0x01,0x00,0x00,0x00]
+v_not_b16_e64 v5.l, s105
+// GFX11: v_not_b16_e64 v5.l, s105                ; encoding: [0x05,0x00,0xe9,0xd5,0x69,0x00,0x00,0x00]
 
-v_not_b16_e64 v5, s105
-// GFX11: v_not_b16_e64 v5, s105                  ; encoding: [0x05,0x00,0xe9,0xd5,0x69,0x00,0x00,0x00]
+v_not_b16_e64 v5.l, vcc_lo
+// GFX11: v_not_b16_e64 v5.l, vcc_lo              ; encoding: [0x05,0x00,0xe9,0xd5,0x6a,0x00,0x00,0x00]
 
-v_not_b16_e64 v5, vcc_lo
-// GFX11: v_not_b16_e64 v5, vcc_lo                ; encoding: [0x05,0x00,0xe9,0xd5,0x6a,0x00,0x00,0x00]
+v_not_b16_e64 v5.l, vcc_hi
+// GFX11: v_not_b16_e64 v5.l, vcc_hi              ; encoding: [0x05,0x00,0xe9,0xd5,0x6b,0x00,0x00,0x00]
 
-v_not_b16_e64 v5, vcc_hi
-// GFX11: v_not_b16_e64 v5, vcc_hi                ; encoding: [0x05,0x00,0xe9,0xd5,0x6b,0x00,0x00,0x00]
+v_not_b16_e64 v5.l, ttmp15
+// GFX11: v_not_b16_e64 v5.l, ttmp15              ; encoding: [0x05,0x00,0xe9,0xd5,0x7b,0x00,0x00,0x00]
 
-v_not_b16_e64 v5, ttmp15
-// GFX11: v_not_b16_e64 v5, ttmp15                ; encoding: [0x05,0x00,0xe9,0xd5,0x7b,0x00,0x00,0x00]
+v_not_b16_e64 v5.l, m0
+// GFX11: v_not_b16_e64 v5.l, m0                  ; encoding: [0x05,0x00,0xe9,0xd5,0x7d,0x00,0x00,0x00]
 
-v_not_b16_e64 v5, m0
-// GFX11: v_not_b16_e64 v5, m0                    ; encoding: [0x05,0x00,0xe9,0xd5,0x7d,0x00,0x00,0x00]
+v_not_b16_e64 v5.l, exec_lo
+// GFX11: v_not_b16_e64 v5.l, exec_lo             ; encoding: [0x05,0x00,0xe9,0xd5,0x7e,0x00,0x00,0x00]
 
-v_not_b16_e64 v5, exec_lo
-// GFX11: v_not_b16_e64 v5, exec_lo               ; encoding: [0x05,0x00,0xe9,0xd5,0x7e,0x00,0x00,0x00]
+v_not_b16_e64 v5.l, exec_hi
+// GFX11: v_not_b16_e64 v5.l, exec_hi             ; encoding: [0x05,0x00,0xe9,0xd5,0x7f,0x00,0x00,0x00]
 
-v_not_b16_e64 v5, exec_hi
-// GFX11: v_not_b16_e64 v5, exec_hi               ; encoding: [0x05,0x00,0xe9,0xd5,0x7f,0x00,0x00,0x00]
+v_not_b16_e64 v5.l, null
+// GFX11: v_not_b16_e64 v5.l, null                ; encoding: [0x05,0x00,0xe9,0xd5,0x7c,0x00,0x00,0x00]
 
-v_not_b16_e64 v5, null
-// GFX11: v_not_b16_e64 v5, null                  ; encoding: [0x05,0x00,0xe9,0xd5,0x7c,0x00,0x00,0x00]
+v_not_b16_e64 v5.l, -1
+// GFX11: v_not_b16_e64 v5.l, -1                  ; encoding: [0x05,0x00,0xe9,0xd5,0xc1,0x00,0x00,0x00]
 
-v_not_b16_e64 v5, -1
-// GFX11: v_not_b16_e64 v5, -1                    ; encoding: [0x05,0x00,0xe9,0xd5,0xc1,0x00,0x00,0x00]
+v_not_b16_e64 v5.l, 0.5
+// GFX11: v_not_b16_e64 v5.l, 0.5                 ; encoding: [0x05,0x00,0xe9,0xd5,0xf0,0x00,0x00,0x00]
 
-v_not_b16_e64 v5, 0.5
-// GFX11: v_not_b16_e64 v5, 0.5                   ; encoding: [0x05,0x00,0xe9,0xd5,0xf0,0x00,0x00,0x00]
+v_not_b16_e64 v5.l, src_scc
+// GFX11: v_not_b16_e64 v5.l, src_scc             ; encoding: [0x05,0x00,0xe9,0xd5,0xfd,0x00,0x00,0x00]
 
-v_not_b16_e64 v5, src_scc
-// GFX11: v_not_b16_e64 v5, src_scc               ; encoding: [0x05,0x00,0xe9,0xd5,0xfd,0x00,0x00,0x00]
+v_not_b16_e64 v255.l, 0xfe0b
+// GFX11: v_not_b16_e64 v255.l, 0xfe0b            ; encoding: [0xff,0x00,0xe9,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00]
 
-v_not_b16_e64 v255, 0xfe0b
-// GFX11: v_not_b16_e64 v255, 0xfe0b              ; encoding: [0xff,0x00,0xe9,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00]
+v_not_b16_e64 v5.h, v1.h
+// GFX11: [0x05,0x48,0xe9,0xd5,0x01,0x01,0x00,0x00]
+
+v_not_b16_e64 v5.l, v255.h
+// GFX11: [0x05,0x08,0xe9,0xd5,0xff,0x01,0x00,0x00]
+
+v_not_b16_e64 v255.h, 0xfe0b
+// GFX11: [0xff,0x40,0xe9,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00]
 
 v_not_b32_e64 v5, v1
 // GFX11: v_not_b32_e64 v5, v1                    ; encoding: [0x05,0x00,0xb7,0xd5,0x01,0x01,0x00,0x00]
@@ -2938,50 +2980,59 @@ v_rcp_iflag_f32_e64 v5, src_scc mul:4
 v_rcp_iflag_f32_e64 v255, -|0xaf123456| clamp div:2
 // GFX11: v_rcp_iflag_f32_e64 v255, -|0xaf123456| clamp div:2 ; encoding: [0xff,0x81,0xab,0xd5,0xff,0x00,0x00,0x38,0x56,0x34,0x12,0xaf]
 
-v_rndne_f16_e64 v5, v1
-// GFX11: v_rndne_f16_e64 v5, v1                  ; encoding: [0x05,0x00,0xde,0xd5,0x01,0x01,0x00,0x00]
+v_rndne_f16_e64 v5.l, v1.l
+// GFX11: v_rndne_f16_e64 v5.l, v1.l              ; encoding: [0x05,0x00,0xde,0xd5,0x01,0x01,0x00,0x00]
+
+v_rndne_f16_e64 v5.l, v255.l
+// GFX11: v_rndne_f16_e64 v5.l, v255.l            ; encoding: [0x05,0x00,0xde,0xd5,0xff,0x01,0x00,0x00]
 
-v_rndne_f16_e64 v5, v255
-// GFX11: v_rndne_f16_e64 v5, v255                ; encoding: [0x05,0x00,0xde,0xd5,0xff,0x01,0x00,0x00]
+v_rndne_f16_e64 v5.l, s1
+// GFX11: v_rndne_f16_e64 v5.l, s1                ; encoding: [0x05,0x00,0xde,0xd5,0x01,0x00,0x00,0x00]
 
-v_rndne_f16_e64 v5, s1
-// GFX11: v_rndne_f16_e64 v5, s1                  ; encoding: [0x05,0x00,0xde,0xd5,0x01,0x00,0x00,0x00]
+v_rndne_f16_e64 v5.l, s105
+// GFX11: v_rndne_f16_e64 v5.l, s105              ; encoding: [0x05,0x00,0xde,0xd5,0x69,0x00,0x00,0x00]
 
-v_rndne_f16_e64 v5, s105
-// GFX11: v_rndne_f16_e64 v5, s105                ; encoding: [0x05,0x00,0xde,0xd5,0x69,0x00,0x00,0x00]
+v_rndne_f16_e64 v5.l, vcc_lo
+// GFX11: v_rndne_f16_e64 v5.l, vcc_lo            ; encoding: [0x05,0x00,0xde,0xd5,0x6a,0x00,0x00,0x00]
 
-v_rndne_f16_e64 v5, vcc_lo
-// GFX11: v_rndne_f16_e64 v5, vcc_lo              ; encoding: [0x05,0x00,0xde,0xd5,0x6a,0x00,0x00,0x00]
+v_rndne_f16_e64 v5.l, vcc_hi
+// GFX11: v_rndne_f16_e64 v5.l, vcc_hi            ; encoding: [0x05,0x00,0xde,0xd5,0x6b,0x00,0x00,0x00]
 
-v_rndne_f16_e64 v5, vcc_hi
-// GFX11: v_rndne_f16_e64 v5, vcc_hi              ; encoding: [0x05,0x00,0xde,0xd5,0x6b,0x00,0x00,0x00]
+v_rndne_f16_e64 v5.l, ttmp15
+// GFX11: v_rndne_f16_e64 v5.l, ttmp15            ; encoding: [0x05,0x00,0xde,0xd5,0x7b,0x00,0x00,0x00]
 
-v_rndne_f16_e64 v5, ttmp15
-// GFX11: v_rndne_f16_e64 v5, ttmp15              ; encoding: [0x05,0x00,0xde,0xd5,0x7b,0x00,0x00,0x00]
+v_rndne_f16_e64 v5.l, m0
+// GFX11: v_rndne_f16_e64 v5.l, m0                ; encoding: [0x05,0x00,0xde,0xd5,0x7d,0x00,0x00,0x00]
 
-v_rndne_f16_e64 v5, m0
-// GFX11: v_rndne_f16_e64 v5, m0                  ; encoding: [0x05,0x00,0xde,0xd5,0x7d,0x00,0x00,0x00]
+v_rndne_f16_e64 v5.l, exec_lo
+// GFX11: v_rndne_f16_e64 v5.l, exec_lo           ; encoding: [0x05,0x00,0xde,0xd5,0x7e,0x00,0x00,0x00]
 
-v_rndne_f16_e64 v5, exec_lo
-// GFX11: v_rndne_f16_e64 v5, exec_lo             ; encoding: [0x05,0x00,0xde,0xd5,0x7e,0x00,0x00,0x00]
+v_rndne_f16_e64 v5.l, exec_hi
+// GFX11: v_rndne_f16_e64 v5.l, exec_hi           ; encoding: [0x05,0x00,0xde,0xd5,0x7f,0x00,0x00,0x00]
 
-v_rndne_f16_e64 v5, exec_hi
-// GFX11: v_rndne_f16_e64 v5, exec_hi             ; encoding: [0x05,0x00,0xde,0xd5,0x7f,0x00,0x00,0x00]
+v_rndne_f16_e64 v5.l, null
+// GFX11: v_rndne_f16_e64 v5.l, null              ; encoding: [0x05,0x00,0xde,0xd5,0x7c,0x00,0x00,0x00]
 
-v_rndne_f16_e64 v5, null
-// GFX11: v_rndne_f16_e64 v5, null                ; encoding: [0x05,0x00,0xde,0xd5,0x7c,0x00,0x00,0x00]
+v_rndne_f16_e64 v5.l, -1
+// GFX11: v_rndne_f16_e64 v5.l, -1                ; encoding: [0x05,0x00,0xde,0xd5,0xc1,0x00,0x00,0x00]
 
-v_rndne_f16_e64 v5, -1
-// GFX11: v_rndne_f16_e64 v5, -1                  ; encoding: [0x05,0x00,0xde,0xd5,0xc1,0x00,0x00,0x00]
+v_rndne_f16_e64 v5.l, 0.5 mul:2
+// GFX11: v_rndne_f16_e64 v5.l, 0.5 mul:2         ; encoding: [0x05,0x00,0xde,0xd5,0xf0,0x00,0x00,0x08]
 
-v_rndne_f16_e64 v5, 0.5 mul:2
-// GFX11: v_rndne_f16_e64 v5, 0.5 mul:2           ; encoding: [0x05,0x00,0xde,0xd5,0xf0,0x00,0x00,0x08]
+v_rndne_f16_e64 v5.l, src_scc mul:4
+// GFX11: v_rndne_f16_e64 v5.l, src_scc mul:4     ; encoding: [0x05,0x00,0xde,0xd5,0xfd,0x00,0x00,0x10]
 
-v_rndne_f16_e64 v5, src_scc mul:4
-// GFX11: v_rndne_f16_e64 v5, src_scc mul:4       ; encoding: [0x05,0x00,0xde,0xd5,0xfd,0x00,0x00,0x10]
+v_rndne_f16_e64 v255.l, -|0xfe0b| clamp div:2
+// GFX11: v_rndne_f16_e64 v255.l, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xde,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00]
 
-v_rndne_f16_e64 v255, -|0xfe0b| clamp div:2
-// GFX11: v_rndne_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xde,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00]
+v_rndne_f16_e64 v5.h, v1.h
+// GFX11: [0x05,0x48,0xde,0xd5,0x01,0x01,0x00,0x00]
+
+v_rndne_f16_e64 v5.l, v255.h
+// GFX11: [0x05,0x08,0xde,0xd5,0xff,0x01,0x00,0x00]
+
+v_rndne_f16_e64 v255.h, -|0xfe0b| clamp div:2
+// GFX11: [0xff,0xc1,0xde,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00]
 
 v_rndne_f32_e64 v5, v1
 // GFX11: v_rndne_f32_e64 v5, v1                  ; encoding: [0x05,0x00,0xa3,0xd5,0x01,0x01,0x00,0x00]
@@ -3190,95 +3241,107 @@ v_rsq_f64_e64 v[5:6], -|src_scc| mul:4
 v_rsq_f64_e64 v[254:255], 0xaf123456 clamp div:2
 // GFX11: v_rsq_f64_e64 v[254:255], 0xaf123456 clamp div:2 ; encoding: [0xfe,0x80,0xb1,0xd5,0xff,0x00,0x00,0x18,0x56,0x34,0x12,0xaf]
 
-v_sat_pk_u8_i16_e64 v5, v1
-// GFX11: v_sat_pk_u8_i16_e64 v5, v1              ; encoding: [0x05,0x00,0xe2,0xd5,0x01,0x01,0x00,0x00]
+v_sat_pk_u8_i16_e64 v5.l, v1
+// GFX11: v_sat_pk_u8_i16_e64 v5.l, v1            ; encoding: [0x05,0x00,0xe2,0xd5,0x01,0x01,0x00,0x00]
+
+v_sat_pk_u8_i16_e64 v5.l, v255
+// GFX11: v_sat_pk_u8_i16_e64 v5.l, v255          ; encoding: [0x05,0x00,0xe2,0xd5,0xff,0x01,0x00,0x00]
 
-v_sat_pk_u8_i16_e64 v5, v255
-// GFX11: v_sat_pk_u8_i16_e64 v5, v255            ; encoding: [0x05,0x00,0xe2,0xd5,0xff,0x01,0x00,0x00]
+v_sat_pk_u8_i16_e64 v5.l, s1
+// GFX11: v_sat_pk_u8_i16_e64 v5.l, s1            ; encoding: [0x05,0x00,0xe2,0xd5,0x01,0x00,0x00,0x00]
 
-v_sat_pk_u8_i16_e64 v5, s1
-// GFX11: v_sat_pk_u8_i16_e64 v5, s1              ; encoding: [0x05,0x00,0xe2,0xd5,0x01,0x00,0x00,0x00]
+v_sat_pk_u8_i16_e64 v5.l, s105
+// GFX11: v_sat_pk_u8_i16_e64 v5.l, s105          ; encoding: [0x05,0x00,0xe2,0xd5,0x69,0x00,0x00,0x00]
 
-v_sat_pk_u8_i16_e64 v5, s105
-// GFX11: v_sat_pk_u8_i16_e64 v5, s105            ; encoding: [0x05,0x00,0xe2,0xd5,0x69,0x00,0x00,0x00]
+v_sat_pk_u8_i16_e64 v5.l, vcc_lo
+// GFX11: v_sat_pk_u8_i16_e64 v5.l, vcc_lo        ; encoding: [0x05,0x00,0xe2,0xd5,0x6a,0x00,0x00,0x00]
 
-v_sat_pk_u8_i16_e64 v5, vcc_lo
-// GFX11: v_sat_pk_u8_i16_e64 v5, vcc_lo          ; encoding: [0x05,0x00,0xe2,0xd5,0x6a,0x00,0x00,0x00]
+v_sat_pk_u8_i16_e64 v5.l, vcc_hi
+// GFX11: v_sat_pk_u8_i16_e64 v5.l, vcc_hi        ; encoding: [0x05,0x00,0xe2,0xd5,0x6b,0x00,0x00,0x00]
 
-v_sat_pk_u8_i16_e64 v5, vcc_hi
-// GFX11: v_sat_pk_u8_i16_e64 v5, vcc_hi          ; encoding: [0x05,0x00,0xe2,0xd5,0x6b,0x00,0x00,0x00]
+v_sat_pk_u8_i16_e64 v5.l, ttmp15
+// GFX11: v_sat_pk_u8_i16_e64 v5.l, ttmp15        ; encoding: [0x05,0x00,0xe2,0xd5,0x7b,0x00,0x00,0x00]
 
-v_sat_pk_u8_i16_e64 v5, ttmp15
-// GFX11: v_sat_pk_u8_i16_e64 v5, ttmp15          ; encoding: [0x05,0x00,0xe2,0xd5,0x7b,0x00,0x00,0x00]
+v_sat_pk_u8_i16_e64 v5.l, m0
+// GFX11: v_sat_pk_u8_i16_e64 v5.l, m0            ; encoding: [0x05,0x00,0xe2,0xd5,0x7d,0x00,0x00,0x00]
 
-v_sat_pk_u8_i16_e64 v5, m0
-// GFX11: v_sat_pk_u8_i16_e64 v5, m0              ; encoding: [0x05,0x00,0xe2,0xd5,0x7d,0x00,0x00,0x00]
+v_sat_pk_u8_i16_e64 v5.l, exec_lo
+// GFX11: v_sat_pk_u8_i16_e64 v5.l, exec_lo       ; encoding: [0x05,0x00,0xe2,0xd5,0x7e,0x00,0x00,0x00]
 
-v_sat_pk_u8_i16_e64 v5, exec_lo
-// GFX11: v_sat_pk_u8_i16_e64 v5, exec_lo         ; encoding: [0x05,0x00,0xe2,0xd5,0x7e,0x00,0x00,0x00]
+v_sat_pk_u8_i16_e64 v5.l, exec_hi
+// GFX11: v_sat_pk_u8_i16_e64 v5.l, exec_hi       ; encoding: [0x05,0x00,0xe2,0xd5,0x7f,0x00,0x00,0x00]
 
-v_sat_pk_u8_i16_e64 v5, exec_hi
-// GFX11: v_sat_pk_u8_i16_e64 v5, exec_hi         ; encoding: [0x05,0x00,0xe2,0xd5,0x7f,0x00,0x00,0x00]
+v_sat_pk_u8_i16_e64 v5.l, null
+// GFX11: v_sat_pk_u8_i16_e64 v5.l, null          ; encoding: [0x05,0x00,0xe2,0xd5,0x7c,0x00,0x00,0x00]
 
-v_sat_pk_u8_i16_e64 v5, null
-// GFX11: v_sat_pk_u8_i16_e64 v5, null            ; encoding: [0x05,0x00,0xe2,0xd5,0x7c,0x00,0x00,0x00]
+v_sat_pk_u8_i16_e64 v5.l, -1
+// GFX11: v_sat_pk_u8_i16_e64 v5.l, -1            ; encoding: [0x05,0x00,0xe2,0xd5,0xc1,0x00,0x00,0x00]
 
-v_sat_pk_u8_i16_e64 v5, -1
-// GFX11: v_sat_pk_u8_i16_e64 v5, -1              ; encoding: [0x05,0x00,0xe2,0xd5,0xc1,0x00,0x00,0x00]
+v_sat_pk_u8_i16_e64 v5.l, 0.5
+// GFX11: v_sat_pk_u8_i16_e64 v5.l, 0.5           ; encoding: [0x05,0x00,0xe2,0xd5,0xf0,0x00,0x00,0x00]
 
-v_sat_pk_u8_i16_e64 v5, 0.5
-// GFX11: v_sat_pk_u8_i16_e64 v5, 0.5             ; encoding: [0x05,0x00,0xe2,0xd5,0xf0,0x00,0x00,0x00]
+v_sat_pk_u8_i16_e64 v5.l, src_scc
+// GFX11: v_sat_pk_u8_i16_e64 v5.l, src_scc       ; encoding: [0x05,0x00,0xe2,0xd5,0xfd,0x00,0x00,0x00]
 
-v_sat_pk_u8_i16_e64 v5, src_scc
-// GFX11: v_sat_pk_u8_i16_e64 v5, src_scc         ; encoding: [0x05,0x00,0xe2,0xd5,0xfd,0x00,0x00,0x00]
+v_sat_pk_u8_i16_e64 v255.l, 0xfe0b
+// GFX11: v_sat_pk_u8_i16_e64 v255.l, 0xfe0b      ; encoding: [0xff,0x00,0xe2,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00]
 
-v_sat_pk_u8_i16_e64 v255, 0xfe0b
-// GFX11: v_sat_pk_u8_i16_e64 v255, 0xfe0b        ; encoding: [0xff,0x00,0xe2,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00]
+v_sat_pk_u8_i16_e64 v255.h, 0xfe0b
+// GFX11: [0xff,0x40,0xe2,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00]
 
-v_sin_f16_e64 v5, v1
-// GFX11: v_sin_f16_e64 v5, v1                    ; encoding: [0x05,0x00,0xe0,0xd5,0x01,0x01,0x00,0x00]
+v_sin_f16_e64 v5.l, v1.l
+// GFX11: v_sin_f16_e64 v5.l, v1.l                ; encoding: [0x05,0x00,0xe0,0xd5,0x01,0x01,0x00,0x00]
 
-v_sin_f16_e64 v5, v255
-// GFX11: v_sin_f16_e64 v5, v255                  ; encoding: [0x05,0x00,0xe0,0xd5,0xff,0x01,0x00,0x00]
+v_sin_f16_e64 v5.l, v255.l
+// GFX11: v_sin_f16_e64 v5.l, v255.l              ; encoding: [0x05,0x00,0xe0,0xd5,0xff,0x01,0x00,0x00]
 
-v_sin_f16_e64 v5, s1
-// GFX11: v_sin_f16_e64 v5, s1                    ; encoding: [0x05,0x00,0xe0,0xd5,0x01,0x00,0x00,0x00]
+v_sin_f16_e64 v5.l, s1
+// GFX11: v_sin_f16_e64 v5.l, s1                  ; encoding: [0x05,0x00,0xe0,0xd5,0x01,0x00,0x00,0x00]
 
-v_sin_f16_e64 v5, s105
-// GFX11: v_sin_f16_e64 v5, s105                  ; encoding: [0x05,0x00,0xe0,0xd5,0x69,0x00,0x00,0x00]
+v_sin_f16_e64 v5.l, s105
+// GFX11: v_sin_f16_e64 v5.l, s105                ; encoding: [0x05,0x00,0xe0,0xd5,0x69,0x00,0x00,0x00]
 
-v_sin_f16_e64 v5, vcc_lo
-// GFX11: v_sin_f16_e64 v5, vcc_lo                ; encoding: [0x05,0x00,0xe0,0xd5,0x6a,0x00,0x00,0x00]
+v_sin_f16_e64 v5.l, vcc_lo
+// GFX11: v_sin_f16_e64 v5.l, vcc_lo              ; encoding: [0x05,0x00,0xe0,0xd5,0x6a,0x00,0x00,0x00]
 
-v_sin_f16_e64 v5, vcc_hi
-// GFX11: v_sin_f16_e64 v5, vcc_hi                ; encoding: [0x05,0x00,0xe0,0xd5,0x6b,0x00,0x00,0x00]
+v_sin_f16_e64 v5.l, vcc_hi
+// GFX11: v_sin_f16_e64 v5.l, vcc_hi              ; encoding: [0x05,0x00,0xe0,0xd5,0x6b,0x00,0x00,0x00]
 
-v_sin_f16_e64 v5, ttmp15
-// GFX11: v_sin_f16_e64 v5, ttmp15                ; encoding: [0x05,0x00,0xe0,0xd5,0x7b,0x00,0x00,0x00]
+v_sin_f16_e64 v5.l, ttmp15
+// GFX11: v_sin_f16_e64 v5.l, ttmp15              ; encoding: [0x05,0x00,0xe0,0xd5,0x7b,0x00,0x00,0x00]
 
-v_sin_f16_e64 v5, m0
-// GFX11: v_sin_f16_e64 v5, m0                    ; encoding: [0x05,0x00,0xe0,0xd5,0x7d,0x00,0x00,0x00]
+v_sin_f16_e64 v5.l, m0
+// GFX11: v_sin_f16_e64 v5.l, m0                  ; encoding: [0x05,0x00,0xe0,0xd5,0x7d,0x00,0x00,0x00]
 
-v_sin_f16_e64 v5, exec_lo
-// GFX11: v_sin_f16_e64 v5, exec_lo               ; encoding: [0x05,0x00,0xe0,0xd5,0x7e,0x00,0x00,0x00]
+v_sin_f16_e64 v5.l, exec_lo
+// GFX11: v_sin_f16_e64 v5.l, exec_lo             ; encoding: [0x05,0x00,0xe0,0xd5,0x7e,0x00,0x00,0x00]
 
-v_sin_f16_e64 v5, exec_hi
-// GFX11: v_sin_f16_e64 v5, exec_hi               ; encoding: [0x05,0x00,0xe0,0xd5,0x7f,0x00,0x00,0x00]
+v_sin_f16_e64 v5.l, exec_hi
+// GFX11: v_sin_f16_e64 v5.l, exec_hi             ; encoding: [0x05,0x00,0xe0,0xd5,0x7f,0x00,0x00,0x00]
 
-v_sin_f16_e64 v5, null
-// GFX11: v_sin_f16_e64 v5, null                  ; encoding: [0x05,0x00,0xe0,0xd5,0x7c,0x00,0x00,0x00]
+v_sin_f16_e64 v5.l, null
+// GFX11: v_sin_f16_e64 v5.l, null                ; encoding: [0x05,0x00,0xe0,0xd5,0x7c,0x00,0x00,0x00]
 
-v_sin_f16_e64 v5, -1
-// GFX11: v_sin_f16_e64 v5, -1                    ; encoding: [0x05,0x00,0xe0,0xd5,0xc1,0x00,0x00,0x00]
+v_sin_f16_e64 v5.l, -1
+// GFX11: v_sin_f16_e64 v5.l, -1                  ; encoding: [0x05,0x00,0xe0,0xd5,0xc1,0x00,0x00,0x00]
 
-v_sin_f16_e64 v5, 0.5 mul:2
-// GFX11: v_sin_f16_e64 v5, 0.5 mul:2             ; encoding: [0x05,0x00,0xe0,0xd5,0xf0,0x00,0x00,0x08]
+v_sin_f16_e64 v5.l, 0.5 mul:2
+// GFX11: v_sin_f16_e64 v5.l, 0.5 mul:2           ; encoding: [0x05,0x00,0xe0,0xd5,0xf0,0x00,0x00,0x08]
 
-v_sin_f16_e64 v5, src_scc mul:4
-// GFX11: v_sin_f16_e64 v5, src_scc mul:4         ; encoding: [0x05,0x00,0xe0,0xd5,0xfd,0x00,0x00,0x10]
+v_sin_f16_e64 v5.l, src_scc mul:4
+// GFX11: v_sin_f16_e64 v5.l, src_scc mul:4       ; encoding: [0x05,0x00,0xe0,0xd5,0xfd,0x00,0x00,0x10]
 
-v_sin_f16_e64 v255, -|0xfe0b| clamp div:2
-// GFX11: v_sin_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xe0,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00]
+v_sin_f16_e64 v255.l, -|0xfe0b| clamp div:2
+// GFX11: v_sin_f16_e64 v255.l, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xe0,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00]
+
+v_sin_f16_e64 v5.h, v1.h
+// GFX11: [0x05,0x48,0xe0,0xd5,0x01,0x01,0x00,0x00]
+
+v_sin_f16_e64 v5.l, v255.h
+// GFX11: [0x05,0x08,0xe0,0xd5,0xff,0x01,0x00,0x00]
+
+v_sin_f16_e64 v255.h, -|0xfe0b| clamp div:2
+// GFX11: [0xff,0xc1,0xe0,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00]
 
 v_sin_f32_e64 v5, v1
 // GFX11: v_sin_f32_e64 v5, v1                    ; encoding: [0x05,0x00,0xb5,0xd5,0x01,0x01,0x00,0x00]
@@ -3451,50 +3514,59 @@ v_sqrt_f64_e64 v[5:6], -|src_scc| mul:4
 v_sqrt_f64_e64 v[254:255], 0xaf123456 clamp div:2
 // GFX11: v_sqrt_f64_e64 v[254:255], 0xaf123456 clamp div:2 ; encoding: [0xfe,0x80,0xb4,0xd5,0xff,0x00,0x00,0x18,0x56,0x34,0x12,0xaf]
 
-v_trunc_f16_e64 v5, v1
-// GFX11: v_trunc_f16_e64 v5, v1                  ; encoding: [0x05,0x00,0xdd,0xd5,0x01,0x01,0x00,0x00]
+v_trunc_f16_e64 v5.l, v1.l
+// GFX11: v_trunc_f16_e64 v5.l, v1.l              ; encoding: [0x05,0x00,0xdd,0xd5,0x01,0x01,0x00,0x00]
+
+v_trunc_f16_e64 v5.l, v255.l
+// GFX11: v_trunc_f16_e64 v5.l, v255.l            ; encoding: [0x05,0x00,0xdd,0xd5,0xff,0x01,0x00,0x00]
+
+v_trunc_f16_e64 v5.l, s1
+// GFX11: v_trunc_f16_e64 v5.l, s1                ; encoding: [0x05,0x00,0xdd,0xd5,0x01,0x00,0x00,0x00]
+
+v_trunc_f16_e64 v5.l, s105
+// GFX11: v_trunc_f16_e64 v5.l, s105              ; encoding: [0x05,0x00,0xdd,0xd5,0x69,0x00,0x00,0x00]
 
-v_trunc_f16_e64 v5, v255
-// GFX11: v_trunc_f16_e64 v5, v255                ; encoding: [0x05,0x00,0xdd,0xd5,0xff,0x01,0x00,0x00]
+v_trunc_f16_e64 v5.l, vcc_lo
+// GFX11: v_trunc_f16_e64 v5.l, vcc_lo            ; encoding: [0x05,0x00,0xdd,0xd5,0x6a,0x00,0x00,0x00]
 
-v_trunc_f16_e64 v5, s1
-// GFX11: v_trunc_f16_e64 v5, s1                  ; encoding: [0x05,0x00,0xdd,0xd5,0x01,0x00,0x00,0x00]
+v_trunc_f16_e64 v5.l, vcc_hi
+// GFX11: v_trunc_f16_e64 v5.l, vcc_hi            ; encoding: [0x05,0x00,0xdd,0xd5,0x6b,0x00,0x00,0x00]
 
-v_trunc_f16_e64 v5, s105
-// GFX11: v_trunc_f16_e64 v5, s105                ; encoding: [0x05,0x00,0xdd,0xd5,0x69,0x00,0x00,0x00]
+v_trunc_f16_e64 v5.l, ttmp15
+// GFX11: v_trunc_f16_e64 v5.l, ttmp15            ; encoding: [0x05,0x00,0xdd,0xd5,0x7b,0x00,0x00,0x00]
 
-v_trunc_f16_e64 v5, vcc_lo
-// GFX11: v_trunc_f16_e64 v5, vcc_lo              ; encoding: [0x05,0x00,0xdd,0xd5,0x6a,0x00,0x00,0x00]
+v_trunc_f16_e64 v5.l, m0
+// GFX11: v_trunc_f16_e64 v5.l, m0                ; encoding: [0x05,0x00,0xdd,0xd5,0x7d,0x00,0x00,0x00]
 
-v_trunc_f16_e64 v5, vcc_hi
-// GFX11: v_trunc_f16_e64 v5, vcc_hi              ; encoding: [0x05,0x00,0xdd,0xd5,0x6b,0x00,0x00,0x00]
+v_trunc_f16_e64 v5.l, exec_lo
+// GFX11: v_trunc_f16_e64 v5.l, exec_lo           ; encoding: [0x05,0x00,0xdd,0xd5,0x7e,0x00,0x00,0x00]
 
-v_trunc_f16_e64 v5, ttmp15
-// GFX11: v_trunc_f16_e64 v5, ttmp15              ; encoding: [0x05,0x00,0xdd,0xd5,0x7b,0x00,0x00,0x00]
+v_trunc_f16_e64 v5.l, exec_hi
+// GFX11: v_trunc_f16_e64 v5.l, exec_hi           ; encoding: [0x05,0x00,0xdd,0xd5,0x7f,0x00,0x00,0x00]
 
-v_trunc_f16_e64 v5, m0
-// GFX11: v_trunc_f16_e64 v5, m0                  ; encoding: [0x05,0x00,0xdd,0xd5,0x7d,0x00,0x00,0x00]
+v_trunc_f16_e64 v5.l, null
+// GFX11: v_trunc_f16_e64 v5.l, null              ; encoding: [0x05,0x00,0xdd,0xd5,0x7c,0x00,0x00,0x00]
 
-v_trunc_f16_e64 v5, exec_lo
-// GFX11: v_trunc_f16_e64 v5, exec_lo             ; encoding: [0x05,0x00,0xdd,0xd5,0x7e,0x00,0x00,0x00]
+v_trunc_f16_e64 v5.l, -1
+// GFX11: v_trunc_f16_e64 v5.l, -1                ; encoding: [0x05,0x00,0xdd,0xd5,0xc1,0x00,0x00,0x00]
 
-v_trunc_f16_e64 v5, exec_hi
-// GFX11: v_trunc_f16_e64 v5, exec_hi             ; encoding: [0x05,0x00,0xdd,0xd5,0x7f,0x00,0x00,0x00]
+v_trunc_f16_e64 v5.l, 0.5 mul:2
+// GFX11: v_trunc_f16_e64 v5.l, 0.5 mul:2         ; encoding: [0x05,0x00,0xdd,0xd5,0xf0,0x00,0x00,0x08]
 
-v_trunc_f16_e64 v5, null
-// GFX11: v_trunc_f16_e64 v5, null                ; encoding: [0x05,0x00,0xdd,0xd5,0x7c,0x00,0x00,0x00]
+v_trunc_f16_e64 v5.l, src_scc mul:4
+// GFX11: v_trunc_f16_e64 v5.l, src_scc mul:4     ; encoding: [0x05,0x00,0xdd,0xd5,0xfd,0x00,0x00,0x10]
 
-v_trunc_f16_e64 v5, -1
-// GFX11: v_trunc_f16_e64 v5, -1                  ; encoding: [0x05,0x00,0xdd,0xd5,0xc1,0x00,0x00,0x00]
+v_trunc_f16_e64 v255.l, -|0xfe0b| clamp div:2
+// GFX11: v_trunc_f16_e64 v255.l, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xdd,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00]
 
-v_trunc_f16_e64 v5, 0.5 mul:2
-// GFX11: v_trunc_f16_e64 v5, 0.5 mul:2           ; encoding: [0x05,0x00,0xdd,0xd5,0xf0,0x00,0x00,0x08]
+v_trunc_f16_e64 v5.h, v1.h
+// GFX11: [0x05,0x48,0xdd,0xd5,0x01,0x01,0x00,0x00]
 
-v_trunc_f16_e64 v5, src_scc mul:4
-// GFX11: v_trunc_f16_e64 v5, src_scc mul:4       ; encoding: [0x05,0x00,0xdd,0xd5,0xfd,0x00,0x00,0x10]
+v_trunc_f16_e64 v5.l, v255.h
+// GFX11: [0x05,0x08,0xdd,0xd5,0xff,0x01,0x00,0x00]
 
-v_trunc_f16_e64 v255, -|0xfe0b| clamp div:2
-// GFX11: v_trunc_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xdd,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00]
+v_trunc_f16_e64 v255.h, -|0xfe0b| clamp div:2
+// GFX11: [0xff,0xc1,0xdd,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00]
 
 v_trunc_f32_e64 v5, v1
 // GFX11: v_trunc_f32_e64 v5, v1                  ; encoding: [0x05,0x00,0xa1,0xd5,0x01,0x01,0x00,0x00]
diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_mimg_err.s b/llvm/test/MC/AMDGPU/gfx12_asm_mimg_err.s
index a0d11c9..ee82fa3 100644
--- a/llvm/test/MC/AMDGPU/gfx12_asm_mimg_err.s
+++ b/llvm/test/MC/AMDGPU/gfx12_asm_mimg_err.s
@@ -255,3 +255,127 @@ image_store_pck v5, v1, s[8:15] dmask:0x1 th:TH_STORE_NT
 
 image_store_mip_pck v5, [v0, v1], s[8:15] dmask:0x1 th:TH_STORE_NT
 // NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: missing dim operand
+
+// null is not allowed as SRSRC or SSAMP
+image_atomic_add v1, v[10:11], null dmask:0x1 dim:SQ_RSRC_IMG_2D
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+image_atomic_and v1, v[10:11], null dmask:0x1 dim:SQ_RSRC_IMG_2D
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+image_atomic_cmpswap v[0:1], v[10:11], null dmask:0x3 dim:SQ_RSRC_IMG_2D
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+image_atomic_dec v1, v[10:11], null dmask:0x1 dim:SQ_RSRC_IMG_2D
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+image_atomic_inc v1, v[10:11], null dmask:0x1 dim:SQ_RSRC_IMG_2D
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+image_atomic_or v1, v[10:11], null dmask:0x1 dim:SQ_RSRC_IMG_2D
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+image_atomic_smax v1, v[10:11], null dmask:0x1 dim:SQ_RSRC_IMG_2D
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+image_atomic_smin v1, v[10:11], null dmask:0x1 dim:SQ_RSRC_IMG_2D
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+image_atomic_sub v1, v[10:11], null dmask:0x1 dim:SQ_RSRC_IMG_2D
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+image_atomic_swap v1, v[10:11], null dmask:0x1 dim:SQ_RSRC_IMG_2D
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+image_atomic_umax v1, v[10:11], null dmask:0x1 dim:SQ_RSRC_IMG_2D
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+image_atomic_umin v1, v[10:11], null dmask:0x1 dim:SQ_RSRC_IMG_2D
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+image_atomic_xor v1, v[10:11], null dmask:0x1 dim:SQ_RSRC_IMG_2D
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+image_gather4 v[64:67], v32, null, s[4:11], dmask:0x1 dim:SQ_RSRC_IMG_1D
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+image_gather4 v[64:67], v32, s[4:11], null dmask:0x1 dim:SQ_RSRC_IMG_1D
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+image_gather4_b v[64:67], [v32, v33], null, s[100:103] dmask:0x1 dim:SQ_RSRC_IMG_1D
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+image_gather4_b v[64:67], [v32, v33], s[4:11], null dmask:0x1 dim:SQ_RSRC_IMG_1D
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+image_gather4_c v[64:67], [v32, v33], null, s[100:103] dmask:0x1 dim:SQ_RSRC_IMG_1D
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+image_gather4_c v[64:67], [v32, v33], s[4:11], null dmask:0x1 dim:SQ_RSRC_IMG_1D
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+image_gather4h v[64:67], v32, null, s[100:103] dmask:0x1 dim:SQ_RSRC_IMG_1D
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+image_gather4h v[64:67], v32, s[4:11], null dmask:0x1 dim:SQ_RSRC_IMG_1D
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+image_gather4_l v[64:67], v[32:33], null, s[100:103] dmask:0x1 dim:SQ_RSRC_IMG_1D
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+image_gather4_l v[64:67], v[32:33], s[4:11], null dmask:0x1 dim:SQ_RSRC_IMG_1D
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+image_gather4_o v[64:67], [v32, v33], null, s[100:103] dmask:0x1 dim:SQ_RSRC_IMG_1D
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+image_gather4_o v[64:67], [v32, v33], s[4:11], null dmask:0x1 dim:SQ_RSRC_IMG_1D
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+image_load v[4:7], v0, null dmask:0xf dim:SQ_RSRC_IMG_1D
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+image_store v[0:3], v[254:255], null dmask:0xf dim:SQ_RSRC_IMG_2D
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+image_sample v[5:6], v1, null, s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_1D
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+image_sample v[5:6], v1, s[8:15], null dmask:0x3 dim:SQ_RSRC_IMG_1D
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+image_sample_b v[5:6], [v1, v2], null, s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_1D
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+image_sample_b v[5:6], [v1, v2], s[8:15], null dmask:0x3 dim:SQ_RSRC_IMG_1D
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+image_sample_c v[5:6], [v1, v2], null, s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_1D
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+image_sample_c v[5:6], [v1, v2], s[8:15], null dmask:0x3 dim:SQ_RSRC_IMG_1D
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+image_sample_d v[5:6], [v1, v2, v3], null, s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_1D
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+image_sample_d v[5:6], [v1, v2, v3], s[8:15], null dmask:0x3 dim:SQ_RSRC_IMG_1D
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+image_sample_l v[5:6], v[1:2], null, s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_1D
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+image_sample_l v[5:6], v[1:2], s[8:15], null dmask:0x3 dim:SQ_RSRC_IMG_1D
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+image_sample_o v[5:6], [v1, v2], null, s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_1D
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+image_sample_o v[5:6], [v1, v2], s[8:15], null dmask:0x3 dim:SQ_RSRC_IMG_1D
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+image_bvh_intersect_ray v[4:7], [v9, v10, v[11:13], v[14:16], v[17:19]], null
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+image_bvh64_intersect_ray v[4:7], [v[9:10], v11, v[12:14], v[15:17], v[18:20]], null
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_smem.s b/llvm/test/MC/AMDGPU/gfx12_asm_smem.s
index 668f767..2ef0274 100644
--- a/llvm/test/MC/AMDGPU/gfx12_asm_smem.s
+++ b/llvm/test/MC/AMDGPU/gfx12_asm_smem.s
@@ -541,6 +541,25 @@ s_load_b512 s[20:35], s[2:3], m0
 s_load_b512 s[20:35], s[2:3], 0x0
 // GFX12: s_load_b512 s[20:35], s[2:3], 0x0       ; encoding: [0x01,0x85,0x00,0xf4,0x00,0x00,0x00,0xf8]
 
+// null as dst
+s_load_b32 null, s[2:3], s0 offset:0x0
+// GFX12: encoding: [0x01,0x1f,0x00,0xf4,0x00,0x00,0x00,0x00]
+
+s_load_b64 null, s[2:3], s0 offset:0x0
+// GFX12: encoding: [0x01,0x3f,0x00,0xf4,0x00,0x00,0x00,0x00]
+
+s_load_b96 null, s[2:3], s0 offset:0x0
+// GFX12: encoding: [0x01,0xbf,0x00,0xf4,0x00,0x00,0x00,0x00]
+
+s_load_b128 null, s[2:3], s0 offset:0x0
+// GFX12: encoding: [0x01,0x5f,0x00,0xf4,0x00,0x00,0x00,0x00]
+
+s_load_b256 null, s[2:3], s0 offset:0x0
+// GFX12: encoding: [0x01,0x7f,0x00,0xf4,0x00,0x00,0x00,0x00]
+
+s_load_b512 null, s[2:3], s0 offset:0x0
+// GFX12: encoding: [0x01,0x9f,0x00,0xf4,0x00,0x00,0x00,0x00]
+
 s_buffer_load_b32 s5, s[4:7], s0
 // GFX12: s_buffer_load_b32 s5, s[4:7], s0 offset:0x0 ; encoding: [0x42,0x01,0x02,0xf4,0x00,0x00,0x00,0x00]
 
diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_smem_err.s b/llvm/test/MC/AMDGPU/gfx12_asm_smem_err.s
new file mode 100644
index 0000000..49d7c72
--- /dev/null
+++ b/llvm/test/MC/AMDGPU/gfx12_asm_smem_err.s
@@ -0,0 +1,49 @@
+// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 %s 2>&1 | FileCheck --check-prefixes=NOGFX12 --implicit-check-not=error: %s
+
+s_buffer_load_b32 s4, null, s101
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+s_buffer_load_b64 s4, null, s101
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+s_buffer_load_b128 s4, null, s101
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+s_buffer_load_b256 s4, null, s101
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+s_buffer_load_b512 s4, null, s101
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+s_buffer_load_dword s4, null, s101
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+s_buffer_load_dwordx2 s[4:5], null, s101
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+s_buffer_load_dwordx4 s[4:7], null, s101
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+s_buffer_load_dwordx8 s[4:11], null, s101
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+s_buffer_load_dwordx16 s[4:19], null, s101
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+s_atc_probe_buffer 7, null, s2
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+s_buffer_prefetch_data null, 100, s10, 7
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+s_buffer_load_i8 s5, null, s0
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+s_buffer_load_u8 s5, null, s0
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+s_buffer_load_i16 s5, null, s0
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+s_buffer_load_u16 s5, null, s0
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vbuffer_mtbuf_err.s b/llvm/test/MC/AMDGPU/gfx12_asm_vbuffer_mtbuf_err.s
new file mode 100644
index 0000000..040119c
--- /dev/null
+++ b/llvm/test/MC/AMDGPU/gfx12_asm_vbuffer_mtbuf_err.s
@@ -0,0 +1,49 @@
+// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 %s 2>&1 | FileCheck --check-prefixes=NOGFX12 --implicit-check-not=error: %s
+
+tbuffer_load_format_d16_x v3, v0, null, s1 offen offset:4095
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+tbuffer_load_format_d16_xy v[3:4], v0, null, s1 offen offset:4095
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+tbuffer_load_format_d16_xyz v[3:5], v0, null, s1 offen offset:4095
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+tbuffer_load_format_d16_xyzw v[3:6], v0, null, s1 offen offset:4095
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+tbuffer_load_format_x v3, v0, null, s1 offen offset:4095
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+tbuffer_load_format_xy v[3:4], v0, null, s1 offen offset:4095
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+tbuffer_load_format_xyz v[3:5], v0, null, s1 offen offset:4095
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+tbuffer_load_format_xyzw v[3:6], v0, null, s1 offen offset:4095
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+tbuffer_store_format_d16_x v3, v0, null, s1 offen offset:4095
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+tbuffer_store_format_d16_xy v[3:4], v0, null, s1 offen offset:4095
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+tbuffer_store_format_d16_xyz v[3:5], v0, null, s1 offen offset:4095
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+tbuffer_store_format_d16_xyzw v[3:6], v0, null, s1 offen offset:4095
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+tbuffer_store_format_x v3, v0, null, s1 offen offset:4095
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+tbuffer_store_format_xy v[3:4], v0, null, s1 offen offset:4095
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+tbuffer_store_format_xyz v[3:5], v0, null, s1 offen offset:4095
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+tbuffer_store_format_xyzw v[3:6], v0, null, s1 offen offset:4095
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vbuffer_mubuf_err.s b/llvm/test/MC/AMDGPU/gfx12_asm_vbuffer_mubuf_err.s
new file mode 100644
index 0000000..2c9ce7a
--- /dev/null
+++ b/llvm/test/MC/AMDGPU/gfx12_asm_vbuffer_mubuf_err.s
@@ -0,0 +1,220 @@
+// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 %s 2>&1 | FileCheck --check-prefixes=NOGFX12 --implicit-check-not=error: %s
+
+buffer_atomic_add_f32 v5, v0, null, s3 idxen
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_atomic_add_u32 v5, v0, null, s3 idxen
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_atomic_add_u64 v[5:6], v0, null, s3 idxen
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_atomic_and_b32 v5, v0, null, s3 idxen
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_atomic_and_b64 v[5:6], v0, null, s3 idxen
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_atomic_cmpswap_b32 v[5:6], v0, null, s3 idxen
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_atomic_cmpswap_b64 v[5:8], v0, null, s3 idxen
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_atomic_cond_sub_u32 v5, v0, null, s3 idxen
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_atomic_dec_u32 v5, v0, null, s3 idxen
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_atomic_dec_u64 v[5:6], v0, null, s3 idxen
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_atomic_inc_u32 v5, v0, null, s3 idxen
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_atomic_inc_u64 v[5:6], v0, null, s3 idxen
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_atomic_max_i32 v5, v0, null, s3 idxen
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_atomic_max_i64 v[5:6], v0, null, s3 idxen
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_atomic_max_num_f32 v5, v0, null, s3 idxen
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_atomic_max_u32 v5, v0, null, s3 idxen
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_atomic_max_u64 v[5:6], v0, null, s3 idxen
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_atomic_min_i32 v5, v0, null, s3 idxen
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_atomic_min_i64 v[5:6], v0, null, s3 idxen
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_atomic_min_u32 v5, v0, null, s3 idxen
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_atomic_min_u64 v[5:6], v0, null, s3 idxen
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_atomic_min_num_f32 v5, v0, null, s3 idxen
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_atomic_or_b32 v5, v0, null, s3 idxen
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_atomic_or_b64 v[5:6], v0, null, s3 idxen
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_atomic_pk_add_bf16 v5, v0, null, s3 idxen
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_atomic_pk_add_f16 v5, v0, null, s3 idxen
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_atomic_sub_clamp_u32 v5, v0, null, s3 idxen
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_atomic_sub_u32 v5, v0, null, s3 idxen
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_atomic_sub_u64 v[5:6], v0, null, s3 idxen
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_atomic_swap_b32 v5, v0, null, s3 idxen
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_atomic_swap_b64 v[5:6], v0, null, s3 idxen
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_atomic_xor_b32 v5, v0, null, s3 idxen
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_atomic_xor_b64 v[5:6], v0, null, s3 idxen
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_load_b128 v[5:8], v0, null, s3 offen offset:4095
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_load_b32 v5, v0, null, s3 offen offset:4095
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_store_b64 v[1:2], v0, null, s4 idxen offset:4095
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_store_b96 v[1:3], v0, null, s4 idxen offset:4095
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_load_d16_b16 v5, v0, null, s3 offen offset:4095
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_load_d16_format_x v3, v0, null, s1 offen offset:4095
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_load_d16_format_xy v3, v0, null, s1 offen offset:4095
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_load_d16_format_xyz v[3:4], v0, null, s1 offen offset:4095
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_load_d16_format_xyzw v[3:4], v0, null, s1 offen offset:4095
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_load_d16_hi_b16 v3, v0, null, s1 offen offset:4095
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_load_d16_hi_format_x v3, v0, null, s1 offen offset:4095
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_load_d16_hi_i8 v3, v0, null, s1 offen offset:4095
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_load_d16_hi_u8 v3, v0, null, s1 offen offset:4095
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_load_d16_i8 v3, v0, null, s1 offen offset:4095
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_load_d16_u8 v3, v0, null, s1 offen offset:4095
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_load_format_x v3, v0, null, s1 offen offset:4095
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_load_format_xy v[3:4], v0, null, s1 offen offset:4095
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_load_format_xyz v[3:5], v0, null, s1 offen offset:4095
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_load_format_xyzw v[3:6], v0, null, s1 offen offset:4095
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_load_i16 v3, v0, null, s1 offen offset:4095
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_load_i8 v3, v0, null, s1 offen offset:4095
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_load_u16 v3, v0, null, s1 offen offset:4095
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_load_u8 v3, v0, null, s1 offen offset:4095
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_store_b128 v[3:6], v0, null, s1 offen offset:4095
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_store_b16 v3, v0, null, s1 offen offset:4095
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_store_b32 v3, v0, null, s1 offen offset:4095
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_store_b64 v[3:4], v0, null, s1 offen offset:4095
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_store_b8 v3, v0, null, s1 offen offset:4095
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_store_b96 v[3:5], v0, null, s1 offen offset:4095
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_store_d16_format_x v3, v0, null, s1 offen offset:4095
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_store_d16_format_xy v3, v0, null, s1 offen offset:4095
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_store_d16_format_xyz v[3:4], v0, null, s1 offen offset:4095
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_store_d16_format_xyzw v[3:4], v0, null, s1 offen offset:4095
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_store_d16_hi_b16 v3, v0, null, s1 offen offset:4095
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_store_d16_hi_b8 v3, v0, null, s1 offen offset:4095
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_store_d16_hi_format_x v3, v0, null, s1 offen offset:4095
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_store_format_x v1, v0, null, s1 offen offset:4095
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_store_format_xy v[1:2], v0, null, s1 offen offset:4095
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_store_format_xyz v[1:3], v0, null, s1 offen offset:4095
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_store_format_xyzw v[1:4], v0, null, s1 offen offset:4095
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop1.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop1.s
index edd3b91..e21e5bf 100644
--- a/llvm/test/MC/AMDGPU/gfx12_asm_vop1.s
+++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop1.s
@@ -265,50 +265,62 @@ v_clz_i32_u32 v5, src_scc
 v_clz_i32_u32 v255, 0xaf123456
 // GFX12: v_clz_i32_u32_e32 v255, 0xaf123456 ; encoding: [0xff,0x72,0xfe,0x7f,0x56,0x34,0x12,0xaf]
 
-v_cos_f16 v5, v1
-// GFX12: v_cos_f16_e32 v5, v1 ; encoding: [0x01,0xc3,0x0a,0x7e]
+v_cos_f16 v5.l, v1.l
+// GFX12: v_cos_f16_e32 v5.l, v1.l ; encoding: [0x01,0xc3,0x0a,0x7e]
 
-v_cos_f16 v5, v127
-// GFX12: v_cos_f16_e32 v5, v127 ; encoding: [0x7f,0xc3,0x0a,0x7e]
+v_cos_f16 v5.l, v127.l
+// GFX12: v_cos_f16_e32 v5.l, v127.l ; encoding: [0x7f,0xc3,0x0a,0x7e]
 
-v_cos_f16 v5, s1
-// GFX12: v_cos_f16_e32 v5, s1 ; encoding: [0x01,0xc2,0x0a,0x7e]
+v_cos_f16 v5.l, s1
+// GFX12: v_cos_f16_e32 v5.l, s1 ; encoding: [0x01,0xc2,0x0a,0x7e]
 
-v_cos_f16 v5, s105
-// GFX12: v_cos_f16_e32 v5, s105 ; encoding: [0x69,0xc2,0x0a,0x7e]
+v_cos_f16 v5.l, s105
+// GFX12: v_cos_f16_e32 v5.l, s105 ; encoding: [0x69,0xc2,0x0a,0x7e]
 
-v_cos_f16 v5, vcc_lo
-// GFX12: v_cos_f16_e32 v5, vcc_lo ; encoding: [0x6a,0xc2,0x0a,0x7e]
+v_cos_f16 v5.l, vcc_lo
+// GFX12: v_cos_f16_e32 v5.l, vcc_lo ; encoding: [0x6a,0xc2,0x0a,0x7e]
 
-v_cos_f16 v5, vcc_hi
-// GFX12: v_cos_f16_e32 v5, vcc_hi ; encoding: [0x6b,0xc2,0x0a,0x7e]
+v_cos_f16 v5.l, vcc_hi
+// GFX12: v_cos_f16_e32 v5.l, vcc_hi ; encoding: [0x6b,0xc2,0x0a,0x7e]
 
-v_cos_f16 v5, ttmp15
-// GFX12: v_cos_f16_e32 v5, ttmp15 ; encoding: [0x7b,0xc2,0x0a,0x7e]
+v_cos_f16 v5.l, ttmp15
+// GFX12: v_cos_f16_e32 v5.l, ttmp15 ; encoding: [0x7b,0xc2,0x0a,0x7e]
 
-v_cos_f16 v5, m0
-// GFX12: v_cos_f16_e32 v5, m0 ; encoding: [0x7d,0xc2,0x0a,0x7e]
+v_cos_f16 v5.l, m0
+// GFX12: v_cos_f16_e32 v5.l, m0 ; encoding: [0x7d,0xc2,0x0a,0x7e]
 
-v_cos_f16 v5, exec_lo
-// GFX12: v_cos_f16_e32 v5, exec_lo ; encoding: [0x7e,0xc2,0x0a,0x7e]
+v_cos_f16 v5.l, exec_lo
+// GFX12: v_cos_f16_e32 v5.l, exec_lo ; encoding: [0x7e,0xc2,0x0a,0x7e]
 
-v_cos_f16 v5, exec_hi
-// GFX12: v_cos_f16_e32 v5, exec_hi ; encoding: [0x7f,0xc2,0x0a,0x7e]
+v_cos_f16 v5.l, exec_hi
+// GFX12: v_cos_f16_e32 v5.l, exec_hi ; encoding: [0x7f,0xc2,0x0a,0x7e]
 
-v_cos_f16 v5, null
-// GFX12: v_cos_f16_e32 v5, null ; encoding: [0x7c,0xc2,0x0a,0x7e]
+v_cos_f16 v5.l, null
+// GFX12: v_cos_f16_e32 v5.l, null ; encoding: [0x7c,0xc2,0x0a,0x7e]
 
-v_cos_f16 v5, -1
-// GFX12: v_cos_f16_e32 v5, -1 ; encoding: [0xc1,0xc2,0x0a,0x7e]
+v_cos_f16 v5.l, -1
+// GFX12: v_cos_f16_e32 v5.l, -1 ; encoding: [0xc1,0xc2,0x0a,0x7e]
 
-v_cos_f16 v5, 0.5
-// GFX12: v_cos_f16_e32 v5, 0.5 ; encoding: [0xf0,0xc2,0x0a,0x7e]
+v_cos_f16 v5.l, 0.5
+// GFX12: v_cos_f16_e32 v5.l, 0.5 ; encoding: [0xf0,0xc2,0x0a,0x7e]
 
-v_cos_f16 v5, src_scc
-// GFX12: v_cos_f16_e32 v5, src_scc ; encoding: [0xfd,0xc2,0x0a,0x7e]
+v_cos_f16 v5.l, src_scc
+// GFX12: v_cos_f16_e32 v5.l, src_scc ; encoding: [0xfd,0xc2,0x0a,0x7e]
 
-v_cos_f16 v127, 0xfe0b
-// GFX12: v_cos_f16_e32 v127, 0xfe0b ; encoding: [0xff,0xc2,0xfe,0x7e,0x0b,0xfe,0x00,0x00]
+v_cos_f16 v127.l, 0xfe0b
+// GFX12: v_cos_f16_e32 v127.l, 0xfe0b ; encoding: [0xff,0xc2,0xfe,0x7e,0x0b,0xfe,0x00,0x00]
+
+v_cos_f16 v5.l, v1.h
+// GFX12: v_cos_f16_e32 v5.l, v1.h ; encoding: [0x81,0xc3,0x0a,0x7e]
+
+v_cos_f16 v5.l, v127.h
+// GFX12: v_cos_f16_e32 v5.l, v127.h ; encoding: [0xff,0xc3,0x0a,0x7e]
+
+v_cos_f16 v5.h, src_scc
+// GFX12: v_cos_f16_e32 v5.h, src_scc ; encoding: [0xfd,0xc2,0x0a,0x7f]
+
+v_cos_f16 v127.h, 0xfe0b
+// GFX12: v_cos_f16_e32 v127.h, 0xfe0b ; encoding: [0xff,0xc2,0xfe,0x7f,0x0b,0xfe,0x00,0x00]
 
 v_cos_f32 v5, v1
 // GFX12: v_cos_f32_e32 v5, v1 ; encoding: [0x01,0x6d,0x0a,0x7e]
@@ -1338,11 +1350,11 @@ v_cvt_i32_f64 v5, src_scc
 v_cvt_i32_f64 v255, 0xaf123456
 // GFX12: v_cvt_i32_f64_e32 v255, 0xaf123456 ; encoding: [0xff,0x06,0xfe,0x7f,0x56,0x34,0x12,0xaf]
 
-v_cvt_i32_i16 v5, v1
-// GFX12: v_cvt_i32_i16_e32 v5, v1 ; encoding: [0x01,0xd5,0x0a,0x7e]
+v_cvt_i32_i16 v5, v1.l
+// GFX12: v_cvt_i32_i16_e32 v5, v1.l ; encoding: [0x01,0xd5,0x0a,0x7e]
 
-v_cvt_i32_i16 v5, v127
-// GFX12: v_cvt_i32_i16_e32 v5, v127 ; encoding: [0x7f,0xd5,0x0a,0x7e]
+v_cvt_i32_i16 v5, v127.l
+// GFX12: v_cvt_i32_i16_e32 v5, v127.l ; encoding: [0x7f,0xd5,0x0a,0x7e]
 
 v_cvt_i32_i16 v5, s1
 // GFX12: v_cvt_i32_i16_e32 v5, s1 ; encoding: [0x01,0xd4,0x0a,0x7e]
@@ -1384,6 +1396,12 @@ v_cvt_i32_i16 v5, src_scc
 v_cvt_i32_i16 v255, 0xfe0b
 // GFX12: v_cvt_i32_i16_e32 v255, 0xfe0b ; encoding: [0xff,0xd4,0xfe,0x7f,0x0b,0xfe,0x00,0x00]
 
+v_cvt_i32_i16 v5, v1.h
+// GFX12: v_cvt_i32_i16_e32 v5, v1.h ; encoding: [0x81,0xd5,0x0a,0x7e]
+
+v_cvt_i32_i16 v5, v127.h
+// GFX12: v_cvt_i32_i16_e32 v5, v127.h ; encoding: [0xff,0xd5,0x0a,0x7e]
+
 v_cvt_nearest_i32_f32 v5, v1
 // GFX12: v_cvt_nearest_i32_f32_e32 v5, v1 ; encoding: [0x01,0x19,0x0a,0x7e]
 
@@ -1771,11 +1789,11 @@ v_cvt_u32_f64 v5, src_scc
 v_cvt_u32_f64 v255, 0xaf123456
 // GFX12: v_cvt_u32_f64_e32 v255, 0xaf123456 ; encoding: [0xff,0x2a,0xfe,0x7f,0x56,0x34,0x12,0xaf]
 
-v_cvt_u32_u16 v5, v1
-// GFX12: v_cvt_u32_u16_e32 v5, v1 ; encoding: [0x01,0xd7,0x0a,0x7e]
+v_cvt_u32_u16 v5, v1.l
+// GFX12: v_cvt_u32_u16_e32 v5, v1.l ; encoding: [0x01,0xd7,0x0a,0x7e]
 
-v_cvt_u32_u16 v5, v127
-// GFX12: v_cvt_u32_u16_e32 v5, v127 ; encoding: [0x7f,0xd7,0x0a,0x7e]
+v_cvt_u32_u16 v5, v127.l
+// GFX12: v_cvt_u32_u16_e32 v5, v127.l ; encoding: [0x7f,0xd7,0x0a,0x7e]
 
 v_cvt_u32_u16 v5, s1
 // GFX12: v_cvt_u32_u16_e32 v5, s1 ; encoding: [0x01,0xd6,0x0a,0x7e]
@@ -1817,6 +1835,12 @@ v_cvt_u32_u16 v5, src_scc
 v_cvt_u32_u16 v255, 0xfe0b
 // GFX12: v_cvt_u32_u16_e32 v255, 0xfe0b ; encoding: [0xff,0xd6,0xfe,0x7f,0x0b,0xfe,0x00,0x00]
 
+v_cvt_u32_u16 v5, v1.h
+// GFX12: v_cvt_u32_u16_e32 v5, v1.h ; encoding: [0x81,0xd7,0x0a,0x7e]
+
+v_cvt_u32_u16 v5, v127.h
+// GFX12: v_cvt_u32_u16_e32 v5, v127.h ; encoding: [0xff,0xd7,0x0a,0x7e]
+
 v_exp_f16 v5.l, v1.l
 // GFX12: v_exp_f16_e32 v5.l, v1.l ; encoding: [0x01,0xb1,0x0a,0x7e]
 
@@ -2168,50 +2192,62 @@ v_floor_f64 v[5:6], src_scc
 v_floor_f64 v[254:255], 0xaf123456
 // GFX12: v_floor_f64_e32 v[254:255], 0xaf123456 ; encoding: [0xff,0x34,0xfc,0x7f,0x56,0x34,0x12,0xaf]
 
-v_fract_f16 v5, v1
-// GFX12: v_fract_f16_e32 v5, v1 ; encoding: [0x01,0xbf,0x0a,0x7e]
+v_fract_f16 v5.l, v1.l
+// GFX12: v_fract_f16_e32 v5.l, v1.l ; encoding: [0x01,0xbf,0x0a,0x7e]
+
+v_fract_f16 v5.l, v127.l
+// GFX12: v_fract_f16_e32 v5.l, v127.l ; encoding: [0x7f,0xbf,0x0a,0x7e]
 
-v_fract_f16 v5, v127
-// GFX12: v_fract_f16_e32 v5, v127 ; encoding: [0x7f,0xbf,0x0a,0x7e]
+v_fract_f16 v5.l, s1
+// GFX12: v_fract_f16_e32 v5.l, s1 ; encoding: [0x01,0xbe,0x0a,0x7e]
 
-v_fract_f16 v5, s1
-// GFX12: v_fract_f16_e32 v5, s1 ; encoding: [0x01,0xbe,0x0a,0x7e]
+v_fract_f16 v5.l, s105
+// GFX12: v_fract_f16_e32 v5.l, s105 ; encoding: [0x69,0xbe,0x0a,0x7e]
 
-v_fract_f16 v5, s105
-// GFX12: v_fract_f16_e32 v5, s105 ; encoding: [0x69,0xbe,0x0a,0x7e]
+v_fract_f16 v5.l, vcc_lo
+// GFX12: v_fract_f16_e32 v5.l, vcc_lo ; encoding: [0x6a,0xbe,0x0a,0x7e]
 
-v_fract_f16 v5, vcc_lo
-// GFX12: v_fract_f16_e32 v5, vcc_lo ; encoding: [0x6a,0xbe,0x0a,0x7e]
+v_fract_f16 v5.l, vcc_hi
+// GFX12: v_fract_f16_e32 v5.l, vcc_hi ; encoding: [0x6b,0xbe,0x0a,0x7e]
 
-v_fract_f16 v5, vcc_hi
-// GFX12: v_fract_f16_e32 v5, vcc_hi ; encoding: [0x6b,0xbe,0x0a,0x7e]
+v_fract_f16 v5.l, ttmp15
+// GFX12: v_fract_f16_e32 v5.l, ttmp15 ; encoding: [0x7b,0xbe,0x0a,0x7e]
 
-v_fract_f16 v5, ttmp15
-// GFX12: v_fract_f16_e32 v5, ttmp15 ; encoding: [0x7b,0xbe,0x0a,0x7e]
+v_fract_f16 v5.l, m0
+// GFX12: v_fract_f16_e32 v5.l, m0 ; encoding: [0x7d,0xbe,0x0a,0x7e]
 
-v_fract_f16 v5, m0
-// GFX12: v_fract_f16_e32 v5, m0 ; encoding: [0x7d,0xbe,0x0a,0x7e]
+v_fract_f16 v5.l, exec_lo
+// GFX12: v_fract_f16_e32 v5.l, exec_lo ; encoding: [0x7e,0xbe,0x0a,0x7e]
 
-v_fract_f16 v5, exec_lo
-// GFX12: v_fract_f16_e32 v5, exec_lo ; encoding: [0x7e,0xbe,0x0a,0x7e]
+v_fract_f16 v5.l, exec_hi
+// GFX12: v_fract_f16_e32 v5.l, exec_hi ; encoding: [0x7f,0xbe,0x0a,0x7e]
 
-v_fract_f16 v5, exec_hi
-// GFX12: v_fract_f16_e32 v5, exec_hi ; encoding: [0x7f,0xbe,0x0a,0x7e]
+v_fract_f16 v5.l, null
+// GFX12: v_fract_f16_e32 v5.l, null ; encoding: [0x7c,0xbe,0x0a,0x7e]
 
-v_fract_f16 v5, null
-// GFX12: v_fract_f16_e32 v5, null ; encoding: [0x7c,0xbe,0x0a,0x7e]
+v_fract_f16 v5.l, -1
+// GFX12: v_fract_f16_e32 v5.l, -1 ; encoding: [0xc1,0xbe,0x0a,0x7e]
 
-v_fract_f16 v5, -1
-// GFX12: v_fract_f16_e32 v5, -1 ; encoding: [0xc1,0xbe,0x0a,0x7e]
+v_fract_f16 v5.l, 0.5
+// GFX12: v_fract_f16_e32 v5.l, 0.5 ; encoding: [0xf0,0xbe,0x0a,0x7e]
 
-v_fract_f16 v5, 0.5
-// GFX12: v_fract_f16_e32 v5, 0.5 ; encoding: [0xf0,0xbe,0x0a,0x7e]
+v_fract_f16 v5.l, src_scc
+// GFX12: v_fract_f16_e32 v5.l, src_scc ; encoding: [0xfd,0xbe,0x0a,0x7e]
 
-v_fract_f16 v5, src_scc
-// GFX12: v_fract_f16_e32 v5, src_scc ; encoding: [0xfd,0xbe,0x0a,0x7e]
+v_fract_f16 v127.l, 0xfe0b
+// GFX12: v_fract_f16_e32 v127.l, 0xfe0b ; encoding: [0xff,0xbe,0xfe,0x7e,0x0b,0xfe,0x00,0x00]
 
-v_fract_f16 v127, 0xfe0b
-// GFX12: v_fract_f16_e32 v127, 0xfe0b ; encoding: [0xff,0xbe,0xfe,0x7e,0x0b,0xfe,0x00,0x00]
+v_fract_f16 v5.l, v1.h
+// GFX12: v_fract_f16_e32 v5.l, v1.h ; encoding: [0x81,0xbf,0x0a,0x7e]
+
+v_fract_f16 v5.l, v127.h
+// GFX12: v_fract_f16_e32 v5.l, v127.h ; encoding: [0xff,0xbf,0x0a,0x7e]
+
+v_fract_f16 v5.h, src_scc
+// GFX12: v_fract_f16_e32 v5.h, src_scc ; encoding: [0xfd,0xbe,0x0a,0x7f]
+
+v_fract_f16 v127.h, 0xfe0b
+// GFX12: v_fract_f16_e32 v127.h, 0xfe0b ; encoding: [0xff,0xbe,0xfe,0x7f,0x0b,0xfe,0x00,0x00]
 
 v_fract_f32 v5, v1
 // GFX12: v_fract_f32_e32 v5, v1 ; encoding: [0x01,0x41,0x0a,0x7e]
@@ -2432,50 +2468,62 @@ v_frexp_exp_i32_f64 v5, src_scc
 v_frexp_exp_i32_f64 v255, 0xaf123456
 // GFX12: v_frexp_exp_i32_f64_e32 v255, 0xaf123456 ; encoding: [0xff,0x78,0xfe,0x7f,0x56,0x34,0x12,0xaf]
 
-v_frexp_mant_f16 v5, v1
-// GFX12: v_frexp_mant_f16_e32 v5, v1 ; encoding: [0x01,0xb3,0x0a,0x7e]
+v_frexp_mant_f16 v5.l, v1.l
+// GFX12: v_frexp_mant_f16_e32 v5.l, v1.l ; encoding: [0x01,0xb3,0x0a,0x7e]
+
+v_frexp_mant_f16 v5.l, v127.l
+// GFX12: v_frexp_mant_f16_e32 v5.l, v127.l ; encoding: [0x7f,0xb3,0x0a,0x7e]
 
-v_frexp_mant_f16 v5, v127
-// GFX12: v_frexp_mant_f16_e32 v5, v127 ; encoding: [0x7f,0xb3,0x0a,0x7e]
+v_frexp_mant_f16 v5.l, s1
+// GFX12: v_frexp_mant_f16_e32 v5.l, s1 ; encoding: [0x01,0xb2,0x0a,0x7e]
 
-v_frexp_mant_f16 v5, s1
-// GFX12: v_frexp_mant_f16_e32 v5, s1 ; encoding: [0x01,0xb2,0x0a,0x7e]
+v_frexp_mant_f16 v5.l, s105
+// GFX12: v_frexp_mant_f16_e32 v5.l, s105 ; encoding: [0x69,0xb2,0x0a,0x7e]
 
-v_frexp_mant_f16 v5, s105
-// GFX12: v_frexp_mant_f16_e32 v5, s105 ; encoding: [0x69,0xb2,0x0a,0x7e]
+v_frexp_mant_f16 v5.l, vcc_lo
+// GFX12: v_frexp_mant_f16_e32 v5.l, vcc_lo ; encoding: [0x6a,0xb2,0x0a,0x7e]
 
-v_frexp_mant_f16 v5, vcc_lo
-// GFX12: v_frexp_mant_f16_e32 v5, vcc_lo ; encoding: [0x6a,0xb2,0x0a,0x7e]
+v_frexp_mant_f16 v5.l, vcc_hi
+// GFX12: v_frexp_mant_f16_e32 v5.l, vcc_hi ; encoding: [0x6b,0xb2,0x0a,0x7e]
 
-v_frexp_mant_f16 v5, vcc_hi
-// GFX12: v_frexp_mant_f16_e32 v5, vcc_hi ; encoding: [0x6b,0xb2,0x0a,0x7e]
+v_frexp_mant_f16 v5.l, ttmp15
+// GFX12: v_frexp_mant_f16_e32 v5.l, ttmp15 ; encoding: [0x7b,0xb2,0x0a,0x7e]
 
-v_frexp_mant_f16 v5, ttmp15
-// GFX12: v_frexp_mant_f16_e32 v5, ttmp15 ; encoding: [0x7b,0xb2,0x0a,0x7e]
+v_frexp_mant_f16 v5.l, m0
+// GFX12: v_frexp_mant_f16_e32 v5.l, m0 ; encoding: [0x7d,0xb2,0x0a,0x7e]
 
-v_frexp_mant_f16 v5, m0
-// GFX12: v_frexp_mant_f16_e32 v5, m0 ; encoding: [0x7d,0xb2,0x0a,0x7e]
+v_frexp_mant_f16 v5.l, exec_lo
+// GFX12: v_frexp_mant_f16_e32 v5.l, exec_lo ; encoding: [0x7e,0xb2,0x0a,0x7e]
 
-v_frexp_mant_f16 v5, exec_lo
-// GFX12: v_frexp_mant_f16_e32 v5, exec_lo ; encoding: [0x7e,0xb2,0x0a,0x7e]
+v_frexp_mant_f16 v5.l, exec_hi
+// GFX12: v_frexp_mant_f16_e32 v5.l, exec_hi ; encoding: [0x7f,0xb2,0x0a,0x7e]
 
-v_frexp_mant_f16 v5, exec_hi
-// GFX12: v_frexp_mant_f16_e32 v5, exec_hi ; encoding: [0x7f,0xb2,0x0a,0x7e]
+v_frexp_mant_f16 v5.l, null
+// GFX12: v_frexp_mant_f16_e32 v5.l, null ; encoding: [0x7c,0xb2,0x0a,0x7e]
 
-v_frexp_mant_f16 v5, null
-// GFX12: v_frexp_mant_f16_e32 v5, null ; encoding: [0x7c,0xb2,0x0a,0x7e]
+v_frexp_mant_f16 v5.l, -1
+// GFX12: v_frexp_mant_f16_e32 v5.l, -1 ; encoding: [0xc1,0xb2,0x0a,0x7e]
 
-v_frexp_mant_f16 v5, -1
-// GFX12: v_frexp_mant_f16_e32 v5, -1 ; encoding: [0xc1,0xb2,0x0a,0x7e]
+v_frexp_mant_f16 v5.l, 0.5
+// GFX12: v_frexp_mant_f16_e32 v5.l, 0.5 ; encoding: [0xf0,0xb2,0x0a,0x7e]
 
-v_frexp_mant_f16 v5, 0.5
-// GFX12: v_frexp_mant_f16_e32 v5, 0.5 ; encoding: [0xf0,0xb2,0x0a,0x7e]
+v_frexp_mant_f16 v5.l, src_scc
+// GFX12: v_frexp_mant_f16_e32 v5.l, src_scc ; encoding: [0xfd,0xb2,0x0a,0x7e]
 
-v_frexp_mant_f16 v5, src_scc
-// GFX12: v_frexp_mant_f16_e32 v5, src_scc ; encoding: [0xfd,0xb2,0x0a,0x7e]
+v_frexp_mant_f16 v127.l, 0xfe0b
+// GFX12: v_frexp_mant_f16_e32 v127.l, 0xfe0b ; encoding: [0xff,0xb2,0xfe,0x7e,0x0b,0xfe,0x00,0x00]
 
-v_frexp_mant_f16 v127, 0xfe0b
-// GFX12: v_frexp_mant_f16_e32 v127, 0xfe0b ; encoding: [0xff,0xb2,0xfe,0x7e,0x0b,0xfe,0x00,0x00]
+v_frexp_mant_f16 v5.l, v1.h
+// GFX12: v_frexp_mant_f16_e32 v5.l, v1.h ; encoding: [0x81,0xb3,0x0a,0x7e]
+
+v_frexp_mant_f16 v5.l, v127.h
+// GFX12: v_frexp_mant_f16_e32 v5.l, v127.h ; encoding: [0xff,0xb3,0x0a,0x7e]
+
+v_frexp_mant_f16 v5.h, src_scc
+// GFX12: v_frexp_mant_f16_e32 v5.h, src_scc ; encoding: [0xfd,0xb2,0x0a,0x7f]
+
+v_frexp_mant_f16 v127.h, 0xfe0b
+// GFX12: v_frexp_mant_f16_e32 v127.h, 0xfe0b ; encoding: [0xff,0xb2,0xfe,0x7f,0x0b,0xfe,0x00,0x00]
 
 v_frexp_mant_f32 v5, v1
 // GFX12: v_frexp_mant_f32_e32 v5, v1 ; encoding: [0x01,0x81,0x0a,0x7e]
@@ -2759,51 +2807,63 @@ v_movrelsd_b32 v255, v255
 v_nop
 // GFX12: v_nop ; encoding: [0x00,0x00,0x00,0x7e]
 
-v_not_b16 v5, v1
-// GFX12: v_not_b16_e32 v5, v1 ; encoding: [0x01,0xd3,0x0a,0x7e]
+v_not_b16 v5.l, v1.l
+// GFX12: v_not_b16_e32 v5.l, v1.l ; encoding: [0x01,0xd3,0x0a,0x7e]
+
+v_not_b16 v5.l, v127.l
+// GFX12: v_not_b16_e32 v5.l, v127.l ; encoding: [0x7f,0xd3,0x0a,0x7e]
 
-v_not_b16 v5, v127
-// GFX12: v_not_b16_e32 v5, v127 ; encoding: [0x7f,0xd3,0x0a,0x7e]
+v_not_b16 v5.l, s1
+// GFX12: v_not_b16_e32 v5.l, s1 ; encoding: [0x01,0xd2,0x0a,0x7e]
 
-v_not_b16 v5, s1
-// GFX12: v_not_b16_e32 v5, s1 ; encoding: [0x01,0xd2,0x0a,0x7e]
+v_not_b16 v5.l, s105
+// GFX12: v_not_b16_e32 v5.l, s105 ; encoding: [0x69,0xd2,0x0a,0x7e]
 
-v_not_b16 v5, s105
-// GFX12: v_not_b16_e32 v5, s105 ; encoding: [0x69,0xd2,0x0a,0x7e]
+v_not_b16 v5.l, vcc_lo
+// GFX12: v_not_b16_e32 v5.l, vcc_lo ; encoding: [0x6a,0xd2,0x0a,0x7e]
 
-v_not_b16 v5, vcc_lo
-// GFX12: v_not_b16_e32 v5, vcc_lo ; encoding: [0x6a,0xd2,0x0a,0x7e]
+v_not_b16 v5.l, vcc_hi
+// GFX12: v_not_b16_e32 v5.l, vcc_hi ; encoding: [0x6b,0xd2,0x0a,0x7e]
 
-v_not_b16 v5, vcc_hi
-// GFX12: v_not_b16_e32 v5, vcc_hi ; encoding: [0x6b,0xd2,0x0a,0x7e]
+v_not_b16 v5.l, ttmp15
+// GFX12: v_not_b16_e32 v5.l, ttmp15 ; encoding: [0x7b,0xd2,0x0a,0x7e]
 
-v_not_b16 v5, ttmp15
-// GFX12: v_not_b16_e32 v5, ttmp15 ; encoding: [0x7b,0xd2,0x0a,0x7e]
+v_not_b16 v5.l, m0
+// GFX12: v_not_b16_e32 v5.l, m0 ; encoding: [0x7d,0xd2,0x0a,0x7e]
 
-v_not_b16 v5, m0
-// GFX12: v_not_b16_e32 v5, m0 ; encoding: [0x7d,0xd2,0x0a,0x7e]
+v_not_b16 v5.l, exec_lo
+// GFX12: v_not_b16_e32 v5.l, exec_lo ; encoding: [0x7e,0xd2,0x0a,0x7e]
 
-v_not_b16 v5, exec_lo
-// GFX12: v_not_b16_e32 v5, exec_lo ; encoding: [0x7e,0xd2,0x0a,0x7e]
+v_not_b16 v5.l, exec_hi
+// GFX12: v_not_b16_e32 v5.l, exec_hi ; encoding: [0x7f,0xd2,0x0a,0x7e]
 
-v_not_b16 v5, exec_hi
-// GFX12: v_not_b16_e32 v5, exec_hi ; encoding: [0x7f,0xd2,0x0a,0x7e]
+v_not_b16 v5.l, null
+// GFX12: v_not_b16_e32 v5.l, null ; encoding: [0x7c,0xd2,0x0a,0x7e]
 
-v_not_b16 v5, null
-// GFX12: v_not_b16_e32 v5, null ; encoding: [0x7c,0xd2,0x0a,0x7e]
+v_not_b16 v5.l, -1
+// GFX12: v_not_b16_e32 v5.l, -1 ; encoding: [0xc1,0xd2,0x0a,0x7e]
 
-v_not_b16 v5, -1
-// GFX12: v_not_b16_e32 v5, -1 ; encoding: [0xc1,0xd2,0x0a,0x7e]
+v_not_b16 v5.l, 0.5
+// GFX12-ASM: v_not_b16_e32 v5.l, 0.5 ; encoding: [0xf0,0xd2,0x0a,0x7e]
+// GFX12-DIS: v_not_b16_e32 v5.l, 0x3800 ; encoding: [0xff,0xd2,0x0a,0x7e,0x00,0x38,0x00,0x00]
 
-v_not_b16 v5, 0.5
-// GFX12-ASM: v_not_b16_e32 v5, 0.5 ; encoding: [0xf0,0xd2,0x0a,0x7e]
-// GFX12-DIS: v_not_b16_e32 v5, 0x3800 ; encoding: [0xff,0xd2,0x0a,0x7e,0x00,0x38,0x00,0x00]
+v_not_b16 v5.l, src_scc
+// GFX12: v_not_b16_e32 v5.l, src_scc ; encoding: [0xfd,0xd2,0x0a,0x7e]
 
-v_not_b16 v5, src_scc
-// GFX12: v_not_b16_e32 v5, src_scc ; encoding: [0xfd,0xd2,0x0a,0x7e]
+v_not_b16 v127.l, 0xfe0b
+// GFX12: v_not_b16_e32 v127.l, 0xfe0b ; encoding: [0xff,0xd2,0xfe,0x7e,0x0b,0xfe,0x00,0x00]
 
-v_not_b16 v127, 0xfe0b
-// GFX12: v_not_b16_e32 v127, 0xfe0b ; encoding: [0xff,0xd2,0xfe,0x7e,0x0b,0xfe,0x00,0x00]
+v_not_b16 v5.l, v1.h
+// GFX12: v_not_b16_e32 v5.l, v1.h ; encoding: [0x81,0xd3,0x0a,0x7e]
+
+v_not_b16 v5.l, v127.h
+// GFX12: v_not_b16_e32 v5.l, v127.h ; encoding: [0xff,0xd3,0x0a,0x7e]
+
+v_not_b16 v5.h, src_scc
+// GFX12: v_not_b16_e32 v5.h, src_scc ; encoding: [0xfd,0xd2,0x0a,0x7f]
+
+v_not_b16 v127.h, 0xfe0b
+// GFX12: v_not_b16_e32 v127.h, 0xfe0b ; encoding: [0xff,0xd2,0xfe,0x7f,0x0b,0xfe,0x00,0x00]
 
 v_not_b32 v5, v1
 // GFX12: v_not_b32_e32 v5, v1 ; encoding: [0x01,0x6f,0x0a,0x7e]
@@ -3048,50 +3108,62 @@ v_readfirstlane_b32 ttmp15, v1
 v_readfirstlane_b32 null, v255
 // GFX12: v_readfirstlane_b32 null, v255 ; encoding: [0xff,0x05,0xf8,0x7e]
 
-v_rndne_f16 v5, v1
-// GFX12: v_rndne_f16_e32 v5, v1 ; encoding: [0x01,0xbd,0x0a,0x7e]
+v_rndne_f16 v5.l, v1.l
+// GFX12: v_rndne_f16_e32 v5.l, v1.l ; encoding: [0x01,0xbd,0x0a,0x7e]
+
+v_rndne_f16 v5.l, v127.l
+// GFX12: v_rndne_f16_e32 v5.l, v127.l ; encoding: [0x7f,0xbd,0x0a,0x7e]
+
+v_rndne_f16 v5.l, s1
+// GFX12: v_rndne_f16_e32 v5.l, s1 ; encoding: [0x01,0xbc,0x0a,0x7e]
 
-v_rndne_f16 v5, v127
-// GFX12: v_rndne_f16_e32 v5, v127 ; encoding: [0x7f,0xbd,0x0a,0x7e]
+v_rndne_f16 v5.l, s105
+// GFX12: v_rndne_f16_e32 v5.l, s105 ; encoding: [0x69,0xbc,0x0a,0x7e]
 
-v_rndne_f16 v5, s1
-// GFX12: v_rndne_f16_e32 v5, s1 ; encoding: [0x01,0xbc,0x0a,0x7e]
+v_rndne_f16 v5.l, vcc_lo
+// GFX12: v_rndne_f16_e32 v5.l, vcc_lo ; encoding: [0x6a,0xbc,0x0a,0x7e]
 
-v_rndne_f16 v5, s105
-// GFX12: v_rndne_f16_e32 v5, s105 ; encoding: [0x69,0xbc,0x0a,0x7e]
+v_rndne_f16 v5.l, vcc_hi
+// GFX12: v_rndne_f16_e32 v5.l, vcc_hi ; encoding: [0x6b,0xbc,0x0a,0x7e]
 
-v_rndne_f16 v5, vcc_lo
-// GFX12: v_rndne_f16_e32 v5, vcc_lo ; encoding: [0x6a,0xbc,0x0a,0x7e]
+v_rndne_f16 v5.l, ttmp15
+// GFX12: v_rndne_f16_e32 v5.l, ttmp15 ; encoding: [0x7b,0xbc,0x0a,0x7e]
 
-v_rndne_f16 v5, vcc_hi
-// GFX12: v_rndne_f16_e32 v5, vcc_hi ; encoding: [0x6b,0xbc,0x0a,0x7e]
+v_rndne_f16 v5.l, m0
+// GFX12: v_rndne_f16_e32 v5.l, m0 ; encoding: [0x7d,0xbc,0x0a,0x7e]
 
-v_rndne_f16 v5, ttmp15
-// GFX12: v_rndne_f16_e32 v5, ttmp15 ; encoding: [0x7b,0xbc,0x0a,0x7e]
+v_rndne_f16 v5.l, exec_lo
+// GFX12: v_rndne_f16_e32 v5.l, exec_lo ; encoding: [0x7e,0xbc,0x0a,0x7e]
 
-v_rndne_f16 v5, m0
-// GFX12: v_rndne_f16_e32 v5, m0 ; encoding: [0x7d,0xbc,0x0a,0x7e]
+v_rndne_f16 v5.l, exec_hi
+// GFX12: v_rndne_f16_e32 v5.l, exec_hi ; encoding: [0x7f,0xbc,0x0a,0x7e]
 
-v_rndne_f16 v5, exec_lo
-// GFX12: v_rndne_f16_e32 v5, exec_lo ; encoding: [0x7e,0xbc,0x0a,0x7e]
+v_rndne_f16 v5.l, null
+// GFX12: v_rndne_f16_e32 v5.l, null ; encoding: [0x7c,0xbc,0x0a,0x7e]
 
-v_rndne_f16 v5, exec_hi
-// GFX12: v_rndne_f16_e32 v5, exec_hi ; encoding: [0x7f,0xbc,0x0a,0x7e]
+v_rndne_f16 v5.l, -1
+// GFX12: v_rndne_f16_e32 v5.l, -1 ; encoding: [0xc1,0xbc,0x0a,0x7e]
 
-v_rndne_f16 v5, null
-// GFX12: v_rndne_f16_e32 v5, null ; encoding: [0x7c,0xbc,0x0a,0x7e]
+v_rndne_f16 v5.l, 0.5
+// GFX12: v_rndne_f16_e32 v5.l, 0.5 ; encoding: [0xf0,0xbc,0x0a,0x7e]
 
-v_rndne_f16 v5, -1
-// GFX12: v_rndne_f16_e32 v5, -1 ; encoding: [0xc1,0xbc,0x0a,0x7e]
+v_rndne_f16 v5.l, src_scc
+// GFX12: v_rndne_f16_e32 v5.l, src_scc ; encoding: [0xfd,0xbc,0x0a,0x7e]
 
-v_rndne_f16 v5, 0.5
-// GFX12: v_rndne_f16_e32 v5, 0.5 ; encoding: [0xf0,0xbc,0x0a,0x7e]
+v_rndne_f16 v127.l, 0xfe0b
+// GFX12: v_rndne_f16_e32 v127.l, 0xfe0b ; encoding: [0xff,0xbc,0xfe,0x7e,0x0b,0xfe,0x00,0x00]
 
-v_rndne_f16 v5, src_scc
-// GFX12: v_rndne_f16_e32 v5, src_scc ; encoding: [0xfd,0xbc,0x0a,0x7e]
+v_rndne_f16 v5.l, v1.h
+// GFX12: v_rndne_f16_e32 v5.l, v1.h ; encoding: [0x81,0xbd,0x0a,0x7e]
 
-v_rndne_f16 v127, 0xfe0b
-// GFX12: v_rndne_f16_e32 v127, 0xfe0b ; encoding: [0xff,0xbc,0xfe,0x7e,0x0b,0xfe,0x00,0x00]
+v_rndne_f16 v5.l, v127.h
+// GFX12: v_rndne_f16_e32 v5.l, v127.h ; encoding: [0xff,0xbd,0x0a,0x7e]
+
+v_rndne_f16 v5.h, src_scc
+// GFX12: v_rndne_f16_e32 v5.h, src_scc ; encoding: [0xfd,0xbc,0x0a,0x7f]
+
+v_rndne_f16 v127.h, 0xfe0b
+// GFX12: v_rndne_f16_e32 v127.h, 0xfe0b ; encoding: [0xff,0xbc,0xfe,0x7f,0x0b,0xfe,0x00,0x00]
 
 v_rndne_f32 v5, v1
 // GFX12: v_rndne_f32_e32 v5, v1 ; encoding: [0x01,0x47,0x0a,0x7e]
@@ -3301,94 +3373,127 @@ v_rsq_f64 v[254:255], 0xaf123456
 // GFX12: v_rsq_f64_e32 v[254:255], 0xaf123456 ; encoding: [0xff,0x62,0xfc,0x7f,0x56,0x34,0x12,0xaf]
 
 v_sat_pk_u8_i16 v5, v1
-// GFX12: v_sat_pk_u8_i16_e32 v5, v1 ; encoding: [0x01,0xc5,0x0a,0x7e]
+// GFX12-ASM: v_sat_pk_u8_i16_e32 v5, v1 ; encoding: [0x01,0xc5,0x0a,0x7e]
+// GFX12-DIS: v_sat_pk_u8_i16_e32 v5.l, v1 ; encoding: [0x01,0xc5,0x0a,0x7e]
 
 v_sat_pk_u8_i16 v5, v255
-// GFX12: v_sat_pk_u8_i16_e32 v5, v255 ; encoding: [0xff,0xc5,0x0a,0x7e]
+// GFX12-ASM: v_sat_pk_u8_i16_e32 v5, v255 ; encoding: [0xff,0xc5,0x0a,0x7e]
+// GFX12-DIS: v_sat_pk_u8_i16_e32 v5.l, v255 ; encoding: [0xff,0xc5,0x0a,0x7e]
 
 v_sat_pk_u8_i16 v5, s1
-// GFX12: v_sat_pk_u8_i16_e32 v5, s1 ; encoding: [0x01,0xc4,0x0a,0x7e]
+// GFX12-ASM: v_sat_pk_u8_i16_e32 v5, s1 ; encoding: [0x01,0xc4,0x0a,0x7e]
+// GFX12-DIS: v_sat_pk_u8_i16_e32 v5.l, s1 ; encoding: [0x01,0xc4,0x0a,0x7e]
 
 v_sat_pk_u8_i16 v5, s105
-// GFX12: v_sat_pk_u8_i16_e32 v5, s105 ; encoding: [0x69,0xc4,0x0a,0x7e]
+// GFX12-ASM: v_sat_pk_u8_i16_e32 v5, s105 ; encoding: [0x69,0xc4,0x0a,0x7e]
+// GFX12-DIS: v_sat_pk_u8_i16_e32 v5.l, s105 ; encoding: [0x69,0xc4,0x0a,0x7e]
 
 v_sat_pk_u8_i16 v5, vcc_lo
-// GFX12: v_sat_pk_u8_i16_e32 v5, vcc_lo ; encoding: [0x6a,0xc4,0x0a,0x7e]
+// GFX12-ASM: v_sat_pk_u8_i16_e32 v5, vcc_lo ; encoding: [0x6a,0xc4,0x0a,0x7e]
+// GFX12-DIS: v_sat_pk_u8_i16_e32 v5.l, vcc_lo ; encoding: [0x6a,0xc4,0x0a,0x7e]
 
 v_sat_pk_u8_i16 v5, vcc_hi
-// GFX12: v_sat_pk_u8_i16_e32 v5, vcc_hi ; encoding: [0x6b,0xc4,0x0a,0x7e]
+// GFX12-ASM: v_sat_pk_u8_i16_e32 v5, vcc_hi ; encoding: [0x6b,0xc4,0x0a,0x7e]
+// GFX12-DIS: v_sat_pk_u8_i16_e32 v5.l, vcc_hi ; encoding: [0x6b,0xc4,0x0a,0x7e]
 
 v_sat_pk_u8_i16 v5, ttmp15
-// GFX12: v_sat_pk_u8_i16_e32 v5, ttmp15 ; encoding: [0x7b,0xc4,0x0a,0x7e]
+// GFX12-ASM: v_sat_pk_u8_i16_e32 v5, ttmp15 ; encoding: [0x7b,0xc4,0x0a,0x7e]
+// GFX12-DIS: v_sat_pk_u8_i16_e32 v5.l, ttmp15 ; encoding: [0x7b,0xc4,0x0a,0x7e]
 
 v_sat_pk_u8_i16 v5, m0
-// GFX12: v_sat_pk_u8_i16_e32 v5, m0 ; encoding: [0x7d,0xc4,0x0a,0x7e]
+// GFX12-ASM: v_sat_pk_u8_i16_e32 v5, m0 ; encoding: [0x7d,0xc4,0x0a,0x7e]
+// GFX12-DIS: v_sat_pk_u8_i16_e32 v5.l, m0 ; encoding: [0x7d,0xc4,0x0a,0x7e]
 
 v_sat_pk_u8_i16 v5, exec_lo
-// GFX12: v_sat_pk_u8_i16_e32 v5, exec_lo ; encoding: [0x7e,0xc4,0x0a,0x7e]
+// GFX12-ASM: v_sat_pk_u8_i16_e32 v5, exec_lo ; encoding: [0x7e,0xc4,0x0a,0x7e]
+// GFX12-DIS: v_sat_pk_u8_i16_e32 v5.l, exec_lo ; encoding: [0x7e,0xc4,0x0a,0x7e]
 
 v_sat_pk_u8_i16 v5, exec_hi
-// GFX12: v_sat_pk_u8_i16_e32 v5, exec_hi ; encoding: [0x7f,0xc4,0x0a,0x7e]
+// GFX12-ASM: v_sat_pk_u8_i16_e32 v5, exec_hi ; encoding: [0x7f,0xc4,0x0a,0x7e]
+// GFX12-DIS: v_sat_pk_u8_i16_e32 v5.l, exec_hi ; encoding: [0x7f,0xc4,0x0a,0x7e]
 
 v_sat_pk_u8_i16 v5, null
-// GFX12: v_sat_pk_u8_i16_e32 v5, null ; encoding: [0x7c,0xc4,0x0a,0x7e]
+// GFX12-ASM: v_sat_pk_u8_i16_e32 v5, null ; encoding: [0x7c,0xc4,0x0a,0x7e]
+// GFX12-DIS: v_sat_pk_u8_i16_e32 v5.l, null ; encoding: [0x7c,0xc4,0x0a,0x7e]
 
 v_sat_pk_u8_i16 v5, -1
-// GFX12: v_sat_pk_u8_i16_e32 v5, -1 ; encoding: [0xc1,0xc4,0x0a,0x7e]
+// GFX12-ASM: v_sat_pk_u8_i16_e32 v5, -1 ; encoding: [0xc1,0xc4,0x0a,0x7e]
+// GFX12-DIS: v_sat_pk_u8_i16_e32 v5.l, -1 ; encoding: [0xc1,0xc4,0x0a,0x7e]
 
 v_sat_pk_u8_i16 v5, 0.5
-// GFX12: v_sat_pk_u8_i16_e32 v5, 0.5 ; encoding: [0xf0,0xc4,0x0a,0x7e]
+// GFX12-ASM: v_sat_pk_u8_i16_e32 v5, 0.5 ; encoding: [0xf0,0xc4,0x0a,0x7e]
+// GFX12-DIS: v_sat_pk_u8_i16_e32 v5.l, 0.5 ; encoding: [0xf0,0xc4,0x0a,0x7e]
 
 v_sat_pk_u8_i16 v5, src_scc
-// GFX12: v_sat_pk_u8_i16_e32 v5, src_scc ; encoding: [0xfd,0xc4,0x0a,0x7e]
+// GFX12-ASM: v_sat_pk_u8_i16_e32 v5, src_scc ; encoding: [0xfd,0xc4,0x0a,0x7e]
+// GFX12-DIS: v_sat_pk_u8_i16_e32 v5.l, src_scc ; encoding: [0xfd,0xc4,0x0a,0x7e]
 
 v_sat_pk_u8_i16 v127, 0xfe0b
-// GFX12: v_sat_pk_u8_i16_e32 v127, 0xfe0b ; encoding: [0xff,0xc4,0xfe,0x7e,0x0b,0xfe,0x00,0x00]
+// GFX12-ASM: v_sat_pk_u8_i16_e32 v127, 0xfe0b ; encoding: [0xff,0xc4,0xfe,0x7e,0x0b,0xfe,0x00,0x00]
+// GFX12-DIS: v_sat_pk_u8_i16_e32 v127.l, 0xfe0b ; encoding: [0xff,0xc4,0xfe,0x7e,0x0b,0xfe,0x00,0x00]
+
+v_sat_pk_u8_i16 v5.h, src_scc
+// GFX12: v_sat_pk_u8_i16_e32 v5.h, src_scc ; encoding: [0xfd,0xc4,0x0a,0x7f]
+
+v_sat_pk_u8_i16 v127.h, 0xfe0b
+// GFX12: v_sat_pk_u8_i16_e32 v127.h, 0xfe0b ; encoding: [0xff,0xc4,0xfe,0x7f,0x0b,0xfe,0x00,0x00]
 
-v_sin_f16 v5, v1
-// GFX12: v_sin_f16_e32 v5, v1 ; encoding: [0x01,0xc1,0x0a,0x7e]
+v_sin_f16 v5.l, v1.l
+// GFX12: v_sin_f16_e32 v5.l, v1.l ; encoding: [0x01,0xc1,0x0a,0x7e]
 
-v_sin_f16 v5, v127
-// GFX12: v_sin_f16_e32 v5, v127 ; encoding: [0x7f,0xc1,0x0a,0x7e]
+v_sin_f16 v5.l, v127.l
+// GFX12: v_sin_f16_e32 v5.l, v127.l ; encoding: [0x7f,0xc1,0x0a,0x7e]
 
-v_sin_f16 v5, s1
-// GFX12: v_sin_f16_e32 v5, s1 ; encoding: [0x01,0xc0,0x0a,0x7e]
+v_sin_f16 v5.l, s1
+// GFX12: v_sin_f16_e32 v5.l, s1 ; encoding: [0x01,0xc0,0x0a,0x7e]
 
-v_sin_f16 v5, s105
-// GFX12: v_sin_f16_e32 v5, s105 ; encoding: [0x69,0xc0,0x0a,0x7e]
+v_sin_f16 v5.l, s105
+// GFX12: v_sin_f16_e32 v5.l, s105 ; encoding: [0x69,0xc0,0x0a,0x7e]
 
-v_sin_f16 v5, vcc_lo
-// GFX12: v_sin_f16_e32 v5, vcc_lo ; encoding: [0x6a,0xc0,0x0a,0x7e]
+v_sin_f16 v5.l, vcc_lo
+// GFX12: v_sin_f16_e32 v5.l, vcc_lo ; encoding: [0x6a,0xc0,0x0a,0x7e]
 
-v_sin_f16 v5, vcc_hi
-// GFX12: v_sin_f16_e32 v5, vcc_hi ; encoding: [0x6b,0xc0,0x0a,0x7e]
+v_sin_f16 v5.l, vcc_hi
+// GFX12: v_sin_f16_e32 v5.l, vcc_hi ; encoding: [0x6b,0xc0,0x0a,0x7e]
 
-v_sin_f16 v5, ttmp15
-// GFX12: v_sin_f16_e32 v5, ttmp15 ; encoding: [0x7b,0xc0,0x0a,0x7e]
+v_sin_f16 v5.l, ttmp15
+// GFX12: v_sin_f16_e32 v5.l, ttmp15 ; encoding: [0x7b,0xc0,0x0a,0x7e]
 
-v_sin_f16 v5, m0
-// GFX12: v_sin_f16_e32 v5, m0 ; encoding: [0x7d,0xc0,0x0a,0x7e]
+v_sin_f16 v5.l, m0
+// GFX12: v_sin_f16_e32 v5.l, m0 ; encoding: [0x7d,0xc0,0x0a,0x7e]
 
-v_sin_f16 v5, exec_lo
-// GFX12: v_sin_f16_e32 v5, exec_lo ; encoding: [0x7e,0xc0,0x0a,0x7e]
+v_sin_f16 v5.l, exec_lo
+// GFX12: v_sin_f16_e32 v5.l, exec_lo ; encoding: [0x7e,0xc0,0x0a,0x7e]
 
-v_sin_f16 v5, exec_hi
-// GFX12: v_sin_f16_e32 v5, exec_hi ; encoding: [0x7f,0xc0,0x0a,0x7e]
+v_sin_f16 v5.l, exec_hi
+// GFX12: v_sin_f16_e32 v5.l, exec_hi ; encoding: [0x7f,0xc0,0x0a,0x7e]
 
-v_sin_f16 v5, null
-// GFX12: v_sin_f16_e32 v5, null ; encoding: [0x7c,0xc0,0x0a,0x7e]
+v_sin_f16 v5.l, null
+// GFX12: v_sin_f16_e32 v5.l, null ; encoding: [0x7c,0xc0,0x0a,0x7e]
 
-v_sin_f16 v5, -1
-// GFX12: v_sin_f16_e32 v5, -1 ; encoding: [0xc1,0xc0,0x0a,0x7e]
+v_sin_f16 v5.l, -1
+// GFX12: v_sin_f16_e32 v5.l, -1 ; encoding: [0xc1,0xc0,0x0a,0x7e]
 
-v_sin_f16 v5, 0.5
-// GFX12: v_sin_f16_e32 v5, 0.5 ; encoding: [0xf0,0xc0,0x0a,0x7e]
+v_sin_f16 v5.l, 0.5
+// GFX12: v_sin_f16_e32 v5.l, 0.5 ; encoding: [0xf0,0xc0,0x0a,0x7e]
 
-v_sin_f16 v5, src_scc
-// GFX12: v_sin_f16_e32 v5, src_scc ; encoding: [0xfd,0xc0,0x0a,0x7e]
+v_sin_f16 v5.l, src_scc
+// GFX12: v_sin_f16_e32 v5.l, src_scc ; encoding: [0xfd,0xc0,0x0a,0x7e]
 
-v_sin_f16 v127, 0xfe0b
-// GFX12: v_sin_f16_e32 v127, 0xfe0b ; encoding: [0xff,0xc0,0xfe,0x7e,0x0b,0xfe,0x00,0x00]
+v_sin_f16 v127.l, 0xfe0b
+// GFX12: v_sin_f16_e32 v127.l, 0xfe0b ; encoding: [0xff,0xc0,0xfe,0x7e,0x0b,0xfe,0x00,0x00]
+
+v_sin_f16 v5.l, v1.h
+// GFX12: v_sin_f16_e32 v5.l, v1.h ; encoding: [0x81,0xc1,0x0a,0x7e]
+
+v_sin_f16 v5.l, v127.h
+// GFX12: v_sin_f16_e32 v5.l, v127.h ; encoding: [0xff,0xc1,0x0a,0x7e]
+
+v_sin_f16 v5.h, src_scc
+// GFX12: v_sin_f16_e32 v5.h, src_scc ; encoding: [0xfd,0xc0,0x0a,0x7f]
+
+v_sin_f16 v127.h, 0xfe0b
+// GFX12: v_sin_f16_e32 v127.h, 0xfe0b ; encoding: [0xff,0xc0,0xfe,0x7f,0x0b,0xfe,0x00,0x00]
 
 v_sin_f32 v5, v1
 // GFX12: v_sin_f32_e32 v5, v1 ; encoding: [0x01,0x6b,0x0a,0x7e]
@@ -3582,50 +3687,62 @@ v_swaprel_b32 v5, v1
 v_swaprel_b32 v255, v255
 // GFX12: v_swaprel_b32 v255, v255 ; encoding: [0xff,0xd1,0xfe,0x7f]
 
-v_trunc_f16 v5, v1
-// GFX12: v_trunc_f16_e32 v5, v1 ; encoding: [0x01,0xbb,0x0a,0x7e]
+v_trunc_f16 v5.l, v1.l
+// GFX12: v_trunc_f16_e32 v5.l, v1.l ; encoding: [0x01,0xbb,0x0a,0x7e]
+
+v_trunc_f16 v5.l, v127.l
+// GFX12: v_trunc_f16_e32 v5.l, v127.l ; encoding: [0x7f,0xbb,0x0a,0x7e]
+
+v_trunc_f16 v5.l, s1
+// GFX12: v_trunc_f16_e32 v5.l, s1 ; encoding: [0x01,0xba,0x0a,0x7e]
+
+v_trunc_f16 v5.l, s105
+// GFX12: v_trunc_f16_e32 v5.l, s105 ; encoding: [0x69,0xba,0x0a,0x7e]
+
+v_trunc_f16 v5.l, vcc_lo
+// GFX12: v_trunc_f16_e32 v5.l, vcc_lo ; encoding: [0x6a,0xba,0x0a,0x7e]
 
-v_trunc_f16 v5, v127
-// GFX12: v_trunc_f16_e32 v5, v127 ; encoding: [0x7f,0xbb,0x0a,0x7e]
+v_trunc_f16 v5.l, vcc_hi
+// GFX12: v_trunc_f16_e32 v5.l, vcc_hi ; encoding: [0x6b,0xba,0x0a,0x7e]
 
-v_trunc_f16 v5, s1
-// GFX12: v_trunc_f16_e32 v5, s1 ; encoding: [0x01,0xba,0x0a,0x7e]
+v_trunc_f16 v5.l, ttmp15
+// GFX12: v_trunc_f16_e32 v5.l, ttmp15 ; encoding: [0x7b,0xba,0x0a,0x7e]
 
-v_trunc_f16 v5, s105
-// GFX12: v_trunc_f16_e32 v5, s105 ; encoding: [0x69,0xba,0x0a,0x7e]
+v_trunc_f16 v5.l, m0
+// GFX12: v_trunc_f16_e32 v5.l, m0 ; encoding: [0x7d,0xba,0x0a,0x7e]
 
-v_trunc_f16 v5, vcc_lo
-// GFX12: v_trunc_f16_e32 v5, vcc_lo ; encoding: [0x6a,0xba,0x0a,0x7e]
+v_trunc_f16 v5.l, exec_lo
+// GFX12: v_trunc_f16_e32 v5.l, exec_lo ; encoding: [0x7e,0xba,0x0a,0x7e]
 
-v_trunc_f16 v5, vcc_hi
-// GFX12: v_trunc_f16_e32 v5, vcc_hi ; encoding: [0x6b,0xba,0x0a,0x7e]
+v_trunc_f16 v5.l, exec_hi
+// GFX12: v_trunc_f16_e32 v5.l, exec_hi ; encoding: [0x7f,0xba,0x0a,0x7e]
 
-v_trunc_f16 v5, ttmp15
-// GFX12: v_trunc_f16_e32 v5, ttmp15 ; encoding: [0x7b,0xba,0x0a,0x7e]
+v_trunc_f16 v5.l, null
+// GFX12: v_trunc_f16_e32 v5.l, null ; encoding: [0x7c,0xba,0x0a,0x7e]
 
-v_trunc_f16 v5, m0
-// GFX12: v_trunc_f16_e32 v5, m0 ; encoding: [0x7d,0xba,0x0a,0x7e]
+v_trunc_f16 v5.l, -1
+// GFX12: v_trunc_f16_e32 v5.l, -1 ; encoding: [0xc1,0xba,0x0a,0x7e]
 
-v_trunc_f16 v5, exec_lo
-// GFX12: v_trunc_f16_e32 v5, exec_lo ; encoding: [0x7e,0xba,0x0a,0x7e]
+v_trunc_f16 v5.l, 0.5
+// GFX12: v_trunc_f16_e32 v5.l, 0.5 ; encoding: [0xf0,0xba,0x0a,0x7e]
 
-v_trunc_f16 v5, exec_hi
-// GFX12: v_trunc_f16_e32 v5, exec_hi ; encoding: [0x7f,0xba,0x0a,0x7e]
+v_trunc_f16 v5.l, src_scc
+// GFX12: v_trunc_f16_e32 v5.l, src_scc ; encoding: [0xfd,0xba,0x0a,0x7e]
 
-v_trunc_f16 v5, null
-// GFX12: v_trunc_f16_e32 v5, null ; encoding: [0x7c,0xba,0x0a,0x7e]
+v_trunc_f16 v127.l, 0xfe0b
+// GFX12: v_trunc_f16_e32 v127.l, 0xfe0b ; encoding: [0xff,0xba,0xfe,0x7e,0x0b,0xfe,0x00,0x00]
 
-v_trunc_f16 v5, -1
-// GFX12: v_trunc_f16_e32 v5, -1 ; encoding: [0xc1,0xba,0x0a,0x7e]
+v_trunc_f16 v5.l, v1.h
+// GFX12: v_trunc_f16_e32 v5.l, v1.h ; encoding: [0x81,0xbb,0x0a,0x7e]
 
-v_trunc_f16 v5, 0.5
-// GFX12: v_trunc_f16_e32 v5, 0.5 ; encoding: [0xf0,0xba,0x0a,0x7e]
+v_trunc_f16 v5.l, v127.h
+// GFX12: v_trunc_f16_e32 v5.l, v127.h ; encoding: [0xff,0xbb,0x0a,0x7e]
 
-v_trunc_f16 v5, src_scc
-// GFX12: v_trunc_f16_e32 v5, src_scc ; encoding: [0xfd,0xba,0x0a,0x7e]
+v_trunc_f16 v5.h, src_scc
+// GFX12: v_trunc_f16_e32 v5.h, src_scc ; encoding: [0xfd,0xba,0x0a,0x7f]
 
-v_trunc_f16 v127, 0xfe0b
-// GFX12: v_trunc_f16_e32 v127, 0xfe0b ; encoding: [0xff,0xba,0xfe,0x7e,0x0b,0xfe,0x00,0x00]
+v_trunc_f16 v127.h, 0xfe0b
+// GFX12: v_trunc_f16_e32 v127.h, 0xfe0b ; encoding: [0xff,0xba,0xfe,0x7f,0x0b,0xfe,0x00,0x00]
 
 v_trunc_f32 v5, v1
 // GFX12: v_trunc_f32_e32 v5, v1 ; encoding: [0x01,0x43,0x0a,0x7e]
diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp16.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp16.s
index 56b42f1..e821fb3 100644
--- a/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp16.s
+++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp16.s
@@ -214,47 +214,53 @@ v_clz_i32_u32 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
 v_clz_i32_u32 v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
 // GFX12: v_clz_i32_u32_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0x72,0xfe,0x7f,0xff,0x6f,0x05,0x30]
 
-v_cos_f16 v5, v1 quad_perm:[3,2,1,0]
-// GFX12: v_cos_f16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+v_cos_f16 v5.l, v1.l quad_perm:[3,2,1,0]
+// GFX12: v_cos_f16_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x1b,0x00,0xff]
 
-v_cos_f16 v5, v1 quad_perm:[0,1,2,3]
-// GFX12: v_cos_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+v_cos_f16 v5.l, v1.l quad_perm:[0,1,2,3]
+// GFX12: v_cos_f16_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0xe4,0x00,0xff]
 
-v_cos_f16 v5, v1 row_mirror
-// GFX12: v_cos_f16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x40,0x01,0xff]
+v_cos_f16 v5.l, v1.l row_mirror
+// GFX12: v_cos_f16_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x40,0x01,0xff]
 
-v_cos_f16 v5, v1 row_half_mirror
-// GFX12: v_cos_f16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x41,0x01,0xff]
+v_cos_f16 v5.l, v1.l row_half_mirror
+// GFX12: v_cos_f16_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x41,0x01,0xff]
 
-v_cos_f16 v5, v1 row_shl:1
-// GFX12: v_cos_f16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x01,0x01,0xff]
+v_cos_f16 v5.l, v1.l row_shl:1
+// GFX12: v_cos_f16_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x01,0x01,0xff]
 
-v_cos_f16 v5, v1 row_shl:15
-// GFX12: v_cos_f16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+v_cos_f16 v5.l, v1.l row_shl:15
+// GFX12: v_cos_f16_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x0f,0x01,0xff]
 
-v_cos_f16 v5, v1 row_shr:1
-// GFX12: v_cos_f16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x11,0x01,0xff]
+v_cos_f16 v5.l, v1.l row_shr:1
+// GFX12: v_cos_f16_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x11,0x01,0xff]
 
-v_cos_f16 v5, v1 row_shr:15
-// GFX12: v_cos_f16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+v_cos_f16 v5.l, v1.l row_shr:15
+// GFX12: v_cos_f16_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x1f,0x01,0xff]
 
-v_cos_f16 v5, v1 row_ror:1
-// GFX12: v_cos_f16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x21,0x01,0xff]
+v_cos_f16 v5.l, v1.l row_ror:1
+// GFX12: v_cos_f16_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x21,0x01,0xff]
 
-v_cos_f16 v5, v1 row_ror:15
-// GFX12: v_cos_f16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+v_cos_f16 v5.l, v1.l row_ror:15
+// GFX12: v_cos_f16_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x2f,0x01,0xff]
 
-v_cos_f16 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
-// GFX12: v_cos_f16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x50,0x01,0xff]
+v_cos_f16 v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf
+// GFX12: v_cos_f16_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x50,0x01,0xff]
 
-v_cos_f16 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1
-// GFX12: v_cos_f16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+v_cos_f16 v5.l, v1.l row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX12: v_cos_f16_dpp v5.l, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x5f,0x01,0x01]
 
-v_cos_f16 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
-// GFX12: v_cos_f16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x60,0x09,0x13]
+v_cos_f16 v5.l, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: v_cos_f16_dpp v5.l, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x60,0x09,0x13]
 
-v_cos_f16 v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
-// GFX12: v_cos_f16_dpp v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xc2,0xfe,0x7e,0x7f,0x6f,0x35,0x30]
+v_cos_f16 v127.l, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: v_cos_f16_dpp v127.l, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xc2,0xfe,0x7e,0x7f,0x6f,0x35,0x30]
+
+v_cos_f16 v5.h, v1.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: v_cos_f16_dpp v5.h, v1.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xc2,0x0a,0x7f,0x81,0x60,0x09,0x13]
+
+v_cos_f16 v127.h, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: v_cos_f16_dpp v127.h, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xc2,0xfe,0x7f,0xff,0x6f,0x35,0x30]
 
 v_cos_f32 v5, v1 quad_perm:[3,2,1,0]
 // GFX12: v_cos_f32_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x6c,0x0a,0x7e,0x01,0x1b,0x00,0xff]
@@ -970,47 +976,53 @@ v_cvt_i32_f32 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
 v_cvt_i32_f32 v255, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
 // GFX12: v_cvt_i32_f32_dpp v255, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0x10,0xfe,0x7f,0xff,0x6f,0x35,0x30]
 
-v_cvt_i32_i16 v5, v1 quad_perm:[3,2,1,0]
-// GFX12: v_cvt_i32_i16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+v_cvt_i32_i16 v5, v1.l quad_perm:[3,2,1,0]
+// GFX12: v_cvt_i32_i16_dpp v5, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+
+v_cvt_i32_i16 v5, v1.l quad_perm:[0,1,2,3]
+// GFX12: v_cvt_i32_i16_dpp v5, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0xe4,0x00,0xff]
 
-v_cvt_i32_i16 v5, v1 quad_perm:[0,1,2,3]
-// GFX12: v_cvt_i32_i16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+v_cvt_i32_i16 v5, v1.l row_mirror
+// GFX12: v_cvt_i32_i16_dpp v5, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x40,0x01,0xff]
 
-v_cvt_i32_i16 v5, v1 row_mirror
-// GFX12: v_cvt_i32_i16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x40,0x01,0xff]
+v_cvt_i32_i16 v5, v1.l row_half_mirror
+// GFX12: v_cvt_i32_i16_dpp v5, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x41,0x01,0xff]
 
-v_cvt_i32_i16 v5, v1 row_half_mirror
-// GFX12: v_cvt_i32_i16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x41,0x01,0xff]
+v_cvt_i32_i16 v5, v1.l row_shl:1
+// GFX12: v_cvt_i32_i16_dpp v5, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x01,0x01,0xff]
 
-v_cvt_i32_i16 v5, v1 row_shl:1
-// GFX12: v_cvt_i32_i16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x01,0x01,0xff]
+v_cvt_i32_i16 v5, v1.l row_shl:15
+// GFX12: v_cvt_i32_i16_dpp v5, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x0f,0x01,0xff]
 
-v_cvt_i32_i16 v5, v1 row_shl:15
-// GFX12: v_cvt_i32_i16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+v_cvt_i32_i16 v5, v1.l row_shr:1
+// GFX12: v_cvt_i32_i16_dpp v5, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x11,0x01,0xff]
 
-v_cvt_i32_i16 v5, v1 row_shr:1
-// GFX12: v_cvt_i32_i16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x11,0x01,0xff]
+v_cvt_i32_i16 v5, v1.l row_shr:15
+// GFX12: v_cvt_i32_i16_dpp v5, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x1f,0x01,0xff]
 
-v_cvt_i32_i16 v5, v1 row_shr:15
-// GFX12: v_cvt_i32_i16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+v_cvt_i32_i16 v5, v1.l row_ror:1
+// GFX12: v_cvt_i32_i16_dpp v5, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x21,0x01,0xff]
 
-v_cvt_i32_i16 v5, v1 row_ror:1
-// GFX12: v_cvt_i32_i16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x21,0x01,0xff]
+v_cvt_i32_i16 v5, v1.l row_ror:15
+// GFX12: v_cvt_i32_i16_dpp v5, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x2f,0x01,0xff]
 
-v_cvt_i32_i16 v5, v1 row_ror:15
-// GFX12: v_cvt_i32_i16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+v_cvt_i32_i16 v5, v1.l row_share:0 row_mask:0xf bank_mask:0xf
+// GFX12: v_cvt_i32_i16_dpp v5, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x50,0x01,0xff]
 
-v_cvt_i32_i16 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
-// GFX12: v_cvt_i32_i16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x50,0x01,0xff]
+v_cvt_i32_i16 v5, v1.l row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX12: v_cvt_i32_i16_dpp v5, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x5f,0x01,0x01]
 
-v_cvt_i32_i16 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1
-// GFX12: v_cvt_i32_i16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+v_cvt_i32_i16 v5, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: v_cvt_i32_i16_dpp v5, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x60,0x09,0x13]
 
-v_cvt_i32_i16 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
-// GFX12: v_cvt_i32_i16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x60,0x09,0x13]
+v_cvt_i32_i16 v255, v127.l row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: v_cvt_i32_i16_dpp v255, v127.l row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xd4,0xfe,0x7f,0x7f,0x6f,0x05,0x30]
 
-v_cvt_i32_i16 v255, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
-// GFX12: v_cvt_i32_i16_dpp v255, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xd4,0xfe,0x7f,0x7f,0x6f,0x05,0x30]
+v_cvt_i32_i16 v5, v1.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: v_cvt_i32_i16_dpp v5, v1.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xd4,0x0a,0x7e,0x81,0x60,0x09,0x13]
+
+v_cvt_i32_i16 v255, v127.h row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: v_cvt_i32_i16_dpp v255, v127.h row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xd4,0xfe,0x7f,0xff,0x6f,0x05,0x30]
 
 v_cvt_nearest_i32_f32 v5, v1 quad_perm:[3,2,1,0]
 // GFX12: v_cvt_nearest_i32_f32_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x18,0x0a,0x7e,0x01,0x1b,0x00,0xff]
@@ -1324,47 +1336,53 @@ v_cvt_u32_f32 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
 v_cvt_u32_f32 v255, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
 // GFX12: v_cvt_u32_f32_dpp v255, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0x0e,0xfe,0x7f,0xff,0x6f,0x35,0x30]
 
-v_cvt_u32_u16 v5, v1 quad_perm:[3,2,1,0]
-// GFX12: v_cvt_u32_u16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+v_cvt_u32_u16 v5, v1.l quad_perm:[3,2,1,0]
+// GFX12: v_cvt_u32_u16_dpp v5, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+
+v_cvt_u32_u16 v5, v1.l quad_perm:[0,1,2,3]
+// GFX12: v_cvt_u32_u16_dpp v5, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0xe4,0x00,0xff]
 
-v_cvt_u32_u16 v5, v1 quad_perm:[0,1,2,3]
-// GFX12: v_cvt_u32_u16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+v_cvt_u32_u16 v5, v1.l row_mirror
+// GFX12: v_cvt_u32_u16_dpp v5, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x40,0x01,0xff]
 
-v_cvt_u32_u16 v5, v1 row_mirror
-// GFX12: v_cvt_u32_u16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x40,0x01,0xff]
+v_cvt_u32_u16 v5, v1.l row_half_mirror
+// GFX12: v_cvt_u32_u16_dpp v5, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x41,0x01,0xff]
 
-v_cvt_u32_u16 v5, v1 row_half_mirror
-// GFX12: v_cvt_u32_u16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x41,0x01,0xff]
+v_cvt_u32_u16 v5, v1.l row_shl:1
+// GFX12: v_cvt_u32_u16_dpp v5, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x01,0x01,0xff]
 
-v_cvt_u32_u16 v5, v1 row_shl:1
-// GFX12: v_cvt_u32_u16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x01,0x01,0xff]
+v_cvt_u32_u16 v5, v1.l row_shl:15
+// GFX12: v_cvt_u32_u16_dpp v5, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x0f,0x01,0xff]
 
-v_cvt_u32_u16 v5, v1 row_shl:15
-// GFX12: v_cvt_u32_u16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+v_cvt_u32_u16 v5, v1.l row_shr:1
+// GFX12: v_cvt_u32_u16_dpp v5, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x11,0x01,0xff]
 
-v_cvt_u32_u16 v5, v1 row_shr:1
-// GFX12: v_cvt_u32_u16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x11,0x01,0xff]
+v_cvt_u32_u16 v5, v1.l row_shr:15
+// GFX12: v_cvt_u32_u16_dpp v5, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x1f,0x01,0xff]
 
-v_cvt_u32_u16 v5, v1 row_shr:15
-// GFX12: v_cvt_u32_u16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+v_cvt_u32_u16 v5, v1.l row_ror:1
+// GFX12: v_cvt_u32_u16_dpp v5, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x21,0x01,0xff]
 
-v_cvt_u32_u16 v5, v1 row_ror:1
-// GFX12: v_cvt_u32_u16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x21,0x01,0xff]
+v_cvt_u32_u16 v5, v1.l row_ror:15
+// GFX12: v_cvt_u32_u16_dpp v5, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x2f,0x01,0xff]
 
-v_cvt_u32_u16 v5, v1 row_ror:15
-// GFX12: v_cvt_u32_u16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+v_cvt_u32_u16 v5, v1.l row_share:0 row_mask:0xf bank_mask:0xf
+// GFX12: v_cvt_u32_u16_dpp v5, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x50,0x01,0xff]
 
-v_cvt_u32_u16 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
-// GFX12: v_cvt_u32_u16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x50,0x01,0xff]
+v_cvt_u32_u16 v5, v1.l row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX12: v_cvt_u32_u16_dpp v5, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x5f,0x01,0x01]
 
-v_cvt_u32_u16 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1
-// GFX12: v_cvt_u32_u16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+v_cvt_u32_u16 v5, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: v_cvt_u32_u16_dpp v5, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x60,0x09,0x13]
 
-v_cvt_u32_u16 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
-// GFX12: v_cvt_u32_u16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x60,0x09,0x13]
+v_cvt_u32_u16 v255, v127.l row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: v_cvt_u32_u16_dpp v255, v127.l row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xd6,0xfe,0x7f,0x7f,0x6f,0x05,0x30]
 
-v_cvt_u32_u16 v255, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
-// GFX12: v_cvt_u32_u16_dpp v255, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xd6,0xfe,0x7f,0x7f,0x6f,0x05,0x30]
+v_cvt_u32_u16 v5, v1.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: v_cvt_u32_u16_dpp v5, v1.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xd6,0x0a,0x7e,0x81,0x60,0x09,0x13]
+
+v_cvt_u32_u16 v255, v127.h row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: v_cvt_u32_u16_dpp v255, v127.h row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xd6,0xfe,0x7f,0xff,0x6f,0x05,0x30]
 
 v_exp_f16 v5, v1 quad_perm:[3,2,1,0]
 // GFX12: v_exp_f16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb0,0x0a,0x7e,0x01,0x1b,0x00,0xff]
@@ -1660,47 +1678,53 @@ v_floor_f32 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
 v_floor_f32 v255, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
 // GFX12: v_floor_f32_dpp v255, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0x48,0xfe,0x7f,0xff,0x6f,0x35,0x30]
 
-v_fract_f16 v5, v1 quad_perm:[3,2,1,0]
-// GFX12: v_fract_f16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+v_fract_f16 v5.l, v1.l quad_perm:[3,2,1,0]
+// GFX12: v_fract_f16_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+
+v_fract_f16 v5.l, v1.l quad_perm:[0,1,2,3]
+// GFX12: v_fract_f16_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+
+v_fract_f16 v5.l, v1.l row_mirror
+// GFX12: v_fract_f16_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x40,0x01,0xff]
 
-v_fract_f16 v5, v1 quad_perm:[0,1,2,3]
-// GFX12: v_fract_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+v_fract_f16 v5.l, v1.l row_half_mirror
+// GFX12: v_fract_f16_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x41,0x01,0xff]
 
-v_fract_f16 v5, v1 row_mirror
-// GFX12: v_fract_f16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x40,0x01,0xff]
+v_fract_f16 v5.l, v1.l row_shl:1
+// GFX12: v_fract_f16_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x01,0x01,0xff]
 
-v_fract_f16 v5, v1 row_half_mirror
-// GFX12: v_fract_f16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x41,0x01,0xff]
+v_fract_f16 v5.l, v1.l row_shl:15
+// GFX12: v_fract_f16_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x0f,0x01,0xff]
 
-v_fract_f16 v5, v1 row_shl:1
-// GFX12: v_fract_f16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x01,0x01,0xff]
+v_fract_f16 v5.l, v1.l row_shr:1
+// GFX12: v_fract_f16_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x11,0x01,0xff]
 
-v_fract_f16 v5, v1 row_shl:15
-// GFX12: v_fract_f16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+v_fract_f16 v5.l, v1.l row_shr:15
+// GFX12: v_fract_f16_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x1f,0x01,0xff]
 
-v_fract_f16 v5, v1 row_shr:1
-// GFX12: v_fract_f16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x11,0x01,0xff]
+v_fract_f16 v5.l, v1.l row_ror:1
+// GFX12: v_fract_f16_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x21,0x01,0xff]
 
-v_fract_f16 v5, v1 row_shr:15
-// GFX12: v_fract_f16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+v_fract_f16 v5.l, v1.l row_ror:15
+// GFX12: v_fract_f16_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x2f,0x01,0xff]
 
-v_fract_f16 v5, v1 row_ror:1
-// GFX12: v_fract_f16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x21,0x01,0xff]
+v_fract_f16 v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf
+// GFX12: v_fract_f16_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x50,0x01,0xff]
 
-v_fract_f16 v5, v1 row_ror:15
-// GFX12: v_fract_f16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+v_fract_f16 v5.l, v1.l row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX12: v_fract_f16_dpp v5.l, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x5f,0x01,0x01]
 
-v_fract_f16 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
-// GFX12: v_fract_f16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x50,0x01,0xff]
+v_fract_f16 v5.l, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: v_fract_f16_dpp v5.l, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x60,0x09,0x13]
 
-v_fract_f16 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1
-// GFX12: v_fract_f16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+v_fract_f16 v127.l, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: v_fract_f16_dpp v127.l, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xbe,0xfe,0x7e,0x7f,0x6f,0x35,0x30]
 
-v_fract_f16 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
-// GFX12: v_fract_f16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x60,0x09,0x13]
+v_fract_f16 v5.h, v1.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: v_fract_f16_dpp v5.h, v1.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xbe,0x0a,0x7f,0x81,0x60,0x09,0x13]
 
-v_fract_f16 v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
-// GFX12: v_fract_f16_dpp v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xbe,0xfe,0x7e,0x7f,0x6f,0x35,0x30]
+v_fract_f16 v127.h, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: v_fract_f16_dpp v127.h, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xbe,0xfe,0x7f,0xff,0x6f,0x35,0x30]
 
 v_fract_f32 v5, v1 quad_perm:[3,2,1,0]
 // GFX12: v_fract_f32_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x40,0x0a,0x7e,0x01,0x1b,0x00,0xff]
@@ -1834,47 +1858,53 @@ v_frexp_exp_i32_f32 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 f
 v_frexp_exp_i32_f32 v255, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
 // GFX12: v_frexp_exp_i32_f32_dpp v255, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0x7e,0xfe,0x7f,0xff,0x6f,0x35,0x30]
 
-v_frexp_mant_f16 v5, v1 quad_perm:[3,2,1,0]
-// GFX12: v_frexp_mant_f16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+v_frexp_mant_f16 v5.l, v1.l quad_perm:[3,2,1,0]
+// GFX12: v_frexp_mant_f16_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+
+v_frexp_mant_f16 v5.l, v1.l quad_perm:[0,1,2,3]
+// GFX12: v_frexp_mant_f16_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+
+v_frexp_mant_f16 v5.l, v1.l row_mirror
+// GFX12: v_frexp_mant_f16_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x40,0x01,0xff]
 
-v_frexp_mant_f16 v5, v1 quad_perm:[0,1,2,3]
-// GFX12: v_frexp_mant_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+v_frexp_mant_f16 v5.l, v1.l row_half_mirror
+// GFX12: v_frexp_mant_f16_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x41,0x01,0xff]
 
-v_frexp_mant_f16 v5, v1 row_mirror
-// GFX12: v_frexp_mant_f16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x40,0x01,0xff]
+v_frexp_mant_f16 v5.l, v1.l row_shl:1
+// GFX12: v_frexp_mant_f16_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x01,0x01,0xff]
 
-v_frexp_mant_f16 v5, v1 row_half_mirror
-// GFX12: v_frexp_mant_f16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x41,0x01,0xff]
+v_frexp_mant_f16 v5.l, v1.l row_shl:15
+// GFX12: v_frexp_mant_f16_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x0f,0x01,0xff]
 
-v_frexp_mant_f16 v5, v1 row_shl:1
-// GFX12: v_frexp_mant_f16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x01,0x01,0xff]
+v_frexp_mant_f16 v5.l, v1.l row_shr:1
+// GFX12: v_frexp_mant_f16_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x11,0x01,0xff]
 
-v_frexp_mant_f16 v5, v1 row_shl:15
-// GFX12: v_frexp_mant_f16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+v_frexp_mant_f16 v5.l, v1.l row_shr:15
+// GFX12: v_frexp_mant_f16_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x1f,0x01,0xff]
 
-v_frexp_mant_f16 v5, v1 row_shr:1
-// GFX12: v_frexp_mant_f16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x11,0x01,0xff]
+v_frexp_mant_f16 v5.l, v1.l row_ror:1
+// GFX12: v_frexp_mant_f16_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x21,0x01,0xff]
 
-v_frexp_mant_f16 v5, v1 row_shr:15
-// GFX12: v_frexp_mant_f16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+v_frexp_mant_f16 v5.l, v1.l row_ror:15
+// GFX12: v_frexp_mant_f16_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x2f,0x01,0xff]
 
-v_frexp_mant_f16 v5, v1 row_ror:1
-// GFX12: v_frexp_mant_f16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x21,0x01,0xff]
+v_frexp_mant_f16 v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf
+// GFX12: v_frexp_mant_f16_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x50,0x01,0xff]
 
-v_frexp_mant_f16 v5, v1 row_ror:15
-// GFX12: v_frexp_mant_f16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+v_frexp_mant_f16 v5.l, v1.l row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX12: v_frexp_mant_f16_dpp v5.l, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x5f,0x01,0x01]
 
-v_frexp_mant_f16 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
-// GFX12: v_frexp_mant_f16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x50,0x01,0xff]
+v_frexp_mant_f16 v5.l, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: v_frexp_mant_f16_dpp v5.l, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x60,0x09,0x13]
 
-v_frexp_mant_f16 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1
-// GFX12: v_frexp_mant_f16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+v_frexp_mant_f16 v127.l, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: v_frexp_mant_f16_dpp v127.l, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xb2,0xfe,0x7e,0x7f,0x6f,0x35,0x30]
 
-v_frexp_mant_f16 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
-// GFX12: v_frexp_mant_f16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x60,0x09,0x13]
+v_frexp_mant_f16 v5.h, v1.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: v_frexp_mant_f16_dpp v5.h, v1.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xb2,0x0a,0x7f,0x81,0x60,0x09,0x13]
 
-v_frexp_mant_f16 v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
-// GFX12: v_frexp_mant_f16_dpp v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xb2,0xfe,0x7e,0x7f,0x6f,0x35,0x30]
+v_frexp_mant_f16 v127.h, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: v_frexp_mant_f16_dpp v127.h, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xb2,0xfe,0x7f,0xff,0x6f,0x35,0x30]
 
 v_frexp_mant_f32 v5, v1 quad_perm:[3,2,1,0]
 // GFX12: v_frexp_mant_f32_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x80,0x0a,0x7e,0x01,0x1b,0x00,0xff]
@@ -2212,47 +2242,53 @@ v_movrelsd_b32 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
 v_movrelsd_b32 v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
 // GFX12: v_movrelsd_b32_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0x88,0xfe,0x7f,0xff,0x6f,0x05,0x30]
 
-v_not_b16 v5, v1 quad_perm:[3,2,1,0]
-// GFX12: v_not_b16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+v_not_b16 v5.l, v1.l quad_perm:[3,2,1,0]
+// GFX12: v_not_b16_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x1b,0x00,0xff]
 
-v_not_b16 v5, v1 quad_perm:[0,1,2,3]
-// GFX12: v_not_b16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+v_not_b16 v5.l, v1.l quad_perm:[0,1,2,3]
+// GFX12: v_not_b16_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0xe4,0x00,0xff]
 
-v_not_b16 v5, v1 row_mirror
-// GFX12: v_not_b16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x40,0x01,0xff]
+v_not_b16 v5.l, v1.l row_mirror
+// GFX12: v_not_b16_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x40,0x01,0xff]
 
-v_not_b16 v5, v1 row_half_mirror
-// GFX12: v_not_b16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x41,0x01,0xff]
+v_not_b16 v5.l, v1.l row_half_mirror
+// GFX12: v_not_b16_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x41,0x01,0xff]
 
-v_not_b16 v5, v1 row_shl:1
-// GFX12: v_not_b16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x01,0x01,0xff]
+v_not_b16 v5.l, v1.l row_shl:1
+// GFX12: v_not_b16_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x01,0x01,0xff]
 
-v_not_b16 v5, v1 row_shl:15
-// GFX12: v_not_b16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+v_not_b16 v5.l, v1.l row_shl:15
+// GFX12: v_not_b16_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x0f,0x01,0xff]
 
-v_not_b16 v5, v1 row_shr:1
-// GFX12: v_not_b16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x11,0x01,0xff]
+v_not_b16 v5.l, v1.l row_shr:1
+// GFX12: v_not_b16_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x11,0x01,0xff]
 
-v_not_b16 v5, v1 row_shr:15
-// GFX12: v_not_b16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+v_not_b16 v5.l, v1.l row_shr:15
+// GFX12: v_not_b16_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x1f,0x01,0xff]
 
-v_not_b16 v5, v1 row_ror:1
-// GFX12: v_not_b16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x21,0x01,0xff]
+v_not_b16 v5.l, v1.l row_ror:1
+// GFX12: v_not_b16_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x21,0x01,0xff]
 
-v_not_b16 v5, v1 row_ror:15
-// GFX12: v_not_b16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+v_not_b16 v5.l, v1.l row_ror:15
+// GFX12: v_not_b16_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x2f,0x01,0xff]
 
-v_not_b16 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
-// GFX12: v_not_b16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x50,0x01,0xff]
+v_not_b16 v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf
+// GFX12: v_not_b16_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x50,0x01,0xff]
 
-v_not_b16 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1
-// GFX12: v_not_b16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+v_not_b16 v5.l, v1.l row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX12: v_not_b16_dpp v5.l, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x5f,0x01,0x01]
 
-v_not_b16 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
-// GFX12: v_not_b16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x60,0x09,0x13]
+v_not_b16 v5.l, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: v_not_b16_dpp v5.l, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x60,0x09,0x13]
 
-v_not_b16 v127, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
-// GFX12: v_not_b16_dpp v127, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xd2,0xfe,0x7e,0x7f,0x6f,0x05,0x30]
+v_not_b16 v127.l, v127.l row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: v_not_b16_dpp v127.l, v127.l row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xd2,0xfe,0x7e,0x7f,0x6f,0x05,0x30]
+
+v_not_b16 v5.h, v1.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: v_not_b16_dpp v5.h, v1.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xd2,0x0a,0x7f,0x81,0x60,0x09,0x13]
+
+v_not_b16 v127.h, v127.h row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: v_not_b16_dpp v127.h, v127.h row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xd2,0xfe,0x7f,0xff,0x6f,0x05,0x30]
 
 v_not_b32 v5, v1 quad_perm:[3,2,1,0]
 // GFX12: v_not_b32_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x6e,0x0a,0x7e,0x01,0x1b,0x00,0xff]
@@ -2422,47 +2458,53 @@ v_rcp_iflag_f32 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
 v_rcp_iflag_f32 v255, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
 // GFX12: v_rcp_iflag_f32_dpp v255, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0x56,0xfe,0x7f,0xff,0x6f,0x35,0x30]
 
-v_rndne_f16 v5, v1 quad_perm:[3,2,1,0]
-// GFX12: v_rndne_f16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+v_rndne_f16 v5.l, v1.l quad_perm:[3,2,1,0]
+// GFX12: v_rndne_f16_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+
+v_rndne_f16 v5.l, v1.l quad_perm:[0,1,2,3]
+// GFX12: v_rndne_f16_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0xe4,0x00,0xff]
 
-v_rndne_f16 v5, v1 quad_perm:[0,1,2,3]
-// GFX12: v_rndne_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+v_rndne_f16 v5.l, v1.l row_mirror
+// GFX12: v_rndne_f16_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x40,0x01,0xff]
 
-v_rndne_f16 v5, v1 row_mirror
-// GFX12: v_rndne_f16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x40,0x01,0xff]
+v_rndne_f16 v5.l, v1.l row_half_mirror
+// GFX12: v_rndne_f16_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x41,0x01,0xff]
 
-v_rndne_f16 v5, v1 row_half_mirror
-// GFX12: v_rndne_f16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x41,0x01,0xff]
+v_rndne_f16 v5.l, v1.l row_shl:1
+// GFX12: v_rndne_f16_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x01,0x01,0xff]
 
-v_rndne_f16 v5, v1 row_shl:1
-// GFX12: v_rndne_f16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x01,0x01,0xff]
+v_rndne_f16 v5.l, v1.l row_shl:15
+// GFX12: v_rndne_f16_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x0f,0x01,0xff]
 
-v_rndne_f16 v5, v1 row_shl:15
-// GFX12: v_rndne_f16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+v_rndne_f16 v5.l, v1.l row_shr:1
+// GFX12: v_rndne_f16_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x11,0x01,0xff]
 
-v_rndne_f16 v5, v1 row_shr:1
-// GFX12: v_rndne_f16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x11,0x01,0xff]
+v_rndne_f16 v5.l, v1.l row_shr:15
+// GFX12: v_rndne_f16_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x1f,0x01,0xff]
 
-v_rndne_f16 v5, v1 row_shr:15
-// GFX12: v_rndne_f16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+v_rndne_f16 v5.l, v1.l row_ror:1
+// GFX12: v_rndne_f16_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x21,0x01,0xff]
 
-v_rndne_f16 v5, v1 row_ror:1
-// GFX12: v_rndne_f16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x21,0x01,0xff]
+v_rndne_f16 v5.l, v1.l row_ror:15
+// GFX12: v_rndne_f16_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x2f,0x01,0xff]
 
-v_rndne_f16 v5, v1 row_ror:15
-// GFX12: v_rndne_f16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+v_rndne_f16 v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf
+// GFX12: v_rndne_f16_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x50,0x01,0xff]
 
-v_rndne_f16 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
-// GFX12: v_rndne_f16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x50,0x01,0xff]
+v_rndne_f16 v5.l, v1.l row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX12: v_rndne_f16_dpp v5.l, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x5f,0x01,0x01]
 
-v_rndne_f16 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1
-// GFX12: v_rndne_f16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+v_rndne_f16 v5.l, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: v_rndne_f16_dpp v5.l, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x60,0x09,0x13]
 
-v_rndne_f16 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
-// GFX12: v_rndne_f16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x60,0x09,0x13]
+v_rndne_f16 v127.l, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: v_rndne_f16_dpp v127.l, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xbc,0xfe,0x7e,0x7f,0x6f,0x35,0x30]
 
-v_rndne_f16 v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
-// GFX12: v_rndne_f16_dpp v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xbc,0xfe,0x7e,0x7f,0x6f,0x35,0x30]
+v_rndne_f16 v5.h, v1.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: v_rndne_f16_dpp v5.h, v1.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xbc,0x0a,0x7f,0x81,0x60,0x09,0x13]
+
+v_rndne_f16 v127.h, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: v_rndne_f16_dpp v127.h, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xbc,0xfe,0x7f,0xff,0x6f,0x35,0x30]
 
 v_rndne_f32 v5, v1 quad_perm:[3,2,1,0]
 // GFX12: v_rndne_f32_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x46,0x0a,0x7e,0x01,0x1b,0x00,0xff]
@@ -2632,47 +2674,59 @@ v_sat_pk_u8_i16 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
 v_sat_pk_u8_i16 v127, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
 // GFX12: v_sat_pk_u8_i16_dpp v127, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xc4,0xfe,0x7e,0xff,0x6f,0x05,0x30]
 
-v_sin_f16 v5, v1 quad_perm:[3,2,1,0]
-// GFX12: v_sin_f16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+v_sat_pk_u8_i16 v5.h, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: v_sat_pk_u8_i16_dpp v5.h, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xc4,0x0a,0x7f,0x01,0x60,0x09,0x13]
+
+v_sat_pk_u8_i16 v127.h, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: v_sat_pk_u8_i16_dpp v127.h, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xc4,0xfe,0x7f,0xff,0x6f,0x05,0x30]
 
-v_sin_f16 v5, v1 quad_perm:[0,1,2,3]
-// GFX12: v_sin_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+v_sin_f16 v5.l, v1.l quad_perm:[3,2,1,0]
+// GFX12: v_sin_f16_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x1b,0x00,0xff]
 
-v_sin_f16 v5, v1 row_mirror
-// GFX12: v_sin_f16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x40,0x01,0xff]
+v_sin_f16 v5.l, v1.l quad_perm:[0,1,2,3]
+// GFX12: v_sin_f16_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0xe4,0x00,0xff]
 
-v_sin_f16 v5, v1 row_half_mirror
-// GFX12: v_sin_f16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x41,0x01,0xff]
+v_sin_f16 v5.l, v1.l row_mirror
+// GFX12: v_sin_f16_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x40,0x01,0xff]
 
-v_sin_f16 v5, v1 row_shl:1
-// GFX12: v_sin_f16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x01,0x01,0xff]
+v_sin_f16 v5.l, v1.l row_half_mirror
+// GFX12: v_sin_f16_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x41,0x01,0xff]
 
-v_sin_f16 v5, v1 row_shl:15
-// GFX12: v_sin_f16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+v_sin_f16 v5.l, v1.l row_shl:1
+// GFX12: v_sin_f16_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x01,0x01,0xff]
 
-v_sin_f16 v5, v1 row_shr:1
-// GFX12: v_sin_f16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x11,0x01,0xff]
+v_sin_f16 v5.l, v1.l row_shl:15
+// GFX12: v_sin_f16_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x0f,0x01,0xff]
 
-v_sin_f16 v5, v1 row_shr:15
-// GFX12: v_sin_f16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+v_sin_f16 v5.l, v1.l row_shr:1
+// GFX12: v_sin_f16_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x11,0x01,0xff]
 
-v_sin_f16 v5, v1 row_ror:1
-// GFX12: v_sin_f16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x21,0x01,0xff]
+v_sin_f16 v5.l, v1.l row_shr:15
+// GFX12: v_sin_f16_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x1f,0x01,0xff]
 
-v_sin_f16 v5, v1 row_ror:15
-// GFX12: v_sin_f16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+v_sin_f16 v5.l, v1.l row_ror:1
+// GFX12: v_sin_f16_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x21,0x01,0xff]
 
-v_sin_f16 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
-// GFX12: v_sin_f16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x50,0x01,0xff]
+v_sin_f16 v5.l, v1.l row_ror:15
+// GFX12: v_sin_f16_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x2f,0x01,0xff]
 
-v_sin_f16 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1
-// GFX12: v_sin_f16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+v_sin_f16 v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf
+// GFX12: v_sin_f16_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x50,0x01,0xff]
 
-v_sin_f16 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
-// GFX12: v_sin_f16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x60,0x09,0x13]
+v_sin_f16 v5.l, v1.l row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX12: v_sin_f16_dpp v5.l, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x5f,0x01,0x01]
 
-v_sin_f16 v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
-// GFX12: v_sin_f16_dpp v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xc0,0xfe,0x7e,0x7f,0x6f,0x35,0x30]
+v_sin_f16 v5.l, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: v_sin_f16_dpp v5.l, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x60,0x09,0x13]
+
+v_sin_f16 v127.l, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: v_sin_f16_dpp v127.l, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xc0,0xfe,0x7e,0x7f,0x6f,0x35,0x30]
+
+v_sin_f16 v5.h, v1.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: v_sin_f16_dpp v5.h, v1.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xc0,0x0a,0x7f,0x81,0x60,0x09,0x13]
+
+v_sin_f16 v127.h, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: v_sin_f16_dpp v127.h, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xc0,0xfe,0x7f,0xff,0x6f,0x35,0x30]
 
 v_sin_f32 v5, v1 quad_perm:[3,2,1,0]
 // GFX12: v_sin_f32_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x6a,0x0a,0x7e,0x01,0x1b,0x00,0xff]
@@ -2800,47 +2854,53 @@ v_sqrt_f32 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
 v_sqrt_f32 v255, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
 // GFX12: v_sqrt_f32_dpp v255, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0x66,0xfe,0x7f,0xff,0x6f,0x35,0x30]
 
-v_trunc_f16 v5, v1 quad_perm:[3,2,1,0]
-// GFX12: v_trunc_f16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+v_trunc_f16 v5.l, v1.l quad_perm:[3,2,1,0]
+// GFX12: v_trunc_f16_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+
+v_trunc_f16 v5.l, v1.l quad_perm:[0,1,2,3]
+// GFX12: v_trunc_f16_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+
+v_trunc_f16 v5.l, v1.l row_mirror
+// GFX12: v_trunc_f16_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x40,0x01,0xff]
 
-v_trunc_f16 v5, v1 quad_perm:[0,1,2,3]
-// GFX12: v_trunc_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+v_trunc_f16 v5.l, v1.l row_half_mirror
+// GFX12: v_trunc_f16_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x41,0x01,0xff]
 
-v_trunc_f16 v5, v1 row_mirror
-// GFX12: v_trunc_f16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x40,0x01,0xff]
+v_trunc_f16 v5.l, v1.l row_shl:1
+// GFX12: v_trunc_f16_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x01,0x01,0xff]
 
-v_trunc_f16 v5, v1 row_half_mirror
-// GFX12: v_trunc_f16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x41,0x01,0xff]
+v_trunc_f16 v5.l, v1.l row_shl:15
+// GFX12: v_trunc_f16_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x0f,0x01,0xff]
 
-v_trunc_f16 v5, v1 row_shl:1
-// GFX12: v_trunc_f16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x01,0x01,0xff]
+v_trunc_f16 v5.l, v1.l row_shr:1
+// GFX12: v_trunc_f16_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x11,0x01,0xff]
 
-v_trunc_f16 v5, v1 row_shl:15
-// GFX12: v_trunc_f16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+v_trunc_f16 v5.l, v1.l row_shr:15
+// GFX12: v_trunc_f16_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x1f,0x01,0xff]
 
-v_trunc_f16 v5, v1 row_shr:1
-// GFX12: v_trunc_f16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x11,0x01,0xff]
+v_trunc_f16 v5.l, v1.l row_ror:1
+// GFX12: v_trunc_f16_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x21,0x01,0xff]
 
-v_trunc_f16 v5, v1 row_shr:15
-// GFX12: v_trunc_f16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+v_trunc_f16 v5.l, v1.l row_ror:15
+// GFX12: v_trunc_f16_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x2f,0x01,0xff]
 
-v_trunc_f16 v5, v1 row_ror:1
-// GFX12: v_trunc_f16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x21,0x01,0xff]
+v_trunc_f16 v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf
+// GFX12: v_trunc_f16_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x50,0x01,0xff]
 
-v_trunc_f16 v5, v1 row_ror:15
-// GFX12: v_trunc_f16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+v_trunc_f16 v5.l, v1.l row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX12: v_trunc_f16_dpp v5.l, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x5f,0x01,0x01]
 
-v_trunc_f16 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
-// GFX12: v_trunc_f16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x50,0x01,0xff]
+v_trunc_f16 v5.l, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: v_trunc_f16_dpp v5.l, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x60,0x09,0x13]
 
-v_trunc_f16 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1
-// GFX12: v_trunc_f16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+v_trunc_f16 v127.l, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: v_trunc_f16_dpp v127.l, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xba,0xfe,0x7e,0x7f,0x6f,0x35,0x30]
 
-v_trunc_f16 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
-// GFX12: v_trunc_f16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x60,0x09,0x13]
+v_trunc_f16 v5.h, v1.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: v_trunc_f16_dpp v5.h, v1.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xba,0x0a,0x7f,0x81,0x60,0x09,0x13]
 
-v_trunc_f16 v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
-// GFX12: v_trunc_f16_dpp v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xba,0xfe,0x7e,0x7f,0x6f,0x35,0x30]
+v_trunc_f16 v127.h, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: v_trunc_f16_dpp v127.h, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xba,0xfe,0x7f,0xff,0x6f,0x35,0x30]
 
 v_trunc_f32 v5, v1 quad_perm:[3,2,1,0]
 // GFX12: v_trunc_f32_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x42,0x0a,0x7e,0x01,0x1b,0x00,0xff]
diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp8.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp8.s
index 09f3069..ecf408e 100644
--- a/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp8.s
+++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp8.s
@@ -49,14 +49,20 @@ v_clz_i32_u32 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
 v_clz_i32_u32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
 // GFX12: v_clz_i32_u32_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0x72,0xfe,0x7f,0xff,0x00,0x00,0x00]
 
-v_cos_f16 v5, v1 dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_cos_f16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xc2,0x0a,0x7e,0x01,0x77,0x39,0x05]
+v_cos_f16 v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_cos_f16_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xc2,0x0a,0x7e,0x01,0x77,0x39,0x05]
 
-v_cos_f16 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
-// GFX12: v_cos_f16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xc2,0x0a,0x7e,0x01,0x77,0x39,0x05]
+v_cos_f16 v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: v_cos_f16_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xc2,0x0a,0x7e,0x01,0x77,0x39,0x05]
 
-v_cos_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0
-// GFX12: v_cos_f16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xc2,0xfe,0x7e,0x7f,0x00,0x00,0x00]
+v_cos_f16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: v_cos_f16_dpp v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xc2,0xfe,0x7e,0x7f,0x00,0x00,0x00]
+
+v_cos_f16 v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: v_cos_f16_dpp v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xc2,0x0a,0x7f,0x81,0x77,0x39,0x05]
+
+v_cos_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: v_cos_f16_dpp v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xc2,0xfe,0x7f,0xff,0x00,0x00,0x00]
 
 v_cos_f32 v5, v1 dpp8:[7,6,5,4,3,2,1,0]
 // GFX12: v_cos_f32_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x6c,0x0a,0x7e,0x01,0x77,0x39,0x05]
@@ -244,14 +250,20 @@ v_cvt_i32_f32 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
 v_cvt_i32_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
 // GFX12: v_cvt_i32_f32_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0x10,0xfe,0x7f,0xff,0x00,0x00,0x00]
 
-v_cvt_i32_i16 v5, v1 dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_cvt_i32_i16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xd4,0x0a,0x7e,0x01,0x77,0x39,0x05]
+v_cvt_i32_i16 v5, v1.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_cvt_i32_i16_dpp v5, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xd4,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_cvt_i32_i16 v5, v1.l dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: v_cvt_i32_i16_dpp v5, v1.l dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xd4,0x0a,0x7e,0x01,0x77,0x39,0x05]
 
-v_cvt_i32_i16 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
-// GFX12: v_cvt_i32_i16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xd4,0x0a,0x7e,0x01,0x77,0x39,0x05]
+v_cvt_i32_i16 v255, v127.l dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: v_cvt_i32_i16_dpp v255, v127.l dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xd4,0xfe,0x7f,0x7f,0x00,0x00,0x00]
 
-v_cvt_i32_i16 v255, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0
-// GFX12: v_cvt_i32_i16_dpp v255, v127 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xd4,0xfe,0x7f,0x7f,0x00,0x00,0x00]
+v_cvt_i32_i16 v5, v1.h dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: v_cvt_i32_i16_dpp v5, v1.h dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xd4,0x0a,0x7e,0x81,0x77,0x39,0x05]
+
+v_cvt_i32_i16 v255, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: v_cvt_i32_i16_dpp v255, v127.h dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xd4,0xfe,0x7f,0xff,0x00,0x00,0x00]
 
 v_cvt_nearest_i32_f32 v5, v1 dpp8:[7,6,5,4,3,2,1,0]
 // GFX12: v_cvt_nearest_i32_f32_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x18,0x0a,0x7e,0x01,0x77,0x39,0x05]
@@ -334,14 +346,20 @@ v_cvt_u32_f32 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
 v_cvt_u32_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
 // GFX12: v_cvt_u32_f32_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0x0e,0xfe,0x7f,0xff,0x00,0x00,0x00]
 
-v_cvt_u32_u16 v5, v1 dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_cvt_u32_u16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xd6,0x0a,0x7e,0x01,0x77,0x39,0x05]
+v_cvt_u32_u16 v5, v1.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_cvt_u32_u16_dpp v5, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xd6,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_cvt_u32_u16 v5, v1.l dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: v_cvt_u32_u16_dpp v5, v1.l dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xd6,0x0a,0x7e,0x01,0x77,0x39,0x05]
 
-v_cvt_u32_u16 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
-// GFX12: v_cvt_u32_u16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xd6,0x0a,0x7e,0x01,0x77,0x39,0x05]
+v_cvt_u32_u16 v255, v127.l dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: v_cvt_u32_u16_dpp v255, v127.l dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xd6,0xfe,0x7f,0x7f,0x00,0x00,0x00]
 
-v_cvt_u32_u16 v255, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0
-// GFX12: v_cvt_u32_u16_dpp v255, v127 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xd6,0xfe,0x7f,0x7f,0x00,0x00,0x00]
+v_cvt_u32_u16 v5, v1.h dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: v_cvt_u32_u16_dpp v5, v1.h dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xd6,0x0a,0x7e,0x81,0x77,0x39,0x05]
+
+v_cvt_u32_u16 v255, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: v_cvt_u32_u16_dpp v255, v127.h dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xd6,0xfe,0x7f,0xff,0x00,0x00,0x00]
 
 v_exp_f16 v5, v1 dpp8:[7,6,5,4,3,2,1,0]
 // GFX12: v_exp_f16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xb0,0x0a,0x7e,0x01,0x77,0x39,0x05]
@@ -406,14 +424,20 @@ v_floor_f32 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
 v_floor_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
 // GFX12: v_floor_f32_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0x48,0xfe,0x7f,0xff,0x00,0x00,0x00]
 
-v_fract_f16 v5, v1 dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_fract_f16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xbe,0x0a,0x7e,0x01,0x77,0x39,0x05]
+v_fract_f16 v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_fract_f16_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xbe,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_fract_f16 v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: v_fract_f16_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xbe,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_fract_f16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: v_fract_f16_dpp v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xbe,0xfe,0x7e,0x7f,0x00,0x00,0x00]
 
-v_fract_f16 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
-// GFX12: v_fract_f16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xbe,0x0a,0x7e,0x01,0x77,0x39,0x05]
+v_fract_f16 v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: v_fract_f16_dpp v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xbe,0x0a,0x7f,0x81,0x77,0x39,0x05]
 
-v_fract_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0
-// GFX12: v_fract_f16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xbe,0xfe,0x7e,0x7f,0x00,0x00,0x00]
+v_fract_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: v_fract_f16_dpp v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xbe,0xfe,0x7f,0xff,0x00,0x00,0x00]
 
 v_fract_f32 v5, v1 dpp8:[7,6,5,4,3,2,1,0]
 // GFX12: v_fract_f32_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x40,0x0a,0x7e,0x01,0x77,0x39,0x05]
@@ -448,14 +472,20 @@ v_frexp_exp_i32_f32 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
 v_frexp_exp_i32_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
 // GFX12: v_frexp_exp_i32_f32_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0x7e,0xfe,0x7f,0xff,0x00,0x00,0x00]
 
-v_frexp_mant_f16 v5, v1 dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_frexp_mant_f16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xb2,0x0a,0x7e,0x01,0x77,0x39,0x05]
+v_frexp_mant_f16 v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_frexp_mant_f16_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xb2,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_frexp_mant_f16 v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: v_frexp_mant_f16_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xb2,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_frexp_mant_f16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: v_frexp_mant_f16_dpp v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xb2,0xfe,0x7e,0x7f,0x00,0x00,0x00]
 
-v_frexp_mant_f16 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
-// GFX12: v_frexp_mant_f16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xb2,0x0a,0x7e,0x01,0x77,0x39,0x05]
+v_frexp_mant_f16 v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: v_frexp_mant_f16_dpp v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xb2,0x0a,0x7f,0x81,0x77,0x39,0x05]
 
-v_frexp_mant_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0
-// GFX12: v_frexp_mant_f16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xb2,0xfe,0x7e,0x7f,0x00,0x00,0x00]
+v_frexp_mant_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: v_frexp_mant_f16_dpp v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xb2,0xfe,0x7f,0xff,0x00,0x00,0x00]
 
 v_frexp_mant_f32 v5, v1 dpp8:[7,6,5,4,3,2,1,0]
 // GFX12: v_frexp_mant_f32_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x80,0x0a,0x7e,0x01,0x77,0x39,0x05]
@@ -529,14 +559,20 @@ v_movrelsd_b32 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
 v_movrelsd_b32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
 // GFX12: v_movrelsd_b32_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0x88,0xfe,0x7f,0xff,0x00,0x00,0x00]
 
-v_not_b16 v5, v1 dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_not_b16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xd2,0x0a,0x7e,0x01,0x77,0x39,0x05]
+v_not_b16 v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_not_b16_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xd2,0x0a,0x7e,0x01,0x77,0x39,0x05]
 
-v_not_b16 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
-// GFX12: v_not_b16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xd2,0x0a,0x7e,0x01,0x77,0x39,0x05]
+v_not_b16 v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: v_not_b16_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xd2,0x0a,0x7e,0x01,0x77,0x39,0x05]
 
-v_not_b16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0
-// GFX12: v_not_b16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xd2,0xfe,0x7e,0x7f,0x00,0x00,0x00]
+v_not_b16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: v_not_b16_dpp v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xd2,0xfe,0x7e,0x7f,0x00,0x00,0x00]
+
+v_not_b16 v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: v_not_b16_dpp v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xd2,0x0a,0x7f,0x81,0x77,0x39,0x05]
+
+v_not_b16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: v_not_b16_dpp v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xd2,0xfe,0x7f,0xff,0x00,0x00,0x00]
 
 v_not_b32 v5, v1 dpp8:[7,6,5,4,3,2,1,0]
 // GFX12: v_not_b32_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x6e,0x0a,0x7e,0x01,0x77,0x39,0x05]
@@ -574,14 +610,20 @@ v_rcp_iflag_f32 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
 v_rcp_iflag_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
 // GFX12: v_rcp_iflag_f32_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0x56,0xfe,0x7f,0xff,0x00,0x00,0x00]
 
-v_rndne_f16 v5, v1 dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_rndne_f16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xbc,0x0a,0x7e,0x01,0x77,0x39,0x05]
+v_rndne_f16 v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_rndne_f16_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xbc,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_rndne_f16 v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: v_rndne_f16_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xbc,0x0a,0x7e,0x01,0x77,0x39,0x05]
 
-v_rndne_f16 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
-// GFX12: v_rndne_f16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xbc,0x0a,0x7e,0x01,0x77,0x39,0x05]
+v_rndne_f16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: v_rndne_f16_dpp v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xbc,0xfe,0x7e,0x7f,0x00,0x00,0x00]
 
-v_rndne_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0
-// GFX12: v_rndne_f16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xbc,0xfe,0x7e,0x7f,0x00,0x00,0x00]
+v_rndne_f16 v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: v_rndne_f16_dpp v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xbc,0x0a,0x7f,0x81,0x77,0x39,0x05]
+
+v_rndne_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: v_rndne_f16_dpp v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xbc,0xfe,0x7f,0xff,0x00,0x00,0x00]
 
 v_rndne_f32 v5, v1 dpp8:[7,6,5,4,3,2,1,0]
 // GFX12: v_rndne_f32_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x46,0x0a,0x7e,0x01,0x77,0x39,0x05]
@@ -619,14 +661,26 @@ v_sat_pk_u8_i16 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
 v_sat_pk_u8_i16 v127, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
 // GFX12: v_sat_pk_u8_i16_dpp v127, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xc4,0xfe,0x7e,0xff,0x00,0x00,0x00]
 
-v_sin_f16 v5, v1 dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_sin_f16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xc0,0x0a,0x7e,0x01,0x77,0x39,0x05]
+v_sat_pk_u8_i16 v5.h, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: v_sat_pk_u8_i16_dpp v5.h, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xc4,0x0a,0x7f,0x01,0x77,0x39,0x05]
+
+v_sat_pk_u8_i16 v127.h, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: v_sat_pk_u8_i16_dpp v127.h, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xc4,0xfe,0x7f,0xff,0x00,0x00,0x00]
 
-v_sin_f16 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
-// GFX12: v_sin_f16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xc0,0x0a,0x7e,0x01,0x77,0x39,0x05]
+v_sin_f16 v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_sin_f16_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xc0,0x0a,0x7e,0x01,0x77,0x39,0x05]
 
-v_sin_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0
-// GFX12: v_sin_f16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xc0,0xfe,0x7e,0x7f,0x00,0x00,0x00]
+v_sin_f16 v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: v_sin_f16_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xc0,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_sin_f16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: v_sin_f16_dpp v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xc0,0xfe,0x7e,0x7f,0x00,0x00,0x00]
+
+v_sin_f16 v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: v_sin_f16_dpp v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xc0,0x0a,0x7f,0x81,0x77,0x39,0x05]
+
+v_sin_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: v_sin_f16_dpp v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xc0,0xfe,0x7f,0xff,0x00,0x00,0x00]
 
 v_sin_f32 v5, v1 dpp8:[7,6,5,4,3,2,1,0]
 // GFX12: v_sin_f32_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x6a,0x0a,0x7e,0x01,0x77,0x39,0x05]
@@ -655,14 +709,20 @@ v_sqrt_f32 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
 v_sqrt_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
 // GFX12: v_sqrt_f32_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0x66,0xfe,0x7f,0xff,0x00,0x00,0x00]
 
-v_trunc_f16 v5, v1 dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_trunc_f16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xba,0x0a,0x7e,0x01,0x77,0x39,0x05]
+v_trunc_f16 v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_trunc_f16_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xba,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_trunc_f16 v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: v_trunc_f16_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xba,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_trunc_f16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: v_trunc_f16_dpp v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xba,0xfe,0x7e,0x7f,0x00,0x00,0x00]
 
-v_trunc_f16 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
-// GFX12: v_trunc_f16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xba,0x0a,0x7e,0x01,0x77,0x39,0x05]
+v_trunc_f16 v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: v_trunc_f16_dpp v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xba,0x0a,0x7f,0x81,0x77,0x39,0x05]
 
-v_trunc_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0
-// GFX12: v_trunc_f16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xba,0xfe,0x7e,0x7f,0x00,0x00,0x00]
+v_trunc_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: v_trunc_f16_dpp v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xba,0xfe,0x7f,0xff,0x00,0x00,0x00]
 
 v_trunc_f32 v5, v1 dpp8:[7,6,5,4,3,2,1,0]
 // GFX12: v_trunc_f32_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x42,0x0a,0x7e,0x01,0x77,0x39,0x05]
diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop1_t16_err.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop1_t16_err.s
index 0ccad9c..ad08a5c 100644
--- a/llvm/test/MC/AMDGPU/gfx12_asm_vop1_t16_err.s
+++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop1_t16_err.s
@@ -1,3 +1,4 @@
+// NOTE: Assertions have been autogenerated by utils/update_mc_test_checks.py UTC_ARGS: --version 5
 ; NOTE: Assertions have been autogenerated by utils/update_mc_test_checks.py UTC_ARGS: --version 5
 // RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -show-encoding %s 2>&1 | FileCheck --check-prefix=GFX12 --implicit-check-not=error %s
 
@@ -25,6 +26,12 @@ v_ceil_f16_e32 v5, v199 quad_perm:[3,2,1,0]
 v_cos_f16_e32 v128, 0xfe0b
 // GFX12: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode
 
+v_cos_f16_e32 v128.h, 0xfe0b
+// GFX12: :[[@LINE-1]]:15: error: invalid operand for instruction
+
+v_cos_f16_e32 v128.l, 0xfe0b
+// GFX12: :[[@LINE-1]]:15: error: invalid operand for instruction
+
 v_cos_f16_e32 v255, v1
 // GFX12: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode
 
@@ -34,6 +41,24 @@ v_cos_f16_e32 v255, v1 dpp8:[7,6,5,4,3,2,1,0]
 v_cos_f16_e32 v255, v1 quad_perm:[3,2,1,0]
 // GFX12: :[[@LINE-1]]:24: error: invalid operand for instruction
 
+v_cos_f16_e32 v255.h, v1.h
+// GFX12: :[[@LINE-1]]:15: error: invalid operand for instruction
+
+v_cos_f16_e32 v255.h, v1.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: :[[@LINE-1]]:15: error: invalid operand for instruction
+
+v_cos_f16_e32 v255.h, v1.h quad_perm:[3,2,1,0]
+// GFX12: :[[@LINE-1]]:15: error: invalid operand for instruction
+
+v_cos_f16_e32 v255.l, v1.l
+// GFX12: :[[@LINE-1]]:15: error: invalid operand for instruction
+
+v_cos_f16_e32 v255.l, v1.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: :[[@LINE-1]]:15: error: invalid operand for instruction
+
+v_cos_f16_e32 v255.l, v1.l quad_perm:[3,2,1,0]
+// GFX12: :[[@LINE-1]]:15: error: invalid operand for instruction
+
 v_cos_f16_e32 v5, v199
 // GFX12: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode
 
@@ -43,6 +68,24 @@ v_cos_f16_e32 v5, v199 dpp8:[7,6,5,4,3,2,1,0]
 v_cos_f16_e32 v5, v199 quad_perm:[3,2,1,0]
 // GFX12: :[[@LINE-1]]:24: error: invalid operand for instruction
 
+v_cos_f16_e32 v5.h, v199.h
+// GFX12: :[[@LINE-1]]:21: error: invalid operand for instruction
+
+v_cos_f16_e32 v5.h, v199.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: :[[@LINE-1]]:21: error: invalid operand for instruction
+
+v_cos_f16_e32 v5.h, v199.h quad_perm:[3,2,1,0]
+// GFX12: :[[@LINE-1]]:21: error: invalid operand for instruction
+
+v_cos_f16_e32 v5.l, v199.l
+// GFX12: :[[@LINE-1]]:21: error: invalid operand for instruction
+
+v_cos_f16_e32 v5.l, v199.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: :[[@LINE-1]]:21: error: invalid operand for instruction
+
+v_cos_f16_e32 v5.l, v199.l quad_perm:[3,2,1,0]
+// GFX12: :[[@LINE-1]]:21: error: invalid operand for instruction
+
 v_cvt_f16_f32_e32 v128.h, 0xaf123456
 // GFX12: :[[@LINE-1]]:19: error: invalid operand for instruction
 
@@ -250,6 +293,24 @@ v_cvt_i32_i16_e32 v5, v199 dpp8:[7,6,5,4,3,2,1,0]
 v_cvt_i32_i16_e32 v5, v199 quad_perm:[3,2,1,0]
 // GFX12: :[[@LINE-1]]:23: error: invalid operand for instruction
 
+v_cvt_i32_i16_e32 v5, v199.h
+// GFX12: :[[@LINE-1]]:23: error: invalid operand for instruction
+
+v_cvt_i32_i16_e32 v5, v199.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: :[[@LINE-1]]:23: error: invalid operand for instruction
+
+v_cvt_i32_i16_e32 v5, v199.h quad_perm:[3,2,1,0]
+// GFX12: :[[@LINE-1]]:23: error: invalid operand for instruction
+
+v_cvt_i32_i16_e32 v5, v199.l
+// GFX12: :[[@LINE-1]]:23: error: invalid operand for instruction
+
+v_cvt_i32_i16_e32 v5, v199.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: :[[@LINE-1]]:23: error: invalid operand for instruction
+
+v_cvt_i32_i16_e32 v5, v199.l quad_perm:[3,2,1,0]
+// GFX12: :[[@LINE-1]]:23: error: invalid operand for instruction
+
 v_cvt_norm_i16_f16_e32 v128.h, 0xfe0b
 // GFX12: :[[@LINE-1]]:24: error: invalid operand for instruction
 
@@ -385,6 +446,24 @@ v_cvt_u32_u16_e32 v5, v199 dpp8:[7,6,5,4,3,2,1,0]
 v_cvt_u32_u16_e32 v5, v199 quad_perm:[3,2,1,0]
 // GFX12: :[[@LINE-1]]:23: error: invalid operand for instruction
 
+v_cvt_u32_u16_e32 v5, v199.h
+// GFX12: :[[@LINE-1]]:23: error: invalid operand for instruction
+
+v_cvt_u32_u16_e32 v5, v199.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: :[[@LINE-1]]:23: error: invalid operand for instruction
+
+v_cvt_u32_u16_e32 v5, v199.h quad_perm:[3,2,1,0]
+// GFX12: :[[@LINE-1]]:23: error: invalid operand for instruction
+
+v_cvt_u32_u16_e32 v5, v199.l
+// GFX12: :[[@LINE-1]]:23: error: invalid operand for instruction
+
+v_cvt_u32_u16_e32 v5, v199.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: :[[@LINE-1]]:23: error: invalid operand for instruction
+
+v_cvt_u32_u16_e32 v5, v199.l quad_perm:[3,2,1,0]
+// GFX12: :[[@LINE-1]]:23: error: invalid operand for instruction
+
 v_exp_f16_e32 v128, 0xfe0b
 // GFX12: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode
 
@@ -430,6 +509,12 @@ v_floor_f16_e32 v5, v199 quad_perm:[3,2,1,0]
 v_fract_f16_e32 v128, 0xfe0b
 // GFX12: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode
 
+v_fract_f16_e32 v128.h, 0xfe0b
+// GFX12: :[[@LINE-1]]:17: error: invalid operand for instruction
+
+v_fract_f16_e32 v128.l, 0xfe0b
+// GFX12: :[[@LINE-1]]:17: error: invalid operand for instruction
+
 v_fract_f16_e32 v255, v1
 // GFX12: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode
 
@@ -439,6 +524,24 @@ v_fract_f16_e32 v255, v1 dpp8:[7,6,5,4,3,2,1,0]
 v_fract_f16_e32 v255, v1 quad_perm:[3,2,1,0]
 // GFX12: :[[@LINE-1]]:26: error: invalid operand for instruction
 
+v_fract_f16_e32 v255.h, v1.h
+// GFX12: :[[@LINE-1]]:17: error: invalid operand for instruction
+
+v_fract_f16_e32 v255.h, v1.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: :[[@LINE-1]]:17: error: invalid operand for instruction
+
+v_fract_f16_e32 v255.h, v1.h quad_perm:[3,2,1,0]
+// GFX12: :[[@LINE-1]]:17: error: invalid operand for instruction
+
+v_fract_f16_e32 v255.l, v1.l
+// GFX12: :[[@LINE-1]]:17: error: invalid operand for instruction
+
+v_fract_f16_e32 v255.l, v1.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: :[[@LINE-1]]:17: error: invalid operand for instruction
+
+v_fract_f16_e32 v255.l, v1.l quad_perm:[3,2,1,0]
+// GFX12: :[[@LINE-1]]:17: error: invalid operand for instruction
+
 v_fract_f16_e32 v5, v199
 // GFX12: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode
 
@@ -448,6 +551,24 @@ v_fract_f16_e32 v5, v199 dpp8:[7,6,5,4,3,2,1,0]
 v_fract_f16_e32 v5, v199 quad_perm:[3,2,1,0]
 // GFX12: :[[@LINE-1]]:26: error: invalid operand for instruction
 
+v_fract_f16_e32 v5.h, v199.h
+// GFX12: :[[@LINE-1]]:23: error: invalid operand for instruction
+
+v_fract_f16_e32 v5.h, v199.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: :[[@LINE-1]]:23: error: invalid operand for instruction
+
+v_fract_f16_e32 v5.h, v199.h quad_perm:[3,2,1,0]
+// GFX12: :[[@LINE-1]]:23: error: invalid operand for instruction
+
+v_fract_f16_e32 v5.l, v199.l
+// GFX12: :[[@LINE-1]]:23: error: invalid operand for instruction
+
+v_fract_f16_e32 v5.l, v199.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: :[[@LINE-1]]:23: error: invalid operand for instruction
+
+v_fract_f16_e32 v5.l, v199.l quad_perm:[3,2,1,0]
+// GFX12: :[[@LINE-1]]:23: error: invalid operand for instruction
+
 v_frexp_exp_i16_f16_e32 v128.h, 0xfe0b
 // GFX12: :[[@LINE-1]]:25: error: invalid operand for instruction
 
@@ -493,6 +614,12 @@ v_frexp_exp_i16_f16_e32 v5.l, v199.l quad_perm:[3,2,1,0]
 v_frexp_mant_f16_e32 v128, 0xfe0b
 // GFX12: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode
 
+v_frexp_mant_f16_e32 v128.h, 0xfe0b
+// GFX12: :[[@LINE-1]]:22: error: invalid operand for instruction
+
+v_frexp_mant_f16_e32 v128.l, 0xfe0b
+// GFX12: :[[@LINE-1]]:22: error: invalid operand for instruction
+
 v_frexp_mant_f16_e32 v255, v1
 // GFX12: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode
 
@@ -502,6 +629,24 @@ v_frexp_mant_f16_e32 v255, v1 dpp8:[7,6,5,4,3,2,1,0]
 v_frexp_mant_f16_e32 v255, v1 quad_perm:[3,2,1,0]
 // GFX12: :[[@LINE-1]]:31: error: invalid operand for instruction
 
+v_frexp_mant_f16_e32 v255.h, v1.h
+// GFX12: :[[@LINE-1]]:22: error: invalid operand for instruction
+
+v_frexp_mant_f16_e32 v255.h, v1.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: :[[@LINE-1]]:22: error: invalid operand for instruction
+
+v_frexp_mant_f16_e32 v255.h, v1.h quad_perm:[3,2,1,0]
+// GFX12: :[[@LINE-1]]:22: error: invalid operand for instruction
+
+v_frexp_mant_f16_e32 v255.l, v1.l
+// GFX12: :[[@LINE-1]]:22: error: invalid operand for instruction
+
+v_frexp_mant_f16_e32 v255.l, v1.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: :[[@LINE-1]]:22: error: invalid operand for instruction
+
+v_frexp_mant_f16_e32 v255.l, v1.l quad_perm:[3,2,1,0]
+// GFX12: :[[@LINE-1]]:22: error: invalid operand for instruction
+
 v_frexp_mant_f16_e32 v5, v199
 // GFX12: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode
 
@@ -511,6 +656,24 @@ v_frexp_mant_f16_e32 v5, v199 dpp8:[7,6,5,4,3,2,1,0]
 v_frexp_mant_f16_e32 v5, v199 quad_perm:[3,2,1,0]
 // GFX12: :[[@LINE-1]]:31: error: invalid operand for instruction
 
+v_frexp_mant_f16_e32 v5.h, v199.h
+// GFX12: :[[@LINE-1]]:28: error: invalid operand for instruction
+
+v_frexp_mant_f16_e32 v5.h, v199.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: :[[@LINE-1]]:28: error: invalid operand for instruction
+
+v_frexp_mant_f16_e32 v5.h, v199.h quad_perm:[3,2,1,0]
+// GFX12: :[[@LINE-1]]:28: error: invalid operand for instruction
+
+v_frexp_mant_f16_e32 v5.l, v199.l
+// GFX12: :[[@LINE-1]]:28: error: invalid operand for instruction
+
+v_frexp_mant_f16_e32 v5.l, v199.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: :[[@LINE-1]]:28: error: invalid operand for instruction
+
+v_frexp_mant_f16_e32 v5.l, v199.l quad_perm:[3,2,1,0]
+// GFX12: :[[@LINE-1]]:28: error: invalid operand for instruction
+
 v_log_f16_e32 v128, 0xfe0b
 // GFX12: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode
 
@@ -535,6 +698,12 @@ v_log_f16_e32 v5, v199 quad_perm:[3,2,1,0]
 v_not_b16_e32 v128, 0xfe0b
 // GFX12: :[[@LINE-1]]:15: error: invalid operand for instruction
 
+v_not_b16_e32 v128.h, 0xfe0b
+// GFX12: :[[@LINE-1]]:15: error: invalid operand for instruction
+
+v_not_b16_e32 v128.l, 0xfe0b
+// GFX12: :[[@LINE-1]]:15: error: invalid operand for instruction
+
 v_not_b16_e32 v255, v1
 // GFX12: :[[@LINE-1]]:15: error: invalid operand for instruction
 
@@ -544,6 +713,24 @@ v_not_b16_e32 v255, v1 dpp8:[7,6,5,4,3,2,1,0]
 v_not_b16_e32 v255, v1 quad_perm:[3,2,1,0]
 // GFX12: :[[@LINE-1]]:15: error: invalid operand for instruction
 
+v_not_b16_e32 v255.h, v1.h
+// GFX12: :[[@LINE-1]]:15: error: invalid operand for instruction
+
+v_not_b16_e32 v255.h, v1.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: :[[@LINE-1]]:15: error: invalid operand for instruction
+
+v_not_b16_e32 v255.h, v1.h quad_perm:[3,2,1,0]
+// GFX12: :[[@LINE-1]]:15: error: invalid operand for instruction
+
+v_not_b16_e32 v255.l, v1.l
+// GFX12: :[[@LINE-1]]:15: error: invalid operand for instruction
+
+v_not_b16_e32 v255.l, v1.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: :[[@LINE-1]]:15: error: invalid operand for instruction
+
+v_not_b16_e32 v255.l, v1.l quad_perm:[3,2,1,0]
+// GFX12: :[[@LINE-1]]:15: error: invalid operand for instruction
+
 v_not_b16_e32 v5, v199
 // GFX12: :[[@LINE-1]]:19: error: invalid operand for instruction
 
@@ -553,6 +740,24 @@ v_not_b16_e32 v5, v199 dpp8:[7,6,5,4,3,2,1,0]
 v_not_b16_e32 v5, v199 quad_perm:[3,2,1,0]
 // GFX12: :[[@LINE-1]]:19: error: invalid operand for instruction
 
+v_not_b16_e32 v5.h, v199.h
+// GFX12: :[[@LINE-1]]:21: error: invalid operand for instruction
+
+v_not_b16_e32 v5.h, v199.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: :[[@LINE-1]]:21: error: invalid operand for instruction
+
+v_not_b16_e32 v5.h, v199.h quad_perm:[3,2,1,0]
+// GFX12: :[[@LINE-1]]:21: error: invalid operand for instruction
+
+v_not_b16_e32 v5.l, v199.l
+// GFX12: :[[@LINE-1]]:21: error: invalid operand for instruction
+
+v_not_b16_e32 v5.l, v199.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: :[[@LINE-1]]:21: error: invalid operand for instruction
+
+v_not_b16_e32 v5.l, v199.l quad_perm:[3,2,1,0]
+// GFX12: :[[@LINE-1]]:21: error: invalid operand for instruction
+
 v_rcp_f16_e32 v128, 0xfe0b
 // GFX12: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode
 
@@ -577,6 +782,12 @@ v_rcp_f16_e32 v5, v199 quad_perm:[3,2,1,0]
 v_rndne_f16_e32 v128, 0xfe0b
 // GFX12: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode
 
+v_rndne_f16_e32 v128.h, 0xfe0b
+// GFX12: :[[@LINE-1]]:17: error: invalid operand for instruction
+
+v_rndne_f16_e32 v128.l, 0xfe0b
+// GFX12: :[[@LINE-1]]:17: error: invalid operand for instruction
+
 v_rndne_f16_e32 v255, v1
 // GFX12: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode
 
@@ -586,6 +797,24 @@ v_rndne_f16_e32 v255, v1 dpp8:[7,6,5,4,3,2,1,0]
 v_rndne_f16_e32 v255, v1 quad_perm:[3,2,1,0]
 // GFX12: :[[@LINE-1]]:26: error: invalid operand for instruction
 
+v_rndne_f16_e32 v255.h, v1.h
+// GFX12: :[[@LINE-1]]:17: error: invalid operand for instruction
+
+v_rndne_f16_e32 v255.h, v1.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: :[[@LINE-1]]:17: error: invalid operand for instruction
+
+v_rndne_f16_e32 v255.h, v1.h quad_perm:[3,2,1,0]
+// GFX12: :[[@LINE-1]]:17: error: invalid operand for instruction
+
+v_rndne_f16_e32 v255.l, v1.l
+// GFX12: :[[@LINE-1]]:17: error: invalid operand for instruction
+
+v_rndne_f16_e32 v255.l, v1.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: :[[@LINE-1]]:17: error: invalid operand for instruction
+
+v_rndne_f16_e32 v255.l, v1.l quad_perm:[3,2,1,0]
+// GFX12: :[[@LINE-1]]:17: error: invalid operand for instruction
+
 v_rndne_f16_e32 v5, v199
 // GFX12: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode
 
@@ -595,6 +824,24 @@ v_rndne_f16_e32 v5, v199 dpp8:[7,6,5,4,3,2,1,0]
 v_rndne_f16_e32 v5, v199 quad_perm:[3,2,1,0]
 // GFX12: :[[@LINE-1]]:26: error: invalid operand for instruction
 
+v_rndne_f16_e32 v5.h, v199.h
+// GFX12: :[[@LINE-1]]:23: error: invalid operand for instruction
+
+v_rndne_f16_e32 v5.h, v199.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: :[[@LINE-1]]:23: error: invalid operand for instruction
+
+v_rndne_f16_e32 v5.h, v199.h quad_perm:[3,2,1,0]
+// GFX12: :[[@LINE-1]]:23: error: invalid operand for instruction
+
+v_rndne_f16_e32 v5.l, v199.l
+// GFX12: :[[@LINE-1]]:23: error: invalid operand for instruction
+
+v_rndne_f16_e32 v5.l, v199.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: :[[@LINE-1]]:23: error: invalid operand for instruction
+
+v_rndne_f16_e32 v5.l, v199.l quad_perm:[3,2,1,0]
+// GFX12: :[[@LINE-1]]:23: error: invalid operand for instruction
+
 v_rsq_f16_e32 v128, 0xfe0b
 // GFX12: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode
 
@@ -625,9 +872,33 @@ v_sat_pk_u8_i16_e32 v199, v5 dpp8:[7,6,5,4,3,2,1,0]
 v_sat_pk_u8_i16_e32 v199, v5 quad_perm:[3,2,1,0]
 // GFX12: :[[@LINE-1]]:30: error: invalid operand for instruction
 
+v_sat_pk_u8_i16_e32 v199.h, v5
+// GFX12: :[[@LINE-1]]:21: error: invalid operand for instruction
+
+v_sat_pk_u8_i16_e32 v199.h, v5 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: :[[@LINE-1]]:21: error: invalid operand for instruction
+
+v_sat_pk_u8_i16_e32 v199.h, v5 quad_perm:[3,2,1,0]
+// GFX12: :[[@LINE-1]]:21: error: invalid operand for instruction
+
+v_sat_pk_u8_i16_e32 v199.l, v5
+// GFX12: :[[@LINE-1]]:21: error: invalid operand for instruction
+
+v_sat_pk_u8_i16_e32 v199.l, v5 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: :[[@LINE-1]]:21: error: invalid operand for instruction
+
+v_sat_pk_u8_i16_e32 v199.l, v5 quad_perm:[3,2,1,0]
+// GFX12: :[[@LINE-1]]:21: error: invalid operand for instruction
+
 v_sin_f16_e32 v128, 0xfe0b
 // GFX12: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode
 
+v_sin_f16_e32 v128.h, 0xfe0b
+// GFX12: :[[@LINE-1]]:15: error: invalid operand for instruction
+
+v_sin_f16_e32 v128.l, 0xfe0b
+// GFX12: :[[@LINE-1]]:15: error: invalid operand for instruction
+
 v_sin_f16_e32 v255, v1
 // GFX12: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode
 
@@ -637,6 +908,24 @@ v_sin_f16_e32 v255, v1 dpp8:[7,6,5,4,3,2,1,0]
 v_sin_f16_e32 v255, v1 quad_perm:[3,2,1,0]
 // GFX12: :[[@LINE-1]]:24: error: invalid operand for instruction
 
+v_sin_f16_e32 v255.h, v1.h
+// GFX12: :[[@LINE-1]]:15: error: invalid operand for instruction
+
+v_sin_f16_e32 v255.h, v1.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: :[[@LINE-1]]:15: error: invalid operand for instruction
+
+v_sin_f16_e32 v255.h, v1.h quad_perm:[3,2,1,0]
+// GFX12: :[[@LINE-1]]:15: error: invalid operand for instruction
+
+v_sin_f16_e32 v255.l, v1.l
+// GFX12: :[[@LINE-1]]:15: error: invalid operand for instruction
+
+v_sin_f16_e32 v255.l, v1.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: :[[@LINE-1]]:15: error: invalid operand for instruction
+
+v_sin_f16_e32 v255.l, v1.l quad_perm:[3,2,1,0]
+// GFX12: :[[@LINE-1]]:15: error: invalid operand for instruction
+
 v_sin_f16_e32 v5, v199
 // GFX12: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode
 
@@ -646,6 +935,24 @@ v_sin_f16_e32 v5, v199 dpp8:[7,6,5,4,3,2,1,0]
 v_sin_f16_e32 v5, v199 quad_perm:[3,2,1,0]
 // GFX12: :[[@LINE-1]]:24: error: invalid operand for instruction
 
+v_sin_f16_e32 v5.h, v199.h
+// GFX12: :[[@LINE-1]]:21: error: invalid operand for instruction
+
+v_sin_f16_e32 v5.h, v199.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: :[[@LINE-1]]:21: error: invalid operand for instruction
+
+v_sin_f16_e32 v5.h, v199.h quad_perm:[3,2,1,0]
+// GFX12: :[[@LINE-1]]:21: error: invalid operand for instruction
+
+v_sin_f16_e32 v5.l, v199.l
+// GFX12: :[[@LINE-1]]:21: error: invalid operand for instruction
+
+v_sin_f16_e32 v5.l, v199.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: :[[@LINE-1]]:21: error: invalid operand for instruction
+
+v_sin_f16_e32 v5.l, v199.l quad_perm:[3,2,1,0]
+// GFX12: :[[@LINE-1]]:21: error: invalid operand for instruction
+
 v_sqrt_f16_e32 v128, 0xfe0b
 // GFX12: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode
 
@@ -685,6 +992,12 @@ v_swap_b16_e32 v128.l, v0.l
 v_trunc_f16_e32 v128, 0xfe0b
 // GFX12: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode
 
+v_trunc_f16_e32 v128.h, 0xfe0b
+// GFX12: :[[@LINE-1]]:17: error: invalid operand for instruction
+
+v_trunc_f16_e32 v128.l, 0xfe0b
+// GFX12: :[[@LINE-1]]:17: error: invalid operand for instruction
+
 v_trunc_f16_e32 v255, v1
 // GFX12: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode
 
@@ -694,6 +1007,24 @@ v_trunc_f16_e32 v255, v1 dpp8:[7,6,5,4,3,2,1,0]
 v_trunc_f16_e32 v255, v1 quad_perm:[3,2,1,0]
 // GFX12: :[[@LINE-1]]:26: error: invalid operand for instruction
 
+v_trunc_f16_e32 v255.h, v1.h
+// GFX12: :[[@LINE-1]]:17: error: invalid operand for instruction
+
+v_trunc_f16_e32 v255.h, v1.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: :[[@LINE-1]]:17: error: invalid operand for instruction
+
+v_trunc_f16_e32 v255.h, v1.h quad_perm:[3,2,1,0]
+// GFX12: :[[@LINE-1]]:17: error: invalid operand for instruction
+
+v_trunc_f16_e32 v255.l, v1.l
+// GFX12: :[[@LINE-1]]:17: error: invalid operand for instruction
+
+v_trunc_f16_e32 v255.l, v1.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: :[[@LINE-1]]:17: error: invalid operand for instruction
+
+v_trunc_f16_e32 v255.l, v1.l quad_perm:[3,2,1,0]
+// GFX12: :[[@LINE-1]]:17: error: invalid operand for instruction
+
 v_trunc_f16_e32 v5, v199
 // GFX12: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode
 
@@ -702,3 +1033,21 @@ v_trunc_f16_e32 v5, v199 dpp8:[7,6,5,4,3,2,1,0]
 
 v_trunc_f16_e32 v5, v199 quad_perm:[3,2,1,0]
 // GFX12: :[[@LINE-1]]:26: error: invalid operand for instruction
+
+v_trunc_f16_e32 v5.h, v199.h
+// GFX12: :[[@LINE-1]]:23: error: invalid operand for instruction
+
+v_trunc_f16_e32 v5.h, v199.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: :[[@LINE-1]]:23: error: invalid operand for instruction
+
+v_trunc_f16_e32 v5.h, v199.h quad_perm:[3,2,1,0]
+// GFX12: :[[@LINE-1]]:23: error: invalid operand for instruction
+
+v_trunc_f16_e32 v5.l, v199.l
+// GFX12: :[[@LINE-1]]:23: error: invalid operand for instruction
+
+v_trunc_f16_e32 v5.l, v199.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: :[[@LINE-1]]:23: error: invalid operand for instruction
+
+v_trunc_f16_e32 v5.l, v199.l quad_perm:[3,2,1,0]
+// GFX12: :[[@LINE-1]]:23: error: invalid operand for instruction
diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop1_t16_promote.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop1_t16_promote.s
index f220ec2..cc5870f 100644
--- a/llvm/test/MC/AMDGPU/gfx12_asm_vop1_t16_promote.s
+++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop1_t16_promote.s
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_mc_test_checks.py UTC_ARGS: --version 5
+// NOTE: Assertions have been autogenerated by utils/update_mc_test_checks.py UTC_ARGS: --version 5
 // RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -show-encoding %s | FileCheck --check-prefix=GFX12 --implicit-check-not=_e32 %s
 
 v_ceil_f16 v128, 0xfe0b
@@ -67,71 +67,137 @@ v_ceil_f16 v5, v199 dpp8:[7,6,5,4,3,2,1,0]
 v_ceil_f16 v5, v199 quad_perm:[3,2,1,0]
 // GFX12: v_ceil_f16_e64_dpp v5, v199 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff]
 
-v_cos_f16 v128, 0xfe0b
-// GFX12: v_cos_f16_e64 v128, 0xfe0b              ; encoding: [0x80,0x00,0xe1,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00]
+v_cos_f16 v128.h, 0xfe0b
+// GFX12: v_cos_f16_e64 v128.h, 0xfe0b op_sel:[0,1] ; encoding: [0x80,0x40,0xe1,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00]
 
-v_cos_f16 v255, -1
-// GFX12: v_cos_f16_e64 v255, -1                  ; encoding: [0xff,0x00,0xe1,0xd5,0xc1,0x00,0x00,0x00]
+v_cos_f16 v128.l, 0xfe0b
+// GFX12: v_cos_f16_e64 v128.l, 0xfe0b            ; encoding: [0x80,0x00,0xe1,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00]
 
-v_cos_f16 v255, 0.5
-// GFX12: v_cos_f16_e64 v255, 0.5                 ; encoding: [0xff,0x00,0xe1,0xd5,0xf0,0x00,0x00,0x00]
+v_cos_f16 v255.h, -1
+// GFX12: v_cos_f16_e64 v255.h, -1 op_sel:[0,1]   ; encoding: [0xff,0x40,0xe1,0xd5,0xc1,0x00,0x00,0x00]
 
-v_cos_f16 v255, exec_hi
-// GFX12: v_cos_f16_e64 v255, exec_hi             ; encoding: [0xff,0x00,0xe1,0xd5,0x7f,0x00,0x00,0x00]
+v_cos_f16 v255.h, 0.5
+// GFX12: v_cos_f16_e64 v255.h, 0.5 op_sel:[0,1]  ; encoding: [0xff,0x40,0xe1,0xd5,0xf0,0x00,0x00,0x00]
 
-v_cos_f16 v255, exec_lo
-// GFX12: v_cos_f16_e64 v255, exec_lo             ; encoding: [0xff,0x00,0xe1,0xd5,0x7e,0x00,0x00,0x00]
+v_cos_f16 v255.h, exec_hi
+// GFX12: v_cos_f16_e64 v255.h, exec_hi op_sel:[0,1] ; encoding: [0xff,0x40,0xe1,0xd5,0x7f,0x00,0x00,0x00]
 
-v_cos_f16 v255, m0
-// GFX12: v_cos_f16_e64 v255, m0                  ; encoding: [0xff,0x00,0xe1,0xd5,0x7d,0x00,0x00,0x00]
+v_cos_f16 v255.h, exec_lo
+// GFX12: v_cos_f16_e64 v255.h, exec_lo op_sel:[0,1] ; encoding: [0xff,0x40,0xe1,0xd5,0x7e,0x00,0x00,0x00]
 
-v_cos_f16 v255, null
-// GFX12: v_cos_f16_e64 v255, null                ; encoding: [0xff,0x00,0xe1,0xd5,0x7c,0x00,0x00,0x00]
+v_cos_f16 v255.h, m0
+// GFX12: v_cos_f16_e64 v255.h, m0 op_sel:[0,1]   ; encoding: [0xff,0x40,0xe1,0xd5,0x7d,0x00,0x00,0x00]
 
-v_cos_f16 v255, s1
-// GFX12: v_cos_f16_e64 v255, s1                  ; encoding: [0xff,0x00,0xe1,0xd5,0x01,0x00,0x00,0x00]
+v_cos_f16 v255.h, null
+// GFX12: v_cos_f16_e64 v255.h, null op_sel:[0,1] ; encoding: [0xff,0x40,0xe1,0xd5,0x7c,0x00,0x00,0x00]
 
-v_cos_f16 v255, s105
-// GFX12: v_cos_f16_e64 v255, s105                ; encoding: [0xff,0x00,0xe1,0xd5,0x69,0x00,0x00,0x00]
+v_cos_f16 v255.h, s1
+// GFX12: v_cos_f16_e64 v255.h, s1 op_sel:[0,1]   ; encoding: [0xff,0x40,0xe1,0xd5,0x01,0x00,0x00,0x00]
 
-v_cos_f16 v255, src_scc
-// GFX12: v_cos_f16_e64 v255, src_scc             ; encoding: [0xff,0x00,0xe1,0xd5,0xfd,0x00,0x00,0x00]
+v_cos_f16 v255.h, s105
+// GFX12: v_cos_f16_e64 v255.h, s105 op_sel:[0,1] ; encoding: [0xff,0x40,0xe1,0xd5,0x69,0x00,0x00,0x00]
 
-v_cos_f16 v255, ttmp15
-// GFX12: v_cos_f16_e64 v255, ttmp15              ; encoding: [0xff,0x00,0xe1,0xd5,0x7b,0x00,0x00,0x00]
+v_cos_f16 v255.h, src_scc
+// GFX12: v_cos_f16_e64 v255.h, src_scc op_sel:[0,1] ; encoding: [0xff,0x40,0xe1,0xd5,0xfd,0x00,0x00,0x00]
 
-v_cos_f16 v255, v1
-// GFX12: v_cos_f16_e64 v255, v1                  ; encoding: [0xff,0x00,0xe1,0xd5,0x01,0x01,0x00,0x00]
+v_cos_f16 v255.h, ttmp15
+// GFX12: v_cos_f16_e64 v255.h, ttmp15 op_sel:[0,1] ; encoding: [0xff,0x40,0xe1,0xd5,0x7b,0x00,0x00,0x00]
 
-v_cos_f16 v255, v1 dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_cos_f16_e64_dpp v255, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x00,0xe1,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+v_cos_f16 v255.h, v1.h
+// GFX12: v_cos_f16_e64 v255.h, v1.h op_sel:[1,1] ; encoding: [0xff,0x48,0xe1,0xd5,0x01,0x01,0x00,0x00]
 
-v_cos_f16 v255, v1 quad_perm:[3,2,1,0]
-// GFX12: v_cos_f16_e64_dpp v255, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
+v_cos_f16 v255.h, v1.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_cos_f16_e64_dpp v255.h, v1.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x48,0xe1,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
 
-v_cos_f16 v255, v127
-// GFX12: v_cos_f16_e64 v255, v127                ; encoding: [0xff,0x00,0xe1,0xd5,0x7f,0x01,0x00,0x00]
+v_cos_f16 v255.h, v1.h quad_perm:[3,2,1,0]
+// GFX12: v_cos_f16_e64_dpp v255.h, v1.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x48,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
 
-v_cos_f16 v255, v127 dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_cos_f16_e64_dpp v255, v127 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x00,0xe1,0xd5,0xe9,0x00,0x00,0x00,0x7f,0x77,0x39,0x05]
+v_cos_f16 v255.h, v127.h
+// GFX12: v_cos_f16_e64 v255.h, v127.h op_sel:[1,1] ; encoding: [0xff,0x48,0xe1,0xd5,0x7f,0x01,0x00,0x00]
 
-v_cos_f16 v255, v127 quad_perm:[3,2,1,0]
-// GFX12: v_cos_f16_e64_dpp v255, v127 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x7f,0x1b,0x00,0xff]
+v_cos_f16 v255.h, v127.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_cos_f16_e64_dpp v255.h, v127.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x48,0xe1,0xd5,0xe9,0x00,0x00,0x00,0x7f,0x77,0x39,0x05]
 
-v_cos_f16 v255, vcc_hi
-// GFX12: v_cos_f16_e64 v255, vcc_hi              ; encoding: [0xff,0x00,0xe1,0xd5,0x6b,0x00,0x00,0x00]
+v_cos_f16 v255.h, v127.h quad_perm:[3,2,1,0]
+// GFX12: v_cos_f16_e64_dpp v255.h, v127.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x48,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x7f,0x1b,0x00,0xff]
 
-v_cos_f16 v255, vcc_lo
-// GFX12: v_cos_f16_e64 v255, vcc_lo              ; encoding: [0xff,0x00,0xe1,0xd5,0x6a,0x00,0x00,0x00]
+v_cos_f16 v255.h, vcc_hi
+// GFX12: v_cos_f16_e64 v255.h, vcc_hi op_sel:[0,1] ; encoding: [0xff,0x40,0xe1,0xd5,0x6b,0x00,0x00,0x00]
 
-v_cos_f16 v5, v199
-// GFX12: v_cos_f16_e64 v5, v199                  ; encoding: [0x05,0x00,0xe1,0xd5,0xc7,0x01,0x00,0x00]
+v_cos_f16 v255.h, vcc_lo
+// GFX12: v_cos_f16_e64 v255.h, vcc_lo op_sel:[0,1] ; encoding: [0xff,0x40,0xe1,0xd5,0x6a,0x00,0x00,0x00]
 
-v_cos_f16 v5, v199 dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_cos_f16_e64_dpp v5, v199 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe1,0xd5,0xe9,0x00,0x00,0x00,0xc7,0x77,0x39,0x05]
+v_cos_f16 v255.l, -1
+// GFX12: v_cos_f16_e64 v255.l, -1                ; encoding: [0xff,0x00,0xe1,0xd5,0xc1,0x00,0x00,0x00]
 
-v_cos_f16 v5, v199 quad_perm:[3,2,1,0]
-// GFX12: v_cos_f16_e64_dpp v5, v199 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff]
+v_cos_f16 v255.l, 0.5
+// GFX12: v_cos_f16_e64 v255.l, 0.5               ; encoding: [0xff,0x00,0xe1,0xd5,0xf0,0x00,0x00,0x00]
+
+v_cos_f16 v255.l, exec_hi
+// GFX12: v_cos_f16_e64 v255.l, exec_hi           ; encoding: [0xff,0x00,0xe1,0xd5,0x7f,0x00,0x00,0x00]
+
+v_cos_f16 v255.l, exec_lo
+// GFX12: v_cos_f16_e64 v255.l, exec_lo           ; encoding: [0xff,0x00,0xe1,0xd5,0x7e,0x00,0x00,0x00]
+
+v_cos_f16 v255.l, m0
+// GFX12: v_cos_f16_e64 v255.l, m0                ; encoding: [0xff,0x00,0xe1,0xd5,0x7d,0x00,0x00,0x00]
+
+v_cos_f16 v255.l, null
+// GFX12: v_cos_f16_e64 v255.l, null              ; encoding: [0xff,0x00,0xe1,0xd5,0x7c,0x00,0x00,0x00]
+
+v_cos_f16 v255.l, s1
+// GFX12: v_cos_f16_e64 v255.l, s1                ; encoding: [0xff,0x00,0xe1,0xd5,0x01,0x00,0x00,0x00]
+
+v_cos_f16 v255.l, s105
+// GFX12: v_cos_f16_e64 v255.l, s105              ; encoding: [0xff,0x00,0xe1,0xd5,0x69,0x00,0x00,0x00]
+
+v_cos_f16 v255.l, src_scc
+// GFX12: v_cos_f16_e64 v255.l, src_scc           ; encoding: [0xff,0x00,0xe1,0xd5,0xfd,0x00,0x00,0x00]
+
+v_cos_f16 v255.l, ttmp15
+// GFX12: v_cos_f16_e64 v255.l, ttmp15            ; encoding: [0xff,0x00,0xe1,0xd5,0x7b,0x00,0x00,0x00]
+
+v_cos_f16 v255.l, v1.l
+// GFX12: v_cos_f16_e64 v255.l, v1.l              ; encoding: [0xff,0x00,0xe1,0xd5,0x01,0x01,0x00,0x00]
+
+v_cos_f16 v255.l, v1.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_cos_f16_e64_dpp v255.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x00,0xe1,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+
+v_cos_f16 v255.l, v1.l quad_perm:[3,2,1,0]
+// GFX12: v_cos_f16_e64_dpp v255.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
+
+v_cos_f16 v255.l, v127.l
+// GFX12: v_cos_f16_e64 v255.l, v127.l            ; encoding: [0xff,0x00,0xe1,0xd5,0x7f,0x01,0x00,0x00]
+
+v_cos_f16 v255.l, v127.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_cos_f16_e64_dpp v255.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x00,0xe1,0xd5,0xe9,0x00,0x00,0x00,0x7f,0x77,0x39,0x05]
+
+v_cos_f16 v255.l, v127.l quad_perm:[3,2,1,0]
+// GFX12: v_cos_f16_e64_dpp v255.l, v127.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x7f,0x1b,0x00,0xff]
+
+v_cos_f16 v255.l, vcc_hi
+// GFX12: v_cos_f16_e64 v255.l, vcc_hi            ; encoding: [0xff,0x00,0xe1,0xd5,0x6b,0x00,0x00,0x00]
+
+v_cos_f16 v255.l, vcc_lo
+// GFX12: v_cos_f16_e64 v255.l, vcc_lo            ; encoding: [0xff,0x00,0xe1,0xd5,0x6a,0x00,0x00,0x00]
+
+v_cos_f16 v5.h, v199.h
+// GFX12: v_cos_f16_e64 v5.h, v199.h op_sel:[1,1] ; encoding: [0x05,0x48,0xe1,0xd5,0xc7,0x01,0x00,0x00]
+
+v_cos_f16 v5.h, v199.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_cos_f16_e64_dpp v5.h, v199.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xe1,0xd5,0xe9,0x00,0x00,0x00,0xc7,0x77,0x39,0x05]
+
+v_cos_f16 v5.h, v199.h quad_perm:[3,2,1,0]
+// GFX12: v_cos_f16_e64_dpp v5.h, v199.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x48,0xe1,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff]
+
+v_cos_f16 v5.l, v199.l
+// GFX12: v_cos_f16_e64 v5.l, v199.l              ; encoding: [0x05,0x00,0xe1,0xd5,0xc7,0x01,0x00,0x00]
+
+v_cos_f16 v5.l, v199.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_cos_f16_e64_dpp v5.l, v199.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe1,0xd5,0xe9,0x00,0x00,0x00,0xc7,0x77,0x39,0x05]
+
+v_cos_f16 v5.l, v199.l quad_perm:[3,2,1,0]
+// GFX12: v_cos_f16_e64_dpp v5.l, v199.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff]
 
 v_cvt_f16_f32 v128.h, 0xaf123456
 // GFX12: v_cvt_f16_f32_e64 v128.h, 0xaf123456 op_sel:[0,1] ; encoding: [0x80,0x40,0x8a,0xd5,0xff,0x00,0x00,0x00,0x56,0x34,0x12,0xaf]
@@ -622,14 +688,23 @@ v_cvt_i16_f16 v5.l, v199.l dpp8:[7,6,5,4,3,2,1,0]
 v_cvt_i16_f16 v5.l, v199.l quad_perm:[3,2,1,0]
 // GFX12: v_cvt_i16_f16_e64_dpp v5.l, v199.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd3,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff]
 
-v_cvt_i32_i16 v5, v199
-// GFX12: v_cvt_i32_i16_e64 v5, v199              ; encoding: [0x05,0x00,0xea,0xd5,0xc7,0x01,0x00,0x00]
+v_cvt_i32_i16 v5, v199.h
+// GFX12: v_cvt_i32_i16_e64 v5, v199.h op_sel:[1,0] ; encoding: [0x05,0x08,0xea,0xd5,0xc7,0x01,0x00,0x00]
+
+v_cvt_i32_i16 v5, v199.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_cvt_i32_i16_e64_dpp v5, v199.h op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0xea,0xd5,0xe9,0x00,0x00,0x00,0xc7,0x77,0x39,0x05]
+
+v_cvt_i32_i16 v5, v199.h quad_perm:[3,2,1,0]
+// GFX12: v_cvt_i32_i16_e64_dpp v5, v199.h op_sel:[1,0] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x08,0xea,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff]
 
-v_cvt_i32_i16 v5, v199 dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_cvt_i32_i16_e64_dpp v5, v199 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xea,0xd5,0xe9,0x00,0x00,0x00,0xc7,0x77,0x39,0x05]
+v_cvt_i32_i16 v5, v199.l
+// GFX12: v_cvt_i32_i16_e64 v5, v199.l            ; encoding: [0x05,0x00,0xea,0xd5,0xc7,0x01,0x00,0x00]
 
-v_cvt_i32_i16 v5, v199 quad_perm:[3,2,1,0]
-// GFX12: v_cvt_i32_i16_e64_dpp v5, v199 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff]
+v_cvt_i32_i16 v5, v199.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_cvt_i32_i16_e64_dpp v5, v199.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xea,0xd5,0xe9,0x00,0x00,0x00,0xc7,0x77,0x39,0x05]
+
+v_cvt_i32_i16 v5, v199.l quad_perm:[3,2,1,0]
+// GFX12: v_cvt_i32_i16_e64_dpp v5, v199.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff]
 
 v_cvt_norm_i16_f16 v128.h, 0xfe0b
 // GFX12: v_cvt_norm_i16_f16_e64 v128.h, 0xfe0b op_sel:[0,1] ; encoding: [0x80,0x40,0xe3,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00]
@@ -1027,14 +1102,23 @@ v_cvt_u16_f16 v5.l, v199.l dpp8:[7,6,5,4,3,2,1,0]
 v_cvt_u16_f16 v5.l, v199.l quad_perm:[3,2,1,0]
 // GFX12: v_cvt_u16_f16_e64_dpp v5.l, v199.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd2,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff]
 
-v_cvt_u32_u16 v5, v199
-// GFX12: v_cvt_u32_u16_e64 v5, v199              ; encoding: [0x05,0x00,0xeb,0xd5,0xc7,0x01,0x00,0x00]
+v_cvt_u32_u16 v5, v199.h
+// GFX12: v_cvt_u32_u16_e64 v5, v199.h op_sel:[1,0] ; encoding: [0x05,0x08,0xeb,0xd5,0xc7,0x01,0x00,0x00]
+
+v_cvt_u32_u16 v5, v199.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_cvt_u32_u16_e64_dpp v5, v199.h op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0xeb,0xd5,0xe9,0x00,0x00,0x00,0xc7,0x77,0x39,0x05]
 
-v_cvt_u32_u16 v5, v199 dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_cvt_u32_u16_e64_dpp v5, v199 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xeb,0xd5,0xe9,0x00,0x00,0x00,0xc7,0x77,0x39,0x05]
+v_cvt_u32_u16 v5, v199.h quad_perm:[3,2,1,0]
+// GFX12: v_cvt_u32_u16_e64_dpp v5, v199.h op_sel:[1,0] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x08,0xeb,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff]
 
-v_cvt_u32_u16 v5, v199 quad_perm:[3,2,1,0]
-// GFX12: v_cvt_u32_u16_e64_dpp v5, v199 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff]
+v_cvt_u32_u16 v5, v199.l
+// GFX12: v_cvt_u32_u16_e64 v5, v199.l            ; encoding: [0x05,0x00,0xeb,0xd5,0xc7,0x01,0x00,0x00]
+
+v_cvt_u32_u16 v5, v199.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_cvt_u32_u16_e64_dpp v5, v199.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xeb,0xd5,0xe9,0x00,0x00,0x00,0xc7,0x77,0x39,0x05]
+
+v_cvt_u32_u16 v5, v199.l quad_perm:[3,2,1,0]
+// GFX12: v_cvt_u32_u16_e64_dpp v5, v199.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff]
 
 v_exp_f16 v128, 0xfe0b
 // GFX12: v_exp_f16_e64 v128, 0xfe0b              ; encoding: [0x80,0x00,0xd8,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00]
@@ -1168,71 +1252,137 @@ v_floor_f16 v5, v199 dpp8:[7,6,5,4,3,2,1,0]
 v_floor_f16 v5, v199 quad_perm:[3,2,1,0]
 // GFX12: v_floor_f16_e64_dpp v5, v199 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdb,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff]
 
-v_fract_f16 v128, 0xfe0b
-// GFX12: v_fract_f16_e64 v128, 0xfe0b            ; encoding: [0x80,0x00,0xdf,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00]
+v_fract_f16 v128.h, 0xfe0b
+// GFX12: v_fract_f16_e64 v128.h, 0xfe0b op_sel:[0,1] ; encoding: [0x80,0x40,0xdf,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00]
+
+v_fract_f16 v128.l, 0xfe0b
+// GFX12: v_fract_f16_e64 v128.l, 0xfe0b          ; encoding: [0x80,0x00,0xdf,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00]
+
+v_fract_f16 v255.h, -1
+// GFX12: v_fract_f16_e64 v255.h, -1 op_sel:[0,1] ; encoding: [0xff,0x40,0xdf,0xd5,0xc1,0x00,0x00,0x00]
+
+v_fract_f16 v255.h, 0.5
+// GFX12: v_fract_f16_e64 v255.h, 0.5 op_sel:[0,1] ; encoding: [0xff,0x40,0xdf,0xd5,0xf0,0x00,0x00,0x00]
+
+v_fract_f16 v255.h, exec_hi
+// GFX12: v_fract_f16_e64 v255.h, exec_hi op_sel:[0,1] ; encoding: [0xff,0x40,0xdf,0xd5,0x7f,0x00,0x00,0x00]
+
+v_fract_f16 v255.h, exec_lo
+// GFX12: v_fract_f16_e64 v255.h, exec_lo op_sel:[0,1] ; encoding: [0xff,0x40,0xdf,0xd5,0x7e,0x00,0x00,0x00]
+
+v_fract_f16 v255.h, m0
+// GFX12: v_fract_f16_e64 v255.h, m0 op_sel:[0,1] ; encoding: [0xff,0x40,0xdf,0xd5,0x7d,0x00,0x00,0x00]
+
+v_fract_f16 v255.h, null
+// GFX12: v_fract_f16_e64 v255.h, null op_sel:[0,1] ; encoding: [0xff,0x40,0xdf,0xd5,0x7c,0x00,0x00,0x00]
+
+v_fract_f16 v255.h, s1
+// GFX12: v_fract_f16_e64 v255.h, s1 op_sel:[0,1] ; encoding: [0xff,0x40,0xdf,0xd5,0x01,0x00,0x00,0x00]
+
+v_fract_f16 v255.h, s105
+// GFX12: v_fract_f16_e64 v255.h, s105 op_sel:[0,1] ; encoding: [0xff,0x40,0xdf,0xd5,0x69,0x00,0x00,0x00]
+
+v_fract_f16 v255.h, src_scc
+// GFX12: v_fract_f16_e64 v255.h, src_scc op_sel:[0,1] ; encoding: [0xff,0x40,0xdf,0xd5,0xfd,0x00,0x00,0x00]
+
+v_fract_f16 v255.h, ttmp15
+// GFX12: v_fract_f16_e64 v255.h, ttmp15 op_sel:[0,1] ; encoding: [0xff,0x40,0xdf,0xd5,0x7b,0x00,0x00,0x00]
+
+v_fract_f16 v255.h, v1.h
+// GFX12: v_fract_f16_e64 v255.h, v1.h op_sel:[1,1] ; encoding: [0xff,0x48,0xdf,0xd5,0x01,0x01,0x00,0x00]
+
+v_fract_f16 v255.h, v1.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_fract_f16_e64_dpp v255.h, v1.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x48,0xdf,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
 
-v_fract_f16 v255, -1
-// GFX12: v_fract_f16_e64 v255, -1                ; encoding: [0xff,0x00,0xdf,0xd5,0xc1,0x00,0x00,0x00]
+v_fract_f16 v255.h, v1.h quad_perm:[3,2,1,0]
+// GFX12: v_fract_f16_e64_dpp v255.h, v1.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x48,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
 
-v_fract_f16 v255, 0.5
-// GFX12: v_fract_f16_e64 v255, 0.5               ; encoding: [0xff,0x00,0xdf,0xd5,0xf0,0x00,0x00,0x00]
+v_fract_f16 v255.h, v127.h
+// GFX12: v_fract_f16_e64 v255.h, v127.h op_sel:[1,1] ; encoding: [0xff,0x48,0xdf,0xd5,0x7f,0x01,0x00,0x00]
 
-v_fract_f16 v255, exec_hi
-// GFX12: v_fract_f16_e64 v255, exec_hi           ; encoding: [0xff,0x00,0xdf,0xd5,0x7f,0x00,0x00,0x00]
+v_fract_f16 v255.h, v127.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_fract_f16_e64_dpp v255.h, v127.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x48,0xdf,0xd5,0xe9,0x00,0x00,0x00,0x7f,0x77,0x39,0x05]
 
-v_fract_f16 v255, exec_lo
-// GFX12: v_fract_f16_e64 v255, exec_lo           ; encoding: [0xff,0x00,0xdf,0xd5,0x7e,0x00,0x00,0x00]
+v_fract_f16 v255.h, v127.h quad_perm:[3,2,1,0]
+// GFX12: v_fract_f16_e64_dpp v255.h, v127.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x48,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x7f,0x1b,0x00,0xff]
 
-v_fract_f16 v255, m0
-// GFX12: v_fract_f16_e64 v255, m0                ; encoding: [0xff,0x00,0xdf,0xd5,0x7d,0x00,0x00,0x00]
+v_fract_f16 v255.h, vcc_hi
+// GFX12: v_fract_f16_e64 v255.h, vcc_hi op_sel:[0,1] ; encoding: [0xff,0x40,0xdf,0xd5,0x6b,0x00,0x00,0x00]
 
-v_fract_f16 v255, null
-// GFX12: v_fract_f16_e64 v255, null              ; encoding: [0xff,0x00,0xdf,0xd5,0x7c,0x00,0x00,0x00]
+v_fract_f16 v255.h, vcc_lo
+// GFX12: v_fract_f16_e64 v255.h, vcc_lo op_sel:[0,1] ; encoding: [0xff,0x40,0xdf,0xd5,0x6a,0x00,0x00,0x00]
 
-v_fract_f16 v255, s1
-// GFX12: v_fract_f16_e64 v255, s1                ; encoding: [0xff,0x00,0xdf,0xd5,0x01,0x00,0x00,0x00]
+v_fract_f16 v255.l, -1
+// GFX12: v_fract_f16_e64 v255.l, -1              ; encoding: [0xff,0x00,0xdf,0xd5,0xc1,0x00,0x00,0x00]
 
-v_fract_f16 v255, s105
-// GFX12: v_fract_f16_e64 v255, s105              ; encoding: [0xff,0x00,0xdf,0xd5,0x69,0x00,0x00,0x00]
+v_fract_f16 v255.l, 0.5
+// GFX12: v_fract_f16_e64 v255.l, 0.5             ; encoding: [0xff,0x00,0xdf,0xd5,0xf0,0x00,0x00,0x00]
 
-v_fract_f16 v255, src_scc
-// GFX12: v_fract_f16_e64 v255, src_scc           ; encoding: [0xff,0x00,0xdf,0xd5,0xfd,0x00,0x00,0x00]
+v_fract_f16 v255.l, exec_hi
+// GFX12: v_fract_f16_e64 v255.l, exec_hi         ; encoding: [0xff,0x00,0xdf,0xd5,0x7f,0x00,0x00,0x00]
 
-v_fract_f16 v255, ttmp15
-// GFX12: v_fract_f16_e64 v255, ttmp15            ; encoding: [0xff,0x00,0xdf,0xd5,0x7b,0x00,0x00,0x00]
+v_fract_f16 v255.l, exec_lo
+// GFX12: v_fract_f16_e64 v255.l, exec_lo         ; encoding: [0xff,0x00,0xdf,0xd5,0x7e,0x00,0x00,0x00]
 
-v_fract_f16 v255, v1
-// GFX12: v_fract_f16_e64 v255, v1                ; encoding: [0xff,0x00,0xdf,0xd5,0x01,0x01,0x00,0x00]
+v_fract_f16 v255.l, m0
+// GFX12: v_fract_f16_e64 v255.l, m0              ; encoding: [0xff,0x00,0xdf,0xd5,0x7d,0x00,0x00,0x00]
 
-v_fract_f16 v255, v1 dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_fract_f16_e64_dpp v255, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x00,0xdf,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+v_fract_f16 v255.l, null
+// GFX12: v_fract_f16_e64 v255.l, null            ; encoding: [0xff,0x00,0xdf,0xd5,0x7c,0x00,0x00,0x00]
 
-v_fract_f16 v255, v1 quad_perm:[3,2,1,0]
-// GFX12: v_fract_f16_e64_dpp v255, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
+v_fract_f16 v255.l, s1
+// GFX12: v_fract_f16_e64 v255.l, s1              ; encoding: [0xff,0x00,0xdf,0xd5,0x01,0x00,0x00,0x00]
 
-v_fract_f16 v255, v127
-// GFX12: v_fract_f16_e64 v255, v127              ; encoding: [0xff,0x00,0xdf,0xd5,0x7f,0x01,0x00,0x00]
+v_fract_f16 v255.l, s105
+// GFX12: v_fract_f16_e64 v255.l, s105            ; encoding: [0xff,0x00,0xdf,0xd5,0x69,0x00,0x00,0x00]
 
-v_fract_f16 v255, v127 dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_fract_f16_e64_dpp v255, v127 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x00,0xdf,0xd5,0xe9,0x00,0x00,0x00,0x7f,0x77,0x39,0x05]
+v_fract_f16 v255.l, src_scc
+// GFX12: v_fract_f16_e64 v255.l, src_scc         ; encoding: [0xff,0x00,0xdf,0xd5,0xfd,0x00,0x00,0x00]
 
-v_fract_f16 v255, v127 quad_perm:[3,2,1,0]
-// GFX12: v_fract_f16_e64_dpp v255, v127 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x7f,0x1b,0x00,0xff]
+v_fract_f16 v255.l, ttmp15
+// GFX12: v_fract_f16_e64 v255.l, ttmp15          ; encoding: [0xff,0x00,0xdf,0xd5,0x7b,0x00,0x00,0x00]
 
-v_fract_f16 v255, vcc_hi
-// GFX12: v_fract_f16_e64 v255, vcc_hi            ; encoding: [0xff,0x00,0xdf,0xd5,0x6b,0x00,0x00,0x00]
+v_fract_f16 v255.l, v1.l
+// GFX12: v_fract_f16_e64 v255.l, v1.l            ; encoding: [0xff,0x00,0xdf,0xd5,0x01,0x01,0x00,0x00]
 
-v_fract_f16 v255, vcc_lo
-// GFX12: v_fract_f16_e64 v255, vcc_lo            ; encoding: [0xff,0x00,0xdf,0xd5,0x6a,0x00,0x00,0x00]
+v_fract_f16 v255.l, v1.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_fract_f16_e64_dpp v255.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x00,0xdf,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
 
-v_fract_f16 v5, v199
-// GFX12: v_fract_f16_e64 v5, v199                ; encoding: [0x05,0x00,0xdf,0xd5,0xc7,0x01,0x00,0x00]
+v_fract_f16 v255.l, v1.l quad_perm:[3,2,1,0]
+// GFX12: v_fract_f16_e64_dpp v255.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
 
-v_fract_f16 v5, v199 dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_fract_f16_e64_dpp v5, v199 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdf,0xd5,0xe9,0x00,0x00,0x00,0xc7,0x77,0x39,0x05]
+v_fract_f16 v255.l, v127.l
+// GFX12: v_fract_f16_e64 v255.l, v127.l          ; encoding: [0xff,0x00,0xdf,0xd5,0x7f,0x01,0x00,0x00]
 
-v_fract_f16 v5, v199 quad_perm:[3,2,1,0]
-// GFX12: v_fract_f16_e64_dpp v5, v199 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff]
+v_fract_f16 v255.l, v127.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_fract_f16_e64_dpp v255.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x00,0xdf,0xd5,0xe9,0x00,0x00,0x00,0x7f,0x77,0x39,0x05]
+
+v_fract_f16 v255.l, v127.l quad_perm:[3,2,1,0]
+// GFX12: v_fract_f16_e64_dpp v255.l, v127.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x7f,0x1b,0x00,0xff]
+
+v_fract_f16 v255.l, vcc_hi
+// GFX12: v_fract_f16_e64 v255.l, vcc_hi          ; encoding: [0xff,0x00,0xdf,0xd5,0x6b,0x00,0x00,0x00]
+
+v_fract_f16 v255.l, vcc_lo
+// GFX12: v_fract_f16_e64 v255.l, vcc_lo          ; encoding: [0xff,0x00,0xdf,0xd5,0x6a,0x00,0x00,0x00]
+
+v_fract_f16 v5.h, v199.h
+// GFX12: v_fract_f16_e64 v5.h, v199.h op_sel:[1,1] ; encoding: [0x05,0x48,0xdf,0xd5,0xc7,0x01,0x00,0x00]
+
+v_fract_f16 v5.h, v199.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_fract_f16_e64_dpp v5.h, v199.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xdf,0xd5,0xe9,0x00,0x00,0x00,0xc7,0x77,0x39,0x05]
+
+v_fract_f16 v5.h, v199.h quad_perm:[3,2,1,0]
+// GFX12: v_fract_f16_e64_dpp v5.h, v199.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x48,0xdf,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff]
+
+v_fract_f16 v5.l, v199.l
+// GFX12: v_fract_f16_e64 v5.l, v199.l            ; encoding: [0x05,0x00,0xdf,0xd5,0xc7,0x01,0x00,0x00]
+
+v_fract_f16 v5.l, v199.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_fract_f16_e64_dpp v5.l, v199.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdf,0xd5,0xe9,0x00,0x00,0x00,0xc7,0x77,0x39,0x05]
+
+v_fract_f16 v5.l, v199.l quad_perm:[3,2,1,0]
+// GFX12: v_fract_f16_e64_dpp v5.l, v199.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff]
 
 v_frexp_exp_i16_f16 v128.h, 0xfe0b
 // GFX12: v_frexp_exp_i16_f16_e64 v128.h, 0xfe0b op_sel:[0,1] ; encoding: [0x80,0x40,0xda,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00]
@@ -1366,71 +1516,137 @@ v_frexp_exp_i16_f16 v5.l, v199.l dpp8:[7,6,5,4,3,2,1,0]
 v_frexp_exp_i16_f16 v5.l, v199.l quad_perm:[3,2,1,0]
 // GFX12: v_frexp_exp_i16_f16_e64_dpp v5.l, v199.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xda,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff]
 
-v_frexp_mant_f16 v128, 0xfe0b
-// GFX12: v_frexp_mant_f16_e64 v128, 0xfe0b       ; encoding: [0x80,0x00,0xd9,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00]
+v_frexp_mant_f16 v128.h, 0xfe0b
+// GFX12: v_frexp_mant_f16_e64 v128.h, 0xfe0b op_sel:[0,1] ; encoding: [0x80,0x40,0xd9,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00]
+
+v_frexp_mant_f16 v128.l, 0xfe0b
+// GFX12: v_frexp_mant_f16_e64 v128.l, 0xfe0b     ; encoding: [0x80,0x00,0xd9,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00]
+
+v_frexp_mant_f16 v255.h, -1
+// GFX12: v_frexp_mant_f16_e64 v255.h, -1 op_sel:[0,1] ; encoding: [0xff,0x40,0xd9,0xd5,0xc1,0x00,0x00,0x00]
+
+v_frexp_mant_f16 v255.h, 0.5
+// GFX12: v_frexp_mant_f16_e64 v255.h, 0.5 op_sel:[0,1] ; encoding: [0xff,0x40,0xd9,0xd5,0xf0,0x00,0x00,0x00]
+
+v_frexp_mant_f16 v255.h, exec_hi
+// GFX12: v_frexp_mant_f16_e64 v255.h, exec_hi op_sel:[0,1] ; encoding: [0xff,0x40,0xd9,0xd5,0x7f,0x00,0x00,0x00]
+
+v_frexp_mant_f16 v255.h, exec_lo
+// GFX12: v_frexp_mant_f16_e64 v255.h, exec_lo op_sel:[0,1] ; encoding: [0xff,0x40,0xd9,0xd5,0x7e,0x00,0x00,0x00]
+
+v_frexp_mant_f16 v255.h, m0
+// GFX12: v_frexp_mant_f16_e64 v255.h, m0 op_sel:[0,1] ; encoding: [0xff,0x40,0xd9,0xd5,0x7d,0x00,0x00,0x00]
+
+v_frexp_mant_f16 v255.h, null
+// GFX12: v_frexp_mant_f16_e64 v255.h, null op_sel:[0,1] ; encoding: [0xff,0x40,0xd9,0xd5,0x7c,0x00,0x00,0x00]
+
+v_frexp_mant_f16 v255.h, s1
+// GFX12: v_frexp_mant_f16_e64 v255.h, s1 op_sel:[0,1] ; encoding: [0xff,0x40,0xd9,0xd5,0x01,0x00,0x00,0x00]
+
+v_frexp_mant_f16 v255.h, s105
+// GFX12: v_frexp_mant_f16_e64 v255.h, s105 op_sel:[0,1] ; encoding: [0xff,0x40,0xd9,0xd5,0x69,0x00,0x00,0x00]
+
+v_frexp_mant_f16 v255.h, src_scc
+// GFX12: v_frexp_mant_f16_e64 v255.h, src_scc op_sel:[0,1] ; encoding: [0xff,0x40,0xd9,0xd5,0xfd,0x00,0x00,0x00]
+
+v_frexp_mant_f16 v255.h, ttmp15
+// GFX12: v_frexp_mant_f16_e64 v255.h, ttmp15 op_sel:[0,1] ; encoding: [0xff,0x40,0xd9,0xd5,0x7b,0x00,0x00,0x00]
+
+v_frexp_mant_f16 v255.h, v1.h
+// GFX12: v_frexp_mant_f16_e64 v255.h, v1.h op_sel:[1,1] ; encoding: [0xff,0x48,0xd9,0xd5,0x01,0x01,0x00,0x00]
 
-v_frexp_mant_f16 v255, -1
-// GFX12: v_frexp_mant_f16_e64 v255, -1           ; encoding: [0xff,0x00,0xd9,0xd5,0xc1,0x00,0x00,0x00]
+v_frexp_mant_f16 v255.h, v1.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_frexp_mant_f16_e64_dpp v255.h, v1.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x48,0xd9,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
 
-v_frexp_mant_f16 v255, 0.5
-// GFX12: v_frexp_mant_f16_e64 v255, 0.5          ; encoding: [0xff,0x00,0xd9,0xd5,0xf0,0x00,0x00,0x00]
+v_frexp_mant_f16 v255.h, v1.h quad_perm:[3,2,1,0]
+// GFX12: v_frexp_mant_f16_e64_dpp v255.h, v1.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x48,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
 
-v_frexp_mant_f16 v255, exec_hi
-// GFX12: v_frexp_mant_f16_e64 v255, exec_hi      ; encoding: [0xff,0x00,0xd9,0xd5,0x7f,0x00,0x00,0x00]
+v_frexp_mant_f16 v255.h, v127.h
+// GFX12: v_frexp_mant_f16_e64 v255.h, v127.h op_sel:[1,1] ; encoding: [0xff,0x48,0xd9,0xd5,0x7f,0x01,0x00,0x00]
 
-v_frexp_mant_f16 v255, exec_lo
-// GFX12: v_frexp_mant_f16_e64 v255, exec_lo      ; encoding: [0xff,0x00,0xd9,0xd5,0x7e,0x00,0x00,0x00]
+v_frexp_mant_f16 v255.h, v127.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_frexp_mant_f16_e64_dpp v255.h, v127.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x48,0xd9,0xd5,0xe9,0x00,0x00,0x00,0x7f,0x77,0x39,0x05]
 
-v_frexp_mant_f16 v255, m0
-// GFX12: v_frexp_mant_f16_e64 v255, m0           ; encoding: [0xff,0x00,0xd9,0xd5,0x7d,0x00,0x00,0x00]
+v_frexp_mant_f16 v255.h, v127.h quad_perm:[3,2,1,0]
+// GFX12: v_frexp_mant_f16_e64_dpp v255.h, v127.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x48,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x7f,0x1b,0x00,0xff]
 
-v_frexp_mant_f16 v255, null
-// GFX12: v_frexp_mant_f16_e64 v255, null         ; encoding: [0xff,0x00,0xd9,0xd5,0x7c,0x00,0x00,0x00]
+v_frexp_mant_f16 v255.h, vcc_hi
+// GFX12: v_frexp_mant_f16_e64 v255.h, vcc_hi op_sel:[0,1] ; encoding: [0xff,0x40,0xd9,0xd5,0x6b,0x00,0x00,0x00]
 
-v_frexp_mant_f16 v255, s1
-// GFX12: v_frexp_mant_f16_e64 v255, s1           ; encoding: [0xff,0x00,0xd9,0xd5,0x01,0x00,0x00,0x00]
+v_frexp_mant_f16 v255.h, vcc_lo
+// GFX12: v_frexp_mant_f16_e64 v255.h, vcc_lo op_sel:[0,1] ; encoding: [0xff,0x40,0xd9,0xd5,0x6a,0x00,0x00,0x00]
 
-v_frexp_mant_f16 v255, s105
-// GFX12: v_frexp_mant_f16_e64 v255, s105         ; encoding: [0xff,0x00,0xd9,0xd5,0x69,0x00,0x00,0x00]
+v_frexp_mant_f16 v255.l, -1
+// GFX12: v_frexp_mant_f16_e64 v255.l, -1         ; encoding: [0xff,0x00,0xd9,0xd5,0xc1,0x00,0x00,0x00]
 
-v_frexp_mant_f16 v255, src_scc
-// GFX12: v_frexp_mant_f16_e64 v255, src_scc      ; encoding: [0xff,0x00,0xd9,0xd5,0xfd,0x00,0x00,0x00]
+v_frexp_mant_f16 v255.l, 0.5
+// GFX12: v_frexp_mant_f16_e64 v255.l, 0.5        ; encoding: [0xff,0x00,0xd9,0xd5,0xf0,0x00,0x00,0x00]
 
-v_frexp_mant_f16 v255, ttmp15
-// GFX12: v_frexp_mant_f16_e64 v255, ttmp15       ; encoding: [0xff,0x00,0xd9,0xd5,0x7b,0x00,0x00,0x00]
+v_frexp_mant_f16 v255.l, exec_hi
+// GFX12: v_frexp_mant_f16_e64 v255.l, exec_hi    ; encoding: [0xff,0x00,0xd9,0xd5,0x7f,0x00,0x00,0x00]
 
-v_frexp_mant_f16 v255, v1
-// GFX12: v_frexp_mant_f16_e64 v255, v1           ; encoding: [0xff,0x00,0xd9,0xd5,0x01,0x01,0x00,0x00]
+v_frexp_mant_f16 v255.l, exec_lo
+// GFX12: v_frexp_mant_f16_e64 v255.l, exec_lo    ; encoding: [0xff,0x00,0xd9,0xd5,0x7e,0x00,0x00,0x00]
 
-v_frexp_mant_f16 v255, v1 dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_frexp_mant_f16_e64_dpp v255, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x00,0xd9,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+v_frexp_mant_f16 v255.l, m0
+// GFX12: v_frexp_mant_f16_e64 v255.l, m0         ; encoding: [0xff,0x00,0xd9,0xd5,0x7d,0x00,0x00,0x00]
 
-v_frexp_mant_f16 v255, v1 quad_perm:[3,2,1,0]
-// GFX12: v_frexp_mant_f16_e64_dpp v255, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
+v_frexp_mant_f16 v255.l, null
+// GFX12: v_frexp_mant_f16_e64 v255.l, null       ; encoding: [0xff,0x00,0xd9,0xd5,0x7c,0x00,0x00,0x00]
 
-v_frexp_mant_f16 v255, v127
-// GFX12: v_frexp_mant_f16_e64 v255, v127         ; encoding: [0xff,0x00,0xd9,0xd5,0x7f,0x01,0x00,0x00]
+v_frexp_mant_f16 v255.l, s1
+// GFX12: v_frexp_mant_f16_e64 v255.l, s1         ; encoding: [0xff,0x00,0xd9,0xd5,0x01,0x00,0x00,0x00]
 
-v_frexp_mant_f16 v255, v127 dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_frexp_mant_f16_e64_dpp v255, v127 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x00,0xd9,0xd5,0xe9,0x00,0x00,0x00,0x7f,0x77,0x39,0x05]
+v_frexp_mant_f16 v255.l, s105
+// GFX12: v_frexp_mant_f16_e64 v255.l, s105       ; encoding: [0xff,0x00,0xd9,0xd5,0x69,0x00,0x00,0x00]
 
-v_frexp_mant_f16 v255, v127 quad_perm:[3,2,1,0]
-// GFX12: v_frexp_mant_f16_e64_dpp v255, v127 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x7f,0x1b,0x00,0xff]
+v_frexp_mant_f16 v255.l, src_scc
+// GFX12: v_frexp_mant_f16_e64 v255.l, src_scc    ; encoding: [0xff,0x00,0xd9,0xd5,0xfd,0x00,0x00,0x00]
 
-v_frexp_mant_f16 v255, vcc_hi
-// GFX12: v_frexp_mant_f16_e64 v255, vcc_hi       ; encoding: [0xff,0x00,0xd9,0xd5,0x6b,0x00,0x00,0x00]
+v_frexp_mant_f16 v255.l, ttmp15
+// GFX12: v_frexp_mant_f16_e64 v255.l, ttmp15     ; encoding: [0xff,0x00,0xd9,0xd5,0x7b,0x00,0x00,0x00]
 
-v_frexp_mant_f16 v255, vcc_lo
-// GFX12: v_frexp_mant_f16_e64 v255, vcc_lo       ; encoding: [0xff,0x00,0xd9,0xd5,0x6a,0x00,0x00,0x00]
+v_frexp_mant_f16 v255.l, v1.l
+// GFX12: v_frexp_mant_f16_e64 v255.l, v1.l       ; encoding: [0xff,0x00,0xd9,0xd5,0x01,0x01,0x00,0x00]
 
-v_frexp_mant_f16 v5, v199
-// GFX12: v_frexp_mant_f16_e64 v5, v199           ; encoding: [0x05,0x00,0xd9,0xd5,0xc7,0x01,0x00,0x00]
+v_frexp_mant_f16 v255.l, v1.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_frexp_mant_f16_e64_dpp v255.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x00,0xd9,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
 
-v_frexp_mant_f16 v5, v199 dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_frexp_mant_f16_e64_dpp v5, v199 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xd9,0xd5,0xe9,0x00,0x00,0x00,0xc7,0x77,0x39,0x05]
+v_frexp_mant_f16 v255.l, v1.l quad_perm:[3,2,1,0]
+// GFX12: v_frexp_mant_f16_e64_dpp v255.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
 
-v_frexp_mant_f16 v5, v199 quad_perm:[3,2,1,0]
-// GFX12: v_frexp_mant_f16_e64_dpp v5, v199 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff]
+v_frexp_mant_f16 v255.l, v127.l
+// GFX12: v_frexp_mant_f16_e64 v255.l, v127.l     ; encoding: [0xff,0x00,0xd9,0xd5,0x7f,0x01,0x00,0x00]
+
+v_frexp_mant_f16 v255.l, v127.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_frexp_mant_f16_e64_dpp v255.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x00,0xd9,0xd5,0xe9,0x00,0x00,0x00,0x7f,0x77,0x39,0x05]
+
+v_frexp_mant_f16 v255.l, v127.l quad_perm:[3,2,1,0]
+// GFX12: v_frexp_mant_f16_e64_dpp v255.l, v127.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x7f,0x1b,0x00,0xff]
+
+v_frexp_mant_f16 v255.l, vcc_hi
+// GFX12: v_frexp_mant_f16_e64 v255.l, vcc_hi     ; encoding: [0xff,0x00,0xd9,0xd5,0x6b,0x00,0x00,0x00]
+
+v_frexp_mant_f16 v255.l, vcc_lo
+// GFX12: v_frexp_mant_f16_e64 v255.l, vcc_lo     ; encoding: [0xff,0x00,0xd9,0xd5,0x6a,0x00,0x00,0x00]
+
+v_frexp_mant_f16 v5.h, v199.h
+// GFX12: v_frexp_mant_f16_e64 v5.h, v199.h op_sel:[1,1] ; encoding: [0x05,0x48,0xd9,0xd5,0xc7,0x01,0x00,0x00]
+
+v_frexp_mant_f16 v5.h, v199.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_frexp_mant_f16_e64_dpp v5.h, v199.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xd9,0xd5,0xe9,0x00,0x00,0x00,0xc7,0x77,0x39,0x05]
+
+v_frexp_mant_f16 v5.h, v199.h quad_perm:[3,2,1,0]
+// GFX12: v_frexp_mant_f16_e64_dpp v5.h, v199.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x48,0xd9,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff]
+
+v_frexp_mant_f16 v5.l, v199.l
+// GFX12: v_frexp_mant_f16_e64 v5.l, v199.l       ; encoding: [0x05,0x00,0xd9,0xd5,0xc7,0x01,0x00,0x00]
+
+v_frexp_mant_f16 v5.l, v199.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_frexp_mant_f16_e64_dpp v5.l, v199.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xd9,0xd5,0xe9,0x00,0x00,0x00,0xc7,0x77,0x39,0x05]
+
+v_frexp_mant_f16 v5.l, v199.l quad_perm:[3,2,1,0]
+// GFX12: v_frexp_mant_f16_e64_dpp v5.l, v199.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff]
 
 v_log_f16 v128, 0xfe0b
 // GFX12: v_log_f16_e64 v128, 0xfe0b              ; encoding: [0x80,0x00,0xd7,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00]
@@ -1498,71 +1714,137 @@ v_log_f16 v5, v199 dpp8:[7,6,5,4,3,2,1,0]
 v_log_f16 v5, v199 quad_perm:[3,2,1,0]
 // GFX12: v_log_f16_e64_dpp v5, v199 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd7,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff]
 
-v_not_b16 v128, 0xfe0b
-// GFX12: v_not_b16_e64 v128, 0xfe0b              ; encoding: [0x80,0x00,0xe9,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00]
+v_not_b16 v128.h, 0xfe0b
+// GFX12: v_not_b16_e64 v128.h, 0xfe0b op_sel:[0,1] ; encoding: [0x80,0x40,0xe9,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00]
+
+v_not_b16 v128.l, 0xfe0b
+// GFX12: v_not_b16_e64 v128.l, 0xfe0b            ; encoding: [0x80,0x00,0xe9,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00]
+
+v_not_b16 v255.h, -1
+// GFX12: v_not_b16_e64 v255.h, -1 op_sel:[0,1]   ; encoding: [0xff,0x40,0xe9,0xd5,0xc1,0x00,0x00,0x00]
+
+v_not_b16 v255.h, 0.5
+// GFX12: v_not_b16_e64 v255.h, 0.5 op_sel:[0,1]  ; encoding: [0xff,0x40,0xe9,0xd5,0xf0,0x00,0x00,0x00]
+
+v_not_b16 v255.h, exec_hi
+// GFX12: v_not_b16_e64 v255.h, exec_hi op_sel:[0,1] ; encoding: [0xff,0x40,0xe9,0xd5,0x7f,0x00,0x00,0x00]
+
+v_not_b16 v255.h, exec_lo
+// GFX12: v_not_b16_e64 v255.h, exec_lo op_sel:[0,1] ; encoding: [0xff,0x40,0xe9,0xd5,0x7e,0x00,0x00,0x00]
+
+v_not_b16 v255.h, m0
+// GFX12: v_not_b16_e64 v255.h, m0 op_sel:[0,1]   ; encoding: [0xff,0x40,0xe9,0xd5,0x7d,0x00,0x00,0x00]
+
+v_not_b16 v255.h, null
+// GFX12: v_not_b16_e64 v255.h, null op_sel:[0,1] ; encoding: [0xff,0x40,0xe9,0xd5,0x7c,0x00,0x00,0x00]
+
+v_not_b16 v255.h, s1
+// GFX12: v_not_b16_e64 v255.h, s1 op_sel:[0,1]   ; encoding: [0xff,0x40,0xe9,0xd5,0x01,0x00,0x00,0x00]
+
+v_not_b16 v255.h, s105
+// GFX12: v_not_b16_e64 v255.h, s105 op_sel:[0,1] ; encoding: [0xff,0x40,0xe9,0xd5,0x69,0x00,0x00,0x00]
+
+v_not_b16 v255.h, src_scc
+// GFX12: v_not_b16_e64 v255.h, src_scc op_sel:[0,1] ; encoding: [0xff,0x40,0xe9,0xd5,0xfd,0x00,0x00,0x00]
 
-v_not_b16 v255, -1
-// GFX12: v_not_b16_e64 v255, -1                  ; encoding: [0xff,0x00,0xe9,0xd5,0xc1,0x00,0x00,0x00]
+v_not_b16 v255.h, ttmp15
+// GFX12: v_not_b16_e64 v255.h, ttmp15 op_sel:[0,1] ; encoding: [0xff,0x40,0xe9,0xd5,0x7b,0x00,0x00,0x00]
 
-v_not_b16 v255, 0.5
-// GFX12: v_not_b16_e64 v255, 0.5                 ; encoding: [0xff,0x00,0xe9,0xd5,0xf0,0x00,0x00,0x00]
+v_not_b16 v255.h, v1.h
+// GFX12: v_not_b16_e64 v255.h, v1.h op_sel:[1,1] ; encoding: [0xff,0x48,0xe9,0xd5,0x01,0x01,0x00,0x00]
 
-v_not_b16 v255, exec_hi
-// GFX12: v_not_b16_e64 v255, exec_hi             ; encoding: [0xff,0x00,0xe9,0xd5,0x7f,0x00,0x00,0x00]
+v_not_b16 v255.h, v1.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_not_b16_e64_dpp v255.h, v1.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x48,0xe9,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
 
-v_not_b16 v255, exec_lo
-// GFX12: v_not_b16_e64 v255, exec_lo             ; encoding: [0xff,0x00,0xe9,0xd5,0x7e,0x00,0x00,0x00]
+v_not_b16 v255.h, v1.h quad_perm:[3,2,1,0]
+// GFX12: v_not_b16_e64_dpp v255.h, v1.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x48,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
 
-v_not_b16 v255, m0
-// GFX12: v_not_b16_e64 v255, m0                  ; encoding: [0xff,0x00,0xe9,0xd5,0x7d,0x00,0x00,0x00]
+v_not_b16 v255.h, v127.h
+// GFX12: v_not_b16_e64 v255.h, v127.h op_sel:[1,1] ; encoding: [0xff,0x48,0xe9,0xd5,0x7f,0x01,0x00,0x00]
 
-v_not_b16 v255, null
-// GFX12: v_not_b16_e64 v255, null                ; encoding: [0xff,0x00,0xe9,0xd5,0x7c,0x00,0x00,0x00]
+v_not_b16 v255.h, v127.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_not_b16_e64_dpp v255.h, v127.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x48,0xe9,0xd5,0xe9,0x00,0x00,0x00,0x7f,0x77,0x39,0x05]
 
-v_not_b16 v255, s1
-// GFX12: v_not_b16_e64 v255, s1                  ; encoding: [0xff,0x00,0xe9,0xd5,0x01,0x00,0x00,0x00]
+v_not_b16 v255.h, v127.h quad_perm:[3,2,1,0]
+// GFX12: v_not_b16_e64_dpp v255.h, v127.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x48,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x7f,0x1b,0x00,0xff]
 
-v_not_b16 v255, s105
-// GFX12: v_not_b16_e64 v255, s105                ; encoding: [0xff,0x00,0xe9,0xd5,0x69,0x00,0x00,0x00]
+v_not_b16 v255.h, vcc_hi
+// GFX12: v_not_b16_e64 v255.h, vcc_hi op_sel:[0,1] ; encoding: [0xff,0x40,0xe9,0xd5,0x6b,0x00,0x00,0x00]
 
-v_not_b16 v255, src_scc
-// GFX12: v_not_b16_e64 v255, src_scc             ; encoding: [0xff,0x00,0xe9,0xd5,0xfd,0x00,0x00,0x00]
+v_not_b16 v255.h, vcc_lo
+// GFX12: v_not_b16_e64 v255.h, vcc_lo op_sel:[0,1] ; encoding: [0xff,0x40,0xe9,0xd5,0x6a,0x00,0x00,0x00]
 
-v_not_b16 v255, ttmp15
-// GFX12: v_not_b16_e64 v255, ttmp15              ; encoding: [0xff,0x00,0xe9,0xd5,0x7b,0x00,0x00,0x00]
+v_not_b16 v255.l, -1
+// GFX12: v_not_b16_e64 v255.l, -1                ; encoding: [0xff,0x00,0xe9,0xd5,0xc1,0x00,0x00,0x00]
 
-v_not_b16 v255, v1
-// GFX12: v_not_b16_e64 v255, v1                  ; encoding: [0xff,0x00,0xe9,0xd5,0x01,0x01,0x00,0x00]
+v_not_b16 v255.l, 0.5
+// GFX12: v_not_b16_e64 v255.l, 0.5               ; encoding: [0xff,0x00,0xe9,0xd5,0xf0,0x00,0x00,0x00]
 
-v_not_b16 v255, v1 dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_not_b16_e64_dpp v255, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x00,0xe9,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+v_not_b16 v255.l, exec_hi
+// GFX12: v_not_b16_e64 v255.l, exec_hi           ; encoding: [0xff,0x00,0xe9,0xd5,0x7f,0x00,0x00,0x00]
 
-v_not_b16 v255, v1 quad_perm:[3,2,1,0]
-// GFX12: v_not_b16_e64_dpp v255, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
+v_not_b16 v255.l, exec_lo
+// GFX12: v_not_b16_e64 v255.l, exec_lo           ; encoding: [0xff,0x00,0xe9,0xd5,0x7e,0x00,0x00,0x00]
 
-v_not_b16 v255, v127
-// GFX12: v_not_b16_e64 v255, v127                ; encoding: [0xff,0x00,0xe9,0xd5,0x7f,0x01,0x00,0x00]
+v_not_b16 v255.l, m0
+// GFX12: v_not_b16_e64 v255.l, m0                ; encoding: [0xff,0x00,0xe9,0xd5,0x7d,0x00,0x00,0x00]
 
-v_not_b16 v255, v127 dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_not_b16_e64_dpp v255, v127 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x00,0xe9,0xd5,0xe9,0x00,0x00,0x00,0x7f,0x77,0x39,0x05]
+v_not_b16 v255.l, null
+// GFX12: v_not_b16_e64 v255.l, null              ; encoding: [0xff,0x00,0xe9,0xd5,0x7c,0x00,0x00,0x00]
 
-v_not_b16 v255, v127 quad_perm:[3,2,1,0]
-// GFX12: v_not_b16_e64_dpp v255, v127 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x7f,0x1b,0x00,0xff]
+v_not_b16 v255.l, s1
+// GFX12: v_not_b16_e64 v255.l, s1                ; encoding: [0xff,0x00,0xe9,0xd5,0x01,0x00,0x00,0x00]
 
-v_not_b16 v255, vcc_hi
-// GFX12: v_not_b16_e64 v255, vcc_hi              ; encoding: [0xff,0x00,0xe9,0xd5,0x6b,0x00,0x00,0x00]
+v_not_b16 v255.l, s105
+// GFX12: v_not_b16_e64 v255.l, s105              ; encoding: [0xff,0x00,0xe9,0xd5,0x69,0x00,0x00,0x00]
 
-v_not_b16 v255, vcc_lo
-// GFX12: v_not_b16_e64 v255, vcc_lo              ; encoding: [0xff,0x00,0xe9,0xd5,0x6a,0x00,0x00,0x00]
+v_not_b16 v255.l, src_scc
+// GFX12: v_not_b16_e64 v255.l, src_scc           ; encoding: [0xff,0x00,0xe9,0xd5,0xfd,0x00,0x00,0x00]
 
-v_not_b16 v5, v199
-// GFX12: v_not_b16_e64 v5, v199                  ; encoding: [0x05,0x00,0xe9,0xd5,0xc7,0x01,0x00,0x00]
+v_not_b16 v255.l, ttmp15
+// GFX12: v_not_b16_e64 v255.l, ttmp15            ; encoding: [0xff,0x00,0xe9,0xd5,0x7b,0x00,0x00,0x00]
 
-v_not_b16 v5, v199 dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_not_b16_e64_dpp v5, v199 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe9,0xd5,0xe9,0x00,0x00,0x00,0xc7,0x77,0x39,0x05]
+v_not_b16 v255.l, v1.l
+// GFX12: v_not_b16_e64 v255.l, v1.l              ; encoding: [0xff,0x00,0xe9,0xd5,0x01,0x01,0x00,0x00]
 
-v_not_b16 v5, v199 quad_perm:[3,2,1,0]
-// GFX12: v_not_b16_e64_dpp v5, v199 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff]
+v_not_b16 v255.l, v1.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_not_b16_e64_dpp v255.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x00,0xe9,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+
+v_not_b16 v255.l, v1.l quad_perm:[3,2,1,0]
+// GFX12: v_not_b16_e64_dpp v255.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
+
+v_not_b16 v255.l, v127.l
+// GFX12: v_not_b16_e64 v255.l, v127.l            ; encoding: [0xff,0x00,0xe9,0xd5,0x7f,0x01,0x00,0x00]
+
+v_not_b16 v255.l, v127.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_not_b16_e64_dpp v255.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x00,0xe9,0xd5,0xe9,0x00,0x00,0x00,0x7f,0x77,0x39,0x05]
+
+v_not_b16 v255.l, v127.l quad_perm:[3,2,1,0]
+// GFX12: v_not_b16_e64_dpp v255.l, v127.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x7f,0x1b,0x00,0xff]
+
+v_not_b16 v255.l, vcc_hi
+// GFX12: v_not_b16_e64 v255.l, vcc_hi            ; encoding: [0xff,0x00,0xe9,0xd5,0x6b,0x00,0x00,0x00]
+
+v_not_b16 v255.l, vcc_lo
+// GFX12: v_not_b16_e64 v255.l, vcc_lo            ; encoding: [0xff,0x00,0xe9,0xd5,0x6a,0x00,0x00,0x00]
+
+v_not_b16 v5.h, v199.h
+// GFX12: v_not_b16_e64 v5.h, v199.h op_sel:[1,1] ; encoding: [0x05,0x48,0xe9,0xd5,0xc7,0x01,0x00,0x00]
+
+v_not_b16 v5.h, v199.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_not_b16_e64_dpp v5.h, v199.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xe9,0xd5,0xe9,0x00,0x00,0x00,0xc7,0x77,0x39,0x05]
+
+v_not_b16 v5.h, v199.h quad_perm:[3,2,1,0]
+// GFX12: v_not_b16_e64_dpp v5.h, v199.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x48,0xe9,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff]
+
+v_not_b16 v5.l, v199.l
+// GFX12: v_not_b16_e64 v5.l, v199.l              ; encoding: [0x05,0x00,0xe9,0xd5,0xc7,0x01,0x00,0x00]
+
+v_not_b16 v5.l, v199.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_not_b16_e64_dpp v5.l, v199.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe9,0xd5,0xe9,0x00,0x00,0x00,0xc7,0x77,0x39,0x05]
+
+v_not_b16 v5.l, v199.l quad_perm:[3,2,1,0]
+// GFX12: v_not_b16_e64_dpp v5.l, v199.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff]
 
 v_rcp_f16 v128, 0xfe0b
 // GFX12: v_rcp_f16_e64 v128, 0xfe0b              ; encoding: [0x80,0x00,0xd4,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00]
@@ -1630,71 +1912,137 @@ v_rcp_f16 v5, v199 dpp8:[7,6,5,4,3,2,1,0]
 v_rcp_f16 v5, v199 quad_perm:[3,2,1,0]
 // GFX12: v_rcp_f16_e64_dpp v5, v199 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd4,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff]
 
-v_rndne_f16 v128, 0xfe0b
-// GFX12: v_rndne_f16_e64 v128, 0xfe0b            ; encoding: [0x80,0x00,0xde,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00]
+v_rndne_f16 v128.h, 0xfe0b
+// GFX12: v_rndne_f16_e64 v128.h, 0xfe0b op_sel:[0,1] ; encoding: [0x80,0x40,0xde,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00]
+
+v_rndne_f16 v128.l, 0xfe0b
+// GFX12: v_rndne_f16_e64 v128.l, 0xfe0b          ; encoding: [0x80,0x00,0xde,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00]
+
+v_rndne_f16 v255.h, -1
+// GFX12: v_rndne_f16_e64 v255.h, -1 op_sel:[0,1] ; encoding: [0xff,0x40,0xde,0xd5,0xc1,0x00,0x00,0x00]
+
+v_rndne_f16 v255.h, 0.5
+// GFX12: v_rndne_f16_e64 v255.h, 0.5 op_sel:[0,1] ; encoding: [0xff,0x40,0xde,0xd5,0xf0,0x00,0x00,0x00]
+
+v_rndne_f16 v255.h, exec_hi
+// GFX12: v_rndne_f16_e64 v255.h, exec_hi op_sel:[0,1] ; encoding: [0xff,0x40,0xde,0xd5,0x7f,0x00,0x00,0x00]
+
+v_rndne_f16 v255.h, exec_lo
+// GFX12: v_rndne_f16_e64 v255.h, exec_lo op_sel:[0,1] ; encoding: [0xff,0x40,0xde,0xd5,0x7e,0x00,0x00,0x00]
+
+v_rndne_f16 v255.h, m0
+// GFX12: v_rndne_f16_e64 v255.h, m0 op_sel:[0,1] ; encoding: [0xff,0x40,0xde,0xd5,0x7d,0x00,0x00,0x00]
+
+v_rndne_f16 v255.h, null
+// GFX12: v_rndne_f16_e64 v255.h, null op_sel:[0,1] ; encoding: [0xff,0x40,0xde,0xd5,0x7c,0x00,0x00,0x00]
+
+v_rndne_f16 v255.h, s1
+// GFX12: v_rndne_f16_e64 v255.h, s1 op_sel:[0,1] ; encoding: [0xff,0x40,0xde,0xd5,0x01,0x00,0x00,0x00]
+
+v_rndne_f16 v255.h, s105
+// GFX12: v_rndne_f16_e64 v255.h, s105 op_sel:[0,1] ; encoding: [0xff,0x40,0xde,0xd5,0x69,0x00,0x00,0x00]
 
-v_rndne_f16 v255, -1
-// GFX12: v_rndne_f16_e64 v255, -1                ; encoding: [0xff,0x00,0xde,0xd5,0xc1,0x00,0x00,0x00]
+v_rndne_f16 v255.h, src_scc
+// GFX12: v_rndne_f16_e64 v255.h, src_scc op_sel:[0,1] ; encoding: [0xff,0x40,0xde,0xd5,0xfd,0x00,0x00,0x00]
 
-v_rndne_f16 v255, 0.5
-// GFX12: v_rndne_f16_e64 v255, 0.5               ; encoding: [0xff,0x00,0xde,0xd5,0xf0,0x00,0x00,0x00]
+v_rndne_f16 v255.h, ttmp15
+// GFX12: v_rndne_f16_e64 v255.h, ttmp15 op_sel:[0,1] ; encoding: [0xff,0x40,0xde,0xd5,0x7b,0x00,0x00,0x00]
 
-v_rndne_f16 v255, exec_hi
-// GFX12: v_rndne_f16_e64 v255, exec_hi           ; encoding: [0xff,0x00,0xde,0xd5,0x7f,0x00,0x00,0x00]
+v_rndne_f16 v255.h, v1.h
+// GFX12: v_rndne_f16_e64 v255.h, v1.h op_sel:[1,1] ; encoding: [0xff,0x48,0xde,0xd5,0x01,0x01,0x00,0x00]
 
-v_rndne_f16 v255, exec_lo
-// GFX12: v_rndne_f16_e64 v255, exec_lo           ; encoding: [0xff,0x00,0xde,0xd5,0x7e,0x00,0x00,0x00]
+v_rndne_f16 v255.h, v1.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_rndne_f16_e64_dpp v255.h, v1.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x48,0xde,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
 
-v_rndne_f16 v255, m0
-// GFX12: v_rndne_f16_e64 v255, m0                ; encoding: [0xff,0x00,0xde,0xd5,0x7d,0x00,0x00,0x00]
+v_rndne_f16 v255.h, v1.h quad_perm:[3,2,1,0]
+// GFX12: v_rndne_f16_e64_dpp v255.h, v1.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x48,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
 
-v_rndne_f16 v255, null
-// GFX12: v_rndne_f16_e64 v255, null              ; encoding: [0xff,0x00,0xde,0xd5,0x7c,0x00,0x00,0x00]
+v_rndne_f16 v255.h, v127.h
+// GFX12: v_rndne_f16_e64 v255.h, v127.h op_sel:[1,1] ; encoding: [0xff,0x48,0xde,0xd5,0x7f,0x01,0x00,0x00]
 
-v_rndne_f16 v255, s1
-// GFX12: v_rndne_f16_e64 v255, s1                ; encoding: [0xff,0x00,0xde,0xd5,0x01,0x00,0x00,0x00]
+v_rndne_f16 v255.h, v127.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_rndne_f16_e64_dpp v255.h, v127.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x48,0xde,0xd5,0xe9,0x00,0x00,0x00,0x7f,0x77,0x39,0x05]
 
-v_rndne_f16 v255, s105
-// GFX12: v_rndne_f16_e64 v255, s105              ; encoding: [0xff,0x00,0xde,0xd5,0x69,0x00,0x00,0x00]
+v_rndne_f16 v255.h, v127.h quad_perm:[3,2,1,0]
+// GFX12: v_rndne_f16_e64_dpp v255.h, v127.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x48,0xde,0xd5,0xfa,0x00,0x00,0x00,0x7f,0x1b,0x00,0xff]
 
-v_rndne_f16 v255, src_scc
-// GFX12: v_rndne_f16_e64 v255, src_scc           ; encoding: [0xff,0x00,0xde,0xd5,0xfd,0x00,0x00,0x00]
+v_rndne_f16 v255.h, vcc_hi
+// GFX12: v_rndne_f16_e64 v255.h, vcc_hi op_sel:[0,1] ; encoding: [0xff,0x40,0xde,0xd5,0x6b,0x00,0x00,0x00]
 
-v_rndne_f16 v255, ttmp15
-// GFX12: v_rndne_f16_e64 v255, ttmp15            ; encoding: [0xff,0x00,0xde,0xd5,0x7b,0x00,0x00,0x00]
+v_rndne_f16 v255.h, vcc_lo
+// GFX12: v_rndne_f16_e64 v255.h, vcc_lo op_sel:[0,1] ; encoding: [0xff,0x40,0xde,0xd5,0x6a,0x00,0x00,0x00]
 
-v_rndne_f16 v255, v1
-// GFX12: v_rndne_f16_e64 v255, v1                ; encoding: [0xff,0x00,0xde,0xd5,0x01,0x01,0x00,0x00]
+v_rndne_f16 v255.l, -1
+// GFX12: v_rndne_f16_e64 v255.l, -1              ; encoding: [0xff,0x00,0xde,0xd5,0xc1,0x00,0x00,0x00]
 
-v_rndne_f16 v255, v1 dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_rndne_f16_e64_dpp v255, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x00,0xde,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+v_rndne_f16 v255.l, 0.5
+// GFX12: v_rndne_f16_e64 v255.l, 0.5             ; encoding: [0xff,0x00,0xde,0xd5,0xf0,0x00,0x00,0x00]
 
-v_rndne_f16 v255, v1 quad_perm:[3,2,1,0]
-// GFX12: v_rndne_f16_e64_dpp v255, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
+v_rndne_f16 v255.l, exec_hi
+// GFX12: v_rndne_f16_e64 v255.l, exec_hi         ; encoding: [0xff,0x00,0xde,0xd5,0x7f,0x00,0x00,0x00]
 
-v_rndne_f16 v255, v127
-// GFX12: v_rndne_f16_e64 v255, v127              ; encoding: [0xff,0x00,0xde,0xd5,0x7f,0x01,0x00,0x00]
+v_rndne_f16 v255.l, exec_lo
+// GFX12: v_rndne_f16_e64 v255.l, exec_lo         ; encoding: [0xff,0x00,0xde,0xd5,0x7e,0x00,0x00,0x00]
 
-v_rndne_f16 v255, v127 dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_rndne_f16_e64_dpp v255, v127 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x00,0xde,0xd5,0xe9,0x00,0x00,0x00,0x7f,0x77,0x39,0x05]
+v_rndne_f16 v255.l, m0
+// GFX12: v_rndne_f16_e64 v255.l, m0              ; encoding: [0xff,0x00,0xde,0xd5,0x7d,0x00,0x00,0x00]
 
-v_rndne_f16 v255, v127 quad_perm:[3,2,1,0]
-// GFX12: v_rndne_f16_e64_dpp v255, v127 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x7f,0x1b,0x00,0xff]
+v_rndne_f16 v255.l, null
+// GFX12: v_rndne_f16_e64 v255.l, null            ; encoding: [0xff,0x00,0xde,0xd5,0x7c,0x00,0x00,0x00]
 
-v_rndne_f16 v255, vcc_hi
-// GFX12: v_rndne_f16_e64 v255, vcc_hi            ; encoding: [0xff,0x00,0xde,0xd5,0x6b,0x00,0x00,0x00]
+v_rndne_f16 v255.l, s1
+// GFX12: v_rndne_f16_e64 v255.l, s1              ; encoding: [0xff,0x00,0xde,0xd5,0x01,0x00,0x00,0x00]
 
-v_rndne_f16 v255, vcc_lo
-// GFX12: v_rndne_f16_e64 v255, vcc_lo            ; encoding: [0xff,0x00,0xde,0xd5,0x6a,0x00,0x00,0x00]
+v_rndne_f16 v255.l, s105
+// GFX12: v_rndne_f16_e64 v255.l, s105            ; encoding: [0xff,0x00,0xde,0xd5,0x69,0x00,0x00,0x00]
 
-v_rndne_f16 v5, v199
-// GFX12: v_rndne_f16_e64 v5, v199                ; encoding: [0x05,0x00,0xde,0xd5,0xc7,0x01,0x00,0x00]
+v_rndne_f16 v255.l, src_scc
+// GFX12: v_rndne_f16_e64 v255.l, src_scc         ; encoding: [0xff,0x00,0xde,0xd5,0xfd,0x00,0x00,0x00]
 
-v_rndne_f16 v5, v199 dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_rndne_f16_e64_dpp v5, v199 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xde,0xd5,0xe9,0x00,0x00,0x00,0xc7,0x77,0x39,0x05]
+v_rndne_f16 v255.l, ttmp15
+// GFX12: v_rndne_f16_e64 v255.l, ttmp15          ; encoding: [0xff,0x00,0xde,0xd5,0x7b,0x00,0x00,0x00]
 
-v_rndne_f16 v5, v199 quad_perm:[3,2,1,0]
-// GFX12: v_rndne_f16_e64_dpp v5, v199 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff]
+v_rndne_f16 v255.l, v1.l
+// GFX12: v_rndne_f16_e64 v255.l, v1.l            ; encoding: [0xff,0x00,0xde,0xd5,0x01,0x01,0x00,0x00]
+
+v_rndne_f16 v255.l, v1.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_rndne_f16_e64_dpp v255.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x00,0xde,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+
+v_rndne_f16 v255.l, v1.l quad_perm:[3,2,1,0]
+// GFX12: v_rndne_f16_e64_dpp v255.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
+
+v_rndne_f16 v255.l, v127.l
+// GFX12: v_rndne_f16_e64 v255.l, v127.l          ; encoding: [0xff,0x00,0xde,0xd5,0x7f,0x01,0x00,0x00]
+
+v_rndne_f16 v255.l, v127.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_rndne_f16_e64_dpp v255.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x00,0xde,0xd5,0xe9,0x00,0x00,0x00,0x7f,0x77,0x39,0x05]
+
+v_rndne_f16 v255.l, v127.l quad_perm:[3,2,1,0]
+// GFX12: v_rndne_f16_e64_dpp v255.l, v127.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x7f,0x1b,0x00,0xff]
+
+v_rndne_f16 v255.l, vcc_hi
+// GFX12: v_rndne_f16_e64 v255.l, vcc_hi          ; encoding: [0xff,0x00,0xde,0xd5,0x6b,0x00,0x00,0x00]
+
+v_rndne_f16 v255.l, vcc_lo
+// GFX12: v_rndne_f16_e64 v255.l, vcc_lo          ; encoding: [0xff,0x00,0xde,0xd5,0x6a,0x00,0x00,0x00]
+
+v_rndne_f16 v5.h, v199.h
+// GFX12: v_rndne_f16_e64 v5.h, v199.h op_sel:[1,1] ; encoding: [0x05,0x48,0xde,0xd5,0xc7,0x01,0x00,0x00]
+
+v_rndne_f16 v5.h, v199.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_rndne_f16_e64_dpp v5.h, v199.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xde,0xd5,0xe9,0x00,0x00,0x00,0xc7,0x77,0x39,0x05]
+
+v_rndne_f16 v5.h, v199.h quad_perm:[3,2,1,0]
+// GFX12: v_rndne_f16_e64_dpp v5.h, v199.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x48,0xde,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff]
+
+v_rndne_f16 v5.l, v199.l
+// GFX12: v_rndne_f16_e64 v5.l, v199.l            ; encoding: [0x05,0x00,0xde,0xd5,0xc7,0x01,0x00,0x00]
+
+v_rndne_f16 v5.l, v199.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_rndne_f16_e64_dpp v5.l, v199.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xde,0xd5,0xe9,0x00,0x00,0x00,0xc7,0x77,0x39,0x05]
+
+v_rndne_f16 v5.l, v199.l quad_perm:[3,2,1,0]
+// GFX12: v_rndne_f16_e64_dpp v5.l, v199.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff]
 
 v_rsq_f16 v128, 0xfe0b
 // GFX12: v_rsq_f16_e64 v128, 0xfe0b              ; encoding: [0x80,0x00,0xd6,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00]
@@ -1771,71 +2119,146 @@ v_sat_pk_u8_i16 v199, v5 dpp8:[7,6,5,4,3,2,1,0]
 v_sat_pk_u8_i16 v199, v5 quad_perm:[3,2,1,0]
 // GFX12: v_sat_pk_u8_i16_e64_dpp v199, v5 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xc7,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x05,0x1b,0x00,0xff]
 
-v_sin_f16 v128, 0xfe0b
-// GFX12: v_sin_f16_e64 v128, 0xfe0b              ; encoding: [0x80,0x00,0xe0,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00]
+v_sat_pk_u8_i16 v199.h, v5
+// GFX12: v_sat_pk_u8_i16_e64 v199.h, v5 op_sel:[0,1] ; encoding: [0xc7,0x40,0xe2,0xd5,0x05,0x01,0x00,0x00]
+
+v_sat_pk_u8_i16 v199.h, v5 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_sat_pk_u8_i16_e64_dpp v199.h, v5 op_sel:[0,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xc7,0x40,0xe2,0xd5,0xe9,0x00,0x00,0x00,0x05,0x77,0x39,0x05]
+
+v_sat_pk_u8_i16 v199.h, v5 quad_perm:[3,2,1,0]
+// GFX12: v_sat_pk_u8_i16_e64_dpp v199.h, v5 op_sel:[0,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xc7,0x40,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x05,0x1b,0x00,0xff]
+
+v_sin_f16 v128.h, 0xfe0b
+// GFX12: v_sin_f16_e64 v128.h, 0xfe0b op_sel:[0,1] ; encoding: [0x80,0x40,0xe0,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00]
+
+v_sin_f16 v128.l, 0xfe0b
+// GFX12: v_sin_f16_e64 v128.l, 0xfe0b            ; encoding: [0x80,0x00,0xe0,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00]
+
+v_sin_f16 v255.h, -1
+// GFX12: v_sin_f16_e64 v255.h, -1 op_sel:[0,1]   ; encoding: [0xff,0x40,0xe0,0xd5,0xc1,0x00,0x00,0x00]
+
+v_sin_f16 v255.h, 0.5
+// GFX12: v_sin_f16_e64 v255.h, 0.5 op_sel:[0,1]  ; encoding: [0xff,0x40,0xe0,0xd5,0xf0,0x00,0x00,0x00]
+
+v_sin_f16 v255.h, exec_hi
+// GFX12: v_sin_f16_e64 v255.h, exec_hi op_sel:[0,1] ; encoding: [0xff,0x40,0xe0,0xd5,0x7f,0x00,0x00,0x00]
 
-v_sin_f16 v255, -1
-// GFX12: v_sin_f16_e64 v255, -1                  ; encoding: [0xff,0x00,0xe0,0xd5,0xc1,0x00,0x00,0x00]
+v_sin_f16 v255.h, exec_lo
+// GFX12: v_sin_f16_e64 v255.h, exec_lo op_sel:[0,1] ; encoding: [0xff,0x40,0xe0,0xd5,0x7e,0x00,0x00,0x00]
 
-v_sin_f16 v255, 0.5
-// GFX12: v_sin_f16_e64 v255, 0.5                 ; encoding: [0xff,0x00,0xe0,0xd5,0xf0,0x00,0x00,0x00]
+v_sin_f16 v255.h, m0
+// GFX12: v_sin_f16_e64 v255.h, m0 op_sel:[0,1]   ; encoding: [0xff,0x40,0xe0,0xd5,0x7d,0x00,0x00,0x00]
 
-v_sin_f16 v255, exec_hi
-// GFX12: v_sin_f16_e64 v255, exec_hi             ; encoding: [0xff,0x00,0xe0,0xd5,0x7f,0x00,0x00,0x00]
+v_sin_f16 v255.h, null
+// GFX12: v_sin_f16_e64 v255.h, null op_sel:[0,1] ; encoding: [0xff,0x40,0xe0,0xd5,0x7c,0x00,0x00,0x00]
 
-v_sin_f16 v255, exec_lo
-// GFX12: v_sin_f16_e64 v255, exec_lo             ; encoding: [0xff,0x00,0xe0,0xd5,0x7e,0x00,0x00,0x00]
+v_sin_f16 v255.h, s1
+// GFX12: v_sin_f16_e64 v255.h, s1 op_sel:[0,1]   ; encoding: [0xff,0x40,0xe0,0xd5,0x01,0x00,0x00,0x00]
 
-v_sin_f16 v255, m0
-// GFX12: v_sin_f16_e64 v255, m0                  ; encoding: [0xff,0x00,0xe0,0xd5,0x7d,0x00,0x00,0x00]
+v_sin_f16 v255.h, s105
+// GFX12: v_sin_f16_e64 v255.h, s105 op_sel:[0,1] ; encoding: [0xff,0x40,0xe0,0xd5,0x69,0x00,0x00,0x00]
 
-v_sin_f16 v255, null
-// GFX12: v_sin_f16_e64 v255, null                ; encoding: [0xff,0x00,0xe0,0xd5,0x7c,0x00,0x00,0x00]
+v_sin_f16 v255.h, src_scc
+// GFX12: v_sin_f16_e64 v255.h, src_scc op_sel:[0,1] ; encoding: [0xff,0x40,0xe0,0xd5,0xfd,0x00,0x00,0x00]
 
-v_sin_f16 v255, s1
-// GFX12: v_sin_f16_e64 v255, s1                  ; encoding: [0xff,0x00,0xe0,0xd5,0x01,0x00,0x00,0x00]
+v_sin_f16 v255.h, ttmp15
+// GFX12: v_sin_f16_e64 v255.h, ttmp15 op_sel:[0,1] ; encoding: [0xff,0x40,0xe0,0xd5,0x7b,0x00,0x00,0x00]
 
-v_sin_f16 v255, s105
-// GFX12: v_sin_f16_e64 v255, s105                ; encoding: [0xff,0x00,0xe0,0xd5,0x69,0x00,0x00,0x00]
+v_sin_f16 v255.h, v1.h
+// GFX12: v_sin_f16_e64 v255.h, v1.h op_sel:[1,1] ; encoding: [0xff,0x48,0xe0,0xd5,0x01,0x01,0x00,0x00]
 
-v_sin_f16 v255, src_scc
-// GFX12: v_sin_f16_e64 v255, src_scc             ; encoding: [0xff,0x00,0xe0,0xd5,0xfd,0x00,0x00,0x00]
+v_sin_f16 v255.h, v1.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_sin_f16_e64_dpp v255.h, v1.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x48,0xe0,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
 
-v_sin_f16 v255, ttmp15
-// GFX12: v_sin_f16_e64 v255, ttmp15              ; encoding: [0xff,0x00,0xe0,0xd5,0x7b,0x00,0x00,0x00]
+v_sin_f16 v255.h, v1.h quad_perm:[3,2,1,0]
+// GFX12: v_sin_f16_e64_dpp v255.h, v1.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x48,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
 
-v_sin_f16 v255, v1
-// GFX12: v_sin_f16_e64 v255, v1                  ; encoding: [0xff,0x00,0xe0,0xd5,0x01,0x01,0x00,0x00]
+v_sin_f16 v255.h, v127.h
+// GFX12: v_sin_f16_e64 v255.h, v127.h op_sel:[1,1] ; encoding: [0xff,0x48,0xe0,0xd5,0x7f,0x01,0x00,0x00]
 
-v_sin_f16 v255, v1 dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_sin_f16_e64_dpp v255, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x00,0xe0,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+v_sin_f16 v255.h, v127.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_sin_f16_e64_dpp v255.h, v127.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x48,0xe0,0xd5,0xe9,0x00,0x00,0x00,0x7f,0x77,0x39,0x05]
 
-v_sin_f16 v255, v1 quad_perm:[3,2,1,0]
-// GFX12: v_sin_f16_e64_dpp v255, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
+v_sin_f16 v255.h, v127.h quad_perm:[3,2,1,0]
+// GFX12: v_sin_f16_e64_dpp v255.h, v127.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x48,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x7f,0x1b,0x00,0xff]
 
-v_sin_f16 v255, v127
-// GFX12: v_sin_f16_e64 v255, v127                ; encoding: [0xff,0x00,0xe0,0xd5,0x7f,0x01,0x00,0x00]
+v_sin_f16 v255.h, vcc_hi
+// GFX12: v_sin_f16_e64 v255.h, vcc_hi op_sel:[0,1] ; encoding: [0xff,0x40,0xe0,0xd5,0x6b,0x00,0x00,0x00]
 
-v_sin_f16 v255, v127 dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_sin_f16_e64_dpp v255, v127 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x00,0xe0,0xd5,0xe9,0x00,0x00,0x00,0x7f,0x77,0x39,0x05]
+v_sin_f16 v255.h, vcc_lo
+// GFX12: v_sin_f16_e64 v255.h, vcc_lo op_sel:[0,1] ; encoding: [0xff,0x40,0xe0,0xd5,0x6a,0x00,0x00,0x00]
 
-v_sin_f16 v255, v127 quad_perm:[3,2,1,0]
-// GFX12: v_sin_f16_e64_dpp v255, v127 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x7f,0x1b,0x00,0xff]
+v_sin_f16 v255.l, -1
+// GFX12: v_sin_f16_e64 v255.l, -1                ; encoding: [0xff,0x00,0xe0,0xd5,0xc1,0x00,0x00,0x00]
 
-v_sin_f16 v255, vcc_hi
-// GFX12: v_sin_f16_e64 v255, vcc_hi              ; encoding: [0xff,0x00,0xe0,0xd5,0x6b,0x00,0x00,0x00]
+v_sin_f16 v255.l, 0.5
+// GFX12: v_sin_f16_e64 v255.l, 0.5               ; encoding: [0xff,0x00,0xe0,0xd5,0xf0,0x00,0x00,0x00]
 
-v_sin_f16 v255, vcc_lo
-// GFX12: v_sin_f16_e64 v255, vcc_lo              ; encoding: [0xff,0x00,0xe0,0xd5,0x6a,0x00,0x00,0x00]
+v_sin_f16 v255.l, exec_hi
+// GFX12: v_sin_f16_e64 v255.l, exec_hi           ; encoding: [0xff,0x00,0xe0,0xd5,0x7f,0x00,0x00,0x00]
 
-v_sin_f16 v5, v199
-// GFX12: v_sin_f16_e64 v5, v199                  ; encoding: [0x05,0x00,0xe0,0xd5,0xc7,0x01,0x00,0x00]
+v_sin_f16 v255.l, exec_lo
+// GFX12: v_sin_f16_e64 v255.l, exec_lo           ; encoding: [0xff,0x00,0xe0,0xd5,0x7e,0x00,0x00,0x00]
 
-v_sin_f16 v5, v199 dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_sin_f16_e64_dpp v5, v199 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe0,0xd5,0xe9,0x00,0x00,0x00,0xc7,0x77,0x39,0x05]
+v_sin_f16 v255.l, m0
+// GFX12: v_sin_f16_e64 v255.l, m0                ; encoding: [0xff,0x00,0xe0,0xd5,0x7d,0x00,0x00,0x00]
 
-v_sin_f16 v5, v199 quad_perm:[3,2,1,0]
-// GFX12: v_sin_f16_e64_dpp v5, v199 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff]
+v_sin_f16 v255.l, null
+// GFX12: v_sin_f16_e64 v255.l, null              ; encoding: [0xff,0x00,0xe0,0xd5,0x7c,0x00,0x00,0x00]
+
+v_sin_f16 v255.l, s1
+// GFX12: v_sin_f16_e64 v255.l, s1                ; encoding: [0xff,0x00,0xe0,0xd5,0x01,0x00,0x00,0x00]
+
+v_sin_f16 v255.l, s105
+// GFX12: v_sin_f16_e64 v255.l, s105              ; encoding: [0xff,0x00,0xe0,0xd5,0x69,0x00,0x00,0x00]
+
+v_sin_f16 v255.l, src_scc
+// GFX12: v_sin_f16_e64 v255.l, src_scc           ; encoding: [0xff,0x00,0xe0,0xd5,0xfd,0x00,0x00,0x00]
+
+v_sin_f16 v255.l, ttmp15
+// GFX12: v_sin_f16_e64 v255.l, ttmp15            ; encoding: [0xff,0x00,0xe0,0xd5,0x7b,0x00,0x00,0x00]
+
+v_sin_f16 v255.l, v1.l
+// GFX12: v_sin_f16_e64 v255.l, v1.l              ; encoding: [0xff,0x00,0xe0,0xd5,0x01,0x01,0x00,0x00]
+
+v_sin_f16 v255.l, v1.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_sin_f16_e64_dpp v255.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x00,0xe0,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+
+v_sin_f16 v255.l, v1.l quad_perm:[3,2,1,0]
+// GFX12: v_sin_f16_e64_dpp v255.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
+
+v_sin_f16 v255.l, v127.l
+// GFX12: v_sin_f16_e64 v255.l, v127.l            ; encoding: [0xff,0x00,0xe0,0xd5,0x7f,0x01,0x00,0x00]
+
+v_sin_f16 v255.l, v127.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_sin_f16_e64_dpp v255.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x00,0xe0,0xd5,0xe9,0x00,0x00,0x00,0x7f,0x77,0x39,0x05]
+
+v_sin_f16 v255.l, v127.l quad_perm:[3,2,1,0]
+// GFX12: v_sin_f16_e64_dpp v255.l, v127.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x7f,0x1b,0x00,0xff]
+
+v_sin_f16 v255.l, vcc_hi
+// GFX12: v_sin_f16_e64 v255.l, vcc_hi            ; encoding: [0xff,0x00,0xe0,0xd5,0x6b,0x00,0x00,0x00]
+
+v_sin_f16 v255.l, vcc_lo
+// GFX12: v_sin_f16_e64 v255.l, vcc_lo            ; encoding: [0xff,0x00,0xe0,0xd5,0x6a,0x00,0x00,0x00]
+
+v_sin_f16 v5.h, v199.h
+// GFX12: v_sin_f16_e64 v5.h, v199.h op_sel:[1,1] ; encoding: [0x05,0x48,0xe0,0xd5,0xc7,0x01,0x00,0x00]
+
+v_sin_f16 v5.h, v199.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_sin_f16_e64_dpp v5.h, v199.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xe0,0xd5,0xe9,0x00,0x00,0x00,0xc7,0x77,0x39,0x05]
+
+v_sin_f16 v5.h, v199.h quad_perm:[3,2,1,0]
+// GFX12: v_sin_f16_e64_dpp v5.h, v199.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x48,0xe0,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff]
+
+v_sin_f16 v5.l, v199.l
+// GFX12: v_sin_f16_e64 v5.l, v199.l              ; encoding: [0x05,0x00,0xe0,0xd5,0xc7,0x01,0x00,0x00]
+
+v_sin_f16 v5.l, v199.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_sin_f16_e64_dpp v5.l, v199.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe0,0xd5,0xe9,0x00,0x00,0x00,0xc7,0x77,0x39,0x05]
+
+v_sin_f16 v5.l, v199.l quad_perm:[3,2,1,0]
+// GFX12: v_sin_f16_e64_dpp v5.l, v199.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff]
 
 v_sqrt_f16 v128, 0xfe0b
 // GFX12: v_sqrt_f16_e64 v128, 0xfe0b             ; encoding: [0x80,0x00,0xd5,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00]
@@ -1903,68 +2326,134 @@ v_sqrt_f16 v5, v199 dpp8:[7,6,5,4,3,2,1,0]
 v_sqrt_f16 v5, v199 quad_perm:[3,2,1,0]
 // GFX12: v_sqrt_f16_e64_dpp v5, v199 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd5,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff]
 
-v_trunc_f16 v128, 0xfe0b
-// GFX12: v_trunc_f16_e64 v128, 0xfe0b            ; encoding: [0x80,0x00,0xdd,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00]
+v_trunc_f16 v128.h, 0xfe0b
+// GFX12: v_trunc_f16_e64 v128.h, 0xfe0b op_sel:[0,1] ; encoding: [0x80,0x40,0xdd,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00]
+
+v_trunc_f16 v128.l, 0xfe0b
+// GFX12: v_trunc_f16_e64 v128.l, 0xfe0b          ; encoding: [0x80,0x00,0xdd,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00]
+
+v_trunc_f16 v255.h, -1
+// GFX12: v_trunc_f16_e64 v255.h, -1 op_sel:[0,1] ; encoding: [0xff,0x40,0xdd,0xd5,0xc1,0x00,0x00,0x00]
+
+v_trunc_f16 v255.h, 0.5
+// GFX12: v_trunc_f16_e64 v255.h, 0.5 op_sel:[0,1] ; encoding: [0xff,0x40,0xdd,0xd5,0xf0,0x00,0x00,0x00]
+
+v_trunc_f16 v255.h, exec_hi
+// GFX12: v_trunc_f16_e64 v255.h, exec_hi op_sel:[0,1] ; encoding: [0xff,0x40,0xdd,0xd5,0x7f,0x00,0x00,0x00]
+
+v_trunc_f16 v255.h, exec_lo
+// GFX12: v_trunc_f16_e64 v255.h, exec_lo op_sel:[0,1] ; encoding: [0xff,0x40,0xdd,0xd5,0x7e,0x00,0x00,0x00]
+
+v_trunc_f16 v255.h, m0
+// GFX12: v_trunc_f16_e64 v255.h, m0 op_sel:[0,1] ; encoding: [0xff,0x40,0xdd,0xd5,0x7d,0x00,0x00,0x00]
+
+v_trunc_f16 v255.h, null
+// GFX12: v_trunc_f16_e64 v255.h, null op_sel:[0,1] ; encoding: [0xff,0x40,0xdd,0xd5,0x7c,0x00,0x00,0x00]
+
+v_trunc_f16 v255.h, s1
+// GFX12: v_trunc_f16_e64 v255.h, s1 op_sel:[0,1] ; encoding: [0xff,0x40,0xdd,0xd5,0x01,0x00,0x00,0x00]
+
+v_trunc_f16 v255.h, s105
+// GFX12: v_trunc_f16_e64 v255.h, s105 op_sel:[0,1] ; encoding: [0xff,0x40,0xdd,0xd5,0x69,0x00,0x00,0x00]
+
+v_trunc_f16 v255.h, src_scc
+// GFX12: v_trunc_f16_e64 v255.h, src_scc op_sel:[0,1] ; encoding: [0xff,0x40,0xdd,0xd5,0xfd,0x00,0x00,0x00]
+
+v_trunc_f16 v255.h, ttmp15
+// GFX12: v_trunc_f16_e64 v255.h, ttmp15 op_sel:[0,1] ; encoding: [0xff,0x40,0xdd,0xd5,0x7b,0x00,0x00,0x00]
+
+v_trunc_f16 v255.h, v1.h
+// GFX12: v_trunc_f16_e64 v255.h, v1.h op_sel:[1,1] ; encoding: [0xff,0x48,0xdd,0xd5,0x01,0x01,0x00,0x00]
+
+v_trunc_f16 v255.h, v1.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_trunc_f16_e64_dpp v255.h, v1.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x48,0xdd,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+
+v_trunc_f16 v255.h, v1.h quad_perm:[3,2,1,0]
+// GFX12: v_trunc_f16_e64_dpp v255.h, v1.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x48,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
+
+v_trunc_f16 v255.h, v127.h
+// GFX12: v_trunc_f16_e64 v255.h, v127.h op_sel:[1,1] ; encoding: [0xff,0x48,0xdd,0xd5,0x7f,0x01,0x00,0x00]
+
+v_trunc_f16 v255.h, v127.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_trunc_f16_e64_dpp v255.h, v127.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x48,0xdd,0xd5,0xe9,0x00,0x00,0x00,0x7f,0x77,0x39,0x05]
+
+v_trunc_f16 v255.h, v127.h quad_perm:[3,2,1,0]
+// GFX12: v_trunc_f16_e64_dpp v255.h, v127.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x48,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x7f,0x1b,0x00,0xff]
+
+v_trunc_f16 v255.h, vcc_hi
+// GFX12: v_trunc_f16_e64 v255.h, vcc_hi op_sel:[0,1] ; encoding: [0xff,0x40,0xdd,0xd5,0x6b,0x00,0x00,0x00]
+
+v_trunc_f16 v255.h, vcc_lo
+// GFX12: v_trunc_f16_e64 v255.h, vcc_lo op_sel:[0,1] ; encoding: [0xff,0x40,0xdd,0xd5,0x6a,0x00,0x00,0x00]
+
+v_trunc_f16 v255.l, -1
+// GFX12: v_trunc_f16_e64 v255.l, -1              ; encoding: [0xff,0x00,0xdd,0xd5,0xc1,0x00,0x00,0x00]
+
+v_trunc_f16 v255.l, 0.5
+// GFX12: v_trunc_f16_e64 v255.l, 0.5             ; encoding: [0xff,0x00,0xdd,0xd5,0xf0,0x00,0x00,0x00]
+
+v_trunc_f16 v255.l, exec_hi
+// GFX12: v_trunc_f16_e64 v255.l, exec_hi         ; encoding: [0xff,0x00,0xdd,0xd5,0x7f,0x00,0x00,0x00]
 
-v_trunc_f16 v255, -1
-// GFX12: v_trunc_f16_e64 v255, -1                ; encoding: [0xff,0x00,0xdd,0xd5,0xc1,0x00,0x00,0x00]
+v_trunc_f16 v255.l, exec_lo
+// GFX12: v_trunc_f16_e64 v255.l, exec_lo         ; encoding: [0xff,0x00,0xdd,0xd5,0x7e,0x00,0x00,0x00]
 
-v_trunc_f16 v255, 0.5
-// GFX12: v_trunc_f16_e64 v255, 0.5               ; encoding: [0xff,0x00,0xdd,0xd5,0xf0,0x00,0x00,0x00]
+v_trunc_f16 v255.l, m0
+// GFX12: v_trunc_f16_e64 v255.l, m0              ; encoding: [0xff,0x00,0xdd,0xd5,0x7d,0x00,0x00,0x00]
 
-v_trunc_f16 v255, exec_hi
-// GFX12: v_trunc_f16_e64 v255, exec_hi           ; encoding: [0xff,0x00,0xdd,0xd5,0x7f,0x00,0x00,0x00]
+v_trunc_f16 v255.l, null
+// GFX12: v_trunc_f16_e64 v255.l, null            ; encoding: [0xff,0x00,0xdd,0xd5,0x7c,0x00,0x00,0x00]
 
-v_trunc_f16 v255, exec_lo
-// GFX12: v_trunc_f16_e64 v255, exec_lo           ; encoding: [0xff,0x00,0xdd,0xd5,0x7e,0x00,0x00,0x00]
+v_trunc_f16 v255.l, s1
+// GFX12: v_trunc_f16_e64 v255.l, s1              ; encoding: [0xff,0x00,0xdd,0xd5,0x01,0x00,0x00,0x00]
 
-v_trunc_f16 v255, m0
-// GFX12: v_trunc_f16_e64 v255, m0                ; encoding: [0xff,0x00,0xdd,0xd5,0x7d,0x00,0x00,0x00]
+v_trunc_f16 v255.l, s105
+// GFX12: v_trunc_f16_e64 v255.l, s105            ; encoding: [0xff,0x00,0xdd,0xd5,0x69,0x00,0x00,0x00]
 
-v_trunc_f16 v255, null
-// GFX12: v_trunc_f16_e64 v255, null              ; encoding: [0xff,0x00,0xdd,0xd5,0x7c,0x00,0x00,0x00]
+v_trunc_f16 v255.l, src_scc
+// GFX12: v_trunc_f16_e64 v255.l, src_scc         ; encoding: [0xff,0x00,0xdd,0xd5,0xfd,0x00,0x00,0x00]
 
-v_trunc_f16 v255, s1
-// GFX12: v_trunc_f16_e64 v255, s1                ; encoding: [0xff,0x00,0xdd,0xd5,0x01,0x00,0x00,0x00]
+v_trunc_f16 v255.l, ttmp15
+// GFX12: v_trunc_f16_e64 v255.l, ttmp15          ; encoding: [0xff,0x00,0xdd,0xd5,0x7b,0x00,0x00,0x00]
 
-v_trunc_f16 v255, s105
-// GFX12: v_trunc_f16_e64 v255, s105              ; encoding: [0xff,0x00,0xdd,0xd5,0x69,0x00,0x00,0x00]
+v_trunc_f16 v255.l, v1.l
+// GFX12: v_trunc_f16_e64 v255.l, v1.l            ; encoding: [0xff,0x00,0xdd,0xd5,0x01,0x01,0x00,0x00]
 
-v_trunc_f16 v255, src_scc
-// GFX12: v_trunc_f16_e64 v255, src_scc           ; encoding: [0xff,0x00,0xdd,0xd5,0xfd,0x00,0x00,0x00]
+v_trunc_f16 v255.l, v1.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_trunc_f16_e64_dpp v255.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x00,0xdd,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
 
-v_trunc_f16 v255, ttmp15
-// GFX12: v_trunc_f16_e64 v255, ttmp15            ; encoding: [0xff,0x00,0xdd,0xd5,0x7b,0x00,0x00,0x00]
+v_trunc_f16 v255.l, v1.l quad_perm:[3,2,1,0]
+// GFX12: v_trunc_f16_e64_dpp v255.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
 
-v_trunc_f16 v255, v1
-// GFX12: v_trunc_f16_e64 v255, v1                ; encoding: [0xff,0x00,0xdd,0xd5,0x01,0x01,0x00,0x00]
+v_trunc_f16 v255.l, v127.l
+// GFX12: v_trunc_f16_e64 v255.l, v127.l          ; encoding: [0xff,0x00,0xdd,0xd5,0x7f,0x01,0x00,0x00]
 
-v_trunc_f16 v255, v1 dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_trunc_f16_e64_dpp v255, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x00,0xdd,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+v_trunc_f16 v255.l, v127.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_trunc_f16_e64_dpp v255.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x00,0xdd,0xd5,0xe9,0x00,0x00,0x00,0x7f,0x77,0x39,0x05]
 
-v_trunc_f16 v255, v1 quad_perm:[3,2,1,0]
-// GFX12: v_trunc_f16_e64_dpp v255, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
+v_trunc_f16 v255.l, v127.l quad_perm:[3,2,1,0]
+// GFX12: v_trunc_f16_e64_dpp v255.l, v127.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x7f,0x1b,0x00,0xff]
 
-v_trunc_f16 v255, v127
-// GFX12: v_trunc_f16_e64 v255, v127              ; encoding: [0xff,0x00,0xdd,0xd5,0x7f,0x01,0x00,0x00]
+v_trunc_f16 v255.l, vcc_hi
+// GFX12: v_trunc_f16_e64 v255.l, vcc_hi          ; encoding: [0xff,0x00,0xdd,0xd5,0x6b,0x00,0x00,0x00]
 
-v_trunc_f16 v255, v127 dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_trunc_f16_e64_dpp v255, v127 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x00,0xdd,0xd5,0xe9,0x00,0x00,0x00,0x7f,0x77,0x39,0x05]
+v_trunc_f16 v255.l, vcc_lo
+// GFX12: v_trunc_f16_e64 v255.l, vcc_lo          ; encoding: [0xff,0x00,0xdd,0xd5,0x6a,0x00,0x00,0x00]
 
-v_trunc_f16 v255, v127 quad_perm:[3,2,1,0]
-// GFX12: v_trunc_f16_e64_dpp v255, v127 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x7f,0x1b,0x00,0xff]
+v_trunc_f16 v5.h, v199.h
+// GFX12: v_trunc_f16_e64 v5.h, v199.h op_sel:[1,1] ; encoding: [0x05,0x48,0xdd,0xd5,0xc7,0x01,0x00,0x00]
 
-v_trunc_f16 v255, vcc_hi
-// GFX12: v_trunc_f16_e64 v255, vcc_hi            ; encoding: [0xff,0x00,0xdd,0xd5,0x6b,0x00,0x00,0x00]
+v_trunc_f16 v5.h, v199.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_trunc_f16_e64_dpp v5.h, v199.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xdd,0xd5,0xe9,0x00,0x00,0x00,0xc7,0x77,0x39,0x05]
 
-v_trunc_f16 v255, vcc_lo
-// GFX12: v_trunc_f16_e64 v255, vcc_lo            ; encoding: [0xff,0x00,0xdd,0xd5,0x6a,0x00,0x00,0x00]
+v_trunc_f16 v5.h, v199.h quad_perm:[3,2,1,0]
+// GFX12: v_trunc_f16_e64_dpp v5.h, v199.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x48,0xdd,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff]
 
-v_trunc_f16 v5, v199
-// GFX12: v_trunc_f16_e64 v5, v199                ; encoding: [0x05,0x00,0xdd,0xd5,0xc7,0x01,0x00,0x00]
+v_trunc_f16 v5.l, v199.l
+// GFX12: v_trunc_f16_e64 v5.l, v199.l            ; encoding: [0x05,0x00,0xdd,0xd5,0xc7,0x01,0x00,0x00]
 
-v_trunc_f16 v5, v199 dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_trunc_f16_e64_dpp v5, v199 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdd,0xd5,0xe9,0x00,0x00,0x00,0xc7,0x77,0x39,0x05]
+v_trunc_f16 v5.l, v199.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_trunc_f16_e64_dpp v5.l, v199.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdd,0xd5,0xe9,0x00,0x00,0x00,0xc7,0x77,0x39,0x05]
 
-v_trunc_f16 v5, v199 quad_perm:[3,2,1,0]
-// GFX12: v_trunc_f16_e64_dpp v5, v199 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff]
+v_trunc_f16 v5.l, v199.l quad_perm:[3,2,1,0]
+// GFX12: v_trunc_f16_e64_dpp v5.l, v199.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0xc7,0x1b,0x00,0xff]
diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop3.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop3.s
index 0309b2e..3e7b7d2 100644
--- a/llvm/test/MC/AMDGPU/gfx12_asm_vop3.s
+++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop3.s
@@ -2234,50 +2234,62 @@ v_fma_dx9_zero_f32 v5, -src_scc, |vcc_lo|, -1 mul:4
 v_fma_dx9_zero_f32 v255, -|0xaf123456|, -|vcc_hi|, null clamp div:2
 // GFX12: v_fma_dx9_zero_f32 v255, -|0xaf123456|, -|vcc_hi|, null clamp div:2 ; encoding: [0xff,0x83,0x09,0xd6,0xff,0xd6,0xf0,0x79,0x56,0x34,0x12,0xaf]
 
-v_fma_f16 v5, v1, v2, s3
-// GFX12: v_fma_f16 v5, v1, v2, s3                ; encoding: [0x05,0x00,0x48,0xd6,0x01,0x05,0x0e,0x00]
+v_fma_f16 v5.l, v1.l, v2.l, s3
+// GFX12: v_fma_f16 v5.l, v1.l, v2.l, s3          ; encoding: [0x05,0x00,0x48,0xd6,0x01,0x05,0x0e,0x00]
 
-v_fma_f16 v5, v255, s2, s105
-// GFX12: v_fma_f16 v5, v255, s2, s105            ; encoding: [0x05,0x00,0x48,0xd6,0xff,0x05,0xa4,0x01]
+v_fma_f16 v5.l, v255.l, s2, s105
+// GFX12: v_fma_f16 v5.l, v255.l, s2, s105        ; encoding: [0x05,0x00,0x48,0xd6,0xff,0x05,0xa4,0x01]
 
-v_fma_f16 v5, s1, v255, exec_hi
-// GFX12: v_fma_f16 v5, s1, v255, exec_hi         ; encoding: [0x05,0x00,0x48,0xd6,0x01,0xfe,0xff,0x01]
+v_fma_f16 v5.l, s1, v255.l, exec_hi
+// GFX12: v_fma_f16 v5.l, s1, v255.l, exec_hi     ; encoding: [0x05,0x00,0x48,0xd6,0x01,0xfe,0xff,0x01]
 
-v_fma_f16 v5, s105, s105, exec_lo
-// GFX12: v_fma_f16 v5, s105, s105, exec_lo       ; encoding: [0x05,0x00,0x48,0xd6,0x69,0xd2,0xf8,0x01]
+v_fma_f16 v5.l, s105, s105, exec_lo
+// GFX12: v_fma_f16 v5.l, s105, s105, exec_lo     ; encoding: [0x05,0x00,0x48,0xd6,0x69,0xd2,0xf8,0x01]
 
-v_fma_f16 v5, vcc_lo, ttmp15, v3
-// GFX12: v_fma_f16 v5, vcc_lo, ttmp15, v3        ; encoding: [0x05,0x00,0x48,0xd6,0x6a,0xf6,0x0c,0x04]
+v_fma_f16 v5.l, vcc_lo, ttmp15, v3.l
+// GFX12: v_fma_f16 v5.l, vcc_lo, ttmp15, v3.l    ; encoding: [0x05,0x00,0x48,0xd6,0x6a,0xf6,0x0c,0x04]
 
-v_fma_f16 v5, vcc_hi, 0xfe0b, v255
-// GFX12: v_fma_f16 v5, vcc_hi, 0xfe0b, v255      ; encoding: [0x05,0x00,0x48,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
+v_fma_f16 v5.l, vcc_hi, 0xfe0b, v255.l
+// GFX12: v_fma_f16 v5.l, vcc_hi, 0xfe0b, v255.l  ; encoding: [0x05,0x00,0x48,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
 
-v_fma_f16 v5, -|ttmp15|, -|src_scc|, -|ttmp15|
-// GFX12: v_fma_f16 v5, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x48,0xd6,0x7b,0xfa,0xed,0xe1]
+v_fma_f16 v5.l, -|ttmp15|, -|src_scc|, -|ttmp15|
+// GFX12: v_fma_f16 v5.l, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x48,0xd6,0x7b,0xfa,0xed,0xe1]
 
-v_fma_f16 v5, m0, 0.5, m0
-// GFX12: v_fma_f16 v5, m0, 0.5, m0               ; encoding: [0x05,0x00,0x48,0xd6,0x7d,0xe0,0xf5,0x01]
+v_fma_f16 v5.l, m0, 0.5, m0
+// GFX12: v_fma_f16 v5.l, m0, 0.5, m0             ; encoding: [0x05,0x00,0x48,0xd6,0x7d,0xe0,0xf5,0x01]
 
-v_fma_f16 v5, |exec_lo|, -1, vcc_hi
-// GFX12: v_fma_f16 v5, |exec_lo|, -1, vcc_hi     ; encoding: [0x05,0x01,0x48,0xd6,0x7e,0x82,0xad,0x01]
+v_fma_f16 v5.l, |exec_lo|, -1, vcc_hi
+// GFX12: v_fma_f16 v5.l, |exec_lo|, -1, vcc_hi   ; encoding: [0x05,0x01,0x48,0xd6,0x7e,0x82,0xad,0x01]
 
-v_fma_f16 v5, -|exec_hi|, null, -|vcc_lo| op_sel:[1,1,1,1]
-// GFX12: v_fma_f16 v5, -|exec_hi|, null, -|vcc_lo| op_sel:[1,1,1,1] ; encoding: [0x05,0x7d,0x48,0xd6,0x7f,0xf8,0xa8,0xa1]
+v_fma_f16 v5.h, -|exec_hi|, null, -|vcc_lo| op_sel:[1,1,1,1]
+// GFX12: v_fma_f16 v5.h, -|exec_hi|, null, -|vcc_lo| op_sel:[1,1,1,1] ; encoding: [0x05,0x7d,0x48,0xd6,0x7f,0xf8,0xa8,0xa1]
 
-v_fma_f16 v5, null, exec_lo, -|0xfe0b| op_sel:[0,0,0,0]
-// GFX12: v_fma_f16 v5, null, exec_lo, -|0xfe0b|  ; encoding: [0x05,0x04,0x48,0xd6,0x7c,0xfc,0xfc,0x83,0x0b,0xfe,0x00,0x00]
+v_fma_f16 v5.l, null, exec_lo, -|0xfe0b| op_sel:[0,0,0,0]
+// GFX12: v_fma_f16 v5.l, null, exec_lo, -|0xfe0b| ; encoding: [0x05,0x04,0x48,0xd6,0x7c,0xfc,0xfc,0x83,0x0b,0xfe,0x00,0x00]
 
-v_fma_f16 v5, -1, -|exec_hi|, -|src_scc| op_sel:[1,0,0,0]
-// GFX12: v_fma_f16 v5, -1, -|exec_hi|, -|src_scc| op_sel:[1,0,0,0] ; encoding: [0x05,0x0e,0x48,0xd6,0xc1,0xfe,0xf4,0xc3]
+v_fma_f16 v5.l, -1, -|exec_hi|, -|src_scc| op_sel:[1,0,0,0]
+// GFX12: v_fma_f16 v5.l, -1, -|exec_hi|, -|src_scc| op_sel:[1,0,0,0] ; encoding: [0x05,0x0e,0x48,0xd6,0xc1,0xfe,0xf4,0xc3]
 
-v_fma_f16 v5, 0.5, -m0, 0.5 op_sel:[0,1,0,0]
-// GFX12: v_fma_f16 v5, 0.5, -m0, 0.5 op_sel:[0,1,0,0] ; encoding: [0x05,0x10,0x48,0xd6,0xf0,0xfa,0xc0,0x43]
+v_fma_f16 v5.l, 0.5, -m0, 0.5 op_sel:[0,1,0,0]
+// GFX12: v_fma_f16 v5.l, 0.5, -m0, 0.5 op_sel:[0,1,0,0] ; encoding: [0x05,0x10,0x48,0xd6,0xf0,0xfa,0xc0,0x43]
 
-v_fma_f16 v5, -src_scc, |vcc_lo|, -1 op_sel:[0,0,1,0]
-// GFX12: v_fma_f16 v5, -src_scc, |vcc_lo|, -1 op_sel:[0,0,1,0] ; encoding: [0x05,0x22,0x48,0xd6,0xfd,0xd4,0x04,0x23]
+v_fma_f16 v5.l, -src_scc, |vcc_lo|, -1 op_sel:[0,0,1,0]
+// GFX12: v_fma_f16 v5.l, -src_scc, |vcc_lo|, -1 op_sel:[0,0,1,0] ; encoding: [0x05,0x22,0x48,0xd6,0xfd,0xd4,0x04,0x23]
 
-v_fma_f16 v255, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp
-// GFX12: v_fma_f16 v255, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp ; encoding: [0xff,0xc3,0x48,0xd6,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00]
+v_fma_f16 v255.h, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp
+// GFX12: v_fma_f16 v255.h, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp ; encoding: [0xff,0xc3,0x48,0xd6,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00]
+
+v_fma_f16 v5.l, v255.h, s2, s105
+// GFX12: v_fma_f16 v5.l, v255.h, s2, s105 op_sel:[1,0,0,0] ; encoding: [0x05,0x08,0x48,0xd6,0xff,0x05,0xa4,0x01]
+
+v_fma_f16 v5.l, s1, v255.h, exec_hi
+// GFX12: v_fma_f16 v5.l, s1, v255.h, exec_hi op_sel:[0,1,0,0] ; encoding: [0x05,0x10,0x48,0xd6,0x01,0xfe,0xff,0x01]
+
+v_fma_f16 v5.l, vcc_hi, 0xfe0b, v255.h
+// GFX12: v_fma_f16 v5.l, vcc_hi, 0xfe0b, v255.h op_sel:[0,0,1,0] ; encoding: [0x05,0x20,0x48,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
+
+v_fma_f16 v255.h, -|0xfe0b|, -|vcc_hi|, null clamp
+// GFX12: v_fma_f16 v255.h, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp ; encoding: [0xff,0xc3,0x48,0xd6,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00]
 
 v_fma_f32 v5, v1, v2, s3
 // GFX12: v_fma_f32 v5, v1, v2, s3                ; encoding: [0x05,0x00,0x13,0xd6,0x01,0x05,0x0e,0x00]
@@ -3164,50 +3176,62 @@ v_mad_co_u64_u32 v[5:6], ttmp[14:15], src_scc, vcc_lo, src_scc
 v_mad_co_u64_u32 v[254:255], null, 0xaf123456, vcc_hi, 0.5 clamp
 // GFX12: v_mad_co_u64_u32 v[254:255], null, 0xaf123456, vcc_hi, 0.5 clamp ; encoding: [0xfe,0xfc,0xfe,0xd6,0xff,0xd6,0xc0,0x03,0x56,0x34,0x12,0xaf]
 
-v_max3_num_f16 v5, v1, v2, s3
-// GFX12: v_max3_num_f16 v5, v1, v2, s3           ; encoding: [0x05,0x00,0x2c,0xd6,0x01,0x05,0x0e,0x00]
+v_max3_num_f16 v5.l, v1.l, v2.l, s3
+// GFX12: v_max3_num_f16 v5.l, v1.l, v2.l, s3     ; encoding: [0x05,0x00,0x2c,0xd6,0x01,0x05,0x0e,0x00]
+
+v_max3_num_f16 v5.l, v255.l, s2, s105
+// GFX12: v_max3_num_f16 v5.l, v255.l, s2, s105   ; encoding: [0x05,0x00,0x2c,0xd6,0xff,0x05,0xa4,0x01]
 
-v_max3_num_f16 v5, v255, s2, s105
-// GFX12: v_max3_num_f16 v5, v255, s2, s105       ; encoding: [0x05,0x00,0x2c,0xd6,0xff,0x05,0xa4,0x01]
+v_max3_num_f16 v5.l, s1, v255.l, exec_hi
+// GFX12: v_max3_num_f16 v5.l, s1, v255.l, exec_hi ; encoding: [0x05,0x00,0x2c,0xd6,0x01,0xfe,0xff,0x01]
 
-v_max3_num_f16 v5, s1, v255, exec_hi
-// GFX12: v_max3_num_f16 v5, s1, v255, exec_hi    ; encoding: [0x05,0x00,0x2c,0xd6,0x01,0xfe,0xff,0x01]
+v_max3_num_f16 v5.l, s105, s105, exec_lo
+// GFX12: v_max3_num_f16 v5.l, s105, s105, exec_lo ; encoding: [0x05,0x00,0x2c,0xd6,0x69,0xd2,0xf8,0x01]
 
-v_max3_num_f16 v5, s105, s105, exec_lo
-// GFX12: v_max3_num_f16 v5, s105, s105, exec_lo  ; encoding: [0x05,0x00,0x2c,0xd6,0x69,0xd2,0xf8,0x01]
+v_max3_num_f16 v5.l, vcc_lo, ttmp15, v3.l
+// GFX12: v_max3_num_f16 v5.l, vcc_lo, ttmp15, v3.l ; encoding: [0x05,0x00,0x2c,0xd6,0x6a,0xf6,0x0c,0x04]
 
-v_max3_num_f16 v5, vcc_lo, ttmp15, v3
-// GFX12: v_max3_num_f16 v5, vcc_lo, ttmp15, v3   ; encoding: [0x05,0x00,0x2c,0xd6,0x6a,0xf6,0x0c,0x04]
+v_max3_num_f16 v5.l, vcc_hi, 0xfe0b, v255.l
+// GFX12: v_max3_num_f16 v5.l, vcc_hi, 0xfe0b, v255.l ; encoding: [0x05,0x00,0x2c,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
 
-v_max3_num_f16 v5, vcc_hi, 0xfe0b, v255
-// GFX12: v_max3_num_f16 v5, vcc_hi, 0xfe0b, v255 ; encoding: [0x05,0x00,0x2c,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
+v_max3_num_f16 v5.l, -|ttmp15|, -|src_scc|, -|ttmp15|
+// GFX12: v_max3_num_f16 v5.l, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x2c,0xd6,0x7b,0xfa,0xed,0xe1]
 
-v_max3_num_f16 v5, -|ttmp15|, -|src_scc|, -|ttmp15|
-// GFX12: v_max3_num_f16 v5, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x2c,0xd6,0x7b,0xfa,0xed,0xe1]
+v_max3_num_f16 v5.l, m0, 0.5, m0
+// GFX12: v_max3_num_f16 v5.l, m0, 0.5, m0        ; encoding: [0x05,0x00,0x2c,0xd6,0x7d,0xe0,0xf5,0x01]
 
-v_max3_num_f16 v5, m0, 0.5, m0
-// GFX12: v_max3_num_f16 v5, m0, 0.5, m0          ; encoding: [0x05,0x00,0x2c,0xd6,0x7d,0xe0,0xf5,0x01]
+v_max3_num_f16 v5.l, |exec_lo|, -1, vcc_hi
+// GFX12: v_max3_num_f16 v5.l, |exec_lo|, -1, vcc_hi ; encoding: [0x05,0x01,0x2c,0xd6,0x7e,0x82,0xad,0x01]
 
-v_max3_num_f16 v5, |exec_lo|, -1, vcc_hi
-// GFX12: v_max3_num_f16 v5, |exec_lo|, -1, vcc_hi ; encoding: [0x05,0x01,0x2c,0xd6,0x7e,0x82,0xad,0x01]
+v_max3_num_f16 v5.h, -|exec_hi|, null, -|vcc_lo| op_sel:[1,1,1,1]
+// GFX12: v_max3_num_f16 v5.h, -|exec_hi|, null, -|vcc_lo| op_sel:[1,1,1,1] ; encoding: [0x05,0x7d,0x2c,0xd6,0x7f,0xf8,0xa8,0xa1]
 
-v_max3_num_f16 v5, -|exec_hi|, null, -|vcc_lo| op_sel:[1,1,1,1]
-// GFX12: v_max3_num_f16 v5, -|exec_hi|, null, -|vcc_lo| op_sel:[1,1,1,1] ; encoding: [0x05,0x7d,0x2c,0xd6,0x7f,0xf8,0xa8,0xa1]
+v_max3_num_f16 v5.l, null, exec_lo, -|0xfe0b| op_sel:[0,0,0,0]
+// GFX12: v_max3_num_f16 v5.l, null, exec_lo, -|0xfe0b| ; encoding: [0x05,0x04,0x2c,0xd6,0x7c,0xfc,0xfc,0x83,0x0b,0xfe,0x00,0x00]
 
-v_max3_num_f16 v5, null, exec_lo, -|0xfe0b| op_sel:[0,0,0,0]
-// GFX12: v_max3_num_f16 v5, null, exec_lo, -|0xfe0b| ; encoding: [0x05,0x04,0x2c,0xd6,0x7c,0xfc,0xfc,0x83,0x0b,0xfe,0x00,0x00]
+v_max3_num_f16 v5.l, -1, -|exec_hi|, -|src_scc| op_sel:[1,0,0,0]
+// GFX12: v_max3_num_f16 v5.l, -1, -|exec_hi|, -|src_scc| op_sel:[1,0,0,0] ; encoding: [0x05,0x0e,0x2c,0xd6,0xc1,0xfe,0xf4,0xc3]
 
-v_max3_num_f16 v5, -1, -|exec_hi|, -|src_scc| op_sel:[1,0,0,0]
-// GFX12: v_max3_num_f16 v5, -1, -|exec_hi|, -|src_scc| op_sel:[1,0,0,0] ; encoding: [0x05,0x0e,0x2c,0xd6,0xc1,0xfe,0xf4,0xc3]
+v_max3_num_f16 v5.l, 0.5, -m0, 0.5 op_sel:[0,1,0,0]
+// GFX12: v_max3_num_f16 v5.l, 0.5, -m0, 0.5 op_sel:[0,1,0,0] ; encoding: [0x05,0x10,0x2c,0xd6,0xf0,0xfa,0xc0,0x43]
 
-v_max3_num_f16 v5, 0.5, -m0, 0.5 op_sel:[0,1,0,0]
-// GFX12: v_max3_num_f16 v5, 0.5, -m0, 0.5 op_sel:[0,1,0,0] ; encoding: [0x05,0x10,0x2c,0xd6,0xf0,0xfa,0xc0,0x43]
+v_max3_num_f16 v5.l, -src_scc, |vcc_lo|, -1 op_sel:[0,0,1,0]
+// GFX12: v_max3_num_f16 v5.l, -src_scc, |vcc_lo|, -1 op_sel:[0,0,1,0] ; encoding: [0x05,0x22,0x2c,0xd6,0xfd,0xd4,0x04,0x23]
 
-v_max3_num_f16 v5, -src_scc, |vcc_lo|, -1 op_sel:[0,0,1,0]
-// GFX12: v_max3_num_f16 v5, -src_scc, |vcc_lo|, -1 op_sel:[0,0,1,0] ; encoding: [0x05,0x22,0x2c,0xd6,0xfd,0xd4,0x04,0x23]
+v_max3_num_f16 v255.h, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp
+// GFX12: v_max3_num_f16 v255.h, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp ; encoding: [0xff,0xc3,0x2c,0xd6,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00]
 
-v_max3_num_f16 v255, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp
-// GFX12: v_max3_num_f16 v255, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp ; encoding: [0xff,0xc3,0x2c,0xd6,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00]
+v_max3_num_f16 v5.l, v255.h, s2, s105
+// GFX12: v_max3_num_f16 v5.l, v255.h, s2, s105 op_sel:[1,0,0,0] ; encoding: [0x05,0x08,0x2c,0xd6,0xff,0x05,0xa4,0x01]
+
+v_max3_num_f16 v5.l, s1, v255.h, exec_hi
+// GFX12: v_max3_num_f16 v5.l, s1, v255.h, exec_hi op_sel:[0,1,0,0] ; encoding: [0x05,0x10,0x2c,0xd6,0x01,0xfe,0xff,0x01]
+
+v_max3_num_f16 v5.l, vcc_hi, 0xfe0b, v255.h
+// GFX12: v_max3_num_f16 v5.l, vcc_hi, 0xfe0b, v255.h op_sel:[0,0,1,0] ; encoding: [0x05,0x20,0x2c,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
+
+v_max3_num_f16 v255.h, -|0xfe0b|, -|vcc_hi|, null clamp
+// GFX12: v_max3_num_f16 v255.h, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp ; encoding: [0xff,0xc3,0x2c,0xd6,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00]
 
 v_max3_num_f32 v5, v1, v2, s3
 // GFX12: v_max3_num_f32 v5, v1, v2, s3           ; encoding: [0x05,0x00,0x2a,0xd6,0x01,0x05,0x0e,0x00]
@@ -3566,50 +3590,62 @@ v_max_u16 v255.l, 0xfe0b, vcc_hi
 v_max_u16 v255.h, 0xfe0b, vcc_hi
 // GFX12: v_max_u16 v255.h, 0xfe0b, vcc_hi op_sel:[0,0,1] ; encoding: [0xff,0x40,0x09,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
 
-v_maxmin_num_f16 v5, v1, v2, s3
-// GFX12: v_maxmin_num_f16 v5, v1, v2, s3         ; encoding: [0x05,0x00,0x6b,0xd6,0x01,0x05,0x0e,0x00]
+v_maxmin_num_f16 v5.l, v1.l, v2.l, s3
+// GFX12: v_maxmin_num_f16 v5.l, v1.l, v2.l, s3   ; encoding: [0x05,0x00,0x6b,0xd6,0x01,0x05,0x0e,0x00]
+
+v_maxmin_num_f16 v5.l, v255.l, s2, s105
+// GFX12: v_maxmin_num_f16 v5.l, v255.l, s2, s105 ; encoding: [0x05,0x00,0x6b,0xd6,0xff,0x05,0xa4,0x01]
+
+v_maxmin_num_f16 v5.l, s1, v255.l, exec_hi
+// GFX12: v_maxmin_num_f16 v5.l, s1, v255.l, exec_hi ; encoding: [0x05,0x00,0x6b,0xd6,0x01,0xfe,0xff,0x01]
 
-v_maxmin_num_f16 v5, v255, s2, s105
-// GFX12: v_maxmin_num_f16 v5, v255, s2, s105     ; encoding: [0x05,0x00,0x6b,0xd6,0xff,0x05,0xa4,0x01]
+v_maxmin_num_f16 v5.l, s105, s105, exec_lo
+// GFX12: v_maxmin_num_f16 v5.l, s105, s105, exec_lo ; encoding: [0x05,0x00,0x6b,0xd6,0x69,0xd2,0xf8,0x01]
 
-v_maxmin_num_f16 v5, s1, v255, exec_hi
-// GFX12: v_maxmin_num_f16 v5, s1, v255, exec_hi  ; encoding: [0x05,0x00,0x6b,0xd6,0x01,0xfe,0xff,0x01]
+v_maxmin_num_f16 v5.l, vcc_lo, ttmp15, v3.l
+// GFX12: v_maxmin_num_f16 v5.l, vcc_lo, ttmp15, v3.l ; encoding: [0x05,0x00,0x6b,0xd6,0x6a,0xf6,0x0c,0x04]
 
-v_maxmin_num_f16 v5, s105, s105, exec_lo
-// GFX12: v_maxmin_num_f16 v5, s105, s105, exec_lo ; encoding: [0x05,0x00,0x6b,0xd6,0x69,0xd2,0xf8,0x01]
+v_maxmin_num_f16 v5.l, vcc_hi, 0xfe0b, v255.l
+// GFX12: v_maxmin_num_f16 v5.l, vcc_hi, 0xfe0b, v255.l ; encoding: [0x05,0x00,0x6b,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
 
-v_maxmin_num_f16 v5, vcc_lo, ttmp15, v3
-// GFX12: v_maxmin_num_f16 v5, vcc_lo, ttmp15, v3 ; encoding: [0x05,0x00,0x6b,0xd6,0x6a,0xf6,0x0c,0x04]
+v_maxmin_num_f16 v5.l, -|ttmp15|, -|src_scc|, -|ttmp15|
+// GFX12: v_maxmin_num_f16 v5.l, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x6b,0xd6,0x7b,0xfa,0xed,0xe1]
 
-v_maxmin_num_f16 v5, vcc_hi, 0xfe0b, v255
-// GFX12: v_maxmin_num_f16 v5, vcc_hi, 0xfe0b, v255 ; encoding: [0x05,0x00,0x6b,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
+v_maxmin_num_f16 v5.l, m0, 0.5, m0
+// GFX12: v_maxmin_num_f16 v5.l, m0, 0.5, m0      ; encoding: [0x05,0x00,0x6b,0xd6,0x7d,0xe0,0xf5,0x01]
 
-v_maxmin_num_f16 v5, -|ttmp15|, -|src_scc|, -|ttmp15|
-// GFX12: v_maxmin_num_f16 v5, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x6b,0xd6,0x7b,0xfa,0xed,0xe1]
+v_maxmin_num_f16 v5.l, |exec_lo|, -1, vcc_hi
+// GFX12: v_maxmin_num_f16 v5.l, |exec_lo|, -1, vcc_hi ; encoding: [0x05,0x01,0x6b,0xd6,0x7e,0x82,0xad,0x01]
 
-v_maxmin_num_f16 v5, m0, 0.5, m0
-// GFX12: v_maxmin_num_f16 v5, m0, 0.5, m0        ; encoding: [0x05,0x00,0x6b,0xd6,0x7d,0xe0,0xf5,0x01]
+v_maxmin_num_f16 v5.l, -|exec_hi|, null, -|vcc_lo|
+// GFX12: v_maxmin_num_f16 v5.l, -|exec_hi|, null, -|vcc_lo| ; encoding: [0x05,0x05,0x6b,0xd6,0x7f,0xf8,0xa8,0xa1]
 
-v_maxmin_num_f16 v5, |exec_lo|, -1, vcc_hi
-// GFX12: v_maxmin_num_f16 v5, |exec_lo|, -1, vcc_hi ; encoding: [0x05,0x01,0x6b,0xd6,0x7e,0x82,0xad,0x01]
+v_maxmin_num_f16 v5.l, null, exec_lo, -|0xfe0b|
+// GFX12: v_maxmin_num_f16 v5.l, null, exec_lo, -|0xfe0b| ; encoding: [0x05,0x04,0x6b,0xd6,0x7c,0xfc,0xfc,0x83,0x0b,0xfe,0x00,0x00]
 
-v_maxmin_num_f16 v5, -|exec_hi|, null, -|vcc_lo|
-// GFX12: v_maxmin_num_f16 v5, -|exec_hi|, null, -|vcc_lo| ; encoding: [0x05,0x05,0x6b,0xd6,0x7f,0xf8,0xa8,0xa1]
+v_maxmin_num_f16 v5.l, -1, -|exec_hi|, -|src_scc|
+// GFX12: v_maxmin_num_f16 v5.l, -1, -|exec_hi|, -|src_scc| ; encoding: [0x05,0x06,0x6b,0xd6,0xc1,0xfe,0xf4,0xc3]
 
-v_maxmin_num_f16 v5, null, exec_lo, -|0xfe0b|
-// GFX12: v_maxmin_num_f16 v5, null, exec_lo, -|0xfe0b| ; encoding: [0x05,0x04,0x6b,0xd6,0x7c,0xfc,0xfc,0x83,0x0b,0xfe,0x00,0x00]
+v_maxmin_num_f16 v5.l, 0.5, -m0, 0.5 mul:2
+// GFX12: v_maxmin_num_f16 v5.l, 0.5, -m0, 0.5 mul:2 ; encoding: [0x05,0x00,0x6b,0xd6,0xf0,0xfa,0xc0,0x4b]
 
-v_maxmin_num_f16 v5, -1, -|exec_hi|, -|src_scc|
-// GFX12: v_maxmin_num_f16 v5, -1, -|exec_hi|, -|src_scc| ; encoding: [0x05,0x06,0x6b,0xd6,0xc1,0xfe,0xf4,0xc3]
+v_maxmin_num_f16 v5.l, -src_scc, |vcc_lo|, -1 mul:4
+// GFX12: v_maxmin_num_f16 v5.l, -src_scc, |vcc_lo|, -1 mul:4 ; encoding: [0x05,0x02,0x6b,0xd6,0xfd,0xd4,0x04,0x33]
 
-v_maxmin_num_f16 v5, 0.5, -m0, 0.5 mul:2
-// GFX12: v_maxmin_num_f16 v5, 0.5, -m0, 0.5 mul:2 ; encoding: [0x05,0x00,0x6b,0xd6,0xf0,0xfa,0xc0,0x4b]
+v_maxmin_num_f16 v255.l, -|0xfe0b|, -|vcc_hi|, null clamp div:2
+// GFX12: v_maxmin_num_f16 v255.l, -|0xfe0b|, -|vcc_hi|, null clamp div:2 ; encoding: [0xff,0x83,0x6b,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00]
 
-v_maxmin_num_f16 v5, -src_scc, |vcc_lo|, -1 mul:4
-// GFX12: v_maxmin_num_f16 v5, -src_scc, |vcc_lo|, -1 mul:4 ; encoding: [0x05,0x02,0x6b,0xd6,0xfd,0xd4,0x04,0x33]
+v_maxmin_num_f16 v5.l, v255.h, s2, s105
+// GFX12: v_maxmin_num_f16 v5.l, v255.h, s2, s105 op_sel:[1,0,0,0] ; encoding: [0x05,0x08,0x6b,0xd6,0xff,0x05,0xa4,0x01]
 
-v_maxmin_num_f16 v255, -|0xfe0b|, -|vcc_hi|, null clamp div:2
-// GFX12: v_maxmin_num_f16 v255, -|0xfe0b|, -|vcc_hi|, null clamp div:2 ; encoding: [0xff,0x83,0x6b,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00]
+v_maxmin_num_f16 v5.l, s1, v255.h, exec_hi
+// GFX12: v_maxmin_num_f16 v5.l, s1, v255.h, exec_hi op_sel:[0,1,0,0] ; encoding: [0x05,0x10,0x6b,0xd6,0x01,0xfe,0xff,0x01]
+
+v_maxmin_num_f16 v5.l, vcc_hi, 0xfe0b, v255.h
+// GFX12: v_maxmin_num_f16 v5.l, vcc_hi, 0xfe0b, v255.h op_sel:[0,0,1,0] ; encoding: [0x05,0x20,0x6b,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
+
+v_maxmin_num_f16 v255.h, -|0xfe0b|, -|vcc_hi|, null clamp div:2
+// GFX12: v_maxmin_num_f16 v255.h, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp div:2 ; encoding: [0xff,0xc3,0x6b,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00]
 
 v_maxmin_num_f32 v5, v1, v2, s3
 // GFX12: v_maxmin_num_f32 v5, v1, v2, s3         ; encoding: [0x05,0x00,0x69,0xd6,0x01,0x05,0x0e,0x00]
@@ -4142,50 +4178,62 @@ v_med3_u32 v5, src_scc, vcc_lo, -1
 v_med3_u32 v255, 0xaf123456, vcc_hi, null
 // GFX12: v_med3_u32 v255, 0xaf123456, vcc_hi, null ; encoding: [0xff,0x00,0x21,0xd6,0xff,0xd6,0xf0,0x01,0x56,0x34,0x12,0xaf]
 
-v_min3_num_f16 v5, v1, v2, s3
-// GFX12: v_min3_num_f16 v5, v1, v2, s3           ; encoding: [0x05,0x00,0x2b,0xd6,0x01,0x05,0x0e,0x00]
+v_min3_num_f16 v5.l, v1.l, v2.l, s3
+// GFX12: v_min3_num_f16 v5.l, v1.l, v2.l, s3     ; encoding: [0x05,0x00,0x2b,0xd6,0x01,0x05,0x0e,0x00]
+
+v_min3_num_f16 v5.l, v255.l, s2, s105
+// GFX12: v_min3_num_f16 v5.l, v255.l, s2, s105   ; encoding: [0x05,0x00,0x2b,0xd6,0xff,0x05,0xa4,0x01]
+
+v_min3_num_f16 v5.l, s1, v255.l, exec_hi
+// GFX12: v_min3_num_f16 v5.l, s1, v255.l, exec_hi ; encoding: [0x05,0x00,0x2b,0xd6,0x01,0xfe,0xff,0x01]
+
+v_min3_num_f16 v5.l, s105, s105, exec_lo
+// GFX12: v_min3_num_f16 v5.l, s105, s105, exec_lo ; encoding: [0x05,0x00,0x2b,0xd6,0x69,0xd2,0xf8,0x01]
 
-v_min3_num_f16 v5, v255, s2, s105
-// GFX12: v_min3_num_f16 v5, v255, s2, s105       ; encoding: [0x05,0x00,0x2b,0xd6,0xff,0x05,0xa4,0x01]
+v_min3_num_f16 v5.l, vcc_lo, ttmp15, v3.l
+// GFX12: v_min3_num_f16 v5.l, vcc_lo, ttmp15, v3.l ; encoding: [0x05,0x00,0x2b,0xd6,0x6a,0xf6,0x0c,0x04]
 
-v_min3_num_f16 v5, s1, v255, exec_hi
-// GFX12: v_min3_num_f16 v5, s1, v255, exec_hi    ; encoding: [0x05,0x00,0x2b,0xd6,0x01,0xfe,0xff,0x01]
+v_min3_num_f16 v5.l, vcc_hi, 0xfe0b, v255.l
+// GFX12: v_min3_num_f16 v5.l, vcc_hi, 0xfe0b, v255.l ; encoding: [0x05,0x00,0x2b,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
 
-v_min3_num_f16 v5, s105, s105, exec_lo
-// GFX12: v_min3_num_f16 v5, s105, s105, exec_lo  ; encoding: [0x05,0x00,0x2b,0xd6,0x69,0xd2,0xf8,0x01]
+v_min3_num_f16 v5.l, -|ttmp15|, -|src_scc|, -|ttmp15|
+// GFX12: v_min3_num_f16 v5.l, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x2b,0xd6,0x7b,0xfa,0xed,0xe1]
 
-v_min3_num_f16 v5, vcc_lo, ttmp15, v3
-// GFX12: v_min3_num_f16 v5, vcc_lo, ttmp15, v3   ; encoding: [0x05,0x00,0x2b,0xd6,0x6a,0xf6,0x0c,0x04]
+v_min3_num_f16 v5.l, m0, 0.5, m0
+// GFX12: v_min3_num_f16 v5.l, m0, 0.5, m0        ; encoding: [0x05,0x00,0x2b,0xd6,0x7d,0xe0,0xf5,0x01]
 
-v_min3_num_f16 v5, vcc_hi, 0xfe0b, v255
-// GFX12: v_min3_num_f16 v5, vcc_hi, 0xfe0b, v255 ; encoding: [0x05,0x00,0x2b,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
+v_min3_num_f16 v5.l, |exec_lo|, -1, vcc_hi
+// GFX12: v_min3_num_f16 v5.l, |exec_lo|, -1, vcc_hi ; encoding: [0x05,0x01,0x2b,0xd6,0x7e,0x82,0xad,0x01]
 
-v_min3_num_f16 v5, -|ttmp15|, -|src_scc|, -|ttmp15|
-// GFX12: v_min3_num_f16 v5, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x2b,0xd6,0x7b,0xfa,0xed,0xe1]
+v_min3_num_f16 v5.h, -|exec_hi|, null, -|vcc_lo| op_sel:[1,1,1,1]
+// GFX12: v_min3_num_f16 v5.h, -|exec_hi|, null, -|vcc_lo| op_sel:[1,1,1,1] ; encoding: [0x05,0x7d,0x2b,0xd6,0x7f,0xf8,0xa8,0xa1]
 
-v_min3_num_f16 v5, m0, 0.5, m0
-// GFX12: v_min3_num_f16 v5, m0, 0.5, m0          ; encoding: [0x05,0x00,0x2b,0xd6,0x7d,0xe0,0xf5,0x01]
+v_min3_num_f16 v5.l, null, exec_lo, -|0xfe0b| op_sel:[0,0,0,0]
+// GFX12: v_min3_num_f16 v5.l, null, exec_lo, -|0xfe0b| ; encoding: [0x05,0x04,0x2b,0xd6,0x7c,0xfc,0xfc,0x83,0x0b,0xfe,0x00,0x00]
 
-v_min3_num_f16 v5, |exec_lo|, -1, vcc_hi
-// GFX12: v_min3_num_f16 v5, |exec_lo|, -1, vcc_hi ; encoding: [0x05,0x01,0x2b,0xd6,0x7e,0x82,0xad,0x01]
+v_min3_num_f16 v5.l, -1, -|exec_hi|, -|src_scc| op_sel:[1,0,0,0]
+// GFX12: v_min3_num_f16 v5.l, -1, -|exec_hi|, -|src_scc| op_sel:[1,0,0,0] ; encoding: [0x05,0x0e,0x2b,0xd6,0xc1,0xfe,0xf4,0xc3]
 
-v_min3_num_f16 v5, -|exec_hi|, null, -|vcc_lo| op_sel:[1,1,1,1]
-// GFX12: v_min3_num_f16 v5, -|exec_hi|, null, -|vcc_lo| op_sel:[1,1,1,1] ; encoding: [0x05,0x7d,0x2b,0xd6,0x7f,0xf8,0xa8,0xa1]
+v_min3_num_f16 v5.l, 0.5, -m0, 0.5 op_sel:[0,1,0,0]
+// GFX12: v_min3_num_f16 v5.l, 0.5, -m0, 0.5 op_sel:[0,1,0,0] ; encoding: [0x05,0x10,0x2b,0xd6,0xf0,0xfa,0xc0,0x43]
 
-v_min3_num_f16 v5, null, exec_lo, -|0xfe0b| op_sel:[0,0,0,0]
-// GFX12: v_min3_num_f16 v5, null, exec_lo, -|0xfe0b| ; encoding: [0x05,0x04,0x2b,0xd6,0x7c,0xfc,0xfc,0x83,0x0b,0xfe,0x00,0x00]
+v_min3_num_f16 v5.l, -src_scc, |vcc_lo|, -1 op_sel:[0,0,1,0]
+// GFX12: v_min3_num_f16 v5.l, -src_scc, |vcc_lo|, -1 op_sel:[0,0,1,0] ; encoding: [0x05,0x22,0x2b,0xd6,0xfd,0xd4,0x04,0x23]
 
-v_min3_num_f16 v5, -1, -|exec_hi|, -|src_scc| op_sel:[1,0,0,0]
-// GFX12: v_min3_num_f16 v5, -1, -|exec_hi|, -|src_scc| op_sel:[1,0,0,0] ; encoding: [0x05,0x0e,0x2b,0xd6,0xc1,0xfe,0xf4,0xc3]
+v_min3_num_f16 v255.h, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp
+// GFX12: v_min3_num_f16 v255.h, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp ; encoding: [0xff,0xc3,0x2b,0xd6,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00]
 
-v_min3_num_f16 v5, 0.5, -m0, 0.5 op_sel:[0,1,0,0]
-// GFX12: v_min3_num_f16 v5, 0.5, -m0, 0.5 op_sel:[0,1,0,0] ; encoding: [0x05,0x10,0x2b,0xd6,0xf0,0xfa,0xc0,0x43]
+v_min3_num_f16 v5.l, v255.h, s2, s105
+// GFX12: v_min3_num_f16 v5.l, v255.h, s2, s105 op_sel:[1,0,0,0] ; encoding: [0x05,0x08,0x2b,0xd6,0xff,0x05,0xa4,0x01]
 
-v_min3_num_f16 v5, -src_scc, |vcc_lo|, -1 op_sel:[0,0,1,0]
-// GFX12: v_min3_num_f16 v5, -src_scc, |vcc_lo|, -1 op_sel:[0,0,1,0] ; encoding: [0x05,0x22,0x2b,0xd6,0xfd,0xd4,0x04,0x23]
+v_min3_num_f16 v5.l, s1, v255.h, exec_hi
+// GFX12: v_min3_num_f16 v5.l, s1, v255.h, exec_hi op_sel:[0,1,0,0] ; encoding: [0x05,0x10,0x2b,0xd6,0x01,0xfe,0xff,0x01]
 
-v_min3_num_f16 v255, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp
-// GFX12: v_min3_num_f16 v255, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp ; encoding: [0xff,0xc3,0x2b,0xd6,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00]
+v_min3_num_f16 v5.l, vcc_hi, 0xfe0b, v255.h
+// GFX12: v_min3_num_f16 v5.l, vcc_hi, 0xfe0b, v255.h op_sel:[0,0,1,0] ; encoding: [0x05,0x20,0x2b,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
+
+v_min3_num_f16 v255.h, -|0xfe0b|, -|vcc_hi|, null clamp
+// GFX12: v_min3_num_f16 v255.h, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp ; encoding: [0xff,0xc3,0x2b,0xd6,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00]
 
 v_min3_num_f32 v5, v1, v2, s3
 // GFX12: v_min3_num_f32 v5, v1, v2, s3           ; encoding: [0x05,0x00,0x29,0xd6,0x01,0x05,0x0e,0x00]
@@ -4544,50 +4592,62 @@ v_min_u16 v255.l, 0xfe0b, vcc_hi
 v_min_u16 v255.h, 0xfe0b, vcc_hi
 // GFX12: v_min_u16 v255.h, 0xfe0b, vcc_hi op_sel:[0,0,1] ; encoding: [0xff,0x40,0x0b,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
 
-v_minmax_num_f16 v5, v1, v2, s3
-// GFX12: v_minmax_num_f16 v5, v1, v2, s3         ; encoding: [0x05,0x00,0x6a,0xd6,0x01,0x05,0x0e,0x00]
+v_minmax_num_f16 v5.l, v1.l, v2.l, s3
+// GFX12: v_minmax_num_f16 v5.l, v1.l, v2.l, s3   ; encoding: [0x05,0x00,0x6a,0xd6,0x01,0x05,0x0e,0x00]
+
+v_minmax_num_f16 v5.l, v255.l, s2, s105
+// GFX12: v_minmax_num_f16 v5.l, v255.l, s2, s105 ; encoding: [0x05,0x00,0x6a,0xd6,0xff,0x05,0xa4,0x01]
+
+v_minmax_num_f16 v5.l, s1, v255.l, exec_hi
+// GFX12: v_minmax_num_f16 v5.l, s1, v255.l, exec_hi ; encoding: [0x05,0x00,0x6a,0xd6,0x01,0xfe,0xff,0x01]
+
+v_minmax_num_f16 v5.l, s105, s105, exec_lo
+// GFX12: v_minmax_num_f16 v5.l, s105, s105, exec_lo ; encoding: [0x05,0x00,0x6a,0xd6,0x69,0xd2,0xf8,0x01]
+
+v_minmax_num_f16 v5.l, vcc_lo, ttmp15, v3.l
+// GFX12: v_minmax_num_f16 v5.l, vcc_lo, ttmp15, v3.l ; encoding: [0x05,0x00,0x6a,0xd6,0x6a,0xf6,0x0c,0x04]
 
-v_minmax_num_f16 v5, v255, s2, s105
-// GFX12: v_minmax_num_f16 v5, v255, s2, s105     ; encoding: [0x05,0x00,0x6a,0xd6,0xff,0x05,0xa4,0x01]
+v_minmax_num_f16 v5.l, vcc_hi, 0xfe0b, v255.l
+// GFX12: v_minmax_num_f16 v5.l, vcc_hi, 0xfe0b, v255.l ; encoding: [0x05,0x00,0x6a,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
 
-v_minmax_num_f16 v5, s1, v255, exec_hi
-// GFX12: v_minmax_num_f16 v5, s1, v255, exec_hi  ; encoding: [0x05,0x00,0x6a,0xd6,0x01,0xfe,0xff,0x01]
+v_minmax_num_f16 v5.l, -|ttmp15|, -|src_scc|, -|ttmp15|
+// GFX12: v_minmax_num_f16 v5.l, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x6a,0xd6,0x7b,0xfa,0xed,0xe1]
 
-v_minmax_num_f16 v5, s105, s105, exec_lo
-// GFX12: v_minmax_num_f16 v5, s105, s105, exec_lo ; encoding: [0x05,0x00,0x6a,0xd6,0x69,0xd2,0xf8,0x01]
+v_minmax_num_f16 v5.l, m0, 0.5, m0
+// GFX12: v_minmax_num_f16 v5.l, m0, 0.5, m0      ; encoding: [0x05,0x00,0x6a,0xd6,0x7d,0xe0,0xf5,0x01]
 
-v_minmax_num_f16 v5, vcc_lo, ttmp15, v3
-// GFX12: v_minmax_num_f16 v5, vcc_lo, ttmp15, v3 ; encoding: [0x05,0x00,0x6a,0xd6,0x6a,0xf6,0x0c,0x04]
+v_minmax_num_f16 v5.l, |exec_lo|, -1, vcc_hi
+// GFX12: v_minmax_num_f16 v5.l, |exec_lo|, -1, vcc_hi ; encoding: [0x05,0x01,0x6a,0xd6,0x7e,0x82,0xad,0x01]
 
-v_minmax_num_f16 v5, vcc_hi, 0xfe0b, v255
-// GFX12: v_minmax_num_f16 v5, vcc_hi, 0xfe0b, v255 ; encoding: [0x05,0x00,0x6a,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
+v_minmax_num_f16 v5.l, -|exec_hi|, null, -|vcc_lo|
+// GFX12: v_minmax_num_f16 v5.l, -|exec_hi|, null, -|vcc_lo| ; encoding: [0x05,0x05,0x6a,0xd6,0x7f,0xf8,0xa8,0xa1]
 
-v_minmax_num_f16 v5, -|ttmp15|, -|src_scc|, -|ttmp15|
-// GFX12: v_minmax_num_f16 v5, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x6a,0xd6,0x7b,0xfa,0xed,0xe1]
+v_minmax_num_f16 v5.l, null, exec_lo, -|0xfe0b|
+// GFX12: v_minmax_num_f16 v5.l, null, exec_lo, -|0xfe0b| ; encoding: [0x05,0x04,0x6a,0xd6,0x7c,0xfc,0xfc,0x83,0x0b,0xfe,0x00,0x00]
 
-v_minmax_num_f16 v5, m0, 0.5, m0
-// GFX12: v_minmax_num_f16 v5, m0, 0.5, m0        ; encoding: [0x05,0x00,0x6a,0xd6,0x7d,0xe0,0xf5,0x01]
+v_minmax_num_f16 v5.l, -1, -|exec_hi|, -|src_scc|
+// GFX12: v_minmax_num_f16 v5.l, -1, -|exec_hi|, -|src_scc| ; encoding: [0x05,0x06,0x6a,0xd6,0xc1,0xfe,0xf4,0xc3]
 
-v_minmax_num_f16 v5, |exec_lo|, -1, vcc_hi
-// GFX12: v_minmax_num_f16 v5, |exec_lo|, -1, vcc_hi ; encoding: [0x05,0x01,0x6a,0xd6,0x7e,0x82,0xad,0x01]
+v_minmax_num_f16 v5.l, 0.5, -m0, 0.5 mul:2
+// GFX12: v_minmax_num_f16 v5.l, 0.5, -m0, 0.5 mul:2 ; encoding: [0x05,0x00,0x6a,0xd6,0xf0,0xfa,0xc0,0x4b]
 
-v_minmax_num_f16 v5, -|exec_hi|, null, -|vcc_lo|
-// GFX12: v_minmax_num_f16 v5, -|exec_hi|, null, -|vcc_lo| ; encoding: [0x05,0x05,0x6a,0xd6,0x7f,0xf8,0xa8,0xa1]
+v_minmax_num_f16 v5.l, -src_scc, |vcc_lo|, -1 mul:4
+// GFX12: v_minmax_num_f16 v5.l, -src_scc, |vcc_lo|, -1 mul:4 ; encoding: [0x05,0x02,0x6a,0xd6,0xfd,0xd4,0x04,0x33]
 
-v_minmax_num_f16 v5, null, exec_lo, -|0xfe0b|
-// GFX12: v_minmax_num_f16 v5, null, exec_lo, -|0xfe0b| ; encoding: [0x05,0x04,0x6a,0xd6,0x7c,0xfc,0xfc,0x83,0x0b,0xfe,0x00,0x00]
+v_minmax_num_f16 v255.l, -|0xfe0b|, -|vcc_hi|, null clamp div:2
+// GFX12: v_minmax_num_f16 v255.l, -|0xfe0b|, -|vcc_hi|, null clamp div:2 ; encoding: [0xff,0x83,0x6a,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00]
 
-v_minmax_num_f16 v5, -1, -|exec_hi|, -|src_scc|
-// GFX12: v_minmax_num_f16 v5, -1, -|exec_hi|, -|src_scc| ; encoding: [0x05,0x06,0x6a,0xd6,0xc1,0xfe,0xf4,0xc3]
+v_minmax_num_f16 v5.l, v255.h, s2, s105
+// GFX12: v_minmax_num_f16 v5.l, v255.h, s2, s105 op_sel:[1,0,0,0] ; encoding: [0x05,0x08,0x6a,0xd6,0xff,0x05,0xa4,0x01]
 
-v_minmax_num_f16 v5, 0.5, -m0, 0.5 mul:2
-// GFX12: v_minmax_num_f16 v5, 0.5, -m0, 0.5 mul:2 ; encoding: [0x05,0x00,0x6a,0xd6,0xf0,0xfa,0xc0,0x4b]
+v_minmax_num_f16 v5.l, s1, v255.h, exec_hi
+// GFX12: v_minmax_num_f16 v5.l, s1, v255.h, exec_hi op_sel:[0,1,0,0] ; encoding: [0x05,0x10,0x6a,0xd6,0x01,0xfe,0xff,0x01]
 
-v_minmax_num_f16 v5, -src_scc, |vcc_lo|, -1 mul:4
-// GFX12: v_minmax_num_f16 v5, -src_scc, |vcc_lo|, -1 mul:4 ; encoding: [0x05,0x02,0x6a,0xd6,0xfd,0xd4,0x04,0x33]
+v_minmax_num_f16 v5.l, vcc_hi, 0xfe0b, v255.h
+// GFX12: v_minmax_num_f16 v5.l, vcc_hi, 0xfe0b, v255.h op_sel:[0,0,1,0] ; encoding: [0x05,0x20,0x6a,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
 
-v_minmax_num_f16 v255, -|0xfe0b|, -|vcc_hi|, null clamp div:2
-// GFX12: v_minmax_num_f16 v255, -|0xfe0b|, -|vcc_hi|, null clamp div:2 ; encoding: [0xff,0x83,0x6a,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00]
+v_minmax_num_f16 v255.h, -|0xfe0b|, -|vcc_hi|, null clamp div:2
+// GFX12: v_minmax_num_f16 v255.h, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp div:2 ; encoding: [0xff,0xc3,0x6a,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00]
 
 v_minmax_num_f32 v5, v1, v2, s3
 // GFX12: v_minmax_num_f32 v5, v1, v2, s3         ; encoding: [0x05,0x00,0x68,0xd6,0x01,0x05,0x0e,0x00]
diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_aliases.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_aliases.s
index 59cb1a4..ffcf651 100644
--- a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_aliases.s
+++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_aliases.s
@@ -6,11 +6,11 @@ v_min3_f32 v5, v1, v2, v3
 v_max3_f32 v5, v1, v2, v3
 // GFX12: v_max3_num_f32 v5, v1, v2, v3           ; encoding: [0x05,0x00,0x2a,0xd6,0x01,0x05,0x0e,0x04]
 
-v_min3_f16 v5, v1, v2, v3
-// GFX12: v_min3_num_f16 v5, v1, v2, v3           ; encoding: [0x05,0x00,0x2b,0xd6,0x01,0x05,0x0e,0x04]
+v_min3_f16 v5.l, v1.l, v2.l, v3.l
+// GFX12: v_min3_num_f16 v5.l, v1.l, v2.l, v3.l   ; encoding: [0x05,0x00,0x2b,0xd6,0x01,0x05,0x0e,0x04]
 
-v_max3_f16 v5, v1, v2, v3
-// GFX12: v_max3_num_f16 v5, v1, v2, v3           ; encoding: [0x05,0x00,0x2c,0xd6,0x01,0x05,0x0e,0x04]
+v_max3_f16 v5.l, v1.l, v2.l, v3.l
+// GFX12: v_max3_num_f16 v5.l, v1.l, v2.l, v3.l   ; encoding: [0x05,0x00,0x2c,0xd6,0x01,0x05,0x0e,0x04]
 
 v_med3_f32 v5, v1, v2, v3
 // GFX12: v_med3_num_f32 v5, v1, v2, v3           ; encoding: [0x05,0x00,0x31,0xd6,0x01,0x05,0x0e,0x04]
@@ -24,11 +24,11 @@ v_minmax_f32_e64_dpp v0, -v1, -v2, -v3 dpp8:[0,1,2,3,4,5,6,7]
 v_maxmin_f32_e64_dpp v0, v1, v2, v3 clamp dpp8:[0,1,2,3,4,5,6,7]
 // GFX12: v_maxmin_num_f32_e64_dpp v0, v1, v2, v3 clamp dpp8:[0,1,2,3,4,5,6,7] ; encoding: [0x00,0x80,0x69,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x88,0xc6,0xfa]
 
-v_minmax_f16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0]
-// GFX12: v_minmax_num_f16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[3,2,1,0]
+// GFX12: v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
 
-v_maxmin_f16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0]
-// GFX12: v_maxmin_num_f16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[3,2,1,0]
+// GFX12: v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
 
 v_mad_i64_i32 v[5:6], s12, v1, v2, v[3:4]
 // GFX12: v_mad_co_i64_i32 v[5:6], s12, v1, v2, v[3:4] ; encoding: [0x05,0x0c,0xff,0xd6,0x01,0x05,0x0e,0x04]
diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp16.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp16.s
index b769324..aa804cc 100644
--- a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp16.s
+++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp16.s
@@ -1775,53 +1775,68 @@ v_div_fixup_f16_e64_dpp v5.l, -|v1.l|, -|v2.h|, 0.5 row_xmask:0 row_mask:0x1 ban
 v_div_fixup_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
 // GFX12: v_div_fixup_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0xc7,0x54,0xd6,0xfa,0xfe,0xf7,0xe3,0xff,0x6f,0x05,0x30]
 
-v_fma_f16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0]
-// GFX12: v_fma_f16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+v_fma_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[3,2,1,0]
+// GFX12: v_fma_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
 
-v_fma_f16_e64_dpp v5, v1, s2, v3 quad_perm:[3,2,1,0]
-// GFX12: v_fma_f16_e64_dpp v5, v1, s2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x04,0x0c,0x04,0x01,0x1b,0x00,0xff]
+v_fma_f16_e64_dpp v5.l, v1.l, s2, v3.l quad_perm:[3,2,1,0]
+// GFX12: v_fma_f16_e64_dpp v5.l, v1.l, s2, v3.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x04,0x0c,0x04,0x01,0x1b,0x00,0xff]
 
-v_fma_f16_e64_dpp v5, v1, 2.0, v3 quad_perm:[3,2,1,0]
-// GFX12: v_fma_f16_e64_dpp v5, v1, 2.0, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0xe8,0x0d,0x04,0x01,0x1b,0x00,0xff]
+v_fma_f16_e64_dpp v5.l, v1.l, 2.0, v3.l quad_perm:[3,2,1,0]
+// GFX12: v_fma_f16_e64_dpp v5.l, v1.l, 2.0, v3.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0xe8,0x0d,0x04,0x01,0x1b,0x00,0xff]
 
-v_fma_f16_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3]
-// GFX12: v_fma_f16_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
+v_fma_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[0,1,2,3]
+// GFX12: v_fma_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
 
-v_fma_f16_e64_dpp v5, v1, v2, v3 row_mirror
-// GFX12: v_fma_f16_e64_dpp v5, v1, v2, v3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff]
+v_fma_f16_e64_dpp v5.l, v1.l, v2.l, v3.l row_mirror
+// GFX12: v_fma_f16_e64_dpp v5.l, v1.l, v2.l, v3.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff]
 
-v_fma_f16_e64_dpp v5, v1, v2, v255 row_half_mirror
-// GFX12: v_fma_f16_e64_dpp v5, v1, v2, v255 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff]
+v_fma_f16_e64_dpp v5.l, v1.l, v2.l, v255.l row_half_mirror
+// GFX12: v_fma_f16_e64_dpp v5.l, v1.l, v2.l, v255.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff]
 
-v_fma_f16_e64_dpp v5, v1, v2, s105 row_shl:1
-// GFX12: v_fma_f16_e64_dpp v5, v1, v2, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff]
+v_fma_f16_e64_dpp v5.l, v1.l, v2.l, s105 row_shl:1
+// GFX12: v_fma_f16_e64_dpp v5.l, v1.l, v2.l, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff]
 
-v_fma_f16_e64_dpp v5, v1, v2, vcc_hi row_shl:15
-// GFX12: v_fma_f16_e64_dpp v5, v1, v2, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff]
+v_fma_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi row_shl:15
+// GFX12: v_fma_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff]
 
-v_fma_f16_e64_dpp v5, v1, v2, vcc_lo row_shr:1
-// GFX12: v_fma_f16_e64_dpp v5, v1, v2, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff]
+v_fma_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo row_shr:1
+// GFX12: v_fma_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff]
 
-v_fma_f16_e64_dpp v5, |v1|, v2, -ttmp15 row_shr:15
-// GFX12: v_fma_f16_e64_dpp v5, |v1|, v2, -ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x48,0xd6,0xfa,0x04,0xee,0x81,0x01,0x1f,0x01,0xff]
+v_fma_f16_e64_dpp v5.l, |v1.l|, v2.l, -ttmp15 row_shr:15
+// GFX12: v_fma_f16_e64_dpp v5.l, |v1.l|, v2.l, -ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x48,0xd6,0xfa,0x04,0xee,0x81,0x01,0x1f,0x01,0xff]
 
-v_fma_f16_e64_dpp v5, v1, -|v2|, exec_hi row_ror:1
-// GFX12: v_fma_f16_e64_dpp v5, v1, -|v2|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x48,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff]
+v_fma_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi row_ror:1
+// GFX12: v_fma_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x48,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff]
 
-v_fma_f16_e64_dpp v5, -v1, v2, |exec_lo| row_ror:15
-// GFX12: v_fma_f16_e64_dpp v5, -v1, v2, |exec_lo| row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x04,0x48,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff]
+v_fma_f16_e64_dpp v5.l, -v1.l, v2.l, |exec_lo| row_ror:15
+// GFX12: v_fma_f16_e64_dpp v5.l, -v1.l, v2.l, |exec_lo| row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x04,0x48,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff]
 
-v_fma_f16_e64_dpp v5, -|v1|, -|v2|, null row_share:0 row_mask:0xf bank_mask:0xf
-// GFX12: v_fma_f16_e64_dpp v5, -|v1|, -|v2|, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x03,0x48,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff]
+v_fma_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, null row_share:0 row_mask:0xf bank_mask:0xf
+// GFX12: v_fma_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x03,0x48,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff]
 
-v_fma_f16_e64_dpp v5, -|v1|, v2, -|-1| row_share:15 row_mask:0x0 bank_mask:0x1
-// GFX12: v_fma_f16_e64_dpp v5, -|v1|, v2, -|-1| row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x05,0x48,0xd6,0xfa,0x04,0x06,0xa3,0x01,0x5f,0x01,0x01]
+v_fma_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|-1| row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX12: v_fma_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|-1| row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x05,0x48,0xd6,0xfa,0x04,0x06,0xa3,0x01,0x5f,0x01,0x01]
 
-v_fma_f16_e64_dpp v5, v1, -|v2|, -|0.5| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
-// GFX12: v_fma_f16_e64_dpp v5, v1, -|v2|, -|0.5| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x06,0x48,0xd6,0xfa,0x04,0xc2,0xc3,0x01,0x60,0x09,0x13]
+v_fma_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: v_fma_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x06,0x48,0xd6,0xfa,0x04,0xc2,0xc3,0x01,0x60,0x09,0x13]
 
-v_fma_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
-// GFX12: v_fma_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x87,0x48,0xd6,0xfa,0xfe,0xf7,0xe3,0xff,0x6f,0x05,0x30]
+v_fma_f16_e64_dpp v255.l, -|v255.l|, -|v255.l|, -|src_scc| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: v_fma_f16_e64_dpp v255.l, -|v255.l|, -|v255.l|, -|src_scc| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x87,0x48,0xd6,0xfa,0xfe,0xf7,0xe3,0xff,0x6f,0x05,0x30]
+
+v_fma_f16_e64_dpp v5.h, v1.h, v2.h, v3.h quad_perm:[3,2,1,0]
+// GFX12: v_fma_f16_e64_dpp v5.h, v1.h, v2.h, v3.h op_sel:[1,1,1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x78,0x48,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+
+v_fma_f16_e64_dpp v5.l, v1.l, v2.l, v255.h quad_perm:[0,1,2,3]
+// GFX12: v_fma_f16_e64_dpp v5.l, v1.l, v2.l, v255.h op_sel:[0,0,1,0] quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x20,0x48,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff]
+
+v_fma_f16_e64_dpp v5.l, -v1.h, |v2.l|, -1 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX12: v_fma_f16_e64_dpp v5.l, -v1.h, |v2.l|, -1 op_sel:[1,0,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x0a,0x48,0xd6,0xfa,0x04,0x06,0x23,0x01,0x5f,0x01,0x01]
+
+v_fma_f16_e64_dpp v5.l, -|v1.l|, -|v2.h|, 0.5 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: v_fma_f16_e64_dpp v5.l, -|v1.l|, -|v2.h|, 0.5 op_sel:[0,1,0,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x13,0x48,0xd6,0xfa,0x04,0xc2,0x63,0x01,0x60,0x09,0x13]
+
+v_fma_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: v_fma_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0xc7,0x48,0xd6,0xfa,0xfe,0xf7,0xe3,0xff,0x6f,0x05,0x30]
 
 v_fma_f32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0]
 // GFX12: v_fma_f32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x13,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
@@ -2480,53 +2495,53 @@ v_mad_u32_u24_e64_dpp v5, v1, v2, 0.5 row_xmask:0 row_mask:0x1 bank_mask:0x3 bou
 v_mad_u32_u24_e64_dpp v255, v255, v255, src_scc clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
 // GFX12: v_mad_u32_u24_e64_dpp v255, v255, v255, src_scc clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x80,0x0b,0xd6,0xfa,0xfe,0xf7,0x03,0xff,0x6f,0x05,0x30]
 
-v_max3_num_f16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0]
-// GFX12: v_max3_num_f16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2c,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+v_max3_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[3,2,1,0]
+// GFX12: v_max3_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2c,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
 
-v_max3_num_f16_e64_dpp v5, v1, s2, v3 quad_perm:[3,2,1,0]
-// GFX12: v_max3_num_f16_e64_dpp v5, v1, s2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2c,0xd6,0xfa,0x04,0x0c,0x04,0x01,0x1b,0x00,0xff]
+v_max3_num_f16_e64_dpp v5.l, v1.l, s2, v3.l quad_perm:[3,2,1,0]
+// GFX12: v_max3_num_f16_e64_dpp v5.l, v1.l, s2, v3.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2c,0xd6,0xfa,0x04,0x0c,0x04,0x01,0x1b,0x00,0xff]
 
-v_max3_num_f16_e64_dpp v5, v1, 2.0, v3 quad_perm:[3,2,1,0]
-// GFX12: v_max3_num_f16_e64_dpp v5, v1, 2.0, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2c,0xd6,0xfa,0xe8,0x0d,0x04,0x01,0x1b,0x00,0xff]
+v_max3_num_f16_e64_dpp v5.l, v1.l, 2.0, v3.l quad_perm:[3,2,1,0]
+// GFX12: v_max3_num_f16_e64_dpp v5.l, v1.l, 2.0, v3.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2c,0xd6,0xfa,0xe8,0x0d,0x04,0x01,0x1b,0x00,0xff]
 
-v_max3_num_f16_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3]
-// GFX12: v_max3_num_f16_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2c,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
+v_max3_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[0,1,2,3]
+// GFX12: v_max3_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2c,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
 
-v_max3_num_f16_e64_dpp v5, v1, v2, v3 row_mirror
-// GFX12: v_max3_num_f16_e64_dpp v5, v1, v2, v3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2c,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff]
+v_max3_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l row_mirror
+// GFX12: v_max3_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2c,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff]
 
-v_max3_num_f16_e64_dpp v5, v1, v2, v255 row_half_mirror
-// GFX12: v_max3_num_f16_e64_dpp v5, v1, v2, v255 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2c,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff]
+v_max3_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.l row_half_mirror
+// GFX12: v_max3_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2c,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff]
 
-v_max3_num_f16_e64_dpp v5, v1, v2, s105 row_shl:1
-// GFX12: v_max3_num_f16_e64_dpp v5, v1, v2, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2c,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff]
+v_max3_num_f16_e64_dpp v5.l, v1.l, v2.l, s105 row_shl:1
+// GFX12: v_max3_num_f16_e64_dpp v5.l, v1.l, v2.l, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2c,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff]
 
-v_max3_num_f16_e64_dpp v5, v1, v2, vcc_hi row_shl:15
-// GFX12: v_max3_num_f16_e64_dpp v5, v1, v2, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2c,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff]
+v_max3_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi row_shl:15
+// GFX12: v_max3_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2c,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff]
 
-v_max3_num_f16_e64_dpp v5, v1, v2, vcc_lo row_shr:1
-// GFX12: v_max3_num_f16_e64_dpp v5, v1, v2, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2c,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff]
+v_max3_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo row_shr:1
+// GFX12: v_max3_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2c,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff]
 
-v_max3_num_f16_e64_dpp v5, |v1|, v2, -ttmp15 row_shr:15
-// GFX12: v_max3_num_f16_e64_dpp v5, |v1|, v2, -ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x2c,0xd6,0xfa,0x04,0xee,0x81,0x01,0x1f,0x01,0xff]
+v_max3_num_f16_e64_dpp v5.l, |v1.l|, v2.l, -ttmp15 row_shr:15
+// GFX12: v_max3_num_f16_e64_dpp v5.l, |v1.l|, v2.l, -ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x2c,0xd6,0xfa,0x04,0xee,0x81,0x01,0x1f,0x01,0xff]
 
-v_max3_num_f16_e64_dpp v5, v1, -|v2|, exec_hi row_ror:1
-// GFX12: v_max3_num_f16_e64_dpp v5, v1, -|v2|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x2c,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff]
+v_max3_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi row_ror:1
+// GFX12: v_max3_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x2c,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff]
 
-v_max3_num_f16_e64_dpp v5, -v1, v2, |exec_lo| row_ror:15
-// GFX12: v_max3_num_f16_e64_dpp v5, -v1, v2, |exec_lo| row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x04,0x2c,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff]
+v_max3_num_f16_e64_dpp v5.l, -v1.l, v2.l, |exec_lo| row_ror:15
+// GFX12: v_max3_num_f16_e64_dpp v5.l, -v1.l, v2.l, |exec_lo| row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x04,0x2c,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff]
 
-v_max3_num_f16_e64_dpp v5, -|v1|, -|v2|, null row_share:0 row_mask:0xf bank_mask:0xf
-// GFX12: v_max3_num_f16_e64_dpp v5, -|v1|, -|v2|, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x03,0x2c,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff]
+v_max3_num_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, null row_share:0 row_mask:0xf bank_mask:0xf
+// GFX12: v_max3_num_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x03,0x2c,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff]
 
-v_max3_num_f16_e64_dpp v5, -|v1|, v2, -|-1| row_share:15 row_mask:0x0 bank_mask:0x1
-// GFX12: v_max3_num_f16_e64_dpp v5, -|v1|, v2, -|-1| row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x05,0x2c,0xd6,0xfa,0x04,0x06,0xa3,0x01,0x5f,0x01,0x01]
+v_max3_num_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|-1| row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX12: v_max3_num_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|-1| row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x05,0x2c,0xd6,0xfa,0x04,0x06,0xa3,0x01,0x5f,0x01,0x01]
 
-v_max3_num_f16_e64_dpp v5, v1, -|v2|, -|0.5| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
-// GFX12: v_max3_num_f16_e64_dpp v5, v1, -|v2|, -|0.5| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x06,0x2c,0xd6,0xfa,0x04,0xc2,0xc3,0x01,0x60,0x09,0x13]
+v_max3_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: v_max3_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x06,0x2c,0xd6,0xfa,0x04,0xc2,0xc3,0x01,0x60,0x09,0x13]
 
-v_max3_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
-// GFX12: v_max3_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x87,0x2c,0xd6,0xfa,0xfe,0xf7,0xe3,0xff,0x6f,0x05,0x30]
+v_max3_num_f16_e64_dpp v255.l, -|v255.l|, -|v255.l|, -|src_scc| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: v_max3_num_f16_e64_dpp v255.l, -|v255.l|, -|v255.l|, -|src_scc| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x87,0x2c,0xd6,0xfa,0xfe,0xf7,0xe3,0xff,0x6f,0x05,0x30]
 
 v_max3_num_f32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0]
 // GFX12: v_max3_num_f32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2a,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
@@ -2906,53 +2921,98 @@ v_max_u16_e64_dpp v5.l, v1.l, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_
 v_max_u16_e64_dpp v255.h, v255.l, v255.l row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
 // GFX12: v_max_u16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x40,0x09,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30]
 
-v_maxmin_num_f16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0]
-// GFX12: v_maxmin_num_f16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[3,2,1,0]
+// GFX12: v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+
+v_maxmin_num_f16_e64_dpp v5.l, v1.l, s2, v3.l quad_perm:[3,2,1,0]
+// GFX12: v_maxmin_num_f16_e64_dpp v5.l, v1.l, s2, v3.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0x0c,0x04,0x01,0x1b,0x00,0xff]
+
+v_maxmin_num_f16_e64_dpp v5.l, v1.l, 2.0, v3.l quad_perm:[3,2,1,0]
+// GFX12: v_maxmin_num_f16_e64_dpp v5.l, v1.l, 2.0, v3.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0xe8,0x0d,0x04,0x01,0x1b,0x00,0xff]
+
+v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[0,1,2,3]
+// GFX12: v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
+
+v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l row_mirror
+// GFX12: v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff]
+
+v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.l row_half_mirror
+// GFX12: v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff]
+
+v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, s105 row_shl:1
+// GFX12: v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff]
+
+v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi row_shl:15
+// GFX12: v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff]
+
+v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo row_shr:1
+// GFX12: v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff]
+
+v_maxmin_num_f16_e64_dpp v5.l, |v1.l|, v2.l, -ttmp15 row_shr:15
+// GFX12: v_maxmin_num_f16_e64_dpp v5.l, |v1.l|, v2.l, -ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x6b,0xd6,0xfa,0x04,0xee,0x81,0x01,0x1f,0x01,0xff]
+
+v_maxmin_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi row_ror:1
+// GFX12: v_maxmin_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x6b,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff]
+
+v_maxmin_num_f16_e64_dpp v5.l, -v1.l, v2.l, |exec_lo| row_ror:15
+// GFX12: v_maxmin_num_f16_e64_dpp v5.l, -v1.l, v2.l, |exec_lo| row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x04,0x6b,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff]
+
+v_maxmin_num_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, null row_share:0 row_mask:0xf bank_mask:0xf
+// GFX12: v_maxmin_num_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x03,0x6b,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff]
+
+v_maxmin_num_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|-1| mul:2 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX12: v_maxmin_num_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|-1| mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x05,0x6b,0xd6,0xfa,0x04,0x06,0xab,0x01,0x5f,0x01,0x01]
 
-v_maxmin_num_f16_e64_dpp v5, v1, s2, v3 quad_perm:[3,2,1,0]
-// GFX12: v_maxmin_num_f16_e64_dpp v5, v1, s2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0x0c,0x04,0x01,0x1b,0x00,0xff]
+v_maxmin_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: v_maxmin_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x06,0x6b,0xd6,0xfa,0x04,0xc2,0xd3,0x01,0x60,0x09,0x13]
 
-v_maxmin_num_f16_e64_dpp v5, v1, 2.0, v3 quad_perm:[3,2,1,0]
-// GFX12: v_maxmin_num_f16_e64_dpp v5, v1, 2.0, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0xe8,0x0d,0x04,0x01,0x1b,0x00,0xff]
+v_maxmin_num_f16_e64_dpp v255.l, -|v255.l|, -|v255.l|, -|src_scc| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: v_maxmin_num_f16_e64_dpp v255.l, -|v255.l|, -|v255.l|, -|src_scc| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x87,0x6b,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x05,0x30]
 
-v_maxmin_num_f16_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3]
-// GFX12: v_maxmin_num_f16_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
+v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.l quad_perm:[0,1,2,3]
+// GFX12: v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff]
 
-v_maxmin_num_f16_e64_dpp v5, v1, v2, v3 row_mirror
-// GFX12: v_maxmin_num_f16_e64_dpp v5, v1, v2, v3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff]
+v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, s3 row_mirror
+// GFX12: v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, s3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0x0e,0x00,0x01,0x40,0x01,0xff]
 
-v_maxmin_num_f16_e64_dpp v5, v1, v2, v255 row_half_mirror
-// GFX12: v_maxmin_num_f16_e64_dpp v5, v1, v2, v255 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff]
+v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, s105 row_half_mirror
+// GFX12: v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, s105 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x41,0x01,0xff]
 
-v_maxmin_num_f16_e64_dpp v5, v1, v2, s105 row_shl:1
-// GFX12: v_maxmin_num_f16_e64_dpp v5, v1, v2, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff]
+v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, ttmp15 row_shl:1
+// GFX12: v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, ttmp15 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0xee,0x01,0x01,0x01,0x01,0xff]
 
-v_maxmin_num_f16_e64_dpp v5, v1, v2, vcc_hi row_shl:15
-// GFX12: v_maxmin_num_f16_e64_dpp v5, v1, v2, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff]
+v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, -|m0| row_shr:15
+// GFX12: v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, -|m0| row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x04,0x6b,0xd6,0xfa,0x04,0xf6,0x81,0x01,0x1f,0x01,0xff]
 
-v_maxmin_num_f16_e64_dpp v5, v1, v2, vcc_lo row_shr:1
-// GFX12: v_maxmin_num_f16_e64_dpp v5, v1, v2, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff]
+v_maxmin_num_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|exec_hi| row_ror:1
+// GFX12: v_maxmin_num_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|exec_hi| row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x05,0x6b,0xd6,0xfa,0x04,0xfe,0xa1,0x01,0x21,0x01,0xff]
 
-v_maxmin_num_f16_e64_dpp v5, |v1|, v2, -ttmp15 row_shr:15
-// GFX12: v_maxmin_num_f16_e64_dpp v5, |v1|, v2, -ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x6b,0xd6,0xfa,0x04,0xee,0x81,0x01,0x1f,0x01,0xff]
+v_maxmin_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|exec_lo| row_ror:15
+// GFX12: v_maxmin_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|exec_lo| row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x06,0x6b,0xd6,0xfa,0x04,0xfa,0xc1,0x01,0x2f,0x01,0xff]
 
-v_maxmin_num_f16_e64_dpp v5, v1, -|v2|, exec_hi row_ror:1
-// GFX12: v_maxmin_num_f16_e64_dpp v5, v1, -|v2|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x6b,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff]
+v_maxmin_num_f16_e64_dpp v5.l, |v1.l|, -v2.l, null row_share:0 row_mask:0xf bank_mask:0xf
+// GFX12: v_maxmin_num_f16_e64_dpp v5.l, |v1.l|, -v2.l, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x6b,0xd6,0xfa,0x04,0xf2,0x41,0x01,0x50,0x01,0xff]
 
-v_maxmin_num_f16_e64_dpp v5, -v1, v2, |exec_lo| row_ror:15
-// GFX12: v_maxmin_num_f16_e64_dpp v5, -v1, v2, |exec_lo| row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x04,0x6b,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff]
+v_maxmin_num_f16_e64_dpp v5.l, -v1.l, |v2.l|, -1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX12: v_maxmin_num_f16_e64_dpp v5.l, -v1.l, |v2.l|, -1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x02,0x6b,0xd6,0xfa,0x04,0x06,0x2b,0x01,0x5f,0x01,0x01]
 
-v_maxmin_num_f16_e64_dpp v5, -|v1|, -|v2|, null row_share:0 row_mask:0xf bank_mask:0xf
-// GFX12: v_maxmin_num_f16_e64_dpp v5, -|v1|, -|v2|, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x03,0x6b,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff]
+v_maxmin_num_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, 0.5 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: v_maxmin_num_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, 0.5 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x03,0x6b,0xd6,0xfa,0x04,0xc2,0x73,0x01,0x60,0x09,0x13]
 
-v_maxmin_num_f16_e64_dpp v5, -|v1|, v2, -|-1| mul:2 row_share:15 row_mask:0x0 bank_mask:0x1
-// GFX12: v_maxmin_num_f16_e64_dpp v5, -|v1|, v2, -|-1| mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x05,0x6b,0xd6,0xfa,0x04,0x06,0xab,0x01,0x5f,0x01,0x01]
+v_maxmin_num_f16_e64_dpp v5.h, v1.h, v2.h, v3.h quad_perm:[3,2,1,0]
+// GFX12: v_maxmin_num_f16_e64_dpp v5.h, v1.h, v2.h, v3.h op_sel:[1,1,1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x78,0x6b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
 
-v_maxmin_num_f16_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
-// GFX12: v_maxmin_num_f16_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x06,0x6b,0xd6,0xfa,0x04,0xc2,0xd3,0x01,0x60,0x09,0x13]
+v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.h quad_perm:[0,1,2,3]
+// GFX12: v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.h op_sel:[0,0,1,0] quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x20,0x6b,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff]
 
-v_maxmin_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
-// GFX12: v_maxmin_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x87,0x6b,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x05,0x30]
+v_maxmin_num_f16_e64_dpp v5.l, -v1.h, |v2.l|, -1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX12: v_maxmin_num_f16_e64_dpp v5.l, -v1.h, |v2.l|, -1 op_sel:[1,0,0,0] mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x0a,0x6b,0xd6,0xfa,0x04,0x06,0x2b,0x01,0x5f,0x01,0x01]
+
+v_maxmin_num_f16_e64_dpp v5.l, -|v1.l|, -|v2.h|, 0.5 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: v_maxmin_num_f16_e64_dpp v5.l, -|v1.l|, -|v2.h|, 0.5 op_sel:[0,1,0,0] mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x13,0x6b,0xd6,0xfa,0x04,0xc2,0x73,0x01,0x60,0x09,0x13]
+
+v_maxmin_num_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: v_maxmin_num_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0xc7,0x6b,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x05,0x30]
 
 v_maxmin_num_f32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0]
 // GFX12: v_maxmin_num_f32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x69,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
@@ -3515,53 +3575,53 @@ v_med3_u32_e64_dpp v5, v1, v2, 0.5 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_
 v_med3_u32_e64_dpp v255, v255, v255, src_scc row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
 // GFX12: v_med3_u32_e64_dpp v255, v255, v255, src_scc row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x00,0x21,0xd6,0xfa,0xfe,0xf7,0x03,0xff,0x6f,0x05,0x30]
 
-v_min3_num_f16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0]
-// GFX12: v_min3_num_f16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+v_min3_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[3,2,1,0]
+// GFX12: v_min3_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
 
-v_min3_num_f16_e64_dpp v5, v1, s2, v3 quad_perm:[3,2,1,0]
-// GFX12: v_min3_num_f16_e64_dpp v5, v1, s2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2b,0xd6,0xfa,0x04,0x0c,0x04,0x01,0x1b,0x00,0xff]
+v_min3_num_f16_e64_dpp v5.l, v1.l, s2, v3.l quad_perm:[3,2,1,0]
+// GFX12: v_min3_num_f16_e64_dpp v5.l, v1.l, s2, v3.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2b,0xd6,0xfa,0x04,0x0c,0x04,0x01,0x1b,0x00,0xff]
 
-v_min3_num_f16_e64_dpp v5, v1, 2.0, v3 quad_perm:[3,2,1,0]
-// GFX12: v_min3_num_f16_e64_dpp v5, v1, 2.0, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2b,0xd6,0xfa,0xe8,0x0d,0x04,0x01,0x1b,0x00,0xff]
+v_min3_num_f16_e64_dpp v5.l, v1.l, 2.0, v3.l quad_perm:[3,2,1,0]
+// GFX12: v_min3_num_f16_e64_dpp v5.l, v1.l, 2.0, v3.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2b,0xd6,0xfa,0xe8,0x0d,0x04,0x01,0x1b,0x00,0xff]
 
-v_min3_num_f16_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3]
-// GFX12: v_min3_num_f16_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
+v_min3_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[0,1,2,3]
+// GFX12: v_min3_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
 
-v_min3_num_f16_e64_dpp v5, v1, v2, v3 row_mirror
-// GFX12: v_min3_num_f16_e64_dpp v5, v1, v2, v3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff]
+v_min3_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l row_mirror
+// GFX12: v_min3_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff]
 
-v_min3_num_f16_e64_dpp v5, v1, v2, v255 row_half_mirror
-// GFX12: v_min3_num_f16_e64_dpp v5, v1, v2, v255 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2b,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff]
+v_min3_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.l row_half_mirror
+// GFX12: v_min3_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2b,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff]
 
-v_min3_num_f16_e64_dpp v5, v1, v2, s105 row_shl:1
-// GFX12: v_min3_num_f16_e64_dpp v5, v1, v2, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2b,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff]
+v_min3_num_f16_e64_dpp v5.l, v1.l, v2.l, s105 row_shl:1
+// GFX12: v_min3_num_f16_e64_dpp v5.l, v1.l, v2.l, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2b,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff]
 
-v_min3_num_f16_e64_dpp v5, v1, v2, vcc_hi row_shl:15
-// GFX12: v_min3_num_f16_e64_dpp v5, v1, v2, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2b,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff]
+v_min3_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi row_shl:15
+// GFX12: v_min3_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2b,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff]
 
-v_min3_num_f16_e64_dpp v5, v1, v2, vcc_lo row_shr:1
-// GFX12: v_min3_num_f16_e64_dpp v5, v1, v2, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2b,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff]
+v_min3_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo row_shr:1
+// GFX12: v_min3_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2b,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff]
 
-v_min3_num_f16_e64_dpp v5, |v1|, v2, -ttmp15 row_shr:15
-// GFX12: v_min3_num_f16_e64_dpp v5, |v1|, v2, -ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x2b,0xd6,0xfa,0x04,0xee,0x81,0x01,0x1f,0x01,0xff]
+v_min3_num_f16_e64_dpp v5.l, |v1.l|, v2.l, -ttmp15 row_shr:15
+// GFX12: v_min3_num_f16_e64_dpp v5.l, |v1.l|, v2.l, -ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x2b,0xd6,0xfa,0x04,0xee,0x81,0x01,0x1f,0x01,0xff]
 
-v_min3_num_f16_e64_dpp v5, v1, -|v2|, exec_hi row_ror:1
-// GFX12: v_min3_num_f16_e64_dpp v5, v1, -|v2|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x2b,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff]
+v_min3_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi row_ror:1
+// GFX12: v_min3_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x2b,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff]
 
-v_min3_num_f16_e64_dpp v5, -v1, v2, |exec_lo| row_ror:15
-// GFX12: v_min3_num_f16_e64_dpp v5, -v1, v2, |exec_lo| row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x04,0x2b,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff]
+v_min3_num_f16_e64_dpp v5.l, -v1.l, v2.l, |exec_lo| row_ror:15
+// GFX12: v_min3_num_f16_e64_dpp v5.l, -v1.l, v2.l, |exec_lo| row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x04,0x2b,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff]
 
-v_min3_num_f16_e64_dpp v5, -|v1|, -|v2|, null row_share:0 row_mask:0xf bank_mask:0xf
-// GFX12: v_min3_num_f16_e64_dpp v5, -|v1|, -|v2|, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x03,0x2b,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff]
+v_min3_num_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, null row_share:0 row_mask:0xf bank_mask:0xf
+// GFX12: v_min3_num_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x03,0x2b,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff]
 
-v_min3_num_f16_e64_dpp v5, -|v1|, v2, -|-1| row_share:15 row_mask:0x0 bank_mask:0x1
-// GFX12: v_min3_num_f16_e64_dpp v5, -|v1|, v2, -|-1| row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x05,0x2b,0xd6,0xfa,0x04,0x06,0xa3,0x01,0x5f,0x01,0x01]
+v_min3_num_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|-1| row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX12: v_min3_num_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|-1| row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x05,0x2b,0xd6,0xfa,0x04,0x06,0xa3,0x01,0x5f,0x01,0x01]
 
-v_min3_num_f16_e64_dpp v5, v1, -|v2|, -|0.5| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
-// GFX12: v_min3_num_f16_e64_dpp v5, v1, -|v2|, -|0.5| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x06,0x2b,0xd6,0xfa,0x04,0xc2,0xc3,0x01,0x60,0x09,0x13]
+v_min3_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: v_min3_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x06,0x2b,0xd6,0xfa,0x04,0xc2,0xc3,0x01,0x60,0x09,0x13]
 
-v_min3_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
-// GFX12: v_min3_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x87,0x2b,0xd6,0xfa,0xfe,0xf7,0xe3,0xff,0x6f,0x05,0x30]
+v_min3_num_f16_e64_dpp v255.l, -|v255.l|, -|v255.l|, -|src_scc| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: v_min3_num_f16_e64_dpp v255.l, -|v255.l|, -|v255.l|, -|src_scc| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x87,0x2b,0xd6,0xfa,0xfe,0xf7,0xe3,0xff,0x6f,0x05,0x30]
 
 v_min3_num_f32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0]
 // GFX12: v_min3_num_f32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x29,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
@@ -3941,53 +4001,98 @@ v_min_u16_e64_dpp v5.l, v1.l, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_
 v_min_u16_e64_dpp v255.h, v255.l, v255.l row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
 // GFX12: v_min_u16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x40,0x0b,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30]
 
-v_minmax_num_f16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0]
-// GFX12: v_minmax_num_f16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[3,2,1,0]
+// GFX12: v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+
+v_minmax_num_f16_e64_dpp v5.l, v1.l, s2, v3.l quad_perm:[3,2,1,0]
+// GFX12: v_minmax_num_f16_e64_dpp v5.l, v1.l, s2, v3.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0x0c,0x04,0x01,0x1b,0x00,0xff]
+
+v_minmax_num_f16_e64_dpp v5.l, v1.l, 2.0, v3.l quad_perm:[3,2,1,0]
+// GFX12: v_minmax_num_f16_e64_dpp v5.l, v1.l, 2.0, v3.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0xe8,0x0d,0x04,0x01,0x1b,0x00,0xff]
+
+v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[0,1,2,3]
+// GFX12: v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
+
+v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l row_mirror
+// GFX12: v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff]
+
+v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.l row_half_mirror
+// GFX12: v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff]
+
+v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, s105 row_shl:1
+// GFX12: v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff]
+
+v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi row_shl:15
+// GFX12: v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff]
+
+v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo row_shr:1
+// GFX12: v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff]
+
+v_minmax_num_f16_e64_dpp v5.l, |v1.l|, v2.l, -ttmp15 row_shr:15
+// GFX12: v_minmax_num_f16_e64_dpp v5.l, |v1.l|, v2.l, -ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x6a,0xd6,0xfa,0x04,0xee,0x81,0x01,0x1f,0x01,0xff]
+
+v_minmax_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi row_ror:1
+// GFX12: v_minmax_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x6a,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff]
+
+v_minmax_num_f16_e64_dpp v5.l, -v1.l, v2.l, |exec_lo| row_ror:15
+// GFX12: v_minmax_num_f16_e64_dpp v5.l, -v1.l, v2.l, |exec_lo| row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x04,0x6a,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff]
+
+v_minmax_num_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, null row_share:0 row_mask:0xf bank_mask:0xf
+// GFX12: v_minmax_num_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x03,0x6a,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff]
+
+v_minmax_num_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|-1| mul:2 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX12: v_minmax_num_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|-1| mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x05,0x6a,0xd6,0xfa,0x04,0x06,0xab,0x01,0x5f,0x01,0x01]
+
+v_minmax_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: v_minmax_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x06,0x6a,0xd6,0xfa,0x04,0xc2,0xd3,0x01,0x60,0x09,0x13]
+
+v_minmax_num_f16_e64_dpp v255.l, -|v255.l|, -|v255.l|, -|src_scc| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: v_minmax_num_f16_e64_dpp v255.l, -|v255.l|, -|v255.l|, -|src_scc| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x87,0x6a,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x05,0x30]
 
-v_minmax_num_f16_e64_dpp v5, v1, s2, v3 quad_perm:[3,2,1,0]
-// GFX12: v_minmax_num_f16_e64_dpp v5, v1, s2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0x0c,0x04,0x01,0x1b,0x00,0xff]
+v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.l quad_perm:[0,1,2,3]
+// GFX12: v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff]
 
-v_minmax_num_f16_e64_dpp v5, v1, 2.0, v3 quad_perm:[3,2,1,0]
-// GFX12: v_minmax_num_f16_e64_dpp v5, v1, 2.0, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0xe8,0x0d,0x04,0x01,0x1b,0x00,0xff]
+v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, s3 row_mirror
+// GFX12: v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, s3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0x0e,0x00,0x01,0x40,0x01,0xff]
 
-v_minmax_num_f16_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3]
-// GFX12: v_minmax_num_f16_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
+v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, s105 row_half_mirror
+// GFX12: v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, s105 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x41,0x01,0xff]
 
-v_minmax_num_f16_e64_dpp v5, v1, v2, v3 row_mirror
-// GFX12: v_minmax_num_f16_e64_dpp v5, v1, v2, v3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff]
+v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, ttmp15 row_shl:1
+// GFX12: v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, ttmp15 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0xee,0x01,0x01,0x01,0x01,0xff]
 
-v_minmax_num_f16_e64_dpp v5, v1, v2, v255 row_half_mirror
-// GFX12: v_minmax_num_f16_e64_dpp v5, v1, v2, v255 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff]
+v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, -|m0| row_shr:15
+// GFX12: v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, -|m0| row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x04,0x6a,0xd6,0xfa,0x04,0xf6,0x81,0x01,0x1f,0x01,0xff]
 
-v_minmax_num_f16_e64_dpp v5, v1, v2, s105 row_shl:1
-// GFX12: v_minmax_num_f16_e64_dpp v5, v1, v2, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff]
+v_minmax_num_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|exec_hi| row_ror:1
+// GFX12: v_minmax_num_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|exec_hi| row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x05,0x6a,0xd6,0xfa,0x04,0xfe,0xa1,0x01,0x21,0x01,0xff]
 
-v_minmax_num_f16_e64_dpp v5, v1, v2, vcc_hi row_shl:15
-// GFX12: v_minmax_num_f16_e64_dpp v5, v1, v2, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff]
+v_minmax_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|exec_lo| row_ror:15
+// GFX12: v_minmax_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|exec_lo| row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x06,0x6a,0xd6,0xfa,0x04,0xfa,0xc1,0x01,0x2f,0x01,0xff]
 
-v_minmax_num_f16_e64_dpp v5, v1, v2, vcc_lo row_shr:1
-// GFX12: v_minmax_num_f16_e64_dpp v5, v1, v2, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff]
+v_minmax_num_f16_e64_dpp v5.l, |v1.l|, -v2.l, null row_share:0 row_mask:0xf bank_mask:0xf
+// GFX12: v_minmax_num_f16_e64_dpp v5.l, |v1.l|, -v2.l, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x6a,0xd6,0xfa,0x04,0xf2,0x41,0x01,0x50,0x01,0xff]
 
-v_minmax_num_f16_e64_dpp v5, |v1|, v2, -ttmp15 row_shr:15
-// GFX12: v_minmax_num_f16_e64_dpp v5, |v1|, v2, -ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x6a,0xd6,0xfa,0x04,0xee,0x81,0x01,0x1f,0x01,0xff]
+v_minmax_num_f16_e64_dpp v5.l, -v1.l, |v2.l|, -1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX12: v_minmax_num_f16_e64_dpp v5.l, -v1.l, |v2.l|, -1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x02,0x6a,0xd6,0xfa,0x04,0x06,0x2b,0x01,0x5f,0x01,0x01]
 
-v_minmax_num_f16_e64_dpp v5, v1, -|v2|, exec_hi row_ror:1
-// GFX12: v_minmax_num_f16_e64_dpp v5, v1, -|v2|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x6a,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff]
+v_minmax_num_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, 0.5 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: v_minmax_num_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, 0.5 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x03,0x6a,0xd6,0xfa,0x04,0xc2,0x73,0x01,0x60,0x09,0x13]
 
-v_minmax_num_f16_e64_dpp v5, -v1, v2, |exec_lo| row_ror:15
-// GFX12: v_minmax_num_f16_e64_dpp v5, -v1, v2, |exec_lo| row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x04,0x6a,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff]
+v_minmax_num_f16_e64_dpp v5.h, v1.h, v2.h, v3.h quad_perm:[3,2,1,0]
+// GFX12: v_minmax_num_f16_e64_dpp v5.h, v1.h, v2.h, v3.h op_sel:[1,1,1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x78,0x6a,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
 
-v_minmax_num_f16_e64_dpp v5, -|v1|, -|v2|, null row_share:0 row_mask:0xf bank_mask:0xf
-// GFX12: v_minmax_num_f16_e64_dpp v5, -|v1|, -|v2|, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x03,0x6a,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff]
+v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.h quad_perm:[0,1,2,3]
+// GFX12: v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.h op_sel:[0,0,1,0] quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x20,0x6a,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff]
 
-v_minmax_num_f16_e64_dpp v5, -|v1|, v2, -|-1| mul:2 row_share:15 row_mask:0x0 bank_mask:0x1
-// GFX12: v_minmax_num_f16_e64_dpp v5, -|v1|, v2, -|-1| mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x05,0x6a,0xd6,0xfa,0x04,0x06,0xab,0x01,0x5f,0x01,0x01]
+v_minmax_num_f16_e64_dpp v5.l, -v1.h, |v2.l|, -1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX12: v_minmax_num_f16_e64_dpp v5.l, -v1.h, |v2.l|, -1 op_sel:[1,0,0,0] mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x0a,0x6a,0xd6,0xfa,0x04,0x06,0x2b,0x01,0x5f,0x01,0x01]
 
-v_minmax_num_f16_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
-// GFX12: v_minmax_num_f16_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x06,0x6a,0xd6,0xfa,0x04,0xc2,0xd3,0x01,0x60,0x09,0x13]
+v_minmax_num_f16_e64_dpp v5.l, -|v1.l|, -|v2.h|, 0.5 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: v_minmax_num_f16_e64_dpp v5.l, -|v1.l|, -|v2.h|, 0.5 op_sel:[0,1,0,0] mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x13,0x6a,0xd6,0xfa,0x04,0xc2,0x73,0x01,0x60,0x09,0x13]
 
-v_minmax_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
-// GFX12: v_minmax_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x87,0x6a,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x05,0x30]
+v_minmax_num_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: v_minmax_num_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0xc7,0x6a,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x05,0x30]
 
 v_minmax_num_f32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0]
 // GFX12: v_minmax_num_f32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x68,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
@@ -5245,20 +5350,20 @@ v_div_fixup_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| op_sel:[0,0,1,0] row_xmask:0
 v_div_fixup_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1
 // GFX12: v_div_fixup_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc7,0x54,0xd6,0xfa,0xfe,0xf7,0xe3,0xff,0x6f,0x0d,0x30]
 
-v_fma_f16_e64_dpp v5, -v1, v2, |exec_lo| op_sel:[1,1,1,1] row_ror:15 row_mask:0xf bank_mask:0xf
-// GFX12: v_fma_f16_e64_dpp v5, -v1, v2, |exec_lo| op_sel:[1,1,1,1] row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x7c,0x48,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff]
+v_fma_f16_e64_dpp v5.h, -v1.h, v2.h, |exec_lo| op_sel:[1,1,1,1] row_ror:15 row_mask:0xf bank_mask:0xf
+// GFX12: v_fma_f16_e64_dpp v5.h, -v1.h, v2.h, |exec_lo| op_sel:[1,1,1,1] row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x7c,0x48,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff]
 
-v_fma_f16_e64_dpp v5, -|v1|, -|v2|, null op_sel:[1,0,0,0] row_share:0 row_mask:0xf bank_mask:0xf
-// GFX12: v_fma_f16_e64_dpp v5, -|v1|, -|v2|, null op_sel:[1,0,0,0] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x0b,0x48,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff]
+v_fma_f16_e64_dpp v5.l, -|v1.h|, -|v2.l|, null op_sel:[1,0,0,0] row_share:0 row_mask:0xf bank_mask:0xf
+// GFX12: v_fma_f16_e64_dpp v5.l, -|v1.h|, -|v2.l|, null op_sel:[1,0,0,0] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x0b,0x48,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff]
 
-v_fma_f16_e64_dpp v5, -|v1|, v2, -|-1| op_sel:[0,1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1
-// GFX12: v_fma_f16_e64_dpp v5, -|v1|, v2, -|-1| op_sel:[0,1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x15,0x48,0xd6,0xfa,0x04,0x06,0xa3,0x01,0x5f,0x01,0x01]
+v_fma_f16_e64_dpp v5.l, -|v1.l|, v2.h, -|-1| op_sel:[0,1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX12: v_fma_f16_e64_dpp v5.l, -|v1.l|, v2.h, -|-1| op_sel:[0,1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x15,0x48,0xd6,0xfa,0x04,0x06,0xa3,0x01,0x5f,0x01,0x01]
 
-v_fma_f16_e64_dpp v5, v1, -|v2|, -|0.5| op_sel:[0,0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3
-// GFX12: v_fma_f16_e64_dpp v5, v1, -|v2|, -|0.5| op_sel:[0,0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x26,0x48,0xd6,0xfa,0x04,0xc2,0xc3,0x01,0x60,0x01,0x13]
+v_fma_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| op_sel:[0,0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3
+// GFX12: v_fma_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| op_sel:[0,0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x26,0x48,0xd6,0xfa,0x04,0xc2,0xc3,0x01,0x60,0x01,0x13]
 
-v_fma_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1
-// GFX12: v_fma_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc7,0x48,0xd6,0xfa,0xfe,0xf7,0xe3,0xff,0x6f,0x0d,0x30]
+v_fma_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1
+// GFX12: v_fma_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc7,0x48,0xd6,0xfa,0xfe,0xf7,0xe3,0xff,0x6f,0x0d,0x30]
 
 v_mad_i16_e64_dpp v5.h, v1.h, v2.h, exec_hi op_sel:[1,1,1,1] row_ror:15 row_mask:0xf bank_mask:0xf
 // GFX12: v_mad_i16_e64_dpp v5.h, v1.h, v2.h, exec_hi op_sel:[1,1,1,1] row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x78,0x53,0xd6,0xfa,0x04,0xfe,0x01,0x01,0x2f,0x01,0xff]
@@ -5302,20 +5407,20 @@ v_mad_u32_u16_e64_dpp v5, v1, v2, 0.5 op_sel:[1,0,0,0] row_xmask:0 row_mask:0x1
 v_mad_u32_u16_e64_dpp v255, v255, v255, src_scc op_sel:[0,1,0,0] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1
 // GFX12: v_mad_u32_u16_e64_dpp v255, v255, v255, src_scc op_sel:[0,1,0,0] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x90,0x59,0xd6,0xfa,0xfe,0xf7,0x03,0xff,0x6f,0x0d,0x30]
 
-v_max3_num_f16_e64_dpp v5, -v1, v2, |exec_lo| op_sel:[1,1,1,1] row_ror:15 row_mask:0xf bank_mask:0xf
-// GFX12: v_max3_num_f16_e64_dpp v5, -v1, v2, |exec_lo| op_sel:[1,1,1,1] row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x7c,0x2c,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff]
+v_max3_num_f16_e64_dpp v5.h, -v1.h, v2.h, |exec_lo| op_sel:[1,1,1,1] row_ror:15 row_mask:0xf bank_mask:0xf
+// GFX12: v_max3_num_f16_e64_dpp v5.h, -v1.h, v2.h, |exec_lo| op_sel:[1,1,1,1] row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x7c,0x2c,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff]
 
-v_max3_num_f16_e64_dpp v5, -|v1|, -|v2|, null op_sel:[1,0,0,0] row_share:0 row_mask:0xf bank_mask:0xf
-// GFX12: v_max3_num_f16_e64_dpp v5, -|v1|, -|v2|, null op_sel:[1,0,0,0] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x0b,0x2c,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff]
+v_max3_num_f16_e64_dpp v5.l, -|v1.h|, -|v2.l|, null op_sel:[1,0,0,0] row_share:0 row_mask:0xf bank_mask:0xf
+// GFX12: v_max3_num_f16_e64_dpp v5.l, -|v1.h|, -|v2.l|, null op_sel:[1,0,0,0] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x0b,0x2c,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff]
 
-v_max3_num_f16_e64_dpp v5, -|v1|, v2, -|-1| op_sel:[0,1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1
-// GFX12: v_max3_num_f16_e64_dpp v5, -|v1|, v2, -|-1| op_sel:[0,1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x15,0x2c,0xd6,0xfa,0x04,0x06,0xa3,0x01,0x5f,0x01,0x01]
+v_max3_num_f16_e64_dpp v5.l, -|v1.l|, v2.h, -|-1| op_sel:[0,1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX12: v_max3_num_f16_e64_dpp v5.l, -|v1.l|, v2.h, -|-1| op_sel:[0,1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x15,0x2c,0xd6,0xfa,0x04,0x06,0xa3,0x01,0x5f,0x01,0x01]
 
-v_max3_num_f16_e64_dpp v5, v1, -|v2|, -|0.5| op_sel:[0,0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3
-// GFX12: v_max3_num_f16_e64_dpp v5, v1, -|v2|, -|0.5| op_sel:[0,0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x26,0x2c,0xd6,0xfa,0x04,0xc2,0xc3,0x01,0x60,0x01,0x13]
+v_max3_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| op_sel:[0,0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3
+// GFX12: v_max3_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| op_sel:[0,0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x26,0x2c,0xd6,0xfa,0x04,0xc2,0xc3,0x01,0x60,0x01,0x13]
 
-v_max3_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1
-// GFX12: v_max3_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc7,0x2c,0xd6,0xfa,0xfe,0xf7,0xe3,0xff,0x6f,0x0d,0x30]
+v_max3_num_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1
+// GFX12: v_max3_num_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc7,0x2c,0xd6,0xfa,0xfe,0xf7,0xe3,0xff,0x6f,0x0d,0x30]
 
 v_max3_i16_e64_dpp v5.h, v1.h, v2.h, exec_hi op_sel:[1,1,1,1] row_ror:15 row_mask:0xf bank_mask:0xf
 // GFX12: v_max3_i16_e64_dpp v5.h, v1.h, v2.h, exec_hi op_sel:[1,1,1,1] row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x78,0x4d,0xd6,0xfa,0x04,0xfe,0x01,0x01,0x2f,0x01,0xff]
@@ -5392,20 +5497,20 @@ v_med3_u16_e64_dpp v5.l, v1.l, v2.l, -1 op_sel:[0,0,1,0] row_xmask:0 row_mask:0x
 v_med3_u16_e64_dpp v255.h, v255.l, v255.l, src_scc op_sel:[0,0,0,1] row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1
 // GFX12: v_med3_u16_e64_dpp v255.h, v255.l, v255.l, src_scc op_sel:[0,0,0,1] row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x40,0x51,0xd6,0xfa,0xfe,0xf7,0x03,0xff,0x6f,0x0d,0x30]
 
-v_min3_num_f16_e64_dpp v5, -v1, v2, |exec_lo| op_sel:[1,1,1,1] row_ror:15 row_mask:0xf bank_mask:0xf
-// GFX12: v_min3_num_f16_e64_dpp v5, -v1, v2, |exec_lo| op_sel:[1,1,1,1] row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x7c,0x2b,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff]
+v_min3_num_f16_e64_dpp v5.h, -v1.h, v2.h, |exec_lo| op_sel:[1,1,1,1] row_ror:15 row_mask:0xf bank_mask:0xf
+// GFX12: v_min3_num_f16_e64_dpp v5.h, -v1.h, v2.h, |exec_lo| op_sel:[1,1,1,1] row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x7c,0x2b,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff]
 
-v_min3_num_f16_e64_dpp v5, -|v1|, -|v2|, null op_sel:[1,0,0,0] row_share:0 row_mask:0xf bank_mask:0xf
-// GFX12: v_min3_num_f16_e64_dpp v5, -|v1|, -|v2|, null op_sel:[1,0,0,0] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x0b,0x2b,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff]
+v_min3_num_f16_e64_dpp v5.l, -|v1.h|, -|v2.l|, null op_sel:[1,0,0,0] row_share:0 row_mask:0xf bank_mask:0xf
+// GFX12: v_min3_num_f16_e64_dpp v5.l, -|v1.h|, -|v2.l|, null op_sel:[1,0,0,0] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x0b,0x2b,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff]
 
-v_min3_num_f16_e64_dpp v5, -|v1|, v2, -|-1| op_sel:[0,1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1
-// GFX12: v_min3_num_f16_e64_dpp v5, -|v1|, v2, -|-1| op_sel:[0,1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x15,0x2b,0xd6,0xfa,0x04,0x06,0xa3,0x01,0x5f,0x01,0x01]
+v_min3_num_f16_e64_dpp v5.l, -|v1.l|, v2.h, -|-1| op_sel:[0,1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX12: v_min3_num_f16_e64_dpp v5.l, -|v1.l|, v2.h, -|-1| op_sel:[0,1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x15,0x2b,0xd6,0xfa,0x04,0x06,0xa3,0x01,0x5f,0x01,0x01]
 
-v_min3_num_f16_e64_dpp v5, v1, -|v2|, -|0.5| op_sel:[0,0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3
-// GFX12: v_min3_num_f16_e64_dpp v5, v1, -|v2|, -|0.5| op_sel:[0,0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x26,0x2b,0xd6,0xfa,0x04,0xc2,0xc3,0x01,0x60,0x01,0x13]
+v_min3_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| op_sel:[0,0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3
+// GFX12: v_min3_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| op_sel:[0,0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x26,0x2b,0xd6,0xfa,0x04,0xc2,0xc3,0x01,0x60,0x01,0x13]
 
-v_min3_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1
-// GFX12: v_min3_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc7,0x2b,0xd6,0xfa,0xfe,0xf7,0xe3,0xff,0x6f,0x0d,0x30]
+v_min3_num_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1
+// GFX12: v_min3_num_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc7,0x2b,0xd6,0xfa,0xfe,0xf7,0xe3,0xff,0x6f,0x0d,0x30]
 
 v_min3_i16_e64_dpp v5.h, v1.h, v2.h, exec_hi op_sel:[1,1,1,1] row_ror:15 row_mask:0xf bank_mask:0xf
 // GFX12: v_min3_i16_e64_dpp v5.h, v1.h, v2.h, exec_hi op_sel:[1,1,1,1] row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x78,0x4a,0xd6,0xfa,0x04,0xfe,0x01,0x01,0x2f,0x01,0xff]
diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp8.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp8.s
index f76dd26..e93a65e 100644
--- a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp8.s
+++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp8.s
@@ -1008,47 +1008,62 @@ v_div_fixup_f16_e64_dpp v5.l, -|v1.l|, -|v2.h|, 0.5 dpp8:[7,6,5,4,3,2,1,0] fi:1
 v_div_fixup_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0
 // GFX12: v_div_fixup_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0xc7,0x54,0xd6,0xe9,0xfe,0xf7,0xe3,0xff,0x00,0x00,0x00]
 
-v_fma_f16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_fma_f16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x48,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+v_fma_f16_e64_dpp v5.l, v1.l, v2.l, v3.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_fma_f16_e64_dpp v5.l, v1.l, v2.l, v3.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x48,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
 
-v_fma_f16_e64_dpp v5, v1, s2, v3 dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_fma_f16_e64_dpp v5, v1, s2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x48,0xd6,0xe9,0x04,0x0c,0x04,0x01,0x77,0x39,0x05]
+v_fma_f16_e64_dpp v5.l, v1.l, s2, v3.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_fma_f16_e64_dpp v5.l, v1.l, s2, v3.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x48,0xd6,0xe9,0x04,0x0c,0x04,0x01,0x77,0x39,0x05]
 
-v_fma_f16_e64_dpp v5, v1, 2.0, v3 dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_fma_f16_e64_dpp v5, v1, 2.0, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x48,0xd6,0xe9,0xe8,0x0d,0x04,0x01,0x77,0x39,0x05]
+v_fma_f16_e64_dpp v5.l, v1.l, 2.0, v3.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_fma_f16_e64_dpp v5.l, v1.l, 2.0, v3.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x48,0xd6,0xe9,0xe8,0x0d,0x04,0x01,0x77,0x39,0x05]
 
-v_fma_f16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_fma_f16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x48,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
+v_fma_f16_e64_dpp v5.l, v1.l, v2.l, v255.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_fma_f16_e64_dpp v5.l, v1.l, v2.l, v255.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x48,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
 
-v_fma_f16_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_fma_f16_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x48,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05]
+v_fma_f16_e64_dpp v5.l, v1.l, v2.l, s105 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_fma_f16_e64_dpp v5.l, v1.l, v2.l, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x48,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05]
 
-v_fma_f16_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_fma_f16_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x48,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05]
+v_fma_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_fma_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x48,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05]
 
-v_fma_f16_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_fma_f16_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x48,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05]
+v_fma_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_fma_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x48,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05]
 
-v_fma_f16_e64_dpp v5, |v1|, v2, -ttmp15 dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_fma_f16_e64_dpp v5, |v1|, v2, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x48,0xd6,0xe9,0x04,0xee,0x81,0x01,0x77,0x39,0x05]
+v_fma_f16_e64_dpp v5.l, |v1.l|, v2.l, -ttmp15 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_fma_f16_e64_dpp v5.l, |v1.l|, v2.l, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x48,0xd6,0xe9,0x04,0xee,0x81,0x01,0x77,0x39,0x05]
 
-v_fma_f16_e64_dpp v5, v1, -|v2|, exec_hi dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_fma_f16_e64_dpp v5, v1, -|v2|, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x48,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05]
+v_fma_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_fma_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x48,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05]
 
-v_fma_f16_e64_dpp v5, -v1, v2, |exec_lo| dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_fma_f16_e64_dpp v5, -v1, v2, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x04,0x48,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05]
+v_fma_f16_e64_dpp v5.l, -v1.l, v2.l, |exec_lo| dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_fma_f16_e64_dpp v5.l, -v1.l, v2.l, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x04,0x48,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05]
 
-v_fma_f16_e64_dpp v5, -|v1|, -|v2|, null dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_fma_f16_e64_dpp v5, -|v1|, -|v2|, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x03,0x48,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05]
+v_fma_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, null dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_fma_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x03,0x48,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05]
 
-v_fma_f16_e64_dpp v5, -|v1|, v2, -|-1| dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_fma_f16_e64_dpp v5, -|v1|, v2, -|-1| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x05,0x48,0xd6,0xe9,0x04,0x06,0xa3,0x01,0x77,0x39,0x05]
+v_fma_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|-1| dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_fma_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|-1| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x05,0x48,0xd6,0xe9,0x04,0x06,0xa3,0x01,0x77,0x39,0x05]
 
-v_fma_f16_e64_dpp v5, v1, -|v2|, -|0.5| dpp8:[7,6,5,4,3,2,1,0] fi:1
-// GFX12: v_fma_f16_e64_dpp v5, v1, -|v2|, -|0.5| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x06,0x48,0xd6,0xea,0x04,0xc2,0xc3,0x01,0x77,0x39,0x05]
+v_fma_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: v_fma_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x06,0x48,0xd6,0xea,0x04,0xc2,0xc3,0x01,0x77,0x39,0x05]
 
-v_fma_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0
-// GFX12: v_fma_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x87,0x48,0xd6,0xe9,0xfe,0xf7,0xe3,0xff,0x00,0x00,0x00]
+v_fma_f16_e64_dpp v255.l, -|v255.l|, -|v255.l|, -|src_scc| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: v_fma_f16_e64_dpp v255.l, -|v255.l|, -|v255.l|, -|src_scc| clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x87,0x48,0xd6,0xe9,0xfe,0xf7,0xe3,0xff,0x00,0x00,0x00]
+
+v_fma_f16_e64_dpp v5.h, v1.h, v2.h, v3.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_fma_f16_e64_dpp v5.h, v1.h, v2.h, v3.h op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x78,0x48,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+
+v_fma_f16_e64_dpp v5.l, v1.l, v2.l, v255.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_fma_f16_e64_dpp v5.l, v1.l, v2.l, v255.h op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x20,0x48,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
+
+v_fma_f16_e64_dpp v5.l, -v1.h, |v2.l|, -1 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_fma_f16_e64_dpp v5.l, -v1.h, |v2.l|, -1 op_sel:[1,0,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x0a,0x48,0xd6,0xe9,0x04,0x06,0x23,0x01,0x77,0x39,0x05]
+
+v_fma_f16_e64_dpp v5.l, -|v1.l|, -|v2.h|, 0.5 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: v_fma_f16_e64_dpp v5.l, -|v1.l|, -|v2.h|, 0.5 op_sel:[0,1,0,0] dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x13,0x48,0xd6,0xea,0x04,0xc2,0x63,0x01,0x77,0x39,0x05]
+
+v_fma_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: v_fma_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0xc7,0x48,0xd6,0xe9,0xfe,0xf7,0xe3,0xff,0x00,0x00,0x00]
 
 v_fma_f32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0]
 // GFX12: v_fma_f32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x13,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
@@ -1545,47 +1560,47 @@ v_mad_u32_u24_e64_dpp v5, v1, v2, 0.5 dpp8:[7,6,5,4,3,2,1,0] fi:1
 v_mad_u32_u24_e64_dpp v255, v255, v255, src_scc clamp dpp8:[0,0,0,0,0,0,0,0] fi:0
 // GFX12: v_mad_u32_u24_e64_dpp v255, v255, v255, src_scc clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x80,0x0b,0xd6,0xe9,0xfe,0xf7,0x03,0xff,0x00,0x00,0x00]
 
-v_max3_num_f16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_max3_num_f16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2c,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+v_max3_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_max3_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2c,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
 
-v_max3_num_f16_e64_dpp v5, v1, s2, v3 dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_max3_num_f16_e64_dpp v5, v1, s2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2c,0xd6,0xe9,0x04,0x0c,0x04,0x01,0x77,0x39,0x05]
+v_max3_num_f16_e64_dpp v5.l, v1.l, s2, v3.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_max3_num_f16_e64_dpp v5.l, v1.l, s2, v3.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2c,0xd6,0xe9,0x04,0x0c,0x04,0x01,0x77,0x39,0x05]
 
-v_max3_num_f16_e64_dpp v5, v1, 2.0, v3 dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_max3_num_f16_e64_dpp v5, v1, 2.0, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2c,0xd6,0xe9,0xe8,0x0d,0x04,0x01,0x77,0x39,0x05]
+v_max3_num_f16_e64_dpp v5.l, v1.l, 2.0, v3.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_max3_num_f16_e64_dpp v5.l, v1.l, 2.0, v3.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2c,0xd6,0xe9,0xe8,0x0d,0x04,0x01,0x77,0x39,0x05]
 
-v_max3_num_f16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_max3_num_f16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2c,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
+v_max3_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_max3_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2c,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
 
-v_max3_num_f16_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_max3_num_f16_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2c,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05]
+v_max3_num_f16_e64_dpp v5.l, v1.l, v2.l, s105 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_max3_num_f16_e64_dpp v5.l, v1.l, v2.l, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2c,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05]
 
-v_max3_num_f16_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_max3_num_f16_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2c,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05]
+v_max3_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_max3_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2c,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05]
 
-v_max3_num_f16_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_max3_num_f16_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2c,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05]
+v_max3_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_max3_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2c,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05]
 
-v_max3_num_f16_e64_dpp v5, |v1|, v2, -ttmp15 dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_max3_num_f16_e64_dpp v5, |v1|, v2, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x2c,0xd6,0xe9,0x04,0xee,0x81,0x01,0x77,0x39,0x05]
+v_max3_num_f16_e64_dpp v5.l, |v1.l|, v2.l, -ttmp15 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_max3_num_f16_e64_dpp v5.l, |v1.l|, v2.l, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x2c,0xd6,0xe9,0x04,0xee,0x81,0x01,0x77,0x39,0x05]
 
-v_max3_num_f16_e64_dpp v5, v1, -|v2|, exec_hi dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_max3_num_f16_e64_dpp v5, v1, -|v2|, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x2c,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05]
+v_max3_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_max3_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x2c,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05]
 
-v_max3_num_f16_e64_dpp v5, -v1, v2, |exec_lo| dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_max3_num_f16_e64_dpp v5, -v1, v2, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x04,0x2c,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05]
+v_max3_num_f16_e64_dpp v5.l, -v1.l, v2.l, |exec_lo| dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_max3_num_f16_e64_dpp v5.l, -v1.l, v2.l, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x04,0x2c,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05]
 
-v_max3_num_f16_e64_dpp v5, -|v1|, -|v2|, null dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_max3_num_f16_e64_dpp v5, -|v1|, -|v2|, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x03,0x2c,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05]
+v_max3_num_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, null dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_max3_num_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x03,0x2c,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05]
 
-v_max3_num_f16_e64_dpp v5, -|v1|, v2, -|-1| dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_max3_num_f16_e64_dpp v5, -|v1|, v2, -|-1| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x05,0x2c,0xd6,0xe9,0x04,0x06,0xa3,0x01,0x77,0x39,0x05]
+v_max3_num_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|-1| dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_max3_num_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|-1| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x05,0x2c,0xd6,0xe9,0x04,0x06,0xa3,0x01,0x77,0x39,0x05]
 
-v_max3_num_f16_e64_dpp v5, v1, -|v2|, -|0.5| dpp8:[7,6,5,4,3,2,1,0] fi:1
-// GFX12: v_max3_num_f16_e64_dpp v5, v1, -|v2|, -|0.5| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x06,0x2c,0xd6,0xea,0x04,0xc2,0xc3,0x01,0x77,0x39,0x05]
+v_max3_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: v_max3_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x06,0x2c,0xd6,0xea,0x04,0xc2,0xc3,0x01,0x77,0x39,0x05]
 
-v_max3_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp dpp8:[0,0,0,0,0,0,0,0]
-// GFX12: v_max3_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x87,0x2c,0xd6,0xe9,0xfe,0xf7,0xe3,0xff,0x00,0x00,0x00]
+v_max3_num_f16_e64_dpp v255.l, -|v255.l|, -|v255.l|, -|src_scc| clamp dpp8:[0,0,0,0,0,0,0,0]
+// GFX12: v_max3_num_f16_e64_dpp v255.l, -|v255.l|, -|v255.l|, -|src_scc| clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x87,0x2c,0xd6,0xe9,0xfe,0xf7,0xe3,0xff,0x00,0x00,0x00]
 
 v_max3_num_f32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0]
 // GFX12: v_max3_num_f32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2a,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
@@ -1863,47 +1878,86 @@ v_max_u16_e64_dpp v5.l, v1.l, v2.h dpp8:[7,6,5,4,3,2,1,0] fi:1
 v_max_u16_e64_dpp v255.h, v255.l, v255.l dpp8:[0,0,0,0,0,0,0,0] fi:0
 // GFX12: v_max_u16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x40,0x09,0xd7,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
 
-v_maxmin_num_f16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_maxmin_num_f16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6b,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6b,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+
+v_maxmin_num_f16_e64_dpp v5.l, v1.l, s2, v3.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_maxmin_num_f16_e64_dpp v5.l, v1.l, s2, v3.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6b,0xd6,0xe9,0x04,0x0c,0x04,0x01,0x77,0x39,0x05]
+
+v_maxmin_num_f16_e64_dpp v5.l, v1.l, 2.0, v3.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_maxmin_num_f16_e64_dpp v5.l, v1.l, 2.0, v3.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6b,0xd6,0xe9,0xe8,0x0d,0x04,0x01,0x77,0x39,0x05]
+
+v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6b,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
+
+v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, s105 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6b,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05]
+
+v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6b,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05]
+
+v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6b,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05]
+
+v_maxmin_num_f16_e64_dpp v5.l, |v1.l|, v2.l, -ttmp15 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_maxmin_num_f16_e64_dpp v5.l, |v1.l|, v2.l, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x6b,0xd6,0xe9,0x04,0xee,0x81,0x01,0x77,0x39,0x05]
+
+v_maxmin_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_maxmin_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x6b,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05]
+
+v_maxmin_num_f16_e64_dpp v5.l, -v1.l, v2.l, |exec_lo| dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_maxmin_num_f16_e64_dpp v5.l, -v1.l, v2.l, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x04,0x6b,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05]
+
+v_maxmin_num_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, null dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_maxmin_num_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x03,0x6b,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05]
+
+v_maxmin_num_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|-1| mul:2 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_maxmin_num_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|-1| mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x05,0x6b,0xd6,0xe9,0x04,0x06,0xab,0x01,0x77,0x39,0x05]
 
-v_maxmin_num_f16_e64_dpp v5, v1, s2, v3 dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_maxmin_num_f16_e64_dpp v5, v1, s2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6b,0xd6,0xe9,0x04,0x0c,0x04,0x01,0x77,0x39,0x05]
+v_maxmin_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: v_maxmin_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x06,0x6b,0xd6,0xea,0x04,0xc2,0xd3,0x01,0x77,0x39,0x05]
 
-v_maxmin_num_f16_e64_dpp v5, v1, 2.0, v3 dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_maxmin_num_f16_e64_dpp v5, v1, 2.0, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6b,0xd6,0xe9,0xe8,0x0d,0x04,0x01,0x77,0x39,0x05]
+v_maxmin_num_f16_e64_dpp v255.l, -|v255.l|, -|v255.l|, -|src_scc| clamp div:2 dpp8:[0,0,0,0,0,0,0,0]
+// GFX12: v_maxmin_num_f16_e64_dpp v255.l, -|v255.l|, -|v255.l|, -|src_scc| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x87,0x6b,0xd6,0xe9,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00]
 
-v_maxmin_num_f16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_maxmin_num_f16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6b,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
+v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, s3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6b,0xd6,0xe9,0x04,0x0e,0x00,0x01,0x77,0x39,0x05]
 
-v_maxmin_num_f16_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_maxmin_num_f16_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6b,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05]
+v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, ttmp15 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6b,0xd6,0xe9,0x04,0xee,0x01,0x01,0x77,0x39,0x05]
 
-v_maxmin_num_f16_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_maxmin_num_f16_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6b,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05]
+v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, -|m0| dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, -|m0| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x04,0x6b,0xd6,0xe9,0x04,0xf6,0x81,0x01,0x77,0x39,0x05]
 
-v_maxmin_num_f16_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_maxmin_num_f16_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6b,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05]
+v_maxmin_num_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|exec_hi| dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_maxmin_num_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|exec_hi| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x05,0x6b,0xd6,0xe9,0x04,0xfe,0xa1,0x01,0x77,0x39,0x05]
 
-v_maxmin_num_f16_e64_dpp v5, |v1|, v2, -ttmp15 dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_maxmin_num_f16_e64_dpp v5, |v1|, v2, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x6b,0xd6,0xe9,0x04,0xee,0x81,0x01,0x77,0x39,0x05]
+v_maxmin_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|exec_lo| dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_maxmin_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|exec_lo| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x06,0x6b,0xd6,0xe9,0x04,0xfa,0xc1,0x01,0x77,0x39,0x05]
 
-v_maxmin_num_f16_e64_dpp v5, v1, -|v2|, exec_hi dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_maxmin_num_f16_e64_dpp v5, v1, -|v2|, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x6b,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05]
+v_maxmin_num_f16_e64_dpp v5.l, |v1.l|, -v2.l, null dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_maxmin_num_f16_e64_dpp v5.l, |v1.l|, -v2.l, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x6b,0xd6,0xe9,0x04,0xf2,0x41,0x01,0x77,0x39,0x05]
 
-v_maxmin_num_f16_e64_dpp v5, -v1, v2, |exec_lo| dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_maxmin_num_f16_e64_dpp v5, -v1, v2, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x04,0x6b,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05]
+v_maxmin_num_f16_e64_dpp v5.l, -v1.l, |v2.l|, -1 mul:2 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_maxmin_num_f16_e64_dpp v5.l, -v1.l, |v2.l|, -1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x6b,0xd6,0xe9,0x04,0x06,0x2b,0x01,0x77,0x39,0x05]
 
-v_maxmin_num_f16_e64_dpp v5, -|v1|, -|v2|, null dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_maxmin_num_f16_e64_dpp v5, -|v1|, -|v2|, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x03,0x6b,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05]
+v_maxmin_num_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, 0.5 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: v_maxmin_num_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, 0.5 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x03,0x6b,0xd6,0xea,0x04,0xc2,0x73,0x01,0x77,0x39,0x05]
 
-v_maxmin_num_f16_e64_dpp v5, -|v1|, v2, -|-1| mul:2 dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_maxmin_num_f16_e64_dpp v5, -|v1|, v2, -|-1| mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x05,0x6b,0xd6,0xe9,0x04,0x06,0xab,0x01,0x77,0x39,0x05]
+v_maxmin_num_f16_e64_dpp v5.h, v1.h, v2.h, v3.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_maxmin_num_f16_e64_dpp v5.h, v1.h, v2.h, v3.h op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x78,0x6b,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
 
-v_maxmin_num_f16_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1
-// GFX12: v_maxmin_num_f16_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x06,0x6b,0xd6,0xea,0x04,0xc2,0xd3,0x01,0x77,0x39,0x05]
+v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.h op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x20,0x6b,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
 
-v_maxmin_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 dpp8:[0,0,0,0,0,0,0,0]
-// GFX12: v_maxmin_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x87,0x6b,0xd6,0xe9,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00]
+v_maxmin_num_f16_e64_dpp v5.l, -v1.h, |v2.l|, -1 mul:2 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_maxmin_num_f16_e64_dpp v5.l, -v1.h, |v2.l|, -1 op_sel:[1,0,0,0] mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x0a,0x6b,0xd6,0xe9,0x04,0x06,0x2b,0x01,0x77,0x39,0x05]
+
+v_maxmin_num_f16_e64_dpp v5.l, -|v1.l|, -|v2.h|, 0.5 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: v_maxmin_num_f16_e64_dpp v5.l, -|v1.l|, -|v2.h|, 0.5 op_sel:[0,1,0,0] mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x13,0x6b,0xd6,0xea,0x04,0xc2,0x73,0x01,0x77,0x39,0x05]
+
+v_maxmin_num_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: v_maxmin_num_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0xc7,0x6b,0xd6,0xe9,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00]
 
 v_maxmin_num_f32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0]
 // GFX12: v_maxmin_num_f32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x69,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
@@ -2340,47 +2394,47 @@ v_med3_u32_e64_dpp v5, v1, v2, 0.5 dpp8:[7,6,5,4,3,2,1,0] fi:1
 v_med3_u32_e64_dpp v255, v255, v255, src_scc dpp8:[0,0,0,0,0,0,0,0] fi:0
 // GFX12: v_med3_u32_e64_dpp v255, v255, v255, src_scc dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x00,0x21,0xd6,0xe9,0xfe,0xf7,0x03,0xff,0x00,0x00,0x00]
 
-v_min3_num_f16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_min3_num_f16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2b,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+v_min3_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_min3_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2b,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
 
-v_min3_num_f16_e64_dpp v5, v1, s2, v3 dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_min3_num_f16_e64_dpp v5, v1, s2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2b,0xd6,0xe9,0x04,0x0c,0x04,0x01,0x77,0x39,0x05]
+v_min3_num_f16_e64_dpp v5.l, v1.l, s2, v3.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_min3_num_f16_e64_dpp v5.l, v1.l, s2, v3.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2b,0xd6,0xe9,0x04,0x0c,0x04,0x01,0x77,0x39,0x05]
 
-v_min3_num_f16_e64_dpp v5, v1, 2.0, v3 dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_min3_num_f16_e64_dpp v5, v1, 2.0, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2b,0xd6,0xe9,0xe8,0x0d,0x04,0x01,0x77,0x39,0x05]
+v_min3_num_f16_e64_dpp v5.l, v1.l, 2.0, v3.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_min3_num_f16_e64_dpp v5.l, v1.l, 2.0, v3.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2b,0xd6,0xe9,0xe8,0x0d,0x04,0x01,0x77,0x39,0x05]
 
-v_min3_num_f16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_min3_num_f16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2b,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
+v_min3_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_min3_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2b,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
 
-v_min3_num_f16_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_min3_num_f16_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2b,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05]
+v_min3_num_f16_e64_dpp v5.l, v1.l, v2.l, s105 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_min3_num_f16_e64_dpp v5.l, v1.l, v2.l, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2b,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05]
 
-v_min3_num_f16_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_min3_num_f16_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2b,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05]
+v_min3_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_min3_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2b,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05]
 
-v_min3_num_f16_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_min3_num_f16_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2b,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05]
+v_min3_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_min3_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2b,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05]
 
-v_min3_num_f16_e64_dpp v5, |v1|, v2, -ttmp15 dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_min3_num_f16_e64_dpp v5, |v1|, v2, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x2b,0xd6,0xe9,0x04,0xee,0x81,0x01,0x77,0x39,0x05]
+v_min3_num_f16_e64_dpp v5.l, |v1.l|, v2.l, -ttmp15 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_min3_num_f16_e64_dpp v5.l, |v1.l|, v2.l, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x2b,0xd6,0xe9,0x04,0xee,0x81,0x01,0x77,0x39,0x05]
 
-v_min3_num_f16_e64_dpp v5, v1, -|v2|, exec_hi dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_min3_num_f16_e64_dpp v5, v1, -|v2|, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x2b,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05]
+v_min3_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_min3_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x2b,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05]
 
-v_min3_num_f16_e64_dpp v5, -v1, v2, |exec_lo| dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_min3_num_f16_e64_dpp v5, -v1, v2, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x04,0x2b,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05]
+v_min3_num_f16_e64_dpp v5.l, -v1.l, v2.l, |exec_lo| dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_min3_num_f16_e64_dpp v5.l, -v1.l, v2.l, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x04,0x2b,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05]
 
-v_min3_num_f16_e64_dpp v5, -|v1|, -|v2|, null dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_min3_num_f16_e64_dpp v5, -|v1|, -|v2|, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x03,0x2b,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05]
+v_min3_num_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, null dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_min3_num_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x03,0x2b,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05]
 
-v_min3_num_f16_e64_dpp v5, -|v1|, v2, -|-1| dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_min3_num_f16_e64_dpp v5, -|v1|, v2, -|-1| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x05,0x2b,0xd6,0xe9,0x04,0x06,0xa3,0x01,0x77,0x39,0x05]
+v_min3_num_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|-1| dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_min3_num_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|-1| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x05,0x2b,0xd6,0xe9,0x04,0x06,0xa3,0x01,0x77,0x39,0x05]
 
-v_min3_num_f16_e64_dpp v5, v1, -|v2|, -|0.5| dpp8:[7,6,5,4,3,2,1,0] fi:1
-// GFX12: v_min3_num_f16_e64_dpp v5, v1, -|v2|, -|0.5| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x06,0x2b,0xd6,0xea,0x04,0xc2,0xc3,0x01,0x77,0x39,0x05]
+v_min3_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: v_min3_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x06,0x2b,0xd6,0xea,0x04,0xc2,0xc3,0x01,0x77,0x39,0x05]
 
-v_min3_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp dpp8:[0,0,0,0,0,0,0,0]
-// GFX12: v_min3_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x87,0x2b,0xd6,0xe9,0xfe,0xf7,0xe3,0xff,0x00,0x00,0x00]
+v_min3_num_f16_e64_dpp v255.l, -|v255.l|, -|v255.l|, -|src_scc| clamp dpp8:[0,0,0,0,0,0,0,0]
+// GFX12: v_min3_num_f16_e64_dpp v255.l, -|v255.l|, -|v255.l|, -|src_scc| clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x87,0x2b,0xd6,0xe9,0xfe,0xf7,0xe3,0xff,0x00,0x00,0x00]
 
 v_min3_num_f32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0]
 // GFX12: v_min3_num_f32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x29,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
@@ -2658,47 +2712,86 @@ v_min_u16_e64_dpp v5.l, v1.l, v2.h dpp8:[7,6,5,4,3,2,1,0] fi:1
 v_min_u16_e64_dpp v255.h, v255.l, v255.l dpp8:[0,0,0,0,0,0,0,0] fi:0
 // GFX12: v_min_u16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x40,0x0b,0xd7,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
 
-v_minmax_num_f16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_minmax_num_f16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6a,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6a,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+
+v_minmax_num_f16_e64_dpp v5.l, v1.l, s2, v3.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_minmax_num_f16_e64_dpp v5.l, v1.l, s2, v3.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6a,0xd6,0xe9,0x04,0x0c,0x04,0x01,0x77,0x39,0x05]
+
+v_minmax_num_f16_e64_dpp v5.l, v1.l, 2.0, v3.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_minmax_num_f16_e64_dpp v5.l, v1.l, 2.0, v3.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6a,0xd6,0xe9,0xe8,0x0d,0x04,0x01,0x77,0x39,0x05]
+
+v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6a,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
+
+v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, s105 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6a,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05]
+
+v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6a,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05]
+
+v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6a,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05]
+
+v_minmax_num_f16_e64_dpp v5.l, |v1.l|, v2.l, -ttmp15 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_minmax_num_f16_e64_dpp v5.l, |v1.l|, v2.l, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x6a,0xd6,0xe9,0x04,0xee,0x81,0x01,0x77,0x39,0x05]
+
+v_minmax_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_minmax_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x6a,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05]
+
+v_minmax_num_f16_e64_dpp v5.l, -v1.l, v2.l, |exec_lo| dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_minmax_num_f16_e64_dpp v5.l, -v1.l, v2.l, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x04,0x6a,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05]
+
+v_minmax_num_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, null dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_minmax_num_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x03,0x6a,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05]
+
+v_minmax_num_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|-1| mul:2 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_minmax_num_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|-1| mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x05,0x6a,0xd6,0xe9,0x04,0x06,0xab,0x01,0x77,0x39,0x05]
+
+v_minmax_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: v_minmax_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x06,0x6a,0xd6,0xea,0x04,0xc2,0xd3,0x01,0x77,0x39,0x05]
+
+v_minmax_num_f16_e64_dpp v255.l, -|v255.l|, -|v255.l|, -|src_scc| clamp div:2 dpp8:[0,0,0,0,0,0,0,0]
+// GFX12: v_minmax_num_f16_e64_dpp v255.l, -|v255.l|, -|v255.l|, -|src_scc| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x87,0x6a,0xd6,0xe9,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00]
 
-v_minmax_num_f16_e64_dpp v5, v1, s2, v3 dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_minmax_num_f16_e64_dpp v5, v1, s2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6a,0xd6,0xe9,0x04,0x0c,0x04,0x01,0x77,0x39,0x05]
+v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, s3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6a,0xd6,0xe9,0x04,0x0e,0x00,0x01,0x77,0x39,0x05]
 
-v_minmax_num_f16_e64_dpp v5, v1, 2.0, v3 dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_minmax_num_f16_e64_dpp v5, v1, 2.0, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6a,0xd6,0xe9,0xe8,0x0d,0x04,0x01,0x77,0x39,0x05]
+v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, ttmp15 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6a,0xd6,0xe9,0x04,0xee,0x01,0x01,0x77,0x39,0x05]
 
-v_minmax_num_f16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_minmax_num_f16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6a,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
+v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, -|m0| dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, -|m0| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x04,0x6a,0xd6,0xe9,0x04,0xf6,0x81,0x01,0x77,0x39,0x05]
 
-v_minmax_num_f16_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_minmax_num_f16_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6a,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05]
+v_minmax_num_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|exec_hi| dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_minmax_num_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|exec_hi| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x05,0x6a,0xd6,0xe9,0x04,0xfe,0xa1,0x01,0x77,0x39,0x05]
 
-v_minmax_num_f16_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_minmax_num_f16_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6a,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05]
+v_minmax_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|exec_lo| dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_minmax_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|exec_lo| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x06,0x6a,0xd6,0xe9,0x04,0xfa,0xc1,0x01,0x77,0x39,0x05]
 
-v_minmax_num_f16_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_minmax_num_f16_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6a,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05]
+v_minmax_num_f16_e64_dpp v5.l, |v1.l|, -v2.l, null dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_minmax_num_f16_e64_dpp v5.l, |v1.l|, -v2.l, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x6a,0xd6,0xe9,0x04,0xf2,0x41,0x01,0x77,0x39,0x05]
 
-v_minmax_num_f16_e64_dpp v5, |v1|, v2, -ttmp15 dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_minmax_num_f16_e64_dpp v5, |v1|, v2, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x6a,0xd6,0xe9,0x04,0xee,0x81,0x01,0x77,0x39,0x05]
+v_minmax_num_f16_e64_dpp v5.l, -v1.l, |v2.l|, -1 mul:2 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_minmax_num_f16_e64_dpp v5.l, -v1.l, |v2.l|, -1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x6a,0xd6,0xe9,0x04,0x06,0x2b,0x01,0x77,0x39,0x05]
 
-v_minmax_num_f16_e64_dpp v5, v1, -|v2|, exec_hi dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_minmax_num_f16_e64_dpp v5, v1, -|v2|, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x6a,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05]
+v_minmax_num_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, 0.5 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: v_minmax_num_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, 0.5 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x03,0x6a,0xd6,0xea,0x04,0xc2,0x73,0x01,0x77,0x39,0x05]
 
-v_minmax_num_f16_e64_dpp v5, -v1, v2, |exec_lo| dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_minmax_num_f16_e64_dpp v5, -v1, v2, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x04,0x6a,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05]
+v_minmax_num_f16_e64_dpp v5.h, v1.h, v2.h, v3.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_minmax_num_f16_e64_dpp v5.h, v1.h, v2.h, v3.h op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x78,0x6a,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
 
-v_minmax_num_f16_e64_dpp v5, -|v1|, -|v2|, null dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_minmax_num_f16_e64_dpp v5, -|v1|, -|v2|, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x03,0x6a,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05]
+v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.h op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x20,0x6a,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
 
-v_minmax_num_f16_e64_dpp v5, -|v1|, v2, -|-1| mul:2 dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_minmax_num_f16_e64_dpp v5, -|v1|, v2, -|-1| mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x05,0x6a,0xd6,0xe9,0x04,0x06,0xab,0x01,0x77,0x39,0x05]
+v_minmax_num_f16_e64_dpp v5.l, -v1.h, |v2.l|, -1 mul:2 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_minmax_num_f16_e64_dpp v5.l, -v1.h, |v2.l|, -1 op_sel:[1,0,0,0] mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x0a,0x6a,0xd6,0xe9,0x04,0x06,0x2b,0x01,0x77,0x39,0x05]
 
-v_minmax_num_f16_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1
-// GFX12: v_minmax_num_f16_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x06,0x6a,0xd6,0xea,0x04,0xc2,0xd3,0x01,0x77,0x39,0x05]
+v_minmax_num_f16_e64_dpp v5.l, -|v1.l|, -|v2.h|, 0.5 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: v_minmax_num_f16_e64_dpp v5.l, -|v1.l|, -|v2.h|, 0.5 op_sel:[0,1,0,0] mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x13,0x6a,0xd6,0xea,0x04,0xc2,0x73,0x01,0x77,0x39,0x05]
 
-v_minmax_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 dpp8:[0,0,0,0,0,0,0,0]
-// GFX12: v_minmax_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x87,0x6a,0xd6,0xe9,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00]
+v_minmax_num_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: v_minmax_num_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0xc7,0x6a,0xd6,0xe9,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00]
 
 v_minmax_num_f32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0]
 // GFX12: v_minmax_num_f32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x68,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
@@ -3514,20 +3607,20 @@ v_div_fixup_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| op_sel:[0,0,1,0] dpp8:[7,6,5
 v_div_fixup_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1
 // GFX12: v_div_fixup_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc7,0x54,0xd6,0xea,0xfe,0xf7,0xe3,0xff,0x00,0x00,0x00]
 
-v_fma_f16_e64_dpp v5, -v1, v2, |exec_lo| op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_fma_f16_e64_dpp v5, -v1, v2, |exec_lo| op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x7c,0x48,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05]
+v_fma_f16_e64_dpp v5.h, -v1.h, v2.h, |exec_lo| op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_fma_f16_e64_dpp v5.h, -v1.h, v2.h, |exec_lo| op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x7c,0x48,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05]
 
-v_fma_f16_e64_dpp v5, -|v1|, -|v2|, null op_sel:[1,0,0,0] dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_fma_f16_e64_dpp v5, -|v1|, -|v2|, null op_sel:[1,0,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x0b,0x48,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05]
+v_fma_f16_e64_dpp v5.l, -|v1.h|, -|v2.l|, null op_sel:[1,0,0,0] dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_fma_f16_e64_dpp v5.l, -|v1.h|, -|v2.l|, null op_sel:[1,0,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x0b,0x48,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05]
 
-v_fma_f16_e64_dpp v5, -|v1|, v2, -|-1| op_sel:[0,1,0,0] dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_fma_f16_e64_dpp v5, -|v1|, v2, -|-1| op_sel:[0,1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x15,0x48,0xd6,0xe9,0x04,0x06,0xa3,0x01,0x77,0x39,0x05]
+v_fma_f16_e64_dpp v5.l, -|v1.l|, v2.h, -|-1| op_sel:[0,1,0,0] dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_fma_f16_e64_dpp v5.l, -|v1.l|, v2.h, -|-1| op_sel:[0,1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x15,0x48,0xd6,0xe9,0x04,0x06,0xa3,0x01,0x77,0x39,0x05]
 
-v_fma_f16_e64_dpp v5, v1, -|v2|, -|0.5| op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_fma_f16_e64_dpp v5, v1, -|v2|, -|0.5| op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x26,0x48,0xd6,0xe9,0x04,0xc2,0xc3,0x01,0x77,0x39,0x05]
+v_fma_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_fma_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x26,0x48,0xd6,0xe9,0x04,0xc2,0xc3,0x01,0x77,0x39,0x05]
 
-v_fma_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1
-// GFX12: v_fma_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc7,0x48,0xd6,0xea,0xfe,0xf7,0xe3,0xff,0x00,0x00,0x00]
+v_fma_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1
+// GFX12: v_fma_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc7,0x48,0xd6,0xea,0xfe,0xf7,0xe3,0xff,0x00,0x00,0x00]
 
 v_mad_i16_e64_dpp v5.h, v1.h, v2.h, exec_hi op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0]
 // GFX12: v_mad_i16_e64_dpp v5.h, v1.h, v2.h, exec_hi op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x78,0x53,0xd6,0xe9,0x04,0xfe,0x01,0x01,0x77,0x39,0x05]
@@ -3571,20 +3664,20 @@ v_mad_u32_u16_e64_dpp v5, v1, v2, 0.5 op_sel:[1,0,0,0] dpp8:[7,6,5,4,3,2,1,0]
 v_mad_u32_u16_e64_dpp v255, v255, v255, src_scc op_sel:[0,1,0,0] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1
 // GFX12: v_mad_u32_u16_e64_dpp v255, v255, v255, src_scc op_sel:[0,1,0,0] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x90,0x59,0xd6,0xea,0xfe,0xf7,0x03,0xff,0x00,0x00,0x00]
 
-v_max3_num_f16_e64_dpp v5, -v1, v2, |exec_lo| op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_max3_num_f16_e64_dpp v5, -v1, v2, |exec_lo| op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x7c,0x2c,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05]
+v_max3_num_f16_e64_dpp v5.h, -v1.h, v2.h, |exec_lo| op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_max3_num_f16_e64_dpp v5.h, -v1.h, v2.h, |exec_lo| op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x7c,0x2c,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05]
 
-v_max3_num_f16_e64_dpp v5, -|v1|, -|v2|, null op_sel:[1,0,0,0] dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_max3_num_f16_e64_dpp v5, -|v1|, -|v2|, null op_sel:[1,0,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x0b,0x2c,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05]
+v_max3_num_f16_e64_dpp v5.l, -|v1.h|, -|v2.l|, null op_sel:[1,0,0,0] dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_max3_num_f16_e64_dpp v5.l, -|v1.h|, -|v2.l|, null op_sel:[1,0,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x0b,0x2c,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05]
 
-v_max3_num_f16_e64_dpp v5, -|v1|, v2, -|-1| op_sel:[0,1,0,0] dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_max3_num_f16_e64_dpp v5, -|v1|, v2, -|-1| op_sel:[0,1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x15,0x2c,0xd6,0xe9,0x04,0x06,0xa3,0x01,0x77,0x39,0x05]
+v_max3_num_f16_e64_dpp v5.l, -|v1.l|, v2.h, -|-1| op_sel:[0,1,0,0] dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_max3_num_f16_e64_dpp v5.l, -|v1.l|, v2.h, -|-1| op_sel:[0,1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x15,0x2c,0xd6,0xe9,0x04,0x06,0xa3,0x01,0x77,0x39,0x05]
 
-v_max3_num_f16_e64_dpp v5, v1, -|v2|, -|0.5| op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_max3_num_f16_e64_dpp v5, v1, -|v2|, -|0.5| op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x26,0x2c,0xd6,0xe9,0x04,0xc2,0xc3,0x01,0x77,0x39,0x05]
+v_max3_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_max3_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x26,0x2c,0xd6,0xe9,0x04,0xc2,0xc3,0x01,0x77,0x39,0x05]
 
-v_max3_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1
-// GFX12: v_max3_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc7,0x2c,0xd6,0xea,0xfe,0xf7,0xe3,0xff,0x00,0x00,0x00]
+v_max3_num_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1
+// GFX12: v_max3_num_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc7,0x2c,0xd6,0xea,0xfe,0xf7,0xe3,0xff,0x00,0x00,0x00]
 
 v_max3_i16_e64_dpp v5.h, v1.h, v2.h, exec_hi op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0]
 // GFX12: v_max3_i16_e64_dpp v5.h, v1.h, v2.h, exec_hi op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x78,0x4d,0xd6,0xe9,0x04,0xfe,0x01,0x01,0x77,0x39,0x05]
@@ -3661,20 +3754,20 @@ v_med3_u16_e64_dpp v5.l, v1.l, v2.l, -1 op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0]
 v_med3_u16_e64_dpp v255.h, v255.l, v255.l, src_scc op_sel:[0,0,0,1] dpp8:[0,0,0,0,0,0,0,0] fi:1
 // GFX12: v_med3_u16_e64_dpp v255.h, v255.l, v255.l, src_scc op_sel:[0,0,0,1] dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x40,0x51,0xd6,0xea,0xfe,0xf7,0x03,0xff,0x00,0x00,0x00]
 
-v_min3_num_f16_e64_dpp v5, -v1, v2, |exec_lo| op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_min3_num_f16_e64_dpp v5, -v1, v2, |exec_lo| op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x7c,0x2b,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05]
+v_min3_num_f16_e64_dpp v5.h, -v1.h, v2.h, |exec_lo| op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_min3_num_f16_e64_dpp v5.h, -v1.h, v2.h, |exec_lo| op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x7c,0x2b,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05]
 
-v_min3_num_f16_e64_dpp v5, -|v1|, -|v2|, null op_sel:[1,0,0,0] dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_min3_num_f16_e64_dpp v5, -|v1|, -|v2|, null op_sel:[1,0,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x0b,0x2b,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05]
+v_min3_num_f16_e64_dpp v5.l, -|v1.h|, -|v2.l|, null op_sel:[1,0,0,0] dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_min3_num_f16_e64_dpp v5.l, -|v1.h|, -|v2.l|, null op_sel:[1,0,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x0b,0x2b,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05]
 
-v_min3_num_f16_e64_dpp v5, -|v1|, v2, -|-1| op_sel:[0,1,0,0] dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_min3_num_f16_e64_dpp v5, -|v1|, v2, -|-1| op_sel:[0,1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x15,0x2b,0xd6,0xe9,0x04,0x06,0xa3,0x01,0x77,0x39,0x05]
+v_min3_num_f16_e64_dpp v5.l, -|v1.l|, v2.h, -|-1| op_sel:[0,1,0,0] dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_min3_num_f16_e64_dpp v5.l, -|v1.l|, v2.h, -|-1| op_sel:[0,1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x15,0x2b,0xd6,0xe9,0x04,0x06,0xa3,0x01,0x77,0x39,0x05]
 
-v_min3_num_f16_e64_dpp v5, v1, -|v2|, -|0.5| op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_min3_num_f16_e64_dpp v5, v1, -|v2|, -|0.5| op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x26,0x2b,0xd6,0xe9,0x04,0xc2,0xc3,0x01,0x77,0x39,0x05]
+v_min3_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_min3_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x26,0x2b,0xd6,0xe9,0x04,0xc2,0xc3,0x01,0x77,0x39,0x05]
 
-v_min3_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1
-// GFX12: v_min3_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc7,0x2b,0xd6,0xea,0xfe,0xf7,0xe3,0xff,0x00,0x00,0x00]
+v_min3_num_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1
+// GFX12: v_min3_num_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc7,0x2b,0xd6,0xea,0xfe,0xf7,0xe3,0xff,0x00,0x00,0x00]
 
 v_min3_i16_e64_dpp v5.h, v1.h, v2.h, exec_hi op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0]
 // GFX12: v_min3_i16_e64_dpp v5.h, v1.h, v2.h, exec_hi op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x78,0x4a,0xd6,0xe9,0x04,0xfe,0x01,0x01,0x77,0x39,0x05]
diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1.s
index 015619d..d49a708 100644
--- a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1.s
+++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1.s
@@ -262,50 +262,59 @@ v_clz_i32_u32_e64 v5, src_scc
 v_clz_i32_u32_e64 v255, 0xaf123456
 // GFX12: v_clz_i32_u32_e64 v255, 0xaf123456      ; encoding: [0xff,0x00,0xb9,0xd5,0xff,0x00,0x00,0x00,0x56,0x34,0x12,0xaf]
 
-v_cos_f16_e64 v5, v1
-// GFX12: v_cos_f16_e64 v5, v1                    ; encoding: [0x05,0x00,0xe1,0xd5,0x01,0x01,0x00,0x00]
+v_cos_f16_e64 v5.l, v1.l
+// GFX12: v_cos_f16_e64 v5.l, v1.l                ; encoding: [0x05,0x00,0xe1,0xd5,0x01,0x01,0x00,0x00]
 
-v_cos_f16_e64 v5, v255
-// GFX12: v_cos_f16_e64 v5, v255                  ; encoding: [0x05,0x00,0xe1,0xd5,0xff,0x01,0x00,0x00]
+v_cos_f16_e64 v5.l, v255.l
+// GFX12: v_cos_f16_e64 v5.l, v255.l              ; encoding: [0x05,0x00,0xe1,0xd5,0xff,0x01,0x00,0x00]
 
-v_cos_f16_e64 v5, s1
-// GFX12: v_cos_f16_e64 v5, s1                    ; encoding: [0x05,0x00,0xe1,0xd5,0x01,0x00,0x00,0x00]
+v_cos_f16_e64 v5.l, s1
+// GFX12: v_cos_f16_e64 v5.l, s1                  ; encoding: [0x05,0x00,0xe1,0xd5,0x01,0x00,0x00,0x00]
 
-v_cos_f16_e64 v5, s105
-// GFX12: v_cos_f16_e64 v5, s105                  ; encoding: [0x05,0x00,0xe1,0xd5,0x69,0x00,0x00,0x00]
+v_cos_f16_e64 v5.l, s105
+// GFX12: v_cos_f16_e64 v5.l, s105                ; encoding: [0x05,0x00,0xe1,0xd5,0x69,0x00,0x00,0x00]
 
-v_cos_f16_e64 v5, vcc_lo
-// GFX12: v_cos_f16_e64 v5, vcc_lo                ; encoding: [0x05,0x00,0xe1,0xd5,0x6a,0x00,0x00,0x00]
+v_cos_f16_e64 v5.l, vcc_lo
+// GFX12: v_cos_f16_e64 v5.l, vcc_lo              ; encoding: [0x05,0x00,0xe1,0xd5,0x6a,0x00,0x00,0x00]
 
-v_cos_f16_e64 v5, vcc_hi
-// GFX12: v_cos_f16_e64 v5, vcc_hi                ; encoding: [0x05,0x00,0xe1,0xd5,0x6b,0x00,0x00,0x00]
+v_cos_f16_e64 v5.l, vcc_hi
+// GFX12: v_cos_f16_e64 v5.l, vcc_hi              ; encoding: [0x05,0x00,0xe1,0xd5,0x6b,0x00,0x00,0x00]
 
-v_cos_f16_e64 v5, ttmp15
-// GFX12: v_cos_f16_e64 v5, ttmp15                ; encoding: [0x05,0x00,0xe1,0xd5,0x7b,0x00,0x00,0x00]
+v_cos_f16_e64 v5.l, ttmp15
+// GFX12: v_cos_f16_e64 v5.l, ttmp15              ; encoding: [0x05,0x00,0xe1,0xd5,0x7b,0x00,0x00,0x00]
 
-v_cos_f16_e64 v5, m0
-// GFX12: v_cos_f16_e64 v5, m0                    ; encoding: [0x05,0x00,0xe1,0xd5,0x7d,0x00,0x00,0x00]
+v_cos_f16_e64 v5.l, m0
+// GFX12: v_cos_f16_e64 v5.l, m0                  ; encoding: [0x05,0x00,0xe1,0xd5,0x7d,0x00,0x00,0x00]
 
-v_cos_f16_e64 v5, exec_lo
-// GFX12: v_cos_f16_e64 v5, exec_lo               ; encoding: [0x05,0x00,0xe1,0xd5,0x7e,0x00,0x00,0x00]
+v_cos_f16_e64 v5.l, exec_lo
+// GFX12: v_cos_f16_e64 v5.l, exec_lo             ; encoding: [0x05,0x00,0xe1,0xd5,0x7e,0x00,0x00,0x00]
 
-v_cos_f16_e64 v5, exec_hi
-// GFX12: v_cos_f16_e64 v5, exec_hi               ; encoding: [0x05,0x00,0xe1,0xd5,0x7f,0x00,0x00,0x00]
+v_cos_f16_e64 v5.l, exec_hi
+// GFX12: v_cos_f16_e64 v5.l, exec_hi             ; encoding: [0x05,0x00,0xe1,0xd5,0x7f,0x00,0x00,0x00]
 
-v_cos_f16_e64 v5, null
-// GFX12: v_cos_f16_e64 v5, null                  ; encoding: [0x05,0x00,0xe1,0xd5,0x7c,0x00,0x00,0x00]
+v_cos_f16_e64 v5.l, null
+// GFX12: v_cos_f16_e64 v5.l, null                ; encoding: [0x05,0x00,0xe1,0xd5,0x7c,0x00,0x00,0x00]
 
-v_cos_f16_e64 v5, -1
-// GFX12: v_cos_f16_e64 v5, -1                    ; encoding: [0x05,0x00,0xe1,0xd5,0xc1,0x00,0x00,0x00]
+v_cos_f16_e64 v5.l, -1
+// GFX12: v_cos_f16_e64 v5.l, -1                  ; encoding: [0x05,0x00,0xe1,0xd5,0xc1,0x00,0x00,0x00]
 
-v_cos_f16_e64 v5, 0.5 mul:2
-// GFX12: v_cos_f16_e64 v5, 0.5 mul:2             ; encoding: [0x05,0x00,0xe1,0xd5,0xf0,0x00,0x00,0x08]
+v_cos_f16_e64 v5.l, 0.5 mul:2
+// GFX12: v_cos_f16_e64 v5.l, 0.5 mul:2           ; encoding: [0x05,0x00,0xe1,0xd5,0xf0,0x00,0x00,0x08]
 
-v_cos_f16_e64 v5, src_scc mul:4
-// GFX12: v_cos_f16_e64 v5, src_scc mul:4         ; encoding: [0x05,0x00,0xe1,0xd5,0xfd,0x00,0x00,0x10]
+v_cos_f16_e64 v5.l, src_scc mul:4
+// GFX12: v_cos_f16_e64 v5.l, src_scc mul:4       ; encoding: [0x05,0x00,0xe1,0xd5,0xfd,0x00,0x00,0x10]
 
-v_cos_f16_e64 v255, -|0xfe0b| clamp div:2
-// GFX12: v_cos_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xe1,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00]
+v_cos_f16_e64 v255.l, -|0xfe0b| clamp div:2
+// GFX12: v_cos_f16_e64 v255.l, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xe1,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00]
+
+v_cos_f16_e64 v5.h, v1.h
+// GFX12: v_cos_f16_e64 v5.h, v1.h op_sel:[1,1]   ; encoding: [0x05,0x48,0xe1,0xd5,0x01,0x01,0x00,0x00]
+
+v_cos_f16_e64 v5.l, v255.h
+// GFX12: v_cos_f16_e64 v5.l, v255.h op_sel:[1,0] ; encoding: [0x05,0x08,0xe1,0xd5,0xff,0x01,0x00,0x00]
+
+v_cos_f16_e64 v255.h, -|0xfe0b| clamp div:2
+// GFX12: v_cos_f16_e64 v255.h, -|0xfe0b| op_sel:[0,1] clamp div:2 ; encoding: [0xff,0xc1,0xe1,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00]
 
 v_cos_f32_e64 v5, v1
 // GFX12: v_cos_f32_e64 v5, v1                    ; encoding: [0x05,0x00,0xb6,0xd5,0x01,0x01,0x00,0x00]
@@ -1417,11 +1426,11 @@ v_cvt_i32_f64_e64 v5, -|src_scc|
 v_cvt_i32_f64_e64 v255, 0xaf123456 clamp
 // GFX12: v_cvt_i32_f64_e64 v255, 0xaf123456 clamp ; encoding: [0xff,0x80,0x83,0xd5,0xff,0x00,0x00,0x00,0x56,0x34,0x12,0xaf]
 
-v_cvt_i32_i16_e64 v5, v1
-// GFX12: v_cvt_i32_i16_e64 v5, v1                ; encoding: [0x05,0x00,0xea,0xd5,0x01,0x01,0x00,0x00]
+v_cvt_i32_i16_e64 v5, v1.l
+// GFX12: v_cvt_i32_i16_e64 v5, v1.l              ; encoding: [0x05,0x00,0xea,0xd5,0x01,0x01,0x00,0x00]
 
-v_cvt_i32_i16_e64 v5, v255
-// GFX12: v_cvt_i32_i16_e64 v5, v255              ; encoding: [0x05,0x00,0xea,0xd5,0xff,0x01,0x00,0x00]
+v_cvt_i32_i16_e64 v5, v255.l
+// GFX12: v_cvt_i32_i16_e64 v5, v255.l            ; encoding: [0x05,0x00,0xea,0xd5,0xff,0x01,0x00,0x00]
 
 v_cvt_i32_i16_e64 v5, s1
 // GFX12: v_cvt_i32_i16_e64 v5, s1                ; encoding: [0x05,0x00,0xea,0xd5,0x01,0x00,0x00,0x00]
@@ -1462,6 +1471,9 @@ v_cvt_i32_i16_e64 v5, src_scc
 v_cvt_i32_i16_e64 v255, 0xfe0b
 // GFX12: v_cvt_i32_i16_e64 v255, 0xfe0b          ; encoding: [0xff,0x00,0xea,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00]
 
+v_cvt_i32_i16_e64 v5, v255.h
+// GFX12: v_cvt_i32_i16_e64 v5, v255.h op_sel:[1,0] ; encoding: [0x05,0x08,0xea,0xd5,0xff,0x01,0x00,0x00]
+
 v_cvt_nearest_i32_f32_e64 v5, v1
 // GFX12: v_cvt_nearest_i32_f32_e64 v5, v1        ; encoding: [0x05,0x00,0x8c,0xd5,0x01,0x01,0x00,0x00]
 
@@ -1840,11 +1852,11 @@ v_cvt_u32_f64_e64 v5, -|src_scc|
 v_cvt_u32_f64_e64 v255, 0xaf123456 clamp
 // GFX12: v_cvt_u32_f64_e64 v255, 0xaf123456 clamp ; encoding: [0xff,0x80,0x95,0xd5,0xff,0x00,0x00,0x00,0x56,0x34,0x12,0xaf]
 
-v_cvt_u32_u16_e64 v5, v1
-// GFX12: v_cvt_u32_u16_e64 v5, v1                ; encoding: [0x05,0x00,0xeb,0xd5,0x01,0x01,0x00,0x00]
+v_cvt_u32_u16_e64 v5, v1.l
+// GFX12: v_cvt_u32_u16_e64 v5, v1.l              ; encoding: [0x05,0x00,0xeb,0xd5,0x01,0x01,0x00,0x00]
 
-v_cvt_u32_u16_e64 v5, v255
-// GFX12: v_cvt_u32_u16_e64 v5, v255              ; encoding: [0x05,0x00,0xeb,0xd5,0xff,0x01,0x00,0x00]
+v_cvt_u32_u16_e64 v5, v255.l
+// GFX12: v_cvt_u32_u16_e64 v5, v255.l            ; encoding: [0x05,0x00,0xeb,0xd5,0xff,0x01,0x00,0x00]
 
 v_cvt_u32_u16_e64 v5, s1
 // GFX12: v_cvt_u32_u16_e64 v5, s1                ; encoding: [0x05,0x00,0xeb,0xd5,0x01,0x00,0x00,0x00]
@@ -1885,6 +1897,9 @@ v_cvt_u32_u16_e64 v5, src_scc
 v_cvt_u32_u16_e64 v255, 0xfe0b
 // GFX12: v_cvt_u32_u16_e64 v255, 0xfe0b          ; encoding: [0xff,0x00,0xeb,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00]
 
+v_cvt_u32_u16_e64 v5, v255.h
+// GFX12: v_cvt_u32_u16_e64 v5, v255.h op_sel:[1,0] ; encoding: [0x05,0x08,0xeb,0xd5,0xff,0x01,0x00,0x00]
+
 v_exp_f16_e64 v5, v1
 // GFX12: v_exp_f16_e64 v5, v1                    ; encoding: [0x05,0x00,0xd8,0xd5,0x01,0x01,0x00,0x00]
 
@@ -2236,50 +2251,59 @@ v_floor_f64_e64 v[5:6], -|src_scc| mul:4
 v_floor_f64_e64 v[254:255], 0xaf123456 clamp div:2
 // GFX12: v_floor_f64_e64 v[254:255], 0xaf123456 clamp div:2 ; encoding: [0xfe,0x80,0x9a,0xd5,0xff,0x00,0x00,0x18,0x56,0x34,0x12,0xaf]
 
-v_fract_f16_e64 v5, v1
-// GFX12: v_fract_f16_e64 v5, v1                  ; encoding: [0x05,0x00,0xdf,0xd5,0x01,0x01,0x00,0x00]
+v_fract_f16_e64 v5.l, v1.l
+// GFX12: v_fract_f16_e64 v5.l, v1.l              ; encoding: [0x05,0x00,0xdf,0xd5,0x01,0x01,0x00,0x00]
+
+v_fract_f16_e64 v5.l, v255.l
+// GFX12: v_fract_f16_e64 v5.l, v255.l            ; encoding: [0x05,0x00,0xdf,0xd5,0xff,0x01,0x00,0x00]
 
-v_fract_f16_e64 v5, v255
-// GFX12: v_fract_f16_e64 v5, v255                ; encoding: [0x05,0x00,0xdf,0xd5,0xff,0x01,0x00,0x00]
+v_fract_f16_e64 v5.l, s1
+// GFX12: v_fract_f16_e64 v5.l, s1                ; encoding: [0x05,0x00,0xdf,0xd5,0x01,0x00,0x00,0x00]
 
-v_fract_f16_e64 v5, s1
-// GFX12: v_fract_f16_e64 v5, s1                  ; encoding: [0x05,0x00,0xdf,0xd5,0x01,0x00,0x00,0x00]
+v_fract_f16_e64 v5.l, s105
+// GFX12: v_fract_f16_e64 v5.l, s105              ; encoding: [0x05,0x00,0xdf,0xd5,0x69,0x00,0x00,0x00]
 
-v_fract_f16_e64 v5, s105
-// GFX12: v_fract_f16_e64 v5, s105                ; encoding: [0x05,0x00,0xdf,0xd5,0x69,0x00,0x00,0x00]
+v_fract_f16_e64 v5.l, vcc_lo
+// GFX12: v_fract_f16_e64 v5.l, vcc_lo            ; encoding: [0x05,0x00,0xdf,0xd5,0x6a,0x00,0x00,0x00]
 
-v_fract_f16_e64 v5, vcc_lo
-// GFX12: v_fract_f16_e64 v5, vcc_lo              ; encoding: [0x05,0x00,0xdf,0xd5,0x6a,0x00,0x00,0x00]
+v_fract_f16_e64 v5.l, vcc_hi
+// GFX12: v_fract_f16_e64 v5.l, vcc_hi            ; encoding: [0x05,0x00,0xdf,0xd5,0x6b,0x00,0x00,0x00]
 
-v_fract_f16_e64 v5, vcc_hi
-// GFX12: v_fract_f16_e64 v5, vcc_hi              ; encoding: [0x05,0x00,0xdf,0xd5,0x6b,0x00,0x00,0x00]
+v_fract_f16_e64 v5.l, ttmp15
+// GFX12: v_fract_f16_e64 v5.l, ttmp15            ; encoding: [0x05,0x00,0xdf,0xd5,0x7b,0x00,0x00,0x00]
 
-v_fract_f16_e64 v5, ttmp15
-// GFX12: v_fract_f16_e64 v5, ttmp15              ; encoding: [0x05,0x00,0xdf,0xd5,0x7b,0x00,0x00,0x00]
+v_fract_f16_e64 v5.l, m0
+// GFX12: v_fract_f16_e64 v5.l, m0                ; encoding: [0x05,0x00,0xdf,0xd5,0x7d,0x00,0x00,0x00]
 
-v_fract_f16_e64 v5, m0
-// GFX12: v_fract_f16_e64 v5, m0                  ; encoding: [0x05,0x00,0xdf,0xd5,0x7d,0x00,0x00,0x00]
+v_fract_f16_e64 v5.l, exec_lo
+// GFX12: v_fract_f16_e64 v5.l, exec_lo           ; encoding: [0x05,0x00,0xdf,0xd5,0x7e,0x00,0x00,0x00]
 
-v_fract_f16_e64 v5, exec_lo
-// GFX12: v_fract_f16_e64 v5, exec_lo             ; encoding: [0x05,0x00,0xdf,0xd5,0x7e,0x00,0x00,0x00]
+v_fract_f16_e64 v5.l, exec_hi
+// GFX12: v_fract_f16_e64 v5.l, exec_hi           ; encoding: [0x05,0x00,0xdf,0xd5,0x7f,0x00,0x00,0x00]
 
-v_fract_f16_e64 v5, exec_hi
-// GFX12: v_fract_f16_e64 v5, exec_hi             ; encoding: [0x05,0x00,0xdf,0xd5,0x7f,0x00,0x00,0x00]
+v_fract_f16_e64 v5.l, null
+// GFX12: v_fract_f16_e64 v5.l, null              ; encoding: [0x05,0x00,0xdf,0xd5,0x7c,0x00,0x00,0x00]
 
-v_fract_f16_e64 v5, null
-// GFX12: v_fract_f16_e64 v5, null                ; encoding: [0x05,0x00,0xdf,0xd5,0x7c,0x00,0x00,0x00]
+v_fract_f16_e64 v5.l, -1
+// GFX12: v_fract_f16_e64 v5.l, -1                ; encoding: [0x05,0x00,0xdf,0xd5,0xc1,0x00,0x00,0x00]
 
-v_fract_f16_e64 v5, -1
-// GFX12: v_fract_f16_e64 v5, -1                  ; encoding: [0x05,0x00,0xdf,0xd5,0xc1,0x00,0x00,0x00]
+v_fract_f16_e64 v5.l, 0.5 mul:2
+// GFX12: v_fract_f16_e64 v5.l, 0.5 mul:2         ; encoding: [0x05,0x00,0xdf,0xd5,0xf0,0x00,0x00,0x08]
 
-v_fract_f16_e64 v5, 0.5 mul:2
-// GFX12: v_fract_f16_e64 v5, 0.5 mul:2           ; encoding: [0x05,0x00,0xdf,0xd5,0xf0,0x00,0x00,0x08]
+v_fract_f16_e64 v5.l, src_scc mul:4
+// GFX12: v_fract_f16_e64 v5.l, src_scc mul:4     ; encoding: [0x05,0x00,0xdf,0xd5,0xfd,0x00,0x00,0x10]
 
-v_fract_f16_e64 v5, src_scc mul:4
-// GFX12: v_fract_f16_e64 v5, src_scc mul:4       ; encoding: [0x05,0x00,0xdf,0xd5,0xfd,0x00,0x00,0x10]
+v_fract_f16_e64 v255.l, -|0xfe0b| clamp div:2
+// GFX12: v_fract_f16_e64 v255.l, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xdf,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00]
 
-v_fract_f16_e64 v255, -|0xfe0b| clamp div:2
-// GFX12: v_fract_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xdf,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00]
+v_fract_f16_e64 v5.h, v1.h
+// GFX12: v_fract_f16_e64 v5.h, v1.h op_sel:[1,1] ; encoding: [0x05,0x48,0xdf,0xd5,0x01,0x01,0x00,0x00]
+
+v_fract_f16_e64 v5.l, v255.h
+// GFX12: v_fract_f16_e64 v5.l, v255.h op_sel:[1,0] ; encoding: [0x05,0x08,0xdf,0xd5,0xff,0x01,0x00,0x00]
+
+v_fract_f16_e64 v255.h, -|0xfe0b| clamp div:2
+// GFX12: v_fract_f16_e64 v255.h, -|0xfe0b| op_sel:[0,1] clamp div:2 ; encoding: [0xff,0xc1,0xdf,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00]
 
 v_fract_f32_e64 v5, v1
 // GFX12: v_fract_f32_e64 v5, v1                  ; encoding: [0x05,0x00,0xa0,0xd5,0x01,0x01,0x00,0x00]
@@ -2497,50 +2521,59 @@ v_frexp_exp_i32_f64_e64 v5, -|src_scc|
 v_frexp_exp_i32_f64_e64 v255, 0xaf123456
 // GFX12: v_frexp_exp_i32_f64_e64 v255, 0xaf123456 ; encoding: [0xff,0x00,0xbc,0xd5,0xff,0x00,0x00,0x00,0x56,0x34,0x12,0xaf]
 
-v_frexp_mant_f16_e64 v5, v1
-// GFX12: v_frexp_mant_f16_e64 v5, v1             ; encoding: [0x05,0x00,0xd9,0xd5,0x01,0x01,0x00,0x00]
+v_frexp_mant_f16_e64 v5.l, v1.l
+// GFX12: v_frexp_mant_f16_e64 v5.l, v1.l         ; encoding: [0x05,0x00,0xd9,0xd5,0x01,0x01,0x00,0x00]
+
+v_frexp_mant_f16_e64 v5.l, v255.l
+// GFX12: v_frexp_mant_f16_e64 v5.l, v255.l       ; encoding: [0x05,0x00,0xd9,0xd5,0xff,0x01,0x00,0x00]
 
-v_frexp_mant_f16_e64 v5, v255
-// GFX12: v_frexp_mant_f16_e64 v5, v255           ; encoding: [0x05,0x00,0xd9,0xd5,0xff,0x01,0x00,0x00]
+v_frexp_mant_f16_e64 v5.l, s1
+// GFX12: v_frexp_mant_f16_e64 v5.l, s1           ; encoding: [0x05,0x00,0xd9,0xd5,0x01,0x00,0x00,0x00]
 
-v_frexp_mant_f16_e64 v5, s1
-// GFX12: v_frexp_mant_f16_e64 v5, s1             ; encoding: [0x05,0x00,0xd9,0xd5,0x01,0x00,0x00,0x00]
+v_frexp_mant_f16_e64 v5.l, s105
+// GFX12: v_frexp_mant_f16_e64 v5.l, s105         ; encoding: [0x05,0x00,0xd9,0xd5,0x69,0x00,0x00,0x00]
 
-v_frexp_mant_f16_e64 v5, s105
-// GFX12: v_frexp_mant_f16_e64 v5, s105           ; encoding: [0x05,0x00,0xd9,0xd5,0x69,0x00,0x00,0x00]
+v_frexp_mant_f16_e64 v5.l, vcc_lo
+// GFX12: v_frexp_mant_f16_e64 v5.l, vcc_lo       ; encoding: [0x05,0x00,0xd9,0xd5,0x6a,0x00,0x00,0x00]
 
-v_frexp_mant_f16_e64 v5, vcc_lo
-// GFX12: v_frexp_mant_f16_e64 v5, vcc_lo         ; encoding: [0x05,0x00,0xd9,0xd5,0x6a,0x00,0x00,0x00]
+v_frexp_mant_f16_e64 v5.l, vcc_hi
+// GFX12: v_frexp_mant_f16_e64 v5.l, vcc_hi       ; encoding: [0x05,0x00,0xd9,0xd5,0x6b,0x00,0x00,0x00]
 
-v_frexp_mant_f16_e64 v5, vcc_hi
-// GFX12: v_frexp_mant_f16_e64 v5, vcc_hi         ; encoding: [0x05,0x00,0xd9,0xd5,0x6b,0x00,0x00,0x00]
+v_frexp_mant_f16_e64 v5.l, ttmp15
+// GFX12: v_frexp_mant_f16_e64 v5.l, ttmp15       ; encoding: [0x05,0x00,0xd9,0xd5,0x7b,0x00,0x00,0x00]
 
-v_frexp_mant_f16_e64 v5, ttmp15
-// GFX12: v_frexp_mant_f16_e64 v5, ttmp15         ; encoding: [0x05,0x00,0xd9,0xd5,0x7b,0x00,0x00,0x00]
+v_frexp_mant_f16_e64 v5.l, m0
+// GFX12: v_frexp_mant_f16_e64 v5.l, m0           ; encoding: [0x05,0x00,0xd9,0xd5,0x7d,0x00,0x00,0x00]
 
-v_frexp_mant_f16_e64 v5, m0
-// GFX12: v_frexp_mant_f16_e64 v5, m0             ; encoding: [0x05,0x00,0xd9,0xd5,0x7d,0x00,0x00,0x00]
+v_frexp_mant_f16_e64 v5.l, exec_lo
+// GFX12: v_frexp_mant_f16_e64 v5.l, exec_lo      ; encoding: [0x05,0x00,0xd9,0xd5,0x7e,0x00,0x00,0x00]
 
-v_frexp_mant_f16_e64 v5, exec_lo
-// GFX12: v_frexp_mant_f16_e64 v5, exec_lo        ; encoding: [0x05,0x00,0xd9,0xd5,0x7e,0x00,0x00,0x00]
+v_frexp_mant_f16_e64 v5.l, exec_hi
+// GFX12: v_frexp_mant_f16_e64 v5.l, exec_hi      ; encoding: [0x05,0x00,0xd9,0xd5,0x7f,0x00,0x00,0x00]
 
-v_frexp_mant_f16_e64 v5, exec_hi
-// GFX12: v_frexp_mant_f16_e64 v5, exec_hi        ; encoding: [0x05,0x00,0xd9,0xd5,0x7f,0x00,0x00,0x00]
+v_frexp_mant_f16_e64 v5.l, null
+// GFX12: v_frexp_mant_f16_e64 v5.l, null         ; encoding: [0x05,0x00,0xd9,0xd5,0x7c,0x00,0x00,0x00]
 
-v_frexp_mant_f16_e64 v5, null
-// GFX12: v_frexp_mant_f16_e64 v5, null           ; encoding: [0x05,0x00,0xd9,0xd5,0x7c,0x00,0x00,0x00]
+v_frexp_mant_f16_e64 v5.l, -1
+// GFX12: v_frexp_mant_f16_e64 v5.l, -1           ; encoding: [0x05,0x00,0xd9,0xd5,0xc1,0x00,0x00,0x00]
 
-v_frexp_mant_f16_e64 v5, -1
-// GFX12: v_frexp_mant_f16_e64 v5, -1             ; encoding: [0x05,0x00,0xd9,0xd5,0xc1,0x00,0x00,0x00]
+v_frexp_mant_f16_e64 v5.l, 0.5 mul:2
+// GFX12: v_frexp_mant_f16_e64 v5.l, 0.5 mul:2    ; encoding: [0x05,0x00,0xd9,0xd5,0xf0,0x00,0x00,0x08]
 
-v_frexp_mant_f16_e64 v5, 0.5 mul:2
-// GFX12: v_frexp_mant_f16_e64 v5, 0.5 mul:2      ; encoding: [0x05,0x00,0xd9,0xd5,0xf0,0x00,0x00,0x08]
+v_frexp_mant_f16_e64 v5.l, src_scc mul:4
+// GFX12: v_frexp_mant_f16_e64 v5.l, src_scc mul:4 ; encoding: [0x05,0x00,0xd9,0xd5,0xfd,0x00,0x00,0x10]
 
-v_frexp_mant_f16_e64 v5, src_scc mul:4
-// GFX12: v_frexp_mant_f16_e64 v5, src_scc mul:4  ; encoding: [0x05,0x00,0xd9,0xd5,0xfd,0x00,0x00,0x10]
+v_frexp_mant_f16_e64 v255.l, -|0xfe0b| clamp div:2
+// GFX12: v_frexp_mant_f16_e64 v255.l, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xd9,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00]
 
-v_frexp_mant_f16_e64 v255, -|0xfe0b| clamp div:2
-// GFX12: v_frexp_mant_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xd9,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00]
+v_frexp_mant_f16_e64 v5.h, v1.h
+// GFX12: v_frexp_mant_f16_e64 v5.h, v1.h op_sel:[1,1] ; encoding: [0x05,0x48,0xd9,0xd5,0x01,0x01,0x00,0x00]
+
+v_frexp_mant_f16_e64 v5.l, v255.h
+// GFX12: v_frexp_mant_f16_e64 v5.l, v255.h op_sel:[1,0] ; encoding: [0x05,0x08,0xd9,0xd5,0xff,0x01,0x00,0x00]
+
+v_frexp_mant_f16_e64 v255.h, -|0xfe0b| clamp div:2
+// GFX12: v_frexp_mant_f16_e64 v255.h, -|0xfe0b| op_sel:[0,1] clamp div:2 ; encoding: [0xff,0xc1,0xd9,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00]
 
 v_frexp_mant_f32_e64 v5, v1
 // GFX12: v_frexp_mant_f32_e64 v5, v1             ; encoding: [0x05,0x00,0xc0,0xd5,0x01,0x01,0x00,0x00]
@@ -2824,50 +2857,59 @@ v_movrelsd_b32_e64 v255, v255
 v_nop_e64
 // GFX12: v_nop                                   ; encoding: [0x00,0x00,0x80,0xd5,0x00,0x00,0x00,0x00]
 
-v_not_b16_e64 v5, v1
-// GFX12: v_not_b16_e64 v5, v1                    ; encoding: [0x05,0x00,0xe9,0xd5,0x01,0x01,0x00,0x00]
+v_not_b16_e64 v5.l, v1.l
+// GFX12: v_not_b16_e64 v5.l, v1.l                ; encoding: [0x05,0x00,0xe9,0xd5,0x01,0x01,0x00,0x00]
+
+v_not_b16_e64 v5.l, v255.l
+// GFX12: v_not_b16_e64 v5.l, v255.l              ; encoding: [0x05,0x00,0xe9,0xd5,0xff,0x01,0x00,0x00]
 
-v_not_b16_e64 v5, v255
-// GFX12: v_not_b16_e64 v5, v255                  ; encoding: [0x05,0x00,0xe9,0xd5,0xff,0x01,0x00,0x00]
+v_not_b16_e64 v5.l, s1
+// GFX12: v_not_b16_e64 v5.l, s1                  ; encoding: [0x05,0x00,0xe9,0xd5,0x01,0x00,0x00,0x00]
 
-v_not_b16_e64 v5, s1
-// GFX12: v_not_b16_e64 v5, s1                    ; encoding: [0x05,0x00,0xe9,0xd5,0x01,0x00,0x00,0x00]
+v_not_b16_e64 v5.l, s105
+// GFX12: v_not_b16_e64 v5.l, s105                ; encoding: [0x05,0x00,0xe9,0xd5,0x69,0x00,0x00,0x00]
 
-v_not_b16_e64 v5, s105
-// GFX12: v_not_b16_e64 v5, s105                  ; encoding: [0x05,0x00,0xe9,0xd5,0x69,0x00,0x00,0x00]
+v_not_b16_e64 v5.l, vcc_lo
+// GFX12: v_not_b16_e64 v5.l, vcc_lo              ; encoding: [0x05,0x00,0xe9,0xd5,0x6a,0x00,0x00,0x00]
 
-v_not_b16_e64 v5, vcc_lo
-// GFX12: v_not_b16_e64 v5, vcc_lo                ; encoding: [0x05,0x00,0xe9,0xd5,0x6a,0x00,0x00,0x00]
+v_not_b16_e64 v5.l, vcc_hi
+// GFX12: v_not_b16_e64 v5.l, vcc_hi              ; encoding: [0x05,0x00,0xe9,0xd5,0x6b,0x00,0x00,0x00]
 
-v_not_b16_e64 v5, vcc_hi
-// GFX12: v_not_b16_e64 v5, vcc_hi                ; encoding: [0x05,0x00,0xe9,0xd5,0x6b,0x00,0x00,0x00]
+v_not_b16_e64 v5.l, ttmp15
+// GFX12: v_not_b16_e64 v5.l, ttmp15              ; encoding: [0x05,0x00,0xe9,0xd5,0x7b,0x00,0x00,0x00]
 
-v_not_b16_e64 v5, ttmp15
-// GFX12: v_not_b16_e64 v5, ttmp15                ; encoding: [0x05,0x00,0xe9,0xd5,0x7b,0x00,0x00,0x00]
+v_not_b16_e64 v5.l, m0
+// GFX12: v_not_b16_e64 v5.l, m0                  ; encoding: [0x05,0x00,0xe9,0xd5,0x7d,0x00,0x00,0x00]
 
-v_not_b16_e64 v5, m0
-// GFX12: v_not_b16_e64 v5, m0                    ; encoding: [0x05,0x00,0xe9,0xd5,0x7d,0x00,0x00,0x00]
+v_not_b16_e64 v5.l, exec_lo
+// GFX12: v_not_b16_e64 v5.l, exec_lo             ; encoding: [0x05,0x00,0xe9,0xd5,0x7e,0x00,0x00,0x00]
 
-v_not_b16_e64 v5, exec_lo
-// GFX12: v_not_b16_e64 v5, exec_lo               ; encoding: [0x05,0x00,0xe9,0xd5,0x7e,0x00,0x00,0x00]
+v_not_b16_e64 v5.l, exec_hi
+// GFX12: v_not_b16_e64 v5.l, exec_hi             ; encoding: [0x05,0x00,0xe9,0xd5,0x7f,0x00,0x00,0x00]
 
-v_not_b16_e64 v5, exec_hi
-// GFX12: v_not_b16_e64 v5, exec_hi               ; encoding: [0x05,0x00,0xe9,0xd5,0x7f,0x00,0x00,0x00]
+v_not_b16_e64 v5.l, null
+// GFX12: v_not_b16_e64 v5.l, null                ; encoding: [0x05,0x00,0xe9,0xd5,0x7c,0x00,0x00,0x00]
 
-v_not_b16_e64 v5, null
-// GFX12: v_not_b16_e64 v5, null                  ; encoding: [0x05,0x00,0xe9,0xd5,0x7c,0x00,0x00,0x00]
+v_not_b16_e64 v5.l, -1
+// GFX12: v_not_b16_e64 v5.l, -1                  ; encoding: [0x05,0x00,0xe9,0xd5,0xc1,0x00,0x00,0x00]
 
-v_not_b16_e64 v5, -1
-// GFX12: v_not_b16_e64 v5, -1                    ; encoding: [0x05,0x00,0xe9,0xd5,0xc1,0x00,0x00,0x00]
+v_not_b16_e64 v5.l, 0.5
+// GFX12: v_not_b16_e64 v5.l, 0.5                 ; encoding: [0x05,0x00,0xe9,0xd5,0xf0,0x00,0x00,0x00]
 
-v_not_b16_e64 v5, 0.5
-// GFX12: v_not_b16_e64 v5, 0.5                   ; encoding: [0x05,0x00,0xe9,0xd5,0xf0,0x00,0x00,0x00]
+v_not_b16_e64 v5.l, src_scc
+// GFX12: v_not_b16_e64 v5.l, src_scc             ; encoding: [0x05,0x00,0xe9,0xd5,0xfd,0x00,0x00,0x00]
 
-v_not_b16_e64 v5, src_scc
-// GFX12: v_not_b16_e64 v5, src_scc               ; encoding: [0x05,0x00,0xe9,0xd5,0xfd,0x00,0x00,0x00]
+v_not_b16_e64 v255.l, 0xfe0b
+// GFX12: v_not_b16_e64 v255.l, 0xfe0b            ; encoding: [0xff,0x00,0xe9,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00]
 
-v_not_b16_e64 v255, 0xfe0b
-// GFX12: v_not_b16_e64 v255, 0xfe0b              ; encoding: [0xff,0x00,0xe9,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00]
+v_not_b16_e64 v5.h, v1.h
+// GFX12: v_not_b16_e64 v5.h, v1.h op_sel:[1,1]   ; encoding: [0x05,0x48,0xe9,0xd5,0x01,0x01,0x00,0x00]
+
+v_not_b16_e64 v5.l, v255.h
+// GFX12: v_not_b16_e64 v5.l, v255.h op_sel:[1,0] ; encoding: [0x05,0x08,0xe9,0xd5,0xff,0x01,0x00,0x00]
+
+v_not_b16_e64 v255.h, 0xfe0b
+// GFX12: v_not_b16_e64 v255.h, 0xfe0b op_sel:[0,1] ; encoding: [0xff,0x40,0xe9,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00]
 
 v_not_b32_e64 v5, v1
 // GFX12: v_not_b32_e64 v5, v1                    ; encoding: [0x05,0x00,0xb7,0xd5,0x01,0x01,0x00,0x00]
@@ -3088,50 +3130,59 @@ v_rcp_iflag_f32_e64 v5, src_scc mul:4
 v_rcp_iflag_f32_e64 v255, -|0xaf123456| clamp div:2
 // GFX12: v_rcp_iflag_f32_e64 v255, -|0xaf123456| clamp div:2 ; encoding: [0xff,0x81,0xab,0xd5,0xff,0x00,0x00,0x38,0x56,0x34,0x12,0xaf]
 
-v_rndne_f16_e64 v5, v1
-// GFX12: v_rndne_f16_e64 v5, v1                  ; encoding: [0x05,0x00,0xde,0xd5,0x01,0x01,0x00,0x00]
+v_rndne_f16_e64 v5.l, v1.l
+// GFX12: v_rndne_f16_e64 v5.l, v1.l              ; encoding: [0x05,0x00,0xde,0xd5,0x01,0x01,0x00,0x00]
+
+v_rndne_f16_e64 v5.l, v255.l
+// GFX12: v_rndne_f16_e64 v5.l, v255.l            ; encoding: [0x05,0x00,0xde,0xd5,0xff,0x01,0x00,0x00]
 
-v_rndne_f16_e64 v5, v255
-// GFX12: v_rndne_f16_e64 v5, v255                ; encoding: [0x05,0x00,0xde,0xd5,0xff,0x01,0x00,0x00]
+v_rndne_f16_e64 v5.l, s1
+// GFX12: v_rndne_f16_e64 v5.l, s1                ; encoding: [0x05,0x00,0xde,0xd5,0x01,0x00,0x00,0x00]
 
-v_rndne_f16_e64 v5, s1
-// GFX12: v_rndne_f16_e64 v5, s1                  ; encoding: [0x05,0x00,0xde,0xd5,0x01,0x00,0x00,0x00]
+v_rndne_f16_e64 v5.l, s105
+// GFX12: v_rndne_f16_e64 v5.l, s105              ; encoding: [0x05,0x00,0xde,0xd5,0x69,0x00,0x00,0x00]
 
-v_rndne_f16_e64 v5, s105
-// GFX12: v_rndne_f16_e64 v5, s105                ; encoding: [0x05,0x00,0xde,0xd5,0x69,0x00,0x00,0x00]
+v_rndne_f16_e64 v5.l, vcc_lo
+// GFX12: v_rndne_f16_e64 v5.l, vcc_lo            ; encoding: [0x05,0x00,0xde,0xd5,0x6a,0x00,0x00,0x00]
 
-v_rndne_f16_e64 v5, vcc_lo
-// GFX12: v_rndne_f16_e64 v5, vcc_lo              ; encoding: [0x05,0x00,0xde,0xd5,0x6a,0x00,0x00,0x00]
+v_rndne_f16_e64 v5.l, vcc_hi
+// GFX12: v_rndne_f16_e64 v5.l, vcc_hi            ; encoding: [0x05,0x00,0xde,0xd5,0x6b,0x00,0x00,0x00]
 
-v_rndne_f16_e64 v5, vcc_hi
-// GFX12: v_rndne_f16_e64 v5, vcc_hi              ; encoding: [0x05,0x00,0xde,0xd5,0x6b,0x00,0x00,0x00]
+v_rndne_f16_e64 v5.l, ttmp15
+// GFX12: v_rndne_f16_e64 v5.l, ttmp15            ; encoding: [0x05,0x00,0xde,0xd5,0x7b,0x00,0x00,0x00]
 
-v_rndne_f16_e64 v5, ttmp15
-// GFX12: v_rndne_f16_e64 v5, ttmp15              ; encoding: [0x05,0x00,0xde,0xd5,0x7b,0x00,0x00,0x00]
+v_rndne_f16_e64 v5.l, m0
+// GFX12: v_rndne_f16_e64 v5.l, m0                ; encoding: [0x05,0x00,0xde,0xd5,0x7d,0x00,0x00,0x00]
 
-v_rndne_f16_e64 v5, m0
-// GFX12: v_rndne_f16_e64 v5, m0                  ; encoding: [0x05,0x00,0xde,0xd5,0x7d,0x00,0x00,0x00]
+v_rndne_f16_e64 v5.l, exec_lo
+// GFX12: v_rndne_f16_e64 v5.l, exec_lo           ; encoding: [0x05,0x00,0xde,0xd5,0x7e,0x00,0x00,0x00]
 
-v_rndne_f16_e64 v5, exec_lo
-// GFX12: v_rndne_f16_e64 v5, exec_lo             ; encoding: [0x05,0x00,0xde,0xd5,0x7e,0x00,0x00,0x00]
+v_rndne_f16_e64 v5.l, exec_hi
+// GFX12: v_rndne_f16_e64 v5.l, exec_hi           ; encoding: [0x05,0x00,0xde,0xd5,0x7f,0x00,0x00,0x00]
 
-v_rndne_f16_e64 v5, exec_hi
-// GFX12: v_rndne_f16_e64 v5, exec_hi             ; encoding: [0x05,0x00,0xde,0xd5,0x7f,0x00,0x00,0x00]
+v_rndne_f16_e64 v5.l, null
+// GFX12: v_rndne_f16_e64 v5.l, null              ; encoding: [0x05,0x00,0xde,0xd5,0x7c,0x00,0x00,0x00]
 
-v_rndne_f16_e64 v5, null
-// GFX12: v_rndne_f16_e64 v5, null                ; encoding: [0x05,0x00,0xde,0xd5,0x7c,0x00,0x00,0x00]
+v_rndne_f16_e64 v5.l, -1
+// GFX12: v_rndne_f16_e64 v5.l, -1                ; encoding: [0x05,0x00,0xde,0xd5,0xc1,0x00,0x00,0x00]
 
-v_rndne_f16_e64 v5, -1
-// GFX12: v_rndne_f16_e64 v5, -1                  ; encoding: [0x05,0x00,0xde,0xd5,0xc1,0x00,0x00,0x00]
+v_rndne_f16_e64 v5.l, 0.5 mul:2
+// GFX12: v_rndne_f16_e64 v5.l, 0.5 mul:2         ; encoding: [0x05,0x00,0xde,0xd5,0xf0,0x00,0x00,0x08]
 
-v_rndne_f16_e64 v5, 0.5 mul:2
-// GFX12: v_rndne_f16_e64 v5, 0.5 mul:2           ; encoding: [0x05,0x00,0xde,0xd5,0xf0,0x00,0x00,0x08]
+v_rndne_f16_e64 v5.l, src_scc mul:4
+// GFX12: v_rndne_f16_e64 v5.l, src_scc mul:4     ; encoding: [0x05,0x00,0xde,0xd5,0xfd,0x00,0x00,0x10]
 
-v_rndne_f16_e64 v5, src_scc mul:4
-// GFX12: v_rndne_f16_e64 v5, src_scc mul:4       ; encoding: [0x05,0x00,0xde,0xd5,0xfd,0x00,0x00,0x10]
+v_rndne_f16_e64 v255.l, -|0xfe0b| clamp div:2
+// GFX12: v_rndne_f16_e64 v255.l, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xde,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00]
 
-v_rndne_f16_e64 v255, -|0xfe0b| clamp div:2
-// GFX12: v_rndne_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xde,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00]
+v_rndne_f16_e64 v5.h, v1.h
+// GFX12: v_rndne_f16_e64 v5.h, v1.h op_sel:[1,1] ; encoding: [0x05,0x48,0xde,0xd5,0x01,0x01,0x00,0x00]
+
+v_rndne_f16_e64 v5.l, v255.h
+// GFX12: v_rndne_f16_e64 v5.l, v255.h op_sel:[1,0] ; encoding: [0x05,0x08,0xde,0xd5,0xff,0x01,0x00,0x00]
+
+v_rndne_f16_e64 v255.h, -|0xfe0b| clamp div:2
+// GFX12: v_rndne_f16_e64 v255.h, -|0xfe0b| op_sel:[0,1] clamp div:2 ; encoding: [0xff,0xc1,0xde,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00]
 
 v_rndne_f32_e64 v5, v1
 // GFX12: v_rndne_f32_e64 v5, v1                  ; encoding: [0x05,0x00,0xa3,0xd5,0x01,0x01,0x00,0x00]
@@ -3385,50 +3436,62 @@ v_sat_pk_u8_i16_e64 v5, src_scc
 v_sat_pk_u8_i16_e64 v255, 0xfe0b
 // GFX12: v_sat_pk_u8_i16_e64 v255, 0xfe0b        ; encoding: [0xff,0x00,0xe2,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00]
 
-v_sin_f16_e64 v5, v1
-// GFX12: v_sin_f16_e64 v5, v1                    ; encoding: [0x05,0x00,0xe0,0xd5,0x01,0x01,0x00,0x00]
+v_sat_pk_u8_i16_e64 v255.h, 0xfe0b
+// GFX12: v_sat_pk_u8_i16_e64 v255.h, 0xfe0b op_sel:[0,1] ; encoding: [0xff,0x40,0xe2,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00]
+
+v_sin_f16_e64 v5.l, v1.l
+// GFX12: v_sin_f16_e64 v5.l, v1.l                ; encoding: [0x05,0x00,0xe0,0xd5,0x01,0x01,0x00,0x00]
 
-v_sin_f16_e64 v5, v255
-// GFX12: v_sin_f16_e64 v5, v255                  ; encoding: [0x05,0x00,0xe0,0xd5,0xff,0x01,0x00,0x00]
+v_sin_f16_e64 v5.l, v255.l
+// GFX12: v_sin_f16_e64 v5.l, v255.l              ; encoding: [0x05,0x00,0xe0,0xd5,0xff,0x01,0x00,0x00]
 
-v_sin_f16_e64 v5, s1
-// GFX12: v_sin_f16_e64 v5, s1                    ; encoding: [0x05,0x00,0xe0,0xd5,0x01,0x00,0x00,0x00]
+v_sin_f16_e64 v5.l, s1
+// GFX12: v_sin_f16_e64 v5.l, s1                  ; encoding: [0x05,0x00,0xe0,0xd5,0x01,0x00,0x00,0x00]
 
-v_sin_f16_e64 v5, s105
-// GFX12: v_sin_f16_e64 v5, s105                  ; encoding: [0x05,0x00,0xe0,0xd5,0x69,0x00,0x00,0x00]
+v_sin_f16_e64 v5.l, s105
+// GFX12: v_sin_f16_e64 v5.l, s105                ; encoding: [0x05,0x00,0xe0,0xd5,0x69,0x00,0x00,0x00]
 
-v_sin_f16_e64 v5, vcc_lo
-// GFX12: v_sin_f16_e64 v5, vcc_lo                ; encoding: [0x05,0x00,0xe0,0xd5,0x6a,0x00,0x00,0x00]
+v_sin_f16_e64 v5.l, vcc_lo
+// GFX12: v_sin_f16_e64 v5.l, vcc_lo              ; encoding: [0x05,0x00,0xe0,0xd5,0x6a,0x00,0x00,0x00]
 
-v_sin_f16_e64 v5, vcc_hi
-// GFX12: v_sin_f16_e64 v5, vcc_hi                ; encoding: [0x05,0x00,0xe0,0xd5,0x6b,0x00,0x00,0x00]
+v_sin_f16_e64 v5.l, vcc_hi
+// GFX12: v_sin_f16_e64 v5.l, vcc_hi              ; encoding: [0x05,0x00,0xe0,0xd5,0x6b,0x00,0x00,0x00]
 
-v_sin_f16_e64 v5, ttmp15
-// GFX12: v_sin_f16_e64 v5, ttmp15                ; encoding: [0x05,0x00,0xe0,0xd5,0x7b,0x00,0x00,0x00]
+v_sin_f16_e64 v5.l, ttmp15
+// GFX12: v_sin_f16_e64 v5.l, ttmp15              ; encoding: [0x05,0x00,0xe0,0xd5,0x7b,0x00,0x00,0x00]
 
-v_sin_f16_e64 v5, m0
-// GFX12: v_sin_f16_e64 v5, m0                    ; encoding: [0x05,0x00,0xe0,0xd5,0x7d,0x00,0x00,0x00]
+v_sin_f16_e64 v5.l, m0
+// GFX12: v_sin_f16_e64 v5.l, m0                  ; encoding: [0x05,0x00,0xe0,0xd5,0x7d,0x00,0x00,0x00]
 
-v_sin_f16_e64 v5, exec_lo
-// GFX12: v_sin_f16_e64 v5, exec_lo               ; encoding: [0x05,0x00,0xe0,0xd5,0x7e,0x00,0x00,0x00]
+v_sin_f16_e64 v5.l, exec_lo
+// GFX12: v_sin_f16_e64 v5.l, exec_lo             ; encoding: [0x05,0x00,0xe0,0xd5,0x7e,0x00,0x00,0x00]
 
-v_sin_f16_e64 v5, exec_hi
-// GFX12: v_sin_f16_e64 v5, exec_hi               ; encoding: [0x05,0x00,0xe0,0xd5,0x7f,0x00,0x00,0x00]
+v_sin_f16_e64 v5.l, exec_hi
+// GFX12: v_sin_f16_e64 v5.l, exec_hi             ; encoding: [0x05,0x00,0xe0,0xd5,0x7f,0x00,0x00,0x00]
 
-v_sin_f16_e64 v5, null
-// GFX12: v_sin_f16_e64 v5, null                  ; encoding: [0x05,0x00,0xe0,0xd5,0x7c,0x00,0x00,0x00]
+v_sin_f16_e64 v5.l, null
+// GFX12: v_sin_f16_e64 v5.l, null                ; encoding: [0x05,0x00,0xe0,0xd5,0x7c,0x00,0x00,0x00]
 
-v_sin_f16_e64 v5, -1
-// GFX12: v_sin_f16_e64 v5, -1                    ; encoding: [0x05,0x00,0xe0,0xd5,0xc1,0x00,0x00,0x00]
+v_sin_f16_e64 v5.l, -1
+// GFX12: v_sin_f16_e64 v5.l, -1                  ; encoding: [0x05,0x00,0xe0,0xd5,0xc1,0x00,0x00,0x00]
 
-v_sin_f16_e64 v5, 0.5 mul:2
-// GFX12: v_sin_f16_e64 v5, 0.5 mul:2             ; encoding: [0x05,0x00,0xe0,0xd5,0xf0,0x00,0x00,0x08]
+v_sin_f16_e64 v5.l, 0.5 mul:2
+// GFX12: v_sin_f16_e64 v5.l, 0.5 mul:2           ; encoding: [0x05,0x00,0xe0,0xd5,0xf0,0x00,0x00,0x08]
 
-v_sin_f16_e64 v5, src_scc mul:4
-// GFX12: v_sin_f16_e64 v5, src_scc mul:4         ; encoding: [0x05,0x00,0xe0,0xd5,0xfd,0x00,0x00,0x10]
+v_sin_f16_e64 v5.l, src_scc mul:4
+// GFX12: v_sin_f16_e64 v5.l, src_scc mul:4       ; encoding: [0x05,0x00,0xe0,0xd5,0xfd,0x00,0x00,0x10]
 
-v_sin_f16_e64 v255, -|0xfe0b| clamp div:2
-// GFX12: v_sin_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xe0,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00]
+v_sin_f16_e64 v255.l, -|0xfe0b| clamp div:2
+// GFX12: v_sin_f16_e64 v255.l, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xe0,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00]
+
+v_sin_f16_e64 v5.h, v1.h
+// GFX12: v_sin_f16_e64 v5.h, v1.h op_sel:[1,1]   ; encoding: [0x05,0x48,0xe0,0xd5,0x01,0x01,0x00,0x00]
+
+v_sin_f16_e64 v5.l, v255.h
+// GFX12: v_sin_f16_e64 v5.l, v255.h op_sel:[1,0] ; encoding: [0x05,0x08,0xe0,0xd5,0xff,0x01,0x00,0x00]
+
+v_sin_f16_e64 v255.h, -|0xfe0b| clamp div:2
+// GFX12: v_sin_f16_e64 v255.h, -|0xfe0b| op_sel:[0,1] clamp div:2 ; encoding: [0xff,0xc1,0xe0,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00]
 
 v_sin_f32_e64 v5, v1
 // GFX12: v_sin_f32_e64 v5, v1                    ; encoding: [0x05,0x00,0xb5,0xd5,0x01,0x01,0x00,0x00]
@@ -3601,50 +3664,59 @@ v_sqrt_f64_e64 v[5:6], -|src_scc| mul:4
 v_sqrt_f64_e64 v[254:255], 0xaf123456 clamp div:2
 // GFX12: v_sqrt_f64_e64 v[254:255], 0xaf123456 clamp div:2 ; encoding: [0xfe,0x80,0xb4,0xd5,0xff,0x00,0x00,0x18,0x56,0x34,0x12,0xaf]
 
-v_trunc_f16_e64 v5, v1
-// GFX12: v_trunc_f16_e64 v5, v1                  ; encoding: [0x05,0x00,0xdd,0xd5,0x01,0x01,0x00,0x00]
+v_trunc_f16_e64 v5.l, v1.l
+// GFX12: v_trunc_f16_e64 v5.l, v1.l              ; encoding: [0x05,0x00,0xdd,0xd5,0x01,0x01,0x00,0x00]
+
+v_trunc_f16_e64 v5.l, v255.l
+// GFX12: v_trunc_f16_e64 v5.l, v255.l            ; encoding: [0x05,0x00,0xdd,0xd5,0xff,0x01,0x00,0x00]
+
+v_trunc_f16_e64 v5.l, s1
+// GFX12: v_trunc_f16_e64 v5.l, s1                ; encoding: [0x05,0x00,0xdd,0xd5,0x01,0x00,0x00,0x00]
+
+v_trunc_f16_e64 v5.l, s105
+// GFX12: v_trunc_f16_e64 v5.l, s105              ; encoding: [0x05,0x00,0xdd,0xd5,0x69,0x00,0x00,0x00]
 
-v_trunc_f16_e64 v5, v255
-// GFX12: v_trunc_f16_e64 v5, v255                ; encoding: [0x05,0x00,0xdd,0xd5,0xff,0x01,0x00,0x00]
+v_trunc_f16_e64 v5.l, vcc_lo
+// GFX12: v_trunc_f16_e64 v5.l, vcc_lo            ; encoding: [0x05,0x00,0xdd,0xd5,0x6a,0x00,0x00,0x00]
 
-v_trunc_f16_e64 v5, s1
-// GFX12: v_trunc_f16_e64 v5, s1                  ; encoding: [0x05,0x00,0xdd,0xd5,0x01,0x00,0x00,0x00]
+v_trunc_f16_e64 v5.l, vcc_hi
+// GFX12: v_trunc_f16_e64 v5.l, vcc_hi            ; encoding: [0x05,0x00,0xdd,0xd5,0x6b,0x00,0x00,0x00]
 
-v_trunc_f16_e64 v5, s105
-// GFX12: v_trunc_f16_e64 v5, s105                ; encoding: [0x05,0x00,0xdd,0xd5,0x69,0x00,0x00,0x00]
+v_trunc_f16_e64 v5.l, ttmp15
+// GFX12: v_trunc_f16_e64 v5.l, ttmp15            ; encoding: [0x05,0x00,0xdd,0xd5,0x7b,0x00,0x00,0x00]
 
-v_trunc_f16_e64 v5, vcc_lo
-// GFX12: v_trunc_f16_e64 v5, vcc_lo              ; encoding: [0x05,0x00,0xdd,0xd5,0x6a,0x00,0x00,0x00]
+v_trunc_f16_e64 v5.l, m0
+// GFX12: v_trunc_f16_e64 v5.l, m0                ; encoding: [0x05,0x00,0xdd,0xd5,0x7d,0x00,0x00,0x00]
 
-v_trunc_f16_e64 v5, vcc_hi
-// GFX12: v_trunc_f16_e64 v5, vcc_hi              ; encoding: [0x05,0x00,0xdd,0xd5,0x6b,0x00,0x00,0x00]
+v_trunc_f16_e64 v5.l, exec_lo
+// GFX12: v_trunc_f16_e64 v5.l, exec_lo           ; encoding: [0x05,0x00,0xdd,0xd5,0x7e,0x00,0x00,0x00]
 
-v_trunc_f16_e64 v5, ttmp15
-// GFX12: v_trunc_f16_e64 v5, ttmp15              ; encoding: [0x05,0x00,0xdd,0xd5,0x7b,0x00,0x00,0x00]
+v_trunc_f16_e64 v5.l, exec_hi
+// GFX12: v_trunc_f16_e64 v5.l, exec_hi           ; encoding: [0x05,0x00,0xdd,0xd5,0x7f,0x00,0x00,0x00]
 
-v_trunc_f16_e64 v5, m0
-// GFX12: v_trunc_f16_e64 v5, m0                  ; encoding: [0x05,0x00,0xdd,0xd5,0x7d,0x00,0x00,0x00]
+v_trunc_f16_e64 v5.l, null
+// GFX12: v_trunc_f16_e64 v5.l, null              ; encoding: [0x05,0x00,0xdd,0xd5,0x7c,0x00,0x00,0x00]
 
-v_trunc_f16_e64 v5, exec_lo
-// GFX12: v_trunc_f16_e64 v5, exec_lo             ; encoding: [0x05,0x00,0xdd,0xd5,0x7e,0x00,0x00,0x00]
+v_trunc_f16_e64 v5.l, -1
+// GFX12: v_trunc_f16_e64 v5.l, -1                ; encoding: [0x05,0x00,0xdd,0xd5,0xc1,0x00,0x00,0x00]
 
-v_trunc_f16_e64 v5, exec_hi
-// GFX12: v_trunc_f16_e64 v5, exec_hi             ; encoding: [0x05,0x00,0xdd,0xd5,0x7f,0x00,0x00,0x00]
+v_trunc_f16_e64 v5.l, 0.5 mul:2
+// GFX12: v_trunc_f16_e64 v5.l, 0.5 mul:2         ; encoding: [0x05,0x00,0xdd,0xd5,0xf0,0x00,0x00,0x08]
 
-v_trunc_f16_e64 v5, null
-// GFX12: v_trunc_f16_e64 v5, null                ; encoding: [0x05,0x00,0xdd,0xd5,0x7c,0x00,0x00,0x00]
+v_trunc_f16_e64 v5.l, src_scc mul:4
+// GFX12: v_trunc_f16_e64 v5.l, src_scc mul:4     ; encoding: [0x05,0x00,0xdd,0xd5,0xfd,0x00,0x00,0x10]
 
-v_trunc_f16_e64 v5, -1
-// GFX12: v_trunc_f16_e64 v5, -1                  ; encoding: [0x05,0x00,0xdd,0xd5,0xc1,0x00,0x00,0x00]
+v_trunc_f16_e64 v255.l, -|0xfe0b| clamp div:2
+// GFX12: v_trunc_f16_e64 v255.l, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xdd,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00]
 
-v_trunc_f16_e64 v5, 0.5 mul:2
-// GFX12: v_trunc_f16_e64 v5, 0.5 mul:2           ; encoding: [0x05,0x00,0xdd,0xd5,0xf0,0x00,0x00,0x08]
+v_trunc_f16_e64 v5.h, v1.h
+// GFX12: v_trunc_f16_e64 v5.h, v1.h op_sel:[1,1] ; encoding: [0x05,0x48,0xdd,0xd5,0x01,0x01,0x00,0x00]
 
-v_trunc_f16_e64 v5, src_scc mul:4
-// GFX12: v_trunc_f16_e64 v5, src_scc mul:4       ; encoding: [0x05,0x00,0xdd,0xd5,0xfd,0x00,0x00,0x10]
+v_trunc_f16_e64 v5.l, v255.h
+// GFX12: v_trunc_f16_e64 v5.l, v255.h op_sel:[1,0] ; encoding: [0x05,0x08,0xdd,0xd5,0xff,0x01,0x00,0x00]
 
-v_trunc_f16_e64 v255, -|0xfe0b| clamp div:2
-// GFX12: v_trunc_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xdd,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00]
+v_trunc_f16_e64 v255.h, -|0xfe0b| clamp div:2
+// GFX12: v_trunc_f16_e64 v255.h, -|0xfe0b| op_sel:[0,1] clamp div:2 ; encoding: [0xff,0xc1,0xdd,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00]
 
 v_trunc_f32_e64 v5, v1
 // GFX12: v_trunc_f32_e64 v5, v1                  ; encoding: [0x05,0x00,0xa1,0xd5,0x01,0x01,0x00,0x00]
diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp16.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp16.s
index 160bc3f..89102ae 100644
--- a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp16.s
+++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp16.s
@@ -211,47 +211,56 @@ v_clz_i32_u32_e64_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1
 v_clz_i32_u32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
 // GFX12: v_clz_i32_u32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x00,0xb9,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x05,0x30]
 
-v_cos_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0]
-// GFX12: v_cos_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
+v_cos_f16_e64_dpp v5.l, v1.l quad_perm:[3,2,1,0]
+// GFX12: v_cos_f16_e64_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
 
-v_cos_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3]
-// GFX12: v_cos_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
+v_cos_f16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3]
+// GFX12: v_cos_f16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
 
-v_cos_f16_e64_dpp v5, v1 row_mirror
-// GFX12: v_cos_f16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
+v_cos_f16_e64_dpp v5.l, v1.l row_mirror
+// GFX12: v_cos_f16_e64_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
 
-v_cos_f16_e64_dpp v5, v1 row_half_mirror
-// GFX12: v_cos_f16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff]
+v_cos_f16_e64_dpp v5.l, v1.l row_half_mirror
+// GFX12: v_cos_f16_e64_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff]
 
-v_cos_f16_e64_dpp v5, v1 row_shl:1
-// GFX12: v_cos_f16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff]
+v_cos_f16_e64_dpp v5.l, v1.l row_shl:1
+// GFX12: v_cos_f16_e64_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff]
 
-v_cos_f16_e64_dpp v5, v1 row_shl:15
-// GFX12: v_cos_f16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff]
+v_cos_f16_e64_dpp v5.l, v1.l row_shl:15
+// GFX12: v_cos_f16_e64_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff]
 
-v_cos_f16_e64_dpp v5, v1 row_shr:1
-// GFX12: v_cos_f16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff]
+v_cos_f16_e64_dpp v5.l, v1.l row_shr:1
+// GFX12: v_cos_f16_e64_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff]
 
-v_cos_f16_e64_dpp v5, v1 row_shr:15
-// GFX12: v_cos_f16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]
+v_cos_f16_e64_dpp v5.l, v1.l row_shr:15
+// GFX12: v_cos_f16_e64_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]
 
-v_cos_f16_e64_dpp v5, v1 row_ror:1
-// GFX12: v_cos_f16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff]
+v_cos_f16_e64_dpp v5.l, v1.l row_ror:1
+// GFX12: v_cos_f16_e64_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff]
 
-v_cos_f16_e64_dpp v5, v1 row_ror:15
-// GFX12: v_cos_f16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff]
+v_cos_f16_e64_dpp v5.l, v1.l row_ror:15
+// GFX12: v_cos_f16_e64_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff]
 
-v_cos_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
-// GFX12: v_cos_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
+v_cos_f16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf
+// GFX12: v_cos_f16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
 
-v_cos_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1
-// GFX12: v_cos_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
+v_cos_f16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX12: v_cos_f16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
 
-v_cos_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
-// GFX12: v_cos_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13]
+v_cos_f16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: v_cos_f16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13]
 
-v_cos_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
-// GFX12: v_cos_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xe1,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30]
+v_cos_f16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: v_cos_f16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xe1,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30]
+
+v_cos_f16_e64_dpp v5.h, v1.h mul:2 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX12: v_cos_f16_e64_dpp v5.h, v1.h op_sel:[1,1] mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x48,0xe1,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
+
+v_cos_f16_e64_dpp v5.l, v1.h mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: v_cos_f16_e64_dpp v5.l, v1.h op_sel:[1,0] mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x08,0xe1,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13]
+
+v_cos_f16_e64_dpp v255.h, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: v_cos_f16_e64_dpp v255.h, -|v255.l| op_sel:[0,1] clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0xc1,0xe1,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30]
 
 v_cos_f32_e64_dpp v5, v1 quad_perm:[3,2,1,0]
 // GFX12: v_cos_f32_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xb6,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
@@ -1000,47 +1009,50 @@ v_cvt_i32_f32_e64_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1
 v_cvt_i32_f32_e64_dpp v255, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
 // GFX12: v_cvt_i32_f32_e64_dpp v255, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0x88,0xd5,0xfa,0x00,0x00,0x20,0xff,0x6f,0x05,0x30]
 
-v_cvt_i32_i16_e64_dpp v5, v1 quad_perm:[3,2,1,0]
-// GFX12: v_cvt_i32_i16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
+v_cvt_i32_i16_e64_dpp v5, v1.l quad_perm:[3,2,1,0]
+// GFX12: v_cvt_i32_i16_e64_dpp v5, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
 
-v_cvt_i32_i16_e64_dpp v5, v1 quad_perm:[0,1,2,3]
-// GFX12: v_cvt_i32_i16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
+v_cvt_i32_i16_e64_dpp v5, v1.l quad_perm:[0,1,2,3]
+// GFX12: v_cvt_i32_i16_e64_dpp v5, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
 
-v_cvt_i32_i16_e64_dpp v5, v1 row_mirror
-// GFX12: v_cvt_i32_i16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
+v_cvt_i32_i16_e64_dpp v5, v1.l row_mirror
+// GFX12: v_cvt_i32_i16_e64_dpp v5, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
 
-v_cvt_i32_i16_e64_dpp v5, v1 row_half_mirror
-// GFX12: v_cvt_i32_i16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff]
+v_cvt_i32_i16_e64_dpp v5, v1.l row_half_mirror
+// GFX12: v_cvt_i32_i16_e64_dpp v5, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff]
 
-v_cvt_i32_i16_e64_dpp v5, v1 row_shl:1
-// GFX12: v_cvt_i32_i16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff]
+v_cvt_i32_i16_e64_dpp v5, v1.l row_shl:1
+// GFX12: v_cvt_i32_i16_e64_dpp v5, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff]
 
-v_cvt_i32_i16_e64_dpp v5, v1 row_shl:15
-// GFX12: v_cvt_i32_i16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff]
+v_cvt_i32_i16_e64_dpp v5, v1.l row_shl:15
+// GFX12: v_cvt_i32_i16_e64_dpp v5, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff]
 
-v_cvt_i32_i16_e64_dpp v5, v1 row_shr:1
-// GFX12: v_cvt_i32_i16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff]
+v_cvt_i32_i16_e64_dpp v5, v1.l row_shr:1
+// GFX12: v_cvt_i32_i16_e64_dpp v5, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff]
 
-v_cvt_i32_i16_e64_dpp v5, v1 row_shr:15
-// GFX12: v_cvt_i32_i16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]
+v_cvt_i32_i16_e64_dpp v5, v1.l row_shr:15
+// GFX12: v_cvt_i32_i16_e64_dpp v5, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]
 
-v_cvt_i32_i16_e64_dpp v5, v1 row_ror:1
-// GFX12: v_cvt_i32_i16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff]
+v_cvt_i32_i16_e64_dpp v5, v1.l row_ror:1
+// GFX12: v_cvt_i32_i16_e64_dpp v5, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff]
 
-v_cvt_i32_i16_e64_dpp v5, v1 row_ror:15
-// GFX12: v_cvt_i32_i16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff]
+v_cvt_i32_i16_e64_dpp v5, v1.l row_ror:15
+// GFX12: v_cvt_i32_i16_e64_dpp v5, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff]
 
-v_cvt_i32_i16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
-// GFX12: v_cvt_i32_i16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
+v_cvt_i32_i16_e64_dpp v5, v1.l row_share:0 row_mask:0xf bank_mask:0xf
+// GFX12: v_cvt_i32_i16_e64_dpp v5, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
 
-v_cvt_i32_i16_e64_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1
-// GFX12: v_cvt_i32_i16_e64_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x5f,0x01,0x01]
+v_cvt_i32_i16_e64_dpp v5, v1.l row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX12: v_cvt_i32_i16_e64_dpp v5, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x5f,0x01,0x01]
 
-v_cvt_i32_i16_e64_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
-// GFX12: v_cvt_i32_i16_e64_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x60,0x09,0x13]
+v_cvt_i32_i16_e64_dpp v5, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: v_cvt_i32_i16_e64_dpp v5, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x60,0x09,0x13]
 
-v_cvt_i32_i16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
-// GFX12: v_cvt_i32_i16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x05,0x30]
+v_cvt_i32_i16_e64_dpp v255, v255.l row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: v_cvt_i32_i16_e64_dpp v255, v255.l row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x05,0x30]
+
+v_cvt_i32_i16_e64_dpp v255, v255.h row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: v_cvt_i32_i16_e64_dpp v255, v255.h op_sel:[1,0] row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x08,0xea,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x05,0x30]
 
 v_cvt_nearest_i32_f32_e64_dpp v5, v1 quad_perm:[3,2,1,0]
 // GFX12: v_cvt_nearest_i32_f32_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x8c,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
@@ -1363,47 +1375,50 @@ v_cvt_u32_f32_e64_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1
 v_cvt_u32_f32_e64_dpp v255, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
 // GFX12: v_cvt_u32_f32_e64_dpp v255, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0x87,0xd5,0xfa,0x00,0x00,0x20,0xff,0x6f,0x05,0x30]
 
-v_cvt_u32_u16_e64_dpp v5, v1 quad_perm:[3,2,1,0]
-// GFX12: v_cvt_u32_u16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
+v_cvt_u32_u16_e64_dpp v5, v1.l quad_perm:[3,2,1,0]
+// GFX12: v_cvt_u32_u16_e64_dpp v5, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
+
+v_cvt_u32_u16_e64_dpp v5, v1.l quad_perm:[0,1,2,3]
+// GFX12: v_cvt_u32_u16_e64_dpp v5, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
 
-v_cvt_u32_u16_e64_dpp v5, v1 quad_perm:[0,1,2,3]
-// GFX12: v_cvt_u32_u16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
+v_cvt_u32_u16_e64_dpp v5, v1.l row_mirror
+// GFX12: v_cvt_u32_u16_e64_dpp v5, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
 
-v_cvt_u32_u16_e64_dpp v5, v1 row_mirror
-// GFX12: v_cvt_u32_u16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
+v_cvt_u32_u16_e64_dpp v5, v1.l row_half_mirror
+// GFX12: v_cvt_u32_u16_e64_dpp v5, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff]
 
-v_cvt_u32_u16_e64_dpp v5, v1 row_half_mirror
-// GFX12: v_cvt_u32_u16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff]
+v_cvt_u32_u16_e64_dpp v5, v1.l row_shl:1
+// GFX12: v_cvt_u32_u16_e64_dpp v5, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff]
 
-v_cvt_u32_u16_e64_dpp v5, v1 row_shl:1
-// GFX12: v_cvt_u32_u16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff]
+v_cvt_u32_u16_e64_dpp v5, v1.l row_shl:15
+// GFX12: v_cvt_u32_u16_e64_dpp v5, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff]
 
-v_cvt_u32_u16_e64_dpp v5, v1 row_shl:15
-// GFX12: v_cvt_u32_u16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff]
+v_cvt_u32_u16_e64_dpp v5, v1.l row_shr:1
+// GFX12: v_cvt_u32_u16_e64_dpp v5, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff]
 
-v_cvt_u32_u16_e64_dpp v5, v1 row_shr:1
-// GFX12: v_cvt_u32_u16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff]
+v_cvt_u32_u16_e64_dpp v5, v1.l row_shr:15
+// GFX12: v_cvt_u32_u16_e64_dpp v5, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]
 
-v_cvt_u32_u16_e64_dpp v5, v1 row_shr:15
-// GFX12: v_cvt_u32_u16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]
+v_cvt_u32_u16_e64_dpp v5, v1.l row_ror:1
+// GFX12: v_cvt_u32_u16_e64_dpp v5, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff]
 
-v_cvt_u32_u16_e64_dpp v5, v1 row_ror:1
-// GFX12: v_cvt_u32_u16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff]
+v_cvt_u32_u16_e64_dpp v5, v1.l row_ror:15
+// GFX12: v_cvt_u32_u16_e64_dpp v5, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff]
 
-v_cvt_u32_u16_e64_dpp v5, v1 row_ror:15
-// GFX12: v_cvt_u32_u16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff]
+v_cvt_u32_u16_e64_dpp v5, v1.l row_share:0 row_mask:0xf bank_mask:0xf
+// GFX12: v_cvt_u32_u16_e64_dpp v5, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
 
-v_cvt_u32_u16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
-// GFX12: v_cvt_u32_u16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
+v_cvt_u32_u16_e64_dpp v5, v1.l row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX12: v_cvt_u32_u16_e64_dpp v5, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x5f,0x01,0x01]
 
-v_cvt_u32_u16_e64_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1
-// GFX12: v_cvt_u32_u16_e64_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x5f,0x01,0x01]
+v_cvt_u32_u16_e64_dpp v5, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: v_cvt_u32_u16_e64_dpp v5, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x60,0x09,0x13]
 
-v_cvt_u32_u16_e64_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
-// GFX12: v_cvt_u32_u16_e64_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x60,0x09,0x13]
+v_cvt_u32_u16_e64_dpp v255, v255.l row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: v_cvt_u32_u16_e64_dpp v255, v255.l row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x05,0x30]
 
-v_cvt_u32_u16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
-// GFX12: v_cvt_u32_u16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x05,0x30]
+v_cvt_u32_u16_e64_dpp v255, v255.h row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: v_cvt_u32_u16_e64_dpp v255, v255.h op_sel:[1,0] row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x08,0xeb,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x05,0x30]
 
 v_exp_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0]
 // GFX12: v_exp_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd8,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
@@ -1699,47 +1714,56 @@ v_floor_f32_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ct
 v_floor_f32_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
 // GFX12: v_floor_f32_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xa4,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30]
 
-v_fract_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0]
-// GFX12: v_fract_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
+v_fract_f16_e64_dpp v5.l, v1.l quad_perm:[3,2,1,0]
+// GFX12: v_fract_f16_e64_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
+
+v_fract_f16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3]
+// GFX12: v_fract_f16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
 
-v_fract_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3]
-// GFX12: v_fract_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
+v_fract_f16_e64_dpp v5.l, v1.l row_mirror
+// GFX12: v_fract_f16_e64_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
 
-v_fract_f16_e64_dpp v5, v1 row_mirror
-// GFX12: v_fract_f16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
+v_fract_f16_e64_dpp v5.l, v1.l row_half_mirror
+// GFX12: v_fract_f16_e64_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff]
 
-v_fract_f16_e64_dpp v5, v1 row_half_mirror
-// GFX12: v_fract_f16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff]
+v_fract_f16_e64_dpp v5.l, v1.l row_shl:1
+// GFX12: v_fract_f16_e64_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff]
 
-v_fract_f16_e64_dpp v5, v1 row_shl:1
-// GFX12: v_fract_f16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff]
+v_fract_f16_e64_dpp v5.l, v1.l row_shl:15
+// GFX12: v_fract_f16_e64_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff]
 
-v_fract_f16_e64_dpp v5, v1 row_shl:15
-// GFX12: v_fract_f16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff]
+v_fract_f16_e64_dpp v5.l, v1.l row_shr:1
+// GFX12: v_fract_f16_e64_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff]
 
-v_fract_f16_e64_dpp v5, v1 row_shr:1
-// GFX12: v_fract_f16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff]
+v_fract_f16_e64_dpp v5.l, v1.l row_shr:15
+// GFX12: v_fract_f16_e64_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]
 
-v_fract_f16_e64_dpp v5, v1 row_shr:15
-// GFX12: v_fract_f16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]
+v_fract_f16_e64_dpp v5.l, v1.l row_ror:1
+// GFX12: v_fract_f16_e64_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff]
 
-v_fract_f16_e64_dpp v5, v1 row_ror:1
-// GFX12: v_fract_f16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff]
+v_fract_f16_e64_dpp v5.l, v1.l row_ror:15
+// GFX12: v_fract_f16_e64_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff]
 
-v_fract_f16_e64_dpp v5, v1 row_ror:15
-// GFX12: v_fract_f16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff]
+v_fract_f16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf
+// GFX12: v_fract_f16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
 
-v_fract_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
-// GFX12: v_fract_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
+v_fract_f16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX12: v_fract_f16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
 
-v_fract_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1
-// GFX12: v_fract_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
+v_fract_f16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: v_fract_f16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13]
 
-v_fract_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
-// GFX12: v_fract_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13]
+v_fract_f16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: v_fract_f16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xdf,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30]
 
-v_fract_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
-// GFX12: v_fract_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xdf,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30]
+v_fract_f16_e64_dpp v5.h, v1.h mul:2 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX12: v_fract_f16_e64_dpp v5.h, v1.h op_sel:[1,1] mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x48,0xdf,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
+
+v_fract_f16_e64_dpp v5.l, v1.h mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: v_fract_f16_e64_dpp v5.l, v1.h op_sel:[1,0] mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x08,0xdf,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13]
+
+v_fract_f16_e64_dpp v255.h, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: v_fract_f16_e64_dpp v255.h, -|v255.l| op_sel:[0,1] clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0xc1,0xdf,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30]
 
 v_fract_f32_e64_dpp v5, v1 quad_perm:[3,2,1,0]
 // GFX12: v_fract_f32_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xa0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
@@ -1876,47 +1900,56 @@ v_frexp_exp_i32_f32_e64_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_
 v_frexp_exp_i32_f32_e64_dpp v255, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
 // GFX12: v_frexp_exp_i32_f32_e64_dpp v255, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x01,0xbf,0xd5,0xfa,0x00,0x00,0x20,0xff,0x6f,0x05,0x30]
 
-v_frexp_mant_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0]
-// GFX12: v_frexp_mant_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
+v_frexp_mant_f16_e64_dpp v5.l, v1.l quad_perm:[3,2,1,0]
+// GFX12: v_frexp_mant_f16_e64_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
+
+v_frexp_mant_f16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3]
+// GFX12: v_frexp_mant_f16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
 
-v_frexp_mant_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3]
-// GFX12: v_frexp_mant_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
+v_frexp_mant_f16_e64_dpp v5.l, v1.l row_mirror
+// GFX12: v_frexp_mant_f16_e64_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
 
-v_frexp_mant_f16_e64_dpp v5, v1 row_mirror
-// GFX12: v_frexp_mant_f16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
+v_frexp_mant_f16_e64_dpp v5.l, v1.l row_half_mirror
+// GFX12: v_frexp_mant_f16_e64_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff]
 
-v_frexp_mant_f16_e64_dpp v5, v1 row_half_mirror
-// GFX12: v_frexp_mant_f16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff]
+v_frexp_mant_f16_e64_dpp v5.l, v1.l row_shl:1
+// GFX12: v_frexp_mant_f16_e64_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff]
 
-v_frexp_mant_f16_e64_dpp v5, v1 row_shl:1
-// GFX12: v_frexp_mant_f16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff]
+v_frexp_mant_f16_e64_dpp v5.l, v1.l row_shl:15
+// GFX12: v_frexp_mant_f16_e64_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff]
 
-v_frexp_mant_f16_e64_dpp v5, v1 row_shl:15
-// GFX12: v_frexp_mant_f16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff]
+v_frexp_mant_f16_e64_dpp v5.l, v1.l row_shr:1
+// GFX12: v_frexp_mant_f16_e64_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff]
 
-v_frexp_mant_f16_e64_dpp v5, v1 row_shr:1
-// GFX12: v_frexp_mant_f16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff]
+v_frexp_mant_f16_e64_dpp v5.l, v1.l row_shr:15
+// GFX12: v_frexp_mant_f16_e64_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]
 
-v_frexp_mant_f16_e64_dpp v5, v1 row_shr:15
-// GFX12: v_frexp_mant_f16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]
+v_frexp_mant_f16_e64_dpp v5.l, v1.l row_ror:1
+// GFX12: v_frexp_mant_f16_e64_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff]
 
-v_frexp_mant_f16_e64_dpp v5, v1 row_ror:1
-// GFX12: v_frexp_mant_f16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff]
+v_frexp_mant_f16_e64_dpp v5.l, v1.l row_ror:15
+// GFX12: v_frexp_mant_f16_e64_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff]
 
-v_frexp_mant_f16_e64_dpp v5, v1 row_ror:15
-// GFX12: v_frexp_mant_f16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff]
+v_frexp_mant_f16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf
+// GFX12: v_frexp_mant_f16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
 
-v_frexp_mant_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
-// GFX12: v_frexp_mant_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
+v_frexp_mant_f16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX12: v_frexp_mant_f16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
 
-v_frexp_mant_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1
-// GFX12: v_frexp_mant_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
+v_frexp_mant_f16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: v_frexp_mant_f16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13]
 
-v_frexp_mant_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
-// GFX12: v_frexp_mant_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13]
+v_frexp_mant_f16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: v_frexp_mant_f16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xd9,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30]
 
-v_frexp_mant_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
-// GFX12: v_frexp_mant_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xd9,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30]
+v_frexp_mant_f16_e64_dpp v5.h, v1.h mul:2 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX12: v_frexp_mant_f16_e64_dpp v5.h, v1.h op_sel:[1,1] mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x48,0xd9,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
+
+v_frexp_mant_f16_e64_dpp v5.l, v1.h mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: v_frexp_mant_f16_e64_dpp v5.l, v1.h op_sel:[1,0] mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x08,0xd9,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13]
+
+v_frexp_mant_f16_e64_dpp v255.h, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: v_frexp_mant_f16_e64_dpp v255.h, -|v255.l| op_sel:[0,1] clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0xc1,0xd9,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30]
 
 v_frexp_mant_f32_e64_dpp v5, v1 quad_perm:[3,2,1,0]
 // GFX12: v_frexp_mant_f32_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xc0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
@@ -2128,47 +2161,56 @@ v_movrels_b32_e64_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1
 v_movrels_b32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
 // GFX12: v_movrels_b32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x00,0xc3,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x05,0x30]
 
-v_not_b16_e64_dpp v5, v1 quad_perm:[3,2,1,0]
-// GFX12: v_not_b16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
+v_not_b16_e64_dpp v5.l, v1.l quad_perm:[3,2,1,0]
+// GFX12: v_not_b16_e64_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
+
+v_not_b16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3]
+// GFX12: v_not_b16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
 
-v_not_b16_e64_dpp v5, v1 quad_perm:[0,1,2,3]
-// GFX12: v_not_b16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
+v_not_b16_e64_dpp v5.l, v1.l row_mirror
+// GFX12: v_not_b16_e64_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
 
-v_not_b16_e64_dpp v5, v1 row_mirror
-// GFX12: v_not_b16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
+v_not_b16_e64_dpp v5.l, v1.l row_half_mirror
+// GFX12: v_not_b16_e64_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff]
 
-v_not_b16_e64_dpp v5, v1 row_half_mirror
-// GFX12: v_not_b16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff]
+v_not_b16_e64_dpp v5.l, v1.l row_shl:1
+// GFX12: v_not_b16_e64_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff]
 
-v_not_b16_e64_dpp v5, v1 row_shl:1
-// GFX12: v_not_b16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff]
+v_not_b16_e64_dpp v5.l, v1.l row_shl:15
+// GFX12: v_not_b16_e64_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff]
 
-v_not_b16_e64_dpp v5, v1 row_shl:15
-// GFX12: v_not_b16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff]
+v_not_b16_e64_dpp v5.l, v1.l row_shr:1
+// GFX12: v_not_b16_e64_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff]
 
-v_not_b16_e64_dpp v5, v1 row_shr:1
-// GFX12: v_not_b16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff]
+v_not_b16_e64_dpp v5.l, v1.l row_shr:15
+// GFX12: v_not_b16_e64_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]
 
-v_not_b16_e64_dpp v5, v1 row_shr:15
-// GFX12: v_not_b16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]
+v_not_b16_e64_dpp v5.l, v1.l row_ror:1
+// GFX12: v_not_b16_e64_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff]
 
-v_not_b16_e64_dpp v5, v1 row_ror:1
-// GFX12: v_not_b16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff]
+v_not_b16_e64_dpp v5.l, v1.l row_ror:15
+// GFX12: v_not_b16_e64_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff]
 
-v_not_b16_e64_dpp v5, v1 row_ror:15
-// GFX12: v_not_b16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff]
+v_not_b16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf
+// GFX12: v_not_b16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
 
-v_not_b16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
-// GFX12: v_not_b16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
+v_not_b16_e64_dpp v5.l, v1.l row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX12: v_not_b16_e64_dpp v5.l, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x5f,0x01,0x01]
 
-v_not_b16_e64_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1
-// GFX12: v_not_b16_e64_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x5f,0x01,0x01]
+v_not_b16_e64_dpp v5.l, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: v_not_b16_e64_dpp v5.l, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x60,0x09,0x13]
 
-v_not_b16_e64_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
-// GFX12: v_not_b16_e64_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x60,0x09,0x13]
+v_not_b16_e64_dpp v255.l, v255.l row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: v_not_b16_e64_dpp v255.l, v255.l row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x05,0x30]
 
-v_not_b16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
-// GFX12: v_not_b16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x05,0x30]
+v_not_b16_e64_dpp v5.h, v1.h row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX12: v_not_b16_e64_dpp v5.h, v1.h op_sel:[1,1] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x48,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x5f,0x01,0x01]
+
+v_not_b16_e64_dpp v5.l, v1.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: v_not_b16_e64_dpp v5.l, v1.h op_sel:[1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x08,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x60,0x09,0x13]
+
+v_not_b16_e64_dpp v255.h, v255.l row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: v_not_b16_e64_dpp v255.h, v255.l op_sel:[0,1] row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x40,0xe9,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x05,0x30]
 
 v_not_b32_e64_dpp v5, v1 quad_perm:[3,2,1,0]
 // GFX12: v_not_b32_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xb7,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
@@ -2338,47 +2380,56 @@ v_rcp_iflag_f32_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 boun
 v_rcp_iflag_f32_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
 // GFX12: v_rcp_iflag_f32_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xab,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30]
 
-v_rndne_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0]
-// GFX12: v_rndne_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
+v_rndne_f16_e64_dpp v5.l, v1.l quad_perm:[3,2,1,0]
+// GFX12: v_rndne_f16_e64_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
+
+v_rndne_f16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3]
+// GFX12: v_rndne_f16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
 
-v_rndne_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3]
-// GFX12: v_rndne_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
+v_rndne_f16_e64_dpp v5.l, v1.l row_mirror
+// GFX12: v_rndne_f16_e64_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
 
-v_rndne_f16_e64_dpp v5, v1 row_mirror
-// GFX12: v_rndne_f16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
+v_rndne_f16_e64_dpp v5.l, v1.l row_half_mirror
+// GFX12: v_rndne_f16_e64_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff]
 
-v_rndne_f16_e64_dpp v5, v1 row_half_mirror
-// GFX12: v_rndne_f16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff]
+v_rndne_f16_e64_dpp v5.l, v1.l row_shl:1
+// GFX12: v_rndne_f16_e64_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff]
 
-v_rndne_f16_e64_dpp v5, v1 row_shl:1
-// GFX12: v_rndne_f16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff]
+v_rndne_f16_e64_dpp v5.l, v1.l row_shl:15
+// GFX12: v_rndne_f16_e64_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff]
 
-v_rndne_f16_e64_dpp v5, v1 row_shl:15
-// GFX12: v_rndne_f16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff]
+v_rndne_f16_e64_dpp v5.l, v1.l row_shr:1
+// GFX12: v_rndne_f16_e64_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff]
 
-v_rndne_f16_e64_dpp v5, v1 row_shr:1
-// GFX12: v_rndne_f16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff]
+v_rndne_f16_e64_dpp v5.l, v1.l row_shr:15
+// GFX12: v_rndne_f16_e64_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]
 
-v_rndne_f16_e64_dpp v5, v1 row_shr:15
-// GFX12: v_rndne_f16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]
+v_rndne_f16_e64_dpp v5.l, v1.l row_ror:1
+// GFX12: v_rndne_f16_e64_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff]
 
-v_rndne_f16_e64_dpp v5, v1 row_ror:1
-// GFX12: v_rndne_f16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff]
+v_rndne_f16_e64_dpp v5.l, v1.l row_ror:15
+// GFX12: v_rndne_f16_e64_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff]
 
-v_rndne_f16_e64_dpp v5, v1 row_ror:15
-// GFX12: v_rndne_f16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff]
+v_rndne_f16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf
+// GFX12: v_rndne_f16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
 
-v_rndne_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
-// GFX12: v_rndne_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
+v_rndne_f16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX12: v_rndne_f16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
 
-v_rndne_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1
-// GFX12: v_rndne_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
+v_rndne_f16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: v_rndne_f16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13]
 
-v_rndne_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
-// GFX12: v_rndne_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13]
+v_rndne_f16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: v_rndne_f16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xde,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30]
 
-v_rndne_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
-// GFX12: v_rndne_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xde,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30]
+v_rndne_f16_e64_dpp v5.h, v1.h mul:2 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX12: v_rndne_f16_e64_dpp v5.h, v1.h op_sel:[1,1] mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x48,0xde,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
+
+v_rndne_f16_e64_dpp v5.l, v1.h mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: v_rndne_f16_e64_dpp v5.l, v1.h op_sel:[1,0] mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x08,0xde,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13]
+
+v_rndne_f16_e64_dpp v255.h, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: v_rndne_f16_e64_dpp v255.h, -|v255.l| op_sel:[0,1] clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0xc1,0xde,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30]
 
 v_rndne_f32_e64_dpp v5, v1 quad_perm:[3,2,1,0]
 // GFX12: v_rndne_f32_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xa3,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
@@ -2548,47 +2599,59 @@ v_sat_pk_u8_i16_e64_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl
 v_sat_pk_u8_i16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
 // GFX12: v_sat_pk_u8_i16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x05,0x30]
 
-v_sin_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0]
-// GFX12: v_sin_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
+v_sat_pk_u8_i16_e64_dpp v255.h, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: v_sat_pk_u8_i16_e64_dpp v255.h, v255 op_sel:[0,1] row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x40,0xe2,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x05,0x30]
+
+v_sin_f16_e64_dpp v5.l, v1.l quad_perm:[3,2,1,0]
+// GFX12: v_sin_f16_e64_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
 
-v_sin_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3]
-// GFX12: v_sin_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
+v_sin_f16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3]
+// GFX12: v_sin_f16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
 
-v_sin_f16_e64_dpp v5, v1 row_mirror
-// GFX12: v_sin_f16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
+v_sin_f16_e64_dpp v5.l, v1.l row_mirror
+// GFX12: v_sin_f16_e64_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
 
-v_sin_f16_e64_dpp v5, v1 row_half_mirror
-// GFX12: v_sin_f16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff]
+v_sin_f16_e64_dpp v5.l, v1.l row_half_mirror
+// GFX12: v_sin_f16_e64_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff]
 
-v_sin_f16_e64_dpp v5, v1 row_shl:1
-// GFX12: v_sin_f16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff]
+v_sin_f16_e64_dpp v5.l, v1.l row_shl:1
+// GFX12: v_sin_f16_e64_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff]
 
-v_sin_f16_e64_dpp v5, v1 row_shl:15
-// GFX12: v_sin_f16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff]
+v_sin_f16_e64_dpp v5.l, v1.l row_shl:15
+// GFX12: v_sin_f16_e64_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff]
 
-v_sin_f16_e64_dpp v5, v1 row_shr:1
-// GFX12: v_sin_f16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff]
+v_sin_f16_e64_dpp v5.l, v1.l row_shr:1
+// GFX12: v_sin_f16_e64_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff]
 
-v_sin_f16_e64_dpp v5, v1 row_shr:15
-// GFX12: v_sin_f16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]
+v_sin_f16_e64_dpp v5.l, v1.l row_shr:15
+// GFX12: v_sin_f16_e64_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]
 
-v_sin_f16_e64_dpp v5, v1 row_ror:1
-// GFX12: v_sin_f16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff]
+v_sin_f16_e64_dpp v5.l, v1.l row_ror:1
+// GFX12: v_sin_f16_e64_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff]
 
-v_sin_f16_e64_dpp v5, v1 row_ror:15
-// GFX12: v_sin_f16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff]
+v_sin_f16_e64_dpp v5.l, v1.l row_ror:15
+// GFX12: v_sin_f16_e64_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff]
 
-v_sin_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
-// GFX12: v_sin_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
+v_sin_f16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf
+// GFX12: v_sin_f16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
 
-v_sin_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1
-// GFX12: v_sin_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
+v_sin_f16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX12: v_sin_f16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
 
-v_sin_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
-// GFX12: v_sin_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13]
+v_sin_f16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: v_sin_f16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13]
 
-v_sin_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
-// GFX12: v_sin_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xe0,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30]
+v_sin_f16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: v_sin_f16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xe0,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30]
+
+v_sin_f16_e64_dpp v5.h, v1.h mul:2 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX12: v_sin_f16_e64_dpp v5.h, v1.h op_sel:[1,1] mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x48,0xe0,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
+
+v_sin_f16_e64_dpp v5.l, v1.h mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: v_sin_f16_e64_dpp v5.l, v1.h op_sel:[1,0] mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x08,0xe0,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13]
+
+v_sin_f16_e64_dpp v255.h, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: v_sin_f16_e64_dpp v255.h, -|v255.l| op_sel:[0,1] clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0xc1,0xe0,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30]
 
 v_sin_f32_e64_dpp v5, v1 quad_perm:[3,2,1,0]
 // GFX12: v_sin_f32_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xb5,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
@@ -2716,47 +2779,56 @@ v_sqrt_f32_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctr
 v_sqrt_f32_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
 // GFX12: v_sqrt_f32_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xb3,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30]
 
-v_trunc_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0]
-// GFX12: v_trunc_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
+v_trunc_f16_e64_dpp v5.l, v1.l quad_perm:[3,2,1,0]
+// GFX12: v_trunc_f16_e64_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
+
+v_trunc_f16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3]
+// GFX12: v_trunc_f16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
+
+v_trunc_f16_e64_dpp v5.l, v1.l row_mirror
+// GFX12: v_trunc_f16_e64_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
+
+v_trunc_f16_e64_dpp v5.l, v1.l row_half_mirror
+// GFX12: v_trunc_f16_e64_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff]
 
-v_trunc_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3]
-// GFX12: v_trunc_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
+v_trunc_f16_e64_dpp v5.l, v1.l row_shl:1
+// GFX12: v_trunc_f16_e64_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff]
 
-v_trunc_f16_e64_dpp v5, v1 row_mirror
-// GFX12: v_trunc_f16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
+v_trunc_f16_e64_dpp v5.l, v1.l row_shl:15
+// GFX12: v_trunc_f16_e64_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff]
 
-v_trunc_f16_e64_dpp v5, v1 row_half_mirror
-// GFX12: v_trunc_f16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff]
+v_trunc_f16_e64_dpp v5.l, v1.l row_shr:1
+// GFX12: v_trunc_f16_e64_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff]
 
-v_trunc_f16_e64_dpp v5, v1 row_shl:1
-// GFX12: v_trunc_f16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff]
+v_trunc_f16_e64_dpp v5.l, v1.l row_shr:15
+// GFX12: v_trunc_f16_e64_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]
 
-v_trunc_f16_e64_dpp v5, v1 row_shl:15
-// GFX12: v_trunc_f16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff]
+v_trunc_f16_e64_dpp v5.l, v1.l row_ror:1
+// GFX12: v_trunc_f16_e64_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff]
 
-v_trunc_f16_e64_dpp v5, v1 row_shr:1
-// GFX12: v_trunc_f16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff]
+v_trunc_f16_e64_dpp v5.l, v1.l row_ror:15
+// GFX12: v_trunc_f16_e64_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff]
 
-v_trunc_f16_e64_dpp v5, v1 row_shr:15
-// GFX12: v_trunc_f16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]
+v_trunc_f16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf
+// GFX12: v_trunc_f16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
 
-v_trunc_f16_e64_dpp v5, v1 row_ror:1
-// GFX12: v_trunc_f16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff]
+v_trunc_f16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX12: v_trunc_f16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
 
-v_trunc_f16_e64_dpp v5, v1 row_ror:15
-// GFX12: v_trunc_f16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff]
+v_trunc_f16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: v_trunc_f16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13]
 
-v_trunc_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
-// GFX12: v_trunc_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
+v_trunc_f16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: v_trunc_f16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xdd,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30]
 
-v_trunc_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1
-// GFX12: v_trunc_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
+v_trunc_f16_e64_dpp v5.h, v1.h mul:2 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX12: v_trunc_f16_e64_dpp v5.h, v1.h op_sel:[1,1] mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x48,0xdd,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
 
-v_trunc_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
-// GFX12: v_trunc_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13]
+v_trunc_f16_e64_dpp v5.l, v1.h mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: v_trunc_f16_e64_dpp v5.l, v1.h op_sel:[1,0] mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x08,0xdd,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13]
 
-v_trunc_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
-// GFX12: v_trunc_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xdd,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30]
+v_trunc_f16_e64_dpp v255.h, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: v_trunc_f16_e64_dpp v255.h, -|v255.l| op_sel:[0,1] clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0xc1,0xdd,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30]
 
 v_trunc_f32_e64_dpp v5, v1 quad_perm:[3,2,1,0]
 // GFX12: v_trunc_f32_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xa1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp8.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp8.s
index c9ea7cd..1b1a91f 100644
--- a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp8.s
+++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp8.s
@@ -52,17 +52,26 @@ v_clz_i32_u32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
 v_clz_i32_u32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
 // GFX12: v_clz_i32_u32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x00,0xb9,0xd5,0xe9,0x00,0x00,0x00,0xff,0x00,0x00,0x00]
 
-v_cos_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_cos_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe1,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+v_cos_f16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_cos_f16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe1,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
 
-v_cos_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_cos_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe1,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
+v_cos_f16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_cos_f16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe1,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
 
-v_cos_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1
-// GFX12: v_cos_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xe1,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
+v_cos_f16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: v_cos_f16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xe1,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
 
-v_cos_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0
-// GFX12: v_cos_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xe1,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
+v_cos_f16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: v_cos_f16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xe1,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
+
+v_cos_f16_e64_dpp v5.h, v1.h mul:2 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_cos_f16_e64_dpp v5.h, v1.h op_sel:[1,1] mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xe1,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
+
+v_cos_f16_e64_dpp v5.l, v1.h mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: v_cos_f16_e64_dpp v5.l, v1.h op_sel:[1,0] mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x08,0xe1,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
+
+v_cos_f16_e64_dpp v255.h, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: v_cos_f16_e64_dpp v255.h, -|v255.l| op_sel:[0,1] clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0xc1,0xe1,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
 
 v_cos_f32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0]
 // GFX12: v_cos_f32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xb6,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
@@ -316,14 +325,17 @@ v_cvt_i32_f32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
 v_cvt_i32_f32_e64_dpp v255, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0
 // GFX12: v_cvt_i32_f32_e64_dpp v255, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0x88,0xd5,0xe9,0x00,0x00,0x20,0xff,0x00,0x00,0x00]
 
-v_cvt_i32_i16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_cvt_i32_i16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xea,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+v_cvt_i32_i16_e64_dpp v5, v1.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_cvt_i32_i16_e64_dpp v5, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xea,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
 
-v_cvt_i32_i16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
-// GFX12: v_cvt_i32_i16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xea,0xd5,0xea,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+v_cvt_i32_i16_e64_dpp v5, v1.l dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: v_cvt_i32_i16_e64_dpp v5, v1.l dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xea,0xd5,0xea,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
 
-v_cvt_i32_i16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
-// GFX12: v_cvt_i32_i16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x00,0xea,0xd5,0xe9,0x00,0x00,0x00,0xff,0x00,0x00,0x00]
+v_cvt_i32_i16_e64_dpp v255, v255.l dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: v_cvt_i32_i16_e64_dpp v255, v255.l dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x00,0xea,0xd5,0xe9,0x00,0x00,0x00,0xff,0x00,0x00,0x00]
+
+v_cvt_i32_i16_e64_dpp v255, v255.h dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: v_cvt_i32_i16_e64_dpp v255, v255.h op_sel:[1,0] dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x08,0xea,0xd5,0xe9,0x00,0x00,0x00,0xff,0x00,0x00,0x00]
 
 v_cvt_nearest_i32_f32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0]
 // GFX12: v_cvt_nearest_i32_f32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x8c,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
@@ -418,14 +430,17 @@ v_cvt_u32_f32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
 v_cvt_u32_f32_e64_dpp v255, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0
 // GFX12: v_cvt_u32_f32_e64_dpp v255, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0x87,0xd5,0xe9,0x00,0x00,0x20,0xff,0x00,0x00,0x00]
 
-v_cvt_u32_u16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_cvt_u32_u16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xeb,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+v_cvt_u32_u16_e64_dpp v5, v1.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_cvt_u32_u16_e64_dpp v5, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xeb,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+
+v_cvt_u32_u16_e64_dpp v5, v1.l dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: v_cvt_u32_u16_e64_dpp v5, v1.l dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xeb,0xd5,0xea,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
 
-v_cvt_u32_u16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
-// GFX12: v_cvt_u32_u16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xeb,0xd5,0xea,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+v_cvt_u32_u16_e64_dpp v255, v255.l dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: v_cvt_u32_u16_e64_dpp v255, v255.l dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x00,0xeb,0xd5,0xe9,0x00,0x00,0x00,0xff,0x00,0x00,0x00]
 
-v_cvt_u32_u16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
-// GFX12: v_cvt_u32_u16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x00,0xeb,0xd5,0xe9,0x00,0x00,0x00,0xff,0x00,0x00,0x00]
+v_cvt_u32_u16_e64_dpp v255, v255.h dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: v_cvt_u32_u16_e64_dpp v255, v255.h op_sel:[1,0] dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x08,0xeb,0xd5,0xe9,0x00,0x00,0x00,0xff,0x00,0x00,0x00]
 
 v_exp_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0]
 // GFX12: v_exp_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xd8,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
@@ -502,17 +517,26 @@ v_floor_f32_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1
 v_floor_f32_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0
 // GFX12: v_floor_f32_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xa4,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
 
-v_fract_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_fract_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdf,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+v_fract_f16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_fract_f16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdf,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+
+v_fract_f16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_fract_f16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdf,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
 
-v_fract_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_fract_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdf,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
+v_fract_f16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: v_fract_f16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xdf,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
 
-v_fract_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1
-// GFX12: v_fract_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xdf,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
+v_fract_f16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: v_fract_f16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xdf,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
 
-v_fract_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0
-// GFX12: v_fract_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xdf,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
+v_fract_f16_e64_dpp v5.h, v1.h mul:2 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_fract_f16_e64_dpp v5.h, v1.h op_sel:[1,1] mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xdf,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
+
+v_fract_f16_e64_dpp v5.l, v1.h mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: v_fract_f16_e64_dpp v5.l, v1.h op_sel:[1,0] mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x08,0xdf,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
+
+v_fract_f16_e64_dpp v255.h, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: v_fract_f16_e64_dpp v255.h, -|v255.l| op_sel:[0,1] clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0xc1,0xdf,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
 
 v_fract_f32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0]
 // GFX12: v_fract_f32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xa0,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
@@ -553,17 +577,26 @@ v_frexp_exp_i32_f32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
 v_frexp_exp_i32_f32_e64_dpp v255, -|v255| dpp8:[0,0,0,0,0,0,0,0] fi:0
 // GFX12: v_frexp_exp_i32_f32_e64_dpp v255, -|v255| dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x01,0xbf,0xd5,0xe9,0x00,0x00,0x20,0xff,0x00,0x00,0x00]
 
-v_frexp_mant_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_frexp_mant_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xd9,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+v_frexp_mant_f16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_frexp_mant_f16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xd9,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+
+v_frexp_mant_f16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_frexp_mant_f16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xd9,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
 
-v_frexp_mant_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_frexp_mant_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xd9,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
+v_frexp_mant_f16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: v_frexp_mant_f16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xd9,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
 
-v_frexp_mant_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1
-// GFX12: v_frexp_mant_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xd9,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
+v_frexp_mant_f16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: v_frexp_mant_f16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xd9,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
 
-v_frexp_mant_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0
-// GFX12: v_frexp_mant_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xd9,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
+v_frexp_mant_f16_e64_dpp v5.h, v1.h mul:2 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_frexp_mant_f16_e64_dpp v5.h, v1.h op_sel:[1,1] mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xd9,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
+
+v_frexp_mant_f16_e64_dpp v5.l, v1.h mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: v_frexp_mant_f16_e64_dpp v5.l, v1.h op_sel:[1,0] mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x08,0xd9,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
+
+v_frexp_mant_f16_e64_dpp v255.h, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: v_frexp_mant_f16_e64_dpp v255.h, -|v255.l| op_sel:[0,1] clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0xc1,0xd9,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
 
 v_frexp_mant_f32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0]
 // GFX12: v_frexp_mant_f32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xc0,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
@@ -619,14 +652,23 @@ v_movrels_b32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
 v_movrels_b32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
 // GFX12: v_movrels_b32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x00,0xc3,0xd5,0xe9,0x00,0x00,0x00,0xff,0x00,0x00,0x00]
 
-v_not_b16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_not_b16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe9,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+v_not_b16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_not_b16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe9,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+
+v_not_b16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: v_not_b16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xe9,0xd5,0xea,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
 
-v_not_b16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
-// GFX12: v_not_b16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xe9,0xd5,0xea,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+v_not_b16_e64_dpp v255.l, v255.l dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: v_not_b16_e64_dpp v255.l, v255.l dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x00,0xe9,0xd5,0xe9,0x00,0x00,0x00,0xff,0x00,0x00,0x00]
 
-v_not_b16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
-// GFX12: v_not_b16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x00,0xe9,0xd5,0xe9,0x00,0x00,0x00,0xff,0x00,0x00,0x00]
+v_not_b16_e64_dpp v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_not_b16_e64_dpp v5.h, v1.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xe9,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+
+v_not_b16_e64_dpp v5.l, v1.h dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: v_not_b16_e64_dpp v5.l, v1.h op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x08,0xe9,0xd5,0xea,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+
+v_not_b16_e64_dpp v255.h, v255.l dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: v_not_b16_e64_dpp v255.h, v255.l op_sel:[0,1] dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x40,0xe9,0xd5,0xe9,0x00,0x00,0x00,0xff,0x00,0x00,0x00]
 
 v_not_b32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0]
 // GFX12: v_not_b32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xb7,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
@@ -673,17 +715,26 @@ v_rcp_iflag_f32_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1
 v_rcp_iflag_f32_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0
 // GFX12: v_rcp_iflag_f32_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xab,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
 
-v_rndne_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_rndne_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xde,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+v_rndne_f16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_rndne_f16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xde,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+
+v_rndne_f16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_rndne_f16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xde,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
 
-v_rndne_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_rndne_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xde,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
+v_rndne_f16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: v_rndne_f16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xde,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
 
-v_rndne_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1
-// GFX12: v_rndne_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xde,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
+v_rndne_f16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: v_rndne_f16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xde,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
 
-v_rndne_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0
-// GFX12: v_rndne_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xde,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
+v_rndne_f16_e64_dpp v5.h, v1.h mul:2 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_rndne_f16_e64_dpp v5.h, v1.h op_sel:[1,1] mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xde,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
+
+v_rndne_f16_e64_dpp v5.l, v1.h mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: v_rndne_f16_e64_dpp v5.l, v1.h op_sel:[1,0] mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x08,0xde,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
+
+v_rndne_f16_e64_dpp v255.h, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: v_rndne_f16_e64_dpp v255.h, -|v255.l| op_sel:[0,1] clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0xc1,0xde,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
 
 v_rndne_f32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0]
 // GFX12: v_rndne_f32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xa3,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
@@ -730,17 +781,29 @@ v_sat_pk_u8_i16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
 v_sat_pk_u8_i16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
 // GFX12: v_sat_pk_u8_i16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x00,0xe2,0xd5,0xe9,0x00,0x00,0x00,0xff,0x00,0x00,0x00]
 
-v_sin_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_sin_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe0,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+v_sat_pk_u8_i16_e64_dpp v255.h, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: v_sat_pk_u8_i16_e64_dpp v255.h, v255 op_sel:[0,1] dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x40,0xe2,0xd5,0xe9,0x00,0x00,0x00,0xff,0x00,0x00,0x00]
+
+v_sin_f16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_sin_f16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe0,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
 
-v_sin_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_sin_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe0,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
+v_sin_f16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_sin_f16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe0,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
 
-v_sin_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1
-// GFX12: v_sin_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xe0,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
+v_sin_f16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: v_sin_f16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xe0,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
 
-v_sin_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0
-// GFX12: v_sin_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xe0,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
+v_sin_f16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: v_sin_f16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xe0,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
+
+v_sin_f16_e64_dpp v5.h, v1.h mul:2 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_sin_f16_e64_dpp v5.h, v1.h op_sel:[1,1] mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xe0,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
+
+v_sin_f16_e64_dpp v5.l, v1.h mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: v_sin_f16_e64_dpp v5.l, v1.h op_sel:[1,0] mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x08,0xe0,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
+
+v_sin_f16_e64_dpp v255.h, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: v_sin_f16_e64_dpp v255.h, -|v255.l| op_sel:[0,1] clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0xc1,0xe0,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
 
 v_sin_f32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0]
 // GFX12: v_sin_f32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xb5,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
@@ -778,17 +841,26 @@ v_sqrt_f32_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1
 v_sqrt_f32_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0
 // GFX12: v_sqrt_f32_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xb3,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
 
-v_trunc_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_trunc_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdd,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+v_trunc_f16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_trunc_f16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdd,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+
+v_trunc_f16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_trunc_f16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdd,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
+
+v_trunc_f16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: v_trunc_f16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xdd,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
+
+v_trunc_f16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: v_trunc_f16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xdd,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
 
-v_trunc_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_trunc_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdd,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
+v_trunc_f16_e64_dpp v5.h, v1.h mul:2 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_trunc_f16_e64_dpp v5.h, v1.h op_sel:[1,1] mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xdd,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
 
-v_trunc_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1
-// GFX12: v_trunc_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xdd,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
+v_trunc_f16_e64_dpp v5.l, v1.h mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: v_trunc_f16_e64_dpp v5.l, v1.h op_sel:[1,0] mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x08,0xdd,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
 
-v_trunc_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0
-// GFX12: v_trunc_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xdd,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
+v_trunc_f16_e64_dpp v255.h, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: v_trunc_f16_e64_dpp v255.h, -|v255.l| op_sel:[0,1] clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0xc1,0xdd,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
 
 v_trunc_f32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0]
 // GFX12: v_trunc_f32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xa1,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
diff --git a/llvm/test/MC/AsmParser/macro-def-in-instantiation.s b/llvm/test/MC/AsmParser/macro-def-in-instantiation.s
index b6df62e..96ddde82 100644
--- a/llvm/test/MC/AsmParser/macro-def-in-instantiation.s
+++ b/llvm/test/MC/AsmParser/macro-def-in-instantiation.s
@@ -9,22 +9,3 @@
 make_macro .macro,mybyte,a,.byte,\a,.endm
 # CHECK: .byte 42
 mybyte 42
-
-# PR18599
-.macro macro_a
- .macro macro_b
-  .byte 10
-  .macro macro_c
-  .endm
-
-  macro_c
-  .purgem macro_c
- .endm
-
- macro_b
-.endm
-
-# CHECK: .byte 10
-# CHECK: .byte 10
-macro_a
-macro_b
diff --git a/llvm/test/MC/AsmParser/macro-nesting.s b/llvm/test/MC/AsmParser/macro-nesting.s
new file mode 100644
index 0000000..37561a6
--- /dev/null
+++ b/llvm/test/MC/AsmParser/macro-nesting.s
@@ -0,0 +1,45 @@
+# RUN: llvm-mc -triple=x86_64 %s | FileCheck %s
+
+# CHECK:              .globl  inside_at_0
+# CHECK-NEXT:         .globl  inside_plus_0
+# CHECK-NEXT: inside_at_0:
+# CHECK-NEXT: inside_plus_0:
+# CHECK-NEXT:         .globl  after_at_0
+# CHECK-NEXT:         .globl  after_plus_0
+# CHECK-NEXT: after_at_0:
+# CHECK-NEXT: after_plus_0:
+
+.macro outer1
+  .macro inner1
+    .globl inside_at_\@
+    .globl inside_plus_\+
+    inside_at_\@:
+    inside_plus_\+:
+  .endm
+  inner1
+  .globl after_at_\@
+  .globl after_plus_\+
+  after_at_\@:
+  after_plus_\+:
+.endm
+
+outer1
+
+# PR18599
+.macro macro_a
+ .macro macro_b
+  .byte 10
+  .macro macro_c
+  .endm
+
+  macro_c
+  .purgem macro_c
+ .endm
+
+ macro_b
+.endm
+
+# CHECK: .byte 10
+# CHECK: .byte 10
+macro_a
+macro_b
diff --git a/llvm/test/MC/Disassembler/AArch64/armv9.6a-rme-gpc3.txt b/llvm/test/MC/Disassembler/AArch64/armv9.6a-rme-gpc3.txt
index c5d074b..d198771 100644
--- a/llvm/test/MC/Disassembler/AArch64/armv9.6a-rme-gpc3.txt
+++ b/llvm/test/MC/Disassembler/AArch64/armv9.6a-rme-gpc3.txt
@@ -1,10 +1,18 @@
 # NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
 # RUN: llvm-mc -triple aarch64 -disassemble %s  | FileCheck %s
 
-[0x1f,0x70,0x0e,0xd5]
+[0x00,0x70,0x0e,0xd5]
+[0x01,0x70,0x0e,0xd5]
+[0x02,0x70,0x0e,0xd5]
+[0x11,0x70,0x0e,0xd5]
+[0x1e,0x70,0x0e,0xd5]
 [0xa3,0x21,0x3e,0xd5]
 [0xa4,0x21,0x1e,0xd5]
 
-# CHECK:      	sys	#6, c7, c0, #0
+# CHECK:      	sys	#6, c7, c0, #0, x0
+# CHECK-NEXT: 	sys	#6, c7, c0, #0, x1
+# CHECK-NEXT: 	sys	#6, c7, c0, #0, x2
+# CHECK-NEXT: 	sys	#6, c7, c0, #0, x17
+# CHECK-NEXT: 	sys	#6, c7, c0, #0, x30
 # CHECK-NEXT: 	mrs	x3, GPCBW_EL3
 # CHECK-NEXT: 	msr	GPCBW_EL3, x4
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx10_smem.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx10_smem.txt
index 890a64b..9530167 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx10_smem.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx10_smem.txt
@@ -1261,3 +1261,18 @@
 
 # GFX10: s_store_dwordx4 s[96:99], s[4:5], s0    ; encoding: [0x02,0x18,0x48,0xf4,0x00,0x00,0x00,0x00]
 0x02,0x18,0x48,0xf4,0x00,0x00,0x00,0x00
+
+# GFX10: s_load_dword null, s[2:3], s0           ; encoding: [0x41,0x1f,0x00,0xf4,0x00,0x00,0x00,0x00]
+0x41,0x1f,0x00,0xf4,0x00,0x00,0x00,0x00
+
+# GFX10: s_load_dwordx2 null, s[2:3], s0         ; encoding: [0x41,0x1f,0x04,0xf4,0x00,0x00,0x00,0x00]
+0x41,0x1f,0x04,0xf4,0x00,0x00,0x00,0x00
+
+# GFX10: s_load_dwordx4 null, s[2:3], s0         ; encoding: [0x41,0x1f,0x08,0xf4,0x00,0x00,0x00,0x00]
+0x41,0x1f,0x08,0xf4,0x00,0x00,0x00,0x00
+
+# GFX10: s_load_dwordx8 null, s[2:3], s0         ; encoding: [0x41,0x1f,0x0c,0xf4,0x00,0x00,0x00,0x00]
+0x41,0x1f,0x0c,0xf4,0x00,0x00,0x00,0x00
+
+# GFX10: s_load_dwordx16 null, s[2:3], s0        ; encoding: [0x41,0x1f,0x10,0xf4,0x00,0x00,0x00,0x00]
+0x41,0x1f,0x10,0xf4,0x00,0x00,0x00,0x00
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_smem.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_smem.txt
index 8b49de5..8396132 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_smem.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_smem.txt
@@ -471,3 +471,18 @@
 
 # GFX11: s_gl1_inv                               ; encoding: [0x00,0x00,0x80,0xf4,0x00,0x00,0x00,0x00]
 0x00,0x00,0x80,0xf4,0x00,0x00,0x00,0x00
+
+# GFX11: s_load_b32 null, s[2:3], s0             ; encoding: [0x01,0x1f,0x00,0xf4,0x00,0x00,0x00,0x00]
+0x01,0x1f,0x00,0xf4,0x00,0x00,0x00,0x00
+
+# GFX11: s_load_b64 null, s[2:3], s0             ; encoding: [0x01,0x1f,0x04,0xf4,0x00,0x00,0x00,0x00]
+0x01,0x1f,0x04,0xf4,0x00,0x00,0x00,0x00
+
+# GFX11: s_load_b128 null, s[2:3], s0            ; encoding: [0x01,0x1f,0x08,0xf4,0x00,0x00,0x00,0x00]
+0x01,0x1f,0x08,0xf4,0x00,0x00,0x00,0x00
+
+# GFX11: s_load_b256 null, s[2:3], s0            ; encoding: [0x01,0x1f,0x0c,0xf4,0x00,0x00,0x00,0x00]
+0x01,0x1f,0x0c,0xf4,0x00,0x00,0x00,0x00
+
+# GFX11: s_load_b512 null, s[2:3], s0            ; encoding: [0x01,0x1f,0x10,0xf4,0x00,0x00,0x00,0x00]
+0x01,0x1f,0x10,0xf4,0x00,0x00,0x00,0x00
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1.txt
index 61e529a..57a1da6 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1.txt
@@ -287,49 +287,82 @@
 # GFX11: v_clz_i32_u32_e32 v255, 0xaf123456      ; encoding: [0xff,0x72,0xfe,0x7f,0x56,0x34,0x12,0xaf]
 
 0x01,0xc3,0x0a,0x7e
-# GFX11: v_cos_f16_e32 v5, v1                    ; encoding: [0x01,0xc3,0x0a,0x7e]
+# GFX11-REAL16: v_cos_f16_e32 v5.l, v1.l                ; encoding: [0x01,0xc3,0x0a,0x7e]
+# GFX11-FAKE16: v_cos_f16_e32 v5, v1                    ; encoding: [0x01,0xc3,0x0a,0x7e]
 
 0x7f,0xc3,0x0a,0x7e
-# GFX11: v_cos_f16_e32 v5, v127                  ; encoding: [0x7f,0xc3,0x0a,0x7e]
+# GFX11-REAL16: v_cos_f16_e32 v5.l, v127.l              ; encoding: [0x7f,0xc3,0x0a,0x7e]
+# GFX11-FAKE16: v_cos_f16_e32 v5, v127                  ; encoding: [0x7f,0xc3,0x0a,0x7e]
 
 0x01,0xc2,0x0a,0x7e
-# GFX11: v_cos_f16_e32 v5, s1                    ; encoding: [0x01,0xc2,0x0a,0x7e]
+# GFX11-REAL16: v_cos_f16_e32 v5.l, s1                  ; encoding: [0x01,0xc2,0x0a,0x7e]
+# GFX11-FAKE16: v_cos_f16_e32 v5, s1                    ; encoding: [0x01,0xc2,0x0a,0x7e]
 
 0x69,0xc2,0x0a,0x7e
-# GFX11: v_cos_f16_e32 v5, s105                  ; encoding: [0x69,0xc2,0x0a,0x7e]
+# GFX11-REAL16: v_cos_f16_e32 v5.l, s105                ; encoding: [0x69,0xc2,0x0a,0x7e]
+# GFX11-FAKE16: v_cos_f16_e32 v5, s105                  ; encoding: [0x69,0xc2,0x0a,0x7e]
 
 0x6a,0xc2,0x0a,0x7e
-# GFX11: v_cos_f16_e32 v5, vcc_lo                ; encoding: [0x6a,0xc2,0x0a,0x7e]
+# GFX11-REAL16: v_cos_f16_e32 v5.l, vcc_lo              ; encoding: [0x6a,0xc2,0x0a,0x7e]
+# GFX11-FAKE16: v_cos_f16_e32 v5, vcc_lo                ; encoding: [0x6a,0xc2,0x0a,0x7e]
 
 0x6b,0xc2,0x0a,0x7e
-# GFX11: v_cos_f16_e32 v5, vcc_hi                ; encoding: [0x6b,0xc2,0x0a,0x7e]
+# GFX11-REAL16: v_cos_f16_e32 v5.l, vcc_hi              ; encoding: [0x6b,0xc2,0x0a,0x7e]
+# GFX11-FAKE16: v_cos_f16_e32 v5, vcc_hi                ; encoding: [0x6b,0xc2,0x0a,0x7e]
 
 0x7b,0xc2,0x0a,0x7e
-# GFX11: v_cos_f16_e32 v5, ttmp15                ; encoding: [0x7b,0xc2,0x0a,0x7e]
+# GFX11-REAL16: v_cos_f16_e32 v5.l, ttmp15              ; encoding: [0x7b,0xc2,0x0a,0x7e]
+# GFX11-FAKE16: v_cos_f16_e32 v5, ttmp15                ; encoding: [0x7b,0xc2,0x0a,0x7e]
 
 0x7d,0xc2,0x0a,0x7e
-# GFX11: v_cos_f16_e32 v5, m0                    ; encoding: [0x7d,0xc2,0x0a,0x7e]
+# GFX11-REAL16: v_cos_f16_e32 v5.l, m0                  ; encoding: [0x7d,0xc2,0x0a,0x7e]
+# GFX11-FAKE16: v_cos_f16_e32 v5, m0                    ; encoding: [0x7d,0xc2,0x0a,0x7e]
 
 0x7e,0xc2,0x0a,0x7e
-# GFX11: v_cos_f16_e32 v5, exec_lo               ; encoding: [0x7e,0xc2,0x0a,0x7e]
+# GFX11-REAL16: v_cos_f16_e32 v5.l, exec_lo             ; encoding: [0x7e,0xc2,0x0a,0x7e]
+# GFX11-FAKE16: v_cos_f16_e32 v5, exec_lo               ; encoding: [0x7e,0xc2,0x0a,0x7e]
 
 0x7f,0xc2,0x0a,0x7e
-# GFX11: v_cos_f16_e32 v5, exec_hi               ; encoding: [0x7f,0xc2,0x0a,0x7e]
+# GFX11-REAL16: v_cos_f16_e32 v5.l, exec_hi             ; encoding: [0x7f,0xc2,0x0a,0x7e]
+# GFX11-FAKE16: v_cos_f16_e32 v5, exec_hi               ; encoding: [0x7f,0xc2,0x0a,0x7e]
 
 0x7c,0xc2,0x0a,0x7e
-# GFX11: v_cos_f16_e32 v5, null                  ; encoding: [0x7c,0xc2,0x0a,0x7e]
+# GFX11-REAL16: v_cos_f16_e32 v5.l, null                ; encoding: [0x7c,0xc2,0x0a,0x7e]
+# GFX11-FAKE16: v_cos_f16_e32 v5, null                  ; encoding: [0x7c,0xc2,0x0a,0x7e]
 
 0xc1,0xc2,0x0a,0x7e
-# GFX11: v_cos_f16_e32 v5, -1                    ; encoding: [0xc1,0xc2,0x0a,0x7e]
+# GFX11-REAL16: v_cos_f16_e32 v5.l, -1                  ; encoding: [0xc1,0xc2,0x0a,0x7e]
+# GFX11-FAKE16: v_cos_f16_e32 v5, -1                    ; encoding: [0xc1,0xc2,0x0a,0x7e]
 
 0xf0,0xc2,0x0a,0x7e
-# GFX11: v_cos_f16_e32 v5, 0.5                   ; encoding: [0xf0,0xc2,0x0a,0x7e]
+# GFX11-REAL16: v_cos_f16_e32 v5.l, 0.5                 ; encoding: [0xf0,0xc2,0x0a,0x7e]
+# GFX11-FAKE16: v_cos_f16_e32 v5, 0.5                   ; encoding: [0xf0,0xc2,0x0a,0x7e]
 
 0xfd,0xc2,0x0a,0x7e
-# GFX11: v_cos_f16_e32 v5, src_scc               ; encoding: [0xfd,0xc2,0x0a,0x7e]
+# GFX11-REAL16: v_cos_f16_e32 v5.l, src_scc             ; encoding: [0xfd,0xc2,0x0a,0x7e]
+# GFX11-FAKE16: v_cos_f16_e32 v5, src_scc               ; encoding: [0xfd,0xc2,0x0a,0x7e]
 
 0xff,0xc2,0xfe,0x7e,0x0b,0xfe,0x00,0x00
-# GFX11: v_cos_f16_e32 v127, 0xfe0b              ; encoding: [0xff,0xc2,0xfe,0x7e,0x0b,0xfe,0x00,0x00]
+# GFX11-REAL16: v_cos_f16_e32 v127.l, 0xfe0b            ; encoding: [0xff,0xc2,0xfe,0x7e,0x0b,0xfe,0x00,0x00]
+# GFX11-FAKE16: v_cos_f16_e32 v127, 0xfe0b              ; encoding: [0xff,0xc2,0xfe,0x7e,0x0b,0xfe,0x00,0x00]
+
+0x81,0xc3,0x0a,0x7e
+# GFX11-REAL16: v_cos_f16_e32 v5.l, v1.h                ; encoding: [0x81,0xc3,0x0a,0x7e]
+# GFX11-FAKE16: v_cos_f16_e32 v5, v129/*Invalid register, operand has 'VS_32_Lo128' register class*/ ; encoding: [0x81,0xc3,0x0a,0x7e]
+
+0xff,0xc3,0x0a,0x7e
+# GFX11-REAL16: v_cos_f16_e32 v5.l, v127.h              ; encoding: [0xff,0xc3,0x0a,0x7e]
+# GFX11-FAKE16: v_cos_f16_e32 v5, v255/*Invalid register, operand has 'VS_32_Lo128' register class*/ ; encoding: [0xff,0xc3,0x0a,0x7e]
+
+0xf0,0xc2,0xfe,0x7e
+# GFX11-REAL16: v_cos_f16_e32 v127.l, 0.5               ; encoding: [0xf0,0xc2,0xfe,0x7e]
+# GFX11-FAKE16: v_cos_f16_e32 v127, 0.5                 ; encoding: [0xf0,0xc2,0xfe,0x7e]
+
+0xfd,0xc2,0x0a,0x7f
+# GFX11-REAL16: v_cos_f16_e32 v5.h, src_scc             ; encoding: [0xfd,0xc2,0x0a,0x7f]
+
+0xff,0xc2,0xfe,0x7f,0x0b,0xfe,0x00,0x00
+# GFX11-REAL16: v_cos_f16_e32 v127.h, 0xfe0b            ; encoding: [0xff,0xc2,0xfe,0x7f,0x0b,0xfe,0x00,0x00]
 
 0x01,0x6d,0x0a,0x7e
 # GFX11: v_cos_f32_e32 v5, v1                    ; encoding: [0x01,0x6d,0x0a,0x7e]
@@ -1300,10 +1333,12 @@
 # GFX11: v_cvt_i32_f64_e32 v255, 0xaf123456      ; encoding: [0xff,0x06,0xfe,0x7f,0x56,0x34,0x12,0xaf]
 
 0x01,0xd5,0x0a,0x7e
-# GFX11: v_cvt_i32_i16_e32 v5, v1                ; encoding: [0x01,0xd5,0x0a,0x7e]
+# GFX11-REAL16: v_cvt_i32_i16_e32 v5, v1.l              ; encoding: [0x01,0xd5,0x0a,0x7e]
+# GFX11-FAKE16: v_cvt_i32_i16_e32 v5, v1                ; encoding: [0x01,0xd5,0x0a,0x7e]
 
 0x7f,0xd5,0x0a,0x7e
-# GFX11: v_cvt_i32_i16_e32 v5, v127              ; encoding: [0x7f,0xd5,0x0a,0x7e]
+# GFX11-REAL16: v_cvt_i32_i16_e32 v5, v127.l            ; encoding: [0x7f,0xd5,0x0a,0x7e]
+# GFX11-FAKE16: v_cvt_i32_i16_e32 v5, v127              ; encoding: [0x7f,0xd5,0x0a,0x7e]
 
 0x01,0xd4,0x0a,0x7e
 # GFX11: v_cvt_i32_i16_e32 v5, s1                ; encoding: [0x01,0xd4,0x0a,0x7e]
@@ -1344,6 +1379,15 @@
 0xff,0xd4,0xfe,0x7f,0x0b,0xfe,0x00,0x00
 # GFX11: v_cvt_i32_i16_e32 v255, 0xfe0b          ; encoding: [0xff,0xd4,0xfe,0x7f,0x0b,0xfe,0x00,0x00]
 
+0x81,0xd5,0x0a,0x7e
+# GFX11-REAL16: v_cvt_i32_i16_e32 v5, v1.h              ; encoding: [0x81,0xd5,0x0a,0x7e]
+# GFX11-FAKE16: v_cvt_i32_i16_e32 v5, v129/*Invalid register, operand has 'VS_32_Lo128' register class*/ ; encoding: [0x81,0xd5,0x0a,0x7e]
+
+0xff,0xd5,0x0a,0x7e
+# GFX11-REAL16: v_cvt_i32_i16_e32 v5, v127.h            ; encoding: [0xff,0xd5,0x0a,0x7e]
+# GFX11-FAKE16: v_cvt_i32_i16_e32 v5, v255/*Invalid register, operand has 'VS_32_Lo128' register class*/ ; encoding: [0xff,0xd5,0x0a,0x7e]
+
+
 0x01,0x19,0x0a,0x7e
 # GFX11: v_cvt_nearest_i32_f32_e32 v5, v1        ; encoding: [0x01,0x19,0x0a,0x7e]
 
@@ -1714,10 +1758,12 @@
 # GFX11: v_cvt_u32_f64_e32 v255, 0xaf123456      ; encoding: [0xff,0x2a,0xfe,0x7f,0x56,0x34,0x12,0xaf]
 
 0x01,0xd7,0x0a,0x7e
-# GFX11: v_cvt_u32_u16_e32 v5, v1                ; encoding: [0x01,0xd7,0x0a,0x7e]
+# GFX11-REAL16: v_cvt_u32_u16_e32 v5, v1.l              ; encoding: [0x01,0xd7,0x0a,0x7e]
+# GFX11-FAKE16: v_cvt_u32_u16_e32 v5, v1                ; encoding: [0x01,0xd7,0x0a,0x7e]
 
 0x7f,0xd7,0x0a,0x7e
-# GFX11: v_cvt_u32_u16_e32 v5, v127              ; encoding: [0x7f,0xd7,0x0a,0x7e]
+# GFX11-REAL16: v_cvt_u32_u16_e32 v5, v127.l            ; encoding: [0x7f,0xd7,0x0a,0x7e]
+# GFX11-FAKE16: v_cvt_u32_u16_e32 v5, v127              ; encoding: [0x7f,0xd7,0x0a,0x7e]
 
 0x01,0xd6,0x0a,0x7e
 # GFX11: v_cvt_u32_u16_e32 v5, s1                ; encoding: [0x01,0xd6,0x0a,0x7e]
@@ -1758,6 +1804,15 @@
 0xff,0xd6,0xfe,0x7f,0x0b,0xfe,0x00,0x00
 # GFX11: v_cvt_u32_u16_e32 v255, 0xfe0b          ; encoding: [0xff,0xd6,0xfe,0x7f,0x0b,0xfe,0x00,0x00]
 
+0x81,0xd7,0x0a,0x7e
+# GFX11-REAL16: v_cvt_u32_u16_e32 v5, v1.h              ; encoding: [0x81,0xd7,0x0a,0x7e]
+# GFX11-FAKE16: v_cvt_u32_u16_e32 v5, v129/*Invalid register, operand has 'VS_32_Lo128' register class*/ ; encoding: [0x81,0xd7,0x0a,0x7e]
+
+0xff,0xd7,0x0a,0x7e
+# GFX11-REAL16: v_cvt_u32_u16_e32 v5, v127.h            ; encoding: [0xff,0xd7,0x0a,0x7e]
+# GFX11-FAKE16: v_cvt_u32_u16_e32 v5, v255/*Invalid register, operand has 'VS_32_Lo128' register class*/ ; encoding: [0xff,0xd7,0x0a,0x7e]
+
+
 0x01,0xb1,0x0a,0x7e
 # GFX11-REAL16: v_exp_f16_e32 v5.l, v1.l                ; encoding: [0x01,0xb1,0x0a,0x7e]
 # GFX11-FAKE16: v_exp_f16_e32 v5, v1                    ; encoding: [0x01,0xb1,0x0a,0x7e]
@@ -2017,49 +2072,82 @@
 # GFX11: v_floor_f64_e32 v[254:255], 0xaf123456  ; encoding: [0xff,0x34,0xfc,0x7f,0x56,0x34,0x12,0xaf]
 
 0x01,0xbf,0x0a,0x7e
-# GFX11: v_fract_f16_e32 v5, v1                  ; encoding: [0x01,0xbf,0x0a,0x7e]
+# GFX11-REAL16: v_fract_f16_e32 v5.l, v1.l              ; encoding: [0x01,0xbf,0x0a,0x7e]
+# GFX11-FAKE16: v_fract_f16_e32 v5, v1                  ; encoding: [0x01,0xbf,0x0a,0x7e]
 
 0x7f,0xbf,0x0a,0x7e
-# GFX11: v_fract_f16_e32 v5, v127                ; encoding: [0x7f,0xbf,0x0a,0x7e]
+# GFX11-REAL16: v_fract_f16_e32 v5.l, v127.l            ; encoding: [0x7f,0xbf,0x0a,0x7e]
+# GFX11-FAKE16: v_fract_f16_e32 v5, v127                ; encoding: [0x7f,0xbf,0x0a,0x7e]
 
 0x01,0xbe,0x0a,0x7e
-# GFX11: v_fract_f16_e32 v5, s1                  ; encoding: [0x01,0xbe,0x0a,0x7e]
+# GFX11-REAL16: v_fract_f16_e32 v5.l, s1                ; encoding: [0x01,0xbe,0x0a,0x7e]
+# GFX11-FAKE16: v_fract_f16_e32 v5, s1                  ; encoding: [0x01,0xbe,0x0a,0x7e]
 
 0x69,0xbe,0x0a,0x7e
-# GFX11: v_fract_f16_e32 v5, s105                ; encoding: [0x69,0xbe,0x0a,0x7e]
+# GFX11-REAL16: v_fract_f16_e32 v5.l, s105              ; encoding: [0x69,0xbe,0x0a,0x7e]
+# GFX11-FAKE16: v_fract_f16_e32 v5, s105                ; encoding: [0x69,0xbe,0x0a,0x7e]
 
 0x6a,0xbe,0x0a,0x7e
-# GFX11: v_fract_f16_e32 v5, vcc_lo              ; encoding: [0x6a,0xbe,0x0a,0x7e]
+# GFX11-REAL16: v_fract_f16_e32 v5.l, vcc_lo            ; encoding: [0x6a,0xbe,0x0a,0x7e]
+# GFX11-FAKE16: v_fract_f16_e32 v5, vcc_lo              ; encoding: [0x6a,0xbe,0x0a,0x7e]
 
 0x6b,0xbe,0x0a,0x7e
-# GFX11: v_fract_f16_e32 v5, vcc_hi              ; encoding: [0x6b,0xbe,0x0a,0x7e]
+# GFX11-REAL16: v_fract_f16_e32 v5.l, vcc_hi            ; encoding: [0x6b,0xbe,0x0a,0x7e]
+# GFX11-FAKE16: v_fract_f16_e32 v5, vcc_hi              ; encoding: [0x6b,0xbe,0x0a,0x7e]
 
 0x7b,0xbe,0x0a,0x7e
-# GFX11: v_fract_f16_e32 v5, ttmp15              ; encoding: [0x7b,0xbe,0x0a,0x7e]
+# GFX11-REAL16: v_fract_f16_e32 v5.l, ttmp15            ; encoding: [0x7b,0xbe,0x0a,0x7e]
+# GFX11-FAKE16: v_fract_f16_e32 v5, ttmp15              ; encoding: [0x7b,0xbe,0x0a,0x7e]
 
 0x7d,0xbe,0x0a,0x7e
-# GFX11: v_fract_f16_e32 v5, m0                  ; encoding: [0x7d,0xbe,0x0a,0x7e]
+# GFX11-REAL16: v_fract_f16_e32 v5.l, m0                ; encoding: [0x7d,0xbe,0x0a,0x7e]
+# GFX11-FAKE16: v_fract_f16_e32 v5, m0                  ; encoding: [0x7d,0xbe,0x0a,0x7e]
 
 0x7e,0xbe,0x0a,0x7e
-# GFX11: v_fract_f16_e32 v5, exec_lo             ; encoding: [0x7e,0xbe,0x0a,0x7e]
+# GFX11-REAL16: v_fract_f16_e32 v5.l, exec_lo           ; encoding: [0x7e,0xbe,0x0a,0x7e]
+# GFX11-FAKE16: v_fract_f16_e32 v5, exec_lo             ; encoding: [0x7e,0xbe,0x0a,0x7e]
 
 0x7f,0xbe,0x0a,0x7e
-# GFX11: v_fract_f16_e32 v5, exec_hi             ; encoding: [0x7f,0xbe,0x0a,0x7e]
+# GFX11-REAL16: v_fract_f16_e32 v5.l, exec_hi           ; encoding: [0x7f,0xbe,0x0a,0x7e]
+# GFX11-FAKE16: v_fract_f16_e32 v5, exec_hi             ; encoding: [0x7f,0xbe,0x0a,0x7e]
 
 0x7c,0xbe,0x0a,0x7e
-# GFX11: v_fract_f16_e32 v5, null                ; encoding: [0x7c,0xbe,0x0a,0x7e]
+# GFX11-REAL16: v_fract_f16_e32 v5.l, null              ; encoding: [0x7c,0xbe,0x0a,0x7e]
+# GFX11-FAKE16: v_fract_f16_e32 v5, null                ; encoding: [0x7c,0xbe,0x0a,0x7e]
 
 0xc1,0xbe,0x0a,0x7e
-# GFX11: v_fract_f16_e32 v5, -1                  ; encoding: [0xc1,0xbe,0x0a,0x7e]
+# GFX11-REAL16: v_fract_f16_e32 v5.l, -1                ; encoding: [0xc1,0xbe,0x0a,0x7e]
+# GFX11-FAKE16: v_fract_f16_e32 v5, -1                  ; encoding: [0xc1,0xbe,0x0a,0x7e]
 
 0xf0,0xbe,0x0a,0x7e
-# GFX11: v_fract_f16_e32 v5, 0.5                 ; encoding: [0xf0,0xbe,0x0a,0x7e]
+# GFX11-REAL16: v_fract_f16_e32 v5.l, 0.5               ; encoding: [0xf0,0xbe,0x0a,0x7e]
+# GFX11-FAKE16: v_fract_f16_e32 v5, 0.5                 ; encoding: [0xf0,0xbe,0x0a,0x7e]
 
 0xfd,0xbe,0x0a,0x7e
-# GFX11: v_fract_f16_e32 v5, src_scc             ; encoding: [0xfd,0xbe,0x0a,0x7e]
+# GFX11-REAL16: v_fract_f16_e32 v5.l, src_scc           ; encoding: [0xfd,0xbe,0x0a,0x7e]
+# GFX11-FAKE16: v_fract_f16_e32 v5, src_scc             ; encoding: [0xfd,0xbe,0x0a,0x7e]
 
 0xff,0xbe,0xfe,0x7e,0x0b,0xfe,0x00,0x00
-# GFX11: v_fract_f16_e32 v127, 0xfe0b            ; encoding: [0xff,0xbe,0xfe,0x7e,0x0b,0xfe,0x00,0x00]
+# GFX11-REAL16: v_fract_f16_e32 v127.l, 0xfe0b          ; encoding: [0xff,0xbe,0xfe,0x7e,0x0b,0xfe,0x00,0x00]
+# GFX11-FAKE16: v_fract_f16_e32 v127, 0xfe0b            ; encoding: [0xff,0xbe,0xfe,0x7e,0x0b,0xfe,0x00,0x00]
+
+0x81,0xbf,0x0a,0x7e
+# GFX11-REAL16: v_fract_f16_e32 v5.l, v1.h              ; encoding: [0x81,0xbf,0x0a,0x7e]
+# GFX11-FAKE16: v_fract_f16_e32 v5, v129/*Invalid register, operand has 'VS_32_Lo128' register class*/ ; encoding: [0x81,0xbf,0x0a,0x7e]
+
+0xff,0xbf,0x0a,0x7e
+# GFX11-REAL16: v_fract_f16_e32 v5.l, v127.h            ; encoding: [0xff,0xbf,0x0a,0x7e]
+# GFX11-FAKE16: v_fract_f16_e32 v5, v255/*Invalid register, operand has 'VS_32_Lo128' register class*/ ; encoding: [0xff,0xbf,0x0a,0x7e]
+
+0xf0,0xbe,0xfe,0x7e
+# GFX11-REAL16: v_fract_f16_e32 v127.l, 0.5             ; encoding: [0xf0,0xbe,0xfe,0x7e]
+# GFX11-FAKE16: v_fract_f16_e32 v127, 0.5               ; encoding: [0xf0,0xbe,0xfe,0x7e]
+
+0xfd,0xbe,0x0a,0x7f
+# GFX11-REAL16: v_fract_f16_e32 v5.h, src_scc           ; encoding: [0xfd,0xbe,0x0a,0x7f]
+
+0xff,0xbe,0xfe,0x7f,0x0b,0xfe,0x00,0x00
+# GFX11-REAL16: v_fract_f16_e32 v127.h, 0xfe0b          ; encoding: [0xff,0xbe,0xfe,0x7f,0x0b,0xfe,0x00,0x00]
 
 0x01,0x41,0x0a,0x7e
 # GFX11: v_fract_f32_e32 v5, v1                  ; encoding: [0x01,0x41,0x0a,0x7e]
@@ -2290,49 +2378,82 @@
 # GFX11: v_frexp_exp_i32_f64_e32 v255, 0xaf123456 ; encoding: [0xff,0x78,0xfe,0x7f,0x56,0x34,0x12,0xaf]
 
 0x01,0xb3,0x0a,0x7e
-# GFX11: v_frexp_mant_f16_e32 v5, v1             ; encoding: [0x01,0xb3,0x0a,0x7e]
+# GFX11-REAL16: v_frexp_mant_f16_e32 v5.l, v1.l         ; encoding: [0x01,0xb3,0x0a,0x7e]
+# GFX11-FAKE16: v_frexp_mant_f16_e32 v5, v1             ; encoding: [0x01,0xb3,0x0a,0x7e]
 
 0x7f,0xb3,0x0a,0x7e
-# GFX11: v_frexp_mant_f16_e32 v5, v127           ; encoding: [0x7f,0xb3,0x0a,0x7e]
+# GFX11-REAL16: v_frexp_mant_f16_e32 v5.l, v127.l       ; encoding: [0x7f,0xb3,0x0a,0x7e]
+# GFX11-FAKE16: v_frexp_mant_f16_e32 v5, v127           ; encoding: [0x7f,0xb3,0x0a,0x7e]
 
 0x01,0xb2,0x0a,0x7e
-# GFX11: v_frexp_mant_f16_e32 v5, s1             ; encoding: [0x01,0xb2,0x0a,0x7e]
+# GFX11-REAL16: v_frexp_mant_f16_e32 v5.l, s1           ; encoding: [0x01,0xb2,0x0a,0x7e]
+# GFX11-FAKE16: v_frexp_mant_f16_e32 v5, s1             ; encoding: [0x01,0xb2,0x0a,0x7e]
 
 0x69,0xb2,0x0a,0x7e
-# GFX11: v_frexp_mant_f16_e32 v5, s105           ; encoding: [0x69,0xb2,0x0a,0x7e]
+# GFX11-REAL16: v_frexp_mant_f16_e32 v5.l, s105         ; encoding: [0x69,0xb2,0x0a,0x7e]
+# GFX11-FAKE16: v_frexp_mant_f16_e32 v5, s105           ; encoding: [0x69,0xb2,0x0a,0x7e]
 
 0x6a,0xb2,0x0a,0x7e
-# GFX11: v_frexp_mant_f16_e32 v5, vcc_lo         ; encoding: [0x6a,0xb2,0x0a,0x7e]
+# GFX11-REAL16: v_frexp_mant_f16_e32 v5.l, vcc_lo       ; encoding: [0x6a,0xb2,0x0a,0x7e]
+# GFX11-FAKE16: v_frexp_mant_f16_e32 v5, vcc_lo         ; encoding: [0x6a,0xb2,0x0a,0x7e]
 
 0x6b,0xb2,0x0a,0x7e
-# GFX11: v_frexp_mant_f16_e32 v5, vcc_hi         ; encoding: [0x6b,0xb2,0x0a,0x7e]
+# GFX11-REAL16: v_frexp_mant_f16_e32 v5.l, vcc_hi       ; encoding: [0x6b,0xb2,0x0a,0x7e]
+# GFX11-FAKE16: v_frexp_mant_f16_e32 v5, vcc_hi         ; encoding: [0x6b,0xb2,0x0a,0x7e]
 
 0x7b,0xb2,0x0a,0x7e
-# GFX11: v_frexp_mant_f16_e32 v5, ttmp15         ; encoding: [0x7b,0xb2,0x0a,0x7e]
+# GFX11-REAL16: v_frexp_mant_f16_e32 v5.l, ttmp15       ; encoding: [0x7b,0xb2,0x0a,0x7e]
+# GFX11-FAKE16: v_frexp_mant_f16_e32 v5, ttmp15         ; encoding: [0x7b,0xb2,0x0a,0x7e]
 
 0x7d,0xb2,0x0a,0x7e
-# GFX11: v_frexp_mant_f16_e32 v5, m0             ; encoding: [0x7d,0xb2,0x0a,0x7e]
+# GFX11-REAL16: v_frexp_mant_f16_e32 v5.l, m0           ; encoding: [0x7d,0xb2,0x0a,0x7e]
+# GFX11-FAKE16: v_frexp_mant_f16_e32 v5, m0             ; encoding: [0x7d,0xb2,0x0a,0x7e]
 
 0x7e,0xb2,0x0a,0x7e
-# GFX11: v_frexp_mant_f16_e32 v5, exec_lo        ; encoding: [0x7e,0xb2,0x0a,0x7e]
+# GFX11-REAL16: v_frexp_mant_f16_e32 v5.l, exec_lo      ; encoding: [0x7e,0xb2,0x0a,0x7e]
+# GFX11-FAKE16: v_frexp_mant_f16_e32 v5, exec_lo        ; encoding: [0x7e,0xb2,0x0a,0x7e]
 
 0x7f,0xb2,0x0a,0x7e
-# GFX11: v_frexp_mant_f16_e32 v5, exec_hi        ; encoding: [0x7f,0xb2,0x0a,0x7e]
+# GFX11-REAL16: v_frexp_mant_f16_e32 v5.l, exec_hi      ; encoding: [0x7f,0xb2,0x0a,0x7e]
+# GFX11-FAKE16: v_frexp_mant_f16_e32 v5, exec_hi        ; encoding: [0x7f,0xb2,0x0a,0x7e]
 
 0x7c,0xb2,0x0a,0x7e
-# GFX11: v_frexp_mant_f16_e32 v5, null           ; encoding: [0x7c,0xb2,0x0a,0x7e]
+# GFX11-REAL16: v_frexp_mant_f16_e32 v5.l, null         ; encoding: [0x7c,0xb2,0x0a,0x7e]
+# GFX11-FAKE16: v_frexp_mant_f16_e32 v5, null           ; encoding: [0x7c,0xb2,0x0a,0x7e]
 
 0xc1,0xb2,0x0a,0x7e
-# GFX11: v_frexp_mant_f16_e32 v5, -1             ; encoding: [0xc1,0xb2,0x0a,0x7e]
+# GFX11-REAL16: v_frexp_mant_f16_e32 v5.l, -1           ; encoding: [0xc1,0xb2,0x0a,0x7e]
+# GFX11-FAKE16: v_frexp_mant_f16_e32 v5, -1             ; encoding: [0xc1,0xb2,0x0a,0x7e]
 
 0xf0,0xb2,0x0a,0x7e
-# GFX11: v_frexp_mant_f16_e32 v5, 0.5            ; encoding: [0xf0,0xb2,0x0a,0x7e]
+# GFX11-REAL16: v_frexp_mant_f16_e32 v5.l, 0.5          ; encoding: [0xf0,0xb2,0x0a,0x7e]
+# GFX11-FAKE16: v_frexp_mant_f16_e32 v5, 0.5            ; encoding: [0xf0,0xb2,0x0a,0x7e]
 
 0xfd,0xb2,0x0a,0x7e
-# GFX11: v_frexp_mant_f16_e32 v5, src_scc        ; encoding: [0xfd,0xb2,0x0a,0x7e]
+# GFX11-REAL16: v_frexp_mant_f16_e32 v5.l, src_scc      ; encoding: [0xfd,0xb2,0x0a,0x7e]
+# GFX11-FAKE16: v_frexp_mant_f16_e32 v5, src_scc        ; encoding: [0xfd,0xb2,0x0a,0x7e]
 
 0xff,0xb2,0xfe,0x7e,0x0b,0xfe,0x00,0x00
-# GFX11: v_frexp_mant_f16_e32 v127, 0xfe0b       ; encoding: [0xff,0xb2,0xfe,0x7e,0x0b,0xfe,0x00,0x00]
+# GFX11-REAL16: v_frexp_mant_f16_e32 v127.l, 0xfe0b     ; encoding: [0xff,0xb2,0xfe,0x7e,0x0b,0xfe,0x00,0x00]
+# GFX11-FAKE16: v_frexp_mant_f16_e32 v127, 0xfe0b       ; encoding: [0xff,0xb2,0xfe,0x7e,0x0b,0xfe,0x00,0x00]
+
+0x81,0xb3,0x0a,0x7e
+# GFX11-REAL16: v_frexp_mant_f16_e32 v5.l, v1.h         ; encoding: [0x81,0xb3,0x0a,0x7e]
+# GFX11-FAKE16: v_frexp_mant_f16_e32 v5, v129/*Invalid register, operand has 'VS_32_Lo128' register class*/ ; encoding: [0x81,0xb3,0x0a,0x7e]
+
+0xff,0xb3,0x0a,0x7e
+# GFX11-REAL16: v_frexp_mant_f16_e32 v5.l, v127.h       ; encoding: [0xff,0xb3,0x0a,0x7e]
+# GFX11-FAKE16: v_frexp_mant_f16_e32 v5, v255/*Invalid register, operand has 'VS_32_Lo128' register class*/ ; encoding: [0xff,0xb3,0x0a,0x7e]
+
+0xf0,0xb2,0xfe,0x7e
+# GFX11-REAL16: v_frexp_mant_f16_e32 v127.l, 0.5        ; encoding: [0xf0,0xb2,0xfe,0x7e]
+# GFX11-FAKE16: v_frexp_mant_f16_e32 v127, 0.5          ; encoding: [0xf0,0xb2,0xfe,0x7e]
+
+0xfd,0xb2,0x0a,0x7f
+# GFX11-REAL16: v_frexp_mant_f16_e32 v5.h, src_scc      ; encoding: [0xfd,0xb2,0x0a,0x7f]
+
+0xff,0xb2,0xfe,0x7f,0x0b,0xfe,0x00,0x00
+# GFX11-REAL16: v_frexp_mant_f16_e32 v127.h, 0xfe0b     ; encoding: [0xff,0xb2,0xfe,0x7f,0x0b,0xfe,0x00,0x00]
 
 0x01,0x81,0x0a,0x7e
 # GFX11: v_frexp_mant_f32_e32 v5, v1             ; encoding: [0x01,0x81,0x0a,0x7e]
@@ -2638,49 +2759,82 @@
 # GFX11: v_nop                                   ; encoding: [0x00,0x00,0x00,0x7e]
 
 0x01,0xd3,0x0a,0x7e
-# GFX11: v_not_b16_e32 v5, v1                    ; encoding: [0x01,0xd3,0x0a,0x7e]
+# GFX11-REAL16: v_not_b16_e32 v5.l, v1.l                ; encoding: [0x01,0xd3,0x0a,0x7e]
+# GFX11-FAKE16: v_not_b16_e32 v5, v1                    ; encoding: [0x01,0xd3,0x0a,0x7e]
 
 0x7f,0xd3,0x0a,0x7e
-# GFX11: v_not_b16_e32 v5, v127                  ; encoding: [0x7f,0xd3,0x0a,0x7e]
+# GFX11-REAL16: v_not_b16_e32 v5.l, v127.l              ; encoding: [0x7f,0xd3,0x0a,0x7e]
+# GFX11-FAKE16: v_not_b16_e32 v5, v127                  ; encoding: [0x7f,0xd3,0x0a,0x7e]
 
 0x01,0xd2,0x0a,0x7e
-# GFX11: v_not_b16_e32 v5, s1                    ; encoding: [0x01,0xd2,0x0a,0x7e]
+# GFX11-REAL16: v_not_b16_e32 v5.l, s1                  ; encoding: [0x01,0xd2,0x0a,0x7e]
+# GFX11-FAKE16: v_not_b16_e32 v5, s1                    ; encoding: [0x01,0xd2,0x0a,0x7e]
 
 0x69,0xd2,0x0a,0x7e
-# GFX11: v_not_b16_e32 v5, s105                  ; encoding: [0x69,0xd2,0x0a,0x7e]
+# GFX11-REAL16: v_not_b16_e32 v5.l, s105                ; encoding: [0x69,0xd2,0x0a,0x7e]
+# GFX11-FAKE16: v_not_b16_e32 v5, s105                  ; encoding: [0x69,0xd2,0x0a,0x7e]
 
 0x6a,0xd2,0x0a,0x7e
-# GFX11: v_not_b16_e32 v5, vcc_lo                ; encoding: [0x6a,0xd2,0x0a,0x7e]
+# GFX11-REAL16: v_not_b16_e32 v5.l, vcc_lo              ; encoding: [0x6a,0xd2,0x0a,0x7e]
+# GFX11-FAKE16: v_not_b16_e32 v5, vcc_lo                ; encoding: [0x6a,0xd2,0x0a,0x7e]
 
 0x6b,0xd2,0x0a,0x7e
-# GFX11: v_not_b16_e32 v5, vcc_hi                ; encoding: [0x6b,0xd2,0x0a,0x7e]
+# GFX11-REAL16: v_not_b16_e32 v5.l, vcc_hi              ; encoding: [0x6b,0xd2,0x0a,0x7e]
+# GFX11-FAKE16: v_not_b16_e32 v5, vcc_hi                ; encoding: [0x6b,0xd2,0x0a,0x7e]
 
 0x7b,0xd2,0x0a,0x7e
-# GFX11: v_not_b16_e32 v5, ttmp15                ; encoding: [0x7b,0xd2,0x0a,0x7e]
+# GFX11-REAL16: v_not_b16_e32 v5.l, ttmp15              ; encoding: [0x7b,0xd2,0x0a,0x7e]
+# GFX11-FAKE16: v_not_b16_e32 v5, ttmp15                ; encoding: [0x7b,0xd2,0x0a,0x7e]
 
 0x7d,0xd2,0x0a,0x7e
-# GFX11: v_not_b16_e32 v5, m0                    ; encoding: [0x7d,0xd2,0x0a,0x7e]
+# GFX11-REAL16: v_not_b16_e32 v5.l, m0                  ; encoding: [0x7d,0xd2,0x0a,0x7e]
+# GFX11-FAKE16: v_not_b16_e32 v5, m0                    ; encoding: [0x7d,0xd2,0x0a,0x7e]
 
 0x7e,0xd2,0x0a,0x7e
-# GFX11: v_not_b16_e32 v5, exec_lo               ; encoding: [0x7e,0xd2,0x0a,0x7e]
+# GFX11-REAL16: v_not_b16_e32 v5.l, exec_lo             ; encoding: [0x7e,0xd2,0x0a,0x7e]
+# GFX11-FAKE16: v_not_b16_e32 v5, exec_lo               ; encoding: [0x7e,0xd2,0x0a,0x7e]
 
 0x7f,0xd2,0x0a,0x7e
-# GFX11: v_not_b16_e32 v5, exec_hi               ; encoding: [0x7f,0xd2,0x0a,0x7e]
+# GFX11-REAL16: v_not_b16_e32 v5.l, exec_hi             ; encoding: [0x7f,0xd2,0x0a,0x7e]
+# GFX11-FAKE16: v_not_b16_e32 v5, exec_hi               ; encoding: [0x7f,0xd2,0x0a,0x7e]
 
 0x7c,0xd2,0x0a,0x7e
-# GFX11: v_not_b16_e32 v5, null                  ; encoding: [0x7c,0xd2,0x0a,0x7e]
+# GFX11-REAL16: v_not_b16_e32 v5.l, null                ; encoding: [0x7c,0xd2,0x0a,0x7e]
+# GFX11-FAKE16: v_not_b16_e32 v5, null                  ; encoding: [0x7c,0xd2,0x0a,0x7e]
 
 0xc1,0xd2,0x0a,0x7e
-# GFX11: v_not_b16_e32 v5, -1                    ; encoding: [0xc1,0xd2,0x0a,0x7e]
+# GFX11-REAL16: v_not_b16_e32 v5.l, -1                  ; encoding: [0xc1,0xd2,0x0a,0x7e]
+# GFX11-FAKE16: v_not_b16_e32 v5, -1                    ; encoding: [0xc1,0xd2,0x0a,0x7e]
 
 0xf0,0xd2,0x0a,0x7e
-# GFX11: v_not_b16_e32 v5, 0x3800                ; encoding: [0xff,0xd2,0x0a,0x7e,0x00,0x38,0x00,0x00]
+# GFX11-REAL16: v_not_b16_e32 v5.l, 0x3800              ; encoding: [0xff,0xd2,0x0a,0x7e,0x00,0x38,0x00,0x00]
+# GFX11-FAKE16: v_not_b16_e32 v5, 0x3800                ; encoding: [0xff,0xd2,0x0a,0x7e,0x00,0x38,0x00,0x00]
 
 0xfd,0xd2,0x0a,0x7e
-# GFX11: v_not_b16_e32 v5, src_scc               ; encoding: [0xfd,0xd2,0x0a,0x7e]
+# GFX11-REAL16: v_not_b16_e32 v5.l, src_scc             ; encoding: [0xfd,0xd2,0x0a,0x7e]
+# GFX11-FAKE16: v_not_b16_e32 v5, src_scc               ; encoding: [0xfd,0xd2,0x0a,0x7e]
 
 0xff,0xd2,0xfe,0x7e,0x0b,0xfe,0x00,0x00
-# GFX11: v_not_b16_e32 v127, 0xfe0b              ; encoding: [0xff,0xd2,0xfe,0x7e,0x0b,0xfe,0x00,0x00]
+# GFX11-REAL16: v_not_b16_e32 v127.l, 0xfe0b            ; encoding: [0xff,0xd2,0xfe,0x7e,0x0b,0xfe,0x00,0x00]
+# GFX11-FAKE16: v_not_b16_e32 v127, 0xfe0b              ; encoding: [0xff,0xd2,0xfe,0x7e,0x0b,0xfe,0x00,0x00]
+
+0x81,0xd3,0x0a,0x7e
+# GFX11-REAL16: v_not_b16_e32 v5.l, v1.h                ; encoding: [0x81,0xd3,0x0a,0x7e]
+# GFX11-FAKE16: v_not_b16_e32 v5, v129/*Invalid register, operand has 'VS_32_Lo128' register class*/ ; encoding: [0x81,0xd3,0x0a,0x7e]
+
+0xff,0xd3,0x0a,0x7e
+# GFX11-REAL16: v_not_b16_e32 v5.l, v127.h              ; encoding: [0xff,0xd3,0x0a,0x7e]
+# GFX11-FAKE16: v_not_b16_e32 v5, v255/*Invalid register, operand has 'VS_32_Lo128' register class*/ ; encoding: [0xff,0xd3,0x0a,0x7e]
+
+0xf0,0xd2,0xfe,0x7e
+# GFX11-REAL16: v_not_b16_e32 v127.l, 0x3800            ; encoding: [0xff,0xd2,0xfe,0x7e,0x00,0x38,0x00,0x00]
+# GFX11-FAKE16: v_not_b16_e32 v127, 0x3800              ; encoding: [0xff,0xd2,0xfe,0x7e,0x00,0x38,0x00,0x00]
+
+0xfd,0xd2,0x0a,0x7f
+# GFX11-REAL16: v_not_b16_e32 v5.h, src_scc             ; encoding: [0xfd,0xd2,0x0a,0x7f]
+
+0xff,0xd2,0xfe,0x7f,0x0b,0xfe,0x00,0x00
+# GFX11-REAL16: v_not_b16_e32 v127.h, 0xfe0b            ; encoding: [0xff,0xd2,0xfe,0x7f,0x0b,0xfe,0x00,0x00]
 
 0x01,0x6f,0x0a,0x7e
 # GFX11: v_not_b32_e32 v5, v1                    ; encoding: [0x01,0x6f,0x0a,0x7e]
@@ -2947,49 +3101,82 @@
 # GFX11: v_readfirstlane_b32 null, v255          ; encoding: [0xff,0x05,0xf8,0x7e]
 
 0x01,0xbd,0x0a,0x7e
-# GFX11: v_rndne_f16_e32 v5, v1                  ; encoding: [0x01,0xbd,0x0a,0x7e]
+# GFX11-REAL16: v_rndne_f16_e32 v5.l, v1.l              ; encoding: [0x01,0xbd,0x0a,0x7e]
+# GFX11-FAKE16: v_rndne_f16_e32 v5, v1                  ; encoding: [0x01,0xbd,0x0a,0x7e]
 
 0x7f,0xbd,0x0a,0x7e
-# GFX11: v_rndne_f16_e32 v5, v127                ; encoding: [0x7f,0xbd,0x0a,0x7e]
+# GFX11-REAL16: v_rndne_f16_e32 v5.l, v127.l            ; encoding: [0x7f,0xbd,0x0a,0x7e]
+# GFX11-FAKE16: v_rndne_f16_e32 v5, v127                ; encoding: [0x7f,0xbd,0x0a,0x7e]
 
 0x01,0xbc,0x0a,0x7e
-# GFX11: v_rndne_f16_e32 v5, s1                  ; encoding: [0x01,0xbc,0x0a,0x7e]
+# GFX11-REAL16: v_rndne_f16_e32 v5.l, s1                ; encoding: [0x01,0xbc,0x0a,0x7e]
+# GFX11-FAKE16: v_rndne_f16_e32 v5, s1                  ; encoding: [0x01,0xbc,0x0a,0x7e]
 
 0x69,0xbc,0x0a,0x7e
-# GFX11: v_rndne_f16_e32 v5, s105                ; encoding: [0x69,0xbc,0x0a,0x7e]
+# GFX11-REAL16: v_rndne_f16_e32 v5.l, s105              ; encoding: [0x69,0xbc,0x0a,0x7e]
+# GFX11-FAKE16: v_rndne_f16_e32 v5, s105                ; encoding: [0x69,0xbc,0x0a,0x7e]
 
 0x6a,0xbc,0x0a,0x7e
-# GFX11: v_rndne_f16_e32 v5, vcc_lo              ; encoding: [0x6a,0xbc,0x0a,0x7e]
+# GFX11-REAL16: v_rndne_f16_e32 v5.l, vcc_lo            ; encoding: [0x6a,0xbc,0x0a,0x7e]
+# GFX11-FAKE16: v_rndne_f16_e32 v5, vcc_lo              ; encoding: [0x6a,0xbc,0x0a,0x7e]
 
 0x6b,0xbc,0x0a,0x7e
-# GFX11: v_rndne_f16_e32 v5, vcc_hi              ; encoding: [0x6b,0xbc,0x0a,0x7e]
+# GFX11-REAL16: v_rndne_f16_e32 v5.l, vcc_hi            ; encoding: [0x6b,0xbc,0x0a,0x7e]
+# GFX11-FAKE16: v_rndne_f16_e32 v5, vcc_hi              ; encoding: [0x6b,0xbc,0x0a,0x7e]
 
 0x7b,0xbc,0x0a,0x7e
-# GFX11: v_rndne_f16_e32 v5, ttmp15              ; encoding: [0x7b,0xbc,0x0a,0x7e]
+# GFX11-REAL16: v_rndne_f16_e32 v5.l, ttmp15            ; encoding: [0x7b,0xbc,0x0a,0x7e]
+# GFX11-FAKE16: v_rndne_f16_e32 v5, ttmp15              ; encoding: [0x7b,0xbc,0x0a,0x7e]
 
 0x7d,0xbc,0x0a,0x7e
-# GFX11: v_rndne_f16_e32 v5, m0                  ; encoding: [0x7d,0xbc,0x0a,0x7e]
+# GFX11-REAL16: v_rndne_f16_e32 v5.l, m0                ; encoding: [0x7d,0xbc,0x0a,0x7e]
+# GFX11-FAKE16: v_rndne_f16_e32 v5, m0                  ; encoding: [0x7d,0xbc,0x0a,0x7e]
 
 0x7e,0xbc,0x0a,0x7e
-# GFX11: v_rndne_f16_e32 v5, exec_lo             ; encoding: [0x7e,0xbc,0x0a,0x7e]
+# GFX11-REAL16: v_rndne_f16_e32 v5.l, exec_lo           ; encoding: [0x7e,0xbc,0x0a,0x7e]
+# GFX11-FAKE16: v_rndne_f16_e32 v5, exec_lo             ; encoding: [0x7e,0xbc,0x0a,0x7e]
 
 0x7f,0xbc,0x0a,0x7e
-# GFX11: v_rndne_f16_e32 v5, exec_hi             ; encoding: [0x7f,0xbc,0x0a,0x7e]
+# GFX11-REAL16: v_rndne_f16_e32 v5.l, exec_hi           ; encoding: [0x7f,0xbc,0x0a,0x7e]
+# GFX11-FAKE16: v_rndne_f16_e32 v5, exec_hi             ; encoding: [0x7f,0xbc,0x0a,0x7e]
 
 0x7c,0xbc,0x0a,0x7e
-# GFX11: v_rndne_f16_e32 v5, null                ; encoding: [0x7c,0xbc,0x0a,0x7e]
+# GFX11-REAL16: v_rndne_f16_e32 v5.l, null              ; encoding: [0x7c,0xbc,0x0a,0x7e]
+# GFX11-FAKE16: v_rndne_f16_e32 v5, null                ; encoding: [0x7c,0xbc,0x0a,0x7e]
 
 0xc1,0xbc,0x0a,0x7e
-# GFX11: v_rndne_f16_e32 v5, -1                  ; encoding: [0xc1,0xbc,0x0a,0x7e]
+# GFX11-REAL16: v_rndne_f16_e32 v5.l, -1                ; encoding: [0xc1,0xbc,0x0a,0x7e]
+# GFX11-FAKE16: v_rndne_f16_e32 v5, -1                  ; encoding: [0xc1,0xbc,0x0a,0x7e]
 
 0xf0,0xbc,0x0a,0x7e
-# GFX11: v_rndne_f16_e32 v5, 0.5                 ; encoding: [0xf0,0xbc,0x0a,0x7e]
+# GFX11-REAL16: v_rndne_f16_e32 v5.l, 0.5               ; encoding: [0xf0,0xbc,0x0a,0x7e]
+# GFX11-FAKE16: v_rndne_f16_e32 v5, 0.5                 ; encoding: [0xf0,0xbc,0x0a,0x7e]
 
 0xfd,0xbc,0x0a,0x7e
-# GFX11: v_rndne_f16_e32 v5, src_scc             ; encoding: [0xfd,0xbc,0x0a,0x7e]
+# GFX11-REAL16: v_rndne_f16_e32 v5.l, src_scc           ; encoding: [0xfd,0xbc,0x0a,0x7e]
+# GFX11-FAKE16: v_rndne_f16_e32 v5, src_scc             ; encoding: [0xfd,0xbc,0x0a,0x7e]
 
 0xff,0xbc,0xfe,0x7e,0x0b,0xfe,0x00,0x00
-# GFX11: v_rndne_f16_e32 v127, 0xfe0b            ; encoding: [0xff,0xbc,0xfe,0x7e,0x0b,0xfe,0x00,0x00]
+# GFX11-REAL16: v_rndne_f16_e32 v127.l, 0xfe0b          ; encoding: [0xff,0xbc,0xfe,0x7e,0x0b,0xfe,0x00,0x00]
+# GFX11-FAKE16: v_rndne_f16_e32 v127, 0xfe0b            ; encoding: [0xff,0xbc,0xfe,0x7e,0x0b,0xfe,0x00,0x00]
+
+0x81,0xbd,0x0a,0x7e
+# GFX11-REAL16: v_rndne_f16_e32 v5.l, v1.h              ; encoding: [0x81,0xbd,0x0a,0x7e]
+# GFX11-FAKE16: v_rndne_f16_e32 v5, v129/*Invalid register, operand has 'VS_32_Lo128' register class*/ ; encoding: [0x81,0xbd,0x0a,0x7e]
+
+0xff,0xbd,0x0a,0x7e
+# GFX11-REAL16: v_rndne_f16_e32 v5.l, v127.h            ; encoding: [0xff,0xbd,0x0a,0x7e]
+# GFX11-FAKE16: v_rndne_f16_e32 v5, v255/*Invalid register, operand has 'VS_32_Lo128' register class*/ ; encoding: [0xff,0xbd,0x0a,0x7e]
+
+0xf0,0xbc,0xfe,0x7e
+# GFX11-REAL16: v_rndne_f16_e32 v127.l, 0.5             ; encoding: [0xf0,0xbc,0xfe,0x7e]
+# GFX11-FAKE16: v_rndne_f16_e32 v127, 0.5               ; encoding: [0xf0,0xbc,0xfe,0x7e]
+
+0xfd,0xbc,0x0a,0x7f
+# GFX11-REAL16: v_rndne_f16_e32 v5.h, src_scc           ; encoding: [0xfd,0xbc,0x0a,0x7f]
+
+0xff,0xbc,0xfe,0x7f,0x0b,0xfe,0x00,0x00
+# GFX11-REAL16: v_rndne_f16_e32 v127.h, 0xfe0b          ; encoding: [0xff,0xbc,0xfe,0x7f,0x0b,0xfe,0x00,0x00]
 
 0x01,0x47,0x0a,0x7e
 # GFX11: v_rndne_f32_e32 v5, v1                  ; encoding: [0x01,0x47,0x0a,0x7e]
@@ -3220,94 +3407,152 @@
 # GFX11: v_rsq_f64_e32 v[254:255], 0xaf123456    ; encoding: [0xff,0x62,0xfc,0x7f,0x56,0x34,0x12,0xaf]
 
 0x01,0xc5,0x0a,0x7e
-# GFX11: v_sat_pk_u8_i16_e32 v5, v1              ; encoding: [0x01,0xc5,0x0a,0x7e]
+# GFX11-REAL16: v_sat_pk_u8_i16_e32 v5.l, v1            ; encoding: [0x01,0xc5,0x0a,0x7e]
+# GFX11-FAKE16: v_sat_pk_u8_i16_e32 v5, v1              ; encoding: [0x01,0xc5,0x0a,0x7e]
 
 0xff,0xc5,0x0a,0x7e
-# GFX11: v_sat_pk_u8_i16_e32 v5, v255            ; encoding: [0xff,0xc5,0x0a,0x7e]
+# GFX11-REAL16: v_sat_pk_u8_i16_e32 v5.l, v255          ; encoding: [0xff,0xc5,0x0a,0x7e]
+# GFX11-FAKE16: v_sat_pk_u8_i16_e32 v5, v255            ; encoding: [0xff,0xc5,0x0a,0x7e]
 
 0x01,0xc4,0x0a,0x7e
-# GFX11: v_sat_pk_u8_i16_e32 v5, s1              ; encoding: [0x01,0xc4,0x0a,0x7e]
+# GFX11-REAL16: v_sat_pk_u8_i16_e32 v5.l, s1            ; encoding: [0x01,0xc4,0x0a,0x7e]
+# GFX11-FAKE16: v_sat_pk_u8_i16_e32 v5, s1              ; encoding: [0x01,0xc4,0x0a,0x7e]
 
 0x69,0xc4,0x0a,0x7e
-# GFX11: v_sat_pk_u8_i16_e32 v5, s105            ; encoding: [0x69,0xc4,0x0a,0x7e]
+# GFX11-REAL16: v_sat_pk_u8_i16_e32 v5.l, s105          ; encoding: [0x69,0xc4,0x0a,0x7e]
+# GFX11-FAKE16: v_sat_pk_u8_i16_e32 v5, s105            ; encoding: [0x69,0xc4,0x0a,0x7e]
 
 0x6a,0xc4,0x0a,0x7e
-# GFX11: v_sat_pk_u8_i16_e32 v5, vcc_lo          ; encoding: [0x6a,0xc4,0x0a,0x7e]
+# GFX11-REAL16: v_sat_pk_u8_i16_e32 v5.l, vcc_lo        ; encoding: [0x6a,0xc4,0x0a,0x7e]
+# GFX11-FAKE16: v_sat_pk_u8_i16_e32 v5, vcc_lo          ; encoding: [0x6a,0xc4,0x0a,0x7e]
 
 0x6b,0xc4,0x0a,0x7e
-# GFX11: v_sat_pk_u8_i16_e32 v5, vcc_hi          ; encoding: [0x6b,0xc4,0x0a,0x7e]
+# GFX11-REAL16: v_sat_pk_u8_i16_e32 v5.l, vcc_hi        ; encoding: [0x6b,0xc4,0x0a,0x7e]
+# GFX11-FAKE16: v_sat_pk_u8_i16_e32 v5, vcc_hi          ; encoding: [0x6b,0xc4,0x0a,0x7e]
 
 0x7b,0xc4,0x0a,0x7e
-# GFX11: v_sat_pk_u8_i16_e32 v5, ttmp15          ; encoding: [0x7b,0xc4,0x0a,0x7e]
+# GFX11-REAL16: v_sat_pk_u8_i16_e32 v5.l, ttmp15        ; encoding: [0x7b,0xc4,0x0a,0x7e]
+# GFX11-FAKE16: v_sat_pk_u8_i16_e32 v5, ttmp15          ; encoding: [0x7b,0xc4,0x0a,0x7e]
 
 0x7d,0xc4,0x0a,0x7e
-# GFX11: v_sat_pk_u8_i16_e32 v5, m0              ; encoding: [0x7d,0xc4,0x0a,0x7e]
+# GFX11-REAL16: v_sat_pk_u8_i16_e32 v5.l, m0            ; encoding: [0x7d,0xc4,0x0a,0x7e]
+# GFX11-FAKE16: v_sat_pk_u8_i16_e32 v5, m0              ; encoding: [0x7d,0xc4,0x0a,0x7e]
 
 0x7e,0xc4,0x0a,0x7e
-# GFX11: v_sat_pk_u8_i16_e32 v5, exec_lo         ; encoding: [0x7e,0xc4,0x0a,0x7e]
+# GFX11-REAL16: v_sat_pk_u8_i16_e32 v5.l, exec_lo       ; encoding: [0x7e,0xc4,0x0a,0x7e]
+# GFX11-FAKE16: v_sat_pk_u8_i16_e32 v5, exec_lo         ; encoding: [0x7e,0xc4,0x0a,0x7e]
 
 0x7f,0xc4,0x0a,0x7e
-# GFX11: v_sat_pk_u8_i16_e32 v5, exec_hi         ; encoding: [0x7f,0xc4,0x0a,0x7e]
+# GFX11-REAL16: v_sat_pk_u8_i16_e32 v5.l, exec_hi       ; encoding: [0x7f,0xc4,0x0a,0x7e]
+# GFX11-FAKE16: v_sat_pk_u8_i16_e32 v5, exec_hi         ; encoding: [0x7f,0xc4,0x0a,0x7e]
 
 0x7c,0xc4,0x0a,0x7e
-# GFX11: v_sat_pk_u8_i16_e32 v5, null            ; encoding: [0x7c,0xc4,0x0a,0x7e]
+# GFX11-REAL16: v_sat_pk_u8_i16_e32 v5.l, null          ; encoding: [0x7c,0xc4,0x0a,0x7e]
+# GFX11-FAKE16: v_sat_pk_u8_i16_e32 v5, null            ; encoding: [0x7c,0xc4,0x0a,0x7e]
 
 0xc1,0xc4,0x0a,0x7e
-# GFX11: v_sat_pk_u8_i16_e32 v5, -1              ; encoding: [0xc1,0xc4,0x0a,0x7e]
+# GFX11-REAL16: v_sat_pk_u8_i16_e32 v5.l, -1            ; encoding: [0xc1,0xc4,0x0a,0x7e]
+# GFX11-FAKE16: v_sat_pk_u8_i16_e32 v5, -1              ; encoding: [0xc1,0xc4,0x0a,0x7e]
 
 0xf0,0xc4,0x0a,0x7e
-# GFX11: v_sat_pk_u8_i16_e32 v5, 0.5             ; encoding: [0xf0,0xc4,0x0a,0x7e]
+# GFX11-REAL16: v_sat_pk_u8_i16_e32 v5.l, 0.5           ; encoding: [0xf0,0xc4,0x0a,0x7e]
+# GFX11-FAKE16: v_sat_pk_u8_i16_e32 v5, 0.5             ; encoding: [0xf0,0xc4,0x0a,0x7e]
 
 0xfd,0xc4,0x0a,0x7e
-# GFX11: v_sat_pk_u8_i16_e32 v5, src_scc         ; encoding: [0xfd,0xc4,0x0a,0x7e]
+# GFX11-REAL16: v_sat_pk_u8_i16_e32 v5.l, src_scc       ; encoding: [0xfd,0xc4,0x0a,0x7e]
+# GFX11-FAKE16: v_sat_pk_u8_i16_e32 v5, src_scc         ; encoding: [0xfd,0xc4,0x0a,0x7e]
 
 0xff,0xc4,0xfe,0x7e,0x0b,0xfe,0x00,0x00
-# GFX11: v_sat_pk_u8_i16_e32 v127, 0xfe0b        ; encoding: [0xff,0xc4,0xfe,0x7e,0x0b,0xfe,0x00,0x00]
+# GFX11-REAL16: v_sat_pk_u8_i16_e32 v127.l, 0xfe0b      ; encoding: [0xff,0xc4,0xfe,0x7e,0x0b,0xfe,0x00,0x00]
+# GFX11-FAKE16: v_sat_pk_u8_i16_e32 v127, 0xfe0b        ; encoding: [0xff,0xc4,0xfe,0x7e,0x0b,0xfe,0x00,0x00]
+
+0xf0,0xc4,0xfe,0x7e
+# GFX11-REAL16: v_sat_pk_u8_i16_e32 v127.l, 0.5         ; encoding: [0xf0,0xc4,0xfe,0x7e]
+# GFX11-FAKE16: v_sat_pk_u8_i16_e32 v127, 0.5           ; encoding: [0xf0,0xc4,0xfe,0x7e]
+
+0xfd,0xc4,0x0a,0x7f
+# GFX11-REAL16: v_sat_pk_u8_i16_e32 v5.h, src_scc       ; encoding: [0xfd,0xc4,0x0a,0x7f]
+
+0xff,0xc4,0xfe,0x7f,0x0b,0xfe,0x00,0x00
+# GFX11-REAL16: v_sat_pk_u8_i16_e32 v127.h, 0xfe0b      ; encoding: [0xff,0xc4,0xfe,0x7f,0x0b,0xfe,0x00,0x00]
 
 0x01,0xc1,0x0a,0x7e
-# GFX11: v_sin_f16_e32 v5, v1                    ; encoding: [0x01,0xc1,0x0a,0x7e]
+# GFX11-REAL16: v_sin_f16_e32 v5.l, v1.l                ; encoding: [0x01,0xc1,0x0a,0x7e]
+# GFX11-FAKE16: v_sin_f16_e32 v5, v1                    ; encoding: [0x01,0xc1,0x0a,0x7e]
 
 0x7f,0xc1,0x0a,0x7e
-# GFX11: v_sin_f16_e32 v5, v127                  ; encoding: [0x7f,0xc1,0x0a,0x7e]
+# GFX11-REAL16: v_sin_f16_e32 v5.l, v127.l              ; encoding: [0x7f,0xc1,0x0a,0x7e]
+# GFX11-FAKE16: v_sin_f16_e32 v5, v127                  ; encoding: [0x7f,0xc1,0x0a,0x7e]
 
 0x01,0xc0,0x0a,0x7e
-# GFX11: v_sin_f16_e32 v5, s1                    ; encoding: [0x01,0xc0,0x0a,0x7e]
+# GFX11-REAL16: v_sin_f16_e32 v5.l, s1                  ; encoding: [0x01,0xc0,0x0a,0x7e]
+# GFX11-FAKE16: v_sin_f16_e32 v5, s1                    ; encoding: [0x01,0xc0,0x0a,0x7e]
 
 0x69,0xc0,0x0a,0x7e
-# GFX11: v_sin_f16_e32 v5, s105                  ; encoding: [0x69,0xc0,0x0a,0x7e]
+# GFX11-REAL16: v_sin_f16_e32 v5.l, s105                ; encoding: [0x69,0xc0,0x0a,0x7e]
+# GFX11-FAKE16: v_sin_f16_e32 v5, s105                  ; encoding: [0x69,0xc0,0x0a,0x7e]
 
 0x6a,0xc0,0x0a,0x7e
-# GFX11: v_sin_f16_e32 v5, vcc_lo                ; encoding: [0x6a,0xc0,0x0a,0x7e]
+# GFX11-REAL16: v_sin_f16_e32 v5.l, vcc_lo              ; encoding: [0x6a,0xc0,0x0a,0x7e]
+# GFX11-FAKE16: v_sin_f16_e32 v5, vcc_lo                ; encoding: [0x6a,0xc0,0x0a,0x7e]
 
 0x6b,0xc0,0x0a,0x7e
-# GFX11: v_sin_f16_e32 v5, vcc_hi                ; encoding: [0x6b,0xc0,0x0a,0x7e]
+# GFX11-REAL16: v_sin_f16_e32 v5.l, vcc_hi              ; encoding: [0x6b,0xc0,0x0a,0x7e]
+# GFX11-FAKE16: v_sin_f16_e32 v5, vcc_hi                ; encoding: [0x6b,0xc0,0x0a,0x7e]
 
 0x7b,0xc0,0x0a,0x7e
-# GFX11: v_sin_f16_e32 v5, ttmp15                ; encoding: [0x7b,0xc0,0x0a,0x7e]
+# GFX11-REAL16: v_sin_f16_e32 v5.l, ttmp15              ; encoding: [0x7b,0xc0,0x0a,0x7e]
+# GFX11-FAKE16: v_sin_f16_e32 v5, ttmp15                ; encoding: [0x7b,0xc0,0x0a,0x7e]
 
 0x7d,0xc0,0x0a,0x7e
-# GFX11: v_sin_f16_e32 v5, m0                    ; encoding: [0x7d,0xc0,0x0a,0x7e]
+# GFX11-REAL16: v_sin_f16_e32 v5.l, m0                  ; encoding: [0x7d,0xc0,0x0a,0x7e]
+# GFX11-FAKE16: v_sin_f16_e32 v5, m0                    ; encoding: [0x7d,0xc0,0x0a,0x7e]
 
 0x7e,0xc0,0x0a,0x7e
-# GFX11: v_sin_f16_e32 v5, exec_lo               ; encoding: [0x7e,0xc0,0x0a,0x7e]
+# GFX11-REAL16: v_sin_f16_e32 v5.l, exec_lo             ; encoding: [0x7e,0xc0,0x0a,0x7e]
+# GFX11-FAKE16: v_sin_f16_e32 v5, exec_lo               ; encoding: [0x7e,0xc0,0x0a,0x7e]
 
 0x7f,0xc0,0x0a,0x7e
-# GFX11: v_sin_f16_e32 v5, exec_hi               ; encoding: [0x7f,0xc0,0x0a,0x7e]
+# GFX11-REAL16: v_sin_f16_e32 v5.l, exec_hi             ; encoding: [0x7f,0xc0,0x0a,0x7e]
+# GFX11-FAKE16: v_sin_f16_e32 v5, exec_hi               ; encoding: [0x7f,0xc0,0x0a,0x7e]
 
 0x7c,0xc0,0x0a,0x7e
-# GFX11: v_sin_f16_e32 v5, null                  ; encoding: [0x7c,0xc0,0x0a,0x7e]
+# GFX11-REAL16: v_sin_f16_e32 v5.l, null                ; encoding: [0x7c,0xc0,0x0a,0x7e]
+# GFX11-FAKE16: v_sin_f16_e32 v5, null                  ; encoding: [0x7c,0xc0,0x0a,0x7e]
 
 0xc1,0xc0,0x0a,0x7e
-# GFX11: v_sin_f16_e32 v5, -1                    ; encoding: [0xc1,0xc0,0x0a,0x7e]
+# GFX11-REAL16: v_sin_f16_e32 v5.l, -1                  ; encoding: [0xc1,0xc0,0x0a,0x7e]
+# GFX11-FAKE16: v_sin_f16_e32 v5, -1                    ; encoding: [0xc1,0xc0,0x0a,0x7e]
 
 0xf0,0xc0,0x0a,0x7e
-# GFX11: v_sin_f16_e32 v5, 0.5                   ; encoding: [0xf0,0xc0,0x0a,0x7e]
+# GFX11-REAL16: v_sin_f16_e32 v5.l, 0.5                 ; encoding: [0xf0,0xc0,0x0a,0x7e]
+# GFX11-FAKE16: v_sin_f16_e32 v5, 0.5                   ; encoding: [0xf0,0xc0,0x0a,0x7e]
 
 0xfd,0xc0,0x0a,0x7e
-# GFX11: v_sin_f16_e32 v5, src_scc               ; encoding: [0xfd,0xc0,0x0a,0x7e]
+# GFX11-REAL16: v_sin_f16_e32 v5.l, src_scc             ; encoding: [0xfd,0xc0,0x0a,0x7e]
+# GFX11-FAKE16: v_sin_f16_e32 v5, src_scc               ; encoding: [0xfd,0xc0,0x0a,0x7e]
 
 0xff,0xc0,0xfe,0x7e,0x0b,0xfe,0x00,0x00
-# GFX11: v_sin_f16_e32 v127, 0xfe0b              ; encoding: [0xff,0xc0,0xfe,0x7e,0x0b,0xfe,0x00,0x00]
+# GFX11-REAL16: v_sin_f16_e32 v127.l, 0xfe0b            ; encoding: [0xff,0xc0,0xfe,0x7e,0x0b,0xfe,0x00,0x00]
+# GFX11-FAKE16: v_sin_f16_e32 v127, 0xfe0b              ; encoding: [0xff,0xc0,0xfe,0x7e,0x0b,0xfe,0x00,0x00]
+
+0x81,0xc1,0x0a,0x7e
+# GFX11-REAL16: v_sin_f16_e32 v5.l, v1.h                ; encoding: [0x81,0xc1,0x0a,0x7e]
+# GFX11-FAKE16: v_sin_f16_e32 v5, v129/*Invalid register, operand has 'VS_32_Lo128' register class*/ ; encoding: [0x81,0xc1,0x0a,0x7e]
+
+0xff,0xc1,0x0a,0x7e
+# GFX11-REAL16: v_sin_f16_e32 v5.l, v127.h              ; encoding: [0xff,0xc1,0x0a,0x7e]
+# GFX11-FAKE16: v_sin_f16_e32 v5, v255/*Invalid register, operand has 'VS_32_Lo128' register class*/ ; encoding: [0xff,0xc1,0x0a,0x7e]
+
+0xf0,0xc0,0xfe,0x7e
+# GFX11-REAL16: v_sin_f16_e32 v127.l, 0.5               ; encoding: [0xf0,0xc0,0xfe,0x7e]
+# GFX11-FAKE16: v_sin_f16_e32 v127, 0.5                 ; encoding: [0xf0,0xc0,0xfe,0x7e]
+
+0xfd,0xc0,0x0a,0x7f
+# GFX11-REAL16: v_sin_f16_e32 v5.h, src_scc             ; encoding: [0xfd,0xc0,0x0a,0x7f]
+
+0xff,0xc0,0xfe,0x7f,0x0b,0xfe,0x00,0x00
+# GFX11-REAL16: v_sin_f16_e32 v127.h, 0xfe0b            ; encoding: [0xff,0xc0,0xfe,0x7f,0x0b,0xfe,0x00,0x00]
 
 0x01,0x6b,0x0a,0x7e
 # GFX11: v_sin_f32_e32 v5, v1                    ; encoding: [0x01,0x6b,0x0a,0x7e]
@@ -3520,49 +3765,82 @@
 # GFX11: v_swaprel_b32 v255, v255                ; encoding: [0xff,0xd1,0xfe,0x7f]
 
 0x01,0xbb,0x0a,0x7e
-# GFX11: v_trunc_f16_e32 v5, v1                  ; encoding: [0x01,0xbb,0x0a,0x7e]
+# GFX11-REAL16: v_trunc_f16_e32 v5.l, v1.l              ; encoding: [0x01,0xbb,0x0a,0x7e]
+# GFX11-FAKE16: v_trunc_f16_e32 v5, v1                  ; encoding: [0x01,0xbb,0x0a,0x7e]
 
 0x7f,0xbb,0x0a,0x7e
-# GFX11: v_trunc_f16_e32 v5, v127                ; encoding: [0x7f,0xbb,0x0a,0x7e]
+# GFX11-REAL16: v_trunc_f16_e32 v5.l, v127.l            ; encoding: [0x7f,0xbb,0x0a,0x7e]
+# GFX11-FAKE16: v_trunc_f16_e32 v5, v127                ; encoding: [0x7f,0xbb,0x0a,0x7e]
 
 0x01,0xba,0x0a,0x7e
-# GFX11: v_trunc_f16_e32 v5, s1                  ; encoding: [0x01,0xba,0x0a,0x7e]
+# GFX11-REAL16: v_trunc_f16_e32 v5.l, s1                ; encoding: [0x01,0xba,0x0a,0x7e]
+# GFX11-FAKE16: v_trunc_f16_e32 v5, s1                  ; encoding: [0x01,0xba,0x0a,0x7e]
 
 0x69,0xba,0x0a,0x7e
-# GFX11: v_trunc_f16_e32 v5, s105                ; encoding: [0x69,0xba,0x0a,0x7e]
+# GFX11-REAL16: v_trunc_f16_e32 v5.l, s105              ; encoding: [0x69,0xba,0x0a,0x7e]
+# GFX11-FAKE16: v_trunc_f16_e32 v5, s105                ; encoding: [0x69,0xba,0x0a,0x7e]
 
 0x6a,0xba,0x0a,0x7e
-# GFX11: v_trunc_f16_e32 v5, vcc_lo              ; encoding: [0x6a,0xba,0x0a,0x7e]
+# GFX11-REAL16: v_trunc_f16_e32 v5.l, vcc_lo            ; encoding: [0x6a,0xba,0x0a,0x7e]
+# GFX11-FAKE16: v_trunc_f16_e32 v5, vcc_lo              ; encoding: [0x6a,0xba,0x0a,0x7e]
 
 0x6b,0xba,0x0a,0x7e
-# GFX11: v_trunc_f16_e32 v5, vcc_hi              ; encoding: [0x6b,0xba,0x0a,0x7e]
+# GFX11-REAL16: v_trunc_f16_e32 v5.l, vcc_hi            ; encoding: [0x6b,0xba,0x0a,0x7e]
+# GFX11-FAKE16: v_trunc_f16_e32 v5, vcc_hi              ; encoding: [0x6b,0xba,0x0a,0x7e]
 
 0x7b,0xba,0x0a,0x7e
-# GFX11: v_trunc_f16_e32 v5, ttmp15              ; encoding: [0x7b,0xba,0x0a,0x7e]
+# GFX11-REAL16: v_trunc_f16_e32 v5.l, ttmp15            ; encoding: [0x7b,0xba,0x0a,0x7e]
+# GFX11-FAKE16: v_trunc_f16_e32 v5, ttmp15              ; encoding: [0x7b,0xba,0x0a,0x7e]
 
 0x7d,0xba,0x0a,0x7e
-# GFX11: v_trunc_f16_e32 v5, m0                  ; encoding: [0x7d,0xba,0x0a,0x7e]
+# GFX11-REAL16: v_trunc_f16_e32 v5.l, m0                ; encoding: [0x7d,0xba,0x0a,0x7e]
+# GFX11-FAKE16: v_trunc_f16_e32 v5, m0                  ; encoding: [0x7d,0xba,0x0a,0x7e]
 
 0x7e,0xba,0x0a,0x7e
-# GFX11: v_trunc_f16_e32 v5, exec_lo             ; encoding: [0x7e,0xba,0x0a,0x7e]
+# GFX11-REAL16: v_trunc_f16_e32 v5.l, exec_lo           ; encoding: [0x7e,0xba,0x0a,0x7e]
+# GFX11-FAKE16: v_trunc_f16_e32 v5, exec_lo             ; encoding: [0x7e,0xba,0x0a,0x7e]
 
 0x7f,0xba,0x0a,0x7e
-# GFX11: v_trunc_f16_e32 v5, exec_hi             ; encoding: [0x7f,0xba,0x0a,0x7e]
+# GFX11-REAL16: v_trunc_f16_e32 v5.l, exec_hi           ; encoding: [0x7f,0xba,0x0a,0x7e]
+# GFX11-FAKE16: v_trunc_f16_e32 v5, exec_hi             ; encoding: [0x7f,0xba,0x0a,0x7e]
 
 0x7c,0xba,0x0a,0x7e
-# GFX11: v_trunc_f16_e32 v5, null                ; encoding: [0x7c,0xba,0x0a,0x7e]
+# GFX11-REAL16: v_trunc_f16_e32 v5.l, null              ; encoding: [0x7c,0xba,0x0a,0x7e]
+# GFX11-FAKE16: v_trunc_f16_e32 v5, null                ; encoding: [0x7c,0xba,0x0a,0x7e]
 
 0xc1,0xba,0x0a,0x7e
-# GFX11: v_trunc_f16_e32 v5, -1                  ; encoding: [0xc1,0xba,0x0a,0x7e]
+# GFX11-REAL16: v_trunc_f16_e32 v5.l, -1                ; encoding: [0xc1,0xba,0x0a,0x7e]
+# GFX11-FAKE16: v_trunc_f16_e32 v5, -1                  ; encoding: [0xc1,0xba,0x0a,0x7e]
 
 0xf0,0xba,0x0a,0x7e
-# GFX11: v_trunc_f16_e32 v5, 0.5                 ; encoding: [0xf0,0xba,0x0a,0x7e]
+# GFX11-REAL16: v_trunc_f16_e32 v5.l, 0.5               ; encoding: [0xf0,0xba,0x0a,0x7e]
+# GFX11-FAKE16: v_trunc_f16_e32 v5, 0.5                 ; encoding: [0xf0,0xba,0x0a,0x7e]
 
 0xfd,0xba,0x0a,0x7e
-# GFX11: v_trunc_f16_e32 v5, src_scc             ; encoding: [0xfd,0xba,0x0a,0x7e]
+# GFX11-REAL16: v_trunc_f16_e32 v5.l, src_scc           ; encoding: [0xfd,0xba,0x0a,0x7e]
+# GFX11-FAKE16: v_trunc_f16_e32 v5, src_scc             ; encoding: [0xfd,0xba,0x0a,0x7e]
 
 0xff,0xba,0xfe,0x7e,0x0b,0xfe,0x00,0x00
-# GFX11: v_trunc_f16_e32 v127, 0xfe0b            ; encoding: [0xff,0xba,0xfe,0x7e,0x0b,0xfe,0x00,0x00]
+# GFX11-REAL16: v_trunc_f16_e32 v127.l, 0xfe0b          ; encoding: [0xff,0xba,0xfe,0x7e,0x0b,0xfe,0x00,0x00]
+# GFX11-FAKE16: v_trunc_f16_e32 v127, 0xfe0b            ; encoding: [0xff,0xba,0xfe,0x7e,0x0b,0xfe,0x00,0x00]
+
+0x81,0xbb,0x0a,0x7e
+# GFX11-REAL16: v_trunc_f16_e32 v5.l, v1.h              ; encoding: [0x81,0xbb,0x0a,0x7e]
+# GFX11-FAKE16: v_trunc_f16_e32 v5, v129/*Invalid register, operand has 'VS_32_Lo128' register class*/ ; encoding: [0x81,0xbb,0x0a,0x7e]
+
+0xff,0xbb,0x0a,0x7e
+# GFX11-REAL16: v_trunc_f16_e32 v5.l, v127.h            ; encoding: [0xff,0xbb,0x0a,0x7e]
+# GFX11-FAKE16: v_trunc_f16_e32 v5, v255/*Invalid register, operand has 'VS_32_Lo128' register class*/ ; encoding: [0xff,0xbb,0x0a,0x7e]
+
+0xf0,0xba,0xfe,0x7e
+# GFX11-REAL16: v_trunc_f16_e32 v127.l, 0.5             ; encoding: [0xf0,0xba,0xfe,0x7e]
+# GFX11-FAKE16: v_trunc_f16_e32 v127, 0.5               ; encoding: [0xf0,0xba,0xfe,0x7e]
+
+0xfd,0xba,0x0a,0x7f
+# GFX11-REAL16: v_trunc_f16_e32 v5.h, src_scc           ; encoding: [0xfd,0xba,0x0a,0x7f]
+
+0xff,0xba,0xfe,0x7f,0x0b,0xfe,0x00,0x00
+# GFX11-REAL16: v_trunc_f16_e32 v127.h, 0xfe0b          ; encoding: [0xff,0xba,0xfe,0x7f,0x0b,0xfe,0x00,0x00]
 
 0x01,0x43,0x0a,0x7e
 # GFX11: v_trunc_f32_e32 v5, v1                  ; encoding: [0x01,0x43,0x0a,0x7e]
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1_dpp16.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1_dpp16.txt
index 1075a3e..cabae81 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1_dpp16.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1_dpp16.txt
@@ -229,46 +229,72 @@
 # GFX11: v_clz_i32_u32_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0x72,0xfe,0x7f,0xff,0x6f,0x0d,0x30]
 
 0xfa,0xc2,0x0a,0x7e,0x01,0x1b,0x00,0xff
-# GFX11: v_cos_f16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+# GFX11-REAL16: v_cos_f16_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+# GFX11-FAKE16: v_cos_f16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x1b,0x00,0xff]
 
 0xfa,0xc2,0x0a,0x7e,0x01,0xe4,0x00,0xff
-# GFX11: v_cos_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+# GFX11-REAL16: v_cos_f16_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+# GFX11-FAKE16: v_cos_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0xe4,0x00,0xff]
 
 0xfa,0xc2,0x0a,0x7e,0x01,0x40,0x01,0xff
-# GFX11: v_cos_f16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x40,0x01,0xff]
+# GFX11-REAL16: v_cos_f16_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x40,0x01,0xff]
+# GFX11-FAKE16: v_cos_f16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x40,0x01,0xff]
 
 0xfa,0xc2,0x0a,0x7e,0x01,0x41,0x01,0xff
-# GFX11: v_cos_f16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x41,0x01,0xff]
+# GFX11-REAL16: v_cos_f16_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x41,0x01,0xff]
+# GFX11-FAKE16: v_cos_f16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x41,0x01,0xff]
 
 0xfa,0xc2,0x0a,0x7e,0x01,0x01,0x01,0xff
-# GFX11: v_cos_f16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x01,0x01,0xff]
+# GFX11-REAL16: v_cos_f16_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x01,0x01,0xff]
+# GFX11-FAKE16: v_cos_f16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x01,0x01,0xff]
 
 0xfa,0xc2,0x0a,0x7e,0x01,0x0f,0x01,0xff
-# GFX11: v_cos_f16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+# GFX11-REAL16: v_cos_f16_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+# GFX11-FAKE16: v_cos_f16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x0f,0x01,0xff]
 
 0xfa,0xc2,0x0a,0x7e,0x01,0x11,0x01,0xff
-# GFX11: v_cos_f16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x11,0x01,0xff]
+# GFX11-REAL16: v_cos_f16_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x11,0x01,0xff]
+# GFX11-FAKE16: v_cos_f16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x11,0x01,0xff]
 
 0xfa,0xc2,0x0a,0x7e,0x01,0x1f,0x01,0xff
-# GFX11: v_cos_f16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+# GFX11-REAL16: v_cos_f16_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+# GFX11-FAKE16: v_cos_f16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x1f,0x01,0xff]
 
 0xfa,0xc2,0x0a,0x7e,0x01,0x21,0x01,0xff
-# GFX11: v_cos_f16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x21,0x01,0xff]
+# GFX11-REAL16: v_cos_f16_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x21,0x01,0xff]
+# GFX11-FAKE16: v_cos_f16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x21,0x01,0xff]
 
 0xfa,0xc2,0x0a,0x7e,0x01,0x2f,0x01,0xff
-# GFX11: v_cos_f16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+# GFX11-REAL16: v_cos_f16_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+# GFX11-FAKE16: v_cos_f16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x2f,0x01,0xff]
 
 0xfa,0xc2,0x0a,0x7e,0x01,0x50,0x01,0xff
-# GFX11: v_cos_f16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x50,0x01,0xff]
+# GFX11-REAL16: v_cos_f16_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x50,0x01,0xff]
+# GFX11-FAKE16: v_cos_f16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x50,0x01,0xff]
 
 0xfa,0xc2,0x0a,0x7e,0x01,0x5f,0x01,0x01
-# GFX11: v_cos_f16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+# GFX11-REAL16: v_cos_f16_dpp v5.l, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+# GFX11-FAKE16: v_cos_f16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x5f,0x01,0x01]
 
 0xfa,0xc2,0x0a,0x7e,0x01,0x60,0x01,0x13
-# GFX11: v_cos_f16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x60,0x01,0x13]
+# GFX11-REAL16: v_cos_f16_dpp v5.l, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x60,0x01,0x13]
+# GFX11-FAKE16: v_cos_f16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x60,0x01,0x13]
 
 0xfa,0xc2,0xfe,0x7e,0x7f,0x6f,0x3d,0x30
-# GFX11: v_cos_f16_dpp v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xc2,0xfe,0x7e,0x7f,0x6f,0x3d,0x30]
+# GFX11-REAL16: v_cos_f16_dpp v127.l, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xc2,0xfe,0x7e,0x7f,0x6f,0x3d,0x30]
+# GFX11-FAKE16: v_cos_f16_dpp v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xc2,0xfe,0x7e,0x7f,0x6f,0x3d,0x30]
+
+0xfa,0xc2,0xfe,0x7e,0x7f,0x5f,0x01,0x01
+# GFX11-REAL16: v_cos_f16_dpp v127.l, v127.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xc2,0xfe,0x7e,0x7f,0x5f,0x01,0x01]
+# GFX11-FAKE16: v_cos_f16_dpp v127, v127 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xc2,0xfe,0x7e,0x7f,0x5f,0x01,0x01]
+
+0xfa,0xc2,0x0a,0x7f,0x81,0x60,0x01,0x13
+# GFX11-REAL16: v_cos_f16_dpp v5.h, v1.h row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xc2,0x0a,0x7f,0x81,0x60,0x01,0x13]
+# GFX11-FAKE16: v_mul_i32_i24_e32 v128, 1, v176         ; encoding: [0x81,0x60,0x01,0x13]
+
+0xfa,0xc2,0xfe,0x7f,0xff,0x6f,0x3d,0x30
+# GFX11-REAL16: v_cos_f16_dpp v127.h, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xc2,0xfe,0x7f,0xff,0x6f,0x3d,0x30]
+# GFX11-FAKE16: v_lshlrev_b32_e32 v30, v255, v183       ; encoding: [0xff,0x6f,0x3d,0x30]
 
 0xfa,0x6c,0x0a,0x7e,0x01,0x1b,0x00,0xff
 # GFX11: v_cos_f32_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x6c,0x0a,0x7e,0x01,0x1b,0x00,0xff]
@@ -971,46 +997,72 @@
 # GFX11: v_cvt_i32_f32_dpp v255, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0x10,0xfe,0x7f,0xff,0x6f,0x3d,0x30]
 
 0xfa,0xd4,0x0a,0x7e,0x01,0x1b,0x00,0xff
-# GFX11: v_cvt_i32_i16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+# GFX11-REAL16: v_cvt_i32_i16_dpp v5, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+# GFX11-FAKE16: v_cvt_i32_i16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x1b,0x00,0xff]
 
 0xfa,0xd4,0x0a,0x7e,0x01,0xe4,0x00,0xff
-# GFX11: v_cvt_i32_i16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+# GFX11-REAL16: v_cvt_i32_i16_dpp v5, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+# GFX11-FAKE16: v_cvt_i32_i16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0xe4,0x00,0xff]
 
 0xfa,0xd4,0x0a,0x7e,0x01,0x40,0x01,0xff
-# GFX11: v_cvt_i32_i16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x40,0x01,0xff]
+# GFX11-REAL16: v_cvt_i32_i16_dpp v5, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x40,0x01,0xff]
+# GFX11-FAKE16: v_cvt_i32_i16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x40,0x01,0xff]
 
 0xfa,0xd4,0x0a,0x7e,0x01,0x41,0x01,0xff
-# GFX11: v_cvt_i32_i16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x41,0x01,0xff]
+# GFX11-REAL16: v_cvt_i32_i16_dpp v5, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x41,0x01,0xff]
+# GFX11-FAKE16: v_cvt_i32_i16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x41,0x01,0xff]
 
 0xfa,0xd4,0x0a,0x7e,0x01,0x01,0x01,0xff
-# GFX11: v_cvt_i32_i16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x01,0x01,0xff]
+# GFX11-REAL16: v_cvt_i32_i16_dpp v5, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x01,0x01,0xff]
+# GFX11-FAKE16: v_cvt_i32_i16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x01,0x01,0xff]
 
 0xfa,0xd4,0x0a,0x7e,0x01,0x0f,0x01,0xff
-# GFX11: v_cvt_i32_i16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+# GFX11-REAL16: v_cvt_i32_i16_dpp v5, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+# GFX11-FAKE16: v_cvt_i32_i16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x0f,0x01,0xff]
 
 0xfa,0xd4,0x0a,0x7e,0x01,0x11,0x01,0xff
-# GFX11: v_cvt_i32_i16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x11,0x01,0xff]
+# GFX11-REAL16: v_cvt_i32_i16_dpp v5, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x11,0x01,0xff]
+# GFX11-FAKE16: v_cvt_i32_i16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x11,0x01,0xff]
 
 0xfa,0xd4,0x0a,0x7e,0x01,0x1f,0x01,0xff
-# GFX11: v_cvt_i32_i16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+# GFX11-REAL16: v_cvt_i32_i16_dpp v5, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+# GFX11-FAKE16: v_cvt_i32_i16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x1f,0x01,0xff]
 
 0xfa,0xd4,0x0a,0x7e,0x01,0x21,0x01,0xff
-# GFX11: v_cvt_i32_i16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x21,0x01,0xff]
+# GFX11-REAL16: v_cvt_i32_i16_dpp v5, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x21,0x01,0xff]
+# GFX11-FAKE16: v_cvt_i32_i16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x21,0x01,0xff]
 
 0xfa,0xd4,0x0a,0x7e,0x01,0x2f,0x01,0xff
-# GFX11: v_cvt_i32_i16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+# GFX11-REAL16: v_cvt_i32_i16_dpp v5, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+# GFX11-FAKE16: v_cvt_i32_i16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x2f,0x01,0xff]
 
 0xfa,0xd4,0x0a,0x7e,0x01,0x50,0x01,0xff
-# GFX11: v_cvt_i32_i16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x50,0x01,0xff]
+# GFX11-REAL16: v_cvt_i32_i16_dpp v5, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x50,0x01,0xff]
+# GFX11-FAKE16: v_cvt_i32_i16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x50,0x01,0xff]
 
 0xfa,0xd4,0x0a,0x7e,0x01,0x5f,0x01,0x01
-# GFX11: v_cvt_i32_i16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+# GFX11-REAL16: v_cvt_i32_i16_dpp v5, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+# GFX11-FAKE16: v_cvt_i32_i16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x5f,0x01,0x01]
 
 0xfa,0xd4,0x0a,0x7e,0x01,0x60,0x01,0x13
-# GFX11: v_cvt_i32_i16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x60,0x01,0x13]
+# GFX11-REAL16: v_cvt_i32_i16_dpp v5, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x60,0x01,0x13]
+# GFX11-FAKE16: v_cvt_i32_i16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x60,0x01,0x13]
 
 0xfa,0xd4,0xfe,0x7f,0x7f,0x6f,0x0d,0x30
-# GFX11: v_cvt_i32_i16_dpp v255, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xd4,0xfe,0x7f,0x7f,0x6f,0x0d,0x30]
+# GFX11-REAL16: v_cvt_i32_i16_dpp v255, v127.l row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xd4,0xfe,0x7f,0x7f,0x6f,0x0d,0x30]
+# GFX11-FAKE16: v_cvt_i32_i16_dpp v255, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xd4,0xfe,0x7f,0x7f,0x6f,0x0d,0x30]
+
+0xfa,0xd4,0x0a,0x7e,0x7f,0x5f,0x01,0x01
+# GFX11-REAL16: v_cvt_i32_i16_dpp v5, v127.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xd4,0x0a,0x7e,0x7f,0x5f,0x01,0x01]
+# GFX11-FAKE16: v_cvt_i32_i16_dpp v5, v127 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xd4,0x0a,0x7e,0x7f,0x5f,0x01,0x01]
+
+0xfa,0xd4,0x0a,0x7e,0x81,0x60,0x01,0x13
+# GFX11-REAL16: v_cvt_i32_i16_dpp v5, v1.h row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xd4,0x0a,0x7e,0x81,0x60,0x01,0x13]
+# GFX11-FAKE16: v_cvt_i32_i16_dpp v5, v129/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xd4,0x0a,0x7e,0x81,0x60,0x01,0x13]
+
+0xfa,0xd4,0xfe,0x7f,0xff,0x6f,0x0d,0x30
+# GFX11-REAL16: v_cvt_i32_i16_dpp v255, v127.h row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xd4,0xfe,0x7f,0xff,0x6f,0x0d,0x30]
+# GFX11-FAKE16: v_cvt_i32_i16_dpp v255, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xd4,0xfe,0x7f,0xff,0x6f,0x0d,0x30]
 
 0xfa,0x18,0x0a,0x7e,0x01,0x1b,0x00,0xff
 # GFX11: v_cvt_nearest_i32_f32_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x18,0x0a,0x7e,0x01,0x1b,0x00,0xff]
@@ -1307,46 +1359,72 @@
 # GFX11: v_cvt_u32_f32_dpp v255, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0x0e,0xfe,0x7f,0xff,0x6f,0x3d,0x30]
 
 0xfa,0xd6,0x0a,0x7e,0x01,0x1b,0x00,0xff
-# GFX11: v_cvt_u32_u16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+# GFX11-REAL16: v_cvt_u32_u16_dpp v5, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+# GFX11-FAKE16: v_cvt_u32_u16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x1b,0x00,0xff]
 
 0xfa,0xd6,0x0a,0x7e,0x01,0xe4,0x00,0xff
-# GFX11: v_cvt_u32_u16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+# GFX11-REAL16: v_cvt_u32_u16_dpp v5, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+# GFX11-FAKE16: v_cvt_u32_u16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0xe4,0x00,0xff]
 
 0xfa,0xd6,0x0a,0x7e,0x01,0x40,0x01,0xff
-# GFX11: v_cvt_u32_u16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x40,0x01,0xff]
+# GFX11-REAL16: v_cvt_u32_u16_dpp v5, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x40,0x01,0xff]
+# GFX11-FAKE16: v_cvt_u32_u16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x40,0x01,0xff]
 
 0xfa,0xd6,0x0a,0x7e,0x01,0x41,0x01,0xff
-# GFX11: v_cvt_u32_u16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x41,0x01,0xff]
+# GFX11-REAL16: v_cvt_u32_u16_dpp v5, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x41,0x01,0xff]
+# GFX11-FAKE16: v_cvt_u32_u16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x41,0x01,0xff]
 
 0xfa,0xd6,0x0a,0x7e,0x01,0x01,0x01,0xff
-# GFX11: v_cvt_u32_u16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x01,0x01,0xff]
+# GFX11-REAL16: v_cvt_u32_u16_dpp v5, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x01,0x01,0xff]
+# GFX11-FAKE16: v_cvt_u32_u16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x01,0x01,0xff]
 
 0xfa,0xd6,0x0a,0x7e,0x01,0x0f,0x01,0xff
-# GFX11: v_cvt_u32_u16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+# GFX11-REAL16: v_cvt_u32_u16_dpp v5, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+# GFX11-FAKE16: v_cvt_u32_u16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x0f,0x01,0xff]
 
 0xfa,0xd6,0x0a,0x7e,0x01,0x11,0x01,0xff
-# GFX11: v_cvt_u32_u16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x11,0x01,0xff]
+# GFX11-REAL16: v_cvt_u32_u16_dpp v5, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x11,0x01,0xff]
+# GFX11-FAKE16: v_cvt_u32_u16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x11,0x01,0xff]
 
 0xfa,0xd6,0x0a,0x7e,0x01,0x1f,0x01,0xff
-# GFX11: v_cvt_u32_u16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+# GFX11-REAL16: v_cvt_u32_u16_dpp v5, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+# GFX11-FAKE16: v_cvt_u32_u16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x1f,0x01,0xff]
 
 0xfa,0xd6,0x0a,0x7e,0x01,0x21,0x01,0xff
-# GFX11: v_cvt_u32_u16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x21,0x01,0xff]
+# GFX11-REAL16: v_cvt_u32_u16_dpp v5, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x21,0x01,0xff]
+# GFX11-FAKE16: v_cvt_u32_u16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x21,0x01,0xff]
 
 0xfa,0xd6,0x0a,0x7e,0x01,0x2f,0x01,0xff
-# GFX11: v_cvt_u32_u16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+# GFX11-REAL16: v_cvt_u32_u16_dpp v5, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+# GFX11-FAKE16: v_cvt_u32_u16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x2f,0x01,0xff]
 
 0xfa,0xd6,0x0a,0x7e,0x01,0x50,0x01,0xff
-# GFX11: v_cvt_u32_u16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x50,0x01,0xff]
+# GFX11-REAL16: v_cvt_u32_u16_dpp v5, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x50,0x01,0xff]
+# GFX11-FAKE16: v_cvt_u32_u16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x50,0x01,0xff]
 
 0xfa,0xd6,0x0a,0x7e,0x01,0x5f,0x01,0x01
-# GFX11: v_cvt_u32_u16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+# GFX11-REAL16: v_cvt_u32_u16_dpp v5, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+# GFX11-FAKE16: v_cvt_u32_u16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x5f,0x01,0x01]
 
 0xfa,0xd6,0x0a,0x7e,0x01,0x60,0x01,0x13
-# GFX11: v_cvt_u32_u16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x60,0x01,0x13]
+# GFX11-REAL16: v_cvt_u32_u16_dpp v5, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x60,0x01,0x13]
+# GFX11-FAKE16: v_cvt_u32_u16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x60,0x01,0x13]
 
 0xfa,0xd6,0xfe,0x7f,0x7f,0x6f,0x0d,0x30
-# GFX11: v_cvt_u32_u16_dpp v255, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xd6,0xfe,0x7f,0x7f,0x6f,0x0d,0x30]
+# GFX11-REAL16: v_cvt_u32_u16_dpp v255, v127.l row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xd6,0xfe,0x7f,0x7f,0x6f,0x0d,0x30]
+# GFX11-FAKE16: v_cvt_u32_u16_dpp v255, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xd6,0xfe,0x7f,0x7f,0x6f,0x0d,0x30]
+
+0xfa,0xd6,0x0a,0x7e,0x7f,0x5f,0x01,0x01
+# GFX11-REAL16: v_cvt_u32_u16_dpp v5, v127.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xd6,0x0a,0x7e,0x7f,0x5f,0x01,0x01]
+# GFX11-FAKE16: v_cvt_u32_u16_dpp v5, v127 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xd6,0x0a,0x7e,0x7f,0x5f,0x01,0x01]
+
+0xfa,0xd6,0x0a,0x7e,0x81,0x60,0x01,0x13
+# GFX11-REAL16: v_cvt_u32_u16_dpp v5, v1.h row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xd6,0x0a,0x7e,0x81,0x60,0x01,0x13]
+# GFX11-FAKE16: v_cvt_u32_u16_dpp v5, v129/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xd6,0x0a,0x7e,0x81,0x60,0x01,0x13]
+
+0xfa,0xd6,0xfe,0x7f,0xff,0x6f,0x0d,0x30
+# GFX11-REAL16: v_cvt_u32_u16_dpp v255, v127.h row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xd6,0xfe,0x7f,0xff,0x6f,0x0d,0x30]
+# GFX11-FAKE16: v_cvt_u32_u16_dpp v255, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xd6,0xfe,0x7f,0xff,0x6f,0x0d,0x30]
 
 0xfa,0xb0,0x0a,0x7e,0x01,0x1b,0x00,0xff
 # GFX11-REAL16: v_exp_f16_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb0,0x0a,0x7e,0x01,0x1b,0x00,0xff]
@@ -1545,46 +1623,72 @@
 # GFX11: v_floor_f32_dpp v255, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0x48,0xfe,0x7f,0xff,0x6f,0x3d,0x30]
 
 0xfa,0xbe,0x0a,0x7e,0x01,0x1b,0x00,0xff
-# GFX11: v_fract_f16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+# GFX11-REAL16: v_fract_f16_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+# GFX11-FAKE16: v_fract_f16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x1b,0x00,0xff]
 
 0xfa,0xbe,0x0a,0x7e,0x01,0xe4,0x00,0xff
-# GFX11: v_fract_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+# GFX11-REAL16: v_fract_f16_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+# GFX11-FAKE16: v_fract_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0xe4,0x00,0xff]
 
 0xfa,0xbe,0x0a,0x7e,0x01,0x40,0x01,0xff
-# GFX11: v_fract_f16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x40,0x01,0xff]
+# GFX11-REAL16: v_fract_f16_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x40,0x01,0xff]
+# GFX11-FAKE16: v_fract_f16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x40,0x01,0xff]
 
 0xfa,0xbe,0x0a,0x7e,0x01,0x41,0x01,0xff
-# GFX11: v_fract_f16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x41,0x01,0xff]
+# GFX11-REAL16: v_fract_f16_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x41,0x01,0xff]
+# GFX11-FAKE16: v_fract_f16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x41,0x01,0xff]
 
 0xfa,0xbe,0x0a,0x7e,0x01,0x01,0x01,0xff
-# GFX11: v_fract_f16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x01,0x01,0xff]
+# GFX11-REAL16: v_fract_f16_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x01,0x01,0xff]
+# GFX11-FAKE16: v_fract_f16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x01,0x01,0xff]
 
 0xfa,0xbe,0x0a,0x7e,0x01,0x0f,0x01,0xff
-# GFX11: v_fract_f16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+# GFX11-REAL16: v_fract_f16_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+# GFX11-FAKE16: v_fract_f16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x0f,0x01,0xff]
 
 0xfa,0xbe,0x0a,0x7e,0x01,0x11,0x01,0xff
-# GFX11: v_fract_f16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x11,0x01,0xff]
+# GFX11-REAL16: v_fract_f16_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x11,0x01,0xff]
+# GFX11-FAKE16: v_fract_f16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x11,0x01,0xff]
 
 0xfa,0xbe,0x0a,0x7e,0x01,0x1f,0x01,0xff
-# GFX11: v_fract_f16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+# GFX11-REAL16: v_fract_f16_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+# GFX11-FAKE16: v_fract_f16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x1f,0x01,0xff]
 
 0xfa,0xbe,0x0a,0x7e,0x01,0x21,0x01,0xff
-# GFX11: v_fract_f16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x21,0x01,0xff]
+# GFX11-REAL16: v_fract_f16_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x21,0x01,0xff]
+# GFX11-FAKE16: v_fract_f16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x21,0x01,0xff]
 
 0xfa,0xbe,0x0a,0x7e,0x01,0x2f,0x01,0xff
-# GFX11: v_fract_f16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+# GFX11-REAL16: v_fract_f16_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+# GFX11-FAKE16: v_fract_f16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x2f,0x01,0xff]
 
 0xfa,0xbe,0x0a,0x7e,0x01,0x50,0x01,0xff
-# GFX11: v_fract_f16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x50,0x01,0xff]
+# GFX11-REAL16: v_fract_f16_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x50,0x01,0xff]
+# GFX11-FAKE16: v_fract_f16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x50,0x01,0xff]
 
 0xfa,0xbe,0x0a,0x7e,0x01,0x5f,0x01,0x01
-# GFX11: v_fract_f16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+# GFX11-REAL16: v_fract_f16_dpp v5.l, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+# GFX11-FAKE16: v_fract_f16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x5f,0x01,0x01]
 
 0xfa,0xbe,0x0a,0x7e,0x01,0x60,0x01,0x13
-# GFX11: v_fract_f16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x60,0x01,0x13]
+# GFX11-REAL16: v_fract_f16_dpp v5.l, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x60,0x01,0x13]
+# GFX11-FAKE16: v_fract_f16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x60,0x01,0x13]
 
 0xfa,0xbe,0xfe,0x7e,0x7f,0x6f,0x3d,0x30
-# GFX11: v_fract_f16_dpp v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xbe,0xfe,0x7e,0x7f,0x6f,0x3d,0x30]
+# GFX11-REAL16: v_fract_f16_dpp v127.l, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xbe,0xfe,0x7e,0x7f,0x6f,0x3d,0x30]
+# GFX11-FAKE16: v_fract_f16_dpp v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xbe,0xfe,0x7e,0x7f,0x6f,0x3d,0x30]
+
+0xfa,0xbe,0xfe,0x7e,0x7f,0x5f,0x01,0x01
+# GFX11-REAL16: v_fract_f16_dpp v127.l, v127.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xbe,0xfe,0x7e,0x7f,0x5f,0x01,0x01]
+# GFX11-FAKE16: v_fract_f16_dpp v127, v127 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xbe,0xfe,0x7e,0x7f,0x5f,0x01,0x01]
+
+0xfa,0xbe,0x0a,0x7f,0x81,0x60,0x01,0x13
+# GFX11-REAL16: v_fract_f16_dpp v5.h, v1.h row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xbe,0x0a,0x7f,0x81,0x60,0x01,0x13]
+# GFX11-FAKE16: v_mul_i32_i24_e32 v128, 1, v176         ; encoding: [0x81,0x60,0x01,0x13]
+
+0xfa,0xbe,0xfe,0x7f,0xff,0x6f,0x3d,0x30
+# GFX11-REAL16: v_fract_f16_dpp v127.h, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xbe,0xfe,0x7f,0xff,0x6f,0x3d,0x30]
+# GFX11-FAKE16: v_lshlrev_b32_e32 v30, v255, v183       ; encoding: [0xff,0x6f,0x3d,0x30]
 
 0xfa,0x40,0x0a,0x7e,0x01,0x1b,0x00,0xff
 # GFX11: v_fract_f32_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x40,0x0a,0x7e,0x01,0x1b,0x00,0xff]
@@ -1727,46 +1831,72 @@
 # GFX11: v_frexp_exp_i32_f32_dpp v255, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0x7e,0xfe,0x7f,0xff,0x6f,0x3d,0x30]
 
 0xfa,0xb2,0x0a,0x7e,0x01,0x1b,0x00,0xff
-# GFX11: v_frexp_mant_f16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+# GFX11-REAL16: v_frexp_mant_f16_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+# GFX11-FAKE16: v_frexp_mant_f16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x1b,0x00,0xff]
 
 0xfa,0xb2,0x0a,0x7e,0x01,0xe4,0x00,0xff
-# GFX11: v_frexp_mant_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+# GFX11-REAL16: v_frexp_mant_f16_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+# GFX11-FAKE16: v_frexp_mant_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0xe4,0x00,0xff]
 
 0xfa,0xb2,0x0a,0x7e,0x01,0x40,0x01,0xff
-# GFX11: v_frexp_mant_f16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x40,0x01,0xff]
+# GFX11-REAL16: v_frexp_mant_f16_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x40,0x01,0xff]
+# GFX11-FAKE16: v_frexp_mant_f16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x40,0x01,0xff]
 
 0xfa,0xb2,0x0a,0x7e,0x01,0x41,0x01,0xff
-# GFX11: v_frexp_mant_f16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x41,0x01,0xff]
+# GFX11-REAL16: v_frexp_mant_f16_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x41,0x01,0xff]
+# GFX11-FAKE16: v_frexp_mant_f16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x41,0x01,0xff]
 
 0xfa,0xb2,0x0a,0x7e,0x01,0x01,0x01,0xff
-# GFX11: v_frexp_mant_f16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x01,0x01,0xff]
+# GFX11-REAL16: v_frexp_mant_f16_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x01,0x01,0xff]
+# GFX11-FAKE16: v_frexp_mant_f16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x01,0x01,0xff]
 
 0xfa,0xb2,0x0a,0x7e,0x01,0x0f,0x01,0xff
-# GFX11: v_frexp_mant_f16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+# GFX11-REAL16: v_frexp_mant_f16_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+# GFX11-FAKE16: v_frexp_mant_f16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x0f,0x01,0xff]
 
 0xfa,0xb2,0x0a,0x7e,0x01,0x11,0x01,0xff
-# GFX11: v_frexp_mant_f16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x11,0x01,0xff]
+# GFX11-REAL16: v_frexp_mant_f16_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x11,0x01,0xff]
+# GFX11-FAKE16: v_frexp_mant_f16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x11,0x01,0xff]
 
 0xfa,0xb2,0x0a,0x7e,0x01,0x1f,0x01,0xff
-# GFX11: v_frexp_mant_f16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+# GFX11-REAL16: v_frexp_mant_f16_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+# GFX11-FAKE16: v_frexp_mant_f16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x1f,0x01,0xff]
 
 0xfa,0xb2,0x0a,0x7e,0x01,0x21,0x01,0xff
-# GFX11: v_frexp_mant_f16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x21,0x01,0xff]
+# GFX11-REAL16: v_frexp_mant_f16_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x21,0x01,0xff]
+# GFX11-FAKE16: v_frexp_mant_f16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x21,0x01,0xff]
 
 0xfa,0xb2,0x0a,0x7e,0x01,0x2f,0x01,0xff
-# GFX11: v_frexp_mant_f16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+# GFX11-REAL16: v_frexp_mant_f16_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+# GFX11-FAKE16: v_frexp_mant_f16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x2f,0x01,0xff]
 
 0xfa,0xb2,0x0a,0x7e,0x01,0x50,0x01,0xff
-# GFX11: v_frexp_mant_f16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x50,0x01,0xff]
+# GFX11-REAL16: v_frexp_mant_f16_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x50,0x01,0xff]
+# GFX11-FAKE16: v_frexp_mant_f16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x50,0x01,0xff]
 
 0xfa,0xb2,0x0a,0x7e,0x01,0x5f,0x01,0x01
-# GFX11: v_frexp_mant_f16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+# GFX11-REAL16: v_frexp_mant_f16_dpp v5.l, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+# GFX11-FAKE16: v_frexp_mant_f16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x5f,0x01,0x01]
 
 0xfa,0xb2,0x0a,0x7e,0x01,0x60,0x01,0x13
-# GFX11: v_frexp_mant_f16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x60,0x01,0x13]
+# GFX11-REAL16: v_frexp_mant_f16_dpp v5.l, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x60,0x01,0x13]
+# GFX11-FAKE16: v_frexp_mant_f16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x60,0x01,0x13]
 
 0xfa,0xb2,0xfe,0x7e,0x7f,0x6f,0x3d,0x30
-# GFX11: v_frexp_mant_f16_dpp v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xb2,0xfe,0x7e,0x7f,0x6f,0x3d,0x30]
+# GFX11-REAL16: v_frexp_mant_f16_dpp v127.l, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xb2,0xfe,0x7e,0x7f,0x6f,0x3d,0x30]
+# GFX11-FAKE16: v_frexp_mant_f16_dpp v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xb2,0xfe,0x7e,0x7f,0x6f,0x3d,0x30]
+
+0xfa,0xb2,0xfe,0x7e,0x7f,0x5f,0x01,0x01
+# GFX11-REAL16: v_frexp_mant_f16_dpp v127.l, v127.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xb2,0xfe,0x7e,0x7f,0x5f,0x01,0x01]
+# GFX11-FAKE16: v_frexp_mant_f16_dpp v127, v127 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xb2,0xfe,0x7e,0x7f,0x5f,0x01,0x01]
+
+0xfa,0xb2,0x0a,0x7f,0x81,0x60,0x01,0x13
+# GFX11-REAL16: v_frexp_mant_f16_dpp v5.h, v1.h row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xb2,0x0a,0x7f,0x81,0x60,0x01,0x13]
+# GFX11-FAKE16: v_mul_i32_i24_e32 v128, 1, v176         ; encoding: [0x81,0x60,0x01,0x13]
+
+0xfa,0xb2,0xfe,0x7f,0xff,0x6f,0x3d,0x30
+# GFX11-REAL16: v_frexp_mant_f16_dpp v127.h, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xb2,0xfe,0x7f,0xff,0x6f,0x3d,0x30]
+# GFX11-FAKE16: v_lshlrev_b32_e32 v30, v255, v183       ; encoding: [0xff,0x6f,0x3d,0x30]
 
 0xfa,0x80,0x0a,0x7e,0x01,0x1b,0x00,0xff
 # GFX11: v_frexp_mant_f32_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x80,0x0a,0x7e,0x01,0x1b,0x00,0xff]
@@ -2119,46 +2249,72 @@
 # GFX11: v_movrelsd_b32_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0x88,0xfe,0x7f,0xff,0x6f,0x0d,0x30]
 
 0xfa,0xd2,0x0a,0x7e,0x01,0x1b,0x00,0xff
-# GFX11: v_not_b16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+# GFX11-REAL16: v_not_b16_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+# GFX11-FAKE16: v_not_b16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x1b,0x00,0xff]
 
 0xfa,0xd2,0x0a,0x7e,0x01,0xe4,0x00,0xff
-# GFX11: v_not_b16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+# GFX11-REAL16: v_not_b16_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+# GFX11-FAKE16: v_not_b16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0xe4,0x00,0xff]
 
 0xfa,0xd2,0x0a,0x7e,0x01,0x40,0x01,0xff
-# GFX11: v_not_b16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x40,0x01,0xff]
+# GFX11-REAL16: v_not_b16_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x40,0x01,0xff]
+# GFX11-FAKE16: v_not_b16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x40,0x01,0xff]
 
 0xfa,0xd2,0x0a,0x7e,0x01,0x41,0x01,0xff
-# GFX11: v_not_b16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x41,0x01,0xff]
+# GFX11-REAL16: v_not_b16_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x41,0x01,0xff]
+# GFX11-FAKE16: v_not_b16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x41,0x01,0xff]
 
 0xfa,0xd2,0x0a,0x7e,0x01,0x01,0x01,0xff
-# GFX11: v_not_b16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x01,0x01,0xff]
+# GFX11-REAL16: v_not_b16_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x01,0x01,0xff]
+# GFX11-FAKE16: v_not_b16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x01,0x01,0xff]
 
 0xfa,0xd2,0x0a,0x7e,0x01,0x0f,0x01,0xff
-# GFX11: v_not_b16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+# GFX11-REAL16: v_not_b16_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+# GFX11-FAKE16: v_not_b16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x0f,0x01,0xff]
 
 0xfa,0xd2,0x0a,0x7e,0x01,0x11,0x01,0xff
-# GFX11: v_not_b16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x11,0x01,0xff]
+# GFX11-REAL16: v_not_b16_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x11,0x01,0xff]
+# GFX11-FAKE16: v_not_b16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x11,0x01,0xff]
 
 0xfa,0xd2,0x0a,0x7e,0x01,0x1f,0x01,0xff
-# GFX11: v_not_b16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+# GFX11-REAL16: v_not_b16_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+# GFX11-FAKE16: v_not_b16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x1f,0x01,0xff]
 
 0xfa,0xd2,0x0a,0x7e,0x01,0x21,0x01,0xff
-# GFX11: v_not_b16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x21,0x01,0xff]
+# GFX11-REAL16: v_not_b16_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x21,0x01,0xff]
+# GFX11-FAKE16: v_not_b16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x21,0x01,0xff]
 
 0xfa,0xd2,0x0a,0x7e,0x01,0x2f,0x01,0xff
-# GFX11: v_not_b16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+# GFX11-REAL16: v_not_b16_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+# GFX11-FAKE16: v_not_b16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x2f,0x01,0xff]
 
 0xfa,0xd2,0x0a,0x7e,0x01,0x50,0x01,0xff
-# GFX11: v_not_b16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x50,0x01,0xff]
+# GFX11-REAL16: v_not_b16_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x50,0x01,0xff]
+# GFX11-FAKE16: v_not_b16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x50,0x01,0xff]
 
 0xfa,0xd2,0x0a,0x7e,0x01,0x5f,0x01,0x01
-# GFX11: v_not_b16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+# GFX11-REAL16: v_not_b16_dpp v5.l, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+# GFX11-FAKE16: v_not_b16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x5f,0x01,0x01]
 
 0xfa,0xd2,0x0a,0x7e,0x01,0x60,0x01,0x13
-# GFX11: v_not_b16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x60,0x01,0x13]
+# GFX11-REAL16: v_not_b16_dpp v5.l, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x60,0x01,0x13]
+# GFX11-FAKE16: v_not_b16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x60,0x01,0x13]
 
 0xfa,0xd2,0xfe,0x7e,0x7f,0x6f,0x0d,0x30
-# GFX11: v_not_b16_dpp v127, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xd2,0xfe,0x7e,0x7f,0x6f,0x0d,0x30]
+# GFX11-REAL16: v_not_b16_dpp v127.l, v127.l row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xd2,0xfe,0x7e,0x7f,0x6f,0x0d,0x30]
+# GFX11-FAKE16: v_not_b16_dpp v127, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xd2,0xfe,0x7e,0x7f,0x6f,0x0d,0x30]
+
+0xfa,0xd2,0xfe,0x7e,0x7f,0x5f,0x01,0x01
+# GFX11-REAL16: v_not_b16_dpp v127.l, v127.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xd2,0xfe,0x7e,0x7f,0x5f,0x01,0x01]
+# GFX11-FAKE16: v_not_b16_dpp v127, v127 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xd2,0xfe,0x7e,0x7f,0x5f,0x01,0x01]
+
+0xfa,0xd2,0x0a,0x7f,0x81,0x60,0x01,0x13
+# GFX11-REAL16: v_not_b16_dpp v5.h, v1.h row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xd2,0x0a,0x7f,0x81,0x60,0x01,0x13]
+# GFX11-FAKE16: v_mul_i32_i24_e32 v128, 1, v176         ; encoding: [0x81,0x60,0x01,0x13]
+
+0xfa,0xd2,0xfe,0x7f,0xff,0x6f,0x0d,0x30
+# GFX11-REAL16: v_not_b16_dpp v127.h, v127.h row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xd2,0xfe,0x7f,0xff,0x6f,0x0d,0x30]
+# GFX11-FAKE16: v_lshlrev_b32_e32 v6, v255, v183        ; encoding: [0xff,0x6f,0x0d,0x30]
 
 0xfa,0x6e,0x0a,0x7e,0x01,0x1b,0x00,0xff
 # GFX11: v_not_b32_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x6e,0x0a,0x7e,0x01,0x1b,0x00,0xff]
@@ -2343,46 +2499,72 @@
 # GFX11: v_rcp_iflag_f32_dpp v255, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0x56,0xfe,0x7f,0xff,0x6f,0x3d,0x30]
 
 0xfa,0xbc,0x0a,0x7e,0x01,0x1b,0x00,0xff
-# GFX11: v_rndne_f16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+# GFX11-REAL16: v_rndne_f16_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+# GFX11-FAKE16: v_rndne_f16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x1b,0x00,0xff]
 
 0xfa,0xbc,0x0a,0x7e,0x01,0xe4,0x00,0xff
-# GFX11: v_rndne_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+# GFX11-REAL16: v_rndne_f16_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+# GFX11-FAKE16: v_rndne_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0xe4,0x00,0xff]
 
 0xfa,0xbc,0x0a,0x7e,0x01,0x40,0x01,0xff
-# GFX11: v_rndne_f16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x40,0x01,0xff]
+# GFX11-REAL16: v_rndne_f16_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x40,0x01,0xff]
+# GFX11-FAKE16: v_rndne_f16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x40,0x01,0xff]
 
 0xfa,0xbc,0x0a,0x7e,0x01,0x41,0x01,0xff
-# GFX11: v_rndne_f16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x41,0x01,0xff]
+# GFX11-REAL16: v_rndne_f16_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x41,0x01,0xff]
+# GFX11-FAKE16: v_rndne_f16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x41,0x01,0xff]
 
 0xfa,0xbc,0x0a,0x7e,0x01,0x01,0x01,0xff
-# GFX11: v_rndne_f16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x01,0x01,0xff]
+# GFX11-REAL16: v_rndne_f16_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x01,0x01,0xff]
+# GFX11-FAKE16: v_rndne_f16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x01,0x01,0xff]
 
 0xfa,0xbc,0x0a,0x7e,0x01,0x0f,0x01,0xff
-# GFX11: v_rndne_f16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+# GFX11-REAL16: v_rndne_f16_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+# GFX11-FAKE16: v_rndne_f16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x0f,0x01,0xff]
 
 0xfa,0xbc,0x0a,0x7e,0x01,0x11,0x01,0xff
-# GFX11: v_rndne_f16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x11,0x01,0xff]
+# GFX11-REAL16: v_rndne_f16_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x11,0x01,0xff]
+# GFX11-FAKE16: v_rndne_f16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x11,0x01,0xff]
 
 0xfa,0xbc,0x0a,0x7e,0x01,0x1f,0x01,0xff
-# GFX11: v_rndne_f16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+# GFX11-REAL16: v_rndne_f16_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+# GFX11-FAKE16: v_rndne_f16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x1f,0x01,0xff]
 
 0xfa,0xbc,0x0a,0x7e,0x01,0x21,0x01,0xff
-# GFX11: v_rndne_f16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x21,0x01,0xff]
+# GFX11-REAL16: v_rndne_f16_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x21,0x01,0xff]
+# GFX11-FAKE16: v_rndne_f16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x21,0x01,0xff]
 
 0xfa,0xbc,0x0a,0x7e,0x01,0x2f,0x01,0xff
-# GFX11: v_rndne_f16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+# GFX11-REAL16: v_rndne_f16_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+# GFX11-FAKE16: v_rndne_f16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x2f,0x01,0xff]
 
 0xfa,0xbc,0x0a,0x7e,0x01,0x50,0x01,0xff
-# GFX11: v_rndne_f16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x50,0x01,0xff]
+# GFX11-REAL16: v_rndne_f16_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x50,0x01,0xff]
+# GFX11-FAKE16: v_rndne_f16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x50,0x01,0xff]
 
 0xfa,0xbc,0x0a,0x7e,0x01,0x5f,0x01,0x01
-# GFX11: v_rndne_f16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+# GFX11-REAL16: v_rndne_f16_dpp v5.l, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+# GFX11-FAKE16: v_rndne_f16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x5f,0x01,0x01]
 
 0xfa,0xbc,0x0a,0x7e,0x01,0x60,0x01,0x13
-# GFX11: v_rndne_f16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x60,0x01,0x13]
+# GFX11-REAL16: v_rndne_f16_dpp v5.l, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x60,0x01,0x13]
+# GFX11-FAKE16: v_rndne_f16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x60,0x01,0x13]
 
 0xfa,0xbc,0xfe,0x7e,0x7f,0x6f,0x3d,0x30
-# GFX11: v_rndne_f16_dpp v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xbc,0xfe,0x7e,0x7f,0x6f,0x3d,0x30]
+# GFX11-REAL16: v_rndne_f16_dpp v127.l, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xbc,0xfe,0x7e,0x7f,0x6f,0x3d,0x30]
+# GFX11-FAKE16: v_rndne_f16_dpp v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xbc,0xfe,0x7e,0x7f,0x6f,0x3d,0x30]
+
+0xfa,0xbc,0xfe,0x7e,0x7f,0x5f,0x01,0x01
+# GFX11-REAL16: v_rndne_f16_dpp v127.l, v127.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xbc,0xfe,0x7e,0x7f,0x5f,0x01,0x01]
+# GFX11-FAKE16: v_rndne_f16_dpp v127, v127 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xbc,0xfe,0x7e,0x7f,0x5f,0x01,0x01]
+
+0xfa,0xbc,0x0a,0x7f,0x81,0x60,0x01,0x13
+# GFX11-REAL16: v_rndne_f16_dpp v5.h, v1.h row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xbc,0x0a,0x7f,0x81,0x60,0x01,0x13]
+# GFX11-FAKE16: v_mul_i32_i24_e32 v128, 1, v176         ; encoding: [0x81,0x60,0x01,0x13]
+
+0xfa,0xbc,0xfe,0x7f,0xff,0x6f,0x3d,0x30
+# GFX11-REAL16: v_rndne_f16_dpp v127.h, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xbc,0xfe,0x7f,0xff,0x6f,0x3d,0x30]
+# GFX11-FAKE16: v_lshlrev_b32_e32 v30, v255, v183       ; encoding: [0xff,0x6f,0x3d,0x30]
 
 0xfa,0x46,0x0a,0x7e,0x01,0x1b,0x00,0xff
 # GFX11: v_rndne_f32_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x46,0x0a,0x7e,0x01,0x1b,0x00,0xff]
@@ -2525,88 +2707,140 @@
 # GFX11: v_rsq_f32_dpp v255, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0x5c,0xfe,0x7f,0xff,0x6f,0x3d,0x30]
 
 0xfa,0xc4,0x0a,0x7e,0x01,0x1b,0x00,0xff
-# GFX11: v_sat_pk_u8_i16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+# GFX11-REAL16: v_sat_pk_u8_i16_dpp v5.l, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+# GFX11-FAKE16: v_sat_pk_u8_i16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x1b,0x00,0xff]
 
 0xfa,0xc4,0x0a,0x7e,0x01,0xe4,0x00,0xff
-# GFX11: v_sat_pk_u8_i16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+# GFX11-REAL16: v_sat_pk_u8_i16_dpp v5.l, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+# GFX11-FAKE16: v_sat_pk_u8_i16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0xe4,0x00,0xff]
 
 0xfa,0xc4,0x0a,0x7e,0x01,0x40,0x01,0xff
-# GFX11: v_sat_pk_u8_i16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x40,0x01,0xff]
+# GFX11-REAL16: v_sat_pk_u8_i16_dpp v5.l, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x40,0x01,0xff]
+# GFX11-FAKE16: v_sat_pk_u8_i16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x40,0x01,0xff]
 
 0xfa,0xc4,0x0a,0x7e,0x01,0x41,0x01,0xff
-# GFX11: v_sat_pk_u8_i16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x41,0x01,0xff]
+# GFX11-REAL16: v_sat_pk_u8_i16_dpp v5.l, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x41,0x01,0xff]
+# GFX11-FAKE16: v_sat_pk_u8_i16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x41,0x01,0xff]
 
 0xfa,0xc4,0x0a,0x7e,0x01,0x01,0x01,0xff
-# GFX11: v_sat_pk_u8_i16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x01,0x01,0xff]
+# GFX11-REAL16: v_sat_pk_u8_i16_dpp v5.l, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x01,0x01,0xff]
+# GFX11-FAKE16: v_sat_pk_u8_i16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x01,0x01,0xff]
 
 0xfa,0xc4,0x0a,0x7e,0x01,0x0f,0x01,0xff
-# GFX11: v_sat_pk_u8_i16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+# GFX11-REAL16: v_sat_pk_u8_i16_dpp v5.l, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+# GFX11-FAKE16: v_sat_pk_u8_i16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x0f,0x01,0xff]
 
 0xfa,0xc4,0x0a,0x7e,0x01,0x11,0x01,0xff
-# GFX11: v_sat_pk_u8_i16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x11,0x01,0xff]
+# GFX11-REAL16: v_sat_pk_u8_i16_dpp v5.l, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x11,0x01,0xff]
+# GFX11-FAKE16: v_sat_pk_u8_i16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x11,0x01,0xff]
 
 0xfa,0xc4,0x0a,0x7e,0x01,0x1f,0x01,0xff
-# GFX11: v_sat_pk_u8_i16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+# GFX11-REAL16: v_sat_pk_u8_i16_dpp v5.l, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+# GFX11-FAKE16: v_sat_pk_u8_i16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x1f,0x01,0xff]
 
 0xfa,0xc4,0x0a,0x7e,0x01,0x21,0x01,0xff
-# GFX11: v_sat_pk_u8_i16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x21,0x01,0xff]
+# GFX11-REAL16: v_sat_pk_u8_i16_dpp v5.l, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x21,0x01,0xff]
+# GFX11-FAKE16: v_sat_pk_u8_i16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x21,0x01,0xff]
 
 0xfa,0xc4,0x0a,0x7e,0x01,0x2f,0x01,0xff
-# GFX11: v_sat_pk_u8_i16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+# GFX11-REAL16: v_sat_pk_u8_i16_dpp v5.l, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+# GFX11-FAKE16: v_sat_pk_u8_i16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x2f,0x01,0xff]
 
 0xfa,0xc4,0x0a,0x7e,0x01,0x50,0x01,0xff
-# GFX11: v_sat_pk_u8_i16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x50,0x01,0xff]
+# GFX11-REAL16: v_sat_pk_u8_i16_dpp v5.l, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x50,0x01,0xff]
+# GFX11-FAKE16: v_sat_pk_u8_i16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x50,0x01,0xff]
 
 0xfa,0xc4,0x0a,0x7e,0x01,0x5f,0x01,0x01
-# GFX11: v_sat_pk_u8_i16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+# GFX11-REAL16: v_sat_pk_u8_i16_dpp v5.l, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+# GFX11-FAKE16: v_sat_pk_u8_i16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x5f,0x01,0x01]
 
 0xfa,0xc4,0x0a,0x7e,0x01,0x60,0x01,0x13
-# GFX11: v_sat_pk_u8_i16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x60,0x01,0x13]
+# GFX11-REAL16: v_sat_pk_u8_i16_dpp v5.l, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x60,0x01,0x13]
+# GFX11-FAKE16: v_sat_pk_u8_i16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x60,0x01,0x13]
 
 0xfa,0xc4,0xfe,0x7e,0xff,0x6f,0x0d,0x30
-# GFX11: v_sat_pk_u8_i16_dpp v127, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xc4,0xfe,0x7e,0xff,0x6f,0x0d,0x30]
+# GFX11-REAL16: v_sat_pk_u8_i16_dpp v127.l, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xc4,0xfe,0x7e,0xff,0x6f,0x0d,0x30]
+# GFX11-FAKE16: v_sat_pk_u8_i16_dpp v127, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xc4,0xfe,0x7e,0xff,0x6f,0x0d,0x30]
+
+0xfa,0xc4,0xfe,0x7e,0x01,0x5f,0x01,0x01
+# GFX11-REAL16: v_sat_pk_u8_i16_dpp v127.l, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xc4,0xfe,0x7e,0x01,0x5f,0x01,0x01]
+# GFX11-FAKE16: v_sat_pk_u8_i16_dpp v127, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xc4,0xfe,0x7e,0x01,0x5f,0x01,0x01]
+
+0xfa,0xc4,0x0a,0x7f,0x01,0x60,0x01,0x13
+# GFX11-REAL16: v_sat_pk_u8_i16_dpp v5.h, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xc4,0x0a,0x7f,0x01,0x60,0x01,0x13]
+# GFX11-FAKE16: v_mul_i32_i24_e32 v128, s1, v176        ; encoding: [0x01,0x60,0x01,0x13]
+
+0xfa,0xc4,0xfe,0x7f,0xff,0x6f,0x0d,0x30
+# GFX11-REAL16: v_sat_pk_u8_i16_dpp v127.h, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xc4,0xfe,0x7f,0xff,0x6f,0x0d,0x30]
+# GFX11-FAKE16: v_lshlrev_b32_e32 v6, v255, v183        ; encoding: [0xff,0x6f,0x0d,0x30]
 
 0xfa,0xc0,0x0a,0x7e,0x01,0x1b,0x00,0xff
-# GFX11: v_sin_f16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+# GFX11-REAL16: v_sin_f16_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+# GFX11-FAKE16: v_sin_f16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x1b,0x00,0xff]
 
 0xfa,0xc0,0x0a,0x7e,0x01,0xe4,0x00,0xff
-# GFX11: v_sin_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+# GFX11-REAL16: v_sin_f16_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+# GFX11-FAKE16: v_sin_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0xe4,0x00,0xff]
 
 0xfa,0xc0,0x0a,0x7e,0x01,0x40,0x01,0xff
-# GFX11: v_sin_f16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x40,0x01,0xff]
+# GFX11-REAL16: v_sin_f16_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x40,0x01,0xff]
+# GFX11-FAKE16: v_sin_f16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x40,0x01,0xff]
 
 0xfa,0xc0,0x0a,0x7e,0x01,0x41,0x01,0xff
-# GFX11: v_sin_f16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x41,0x01,0xff]
+# GFX11-REAL16: v_sin_f16_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x41,0x01,0xff]
+# GFX11-FAKE16: v_sin_f16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x41,0x01,0xff]
 
 0xfa,0xc0,0x0a,0x7e,0x01,0x01,0x01,0xff
-# GFX11: v_sin_f16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x01,0x01,0xff]
+# GFX11-REAL16: v_sin_f16_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x01,0x01,0xff]
+# GFX11-FAKE16: v_sin_f16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x01,0x01,0xff]
 
 0xfa,0xc0,0x0a,0x7e,0x01,0x0f,0x01,0xff
-# GFX11: v_sin_f16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+# GFX11-REAL16: v_sin_f16_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+# GFX11-FAKE16: v_sin_f16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x0f,0x01,0xff]
 
 0xfa,0xc0,0x0a,0x7e,0x01,0x11,0x01,0xff
-# GFX11: v_sin_f16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x11,0x01,0xff]
+# GFX11-REAL16: v_sin_f16_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x11,0x01,0xff]
+# GFX11-FAKE16: v_sin_f16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x11,0x01,0xff]
 
 0xfa,0xc0,0x0a,0x7e,0x01,0x1f,0x01,0xff
-# GFX11: v_sin_f16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+# GFX11-REAL16: v_sin_f16_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+# GFX11-FAKE16: v_sin_f16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x1f,0x01,0xff]
 
 0xfa,0xc0,0x0a,0x7e,0x01,0x21,0x01,0xff
-# GFX11: v_sin_f16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x21,0x01,0xff]
+# GFX11-REAL16: v_sin_f16_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x21,0x01,0xff]
+# GFX11-FAKE16: v_sin_f16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x21,0x01,0xff]
 
 0xfa,0xc0,0x0a,0x7e,0x01,0x2f,0x01,0xff
-# GFX11: v_sin_f16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+# GFX11-REAL16: v_sin_f16_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+# GFX11-FAKE16: v_sin_f16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x2f,0x01,0xff]
 
 0xfa,0xc0,0x0a,0x7e,0x01,0x50,0x01,0xff
-# GFX11: v_sin_f16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x50,0x01,0xff]
+# GFX11-REAL16: v_sin_f16_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x50,0x01,0xff]
+# GFX11-FAKE16: v_sin_f16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x50,0x01,0xff]
 
 0xfa,0xc0,0x0a,0x7e,0x01,0x5f,0x01,0x01
-# GFX11: v_sin_f16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+# GFX11-REAL16: v_sin_f16_dpp v5.l, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+# GFX11-FAKE16: v_sin_f16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x5f,0x01,0x01]
 
 0xfa,0xc0,0x0a,0x7e,0x01,0x60,0x01,0x13
-# GFX11: v_sin_f16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x60,0x01,0x13]
+# GFX11-REAL16: v_sin_f16_dpp v5.l, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x60,0x01,0x13]
+# GFX11-FAKE16: v_sin_f16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x60,0x01,0x13]
 
 0xfa,0xc0,0xfe,0x7e,0x7f,0x6f,0x3d,0x30
-# GFX11: v_sin_f16_dpp v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xc0,0xfe,0x7e,0x7f,0x6f,0x3d,0x30]
+# GFX11-REAL16: v_sin_f16_dpp v127.l, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xc0,0xfe,0x7e,0x7f,0x6f,0x3d,0x30]
+# GFX11-FAKE16: v_sin_f16_dpp v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xc0,0xfe,0x7e,0x7f,0x6f,0x3d,0x30]
+
+0xfa,0xc0,0xfe,0x7e,0x7f,0x5f,0x01,0x01
+# GFX11-REAL16: v_sin_f16_dpp v127.l, v127.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xc0,0xfe,0x7e,0x7f,0x5f,0x01,0x01]
+# GFX11-FAKE16: v_sin_f16_dpp v127, v127 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xc0,0xfe,0x7e,0x7f,0x5f,0x01,0x01]
+
+0xfa,0xc0,0x0a,0x7f,0x81,0x60,0x01,0x13
+# GFX11-REAL16: v_sin_f16_dpp v5.h, v1.h row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xc0,0x0a,0x7f,0x81,0x60,0x01,0x13]
+# GFX11-FAKE16: v_mul_i32_i24_e32 v128, 1, v176         ; encoding: [0x81,0x60,0x01,0x13]
+
+0xfa,0xc0,0xfe,0x7f,0xff,0x6f,0x3d,0x30
+# GFX11-REAL16: v_sin_f16_dpp v127.h, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xc0,0xfe,0x7f,0xff,0x6f,0x3d,0x30]
+# GFX11-FAKE16: v_lshlrev_b32_e32 v30, v255, v183       ; encoding: [0xff,0x6f,0x3d,0x30]
 
 0xfa,0x6a,0x0a,0x7e,0x01,0x1b,0x00,0xff
 # GFX11: v_sin_f32_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x6a,0x0a,0x7e,0x01,0x1b,0x00,0xff]
@@ -2749,46 +2983,72 @@
 # GFX11: v_sqrt_f32_dpp v255, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0x66,0xfe,0x7f,0xff,0x6f,0x3d,0x30]
 
 0xfa,0xba,0x0a,0x7e,0x01,0x1b,0x00,0xff
-# GFX11: v_trunc_f16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+# GFX11-REAL16: v_trunc_f16_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+# GFX11-FAKE16: v_trunc_f16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x1b,0x00,0xff]
 
 0xfa,0xba,0x0a,0x7e,0x01,0xe4,0x00,0xff
-# GFX11: v_trunc_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+# GFX11-REAL16: v_trunc_f16_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+# GFX11-FAKE16: v_trunc_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0xe4,0x00,0xff]
 
 0xfa,0xba,0x0a,0x7e,0x01,0x40,0x01,0xff
-# GFX11: v_trunc_f16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x40,0x01,0xff]
+# GFX11-REAL16: v_trunc_f16_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x40,0x01,0xff]
+# GFX11-FAKE16: v_trunc_f16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x40,0x01,0xff]
 
 0xfa,0xba,0x0a,0x7e,0x01,0x41,0x01,0xff
-# GFX11: v_trunc_f16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x41,0x01,0xff]
+# GFX11-REAL16: v_trunc_f16_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x41,0x01,0xff]
+# GFX11-FAKE16: v_trunc_f16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x41,0x01,0xff]
 
 0xfa,0xba,0x0a,0x7e,0x01,0x01,0x01,0xff
-# GFX11: v_trunc_f16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x01,0x01,0xff]
+# GFX11-REAL16: v_trunc_f16_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x01,0x01,0xff]
+# GFX11-FAKE16: v_trunc_f16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x01,0x01,0xff]
 
 0xfa,0xba,0x0a,0x7e,0x01,0x0f,0x01,0xff
-# GFX11: v_trunc_f16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+# GFX11-REAL16: v_trunc_f16_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+# GFX11-FAKE16: v_trunc_f16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x0f,0x01,0xff]
 
 0xfa,0xba,0x0a,0x7e,0x01,0x11,0x01,0xff
-# GFX11: v_trunc_f16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x11,0x01,0xff]
+# GFX11-REAL16: v_trunc_f16_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x11,0x01,0xff]
+# GFX11-FAKE16: v_trunc_f16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x11,0x01,0xff]
 
 0xfa,0xba,0x0a,0x7e,0x01,0x1f,0x01,0xff
-# GFX11: v_trunc_f16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+# GFX11-REAL16: v_trunc_f16_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+# GFX11-FAKE16: v_trunc_f16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x1f,0x01,0xff]
 
 0xfa,0xba,0x0a,0x7e,0x01,0x21,0x01,0xff
-# GFX11: v_trunc_f16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x21,0x01,0xff]
+# GFX11-REAL16: v_trunc_f16_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x21,0x01,0xff]
+# GFX11-FAKE16: v_trunc_f16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x21,0x01,0xff]
 
 0xfa,0xba,0x0a,0x7e,0x01,0x2f,0x01,0xff
-# GFX11: v_trunc_f16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+# GFX11-REAL16: v_trunc_f16_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+# GFX11-FAKE16: v_trunc_f16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x2f,0x01,0xff]
 
 0xfa,0xba,0x0a,0x7e,0x01,0x50,0x01,0xff
-# GFX11: v_trunc_f16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x50,0x01,0xff]
+# GFX11-REAL16: v_trunc_f16_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x50,0x01,0xff]
+# GFX11-FAKE16: v_trunc_f16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x50,0x01,0xff]
 
 0xfa,0xba,0x0a,0x7e,0x01,0x5f,0x01,0x01
-# GFX11: v_trunc_f16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+# GFX11-REAL16: v_trunc_f16_dpp v5.l, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+# GFX11-FAKE16: v_trunc_f16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x5f,0x01,0x01]
 
 0xfa,0xba,0x0a,0x7e,0x01,0x60,0x01,0x13
-# GFX11: v_trunc_f16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x60,0x01,0x13]
+# GFX11-REAL16: v_trunc_f16_dpp v5.l, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x60,0x01,0x13]
+# GFX11-FAKE16: v_trunc_f16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x60,0x01,0x13]
 
 0xfa,0xba,0xfe,0x7e,0x7f,0x6f,0x3d,0x30
-# GFX11: v_trunc_f16_dpp v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xba,0xfe,0x7e,0x7f,0x6f,0x3d,0x30]
+# GFX11-REAL16: v_trunc_f16_dpp v127.l, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xba,0xfe,0x7e,0x7f,0x6f,0x3d,0x30]
+# GFX11-FAKE16: v_trunc_f16_dpp v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xba,0xfe,0x7e,0x7f,0x6f,0x3d,0x30]
+
+0xfa,0xba,0xfe,0x7e,0x7f,0x5f,0x01,0x01
+# GFX11-REAL16: v_trunc_f16_dpp v127.l, v127.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xba,0xfe,0x7e,0x7f,0x5f,0x01,0x01]
+# GFX11-FAKE16: v_trunc_f16_dpp v127, v127 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xba,0xfe,0x7e,0x7f,0x5f,0x01,0x01]
+
+0xfa,0xba,0x0a,0x7f,0x81,0x60,0x01,0x13
+# GFX11-REAL16: v_trunc_f16_dpp v5.h, v1.h row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xba,0x0a,0x7f,0x81,0x60,0x01,0x13]
+# GFX11-FAKE16: v_mul_i32_i24_e32 v128, 1, v176         ; encoding: [0x81,0x60,0x01,0x13]
+
+0xfa,0xba,0xfe,0x7f,0xff,0x6f,0x3d,0x30
+# GFX11-REAL16: v_trunc_f16_dpp v127.h, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xba,0xfe,0x7f,0xff,0x6f,0x3d,0x30]
+# GFX11-FAKE16: v_lshlrev_b32_e32 v30, v255, v183       ; encoding: [0xff,0x6f,0x3d,0x30]
 
 0xfa,0x42,0x0a,0x7e,0x01,0x1b,0x00,0xff
 # GFX11: v_trunc_f32_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x42,0x0a,0x7e,0x01,0x1b,0x00,0xff]
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1_dpp8.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1_dpp8.txt
index 051dd34..fc7cbba 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1_dpp8.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1_dpp8.txt
@@ -43,10 +43,23 @@
 # GFX11: v_clz_i32_u32_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0x72,0xfe,0x7f,0xff,0x00,0x00,0x00]
 
 0xe9,0xc2,0x0a,0x7e,0x01,0x77,0x39,0x05
-# GFX11: v_cos_f16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xc2,0x0a,0x7e,0x01,0x77,0x39,0x05]
+# GFX11-REAL16: v_cos_f16_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xc2,0x0a,0x7e,0x01,0x77,0x39,0x05]
+# GFX11-FAKE16: v_cos_f16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xc2,0x0a,0x7e,0x01,0x77,0x39,0x05]
 
 0xea,0xc2,0xfe,0x7e,0x7f,0x00,0x00,0x00
-# GFX11: v_cos_f16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xc2,0xfe,0x7e,0x7f,0x00,0x00,0x00]
+# GFX11-REAL16: v_cos_f16_dpp v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xc2,0xfe,0x7e,0x7f,0x00,0x00,0x00]
+# GFX11-FAKE16: v_cos_f16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xc2,0xfe,0x7e,0x7f,0x00,0x00,0x00]
+
+0xe9,0xc2,0xfe,0x7e,0x7f,0x77,0x39,0x05
+# GFX11-REAL16: v_cos_f16_dpp v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xc2,0xfe,0x7e,0x7f,0x77,0x39,0x05]
+# GFX11-FAKE16: v_cos_f16_dpp v127, v127 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xc2,0xfe,0x7e,0x7f,0x77,0x39,0x05]
+
+0xe9,0xc2,0x0a,0x7f,0x81,0x77,0x39,0x05
+# GFX11-REAL16: v_cos_f16_dpp v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xc2,0x0a,0x7f,0x81,0x77,0x39,0x05]
+# GFX11-FAKE16: v_dot2acc_f32_f16 v156, v129, v187      ; encoding: [0x81,0x77,0x39,0x05]
+
+0xea,0xc2,0xfe,0x7f,0xff,0x00,0x00,0x00
+# GFX11-REAL16: v_cos_f16_dpp v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xc2,0xfe,0x7f,0xff,0x00,0x00,0x00]
 
 0xe9,0x6c,0x0a,0x7e,0x01,0x77,0x39,0x05
 # GFX11: v_cos_f32_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x6c,0x0a,0x7e,0x01,0x77,0x39,0x05]
@@ -185,10 +198,24 @@
 # GFX11: v_cvt_i32_f32_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0x10,0xfe,0x7f,0xff,0x00,0x00,0x00]
 
 0xe9,0xd4,0x0a,0x7e,0x01,0x77,0x39,0x05
-# GFX11: v_cvt_i32_i16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xd4,0x0a,0x7e,0x01,0x77,0x39,0x05]
+# GFX11-REAL16: v_cvt_i32_i16_dpp v5, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xd4,0x0a,0x7e,0x01,0x77,0x39,0x05]
+# GFX11-FAKE16: v_cvt_i32_i16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xd4,0x0a,0x7e,0x01,0x77,0x39,0x05]
 
 0xea,0xd4,0xfe,0x7f,0x7f,0x00,0x00,0x00
-# GFX11: v_cvt_i32_i16_dpp v255, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xd4,0xfe,0x7f,0x7f,0x00,0x00,0x00]
+# GFX11-REAL16: v_cvt_i32_i16_dpp v255, v127.l dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xd4,0xfe,0x7f,0x7f,0x00,0x00,0x00]
+# GFX11-FAKE16: v_cvt_i32_i16_dpp v255, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xd4,0xfe,0x7f,0x7f,0x00,0x00,0x00]
+
+0xe9,0xd4,0x0a,0x7e,0x7f,0x77,0x39,0x05
+# GFX11-REAL16: v_cvt_i32_i16_dpp v5, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xd4,0x0a,0x7e,0x7f,0x77,0x39,0x05]
+# GFX11-FAKE16: v_cvt_i32_i16_dpp v5, v127 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xd4,0x0a,0x7e,0x7f,0x77,0x39,0x05]
+
+0xe9,0xd4,0x0a,0x7e,0x81,0x77,0x39,0x05
+# GFX11-REAL16: v_cvt_i32_i16_dpp v5, v1.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xd4,0x0a,0x7e,0x81,0x77,0x39,0x05]
+# GFX11-FAKE16: v_cvt_i32_i16_dpp v5, v129/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xd4,0x0a,0x7e,0x81,0x77,0x39,0x05]
+
+0xea,0xd4,0xfe,0x7f,0xff,0x00,0x00,0x00
+# GFX11-REAL16: v_cvt_i32_i16_dpp v255, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xd4,0xfe,0x7f,0xff,0x00,0x00,0x00]
+# GFX11-FAKE16: v_cvt_i32_i16_dpp v255, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xd4,0xfe,0x7f,0xff,0x00,0x00,0x00]
 
 0xe9,0x18,0x0a,0x7e,0x01,0x77,0x39,0x05
 # GFX11: v_cvt_nearest_i32_f32_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x18,0x0a,0x7e,0x01,0x77,0x39,0x05]
@@ -254,10 +281,24 @@
 # GFX11: v_cvt_u32_f32_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0x0e,0xfe,0x7f,0xff,0x00,0x00,0x00]
 
 0xe9,0xd6,0x0a,0x7e,0x01,0x77,0x39,0x05
-# GFX11: v_cvt_u32_u16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xd6,0x0a,0x7e,0x01,0x77,0x39,0x05]
+# GFX11-REAL16: v_cvt_u32_u16_dpp v5, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xd6,0x0a,0x7e,0x01,0x77,0x39,0x05]
+# GFX11-FAKE16: v_cvt_u32_u16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xd6,0x0a,0x7e,0x01,0x77,0x39,0x05]
 
 0xea,0xd6,0xfe,0x7f,0x7f,0x00,0x00,0x00
-# GFX11: v_cvt_u32_u16_dpp v255, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xd6,0xfe,0x7f,0x7f,0x00,0x00,0x00]
+# GFX11-REAL16: v_cvt_u32_u16_dpp v255, v127.l dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xd6,0xfe,0x7f,0x7f,0x00,0x00,0x00]
+# GFX11-FAKE16: v_cvt_u32_u16_dpp v255, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xd6,0xfe,0x7f,0x7f,0x00,0x00,0x00]
+
+0xe9,0xd6,0x0a,0x7e,0x7f,0x77,0x39,0x05
+# GFX11-REAL16: v_cvt_u32_u16_dpp v5, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xd6,0x0a,0x7e,0x7f,0x77,0x39,0x05]
+# GFX11-FAKE16: v_cvt_u32_u16_dpp v5, v127 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xd6,0x0a,0x7e,0x7f,0x77,0x39,0x05]
+
+0xe9,0xd6,0x0a,0x7e,0x81,0x77,0x39,0x05
+# GFX11-REAL16: v_cvt_u32_u16_dpp v5, v1.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xd6,0x0a,0x7e,0x81,0x77,0x39,0x05]
+# GFX11-FAKE16: v_cvt_u32_u16_dpp v5, v129/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xd6,0x0a,0x7e,0x81,0x77,0x39,0x05]
+
+0xea,0xd6,0xfe,0x7f,0xff,0x00,0x00,0x00
+# GFX11-REAL16: v_cvt_u32_u16_dpp v255, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xd6,0xfe,0x7f,0xff,0x00,0x00,0x00]
+# GFX11-FAKE16: v_cvt_u32_u16_dpp v255, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xd6,0xfe,0x7f,0xff,0x00,0x00,0x00]
 
 0xe9,0xb0,0x0a,0x7e,0x01,0x77,0x39,0x05
 # GFX11-REAL16: v_exp_f16_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xb0,0x0a,0x7e,0x01,0x77,0x39,0x05]
@@ -302,10 +343,23 @@
 # GFX11: v_floor_f32_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0x48,0xfe,0x7f,0xff,0x00,0x00,0x00]
 
 0xe9,0xbe,0x0a,0x7e,0x01,0x77,0x39,0x05
-# GFX11: v_fract_f16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xbe,0x0a,0x7e,0x01,0x77,0x39,0x05]
+# GFX11-REAL16: v_fract_f16_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xbe,0x0a,0x7e,0x01,0x77,0x39,0x05]
+# GFX11-FAKE16: v_fract_f16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xbe,0x0a,0x7e,0x01,0x77,0x39,0x05]
 
 0xea,0xbe,0xfe,0x7e,0x7f,0x00,0x00,0x00
-# GFX11: v_fract_f16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xbe,0xfe,0x7e,0x7f,0x00,0x00,0x00]
+# GFX11-REAL16: v_fract_f16_dpp v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xbe,0xfe,0x7e,0x7f,0x00,0x00,0x00]
+# GFX11-FAKE16: v_fract_f16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xbe,0xfe,0x7e,0x7f,0x00,0x00,0x00]
+
+0xe9,0xbe,0xfe,0x7e,0x7f,0x77,0x39,0x05
+# GFX11-REAL16: v_fract_f16_dpp v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xbe,0xfe,0x7e,0x7f,0x77,0x39,0x05]
+# GFX11-FAKE16: v_fract_f16_dpp v127, v127 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xbe,0xfe,0x7e,0x7f,0x77,0x39,0x05]
+
+0xe9,0xbe,0x0a,0x7f,0x81,0x77,0x39,0x05
+# GFX11-REAL16: v_fract_f16_dpp v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xbe,0x0a,0x7f,0x81,0x77,0x39,0x05]
+# GFX11-FAKE16: v_dot2acc_f32_f16 v156, v129, v187      ; encoding: [0x81,0x77,0x39,0x05]
+
+0xea,0xbe,0xfe,0x7f,0xff,0x00,0x00,0x00
+# GFX11-REAL16: v_fract_f16_dpp v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xbe,0xfe,0x7f,0xff,0x00,0x00,0x00]
 
 0xe9,0x40,0x0a,0x7e,0x01,0x77,0x39,0x05
 # GFX11: v_fract_f32_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x40,0x0a,0x7e,0x01,0x77,0x39,0x05]
@@ -335,14 +389,33 @@
 # GFX11: v_frexp_exp_i32_f32_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0x7e,0xfe,0x7f,0xff,0x00,0x00,0x00]
 
 0xe9,0xb2,0x0a,0x7e,0x01,0x77,0x39,0x05
-# GFX11: v_frexp_mant_f16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xb2,0x0a,0x7e,0x01,0x77,0x39,0x05]
+# GFX11-REAL16: v_frexp_mant_f16_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xb2,0x0a,0x7e,0x01,0x77,0x39,0x05]
+# GFX11-FAKE16: v_frexp_mant_f16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xb2,0x0a,0x7e,0x01,0x77,0x39,0x05]
 
 0xea,0xb2,0xfe,0x7e,0x7f,0x00,0x00,0x00
-# GFX11: v_frexp_mant_f16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xb2,0xfe,0x7e,0x7f,0x00,0x00,0x00]
+# GFX11-REAL16: v_frexp_mant_f16_dpp v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xb2,0xfe,0x7e,0x7f,0x00,0x00,0x00]
+# GFX11-FAKE16: v_frexp_mant_f16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xb2,0xfe,0x7e,0x7f,0x00,0x00,0x00]
 
+# GFX11-FAKE16-REAL16: v_frexp_mant_f16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xb2,0x0a,0x7e,0x01,0x77,0x39,0x05]
+# GFX11-REAL16-REAL16: v_frexp_mant_f16_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xb2,0x0a,0x7e,0x01,0x77,0x39,0x05]
 0xe9,0x80,0x0a,0x7e,0x01,0x77,0x39,0x05
 # GFX11: v_frexp_mant_f32_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x80,0x0a,0x7e,0x01,0x77,0x39,0x05]
 
+# GFX11-FAKE16-REAL16: v_frexp_mant_f16_dpp v127, v127 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xb2,0xfe,0x7e,0x7f,0x77,0x39,0x05]
+# GFX11-REAL16-REAL16: v_frexp_mant_f16_dpp v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xb2,0xfe,0x7e,0x7f,0x77,0x39,0x05]
+0xe9,0xb2,0xfe,0x7e,0x7f,0x77,0x39,0x05
+# GFX11-REAL16: v_frexp_mant_f16_dpp v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xb2,0xfe,0x7e,0x7f,0x77,0x39,0x05]
+# GFX11-FAKE16: v_frexp_mant_f16_dpp v127, v127 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xb2,0xfe,0x7e,0x7f,0x77,0x39,0x05]
+
+# GFX11-REAL16-REAL16: v_frexp_mant_f16_dpp v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xb2,0x0a,0x7f,0x81,0x77,0x39,0x05]
+0xe9,0xb2,0x0a,0x7f,0x81,0x77,0x39,0x05
+# GFX11-REAL16: v_frexp_mant_f16_dpp v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xb2,0x0a,0x7f,0x81,0x77,0x39,0x05]
+# GFX11-FAKE16: v_dot2acc_f32_f16 v156, v129, v187      ; encoding: [0x81,0x77,0x39,0x05]
+
+# GFX11-REAL16-REAL16: v_frexp_mant_f16_dpp v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xb2,0xfe,0x7f,0xff,0x00,0x00,0x00]
+0xea,0xb2,0xfe,0x7f,0xff,0x00,0x00,0x00
+# GFX11-REAL16: v_frexp_mant_f16_dpp v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xb2,0xfe,0x7f,0xff,0x00,0x00,0x00]
+
 0xea,0x80,0xfe,0x7f,0xff,0x00,0x00,0x00
 # GFX11: v_frexp_mant_f32_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0x80,0xfe,0x7f,0xff,0x00,0x00,0x00]
 
@@ -398,10 +471,23 @@
 # GFX11: v_movrelsd_b32_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0x88,0xfe,0x7f,0xff,0x00,0x00,0x00]
 
 0xe9,0xd2,0x0a,0x7e,0x01,0x77,0x39,0x05
-# GFX11: v_not_b16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xd2,0x0a,0x7e,0x01,0x77,0x39,0x05]
+# GFX11-REAL16: v_not_b16_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xd2,0x0a,0x7e,0x01,0x77,0x39,0x05]
+# GFX11-FAKE16: v_not_b16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xd2,0x0a,0x7e,0x01,0x77,0x39,0x05]
 
 0xea,0xd2,0xfe,0x7e,0x7f,0x00,0x00,0x00
-# GFX11: v_not_b16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xd2,0xfe,0x7e,0x7f,0x00,0x00,0x00]
+# GFX11-REAL16: v_not_b16_dpp v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xd2,0xfe,0x7e,0x7f,0x00,0x00,0x00]
+# GFX11-FAKE16: v_not_b16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xd2,0xfe,0x7e,0x7f,0x00,0x00,0x00]
+
+0xe9,0xd2,0xfe,0x7e,0x7f,0x77,0x39,0x05
+# GFX11-REAL16: v_not_b16_dpp v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xd2,0xfe,0x7e,0x7f,0x77,0x39,0x05]
+# GFX11-FAKE16: v_not_b16_dpp v127, v127 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xd2,0xfe,0x7e,0x7f,0x77,0x39,0x05]
+
+0xe9,0xd2,0x0a,0x7f,0x81,0x77,0x39,0x05
+# GFX11-REAL16: v_not_b16_dpp v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xd2,0x0a,0x7f,0x81,0x77,0x39,0x05]
+# GFX11-FAKE16: v_dot2acc_f32_f16 v156, v129, v187      ; encoding: [0x81,0x77,0x39,0x05]
+
+0xea,0xd2,0xfe,0x7f,0xff,0x00,0x00,0x00
+# GFX11-REAL16: v_not_b16_dpp v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xd2,0xfe,0x7f,0xff,0x00,0x00,0x00]
 
 0xe9,0x6e,0x0a,0x7e,0x01,0x77,0x39,0x05
 # GFX11: v_not_b32_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x6e,0x0a,0x7e,0x01,0x77,0x39,0x05]
@@ -437,10 +523,23 @@
 # GFX11: v_rcp_iflag_f32_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0x56,0xfe,0x7f,0xff,0x00,0x00,0x00]
 
 0xe9,0xbc,0x0a,0x7e,0x01,0x77,0x39,0x05
-# GFX11: v_rndne_f16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xbc,0x0a,0x7e,0x01,0x77,0x39,0x05]
+# GFX11-REAL16: v_rndne_f16_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xbc,0x0a,0x7e,0x01,0x77,0x39,0x05]
+# GFX11-FAKE16: v_rndne_f16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xbc,0x0a,0x7e,0x01,0x77,0x39,0x05]
 
 0xea,0xbc,0xfe,0x7e,0x7f,0x00,0x00,0x00
-# GFX11: v_rndne_f16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xbc,0xfe,0x7e,0x7f,0x00,0x00,0x00]
+# GFX11-REAL16: v_rndne_f16_dpp v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xbc,0xfe,0x7e,0x7f,0x00,0x00,0x00]
+# GFX11-FAKE16: v_rndne_f16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xbc,0xfe,0x7e,0x7f,0x00,0x00,0x00]
+
+0xe9,0xbc,0xfe,0x7e,0x7f,0x77,0x39,0x05
+# GFX11-REAL16: v_rndne_f16_dpp v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xbc,0xfe,0x7e,0x7f,0x77,0x39,0x05]
+# GFX11-FAKE16: v_rndne_f16_dpp v127, v127 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xbc,0xfe,0x7e,0x7f,0x77,0x39,0x05]
+
+0xe9,0xbc,0x0a,0x7f,0x81,0x77,0x39,0x05
+# GFX11-REAL16: v_rndne_f16_dpp v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xbc,0x0a,0x7f,0x81,0x77,0x39,0x05]
+# GFX11-FAKE16: v_dot2acc_f32_f16 v156, v129, v187      ; encoding: [0x81,0x77,0x39,0x05]
+
+0xea,0xbc,0xfe,0x7f,0xff,0x00,0x00,0x00
+# GFX11-REAL16: v_rndne_f16_dpp v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xbc,0xfe,0x7f,0xff,0x00,0x00,0x00]
 
 0xe9,0x46,0x0a,0x7e,0x01,0x77,0x39,0x05
 # GFX11: v_rndne_f32_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x46,0x0a,0x7e,0x01,0x77,0x39,0x05]
@@ -470,16 +569,42 @@
 # GFX11: v_rsq_f32_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0x5c,0xfe,0x7f,0xff,0x00,0x00,0x00]
 
 0xe9,0xc4,0x0a,0x7e,0x01,0x77,0x39,0x05
-# GFX11: v_sat_pk_u8_i16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xc4,0x0a,0x7e,0x01,0x77,0x39,0x05]
+# GFX11-REAL16: v_sat_pk_u8_i16_dpp v5.l, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xc4,0x0a,0x7e,0x01,0x77,0x39,0x05]
+# GFX11-FAKE16: v_sat_pk_u8_i16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xc4,0x0a,0x7e,0x01,0x77,0x39,0x05]
 
 0xea,0xc4,0xfe,0x7e,0xff,0x00,0x00,0x00
-# GFX11: v_sat_pk_u8_i16_dpp v127, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xc4,0xfe,0x7e,0xff,0x00,0x00,0x00]
+# GFX11-REAL16: v_sat_pk_u8_i16_dpp v127.l, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xc4,0xfe,0x7e,0xff,0x00,0x00,0x00]
+# GFX11-FAKE16: v_sat_pk_u8_i16_dpp v127, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xc4,0xfe,0x7e,0xff,0x00,0x00,0x00]
+
+0xe9,0xc4,0xfe,0x7e,0x01,0x77,0x39,0x05
+# GFX11-REAL16: v_sat_pk_u8_i16_dpp v127.l, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xc4,0xfe,0x7e,0x01,0x77,0x39,0x05]
+# GFX11-FAKE16: v_sat_pk_u8_i16_dpp v127, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xc4,0xfe,0x7e,0x01,0x77,0x39,0x05]
+
+0xe9,0xc4,0x0a,0x7f,0x01,0x77,0x39,0x05
+# GFX11-REAL16: v_sat_pk_u8_i16_dpp v5.h, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xc4,0x0a,0x7f,0x01,0x77,0x39,0x05]
+# GFX11-FAKE16: v_dot2acc_f32_f16 v156, v1, v187        ; encoding: [0x01,0x77,0x39,0x05]
+
+0xea,0xc4,0xfe,0x7f,0xff,0x00,0x00,0x00
+# GFX11-REAL16: v_sat_pk_u8_i16_dpp v127.h, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xc4,0xfe,0x7f,0xff,0x00,0x00,0x00]
 
 0xe9,0xc0,0x0a,0x7e,0x01,0x77,0x39,0x05
-# GFX11: v_sin_f16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xc0,0x0a,0x7e,0x01,0x77,0x39,0x05]
+# GFX11-REAL16: v_sin_f16_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xc0,0x0a,0x7e,0x01,0x77,0x39,0x05]
+# GFX11-FAKE16: v_sin_f16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xc0,0x0a,0x7e,0x01,0x77,0x39,0x05]
 
 0xea,0xc0,0xfe,0x7e,0x7f,0x00,0x00,0x00
-# GFX11: v_sin_f16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xc0,0xfe,0x7e,0x7f,0x00,0x00,0x00]
+# GFX11-REAL16: v_sin_f16_dpp v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xc0,0xfe,0x7e,0x7f,0x00,0x00,0x00]
+# GFX11-FAKE16: v_sin_f16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xc0,0xfe,0x7e,0x7f,0x00,0x00,0x00]
+
+0xe9,0xc0,0xfe,0x7e,0x7f,0x77,0x39,0x05
+# GFX11-REAL16: v_sin_f16_dpp v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xc0,0xfe,0x7e,0x7f,0x77,0x39,0x05]
+# GFX11-FAKE16: v_sin_f16_dpp v127, v127 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xc0,0xfe,0x7e,0x7f,0x77,0x39,0x05]
+
+0xe9,0xc0,0x0a,0x7f,0x81,0x77,0x39,0x05
+# GFX11-REAL16: v_sin_f16_dpp v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xc0,0x0a,0x7f,0x81,0x77,0x39,0x05]
+# GFX11-FAKE16: v_dot2acc_f32_f16 v156, v129, v187      ; encoding: [0x81,0x77,0x39,0x05]
+
+0xea,0xc0,0xfe,0x7f,0xff,0x00,0x00,0x00
+# GFX11-REAL16: v_sin_f16_dpp v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xc0,0xfe,0x7f,0xff,0x00,0x00,0x00]
 
 0xe9,0x6a,0x0a,0x7e,0x01,0x77,0x39,0x05
 # GFX11: v_sin_f32_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x6a,0x0a,0x7e,0x01,0x77,0x39,0x05]
@@ -509,10 +634,23 @@
 # GFX11: v_sqrt_f32_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0x66,0xfe,0x7f,0xff,0x00,0x00,0x00]
 
 0xe9,0xba,0x0a,0x7e,0x01,0x77,0x39,0x05
-# GFX11: v_trunc_f16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xba,0x0a,0x7e,0x01,0x77,0x39,0x05]
+# GFX11-REAL16: v_trunc_f16_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xba,0x0a,0x7e,0x01,0x77,0x39,0x05]
+# GFX11-FAKE16: v_trunc_f16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xba,0x0a,0x7e,0x01,0x77,0x39,0x05]
 
 0xea,0xba,0xfe,0x7e,0x7f,0x00,0x00,0x00
-# GFX11: v_trunc_f16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xba,0xfe,0x7e,0x7f,0x00,0x00,0x00]
+# GFX11-REAL16: v_trunc_f16_dpp v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xba,0xfe,0x7e,0x7f,0x00,0x00,0x00]
+# GFX11-FAKE16: v_trunc_f16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xba,0xfe,0x7e,0x7f,0x00,0x00,0x00]
+
+0xe9,0xba,0xfe,0x7e,0x7f,0x77,0x39,0x05
+# GFX11-REAL16: v_trunc_f16_dpp v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xba,0xfe,0x7e,0x7f,0x77,0x39,0x05]
+# GFX11-FAKE16: v_trunc_f16_dpp v127, v127 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xba,0xfe,0x7e,0x7f,0x77,0x39,0x05]
+
+0xe9,0xba,0x0a,0x7f,0x81,0x77,0x39,0x05
+# GFX11-REAL16: v_trunc_f16_dpp v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xba,0x0a,0x7f,0x81,0x77,0x39,0x05]
+# GFX11-FAKE16: v_dot2acc_f32_f16 v156, v129, v187      ; encoding: [0x81,0x77,0x39,0x05]
+
+0xea,0xba,0xfe,0x7f,0xff,0x00,0x00,0x00
+# GFX11-REAL16: v_trunc_f16_dpp v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xba,0xfe,0x7f,0xff,0x00,0x00,0x00]
 
 0xe9,0x42,0x0a,0x7e,0x01,0x77,0x39,0x05
 # GFX11: v_trunc_f32_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x42,0x0a,0x7e,0x01,0x77,0x39,0x05]
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3.txt
index 4990c62..adcca58 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3.txt
@@ -2402,53 +2402,125 @@
 # GFX11: v_fma_dx9_zero_f32 v255, -|0xaf123456|, -|vcc_hi|, null clamp div:2 ; encoding: [0xff,0x83,0x09,0xd6,0xff,0xd6,0xf0,0x79,0x56,0x34,0x12,0xaf]
 
 0x05,0x00,0x48,0xd6,0x01,0x05,0x0e,0x00
-# GFX11: v_fma_f16 v5, v1, v2, s3                ; encoding: [0x05,0x00,0x48,0xd6,0x01,0x05,0x0e,0x00]
+# W32-REAL16: v_fma_f16 v5.l, v1.l, v2.l, s3          ; encoding: [0x05,0x00,0x48,0xd6,0x01,0x05,0x0e,0x00]
+# W32-FAKE16: v_fma_f16 v5, v1, v2, s3                ; encoding: [0x05,0x00,0x48,0xd6,0x01,0x05,0x0e,0x00]
+# W64-REAL16: v_fma_f16 v5.l, v1.l, v2.l, s3          ; encoding: [0x05,0x00,0x48,0xd6,0x01,0x05,0x0e,0x00]
+# W64-FAKE16: v_fma_f16 v5, v1, v2, s3                ; encoding: [0x05,0x00,0x48,0xd6,0x01,0x05,0x0e,0x00]
 
 0x05,0x00,0x48,0xd6,0xff,0x05,0xa4,0x01
-# GFX11: v_fma_f16 v5, v255, s2, s105            ; encoding: [0x05,0x00,0x48,0xd6,0xff,0x05,0xa4,0x01]
+# W32-REAL16: v_fma_f16 v5.l, v255.l, s2, s105        ; encoding: [0x05,0x00,0x48,0xd6,0xff,0x05,0xa4,0x01]
+# W32-FAKE16: v_fma_f16 v5, v255, s2, s105            ; encoding: [0x05,0x00,0x48,0xd6,0xff,0x05,0xa4,0x01]
+# W64-REAL16: v_fma_f16 v5.l, v255.l, s2, s105        ; encoding: [0x05,0x00,0x48,0xd6,0xff,0x05,0xa4,0x01]
+# W64-FAKE16: v_fma_f16 v5, v255, s2, s105            ; encoding: [0x05,0x00,0x48,0xd6,0xff,0x05,0xa4,0x01]
 
 0x05,0x00,0x48,0xd6,0x01,0xfe,0xff,0x01
-# GFX11: v_fma_f16 v5, s1, v255, exec_hi         ; encoding: [0x05,0x00,0x48,0xd6,0x01,0xfe,0xff,0x01]
+# W32-REAL16: v_fma_f16 v5.l, s1, v255.l, exec_hi     ; encoding: [0x05,0x00,0x48,0xd6,0x01,0xfe,0xff,0x01]
+# W32-FAKE16: v_fma_f16 v5, s1, v255, exec_hi         ; encoding: [0x05,0x00,0x48,0xd6,0x01,0xfe,0xff,0x01]
+# W64-REAL16: v_fma_f16 v5.l, s1, v255.l, exec_hi     ; encoding: [0x05,0x00,0x48,0xd6,0x01,0xfe,0xff,0x01]
+# W64-FAKE16: v_fma_f16 v5, s1, v255, exec_hi         ; encoding: [0x05,0x00,0x48,0xd6,0x01,0xfe,0xff,0x01]
 
 0x05,0x00,0x48,0xd6,0x69,0xd2,0xf8,0x01
-# GFX11: v_fma_f16 v5, s105, s105, exec_lo       ; encoding: [0x05,0x00,0x48,0xd6,0x69,0xd2,0xf8,0x01]
+# W32-REAL16: v_fma_f16 v5.l, s105, s105, exec_lo     ; encoding: [0x05,0x00,0x48,0xd6,0x69,0xd2,0xf8,0x01]
+# W32-FAKE16: v_fma_f16 v5, s105, s105, exec_lo       ; encoding: [0x05,0x00,0x48,0xd6,0x69,0xd2,0xf8,0x01]
+# W64-REAL16: v_fma_f16 v5.l, s105, s105, exec_lo     ; encoding: [0x05,0x00,0x48,0xd6,0x69,0xd2,0xf8,0x01]
+# W64-FAKE16: v_fma_f16 v5, s105, s105, exec_lo       ; encoding: [0x05,0x00,0x48,0xd6,0x69,0xd2,0xf8,0x01]
 
 0x05,0x00,0x48,0xd6,0x6a,0xf6,0x0c,0x04
-# GFX11: v_fma_f16 v5, vcc_lo, ttmp15, v3        ; encoding: [0x05,0x00,0x48,0xd6,0x6a,0xf6,0x0c,0x04]
+# W32-REAL16: v_fma_f16 v5.l, vcc_lo, ttmp15, v3.l    ; encoding: [0x05,0x00,0x48,0xd6,0x6a,0xf6,0x0c,0x04]
+# W32-FAKE16: v_fma_f16 v5, vcc_lo, ttmp15, v3        ; encoding: [0x05,0x00,0x48,0xd6,0x6a,0xf6,0x0c,0x04]
+# W64-REAL16: v_fma_f16 v5.l, vcc_lo, ttmp15, v3.l    ; encoding: [0x05,0x00,0x48,0xd6,0x6a,0xf6,0x0c,0x04]
+# W64-FAKE16: v_fma_f16 v5, vcc_lo, ttmp15, v3        ; encoding: [0x05,0x00,0x48,0xd6,0x6a,0xf6,0x0c,0x04]
 
 0x05,0x00,0x48,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00
-# GFX11: v_fma_f16 v5, vcc_hi, 0xfe0b, v255      ; encoding: [0x05,0x00,0x48,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
+# W32-REAL16: v_fma_f16 v5.l, vcc_hi, 0xfe0b, v255.l  ; encoding: [0x05,0x00,0x48,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
+# W32-FAKE16: v_fma_f16 v5, vcc_hi, 0xfe0b, v255      ; encoding: [0x05,0x00,0x48,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
+# W64-REAL16: v_fma_f16 v5.l, vcc_hi, 0xfe0b, v255.l  ; encoding: [0x05,0x00,0x48,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
+# W64-FAKE16: v_fma_f16 v5, vcc_hi, 0xfe0b, v255      ; encoding: [0x05,0x00,0x48,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
 
 0x05,0x07,0x48,0xd6,0x7b,0xfa,0xed,0xe1
-# GFX11: v_fma_f16 v5, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x48,0xd6,0x7b,0xfa,0xed,0xe1]
+# W32-REAL16: v_fma_f16 v5.l, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x48,0xd6,0x7b,0xfa,0xed,0xe1]
+# W32-FAKE16: v_fma_f16 v5, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x48,0xd6,0x7b,0xfa,0xed,0xe1]
+# W64-REAL16: v_fma_f16 v5.l, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x48,0xd6,0x7b,0xfa,0xed,0xe1]
+# W64-FAKE16: v_fma_f16 v5, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x48,0xd6,0x7b,0xfa,0xed,0xe1]
 
 0x05,0x00,0x48,0xd6,0x7d,0xe0,0xf5,0x01
-# GFX11: v_fma_f16 v5, m0, 0.5, m0               ; encoding: [0x05,0x00,0x48,0xd6,0x7d,0xe0,0xf5,0x01]
+# W32-REAL16: v_fma_f16 v5.l, m0, 0.5, m0             ; encoding: [0x05,0x00,0x48,0xd6,0x7d,0xe0,0xf5,0x01]
+# W32-FAKE16: v_fma_f16 v5, m0, 0.5, m0               ; encoding: [0x05,0x00,0x48,0xd6,0x7d,0xe0,0xf5,0x01]
+# W64-REAL16: v_fma_f16 v5.l, m0, 0.5, m0             ; encoding: [0x05,0x00,0x48,0xd6,0x7d,0xe0,0xf5,0x01]
+# W64-FAKE16: v_fma_f16 v5, m0, 0.5, m0               ; encoding: [0x05,0x00,0x48,0xd6,0x7d,0xe0,0xf5,0x01]
 
 0x05,0x01,0x48,0xd6,0x7e,0x82,0xad,0x01
-# GFX11: v_fma_f16 v5, |exec_lo|, -1, vcc_hi     ; encoding: [0x05,0x01,0x48,0xd6,0x7e,0x82,0xad,0x01]
+# W32-REAL16: v_fma_f16 v5.l, |exec_lo|, -1, vcc_hi   ; encoding: [0x05,0x01,0x48,0xd6,0x7e,0x82,0xad,0x01]
+# W32-FAKE16: v_fma_f16 v5, |exec_lo|, -1, vcc_hi     ; encoding: [0x05,0x01,0x48,0xd6,0x7e,0x82,0xad,0x01]
+# W64-REAL16: v_fma_f16 v5.l, |exec_lo|, -1, vcc_hi   ; encoding: [0x05,0x01,0x48,0xd6,0x7e,0x82,0xad,0x01]
+# W64-FAKE16: v_fma_f16 v5, |exec_lo|, -1, vcc_hi     ; encoding: [0x05,0x01,0x48,0xd6,0x7e,0x82,0xad,0x01]
 
 0x05,0x05,0x48,0xd6,0x7f,0xf8,0xa8,0xa1
-# GFX11: v_fma_f16 v5, -|exec_hi|, null, -|vcc_lo| ; encoding: [0x05,0x05,0x48,0xd6,0x7f,0xf8,0xa8,0xa1]
+# W32-REAL16: v_fma_f16 v5.l, -|exec_hi|, null, -|vcc_lo| ; encoding: [0x05,0x05,0x48,0xd6,0x7f,0xf8,0xa8,0xa1]
+# W32-FAKE16: v_fma_f16 v5, -|exec_hi|, null, -|vcc_lo| ; encoding: [0x05,0x05,0x48,0xd6,0x7f,0xf8,0xa8,0xa1]
+# W64-REAL16: v_fma_f16 v5.l, -|exec_hi|, null, -|vcc_lo| ; encoding: [0x05,0x05,0x48,0xd6,0x7f,0xf8,0xa8,0xa1]
+# W64-FAKE16: v_fma_f16 v5, -|exec_hi|, null, -|vcc_lo| ; encoding: [0x05,0x05,0x48,0xd6,0x7f,0xf8,0xa8,0xa1]
 
 0x05,0x7c,0x48,0xd6,0x7c,0xfc,0xfc,0x83,0x0b,0xfe,0x00,0x00
-# GFX11: v_fma_f16 v5, null, exec_lo, -|0xfe0b| op_sel:[1,1,1,1] ; encoding: [0x05,0x7c,0x48,0xd6,0x7c,0xfc,0xfc,0x83,0x0b,0xfe,0x00,0x00]
+# W32-REAL16: v_fma_f16 v5.h, null, exec_lo, -|0xfe0b| op_sel:[1,1,1,1] ; encoding: [0x05,0x7c,0x48,0xd6,0x7c,0xfc,0xfc,0x83,0x0b,0xfe,0x00,0x00]
+# W32-FAKE16: v_fma_f16 v5, null, exec_lo, -|0xfe0b| op_sel:[1,1,1,1] ; encoding: [0x05,0x7c,0x48,0xd6,0x7c,0xfc,0xfc,0x83,0x0b,0xfe,0x00,0x00]
+# W64-REAL16: v_fma_f16 v5.h, null, exec_lo, -|0xfe0b| op_sel:[1,1,1,1] ; encoding: [0x05,0x7c,0x48,0xd6,0x7c,0xfc,0xfc,0x83,0x0b,0xfe,0x00,0x00]
+# W64-FAKE16: v_fma_f16 v5, null, exec_lo, -|0xfe0b| op_sel:[1,1,1,1] ; encoding: [0x05,0x7c,0x48,0xd6,0x7c,0xfc,0xfc,0x83,0x0b,0xfe,0x00,0x00]
 
 0x05,0x0e,0x48,0xd6,0xc1,0xfe,0xf4,0xc3
-# GFX11: v_fma_f16 v5, -1, -|exec_hi|, -|src_scc| op_sel:[1,0,0,0] ; encoding: [0x05,0x0e,0x48,0xd6,0xc1,0xfe,0xf4,0xc3]
+# W32-REAL16: v_fma_f16 v5.l, -1, -|exec_hi|, -|src_scc| op_sel:[1,0,0,0] ; encoding: [0x05,0x0e,0x48,0xd6,0xc1,0xfe,0xf4,0xc3]
+# W32-FAKE16: v_fma_f16 v5, -1, -|exec_hi|, -|src_scc| op_sel:[1,0,0,0] ; encoding: [0x05,0x0e,0x48,0xd6,0xc1,0xfe,0xf4,0xc3]
+# W64-REAL16: v_fma_f16 v5.l, -1, -|exec_hi|, -|src_scc| op_sel:[1,0,0,0] ; encoding: [0x05,0x0e,0x48,0xd6,0xc1,0xfe,0xf4,0xc3]
+# W64-FAKE16: v_fma_f16 v5, -1, -|exec_hi|, -|src_scc| op_sel:[1,0,0,0] ; encoding: [0x05,0x0e,0x48,0xd6,0xc1,0xfe,0xf4,0xc3]
 
 0x05,0x10,0x48,0xd6,0xf0,0xfa,0xc0,0x43
-# GFX11: v_fma_f16 v5, 0.5, -m0, 0.5 op_sel:[0,1,0,0] ; encoding: [0x05,0x10,0x48,0xd6,0xf0,0xfa,0xc0,0x43]
+# W32-REAL16: v_fma_f16 v5.l, 0.5, -m0, 0.5 op_sel:[0,1,0,0] ; encoding: [0x05,0x10,0x48,0xd6,0xf0,0xfa,0xc0,0x43]
+# W32-FAKE16: v_fma_f16 v5, 0.5, -m0, 0.5 op_sel:[0,1,0,0] ; encoding: [0x05,0x10,0x48,0xd6,0xf0,0xfa,0xc0,0x43]
+# W64-REAL16: v_fma_f16 v5.l, 0.5, -m0, 0.5 op_sel:[0,1,0,0] ; encoding: [0x05,0x10,0x48,0xd6,0xf0,0xfa,0xc0,0x43]
+# W64-FAKE16: v_fma_f16 v5, 0.5, -m0, 0.5 op_sel:[0,1,0,0] ; encoding: [0x05,0x10,0x48,0xd6,0xf0,0xfa,0xc0,0x43]
 
 0x05,0x22,0x48,0xd6,0xfd,0xd4,0x04,0x23
-# GFX11: v_fma_f16 v5, -src_scc, |vcc_lo|, -1 op_sel:[0,0,1,0] ; encoding: [0x05,0x22,0x48,0xd6,0xfd,0xd4,0x04,0x23]
+# W32-REAL16: v_fma_f16 v5.l, -src_scc, |vcc_lo|, -1 op_sel:[0,0,1,0] ; encoding: [0x05,0x22,0x48,0xd6,0xfd,0xd4,0x04,0x23]
+# W32-FAKE16: v_fma_f16 v5, -src_scc, |vcc_lo|, -1 op_sel:[0,0,1,0] ; encoding: [0x05,0x22,0x48,0xd6,0xfd,0xd4,0x04,0x23]
+# W64-REAL16: v_fma_f16 v5.l, -src_scc, |vcc_lo|, -1 op_sel:[0,0,1,0] ; encoding: [0x05,0x22,0x48,0xd6,0xfd,0xd4,0x04,0x23]
+# W64-FAKE16: v_fma_f16 v5, -src_scc, |vcc_lo|, -1 op_sel:[0,0,1,0] ; encoding: [0x05,0x22,0x48,0xd6,0xfd,0xd4,0x04,0x23]
 
 0xff,0xc3,0x48,0xd6,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00
-# GFX11: v_fma_f16 v255, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp ; encoding: [0xff,0xc3,0x48,0xd6,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00]
+# W32-REAL16: v_fma_f16 v255.h, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp ; encoding: [0xff,0xc3,0x48,0xd6,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00]
+# W32-FAKE16: v_fma_f16 v255, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp ; encoding: [0xff,0xc3,0x48,0xd6,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00]
+# W64-REAL16: v_fma_f16 v255.h, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp ; encoding: [0xff,0xc3,0x48,0xd6,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00]
+# W64-FAKE16: v_fma_f16 v255, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp ; encoding: [0xff,0xc3,0x48,0xd6,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00]
 
 # CHECK: v_fma_f16 v255, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp div:2    ; encoding: [0xff,0xc3,0x48,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00]
 0xff,0xc3,0x48,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00
-# GFX11: v_fma_f16 v255, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp div:2 ; encoding: [0xff,0xc3,0x48,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00]
+# W32-REAL16: v_fma_f16 v255.h, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp div:2 ; encoding: [0xff,0xc3,0x48,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00]
+# W32-FAKE16: v_fma_f16 v255, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp div:2 ; encoding: [0xff,0xc3,0x48,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00]
+# W64-REAL16: v_fma_f16 v255.h, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp div:2 ; encoding: [0xff,0xc3,0x48,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00]
+# W64-FAKE16: v_fma_f16 v255, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp div:2 ; encoding: [0xff,0xc3,0x48,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00]
+
+0x05,0x08,0x48,0xd6,0xff,0x05,0xa4,0x01
+# W32-REAL16: v_fma_f16 v5.l, v255.h, s2, s105 op_sel:[1,0,0,0] ; encoding: [0x05,0x08,0x48,0xd6,0xff,0x05,0xa4,0x01]
+# W32-FAKE16: v_fma_f16 v5, v255, s2, s105 op_sel:[1,0,0,0] ; encoding: [0x05,0x08,0x48,0xd6,0xff,0x05,0xa4,0x01]
+# W64-REAL16: v_fma_f16 v5.l, v255.h, s2, s105 op_sel:[1,0,0,0] ; encoding: [0x05,0x08,0x48,0xd6,0xff,0x05,0xa4,0x01]
+# W64-FAKE16: v_fma_f16 v5, v255, s2, s105 op_sel:[1,0,0,0] ; encoding: [0x05,0x08,0x48,0xd6,0xff,0x05,0xa4,0x01]
+
+0x05,0x10,0x48,0xd6,0x01,0xfe,0xff,0x01
+# W32-REAL16: v_fma_f16 v5.l, s1, v255.h, exec_hi op_sel:[0,1,0,0] ; encoding: [0x05,0x10,0x48,0xd6,0x01,0xfe,0xff,0x01]
+# W32-FAKE16: v_fma_f16 v5, s1, v255, exec_hi op_sel:[0,1,0,0] ; encoding: [0x05,0x10,0x48,0xd6,0x01,0xfe,0xff,0x01]
+# W64-REAL16: v_fma_f16 v5.l, s1, v255.h, exec_hi op_sel:[0,1,0,0] ; encoding: [0x05,0x10,0x48,0xd6,0x01,0xfe,0xff,0x01]
+# W64-FAKE16: v_fma_f16 v5, s1, v255, exec_hi op_sel:[0,1,0,0] ; encoding: [0x05,0x10,0x48,0xd6,0x01,0xfe,0xff,0x01]
+
+0x05,0x20,0x48,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00
+# W32-REAL16: v_fma_f16 v5.l, vcc_hi, 0xfe0b, v255.h op_sel:[0,0,1,0] ; encoding: [0x05,0x20,0x48,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
+# W32-FAKE16: v_fma_f16 v5, vcc_hi, 0xfe0b, v255 op_sel:[0,0,1,0] ; encoding: [0x05,0x20,0x48,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
+# W64-REAL16: v_fma_f16 v5.l, vcc_hi, 0xfe0b, v255.h op_sel:[0,0,1,0] ; encoding: [0x05,0x20,0x48,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
+# W64-FAKE16: v_fma_f16 v5, vcc_hi, 0xfe0b, v255 op_sel:[0,0,1,0] ; encoding: [0x05,0x20,0x48,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
+
+0xff,0xc3,0x48,0xd6,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00
+# W32-REAL16: v_fma_f16 v255.h, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp ; encoding: [0xff,0xc3,0x48,0xd6,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00]
+# W32-FAKE16: v_fma_f16 v255, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp ; encoding: [0xff,0xc3,0x48,0xd6,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00]
+# W64-REAL16: v_fma_f16 v255.h, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp ; encoding: [0xff,0xc3,0x48,0xd6,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00]
+# W64-FAKE16: v_fma_f16 v255, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp ; encoding: [0xff,0xc3,0x48,0xd6,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00]
 
 0x05,0x00,0x13,0xd6,0x01,0x05,0x0e,0x00
 # GFX11: v_fma_f32 v5, v1, v2, s3                ; encoding: [0x05,0x00,0x13,0xd6,0x01,0x05,0x0e,0x00]
@@ -4289,49 +4361,118 @@
 # W64-FAKE16: v_max_u16 v255, 0xfe0b, vcc_hi          ; encoding: [0xff,0x00,0x09,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
 
 0x05,0x00,0x60,0xd6,0x01,0x05,0x0e,0x00
-# GFX11: v_maxmin_f16 v5, v1, v2, s3             ; encoding: [0x05,0x00,0x60,0xd6,0x01,0x05,0x0e,0x00]
+# W32-REAL16: v_maxmin_f16 v5.l, v1.l, v2.l, s3       ; encoding: [0x05,0x00,0x60,0xd6,0x01,0x05,0x0e,0x00]
+# W32-FAKE16: v_maxmin_f16 v5, v1, v2, s3             ; encoding: [0x05,0x00,0x60,0xd6,0x01,0x05,0x0e,0x00]
+# W64-REAL16: v_maxmin_f16 v5.l, v1.l, v2.l, s3       ; encoding: [0x05,0x00,0x60,0xd6,0x01,0x05,0x0e,0x00]
+# W64-FAKE16: v_maxmin_f16 v5, v1, v2, s3             ; encoding: [0x05,0x00,0x60,0xd6,0x01,0x05,0x0e,0x00]
 
 0x05,0x00,0x60,0xd6,0xff,0x05,0xa4,0x01
-# GFX11: v_maxmin_f16 v5, v255, s2, s105         ; encoding: [0x05,0x00,0x60,0xd6,0xff,0x05,0xa4,0x01]
+# W32-REAL16: v_maxmin_f16 v5.l, v255.l, s2, s105     ; encoding: [0x05,0x00,0x60,0xd6,0xff,0x05,0xa4,0x01]
+# W32-FAKE16: v_maxmin_f16 v5, v255, s2, s105         ; encoding: [0x05,0x00,0x60,0xd6,0xff,0x05,0xa4,0x01]
+# W64-REAL16: v_maxmin_f16 v5.l, v255.l, s2, s105     ; encoding: [0x05,0x00,0x60,0xd6,0xff,0x05,0xa4,0x01]
+# W64-FAKE16: v_maxmin_f16 v5, v255, s2, s105         ; encoding: [0x05,0x00,0x60,0xd6,0xff,0x05,0xa4,0x01]
 
 0x05,0x00,0x60,0xd6,0x01,0xfe,0xff,0x01
-# GFX11: v_maxmin_f16 v5, s1, v255, exec_hi      ; encoding: [0x05,0x00,0x60,0xd6,0x01,0xfe,0xff,0x01]
+# W32-REAL16: v_maxmin_f16 v5.l, s1, v255.l, exec_hi  ; encoding: [0x05,0x00,0x60,0xd6,0x01,0xfe,0xff,0x01]
+# W32-FAKE16: v_maxmin_f16 v5, s1, v255, exec_hi      ; encoding: [0x05,0x00,0x60,0xd6,0x01,0xfe,0xff,0x01]
+# W64-REAL16: v_maxmin_f16 v5.l, s1, v255.l, exec_hi  ; encoding: [0x05,0x00,0x60,0xd6,0x01,0xfe,0xff,0x01]
+# W64-FAKE16: v_maxmin_f16 v5, s1, v255, exec_hi      ; encoding: [0x05,0x00,0x60,0xd6,0x01,0xfe,0xff,0x01]
 
 0x05,0x00,0x60,0xd6,0x69,0xd2,0xf8,0x01
-# GFX11: v_maxmin_f16 v5, s105, s105, exec_lo    ; encoding: [0x05,0x00,0x60,0xd6,0x69,0xd2,0xf8,0x01]
+# W32-REAL16: v_maxmin_f16 v5.l, s105, s105, exec_lo  ; encoding: [0x05,0x00,0x60,0xd6,0x69,0xd2,0xf8,0x01]
+# W32-FAKE16: v_maxmin_f16 v5, s105, s105, exec_lo    ; encoding: [0x05,0x00,0x60,0xd6,0x69,0xd2,0xf8,0x01]
+# W64-REAL16: v_maxmin_f16 v5.l, s105, s105, exec_lo  ; encoding: [0x05,0x00,0x60,0xd6,0x69,0xd2,0xf8,0x01]
+# W64-FAKE16: v_maxmin_f16 v5, s105, s105, exec_lo    ; encoding: [0x05,0x00,0x60,0xd6,0x69,0xd2,0xf8,0x01]
 
 0x05,0x00,0x60,0xd6,0x6a,0xf6,0x0c,0x04
-# GFX11: v_maxmin_f16 v5, vcc_lo, ttmp15, v3     ; encoding: [0x05,0x00,0x60,0xd6,0x6a,0xf6,0x0c,0x04]
+# W32-REAL16: v_maxmin_f16 v5.l, vcc_lo, ttmp15, v3.l ; encoding: [0x05,0x00,0x60,0xd6,0x6a,0xf6,0x0c,0x04]
+# W32-FAKE16: v_maxmin_f16 v5, vcc_lo, ttmp15, v3     ; encoding: [0x05,0x00,0x60,0xd6,0x6a,0xf6,0x0c,0x04]
+# W64-REAL16: v_maxmin_f16 v5.l, vcc_lo, ttmp15, v3.l ; encoding: [0x05,0x00,0x60,0xd6,0x6a,0xf6,0x0c,0x04]
+# W64-FAKE16: v_maxmin_f16 v5, vcc_lo, ttmp15, v3     ; encoding: [0x05,0x00,0x60,0xd6,0x6a,0xf6,0x0c,0x04]
 
 0x05,0x00,0x60,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00
-# GFX11: v_maxmin_f16 v5, vcc_hi, 0xfe0b, v255   ; encoding: [0x05,0x00,0x60,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
+# W32-REAL16: v_maxmin_f16 v5.l, vcc_hi, 0xfe0b, v255.l ; encoding: [0x05,0x00,0x60,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
+# W32-FAKE16: v_maxmin_f16 v5, vcc_hi, 0xfe0b, v255   ; encoding: [0x05,0x00,0x60,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
+# W64-REAL16: v_maxmin_f16 v5.l, vcc_hi, 0xfe0b, v255.l ; encoding: [0x05,0x00,0x60,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
+# W64-FAKE16: v_maxmin_f16 v5, vcc_hi, 0xfe0b, v255   ; encoding: [0x05,0x00,0x60,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
 
 0x05,0x07,0x60,0xd6,0x7b,0xfa,0xed,0xe1
-# GFX11: v_maxmin_f16 v5, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x60,0xd6,0x7b,0xfa,0xed,0xe1]
+# W32-REAL16: v_maxmin_f16 v5.l, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x60,0xd6,0x7b,0xfa,0xed,0xe1]
+# W32-FAKE16: v_maxmin_f16 v5, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x60,0xd6,0x7b,0xfa,0xed,0xe1]
+# W64-REAL16: v_maxmin_f16 v5.l, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x60,0xd6,0x7b,0xfa,0xed,0xe1]
+# W64-FAKE16: v_maxmin_f16 v5, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x60,0xd6,0x7b,0xfa,0xed,0xe1]
 
 0x05,0x00,0x60,0xd6,0x7d,0xe0,0xf5,0x01
-# GFX11: v_maxmin_f16 v5, m0, 0.5, m0            ; encoding: [0x05,0x00,0x60,0xd6,0x7d,0xe0,0xf5,0x01]
+# W32-REAL16: v_maxmin_f16 v5.l, m0, 0.5, m0          ; encoding: [0x05,0x00,0x60,0xd6,0x7d,0xe0,0xf5,0x01]
+# W32-FAKE16: v_maxmin_f16 v5, m0, 0.5, m0            ; encoding: [0x05,0x00,0x60,0xd6,0x7d,0xe0,0xf5,0x01]
+# W64-REAL16: v_maxmin_f16 v5.l, m0, 0.5, m0          ; encoding: [0x05,0x00,0x60,0xd6,0x7d,0xe0,0xf5,0x01]
+# W64-FAKE16: v_maxmin_f16 v5, m0, 0.5, m0            ; encoding: [0x05,0x00,0x60,0xd6,0x7d,0xe0,0xf5,0x01]
 
 0x05,0x01,0x60,0xd6,0x7e,0x82,0xad,0x01
-# GFX11: v_maxmin_f16 v5, |exec_lo|, -1, vcc_hi  ; encoding: [0x05,0x01,0x60,0xd6,0x7e,0x82,0xad,0x01]
+# W32-REAL16: v_maxmin_f16 v5.l, |exec_lo|, -1, vcc_hi ; encoding: [0x05,0x01,0x60,0xd6,0x7e,0x82,0xad,0x01]
+# W32-FAKE16: v_maxmin_f16 v5, |exec_lo|, -1, vcc_hi  ; encoding: [0x05,0x01,0x60,0xd6,0x7e,0x82,0xad,0x01]
+# W64-REAL16: v_maxmin_f16 v5.l, |exec_lo|, -1, vcc_hi ; encoding: [0x05,0x01,0x60,0xd6,0x7e,0x82,0xad,0x01]
+# W64-FAKE16: v_maxmin_f16 v5, |exec_lo|, -1, vcc_hi  ; encoding: [0x05,0x01,0x60,0xd6,0x7e,0x82,0xad,0x01]
 
 0x05,0x05,0x60,0xd6,0x7f,0xf8,0xa8,0xa1
-# GFX11: v_maxmin_f16 v5, -|exec_hi|, null, -|vcc_lo| ; encoding: [0x05,0x05,0x60,0xd6,0x7f,0xf8,0xa8,0xa1]
+# W32-REAL16: v_maxmin_f16 v5.l, -|exec_hi|, null, -|vcc_lo| ; encoding: [0x05,0x05,0x60,0xd6,0x7f,0xf8,0xa8,0xa1]
+# W32-FAKE16: v_maxmin_f16 v5, -|exec_hi|, null, -|vcc_lo| ; encoding: [0x05,0x05,0x60,0xd6,0x7f,0xf8,0xa8,0xa1]
+# W64-REAL16: v_maxmin_f16 v5.l, -|exec_hi|, null, -|vcc_lo| ; encoding: [0x05,0x05,0x60,0xd6,0x7f,0xf8,0xa8,0xa1]
+# W64-FAKE16: v_maxmin_f16 v5, -|exec_hi|, null, -|vcc_lo| ; encoding: [0x05,0x05,0x60,0xd6,0x7f,0xf8,0xa8,0xa1]
 
 0x05,0x04,0x60,0xd6,0x7c,0xfc,0xfc,0x83,0x0b,0xfe,0x00,0x00
-# GFX11: v_maxmin_f16 v5, null, exec_lo, -|0xfe0b| ; encoding: [0x05,0x04,0x60,0xd6,0x7c,0xfc,0xfc,0x83,0x0b,0xfe,0x00,0x00]
+# W32-REAL16: v_maxmin_f16 v5.l, null, exec_lo, -|0xfe0b| ; encoding: [0x05,0x04,0x60,0xd6,0x7c,0xfc,0xfc,0x83,0x0b,0xfe,0x00,0x00]
+# W32-FAKE16: v_maxmin_f16 v5, null, exec_lo, -|0xfe0b| ; encoding: [0x05,0x04,0x60,0xd6,0x7c,0xfc,0xfc,0x83,0x0b,0xfe,0x00,0x00]
+# W64-REAL16: v_maxmin_f16 v5.l, null, exec_lo, -|0xfe0b| ; encoding: [0x05,0x04,0x60,0xd6,0x7c,0xfc,0xfc,0x83,0x0b,0xfe,0x00,0x00]
+# W64-FAKE16: v_maxmin_f16 v5, null, exec_lo, -|0xfe0b| ; encoding: [0x05,0x04,0x60,0xd6,0x7c,0xfc,0xfc,0x83,0x0b,0xfe,0x00,0x00]
 
 0x05,0x06,0x60,0xd6,0xc1,0xfe,0xf4,0xc3
-# GFX11: v_maxmin_f16 v5, -1, -|exec_hi|, -|src_scc| ; encoding: [0x05,0x06,0x60,0xd6,0xc1,0xfe,0xf4,0xc3]
+# W32-REAL16: v_maxmin_f16 v5.l, -1, -|exec_hi|, -|src_scc| ; encoding: [0x05,0x06,0x60,0xd6,0xc1,0xfe,0xf4,0xc3]
+# W32-FAKE16: v_maxmin_f16 v5, -1, -|exec_hi|, -|src_scc| ; encoding: [0x05,0x06,0x60,0xd6,0xc1,0xfe,0xf4,0xc3]
+# W64-REAL16: v_maxmin_f16 v5.l, -1, -|exec_hi|, -|src_scc| ; encoding: [0x05,0x06,0x60,0xd6,0xc1,0xfe,0xf4,0xc3]
+# W64-FAKE16: v_maxmin_f16 v5, -1, -|exec_hi|, -|src_scc| ; encoding: [0x05,0x06,0x60,0xd6,0xc1,0xfe,0xf4,0xc3]
 
 0x05,0x00,0x60,0xd6,0xf0,0xfa,0xc0,0x4b
-# GFX11: v_maxmin_f16 v5, 0.5, -m0, 0.5 mul:2    ; encoding: [0x05,0x00,0x60,0xd6,0xf0,0xfa,0xc0,0x4b]
+# W32-REAL16: v_maxmin_f16 v5.l, 0.5, -m0, 0.5 mul:2  ; encoding: [0x05,0x00,0x60,0xd6,0xf0,0xfa,0xc0,0x4b]
+# W32-FAKE16: v_maxmin_f16 v5, 0.5, -m0, 0.5 mul:2    ; encoding: [0x05,0x00,0x60,0xd6,0xf0,0xfa,0xc0,0x4b]
+# W64-REAL16: v_maxmin_f16 v5.l, 0.5, -m0, 0.5 mul:2  ; encoding: [0x05,0x00,0x60,0xd6,0xf0,0xfa,0xc0,0x4b]
+# W64-FAKE16: v_maxmin_f16 v5, 0.5, -m0, 0.5 mul:2    ; encoding: [0x05,0x00,0x60,0xd6,0xf0,0xfa,0xc0,0x4b]
 
 0x05,0x02,0x60,0xd6,0xfd,0xd4,0x04,0x33
-# GFX11: v_maxmin_f16 v5, -src_scc, |vcc_lo|, -1 mul:4 ; encoding: [0x05,0x02,0x60,0xd6,0xfd,0xd4,0x04,0x33]
+# W32-REAL16: v_maxmin_f16 v5.l, -src_scc, |vcc_lo|, -1 mul:4 ; encoding: [0x05,0x02,0x60,0xd6,0xfd,0xd4,0x04,0x33]
+# W32-FAKE16: v_maxmin_f16 v5, -src_scc, |vcc_lo|, -1 mul:4 ; encoding: [0x05,0x02,0x60,0xd6,0xfd,0xd4,0x04,0x33]
+# W64-REAL16: v_maxmin_f16 v5.l, -src_scc, |vcc_lo|, -1 mul:4 ; encoding: [0x05,0x02,0x60,0xd6,0xfd,0xd4,0x04,0x33]
+# W64-FAKE16: v_maxmin_f16 v5, -src_scc, |vcc_lo|, -1 mul:4 ; encoding: [0x05,0x02,0x60,0xd6,0xfd,0xd4,0x04,0x33]
 
 0xff,0x83,0x60,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00
-# GFX11: v_maxmin_f16 v255, -|0xfe0b|, -|vcc_hi|, null clamp div:2 ; encoding: [0xff,0x83,0x60,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00]
+# W32-REAL16: v_maxmin_f16 v255.l, -|0xfe0b|, -|vcc_hi|, null clamp div:2 ; encoding: [0xff,0x83,0x60,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00]
+# W32-FAKE16: v_maxmin_f16 v255, -|0xfe0b|, -|vcc_hi|, null clamp div:2 ; encoding: [0xff,0x83,0x60,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00]
+# W64-REAL16: v_maxmin_f16 v255.l, -|0xfe0b|, -|vcc_hi|, null clamp div:2 ; encoding: [0xff,0x83,0x60,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00]
+# W64-FAKE16: v_maxmin_f16 v255, -|0xfe0b|, -|vcc_hi|, null clamp div:2 ; encoding: [0xff,0x83,0x60,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00]
+
+0x05,0x08,0x60,0xd6,0xff,0x05,0xa4,0x01
+# W32-REAL16: v_maxmin_f16 v5.l, v255.h, s2, s105 op_sel:[1,0,0,0] ; encoding: [0x05,0x08,0x60,0xd6,0xff,0x05,0xa4,0x01]
+# W32-FAKE16: v_maxmin_f16 v5, v255, s2, s105 op_sel:[1,0,0,0] ; encoding: [0x05,0x08,0x60,0xd6,0xff,0x05,0xa4,0x01]
+# W64-REAL16: v_maxmin_f16 v5.l, v255.h, s2, s105 op_sel:[1,0,0,0] ; encoding: [0x05,0x08,0x60,0xd6,0xff,0x05,0xa4,0x01]
+# W64-FAKE16: v_maxmin_f16 v5, v255, s2, s105 op_sel:[1,0,0,0] ; encoding: [0x05,0x08,0x60,0xd6,0xff,0x05,0xa4,0x01]
+
+0x05,0x10,0x60,0xd6,0x01,0xfe,0xff,0x01
+# W32-REAL16: v_maxmin_f16 v5.l, s1, v255.h, exec_hi op_sel:[0,1,0,0] ; encoding: [0x05,0x10,0x60,0xd6,0x01,0xfe,0xff,0x01]
+# W32-FAKE16: v_maxmin_f16 v5, s1, v255, exec_hi op_sel:[0,1,0,0] ; encoding: [0x05,0x10,0x60,0xd6,0x01,0xfe,0xff,0x01]
+# W64-REAL16: v_maxmin_f16 v5.l, s1, v255.h, exec_hi op_sel:[0,1,0,0] ; encoding: [0x05,0x10,0x60,0xd6,0x01,0xfe,0xff,0x01]
+# W64-FAKE16: v_maxmin_f16 v5, s1, v255, exec_hi op_sel:[0,1,0,0] ; encoding: [0x05,0x10,0x60,0xd6,0x01,0xfe,0xff,0x01]
+
+0x05,0x20,0x60,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00
+# W32-REAL16: v_maxmin_f16 v5.l, vcc_hi, 0xfe0b, v255.h op_sel:[0,0,1,0] ; encoding: [0x05,0x20,0x60,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
+# W32-FAKE16: v_maxmin_f16 v5, vcc_hi, 0xfe0b, v255 op_sel:[0,0,1,0] ; encoding: [0x05,0x20,0x60,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
+# W64-REAL16: v_maxmin_f16 v5.l, vcc_hi, 0xfe0b, v255.h op_sel:[0,0,1,0] ; encoding: [0x05,0x20,0x60,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
+# W64-FAKE16: v_maxmin_f16 v5, vcc_hi, 0xfe0b, v255 op_sel:[0,0,1,0] ; encoding: [0x05,0x20,0x60,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
+
+0xff,0xc3,0x60,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00
+# W32-REAL16: v_maxmin_f16 v255.h, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp div:2 ; encoding: [0xff,0xc3,0x60,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00]
+# W32-FAKE16: v_maxmin_f16 v255, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp div:2 ; encoding: [0xff,0xc3,0x60,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00]
+# W64-REAL16: v_maxmin_f16 v255.h, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp div:2 ; encoding: [0xff,0xc3,0x60,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00]
+# W64-FAKE16: v_maxmin_f16 v255, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp div:2 ; encoding: [0xff,0xc3,0x60,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00]
 
 0x05,0x00,0x5e,0xd6,0x01,0x05,0x0e,0x00
 # GFX11: v_maxmin_f32 v5, v1, v2, s3             ; encoding: [0x05,0x00,0x5e,0xd6,0x01,0x05,0x0e,0x00]
@@ -5779,49 +5920,118 @@
 # W64-FAKE16: v_min_u16 v255, 0xfe0b, vcc_hi          ; encoding: [0xff,0x00,0x0b,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
 
 0x05,0x00,0x61,0xd6,0x01,0x05,0x0e,0x00
-# GFX11: v_minmax_f16 v5, v1, v2, s3             ; encoding: [0x05,0x00,0x61,0xd6,0x01,0x05,0x0e,0x00]
+# W32-REAL16: v_minmax_f16 v5.l, v1.l, v2.l, s3       ; encoding: [0x05,0x00,0x61,0xd6,0x01,0x05,0x0e,0x00]
+# W32-FAKE16: v_minmax_f16 v5, v1, v2, s3             ; encoding: [0x05,0x00,0x61,0xd6,0x01,0x05,0x0e,0x00]
+# W64-REAL16: v_minmax_f16 v5.l, v1.l, v2.l, s3       ; encoding: [0x05,0x00,0x61,0xd6,0x01,0x05,0x0e,0x00]
+# W64-FAKE16: v_minmax_f16 v5, v1, v2, s3             ; encoding: [0x05,0x00,0x61,0xd6,0x01,0x05,0x0e,0x00]
 
 0x05,0x00,0x61,0xd6,0xff,0x05,0xa4,0x01
-# GFX11: v_minmax_f16 v5, v255, s2, s105         ; encoding: [0x05,0x00,0x61,0xd6,0xff,0x05,0xa4,0x01]
+# W32-REAL16: v_minmax_f16 v5.l, v255.l, s2, s105     ; encoding: [0x05,0x00,0x61,0xd6,0xff,0x05,0xa4,0x01]
+# W32-FAKE16: v_minmax_f16 v5, v255, s2, s105         ; encoding: [0x05,0x00,0x61,0xd6,0xff,0x05,0xa4,0x01]
+# W64-REAL16: v_minmax_f16 v5.l, v255.l, s2, s105     ; encoding: [0x05,0x00,0x61,0xd6,0xff,0x05,0xa4,0x01]
+# W64-FAKE16: v_minmax_f16 v5, v255, s2, s105         ; encoding: [0x05,0x00,0x61,0xd6,0xff,0x05,0xa4,0x01]
 
 0x05,0x00,0x61,0xd6,0x01,0xfe,0xff,0x01
-# GFX11: v_minmax_f16 v5, s1, v255, exec_hi      ; encoding: [0x05,0x00,0x61,0xd6,0x01,0xfe,0xff,0x01]
+# W32-REAL16: v_minmax_f16 v5.l, s1, v255.l, exec_hi  ; encoding: [0x05,0x00,0x61,0xd6,0x01,0xfe,0xff,0x01]
+# W32-FAKE16: v_minmax_f16 v5, s1, v255, exec_hi      ; encoding: [0x05,0x00,0x61,0xd6,0x01,0xfe,0xff,0x01]
+# W64-REAL16: v_minmax_f16 v5.l, s1, v255.l, exec_hi  ; encoding: [0x05,0x00,0x61,0xd6,0x01,0xfe,0xff,0x01]
+# W64-FAKE16: v_minmax_f16 v5, s1, v255, exec_hi      ; encoding: [0x05,0x00,0x61,0xd6,0x01,0xfe,0xff,0x01]
 
 0x05,0x00,0x61,0xd6,0x69,0xd2,0xf8,0x01
-# GFX11: v_minmax_f16 v5, s105, s105, exec_lo    ; encoding: [0x05,0x00,0x61,0xd6,0x69,0xd2,0xf8,0x01]
+# W32-REAL16: v_minmax_f16 v5.l, s105, s105, exec_lo  ; encoding: [0x05,0x00,0x61,0xd6,0x69,0xd2,0xf8,0x01]
+# W32-FAKE16: v_minmax_f16 v5, s105, s105, exec_lo    ; encoding: [0x05,0x00,0x61,0xd6,0x69,0xd2,0xf8,0x01]
+# W64-REAL16: v_minmax_f16 v5.l, s105, s105, exec_lo  ; encoding: [0x05,0x00,0x61,0xd6,0x69,0xd2,0xf8,0x01]
+# W64-FAKE16: v_minmax_f16 v5, s105, s105, exec_lo    ; encoding: [0x05,0x00,0x61,0xd6,0x69,0xd2,0xf8,0x01]
 
 0x05,0x00,0x61,0xd6,0x6a,0xf6,0x0c,0x04
-# GFX11: v_minmax_f16 v5, vcc_lo, ttmp15, v3     ; encoding: [0x05,0x00,0x61,0xd6,0x6a,0xf6,0x0c,0x04]
+# W32-REAL16: v_minmax_f16 v5.l, vcc_lo, ttmp15, v3.l ; encoding: [0x05,0x00,0x61,0xd6,0x6a,0xf6,0x0c,0x04]
+# W32-FAKE16: v_minmax_f16 v5, vcc_lo, ttmp15, v3     ; encoding: [0x05,0x00,0x61,0xd6,0x6a,0xf6,0x0c,0x04]
+# W64-REAL16: v_minmax_f16 v5.l, vcc_lo, ttmp15, v3.l ; encoding: [0x05,0x00,0x61,0xd6,0x6a,0xf6,0x0c,0x04]
+# W64-FAKE16: v_minmax_f16 v5, vcc_lo, ttmp15, v3     ; encoding: [0x05,0x00,0x61,0xd6,0x6a,0xf6,0x0c,0x04]
 
 0x05,0x00,0x61,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00
-# GFX11: v_minmax_f16 v5, vcc_hi, 0xfe0b, v255   ; encoding: [0x05,0x00,0x61,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
+# W32-REAL16: v_minmax_f16 v5.l, vcc_hi, 0xfe0b, v255.l ; encoding: [0x05,0x00,0x61,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
+# W32-FAKE16: v_minmax_f16 v5, vcc_hi, 0xfe0b, v255   ; encoding: [0x05,0x00,0x61,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
+# W64-REAL16: v_minmax_f16 v5.l, vcc_hi, 0xfe0b, v255.l ; encoding: [0x05,0x00,0x61,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
+# W64-FAKE16: v_minmax_f16 v5, vcc_hi, 0xfe0b, v255   ; encoding: [0x05,0x00,0x61,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
 
 0x05,0x07,0x61,0xd6,0x7b,0xfa,0xed,0xe1
-# GFX11: v_minmax_f16 v5, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x61,0xd6,0x7b,0xfa,0xed,0xe1]
+# W32-REAL16: v_minmax_f16 v5.l, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x61,0xd6,0x7b,0xfa,0xed,0xe1]
+# W32-FAKE16: v_minmax_f16 v5, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x61,0xd6,0x7b,0xfa,0xed,0xe1]
+# W64-REAL16: v_minmax_f16 v5.l, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x61,0xd6,0x7b,0xfa,0xed,0xe1]
+# W64-FAKE16: v_minmax_f16 v5, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x61,0xd6,0x7b,0xfa,0xed,0xe1]
 
 0x05,0x00,0x61,0xd6,0x7d,0xe0,0xf5,0x01
-# GFX11: v_minmax_f16 v5, m0, 0.5, m0            ; encoding: [0x05,0x00,0x61,0xd6,0x7d,0xe0,0xf5,0x01]
+# W32-REAL16: v_minmax_f16 v5.l, m0, 0.5, m0          ; encoding: [0x05,0x00,0x61,0xd6,0x7d,0xe0,0xf5,0x01]
+# W32-FAKE16: v_minmax_f16 v5, m0, 0.5, m0            ; encoding: [0x05,0x00,0x61,0xd6,0x7d,0xe0,0xf5,0x01]
+# W64-REAL16: v_minmax_f16 v5.l, m0, 0.5, m0          ; encoding: [0x05,0x00,0x61,0xd6,0x7d,0xe0,0xf5,0x01]
+# W64-FAKE16: v_minmax_f16 v5, m0, 0.5, m0            ; encoding: [0x05,0x00,0x61,0xd6,0x7d,0xe0,0xf5,0x01]
 
 0x05,0x01,0x61,0xd6,0x7e,0x82,0xad,0x01
-# GFX11: v_minmax_f16 v5, |exec_lo|, -1, vcc_hi  ; encoding: [0x05,0x01,0x61,0xd6,0x7e,0x82,0xad,0x01]
+# W32-REAL16: v_minmax_f16 v5.l, |exec_lo|, -1, vcc_hi ; encoding: [0x05,0x01,0x61,0xd6,0x7e,0x82,0xad,0x01]
+# W32-FAKE16: v_minmax_f16 v5, |exec_lo|, -1, vcc_hi  ; encoding: [0x05,0x01,0x61,0xd6,0x7e,0x82,0xad,0x01]
+# W64-REAL16: v_minmax_f16 v5.l, |exec_lo|, -1, vcc_hi ; encoding: [0x05,0x01,0x61,0xd6,0x7e,0x82,0xad,0x01]
+# W64-FAKE16: v_minmax_f16 v5, |exec_lo|, -1, vcc_hi  ; encoding: [0x05,0x01,0x61,0xd6,0x7e,0x82,0xad,0x01]
 
 0x05,0x05,0x61,0xd6,0x7f,0xf8,0xa8,0xa1
-# GFX11: v_minmax_f16 v5, -|exec_hi|, null, -|vcc_lo| ; encoding: [0x05,0x05,0x61,0xd6,0x7f,0xf8,0xa8,0xa1]
+# W32-REAL16: v_minmax_f16 v5.l, -|exec_hi|, null, -|vcc_lo| ; encoding: [0x05,0x05,0x61,0xd6,0x7f,0xf8,0xa8,0xa1]
+# W32-FAKE16: v_minmax_f16 v5, -|exec_hi|, null, -|vcc_lo| ; encoding: [0x05,0x05,0x61,0xd6,0x7f,0xf8,0xa8,0xa1]
+# W64-REAL16: v_minmax_f16 v5.l, -|exec_hi|, null, -|vcc_lo| ; encoding: [0x05,0x05,0x61,0xd6,0x7f,0xf8,0xa8,0xa1]
+# W64-FAKE16: v_minmax_f16 v5, -|exec_hi|, null, -|vcc_lo| ; encoding: [0x05,0x05,0x61,0xd6,0x7f,0xf8,0xa8,0xa1]
 
 0x05,0x04,0x61,0xd6,0x7c,0xfc,0xfc,0x83,0x0b,0xfe,0x00,0x00
-# GFX11: v_minmax_f16 v5, null, exec_lo, -|0xfe0b| ; encoding: [0x05,0x04,0x61,0xd6,0x7c,0xfc,0xfc,0x83,0x0b,0xfe,0x00,0x00]
+# W32-REAL16: v_minmax_f16 v5.l, null, exec_lo, -|0xfe0b| ; encoding: [0x05,0x04,0x61,0xd6,0x7c,0xfc,0xfc,0x83,0x0b,0xfe,0x00,0x00]
+# W32-FAKE16: v_minmax_f16 v5, null, exec_lo, -|0xfe0b| ; encoding: [0x05,0x04,0x61,0xd6,0x7c,0xfc,0xfc,0x83,0x0b,0xfe,0x00,0x00]
+# W64-REAL16: v_minmax_f16 v5.l, null, exec_lo, -|0xfe0b| ; encoding: [0x05,0x04,0x61,0xd6,0x7c,0xfc,0xfc,0x83,0x0b,0xfe,0x00,0x00]
+# W64-FAKE16: v_minmax_f16 v5, null, exec_lo, -|0xfe0b| ; encoding: [0x05,0x04,0x61,0xd6,0x7c,0xfc,0xfc,0x83,0x0b,0xfe,0x00,0x00]
 
 0x05,0x06,0x61,0xd6,0xc1,0xfe,0xf4,0xc3
-# GFX11: v_minmax_f16 v5, -1, -|exec_hi|, -|src_scc| ; encoding: [0x05,0x06,0x61,0xd6,0xc1,0xfe,0xf4,0xc3]
+# W32-REAL16: v_minmax_f16 v5.l, -1, -|exec_hi|, -|src_scc| ; encoding: [0x05,0x06,0x61,0xd6,0xc1,0xfe,0xf4,0xc3]
+# W32-FAKE16: v_minmax_f16 v5, -1, -|exec_hi|, -|src_scc| ; encoding: [0x05,0x06,0x61,0xd6,0xc1,0xfe,0xf4,0xc3]
+# W64-REAL16: v_minmax_f16 v5.l, -1, -|exec_hi|, -|src_scc| ; encoding: [0x05,0x06,0x61,0xd6,0xc1,0xfe,0xf4,0xc3]
+# W64-FAKE16: v_minmax_f16 v5, -1, -|exec_hi|, -|src_scc| ; encoding: [0x05,0x06,0x61,0xd6,0xc1,0xfe,0xf4,0xc3]
 
 0x05,0x00,0x61,0xd6,0xf0,0xfa,0xc0,0x4b
-# GFX11: v_minmax_f16 v5, 0.5, -m0, 0.5 mul:2    ; encoding: [0x05,0x00,0x61,0xd6,0xf0,0xfa,0xc0,0x4b]
+# W32-REAL16: v_minmax_f16 v5.l, 0.5, -m0, 0.5 mul:2  ; encoding: [0x05,0x00,0x61,0xd6,0xf0,0xfa,0xc0,0x4b]
+# W32-FAKE16: v_minmax_f16 v5, 0.5, -m0, 0.5 mul:2    ; encoding: [0x05,0x00,0x61,0xd6,0xf0,0xfa,0xc0,0x4b]
+# W64-REAL16: v_minmax_f16 v5.l, 0.5, -m0, 0.5 mul:2  ; encoding: [0x05,0x00,0x61,0xd6,0xf0,0xfa,0xc0,0x4b]
+# W64-FAKE16: v_minmax_f16 v5, 0.5, -m0, 0.5 mul:2    ; encoding: [0x05,0x00,0x61,0xd6,0xf0,0xfa,0xc0,0x4b]
 
 0x05,0x02,0x61,0xd6,0xfd,0xd4,0x04,0x33
-# GFX11: v_minmax_f16 v5, -src_scc, |vcc_lo|, -1 mul:4 ; encoding: [0x05,0x02,0x61,0xd6,0xfd,0xd4,0x04,0x33]
+# W32-REAL16: v_minmax_f16 v5.l, -src_scc, |vcc_lo|, -1 mul:4 ; encoding: [0x05,0x02,0x61,0xd6,0xfd,0xd4,0x04,0x33]
+# W32-FAKE16: v_minmax_f16 v5, -src_scc, |vcc_lo|, -1 mul:4 ; encoding: [0x05,0x02,0x61,0xd6,0xfd,0xd4,0x04,0x33]
+# W64-REAL16: v_minmax_f16 v5.l, -src_scc, |vcc_lo|, -1 mul:4 ; encoding: [0x05,0x02,0x61,0xd6,0xfd,0xd4,0x04,0x33]
+# W64-FAKE16: v_minmax_f16 v5, -src_scc, |vcc_lo|, -1 mul:4 ; encoding: [0x05,0x02,0x61,0xd6,0xfd,0xd4,0x04,0x33]
 
 0xff,0x83,0x61,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00
-# GFX11: v_minmax_f16 v255, -|0xfe0b|, -|vcc_hi|, null clamp div:2 ; encoding: [0xff,0x83,0x61,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00]
+# W32-REAL16: v_minmax_f16 v255.l, -|0xfe0b|, -|vcc_hi|, null clamp div:2 ; encoding: [0xff,0x83,0x61,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00]
+# W32-FAKE16: v_minmax_f16 v255, -|0xfe0b|, -|vcc_hi|, null clamp div:2 ; encoding: [0xff,0x83,0x61,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00]
+# W64-REAL16: v_minmax_f16 v255.l, -|0xfe0b|, -|vcc_hi|, null clamp div:2 ; encoding: [0xff,0x83,0x61,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00]
+# W64-FAKE16: v_minmax_f16 v255, -|0xfe0b|, -|vcc_hi|, null clamp div:2 ; encoding: [0xff,0x83,0x61,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00]
+
+0x05,0x08,0x61,0xd6,0xff,0x05,0xa4,0x01
+# W32-REAL16: v_minmax_f16 v5.l, v255.h, s2, s105 op_sel:[1,0,0,0] ; encoding: [0x05,0x08,0x61,0xd6,0xff,0x05,0xa4,0x01]
+# W32-FAKE16: v_minmax_f16 v5, v255, s2, s105 op_sel:[1,0,0,0] ; encoding: [0x05,0x08,0x61,0xd6,0xff,0x05,0xa4,0x01]
+# W64-REAL16: v_minmax_f16 v5.l, v255.h, s2, s105 op_sel:[1,0,0,0] ; encoding: [0x05,0x08,0x61,0xd6,0xff,0x05,0xa4,0x01]
+# W64-FAKE16: v_minmax_f16 v5, v255, s2, s105 op_sel:[1,0,0,0] ; encoding: [0x05,0x08,0x61,0xd6,0xff,0x05,0xa4,0x01]
+
+0x05,0x10,0x61,0xd6,0x01,0xfe,0xff,0x01
+# W32-REAL16: v_minmax_f16 v5.l, s1, v255.h, exec_hi op_sel:[0,1,0,0] ; encoding: [0x05,0x10,0x61,0xd6,0x01,0xfe,0xff,0x01]
+# W32-FAKE16: v_minmax_f16 v5, s1, v255, exec_hi op_sel:[0,1,0,0] ; encoding: [0x05,0x10,0x61,0xd6,0x01,0xfe,0xff,0x01]
+# W64-REAL16: v_minmax_f16 v5.l, s1, v255.h, exec_hi op_sel:[0,1,0,0] ; encoding: [0x05,0x10,0x61,0xd6,0x01,0xfe,0xff,0x01]
+# W64-FAKE16: v_minmax_f16 v5, s1, v255, exec_hi op_sel:[0,1,0,0] ; encoding: [0x05,0x10,0x61,0xd6,0x01,0xfe,0xff,0x01]
+
+0x05,0x20,0x61,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00
+# W32-REAL16: v_minmax_f16 v5.l, vcc_hi, 0xfe0b, v255.h op_sel:[0,0,1,0] ; encoding: [0x05,0x20,0x61,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
+# W32-FAKE16: v_minmax_f16 v5, vcc_hi, 0xfe0b, v255 op_sel:[0,0,1,0] ; encoding: [0x05,0x20,0x61,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
+# W64-REAL16: v_minmax_f16 v5.l, vcc_hi, 0xfe0b, v255.h op_sel:[0,0,1,0] ; encoding: [0x05,0x20,0x61,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
+# W64-FAKE16: v_minmax_f16 v5, vcc_hi, 0xfe0b, v255 op_sel:[0,0,1,0] ; encoding: [0x05,0x20,0x61,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
+
+0xff,0xc3,0x61,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00
+# W32-REAL16: v_minmax_f16 v255.h, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp div:2 ; encoding: [0xff,0xc3,0x61,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00]
+# W32-FAKE16: v_minmax_f16 v255, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp div:2 ; encoding: [0xff,0xc3,0x61,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00]
+# W64-REAL16: v_minmax_f16 v255.h, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp div:2 ; encoding: [0xff,0xc3,0x61,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00]
+# W64-FAKE16: v_minmax_f16 v255, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp div:2 ; encoding: [0xff,0xc3,0x61,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00]
 
 0x05,0x00,0x5f,0xd6,0x01,0x05,0x0e,0x00
 # GFX11: v_minmax_f32 v5, v1, v2, s3             ; encoding: [0x05,0x00,0x5f,0xd6,0x01,0x05,0x0e,0x00]
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp16.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp16.txt
index d734cd2..2964360 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp16.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp16.txt
@@ -2113,46 +2113,118 @@
 # W64-FAKE16: v_max_u16_e64_dpp v255, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x00,0x09,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
 
 0x05,0x00,0x60,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff
-# GFX11: v_maxmin_f16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x60,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+# W32-REAL16: v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x60,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+# W32-FAKE16: v_maxmin_f16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x60,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+# W64-REAL16: v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x60,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+# W64-FAKE16: v_maxmin_f16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x60,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
 
 0x05,0x00,0x60,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff
-# GFX11: v_maxmin_f16_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x60,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
+# W32-REAL16: v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x60,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
+# W32-FAKE16: v_maxmin_f16_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x60,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
+# W64-REAL16: v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x60,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
+# W64-FAKE16: v_maxmin_f16_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x60,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
 
 0x05,0x00,0x60,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff
-# GFX11: v_maxmin_f16_e64_dpp v5, v1, v2, v3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x60,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff]
+# W32-REAL16: v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, v3.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x60,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff]
+# W32-FAKE16: v_maxmin_f16_e64_dpp v5, v1, v2, v3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x60,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff]
+# W64-REAL16: v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, v3.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x60,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff]
+# W64-FAKE16: v_maxmin_f16_e64_dpp v5, v1, v2, v3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x60,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff]
 
 0x05,0x00,0x60,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff
-# GFX11: v_maxmin_f16_e64_dpp v5, v1, v2, v255 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x60,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff]
+# W32-REAL16: v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, v255.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x60,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff]
+# W32-FAKE16: v_maxmin_f16_e64_dpp v5, v1, v2, v255 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x60,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff]
+# W64-REAL16: v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, v255.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x60,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff]
+# W64-FAKE16: v_maxmin_f16_e64_dpp v5, v1, v2, v255 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x60,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff]
 
 0x05,0x00,0x60,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff
-# GFX11: v_maxmin_f16_e64_dpp v5, v1, v2, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x60,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff]
+# W32-REAL16: v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x60,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff]
+# W32-FAKE16: v_maxmin_f16_e64_dpp v5, v1, v2, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x60,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff]
+# W64-REAL16: v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x60,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff]
+# W64-FAKE16: v_maxmin_f16_e64_dpp v5, v1, v2, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x60,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff]
 
 0x05,0x00,0x60,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff
-# GFX11: v_maxmin_f16_e64_dpp v5, v1, v2, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x60,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff]
+# W32-REAL16: v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x60,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff]
+# W32-FAKE16: v_maxmin_f16_e64_dpp v5, v1, v2, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x60,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff]
+# W64-REAL16: v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x60,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff]
+# W64-FAKE16: v_maxmin_f16_e64_dpp v5, v1, v2, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x60,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff]
 
 0x05,0x00,0x60,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff
-# GFX11: v_maxmin_f16_e64_dpp v5, v1, v2, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x60,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff]
+# W32-REAL16: v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x60,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff]
+# W32-FAKE16: v_maxmin_f16_e64_dpp v5, v1, v2, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x60,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff]
+# W64-REAL16: v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x60,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff]
+# W64-FAKE16: v_maxmin_f16_e64_dpp v5, v1, v2, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x60,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff]
 
 0x05,0x01,0x60,0xd6,0xfa,0x04,0xee,0x81,0x01,0x1f,0x01,0xff
-# GFX11: v_maxmin_f16_e64_dpp v5, |v1|, v2, -ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x60,0xd6,0xfa,0x04,0xee,0x81,0x01,0x1f,0x01,0xff]
+# W32-REAL16: v_maxmin_f16_e64_dpp v5.l, |v1.l|, v2.l, -ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x60,0xd6,0xfa,0x04,0xee,0x81,0x01,0x1f,0x01,0xff]
+# W32-FAKE16: v_maxmin_f16_e64_dpp v5, |v1|, v2, -ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x60,0xd6,0xfa,0x04,0xee,0x81,0x01,0x1f,0x01,0xff]
+# W64-REAL16: v_maxmin_f16_e64_dpp v5.l, |v1.l|, v2.l, -ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x60,0xd6,0xfa,0x04,0xee,0x81,0x01,0x1f,0x01,0xff]
+# W64-FAKE16: v_maxmin_f16_e64_dpp v5, |v1|, v2, -ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x60,0xd6,0xfa,0x04,0xee,0x81,0x01,0x1f,0x01,0xff]
 
 0x05,0x02,0x60,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff
-# GFX11: v_maxmin_f16_e64_dpp v5, v1, -|v2|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x60,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff]
+# W32-REAL16: v_maxmin_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x60,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff]
+# W32-FAKE16: v_maxmin_f16_e64_dpp v5, v1, -|v2|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x60,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff]
+# W64-REAL16: v_maxmin_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x60,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff]
+# W64-FAKE16: v_maxmin_f16_e64_dpp v5, v1, -|v2|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x60,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff]
 
 0x05,0x04,0x60,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff
-# GFX11: v_maxmin_f16_e64_dpp v5, -v1, v2, |exec_lo| row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x04,0x60,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff]
+# W32-REAL16: v_maxmin_f16_e64_dpp v5.l, -v1.l, v2.l, |exec_lo| row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x04,0x60,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff]
+# W32-FAKE16: v_maxmin_f16_e64_dpp v5, -v1, v2, |exec_lo| row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x04,0x60,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff]
+# W64-REAL16: v_maxmin_f16_e64_dpp v5.l, -v1.l, v2.l, |exec_lo| row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x04,0x60,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff]
+# W64-FAKE16: v_maxmin_f16_e64_dpp v5, -v1, v2, |exec_lo| row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x04,0x60,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff]
 
 0x05,0x03,0x60,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff
-# GFX11: v_maxmin_f16_e64_dpp v5, -|v1|, -|v2|, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x03,0x60,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff]
+# W32-REAL16: v_maxmin_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x03,0x60,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff]
+# W32-FAKE16: v_maxmin_f16_e64_dpp v5, -|v1|, -|v2|, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x03,0x60,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff]
+# W64-REAL16: v_maxmin_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x03,0x60,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff]
+# W64-FAKE16: v_maxmin_f16_e64_dpp v5, -|v1|, -|v2|, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x03,0x60,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff]
 
 0x05,0x05,0x60,0xd6,0xfa,0x04,0x06,0xab,0x01,0x5f,0x01,0x01
-# GFX11: v_maxmin_f16_e64_dpp v5, -|v1|, v2, -|-1| mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x05,0x60,0xd6,0xfa,0x04,0x06,0xab,0x01,0x5f,0x01,0x01]
+# W32-REAL16: v_maxmin_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|-1| mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x05,0x60,0xd6,0xfa,0x04,0x06,0xab,0x01,0x5f,0x01,0x01]
+# W32-FAKE16: v_maxmin_f16_e64_dpp v5, -|v1|, v2, -|-1| mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x05,0x60,0xd6,0xfa,0x04,0x06,0xab,0x01,0x5f,0x01,0x01]
+# W64-REAL16: v_maxmin_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|-1| mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x05,0x60,0xd6,0xfa,0x04,0x06,0xab,0x01,0x5f,0x01,0x01]
+# W64-FAKE16: v_maxmin_f16_e64_dpp v5, -|v1|, v2, -|-1| mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x05,0x60,0xd6,0xfa,0x04,0x06,0xab,0x01,0x5f,0x01,0x01]
 
 0x05,0x06,0x60,0xd6,0xfa,0x04,0xc2,0xd3,0x01,0x60,0x01,0x13
-# GFX11: v_maxmin_f16_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x06,0x60,0xd6,0xfa,0x04,0xc2,0xd3,0x01,0x60,0x01,0x13]
+# W32-REAL16: v_maxmin_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x06,0x60,0xd6,0xfa,0x04,0xc2,0xd3,0x01,0x60,0x01,0x13]
+# W32-FAKE16: v_maxmin_f16_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x06,0x60,0xd6,0xfa,0x04,0xc2,0xd3,0x01,0x60,0x01,0x13]
+# W64-REAL16: v_maxmin_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x06,0x60,0xd6,0xfa,0x04,0xc2,0xd3,0x01,0x60,0x01,0x13]
+# W64-FAKE16: v_maxmin_f16_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x06,0x60,0xd6,0xfa,0x04,0xc2,0xd3,0x01,0x60,0x01,0x13]
 
 0xff,0x87,0x60,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x0d,0x30
-# GFX11: v_maxmin_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x87,0x60,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x0d,0x30]
+# W32-REAL16: v_maxmin_f16_e64_dpp v255.l, -|v255.l|, -|v255.l|, -|src_scc| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x87,0x60,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x0d,0x30]
+# W32-FAKE16: v_maxmin_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x87,0x60,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x0d,0x30]
+# W64-REAL16: v_maxmin_f16_e64_dpp v255.l, -|v255.l|, -|v255.l|, -|src_scc| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x87,0x60,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x0d,0x30]
+# W64-FAKE16: v_maxmin_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x87,0x60,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x0d,0x30]
+
+0x05,0x78,0x60,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff
+# W32-REAL16: v_maxmin_f16_e64_dpp v5.h, v1.h, v2.h, v3.h op_sel:[1,1,1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x78,0x60,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+# W32-FAKE16: v_maxmin_f16_e64_dpp v5, v1, v2, v3 op_sel:[1,1,1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x78,0x60,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+# W64-REAL16: v_maxmin_f16_e64_dpp v5.h, v1.h, v2.h, v3.h op_sel:[1,1,1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x78,0x60,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+# W64-FAKE16: v_maxmin_f16_e64_dpp v5, v1, v2, v3 op_sel:[1,1,1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x78,0x60,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+
+0x05,0x20,0x60,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff
+# W32-REAL16: v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, v255.h op_sel:[0,0,1,0] quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x20,0x60,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff]
+# W32-FAKE16: v_maxmin_f16_e64_dpp v5, v1, v2, v255 op_sel:[0,0,1,0] quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x20,0x60,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff]
+# W64-REAL16: v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, v255.h op_sel:[0,0,1,0] quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x20,0x60,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff]
+# W64-FAKE16: v_maxmin_f16_e64_dpp v5, v1, v2, v255 op_sel:[0,0,1,0] quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x20,0x60,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff]
+
+0x05,0x0a,0x60,0xd6,0xfa,0x04,0x06,0x2b,0x01,0x5f,0x01,0x01
+# W32-REAL16: v_maxmin_f16_e64_dpp v5.l, -v1.h, |v2.l|, -1 op_sel:[1,0,0,0] mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x0a,0x60,0xd6,0xfa,0x04,0x06,0x2b,0x01,0x5f,0x01,0x01]
+# W32-FAKE16: v_maxmin_f16_e64_dpp v5, -v1, |v2|, -1 op_sel:[1,0,0,0] mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x0a,0x60,0xd6,0xfa,0x04,0x06,0x2b,0x01,0x5f,0x01,0x01]
+# W64-REAL16: v_maxmin_f16_e64_dpp v5.l, -v1.h, |v2.l|, -1 op_sel:[1,0,0,0] mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x0a,0x60,0xd6,0xfa,0x04,0x06,0x2b,0x01,0x5f,0x01,0x01]
+# W64-FAKE16: v_maxmin_f16_e64_dpp v5, -v1, |v2|, -1 op_sel:[1,0,0,0] mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x0a,0x60,0xd6,0xfa,0x04,0x06,0x2b,0x01,0x5f,0x01,0x01]
+
+0x05,0x13,0x60,0xd6,0xfa,0x04,0xc2,0x73,0x01,0x60,0x01,0x13
+# W32-REAL16: v_maxmin_f16_e64_dpp v5.l, -|v1.l|, -|v2.h|, 0.5 op_sel:[0,1,0,0] mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x13,0x60,0xd6,0xfa,0x04,0xc2,0x73,0x01,0x60,0x01,0x13]
+# W32-FAKE16: v_maxmin_f16_e64_dpp v5, -|v1|, -|v2|, 0.5 op_sel:[0,1,0,0] mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x13,0x60,0xd6,0xfa,0x04,0xc2,0x73,0x01,0x60,0x01,0x13]
+# W64-REAL16: v_maxmin_f16_e64_dpp v5.l, -|v1.l|, -|v2.h|, 0.5 op_sel:[0,1,0,0] mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x13,0x60,0xd6,0xfa,0x04,0xc2,0x73,0x01,0x60,0x01,0x13]
+# W64-FAKE16: v_maxmin_f16_e64_dpp v5, -|v1|, -|v2|, 0.5 op_sel:[0,1,0,0] mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x13,0x60,0xd6,0xfa,0x04,0xc2,0x73,0x01,0x60,0x01,0x13]
+
+0xff,0xc7,0x60,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x0d,0x30
+# W32-REAL16: v_maxmin_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc7,0x60,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x0d,0x30]
+# W32-FAKE16: v_maxmin_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc7,0x60,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x0d,0x30]
+# W64-REAL16: v_maxmin_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc7,0x60,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x0d,0x30]
+# W64-FAKE16: v_maxmin_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc7,0x60,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x0d,0x30]
 
 0x05,0x00,0x5e,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff
 # GFX11: v_maxmin_f32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5e,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
@@ -2833,46 +2905,118 @@
 # W64-FAKE16: v_min_u16_e64_dpp v255, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x00,0x0b,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
 
 0x05,0x00,0x61,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff
-# GFX11: v_minmax_f16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x61,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+# W32-REAL16: v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x61,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+# W32-FAKE16: v_minmax_f16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x61,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+# W64-REAL16: v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x61,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+# W64-FAKE16: v_minmax_f16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x61,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
 
 0x05,0x00,0x61,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff
-# GFX11: v_minmax_f16_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x61,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
+# W32-REAL16: v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x61,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
+# W32-FAKE16: v_minmax_f16_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x61,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
+# W64-REAL16: v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x61,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
+# W64-FAKE16: v_minmax_f16_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x61,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
 
 0x05,0x00,0x61,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff
-# GFX11: v_minmax_f16_e64_dpp v5, v1, v2, v3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x61,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff]
+# W32-REAL16: v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, v3.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x61,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff]
+# W32-FAKE16: v_minmax_f16_e64_dpp v5, v1, v2, v3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x61,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff]
+# W64-REAL16: v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, v3.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x61,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff]
+# W64-FAKE16: v_minmax_f16_e64_dpp v5, v1, v2, v3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x61,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff]
 
 0x05,0x00,0x61,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff
-# GFX11: v_minmax_f16_e64_dpp v5, v1, v2, v255 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x61,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff]
+# W32-REAL16: v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, v255.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x61,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff]
+# W32-FAKE16: v_minmax_f16_e64_dpp v5, v1, v2, v255 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x61,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff]
+# W64-REAL16: v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, v255.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x61,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff]
+# W64-FAKE16: v_minmax_f16_e64_dpp v5, v1, v2, v255 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x61,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff]
 
 0x05,0x00,0x61,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff
-# GFX11: v_minmax_f16_e64_dpp v5, v1, v2, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x61,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff]
+# W32-REAL16: v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x61,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff]
+# W32-FAKE16: v_minmax_f16_e64_dpp v5, v1, v2, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x61,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff]
+# W64-REAL16: v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x61,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff]
+# W64-FAKE16: v_minmax_f16_e64_dpp v5, v1, v2, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x61,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff]
 
 0x05,0x00,0x61,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff
-# GFX11: v_minmax_f16_e64_dpp v5, v1, v2, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x61,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff]
+# W32-REAL16: v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x61,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff]
+# W32-FAKE16: v_minmax_f16_e64_dpp v5, v1, v2, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x61,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff]
+# W64-REAL16: v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x61,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff]
+# W64-FAKE16: v_minmax_f16_e64_dpp v5, v1, v2, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x61,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff]
 
 0x05,0x00,0x61,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff
-# GFX11: v_minmax_f16_e64_dpp v5, v1, v2, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x61,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff]
+# W32-REAL16: v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x61,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff]
+# W32-FAKE16: v_minmax_f16_e64_dpp v5, v1, v2, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x61,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff]
+# W64-REAL16: v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x61,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff]
+# W64-FAKE16: v_minmax_f16_e64_dpp v5, v1, v2, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x61,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff]
 
 0x05,0x01,0x61,0xd6,0xfa,0x04,0xee,0x81,0x01,0x1f,0x01,0xff
-# GFX11: v_minmax_f16_e64_dpp v5, |v1|, v2, -ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x61,0xd6,0xfa,0x04,0xee,0x81,0x01,0x1f,0x01,0xff]
+# W32-REAL16: v_minmax_f16_e64_dpp v5.l, |v1.l|, v2.l, -ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x61,0xd6,0xfa,0x04,0xee,0x81,0x01,0x1f,0x01,0xff]
+# W32-FAKE16: v_minmax_f16_e64_dpp v5, |v1|, v2, -ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x61,0xd6,0xfa,0x04,0xee,0x81,0x01,0x1f,0x01,0xff]
+# W64-REAL16: v_minmax_f16_e64_dpp v5.l, |v1.l|, v2.l, -ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x61,0xd6,0xfa,0x04,0xee,0x81,0x01,0x1f,0x01,0xff]
+# W64-FAKE16: v_minmax_f16_e64_dpp v5, |v1|, v2, -ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x61,0xd6,0xfa,0x04,0xee,0x81,0x01,0x1f,0x01,0xff]
 
 0x05,0x02,0x61,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff
-# GFX11: v_minmax_f16_e64_dpp v5, v1, -|v2|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x61,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff]
+# W32-REAL16: v_minmax_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x61,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff]
+# W32-FAKE16: v_minmax_f16_e64_dpp v5, v1, -|v2|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x61,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff]
+# W64-REAL16: v_minmax_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x61,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff]
+# W64-FAKE16: v_minmax_f16_e64_dpp v5, v1, -|v2|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x61,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff]
 
 0x05,0x04,0x61,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff
-# GFX11: v_minmax_f16_e64_dpp v5, -v1, v2, |exec_lo| row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x04,0x61,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff]
+# W32-REAL16: v_minmax_f16_e64_dpp v5.l, -v1.l, v2.l, |exec_lo| row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x04,0x61,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff]
+# W32-FAKE16: v_minmax_f16_e64_dpp v5, -v1, v2, |exec_lo| row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x04,0x61,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff]
+# W64-REAL16: v_minmax_f16_e64_dpp v5.l, -v1.l, v2.l, |exec_lo| row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x04,0x61,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff]
+# W64-FAKE16: v_minmax_f16_e64_dpp v5, -v1, v2, |exec_lo| row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x04,0x61,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff]
 
 0x05,0x03,0x61,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff
-# GFX11: v_minmax_f16_e64_dpp v5, -|v1|, -|v2|, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x03,0x61,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff]
+# W32-REAL16: v_minmax_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x03,0x61,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff]
+# W32-FAKE16: v_minmax_f16_e64_dpp v5, -|v1|, -|v2|, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x03,0x61,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff]
+# W64-REAL16: v_minmax_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x03,0x61,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff]
+# W64-FAKE16: v_minmax_f16_e64_dpp v5, -|v1|, -|v2|, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x03,0x61,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff]
 
 0x05,0x05,0x61,0xd6,0xfa,0x04,0x06,0xab,0x01,0x5f,0x01,0x01
-# GFX11: v_minmax_f16_e64_dpp v5, -|v1|, v2, -|-1| mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x05,0x61,0xd6,0xfa,0x04,0x06,0xab,0x01,0x5f,0x01,0x01]
+# W32-REAL16: v_minmax_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|-1| mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x05,0x61,0xd6,0xfa,0x04,0x06,0xab,0x01,0x5f,0x01,0x01]
+# W32-FAKE16: v_minmax_f16_e64_dpp v5, -|v1|, v2, -|-1| mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x05,0x61,0xd6,0xfa,0x04,0x06,0xab,0x01,0x5f,0x01,0x01]
+# W64-REAL16: v_minmax_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|-1| mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x05,0x61,0xd6,0xfa,0x04,0x06,0xab,0x01,0x5f,0x01,0x01]
+# W64-FAKE16: v_minmax_f16_e64_dpp v5, -|v1|, v2, -|-1| mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x05,0x61,0xd6,0xfa,0x04,0x06,0xab,0x01,0x5f,0x01,0x01]
 
 0x05,0x06,0x61,0xd6,0xfa,0x04,0xc2,0xd3,0x01,0x60,0x01,0x13
-# GFX11: v_minmax_f16_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x06,0x61,0xd6,0xfa,0x04,0xc2,0xd3,0x01,0x60,0x01,0x13]
+# W32-REAL16: v_minmax_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x06,0x61,0xd6,0xfa,0x04,0xc2,0xd3,0x01,0x60,0x01,0x13]
+# W32-FAKE16: v_minmax_f16_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x06,0x61,0xd6,0xfa,0x04,0xc2,0xd3,0x01,0x60,0x01,0x13]
+# W64-REAL16: v_minmax_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x06,0x61,0xd6,0xfa,0x04,0xc2,0xd3,0x01,0x60,0x01,0x13]
+# W64-FAKE16: v_minmax_f16_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x06,0x61,0xd6,0xfa,0x04,0xc2,0xd3,0x01,0x60,0x01,0x13]
 
 0xff,0x87,0x61,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x0d,0x30
-# GFX11: v_minmax_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x87,0x61,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x0d,0x30]
+# W32-REAL16: v_minmax_f16_e64_dpp v255.l, -|v255.l|, -|v255.l|, -|src_scc| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x87,0x61,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x0d,0x30]
+# W32-FAKE16: v_minmax_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x87,0x61,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x0d,0x30]
+# W64-REAL16: v_minmax_f16_e64_dpp v255.l, -|v255.l|, -|v255.l|, -|src_scc| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x87,0x61,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x0d,0x30]
+# W64-FAKE16: v_minmax_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x87,0x61,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x0d,0x30]
+
+0x05,0x78,0x61,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff
+# W32-REAL16: v_minmax_f16_e64_dpp v5.h, v1.h, v2.h, v3.h op_sel:[1,1,1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x78,0x61,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+# W32-FAKE16: v_minmax_f16_e64_dpp v5, v1, v2, v3 op_sel:[1,1,1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x78,0x61,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+# W64-REAL16: v_minmax_f16_e64_dpp v5.h, v1.h, v2.h, v3.h op_sel:[1,1,1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x78,0x61,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+# W64-FAKE16: v_minmax_f16_e64_dpp v5, v1, v2, v3 op_sel:[1,1,1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x78,0x61,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+
+0x05,0x20,0x61,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff
+# W32-REAL16: v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, v255.h op_sel:[0,0,1,0] quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x20,0x61,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff]
+# W32-FAKE16: v_minmax_f16_e64_dpp v5, v1, v2, v255 op_sel:[0,0,1,0] quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x20,0x61,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff]
+# W64-REAL16: v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, v255.h op_sel:[0,0,1,0] quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x20,0x61,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff]
+# W64-FAKE16: v_minmax_f16_e64_dpp v5, v1, v2, v255 op_sel:[0,0,1,0] quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x20,0x61,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff]
+
+0x05,0x0a,0x61,0xd6,0xfa,0x04,0x06,0x2b,0x01,0x5f,0x01,0x01
+# W32-REAL16: v_minmax_f16_e64_dpp v5.l, -v1.h, |v2.l|, -1 op_sel:[1,0,0,0] mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x0a,0x61,0xd6,0xfa,0x04,0x06,0x2b,0x01,0x5f,0x01,0x01]
+# W32-FAKE16: v_minmax_f16_e64_dpp v5, -v1, |v2|, -1 op_sel:[1,0,0,0] mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x0a,0x61,0xd6,0xfa,0x04,0x06,0x2b,0x01,0x5f,0x01,0x01]
+# W64-REAL16: v_minmax_f16_e64_dpp v5.l, -v1.h, |v2.l|, -1 op_sel:[1,0,0,0] mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x0a,0x61,0xd6,0xfa,0x04,0x06,0x2b,0x01,0x5f,0x01,0x01]
+# W64-FAKE16: v_minmax_f16_e64_dpp v5, -v1, |v2|, -1 op_sel:[1,0,0,0] mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x0a,0x61,0xd6,0xfa,0x04,0x06,0x2b,0x01,0x5f,0x01,0x01]
+
+0x05,0x13,0x61,0xd6,0xfa,0x04,0xc2,0x73,0x01,0x60,0x01,0x13
+# W32-REAL16: v_minmax_f16_e64_dpp v5.l, -|v1.l|, -|v2.h|, 0.5 op_sel:[0,1,0,0] mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x13,0x61,0xd6,0xfa,0x04,0xc2,0x73,0x01,0x60,0x01,0x13]
+# W32-FAKE16: v_minmax_f16_e64_dpp v5, -|v1|, -|v2|, 0.5 op_sel:[0,1,0,0] mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x13,0x61,0xd6,0xfa,0x04,0xc2,0x73,0x01,0x60,0x01,0x13]
+# W64-REAL16: v_minmax_f16_e64_dpp v5.l, -|v1.l|, -|v2.h|, 0.5 op_sel:[0,1,0,0] mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x13,0x61,0xd6,0xfa,0x04,0xc2,0x73,0x01,0x60,0x01,0x13]
+# W64-FAKE16: v_minmax_f16_e64_dpp v5, -|v1|, -|v2|, 0.5 op_sel:[0,1,0,0] mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x13,0x61,0xd6,0xfa,0x04,0xc2,0x73,0x01,0x60,0x01,0x13]
+
+0xff,0xc7,0x61,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x0d,0x30
+# W32-REAL16: v_minmax_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc7,0x61,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x0d,0x30]
+# W32-FAKE16: v_minmax_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc7,0x61,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x0d,0x30]
+# W64-REAL16: v_minmax_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc7,0x61,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x0d,0x30]
+# W64-FAKE16: v_minmax_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc7,0x61,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x0d,0x30]
 
 0x05,0x00,0x5f,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff
 # GFX11: v_minmax_f32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5f,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
@@ -4431,46 +4575,130 @@
 # W64-FAKE16: v_div_fixup_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc7,0x54,0xd6,0xfa,0xfe,0xf7,0xe3,0xff,0x6f,0x0d,0x30]
 
 0x05,0x00,0x48,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff
-# GFX11: v_fma_f16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+# W32-REAL16: v_fma_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+# W32-FAKE16: v_fma_f16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+# W64-REAL16: v_fma_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+# W64-FAKE16: v_fma_f16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
 
 0x05,0x00,0x48,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff
-# GFX11: v_fma_f16_e64_dpp v5, v1, v2, v255 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff]
+# W32-REAL16: v_fma_f16_e64_dpp v5.l, v1.l, v2.l, v255.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff]
+# W32-FAKE16: v_fma_f16_e64_dpp v5, v1, v2, v255 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff]
+# W64-REAL16: v_fma_f16_e64_dpp v5.l, v1.l, v2.l, v255.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff]
+# W64-FAKE16: v_fma_f16_e64_dpp v5, v1, v2, v255 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff]
 
 0x05,0x00,0x48,0xd6,0xfa,0x04,0x0e,0x00,0x01,0x40,0x01,0xff
-# GFX11: v_fma_f16_e64_dpp v5, v1, v2, s3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x04,0x0e,0x00,0x01,0x40,0x01,0xff]
+# W32-REAL16: v_fma_f16_e64_dpp v5.l, v1.l, v2.l, s3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x04,0x0e,0x00,0x01,0x40,0x01,0xff]
+# W32-FAKE16: v_fma_f16_e64_dpp v5, v1, v2, s3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x04,0x0e,0x00,0x01,0x40,0x01,0xff]
+# W64-REAL16: v_fma_f16_e64_dpp v5.l, v1.l, v2.l, s3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x04,0x0e,0x00,0x01,0x40,0x01,0xff]
+# W64-FAKE16: v_fma_f16_e64_dpp v5, v1, v2, s3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x04,0x0e,0x00,0x01,0x40,0x01,0xff]
 
 0x05,0x00,0x48,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x41,0x01,0xff
-# GFX11: v_fma_f16_e64_dpp v5, v1, v2, s105 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x41,0x01,0xff]
+# W32-REAL16: v_fma_f16_e64_dpp v5.l, v1.l, v2.l, s105 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x41,0x01,0xff]
+# W32-FAKE16: v_fma_f16_e64_dpp v5, v1, v2, s105 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x41,0x01,0xff]
+# W64-REAL16: v_fma_f16_e64_dpp v5.l, v1.l, v2.l, s105 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x41,0x01,0xff]
+# W64-FAKE16: v_fma_f16_e64_dpp v5, v1, v2, s105 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x41,0x01,0xff]
 
 0x05,0x00,0x48,0xd6,0xfa,0x04,0xee,0x01,0x01,0x01,0x01,0xff
-# GFX11: v_fma_f16_e64_dpp v5, v1, v2, ttmp15 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x04,0xee,0x01,0x01,0x01,0x01,0xff]
+# W32-REAL16: v_fma_f16_e64_dpp v5.l, v1.l, v2.l, ttmp15 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x04,0xee,0x01,0x01,0x01,0x01,0xff]
+# W32-FAKE16: v_fma_f16_e64_dpp v5, v1, v2, ttmp15 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x04,0xee,0x01,0x01,0x01,0x01,0xff]
+# W64-REAL16: v_fma_f16_e64_dpp v5.l, v1.l, v2.l, ttmp15 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x04,0xee,0x01,0x01,0x01,0x01,0xff]
+# W64-FAKE16: v_fma_f16_e64_dpp v5, v1, v2, ttmp15 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x04,0xee,0x01,0x01,0x01,0x01,0xff]
 
 0x05,0x00,0x48,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff
-# GFX11: v_fma_f16_e64_dpp v5, v1, v2, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff]
+# W32-REAL16: v_fma_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff]
+# W32-FAKE16: v_fma_f16_e64_dpp v5, v1, v2, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff]
+# W64-REAL16: v_fma_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff]
+# W64-FAKE16: v_fma_f16_e64_dpp v5, v1, v2, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff]
 
 0x05,0x00,0x48,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff
-# GFX11: v_fma_f16_e64_dpp v5, v1, v2, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff]
+# W32-REAL16: v_fma_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff]
+# W32-FAKE16: v_fma_f16_e64_dpp v5, v1, v2, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff]
+# W64-REAL16: v_fma_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff]
+# W64-FAKE16: v_fma_f16_e64_dpp v5, v1, v2, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff]
 
 0x05,0x01,0x48,0xd6,0xfa,0x04,0xf6,0x81,0x01,0x1f,0x01,0xff
-# GFX11: v_fma_f16_e64_dpp v5, |v1|, v2, -m0 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x48,0xd6,0xfa,0x04,0xf6,0x81,0x01,0x1f,0x01,0xff]
+# W32-REAL16: v_fma_f16_e64_dpp v5.l, |v1.l|, v2.l, -m0 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x48,0xd6,0xfa,0x04,0xf6,0x81,0x01,0x1f,0x01,0xff]
+# W32-FAKE16: v_fma_f16_e64_dpp v5, |v1|, v2, -m0 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x48,0xd6,0xfa,0x04,0xf6,0x81,0x01,0x1f,0x01,0xff]
+# W64-REAL16: v_fma_f16_e64_dpp v5.l, |v1.l|, v2.l, -m0 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x48,0xd6,0xfa,0x04,0xf6,0x81,0x01,0x1f,0x01,0xff]
+# W64-FAKE16: v_fma_f16_e64_dpp v5, |v1|, v2, -m0 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x48,0xd6,0xfa,0x04,0xf6,0x81,0x01,0x1f,0x01,0xff]
 
 0x05,0x02,0x48,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff
-# GFX11: v_fma_f16_e64_dpp v5, v1, -|v2|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x48,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff]
+# W32-REAL16: v_fma_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x48,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff]
+# W32-FAKE16: v_fma_f16_e64_dpp v5, v1, -|v2|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x48,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff]
+# W64-REAL16: v_fma_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x48,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff]
+# W64-FAKE16: v_fma_f16_e64_dpp v5, v1, -|v2|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x48,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff]
 
 0x05,0x7c,0x48,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff
-# GFX11: v_fma_f16_e64_dpp v5, -v1, v2, |exec_lo| op_sel:[1,1,1,1] row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x7c,0x48,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff]
+# W32-REAL16: v_fma_f16_e64_dpp v5.h, -v1.h, v2.h, |exec_lo| op_sel:[1,1,1,1] row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x7c,0x48,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff]
+# W32-FAKE16: v_fma_f16_e64_dpp v5, -v1, v2, |exec_lo| op_sel:[1,1,1,1] row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x7c,0x48,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff]
+# W64-REAL16: v_fma_f16_e64_dpp v5.h, -v1.h, v2.h, |exec_lo| op_sel:[1,1,1,1] row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x7c,0x48,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff]
+# W64-FAKE16: v_fma_f16_e64_dpp v5, -v1, v2, |exec_lo| op_sel:[1,1,1,1] row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x7c,0x48,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff]
 
 0x05,0x0b,0x48,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff
-# GFX11: v_fma_f16_e64_dpp v5, -|v1|, -|v2|, null op_sel:[1,0,0,0] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x0b,0x48,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff]
+# W32-REAL16: v_fma_f16_e64_dpp v5.l, -|v1.h|, -|v2.l|, null op_sel:[1,0,0,0] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x0b,0x48,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff]
+# W32-FAKE16: v_fma_f16_e64_dpp v5, -|v1|, -|v2|, null op_sel:[1,0,0,0] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x0b,0x48,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff]
+# W64-REAL16: v_fma_f16_e64_dpp v5.l, -|v1.h|, -|v2.l|, null op_sel:[1,0,0,0] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x0b,0x48,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff]
+# W64-FAKE16: v_fma_f16_e64_dpp v5, -|v1|, -|v2|, null op_sel:[1,0,0,0] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x0b,0x48,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff]
 
 0x05,0x15,0x48,0xd6,0xfa,0x04,0x06,0xa3,0x01,0x5f,0x01,0x01
-# GFX11: v_fma_f16_e64_dpp v5, -|v1|, v2, -|-1| op_sel:[0,1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x15,0x48,0xd6,0xfa,0x04,0x06,0xa3,0x01,0x5f,0x01,0x01]
+# W32-REAL16: v_fma_f16_e64_dpp v5.l, -|v1.l|, v2.h, -|-1| op_sel:[0,1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x15,0x48,0xd6,0xfa,0x04,0x06,0xa3,0x01,0x5f,0x01,0x01]
+# W32-FAKE16: v_fma_f16_e64_dpp v5, -|v1|, v2, -|-1| op_sel:[0,1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x15,0x48,0xd6,0xfa,0x04,0x06,0xa3,0x01,0x5f,0x01,0x01]
+# W64-REAL16: v_fma_f16_e64_dpp v5.l, -|v1.l|, v2.h, -|-1| op_sel:[0,1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x15,0x48,0xd6,0xfa,0x04,0x06,0xa3,0x01,0x5f,0x01,0x01]
+# W64-FAKE16: v_fma_f16_e64_dpp v5, -|v1|, v2, -|-1| op_sel:[0,1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x15,0x48,0xd6,0xfa,0x04,0x06,0xa3,0x01,0x5f,0x01,0x01]
 
 0x05,0x26,0x48,0xd6,0xfa,0x04,0xc2,0xc3,0x01,0x60,0x01,0x13
-# GFX11: v_fma_f16_e64_dpp v5, v1, -|v2|, -|0.5| op_sel:[0,0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x26,0x48,0xd6,0xfa,0x04,0xc2,0xc3,0x01,0x60,0x01,0x13]
+# W32-REAL16: v_fma_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| op_sel:[0,0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x26,0x48,0xd6,0xfa,0x04,0xc2,0xc3,0x01,0x60,0x01,0x13]
+# W32-FAKE16: v_fma_f16_e64_dpp v5, v1, -|v2|, -|0.5| op_sel:[0,0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x26,0x48,0xd6,0xfa,0x04,0xc2,0xc3,0x01,0x60,0x01,0x13]
+# W64-REAL16: v_fma_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| op_sel:[0,0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x26,0x48,0xd6,0xfa,0x04,0xc2,0xc3,0x01,0x60,0x01,0x13]
+# W64-FAKE16: v_fma_f16_e64_dpp v5, v1, -|v2|, -|0.5| op_sel:[0,0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x26,0x48,0xd6,0xfa,0x04,0xc2,0xc3,0x01,0x60,0x01,0x13]
 
 0xff,0xc7,0x48,0xd6,0xfa,0xfe,0xf7,0xe3,0xff,0x6f,0x0d,0x30
-# GFX11: v_fma_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc7,0x48,0xd6,0xfa,0xfe,0xf7,0xe3,0xff,0x6f,0x0d,0x30]
+# W32-REAL16: v_fma_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc7,0x48,0xd6,0xfa,0xfe,0xf7,0xe3,0xff,0x6f,0x0d,0x30]
+# W32-FAKE16: v_fma_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc7,0x48,0xd6,0xfa,0xfe,0xf7,0xe3,0xff,0x6f,0x0d,0x30]
+# W64-REAL16: v_fma_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc7,0x48,0xd6,0xfa,0xfe,0xf7,0xe3,0xff,0x6f,0x0d,0x30]
+# W64-FAKE16: v_fma_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc7,0x48,0xd6,0xfa,0xfe,0xf7,0xe3,0xff,0x6f,0x0d,0x30]
+
+0x05,0x78,0x48,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff
+# W32-REAL16: v_fma_f16_e64_dpp v5.h, v1.h, v2.h, v3.h op_sel:[1,1,1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x78,0x48,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+# W32-FAKE16: v_fma_f16_e64_dpp v5, v1, v2, v3 op_sel:[1,1,1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x78,0x48,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+# W64-REAL16: v_fma_f16_e64_dpp v5.h, v1.h, v2.h, v3.h op_sel:[1,1,1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x78,0x48,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+# W64-FAKE16: v_fma_f16_e64_dpp v5, v1, v2, v3 op_sel:[1,1,1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x78,0x48,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+
+0x05,0x20,0x48,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff
+# W32-REAL16: v_fma_f16_e64_dpp v5.l, v1.l, v2.l, v255.h op_sel:[0,0,1,0] quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x20,0x48,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff]
+# W32-FAKE16: v_fma_f16_e64_dpp v5, v1, v2, v255 op_sel:[0,0,1,0] quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x20,0x48,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff]
+# W64-REAL16: v_fma_f16_e64_dpp v5.l, v1.l, v2.l, v255.h op_sel:[0,0,1,0] quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x20,0x48,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff]
+# W64-FAKE16: v_fma_f16_e64_dpp v5, v1, v2, v255 op_sel:[0,0,1,0] quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x20,0x48,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff]
+
+0x05,0x78,0x48,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff
+# W32-REAL16: v_fma_f16_e64_dpp v5.h, v1.h, v2.h, v3.h op_sel:[1,1,1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x78,0x48,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+# W32-FAKE16: v_fma_f16_e64_dpp v5, v1, v2, v3 op_sel:[1,1,1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x78,0x48,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+# W64-REAL16: v_fma_f16_e64_dpp v5.h, v1.h, v2.h, v3.h op_sel:[1,1,1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x78,0x48,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+# W64-FAKE16: v_fma_f16_e64_dpp v5, v1, v2, v3 op_sel:[1,1,1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x78,0x48,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+
+0x05,0x20,0x48,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff
+# W32-REAL16: v_fma_f16_e64_dpp v5.l, v1.l, v2.l, v255.h op_sel:[0,0,1,0] quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x20,0x48,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff]
+# W32-FAKE16: v_fma_f16_e64_dpp v5, v1, v2, v255 op_sel:[0,0,1,0] quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x20,0x48,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff]
+# W64-REAL16: v_fma_f16_e64_dpp v5.l, v1.l, v2.l, v255.h op_sel:[0,0,1,0] quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x20,0x48,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff]
+# W64-FAKE16: v_fma_f16_e64_dpp v5, v1, v2, v255 op_sel:[0,0,1,0] quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x20,0x48,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff]
+
+0x05,0x0a,0x48,0xd6,0xfa,0x04,0x06,0x23,0x01,0x5f,0x01,0x01
+# W32-REAL16: v_fma_f16_e64_dpp v5.l, -v1.h, |v2.l|, -1 op_sel:[1,0,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x0a,0x48,0xd6,0xfa,0x04,0x06,0x23,0x01,0x5f,0x01,0x01]
+# W32-FAKE16: v_fma_f16_e64_dpp v5, -v1, |v2|, -1 op_sel:[1,0,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x0a,0x48,0xd6,0xfa,0x04,0x06,0x23,0x01,0x5f,0x01,0x01]
+# W64-REAL16: v_fma_f16_e64_dpp v5.l, -v1.h, |v2.l|, -1 op_sel:[1,0,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x0a,0x48,0xd6,0xfa,0x04,0x06,0x23,0x01,0x5f,0x01,0x01]
+# W64-FAKE16: v_fma_f16_e64_dpp v5, -v1, |v2|, -1 op_sel:[1,0,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x0a,0x48,0xd6,0xfa,0x04,0x06,0x23,0x01,0x5f,0x01,0x01]
+
+0x05,0x13,0x48,0xd6,0xfa,0x04,0xc2,0x63,0x01,0x60,0x01,0x13
+# W32-REAL16: v_fma_f16_e64_dpp v5.l, -|v1.l|, -|v2.h|, 0.5 op_sel:[0,1,0,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x13,0x48,0xd6,0xfa,0x04,0xc2,0x63,0x01,0x60,0x01,0x13]
+# W32-FAKE16: v_fma_f16_e64_dpp v5, -|v1|, -|v2|, 0.5 op_sel:[0,1,0,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x13,0x48,0xd6,0xfa,0x04,0xc2,0x63,0x01,0x60,0x01,0x13]
+# W64-REAL16: v_fma_f16_e64_dpp v5.l, -|v1.l|, -|v2.h|, 0.5 op_sel:[0,1,0,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x13,0x48,0xd6,0xfa,0x04,0xc2,0x63,0x01,0x60,0x01,0x13]
+# W64-FAKE16: v_fma_f16_e64_dpp v5, -|v1|, -|v2|, 0.5 op_sel:[0,1,0,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x13,0x48,0xd6,0xfa,0x04,0xc2,0x63,0x01,0x60,0x01,0x13]
+
+0xff,0xc7,0x48,0xd6,0xfa,0xfe,0xf7,0xe3,0xff,0x6f,0x0d,0x30
+# W32-REAL16: v_fma_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc7,0x48,0xd6,0xfa,0xfe,0xf7,0xe3,0xff,0x6f,0x0d,0x30]
+# W32-FAKE16: v_fma_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc7,0x48,0xd6,0xfa,0xfe,0xf7,0xe3,0xff,0x6f,0x0d,0x30]
+# W64-REAL16: v_fma_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc7,0x48,0xd6,0xfa,0xfe,0xf7,0xe3,0xff,0x6f,0x0d,0x30]
+# W64-FAKE16: v_fma_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc7,0x48,0xd6,0xfa,0xfe,0xf7,0xe3,0xff,0x6f,0x0d,0x30]
 
 0x05,0x00,0x53,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff
 # W32-REAL16: v_mad_i16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x53,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp16_from_vop1.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp16_from_vop1.txt
index 2666b75..282ff229 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp16_from_vop1.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp16_from_vop1.txt
@@ -239,46 +239,72 @@
 # GFX11: v_clz_i32_u32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x00,0xb9,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x0d,0x30]
 
 0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff
-# GFX11: v_cos_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
+# GFX11-REAL16: v_cos_f16_e64_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
+# GFX11-FAKE16: v_cos_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
 
 0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff
-# GFX11: v_cos_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
+# GFX11-REAL16: v_cos_f16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
+# GFX11-FAKE16: v_cos_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
 
 0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff
-# GFX11: v_cos_f16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
+# GFX11-REAL16: v_cos_f16_e64_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
+# GFX11-FAKE16: v_cos_f16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
 
 0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff
-# GFX11: v_cos_f16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff]
+# GFX11-REAL16: v_cos_f16_e64_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff]
+# GFX11-FAKE16: v_cos_f16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff]
 
 0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff
-# GFX11: v_cos_f16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff]
+# GFX11-REAL16: v_cos_f16_e64_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff]
+# GFX11-FAKE16: v_cos_f16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff]
 
 0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff
-# GFX11: v_cos_f16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff]
+# GFX11-REAL16: v_cos_f16_e64_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff]
+# GFX11-FAKE16: v_cos_f16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff]
 
 0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff
-# GFX11: v_cos_f16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff]
+# GFX11-REAL16: v_cos_f16_e64_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff]
+# GFX11-FAKE16: v_cos_f16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff]
 
 0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff
-# GFX11: v_cos_f16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]
+# GFX11-REAL16: v_cos_f16_e64_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]
+# GFX11-FAKE16: v_cos_f16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]
 
 0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff
-# GFX11: v_cos_f16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff]
+# GFX11-REAL16: v_cos_f16_e64_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff]
+# GFX11-FAKE16: v_cos_f16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff]
 
 0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff
-# GFX11: v_cos_f16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff]
+# GFX11-REAL16: v_cos_f16_e64_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff]
+# GFX11-FAKE16: v_cos_f16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff]
 
 0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff
-# GFX11: v_cos_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
+# GFX11-REAL16: v_cos_f16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
+# GFX11-FAKE16: v_cos_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
 
 0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01
-# GFX11: v_cos_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
+# GFX11-REAL16: v_cos_f16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
+# GFX11-FAKE16: v_cos_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
 
 0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13
-# GFX11: v_cos_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13]
+# GFX11-REAL16: v_cos_f16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13]
+# GFX11-FAKE16: v_cos_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13]
 
 0xff,0x81,0xe1,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30
-# GFX11: v_cos_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xe1,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30]
+# GFX11-REAL16: v_cos_f16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xe1,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30]
+# GFX11-FAKE16: v_cos_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xe1,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30]
+
+0x05,0x48,0xe1,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01
+# GFX11-REAL16: v_cos_f16_e64_dpp v5.h, v1.h op_sel:[1,1] mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x48,0xe1,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
+# GFX11-FAKE16: v_cos_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
+
+0x05,0x08,0xe1,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13
+# GFX11-REAL16: v_cos_f16_e64_dpp v5.l, v1.h op_sel:[1,0] mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x08,0xe1,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13]
+# GFX11-FAKE16: v_cos_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13]
+
+0xff,0xc1,0xe1,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30
+# GFX11-REAL16: v_cos_f16_e64_dpp v255.h, -|v255.l| op_sel:[0,1] clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc1,0xe1,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30]
+# GFX11-FAKE16: v_cos_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xe1,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30]
 
 0x05,0x00,0xb6,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff
 # GFX11: v_cos_f32_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xb6,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
@@ -1025,46 +1051,64 @@
 # GFX11: v_cvt_i32_f32_e64_dpp v255, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0x88,0xd5,0xfa,0x00,0x00,0x20,0xff,0x6f,0x0d,0x30]
 
 0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff
-# GFX11: v_cvt_i32_i16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
+# GFX11-REAL16: v_cvt_i32_i16_e64_dpp v5, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
+# GFX11-FAKE16: v_cvt_i32_i16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
 
 0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff
-# GFX11: v_cvt_i32_i16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
+# GFX11-REAL16: v_cvt_i32_i16_e64_dpp v5, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
+# GFX11-FAKE16: v_cvt_i32_i16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
 
 0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff
-# GFX11: v_cvt_i32_i16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
+# GFX11-REAL16: v_cvt_i32_i16_e64_dpp v5, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
+# GFX11-FAKE16: v_cvt_i32_i16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
 
 0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff
-# GFX11: v_cvt_i32_i16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff]
+# GFX11-REAL16: v_cvt_i32_i16_e64_dpp v5, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff]
+# GFX11-FAKE16: v_cvt_i32_i16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff]
 
 0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff
-# GFX11: v_cvt_i32_i16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff]
+# GFX11-REAL16: v_cvt_i32_i16_e64_dpp v5, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff]
+# GFX11-FAKE16: v_cvt_i32_i16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff]
 
 0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff
-# GFX11: v_cvt_i32_i16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff]
+# GFX11-REAL16: v_cvt_i32_i16_e64_dpp v5, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff]
+# GFX11-FAKE16: v_cvt_i32_i16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff]
 
 0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff
-# GFX11: v_cvt_i32_i16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff]
+# GFX11-REAL16: v_cvt_i32_i16_e64_dpp v5, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff]
+# GFX11-FAKE16: v_cvt_i32_i16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff]
 
 0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff
-# GFX11: v_cvt_i32_i16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]
+# GFX11-REAL16: v_cvt_i32_i16_e64_dpp v5, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]
+# GFX11-FAKE16: v_cvt_i32_i16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]
 
 0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff
-# GFX11: v_cvt_i32_i16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff]
+# GFX11-REAL16: v_cvt_i32_i16_e64_dpp v5, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff]
+# GFX11-FAKE16: v_cvt_i32_i16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff]
 
 0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff
-# GFX11: v_cvt_i32_i16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff]
+# GFX11-REAL16: v_cvt_i32_i16_e64_dpp v5, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff]
+# GFX11-FAKE16: v_cvt_i32_i16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff]
 
 0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff
-# GFX11: v_cvt_i32_i16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
+# GFX11-REAL16: v_cvt_i32_i16_e64_dpp v5, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
+# GFX11-FAKE16: v_cvt_i32_i16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
 
 0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x5f,0x01,0x01
-# GFX11: v_cvt_i32_i16_e64_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x5f,0x01,0x01]
+# GFX11-REAL16: v_cvt_i32_i16_e64_dpp v5, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x5f,0x01,0x01]
+# GFX11-FAKE16: v_cvt_i32_i16_e64_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x5f,0x01,0x01]
 
 0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x60,0x01,0x13
-# GFX11: v_cvt_i32_i16_e64_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x60,0x01,0x13]
+# GFX11-REAL16: v_cvt_i32_i16_e64_dpp v5, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x60,0x01,0x13]
+# GFX11-FAKE16: v_cvt_i32_i16_e64_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x60,0x01,0x13]
 
 0xff,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x0d,0x30
-# GFX11: v_cvt_i32_i16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x0d,0x30]
+# GFX11-REAL16: v_cvt_i32_i16_e64_dpp v255, v255.l row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x0d,0x30]
+# GFX11-FAKE16: v_cvt_i32_i16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x0d,0x30]
+
+0xff,0x08,0xea,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x0d,0x30
+# GFX11-REAL16: v_cvt_i32_i16_e64_dpp v255, v255.h op_sel:[1,0] row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x08,0xea,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x0d,0x30]
+# GFX11-FAKE16: v_cvt_i32_i16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x0d,0x30]
 
 0x05,0x00,0x8c,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff
 # GFX11: v_cvt_nearest_i32_f32_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x8c,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
@@ -1397,46 +1441,64 @@
 # GFX11: v_cvt_u32_f32_e64_dpp v255, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0x87,0xd5,0xfa,0x00,0x00,0x20,0xff,0x6f,0x0d,0x30]
 
 0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff
-# GFX11: v_cvt_u32_u16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
+# GFX11-REAL16: v_cvt_u32_u16_e64_dpp v5, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
+# GFX11-FAKE16: v_cvt_u32_u16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
 
 0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff
-# GFX11: v_cvt_u32_u16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
+# GFX11-REAL16: v_cvt_u32_u16_e64_dpp v5, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
+# GFX11-FAKE16: v_cvt_u32_u16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
 
 0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff
-# GFX11: v_cvt_u32_u16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
+# GFX11-REAL16: v_cvt_u32_u16_e64_dpp v5, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
+# GFX11-FAKE16: v_cvt_u32_u16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
 
 0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff
-# GFX11: v_cvt_u32_u16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff]
+# GFX11-REAL16: v_cvt_u32_u16_e64_dpp v5, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff]
+# GFX11-FAKE16: v_cvt_u32_u16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff]
 
 0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff
-# GFX11: v_cvt_u32_u16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff]
+# GFX11-REAL16: v_cvt_u32_u16_e64_dpp v5, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff]
+# GFX11-FAKE16: v_cvt_u32_u16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff]
 
 0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff
-# GFX11: v_cvt_u32_u16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff]
+# GFX11-REAL16: v_cvt_u32_u16_e64_dpp v5, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff]
+# GFX11-FAKE16: v_cvt_u32_u16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff]
 
 0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff
-# GFX11: v_cvt_u32_u16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff]
+# GFX11-REAL16: v_cvt_u32_u16_e64_dpp v5, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff]
+# GFX11-FAKE16: v_cvt_u32_u16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff]
 
 0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff
-# GFX11: v_cvt_u32_u16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]
+# GFX11-REAL16: v_cvt_u32_u16_e64_dpp v5, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]
+# GFX11-FAKE16: v_cvt_u32_u16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]
 
 0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff
-# GFX11: v_cvt_u32_u16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff]
+# GFX11-REAL16: v_cvt_u32_u16_e64_dpp v5, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff]
+# GFX11-FAKE16: v_cvt_u32_u16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff]
 
 0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff
-# GFX11: v_cvt_u32_u16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff]
+# GFX11-REAL16: v_cvt_u32_u16_e64_dpp v5, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff]
+# GFX11-FAKE16: v_cvt_u32_u16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff]
 
 0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff
-# GFX11: v_cvt_u32_u16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
+# GFX11-REAL16: v_cvt_u32_u16_e64_dpp v5, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
+# GFX11-FAKE16: v_cvt_u32_u16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
 
 0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x5f,0x01,0x01
-# GFX11: v_cvt_u32_u16_e64_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x5f,0x01,0x01]
+# GFX11-REAL16: v_cvt_u32_u16_e64_dpp v5, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x5f,0x01,0x01]
+# GFX11-FAKE16: v_cvt_u32_u16_e64_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x5f,0x01,0x01]
 
 0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x60,0x01,0x13
-# GFX11: v_cvt_u32_u16_e64_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x60,0x01,0x13]
+# GFX11-REAL16: v_cvt_u32_u16_e64_dpp v5, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x60,0x01,0x13]
+# GFX11-FAKE16: v_cvt_u32_u16_e64_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x60,0x01,0x13]
 
 0xff,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x0d,0x30
-# GFX11: v_cvt_u32_u16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x0d,0x30]
+# GFX11-REAL16: v_cvt_u32_u16_e64_dpp v255, v255.l row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x0d,0x30]
+# GFX11-FAKE16: v_cvt_u32_u16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x0d,0x30]
+
+0xff,0x08,0xeb,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x0d,0x30
+# GFX11-REAL16: v_cvt_u32_u16_e64_dpp v255, v255.h op_sel:[1,0] row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x08,0xeb,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x0d,0x30]
+# GFX11-FAKE16: v_cvt_u32_u16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x0d,0x30]
 
 0x05,0x00,0xd8,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff
 # GFX11-REAL16: v_exp_f16_e64_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd8,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
@@ -1659,46 +1721,72 @@
 # GFX11: v_floor_f32_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xa4,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30]
 
 0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff
-# GFX11: v_fract_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
+# GFX11-REAL16: v_fract_f16_e64_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
+# GFX11-FAKE16: v_fract_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
 
 0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff
-# GFX11: v_fract_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
+# GFX11-REAL16: v_fract_f16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
+# GFX11-FAKE16: v_fract_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
 
 0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff
-# GFX11: v_fract_f16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
+# GFX11-REAL16: v_fract_f16_e64_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
+# GFX11-FAKE16: v_fract_f16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
 
 0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff
-# GFX11: v_fract_f16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff]
+# GFX11-REAL16: v_fract_f16_e64_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff]
+# GFX11-FAKE16: v_fract_f16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff]
 
 0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff
-# GFX11: v_fract_f16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff]
+# GFX11-REAL16: v_fract_f16_e64_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff]
+# GFX11-FAKE16: v_fract_f16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff]
 
 0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff
-# GFX11: v_fract_f16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff]
+# GFX11-REAL16: v_fract_f16_e64_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff]
+# GFX11-FAKE16: v_fract_f16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff]
 
 0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff
-# GFX11: v_fract_f16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff]
+# GFX11-REAL16: v_fract_f16_e64_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff]
+# GFX11-FAKE16: v_fract_f16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff]
 
 0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff
-# GFX11: v_fract_f16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]
+# GFX11-REAL16: v_fract_f16_e64_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]
+# GFX11-FAKE16: v_fract_f16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]
 
 0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff
-# GFX11: v_fract_f16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff]
+# GFX11-REAL16: v_fract_f16_e64_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff]
+# GFX11-FAKE16: v_fract_f16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff]
 
 0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff
-# GFX11: v_fract_f16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff]
+# GFX11-REAL16: v_fract_f16_e64_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff]
+# GFX11-FAKE16: v_fract_f16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff]
 
 0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff
-# GFX11: v_fract_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
+# GFX11-REAL16: v_fract_f16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
+# GFX11-FAKE16: v_fract_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
 
 0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01
-# GFX11: v_fract_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
+# GFX11-REAL16: v_fract_f16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
+# GFX11-FAKE16: v_fract_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
 
 0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13
-# GFX11: v_fract_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13]
+# GFX11-REAL16: v_fract_f16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13]
+# GFX11-FAKE16: v_fract_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13]
 
 0xff,0x81,0xdf,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30
-# GFX11: v_fract_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xdf,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30]
+# GFX11-REAL16: v_fract_f16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xdf,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30]
+# GFX11-FAKE16: v_fract_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xdf,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30]
+
+0x05,0x48,0xdf,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01
+# GFX11-REAL16: v_fract_f16_e64_dpp v5.h, v1.h op_sel:[1,1] mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x48,0xdf,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
+# GFX11-FAKE16: v_fract_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
+
+0x05,0x08,0xdf,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13
+# GFX11-REAL16: v_fract_f16_e64_dpp v5.l, v1.h op_sel:[1,0] mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x08,0xdf,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13]
+# GFX11-FAKE16: v_fract_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13]
+
+0xff,0xc1,0xdf,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30
+# GFX11-REAL16: v_fract_f16_e64_dpp v255.h, -|v255.l| op_sel:[0,1] clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc1,0xdf,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30]
+# GFX11-FAKE16: v_fract_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xdf,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30]
 
 0x05,0x00,0xa0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff
 # GFX11: v_fract_f32_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xa0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
@@ -1853,46 +1941,72 @@
 # GFX11: v_frexp_exp_i32_f32_e64_dpp v255, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x01,0xbf,0xd5,0xfa,0x00,0x00,0x20,0xff,0x6f,0x0d,0x30]
 
 0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff
-# GFX11: v_frexp_mant_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
+# GFX11-REAL16: v_frexp_mant_f16_e64_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
+# GFX11-FAKE16: v_frexp_mant_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
 
 0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff
-# GFX11: v_frexp_mant_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
+# GFX11-REAL16: v_frexp_mant_f16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
+# GFX11-FAKE16: v_frexp_mant_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
 
 0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff
-# GFX11: v_frexp_mant_f16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
+# GFX11-REAL16: v_frexp_mant_f16_e64_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
+# GFX11-FAKE16: v_frexp_mant_f16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
 
 0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff
-# GFX11: v_frexp_mant_f16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff]
+# GFX11-REAL16: v_frexp_mant_f16_e64_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff]
+# GFX11-FAKE16: v_frexp_mant_f16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff]
 
 0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff
-# GFX11: v_frexp_mant_f16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff]
+# GFX11-REAL16: v_frexp_mant_f16_e64_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff]
+# GFX11-FAKE16: v_frexp_mant_f16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff]
 
 0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff
-# GFX11: v_frexp_mant_f16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff]
+# GFX11-REAL16: v_frexp_mant_f16_e64_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff]
+# GFX11-FAKE16: v_frexp_mant_f16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff]
 
 0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff
-# GFX11: v_frexp_mant_f16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff]
+# GFX11-REAL16: v_frexp_mant_f16_e64_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff]
+# GFX11-FAKE16: v_frexp_mant_f16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff]
 
 0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff
-# GFX11: v_frexp_mant_f16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]
+# GFX11-REAL16: v_frexp_mant_f16_e64_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]
+# GFX11-FAKE16: v_frexp_mant_f16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]
 
 0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff
-# GFX11: v_frexp_mant_f16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff]
+# GFX11-REAL16: v_frexp_mant_f16_e64_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff]
+# GFX11-FAKE16: v_frexp_mant_f16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff]
 
 0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff
-# GFX11: v_frexp_mant_f16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff]
+# GFX11-REAL16: v_frexp_mant_f16_e64_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff]
+# GFX11-FAKE16: v_frexp_mant_f16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff]
 
 0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff
-# GFX11: v_frexp_mant_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
+# GFX11-REAL16: v_frexp_mant_f16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
+# GFX11-FAKE16: v_frexp_mant_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
 
 0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01
-# GFX11: v_frexp_mant_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
+# GFX11-REAL16: v_frexp_mant_f16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
+# GFX11-FAKE16: v_frexp_mant_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
 
 0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13
-# GFX11: v_frexp_mant_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13]
+# GFX11-REAL16: v_frexp_mant_f16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13]
+# GFX11-FAKE16: v_frexp_mant_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13]
 
 0xff,0x81,0xd9,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30
-# GFX11: v_frexp_mant_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xd9,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30]
+# GFX11-REAL16: v_frexp_mant_f16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xd9,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30]
+# GFX11-FAKE16: v_frexp_mant_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xd9,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30]
+
+0x05,0x48,0xd9,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01
+# GFX11-REAL16: v_frexp_mant_f16_e64_dpp v5.h, v1.h op_sel:[1,1] mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x48,0xd9,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
+# GFX11-FAKE16: v_frexp_mant_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
+
+0x05,0x08,0xd9,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13
+# GFX11-REAL16: v_frexp_mant_f16_e64_dpp v5.l, v1.h op_sel:[1,0] mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x08,0xd9,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13]
+# GFX11-FAKE16: v_frexp_mant_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13]
+
+0xff,0xc1,0xd9,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30
+# GFX11-REAL16: v_frexp_mant_f16_e64_dpp v255.h, -|v255.l| op_sel:[0,1] clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc1,0xd9,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30]
+# GFX11-FAKE16: v_frexp_mant_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xd9,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30]
 
 0x05,0x00,0xc0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff
 # GFX11: v_frexp_mant_f32_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xc0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
@@ -2257,46 +2371,72 @@
 # GFX11: v_movrelsd_b32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x00,0xc4,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x0d,0x30]
 
 0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff
-# GFX11: v_not_b16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
+# GFX11-REAL16: v_not_b16_e64_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
+# GFX11-FAKE16: v_not_b16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
 
 0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff
-# GFX11: v_not_b16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
+# GFX11-REAL16: v_not_b16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
+# GFX11-FAKE16: v_not_b16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
 
 0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff
-# GFX11: v_not_b16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
+# GFX11-REAL16: v_not_b16_e64_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
+# GFX11-FAKE16: v_not_b16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
 
 0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff
-# GFX11: v_not_b16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff]
+# GFX11-REAL16: v_not_b16_e64_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff]
+# GFX11-FAKE16: v_not_b16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff]
 
 0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff
-# GFX11: v_not_b16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff]
+# GFX11-REAL16: v_not_b16_e64_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff]
+# GFX11-FAKE16: v_not_b16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff]
 
 0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff
-# GFX11: v_not_b16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff]
+# GFX11-REAL16: v_not_b16_e64_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff]
+# GFX11-FAKE16: v_not_b16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff]
 
 0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff
-# GFX11: v_not_b16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff]
+# GFX11-REAL16: v_not_b16_e64_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff]
+# GFX11-FAKE16: v_not_b16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff]
 
 0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff
-# GFX11: v_not_b16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]
+# GFX11-REAL16: v_not_b16_e64_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]
+# GFX11-FAKE16: v_not_b16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]
 
 0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff
-# GFX11: v_not_b16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff]
+# GFX11-REAL16: v_not_b16_e64_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff]
+# GFX11-FAKE16: v_not_b16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff]
 
 0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff
-# GFX11: v_not_b16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff]
+# GFX11-REAL16: v_not_b16_e64_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff]
+# GFX11-FAKE16: v_not_b16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff]
 
 0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff
-# GFX11: v_not_b16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
+# GFX11-REAL16: v_not_b16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
+# GFX11-FAKE16: v_not_b16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
 
 0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x5f,0x01,0x01
-# GFX11: v_not_b16_e64_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x5f,0x01,0x01]
+# GFX11-REAL16: v_not_b16_e64_dpp v5.l, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x5f,0x01,0x01]
+# GFX11-FAKE16: v_not_b16_e64_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x5f,0x01,0x01]
 
 0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x60,0x01,0x13
-# GFX11: v_not_b16_e64_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x60,0x01,0x13]
+# GFX11-REAL16: v_not_b16_e64_dpp v5.l, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x60,0x01,0x13]
+# GFX11-FAKE16: v_not_b16_e64_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x60,0x01,0x13]
 
 0xff,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x0d,0x30
-# GFX11: v_not_b16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x0d,0x30]
+# GFX11-REAL16: v_not_b16_e64_dpp v255.l, v255.l row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x0d,0x30]
+# GFX11-FAKE16: v_not_b16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x0d,0x30]
+
+0x05,0x48,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x5f,0x01,0x01
+# GFX11-REAL16: v_not_b16_e64_dpp v5.h, v1.h op_sel:[1,1] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x48,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x5f,0x01,0x01]
+# GFX11-FAKE16: v_not_b16_e64_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x5f,0x01,0x01]
+
+0x05,0x08,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x60,0x01,0x13
+# GFX11-REAL16: v_not_b16_e64_dpp v5.l, v1.h op_sel:[1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x08,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x60,0x01,0x13]
+# GFX11-FAKE16: v_not_b16_e64_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x60,0x01,0x13]
+
+0xff,0x40,0xe9,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x0d,0x30
+# GFX11-REAL16: v_not_b16_e64_dpp v255.h, v255.l op_sel:[0,1] row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x40,0xe9,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x0d,0x30]
+# GFX11-FAKE16: v_not_b16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x0d,0x30]
 
 0x05,0x00,0xb7,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff
 # GFX11: v_not_b32_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xb7,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
@@ -2493,46 +2633,72 @@
 # GFX11: v_rcp_iflag_f32_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xab,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30]
 
 0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff
-# GFX11: v_rndne_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
+# GFX11-REAL16: v_rndne_f16_e64_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
+# GFX11-FAKE16: v_rndne_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
 
 0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff
-# GFX11: v_rndne_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
+# GFX11-REAL16: v_rndne_f16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
+# GFX11-FAKE16: v_rndne_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
 
 0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff
-# GFX11: v_rndne_f16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
+# GFX11-REAL16: v_rndne_f16_e64_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
+# GFX11-FAKE16: v_rndne_f16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
 
 0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff
-# GFX11: v_rndne_f16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff]
+# GFX11-REAL16: v_rndne_f16_e64_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff]
+# GFX11-FAKE16: v_rndne_f16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff]
 
 0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff
-# GFX11: v_rndne_f16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff]
+# GFX11-REAL16: v_rndne_f16_e64_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff]
+# GFX11-FAKE16: v_rndne_f16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff]
 
 0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff
-# GFX11: v_rndne_f16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff]
+# GFX11-REAL16: v_rndne_f16_e64_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff]
+# GFX11-FAKE16: v_rndne_f16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff]
 
 0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff
-# GFX11: v_rndne_f16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff]
+# GFX11-REAL16: v_rndne_f16_e64_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff]
+# GFX11-FAKE16: v_rndne_f16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff]
 
 0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff
-# GFX11: v_rndne_f16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]
+# GFX11-REAL16: v_rndne_f16_e64_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]
+# GFX11-FAKE16: v_rndne_f16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]
 
 0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff
-# GFX11: v_rndne_f16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff]
+# GFX11-REAL16: v_rndne_f16_e64_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff]
+# GFX11-FAKE16: v_rndne_f16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff]
 
 0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff
-# GFX11: v_rndne_f16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff]
+# GFX11-REAL16: v_rndne_f16_e64_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff]
+# GFX11-FAKE16: v_rndne_f16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff]
 
 0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff
-# GFX11: v_rndne_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
+# GFX11-REAL16: v_rndne_f16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
+# GFX11-FAKE16: v_rndne_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
 
 0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01
-# GFX11: v_rndne_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
+# GFX11-REAL16: v_rndne_f16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
+# GFX11-FAKE16: v_rndne_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
 
 0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13
-# GFX11: v_rndne_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13]
+# GFX11-REAL16: v_rndne_f16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13]
+# GFX11-FAKE16: v_rndne_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13]
 
 0xff,0x81,0xde,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30
-# GFX11: v_rndne_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xde,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30]
+# GFX11-REAL16: v_rndne_f16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xde,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30]
+# GFX11-FAKE16: v_rndne_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xde,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30]
+
+0x05,0x48,0xde,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01
+# GFX11-REAL16: v_rndne_f16_e64_dpp v5.h, v1.h op_sel:[1,1] mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x48,0xde,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
+# GFX11-FAKE16: v_rndne_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
+
+0x05,0x08,0xde,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13
+# GFX11-REAL16: v_rndne_f16_e64_dpp v5.l, v1.h op_sel:[1,0] mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x08,0xde,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13]
+# GFX11-FAKE16: v_rndne_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13]
+
+0xff,0xc1,0xde,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30
+# GFX11-REAL16: v_rndne_f16_e64_dpp v255.h, -|v255.l| op_sel:[0,1] clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc1,0xde,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30]
+# GFX11-FAKE16: v_rndne_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xde,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30]
 
 0x05,0x00,0xa3,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff
 # GFX11: v_rndne_f32_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xa3,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
@@ -2687,88 +2853,132 @@
 # GFX11: v_rsq_f32_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xae,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30]
 
 0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff
-# GFX11: v_sat_pk_u8_i16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
+# GFX11-REAL16: v_sat_pk_u8_i16_e64_dpp v5.l, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
+# GFX11-FAKE16: v_sat_pk_u8_i16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
 
 0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff
-# GFX11: v_sat_pk_u8_i16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
+# GFX11-REAL16: v_sat_pk_u8_i16_e64_dpp v5.l, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
+# GFX11-FAKE16: v_sat_pk_u8_i16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
 
 0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff
-# GFX11: v_sat_pk_u8_i16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
+# GFX11-REAL16: v_sat_pk_u8_i16_e64_dpp v5.l, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
+# GFX11-FAKE16: v_sat_pk_u8_i16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
 
 0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff
-# GFX11: v_sat_pk_u8_i16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff]
+# GFX11-REAL16: v_sat_pk_u8_i16_e64_dpp v5.l, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff]
+# GFX11-FAKE16: v_sat_pk_u8_i16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff]
 
 0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff
-# GFX11: v_sat_pk_u8_i16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff]
+# GFX11-REAL16: v_sat_pk_u8_i16_e64_dpp v5.l, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff]
+# GFX11-FAKE16: v_sat_pk_u8_i16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff]
 
 0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff
-# GFX11: v_sat_pk_u8_i16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff]
+# GFX11-REAL16: v_sat_pk_u8_i16_e64_dpp v5.l, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff]
+# GFX11-FAKE16: v_sat_pk_u8_i16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff]
 
 0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff
-# GFX11: v_sat_pk_u8_i16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff]
+# GFX11-REAL16: v_sat_pk_u8_i16_e64_dpp v5.l, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff]
+# GFX11-FAKE16: v_sat_pk_u8_i16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff]
 
 0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff
-# GFX11: v_sat_pk_u8_i16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]
+# GFX11-REAL16: v_sat_pk_u8_i16_e64_dpp v5.l, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]
+# GFX11-FAKE16: v_sat_pk_u8_i16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]
 
 0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff
-# GFX11: v_sat_pk_u8_i16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff]
+# GFX11-REAL16: v_sat_pk_u8_i16_e64_dpp v5.l, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff]
+# GFX11-FAKE16: v_sat_pk_u8_i16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff]
 
 0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff
-# GFX11: v_sat_pk_u8_i16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff]
+# GFX11-REAL16: v_sat_pk_u8_i16_e64_dpp v5.l, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff]
+# GFX11-FAKE16: v_sat_pk_u8_i16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff]
 
 0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff
-# GFX11: v_sat_pk_u8_i16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
+# GFX11-REAL16: v_sat_pk_u8_i16_e64_dpp v5.l, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
+# GFX11-FAKE16: v_sat_pk_u8_i16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
 
 0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x5f,0x01,0x01
-# GFX11: v_sat_pk_u8_i16_e64_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x5f,0x01,0x01]
+# GFX11-REAL16: v_sat_pk_u8_i16_e64_dpp v5.l, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x5f,0x01,0x01]
+# GFX11-FAKE16: v_sat_pk_u8_i16_e64_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x5f,0x01,0x01]
 
 0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x60,0x01,0x13
-# GFX11: v_sat_pk_u8_i16_e64_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x60,0x01,0x13]
+# GFX11-REAL16: v_sat_pk_u8_i16_e64_dpp v5.l, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x60,0x01,0x13]
+# GFX11-FAKE16: v_sat_pk_u8_i16_e64_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x60,0x01,0x13]
 
 0xff,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x0d,0x30
-# GFX11: v_sat_pk_u8_i16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x0d,0x30]
+# GFX11-REAL16: v_sat_pk_u8_i16_e64_dpp v255.l, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x0d,0x30]
+# GFX11-FAKE16: v_sat_pk_u8_i16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x0d,0x30]
+
+0xff,0x40,0xe2,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x0d,0x30
+# GFX11-REAL16: v_sat_pk_u8_i16_e64_dpp v255.h, v255 op_sel:[0,1] row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x40,0xe2,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x0d,0x30]
+# GFX11-FAKE16: v_sat_pk_u8_i16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x0d,0x30]
 
 0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff
-# GFX11: v_sin_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
+# GFX11-REAL16: v_sin_f16_e64_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
+# GFX11-FAKE16: v_sin_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
 
 0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff
-# GFX11: v_sin_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
+# GFX11-REAL16: v_sin_f16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
+# GFX11-FAKE16: v_sin_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
 
 0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff
-# GFX11: v_sin_f16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
+# GFX11-REAL16: v_sin_f16_e64_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
+# GFX11-FAKE16: v_sin_f16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
 
 0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff
-# GFX11: v_sin_f16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff]
+# GFX11-REAL16: v_sin_f16_e64_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff]
+# GFX11-FAKE16: v_sin_f16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff]
 
 0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff
-# GFX11: v_sin_f16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff]
+# GFX11-REAL16: v_sin_f16_e64_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff]
+# GFX11-FAKE16: v_sin_f16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff]
 
 0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff
-# GFX11: v_sin_f16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff]
+# GFX11-REAL16: v_sin_f16_e64_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff]
+# GFX11-FAKE16: v_sin_f16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff]
 
 0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff
-# GFX11: v_sin_f16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff]
+# GFX11-REAL16: v_sin_f16_e64_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff]
+# GFX11-FAKE16: v_sin_f16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff]
 
 0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff
-# GFX11: v_sin_f16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]
+# GFX11-REAL16: v_sin_f16_e64_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]
+# GFX11-FAKE16: v_sin_f16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]
 
 0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff
-# GFX11: v_sin_f16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff]
+# GFX11-REAL16: v_sin_f16_e64_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff]
+# GFX11-FAKE16: v_sin_f16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff]
 
 0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff
-# GFX11: v_sin_f16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff]
+# GFX11-REAL16: v_sin_f16_e64_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff]
+# GFX11-FAKE16: v_sin_f16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff]
 
 0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff
-# GFX11: v_sin_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
+# GFX11-REAL16: v_sin_f16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
+# GFX11-FAKE16: v_sin_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
 
 0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01
-# GFX11: v_sin_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
+# GFX11-REAL16: v_sin_f16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
+# GFX11-FAKE16: v_sin_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
 
 0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13
-# GFX11: v_sin_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13]
+# GFX11-REAL16: v_sin_f16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13]
+# GFX11-FAKE16: v_sin_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13]
 
 0xff,0x81,0xe0,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30
-# GFX11: v_sin_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xe0,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30]
+# GFX11-REAL16: v_sin_f16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xe0,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30]
+# GFX11-FAKE16: v_sin_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xe0,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30]
+
+0x05,0x48,0xe0,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01
+# GFX11-REAL16: v_sin_f16_e64_dpp v5.h, v1.h op_sel:[1,1] mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x48,0xe0,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
+# GFX11-FAKE16: v_sin_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
+
+0x05,0x08,0xe0,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13
+# GFX11-REAL16: v_sin_f16_e64_dpp v5.l, v1.h op_sel:[1,0] mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x08,0xe0,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13]
+# GFX11-FAKE16: v_sin_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13]
+
+0xff,0xc1,0xe0,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30
+# GFX11-REAL16: v_sin_f16_e64_dpp v255.h, -|v255.l| op_sel:[0,1] clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc1,0xe0,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30]
+# GFX11-FAKE16: v_sin_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xe0,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30]
 
 0x05,0x00,0xb5,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff
 # GFX11: v_sin_f32_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xb5,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
@@ -2923,46 +3133,72 @@
 # GFX11: v_sqrt_f32_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xb3,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30]
 
 0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff
-# GFX11: v_trunc_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
+# GFX11-REAL16: v_trunc_f16_e64_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
+# GFX11-FAKE16: v_trunc_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
 
 0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff
-# GFX11: v_trunc_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
+# GFX11-REAL16: v_trunc_f16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
+# GFX11-FAKE16: v_trunc_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
 
 0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff
-# GFX11: v_trunc_f16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
+# GFX11-REAL16: v_trunc_f16_e64_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
+# GFX11-FAKE16: v_trunc_f16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
 
 0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff
-# GFX11: v_trunc_f16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff]
+# GFX11-REAL16: v_trunc_f16_e64_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff]
+# GFX11-FAKE16: v_trunc_f16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff]
 
 0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff
-# GFX11: v_trunc_f16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff]
+# GFX11-REAL16: v_trunc_f16_e64_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff]
+# GFX11-FAKE16: v_trunc_f16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff]
 
 0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff
-# GFX11: v_trunc_f16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff]
+# GFX11-REAL16: v_trunc_f16_e64_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff]
+# GFX11-FAKE16: v_trunc_f16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff]
 
 0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff
-# GFX11: v_trunc_f16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff]
+# GFX11-REAL16: v_trunc_f16_e64_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff]
+# GFX11-FAKE16: v_trunc_f16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff]
 
 0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff
-# GFX11: v_trunc_f16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]
+# GFX11-REAL16: v_trunc_f16_e64_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]
+# GFX11-FAKE16: v_trunc_f16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]
 
 0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff
-# GFX11: v_trunc_f16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff]
+# GFX11-REAL16: v_trunc_f16_e64_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff]
+# GFX11-FAKE16: v_trunc_f16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff]
 
 0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff
-# GFX11: v_trunc_f16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff]
+# GFX11-REAL16: v_trunc_f16_e64_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff]
+# GFX11-FAKE16: v_trunc_f16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff]
 
 0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff
-# GFX11: v_trunc_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
+# GFX11-REAL16: v_trunc_f16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
+# GFX11-FAKE16: v_trunc_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
 
 0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01
-# GFX11: v_trunc_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
+# GFX11-REAL16: v_trunc_f16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
+# GFX11-FAKE16: v_trunc_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
 
 0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13
-# GFX11: v_trunc_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13]
+# GFX11-REAL16: v_trunc_f16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13]
+# GFX11-FAKE16: v_trunc_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13]
 
 0xff,0x81,0xdd,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30
-# GFX11: v_trunc_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xdd,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30]
+# GFX11-REAL16: v_trunc_f16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xdd,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30]
+# GFX11-FAKE16: v_trunc_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xdd,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30]
+
+0x05,0x48,0xdd,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01
+# GFX11-REAL16: v_trunc_f16_e64_dpp v5.h, v1.h op_sel:[1,1] mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x48,0xdd,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
+# GFX11-FAKE16: v_trunc_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
+
+0x05,0x08,0xdd,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13
+# GFX11-REAL16: v_trunc_f16_e64_dpp v5.l, v1.h op_sel:[1,0] mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x08,0xdd,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13]
+# GFX11-FAKE16: v_trunc_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13]
+
+0xff,0xc1,0xdd,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30
+# GFX11-REAL16: v_trunc_f16_e64_dpp v255.h, -|v255.l| op_sel:[0,1] clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc1,0xdd,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30]
+# GFX11-FAKE16: v_trunc_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xdd,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30]
 
 0x05,0x00,0xa1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff
 # GFX11: v_trunc_f32_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xa1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp8.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp8.txt
index 3b3d398..7a81ba2 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp8.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp8.txt
@@ -1141,40 +1141,106 @@
 # W64-FAKE16: v_max_u16_e64_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x00,0x09,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
 
 0x05,0x00,0x60,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05
-# GFX11: v_maxmin_f16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x60,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, v3.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x60,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_maxmin_f16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x60,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, v3.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x60,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_maxmin_f16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x60,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
 
 0x05,0x00,0x60,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05
-# GFX11: v_maxmin_f16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x60,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, v255.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x60,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_maxmin_f16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x60,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, v255.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x60,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_maxmin_f16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x60,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
 
 0x05,0x00,0x60,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05
-# GFX11: v_maxmin_f16_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x60,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x60,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_maxmin_f16_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x60,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x60,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_maxmin_f16_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x60,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05]
 
 0x05,0x00,0x60,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05
-# GFX11: v_maxmin_f16_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x60,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x60,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_maxmin_f16_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x60,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x60,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_maxmin_f16_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x60,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05]
 
 0x05,0x00,0x60,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05
-# GFX11: v_maxmin_f16_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x60,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x60,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_maxmin_f16_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x60,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x60,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_maxmin_f16_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x60,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05]
 
 0x05,0x01,0x60,0xd6,0xe9,0x04,0xee,0x81,0x01,0x77,0x39,0x05
-# GFX11: v_maxmin_f16_e64_dpp v5, |v1|, v2, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x60,0xd6,0xe9,0x04,0xee,0x81,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_maxmin_f16_e64_dpp v5.l, |v1.l|, v2.l, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x60,0xd6,0xe9,0x04,0xee,0x81,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_maxmin_f16_e64_dpp v5, |v1|, v2, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x60,0xd6,0xe9,0x04,0xee,0x81,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_maxmin_f16_e64_dpp v5.l, |v1.l|, v2.l, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x60,0xd6,0xe9,0x04,0xee,0x81,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_maxmin_f16_e64_dpp v5, |v1|, v2, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x60,0xd6,0xe9,0x04,0xee,0x81,0x01,0x77,0x39,0x05]
 
 0x05,0x02,0x60,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05
-# GFX11: v_maxmin_f16_e64_dpp v5, v1, -|v2|, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x60,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_maxmin_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x60,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_maxmin_f16_e64_dpp v5, v1, -|v2|, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x60,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_maxmin_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x60,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_maxmin_f16_e64_dpp v5, v1, -|v2|, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x60,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05]
 
 0x05,0x04,0x60,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05
-# GFX11: v_maxmin_f16_e64_dpp v5, -v1, v2, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x04,0x60,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_maxmin_f16_e64_dpp v5.l, -v1.l, v2.l, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x04,0x60,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_maxmin_f16_e64_dpp v5, -v1, v2, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x04,0x60,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_maxmin_f16_e64_dpp v5.l, -v1.l, v2.l, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x04,0x60,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_maxmin_f16_e64_dpp v5, -v1, v2, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x04,0x60,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05]
 
 0x05,0x03,0x60,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05
-# GFX11: v_maxmin_f16_e64_dpp v5, -|v1|, -|v2|, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x03,0x60,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_maxmin_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x03,0x60,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_maxmin_f16_e64_dpp v5, -|v1|, -|v2|, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x03,0x60,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_maxmin_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x03,0x60,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_maxmin_f16_e64_dpp v5, -|v1|, -|v2|, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x03,0x60,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05]
 
 0x05,0x05,0x60,0xd6,0xe9,0x04,0x06,0xab,0x01,0x77,0x39,0x05
-# GFX11: v_maxmin_f16_e64_dpp v5, -|v1|, v2, -|-1| mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x05,0x60,0xd6,0xe9,0x04,0x06,0xab,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_maxmin_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|-1| mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x05,0x60,0xd6,0xe9,0x04,0x06,0xab,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_maxmin_f16_e64_dpp v5, -|v1|, v2, -|-1| mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x05,0x60,0xd6,0xe9,0x04,0x06,0xab,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_maxmin_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|-1| mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x05,0x60,0xd6,0xe9,0x04,0x06,0xab,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_maxmin_f16_e64_dpp v5, -|v1|, v2, -|-1| mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x05,0x60,0xd6,0xe9,0x04,0x06,0xab,0x01,0x77,0x39,0x05]
 
 0x05,0x06,0x60,0xd6,0xe9,0x04,0xc2,0xd3,0x01,0x77,0x39,0x05
-# GFX11: v_maxmin_f16_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x06,0x60,0xd6,0xe9,0x04,0xc2,0xd3,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_maxmin_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x06,0x60,0xd6,0xe9,0x04,0xc2,0xd3,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_maxmin_f16_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x06,0x60,0xd6,0xe9,0x04,0xc2,0xd3,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_maxmin_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x06,0x60,0xd6,0xe9,0x04,0xc2,0xd3,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_maxmin_f16_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x06,0x60,0xd6,0xe9,0x04,0xc2,0xd3,0x01,0x77,0x39,0x05]
 
 0xff,0x87,0x60,0xd6,0xea,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00
-# GFX11: v_maxmin_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x87,0x60,0xd6,0xea,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00]
+# W32-REAL16: v_maxmin_f16_e64_dpp v255.l, -|v255.l|, -|v255.l|, -|src_scc| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x87,0x60,0xd6,0xea,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00]
+# W32-FAKE16: v_maxmin_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x87,0x60,0xd6,0xea,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00]
+# W64-REAL16: v_maxmin_f16_e64_dpp v255.l, -|v255.l|, -|v255.l|, -|src_scc| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x87,0x60,0xd6,0xea,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00]
+# W64-FAKE16: v_maxmin_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x87,0x60,0xd6,0xea,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00]
+
+0x05,0x78,0x60,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05
+# W32-REAL16: v_maxmin_f16_e64_dpp v5.h, v1.h, v2.h, v3.h op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x78,0x60,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_maxmin_f16_e64_dpp v5, v1, v2, v3 op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x78,0x60,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_maxmin_f16_e64_dpp v5.h, v1.h, v2.h, v3.h op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x78,0x60,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_maxmin_f16_e64_dpp v5, v1, v2, v3 op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x78,0x60,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+
+0x05,0x20,0x60,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05
+# W32-REAL16: v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, v255.h op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x20,0x60,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_maxmin_f16_e64_dpp v5, v1, v2, v255 op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x20,0x60,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, v255.h op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x20,0x60,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_maxmin_f16_e64_dpp v5, v1, v2, v255 op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x20,0x60,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
+
+0x05,0x0a,0x60,0xd6,0xe9,0x04,0x06,0x2b,0x01,0x77,0x39,0x05
+# W32-REAL16: v_maxmin_f16_e64_dpp v5.l, -v1.h, |v2.l|, -1 op_sel:[1,0,0,0] mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x0a,0x60,0xd6,0xe9,0x04,0x06,0x2b,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_maxmin_f16_e64_dpp v5, -v1, |v2|, -1 op_sel:[1,0,0,0] mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x0a,0x60,0xd6,0xe9,0x04,0x06,0x2b,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_maxmin_f16_e64_dpp v5.l, -v1.h, |v2.l|, -1 op_sel:[1,0,0,0] mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x0a,0x60,0xd6,0xe9,0x04,0x06,0x2b,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_maxmin_f16_e64_dpp v5, -v1, |v2|, -1 op_sel:[1,0,0,0] mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x0a,0x60,0xd6,0xe9,0x04,0x06,0x2b,0x01,0x77,0x39,0x05]
+
+0x05,0x13,0x60,0xd6,0xe9,0x04,0xc2,0x73,0x01,0x77,0x39,0x05
+# W32-REAL16: v_maxmin_f16_e64_dpp v5.l, -|v1.l|, -|v2.h|, 0.5 op_sel:[0,1,0,0] mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x13,0x60,0xd6,0xe9,0x04,0xc2,0x73,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_maxmin_f16_e64_dpp v5, -|v1|, -|v2|, 0.5 op_sel:[0,1,0,0] mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x13,0x60,0xd6,0xe9,0x04,0xc2,0x73,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_maxmin_f16_e64_dpp v5.l, -|v1.l|, -|v2.h|, 0.5 op_sel:[0,1,0,0] mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x13,0x60,0xd6,0xe9,0x04,0xc2,0x73,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_maxmin_f16_e64_dpp v5, -|v1|, -|v2|, 0.5 op_sel:[0,1,0,0] mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x13,0x60,0xd6,0xe9,0x04,0xc2,0x73,0x01,0x77,0x39,0x05]
+
+0xff,0xc7,0x60,0xd6,0xea,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00
+# W32-REAL16: v_maxmin_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc7,0x60,0xd6,0xea,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00]
+# W32-FAKE16: v_maxmin_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc7,0x60,0xd6,0xea,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00]
+# W64-REAL16: v_maxmin_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc7,0x60,0xd6,0xea,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00]
+# W64-FAKE16: v_maxmin_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc7,0x60,0xd6,0xea,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00]
 
 0x05,0x00,0x5e,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05
 # GFX11: v_maxmin_f32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5e,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
@@ -1585,40 +1651,106 @@
 # W64-FAKE16: v_min_u16_e64_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x00,0x0b,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
 
 0x05,0x00,0x61,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05
-# GFX11: v_minmax_f16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x61,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, v3.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x61,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_minmax_f16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x61,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, v3.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x61,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_minmax_f16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x61,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
 
 0x05,0x00,0x61,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05
-# GFX11: v_minmax_f16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x61,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, v255.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x61,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_minmax_f16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x61,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, v255.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x61,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_minmax_f16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x61,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
 
 0x05,0x00,0x61,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05
-# GFX11: v_minmax_f16_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x61,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x61,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_minmax_f16_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x61,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x61,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_minmax_f16_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x61,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05]
 
 0x05,0x00,0x61,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05
-# GFX11: v_minmax_f16_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x61,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x61,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_minmax_f16_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x61,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x61,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_minmax_f16_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x61,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05]
 
 0x05,0x00,0x61,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05
-# GFX11: v_minmax_f16_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x61,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x61,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_minmax_f16_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x61,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x61,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_minmax_f16_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x61,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05]
 
 0x05,0x01,0x61,0xd6,0xe9,0x04,0xee,0x81,0x01,0x77,0x39,0x05
-# GFX11: v_minmax_f16_e64_dpp v5, |v1|, v2, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x61,0xd6,0xe9,0x04,0xee,0x81,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_minmax_f16_e64_dpp v5.l, |v1.l|, v2.l, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x61,0xd6,0xe9,0x04,0xee,0x81,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_minmax_f16_e64_dpp v5, |v1|, v2, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x61,0xd6,0xe9,0x04,0xee,0x81,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_minmax_f16_e64_dpp v5.l, |v1.l|, v2.l, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x61,0xd6,0xe9,0x04,0xee,0x81,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_minmax_f16_e64_dpp v5, |v1|, v2, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x61,0xd6,0xe9,0x04,0xee,0x81,0x01,0x77,0x39,0x05]
 
 0x05,0x02,0x61,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05
-# GFX11: v_minmax_f16_e64_dpp v5, v1, -|v2|, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x61,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_minmax_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x61,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_minmax_f16_e64_dpp v5, v1, -|v2|, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x61,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_minmax_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x61,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_minmax_f16_e64_dpp v5, v1, -|v2|, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x61,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05]
 
 0x05,0x04,0x61,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05
-# GFX11: v_minmax_f16_e64_dpp v5, -v1, v2, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x04,0x61,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_minmax_f16_e64_dpp v5.l, -v1.l, v2.l, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x04,0x61,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_minmax_f16_e64_dpp v5, -v1, v2, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x04,0x61,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_minmax_f16_e64_dpp v5.l, -v1.l, v2.l, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x04,0x61,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_minmax_f16_e64_dpp v5, -v1, v2, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x04,0x61,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05]
 
 0x05,0x03,0x61,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05
-# GFX11: v_minmax_f16_e64_dpp v5, -|v1|, -|v2|, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x03,0x61,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_minmax_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x03,0x61,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_minmax_f16_e64_dpp v5, -|v1|, -|v2|, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x03,0x61,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_minmax_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x03,0x61,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_minmax_f16_e64_dpp v5, -|v1|, -|v2|, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x03,0x61,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05]
 
 0x05,0x05,0x61,0xd6,0xe9,0x04,0x06,0xab,0x01,0x77,0x39,0x05
-# GFX11: v_minmax_f16_e64_dpp v5, -|v1|, v2, -|-1| mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x05,0x61,0xd6,0xe9,0x04,0x06,0xab,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_minmax_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|-1| mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x05,0x61,0xd6,0xe9,0x04,0x06,0xab,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_minmax_f16_e64_dpp v5, -|v1|, v2, -|-1| mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x05,0x61,0xd6,0xe9,0x04,0x06,0xab,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_minmax_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|-1| mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x05,0x61,0xd6,0xe9,0x04,0x06,0xab,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_minmax_f16_e64_dpp v5, -|v1|, v2, -|-1| mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x05,0x61,0xd6,0xe9,0x04,0x06,0xab,0x01,0x77,0x39,0x05]
 
 0x05,0x06,0x61,0xd6,0xe9,0x04,0xc2,0xd3,0x01,0x77,0x39,0x05
-# GFX11: v_minmax_f16_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x06,0x61,0xd6,0xe9,0x04,0xc2,0xd3,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_minmax_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x06,0x61,0xd6,0xe9,0x04,0xc2,0xd3,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_minmax_f16_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x06,0x61,0xd6,0xe9,0x04,0xc2,0xd3,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_minmax_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x06,0x61,0xd6,0xe9,0x04,0xc2,0xd3,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_minmax_f16_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x06,0x61,0xd6,0xe9,0x04,0xc2,0xd3,0x01,0x77,0x39,0x05]
 
 0xff,0x87,0x61,0xd6,0xea,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00
-# GFX11: v_minmax_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x87,0x61,0xd6,0xea,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00]
+# W32-REAL16: v_minmax_f16_e64_dpp v255.l, -|v255.l|, -|v255.l|, -|src_scc| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x87,0x61,0xd6,0xea,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00]
+# W32-FAKE16: v_minmax_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x87,0x61,0xd6,0xea,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00]
+# W64-REAL16: v_minmax_f16_e64_dpp v255.l, -|v255.l|, -|v255.l|, -|src_scc| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x87,0x61,0xd6,0xea,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00]
+# W64-FAKE16: v_minmax_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x87,0x61,0xd6,0xea,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00]
+
+0x05,0x78,0x61,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05
+# W32-REAL16: v_minmax_f16_e64_dpp v5.h, v1.h, v2.h, v3.h op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x78,0x61,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_minmax_f16_e64_dpp v5, v1, v2, v3 op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x78,0x61,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_minmax_f16_e64_dpp v5.h, v1.h, v2.h, v3.h op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x78,0x61,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_minmax_f16_e64_dpp v5, v1, v2, v3 op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x78,0x61,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+
+0x05,0x20,0x61,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05
+# W32-REAL16: v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, v255.h op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x20,0x61,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_minmax_f16_e64_dpp v5, v1, v2, v255 op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x20,0x61,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, v255.h op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x20,0x61,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_minmax_f16_e64_dpp v5, v1, v2, v255 op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x20,0x61,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
+
+0x05,0x0a,0x61,0xd6,0xe9,0x04,0x06,0x2b,0x01,0x77,0x39,0x05
+# W32-REAL16: v_minmax_f16_e64_dpp v5.l, -v1.h, |v2.l|, -1 op_sel:[1,0,0,0] mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x0a,0x61,0xd6,0xe9,0x04,0x06,0x2b,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_minmax_f16_e64_dpp v5, -v1, |v2|, -1 op_sel:[1,0,0,0] mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x0a,0x61,0xd6,0xe9,0x04,0x06,0x2b,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_minmax_f16_e64_dpp v5.l, -v1.h, |v2.l|, -1 op_sel:[1,0,0,0] mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x0a,0x61,0xd6,0xe9,0x04,0x06,0x2b,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_minmax_f16_e64_dpp v5, -v1, |v2|, -1 op_sel:[1,0,0,0] mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x0a,0x61,0xd6,0xe9,0x04,0x06,0x2b,0x01,0x77,0x39,0x05]
+
+0x05,0x13,0x61,0xd6,0xe9,0x04,0xc2,0x73,0x01,0x77,0x39,0x05
+# W32-REAL16: v_minmax_f16_e64_dpp v5.l, -|v1.l|, -|v2.h|, 0.5 op_sel:[0,1,0,0] mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x13,0x61,0xd6,0xe9,0x04,0xc2,0x73,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_minmax_f16_e64_dpp v5, -|v1|, -|v2|, 0.5 op_sel:[0,1,0,0] mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x13,0x61,0xd6,0xe9,0x04,0xc2,0x73,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_minmax_f16_e64_dpp v5.l, -|v1.l|, -|v2.h|, 0.5 op_sel:[0,1,0,0] mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x13,0x61,0xd6,0xe9,0x04,0xc2,0x73,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_minmax_f16_e64_dpp v5, -|v1|, -|v2|, 0.5 op_sel:[0,1,0,0] mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x13,0x61,0xd6,0xe9,0x04,0xc2,0x73,0x01,0x77,0x39,0x05]
+
+0xff,0xc7,0x61,0xd6,0xea,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00
+# W32-REAL16: v_minmax_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc7,0x61,0xd6,0xea,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00]
+# W32-FAKE16: v_minmax_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc7,0x61,0xd6,0xea,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00]
+# W64-REAL16: v_minmax_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc7,0x61,0xd6,0xea,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00]
+# W64-FAKE16: v_minmax_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc7,0x61,0xd6,0xea,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00]
 
 0x05,0x00,0x5f,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05
 # GFX11: v_minmax_f32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5f,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
@@ -2547,46 +2679,130 @@
 # W64-FAKE16: v_div_fixup_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc7,0x54,0xd6,0xea,0xfe,0xf7,0xe3,0xff,0x00,0x00,0x00]
 
 0x05,0x00,0x48,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05
-# GFX11: v_fma_f16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x48,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_fma_f16_e64_dpp v5.l, v1.l, v2.l, v3.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x48,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_fma_f16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x48,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_fma_f16_e64_dpp v5.l, v1.l, v2.l, v3.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x48,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_fma_f16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x48,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
 
 0x05,0x00,0x48,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05
-# GFX11: v_fma_f16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x48,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_fma_f16_e64_dpp v5.l, v1.l, v2.l, v255.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x48,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_fma_f16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x48,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_fma_f16_e64_dpp v5.l, v1.l, v2.l, v255.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x48,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_fma_f16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x48,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
 
 0x05,0x00,0x48,0xd6,0xe9,0x04,0x0e,0x00,0x01,0x77,0x39,0x05
-# GFX11: v_fma_f16_e64_dpp v5, v1, v2, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x48,0xd6,0xe9,0x04,0x0e,0x00,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_fma_f16_e64_dpp v5.l, v1.l, v2.l, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x48,0xd6,0xe9,0x04,0x0e,0x00,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_fma_f16_e64_dpp v5, v1, v2, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x48,0xd6,0xe9,0x04,0x0e,0x00,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_fma_f16_e64_dpp v5.l, v1.l, v2.l, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x48,0xd6,0xe9,0x04,0x0e,0x00,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_fma_f16_e64_dpp v5, v1, v2, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x48,0xd6,0xe9,0x04,0x0e,0x00,0x01,0x77,0x39,0x05]
 
 0x05,0x00,0x48,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05
-# GFX11: v_fma_f16_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x48,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_fma_f16_e64_dpp v5.l, v1.l, v2.l, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x48,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_fma_f16_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x48,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_fma_f16_e64_dpp v5.l, v1.l, v2.l, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x48,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_fma_f16_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x48,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05]
 
 0x05,0x00,0x48,0xd6,0xe9,0x04,0xee,0x01,0x01,0x77,0x39,0x05
-# GFX11: v_fma_f16_e64_dpp v5, v1, v2, ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x48,0xd6,0xe9,0x04,0xee,0x01,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_fma_f16_e64_dpp v5.l, v1.l, v2.l, ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x48,0xd6,0xe9,0x04,0xee,0x01,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_fma_f16_e64_dpp v5, v1, v2, ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x48,0xd6,0xe9,0x04,0xee,0x01,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_fma_f16_e64_dpp v5.l, v1.l, v2.l, ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x48,0xd6,0xe9,0x04,0xee,0x01,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_fma_f16_e64_dpp v5, v1, v2, ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x48,0xd6,0xe9,0x04,0xee,0x01,0x01,0x77,0x39,0x05]
 
 0x05,0x00,0x48,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05
-# GFX11: v_fma_f16_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x48,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_fma_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x48,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_fma_f16_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x48,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_fma_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x48,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_fma_f16_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x48,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05]
 
 0x05,0x00,0x48,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05
-# GFX11: v_fma_f16_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x48,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_fma_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x48,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_fma_f16_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x48,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_fma_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x48,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_fma_f16_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x48,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05]
 
 0x05,0x01,0x48,0xd6,0xe9,0x04,0xf6,0x81,0x01,0x77,0x39,0x05
-# GFX11: v_fma_f16_e64_dpp v5, |v1|, v2, -m0 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x48,0xd6,0xe9,0x04,0xf6,0x81,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_fma_f16_e64_dpp v5.l, |v1.l|, v2.l, -m0 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x48,0xd6,0xe9,0x04,0xf6,0x81,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_fma_f16_e64_dpp v5, |v1|, v2, -m0 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x48,0xd6,0xe9,0x04,0xf6,0x81,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_fma_f16_e64_dpp v5.l, |v1.l|, v2.l, -m0 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x48,0xd6,0xe9,0x04,0xf6,0x81,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_fma_f16_e64_dpp v5, |v1|, v2, -m0 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x48,0xd6,0xe9,0x04,0xf6,0x81,0x01,0x77,0x39,0x05]
 
 0x05,0x02,0x48,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05
-# GFX11: v_fma_f16_e64_dpp v5, v1, -|v2|, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x48,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_fma_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x48,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_fma_f16_e64_dpp v5, v1, -|v2|, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x48,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_fma_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x48,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_fma_f16_e64_dpp v5, v1, -|v2|, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x48,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05]
 
 0x05,0x7c,0x48,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05
-# GFX11: v_fma_f16_e64_dpp v5, -v1, v2, |exec_lo| op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x7c,0x48,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_fma_f16_e64_dpp v5.h, -v1.h, v2.h, |exec_lo| op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x7c,0x48,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_fma_f16_e64_dpp v5, -v1, v2, |exec_lo| op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x7c,0x48,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_fma_f16_e64_dpp v5.h, -v1.h, v2.h, |exec_lo| op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x7c,0x48,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_fma_f16_e64_dpp v5, -v1, v2, |exec_lo| op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x7c,0x48,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05]
 
 0x05,0x0b,0x48,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05
-# GFX11: v_fma_f16_e64_dpp v5, -|v1|, -|v2|, null op_sel:[1,0,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x0b,0x48,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_fma_f16_e64_dpp v5.l, -|v1.h|, -|v2.l|, null op_sel:[1,0,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x0b,0x48,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_fma_f16_e64_dpp v5, -|v1|, -|v2|, null op_sel:[1,0,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x0b,0x48,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_fma_f16_e64_dpp v5.l, -|v1.h|, -|v2.l|, null op_sel:[1,0,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x0b,0x48,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_fma_f16_e64_dpp v5, -|v1|, -|v2|, null op_sel:[1,0,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x0b,0x48,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05]
 
 0x05,0x15,0x48,0xd6,0xe9,0x04,0x06,0xa3,0x01,0x77,0x39,0x05
-# GFX11: v_fma_f16_e64_dpp v5, -|v1|, v2, -|-1| op_sel:[0,1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x15,0x48,0xd6,0xe9,0x04,0x06,0xa3,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_fma_f16_e64_dpp v5.l, -|v1.l|, v2.h, -|-1| op_sel:[0,1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x15,0x48,0xd6,0xe9,0x04,0x06,0xa3,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_fma_f16_e64_dpp v5, -|v1|, v2, -|-1| op_sel:[0,1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x15,0x48,0xd6,0xe9,0x04,0x06,0xa3,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_fma_f16_e64_dpp v5.l, -|v1.l|, v2.h, -|-1| op_sel:[0,1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x15,0x48,0xd6,0xe9,0x04,0x06,0xa3,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_fma_f16_e64_dpp v5, -|v1|, v2, -|-1| op_sel:[0,1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x15,0x48,0xd6,0xe9,0x04,0x06,0xa3,0x01,0x77,0x39,0x05]
 
 0x05,0x26,0x48,0xd6,0xe9,0x04,0xc2,0xc3,0x01,0x77,0x39,0x05
-# GFX11: v_fma_f16_e64_dpp v5, v1, -|v2|, -|0.5| op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x26,0x48,0xd6,0xe9,0x04,0xc2,0xc3,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_fma_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x26,0x48,0xd6,0xe9,0x04,0xc2,0xc3,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_fma_f16_e64_dpp v5, v1, -|v2|, -|0.5| op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x26,0x48,0xd6,0xe9,0x04,0xc2,0xc3,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_fma_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x26,0x48,0xd6,0xe9,0x04,0xc2,0xc3,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_fma_f16_e64_dpp v5, v1, -|v2|, -|0.5| op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x26,0x48,0xd6,0xe9,0x04,0xc2,0xc3,0x01,0x77,0x39,0x05]
 
 0xff,0xc7,0x48,0xd6,0xea,0xfe,0xf7,0xe3,0xff,0x00,0x00,0x00
-# GFX11: v_fma_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc7,0x48,0xd6,0xea,0xfe,0xf7,0xe3,0xff,0x00,0x00,0x00]
+# W32-REAL16: v_fma_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc7,0x48,0xd6,0xea,0xfe,0xf7,0xe3,0xff,0x00,0x00,0x00]
+# W32-FAKE16: v_fma_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc7,0x48,0xd6,0xea,0xfe,0xf7,0xe3,0xff,0x00,0x00,0x00]
+# W64-REAL16: v_fma_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc7,0x48,0xd6,0xea,0xfe,0xf7,0xe3,0xff,0x00,0x00,0x00]
+# W64-FAKE16: v_fma_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc7,0x48,0xd6,0xea,0xfe,0xf7,0xe3,0xff,0x00,0x00,0x00]
+
+0x05,0x78,0x48,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05
+# W32-REAL16: v_fma_f16_e64_dpp v5.h, v1.h, v2.h, v3.h op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x78,0x48,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_fma_f16_e64_dpp v5, v1, v2, v3 op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x78,0x48,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_fma_f16_e64_dpp v5.h, v1.h, v2.h, v3.h op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x78,0x48,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_fma_f16_e64_dpp v5, v1, v2, v3 op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x78,0x48,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+
+0x05,0x20,0x48,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05
+# W32-REAL16: v_fma_f16_e64_dpp v5.l, v1.l, v2.l, v255.h op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x20,0x48,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_fma_f16_e64_dpp v5, v1, v2, v255 op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x20,0x48,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_fma_f16_e64_dpp v5.l, v1.l, v2.l, v255.h op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x20,0x48,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_fma_f16_e64_dpp v5, v1, v2, v255 op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x20,0x48,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
+
+0x05,0x78,0x48,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05
+# W32-REAL16: v_fma_f16_e64_dpp v5.h, v1.h, v2.h, v3.h op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x78,0x48,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_fma_f16_e64_dpp v5, v1, v2, v3 op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x78,0x48,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_fma_f16_e64_dpp v5.h, v1.h, v2.h, v3.h op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x78,0x48,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_fma_f16_e64_dpp v5, v1, v2, v3 op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x78,0x48,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+
+0x05,0x20,0x48,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05
+# W32-REAL16: v_fma_f16_e64_dpp v5.l, v1.l, v2.l, v255.h op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x20,0x48,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_fma_f16_e64_dpp v5, v1, v2, v255 op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x20,0x48,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_fma_f16_e64_dpp v5.l, v1.l, v2.l, v255.h op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x20,0x48,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_fma_f16_e64_dpp v5, v1, v2, v255 op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x20,0x48,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
+
+0x05,0x0a,0x48,0xd6,0xe9,0x04,0x06,0x23,0x01,0x77,0x39,0x05
+# W32-REAL16: v_fma_f16_e64_dpp v5.l, -v1.h, |v2.l|, -1 op_sel:[1,0,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x0a,0x48,0xd6,0xe9,0x04,0x06,0x23,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_fma_f16_e64_dpp v5, -v1, |v2|, -1 op_sel:[1,0,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x0a,0x48,0xd6,0xe9,0x04,0x06,0x23,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_fma_f16_e64_dpp v5.l, -v1.h, |v2.l|, -1 op_sel:[1,0,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x0a,0x48,0xd6,0xe9,0x04,0x06,0x23,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_fma_f16_e64_dpp v5, -v1, |v2|, -1 op_sel:[1,0,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x0a,0x48,0xd6,0xe9,0x04,0x06,0x23,0x01,0x77,0x39,0x05]
+
+0x05,0x13,0x48,0xd6,0xe9,0x04,0xc2,0x63,0x01,0x77,0x39,0x05
+# W32-REAL16: v_fma_f16_e64_dpp v5.l, -|v1.l|, -|v2.h|, 0.5 op_sel:[0,1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x13,0x48,0xd6,0xe9,0x04,0xc2,0x63,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_fma_f16_e64_dpp v5, -|v1|, -|v2|, 0.5 op_sel:[0,1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x13,0x48,0xd6,0xe9,0x04,0xc2,0x63,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_fma_f16_e64_dpp v5.l, -|v1.l|, -|v2.h|, 0.5 op_sel:[0,1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x13,0x48,0xd6,0xe9,0x04,0xc2,0x63,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_fma_f16_e64_dpp v5, -|v1|, -|v2|, 0.5 op_sel:[0,1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x13,0x48,0xd6,0xe9,0x04,0xc2,0x63,0x01,0x77,0x39,0x05]
+
+0xff,0xc7,0x48,0xd6,0xea,0xfe,0xf7,0xe3,0xff,0x00,0x00,0x00
+# W32-REAL16: v_fma_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc7,0x48,0xd6,0xea,0xfe,0xf7,0xe3,0xff,0x00,0x00,0x00]
+# W32-FAKE16: v_fma_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc7,0x48,0xd6,0xea,0xfe,0xf7,0xe3,0xff,0x00,0x00,0x00]
+# W64-REAL16: v_fma_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc7,0x48,0xd6,0xea,0xfe,0xf7,0xe3,0xff,0x00,0x00,0x00]
+# W64-FAKE16: v_fma_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc7,0x48,0xd6,0xea,0xfe,0xf7,0xe3,0xff,0x00,0x00,0x00]
 
 0x05,0x78,0x53,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05
 # W32-REAL16: v_mad_i16_e64_dpp v5.h, v1.h, v2.h, v3.h op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x78,0x53,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp8_from_vop1.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp8_from_vop1.txt
index c19947c..5995762 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp8_from_vop1.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp8_from_vop1.txt
@@ -61,16 +61,32 @@
 # GFX11: v_clz_i32_u32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x00,0xb9,0xd5,0xea,0x00,0x00,0x00,0xff,0x00,0x00,0x00]
 
 0x05,0x00,0xe1,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05
-# GFX11: v_cos_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe1,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+# GFX11-REAL16: v_cos_f16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe1,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+# GFX11-FAKE16: v_cos_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe1,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
 
 0x05,0x00,0xe1,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05
-# GFX11: v_cos_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe1,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
+# GFX11-REAL16: v_cos_f16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe1,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
+# GFX11-FAKE16: v_cos_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe1,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
 
 0x05,0x00,0xe1,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05
-# GFX11: v_cos_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe1,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
+# GFX11-REAL16: v_cos_f16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe1,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
+# GFX11-FAKE16: v_cos_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe1,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
 
 0xff,0x81,0xe1,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00
-# GFX11: v_cos_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xe1,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
+# GFX11-REAL16: v_cos_f16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xe1,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
+# GFX11-FAKE16: v_cos_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xe1,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
+
+0x05,0x48,0xe1,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05
+# GFX11-REAL16: v_cos_f16_e64_dpp v5.h, v1.h op_sel:[1,1] mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xe1,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
+# GFX11-FAKE16: v_cos_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe1,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
+
+0x05,0x08,0xe1,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05
+# GFX11-REAL16: v_cos_f16_e64_dpp v5.l, v1.h op_sel:[1,0] mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0xe1,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
+# GFX11-FAKE16: v_cos_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe1,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
+
+0xff,0xc1,0xe1,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00
+# GFX11-REAL16: v_cos_f16_e64_dpp v255.h, -|v255.l| op_sel:[0,1] clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc1,0xe1,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
+# GFX11-FAKE16: v_cos_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xe1,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
 
 0x05,0x00,0xb6,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05
 # GFX11: v_cos_f32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xb6,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
@@ -291,10 +307,16 @@
 # GFX11: v_cvt_i32_f32_e64_dpp v255, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0x88,0xd5,0xea,0x00,0x00,0x20,0xff,0x00,0x00,0x00]
 
 0x05,0x00,0xea,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05
-# GFX11: v_cvt_i32_i16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xea,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+# GFX11-REAL16: v_cvt_i32_i16_e64_dpp v5, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xea,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+# GFX11-FAKE16: v_cvt_i32_i16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xea,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
 
 0xff,0x00,0xea,0xd5,0xea,0x00,0x00,0x00,0xff,0x00,0x00,0x00
-# GFX11: v_cvt_i32_i16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x00,0xea,0xd5,0xea,0x00,0x00,0x00,0xff,0x00,0x00,0x00]
+# GFX11-REAL16: v_cvt_i32_i16_e64_dpp v255, v255.l dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x00,0xea,0xd5,0xea,0x00,0x00,0x00,0xff,0x00,0x00,0x00]
+# GFX11-FAKE16: v_cvt_i32_i16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x00,0xea,0xd5,0xea,0x00,0x00,0x00,0xff,0x00,0x00,0x00]
+
+0xff,0x08,0xea,0xd5,0xea,0x00,0x00,0x00,0xff,0x00,0x00,0x00
+# GFX11-REAL16: v_cvt_i32_i16_e64_dpp v255, v255.h op_sel:[1,0] dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x08,0xea,0xd5,0xea,0x00,0x00,0x00,0xff,0x00,0x00,0x00]
+# GFX11-FAKE16: v_cvt_i32_i16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x00,0xea,0xd5,0xea,0x00,0x00,0x00,0xff,0x00,0x00,0x00]
 
 0x05,0x00,0x8c,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05
 # GFX11: v_cvt_nearest_i32_f32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x8c,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
@@ -381,10 +403,16 @@
 # GFX11: v_cvt_u32_f32_e64_dpp v255, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0x87,0xd5,0xea,0x00,0x00,0x20,0xff,0x00,0x00,0x00]
 
 0x05,0x00,0xeb,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05
-# GFX11: v_cvt_u32_u16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xeb,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+# GFX11-REAL16: v_cvt_u32_u16_e64_dpp v5, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xeb,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+# GFX11-FAKE16: v_cvt_u32_u16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xeb,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
 
 0xff,0x00,0xeb,0xd5,0xea,0x00,0x00,0x00,0xff,0x00,0x00,0x00
-# GFX11: v_cvt_u32_u16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x00,0xeb,0xd5,0xea,0x00,0x00,0x00,0xff,0x00,0x00,0x00]
+# GFX11-REAL16: v_cvt_u32_u16_e64_dpp v255, v255.l dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x00,0xeb,0xd5,0xea,0x00,0x00,0x00,0xff,0x00,0x00,0x00]
+# GFX11-FAKE16: v_cvt_u32_u16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x00,0xeb,0xd5,0xea,0x00,0x00,0x00,0xff,0x00,0x00,0x00]
+
+0xff,0x08,0xeb,0xd5,0xea,0x00,0x00,0x00,0xff,0x00,0x00,0x00
+# GFX11-REAL16: v_cvt_u32_u16_e64_dpp v255, v255.h op_sel:[1,0] dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x08,0xeb,0xd5,0xea,0x00,0x00,0x00,0xff,0x00,0x00,0x00]
+# GFX11-FAKE16: v_cvt_u32_u16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x00,0xeb,0xd5,0xea,0x00,0x00,0x00,0xff,0x00,0x00,0x00]
 
 0x05,0x00,0xd8,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05
 # GFX11-REAL16: v_exp_f16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xd8,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
@@ -455,16 +483,32 @@
 # GFX11: v_floor_f32_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xa4,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
 
 0x05,0x00,0xdf,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05
-# GFX11: v_fract_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdf,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+# GFX11-REAL16: v_fract_f16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdf,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+# GFX11-FAKE16: v_fract_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdf,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
 
 0x05,0x00,0xdf,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05
-# GFX11: v_fract_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdf,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
+# GFX11-REAL16: v_fract_f16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdf,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
+# GFX11-FAKE16: v_fract_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdf,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
 
 0x05,0x00,0xdf,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05
-# GFX11: v_fract_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdf,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
+# GFX11-REAL16: v_fract_f16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdf,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
+# GFX11-FAKE16: v_fract_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdf,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
 
 0xff,0x81,0xdf,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00
-# GFX11: v_fract_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xdf,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
+# GFX11-REAL16: v_fract_f16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xdf,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
+# GFX11-FAKE16: v_fract_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xdf,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
+
+0x05,0x48,0xdf,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05
+# GFX11-REAL16: v_fract_f16_e64_dpp v5.h, v1.h op_sel:[1,1] mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xdf,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
+# GFX11-FAKE16: v_fract_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdf,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
+
+0x05,0x08,0xdf,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05
+# GFX11-REAL16: v_fract_f16_e64_dpp v5.l, v1.h op_sel:[1,0] mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0xdf,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
+# GFX11-FAKE16: v_fract_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdf,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
+
+0xff,0xc1,0xdf,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00
+# GFX11-REAL16: v_fract_f16_e64_dpp v255.h, -|v255.l| op_sel:[0,1] clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc1,0xdf,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
+# GFX11-FAKE16: v_fract_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xdf,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
 
 0x05,0x00,0xa0,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05
 # GFX11: v_fract_f32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xa0,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
@@ -505,16 +549,32 @@
 # GFX11: v_frexp_exp_i32_f32_e64_dpp v255, -|v255| dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x01,0xbf,0xd5,0xea,0x00,0x00,0x20,0xff,0x00,0x00,0x00]
 
 0x05,0x00,0xd9,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05
-# GFX11: v_frexp_mant_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xd9,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+# GFX11-REAL16: v_frexp_mant_f16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xd9,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+# GFX11-FAKE16: v_frexp_mant_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xd9,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
 
 0x05,0x00,0xd9,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05
-# GFX11: v_frexp_mant_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xd9,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
+# GFX11-REAL16: v_frexp_mant_f16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xd9,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
+# GFX11-FAKE16: v_frexp_mant_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xd9,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
 
 0x05,0x00,0xd9,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05
-# GFX11: v_frexp_mant_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xd9,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
+# GFX11-REAL16: v_frexp_mant_f16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xd9,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
+# GFX11-FAKE16: v_frexp_mant_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xd9,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
 
 0xff,0x81,0xd9,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00
-# GFX11: v_frexp_mant_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xd9,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
+# GFX11-REAL16: v_frexp_mant_f16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xd9,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
+# GFX11-FAKE16: v_frexp_mant_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xd9,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
+
+0x05,0x48,0xd9,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05
+# GFX11-REAL16: v_frexp_mant_f16_e64_dpp v5.h, v1.h op_sel:[1,1] mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xd9,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
+# GFX11-FAKE16: v_frexp_mant_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xd9,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
+
+0x05,0x08,0xd9,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05
+# GFX11-REAL16: v_frexp_mant_f16_e64_dpp v5.l, v1.h op_sel:[1,0] mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0xd9,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
+# GFX11-FAKE16: v_frexp_mant_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xd9,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
+
+0xff,0xc1,0xd9,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00
+# GFX11-REAL16: v_frexp_mant_f16_e64_dpp v255.h, -|v255.l| op_sel:[0,1] clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc1,0xd9,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
+# GFX11-FAKE16: v_frexp_mant_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xd9,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
 
 0x05,0x00,0xc0,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05
 # GFX11: v_frexp_mant_f32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xc0,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
@@ -599,10 +659,24 @@
 # GFX11: v_movrelsd_b32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x00,0xc4,0xd5,0xea,0x00,0x00,0x00,0xff,0x00,0x00,0x00]
 
 0x05,0x00,0xe9,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05
-# GFX11: v_not_b16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe9,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+# GFX11-REAL16: v_not_b16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe9,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+# GFX11-FAKE16: v_not_b16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe9,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
 
 0xff,0x00,0xe9,0xd5,0xea,0x00,0x00,0x00,0xff,0x00,0x00,0x00
-# GFX11: v_not_b16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x00,0xe9,0xd5,0xea,0x00,0x00,0x00,0xff,0x00,0x00,0x00]
+# GFX11-REAL16: v_not_b16_e64_dpp v255.l, v255.l dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x00,0xe9,0xd5,0xea,0x00,0x00,0x00,0xff,0x00,0x00,0x00]
+# GFX11-FAKE16: v_not_b16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x00,0xe9,0xd5,0xea,0x00,0x00,0x00,0xff,0x00,0x00,0x00]
+
+0x05,0x48,0xe9,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05
+# GFX11-REAL16: v_not_b16_e64_dpp v5.h, v1.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xe9,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+# GFX11-FAKE16: v_not_b16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe9,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+
+0x05,0x08,0xe9,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05
+# GFX11-REAL16: v_not_b16_e64_dpp v5.l, v1.h op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0xe9,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+# GFX11-FAKE16: v_not_b16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe9,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+
+0xff,0x40,0xe9,0xd5,0xea,0x00,0x00,0x00,0xff,0x00,0x00,0x00
+# GFX11-REAL16: v_not_b16_e64_dpp v255.h, v255.l op_sel:[0,1] dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x40,0xe9,0xd5,0xea,0x00,0x00,0x00,0xff,0x00,0x00,0x00]
+# GFX11-FAKE16: v_not_b16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x00,0xe9,0xd5,0xea,0x00,0x00,0x00,0xff,0x00,0x00,0x00]
 
 0x05,0x00,0xb7,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05
 # GFX11: v_not_b32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xb7,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
@@ -663,16 +737,32 @@
 # GFX11: v_rcp_iflag_f32_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xab,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
 
 0x05,0x00,0xde,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05
-# GFX11: v_rndne_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xde,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+# GFX11-REAL16: v_rndne_f16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xde,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+# GFX11-FAKE16: v_rndne_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xde,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
 
 0x05,0x00,0xde,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05
-# GFX11: v_rndne_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xde,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
+# GFX11-REAL16: v_rndne_f16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xde,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
+# GFX11-FAKE16: v_rndne_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xde,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
 
 0x05,0x00,0xde,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05
-# GFX11: v_rndne_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xde,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
+# GFX11-REAL16: v_rndne_f16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xde,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
+# GFX11-FAKE16: v_rndne_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xde,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
 
 0xff,0x81,0xde,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00
-# GFX11: v_rndne_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xde,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
+# GFX11-REAL16: v_rndne_f16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xde,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
+# GFX11-FAKE16: v_rndne_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xde,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
+
+0x05,0x48,0xde,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05
+# GFX11-REAL16: v_rndne_f16_e64_dpp v5.h, v1.h op_sel:[1,1] mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xde,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
+# GFX11-FAKE16: v_rndne_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xde,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
+
+0x05,0x08,0xde,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05
+# GFX11-REAL16: v_rndne_f16_e64_dpp v5.l, v1.h op_sel:[1,0] mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0xde,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
+# GFX11-FAKE16: v_rndne_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xde,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
+
+0xff,0xc1,0xde,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00
+# GFX11-REAL16: v_rndne_f16_e64_dpp v255.h, -|v255.l| op_sel:[0,1] clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc1,0xde,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
+# GFX11-FAKE16: v_rndne_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xde,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
 
 0x05,0x00,0xa3,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05
 # GFX11: v_rndne_f32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xa3,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
@@ -727,22 +817,44 @@
 # GFX11: v_rsq_f32_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xae,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
 
 0x05,0x00,0xe2,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05
-# GFX11: v_sat_pk_u8_i16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe2,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+# GFX11-REAL16: v_sat_pk_u8_i16_e64_dpp v5.l, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe2,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+# GFX11-FAKE16: v_sat_pk_u8_i16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe2,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
 
 0xff,0x00,0xe2,0xd5,0xea,0x00,0x00,0x00,0xff,0x00,0x00,0x00
-# GFX11: v_sat_pk_u8_i16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x00,0xe2,0xd5,0xea,0x00,0x00,0x00,0xff,0x00,0x00,0x00]
+# GFX11-REAL16: v_sat_pk_u8_i16_e64_dpp v255.l, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x00,0xe2,0xd5,0xea,0x00,0x00,0x00,0xff,0x00,0x00,0x00]
+# GFX11-FAKE16: v_sat_pk_u8_i16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x00,0xe2,0xd5,0xea,0x00,0x00,0x00,0xff,0x00,0x00,0x00]
+
+0xff,0x40,0xe2,0xd5,0xea,0x00,0x00,0x00,0xff,0x00,0x00,0x00
+# GFX11-REAL16: v_sat_pk_u8_i16_e64_dpp v255.h, v255 op_sel:[0,1] dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x40,0xe2,0xd5,0xea,0x00,0x00,0x00,0xff,0x00,0x00,0x00]
+# GFX11-FAKE16: v_sat_pk_u8_i16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x00,0xe2,0xd5,0xea,0x00,0x00,0x00,0xff,0x00,0x00,0x00]
 
 0x05,0x00,0xe0,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05
-# GFX11: v_sin_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe0,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+# GFX11-REAL16: v_sin_f16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe0,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+# GFX11-FAKE16: v_sin_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe0,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
 
 0x05,0x00,0xe0,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05
-# GFX11: v_sin_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe0,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
+# GFX11-REAL16: v_sin_f16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe0,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
+# GFX11-FAKE16: v_sin_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe0,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
 
 0x05,0x00,0xe0,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05
-# GFX11: v_sin_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe0,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
+# GFX11-REAL16: v_sin_f16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe0,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
+# GFX11-FAKE16: v_sin_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe0,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
 
 0xff,0x81,0xe0,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00
-# GFX11: v_sin_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xe0,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
+# GFX11-REAL16: v_sin_f16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xe0,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
+# GFX11-FAKE16: v_sin_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xe0,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
+
+0x05,0x48,0xe0,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05
+# GFX11-REAL16: v_sin_f16_e64_dpp v5.h, v1.h op_sel:[1,1] mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xe0,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
+# GFX11-FAKE16: v_sin_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe0,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
+
+0x05,0x08,0xe0,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05
+# GFX11-REAL16: v_sin_f16_e64_dpp v5.l, v1.h op_sel:[1,0] mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0xe0,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
+# GFX11-FAKE16: v_sin_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe0,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
+
+0xff,0xc1,0xe0,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00
+# GFX11-REAL16: v_sin_f16_e64_dpp v255.h, -|v255.l| op_sel:[0,1] clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc1,0xe0,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
+# GFX11-FAKE16: v_sin_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xe0,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
 
 0x05,0x00,0xb5,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05
 # GFX11: v_sin_f32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xb5,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
@@ -797,16 +909,32 @@
 # GFX11: v_sqrt_f32_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xb3,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
 
 0x05,0x00,0xdd,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05
-# GFX11: v_trunc_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdd,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+# GFX11-REAL16: v_trunc_f16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdd,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+# GFX11-FAKE16: v_trunc_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdd,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
 
 0x05,0x00,0xdd,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05
-# GFX11: v_trunc_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdd,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
+# GFX11-REAL16: v_trunc_f16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdd,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
+# GFX11-FAKE16: v_trunc_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdd,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
 
 0x05,0x00,0xdd,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05
-# GFX11: v_trunc_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdd,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
+# GFX11-REAL16: v_trunc_f16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdd,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
+# GFX11-FAKE16: v_trunc_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdd,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
 
 0xff,0x81,0xdd,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00
-# GFX11: v_trunc_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xdd,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
+# GFX11-REAL16: v_trunc_f16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xdd,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
+# GFX11-FAKE16: v_trunc_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xdd,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
+
+0x05,0x48,0xdd,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05
+# GFX11-REAL16: v_trunc_f16_e64_dpp v5.h, v1.h op_sel:[1,1] mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xdd,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
+# GFX11-FAKE16: v_trunc_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdd,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
+
+0x05,0x08,0xdd,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05
+# GFX11-REAL16: v_trunc_f16_e64_dpp v5.l, v1.h op_sel:[1,0] mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0xdd,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
+# GFX11-FAKE16: v_trunc_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdd,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
+
+0xff,0xc1,0xdd,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00
+# GFX11-REAL16: v_trunc_f16_e64_dpp v255.h, -|v255.l| op_sel:[0,1] clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc1,0xdd,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
+# GFX11-FAKE16: v_trunc_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xdd,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
 
 0x05,0x00,0xa1,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05
 # GFX11: v_trunc_f32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xa1,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_from_vop1.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_from_vop1.txt
index 3df206c..d7e7390 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_from_vop1.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_from_vop1.txt
@@ -281,49 +281,76 @@
 # GFX11: v_clz_i32_u32_e64 v255, 0xaf123456      ; encoding: [0xff,0x00,0xb9,0xd5,0xff,0x00,0x00,0x00,0x56,0x34,0x12,0xaf]
 
 0x05,0x00,0xe1,0xd5,0x01,0x01,0x00,0x00
-# GFX11: v_cos_f16_e64 v5, v1                    ; encoding: [0x05,0x00,0xe1,0xd5,0x01,0x01,0x00,0x00]
+# GFX11-REAL16: v_cos_f16_e64 v5.l, v1.l                ; encoding: [0x05,0x00,0xe1,0xd5,0x01,0x01,0x00,0x00]
+# GFX11-FAKE16: v_cos_f16_e64 v5, v1                    ; encoding: [0x05,0x00,0xe1,0xd5,0x01,0x01,0x00,0x00]
 
 0x05,0x00,0xe1,0xd5,0xff,0x01,0x00,0x00
-# GFX11: v_cos_f16_e64 v5, v255                  ; encoding: [0x05,0x00,0xe1,0xd5,0xff,0x01,0x00,0x00]
+# GFX11-REAL16: v_cos_f16_e64 v5.l, v255.l              ; encoding: [0x05,0x00,0xe1,0xd5,0xff,0x01,0x00,0x00]
+# GFX11-FAKE16: v_cos_f16_e64 v5, v255                  ; encoding: [0x05,0x00,0xe1,0xd5,0xff,0x01,0x00,0x00]
 
 0x05,0x00,0xe1,0xd5,0x01,0x00,0x00,0x00
-# GFX11: v_cos_f16_e64 v5, s1                    ; encoding: [0x05,0x00,0xe1,0xd5,0x01,0x00,0x00,0x00]
+# GFX11-REAL16: v_cos_f16_e64 v5.l, s1                  ; encoding: [0x05,0x00,0xe1,0xd5,0x01,0x00,0x00,0x00]
+# GFX11-FAKE16: v_cos_f16_e64 v5, s1                    ; encoding: [0x05,0x00,0xe1,0xd5,0x01,0x00,0x00,0x00]
 
 0x05,0x00,0xe1,0xd5,0x69,0x00,0x00,0x00
-# GFX11: v_cos_f16_e64 v5, s105                  ; encoding: [0x05,0x00,0xe1,0xd5,0x69,0x00,0x00,0x00]
+# GFX11-REAL16: v_cos_f16_e64 v5.l, s105                ; encoding: [0x05,0x00,0xe1,0xd5,0x69,0x00,0x00,0x00]
+# GFX11-FAKE16: v_cos_f16_e64 v5, s105                  ; encoding: [0x05,0x00,0xe1,0xd5,0x69,0x00,0x00,0x00]
 
 0x05,0x00,0xe1,0xd5,0x6a,0x00,0x00,0x00
-# GFX11: v_cos_f16_e64 v5, vcc_lo                ; encoding: [0x05,0x00,0xe1,0xd5,0x6a,0x00,0x00,0x00]
+# GFX11-REAL16: v_cos_f16_e64 v5.l, vcc_lo              ; encoding: [0x05,0x00,0xe1,0xd5,0x6a,0x00,0x00,0x00]
+# GFX11-FAKE16: v_cos_f16_e64 v5, vcc_lo                ; encoding: [0x05,0x00,0xe1,0xd5,0x6a,0x00,0x00,0x00]
 
 0x05,0x00,0xe1,0xd5,0x6b,0x00,0x00,0x00
-# GFX11: v_cos_f16_e64 v5, vcc_hi                ; encoding: [0x05,0x00,0xe1,0xd5,0x6b,0x00,0x00,0x00]
+# GFX11-REAL16: v_cos_f16_e64 v5.l, vcc_hi              ; encoding: [0x05,0x00,0xe1,0xd5,0x6b,0x00,0x00,0x00]
+# GFX11-FAKE16: v_cos_f16_e64 v5, vcc_hi                ; encoding: [0x05,0x00,0xe1,0xd5,0x6b,0x00,0x00,0x00]
 
 0x05,0x00,0xe1,0xd5,0x7b,0x00,0x00,0x00
-# GFX11: v_cos_f16_e64 v5, ttmp15                ; encoding: [0x05,0x00,0xe1,0xd5,0x7b,0x00,0x00,0x00]
+# GFX11-REAL16: v_cos_f16_e64 v5.l, ttmp15              ; encoding: [0x05,0x00,0xe1,0xd5,0x7b,0x00,0x00,0x00]
+# GFX11-FAKE16: v_cos_f16_e64 v5, ttmp15                ; encoding: [0x05,0x00,0xe1,0xd5,0x7b,0x00,0x00,0x00]
 
 0x05,0x00,0xe1,0xd5,0x7d,0x00,0x00,0x00
-# GFX11: v_cos_f16_e64 v5, m0                    ; encoding: [0x05,0x00,0xe1,0xd5,0x7d,0x00,0x00,0x00]
+# GFX11-REAL16: v_cos_f16_e64 v5.l, m0                  ; encoding: [0x05,0x00,0xe1,0xd5,0x7d,0x00,0x00,0x00]
+# GFX11-FAKE16: v_cos_f16_e64 v5, m0                    ; encoding: [0x05,0x00,0xe1,0xd5,0x7d,0x00,0x00,0x00]
 
 0x05,0x00,0xe1,0xd5,0x7e,0x00,0x00,0x00
-# GFX11: v_cos_f16_e64 v5, exec_lo               ; encoding: [0x05,0x00,0xe1,0xd5,0x7e,0x00,0x00,0x00]
+# GFX11-REAL16: v_cos_f16_e64 v5.l, exec_lo             ; encoding: [0x05,0x00,0xe1,0xd5,0x7e,0x00,0x00,0x00]
+# GFX11-FAKE16: v_cos_f16_e64 v5, exec_lo               ; encoding: [0x05,0x00,0xe1,0xd5,0x7e,0x00,0x00,0x00]
 
 0x05,0x00,0xe1,0xd5,0x7f,0x00,0x00,0x00
-# GFX11: v_cos_f16_e64 v5, exec_hi               ; encoding: [0x05,0x00,0xe1,0xd5,0x7f,0x00,0x00,0x00]
+# GFX11-REAL16: v_cos_f16_e64 v5.l, exec_hi             ; encoding: [0x05,0x00,0xe1,0xd5,0x7f,0x00,0x00,0x00]
+# GFX11-FAKE16: v_cos_f16_e64 v5, exec_hi               ; encoding: [0x05,0x00,0xe1,0xd5,0x7f,0x00,0x00,0x00]
 
 0x05,0x00,0xe1,0xd5,0x7c,0x00,0x00,0x00
-# GFX11: v_cos_f16_e64 v5, null                  ; encoding: [0x05,0x00,0xe1,0xd5,0x7c,0x00,0x00,0x00]
+# GFX11-REAL16: v_cos_f16_e64 v5.l, null                ; encoding: [0x05,0x00,0xe1,0xd5,0x7c,0x00,0x00,0x00]
+# GFX11-FAKE16: v_cos_f16_e64 v5, null                  ; encoding: [0x05,0x00,0xe1,0xd5,0x7c,0x00,0x00,0x00]
 
 0x05,0x00,0xe1,0xd5,0xc1,0x00,0x00,0x00
-# GFX11: v_cos_f16_e64 v5, -1                    ; encoding: [0x05,0x00,0xe1,0xd5,0xc1,0x00,0x00,0x00]
+# GFX11-REAL16: v_cos_f16_e64 v5.l, -1                  ; encoding: [0x05,0x00,0xe1,0xd5,0xc1,0x00,0x00,0x00]
+# GFX11-FAKE16: v_cos_f16_e64 v5, -1                    ; encoding: [0x05,0x00,0xe1,0xd5,0xc1,0x00,0x00,0x00]
 
 0x05,0x00,0xe1,0xd5,0xf0,0x00,0x00,0x08
-# GFX11: v_cos_f16_e64 v5, 0.5 mul:2             ; encoding: [0x05,0x00,0xe1,0xd5,0xf0,0x00,0x00,0x08]
+# GFX11-REAL16: v_cos_f16_e64 v5.l, 0.5 mul:2           ; encoding: [0x05,0x00,0xe1,0xd5,0xf0,0x00,0x00,0x08]
+# GFX11-FAKE16: v_cos_f16_e64 v5, 0.5 mul:2             ; encoding: [0x05,0x00,0xe1,0xd5,0xf0,0x00,0x00,0x08]
 
 0x05,0x00,0xe1,0xd5,0xfd,0x00,0x00,0x10
-# GFX11: v_cos_f16_e64 v5, src_scc mul:4         ; encoding: [0x05,0x00,0xe1,0xd5,0xfd,0x00,0x00,0x10]
+# GFX11-REAL16: v_cos_f16_e64 v5.l, src_scc mul:4       ; encoding: [0x05,0x00,0xe1,0xd5,0xfd,0x00,0x00,0x10]
+# GFX11-FAKE16: v_cos_f16_e64 v5, src_scc mul:4         ; encoding: [0x05,0x00,0xe1,0xd5,0xfd,0x00,0x00,0x10]
 
 0xff,0x81,0xe1,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00
-# GFX11: v_cos_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xe1,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00]
+# GFX11-REAL16: v_cos_f16_e64 v255.l, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xe1,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00]
+# GFX11-FAKE16: v_cos_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xe1,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00]
+
+0x05,0x48,0xe1,0xd5,0x01,0x01,0x00,0x00
+# GFX11-REAL16: v_cos_f16_e64 v5.h, v1.h op_sel:[1,1]   ; encoding: [0x05,0x48,0xe1,0xd5,0x01,0x01,0x00,0x00]
+# GFX11-FAKE16: v_cos_f16_e64 v5, v1                    ; encoding: [0x05,0x00,0xe1,0xd5,0x01,0x01,0x00,0x00]
+
+0x05,0x08,0xe1,0xd5,0xff,0x01,0x00,0x00
+# GFX11-REAL16: v_cos_f16_e64 v5.l, v255.h op_sel:[1,0] ; encoding: [0x05,0x08,0xe1,0xd5,0xff,0x01,0x00,0x00]
+# GFX11-FAKE16: v_cos_f16_e64 v5, v255                  ; encoding: [0x05,0x00,0xe1,0xd5,0xff,0x01,0x00,0x00]
+
+0xff,0xc1,0xe1,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00
+# GFX11-REAL16: v_cos_f16_e64 v255.h, -|0xfe0b| op_sel:[0,1] clamp div:2 ; encoding: [0xff,0xc1,0xe1,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00]
+# GFX11-FAKE16: v_cos_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xe1,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00]
 
 0x05,0x00,0xb6,0xd5,0x01,0x01,0x00,0x00
 # GFX11: v_cos_f32_e64 v5, v1                    ; encoding: [0x05,0x00,0xb6,0xd5,0x01,0x01,0x00,0x00]
@@ -1314,10 +1341,12 @@
 # GFX11: v_cvt_i32_f64_e64 v255, 0xaf123456 clamp ; encoding: [0xff,0x80,0x83,0xd5,0xff,0x00,0x00,0x00,0x56,0x34,0x12,0xaf]
 
 0x05,0x00,0xea,0xd5,0x01,0x01,0x00,0x00
-# GFX11: v_cvt_i32_i16_e64 v5, v1                ; encoding: [0x05,0x00,0xea,0xd5,0x01,0x01,0x00,0x00]
+# GFX11-REAL16: v_cvt_i32_i16_e64 v5, v1.l              ; encoding: [0x05,0x00,0xea,0xd5,0x01,0x01,0x00,0x00]
+# GFX11-FAKE16: v_cvt_i32_i16_e64 v5, v1                ; encoding: [0x05,0x00,0xea,0xd5,0x01,0x01,0x00,0x00]
 
 0x05,0x00,0xea,0xd5,0xff,0x01,0x00,0x00
-# GFX11: v_cvt_i32_i16_e64 v5, v255              ; encoding: [0x05,0x00,0xea,0xd5,0xff,0x01,0x00,0x00]
+# GFX11-REAL16: v_cvt_i32_i16_e64 v5, v255.l            ; encoding: [0x05,0x00,0xea,0xd5,0xff,0x01,0x00,0x00]
+# GFX11-FAKE16: v_cvt_i32_i16_e64 v5, v255              ; encoding: [0x05,0x00,0xea,0xd5,0xff,0x01,0x00,0x00]
 
 0x05,0x00,0xea,0xd5,0x01,0x00,0x00,0x00
 # GFX11: v_cvt_i32_i16_e64 v5, s1                ; encoding: [0x05,0x00,0xea,0xd5,0x01,0x00,0x00,0x00]
@@ -1358,6 +1387,10 @@
 0xff,0x00,0xea,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00
 # GFX11: v_cvt_i32_i16_e64 v255, 0xfe0b          ; encoding: [0xff,0x00,0xea,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00]
 
+0x05,0x08,0xea,0xd5,0xff,0x01,0x00,0x00
+# GFX11-REAL16: v_cvt_i32_i16_e64 v5, v255.h op_sel:[1,0] ; encoding: [0x05,0x08,0xea,0xd5,0xff,0x01,0x00,0x00]
+# GFX11-FAKE16: v_cvt_i32_i16_e64 v5, v255              ; encoding: [0x05,0x00,0xea,0xd5,0xff,0x01,0x00,0x00]
+
 0x05,0x00,0x8c,0xd5,0x01,0x01,0x00,0x00
 # GFX11: v_cvt_nearest_i32_f32_e64 v5, v1        ; encoding: [0x05,0x00,0x8c,0xd5,0x01,0x01,0x00,0x00]
 
@@ -1746,10 +1779,12 @@
 # GFX11: v_cvt_u32_f64_e64 v255, 0xaf123456 clamp ; encoding: [0xff,0x80,0x95,0xd5,0xff,0x00,0x00,0x00,0x56,0x34,0x12,0xaf]
 
 0x05,0x00,0xeb,0xd5,0x01,0x01,0x00,0x00
-# GFX11: v_cvt_u32_u16_e64 v5, v1                ; encoding: [0x05,0x00,0xeb,0xd5,0x01,0x01,0x00,0x00]
+# GFX11-REAL16: v_cvt_u32_u16_e64 v5, v1.l              ; encoding: [0x05,0x00,0xeb,0xd5,0x01,0x01,0x00,0x00]
+# GFX11-FAKE16: v_cvt_u32_u16_e64 v5, v1                ; encoding: [0x05,0x00,0xeb,0xd5,0x01,0x01,0x00,0x00]
 
 0x05,0x00,0xeb,0xd5,0xff,0x01,0x00,0x00
-# GFX11: v_cvt_u32_u16_e64 v5, v255              ; encoding: [0x05,0x00,0xeb,0xd5,0xff,0x01,0x00,0x00]
+# GFX11-REAL16: v_cvt_u32_u16_e64 v5, v255.l            ; encoding: [0x05,0x00,0xeb,0xd5,0xff,0x01,0x00,0x00]
+# GFX11-FAKE16: v_cvt_u32_u16_e64 v5, v255              ; encoding: [0x05,0x00,0xeb,0xd5,0xff,0x01,0x00,0x00]
 
 0x05,0x00,0xeb,0xd5,0x01,0x00,0x00,0x00
 # GFX11: v_cvt_u32_u16_e64 v5, s1                ; encoding: [0x05,0x00,0xeb,0xd5,0x01,0x00,0x00,0x00]
@@ -1790,6 +1825,10 @@
 0xff,0x00,0xeb,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00
 # GFX11: v_cvt_u32_u16_e64 v255, 0xfe0b          ; encoding: [0xff,0x00,0xeb,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00]
 
+0x05,0x08,0xeb,0xd5,0xff,0x01,0x00,0x00
+# GFX11-REAL16: v_cvt_u32_u16_e64 v5, v255.h op_sel:[1,0] ; encoding: [0x05,0x08,0xeb,0xd5,0xff,0x01,0x00,0x00]
+# GFX11-FAKE16: v_cvt_u32_u16_e64 v5, v255              ; encoding: [0x05,0x00,0xeb,0xd5,0xff,0x01,0x00,0x00]
+
 0x05,0x00,0xd8,0xd5,0x01,0x01,0x00,0x00
 # GFX11-REAL16: v_exp_f16_e64 v5.l, v1.l                ; encoding: [0x05,0x00,0xd8,0xd5,0x01,0x01,0x00,0x00]
 # GFX11-FAKE16: v_exp_f16_e64 v5, v1                    ; encoding: [0x05,0x00,0xd8,0xd5,0x01,0x01,0x00,0x00]
@@ -2037,49 +2076,76 @@
 # GFX11: v_floor_f64_e64 v[254:255], 0xaf123456 clamp div:2 ; encoding: [0xfe,0x80,0x9a,0xd5,0xff,0x00,0x00,0x18,0x56,0x34,0x12,0xaf]
 
 0x05,0x00,0xdf,0xd5,0x01,0x01,0x00,0x00
-# GFX11: v_fract_f16_e64 v5, v1                  ; encoding: [0x05,0x00,0xdf,0xd5,0x01,0x01,0x00,0x00]
+# GFX11-REAL16: v_fract_f16_e64 v5.l, v1.l              ; encoding: [0x05,0x00,0xdf,0xd5,0x01,0x01,0x00,0x00]
+# GFX11-FAKE16: v_fract_f16_e64 v5, v1                  ; encoding: [0x05,0x00,0xdf,0xd5,0x01,0x01,0x00,0x00]
 
 0x05,0x00,0xdf,0xd5,0xff,0x01,0x00,0x00
-# GFX11: v_fract_f16_e64 v5, v255                ; encoding: [0x05,0x00,0xdf,0xd5,0xff,0x01,0x00,0x00]
+# GFX11-REAL16: v_fract_f16_e64 v5.l, v255.l            ; encoding: [0x05,0x00,0xdf,0xd5,0xff,0x01,0x00,0x00]
+# GFX11-FAKE16: v_fract_f16_e64 v5, v255                ; encoding: [0x05,0x00,0xdf,0xd5,0xff,0x01,0x00,0x00]
 
 0x05,0x00,0xdf,0xd5,0x01,0x00,0x00,0x00
-# GFX11: v_fract_f16_e64 v5, s1                  ; encoding: [0x05,0x00,0xdf,0xd5,0x01,0x00,0x00,0x00]
+# GFX11-REAL16: v_fract_f16_e64 v5.l, s1                ; encoding: [0x05,0x00,0xdf,0xd5,0x01,0x00,0x00,0x00]
+# GFX11-FAKE16: v_fract_f16_e64 v5, s1                  ; encoding: [0x05,0x00,0xdf,0xd5,0x01,0x00,0x00,0x00]
 
 0x05,0x00,0xdf,0xd5,0x69,0x00,0x00,0x00
-# GFX11: v_fract_f16_e64 v5, s105                ; encoding: [0x05,0x00,0xdf,0xd5,0x69,0x00,0x00,0x00]
+# GFX11-REAL16: v_fract_f16_e64 v5.l, s105              ; encoding: [0x05,0x00,0xdf,0xd5,0x69,0x00,0x00,0x00]
+# GFX11-FAKE16: v_fract_f16_e64 v5, s105                ; encoding: [0x05,0x00,0xdf,0xd5,0x69,0x00,0x00,0x00]
 
 0x05,0x00,0xdf,0xd5,0x6a,0x00,0x00,0x00
-# GFX11: v_fract_f16_e64 v5, vcc_lo              ; encoding: [0x05,0x00,0xdf,0xd5,0x6a,0x00,0x00,0x00]
+# GFX11-REAL16: v_fract_f16_e64 v5.l, vcc_lo            ; encoding: [0x05,0x00,0xdf,0xd5,0x6a,0x00,0x00,0x00]
+# GFX11-FAKE16: v_fract_f16_e64 v5, vcc_lo              ; encoding: [0x05,0x00,0xdf,0xd5,0x6a,0x00,0x00,0x00]
 
 0x05,0x00,0xdf,0xd5,0x6b,0x00,0x00,0x00
-# GFX11: v_fract_f16_e64 v5, vcc_hi              ; encoding: [0x05,0x00,0xdf,0xd5,0x6b,0x00,0x00,0x00]
+# GFX11-REAL16: v_fract_f16_e64 v5.l, vcc_hi            ; encoding: [0x05,0x00,0xdf,0xd5,0x6b,0x00,0x00,0x00]
+# GFX11-FAKE16: v_fract_f16_e64 v5, vcc_hi              ; encoding: [0x05,0x00,0xdf,0xd5,0x6b,0x00,0x00,0x00]
 
 0x05,0x00,0xdf,0xd5,0x7b,0x00,0x00,0x00
-# GFX11: v_fract_f16_e64 v5, ttmp15              ; encoding: [0x05,0x00,0xdf,0xd5,0x7b,0x00,0x00,0x00]
+# GFX11-REAL16: v_fract_f16_e64 v5.l, ttmp15            ; encoding: [0x05,0x00,0xdf,0xd5,0x7b,0x00,0x00,0x00]
+# GFX11-FAKE16: v_fract_f16_e64 v5, ttmp15              ; encoding: [0x05,0x00,0xdf,0xd5,0x7b,0x00,0x00,0x00]
 
 0x05,0x00,0xdf,0xd5,0x7d,0x00,0x00,0x00
-# GFX11: v_fract_f16_e64 v5, m0                  ; encoding: [0x05,0x00,0xdf,0xd5,0x7d,0x00,0x00,0x00]
+# GFX11-REAL16: v_fract_f16_e64 v5.l, m0                ; encoding: [0x05,0x00,0xdf,0xd5,0x7d,0x00,0x00,0x00]
+# GFX11-FAKE16: v_fract_f16_e64 v5, m0                  ; encoding: [0x05,0x00,0xdf,0xd5,0x7d,0x00,0x00,0x00]
 
 0x05,0x00,0xdf,0xd5,0x7e,0x00,0x00,0x00
-# GFX11: v_fract_f16_e64 v5, exec_lo             ; encoding: [0x05,0x00,0xdf,0xd5,0x7e,0x00,0x00,0x00]
+# GFX11-REAL16: v_fract_f16_e64 v5.l, exec_lo           ; encoding: [0x05,0x00,0xdf,0xd5,0x7e,0x00,0x00,0x00]
+# GFX11-FAKE16: v_fract_f16_e64 v5, exec_lo             ; encoding: [0x05,0x00,0xdf,0xd5,0x7e,0x00,0x00,0x00]
 
 0x05,0x00,0xdf,0xd5,0x7f,0x00,0x00,0x00
-# GFX11: v_fract_f16_e64 v5, exec_hi             ; encoding: [0x05,0x00,0xdf,0xd5,0x7f,0x00,0x00,0x00]
+# GFX11-REAL16: v_fract_f16_e64 v5.l, exec_hi           ; encoding: [0x05,0x00,0xdf,0xd5,0x7f,0x00,0x00,0x00]
+# GFX11-FAKE16: v_fract_f16_e64 v5, exec_hi             ; encoding: [0x05,0x00,0xdf,0xd5,0x7f,0x00,0x00,0x00]
 
 0x05,0x00,0xdf,0xd5,0x7c,0x00,0x00,0x00
-# GFX11: v_fract_f16_e64 v5, null                ; encoding: [0x05,0x00,0xdf,0xd5,0x7c,0x00,0x00,0x00]
+# GFX11-REAL16: v_fract_f16_e64 v5.l, null              ; encoding: [0x05,0x00,0xdf,0xd5,0x7c,0x00,0x00,0x00]
+# GFX11-FAKE16: v_fract_f16_e64 v5, null                ; encoding: [0x05,0x00,0xdf,0xd5,0x7c,0x00,0x00,0x00]
 
 0x05,0x00,0xdf,0xd5,0xc1,0x00,0x00,0x00
-# GFX11: v_fract_f16_e64 v5, -1                  ; encoding: [0x05,0x00,0xdf,0xd5,0xc1,0x00,0x00,0x00]
+# GFX11-REAL16: v_fract_f16_e64 v5.l, -1                ; encoding: [0x05,0x00,0xdf,0xd5,0xc1,0x00,0x00,0x00]
+# GFX11-FAKE16: v_fract_f16_e64 v5, -1                  ; encoding: [0x05,0x00,0xdf,0xd5,0xc1,0x00,0x00,0x00]
 
 0x05,0x00,0xdf,0xd5,0xf0,0x00,0x00,0x08
-# GFX11: v_fract_f16_e64 v5, 0.5 mul:2           ; encoding: [0x05,0x00,0xdf,0xd5,0xf0,0x00,0x00,0x08]
+# GFX11-REAL16: v_fract_f16_e64 v5.l, 0.5 mul:2         ; encoding: [0x05,0x00,0xdf,0xd5,0xf0,0x00,0x00,0x08]
+# GFX11-FAKE16: v_fract_f16_e64 v5, 0.5 mul:2           ; encoding: [0x05,0x00,0xdf,0xd5,0xf0,0x00,0x00,0x08]
 
 0x05,0x00,0xdf,0xd5,0xfd,0x00,0x00,0x10
-# GFX11: v_fract_f16_e64 v5, src_scc mul:4       ; encoding: [0x05,0x00,0xdf,0xd5,0xfd,0x00,0x00,0x10]
+# GFX11-REAL16: v_fract_f16_e64 v5.l, src_scc mul:4     ; encoding: [0x05,0x00,0xdf,0xd5,0xfd,0x00,0x00,0x10]
+# GFX11-FAKE16: v_fract_f16_e64 v5, src_scc mul:4       ; encoding: [0x05,0x00,0xdf,0xd5,0xfd,0x00,0x00,0x10]
 
 0xff,0x81,0xdf,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00
-# GFX11: v_fract_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xdf,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00]
+# GFX11-REAL16: v_fract_f16_e64 v255.l, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xdf,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00]
+# GFX11-FAKE16: v_fract_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xdf,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00]
+
+0x05,0x48,0xdf,0xd5,0x01,0x01,0x00,0x00
+# GFX11-REAL16: v_fract_f16_e64 v5.h, v1.h op_sel:[1,1] ; encoding: [0x05,0x48,0xdf,0xd5,0x01,0x01,0x00,0x00]
+# GFX11-FAKE16: v_fract_f16_e64 v5, v1                  ; encoding: [0x05,0x00,0xdf,0xd5,0x01,0x01,0x00,0x00]
+
+0x05,0x08,0xdf,0xd5,0xff,0x01,0x00,0x00
+# GFX11-REAL16: v_fract_f16_e64 v5.l, v255.h op_sel:[1,0] ; encoding: [0x05,0x08,0xdf,0xd5,0xff,0x01,0x00,0x00]
+# GFX11-FAKE16: v_fract_f16_e64 v5, v255                ; encoding: [0x05,0x00,0xdf,0xd5,0xff,0x01,0x00,0x00]
+
+0xff,0xc1,0xdf,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00
+# GFX11-REAL16: v_fract_f16_e64 v255.h, -|0xfe0b| op_sel:[0,1] clamp div:2 ; encoding: [0xff,0xc1,0xdf,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00]
+# GFX11-FAKE16: v_fract_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xdf,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00]
 
 0x05,0x00,0xa0,0xd5,0x01,0x01,0x00,0x00
 # GFX11: v_fract_f32_e64 v5, v1                  ; encoding: [0x05,0x00,0xa0,0xd5,0x01,0x01,0x00,0x00]
@@ -2316,49 +2382,76 @@
 # GFX11: v_frexp_exp_i32_f64_e64 v255, 0xaf123456 ; encoding: [0xff,0x00,0xbc,0xd5,0xff,0x00,0x00,0x00,0x56,0x34,0x12,0xaf]
 
 0x05,0x00,0xd9,0xd5,0x01,0x01,0x00,0x00
-# GFX11: v_frexp_mant_f16_e64 v5, v1             ; encoding: [0x05,0x00,0xd9,0xd5,0x01,0x01,0x00,0x00]
+# GFX11-REAL16: v_frexp_mant_f16_e64 v5.l, v1.l         ; encoding: [0x05,0x00,0xd9,0xd5,0x01,0x01,0x00,0x00]
+# GFX11-FAKE16: v_frexp_mant_f16_e64 v5, v1             ; encoding: [0x05,0x00,0xd9,0xd5,0x01,0x01,0x00,0x00]
 
 0x05,0x00,0xd9,0xd5,0xff,0x01,0x00,0x00
-# GFX11: v_frexp_mant_f16_e64 v5, v255           ; encoding: [0x05,0x00,0xd9,0xd5,0xff,0x01,0x00,0x00]
+# GFX11-REAL16: v_frexp_mant_f16_e64 v5.l, v255.l       ; encoding: [0x05,0x00,0xd9,0xd5,0xff,0x01,0x00,0x00]
+# GFX11-FAKE16: v_frexp_mant_f16_e64 v5, v255           ; encoding: [0x05,0x00,0xd9,0xd5,0xff,0x01,0x00,0x00]
 
 0x05,0x00,0xd9,0xd5,0x01,0x00,0x00,0x00
-# GFX11: v_frexp_mant_f16_e64 v5, s1             ; encoding: [0x05,0x00,0xd9,0xd5,0x01,0x00,0x00,0x00]
+# GFX11-REAL16: v_frexp_mant_f16_e64 v5.l, s1           ; encoding: [0x05,0x00,0xd9,0xd5,0x01,0x00,0x00,0x00]
+# GFX11-FAKE16: v_frexp_mant_f16_e64 v5, s1             ; encoding: [0x05,0x00,0xd9,0xd5,0x01,0x00,0x00,0x00]
 
 0x05,0x00,0xd9,0xd5,0x69,0x00,0x00,0x00
-# GFX11: v_frexp_mant_f16_e64 v5, s105           ; encoding: [0x05,0x00,0xd9,0xd5,0x69,0x00,0x00,0x00]
+# GFX11-REAL16: v_frexp_mant_f16_e64 v5.l, s105         ; encoding: [0x05,0x00,0xd9,0xd5,0x69,0x00,0x00,0x00]
+# GFX11-FAKE16: v_frexp_mant_f16_e64 v5, s105           ; encoding: [0x05,0x00,0xd9,0xd5,0x69,0x00,0x00,0x00]
 
 0x05,0x00,0xd9,0xd5,0x6a,0x00,0x00,0x00
-# GFX11: v_frexp_mant_f16_e64 v5, vcc_lo         ; encoding: [0x05,0x00,0xd9,0xd5,0x6a,0x00,0x00,0x00]
+# GFX11-REAL16: v_frexp_mant_f16_e64 v5.l, vcc_lo       ; encoding: [0x05,0x00,0xd9,0xd5,0x6a,0x00,0x00,0x00]
+# GFX11-FAKE16: v_frexp_mant_f16_e64 v5, vcc_lo         ; encoding: [0x05,0x00,0xd9,0xd5,0x6a,0x00,0x00,0x00]
 
 0x05,0x00,0xd9,0xd5,0x6b,0x00,0x00,0x00
-# GFX11: v_frexp_mant_f16_e64 v5, vcc_hi         ; encoding: [0x05,0x00,0xd9,0xd5,0x6b,0x00,0x00,0x00]
+# GFX11-REAL16: v_frexp_mant_f16_e64 v5.l, vcc_hi       ; encoding: [0x05,0x00,0xd9,0xd5,0x6b,0x00,0x00,0x00]
+# GFX11-FAKE16: v_frexp_mant_f16_e64 v5, vcc_hi         ; encoding: [0x05,0x00,0xd9,0xd5,0x6b,0x00,0x00,0x00]
 
 0x05,0x00,0xd9,0xd5,0x7b,0x00,0x00,0x00
-# GFX11: v_frexp_mant_f16_e64 v5, ttmp15         ; encoding: [0x05,0x00,0xd9,0xd5,0x7b,0x00,0x00,0x00]
+# GFX11-REAL16: v_frexp_mant_f16_e64 v5.l, ttmp15       ; encoding: [0x05,0x00,0xd9,0xd5,0x7b,0x00,0x00,0x00]
+# GFX11-FAKE16: v_frexp_mant_f16_e64 v5, ttmp15         ; encoding: [0x05,0x00,0xd9,0xd5,0x7b,0x00,0x00,0x00]
 
 0x05,0x00,0xd9,0xd5,0x7d,0x00,0x00,0x00
-# GFX11: v_frexp_mant_f16_e64 v5, m0             ; encoding: [0x05,0x00,0xd9,0xd5,0x7d,0x00,0x00,0x00]
+# GFX11-REAL16: v_frexp_mant_f16_e64 v5.l, m0           ; encoding: [0x05,0x00,0xd9,0xd5,0x7d,0x00,0x00,0x00]
+# GFX11-FAKE16: v_frexp_mant_f16_e64 v5, m0             ; encoding: [0x05,0x00,0xd9,0xd5,0x7d,0x00,0x00,0x00]
 
 0x05,0x00,0xd9,0xd5,0x7e,0x00,0x00,0x00
-# GFX11: v_frexp_mant_f16_e64 v5, exec_lo        ; encoding: [0x05,0x00,0xd9,0xd5,0x7e,0x00,0x00,0x00]
+# GFX11-REAL16: v_frexp_mant_f16_e64 v5.l, exec_lo      ; encoding: [0x05,0x00,0xd9,0xd5,0x7e,0x00,0x00,0x00]
+# GFX11-FAKE16: v_frexp_mant_f16_e64 v5, exec_lo        ; encoding: [0x05,0x00,0xd9,0xd5,0x7e,0x00,0x00,0x00]
 
 0x05,0x00,0xd9,0xd5,0x7f,0x00,0x00,0x00
-# GFX11: v_frexp_mant_f16_e64 v5, exec_hi        ; encoding: [0x05,0x00,0xd9,0xd5,0x7f,0x00,0x00,0x00]
+# GFX11-REAL16: v_frexp_mant_f16_e64 v5.l, exec_hi      ; encoding: [0x05,0x00,0xd9,0xd5,0x7f,0x00,0x00,0x00]
+# GFX11-FAKE16: v_frexp_mant_f16_e64 v5, exec_hi        ; encoding: [0x05,0x00,0xd9,0xd5,0x7f,0x00,0x00,0x00]
 
 0x05,0x00,0xd9,0xd5,0x7c,0x00,0x00,0x00
-# GFX11: v_frexp_mant_f16_e64 v5, null           ; encoding: [0x05,0x00,0xd9,0xd5,0x7c,0x00,0x00,0x00]
+# GFX11-REAL16: v_frexp_mant_f16_e64 v5.l, null         ; encoding: [0x05,0x00,0xd9,0xd5,0x7c,0x00,0x00,0x00]
+# GFX11-FAKE16: v_frexp_mant_f16_e64 v5, null           ; encoding: [0x05,0x00,0xd9,0xd5,0x7c,0x00,0x00,0x00]
 
 0x05,0x00,0xd9,0xd5,0xc1,0x00,0x00,0x00
-# GFX11: v_frexp_mant_f16_e64 v5, -1             ; encoding: [0x05,0x00,0xd9,0xd5,0xc1,0x00,0x00,0x00]
+# GFX11-REAL16: v_frexp_mant_f16_e64 v5.l, -1           ; encoding: [0x05,0x00,0xd9,0xd5,0xc1,0x00,0x00,0x00]
+# GFX11-FAKE16: v_frexp_mant_f16_e64 v5, -1             ; encoding: [0x05,0x00,0xd9,0xd5,0xc1,0x00,0x00,0x00]
 
 0x05,0x00,0xd9,0xd5,0xf0,0x00,0x00,0x08
-# GFX11: v_frexp_mant_f16_e64 v5, 0.5 mul:2      ; encoding: [0x05,0x00,0xd9,0xd5,0xf0,0x00,0x00,0x08]
+# GFX11-REAL16: v_frexp_mant_f16_e64 v5.l, 0.5 mul:2    ; encoding: [0x05,0x00,0xd9,0xd5,0xf0,0x00,0x00,0x08]
+# GFX11-FAKE16: v_frexp_mant_f16_e64 v5, 0.5 mul:2      ; encoding: [0x05,0x00,0xd9,0xd5,0xf0,0x00,0x00,0x08]
 
 0x05,0x00,0xd9,0xd5,0xfd,0x00,0x00,0x10
-# GFX11: v_frexp_mant_f16_e64 v5, src_scc mul:4  ; encoding: [0x05,0x00,0xd9,0xd5,0xfd,0x00,0x00,0x10]
+# GFX11-REAL16: v_frexp_mant_f16_e64 v5.l, src_scc mul:4 ; encoding: [0x05,0x00,0xd9,0xd5,0xfd,0x00,0x00,0x10]
+# GFX11-FAKE16: v_frexp_mant_f16_e64 v5, src_scc mul:4  ; encoding: [0x05,0x00,0xd9,0xd5,0xfd,0x00,0x00,0x10]
 
 0xff,0x81,0xd9,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00
-# GFX11: v_frexp_mant_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xd9,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00]
+# GFX11-REAL16: v_frexp_mant_f16_e64 v255.l, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xd9,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00]
+# GFX11-FAKE16: v_frexp_mant_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xd9,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00]
+
+0x05,0x48,0xd9,0xd5,0x01,0x01,0x00,0x00
+# GFX11-REAL16: v_frexp_mant_f16_e64 v5.h, v1.h op_sel:[1,1] ; encoding: [0x05,0x48,0xd9,0xd5,0x01,0x01,0x00,0x00]
+# GFX11-FAKE16: v_frexp_mant_f16_e64 v5, v1             ; encoding: [0x05,0x00,0xd9,0xd5,0x01,0x01,0x00,0x00]
+
+0x05,0x08,0xd9,0xd5,0xff,0x01,0x00,0x00
+# GFX11-REAL16: v_frexp_mant_f16_e64 v5.l, v255.h op_sel:[1,0] ; encoding: [0x05,0x08,0xd9,0xd5,0xff,0x01,0x00,0x00]
+# GFX11-FAKE16: v_frexp_mant_f16_e64 v5, v255           ; encoding: [0x05,0x00,0xd9,0xd5,0xff,0x01,0x00,0x00]
+
+0xff,0xc1,0xd9,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00
+# GFX11-REAL16: v_frexp_mant_f16_e64 v255.h, -|0xfe0b| op_sel:[0,1] clamp div:2 ; encoding: [0xff,0xc1,0xd9,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00]
+# GFX11-FAKE16: v_frexp_mant_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xd9,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00]
 
 0x05,0x00,0xc0,0xd5,0x01,0x01,0x00,0x00
 # GFX11: v_frexp_mant_f32_e64 v5, v1             ; encoding: [0x05,0x00,0xc0,0xd5,0x01,0x01,0x00,0x00]
@@ -2658,49 +2751,76 @@
 # GFX11: v_nop                                   ; encoding: [0x00,0x00,0x80,0xd5,0x00,0x00,0x00,0x00]
 
 0x05,0x00,0xe9,0xd5,0x01,0x01,0x00,0x00
-# GFX11: v_not_b16_e64 v5, v1                    ; encoding: [0x05,0x00,0xe9,0xd5,0x01,0x01,0x00,0x00]
+# GFX11-REAL16: v_not_b16_e64 v5.l, v1.l                ; encoding: [0x05,0x00,0xe9,0xd5,0x01,0x01,0x00,0x00]
+# GFX11-FAKE16: v_not_b16_e64 v5, v1                    ; encoding: [0x05,0x00,0xe9,0xd5,0x01,0x01,0x00,0x00]
 
 0x05,0x00,0xe9,0xd5,0xff,0x01,0x00,0x00
-# GFX11: v_not_b16_e64 v5, v255                  ; encoding: [0x05,0x00,0xe9,0xd5,0xff,0x01,0x00,0x00]
+# GFX11-REAL16: v_not_b16_e64 v5.l, v255.l              ; encoding: [0x05,0x00,0xe9,0xd5,0xff,0x01,0x00,0x00]
+# GFX11-FAKE16: v_not_b16_e64 v5, v255                  ; encoding: [0x05,0x00,0xe9,0xd5,0xff,0x01,0x00,0x00]
 
 0x05,0x00,0xe9,0xd5,0x01,0x00,0x00,0x00
-# GFX11: v_not_b16_e64 v5, s1                    ; encoding: [0x05,0x00,0xe9,0xd5,0x01,0x00,0x00,0x00]
+# GFX11-REAL16: v_not_b16_e64 v5.l, s1                  ; encoding: [0x05,0x00,0xe9,0xd5,0x01,0x00,0x00,0x00]
+# GFX11-FAKE16: v_not_b16_e64 v5, s1                    ; encoding: [0x05,0x00,0xe9,0xd5,0x01,0x00,0x00,0x00]
 
 0x05,0x00,0xe9,0xd5,0x69,0x00,0x00,0x00
-# GFX11: v_not_b16_e64 v5, s105                  ; encoding: [0x05,0x00,0xe9,0xd5,0x69,0x00,0x00,0x00]
+# GFX11-REAL16: v_not_b16_e64 v5.l, s105                ; encoding: [0x05,0x00,0xe9,0xd5,0x69,0x00,0x00,0x00]
+# GFX11-FAKE16: v_not_b16_e64 v5, s105                  ; encoding: [0x05,0x00,0xe9,0xd5,0x69,0x00,0x00,0x00]
 
 0x05,0x00,0xe9,0xd5,0x6a,0x00,0x00,0x00
-# GFX11: v_not_b16_e64 v5, vcc_lo                ; encoding: [0x05,0x00,0xe9,0xd5,0x6a,0x00,0x00,0x00]
+# GFX11-REAL16: v_not_b16_e64 v5.l, vcc_lo              ; encoding: [0x05,0x00,0xe9,0xd5,0x6a,0x00,0x00,0x00]
+# GFX11-FAKE16: v_not_b16_e64 v5, vcc_lo                ; encoding: [0x05,0x00,0xe9,0xd5,0x6a,0x00,0x00,0x00]
 
 0x05,0x00,0xe9,0xd5,0x6b,0x00,0x00,0x00
-# GFX11: v_not_b16_e64 v5, vcc_hi                ; encoding: [0x05,0x00,0xe9,0xd5,0x6b,0x00,0x00,0x00]
+# GFX11-REAL16: v_not_b16_e64 v5.l, vcc_hi              ; encoding: [0x05,0x00,0xe9,0xd5,0x6b,0x00,0x00,0x00]
+# GFX11-FAKE16: v_not_b16_e64 v5, vcc_hi                ; encoding: [0x05,0x00,0xe9,0xd5,0x6b,0x00,0x00,0x00]
 
 0x05,0x00,0xe9,0xd5,0x7b,0x00,0x00,0x00
-# GFX11: v_not_b16_e64 v5, ttmp15                ; encoding: [0x05,0x00,0xe9,0xd5,0x7b,0x00,0x00,0x00]
+# GFX11-REAL16: v_not_b16_e64 v5.l, ttmp15              ; encoding: [0x05,0x00,0xe9,0xd5,0x7b,0x00,0x00,0x00]
+# GFX11-FAKE16: v_not_b16_e64 v5, ttmp15                ; encoding: [0x05,0x00,0xe9,0xd5,0x7b,0x00,0x00,0x00]
 
 0x05,0x00,0xe9,0xd5,0x7d,0x00,0x00,0x00
-# GFX11: v_not_b16_e64 v5, m0                    ; encoding: [0x05,0x00,0xe9,0xd5,0x7d,0x00,0x00,0x00]
+# GFX11-REAL16: v_not_b16_e64 v5.l, m0                  ; encoding: [0x05,0x00,0xe9,0xd5,0x7d,0x00,0x00,0x00]
+# GFX11-FAKE16: v_not_b16_e64 v5, m0                    ; encoding: [0x05,0x00,0xe9,0xd5,0x7d,0x00,0x00,0x00]
 
 0x05,0x00,0xe9,0xd5,0x7e,0x00,0x00,0x00
-# GFX11: v_not_b16_e64 v5, exec_lo               ; encoding: [0x05,0x00,0xe9,0xd5,0x7e,0x00,0x00,0x00]
+# GFX11-REAL16: v_not_b16_e64 v5.l, exec_lo             ; encoding: [0x05,0x00,0xe9,0xd5,0x7e,0x00,0x00,0x00]
+# GFX11-FAKE16: v_not_b16_e64 v5, exec_lo               ; encoding: [0x05,0x00,0xe9,0xd5,0x7e,0x00,0x00,0x00]
 
 0x05,0x00,0xe9,0xd5,0x7f,0x00,0x00,0x00
-# GFX11: v_not_b16_e64 v5, exec_hi               ; encoding: [0x05,0x00,0xe9,0xd5,0x7f,0x00,0x00,0x00]
+# GFX11-REAL16: v_not_b16_e64 v5.l, exec_hi             ; encoding: [0x05,0x00,0xe9,0xd5,0x7f,0x00,0x00,0x00]
+# GFX11-FAKE16: v_not_b16_e64 v5, exec_hi               ; encoding: [0x05,0x00,0xe9,0xd5,0x7f,0x00,0x00,0x00]
 
 0x05,0x00,0xe9,0xd5,0x7c,0x00,0x00,0x00
-# GFX11: v_not_b16_e64 v5, null                  ; encoding: [0x05,0x00,0xe9,0xd5,0x7c,0x00,0x00,0x00]
+# GFX11-REAL16: v_not_b16_e64 v5.l, null                ; encoding: [0x05,0x00,0xe9,0xd5,0x7c,0x00,0x00,0x00]
+# GFX11-FAKE16: v_not_b16_e64 v5, null                  ; encoding: [0x05,0x00,0xe9,0xd5,0x7c,0x00,0x00,0x00]
 
 0x05,0x00,0xe9,0xd5,0xc1,0x00,0x00,0x00
-# GFX11: v_not_b16_e64 v5, -1                    ; encoding: [0x05,0x00,0xe9,0xd5,0xc1,0x00,0x00,0x00]
+# GFX11-REAL16: v_not_b16_e64 v5.l, -1                  ; encoding: [0x05,0x00,0xe9,0xd5,0xc1,0x00,0x00,0x00]
+# GFX11-FAKE16: v_not_b16_e64 v5, -1                    ; encoding: [0x05,0x00,0xe9,0xd5,0xc1,0x00,0x00,0x00]
 
 0x05,0x00,0xe9,0xd5,0xf0,0x00,0x00,0x00
-# GFX11: v_not_b16_e64 v5, 0x3800                ; encoding: [0x05,0x00,0xe9,0xd5,0xff,0x00,0x00,0x00,0x00,0x38,0x00,0x00]
+# GFX11-REAL16: v_not_b16_e64 v5.l, 0x3800              ; encoding: [0x05,0x00,0xe9,0xd5,0xff,0x00,0x00,0x00,0x00,0x38,0x00,0x00]
+# GFX11-FAKE16: v_not_b16_e64 v5, 0x3800                ; encoding: [0x05,0x00,0xe9,0xd5,0xff,0x00,0x00,0x00,0x00,0x38,0x00,0x00]
 
 0x05,0x00,0xe9,0xd5,0xfd,0x00,0x00,0x00
-# GFX11: v_not_b16_e64 v5, src_scc               ; encoding: [0x05,0x00,0xe9,0xd5,0xfd,0x00,0x00,0x00]
+# GFX11-REAL16: v_not_b16_e64 v5.l, src_scc             ; encoding: [0x05,0x00,0xe9,0xd5,0xfd,0x00,0x00,0x00]
+# GFX11-FAKE16: v_not_b16_e64 v5, src_scc               ; encoding: [0x05,0x00,0xe9,0xd5,0xfd,0x00,0x00,0x00]
 
 0xff,0x00,0xe9,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00
-# GFX11: v_not_b16_e64 v255, 0xfe0b              ; encoding: [0xff,0x00,0xe9,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00]
+# GFX11-REAL16: v_not_b16_e64 v255.l, 0xfe0b            ; encoding: [0xff,0x00,0xe9,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00]
+# GFX11-FAKE16: v_not_b16_e64 v255, 0xfe0b              ; encoding: [0xff,0x00,0xe9,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00]
+
+0x05,0x48,0xe9,0xd5,0x01,0x01,0x00,0x00
+# GFX11-REAL16: v_not_b16_e64 v5.h, v1.h op_sel:[1,1]   ; encoding: [0x05,0x48,0xe9,0xd5,0x01,0x01,0x00,0x00]
+# GFX11-FAKE16: v_not_b16_e64 v5, v1                    ; encoding: [0x05,0x00,0xe9,0xd5,0x01,0x01,0x00,0x00]
+
+0x05,0x08,0xe9,0xd5,0xff,0x01,0x00,0x00
+# GFX11-REAL16: v_not_b16_e64 v5.l, v255.h op_sel:[1,0] ; encoding: [0x05,0x08,0xe9,0xd5,0xff,0x01,0x00,0x00]
+# GFX11-FAKE16: v_not_b16_e64 v5, v255                  ; encoding: [0x05,0x00,0xe9,0xd5,0xff,0x01,0x00,0x00]
+
+0xff,0x40,0xe9,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00
+# GFX11-REAL16: v_not_b16_e64 v255.h, 0xfe0b op_sel:[0,1] ; encoding: [0xff,0x40,0xe9,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00]
+# GFX11-FAKE16: v_not_b16_e64 v255, 0xfe0b              ; encoding: [0xff,0x00,0xe9,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00]
 
 0x05,0x00,0xb7,0xd5,0x01,0x01,0x00,0x00
 # GFX11: v_not_b32_e64 v5, v1                    ; encoding: [0x05,0x00,0xb7,0xd5,0x01,0x01,0x00,0x00]
@@ -2937,49 +3057,76 @@
 # GFX11: v_rcp_iflag_f32_e64 v255, -|0xaf123456| clamp div:2 ; encoding: [0xff,0x81,0xab,0xd5,0xff,0x00,0x00,0x38,0x56,0x34,0x12,0xaf]
 
 0x05,0x00,0xde,0xd5,0x01,0x01,0x00,0x00
-# GFX11: v_rndne_f16_e64 v5, v1                  ; encoding: [0x05,0x00,0xde,0xd5,0x01,0x01,0x00,0x00]
+# GFX11-REAL16: v_rndne_f16_e64 v5.l, v1.l              ; encoding: [0x05,0x00,0xde,0xd5,0x01,0x01,0x00,0x00]
+# GFX11-FAKE16: v_rndne_f16_e64 v5, v1                  ; encoding: [0x05,0x00,0xde,0xd5,0x01,0x01,0x00,0x00]
 
 0x05,0x00,0xde,0xd5,0xff,0x01,0x00,0x00
-# GFX11: v_rndne_f16_e64 v5, v255                ; encoding: [0x05,0x00,0xde,0xd5,0xff,0x01,0x00,0x00]
+# GFX11-REAL16: v_rndne_f16_e64 v5.l, v255.l            ; encoding: [0x05,0x00,0xde,0xd5,0xff,0x01,0x00,0x00]
+# GFX11-FAKE16: v_rndne_f16_e64 v5, v255                ; encoding: [0x05,0x00,0xde,0xd5,0xff,0x01,0x00,0x00]
 
 0x05,0x00,0xde,0xd5,0x01,0x00,0x00,0x00
-# GFX11: v_rndne_f16_e64 v5, s1                  ; encoding: [0x05,0x00,0xde,0xd5,0x01,0x00,0x00,0x00]
+# GFX11-REAL16: v_rndne_f16_e64 v5.l, s1                ; encoding: [0x05,0x00,0xde,0xd5,0x01,0x00,0x00,0x00]
+# GFX11-FAKE16: v_rndne_f16_e64 v5, s1                  ; encoding: [0x05,0x00,0xde,0xd5,0x01,0x00,0x00,0x00]
 
 0x05,0x00,0xde,0xd5,0x69,0x00,0x00,0x00
-# GFX11: v_rndne_f16_e64 v5, s105                ; encoding: [0x05,0x00,0xde,0xd5,0x69,0x00,0x00,0x00]
+# GFX11-REAL16: v_rndne_f16_e64 v5.l, s105              ; encoding: [0x05,0x00,0xde,0xd5,0x69,0x00,0x00,0x00]
+# GFX11-FAKE16: v_rndne_f16_e64 v5, s105                ; encoding: [0x05,0x00,0xde,0xd5,0x69,0x00,0x00,0x00]
 
 0x05,0x00,0xde,0xd5,0x6a,0x00,0x00,0x00
-# GFX11: v_rndne_f16_e64 v5, vcc_lo              ; encoding: [0x05,0x00,0xde,0xd5,0x6a,0x00,0x00,0x00]
+# GFX11-REAL16: v_rndne_f16_e64 v5.l, vcc_lo            ; encoding: [0x05,0x00,0xde,0xd5,0x6a,0x00,0x00,0x00]
+# GFX11-FAKE16: v_rndne_f16_e64 v5, vcc_lo              ; encoding: [0x05,0x00,0xde,0xd5,0x6a,0x00,0x00,0x00]
 
 0x05,0x00,0xde,0xd5,0x6b,0x00,0x00,0x00
-# GFX11: v_rndne_f16_e64 v5, vcc_hi              ; encoding: [0x05,0x00,0xde,0xd5,0x6b,0x00,0x00,0x00]
+# GFX11-REAL16: v_rndne_f16_e64 v5.l, vcc_hi            ; encoding: [0x05,0x00,0xde,0xd5,0x6b,0x00,0x00,0x00]
+# GFX11-FAKE16: v_rndne_f16_e64 v5, vcc_hi              ; encoding: [0x05,0x00,0xde,0xd5,0x6b,0x00,0x00,0x00]
 
 0x05,0x00,0xde,0xd5,0x7b,0x00,0x00,0x00
-# GFX11: v_rndne_f16_e64 v5, ttmp15              ; encoding: [0x05,0x00,0xde,0xd5,0x7b,0x00,0x00,0x00]
+# GFX11-REAL16: v_rndne_f16_e64 v5.l, ttmp15            ; encoding: [0x05,0x00,0xde,0xd5,0x7b,0x00,0x00,0x00]
+# GFX11-FAKE16: v_rndne_f16_e64 v5, ttmp15              ; encoding: [0x05,0x00,0xde,0xd5,0x7b,0x00,0x00,0x00]
 
 0x05,0x00,0xde,0xd5,0x7d,0x00,0x00,0x00
-# GFX11: v_rndne_f16_e64 v5, m0                  ; encoding: [0x05,0x00,0xde,0xd5,0x7d,0x00,0x00,0x00]
+# GFX11-REAL16: v_rndne_f16_e64 v5.l, m0                ; encoding: [0x05,0x00,0xde,0xd5,0x7d,0x00,0x00,0x00]
+# GFX11-FAKE16: v_rndne_f16_e64 v5, m0                  ; encoding: [0x05,0x00,0xde,0xd5,0x7d,0x00,0x00,0x00]
 
 0x05,0x00,0xde,0xd5,0x7e,0x00,0x00,0x00
-# GFX11: v_rndne_f16_e64 v5, exec_lo             ; encoding: [0x05,0x00,0xde,0xd5,0x7e,0x00,0x00,0x00]
+# GFX11-REAL16: v_rndne_f16_e64 v5.l, exec_lo           ; encoding: [0x05,0x00,0xde,0xd5,0x7e,0x00,0x00,0x00]
+# GFX11-FAKE16: v_rndne_f16_e64 v5, exec_lo             ; encoding: [0x05,0x00,0xde,0xd5,0x7e,0x00,0x00,0x00]
 
 0x05,0x00,0xde,0xd5,0x7f,0x00,0x00,0x00
-# GFX11: v_rndne_f16_e64 v5, exec_hi             ; encoding: [0x05,0x00,0xde,0xd5,0x7f,0x00,0x00,0x00]
+# GFX11-REAL16: v_rndne_f16_e64 v5.l, exec_hi           ; encoding: [0x05,0x00,0xde,0xd5,0x7f,0x00,0x00,0x00]
+# GFX11-FAKE16: v_rndne_f16_e64 v5, exec_hi             ; encoding: [0x05,0x00,0xde,0xd5,0x7f,0x00,0x00,0x00]
 
 0x05,0x00,0xde,0xd5,0x7c,0x00,0x00,0x00
-# GFX11: v_rndne_f16_e64 v5, null                ; encoding: [0x05,0x00,0xde,0xd5,0x7c,0x00,0x00,0x00]
+# GFX11-REAL16: v_rndne_f16_e64 v5.l, null              ; encoding: [0x05,0x00,0xde,0xd5,0x7c,0x00,0x00,0x00]
+# GFX11-FAKE16: v_rndne_f16_e64 v5, null                ; encoding: [0x05,0x00,0xde,0xd5,0x7c,0x00,0x00,0x00]
 
 0x05,0x00,0xde,0xd5,0xc1,0x00,0x00,0x00
-# GFX11: v_rndne_f16_e64 v5, -1                  ; encoding: [0x05,0x00,0xde,0xd5,0xc1,0x00,0x00,0x00]
+# GFX11-REAL16: v_rndne_f16_e64 v5.l, -1                ; encoding: [0x05,0x00,0xde,0xd5,0xc1,0x00,0x00,0x00]
+# GFX11-FAKE16: v_rndne_f16_e64 v5, -1                  ; encoding: [0x05,0x00,0xde,0xd5,0xc1,0x00,0x00,0x00]
 
 0x05,0x00,0xde,0xd5,0xf0,0x00,0x00,0x08
-# GFX11: v_rndne_f16_e64 v5, 0.5 mul:2           ; encoding: [0x05,0x00,0xde,0xd5,0xf0,0x00,0x00,0x08]
+# GFX11-REAL16: v_rndne_f16_e64 v5.l, 0.5 mul:2         ; encoding: [0x05,0x00,0xde,0xd5,0xf0,0x00,0x00,0x08]
+# GFX11-FAKE16: v_rndne_f16_e64 v5, 0.5 mul:2           ; encoding: [0x05,0x00,0xde,0xd5,0xf0,0x00,0x00,0x08]
 
 0x05,0x00,0xde,0xd5,0xfd,0x00,0x00,0x10
-# GFX11: v_rndne_f16_e64 v5, src_scc mul:4       ; encoding: [0x05,0x00,0xde,0xd5,0xfd,0x00,0x00,0x10]
+# GFX11-REAL16: v_rndne_f16_e64 v5.l, src_scc mul:4     ; encoding: [0x05,0x00,0xde,0xd5,0xfd,0x00,0x00,0x10]
+# GFX11-FAKE16: v_rndne_f16_e64 v5, src_scc mul:4       ; encoding: [0x05,0x00,0xde,0xd5,0xfd,0x00,0x00,0x10]
 
 0xff,0x81,0xde,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00
-# GFX11: v_rndne_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xde,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00]
+# GFX11-REAL16: v_rndne_f16_e64 v255.l, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xde,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00]
+# GFX11-FAKE16: v_rndne_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xde,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00]
+
+0x05,0x48,0xde,0xd5,0x01,0x01,0x00,0x00
+# GFX11-REAL16: v_rndne_f16_e64 v5.h, v1.h op_sel:[1,1] ; encoding: [0x05,0x48,0xde,0xd5,0x01,0x01,0x00,0x00]
+# GFX11-FAKE16: v_rndne_f16_e64 v5, v1                  ; encoding: [0x05,0x00,0xde,0xd5,0x01,0x01,0x00,0x00]
+
+0x05,0x08,0xde,0xd5,0xff,0x01,0x00,0x00
+# GFX11-REAL16: v_rndne_f16_e64 v5.l, v255.h op_sel:[1,0] ; encoding: [0x05,0x08,0xde,0xd5,0xff,0x01,0x00,0x00]
+# GFX11-FAKE16: v_rndne_f16_e64 v5, v255                ; encoding: [0x05,0x00,0xde,0xd5,0xff,0x01,0x00,0x00]
+
+0xff,0xc1,0xde,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00
+# GFX11-REAL16: v_rndne_f16_e64 v255.h, -|0xfe0b| op_sel:[0,1] clamp div:2 ; encoding: [0xff,0xc1,0xde,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00]
+# GFX11-FAKE16: v_rndne_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xde,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00]
 
 0x05,0x00,0xa3,0xd5,0x01,0x01,0x00,0x00
 # GFX11: v_rndne_f32_e64 v5, v1                  ; encoding: [0x05,0x00,0xa3,0xd5,0x01,0x01,0x00,0x00]
@@ -3204,94 +3351,140 @@
 # GFX11: v_rsq_f64_e64 v[254:255], 0xaf123456 clamp div:2 ; encoding: [0xfe,0x80,0xb1,0xd5,0xff,0x00,0x00,0x18,0x56,0x34,0x12,0xaf]
 
 0x05,0x00,0xe2,0xd5,0x01,0x01,0x00,0x00
-# GFX11: v_sat_pk_u8_i16_e64 v5, v1              ; encoding: [0x05,0x00,0xe2,0xd5,0x01,0x01,0x00,0x00]
+# GFX11-REAL16: v_sat_pk_u8_i16_e64 v5.l, v1            ; encoding: [0x05,0x00,0xe2,0xd5,0x01,0x01,0x00,0x00]
+# GFX11-FAKE16: v_sat_pk_u8_i16_e64 v5, v1              ; encoding: [0x05,0x00,0xe2,0xd5,0x01,0x01,0x00,0x00]
 
 0x05,0x00,0xe2,0xd5,0xff,0x01,0x00,0x00
-# GFX11: v_sat_pk_u8_i16_e64 v5, v255            ; encoding: [0x05,0x00,0xe2,0xd5,0xff,0x01,0x00,0x00]
+# GFX11-REAL16: v_sat_pk_u8_i16_e64 v5.l, v255          ; encoding: [0x05,0x00,0xe2,0xd5,0xff,0x01,0x00,0x00]
+# GFX11-FAKE16: v_sat_pk_u8_i16_e64 v5, v255            ; encoding: [0x05,0x00,0xe2,0xd5,0xff,0x01,0x00,0x00]
 
 0x05,0x00,0xe2,0xd5,0x01,0x00,0x00,0x00
-# GFX11: v_sat_pk_u8_i16_e64 v5, s1              ; encoding: [0x05,0x00,0xe2,0xd5,0x01,0x00,0x00,0x00]
+# GFX11-REAL16: v_sat_pk_u8_i16_e64 v5.l, s1            ; encoding: [0x05,0x00,0xe2,0xd5,0x01,0x00,0x00,0x00]
+# GFX11-FAKE16: v_sat_pk_u8_i16_e64 v5, s1              ; encoding: [0x05,0x00,0xe2,0xd5,0x01,0x00,0x00,0x00]
 
 0x05,0x00,0xe2,0xd5,0x69,0x00,0x00,0x00
-# GFX11: v_sat_pk_u8_i16_e64 v5, s105            ; encoding: [0x05,0x00,0xe2,0xd5,0x69,0x00,0x00,0x00]
+# GFX11-REAL16: v_sat_pk_u8_i16_e64 v5.l, s105          ; encoding: [0x05,0x00,0xe2,0xd5,0x69,0x00,0x00,0x00]
+# GFX11-FAKE16: v_sat_pk_u8_i16_e64 v5, s105            ; encoding: [0x05,0x00,0xe2,0xd5,0x69,0x00,0x00,0x00]
 
 0x05,0x00,0xe2,0xd5,0x6a,0x00,0x00,0x00
-# GFX11: v_sat_pk_u8_i16_e64 v5, vcc_lo          ; encoding: [0x05,0x00,0xe2,0xd5,0x6a,0x00,0x00,0x00]
+# GFX11-REAL16: v_sat_pk_u8_i16_e64 v5.l, vcc_lo        ; encoding: [0x05,0x00,0xe2,0xd5,0x6a,0x00,0x00,0x00]
+# GFX11-FAKE16: v_sat_pk_u8_i16_e64 v5, vcc_lo          ; encoding: [0x05,0x00,0xe2,0xd5,0x6a,0x00,0x00,0x00]
 
 0x05,0x00,0xe2,0xd5,0x6b,0x00,0x00,0x00
-# GFX11: v_sat_pk_u8_i16_e64 v5, vcc_hi          ; encoding: [0x05,0x00,0xe2,0xd5,0x6b,0x00,0x00,0x00]
+# GFX11-REAL16: v_sat_pk_u8_i16_e64 v5.l, vcc_hi        ; encoding: [0x05,0x00,0xe2,0xd5,0x6b,0x00,0x00,0x00]
+# GFX11-FAKE16: v_sat_pk_u8_i16_e64 v5, vcc_hi          ; encoding: [0x05,0x00,0xe2,0xd5,0x6b,0x00,0x00,0x00]
 
 0x05,0x00,0xe2,0xd5,0x7b,0x00,0x00,0x00
-# GFX11: v_sat_pk_u8_i16_e64 v5, ttmp15          ; encoding: [0x05,0x00,0xe2,0xd5,0x7b,0x00,0x00,0x00]
+# GFX11-REAL16: v_sat_pk_u8_i16_e64 v5.l, ttmp15        ; encoding: [0x05,0x00,0xe2,0xd5,0x7b,0x00,0x00,0x00]
+# GFX11-FAKE16: v_sat_pk_u8_i16_e64 v5, ttmp15          ; encoding: [0x05,0x00,0xe2,0xd5,0x7b,0x00,0x00,0x00]
 
 0x05,0x00,0xe2,0xd5,0x7d,0x00,0x00,0x00
-# GFX11: v_sat_pk_u8_i16_e64 v5, m0              ; encoding: [0x05,0x00,0xe2,0xd5,0x7d,0x00,0x00,0x00]
+# GFX11-REAL16: v_sat_pk_u8_i16_e64 v5.l, m0            ; encoding: [0x05,0x00,0xe2,0xd5,0x7d,0x00,0x00,0x00]
+# GFX11-FAKE16: v_sat_pk_u8_i16_e64 v5, m0              ; encoding: [0x05,0x00,0xe2,0xd5,0x7d,0x00,0x00,0x00]
 
 0x05,0x00,0xe2,0xd5,0x7e,0x00,0x00,0x00
-# GFX11: v_sat_pk_u8_i16_e64 v5, exec_lo         ; encoding: [0x05,0x00,0xe2,0xd5,0x7e,0x00,0x00,0x00]
+# GFX11-REAL16: v_sat_pk_u8_i16_e64 v5.l, exec_lo       ; encoding: [0x05,0x00,0xe2,0xd5,0x7e,0x00,0x00,0x00]
+# GFX11-FAKE16: v_sat_pk_u8_i16_e64 v5, exec_lo         ; encoding: [0x05,0x00,0xe2,0xd5,0x7e,0x00,0x00,0x00]
 
 0x05,0x00,0xe2,0xd5,0x7f,0x00,0x00,0x00
-# GFX11: v_sat_pk_u8_i16_e64 v5, exec_hi         ; encoding: [0x05,0x00,0xe2,0xd5,0x7f,0x00,0x00,0x00]
+# GFX11-REAL16: v_sat_pk_u8_i16_e64 v5.l, exec_hi       ; encoding: [0x05,0x00,0xe2,0xd5,0x7f,0x00,0x00,0x00]
+# GFX11-FAKE16: v_sat_pk_u8_i16_e64 v5, exec_hi         ; encoding: [0x05,0x00,0xe2,0xd5,0x7f,0x00,0x00,0x00]
 
 0x05,0x00,0xe2,0xd5,0x7c,0x00,0x00,0x00
-# GFX11: v_sat_pk_u8_i16_e64 v5, null            ; encoding: [0x05,0x00,0xe2,0xd5,0x7c,0x00,0x00,0x00]
+# GFX11-REAL16: v_sat_pk_u8_i16_e64 v5.l, null          ; encoding: [0x05,0x00,0xe2,0xd5,0x7c,0x00,0x00,0x00]
+# GFX11-FAKE16: v_sat_pk_u8_i16_e64 v5, null            ; encoding: [0x05,0x00,0xe2,0xd5,0x7c,0x00,0x00,0x00]
 
 0x05,0x00,0xe2,0xd5,0xc1,0x00,0x00,0x00
-# GFX11: v_sat_pk_u8_i16_e64 v5, -1              ; encoding: [0x05,0x00,0xe2,0xd5,0xc1,0x00,0x00,0x00]
+# GFX11-REAL16: v_sat_pk_u8_i16_e64 v5.l, -1            ; encoding: [0x05,0x00,0xe2,0xd5,0xc1,0x00,0x00,0x00]
+# GFX11-FAKE16: v_sat_pk_u8_i16_e64 v5, -1              ; encoding: [0x05,0x00,0xe2,0xd5,0xc1,0x00,0x00,0x00]
 
 0x05,0x00,0xe2,0xd5,0xf0,0x00,0x00,0x00
-# GFX11: v_sat_pk_u8_i16_e64 v5, 0.5             ; encoding: [0x05,0x00,0xe2,0xd5,0xf0,0x00,0x00,0x00]
+# GFX11-REAL16: v_sat_pk_u8_i16_e64 v5.l, 0.5           ; encoding: [0x05,0x00,0xe2,0xd5,0xf0,0x00,0x00,0x00]
+# GFX11-FAKE16: v_sat_pk_u8_i16_e64 v5, 0.5             ; encoding: [0x05,0x00,0xe2,0xd5,0xf0,0x00,0x00,0x00]
 
 0x05,0x00,0xe2,0xd5,0xfd,0x00,0x00,0x00
-# GFX11: v_sat_pk_u8_i16_e64 v5, src_scc         ; encoding: [0x05,0x00,0xe2,0xd5,0xfd,0x00,0x00,0x00]
+# GFX11-REAL16: v_sat_pk_u8_i16_e64 v5.l, src_scc       ; encoding: [0x05,0x00,0xe2,0xd5,0xfd,0x00,0x00,0x00]
+# GFX11-FAKE16: v_sat_pk_u8_i16_e64 v5, src_scc         ; encoding: [0x05,0x00,0xe2,0xd5,0xfd,0x00,0x00,0x00]
 
 0xff,0x00,0xe2,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00
-# GFX11: v_sat_pk_u8_i16_e64 v255, 0xfe0b        ; encoding: [0xff,0x00,0xe2,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00]
+# GFX11-REAL16: v_sat_pk_u8_i16_e64 v255.l, 0xfe0b      ; encoding: [0xff,0x00,0xe2,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00]
+# GFX11-FAKE16: v_sat_pk_u8_i16_e64 v255, 0xfe0b        ; encoding: [0xff,0x00,0xe2,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00]
+
+0xff,0x40,0xe2,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00
+# GFX11-REAL16: v_sat_pk_u8_i16_e64 v255.h, 0xfe0b op_sel:[0,1] ; encoding: [0xff,0x40,0xe2,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00]
+# GFX11-FAKE16: v_sat_pk_u8_i16_e64 v255, 0xfe0b        ; encoding: [0xff,0x00,0xe2,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00]
 
 0x05,0x00,0xe0,0xd5,0x01,0x01,0x00,0x00
-# GFX11: v_sin_f16_e64 v5, v1                    ; encoding: [0x05,0x00,0xe0,0xd5,0x01,0x01,0x00,0x00]
+# GFX11-REAL16: v_sin_f16_e64 v5.l, v1.l                ; encoding: [0x05,0x00,0xe0,0xd5,0x01,0x01,0x00,0x00]
+# GFX11-FAKE16: v_sin_f16_e64 v5, v1                    ; encoding: [0x05,0x00,0xe0,0xd5,0x01,0x01,0x00,0x00]
 
 0x05,0x00,0xe0,0xd5,0xff,0x01,0x00,0x00
-# GFX11: v_sin_f16_e64 v5, v255                  ; encoding: [0x05,0x00,0xe0,0xd5,0xff,0x01,0x00,0x00]
+# GFX11-REAL16: v_sin_f16_e64 v5.l, v255.l              ; encoding: [0x05,0x00,0xe0,0xd5,0xff,0x01,0x00,0x00]
+# GFX11-FAKE16: v_sin_f16_e64 v5, v255                  ; encoding: [0x05,0x00,0xe0,0xd5,0xff,0x01,0x00,0x00]
 
 0x05,0x00,0xe0,0xd5,0x01,0x00,0x00,0x00
-# GFX11: v_sin_f16_e64 v5, s1                    ; encoding: [0x05,0x00,0xe0,0xd5,0x01,0x00,0x00,0x00]
+# GFX11-REAL16: v_sin_f16_e64 v5.l, s1                  ; encoding: [0x05,0x00,0xe0,0xd5,0x01,0x00,0x00,0x00]
+# GFX11-FAKE16: v_sin_f16_e64 v5, s1                    ; encoding: [0x05,0x00,0xe0,0xd5,0x01,0x00,0x00,0x00]
 
 0x05,0x00,0xe0,0xd5,0x69,0x00,0x00,0x00
-# GFX11: v_sin_f16_e64 v5, s105                  ; encoding: [0x05,0x00,0xe0,0xd5,0x69,0x00,0x00,0x00]
+# GFX11-REAL16: v_sin_f16_e64 v5.l, s105                ; encoding: [0x05,0x00,0xe0,0xd5,0x69,0x00,0x00,0x00]
+# GFX11-FAKE16: v_sin_f16_e64 v5, s105                  ; encoding: [0x05,0x00,0xe0,0xd5,0x69,0x00,0x00,0x00]
 
 0x05,0x00,0xe0,0xd5,0x6a,0x00,0x00,0x00
-# GFX11: v_sin_f16_e64 v5, vcc_lo                ; encoding: [0x05,0x00,0xe0,0xd5,0x6a,0x00,0x00,0x00]
+# GFX11-REAL16: v_sin_f16_e64 v5.l, vcc_lo              ; encoding: [0x05,0x00,0xe0,0xd5,0x6a,0x00,0x00,0x00]
+# GFX11-FAKE16: v_sin_f16_e64 v5, vcc_lo                ; encoding: [0x05,0x00,0xe0,0xd5,0x6a,0x00,0x00,0x00]
 
 0x05,0x00,0xe0,0xd5,0x6b,0x00,0x00,0x00
-# GFX11: v_sin_f16_e64 v5, vcc_hi                ; encoding: [0x05,0x00,0xe0,0xd5,0x6b,0x00,0x00,0x00]
+# GFX11-REAL16: v_sin_f16_e64 v5.l, vcc_hi              ; encoding: [0x05,0x00,0xe0,0xd5,0x6b,0x00,0x00,0x00]
+# GFX11-FAKE16: v_sin_f16_e64 v5, vcc_hi                ; encoding: [0x05,0x00,0xe0,0xd5,0x6b,0x00,0x00,0x00]
 
 0x05,0x00,0xe0,0xd5,0x7b,0x00,0x00,0x00
-# GFX11: v_sin_f16_e64 v5, ttmp15                ; encoding: [0x05,0x00,0xe0,0xd5,0x7b,0x00,0x00,0x00]
+# GFX11-REAL16: v_sin_f16_e64 v5.l, ttmp15              ; encoding: [0x05,0x00,0xe0,0xd5,0x7b,0x00,0x00,0x00]
+# GFX11-FAKE16: v_sin_f16_e64 v5, ttmp15                ; encoding: [0x05,0x00,0xe0,0xd5,0x7b,0x00,0x00,0x00]
 
 0x05,0x00,0xe0,0xd5,0x7d,0x00,0x00,0x00
-# GFX11: v_sin_f16_e64 v5, m0                    ; encoding: [0x05,0x00,0xe0,0xd5,0x7d,0x00,0x00,0x00]
+# GFX11-REAL16: v_sin_f16_e64 v5.l, m0                  ; encoding: [0x05,0x00,0xe0,0xd5,0x7d,0x00,0x00,0x00]
+# GFX11-FAKE16: v_sin_f16_e64 v5, m0                    ; encoding: [0x05,0x00,0xe0,0xd5,0x7d,0x00,0x00,0x00]
 
 0x05,0x00,0xe0,0xd5,0x7e,0x00,0x00,0x00
-# GFX11: v_sin_f16_e64 v5, exec_lo               ; encoding: [0x05,0x00,0xe0,0xd5,0x7e,0x00,0x00,0x00]
+# GFX11-REAL16: v_sin_f16_e64 v5.l, exec_lo             ; encoding: [0x05,0x00,0xe0,0xd5,0x7e,0x00,0x00,0x00]
+# GFX11-FAKE16: v_sin_f16_e64 v5, exec_lo               ; encoding: [0x05,0x00,0xe0,0xd5,0x7e,0x00,0x00,0x00]
 
 0x05,0x00,0xe0,0xd5,0x7f,0x00,0x00,0x00
-# GFX11: v_sin_f16_e64 v5, exec_hi               ; encoding: [0x05,0x00,0xe0,0xd5,0x7f,0x00,0x00,0x00]
+# GFX11-REAL16: v_sin_f16_e64 v5.l, exec_hi             ; encoding: [0x05,0x00,0xe0,0xd5,0x7f,0x00,0x00,0x00]
+# GFX11-FAKE16: v_sin_f16_e64 v5, exec_hi               ; encoding: [0x05,0x00,0xe0,0xd5,0x7f,0x00,0x00,0x00]
 
 0x05,0x00,0xe0,0xd5,0x7c,0x00,0x00,0x00
-# GFX11: v_sin_f16_e64 v5, null                  ; encoding: [0x05,0x00,0xe0,0xd5,0x7c,0x00,0x00,0x00]
+# GFX11-REAL16: v_sin_f16_e64 v5.l, null                ; encoding: [0x05,0x00,0xe0,0xd5,0x7c,0x00,0x00,0x00]
+# GFX11-FAKE16: v_sin_f16_e64 v5, null                  ; encoding: [0x05,0x00,0xe0,0xd5,0x7c,0x00,0x00,0x00]
 
 0x05,0x00,0xe0,0xd5,0xc1,0x00,0x00,0x00
-# GFX11: v_sin_f16_e64 v5, -1                    ; encoding: [0x05,0x00,0xe0,0xd5,0xc1,0x00,0x00,0x00]
+# GFX11-REAL16: v_sin_f16_e64 v5.l, -1                  ; encoding: [0x05,0x00,0xe0,0xd5,0xc1,0x00,0x00,0x00]
+# GFX11-FAKE16: v_sin_f16_e64 v5, -1                    ; encoding: [0x05,0x00,0xe0,0xd5,0xc1,0x00,0x00,0x00]
 
 0x05,0x00,0xe0,0xd5,0xf0,0x00,0x00,0x08
-# GFX11: v_sin_f16_e64 v5, 0.5 mul:2             ; encoding: [0x05,0x00,0xe0,0xd5,0xf0,0x00,0x00,0x08]
+# GFX11-REAL16: v_sin_f16_e64 v5.l, 0.5 mul:2           ; encoding: [0x05,0x00,0xe0,0xd5,0xf0,0x00,0x00,0x08]
+# GFX11-FAKE16: v_sin_f16_e64 v5, 0.5 mul:2             ; encoding: [0x05,0x00,0xe0,0xd5,0xf0,0x00,0x00,0x08]
 
 0x05,0x00,0xe0,0xd5,0xfd,0x00,0x00,0x10
-# GFX11: v_sin_f16_e64 v5, src_scc mul:4         ; encoding: [0x05,0x00,0xe0,0xd5,0xfd,0x00,0x00,0x10]
+# GFX11-REAL16: v_sin_f16_e64 v5.l, src_scc mul:4       ; encoding: [0x05,0x00,0xe0,0xd5,0xfd,0x00,0x00,0x10]
+# GFX11-FAKE16: v_sin_f16_e64 v5, src_scc mul:4         ; encoding: [0x05,0x00,0xe0,0xd5,0xfd,0x00,0x00,0x10]
 
 0xff,0x81,0xe0,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00
-# GFX11: v_sin_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xe0,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00]
+# GFX11-REAL16: v_sin_f16_e64 v255.l, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xe0,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00]
+# GFX11-FAKE16: v_sin_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xe0,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00]
+
+0x05,0x48,0xe0,0xd5,0x01,0x01,0x00,0x00
+# GFX11-REAL16: v_sin_f16_e64 v5.h, v1.h op_sel:[1,1]   ; encoding: [0x05,0x48,0xe0,0xd5,0x01,0x01,0x00,0x00]
+# GFX11-FAKE16: v_sin_f16_e64 v5, v1                    ; encoding: [0x05,0x00,0xe0,0xd5,0x01,0x01,0x00,0x00]
+
+0x05,0x08,0xe0,0xd5,0xff,0x01,0x00,0x00
+# GFX11-REAL16: v_sin_f16_e64 v5.l, v255.h op_sel:[1,0] ; encoding: [0x05,0x08,0xe0,0xd5,0xff,0x01,0x00,0x00]
+# GFX11-FAKE16: v_sin_f16_e64 v5, v255                  ; encoding: [0x05,0x00,0xe0,0xd5,0xff,0x01,0x00,0x00]
+
+0xff,0xc1,0xe0,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00
+# GFX11-REAL16: v_sin_f16_e64 v255.h, -|0xfe0b| op_sel:[0,1] clamp div:2 ; encoding: [0xff,0xc1,0xe0,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00]
+# GFX11-FAKE16: v_sin_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xe0,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00]
 
 0x05,0x00,0xb5,0xd5,0x01,0x01,0x00,0x00
 # GFX11: v_sin_f32_e64 v5, v1                    ; encoding: [0x05,0x00,0xb5,0xd5,0x01,0x01,0x00,0x00]
@@ -3480,49 +3673,76 @@
 # GFX11: v_sqrt_f64_e64 v[254:255], 0xaf123456 clamp div:2 ; encoding: [0xfe,0x80,0xb4,0xd5,0xff,0x00,0x00,0x18,0x56,0x34,0x12,0xaf]
 
 0x05,0x00,0xdd,0xd5,0x01,0x01,0x00,0x00
-# GFX11: v_trunc_f16_e64 v5, v1                  ; encoding: [0x05,0x00,0xdd,0xd5,0x01,0x01,0x00,0x00]
+# GFX11-REAL16: v_trunc_f16_e64 v5.l, v1.l              ; encoding: [0x05,0x00,0xdd,0xd5,0x01,0x01,0x00,0x00]
+# GFX11-FAKE16: v_trunc_f16_e64 v5, v1                  ; encoding: [0x05,0x00,0xdd,0xd5,0x01,0x01,0x00,0x00]
 
 0x05,0x00,0xdd,0xd5,0xff,0x01,0x00,0x00
-# GFX11: v_trunc_f16_e64 v5, v255                ; encoding: [0x05,0x00,0xdd,0xd5,0xff,0x01,0x00,0x00]
+# GFX11-REAL16: v_trunc_f16_e64 v5.l, v255.l            ; encoding: [0x05,0x00,0xdd,0xd5,0xff,0x01,0x00,0x00]
+# GFX11-FAKE16: v_trunc_f16_e64 v5, v255                ; encoding: [0x05,0x00,0xdd,0xd5,0xff,0x01,0x00,0x00]
 
 0x05,0x00,0xdd,0xd5,0x01,0x00,0x00,0x00
-# GFX11: v_trunc_f16_e64 v5, s1                  ; encoding: [0x05,0x00,0xdd,0xd5,0x01,0x00,0x00,0x00]
+# GFX11-REAL16: v_trunc_f16_e64 v5.l, s1                ; encoding: [0x05,0x00,0xdd,0xd5,0x01,0x00,0x00,0x00]
+# GFX11-FAKE16: v_trunc_f16_e64 v5, s1                  ; encoding: [0x05,0x00,0xdd,0xd5,0x01,0x00,0x00,0x00]
 
 0x05,0x00,0xdd,0xd5,0x69,0x00,0x00,0x00
-# GFX11: v_trunc_f16_e64 v5, s105                ; encoding: [0x05,0x00,0xdd,0xd5,0x69,0x00,0x00,0x00]
+# GFX11-REAL16: v_trunc_f16_e64 v5.l, s105              ; encoding: [0x05,0x00,0xdd,0xd5,0x69,0x00,0x00,0x00]
+# GFX11-FAKE16: v_trunc_f16_e64 v5, s105                ; encoding: [0x05,0x00,0xdd,0xd5,0x69,0x00,0x00,0x00]
 
 0x05,0x00,0xdd,0xd5,0x6a,0x00,0x00,0x00
-# GFX11: v_trunc_f16_e64 v5, vcc_lo              ; encoding: [0x05,0x00,0xdd,0xd5,0x6a,0x00,0x00,0x00]
+# GFX11-REAL16: v_trunc_f16_e64 v5.l, vcc_lo            ; encoding: [0x05,0x00,0xdd,0xd5,0x6a,0x00,0x00,0x00]
+# GFX11-FAKE16: v_trunc_f16_e64 v5, vcc_lo              ; encoding: [0x05,0x00,0xdd,0xd5,0x6a,0x00,0x00,0x00]
 
 0x05,0x00,0xdd,0xd5,0x6b,0x00,0x00,0x00
-# GFX11: v_trunc_f16_e64 v5, vcc_hi              ; encoding: [0x05,0x00,0xdd,0xd5,0x6b,0x00,0x00,0x00]
+# GFX11-REAL16: v_trunc_f16_e64 v5.l, vcc_hi            ; encoding: [0x05,0x00,0xdd,0xd5,0x6b,0x00,0x00,0x00]
+# GFX11-FAKE16: v_trunc_f16_e64 v5, vcc_hi              ; encoding: [0x05,0x00,0xdd,0xd5,0x6b,0x00,0x00,0x00]
 
 0x05,0x00,0xdd,0xd5,0x7b,0x00,0x00,0x00
-# GFX11: v_trunc_f16_e64 v5, ttmp15              ; encoding: [0x05,0x00,0xdd,0xd5,0x7b,0x00,0x00,0x00]
+# GFX11-REAL16: v_trunc_f16_e64 v5.l, ttmp15            ; encoding: [0x05,0x00,0xdd,0xd5,0x7b,0x00,0x00,0x00]
+# GFX11-FAKE16: v_trunc_f16_e64 v5, ttmp15              ; encoding: [0x05,0x00,0xdd,0xd5,0x7b,0x00,0x00,0x00]
 
 0x05,0x00,0xdd,0xd5,0x7d,0x00,0x00,0x00
-# GFX11: v_trunc_f16_e64 v5, m0                  ; encoding: [0x05,0x00,0xdd,0xd5,0x7d,0x00,0x00,0x00]
+# GFX11-REAL16: v_trunc_f16_e64 v5.l, m0                ; encoding: [0x05,0x00,0xdd,0xd5,0x7d,0x00,0x00,0x00]
+# GFX11-FAKE16: v_trunc_f16_e64 v5, m0                  ; encoding: [0x05,0x00,0xdd,0xd5,0x7d,0x00,0x00,0x00]
 
 0x05,0x00,0xdd,0xd5,0x7e,0x00,0x00,0x00
-# GFX11: v_trunc_f16_e64 v5, exec_lo             ; encoding: [0x05,0x00,0xdd,0xd5,0x7e,0x00,0x00,0x00]
+# GFX11-REAL16: v_trunc_f16_e64 v5.l, exec_lo           ; encoding: [0x05,0x00,0xdd,0xd5,0x7e,0x00,0x00,0x00]
+# GFX11-FAKE16: v_trunc_f16_e64 v5, exec_lo             ; encoding: [0x05,0x00,0xdd,0xd5,0x7e,0x00,0x00,0x00]
 
 0x05,0x00,0xdd,0xd5,0x7f,0x00,0x00,0x00
-# GFX11: v_trunc_f16_e64 v5, exec_hi             ; encoding: [0x05,0x00,0xdd,0xd5,0x7f,0x00,0x00,0x00]
+# GFX11-REAL16: v_trunc_f16_e64 v5.l, exec_hi           ; encoding: [0x05,0x00,0xdd,0xd5,0x7f,0x00,0x00,0x00]
+# GFX11-FAKE16: v_trunc_f16_e64 v5, exec_hi             ; encoding: [0x05,0x00,0xdd,0xd5,0x7f,0x00,0x00,0x00]
 
 0x05,0x00,0xdd,0xd5,0x7c,0x00,0x00,0x00
-# GFX11: v_trunc_f16_e64 v5, null                ; encoding: [0x05,0x00,0xdd,0xd5,0x7c,0x00,0x00,0x00]
+# GFX11-REAL16: v_trunc_f16_e64 v5.l, null              ; encoding: [0x05,0x00,0xdd,0xd5,0x7c,0x00,0x00,0x00]
+# GFX11-FAKE16: v_trunc_f16_e64 v5, null                ; encoding: [0x05,0x00,0xdd,0xd5,0x7c,0x00,0x00,0x00]
 
 0x05,0x00,0xdd,0xd5,0xc1,0x00,0x00,0x00
-# GFX11: v_trunc_f16_e64 v5, -1                  ; encoding: [0x05,0x00,0xdd,0xd5,0xc1,0x00,0x00,0x00]
+# GFX11-REAL16: v_trunc_f16_e64 v5.l, -1                ; encoding: [0x05,0x00,0xdd,0xd5,0xc1,0x00,0x00,0x00]
+# GFX11-FAKE16: v_trunc_f16_e64 v5, -1                  ; encoding: [0x05,0x00,0xdd,0xd5,0xc1,0x00,0x00,0x00]
 
 0x05,0x00,0xdd,0xd5,0xf0,0x00,0x00,0x08
-# GFX11: v_trunc_f16_e64 v5, 0.5 mul:2           ; encoding: [0x05,0x00,0xdd,0xd5,0xf0,0x00,0x00,0x08]
+# GFX11-REAL16: v_trunc_f16_e64 v5.l, 0.5 mul:2         ; encoding: [0x05,0x00,0xdd,0xd5,0xf0,0x00,0x00,0x08]
+# GFX11-FAKE16: v_trunc_f16_e64 v5, 0.5 mul:2           ; encoding: [0x05,0x00,0xdd,0xd5,0xf0,0x00,0x00,0x08]
 
 0x05,0x00,0xdd,0xd5,0xfd,0x00,0x00,0x10
-# GFX11: v_trunc_f16_e64 v5, src_scc mul:4       ; encoding: [0x05,0x00,0xdd,0xd5,0xfd,0x00,0x00,0x10]
+# GFX11-REAL16: v_trunc_f16_e64 v5.l, src_scc mul:4     ; encoding: [0x05,0x00,0xdd,0xd5,0xfd,0x00,0x00,0x10]
+# GFX11-FAKE16: v_trunc_f16_e64 v5, src_scc mul:4       ; encoding: [0x05,0x00,0xdd,0xd5,0xfd,0x00,0x00,0x10]
 
 0xff,0x81,0xdd,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00
-# GFX11: v_trunc_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xdd,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00]
+# GFX11-REAL16: v_trunc_f16_e64 v255.l, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xdd,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00]
+# GFX11-FAKE16: v_trunc_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xdd,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00]
+
+0x05,0x48,0xdd,0xd5,0x01,0x01,0x00,0x00
+# GFX11-REAL16: v_trunc_f16_e64 v5.h, v1.h op_sel:[1,1] ; encoding: [0x05,0x48,0xdd,0xd5,0x01,0x01,0x00,0x00]
+# GFX11-FAKE16: v_trunc_f16_e64 v5, v1                  ; encoding: [0x05,0x00,0xdd,0xd5,0x01,0x01,0x00,0x00]
+
+0x05,0x08,0xdd,0xd5,0xff,0x01,0x00,0x00
+# GFX11-REAL16: v_trunc_f16_e64 v5.l, v255.h op_sel:[1,0] ; encoding: [0x05,0x08,0xdd,0xd5,0xff,0x01,0x00,0x00]
+# GFX11-FAKE16: v_trunc_f16_e64 v5, v255                ; encoding: [0x05,0x00,0xdd,0xd5,0xff,0x01,0x00,0x00]
+
+0xff,0xc1,0xdd,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00
+# GFX11-REAL16: v_trunc_f16_e64 v255.h, -|0xfe0b| op_sel:[0,1] clamp div:2 ; encoding: [0xff,0xc1,0xdd,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00]
+# GFX11-FAKE16: v_trunc_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xdd,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00]
 
 0x05,0x00,0xa1,0xd5,0x01,0x01,0x00,0x00
 # GFX11: v_trunc_f32_e64 v5, v1                  ; encoding: [0x05,0x00,0xa1,0xd5,0x01,0x01,0x00,0x00]
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_smem.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_smem.txt
index 28decdd..02641e6 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_smem.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_smem.txt
@@ -1277,3 +1277,21 @@
 
 # GFX12: s_buffer_load_u16 s5, s[96:99], s0 offset:0x0 th:TH_LOAD_HT scope:SCOPE_SYS ; encoding:  [0x70,0x61,0x63,0xf5,0x00,0x00,0x00,0x00]
 0x70,0x61,0x63,0xf5,0x00,0x00,0x00,0x00
+
+# GFX12: s_load_b32 null, s[2:3], s0 offset:0x0  ; encoding: [0x01,0x1f,0x00,0xf4,0x00,0x00,0x00,0x00]
+0x01,0x1f,0x00,0xf4,0x00,0x00,0x00,0x00
+
+# GFX12: s_load_b64 null, s[2:3], s0 offset:0x0  ; encoding: [0x01,0x3f,0x00,0xf4,0x00,0x00,0x00,0x00]
+0x01,0x3f,0x00,0xf4,0x00,0x00,0x00,0x00
+
+# GFX12: s_load_b96 null, s[2:3], s0 offset:0x0  ; encoding: [0x01,0xbf,0x00,0xf4,0x00,0x00,0x00,0x00]
+0x01,0xbf,0x00,0xf4,0x00,0x00,0x00,0x00
+
+# GFX12: s_load_b128 null, s[2:3], s0 offset:0x0  ; encoding: [0x01,0x5f,0x00,0xf4,0x00,0x00,0x00,0x00]
+0x01,0x5f,0x00,0xf4,0x00,0x00,0x00,0x00
+
+# GFX12: s_load_b256 null, s[2:3], s0 offset:0x0  ; encoding: [0x01,0x7f,0x00,0xf4,0x00,0x00,0x00,0x00]
+0x01,0x7f,0x00,0xf4,0x00,0x00,0x00,0x00
+
+# GFX12: s_load_b512 null, s[2:3], s0 offset:0x0  ; encoding: [0x01,0x9f,0x00,0xf4,0x00,0x00,0x00,0x00]
+0x01,0x9f,0x00,0xf4,0x00,0x00,0x00,0x00
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1_dpp16.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1_dpp16.txt
index a1291b2..181b78f 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1_dpp16.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1_dpp16.txt
@@ -237,46 +237,68 @@
 # GFX12: v_clz_i32_u32_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0x72,0xfe,0x7f,0xff,0x6f,0x0d,0x30]
 
 0xfa,0xc2,0x0a,0x7e,0x01,0x1b,0x00,0xff
-# GFX12: v_cos_f16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+# GFX12-REAL16: v_cos_f16_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+# GFX12-FAKE16: v_cos_f16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x1b,0x00,0xff]
 
 0xfa,0xc2,0x0a,0x7e,0x01,0xe4,0x00,0xff
-# GFX12: v_cos_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+# GFX12-REAL16: v_cos_f16_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+# GFX12-FAKE16: v_cos_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0xe4,0x00,0xff]
 
 0xfa,0xc2,0x0a,0x7e,0x01,0x40,0x01,0xff
-# GFX12: v_cos_f16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x40,0x01,0xff]
+# GFX12-REAL16: v_cos_f16_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x40,0x01,0xff]
+# GFX12-FAKE16: v_cos_f16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x40,0x01,0xff]
 
 0xfa,0xc2,0x0a,0x7e,0x01,0x41,0x01,0xff
-# GFX12: v_cos_f16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x41,0x01,0xff]
+# GFX12-REAL16: v_cos_f16_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x41,0x01,0xff]
+# GFX12-FAKE16: v_cos_f16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x41,0x01,0xff]
 
 0xfa,0xc2,0x0a,0x7e,0x01,0x01,0x01,0xff
-# GFX12: v_cos_f16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x01,0x01,0xff]
+# GFX12-REAL16: v_cos_f16_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x01,0x01,0xff]
+# GFX12-FAKE16: v_cos_f16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x01,0x01,0xff]
 
 0xfa,0xc2,0x0a,0x7e,0x01,0x0f,0x01,0xff
-# GFX12: v_cos_f16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+# GFX12-REAL16: v_cos_f16_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+# GFX12-FAKE16: v_cos_f16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x0f,0x01,0xff]
 
 0xfa,0xc2,0x0a,0x7e,0x01,0x11,0x01,0xff
-# GFX12: v_cos_f16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x11,0x01,0xff]
+# GFX12-REAL16: v_cos_f16_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x11,0x01,0xff]
+# GFX12-FAKE16: v_cos_f16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x11,0x01,0xff]
 
 0xfa,0xc2,0x0a,0x7e,0x01,0x1f,0x01,0xff
-# GFX12: v_cos_f16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+# GFX12-REAL16: v_cos_f16_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+# GFX12-FAKE16: v_cos_f16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x1f,0x01,0xff]
 
 0xfa,0xc2,0x0a,0x7e,0x01,0x21,0x01,0xff
-# GFX12: v_cos_f16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x21,0x01,0xff]
+# GFX12-REAL16: v_cos_f16_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x21,0x01,0xff]
+# GFX12-FAKE16: v_cos_f16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x21,0x01,0xff]
 
 0xfa,0xc2,0x0a,0x7e,0x01,0x2f,0x01,0xff
-# GFX12: v_cos_f16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+# GFX12-REAL16: v_cos_f16_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+# GFX12-FAKE16: v_cos_f16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x2f,0x01,0xff]
 
 0xfa,0xc2,0x0a,0x7e,0x01,0x50,0x01,0xff
-# GFX12: v_cos_f16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x50,0x01,0xff]
+# GFX12-REAL16: v_cos_f16_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x50,0x01,0xff]
+# GFX12-FAKE16: v_cos_f16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x50,0x01,0xff]
 
 0xfa,0xc2,0x0a,0x7e,0x01,0x5f,0x01,0x01
-# GFX12: v_cos_f16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+# GFX12-REAL16: v_cos_f16_dpp v5.l, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+# GFX12-FAKE16: v_cos_f16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x5f,0x01,0x01]
 
 0xfa,0xc2,0x0a,0x7e,0x01,0x60,0x01,0x13
-# GFX12: v_cos_f16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x60,0x01,0x13]
+# GFX12-REAL16: v_cos_f16_dpp v5.l, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x60,0x01,0x13]
+# GFX12-FAKE16: v_cos_f16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x60,0x01,0x13]
 
 0xfa,0xc2,0xfe,0x7e,0x7f,0x6f,0x3d,0x30
-# GFX12: v_cos_f16_dpp v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xc2,0xfe,0x7e,0x7f,0x6f,0x3d,0x30]
+# GFX12-REAL16: v_cos_f16_dpp v127.l, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xc2,0xfe,0x7e,0x7f,0x6f,0x3d,0x30]
+# GFX12-FAKE16: v_cos_f16_dpp v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xc2,0xfe,0x7e,0x7f,0x6f,0x3d,0x30]
+
+0xfa,0xc2,0x0a,0x7f,0x81,0x60,0x01,0x13
+# GFX12-REAL16: v_cos_f16_dpp v5.h, v1.h row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xc2,0x0a,0x7f,0x81,0x60,0x01,0x13]
+# GFX12-FAKE16: v_mul_i32_i24_e32 v128, 1, v176         ; encoding: [0x81,0x60,0x01,0x13]
+
+0xfa,0xc2,0xfe,0x7f,0xff,0x6f,0x3d,0x30
+# GFX12-REAL16: v_cos_f16_dpp v127.h, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xc2,0xfe,0x7f,0xff,0x6f,0x3d,0x30]
+# GFX12-FAKE16: v_lshlrev_b32_e32 v30, v255, v183       ; encoding: [0xff,0x6f,0x3d,0x30]
 
 0xfa,0x6c,0x0a,0x7e,0x01,0x1b,0x00,0xff
 # GFX12: v_cos_f32_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x6c,0x0a,0x7e,0x01,0x1b,0x00,0xff]
@@ -1035,46 +1057,68 @@
 # GFX12: v_cvt_i32_f32_dpp v255, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0x10,0xfe,0x7f,0xff,0x6f,0x3d,0x30]
 
 0xfa,0xd4,0x0a,0x7e,0x01,0x1b,0x00,0xff
-# GFX12: v_cvt_i32_i16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+# GFX12-REAL16: v_cvt_i32_i16_dpp v5, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+# GFX12-FAKE16: v_cvt_i32_i16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x1b,0x00,0xff]
 
 0xfa,0xd4,0x0a,0x7e,0x01,0xe4,0x00,0xff
-# GFX12: v_cvt_i32_i16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+# GFX12-REAL16: v_cvt_i32_i16_dpp v5, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+# GFX12-FAKE16: v_cvt_i32_i16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0xe4,0x00,0xff]
 
 0xfa,0xd4,0x0a,0x7e,0x01,0x40,0x01,0xff
-# GFX12: v_cvt_i32_i16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x40,0x01,0xff]
+# GFX12-REAL16: v_cvt_i32_i16_dpp v5, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x40,0x01,0xff]
+# GFX12-FAKE16: v_cvt_i32_i16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x40,0x01,0xff]
 
 0xfa,0xd4,0x0a,0x7e,0x01,0x41,0x01,0xff
-# GFX12: v_cvt_i32_i16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x41,0x01,0xff]
+# GFX12-REAL16: v_cvt_i32_i16_dpp v5, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x41,0x01,0xff]
+# GFX12-FAKE16: v_cvt_i32_i16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x41,0x01,0xff]
 
 0xfa,0xd4,0x0a,0x7e,0x01,0x01,0x01,0xff
-# GFX12: v_cvt_i32_i16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x01,0x01,0xff]
+# GFX12-REAL16: v_cvt_i32_i16_dpp v5, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x01,0x01,0xff]
+# GFX12-FAKE16: v_cvt_i32_i16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x01,0x01,0xff]
 
 0xfa,0xd4,0x0a,0x7e,0x01,0x0f,0x01,0xff
-# GFX12: v_cvt_i32_i16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+# GFX12-REAL16: v_cvt_i32_i16_dpp v5, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+# GFX12-FAKE16: v_cvt_i32_i16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x0f,0x01,0xff]
 
 0xfa,0xd4,0x0a,0x7e,0x01,0x11,0x01,0xff
-# GFX12: v_cvt_i32_i16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x11,0x01,0xff]
+# GFX12-REAL16: v_cvt_i32_i16_dpp v5, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x11,0x01,0xff]
+# GFX12-FAKE16: v_cvt_i32_i16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x11,0x01,0xff]
 
 0xfa,0xd4,0x0a,0x7e,0x01,0x1f,0x01,0xff
-# GFX12: v_cvt_i32_i16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+# GFX12-REAL16: v_cvt_i32_i16_dpp v5, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+# GFX12-FAKE16: v_cvt_i32_i16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x1f,0x01,0xff]
 
 0xfa,0xd4,0x0a,0x7e,0x01,0x21,0x01,0xff
-# GFX12: v_cvt_i32_i16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x21,0x01,0xff]
+# GFX12-REAL16: v_cvt_i32_i16_dpp v5, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x21,0x01,0xff]
+# GFX12-FAKE16: v_cvt_i32_i16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x21,0x01,0xff]
 
 0xfa,0xd4,0x0a,0x7e,0x01,0x2f,0x01,0xff
-# GFX12: v_cvt_i32_i16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+# GFX12-REAL16: v_cvt_i32_i16_dpp v5, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+# GFX12-FAKE16: v_cvt_i32_i16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x2f,0x01,0xff]
 
 0xfa,0xd4,0x0a,0x7e,0x01,0x50,0x01,0xff
-# GFX12: v_cvt_i32_i16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x50,0x01,0xff]
+# GFX12-REAL16: v_cvt_i32_i16_dpp v5, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x50,0x01,0xff]
+# GFX12-FAKE16: v_cvt_i32_i16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x50,0x01,0xff]
 
 0xfa,0xd4,0x0a,0x7e,0x01,0x5f,0x01,0x01
-# GFX12: v_cvt_i32_i16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+# GFX12-REAL16: v_cvt_i32_i16_dpp v5, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+# GFX12-FAKE16: v_cvt_i32_i16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x5f,0x01,0x01]
 
 0xfa,0xd4,0x0a,0x7e,0x01,0x60,0x01,0x13
-# GFX12: v_cvt_i32_i16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x60,0x01,0x13]
+# GFX12-REAL16: v_cvt_i32_i16_dpp v5, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x60,0x01,0x13]
+# GFX12-FAKE16: v_cvt_i32_i16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x60,0x01,0x13]
 
 0xfa,0xd4,0xfe,0x7f,0x7f,0x6f,0x0d,0x30
-# GFX12: v_cvt_i32_i16_dpp v255, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xd4,0xfe,0x7f,0x7f,0x6f,0x0d,0x30]
+# GFX12-REAL16: v_cvt_i32_i16_dpp v255, v127.l row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xd4,0xfe,0x7f,0x7f,0x6f,0x0d,0x30]
+# GFX12-FAKE16: v_cvt_i32_i16_dpp v255, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xd4,0xfe,0x7f,0x7f,0x6f,0x0d,0x30]
+
+0xfa,0xd4,0x0a,0x7e,0x81,0x60,0x01,0x13
+# GFX12-REAL16: v_cvt_i32_i16_dpp v5, v1.h row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xd4,0x0a,0x7e,0x81,0x60,0x01,0x13]
+# GFX12-FAKE16: v_cvt_i32_i16_dpp v5, v129/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xd4,0x0a,0x7e,0x81,0x60,0x01,0x13]
+
+0xfa,0xd4,0xfe,0x7f,0xff,0x6f,0x0d,0x30
+# GFX12-REAL16: v_cvt_i32_i16_dpp v255, v127.h row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xd4,0xfe,0x7f,0xff,0x6f,0x0d,0x30]
+# GFX12-FAKE16: v_cvt_i32_i16_dpp v255, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xd4,0xfe,0x7f,0xff,0x6f,0x0d,0x30]
 
 0xfa,0x18,0x0a,0x7e,0x01,0x1b,0x00,0xff
 # GFX12: v_cvt_nearest_i32_f32_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x18,0x0a,0x7e,0x01,0x1b,0x00,0xff]
@@ -1395,46 +1439,68 @@
 # GFX12: v_cvt_u32_f32_dpp v255, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0x0e,0xfe,0x7f,0xff,0x6f,0x3d,0x30]
 
 0xfa,0xd6,0x0a,0x7e,0x01,0x1b,0x00,0xff
-# GFX12: v_cvt_u32_u16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+# GFX12-REAL16: v_cvt_u32_u16_dpp v5, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+# GFX12-FAKE16: v_cvt_u32_u16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x1b,0x00,0xff]
 
 0xfa,0xd6,0x0a,0x7e,0x01,0xe4,0x00,0xff
-# GFX12: v_cvt_u32_u16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+# GFX12-REAL16: v_cvt_u32_u16_dpp v5, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+# GFX12-FAKE16: v_cvt_u32_u16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0xe4,0x00,0xff]
 
 0xfa,0xd6,0x0a,0x7e,0x01,0x40,0x01,0xff
-# GFX12: v_cvt_u32_u16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x40,0x01,0xff]
+# GFX12-REAL16: v_cvt_u32_u16_dpp v5, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x40,0x01,0xff]
+# GFX12-FAKE16: v_cvt_u32_u16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x40,0x01,0xff]
 
 0xfa,0xd6,0x0a,0x7e,0x01,0x41,0x01,0xff
-# GFX12: v_cvt_u32_u16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x41,0x01,0xff]
+# GFX12-REAL16: v_cvt_u32_u16_dpp v5, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x41,0x01,0xff]
+# GFX12-FAKE16: v_cvt_u32_u16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x41,0x01,0xff]
 
 0xfa,0xd6,0x0a,0x7e,0x01,0x01,0x01,0xff
-# GFX12: v_cvt_u32_u16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x01,0x01,0xff]
+# GFX12-REAL16: v_cvt_u32_u16_dpp v5, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x01,0x01,0xff]
+# GFX12-FAKE16: v_cvt_u32_u16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x01,0x01,0xff]
 
 0xfa,0xd6,0x0a,0x7e,0x01,0x0f,0x01,0xff
-# GFX12: v_cvt_u32_u16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+# GFX12-REAL16: v_cvt_u32_u16_dpp v5, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+# GFX12-FAKE16: v_cvt_u32_u16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x0f,0x01,0xff]
 
 0xfa,0xd6,0x0a,0x7e,0x01,0x11,0x01,0xff
-# GFX12: v_cvt_u32_u16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x11,0x01,0xff]
+# GFX12-REAL16: v_cvt_u32_u16_dpp v5, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x11,0x01,0xff]
+# GFX12-FAKE16: v_cvt_u32_u16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x11,0x01,0xff]
 
 0xfa,0xd6,0x0a,0x7e,0x01,0x1f,0x01,0xff
-# GFX12: v_cvt_u32_u16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+# GFX12-REAL16: v_cvt_u32_u16_dpp v5, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+# GFX12-FAKE16: v_cvt_u32_u16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x1f,0x01,0xff]
 
 0xfa,0xd6,0x0a,0x7e,0x01,0x21,0x01,0xff
-# GFX12: v_cvt_u32_u16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x21,0x01,0xff]
+# GFX12-REAL16: v_cvt_u32_u16_dpp v5, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x21,0x01,0xff]
+# GFX12-FAKE16: v_cvt_u32_u16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x21,0x01,0xff]
 
 0xfa,0xd6,0x0a,0x7e,0x01,0x2f,0x01,0xff
-# GFX12: v_cvt_u32_u16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+# GFX12-REAL16: v_cvt_u32_u16_dpp v5, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+# GFX12-FAKE16: v_cvt_u32_u16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x2f,0x01,0xff]
 
 0xfa,0xd6,0x0a,0x7e,0x01,0x50,0x01,0xff
-# GFX12: v_cvt_u32_u16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x50,0x01,0xff]
+# GFX12-REAL16: v_cvt_u32_u16_dpp v5, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x50,0x01,0xff]
+# GFX12-FAKE16: v_cvt_u32_u16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x50,0x01,0xff]
 
 0xfa,0xd6,0x0a,0x7e,0x01,0x5f,0x01,0x01
-# GFX12: v_cvt_u32_u16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+# GFX12-REAL16: v_cvt_u32_u16_dpp v5, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+# GFX12-FAKE16: v_cvt_u32_u16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x5f,0x01,0x01]
 
 0xfa,0xd6,0x0a,0x7e,0x01,0x60,0x01,0x13
-# GFX12: v_cvt_u32_u16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x60,0x01,0x13]
+# GFX12-REAL16: v_cvt_u32_u16_dpp v5, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x60,0x01,0x13]
+# GFX12-FAKE16: v_cvt_u32_u16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x60,0x01,0x13]
 
 0xfa,0xd6,0xfe,0x7f,0x7f,0x6f,0x0d,0x30
-# GFX12: v_cvt_u32_u16_dpp v255, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xd6,0xfe,0x7f,0x7f,0x6f,0x0d,0x30]
+# GFX12-REAL16: v_cvt_u32_u16_dpp v255, v127.l row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xd6,0xfe,0x7f,0x7f,0x6f,0x0d,0x30]
+# GFX12-FAKE16: v_cvt_u32_u16_dpp v255, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xd6,0xfe,0x7f,0x7f,0x6f,0x0d,0x30]
+
+0xfa,0xd6,0x0a,0x7e,0x81,0x60,0x01,0x13
+# GFX12-REAL16: v_cvt_u32_u16_dpp v5, v1.h row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xd6,0x0a,0x7e,0x81,0x60,0x01,0x13]
+# GFX12-FAKE16: v_cvt_u32_u16_dpp v5, v129/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xd6,0x0a,0x7e,0x81,0x60,0x01,0x13]
+
+0xfa,0xd6,0xfe,0x7f,0xff,0x6f,0x0d,0x30
+# GFX12-REAL16: v_cvt_u32_u16_dpp v255, v127.h row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xd6,0xfe,0x7f,0xff,0x6f,0x0d,0x30]
+# GFX12-FAKE16: v_cvt_u32_u16_dpp v255, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xd6,0xfe,0x7f,0xff,0x6f,0x0d,0x30]
 
 0xfa,0xb0,0x0a,0x7e,0x01,0x1b,0x00,0xff
 # GFX12-REAL16: v_exp_f16_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb0,0x0a,0x7e,0x01,0x1b,0x00,0xff]
@@ -1649,46 +1715,68 @@
 # GFX12: v_floor_f32_dpp v255, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0x48,0xfe,0x7f,0xff,0x6f,0x3d,0x30]
 
 0xfa,0xbe,0x0a,0x7e,0x01,0x1b,0x00,0xff
-# GFX12: v_fract_f16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+# GFX12-REAL16: v_fract_f16_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+# GFX12-FAKE16: v_fract_f16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x1b,0x00,0xff]
 
 0xfa,0xbe,0x0a,0x7e,0x01,0xe4,0x00,0xff
-# GFX12: v_fract_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+# GFX12-REAL16: v_fract_f16_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+# GFX12-FAKE16: v_fract_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0xe4,0x00,0xff]
 
 0xfa,0xbe,0x0a,0x7e,0x01,0x40,0x01,0xff
-# GFX12: v_fract_f16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x40,0x01,0xff]
+# GFX12-REAL16: v_fract_f16_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x40,0x01,0xff]
+# GFX12-FAKE16: v_fract_f16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x40,0x01,0xff]
 
 0xfa,0xbe,0x0a,0x7e,0x01,0x41,0x01,0xff
-# GFX12: v_fract_f16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x41,0x01,0xff]
+# GFX12-REAL16: v_fract_f16_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x41,0x01,0xff]
+# GFX12-FAKE16: v_fract_f16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x41,0x01,0xff]
 
 0xfa,0xbe,0x0a,0x7e,0x01,0x01,0x01,0xff
-# GFX12: v_fract_f16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x01,0x01,0xff]
+# GFX12-REAL16: v_fract_f16_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x01,0x01,0xff]
+# GFX12-FAKE16: v_fract_f16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x01,0x01,0xff]
 
 0xfa,0xbe,0x0a,0x7e,0x01,0x0f,0x01,0xff
-# GFX12: v_fract_f16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+# GFX12-REAL16: v_fract_f16_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+# GFX12-FAKE16: v_fract_f16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x0f,0x01,0xff]
 
 0xfa,0xbe,0x0a,0x7e,0x01,0x11,0x01,0xff
-# GFX12: v_fract_f16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x11,0x01,0xff]
+# GFX12-REAL16: v_fract_f16_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x11,0x01,0xff]
+# GFX12-FAKE16: v_fract_f16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x11,0x01,0xff]
 
 0xfa,0xbe,0x0a,0x7e,0x01,0x1f,0x01,0xff
-# GFX12: v_fract_f16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+# GFX12-REAL16: v_fract_f16_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+# GFX12-FAKE16: v_fract_f16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x1f,0x01,0xff]
 
 0xfa,0xbe,0x0a,0x7e,0x01,0x21,0x01,0xff
-# GFX12: v_fract_f16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x21,0x01,0xff]
+# GFX12-REAL16: v_fract_f16_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x21,0x01,0xff]
+# GFX12-FAKE16: v_fract_f16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x21,0x01,0xff]
 
 0xfa,0xbe,0x0a,0x7e,0x01,0x2f,0x01,0xff
-# GFX12: v_fract_f16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+# GFX12-REAL16: v_fract_f16_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+# GFX12-FAKE16: v_fract_f16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x2f,0x01,0xff]
 
 0xfa,0xbe,0x0a,0x7e,0x01,0x50,0x01,0xff
-# GFX12: v_fract_f16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x50,0x01,0xff]
+# GFX12-REAL16: v_fract_f16_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x50,0x01,0xff]
+# GFX12-FAKE16: v_fract_f16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x50,0x01,0xff]
 
 0xfa,0xbe,0x0a,0x7e,0x01,0x5f,0x01,0x01
-# GFX12: v_fract_f16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+# GFX12-REAL16: v_fract_f16_dpp v5.l, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+# GFX12-FAKE16: v_fract_f16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x5f,0x01,0x01]
 
 0xfa,0xbe,0x0a,0x7e,0x01,0x60,0x01,0x13
-# GFX12: v_fract_f16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x60,0x01,0x13]
+# GFX12-REAL16: v_fract_f16_dpp v5.l, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x60,0x01,0x13]
+# GFX12-FAKE16: v_fract_f16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x60,0x01,0x13]
 
 0xfa,0xbe,0xfe,0x7e,0x7f,0x6f,0x3d,0x30
-# GFX12: v_fract_f16_dpp v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xbe,0xfe,0x7e,0x7f,0x6f,0x3d,0x30]
+# GFX12-REAL16: v_fract_f16_dpp v127.l, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xbe,0xfe,0x7e,0x7f,0x6f,0x3d,0x30]
+# GFX12-FAKE16: v_fract_f16_dpp v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xbe,0xfe,0x7e,0x7f,0x6f,0x3d,0x30]
+
+0xfa,0xbe,0x0a,0x7f,0x81,0x60,0x01,0x13
+# GFX12-REAL16: v_fract_f16_dpp v5.h, v1.h row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xbe,0x0a,0x7f,0x81,0x60,0x01,0x13]
+# GFX12-FAKE16: v_mul_i32_i24_e32 v128, 1, v176         ; encoding: [0x81,0x60,0x01,0x13]
+
+0xfa,0xbe,0xfe,0x7f,0xff,0x6f,0x3d,0x30
+# GFX12-REAL16: v_fract_f16_dpp v127.h, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xbe,0xfe,0x7f,0xff,0x6f,0x3d,0x30]
+# GFX12-FAKE16: v_lshlrev_b32_e32 v30, v255, v183       ; encoding: [0xff,0x6f,0x3d,0x30]
 
 0xfa,0x40,0x0a,0x7e,0x01,0x1b,0x00,0xff
 # GFX12: v_fract_f32_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x40,0x0a,0x7e,0x01,0x1b,0x00,0xff]
@@ -1839,46 +1927,68 @@
 # GFX12: v_frexp_exp_i32_f32_dpp v255, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0x7e,0xfe,0x7f,0xff,0x6f,0x3d,0x30]
 
 0xfa,0xb2,0x0a,0x7e,0x01,0x1b,0x00,0xff
-# GFX12: v_frexp_mant_f16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+# GFX12-REAL16: v_frexp_mant_f16_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+# GFX12-FAKE16: v_frexp_mant_f16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x1b,0x00,0xff]
 
 0xfa,0xb2,0x0a,0x7e,0x01,0xe4,0x00,0xff
-# GFX12: v_frexp_mant_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+# GFX12-REAL16: v_frexp_mant_f16_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+# GFX12-FAKE16: v_frexp_mant_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0xe4,0x00,0xff]
 
 0xfa,0xb2,0x0a,0x7e,0x01,0x40,0x01,0xff
-# GFX12: v_frexp_mant_f16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x40,0x01,0xff]
+# GFX12-REAL16: v_frexp_mant_f16_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x40,0x01,0xff]
+# GFX12-FAKE16: v_frexp_mant_f16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x40,0x01,0xff]
 
 0xfa,0xb2,0x0a,0x7e,0x01,0x41,0x01,0xff
-# GFX12: v_frexp_mant_f16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x41,0x01,0xff]
+# GFX12-REAL16: v_frexp_mant_f16_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x41,0x01,0xff]
+# GFX12-FAKE16: v_frexp_mant_f16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x41,0x01,0xff]
 
 0xfa,0xb2,0x0a,0x7e,0x01,0x01,0x01,0xff
-# GFX12: v_frexp_mant_f16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x01,0x01,0xff]
+# GFX12-REAL16: v_frexp_mant_f16_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x01,0x01,0xff]
+# GFX12-FAKE16: v_frexp_mant_f16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x01,0x01,0xff]
 
 0xfa,0xb2,0x0a,0x7e,0x01,0x0f,0x01,0xff
-# GFX12: v_frexp_mant_f16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+# GFX12-REAL16: v_frexp_mant_f16_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+# GFX12-FAKE16: v_frexp_mant_f16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x0f,0x01,0xff]
 
 0xfa,0xb2,0x0a,0x7e,0x01,0x11,0x01,0xff
-# GFX12: v_frexp_mant_f16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x11,0x01,0xff]
+# GFX12-REAL16: v_frexp_mant_f16_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x11,0x01,0xff]
+# GFX12-FAKE16: v_frexp_mant_f16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x11,0x01,0xff]
 
 0xfa,0xb2,0x0a,0x7e,0x01,0x1f,0x01,0xff
-# GFX12: v_frexp_mant_f16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+# GFX12-REAL16: v_frexp_mant_f16_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+# GFX12-FAKE16: v_frexp_mant_f16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x1f,0x01,0xff]
 
 0xfa,0xb2,0x0a,0x7e,0x01,0x21,0x01,0xff
-# GFX12: v_frexp_mant_f16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x21,0x01,0xff]
+# GFX12-REAL16: v_frexp_mant_f16_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x21,0x01,0xff]
+# GFX12-FAKE16: v_frexp_mant_f16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x21,0x01,0xff]
 
 0xfa,0xb2,0x0a,0x7e,0x01,0x2f,0x01,0xff
-# GFX12: v_frexp_mant_f16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+# GFX12-REAL16: v_frexp_mant_f16_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+# GFX12-FAKE16: v_frexp_mant_f16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x2f,0x01,0xff]
 
 0xfa,0xb2,0x0a,0x7e,0x01,0x50,0x01,0xff
-# GFX12: v_frexp_mant_f16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x50,0x01,0xff]
+# GFX12-REAL16: v_frexp_mant_f16_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x50,0x01,0xff]
+# GFX12-FAKE16: v_frexp_mant_f16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x50,0x01,0xff]
 
 0xfa,0xb2,0x0a,0x7e,0x01,0x5f,0x01,0x01
-# GFX12: v_frexp_mant_f16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+# GFX12-REAL16: v_frexp_mant_f16_dpp v5.l, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+# GFX12-FAKE16: v_frexp_mant_f16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x5f,0x01,0x01]
 
 0xfa,0xb2,0x0a,0x7e,0x01,0x60,0x01,0x13
-# GFX12: v_frexp_mant_f16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x60,0x01,0x13]
+# GFX12-REAL16: v_frexp_mant_f16_dpp v5.l, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x60,0x01,0x13]
+# GFX12-FAKE16: v_frexp_mant_f16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x60,0x01,0x13]
 
 0xfa,0xb2,0xfe,0x7e,0x7f,0x6f,0x3d,0x30
-# GFX12: v_frexp_mant_f16_dpp v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xb2,0xfe,0x7e,0x7f,0x6f,0x3d,0x30]
+# GFX12-REAL16: v_frexp_mant_f16_dpp v127.l, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xb2,0xfe,0x7e,0x7f,0x6f,0x3d,0x30]
+# GFX12-FAKE16: v_frexp_mant_f16_dpp v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xb2,0xfe,0x7e,0x7f,0x6f,0x3d,0x30]
+
+0xfa,0xb2,0x0a,0x7f,0x81,0x60,0x01,0x13
+# GFX12-REAL16: v_frexp_mant_f16_dpp v5.h, v1.h row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xb2,0x0a,0x7f,0x81,0x60,0x01,0x13]
+# GFX12-FAKE16: v_mul_i32_i24_e32 v128, 1, v176         ; encoding: [0x81,0x60,0x01,0x13]
+
+0xfa,0xb2,0xfe,0x7f,0xff,0x6f,0x3d,0x30
+# GFX12-REAL16: v_frexp_mant_f16_dpp v127.h, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xb2,0xfe,0x7f,0xff,0x6f,0x3d,0x30]
+# GFX12-FAKE16: v_lshlrev_b32_e32 v30, v255, v183       ; encoding: [0xff,0x6f,0x3d,0x30]
 
 0xfa,0x80,0x0a,0x7e,0x01,0x1b,0x00,0xff
 # GFX12: v_frexp_mant_f32_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x80,0x0a,0x7e,0x01,0x1b,0x00,0xff]
@@ -2239,46 +2349,68 @@
 # GFX12: v_movrelsd_b32_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0x88,0xfe,0x7f,0xff,0x6f,0x0d,0x30]
 
 0xfa,0xd2,0x0a,0x7e,0x01,0x1b,0x00,0xff
-# GFX12: v_not_b16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+# GFX12-REAL16: v_not_b16_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+# GFX12-FAKE16: v_not_b16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x1b,0x00,0xff]
 
 0xfa,0xd2,0x0a,0x7e,0x01,0xe4,0x00,0xff
-# GFX12: v_not_b16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+# GFX12-REAL16: v_not_b16_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+# GFX12-FAKE16: v_not_b16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0xe4,0x00,0xff]
 
 0xfa,0xd2,0x0a,0x7e,0x01,0x40,0x01,0xff
-# GFX12: v_not_b16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x40,0x01,0xff]
+# GFX12-REAL16: v_not_b16_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x40,0x01,0xff]
+# GFX12-FAKE16: v_not_b16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x40,0x01,0xff]
 
 0xfa,0xd2,0x0a,0x7e,0x01,0x41,0x01,0xff
-# GFX12: v_not_b16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x41,0x01,0xff]
+# GFX12-REAL16: v_not_b16_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x41,0x01,0xff]
+# GFX12-FAKE16: v_not_b16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x41,0x01,0xff]
 
 0xfa,0xd2,0x0a,0x7e,0x01,0x01,0x01,0xff
-# GFX12: v_not_b16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x01,0x01,0xff]
+# GFX12-REAL16: v_not_b16_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x01,0x01,0xff]
+# GFX12-FAKE16: v_not_b16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x01,0x01,0xff]
 
 0xfa,0xd2,0x0a,0x7e,0x01,0x0f,0x01,0xff
-# GFX12: v_not_b16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+# GFX12-REAL16: v_not_b16_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+# GFX12-FAKE16: v_not_b16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x0f,0x01,0xff]
 
 0xfa,0xd2,0x0a,0x7e,0x01,0x11,0x01,0xff
-# GFX12: v_not_b16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x11,0x01,0xff]
+# GFX12-REAL16: v_not_b16_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x11,0x01,0xff]
+# GFX12-FAKE16: v_not_b16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x11,0x01,0xff]
 
 0xfa,0xd2,0x0a,0x7e,0x01,0x1f,0x01,0xff
-# GFX12: v_not_b16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+# GFX12-REAL16: v_not_b16_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+# GFX12-FAKE16: v_not_b16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x1f,0x01,0xff]
 
 0xfa,0xd2,0x0a,0x7e,0x01,0x21,0x01,0xff
-# GFX12: v_not_b16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x21,0x01,0xff]
+# GFX12-REAL16: v_not_b16_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x21,0x01,0xff]
+# GFX12-FAKE16: v_not_b16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x21,0x01,0xff]
 
 0xfa,0xd2,0x0a,0x7e,0x01,0x2f,0x01,0xff
-# GFX12: v_not_b16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+# GFX12-REAL16: v_not_b16_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+# GFX12-FAKE16: v_not_b16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x2f,0x01,0xff]
 
 0xfa,0xd2,0x0a,0x7e,0x01,0x50,0x01,0xff
-# GFX12: v_not_b16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x50,0x01,0xff]
+# GFX12-REAL16: v_not_b16_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x50,0x01,0xff]
+# GFX12-FAKE16: v_not_b16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x50,0x01,0xff]
 
 0xfa,0xd2,0x0a,0x7e,0x01,0x5f,0x01,0x01
-# GFX12: v_not_b16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+# GFX12-REAL16: v_not_b16_dpp v5.l, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+# GFX12-FAKE16: v_not_b16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x5f,0x01,0x01]
 
 0xfa,0xd2,0x0a,0x7e,0x01,0x60,0x01,0x13
-# GFX12: v_not_b16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x60,0x01,0x13]
+# GFX12-REAL16: v_not_b16_dpp v5.l, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x60,0x01,0x13]
+# GFX12-FAKE16: v_not_b16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x60,0x01,0x13]
 
 0xfa,0xd2,0xfe,0x7e,0x7f,0x6f,0x0d,0x30
-# GFX12: v_not_b16_dpp v127, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xd2,0xfe,0x7e,0x7f,0x6f,0x0d,0x30]
+# GFX12-REAL16: v_not_b16_dpp v127.l, v127.l row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xd2,0xfe,0x7e,0x7f,0x6f,0x0d,0x30]
+# GFX12-FAKE16: v_not_b16_dpp v127, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xd2,0xfe,0x7e,0x7f,0x6f,0x0d,0x30]
+
+0xfa,0xd2,0x0a,0x7f,0x81,0x60,0x01,0x13
+# GFX12-REAL16: v_not_b16_dpp v5.h, v1.h row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xd2,0x0a,0x7f,0x81,0x60,0x01,0x13]
+# GFX12-FAKE16: v_mul_i32_i24_e32 v128, 1, v176         ; encoding: [0x81,0x60,0x01,0x13]
+
+0xfa,0xd2,0xfe,0x7f,0xff,0x6f,0x0d,0x30
+# GFX12-REAL16: v_not_b16_dpp v127.h, v127.h row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xd2,0xfe,0x7f,0xff,0x6f,0x0d,0x30]
+# GFX12-FAKE16: v_lshlrev_b32_e32 v6, v255, v183        ; encoding: [0xff,0x6f,0x0d,0x30]
 
 0xfa,0x6e,0x0a,0x7e,0x01,0x1b,0x00,0xff
 # GFX12: v_not_b32_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x6e,0x0a,0x7e,0x01,0x1b,0x00,0xff]
@@ -2471,46 +2603,68 @@
 # GFX12: v_rcp_iflag_f32_dpp v255, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0x56,0xfe,0x7f,0xff,0x6f,0x3d,0x30]
 
 0xfa,0xbc,0x0a,0x7e,0x01,0x1b,0x00,0xff
-# GFX12: v_rndne_f16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+# GFX12-REAL16: v_rndne_f16_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+# GFX12-FAKE16: v_rndne_f16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x1b,0x00,0xff]
 
 0xfa,0xbc,0x0a,0x7e,0x01,0xe4,0x00,0xff
-# GFX12: v_rndne_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+# GFX12-REAL16: v_rndne_f16_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+# GFX12-FAKE16: v_rndne_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0xe4,0x00,0xff]
 
 0xfa,0xbc,0x0a,0x7e,0x01,0x40,0x01,0xff
-# GFX12: v_rndne_f16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x40,0x01,0xff]
+# GFX12-REAL16: v_rndne_f16_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x40,0x01,0xff]
+# GFX12-FAKE16: v_rndne_f16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x40,0x01,0xff]
 
 0xfa,0xbc,0x0a,0x7e,0x01,0x41,0x01,0xff
-# GFX12: v_rndne_f16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x41,0x01,0xff]
+# GFX12-REAL16: v_rndne_f16_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x41,0x01,0xff]
+# GFX12-FAKE16: v_rndne_f16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x41,0x01,0xff]
 
 0xfa,0xbc,0x0a,0x7e,0x01,0x01,0x01,0xff
-# GFX12: v_rndne_f16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x01,0x01,0xff]
+# GFX12-REAL16: v_rndne_f16_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x01,0x01,0xff]
+# GFX12-FAKE16: v_rndne_f16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x01,0x01,0xff]
 
 0xfa,0xbc,0x0a,0x7e,0x01,0x0f,0x01,0xff
-# GFX12: v_rndne_f16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+# GFX12-REAL16: v_rndne_f16_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+# GFX12-FAKE16: v_rndne_f16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x0f,0x01,0xff]
 
 0xfa,0xbc,0x0a,0x7e,0x01,0x11,0x01,0xff
-# GFX12: v_rndne_f16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x11,0x01,0xff]
+# GFX12-REAL16: v_rndne_f16_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x11,0x01,0xff]
+# GFX12-FAKE16: v_rndne_f16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x11,0x01,0xff]
 
 0xfa,0xbc,0x0a,0x7e,0x01,0x1f,0x01,0xff
-# GFX12: v_rndne_f16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+# GFX12-REAL16: v_rndne_f16_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+# GFX12-FAKE16: v_rndne_f16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x1f,0x01,0xff]
 
 0xfa,0xbc,0x0a,0x7e,0x01,0x21,0x01,0xff
-# GFX12: v_rndne_f16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x21,0x01,0xff]
+# GFX12-REAL16: v_rndne_f16_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x21,0x01,0xff]
+# GFX12-FAKE16: v_rndne_f16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x21,0x01,0xff]
 
 0xfa,0xbc,0x0a,0x7e,0x01,0x2f,0x01,0xff
-# GFX12: v_rndne_f16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+# GFX12-REAL16: v_rndne_f16_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+# GFX12-FAKE16: v_rndne_f16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x2f,0x01,0xff]
 
 0xfa,0xbc,0x0a,0x7e,0x01,0x50,0x01,0xff
-# GFX12: v_rndne_f16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x50,0x01,0xff]
+# GFX12-REAL16: v_rndne_f16_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x50,0x01,0xff]
+# GFX12-FAKE16: v_rndne_f16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x50,0x01,0xff]
 
 0xfa,0xbc,0x0a,0x7e,0x01,0x5f,0x01,0x01
-# GFX12: v_rndne_f16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+# GFX12-REAL16: v_rndne_f16_dpp v5.l, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+# GFX12-FAKE16: v_rndne_f16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x5f,0x01,0x01]
 
 0xfa,0xbc,0x0a,0x7e,0x01,0x60,0x01,0x13
-# GFX12: v_rndne_f16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x60,0x01,0x13]
+# GFX12-REAL16: v_rndne_f16_dpp v5.l, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x60,0x01,0x13]
+# GFX12-FAKE16: v_rndne_f16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x60,0x01,0x13]
 
 0xfa,0xbc,0xfe,0x7e,0x7f,0x6f,0x3d,0x30
-# GFX12: v_rndne_f16_dpp v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xbc,0xfe,0x7e,0x7f,0x6f,0x3d,0x30]
+# GFX12-REAL16: v_rndne_f16_dpp v127.l, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xbc,0xfe,0x7e,0x7f,0x6f,0x3d,0x30]
+# GFX12-FAKE16: v_rndne_f16_dpp v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xbc,0xfe,0x7e,0x7f,0x6f,0x3d,0x30]
+
+0xfa,0xbc,0x0a,0x7f,0x81,0x60,0x01,0x13
+# GFX12-REAL16: v_rndne_f16_dpp v5.h, v1.h row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xbc,0x0a,0x7f,0x81,0x60,0x01,0x13]
+# GFX12-FAKE16: v_mul_i32_i24_e32 v128, 1, v176         ; encoding: [0x81,0x60,0x01,0x13]
+
+0xfa,0xbc,0xfe,0x7f,0xff,0x6f,0x3d,0x30
+# GFX12-REAL16: v_rndne_f16_dpp v127.h, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xbc,0xfe,0x7f,0xff,0x6f,0x3d,0x30]
+# GFX12-FAKE16: v_lshlrev_b32_e32 v30, v255, v183       ; encoding: [0xff,0x6f,0x3d,0x30]
 
 0xfa,0x46,0x0a,0x7e,0x01,0x1b,0x00,0xff
 # GFX12: v_rndne_f32_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x46,0x0a,0x7e,0x01,0x1b,0x00,0xff]
@@ -2661,88 +2815,132 @@
 # GFX12: v_rsq_f32_dpp v255, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0x5c,0xfe,0x7f,0xff,0x6f,0x3d,0x30]
 
 0xfa,0xc4,0x0a,0x7e,0x01,0x1b,0x00,0xff
-# GFX12: v_sat_pk_u8_i16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+# GFX12-REAL16: v_sat_pk_u8_i16_dpp v5.l, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+# GFX12-FAKE16: v_sat_pk_u8_i16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x1b,0x00,0xff]
 
 0xfa,0xc4,0x0a,0x7e,0x01,0xe4,0x00,0xff
-# GFX12: v_sat_pk_u8_i16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+# GFX12-REAL16: v_sat_pk_u8_i16_dpp v5.l, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+# GFX12-FAKE16: v_sat_pk_u8_i16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0xe4,0x00,0xff]
 
 0xfa,0xc4,0x0a,0x7e,0x01,0x40,0x01,0xff
-# GFX12: v_sat_pk_u8_i16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x40,0x01,0xff]
+# GFX12-REAL16: v_sat_pk_u8_i16_dpp v5.l, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x40,0x01,0xff]
+# GFX12-FAKE16: v_sat_pk_u8_i16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x40,0x01,0xff]
 
 0xfa,0xc4,0x0a,0x7e,0x01,0x41,0x01,0xff
-# GFX12: v_sat_pk_u8_i16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x41,0x01,0xff]
+# GFX12-REAL16: v_sat_pk_u8_i16_dpp v5.l, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x41,0x01,0xff]
+# GFX12-FAKE16: v_sat_pk_u8_i16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x41,0x01,0xff]
 
 0xfa,0xc4,0x0a,0x7e,0x01,0x01,0x01,0xff
-# GFX12: v_sat_pk_u8_i16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x01,0x01,0xff]
+# GFX12-REAL16: v_sat_pk_u8_i16_dpp v5.l, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x01,0x01,0xff]
+# GFX12-FAKE16: v_sat_pk_u8_i16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x01,0x01,0xff]
 
 0xfa,0xc4,0x0a,0x7e,0x01,0x0f,0x01,0xff
-# GFX12: v_sat_pk_u8_i16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+# GFX12-REAL16: v_sat_pk_u8_i16_dpp v5.l, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+# GFX12-FAKE16: v_sat_pk_u8_i16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x0f,0x01,0xff]
 
 0xfa,0xc4,0x0a,0x7e,0x01,0x11,0x01,0xff
-# GFX12: v_sat_pk_u8_i16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x11,0x01,0xff]
+# GFX12-REAL16: v_sat_pk_u8_i16_dpp v5.l, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x11,0x01,0xff]
+# GFX12-FAKE16: v_sat_pk_u8_i16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x11,0x01,0xff]
 
 0xfa,0xc4,0x0a,0x7e,0x01,0x1f,0x01,0xff
-# GFX12: v_sat_pk_u8_i16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+# GFX12-REAL16: v_sat_pk_u8_i16_dpp v5.l, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+# GFX12-FAKE16: v_sat_pk_u8_i16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x1f,0x01,0xff]
 
 0xfa,0xc4,0x0a,0x7e,0x01,0x21,0x01,0xff
-# GFX12: v_sat_pk_u8_i16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x21,0x01,0xff]
+# GFX12-REAL16: v_sat_pk_u8_i16_dpp v5.l, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x21,0x01,0xff]
+# GFX12-FAKE16: v_sat_pk_u8_i16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x21,0x01,0xff]
 
 0xfa,0xc4,0x0a,0x7e,0x01,0x2f,0x01,0xff
-# GFX12: v_sat_pk_u8_i16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+# GFX12-REAL16: v_sat_pk_u8_i16_dpp v5.l, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+# GFX12-FAKE16: v_sat_pk_u8_i16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x2f,0x01,0xff]
 
 0xfa,0xc4,0x0a,0x7e,0x01,0x50,0x01,0xff
-# GFX12: v_sat_pk_u8_i16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x50,0x01,0xff]
+# GFX12-REAL16: v_sat_pk_u8_i16_dpp v5.l, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x50,0x01,0xff]
+# GFX12-FAKE16: v_sat_pk_u8_i16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x50,0x01,0xff]
 
 0xfa,0xc4,0x0a,0x7e,0x01,0x5f,0x01,0x01
-# GFX12: v_sat_pk_u8_i16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+# GFX12-REAL16: v_sat_pk_u8_i16_dpp v5.l, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+# GFX12-FAKE16: v_sat_pk_u8_i16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x5f,0x01,0x01]
 
 0xfa,0xc4,0x0a,0x7e,0x01,0x60,0x01,0x13
-# GFX12: v_sat_pk_u8_i16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x60,0x01,0x13]
+# GFX12-REAL16: v_sat_pk_u8_i16_dpp v5.l, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x60,0x01,0x13]
+# GFX12-FAKE16: v_sat_pk_u8_i16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x60,0x01,0x13]
 
 0xfa,0xc4,0xfe,0x7e,0xff,0x6f,0x0d,0x30
-# GFX12: v_sat_pk_u8_i16_dpp v127, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xc4,0xfe,0x7e,0xff,0x6f,0x0d,0x30]
+# GFX12-REAL16: v_sat_pk_u8_i16_dpp v127.l, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xc4,0xfe,0x7e,0xff,0x6f,0x0d,0x30]
+# GFX12-FAKE16: v_sat_pk_u8_i16_dpp v127, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xc4,0xfe,0x7e,0xff,0x6f,0x0d,0x30]
+
+0xfa,0xc4,0x0a,0x7f,0x01,0x60,0x01,0x13
+# GFX12-REAL16: v_sat_pk_u8_i16_dpp v5.h, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xc4,0x0a,0x7f,0x01,0x60,0x01,0x13]
+# GFX12-FAKE16: v_mul_i32_i24_e32 v128, s1, v176        ; encoding: [0x01,0x60,0x01,0x13]
+
+0xfa,0xc4,0xfe,0x7f,0xff,0x6f,0x0d,0x30
+# GFX12-REAL16: v_sat_pk_u8_i16_dpp v127.h, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xc4,0xfe,0x7f,0xff,0x6f,0x0d,0x30]
+# GFX12-FAKE16: v_lshlrev_b32_e32 v6, v255, v183        ; encoding: [0xff,0x6f,0x0d,0x30]
 
 0xfa,0xc0,0x0a,0x7e,0x01,0x1b,0x00,0xff
-# GFX12: v_sin_f16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+# GFX12-REAL16: v_sin_f16_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+# GFX12-FAKE16: v_sin_f16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x1b,0x00,0xff]
 
 0xfa,0xc0,0x0a,0x7e,0x01,0xe4,0x00,0xff
-# GFX12: v_sin_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+# GFX12-REAL16: v_sin_f16_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+# GFX12-FAKE16: v_sin_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0xe4,0x00,0xff]
 
 0xfa,0xc0,0x0a,0x7e,0x01,0x40,0x01,0xff
-# GFX12: v_sin_f16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x40,0x01,0xff]
+# GFX12-REAL16: v_sin_f16_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x40,0x01,0xff]
+# GFX12-FAKE16: v_sin_f16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x40,0x01,0xff]
 
 0xfa,0xc0,0x0a,0x7e,0x01,0x41,0x01,0xff
-# GFX12: v_sin_f16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x41,0x01,0xff]
+# GFX12-REAL16: v_sin_f16_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x41,0x01,0xff]
+# GFX12-FAKE16: v_sin_f16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x41,0x01,0xff]
 
 0xfa,0xc0,0x0a,0x7e,0x01,0x01,0x01,0xff
-# GFX12: v_sin_f16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x01,0x01,0xff]
+# GFX12-REAL16: v_sin_f16_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x01,0x01,0xff]
+# GFX12-FAKE16: v_sin_f16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x01,0x01,0xff]
 
 0xfa,0xc0,0x0a,0x7e,0x01,0x0f,0x01,0xff
-# GFX12: v_sin_f16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+# GFX12-REAL16: v_sin_f16_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+# GFX12-FAKE16: v_sin_f16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x0f,0x01,0xff]
 
 0xfa,0xc0,0x0a,0x7e,0x01,0x11,0x01,0xff
-# GFX12: v_sin_f16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x11,0x01,0xff]
+# GFX12-REAL16: v_sin_f16_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x11,0x01,0xff]
+# GFX12-FAKE16: v_sin_f16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x11,0x01,0xff]
 
 0xfa,0xc0,0x0a,0x7e,0x01,0x1f,0x01,0xff
-# GFX12: v_sin_f16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+# GFX12-REAL16: v_sin_f16_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+# GFX12-FAKE16: v_sin_f16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x1f,0x01,0xff]
 
 0xfa,0xc0,0x0a,0x7e,0x01,0x21,0x01,0xff
-# GFX12: v_sin_f16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x21,0x01,0xff]
+# GFX12-REAL16: v_sin_f16_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x21,0x01,0xff]
+# GFX12-FAKE16: v_sin_f16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x21,0x01,0xff]
 
 0xfa,0xc0,0x0a,0x7e,0x01,0x2f,0x01,0xff
-# GFX12: v_sin_f16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+# GFX12-REAL16: v_sin_f16_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+# GFX12-FAKE16: v_sin_f16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x2f,0x01,0xff]
 
 0xfa,0xc0,0x0a,0x7e,0x01,0x50,0x01,0xff
-# GFX12: v_sin_f16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x50,0x01,0xff]
+# GFX12-REAL16: v_sin_f16_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x50,0x01,0xff]
+# GFX12-FAKE16: v_sin_f16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x50,0x01,0xff]
 
 0xfa,0xc0,0x0a,0x7e,0x01,0x5f,0x01,0x01
-# GFX12: v_sin_f16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+# GFX12-REAL16: v_sin_f16_dpp v5.l, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+# GFX12-FAKE16: v_sin_f16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x5f,0x01,0x01]
 
 0xfa,0xc0,0x0a,0x7e,0x01,0x60,0x01,0x13
-# GFX12: v_sin_f16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x60,0x01,0x13]
+# GFX12-REAL16: v_sin_f16_dpp v5.l, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x60,0x01,0x13]
+# GFX12-FAKE16: v_sin_f16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x60,0x01,0x13]
 
 0xfa,0xc0,0xfe,0x7e,0x7f,0x6f,0x3d,0x30
-# GFX12: v_sin_f16_dpp v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xc0,0xfe,0x7e,0x7f,0x6f,0x3d,0x30]
+# GFX12-REAL16: v_sin_f16_dpp v127.l, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xc0,0xfe,0x7e,0x7f,0x6f,0x3d,0x30]
+# GFX12-FAKE16: v_sin_f16_dpp v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xc0,0xfe,0x7e,0x7f,0x6f,0x3d,0x30]
+
+0xfa,0xc0,0x0a,0x7f,0x81,0x60,0x01,0x13
+# GFX12-REAL16: v_sin_f16_dpp v5.h, v1.h row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xc0,0x0a,0x7f,0x81,0x60,0x01,0x13]
+# GFX12-FAKE16: v_mul_i32_i24_e32 v128, 1, v176         ; encoding: [0x81,0x60,0x01,0x13]
+
+0xfa,0xc0,0xfe,0x7f,0xff,0x6f,0x3d,0x30
+# GFX12-REAL16: v_sin_f16_dpp v127.h, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xc0,0xfe,0x7f,0xff,0x6f,0x3d,0x30]
+# GFX12-FAKE16: v_lshlrev_b32_e32 v30, v255, v183       ; encoding: [0xff,0x6f,0x3d,0x30]
 
 0xfa,0x6a,0x0a,0x7e,0x01,0x1b,0x00,0xff
 # GFX12: v_sin_f32_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x6a,0x0a,0x7e,0x01,0x1b,0x00,0xff]
@@ -2893,46 +3091,68 @@
 # GFX12: v_sqrt_f32_dpp v255, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0x66,0xfe,0x7f,0xff,0x6f,0x3d,0x30]
 
 0xfa,0xba,0x0a,0x7e,0x01,0x1b,0x00,0xff
-# GFX12: v_trunc_f16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+# GFX12-REAL16: v_trunc_f16_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+# GFX12-FAKE16: v_trunc_f16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x1b,0x00,0xff]
 
 0xfa,0xba,0x0a,0x7e,0x01,0xe4,0x00,0xff
-# GFX12: v_trunc_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+# GFX12-REAL16: v_trunc_f16_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+# GFX12-FAKE16: v_trunc_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0xe4,0x00,0xff]
 
 0xfa,0xba,0x0a,0x7e,0x01,0x40,0x01,0xff
-# GFX12: v_trunc_f16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x40,0x01,0xff]
+# GFX12-REAL16: v_trunc_f16_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x40,0x01,0xff]
+# GFX12-FAKE16: v_trunc_f16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x40,0x01,0xff]
 
 0xfa,0xba,0x0a,0x7e,0x01,0x41,0x01,0xff
-# GFX12: v_trunc_f16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x41,0x01,0xff]
+# GFX12-REAL16: v_trunc_f16_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x41,0x01,0xff]
+# GFX12-FAKE16: v_trunc_f16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x41,0x01,0xff]
 
 0xfa,0xba,0x0a,0x7e,0x01,0x01,0x01,0xff
-# GFX12: v_trunc_f16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x01,0x01,0xff]
+# GFX12-REAL16: v_trunc_f16_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x01,0x01,0xff]
+# GFX12-FAKE16: v_trunc_f16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x01,0x01,0xff]
 
 0xfa,0xba,0x0a,0x7e,0x01,0x0f,0x01,0xff
-# GFX12: v_trunc_f16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+# GFX12-REAL16: v_trunc_f16_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+# GFX12-FAKE16: v_trunc_f16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x0f,0x01,0xff]
 
 0xfa,0xba,0x0a,0x7e,0x01,0x11,0x01,0xff
-# GFX12: v_trunc_f16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x11,0x01,0xff]
+# GFX12-REAL16: v_trunc_f16_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x11,0x01,0xff]
+# GFX12-FAKE16: v_trunc_f16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x11,0x01,0xff]
 
 0xfa,0xba,0x0a,0x7e,0x01,0x1f,0x01,0xff
-# GFX12: v_trunc_f16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+# GFX12-REAL16: v_trunc_f16_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+# GFX12-FAKE16: v_trunc_f16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x1f,0x01,0xff]
 
 0xfa,0xba,0x0a,0x7e,0x01,0x21,0x01,0xff
-# GFX12: v_trunc_f16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x21,0x01,0xff]
+# GFX12-REAL16: v_trunc_f16_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x21,0x01,0xff]
+# GFX12-FAKE16: v_trunc_f16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x21,0x01,0xff]
 
 0xfa,0xba,0x0a,0x7e,0x01,0x2f,0x01,0xff
-# GFX12: v_trunc_f16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+# GFX12-REAL16: v_trunc_f16_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+# GFX12-FAKE16: v_trunc_f16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x2f,0x01,0xff]
 
 0xfa,0xba,0x0a,0x7e,0x01,0x50,0x01,0xff
-# GFX12: v_trunc_f16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x50,0x01,0xff]
+# GFX12-REAL16: v_trunc_f16_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x50,0x01,0xff]
+# GFX12-FAKE16: v_trunc_f16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x50,0x01,0xff]
 
 0xfa,0xba,0x0a,0x7e,0x01,0x5f,0x01,0x01
-# GFX12: v_trunc_f16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+# GFX12-REAL16: v_trunc_f16_dpp v5.l, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+# GFX12-FAKE16: v_trunc_f16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x5f,0x01,0x01]
 
 0xfa,0xba,0x0a,0x7e,0x01,0x60,0x01,0x13
-# GFX12: v_trunc_f16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x60,0x01,0x13]
+# GFX12-REAL16: v_trunc_f16_dpp v5.l, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x60,0x01,0x13]
+# GFX12-FAKE16: v_trunc_f16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x60,0x01,0x13]
 
 0xfa,0xba,0xfe,0x7e,0x7f,0x6f,0x3d,0x30
-# GFX12: v_trunc_f16_dpp v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xba,0xfe,0x7e,0x7f,0x6f,0x3d,0x30]
+# GFX12-REAL16: v_trunc_f16_dpp v127.l, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xba,0xfe,0x7e,0x7f,0x6f,0x3d,0x30]
+# GFX12-FAKE16: v_trunc_f16_dpp v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xba,0xfe,0x7e,0x7f,0x6f,0x3d,0x30]
+
+0xfa,0xba,0x0a,0x7f,0x81,0x60,0x01,0x13
+# GFX12-REAL16: v_trunc_f16_dpp v5.h, v1.h row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xba,0x0a,0x7f,0x81,0x60,0x01,0x13]
+# GFX12-FAKE16: v_mul_i32_i24_e32 v128, 1, v176         ; encoding: [0x81,0x60,0x01,0x13]
+
+0xfa,0xba,0xfe,0x7f,0xff,0x6f,0x3d,0x30
+# GFX12-REAL16: v_trunc_f16_dpp v127.h, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xba,0xfe,0x7f,0xff,0x6f,0x3d,0x30]
+# GFX12-FAKE16: v_lshlrev_b32_e32 v30, v255, v183       ; encoding: [0xff,0x6f,0x3d,0x30]
 
 0xfa,0x42,0x0a,0x7e,0x01,0x1b,0x00,0xff
 # GFX12: v_trunc_f32_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x42,0x0a,0x7e,0x01,0x1b,0x00,0xff]
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1_dpp8.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1_dpp8.txt
index 05008bf..7f9b268 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1_dpp8.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1_dpp8.txt
@@ -44,10 +44,19 @@
 # GFX12: v_clz_i32_u32_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0x72,0xfe,0x7f,0xff,0x00,0x00,0x00]
 
 0xe9,0xc2,0x0a,0x7e,0x01,0x77,0x39,0x05
-# GFX12: v_cos_f16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xc2,0x0a,0x7e,0x01,0x77,0x39,0x05]
+# GFX12-REAL16: v_cos_f16_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xc2,0x0a,0x7e,0x01,0x77,0x39,0x05]
+# GFX12-FAKE16: v_cos_f16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xc2,0x0a,0x7e,0x01,0x77,0x39,0x05]
 
 0xea,0xc2,0xfe,0x7e,0x7f,0x00,0x00,0x00
-# GFX12: v_cos_f16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xc2,0xfe,0x7e,0x7f,0x00,0x00,0x00]
+# GFX12-REAL16: v_cos_f16_dpp v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xc2,0xfe,0x7e,0x7f,0x00,0x00,0x00]
+# GFX12-FAKE16: v_cos_f16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xc2,0xfe,0x7e,0x7f,0x00,0x00,0x00]
+
+0xe9,0xc2,0x0a,0x7f,0x81,0x77,0x39,0x05
+# GFX12-REAL16: v_cos_f16_dpp v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xc2,0x0a,0x7f,0x81,0x77,0x39,0x05]
+# GFX12-FAKE16: v_add_f64_e32 v[156:157], v[129:130], v[187:188] ; encoding: [0x81,0x77,0x39,0x05]
+
+0xea,0xc2,0xfe,0x7f,0xff,0x00,0x00,0x00
+# GFX12-REAL16: v_cos_f16_dpp v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xc2,0xfe,0x7f,0xff,0x00,0x00,0x00]
 
 0xe9,0x6c,0x0a,0x7e,0x01,0x77,0x39,0x05
 # GFX12: v_cos_f32_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x6c,0x0a,0x7e,0x01,0x77,0x39,0x05]
@@ -198,10 +207,20 @@
 # GFX12: v_cvt_i32_f32_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0x10,0xfe,0x7f,0xff,0x00,0x00,0x00]
 
 0xe9,0xd4,0x0a,0x7e,0x01,0x77,0x39,0x05
-# GFX12: v_cvt_i32_i16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xd4,0x0a,0x7e,0x01,0x77,0x39,0x05]
+# GFX12-REAL16: v_cvt_i32_i16_dpp v5, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xd4,0x0a,0x7e,0x01,0x77,0x39,0x05]
+# GFX12-FAKE16: v_cvt_i32_i16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xd4,0x0a,0x7e,0x01,0x77,0x39,0x05]
 
 0xea,0xd4,0xfe,0x7f,0x7f,0x00,0x00,0x00
-# GFX12: v_cvt_i32_i16_dpp v255, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xd4,0xfe,0x7f,0x7f,0x00,0x00,0x00]
+# GFX12-REAL16: v_cvt_i32_i16_dpp v255, v127.l dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xd4,0xfe,0x7f,0x7f,0x00,0x00,0x00]
+# GFX12-FAKE16: v_cvt_i32_i16_dpp v255, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xd4,0xfe,0x7f,0x7f,0x00,0x00,0x00]
+
+0xe9,0xd4,0x0a,0x7e,0x81,0x77,0x39,0x05
+# GFX12-REAL16: v_cvt_i32_i16_dpp v5, v1.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xd4,0x0a,0x7e,0x81,0x77,0x39,0x05]
+# GFX12-FAKE16: v_cvt_i32_i16_dpp v5, v129/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xd4,0x0a,0x7e,0x81,0x77,0x39,0x05]
+
+0xea,0xd4,0xfe,0x7f,0xff,0x00,0x00,0x00
+# GFX12-REAL16: v_cvt_i32_i16_dpp v255, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xd4,0xfe,0x7f,0xff,0x00,0x00,0x00]
+# GFX12-FAKE16: v_cvt_i32_i16_dpp v255, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xd4,0xfe,0x7f,0xff,0x00,0x00,0x00]
 
 0xe9,0x18,0x0a,0x7e,0x01,0x77,0x39,0x05
 # GFX12: v_cvt_nearest_i32_f32_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x18,0x0a,0x7e,0x01,0x77,0x39,0x05]
@@ -267,10 +286,20 @@
 # GFX12: v_cvt_u32_f32_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0x0e,0xfe,0x7f,0xff,0x00,0x00,0x00]
 
 0xe9,0xd6,0x0a,0x7e,0x01,0x77,0x39,0x05
-# GFX12: v_cvt_u32_u16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xd6,0x0a,0x7e,0x01,0x77,0x39,0x05]
+# GFX12-REAL16: v_cvt_u32_u16_dpp v5, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xd6,0x0a,0x7e,0x01,0x77,0x39,0x05]
+# GFX12-FAKE16: v_cvt_u32_u16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xd6,0x0a,0x7e,0x01,0x77,0x39,0x05]
 
 0xea,0xd6,0xfe,0x7f,0x7f,0x00,0x00,0x00
-# GFX12: v_cvt_u32_u16_dpp v255, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xd6,0xfe,0x7f,0x7f,0x00,0x00,0x00]
+# GFX12-REAL16: v_cvt_u32_u16_dpp v255, v127.l dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xd6,0xfe,0x7f,0x7f,0x00,0x00,0x00]
+# GFX12-FAKE16: v_cvt_u32_u16_dpp v255, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xd6,0xfe,0x7f,0x7f,0x00,0x00,0x00]
+
+0xe9,0xd6,0x0a,0x7e,0x81,0x77,0x39,0x05
+# GFX12-REAL16: v_cvt_u32_u16_dpp v5, v1.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xd6,0x0a,0x7e,0x81,0x77,0x39,0x05]
+# GFX12-FAKE16: v_cvt_u32_u16_dpp v5, v129/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xd6,0x0a,0x7e,0x81,0x77,0x39,0x05]
+
+0xea,0xd6,0xfe,0x7f,0xff,0x00,0x00,0x00
+# GFX12-REAL16: v_cvt_u32_u16_dpp v255, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xd6,0xfe,0x7f,0xff,0x00,0x00,0x00]
+# GFX12-FAKE16: v_cvt_u32_u16_dpp v255, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xd6,0xfe,0x7f,0xff,0x00,0x00,0x00]
 
 0xe9,0xb0,0x0a,0x7e,0x01,0x77,0x39,0x05
 # GFX12-REAL16: v_exp_f16_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xb0,0x0a,0x7e,0x01,0x77,0x39,0x05]
@@ -315,10 +344,19 @@
 # GFX12: v_floor_f32_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0x48,0xfe,0x7f,0xff,0x00,0x00,0x00]
 
 0xe9,0xbe,0x0a,0x7e,0x01,0x77,0x39,0x05
-# GFX12: v_fract_f16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xbe,0x0a,0x7e,0x01,0x77,0x39,0x05]
+# GFX12-REAL16: v_fract_f16_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xbe,0x0a,0x7e,0x01,0x77,0x39,0x05]
+# GFX12-FAKE16: v_fract_f16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xbe,0x0a,0x7e,0x01,0x77,0x39,0x05]
 
 0xea,0xbe,0xfe,0x7e,0x7f,0x00,0x00,0x00
-# GFX12: v_fract_f16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xbe,0xfe,0x7e,0x7f,0x00,0x00,0x00]
+# GFX12-REAL16: v_fract_f16_dpp v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xbe,0xfe,0x7e,0x7f,0x00,0x00,0x00]
+# GFX12-FAKE16: v_fract_f16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xbe,0xfe,0x7e,0x7f,0x00,0x00,0x00]
+
+0xe9,0xbe,0x0a,0x7f,0x81,0x77,0x39,0x05
+# GFX12-REAL16: v_fract_f16_dpp v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xbe,0x0a,0x7f,0x81,0x77,0x39,0x05]
+# GFX12-FAKE16: v_add_f64_e32 v[156:157], v[129:130], v[187:188] ; encoding: [0x81,0x77,0x39,0x05]
+
+0xea,0xbe,0xfe,0x7f,0xff,0x00,0x00,0x00
+# GFX12-REAL16: v_fract_f16_dpp v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xbe,0xfe,0x7f,0xff,0x00,0x00,0x00]
 
 0xe9,0x40,0x0a,0x7e,0x01,0x77,0x39,0x05
 # GFX12: v_fract_f32_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x40,0x0a,0x7e,0x01,0x77,0x39,0x05]
@@ -348,10 +386,19 @@
 # GFX12: v_frexp_exp_i32_f32_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0x7e,0xfe,0x7f,0xff,0x00,0x00,0x00]
 
 0xe9,0xb2,0x0a,0x7e,0x01,0x77,0x39,0x05
-# GFX12: v_frexp_mant_f16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xb2,0x0a,0x7e,0x01,0x77,0x39,0x05]
+# GFX12-REAL16: v_frexp_mant_f16_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xb2,0x0a,0x7e,0x01,0x77,0x39,0x05]
+# GFX12-FAKE16: v_frexp_mant_f16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xb2,0x0a,0x7e,0x01,0x77,0x39,0x05]
 
 0xea,0xb2,0xfe,0x7e,0x7f,0x00,0x00,0x00
-# GFX12: v_frexp_mant_f16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xb2,0xfe,0x7e,0x7f,0x00,0x00,0x00]
+# GFX12-REAL16: v_frexp_mant_f16_dpp v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xb2,0xfe,0x7e,0x7f,0x00,0x00,0x00]
+# GFX12-FAKE16: v_frexp_mant_f16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xb2,0xfe,0x7e,0x7f,0x00,0x00,0x00]
+
+0xe9,0xb2,0x0a,0x7f,0x81,0x77,0x39,0x05
+# GFX12-REAL16: v_frexp_mant_f16_dpp v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xb2,0x0a,0x7f,0x81,0x77,0x39,0x05]
+# GFX12-FAKE16: v_add_f64_e32 v[156:157], v[129:130], v[187:188] ; encoding: [0x81,0x77,0x39,0x05]
+
+0xea,0xb2,0xfe,0x7f,0xff,0x00,0x00,0x00
+# GFX12-REAL16: v_frexp_mant_f16_dpp v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xb2,0xfe,0x7f,0xff,0x00,0x00,0x00]
 
 0xe9,0x80,0x0a,0x7e,0x01,0x77,0x39,0x05
 # GFX12: v_frexp_mant_f32_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x80,0x0a,0x7e,0x01,0x77,0x39,0x05]
@@ -411,10 +458,20 @@
 # GFX12: v_movrelsd_b32_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0x88,0xfe,0x7f,0xff,0x00,0x00,0x00]
 
 0xe9,0xd2,0x0a,0x7e,0x01,0x77,0x39,0x05
-# GFX12: v_not_b16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xd2,0x0a,0x7e,0x01,0x77,0x39,0x05]
+# GFX12-REAL16: v_not_b16_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xd2,0x0a,0x7e,0x01,0x77,0x39,0x05]
+# GFX12-FAKE16: v_not_b16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xd2,0x0a,0x7e,0x01,0x77,0x39,0x05]
 
 0xea,0xd2,0xfe,0x7e,0x7f,0x00,0x00,0x00
-# GFX12: v_not_b16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xd2,0xfe,0x7e,0x7f,0x00,0x00,0x00]
+# GFX12-REAL16: v_not_b16_dpp v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xd2,0xfe,0x7e,0x7f,0x00,0x00,0x00]
+# GFX12-FAKE16: v_not_b16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xd2,0xfe,0x7e,0x7f,0x00,0x00,0x00]
+
+0xe9,0xd2,0x0a,0x7f,0x81,0x77,0x39,0x05
+# GFX12-REAL16: v_not_b16_dpp v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xd2,0x0a,0x7f,0x81,0x77,0x39,0x05]
+# GFX12-FAKE16: v_add_f64_e32 v[156:157], v[129:130], v[187:188] ; encoding: [0x81,0x77,0x39,0x05]
+
+0xea,0xd2,0xfe,0x7f,0xff,0x00,0x00,0x00
+# GFX12-REAL16: v_not_b16_dpp v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xd2,0xfe,0x7f,0xff,0x00,0x00,0x00]
+
 
 0xe9,0x6e,0x0a,0x7e,0x01,0x77,0x39,0x05
 # GFX12: v_not_b32_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x6e,0x0a,0x7e,0x01,0x77,0x39,0x05]
@@ -450,10 +507,19 @@
 # GFX12: v_rcp_iflag_f32_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0x56,0xfe,0x7f,0xff,0x00,0x00,0x00]
 
 0xe9,0xbc,0x0a,0x7e,0x01,0x77,0x39,0x05
-# GFX12: v_rndne_f16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xbc,0x0a,0x7e,0x01,0x77,0x39,0x05]
+# GFX12-REAL16: v_rndne_f16_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xbc,0x0a,0x7e,0x01,0x77,0x39,0x05]
+# GFX12-FAKE16: v_rndne_f16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xbc,0x0a,0x7e,0x01,0x77,0x39,0x05]
 
 0xea,0xbc,0xfe,0x7e,0x7f,0x00,0x00,0x00
-# GFX12: v_rndne_f16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xbc,0xfe,0x7e,0x7f,0x00,0x00,0x00]
+# GFX12-REAL16: v_rndne_f16_dpp v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xbc,0xfe,0x7e,0x7f,0x00,0x00,0x00]
+# GFX12-FAKE16: v_rndne_f16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xbc,0xfe,0x7e,0x7f,0x00,0x00,0x00]
+
+0xe9,0xbc,0x0a,0x7f,0x81,0x77,0x39,0x05
+# GFX12-REAL16: v_rndne_f16_dpp v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xbc,0x0a,0x7f,0x81,0x77,0x39,0x05]
+# GFX12-FAKE16: v_add_f64_e32 v[156:157], v[129:130], v[187:188] ; encoding: [0x81,0x77,0x39,0x05]
+
+0xea,0xbc,0xfe,0x7f,0xff,0x00,0x00,0x00
+# GFX12-REAL16: v_rndne_f16_dpp v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xbc,0xfe,0x7f,0xff,0x00,0x00,0x00]
 
 0xe9,0x46,0x0a,0x7e,0x01,0x77,0x39,0x05
 # GFX12: v_rndne_f32_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x46,0x0a,0x7e,0x01,0x77,0x39,0x05]
@@ -483,16 +549,34 @@
 # GFX12: v_rsq_f32_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0x5c,0xfe,0x7f,0xff,0x00,0x00,0x00]
 
 0xe9,0xc4,0x0a,0x7e,0x01,0x77,0x39,0x05
-# GFX12: v_sat_pk_u8_i16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xc4,0x0a,0x7e,0x01,0x77,0x39,0x05]
+# GFX12-REAL16: v_sat_pk_u8_i16_dpp v5.l, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xc4,0x0a,0x7e,0x01,0x77,0x39,0x05]
+# GFX12-FAKE16: v_sat_pk_u8_i16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xc4,0x0a,0x7e,0x01,0x77,0x39,0x05]
 
 0xea,0xc4,0xfe,0x7e,0xff,0x00,0x00,0x00
-# GFX12: v_sat_pk_u8_i16_dpp v127, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xc4,0xfe,0x7e,0xff,0x00,0x00,0x00]
+# GFX12-REAL16: v_sat_pk_u8_i16_dpp v127.l, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xc4,0xfe,0x7e,0xff,0x00,0x00,0x00]
+# GFX12-FAKE16: v_sat_pk_u8_i16_dpp v127, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xc4,0xfe,0x7e,0xff,0x00,0x00,0x00]
+
+0xe9,0xc4,0x0a,0x7f,0x01,0x77,0x39,0x05
+# GFX12-REAL16: v_sat_pk_u8_i16_dpp v5.h, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xc4,0x0a,0x7f,0x01,0x77,0x39,0x05]
+# GFX12-FAKE16: v_add_f64_e32 v[156:157], v[1:2], v[187:188] ; encoding: [0x01,0x77,0x39,0x05]
+
+0xea,0xc4,0xfe,0x7f,0xff,0x00,0x00,0x00
+# GFX12-REAL16: v_sat_pk_u8_i16_dpp v127.h, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xc4,0xfe,0x7f,0xff,0x00,0x00,0x00]
 
 0xe9,0xc0,0x0a,0x7e,0x01,0x77,0x39,0x05
-# GFX12: v_sin_f16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xc0,0x0a,0x7e,0x01,0x77,0x39,0x05]
+# GFX12-REAL16: v_sin_f16_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xc0,0x0a,0x7e,0x01,0x77,0x39,0x05]
+# GFX12-FAKE16: v_sin_f16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xc0,0x0a,0x7e,0x01,0x77,0x39,0x05]
 
 0xea,0xc0,0xfe,0x7e,0x7f,0x00,0x00,0x00
-# GFX12: v_sin_f16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xc0,0xfe,0x7e,0x7f,0x00,0x00,0x00]
+# GFX12-REAL16: v_sin_f16_dpp v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xc0,0xfe,0x7e,0x7f,0x00,0x00,0x00]
+# GFX12-FAKE16: v_sin_f16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xc0,0xfe,0x7e,0x7f,0x00,0x00,0x00]
+
+0xe9,0xc0,0x0a,0x7f,0x81,0x77,0x39,0x05
+# GFX12-REAL16: v_sin_f16_dpp v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xc0,0x0a,0x7f,0x81,0x77,0x39,0x05]
+# GFX12-FAKE16: v_add_f64_e32 v[156:157], v[129:130], v[187:188] ; encoding: [0x81,0x77,0x39,0x05]
+
+0xea,0xc0,0xfe,0x7f,0xff,0x00,0x00,0x00
+# GFX12-REAL16: v_sin_f16_dpp v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xc0,0xfe,0x7f,0xff,0x00,0x00,0x00]
 
 0xe9,0x6a,0x0a,0x7e,0x01,0x77,0x39,0x05
 # GFX12: v_sin_f32_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x6a,0x0a,0x7e,0x01,0x77,0x39,0x05]
@@ -522,10 +606,19 @@
 # GFX12: v_sqrt_f32_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0x66,0xfe,0x7f,0xff,0x00,0x00,0x00]
 
 0xe9,0xba,0x0a,0x7e,0x01,0x77,0x39,0x05
-# GFX12: v_trunc_f16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xba,0x0a,0x7e,0x01,0x77,0x39,0x05]
+# GFX12-REAL16: v_trunc_f16_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xba,0x0a,0x7e,0x01,0x77,0x39,0x05]
+# GFX12-FAKE16: v_trunc_f16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xba,0x0a,0x7e,0x01,0x77,0x39,0x05]
 
 0xea,0xba,0xfe,0x7e,0x7f,0x00,0x00,0x00
-# GFX12: v_trunc_f16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xba,0xfe,0x7e,0x7f,0x00,0x00,0x00]
+# GFX12-REAL16: v_trunc_f16_dpp v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xba,0xfe,0x7e,0x7f,0x00,0x00,0x00]
+# GFX12-FAKE16: v_trunc_f16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xba,0xfe,0x7e,0x7f,0x00,0x00,0x00]
+
+0xe9,0xba,0x0a,0x7f,0x81,0x77,0x39,0x05
+# GFX12-REAL16: v_trunc_f16_dpp v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xba,0x0a,0x7f,0x81,0x77,0x39,0x05]
+# GFX12-FAKE16: v_add_f64_e32 v[156:157], v[129:130], v[187:188] ; encoding: [0x81,0x77,0x39,0x05]
+
+0xea,0xba,0xfe,0x7f,0xff,0x00,0x00,0x00
+# GFX12-REAL16: v_trunc_f16_dpp v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xba,0xfe,0x7f,0xff,0x00,0x00,0x00]
 
 0xe9,0x42,0x0a,0x7e,0x01,0x77,0x39,0x05
 # GFX12: v_trunc_f32_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x42,0x0a,0x7e,0x01,0x77,0x39,0x05]
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3.txt
index 4c2060a..633d3a4 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3.txt
@@ -2377,49 +2377,118 @@
 # GFX12: v_fma_dx9_zero_f32 v255, -|0xaf123456|, -|vcc_hi|, null clamp div:2 ; encoding: [0xff,0x83,0x09,0xd6,0xff,0xd6,0xf0,0x79,0x56,0x34,0x12,0xaf]
 
 0x05,0x00,0x48,0xd6,0x01,0x05,0x0e,0x00
-# GFX12: v_fma_f16 v5, v1, v2, s3                ; encoding: [0x05,0x00,0x48,0xd6,0x01,0x05,0x0e,0x00]
+# W32-REAL16: v_fma_f16 v5.l, v1.l, v2.l, s3          ; encoding: [0x05,0x00,0x48,0xd6,0x01,0x05,0x0e,0x00]
+# W32-FAKE16: v_fma_f16 v5, v1, v2, s3                ; encoding: [0x05,0x00,0x48,0xd6,0x01,0x05,0x0e,0x00]
+# W64-REAL16: v_fma_f16 v5.l, v1.l, v2.l, s3          ; encoding: [0x05,0x00,0x48,0xd6,0x01,0x05,0x0e,0x00]
+# W64-FAKE16: v_fma_f16 v5, v1, v2, s3                ; encoding: [0x05,0x00,0x48,0xd6,0x01,0x05,0x0e,0x00]
 
 0x05,0x00,0x48,0xd6,0xff,0x05,0xa4,0x01
-# GFX12: v_fma_f16 v5, v255, s2, s105            ; encoding: [0x05,0x00,0x48,0xd6,0xff,0x05,0xa4,0x01]
+# W32-REAL16: v_fma_f16 v5.l, v255.l, s2, s105        ; encoding: [0x05,0x00,0x48,0xd6,0xff,0x05,0xa4,0x01]
+# W32-FAKE16: v_fma_f16 v5, v255, s2, s105            ; encoding: [0x05,0x00,0x48,0xd6,0xff,0x05,0xa4,0x01]
+# W64-REAL16: v_fma_f16 v5.l, v255.l, s2, s105        ; encoding: [0x05,0x00,0x48,0xd6,0xff,0x05,0xa4,0x01]
+# W64-FAKE16: v_fma_f16 v5, v255, s2, s105            ; encoding: [0x05,0x00,0x48,0xd6,0xff,0x05,0xa4,0x01]
 
 0x05,0x00,0x48,0xd6,0x01,0xfe,0xff,0x01
-# GFX12: v_fma_f16 v5, s1, v255, exec_hi         ; encoding: [0x05,0x00,0x48,0xd6,0x01,0xfe,0xff,0x01]
+# W32-REAL16: v_fma_f16 v5.l, s1, v255.l, exec_hi     ; encoding: [0x05,0x00,0x48,0xd6,0x01,0xfe,0xff,0x01]
+# W32-FAKE16: v_fma_f16 v5, s1, v255, exec_hi         ; encoding: [0x05,0x00,0x48,0xd6,0x01,0xfe,0xff,0x01]
+# W64-REAL16: v_fma_f16 v5.l, s1, v255.l, exec_hi     ; encoding: [0x05,0x00,0x48,0xd6,0x01,0xfe,0xff,0x01]
+# W64-FAKE16: v_fma_f16 v5, s1, v255, exec_hi         ; encoding: [0x05,0x00,0x48,0xd6,0x01,0xfe,0xff,0x01]
 
 0x05,0x00,0x48,0xd6,0x69,0xd2,0xf8,0x01
-# GFX12: v_fma_f16 v5, s105, s105, exec_lo       ; encoding: [0x05,0x00,0x48,0xd6,0x69,0xd2,0xf8,0x01]
+# W32-REAL16: v_fma_f16 v5.l, s105, s105, exec_lo     ; encoding: [0x05,0x00,0x48,0xd6,0x69,0xd2,0xf8,0x01]
+# W32-FAKE16: v_fma_f16 v5, s105, s105, exec_lo       ; encoding: [0x05,0x00,0x48,0xd6,0x69,0xd2,0xf8,0x01]
+# W64-REAL16: v_fma_f16 v5.l, s105, s105, exec_lo     ; encoding: [0x05,0x00,0x48,0xd6,0x69,0xd2,0xf8,0x01]
+# W64-FAKE16: v_fma_f16 v5, s105, s105, exec_lo       ; encoding: [0x05,0x00,0x48,0xd6,0x69,0xd2,0xf8,0x01]
 
 0x05,0x00,0x48,0xd6,0x6a,0xf6,0x0c,0x04
-# GFX12: v_fma_f16 v5, vcc_lo, ttmp15, v3        ; encoding: [0x05,0x00,0x48,0xd6,0x6a,0xf6,0x0c,0x04]
+# W32-REAL16: v_fma_f16 v5.l, vcc_lo, ttmp15, v3.l    ; encoding: [0x05,0x00,0x48,0xd6,0x6a,0xf6,0x0c,0x04]
+# W32-FAKE16: v_fma_f16 v5, vcc_lo, ttmp15, v3        ; encoding: [0x05,0x00,0x48,0xd6,0x6a,0xf6,0x0c,0x04]
+# W64-REAL16: v_fma_f16 v5.l, vcc_lo, ttmp15, v3.l    ; encoding: [0x05,0x00,0x48,0xd6,0x6a,0xf6,0x0c,0x04]
+# W64-FAKE16: v_fma_f16 v5, vcc_lo, ttmp15, v3        ; encoding: [0x05,0x00,0x48,0xd6,0x6a,0xf6,0x0c,0x04]
 
 0x05,0x00,0x48,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00
-# GFX12: v_fma_f16 v5, vcc_hi, 0xfe0b, v255      ; encoding: [0x05,0x00,0x48,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
+# W32-REAL16: v_fma_f16 v5.l, vcc_hi, 0xfe0b, v255.l  ; encoding: [0x05,0x00,0x48,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
+# W32-FAKE16: v_fma_f16 v5, vcc_hi, 0xfe0b, v255      ; encoding: [0x05,0x00,0x48,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
+# W64-REAL16: v_fma_f16 v5.l, vcc_hi, 0xfe0b, v255.l  ; encoding: [0x05,0x00,0x48,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
+# W64-FAKE16: v_fma_f16 v5, vcc_hi, 0xfe0b, v255      ; encoding: [0x05,0x00,0x48,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
 
 0x05,0x07,0x48,0xd6,0x7b,0xfa,0xed,0xe1
-# GFX12: v_fma_f16 v5, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x48,0xd6,0x7b,0xfa,0xed,0xe1]
+# W32-REAL16: v_fma_f16 v5.l, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x48,0xd6,0x7b,0xfa,0xed,0xe1]
+# W32-FAKE16: v_fma_f16 v5, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x48,0xd6,0x7b,0xfa,0xed,0xe1]
+# W64-REAL16: v_fma_f16 v5.l, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x48,0xd6,0x7b,0xfa,0xed,0xe1]
+# W64-FAKE16: v_fma_f16 v5, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x48,0xd6,0x7b,0xfa,0xed,0xe1]
 
 0x05,0x00,0x48,0xd6,0x7d,0xe0,0xf5,0x01
-# GFX12: v_fma_f16 v5, m0, 0.5, m0               ; encoding: [0x05,0x00,0x48,0xd6,0x7d,0xe0,0xf5,0x01]
+# W32-REAL16: v_fma_f16 v5.l, m0, 0.5, m0             ; encoding: [0x05,0x00,0x48,0xd6,0x7d,0xe0,0xf5,0x01]
+# W32-FAKE16: v_fma_f16 v5, m0, 0.5, m0               ; encoding: [0x05,0x00,0x48,0xd6,0x7d,0xe0,0xf5,0x01]
+# W64-REAL16: v_fma_f16 v5.l, m0, 0.5, m0             ; encoding: [0x05,0x00,0x48,0xd6,0x7d,0xe0,0xf5,0x01]
+# W64-FAKE16: v_fma_f16 v5, m0, 0.5, m0               ; encoding: [0x05,0x00,0x48,0xd6,0x7d,0xe0,0xf5,0x01]
 
 0x05,0x01,0x48,0xd6,0x7e,0x82,0xad,0x01
-# GFX12: v_fma_f16 v5, |exec_lo|, -1, vcc_hi     ; encoding: [0x05,0x01,0x48,0xd6,0x7e,0x82,0xad,0x01]
+# W32-REAL16: v_fma_f16 v5.l, |exec_lo|, -1, vcc_hi   ; encoding: [0x05,0x01,0x48,0xd6,0x7e,0x82,0xad,0x01]
+# W32-FAKE16: v_fma_f16 v5, |exec_lo|, -1, vcc_hi     ; encoding: [0x05,0x01,0x48,0xd6,0x7e,0x82,0xad,0x01]
+# W64-REAL16: v_fma_f16 v5.l, |exec_lo|, -1, vcc_hi   ; encoding: [0x05,0x01,0x48,0xd6,0x7e,0x82,0xad,0x01]
+# W64-FAKE16: v_fma_f16 v5, |exec_lo|, -1, vcc_hi     ; encoding: [0x05,0x01,0x48,0xd6,0x7e,0x82,0xad,0x01]
 
 0x05,0x05,0x48,0xd6,0x7f,0xf8,0xa8,0xa1
-# GFX12: v_fma_f16 v5, -|exec_hi|, null, -|vcc_lo| ; encoding: [0x05,0x05,0x48,0xd6,0x7f,0xf8,0xa8,0xa1]
+# W32-REAL16: v_fma_f16 v5.l, -|exec_hi|, null, -|vcc_lo| ; encoding: [0x05,0x05,0x48,0xd6,0x7f,0xf8,0xa8,0xa1]
+# W32-FAKE16: v_fma_f16 v5, -|exec_hi|, null, -|vcc_lo| ; encoding: [0x05,0x05,0x48,0xd6,0x7f,0xf8,0xa8,0xa1]
+# W64-REAL16: v_fma_f16 v5.l, -|exec_hi|, null, -|vcc_lo| ; encoding: [0x05,0x05,0x48,0xd6,0x7f,0xf8,0xa8,0xa1]
+# W64-FAKE16: v_fma_f16 v5, -|exec_hi|, null, -|vcc_lo| ; encoding: [0x05,0x05,0x48,0xd6,0x7f,0xf8,0xa8,0xa1]
 
 0x05,0x7c,0x48,0xd6,0x7c,0xfc,0xfc,0x83,0x0b,0xfe,0x00,0x00
-# GFX12: v_fma_f16 v5, null, exec_lo, -|0xfe0b| op_sel:[1,1,1,1] ; encoding: [0x05,0x7c,0x48,0xd6,0x7c,0xfc,0xfc,0x83,0x0b,0xfe,0x00,0x00]
+# W32-REAL16: v_fma_f16 v5.h, null, exec_lo, -|0xfe0b| op_sel:[1,1,1,1] ; encoding: [0x05,0x7c,0x48,0xd6,0x7c,0xfc,0xfc,0x83,0x0b,0xfe,0x00,0x00]
+# W32-FAKE16: v_fma_f16 v5, null, exec_lo, -|0xfe0b| op_sel:[1,1,1,1] ; encoding: [0x05,0x7c,0x48,0xd6,0x7c,0xfc,0xfc,0x83,0x0b,0xfe,0x00,0x00]
+# W64-REAL16: v_fma_f16 v5.h, null, exec_lo, -|0xfe0b| op_sel:[1,1,1,1] ; encoding: [0x05,0x7c,0x48,0xd6,0x7c,0xfc,0xfc,0x83,0x0b,0xfe,0x00,0x00]
+# W64-FAKE16: v_fma_f16 v5, null, exec_lo, -|0xfe0b| op_sel:[1,1,1,1] ; encoding: [0x05,0x7c,0x48,0xd6,0x7c,0xfc,0xfc,0x83,0x0b,0xfe,0x00,0x00]
 
 0x05,0x0e,0x48,0xd6,0xc1,0xfe,0xf4,0xc3
-# GFX12: v_fma_f16 v5, -1, -|exec_hi|, -|src_scc| op_sel:[1,0,0,0] ; encoding: [0x05,0x0e,0x48,0xd6,0xc1,0xfe,0xf4,0xc3]
+# W32-REAL16: v_fma_f16 v5.l, -1, -|exec_hi|, -|src_scc| op_sel:[1,0,0,0] ; encoding: [0x05,0x0e,0x48,0xd6,0xc1,0xfe,0xf4,0xc3]
+# W32-FAKE16: v_fma_f16 v5, -1, -|exec_hi|, -|src_scc| op_sel:[1,0,0,0] ; encoding: [0x05,0x0e,0x48,0xd6,0xc1,0xfe,0xf4,0xc3]
+# W64-REAL16: v_fma_f16 v5.l, -1, -|exec_hi|, -|src_scc| op_sel:[1,0,0,0] ; encoding: [0x05,0x0e,0x48,0xd6,0xc1,0xfe,0xf4,0xc3]
+# W64-FAKE16: v_fma_f16 v5, -1, -|exec_hi|, -|src_scc| op_sel:[1,0,0,0] ; encoding: [0x05,0x0e,0x48,0xd6,0xc1,0xfe,0xf4,0xc3]
 
 0x05,0x10,0x48,0xd6,0xf0,0xfa,0xc0,0x43
-# GFX12: v_fma_f16 v5, 0.5, -m0, 0.5 op_sel:[0,1,0,0] ; encoding: [0x05,0x10,0x48,0xd6,0xf0,0xfa,0xc0,0x43]
+# W32-REAL16: v_fma_f16 v5.l, 0.5, -m0, 0.5 op_sel:[0,1,0,0] ; encoding: [0x05,0x10,0x48,0xd6,0xf0,0xfa,0xc0,0x43]
+# W32-FAKE16: v_fma_f16 v5, 0.5, -m0, 0.5 op_sel:[0,1,0,0] ; encoding: [0x05,0x10,0x48,0xd6,0xf0,0xfa,0xc0,0x43]
+# W64-REAL16: v_fma_f16 v5.l, 0.5, -m0, 0.5 op_sel:[0,1,0,0] ; encoding: [0x05,0x10,0x48,0xd6,0xf0,0xfa,0xc0,0x43]
+# W64-FAKE16: v_fma_f16 v5, 0.5, -m0, 0.5 op_sel:[0,1,0,0] ; encoding: [0x05,0x10,0x48,0xd6,0xf0,0xfa,0xc0,0x43]
 
 0x05,0x22,0x48,0xd6,0xfd,0xd4,0x04,0x23
-# GFX12: v_fma_f16 v5, -src_scc, |vcc_lo|, -1 op_sel:[0,0,1,0] ; encoding: [0x05,0x22,0x48,0xd6,0xfd,0xd4,0x04,0x23]
+# W32-REAL16: v_fma_f16 v5.l, -src_scc, |vcc_lo|, -1 op_sel:[0,0,1,0] ; encoding: [0x05,0x22,0x48,0xd6,0xfd,0xd4,0x04,0x23]
+# W32-FAKE16: v_fma_f16 v5, -src_scc, |vcc_lo|, -1 op_sel:[0,0,1,0] ; encoding: [0x05,0x22,0x48,0xd6,0xfd,0xd4,0x04,0x23]
+# W64-REAL16: v_fma_f16 v5.l, -src_scc, |vcc_lo|, -1 op_sel:[0,0,1,0] ; encoding: [0x05,0x22,0x48,0xd6,0xfd,0xd4,0x04,0x23]
+# W64-FAKE16: v_fma_f16 v5, -src_scc, |vcc_lo|, -1 op_sel:[0,0,1,0] ; encoding: [0x05,0x22,0x48,0xd6,0xfd,0xd4,0x04,0x23]
 
 0xff,0xc3,0x48,0xd6,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00
-# GFX12: v_fma_f16 v255, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp ; encoding: [0xff,0xc3,0x48,0xd6,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00]
+# W32-REAL16: v_fma_f16 v255.h, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp ; encoding: [0xff,0xc3,0x48,0xd6,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00]
+# W32-FAKE16: v_fma_f16 v255, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp ; encoding: [0xff,0xc3,0x48,0xd6,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00]
+# W64-REAL16: v_fma_f16 v255.h, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp ; encoding: [0xff,0xc3,0x48,0xd6,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00]
+# W64-FAKE16: v_fma_f16 v255, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp ; encoding: [0xff,0xc3,0x48,0xd6,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00]
+
+0x05,0x08,0x48,0xd6,0xff,0x05,0xa4,0x01
+# W32-REAL16: v_fma_f16 v5.l, v255.h, s2, s105 op_sel:[1,0,0,0] ; encoding: [0x05,0x08,0x48,0xd6,0xff,0x05,0xa4,0x01]
+# W32-FAKE16: v_fma_f16 v5, v255, s2, s105 op_sel:[1,0,0,0] ; encoding: [0x05,0x08,0x48,0xd6,0xff,0x05,0xa4,0x01]
+# W64-REAL16: v_fma_f16 v5.l, v255.h, s2, s105 op_sel:[1,0,0,0] ; encoding: [0x05,0x08,0x48,0xd6,0xff,0x05,0xa4,0x01]
+# W64-FAKE16: v_fma_f16 v5, v255, s2, s105 op_sel:[1,0,0,0] ; encoding: [0x05,0x08,0x48,0xd6,0xff,0x05,0xa4,0x01]
+
+0x05,0x10,0x48,0xd6,0x01,0xfe,0xff,0x01
+# W32-REAL16: v_fma_f16 v5.l, s1, v255.h, exec_hi op_sel:[0,1,0,0] ; encoding: [0x05,0x10,0x48,0xd6,0x01,0xfe,0xff,0x01]
+# W32-FAKE16: v_fma_f16 v5, s1, v255, exec_hi op_sel:[0,1,0,0] ; encoding: [0x05,0x10,0x48,0xd6,0x01,0xfe,0xff,0x01]
+# W64-REAL16: v_fma_f16 v5.l, s1, v255.h, exec_hi op_sel:[0,1,0,0] ; encoding: [0x05,0x10,0x48,0xd6,0x01,0xfe,0xff,0x01]
+# W64-FAKE16: v_fma_f16 v5, s1, v255, exec_hi op_sel:[0,1,0,0] ; encoding: [0x05,0x10,0x48,0xd6,0x01,0xfe,0xff,0x01]
+
+0x05,0x20,0x48,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00
+# W32-REAL16: v_fma_f16 v5.l, vcc_hi, 0xfe0b, v255.h op_sel:[0,0,1,0] ; encoding: [0x05,0x20,0x48,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
+# W32-FAKE16: v_fma_f16 v5, vcc_hi, 0xfe0b, v255 op_sel:[0,0,1,0] ; encoding: [0x05,0x20,0x48,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
+# W64-REAL16: v_fma_f16 v5.l, vcc_hi, 0xfe0b, v255.h op_sel:[0,0,1,0] ; encoding: [0x05,0x20,0x48,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
+# W64-FAKE16: v_fma_f16 v5, vcc_hi, 0xfe0b, v255 op_sel:[0,0,1,0] ; encoding: [0x05,0x20,0x48,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
+
+0xff,0xc3,0x48,0xd6,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00
+# W32-REAL16: v_fma_f16 v255.h, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp ; encoding: [0xff,0xc3,0x48,0xd6,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00]
+# W32-FAKE16: v_fma_f16 v255, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp ; encoding: [0xff,0xc3,0x48,0xd6,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00]
+# W64-REAL16: v_fma_f16 v255.h, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp ; encoding: [0xff,0xc3,0x48,0xd6,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00]
+# W64-FAKE16: v_fma_f16 v255, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp ; encoding: [0xff,0xc3,0x48,0xd6,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00]
 
 0x05,0x00,0x13,0xd6,0x01,0x05,0x0e,0x00
 # GFX12: v_fma_f32 v5, v1, v2, s3                ; encoding: [0x05,0x00,0x13,0xd6,0x01,0x05,0x0e,0x00]
@@ -3509,49 +3578,100 @@
 # GFX12: v_mad_co_u64_u32 v[254:255], null, 0xaf123456, vcc_hi, 0.5 clamp ; encoding: [0xfe,0xfc,0xfe,0xd6,0xff,0xd6,0xc0,0x03,0x56,0x34,0x12,0xaf]
 
 0x05,0x00,0x2c,0xd6,0x01,0x05,0x0e,0x00
-# GFX12: v_max3_num_f16 v5, v1, v2, s3           ; encoding: [0x05,0x00,0x2c,0xd6,0x01,0x05,0x0e,0x00]
+# W32-REAL16: v_max3_num_f16 v5.l, v1.l, v2.l, s3     ; encoding: [0x05,0x00,0x2c,0xd6,0x01,0x05,0x0e,0x00]
+# W32-FAKE16: v_max3_num_f16 v5, v1, v2, s3           ; encoding: [0x05,0x00,0x2c,0xd6,0x01,0x05,0x0e,0x00]
+# W64-REAL16: v_max3_num_f16 v5.l, v1.l, v2.l, s3     ; encoding: [0x05,0x00,0x2c,0xd6,0x01,0x05,0x0e,0x00]
+# W64-FAKE16: v_max3_num_f16 v5, v1, v2, s3           ; encoding: [0x05,0x00,0x2c,0xd6,0x01,0x05,0x0e,0x00]
 
 0x05,0x00,0x2c,0xd6,0xff,0x05,0xa4,0x01
-# GFX12: v_max3_num_f16 v5, v255, s2, s105       ; encoding: [0x05,0x00,0x2c,0xd6,0xff,0x05,0xa4,0x01]
+# W32-REAL16: v_max3_num_f16 v5.l, v255.l, s2, s105   ; encoding: [0x05,0x00,0x2c,0xd6,0xff,0x05,0xa4,0x01]
+# W32-FAKE16: v_max3_num_f16 v5, v255, s2, s105       ; encoding: [0x05,0x00,0x2c,0xd6,0xff,0x05,0xa4,0x01]
+# W64-REAL16: v_max3_num_f16 v5.l, v255.l, s2, s105   ; encoding: [0x05,0x00,0x2c,0xd6,0xff,0x05,0xa4,0x01]
+# W64-FAKE16: v_max3_num_f16 v5, v255, s2, s105       ; encoding: [0x05,0x00,0x2c,0xd6,0xff,0x05,0xa4,0x01]
 
 0x05,0x00,0x2c,0xd6,0x01,0xfe,0xff,0x01
-# GFX12: v_max3_num_f16 v5, s1, v255, exec_hi    ; encoding: [0x05,0x00,0x2c,0xd6,0x01,0xfe,0xff,0x01]
+# W32-REAL16: v_max3_num_f16 v5.l, s1, v255.l, exec_hi ; encoding: [0x05,0x00,0x2c,0xd6,0x01,0xfe,0xff,0x01]
+# W32-FAKE16: v_max3_num_f16 v5, s1, v255, exec_hi    ; encoding: [0x05,0x00,0x2c,0xd6,0x01,0xfe,0xff,0x01]
+# W64-REAL16: v_max3_num_f16 v5.l, s1, v255.l, exec_hi ; encoding: [0x05,0x00,0x2c,0xd6,0x01,0xfe,0xff,0x01]
+# W64-FAKE16: v_max3_num_f16 v5, s1, v255, exec_hi    ; encoding: [0x05,0x00,0x2c,0xd6,0x01,0xfe,0xff,0x01]
 
 0x05,0x00,0x2c,0xd6,0x69,0xd2,0xf8,0x01
-# GFX12: v_max3_num_f16 v5, s105, s105, exec_lo  ; encoding: [0x05,0x00,0x2c,0xd6,0x69,0xd2,0xf8,0x01]
+# W32-REAL16: v_max3_num_f16 v5.l, s105, s105, exec_lo ; encoding: [0x05,0x00,0x2c,0xd6,0x69,0xd2,0xf8,0x01]
+# W32-FAKE16: v_max3_num_f16 v5, s105, s105, exec_lo  ; encoding: [0x05,0x00,0x2c,0xd6,0x69,0xd2,0xf8,0x01]
+# W64-REAL16: v_max3_num_f16 v5.l, s105, s105, exec_lo ; encoding: [0x05,0x00,0x2c,0xd6,0x69,0xd2,0xf8,0x01]
+# W64-FAKE16: v_max3_num_f16 v5, s105, s105, exec_lo  ; encoding: [0x05,0x00,0x2c,0xd6,0x69,0xd2,0xf8,0x01]
 
 0x05,0x00,0x2c,0xd6,0x6a,0xf6,0x0c,0x04
-# GFX12: v_max3_num_f16 v5, vcc_lo, ttmp15, v3   ; encoding: [0x05,0x00,0x2c,0xd6,0x6a,0xf6,0x0c,0x04]
+# W32-REAL16: v_max3_num_f16 v5.l, vcc_lo, ttmp15, v3.l ; encoding: [0x05,0x00,0x2c,0xd6,0x6a,0xf6,0x0c,0x04]
+# W32-FAKE16: v_max3_num_f16 v5, vcc_lo, ttmp15, v3   ; encoding: [0x05,0x00,0x2c,0xd6,0x6a,0xf6,0x0c,0x04]
+# W64-REAL16: v_max3_num_f16 v5.l, vcc_lo, ttmp15, v3.l ; encoding: [0x05,0x00,0x2c,0xd6,0x6a,0xf6,0x0c,0x04]
+# W64-FAKE16: v_max3_num_f16 v5, vcc_lo, ttmp15, v3   ; encoding: [0x05,0x00,0x2c,0xd6,0x6a,0xf6,0x0c,0x04]
 
 0x05,0x00,0x2c,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00
-# GFX12: v_max3_num_f16 v5, vcc_hi, 0xfe0b, v255 ; encoding: [0x05,0x00,0x2c,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
+# W32-REAL16: v_max3_num_f16 v5.l, vcc_hi, 0xfe0b, v255.l ; encoding: [0x05,0x00,0x2c,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
+# W32-FAKE16: v_max3_num_f16 v5, vcc_hi, 0xfe0b, v255 ; encoding: [0x05,0x00,0x2c,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
+# W64-REAL16: v_max3_num_f16 v5.l, vcc_hi, 0xfe0b, v255.l ; encoding: [0x05,0x00,0x2c,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
+# W64-FAKE16: v_max3_num_f16 v5, vcc_hi, 0xfe0b, v255 ; encoding: [0x05,0x00,0x2c,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
 
 0x05,0x07,0x2c,0xd6,0x7b,0xfa,0xed,0xe1
-# GFX12: v_max3_num_f16 v5, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x2c,0xd6,0x7b,0xfa,0xed,0xe1]
+# W32-REAL16: v_max3_num_f16 v5.l, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x2c,0xd6,0x7b,0xfa,0xed,0xe1]
+# W32-FAKE16: v_max3_num_f16 v5, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x2c,0xd6,0x7b,0xfa,0xed,0xe1]
+# W64-REAL16: v_max3_num_f16 v5.l, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x2c,0xd6,0x7b,0xfa,0xed,0xe1]
+# W64-FAKE16: v_max3_num_f16 v5, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x2c,0xd6,0x7b,0xfa,0xed,0xe1]
 
 0x05,0x00,0x2c,0xd6,0x7d,0xe0,0xf5,0x01
-# GFX12: v_max3_num_f16 v5, m0, 0.5, m0          ; encoding: [0x05,0x00,0x2c,0xd6,0x7d,0xe0,0xf5,0x01]
+# W32-REAL16: v_max3_num_f16 v5.l, m0, 0.5, m0        ; encoding: [0x05,0x00,0x2c,0xd6,0x7d,0xe0,0xf5,0x01]
+# W32-FAKE16: v_max3_num_f16 v5, m0, 0.5, m0          ; encoding: [0x05,0x00,0x2c,0xd6,0x7d,0xe0,0xf5,0x01]
+# W64-REAL16: v_max3_num_f16 v5.l, m0, 0.5, m0        ; encoding: [0x05,0x00,0x2c,0xd6,0x7d,0xe0,0xf5,0x01]
+# W64-FAKE16: v_max3_num_f16 v5, m0, 0.5, m0          ; encoding: [0x05,0x00,0x2c,0xd6,0x7d,0xe0,0xf5,0x01]
 
 0x05,0x01,0x2c,0xd6,0x7e,0x82,0xad,0x01
-# GFX12: v_max3_num_f16 v5, |exec_lo|, -1, vcc_hi ; encoding: [0x05,0x01,0x2c,0xd6,0x7e,0x82,0xad,0x01]
+# W32-REAL16: v_max3_num_f16 v5.l, |exec_lo|, -1, vcc_hi ; encoding: [0x05,0x01,0x2c,0xd6,0x7e,0x82,0xad,0x01]
+# W32-FAKE16: v_max3_num_f16 v5, |exec_lo|, -1, vcc_hi ; encoding: [0x05,0x01,0x2c,0xd6,0x7e,0x82,0xad,0x01]
+# W64-REAL16: v_max3_num_f16 v5.l, |exec_lo|, -1, vcc_hi ; encoding: [0x05,0x01,0x2c,0xd6,0x7e,0x82,0xad,0x01]
+# W64-FAKE16: v_max3_num_f16 v5, |exec_lo|, -1, vcc_hi ; encoding: [0x05,0x01,0x2c,0xd6,0x7e,0x82,0xad,0x01]
 
 0x05,0x05,0x2c,0xd6,0x7f,0xf8,0xa8,0xa1
-# GFX12: v_max3_num_f16 v5, -|exec_hi|, null, -|vcc_lo| ; encoding: [0x05,0x05,0x2c,0xd6,0x7f,0xf8,0xa8,0xa1]
+# W32-REAL16: v_max3_num_f16 v5.l, -|exec_hi|, null, -|vcc_lo| ; encoding: [0x05,0x05,0x2c,0xd6,0x7f,0xf8,0xa8,0xa1]
+# W32-FAKE16: v_max3_num_f16 v5, -|exec_hi|, null, -|vcc_lo| ; encoding: [0x05,0x05,0x2c,0xd6,0x7f,0xf8,0xa8,0xa1]
+# W64-REAL16: v_max3_num_f16 v5.l, -|exec_hi|, null, -|vcc_lo| ; encoding: [0x05,0x05,0x2c,0xd6,0x7f,0xf8,0xa8,0xa1]
+# W64-FAKE16: v_max3_num_f16 v5, -|exec_hi|, null, -|vcc_lo| ; encoding: [0x05,0x05,0x2c,0xd6,0x7f,0xf8,0xa8,0xa1]
 
 0x05,0x7c,0x2c,0xd6,0x7c,0xfc,0xfc,0x83,0x0b,0xfe,0x00,0x00
-# GFX12: v_max3_num_f16 v5, null, exec_lo, -|0xfe0b| op_sel:[1,1,1,1] ; encoding: [0x05,0x7c,0x2c,0xd6,0x7c,0xfc,0xfc,0x83,0x0b,0xfe,0x00,0x00]
+# W32-REAL16: v_max3_num_f16 v5.h, null, exec_lo, -|0xfe0b| op_sel:[1,1,1,1] ; encoding: [0x05,0x7c,0x2c,0xd6,0x7c,0xfc,0xfc,0x83,0x0b,0xfe,0x00,0x00]
+# W32-FAKE16: v_max3_num_f16 v5, null, exec_lo, -|0xfe0b| op_sel:[1,1,1,1] ; encoding: [0x05,0x7c,0x2c,0xd6,0x7c,0xfc,0xfc,0x83,0x0b,0xfe,0x00,0x00]
+# W64-REAL16: v_max3_num_f16 v5.h, null, exec_lo, -|0xfe0b| op_sel:[1,1,1,1] ; encoding: [0x05,0x7c,0x2c,0xd6,0x7c,0xfc,0xfc,0x83,0x0b,0xfe,0x00,0x00]
+# W64-FAKE16: v_max3_num_f16 v5, null, exec_lo, -|0xfe0b| op_sel:[1,1,1,1] ; encoding: [0x05,0x7c,0x2c,0xd6,0x7c,0xfc,0xfc,0x83,0x0b,0xfe,0x00,0x00]
 
 0x05,0x0e,0x2c,0xd6,0xc1,0xfe,0xf4,0xc3
-# GFX12: v_max3_num_f16 v5, -1, -|exec_hi|, -|src_scc| op_sel:[1,0,0,0] ; encoding: [0x05,0x0e,0x2c,0xd6,0xc1,0xfe,0xf4,0xc3]
+# W32-REAL16: v_max3_num_f16 v5.l, -1, -|exec_hi|, -|src_scc| op_sel:[1,0,0,0] ; encoding: [0x05,0x0e,0x2c,0xd6,0xc1,0xfe,0xf4,0xc3]
+# W32-FAKE16: v_max3_num_f16 v5, -1, -|exec_hi|, -|src_scc| op_sel:[1,0,0,0] ; encoding: [0x05,0x0e,0x2c,0xd6,0xc1,0xfe,0xf4,0xc3]
+# W64-REAL16: v_max3_num_f16 v5.l, -1, -|exec_hi|, -|src_scc| op_sel:[1,0,0,0] ; encoding: [0x05,0x0e,0x2c,0xd6,0xc1,0xfe,0xf4,0xc3]
+# W64-FAKE16: v_max3_num_f16 v5, -1, -|exec_hi|, -|src_scc| op_sel:[1,0,0,0] ; encoding: [0x05,0x0e,0x2c,0xd6,0xc1,0xfe,0xf4,0xc3]
 
 0x05,0x10,0x2c,0xd6,0xf0,0xfa,0xc0,0x43
-# GFX12: v_max3_num_f16 v5, 0.5, -m0, 0.5 op_sel:[0,1,0,0] ; encoding: [0x05,0x10,0x2c,0xd6,0xf0,0xfa,0xc0,0x43]
+# W32-REAL16: v_max3_num_f16 v5.l, 0.5, -m0, 0.5 op_sel:[0,1,0,0] ; encoding: [0x05,0x10,0x2c,0xd6,0xf0,0xfa,0xc0,0x43]
+# W32-FAKE16: v_max3_num_f16 v5, 0.5, -m0, 0.5 op_sel:[0,1,0,0] ; encoding: [0x05,0x10,0x2c,0xd6,0xf0,0xfa,0xc0,0x43]
+# W64-REAL16: v_max3_num_f16 v5.l, 0.5, -m0, 0.5 op_sel:[0,1,0,0] ; encoding: [0x05,0x10,0x2c,0xd6,0xf0,0xfa,0xc0,0x43]
+# W64-FAKE16: v_max3_num_f16 v5, 0.5, -m0, 0.5 op_sel:[0,1,0,0] ; encoding: [0x05,0x10,0x2c,0xd6,0xf0,0xfa,0xc0,0x43]
 
 0x05,0x22,0x2c,0xd6,0xfd,0xd4,0x04,0x23
-# GFX12: v_max3_num_f16 v5, -src_scc, |vcc_lo|, -1 op_sel:[0,0,1,0] ; encoding: [0x05,0x22,0x2c,0xd6,0xfd,0xd4,0x04,0x23]
+# W32-REAL16: v_max3_num_f16 v5.l, -src_scc, |vcc_lo|, -1 op_sel:[0,0,1,0] ; encoding: [0x05,0x22,0x2c,0xd6,0xfd,0xd4,0x04,0x23]
+# W32-FAKE16: v_max3_num_f16 v5, -src_scc, |vcc_lo|, -1 op_sel:[0,0,1,0] ; encoding: [0x05,0x22,0x2c,0xd6,0xfd,0xd4,0x04,0x23]
+# W64-REAL16: v_max3_num_f16 v5.l, -src_scc, |vcc_lo|, -1 op_sel:[0,0,1,0] ; encoding: [0x05,0x22,0x2c,0xd6,0xfd,0xd4,0x04,0x23]
+# W64-FAKE16: v_max3_num_f16 v5, -src_scc, |vcc_lo|, -1 op_sel:[0,0,1,0] ; encoding: [0x05,0x22,0x2c,0xd6,0xfd,0xd4,0x04,0x23]
 
 0xff,0xc3,0x2c,0xd6,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00
-# GFX12: v_max3_num_f16 v255, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp ; encoding: [0xff,0xc3,0x2c,0xd6,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00]
+# W32-REAL16: v_max3_num_f16 v255.h, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp ; encoding: [0xff,0xc3,0x2c,0xd6,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00]
+# W32-FAKE16: v_max3_num_f16 v255, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp ; encoding: [0xff,0xc3,0x2c,0xd6,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00]
+# W64-REAL16: v_max3_num_f16 v255.h, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp ; encoding: [0xff,0xc3,0x2c,0xd6,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00]
+# W64-FAKE16: v_max3_num_f16 v255, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp ; encoding: [0xff,0xc3,0x2c,0xd6,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00]
+
+0x05,0x20,0x2c,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00
+# W32-REAL16: v_max3_num_f16 v5.l, vcc_hi, 0xfe0b, v255.h op_sel:[0,0,1,0] ; encoding: [0x05,0x20,0x2c,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
+# W32-FAKE16: v_max3_num_f16 v5, vcc_hi, 0xfe0b, v255 op_sel:[0,0,1,0] ; encoding: [0x05,0x20,0x2c,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
+# W64-REAL16: v_max3_num_f16 v5.l, vcc_hi, 0xfe0b, v255.h op_sel:[0,0,1,0] ; encoding: [0x05,0x20,0x2c,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
+# W64-FAKE16: v_max3_num_f16 v5, vcc_hi, 0xfe0b, v255 op_sel:[0,0,1,0] ; encoding: [0x05,0x20,0x2c,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
 
 0x05,0x00,0x2a,0xd6,0x01,0x05,0x0e,0x00
 # GFX12: v_max3_num_f32 v5, v1, v2, s3           ; encoding: [0x05,0x00,0x2a,0xd6,0x01,0x05,0x0e,0x00]
@@ -4145,49 +4265,120 @@
 # W64-FAKE16: v_max_u16 v255, 0xfe0b, vcc_hi          ; encoding: [0xff,0x00,0x09,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
 
 0x05,0x00,0x6b,0xd6,0x01,0x05,0x0e,0x00
-# GFX12: v_maxmin_num_f16 v5, v1, v2, s3         ; encoding: [0x05,0x00,0x6b,0xd6,0x01,0x05,0x0e,0x00]
+# W32-REAL16: v_maxmin_num_f16 v5.l, v1.l, v2.l, s3   ; encoding: [0x05,0x00,0x6b,0xd6,0x01,0x05,0x0e,0x00]
+# W32-FAKE16: v_maxmin_num_f16 v5, v1, v2, s3         ; encoding: [0x05,0x00,0x6b,0xd6,0x01,0x05,0x0e,0x00]
+# W64-REAL16: v_maxmin_num_f16 v5.l, v1.l, v2.l, s3   ; encoding: [0x05,0x00,0x6b,0xd6,0x01,0x05,0x0e,0x00]
+# W64-FAKE16: v_maxmin_num_f16 v5, v1, v2, s3         ; encoding: [0x05,0x00,0x6b,0xd6,0x01,0x05,0x0e,0x00]
 
 0x05,0x00,0x6b,0xd6,0xff,0x05,0xa4,0x01
-# GFX12: v_maxmin_num_f16 v5, v255, s2, s105     ; encoding: [0x05,0x00,0x6b,0xd6,0xff,0x05,0xa4,0x01]
+# W32-REAL16: v_maxmin_num_f16 v5.l, v255.l, s2, s105 ; encoding: [0x05,0x00,0x6b,0xd6,0xff,0x05,0xa4,0x01]
+# W32-FAKE16: v_maxmin_num_f16 v5, v255, s2, s105     ; encoding: [0x05,0x00,0x6b,0xd6,0xff,0x05,0xa4,0x01]
+# W64-REAL16: v_maxmin_num_f16 v5.l, v255.l, s2, s105 ; encoding: [0x05,0x00,0x6b,0xd6,0xff,0x05,0xa4,0x01]
+# W64-FAKE16: v_maxmin_num_f16 v5, v255, s2, s105     ; encoding: [0x05,0x00,0x6b,0xd6,0xff,0x05,0xa4,0x01]
 
 0x05,0x00,0x6b,0xd6,0x01,0xfe,0xff,0x01
-# GFX12: v_maxmin_num_f16 v5, s1, v255, exec_hi  ; encoding: [0x05,0x00,0x6b,0xd6,0x01,0xfe,0xff,0x01]
+# W32-REAL16: v_maxmin_num_f16 v5.l, s1, v255.l, exec_hi ; encoding: [0x05,0x00,0x6b,0xd6,0x01,0xfe,0xff,0x01]
+# W32-FAKE16: v_maxmin_num_f16 v5, s1, v255, exec_hi  ; encoding: [0x05,0x00,0x6b,0xd6,0x01,0xfe,0xff,0x01]
+# W64-REAL16: v_maxmin_num_f16 v5.l, s1, v255.l, exec_hi ; encoding: [0x05,0x00,0x6b,0xd6,0x01,0xfe,0xff,0x01]
+# W64-FAKE16: v_maxmin_num_f16 v5, s1, v255, exec_hi  ; encoding: [0x05,0x00,0x6b,0xd6,0x01,0xfe,0xff,0x01]
 
 0x05,0x00,0x6b,0xd6,0x69,0xd2,0xf8,0x01
-# GFX12: v_maxmin_num_f16 v5, s105, s105, exec_lo ; encoding: [0x05,0x00,0x6b,0xd6,0x69,0xd2,0xf8,0x01]
+# W32-REAL16: v_maxmin_num_f16 v5.l, s105, s105, exec_lo ; encoding: [0x05,0x00,0x6b,0xd6,0x69,0xd2,0xf8,0x01]
+# W32-FAKE16: v_maxmin_num_f16 v5, s105, s105, exec_lo ; encoding: [0x05,0x00,0x6b,0xd6,0x69,0xd2,0xf8,0x01]
+# W64-REAL16: v_maxmin_num_f16 v5.l, s105, s105, exec_lo ; encoding: [0x05,0x00,0x6b,0xd6,0x69,0xd2,0xf8,0x01]
+# W64-FAKE16: v_maxmin_num_f16 v5, s105, s105, exec_lo ; encoding: [0x05,0x00,0x6b,0xd6,0x69,0xd2,0xf8,0x01]
 
 0x05,0x00,0x6b,0xd6,0x6a,0xf6,0x0c,0x04
-# GFX12: v_maxmin_num_f16 v5, vcc_lo, ttmp15, v3 ; encoding: [0x05,0x00,0x6b,0xd6,0x6a,0xf6,0x0c,0x04]
+# W32-REAL16: v_maxmin_num_f16 v5.l, vcc_lo, ttmp15, v3.l ; encoding: [0x05,0x00,0x6b,0xd6,0x6a,0xf6,0x0c,0x04]
+# W32-FAKE16: v_maxmin_num_f16 v5, vcc_lo, ttmp15, v3 ; encoding: [0x05,0x00,0x6b,0xd6,0x6a,0xf6,0x0c,0x04]
+# W64-REAL16: v_maxmin_num_f16 v5.l, vcc_lo, ttmp15, v3.l ; encoding: [0x05,0x00,0x6b,0xd6,0x6a,0xf6,0x0c,0x04]
+# W64-FAKE16: v_maxmin_num_f16 v5, vcc_lo, ttmp15, v3 ; encoding: [0x05,0x00,0x6b,0xd6,0x6a,0xf6,0x0c,0x04]
 
 0x05,0x00,0x6b,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00
-# GFX12: v_maxmin_num_f16 v5, vcc_hi, 0xfe0b, v255 ; encoding: [0x05,0x00,0x6b,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
+# W32-REAL16: v_maxmin_num_f16 v5.l, vcc_hi, 0xfe0b, v255.l ; encoding: [0x05,0x00,0x6b,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
+# W32-FAKE16: v_maxmin_num_f16 v5, vcc_hi, 0xfe0b, v255 ; encoding: [0x05,0x00,0x6b,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
+# W64-REAL16: v_maxmin_num_f16 v5.l, vcc_hi, 0xfe0b, v255.l ; encoding: [0x05,0x00,0x6b,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
+# W64-FAKE16: v_maxmin_num_f16 v5, vcc_hi, 0xfe0b, v255 ; encoding: [0x05,0x00,0x6b,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
 
 0x05,0x07,0x6b,0xd6,0x7b,0xfa,0xed,0xe1
-# GFX12: v_maxmin_num_f16 v5, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x6b,0xd6,0x7b,0xfa,0xed,0xe1]
+# W32-REAL16: v_maxmin_num_f16 v5.l, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x6b,0xd6,0x7b,0xfa,0xed,0xe1]
+# W32-FAKE16: v_maxmin_num_f16 v5, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x6b,0xd6,0x7b,0xfa,0xed,0xe1]
+# W64-REAL16: v_maxmin_num_f16 v5.l, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x6b,0xd6,0x7b,0xfa,0xed,0xe1]
+# W64-FAKE16: v_maxmin_num_f16 v5, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x6b,0xd6,0x7b,0xfa,0xed,0xe1]
 
 0x05,0x00,0x6b,0xd6,0x7d,0xe0,0xf5,0x01
-# GFX12: v_maxmin_num_f16 v5, m0, 0.5, m0        ; encoding: [0x05,0x00,0x6b,0xd6,0x7d,0xe0,0xf5,0x01]
+# W32-REAL16: v_maxmin_num_f16 v5.l, m0, 0.5, m0      ; encoding: [0x05,0x00,0x6b,0xd6,0x7d,0xe0,0xf5,0x01]
+# W32-FAKE16: v_maxmin_num_f16 v5, m0, 0.5, m0        ; encoding: [0x05,0x00,0x6b,0xd6,0x7d,0xe0,0xf5,0x01]
+# W64-REAL16: v_maxmin_num_f16 v5.l, m0, 0.5, m0      ; encoding: [0x05,0x00,0x6b,0xd6,0x7d,0xe0,0xf5,0x01]
+# W64-FAKE16: v_maxmin_num_f16 v5, m0, 0.5, m0        ; encoding: [0x05,0x00,0x6b,0xd6,0x7d,0xe0,0xf5,0x01]
 
 0x05,0x01,0x6b,0xd6,0x7e,0x82,0xad,0x01
-# GFX12: v_maxmin_num_f16 v5, |exec_lo|, -1, vcc_hi ; encoding: [0x05,0x01,0x6b,0xd6,0x7e,0x82,0xad,0x01]
+# W32-REAL16: v_maxmin_num_f16 v5.l, |exec_lo|, -1, vcc_hi ; encoding: [0x05,0x01,0x6b,0xd6,0x7e,0x82,0xad,0x01]
+# W32-FAKE16: v_maxmin_num_f16 v5, |exec_lo|, -1, vcc_hi ; encoding: [0x05,0x01,0x6b,0xd6,0x7e,0x82,0xad,0x01]
+# W64-REAL16: v_maxmin_num_f16 v5.l, |exec_lo|, -1, vcc_hi ; encoding: [0x05,0x01,0x6b,0xd6,0x7e,0x82,0xad,0x01]
+# W64-FAKE16: v_maxmin_num_f16 v5, |exec_lo|, -1, vcc_hi ; encoding: [0x05,0x01,0x6b,0xd6,0x7e,0x82,0xad,0x01]
 
 0x05,0x05,0x6b,0xd6,0x7f,0xf8,0xa8,0xa1
-# GFX12: v_maxmin_num_f16 v5, -|exec_hi|, null, -|vcc_lo| ; encoding: [0x05,0x05,0x6b,0xd6,0x7f,0xf8,0xa8,0xa1]
+# W32-REAL16: v_maxmin_num_f16 v5.l, -|exec_hi|, null, -|vcc_lo| ; encoding: [0x05,0x05,0x6b,0xd6,0x7f,0xf8,0xa8,0xa1]
+# W32-FAKE16: v_maxmin_num_f16 v5, -|exec_hi|, null, -|vcc_lo| ; encoding: [0x05,0x05,0x6b,0xd6,0x7f,0xf8,0xa8,0xa1]
+# W64-REAL16: v_maxmin_num_f16 v5.l, -|exec_hi|, null, -|vcc_lo| ; encoding: [0x05,0x05,0x6b,0xd6,0x7f,0xf8,0xa8,0xa1]
+# W64-FAKE16: v_maxmin_num_f16 v5, -|exec_hi|, null, -|vcc_lo| ; encoding: [0x05,0x05,0x6b,0xd6,0x7f,0xf8,0xa8,0xa1]
 
 0x05,0x04,0x6b,0xd6,0x7c,0xfc,0xfc,0x83,0x0b,0xfe,0x00,0x00
-# GFX12: v_maxmin_num_f16 v5, null, exec_lo, -|0xfe0b| ; encoding: [0x05,0x04,0x6b,0xd6,0x7c,0xfc,0xfc,0x83,0x0b,0xfe,0x00,0x00]
+# W32-REAL16: v_maxmin_num_f16 v5.l, null, exec_lo, -|0xfe0b| ; encoding: [0x05,0x04,0x6b,0xd6,0x7c,0xfc,0xfc,0x83,0x0b,0xfe,0x00,0x00]
+# W32-FAKE16: v_maxmin_num_f16 v5, null, exec_lo, -|0xfe0b| ; encoding: [0x05,0x04,0x6b,0xd6,0x7c,0xfc,0xfc,0x83,0x0b,0xfe,0x00,0x00]
+# W64-REAL16: v_maxmin_num_f16 v5.l, null, exec_lo, -|0xfe0b| ; encoding: [0x05,0x04,0x6b,0xd6,0x7c,0xfc,0xfc,0x83,0x0b,0xfe,0x00,0x00]
+# W64-FAKE16: v_maxmin_num_f16 v5, null, exec_lo, -|0xfe0b| ; encoding: [0x05,0x04,0x6b,0xd6,0x7c,0xfc,0xfc,0x83,0x0b,0xfe,0x00,0x00]
 
 0x05,0x06,0x6b,0xd6,0xc1,0xfe,0xf4,0xc3
-# GFX12: v_maxmin_num_f16 v5, -1, -|exec_hi|, -|src_scc| ; encoding: [0x05,0x06,0x6b,0xd6,0xc1,0xfe,0xf4,0xc3]
+# W32-REAL16: v_maxmin_num_f16 v5.l, -1, -|exec_hi|, -|src_scc| ; encoding: [0x05,0x06,0x6b,0xd6,0xc1,0xfe,0xf4,0xc3]
+# W32-FAKE16: v_maxmin_num_f16 v5, -1, -|exec_hi|, -|src_scc| ; encoding: [0x05,0x06,0x6b,0xd6,0xc1,0xfe,0xf4,0xc3]
+# W64-REAL16: v_maxmin_num_f16 v5.l, -1, -|exec_hi|, -|src_scc| ; encoding: [0x05,0x06,0x6b,0xd6,0xc1,0xfe,0xf4,0xc3]
+# W64-FAKE16: v_maxmin_num_f16 v5, -1, -|exec_hi|, -|src_scc| ; encoding: [0x05,0x06,0x6b,0xd6,0xc1,0xfe,0xf4,0xc3]
 
 0x05,0x00,0x6b,0xd6,0xf0,0xfa,0xc0,0x4b
-# GFX12: v_maxmin_num_f16 v5, 0.5, -m0, 0.5 mul:2 ; encoding: [0x05,0x00,0x6b,0xd6,0xf0,0xfa,0xc0,0x4b]
+# W32-REAL16: v_maxmin_num_f16 v5.l, 0.5, -m0, 0.5 mul:2 ; encoding: [0x05,0x00,0x6b,0xd6,0xf0,0xfa,0xc0,0x4b]
+# W32-FAKE16: v_maxmin_num_f16 v5, 0.5, -m0, 0.5 mul:2 ; encoding: [0x05,0x00,0x6b,0xd6,0xf0,0xfa,0xc0,0x4b]
+# W64-REAL16: v_maxmin_num_f16 v5.l, 0.5, -m0, 0.5 mul:2 ; encoding: [0x05,0x00,0x6b,0xd6,0xf0,0xfa,0xc0,0x4b]
+# W64-FAKE16: v_maxmin_num_f16 v5, 0.5, -m0, 0.5 mul:2 ; encoding: [0x05,0x00,0x6b,0xd6,0xf0,0xfa,0xc0,0x4b]
 
 0x05,0x02,0x6b,0xd6,0xfd,0xd4,0x04,0x33
-# GFX12: v_maxmin_num_f16 v5, -src_scc, |vcc_lo|, -1 mul:4 ; encoding: [0x05,0x02,0x6b,0xd6,0xfd,0xd4,0x04,0x33]
+# W32-REAL16: v_maxmin_num_f16 v5.l, -src_scc, |vcc_lo|, -1 mul:4 ; encoding: [0x05,0x02,0x6b,0xd6,0xfd,0xd4,0x04,0x33]
+# W32-FAKE16: v_maxmin_num_f16 v5, -src_scc, |vcc_lo|, -1 mul:4 ; encoding: [0x05,0x02,0x6b,0xd6,0xfd,0xd4,0x04,0x33]
+# W64-REAL16: v_maxmin_num_f16 v5.l, -src_scc, |vcc_lo|, -1 mul:4 ; encoding: [0x05,0x02,0x6b,0xd6,0xfd,0xd4,0x04,0x33]
+# W64-FAKE16: v_maxmin_num_f16 v5, -src_scc, |vcc_lo|, -1 mul:4 ; encoding: [0x05,0x02,0x6b,0xd6,0xfd,0xd4,0x04,0x33]
 
 0xff,0x83,0x6b,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00
-# GFX12: v_maxmin_num_f16 v255, -|0xfe0b|, -|vcc_hi|, null clamp div:2 ; encoding: [0xff,0x83,0x6b,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00]
+# W32-REAL16: v_maxmin_num_f16 v255.l, -|0xfe0b|, -|vcc_hi|, null clamp div:2 ; encoding: [0xff,0x83,0x6b,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00]
+# W32-FAKE16: v_maxmin_num_f16 v255, -|0xfe0b|, -|vcc_hi|, null clamp div:2 ; encoding: [0xff,0x83,0x6b,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00]
+# W64-REAL16: v_maxmin_num_f16 v255.l, -|0xfe0b|, -|vcc_hi|, null clamp div:2 ; encoding: [0xff,0x83,0x6b,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00]
+# W64-FAKE16: v_maxmin_num_f16 v255, -|0xfe0b|, -|vcc_hi|, null clamp div:2 ; encoding: [0xff,0x83,0x6b,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00]
+
+0x05,0x08,0x6b,0xd6,0xff,0x05,0xa4,0x01
+# W32-REAL16: v_maxmin_num_f16 v5.l, v255.h, s2, s105 op_sel:[1,0,0,0] ; encoding: [0x05,0x08,0x6b,0xd6,0xff,0x05,0xa4,0x01]
+# W32-FAKE16: v_maxmin_num_f16 v5, v255, s2, s105 op_sel:[1,0,0,0] ; encoding: [0x05,0x08,0x6b,0xd6,0xff,0x05,0xa4,0x01]
+# W64-REAL16: v_maxmin_num_f16 v5.l, v255.h, s2, s105 op_sel:[1,0,0,0] ; encoding: [0x05,0x08,0x6b,0xd6,0xff,0x05,0xa4,0x01]
+# W64-FAKE16: v_maxmin_num_f16 v5, v255, s2, s105 op_sel:[1,0,0,0] ; encoding: [0x05,0x08,0x6b,0xd6,0xff,0x05,0xa4,0x01]
+
+
+0x05,0x10,0x6b,0xd6,0x01,0xfe,0xff,0x01
+# W32-REAL16: v_maxmin_num_f16 v5.l, s1, v255.h, exec_hi op_sel:[0,1,0,0] ; encoding: [0x05,0x10,0x6b,0xd6,0x01,0xfe,0xff,0x01]
+# W32-FAKE16: v_maxmin_num_f16 v5, s1, v255, exec_hi op_sel:[0,1,0,0] ; encoding: [0x05,0x10,0x6b,0xd6,0x01,0xfe,0xff,0x01]
+# W64-REAL16: v_maxmin_num_f16 v5.l, s1, v255.h, exec_hi op_sel:[0,1,0,0] ; encoding: [0x05,0x10,0x6b,0xd6,0x01,0xfe,0xff,0x01]
+# W64-FAKE16: v_maxmin_num_f16 v5, s1, v255, exec_hi op_sel:[0,1,0,0] ; encoding: [0x05,0x10,0x6b,0xd6,0x01,0xfe,0xff,0x01]
+
+
+0x05,0x20,0x6b,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00
+# W32-REAL16: v_maxmin_num_f16 v5.l, vcc_hi, 0xfe0b, v255.h op_sel:[0,0,1,0] ; encoding: [0x05,0x20,0x6b,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
+# W32-FAKE16: v_maxmin_num_f16 v5, vcc_hi, 0xfe0b, v255 op_sel:[0,0,1,0] ; encoding: [0x05,0x20,0x6b,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
+# W64-REAL16: v_maxmin_num_f16 v5.l, vcc_hi, 0xfe0b, v255.h op_sel:[0,0,1,0] ; encoding: [0x05,0x20,0x6b,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
+# W64-FAKE16: v_maxmin_num_f16 v5, vcc_hi, 0xfe0b, v255 op_sel:[0,0,1,0] ; encoding: [0x05,0x20,0x6b,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
+
+0xff,0xc3,0x6b,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00
+# W32-REAL16: v_maxmin_num_f16 v255.h, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp div:2 ; encoding: [0xff,0xc3,0x6b,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00]
+# W32-FAKE16: v_maxmin_num_f16 v255, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp div:2 ; encoding: [0xff,0xc3,0x6b,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00]
+# W64-REAL16: v_maxmin_num_f16 v255.h, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp div:2 ; encoding: [0xff,0xc3,0x6b,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00]
+# W64-FAKE16: v_maxmin_num_f16 v255, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp div:2 ; encoding: [0xff,0xc3,0x6b,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00]
 
 0x05,0x00,0x69,0xd6,0x01,0x05,0x0e,0x00
 # GFX12: v_maxmin_num_f32 v5, v1, v2, s3         ; encoding: [0x05,0x00,0x69,0xd6,0x01,0x05,0x0e,0x00]
@@ -4886,49 +5077,100 @@
 # GFX12: v_med3_u32 v255, 0xaf123456, vcc_hi, null ; encoding: [0xff,0x00,0x21,0xd6,0xff,0xd6,0xf0,0x01,0x56,0x34,0x12,0xaf]
 
 0x05,0x00,0x2b,0xd6,0x01,0x05,0x0e,0x00
-# GFX12: v_min3_num_f16 v5, v1, v2, s3           ; encoding: [0x05,0x00,0x2b,0xd6,0x01,0x05,0x0e,0x00]
+# W32-REAL16: v_min3_num_f16 v5.l, v1.l, v2.l, s3     ; encoding: [0x05,0x00,0x2b,0xd6,0x01,0x05,0x0e,0x00]
+# W32-FAKE16: v_min3_num_f16 v5, v1, v2, s3           ; encoding: [0x05,0x00,0x2b,0xd6,0x01,0x05,0x0e,0x00]
+# W64-REAL16: v_min3_num_f16 v5.l, v1.l, v2.l, s3     ; encoding: [0x05,0x00,0x2b,0xd6,0x01,0x05,0x0e,0x00]
+# W64-FAKE16: v_min3_num_f16 v5, v1, v2, s3           ; encoding: [0x05,0x00,0x2b,0xd6,0x01,0x05,0x0e,0x00]
 
 0x05,0x00,0x2b,0xd6,0xff,0x05,0xa4,0x01
-# GFX12: v_min3_num_f16 v5, v255, s2, s105       ; encoding: [0x05,0x00,0x2b,0xd6,0xff,0x05,0xa4,0x01]
+# W32-REAL16: v_min3_num_f16 v5.l, v255.l, s2, s105   ; encoding: [0x05,0x00,0x2b,0xd6,0xff,0x05,0xa4,0x01]
+# W32-FAKE16: v_min3_num_f16 v5, v255, s2, s105       ; encoding: [0x05,0x00,0x2b,0xd6,0xff,0x05,0xa4,0x01]
+# W64-REAL16: v_min3_num_f16 v5.l, v255.l, s2, s105   ; encoding: [0x05,0x00,0x2b,0xd6,0xff,0x05,0xa4,0x01]
+# W64-FAKE16: v_min3_num_f16 v5, v255, s2, s105       ; encoding: [0x05,0x00,0x2b,0xd6,0xff,0x05,0xa4,0x01]
 
 0x05,0x00,0x2b,0xd6,0x01,0xfe,0xff,0x01
-# GFX12: v_min3_num_f16 v5, s1, v255, exec_hi    ; encoding: [0x05,0x00,0x2b,0xd6,0x01,0xfe,0xff,0x01]
+# W32-REAL16: v_min3_num_f16 v5.l, s1, v255.l, exec_hi ; encoding: [0x05,0x00,0x2b,0xd6,0x01,0xfe,0xff,0x01]
+# W32-FAKE16: v_min3_num_f16 v5, s1, v255, exec_hi    ; encoding: [0x05,0x00,0x2b,0xd6,0x01,0xfe,0xff,0x01]
+# W64-REAL16: v_min3_num_f16 v5.l, s1, v255.l, exec_hi ; encoding: [0x05,0x00,0x2b,0xd6,0x01,0xfe,0xff,0x01]
+# W64-FAKE16: v_min3_num_f16 v5, s1, v255, exec_hi    ; encoding: [0x05,0x00,0x2b,0xd6,0x01,0xfe,0xff,0x01]
 
 0x05,0x00,0x2b,0xd6,0x69,0xd2,0xf8,0x01
-# GFX12: v_min3_num_f16 v5, s105, s105, exec_lo  ; encoding: [0x05,0x00,0x2b,0xd6,0x69,0xd2,0xf8,0x01]
+# W32-REAL16: v_min3_num_f16 v5.l, s105, s105, exec_lo ; encoding: [0x05,0x00,0x2b,0xd6,0x69,0xd2,0xf8,0x01]
+# W32-FAKE16: v_min3_num_f16 v5, s105, s105, exec_lo  ; encoding: [0x05,0x00,0x2b,0xd6,0x69,0xd2,0xf8,0x01]
+# W64-REAL16: v_min3_num_f16 v5.l, s105, s105, exec_lo ; encoding: [0x05,0x00,0x2b,0xd6,0x69,0xd2,0xf8,0x01]
+# W64-FAKE16: v_min3_num_f16 v5, s105, s105, exec_lo  ; encoding: [0x05,0x00,0x2b,0xd6,0x69,0xd2,0xf8,0x01]
 
 0x05,0x00,0x2b,0xd6,0x6a,0xf6,0x0c,0x04
-# GFX12: v_min3_num_f16 v5, vcc_lo, ttmp15, v3   ; encoding: [0x05,0x00,0x2b,0xd6,0x6a,0xf6,0x0c,0x04]
+# W32-REAL16: v_min3_num_f16 v5.l, vcc_lo, ttmp15, v3.l ; encoding: [0x05,0x00,0x2b,0xd6,0x6a,0xf6,0x0c,0x04]
+# W32-FAKE16: v_min3_num_f16 v5, vcc_lo, ttmp15, v3   ; encoding: [0x05,0x00,0x2b,0xd6,0x6a,0xf6,0x0c,0x04]
+# W64-REAL16: v_min3_num_f16 v5.l, vcc_lo, ttmp15, v3.l ; encoding: [0x05,0x00,0x2b,0xd6,0x6a,0xf6,0x0c,0x04]
+# W64-FAKE16: v_min3_num_f16 v5, vcc_lo, ttmp15, v3   ; encoding: [0x05,0x00,0x2b,0xd6,0x6a,0xf6,0x0c,0x04]
 
 0x05,0x00,0x2b,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00
-# GFX12: v_min3_num_f16 v5, vcc_hi, 0xfe0b, v255 ; encoding: [0x05,0x00,0x2b,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
+# W32-REAL16: v_min3_num_f16 v5.l, vcc_hi, 0xfe0b, v255.l ; encoding: [0x05,0x00,0x2b,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
+# W32-FAKE16: v_min3_num_f16 v5, vcc_hi, 0xfe0b, v255 ; encoding: [0x05,0x00,0x2b,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
+# W64-REAL16: v_min3_num_f16 v5.l, vcc_hi, 0xfe0b, v255.l ; encoding: [0x05,0x00,0x2b,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
+# W64-FAKE16: v_min3_num_f16 v5, vcc_hi, 0xfe0b, v255 ; encoding: [0x05,0x00,0x2b,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
 
 0x05,0x07,0x2b,0xd6,0x7b,0xfa,0xed,0xe1
-# GFX12: v_min3_num_f16 v5, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x2b,0xd6,0x7b,0xfa,0xed,0xe1]
+# W32-REAL16: v_min3_num_f16 v5.l, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x2b,0xd6,0x7b,0xfa,0xed,0xe1]
+# W32-FAKE16: v_min3_num_f16 v5, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x2b,0xd6,0x7b,0xfa,0xed,0xe1]
+# W64-REAL16: v_min3_num_f16 v5.l, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x2b,0xd6,0x7b,0xfa,0xed,0xe1]
+# W64-FAKE16: v_min3_num_f16 v5, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x2b,0xd6,0x7b,0xfa,0xed,0xe1]
 
 0x05,0x00,0x2b,0xd6,0x7d,0xe0,0xf5,0x01
-# GFX12: v_min3_num_f16 v5, m0, 0.5, m0          ; encoding: [0x05,0x00,0x2b,0xd6,0x7d,0xe0,0xf5,0x01]
+# W32-REAL16: v_min3_num_f16 v5.l, m0, 0.5, m0        ; encoding: [0x05,0x00,0x2b,0xd6,0x7d,0xe0,0xf5,0x01]
+# W32-FAKE16: v_min3_num_f16 v5, m0, 0.5, m0          ; encoding: [0x05,0x00,0x2b,0xd6,0x7d,0xe0,0xf5,0x01]
+# W64-REAL16: v_min3_num_f16 v5.l, m0, 0.5, m0        ; encoding: [0x05,0x00,0x2b,0xd6,0x7d,0xe0,0xf5,0x01]
+# W64-FAKE16: v_min3_num_f16 v5, m0, 0.5, m0          ; encoding: [0x05,0x00,0x2b,0xd6,0x7d,0xe0,0xf5,0x01]
 
 0x05,0x01,0x2b,0xd6,0x7e,0x82,0xad,0x01
-# GFX12: v_min3_num_f16 v5, |exec_lo|, -1, vcc_hi ; encoding: [0x05,0x01,0x2b,0xd6,0x7e,0x82,0xad,0x01]
+# W32-REAL16: v_min3_num_f16 v5.l, |exec_lo|, -1, vcc_hi ; encoding: [0x05,0x01,0x2b,0xd6,0x7e,0x82,0xad,0x01]
+# W32-FAKE16: v_min3_num_f16 v5, |exec_lo|, -1, vcc_hi ; encoding: [0x05,0x01,0x2b,0xd6,0x7e,0x82,0xad,0x01]
+# W64-REAL16: v_min3_num_f16 v5.l, |exec_lo|, -1, vcc_hi ; encoding: [0x05,0x01,0x2b,0xd6,0x7e,0x82,0xad,0x01]
+# W64-FAKE16: v_min3_num_f16 v5, |exec_lo|, -1, vcc_hi ; encoding: [0x05,0x01,0x2b,0xd6,0x7e,0x82,0xad,0x01]
 
 0x05,0x05,0x2b,0xd6,0x7f,0xf8,0xa8,0xa1
-# GFX12: v_min3_num_f16 v5, -|exec_hi|, null, -|vcc_lo| ; encoding: [0x05,0x05,0x2b,0xd6,0x7f,0xf8,0xa8,0xa1]
+# W32-REAL16: v_min3_num_f16 v5.l, -|exec_hi|, null, -|vcc_lo| ; encoding: [0x05,0x05,0x2b,0xd6,0x7f,0xf8,0xa8,0xa1]
+# W32-FAKE16: v_min3_num_f16 v5, -|exec_hi|, null, -|vcc_lo| ; encoding: [0x05,0x05,0x2b,0xd6,0x7f,0xf8,0xa8,0xa1]
+# W64-REAL16: v_min3_num_f16 v5.l, -|exec_hi|, null, -|vcc_lo| ; encoding: [0x05,0x05,0x2b,0xd6,0x7f,0xf8,0xa8,0xa1]
+# W64-FAKE16: v_min3_num_f16 v5, -|exec_hi|, null, -|vcc_lo| ; encoding: [0x05,0x05,0x2b,0xd6,0x7f,0xf8,0xa8,0xa1]
 
 0x05,0x7c,0x2b,0xd6,0x7c,0xfc,0xfc,0x83,0x0b,0xfe,0x00,0x00
-# GFX12: v_min3_num_f16 v5, null, exec_lo, -|0xfe0b| op_sel:[1,1,1,1] ; encoding: [0x05,0x7c,0x2b,0xd6,0x7c,0xfc,0xfc,0x83,0x0b,0xfe,0x00,0x00]
+# W32-REAL16: v_min3_num_f16 v5.h, null, exec_lo, -|0xfe0b| op_sel:[1,1,1,1] ; encoding: [0x05,0x7c,0x2b,0xd6,0x7c,0xfc,0xfc,0x83,0x0b,0xfe,0x00,0x00]
+# W32-FAKE16: v_min3_num_f16 v5, null, exec_lo, -|0xfe0b| op_sel:[1,1,1,1] ; encoding: [0x05,0x7c,0x2b,0xd6,0x7c,0xfc,0xfc,0x83,0x0b,0xfe,0x00,0x00]
+# W64-REAL16: v_min3_num_f16 v5.h, null, exec_lo, -|0xfe0b| op_sel:[1,1,1,1] ; encoding: [0x05,0x7c,0x2b,0xd6,0x7c,0xfc,0xfc,0x83,0x0b,0xfe,0x00,0x00]
+# W64-FAKE16: v_min3_num_f16 v5, null, exec_lo, -|0xfe0b| op_sel:[1,1,1,1] ; encoding: [0x05,0x7c,0x2b,0xd6,0x7c,0xfc,0xfc,0x83,0x0b,0xfe,0x00,0x00]
 
 0x05,0x0e,0x2b,0xd6,0xc1,0xfe,0xf4,0xc3
-# GFX12: v_min3_num_f16 v5, -1, -|exec_hi|, -|src_scc| op_sel:[1,0,0,0] ; encoding: [0x05,0x0e,0x2b,0xd6,0xc1,0xfe,0xf4,0xc3]
+# W32-REAL16: v_min3_num_f16 v5.l, -1, -|exec_hi|, -|src_scc| op_sel:[1,0,0,0] ; encoding: [0x05,0x0e,0x2b,0xd6,0xc1,0xfe,0xf4,0xc3]
+# W32-FAKE16: v_min3_num_f16 v5, -1, -|exec_hi|, -|src_scc| op_sel:[1,0,0,0] ; encoding: [0x05,0x0e,0x2b,0xd6,0xc1,0xfe,0xf4,0xc3]
+# W64-REAL16: v_min3_num_f16 v5.l, -1, -|exec_hi|, -|src_scc| op_sel:[1,0,0,0] ; encoding: [0x05,0x0e,0x2b,0xd6,0xc1,0xfe,0xf4,0xc3]
+# W64-FAKE16: v_min3_num_f16 v5, -1, -|exec_hi|, -|src_scc| op_sel:[1,0,0,0] ; encoding: [0x05,0x0e,0x2b,0xd6,0xc1,0xfe,0xf4,0xc3]
 
 0x05,0x10,0x2b,0xd6,0xf0,0xfa,0xc0,0x43
-# GFX12: v_min3_num_f16 v5, 0.5, -m0, 0.5 op_sel:[0,1,0,0] ; encoding: [0x05,0x10,0x2b,0xd6,0xf0,0xfa,0xc0,0x43]
+# W32-REAL16: v_min3_num_f16 v5.l, 0.5, -m0, 0.5 op_sel:[0,1,0,0] ; encoding: [0x05,0x10,0x2b,0xd6,0xf0,0xfa,0xc0,0x43]
+# W32-FAKE16: v_min3_num_f16 v5, 0.5, -m0, 0.5 op_sel:[0,1,0,0] ; encoding: [0x05,0x10,0x2b,0xd6,0xf0,0xfa,0xc0,0x43]
+# W64-REAL16: v_min3_num_f16 v5.l, 0.5, -m0, 0.5 op_sel:[0,1,0,0] ; encoding: [0x05,0x10,0x2b,0xd6,0xf0,0xfa,0xc0,0x43]
+# W64-FAKE16: v_min3_num_f16 v5, 0.5, -m0, 0.5 op_sel:[0,1,0,0] ; encoding: [0x05,0x10,0x2b,0xd6,0xf0,0xfa,0xc0,0x43]
 
 0x05,0x22,0x2b,0xd6,0xfd,0xd4,0x04,0x23
-# GFX12: v_min3_num_f16 v5, -src_scc, |vcc_lo|, -1 op_sel:[0,0,1,0] ; encoding: [0x05,0x22,0x2b,0xd6,0xfd,0xd4,0x04,0x23]
+# W32-REAL16: v_min3_num_f16 v5.l, -src_scc, |vcc_lo|, -1 op_sel:[0,0,1,0] ; encoding: [0x05,0x22,0x2b,0xd6,0xfd,0xd4,0x04,0x23]
+# W32-FAKE16: v_min3_num_f16 v5, -src_scc, |vcc_lo|, -1 op_sel:[0,0,1,0] ; encoding: [0x05,0x22,0x2b,0xd6,0xfd,0xd4,0x04,0x23]
+# W64-REAL16: v_min3_num_f16 v5.l, -src_scc, |vcc_lo|, -1 op_sel:[0,0,1,0] ; encoding: [0x05,0x22,0x2b,0xd6,0xfd,0xd4,0x04,0x23]
+# W64-FAKE16: v_min3_num_f16 v5, -src_scc, |vcc_lo|, -1 op_sel:[0,0,1,0] ; encoding: [0x05,0x22,0x2b,0xd6,0xfd,0xd4,0x04,0x23]
 
 0xff,0xc3,0x2b,0xd6,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00
-# GFX12: v_min3_num_f16 v255, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp ; encoding: [0xff,0xc3,0x2b,0xd6,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00]
+# W32-REAL16: v_min3_num_f16 v255.h, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp ; encoding: [0xff,0xc3,0x2b,0xd6,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00]
+# W32-FAKE16: v_min3_num_f16 v255, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp ; encoding: [0xff,0xc3,0x2b,0xd6,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00]
+# W64-REAL16: v_min3_num_f16 v255.h, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp ; encoding: [0xff,0xc3,0x2b,0xd6,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00]
+# W64-FAKE16: v_min3_num_f16 v255, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp ; encoding: [0xff,0xc3,0x2b,0xd6,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00]
+
+0x05,0x20,0x2b,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00
+# W32-REAL16: v_min3_num_f16 v5.l, vcc_hi, 0xfe0b, v255.h op_sel:[0,0,1,0] ; encoding: [0x05,0x20,0x2b,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
+# W32-FAKE16: v_min3_num_f16 v5, vcc_hi, 0xfe0b, v255 op_sel:[0,0,1,0] ; encoding: [0x05,0x20,0x2b,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
+# W64-REAL16: v_min3_num_f16 v5.l, vcc_hi, 0xfe0b, v255.h op_sel:[0,0,1,0] ; encoding: [0x05,0x20,0x2b,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
+# W64-FAKE16: v_min3_num_f16 v5, vcc_hi, 0xfe0b, v255 op_sel:[0,0,1,0] ; encoding: [0x05,0x20,0x2b,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
 
 0x05,0x00,0x29,0xd6,0x01,0x05,0x0e,0x00
 # GFX12: v_min3_num_f32 v5, v1, v2, s3           ; encoding: [0x05,0x00,0x29,0xd6,0x01,0x05,0x0e,0x00]
@@ -5522,49 +5764,120 @@
 # W64-FAKE16: v_min_u16 v255, 0xfe0b, vcc_hi          ; encoding: [0xff,0x00,0x0b,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
 
 0x05,0x00,0x6a,0xd6,0x01,0x05,0x0e,0x00
-# GFX12: v_minmax_num_f16 v5, v1, v2, s3         ; encoding: [0x05,0x00,0x6a,0xd6,0x01,0x05,0x0e,0x00]
+# W32-REAL16: v_minmax_num_f16 v5.l, v1.l, v2.l, s3   ; encoding: [0x05,0x00,0x6a,0xd6,0x01,0x05,0x0e,0x00]
+# W32-FAKE16: v_minmax_num_f16 v5, v1, v2, s3         ; encoding: [0x05,0x00,0x6a,0xd6,0x01,0x05,0x0e,0x00]
+# W64-REAL16: v_minmax_num_f16 v5.l, v1.l, v2.l, s3   ; encoding: [0x05,0x00,0x6a,0xd6,0x01,0x05,0x0e,0x00]
+# W64-FAKE16: v_minmax_num_f16 v5, v1, v2, s3         ; encoding: [0x05,0x00,0x6a,0xd6,0x01,0x05,0x0e,0x00]
 
 0x05,0x00,0x6a,0xd6,0xff,0x05,0xa4,0x01
-# GFX12: v_minmax_num_f16 v5, v255, s2, s105     ; encoding: [0x05,0x00,0x6a,0xd6,0xff,0x05,0xa4,0x01]
+# W32-REAL16: v_minmax_num_f16 v5.l, v255.l, s2, s105 ; encoding: [0x05,0x00,0x6a,0xd6,0xff,0x05,0xa4,0x01]
+# W32-FAKE16: v_minmax_num_f16 v5, v255, s2, s105     ; encoding: [0x05,0x00,0x6a,0xd6,0xff,0x05,0xa4,0x01]
+# W64-REAL16: v_minmax_num_f16 v5.l, v255.l, s2, s105 ; encoding: [0x05,0x00,0x6a,0xd6,0xff,0x05,0xa4,0x01]
+# W64-FAKE16: v_minmax_num_f16 v5, v255, s2, s105     ; encoding: [0x05,0x00,0x6a,0xd6,0xff,0x05,0xa4,0x01]
 
 0x05,0x00,0x6a,0xd6,0x01,0xfe,0xff,0x01
-# GFX12: v_minmax_num_f16 v5, s1, v255, exec_hi  ; encoding: [0x05,0x00,0x6a,0xd6,0x01,0xfe,0xff,0x01]
+# W32-REAL16: v_minmax_num_f16 v5.l, s1, v255.l, exec_hi ; encoding: [0x05,0x00,0x6a,0xd6,0x01,0xfe,0xff,0x01]
+# W32-FAKE16: v_minmax_num_f16 v5, s1, v255, exec_hi  ; encoding: [0x05,0x00,0x6a,0xd6,0x01,0xfe,0xff,0x01]
+# W64-REAL16: v_minmax_num_f16 v5.l, s1, v255.l, exec_hi ; encoding: [0x05,0x00,0x6a,0xd6,0x01,0xfe,0xff,0x01]
+# W64-FAKE16: v_minmax_num_f16 v5, s1, v255, exec_hi  ; encoding: [0x05,0x00,0x6a,0xd6,0x01,0xfe,0xff,0x01]
 
 0x05,0x00,0x6a,0xd6,0x69,0xd2,0xf8,0x01
-# GFX12: v_minmax_num_f16 v5, s105, s105, exec_lo ; encoding: [0x05,0x00,0x6a,0xd6,0x69,0xd2,0xf8,0x01]
+# W32-REAL16: v_minmax_num_f16 v5.l, s105, s105, exec_lo ; encoding: [0x05,0x00,0x6a,0xd6,0x69,0xd2,0xf8,0x01]
+# W32-FAKE16: v_minmax_num_f16 v5, s105, s105, exec_lo ; encoding: [0x05,0x00,0x6a,0xd6,0x69,0xd2,0xf8,0x01]
+# W64-REAL16: v_minmax_num_f16 v5.l, s105, s105, exec_lo ; encoding: [0x05,0x00,0x6a,0xd6,0x69,0xd2,0xf8,0x01]
+# W64-FAKE16: v_minmax_num_f16 v5, s105, s105, exec_lo ; encoding: [0x05,0x00,0x6a,0xd6,0x69,0xd2,0xf8,0x01]
 
 0x05,0x00,0x6a,0xd6,0x6a,0xf6,0x0c,0x04
-# GFX12: v_minmax_num_f16 v5, vcc_lo, ttmp15, v3 ; encoding: [0x05,0x00,0x6a,0xd6,0x6a,0xf6,0x0c,0x04]
+# W32-REAL16: v_minmax_num_f16 v5.l, vcc_lo, ttmp15, v3.l ; encoding: [0x05,0x00,0x6a,0xd6,0x6a,0xf6,0x0c,0x04]
+# W32-FAKE16: v_minmax_num_f16 v5, vcc_lo, ttmp15, v3 ; encoding: [0x05,0x00,0x6a,0xd6,0x6a,0xf6,0x0c,0x04]
+# W64-REAL16: v_minmax_num_f16 v5.l, vcc_lo, ttmp15, v3.l ; encoding: [0x05,0x00,0x6a,0xd6,0x6a,0xf6,0x0c,0x04]
+# W64-FAKE16: v_minmax_num_f16 v5, vcc_lo, ttmp15, v3 ; encoding: [0x05,0x00,0x6a,0xd6,0x6a,0xf6,0x0c,0x04]
 
 0x05,0x00,0x6a,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00
-# GFX12: v_minmax_num_f16 v5, vcc_hi, 0xfe0b, v255 ; encoding: [0x05,0x00,0x6a,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
+# W32-REAL16: v_minmax_num_f16 v5.l, vcc_hi, 0xfe0b, v255.l ; encoding: [0x05,0x00,0x6a,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
+# W32-FAKE16: v_minmax_num_f16 v5, vcc_hi, 0xfe0b, v255 ; encoding: [0x05,0x00,0x6a,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
+# W64-REAL16: v_minmax_num_f16 v5.l, vcc_hi, 0xfe0b, v255.l ; encoding: [0x05,0x00,0x6a,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
+# W64-FAKE16: v_minmax_num_f16 v5, vcc_hi, 0xfe0b, v255 ; encoding: [0x05,0x00,0x6a,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
 
 0x05,0x07,0x6a,0xd6,0x7b,0xfa,0xed,0xe1
-# GFX12: v_minmax_num_f16 v5, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x6a,0xd6,0x7b,0xfa,0xed,0xe1]
+# W32-REAL16: v_minmax_num_f16 v5.l, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x6a,0xd6,0x7b,0xfa,0xed,0xe1]
+# W32-FAKE16: v_minmax_num_f16 v5, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x6a,0xd6,0x7b,0xfa,0xed,0xe1]
+# W64-REAL16: v_minmax_num_f16 v5.l, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x6a,0xd6,0x7b,0xfa,0xed,0xe1]
+# W64-FAKE16: v_minmax_num_f16 v5, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x6a,0xd6,0x7b,0xfa,0xed,0xe1]
 
 0x05,0x00,0x6a,0xd6,0x7d,0xe0,0xf5,0x01
-# GFX12: v_minmax_num_f16 v5, m0, 0.5, m0        ; encoding: [0x05,0x00,0x6a,0xd6,0x7d,0xe0,0xf5,0x01]
+# W32-REAL16: v_minmax_num_f16 v5.l, m0, 0.5, m0      ; encoding: [0x05,0x00,0x6a,0xd6,0x7d,0xe0,0xf5,0x01]
+# W32-FAKE16: v_minmax_num_f16 v5, m0, 0.5, m0        ; encoding: [0x05,0x00,0x6a,0xd6,0x7d,0xe0,0xf5,0x01]
+# W64-REAL16: v_minmax_num_f16 v5.l, m0, 0.5, m0      ; encoding: [0x05,0x00,0x6a,0xd6,0x7d,0xe0,0xf5,0x01]
+# W64-FAKE16: v_minmax_num_f16 v5, m0, 0.5, m0        ; encoding: [0x05,0x00,0x6a,0xd6,0x7d,0xe0,0xf5,0x01]
 
 0x05,0x01,0x6a,0xd6,0x7e,0x82,0xad,0x01
-# GFX12: v_minmax_num_f16 v5, |exec_lo|, -1, vcc_hi ; encoding: [0x05,0x01,0x6a,0xd6,0x7e,0x82,0xad,0x01]
+# W32-REAL16: v_minmax_num_f16 v5.l, |exec_lo|, -1, vcc_hi ; encoding: [0x05,0x01,0x6a,0xd6,0x7e,0x82,0xad,0x01]
+# W32-FAKE16: v_minmax_num_f16 v5, |exec_lo|, -1, vcc_hi ; encoding: [0x05,0x01,0x6a,0xd6,0x7e,0x82,0xad,0x01]
+# W64-REAL16: v_minmax_num_f16 v5.l, |exec_lo|, -1, vcc_hi ; encoding: [0x05,0x01,0x6a,0xd6,0x7e,0x82,0xad,0x01]
+# W64-FAKE16: v_minmax_num_f16 v5, |exec_lo|, -1, vcc_hi ; encoding: [0x05,0x01,0x6a,0xd6,0x7e,0x82,0xad,0x01]
 
 0x05,0x05,0x6a,0xd6,0x7f,0xf8,0xa8,0xa1
-# GFX12: v_minmax_num_f16 v5, -|exec_hi|, null, -|vcc_lo| ; encoding: [0x05,0x05,0x6a,0xd6,0x7f,0xf8,0xa8,0xa1]
+# W32-REAL16: v_minmax_num_f16 v5.l, -|exec_hi|, null, -|vcc_lo| ; encoding: [0x05,0x05,0x6a,0xd6,0x7f,0xf8,0xa8,0xa1]
+# W32-FAKE16: v_minmax_num_f16 v5, -|exec_hi|, null, -|vcc_lo| ; encoding: [0x05,0x05,0x6a,0xd6,0x7f,0xf8,0xa8,0xa1]
+# W64-REAL16: v_minmax_num_f16 v5.l, -|exec_hi|, null, -|vcc_lo| ; encoding: [0x05,0x05,0x6a,0xd6,0x7f,0xf8,0xa8,0xa1]
+# W64-FAKE16: v_minmax_num_f16 v5, -|exec_hi|, null, -|vcc_lo| ; encoding: [0x05,0x05,0x6a,0xd6,0x7f,0xf8,0xa8,0xa1]
 
 0x05,0x04,0x6a,0xd6,0x7c,0xfc,0xfc,0x83,0x0b,0xfe,0x00,0x00
-# GFX12: v_minmax_num_f16 v5, null, exec_lo, -|0xfe0b| ; encoding: [0x05,0x04,0x6a,0xd6,0x7c,0xfc,0xfc,0x83,0x0b,0xfe,0x00,0x00]
+# W32-REAL16: v_minmax_num_f16 v5.l, null, exec_lo, -|0xfe0b| ; encoding: [0x05,0x04,0x6a,0xd6,0x7c,0xfc,0xfc,0x83,0x0b,0xfe,0x00,0x00]
+# W32-FAKE16: v_minmax_num_f16 v5, null, exec_lo, -|0xfe0b| ; encoding: [0x05,0x04,0x6a,0xd6,0x7c,0xfc,0xfc,0x83,0x0b,0xfe,0x00,0x00]
+# W64-REAL16: v_minmax_num_f16 v5.l, null, exec_lo, -|0xfe0b| ; encoding: [0x05,0x04,0x6a,0xd6,0x7c,0xfc,0xfc,0x83,0x0b,0xfe,0x00,0x00]
+# W64-FAKE16: v_minmax_num_f16 v5, null, exec_lo, -|0xfe0b| ; encoding: [0x05,0x04,0x6a,0xd6,0x7c,0xfc,0xfc,0x83,0x0b,0xfe,0x00,0x00]
 
 0x05,0x06,0x6a,0xd6,0xc1,0xfe,0xf4,0xc3
-# GFX12: v_minmax_num_f16 v5, -1, -|exec_hi|, -|src_scc| ; encoding: [0x05,0x06,0x6a,0xd6,0xc1,0xfe,0xf4,0xc3]
+# W32-REAL16: v_minmax_num_f16 v5.l, -1, -|exec_hi|, -|src_scc| ; encoding: [0x05,0x06,0x6a,0xd6,0xc1,0xfe,0xf4,0xc3]
+# W32-FAKE16: v_minmax_num_f16 v5, -1, -|exec_hi|, -|src_scc| ; encoding: [0x05,0x06,0x6a,0xd6,0xc1,0xfe,0xf4,0xc3]
+# W64-REAL16: v_minmax_num_f16 v5.l, -1, -|exec_hi|, -|src_scc| ; encoding: [0x05,0x06,0x6a,0xd6,0xc1,0xfe,0xf4,0xc3]
+# W64-FAKE16: v_minmax_num_f16 v5, -1, -|exec_hi|, -|src_scc| ; encoding: [0x05,0x06,0x6a,0xd6,0xc1,0xfe,0xf4,0xc3]
 
 0x05,0x00,0x6a,0xd6,0xf0,0xfa,0xc0,0x4b
-# GFX12: v_minmax_num_f16 v5, 0.5, -m0, 0.5 mul:2 ; encoding: [0x05,0x00,0x6a,0xd6,0xf0,0xfa,0xc0,0x4b]
+# W32-REAL16: v_minmax_num_f16 v5.l, 0.5, -m0, 0.5 mul:2 ; encoding: [0x05,0x00,0x6a,0xd6,0xf0,0xfa,0xc0,0x4b]
+# W32-FAKE16: v_minmax_num_f16 v5, 0.5, -m0, 0.5 mul:2 ; encoding: [0x05,0x00,0x6a,0xd6,0xf0,0xfa,0xc0,0x4b]
+# W64-REAL16: v_minmax_num_f16 v5.l, 0.5, -m0, 0.5 mul:2 ; encoding: [0x05,0x00,0x6a,0xd6,0xf0,0xfa,0xc0,0x4b]
+# W64-FAKE16: v_minmax_num_f16 v5, 0.5, -m0, 0.5 mul:2 ; encoding: [0x05,0x00,0x6a,0xd6,0xf0,0xfa,0xc0,0x4b]
 
 0x05,0x02,0x6a,0xd6,0xfd,0xd4,0x04,0x33
-# GFX12: v_minmax_num_f16 v5, -src_scc, |vcc_lo|, -1 mul:4 ; encoding: [0x05,0x02,0x6a,0xd6,0xfd,0xd4,0x04,0x33]
+# W32-REAL16: v_minmax_num_f16 v5.l, -src_scc, |vcc_lo|, -1 mul:4 ; encoding: [0x05,0x02,0x6a,0xd6,0xfd,0xd4,0x04,0x33]
+# W32-FAKE16: v_minmax_num_f16 v5, -src_scc, |vcc_lo|, -1 mul:4 ; encoding: [0x05,0x02,0x6a,0xd6,0xfd,0xd4,0x04,0x33]
+# W64-REAL16: v_minmax_num_f16 v5.l, -src_scc, |vcc_lo|, -1 mul:4 ; encoding: [0x05,0x02,0x6a,0xd6,0xfd,0xd4,0x04,0x33]
+# W64-FAKE16: v_minmax_num_f16 v5, -src_scc, |vcc_lo|, -1 mul:4 ; encoding: [0x05,0x02,0x6a,0xd6,0xfd,0xd4,0x04,0x33]
 
 0xff,0x83,0x6a,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00
-# GFX12: v_minmax_num_f16 v255, -|0xfe0b|, -|vcc_hi|, null clamp div:2 ; encoding: [0xff,0x83,0x6a,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00]
+# W32-REAL16: v_minmax_num_f16 v255.l, -|0xfe0b|, -|vcc_hi|, null clamp div:2 ; encoding: [0xff,0x83,0x6a,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00]
+# W32-FAKE16: v_minmax_num_f16 v255, -|0xfe0b|, -|vcc_hi|, null clamp div:2 ; encoding: [0xff,0x83,0x6a,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00]
+# W64-REAL16: v_minmax_num_f16 v255.l, -|0xfe0b|, -|vcc_hi|, null clamp div:2 ; encoding: [0xff,0x83,0x6a,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00]
+# W64-FAKE16: v_minmax_num_f16 v255, -|0xfe0b|, -|vcc_hi|, null clamp div:2 ; encoding: [0xff,0x83,0x6a,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00]
+
+0x05,0x08,0x6a,0xd6,0xff,0x05,0xa4,0x01
+# W32-REAL16: v_minmax_num_f16 v5.l, v255.h, s2, s105 op_sel:[1,0,0,0] ; encoding: [0x05,0x08,0x6a,0xd6,0xff,0x05,0xa4,0x01]
+# W32-FAKE16: v_minmax_num_f16 v5, v255, s2, s105 op_sel:[1,0,0,0] ; encoding: [0x05,0x08,0x6a,0xd6,0xff,0x05,0xa4,0x01]
+# W64-REAL16: v_minmax_num_f16 v5.l, v255.h, s2, s105 op_sel:[1,0,0,0] ; encoding: [0x05,0x08,0x6a,0xd6,0xff,0x05,0xa4,0x01]
+# W64-FAKE16: v_minmax_num_f16 v5, v255, s2, s105 op_sel:[1,0,0,0] ; encoding: [0x05,0x08,0x6a,0xd6,0xff,0x05,0xa4,0x01]
+
+
+0x05,0x10,0x6a,0xd6,0x01,0xfe,0xff,0x01
+# W32-REAL16: v_minmax_num_f16 v5.l, s1, v255.h, exec_hi op_sel:[0,1,0,0] ; encoding: [0x05,0x10,0x6a,0xd6,0x01,0xfe,0xff,0x01]
+# W32-FAKE16: v_minmax_num_f16 v5, s1, v255, exec_hi op_sel:[0,1,0,0] ; encoding: [0x05,0x10,0x6a,0xd6,0x01,0xfe,0xff,0x01]
+# W64-REAL16: v_minmax_num_f16 v5.l, s1, v255.h, exec_hi op_sel:[0,1,0,0] ; encoding: [0x05,0x10,0x6a,0xd6,0x01,0xfe,0xff,0x01]
+# W64-FAKE16: v_minmax_num_f16 v5, s1, v255, exec_hi op_sel:[0,1,0,0] ; encoding: [0x05,0x10,0x6a,0xd6,0x01,0xfe,0xff,0x01]
+
+
+0x05,0x20,0x6a,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00
+# W32-REAL16: v_minmax_num_f16 v5.l, vcc_hi, 0xfe0b, v255.h op_sel:[0,0,1,0] ; encoding: [0x05,0x20,0x6a,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
+# W32-FAKE16: v_minmax_num_f16 v5, vcc_hi, 0xfe0b, v255 op_sel:[0,0,1,0] ; encoding: [0x05,0x20,0x6a,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
+# W64-REAL16: v_minmax_num_f16 v5.l, vcc_hi, 0xfe0b, v255.h op_sel:[0,0,1,0] ; encoding: [0x05,0x20,0x6a,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
+# W64-FAKE16: v_minmax_num_f16 v5, vcc_hi, 0xfe0b, v255 op_sel:[0,0,1,0] ; encoding: [0x05,0x20,0x6a,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
+
+0xff,0xc3,0x6a,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00
+# W32-REAL16: v_minmax_num_f16 v255.h, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp div:2 ; encoding: [0xff,0xc3,0x6a,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00]
+# W32-FAKE16: v_minmax_num_f16 v255, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp div:2 ; encoding: [0xff,0xc3,0x6a,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00]
+# W64-REAL16: v_minmax_num_f16 v255.h, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp div:2 ; encoding: [0xff,0xc3,0x6a,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00]
+# W64-FAKE16: v_minmax_num_f16 v255, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp div:2 ; encoding: [0xff,0xc3,0x6a,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00]
 
 0x05,0x00,0x68,0xd6,0x01,0x05,0x0e,0x00
 # GFX12: v_minmax_num_f32 v5, v1, v2, s3         ; encoding: [0x05,0x00,0x68,0xd6,0x01,0x05,0x0e,0x00]
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_dpp16.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_dpp16.txt
index c64fe39..7e30a4a 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_dpp16.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_dpp16.txt
@@ -2329,52 +2329,131 @@
 # W64-FAKE16: v_max_u16_e64_dpp v255, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x00,0x09,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
 
 0x05,0x00,0x6b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff
-# GFX12: v_maxmin_num_f16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+# W32-REAL16: v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+# W32-FAKE16: v_maxmin_num_f16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+# W64-REAL16: v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+# W64-FAKE16: v_maxmin_num_f16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
 
 0x05,0x00,0x6b,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff
-# GFX12: v_maxmin_num_f16_e64_dpp v5, v1, s3, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff]
+# W32-REAL16: v_maxmin_num_f16_e64_dpp v5.l, v1.l, s3, v3.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff]
+# W32-FAKE16: v_maxmin_num_f16_e64_dpp v5, v1, s3, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff]
+# W64-REAL16: v_maxmin_num_f16_e64_dpp v5.l, v1.l, s3, v3.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff]
+# W64-FAKE16: v_maxmin_num_f16_e64_dpp v5, v1, s3, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff]
 
 0x05,0x00,0x6b,0xd6,0xfa,0xea,0x0d,0x04,0x01,0x1b,0x00,0xff
-# GFX12: v_maxmin_num_f16_e64_dpp v5, v1, -2.0, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0xea,0x0d,0x04,0x01,0x1b,0x00,0xff]
+# W32-REAL16: v_maxmin_num_f16_e64_dpp v5.l, v1.l, -2.0, v3.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0xea,0x0d,0x04,0x01,0x1b,0x00,0xff]
+# W32-FAKE16: v_maxmin_num_f16_e64_dpp v5, v1, -2.0, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0xea,0x0d,0x04,0x01,0x1b,0x00,0xff]
+# W64-REAL16: v_maxmin_num_f16_e64_dpp v5.l, v1.l, -2.0, v3.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0xea,0x0d,0x04,0x01,0x1b,0x00,0xff]
+# W64-FAKE16: v_maxmin_num_f16_e64_dpp v5, v1, -2.0, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0xea,0x0d,0x04,0x01,0x1b,0x00,0xff]
 
 0x05,0x00,0x6b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff
-# GFX12: v_maxmin_num_f16_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
+# W32-REAL16: v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
+# W32-FAKE16: v_maxmin_num_f16_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
+# W64-REAL16: v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
+# W64-FAKE16: v_maxmin_num_f16_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
 
 0x05,0x00,0x6b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff
-# GFX12: v_maxmin_num_f16_e64_dpp v5, v1, v2, v3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff]
+# W32-REAL16: v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff]
+# W32-FAKE16: v_maxmin_num_f16_e64_dpp v5, v1, v2, v3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff]
+# W64-REAL16: v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff]
+# W64-FAKE16: v_maxmin_num_f16_e64_dpp v5, v1, v2, v3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff]
 
 0x05,0x00,0x6b,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff
-# GFX12: v_maxmin_num_f16_e64_dpp v5, v1, v2, v255 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff]
+# W32-REAL16: v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff]
+# W32-FAKE16: v_maxmin_num_f16_e64_dpp v5, v1, v2, v255 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff]
+# W64-REAL16: v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff]
+# W64-FAKE16: v_maxmin_num_f16_e64_dpp v5, v1, v2, v255 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff]
 
 0x05,0x00,0x6b,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff
-# GFX12: v_maxmin_num_f16_e64_dpp v5, v1, v2, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff]
+# W32-REAL16: v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff]
+# W32-FAKE16: v_maxmin_num_f16_e64_dpp v5, v1, v2, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff]
+# W64-REAL16: v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff]
+# W64-FAKE16: v_maxmin_num_f16_e64_dpp v5, v1, v2, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff]
 
 0x05,0x00,0x6b,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff
-# GFX12: v_maxmin_num_f16_e64_dpp v5, v1, v2, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff]
+# W32-REAL16: v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff]
+# W32-FAKE16: v_maxmin_num_f16_e64_dpp v5, v1, v2, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff]
+# W64-REAL16: v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff]
+# W64-FAKE16: v_maxmin_num_f16_e64_dpp v5, v1, v2, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff]
 
 0x05,0x00,0x6b,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff
-# GFX12: v_maxmin_num_f16_e64_dpp v5, v1, v2, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff]
+# W32-REAL16: v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff]
+# W32-FAKE16: v_maxmin_num_f16_e64_dpp v5, v1, v2, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff]
+# W64-REAL16: v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff]
+# W64-FAKE16: v_maxmin_num_f16_e64_dpp v5, v1, v2, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff]
 
 0x05,0x01,0x6b,0xd6,0xfa,0x04,0xee,0x81,0x01,0x1f,0x01,0xff
-# GFX12: v_maxmin_num_f16_e64_dpp v5, |v1|, v2, -ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x6b,0xd6,0xfa,0x04,0xee,0x81,0x01,0x1f,0x01,0xff]
+# W32-REAL16: v_maxmin_num_f16_e64_dpp v5.l, |v1.l|, v2.l, -ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x6b,0xd6,0xfa,0x04,0xee,0x81,0x01,0x1f,0x01,0xff]
+# W32-FAKE16: v_maxmin_num_f16_e64_dpp v5, |v1|, v2, -ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x6b,0xd6,0xfa,0x04,0xee,0x81,0x01,0x1f,0x01,0xff]
+# W64-REAL16: v_maxmin_num_f16_e64_dpp v5.l, |v1.l|, v2.l, -ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x6b,0xd6,0xfa,0x04,0xee,0x81,0x01,0x1f,0x01,0xff]
+# W64-FAKE16: v_maxmin_num_f16_e64_dpp v5, |v1|, v2, -ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x6b,0xd6,0xfa,0x04,0xee,0x81,0x01,0x1f,0x01,0xff]
 
 0x05,0x02,0x6b,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff
-# GFX12: v_maxmin_num_f16_e64_dpp v5, v1, -|v2|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x6b,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff]
+# W32-REAL16: v_maxmin_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x6b,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff]
+# W32-FAKE16: v_maxmin_num_f16_e64_dpp v5, v1, -|v2|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x6b,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff]
+# W64-REAL16: v_maxmin_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x6b,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff]
+# W64-FAKE16: v_maxmin_num_f16_e64_dpp v5, v1, -|v2|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x6b,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff]
 
 0x05,0x04,0x6b,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff
-# GFX12: v_maxmin_num_f16_e64_dpp v5, -v1, v2, |exec_lo| row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x04,0x6b,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff]
+# W32-REAL16: v_maxmin_num_f16_e64_dpp v5.l, -v1.l, v2.l, |exec_lo| row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x04,0x6b,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff]
+# W32-FAKE16: v_maxmin_num_f16_e64_dpp v5, -v1, v2, |exec_lo| row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x04,0x6b,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff]
+# W64-REAL16: v_maxmin_num_f16_e64_dpp v5.l, -v1.l, v2.l, |exec_lo| row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x04,0x6b,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff]
+# W64-FAKE16: v_maxmin_num_f16_e64_dpp v5, -v1, v2, |exec_lo| row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x04,0x6b,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff]
 
 0x05,0x03,0x6b,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff
-# GFX12: v_maxmin_num_f16_e64_dpp v5, -|v1|, -|v2|, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x03,0x6b,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff]
+# W32-REAL16: v_maxmin_num_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x03,0x6b,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff]
+# W32-FAKE16: v_maxmin_num_f16_e64_dpp v5, -|v1|, -|v2|, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x03,0x6b,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff]
+# W64-REAL16: v_maxmin_num_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x03,0x6b,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff]
+# W64-FAKE16: v_maxmin_num_f16_e64_dpp v5, -|v1|, -|v2|, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x03,0x6b,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff]
 
 0x05,0x05,0x6b,0xd6,0xfa,0x04,0x06,0xab,0x01,0x5f,0x01,0x01
-# GFX12: v_maxmin_num_f16_e64_dpp v5, -|v1|, v2, -|-1| mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x05,0x6b,0xd6,0xfa,0x04,0x06,0xab,0x01,0x5f,0x01,0x01]
+# W32-REAL16: v_maxmin_num_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|-1| mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x05,0x6b,0xd6,0xfa,0x04,0x06,0xab,0x01,0x5f,0x01,0x01]
+# W32-FAKE16: v_maxmin_num_f16_e64_dpp v5, -|v1|, v2, -|-1| mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x05,0x6b,0xd6,0xfa,0x04,0x06,0xab,0x01,0x5f,0x01,0x01]
+# W64-REAL16: v_maxmin_num_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|-1| mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x05,0x6b,0xd6,0xfa,0x04,0x06,0xab,0x01,0x5f,0x01,0x01]
+# W64-FAKE16: v_maxmin_num_f16_e64_dpp v5, -|v1|, v2, -|-1| mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x05,0x6b,0xd6,0xfa,0x04,0x06,0xab,0x01,0x5f,0x01,0x01]
 
 0x05,0x06,0x6b,0xd6,0xfa,0x04,0xc2,0xd3,0x01,0x60,0x01,0x13
-# GFX12: v_maxmin_num_f16_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x06,0x6b,0xd6,0xfa,0x04,0xc2,0xd3,0x01,0x60,0x01,0x13]
+# W32-REAL16: v_maxmin_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x06,0x6b,0xd6,0xfa,0x04,0xc2,0xd3,0x01,0x60,0x01,0x13]
+# W32-FAKE16: v_maxmin_num_f16_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x06,0x6b,0xd6,0xfa,0x04,0xc2,0xd3,0x01,0x60,0x01,0x13]
+# W64-REAL16: v_maxmin_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x06,0x6b,0xd6,0xfa,0x04,0xc2,0xd3,0x01,0x60,0x01,0x13]
+# W64-FAKE16: v_maxmin_num_f16_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x06,0x6b,0xd6,0xfa,0x04,0xc2,0xd3,0x01,0x60,0x01,0x13]
 
 0xff,0x87,0x6b,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x0d,0x30
-# GFX12: v_maxmin_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x87,0x6b,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x0d,0x30]
+# W32-REAL16: v_maxmin_num_f16_e64_dpp v255.l, -|v255.l|, -|v255.l|, -|src_scc| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x87,0x6b,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x0d,0x30]
+# W32-FAKE16: v_maxmin_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x87,0x6b,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x0d,0x30]
+# W64-REAL16: v_maxmin_num_f16_e64_dpp v255.l, -|v255.l|, -|v255.l|, -|src_scc| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x87,0x6b,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x0d,0x30]
+# W64-FAKE16: v_maxmin_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x87,0x6b,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x0d,0x30]
+
+0x05,0x78,0x6b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff
+# W32-REAL16: v_maxmin_num_f16_e64_dpp v5.h, v1.h, v2.h, v3.h op_sel:[1,1,1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x78,0x6b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+# W32-FAKE16: v_maxmin_num_f16_e64_dpp v5, v1, v2, v3 op_sel:[1,1,1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x78,0x6b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+# W64-REAL16: v_maxmin_num_f16_e64_dpp v5.h, v1.h, v2.h, v3.h op_sel:[1,1,1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x78,0x6b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+# W64-FAKE16: v_maxmin_num_f16_e64_dpp v5, v1, v2, v3 op_sel:[1,1,1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x78,0x6b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+
+
+0x05,0x20,0x6b,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff
+# W32-REAL16: v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.h op_sel:[0,0,1,0] quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x20,0x6b,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff]
+# W32-FAKE16: v_maxmin_num_f16_e64_dpp v5, v1, v2, v255 op_sel:[0,0,1,0] quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x20,0x6b,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff]
+# W64-REAL16: v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.h op_sel:[0,0,1,0] quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x20,0x6b,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff]
+# W64-FAKE16: v_maxmin_num_f16_e64_dpp v5, v1, v2, v255 op_sel:[0,0,1,0] quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x20,0x6b,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff]
+
+0x05,0x0a,0x6b,0xd6,0xfa,0x04,0x06,0x2b,0x01,0x5f,0x01,0x01
+# W32-REAL16: v_maxmin_num_f16_e64_dpp v5.l, -v1.h, |v2.l|, -1 op_sel:[1,0,0,0] mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x0a,0x6b,0xd6,0xfa,0x04,0x06,0x2b,0x01,0x5f,0x01,0x01]
+# W32-FAKE16: v_maxmin_num_f16_e64_dpp v5, -v1, |v2|, -1 op_sel:[1,0,0,0] mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x0a,0x6b,0xd6,0xfa,0x04,0x06,0x2b,0x01,0x5f,0x01,0x01]
+# W64-REAL16: v_maxmin_num_f16_e64_dpp v5.l, -v1.h, |v2.l|, -1 op_sel:[1,0,0,0] mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x0a,0x6b,0xd6,0xfa,0x04,0x06,0x2b,0x01,0x5f,0x01,0x01]
+# W64-FAKE16: v_maxmin_num_f16_e64_dpp v5, -v1, |v2|, -1 op_sel:[1,0,0,0] mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x0a,0x6b,0xd6,0xfa,0x04,0x06,0x2b,0x01,0x5f,0x01,0x01]
+
+0x05,0x13,0x6b,0xd6,0xfa,0x04,0xc2,0x73,0x01,0x60,0x01,0x13
+# W32-REAL16: v_maxmin_num_f16_e64_dpp v5.l, -|v1.l|, -|v2.h|, 0.5 op_sel:[0,1,0,0] mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x13,0x6b,0xd6,0xfa,0x04,0xc2,0x73,0x01,0x60,0x01,0x13]
+# W32-FAKE16: v_maxmin_num_f16_e64_dpp v5, -|v1|, -|v2|, 0.5 op_sel:[0,1,0,0] mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x13,0x6b,0xd6,0xfa,0x04,0xc2,0x73,0x01,0x60,0x01,0x13]
+# W64-REAL16: v_maxmin_num_f16_e64_dpp v5.l, -|v1.l|, -|v2.h|, 0.5 op_sel:[0,1,0,0] mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x13,0x6b,0xd6,0xfa,0x04,0xc2,0x73,0x01,0x60,0x01,0x13]
+# W64-FAKE16: v_maxmin_num_f16_e64_dpp v5, -|v1|, -|v2|, 0.5 op_sel:[0,1,0,0] mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x13,0x6b,0xd6,0xfa,0x04,0xc2,0x73,0x01,0x60,0x01,0x13]
+
+0xff,0xc7,0x6b,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x0d,0x30
+# W32-REAL16: v_maxmin_num_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc7,0x6b,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x0d,0x30]
+# W32-FAKE16: v_maxmin_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc7,0x6b,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x0d,0x30]
+# W64-REAL16: v_maxmin_num_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc7,0x6b,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x0d,0x30]
+# W64-FAKE16: v_maxmin_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc7,0x6b,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x0d,0x30]
 
 0x05,0x00,0x69,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff
 # GFX12: v_maxmin_num_f32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x69,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
@@ -3082,49 +3161,125 @@
 # W64-FAKE16: v_min_u16_e64_dpp v255, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x00,0x0b,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
 
 0x05,0x00,0x6a,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff
-# GFX12: v_minmax_num_f16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+# W32-REAL16: v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+# W32-FAKE16: v_minmax_num_f16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+# W64-REAL16: v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+# W64-FAKE16: v_minmax_num_f16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
 
 0x05,0x00,0x6a,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff
-# GFX12: v_minmax_num_f16_e64_dpp v5, v1, s3, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff]
+# W32-REAL16: v_minmax_num_f16_e64_dpp v5.l, v1.l, s3, v3.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff]
+# W32-FAKE16: v_minmax_num_f16_e64_dpp v5, v1, s3, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff]
+# W64-REAL16: v_minmax_num_f16_e64_dpp v5.l, v1.l, s3, v3.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff]
+# W64-FAKE16: v_minmax_num_f16_e64_dpp v5, v1, s3, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff]
 
 0x05,0x00,0x6a,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff
-# GFX12: v_minmax_num_f16_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
+# W32-REAL16: v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
+# W32-FAKE16: v_minmax_num_f16_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
+# W64-REAL16: v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
+# W64-FAKE16: v_minmax_num_f16_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
 
 0x05,0x00,0x6a,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff
-# GFX12: v_minmax_num_f16_e64_dpp v5, v1, v2, v3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff]
+# W32-REAL16: v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff]
+# W32-FAKE16: v_minmax_num_f16_e64_dpp v5, v1, v2, v3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff]
+# W64-REAL16: v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff]
+# W64-FAKE16: v_minmax_num_f16_e64_dpp v5, v1, v2, v3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff]
 
 0x05,0x00,0x6a,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff
-# GFX12: v_minmax_num_f16_e64_dpp v5, v1, v2, v255 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff]
+# W32-REAL16: v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff]
+# W32-FAKE16: v_minmax_num_f16_e64_dpp v5, v1, v2, v255 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff]
+# W64-REAL16: v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff]
+# W64-FAKE16: v_minmax_num_f16_e64_dpp v5, v1, v2, v255 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff]
 
 0x05,0x00,0x6a,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff
-# GFX12: v_minmax_num_f16_e64_dpp v5, v1, v2, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff]
+# W32-REAL16: v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff]
+# W32-FAKE16: v_minmax_num_f16_e64_dpp v5, v1, v2, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff]
+# W64-REAL16: v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff]
+# W64-FAKE16: v_minmax_num_f16_e64_dpp v5, v1, v2, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff]
 
 0x05,0x00,0x6a,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff
-# GFX12: v_minmax_num_f16_e64_dpp v5, v1, v2, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff]
+# W32-REAL16: v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff]
+# W32-FAKE16: v_minmax_num_f16_e64_dpp v5, v1, v2, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff]
+# W64-REAL16: v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff]
+# W64-FAKE16: v_minmax_num_f16_e64_dpp v5, v1, v2, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff]
 
 0x05,0x00,0x6a,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff
-# GFX12: v_minmax_num_f16_e64_dpp v5, v1, v2, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff]
+# W32-REAL16: v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff]
+# W32-FAKE16: v_minmax_num_f16_e64_dpp v5, v1, v2, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff]
+# W64-REAL16: v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff]
+# W64-FAKE16: v_minmax_num_f16_e64_dpp v5, v1, v2, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff]
 
 0x05,0x01,0x6a,0xd6,0xfa,0x04,0xee,0x81,0x01,0x1f,0x01,0xff
-# GFX12: v_minmax_num_f16_e64_dpp v5, |v1|, v2, -ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x6a,0xd6,0xfa,0x04,0xee,0x81,0x01,0x1f,0x01,0xff]
+# W32-REAL16: v_minmax_num_f16_e64_dpp v5.l, |v1.l|, v2.l, -ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x6a,0xd6,0xfa,0x04,0xee,0x81,0x01,0x1f,0x01,0xff]
+# W32-FAKE16: v_minmax_num_f16_e64_dpp v5, |v1|, v2, -ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x6a,0xd6,0xfa,0x04,0xee,0x81,0x01,0x1f,0x01,0xff]
+# W64-REAL16: v_minmax_num_f16_e64_dpp v5.l, |v1.l|, v2.l, -ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x6a,0xd6,0xfa,0x04,0xee,0x81,0x01,0x1f,0x01,0xff]
+# W64-FAKE16: v_minmax_num_f16_e64_dpp v5, |v1|, v2, -ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x6a,0xd6,0xfa,0x04,0xee,0x81,0x01,0x1f,0x01,0xff]
 
 0x05,0x02,0x6a,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff
-# GFX12: v_minmax_num_f16_e64_dpp v5, v1, -|v2|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x6a,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff]
+# W32-REAL16: v_minmax_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x6a,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff]
+# W32-FAKE16: v_minmax_num_f16_e64_dpp v5, v1, -|v2|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x6a,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff]
+# W64-REAL16: v_minmax_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x6a,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff]
+# W64-FAKE16: v_minmax_num_f16_e64_dpp v5, v1, -|v2|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x6a,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff]
 
 0x05,0x04,0x6a,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff
-# GFX12: v_minmax_num_f16_e64_dpp v5, -v1, v2, |exec_lo| row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x04,0x6a,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff]
+# W32-REAL16: v_minmax_num_f16_e64_dpp v5.l, -v1.l, v2.l, |exec_lo| row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x04,0x6a,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff]
+# W32-FAKE16: v_minmax_num_f16_e64_dpp v5, -v1, v2, |exec_lo| row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x04,0x6a,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff]
+# W64-REAL16: v_minmax_num_f16_e64_dpp v5.l, -v1.l, v2.l, |exec_lo| row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x04,0x6a,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff]
+# W64-FAKE16: v_minmax_num_f16_e64_dpp v5, -v1, v2, |exec_lo| row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x04,0x6a,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff]
 
 0x05,0x03,0x6a,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff
-# GFX12: v_minmax_num_f16_e64_dpp v5, -|v1|, -|v2|, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x03,0x6a,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff]
+# W32-REAL16: v_minmax_num_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x03,0x6a,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff]
+# W32-FAKE16: v_minmax_num_f16_e64_dpp v5, -|v1|, -|v2|, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x03,0x6a,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff]
+# W64-REAL16: v_minmax_num_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x03,0x6a,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff]
+# W64-FAKE16: v_minmax_num_f16_e64_dpp v5, -|v1|, -|v2|, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x03,0x6a,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff]
 
 0x05,0x05,0x6a,0xd6,0xfa,0x04,0x06,0xab,0x01,0x5f,0x01,0x01
-# GFX12: v_minmax_num_f16_e64_dpp v5, -|v1|, v2, -|-1| mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x05,0x6a,0xd6,0xfa,0x04,0x06,0xab,0x01,0x5f,0x01,0x01]
+# W32-REAL16: v_minmax_num_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|-1| mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x05,0x6a,0xd6,0xfa,0x04,0x06,0xab,0x01,0x5f,0x01,0x01]
+# W32-FAKE16: v_minmax_num_f16_e64_dpp v5, -|v1|, v2, -|-1| mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x05,0x6a,0xd6,0xfa,0x04,0x06,0xab,0x01,0x5f,0x01,0x01]
+# W64-REAL16: v_minmax_num_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|-1| mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x05,0x6a,0xd6,0xfa,0x04,0x06,0xab,0x01,0x5f,0x01,0x01]
+# W64-FAKE16: v_minmax_num_f16_e64_dpp v5, -|v1|, v2, -|-1| mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x05,0x6a,0xd6,0xfa,0x04,0x06,0xab,0x01,0x5f,0x01,0x01]
 
 0x05,0x06,0x6a,0xd6,0xfa,0x04,0xc2,0xd3,0x01,0x60,0x01,0x13
-# GFX12: v_minmax_num_f16_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x06,0x6a,0xd6,0xfa,0x04,0xc2,0xd3,0x01,0x60,0x01,0x13]
+# W32-REAL16: v_minmax_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x06,0x6a,0xd6,0xfa,0x04,0xc2,0xd3,0x01,0x60,0x01,0x13]
+# W32-FAKE16: v_minmax_num_f16_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x06,0x6a,0xd6,0xfa,0x04,0xc2,0xd3,0x01,0x60,0x01,0x13]
+# W64-REAL16: v_minmax_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x06,0x6a,0xd6,0xfa,0x04,0xc2,0xd3,0x01,0x60,0x01,0x13]
+# W64-FAKE16: v_minmax_num_f16_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x06,0x6a,0xd6,0xfa,0x04,0xc2,0xd3,0x01,0x60,0x01,0x13]
 
 0xff,0x87,0x6a,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x0d,0x30
-# GFX12: v_minmax_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x87,0x6a,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x0d,0x30]
+# W32-REAL16: v_minmax_num_f16_e64_dpp v255.l, -|v255.l|, -|v255.l|, -|src_scc| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x87,0x6a,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x0d,0x30]
+# W32-FAKE16: v_minmax_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x87,0x6a,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x0d,0x30]
+# W64-REAL16: v_minmax_num_f16_e64_dpp v255.l, -|v255.l|, -|v255.l|, -|src_scc| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x87,0x6a,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x0d,0x30]
+# W64-FAKE16: v_minmax_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x87,0x6a,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x0d,0x30]
+
+0x05,0x78,0x6a,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff
+# W32-REAL16: v_minmax_num_f16_e64_dpp v5.h, v1.h, v2.h, v3.h op_sel:[1,1,1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x78,0x6a,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+# W32-FAKE16: v_minmax_num_f16_e64_dpp v5, v1, v2, v3 op_sel:[1,1,1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x78,0x6a,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+# W64-REAL16: v_minmax_num_f16_e64_dpp v5.h, v1.h, v2.h, v3.h op_sel:[1,1,1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x78,0x6a,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+# W64-FAKE16: v_minmax_num_f16_e64_dpp v5, v1, v2, v3 op_sel:[1,1,1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x78,0x6a,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+
+
+0x05,0x20,0x6a,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff
+# W32-REAL16: v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.h op_sel:[0,0,1,0] quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x20,0x6a,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff]
+# W32-FAKE16: v_minmax_num_f16_e64_dpp v5, v1, v2, v255 op_sel:[0,0,1,0] quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x20,0x6a,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff]
+# W64-REAL16: v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.h op_sel:[0,0,1,0] quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x20,0x6a,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff]
+# W64-FAKE16: v_minmax_num_f16_e64_dpp v5, v1, v2, v255 op_sel:[0,0,1,0] quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x20,0x6a,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff]
+
+0x05,0x0a,0x6a,0xd6,0xfa,0x04,0x06,0x2b,0x01,0x5f,0x01,0x01
+# W32-REAL16: v_minmax_num_f16_e64_dpp v5.l, -v1.h, |v2.l|, -1 op_sel:[1,0,0,0] mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x0a,0x6a,0xd6,0xfa,0x04,0x06,0x2b,0x01,0x5f,0x01,0x01]
+# W32-FAKE16: v_minmax_num_f16_e64_dpp v5, -v1, |v2|, -1 op_sel:[1,0,0,0] mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x0a,0x6a,0xd6,0xfa,0x04,0x06,0x2b,0x01,0x5f,0x01,0x01]
+# W64-REAL16: v_minmax_num_f16_e64_dpp v5.l, -v1.h, |v2.l|, -1 op_sel:[1,0,0,0] mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x0a,0x6a,0xd6,0xfa,0x04,0x06,0x2b,0x01,0x5f,0x01,0x01]
+# W64-FAKE16: v_minmax_num_f16_e64_dpp v5, -v1, |v2|, -1 op_sel:[1,0,0,0] mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x0a,0x6a,0xd6,0xfa,0x04,0x06,0x2b,0x01,0x5f,0x01,0x01]
+
+0x05,0x13,0x6a,0xd6,0xfa,0x04,0xc2,0x73,0x01,0x60,0x01,0x13
+# W32-REAL16: v_minmax_num_f16_e64_dpp v5.l, -|v1.l|, -|v2.h|, 0.5 op_sel:[0,1,0,0] mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x13,0x6a,0xd6,0xfa,0x04,0xc2,0x73,0x01,0x60,0x01,0x13]
+# W32-FAKE16: v_minmax_num_f16_e64_dpp v5, -|v1|, -|v2|, 0.5 op_sel:[0,1,0,0] mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x13,0x6a,0xd6,0xfa,0x04,0xc2,0x73,0x01,0x60,0x01,0x13]
+# W64-REAL16: v_minmax_num_f16_e64_dpp v5.l, -|v1.l|, -|v2.h|, 0.5 op_sel:[0,1,0,0] mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x13,0x6a,0xd6,0xfa,0x04,0xc2,0x73,0x01,0x60,0x01,0x13]
+# W64-FAKE16: v_minmax_num_f16_e64_dpp v5, -|v1|, -|v2|, 0.5 op_sel:[0,1,0,0] mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x13,0x6a,0xd6,0xfa,0x04,0xc2,0x73,0x01,0x60,0x01,0x13]
+
+0xff,0xc7,0x6a,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x0d,0x30
+# W32-REAL16: v_minmax_num_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc7,0x6a,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x0d,0x30]
+# W32-FAKE16: v_minmax_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc7,0x6a,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x0d,0x30]
+# W64-REAL16: v_minmax_num_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc7,0x6a,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x0d,0x30]
+# W64-FAKE16: v_minmax_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc7,0x6a,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x0d,0x30]
 
 0x05,0x00,0x68,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff
 # GFX12: v_minmax_num_f32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x68,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
@@ -4764,49 +4919,124 @@
 # W64-FAKE16: v_div_fixup_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc7,0x54,0xd6,0xfa,0xfe,0xf7,0xe3,0xff,0x6f,0x0d,0x30]
 
 0x05,0x00,0x48,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff
-# GFX12: v_fma_f16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+# W32-REAL16: v_fma_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+# W32-FAKE16: v_fma_f16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+# W64-REAL16: v_fma_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+# W64-FAKE16: v_fma_f16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
 
 0x05,0x00,0x48,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff
-# GFX12: v_fma_f16_e64_dpp v5, v1, s3, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff]
+# W32-REAL16: v_fma_f16_e64_dpp v5.l, v1.l, s3, v3.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff]
+# W32-FAKE16: v_fma_f16_e64_dpp v5, v1, s3, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff]
+# W64-REAL16: v_fma_f16_e64_dpp v5.l, v1.l, s3, v3.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff]
+# W64-FAKE16: v_fma_f16_e64_dpp v5, v1, s3, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff]
 
 0x05,0x00,0x48,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff
-# GFX12: v_fma_f16_e64_dpp v5, v1, v2, v255 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff]
+# W32-REAL16: v_fma_f16_e64_dpp v5.l, v1.l, v2.l, v255.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff]
+# W32-FAKE16: v_fma_f16_e64_dpp v5, v1, v2, v255 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff]
+# W64-REAL16: v_fma_f16_e64_dpp v5.l, v1.l, v2.l, v255.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff]
+# W64-FAKE16: v_fma_f16_e64_dpp v5, v1, v2, v255 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff]
 
 0x05,0x00,0x48,0xd6,0xfa,0x04,0x0e,0x00,0x01,0x40,0x01,0xff
-# GFX12: v_fma_f16_e64_dpp v5, v1, v2, s3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x04,0x0e,0x00,0x01,0x40,0x01,0xff]
+# W32-REAL16: v_fma_f16_e64_dpp v5.l, v1.l, v2.l, s3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x04,0x0e,0x00,0x01,0x40,0x01,0xff]
+# W32-FAKE16: v_fma_f16_e64_dpp v5, v1, v2, s3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x04,0x0e,0x00,0x01,0x40,0x01,0xff]
+# W64-REAL16: v_fma_f16_e64_dpp v5.l, v1.l, v2.l, s3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x04,0x0e,0x00,0x01,0x40,0x01,0xff]
+# W64-FAKE16: v_fma_f16_e64_dpp v5, v1, v2, s3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x04,0x0e,0x00,0x01,0x40,0x01,0xff]
 
 0x05,0x00,0x48,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x41,0x01,0xff
-# GFX12: v_fma_f16_e64_dpp v5, v1, v2, s105 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x41,0x01,0xff]
+# W32-REAL16: v_fma_f16_e64_dpp v5.l, v1.l, v2.l, s105 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x41,0x01,0xff]
+# W32-FAKE16: v_fma_f16_e64_dpp v5, v1, v2, s105 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x41,0x01,0xff]
+# W64-REAL16: v_fma_f16_e64_dpp v5.l, v1.l, v2.l, s105 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x41,0x01,0xff]
+# W64-FAKE16: v_fma_f16_e64_dpp v5, v1, v2, s105 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x41,0x01,0xff]
 
 0x05,0x00,0x48,0xd6,0xfa,0x04,0xee,0x01,0x01,0x01,0x01,0xff
-# GFX12: v_fma_f16_e64_dpp v5, v1, v2, ttmp15 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x04,0xee,0x01,0x01,0x01,0x01,0xff]
+# W32-REAL16: v_fma_f16_e64_dpp v5.l, v1.l, v2.l, ttmp15 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x04,0xee,0x01,0x01,0x01,0x01,0xff]
+# W32-FAKE16: v_fma_f16_e64_dpp v5, v1, v2, ttmp15 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x04,0xee,0x01,0x01,0x01,0x01,0xff]
+# W64-REAL16: v_fma_f16_e64_dpp v5.l, v1.l, v2.l, ttmp15 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x04,0xee,0x01,0x01,0x01,0x01,0xff]
+# W64-FAKE16: v_fma_f16_e64_dpp v5, v1, v2, ttmp15 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x04,0xee,0x01,0x01,0x01,0x01,0xff]
 
 0x05,0x00,0x48,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff
-# GFX12: v_fma_f16_e64_dpp v5, v1, v2, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff]
+# W32-REAL16: v_fma_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff]
+# W32-FAKE16: v_fma_f16_e64_dpp v5, v1, v2, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff]
+# W64-REAL16: v_fma_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff]
+# W64-FAKE16: v_fma_f16_e64_dpp v5, v1, v2, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff]
 
 0x05,0x00,0x48,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff
-# GFX12: v_fma_f16_e64_dpp v5, v1, v2, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff]
+# W32-REAL16: v_fma_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff]
+# W32-FAKE16: v_fma_f16_e64_dpp v5, v1, v2, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff]
+# W64-REAL16: v_fma_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff]
+# W64-FAKE16: v_fma_f16_e64_dpp v5, v1, v2, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff]
 
 0x05,0x01,0x48,0xd6,0xfa,0x04,0xf6,0x81,0x01,0x1f,0x01,0xff
-# GFX12: v_fma_f16_e64_dpp v5, |v1|, v2, -m0 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x48,0xd6,0xfa,0x04,0xf6,0x81,0x01,0x1f,0x01,0xff]
+# W32-REAL16: v_fma_f16_e64_dpp v5.l, |v1.l|, v2.l, -m0 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x48,0xd6,0xfa,0x04,0xf6,0x81,0x01,0x1f,0x01,0xff]
+# W32-FAKE16: v_fma_f16_e64_dpp v5, |v1|, v2, -m0 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x48,0xd6,0xfa,0x04,0xf6,0x81,0x01,0x1f,0x01,0xff]
+# W64-REAL16: v_fma_f16_e64_dpp v5.l, |v1.l|, v2.l, -m0 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x48,0xd6,0xfa,0x04,0xf6,0x81,0x01,0x1f,0x01,0xff]
+# W64-FAKE16: v_fma_f16_e64_dpp v5, |v1|, v2, -m0 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x48,0xd6,0xfa,0x04,0xf6,0x81,0x01,0x1f,0x01,0xff]
 
 0x05,0x02,0x48,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff
-# GFX12: v_fma_f16_e64_dpp v5, v1, -|v2|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x48,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff]
+# W32-REAL16: v_fma_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x48,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff]
+# W32-FAKE16: v_fma_f16_e64_dpp v5, v1, -|v2|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x48,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff]
+# W64-REAL16: v_fma_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x48,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff]
+# W64-FAKE16: v_fma_f16_e64_dpp v5, v1, -|v2|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x48,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff]
 
 0x05,0x7c,0x48,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff
-# GFX12: v_fma_f16_e64_dpp v5, -v1, v2, |exec_lo| op_sel:[1,1,1,1] row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x7c,0x48,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff]
+# W32-REAL16: v_fma_f16_e64_dpp v5.h, -v1.h, v2.h, |exec_lo| op_sel:[1,1,1,1] row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x7c,0x48,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff]
+# W32-FAKE16: v_fma_f16_e64_dpp v5, -v1, v2, |exec_lo| op_sel:[1,1,1,1] row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x7c,0x48,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff]
+# W64-REAL16: v_fma_f16_e64_dpp v5.h, -v1.h, v2.h, |exec_lo| op_sel:[1,1,1,1] row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x7c,0x48,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff]
+# W64-FAKE16: v_fma_f16_e64_dpp v5, -v1, v2, |exec_lo| op_sel:[1,1,1,1] row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x7c,0x48,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff]
 
 0x05,0x0b,0x48,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff
-# GFX12: v_fma_f16_e64_dpp v5, -|v1|, -|v2|, null op_sel:[1,0,0,0] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x0b,0x48,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff]
+# W32-REAL16: v_fma_f16_e64_dpp v5.l, -|v1.h|, -|v2.l|, null op_sel:[1,0,0,0] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x0b,0x48,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff]
+# W32-FAKE16: v_fma_f16_e64_dpp v5, -|v1|, -|v2|, null op_sel:[1,0,0,0] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x0b,0x48,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff]
+# W64-REAL16: v_fma_f16_e64_dpp v5.l, -|v1.h|, -|v2.l|, null op_sel:[1,0,0,0] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x0b,0x48,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff]
+# W64-FAKE16: v_fma_f16_e64_dpp v5, -|v1|, -|v2|, null op_sel:[1,0,0,0] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x0b,0x48,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff]
 
 0x05,0x15,0x48,0xd6,0xfa,0x04,0x06,0xa3,0x01,0x5f,0x01,0x01
-# GFX12: v_fma_f16_e64_dpp v5, -|v1|, v2, -|-1| op_sel:[0,1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x15,0x48,0xd6,0xfa,0x04,0x06,0xa3,0x01,0x5f,0x01,0x01]
+# W32-REAL16: v_fma_f16_e64_dpp v5.l, -|v1.l|, v2.h, -|-1| op_sel:[0,1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x15,0x48,0xd6,0xfa,0x04,0x06,0xa3,0x01,0x5f,0x01,0x01]
+# W32-FAKE16: v_fma_f16_e64_dpp v5, -|v1|, v2, -|-1| op_sel:[0,1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x15,0x48,0xd6,0xfa,0x04,0x06,0xa3,0x01,0x5f,0x01,0x01]
+# W64-REAL16: v_fma_f16_e64_dpp v5.l, -|v1.l|, v2.h, -|-1| op_sel:[0,1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x15,0x48,0xd6,0xfa,0x04,0x06,0xa3,0x01,0x5f,0x01,0x01]
+# W64-FAKE16: v_fma_f16_e64_dpp v5, -|v1|, v2, -|-1| op_sel:[0,1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x15,0x48,0xd6,0xfa,0x04,0x06,0xa3,0x01,0x5f,0x01,0x01]
 
 0x05,0x26,0x48,0xd6,0xfa,0x04,0xc2,0xc3,0x01,0x60,0x01,0x13
-# GFX12: v_fma_f16_e64_dpp v5, v1, -|v2|, -|0.5| op_sel:[0,0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x26,0x48,0xd6,0xfa,0x04,0xc2,0xc3,0x01,0x60,0x01,0x13]
+# W32-REAL16: v_fma_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| op_sel:[0,0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x26,0x48,0xd6,0xfa,0x04,0xc2,0xc3,0x01,0x60,0x01,0x13]
+# W32-FAKE16: v_fma_f16_e64_dpp v5, v1, -|v2|, -|0.5| op_sel:[0,0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x26,0x48,0xd6,0xfa,0x04,0xc2,0xc3,0x01,0x60,0x01,0x13]
+# W64-REAL16: v_fma_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| op_sel:[0,0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x26,0x48,0xd6,0xfa,0x04,0xc2,0xc3,0x01,0x60,0x01,0x13]
+# W64-FAKE16: v_fma_f16_e64_dpp v5, v1, -|v2|, -|0.5| op_sel:[0,0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x26,0x48,0xd6,0xfa,0x04,0xc2,0xc3,0x01,0x60,0x01,0x13]
 
 0xff,0xc7,0x48,0xd6,0xfa,0xfe,0xf7,0xe3,0xff,0x6f,0x0d,0x30
-# GFX12: v_fma_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc7,0x48,0xd6,0xfa,0xfe,0xf7,0xe3,0xff,0x6f,0x0d,0x30]
+# W32-REAL16: v_fma_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc7,0x48,0xd6,0xfa,0xfe,0xf7,0xe3,0xff,0x6f,0x0d,0x30]
+# W32-FAKE16: v_fma_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc7,0x48,0xd6,0xfa,0xfe,0xf7,0xe3,0xff,0x6f,0x0d,0x30]
+# W64-REAL16: v_fma_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc7,0x48,0xd6,0xfa,0xfe,0xf7,0xe3,0xff,0x6f,0x0d,0x30]
+# W64-FAKE16: v_fma_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc7,0x48,0xd6,0xfa,0xfe,0xf7,0xe3,0xff,0x6f,0x0d,0x30]
+
+0x05,0x78,0x48,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff
+# W32-REAL16: v_fma_f16_e64_dpp v5.h, v1.h, v2.h, v3.h op_sel:[1,1,1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x78,0x48,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+# W32-FAKE16: v_fma_f16_e64_dpp v5, v1, v2, v3 op_sel:[1,1,1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x78,0x48,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+# W64-REAL16: v_fma_f16_e64_dpp v5.h, v1.h, v2.h, v3.h op_sel:[1,1,1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x78,0x48,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+# W64-FAKE16: v_fma_f16_e64_dpp v5, v1, v2, v3 op_sel:[1,1,1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x78,0x48,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+
+0x05,0x20,0x48,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff
+# W32-REAL16: v_fma_f16_e64_dpp v5.l, v1.l, v2.l, v255.h op_sel:[0,0,1,0] quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x20,0x48,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff]
+# W32-FAKE16: v_fma_f16_e64_dpp v5, v1, v2, v255 op_sel:[0,0,1,0] quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x20,0x48,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff]
+# W64-REAL16: v_fma_f16_e64_dpp v5.l, v1.l, v2.l, v255.h op_sel:[0,0,1,0] quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x20,0x48,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff]
+# W64-FAKE16: v_fma_f16_e64_dpp v5, v1, v2, v255 op_sel:[0,0,1,0] quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x20,0x48,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff]
+
+0x05,0x0a,0x48,0xd6,0xfa,0x04,0x06,0x23,0x01,0x5f,0x01,0x01
+# W32-REAL16: v_fma_f16_e64_dpp v5.l, -v1.h, |v2.l|, -1 op_sel:[1,0,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x0a,0x48,0xd6,0xfa,0x04,0x06,0x23,0x01,0x5f,0x01,0x01]
+# W32-FAKE16: v_fma_f16_e64_dpp v5, -v1, |v2|, -1 op_sel:[1,0,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x0a,0x48,0xd6,0xfa,0x04,0x06,0x23,0x01,0x5f,0x01,0x01]
+# W64-REAL16: v_fma_f16_e64_dpp v5.l, -v1.h, |v2.l|, -1 op_sel:[1,0,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x0a,0x48,0xd6,0xfa,0x04,0x06,0x23,0x01,0x5f,0x01,0x01]
+# W64-FAKE16: v_fma_f16_e64_dpp v5, -v1, |v2|, -1 op_sel:[1,0,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x0a,0x48,0xd6,0xfa,0x04,0x06,0x23,0x01,0x5f,0x01,0x01]
+
+0x05,0x13,0x48,0xd6,0xfa,0x04,0xc2,0x63,0x01,0x60,0x01,0x13
+# W32-REAL16: v_fma_f16_e64_dpp v5.l, -|v1.l|, -|v2.h|, 0.5 op_sel:[0,1,0,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x13,0x48,0xd6,0xfa,0x04,0xc2,0x63,0x01,0x60,0x01,0x13]
+# W32-FAKE16: v_fma_f16_e64_dpp v5, -|v1|, -|v2|, 0.5 op_sel:[0,1,0,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x13,0x48,0xd6,0xfa,0x04,0xc2,0x63,0x01,0x60,0x01,0x13]
+# W64-REAL16: v_fma_f16_e64_dpp v5.l, -|v1.l|, -|v2.h|, 0.5 op_sel:[0,1,0,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x13,0x48,0xd6,0xfa,0x04,0xc2,0x63,0x01,0x60,0x01,0x13]
+# W64-FAKE16: v_fma_f16_e64_dpp v5, -|v1|, -|v2|, 0.5 op_sel:[0,1,0,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x13,0x48,0xd6,0xfa,0x04,0xc2,0x63,0x01,0x60,0x01,0x13]
+
+0xff,0xc7,0x48,0xd6,0xfa,0xfe,0xf7,0xe3,0xff,0x6f,0x0d,0x30
+# W32-REAL16: v_fma_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc7,0x48,0xd6,0xfa,0xfe,0xf7,0xe3,0xff,0x6f,0x0d,0x30]
+# W32-FAKE16: v_fma_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc7,0x48,0xd6,0xfa,0xfe,0xf7,0xe3,0xff,0x6f,0x0d,0x30]
+# W64-REAL16: v_fma_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc7,0x48,0xd6,0xfa,0xfe,0xf7,0xe3,0xff,0x6f,0x0d,0x30]
+# W64-FAKE16: v_fma_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc7,0x48,0xd6,0xfa,0xfe,0xf7,0xe3,0xff,0x6f,0x0d,0x30]
 
 0x05,0x00,0x53,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff
 # W32-REAL16: v_mad_i16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x53,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
@@ -5199,49 +5429,119 @@
 # GFX12: v_mad_u32_u16_e64_dpp v255, v255, v255, src_scc op_sel:[0,1,0,0] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x90,0x59,0xd6,0xfa,0xfe,0xf7,0x03,0xff,0x6f,0x0d,0x30]
 
 0x05,0x00,0x2c,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff
-# GFX12: v_max3_num_f16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2c,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+# W32-REAL16: v_max3_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2c,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+# W32-FAKE16: v_max3_num_f16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2c,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+# W64-REAL16: v_max3_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2c,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+# W64-FAKE16: v_max3_num_f16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2c,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
 
 0x05,0x00,0x2c,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff
-# GFX12: v_max3_num_f16_e64_dpp v5, v1, s3, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2c,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff]
+# W32-REAL16: v_max3_num_f16_e64_dpp v5.l, v1.l, s3, v3.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2c,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff]
+# W32-FAKE16: v_max3_num_f16_e64_dpp v5, v1, s3, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2c,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff]
+# W64-REAL16: v_max3_num_f16_e64_dpp v5.l, v1.l, s3, v3.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2c,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff]
+# W64-FAKE16: v_max3_num_f16_e64_dpp v5, v1, s3, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2c,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff]
 
 0x05,0x00,0x2c,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff
-# GFX12: v_max3_num_f16_e64_dpp v5, v1, v2, v255 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2c,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff]
+# W32-REAL16: v_max3_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2c,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff]
+# W32-FAKE16: v_max3_num_f16_e64_dpp v5, v1, v2, v255 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2c,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff]
+# W64-REAL16: v_max3_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2c,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff]
+# W64-FAKE16: v_max3_num_f16_e64_dpp v5, v1, v2, v255 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2c,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff]
 
 0x05,0x00,0x2c,0xd6,0xfa,0x04,0x0e,0x00,0x01,0x40,0x01,0xff
-# GFX12: v_max3_num_f16_e64_dpp v5, v1, v2, s3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2c,0xd6,0xfa,0x04,0x0e,0x00,0x01,0x40,0x01,0xff]
+# W32-REAL16: v_max3_num_f16_e64_dpp v5.l, v1.l, v2.l, s3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2c,0xd6,0xfa,0x04,0x0e,0x00,0x01,0x40,0x01,0xff]
+# W32-FAKE16: v_max3_num_f16_e64_dpp v5, v1, v2, s3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2c,0xd6,0xfa,0x04,0x0e,0x00,0x01,0x40,0x01,0xff]
+# W64-REAL16: v_max3_num_f16_e64_dpp v5.l, v1.l, v2.l, s3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2c,0xd6,0xfa,0x04,0x0e,0x00,0x01,0x40,0x01,0xff]
+# W64-FAKE16: v_max3_num_f16_e64_dpp v5, v1, v2, s3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2c,0xd6,0xfa,0x04,0x0e,0x00,0x01,0x40,0x01,0xff]
 
 0x05,0x00,0x2c,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x41,0x01,0xff
-# GFX12: v_max3_num_f16_e64_dpp v5, v1, v2, s105 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2c,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x41,0x01,0xff]
+# W32-REAL16: v_max3_num_f16_e64_dpp v5.l, v1.l, v2.l, s105 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2c,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x41,0x01,0xff]
+# W32-FAKE16: v_max3_num_f16_e64_dpp v5, v1, v2, s105 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2c,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x41,0x01,0xff]
+# W64-REAL16: v_max3_num_f16_e64_dpp v5.l, v1.l, v2.l, s105 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2c,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x41,0x01,0xff]
+# W64-FAKE16: v_max3_num_f16_e64_dpp v5, v1, v2, s105 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2c,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x41,0x01,0xff]
 
 0x05,0x00,0x2c,0xd6,0xfa,0x04,0xee,0x01,0x01,0x01,0x01,0xff
-# GFX12: v_max3_num_f16_e64_dpp v5, v1, v2, ttmp15 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2c,0xd6,0xfa,0x04,0xee,0x01,0x01,0x01,0x01,0xff]
+# W32-REAL16: v_max3_num_f16_e64_dpp v5.l, v1.l, v2.l, ttmp15 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2c,0xd6,0xfa,0x04,0xee,0x01,0x01,0x01,0x01,0xff]
+# W32-FAKE16: v_max3_num_f16_e64_dpp v5, v1, v2, ttmp15 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2c,0xd6,0xfa,0x04,0xee,0x01,0x01,0x01,0x01,0xff]
+# W64-REAL16: v_max3_num_f16_e64_dpp v5.l, v1.l, v2.l, ttmp15 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2c,0xd6,0xfa,0x04,0xee,0x01,0x01,0x01,0x01,0xff]
+# W64-FAKE16: v_max3_num_f16_e64_dpp v5, v1, v2, ttmp15 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2c,0xd6,0xfa,0x04,0xee,0x01,0x01,0x01,0x01,0xff]
 
 0x05,0x00,0x2c,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff
-# GFX12: v_max3_num_f16_e64_dpp v5, v1, v2, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2c,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff]
+# W32-REAL16: v_max3_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2c,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff]
+# W32-FAKE16: v_max3_num_f16_e64_dpp v5, v1, v2, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2c,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff]
+# W64-REAL16: v_max3_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2c,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff]
+# W64-FAKE16: v_max3_num_f16_e64_dpp v5, v1, v2, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2c,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff]
 
 0x05,0x00,0x2c,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff
-# GFX12: v_max3_num_f16_e64_dpp v5, v1, v2, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2c,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff]
+# W32-REAL16: v_max3_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2c,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff]
+# W32-FAKE16: v_max3_num_f16_e64_dpp v5, v1, v2, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2c,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff]
+# W64-REAL16: v_max3_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2c,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff]
+# W64-FAKE16: v_max3_num_f16_e64_dpp v5, v1, v2, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2c,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff]
 
 0x05,0x01,0x2c,0xd6,0xfa,0x04,0xf6,0x81,0x01,0x1f,0x01,0xff
-# GFX12: v_max3_num_f16_e64_dpp v5, |v1|, v2, -m0 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x2c,0xd6,0xfa,0x04,0xf6,0x81,0x01,0x1f,0x01,0xff]
+# W32-REAL16: v_max3_num_f16_e64_dpp v5.l, |v1.l|, v2.l, -m0 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x2c,0xd6,0xfa,0x04,0xf6,0x81,0x01,0x1f,0x01,0xff]
+# W32-FAKE16: v_max3_num_f16_e64_dpp v5, |v1|, v2, -m0 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x2c,0xd6,0xfa,0x04,0xf6,0x81,0x01,0x1f,0x01,0xff]
+# W64-REAL16: v_max3_num_f16_e64_dpp v5.l, |v1.l|, v2.l, -m0 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x2c,0xd6,0xfa,0x04,0xf6,0x81,0x01,0x1f,0x01,0xff]
+# W64-FAKE16: v_max3_num_f16_e64_dpp v5, |v1|, v2, -m0 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x2c,0xd6,0xfa,0x04,0xf6,0x81,0x01,0x1f,0x01,0xff]
 
 0x05,0x02,0x2c,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff
-# GFX12: v_max3_num_f16_e64_dpp v5, v1, -|v2|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x2c,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff]
+# W32-REAL16: v_max3_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x2c,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff]
+# W32-FAKE16: v_max3_num_f16_e64_dpp v5, v1, -|v2|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x2c,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff]
+# W64-REAL16: v_max3_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x2c,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff]
+# W64-FAKE16: v_max3_num_f16_e64_dpp v5, v1, -|v2|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x2c,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff]
 
 0x05,0x7c,0x2c,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff
-# GFX12: v_max3_num_f16_e64_dpp v5, -v1, v2, |exec_lo| op_sel:[1,1,1,1] row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x7c,0x2c,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff]
+# W32-REAL16: v_max3_num_f16_e64_dpp v5.h, -v1.h, v2.h, |exec_lo| op_sel:[1,1,1,1] row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x7c,0x2c,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff]
+# W32-FAKE16: v_max3_num_f16_e64_dpp v5, -v1, v2, |exec_lo| op_sel:[1,1,1,1] row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x7c,0x2c,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff]
+# W64-REAL16: v_max3_num_f16_e64_dpp v5.h, -v1.h, v2.h, |exec_lo| op_sel:[1,1,1,1] row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x7c,0x2c,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff]
+# W64-FAKE16: v_max3_num_f16_e64_dpp v5, -v1, v2, |exec_lo| op_sel:[1,1,1,1] row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x7c,0x2c,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff]
 
 0x05,0x0b,0x2c,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff
-# GFX12: v_max3_num_f16_e64_dpp v5, -|v1|, -|v2|, null op_sel:[1,0,0,0] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x0b,0x2c,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff]
+# W32-REAL16: v_max3_num_f16_e64_dpp v5.l, -|v1.h|, -|v2.l|, null op_sel:[1,0,0,0] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x0b,0x2c,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff]
+# W32-FAKE16: v_max3_num_f16_e64_dpp v5, -|v1|, -|v2|, null op_sel:[1,0,0,0] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x0b,0x2c,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff]
+# W64-REAL16: v_max3_num_f16_e64_dpp v5.l, -|v1.h|, -|v2.l|, null op_sel:[1,0,0,0] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x0b,0x2c,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff]
+# W64-FAKE16: v_max3_num_f16_e64_dpp v5, -|v1|, -|v2|, null op_sel:[1,0,0,0] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x0b,0x2c,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff]
 
 0x05,0x15,0x2c,0xd6,0xfa,0x04,0x06,0xa3,0x01,0x5f,0x01,0x01
-# GFX12: v_max3_num_f16_e64_dpp v5, -|v1|, v2, -|-1| op_sel:[0,1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x15,0x2c,0xd6,0xfa,0x04,0x06,0xa3,0x01,0x5f,0x01,0x01]
+# W32-REAL16: v_max3_num_f16_e64_dpp v5.l, -|v1.l|, v2.h, -|-1| op_sel:[0,1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x15,0x2c,0xd6,0xfa,0x04,0x06,0xa3,0x01,0x5f,0x01,0x01]
+# W32-FAKE16: v_max3_num_f16_e64_dpp v5, -|v1|, v2, -|-1| op_sel:[0,1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x15,0x2c,0xd6,0xfa,0x04,0x06,0xa3,0x01,0x5f,0x01,0x01]
+# W64-REAL16: v_max3_num_f16_e64_dpp v5.l, -|v1.l|, v2.h, -|-1| op_sel:[0,1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x15,0x2c,0xd6,0xfa,0x04,0x06,0xa3,0x01,0x5f,0x01,0x01]
+# W64-FAKE16: v_max3_num_f16_e64_dpp v5, -|v1|, v2, -|-1| op_sel:[0,1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x15,0x2c,0xd6,0xfa,0x04,0x06,0xa3,0x01,0x5f,0x01,0x01]
 
 0x05,0x26,0x2c,0xd6,0xfa,0x04,0xc2,0xc3,0x01,0x60,0x01,0x13
-# GFX12: v_max3_num_f16_e64_dpp v5, v1, -|v2|, -|0.5| op_sel:[0,0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x26,0x2c,0xd6,0xfa,0x04,0xc2,0xc3,0x01,0x60,0x01,0x13]
+# W32-REAL16: v_max3_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| op_sel:[0,0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x26,0x2c,0xd6,0xfa,0x04,0xc2,0xc3,0x01,0x60,0x01,0x13]
+# W32-FAKE16: v_max3_num_f16_e64_dpp v5, v1, -|v2|, -|0.5| op_sel:[0,0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x26,0x2c,0xd6,0xfa,0x04,0xc2,0xc3,0x01,0x60,0x01,0x13]
+# W64-REAL16: v_max3_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| op_sel:[0,0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x26,0x2c,0xd6,0xfa,0x04,0xc2,0xc3,0x01,0x60,0x01,0x13]
+# W64-FAKE16: v_max3_num_f16_e64_dpp v5, v1, -|v2|, -|0.5| op_sel:[0,0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x26,0x2c,0xd6,0xfa,0x04,0xc2,0xc3,0x01,0x60,0x01,0x13]
 
 0xff,0xc7,0x2c,0xd6,0xfa,0xfe,0xf7,0xe3,0xff,0x6f,0x0d,0x30
-# GFX12: v_max3_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc7,0x2c,0xd6,0xfa,0xfe,0xf7,0xe3,0xff,0x6f,0x0d,0x30]
+# W32-REAL16: v_max3_num_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc7,0x2c,0xd6,0xfa,0xfe,0xf7,0xe3,0xff,0x6f,0x0d,0x30]
+# W32-FAKE16: v_max3_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc7,0x2c,0xd6,0xfa,0xfe,0xf7,0xe3,0xff,0x6f,0x0d,0x30]
+# W64-REAL16: v_max3_num_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc7,0x2c,0xd6,0xfa,0xfe,0xf7,0xe3,0xff,0x6f,0x0d,0x30]
+# W64-FAKE16: v_max3_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc7,0x2c,0xd6,0xfa,0xfe,0xf7,0xe3,0xff,0x6f,0x0d,0x30]
+
+0x05,0x78,0x2c,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff
+# W32-REAL16: v_max3_num_f16_e64_dpp v5.h, v1.h, v2.h, v3.h op_sel:[1,1,1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x78,0x2c,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+# W32-FAKE16: v_max3_num_f16_e64_dpp v5, v1, v2, v3 op_sel:[1,1,1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x78,0x2c,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+# W64-REAL16: v_max3_num_f16_e64_dpp v5.h, v1.h, v2.h, v3.h op_sel:[1,1,1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x78,0x2c,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+# W64-FAKE16: v_max3_num_f16_e64_dpp v5, v1, v2, v3 op_sel:[1,1,1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x78,0x2c,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+
+0x05,0x20,0x2c,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff
+# W32-REAL16: v_max3_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.h op_sel:[0,0,1,0] quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x20,0x2c,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff]
+# W32-FAKE16: v_max3_num_f16_e64_dpp v5, v1, v2, v255 op_sel:[0,0,1,0] quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x20,0x2c,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff]
+# W64-REAL16: v_max3_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.h op_sel:[0,0,1,0] quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x20,0x2c,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff]
+# W64-FAKE16: v_max3_num_f16_e64_dpp v5, v1, v2, v255 op_sel:[0,0,1,0] quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x20,0x2c,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff]
+
+0x05,0x0a,0x2c,0xd6,0xfa,0x04,0x06,0x23,0x01,0x5f,0x01,0x01
+# W32-REAL16: v_max3_num_f16_e64_dpp v5.l, -v1.h, |v2.l|, -1 op_sel:[1,0,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x0a,0x2c,0xd6,0xfa,0x04,0x06,0x23,0x01,0x5f,0x01,0x01]
+# W32-FAKE16: v_max3_num_f16_e64_dpp v5, -v1, |v2|, -1 op_sel:[1,0,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x0a,0x2c,0xd6,0xfa,0x04,0x06,0x23,0x01,0x5f,0x01,0x01]
+# W64-REAL16: v_max3_num_f16_e64_dpp v5.l, -v1.h, |v2.l|, -1 op_sel:[1,0,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x0a,0x2c,0xd6,0xfa,0x04,0x06,0x23,0x01,0x5f,0x01,0x01]
+# W64-FAKE16: v_max3_num_f16_e64_dpp v5, -v1, |v2|, -1 op_sel:[1,0,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x0a,0x2c,0xd6,0xfa,0x04,0x06,0x23,0x01,0x5f,0x01,0x01]
+
+0x05,0x13,0x2c,0xd6,0xfa,0x04,0xc2,0x63,0x01,0x60,0x01,0x13
+# W32-REAL16: v_max3_num_f16_e64_dpp v5.l, -|v1.l|, -|v2.h|, 0.5 op_sel:[0,1,0,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x13,0x2c,0xd6,0xfa,0x04,0xc2,0x63,0x01,0x60,0x01,0x13]
+# W32-FAKE16: v_max3_num_f16_e64_dpp v5, -|v1|, -|v2|, 0.5 op_sel:[0,1,0,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x13,0x2c,0xd6,0xfa,0x04,0xc2,0x63,0x01,0x60,0x01,0x13]
+# W64-REAL16: v_max3_num_f16_e64_dpp v5.l, -|v1.l|, -|v2.h|, 0.5 op_sel:[0,1,0,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x13,0x2c,0xd6,0xfa,0x04,0xc2,0x63,0x01,0x60,0x01,0x13]
+# W64-FAKE16: v_max3_num_f16_e64_dpp v5, -|v1|, -|v2|, 0.5 op_sel:[0,1,0,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x13,0x2c,0xd6,0xfa,0x04,0xc2,0x63,0x01,0x60,0x01,0x13]
+
 
 0x05,0x00,0x4d,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff
 # W32-REAL16: v_max3_i16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x4d,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
@@ -5964,49 +6264,119 @@
 # W64-FAKE16: v_med3_u16_e64_dpp v255, v255, v255, src_scc op_sel:[0,0,0,1] row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x40,0x51,0xd6,0xfa,0xfe,0xf7,0x03,0xff,0x6f,0x0d,0x30]
 
 0x05,0x00,0x2b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff
-# GFX12: v_min3_num_f16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+# W32-REAL16: v_min3_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+# W32-FAKE16: v_min3_num_f16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+# W64-REAL16: v_min3_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+# W64-FAKE16: v_min3_num_f16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
 
 0x05,0x00,0x2b,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff
-# GFX12: v_min3_num_f16_e64_dpp v5, v1, s3, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2b,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff]
+# W32-REAL16: v_min3_num_f16_e64_dpp v5.l, v1.l, s3, v3.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2b,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff]
+# W32-FAKE16: v_min3_num_f16_e64_dpp v5, v1, s3, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2b,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff]
+# W64-REAL16: v_min3_num_f16_e64_dpp v5.l, v1.l, s3, v3.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2b,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff]
+# W64-FAKE16: v_min3_num_f16_e64_dpp v5, v1, s3, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2b,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff]
 
 0x05,0x00,0x2b,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff
-# GFX12: v_min3_num_f16_e64_dpp v5, v1, v2, v255 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2b,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff]
+# W32-REAL16: v_min3_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2b,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff]
+# W32-FAKE16: v_min3_num_f16_e64_dpp v5, v1, v2, v255 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2b,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff]
+# W64-REAL16: v_min3_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2b,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff]
+# W64-FAKE16: v_min3_num_f16_e64_dpp v5, v1, v2, v255 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2b,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff]
 
 0x05,0x00,0x2b,0xd6,0xfa,0x04,0x0e,0x00,0x01,0x40,0x01,0xff
-# GFX12: v_min3_num_f16_e64_dpp v5, v1, v2, s3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2b,0xd6,0xfa,0x04,0x0e,0x00,0x01,0x40,0x01,0xff]
+# W32-REAL16: v_min3_num_f16_e64_dpp v5.l, v1.l, v2.l, s3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2b,0xd6,0xfa,0x04,0x0e,0x00,0x01,0x40,0x01,0xff]
+# W32-FAKE16: v_min3_num_f16_e64_dpp v5, v1, v2, s3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2b,0xd6,0xfa,0x04,0x0e,0x00,0x01,0x40,0x01,0xff]
+# W64-REAL16: v_min3_num_f16_e64_dpp v5.l, v1.l, v2.l, s3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2b,0xd6,0xfa,0x04,0x0e,0x00,0x01,0x40,0x01,0xff]
+# W64-FAKE16: v_min3_num_f16_e64_dpp v5, v1, v2, s3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2b,0xd6,0xfa,0x04,0x0e,0x00,0x01,0x40,0x01,0xff]
 
 0x05,0x00,0x2b,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x41,0x01,0xff
-# GFX12: v_min3_num_f16_e64_dpp v5, v1, v2, s105 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2b,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x41,0x01,0xff]
+# W32-REAL16: v_min3_num_f16_e64_dpp v5.l, v1.l, v2.l, s105 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2b,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x41,0x01,0xff]
+# W32-FAKE16: v_min3_num_f16_e64_dpp v5, v1, v2, s105 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2b,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x41,0x01,0xff]
+# W64-REAL16: v_min3_num_f16_e64_dpp v5.l, v1.l, v2.l, s105 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2b,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x41,0x01,0xff]
+# W64-FAKE16: v_min3_num_f16_e64_dpp v5, v1, v2, s105 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2b,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x41,0x01,0xff]
 
 0x05,0x00,0x2b,0xd6,0xfa,0x04,0xee,0x01,0x01,0x01,0x01,0xff
-# GFX12: v_min3_num_f16_e64_dpp v5, v1, v2, ttmp15 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2b,0xd6,0xfa,0x04,0xee,0x01,0x01,0x01,0x01,0xff]
+# W32-REAL16: v_min3_num_f16_e64_dpp v5.l, v1.l, v2.l, ttmp15 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2b,0xd6,0xfa,0x04,0xee,0x01,0x01,0x01,0x01,0xff]
+# W32-FAKE16: v_min3_num_f16_e64_dpp v5, v1, v2, ttmp15 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2b,0xd6,0xfa,0x04,0xee,0x01,0x01,0x01,0x01,0xff]
+# W64-REAL16: v_min3_num_f16_e64_dpp v5.l, v1.l, v2.l, ttmp15 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2b,0xd6,0xfa,0x04,0xee,0x01,0x01,0x01,0x01,0xff]
+# W64-FAKE16: v_min3_num_f16_e64_dpp v5, v1, v2, ttmp15 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2b,0xd6,0xfa,0x04,0xee,0x01,0x01,0x01,0x01,0xff]
 
 0x05,0x00,0x2b,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff
-# GFX12: v_min3_num_f16_e64_dpp v5, v1, v2, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2b,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff]
+# W32-REAL16: v_min3_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2b,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff]
+# W32-FAKE16: v_min3_num_f16_e64_dpp v5, v1, v2, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2b,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff]
+# W64-REAL16: v_min3_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2b,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff]
+# W64-FAKE16: v_min3_num_f16_e64_dpp v5, v1, v2, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2b,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff]
 
 0x05,0x00,0x2b,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff
-# GFX12: v_min3_num_f16_e64_dpp v5, v1, v2, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2b,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff]
+# W32-REAL16: v_min3_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2b,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff]
+# W32-FAKE16: v_min3_num_f16_e64_dpp v5, v1, v2, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2b,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff]
+# W64-REAL16: v_min3_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2b,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff]
+# W64-FAKE16: v_min3_num_f16_e64_dpp v5, v1, v2, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2b,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff]
 
 0x05,0x01,0x2b,0xd6,0xfa,0x04,0xf6,0x81,0x01,0x1f,0x01,0xff
-# GFX12: v_min3_num_f16_e64_dpp v5, |v1|, v2, -m0 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x2b,0xd6,0xfa,0x04,0xf6,0x81,0x01,0x1f,0x01,0xff]
+# W32-REAL16: v_min3_num_f16_e64_dpp v5.l, |v1.l|, v2.l, -m0 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x2b,0xd6,0xfa,0x04,0xf6,0x81,0x01,0x1f,0x01,0xff]
+# W32-FAKE16: v_min3_num_f16_e64_dpp v5, |v1|, v2, -m0 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x2b,0xd6,0xfa,0x04,0xf6,0x81,0x01,0x1f,0x01,0xff]
+# W64-REAL16: v_min3_num_f16_e64_dpp v5.l, |v1.l|, v2.l, -m0 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x2b,0xd6,0xfa,0x04,0xf6,0x81,0x01,0x1f,0x01,0xff]
+# W64-FAKE16: v_min3_num_f16_e64_dpp v5, |v1|, v2, -m0 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x2b,0xd6,0xfa,0x04,0xf6,0x81,0x01,0x1f,0x01,0xff]
 
 0x05,0x02,0x2b,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff
-# GFX12: v_min3_num_f16_e64_dpp v5, v1, -|v2|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x2b,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff]
+# W32-REAL16: v_min3_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x2b,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff]
+# W32-FAKE16: v_min3_num_f16_e64_dpp v5, v1, -|v2|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x2b,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff]
+# W64-REAL16: v_min3_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x2b,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff]
+# W64-FAKE16: v_min3_num_f16_e64_dpp v5, v1, -|v2|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x2b,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff]
 
 0x05,0x7c,0x2b,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff
-# GFX12: v_min3_num_f16_e64_dpp v5, -v1, v2, |exec_lo| op_sel:[1,1,1,1] row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x7c,0x2b,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff]
+# W32-REAL16: v_min3_num_f16_e64_dpp v5.h, -v1.h, v2.h, |exec_lo| op_sel:[1,1,1,1] row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x7c,0x2b,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff]
+# W32-FAKE16: v_min3_num_f16_e64_dpp v5, -v1, v2, |exec_lo| op_sel:[1,1,1,1] row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x7c,0x2b,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff]
+# W64-REAL16: v_min3_num_f16_e64_dpp v5.h, -v1.h, v2.h, |exec_lo| op_sel:[1,1,1,1] row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x7c,0x2b,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff]
+# W64-FAKE16: v_min3_num_f16_e64_dpp v5, -v1, v2, |exec_lo| op_sel:[1,1,1,1] row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x7c,0x2b,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff]
 
 0x05,0x0b,0x2b,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff
-# GFX12: v_min3_num_f16_e64_dpp v5, -|v1|, -|v2|, null op_sel:[1,0,0,0] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x0b,0x2b,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff]
+# W32-REAL16: v_min3_num_f16_e64_dpp v5.l, -|v1.h|, -|v2.l|, null op_sel:[1,0,0,0] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x0b,0x2b,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff]
+# W32-FAKE16: v_min3_num_f16_e64_dpp v5, -|v1|, -|v2|, null op_sel:[1,0,0,0] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x0b,0x2b,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff]
+# W64-REAL16: v_min3_num_f16_e64_dpp v5.l, -|v1.h|, -|v2.l|, null op_sel:[1,0,0,0] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x0b,0x2b,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff]
+# W64-FAKE16: v_min3_num_f16_e64_dpp v5, -|v1|, -|v2|, null op_sel:[1,0,0,0] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x0b,0x2b,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff]
 
 0x05,0x15,0x2b,0xd6,0xfa,0x04,0x06,0xa3,0x01,0x5f,0x01,0x01
-# GFX12: v_min3_num_f16_e64_dpp v5, -|v1|, v2, -|-1| op_sel:[0,1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x15,0x2b,0xd6,0xfa,0x04,0x06,0xa3,0x01,0x5f,0x01,0x01]
+# W32-REAL16: v_min3_num_f16_e64_dpp v5.l, -|v1.l|, v2.h, -|-1| op_sel:[0,1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x15,0x2b,0xd6,0xfa,0x04,0x06,0xa3,0x01,0x5f,0x01,0x01]
+# W32-FAKE16: v_min3_num_f16_e64_dpp v5, -|v1|, v2, -|-1| op_sel:[0,1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x15,0x2b,0xd6,0xfa,0x04,0x06,0xa3,0x01,0x5f,0x01,0x01]
+# W64-REAL16: v_min3_num_f16_e64_dpp v5.l, -|v1.l|, v2.h, -|-1| op_sel:[0,1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x15,0x2b,0xd6,0xfa,0x04,0x06,0xa3,0x01,0x5f,0x01,0x01]
+# W64-FAKE16: v_min3_num_f16_e64_dpp v5, -|v1|, v2, -|-1| op_sel:[0,1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x15,0x2b,0xd6,0xfa,0x04,0x06,0xa3,0x01,0x5f,0x01,0x01]
 
 0x05,0x26,0x2b,0xd6,0xfa,0x04,0xc2,0xc3,0x01,0x60,0x01,0x13
-# GFX12: v_min3_num_f16_e64_dpp v5, v1, -|v2|, -|0.5| op_sel:[0,0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x26,0x2b,0xd6,0xfa,0x04,0xc2,0xc3,0x01,0x60,0x01,0x13]
+# W32-REAL16: v_min3_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| op_sel:[0,0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x26,0x2b,0xd6,0xfa,0x04,0xc2,0xc3,0x01,0x60,0x01,0x13]
+# W32-FAKE16: v_min3_num_f16_e64_dpp v5, v1, -|v2|, -|0.5| op_sel:[0,0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x26,0x2b,0xd6,0xfa,0x04,0xc2,0xc3,0x01,0x60,0x01,0x13]
+# W64-REAL16: v_min3_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| op_sel:[0,0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x26,0x2b,0xd6,0xfa,0x04,0xc2,0xc3,0x01,0x60,0x01,0x13]
+# W64-FAKE16: v_min3_num_f16_e64_dpp v5, v1, -|v2|, -|0.5| op_sel:[0,0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x26,0x2b,0xd6,0xfa,0x04,0xc2,0xc3,0x01,0x60,0x01,0x13]
 
 0xff,0xc7,0x2b,0xd6,0xfa,0xfe,0xf7,0xe3,0xff,0x6f,0x0d,0x30
-# GFX12: v_min3_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc7,0x2b,0xd6,0xfa,0xfe,0xf7,0xe3,0xff,0x6f,0x0d,0x30]
+# W32-REAL16: v_min3_num_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc7,0x2b,0xd6,0xfa,0xfe,0xf7,0xe3,0xff,0x6f,0x0d,0x30]
+# W32-FAKE16: v_min3_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc7,0x2b,0xd6,0xfa,0xfe,0xf7,0xe3,0xff,0x6f,0x0d,0x30]
+# W64-REAL16: v_min3_num_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc7,0x2b,0xd6,0xfa,0xfe,0xf7,0xe3,0xff,0x6f,0x0d,0x30]
+# W64-FAKE16: v_min3_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc7,0x2b,0xd6,0xfa,0xfe,0xf7,0xe3,0xff,0x6f,0x0d,0x30]
+
+0x05,0x78,0x2b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff
+# W32-REAL16: v_min3_num_f16_e64_dpp v5.h, v1.h, v2.h, v3.h op_sel:[1,1,1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x78,0x2b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+# W32-FAKE16: v_min3_num_f16_e64_dpp v5, v1, v2, v3 op_sel:[1,1,1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x78,0x2b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+# W64-REAL16: v_min3_num_f16_e64_dpp v5.h, v1.h, v2.h, v3.h op_sel:[1,1,1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x78,0x2b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+# W64-FAKE16: v_min3_num_f16_e64_dpp v5, v1, v2, v3 op_sel:[1,1,1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x78,0x2b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+
+0x05,0x20,0x2b,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff
+# W32-REAL16: v_min3_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.h op_sel:[0,0,1,0] quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x20,0x2b,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff]
+# W32-FAKE16: v_min3_num_f16_e64_dpp v5, v1, v2, v255 op_sel:[0,0,1,0] quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x20,0x2b,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff]
+# W64-REAL16: v_min3_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.h op_sel:[0,0,1,0] quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x20,0x2b,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff]
+# W64-FAKE16: v_min3_num_f16_e64_dpp v5, v1, v2, v255 op_sel:[0,0,1,0] quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x20,0x2b,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff]
+
+0x05,0x0a,0x2b,0xd6,0xfa,0x04,0x06,0x23,0x01,0x5f,0x01,0x01
+# W32-REAL16: v_min3_num_f16_e64_dpp v5.l, -v1.h, |v2.l|, -1 op_sel:[1,0,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x0a,0x2b,0xd6,0xfa,0x04,0x06,0x23,0x01,0x5f,0x01,0x01]
+# W32-FAKE16: v_min3_num_f16_e64_dpp v5, -v1, |v2|, -1 op_sel:[1,0,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x0a,0x2b,0xd6,0xfa,0x04,0x06,0x23,0x01,0x5f,0x01,0x01]
+# W64-REAL16: v_min3_num_f16_e64_dpp v5.l, -v1.h, |v2.l|, -1 op_sel:[1,0,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x0a,0x2b,0xd6,0xfa,0x04,0x06,0x23,0x01,0x5f,0x01,0x01]
+# W64-FAKE16: v_min3_num_f16_e64_dpp v5, -v1, |v2|, -1 op_sel:[1,0,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x0a,0x2b,0xd6,0xfa,0x04,0x06,0x23,0x01,0x5f,0x01,0x01]
+
+0x05,0x13,0x2b,0xd6,0xfa,0x04,0xc2,0x63,0x01,0x60,0x01,0x13
+# W32-REAL16: v_min3_num_f16_e64_dpp v5.l, -|v1.l|, -|v2.h|, 0.5 op_sel:[0,1,0,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x13,0x2b,0xd6,0xfa,0x04,0xc2,0x63,0x01,0x60,0x01,0x13]
+# W32-FAKE16: v_min3_num_f16_e64_dpp v5, -|v1|, -|v2|, 0.5 op_sel:[0,1,0,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x13,0x2b,0xd6,0xfa,0x04,0xc2,0x63,0x01,0x60,0x01,0x13]
+# W64-REAL16: v_min3_num_f16_e64_dpp v5.l, -|v1.l|, -|v2.h|, 0.5 op_sel:[0,1,0,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x13,0x2b,0xd6,0xfa,0x04,0xc2,0x63,0x01,0x60,0x01,0x13]
+# W64-FAKE16: v_min3_num_f16_e64_dpp v5, -|v1|, -|v2|, 0.5 op_sel:[0,1,0,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x13,0x2b,0xd6,0xfa,0x04,0xc2,0x63,0x01,0x60,0x01,0x13]
+
 
 0x05,0x00,0x4a,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff
 # W32-REAL16: v_min3_i16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x4a,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_dpp8.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_dpp8.txt
index 9ed20c7..2aaba2a 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_dpp8.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_dpp8.txt
@@ -1294,43 +1294,112 @@
 # W64-FAKE16: v_max_u16_e64_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x00,0x09,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
 
 0x05,0x00,0x6b,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05
-# GFX12: v_maxmin_num_f16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6b,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6b,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_maxmin_num_f16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6b,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6b,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_maxmin_num_f16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6b,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
 
 0x05,0x00,0x6b,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05
-# GFX12: v_maxmin_num_f16_e64_dpp v5, v1, s3, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6b,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_maxmin_num_f16_e64_dpp v5.l, v1.l, s3, v3.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6b,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_maxmin_num_f16_e64_dpp v5, v1, s3, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6b,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_maxmin_num_f16_e64_dpp v5.l, v1.l, s3, v3.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6b,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_maxmin_num_f16_e64_dpp v5, v1, s3, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6b,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05]
 
 0x05,0x00,0x6b,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05
-# GFX12: v_maxmin_num_f16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6b,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6b,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_maxmin_num_f16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6b,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6b,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_maxmin_num_f16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6b,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
 
 0x05,0x00,0x6b,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05
-# GFX12: v_maxmin_num_f16_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6b,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6b,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_maxmin_num_f16_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6b,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6b,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_maxmin_num_f16_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6b,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05]
 
 0x05,0x00,0x6b,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05
-# GFX12: v_maxmin_num_f16_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6b,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6b,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_maxmin_num_f16_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6b,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6b,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_maxmin_num_f16_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6b,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05]
 
 0x05,0x00,0x6b,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05
-# GFX12: v_maxmin_num_f16_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6b,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6b,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_maxmin_num_f16_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6b,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6b,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_maxmin_num_f16_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6b,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05]
 
 0x05,0x01,0x6b,0xd6,0xe9,0x04,0xee,0x81,0x01,0x77,0x39,0x05
-# GFX12: v_maxmin_num_f16_e64_dpp v5, |v1|, v2, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x6b,0xd6,0xe9,0x04,0xee,0x81,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_maxmin_num_f16_e64_dpp v5.l, |v1.l|, v2.l, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x6b,0xd6,0xe9,0x04,0xee,0x81,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_maxmin_num_f16_e64_dpp v5, |v1|, v2, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x6b,0xd6,0xe9,0x04,0xee,0x81,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_maxmin_num_f16_e64_dpp v5.l, |v1.l|, v2.l, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x6b,0xd6,0xe9,0x04,0xee,0x81,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_maxmin_num_f16_e64_dpp v5, |v1|, v2, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x6b,0xd6,0xe9,0x04,0xee,0x81,0x01,0x77,0x39,0x05]
 
 0x05,0x02,0x6b,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05
-# GFX12: v_maxmin_num_f16_e64_dpp v5, v1, -|v2|, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x6b,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_maxmin_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x6b,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_maxmin_num_f16_e64_dpp v5, v1, -|v2|, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x6b,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_maxmin_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x6b,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_maxmin_num_f16_e64_dpp v5, v1, -|v2|, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x6b,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05]
 
 0x05,0x04,0x6b,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05
-# GFX12: v_maxmin_num_f16_e64_dpp v5, -v1, v2, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x04,0x6b,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_maxmin_num_f16_e64_dpp v5.l, -v1.l, v2.l, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x04,0x6b,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_maxmin_num_f16_e64_dpp v5, -v1, v2, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x04,0x6b,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_maxmin_num_f16_e64_dpp v5.l, -v1.l, v2.l, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x04,0x6b,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_maxmin_num_f16_e64_dpp v5, -v1, v2, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x04,0x6b,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05]
 
 0x05,0x03,0x6b,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05
-# GFX12: v_maxmin_num_f16_e64_dpp v5, -|v1|, -|v2|, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x03,0x6b,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_maxmin_num_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x03,0x6b,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_maxmin_num_f16_e64_dpp v5, -|v1|, -|v2|, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x03,0x6b,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_maxmin_num_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x03,0x6b,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_maxmin_num_f16_e64_dpp v5, -|v1|, -|v2|, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x03,0x6b,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05]
 
 0x05,0x05,0x6b,0xd6,0xe9,0x04,0x06,0xab,0x01,0x77,0x39,0x05
-# GFX12: v_maxmin_num_f16_e64_dpp v5, -|v1|, v2, -|-1| mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x05,0x6b,0xd6,0xe9,0x04,0x06,0xab,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_maxmin_num_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|-1| mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x05,0x6b,0xd6,0xe9,0x04,0x06,0xab,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_maxmin_num_f16_e64_dpp v5, -|v1|, v2, -|-1| mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x05,0x6b,0xd6,0xe9,0x04,0x06,0xab,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_maxmin_num_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|-1| mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x05,0x6b,0xd6,0xe9,0x04,0x06,0xab,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_maxmin_num_f16_e64_dpp v5, -|v1|, v2, -|-1| mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x05,0x6b,0xd6,0xe9,0x04,0x06,0xab,0x01,0x77,0x39,0x05]
 
 0x05,0x06,0x6b,0xd6,0xe9,0x04,0xc2,0xd3,0x01,0x77,0x39,0x05
-# GFX12: v_maxmin_num_f16_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x06,0x6b,0xd6,0xe9,0x04,0xc2,0xd3,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_maxmin_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x06,0x6b,0xd6,0xe9,0x04,0xc2,0xd3,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_maxmin_num_f16_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x06,0x6b,0xd6,0xe9,0x04,0xc2,0xd3,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_maxmin_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x06,0x6b,0xd6,0xe9,0x04,0xc2,0xd3,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_maxmin_num_f16_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x06,0x6b,0xd6,0xe9,0x04,0xc2,0xd3,0x01,0x77,0x39,0x05]
 
 0xff,0x87,0x6b,0xd6,0xea,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00
-# GFX12: v_maxmin_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x87,0x6b,0xd6,0xea,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00]
+# W32-REAL16: v_maxmin_num_f16_e64_dpp v255.l, -|v255.l|, -|v255.l|, -|src_scc| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x87,0x6b,0xd6,0xea,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00]
+# W32-FAKE16: v_maxmin_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x87,0x6b,0xd6,0xea,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00]
+# W64-REAL16: v_maxmin_num_f16_e64_dpp v255.l, -|v255.l|, -|v255.l|, -|src_scc| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x87,0x6b,0xd6,0xea,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00]
+# W64-FAKE16: v_maxmin_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x87,0x6b,0xd6,0xea,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00]
+
+0x05,0x78,0x6b,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05
+# W32-REAL16: v_maxmin_num_f16_e64_dpp v5.h, v1.h, v2.h, v3.h op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x78,0x6b,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_maxmin_num_f16_e64_dpp v5, v1, v2, v3 op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x78,0x6b,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_maxmin_num_f16_e64_dpp v5.h, v1.h, v2.h, v3.h op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x78,0x6b,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_maxmin_num_f16_e64_dpp v5, v1, v2, v3 op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x78,0x6b,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+
+0x05,0x20,0x6b,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05
+# W32-REAL16: v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.h op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x20,0x6b,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_maxmin_num_f16_e64_dpp v5, v1, v2, v255 op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x20,0x6b,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.h op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x20,0x6b,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_maxmin_num_f16_e64_dpp v5, v1, v2, v255 op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x20,0x6b,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
+
+0x05,0x0a,0x6b,0xd6,0xe9,0x04,0x06,0x2b,0x01,0x77,0x39,0x05
+# W32-REAL16: v_maxmin_num_f16_e64_dpp v5.l, -v1.h, |v2.l|, -1 op_sel:[1,0,0,0] mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x0a,0x6b,0xd6,0xe9,0x04,0x06,0x2b,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_maxmin_num_f16_e64_dpp v5, -v1, |v2|, -1 op_sel:[1,0,0,0] mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x0a,0x6b,0xd6,0xe9,0x04,0x06,0x2b,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_maxmin_num_f16_e64_dpp v5.l, -v1.h, |v2.l|, -1 op_sel:[1,0,0,0] mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x0a,0x6b,0xd6,0xe9,0x04,0x06,0x2b,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_maxmin_num_f16_e64_dpp v5, -v1, |v2|, -1 op_sel:[1,0,0,0] mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x0a,0x6b,0xd6,0xe9,0x04,0x06,0x2b,0x01,0x77,0x39,0x05]
+
+0x05,0x13,0x6b,0xd6,0xe9,0x04,0xc2,0x73,0x01,0x77,0x39,0x05
+# W32-REAL16: v_maxmin_num_f16_e64_dpp v5.l, -|v1.l|, -|v2.h|, 0.5 op_sel:[0,1,0,0] mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x13,0x6b,0xd6,0xe9,0x04,0xc2,0x73,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_maxmin_num_f16_e64_dpp v5, -|v1|, -|v2|, 0.5 op_sel:[0,1,0,0] mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x13,0x6b,0xd6,0xe9,0x04,0xc2,0x73,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_maxmin_num_f16_e64_dpp v5.l, -|v1.l|, -|v2.h|, 0.5 op_sel:[0,1,0,0] mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x13,0x6b,0xd6,0xe9,0x04,0xc2,0x73,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_maxmin_num_f16_e64_dpp v5, -|v1|, -|v2|, 0.5 op_sel:[0,1,0,0] mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x13,0x6b,0xd6,0xe9,0x04,0xc2,0x73,0x01,0x77,0x39,0x05]
+
+0xff,0xc7,0x6b,0xd6,0xea,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00
+# W32-REAL16: v_maxmin_num_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc7,0x6b,0xd6,0xea,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00]
+# W32-FAKE16: v_maxmin_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc7,0x6b,0xd6,0xea,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00]
+# W64-REAL16: v_maxmin_num_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc7,0x6b,0xd6,0xea,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00]
+# W64-FAKE16: v_maxmin_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc7,0x6b,0xd6,0xea,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00]
 
 0x05,0x00,0x69,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05
 # GFX12: v_maxmin_num_f32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x69,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
@@ -1768,43 +1837,112 @@
 # W64-FAKE16: v_min_u16_e64_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x00,0x0b,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
 
 0x05,0x00,0x6a,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05
-# GFX12: v_minmax_num_f16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6a,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6a,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_minmax_num_f16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6a,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6a,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_minmax_num_f16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6a,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
 
 0x05,0x00,0x6a,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05
-# GFX12: v_minmax_num_f16_e64_dpp v5, v1, s3, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6a,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_minmax_num_f16_e64_dpp v5.l, v1.l, s3, v3.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6a,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_minmax_num_f16_e64_dpp v5, v1, s3, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6a,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_minmax_num_f16_e64_dpp v5.l, v1.l, s3, v3.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6a,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_minmax_num_f16_e64_dpp v5, v1, s3, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6a,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05]
 
 0x05,0x00,0x6a,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05
-# GFX12: v_minmax_num_f16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6a,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6a,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_minmax_num_f16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6a,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6a,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_minmax_num_f16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6a,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
 
 0x05,0x00,0x6a,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05
-# GFX12: v_minmax_num_f16_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6a,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6a,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_minmax_num_f16_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6a,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6a,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_minmax_num_f16_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6a,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05]
 
 0x05,0x00,0x6a,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05
-# GFX12: v_minmax_num_f16_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6a,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6a,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_minmax_num_f16_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6a,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6a,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_minmax_num_f16_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6a,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05]
 
 0x05,0x00,0x6a,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05
-# GFX12: v_minmax_num_f16_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6a,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6a,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_minmax_num_f16_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6a,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6a,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_minmax_num_f16_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6a,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05]
 
 0x05,0x01,0x6a,0xd6,0xe9,0x04,0xee,0x81,0x01,0x77,0x39,0x05
-# GFX12: v_minmax_num_f16_e64_dpp v5, |v1|, v2, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x6a,0xd6,0xe9,0x04,0xee,0x81,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_minmax_num_f16_e64_dpp v5.l, |v1.l|, v2.l, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x6a,0xd6,0xe9,0x04,0xee,0x81,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_minmax_num_f16_e64_dpp v5, |v1|, v2, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x6a,0xd6,0xe9,0x04,0xee,0x81,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_minmax_num_f16_e64_dpp v5.l, |v1.l|, v2.l, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x6a,0xd6,0xe9,0x04,0xee,0x81,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_minmax_num_f16_e64_dpp v5, |v1|, v2, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x6a,0xd6,0xe9,0x04,0xee,0x81,0x01,0x77,0x39,0x05]
 
 0x05,0x02,0x6a,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05
-# GFX12: v_minmax_num_f16_e64_dpp v5, v1, -|v2|, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x6a,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_minmax_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x6a,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_minmax_num_f16_e64_dpp v5, v1, -|v2|, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x6a,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_minmax_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x6a,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_minmax_num_f16_e64_dpp v5, v1, -|v2|, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x6a,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05]
 
 0x05,0x04,0x6a,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05
-# GFX12: v_minmax_num_f16_e64_dpp v5, -v1, v2, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x04,0x6a,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_minmax_num_f16_e64_dpp v5.l, -v1.l, v2.l, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x04,0x6a,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_minmax_num_f16_e64_dpp v5, -v1, v2, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x04,0x6a,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_minmax_num_f16_e64_dpp v5.l, -v1.l, v2.l, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x04,0x6a,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_minmax_num_f16_e64_dpp v5, -v1, v2, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x04,0x6a,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05]
 
 0x05,0x03,0x6a,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05
-# GFX12: v_minmax_num_f16_e64_dpp v5, -|v1|, -|v2|, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x03,0x6a,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_minmax_num_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x03,0x6a,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_minmax_num_f16_e64_dpp v5, -|v1|, -|v2|, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x03,0x6a,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_minmax_num_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x03,0x6a,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_minmax_num_f16_e64_dpp v5, -|v1|, -|v2|, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x03,0x6a,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05]
 
 0x05,0x05,0x6a,0xd6,0xe9,0x04,0x06,0xab,0x01,0x77,0x39,0x05
-# GFX12: v_minmax_num_f16_e64_dpp v5, -|v1|, v2, -|-1| mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x05,0x6a,0xd6,0xe9,0x04,0x06,0xab,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_minmax_num_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|-1| mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x05,0x6a,0xd6,0xe9,0x04,0x06,0xab,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_minmax_num_f16_e64_dpp v5, -|v1|, v2, -|-1| mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x05,0x6a,0xd6,0xe9,0x04,0x06,0xab,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_minmax_num_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|-1| mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x05,0x6a,0xd6,0xe9,0x04,0x06,0xab,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_minmax_num_f16_e64_dpp v5, -|v1|, v2, -|-1| mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x05,0x6a,0xd6,0xe9,0x04,0x06,0xab,0x01,0x77,0x39,0x05]
 
 0x05,0x06,0x6a,0xd6,0xe9,0x04,0xc2,0xd3,0x01,0x77,0x39,0x05
-# GFX12: v_minmax_num_f16_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x06,0x6a,0xd6,0xe9,0x04,0xc2,0xd3,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_minmax_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x06,0x6a,0xd6,0xe9,0x04,0xc2,0xd3,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_minmax_num_f16_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x06,0x6a,0xd6,0xe9,0x04,0xc2,0xd3,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_minmax_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x06,0x6a,0xd6,0xe9,0x04,0xc2,0xd3,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_minmax_num_f16_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x06,0x6a,0xd6,0xe9,0x04,0xc2,0xd3,0x01,0x77,0x39,0x05]
 
 0xff,0x87,0x6a,0xd6,0xea,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00
-# GFX12: v_minmax_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x87,0x6a,0xd6,0xea,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00]
+# W32-REAL16: v_minmax_num_f16_e64_dpp v255.l, -|v255.l|, -|v255.l|, -|src_scc| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x87,0x6a,0xd6,0xea,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00]
+# W32-FAKE16: v_minmax_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x87,0x6a,0xd6,0xea,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00]
+# W64-REAL16: v_minmax_num_f16_e64_dpp v255.l, -|v255.l|, -|v255.l|, -|src_scc| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x87,0x6a,0xd6,0xea,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00]
+# W64-FAKE16: v_minmax_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x87,0x6a,0xd6,0xea,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00]
+
+0x05,0x78,0x6a,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05
+# W32-REAL16: v_minmax_num_f16_e64_dpp v5.h, v1.h, v2.h, v3.h op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x78,0x6a,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_minmax_num_f16_e64_dpp v5, v1, v2, v3 op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x78,0x6a,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_minmax_num_f16_e64_dpp v5.h, v1.h, v2.h, v3.h op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x78,0x6a,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_minmax_num_f16_e64_dpp v5, v1, v2, v3 op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x78,0x6a,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+
+0x05,0x20,0x6a,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05
+# W32-REAL16: v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.h op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x20,0x6a,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_minmax_num_f16_e64_dpp v5, v1, v2, v255 op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x20,0x6a,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.h op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x20,0x6a,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_minmax_num_f16_e64_dpp v5, v1, v2, v255 op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x20,0x6a,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
+
+0x05,0x0a,0x6a,0xd6,0xe9,0x04,0x06,0x2b,0x01,0x77,0x39,0x05
+# W32-REAL16: v_minmax_num_f16_e64_dpp v5.l, -v1.h, |v2.l|, -1 op_sel:[1,0,0,0] mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x0a,0x6a,0xd6,0xe9,0x04,0x06,0x2b,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_minmax_num_f16_e64_dpp v5, -v1, |v2|, -1 op_sel:[1,0,0,0] mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x0a,0x6a,0xd6,0xe9,0x04,0x06,0x2b,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_minmax_num_f16_e64_dpp v5.l, -v1.h, |v2.l|, -1 op_sel:[1,0,0,0] mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x0a,0x6a,0xd6,0xe9,0x04,0x06,0x2b,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_minmax_num_f16_e64_dpp v5, -v1, |v2|, -1 op_sel:[1,0,0,0] mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x0a,0x6a,0xd6,0xe9,0x04,0x06,0x2b,0x01,0x77,0x39,0x05]
+
+0x05,0x13,0x6a,0xd6,0xe9,0x04,0xc2,0x73,0x01,0x77,0x39,0x05
+# W32-REAL16: v_minmax_num_f16_e64_dpp v5.l, -|v1.l|, -|v2.h|, 0.5 op_sel:[0,1,0,0] mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x13,0x6a,0xd6,0xe9,0x04,0xc2,0x73,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_minmax_num_f16_e64_dpp v5, -|v1|, -|v2|, 0.5 op_sel:[0,1,0,0] mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x13,0x6a,0xd6,0xe9,0x04,0xc2,0x73,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_minmax_num_f16_e64_dpp v5.l, -|v1.l|, -|v2.h|, 0.5 op_sel:[0,1,0,0] mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x13,0x6a,0xd6,0xe9,0x04,0xc2,0x73,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_minmax_num_f16_e64_dpp v5, -|v1|, -|v2|, 0.5 op_sel:[0,1,0,0] mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x13,0x6a,0xd6,0xe9,0x04,0xc2,0x73,0x01,0x77,0x39,0x05]
+
+0xff,0xc7,0x6a,0xd6,0xea,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00
+# W32-REAL16: v_minmax_num_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc7,0x6a,0xd6,0xea,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00]
+# W32-FAKE16: v_minmax_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc7,0x6a,0xd6,0xea,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00]
+# W64-REAL16: v_minmax_num_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc7,0x6a,0xd6,0xea,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00]
+# W64-FAKE16: v_minmax_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc7,0x6a,0xd6,0xea,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00]
 
 0x05,0x00,0x68,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05
 # GFX12: v_minmax_num_f32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x68,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
@@ -2814,52 +2952,130 @@
 # W64-FAKE16: v_div_fixup_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc7,0x54,0xd6,0xea,0xfe,0xf7,0xe3,0xff,0x00,0x00,0x00]
 
 0x05,0x00,0x48,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05
-# GFX12: v_fma_f16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x48,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_fma_f16_e64_dpp v5.l, v1.l, v2.l, v3.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x48,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_fma_f16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x48,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_fma_f16_e64_dpp v5.l, v1.l, v2.l, v3.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x48,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_fma_f16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x48,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
 
 0x05,0x00,0x48,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05
-# GFX12: v_fma_f16_e64_dpp v5, v1, s3, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x48,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_fma_f16_e64_dpp v5.l, v1.l, s3, v3.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x48,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_fma_f16_e64_dpp v5, v1, s3, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x48,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_fma_f16_e64_dpp v5.l, v1.l, s3, v3.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x48,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_fma_f16_e64_dpp v5, v1, s3, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x48,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05]
 
 0x05,0x00,0x48,0xd6,0xe9,0xec,0x0d,0x04,0x01,0x77,0x39,0x05
-# GFX12: v_fma_f16_e64_dpp v5, v1, 4.0, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x48,0xd6,0xe9,0xec,0x0d,0x04,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_fma_f16_e64_dpp v5.l, v1.l, 4.0, v3.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x48,0xd6,0xe9,0xec,0x0d,0x04,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_fma_f16_e64_dpp v5, v1, 4.0, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x48,0xd6,0xe9,0xec,0x0d,0x04,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_fma_f16_e64_dpp v5.l, v1.l, 4.0, v3.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x48,0xd6,0xe9,0xec,0x0d,0x04,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_fma_f16_e64_dpp v5, v1, 4.0, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x48,0xd6,0xe9,0xec,0x0d,0x04,0x01,0x77,0x39,0x05]
 
 0x05,0x00,0x48,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05
-# GFX12: v_fma_f16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x48,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_fma_f16_e64_dpp v5.l, v1.l, v2.l, v255.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x48,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_fma_f16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x48,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_fma_f16_e64_dpp v5.l, v1.l, v2.l, v255.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x48,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_fma_f16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x48,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
 
 0x05,0x00,0x48,0xd6,0xe9,0x04,0x0e,0x00,0x01,0x77,0x39,0x05
-# GFX12: v_fma_f16_e64_dpp v5, v1, v2, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x48,0xd6,0xe9,0x04,0x0e,0x00,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_fma_f16_e64_dpp v5.l, v1.l, v2.l, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x48,0xd6,0xe9,0x04,0x0e,0x00,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_fma_f16_e64_dpp v5, v1, v2, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x48,0xd6,0xe9,0x04,0x0e,0x00,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_fma_f16_e64_dpp v5.l, v1.l, v2.l, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x48,0xd6,0xe9,0x04,0x0e,0x00,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_fma_f16_e64_dpp v5, v1, v2, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x48,0xd6,0xe9,0x04,0x0e,0x00,0x01,0x77,0x39,0x05]
 
 0x05,0x00,0x48,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05
-# GFX12: v_fma_f16_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x48,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_fma_f16_e64_dpp v5.l, v1.l, v2.l, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x48,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_fma_f16_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x48,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_fma_f16_e64_dpp v5.l, v1.l, v2.l, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x48,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_fma_f16_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x48,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05]
 
 0x05,0x00,0x48,0xd6,0xe9,0x04,0xee,0x01,0x01,0x77,0x39,0x05
-# GFX12: v_fma_f16_e64_dpp v5, v1, v2, ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x48,0xd6,0xe9,0x04,0xee,0x01,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_fma_f16_e64_dpp v5.l, v1.l, v2.l, ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x48,0xd6,0xe9,0x04,0xee,0x01,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_fma_f16_e64_dpp v5, v1, v2, ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x48,0xd6,0xe9,0x04,0xee,0x01,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_fma_f16_e64_dpp v5.l, v1.l, v2.l, ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x48,0xd6,0xe9,0x04,0xee,0x01,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_fma_f16_e64_dpp v5, v1, v2, ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x48,0xd6,0xe9,0x04,0xee,0x01,0x01,0x77,0x39,0x05]
 
 0x05,0x00,0x48,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05
-# GFX12: v_fma_f16_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x48,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_fma_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x48,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_fma_f16_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x48,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_fma_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x48,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_fma_f16_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x48,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05]
 
 0x05,0x00,0x48,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05
-# GFX12: v_fma_f16_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x48,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_fma_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x48,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_fma_f16_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x48,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_fma_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x48,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_fma_f16_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x48,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05]
 
 0x05,0x01,0x48,0xd6,0xe9,0x04,0xf6,0x81,0x01,0x77,0x39,0x05
-# GFX12: v_fma_f16_e64_dpp v5, |v1|, v2, -m0 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x48,0xd6,0xe9,0x04,0xf6,0x81,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_fma_f16_e64_dpp v5.l, |v1.l|, v2.l, -m0 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x48,0xd6,0xe9,0x04,0xf6,0x81,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_fma_f16_e64_dpp v5, |v1|, v2, -m0 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x48,0xd6,0xe9,0x04,0xf6,0x81,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_fma_f16_e64_dpp v5.l, |v1.l|, v2.l, -m0 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x48,0xd6,0xe9,0x04,0xf6,0x81,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_fma_f16_e64_dpp v5, |v1|, v2, -m0 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x48,0xd6,0xe9,0x04,0xf6,0x81,0x01,0x77,0x39,0x05]
 
 0x05,0x02,0x48,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05
-# GFX12: v_fma_f16_e64_dpp v5, v1, -|v2|, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x48,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_fma_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x48,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_fma_f16_e64_dpp v5, v1, -|v2|, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x48,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_fma_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x48,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_fma_f16_e64_dpp v5, v1, -|v2|, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x48,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05]
 
 0x05,0x7c,0x48,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05
-# GFX12: v_fma_f16_e64_dpp v5, -v1, v2, |exec_lo| op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x7c,0x48,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_fma_f16_e64_dpp v5.h, -v1.h, v2.h, |exec_lo| op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x7c,0x48,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_fma_f16_e64_dpp v5, -v1, v2, |exec_lo| op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x7c,0x48,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_fma_f16_e64_dpp v5.h, -v1.h, v2.h, |exec_lo| op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x7c,0x48,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_fma_f16_e64_dpp v5, -v1, v2, |exec_lo| op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x7c,0x48,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05]
 
 0x05,0x0b,0x48,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05
-# GFX12: v_fma_f16_e64_dpp v5, -|v1|, -|v2|, null op_sel:[1,0,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x0b,0x48,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_fma_f16_e64_dpp v5.l, -|v1.h|, -|v2.l|, null op_sel:[1,0,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x0b,0x48,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_fma_f16_e64_dpp v5, -|v1|, -|v2|, null op_sel:[1,0,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x0b,0x48,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_fma_f16_e64_dpp v5.l, -|v1.h|, -|v2.l|, null op_sel:[1,0,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x0b,0x48,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_fma_f16_e64_dpp v5, -|v1|, -|v2|, null op_sel:[1,0,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x0b,0x48,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05]
 
 0x05,0x15,0x48,0xd6,0xe9,0x04,0x06,0xa3,0x01,0x77,0x39,0x05
-# GFX12: v_fma_f16_e64_dpp v5, -|v1|, v2, -|-1| op_sel:[0,1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x15,0x48,0xd6,0xe9,0x04,0x06,0xa3,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_fma_f16_e64_dpp v5.l, -|v1.l|, v2.h, -|-1| op_sel:[0,1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x15,0x48,0xd6,0xe9,0x04,0x06,0xa3,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_fma_f16_e64_dpp v5, -|v1|, v2, -|-1| op_sel:[0,1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x15,0x48,0xd6,0xe9,0x04,0x06,0xa3,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_fma_f16_e64_dpp v5.l, -|v1.l|, v2.h, -|-1| op_sel:[0,1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x15,0x48,0xd6,0xe9,0x04,0x06,0xa3,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_fma_f16_e64_dpp v5, -|v1|, v2, -|-1| op_sel:[0,1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x15,0x48,0xd6,0xe9,0x04,0x06,0xa3,0x01,0x77,0x39,0x05]
 
 0x05,0x26,0x48,0xd6,0xe9,0x04,0xc2,0xc3,0x01,0x77,0x39,0x05
-# GFX12: v_fma_f16_e64_dpp v5, v1, -|v2|, -|0.5| op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x26,0x48,0xd6,0xe9,0x04,0xc2,0xc3,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_fma_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x26,0x48,0xd6,0xe9,0x04,0xc2,0xc3,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_fma_f16_e64_dpp v5, v1, -|v2|, -|0.5| op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x26,0x48,0xd6,0xe9,0x04,0xc2,0xc3,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_fma_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x26,0x48,0xd6,0xe9,0x04,0xc2,0xc3,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_fma_f16_e64_dpp v5, v1, -|v2|, -|0.5| op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x26,0x48,0xd6,0xe9,0x04,0xc2,0xc3,0x01,0x77,0x39,0x05]
 
 0xff,0xc7,0x48,0xd6,0xea,0xfe,0xf7,0xe3,0xff,0x00,0x00,0x00
-# GFX12: v_fma_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc7,0x48,0xd6,0xea,0xfe,0xf7,0xe3,0xff,0x00,0x00,0x00]
+# W32-REAL16: v_fma_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc7,0x48,0xd6,0xea,0xfe,0xf7,0xe3,0xff,0x00,0x00,0x00]
+# W32-FAKE16: v_fma_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc7,0x48,0xd6,0xea,0xfe,0xf7,0xe3,0xff,0x00,0x00,0x00]
+# W64-REAL16: v_fma_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc7,0x48,0xd6,0xea,0xfe,0xf7,0xe3,0xff,0x00,0x00,0x00]
+# W64-FAKE16: v_fma_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc7,0x48,0xd6,0xea,0xfe,0xf7,0xe3,0xff,0x00,0x00,0x00]
+
+0x05,0x78,0x48,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05
+# W32-REAL16: v_fma_f16_e64_dpp v5.h, v1.h, v2.h, v3.h op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x78,0x48,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_fma_f16_e64_dpp v5, v1, v2, v3 op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x78,0x48,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_fma_f16_e64_dpp v5.h, v1.h, v2.h, v3.h op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x78,0x48,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_fma_f16_e64_dpp v5, v1, v2, v3 op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x78,0x48,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+
+0x05,0x20,0x48,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05
+# W32-REAL16: v_fma_f16_e64_dpp v5.l, v1.l, v2.l, v255.h op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x20,0x48,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_fma_f16_e64_dpp v5, v1, v2, v255 op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x20,0x48,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_fma_f16_e64_dpp v5.l, v1.l, v2.l, v255.h op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x20,0x48,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_fma_f16_e64_dpp v5, v1, v2, v255 op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x20,0x48,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
+
+0x05,0x0a,0x48,0xd6,0xe9,0x04,0x06,0x23,0x01,0x77,0x39,0x05
+# W32-REAL16: v_fma_f16_e64_dpp v5.l, -v1.h, |v2.l|, -1 op_sel:[1,0,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x0a,0x48,0xd6,0xe9,0x04,0x06,0x23,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_fma_f16_e64_dpp v5, -v1, |v2|, -1 op_sel:[1,0,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x0a,0x48,0xd6,0xe9,0x04,0x06,0x23,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_fma_f16_e64_dpp v5.l, -v1.h, |v2.l|, -1 op_sel:[1,0,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x0a,0x48,0xd6,0xe9,0x04,0x06,0x23,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_fma_f16_e64_dpp v5, -v1, |v2|, -1 op_sel:[1,0,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x0a,0x48,0xd6,0xe9,0x04,0x06,0x23,0x01,0x77,0x39,0x05]
+
+0x05,0x13,0x48,0xd6,0xe9,0x04,0xc2,0x63,0x01,0x77,0x39,0x05
+# W32-REAL16: v_fma_f16_e64_dpp v5.l, -|v1.l|, -|v2.h|, 0.5 op_sel:[0,1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x13,0x48,0xd6,0xe9,0x04,0xc2,0x63,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_fma_f16_e64_dpp v5, -|v1|, -|v2|, 0.5 op_sel:[0,1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x13,0x48,0xd6,0xe9,0x04,0xc2,0x63,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_fma_f16_e64_dpp v5.l, -|v1.l|, -|v2.h|, 0.5 op_sel:[0,1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x13,0x48,0xd6,0xe9,0x04,0xc2,0x63,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_fma_f16_e64_dpp v5, -|v1|, -|v2|, 0.5 op_sel:[0,1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x13,0x48,0xd6,0xe9,0x04,0xc2,0x63,0x01,0x77,0x39,0x05]
+
+0xff,0xc7,0x48,0xd6,0xea,0xfe,0xf7,0xe3,0xff,0x00,0x00,0x00
+# W32-REAL16: v_fma_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc7,0x48,0xd6,0xea,0xfe,0xf7,0xe3,0xff,0x00,0x00,0x00]
+# W32-FAKE16: v_fma_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc7,0x48,0xd6,0xea,0xfe,0xf7,0xe3,0xff,0x00,0x00,0x00]
+# W64-REAL16: v_fma_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc7,0x48,0xd6,0xea,0xfe,0xf7,0xe3,0xff,0x00,0x00,0x00]
+# W64-FAKE16: v_fma_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc7,0x48,0xd6,0xea,0xfe,0xf7,0xe3,0xff,0x00,0x00,0x00]
 
 0x05,0x00,0x53,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05
 # W32-REAL16: v_mad_i16_e64_dpp v5.l, v1.l, v2.l, v3.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x53,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
@@ -3240,49 +3456,119 @@
 # GFX12: v_mad_u32_u16_e64_dpp v255, v255, v255, src_scc op_sel:[0,1,0,0] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x90,0x59,0xd6,0xea,0xfe,0xf7,0x03,0xff,0x00,0x00,0x00]
 
 0x05,0x00,0x2c,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05
-# GFX12: v_max3_num_f16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2c,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_max3_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2c,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_max3_num_f16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2c,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_max3_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2c,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_max3_num_f16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2c,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
 
 0x05,0x00,0x2c,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05
-# GFX12: v_max3_num_f16_e64_dpp v5, v1, s3, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2c,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_max3_num_f16_e64_dpp v5.l, v1.l, s3, v3.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2c,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_max3_num_f16_e64_dpp v5, v1, s3, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2c,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_max3_num_f16_e64_dpp v5.l, v1.l, s3, v3.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2c,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_max3_num_f16_e64_dpp v5, v1, s3, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2c,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05]
 
 0x05,0x00,0x2c,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05
-# GFX12: v_max3_num_f16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2c,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_max3_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2c,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_max3_num_f16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2c,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_max3_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2c,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_max3_num_f16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2c,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
 
 0x05,0x00,0x2c,0xd6,0xe9,0x04,0x0e,0x00,0x01,0x77,0x39,0x05
-# GFX12: v_max3_num_f16_e64_dpp v5, v1, v2, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2c,0xd6,0xe9,0x04,0x0e,0x00,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_max3_num_f16_e64_dpp v5.l, v1.l, v2.l, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2c,0xd6,0xe9,0x04,0x0e,0x00,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_max3_num_f16_e64_dpp v5, v1, v2, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2c,0xd6,0xe9,0x04,0x0e,0x00,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_max3_num_f16_e64_dpp v5.l, v1.l, v2.l, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2c,0xd6,0xe9,0x04,0x0e,0x00,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_max3_num_f16_e64_dpp v5, v1, v2, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2c,0xd6,0xe9,0x04,0x0e,0x00,0x01,0x77,0x39,0x05]
 
 0x05,0x00,0x2c,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05
-# GFX12: v_max3_num_f16_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2c,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_max3_num_f16_e64_dpp v5.l, v1.l, v2.l, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2c,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_max3_num_f16_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2c,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_max3_num_f16_e64_dpp v5.l, v1.l, v2.l, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2c,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_max3_num_f16_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2c,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05]
 
 0x05,0x00,0x2c,0xd6,0xe9,0x04,0xee,0x01,0x01,0x77,0x39,0x05
-# GFX12: v_max3_num_f16_e64_dpp v5, v1, v2, ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2c,0xd6,0xe9,0x04,0xee,0x01,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_max3_num_f16_e64_dpp v5.l, v1.l, v2.l, ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2c,0xd6,0xe9,0x04,0xee,0x01,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_max3_num_f16_e64_dpp v5, v1, v2, ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2c,0xd6,0xe9,0x04,0xee,0x01,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_max3_num_f16_e64_dpp v5.l, v1.l, v2.l, ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2c,0xd6,0xe9,0x04,0xee,0x01,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_max3_num_f16_e64_dpp v5, v1, v2, ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2c,0xd6,0xe9,0x04,0xee,0x01,0x01,0x77,0x39,0x05]
 
 0x05,0x00,0x2c,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05
-# GFX12: v_max3_num_f16_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2c,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_max3_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2c,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_max3_num_f16_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2c,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_max3_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2c,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_max3_num_f16_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2c,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05]
 
 0x05,0x00,0x2c,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05
-# GFX12: v_max3_num_f16_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2c,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_max3_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2c,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_max3_num_f16_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2c,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_max3_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2c,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_max3_num_f16_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2c,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05]
 
 0x05,0x01,0x2c,0xd6,0xe9,0x04,0xf6,0x81,0x01,0x77,0x39,0x05
-# GFX12: v_max3_num_f16_e64_dpp v5, |v1|, v2, -m0 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x2c,0xd6,0xe9,0x04,0xf6,0x81,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_max3_num_f16_e64_dpp v5.l, |v1.l|, v2.l, -m0 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x2c,0xd6,0xe9,0x04,0xf6,0x81,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_max3_num_f16_e64_dpp v5, |v1|, v2, -m0 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x2c,0xd6,0xe9,0x04,0xf6,0x81,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_max3_num_f16_e64_dpp v5.l, |v1.l|, v2.l, -m0 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x2c,0xd6,0xe9,0x04,0xf6,0x81,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_max3_num_f16_e64_dpp v5, |v1|, v2, -m0 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x2c,0xd6,0xe9,0x04,0xf6,0x81,0x01,0x77,0x39,0x05]
 
 0x05,0x02,0x2c,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05
-# GFX12: v_max3_num_f16_e64_dpp v5, v1, -|v2|, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x2c,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_max3_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x2c,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_max3_num_f16_e64_dpp v5, v1, -|v2|, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x2c,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_max3_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x2c,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_max3_num_f16_e64_dpp v5, v1, -|v2|, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x2c,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05]
 
 0x05,0x7c,0x2c,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05
-# GFX12: v_max3_num_f16_e64_dpp v5, -v1, v2, |exec_lo| op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x7c,0x2c,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_max3_num_f16_e64_dpp v5.h, -v1.h, v2.h, |exec_lo| op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x7c,0x2c,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_max3_num_f16_e64_dpp v5, -v1, v2, |exec_lo| op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x7c,0x2c,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_max3_num_f16_e64_dpp v5.h, -v1.h, v2.h, |exec_lo| op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x7c,0x2c,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_max3_num_f16_e64_dpp v5, -v1, v2, |exec_lo| op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x7c,0x2c,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05]
 
 0x05,0x0b,0x2c,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05
-# GFX12: v_max3_num_f16_e64_dpp v5, -|v1|, -|v2|, null op_sel:[1,0,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x0b,0x2c,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_max3_num_f16_e64_dpp v5.l, -|v1.h|, -|v2.l|, null op_sel:[1,0,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x0b,0x2c,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_max3_num_f16_e64_dpp v5, -|v1|, -|v2|, null op_sel:[1,0,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x0b,0x2c,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_max3_num_f16_e64_dpp v5.l, -|v1.h|, -|v2.l|, null op_sel:[1,0,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x0b,0x2c,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_max3_num_f16_e64_dpp v5, -|v1|, -|v2|, null op_sel:[1,0,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x0b,0x2c,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05]
 
 0x05,0x15,0x2c,0xd6,0xe9,0x04,0x06,0xa3,0x01,0x77,0x39,0x05
-# GFX12: v_max3_num_f16_e64_dpp v5, -|v1|, v2, -|-1| op_sel:[0,1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x15,0x2c,0xd6,0xe9,0x04,0x06,0xa3,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_max3_num_f16_e64_dpp v5.l, -|v1.l|, v2.h, -|-1| op_sel:[0,1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x15,0x2c,0xd6,0xe9,0x04,0x06,0xa3,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_max3_num_f16_e64_dpp v5, -|v1|, v2, -|-1| op_sel:[0,1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x15,0x2c,0xd6,0xe9,0x04,0x06,0xa3,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_max3_num_f16_e64_dpp v5.l, -|v1.l|, v2.h, -|-1| op_sel:[0,1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x15,0x2c,0xd6,0xe9,0x04,0x06,0xa3,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_max3_num_f16_e64_dpp v5, -|v1|, v2, -|-1| op_sel:[0,1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x15,0x2c,0xd6,0xe9,0x04,0x06,0xa3,0x01,0x77,0x39,0x05]
 
 0x05,0x26,0x2c,0xd6,0xe9,0x04,0xc2,0xc3,0x01,0x77,0x39,0x05
-# GFX12: v_max3_num_f16_e64_dpp v5, v1, -|v2|, -|0.5| op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x26,0x2c,0xd6,0xe9,0x04,0xc2,0xc3,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_max3_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x26,0x2c,0xd6,0xe9,0x04,0xc2,0xc3,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_max3_num_f16_e64_dpp v5, v1, -|v2|, -|0.5| op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x26,0x2c,0xd6,0xe9,0x04,0xc2,0xc3,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_max3_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x26,0x2c,0xd6,0xe9,0x04,0xc2,0xc3,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_max3_num_f16_e64_dpp v5, v1, -|v2|, -|0.5| op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x26,0x2c,0xd6,0xe9,0x04,0xc2,0xc3,0x01,0x77,0x39,0x05]
 
 0xff,0xc7,0x2c,0xd6,0xea,0xfe,0xf7,0xe3,0xff,0x00,0x00,0x00
-# GFX12: v_max3_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc7,0x2c,0xd6,0xea,0xfe,0xf7,0xe3,0xff,0x00,0x00,0x00]
+# W32-REAL16: v_max3_num_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc7,0x2c,0xd6,0xea,0xfe,0xf7,0xe3,0xff,0x00,0x00,0x00]
+# W32-FAKE16: v_max3_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc7,0x2c,0xd6,0xea,0xfe,0xf7,0xe3,0xff,0x00,0x00,0x00]
+# W64-REAL16: v_max3_num_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc7,0x2c,0xd6,0xea,0xfe,0xf7,0xe3,0xff,0x00,0x00,0x00]
+# W64-FAKE16: v_max3_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc7,0x2c,0xd6,0xea,0xfe,0xf7,0xe3,0xff,0x00,0x00,0x00]
+
+0x05,0x78,0x2c,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05
+# W32-REAL16: v_max3_num_f16_e64_dpp v5.h, v1.h, v2.h, v3.h op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x78,0x2c,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_max3_num_f16_e64_dpp v5, v1, v2, v3 op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x78,0x2c,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_max3_num_f16_e64_dpp v5.h, v1.h, v2.h, v3.h op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x78,0x2c,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_max3_num_f16_e64_dpp v5, v1, v2, v3 op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x78,0x2c,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+
+0x05,0x20,0x2c,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05
+# W32-REAL16: v_max3_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.h op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x20,0x2c,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_max3_num_f16_e64_dpp v5, v1, v2, v255 op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x20,0x2c,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_max3_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.h op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x20,0x2c,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_max3_num_f16_e64_dpp v5, v1, v2, v255 op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x20,0x2c,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
+
+0x05,0x0a,0x2c,0xd6,0xe9,0x04,0x06,0x23,0x01,0x77,0x39,0x05
+# W32-REAL16: v_max3_num_f16_e64_dpp v5.l, -v1.h, |v2.l|, -1 op_sel:[1,0,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x0a,0x2c,0xd6,0xe9,0x04,0x06,0x23,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_max3_num_f16_e64_dpp v5, -v1, |v2|, -1 op_sel:[1,0,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x0a,0x2c,0xd6,0xe9,0x04,0x06,0x23,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_max3_num_f16_e64_dpp v5.l, -v1.h, |v2.l|, -1 op_sel:[1,0,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x0a,0x2c,0xd6,0xe9,0x04,0x06,0x23,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_max3_num_f16_e64_dpp v5, -v1, |v2|, -1 op_sel:[1,0,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x0a,0x2c,0xd6,0xe9,0x04,0x06,0x23,0x01,0x77,0x39,0x05]
+
+0x05,0x13,0x2c,0xd6,0xe9,0x04,0xc2,0x63,0x01,0x77,0x39,0x05
+# W32-REAL16: v_max3_num_f16_e64_dpp v5.l, -|v1.l|, -|v2.h|, 0.5 op_sel:[0,1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x13,0x2c,0xd6,0xe9,0x04,0xc2,0x63,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_max3_num_f16_e64_dpp v5, -|v1|, -|v2|, 0.5 op_sel:[0,1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x13,0x2c,0xd6,0xe9,0x04,0xc2,0x63,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_max3_num_f16_e64_dpp v5.l, -|v1.l|, -|v2.h|, 0.5 op_sel:[0,1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x13,0x2c,0xd6,0xe9,0x04,0xc2,0x63,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_max3_num_f16_e64_dpp v5, -|v1|, -|v2|, 0.5 op_sel:[0,1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x13,0x2c,0xd6,0xe9,0x04,0xc2,0x63,0x01,0x77,0x39,0x05]
+
 
 0x05,0x00,0x4d,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05
 # W32-REAL16: v_max3_i16_e64_dpp v5.l, v1.l, v2.l, v3.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x4d,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
@@ -3981,49 +4267,119 @@
 # W64-FAKE16: v_med3_u16_e64_dpp v255, v255, v255, src_scc op_sel:[0,0,0,1] dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x40,0x51,0xd6,0xea,0xfe,0xf7,0x03,0xff,0x00,0x00,0x00]
 
 0x05,0x00,0x2b,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05
-# GFX12: v_min3_num_f16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2b,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_min3_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2b,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_min3_num_f16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2b,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_min3_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2b,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_min3_num_f16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2b,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
 
 0x05,0x00,0x2b,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05
-# GFX12: v_min3_num_f16_e64_dpp v5, v1, s3, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2b,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_min3_num_f16_e64_dpp v5.l, v1.l, s3, v3.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2b,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_min3_num_f16_e64_dpp v5, v1, s3, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2b,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_min3_num_f16_e64_dpp v5.l, v1.l, s3, v3.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2b,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_min3_num_f16_e64_dpp v5, v1, s3, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2b,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05]
 
 0x05,0x00,0x2b,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05
-# GFX12: v_min3_num_f16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2b,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_min3_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2b,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_min3_num_f16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2b,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_min3_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2b,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_min3_num_f16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2b,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
 
 0x05,0x00,0x2b,0xd6,0xe9,0x04,0x0e,0x00,0x01,0x77,0x39,0x05
-# GFX12: v_min3_num_f16_e64_dpp v5, v1, v2, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2b,0xd6,0xe9,0x04,0x0e,0x00,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_min3_num_f16_e64_dpp v5.l, v1.l, v2.l, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2b,0xd6,0xe9,0x04,0x0e,0x00,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_min3_num_f16_e64_dpp v5, v1, v2, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2b,0xd6,0xe9,0x04,0x0e,0x00,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_min3_num_f16_e64_dpp v5.l, v1.l, v2.l, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2b,0xd6,0xe9,0x04,0x0e,0x00,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_min3_num_f16_e64_dpp v5, v1, v2, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2b,0xd6,0xe9,0x04,0x0e,0x00,0x01,0x77,0x39,0x05]
 
 0x05,0x00,0x2b,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05
-# GFX12: v_min3_num_f16_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2b,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_min3_num_f16_e64_dpp v5.l, v1.l, v2.l, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2b,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_min3_num_f16_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2b,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_min3_num_f16_e64_dpp v5.l, v1.l, v2.l, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2b,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_min3_num_f16_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2b,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05]
 
 0x05,0x00,0x2b,0xd6,0xe9,0x04,0xee,0x01,0x01,0x77,0x39,0x05
-# GFX12: v_min3_num_f16_e64_dpp v5, v1, v2, ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2b,0xd6,0xe9,0x04,0xee,0x01,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_min3_num_f16_e64_dpp v5.l, v1.l, v2.l, ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2b,0xd6,0xe9,0x04,0xee,0x01,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_min3_num_f16_e64_dpp v5, v1, v2, ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2b,0xd6,0xe9,0x04,0xee,0x01,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_min3_num_f16_e64_dpp v5.l, v1.l, v2.l, ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2b,0xd6,0xe9,0x04,0xee,0x01,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_min3_num_f16_e64_dpp v5, v1, v2, ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2b,0xd6,0xe9,0x04,0xee,0x01,0x01,0x77,0x39,0x05]
 
 0x05,0x00,0x2b,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05
-# GFX12: v_min3_num_f16_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2b,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_min3_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2b,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_min3_num_f16_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2b,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_min3_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2b,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_min3_num_f16_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2b,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05]
 
 0x05,0x00,0x2b,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05
-# GFX12: v_min3_num_f16_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2b,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_min3_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2b,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_min3_num_f16_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2b,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_min3_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2b,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_min3_num_f16_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2b,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05]
 
 0x05,0x01,0x2b,0xd6,0xe9,0x04,0xf6,0x81,0x01,0x77,0x39,0x05
-# GFX12: v_min3_num_f16_e64_dpp v5, |v1|, v2, -m0 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x2b,0xd6,0xe9,0x04,0xf6,0x81,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_min3_num_f16_e64_dpp v5.l, |v1.l|, v2.l, -m0 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x2b,0xd6,0xe9,0x04,0xf6,0x81,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_min3_num_f16_e64_dpp v5, |v1|, v2, -m0 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x2b,0xd6,0xe9,0x04,0xf6,0x81,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_min3_num_f16_e64_dpp v5.l, |v1.l|, v2.l, -m0 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x2b,0xd6,0xe9,0x04,0xf6,0x81,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_min3_num_f16_e64_dpp v5, |v1|, v2, -m0 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x2b,0xd6,0xe9,0x04,0xf6,0x81,0x01,0x77,0x39,0x05]
 
 0x05,0x02,0x2b,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05
-# GFX12: v_min3_num_f16_e64_dpp v5, v1, -|v2|, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x2b,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_min3_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x2b,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_min3_num_f16_e64_dpp v5, v1, -|v2|, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x2b,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_min3_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x2b,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_min3_num_f16_e64_dpp v5, v1, -|v2|, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x2b,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05]
 
 0x05,0x7c,0x2b,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05
-# GFX12: v_min3_num_f16_e64_dpp v5, -v1, v2, |exec_lo| op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x7c,0x2b,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_min3_num_f16_e64_dpp v5.h, -v1.h, v2.h, |exec_lo| op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x7c,0x2b,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_min3_num_f16_e64_dpp v5, -v1, v2, |exec_lo| op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x7c,0x2b,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_min3_num_f16_e64_dpp v5.h, -v1.h, v2.h, |exec_lo| op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x7c,0x2b,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_min3_num_f16_e64_dpp v5, -v1, v2, |exec_lo| op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x7c,0x2b,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05]
 
 0x05,0x0b,0x2b,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05
-# GFX12: v_min3_num_f16_e64_dpp v5, -|v1|, -|v2|, null op_sel:[1,0,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x0b,0x2b,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_min3_num_f16_e64_dpp v5.l, -|v1.h|, -|v2.l|, null op_sel:[1,0,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x0b,0x2b,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_min3_num_f16_e64_dpp v5, -|v1|, -|v2|, null op_sel:[1,0,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x0b,0x2b,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_min3_num_f16_e64_dpp v5.l, -|v1.h|, -|v2.l|, null op_sel:[1,0,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x0b,0x2b,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_min3_num_f16_e64_dpp v5, -|v1|, -|v2|, null op_sel:[1,0,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x0b,0x2b,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05]
 
 0x05,0x15,0x2b,0xd6,0xe9,0x04,0x06,0xa3,0x01,0x77,0x39,0x05
-# GFX12: v_min3_num_f16_e64_dpp v5, -|v1|, v2, -|-1| op_sel:[0,1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x15,0x2b,0xd6,0xe9,0x04,0x06,0xa3,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_min3_num_f16_e64_dpp v5.l, -|v1.l|, v2.h, -|-1| op_sel:[0,1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x15,0x2b,0xd6,0xe9,0x04,0x06,0xa3,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_min3_num_f16_e64_dpp v5, -|v1|, v2, -|-1| op_sel:[0,1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x15,0x2b,0xd6,0xe9,0x04,0x06,0xa3,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_min3_num_f16_e64_dpp v5.l, -|v1.l|, v2.h, -|-1| op_sel:[0,1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x15,0x2b,0xd6,0xe9,0x04,0x06,0xa3,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_min3_num_f16_e64_dpp v5, -|v1|, v2, -|-1| op_sel:[0,1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x15,0x2b,0xd6,0xe9,0x04,0x06,0xa3,0x01,0x77,0x39,0x05]
 
 0x05,0x26,0x2b,0xd6,0xe9,0x04,0xc2,0xc3,0x01,0x77,0x39,0x05
-# GFX12: v_min3_num_f16_e64_dpp v5, v1, -|v2|, -|0.5| op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x26,0x2b,0xd6,0xe9,0x04,0xc2,0xc3,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_min3_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x26,0x2b,0xd6,0xe9,0x04,0xc2,0xc3,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_min3_num_f16_e64_dpp v5, v1, -|v2|, -|0.5| op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x26,0x2b,0xd6,0xe9,0x04,0xc2,0xc3,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_min3_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x26,0x2b,0xd6,0xe9,0x04,0xc2,0xc3,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_min3_num_f16_e64_dpp v5, v1, -|v2|, -|0.5| op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x26,0x2b,0xd6,0xe9,0x04,0xc2,0xc3,0x01,0x77,0x39,0x05]
 
 0xff,0xc7,0x2b,0xd6,0xea,0xfe,0xf7,0xe3,0xff,0x00,0x00,0x00
-# GFX12: v_min3_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc7,0x2b,0xd6,0xea,0xfe,0xf7,0xe3,0xff,0x00,0x00,0x00]
+# W32-REAL16: v_min3_num_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc7,0x2b,0xd6,0xea,0xfe,0xf7,0xe3,0xff,0x00,0x00,0x00]
+# W32-FAKE16: v_min3_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc7,0x2b,0xd6,0xea,0xfe,0xf7,0xe3,0xff,0x00,0x00,0x00]
+# W64-REAL16: v_min3_num_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc7,0x2b,0xd6,0xea,0xfe,0xf7,0xe3,0xff,0x00,0x00,0x00]
+# W64-FAKE16: v_min3_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc7,0x2b,0xd6,0xea,0xfe,0xf7,0xe3,0xff,0x00,0x00,0x00]
+
+0x05,0x78,0x2b,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05
+# W32-REAL16: v_min3_num_f16_e64_dpp v5.h, v1.h, v2.h, v3.h op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x78,0x2b,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_min3_num_f16_e64_dpp v5, v1, v2, v3 op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x78,0x2b,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_min3_num_f16_e64_dpp v5.h, v1.h, v2.h, v3.h op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x78,0x2b,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_min3_num_f16_e64_dpp v5, v1, v2, v3 op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x78,0x2b,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+
+0x05,0x20,0x2b,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05
+# W32-REAL16: v_min3_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.h op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x20,0x2b,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_min3_num_f16_e64_dpp v5, v1, v2, v255 op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x20,0x2b,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_min3_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.h op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x20,0x2b,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_min3_num_f16_e64_dpp v5, v1, v2, v255 op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x20,0x2b,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
+
+0x05,0x0a,0x2b,0xd6,0xe9,0x04,0x06,0x23,0x01,0x77,0x39,0x05
+# W32-REAL16: v_min3_num_f16_e64_dpp v5.l, -v1.h, |v2.l|, -1 op_sel:[1,0,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x0a,0x2b,0xd6,0xe9,0x04,0x06,0x23,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_min3_num_f16_e64_dpp v5, -v1, |v2|, -1 op_sel:[1,0,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x0a,0x2b,0xd6,0xe9,0x04,0x06,0x23,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_min3_num_f16_e64_dpp v5.l, -v1.h, |v2.l|, -1 op_sel:[1,0,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x0a,0x2b,0xd6,0xe9,0x04,0x06,0x23,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_min3_num_f16_e64_dpp v5, -v1, |v2|, -1 op_sel:[1,0,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x0a,0x2b,0xd6,0xe9,0x04,0x06,0x23,0x01,0x77,0x39,0x05]
+
+0x05,0x13,0x2b,0xd6,0xe9,0x04,0xc2,0x63,0x01,0x77,0x39,0x05
+# W32-REAL16: v_min3_num_f16_e64_dpp v5.l, -|v1.l|, -|v2.h|, 0.5 op_sel:[0,1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x13,0x2b,0xd6,0xe9,0x04,0xc2,0x63,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_min3_num_f16_e64_dpp v5, -|v1|, -|v2|, 0.5 op_sel:[0,1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x13,0x2b,0xd6,0xe9,0x04,0xc2,0x63,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_min3_num_f16_e64_dpp v5.l, -|v1.l|, -|v2.h|, 0.5 op_sel:[0,1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x13,0x2b,0xd6,0xe9,0x04,0xc2,0x63,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_min3_num_f16_e64_dpp v5, -|v1|, -|v2|, 0.5 op_sel:[0,1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x13,0x2b,0xd6,0xe9,0x04,0xc2,0x63,0x01,0x77,0x39,0x05]
+
 
 0x05,0x00,0x4a,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05
 # W32-REAL16: v_min3_i16_e64_dpp v5.l, v1.l, v2.l, v3.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x4a,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1.txt
index bb9f607..25c4e4a 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1.txt
@@ -279,49 +279,76 @@
 # GFX12: v_clz_i32_u32_e64 v255, 0xaf123456      ; encoding: [0xff,0x00,0xb9,0xd5,0xff,0x00,0x00,0x00,0x56,0x34,0x12,0xaf]
 
 0x05,0x00,0xe1,0xd5,0x01,0x01,0x00,0x00
-# GFX12: v_cos_f16_e64 v5, v1                    ; encoding: [0x05,0x00,0xe1,0xd5,0x01,0x01,0x00,0x00]
+# GFX12-REAL16: v_cos_f16_e64 v5.l, v1.l                ; encoding: [0x05,0x00,0xe1,0xd5,0x01,0x01,0x00,0x00]
+# GFX12-FAKE16: v_cos_f16_e64 v5, v1                    ; encoding: [0x05,0x00,0xe1,0xd5,0x01,0x01,0x00,0x00]
 
 0x05,0x00,0xe1,0xd5,0xff,0x01,0x00,0x00
-# GFX12: v_cos_f16_e64 v5, v255                  ; encoding: [0x05,0x00,0xe1,0xd5,0xff,0x01,0x00,0x00]
+# GFX12-REAL16: v_cos_f16_e64 v5.l, v255.l              ; encoding: [0x05,0x00,0xe1,0xd5,0xff,0x01,0x00,0x00]
+# GFX12-FAKE16: v_cos_f16_e64 v5, v255                  ; encoding: [0x05,0x00,0xe1,0xd5,0xff,0x01,0x00,0x00]
 
 0x05,0x00,0xe1,0xd5,0x01,0x00,0x00,0x00
-# GFX12: v_cos_f16_e64 v5, s1                    ; encoding: [0x05,0x00,0xe1,0xd5,0x01,0x00,0x00,0x00]
+# GFX12-REAL16: v_cos_f16_e64 v5.l, s1                  ; encoding: [0x05,0x00,0xe1,0xd5,0x01,0x00,0x00,0x00]
+# GFX12-FAKE16: v_cos_f16_e64 v5, s1                    ; encoding: [0x05,0x00,0xe1,0xd5,0x01,0x00,0x00,0x00]
 
 0x05,0x00,0xe1,0xd5,0x69,0x00,0x00,0x00
-# GFX12: v_cos_f16_e64 v5, s105                  ; encoding: [0x05,0x00,0xe1,0xd5,0x69,0x00,0x00,0x00]
+# GFX12-REAL16: v_cos_f16_e64 v5.l, s105                ; encoding: [0x05,0x00,0xe1,0xd5,0x69,0x00,0x00,0x00]
+# GFX12-FAKE16: v_cos_f16_e64 v5, s105                  ; encoding: [0x05,0x00,0xe1,0xd5,0x69,0x00,0x00,0x00]
 
 0x05,0x00,0xe1,0xd5,0x6a,0x00,0x00,0x00
-# GFX12: v_cos_f16_e64 v5, vcc_lo                ; encoding: [0x05,0x00,0xe1,0xd5,0x6a,0x00,0x00,0x00]
+# GFX12-REAL16: v_cos_f16_e64 v5.l, vcc_lo              ; encoding: [0x05,0x00,0xe1,0xd5,0x6a,0x00,0x00,0x00]
+# GFX12-FAKE16: v_cos_f16_e64 v5, vcc_lo                ; encoding: [0x05,0x00,0xe1,0xd5,0x6a,0x00,0x00,0x00]
 
 0x05,0x00,0xe1,0xd5,0x6b,0x00,0x00,0x00
-# GFX12: v_cos_f16_e64 v5, vcc_hi                ; encoding: [0x05,0x00,0xe1,0xd5,0x6b,0x00,0x00,0x00]
+# GFX12-REAL16: v_cos_f16_e64 v5.l, vcc_hi              ; encoding: [0x05,0x00,0xe1,0xd5,0x6b,0x00,0x00,0x00]
+# GFX12-FAKE16: v_cos_f16_e64 v5, vcc_hi                ; encoding: [0x05,0x00,0xe1,0xd5,0x6b,0x00,0x00,0x00]
 
 0x05,0x00,0xe1,0xd5,0x7b,0x00,0x00,0x00
-# GFX12: v_cos_f16_e64 v5, ttmp15                ; encoding: [0x05,0x00,0xe1,0xd5,0x7b,0x00,0x00,0x00]
+# GFX12-REAL16: v_cos_f16_e64 v5.l, ttmp15              ; encoding: [0x05,0x00,0xe1,0xd5,0x7b,0x00,0x00,0x00]
+# GFX12-FAKE16: v_cos_f16_e64 v5, ttmp15                ; encoding: [0x05,0x00,0xe1,0xd5,0x7b,0x00,0x00,0x00]
 
 0x05,0x00,0xe1,0xd5,0x7d,0x00,0x00,0x00
-# GFX12: v_cos_f16_e64 v5, m0                    ; encoding: [0x05,0x00,0xe1,0xd5,0x7d,0x00,0x00,0x00]
+# GFX12-REAL16: v_cos_f16_e64 v5.l, m0                  ; encoding: [0x05,0x00,0xe1,0xd5,0x7d,0x00,0x00,0x00]
+# GFX12-FAKE16: v_cos_f16_e64 v5, m0                    ; encoding: [0x05,0x00,0xe1,0xd5,0x7d,0x00,0x00,0x00]
 
 0x05,0x00,0xe1,0xd5,0x7e,0x00,0x00,0x00
-# GFX12: v_cos_f16_e64 v5, exec_lo               ; encoding: [0x05,0x00,0xe1,0xd5,0x7e,0x00,0x00,0x00]
+# GFX12-REAL16: v_cos_f16_e64 v5.l, exec_lo             ; encoding: [0x05,0x00,0xe1,0xd5,0x7e,0x00,0x00,0x00]
+# GFX12-FAKE16: v_cos_f16_e64 v5, exec_lo               ; encoding: [0x05,0x00,0xe1,0xd5,0x7e,0x00,0x00,0x00]
 
 0x05,0x00,0xe1,0xd5,0x7f,0x00,0x00,0x00
-# GFX12: v_cos_f16_e64 v5, exec_hi               ; encoding: [0x05,0x00,0xe1,0xd5,0x7f,0x00,0x00,0x00]
+# GFX12-REAL16: v_cos_f16_e64 v5.l, exec_hi             ; encoding: [0x05,0x00,0xe1,0xd5,0x7f,0x00,0x00,0x00]
+# GFX12-FAKE16: v_cos_f16_e64 v5, exec_hi               ; encoding: [0x05,0x00,0xe1,0xd5,0x7f,0x00,0x00,0x00]
 
 0x05,0x00,0xe1,0xd5,0x7c,0x00,0x00,0x00
-# GFX12: v_cos_f16_e64 v5, null                  ; encoding: [0x05,0x00,0xe1,0xd5,0x7c,0x00,0x00,0x00]
+# GFX12-REAL16: v_cos_f16_e64 v5.l, null                ; encoding: [0x05,0x00,0xe1,0xd5,0x7c,0x00,0x00,0x00]
+# GFX12-FAKE16: v_cos_f16_e64 v5, null                  ; encoding: [0x05,0x00,0xe1,0xd5,0x7c,0x00,0x00,0x00]
 
 0x05,0x00,0xe1,0xd5,0xc1,0x00,0x00,0x00
-# GFX12: v_cos_f16_e64 v5, -1                    ; encoding: [0x05,0x00,0xe1,0xd5,0xc1,0x00,0x00,0x00]
+# GFX12-REAL16: v_cos_f16_e64 v5.l, -1                  ; encoding: [0x05,0x00,0xe1,0xd5,0xc1,0x00,0x00,0x00]
+# GFX12-FAKE16: v_cos_f16_e64 v5, -1                    ; encoding: [0x05,0x00,0xe1,0xd5,0xc1,0x00,0x00,0x00]
 
 0x05,0x00,0xe1,0xd5,0xf0,0x00,0x00,0x08
-# GFX12: v_cos_f16_e64 v5, 0.5 mul:2             ; encoding: [0x05,0x00,0xe1,0xd5,0xf0,0x00,0x00,0x08]
+# GFX12-REAL16: v_cos_f16_e64 v5.l, 0.5 mul:2           ; encoding: [0x05,0x00,0xe1,0xd5,0xf0,0x00,0x00,0x08]
+# GFX12-FAKE16: v_cos_f16_e64 v5, 0.5 mul:2             ; encoding: [0x05,0x00,0xe1,0xd5,0xf0,0x00,0x00,0x08]
 
 0x05,0x00,0xe1,0xd5,0xfd,0x00,0x00,0x10
-# GFX12: v_cos_f16_e64 v5, src_scc mul:4         ; encoding: [0x05,0x00,0xe1,0xd5,0xfd,0x00,0x00,0x10]
+# GFX12-REAL16: v_cos_f16_e64 v5.l, src_scc mul:4       ; encoding: [0x05,0x00,0xe1,0xd5,0xfd,0x00,0x00,0x10]
+# GFX12-FAKE16: v_cos_f16_e64 v5, src_scc mul:4         ; encoding: [0x05,0x00,0xe1,0xd5,0xfd,0x00,0x00,0x10]
 
 0xff,0x81,0xe1,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00
-# GFX12: v_cos_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xe1,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00]
+# GFX12-REAL16: v_cos_f16_e64 v255.l, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xe1,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00]
+# GFX12-FAKE16: v_cos_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xe1,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00]
+
+0x05,0x48,0xe1,0xd5,0x01,0x01,0x00,0x00
+# GFX12-REAL16: v_cos_f16_e64 v5.h, v1.h op_sel:[1,1]   ; encoding: [0x05,0x48,0xe1,0xd5,0x01,0x01,0x00,0x00]
+# GFX12-FAKE16: v_cos_f16_e64 v5, v1                    ; encoding: [0x05,0x00,0xe1,0xd5,0x01,0x01,0x00,0x00]
+
+0x05,0x08,0xe1,0xd5,0xff,0x01,0x00,0x00
+# GFX12-REAL16: v_cos_f16_e64 v5.l, v255.h op_sel:[1,0] ; encoding: [0x05,0x08,0xe1,0xd5,0xff,0x01,0x00,0x00]
+# GFX12-FAKE16: v_cos_f16_e64 v5, v255                  ; encoding: [0x05,0x00,0xe1,0xd5,0xff,0x01,0x00,0x00]
+
+0xff,0xc1,0xe1,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00
+# GFX12-REAL16: v_cos_f16_e64 v255.h, -|0xfe0b| op_sel:[0,1] clamp div:2 ; encoding: [0xff,0xc1,0xe1,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00]
+# GFX12-FAKE16: v_cos_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xe1,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00]
 
 0x05,0x00,0xb6,0xd5,0x01,0x01,0x00,0x00
 # GFX12: v_cos_f32_e64 v5, v1                    ; encoding: [0x05,0x00,0xb6,0xd5,0x01,0x01,0x00,0x00]
@@ -1366,10 +1393,12 @@
 # GFX12: v_cvt_i32_f64_e64 v255, 0xaf123456 clamp ; encoding: [0xff,0x80,0x83,0xd5,0xff,0x00,0x00,0x00,0x56,0x34,0x12,0xaf]
 
 0x05,0x00,0xea,0xd5,0x01,0x01,0x00,0x00
-# GFX12: v_cvt_i32_i16_e64 v5, v1                ; encoding: [0x05,0x00,0xea,0xd5,0x01,0x01,0x00,0x00]
+# GFX12-REAL16: v_cvt_i32_i16_e64 v5, v1.l              ; encoding: [0x05,0x00,0xea,0xd5,0x01,0x01,0x00,0x00]
+# GFX12-FAKE16: v_cvt_i32_i16_e64 v5, v1                ; encoding: [0x05,0x00,0xea,0xd5,0x01,0x01,0x00,0x00]
 
 0x05,0x00,0xea,0xd5,0xff,0x01,0x00,0x00
-# GFX12: v_cvt_i32_i16_e64 v5, v255              ; encoding: [0x05,0x00,0xea,0xd5,0xff,0x01,0x00,0x00]
+# GFX12-REAL16: v_cvt_i32_i16_e64 v5, v255.l            ; encoding: [0x05,0x00,0xea,0xd5,0xff,0x01,0x00,0x00]
+# GFX12-FAKE16: v_cvt_i32_i16_e64 v5, v255              ; encoding: [0x05,0x00,0xea,0xd5,0xff,0x01,0x00,0x00]
 
 0x05,0x00,0xea,0xd5,0x01,0x00,0x00,0x00
 # GFX12: v_cvt_i32_i16_e64 v5, s1                ; encoding: [0x05,0x00,0xea,0xd5,0x01,0x00,0x00,0x00]
@@ -1410,6 +1439,10 @@
 0xff,0x00,0xea,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00
 # GFX12: v_cvt_i32_i16_e64 v255, 0xfe0b          ; encoding: [0xff,0x00,0xea,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00]
 
+0x05,0x08,0xea,0xd5,0xff,0x01,0x00,0x00
+# GFX12-REAL16: v_cvt_i32_i16_e64 v5, v255.h op_sel:[1,0] ; encoding: [0x05,0x08,0xea,0xd5,0xff,0x01,0x00,0x00]
+# GFX12-FAKE16: v_cvt_i32_i16_e64 v5, v255              ; encoding: [0x05,0x00,0xea,0xd5,0xff,0x01,0x00,0x00]
+
 0x05,0x00,0x8c,0xd5,0x01,0x01,0x00,0x00
 # GFX12: v_cvt_nearest_i32_f32_e64 v5, v1        ; encoding: [0x05,0x00,0x8c,0xd5,0x01,0x01,0x00,0x00]
 
@@ -1798,10 +1831,12 @@
 # GFX12: v_cvt_u32_f64_e64 v255, 0xaf123456 clamp ; encoding: [0xff,0x80,0x95,0xd5,0xff,0x00,0x00,0x00,0x56,0x34,0x12,0xaf]
 
 0x05,0x00,0xeb,0xd5,0x01,0x01,0x00,0x00
-# GFX12: v_cvt_u32_u16_e64 v5, v1                ; encoding: [0x05,0x00,0xeb,0xd5,0x01,0x01,0x00,0x00]
+# GFX12-REAL16: v_cvt_u32_u16_e64 v5, v1.l              ; encoding: [0x05,0x00,0xeb,0xd5,0x01,0x01,0x00,0x00]
+# GFX12-FAKE16: v_cvt_u32_u16_e64 v5, v1                ; encoding: [0x05,0x00,0xeb,0xd5,0x01,0x01,0x00,0x00]
 
 0x05,0x00,0xeb,0xd5,0xff,0x01,0x00,0x00
-# GFX12: v_cvt_u32_u16_e64 v5, v255              ; encoding: [0x05,0x00,0xeb,0xd5,0xff,0x01,0x00,0x00]
+# GFX12-REAL16: v_cvt_u32_u16_e64 v5, v255.l            ; encoding: [0x05,0x00,0xeb,0xd5,0xff,0x01,0x00,0x00]
+# GFX12-FAKE16: v_cvt_u32_u16_e64 v5, v255              ; encoding: [0x05,0x00,0xeb,0xd5,0xff,0x01,0x00,0x00]
 
 0x05,0x00,0xeb,0xd5,0x01,0x00,0x00,0x00
 # GFX12: v_cvt_u32_u16_e64 v5, s1                ; encoding: [0x05,0x00,0xeb,0xd5,0x01,0x00,0x00,0x00]
@@ -1842,6 +1877,10 @@
 0xff,0x00,0xeb,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00
 # GFX12: v_cvt_u32_u16_e64 v255, 0xfe0b          ; encoding: [0xff,0x00,0xeb,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00]
 
+0x05,0x08,0xeb,0xd5,0xff,0x01,0x00,0x00
+# GFX12-REAL16: v_cvt_u32_u16_e64 v5, v255.h op_sel:[1,0] ; encoding: [0x05,0x08,0xeb,0xd5,0xff,0x01,0x00,0x00]
+# GFX12-FAKE16: v_cvt_u32_u16_e64 v5, v255              ; encoding: [0x05,0x00,0xeb,0xd5,0xff,0x01,0x00,0x00]
+
 0x05,0x00,0xd8,0xd5,0x01,0x01,0x00,0x00
 # GFX12-REAL16: v_exp_f16_e64 v5.l, v1.l                ; encoding: [0x05,0x00,0xd8,0xd5,0x01,0x01,0x00,0x00]
 # GFX12-FAKE16: v_exp_f16_e64 v5, v1                    ; encoding: [0x05,0x00,0xd8,0xd5,0x01,0x01,0x00,0x00]
@@ -2089,50 +2128,78 @@
 # GFX12: v_floor_f64_e64 v[254:255], 0xaf123456 clamp div:2 ; encoding: [0xfe,0x80,0x9a,0xd5,0xff,0x00,0x00,0x18,0x56,0x34,0x12,0xaf]
 
 0x05,0x00,0xdf,0xd5,0x01,0x01,0x00,0x00
-# GFX12: v_fract_f16_e64 v5, v1                  ; encoding: [0x05,0x00,0xdf,0xd5,0x01,0x01,0x00,0x00]
+# GFX12-REAL16: v_fract_f16_e64 v5.l, v1.l              ; encoding: [0x05,0x00,0xdf,0xd5,0x01,0x01,0x00,0x00]
+# GFX12-FAKE16: v_fract_f16_e64 v5, v1                  ; encoding: [0x05,0x00,0xdf,0xd5,0x01,0x01,0x00,0x00]
 
 0x05,0x00,0xdf,0xd5,0xff,0x01,0x00,0x00
-# GFX12: v_fract_f16_e64 v5, v255                ; encoding: [0x05,0x00,0xdf,0xd5,0xff,0x01,0x00,0x00]
+# GFX12-REAL16: v_fract_f16_e64 v5.l, v255.l            ; encoding: [0x05,0x00,0xdf,0xd5,0xff,0x01,0x00,0x00]
+# GFX12-FAKE16: v_fract_f16_e64 v5, v255                ; encoding: [0x05,0x00,0xdf,0xd5,0xff,0x01,0x00,0x00]
 
 0x05,0x00,0xdf,0xd5,0x01,0x00,0x00,0x00
-# GFX12: v_fract_f16_e64 v5, s1                  ; encoding: [0x05,0x00,0xdf,0xd5,0x01,0x00,0x00,0x00]
+# GFX12-REAL16: v_fract_f16_e64 v5.l, s1                ; encoding: [0x05,0x00,0xdf,0xd5,0x01,0x00,0x00,0x00]
+# GFX12-FAKE16: v_fract_f16_e64 v5, s1                  ; encoding: [0x05,0x00,0xdf,0xd5,0x01,0x00,0x00,0x00]
 
 0x05,0x00,0xdf,0xd5,0x69,0x00,0x00,0x00
-# GFX12: v_fract_f16_e64 v5, s105                ; encoding: [0x05,0x00,0xdf,0xd5,0x69,0x00,0x00,0x00]
+# GFX12-REAL16: v_fract_f16_e64 v5.l, s105              ; encoding: [0x05,0x00,0xdf,0xd5,0x69,0x00,0x00,0x00]
+# GFX12-FAKE16: v_fract_f16_e64 v5, s105                ; encoding: [0x05,0x00,0xdf,0xd5,0x69,0x00,0x00,0x00]
 
 0x05,0x00,0xdf,0xd5,0x6a,0x00,0x00,0x00
-# GFX12: v_fract_f16_e64 v5, vcc_lo              ; encoding: [0x05,0x00,0xdf,0xd5,0x6a,0x00,0x00,0x00]
+# GFX12-REAL16: v_fract_f16_e64 v5.l, vcc_lo            ; encoding: [0x05,0x00,0xdf,0xd5,0x6a,0x00,0x00,0x00]
+# GFX12-FAKE16: v_fract_f16_e64 v5, vcc_lo              ; encoding: [0x05,0x00,0xdf,0xd5,0x6a,0x00,0x00,0x00]
 
 0x05,0x00,0xdf,0xd5,0x6b,0x00,0x00,0x00
-# GFX12: v_fract_f16_e64 v5, vcc_hi              ; encoding: [0x05,0x00,0xdf,0xd5,0x6b,0x00,0x00,0x00]
+# GFX12-REAL16: v_fract_f16_e64 v5.l, vcc_hi            ; encoding: [0x05,0x00,0xdf,0xd5,0x6b,0x00,0x00,0x00]
+# GFX12-FAKE16: v_fract_f16_e64 v5, vcc_hi              ; encoding: [0x05,0x00,0xdf,0xd5,0x6b,0x00,0x00,0x00]
 
 0x05,0x00,0xdf,0xd5,0x7b,0x00,0x00,0x00
-# GFX12: v_fract_f16_e64 v5, ttmp15              ; encoding: [0x05,0x00,0xdf,0xd5,0x7b,0x00,0x00,0x00]
+# GFX12-REAL16: v_fract_f16_e64 v5.l, ttmp15            ; encoding: [0x05,0x00,0xdf,0xd5,0x7b,0x00,0x00,0x00]
+# GFX12-FAKE16: v_fract_f16_e64 v5, ttmp15              ; encoding: [0x05,0x00,0xdf,0xd5,0x7b,0x00,0x00,0x00]
 
 0x05,0x00,0xdf,0xd5,0x7d,0x00,0x00,0x00
-# GFX12: v_fract_f16_e64 v5, m0                  ; encoding: [0x05,0x00,0xdf,0xd5,0x7d,0x00,0x00,0x00]
+# GFX12-REAL16: v_fract_f16_e64 v5.l, m0                ; encoding: [0x05,0x00,0xdf,0xd5,0x7d,0x00,0x00,0x00]
+# GFX12-FAKE16: v_fract_f16_e64 v5, m0                  ; encoding: [0x05,0x00,0xdf,0xd5,0x7d,0x00,0x00,0x00]
 
 0x05,0x00,0xdf,0xd5,0x7e,0x00,0x00,0x00
-# GFX12: v_fract_f16_e64 v5, exec_lo             ; encoding: [0x05,0x00,0xdf,0xd5,0x7e,0x00,0x00,0x00]
+# GFX12-REAL16: v_fract_f16_e64 v5.l, exec_lo           ; encoding: [0x05,0x00,0xdf,0xd5,0x7e,0x00,0x00,0x00]
+# GFX12-FAKE16: v_fract_f16_e64 v5, exec_lo             ; encoding: [0x05,0x00,0xdf,0xd5,0x7e,0x00,0x00,0x00]
 
 0x05,0x00,0xdf,0xd5,0x7f,0x00,0x00,0x00
-# GFX12: v_fract_f16_e64 v5, exec_hi             ; encoding: [0x05,0x00,0xdf,0xd5,0x7f,0x00,0x00,0x00]
+# GFX12-REAL16: v_fract_f16_e64 v5.l, exec_hi           ; encoding: [0x05,0x00,0xdf,0xd5,0x7f,0x00,0x00,0x00]
+# GFX12-FAKE16: v_fract_f16_e64 v5, exec_hi             ; encoding: [0x05,0x00,0xdf,0xd5,0x7f,0x00,0x00,0x00]
 
 0x05,0x00,0xdf,0xd5,0x7c,0x00,0x00,0x00
-# GFX12: v_fract_f16_e64 v5, null                ; encoding: [0x05,0x00,0xdf,0xd5,0x7c,0x00,0x00,0x00]
+# GFX12-REAL16: v_fract_f16_e64 v5.l, null              ; encoding: [0x05,0x00,0xdf,0xd5,0x7c,0x00,0x00,0x00]
+# GFX12-FAKE16: v_fract_f16_e64 v5, null                ; encoding: [0x05,0x00,0xdf,0xd5,0x7c,0x00,0x00,0x00]
 
 0x05,0x00,0xdf,0xd5,0xc1,0x00,0x00,0x00
-# GFX12: v_fract_f16_e64 v5, -1                  ; encoding: [0x05,0x00,0xdf,0xd5,0xc1,0x00,0x00,0x00]
+# GFX12-REAL16: v_fract_f16_e64 v5.l, -1                ; encoding: [0x05,0x00,0xdf,0xd5,0xc1,0x00,0x00,0x00]
+# GFX12-FAKE16: v_fract_f16_e64 v5, -1                  ; encoding: [0x05,0x00,0xdf,0xd5,0xc1,0x00,0x00,0x00]
 
 0x05,0x00,0xdf,0xd5,0xf0,0x00,0x00,0x08
-# GFX12: v_fract_f16_e64 v5, 0.5 mul:2           ; encoding: [0x05,0x00,0xdf,0xd5,0xf0,0x00,0x00,0x08]
+# GFX12-REAL16: v_fract_f16_e64 v5.l, 0.5 mul:2         ; encoding: [0x05,0x00,0xdf,0xd5,0xf0,0x00,0x00,0x08]
+# GFX12-FAKE16: v_fract_f16_e64 v5, 0.5 mul:2           ; encoding: [0x05,0x00,0xdf,0xd5,0xf0,0x00,0x00,0x08]
 
 0x05,0x00,0xdf,0xd5,0xfd,0x00,0x00,0x10
-# GFX12: v_fract_f16_e64 v5, src_scc mul:4       ; encoding: [0x05,0x00,0xdf,0xd5,0xfd,0x00,0x00,0x10]
+# GFX12-REAL16: v_fract_f16_e64 v5.l, src_scc mul:4     ; encoding: [0x05,0x00,0xdf,0xd5,0xfd,0x00,0x00,0x10]
+# GFX12-FAKE16: v_fract_f16_e64 v5, src_scc mul:4       ; encoding: [0x05,0x00,0xdf,0xd5,0xfd,0x00,0x00,0x10]
 
 0xff,0x81,0xdf,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00
-# GFX12: v_fract_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xdf,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00]
+# GFX12-REAL16: v_fract_f16_e64 v255.l, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xdf,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00]
+# GFX12-FAKE16: v_fract_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xdf,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00]
+
+0x05,0x48,0xdf,0xd5,0x01,0x01,0x00,0x00
+# GFX12-REAL16: v_fract_f16_e64 v5.h, v1.h op_sel:[1,1] ; encoding: [0x05,0x48,0xdf,0xd5,0x01,0x01,0x00,0x00]
+# GFX12-FAKE16: v_fract_f16_e64 v5, v1                  ; encoding: [0x05,0x00,0xdf,0xd5,0x01,0x01,0x00,0x00]
 
+0x05,0x08,0xdf,0xd5,0xff,0x01,0x00,0x00
+# GFX12-REAL16: v_fract_f16_e64 v5.l, v255.h op_sel:[1,0] ; encoding: [0x05,0x08,0xdf,0xd5,0xff,0x01,0x00,0x00]
+# GFX12-FAKE16: v_fract_f16_e64 v5, v255                ; encoding: [0x05,0x00,0xdf,0xd5,0xff,0x01,0x00,0x00]
+
+0xff,0xc1,0xdf,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00
+# GFX12-REAL16: v_fract_f16_e64 v255.h, -|0xfe0b| op_sel:[0,1] clamp div:2 ; encoding: [0xff,0xc1,0xdf,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00]
+# GFX12-FAKE16: v_fract_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xdf,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00]
+
+# GFX11: v_fract_f16_e64 v5.h, v1.h op_sel:[1,1] ; encoding: [0x05,0x48,0xdf,0xd5,0x01,0x01,0x00,0x00]
 0x05,0x00,0xa0,0xd5,0x01,0x01,0x00,0x00
 # GFX12: v_fract_f32_e64 v5, v1                  ; encoding: [0x05,0x00,0xa0,0xd5,0x01,0x01,0x00,0x00]
 
@@ -2368,50 +2435,76 @@
 # GFX12: v_frexp_exp_i32_f64_e64 v255, 0xaf123456 ; encoding: [0xff,0x00,0xbc,0xd5,0xff,0x00,0x00,0x00,0x56,0x34,0x12,0xaf]
 
 0x05,0x00,0xd9,0xd5,0x01,0x01,0x00,0x00
-# GFX12: v_frexp_mant_f16_e64 v5, v1             ; encoding: [0x05,0x00,0xd9,0xd5,0x01,0x01,0x00,0x00]
+# GFX12-REAL16: v_frexp_mant_f16_e64 v5.l, v1.l         ; encoding: [0x05,0x00,0xd9,0xd5,0x01,0x01,0x00,0x00]
+# GFX12-FAKE16: v_frexp_mant_f16_e64 v5, v1             ; encoding: [0x05,0x00,0xd9,0xd5,0x01,0x01,0x00,0x00]
 
 0x05,0x00,0xd9,0xd5,0xff,0x01,0x00,0x00
-# GFX12: v_frexp_mant_f16_e64 v5, v255           ; encoding: [0x05,0x00,0xd9,0xd5,0xff,0x01,0x00,0x00]
+# GFX12-REAL16: v_frexp_mant_f16_e64 v5.l, v255.l       ; encoding: [0x05,0x00,0xd9,0xd5,0xff,0x01,0x00,0x00]
+# GFX12-FAKE16: v_frexp_mant_f16_e64 v5, v255           ; encoding: [0x05,0x00,0xd9,0xd5,0xff,0x01,0x00,0x00]
 
 0x05,0x00,0xd9,0xd5,0x01,0x00,0x00,0x00
-# GFX12: v_frexp_mant_f16_e64 v5, s1             ; encoding: [0x05,0x00,0xd9,0xd5,0x01,0x00,0x00,0x00]
+# GFX12-REAL16: v_frexp_mant_f16_e64 v5.l, s1           ; encoding: [0x05,0x00,0xd9,0xd5,0x01,0x00,0x00,0x00]
+# GFX12-FAKE16: v_frexp_mant_f16_e64 v5, s1             ; encoding: [0x05,0x00,0xd9,0xd5,0x01,0x00,0x00,0x00]
 
 0x05,0x00,0xd9,0xd5,0x69,0x00,0x00,0x00
-# GFX12: v_frexp_mant_f16_e64 v5, s105           ; encoding: [0x05,0x00,0xd9,0xd5,0x69,0x00,0x00,0x00]
+# GFX12-REAL16: v_frexp_mant_f16_e64 v5.l, s105         ; encoding: [0x05,0x00,0xd9,0xd5,0x69,0x00,0x00,0x00]
+# GFX12-FAKE16: v_frexp_mant_f16_e64 v5, s105           ; encoding: [0x05,0x00,0xd9,0xd5,0x69,0x00,0x00,0x00]
 
 0x05,0x00,0xd9,0xd5,0x6a,0x00,0x00,0x00
-# GFX12: v_frexp_mant_f16_e64 v5, vcc_lo         ; encoding: [0x05,0x00,0xd9,0xd5,0x6a,0x00,0x00,0x00]
+# GFX12-REAL16: v_frexp_mant_f16_e64 v5.l, vcc_lo       ; encoding: [0x05,0x00,0xd9,0xd5,0x6a,0x00,0x00,0x00]
+# GFX12-FAKE16: v_frexp_mant_f16_e64 v5, vcc_lo         ; encoding: [0x05,0x00,0xd9,0xd5,0x6a,0x00,0x00,0x00]
 
 0x05,0x00,0xd9,0xd5,0x6b,0x00,0x00,0x00
-# GFX12: v_frexp_mant_f16_e64 v5, vcc_hi         ; encoding: [0x05,0x00,0xd9,0xd5,0x6b,0x00,0x00,0x00]
+# GFX12-REAL16: v_frexp_mant_f16_e64 v5.l, vcc_hi       ; encoding: [0x05,0x00,0xd9,0xd5,0x6b,0x00,0x00,0x00]
+# GFX12-FAKE16: v_frexp_mant_f16_e64 v5, vcc_hi         ; encoding: [0x05,0x00,0xd9,0xd5,0x6b,0x00,0x00,0x00]
 
 0x05,0x00,0xd9,0xd5,0x7b,0x00,0x00,0x00
-# GFX12: v_frexp_mant_f16_e64 v5, ttmp15         ; encoding: [0x05,0x00,0xd9,0xd5,0x7b,0x00,0x00,0x00]
+# GFX12-REAL16: v_frexp_mant_f16_e64 v5.l, ttmp15       ; encoding: [0x05,0x00,0xd9,0xd5,0x7b,0x00,0x00,0x00]
+# GFX12-FAKE16: v_frexp_mant_f16_e64 v5, ttmp15         ; encoding: [0x05,0x00,0xd9,0xd5,0x7b,0x00,0x00,0x00]
 
 0x05,0x00,0xd9,0xd5,0x7d,0x00,0x00,0x00
-# GFX12: v_frexp_mant_f16_e64 v5, m0             ; encoding: [0x05,0x00,0xd9,0xd5,0x7d,0x00,0x00,0x00]
+# GFX12-REAL16: v_frexp_mant_f16_e64 v5.l, m0           ; encoding: [0x05,0x00,0xd9,0xd5,0x7d,0x00,0x00,0x00]
+# GFX12-FAKE16: v_frexp_mant_f16_e64 v5, m0             ; encoding: [0x05,0x00,0xd9,0xd5,0x7d,0x00,0x00,0x00]
 
 0x05,0x00,0xd9,0xd5,0x7e,0x00,0x00,0x00
-# GFX12: v_frexp_mant_f16_e64 v5, exec_lo        ; encoding: [0x05,0x00,0xd9,0xd5,0x7e,0x00,0x00,0x00]
+# GFX12-REAL16: v_frexp_mant_f16_e64 v5.l, exec_lo      ; encoding: [0x05,0x00,0xd9,0xd5,0x7e,0x00,0x00,0x00]
+# GFX12-FAKE16: v_frexp_mant_f16_e64 v5, exec_lo        ; encoding: [0x05,0x00,0xd9,0xd5,0x7e,0x00,0x00,0x00]
 
 0x05,0x00,0xd9,0xd5,0x7f,0x00,0x00,0x00
-# GFX12: v_frexp_mant_f16_e64 v5, exec_hi        ; encoding: [0x05,0x00,0xd9,0xd5,0x7f,0x00,0x00,0x00]
+# GFX12-REAL16: v_frexp_mant_f16_e64 v5.l, exec_hi      ; encoding: [0x05,0x00,0xd9,0xd5,0x7f,0x00,0x00,0x00]
+# GFX12-FAKE16: v_frexp_mant_f16_e64 v5, exec_hi        ; encoding: [0x05,0x00,0xd9,0xd5,0x7f,0x00,0x00,0x00]
 
 0x05,0x00,0xd9,0xd5,0x7c,0x00,0x00,0x00
-# GFX12: v_frexp_mant_f16_e64 v5, null           ; encoding: [0x05,0x00,0xd9,0xd5,0x7c,0x00,0x00,0x00]
+# GFX12-REAL16: v_frexp_mant_f16_e64 v5.l, null         ; encoding: [0x05,0x00,0xd9,0xd5,0x7c,0x00,0x00,0x00]
+# GFX12-FAKE16: v_frexp_mant_f16_e64 v5, null           ; encoding: [0x05,0x00,0xd9,0xd5,0x7c,0x00,0x00,0x00]
 
 0x05,0x00,0xd9,0xd5,0xc1,0x00,0x00,0x00
-# GFX12: v_frexp_mant_f16_e64 v5, -1             ; encoding: [0x05,0x00,0xd9,0xd5,0xc1,0x00,0x00,0x00]
+# GFX12-REAL16: v_frexp_mant_f16_e64 v5.l, -1           ; encoding: [0x05,0x00,0xd9,0xd5,0xc1,0x00,0x00,0x00]
+# GFX12-FAKE16: v_frexp_mant_f16_e64 v5, -1             ; encoding: [0x05,0x00,0xd9,0xd5,0xc1,0x00,0x00,0x00]
 
 0x05,0x00,0xd9,0xd5,0xf0,0x00,0x00,0x08
-# GFX12: v_frexp_mant_f16_e64 v5, 0.5 mul:2      ; encoding: [0x05,0x00,0xd9,0xd5,0xf0,0x00,0x00,0x08]
+# GFX12-REAL16: v_frexp_mant_f16_e64 v5.l, 0.5 mul:2    ; encoding: [0x05,0x00,0xd9,0xd5,0xf0,0x00,0x00,0x08]
+# GFX12-FAKE16: v_frexp_mant_f16_e64 v5, 0.5 mul:2      ; encoding: [0x05,0x00,0xd9,0xd5,0xf0,0x00,0x00,0x08]
 
 0x05,0x00,0xd9,0xd5,0xfd,0x00,0x00,0x10
-# GFX12: v_frexp_mant_f16_e64 v5, src_scc mul:4  ; encoding: [0x05,0x00,0xd9,0xd5,0xfd,0x00,0x00,0x10]
+# GFX12-REAL16: v_frexp_mant_f16_e64 v5.l, src_scc mul:4 ; encoding: [0x05,0x00,0xd9,0xd5,0xfd,0x00,0x00,0x10]
+# GFX12-FAKE16: v_frexp_mant_f16_e64 v5, src_scc mul:4  ; encoding: [0x05,0x00,0xd9,0xd5,0xfd,0x00,0x00,0x10]
 
 0xff,0x81,0xd9,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00
-# GFX12: v_frexp_mant_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xd9,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00]
-
+# GFX12-REAL16: v_frexp_mant_f16_e64 v255.l, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xd9,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00]
+# GFX12-FAKE16: v_frexp_mant_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xd9,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00]
+
+0x05,0x48,0xd9,0xd5,0x01,0x01,0x00,0x00
+# GFX12-REAL16: v_frexp_mant_f16_e64 v5.h, v1.h op_sel:[1,1] ; encoding: [0x05,0x48,0xd9,0xd5,0x01,0x01,0x00,0x00]
+# GFX12-FAKE16: v_frexp_mant_f16_e64 v5, v1             ; encoding: [0x05,0x00,0xd9,0xd5,0x01,0x01,0x00,0x00]
+0x05,0x08,0xd9,0xd5,0xff,0x01,0x00,0x00
+# GFX12-REAL16: v_frexp_mant_f16_e64 v5.l, v255.h op_sel:[1,0] ; encoding: [0x05,0x08,0xd9,0xd5,0xff,0x01,0x00,0x00]
+# GFX12-FAKE16: v_frexp_mant_f16_e64 v5, v255           ; encoding: [0x05,0x00,0xd9,0xd5,0xff,0x01,0x00,0x00]
+0xff,0xc1,0xd9,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00
+# GFX12-REAL16: v_frexp_mant_f16_e64 v255.h, -|0xfe0b| op_sel:[0,1] clamp div:2 ; encoding: [0xff,0xc1,0xd9,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00]
+# GFX12-FAKE16: v_frexp_mant_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xd9,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00]
+
+# GFX11: v_frexp_mant_f16_e64 v5.h, v1.h op_sel:[1,1] ; encoding: [0x05,0x48,0xd9,0xd5,0x01,0x01,0x00,0x00]
 0x05,0x00,0xc0,0xd5,0x01,0x01,0x00,0x00
 # GFX12: v_frexp_mant_f32_e64 v5, v1             ; encoding: [0x05,0x00,0xc0,0xd5,0x01,0x01,0x00,0x00]
 
@@ -2707,49 +2800,76 @@
 # GFX12: v_movrelsd_b32_e64 v255, v255           ; encoding: [0xff,0x00,0xc4,0xd5,0xff,0x01,0x00,0x00]
 
 0x05,0x00,0xe9,0xd5,0x01,0x01,0x00,0x00
-# GFX12: v_not_b16_e64 v5, v1                    ; encoding: [0x05,0x00,0xe9,0xd5,0x01,0x01,0x00,0x00]
+# GFX12-REAL16: v_not_b16_e64 v5.l, v1.l                ; encoding: [0x05,0x00,0xe9,0xd5,0x01,0x01,0x00,0x00]
+# GFX12-FAKE16: v_not_b16_e64 v5, v1                    ; encoding: [0x05,0x00,0xe9,0xd5,0x01,0x01,0x00,0x00]
 
 0x05,0x00,0xe9,0xd5,0xff,0x01,0x00,0x00
-# GFX12: v_not_b16_e64 v5, v255                  ; encoding: [0x05,0x00,0xe9,0xd5,0xff,0x01,0x00,0x00]
+# GFX12-REAL16: v_not_b16_e64 v5.l, v255.l              ; encoding: [0x05,0x00,0xe9,0xd5,0xff,0x01,0x00,0x00]
+# GFX12-FAKE16: v_not_b16_e64 v5, v255                  ; encoding: [0x05,0x00,0xe9,0xd5,0xff,0x01,0x00,0x00]
 
 0x05,0x00,0xe9,0xd5,0x01,0x00,0x00,0x00
-# GFX12: v_not_b16_e64 v5, s1                    ; encoding: [0x05,0x00,0xe9,0xd5,0x01,0x00,0x00,0x00]
+# GFX12-REAL16: v_not_b16_e64 v5.l, s1                  ; encoding: [0x05,0x00,0xe9,0xd5,0x01,0x00,0x00,0x00]
+# GFX12-FAKE16: v_not_b16_e64 v5, s1                    ; encoding: [0x05,0x00,0xe9,0xd5,0x01,0x00,0x00,0x00]
 
 0x05,0x00,0xe9,0xd5,0x69,0x00,0x00,0x00
-# GFX12: v_not_b16_e64 v5, s105                  ; encoding: [0x05,0x00,0xe9,0xd5,0x69,0x00,0x00,0x00]
+# GFX12-REAL16: v_not_b16_e64 v5.l, s105                ; encoding: [0x05,0x00,0xe9,0xd5,0x69,0x00,0x00,0x00]
+# GFX12-FAKE16: v_not_b16_e64 v5, s105                  ; encoding: [0x05,0x00,0xe9,0xd5,0x69,0x00,0x00,0x00]
 
 0x05,0x00,0xe9,0xd5,0x6a,0x00,0x00,0x00
-# GFX12: v_not_b16_e64 v5, vcc_lo                ; encoding: [0x05,0x00,0xe9,0xd5,0x6a,0x00,0x00,0x00]
+# GFX12-REAL16: v_not_b16_e64 v5.l, vcc_lo              ; encoding: [0x05,0x00,0xe9,0xd5,0x6a,0x00,0x00,0x00]
+# GFX12-FAKE16: v_not_b16_e64 v5, vcc_lo                ; encoding: [0x05,0x00,0xe9,0xd5,0x6a,0x00,0x00,0x00]
 
 0x05,0x00,0xe9,0xd5,0x6b,0x00,0x00,0x00
-# GFX12: v_not_b16_e64 v5, vcc_hi                ; encoding: [0x05,0x00,0xe9,0xd5,0x6b,0x00,0x00,0x00]
+# GFX12-REAL16: v_not_b16_e64 v5.l, vcc_hi              ; encoding: [0x05,0x00,0xe9,0xd5,0x6b,0x00,0x00,0x00]
+# GFX12-FAKE16: v_not_b16_e64 v5, vcc_hi                ; encoding: [0x05,0x00,0xe9,0xd5,0x6b,0x00,0x00,0x00]
 
 0x05,0x00,0xe9,0xd5,0x7b,0x00,0x00,0x00
-# GFX12: v_not_b16_e64 v5, ttmp15                ; encoding: [0x05,0x00,0xe9,0xd5,0x7b,0x00,0x00,0x00]
+# GFX12-REAL16: v_not_b16_e64 v5.l, ttmp15              ; encoding: [0x05,0x00,0xe9,0xd5,0x7b,0x00,0x00,0x00]
+# GFX12-FAKE16: v_not_b16_e64 v5, ttmp15                ; encoding: [0x05,0x00,0xe9,0xd5,0x7b,0x00,0x00,0x00]
 
 0x05,0x00,0xe9,0xd5,0x7d,0x00,0x00,0x00
-# GFX12: v_not_b16_e64 v5, m0                    ; encoding: [0x05,0x00,0xe9,0xd5,0x7d,0x00,0x00,0x00]
+# GFX12-REAL16: v_not_b16_e64 v5.l, m0                  ; encoding: [0x05,0x00,0xe9,0xd5,0x7d,0x00,0x00,0x00]
+# GFX12-FAKE16: v_not_b16_e64 v5, m0                    ; encoding: [0x05,0x00,0xe9,0xd5,0x7d,0x00,0x00,0x00]
 
 0x05,0x00,0xe9,0xd5,0x7e,0x00,0x00,0x00
-# GFX12: v_not_b16_e64 v5, exec_lo               ; encoding: [0x05,0x00,0xe9,0xd5,0x7e,0x00,0x00,0x00]
+# GFX12-REAL16: v_not_b16_e64 v5.l, exec_lo             ; encoding: [0x05,0x00,0xe9,0xd5,0x7e,0x00,0x00,0x00]
+# GFX12-FAKE16: v_not_b16_e64 v5, exec_lo               ; encoding: [0x05,0x00,0xe9,0xd5,0x7e,0x00,0x00,0x00]
 
 0x05,0x00,0xe9,0xd5,0x7f,0x00,0x00,0x00
-# GFX12: v_not_b16_e64 v5, exec_hi               ; encoding: [0x05,0x00,0xe9,0xd5,0x7f,0x00,0x00,0x00]
+# GFX12-REAL16: v_not_b16_e64 v5.l, exec_hi             ; encoding: [0x05,0x00,0xe9,0xd5,0x7f,0x00,0x00,0x00]
+# GFX12-FAKE16: v_not_b16_e64 v5, exec_hi               ; encoding: [0x05,0x00,0xe9,0xd5,0x7f,0x00,0x00,0x00]
 
 0x05,0x00,0xe9,0xd5,0x7c,0x00,0x00,0x00
-# GFX12: v_not_b16_e64 v5, null                  ; encoding: [0x05,0x00,0xe9,0xd5,0x7c,0x00,0x00,0x00]
+# GFX12-REAL16: v_not_b16_e64 v5.l, null                ; encoding: [0x05,0x00,0xe9,0xd5,0x7c,0x00,0x00,0x00]
+# GFX12-FAKE16: v_not_b16_e64 v5, null                  ; encoding: [0x05,0x00,0xe9,0xd5,0x7c,0x00,0x00,0x00]
 
 0x05,0x00,0xe9,0xd5,0xc1,0x00,0x00,0x00
-# GFX12: v_not_b16_e64 v5, -1                    ; encoding: [0x05,0x00,0xe9,0xd5,0xc1,0x00,0x00,0x00]
+# GFX12-REAL16: v_not_b16_e64 v5.l, -1                  ; encoding: [0x05,0x00,0xe9,0xd5,0xc1,0x00,0x00,0x00]
+# GFX12-FAKE16: v_not_b16_e64 v5, -1                    ; encoding: [0x05,0x00,0xe9,0xd5,0xc1,0x00,0x00,0x00]
 
 0x05,0x00,0xe9,0xd5,0xf0,0x00,0x00,0x00
-# GFX12: v_not_b16_e64 v5, 0x3800                ; encoding: [0x05,0x00,0xe9,0xd5,0xff,0x00,0x00,0x00,0x00,0x38,0x00,0x00]
+# GFX12-REAL16: v_not_b16_e64 v5.l, 0x3800              ; encoding: [0x05,0x00,0xe9,0xd5,0xff,0x00,0x00,0x00,0x00,0x38,0x00,0x00]
+# GFX12-FAKE16: v_not_b16_e64 v5, 0x3800                ; encoding: [0x05,0x00,0xe9,0xd5,0xff,0x00,0x00,0x00,0x00,0x38,0x00,0x00]
 
 0x05,0x00,0xe9,0xd5,0xfd,0x00,0x00,0x00
-# GFX12: v_not_b16_e64 v5, src_scc               ; encoding: [0x05,0x00,0xe9,0xd5,0xfd,0x00,0x00,0x00]
+# GFX12-REAL16: v_not_b16_e64 v5.l, src_scc             ; encoding: [0x05,0x00,0xe9,0xd5,0xfd,0x00,0x00,0x00]
+# GFX12-FAKE16: v_not_b16_e64 v5, src_scc               ; encoding: [0x05,0x00,0xe9,0xd5,0xfd,0x00,0x00,0x00]
 
 0xff,0x00,0xe9,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00
-# GFX12: v_not_b16_e64 v255, 0xfe0b              ; encoding: [0xff,0x00,0xe9,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00]
+# GFX12-REAL16: v_not_b16_e64 v255.l, 0xfe0b            ; encoding: [0xff,0x00,0xe9,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00]
+# GFX12-FAKE16: v_not_b16_e64 v255, 0xfe0b              ; encoding: [0xff,0x00,0xe9,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00]
+
+0x05,0x48,0xe9,0xd5,0x01,0x01,0x00,0x00
+# GFX12-REAL16: v_not_b16_e64 v5.h, v1.h op_sel:[1,1]   ; encoding: [0x05,0x48,0xe9,0xd5,0x01,0x01,0x00,0x00]
+# GFX12-FAKE16: v_not_b16_e64 v5, v1                    ; encoding: [0x05,0x00,0xe9,0xd5,0x01,0x01,0x00,0x00]
+
+0x05,0x08,0xe9,0xd5,0xff,0x01,0x00,0x00
+# GFX12-REAL16: v_not_b16_e64 v5.l, v255.h op_sel:[1,0] ; encoding: [0x05,0x08,0xe9,0xd5,0xff,0x01,0x00,0x00]
+# GFX12-FAKE16: v_not_b16_e64 v5, v255                  ; encoding: [0x05,0x00,0xe9,0xd5,0xff,0x01,0x00,0x00]
+
+0xff,0x40,0xe9,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00
+# GFX12-REAL16: v_not_b16_e64 v255.h, 0xfe0b op_sel:[0,1] ; encoding: [0xff,0x40,0xe9,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00]
+# GFX12-FAKE16: v_not_b16_e64 v255, 0xfe0b              ; encoding: [0xff,0x00,0xe9,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00]
 
 0x05,0x00,0xb7,0xd5,0x01,0x01,0x00,0x00
 # GFX12: v_not_b32_e64 v5, v1                    ; encoding: [0x05,0x00,0xb7,0xd5,0x01,0x01,0x00,0x00]
@@ -2983,49 +3103,76 @@
 # GFX12: v_rcp_iflag_f32_e64 v255, -|0xaf123456| clamp div:2 ; encoding: [0xff,0x81,0xab,0xd5,0xff,0x00,0x00,0x38,0x56,0x34,0x12,0xaf]
 
 0x05,0x00,0xde,0xd5,0x01,0x01,0x00,0x00
-# GFX12: v_rndne_f16_e64 v5, v1                  ; encoding: [0x05,0x00,0xde,0xd5,0x01,0x01,0x00,0x00]
+# GFX12-REAL16: v_rndne_f16_e64 v5.l, v1.l              ; encoding: [0x05,0x00,0xde,0xd5,0x01,0x01,0x00,0x00]
+# GFX12-FAKE16: v_rndne_f16_e64 v5, v1                  ; encoding: [0x05,0x00,0xde,0xd5,0x01,0x01,0x00,0x00]
 
 0x05,0x00,0xde,0xd5,0xff,0x01,0x00,0x00
-# GFX12: v_rndne_f16_e64 v5, v255                ; encoding: [0x05,0x00,0xde,0xd5,0xff,0x01,0x00,0x00]
+# GFX12-REAL16: v_rndne_f16_e64 v5.l, v255.l            ; encoding: [0x05,0x00,0xde,0xd5,0xff,0x01,0x00,0x00]
+# GFX12-FAKE16: v_rndne_f16_e64 v5, v255                ; encoding: [0x05,0x00,0xde,0xd5,0xff,0x01,0x00,0x00]
 
 0x05,0x00,0xde,0xd5,0x01,0x00,0x00,0x00
-# GFX12: v_rndne_f16_e64 v5, s1                  ; encoding: [0x05,0x00,0xde,0xd5,0x01,0x00,0x00,0x00]
+# GFX12-REAL16: v_rndne_f16_e64 v5.l, s1                ; encoding: [0x05,0x00,0xde,0xd5,0x01,0x00,0x00,0x00]
+# GFX12-FAKE16: v_rndne_f16_e64 v5, s1                  ; encoding: [0x05,0x00,0xde,0xd5,0x01,0x00,0x00,0x00]
 
 0x05,0x00,0xde,0xd5,0x69,0x00,0x00,0x00
-# GFX12: v_rndne_f16_e64 v5, s105                ; encoding: [0x05,0x00,0xde,0xd5,0x69,0x00,0x00,0x00]
+# GFX12-REAL16: v_rndne_f16_e64 v5.l, s105              ; encoding: [0x05,0x00,0xde,0xd5,0x69,0x00,0x00,0x00]
+# GFX12-FAKE16: v_rndne_f16_e64 v5, s105                ; encoding: [0x05,0x00,0xde,0xd5,0x69,0x00,0x00,0x00]
 
 0x05,0x00,0xde,0xd5,0x6a,0x00,0x00,0x00
-# GFX12: v_rndne_f16_e64 v5, vcc_lo              ; encoding: [0x05,0x00,0xde,0xd5,0x6a,0x00,0x00,0x00]
+# GFX12-REAL16: v_rndne_f16_e64 v5.l, vcc_lo            ; encoding: [0x05,0x00,0xde,0xd5,0x6a,0x00,0x00,0x00]
+# GFX12-FAKE16: v_rndne_f16_e64 v5, vcc_lo              ; encoding: [0x05,0x00,0xde,0xd5,0x6a,0x00,0x00,0x00]
 
 0x05,0x00,0xde,0xd5,0x6b,0x00,0x00,0x00
-# GFX12: v_rndne_f16_e64 v5, vcc_hi              ; encoding: [0x05,0x00,0xde,0xd5,0x6b,0x00,0x00,0x00]
+# GFX12-REAL16: v_rndne_f16_e64 v5.l, vcc_hi            ; encoding: [0x05,0x00,0xde,0xd5,0x6b,0x00,0x00,0x00]
+# GFX12-FAKE16: v_rndne_f16_e64 v5, vcc_hi              ; encoding: [0x05,0x00,0xde,0xd5,0x6b,0x00,0x00,0x00]
 
 0x05,0x00,0xde,0xd5,0x7b,0x00,0x00,0x00
-# GFX12: v_rndne_f16_e64 v5, ttmp15              ; encoding: [0x05,0x00,0xde,0xd5,0x7b,0x00,0x00,0x00]
+# GFX12-REAL16: v_rndne_f16_e64 v5.l, ttmp15            ; encoding: [0x05,0x00,0xde,0xd5,0x7b,0x00,0x00,0x00]
+# GFX12-FAKE16: v_rndne_f16_e64 v5, ttmp15              ; encoding: [0x05,0x00,0xde,0xd5,0x7b,0x00,0x00,0x00]
 
 0x05,0x00,0xde,0xd5,0x7d,0x00,0x00,0x00
-# GFX12: v_rndne_f16_e64 v5, m0                  ; encoding: [0x05,0x00,0xde,0xd5,0x7d,0x00,0x00,0x00]
+# GFX12-REAL16: v_rndne_f16_e64 v5.l, m0                ; encoding: [0x05,0x00,0xde,0xd5,0x7d,0x00,0x00,0x00]
+# GFX12-FAKE16: v_rndne_f16_e64 v5, m0                  ; encoding: [0x05,0x00,0xde,0xd5,0x7d,0x00,0x00,0x00]
 
 0x05,0x00,0xde,0xd5,0x7e,0x00,0x00,0x00
-# GFX12: v_rndne_f16_e64 v5, exec_lo             ; encoding: [0x05,0x00,0xde,0xd5,0x7e,0x00,0x00,0x00]
+# GFX12-REAL16: v_rndne_f16_e64 v5.l, exec_lo           ; encoding: [0x05,0x00,0xde,0xd5,0x7e,0x00,0x00,0x00]
+# GFX12-FAKE16: v_rndne_f16_e64 v5, exec_lo             ; encoding: [0x05,0x00,0xde,0xd5,0x7e,0x00,0x00,0x00]
 
 0x05,0x00,0xde,0xd5,0x7f,0x00,0x00,0x00
-# GFX12: v_rndne_f16_e64 v5, exec_hi             ; encoding: [0x05,0x00,0xde,0xd5,0x7f,0x00,0x00,0x00]
+# GFX12-REAL16: v_rndne_f16_e64 v5.l, exec_hi           ; encoding: [0x05,0x00,0xde,0xd5,0x7f,0x00,0x00,0x00]
+# GFX12-FAKE16: v_rndne_f16_e64 v5, exec_hi             ; encoding: [0x05,0x00,0xde,0xd5,0x7f,0x00,0x00,0x00]
 
 0x05,0x00,0xde,0xd5,0x7c,0x00,0x00,0x00
-# GFX12: v_rndne_f16_e64 v5, null                ; encoding: [0x05,0x00,0xde,0xd5,0x7c,0x00,0x00,0x00]
+# GFX12-REAL16: v_rndne_f16_e64 v5.l, null              ; encoding: [0x05,0x00,0xde,0xd5,0x7c,0x00,0x00,0x00]
+# GFX12-FAKE16: v_rndne_f16_e64 v5, null                ; encoding: [0x05,0x00,0xde,0xd5,0x7c,0x00,0x00,0x00]
 
 0x05,0x00,0xde,0xd5,0xc1,0x00,0x00,0x00
-# GFX12: v_rndne_f16_e64 v5, -1                  ; encoding: [0x05,0x00,0xde,0xd5,0xc1,0x00,0x00,0x00]
+# GFX12-REAL16: v_rndne_f16_e64 v5.l, -1                ; encoding: [0x05,0x00,0xde,0xd5,0xc1,0x00,0x00,0x00]
+# GFX12-FAKE16: v_rndne_f16_e64 v5, -1                  ; encoding: [0x05,0x00,0xde,0xd5,0xc1,0x00,0x00,0x00]
 
 0x05,0x00,0xde,0xd5,0xf0,0x00,0x00,0x08
-# GFX12: v_rndne_f16_e64 v5, 0.5 mul:2           ; encoding: [0x05,0x00,0xde,0xd5,0xf0,0x00,0x00,0x08]
+# GFX12-REAL16: v_rndne_f16_e64 v5.l, 0.5 mul:2         ; encoding: [0x05,0x00,0xde,0xd5,0xf0,0x00,0x00,0x08]
+# GFX12-FAKE16: v_rndne_f16_e64 v5, 0.5 mul:2           ; encoding: [0x05,0x00,0xde,0xd5,0xf0,0x00,0x00,0x08]
 
 0x05,0x00,0xde,0xd5,0xfd,0x00,0x00,0x10
-# GFX12: v_rndne_f16_e64 v5, src_scc mul:4       ; encoding: [0x05,0x00,0xde,0xd5,0xfd,0x00,0x00,0x10]
+# GFX12-REAL16: v_rndne_f16_e64 v5.l, src_scc mul:4     ; encoding: [0x05,0x00,0xde,0xd5,0xfd,0x00,0x00,0x10]
+# GFX12-FAKE16: v_rndne_f16_e64 v5, src_scc mul:4       ; encoding: [0x05,0x00,0xde,0xd5,0xfd,0x00,0x00,0x10]
 
 0xff,0x81,0xde,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00
-# GFX12: v_rndne_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xde,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00]
+# GFX12-REAL16: v_rndne_f16_e64 v255.l, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xde,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00]
+# GFX12-FAKE16: v_rndne_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xde,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00]
+
+0x05,0x48,0xde,0xd5,0x01,0x01,0x00,0x00
+# GFX12-REAL16: v_rndne_f16_e64 v5.h, v1.h op_sel:[1,1] ; encoding: [0x05,0x48,0xde,0xd5,0x01,0x01,0x00,0x00]
+# GFX12-FAKE16: v_rndne_f16_e64 v5, v1                  ; encoding: [0x05,0x00,0xde,0xd5,0x01,0x01,0x00,0x00]
+
+0x05,0x08,0xde,0xd5,0xff,0x01,0x00,0x00
+# GFX12-REAL16: v_rndne_f16_e64 v5.l, v255.h op_sel:[1,0] ; encoding: [0x05,0x08,0xde,0xd5,0xff,0x01,0x00,0x00]
+# GFX12-FAKE16: v_rndne_f16_e64 v5, v255                ; encoding: [0x05,0x00,0xde,0xd5,0xff,0x01,0x00,0x00]
+
+0xff,0xc1,0xde,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00
+# GFX12-REAL16: v_rndne_f16_e64 v255.h, -|0xfe0b| op_sel:[0,1] clamp div:2 ; encoding: [0xff,0xc1,0xde,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00]
+# GFX12-FAKE16: v_rndne_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xde,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00]
 
 0x05,0x00,0xa3,0xd5,0x01,0x01,0x00,0x00
 # GFX12: v_rndne_f32_e64 v5, v1                  ; encoding: [0x05,0x00,0xa3,0xd5,0x01,0x01,0x00,0x00]
@@ -3250,94 +3397,140 @@
 # GFX12: v_rsq_f64_e64 v[254:255], 0xaf123456 clamp div:2 ; encoding: [0xfe,0x80,0xb1,0xd5,0xff,0x00,0x00,0x18,0x56,0x34,0x12,0xaf]
 
 0x05,0x00,0xe2,0xd5,0x01,0x01,0x00,0x00
-# GFX12: v_sat_pk_u8_i16_e64 v5, v1              ; encoding: [0x05,0x00,0xe2,0xd5,0x01,0x01,0x00,0x00]
+# GFX12-REAL16: v_sat_pk_u8_i16_e64 v5.l, v1            ; encoding: [0x05,0x00,0xe2,0xd5,0x01,0x01,0x00,0x00]
+# GFX12-FAKE16: v_sat_pk_u8_i16_e64 v5, v1              ; encoding: [0x05,0x00,0xe2,0xd5,0x01,0x01,0x00,0x00]
 
 0x05,0x00,0xe2,0xd5,0xff,0x01,0x00,0x00
-# GFX12: v_sat_pk_u8_i16_e64 v5, v255            ; encoding: [0x05,0x00,0xe2,0xd5,0xff,0x01,0x00,0x00]
+# GFX12-REAL16: v_sat_pk_u8_i16_e64 v5.l, v255          ; encoding: [0x05,0x00,0xe2,0xd5,0xff,0x01,0x00,0x00]
+# GFX12-FAKE16: v_sat_pk_u8_i16_e64 v5, v255            ; encoding: [0x05,0x00,0xe2,0xd5,0xff,0x01,0x00,0x00]
 
 0x05,0x00,0xe2,0xd5,0x01,0x00,0x00,0x00
-# GFX12: v_sat_pk_u8_i16_e64 v5, s1              ; encoding: [0x05,0x00,0xe2,0xd5,0x01,0x00,0x00,0x00]
+# GFX12-REAL16: v_sat_pk_u8_i16_e64 v5.l, s1            ; encoding: [0x05,0x00,0xe2,0xd5,0x01,0x00,0x00,0x00]
+# GFX12-FAKE16: v_sat_pk_u8_i16_e64 v5, s1              ; encoding: [0x05,0x00,0xe2,0xd5,0x01,0x00,0x00,0x00]
 
 0x05,0x00,0xe2,0xd5,0x69,0x00,0x00,0x00
-# GFX12: v_sat_pk_u8_i16_e64 v5, s105            ; encoding: [0x05,0x00,0xe2,0xd5,0x69,0x00,0x00,0x00]
+# GFX12-REAL16: v_sat_pk_u8_i16_e64 v5.l, s105          ; encoding: [0x05,0x00,0xe2,0xd5,0x69,0x00,0x00,0x00]
+# GFX12-FAKE16: v_sat_pk_u8_i16_e64 v5, s105            ; encoding: [0x05,0x00,0xe2,0xd5,0x69,0x00,0x00,0x00]
 
 0x05,0x00,0xe2,0xd5,0x6a,0x00,0x00,0x00
-# GFX12: v_sat_pk_u8_i16_e64 v5, vcc_lo          ; encoding: [0x05,0x00,0xe2,0xd5,0x6a,0x00,0x00,0x00]
+# GFX12-REAL16: v_sat_pk_u8_i16_e64 v5.l, vcc_lo        ; encoding: [0x05,0x00,0xe2,0xd5,0x6a,0x00,0x00,0x00]
+# GFX12-FAKE16: v_sat_pk_u8_i16_e64 v5, vcc_lo          ; encoding: [0x05,0x00,0xe2,0xd5,0x6a,0x00,0x00,0x00]
 
 0x05,0x00,0xe2,0xd5,0x6b,0x00,0x00,0x00
-# GFX12: v_sat_pk_u8_i16_e64 v5, vcc_hi          ; encoding: [0x05,0x00,0xe2,0xd5,0x6b,0x00,0x00,0x00]
+# GFX12-REAL16: v_sat_pk_u8_i16_e64 v5.l, vcc_hi        ; encoding: [0x05,0x00,0xe2,0xd5,0x6b,0x00,0x00,0x00]
+# GFX12-FAKE16: v_sat_pk_u8_i16_e64 v5, vcc_hi          ; encoding: [0x05,0x00,0xe2,0xd5,0x6b,0x00,0x00,0x00]
 
 0x05,0x00,0xe2,0xd5,0x7b,0x00,0x00,0x00
-# GFX12: v_sat_pk_u8_i16_e64 v5, ttmp15          ; encoding: [0x05,0x00,0xe2,0xd5,0x7b,0x00,0x00,0x00]
+# GFX12-REAL16: v_sat_pk_u8_i16_e64 v5.l, ttmp15        ; encoding: [0x05,0x00,0xe2,0xd5,0x7b,0x00,0x00,0x00]
+# GFX12-FAKE16: v_sat_pk_u8_i16_e64 v5, ttmp15          ; encoding: [0x05,0x00,0xe2,0xd5,0x7b,0x00,0x00,0x00]
 
 0x05,0x00,0xe2,0xd5,0x7d,0x00,0x00,0x00
-# GFX12: v_sat_pk_u8_i16_e64 v5, m0              ; encoding: [0x05,0x00,0xe2,0xd5,0x7d,0x00,0x00,0x00]
+# GFX12-REAL16: v_sat_pk_u8_i16_e64 v5.l, m0            ; encoding: [0x05,0x00,0xe2,0xd5,0x7d,0x00,0x00,0x00]
+# GFX12-FAKE16: v_sat_pk_u8_i16_e64 v5, m0              ; encoding: [0x05,0x00,0xe2,0xd5,0x7d,0x00,0x00,0x00]
 
 0x05,0x00,0xe2,0xd5,0x7e,0x00,0x00,0x00
-# GFX12: v_sat_pk_u8_i16_e64 v5, exec_lo         ; encoding: [0x05,0x00,0xe2,0xd5,0x7e,0x00,0x00,0x00]
+# GFX12-REAL16: v_sat_pk_u8_i16_e64 v5.l, exec_lo       ; encoding: [0x05,0x00,0xe2,0xd5,0x7e,0x00,0x00,0x00]
+# GFX12-FAKE16: v_sat_pk_u8_i16_e64 v5, exec_lo         ; encoding: [0x05,0x00,0xe2,0xd5,0x7e,0x00,0x00,0x00]
 
 0x05,0x00,0xe2,0xd5,0x7f,0x00,0x00,0x00
-# GFX12: v_sat_pk_u8_i16_e64 v5, exec_hi         ; encoding: [0x05,0x00,0xe2,0xd5,0x7f,0x00,0x00,0x00]
+# GFX12-REAL16: v_sat_pk_u8_i16_e64 v5.l, exec_hi       ; encoding: [0x05,0x00,0xe2,0xd5,0x7f,0x00,0x00,0x00]
+# GFX12-FAKE16: v_sat_pk_u8_i16_e64 v5, exec_hi         ; encoding: [0x05,0x00,0xe2,0xd5,0x7f,0x00,0x00,0x00]
 
 0x05,0x00,0xe2,0xd5,0x7c,0x00,0x00,0x00
-# GFX12: v_sat_pk_u8_i16_e64 v5, null            ; encoding: [0x05,0x00,0xe2,0xd5,0x7c,0x00,0x00,0x00]
+# GFX12-REAL16: v_sat_pk_u8_i16_e64 v5.l, null          ; encoding: [0x05,0x00,0xe2,0xd5,0x7c,0x00,0x00,0x00]
+# GFX12-FAKE16: v_sat_pk_u8_i16_e64 v5, null            ; encoding: [0x05,0x00,0xe2,0xd5,0x7c,0x00,0x00,0x00]
 
 0x05,0x00,0xe2,0xd5,0xc1,0x00,0x00,0x00
-# GFX12: v_sat_pk_u8_i16_e64 v5, -1              ; encoding: [0x05,0x00,0xe2,0xd5,0xc1,0x00,0x00,0x00]
+# GFX12-REAL16: v_sat_pk_u8_i16_e64 v5.l, -1            ; encoding: [0x05,0x00,0xe2,0xd5,0xc1,0x00,0x00,0x00]
+# GFX12-FAKE16: v_sat_pk_u8_i16_e64 v5, -1              ; encoding: [0x05,0x00,0xe2,0xd5,0xc1,0x00,0x00,0x00]
 
 0x05,0x00,0xe2,0xd5,0xf0,0x00,0x00,0x00
-# GFX12: v_sat_pk_u8_i16_e64 v5, 0.5             ; encoding: [0x05,0x00,0xe2,0xd5,0xf0,0x00,0x00,0x00]
+# GFX12-REAL16: v_sat_pk_u8_i16_e64 v5.l, 0.5           ; encoding: [0x05,0x00,0xe2,0xd5,0xf0,0x00,0x00,0x00]
+# GFX12-FAKE16: v_sat_pk_u8_i16_e64 v5, 0.5             ; encoding: [0x05,0x00,0xe2,0xd5,0xf0,0x00,0x00,0x00]
 
 0x05,0x00,0xe2,0xd5,0xfd,0x00,0x00,0x00
-# GFX12: v_sat_pk_u8_i16_e64 v5, src_scc         ; encoding: [0x05,0x00,0xe2,0xd5,0xfd,0x00,0x00,0x00]
+# GFX12-REAL16: v_sat_pk_u8_i16_e64 v5.l, src_scc       ; encoding: [0x05,0x00,0xe2,0xd5,0xfd,0x00,0x00,0x00]
+# GFX12-FAKE16: v_sat_pk_u8_i16_e64 v5, src_scc         ; encoding: [0x05,0x00,0xe2,0xd5,0xfd,0x00,0x00,0x00]
 
 0xff,0x00,0xe2,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00
-# GFX12: v_sat_pk_u8_i16_e64 v255, 0xfe0b        ; encoding: [0xff,0x00,0xe2,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00]
+# GFX12-REAL16: v_sat_pk_u8_i16_e64 v255.l, 0xfe0b      ; encoding: [0xff,0x00,0xe2,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00]
+# GFX12-FAKE16: v_sat_pk_u8_i16_e64 v255, 0xfe0b        ; encoding: [0xff,0x00,0xe2,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00]
+
+0xff,0x40,0xe2,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00
+# GFX12-REAL16: v_sat_pk_u8_i16_e64 v255.h, 0xfe0b op_sel:[0,1] ; encoding: [0xff,0x40,0xe2,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00]
+# GFX12-FAKE16: v_sat_pk_u8_i16_e64 v255, 0xfe0b        ; encoding: [0xff,0x00,0xe2,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00]
 
 0x05,0x00,0xe0,0xd5,0x01,0x01,0x00,0x00
-# GFX12: v_sin_f16_e64 v5, v1                    ; encoding: [0x05,0x00,0xe0,0xd5,0x01,0x01,0x00,0x00]
+# GFX12-REAL16: v_sin_f16_e64 v5.l, v1.l                ; encoding: [0x05,0x00,0xe0,0xd5,0x01,0x01,0x00,0x00]
+# GFX12-FAKE16: v_sin_f16_e64 v5, v1                    ; encoding: [0x05,0x00,0xe0,0xd5,0x01,0x01,0x00,0x00]
 
 0x05,0x00,0xe0,0xd5,0xff,0x01,0x00,0x00
-# GFX12: v_sin_f16_e64 v5, v255                  ; encoding: [0x05,0x00,0xe0,0xd5,0xff,0x01,0x00,0x00]
+# GFX12-REAL16: v_sin_f16_e64 v5.l, v255.l              ; encoding: [0x05,0x00,0xe0,0xd5,0xff,0x01,0x00,0x00]
+# GFX12-FAKE16: v_sin_f16_e64 v5, v255                  ; encoding: [0x05,0x00,0xe0,0xd5,0xff,0x01,0x00,0x00]
 
 0x05,0x00,0xe0,0xd5,0x01,0x00,0x00,0x00
-# GFX12: v_sin_f16_e64 v5, s1                    ; encoding: [0x05,0x00,0xe0,0xd5,0x01,0x00,0x00,0x00]
+# GFX12-REAL16: v_sin_f16_e64 v5.l, s1                  ; encoding: [0x05,0x00,0xe0,0xd5,0x01,0x00,0x00,0x00]
+# GFX12-FAKE16: v_sin_f16_e64 v5, s1                    ; encoding: [0x05,0x00,0xe0,0xd5,0x01,0x00,0x00,0x00]
 
 0x05,0x00,0xe0,0xd5,0x69,0x00,0x00,0x00
-# GFX12: v_sin_f16_e64 v5, s105                  ; encoding: [0x05,0x00,0xe0,0xd5,0x69,0x00,0x00,0x00]
+# GFX12-REAL16: v_sin_f16_e64 v5.l, s105                ; encoding: [0x05,0x00,0xe0,0xd5,0x69,0x00,0x00,0x00]
+# GFX12-FAKE16: v_sin_f16_e64 v5, s105                  ; encoding: [0x05,0x00,0xe0,0xd5,0x69,0x00,0x00,0x00]
 
 0x05,0x00,0xe0,0xd5,0x6a,0x00,0x00,0x00
-# GFX12: v_sin_f16_e64 v5, vcc_lo                ; encoding: [0x05,0x00,0xe0,0xd5,0x6a,0x00,0x00,0x00]
+# GFX12-REAL16: v_sin_f16_e64 v5.l, vcc_lo              ; encoding: [0x05,0x00,0xe0,0xd5,0x6a,0x00,0x00,0x00]
+# GFX12-FAKE16: v_sin_f16_e64 v5, vcc_lo                ; encoding: [0x05,0x00,0xe0,0xd5,0x6a,0x00,0x00,0x00]
 
 0x05,0x00,0xe0,0xd5,0x6b,0x00,0x00,0x00
-# GFX12: v_sin_f16_e64 v5, vcc_hi                ; encoding: [0x05,0x00,0xe0,0xd5,0x6b,0x00,0x00,0x00]
+# GFX12-REAL16: v_sin_f16_e64 v5.l, vcc_hi              ; encoding: [0x05,0x00,0xe0,0xd5,0x6b,0x00,0x00,0x00]
+# GFX12-FAKE16: v_sin_f16_e64 v5, vcc_hi                ; encoding: [0x05,0x00,0xe0,0xd5,0x6b,0x00,0x00,0x00]
 
 0x05,0x00,0xe0,0xd5,0x7b,0x00,0x00,0x00
-# GFX12: v_sin_f16_e64 v5, ttmp15                ; encoding: [0x05,0x00,0xe0,0xd5,0x7b,0x00,0x00,0x00]
+# GFX12-REAL16: v_sin_f16_e64 v5.l, ttmp15              ; encoding: [0x05,0x00,0xe0,0xd5,0x7b,0x00,0x00,0x00]
+# GFX12-FAKE16: v_sin_f16_e64 v5, ttmp15                ; encoding: [0x05,0x00,0xe0,0xd5,0x7b,0x00,0x00,0x00]
 
 0x05,0x00,0xe0,0xd5,0x7d,0x00,0x00,0x00
-# GFX12: v_sin_f16_e64 v5, m0                    ; encoding: [0x05,0x00,0xe0,0xd5,0x7d,0x00,0x00,0x00]
+# GFX12-REAL16: v_sin_f16_e64 v5.l, m0                  ; encoding: [0x05,0x00,0xe0,0xd5,0x7d,0x00,0x00,0x00]
+# GFX12-FAKE16: v_sin_f16_e64 v5, m0                    ; encoding: [0x05,0x00,0xe0,0xd5,0x7d,0x00,0x00,0x00]
 
 0x05,0x00,0xe0,0xd5,0x7e,0x00,0x00,0x00
-# GFX12: v_sin_f16_e64 v5, exec_lo               ; encoding: [0x05,0x00,0xe0,0xd5,0x7e,0x00,0x00,0x00]
+# GFX12-REAL16: v_sin_f16_e64 v5.l, exec_lo             ; encoding: [0x05,0x00,0xe0,0xd5,0x7e,0x00,0x00,0x00]
+# GFX12-FAKE16: v_sin_f16_e64 v5, exec_lo               ; encoding: [0x05,0x00,0xe0,0xd5,0x7e,0x00,0x00,0x00]
 
 0x05,0x00,0xe0,0xd5,0x7f,0x00,0x00,0x00
-# GFX12: v_sin_f16_e64 v5, exec_hi               ; encoding: [0x05,0x00,0xe0,0xd5,0x7f,0x00,0x00,0x00]
+# GFX12-REAL16: v_sin_f16_e64 v5.l, exec_hi             ; encoding: [0x05,0x00,0xe0,0xd5,0x7f,0x00,0x00,0x00]
+# GFX12-FAKE16: v_sin_f16_e64 v5, exec_hi               ; encoding: [0x05,0x00,0xe0,0xd5,0x7f,0x00,0x00,0x00]
 
 0x05,0x00,0xe0,0xd5,0x7c,0x00,0x00,0x00
-# GFX12: v_sin_f16_e64 v5, null                  ; encoding: [0x05,0x00,0xe0,0xd5,0x7c,0x00,0x00,0x00]
+# GFX12-REAL16: v_sin_f16_e64 v5.l, null                ; encoding: [0x05,0x00,0xe0,0xd5,0x7c,0x00,0x00,0x00]
+# GFX12-FAKE16: v_sin_f16_e64 v5, null                  ; encoding: [0x05,0x00,0xe0,0xd5,0x7c,0x00,0x00,0x00]
 
 0x05,0x00,0xe0,0xd5,0xc1,0x00,0x00,0x00
-# GFX12: v_sin_f16_e64 v5, -1                    ; encoding: [0x05,0x00,0xe0,0xd5,0xc1,0x00,0x00,0x00]
+# GFX12-REAL16: v_sin_f16_e64 v5.l, -1                  ; encoding: [0x05,0x00,0xe0,0xd5,0xc1,0x00,0x00,0x00]
+# GFX12-FAKE16: v_sin_f16_e64 v5, -1                    ; encoding: [0x05,0x00,0xe0,0xd5,0xc1,0x00,0x00,0x00]
 
 0x05,0x00,0xe0,0xd5,0xf0,0x00,0x00,0x08
-# GFX12: v_sin_f16_e64 v5, 0.5 mul:2             ; encoding: [0x05,0x00,0xe0,0xd5,0xf0,0x00,0x00,0x08]
+# GFX12-REAL16: v_sin_f16_e64 v5.l, 0.5 mul:2           ; encoding: [0x05,0x00,0xe0,0xd5,0xf0,0x00,0x00,0x08]
+# GFX12-FAKE16: v_sin_f16_e64 v5, 0.5 mul:2             ; encoding: [0x05,0x00,0xe0,0xd5,0xf0,0x00,0x00,0x08]
 
 0x05,0x00,0xe0,0xd5,0xfd,0x00,0x00,0x10
-# GFX12: v_sin_f16_e64 v5, src_scc mul:4         ; encoding: [0x05,0x00,0xe0,0xd5,0xfd,0x00,0x00,0x10]
+# GFX12-REAL16: v_sin_f16_e64 v5.l, src_scc mul:4       ; encoding: [0x05,0x00,0xe0,0xd5,0xfd,0x00,0x00,0x10]
+# GFX12-FAKE16: v_sin_f16_e64 v5, src_scc mul:4         ; encoding: [0x05,0x00,0xe0,0xd5,0xfd,0x00,0x00,0x10]
 
 0xff,0x81,0xe0,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00
-# GFX12: v_sin_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xe0,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00]
+# GFX12-REAL16: v_sin_f16_e64 v255.l, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xe0,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00]
+# GFX12-FAKE16: v_sin_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xe0,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00]
+
+0x05,0x48,0xe0,0xd5,0x01,0x01,0x00,0x00
+# GFX12-REAL16: v_sin_f16_e64 v5.h, v1.h op_sel:[1,1]   ; encoding: [0x05,0x48,0xe0,0xd5,0x01,0x01,0x00,0x00]
+# GFX12-FAKE16: v_sin_f16_e64 v5, v1                    ; encoding: [0x05,0x00,0xe0,0xd5,0x01,0x01,0x00,0x00]
+
+0x05,0x08,0xe0,0xd5,0xff,0x01,0x00,0x00
+# GFX12-REAL16: v_sin_f16_e64 v5.l, v255.h op_sel:[1,0] ; encoding: [0x05,0x08,0xe0,0xd5,0xff,0x01,0x00,0x00]
+# GFX12-FAKE16: v_sin_f16_e64 v5, v255                  ; encoding: [0x05,0x00,0xe0,0xd5,0xff,0x01,0x00,0x00]
+
+0xff,0xc1,0xe0,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00
+# GFX12-REAL16: v_sin_f16_e64 v255.h, -|0xfe0b| op_sel:[0,1] clamp div:2 ; encoding: [0xff,0xc1,0xe0,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00]
+# GFX12-FAKE16: v_sin_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xe0,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00]
 
 0x05,0x00,0xb5,0xd5,0x01,0x01,0x00,0x00
 # GFX12: v_sin_f32_e64 v5, v1                    ; encoding: [0x05,0x00,0xb5,0xd5,0x01,0x01,0x00,0x00]
@@ -3526,49 +3719,76 @@
 # GFX12: v_sqrt_f64_e64 v[254:255], 0xaf123456 clamp div:2 ; encoding: [0xfe,0x80,0xb4,0xd5,0xff,0x00,0x00,0x18,0x56,0x34,0x12,0xaf]
 
 0x05,0x00,0xdd,0xd5,0x01,0x01,0x00,0x00
-# GFX12: v_trunc_f16_e64 v5, v1                  ; encoding: [0x05,0x00,0xdd,0xd5,0x01,0x01,0x00,0x00]
+# GFX12-REAL16: v_trunc_f16_e64 v5.l, v1.l              ; encoding: [0x05,0x00,0xdd,0xd5,0x01,0x01,0x00,0x00]
+# GFX12-FAKE16: v_trunc_f16_e64 v5, v1                  ; encoding: [0x05,0x00,0xdd,0xd5,0x01,0x01,0x00,0x00]
 
 0x05,0x00,0xdd,0xd5,0xff,0x01,0x00,0x00
-# GFX12: v_trunc_f16_e64 v5, v255                ; encoding: [0x05,0x00,0xdd,0xd5,0xff,0x01,0x00,0x00]
+# GFX12-REAL16: v_trunc_f16_e64 v5.l, v255.l            ; encoding: [0x05,0x00,0xdd,0xd5,0xff,0x01,0x00,0x00]
+# GFX12-FAKE16: v_trunc_f16_e64 v5, v255                ; encoding: [0x05,0x00,0xdd,0xd5,0xff,0x01,0x00,0x00]
 
 0x05,0x00,0xdd,0xd5,0x01,0x00,0x00,0x00
-# GFX12: v_trunc_f16_e64 v5, s1                  ; encoding: [0x05,0x00,0xdd,0xd5,0x01,0x00,0x00,0x00]
+# GFX12-REAL16: v_trunc_f16_e64 v5.l, s1                ; encoding: [0x05,0x00,0xdd,0xd5,0x01,0x00,0x00,0x00]
+# GFX12-FAKE16: v_trunc_f16_e64 v5, s1                  ; encoding: [0x05,0x00,0xdd,0xd5,0x01,0x00,0x00,0x00]
 
 0x05,0x00,0xdd,0xd5,0x69,0x00,0x00,0x00
-# GFX12: v_trunc_f16_e64 v5, s105                ; encoding: [0x05,0x00,0xdd,0xd5,0x69,0x00,0x00,0x00]
+# GFX12-REAL16: v_trunc_f16_e64 v5.l, s105              ; encoding: [0x05,0x00,0xdd,0xd5,0x69,0x00,0x00,0x00]
+# GFX12-FAKE16: v_trunc_f16_e64 v5, s105                ; encoding: [0x05,0x00,0xdd,0xd5,0x69,0x00,0x00,0x00]
 
 0x05,0x00,0xdd,0xd5,0x6a,0x00,0x00,0x00
-# GFX12: v_trunc_f16_e64 v5, vcc_lo              ; encoding: [0x05,0x00,0xdd,0xd5,0x6a,0x00,0x00,0x00]
+# GFX12-REAL16: v_trunc_f16_e64 v5.l, vcc_lo            ; encoding: [0x05,0x00,0xdd,0xd5,0x6a,0x00,0x00,0x00]
+# GFX12-FAKE16: v_trunc_f16_e64 v5, vcc_lo              ; encoding: [0x05,0x00,0xdd,0xd5,0x6a,0x00,0x00,0x00]
 
 0x05,0x00,0xdd,0xd5,0x6b,0x00,0x00,0x00
-# GFX12: v_trunc_f16_e64 v5, vcc_hi              ; encoding: [0x05,0x00,0xdd,0xd5,0x6b,0x00,0x00,0x00]
+# GFX12-REAL16: v_trunc_f16_e64 v5.l, vcc_hi            ; encoding: [0x05,0x00,0xdd,0xd5,0x6b,0x00,0x00,0x00]
+# GFX12-FAKE16: v_trunc_f16_e64 v5, vcc_hi              ; encoding: [0x05,0x00,0xdd,0xd5,0x6b,0x00,0x00,0x00]
 
 0x05,0x00,0xdd,0xd5,0x7b,0x00,0x00,0x00
-# GFX12: v_trunc_f16_e64 v5, ttmp15              ; encoding: [0x05,0x00,0xdd,0xd5,0x7b,0x00,0x00,0x00]
+# GFX12-REAL16: v_trunc_f16_e64 v5.l, ttmp15            ; encoding: [0x05,0x00,0xdd,0xd5,0x7b,0x00,0x00,0x00]
+# GFX12-FAKE16: v_trunc_f16_e64 v5, ttmp15              ; encoding: [0x05,0x00,0xdd,0xd5,0x7b,0x00,0x00,0x00]
 
 0x05,0x00,0xdd,0xd5,0x7d,0x00,0x00,0x00
-# GFX12: v_trunc_f16_e64 v5, m0                  ; encoding: [0x05,0x00,0xdd,0xd5,0x7d,0x00,0x00,0x00]
+# GFX12-REAL16: v_trunc_f16_e64 v5.l, m0                ; encoding: [0x05,0x00,0xdd,0xd5,0x7d,0x00,0x00,0x00]
+# GFX12-FAKE16: v_trunc_f16_e64 v5, m0                  ; encoding: [0x05,0x00,0xdd,0xd5,0x7d,0x00,0x00,0x00]
 
 0x05,0x00,0xdd,0xd5,0x7e,0x00,0x00,0x00
-# GFX12: v_trunc_f16_e64 v5, exec_lo             ; encoding: [0x05,0x00,0xdd,0xd5,0x7e,0x00,0x00,0x00]
+# GFX12-REAL16: v_trunc_f16_e64 v5.l, exec_lo           ; encoding: [0x05,0x00,0xdd,0xd5,0x7e,0x00,0x00,0x00]
+# GFX12-FAKE16: v_trunc_f16_e64 v5, exec_lo             ; encoding: [0x05,0x00,0xdd,0xd5,0x7e,0x00,0x00,0x00]
 
 0x05,0x00,0xdd,0xd5,0x7f,0x00,0x00,0x00
-# GFX12: v_trunc_f16_e64 v5, exec_hi             ; encoding: [0x05,0x00,0xdd,0xd5,0x7f,0x00,0x00,0x00]
+# GFX12-REAL16: v_trunc_f16_e64 v5.l, exec_hi           ; encoding: [0x05,0x00,0xdd,0xd5,0x7f,0x00,0x00,0x00]
+# GFX12-FAKE16: v_trunc_f16_e64 v5, exec_hi             ; encoding: [0x05,0x00,0xdd,0xd5,0x7f,0x00,0x00,0x00]
 
 0x05,0x00,0xdd,0xd5,0x7c,0x00,0x00,0x00
-# GFX12: v_trunc_f16_e64 v5, null                ; encoding: [0x05,0x00,0xdd,0xd5,0x7c,0x00,0x00,0x00]
+# GFX12-REAL16: v_trunc_f16_e64 v5.l, null              ; encoding: [0x05,0x00,0xdd,0xd5,0x7c,0x00,0x00,0x00]
+# GFX12-FAKE16: v_trunc_f16_e64 v5, null                ; encoding: [0x05,0x00,0xdd,0xd5,0x7c,0x00,0x00,0x00]
 
 0x05,0x00,0xdd,0xd5,0xc1,0x00,0x00,0x00
-# GFX12: v_trunc_f16_e64 v5, -1                  ; encoding: [0x05,0x00,0xdd,0xd5,0xc1,0x00,0x00,0x00]
+# GFX12-REAL16: v_trunc_f16_e64 v5.l, -1                ; encoding: [0x05,0x00,0xdd,0xd5,0xc1,0x00,0x00,0x00]
+# GFX12-FAKE16: v_trunc_f16_e64 v5, -1                  ; encoding: [0x05,0x00,0xdd,0xd5,0xc1,0x00,0x00,0x00]
 
 0x05,0x00,0xdd,0xd5,0xf0,0x00,0x00,0x08
-# GFX12: v_trunc_f16_e64 v5, 0.5 mul:2           ; encoding: [0x05,0x00,0xdd,0xd5,0xf0,0x00,0x00,0x08]
+# GFX12-REAL16: v_trunc_f16_e64 v5.l, 0.5 mul:2         ; encoding: [0x05,0x00,0xdd,0xd5,0xf0,0x00,0x00,0x08]
+# GFX12-FAKE16: v_trunc_f16_e64 v5, 0.5 mul:2           ; encoding: [0x05,0x00,0xdd,0xd5,0xf0,0x00,0x00,0x08]
 
 0x05,0x00,0xdd,0xd5,0xfd,0x00,0x00,0x10
-# GFX12: v_trunc_f16_e64 v5, src_scc mul:4       ; encoding: [0x05,0x00,0xdd,0xd5,0xfd,0x00,0x00,0x10]
+# GFX12-REAL16: v_trunc_f16_e64 v5.l, src_scc mul:4     ; encoding: [0x05,0x00,0xdd,0xd5,0xfd,0x00,0x00,0x10]
+# GFX12-FAKE16: v_trunc_f16_e64 v5, src_scc mul:4       ; encoding: [0x05,0x00,0xdd,0xd5,0xfd,0x00,0x00,0x10]
 
 0xff,0x81,0xdd,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00
-# GFX12: v_trunc_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xdd,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00]
+# GFX12-REAL16: v_trunc_f16_e64 v255.l, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xdd,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00]
+# GFX12-FAKE16: v_trunc_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xdd,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00]
+
+0x05,0x48,0xdd,0xd5,0x01,0x01,0x00,0x00
+# GFX12-REAL16: v_trunc_f16_e64 v5.h, v1.h op_sel:[1,1] ; encoding: [0x05,0x48,0xdd,0xd5,0x01,0x01,0x00,0x00]
+# GFX12-FAKE16: v_trunc_f16_e64 v5, v1                  ; encoding: [0x05,0x00,0xdd,0xd5,0x01,0x01,0x00,0x00]
+
+0x05,0x08,0xdd,0xd5,0xff,0x01,0x00,0x00
+# GFX12-REAL16: v_trunc_f16_e64 v5.l, v255.h op_sel:[1,0] ; encoding: [0x05,0x08,0xdd,0xd5,0xff,0x01,0x00,0x00]
+# GFX12-FAKE16: v_trunc_f16_e64 v5, v255                ; encoding: [0x05,0x00,0xdd,0xd5,0xff,0x01,0x00,0x00]
+
+0xff,0xc1,0xdd,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00
+# GFX12-REAL16: v_trunc_f16_e64 v255.h, -|0xfe0b| op_sel:[0,1] clamp div:2 ; encoding: [0xff,0xc1,0xdd,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00]
+# GFX12-FAKE16: v_trunc_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xdd,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00]
 
 0x05,0x00,0xa1,0xd5,0x01,0x01,0x00,0x00
 # GFX12: v_trunc_f32_e64 v5, v1                  ; encoding: [0x05,0x00,0xa1,0xd5,0x01,0x01,0x00,0x00]
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp16.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp16.txt
index be9f069..f447fb4 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp16.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp16.txt
@@ -227,46 +227,72 @@
 # GFX12: v_clz_i32_u32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x00,0xb9,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x0d,0x30]
 
 0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff
-# GFX12: v_cos_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
+# GFX12-REAL16: v_cos_f16_e64_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
+# GFX12-FAKE16: v_cos_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
 
 0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff
-# GFX12: v_cos_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
+# GFX12-REAL16: v_cos_f16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
+# GFX12-FAKE16: v_cos_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
 
 0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff
-# GFX12: v_cos_f16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
+# GFX12-REAL16: v_cos_f16_e64_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
+# GFX12-FAKE16: v_cos_f16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
 
 0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff
-# GFX12: v_cos_f16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff]
+# GFX12-REAL16: v_cos_f16_e64_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff]
+# GFX12-FAKE16: v_cos_f16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff]
 
 0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff
-# GFX12: v_cos_f16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff]
+# GFX12-REAL16: v_cos_f16_e64_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff]
+# GFX12-FAKE16: v_cos_f16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff]
 
 0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff
-# GFX12: v_cos_f16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff]
+# GFX12-REAL16: v_cos_f16_e64_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff]
+# GFX12-FAKE16: v_cos_f16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff]
 
 0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff
-# GFX12: v_cos_f16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff]
+# GFX12-REAL16: v_cos_f16_e64_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff]
+# GFX12-FAKE16: v_cos_f16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff]
 
 0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff
-# GFX12: v_cos_f16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]
+# GFX12-REAL16: v_cos_f16_e64_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]
+# GFX12-FAKE16: v_cos_f16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]
 
 0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff
-# GFX12: v_cos_f16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff]
+# GFX12-REAL16: v_cos_f16_e64_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff]
+# GFX12-FAKE16: v_cos_f16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff]
 
 0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff
-# GFX12: v_cos_f16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff]
+# GFX12-REAL16: v_cos_f16_e64_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff]
+# GFX12-FAKE16: v_cos_f16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff]
 
 0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff
-# GFX12: v_cos_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
+# GFX12-REAL16: v_cos_f16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
+# GFX12-FAKE16: v_cos_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
 
 0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01
-# GFX12: v_cos_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
+# GFX12-REAL16: v_cos_f16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
+# GFX12-FAKE16: v_cos_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
 
 0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13
-# GFX12: v_cos_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13]
+# GFX12-REAL16: v_cos_f16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13]
+# GFX12-FAKE16: v_cos_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13]
 
 0xff,0x81,0xe1,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30
-# GFX12: v_cos_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xe1,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30]
+# GFX12-REAL16: v_cos_f16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xe1,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30]
+# GFX12-FAKE16: v_cos_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xe1,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30]
+
+0x05,0x48,0xe1,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01
+# GFX12-REAL16: v_cos_f16_e64_dpp v5.h, v1.h op_sel:[1,1] mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x48,0xe1,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
+# GFX12-FAKE16: v_cos_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
+
+0x05,0x08,0xe1,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13
+# GFX12-REAL16: v_cos_f16_e64_dpp v5.l, v1.h op_sel:[1,0] mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x08,0xe1,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13]
+# GFX12-FAKE16: v_cos_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xe1,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13]
+
+0xff,0xc1,0xe1,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30
+# GFX12-REAL16: v_cos_f16_e64_dpp v255.h, -|v255.l| op_sel:[0,1] clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc1,0xe1,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30]
+# GFX12-FAKE16: v_cos_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xe1,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30]
 
 0x05,0x00,0xb6,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff
 # GFX12: v_cos_f32_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xb6,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
@@ -1055,46 +1081,64 @@
 # GFX12: v_cvt_i32_f32_e64_dpp v255, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0x88,0xd5,0xfa,0x00,0x00,0x20,0xff,0x6f,0x0d,0x30]
 
 0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff
-# GFX12: v_cvt_i32_i16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
+# GFX12-REAL16: v_cvt_i32_i16_e64_dpp v5, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
+# GFX12-FAKE16: v_cvt_i32_i16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
 
 0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff
-# GFX12: v_cvt_i32_i16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
+# GFX12-REAL16: v_cvt_i32_i16_e64_dpp v5, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
+# GFX12-FAKE16: v_cvt_i32_i16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
 
 0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff
-# GFX12: v_cvt_i32_i16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
+# GFX12-REAL16: v_cvt_i32_i16_e64_dpp v5, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
+# GFX12-FAKE16: v_cvt_i32_i16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
 
 0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff
-# GFX12: v_cvt_i32_i16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff]
+# GFX12-REAL16: v_cvt_i32_i16_e64_dpp v5, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff]
+# GFX12-FAKE16: v_cvt_i32_i16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff]
 
 0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff
-# GFX12: v_cvt_i32_i16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff]
+# GFX12-REAL16: v_cvt_i32_i16_e64_dpp v5, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff]
+# GFX12-FAKE16: v_cvt_i32_i16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff]
 
 0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff
-# GFX12: v_cvt_i32_i16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff]
+# GFX12-REAL16: v_cvt_i32_i16_e64_dpp v5, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff]
+# GFX12-FAKE16: v_cvt_i32_i16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff]
 
 0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff
-# GFX12: v_cvt_i32_i16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff]
+# GFX12-REAL16: v_cvt_i32_i16_e64_dpp v5, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff]
+# GFX12-FAKE16: v_cvt_i32_i16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff]
 
 0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff
-# GFX12: v_cvt_i32_i16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]
+# GFX12-REAL16: v_cvt_i32_i16_e64_dpp v5, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]
+# GFX12-FAKE16: v_cvt_i32_i16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]
 
 0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff
-# GFX12: v_cvt_i32_i16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff]
+# GFX12-REAL16: v_cvt_i32_i16_e64_dpp v5, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff]
+# GFX12-FAKE16: v_cvt_i32_i16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff]
 
 0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff
-# GFX12: v_cvt_i32_i16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff]
+# GFX12-REAL16: v_cvt_i32_i16_e64_dpp v5, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff]
+# GFX12-FAKE16: v_cvt_i32_i16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff]
 
 0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff
-# GFX12: v_cvt_i32_i16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
+# GFX12-REAL16: v_cvt_i32_i16_e64_dpp v5, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
+# GFX12-FAKE16: v_cvt_i32_i16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
 
 0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x5f,0x01,0x01
-# GFX12: v_cvt_i32_i16_e64_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x5f,0x01,0x01]
+# GFX12-REAL16: v_cvt_i32_i16_e64_dpp v5, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x5f,0x01,0x01]
+# GFX12-FAKE16: v_cvt_i32_i16_e64_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x5f,0x01,0x01]
 
 0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x60,0x01,0x13
-# GFX12: v_cvt_i32_i16_e64_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x60,0x01,0x13]
+# GFX12-REAL16: v_cvt_i32_i16_e64_dpp v5, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x60,0x01,0x13]
+# GFX12-FAKE16: v_cvt_i32_i16_e64_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0x01,0x60,0x01,0x13]
 
 0xff,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x0d,0x30
-# GFX12: v_cvt_i32_i16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x0d,0x30]
+# GFX12-REAL16: v_cvt_i32_i16_e64_dpp v255, v255.l row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x0d,0x30]
+# GFX12-FAKE16: v_cvt_i32_i16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x0d,0x30]
+
+0xff,0x08,0xea,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x0d,0x30
+# GFX12-REAL16: v_cvt_i32_i16_e64_dpp v255, v255.h op_sel:[1,0] row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x08,0xea,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x0d,0x30]
+# GFX12-FAKE16: v_cvt_i32_i16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x00,0xea,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x0d,0x30]
 
 0x05,0x00,0x8c,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff
 # GFX12: v_cvt_nearest_i32_f32_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x8c,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
@@ -1427,46 +1471,64 @@
 # GFX12: v_cvt_u32_f32_e64_dpp v255, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0x87,0xd5,0xfa,0x00,0x00,0x20,0xff,0x6f,0x0d,0x30]
 
 0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff
-# GFX12: v_cvt_u32_u16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
+# GFX12-REAL16: v_cvt_u32_u16_e64_dpp v5, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
+# GFX12-FAKE16: v_cvt_u32_u16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
 
 0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff
-# GFX12: v_cvt_u32_u16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
+# GFX12-REAL16: v_cvt_u32_u16_e64_dpp v5, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
+# GFX12-FAKE16: v_cvt_u32_u16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
 
 0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff
-# GFX12: v_cvt_u32_u16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
+# GFX12-REAL16: v_cvt_u32_u16_e64_dpp v5, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
+# GFX12-FAKE16: v_cvt_u32_u16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
 
 0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff
-# GFX12: v_cvt_u32_u16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff]
+# GFX12-REAL16: v_cvt_u32_u16_e64_dpp v5, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff]
+# GFX12-FAKE16: v_cvt_u32_u16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff]
 
 0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff
-# GFX12: v_cvt_u32_u16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff]
+# GFX12-REAL16: v_cvt_u32_u16_e64_dpp v5, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff]
+# GFX12-FAKE16: v_cvt_u32_u16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff]
 
 0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff
-# GFX12: v_cvt_u32_u16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff]
+# GFX12-REAL16: v_cvt_u32_u16_e64_dpp v5, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff]
+# GFX12-FAKE16: v_cvt_u32_u16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff]
 
 0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff
-# GFX12: v_cvt_u32_u16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff]
+# GFX12-REAL16: v_cvt_u32_u16_e64_dpp v5, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff]
+# GFX12-FAKE16: v_cvt_u32_u16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff]
 
 0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff
-# GFX12: v_cvt_u32_u16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]
+# GFX12-REAL16: v_cvt_u32_u16_e64_dpp v5, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]
+# GFX12-FAKE16: v_cvt_u32_u16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]
 
 0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff
-# GFX12: v_cvt_u32_u16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff]
+# GFX12-REAL16: v_cvt_u32_u16_e64_dpp v5, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff]
+# GFX12-FAKE16: v_cvt_u32_u16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff]
 
 0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff
-# GFX12: v_cvt_u32_u16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff]
+# GFX12-REAL16: v_cvt_u32_u16_e64_dpp v5, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff]
+# GFX12-FAKE16: v_cvt_u32_u16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff]
 
 0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff
-# GFX12: v_cvt_u32_u16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
+# GFX12-REAL16: v_cvt_u32_u16_e64_dpp v5, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
+# GFX12-FAKE16: v_cvt_u32_u16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
 
 0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x5f,0x01,0x01
-# GFX12: v_cvt_u32_u16_e64_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x5f,0x01,0x01]
+# GFX12-REAL16: v_cvt_u32_u16_e64_dpp v5, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x5f,0x01,0x01]
+# GFX12-FAKE16: v_cvt_u32_u16_e64_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x5f,0x01,0x01]
 
 0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x60,0x01,0x13
-# GFX12: v_cvt_u32_u16_e64_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x60,0x01,0x13]
+# GFX12-REAL16: v_cvt_u32_u16_e64_dpp v5, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x60,0x01,0x13]
+# GFX12-FAKE16: v_cvt_u32_u16_e64_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x60,0x01,0x13]
 
 0xff,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x0d,0x30
-# GFX12: v_cvt_u32_u16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x0d,0x30]
+# GFX12-REAL16: v_cvt_u32_u16_e64_dpp v255, v255.l row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x0d,0x30]
+# GFX12-FAKE16: v_cvt_u32_u16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x0d,0x30]
+
+0xff,0x08,0xeb,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x0d,0x30
+# GFX12-REAL16: v_cvt_u32_u16_e64_dpp v255, v255.h op_sel:[1,0] row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x08,0xeb,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x0d,0x30]
+# GFX12-FAKE16: v_cvt_u32_u16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x00,0xeb,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x0d,0x30]
 
 0x05,0x00,0xd8,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff
 # GFX12-REAL16: v_exp_f16_e64_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd8,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
@@ -1665,47 +1727,74 @@
 # GFX12: v_floor_f32_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xa4,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30]
 
 0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff
-# GFX12: v_fract_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
+# GFX12-REAL16: v_fract_f16_e64_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
+# GFX12-FAKE16: v_fract_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
 
 0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff
-# GFX12: v_fract_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
+# GFX12-REAL16: v_fract_f16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
+# GFX12-FAKE16: v_fract_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
 
 0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff
-# GFX12: v_fract_f16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
+# GFX12-REAL16: v_fract_f16_e64_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
+# GFX12-FAKE16: v_fract_f16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
 
 0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff
-# GFX12: v_fract_f16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff]
+# GFX12-REAL16: v_fract_f16_e64_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff]
+# GFX12-FAKE16: v_fract_f16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff]
 
 0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff
-# GFX12: v_fract_f16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff]
+# GFX12-REAL16: v_fract_f16_e64_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff]
+# GFX12-FAKE16: v_fract_f16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff]
 
 0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff
-# GFX12: v_fract_f16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff]
+# GFX12-REAL16: v_fract_f16_e64_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff]
+# GFX12-FAKE16: v_fract_f16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff]
 
 0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff
-# GFX12: v_fract_f16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff]
+# GFX12-REAL16: v_fract_f16_e64_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff]
+# GFX12-FAKE16: v_fract_f16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff]
 
 0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff
-# GFX12: v_fract_f16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]
+# GFX12-REAL16: v_fract_f16_e64_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]
+# GFX12-FAKE16: v_fract_f16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]
 
 0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff
-# GFX12: v_fract_f16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff]
+# GFX12-REAL16: v_fract_f16_e64_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff]
+# GFX12-FAKE16: v_fract_f16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff]
 
 0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff
-# GFX12: v_fract_f16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff]
+# GFX12-REAL16: v_fract_f16_e64_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff]
+# GFX12-FAKE16: v_fract_f16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff]
 
 0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff
-# GFX12: v_fract_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
+# GFX12-REAL16: v_fract_f16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
+# GFX12-FAKE16: v_fract_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
 
 0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01
-# GFX12: v_fract_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
+# GFX12-REAL16: v_fract_f16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
+# GFX12-FAKE16: v_fract_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
 
 0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13
-# GFX12: v_fract_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13]
+# GFX12-REAL16: v_fract_f16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13]
+# GFX12-FAKE16: v_fract_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13]
 
 0xff,0x81,0xdf,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30
-# GFX12: v_fract_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xdf,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30]
+# GFX12-REAL16: v_fract_f16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xdf,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30]
+# GFX12-FAKE16: v_fract_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xdf,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30]
+
+0x05,0x48,0xdf,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01
+# GFX12-REAL16: v_fract_f16_e64_dpp v5.h, v1.h op_sel:[1,1] mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x48,0xdf,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
+# GFX12-FAKE16: v_fract_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
 
+0x05,0x08,0xdf,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13
+# GFX12-REAL16: v_fract_f16_e64_dpp v5.l, v1.h op_sel:[1,0] mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x08,0xdf,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13]
+# GFX12-FAKE16: v_fract_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xdf,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13]
+
+0xff,0xc1,0xdf,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30
+# GFX12-REAL16: v_fract_f16_e64_dpp v255.h, -|v255.l| op_sel:[0,1] clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc1,0xdf,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30]
+# GFX12-FAKE16: v_fract_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xdf,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30]
+
+# GFX11: v_fract_f16_e64_dpp v5.h, v1.h op_sel:[1,1] mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x48,0xdf,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
 0x05,0x00,0xa0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff
 # GFX12: v_fract_f32_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xa0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
 
@@ -1859,47 +1948,72 @@
 # GFX12: v_frexp_exp_i32_f32_e64_dpp v255, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x01,0xbf,0xd5,0xfa,0x00,0x00,0x20,0xff,0x6f,0x0d,0x30]
 
 0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff
-# GFX12: v_frexp_mant_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
+# GFX12-REAL16: v_frexp_mant_f16_e64_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
+# GFX12-FAKE16: v_frexp_mant_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
 
 0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff
-# GFX12: v_frexp_mant_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
+# GFX12-REAL16: v_frexp_mant_f16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
+# GFX12-FAKE16: v_frexp_mant_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
 
 0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff
-# GFX12: v_frexp_mant_f16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
+# GFX12-REAL16: v_frexp_mant_f16_e64_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
+# GFX12-FAKE16: v_frexp_mant_f16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
 
 0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff
-# GFX12: v_frexp_mant_f16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff]
+# GFX12-REAL16: v_frexp_mant_f16_e64_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff]
+# GFX12-FAKE16: v_frexp_mant_f16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff]
 
 0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff
-# GFX12: v_frexp_mant_f16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff]
+# GFX12-REAL16: v_frexp_mant_f16_e64_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff]
+# GFX12-FAKE16: v_frexp_mant_f16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff]
 
 0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff
-# GFX12: v_frexp_mant_f16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff]
+# GFX12-REAL16: v_frexp_mant_f16_e64_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff]
+# GFX12-FAKE16: v_frexp_mant_f16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff]
 
 0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff
-# GFX12: v_frexp_mant_f16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff]
+# GFX12-REAL16: v_frexp_mant_f16_e64_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff]
+# GFX12-FAKE16: v_frexp_mant_f16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff]
 
 0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff
-# GFX12: v_frexp_mant_f16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]
+# GFX12-REAL16: v_frexp_mant_f16_e64_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]
+# GFX12-FAKE16: v_frexp_mant_f16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]
 
 0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff
-# GFX12: v_frexp_mant_f16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff]
+# GFX12-REAL16: v_frexp_mant_f16_e64_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff]
+# GFX12-FAKE16: v_frexp_mant_f16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff]
 
 0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff
-# GFX12: v_frexp_mant_f16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff]
+# GFX12-REAL16: v_frexp_mant_f16_e64_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff]
+# GFX12-FAKE16: v_frexp_mant_f16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff]
 
 0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff
-# GFX12: v_frexp_mant_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
+# GFX12-REAL16: v_frexp_mant_f16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
+# GFX12-FAKE16: v_frexp_mant_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
 
 0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01
-# GFX12: v_frexp_mant_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
+# GFX12-REAL16: v_frexp_mant_f16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
+# GFX12-FAKE16: v_frexp_mant_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
 
 0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13
-# GFX12: v_frexp_mant_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13]
+# GFX12-REAL16: v_frexp_mant_f16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13]
+# GFX12-FAKE16: v_frexp_mant_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13]
 
 0xff,0x81,0xd9,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30
-# GFX12: v_frexp_mant_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xd9,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30]
-
+# GFX12-REAL16: v_frexp_mant_f16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xd9,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30]
+# GFX12-FAKE16: v_frexp_mant_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xd9,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30]
+
+0x05,0x48,0xd9,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01
+# GFX12-REAL16: v_frexp_mant_f16_e64_dpp v5.h, v1.h op_sel:[1,1] mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x48,0xd9,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
+# GFX12-FAKE16: v_frexp_mant_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
+0x05,0x08,0xd9,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13
+# GFX12-REAL16: v_frexp_mant_f16_e64_dpp v5.l, v1.h op_sel:[1,0] mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x08,0xd9,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13]
+# GFX12-FAKE16: v_frexp_mant_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xd9,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13]
+0xff,0xc1,0xd9,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30
+# GFX12-REAL16: v_frexp_mant_f16_e64_dpp v255.h, -|v255.l| op_sel:[0,1] clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc1,0xd9,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30]
+# GFX12-FAKE16: v_frexp_mant_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xd9,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30]
+
+# GFX11: v_frexp_mant_f16_e64_dpp v5.h, v1.h op_sel:[1,1] mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x48,0xd9,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
 0x05,0x00,0xc0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff
 # GFX12: v_frexp_mant_f32_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xc0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
 
@@ -2125,46 +2239,72 @@
 # GFX12: v_movrels_b32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x00,0xc3,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x0d,0x30]
 
 0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff
-# GFX12: v_not_b16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
+# GFX12-REAL16: v_not_b16_e64_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
+# GFX12-FAKE16: v_not_b16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
 
 0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff
-# GFX12: v_not_b16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
+# GFX12-REAL16: v_not_b16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
+# GFX12-FAKE16: v_not_b16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
 
 0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff
-# GFX12: v_not_b16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
+# GFX12-REAL16: v_not_b16_e64_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
+# GFX12-FAKE16: v_not_b16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
 
 0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff
-# GFX12: v_not_b16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff]
+# GFX12-REAL16: v_not_b16_e64_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff]
+# GFX12-FAKE16: v_not_b16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff]
 
 0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff
-# GFX12: v_not_b16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff]
+# GFX12-REAL16: v_not_b16_e64_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff]
+# GFX12-FAKE16: v_not_b16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff]
 
 0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff
-# GFX12: v_not_b16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff]
+# GFX12-REAL16: v_not_b16_e64_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff]
+# GFX12-FAKE16: v_not_b16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff]
 
 0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff
-# GFX12: v_not_b16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff]
+# GFX12-REAL16: v_not_b16_e64_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff]
+# GFX12-FAKE16: v_not_b16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff]
 
 0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff
-# GFX12: v_not_b16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]
+# GFX12-REAL16: v_not_b16_e64_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]
+# GFX12-FAKE16: v_not_b16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]
 
 0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff
-# GFX12: v_not_b16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff]
+# GFX12-REAL16: v_not_b16_e64_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff]
+# GFX12-FAKE16: v_not_b16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff]
 
 0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff
-# GFX12: v_not_b16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff]
+# GFX12-REAL16: v_not_b16_e64_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff]
+# GFX12-FAKE16: v_not_b16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff]
 
 0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff
-# GFX12: v_not_b16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
+# GFX12-REAL16: v_not_b16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
+# GFX12-FAKE16: v_not_b16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
 
 0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x5f,0x01,0x01
-# GFX12: v_not_b16_e64_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x5f,0x01,0x01]
+# GFX12-REAL16: v_not_b16_e64_dpp v5.l, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x5f,0x01,0x01]
+# GFX12-FAKE16: v_not_b16_e64_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x5f,0x01,0x01]
 
 0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x60,0x01,0x13
-# GFX12: v_not_b16_e64_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x60,0x01,0x13]
+# GFX12-REAL16: v_not_b16_e64_dpp v5.l, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x60,0x01,0x13]
+# GFX12-FAKE16: v_not_b16_e64_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x60,0x01,0x13]
 
 0xff,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x0d,0x30
-# GFX12: v_not_b16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x0d,0x30]
+# GFX12-REAL16: v_not_b16_e64_dpp v255.l, v255.l row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x0d,0x30]
+# GFX12-FAKE16: v_not_b16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x0d,0x30]
+
+0x05,0x48,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x5f,0x01,0x01
+# GFX12-REAL16: v_not_b16_e64_dpp v5.h, v1.h op_sel:[1,1] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x48,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x5f,0x01,0x01]
+# GFX12-FAKE16: v_not_b16_e64_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x5f,0x01,0x01]
+
+0x05,0x08,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x60,0x01,0x13
+# GFX12-REAL16: v_not_b16_e64_dpp v5.l, v1.h op_sel:[1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x08,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x60,0x01,0x13]
+# GFX12-FAKE16: v_not_b16_e64_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x60,0x01,0x13]
+
+0xff,0x40,0xe9,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x0d,0x30
+# GFX12-REAL16: v_not_b16_e64_dpp v255.h, v255.l op_sel:[0,1] row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x40,0xe9,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x0d,0x30]
+# GFX12-FAKE16: v_not_b16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x00,0xe9,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x0d,0x30]
 
 0x05,0x00,0xb7,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff
 # GFX12: v_not_b32_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xb7,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
@@ -2349,46 +2489,72 @@
 # GFX12: v_rcp_iflag_f32_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xab,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30]
 
 0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff
-# GFX12: v_rndne_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
+# GFX12-REAL16: v_rndne_f16_e64_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
+# GFX12-FAKE16: v_rndne_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
 
 0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff
-# GFX12: v_rndne_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
+# GFX12-REAL16: v_rndne_f16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
+# GFX12-FAKE16: v_rndne_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
 
 0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff
-# GFX12: v_rndne_f16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
+# GFX12-REAL16: v_rndne_f16_e64_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
+# GFX12-FAKE16: v_rndne_f16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
 
 0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff
-# GFX12: v_rndne_f16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff]
+# GFX12-REAL16: v_rndne_f16_e64_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff]
+# GFX12-FAKE16: v_rndne_f16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff]
 
 0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff
-# GFX12: v_rndne_f16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff]
+# GFX12-REAL16: v_rndne_f16_e64_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff]
+# GFX12-FAKE16: v_rndne_f16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff]
 
 0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff
-# GFX12: v_rndne_f16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff]
+# GFX12-REAL16: v_rndne_f16_e64_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff]
+# GFX12-FAKE16: v_rndne_f16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff]
 
 0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff
-# GFX12: v_rndne_f16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff]
+# GFX12-REAL16: v_rndne_f16_e64_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff]
+# GFX12-FAKE16: v_rndne_f16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff]
 
 0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff
-# GFX12: v_rndne_f16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]
+# GFX12-REAL16: v_rndne_f16_e64_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]
+# GFX12-FAKE16: v_rndne_f16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]
 
 0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff
-# GFX12: v_rndne_f16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff]
+# GFX12-REAL16: v_rndne_f16_e64_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff]
+# GFX12-FAKE16: v_rndne_f16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff]
 
 0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff
-# GFX12: v_rndne_f16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff]
+# GFX12-REAL16: v_rndne_f16_e64_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff]
+# GFX12-FAKE16: v_rndne_f16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff]
 
 0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff
-# GFX12: v_rndne_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
+# GFX12-REAL16: v_rndne_f16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
+# GFX12-FAKE16: v_rndne_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
 
 0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01
-# GFX12: v_rndne_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
+# GFX12-REAL16: v_rndne_f16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
+# GFX12-FAKE16: v_rndne_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
 
 0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13
-# GFX12: v_rndne_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13]
+# GFX12-REAL16: v_rndne_f16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13]
+# GFX12-FAKE16: v_rndne_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13]
 
 0xff,0x81,0xde,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30
-# GFX12: v_rndne_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xde,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30]
+# GFX12-REAL16: v_rndne_f16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xde,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30]
+# GFX12-FAKE16: v_rndne_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xde,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30]
+
+0x05,0x48,0xde,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01
+# GFX12-REAL16: v_rndne_f16_e64_dpp v5.h, v1.h op_sel:[1,1] mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x48,0xde,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
+# GFX12-FAKE16: v_rndne_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
+
+0x05,0x08,0xde,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13
+# GFX12-REAL16: v_rndne_f16_e64_dpp v5.l, v1.h op_sel:[1,0] mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x08,0xde,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13]
+# GFX12-FAKE16: v_rndne_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xde,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13]
+
+0xff,0xc1,0xde,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30
+# GFX12-REAL16: v_rndne_f16_e64_dpp v255.h, -|v255.l| op_sel:[0,1] clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc1,0xde,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30]
+# GFX12-FAKE16: v_rndne_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xde,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30]
 
 0x05,0x00,0xa3,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff
 # GFX12: v_rndne_f32_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xa3,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
@@ -2531,88 +2697,132 @@
 # GFX12: v_rsq_f32_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xae,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30]
 
 0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff
-# GFX12: v_sat_pk_u8_i16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
+# GFX12-REAL16: v_sat_pk_u8_i16_e64_dpp v5.l, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
+# GFX12-FAKE16: v_sat_pk_u8_i16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
 
 0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff
-# GFX12: v_sat_pk_u8_i16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
+# GFX12-REAL16: v_sat_pk_u8_i16_e64_dpp v5.l, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
+# GFX12-FAKE16: v_sat_pk_u8_i16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
 
 0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff
-# GFX12: v_sat_pk_u8_i16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
+# GFX12-REAL16: v_sat_pk_u8_i16_e64_dpp v5.l, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
+# GFX12-FAKE16: v_sat_pk_u8_i16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
 
 0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff
-# GFX12: v_sat_pk_u8_i16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff]
+# GFX12-REAL16: v_sat_pk_u8_i16_e64_dpp v5.l, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff]
+# GFX12-FAKE16: v_sat_pk_u8_i16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff]
 
 0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff
-# GFX12: v_sat_pk_u8_i16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff]
+# GFX12-REAL16: v_sat_pk_u8_i16_e64_dpp v5.l, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff]
+# GFX12-FAKE16: v_sat_pk_u8_i16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff]
 
 0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff
-# GFX12: v_sat_pk_u8_i16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff]
+# GFX12-REAL16: v_sat_pk_u8_i16_e64_dpp v5.l, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff]
+# GFX12-FAKE16: v_sat_pk_u8_i16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff]
 
 0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff
-# GFX12: v_sat_pk_u8_i16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff]
+# GFX12-REAL16: v_sat_pk_u8_i16_e64_dpp v5.l, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff]
+# GFX12-FAKE16: v_sat_pk_u8_i16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff]
 
 0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff
-# GFX12: v_sat_pk_u8_i16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]
+# GFX12-REAL16: v_sat_pk_u8_i16_e64_dpp v5.l, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]
+# GFX12-FAKE16: v_sat_pk_u8_i16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]
 
 0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff
-# GFX12: v_sat_pk_u8_i16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff]
+# GFX12-REAL16: v_sat_pk_u8_i16_e64_dpp v5.l, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff]
+# GFX12-FAKE16: v_sat_pk_u8_i16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff]
 
 0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff
-# GFX12: v_sat_pk_u8_i16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff]
+# GFX12-REAL16: v_sat_pk_u8_i16_e64_dpp v5.l, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff]
+# GFX12-FAKE16: v_sat_pk_u8_i16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff]
 
 0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff
-# GFX12: v_sat_pk_u8_i16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
+# GFX12-REAL16: v_sat_pk_u8_i16_e64_dpp v5.l, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
+# GFX12-FAKE16: v_sat_pk_u8_i16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
 
 0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x5f,0x01,0x01
-# GFX12: v_sat_pk_u8_i16_e64_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x5f,0x01,0x01]
+# GFX12-REAL16: v_sat_pk_u8_i16_e64_dpp v5.l, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x5f,0x01,0x01]
+# GFX12-FAKE16: v_sat_pk_u8_i16_e64_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x5f,0x01,0x01]
 
 0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x60,0x01,0x13
-# GFX12: v_sat_pk_u8_i16_e64_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x60,0x01,0x13]
+# GFX12-REAL16: v_sat_pk_u8_i16_e64_dpp v5.l, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x60,0x01,0x13]
+# GFX12-FAKE16: v_sat_pk_u8_i16_e64_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x60,0x01,0x13]
 
 0xff,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x0d,0x30
-# GFX12: v_sat_pk_u8_i16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x0d,0x30]
+# GFX12-REAL16: v_sat_pk_u8_i16_e64_dpp v255.l, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x0d,0x30]
+# GFX12-FAKE16: v_sat_pk_u8_i16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x0d,0x30]
+
+0xff,0x40,0xe2,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x0d,0x30
+# GFX12-REAL16: v_sat_pk_u8_i16_e64_dpp v255.h, v255 op_sel:[0,1] row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x40,0xe2,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x0d,0x30]
+# GFX12-FAKE16: v_sat_pk_u8_i16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x00,0xe2,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x0d,0x30]
 
 0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff
-# GFX12: v_sin_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
+# GFX12-REAL16: v_sin_f16_e64_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
+# GFX12-FAKE16: v_sin_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
 
 0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff
-# GFX12: v_sin_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
+# GFX12-REAL16: v_sin_f16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
+# GFX12-FAKE16: v_sin_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
 
 0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff
-# GFX12: v_sin_f16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
+# GFX12-REAL16: v_sin_f16_e64_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
+# GFX12-FAKE16: v_sin_f16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
 
 0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff
-# GFX12: v_sin_f16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff]
+# GFX12-REAL16: v_sin_f16_e64_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff]
+# GFX12-FAKE16: v_sin_f16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff]
 
 0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff
-# GFX12: v_sin_f16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff]
+# GFX12-REAL16: v_sin_f16_e64_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff]
+# GFX12-FAKE16: v_sin_f16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff]
 
 0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff
-# GFX12: v_sin_f16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff]
+# GFX12-REAL16: v_sin_f16_e64_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff]
+# GFX12-FAKE16: v_sin_f16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff]
 
 0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff
-# GFX12: v_sin_f16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff]
+# GFX12-REAL16: v_sin_f16_e64_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff]
+# GFX12-FAKE16: v_sin_f16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff]
 
 0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff
-# GFX12: v_sin_f16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]
+# GFX12-REAL16: v_sin_f16_e64_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]
+# GFX12-FAKE16: v_sin_f16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]
 
 0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff
-# GFX12: v_sin_f16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff]
+# GFX12-REAL16: v_sin_f16_e64_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff]
+# GFX12-FAKE16: v_sin_f16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff]
 
 0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff
-# GFX12: v_sin_f16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff]
+# GFX12-REAL16: v_sin_f16_e64_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff]
+# GFX12-FAKE16: v_sin_f16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff]
 
 0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff
-# GFX12: v_sin_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
+# GFX12-REAL16: v_sin_f16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
+# GFX12-FAKE16: v_sin_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
 
 0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01
-# GFX12: v_sin_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
+# GFX12-REAL16: v_sin_f16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
+# GFX12-FAKE16: v_sin_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
 
 0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13
-# GFX12: v_sin_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13]
+# GFX12-REAL16: v_sin_f16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13]
+# GFX12-FAKE16: v_sin_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13]
 
 0xff,0x81,0xe0,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30
-# GFX12: v_sin_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xe0,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30]
+# GFX12-REAL16: v_sin_f16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xe0,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30]
+# GFX12-FAKE16: v_sin_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xe0,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30]
+
+0x05,0x48,0xe0,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01
+# GFX12-REAL16: v_sin_f16_e64_dpp v5.h, v1.h op_sel:[1,1] mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x48,0xe0,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
+# GFX12-FAKE16: v_sin_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
+
+0x05,0x08,0xe0,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13
+# GFX12-REAL16: v_sin_f16_e64_dpp v5.l, v1.h op_sel:[1,0] mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x08,0xe0,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13]
+# GFX12-FAKE16: v_sin_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xe0,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13]
+
+0xff,0xc1,0xe0,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30
+# GFX12-REAL16: v_sin_f16_e64_dpp v255.h, -|v255.l| op_sel:[0,1] clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc1,0xe0,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30]
+# GFX12-FAKE16: v_sin_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xe0,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30]
 
 0x05,0x00,0xb5,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff
 # GFX12: v_sin_f32_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xb5,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
@@ -2755,46 +2965,72 @@
 # GFX12: v_sqrt_f32_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xb3,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30]
 
 0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff
-# GFX12: v_trunc_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
+# GFX12-REAL16: v_trunc_f16_e64_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
+# GFX12-FAKE16: v_trunc_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
 
 0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff
-# GFX12: v_trunc_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
+# GFX12-REAL16: v_trunc_f16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
+# GFX12-FAKE16: v_trunc_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
 
 0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff
-# GFX12: v_trunc_f16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
+# GFX12-REAL16: v_trunc_f16_e64_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
+# GFX12-FAKE16: v_trunc_f16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
 
 0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff
-# GFX12: v_trunc_f16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff]
+# GFX12-REAL16: v_trunc_f16_e64_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff]
+# GFX12-FAKE16: v_trunc_f16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff]
 
 0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff
-# GFX12: v_trunc_f16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff]
+# GFX12-REAL16: v_trunc_f16_e64_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff]
+# GFX12-FAKE16: v_trunc_f16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff]
 
 0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff
-# GFX12: v_trunc_f16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff]
+# GFX12-REAL16: v_trunc_f16_e64_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff]
+# GFX12-FAKE16: v_trunc_f16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff]
 
 0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff
-# GFX12: v_trunc_f16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff]
+# GFX12-REAL16: v_trunc_f16_e64_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff]
+# GFX12-FAKE16: v_trunc_f16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff]
 
 0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff
-# GFX12: v_trunc_f16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]
+# GFX12-REAL16: v_trunc_f16_e64_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]
+# GFX12-FAKE16: v_trunc_f16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]
 
 0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff
-# GFX12: v_trunc_f16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff]
+# GFX12-REAL16: v_trunc_f16_e64_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff]
+# GFX12-FAKE16: v_trunc_f16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff]
 
 0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff
-# GFX12: v_trunc_f16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff]
+# GFX12-REAL16: v_trunc_f16_e64_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff]
+# GFX12-FAKE16: v_trunc_f16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff]
 
 0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff
-# GFX12: v_trunc_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
+# GFX12-REAL16: v_trunc_f16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
+# GFX12-FAKE16: v_trunc_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
 
 0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01
-# GFX12: v_trunc_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
+# GFX12-REAL16: v_trunc_f16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
+# GFX12-FAKE16: v_trunc_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
 
 0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13
-# GFX12: v_trunc_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13]
+# GFX12-REAL16: v_trunc_f16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13]
+# GFX12-FAKE16: v_trunc_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13]
 
 0xff,0x81,0xdd,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30
-# GFX12: v_trunc_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xdd,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30]
+# GFX12-REAL16: v_trunc_f16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xdd,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30]
+# GFX12-FAKE16: v_trunc_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xdd,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30]
+
+0x05,0x48,0xdd,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01
+# GFX12-REAL16: v_trunc_f16_e64_dpp v5.h, v1.h op_sel:[1,1] mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x48,0xdd,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
+# GFX12-FAKE16: v_trunc_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
+
+0x05,0x08,0xdd,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13
+# GFX12-REAL16: v_trunc_f16_e64_dpp v5.l, v1.h op_sel:[1,0] mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x08,0xdd,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13]
+# GFX12-FAKE16: v_trunc_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0xdd,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x01,0x13]
+
+0xff,0xc1,0xdd,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30
+# GFX12-REAL16: v_trunc_f16_e64_dpp v255.h, -|v255.l| op_sel:[0,1] clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc1,0xdd,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30]
+# GFX12-FAKE16: v_trunc_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x81,0xdd,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x0d,0x30]
 
 0x05,0x00,0xa1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff
 # GFX12: v_trunc_f32_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xa1,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp8.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp8.txt
index 87115b9..7cf415a 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp8.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp8.txt
@@ -49,16 +49,32 @@
 # GFX12: v_clz_i32_u32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x00,0xb9,0xd5,0xea,0x00,0x00,0x00,0xff,0x00,0x00,0x00]
 
 0x05,0x00,0xe1,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05
-# GFX12: v_cos_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe1,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+# GFX12-REAL16: v_cos_f16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe1,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+# GFX12-FAKE16: v_cos_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe1,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
 
 0x05,0x00,0xe1,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05
-# GFX12: v_cos_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe1,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
+# GFX12-REAL16: v_cos_f16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe1,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
+# GFX12-FAKE16: v_cos_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe1,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
 
 0x05,0x00,0xe1,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05
-# GFX12: v_cos_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe1,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
+# GFX12-REAL16: v_cos_f16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe1,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
+# GFX12-FAKE16: v_cos_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe1,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
 
 0xff,0x81,0xe1,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00
-# GFX12: v_cos_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xe1,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
+# GFX12-REAL16: v_cos_f16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xe1,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
+# GFX12-FAKE16: v_cos_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xe1,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
+
+0x05,0x48,0xe1,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05
+# GFX12-REAL16: v_cos_f16_e64_dpp v5.h, v1.h op_sel:[1,1] mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xe1,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
+# GFX12-FAKE16: v_cos_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe1,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
+
+0x05,0x08,0xe1,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05
+# GFX12-REAL16: v_cos_f16_e64_dpp v5.l, v1.h op_sel:[1,0] mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0xe1,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
+# GFX12-FAKE16: v_cos_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe1,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
+
+0xff,0xc1,0xe1,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00
+# GFX12-REAL16: v_cos_f16_e64_dpp v255.h, -|v255.l| op_sel:[0,1] clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc1,0xe1,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
+# GFX12-FAKE16: v_cos_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xe1,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
 
 0x05,0x00,0xb6,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05
 # GFX12: v_cos_f32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xb6,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
@@ -321,10 +337,16 @@
 # GFX12: v_cvt_i32_f32_e64_dpp v255, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0x88,0xd5,0xea,0x00,0x00,0x20,0xff,0x00,0x00,0x00]
 
 0x05,0x00,0xea,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05
-# GFX12: v_cvt_i32_i16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xea,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+# GFX12-REAL16: v_cvt_i32_i16_e64_dpp v5, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xea,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+# GFX12-FAKE16: v_cvt_i32_i16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xea,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
 
 0xff,0x00,0xea,0xd5,0xea,0x00,0x00,0x00,0xff,0x00,0x00,0x00
-# GFX12: v_cvt_i32_i16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x00,0xea,0xd5,0xea,0x00,0x00,0x00,0xff,0x00,0x00,0x00]
+# GFX12-REAL16: v_cvt_i32_i16_e64_dpp v255, v255.l dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x00,0xea,0xd5,0xea,0x00,0x00,0x00,0xff,0x00,0x00,0x00]
+# GFX12-FAKE16: v_cvt_i32_i16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x00,0xea,0xd5,0xea,0x00,0x00,0x00,0xff,0x00,0x00,0x00]
+
+0xff,0x08,0xea,0xd5,0xea,0x00,0x00,0x00,0xff,0x00,0x00,0x00
+# GFX12-REAL16: v_cvt_i32_i16_e64_dpp v255, v255.h op_sel:[1,0] dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x08,0xea,0xd5,0xea,0x00,0x00,0x00,0xff,0x00,0x00,0x00]
+# GFX12-FAKE16: v_cvt_i32_i16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x00,0xea,0xd5,0xea,0x00,0x00,0x00,0xff,0x00,0x00,0x00]
 
 0x05,0x00,0x8c,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05
 # GFX12: v_cvt_nearest_i32_f32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x8c,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
@@ -411,10 +433,16 @@
 # GFX12: v_cvt_u32_f32_e64_dpp v255, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0x87,0xd5,0xea,0x00,0x00,0x20,0xff,0x00,0x00,0x00]
 
 0x05,0x00,0xeb,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05
-# GFX12: v_cvt_u32_u16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xeb,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+# GFX12-REAL16: v_cvt_u32_u16_e64_dpp v5, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xeb,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+# GFX12-FAKE16: v_cvt_u32_u16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xeb,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
 
 0xff,0x00,0xeb,0xd5,0xea,0x00,0x00,0x00,0xff,0x00,0x00,0x00
-# GFX12: v_cvt_u32_u16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x00,0xeb,0xd5,0xea,0x00,0x00,0x00,0xff,0x00,0x00,0x00]
+# GFX12-REAL16: v_cvt_u32_u16_e64_dpp v255, v255.l dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x00,0xeb,0xd5,0xea,0x00,0x00,0x00,0xff,0x00,0x00,0x00]
+# GFX12-FAKE16: v_cvt_u32_u16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x00,0xeb,0xd5,0xea,0x00,0x00,0x00,0xff,0x00,0x00,0x00]
+
+0xff,0x08,0xeb,0xd5,0xea,0x00,0x00,0x00,0xff,0x00,0x00,0x00
+# GFX12-REAL16: v_cvt_u32_u16_e64_dpp v255, v255.h op_sel:[1,0] dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x08,0xeb,0xd5,0xea,0x00,0x00,0x00,0xff,0x00,0x00,0x00]
+# GFX12-FAKE16: v_cvt_u32_u16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x00,0xeb,0xd5,0xea,0x00,0x00,0x00,0xff,0x00,0x00,0x00]
 
 0x05,0x00,0xd8,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05
 # GFX12-REAL16: v_exp_f16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xd8,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
@@ -473,17 +501,34 @@
 # GFX12: v_floor_f32_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xa4,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
 
 0x05,0x00,0xdf,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05
-# GFX12: v_fract_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdf,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+# GFX12-REAL16: v_fract_f16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdf,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+# GFX12-FAKE16: v_fract_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdf,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
 
 0x05,0x00,0xdf,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05
-# GFX12: v_fract_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdf,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
+# GFX12-REAL16: v_fract_f16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdf,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
+# GFX12-FAKE16: v_fract_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdf,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
 
 0x05,0x00,0xdf,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05
-# GFX12: v_fract_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdf,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
+# GFX12-REAL16: v_fract_f16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdf,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
+# GFX12-FAKE16: v_fract_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdf,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
 
 0xff,0x81,0xdf,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00
-# GFX12: v_fract_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xdf,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
+# GFX12-REAL16: v_fract_f16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xdf,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
+# GFX12-FAKE16: v_fract_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xdf,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
+
+0x05,0x48,0xdf,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05
+# GFX12-REAL16: v_fract_f16_e64_dpp v5.h, v1.h op_sel:[1,1] mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xdf,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
+# GFX12-FAKE16: v_fract_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdf,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
 
+0x05,0x08,0xdf,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05
+# GFX12-REAL16: v_fract_f16_e64_dpp v5.l, v1.h op_sel:[1,0] mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0xdf,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
+# GFX12-FAKE16: v_fract_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdf,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
+
+0xff,0xc1,0xdf,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00
+# GFX12-REAL16: v_fract_f16_e64_dpp v255.h, -|v255.l| op_sel:[0,1] clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc1,0xdf,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
+# GFX12-FAKE16: v_fract_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xdf,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
+
+# GFX11: v_fract_f16_e64_dpp v5.h, v1.h op_sel:[1,1] mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xdf,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
 0x05,0x00,0xa0,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05
 # GFX12: v_fract_f32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xa0,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
 
@@ -523,17 +568,32 @@
 # GFX12: v_frexp_exp_i32_f32_e64_dpp v255, -|v255| dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x01,0xbf,0xd5,0xea,0x00,0x00,0x20,0xff,0x00,0x00,0x00]
 
 0x05,0x00,0xd9,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05
-# GFX12: v_frexp_mant_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xd9,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+# GFX12-REAL16: v_frexp_mant_f16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xd9,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+# GFX12-FAKE16: v_frexp_mant_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xd9,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
 
 0x05,0x00,0xd9,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05
-# GFX12: v_frexp_mant_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xd9,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
+# GFX12-REAL16: v_frexp_mant_f16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xd9,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
+# GFX12-FAKE16: v_frexp_mant_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xd9,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
 
 0x05,0x00,0xd9,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05
-# GFX12: v_frexp_mant_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xd9,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
+# GFX12-REAL16: v_frexp_mant_f16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xd9,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
+# GFX12-FAKE16: v_frexp_mant_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xd9,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
 
 0xff,0x81,0xd9,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00
-# GFX12: v_frexp_mant_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xd9,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
-
+# GFX12-REAL16: v_frexp_mant_f16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xd9,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
+# GFX12-FAKE16: v_frexp_mant_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xd9,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
+
+0x05,0x48,0xd9,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05
+# GFX12-REAL16: v_frexp_mant_f16_e64_dpp v5.h, v1.h op_sel:[1,1] mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xd9,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
+# GFX12-FAKE16: v_frexp_mant_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xd9,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
+0x05,0x08,0xd9,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05
+# GFX12-REAL16: v_frexp_mant_f16_e64_dpp v5.l, v1.h op_sel:[1,0] mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0xd9,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
+# GFX12-FAKE16: v_frexp_mant_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xd9,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
+0xff,0xc1,0xd9,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00
+# GFX12-REAL16: v_frexp_mant_f16_e64_dpp v255.h, -|v255.l| op_sel:[0,1] clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc1,0xd9,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
+# GFX12-FAKE16: v_frexp_mant_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xd9,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
+
+# GFX11: v_frexp_mant_f16_e64_dpp v5.h, v1.h op_sel:[1,1] mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xd9,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
 0x05,0x00,0xc0,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05
 # GFX12: v_frexp_mant_f32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xc0,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
 
@@ -587,10 +647,24 @@
 # GFX12: v_movrels_b32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x00,0xc3,0xd5,0xea,0x00,0x00,0x00,0xff,0x00,0x00,0x00]
 
 0x05,0x00,0xe9,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05
-# GFX12: v_not_b16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe9,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+# GFX12-REAL16: v_not_b16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe9,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+# GFX12-FAKE16: v_not_b16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe9,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
 
 0xff,0x00,0xe9,0xd5,0xea,0x00,0x00,0x00,0xff,0x00,0x00,0x00
-# GFX12: v_not_b16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x00,0xe9,0xd5,0xea,0x00,0x00,0x00,0xff,0x00,0x00,0x00]
+# GFX12-REAL16: v_not_b16_e64_dpp v255.l, v255.l dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x00,0xe9,0xd5,0xea,0x00,0x00,0x00,0xff,0x00,0x00,0x00]
+# GFX12-FAKE16: v_not_b16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x00,0xe9,0xd5,0xea,0x00,0x00,0x00,0xff,0x00,0x00,0x00]
+
+0x05,0x48,0xe9,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05
+# GFX12-REAL16: v_not_b16_e64_dpp v5.h, v1.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xe9,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+# GFX12-FAKE16: v_not_b16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe9,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+
+0x05,0x08,0xe9,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05
+# GFX12-REAL16: v_not_b16_e64_dpp v5.l, v1.h op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0xe9,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+# GFX12-FAKE16: v_not_b16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe9,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+
+0xff,0x40,0xe9,0xd5,0xea,0x00,0x00,0x00,0xff,0x00,0x00,0x00
+# GFX12-REAL16: v_not_b16_e64_dpp v255.h, v255.l op_sel:[0,1] dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x40,0xe9,0xd5,0xea,0x00,0x00,0x00,0xff,0x00,0x00,0x00]
+# GFX12-FAKE16: v_not_b16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x00,0xe9,0xd5,0xea,0x00,0x00,0x00,0xff,0x00,0x00,0x00]
 
 0x05,0x00,0xb7,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05
 # GFX12: v_not_b32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xb7,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
@@ -639,16 +713,32 @@
 # GFX12: v_rcp_iflag_f32_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xab,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
 
 0x05,0x00,0xde,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05
-# GFX12: v_rndne_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xde,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+# GFX12-REAL16: v_rndne_f16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xde,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+# GFX12-FAKE16: v_rndne_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xde,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
 
 0x05,0x00,0xde,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05
-# GFX12: v_rndne_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xde,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
+# GFX12-REAL16: v_rndne_f16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xde,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
+# GFX12-FAKE16: v_rndne_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xde,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
 
 0x05,0x00,0xde,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05
-# GFX12: v_rndne_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xde,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
+# GFX12-REAL16: v_rndne_f16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xde,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
+# GFX12-FAKE16: v_rndne_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xde,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
 
 0xff,0x81,0xde,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00
-# GFX12: v_rndne_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xde,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
+# GFX12-REAL16: v_rndne_f16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xde,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
+# GFX12-FAKE16: v_rndne_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xde,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
+
+0x05,0x48,0xde,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05
+# GFX12-REAL16: v_rndne_f16_e64_dpp v5.h, v1.h op_sel:[1,1] mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xde,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
+# GFX12-FAKE16: v_rndne_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xde,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
+
+0x05,0x08,0xde,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05
+# GFX12-REAL16: v_rndne_f16_e64_dpp v5.l, v1.h op_sel:[1,0] mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0xde,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
+# GFX12-FAKE16: v_rndne_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xde,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
+
+0xff,0xc1,0xde,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00
+# GFX12-REAL16: v_rndne_f16_e64_dpp v255.h, -|v255.l| op_sel:[0,1] clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc1,0xde,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
+# GFX12-FAKE16: v_rndne_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xde,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
 
 0x05,0x00,0xa3,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05
 # GFX12: v_rndne_f32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xa3,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
@@ -691,22 +781,44 @@
 # GFX12: v_rsq_f32_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xae,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
 
 0x05,0x00,0xe2,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05
-# GFX12: v_sat_pk_u8_i16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe2,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+# GFX12-REAL16: v_sat_pk_u8_i16_e64_dpp v5.l, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe2,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+# GFX12-FAKE16: v_sat_pk_u8_i16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe2,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
 
 0xff,0x00,0xe2,0xd5,0xea,0x00,0x00,0x00,0xff,0x00,0x00,0x00
-# GFX12: v_sat_pk_u8_i16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x00,0xe2,0xd5,0xea,0x00,0x00,0x00,0xff,0x00,0x00,0x00]
+# GFX12-REAL16: v_sat_pk_u8_i16_e64_dpp v255.l, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x00,0xe2,0xd5,0xea,0x00,0x00,0x00,0xff,0x00,0x00,0x00]
+# GFX12-FAKE16: v_sat_pk_u8_i16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x00,0xe2,0xd5,0xea,0x00,0x00,0x00,0xff,0x00,0x00,0x00]
+
+0xff,0x40,0xe2,0xd5,0xea,0x00,0x00,0x00,0xff,0x00,0x00,0x00
+# GFX12-REAL16: v_sat_pk_u8_i16_e64_dpp v255.h, v255 op_sel:[0,1] dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x40,0xe2,0xd5,0xea,0x00,0x00,0x00,0xff,0x00,0x00,0x00]
+# GFX12-FAKE16: v_sat_pk_u8_i16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x00,0xe2,0xd5,0xea,0x00,0x00,0x00,0xff,0x00,0x00,0x00]
 
 0x05,0x00,0xe0,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05
-# GFX12: v_sin_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe0,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+# GFX12-REAL16: v_sin_f16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe0,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+# GFX12-FAKE16: v_sin_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe0,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
 
 0x05,0x00,0xe0,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05
-# GFX12: v_sin_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe0,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
+# GFX12-REAL16: v_sin_f16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe0,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
+# GFX12-FAKE16: v_sin_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe0,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
 
 0x05,0x00,0xe0,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05
-# GFX12: v_sin_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe0,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
+# GFX12-REAL16: v_sin_f16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe0,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
+# GFX12-FAKE16: v_sin_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe0,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
 
 0xff,0x81,0xe0,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00
-# GFX12: v_sin_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xe0,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
+# GFX12-REAL16: v_sin_f16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xe0,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
+# GFX12-FAKE16: v_sin_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xe0,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
+
+0x05,0x48,0xe0,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05
+# GFX12-REAL16: v_sin_f16_e64_dpp v5.h, v1.h op_sel:[1,1] mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xe0,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
+# GFX12-FAKE16: v_sin_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe0,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
+
+0x05,0x08,0xe0,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05
+# GFX12-REAL16: v_sin_f16_e64_dpp v5.l, v1.h op_sel:[1,0] mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0xe0,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
+# GFX12-FAKE16: v_sin_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xe0,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
+
+0xff,0xc1,0xe0,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00
+# GFX12-REAL16: v_sin_f16_e64_dpp v255.h, -|v255.l| op_sel:[0,1] clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc1,0xe0,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
+# GFX12-FAKE16: v_sin_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xe0,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
 
 0x05,0x00,0xb5,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05
 # GFX12: v_sin_f32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xb5,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
@@ -749,16 +861,32 @@
 # GFX12: v_sqrt_f32_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xb3,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
 
 0x05,0x00,0xdd,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05
-# GFX12: v_trunc_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdd,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+# GFX12-REAL16: v_trunc_f16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdd,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+# GFX12-FAKE16: v_trunc_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdd,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
 
 0x05,0x00,0xdd,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05
-# GFX12: v_trunc_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdd,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
+# GFX12-REAL16: v_trunc_f16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdd,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
+# GFX12-FAKE16: v_trunc_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdd,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
 
 0x05,0x00,0xdd,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05
-# GFX12: v_trunc_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdd,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
+# GFX12-REAL16: v_trunc_f16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdd,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
+# GFX12-FAKE16: v_trunc_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdd,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
 
 0xff,0x81,0xdd,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00
-# GFX12: v_trunc_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xdd,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
+# GFX12-REAL16: v_trunc_f16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xdd,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
+# GFX12-FAKE16: v_trunc_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xdd,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
+
+0x05,0x48,0xdd,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05
+# GFX12-REAL16: v_trunc_f16_e64_dpp v5.h, v1.h op_sel:[1,1] mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xdd,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
+# GFX12-FAKE16: v_trunc_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdd,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
+
+0x05,0x08,0xdd,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05
+# GFX12-REAL16: v_trunc_f16_e64_dpp v5.l, v1.h op_sel:[1,0] mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0xdd,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
+# GFX12-FAKE16: v_trunc_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xdd,0xd5,0xe9,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
+
+0xff,0xc1,0xdd,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00
+# GFX12-REAL16: v_trunc_f16_e64_dpp v255.h, -|v255.l| op_sel:[0,1] clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc1,0xdd,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
+# GFX12-FAKE16: v_trunc_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0xdd,0xd5,0xea,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
 
 0x05,0x00,0xa1,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05
 # GFX12: v_trunc_f32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xa1,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
diff --git a/llvm/test/MC/Disassembler/Xtensa/code_density.txt b/llvm/test/MC/Disassembler/Xtensa/code_density.txt
index eac236a..833dd52 100644
--- a/llvm/test/MC/Disassembler/Xtensa/code_density.txt
+++ b/llvm/test/MC/Disassembler/Xtensa/code_density.txt
@@ -1,4 +1,5 @@
-# RUN: llvm-mc -triple=xtensa -mattr=+density -disassemble %s | FileCheck --check-prefix=CHECK-DENSITY %s
+# RUN: llvm-mc -triple=xtensa -mattr=+density -disassemble %s | FileCheck -check-prefixes=CHECK-DENSITY %s
+# RUN: not llvm-mc -triple=xtensa -disassemble %s 2>&1 | FileCheck --implicit-check-not=warning: -check-prefixes=CHECK-CORE %s
 
 #------------------------------------------------------------------------------
 # Verify that binary code is correctly disassembled with
@@ -6,57 +7,46 @@
 # density option generates warnings.
 #------------------------------------------------------------------------------
 
-0x4a 0x23
+[0x4a, 0x23]
 # CHECK-DENSITY: add.n  a2, a3, a4
-# CHECK-CORE: [[#@LINE-2]]:1: warning: invalid instruction encoding
-# CHECK-CORE: [[#@LINE-3]]:6: warning: invalid instruction encoding
+# CHECK-CORE: [[#@LINE-2]]:2: warning: invalid instruction encoding
 
-0x3b 0x23
+[0x3b, 0x23]
 # CHECK-DENSITY: addi.n a2, a3, 3
-# CHECK-CORE: [[#@LINE-2]]:1: warning: invalid instruction encoding
-# CHECK-CORE: [[#@LINE-3]]:6: warning: invalid instruction encoding
+# CHECK-CORE: [[#@LINE-2]]:2: warning: invalid instruction encoding
 
-0x9c 0x03
+[0x9c, 0x03]
 # CHECK-DENSITY: beqz.n a3, . +20
-# CHECK-CORE: [[#@LINE-2]]:1: warning: invalid instruction encoding
-# CHECK-CORE: [[#@LINE-3]]:6: warning: invalid instruction encoding
+# CHECK-CORE: [[#@LINE-2]]:2: warning: invalid instruction encoding
 
-0xcc 0xe3
+[0xcc, 0xe3]
 # CHECK-DENSITY: bnez.n a3, . +18
-# CHECK-CORE: [[#@LINE-2]]:1: warning: invalid instruction encoding
-# CHECK-CORE: [[#@LINE-3]]:6: warning: invalid instruction encoding
+# CHECK-CORE: [[#@LINE-2]]:2: warning: invalid instruction encoding
 
-0x6d 0xf0
+[0x6d, 0xf0]
 # CHECK-DENSITY: ill.n
-# CHECK-CORE: [[#@LINE-2]]:1: warning: invalid instruction encoding
-# CHECK-CORE: [[#@LINE-3]]:6: warning: invalid instruction encoding
+# CHECK-CORE: [[#@LINE-2]]:2: warning: invalid instruction encoding
 
-0x28 0x33
+[0x28, 0x33]
 # CHECK-DENSITY: l32i.n a2, a3, 12
-# CHECK-CORE: [[#@LINE-2]]:1: warning: invalid instruction encoding
-# CHECK-CORE: [[#@LINE-3]]:6: warning: invalid instruction encoding
+# CHECK-CORE: [[#@LINE-2]]:2: warning: invalid instruction encoding
 
-0x2d 0x03
+[0x2d, 0x03]
 # CHECK-DENSITY: mov.n  a2, a3
-# CHECK-CORE: [[#@LINE-2]]:1: warning: invalid instruction encoding
-# CHECK-CORE: [[#@LINE-3]]:6: warning: invalid instruction encoding
+# CHECK-CORE: [[#@LINE-2]]:2: warning: invalid instruction encoding
 
-0x0d 0xf0
+[0x0d, 0xf0]
 # CHECK-DENSITY: ret.n
-# CHECK-CORE: [[#@LINE-2]]:1: warning: invalid instruction encoding
-# CHECK-CORE: [[#@LINE-3]]:6: warning: invalid instruction encoding
+# CHECK-CORE: [[#@LINE-2]]:2: warning: invalid instruction encoding
 
-0x29 0x33
+[0x29, 0x33]
 # CHECK-DENSITY: s32i.n a2, a3, 12
-# CHECK-CORE: [[#@LINE-2]]:1: warning: invalid instruction encoding
-# CHECK-CORE: [[#@LINE-3]]:6: warning: invalid instruction encoding
+# CHECK-CORE: [[#@LINE-2]]:2: warning: invalid instruction encoding
 
-0x6c 0x02
+[0x6c, 0x02]
 # CHECK-DENSITY: movi.n a2, -32
-# CHECK-CORE: [[#@LINE-2]]:1: warning: invalid instruction encoding
-# CHECK-CORE: [[#@LINE-3]]:6: warning: invalid instruction encoding
+# CHECK-CORE: [[#@LINE-2]]:2: warning: invalid instruction encoding
 
-0x3d 0xf0
+[0x3d, 0xf0]
 # CHECK-DENSITY: nop.n
-# CHECK-CORE: [[#@LINE-2]]:1: warning: invalid instruction encoding
-# CHECK-CORE: [[#@LINE-3]]:6: warning: invalid instruction encoding
+# CHECK-CORE: [[#@LINE-2]]:2: warning: invalid instruction encoding
diff --git a/llvm/test/MC/RISCV/XVentanaCondOps-valid.s b/llvm/test/MC/RISCV/XVentanaCondOps-valid.s
index 8f4eba5..9825210 100644
--- a/llvm/test/MC/RISCV/XVentanaCondOps-valid.s
+++ b/llvm/test/MC/RISCV/XVentanaCondOps-valid.s
@@ -1,4 +1,4 @@
-# RUN: llvm-mc %s -triple=riscv64 -mattr=+xventanacondops -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv64 -mattr=+xventanacondops -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s
 # RUN: llvm-mc -filetype=obj -triple=riscv64 -mattr=+xventanacondops < %s \
 # RUN:     | llvm-objdump --mattr=+xventanacondops -M no-aliases -d -r - \
diff --git a/llvm/test/MC/RISCV/Zawrs-valid.s b/llvm/test/MC/RISCV/Zawrs-valid.s
index 0bdc570..29e95d3 100644
--- a/llvm/test/MC/RISCV/Zawrs-valid.s
+++ b/llvm/test/MC/RISCV/Zawrs-valid.s
@@ -1,6 +1,6 @@
-# RUN: llvm-mc %s -triple=riscv32 -mattr=+zawrs -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv32 -mattr=+zawrs -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s
-# RUN: llvm-mc %s -triple=riscv64 -mattr=+zawrs -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv64 -mattr=+zawrs -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s
 # RUN: llvm-mc -filetype=obj -triple=riscv32 -mattr=+zawrs < %s \
 # RUN:     | llvm-objdump --mattr=+zawrs -M no-aliases -d -r - \
diff --git a/llvm/test/MC/RISCV/Ztso.s b/llvm/test/MC/RISCV/Ztso.s
index 06b1030..af61355 100644
--- a/llvm/test/MC/RISCV/Ztso.s
+++ b/llvm/test/MC/RISCV/Ztso.s
@@ -1,5 +1,5 @@
-# RUN: llvm-mc %s -triple=riscv64 -mattr=+ztso -riscv-no-aliases 2>&1 | FileCheck %s
-# RUN: llvm-mc %s -triple=riscv32 -mattr=+ztso -riscv-no-aliases 2>&1 | FileCheck %s
+# RUN: llvm-mc %s -triple=riscv64 -mattr=+ztso -M no-aliases 2>&1 | FileCheck %s
+# RUN: llvm-mc %s -triple=riscv32 -mattr=+ztso -M no-aliases 2>&1 | FileCheck %s
 
 # Note: Ztso doesn't add or remove any instructions, so this is basically
 # just checking that a) we accepted the attribute name, and b) codegen did
diff --git a/llvm/test/MC/RISCV/attribute-arch.s b/llvm/test/MC/RISCV/attribute-arch.s
index 6ffaa62..4e77a53 100644
--- a/llvm/test/MC/RISCV/attribute-arch.s
+++ b/llvm/test/MC/RISCV/attribute-arch.s
@@ -467,3 +467,9 @@
 
 .attribute arch, "rv32i_ssctr1p0"
 # CHECK: attribute      5, "rv32i2p1_sscsrind1p0_ssctr1p0"
+
+.attribute arch, "rv32i_sdext1p0"
+# CHECK: attribute      5, "rv32i2p1_sdext1p0"
+
+.attribute arch, "rv32i_sdtrig1p0"
+# CHECK: attribute      5, "rv32i2p1_sdtrig1p0"
diff --git a/llvm/test/MC/RISCV/compress-cjal.s b/llvm/test/MC/RISCV/compress-cjal.s
index d55586b..cdb6e85 100644
--- a/llvm/test/MC/RISCV/compress-cjal.s
+++ b/llvm/test/MC/RISCV/compress-cjal.s
@@ -1,7 +1,7 @@
 # RUN: llvm-mc -triple riscv32 -mattr=+c -show-encoding < %s \
 # RUN:   | FileCheck -check-prefixes=CHECK,CHECK-ALIAS %s
 # RUN: llvm-mc -triple riscv32 -mattr=+c -show-encoding \
-# RUN:   -riscv-no-aliases < %s | FileCheck -check-prefixes=CHECK,CHECK-INST %s
+# RUN:   -M no-aliases < %s | FileCheck -check-prefixes=CHECK,CHECK-INST %s
 # RUN: llvm-mc -triple riscv32 -mattr=+c -filetype=obj < %s \
 # RUN:   | llvm-objdump  --triple=riscv32 --mattr=+c -d - \
 # RUN:   | FileCheck -check-prefixes=CHECK-BYTES,CHECK-ALIASOBJ %s
diff --git a/llvm/test/MC/RISCV/compress-debug-info.s b/llvm/test/MC/RISCV/compress-debug-info.s
index 70aaefb..134c3d0 100644
--- a/llvm/test/MC/RISCV/compress-debug-info.s
+++ b/llvm/test/MC/RISCV/compress-debug-info.s
@@ -1,6 +1,6 @@
-# RUN: llvm-mc -triple riscv32 -mattr=+c %s -g -o - -riscv-no-aliases \
+# RUN: llvm-mc -triple riscv32 -mattr=+c %s -g -o - -M no-aliases \
 # RUN:   | FileCheck %s -check-prefixes=COMPRESS,BOTH
-# RUN: llvm-mc -triple riscv32 %s -g -o - -riscv-no-aliases \
+# RUN: llvm-mc -triple riscv32 %s -g -o - -M no-aliases \
 # RUN:   | FileCheck %s -check-prefixes=UNCOMPRESS,BOTH
 
 
diff --git a/llvm/test/MC/RISCV/compress-rv32d.s b/llvm/test/MC/RISCV/compress-rv32d.s
index c41a088..2bfae21 100644
--- a/llvm/test/MC/RISCV/compress-rv32d.s
+++ b/llvm/test/MC/RISCV/compress-rv32d.s
@@ -1,7 +1,7 @@
 # RUN: llvm-mc -triple riscv32 -mattr=+c,+d -show-encoding < %s \
 # RUN:   | FileCheck -check-prefixes=CHECK,CHECK-ALIAS %s
 # RUN: llvm-mc -triple riscv32 -mattr=+c,+d -show-encoding \
-# RUN:   -riscv-no-aliases < %s | FileCheck -check-prefixes=CHECK,CHECK-INST %s
+# RUN:   -M no-aliases < %s | FileCheck -check-prefixes=CHECK,CHECK-INST %s
 # RUN: llvm-mc -triple riscv32 -mattr=+c,+d -filetype=obj < %s \
 # RUN:   | llvm-objdump --no-print-imm-hex --triple=riscv32 --mattr=+c,+d -d - \
 # RUN:   | FileCheck -check-prefixes=CHECK-BYTES,CHECK-ALIAS %s
@@ -11,7 +11,7 @@
 # RUN: llvm-mc -triple riscv32 -mattr=+zcd,+d -show-encoding < %s \
 # RUN:   | FileCheck -check-prefixes=CHECK,CHECK-ALIAS %s
 # RUN: llvm-mc -triple riscv32 -mattr=+zcd,+d -show-encoding \
-# RUN:   -riscv-no-aliases < %s | FileCheck -check-prefixes=CHECK,CHECK-INST %s
+# RUN:   -M no-aliases < %s | FileCheck -check-prefixes=CHECK,CHECK-INST %s
 # RUN: llvm-mc -triple riscv32 -mattr=+zcd,+d -filetype=obj < %s \
 # RUN:   | llvm-objdump --no-print-imm-hex --triple=riscv32 --mattr=+zcd,+d -d - \
 # RUN:   | FileCheck -check-prefixes=CHECK-BYTES,CHECK-ALIAS %s
@@ -22,7 +22,7 @@
 # RUN: llvm-mc -triple riscv64 -mattr=+c,+d -show-encoding < %s \
 # RUN:   | FileCheck -check-prefixes=CHECK-ALIAS %s
 # RUN: llvm-mc -triple riscv64 -mattr=+c,+d -show-encoding \
-# RUN:   -riscv-no-aliases < %s | FileCheck -check-prefixes=CHECK-INST %s
+# RUN:   -M no-aliases < %s | FileCheck -check-prefixes=CHECK-INST %s
 # RUN: llvm-mc -triple riscv64 -mattr=+c,+d -filetype=obj < %s \
 # RUN:   | llvm-objdump --no-print-imm-hex --triple=riscv64 --mattr=+c,+d -d - \
 # RUN:   | FileCheck -check-prefixes=CHECK-BYTES,CHECK-ALIAS %s
@@ -32,7 +32,7 @@
 # RUN: llvm-mc -triple riscv64 -mattr=+zcd,+d -show-encoding < %s \
 # RUN:   | FileCheck -check-prefixes=CHECK-ALIAS %s
 # RUN: llvm-mc -triple riscv64 -mattr=+zcd,+d -show-encoding \
-# RUN:   -riscv-no-aliases < %s | FileCheck -check-prefixes=CHECK-INST %s
+# RUN:   -M no-aliases < %s | FileCheck -check-prefixes=CHECK-INST %s
 # RUN: llvm-mc -triple riscv64 -mattr=+zcd,+d -filetype=obj < %s \
 # RUN:   | llvm-objdump --no-print-imm-hex --triple=riscv64 --mattr=+zcd,+d -d - \
 # RUN:   | FileCheck -check-prefixes=CHECK-BYTES,CHECK-ALIAS %s
diff --git a/llvm/test/MC/RISCV/compress-rv32f.s b/llvm/test/MC/RISCV/compress-rv32f.s
index afe15c5..5fc3f41 100644
--- a/llvm/test/MC/RISCV/compress-rv32f.s
+++ b/llvm/test/MC/RISCV/compress-rv32f.s
@@ -1,7 +1,7 @@
 # RUN: llvm-mc -triple riscv32 -mattr=+c,+f -show-encoding < %s \
 # RUN:   | FileCheck -check-prefixes=CHECK,CHECK-ALIAS %s
 # RUN: llvm-mc -triple riscv32 -mattr=+c,+f -show-encoding \
-# RUN:   -riscv-no-aliases < %s | FileCheck -check-prefixes=CHECK,CHECK-INST %s
+# RUN:   -M no-aliases < %s | FileCheck -check-prefixes=CHECK,CHECK-INST %s
 # RUN: llvm-mc -triple riscv32 -mattr=+c,+f -filetype=obj < %s \
 # RUN:   | llvm-objdump  --triple=riscv32 --mattr=+c,+f --no-print-imm-hex -d - \
 # RUN:   | FileCheck -check-prefixes=CHECK-BYTES,CHECK-ALIAS %s
@@ -11,7 +11,7 @@
 # RUN: llvm-mc -triple riscv32 -mattr=+zcf,+f -show-encoding < %s \
 # RUN:   | FileCheck -check-prefixes=CHECK,CHECK-ALIAS %s
 # RUN: llvm-mc -triple riscv32 -mattr=+zcf,+f -show-encoding \
-# RUN:   -riscv-no-aliases < %s | FileCheck -check-prefixes=CHECK,CHECK-INST %s
+# RUN:   -M no-aliases < %s | FileCheck -check-prefixes=CHECK,CHECK-INST %s
 # RUN: llvm-mc -triple riscv32 -mattr=+zcf,+f -filetype=obj < %s \
 # RUN:   | llvm-objdump  --triple=riscv32 --mattr=+zcf,+f --no-print-imm-hex -d - \
 # RUN:   | FileCheck -check-prefixes=CHECK-BYTES,CHECK-ALIAS %s
diff --git a/llvm/test/MC/RISCV/compress-rv32i.s b/llvm/test/MC/RISCV/compress-rv32i.s
index a75bea3..5a28128 100644
--- a/llvm/test/MC/RISCV/compress-rv32i.s
+++ b/llvm/test/MC/RISCV/compress-rv32i.s
@@ -1,7 +1,7 @@
 # RUN: llvm-mc -triple riscv32 -mattr=+c -show-encoding < %s \
 # RUN:   | FileCheck -check-prefixes=CHECK,CHECK-ALIAS,CHECK-ALIASASM %s
 # RUN: llvm-mc -triple riscv32 -mattr=+c -show-encoding \
-# RUN:   -riscv-no-aliases < %s | FileCheck -check-prefixes=CHECK,CHECK-INST,CHECK-INSTASM %s
+# RUN:   -M no-aliases < %s | FileCheck -check-prefixes=CHECK,CHECK-INST,CHECK-INSTASM %s
 # RUN: llvm-mc -triple riscv32 -mattr=+c -filetype=obj < %s \
 # RUN:   | llvm-objdump  --triple=riscv32 --mattr=+c --no-print-imm-hex -d - \
 # RUN:   | FileCheck -check-prefixes=CHECK-BYTES,CHECK-ALIAS,CHECK-ALIASOBJ32 %s
@@ -12,7 +12,7 @@
 # RUN: llvm-mc -triple riscv64 -mattr=+c -show-encoding < %s \
 # RUN:   | FileCheck -check-prefixes=CHECK-ALIAS,CHECK-ALIASASM %s
 # RUN: llvm-mc -triple riscv64 -mattr=+c -show-encoding \
-# RUN:   -riscv-no-aliases < %s | FileCheck -check-prefixes=CHECK-INST,CHECK-INSTASM %s
+# RUN:   -M no-aliases < %s | FileCheck -check-prefixes=CHECK-INST,CHECK-INSTASM %s
 # RUN: llvm-mc -triple riscv64 -mattr=+c -filetype=obj < %s \
 # RUN:   | llvm-objdump  --triple=riscv64 --mattr=+c --no-print-imm-hex -d - \
 # RUN:   | FileCheck -check-prefixes=CHECK-BYTES,CHECK-ALIAS,CHECK-ALIASOBJ64 %s
diff --git a/llvm/test/MC/RISCV/compress-rv64i.s b/llvm/test/MC/RISCV/compress-rv64i.s
index ab5b243..31eb4d9 100644
--- a/llvm/test/MC/RISCV/compress-rv64i.s
+++ b/llvm/test/MC/RISCV/compress-rv64i.s
@@ -1,7 +1,7 @@
 # RUN: llvm-mc -triple riscv64 -mattr=+c -show-encoding < %s \
 # RUN:   | FileCheck -check-prefixes=CHECK-ALIAS %s
 # RUN: llvm-mc -triple riscv64 -mattr=+c -show-encoding \
-# RUN:   -riscv-no-aliases < %s | FileCheck -check-prefixes=CHECK-INST %s
+# RUN:   -M no-aliases < %s | FileCheck -check-prefixes=CHECK-INST %s
 # RUN: llvm-mc -triple riscv64 -mattr=+c -filetype=obj < %s \
 # RUN:   | llvm-objdump  --triple=riscv64 --mattr=+c --no-print-imm-hex -d - \
 # RUN:   | FileCheck -check-prefixes=CHECK-BYTES,CHECK-ALIAS %s
diff --git a/llvm/test/MC/RISCV/compressed-relocations.s b/llvm/test/MC/RISCV/compressed-relocations.s
index c7117ab..196d987 100644
--- a/llvm/test/MC/RISCV/compressed-relocations.s
+++ b/llvm/test/MC/RISCV/compressed-relocations.s
@@ -1,4 +1,4 @@
-# RUN: llvm-mc -triple riscv32 -mattr=+c -riscv-no-aliases < %s -show-encoding \
+# RUN: llvm-mc -triple riscv32 -mattr=+c -M no-aliases < %s -show-encoding \
 # RUN:     | FileCheck -check-prefix=INSTR -check-prefix=FIXUP %s
 # RUN: llvm-mc -filetype=obj -triple riscv32 -mattr=+c < %s \
 # RUN:     | llvm-readobj -r - | FileCheck -check-prefix=RELOC %s
diff --git a/llvm/test/MC/RISCV/compressed-zicfiss.s b/llvm/test/MC/RISCV/compressed-zicfiss.s
index 2ebf9d3..7d387b2 100644
--- a/llvm/test/MC/RISCV/compressed-zicfiss.s
+++ b/llvm/test/MC/RISCV/compressed-zicfiss.s
@@ -1,15 +1,15 @@
-# RUN: llvm-mc %s -triple=riscv32 -mattr=+experimental-zicfiss,+zcmop -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv32 -mattr=+experimental-zicfiss,+zcmop -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s
 # RUN: llvm-mc -filetype=obj -triple=riscv32 -mattr=+experimental-zicfiss,+zcmop < %s \
 # RUN:     | llvm-objdump --mattr=+experimental-zicfiss,+zcmop -M no-aliases -d -r - \
 # RUN:     | FileCheck --check-prefix=CHECK-ASM-AND-OBJ %s
-# RUN: llvm-mc %s -triple=riscv64 -mattr=+experimental-zicfiss,+zcmop -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv64 -mattr=+experimental-zicfiss,+zcmop -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s
 # RUN: llvm-mc -filetype=obj -triple=riscv64 -mattr=+experimental-zicfiss,+zcmop < %s \
 # RUN:     | llvm-objdump --mattr=+experimental-zicfiss,+zcmop -M no-aliases -d -r - \
 # RUN:     | FileCheck --check-prefix=CHECK-ASM-AND-OBJ %s
 #
-# RUN: not llvm-mc -triple riscv32 -riscv-no-aliases -show-encoding < %s 2>&1 \
+# RUN: not llvm-mc -triple riscv32 -M no-aliases -show-encoding < %s 2>&1 \
 # RUN:     | FileCheck -check-prefixes=CHECK-NO-EXT %s
 
 # CHECK-ASM-AND-OBJ: c.sspopchk t0
diff --git a/llvm/test/MC/RISCV/corev/XCValu-valid.s b/llvm/test/MC/RISCV/corev/XCValu-valid.s
index 1c74e36..2636b34 100644
--- a/llvm/test/MC/RISCV/corev/XCValu-valid.s
+++ b/llvm/test/MC/RISCV/corev/XCValu-valid.s
@@ -1,4 +1,4 @@
-# RUN: llvm-mc -triple=riscv32 --mattr=+xcvalu -riscv-no-aliases -show-encoding %s \
+# RUN: llvm-mc -triple=riscv32 --mattr=+xcvalu -M no-aliases -show-encoding %s \
 # RUN:        | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INSTR
 # RUN: llvm-mc -filetype=obj -triple=riscv32 -mattr=+xcvalu < %s \
 # RUN:     | llvm-objdump --mattr=+xcvalu --no-print-imm-hex -M no-aliases -d -r - \
diff --git a/llvm/test/MC/RISCV/corev/XCVmac-valid.s b/llvm/test/MC/RISCV/corev/XCVmac-valid.s
index 93ed9d7..a795bc3 100644
--- a/llvm/test/MC/RISCV/corev/XCVmac-valid.s
+++ b/llvm/test/MC/RISCV/corev/XCVmac-valid.s
@@ -1,4 +1,4 @@
-# RUN: llvm-mc -triple=riscv32 --mattr=+xcvmac -riscv-no-aliases -show-encoding %s \
+# RUN: llvm-mc -triple=riscv32 --mattr=+xcvmac -M no-aliases -show-encoding %s \
 # RUN:        | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INSTR
 # RUN: llvm-mc -filetype=obj -triple=riscv32 -mattr=+xcvmac < %s \
 # RUN:     | llvm-objdump --mattr=+xcvmac --no-print-imm-hex -M no-aliases -d -r - \
diff --git a/llvm/test/MC/RISCV/custom_reloc.s b/llvm/test/MC/RISCV/custom_reloc.s
index 4bd4700..cdb8194 100644
--- a/llvm/test/MC/RISCV/custom_reloc.s
+++ b/llvm/test/MC/RISCV/custom_reloc.s
@@ -21,16 +21,33 @@
   .reloc ., R_RISCV_VENDOR,    VENDOR_NAME
   .reloc ., R_RISCV_CUSTOM192, my_foo + 1
   addi a0, a0, 0
-  # CHECK-ASM: [[L1:.L[^:]+]]:
+  # CHECK-ASM:      [[L1:.L[^:]+]]:
   # CHECK-ASM-NEXT: .reloc [[L1]], R_RISCV_VENDOR, VENDOR_NAME
   # CHECK-ASM-NEXT: [[L2:.L[^:]+]]:
   # CHECK-ASM-NEXT: .reloc [[L2]], R_RISCV_CUSTOM192, my_foo+1
   # CHECK-ASM-NEXT: mv a0, a0
 
-  # CHECK-OBJ: addi a0, a0, 0
+  # CHECK-OBJ:      addi a0, a0, 0
   # CHECK-OBJ-NEXT: R_RISCV_VENDOR    VENDOR_NAME
   # CHECK-OBJ-NEXT: R_RISCV_CUSTOM192 my_foo+0x1
 
   nop
   # CHECK-ASM: nop
   # CHECK-OBJ: addi zero, zero, 0x0
+
+  .reloc ., R_RISCV_VENDOR,     QUALCOMM
+  .reloc ., R_RISCV_QC_ABS20_U, my_bar + 2
+  addi a1, a1, 0
+  # CHECK-ASM:      [[L3:.L[^:]+]]:
+  # CHECK-ASM-NEXT: .reloc [[L3]], R_RISCV_VENDOR, QUALCOMM
+  # CHECK-ASM-NEXT: [[L4:.L[^:]+]]:
+  # CHECK-ASM-NEXT: .reloc [[L4]], R_RISCV_QC_ABS20_U, my_bar+2
+  # CHECK-ASM-NEXT: mv a1, a1
+
+  # CHECK-OBJ:      addi a1, a1, 0
+  # CHECK-OBJ-NEXT: R_RISCV_VENDOR    QUALCOMM
+  # CHECK-OBJ-NEXT: R_RISCV_CUSTOM192 my_bar+0x2
+
+  nop
+  # CHECK-ASM: nop
+  # CHECK-OBJ: addi zero, zero, 0x0
diff --git a/llvm/test/MC/RISCV/debug-valid.s b/llvm/test/MC/RISCV/debug-valid.s
index 89b8f00..44dd47d2 100644
--- a/llvm/test/MC/RISCV/debug-valid.s
+++ b/llvm/test/MC/RISCV/debug-valid.s
@@ -1,6 +1,6 @@
-# RUN: llvm-mc %s -triple=riscv32 -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv32 -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK,CHECK-INST %s
-# RUN: llvm-mc %s -triple=riscv64 -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv64 -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK,CHECK-INST %s
 # RUN: llvm-mc -filetype=obj -triple riscv32 < %s \
 # RUN:     | llvm-objdump -M no-aliases -d - \
diff --git a/llvm/test/MC/RISCV/deprecated-csr-names.s b/llvm/test/MC/RISCV/deprecated-csr-names.s
index e895732..3bb104a 100644
--- a/llvm/test/MC/RISCV/deprecated-csr-names.s
+++ b/llvm/test/MC/RISCV/deprecated-csr-names.s
@@ -1,10 +1,10 @@
-# RUN: llvm-mc -triple riscv32 -riscv-no-aliases -show-encoding %s \
+# RUN: llvm-mc -triple riscv32 -M no-aliases -show-encoding %s \
 # RUN:     | FileCheck -check-prefixes CHECK-INST,CHECK-ENC %s
 # RUN: llvm-mc -filetype obj -triple riscv32 %s \
 # RUN:     | llvm-objdump -d - \
 # RUN:     | FileCheck -check-prefix=CHECK-INST-ALIAS %s
 
-# RUN: llvm-mc -triple riscv64 -riscv-no-aliases -show-encoding %s \
+# RUN: llvm-mc -triple riscv64 -M no-aliases -show-encoding %s \
 # RUN:     | FileCheck -check-prefixes CHECK-INST,CHECK-ENC %s
 # RUN: llvm-mc -filetype obj -triple riscv64 %s \
 # RUN:     | llvm-objdump -d - \
diff --git a/llvm/test/MC/RISCV/fixups-binary-expression.s b/llvm/test/MC/RISCV/fixups-binary-expression.s
index dc1de5d..325a54c 100644
--- a/llvm/test/MC/RISCV/fixups-binary-expression.s
+++ b/llvm/test/MC/RISCV/fixups-binary-expression.s
@@ -1,4 +1,4 @@
-# RUN: llvm-mc -triple riscv32 -mattr=+c -riscv-no-aliases < %s -show-encoding \
+# RUN: llvm-mc -triple riscv32 -mattr=+c -M no-aliases < %s -show-encoding \
 # RUN:     | FileCheck -check-prefix=CHECK-FIXUP %s
 # RUN: llvm-mc -filetype=obj -triple riscv32 -mattr=+c < %s \
 # RUN:     | llvm-objdump -M no-aliases -d - \
diff --git a/llvm/test/MC/RISCV/fixups.s b/llvm/test/MC/RISCV/fixups.s
index d0682ed..5145dbe 100644
--- a/llvm/test/MC/RISCV/fixups.s
+++ b/llvm/test/MC/RISCV/fixups.s
@@ -1,4 +1,4 @@
-# RUN: llvm-mc -triple riscv32 -riscv-no-aliases < %s -show-encoding \
+# RUN: llvm-mc -triple riscv32 -M no-aliases < %s -show-encoding \
 # RUN:     | FileCheck -check-prefix=CHECK-FIXUP %s
 # RUN: llvm-mc -filetype=obj -triple riscv32 < %s \
 # RUN:     | llvm-objdump --no-print-imm-hex -M no-aliases -d - \
diff --git a/llvm/test/MC/RISCV/fp-default-rounding-mode.s b/llvm/test/MC/RISCV/fp-default-rounding-mode.s
index c918920..88b681a 100644
--- a/llvm/test/MC/RISCV/fp-default-rounding-mode.s
+++ b/llvm/test/MC/RISCV/fp-default-rounding-mode.s
@@ -1,4 +1,4 @@
-# RUN: llvm-mc %s -triple=riscv64 -mattr=+d,+zfh,+zfbfmin -riscv-no-aliases \
+# RUN: llvm-mc %s -triple=riscv64 -mattr=+d,+zfh,+zfbfmin -M no-aliases \
 # RUN:     | FileCheck -check-prefixes=CHECK-INST %s
 # RUN: llvm-mc %s -triple=riscv64 -mattr=+d,+zfh,+zfbfmin \
 # RUN:     | FileCheck -check-prefixes=CHECK-ALIAS %s
diff --git a/llvm/test/MC/RISCV/fp-inx-default-rounding-mode.s b/llvm/test/MC/RISCV/fp-inx-default-rounding-mode.s
index d2764ce..2b628e1 100644
--- a/llvm/test/MC/RISCV/fp-inx-default-rounding-mode.s
+++ b/llvm/test/MC/RISCV/fp-inx-default-rounding-mode.s
@@ -1,4 +1,4 @@
-# RUN: llvm-mc %s -triple=riscv64 -mattr=+zdinx,+zhinx -riscv-no-aliases \
+# RUN: llvm-mc %s -triple=riscv64 -mattr=+zdinx,+zhinx -M no-aliases \
 # RUN:     | FileCheck -check-prefixes=CHECK-INST %s
 # RUN: llvm-mc %s -triple=riscv64 -mattr=+zdinx,+zhinx \
 # RUN:     | FileCheck -check-prefixes=CHECK-ALIAS %s
diff --git a/llvm/test/MC/RISCV/hypervisor-csr-names.s b/llvm/test/MC/RISCV/hypervisor-csr-names.s
index 2f29e5d..a7e3a57 100644
--- a/llvm/test/MC/RISCV/hypervisor-csr-names.s
+++ b/llvm/test/MC/RISCV/hypervisor-csr-names.s
@@ -1,10 +1,10 @@
-# RUN: llvm-mc %s -triple=riscv32 -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv32 -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-INST,CHECK-ENC %s
 # RUN: llvm-mc -filetype=obj -triple riscv32 < %s \
 # RUN:     | llvm-objdump -d - \
 # RUN:     | FileCheck -check-prefix=CHECK-INST-ALIAS %s
 #
-# RUN: llvm-mc %s -triple=riscv64 -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv64 -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-INST,CHECK-ENC %s
 # RUN: llvm-mc -filetype=obj -triple riscv64 < %s \
 # RUN:     | llvm-objdump -d - \
diff --git a/llvm/test/MC/RISCV/insn.s b/llvm/test/MC/RISCV/insn.s
index 829364c6..b1ef251 100644
--- a/llvm/test/MC/RISCV/insn.s
+++ b/llvm/test/MC/RISCV/insn.s
@@ -1,6 +1,6 @@
-# RUN: llvm-mc %s -triple=riscv32 -mattr=+f -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv32 -mattr=+f -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-ASM %s
-# RUN: llvm-mc %s -triple riscv64 -mattr=+f -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple riscv64 -mattr=+f -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-ASM %s
 # RUN: llvm-mc -filetype=obj -triple=riscv32 -mattr=+f < %s \
 # RUN:     | llvm-objdump --mattr=+f -M no-aliases -d -r - \
diff --git a/llvm/test/MC/RISCV/insn_c.s b/llvm/test/MC/RISCV/insn_c.s
index c63e8ab..c52f9b6 100644
--- a/llvm/test/MC/RISCV/insn_c.s
+++ b/llvm/test/MC/RISCV/insn_c.s
@@ -1,6 +1,6 @@
-# RUN: llvm-mc %s -triple=riscv32 -mattr=+f,+c -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv32 -mattr=+f,+c -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefix=CHECK-ASM %s
-# RUN: llvm-mc %s -triple riscv64 -mattr=+f,+c -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple riscv64 -mattr=+f,+c -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefix=CHECK-ASM %s
 # RUN: llvm-mc -filetype=obj -triple=riscv32 -mattr=+f,+c < %s \
 # RUN:     | llvm-objdump --mattr=+f,+c -M no-aliases -d -r - \
diff --git a/llvm/test/MC/RISCV/machine-csr-names.s b/llvm/test/MC/RISCV/machine-csr-names.s
index 8cfdf7e..ba2a79f 100644
--- a/llvm/test/MC/RISCV/machine-csr-names.s
+++ b/llvm/test/MC/RISCV/machine-csr-names.s
@@ -1,10 +1,10 @@
-# RUN: llvm-mc %s -triple=riscv32 -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv32 -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-INST,CHECK-ENC %s
 # RUN: llvm-mc -filetype=obj -triple riscv32 < %s \
 # RUN:     | llvm-objdump -d - \
 # RUN:     | FileCheck -check-prefix=CHECK-INST-ALIAS %s
 #
-# RUN: llvm-mc %s -triple=riscv64 -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv64 -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-INST,CHECK-ENC %s
 # RUN: llvm-mc -filetype=obj -triple riscv64 < %s \
 # RUN:     | llvm-objdump -d - \
@@ -1419,6 +1419,34 @@ csrrs t1, tdata3, zero
 # uimm12
 csrrs t2, 0x7A3, zero
 
+# tinfo
+# name
+# CHECK-INST: csrrs t1, tinfo, zero
+# CHECK-ENC: encoding: [0x73,0x23,0x40,0x7a]
+# CHECK-INST-ALIAS: csrr t1, tinfo
+# uimm12
+# CHECK-INST: csrrs t2, tinfo, zero
+# CHECK-ENC: encoding: [0xf3,0x23,0x40,0x7a]
+# CHECK-INST-ALIAS: csrr t2, tinfo
+# name
+csrrs t1, tinfo, zero
+# uimm12
+csrrs t2, 0x7A4, zero
+
+# tcontrol
+# name
+# CHECK-INST: csrrs t1, tcontrol, zero
+# CHECK-ENC: encoding: [0x73,0x23,0x50,0x7a]
+# CHECK-INST-ALIAS: csrr t1, tcontrol
+# uimm12
+# CHECK-INST: csrrs t2, tcontrol, zero
+# CHECK-ENC: encoding: [0xf3,0x23,0x50,0x7a]
+# CHECK-INST-ALIAS: csrr t2, tcontrol
+# name
+csrrs t1, tcontrol, zero
+# uimm12
+csrrs t2, 0x7A5, zero
+
 # mcontext
 # name
 # CHECK-INST: csrrs t1, mcontext, zero
@@ -1433,6 +1461,20 @@ csrrs t1, mcontext, zero
 # uimm12
 csrrs t2, 0x7A8, zero
 
+# mscontext
+# name
+# CHECK-INST: csrrs t1, mscontext, zero
+# CHECK-ENC: encoding: [0x73,0x23,0xa0,0x7a]
+# CHECK-INST-ALIAS: csrr t1, mscontext
+# uimm12
+# CHECK-INST: csrrs t2, mscontext, zero
+# CHECK-ENC: encoding: [0xf3,0x23,0xa0,0x7a]
+# CHECK-INST-ALIAS: csrr t2, mscontext
+# name
+csrrs t1, mscontext, zero
+# uimm12
+csrrs t2, 0x7AA, zero
+
 #######################
 # Debug Mode Registers
 ########################
diff --git a/llvm/test/MC/RISCV/option-nopic.s b/llvm/test/MC/RISCV/option-nopic.s
index db0cf1d..20203f6 100644
--- a/llvm/test/MC/RISCV/option-nopic.s
+++ b/llvm/test/MC/RISCV/option-nopic.s
@@ -1,19 +1,19 @@
-# RUN: llvm-mc -triple riscv32 -mattr=-relax -riscv-no-aliases < %s \
+# RUN: llvm-mc -triple riscv32 -mattr=-relax -M no-aliases < %s \
 # RUN:     | FileCheck -check-prefix=CHECK-INST %s
 # RUN: llvm-mc -filetype=obj -triple riscv32 < %s \
 # RUN:     | llvm-readobj -r - | FileCheck -check-prefix=CHECK-RELOC %s
 
-# RUN: llvm-mc -triple riscv32 -mattr=-relax -riscv-no-aliases \
+# RUN: llvm-mc -triple riscv32 -mattr=-relax -M no-aliases \
 # RUN:     -position-independent < %s | FileCheck -check-prefix=CHECK-INST %s
 # RUN: llvm-mc -filetype=obj -triple riscv32 -position-independent < %s \
 # RUN:     | llvm-readobj -r - | FileCheck -check-prefix=CHECK-RELOC %s
 
-# RUN: llvm-mc -triple riscv64 -mattr=-relax -riscv-no-aliases < %s \
+# RUN: llvm-mc -triple riscv64 -mattr=-relax -M no-aliases < %s \
 # RUN:     | FileCheck -check-prefix=CHECK-INST %s
 # RUN: llvm-mc -filetype=obj -triple riscv64 < %s \
 # RUN:     | llvm-readobj -r - | FileCheck -check-prefix=CHECK-RELOC %s
 
-# RUN: llvm-mc -triple riscv64 -mattr=-relax -riscv-no-aliases \
+# RUN: llvm-mc -triple riscv64 -mattr=-relax -M no-aliases \
 # RUN:     -position-independent < %s | FileCheck -check-prefix=CHECK-INST %s
 # RUN: llvm-mc -filetype=obj -triple riscv64 -position-independent < %s \
 # RUN:     | llvm-readobj -r - | FileCheck -check-prefix=CHECK-RELOC %s
diff --git a/llvm/test/MC/RISCV/option-pic.s b/llvm/test/MC/RISCV/option-pic.s
index ef456e0..9c9381c 100644
--- a/llvm/test/MC/RISCV/option-pic.s
+++ b/llvm/test/MC/RISCV/option-pic.s
@@ -1,19 +1,19 @@
-# RUN: llvm-mc -triple riscv32 -mattr=-relax -riscv-no-aliases < %s \
+# RUN: llvm-mc -triple riscv32 -mattr=-relax -M no-aliases < %s \
 # RUN:     | FileCheck -check-prefix=CHECK-INST %s
 # RUN: llvm-mc -filetype=obj -triple riscv32 < %s \
 # RUN:     | llvm-readobj -r - | FileCheck -check-prefix=CHECK-RELOC %s
 
-# RUN: llvm-mc -triple riscv32 -mattr=-relax -riscv-no-aliases \
+# RUN: llvm-mc -triple riscv32 -mattr=-relax -M no-aliases \
 # RUN:     -position-independent < %s | FileCheck -check-prefix=CHECK-INST %s
 # RUN: llvm-mc -filetype=obj -triple riscv32 -position-independent < %s \
 # RUN:     | llvm-readobj -r - | FileCheck -check-prefix=CHECK-RELOC %s
 
-# RUN: llvm-mc -triple riscv64 -mattr=-relax -riscv-no-aliases < %s \
+# RUN: llvm-mc -triple riscv64 -mattr=-relax -M no-aliases < %s \
 # RUN:     | FileCheck -check-prefix=CHECK-INST %s
 # RUN: llvm-mc -filetype=obj -triple riscv64 < %s \
 # RUN:     | llvm-readobj -r - | FileCheck -check-prefix=CHECK-RELOC %s
 
-# RUN: llvm-mc -triple riscv64 -mattr=-relax -riscv-no-aliases \
+# RUN: llvm-mc -triple riscv64 -mattr=-relax -M no-aliases \
 # RUN:     -position-independent < %s | FileCheck -check-prefix=CHECK-INST %s
 # RUN: llvm-mc -filetype=obj -triple riscv64 -position-independent < %s \
 # RUN:     | llvm-readobj -r - | FileCheck -check-prefix=CHECK-RELOC %s
diff --git a/llvm/test/MC/RISCV/option-pushpop.s b/llvm/test/MC/RISCV/option-pushpop.s
index 9c61b5d..68d60be 100644
--- a/llvm/test/MC/RISCV/option-pushpop.s
+++ b/llvm/test/MC/RISCV/option-pushpop.s
@@ -1,4 +1,4 @@
-# RUN: llvm-mc -triple riscv32 -mattr=-relax -riscv-no-aliases < %s \
+# RUN: llvm-mc -triple riscv32 -mattr=-relax -M no-aliases < %s \
 # RUN:     | FileCheck -check-prefix=CHECK-INST %s
 # RUN: llvm-mc -filetype=obj -triple riscv32 < %s \
 # RUN:     | llvm-readobj -r - | FileCheck -check-prefix=CHECK-RELOC %s
@@ -6,7 +6,7 @@
 # RUN:     | llvm-objdump --no-print-imm-hex --triple=riscv32 --mattr=+c -d - \
 # RUN:     | FileCheck -check-prefixes=CHECK-BYTES,CHECK-ALIAS %s
 
-# RUN: llvm-mc -triple riscv64 -mattr=-relax -riscv-no-aliases < %s \
+# RUN: llvm-mc -triple riscv64 -mattr=-relax -M no-aliases < %s \
 # RUN:     | FileCheck -check-prefix=CHECK-INST %s
 # RUN: llvm-mc -filetype=obj -triple riscv64 < %s \
 # RUN:     | llvm-readobj -r - | FileCheck -check-prefix=CHECK-RELOC %s
diff --git a/llvm/test/MC/RISCV/option-rvc.s b/llvm/test/MC/RISCV/option-rvc.s
index 894fbab..32568b9 100644
--- a/llvm/test/MC/RISCV/option-rvc.s
+++ b/llvm/test/MC/RISCV/option-rvc.s
@@ -1,7 +1,7 @@
 # RUN: llvm-mc -triple riscv32 -show-encoding < %s \
 # RUN:   | FileCheck -check-prefixes=CHECK,CHECK-ALIAS %s
 # RUN: llvm-mc -triple riscv32 -show-encoding \
-# RUN:   -riscv-no-aliases < %s | FileCheck -check-prefixes=CHECK,CHECK-INST %s
+# RUN:   -M no-aliases < %s | FileCheck -check-prefixes=CHECK,CHECK-INST %s
 # RUN: llvm-mc -triple riscv32 -filetype=obj < %s \
 # RUN:   | llvm-objdump  --triple=riscv32 --mattr=+c --no-print-imm-hex -d - \
 # RUN:   | FileCheck -check-prefixes=CHECK-BYTES,CHECK-ALIAS %s
@@ -12,7 +12,7 @@
 # RUN: llvm-mc -triple riscv64 -show-encoding < %s \
 # RUN:   | FileCheck -check-prefixes=CHECK-ALIAS %s
 # RUN: llvm-mc -triple riscv64 -show-encoding \
-# RUN:   -riscv-no-aliases < %s | FileCheck -check-prefixes=CHECK-INST %s
+# RUN:   -M no-aliases < %s | FileCheck -check-prefixes=CHECK-INST %s
 # RUN: llvm-mc -triple riscv64 -filetype=obj < %s \
 # RUN:   | llvm-objdump  --triple=riscv64 --mattr=+c --no-print-imm-hex -d - \
 # RUN:   | FileCheck -check-prefixes=CHECK-BYTES,CHECK-ALIAS %s
diff --git a/llvm/test/MC/RISCV/print-imm-hex.s b/llvm/test/MC/RISCV/print-imm-hex.s
index 04d405a..10270de 100644
--- a/llvm/test/MC/RISCV/print-imm-hex.s
+++ b/llvm/test/MC/RISCV/print-imm-hex.s
@@ -1,6 +1,6 @@
-# RUN: llvm-mc %s -triple=riscv32 -riscv-no-aliases -show-encoding -mattr=+v \
+# RUN: llvm-mc %s -triple=riscv32 -M no-aliases -show-encoding -mattr=+v \
 # RUN:     | FileCheck -check-prefixes=CHECK-ASM %s
-# RUN: llvm-mc %s -triple=riscv32 -riscv-no-aliases -show-encoding -mattr=+v --print-imm-hex \
+# RUN: llvm-mc %s -triple=riscv32 -M no-aliases -show-encoding -mattr=+v --print-imm-hex \
 # RUN:     | FileCheck -check-prefixes=CHECK-ASM-HEX %s
 # RUN: llvm-mc -filetype=obj -triple=riscv32 -mattr=+v < %s \
 # RUN:     | llvm-objdump -M no-aliases --mattr=+v --no-print-imm-hex -d -r - \
diff --git a/llvm/test/MC/RISCV/priv-valid.s b/llvm/test/MC/RISCV/priv-valid.s
index 561c76b..67dc501 100644
--- a/llvm/test/MC/RISCV/priv-valid.s
+++ b/llvm/test/MC/RISCV/priv-valid.s
@@ -1,6 +1,6 @@
-# RUN: llvm-mc %s -triple=riscv32 -mattr=+svinval -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv32 -mattr=+svinval -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK,CHECK-INST %s
-# RUN: llvm-mc %s -triple=riscv64 -mattr=+svinval -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv64 -mattr=+svinval -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK,CHECK-INST %s
 # RUN: llvm-mc -filetype=obj -triple riscv32 -mattr=+svinval < %s \
 # RUN:     | llvm-objdump --mattr=+svinval -M no-aliases -d - \
diff --git a/llvm/test/MC/RISCV/relocations.s b/llvm/test/MC/RISCV/relocations.s
index f5f6417..85a25fe 100644
--- a/llvm/test/MC/RISCV/relocations.s
+++ b/llvm/test/MC/RISCV/relocations.s
@@ -1,4 +1,4 @@
-# RUN: llvm-mc -triple riscv32 -riscv-no-aliases < %s -show-encoding \
+# RUN: llvm-mc -triple riscv32 -M no-aliases < %s -show-encoding \
 # RUN:     | FileCheck -check-prefix=INSTR -check-prefix=FIXUP %s
 # RUN: llvm-mc -filetype=obj -triple riscv32 -mattr=+c < %s \
 # RUN:     | llvm-readobj -r - | FileCheck -check-prefix=RELOC %s
diff --git a/llvm/test/MC/RISCV/rv32-hypervisor-csr-names.s b/llvm/test/MC/RISCV/rv32-hypervisor-csr-names.s
index 83e6d21..aadee4f 100644
--- a/llvm/test/MC/RISCV/rv32-hypervisor-csr-names.s
+++ b/llvm/test/MC/RISCV/rv32-hypervisor-csr-names.s
@@ -1,4 +1,4 @@
-# RUN: llvm-mc %s -triple=riscv32 -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv32 -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-INST,CHECK-ENC %s
 # RUN: llvm-mc -filetype=obj -triple riscv32 < %s \
 # RUN:     | llvm-objdump -d - \
diff --git a/llvm/test/MC/RISCV/rv32-machine-csr-names.s b/llvm/test/MC/RISCV/rv32-machine-csr-names.s
index e7a6d9c..3d527e3 100644
--- a/llvm/test/MC/RISCV/rv32-machine-csr-names.s
+++ b/llvm/test/MC/RISCV/rv32-machine-csr-names.s
@@ -1,4 +1,4 @@
-# RUN: llvm-mc %s -triple=riscv32 -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv32 -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-INST,CHECK-ENC %s
 # RUN: llvm-mc -filetype=obj -triple riscv32 < %s \
 # RUN:     | llvm-objdump -d - \
diff --git a/llvm/test/MC/RISCV/rv32-supervisor-csr-names.s b/llvm/test/MC/RISCV/rv32-supervisor-csr-names.s
index 4c1fef4..ca7887a 100644
--- a/llvm/test/MC/RISCV/rv32-supervisor-csr-names.s
+++ b/llvm/test/MC/RISCV/rv32-supervisor-csr-names.s
@@ -1,4 +1,4 @@
-# RUN: llvm-mc %s -triple=riscv32 -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv32 -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-INST,CHECK-ENC %s
 # RUN: llvm-mc -filetype=obj -triple riscv32 < %s \
 # RUN:     | llvm-objdump -d - \
diff --git a/llvm/test/MC/RISCV/rv32-user-csr-names.s b/llvm/test/MC/RISCV/rv32-user-csr-names.s
index acd6646..6fb9861 100644
--- a/llvm/test/MC/RISCV/rv32-user-csr-names.s
+++ b/llvm/test/MC/RISCV/rv32-user-csr-names.s
@@ -1,4 +1,4 @@
-# RUN: llvm-mc %s -triple=riscv32 -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv32 -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-INST,CHECK-ENC %s
 # RUN: llvm-mc -filetype=obj -triple riscv32 < %s \
 # RUN:     | llvm-objdump -d - \
diff --git a/llvm/test/MC/RISCV/rv32c-aliases-valid.s b/llvm/test/MC/RISCV/rv32c-aliases-valid.s
index f159adb..de9d0c6 100644
--- a/llvm/test/MC/RISCV/rv32c-aliases-valid.s
+++ b/llvm/test/MC/RISCV/rv32c-aliases-valid.s
@@ -1,9 +1,9 @@
-# RUN: llvm-mc -triple=riscv32 -mattr=+c -riscv-no-aliases < %s \
+# RUN: llvm-mc -triple=riscv32 -mattr=+c -M no-aliases < %s \
 # RUN:     | FileCheck -check-prefixes=CHECK-EXPAND,CHECK-INST %s
 # RUN: llvm-mc -filetype=obj -triple riscv32 -mattr=+c < %s \
 # RUN:     | llvm-objdump --no-print-imm-hex -d -M no-aliases - \
 # RUN:     | FileCheck -check-prefixes=CHECK-EXPAND,CHECK-INST %s
-# RUN: llvm-mc -triple=riscv32 -mattr=+zca -riscv-no-aliases < %s \
+# RUN: llvm-mc -triple=riscv32 -mattr=+zca -M no-aliases < %s \
 # RUN:     | FileCheck -check-prefixes=CHECK-EXPAND,CHECK-INST %s
 # RUN: llvm-mc -filetype=obj -triple riscv32 -mattr=+zca < %s \
 # RUN:     | llvm-objdump --no-print-imm-hex --mattr=+zca -d -M no-aliases - \
diff --git a/llvm/test/MC/RISCV/rv32c-only-valid.s b/llvm/test/MC/RISCV/rv32c-only-valid.s
index 3321aff..c4fec69 100644
--- a/llvm/test/MC/RISCV/rv32c-only-valid.s
+++ b/llvm/test/MC/RISCV/rv32c-only-valid.s
@@ -1,17 +1,17 @@
-# RUN: llvm-mc %s -triple=riscv32 -mattr=+c -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv32 -mattr=+c -M no-aliases -show-encoding \
 # RUN:     | FileCheck --check-prefix=CHECK-ASM %s
 # RUN: llvm-mc -filetype=obj -triple=riscv32 -mattr=+c < %s \
 # RUN:     | llvm-objdump --mattr=+c --no-print-imm-hex -M no-aliases -d -r - \
 # RUN:     | FileCheck --check-prefix=CHECK-OBJ %s
 #
 # RUN: not llvm-mc -triple riscv32 \
-# RUN:     -riscv-no-aliases -show-encoding < %s 2>&1 \
+# RUN:     -M no-aliases -show-encoding < %s 2>&1 \
 # RUN:     | FileCheck --check-prefix=CHECK-NO-EXT %s
 # RUN: not llvm-mc -triple riscv64 -mattr=+c \
-# RUN:     -riscv-no-aliases -show-encoding < %s 2>&1 \
+# RUN:     -M no-aliases -show-encoding < %s 2>&1 \
 # RUN:     | FileCheck --check-prefix=CHECK-NO-RV32 %s
 # RUN: not llvm-mc -triple riscv64 \
-# RUN:     -riscv-no-aliases -show-encoding < %s 2>&1 \
+# RUN:     -M no-aliases -show-encoding < %s 2>&1 \
 # RUN:     | FileCheck --check-prefix=CHECK-NO-RV32-AND-EXT %s
 
 # CHECK-OBJ: c.jal 0x7fe
diff --git a/llvm/test/MC/RISCV/rv32dc-valid.s b/llvm/test/MC/RISCV/rv32dc-valid.s
index 201aee5..495c884 100644
--- a/llvm/test/MC/RISCV/rv32dc-valid.s
+++ b/llvm/test/MC/RISCV/rv32dc-valid.s
@@ -1,18 +1,18 @@
-# RUN: llvm-mc %s -triple=riscv32 -mattr=+c,+d -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv32 -mattr=+c,+d -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s
 # RUN: llvm-mc -filetype=obj -triple=riscv32 -mattr=+c,+d < %s \
 # RUN:     | llvm-objdump --no-print-imm-hex --mattr=+c,+d -M no-aliases -d -r - \
 # RUN:     | FileCheck --check-prefix=CHECK-ASM-AND-OBJ %s
-# RUN: llvm-mc %s -triple=riscv32 -mattr=+zcd,+d -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv32 -mattr=+zcd,+d -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s
 # RUN: llvm-mc -filetype=obj -triple=riscv32 -mattr=+zcd,+d < %s \
 # RUN:     | llvm-objdump --no-print-imm-hex --mattr=+zcd,+d -M no-aliases -d -r - \
 # RUN:     | FileCheck --check-prefix=CHECK-ASM-AND-OBJ %s
 #
 # RUN: not llvm-mc -triple riscv32 -mattr=+c \
-# RUN:     -riscv-no-aliases -show-encoding < %s 2>&1 \
+# RUN:     -M no-aliases -show-encoding < %s 2>&1 \
 # RUN:     | FileCheck -check-prefixes=CHECK-NO-EXT-D %s
-# RUN: not llvm-mc -triple riscv32 -riscv-no-aliases -show-encoding < %s 2>&1 \
+# RUN: not llvm-mc -triple riscv32 -M no-aliases -show-encoding < %s 2>&1 \
 # RUN:     | FileCheck -check-prefixes=CHECK-NO-EXT-DC %s
 
 # CHECK-ASM-AND-OBJ: c.fldsp  fs0, 504(sp)
diff --git a/llvm/test/MC/RISCV/rv32fc-aliases-valid.s b/llvm/test/MC/RISCV/rv32fc-aliases-valid.s
index d992d07..f1a1c73 100644
--- a/llvm/test/MC/RISCV/rv32fc-aliases-valid.s
+++ b/llvm/test/MC/RISCV/rv32fc-aliases-valid.s
@@ -1,4 +1,4 @@
-# RUN: llvm-mc %s -triple=riscv32 -mattr=+c,+f -riscv-no-aliases \
+# RUN: llvm-mc %s -triple=riscv32 -mattr=+c,+f -M no-aliases \
 # RUN:     | FileCheck -check-prefixes=CHECK-EXPAND %s
 # RUN: llvm-mc -filetype=obj -triple riscv32 -mattr=+c,+f < %s \
 # RUN:     | llvm-objdump --mattr=+c,+f --no-print-imm-hex -M no-aliases -d - \
diff --git a/llvm/test/MC/RISCV/rv32fc-valid.s b/llvm/test/MC/RISCV/rv32fc-valid.s
index 9360325..af38a63 100644
--- a/llvm/test/MC/RISCV/rv32fc-valid.s
+++ b/llvm/test/MC/RISCV/rv32fc-valid.s
@@ -1,22 +1,22 @@
-# RUN: llvm-mc %s -triple=riscv32 -mattr=+c,+f -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv32 -mattr=+c,+f -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s
 # RUN: llvm-mc -filetype=obj -triple=riscv32 -mattr=+c,+f < %s \
 # RUN:     | llvm-objdump --mattr=+c,+f --no-print-imm-hex -M no-aliases -d -r - \
 # RUN:     | FileCheck --check-prefix=CHECK-ASM-AND-OBJ %s
-# RUN: llvm-mc %s -triple=riscv32 -mattr=+zcf,+f -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv32 -mattr=+zcf,+f -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s
 # RUN: llvm-mc -filetype=obj -triple=riscv32 -mattr=+zcf,+f < %s \
 # RUN:     | llvm-objdump --mattr=+zcf,+f --no-print-imm-hex -M no-aliases -d -r - \
 # RUN:     | FileCheck --check-prefix=CHECK-ASM-AND-OBJ %s
 #
 # RUN: not llvm-mc -triple riscv32 -mattr=+c \
-# RUN:     -riscv-no-aliases -show-encoding < %s 2>&1 \
+# RUN:     -M no-aliases -show-encoding < %s 2>&1 \
 # RUN:     | FileCheck -check-prefixes=CHECK-NO-EXT-F %s
 # RUN: not llvm-mc -triple riscv32 \
-# RUN:     -riscv-no-aliases -show-encoding < %s 2>&1 \
+# RUN:     -M no-aliases -show-encoding < %s 2>&1 \
 # RUN:     | FileCheck -check-prefixes=CHECK-NO-EXT-FC %s
 # RUN: not llvm-mc -triple riscv64 -mattr=+c,+f \
-# RUN:     -riscv-no-aliases -show-encoding < %s 2>&1 \
+# RUN:     -M no-aliases -show-encoding < %s 2>&1 \
 # RUN:     | FileCheck -check-prefixes=CHECK-NO-RV32 %s
 
 # FIXME: error messages for rv64fc are misleading
diff --git a/llvm/test/MC/RISCV/rv32i-aliases-invalid.s b/llvm/test/MC/RISCV/rv32i-aliases-invalid.s
index 9254452..7f54fe7 100644
--- a/llvm/test/MC/RISCV/rv32i-aliases-invalid.s
+++ b/llvm/test/MC/RISCV/rv32i-aliases-invalid.s
@@ -1,5 +1,5 @@
 # UNSUPPORTED: target={{.*-windows.*}}
-# RUN: not llvm-mc -triple=riscv32 -riscv-no-aliases < %s -o /dev/null 2>&1 | FileCheck %s
+# RUN: not llvm-mc -triple=riscv32 -M no-aliases < %s -o /dev/null 2>&1 | FileCheck %s
 # RUN: not llvm-mc -triple=riscv32 < %s -o /dev/null 2>&1 | FileCheck %s
 
 # TODO ld
diff --git a/llvm/test/MC/RISCV/rv32i-aliases-valid.s b/llvm/test/MC/RISCV/rv32i-aliases-valid.s
index 93d8cb2..20deda4 100644
--- a/llvm/test/MC/RISCV/rv32i-aliases-valid.s
+++ b/llvm/test/MC/RISCV/rv32i-aliases-valid.s
@@ -1,4 +1,4 @@
-# RUN: llvm-mc %s -triple=riscv32 -riscv-no-aliases \
+# RUN: llvm-mc %s -triple=riscv32 -M no-aliases \
 # RUN:     | FileCheck -check-prefixes=CHECK-EXPAND,CHECK-INST,CHECK-ASM-NOALIAS %s
 # RUN: llvm-mc %s -triple=riscv32 \
 # RUN:     | FileCheck -check-prefixes=CHECK-EXPAND,CHECK-ALIAS,CHECK-ASM %s
diff --git a/llvm/test/MC/RISCV/rv32i-only-valid.s b/llvm/test/MC/RISCV/rv32i-only-valid.s
index 74232e3..afe62ce 100644
--- a/llvm/test/MC/RISCV/rv32i-only-valid.s
+++ b/llvm/test/MC/RISCV/rv32i-only-valid.s
@@ -1,4 +1,4 @@
-# RUN: llvm-mc %s -triple=riscv32 -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv32 -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s
 # RUN: llvm-mc -filetype=obj -triple=riscv32 < %s \
 # RUN:     | llvm-objdump -M no-aliases --no-print-imm-hex -d -r - \
diff --git a/llvm/test/MC/RISCV/rv32zbb-aliases-valid.s b/llvm/test/MC/RISCV/rv32zbb-aliases-valid.s
index 26a725a..994b46e 100644
--- a/llvm/test/MC/RISCV/rv32zbb-aliases-valid.s
+++ b/llvm/test/MC/RISCV/rv32zbb-aliases-valid.s
@@ -1,4 +1,4 @@
-# RUN: llvm-mc %s -triple=riscv32 -mattr=+zbb -riscv-no-aliases \
+# RUN: llvm-mc %s -triple=riscv32 -mattr=+zbb -M no-aliases \
 # RUN:     | FileCheck -check-prefixes=CHECK-S-OBJ-NOALIAS %s
 # RUN: llvm-mc %s -triple=riscv32 -mattr=+zbb \
 # RUN:     | FileCheck -check-prefixes=CHECK-S-OBJ %s
diff --git a/llvm/test/MC/RISCV/rv32zbb-only-valid.s b/llvm/test/MC/RISCV/rv32zbb-only-valid.s
index 8cee959..0e18cf2 100644
--- a/llvm/test/MC/RISCV/rv32zbb-only-valid.s
+++ b/llvm/test/MC/RISCV/rv32zbb-only-valid.s
@@ -1,5 +1,5 @@
 # With Bitmanip base extension:
-# RUN: llvm-mc %s -triple=riscv32 -mattr=+zbb -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv32 -mattr=+zbb -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s
 # RUN: llvm-mc -filetype=obj -triple=riscv32 -mattr=+zbb < %s \
 # RUN:     | llvm-objdump --mattr=+zbb --no-print-imm-hex -M no-aliases -d -r - \
diff --git a/llvm/test/MC/RISCV/rv32zbs-aliases-valid.s b/llvm/test/MC/RISCV/rv32zbs-aliases-valid.s
index 3ebb297..60599c5 100644
--- a/llvm/test/MC/RISCV/rv32zbs-aliases-valid.s
+++ b/llvm/test/MC/RISCV/rv32zbs-aliases-valid.s
@@ -1,4 +1,4 @@
-# RUN: llvm-mc %s -triple=riscv32 -mattr=+zbs -riscv-no-aliases \
+# RUN: llvm-mc %s -triple=riscv32 -mattr=+zbs -M no-aliases \
 # RUN:     | FileCheck -check-prefixes=CHECK-S-OBJ-NOALIAS %s
 # RUN: llvm-mc %s -triple=riscv32 -mattr=+zbs \
 # RUN:     | FileCheck -check-prefixes=CHECK-S-OBJ %s
diff --git a/llvm/test/MC/RISCV/rv32zcmp-invalid.s b/llvm/test/MC/RISCV/rv32zcmp-invalid.s
index 2ed82bc..0720a74 100644
--- a/llvm/test/MC/RISCV/rv32zcmp-invalid.s
+++ b/llvm/test/MC/RISCV/rv32zcmp-invalid.s
@@ -1,4 +1,4 @@
-# RUN: not llvm-mc -triple=riscv32 -mattr=zcmp -riscv-no-aliases -show-encoding < %s 2>&1 \
+# RUN: not llvm-mc -triple=riscv32 -mattr=zcmp -M no-aliases -show-encoding < %s 2>&1 \
 # RUN:     | FileCheck -check-prefixes=CHECK-ERROR %s
 
 # CHECK-ERROR: error: invalid operand for instruction
diff --git a/llvm/test/MC/RISCV/rv32zcmp-valid.s b/llvm/test/MC/RISCV/rv32zcmp-valid.s
index 31e287b..d144c6f 100644
--- a/llvm/test/MC/RISCV/rv32zcmp-valid.s
+++ b/llvm/test/MC/RISCV/rv32zcmp-valid.s
@@ -1,4 +1,4 @@
-# RUN: llvm-mc %s -triple=riscv32 -mattr=zcmp -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv32 -mattr=zcmp -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s
 # RUN: llvm-mc -filetype=obj -triple=riscv32 -mattr=zcmp < %s \
 # RUN:     | llvm-objdump --mattr=-c,zcmp -M no-aliases -d -r - \
diff --git a/llvm/test/MC/RISCV/rv32zfa-only-valid.s b/llvm/test/MC/RISCV/rv32zfa-only-valid.s
index d212659..a780a9f 100644
--- a/llvm/test/MC/RISCV/rv32zfa-only-valid.s
+++ b/llvm/test/MC/RISCV/rv32zfa-only-valid.s
@@ -1,11 +1,11 @@
-# RUN: llvm-mc %s -triple=riscv32 -mattr=+zfa,+d,+zfh -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv32 -mattr=+zfa,+d,+zfh -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s
 # RUN: llvm-mc -filetype=obj -triple=riscv32 -mattr=+zfa,+d,+zfh < %s \
 # RUN:     | llvm-objdump --mattr=+zfa,+d,+zfh -M no-aliases -d -r - \
 # RUN:     | FileCheck --check-prefix=CHECK-ASM-AND-OBJ %s
 #
 # RUN: not llvm-mc -triple riscv32 -mattr=+d,+zfh \
-# RUN:     -riscv-no-aliases -show-encoding < %s 2>&1 \
+# RUN:     -M no-aliases -show-encoding < %s 2>&1 \
 # RUN:     | FileCheck -check-prefixes=CHECK-NO-EXT %s
 
 # CHECK-ASM-AND-OBJ: fmvh.x.d a1, fs1
diff --git a/llvm/test/MC/RISCV/rv32zicfiss-invalid.s b/llvm/test/MC/RISCV/rv32zicfiss-invalid.s
index 1cedcb9..048df67 100644
--- a/llvm/test/MC/RISCV/rv32zicfiss-invalid.s
+++ b/llvm/test/MC/RISCV/rv32zicfiss-invalid.s
@@ -1,4 +1,4 @@
-# RUN: not llvm-mc %s -triple=riscv32 -mattr=+experimental-zicfiss,+c -riscv-no-aliases -show-encoding \
+# RUN: not llvm-mc %s -triple=riscv32 -mattr=+experimental-zicfiss,+c -M no-aliases -show-encoding \
 # RUN:     2>&1 | FileCheck -check-prefixes=CHECK-ERR %s
 
 # CHECK-ERR: error: invalid operand for instruction
diff --git a/llvm/test/MC/RISCV/rv32zmmul-invaild.s b/llvm/test/MC/RISCV/rv32zmmul-invaild.s
index cf4ced8..b5f1ee6 100644
--- a/llvm/test/MC/RISCV/rv32zmmul-invaild.s
+++ b/llvm/test/MC/RISCV/rv32zmmul-invaild.s
@@ -1,4 +1,4 @@
-# RUN: not llvm-mc %s -triple=riscv32 -mattr=+zmmul -riscv-no-aliases 2>&1 \
+# RUN: not llvm-mc %s -triple=riscv32 -mattr=+zmmul -M no-aliases 2>&1 \
 # RUN:  | FileCheck -check-prefixes=CHECK-ERROR %s
 
 # CHECK-ERROR: 5:1: error: instruction requires the following: 'M' (Integer Multiplication and Division){{$}}
diff --git a/llvm/test/MC/RISCV/rv32zmmul-valid.s b/llvm/test/MC/RISCV/rv32zmmul-valid.s
index 929dc52..b226685 100644
--- a/llvm/test/MC/RISCV/rv32zmmul-valid.s
+++ b/llvm/test/MC/RISCV/rv32zmmul-valid.s
@@ -1,4 +1,4 @@
-# RUN: llvm-mc %s -triple=riscv32 -mattr=+zmmul -riscv-no-aliases 2>&1 \
+# RUN: llvm-mc %s -triple=riscv32 -mattr=+zmmul -M no-aliases 2>&1 \
 # RUN:  | FileCheck -check-prefixes=CHECK-INST %s
 
 # CHECK-INST: mul a4, ra, s0
diff --git a/llvm/test/MC/RISCV/rv64-machine-csr-names.s b/llvm/test/MC/RISCV/rv64-machine-csr-names.s
index 3efebf9..b49eb17 100644
--- a/llvm/test/MC/RISCV/rv64-machine-csr-names.s
+++ b/llvm/test/MC/RISCV/rv64-machine-csr-names.s
@@ -1,4 +1,4 @@
-# RUN: llvm-mc %s -triple=riscv64 -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv64 -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-INST,CHECK-ENC %s
 # RUN: llvm-mc -filetype=obj -triple riscv64 < %s \
 # RUN:     | llvm-objdump --no-print-imm-hex -d - \
diff --git a/llvm/test/MC/RISCV/rv64-user-csr-names.s b/llvm/test/MC/RISCV/rv64-user-csr-names.s
index fc23b69..afb7235 100644
--- a/llvm/test/MC/RISCV/rv64-user-csr-names.s
+++ b/llvm/test/MC/RISCV/rv64-user-csr-names.s
@@ -1,4 +1,4 @@
-# RUN: llvm-mc %s -triple=riscv64 -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv64 -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-INST,CHECK-ENC %s
 # RUN: llvm-mc -filetype=obj -triple riscv64 < %s \
 # RUN:     | llvm-objdump -d - \
diff --git a/llvm/test/MC/RISCV/rv64a-aliases-valid.s b/llvm/test/MC/RISCV/rv64a-aliases-valid.s
index 09999536..577010f 100644
--- a/llvm/test/MC/RISCV/rv64a-aliases-valid.s
+++ b/llvm/test/MC/RISCV/rv64a-aliases-valid.s
@@ -1,4 +1,4 @@
-# RUN: llvm-mc %s -triple=riscv64 -mattr=+a -riscv-no-aliases \
+# RUN: llvm-mc %s -triple=riscv64 -mattr=+a -M no-aliases \
 # RUN:     | FileCheck -check-prefix=CHECK-INST %s
 # RUN: llvm-mc %s -triple=riscv64 -mattr=+a \
 # RUN:     | FileCheck -check-prefix=CHECK-ALIAS %s
diff --git a/llvm/test/MC/RISCV/rv64c-aliases-valid.s b/llvm/test/MC/RISCV/rv64c-aliases-valid.s
index ccf9e6a..6856697 100644
--- a/llvm/test/MC/RISCV/rv64c-aliases-valid.s
+++ b/llvm/test/MC/RISCV/rv64c-aliases-valid.s
@@ -1,9 +1,9 @@
-# RUN: llvm-mc -triple=riscv64 -mattr=+c -riscv-no-aliases < %s \
+# RUN: llvm-mc -triple=riscv64 -mattr=+c -M no-aliases < %s \
 # RUN:     | FileCheck -check-prefixes=CHECK-EXPAND,CHECK-INST %s
 # RUN: llvm-mc -filetype=obj -triple riscv64 -mattr=+c < %s \
 # RUN:     | llvm-objdump --no-print-imm-hex -d -M no-aliases - \
 # RUN:     | FileCheck -check-prefixes=CHECK-EXPAND,CHECK-INST %s
-# RUN: llvm-mc -triple=riscv64 -mattr=+zca -riscv-no-aliases < %s \
+# RUN: llvm-mc -triple=riscv64 -mattr=+zca -M no-aliases < %s \
 # RUN:     | FileCheck -check-prefixes=CHECK-EXPAND,CHECK-INST %s
 # RUN: llvm-mc -filetype=obj -triple riscv64 -mattr=+zca < %s \
 # RUN:     | llvm-objdump --no-print-imm-hex --mattr=+zca -d -M no-aliases - \
diff --git a/llvm/test/MC/RISCV/rv64c-hints-valid.s b/llvm/test/MC/RISCV/rv64c-hints-valid.s
index 92cbe542..95d0932 100644
--- a/llvm/test/MC/RISCV/rv64c-hints-valid.s
+++ b/llvm/test/MC/RISCV/rv64c-hints-valid.s
@@ -1,9 +1,9 @@
-# RUN: llvm-mc %s -triple riscv64 -mattr=+c -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple riscv64 -mattr=+c -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s
 # RUN: llvm-mc -filetype=obj -triple=riscv64 -mattr=+c < %s \
 # RUN:     | llvm-objdump --no-print-imm-hex -M no-aliases -d -r - \
 # RUN:     | FileCheck --check-prefix=CHECK-ASM-AND-OBJ %s
-# RUN: llvm-mc %s -triple riscv64 -mattr=+zca -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple riscv64 -mattr=+zca -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s
 # RUN: llvm-mc -filetype=obj -triple=riscv64 -mattr=+zca < %s \
 # RUN:     | llvm-objdump --no-print-imm-hex --mattr=+zca -M no-aliases -d -r - \
diff --git a/llvm/test/MC/RISCV/rv64c-valid.s b/llvm/test/MC/RISCV/rv64c-valid.s
index a9f0cf5..f8736e5 100644
--- a/llvm/test/MC/RISCV/rv64c-valid.s
+++ b/llvm/test/MC/RISCV/rv64c-valid.s
@@ -1,9 +1,9 @@
-# RUN: llvm-mc %s -triple=riscv64 -mattr=+c -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv64 -mattr=+c -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s
 # RUN: llvm-mc -filetype=obj -triple=riscv64 -mattr=+c < %s \
 # RUN:     | llvm-objdump --mattr=+c --no-print-imm-hex -M no-aliases -d -r - \
 # RUN:     | FileCheck --check-prefix=CHECK-ASM-AND-OBJ %s
-# RUN: llvm-mc %s -triple=riscv64 -mattr=+zca -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv64 -mattr=+zca -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s
 # RUN: llvm-mc -filetype=obj -triple=riscv64 -mattr=+c < %s \
 # RUN:     | llvm-objdump --mattr=+c --no-print-imm-hex -M no-aliases -d -r - \
@@ -11,10 +11,10 @@
 #
 #
 # RUN: not llvm-mc -triple riscv64 \
-# RUN:     -riscv-no-aliases -show-encoding < %s 2>&1 \
+# RUN:     -M no-aliases -show-encoding < %s 2>&1 \
 # RUN:     | FileCheck -check-prefixes=CHECK-NO-EXT %s
 # RUN: not llvm-mc -triple riscv32 -mattr=+c \
-# RUN:     -riscv-no-aliases -show-encoding < %s 2>&1 \
+# RUN:     -M no-aliases -show-encoding < %s 2>&1 \
 # RUN:     | FileCheck -check-prefixes=CHECK-NO-RV64 %s
 
 # TODO: more exhaustive testing of immediate encoding.
diff --git a/llvm/test/MC/RISCV/rv64d-aliases-valid.s b/llvm/test/MC/RISCV/rv64d-aliases-valid.s
index 17a44b4..c23aa84 100644
--- a/llvm/test/MC/RISCV/rv64d-aliases-valid.s
+++ b/llvm/test/MC/RISCV/rv64d-aliases-valid.s
@@ -1,4 +1,4 @@
-# RUN: llvm-mc %s -triple=riscv64 -mattr=+d -riscv-no-aliases \
+# RUN: llvm-mc %s -triple=riscv64 -mattr=+d -M no-aliases \
 # RUN:     | FileCheck -check-prefix=CHECK-INST %s
 # RUN: llvm-mc %s -triple=riscv64 -mattr=+d \
 # RUN:     | FileCheck -check-prefix=CHECK-ALIAS %s
diff --git a/llvm/test/MC/RISCV/rv64d-valid.s b/llvm/test/MC/RISCV/rv64d-valid.s
index e6cc8ec..8a9f608 100644
--- a/llvm/test/MC/RISCV/rv64d-valid.s
+++ b/llvm/test/MC/RISCV/rv64d-valid.s
@@ -1,4 +1,4 @@
-# RUN: llvm-mc %s -triple=riscv64 -mattr=+d -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv64 -mattr=+d -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s
 # RUN: llvm-mc -filetype=obj -triple=riscv64 -mattr=+d < %s \
 # RUN:     | llvm-objdump --mattr=+d -M no-aliases -d -r - \
diff --git a/llvm/test/MC/RISCV/rv64dc-valid.s b/llvm/test/MC/RISCV/rv64dc-valid.s
index 83225b2..7f2b8c4 100644
--- a/llvm/test/MC/RISCV/rv64dc-valid.s
+++ b/llvm/test/MC/RISCV/rv64dc-valid.s
@@ -1,18 +1,18 @@
-# RUN: llvm-mc %s -triple=riscv64 -mattr=+c,+d -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv64 -mattr=+c,+d -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s
 # RUN: llvm-mc -filetype=obj -triple=riscv64 -mattr=+c,+d < %s \
 # RUN:     | llvm-objdump --no-print-imm-hex --mattr=+c,+d -M no-aliases -d -r - \
 # RUN:     | FileCheck --check-prefix=CHECK-ASM-AND-OBJ %s
-# RUN: llvm-mc %s -triple=riscv64 -mattr=+zcd,+d -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv64 -mattr=+zcd,+d -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s
 # RUN: llvm-mc -filetype=obj -triple=riscv64 -mattr=+zcd,+d < %s \
 # RUN:     | llvm-objdump --no-print-imm-hex --mattr=+zcd,+d -M no-aliases -d -r - \
 # RUN:     | FileCheck --check-prefix=CHECK-ASM-AND-OBJ %s
 #
 # RUN: not llvm-mc -triple riscv64 -mattr=+c \
-# RUN:     -riscv-no-aliases -show-encoding < %s 2>&1 \
+# RUN:     -M no-aliases -show-encoding < %s 2>&1 \
 # RUN:     | FileCheck -check-prefixes=CHECK-NO-EXT-D %s
-# RUN: not llvm-mc -triple riscv64 -riscv-no-aliases -show-encoding < %s 2>&1 \
+# RUN: not llvm-mc -triple riscv64 -M no-aliases -show-encoding < %s 2>&1 \
 # RUN:     | FileCheck -check-prefixes=CHECK-NO-EXT-DC %s
 
 # CHECK-ASM-AND-OBJ: c.fldsp  fs0, 504(sp)
diff --git a/llvm/test/MC/RISCV/rv64e-valid.s b/llvm/test/MC/RISCV/rv64e-valid.s
index f7b66fd..42f9b2c 100644
--- a/llvm/test/MC/RISCV/rv64e-valid.s
+++ b/llvm/test/MC/RISCV/rv64e-valid.s
@@ -1,4 +1,4 @@
-# RUN: llvm-mc %s -triple=riscv64 -riscv-no-aliases -mattr=+e -show-encoding \
+# RUN: llvm-mc %s -triple=riscv64 -M no-aliases -mattr=+e -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-ASM-AND-OBJ %s
 # RUN: llvm-mc -filetype=obj -triple=riscv64 -mattr=+e < %s \
 # RUN:     | llvm-objdump --no-print-imm-hex -M no-aliases -d -r - \
diff --git a/llvm/test/MC/RISCV/rv64e-zcmp-valid.s b/llvm/test/MC/RISCV/rv64e-zcmp-valid.s
index 376edf0..607a023 100644
--- a/llvm/test/MC/RISCV/rv64e-zcmp-valid.s
+++ b/llvm/test/MC/RISCV/rv64e-zcmp-valid.s
@@ -1,4 +1,4 @@
-# RUN: llvm-mc %s -triple=riscv64 -mattr=zcmp,+e -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv64 -mattr=zcmp,+e -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s
 # RUN: llvm-mc -filetype=obj -triple=riscv64 -mattr=zcmp < %s \
 # RUN:     | llvm-objdump --mattr=-c,zcmp -M no-aliases -d -r - \
diff --git a/llvm/test/MC/RISCV/rv64f-aliases-valid.s b/llvm/test/MC/RISCV/rv64f-aliases-valid.s
index 488d52f..d04b0b5 100644
--- a/llvm/test/MC/RISCV/rv64f-aliases-valid.s
+++ b/llvm/test/MC/RISCV/rv64f-aliases-valid.s
@@ -1,4 +1,4 @@
-# RUN: llvm-mc %s -triple=riscv64 -mattr=+f -riscv-no-aliases \
+# RUN: llvm-mc %s -triple=riscv64 -mattr=+f -M no-aliases \
 # RUN:     | FileCheck -check-prefix=CHECK-INST %s
 # RUN: llvm-mc %s -triple=riscv64 -mattr=+f \
 # RUN:     | FileCheck -check-prefix=CHECK-ALIAS %s
diff --git a/llvm/test/MC/RISCV/rv64f-valid.s b/llvm/test/MC/RISCV/rv64f-valid.s
index 108e1eb..1fea787 100644
--- a/llvm/test/MC/RISCV/rv64f-valid.s
+++ b/llvm/test/MC/RISCV/rv64f-valid.s
@@ -1,4 +1,4 @@
-# RUN: llvm-mc %s -triple=riscv64 -mattr=+f -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv64 -mattr=+f -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s
 # RUN: llvm-mc -filetype=obj -triple=riscv64 -mattr=+f < %s \
 # RUN:     | llvm-objdump --mattr=+f -M no-aliases -d -r - \
diff --git a/llvm/test/MC/RISCV/rv64i-aliases-invalid.s b/llvm/test/MC/RISCV/rv64i-aliases-invalid.s
index 34552b5..1bd4e78 100644
--- a/llvm/test/MC/RISCV/rv64i-aliases-invalid.s
+++ b/llvm/test/MC/RISCV/rv64i-aliases-invalid.s
@@ -1,5 +1,5 @@
 # UNSUPPORTED: target={{.*-windows.*}}
-# RUN: not llvm-mc -triple=riscv64 -riscv-no-aliases < %s -o /dev/null 2>&1 | FileCheck %s
+# RUN: not llvm-mc -triple=riscv64 -M no-aliases < %s -o /dev/null 2>&1 | FileCheck %s
 # RUN: not llvm-mc -triple=riscv64 < %s 2>&1 -o /dev/null | FileCheck %s
 
 li t5, 0x10000000000000000 # CHECK: :[[@LINE]]:8: error: unknown operand
diff --git a/llvm/test/MC/RISCV/rv64i-aliases-valid.s b/llvm/test/MC/RISCV/rv64i-aliases-valid.s
index f36446d..dde8dbe 100644
--- a/llvm/test/MC/RISCV/rv64i-aliases-valid.s
+++ b/llvm/test/MC/RISCV/rv64i-aliases-valid.s
@@ -1,4 +1,4 @@
-# RUN: llvm-mc %s -triple=riscv64 -riscv-no-aliases \
+# RUN: llvm-mc %s -triple=riscv64 -M no-aliases \
 # RUN:     | FileCheck -check-prefixes=CHECK-EXPAND,CHECK-INST,CHECK-ASM-NOALIAS %s
 # RUN: llvm-mc %s -triple=riscv64 \
 # RUN:     | FileCheck -check-prefixes=CHECK-EXPAND,CHECK-ALIAS,CHECK-ASM %s
diff --git a/llvm/test/MC/RISCV/rv64i-valid.s b/llvm/test/MC/RISCV/rv64i-valid.s
index ec101f9..7f94fbf 100644
--- a/llvm/test/MC/RISCV/rv64i-valid.s
+++ b/llvm/test/MC/RISCV/rv64i-valid.s
@@ -1,4 +1,4 @@
-# RUN: llvm-mc %s -triple=riscv64 -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv64 -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s
 # RUN: llvm-mc -filetype=obj -triple=riscv64 < %s \
 # RUN:     | llvm-objdump --no-print-imm-hex -M no-aliases -d -r - \
diff --git a/llvm/test/MC/RISCV/rv64ih-valid.s b/llvm/test/MC/RISCV/rv64ih-valid.s
index be8ccf9..13ca4f0 100644
--- a/llvm/test/MC/RISCV/rv64ih-valid.s
+++ b/llvm/test/MC/RISCV/rv64ih-valid.s
@@ -1,4 +1,4 @@
-# RUN: llvm-mc %s -triple=riscv64 -mattr=+h -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv64 -mattr=+h -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK,CHECK-INST %s
 # RUN: llvm-mc -filetype=obj -mattr=+h -triple riscv64 < %s \
 # RUN:     | llvm-objdump --mattr=+h -M no-aliases -d - \
diff --git a/llvm/test/MC/RISCV/rv64m-valid.s b/llvm/test/MC/RISCV/rv64m-valid.s
index 246f74f..21db064 100644
--- a/llvm/test/MC/RISCV/rv64m-valid.s
+++ b/llvm/test/MC/RISCV/rv64m-valid.s
@@ -1,4 +1,4 @@
-# RUN: llvm-mc %s -triple=riscv64 -mattr=+m -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv64 -mattr=+m -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s
 # RUN: llvm-mc -filetype=obj -triple=riscv64 -mattr=+m < %s \
 # RUN:     | llvm-objdump --mattr=+m -M no-aliases -d -r - \
diff --git a/llvm/test/MC/RISCV/rv64zaamo-valid.s b/llvm/test/MC/RISCV/rv64zaamo-valid.s
index 96d3e61..c401ce1 100644
--- a/llvm/test/MC/RISCV/rv64zaamo-valid.s
+++ b/llvm/test/MC/RISCV/rv64zaamo-valid.s
@@ -1,4 +1,4 @@
-# RUN: llvm-mc %s -triple=riscv64 -mattr=+a -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv64 -mattr=+a -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s
 # RUN: llvm-mc -filetype=obj -triple=riscv64 -mattr=+a < %s \
 # RUN:     | llvm-objdump --mattr=+a -M no-aliases -d -r - \
@@ -7,7 +7,7 @@
 # RUN: not llvm-mc -triple riscv32 -mattr=+a < %s 2>&1 \
 # RUN:     | FileCheck -check-prefix=CHECK-RV32 %s
 #
-# RUN: llvm-mc %s -triple=riscv64 -mattr=+zaamo -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv64 -mattr=+zaamo -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s
 # RUN: llvm-mc -filetype=obj -triple=riscv64 -mattr=+zaamo < %s \
 # RUN:     | llvm-objdump --mattr=+zaamo -M no-aliases -d -r - \
diff --git a/llvm/test/MC/RISCV/rv64zacas-valid.s b/llvm/test/MC/RISCV/rv64zacas-valid.s
index 595c70b..d90e778 100644
--- a/llvm/test/MC/RISCV/rv64zacas-valid.s
+++ b/llvm/test/MC/RISCV/rv64zacas-valid.s
@@ -1,4 +1,4 @@
-# RUN: llvm-mc %s -triple=riscv64 -mattr=+a,+zacas -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv64 -mattr=+a,+zacas -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s
 # RUN: llvm-mc -filetype=obj -triple=riscv64 -mattr=+a,+zacas < %s \
 # RUN:     | llvm-objdump --mattr=+a,+zacas -M no-aliases -d -r - \
diff --git a/llvm/test/MC/RISCV/rv64zalasr-valid.s b/llvm/test/MC/RISCV/rv64zalasr-valid.s
index 2f1e381..13d2b21 100644
--- a/llvm/test/MC/RISCV/rv64zalasr-valid.s
+++ b/llvm/test/MC/RISCV/rv64zalasr-valid.s
@@ -1,11 +1,11 @@
-# RUN: llvm-mc %s -triple=riscv64 -mattr=+experimental-zalasr -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv64 -mattr=+experimental-zalasr -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s
 # RUN: llvm-mc -filetype=obj -triple=riscv64 -mattr=+experimental-zalasr < %s \
 # RUN:     | llvm-objdump --mattr=+experimental-zalasr -M no-aliases -d -r - \
 # RUN:     | FileCheck --check-prefix=CHECK-ASM-AND-OBJ %s
 #
 # RUN: not llvm-mc -triple riscv64 \
-# RUN:     -riscv-no-aliases -show-encoding < %s 2>&1 \
+# RUN:     -M no-aliases -show-encoding < %s 2>&1 \
 # RUN:     | FileCheck --check-prefixes=CHECK-NO-EXT %s
 
 
diff --git a/llvm/test/MC/RISCV/rv64zalrsc-valid.s b/llvm/test/MC/RISCV/rv64zalrsc-valid.s
index 2bbde96..98ac38d 100644
--- a/llvm/test/MC/RISCV/rv64zalrsc-valid.s
+++ b/llvm/test/MC/RISCV/rv64zalrsc-valid.s
@@ -1,4 +1,4 @@
-# RUN: llvm-mc %s -triple=riscv64 -mattr=+a -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv64 -mattr=+a -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s
 # RUN: llvm-mc -filetype=obj -triple=riscv64 -mattr=+a < %s \
 # RUN:     | llvm-objdump --mattr=+a -M no-aliases -d -r - \
@@ -7,7 +7,7 @@
 # RUN: not llvm-mc -triple riscv32 -mattr=+a < %s 2>&1 \
 # RUN:     | FileCheck -check-prefix=CHECK-RV32 %s
 #
-# RUN: llvm-mc %s -triple=riscv64 -mattr=+zalrsc -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv64 -mattr=+zalrsc -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s
 # RUN: llvm-mc -filetype=obj -triple=riscv64 -mattr=+zalrsc < %s \
 # RUN:     | llvm-objdump --mattr=+zalrsc -M no-aliases -d -r - \
diff --git a/llvm/test/MC/RISCV/rv64zba-aliases-valid.s b/llvm/test/MC/RISCV/rv64zba-aliases-valid.s
index bb8eeb4..78ae18b 100644
--- a/llvm/test/MC/RISCV/rv64zba-aliases-valid.s
+++ b/llvm/test/MC/RISCV/rv64zba-aliases-valid.s
@@ -1,4 +1,4 @@
-# RUN: llvm-mc %s -triple=riscv64 -mattr=+zba -riscv-no-aliases \
+# RUN: llvm-mc %s -triple=riscv64 -mattr=+zba -M no-aliases \
 # RUN:     | FileCheck -check-prefixes=CHECK-S-OBJ-NOALIAS %s
 # RUN: llvm-mc %s  -triple=riscv64 -mattr=+zba \
 # RUN:     | FileCheck -check-prefixes=CHECK-S-OBJ %s
diff --git a/llvm/test/MC/RISCV/rv64zbb-aliases-valid.s b/llvm/test/MC/RISCV/rv64zbb-aliases-valid.s
index 662f360..50d6716 100644
--- a/llvm/test/MC/RISCV/rv64zbb-aliases-valid.s
+++ b/llvm/test/MC/RISCV/rv64zbb-aliases-valid.s
@@ -1,4 +1,4 @@
-# RUN: llvm-mc %s -triple=riscv64 -mattr=+zbb -riscv-no-aliases \
+# RUN: llvm-mc %s -triple=riscv64 -mattr=+zbb -M no-aliases \
 # RUN:     | FileCheck -check-prefixes=CHECK-S-OBJ-NOALIAS %s
 # RUN: llvm-mc %s  -triple=riscv64 -mattr=+zbb \
 # RUN:     | FileCheck -check-prefixes=CHECK-S-OBJ %s
diff --git a/llvm/test/MC/RISCV/rv64zbb-valid.s b/llvm/test/MC/RISCV/rv64zbb-valid.s
index 6c7327f..5617e11 100644
--- a/llvm/test/MC/RISCV/rv64zbb-valid.s
+++ b/llvm/test/MC/RISCV/rv64zbb-valid.s
@@ -1,5 +1,5 @@
 # With Bitmanip base extension:
-# RUN: llvm-mc %s -triple=riscv64 -mattr=+zbb -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv64 -mattr=+zbb -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s
 # RUN: llvm-mc -filetype=obj -triple=riscv64 -mattr=+zbb < %s \
 # RUN:     | llvm-objdump --mattr=+zbb -M no-aliases --no-print-imm-hex -d -r - \
diff --git a/llvm/test/MC/RISCV/rv64zbs-aliases-valid.s b/llvm/test/MC/RISCV/rv64zbs-aliases-valid.s
index 0bfd3ac..0379a06 100644
--- a/llvm/test/MC/RISCV/rv64zbs-aliases-valid.s
+++ b/llvm/test/MC/RISCV/rv64zbs-aliases-valid.s
@@ -1,4 +1,4 @@
-# RUN: llvm-mc %s -triple=riscv64 -mattr=+zbs -riscv-no-aliases \
+# RUN: llvm-mc %s -triple=riscv64 -mattr=+zbs -M no-aliases \
 # RUN:     | FileCheck -check-prefixes=CHECK-S-OBJ-NOALIAS %s
 # RUN: llvm-mc %s  -triple=riscv64 -mattr=+zbs \
 # RUN:     | FileCheck -check-prefixes=CHECK-S-OBJ %s
diff --git a/llvm/test/MC/RISCV/rv64zcb-valid.s b/llvm/test/MC/RISCV/rv64zcb-valid.s
index ab0550e..83e7fd6 100644
--- a/llvm/test/MC/RISCV/rv64zcb-valid.s
+++ b/llvm/test/MC/RISCV/rv64zcb-valid.s
@@ -1,14 +1,14 @@
-# RUN: llvm-mc %s -triple=riscv64 -mattr=+m,+zbb,+zba,+zcb -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv64 -mattr=+m,+zbb,+zba,+zcb -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s
 # RUN: llvm-mc -filetype=obj -triple=riscv64 -mattr=+m,+zbb,+zba,+zcb < %s \
 # RUN:     | llvm-objdump --mattr=+m,+zbb,+zba,zcb -M no-aliases -d -r - \
 # RUN:     | FileCheck --check-prefixes=CHECK-ASM-AND-OBJ %s
 #
 # RUN: not llvm-mc -triple riscv64 \
-# RUN:     -riscv-no-aliases -show-encoding < %s 2>&1 \
+# RUN:     -M no-aliases -show-encoding < %s 2>&1 \
 # RUN:     | FileCheck -check-prefixes=CHECK-NO-EXT %s
 # RUN: not llvm-mc -triple riscv32 -mattr=+m,+zbb,+zba,+zcb \
-# RUN:     -riscv-no-aliases -show-encoding < %s 2>&1 \
+# RUN:     -M no-aliases -show-encoding < %s 2>&1 \
 # RUN:     | FileCheck -check-prefixes=CHECK-NO-RV64 %s
 
 # CHECK-ASM-AND-OBJ: c.zext.w s0
diff --git a/llvm/test/MC/RISCV/rv64zcmp-invalid.s b/llvm/test/MC/RISCV/rv64zcmp-invalid.s
index 8f353e8..7e10ab5c 100644
--- a/llvm/test/MC/RISCV/rv64zcmp-invalid.s
+++ b/llvm/test/MC/RISCV/rv64zcmp-invalid.s
@@ -1,4 +1,4 @@
-# RUN: not llvm-mc -triple=riscv64 -mattr=zcmp -riscv-no-aliases -show-encoding < %s 2>&1 \
+# RUN: not llvm-mc -triple=riscv64 -mattr=zcmp -M no-aliases -show-encoding < %s 2>&1 \
 # RUN:     | FileCheck -check-prefixes=CHECK-ERROR %s
 
 # CHECK-ERROR: error: invalid operand for instruction
diff --git a/llvm/test/MC/RISCV/rv64zcmp-valid.s b/llvm/test/MC/RISCV/rv64zcmp-valid.s
index 5973f6d..c70a904 100644
--- a/llvm/test/MC/RISCV/rv64zcmp-valid.s
+++ b/llvm/test/MC/RISCV/rv64zcmp-valid.s
@@ -1,4 +1,4 @@
-# RUN: llvm-mc %s -triple=riscv64 -mattr=zcmp -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv64 -mattr=zcmp -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s
 # RUN: llvm-mc -filetype=obj -triple=riscv64 -mattr=zcmp < %s \
 # RUN:     | llvm-objdump --mattr=-c,zcmp -M no-aliases -d -r - \
diff --git a/llvm/test/MC/RISCV/rv64zdinx-valid.s b/llvm/test/MC/RISCV/rv64zdinx-valid.s
index fa603f3..411f424 100644
--- a/llvm/test/MC/RISCV/rv64zdinx-valid.s
+++ b/llvm/test/MC/RISCV/rv64zdinx-valid.s
@@ -1,4 +1,4 @@
-# RUN: llvm-mc %s -triple=riscv64 -mattr=+zdinx -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv64 -mattr=+zdinx -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s
 # RUN: llvm-mc -filetype=obj -triple=riscv64 -mattr=+zdinx %s \
 # RUN:     | llvm-objdump --mattr=+zdinx -M no-aliases -d -r - \
diff --git a/llvm/test/MC/RISCV/rv64zfh-valid.s b/llvm/test/MC/RISCV/rv64zfh-valid.s
index 5a15040..5fb8ba5 100644
--- a/llvm/test/MC/RISCV/rv64zfh-valid.s
+++ b/llvm/test/MC/RISCV/rv64zfh-valid.s
@@ -1,4 +1,4 @@
-# RUN: llvm-mc %s -triple=riscv64 -mattr=+zfh -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv64 -mattr=+zfh -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s
 # RUN: llvm-mc -filetype=obj -triple=riscv64 -mattr=+zfh < %s \
 # RUN:     | llvm-objdump --mattr=+zfh -M no-aliases -d -r - \
diff --git a/llvm/test/MC/RISCV/rv64zfinx-valid.s b/llvm/test/MC/RISCV/rv64zfinx-valid.s
index d2de9a3..63006e9 100644
--- a/llvm/test/MC/RISCV/rv64zfinx-valid.s
+++ b/llvm/test/MC/RISCV/rv64zfinx-valid.s
@@ -1,4 +1,4 @@
-# RUN: llvm-mc %s -triple=riscv64 -mattr=+zfinx -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv64 -mattr=+zfinx -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s
 # RUN: llvm-mc -filetype=obj -triple=riscv64 -mattr=+zfinx %s \
 # RUN:     | llvm-objdump --mattr=+zfinx -M no-aliases -d -r - \
diff --git a/llvm/test/MC/RISCV/rv64zhinx-valid.s b/llvm/test/MC/RISCV/rv64zhinx-valid.s
index cba252f..c6aa559 100644
--- a/llvm/test/MC/RISCV/rv64zhinx-valid.s
+++ b/llvm/test/MC/RISCV/rv64zhinx-valid.s
@@ -1,4 +1,4 @@
-# RUN: llvm-mc %s -triple=riscv64 -mattr=+zhinx -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv64 -mattr=+zhinx -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s
 # RUN: llvm-mc -filetype=obj -triple=riscv64 -mattr=+zhinx %s \
 # RUN:     | llvm-objdump --mattr=+zhinx -M no-aliases -d -r - \
diff --git a/llvm/test/MC/RISCV/rv64zhinxmin-valid.s b/llvm/test/MC/RISCV/rv64zhinxmin-valid.s
index 062844f..3489549 100644
--- a/llvm/test/MC/RISCV/rv64zhinxmin-valid.s
+++ b/llvm/test/MC/RISCV/rv64zhinxmin-valid.s
@@ -1,4 +1,4 @@
-# RUN: llvm-mc %s -triple=riscv64 -mattr=+zhinx,+zdinx -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv64 -mattr=+zhinx,+zdinx -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s
 # RUN: llvm-mc -filetype=obj -triple=riscv64 -mattr=+zhinx,+zdinx %s \
 # RUN:     | llvm-objdump --mattr=+zhinx,+zdinx -M no-aliases -d -r - \
diff --git a/llvm/test/MC/RISCV/rv64zicfiss-invalid.s b/llvm/test/MC/RISCV/rv64zicfiss-invalid.s
index 1296940..fc69c68 100644
--- a/llvm/test/MC/RISCV/rv64zicfiss-invalid.s
+++ b/llvm/test/MC/RISCV/rv64zicfiss-invalid.s
@@ -1,4 +1,4 @@
-# RUN: not llvm-mc %s -triple=riscv64 -mattr=+experimental-zicfiss,+c -riscv-no-aliases -show-encoding \
+# RUN: not llvm-mc %s -triple=riscv64 -mattr=+experimental-zicfiss,+c -M no-aliases -show-encoding \
 # RUN:     2>&1 | FileCheck -check-prefixes=CHECK-ERR %s
 
 # CHECK-ERR: error: invalid operand for instruction
diff --git a/llvm/test/MC/RISCV/rv64zmmul-invalid.s b/llvm/test/MC/RISCV/rv64zmmul-invalid.s
index 026b0a4..3d27ed9 100644
--- a/llvm/test/MC/RISCV/rv64zmmul-invalid.s
+++ b/llvm/test/MC/RISCV/rv64zmmul-invalid.s
@@ -1,4 +1,4 @@
-# RUN: not llvm-mc %s -triple=riscv64 -mattr=+zmmul -riscv-no-aliases 2>&1 \
+# RUN: not llvm-mc %s -triple=riscv64 -mattr=+zmmul -M no-aliases 2>&1 \
 # RUN:  | FileCheck -check-prefixes=CHECK-ERROR %s
 
 # CHECK-ERROR: 5:1: error: instruction requires the following: 'M' (Integer Multiplication and Division){{$}}
diff --git a/llvm/test/MC/RISCV/rv64zmmul-valid.s b/llvm/test/MC/RISCV/rv64zmmul-valid.s
index 80d05ac..b287d89 100644
--- a/llvm/test/MC/RISCV/rv64zmmul-valid.s
+++ b/llvm/test/MC/RISCV/rv64zmmul-valid.s
@@ -1,4 +1,4 @@
-# RUN: llvm-mc %s -triple=riscv64 -mattr=+zmmul -riscv-no-aliases 2>&1 \
+# RUN: llvm-mc %s -triple=riscv64 -mattr=+zmmul -M no-aliases 2>&1 \
 # RUN:  | FileCheck -check-prefixes=CHECK-INST %s
 
 # CHECK-INST: mulw ra, sp, gp
diff --git a/llvm/test/MC/RISCV/rva-aliases-valid.s b/llvm/test/MC/RISCV/rva-aliases-valid.s
index 57d96b5..7b33407 100644
--- a/llvm/test/MC/RISCV/rva-aliases-valid.s
+++ b/llvm/test/MC/RISCV/rva-aliases-valid.s
@@ -1,8 +1,8 @@
-# RUN: llvm-mc %s -triple=riscv32 -mattr=+a -riscv-no-aliases \
+# RUN: llvm-mc %s -triple=riscv32 -mattr=+a -M no-aliases \
 # RUN:     | FileCheck -check-prefixes=CHECK-S-NOALIAS,CHECK-S-OBJ-NOALIAS %s
 # RUN: llvm-mc %s -triple=riscv32 -mattr=+a \
 # RUN:     | FileCheck -check-prefixes=CHECK-S,CHECK-S-OBJ %s
-# RUN: llvm-mc %s -triple=riscv64 -mattr=+a -riscv-no-aliases\
+# RUN: llvm-mc %s -triple=riscv64 -mattr=+a -M no-aliases\
 # RUN:     | FileCheck -check-prefixes=CHECK-S-NOALIAS,CHECK-S-OBJ-NOALIAS %s
 # RUN: llvm-mc %s -triple=riscv64 -mattr=+a \
 # RUN:     | FileCheck -check-prefixes=CHECK-S,CHECK-S-OBJ %s
diff --git a/llvm/test/MC/RISCV/rvc-aliases-valid.s b/llvm/test/MC/RISCV/rvc-aliases-valid.s
index fa73922..23398f9 100644
--- a/llvm/test/MC/RISCV/rvc-aliases-valid.s
+++ b/llvm/test/MC/RISCV/rvc-aliases-valid.s
@@ -1,6 +1,6 @@
-# RUN: llvm-mc %s -triple=riscv32 -mattr=+c -riscv-no-aliases \
+# RUN: llvm-mc %s -triple=riscv32 -mattr=+c -M no-aliases \
 # RUN:     | FileCheck -check-prefixes=CHECK-EXPAND %s
-# RUN: llvm-mc %s -triple=riscv64 -mattr=+c -riscv-no-aliases \
+# RUN: llvm-mc %s -triple=riscv64 -mattr=+c -M no-aliases \
 # RUN:     | FileCheck -check-prefixes=CHECK-EXPAND %s
 # RUN: llvm-mc -filetype=obj -triple riscv32 -mattr=+c < %s \
 # RUN:     | llvm-objdump --no-print-imm-hex -M no-aliases -d - \
diff --git a/llvm/test/MC/RISCV/rvc-hints-valid.s b/llvm/test/MC/RISCV/rvc-hints-valid.s
index 562fe65..5dc86d1 100644
--- a/llvm/test/MC/RISCV/rvc-hints-valid.s
+++ b/llvm/test/MC/RISCV/rvc-hints-valid.s
@@ -1,6 +1,6 @@
-# RUN: llvm-mc %s -triple=riscv32 -mattr=+c -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv32 -mattr=+c -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s
-# RUN: llvm-mc %s -triple riscv64 -mattr=+c -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple riscv64 -mattr=+c -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s
 # RUN: llvm-mc -filetype=obj -triple=riscv32 -mattr=+c < %s \
 # RUN:     | llvm-objdump -M no-aliases --no-print-imm-hex -d -r - \
diff --git a/llvm/test/MC/RISCV/rvc-valid.s b/llvm/test/MC/RISCV/rvc-valid.s
index 9b0ca80..798bff8 100644
--- a/llvm/test/MC/RISCV/rvc-valid.s
+++ b/llvm/test/MC/RISCV/rvc-valid.s
@@ -1,26 +1,26 @@
-# RUN: llvm-mc %s -triple=riscv32 -mattr=+c -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv32 -mattr=+c -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s
 # RUN: llvm-mc -filetype=obj -triple=riscv32 -mattr=+c < %s \
 # RUN:     | llvm-objdump --mattr=+c --no-print-imm-hex -M no-aliases -d -r - \
 # RUN:     | FileCheck --check-prefixes=CHECK-OBJ,CHECK-ASM-AND-OBJ %s
-# RUN: llvm-mc %s -triple=riscv32 -mattr=+zca -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv32 -mattr=+zca -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s
 # RUN: llvm-mc -filetype=obj -triple=riscv32 -mattr=+zca < %s \
 # RUN:     | llvm-objdump --mattr=+zca --no-print-imm-hex -M no-aliases -d -r - \
 # RUN:     | FileCheck --check-prefixes=CHECK-OBJ,CHECK-ASM-AND-OBJ %s
-# RUN: llvm-mc %s -triple=riscv64 -mattr=+c -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv64 -mattr=+c -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s
 # RUN: llvm-mc -filetype=obj -triple=riscv64 -mattr=+c < %s \
 # RUN:     | llvm-objdump --mattr=+c --no-print-imm-hex -M no-aliases -d -r - \
 # RUN:     | FileCheck --check-prefix=CHECK-ASM-AND-OBJ %s
-# RUN: llvm-mc %s -triple=riscv64 -mattr=+zca -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv64 -mattr=+zca -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s
 # RUN: llvm-mc -filetype=obj -triple=riscv64 -mattr=+zca < %s \
 # RUN:     | llvm-objdump --mattr=+zca --no-print-imm-hex -M no-aliases -d -r - \
 # RUN:     | FileCheck --check-prefix=CHECK-ASM-AND-OBJ %s
 
 # RUN: not llvm-mc -triple riscv32 \
-# RUN:     -riscv-no-aliases -show-encoding < %s 2>&1 \
+# RUN:     -M no-aliases -show-encoding < %s 2>&1 \
 # RUN:     | FileCheck -check-prefixes=CHECK-NO-EXT %s
 
 # TODO: more exhaustive testing of immediate encoding.
diff --git a/llvm/test/MC/RISCV/rvd-aliases-valid.s b/llvm/test/MC/RISCV/rvd-aliases-valid.s
index 58478195..9832b73 100644
--- a/llvm/test/MC/RISCV/rvd-aliases-valid.s
+++ b/llvm/test/MC/RISCV/rvd-aliases-valid.s
@@ -1,8 +1,8 @@
-# RUN: llvm-mc %s -triple=riscv32 -mattr=+d -riscv-no-aliases \
+# RUN: llvm-mc %s -triple=riscv32 -mattr=+d -M no-aliases \
 # RUN:     | FileCheck -check-prefix=CHECK-INST %s
 # RUN: llvm-mc %s -triple=riscv32 -mattr=+d \
 # RUN:     | FileCheck -check-prefix=CHECK-ALIAS %s
-# RUN: llvm-mc %s -triple=riscv64 -mattr=+d -riscv-no-aliases \
+# RUN: llvm-mc %s -triple=riscv64 -mattr=+d -M no-aliases \
 # RUN:     | FileCheck -check-prefix=CHECK-INST %s
 # RUN: llvm-mc %s -triple=riscv64 -mattr=+d \
 # RUN:     | FileCheck -check-prefix=CHECK-ALIAS %s
diff --git a/llvm/test/MC/RISCV/rvd-valid.s b/llvm/test/MC/RISCV/rvd-valid.s
index f782900..5135562 100644
--- a/llvm/test/MC/RISCV/rvd-valid.s
+++ b/llvm/test/MC/RISCV/rvd-valid.s
@@ -1,9 +1,9 @@
-# RUN: llvm-mc %s -triple=riscv32 -mattr=+d -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv32 -mattr=+d -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s
 # RUN: llvm-mc -filetype=obj -triple=riscv32 -mattr=+d < %s \
 # RUN:     | llvm-objdump --no-print-imm-hex --mattr=+d -M no-aliases -d -r - \
 # RUN:     | FileCheck --check-prefix=CHECK-ASM-AND-OBJ %s
-# RUN: llvm-mc %s -triple=riscv64 -mattr=+d -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv64 -mattr=+d -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s
 # RUN: llvm-mc -filetype=obj -triple=riscv64 -mattr=+d < %s \
 # RUN:     | llvm-objdump --no-print-imm-hex --mattr=+d -M no-aliases -d -r - \
diff --git a/llvm/test/MC/RISCV/rvdc-aliases-valid.s b/llvm/test/MC/RISCV/rvdc-aliases-valid.s
index f74ee05..083c4d2 100644
--- a/llvm/test/MC/RISCV/rvdc-aliases-valid.s
+++ b/llvm/test/MC/RISCV/rvdc-aliases-valid.s
@@ -1,6 +1,6 @@
-# RUN: llvm-mc %s -triple=riscv32 -mattr=+c,+d -riscv-no-aliases \
+# RUN: llvm-mc %s -triple=riscv32 -mattr=+c,+d -M no-aliases \
 # RUN:     | FileCheck -check-prefixes=CHECK-EXPAND %s
-# RUN: llvm-mc %s -triple=riscv64 -mattr=+c,+d -riscv-no-aliases \
+# RUN: llvm-mc %s -triple=riscv64 -mattr=+c,+d -M no-aliases \
 # RUN:     | FileCheck -check-prefixes=CHECK-EXPAND %s
 # RUN: llvm-mc -filetype=obj -triple riscv32 -mattr=+c,+d < %s \
 # RUN:     | llvm-objdump --mattr=+c,+d --no-print-imm-hex -M no-aliases -d - \
diff --git a/llvm/test/MC/RISCV/rve-valid.s b/llvm/test/MC/RISCV/rve-valid.s
index ccb47f1..d151c07 100644
--- a/llvm/test/MC/RISCV/rve-valid.s
+++ b/llvm/test/MC/RISCV/rve-valid.s
@@ -1,9 +1,9 @@
-# RUN: llvm-mc %s -triple=riscv32 -riscv-no-aliases -mattr=+e -show-encoding \
+# RUN: llvm-mc %s -triple=riscv32 -M no-aliases -mattr=+e -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s
 # RUN: llvm-mc -filetype=obj -triple=riscv32 -mattr=+e < %s \
 # RUN:     | llvm-objdump --no-print-imm-hex -M no-aliases -d -r - \
 # RUN:     | FileCheck -check-prefixes=CHECK-OBJ,CHECK-ASM-AND-OBJ %s
-# RUN: llvm-mc %s -triple=riscv64 -riscv-no-aliases -mattr=+e -show-encoding \
+# RUN: llvm-mc %s -triple=riscv64 -M no-aliases -mattr=+e -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s
 # RUN: llvm-mc -filetype=obj -triple=riscv64 -mattr=+e < %s \
 # RUN:     | llvm-objdump --no-print-imm-hex -M no-aliases -d -r - \
diff --git a/llvm/test/MC/RISCV/rvf-aliases-valid.s b/llvm/test/MC/RISCV/rvf-aliases-valid.s
index 0430e2a..e0b63ec 100644
--- a/llvm/test/MC/RISCV/rvf-aliases-valid.s
+++ b/llvm/test/MC/RISCV/rvf-aliases-valid.s
@@ -1,8 +1,8 @@
-# RUN: llvm-mc %s -triple=riscv32 -mattr=+f -riscv-no-aliases \
+# RUN: llvm-mc %s -triple=riscv32 -mattr=+f -M no-aliases \
 # RUN:     | FileCheck -check-prefix=CHECK-INST %s
 # RUN: llvm-mc %s -triple=riscv32 -mattr=+f \
 # RUN:     | FileCheck -check-prefix=CHECK-ALIAS %s
-# RUN: llvm-mc %s -triple=riscv64 -mattr=+f -riscv-no-aliases \
+# RUN: llvm-mc %s -triple=riscv64 -mattr=+f -M no-aliases \
 # RUN:     | FileCheck -check-prefix=CHECK-INST %s
 # RUN: llvm-mc %s -triple=riscv64 -mattr=+f \
 # RUN:     | FileCheck -check-prefix=CHECK-ALIAS %s
diff --git a/llvm/test/MC/RISCV/rvf-user-csr-names.s b/llvm/test/MC/RISCV/rvf-user-csr-names.s
index 7b7569d..697c333 100644
--- a/llvm/test/MC/RISCV/rvf-user-csr-names.s
+++ b/llvm/test/MC/RISCV/rvf-user-csr-names.s
@@ -1,4 +1,4 @@
-# RUN: llvm-mc %s -triple=riscv32 -riscv-no-aliases -mattr=+f -show-encoding \
+# RUN: llvm-mc %s -triple=riscv32 -M no-aliases -mattr=+f -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-INST,CHECK-ENC %s
 # RUN: llvm-mc -filetype=obj -triple riscv32 -mattr=+f < %s \
 # RUN:     | llvm-objdump -d --mattr=+f - \
@@ -7,7 +7,7 @@
 # RUN:     | llvm-objdump -d - \
 # RUN:     | FileCheck -check-prefix=CHECK-INST-ALIAS-NO-F %s
 #
-# RUN: llvm-mc %s -triple=riscv64 -riscv-no-aliases -mattr=+f -show-encoding \
+# RUN: llvm-mc %s -triple=riscv64 -M no-aliases -mattr=+f -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-INST,CHECK-ENC %s
 # RUN: llvm-mc -filetype=obj -triple riscv64 -mattr=+f < %s \
 # RUN:     | llvm-objdump -d --mattr=+f - \
diff --git a/llvm/test/MC/RISCV/rvf-valid.s b/llvm/test/MC/RISCV/rvf-valid.s
index 77b5df0..49b67a0 100644
--- a/llvm/test/MC/RISCV/rvf-valid.s
+++ b/llvm/test/MC/RISCV/rvf-valid.s
@@ -1,6 +1,6 @@
-# RUN: llvm-mc %s -triple=riscv32 -mattr=+f -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv32 -mattr=+f -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s
-# RUN: llvm-mc %s -triple=riscv64 -mattr=+f -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv64 -mattr=+f -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s
 # RUN: llvm-mc -filetype=obj -triple=riscv32 -mattr=+f < %s \
 # RUN:     | llvm-objdump --mattr=+f --no-print-imm-hex -M no-aliases -d -r - \
diff --git a/llvm/test/MC/RISCV/rvi-valid.s b/llvm/test/MC/RISCV/rvi-valid.s
index 86b508a..25b72d4 100644
--- a/llvm/test/MC/RISCV/rvi-valid.s
+++ b/llvm/test/MC/RISCV/rvi-valid.s
@@ -1,6 +1,6 @@
-# RUN: llvm-mc %s -triple=riscv32 -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv32 -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s
-# RUN: llvm-mc %s -triple riscv64 -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple riscv64 -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s
 # RUN: llvm-mc -filetype=obj -triple=riscv32 < %s \
 # RUN:     | llvm-objdump -M no-aliases --no-print-imm-hex -d -r - \
diff --git a/llvm/test/MC/RISCV/rvih-valid.s b/llvm/test/MC/RISCV/rvih-valid.s
index 6f80a24..a3a9d29 100644
--- a/llvm/test/MC/RISCV/rvih-valid.s
+++ b/llvm/test/MC/RISCV/rvih-valid.s
@@ -1,6 +1,6 @@
-# RUN: llvm-mc %s -triple=riscv32 -mattr=+h -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv32 -mattr=+h -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK,CHECK-INST %s
-# RUN: llvm-mc %s -triple=riscv64 -mattr=+h -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv64 -mattr=+h -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK,CHECK-INST %s
 # RUN: llvm-mc -filetype=obj -triple riscv32 -mattr=+h < %s \
 # RUN:     | llvm-objdump --mattr=+h -M no-aliases -d - \
diff --git a/llvm/test/MC/RISCV/rvk-user-csr-name.s b/llvm/test/MC/RISCV/rvk-user-csr-name.s
index 0615da0..0a3cb2d 100644
--- a/llvm/test/MC/RISCV/rvk-user-csr-name.s
+++ b/llvm/test/MC/RISCV/rvk-user-csr-name.s
@@ -1,10 +1,10 @@
-# RUN: llvm-mc %s -triple=riscv32 -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv32 -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-INST,CHECK-ENC %s
 # RUN: llvm-mc -filetype=obj -triple riscv32 -mattr=+zkr < %s \
 # RUN:     | llvm-objdump -d --mattr=+zkr - \
 # RUN:     | FileCheck -check-prefix=CHECK-INST-ALIAS %s
 #
-# RUN: llvm-mc %s -triple=riscv64 -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv64 -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-INST,CHECK-ENC %s
 # RUN: llvm-mc -filetype=obj -triple riscv64 -mattr=+zkr < %s \
 # RUN:     | llvm-objdump -d --mattr=+zkr - \
diff --git a/llvm/test/MC/RISCV/rvm-valid.s b/llvm/test/MC/RISCV/rvm-valid.s
index 8fb54bb..d0612c4 100644
--- a/llvm/test/MC/RISCV/rvm-valid.s
+++ b/llvm/test/MC/RISCV/rvm-valid.s
@@ -1,6 +1,6 @@
-# RUN: llvm-mc %s -triple=riscv32 -mattr=+m -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv32 -mattr=+m -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s
-# RUN: llvm-mc %s -triple=riscv64 -mattr=+m -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv64 -mattr=+m -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s
 # RUN: llvm-mc -filetype=obj -triple=riscv32 -mattr=+m < %s \
 # RUN:     | llvm-objdump --mattr=+m -M no-aliases -d -r - \
diff --git a/llvm/test/MC/RISCV/rvv-user-csr-names.s b/llvm/test/MC/RISCV/rvv-user-csr-names.s
index 71d4033..1f691bb 100644
--- a/llvm/test/MC/RISCV/rvv-user-csr-names.s
+++ b/llvm/test/MC/RISCV/rvv-user-csr-names.s
@@ -1,10 +1,10 @@
-# RUN: llvm-mc %s -triple=riscv32 -riscv-no-aliases -mattr=+f -show-encoding \
+# RUN: llvm-mc %s -triple=riscv32 -M no-aliases -mattr=+f -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-INST,CHECK-ENC %s
 # RUN: llvm-mc -filetype=obj -triple riscv32 -mattr=+v < %s \
 # RUN:     | llvm-objdump -d --mattr=+v - \
 # RUN:     | FileCheck -check-prefix=CHECK-INST-ALIAS %s
 #
-# RUN: llvm-mc %s -triple=riscv64 -riscv-no-aliases -mattr=+f -show-encoding \
+# RUN: llvm-mc %s -triple=riscv64 -M no-aliases -mattr=+f -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-INST,CHECK-ENC %s
 # RUN: llvm-mc -filetype=obj -triple riscv64 -mattr=+v < %s \
 # RUN:     | llvm-objdump -d --mattr=+v - \
diff --git a/llvm/test/MC/RISCV/rvv/aliases.s b/llvm/test/MC/RISCV/rvv/aliases.s
index 0dadeb1..c36bdb53 100644
--- a/llvm/test/MC/RISCV/rvv/aliases.s
+++ b/llvm/test/MC/RISCV/rvv/aliases.s
@@ -1,6 +1,6 @@
 # RUN: llvm-mc --triple=riscv64 -mattr +v < %s --show-encoding 2>&1 \
 # RUN:   -mattr +d | FileCheck --check-prefix=ALIAS %s
-# RUN: llvm-mc --triple=riscv64 -mattr=+v --riscv-no-aliases < %s \
+# RUN: llvm-mc --triple=riscv64 -mattr=+v --M no-aliases < %s \
 # RUN:   -mattr +d --show-encoding 2>&1 | FileCheck --check-prefix=NO-ALIAS %s
 
 # ALIAS:    vwcvt.x.x.v     v2, v1, v0.t    # encoding: [0x57,0x61,0x10,0xc4]
diff --git a/llvm/test/MC/RISCV/rvv/fothers.s b/llvm/test/MC/RISCV/rvv/fothers.s
index 997115f..0236d31 100644
--- a/llvm/test/MC/RISCV/rvv/fothers.s
+++ b/llvm/test/MC/RISCV/rvv/fothers.s
@@ -1,5 +1,5 @@
 # RUN: llvm-mc -triple=riscv64 -show-encoding --mattr=+v %s \
-# RUN:   --mattr=+f --riscv-no-aliases \
+# RUN:   --mattr=+f --M no-aliases \
 # RUN:   | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
 # RUN: not llvm-mc -triple=riscv64 -show-encoding %s 2>&1 \
 # RUN:   | FileCheck %s --check-prefix=CHECK-ERROR
diff --git a/llvm/test/MC/RISCV/rvv/freduction.s b/llvm/test/MC/RISCV/rvv/freduction.s
index 1232694..190d60f 100644
--- a/llvm/test/MC/RISCV/rvv/freduction.s
+++ b/llvm/test/MC/RISCV/rvv/freduction.s
@@ -1,5 +1,5 @@
 # RUN: llvm-mc -triple=riscv64 -show-encoding --mattr=+v %s \
-# RUN:         --mattr=+f --riscv-no-aliases \
+# RUN:         --mattr=+f --M no-aliases \
 # RUN:        | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
 # RUN: not llvm-mc -triple=riscv64 -show-encoding %s 2>&1 \
 # RUN:        | FileCheck %s --check-prefix=CHECK-ERROR
diff --git a/llvm/test/MC/RISCV/rvv/load.s b/llvm/test/MC/RISCV/rvv/load.s
index 3c251a3..9cd0ab4 100644
--- a/llvm/test/MC/RISCV/rvv/load.s
+++ b/llvm/test/MC/RISCV/rvv/load.s
@@ -1,5 +1,5 @@
 # RUN: llvm-mc -triple=riscv64 -show-encoding --mattr=+v %s \
-# RUN:   --riscv-no-aliases | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
+# RUN:   --M no-aliases | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
 # RUN: not llvm-mc -triple=riscv64 -show-encoding %s 2>&1 \
 # RUN:   | FileCheck %s --check-prefix=CHECK-ERROR
 # RUN: llvm-mc -triple=riscv64 -filetype=obj --mattr=+v %s \
diff --git a/llvm/test/MC/RISCV/rvv/others.s b/llvm/test/MC/RISCV/rvv/others.s
index cc16a87..0414585 100644
--- a/llvm/test/MC/RISCV/rvv/others.s
+++ b/llvm/test/MC/RISCV/rvv/others.s
@@ -1,5 +1,5 @@
 # RUN: llvm-mc -triple=riscv64 -show-encoding --mattr=+v %s \
-# RUN:   --riscv-no-aliases | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
+# RUN:   --M no-aliases | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
 # RUN: not llvm-mc -triple=riscv64 -show-encoding %s 2>&1 \
 # RUN:   | FileCheck %s --check-prefix=CHECK-ERROR
 # RUN: llvm-mc -triple=riscv64 -filetype=obj --mattr=+v %s \
diff --git a/llvm/test/MC/RISCV/rvv/store.s b/llvm/test/MC/RISCV/rvv/store.s
index c6a3470..ca9bb13 100644
--- a/llvm/test/MC/RISCV/rvv/store.s
+++ b/llvm/test/MC/RISCV/rvv/store.s
@@ -1,5 +1,5 @@
 # RUN: llvm-mc -triple=riscv64 -show-encoding --mattr=+v %s \
-# RUN:   --riscv-no-aliases | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
+# RUN:   --M no-aliases | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
 # RUN: not llvm-mc -triple=riscv64 -show-encoding %s 2>&1 \
 # RUN:   | FileCheck %s --check-prefix=CHECK-ERROR
 # RUN: llvm-mc -triple=riscv64 -filetype=obj --mattr=+v %s \
diff --git a/llvm/test/MC/RISCV/rvv/zvlsseg.s b/llvm/test/MC/RISCV/rvv/zvlsseg.s
index 65089e2..479d2f9 100644
--- a/llvm/test/MC/RISCV/rvv/zvlsseg.s
+++ b/llvm/test/MC/RISCV/rvv/zvlsseg.s
@@ -1,5 +1,5 @@
 # RUN: llvm-mc -triple=riscv64 -show-encoding --mattr=+v %s \
-# RUN:   --riscv-no-aliases \
+# RUN:   --M no-aliases \
 # RUN:   | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
 # RUN: not llvm-mc -triple=riscv64 -show-encoding %s 2>&1 \
 # RUN:   | FileCheck %s --check-prefix=CHECK-ERROR
diff --git a/llvm/test/MC/RISCV/rvzaamo-valid.s b/llvm/test/MC/RISCV/rvzaamo-valid.s
index d9ba6ef..e4805aa 100644
--- a/llvm/test/MC/RISCV/rvzaamo-valid.s
+++ b/llvm/test/MC/RISCV/rvzaamo-valid.s
@@ -1,6 +1,6 @@
-# RUN: llvm-mc %s -triple=riscv32 -mattr=+a -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv32 -mattr=+a -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s
-# RUN: llvm-mc %s -triple=riscv64 -mattr=+a -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv64 -mattr=+a -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s
 # RUN: llvm-mc -filetype=obj -triple=riscv32 -mattr=+a < %s \
 # RUN:     | llvm-objdump --mattr=+a -M no-aliases -d -r - \
@@ -8,9 +8,9 @@
 # RUN: llvm-mc -filetype=obj -triple=riscv64 -mattr=+a < %s \
 # RUN:     | llvm-objdump --mattr=+a -M no-aliases -d -r - \
 # RUN:     | FileCheck --check-prefix=CHECK-ASM-AND-OBJ %s
-# RUN: llvm-mc %s -triple=riscv32 -mattr=+zaamo -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv32 -mattr=+zaamo -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s
-# RUN: llvm-mc %s -triple=riscv64 -mattr=+zaamo -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv64 -mattr=+zaamo -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s
 # RUN: llvm-mc -filetype=obj -triple=riscv32 -mattr=+zaamo < %s \
 # RUN:     | llvm-objdump --mattr=+zaamo -M no-aliases -d -r - \
diff --git a/llvm/test/MC/RISCV/rvzabha-valid.s b/llvm/test/MC/RISCV/rvzabha-valid.s
index a3c61db..2b1b1e0 100644
--- a/llvm/test/MC/RISCV/rvzabha-valid.s
+++ b/llvm/test/MC/RISCV/rvzabha-valid.s
@@ -1,6 +1,6 @@
-# RUN: llvm-mc %s -triple=riscv32 -mattr=+a,+zabha -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv32 -mattr=+a,+zabha -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s
-# RUN: llvm-mc %s -triple=riscv64 -mattr=+a,+zabha -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv64 -mattr=+a,+zabha -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s
 # RUN: llvm-mc -filetype=obj -triple=riscv32 -mattr=+a,+zabha < %s \
 # RUN:     | llvm-objdump --mattr=+a,+zabha -M no-aliases -d -r - \
diff --git a/llvm/test/MC/RISCV/rvzabha-zacas-valid.s b/llvm/test/MC/RISCV/rvzabha-zacas-valid.s
index 97afb9d..4e271e4 100644
--- a/llvm/test/MC/RISCV/rvzabha-zacas-valid.s
+++ b/llvm/test/MC/RISCV/rvzabha-zacas-valid.s
@@ -1,6 +1,6 @@
-# RUN: llvm-mc %s -triple=riscv32 -mattr=+a,+zabha,+zacas -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv32 -mattr=+a,+zabha,+zacas -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s
-# RUN: llvm-mc %s -triple=riscv64 -mattr=+a,+zabha,+zacas -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv64 -mattr=+a,+zabha,+zacas -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s
 # RUN: llvm-mc -filetype=obj -triple=riscv32 -mattr=+a,+zabha,+zacas < %s \
 # RUN:     | llvm-objdump --mattr=+a,+zabha,+zacas -M no-aliases -d -r - \
diff --git a/llvm/test/MC/RISCV/rvzacas-valid.s b/llvm/test/MC/RISCV/rvzacas-valid.s
index 0e76f02..2524001 100644
--- a/llvm/test/MC/RISCV/rvzacas-valid.s
+++ b/llvm/test/MC/RISCV/rvzacas-valid.s
@@ -1,6 +1,6 @@
-# RUN: llvm-mc %s -triple=riscv32 -mattr=+a,+zacas -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv32 -mattr=+a,+zacas -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s
-# RUN: llvm-mc %s -triple=riscv64 -mattr=+a,+zacas -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv64 -mattr=+a,+zacas -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s
 # RUN: llvm-mc -filetype=obj -triple=riscv32 -mattr=+a,+zacas < %s \
 # RUN:     | llvm-objdump --mattr=+a,+zacas -M no-aliases -d -r - \
diff --git a/llvm/test/MC/RISCV/rvzalasr-valid.s b/llvm/test/MC/RISCV/rvzalasr-valid.s
index 7b2668b..11487ee 100644
--- a/llvm/test/MC/RISCV/rvzalasr-valid.s
+++ b/llvm/test/MC/RISCV/rvzalasr-valid.s
@@ -1,19 +1,19 @@
-# RUN: llvm-mc %s -triple=riscv32 -mattr=+experimental-zalasr -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv32 -mattr=+experimental-zalasr -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s
 # RUN: llvm-mc -filetype=obj -triple=riscv32 -mattr=+experimental-zalasr < %s \
 # RUN:     | llvm-objdump --mattr=+experimental-zalasr -M no-aliases -d -r - \
 # RUN:     | FileCheck --check-prefix=CHECK-ASM-AND-OBJ %s
-# RUN: llvm-mc %s -triple=riscv64 -mattr=+experimental-zalasr -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv64 -mattr=+experimental-zalasr -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s
 # RUN: llvm-mc -filetype=obj -triple=riscv64 -mattr=+experimental-zalasr < %s \
 # RUN:     | llvm-objdump --mattr=+experimental-zalasr -M no-aliases -d -r - \
 # RUN:     | FileCheck --check-prefix=CHECK-ASM-AND-OBJ %s
 #
 # RUN: not llvm-mc -triple riscv32 \
-# RUN:     -riscv-no-aliases -show-encoding < %s 2>&1 \
+# RUN:     -M no-aliases -show-encoding < %s 2>&1 \
 # RUN:     | FileCheck --check-prefixes=CHECK-NO-EXT %s
 # RUN: not llvm-mc -triple riscv64 \
-# RUN:     -riscv-no-aliases -show-encoding < %s 2>&1 \
+# RUN:     -M no-aliases -show-encoding < %s 2>&1 \
 # RUN:     | FileCheck --check-prefixes=CHECK-NO-EXT %s
 
 # CHECK-ASM-AND-OBJ: lb.aq t1, (a0)
diff --git a/llvm/test/MC/RISCV/rvzalrsc-valid.s b/llvm/test/MC/RISCV/rvzalrsc-valid.s
index f84c0fd..01883da 100644
--- a/llvm/test/MC/RISCV/rvzalrsc-valid.s
+++ b/llvm/test/MC/RISCV/rvzalrsc-valid.s
@@ -1,6 +1,6 @@
-# RUN: llvm-mc %s -triple=riscv32 -mattr=+a -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv32 -mattr=+a -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s
-# RUN: llvm-mc %s -triple=riscv64 -mattr=+a -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv64 -mattr=+a -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s
 # RUN: llvm-mc -filetype=obj -triple=riscv32 -mattr=+a < %s \
 # RUN:     | llvm-objdump --mattr=+a -M no-aliases -d -r - \
@@ -8,9 +8,9 @@
 # RUN: llvm-mc -filetype=obj -triple=riscv64 -mattr=+a < %s \
 # RUN:     | llvm-objdump --mattr=+a -M no-aliases -d -r - \
 # RUN:     | FileCheck --check-prefix=CHECK-ASM-AND-OBJ %s
-# RUN: llvm-mc %s -triple=riscv32 -mattr=+zalrsc -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv32 -mattr=+zalrsc -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s
-# RUN: llvm-mc %s -triple=riscv64 -mattr=+zalrsc -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv64 -mattr=+zalrsc -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s
 # RUN: llvm-mc -filetype=obj -triple=riscv32 -mattr=+zalrsc < %s \
 # RUN:     | llvm-objdump --mattr=+zalrsc -M no-aliases -d -r - \
diff --git a/llvm/test/MC/RISCV/rvzbb-valid.s b/llvm/test/MC/RISCV/rvzbb-valid.s
index 1ed069e..1b060be 100644
--- a/llvm/test/MC/RISCV/rvzbb-valid.s
+++ b/llvm/test/MC/RISCV/rvzbb-valid.s
@@ -1,7 +1,7 @@
 # With Bitmanip base extension:
-# RUN: llvm-mc %s -triple=riscv32 -mattr=+zbb -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv32 -mattr=+zbb -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s
-# RUN: llvm-mc %s -triple=riscv64 -mattr=+zbb -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv64 -mattr=+zbb -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s
 # RUN: llvm-mc -filetype=obj -triple=riscv32 -mattr=+zbb < %s \
 # RUN:     | llvm-objdump --mattr=+zbb --no-print-imm-hex -M no-aliases -d -r - \
diff --git a/llvm/test/MC/RISCV/rvzcb-invalid.s b/llvm/test/MC/RISCV/rvzcb-invalid.s
index 2f543b2..f53ab25 100644
--- a/llvm/test/MC/RISCV/rvzcb-invalid.s
+++ b/llvm/test/MC/RISCV/rvzcb-invalid.s
@@ -1,6 +1,6 @@
-# RUN: not llvm-mc -triple=riscv32 -mattr=zcb -riscv-no-aliases -show-encoding %s 2>&1 \
+# RUN: not llvm-mc -triple=riscv32 -mattr=zcb -M no-aliases -show-encoding %s 2>&1 \
 # RUN:     | FileCheck -check-prefixes=CHECK-ERROR %s
-# RUN: not llvm-mc -triple=riscv64 -mattr=zcb -riscv-no-aliases -show-encoding < %s 2>&1 \
+# RUN: not llvm-mc -triple=riscv64 -mattr=zcb -M no-aliases -show-encoding < %s 2>&1 \
 # RUN:     | FileCheck -check-prefixes=CHECK-ERROR %s
 
 # CHECK-ERROR: error: immediate must be an integer in the range [0, 3]
diff --git a/llvm/test/MC/RISCV/rvzcb-valid.s b/llvm/test/MC/RISCV/rvzcb-valid.s
index de25a38..b78ecef 100644
--- a/llvm/test/MC/RISCV/rvzcb-valid.s
+++ b/llvm/test/MC/RISCV/rvzcb-valid.s
@@ -1,19 +1,19 @@
-# RUN: llvm-mc %s -triple=riscv32 -mattr=+m,+zbb,+zba,+zcb -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv32 -mattr=+m,+zbb,+zba,+zcb -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s
 # RUN: llvm-mc -filetype=obj -triple=riscv32 -mattr=+m,+zbb,+zba,+zcb < %s \
 # RUN:     | llvm-objdump --mattr=+m,+zbb,+zba,+zcb --no-print-imm-hex -M no-aliases -d -r - \
 # RUN:     | FileCheck --check-prefixes=CHECK-ASM-AND-OBJ %s
-# RUN: llvm-mc %s -triple=riscv64 -mattr=+m,+zbb,+zba,+zcb -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv64 -mattr=+m,+zbb,+zba,+zcb -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s
 # RUN: llvm-mc -filetype=obj -triple=riscv64 -mattr=+m,+zbb,+zba,+zcb < %s \
 # RUN:     | llvm-objdump --mattr=+m,+zbb,+zba,zcb --no-print-imm-hex -M no-aliases -d -r - \
 # RUN:     | FileCheck --check-prefixes=CHECK-ASM-AND-OBJ %s
 #
 # RUN: not llvm-mc -triple riscv32 \
-# RUN:     -riscv-no-aliases -show-encoding < %s 2>&1 \
+# RUN:     -M no-aliases -show-encoding < %s 2>&1 \
 # RUN:     | FileCheck -check-prefixes=CHECK-NO-EXT %s
 # RUN: not llvm-mc -triple riscv64 \
-# RUN:     -riscv-no-aliases -show-encoding < %s 2>&1 \
+# RUN:     -M no-aliases -show-encoding < %s 2>&1 \
 # RUN:     | FileCheck -check-prefixes=CHECK-NO-EXT %s
 
 # CHECK-ASM-AND-OBJ: c.zext.b s0
diff --git a/llvm/test/MC/RISCV/rvzcmt-invalid.s b/llvm/test/MC/RISCV/rvzcmt-invalid.s
index 5f964ed..0cd9f0b 100644
--- a/llvm/test/MC/RISCV/rvzcmt-invalid.s
+++ b/llvm/test/MC/RISCV/rvzcmt-invalid.s
@@ -1,6 +1,6 @@
-# RUN: not llvm-mc -triple=riscv32 -mattr=+zcmt -riscv-no-aliases -show-encoding < %s 2>&1 \
+# RUN: not llvm-mc -triple=riscv32 -mattr=+zcmt -M no-aliases -show-encoding < %s 2>&1 \
 # RUN:     | FileCheck -check-prefixes=CHECK-ERROR %s
-# RUN: not llvm-mc -triple=riscv64 -mattr=+zcmt -riscv-no-aliases -show-encoding < %s 2>&1 \
+# RUN: not llvm-mc -triple=riscv64 -mattr=+zcmt -M no-aliases -show-encoding < %s 2>&1 \
 # RUN:     | FileCheck -check-prefixes=CHECK-ERROR %s
 
 # CHECK-ERROR: error: immediate must be an integer in the range [0, 31]
diff --git a/llvm/test/MC/RISCV/rvzcmt-user-csr-name.s b/llvm/test/MC/RISCV/rvzcmt-user-csr-name.s
index 58fe43e..c1ab6bc 100644
--- a/llvm/test/MC/RISCV/rvzcmt-user-csr-name.s
+++ b/llvm/test/MC/RISCV/rvzcmt-user-csr-name.s
@@ -1,10 +1,10 @@
-# RUN: llvm-mc %s -triple=riscv32 -riscv-no-aliases -mattr=+zcmt -show-encoding \
+# RUN: llvm-mc %s -triple=riscv32 -M no-aliases -mattr=+zcmt -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-INST,CHECK-ENC %s
 # RUN: llvm-mc -filetype=obj -triple riscv32 -mattr=+zcmt < %s \
 # RUN:     | llvm-objdump -d --mattr=+zcmt - \
 # RUN:     | FileCheck -check-prefix=CHECK-INST-ALIAS %s
 #
-# RUN: llvm-mc %s -triple=riscv64 -riscv-no-aliases -mattr=+zcmt -show-encoding \
+# RUN: llvm-mc %s -triple=riscv64 -M no-aliases -mattr=+zcmt -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-INST,CHECK-ENC %s
 # RUN: llvm-mc -filetype=obj -triple riscv64 -mattr=+zcmt < %s \
 # RUN:     | llvm-objdump -d --mattr=+zcmt - \
diff --git a/llvm/test/MC/RISCV/rvzcmt-valid.s b/llvm/test/MC/RISCV/rvzcmt-valid.s
index a3829fe..a87e790 100644
--- a/llvm/test/MC/RISCV/rvzcmt-valid.s
+++ b/llvm/test/MC/RISCV/rvzcmt-valid.s
@@ -1,5 +1,5 @@
 # RUN: llvm-mc %s -triple=riscv32 -mattr=+zcmt\
-# RUN:  -riscv-no-aliases -show-encoding \
+# RUN:  -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s
 # RUN: llvm-mc -filetype=obj -triple=riscv32 -mattr=+zcmt\
 # RUN:  -mattr=m < %s \
@@ -7,7 +7,7 @@
 # RUN:  -M no-aliases -d -r - \
 # RUN:     | FileCheck --check-prefixes=CHECK-ASM-AND-OBJ %s
 # RUN: llvm-mc %s -triple=riscv64 -mattr=+zcmt\
-# RUN:  -riscv-no-aliases -show-encoding \
+# RUN:  -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s
 # RUN: llvm-mc -filetype=obj -triple=riscv64 -mattr=+zcmt\
 # RUN:  -mattr=m < %s \
@@ -16,10 +16,10 @@
 # RUN:     | FileCheck --check-prefixes=CHECK-ASM-AND-OBJ %s
 #
 # RUN: not llvm-mc -triple riscv32 \
-# RUN:     -riscv-no-aliases -show-encoding < %s 2>&1 \
+# RUN:     -M no-aliases -show-encoding < %s 2>&1 \
 # RUN:     | FileCheck -check-prefixes=CHECK-NO-EXT %s
 # RUN: not llvm-mc -triple riscv64 \
-# RUN:     -riscv-no-aliases -show-encoding < %s 2>&1 \
+# RUN:     -M no-aliases -show-encoding < %s 2>&1 \
 # RUN:     | FileCheck -check-prefixes=CHECK-NO-EXT %s
 
 # CHECK-ASM-AND-OBJ: cm.jt 1
diff --git a/llvm/test/MC/RISCV/rvzdinx-aliases-valid.s b/llvm/test/MC/RISCV/rvzdinx-aliases-valid.s
index 96ec4a4..a24e36b 100644
--- a/llvm/test/MC/RISCV/rvzdinx-aliases-valid.s
+++ b/llvm/test/MC/RISCV/rvzdinx-aliases-valid.s
@@ -1,8 +1,8 @@
-# RUN: llvm-mc %s -triple=riscv32 -mattr=+zdinx -riscv-no-aliases \
+# RUN: llvm-mc %s -triple=riscv32 -mattr=+zdinx -M no-aliases \
 # RUN:     | FileCheck -check-prefix=CHECK-INST %s
 # RUN: llvm-mc %s -triple=riscv32 -mattr=+zdinx \
 # RUN:     | FileCheck -check-prefix=CHECK-ALIAS %s
-# RUN: llvm-mc %s -triple=riscv64 -mattr=+zdinx -riscv-no-aliases \
+# RUN: llvm-mc %s -triple=riscv64 -mattr=+zdinx -M no-aliases \
 # RUN:     | FileCheck -check-prefix=CHECK-INST %s
 # RUN: llvm-mc %s -triple=riscv64 -mattr=+zdinx \
 # RUN:     | FileCheck -check-prefix=CHECK-ALIAS %s
diff --git a/llvm/test/MC/RISCV/rvzdinx-valid.s b/llvm/test/MC/RISCV/rvzdinx-valid.s
index bd1e231..623e281 100644
--- a/llvm/test/MC/RISCV/rvzdinx-valid.s
+++ b/llvm/test/MC/RISCV/rvzdinx-valid.s
@@ -1,9 +1,9 @@
-# RUN: llvm-mc %s -triple=riscv32 -mattr=+zdinx -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv32 -mattr=+zdinx -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s
 # RUN: llvm-mc -filetype=obj -triple=riscv32 -mattr=+zdinx %s \
 # RUN:     | llvm-objdump --mattr=+zdinx -M no-aliases -d -r - \
 # RUN:     | FileCheck --check-prefix=CHECK-ASM-AND-OBJ %s
-# RUN: llvm-mc %s -triple=riscv64 -mattr=+zdinx -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv64 -mattr=+zdinx -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s
 # RUN: llvm-mc -filetype=obj -triple=riscv64 -mattr=+zdinx %s \
 # RUN:     | llvm-objdump --mattr=+zdinx -M no-aliases -d -r - \
diff --git a/llvm/test/MC/RISCV/rvzfbfmin-valid.s b/llvm/test/MC/RISCV/rvzfbfmin-valid.s
index aa8f8cc..6bca691 100644
--- a/llvm/test/MC/RISCV/rvzfbfmin-valid.s
+++ b/llvm/test/MC/RISCV/rvzfbfmin-valid.s
@@ -1,6 +1,6 @@
-# RUN: llvm-mc %s -triple=riscv32 -mattr=+zfbfmin -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv32 -mattr=+zfbfmin -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s
-# RUN: llvm-mc %s -triple=riscv64 -mattr=+zfbfmin -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv64 -mattr=+zfbfmin -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s
 # RUN: llvm-mc -filetype=obj -triple=riscv32 -mattr=+zfbfmin,+f < %s \
 # RUN:     | llvm-objdump --mattr=+zfbfmin --no-print-imm-hex -M no-aliases -d -r - \
diff --git a/llvm/test/MC/RISCV/rvzfh-aliases-valid.s b/llvm/test/MC/RISCV/rvzfh-aliases-valid.s
index 4e33375..44b3cc1 100644
--- a/llvm/test/MC/RISCV/rvzfh-aliases-valid.s
+++ b/llvm/test/MC/RISCV/rvzfh-aliases-valid.s
@@ -1,8 +1,8 @@
-# RUN: llvm-mc %s -triple=riscv32 -mattr=+zfh -riscv-no-aliases \
+# RUN: llvm-mc %s -triple=riscv32 -mattr=+zfh -M no-aliases \
 # RUN:     | FileCheck -check-prefix=CHECK-INST %s
 # RUN: llvm-mc %s -triple=riscv32 -mattr=+zfh \
 # RUN:     | FileCheck -check-prefix=CHECK-ALIAS %s
-# RUN: llvm-mc %s -triple=riscv64 -mattr=+zfh -riscv-no-aliases \
+# RUN: llvm-mc %s -triple=riscv64 -mattr=+zfh -M no-aliases \
 # RUN:     | FileCheck -check-prefix=CHECK-INST %s
 # RUN: llvm-mc %s -triple=riscv64 -mattr=+zfh \
 # RUN:     | FileCheck -check-prefix=CHECK-ALIAS %s
diff --git a/llvm/test/MC/RISCV/rvzfh-valid.s b/llvm/test/MC/RISCV/rvzfh-valid.s
index ec21f40..ccc6d10 100644
--- a/llvm/test/MC/RISCV/rvzfh-valid.s
+++ b/llvm/test/MC/RISCV/rvzfh-valid.s
@@ -1,6 +1,6 @@
-# RUN: llvm-mc %s -triple=riscv32 -mattr=+zfh -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv32 -mattr=+zfh -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s
-# RUN: llvm-mc %s -triple=riscv64 -mattr=+zfh -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv64 -mattr=+zfh -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s
 # RUN: llvm-mc -filetype=obj -triple=riscv32 -mattr=+zfh < %s \
 # RUN:     | llvm-objdump --mattr=+zfh --no-print-imm-hex -M no-aliases -d -r - \
diff --git a/llvm/test/MC/RISCV/rvzfhmin-valid.s b/llvm/test/MC/RISCV/rvzfhmin-valid.s
index 63e5e98..fd7b36a 100644
--- a/llvm/test/MC/RISCV/rvzfhmin-valid.s
+++ b/llvm/test/MC/RISCV/rvzfhmin-valid.s
@@ -1,6 +1,6 @@
-# RUN: llvm-mc %s -triple=riscv32 -mattr=+zfhmin,+d -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv32 -mattr=+zfhmin,+d -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s
-# RUN: llvm-mc %s -triple=riscv64 -mattr=+zfhmin,+d -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv64 -mattr=+zfhmin,+d -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s
 # RUN: llvm-mc -filetype=obj -triple=riscv32 -mattr=+zfhmin,+d < %s \
 # RUN:     | llvm-objdump --mattr=+zfhmin,+d --no-print-imm-hex -M no-aliases -d -r - \
diff --git a/llvm/test/MC/RISCV/rvzfinx-aliases-valid.s b/llvm/test/MC/RISCV/rvzfinx-aliases-valid.s
index f9225cf..83e1660 100644
--- a/llvm/test/MC/RISCV/rvzfinx-aliases-valid.s
+++ b/llvm/test/MC/RISCV/rvzfinx-aliases-valid.s
@@ -1,8 +1,8 @@
-# RUN: llvm-mc %s -triple=riscv32 -mattr=+zfinx -riscv-no-aliases \
+# RUN: llvm-mc %s -triple=riscv32 -mattr=+zfinx -M no-aliases \
 # RUN:     | FileCheck -check-prefix=CHECK-INST %s
 # RUN: llvm-mc %s -triple=riscv32 -mattr=+zfinx \
 # RUN:     | FileCheck -check-prefix=CHECK-ALIAS %s
-# RUN: llvm-mc %s -triple=riscv64 -mattr=+zfinx -riscv-no-aliases \
+# RUN: llvm-mc %s -triple=riscv64 -mattr=+zfinx -M no-aliases \
 # RUN:     | FileCheck -check-prefix=CHECK-INST %s
 # RUN: llvm-mc %s -triple=riscv64 -mattr=+zfinx \
 # RUN:     | FileCheck -check-prefix=CHECK-ALIAS %s
diff --git a/llvm/test/MC/RISCV/rvzfinx-valid.s b/llvm/test/MC/RISCV/rvzfinx-valid.s
index 58f805c..09a5f9e 100644
--- a/llvm/test/MC/RISCV/rvzfinx-valid.s
+++ b/llvm/test/MC/RISCV/rvzfinx-valid.s
@@ -1,6 +1,6 @@
-# RUN: llvm-mc %s -triple=riscv32 -mattr=+zfinx -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv32 -mattr=+zfinx -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s
-# RUN: llvm-mc %s -triple=riscv64 -mattr=+zfinx -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv64 -mattr=+zfinx -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s
 # RUN: llvm-mc -filetype=obj -triple=riscv32 -mattr=+zfinx %s \
 # RUN:     | llvm-objdump --mattr=+zfinx -M no-aliases -d -r - \
diff --git a/llvm/test/MC/RISCV/rvzhinx-aliases-valid.s b/llvm/test/MC/RISCV/rvzhinx-aliases-valid.s
index dbefc5a..8f69558 100644
--- a/llvm/test/MC/RISCV/rvzhinx-aliases-valid.s
+++ b/llvm/test/MC/RISCV/rvzhinx-aliases-valid.s
@@ -1,8 +1,8 @@
-# RUN: llvm-mc %s -triple=riscv32 -mattr=+zhinx -riscv-no-aliases \
+# RUN: llvm-mc %s -triple=riscv32 -mattr=+zhinx -M no-aliases \
 # RUN:     | FileCheck -check-prefix=CHECK-INST %s
 # RUN: llvm-mc %s -triple=riscv32 -mattr=+zhinx \
 # RUN:     | FileCheck -check-prefix=CHECK-ALIAS %s
-# RUN: llvm-mc %s -triple=riscv64 -mattr=+zhinx -riscv-no-aliases \
+# RUN: llvm-mc %s -triple=riscv64 -mattr=+zhinx -M no-aliases \
 # RUN:     | FileCheck -check-prefix=CHECK-INST %s
 # RUN: llvm-mc %s -triple=riscv64 -mattr=+zhinx \
 # RUN:     | FileCheck -check-prefix=CHECK-ALIAS %s
diff --git a/llvm/test/MC/RISCV/rvzhinx-valid.s b/llvm/test/MC/RISCV/rvzhinx-valid.s
index 97ec9dd..dc244b2 100644
--- a/llvm/test/MC/RISCV/rvzhinx-valid.s
+++ b/llvm/test/MC/RISCV/rvzhinx-valid.s
@@ -1,6 +1,6 @@
-# RUN: llvm-mc %s -triple=riscv32 -mattr=+zhinx -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv32 -mattr=+zhinx -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s
-# RUN: llvm-mc %s -triple=riscv64 -mattr=+zhinx -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv64 -mattr=+zhinx -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s
 # RUN: llvm-mc -filetype=obj -triple=riscv32 -mattr=+zhinx %s \
 # RUN:     | llvm-objdump --mattr=+zhinx -M no-aliases -d -r - \
diff --git a/llvm/test/MC/RISCV/rvzhinxmin-valid.s b/llvm/test/MC/RISCV/rvzhinxmin-valid.s
index fbdbce0..1773b29 100644
--- a/llvm/test/MC/RISCV/rvzhinxmin-valid.s
+++ b/llvm/test/MC/RISCV/rvzhinxmin-valid.s
@@ -1,6 +1,6 @@
-# RUN: llvm-mc %s -triple=riscv32 -mattr=+zhinxmin -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv32 -mattr=+zhinxmin -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s
-# RUN: llvm-mc %s -triple=riscv64 -mattr=+zhinxmin -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv64 -mattr=+zhinxmin -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s
 # RUN: llvm-mc -filetype=obj -triple=riscv32 -mattr=+zhinxmin %s \
 # RUN:     | llvm-objdump --mattr=+zhinxmin -M no-aliases -d -r - \
diff --git a/llvm/test/MC/RISCV/rvzihintntl-valid.s b/llvm/test/MC/RISCV/rvzihintntl-valid.s
index f7601c3..415070a 100644
--- a/llvm/test/MC/RISCV/rvzihintntl-valid.s
+++ b/llvm/test/MC/RISCV/rvzihintntl-valid.s
@@ -1,6 +1,6 @@
-# RUN: llvm-mc %s -triple=riscv32 -mattr=+zihintntl -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv32 -mattr=+zihintntl -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s
-# RUN: llvm-mc %s -triple=riscv64 -mattr=+zihintntl -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv64 -mattr=+zihintntl -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s
 # RUN: llvm-mc -filetype=obj -triple=riscv32 -mattr=+zihintntl < %s \
 # RUN:     | llvm-objdump --mattr=+zihintntl -M no-aliases -d -r - \
diff --git a/llvm/test/MC/RISCV/rvzihintpause-valid.s b/llvm/test/MC/RISCV/rvzihintpause-valid.s
index 3ffc387..44cebae 100644
--- a/llvm/test/MC/RISCV/rvzihintpause-valid.s
+++ b/llvm/test/MC/RISCV/rvzihintpause-valid.s
@@ -1,6 +1,6 @@
-# RUN: llvm-mc %s -triple=riscv32 -mattr=+zihintpause -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv32 -mattr=+zihintpause -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s
-# RUN: llvm-mc %s -triple=riscv64 -mattr=+zihintpause -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv64 -mattr=+zihintpause -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s
 # RUN: llvm-mc -filetype=obj -triple=riscv32 -mattr=+zihintpause < %s \
 # RUN:     | llvm-objdump --mattr=+zihintpause -M no-aliases -d -r - \
diff --git a/llvm/test/MC/RISCV/smctr-ssctr-valid.s b/llvm/test/MC/RISCV/smctr-ssctr-valid.s
index 0b4fe47..8bbd5a4 100644
--- a/llvm/test/MC/RISCV/smctr-ssctr-valid.s
+++ b/llvm/test/MC/RISCV/smctr-ssctr-valid.s
@@ -1,10 +1,10 @@
-# RUN: llvm-mc %s -triple=riscv32 -mattr=+experimental-smctr -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv32 -mattr=+experimental-smctr -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK,CHECK-INST %s
-# RUN: llvm-mc %s -triple=riscv64 -mattr=+experimental-smctr -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv64 -mattr=+experimental-smctr -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK,CHECK-INST %s
-# RUN: llvm-mc %s -triple=riscv32 -mattr=+experimental-ssctr -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv32 -mattr=+experimental-ssctr -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK,CHECK-INST %s
-# RUN: llvm-mc %s -triple=riscv64 -mattr=+experimental-ssctr -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv64 -mattr=+experimental-ssctr -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK,CHECK-INST %s
 # RUN: llvm-mc -filetype=obj -triple riscv32 -mattr=+experimental-smctr < %s \
 # RUN:     | llvm-objdump --mattr=+experimental-smctr -M no-aliases -d - \
@@ -19,9 +19,9 @@
 # RUN:     | llvm-objdump --mattr=+experimental-ssctr -M no-aliases -d - \
 # RUN:     | FileCheck -check-prefix=CHECK-INST %s
 
-# RUN: not llvm-mc -triple riscv32 -riscv-no-aliases -show-encoding < %s 2>&1 \
+# RUN: not llvm-mc -triple riscv32 -M no-aliases -show-encoding < %s 2>&1 \
 # RUN:     | FileCheck -check-prefixes=CHECK-NO-EXT %s
-# RUN: not llvm-mc -triple riscv64 -defsym=RV64=1 -riscv-no-aliases -show-encoding < %s 2>&1 \
+# RUN: not llvm-mc -triple riscv64 -defsym=RV64=1 -M no-aliases -show-encoding < %s 2>&1 \
 # RUN:     | FileCheck -check-prefixes=CHECK-NO-EXT %s
 
 # CHECK-INST: sctrclr
diff --git a/llvm/test/MC/RISCV/smrnmi-valid.s b/llvm/test/MC/RISCV/smrnmi-valid.s
index d330ece..8c57a4d 100644
--- a/llvm/test/MC/RISCV/smrnmi-valid.s
+++ b/llvm/test/MC/RISCV/smrnmi-valid.s
@@ -1,6 +1,6 @@
-# RUN: llvm-mc %s -triple=riscv32 -mattr=+smrnmi -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv32 -mattr=+smrnmi -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK,CHECK-INST %s
-# RUN: llvm-mc %s -triple=riscv64 -mattr=+smrnmi -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv64 -mattr=+smrnmi -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK,CHECK-INST %s
 # RUN: llvm-mc -filetype=obj -triple riscv32 -mattr=+smrnmi < %s \
 # RUN:     | llvm-objdump --mattr=+smrnmi -M no-aliases -d - \
diff --git a/llvm/test/MC/RISCV/supervisor-csr-names.s b/llvm/test/MC/RISCV/supervisor-csr-names.s
index db0fcb3..712ec56 100644
--- a/llvm/test/MC/RISCV/supervisor-csr-names.s
+++ b/llvm/test/MC/RISCV/supervisor-csr-names.s
@@ -1,10 +1,10 @@
-# RUN: llvm-mc %s -triple=riscv32 -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv32 -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-INST,CHECK-ENC %s
 # RUN: llvm-mc -filetype=obj -triple riscv32 < %s \
 # RUN:     | llvm-objdump -d - \
 # RUN:     | FileCheck -check-prefix=CHECK-INST-ALIAS %s
 #
-# RUN: llvm-mc %s -triple=riscv64 -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv64 -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-INST,CHECK-ENC %s
 # RUN: llvm-mc -filetype=obj -triple riscv64 < %s \
 # RUN:     | llvm-objdump -d - \
diff --git a/llvm/test/MC/RISCV/user-csr-names.s b/llvm/test/MC/RISCV/user-csr-names.s
index f49eace..bc7363f 100644
--- a/llvm/test/MC/RISCV/user-csr-names.s
+++ b/llvm/test/MC/RISCV/user-csr-names.s
@@ -1,10 +1,10 @@
-# RUN: llvm-mc %s -triple=riscv32 -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv32 -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-INST,CHECK-ENC %s
 # RUN: llvm-mc -filetype=obj -triple riscv32 < %s \
 # RUN:     | llvm-objdump -d - \
 # RUN:     | FileCheck -check-prefix=CHECK-INST-ALIAS %s
 #
-# RUN: llvm-mc %s -triple=riscv64 -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv64 -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-INST,CHECK-ENC %s
 # RUN: llvm-mc -filetype=obj -triple riscv64 < %s \
 # RUN:     | llvm-objdump -d - \
diff --git a/llvm/test/MC/RISCV/xqcia-valid.s b/llvm/test/MC/RISCV/xqcia-valid.s
index 6bd1049..9382856 100644
--- a/llvm/test/MC/RISCV/xqcia-valid.s
+++ b/llvm/test/MC/RISCV/xqcia-valid.s
@@ -1,5 +1,5 @@
 # Xqcia - Qualcomm uC Arithmetic Extesnsion
-# RUN: llvm-mc %s -triple=riscv32 -mattr=+experimental-xqcia -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv32 -mattr=+experimental-xqcia -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-ENC,CHECK-INST %s
 # RUN: llvm-mc -filetype=obj -triple riscv32 -mattr=+experimental-xqcia < %s \
 # RUN:     | llvm-objdump --mattr=+experimental-xqcia -M no-aliases --no-print-imm-hex -d - \
diff --git a/llvm/test/MC/RISCV/xqciac-invalid.s b/llvm/test/MC/RISCV/xqciac-invalid.s
new file mode 100644
index 0000000..4e0182a
--- /dev/null
+++ b/llvm/test/MC/RISCV/xqciac-invalid.s
@@ -0,0 +1,43 @@
+# Xqciac - Qualcomm uC Load-Store Address Calculation Extension
+# RUN: not llvm-mc -triple riscv32 -mattr=+experimental-xqciac < %s 2>&1 \
+# RUN:     | FileCheck -check-prefixes=CHECK,CHECK-IMM %s
+# RUN: not llvm-mc -triple riscv32 -mattr=-experimental-xqciac < %s 2>&1 \
+# RUN:     | FileCheck -check-prefixes=CHECK,CHECK-EXT %s
+
+# CHECK: :[[@LINE+1]]:14: error: invalid operand for instruction
+qc.c.muladdi x5, x10, 4
+
+# CHECK: :[[@LINE+1]]:1: error: too few operands for instruction
+qc.c.muladdi x15
+
+# CHECK-IMM: :[[@LINE+1]]:24: error: immediate must be an integer in the range [0, 31]
+qc.c.muladdi x10, x15, 32
+
+# CHECK-EXT: :[[@LINE+1]]:1: error: instruction requires the following: 'Xqciac' (Qualcomm uC Load-Store Address Calculation Extension)
+qc.c.muladdi x10, x15, 20
+
+
+# CHECK: :[[@LINE+1]]:12: error: invalid operand for instruction
+qc.muladdi x0, x10, 1048577
+
+# CHECK: :[[@LINE+1]]:1: error: too few operands for instruction
+qc.muladdi x10
+
+# CHECK-IMM: :[[@LINE+1]]:22: error: operand must be a symbol with %lo/%pcrel_lo/%tprel_lo modifier or an integer in the range [-2048, 2047]
+qc.muladdi x10, x15, 8589934592
+
+# CHECK-EXT: :[[@LINE+1]]:1: error: instruction requires the following: 'Xqciac' (Qualcomm uC Load-Store Address Calculation Extension)
+qc.muladdi x10, x15, 577
+
+
+# CHECK: :[[@LINE+1]]:11: error: invalid operand for instruction
+qc.shladd 0, x10, 1048577
+
+# CHECK: :[[@LINE+1]]:1: error: too few operands for instruction
+qc.shladd x10
+
+# CHECK-IMM: :[[@LINE+1]]:26: error: immediate must be an integer in the range [4, 31]
+qc.shladd x10, x15, x11, 2
+
+# CHECK-EXT: :[[@LINE+1]]:1: error: instruction requires the following: 'Xqciac' (Qualcomm uC Load-Store Address Calculation Extension)
+qc.shladd x10, x15, x11, 5
diff --git a/llvm/test/MC/RISCV/xqciac-valid.s b/llvm/test/MC/RISCV/xqciac-valid.s
new file mode 100644
index 0000000..6e97d8c
--- /dev/null
+++ b/llvm/test/MC/RISCV/xqciac-valid.s
@@ -0,0 +1,49 @@
+# Xqciac - Qualcomm uC Load-Store Address Calculation Extension
+# RUN: llvm-mc %s -triple=riscv32 -mattr=+experimental-xqciac -riscv-no-aliases -show-encoding \
+# RUN:     | FileCheck -check-prefixes=CHECK-ENC,CHECK-INST %s
+# RUN: llvm-mc -filetype=obj -triple riscv32 -mattr=+experimental-xqciac < %s \
+# RUN:     | llvm-objdump --mattr=+experimental-xqciac -M no-aliases --no-print-imm-hex -d - \
+# RUN:     | FileCheck -check-prefix=CHECK-INST %s
+# RUN: llvm-mc %s -triple=riscv32 -mattr=+experimental-xqciac -show-encoding \
+# RUN:     | FileCheck -check-prefixes=CHECK-ENC,CHECK-INST %s
+# RUN: llvm-mc -filetype=obj -triple riscv32 -mattr=+experimental-xqciac < %s \
+# RUN:     | llvm-objdump --mattr=+experimental-xqciac --no-print-imm-hex -d - \
+# RUN:     | FileCheck -check-prefix=CHECK-INST %s
+
+# CHECK-INST: qc.c.muladdi    a0, a1, 0
+# CHECK-ENC: encoding: [0x8a,0x21]
+qc.c.muladdi x10, x11, 0
+
+# CHECK-INST: qc.c.muladdi    a0, a1, 31
+# CHECK-ENC: encoding: [0xea,0x3d]
+qc.c.muladdi x10, x11, 31
+
+# CHECK-INST: qc.c.muladdi    a0, a1, 16
+# CHECK-ENC: encoding: [0xaa,0x21]
+qc.c.muladdi x10, x11, 16
+
+
+# CHECK-INST: qc.muladdi      tp, t0, 1234
+# CHECK-ENC: encoding: [0x0b,0xe2,0x22,0x4d]
+qc.muladdi x4, x5, 1234
+
+# CHECK-INST: qc.muladdi      a0, a1, -2048
+# CHECK-ENC: encoding: [0x0b,0xe5,0x05,0x80]
+qc.muladdi x10, x11, -2048
+
+# CHECK-INST: qc.muladdi      a0, a1, 2047
+# CHECK-ENC: encoding: [0x0b,0xe5,0xf5,0x7f]
+qc.muladdi x10, x11, 2047
+
+
+# CHECK-INST: qc.shladd       tp, t0, t1, 12
+# CHECK-ENC: encoding: [0x0b,0xb2,0x62,0x58]
+qc.shladd x4, x5, x6, 12
+
+# CHECK-INST: qc.shladd       a0, a1, a2, 4
+# CHECK-ENC: encoding: [0x0b,0xb5,0xc5,0x48]
+qc.shladd x10, x11, x12, 4
+
+# CHECK-INST: qc.shladd       a0, a1, a2, 31
+# CHECK-ENC: encoding: [0x0b,0xb5,0xc5,0x7e]
+qc.shladd x10, x11, x12, 31
diff --git a/llvm/test/MC/RISCV/xqcicli-invalid.s b/llvm/test/MC/RISCV/xqcicli-invalid.s
new file mode 100644
index 0000000..7ee92ec
--- /dev/null
+++ b/llvm/test/MC/RISCV/xqcicli-invalid.s
@@ -0,0 +1,232 @@
+# Xqcicli - Qualcomm uC Conditional Load Immediate Instructions
+# RUN: not llvm-mc -triple riscv32 -mattr=+experimental-xqcicli < %s 2>&1 \
+# RUN:     | FileCheck -check-prefixes=CHECK,CHECK-PLUS %s
+# RUN: not llvm-mc -triple riscv32 -mattr=-experimental-xqcicli < %s 2>&1 \
+# RUN:     | FileCheck -check-prefixes=CHECK,CHECK-MINUS %s
+
+# CHECK: :[[@LINE+1]]:9: error: invalid operand for instruction
+qc.lieq x0, x4, x6, 10
+
+# CHECK: :[[@LINE+1]]:13: error: invalid operand for instruction
+qc.lieq x2, x0, x6, 10
+
+# CHECK: :[[@LINE+1]]:17: error: invalid operand for instruction
+qc.lieq x2, x4, x0, 10
+
+# CHECK: :[[@LINE+1]]:1: error: too few operands for instruction
+qc.lieq x2, x4, x6
+
+# CHECK-PLUS: :[[@LINE+1]]:21: error: immediate must be an integer in the range [-16, 15]
+qc.lieq x2, x4, x6, 40
+
+# CHECK-MINUS: :[[@LINE+1]]:1: error: instruction requires the following: 'Xqcicli' (Qualcomm uC Conditional Load Immediate Extension)
+qc.lieq x2, x4, x6, 10
+
+
+# CHECK: :[[@LINE+1]]:9: error: invalid operand for instruction
+qc.lige x0, x8, x20, 2
+
+# CHECK: :[[@LINE+1]]:13: error: invalid operand for instruction
+qc.lige x4, x0, x20, 2
+
+# CHECK: :[[@LINE+1]]:17: error: invalid operand for instruction
+qc.lige x4, x8, x0, 2
+
+# CHECK: :[[@LINE+1]]:1: error: too few operands for instruction
+qc.lige x4, x8, x20
+
+# CHECK-PLUS: :[[@LINE+1]]:22: error: immediate must be an integer in the range [-16, 15]
+qc.lige x4, x8, x20, -18
+
+# CHECK-MINUS: :[[@LINE+1]]:1: error: instruction requires the following: 'Xqcicli' (Qualcomm uC Conditional Load Immediate Extension)
+qc.lige x4, x8, x20, 2
+
+
+# CHECK: :[[@LINE+1]]:9: error: invalid operand for instruction
+qc.lilt x0, x9, x10, 3
+
+# CHECK: :[[@LINE+1]]:14: error: invalid operand for instruction
+qc.lilt x19, x0, x10, 3
+
+# CHECK: :[[@LINE+1]]:18: error: invalid operand for instruction
+qc.lilt x19, x9, x0, 3
+
+# CHECK: :[[@LINE+1]]:1: error: too few operands for instruction
+qc.lilt x19, x9, x10
+
+# CHECK-PLUS: :[[@LINE+1]]:23: error: immediate must be an integer in the range [-16, 15]
+qc.lilt x19, x9, x10, 39
+
+# CHECK-MINUS: :[[@LINE+1]]:1: error: instruction requires the following: 'Xqcicli' (Qualcomm uC Conditional Load Immediate Extension)
+qc.lilt x19, x9, x10, 3
+
+
+# CHECK: :[[@LINE+1]]:9: error: invalid operand for instruction
+qc.line x0, x14, x6, 10
+
+# CHECK: :[[@LINE+1]]:14: error: invalid operand for instruction
+qc.line x18, x0, x6, 10
+
+# CHECK: :[[@LINE+1]]:19: error: invalid operand for instruction
+qc.line x18, x14, x0, 10
+
+# CHECK: :[[@LINE+1]]:1: error: too few operands for instruction
+qc.line x18, x14, x6
+
+# CHECK-PLUS: :[[@LINE+1]]:23: error: immediate must be an integer in the range [-16, 15]
+qc.line x18, x14, x6, 100
+
+# CHECK-MINUS: :[[@LINE+1]]:1: error: instruction requires the following: 'Xqcicli' (Qualcomm uC Conditional Load Immediate Extension)
+qc.line x18, x14, x6, 10
+
+
+# CHECK: :[[@LINE+1]]:10: error: invalid operand for instruction
+qc.ligeu x0, x4, x6, 10
+
+# CHECK: :[[@LINE+1]]:14: error: invalid operand for instruction
+qc.ligeu x2, x0, x6, 10
+
+# CHECK: :[[@LINE+1]]:18: error: invalid operand for instruction
+qc.ligeu x2, x4, x0, 10
+
+# CHECK: :[[@LINE+1]]:1: error: too few operands for instruction
+qc.ligeu x2, x4, x6
+
+# CHECK-PLUS: :[[@LINE+1]]:22: error: immediate must be an integer in the range [-16, 15]
+qc.ligeu x2, x4, x6, 70
+
+# CHECK-MINUS: :[[@LINE+1]]:1: error: instruction requires the following: 'Xqcicli' (Qualcomm uC Conditional Load Immediate Extension)
+qc.ligeu x2, x4, x6, 10
+
+
+# CHECK: :[[@LINE+1]]:10: error: invalid operand for instruction
+qc.liltu x0, x19, x12, 13
+
+# CHECK: :[[@LINE+1]]:14: error: invalid operand for instruction
+qc.liltu x1, x0, x12, 13
+
+# CHECK: :[[@LINE+1]]:19: error: invalid operand for instruction
+qc.liltu x1, x19, x0, 13
+
+# CHECK: :[[@LINE+1]]:1: error: too few operands for instruction
+qc.liltu x1, x19, x12
+
+# CHECK-PLUS: :[[@LINE+1]]:24: error: immediate must be an integer in the range [-16, 15]
+qc.liltu x1, x19, x12, 73
+
+# CHECK-MINUS: :[[@LINE+1]]:1: error: instruction requires the following: 'Xqcicli' (Qualcomm uC Conditional Load Immediate Extension)
+qc.liltu x1, x19, x12, 13
+
+
+# CHECK: :[[@LINE+1]]:10: error: invalid operand for instruction
+qc.lieqi x0, x1, 15, 12
+
+# CHECK: :[[@LINE+1]]:14: error: invalid operand for instruction
+qc.lieqi x7, x0, 15, 12
+
+# CHECK: :[[@LINE+1]]:1: error: too few operands for instruction
+qc.lieqi x7, x1, 15
+
+# CHECK-PLUS: :[[@LINE+1]]:18: error: immediate must be an integer in the range [-16, 15]
+qc.lieqi x7, x1, 25, 12
+
+# CHECK-PLUS: :[[@LINE+1]]:22: error: immediate must be an integer in the range [-16, 15]
+qc.lieqi x7, x1, 15, -22
+
+# CHECK-MINUS: :[[@LINE+1]]:1: error: instruction requires the following: 'Xqcicli' (Qualcomm uC Conditional Load Immediate Extension)
+qc.lieqi x7, x1, 15, 12
+
+
+# CHECK: :[[@LINE+1]]:10: error: invalid operand for instruction
+qc.ligei x0, x11, -4, 9
+
+# CHECK: :[[@LINE+1]]:15: error: invalid operand for instruction
+qc.ligei x17, x0, -4, 9
+
+# CHECK: :[[@LINE+1]]:1: error: too few operands for instruction
+qc.ligei x17, x11, -4
+
+# CHECK-PLUS: :[[@LINE+1]]:20: error: immediate must be an integer in the range [-16, 15]
+qc.ligei x17, x11, -24, 9
+
+# CHECK-PLUS: :[[@LINE+1]]:24: error: immediate must be an integer in the range [-16, 15]
+qc.ligei x17, x11, -4, 59
+
+# CHECK-MINUS: :[[@LINE+1]]:1: error: instruction requires the following: 'Xqcicli' (Qualcomm uC Conditional Load Immediate Extension)
+qc.ligei x17, x11, -4, 9
+
+
+# CHECK: :[[@LINE+1]]:10: error: invalid operand for instruction
+qc.lilti x0, x11, -14, 2
+
+# CHECK: :[[@LINE+1]]:14: error: invalid operand for instruction
+qc.lilti x9, x0, -14, 2
+
+# CHECK: :[[@LINE+1]]:1: error: too few operands for instruction
+qc.lilti x9, x11, -14
+
+# CHECK-PLUS: :[[@LINE+1]]:19: error: immediate must be an integer in the range [-16, 15]
+qc.lilti x9, x11, -84, 2
+
+# CHECK-PLUS: :[[@LINE+1]]:24: error: immediate must be an integer in the range [-16, 15]
+qc.lilti x9, x11, -14, 52
+
+# CHECK-MINUS: :[[@LINE+1]]:1: error: instruction requires the following: 'Xqcicli' (Qualcomm uC Conditional Load Immediate Extension)
+qc.lilti x9, x11, -14, 2
+
+
+# CHECK: :[[@LINE+1]]:10: error: invalid operand for instruction
+qc.linei x0, x1, 10, 12
+
+# CHECK: :[[@LINE+1]]:14: error: invalid operand for instruction
+qc.linei x5, x0, 10, 12
+
+# CHECK: :[[@LINE+1]]:1: error: too few operands for instruction
+qc.linei x5, x1, 10
+
+# CHECK-PLUS: :[[@LINE+1]]:18: error: immediate must be an integer in the range [-16, 15]
+qc.linei x5, x1, 130, 12
+
+# CHECK-PLUS: :[[@LINE+1]]:22: error: immediate must be an integer in the range [-16, 15]
+qc.linei x5, x1, 10, 124
+
+# CHECK-MINUS: :[[@LINE+1]]:1: error: instruction requires the following: 'Xqcicli' (Qualcomm uC Conditional Load Immediate Extension)
+qc.linei x5, x1, 10, 12
+
+
+# CHECK: :[[@LINE+1]]:11: error: invalid operand for instruction
+qc.ligeui x0, x12, 7, -12
+
+# CHECK: :[[@LINE+1]]:15: error: invalid operand for instruction
+qc.ligeui x2, x0, 7, -12
+
+# CHECK: :[[@LINE+1]]:1: error: too few operands for instruction
+qc.ligeui x2, x12, 7
+
+# CHECK-PLUS: :[[@LINE+1]]:20: error: immediate must be an integer in the range [0, 31]
+qc.ligeui x2, x12, -7, -12
+
+# CHECK-PLUS: :[[@LINE+1]]:23: error: immediate must be an integer in the range [-16, 15]
+qc.ligeui x2, x12, 7, -17
+
+# CHECK-MINUS: :[[@LINE+1]]:1: error: instruction requires the following: 'Xqcicli' (Qualcomm uC Conditional Load Immediate Extension)
+qc.ligeui x2, x12, 7, -12
+
+
+# CHECK: :[[@LINE+1]]:11: error: invalid operand for instruction
+qc.liltui x0, x25, 31, 12
+
+# CHECK: :[[@LINE+1]]:15: error: invalid operand for instruction
+qc.liltui x3, x0, 31, 12
+
+# CHECK: :[[@LINE+1]]:1: error: too few operands for instruction
+qc.liltui x3, x25, 31
+
+# CHECK-PLUS: :[[@LINE+1]]:20: error: immediate must be an integer in the range [0, 31]
+qc.liltui x3, x25, 32, 12
+
+# CHECK-PLUS: :[[@LINE+1]]:24: error: immediate must be an integer in the range [-16, 15]
+qc.liltui x3, x25, 31, 112
+
+# CHECK-MINUS: :[[@LINE+1]]:1: error: instruction requires the following: 'Xqcicli' (Qualcomm uC Conditional Load Immediate Extension)
+qc.liltui x3, x25, 31, 12
diff --git a/llvm/test/MC/RISCV/xqcicli-valid.s b/llvm/test/MC/RISCV/xqcicli-valid.s
new file mode 100644
index 0000000..404bfdf
--- /dev/null
+++ b/llvm/test/MC/RISCV/xqcicli-valid.s
@@ -0,0 +1,59 @@
+# Xqcicli - Qualcomm uC Conditional Load Immediate Extension
+# RUN: llvm-mc %s -triple=riscv32 -mattr=+experimental-xqcicli -riscv-no-aliases -show-encoding \
+# RUN:     | FileCheck -check-prefixes=CHECK-ENC,CHECK-INST %s
+# RUN: llvm-mc -filetype=obj -triple riscv32 -mattr=+experimental-xqcicli < %s \
+# RUN:     | llvm-objdump --mattr=+experimental-xqcicli -M no-aliases --no-print-imm-hex -d - \
+# RUN:     | FileCheck -check-prefix=CHECK-INST %s
+# RUN: llvm-mc %s -triple=riscv32 -mattr=+experimental-xqcicli -show-encoding \
+# RUN:     | FileCheck -check-prefixes=CHECK-ENC,CHECK-INST %s
+# RUN: llvm-mc -filetype=obj -triple riscv32 -mattr=+experimental-xqcicli < %s \
+# RUN:     | llvm-objdump --mattr=+experimental-xqcicli --no-print-imm-hex -d - \
+# RUN:     | FileCheck -check-prefix=CHECK-INST %s
+
+# CHECK-INST: qc.lieq   sp, tp, t1, 10
+# CHECK-ENC: encoding: [0x5b,0x01,0x62,0x52]
+qc.lieq x2, x4, x6, 10
+
+# CHECK-INST: qc.lieqi  t2, ra, 15, 12
+# CHECK-ENC: encoding: [0xdb,0x83,0xf0,0x66]
+qc.lieqi x7, x1, 15, 12
+
+# CHECK-INST: qc.lige   tp, s0, s4, 2
+# CHECK-ENC: encoding: [0x5b,0x52,0x44,0x13]
+qc.lige x4, x8, x20, 2
+
+# CHECK-INST: qc.ligei  a7, a1, -4, 9
+# CHECK-ENC: encoding: [0xdb,0xd8,0xc5,0x4f]
+qc.ligei x17, x11, -4, 9
+
+# CHECK-INST: qc.ligeu  sp, tp, t1, 10
+# CHECK-ENC: encoding: [0x5b,0x71,0x62,0x52]
+qc.ligeu x2, x4, x6, 10
+
+# CHECK-INST: qc.ligeui sp, a2, 7, -12
+# CHECK-ENC: encoding: [0x5b,0x71,0x76,0xa6]
+qc.ligeui x2, x12, 7, -12
+
+# CHECK-INST: qc.lilt   s3, s1, a0, 3
+# CHECK-ENC: encoding: [0xdb,0xc9,0xa4,0x1a]
+qc.lilt x19, x9, x10, 3
+
+# CHECK-INST: qc.lilti  s1, a1, -14, 2
+# CHECK-ENC: encoding: [0xdb,0xc4,0x25,0x17]
+qc.lilti x9, x11, -14, 2
+
+# CHECK-INST: qc.liltu  ra, s3, a2, 13
+# CHECK-ENC: encoding: [0xdb,0xe0,0xc9,0x6a]
+qc.liltu x1, x19, x12, 13
+
+# CHECK-INST: qc.liltui gp, s9, 31, 12
+# CHECK-ENC: encoding: [0xdb,0xe1,0xfc,0x67]
+qc.liltui x3, x25, 31, 12
+
+# CHECK-INST: qc.line   s2, a4, t1, 10
+# CHECK-ENC: encoding: [0x5b,0x19,0x67,0x52]
+qc.line x18, x14, x6, 10
+
+# CHECK-INST: qc.linei  t0, ra, 10, 12
+# CHECK-ENC: encoding: [0xdb,0x92,0xa0,0x66]
+qc.linei x5, x1, 10, 12
diff --git a/llvm/test/MC/RISCV/xqcicm-invalid.s b/llvm/test/MC/RISCV/xqcicm-invalid.s
new file mode 100644
index 0000000..8b37ed4
--- /dev/null
+++ b/llvm/test/MC/RISCV/xqcicm-invalid.s
@@ -0,0 +1,152 @@
+# Xqcicm - Qualcomm uC Conditional Move Extension
+# RUN: not llvm-mc -triple riscv32 -mattr=+experimental-xqcicm < %s 2>&1 \
+# RUN:     | FileCheck -check-prefixes=CHECK,CHECK-IMM %s
+# RUN: not llvm-mc -triple riscv32 -mattr=-experimental-xqcicm < %s 2>&1 \
+# RUN:     | FileCheck -check-prefixes=CHECK,CHECK-EXT %s
+
+# CHECK: :[[@LINE+1]]:12: error: invalid operand for instruction
+qc.c.mveqz 9, x10
+
+# CHECK: :[[@LINE+1]]:1: error: too few operands for instruction
+qc.c.mveqz x9
+
+# CHECK-EXT: :[[@LINE+1]]:1: error: instruction requires the following: 'Xqcicm' (Qualcomm uC Conditional Move Extension)
+qc.c.mveqz x9, x10
+
+
+# CHECK: :[[@LINE+1]]:9: error: invalid operand for instruction
+qc.mveq 9, x10, x11, x12
+
+# CHECK: :[[@LINE+1]]:1: error: too few operands for instruction
+qc.mveq x9
+
+# CHECK-EXT: :[[@LINE+1]]:1: error: instruction requires the following: 'Xqcicm' (Qualcomm uC Conditional Move Extension)
+qc.mveq x9, x10, x11, x12
+
+
+# CHECK: :[[@LINE+1]]:9: error: invalid operand for instruction
+qc.mvge 9, x10, x11, x12
+
+# CHECK: :[[@LINE+1]]:1: error: too few operands for instruction
+qc.mvge x9
+
+# CHECK-EXT: :[[@LINE+1]]:1: error: instruction requires the following: 'Xqcicm' (Qualcomm uC Conditional Move Extension)
+qc.mvge x9, x10, x11, x12
+
+
+# CHECK: :[[@LINE+1]]:10: error: invalid operand for instruction
+qc.mvgeu 9, x10, x11, x12
+
+# CHECK: :[[@LINE+1]]:1: error: too few operands for instruction
+qc.mvgeu x9
+
+# CHECK-EXT: :[[@LINE+1]]:1: error: instruction requires the following: 'Xqcicm' (Qualcomm uC Conditional Move Extension)
+qc.mvgeu x9, x10, x11, x12
+
+
+# CHECK: :[[@LINE+1]]:9: error: invalid operand for instruction
+qc.mvlt 9, x10, x11, x12
+
+# CHECK: :[[@LINE+1]]:1: error: too few operands for instruction
+qc.mvlt x9
+
+# CHECK-EXT: :[[@LINE+1]]:1: error: instruction requires the following: 'Xqcicm' (Qualcomm uC Conditional Move Extension)
+qc.mvlt x9, x10, x11, x12
+
+
+# CHECK: :[[@LINE+1]]:10: error: invalid operand for instruction
+qc.mvltu 9, x10, x11, x12
+
+# CHECK: :[[@LINE+1]]:1: error: too few operands for instruction
+qc.mvltu x9
+
+# CHECK-EXT: :[[@LINE+1]]:1: error: instruction requires the following: 'Xqcicm' (Qualcomm uC Conditional Move Extension)
+qc.mvltu x9, x10, x11, x12
+
+
+# CHECK: :[[@LINE+1]]:9: error: invalid operand for instruction
+qc.mvne 9, x10, x11, x12
+
+# CHECK: :[[@LINE+1]]:1: error: too few operands for instruction
+qc.mvne x9
+
+# CHECK-EXT: :[[@LINE+1]]:1: error: instruction requires the following: 'Xqcicm' (Qualcomm uC Conditional Move Extension)
+qc.mvne x9, x10, x11, x12
+
+
+# CHECK: :[[@LINE+1]]:10: error: invalid operand for instruction
+qc.mveqi 9, x10, 5, x12
+
+# CHECK: :[[@LINE+1]]:1: error: too few operands for instruction
+qc.mveqi x9
+
+# CHECK-IMM: :[[@LINE+1]]:19: error: immediate must be an integer in the range [-16, 15]
+qc.mveqi x9, x10, 17, x12
+
+# CHECK-EXT: :[[@LINE+1]]:1: error: instruction requires the following: 'Xqcicm' (Qualcomm uC Conditional Move Extension)
+qc.mveqi x9, x10, 5, x12
+
+
+# CHECK: :[[@LINE+1]]:10: error: invalid operand for instruction
+qc.mvgei 9, x10, 5, x12
+
+# CHECK: :[[@LINE+1]]:1: error: too few operands for instruction
+qc.mvgei x9
+
+# CHECK-IMM: :[[@LINE+1]]:19: error: immediate must be an integer in the range [-16, 15]
+qc.mvgei x9, x10, 17, x12
+
+# CHECK-EXT: :[[@LINE+1]]:1: error: instruction requires the following: 'Xqcicm' (Qualcomm uC Conditional Move Extension)
+qc.mvgei x9, x10, 5, x12
+
+
+# CHECK: :[[@LINE+1]]:10: error: invalid operand for instruction
+qc.mvlti 9, x10, 5, x12
+
+# CHECK: :[[@LINE+1]]:1: error: too few operands for instruction
+qc.mvlti x9
+
+# CHECK-IMM: :[[@LINE+1]]:19: error: immediate must be an integer in the range [-16, 15]
+qc.mvlti x9, x10, 17, x12
+
+# CHECK-EXT: :[[@LINE+1]]:1: error: instruction requires the following: 'Xqcicm' (Qualcomm uC Conditional Move Extension)
+qc.mvlti x9, x10, 5, x12
+
+
+# CHECK: :[[@LINE+1]]:10: error: invalid operand for instruction
+qc.mvnei 9, x10, 5, x12
+
+# CHECK: :[[@LINE+1]]:1: error: too few operands for instruction
+qc.mvnei x9
+
+# CHECK-IMM: :[[@LINE+1]]:19: error: immediate must be an integer in the range [-16, 15]
+qc.mvnei x9, x10, 17, x12
+
+# CHECK-EXT: :[[@LINE+1]]:1: error: instruction requires the following: 'Xqcicm' (Qualcomm uC Conditional Move Extension)
+qc.mvnei x9, x10, 5, x12
+
+
+# CHECK: :[[@LINE+1]]:11: error: invalid operand for instruction
+qc.mvltui 9, x10, 5, x12
+
+# CHECK: :[[@LINE+1]]:1: error: too few operands for instruction
+qc.mvltui x9
+
+# CHECK-IMM: :[[@LINE+1]]:20: error: immediate must be an integer in the range [0, 31]
+qc.mvltui x9, x10, 37, x12
+
+# CHECK-EXT: :[[@LINE+1]]:1: error: instruction requires the following: 'Xqcicm' (Qualcomm uC Conditional Move Extension)
+qc.mvltui x9, x10, 5, x12
+
+
+# CHECK: :[[@LINE+1]]:11: error: invalid operand for instruction
+qc.mvgeui 9, x10, 5, x12
+
+# CHECK: :[[@LINE+1]]:1: error: too few operands for instruction
+qc.mvgeui x9
+
+# CHECK-IMM: :[[@LINE+1]]:20: error: immediate must be an integer in the range [0, 31]
+qc.mvgeui x9, x10, 37, x12
+
+# CHECK-EXT: :[[@LINE+1]]:1: error: instruction requires the following: 'Xqcicm' (Qualcomm uC Conditional Move Extension)
+qc.mvgeui x9, x10, 5, x12
diff --git a/llvm/test/MC/RISCV/xqcicm-valid.s b/llvm/test/MC/RISCV/xqcicm-valid.s
new file mode 100644
index 0000000..7d0050b
--- /dev/null
+++ b/llvm/test/MC/RISCV/xqcicm-valid.s
@@ -0,0 +1,123 @@
+# Xqcicm - Qualcomm uC Conditional Move Extension
+# RUN: llvm-mc %s -triple=riscv32 -mattr=+experimental-xqcicm -riscv-no-aliases -show-encoding \
+# RUN:     | FileCheck -check-prefixes=CHECK-ENC,CHECK-INST %s
+# RUN: llvm-mc -filetype=obj -triple riscv32 -mattr=+experimental-xqcicm < %s \
+# RUN:     | llvm-objdump --mattr=+experimental-xqcicm -M no-aliases --no-print-imm-hex -d - \
+# RUN:     | FileCheck -check-prefix=CHECK-INST %s
+# RUN: llvm-mc %s -triple=riscv32 -mattr=+experimental-xqcicm -show-encoding \
+# RUN:     | FileCheck -check-prefixes=CHECK-ENC,CHECK-INST %s
+# RUN: llvm-mc -filetype=obj -triple riscv32 -mattr=+experimental-xqcicm < %s \
+# RUN:     | llvm-objdump --mattr=+experimental-xqcicm --no-print-imm-hex -d - \
+# RUN:     | FileCheck -check-prefix=CHECK-INST %s
+
+# CHECK-INST: qc.c.mveqz      s1, a0
+# CHECK-ENC: encoding: [0x06,0xad]
+qc.c.mveqz x9, x10
+
+
+# CHECK-INST: qc.mveq s1, a0, a1, a2
+# CHECK-ENC: encoding: [0xdb,0x04,0xb5,0x60]
+qc.mveq x9, x10, x11, x12
+
+
+# CHECK-INST: qc.mvge s1, a0, a1, a2
+# CHECK-ENC: encoding: [0xdb,0x54,0xb5,0x60]
+qc.mvge x9, x10, x11, x12
+
+
+# CHECK-INST: qc.mvgeu        s1, a0, a1, a2
+# CHECK-ENC: encoding: [0xdb,0x74,0xb5,0x60]
+qc.mvgeu x9, x10, x11, x12
+
+
+# CHECK-INST: qc.mvlt s1, a0, a1, a2
+# CHECK-ENC: encoding: [0xdb,0x44,0xb5,0x60]
+qc.mvlt x9, x10, x11, x12
+
+
+# CHECK-INST: qc.mvltu        s1, a0, a1, a2
+# CHECK-ENC: encoding: [0xdb,0x64,0xb5,0x60]
+qc.mvltu x9, x10, x11, x12
+
+
+# CHECK-INST: qc.mvne s1, a0, a1, a2
+# CHECK-ENC: encoding: [0xdb,0x14,0xb5,0x60]
+qc.mvne x9, x10, x11, x12
+
+
+# CHECK-INST: qc.mveqi        s1, a0, 5, a2
+# CHECK-ENC: encoding: [0xdb,0x04,0x55,0x64]
+qc.mveqi x9, x10, 5, x12
+
+# CHECK-INST: qc.mveqi        s1, a0, -16, a2
+# CHECK-ENC: encoding: [0xdb,0x04,0x05,0x65]
+qc.mveqi x9, x10, -16, x12
+
+# CHECK-INST: qc.mveqi        s1, a0, 15, a2
+# CHECK-ENC: encoding: [0xdb,0x04,0xf5,0x64]
+qc.mveqi x9, x10, 15, x12
+
+
+# CHECK-INST: qc.mvgei        s1, a0, 5, a2
+# CHECK-ENC: encoding: [0xdb,0x54,0x55,0x64]
+qc.mvgei x9, x10, 5, x12
+
+# CHECK-INST: qc.mvgei        s1, a0, -16, a2
+# CHECK-ENC: encoding: [0xdb,0x54,0x05,0x65]
+qc.mvgei x9, x10, -16, x12
+
+# CHECK-INST: qc.mvgei        s1, a0, 15, a2
+# CHECK-ENC: encoding: [0xdb,0x54,0xf5,0x64]
+qc.mvgei x9, x10, 15, x12
+
+
+# CHECK-INST: qc.mvlti        s1, a0, 5, a2
+# CHECK-ENC: encoding: [0xdb,0x44,0x55,0x64]
+qc.mvlti x9, x10, 5, x12
+
+# CHECK-INST: qc.mvlti        s1, a0, -16, a2
+# CHECK-ENC: encoding: [0xdb,0x44,0x05,0x65]
+qc.mvlti x9, x10, -16, x12
+
+# CHECK-INST: qc.mvlti        s1, a0, 15, a2
+# CHECK-ENC: encoding: [0xdb,0x44,0xf5,0x64]
+qc.mvlti x9, x10, 15, x12
+
+
+# CHECK-INST: qc.mvnei        s1, a0, 5, a2
+# CHECK-ENC: encoding: [0xdb,0x14,0x55,0x64]
+qc.mvnei x9, x10, 5, x12
+
+# CHECK-INST: qc.mvnei        s1, a0, -16, a2
+# CHECK-ENC: encoding: [0xdb,0x14,0x05,0x65]
+qc.mvnei x9, x10, -16, x12
+
+# CHECK-INST: qc.mvnei        s1, a0, 15, a2
+# CHECK-ENC: encoding: [0xdb,0x14,0xf5,0x64]
+qc.mvnei x9, x10, 15, x12
+
+
+# CHECK-INST: qc.mvltui       s1, a0, 5, a2
+# CHECK-ENC: encoding: [0xdb,0x64,0x55,0x64]
+qc.mvltui x9, x10, 5, x12
+
+# CHECK-INST: qc.mvltui       s1, a0, 0, a2
+# CHECK-ENC: encoding: [0xdb,0x64,0x05,0x64]
+qc.mvltui x9, x10, 0, x12
+
+# CHECK-INST: qc.mvltui       s1, a0, 31, a2
+# CHECK-ENC: encoding: [0xdb,0x64,0xf5,0x65]
+qc.mvltui x9, x10, 31, x12
+
+
+# CHECK-INST: qc.mvgeui       s1, a0, 5, a2
+# CHECK-ENC: encoding: [0xdb,0x74,0x55,0x64]
+qc.mvgeui x9, x10, 5, x12
+
+# CHECK-INST: qc.mvgeui       s1, a0, 0, a2
+# CHECK-ENC: encoding: [0xdb,0x74,0x05,0x64]
+qc.mvgeui x9, x10, 0, x12
+
+# CHECK-INST: qc.mvgeui       s1, a0, 31, a2
+# CHECK-ENC: encoding: [0xdb,0x74,0xf5,0x65]
+qc.mvgeui x9, x10, 31, x12
diff --git a/llvm/test/MC/RISCV/xqcics-valid.s b/llvm/test/MC/RISCV/xqcics-valid.s
index eb888a6..1438f67 100644
--- a/llvm/test/MC/RISCV/xqcics-valid.s
+++ b/llvm/test/MC/RISCV/xqcics-valid.s
@@ -1,5 +1,5 @@
 # Xqcics - Qualcomm uC Conditional Select Extension
-# RUN: llvm-mc %s -triple=riscv32 -mattr=+experimental-xqcics -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv32 -mattr=+experimental-xqcics -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-ENC,CHECK-INST %s
 # RUN: llvm-mc -filetype=obj -triple riscv32 -mattr=+experimental-xqcics < %s \
 # RUN:     | llvm-objdump --mattr=+experimental-xqcics -M no-aliases --no-print-imm-hex -d - \
diff --git a/llvm/test/MC/RISCV/xqcicsr-valid.s b/llvm/test/MC/RISCV/xqcicsr-valid.s
index 1236dd6..ab26098 100644
--- a/llvm/test/MC/RISCV/xqcicsr-valid.s
+++ b/llvm/test/MC/RISCV/xqcicsr-valid.s
@@ -1,5 +1,5 @@
 # Xqcicsr - Qualcomm uC CSR Extension
-# RUN: llvm-mc %s -triple=riscv32 -mattr=+experimental-xqcicsr -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv32 -mattr=+experimental-xqcicsr -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-ENC,CHECK-INST %s
 # RUN: llvm-mc -filetype=obj -triple riscv32 -mattr=+experimental-xqcicsr < %s \
 # RUN:     | llvm-objdump --mattr=+experimental-xqcicsr -M no-aliases --no-print-imm-hex -d - \
diff --git a/llvm/test/MC/RISCV/xqcilsm-aliases-valid.s b/llvm/test/MC/RISCV/xqcilsm-aliases-valid.s
index e9aec14..b65a831 100644
--- a/llvm/test/MC/RISCV/xqcilsm-aliases-valid.s
+++ b/llvm/test/MC/RISCV/xqcilsm-aliases-valid.s
@@ -1,5 +1,5 @@
 # Xqcilsm - Qualcomm uC Load Store Multiple Extension
-# RUN: llvm-mc %s -triple=riscv32 -mattr=+experimental-xqcilsm -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv32 -mattr=+experimental-xqcilsm -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-ENC,CHECK-INST %s
 # RUN: llvm-mc -filetype=obj -triple riscv32 -mattr=+experimental-xqcilsm < %s \
 # RUN:     | llvm-objdump --mattr=+experimental-xqcilsm -M no-aliases --no-print-imm-hex -d - \
diff --git a/llvm/test/MC/RISCV/xqcilsm-valid.s b/llvm/test/MC/RISCV/xqcilsm-valid.s
index 4893e07..cbe25a2 100644
--- a/llvm/test/MC/RISCV/xqcilsm-valid.s
+++ b/llvm/test/MC/RISCV/xqcilsm-valid.s
@@ -1,5 +1,5 @@
 # Xqcilsm - Qualcomm uC Load Store Multiple Extension
-# RUN: llvm-mc %s -triple=riscv32 -mattr=+experimental-xqcilsm -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv32 -mattr=+experimental-xqcilsm -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-ENC,CHECK-INST %s
 # RUN: llvm-mc -filetype=obj -triple riscv32 -mattr=+experimental-xqcilsm < %s \
 # RUN:     | llvm-objdump --mattr=+experimental-xqcilsm -M no-aliases --no-print-imm-hex -d - \
diff --git a/llvm/test/MC/RISCV/xqcisls-valid.s b/llvm/test/MC/RISCV/xqcisls-valid.s
index 32f64a8..d7e80b3 100644
--- a/llvm/test/MC/RISCV/xqcisls-valid.s
+++ b/llvm/test/MC/RISCV/xqcisls-valid.s
@@ -1,5 +1,5 @@
 # Xqcisls - Qualcomm uC Scaled Load Store Extension
-# RUN: llvm-mc %s -triple=riscv32 -mattr=+experimental-xqcisls -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv32 -mattr=+experimental-xqcisls -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-ENC,CHECK-INST %s
 # RUN: llvm-mc -filetype=obj -triple riscv32 -mattr=+experimental-xqcisls < %s \
 # RUN:     | llvm-objdump --mattr=+experimental-xqcisls -M no-aliases --no-print-imm-hex -d - \
diff --git a/llvm/test/MC/RISCV/xsifive-valid.s b/llvm/test/MC/RISCV/xsifive-valid.s
index 8aa0ab1..bf59981 100644
--- a/llvm/test/MC/RISCV/xsifive-valid.s
+++ b/llvm/test/MC/RISCV/xsifive-valid.s
@@ -1,6 +1,6 @@
-# RUN: llvm-mc %s -triple=riscv32 -mattr=+xsifivecdiscarddlone,+xsifivecflushdlone,+xsfcease -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv32 -mattr=+xsifivecdiscarddlone,+xsifivecflushdlone,+xsfcease -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-ENC,CHECK-INST %s
-# RUN: llvm-mc %s -triple=riscv64 -mattr=+xsifivecdiscarddlone,+xsifivecflushdlone,+xsfcease -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv64 -mattr=+xsifivecdiscarddlone,+xsifivecflushdlone,+xsfcease -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-ENC,CHECK-INST %s
 # RUN: llvm-mc -filetype=obj -triple riscv32 -mattr=+xsifivecdiscarddlone,+xsifivecflushdlone,+xsfcease < %s \
 # RUN:     | llvm-objdump --mattr=+xsifivecdiscarddlone,+xsifivecflushdlone,+xsfcease -M no-aliases -d - \
diff --git a/llvm/test/MC/RISCV/xwchc-compress.s b/llvm/test/MC/RISCV/xwchc-compress.s
index 4bdce1c..7964497 100644
--- a/llvm/test/MC/RISCV/xwchc-compress.s
+++ b/llvm/test/MC/RISCV/xwchc-compress.s
@@ -1,7 +1,7 @@
 # RUN: llvm-mc -triple riscv32 -mattr=+xwchc -show-encoding < %s \
 # RUN:   | FileCheck -check-prefixes=CHECK,CHECK-ALIAS %s
 # RUN: llvm-mc -triple riscv32 -mattr=+xwchc -show-encoding \
-# RUN:   -riscv-no-aliases < %s | FileCheck -check-prefixes=CHECK,CHECK-INST %s
+# RUN:   -M no-aliases < %s | FileCheck -check-prefixes=CHECK,CHECK-INST %s
 # RUN: llvm-mc -triple riscv32 -mattr=+xwchc -filetype=obj < %s \
 # RUN:   | llvm-objdump  --triple=riscv32 --mattr=+xwchc --no-print-imm-hex -d - \
 # RUN:   | FileCheck -check-prefixes=CHECK-ALIAS %s
diff --git a/llvm/test/MC/RISCV/xwchc-valid.s b/llvm/test/MC/RISCV/xwchc-valid.s
index 292a042..51767941 100644
--- a/llvm/test/MC/RISCV/xwchc-valid.s
+++ b/llvm/test/MC/RISCV/xwchc-valid.s
@@ -1,4 +1,4 @@
-# RUN: llvm-mc %s -triple=riscv32 -mattr=+xwchc -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv32 -mattr=+xwchc -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s
 # RUN: llvm-mc -filetype=obj -triple=riscv32 -mattr=+xwchc < %s \
 # RUN:     | llvm-objdump --mattr=+xwchc --no-print-imm-hex -M no-aliases -d -r - \
diff --git a/llvm/test/MC/RISCV/zfa-double-invalid.s b/llvm/test/MC/RISCV/zfa-double-invalid.s
index ec21b0c..f28bd5c 100644
--- a/llvm/test/MC/RISCV/zfa-double-invalid.s
+++ b/llvm/test/MC/RISCV/zfa-double-invalid.s
@@ -1,8 +1,8 @@
 # RUN: not llvm-mc -triple riscv32 -mattr=+zfa,+zfh \
-# RUN:     -riscv-no-aliases -show-encoding < %s 2>&1 \
+# RUN:     -M no-aliases -show-encoding < %s 2>&1 \
 # RUN:     | FileCheck -check-prefixes=CHECK-NO-EXTD %s
 # RUN: not llvm-mc -triple riscv64 -mattr=+zfa,+zfh \
-# RUN:     -riscv-no-aliases -show-encoding < %s 2>&1 \
+# RUN:     -M no-aliases -show-encoding < %s 2>&1 \
 # RUN:     | FileCheck -check-prefixes=CHECK-NO-EXTD %s
 
 # CHECK-NO-EXTD: error: instruction requires the following: 'D' (Double-Precision Floating-Point){{$}}
diff --git a/llvm/test/MC/RISCV/zfa-half-invalid.s b/llvm/test/MC/RISCV/zfa-half-invalid.s
index a2c6f09..debaf71 100644
--- a/llvm/test/MC/RISCV/zfa-half-invalid.s
+++ b/llvm/test/MC/RISCV/zfa-half-invalid.s
@@ -1,8 +1,8 @@
 # RUN: not llvm-mc -triple riscv32 -mattr=+zfa,+d \
-# RUN:     -riscv-no-aliases -show-encoding < %s 2>&1 \
+# RUN:     -M no-aliases -show-encoding < %s 2>&1 \
 # RUN:     | FileCheck -check-prefixes=CHECK-NO-EXTZFH %s
 # RUN: not llvm-mc -triple riscv64 -mattr=+zfa,+d \
-# RUN:     -riscv-no-aliases -show-encoding < %s 2>&1 \
+# RUN:     -M no-aliases -show-encoding < %s 2>&1 \
 # RUN:     | FileCheck -check-prefixes=CHECK-NO-EXTZFH %s
 
 # CHECK-NO-EXTZFH: error: instruction requires the following: 'Zfh' (Half-Precision Floating-Point){{$}}
diff --git a/llvm/test/MC/RISCV/zfa-valid.s b/llvm/test/MC/RISCV/zfa-valid.s
index e951c9d..6e78a4c 100644
--- a/llvm/test/MC/RISCV/zfa-valid.s
+++ b/llvm/test/MC/RISCV/zfa-valid.s
@@ -1,6 +1,6 @@
-# RUN: llvm-mc %s -triple=riscv32 -mattr=+zfa,+d,+zfh -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv32 -mattr=+zfa,+d,+zfh -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s
-# RUN: llvm-mc %s -triple=riscv64 -mattr=+zfa,+d,+zfh -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv64 -mattr=+zfa,+d,+zfh -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s
 # RUN: llvm-mc -filetype=obj -triple=riscv32 -mattr=+zfa,+d,+zfh < %s \
 # RUN:     | llvm-objdump --mattr=+zfa,+d,+zfh -M no-aliases -d -r - \
@@ -10,10 +10,10 @@
 # RUN:     | FileCheck --check-prefix=CHECK-ASM-AND-OBJ %s
 #
 # RUN: not llvm-mc -triple riscv32 -mattr=+d,+zfh \
-# RUN:     -riscv-no-aliases -show-encoding < %s 2>&1 \
+# RUN:     -M no-aliases -show-encoding < %s 2>&1 \
 # RUN:     | FileCheck -check-prefixes=CHECK-NO-EXT %s
 # RUN: not llvm-mc -triple riscv64 -mattr=+d,+zfh \
-# RUN:     -riscv-no-aliases -show-encoding < %s 2>&1 \
+# RUN:     -M no-aliases -show-encoding < %s 2>&1 \
 # RUN:     | FileCheck -check-prefixes=CHECK-NO-EXT %s
 
 # CHECK-ASM-AND-OBJ: fli.s ft1, -1.0
diff --git a/llvm/test/MC/RISCV/zfa-zfhmin-zvfh-valid.s b/llvm/test/MC/RISCV/zfa-zfhmin-zvfh-valid.s
index 6b5dc92..a7a16d5 100644
--- a/llvm/test/MC/RISCV/zfa-zfhmin-zvfh-valid.s
+++ b/llvm/test/MC/RISCV/zfa-zfhmin-zvfh-valid.s
@@ -1,6 +1,6 @@
-# RUN: llvm-mc %s -triple=riscv32 -mattr=+zfa,+zfhmin,+zvfh -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv32 -mattr=+zfa,+zfhmin,+zvfh -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s
-# RUN: llvm-mc %s -triple=riscv64 -mattr=+zfa,+zfhmin,+zvfh -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv64 -mattr=+zfa,+zfhmin,+zvfh -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s
 # RUN: llvm-mc -filetype=obj -triple=riscv32 -mattr=+zfa,+zfhmin,+zvfh < %s \
 # RUN:     | llvm-objdump --mattr=+zfa,+zfhmin,+zvfh -M no-aliases -d -r - \
@@ -9,9 +9,9 @@
 # RUN:     | llvm-objdump --mattr=+zfa,+zfhmin,+zvfh -M no-aliases -d -r - \
 # RUN:     | FileCheck --check-prefix=CHECK-ASM-AND-OBJ %s
 #
-# RUN: not llvm-mc -triple riscv32 -riscv-no-aliases -show-encoding < %s 2>&1 \
+# RUN: not llvm-mc -triple riscv32 -M no-aliases -show-encoding < %s 2>&1 \
 # RUN:     | FileCheck -check-prefixes=CHECK-NO-EXT %s
-# RUN: not llvm-mc -triple riscv64 -riscv-no-aliases -show-encoding < %s 2>&1 \
+# RUN: not llvm-mc -triple riscv64 -M no-aliases -show-encoding < %s 2>&1 \
 # RUN:     | FileCheck -check-prefixes=CHECK-NO-EXT %s
 
 # This test makes sure fli.h is supported with Zvfh.
diff --git a/llvm/test/MC/RISCV/zicfilp-invalid.s b/llvm/test/MC/RISCV/zicfilp-invalid.s
index 5b22c0a..bff989f 100644
--- a/llvm/test/MC/RISCV/zicfilp-invalid.s
+++ b/llvm/test/MC/RISCV/zicfilp-invalid.s
@@ -1,6 +1,6 @@
-# RUN: not llvm-mc -triple riscv32 -mattr=+experimental-zicfilp -riscv-no-aliases -show-encoding < %s 2>&1 \
+# RUN: not llvm-mc -triple riscv32 -mattr=+experimental-zicfilp -M no-aliases -show-encoding < %s 2>&1 \
 # RUN:     | FileCheck -check-prefixes=CHECK-NO-EXT %s
-# RUN: not llvm-mc -triple riscv64 -mattr=+experimental-zicfilp -riscv-no-aliases -show-encoding < %s 2>&1 \
+# RUN: not llvm-mc -triple riscv64 -mattr=+experimental-zicfilp -M no-aliases -show-encoding < %s 2>&1 \
 # RUN:     | FileCheck -check-prefixes=CHECK-NO-EXT %s
 
 # CHECK-NO-EXT: immediate must be an integer in the range [0, 1048575]
diff --git a/llvm/test/MC/RISCV/zicfilp-valid.s b/llvm/test/MC/RISCV/zicfilp-valid.s
index 308e9b6..f61cad8 100644
--- a/llvm/test/MC/RISCV/zicfilp-valid.s
+++ b/llvm/test/MC/RISCV/zicfilp-valid.s
@@ -1,6 +1,6 @@
-# RUN: llvm-mc %s -triple=riscv32 -mattr=+experimental-zicfilp -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv32 -mattr=+experimental-zicfilp -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-ASM %s
-# RUN: llvm-mc %s -triple=riscv64 -mattr=+experimental-zicfilp -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv64 -mattr=+experimental-zicfilp -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-ASM %s
 # RUN: llvm-mc -filetype=obj -triple=riscv32 -mattr=+experimental-zicfilp < %s \
 # RUN:     | llvm-objdump --mattr=+experimental-zicfilp --no-print-imm-hex -d -r - \
@@ -9,9 +9,9 @@
 # RUN:     | llvm-objdump --mattr=+experimental-zicfilp --no-print-imm-hex -d -r - \
 # RUN:     | FileCheck --check-prefix=CHECK-ASM-AND-OBJ %s
 #
-# RUN: not llvm-mc -triple riscv32 -riscv-no-aliases -show-encoding < %s 2>&1 \
+# RUN: not llvm-mc -triple riscv32 -M no-aliases -show-encoding < %s 2>&1 \
 # RUN:     | FileCheck -check-prefixes=CHECK-NO-EXT %s
-# RUN: not llvm-mc -triple riscv64 -riscv-no-aliases -show-encoding < %s 2>&1 \
+# RUN: not llvm-mc -triple riscv64 -M no-aliases -show-encoding < %s 2>&1 \
 # RUN:     | FileCheck -check-prefixes=CHECK-NO-EXT %s
 
 # CHECK-ASM-AND-OBJ: lpad 22
diff --git a/llvm/test/MC/RISCV/zicfiss-valid.s b/llvm/test/MC/RISCV/zicfiss-valid.s
index fd69d37..5b2ab8d 100644
--- a/llvm/test/MC/RISCV/zicfiss-valid.s
+++ b/llvm/test/MC/RISCV/zicfiss-valid.s
@@ -1,17 +1,17 @@
-# RUN: llvm-mc %s -triple=riscv32 -mattr=+a,+experimental-zicfiss -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv32 -mattr=+a,+experimental-zicfiss -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s
 # RUN: llvm-mc -filetype=obj -triple=riscv32 -mattr=+a,+experimental-zicfiss < %s \
 # RUN:     | llvm-objdump --mattr=+a,+experimental-zicfiss -M no-aliases -d -r - \
 # RUN:     | FileCheck --check-prefix=CHECK-ASM-AND-OBJ %s
-# RUN: llvm-mc %s -triple=riscv64 -defsym=RV64=1 -mattr=+a,+experimental-zicfiss -riscv-no-aliases -show-encoding \
+# RUN: llvm-mc %s -triple=riscv64 -defsym=RV64=1 -mattr=+a,+experimental-zicfiss -M no-aliases -show-encoding \
 # RUN:     | FileCheck -check-prefixes=CHECK-ASM-RV64,CHECK-ASM,CHECK-ASM-AND-OBJ-RV64,CHECK-ASM-AND-OBJ %s
 # RUN: llvm-mc -filetype=obj -triple=riscv64 -defsym=RV64=1 -mattr=+a,+experimental-zicfiss < %s \
 # RUN:     | llvm-objdump --mattr=+a,+experimental-zicfiss -M no-aliases -d -r - \
 # RUN:     | FileCheck --check-prefixes=CHECK-ASM-AND-OBJ-RV64,CHECK-ASM-AND-OBJ %s
 #
-# RUN: not llvm-mc -triple riscv32 -riscv-no-aliases -show-encoding < %s 2>&1 \
+# RUN: not llvm-mc -triple riscv32 -M no-aliases -show-encoding < %s 2>&1 \
 # RUN:     | FileCheck -check-prefixes=CHECK-NO-EXT %s
-# RUN: not llvm-mc -triple riscv64 -defsym=RV64=1 -riscv-no-aliases -show-encoding < %s 2>&1 \
+# RUN: not llvm-mc -triple riscv64 -defsym=RV64=1 -M no-aliases -show-encoding < %s 2>&1 \
 # RUN:     | FileCheck -check-prefixes=CHECK-NO-EXT-RV64 %s
 
 # CHECK-ASM-AND-OBJ: sspopchk ra
diff --git a/llvm/test/TableGen/ContextlessPredicates.td b/llvm/test/TableGen/GlobalISelEmitter/ContextlessPredicates.td
index eead965..fa3484e 100644
--- a/llvm/test/TableGen/ContextlessPredicates.td
+++ b/llvm/test/TableGen/GlobalISelEmitter/ContextlessPredicates.td
@@ -1,6 +1,6 @@
-// RUN: llvm-tblgen -gen-global-isel -I %p/../../include -I %p/Common -optimize-match-table=false %s -o %T/context-non-optimized.cpp
+// RUN: llvm-tblgen -gen-global-isel -I %p/../../../include -I %p/../Common -optimize-match-table=false %s -o %T/context-non-optimized.cpp
 // RUN: FileCheck %s --check-prefixes=CHECK_NOPT -input-file=%T/context-non-optimized.cpp
-// RUN: llvm-tblgen -gen-global-isel -I %p/../../include -I %p/Common -optimize-match-table=true  %s -o %T/context-optimized.cpp
+// RUN: llvm-tblgen -gen-global-isel -I %p/../../../include -I %p/../Common -optimize-match-table=true  %s -o %T/context-optimized.cpp
 // RUN: FileCheck %s --check-prefixes=CHECK_OPT -input-file=%T/context-optimized.cpp
 
 
diff --git a/llvm/test/TableGen/GlobalISelEmitterCustomPredicate.td b/llvm/test/TableGen/GlobalISelEmitter/CustomPredicate.td
index 3ceadf3..56eaa4b 100644
--- a/llvm/test/TableGen/GlobalISelEmitterCustomPredicate.td
+++ b/llvm/test/TableGen/GlobalISelEmitter/CustomPredicate.td
@@ -1,4 +1,4 @@
-// RUN: llvm-tblgen %s -gen-global-isel -optimize-match-table=false -I %p/../../include -I %p/Common -o - | FileCheck %s
+// RUN: llvm-tblgen %s -gen-global-isel -optimize-match-table=false -I %p/../../../include -I %p/../Common | FileCheck %s
 
 // Verify that all MI predicates are enumerated.
 //
diff --git a/llvm/test/TableGen/DefaultOpsGlobalISel.td b/llvm/test/TableGen/GlobalISelEmitter/DefaultOpsGlobalISel.td
index 8f4176a..f88045c 100644
--- a/llvm/test/TableGen/DefaultOpsGlobalISel.td
+++ b/llvm/test/TableGen/GlobalISelEmitter/DefaultOpsGlobalISel.td
@@ -1,4 +1,4 @@
-// RUN: llvm-tblgen %s -gen-global-isel -optimize-match-table=false -I %p/../../include -I %p/Common -o - | FileCheck %s
+// RUN: llvm-tblgen %s -gen-global-isel -optimize-match-table=false -I %p/../../../include -I %p/../Common -o - | FileCheck %s
 
 include "llvm/Target/Target.td"
 include "GlobalISelEmitterCommon.td"
diff --git a/llvm/test/TableGen/GlobalISelEmitterFlags.td b/llvm/test/TableGen/GlobalISelEmitter/Flags.td
index fa8f2a7..0878955 100644
--- a/llvm/test/TableGen/GlobalISelEmitterFlags.td
+++ b/llvm/test/TableGen/GlobalISelEmitter/Flags.td
@@ -1,4 +1,4 @@
-// RUN: llvm-tblgen %s -gen-global-isel -optimize-match-table=false -I %p/../../include -I %p/Common -o - | FileCheck %s
+// RUN: llvm-tblgen %s -gen-global-isel -optimize-match-table=false -I %p/../../../include -I %p/../Common | FileCheck %s
 
 include "llvm/Target/Target.td"
 include "GlobalISelEmitterCommon.td"
diff --git a/llvm/test/TableGen/GlobalISelEmitter.td b/llvm/test/TableGen/GlobalISelEmitter/GlobalISelEmitter.td
index ffefaba..7c81814 100644
--- a/llvm/test/TableGen/GlobalISelEmitter.td
+++ b/llvm/test/TableGen/GlobalISelEmitter/GlobalISelEmitter.td
@@ -1,6 +1,6 @@
-// RUN: llvm-tblgen -gen-global-isel -I %p/../../include -I %p/Common -optimize-match-table=false %s -o %T/non-optimized.cpp
-// RUN: llvm-tblgen -gen-global-isel -I %p/../../include -I %p/Common -optimize-match-table=true  %s -o %T/optimized.cpp
-// RUN: llvm-tblgen -gen-global-isel -I %p/../../include -I %p/Common %s -o %T/default.cpp
+// RUN: llvm-tblgen -gen-global-isel -I %p/../../../include -I %p/../Common -optimize-match-table=false %s -o %T/non-optimized.cpp
+// RUN: llvm-tblgen -gen-global-isel -I %p/../../../include -I %p/../Common -optimize-match-table=true  %s -o %T/optimized.cpp
+// RUN: llvm-tblgen -gen-global-isel -I %p/../../../include -I %p/../Common %s -o %T/default.cpp
 
 // RUN: FileCheck %s --check-prefixes=CHECK,R19C,R19N -input-file=%T/non-optimized.cpp
 // RUN: FileCheck %s --check-prefixes=CHECK,R19C,R19O -input-file=%T/optimized.cpp
diff --git a/llvm/test/TableGen/GlobalISelEmitterHwModes.td b/llvm/test/TableGen/GlobalISelEmitter/HwModes.td
index 9d235f5..3588ba3 100644
--- a/llvm/test/TableGen/GlobalISelEmitterHwModes.td
+++ b/llvm/test/TableGen/GlobalISelEmitter/HwModes.td
@@ -1,4 +1,4 @@
-// RUN: llvm-tblgen -gen-global-isel -I %p/../../include -I %p/Common -optimize-match-table=false %s -o %T/hwmode-non-optimized.cpp
+// RUN: llvm-tblgen -gen-global-isel -I %p/../../../include -I %p/../Common -optimize-match-table=false %s -o %T/hwmode-non-optimized.cpp
 // RUN: FileCheck %s --check-prefixes=CHECK -input-file=%T/hwmode-non-optimized.cpp
 
 include "llvm/Target/Target.td"
diff --git a/llvm/test/TableGen/GlobalISelEmitterMatchTableOptimizer.td b/llvm/test/TableGen/GlobalISelEmitter/MatchTableOptimizer.td
index 3db31be..c430725 100644
--- a/llvm/test/TableGen/GlobalISelEmitterMatchTableOptimizer.td
+++ b/llvm/test/TableGen/GlobalISelEmitter/MatchTableOptimizer.td
@@ -1,4 +1,4 @@
-// RUN: llvm-tblgen %s -gen-global-isel -optimize-match-table=true -I %p/../../include -I %p/Common -o - | FileCheck %s
+// RUN: llvm-tblgen %s -gen-global-isel -optimize-match-table=true -I %p/../../../include -I %p/../Common | FileCheck %s
 
 include "llvm/Target/Target.td"
 include "GlobalISelEmitterCommon.td"
diff --git a/llvm/test/TableGen/GlobalISelEmitterMatchTableOptimizerSameOperand-invalid.td b/llvm/test/TableGen/GlobalISelEmitter/MatchTableOptimizerSameOperand-invalid.td
index d93805b..18ae767 100644
--- a/llvm/test/TableGen/GlobalISelEmitterMatchTableOptimizerSameOperand-invalid.td
+++ b/llvm/test/TableGen/GlobalISelEmitter/MatchTableOptimizerSameOperand-invalid.td
@@ -1,4 +1,4 @@
-// RUN: llvm-tblgen %s -gen-global-isel -optimize-match-table=true -I %p/../../include -I %p/Common -o - | FileCheck %s
+// RUN: llvm-tblgen %s -gen-global-isel -optimize-match-table=true -I %p/../../../include -I %p/../Common | FileCheck %s
 
 include "llvm/Target/Target.td"
 include "GlobalISelEmitterCommon.td"
diff --git a/llvm/test/TableGen/GlobalISelEmitterMatchTableOptimizerSameOperand.td b/llvm/test/TableGen/GlobalISelEmitter/MatchTableOptimizerSameOperand.td
index 1ac3399..c6ca9b7 100644
--- a/llvm/test/TableGen/GlobalISelEmitterMatchTableOptimizerSameOperand.td
+++ b/llvm/test/TableGen/GlobalISelEmitter/MatchTableOptimizerSameOperand.td
@@ -1,4 +1,4 @@
-// RUN: llvm-tblgen %s -gen-global-isel -optimize-match-table=true -I %p/../../include -I %p/Common -o - | FileCheck %s
+// RUN: llvm-tblgen %s -gen-global-isel -optimize-match-table=true -I %p/../../../include -I %p/../Common | FileCheck %s
 
 include "llvm/Target/Target.td"
 include "GlobalISelEmitterCommon.td"
diff --git a/llvm/test/TableGen/GlobalISelEmitterOverloadedPtr.td b/llvm/test/TableGen/GlobalISelEmitter/OverloadedPtr.td
index 422edbb..31accba 100644
--- a/llvm/test/TableGen/GlobalISelEmitterOverloadedPtr.td
+++ b/llvm/test/TableGen/GlobalISelEmitter/OverloadedPtr.td
@@ -1,13 +1,35 @@
-// RUN: llvm-tblgen -gen-global-isel -I %p/../../include -I %p/Common %s -o - | FileCheck %s
+// RUN: llvm-tblgen -gen-global-isel -I %p/../../../include -I %p/../Common %s | FileCheck %s
 
 // Boilerplate code.
 include "llvm/Target/Target.td"
 include "GlobalISelEmitterCommon.td"
 
+def GPR : RegisterClass<"MyTarget", [i32, i64], 32, (add R0)>;
+
 let TargetPrefix = "mytarget" in {
     def int_mytarget_anyptr : Intrinsic<[llvm_i32_ty], [llvm_anyptr_ty]>;
 }
 
+// Check that iPTR in the destination DAG doesn't prevent the pattern from being imported.
+
+// CHECK: GIM_RootCheckType, /*Op*/0, /*Type*/GILLT_s32,
+// CHECK-NEXT: GIM_CheckMemorySizeEqualToLLT, /*MI*/0, /*MMO*/0, /*OpIdx*/0,
+// CHECK-NEXT: GIM_CheckAtomicOrdering, /*MI*/0, /*Order*/(uint8_t)AtomicOrdering::NotAtomic,
+// CHECK-NEXT: GIM_RootCheckRegBankForClass, /*Op*/0, /*RC*/GIMT_Encode2(MyTarget::GPR32RegClassID),
+// CHECK-NEXT: // MIs[0] src1
+// CHECK-NEXT: GIM_CheckPointerToAny, /*MI*/0, /*Op*/1, /*SizeInBits*/0,
+// CHECK-NEXT: GIM_RootCheckRegBankForClass, /*Op*/1, /*RC*/GIMT_Encode2(MyTarget::GPRRegClassID),
+// CHECK-NEXT: // (ld:{ *:[i32] } GPR:{ *:[iPTR] }:$src1)<<P:Predicate_unindexedload>><<P:Predicate_load>>  =>  (ANYLOAD:{ *:[i32] } GPR:{ *:[iPTR] }:$src1)
+// CHECK-NEXT: GIR_MutateOpcode, /*InsnID*/0, /*RecycleInsnID*/0, /*Opcode*/GIMT_Encode2(MyTarget::ANYLOAD),
+// CHECK-NEXT: GIR_RootConstrainSelectedInstOperands,
+// CHECK-NEXT: // GIR_Coverage, 0,
+// CHECK-NEXT: GIR_Done,
+
+let hasSideEffects = 1 in {
+  def ANYLOAD : I<(outs GPR32:$dst), (ins GPR:$src1),
+                  [(set GPR32:$dst, (load GPR:$src1))]>;
+}
+
 // Ensure that llvm_anyptr_ty on an intrinsic results in a
 // GIM_CheckPointerToAny rather than a GIM_CheckType.
 //
@@ -20,10 +42,6 @@ let TargetPrefix = "mytarget" in {
 // CHECK-NEXT: GIM_CheckCxxInsnPredicate, /*MI*/0, /*FnId*/GIMT_Encode2(GICXXPred_MI_Predicate_frag_anyptr),
 // CHECK-NEXT: // (intrinsic_w_chain:{ *:[i32] } {{[0-9]+}}:{ *:[iPTR] }, GPR32:{ *:[i32] }:$src)<<P:Predicate_frag_anyptr>>  =>  (ANYLOAD:{ *:[i32] } GPR32:{ *:[i32] }:$src)
 // CHECK-NEXT: GIR_BuildRootMI, /*Opcode*/GIMT_Encode2(MyTarget::ANYLOAD),
-let hasSideEffects = 1 in {
-  def ANYLOAD : I<(outs GPR32:$dst), (ins GPR32:$src1),
-            [(set GPR32:$dst, (load GPR32:$src1))]>;
-}
 
 def frag_anyptr : PatFrag<(ops node:$src),
                    (int_mytarget_anyptr node:$src),
diff --git a/llvm/test/TableGen/GlobalISelEmitter-PR39045.td b/llvm/test/TableGen/GlobalISelEmitter/PR39045.td
index 5407222..595fa92 100644
--- a/llvm/test/TableGen/GlobalISelEmitter-PR39045.td
+++ b/llvm/test/TableGen/GlobalISelEmitter/PR39045.td
@@ -1,4 +1,4 @@
-// RUN: llvm-tblgen -gen-global-isel -I %p/../../include -I %p/Common  %s -o %t
+// RUN: llvm-tblgen -gen-global-isel -I %p/../../../include -I %p/../Common  %s -o %t
 // RUN: FileCheck %s < %t
 
 // Both predicates should be tested
diff --git a/llvm/test/TableGen/GlobalISelEmitterRegSequence.td b/llvm/test/TableGen/GlobalISelEmitter/RegSequence.td
index 69f82ea..97790fb 100644
--- a/llvm/test/TableGen/GlobalISelEmitterRegSequence.td
+++ b/llvm/test/TableGen/GlobalISelEmitter/RegSequence.td
@@ -1,4 +1,4 @@
-// RUN: llvm-tblgen %s -gen-global-isel -optimize-match-table=false -I %p/../../include -I %p/Common -o - | FileCheck %s
+// RUN: llvm-tblgen %s -gen-global-isel -optimize-match-table=false -I %p/../../../include -I %p/../Common | FileCheck %s
 
 include "llvm/Target/Target.td"
 include "GlobalISelEmitterCommon.td"
diff --git a/llvm/test/TableGen/GlobalISelEmitter-SDNodeXForm-timm.td b/llvm/test/TableGen/GlobalISelEmitter/SDNodeXForm-timm.td
index 8d6dedf..fab395d 100644
--- a/llvm/test/TableGen/GlobalISelEmitter-SDNodeXForm-timm.td
+++ b/llvm/test/TableGen/GlobalISelEmitter/SDNodeXForm-timm.td
@@ -1,4 +1,4 @@
-// RUN: llvm-tblgen -gen-global-isel -warn-on-skipped-patterns -optimize-match-table=false -I %p/../../include -I %p/Common %s -o - | FileCheck -check-prefix=GISEL %s
+// RUN: llvm-tblgen -gen-global-isel -warn-on-skipped-patterns -optimize-match-table=false -I %p/../../../include -I %p/../Common %s | FileCheck -check-prefix=GISEL %s
 
 include "llvm/Target/Target.td"
 include "GlobalISelEmitterCommon.td"
diff --git a/llvm/test/TableGen/GlobalISelEmitterSkippedPatterns.td b/llvm/test/TableGen/GlobalISelEmitter/SkippedPatterns.td
index fc8abc6..9d9d3f9 100644
--- a/llvm/test/TableGen/GlobalISelEmitterSkippedPatterns.td
+++ b/llvm/test/TableGen/GlobalISelEmitter/SkippedPatterns.td
@@ -1,5 +1,5 @@
-// RUN: llvm-tblgen -warn-on-skipped-patterns -gen-global-isel -I %p/../../include %s -I %p/Common -o /dev/null 2>&1 | FileCheck %s
-// RUN: llvm-tblgen -warn-on-skipped-patterns -gen-global-isel -I %p/../../include %s -I %p/Common -o /dev/null -DIGNORE 2>&1 | FileCheck --allow-empty --check-prefix=IGNORED %s
+// RUN: llvm-tblgen -warn-on-skipped-patterns -gen-global-isel -I %p/../../../include %s -I %p/../Common -o /dev/null 2>&1 | FileCheck %s
+// RUN: llvm-tblgen -warn-on-skipped-patterns -gen-global-isel -I %p/../../../include %s -I %p/../Common -o /dev/null -DIGNORE 2>&1 | FileCheck --allow-empty --check-prefix=IGNORED %s
 
 include "llvm/Target/Target.td"
 include "GlobalISelEmitterCommon.td"
diff --git a/llvm/test/TableGen/GlobalISelEmitterSubreg.td b/llvm/test/TableGen/GlobalISelEmitter/Subreg.td
index 08e690f..5203c2b 100644
--- a/llvm/test/TableGen/GlobalISelEmitterSubreg.td
+++ b/llvm/test/TableGen/GlobalISelEmitter/Subreg.td
@@ -1,4 +1,4 @@
-// RUN: llvm-tblgen %s -gen-global-isel -warn-on-skipped-patterns -I %p/../../include -I %p/Common -o - 2> %t.skipped | FileCheck %s
+// RUN: llvm-tblgen %s -gen-global-isel -warn-on-skipped-patterns -I %p/../../../include -I %p/../Common 2> %t.skipped | FileCheck %s
 // RUN: cat %t.skipped | FileCheck %s --check-prefix=SKIPPED
 
 include "llvm/Target/Target.td"
diff --git a/llvm/test/TableGen/GlobalISelEmitterVariadic.td b/llvm/test/TableGen/GlobalISelEmitter/Variadic.td
index 992e1a4..b3c8052 100644
--- a/llvm/test/TableGen/GlobalISelEmitterVariadic.td
+++ b/llvm/test/TableGen/GlobalISelEmitter/Variadic.td
@@ -1,4 +1,4 @@
-// RUN: llvm-tblgen -gen-global-isel -I %p/../../include -I %p/Common %s -o - | FileCheck %s
+// RUN: llvm-tblgen -gen-global-isel -I %p/../../../include -I %p/../Common %s | FileCheck %s
 
 include "llvm/Target/Target.td"
 include "GlobalISelEmitterCommon.td"
diff --git a/llvm/test/TableGen/GlobalISelEmitter-atomic_store.td b/llvm/test/TableGen/GlobalISelEmitter/atomic-store.td
index da2dfe8..53b8670 100644
--- a/llvm/test/TableGen/GlobalISelEmitter-atomic_store.td
+++ b/llvm/test/TableGen/GlobalISelEmitter/atomic-store.td
@@ -1,4 +1,4 @@
-// RUN: llvm-tblgen -gen-global-isel -optimize-match-table=false -I %p/../../include -I %p/Common %s -o - < %s | FileCheck -check-prefix=GISEL %s
+// RUN: llvm-tblgen -gen-global-isel -optimize-match-table=false -I %p/../../../include -I %p/../Common %s | FileCheck -check-prefix=GISEL %s
 
 include "llvm/Target/Target.td"
 include "GlobalISelEmitterCommon.td"
diff --git a/llvm/test/TableGen/GlobalISelEmitter/dead-def.td b/llvm/test/TableGen/GlobalISelEmitter/dead-def.td
new file mode 100644
index 0000000..a8597f1
--- /dev/null
+++ b/llvm/test/TableGen/GlobalISelEmitter/dead-def.td
@@ -0,0 +1,27 @@
+// RUN: llvm-tblgen -gen-global-isel -optimize-match-table=false \
+// RUN:   -I %p/../../../include -I %p/../Common %s | FileCheck %s
+
+include "llvm/Target/Target.td"
+include "GlobalISelEmitterCommon.td"
+
+// Check that $same_name from I2 isn't copied to the root instruction.
+
+def I1 : I<(outs GPR32:$same_name), (ins GPR32:$rs), []>;
+def I2 : I<(outs GPR32:$other_name, GPR32:$same_name), (ins GPR32:$rs), []>;
+
+def : Pat<(abs i32:$x), (I1 (I2 $x))>;
+
+// CHECK-LABEL: // (abs:{ *:[i32] } i32:{ *:[i32] }:$x)  =>  (I1:{ *:[i32] } (I2:{ *:[i32] }:{ *:[i32] } ?:{ *:[i32] }:$x))
+// CHECK-NEXT: GIR_MakeTempReg, /*TempRegID*/1, /*TypeID*/GILLT_s32,
+// CHECK-NEXT: GIR_MakeTempReg, /*TempRegID*/0, /*TypeID*/GILLT_s32,
+// CHECK-NEXT: GIR_BuildMI, /*InsnID*/1, /*Opcode*/GIMT_Encode2(MyTarget::I2),
+// CHECK-NEXT: GIR_AddTempRegister, /*InsnID*/1, /*TempRegID*/0, /*TempRegFlags*/GIMT_Encode2(RegState::Define),
+// CHECK-NEXT: GIR_AddTempRegister, /*InsnID*/1, /*TempRegID*/1, /*TempRegFlags*/GIMT_Encode2(RegState::Define|RegState::Dead),
+// CHECK-NEXT: GIR_Copy, /*NewInsnID*/1, /*OldInsnID*/0, /*OpIdx*/1, // x
+// CHECK-NEXT: GIR_ConstrainSelectedInstOperands, /*InsnID*/1,
+// CHECK-NEXT: GIR_BuildRootMI, /*Opcode*/GIMT_Encode2(MyTarget::I1),
+// CHECK-NEXT: GIR_RootToRootCopy, /*OpIdx*/0, // DstI[same_name]
+// CHECK-NEXT: GIR_AddSimpleTempRegister, /*InsnID*/0, /*TempRegID*/0,
+// CHECK-NEXT: GIR_RootConstrainSelectedInstOperands,
+// CHECK-NEXT: // GIR_Coverage, 0,
+// CHECK-NEXT: GIR_EraseRootFromParent_Done,
diff --git a/llvm/test/TableGen/GlobalISelEmitter-frameindex.td b/llvm/test/TableGen/GlobalISelEmitter/frameindex.td
index 715e53d..2778452 100644
--- a/llvm/test/TableGen/GlobalISelEmitter-frameindex.td
+++ b/llvm/test/TableGen/GlobalISelEmitter/frameindex.td
@@ -1,4 +1,4 @@
-// RUN: llvm-tblgen -gen-global-isel -optimize-match-table=false -I %p/../../include -I %p/Common %s | FileCheck %s
+// RUN: llvm-tblgen -gen-global-isel -optimize-match-table=false -I %p/../../../include -I %p/../Common %s | FileCheck %s
 
 include "llvm/Target/Target.td"
 include "GlobalISelEmitterCommon.td"
diff --git a/llvm/test/TableGen/GlobalISelEmitter/gisel-physreg-input.td b/llvm/test/TableGen/GlobalISelEmitter/gisel-physreg-input.td
new file mode 100644
index 0000000..1f1b557
--- /dev/null
+++ b/llvm/test/TableGen/GlobalISelEmitter/gisel-physreg-input.td
@@ -0,0 +1,167 @@
+// RUN: llvm-tblgen -gen-global-isel -optimize-match-table=false -I %p/../../../include %s | FileCheck -check-prefix=GISEL %s
+
+include "llvm/Target/Target.td"
+
+def TestTargetInstrInfo : InstrInfo;
+
+def TestTarget : Target {
+  let InstructionSet = TestTargetInstrInfo;
+}
+
+def R0 : Register<"r0"> { let Namespace = "MyTarget"; }
+def SPECIAL : Register<"special"> { let Namespace = "MyTarget"; }
+def GPR32 : RegisterClass<"MyTarget", [i32], 32, (add R0)>;
+def Special32 : RegisterClass<"MyTarget", [i32], 32, (add SPECIAL)>;
+
+
+class I<dag OOps, dag IOps, list<dag> Pat>
+  : Instruction {
+  let Namespace = "MyTarget";
+  let OutOperandList = OOps;
+  let InOperandList = IOps;
+  let Pattern = Pat;
+}
+
+// Try a nested physical register
+
+// GISEL: GIM_Try,
+// GISEL-NEXT: GIM_CheckNumOperands, /*MI*/0, /*Expected*/2,
+// GISEL-NEXT: GIM_CheckOpcode, /*MI*/0, GIMT_Encode2(TargetOpcode::G_STORE),
+// GISEL-NEXT: GIM_CheckAtomicOrdering, /*MI*/0, /*Order*/(uint8_t)AtomicOrdering::NotAtomic,
+// GISEL-NEXT: // MIs[0] src0
+// GISEL-NEXT: GIM_RootCheckType, /*Op*/0, /*Type*/GILLT_s32,
+// GISEL-NEXT: GIM_RootCheckRegBankForClass, /*Op*/0, /*RC*/GIMT_Encode2(MyTarget::GPR32RegClassID),
+// GISEL-NEXT: // MIs[0] Operand 1
+// GISEL-NEXT: GIM_CheckPointerToAny, /*MI*/0, /*Op*/1, /*SizeInBits*/32,
+// GISEL-NEXT: GIM_RecordInsn, /*DefineMI*/1, /*MI*/0, /*OpIdx*/1, // MIs[1]
+// GISEL-NEXT: GIM_CheckNumOperands, /*MI*/1, /*Expected*/3,
+// GISEL-NEXT: GIM_CheckOpcode, /*MI*/1, GIMT_Encode2(TargetOpcode::G_MUL),
+// GISEL-NEXT: // MIs[1] Operand 0
+// GISEL-NEXT: GIM_CheckType, /*MI*/1, /*Op*/0, /*Type*/GILLT_s32,
+// GISEL-NEXT: // MIs[1] src1
+// GISEL-NEXT: GIM_CheckType, /*MI*/1, /*Op*/1, /*Type*/GILLT_s32,
+// GISEL-NEXT: GIM_CheckRegBankForClass, /*MI*/1, /*Op*/1, /*RC*/GIMT_Encode2(MyTarget::GPR32RegClassID),
+// GISEL-NEXT: // MIs[1] Operand 2
+// GISEL-NEXT: GIM_CheckType, /*MI*/1, /*Op*/2, /*Type*/GILLT_s32,
+// GISEL-NEXT: GIM_CheckRegBankForClass, /*MI*/1, /*Op*/2, /*RC*/GIMT_Encode2(MyTarget::Special32RegClassID),
+// GISEL-NEXT: GIM_CheckIsSafeToFold, /*NumInsns*/1,
+// GISEL-NEXT: // (st GPR32:{ *:[i32] }:$src0, (mul:{ *:[i32] } GPR32:{ *:[i32] }:$src1, SPECIAL:{ *:[i32] }))  =>  (MULM_PHYS GPR32:{ *:[i32] }:$src0, GPR32:{ *:[i32] }:$src1)
+// GISEL-NEXT: GIR_BuildMI, /*InsnID*/1, /*Opcode*/GIMT_Encode2(TargetOpcode::COPY),
+// GISEL-NEXT: GIR_AddRegister, /*InsnID*/1, GIMT_Encode2(MyTarget::SPECIAL), /*AddRegisterRegFlags*/GIMT_Encode2(RegState::Define),
+// GISEL-NEXT: GIR_Copy, /*NewInsnID*/1, /*OldInsnID*/1, /*OpIdx*/2, // SPECIAL
+// GISEL-NEXT: GIR_BuildRootMI, /*Opcode*/GIMT_Encode2(MyTarget::MULM_PHYS),
+// GISEL-NEXT: GIR_RootToRootCopy, /*OpIdx*/0, // src0
+// GISEL-NEXT: GIR_Copy, /*NewInsnID*/0, /*OldInsnID*/1, /*OpIdx*/1, // src1
+// GISEL-NEXT: GIR_MergeMemOperands, /*InsnID*/0, /*NumInsns*/2, /*MergeInsnID's*/0, 1,
+// GISEL-NEXT: GIR_RootConstrainSelectedInstOperands,
+// GISEL-NEXT: // GIR_Coverage, 0,
+// GISEL-NEXT: GIR_EraseRootFromParent_Done,
+def MULM_PHYS : I<(outs), (ins GPR32:$src0, GPR32:$src1),
+    [(st GPR32:$src0, (mul GPR32:$src1, SPECIAL))]> {
+  let Uses = [SPECIAL];
+}
+
+// Try nested physical registers and check on duplicated copies
+
+// GISEL: GIM_Try,
+// GISEL-NEXT: GIM_CheckNumOperands, /*MI*/0, /*Expected*/2,
+// GISEL-NEXT: GIM_CheckOpcode, /*MI*/0, GIMT_Encode2(TargetOpcode::G_STORE),
+// GISEL-NEXT: GIM_CheckAtomicOrdering, /*MI*/0, /*Order*/(uint8_t)AtomicOrdering::NotAtomic,
+// GISEL-NEXT: // MIs[0] src0
+// GISEL-NEXT: GIM_RootCheckType, /*Op*/0, /*Type*/GILLT_s32,
+// GISEL-NEXT: GIM_RootCheckRegBankForClass, /*Op*/0, /*RC*/GIMT_Encode2(MyTarget::GPR32RegClassID),
+// GISEL-NEXT: // MIs[0] Operand 1
+// GISEL-NEXT: GIM_CheckPointerToAny, /*MI*/0, /*Op*/1, /*SizeInBits*/32,
+// GISEL-NEXT: GIM_RecordInsn, /*DefineMI*/1, /*MI*/0, /*OpIdx*/1, // MIs[1]
+// GISEL-NEXT: GIM_CheckNumOperands, /*MI*/1, /*Expected*/3,
+// GISEL-NEXT: GIM_CheckOpcode, /*MI*/1, GIMT_Encode2(TargetOpcode::G_MUL),
+// GISEL-NEXT: // MIs[1] Operand 0
+// GISEL-NEXT: GIM_CheckType, /*MI*/1, /*Op*/0, /*Type*/GILLT_s32,
+// GISEL-NEXT: // MIs[1] Operand 1
+// GISEL-NEXT: GIM_CheckType, /*MI*/1, /*Op*/1, /*Type*/GILLT_s32,
+// GISEL-NEXT: GIM_CheckRegBankForClass, /*MI*/1, /*Op*/1, /*RC*/GIMT_Encode2(MyTarget::GPR32RegClassID),
+// GISEL-NEXT: // MIs[1] Operand 2
+// GISEL-NEXT: GIM_CheckType, /*MI*/1, /*Op*/2, /*Type*/GILLT_s32,
+// GISEL-NEXT: GIM_CheckRegBankForClass, /*MI*/1, /*Op*/2, /*RC*/GIMT_Encode2(MyTarget::Special32RegClassID),
+// GISEL-NEXT: GIM_CheckIsSafeToFold, /*NumInsns*/1,
+// GISEL-NEXT: // (st GPR32:{ *:[i32] }:$src0, (mul:{ *:[i32] } R0:{ *:[i32] }, SPECIAL:{ *:[i32] }))  =>  (MULMR0_PHYS GPR32:{ *:[i32] }:$src0)
+// GISEL-NEXT: GIR_BuildMI, /*InsnID*/2, /*Opcode*/GIMT_Encode2(TargetOpcode::COPY),
+// GISEL-NEXT: GIR_AddRegister, /*InsnID*/2, GIMT_Encode2(MyTarget::SPECIAL), /*AddRegisterRegFlags*/GIMT_Encode2(RegState::Define),
+// GISEL-NEXT: GIR_Copy, /*NewInsnID*/2, /*OldInsnID*/1, /*OpIdx*/2, // SPECIAL
+// GISEL-NEXT: GIR_BuildMI, /*InsnID*/1, /*Opcode*/GIMT_Encode2(TargetOpcode::COPY),
+// GISEL-NEXT: GIR_AddRegister, /*InsnID*/1, GIMT_Encode2(MyTarget::R0), /*AddRegisterRegFlags*/GIMT_Encode2(RegState::Define),
+// GISEL-NEXT: GIR_Copy, /*NewInsnID*/1, /*OldInsnID*/1, /*OpIdx*/1, // R0
+// GISEL-NEXT: GIR_BuildRootMI, /*Opcode*/GIMT_Encode2(MyTarget::MULMR0_PHYS),
+// GISEL-NEXT: GIR_RootToRootCopy, /*OpIdx*/0, // src0
+// GISEL-NEXT: GIR_MergeMemOperands, /*InsnID*/0, /*NumInsns*/2, /*MergeInsnID's*/0, 1,
+// GISEL-NEXT: GIR_RootConstrainSelectedInstOperands,
+// GISEL-NEXT: // GIR_Coverage, 1,
+// GISEL-NEXT: GIR_EraseRootFromParent_Done,
+def MULMR0_PHYS : I<(outs), (ins GPR32:$src0),
+    [(st GPR32:$src0, (mul R0, SPECIAL))]> {
+  let Uses = [R0, SPECIAL];
+}
+
+// Try a normal physical register use.
+
+// GISEL: GIM_Try,
+// GISEL-NEXT: GIM_CheckNumOperands, /*MI*/0, /*Expected*/3,
+// GISEL-NEXT: GIM_CheckOpcode, /*MI*/0, GIMT_Encode2(TargetOpcode::G_ADD),
+// GISEL-NEXT: // MIs[0] DstI[dst]
+// GISEL-NEXT: GIM_RootCheckType, /*Op*/0, /*Type*/GILLT_s32,
+// GISEL-NEXT: GIM_RootCheckRegBankForClass, /*Op*/0, /*RC*/GIMT_Encode2(MyTarget::GPR32RegClassID),
+// GISEL-NEXT: // MIs[0] src0
+// GISEL-NEXT: GIM_RootCheckType, /*Op*/1, /*Type*/GILLT_s32,
+// GISEL-NEXT: GIM_RootCheckRegBankForClass, /*Op*/1, /*RC*/GIMT_Encode2(MyTarget::GPR32RegClassID),
+// GISEL-NEXT: // MIs[0] Operand 2
+// GISEL-NEXT: GIM_RootCheckType, /*Op*/2, /*Type*/GILLT_s32,
+// GISEL-NEXT: GIM_RootCheckRegBankForClass, /*Op*/2, /*RC*/GIMT_Encode2(MyTarget::Special32RegClassID),
+// GISEL-NEXT: // (add:{ *:[i32] } GPR32:{ *:[i32] }:$src0, SPECIAL:{ *:[i32] })  =>  (ADD_PHYS:{ *:[i32] } GPR32:{ *:[i32] }:$src0)
+// GISEL-NEXT: GIR_BuildMI, /*InsnID*/1, /*Opcode*/GIMT_Encode2(TargetOpcode::COPY),
+// GISEL-NEXT: GIR_AddRegister, /*InsnID*/1, GIMT_Encode2(MyTarget::SPECIAL), /*AddRegisterRegFlags*/GIMT_Encode2(RegState::Define),
+// GISEL-NEXT: GIR_Copy, /*NewInsnID*/1, /*OldInsnID*/0, /*OpIdx*/2, // SPECIAL
+// GISEL-NEXT: GIR_BuildRootMI, /*Opcode*/GIMT_Encode2(MyTarget::ADD_PHYS),
+// GISEL-NEXT: GIR_RootToRootCopy, /*OpIdx*/0, // DstI[dst]
+// GISEL-NEXT: GIR_RootToRootCopy, /*OpIdx*/1, // src0
+// GISEL-NEXT: GIR_RootConstrainSelectedInstOperands,
+// GISEL-NEXT: // GIR_Coverage, 2,
+// GISEL-NEXT: GIR_EraseRootFromParent_Done,
+def ADD_PHYS : I<(outs GPR32:$dst), (ins GPR32:$src0),
+    [(set GPR32:$dst, (add GPR32:$src0, SPECIAL))]> {
+  let Uses = [SPECIAL];
+}
+
+// Try using the name of the physreg in another operand.
+
+// GISEL: GIM_Try,
+// GISEL-NEXT: GIM_CheckNumOperands, /*MI*/0, /*Expected*/3,
+// GISEL-NEXT: GIM_CheckOpcode, /*MI*/0, GIMT_Encode2(TargetOpcode::G_MUL),
+// GISEL-NEXT: // MIs[0] DstI[dst]
+// GISEL-NEXT: GIM_RootCheckType, /*Op*/0, /*Type*/GILLT_s32,
+// GISEL-NEXT: GIM_RootCheckRegBankForClass, /*Op*/0, /*RC*/GIMT_Encode2(MyTarget::GPR32RegClassID),
+// GISEL-NEXT: // MIs[0] SPECIAL
+// GISEL-NEXT: GIM_RootCheckType, /*Op*/1, /*Type*/GILLT_s32,
+// GISEL-NEXT: GIM_RootCheckRegBankForClass, /*Op*/1, /*RC*/GIMT_Encode2(MyTarget::GPR32RegClassID),
+// GISEL-NEXT: // MIs[0] Operand 2
+// GISEL-NEXT: GIM_RootCheckType, /*Op*/2, /*Type*/GILLT_s32,
+// GISEL-NEXT: GIM_RootCheckRegBankForClass, /*Op*/2, /*RC*/GIMT_Encode2(MyTarget::Special32RegClassID),
+// GISEL-NEXT: // (mul:{ *:[i32] } GPR32:{ *:[i32] }:$SPECIAL, SPECIAL:{ *:[i32] })  =>  (MUL_PHYS:{ *:[i32] } GPR32:{ *:[i32] }:$SPECIAL)
+// GISEL-NEXT: GIR_BuildMI, /*InsnID*/1, /*Opcode*/GIMT_Encode2(TargetOpcode::COPY),
+// GISEL-NEXT: GIR_AddRegister, /*InsnID*/1, GIMT_Encode2(MyTarget::SPECIAL), /*AddRegisterRegFlags*/GIMT_Encode2(RegState::Define),
+// GISEL-NEXT: GIR_Copy, /*NewInsnID*/1, /*OldInsnID*/0, /*OpIdx*/2, // SPECIAL
+// GISEL-NEXT: GIR_BuildRootMI, /*Opcode*/GIMT_Encode2(MyTarget::MUL_PHYS),
+// GISEL-NEXT: GIR_RootToRootCopy, /*OpIdx*/0, // DstI[dst]
+// GISEL-NEXT: GIR_RootToRootCopy, /*OpIdx*/1, // SPECIAL
+// GISEL-NEXT: GIR_RootConstrainSelectedInstOperands,
+// GISEL-NEXT: // GIR_Coverage, 3,
+// GISEL-NEXT: GIR_EraseRootFromParent_Done,
+def MUL_PHYS : I<(outs GPR32:$dst), (ins GPR32:$SPECIAL),
+    [(set GPR32:$dst, (mul GPR32:$SPECIAL, SPECIAL))]> {
+  let Uses = [SPECIAL];
+}
+
+// Try giving the physical operand a name
+// def ADD_PHYS : I<(outs GPR32:$dst), (ins GPR32:$src0),
+//     [(set GPR32:$dst, (add GPR32:$src0, SPECIAL:$special))]> {
+//   let Uses = [SPECIAL];
+// }
diff --git a/llvm/test/TableGen/GlobalISelEmitter-immAllZeroOne.td b/llvm/test/TableGen/GlobalISelEmitter/immAllZeroOne.td
index 0125aa5..68278f4 100644
--- a/llvm/test/TableGen/GlobalISelEmitter-immAllZeroOne.td
+++ b/llvm/test/TableGen/GlobalISelEmitter/immAllZeroOne.td
@@ -1,5 +1,5 @@
-// RUN: llvm-tblgen -gen-global-isel -warn-on-skipped-patterns -optimize-match-table=false -I %p/../../include -I %p/Common %s -o - | FileCheck -check-prefixes=GISEL-NOOPT %s
-// RUN: llvm-tblgen -gen-global-isel -warn-on-skipped-patterns -optimize-match-table=true -I %p/../../include -I %p/Common %s -o - | FileCheck -check-prefixes=GISEL-OPT %s
+// RUN: llvm-tblgen -gen-global-isel -warn-on-skipped-patterns -optimize-match-table=false -I %p/../../../include -I %p/../Common %s | FileCheck -check-prefixes=GISEL-NOOPT %s
+// RUN: llvm-tblgen -gen-global-isel -warn-on-skipped-patterns -optimize-match-table=true -I %p/../../../include -I %p/../Common %s | FileCheck -check-prefixes=GISEL-OPT %s
 
 include "llvm/Target/Target.td"
 include "GlobalISelEmitterCommon.td"
diff --git a/llvm/test/TableGen/GlobalISelEmitter-immarg-literal-pattern.td b/llvm/test/TableGen/GlobalISelEmitter/immarg-literal-pattern.td
index 6b4012e..ff05ac1 100644
--- a/llvm/test/TableGen/GlobalISelEmitter-immarg-literal-pattern.td
+++ b/llvm/test/TableGen/GlobalISelEmitter/immarg-literal-pattern.td
@@ -1,4 +1,4 @@
-// RUN: llvm-tblgen -gen-global-isel -warn-on-skipped-patterns -optimize-match-table=false -I %p/../../include -I %p/Common %s -o - | FileCheck -check-prefix=GISEL %s
+// RUN: llvm-tblgen -gen-global-isel -warn-on-skipped-patterns -optimize-match-table=false -I %p/../../../include -I %p/../Common %s | FileCheck -check-prefix=GISEL %s
 
 include "llvm/Target/Target.td"
 include "GlobalISelEmitterCommon.td"
diff --git a/llvm/test/TableGen/immarg-predicated.td b/llvm/test/TableGen/GlobalISelEmitter/immarg-predicated.td
index dcacb2f..ab412fa 100644
--- a/llvm/test/TableGen/immarg-predicated.td
+++ b/llvm/test/TableGen/GlobalISelEmitter/immarg-predicated.td
@@ -1,4 +1,4 @@
-// RUN: llvm-tblgen -gen-global-isel -optimize-match-table=false -I %p/Common -I %p/../../include %s -o - < %s | FileCheck -check-prefix=GISEL %s
+// RUN: llvm-tblgen -gen-global-isel -optimize-match-table=false -I %p/../Common -I %p/../../../include %s | FileCheck -check-prefix=GISEL %s
 
 include "llvm/Target/Target.td"
 include "GlobalISelEmitterCommon.td"
diff --git a/llvm/test/TableGen/immarg.td b/llvm/test/TableGen/GlobalISelEmitter/immarg.td
index e5fd06c..eae0409 100644
--- a/llvm/test/TableGen/immarg.td
+++ b/llvm/test/TableGen/GlobalISelEmitter/immarg.td
@@ -1,4 +1,4 @@
-// RUN: llvm-tblgen -gen-global-isel -optimize-match-table=false -I %p/Common -I %p/../../include %s -o - < %s | FileCheck -check-prefix=GISEL %s
+// RUN: llvm-tblgen -gen-global-isel -optimize-match-table=false -I %p/../Common -I %p/../../../include %s | FileCheck -check-prefix=GISEL %s
 
 include "llvm/Target/Target.td"
 include "GlobalISelEmitterCommon.td"
diff --git a/llvm/test/TableGen/GlobalISelEmitter-implicit-defs.td b/llvm/test/TableGen/GlobalISelEmitter/implicit-defs.td
index 79af1a3..06e5e39 100644
--- a/llvm/test/TableGen/GlobalISelEmitter-implicit-defs.td
+++ b/llvm/test/TableGen/GlobalISelEmitter/implicit-defs.td
@@ -1,4 +1,4 @@
-// RUN: llvm-tblgen -gen-global-isel -warn-on-skipped-patterns -I %p/../../include -I %p/Common %s -o /dev/null 2>&1 < %s | FileCheck %s --implicit-check-not="Skipped pattern"
+// RUN: llvm-tblgen -gen-global-isel -warn-on-skipped-patterns -I %p/../../../include -I %p/../Common %s -o /dev/null 2>&1 | FileCheck %s --implicit-check-not="Skipped pattern"
 
 include "llvm/Target/Target.td"
 include "GlobalISelEmitterCommon.td"
diff --git a/llvm/test/TableGen/GlobalISelEmitter-input-discard.td b/llvm/test/TableGen/GlobalISelEmitter/input-discard.td
index 202ff4a..65ebfa2 100644
--- a/llvm/test/TableGen/GlobalISelEmitter-input-discard.td
+++ b/llvm/test/TableGen/GlobalISelEmitter/input-discard.td
@@ -1,4 +1,4 @@
-// RUN: llvm-tblgen -gen-global-isel -warn-on-skipped-patterns -I %p/../../include -I %p/Common %s -o - < %s | FileCheck -check-prefix=GISEL %s
+// RUN: llvm-tblgen -gen-global-isel -warn-on-skipped-patterns -I %p/../../../include -I %p/../Common %s | FileCheck -check-prefix=GISEL %s
 
 include "llvm/Target/Target.td"
 include "GlobalISelEmitterCommon.td"
diff --git a/llvm/test/TableGen/GlobalISelEmitter-multiple-output-discard.td b/llvm/test/TableGen/GlobalISelEmitter/multiple-output-discard.td
index 2d968bebb..a180431 100644
--- a/llvm/test/TableGen/GlobalISelEmitter-multiple-output-discard.td
+++ b/llvm/test/TableGen/GlobalISelEmitter/multiple-output-discard.td
@@ -1,4 +1,4 @@
-// RUN: llvm-tblgen -gen-global-isel -optimize-match-table=false -warn-on-skipped-patterns -I %p/../../include -I %p/Common %s -o - < %s | FileCheck %s
+// RUN: llvm-tblgen -gen-global-isel -optimize-match-table=false -warn-on-skipped-patterns -I %p/../../../include -I %p/../Common %s | FileCheck %s
 
 include "llvm/Target/Target.td"
 include "GlobalISelEmitterCommon.td"
diff --git a/llvm/test/TableGen/GlobalISelEmitter-multiple-output.td b/llvm/test/TableGen/GlobalISelEmitter/multiple-output.td
index dea3b54..baf7675 100644
--- a/llvm/test/TableGen/GlobalISelEmitter-multiple-output.td
+++ b/llvm/test/TableGen/GlobalISelEmitter/multiple-output.td
@@ -1,4 +1,4 @@
-// RUN: llvm-tblgen -gen-global-isel -optimize-match-table=false -warn-on-skipped-patterns -I %p/../../include -I %p/Common %s -o - < %s | FileCheck %s
+// RUN: llvm-tblgen -gen-global-isel -optimize-match-table=false -warn-on-skipped-patterns -I %p/../../../include -I %p/../Common %s | FileCheck %s
 
 include "llvm/Target/Target.td"
 include "GlobalISelEmitterCommon.td"
diff --git a/llvm/test/TableGen/GlobalISelEmitter-nested-subregs.td b/llvm/test/TableGen/GlobalISelEmitter/nested-subregs.td
index 79e55ef..8688e4f 100644
--- a/llvm/test/TableGen/GlobalISelEmitter-nested-subregs.td
+++ b/llvm/test/TableGen/GlobalISelEmitter/nested-subregs.td
@@ -1,4 +1,4 @@
-// RUN: llvm-tblgen %s -gen-global-isel -optimize-match-table=false -I %p/../../include -I %p/Common -o - | FileCheck %s
+// RUN: llvm-tblgen %s -gen-global-isel -optimize-match-table=false -I %p/../../../include -I %p/../Common | FileCheck %s
 
 include "llvm/Target/Target.td"
 include "GlobalISelEmitterCommon.td"
diff --git a/llvm/test/TableGen/GlobalISelEmitter-notype-output-pattern.td b/llvm/test/TableGen/GlobalISelEmitter/notype-output-pattern.td
index 622d7fa..80b1256 100644
--- a/llvm/test/TableGen/GlobalISelEmitter-notype-output-pattern.td
+++ b/llvm/test/TableGen/GlobalISelEmitter/notype-output-pattern.td
@@ -1,4 +1,4 @@
-// RUN: llvm-tblgen -gen-global-isel -I %p/../../include -I %p/Common %s | FileCheck %s
+// RUN: llvm-tblgen -gen-global-isel -I %p/../../../include -I %p/../Common %s | FileCheck %s
 
 include "llvm/Target/Target.td"
 include "GlobalISelEmitterCommon.td"
diff --git a/llvm/test/TableGen/GlobalISelEmitter-optional-def.td b/llvm/test/TableGen/GlobalISelEmitter/optional-def.td
index def4a04..7792a97e 100644
--- a/llvm/test/TableGen/GlobalISelEmitter-optional-def.td
+++ b/llvm/test/TableGen/GlobalISelEmitter/optional-def.td
@@ -1,5 +1,5 @@
 // RUN: llvm-tblgen -gen-global-isel  -warn-on-skipped-patterns \
-// RUN:   -I %p/../../include -I %p/Common %s 2> %t | FileCheck %s
+// RUN:   -I %p/../../../include -I %p/../Common %s 2> %t | FileCheck %s
 // RUN: FileCheck -DFILE=%s -check-prefix=ERR %s < %t
 
 include "llvm/Target/Target.td"
diff --git a/llvm/test/TableGen/GlobalISelEmitter-output-discard.td b/llvm/test/TableGen/GlobalISelEmitter/output-discard.td
index 7a0242d..c249dcb 100644
--- a/llvm/test/TableGen/GlobalISelEmitter-output-discard.td
+++ b/llvm/test/TableGen/GlobalISelEmitter/output-discard.td
@@ -1,4 +1,4 @@
-// RUN: llvm-tblgen -gen-global-isel -warn-on-skipped-patterns -I %p/../../include -I %p/Common %s -o - < %s | FileCheck -check-prefix=GISEL %s
+// RUN: llvm-tblgen -gen-global-isel -warn-on-skipped-patterns -I %p/../../../include -I %p/../Common %s | FileCheck -check-prefix=GISEL %s
 
 include "llvm/Target/Target.td"
 include "GlobalISelEmitterCommon.td"
diff --git a/llvm/test/TableGen/GlobalISelEmitter-setcc.td b/llvm/test/TableGen/GlobalISelEmitter/setcc.td
index 38add76..02622d0 100644
--- a/llvm/test/TableGen/GlobalISelEmitter-setcc.td
+++ b/llvm/test/TableGen/GlobalISelEmitter/setcc.td
@@ -1,4 +1,4 @@
-// RUN: llvm-tblgen -gen-global-isel -warn-on-skipped-patterns -optimize-match-table=false -I %p/../../include -I %p/Common %s -o - 2> %t < %s | FileCheck -check-prefix=GISEL %s
+// RUN: llvm-tblgen -gen-global-isel -warn-on-skipped-patterns -optimize-match-table=false -I %p/../../../include -I %p/../Common %s 2> %t | FileCheck -check-prefix=GISEL %s
 // RUN: FileCheck -DFILE=%s -check-prefix=ERR %s < %t
 
 include "llvm/Target/Target.td"
diff --git a/llvm/test/TableGen/GlobalISelEmitter/undef-tied-input.td b/llvm/test/TableGen/GlobalISelEmitter/undef-tied-input.td
new file mode 100644
index 0000000..323aea9
--- /dev/null
+++ b/llvm/test/TableGen/GlobalISelEmitter/undef-tied-input.td
@@ -0,0 +1,35 @@
+// RUN: llvm-tblgen -gen-global-isel -warn-on-skipped-patterns \
+// RUN:   -I %p/../../../include -I %p/../Common %s 2> %t | FileCheck %s
+// RUN: FileCheck -check-prefix=ERR %s < %t
+
+include "llvm/Target/Target.td"
+include "GlobalISelEmitterCommon.td"
+
+def undef_tied_1 : OperandWithDefaultOps<untyped, (ops (i32 undef_tied_input))> {
+  let MIOperandInfo = (ops GPR32:$inactive);
+}
+
+def undef_tied_2 : OperandWithDefaultOps<i32, (ops (untyped undef_tied_input))> {
+  let MIOperandInfo = (ops GPR32:$inactive);
+}
+
+let Constraints = "$opt.inactive = $rd" in
+def I1 : I<(outs GPR32:$rd), (ins GPR32:$rs, undef_tied_1:$opt),
+           [(set GPR32:$rd, (abs i32:$rs))]>;
+
+// ERR: [[#@LINE+2]]:5: warning: Skipped pattern: unsupported type
+let Constraints = "$opt.inactive = $rd" in
+def I2 : I<(outs GPR32:$rd), (ins GPR32:$rs, undef_tied_2:$opt),
+           [(set GPR32:$rd, (abs i32:$rs))]>;
+
+// CHECK-LABEL: // (abs:{ *:[i32] } i32:{ *:[i32] }:$rs)  =>  (I1:{ *:[i32] } i32:{ *:[i32] }:$rs)
+// CHECK-NEXT: GIR_MakeTempReg, /*TempRegID*/0, /*TypeID*/GILLT_s32,
+// CHECK-NEXT: GIR_BuildMI, /*InsnID*/1, /*Opcode*/GIMT_Encode2(TargetOpcode::IMPLICIT_DEF),
+// CHECK-NEXT: GIR_AddTempRegister, /*InsnID*/1, /*TempRegID*/0, /*TempRegFlags*/GIMT_Encode2(RegState::Define),
+// CHECK-NEXT: GIR_BuildRootMI, /*Opcode*/GIMT_Encode2(MyTarget::I1),
+// CHECK-NEXT: GIR_RootToRootCopy, /*OpIdx*/0, // DstI[rd]
+// CHECK-NEXT: GIR_RootToRootCopy, /*OpIdx*/1, // rs
+// CHECK-NEXT: GIR_AddSimpleTempRegister, /*InsnID*/0, /*TempRegID*/0,
+// CHECK-NEXT: GIR_RootConstrainSelectedInstOperands,
+// CHECK-NEXT: // GIR_Coverage, 0,
+// CHECK-NEXT: GIR_EraseRootFromParent_Done,
diff --git a/llvm/test/TableGen/GlobalISelEmitter-zero-instr.td b/llvm/test/TableGen/GlobalISelEmitter/zero-instr.td
index c8a8cab..f9463ba 100644
--- a/llvm/test/TableGen/GlobalISelEmitter-zero-instr.td
+++ b/llvm/test/TableGen/GlobalISelEmitter/zero-instr.td
@@ -1,4 +1,4 @@
-// RUN: llvm-tblgen -gen-global-isel -optimize-match-table=false -I %p/../../include -I %p/Common %s -o /dev/null --warn-on-skipped-patterns 2>&1 < %s 2>&1 | FileCheck %s
+// RUN: llvm-tblgen -gen-global-isel -optimize-match-table=false -I %p/../../../include -I %p/../Common %s -o /dev/null --warn-on-skipped-patterns 2>&1 | FileCheck %s
 
 include "llvm/Target/Target.td"
 include "GlobalISelEmitterCommon.td"
diff --git a/llvm/test/TableGen/GlobalISelEmitter-zero-reg.td b/llvm/test/TableGen/GlobalISelEmitter/zero-reg.td
index ddf0224..87e5432 100644
--- a/llvm/test/TableGen/GlobalISelEmitter-zero-reg.td
+++ b/llvm/test/TableGen/GlobalISelEmitter/zero-reg.td
@@ -1,4 +1,4 @@
-// RUN: llvm-tblgen -gen-global-isel -optimize-match-table=false -I %p/../../include -I %p/Common %s -o - < %s | FileCheck %s
+// RUN: llvm-tblgen -gen-global-isel -optimize-match-table=false -I %p/../../../include -I %p/../Common %s | FileCheck %s
 
 include "llvm/Target/Target.td"
 include "GlobalISelEmitterCommon.td"
diff --git a/llvm/test/TableGen/gisel-physreg-input.td b/llvm/test/TableGen/gisel-physreg-input.td
deleted file mode 100644
index f19872a..0000000
--- a/llvm/test/TableGen/gisel-physreg-input.td
+++ /dev/null
@@ -1,87 +0,0 @@
-// RUN: llvm-tblgen -gen-global-isel -optimize-match-table=false -I %p/../../include %s -o - < %s | FileCheck -check-prefix=GISEL %s
-
-include "llvm/Target/Target.td"
-
-def TestTargetInstrInfo : InstrInfo;
-
-def TestTarget : Target {
-  let InstructionSet = TestTargetInstrInfo;
-}
-
-def R0 : Register<"r0"> { let Namespace = "MyTarget"; }
-def SPECIAL : Register<"special"> { let Namespace = "MyTarget"; }
-def GPR32 : RegisterClass<"MyTarget", [i32], 32, (add R0)>;
-def Special32 : RegisterClass<"MyTarget", [i32], 32, (add SPECIAL)>;
-
-
-class I<dag OOps, dag IOps, list<dag> Pat>
-  : Instruction {
-  let Namespace = "MyTarget";
-  let OutOperandList = OOps;
-  let InOperandList = IOps;
-  let Pattern = Pat;
-}
-
-// Try a normal physical register use.
-
-// GISEL: GIM_Try,
-// GISEL-NEXT: GIM_CheckNumOperands, /*MI*/0, /*Expected*/3,
-// GISEL-NEXT: GIM_CheckOpcode, /*MI*/0, GIMT_Encode2(TargetOpcode::G_ADD),
-// GISEL-NEXT: // MIs[0] DstI[dst]
-// GISEL-NEXT: GIM_RootCheckType, /*Op*/0, /*Type*/GILLT_s32,
-// GISEL-NEXT: GIM_RootCheckRegBankForClass, /*Op*/0, /*RC*/GIMT_Encode2(MyTarget::GPR32RegClassID),
-// GISEL-NEXT: // MIs[0] src0
-// GISEL-NEXT: GIM_RootCheckType, /*Op*/1, /*Type*/GILLT_s32,
-// GISEL-NEXT: GIM_RootCheckRegBankForClass, /*Op*/1, /*RC*/GIMT_Encode2(MyTarget::GPR32RegClassID),
-// GISEL-NEXT: // MIs[0] Operand 2
-// GISEL-NEXT: GIM_RootCheckType, /*Op*/2, /*Type*/GILLT_s32,
-// GISEL-NEXT: GIM_RootCheckRegBankForClass, /*Op*/2, /*RC*/GIMT_Encode2(MyTarget::Special32RegClassID),
-// GISEL-NEXT: // (add:{ *:[i32] } GPR32:{ *:[i32] }:$src0, SPECIAL:{ *:[i32] })  =>  (ADD_PHYS:{ *:[i32] } GPR32:{ *:[i32] }:$src0)
-// GISEL-NEXT: GIR_BuildMI, /*InsnID*/1, /*Opcode*/GIMT_Encode2(TargetOpcode::COPY),
-// GISEL-NEXT: GIR_AddRegister, /*InsnID*/1, GIMT_Encode2(MyTarget::SPECIAL), /*AddRegisterRegFlags*/GIMT_Encode2(RegState::Define),
-// GISEL-NEXT: GIR_Copy, /*NewInsnID*/1, /*OldInsnID*/0, /*OpIdx*/2, // SPECIAL
-// GISEL-NEXT: GIR_BuildRootMI, /*Opcode*/GIMT_Encode2(MyTarget::ADD_PHYS),
-// GISEL-NEXT: GIR_RootToRootCopy, /*OpIdx*/0, // DstI[dst]
-// GISEL-NEXT: GIR_RootToRootCopy, /*OpIdx*/1, // src0
-// GISEL-NEXT: GIR_RootConstrainSelectedInstOperands,
-// GISEL-NEXT: // GIR_Coverage, 0,
-// GISEL-NEXT: GIR_EraseRootFromParent_Done,
-def ADD_PHYS : I<(outs GPR32:$dst), (ins GPR32:$src0),
-    [(set GPR32:$dst, (add GPR32:$src0, SPECIAL))]> {
-  let Uses = [SPECIAL];
-}
-
-// Try using the name of the physreg in another operand.
-
-// GISEL: GIM_Try,
-// GISEL-NEXT: GIM_CheckNumOperands, /*MI*/0, /*Expected*/3,
-// GISEL-NEXT: GIM_CheckOpcode, /*MI*/0, GIMT_Encode2(TargetOpcode::G_MUL),
-// GISEL-NEXT: // MIs[0] DstI[dst]
-// GISEL-NEXT: GIM_RootCheckType, /*Op*/0, /*Type*/GILLT_s32,
-// GISEL-NEXT: GIM_RootCheckRegBankForClass, /*Op*/0, /*RC*/GIMT_Encode2(MyTarget::GPR32RegClassID),
-// GISEL-NEXT: // MIs[0] SPECIAL
-// GISEL-NEXT: GIM_RootCheckType, /*Op*/1, /*Type*/GILLT_s32,
-// GISEL-NEXT: GIM_RootCheckRegBankForClass, /*Op*/1, /*RC*/GIMT_Encode2(MyTarget::GPR32RegClassID),
-// GISEL-NEXT: // MIs[0] Operand 2
-// GISEL-NEXT: GIM_RootCheckType, /*Op*/2, /*Type*/GILLT_s32,
-// GISEL-NEXT: GIM_RootCheckRegBankForClass, /*Op*/2, /*RC*/GIMT_Encode2(MyTarget::Special32RegClassID),
-// GISEL-NEXT: // (mul:{ *:[i32] } GPR32:{ *:[i32] }:$SPECIAL, SPECIAL:{ *:[i32] })  =>  (MUL_PHYS:{ *:[i32] } GPR32:{ *:[i32] }:$SPECIAL)
-// GISEL-NEXT: GIR_BuildMI, /*InsnID*/1, /*Opcode*/GIMT_Encode2(TargetOpcode::COPY),
-// GISEL-NEXT: GIR_AddRegister, /*InsnID*/1, GIMT_Encode2(MyTarget::SPECIAL), /*AddRegisterRegFlags*/GIMT_Encode2(RegState::Define),
-// GISEL-NEXT: GIR_Copy, /*NewInsnID*/1, /*OldInsnID*/0, /*OpIdx*/2, // SPECIAL
-// GISEL-NEXT: GIR_BuildRootMI, /*Opcode*/GIMT_Encode2(MyTarget::MUL_PHYS),
-// GISEL-NEXT: GIR_RootToRootCopy, /*OpIdx*/0, // DstI[dst]
-// GISEL-NEXT: GIR_RootToRootCopy, /*OpIdx*/1, // SPECIAL
-// GISEL-NEXT: GIR_RootConstrainSelectedInstOperands,
-// GISEL-NEXT: // GIR_Coverage, 1,
-// GISEL-NEXT: GIR_EraseRootFromParent_Done,
-def MUL_PHYS : I<(outs GPR32:$dst), (ins GPR32:$SPECIAL),
-    [(set GPR32:$dst, (mul GPR32:$SPECIAL, SPECIAL))]> {
-  let Uses = [SPECIAL];
-}
-
-// Try giving the physical operand a name
-// def ADD_PHYS : I<(outs GPR32:$dst), (ins GPR32:$src0),
-//     [(set GPR32:$dst, (add GPR32:$src0, SPECIAL:$special))]> {
-//   let Uses = [SPECIAL];
-// }
diff --git a/llvm/test/TableGen/template-args.td b/llvm/test/TableGen/template-args.td
index f3eb02d..1644b0a1 100644
--- a/llvm/test/TableGen/template-args.td
+++ b/llvm/test/TableGen/template-args.td
@@ -9,6 +9,7 @@
 // RUN: not llvm-tblgen -DERROR8 %s 2>&1 | FileCheck --check-prefix=ERROR8 %s
 // RUN: not llvm-tblgen -DERROR9 %s 2>&1 | FileCheck --check-prefix=ERROR9 %s
 // RUN: not llvm-tblgen -DERROR10 %s 2>&1 | FileCheck --check-prefix=ERROR10 %s
+// RUN: not llvm-tblgen -DERROR11 %s 2>&1 | FileCheck --check-prefix=ERROR11 %s
 
 // This file tests that all required arguments are specified and template
 // arguments are type-checked and cast if necessary.
@@ -158,13 +159,13 @@ defm MissingComma : TwoArgs<2 "two">;
 #ifdef ERROR8
 def error8: Class1;
 // ERROR8: value not specified for template argument 'Class1:nm'
-// ERROR8: 18:21: note: declared in 'Class1'
+// ERROR8: 19:21: note: declared in 'Class1'
 #endif
 
 #ifdef ERROR9
 defm error9: MC1;
 // ERROR9: value not specified for template argument 'MC1::nm'
-// ERROR9: 99:23: note: declared in 'MC1'
+// ERROR9: 100:23: note: declared in 'MC1'
 #endif
 
 #ifdef ERROR10
@@ -172,5 +173,15 @@ def error10 {
   int value = Class2<>.Code;
 }
 // ERROR10: value not specified for template argument 'Class2:cd'
-// ERROR10: 37:22: note: declared in 'Class2'
+// ERROR10: 38:22: note: declared in 'Class2'
+#endif
+
+#ifdef ERROR11
+
+class Foo<int i, int j>;
+
+def error11 : Foo<"", "">;
+// ERROR11: [[#@LINE-1]]:19: error: Value specified for template argument 'Foo:i' is of type string; expected type int: ""
+// ERROR11: [[#@LINE-2]]:23: error: Value specified for template argument 'Foo:j' is of type string; expected type int: ""
+
 #endif
diff --git a/llvm/test/TableGen/x86-fold-tables.inc b/llvm/test/TableGen/x86-fold-tables.inc
index 36f6afa..954c05b 100644
--- a/llvm/test/TableGen/x86-fold-tables.inc
+++ b/llvm/test/TableGen/x86-fold-tables.inc
@@ -3085,9 +3085,12 @@ static const X86FoldTableEntry Table2[] = {
   {X86::VMINMAXPSZ128rri, X86::VMINMAXPSZ128rmi, 0},
   {X86::VMINMAXPSZ256rri, X86::VMINMAXPSZ256rmi, 0},
   {X86::VMINMAXPSZrri, X86::VMINMAXPSZrmi, 0},
-  {X86::VMINMAXSDrri, X86::VMINMAXSDrmi, TB_NO_REVERSE},
-  {X86::VMINMAXSHrri, X86::VMINMAXSHrmi, TB_NO_REVERSE},
-  {X86::VMINMAXSSrri, X86::VMINMAXSSrmi, TB_NO_REVERSE},
+  {X86::VMINMAXSDrri, X86::VMINMAXSDrmi, 0},
+  {X86::VMINMAXSDrri_Int, X86::VMINMAXSDrmi_Int, TB_NO_REVERSE},
+  {X86::VMINMAXSHrri, X86::VMINMAXSHrmi, 0},
+  {X86::VMINMAXSHrri_Int, X86::VMINMAXSHrmi_Int, TB_NO_REVERSE},
+  {X86::VMINMAXSSrri, X86::VMINMAXSSrmi, 0},
+  {X86::VMINMAXSSrri_Int, X86::VMINMAXSSrmi_Int, TB_NO_REVERSE},
   {X86::VMINPBF16Z128rr, X86::VMINPBF16Z128rm, 0},
   {X86::VMINPBF16Z256rr, X86::VMINPBF16Z256rm, 0},
   {X86::VMINPBF16Zrr, X86::VMINPBF16Zrm, 0},
@@ -4236,9 +4239,9 @@ static const X86FoldTableEntry Table3[] = {
   {X86::VADDPSZ128rrkz, X86::VADDPSZ128rmkz, 0},
   {X86::VADDPSZ256rrkz, X86::VADDPSZ256rmkz, 0},
   {X86::VADDPSZrrkz, X86::VADDPSZrmkz, 0},
-  {X86::VADDSDZrr_Intkz, X86::VADDSDZrm_Intkz, TB_NO_REVERSE},
-  {X86::VADDSHZrr_Intkz, X86::VADDSHZrm_Intkz, TB_NO_REVERSE},
-  {X86::VADDSSZrr_Intkz, X86::VADDSSZrm_Intkz, TB_NO_REVERSE},
+  {X86::VADDSDZrrkz_Int, X86::VADDSDZrmkz_Int, TB_NO_REVERSE},
+  {X86::VADDSHZrrkz_Int, X86::VADDSHZrmkz_Int, TB_NO_REVERSE},
+  {X86::VADDSSZrrkz_Int, X86::VADDSSZrmkz_Int, TB_NO_REVERSE},
   {X86::VALIGNDZ128rrikz, X86::VALIGNDZ128rmikz, 0},
   {X86::VALIGNDZ256rrikz, X86::VALIGNDZ256rmikz, 0},
   {X86::VALIGNDZrrikz, X86::VALIGNDZrmikz, 0},
@@ -4285,9 +4288,9 @@ static const X86FoldTableEntry Table3[] = {
   {X86::VCMPPSZ128rrik, X86::VCMPPSZ128rmik, 0},
   {X86::VCMPPSZ256rrik, X86::VCMPPSZ256rmik, 0},
   {X86::VCMPPSZrrik, X86::VCMPPSZrmik, 0},
-  {X86::VCMPSDZrri_Intk, X86::VCMPSDZrmi_Intk, TB_NO_REVERSE},
-  {X86::VCMPSHZrri_Intk, X86::VCMPSHZrmi_Intk, TB_NO_REVERSE},
-  {X86::VCMPSSZrri_Intk, X86::VCMPSSZrmi_Intk, TB_NO_REVERSE},
+  {X86::VCMPSDZrrik_Int, X86::VCMPSDZrmik_Int, TB_NO_REVERSE},
+  {X86::VCMPSHZrrik_Int, X86::VCMPSHZrmik_Int, TB_NO_REVERSE},
+  {X86::VCMPSSZrrik_Int, X86::VCMPSSZrmik_Int, TB_NO_REVERSE},
   {X86::VCVT2PS2PHXZ128rrkz, X86::VCVT2PS2PHXZ128rmkz, 0},
   {X86::VCVT2PS2PHXZ256rrkz, X86::VCVT2PS2PHXZ256rmkz, 0},
   {X86::VCVT2PS2PHXZrrkz, X86::VCVT2PS2PHXZrmkz, 0},
@@ -4435,12 +4438,12 @@ static const X86FoldTableEntry Table3[] = {
   {X86::VCVTQQ2PSZ128rrk, X86::VCVTQQ2PSZ128rmk, 0},
   {X86::VCVTQQ2PSZ256rrk, X86::VCVTQQ2PSZ256rmk, 0},
   {X86::VCVTQQ2PSZrrk, X86::VCVTQQ2PSZrmk, 0},
-  {X86::VCVTSD2SHZrr_Intkz, X86::VCVTSD2SHZrm_Intkz, TB_NO_REVERSE},
-  {X86::VCVTSD2SSZrr_Intkz, X86::VCVTSD2SSZrm_Intkz, TB_NO_REVERSE},
-  {X86::VCVTSH2SDZrr_Intkz, X86::VCVTSH2SDZrm_Intkz, TB_NO_REVERSE},
-  {X86::VCVTSH2SSZrr_Intkz, X86::VCVTSH2SSZrm_Intkz, TB_NO_REVERSE},
-  {X86::VCVTSS2SDZrr_Intkz, X86::VCVTSS2SDZrm_Intkz, TB_NO_REVERSE},
-  {X86::VCVTSS2SHZrr_Intkz, X86::VCVTSS2SHZrm_Intkz, TB_NO_REVERSE},
+  {X86::VCVTSD2SHZrrkz_Int, X86::VCVTSD2SHZrmkz_Int, TB_NO_REVERSE},
+  {X86::VCVTSD2SSZrrkz_Int, X86::VCVTSD2SSZrmkz_Int, TB_NO_REVERSE},
+  {X86::VCVTSH2SDZrrkz_Int, X86::VCVTSH2SDZrmkz_Int, TB_NO_REVERSE},
+  {X86::VCVTSH2SSZrrkz_Int, X86::VCVTSH2SSZrmkz_Int, TB_NO_REVERSE},
+  {X86::VCVTSS2SDZrrkz_Int, X86::VCVTSS2SDZrmkz_Int, TB_NO_REVERSE},
+  {X86::VCVTSS2SHZrrkz_Int, X86::VCVTSS2SHZrmkz_Int, TB_NO_REVERSE},
   {X86::VCVTTNEBF162IBSZ128rrk, X86::VCVTTNEBF162IBSZ128rmk, 0},
   {X86::VCVTTNEBF162IBSZ256rrk, X86::VCVTTNEBF162IBSZ256rmk, 0},
   {X86::VCVTTNEBF162IBSZrrk, X86::VCVTTNEBF162IBSZrmk, 0},
@@ -4564,9 +4567,9 @@ static const X86FoldTableEntry Table3[] = {
   {X86::VDIVPSZ128rrkz, X86::VDIVPSZ128rmkz, 0},
   {X86::VDIVPSZ256rrkz, X86::VDIVPSZ256rmkz, 0},
   {X86::VDIVPSZrrkz, X86::VDIVPSZrmkz, 0},
-  {X86::VDIVSDZrr_Intkz, X86::VDIVSDZrm_Intkz, TB_NO_REVERSE},
-  {X86::VDIVSHZrr_Intkz, X86::VDIVSHZrm_Intkz, TB_NO_REVERSE},
-  {X86::VDIVSSZrr_Intkz, X86::VDIVSSZrm_Intkz, TB_NO_REVERSE},
+  {X86::VDIVSDZrrkz_Int, X86::VDIVSDZrmkz_Int, TB_NO_REVERSE},
+  {X86::VDIVSHZrrkz_Int, X86::VDIVSHZrmkz_Int, TB_NO_REVERSE},
+  {X86::VDIVSSZrrkz_Int, X86::VDIVSSZrmkz_Int, TB_NO_REVERSE},
   {X86::VDPBF16PSZ128r, X86::VDPBF16PSZ128m, 0},
   {X86::VDPBF16PSZ256r, X86::VDPBF16PSZ256m, 0},
   {X86::VDPBF16PSZr, X86::VDPBF16PSZm, 0},
@@ -5107,9 +5110,9 @@ static const X86FoldTableEntry Table3[] = {
   {X86::VMAXPSZ128rrkz, X86::VMAXPSZ128rmkz, 0},
   {X86::VMAXPSZ256rrkz, X86::VMAXPSZ256rmkz, 0},
   {X86::VMAXPSZrrkz, X86::VMAXPSZrmkz, 0},
-  {X86::VMAXSDZrr_Intkz, X86::VMAXSDZrm_Intkz, TB_NO_REVERSE},
-  {X86::VMAXSHZrr_Intkz, X86::VMAXSHZrm_Intkz, TB_NO_REVERSE},
-  {X86::VMAXSSZrr_Intkz, X86::VMAXSSZrm_Intkz, TB_NO_REVERSE},
+  {X86::VMAXSDZrrkz_Int, X86::VMAXSDZrmkz_Int, TB_NO_REVERSE},
+  {X86::VMAXSHZrrkz_Int, X86::VMAXSHZrmkz_Int, TB_NO_REVERSE},
+  {X86::VMAXSSZrrkz_Int, X86::VMAXSSZrmkz_Int, TB_NO_REVERSE},
   {X86::VMINCPDZ128rrkz, X86::VMINCPDZ128rmkz, 0},
   {X86::VMINCPDZ256rrkz, X86::VMINCPDZ256rmkz, 0},
   {X86::VMINCPDZrrkz, X86::VMINCPDZrmkz, 0},
@@ -5131,9 +5134,9 @@ static const X86FoldTableEntry Table3[] = {
   {X86::VMINMAXPSZ128rrikz, X86::VMINMAXPSZ128rmikz, 0},
   {X86::VMINMAXPSZ256rrikz, X86::VMINMAXPSZ256rmikz, 0},
   {X86::VMINMAXPSZrrikz, X86::VMINMAXPSZrmikz, 0},
-  {X86::VMINMAXSDrrikz, X86::VMINMAXSDrmikz, TB_NO_REVERSE},
-  {X86::VMINMAXSHrrikz, X86::VMINMAXSHrmikz, TB_NO_REVERSE},
-  {X86::VMINMAXSSrrikz, X86::VMINMAXSSrmikz, TB_NO_REVERSE},
+  {X86::VMINMAXSDrrikz_Int, X86::VMINMAXSDrmikz_Int, TB_NO_REVERSE},
+  {X86::VMINMAXSHrrikz_Int, X86::VMINMAXSHrmikz_Int, TB_NO_REVERSE},
+  {X86::VMINMAXSSrrikz_Int, X86::VMINMAXSSrmikz_Int, TB_NO_REVERSE},
   {X86::VMINPBF16Z128rrkz, X86::VMINPBF16Z128rmkz, 0},
   {X86::VMINPBF16Z256rrkz, X86::VMINPBF16Z256rmkz, 0},
   {X86::VMINPBF16Zrrkz, X86::VMINPBF16Zrmkz, 0},
@@ -5146,9 +5149,9 @@ static const X86FoldTableEntry Table3[] = {
   {X86::VMINPSZ128rrkz, X86::VMINPSZ128rmkz, 0},
   {X86::VMINPSZ256rrkz, X86::VMINPSZ256rmkz, 0},
   {X86::VMINPSZrrkz, X86::VMINPSZrmkz, 0},
-  {X86::VMINSDZrr_Intkz, X86::VMINSDZrm_Intkz, TB_NO_REVERSE},
-  {X86::VMINSHZrr_Intkz, X86::VMINSHZrm_Intkz, TB_NO_REVERSE},
-  {X86::VMINSSZrr_Intkz, X86::VMINSSZrm_Intkz, TB_NO_REVERSE},
+  {X86::VMINSDZrrkz_Int, X86::VMINSDZrmkz_Int, TB_NO_REVERSE},
+  {X86::VMINSHZrrkz_Int, X86::VMINSHZrmkz_Int, TB_NO_REVERSE},
+  {X86::VMINSSZrrkz_Int, X86::VMINSSZrmkz_Int, TB_NO_REVERSE},
   {X86::VMOVAPDZ128rrk, X86::VMOVAPDZ128rmk, TB_NO_REVERSE|TB_ALIGN_16},
   {X86::VMOVAPDZ256rrk, X86::VMOVAPDZ256rmk, TB_NO_REVERSE|TB_ALIGN_32},
   {X86::VMOVAPDZrrk, X86::VMOVAPDZrmk, TB_NO_REVERSE|TB_ALIGN_64},
@@ -5203,9 +5206,9 @@ static const X86FoldTableEntry Table3[] = {
   {X86::VMULPSZ128rrkz, X86::VMULPSZ128rmkz, 0},
   {X86::VMULPSZ256rrkz, X86::VMULPSZ256rmkz, 0},
   {X86::VMULPSZrrkz, X86::VMULPSZrmkz, 0},
-  {X86::VMULSDZrr_Intkz, X86::VMULSDZrm_Intkz, TB_NO_REVERSE},
-  {X86::VMULSHZrr_Intkz, X86::VMULSHZrm_Intkz, TB_NO_REVERSE},
-  {X86::VMULSSZrr_Intkz, X86::VMULSSZrm_Intkz, TB_NO_REVERSE},
+  {X86::VMULSDZrrkz_Int, X86::VMULSDZrmkz_Int, TB_NO_REVERSE},
+  {X86::VMULSHZrrkz_Int, X86::VMULSHZrmkz_Int, TB_NO_REVERSE},
+  {X86::VMULSSZrrkz_Int, X86::VMULSSZrmkz_Int, TB_NO_REVERSE},
   {X86::VORPDZ128rrkz, X86::VORPDZ128rmkz, 0},
   {X86::VORPDZ256rrkz, X86::VORPDZ256rmkz, 0},
   {X86::VORPDZrrkz, X86::VORPDZrmkz, 0},
@@ -5969,9 +5972,9 @@ static const X86FoldTableEntry Table3[] = {
   {X86::VRNDSCALEPSZ128rrik, X86::VRNDSCALEPSZ128rmik, 0},
   {X86::VRNDSCALEPSZ256rrik, X86::VRNDSCALEPSZ256rmik, 0},
   {X86::VRNDSCALEPSZrrik, X86::VRNDSCALEPSZrmik, 0},
-  {X86::VRNDSCALESDZrri_Intkz, X86::VRNDSCALESDZrmi_Intkz, TB_NO_REVERSE},
-  {X86::VRNDSCALESHZrri_Intkz, X86::VRNDSCALESHZrmi_Intkz, TB_NO_REVERSE},
-  {X86::VRNDSCALESSZrri_Intkz, X86::VRNDSCALESSZrmi_Intkz, TB_NO_REVERSE},
+  {X86::VRNDSCALESDZrrikz_Int, X86::VRNDSCALESDZrmikz_Int, TB_NO_REVERSE},
+  {X86::VRNDSCALESHZrrikz_Int, X86::VRNDSCALESHZrmikz_Int, TB_NO_REVERSE},
+  {X86::VRNDSCALESSZrrikz_Int, X86::VRNDSCALESSZrmikz_Int, TB_NO_REVERSE},
   {X86::VRSQRT14PDZ128rk, X86::VRSQRT14PDZ128mk, 0},
   {X86::VRSQRT14PDZ256rk, X86::VRSQRT14PDZ256mk, 0},
   {X86::VRSQRT14PDZrk, X86::VRSQRT14PDZmk, 0},
@@ -6035,9 +6038,9 @@ static const X86FoldTableEntry Table3[] = {
   {X86::VSQRTPSZ128rk, X86::VSQRTPSZ128mk, 0},
   {X86::VSQRTPSZ256rk, X86::VSQRTPSZ256mk, 0},
   {X86::VSQRTPSZrk, X86::VSQRTPSZmk, 0},
-  {X86::VSQRTSDZr_Intkz, X86::VSQRTSDZm_Intkz, TB_NO_REVERSE},
-  {X86::VSQRTSHZr_Intkz, X86::VSQRTSHZm_Intkz, TB_NO_REVERSE},
-  {X86::VSQRTSSZr_Intkz, X86::VSQRTSSZm_Intkz, TB_NO_REVERSE},
+  {X86::VSQRTSDZrkz_Int, X86::VSQRTSDZmkz_Int, TB_NO_REVERSE},
+  {X86::VSQRTSHZrkz_Int, X86::VSQRTSHZmkz_Int, TB_NO_REVERSE},
+  {X86::VSQRTSSZrkz_Int, X86::VSQRTSSZmkz_Int, TB_NO_REVERSE},
   {X86::VSUBNEPBF16Z128rrkz, X86::VSUBNEPBF16Z128rmkz, 0},
   {X86::VSUBNEPBF16Z256rrkz, X86::VSUBNEPBF16Z256rmkz, 0},
   {X86::VSUBNEPBF16Zrrkz, X86::VSUBNEPBF16Zrmkz, 0},
@@ -6050,9 +6053,9 @@ static const X86FoldTableEntry Table3[] = {
   {X86::VSUBPSZ128rrkz, X86::VSUBPSZ128rmkz, 0},
   {X86::VSUBPSZ256rrkz, X86::VSUBPSZ256rmkz, 0},
   {X86::VSUBPSZrrkz, X86::VSUBPSZrmkz, 0},
-  {X86::VSUBSDZrr_Intkz, X86::VSUBSDZrm_Intkz, TB_NO_REVERSE},
-  {X86::VSUBSHZrr_Intkz, X86::VSUBSHZrm_Intkz, TB_NO_REVERSE},
-  {X86::VSUBSSZrr_Intkz, X86::VSUBSSZrm_Intkz, TB_NO_REVERSE},
+  {X86::VSUBSDZrrkz_Int, X86::VSUBSDZrmkz_Int, TB_NO_REVERSE},
+  {X86::VSUBSHZrrkz_Int, X86::VSUBSHZrmkz_Int, TB_NO_REVERSE},
+  {X86::VSUBSSZrrkz_Int, X86::VSUBSSZrmkz_Int, TB_NO_REVERSE},
   {X86::VUNPCKHPDZ128rrkz, X86::VUNPCKHPDZ128rmkz, 0},
   {X86::VUNPCKHPDZ256rrkz, X86::VUNPCKHPDZ256rmkz, 0},
   {X86::VUNPCKHPDZrrkz, X86::VUNPCKHPDZrmkz, 0},
@@ -6086,9 +6089,9 @@ static const X86FoldTableEntry Table4[] = {
   {X86::VADDPSZ128rrk, X86::VADDPSZ128rmk, 0},
   {X86::VADDPSZ256rrk, X86::VADDPSZ256rmk, 0},
   {X86::VADDPSZrrk, X86::VADDPSZrmk, 0},
-  {X86::VADDSDZrr_Intk, X86::VADDSDZrm_Intk, TB_NO_REVERSE},
-  {X86::VADDSHZrr_Intk, X86::VADDSHZrm_Intk, TB_NO_REVERSE},
-  {X86::VADDSSZrr_Intk, X86::VADDSSZrm_Intk, TB_NO_REVERSE},
+  {X86::VADDSDZrrk_Int, X86::VADDSDZrmk_Int, TB_NO_REVERSE},
+  {X86::VADDSHZrrk_Int, X86::VADDSHZrmk_Int, TB_NO_REVERSE},
+  {X86::VADDSSZrrk_Int, X86::VADDSSZrmk_Int, TB_NO_REVERSE},
   {X86::VALIGNDZ128rrik, X86::VALIGNDZ128rmik, 0},
   {X86::VALIGNDZ256rrik, X86::VALIGNDZ256rmik, 0},
   {X86::VALIGNDZrrik, X86::VALIGNDZrmik, 0},
@@ -6137,12 +6140,12 @@ static const X86FoldTableEntry Table4[] = {
   {X86::VCVTNE2PS2BF16Z128rrk, X86::VCVTNE2PS2BF16Z128rmk, 0},
   {X86::VCVTNE2PS2BF16Z256rrk, X86::VCVTNE2PS2BF16Z256rmk, 0},
   {X86::VCVTNE2PS2BF16Zrrk, X86::VCVTNE2PS2BF16Zrmk, 0},
-  {X86::VCVTSD2SHZrr_Intk, X86::VCVTSD2SHZrm_Intk, TB_NO_REVERSE},
-  {X86::VCVTSD2SSZrr_Intk, X86::VCVTSD2SSZrm_Intk, TB_NO_REVERSE},
-  {X86::VCVTSH2SDZrr_Intk, X86::VCVTSH2SDZrm_Intk, TB_NO_REVERSE},
-  {X86::VCVTSH2SSZrr_Intk, X86::VCVTSH2SSZrm_Intk, TB_NO_REVERSE},
-  {X86::VCVTSS2SDZrr_Intk, X86::VCVTSS2SDZrm_Intk, TB_NO_REVERSE},
-  {X86::VCVTSS2SHZrr_Intk, X86::VCVTSS2SHZrm_Intk, TB_NO_REVERSE},
+  {X86::VCVTSD2SHZrrk_Int, X86::VCVTSD2SHZrmk_Int, TB_NO_REVERSE},
+  {X86::VCVTSD2SSZrrk_Int, X86::VCVTSD2SSZrmk_Int, TB_NO_REVERSE},
+  {X86::VCVTSH2SDZrrk_Int, X86::VCVTSH2SDZrmk_Int, TB_NO_REVERSE},
+  {X86::VCVTSH2SSZrrk_Int, X86::VCVTSH2SSZrmk_Int, TB_NO_REVERSE},
+  {X86::VCVTSS2SDZrrk_Int, X86::VCVTSS2SDZrmk_Int, TB_NO_REVERSE},
+  {X86::VCVTSS2SHZrrk_Int, X86::VCVTSS2SHZrmk_Int, TB_NO_REVERSE},
   {X86::VDBPSADBWZ128rrik, X86::VDBPSADBWZ128rmik, 0},
   {X86::VDBPSADBWZ256rrik, X86::VDBPSADBWZ256rmik, 0},
   {X86::VDBPSADBWZrrik, X86::VDBPSADBWZrmik, 0},
@@ -6158,9 +6161,9 @@ static const X86FoldTableEntry Table4[] = {
   {X86::VDIVPSZ128rrk, X86::VDIVPSZ128rmk, 0},
   {X86::VDIVPSZ256rrk, X86::VDIVPSZ256rmk, 0},
   {X86::VDIVPSZrrk, X86::VDIVPSZrmk, 0},
-  {X86::VDIVSDZrr_Intk, X86::VDIVSDZrm_Intk, TB_NO_REVERSE},
-  {X86::VDIVSHZrr_Intk, X86::VDIVSHZrm_Intk, TB_NO_REVERSE},
-  {X86::VDIVSSZrr_Intk, X86::VDIVSSZrm_Intk, TB_NO_REVERSE},
+  {X86::VDIVSDZrrk_Int, X86::VDIVSDZrmk_Int, TB_NO_REVERSE},
+  {X86::VDIVSHZrrk_Int, X86::VDIVSHZrmk_Int, TB_NO_REVERSE},
+  {X86::VDIVSSZrrk_Int, X86::VDIVSSZrmk_Int, TB_NO_REVERSE},
   {X86::VDPBF16PSZ128rk, X86::VDPBF16PSZ128mk, 0},
   {X86::VDPBF16PSZ128rkz, X86::VDPBF16PSZ128mkz, 0},
   {X86::VDPBF16PSZ256rk, X86::VDPBF16PSZ256mk, 0},
@@ -6225,12 +6228,12 @@ static const X86FoldTableEntry Table4[] = {
   {X86::VFMADD132PSZ256rkz, X86::VFMADD132PSZ256mkz, 0},
   {X86::VFMADD132PSZrk, X86::VFMADD132PSZmk, 0},
   {X86::VFMADD132PSZrkz, X86::VFMADD132PSZmkz, 0},
-  {X86::VFMADD132SDZr_Intk, X86::VFMADD132SDZm_Intk, TB_NO_REVERSE},
-  {X86::VFMADD132SDZr_Intkz, X86::VFMADD132SDZm_Intkz, TB_NO_REVERSE},
-  {X86::VFMADD132SHZr_Intk, X86::VFMADD132SHZm_Intk, TB_NO_REVERSE},
-  {X86::VFMADD132SHZr_Intkz, X86::VFMADD132SHZm_Intkz, TB_NO_REVERSE},
-  {X86::VFMADD132SSZr_Intk, X86::VFMADD132SSZm_Intk, TB_NO_REVERSE},
-  {X86::VFMADD132SSZr_Intkz, X86::VFMADD132SSZm_Intkz, TB_NO_REVERSE},
+  {X86::VFMADD132SDZrk_Int, X86::VFMADD132SDZmk_Int, TB_NO_REVERSE},
+  {X86::VFMADD132SDZrkz_Int, X86::VFMADD132SDZmkz_Int, TB_NO_REVERSE},
+  {X86::VFMADD132SHZrk_Int, X86::VFMADD132SHZmk_Int, TB_NO_REVERSE},
+  {X86::VFMADD132SHZrkz_Int, X86::VFMADD132SHZmkz_Int, TB_NO_REVERSE},
+  {X86::VFMADD132SSZrk_Int, X86::VFMADD132SSZmk_Int, TB_NO_REVERSE},
+  {X86::VFMADD132SSZrkz_Int, X86::VFMADD132SSZmkz_Int, TB_NO_REVERSE},
   {X86::VFMADD213NEPBF16Z128rk, X86::VFMADD213NEPBF16Z128mk, 0},
   {X86::VFMADD213NEPBF16Z128rkz, X86::VFMADD213NEPBF16Z128mkz, 0},
   {X86::VFMADD213NEPBF16Z256rk, X86::VFMADD213NEPBF16Z256mk, 0},
@@ -6255,12 +6258,12 @@ static const X86FoldTableEntry Table4[] = {
   {X86::VFMADD213PSZ256rkz, X86::VFMADD213PSZ256mkz, 0},
   {X86::VFMADD213PSZrk, X86::VFMADD213PSZmk, 0},
   {X86::VFMADD213PSZrkz, X86::VFMADD213PSZmkz, 0},
-  {X86::VFMADD213SDZr_Intk, X86::VFMADD213SDZm_Intk, TB_NO_REVERSE},
-  {X86::VFMADD213SDZr_Intkz, X86::VFMADD213SDZm_Intkz, TB_NO_REVERSE},
-  {X86::VFMADD213SHZr_Intk, X86::VFMADD213SHZm_Intk, TB_NO_REVERSE},
-  {X86::VFMADD213SHZr_Intkz, X86::VFMADD213SHZm_Intkz, TB_NO_REVERSE},
-  {X86::VFMADD213SSZr_Intk, X86::VFMADD213SSZm_Intk, TB_NO_REVERSE},
-  {X86::VFMADD213SSZr_Intkz, X86::VFMADD213SSZm_Intkz, TB_NO_REVERSE},
+  {X86::VFMADD213SDZrk_Int, X86::VFMADD213SDZmk_Int, TB_NO_REVERSE},
+  {X86::VFMADD213SDZrkz_Int, X86::VFMADD213SDZmkz_Int, TB_NO_REVERSE},
+  {X86::VFMADD213SHZrk_Int, X86::VFMADD213SHZmk_Int, TB_NO_REVERSE},
+  {X86::VFMADD213SHZrkz_Int, X86::VFMADD213SHZmkz_Int, TB_NO_REVERSE},
+  {X86::VFMADD213SSZrk_Int, X86::VFMADD213SSZmk_Int, TB_NO_REVERSE},
+  {X86::VFMADD213SSZrkz_Int, X86::VFMADD213SSZmkz_Int, TB_NO_REVERSE},
   {X86::VFMADD231NEPBF16Z128rk, X86::VFMADD231NEPBF16Z128mk, 0},
   {X86::VFMADD231NEPBF16Z128rkz, X86::VFMADD231NEPBF16Z128mkz, 0},
   {X86::VFMADD231NEPBF16Z256rk, X86::VFMADD231NEPBF16Z256mk, 0},
@@ -6285,12 +6288,12 @@ static const X86FoldTableEntry Table4[] = {
   {X86::VFMADD231PSZ256rkz, X86::VFMADD231PSZ256mkz, 0},
   {X86::VFMADD231PSZrk, X86::VFMADD231PSZmk, 0},
   {X86::VFMADD231PSZrkz, X86::VFMADD231PSZmkz, 0},
-  {X86::VFMADD231SDZr_Intk, X86::VFMADD231SDZm_Intk, TB_NO_REVERSE},
-  {X86::VFMADD231SDZr_Intkz, X86::VFMADD231SDZm_Intkz, TB_NO_REVERSE},
-  {X86::VFMADD231SHZr_Intk, X86::VFMADD231SHZm_Intk, TB_NO_REVERSE},
-  {X86::VFMADD231SHZr_Intkz, X86::VFMADD231SHZm_Intkz, TB_NO_REVERSE},
-  {X86::VFMADD231SSZr_Intk, X86::VFMADD231SSZm_Intk, TB_NO_REVERSE},
-  {X86::VFMADD231SSZr_Intkz, X86::VFMADD231SSZm_Intkz, TB_NO_REVERSE},
+  {X86::VFMADD231SDZrk_Int, X86::VFMADD231SDZmk_Int, TB_NO_REVERSE},
+  {X86::VFMADD231SDZrkz_Int, X86::VFMADD231SDZmkz_Int, TB_NO_REVERSE},
+  {X86::VFMADD231SHZrk_Int, X86::VFMADD231SHZmk_Int, TB_NO_REVERSE},
+  {X86::VFMADD231SHZrkz_Int, X86::VFMADD231SHZmkz_Int, TB_NO_REVERSE},
+  {X86::VFMADD231SSZrk_Int, X86::VFMADD231SSZmk_Int, TB_NO_REVERSE},
+  {X86::VFMADD231SSZrkz_Int, X86::VFMADD231SSZmkz_Int, TB_NO_REVERSE},
   {X86::VFMADDCPHZ128rk, X86::VFMADDCPHZ128mk, 0},
   {X86::VFMADDCPHZ128rkz, X86::VFMADDCPHZ128mkz, 0},
   {X86::VFMADDCPHZ256rk, X86::VFMADDCPHZ256mk, 0},
@@ -6377,12 +6380,12 @@ static const X86FoldTableEntry Table4[] = {
   {X86::VFMSUB132PSZ256rkz, X86::VFMSUB132PSZ256mkz, 0},
   {X86::VFMSUB132PSZrk, X86::VFMSUB132PSZmk, 0},
   {X86::VFMSUB132PSZrkz, X86::VFMSUB132PSZmkz, 0},
-  {X86::VFMSUB132SDZr_Intk, X86::VFMSUB132SDZm_Intk, TB_NO_REVERSE},
-  {X86::VFMSUB132SDZr_Intkz, X86::VFMSUB132SDZm_Intkz, TB_NO_REVERSE},
-  {X86::VFMSUB132SHZr_Intk, X86::VFMSUB132SHZm_Intk, TB_NO_REVERSE},
-  {X86::VFMSUB132SHZr_Intkz, X86::VFMSUB132SHZm_Intkz, TB_NO_REVERSE},
-  {X86::VFMSUB132SSZr_Intk, X86::VFMSUB132SSZm_Intk, TB_NO_REVERSE},
-  {X86::VFMSUB132SSZr_Intkz, X86::VFMSUB132SSZm_Intkz, TB_NO_REVERSE},
+  {X86::VFMSUB132SDZrk_Int, X86::VFMSUB132SDZmk_Int, TB_NO_REVERSE},
+  {X86::VFMSUB132SDZrkz_Int, X86::VFMSUB132SDZmkz_Int, TB_NO_REVERSE},
+  {X86::VFMSUB132SHZrk_Int, X86::VFMSUB132SHZmk_Int, TB_NO_REVERSE},
+  {X86::VFMSUB132SHZrkz_Int, X86::VFMSUB132SHZmkz_Int, TB_NO_REVERSE},
+  {X86::VFMSUB132SSZrk_Int, X86::VFMSUB132SSZmk_Int, TB_NO_REVERSE},
+  {X86::VFMSUB132SSZrkz_Int, X86::VFMSUB132SSZmkz_Int, TB_NO_REVERSE},
   {X86::VFMSUB213NEPBF16Z128rk, X86::VFMSUB213NEPBF16Z128mk, 0},
   {X86::VFMSUB213NEPBF16Z128rkz, X86::VFMSUB213NEPBF16Z128mkz, 0},
   {X86::VFMSUB213NEPBF16Z256rk, X86::VFMSUB213NEPBF16Z256mk, 0},
@@ -6407,12 +6410,12 @@ static const X86FoldTableEntry Table4[] = {
   {X86::VFMSUB213PSZ256rkz, X86::VFMSUB213PSZ256mkz, 0},
   {X86::VFMSUB213PSZrk, X86::VFMSUB213PSZmk, 0},
   {X86::VFMSUB213PSZrkz, X86::VFMSUB213PSZmkz, 0},
-  {X86::VFMSUB213SDZr_Intk, X86::VFMSUB213SDZm_Intk, TB_NO_REVERSE},
-  {X86::VFMSUB213SDZr_Intkz, X86::VFMSUB213SDZm_Intkz, TB_NO_REVERSE},
-  {X86::VFMSUB213SHZr_Intk, X86::VFMSUB213SHZm_Intk, TB_NO_REVERSE},
-  {X86::VFMSUB213SHZr_Intkz, X86::VFMSUB213SHZm_Intkz, TB_NO_REVERSE},
-  {X86::VFMSUB213SSZr_Intk, X86::VFMSUB213SSZm_Intk, TB_NO_REVERSE},
-  {X86::VFMSUB213SSZr_Intkz, X86::VFMSUB213SSZm_Intkz, TB_NO_REVERSE},
+  {X86::VFMSUB213SDZrk_Int, X86::VFMSUB213SDZmk_Int, TB_NO_REVERSE},
+  {X86::VFMSUB213SDZrkz_Int, X86::VFMSUB213SDZmkz_Int, TB_NO_REVERSE},
+  {X86::VFMSUB213SHZrk_Int, X86::VFMSUB213SHZmk_Int, TB_NO_REVERSE},
+  {X86::VFMSUB213SHZrkz_Int, X86::VFMSUB213SHZmkz_Int, TB_NO_REVERSE},
+  {X86::VFMSUB213SSZrk_Int, X86::VFMSUB213SSZmk_Int, TB_NO_REVERSE},
+  {X86::VFMSUB213SSZrkz_Int, X86::VFMSUB213SSZmkz_Int, TB_NO_REVERSE},
   {X86::VFMSUB231NEPBF16Z128rk, X86::VFMSUB231NEPBF16Z128mk, 0},
   {X86::VFMSUB231NEPBF16Z128rkz, X86::VFMSUB231NEPBF16Z128mkz, 0},
   {X86::VFMSUB231NEPBF16Z256rk, X86::VFMSUB231NEPBF16Z256mk, 0},
@@ -6437,12 +6440,12 @@ static const X86FoldTableEntry Table4[] = {
   {X86::VFMSUB231PSZ256rkz, X86::VFMSUB231PSZ256mkz, 0},
   {X86::VFMSUB231PSZrk, X86::VFMSUB231PSZmk, 0},
   {X86::VFMSUB231PSZrkz, X86::VFMSUB231PSZmkz, 0},
-  {X86::VFMSUB231SDZr_Intk, X86::VFMSUB231SDZm_Intk, TB_NO_REVERSE},
-  {X86::VFMSUB231SDZr_Intkz, X86::VFMSUB231SDZm_Intkz, TB_NO_REVERSE},
-  {X86::VFMSUB231SHZr_Intk, X86::VFMSUB231SHZm_Intk, TB_NO_REVERSE},
-  {X86::VFMSUB231SHZr_Intkz, X86::VFMSUB231SHZm_Intkz, TB_NO_REVERSE},
-  {X86::VFMSUB231SSZr_Intk, X86::VFMSUB231SSZm_Intk, TB_NO_REVERSE},
-  {X86::VFMSUB231SSZr_Intkz, X86::VFMSUB231SSZm_Intkz, TB_NO_REVERSE},
+  {X86::VFMSUB231SDZrk_Int, X86::VFMSUB231SDZmk_Int, TB_NO_REVERSE},
+  {X86::VFMSUB231SDZrkz_Int, X86::VFMSUB231SDZmkz_Int, TB_NO_REVERSE},
+  {X86::VFMSUB231SHZrk_Int, X86::VFMSUB231SHZmk_Int, TB_NO_REVERSE},
+  {X86::VFMSUB231SHZrkz_Int, X86::VFMSUB231SHZmkz_Int, TB_NO_REVERSE},
+  {X86::VFMSUB231SSZrk_Int, X86::VFMSUB231SSZmk_Int, TB_NO_REVERSE},
+  {X86::VFMSUB231SSZrkz_Int, X86::VFMSUB231SSZmkz_Int, TB_NO_REVERSE},
   {X86::VFMSUBADD132PDZ128rk, X86::VFMSUBADD132PDZ128mk, 0},
   {X86::VFMSUBADD132PDZ128rkz, X86::VFMSUBADD132PDZ128mkz, 0},
   {X86::VFMSUBADD132PDZ256rk, X86::VFMSUBADD132PDZ256mk, 0},
@@ -6525,12 +6528,12 @@ static const X86FoldTableEntry Table4[] = {
   {X86::VFNMADD132PSZ256rkz, X86::VFNMADD132PSZ256mkz, 0},
   {X86::VFNMADD132PSZrk, X86::VFNMADD132PSZmk, 0},
   {X86::VFNMADD132PSZrkz, X86::VFNMADD132PSZmkz, 0},
-  {X86::VFNMADD132SDZr_Intk, X86::VFNMADD132SDZm_Intk, TB_NO_REVERSE},
-  {X86::VFNMADD132SDZr_Intkz, X86::VFNMADD132SDZm_Intkz, TB_NO_REVERSE},
-  {X86::VFNMADD132SHZr_Intk, X86::VFNMADD132SHZm_Intk, TB_NO_REVERSE},
-  {X86::VFNMADD132SHZr_Intkz, X86::VFNMADD132SHZm_Intkz, TB_NO_REVERSE},
-  {X86::VFNMADD132SSZr_Intk, X86::VFNMADD132SSZm_Intk, TB_NO_REVERSE},
-  {X86::VFNMADD132SSZr_Intkz, X86::VFNMADD132SSZm_Intkz, TB_NO_REVERSE},
+  {X86::VFNMADD132SDZrk_Int, X86::VFNMADD132SDZmk_Int, TB_NO_REVERSE},
+  {X86::VFNMADD132SDZrkz_Int, X86::VFNMADD132SDZmkz_Int, TB_NO_REVERSE},
+  {X86::VFNMADD132SHZrk_Int, X86::VFNMADD132SHZmk_Int, TB_NO_REVERSE},
+  {X86::VFNMADD132SHZrkz_Int, X86::VFNMADD132SHZmkz_Int, TB_NO_REVERSE},
+  {X86::VFNMADD132SSZrk_Int, X86::VFNMADD132SSZmk_Int, TB_NO_REVERSE},
+  {X86::VFNMADD132SSZrkz_Int, X86::VFNMADD132SSZmkz_Int, TB_NO_REVERSE},
   {X86::VFNMADD213NEPBF16Z128rk, X86::VFNMADD213NEPBF16Z128mk, 0},
   {X86::VFNMADD213NEPBF16Z128rkz, X86::VFNMADD213NEPBF16Z128mkz, 0},
   {X86::VFNMADD213NEPBF16Z256rk, X86::VFNMADD213NEPBF16Z256mk, 0},
@@ -6555,12 +6558,12 @@ static const X86FoldTableEntry Table4[] = {
   {X86::VFNMADD213PSZ256rkz, X86::VFNMADD213PSZ256mkz, 0},
   {X86::VFNMADD213PSZrk, X86::VFNMADD213PSZmk, 0},
   {X86::VFNMADD213PSZrkz, X86::VFNMADD213PSZmkz, 0},
-  {X86::VFNMADD213SDZr_Intk, X86::VFNMADD213SDZm_Intk, TB_NO_REVERSE},
-  {X86::VFNMADD213SDZr_Intkz, X86::VFNMADD213SDZm_Intkz, TB_NO_REVERSE},
-  {X86::VFNMADD213SHZr_Intk, X86::VFNMADD213SHZm_Intk, TB_NO_REVERSE},
-  {X86::VFNMADD213SHZr_Intkz, X86::VFNMADD213SHZm_Intkz, TB_NO_REVERSE},
-  {X86::VFNMADD213SSZr_Intk, X86::VFNMADD213SSZm_Intk, TB_NO_REVERSE},
-  {X86::VFNMADD213SSZr_Intkz, X86::VFNMADD213SSZm_Intkz, TB_NO_REVERSE},
+  {X86::VFNMADD213SDZrk_Int, X86::VFNMADD213SDZmk_Int, TB_NO_REVERSE},
+  {X86::VFNMADD213SDZrkz_Int, X86::VFNMADD213SDZmkz_Int, TB_NO_REVERSE},
+  {X86::VFNMADD213SHZrk_Int, X86::VFNMADD213SHZmk_Int, TB_NO_REVERSE},
+  {X86::VFNMADD213SHZrkz_Int, X86::VFNMADD213SHZmkz_Int, TB_NO_REVERSE},
+  {X86::VFNMADD213SSZrk_Int, X86::VFNMADD213SSZmk_Int, TB_NO_REVERSE},
+  {X86::VFNMADD213SSZrkz_Int, X86::VFNMADD213SSZmkz_Int, TB_NO_REVERSE},
   {X86::VFNMADD231NEPBF16Z128rk, X86::VFNMADD231NEPBF16Z128mk, 0},
   {X86::VFNMADD231NEPBF16Z128rkz, X86::VFNMADD231NEPBF16Z128mkz, 0},
   {X86::VFNMADD231NEPBF16Z256rk, X86::VFNMADD231NEPBF16Z256mk, 0},
@@ -6585,12 +6588,12 @@ static const X86FoldTableEntry Table4[] = {
   {X86::VFNMADD231PSZ256rkz, X86::VFNMADD231PSZ256mkz, 0},
   {X86::VFNMADD231PSZrk, X86::VFNMADD231PSZmk, 0},
   {X86::VFNMADD231PSZrkz, X86::VFNMADD231PSZmkz, 0},
-  {X86::VFNMADD231SDZr_Intk, X86::VFNMADD231SDZm_Intk, TB_NO_REVERSE},
-  {X86::VFNMADD231SDZr_Intkz, X86::VFNMADD231SDZm_Intkz, TB_NO_REVERSE},
-  {X86::VFNMADD231SHZr_Intk, X86::VFNMADD231SHZm_Intk, TB_NO_REVERSE},
-  {X86::VFNMADD231SHZr_Intkz, X86::VFNMADD231SHZm_Intkz, TB_NO_REVERSE},
-  {X86::VFNMADD231SSZr_Intk, X86::VFNMADD231SSZm_Intk, TB_NO_REVERSE},
-  {X86::VFNMADD231SSZr_Intkz, X86::VFNMADD231SSZm_Intkz, TB_NO_REVERSE},
+  {X86::VFNMADD231SDZrk_Int, X86::VFNMADD231SDZmk_Int, TB_NO_REVERSE},
+  {X86::VFNMADD231SDZrkz_Int, X86::VFNMADD231SDZmkz_Int, TB_NO_REVERSE},
+  {X86::VFNMADD231SHZrk_Int, X86::VFNMADD231SHZmk_Int, TB_NO_REVERSE},
+  {X86::VFNMADD231SHZrkz_Int, X86::VFNMADD231SHZmkz_Int, TB_NO_REVERSE},
+  {X86::VFNMADD231SSZrk_Int, X86::VFNMADD231SSZmk_Int, TB_NO_REVERSE},
+  {X86::VFNMADD231SSZrkz_Int, X86::VFNMADD231SSZmkz_Int, TB_NO_REVERSE},
   {X86::VFNMSUB132NEPBF16Z128rk, X86::VFNMSUB132NEPBF16Z128mk, 0},
   {X86::VFNMSUB132NEPBF16Z128rkz, X86::VFNMSUB132NEPBF16Z128mkz, 0},
   {X86::VFNMSUB132NEPBF16Z256rk, X86::VFNMSUB132NEPBF16Z256mk, 0},
@@ -6615,12 +6618,12 @@ static const X86FoldTableEntry Table4[] = {
   {X86::VFNMSUB132PSZ256rkz, X86::VFNMSUB132PSZ256mkz, 0},
   {X86::VFNMSUB132PSZrk, X86::VFNMSUB132PSZmk, 0},
   {X86::VFNMSUB132PSZrkz, X86::VFNMSUB132PSZmkz, 0},
-  {X86::VFNMSUB132SDZr_Intk, X86::VFNMSUB132SDZm_Intk, TB_NO_REVERSE},
-  {X86::VFNMSUB132SDZr_Intkz, X86::VFNMSUB132SDZm_Intkz, TB_NO_REVERSE},
-  {X86::VFNMSUB132SHZr_Intk, X86::VFNMSUB132SHZm_Intk, TB_NO_REVERSE},
-  {X86::VFNMSUB132SHZr_Intkz, X86::VFNMSUB132SHZm_Intkz, TB_NO_REVERSE},
-  {X86::VFNMSUB132SSZr_Intk, X86::VFNMSUB132SSZm_Intk, TB_NO_REVERSE},
-  {X86::VFNMSUB132SSZr_Intkz, X86::VFNMSUB132SSZm_Intkz, TB_NO_REVERSE},
+  {X86::VFNMSUB132SDZrk_Int, X86::VFNMSUB132SDZmk_Int, TB_NO_REVERSE},
+  {X86::VFNMSUB132SDZrkz_Int, X86::VFNMSUB132SDZmkz_Int, TB_NO_REVERSE},
+  {X86::VFNMSUB132SHZrk_Int, X86::VFNMSUB132SHZmk_Int, TB_NO_REVERSE},
+  {X86::VFNMSUB132SHZrkz_Int, X86::VFNMSUB132SHZmkz_Int, TB_NO_REVERSE},
+  {X86::VFNMSUB132SSZrk_Int, X86::VFNMSUB132SSZmk_Int, TB_NO_REVERSE},
+  {X86::VFNMSUB132SSZrkz_Int, X86::VFNMSUB132SSZmkz_Int, TB_NO_REVERSE},
   {X86::VFNMSUB213NEPBF16Z128rk, X86::VFNMSUB213NEPBF16Z128mk, 0},
   {X86::VFNMSUB213NEPBF16Z128rkz, X86::VFNMSUB213NEPBF16Z128mkz, 0},
   {X86::VFNMSUB213NEPBF16Z256rk, X86::VFNMSUB213NEPBF16Z256mk, 0},
@@ -6645,12 +6648,12 @@ static const X86FoldTableEntry Table4[] = {
   {X86::VFNMSUB213PSZ256rkz, X86::VFNMSUB213PSZ256mkz, 0},
   {X86::VFNMSUB213PSZrk, X86::VFNMSUB213PSZmk, 0},
   {X86::VFNMSUB213PSZrkz, X86::VFNMSUB213PSZmkz, 0},
-  {X86::VFNMSUB213SDZr_Intk, X86::VFNMSUB213SDZm_Intk, TB_NO_REVERSE},
-  {X86::VFNMSUB213SDZr_Intkz, X86::VFNMSUB213SDZm_Intkz, TB_NO_REVERSE},
-  {X86::VFNMSUB213SHZr_Intk, X86::VFNMSUB213SHZm_Intk, TB_NO_REVERSE},
-  {X86::VFNMSUB213SHZr_Intkz, X86::VFNMSUB213SHZm_Intkz, TB_NO_REVERSE},
-  {X86::VFNMSUB213SSZr_Intk, X86::VFNMSUB213SSZm_Intk, TB_NO_REVERSE},
-  {X86::VFNMSUB213SSZr_Intkz, X86::VFNMSUB213SSZm_Intkz, TB_NO_REVERSE},
+  {X86::VFNMSUB213SDZrk_Int, X86::VFNMSUB213SDZmk_Int, TB_NO_REVERSE},
+  {X86::VFNMSUB213SDZrkz_Int, X86::VFNMSUB213SDZmkz_Int, TB_NO_REVERSE},
+  {X86::VFNMSUB213SHZrk_Int, X86::VFNMSUB213SHZmk_Int, TB_NO_REVERSE},
+  {X86::VFNMSUB213SHZrkz_Int, X86::VFNMSUB213SHZmkz_Int, TB_NO_REVERSE},
+  {X86::VFNMSUB213SSZrk_Int, X86::VFNMSUB213SSZmk_Int, TB_NO_REVERSE},
+  {X86::VFNMSUB213SSZrkz_Int, X86::VFNMSUB213SSZmkz_Int, TB_NO_REVERSE},
   {X86::VFNMSUB231NEPBF16Z128rk, X86::VFNMSUB231NEPBF16Z128mk, 0},
   {X86::VFNMSUB231NEPBF16Z128rkz, X86::VFNMSUB231NEPBF16Z128mkz, 0},
   {X86::VFNMSUB231NEPBF16Z256rk, X86::VFNMSUB231NEPBF16Z256mk, 0},
@@ -6675,12 +6678,12 @@ static const X86FoldTableEntry Table4[] = {
   {X86::VFNMSUB231PSZ256rkz, X86::VFNMSUB231PSZ256mkz, 0},
   {X86::VFNMSUB231PSZrk, X86::VFNMSUB231PSZmk, 0},
   {X86::VFNMSUB231PSZrkz, X86::VFNMSUB231PSZmkz, 0},
-  {X86::VFNMSUB231SDZr_Intk, X86::VFNMSUB231SDZm_Intk, TB_NO_REVERSE},
-  {X86::VFNMSUB231SDZr_Intkz, X86::VFNMSUB231SDZm_Intkz, TB_NO_REVERSE},
-  {X86::VFNMSUB231SHZr_Intk, X86::VFNMSUB231SHZm_Intk, TB_NO_REVERSE},
-  {X86::VFNMSUB231SHZr_Intkz, X86::VFNMSUB231SHZm_Intkz, TB_NO_REVERSE},
-  {X86::VFNMSUB231SSZr_Intk, X86::VFNMSUB231SSZm_Intk, TB_NO_REVERSE},
-  {X86::VFNMSUB231SSZr_Intkz, X86::VFNMSUB231SSZm_Intkz, TB_NO_REVERSE},
+  {X86::VFNMSUB231SDZrk_Int, X86::VFNMSUB231SDZmk_Int, TB_NO_REVERSE},
+  {X86::VFNMSUB231SDZrkz_Int, X86::VFNMSUB231SDZmkz_Int, TB_NO_REVERSE},
+  {X86::VFNMSUB231SHZrk_Int, X86::VFNMSUB231SHZmk_Int, TB_NO_REVERSE},
+  {X86::VFNMSUB231SHZrkz_Int, X86::VFNMSUB231SHZmkz_Int, TB_NO_REVERSE},
+  {X86::VFNMSUB231SSZrk_Int, X86::VFNMSUB231SSZmk_Int, TB_NO_REVERSE},
+  {X86::VFNMSUB231SSZrkz_Int, X86::VFNMSUB231SSZmkz_Int, TB_NO_REVERSE},
   {X86::VGETEXPSDZrk, X86::VGETEXPSDZmk, TB_NO_REVERSE},
   {X86::VGETEXPSHZrk, X86::VGETEXPSHZmk, TB_NO_REVERSE},
   {X86::VGETEXPSSZrk, X86::VGETEXPSSZmk, TB_NO_REVERSE},
@@ -6729,9 +6732,9 @@ static const X86FoldTableEntry Table4[] = {
   {X86::VMAXPSZ128rrk, X86::VMAXPSZ128rmk, 0},
   {X86::VMAXPSZ256rrk, X86::VMAXPSZ256rmk, 0},
   {X86::VMAXPSZrrk, X86::VMAXPSZrmk, 0},
-  {X86::VMAXSDZrr_Intk, X86::VMAXSDZrm_Intk, TB_NO_REVERSE},
-  {X86::VMAXSHZrr_Intk, X86::VMAXSHZrm_Intk, TB_NO_REVERSE},
-  {X86::VMAXSSZrr_Intk, X86::VMAXSSZrm_Intk, TB_NO_REVERSE},
+  {X86::VMAXSDZrrk_Int, X86::VMAXSDZrmk_Int, TB_NO_REVERSE},
+  {X86::VMAXSHZrrk_Int, X86::VMAXSHZrmk_Int, TB_NO_REVERSE},
+  {X86::VMAXSSZrrk_Int, X86::VMAXSSZrmk_Int, TB_NO_REVERSE},
   {X86::VMINCPDZ128rrk, X86::VMINCPDZ128rmk, 0},
   {X86::VMINCPDZ256rrk, X86::VMINCPDZ256rmk, 0},
   {X86::VMINCPDZrrk, X86::VMINCPDZrmk, 0},
@@ -6753,9 +6756,9 @@ static const X86FoldTableEntry Table4[] = {
   {X86::VMINMAXPSZ128rrik, X86::VMINMAXPSZ128rmik, 0},
   {X86::VMINMAXPSZ256rrik, X86::VMINMAXPSZ256rmik, 0},
   {X86::VMINMAXPSZrrik, X86::VMINMAXPSZrmik, 0},
-  {X86::VMINMAXSDrrik, X86::VMINMAXSDrmik, TB_NO_REVERSE},
-  {X86::VMINMAXSHrrik, X86::VMINMAXSHrmik, TB_NO_REVERSE},
-  {X86::VMINMAXSSrrik, X86::VMINMAXSSrmik, TB_NO_REVERSE},
+  {X86::VMINMAXSDrrik_Int, X86::VMINMAXSDrmik_Int, TB_NO_REVERSE},
+  {X86::VMINMAXSHrrik_Int, X86::VMINMAXSHrmik_Int, TB_NO_REVERSE},
+  {X86::VMINMAXSSrrik_Int, X86::VMINMAXSSrmik_Int, TB_NO_REVERSE},
   {X86::VMINPBF16Z128rrk, X86::VMINPBF16Z128rmk, 0},
   {X86::VMINPBF16Z256rrk, X86::VMINPBF16Z256rmk, 0},
   {X86::VMINPBF16Zrrk, X86::VMINPBF16Zrmk, 0},
@@ -6768,9 +6771,9 @@ static const X86FoldTableEntry Table4[] = {
   {X86::VMINPSZ128rrk, X86::VMINPSZ128rmk, 0},
   {X86::VMINPSZ256rrk, X86::VMINPSZ256rmk, 0},
   {X86::VMINPSZrrk, X86::VMINPSZrmk, 0},
-  {X86::VMINSDZrr_Intk, X86::VMINSDZrm_Intk, TB_NO_REVERSE},
-  {X86::VMINSHZrr_Intk, X86::VMINSHZrm_Intk, TB_NO_REVERSE},
-  {X86::VMINSSZrr_Intk, X86::VMINSSZrm_Intk, TB_NO_REVERSE},
+  {X86::VMINSDZrrk_Int, X86::VMINSDZrmk_Int, TB_NO_REVERSE},
+  {X86::VMINSHZrrk_Int, X86::VMINSHZrmk_Int, TB_NO_REVERSE},
+  {X86::VMINSSZrrk_Int, X86::VMINSSZrmk_Int, TB_NO_REVERSE},
   {X86::VMPSADBWZ128rrik, X86::VMPSADBWZ128rmik, 0},
   {X86::VMPSADBWZ256rrik, X86::VMPSADBWZ256rmik, 0},
   {X86::VMPSADBWZrrik, X86::VMPSADBWZrmik, 0},
@@ -6786,9 +6789,9 @@ static const X86FoldTableEntry Table4[] = {
   {X86::VMULPSZ128rrk, X86::VMULPSZ128rmk, 0},
   {X86::VMULPSZ256rrk, X86::VMULPSZ256rmk, 0},
   {X86::VMULPSZrrk, X86::VMULPSZrmk, 0},
-  {X86::VMULSDZrr_Intk, X86::VMULSDZrm_Intk, TB_NO_REVERSE},
-  {X86::VMULSHZrr_Intk, X86::VMULSHZrm_Intk, TB_NO_REVERSE},
-  {X86::VMULSSZrr_Intk, X86::VMULSSZrm_Intk, TB_NO_REVERSE},
+  {X86::VMULSDZrrk_Int, X86::VMULSDZrmk_Int, TB_NO_REVERSE},
+  {X86::VMULSHZrrk_Int, X86::VMULSHZrmk_Int, TB_NO_REVERSE},
+  {X86::VMULSSZrrk_Int, X86::VMULSSZrmk_Int, TB_NO_REVERSE},
   {X86::VORPDZ128rrk, X86::VORPDZ128rmk, 0},
   {X86::VORPDZ256rrk, X86::VORPDZ256rmk, 0},
   {X86::VORPDZrrk, X86::VORPDZrmk, 0},
@@ -7344,9 +7347,9 @@ static const X86FoldTableEntry Table4[] = {
   {X86::VREDUCESDZrrik, X86::VREDUCESDZrmik, TB_NO_REVERSE},
   {X86::VREDUCESHZrrik, X86::VREDUCESHZrmik, TB_NO_REVERSE},
   {X86::VREDUCESSZrrik, X86::VREDUCESSZrmik, TB_NO_REVERSE},
-  {X86::VRNDSCALESDZrri_Intk, X86::VRNDSCALESDZrmi_Intk, TB_NO_REVERSE},
-  {X86::VRNDSCALESHZrri_Intk, X86::VRNDSCALESHZrmi_Intk, TB_NO_REVERSE},
-  {X86::VRNDSCALESSZrri_Intk, X86::VRNDSCALESSZrmi_Intk, TB_NO_REVERSE},
+  {X86::VRNDSCALESDZrrik_Int, X86::VRNDSCALESDZrmik_Int, TB_NO_REVERSE},
+  {X86::VRNDSCALESHZrrik_Int, X86::VRNDSCALESHZrmik_Int, TB_NO_REVERSE},
+  {X86::VRNDSCALESSZrrik_Int, X86::VRNDSCALESSZrmik_Int, TB_NO_REVERSE},
   {X86::VRSQRT14SDZrrk, X86::VRSQRT14SDZrmk, TB_NO_REVERSE},
   {X86::VRSQRT14SSZrrk, X86::VRSQRT14SSZrmk, TB_NO_REVERSE},
   {X86::VRSQRT28SDZrk, X86::VRSQRT28SDZmk, TB_NO_REVERSE},
@@ -7381,9 +7384,9 @@ static const X86FoldTableEntry Table4[] = {
   {X86::VSHUFPSZ128rrik, X86::VSHUFPSZ128rmik, 0},
   {X86::VSHUFPSZ256rrik, X86::VSHUFPSZ256rmik, 0},
   {X86::VSHUFPSZrrik, X86::VSHUFPSZrmik, 0},
-  {X86::VSQRTSDZr_Intk, X86::VSQRTSDZm_Intk, TB_NO_REVERSE},
-  {X86::VSQRTSHZr_Intk, X86::VSQRTSHZm_Intk, TB_NO_REVERSE},
-  {X86::VSQRTSSZr_Intk, X86::VSQRTSSZm_Intk, TB_NO_REVERSE},
+  {X86::VSQRTSDZrk_Int, X86::VSQRTSDZmk_Int, TB_NO_REVERSE},
+  {X86::VSQRTSHZrk_Int, X86::VSQRTSHZmk_Int, TB_NO_REVERSE},
+  {X86::VSQRTSSZrk_Int, X86::VSQRTSSZmk_Int, TB_NO_REVERSE},
   {X86::VSUBNEPBF16Z128rrk, X86::VSUBNEPBF16Z128rmk, 0},
   {X86::VSUBNEPBF16Z256rrk, X86::VSUBNEPBF16Z256rmk, 0},
   {X86::VSUBNEPBF16Zrrk, X86::VSUBNEPBF16Zrmk, 0},
@@ -7396,9 +7399,9 @@ static const X86FoldTableEntry Table4[] = {
   {X86::VSUBPSZ128rrk, X86::VSUBPSZ128rmk, 0},
   {X86::VSUBPSZ256rrk, X86::VSUBPSZ256rmk, 0},
   {X86::VSUBPSZrrk, X86::VSUBPSZrmk, 0},
-  {X86::VSUBSDZrr_Intk, X86::VSUBSDZrm_Intk, TB_NO_REVERSE},
-  {X86::VSUBSHZrr_Intk, X86::VSUBSHZrm_Intk, TB_NO_REVERSE},
-  {X86::VSUBSSZrr_Intk, X86::VSUBSSZrm_Intk, TB_NO_REVERSE},
+  {X86::VSUBSDZrrk_Int, X86::VSUBSDZrmk_Int, TB_NO_REVERSE},
+  {X86::VSUBSHZrrk_Int, X86::VSUBSHZrmk_Int, TB_NO_REVERSE},
+  {X86::VSUBSSZrrk_Int, X86::VSUBSSZrmk_Int, TB_NO_REVERSE},
   {X86::VUNPCKHPDZ128rrk, X86::VUNPCKHPDZ128rmk, 0},
   {X86::VUNPCKHPDZ256rrk, X86::VUNPCKHPDZ256rmk, 0},
   {X86::VUNPCKHPDZrrk, X86::VUNPCKHPDZrmk, 0},
diff --git a/llvm/test/ThinLTO/X86/memprof-recursive.ll b/llvm/test/ThinLTO/X86/memprof-recursive.ll
new file mode 100644
index 0000000..2b1d708
--- /dev/null
+++ b/llvm/test/ThinLTO/X86/memprof-recursive.ll
@@ -0,0 +1,141 @@
+;; Test recursion handling during cloning.
+;;
+;; See llvm/test/Transforms/MemProfContextDisambiguation/recursive.ll for
+;; information on how the test was created.
+
+; RUN: opt -thinlto-bc %s >%t.o
+
+;; By default we should enable cloning of contexts involved with recursive
+;; cycles, but not through the cycle itself. I.e. until full support for
+;; recursion is added, the cloned recursive call from C back to B (line 12) will
+;; not be updated to call a clone.
+; RUN: llvm-lto2 run %t.o -enable-memprof-context-disambiguation \
+; RUN:  -supports-hot-cold-new \
+; RUN:  -r=%t.o,_Z1Dv,plx \
+; RUN:  -r=%t.o,_Z1Ci,plx \
+; RUN:  -r=%t.o,_Z1Bi,plx \
+; RUN:  -r=%t.o,main,plx \
+; RUN:  -r=%t.o,_Znam, \
+; RUN:  -memprof-verify-ccg -memprof-verify-nodes \
+; RUN:  -pass-remarks=memprof-context-disambiguation \
+; RUN:  -o %t.out 2>&1 | FileCheck %s \
+; RUN:  --implicit-check-not "memprof_recursive3.cc:12:10: call in clone _Z1Ci.memprof.1 assigned" \
+; RUN:  --check-prefix=ALLOW-RECUR-CALLSITES --check-prefix=ALLOW-RECUR-CONTEXTS
+
+;; Skipping recursive callsites should result in no cloning.
+; RUN: llvm-lto2 run %t.o -enable-memprof-context-disambiguation \
+; RUN:  -supports-hot-cold-new \
+; RUN:  -r=%t.o,_Z1Dv,plx \
+; RUN:  -r=%t.o,_Z1Ci,plx \
+; RUN:  -r=%t.o,_Z1Bi,plx \
+; RUN:  -r=%t.o,main,plx \
+; RUN:  -r=%t.o,_Znam, \
+; RUN:  -memprof-verify-ccg -memprof-verify-nodes \
+; RUN:  -pass-remarks=memprof-context-disambiguation \
+; RUN:	-memprof-allow-recursive-callsites=false \
+; RUN:  -o %t.out 2>&1 | FileCheck %s --allow-empty \
+; RUN:  --implicit-check-not "memprof_recursive3.cc:12:10: call in clone _Z1Ci.memprof.1 assigned" \
+; RUN:  --implicit-check-not="created clone" \
+; RUN:	--implicit-check-not="marked with memprof allocation attribute cold"
+
+;; Skipping recursive contexts should prevent spurious call to cloned version of
+;; B from the context starting at memprof_recursive.cc:19:13, which is actually
+;; recursive (until that support is added).
+; RUN: llvm-lto2 run %t.o -enable-memprof-context-disambiguation \
+; RUN:  -supports-hot-cold-new \
+; RUN:  -r=%t.o,_Z1Dv,plx \
+; RUN:  -r=%t.o,_Z1Ci,plx \
+; RUN:  -r=%t.o,_Z1Bi,plx \
+; RUN:  -r=%t.o,main,plx \
+; RUN:  -r=%t.o,_Znam, \
+; RUN:  -memprof-verify-ccg -memprof-verify-nodes \
+; RUN:  -pass-remarks=memprof-context-disambiguation \
+; RUN:	-memprof-allow-recursive-contexts=false \
+; RUN:  -o %t.out 2>&1 | FileCheck %s \
+; RUN:  --implicit-check-not "memprof_recursive3.cc:12:10: call in clone _Z1Ci.memprof.1 assigned" \
+; RUN:  --check-prefix=ALLOW-RECUR-CALLSITES --check-prefix=SKIP-RECUR-CONTEXTS
+
+; ALLOW-RECUR-CALLSITES: memprof_recursive.cc:4:0: created clone _Z1Dv.memprof.1
+; ALLOW-RECUR-CALLSITES: memprof_recursive.cc:5:10: call in clone _Z1Dv marked with memprof allocation attribute notcold
+; ALLOW-RECUR-CALLSITES: memprof_recursive.cc:5:10: call in clone _Z1Dv.memprof.1 marked with memprof allocation attribute cold
+; ALLOW-RECUR-CALLSITES: memprof_recursive.cc:8:0: created clone _Z1Ci.memprof.1
+; ALLOW-RECUR-CALLSITES: memprof_recursive.cc:10:12: call in clone _Z1Ci.memprof.1 assigned to call function clone _Z1Dv.memprof.1
+; ALLOW-RECUR-CALLSITES: memprof_recursive.cc:14:0: created clone _Z1Bi.memprof.1
+; ALLOW-RECUR-CALLSITES: memprof_recursive.cc:15:10: call in clone _Z1Bi.memprof.1 assigned to call function clone _Z1Ci.memprof.1
+;; We should only call the cold clone for the recursive context if we enabled
+;; recursive contexts via -memprof-allow-recursive-contexts=true (default).
+; ALLOW-RECUR-CONTEXTS: memprof_recursive.cc:19:13: call in clone main assigned to call function clone _Z1Bi.memprof.1
+; SKIP-RECUR-CONTEXTS-NOT: memprof_recursive.cc:19:13: call in clone main assigned to call function clone _Z1Bi.memprof.1
+; ALLOW-RECUR-CALLSITES: memprof_recursive.cc:20:13: call in clone main assigned to call function clone _Z1Bi.memprof.1
+
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+define ptr @_Z1Dv() !dbg !3 {
+entry:
+  %call = tail call ptr @_Znam(i64 10), !dbg !6, !memprof !7, !callsite !14
+  ret ptr null
+}
+
+define ptr @_Z1Ci(i32 %n) !dbg !15 {
+entry:
+  %call = tail call ptr @_Z1Dv(), !dbg !16, !callsite !17
+  br label %return
+
+if.end:                                           ; No predecessors!
+  %call1 = tail call ptr @_Z1Bi(i32 0), !dbg !18, !callsite !19
+  br label %return
+
+return:                                           ; preds = %if.end, %entry
+  ret ptr null
+}
+
+define ptr @_Z1Bi(i32 %n) !dbg !20 {
+entry:
+  %call = tail call ptr @_Z1Ci(i32 0), !dbg !21, !callsite !22
+  ret ptr null
+}
+
+define i32 @main() {
+entry:
+  %call = tail call ptr @_Z1Bi(i32 0), !dbg !23, !callsite !25
+  %call1 = tail call ptr @_Z1Bi(i32 0), !dbg !26, !callsite !27
+  %call2 = tail call ptr @_Z1Bi(i32 0), !dbg !28, !callsite !29
+  ret i32 0
+}
+
+declare ptr @_Znam(i64)
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!2}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: !1, producer: "clang version 20.0.0git (https://github.com/llvm/llvm-project.git 7aec6dc477f8148ed066d10dfc7a012a51b6599c)", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly, splitDebugInlining: false, debugInfoForProfiling: true, nameTableKind: None)
+!1 = !DIFile(filename: "memprof_recursive.cc", directory: ".", checksumkind: CSK_MD5, checksum: "2f15f63b187a0e0d40e7fdd18b10576a")
+!2 = !{i32 2, !"Debug Info Version", i32 3}
+!3 = distinct !DISubprogram(name: "D", linkageName: "_Z1Dv", scope: !1, file: !1, line: 4, type: !4, scopeLine: 4, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0)
+!4 = !DISubroutineType(types: !5)
+!5 = !{}
+!6 = !DILocation(line: 5, column: 10, scope: !3)
+!7 = !{!8, !10, !12}
+!8 = !{!9, !"cold"}
+!9 = !{i64 6541423618768552252, i64 -200552803509692312, i64 -2954124005641725917, i64 6307901912192269588}
+!10 = !{!11, !"notcold"}
+!11 = !{i64 6541423618768552252, i64 -200552803509692312, i64 -2954124005641725917, i64 -7155190423157709404, i64 -2954124005641725917, i64 8632435727821051414}
+!12 = !{!13, !"cold"}
+!13 = !{i64 6541423618768552252, i64 -200552803509692312, i64 -2954124005641725917, i64 -7155190423157709404, i64 -2954124005641725917, i64 -3421689549917153178}
+!14 = !{i64 6541423618768552252}
+!15 = distinct !DISubprogram(name: "C", linkageName: "_Z1Ci", scope: !1, file: !1, line: 8, type: !4, scopeLine: 8, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0)
+!16 = !DILocation(line: 10, column: 12, scope: !15)
+!17 = !{i64 -200552803509692312}
+!18 = !DILocation(line: 12, column: 10, scope: !15)
+!19 = !{i64 -7155190423157709404}
+!20 = distinct !DISubprogram(name: "B", linkageName: "_Z1Bi", scope: !1, file: !1, line: 14, type: !4, scopeLine: 14, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0)
+!21 = !DILocation(line: 15, column: 10, scope: !20)
+!22 = !{i64 -2954124005641725917}
+!23 = !DILocation(line: 18, column: 13, scope: !24)
+!24 = distinct !DISubprogram(name: "main", scope: !1, file: !1, line: 17, type: !4, scopeLine: 17, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0)
+!25 = !{i64 8632435727821051414}
+!26 = !DILocation(line: 19, column: 13, scope: !24)
+!27 = !{i64 -3421689549917153178}
+!28 = !DILocation(line: 20, column: 13, scope: !24)
+!29 = !{i64 6307901912192269588}
diff --git a/llvm/test/Transforms/CodeGenPrepare/AArch64/reduce-or-opt.ll b/llvm/test/Transforms/CodeGenPrepare/AArch64/reduce-or-opt.ll
new file mode 100644
index 0000000..52257c1
--- /dev/null
+++ b/llvm/test/Transforms/CodeGenPrepare/AArch64/reduce-or-opt.ll
@@ -0,0 +1,189 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -codegenprepare -S < %s -mtriple=aarch64-none-linux-gnu -mattr=+sve | FileCheck %s
+
+define i64 @select_or_reduce_v2i1(ptr nocapture noundef readonly %src) {
+; CHECK-LABEL: define i64 @select_or_reduce_v2i1(
+; CHECK-SAME: ptr nocapture noundef readonly [[SRC:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds ptr, ptr [[SRC]], i64 [[INDEX]]
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x ptr>, ptr [[ARRAYIDX]], align 8
+; CHECK-NEXT:    [[COND:%.*]] = icmp eq <2 x ptr> [[WIDE_LOAD]], zeroinitializer
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; CHECK-NEXT:    [[OR_REDUC:%.*]] = tail call i1 @llvm.vector.reduce.or.v2i1(<2 x i1> [[COND]])
+; CHECK-NEXT:    [[IV_CMP:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4
+; CHECK-NEXT:    [[EXIT_COND:%.*]] = or i1 [[OR_REDUC]], [[IV_CMP]]
+; CHECK-NEXT:    br i1 [[EXIT_COND]], label %[[MIDDLE_SPLIT:.*]], label %[[VECTOR_BODY]]
+; CHECK:       [[MIDDLE_SPLIT]]:
+; CHECK-NEXT:    [[SEL:%.*]] = select i1 [[OR_REDUC]], i64 1, i64 0
+; CHECK-NEXT:    ret i64 [[SEL]]
+;
+entry:
+  br label %vector.body
+
+vector.body:
+  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
+  %arrayidx = getelementptr inbounds ptr, ptr %src, i64 %index
+  %wide.load = load <2 x ptr>, ptr %arrayidx, align 8
+  %cond = icmp eq <2 x ptr> %wide.load, splat(ptr zeroinitializer)
+  %index.next = add nuw i64 %index, 2
+  %or.reduc = tail call i1 @llvm.vector.reduce.or.v2i1(<2 x i1> %cond)
+  %iv.cmp = icmp eq i64 %index.next, 4
+  %exit.cond = or i1 %or.reduc, %iv.cmp
+  br i1 %exit.cond, label %middle.split, label %vector.body
+
+middle.split:
+  %sel = select i1 %or.reduc, i64 1, i64 0
+  ret i64 %sel
+}
+
+define i64 @br_or_reduce_v2i1(ptr nocapture noundef readonly %src, ptr noundef readnone %p) {
+; CHECK-LABEL: define i64 @br_or_reduce_v2i1(
+; CHECK-SAME: ptr nocapture noundef readonly [[SRC:%.*]], ptr noundef readnone [[P:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds ptr, ptr [[SRC]], i64 [[INDEX]]
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x ptr>, ptr [[ARRAYIDX]], align 8
+; CHECK-NEXT:    [[COND:%.*]] = icmp eq <2 x ptr> [[WIDE_LOAD]], zeroinitializer
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; CHECK-NEXT:    [[OR_REDUC:%.*]] = tail call i1 @llvm.vector.reduce.or.v2i1(<2 x i1> [[COND]])
+; CHECK-NEXT:    [[IV_CMP:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4
+; CHECK-NEXT:    [[EXIT_COND:%.*]] = or i1 [[OR_REDUC]], [[IV_CMP]]
+; CHECK-NEXT:    br i1 [[EXIT_COND]], label %[[MIDDLE_SPLIT:.*]], label %[[VECTOR_BODY]]
+; CHECK:       [[MIDDLE_SPLIT]]:
+; CHECK-NEXT:    br i1 [[OR_REDUC]], label %[[FOUND:.*]], label %[[NOTFOUND:.*]]
+; CHECK:       [[FOUND]]:
+; CHECK-NEXT:    store i64 56, ptr [[P]], align 8
+; CHECK-NEXT:    ret i64 1
+; CHECK:       [[NOTFOUND]]:
+; CHECK-NEXT:    ret i64 0
+;
+entry:
+  br label %vector.body
+
+vector.body:
+  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
+  %arrayidx = getelementptr inbounds ptr, ptr %src, i64 %index
+  %wide.load = load <2 x ptr>, ptr %arrayidx, align 8
+  %cond = icmp eq <2 x ptr> %wide.load, splat(ptr zeroinitializer)
+  %index.next = add nuw i64 %index, 2
+  %or.reduc = tail call i1 @llvm.vector.reduce.or.v2i1(<2 x i1> %cond)
+  %iv.cmp = icmp eq i64 %index.next, 4
+  %exit.cond = or i1 %or.reduc, %iv.cmp
+  br i1 %exit.cond, label %middle.split, label %vector.body
+
+middle.split:
+  br i1 %or.reduc, label %found, label %notfound
+
+found:
+  store i64 56, ptr %p, align 8
+  ret i64 1
+
+notfound:
+  ret i64 0
+}
+
+define i64 @select_or_reduce_nxv2i1(ptr nocapture noundef readonly %src) {
+; CHECK-LABEL: define i64 @select_or_reduce_nxv2i1(
+; CHECK-SAME: ptr nocapture noundef readonly [[SRC:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds ptr, ptr [[SRC]], i64 [[INDEX]]
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 2 x ptr>, ptr [[ARRAYIDX]], align 8
+; CHECK-NEXT:    [[COND:%.*]] = icmp eq <vscale x 2 x ptr> [[WIDE_LOAD]], zeroinitializer
+; CHECK-NEXT:    [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 1
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP1]]
+; CHECK-NEXT:    [[OR_REDUC:%.*]] = tail call i1 @llvm.vector.reduce.or.nxv2i1(<vscale x 2 x i1> [[COND]])
+; CHECK-NEXT:    [[IV_CMP:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4
+; CHECK-NEXT:    [[EXIT_COND:%.*]] = or i1 [[OR_REDUC]], [[IV_CMP]]
+; CHECK-NEXT:    br i1 [[EXIT_COND]], label %[[MIDDLE_SPLIT:.*]], label %[[VECTOR_BODY]]
+; CHECK:       [[MIDDLE_SPLIT]]:
+; CHECK-NEXT:    [[TMP2:%.*]] = icmp eq <vscale x 2 x ptr> [[WIDE_LOAD]], zeroinitializer
+; CHECK-NEXT:    [[TMP3:%.*]] = tail call i1 @llvm.vector.reduce.or.nxv2i1(<vscale x 2 x i1> [[TMP2]])
+; CHECK-NEXT:    [[SEL:%.*]] = select i1 [[TMP3]], i64 1, i64 0
+; CHECK-NEXT:    ret i64 [[SEL]]
+;
+entry:
+  %vscale = tail call i64 @llvm.vscale.i64()
+  %vf = shl nuw nsw i64 %vscale, 1
+  br label %vector.body
+
+vector.body:
+  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
+  %arrayidx = getelementptr inbounds ptr, ptr %src, i64 %index
+  %wide.load = load <vscale x 2 x ptr>, ptr %arrayidx, align 8
+  %cond = icmp eq <vscale x 2 x ptr> %wide.load, splat(ptr zeroinitializer)
+  %index.next = add nuw i64 %index, %vf
+  %or.reduc = tail call i1 @llvm.vector.reduce.or.nxv2i1(<vscale x 2 x i1> %cond)
+  %iv.cmp = icmp eq i64 %index.next, 4
+  %exit.cond = or i1 %or.reduc, %iv.cmp
+  br i1 %exit.cond, label %middle.split, label %vector.body
+
+middle.split:
+  %sel = select i1 %or.reduc, i64 1, i64 0
+  ret i64 %sel
+}
+
+define i64 @br_or_reduce_nxv2i1(ptr nocapture noundef readonly %src, ptr noundef readnone %p) {
+; CHECK-LABEL: define i64 @br_or_reduce_nxv2i1(
+; CHECK-SAME: ptr nocapture noundef readonly [[SRC:%.*]], ptr noundef readnone [[P:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds ptr, ptr [[SRC]], i64 [[INDEX]]
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 2 x ptr>, ptr [[ARRAYIDX]], align 8
+; CHECK-NEXT:    [[COND:%.*]] = icmp eq <vscale x 2 x ptr> [[WIDE_LOAD]], zeroinitializer
+; CHECK-NEXT:    [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 1
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP1]]
+; CHECK-NEXT:    [[OR_REDUC:%.*]] = tail call i1 @llvm.vector.reduce.or.nxv2i1(<vscale x 2 x i1> [[COND]])
+; CHECK-NEXT:    [[IV_CMP:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4
+; CHECK-NEXT:    [[EXIT_COND:%.*]] = or i1 [[OR_REDUC]], [[IV_CMP]]
+; CHECK-NEXT:    br i1 [[EXIT_COND]], label %[[MIDDLE_SPLIT:.*]], label %[[VECTOR_BODY]]
+; CHECK:       [[MIDDLE_SPLIT]]:
+; CHECK-NEXT:    [[TMP2:%.*]] = icmp eq <vscale x 2 x ptr> [[WIDE_LOAD]], zeroinitializer
+; CHECK-NEXT:    [[TMP3:%.*]] = tail call i1 @llvm.vector.reduce.or.nxv2i1(<vscale x 2 x i1> [[TMP2]])
+; CHECK-NEXT:    br i1 [[TMP3]], label %[[FOUND:.*]], label %[[NOTFOUND:.*]]
+; CHECK:       [[FOUND]]:
+; CHECK-NEXT:    store i64 56, ptr [[P]], align 8
+; CHECK-NEXT:    ret i64 1
+; CHECK:       [[NOTFOUND]]:
+; CHECK-NEXT:    ret i64 0
+;
+entry:
+  %vscale = tail call i64 @llvm.vscale.i64()
+  %vf = shl nuw nsw i64 %vscale, 1
+  br label %vector.body
+
+vector.body:
+  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
+  %arrayidx = getelementptr inbounds ptr, ptr %src, i64 %index
+  %wide.load = load <vscale x 2 x ptr>, ptr %arrayidx, align 8
+  %cond = icmp eq <vscale x 2 x ptr> %wide.load, splat(ptr zeroinitializer)
+  %index.next = add nuw i64 %index, %vf
+  %or.reduc = tail call i1 @llvm.vector.reduce.or.nxv2i1(<vscale x 2 x i1> %cond)
+  %iv.cmp = icmp eq i64 %index.next, 4
+  %exit.cond = or i1 %or.reduc, %iv.cmp
+  br i1 %exit.cond, label %middle.split, label %vector.body
+
+middle.split:
+  br i1 %or.reduc, label %found, label %notfound
+
+found:
+  store i64 56, ptr %p, align 8
+  ret i64 1
+
+notfound:
+  ret i64 0
+}
+
+declare i1 @llvm.vector.reduce.or.v2i1(<2 x i1>)
+declare i1 @llvm.vector.reduce.or.nxv2i1(<vscale x 2 x i1>)
diff --git a/llvm/test/Transforms/DFAJumpThreading/dfa-unfold-select.ll b/llvm/test/Transforms/DFAJumpThreading/dfa-unfold-select.ll
index 366446a..93872c3 100644
--- a/llvm/test/Transforms/DFAJumpThreading/dfa-unfold-select.ll
+++ b/llvm/test/Transforms/DFAJumpThreading/dfa-unfold-select.ll
@@ -381,26 +381,58 @@ define void @pr65222(i32 %flags, i1 %cmp, i1 %tobool.not) {
 ; CHECK:       then:
 ; CHECK-NEXT:    br i1 [[TOBOOL_NOT:%.*]], label [[COND1_SI_UNFOLD_TRUE:%.*]], label [[COND_SI_UNFOLD_TRUE:%.*]]
 ; CHECK:       cond.si.unfold.true:
+; CHECK-NEXT:    br i1 [[CMP]], label [[TOUNFOLD_SI_UNFOLD_FALSE1:%.*]], label [[COND_SI_UNFOLD_FALSE_JT0:%.*]]
+; CHECK:       cond.si.unfold.true.jt2:
 ; CHECK-NEXT:    [[DOTSI_UNFOLD_PHI:%.*]] = phi i32 [ 2, [[THEN]] ]
 ; CHECK-NEXT:    br i1 [[CMP]], label [[TOUNFOLD_SI_UNFOLD_FALSE:%.*]], label [[COND_SI_UNFOLD_FALSE:%.*]]
 ; CHECK:       cond.si.unfold.false:
 ; CHECK-NEXT:    [[DOTSI_UNFOLD_PHI1:%.*]] = phi i32 [ 0, [[COND_SI_UNFOLD_TRUE]] ]
-; CHECK-NEXT:    br label [[TOUNFOLD_SI_UNFOLD_FALSE]]
+; CHECK-NEXT:    br label [[TOUNFOLD_SI_UNFOLD_FALSE1]]
+; CHECK:       cond.si.unfold.false.jt0:
+; CHECK-NEXT:    [[DOTSI_UNFOLD_PHI1_JT0:%.*]] = phi i32 [ 0, [[COND_SI_UNFOLD_TRUE1:%.*]] ]
+; CHECK-NEXT:    br label [[TOUNFOLD_SI_UNFOLD_FALSE_JT0:%.*]]
 ; CHECK:       tounfold.si.unfold.false:
-; CHECK-NEXT:    [[COND_SI_UNFOLD_PHI:%.*]] = phi i32 [ [[DOTSI_UNFOLD_PHI]], [[COND_SI_UNFOLD_TRUE]] ], [ [[DOTSI_UNFOLD_PHI1]], [[COND_SI_UNFOLD_FALSE]] ]
+; CHECK-NEXT:    [[COND_SI_UNFOLD_PHI:%.*]] = phi i32 [ poison, [[COND_SI_UNFOLD_TRUE1]] ], [ [[DOTSI_UNFOLD_PHI1]], [[COND_SI_UNFOLD_FALSE]] ]
 ; CHECK-NEXT:    br label [[IF_END]]
+; CHECK:       tounfold.si.unfold.false.jt0:
+; CHECK-NEXT:    [[COND_SI_UNFOLD_PHI_JT0:%.*]] = phi i32 [ [[DOTSI_UNFOLD_PHI1_JT0]], [[COND_SI_UNFOLD_FALSE_JT0]] ]
+; CHECK-NEXT:    br label [[IF_END_JT0:%.*]]
+; CHECK:       tounfold.si.unfold.false.jt2:
+; CHECK-NEXT:    [[COND_SI_UNFOLD_PHI_JT2:%.*]] = phi i32 [ [[DOTSI_UNFOLD_PHI]], [[COND_SI_UNFOLD_TRUE]] ]
+; CHECK-NEXT:    br label [[IF_END_JT2:%.*]]
 ; CHECK:       cond1.si.unfold.true:
+; CHECK-NEXT:    br i1 [[CMP]], label [[IF_END]], label [[COND1_SI_UNFOLD_FALSE_JT1:%.*]]
+; CHECK:       cond1.si.unfold.true.jt3:
 ; CHECK-NEXT:    [[DOTSI_UNFOLD_PHI2:%.*]] = phi i32 [ 3, [[THEN]] ]
-; CHECK-NEXT:    br i1 [[CMP]], label [[IF_END]], label [[COND1_SI_UNFOLD_FALSE:%.*]]
+; CHECK-NEXT:    br i1 [[CMP]], label [[IF_END_JT3:%.*]], label [[COND1_SI_UNFOLD_FALSE:%.*]]
 ; CHECK:       cond1.si.unfold.false:
 ; CHECK-NEXT:    [[DOTSI_UNFOLD_PHI3:%.*]] = phi i32 [ 1, [[COND1_SI_UNFOLD_TRUE]] ]
 ; CHECK-NEXT:    br label [[IF_END]]
+; CHECK:       cond1.si.unfold.false.jt1:
+; CHECK-NEXT:    [[DOTSI_UNFOLD_PHI3_JT1:%.*]] = phi i32 [ 1, [[COND1_SI_UNFOLD_TRUE1:%.*]] ]
+; CHECK-NEXT:    br label [[IF_END_JT1:%.*]]
 ; CHECK:       if.end:
-; CHECK-NEXT:    [[UNFOLDED:%.*]] = phi i32 [ [[FLAGS:%.*]], [[WHILE_COND]] ], [ [[COND_SI_UNFOLD_PHI]], [[TOUNFOLD_SI_UNFOLD_FALSE]] ], [ [[DOTSI_UNFOLD_PHI2]], [[COND1_SI_UNFOLD_TRUE]] ], [ [[DOTSI_UNFOLD_PHI3]], [[COND1_SI_UNFOLD_FALSE]] ]
-; CHECK-NEXT:    [[OTHER:%.*]] = phi i32 [ [[FLAGS]], [[WHILE_COND]] ], [ 0, [[TOUNFOLD_SI_UNFOLD_FALSE]] ], [ 0, [[COND1_SI_UNFOLD_TRUE]] ], [ 0, [[COND1_SI_UNFOLD_FALSE]] ]
+; CHECK-NEXT:    [[UNFOLDED:%.*]] = phi i32 [ [[FLAGS:%.*]], [[WHILE_COND]] ], [ [[COND_SI_UNFOLD_PHI]], [[TOUNFOLD_SI_UNFOLD_FALSE1]] ], [ poison, [[COND1_SI_UNFOLD_TRUE1]] ], [ [[DOTSI_UNFOLD_PHI3]], [[COND1_SI_UNFOLD_FALSE]] ]
+; CHECK-NEXT:    [[OTHER:%.*]] = phi i32 [ [[FLAGS]], [[WHILE_COND]] ], [ 0, [[TOUNFOLD_SI_UNFOLD_FALSE1]] ], [ 0, [[COND1_SI_UNFOLD_TRUE1]] ], [ 0, [[COND1_SI_UNFOLD_FALSE]] ]
 ; CHECK-NEXT:    switch i32 [[UNFOLDED]], label [[UNREACHABLE:%.*]] [
 ; CHECK-NEXT:      i32 0, label [[SW_BB:%.*]]
 ; CHECK-NEXT:    ]
+; CHECK:       if.end.jt1:
+; CHECK-NEXT:    [[UNFOLDED_JT1:%.*]] = phi i32 [ [[DOTSI_UNFOLD_PHI3_JT1]], [[COND1_SI_UNFOLD_FALSE_JT1]] ]
+; CHECK-NEXT:    [[OTHER_JT1:%.*]] = phi i32 [ 0, [[COND1_SI_UNFOLD_FALSE_JT1]] ]
+; CHECK-NEXT:    br label [[UNREACHABLE]]
+; CHECK:       if.end.jt3:
+; CHECK-NEXT:    [[UNFOLDED_JT3:%.*]] = phi i32 [ [[DOTSI_UNFOLD_PHI2]], [[COND1_SI_UNFOLD_TRUE]] ]
+; CHECK-NEXT:    [[OTHER_JT3:%.*]] = phi i32 [ 0, [[COND1_SI_UNFOLD_TRUE]] ]
+; CHECK-NEXT:    br label [[UNREACHABLE]]
+; CHECK:       if.end.jt0:
+; CHECK-NEXT:    [[UNFOLDED_JT0:%.*]] = phi i32 [ [[COND_SI_UNFOLD_PHI_JT0]], [[TOUNFOLD_SI_UNFOLD_FALSE_JT0]] ]
+; CHECK-NEXT:    [[OTHER_JT0:%.*]] = phi i32 [ 0, [[TOUNFOLD_SI_UNFOLD_FALSE_JT0]] ]
+; CHECK-NEXT:    br label [[SW_BB]]
+; CHECK:       if.end.jt2:
+; CHECK-NEXT:    [[UNFOLDED_JT2:%.*]] = phi i32 [ [[COND_SI_UNFOLD_PHI_JT2]], [[TOUNFOLD_SI_UNFOLD_FALSE]] ]
+; CHECK-NEXT:    [[OTHER_JT2:%.*]] = phi i32 [ 0, [[TOUNFOLD_SI_UNFOLD_FALSE]] ]
+; CHECK-NEXT:    br label [[UNREACHABLE]]
 ; CHECK:       unreachable:
 ; CHECK-NEXT:    unreachable
 ; CHECK:       sw.bb:
diff --git a/llvm/test/Transforms/DFAJumpThreading/negative.ll b/llvm/test/Transforms/DFAJumpThreading/negative.ll
index a964281..3eab1e1 100644
--- a/llvm/test/Transforms/DFAJumpThreading/negative.ll
+++ b/llvm/test/Transforms/DFAJumpThreading/negative.ll
@@ -218,9 +218,45 @@ for.end:
 declare i32 @arbitrary_function()
 
 ; Don't confuse %state.2 for the initial switch value.
+; [ 3, %case2 ] can still be threaded.
 define i32 @negative6(i32 %init) {
-; REMARK: SwitchNotPredictable
-; REMARK-NEXT: negative6
+; CHECK-LABEL: define i32 @negative6(
+; CHECK-SAME: i32 [[INIT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[INIT]], 0
+; CHECK-NEXT:    br label %[[LOOP_2:.*]]
+; CHECK:       [[LOOP_2]]:
+; CHECK-NEXT:    [[STATE_2:%.*]] = call i32 @arbitrary_function()
+; CHECK-NEXT:    br label %[[LOOP_3:.*]]
+; CHECK:       [[LOOP_3]]:
+; CHECK-NEXT:    [[STATE:%.*]] = phi i32 [ [[STATE_2]], %[[LOOP_2]] ]
+; CHECK-NEXT:    switch i32 [[STATE]], label %[[INFLOOP_I:.*]] [
+; CHECK-NEXT:      i32 2, label %[[CASE2:.*]]
+; CHECK-NEXT:      i32 3, label %[[CASE3:.*]]
+; CHECK-NEXT:      i32 4, label %[[CASE4:.*]]
+; CHECK-NEXT:      i32 0, label %[[CASE0:.*]]
+; CHECK-NEXT:      i32 1, label %[[CASE1:.*]]
+; CHECK-NEXT:    ]
+; CHECK:       [[LOOP_3_JT3:.*]]:
+; CHECK-NEXT:    [[STATE_JT3:%.*]] = phi i32 [ 3, %[[CASE2]] ]
+; CHECK-NEXT:    br label %[[CASE3]]
+; CHECK:       [[CASE2]]:
+; CHECK-NEXT:    br label %[[LOOP_3_JT3]]
+; CHECK:       [[CASE3]]:
+; CHECK-NEXT:    br i1 [[CMP]], label %[[LOOP_2_BACKEDGE:.*]], label %[[CASE4]]
+; CHECK:       [[CASE4]]:
+; CHECK-NEXT:    br label %[[LOOP_2_BACKEDGE]]
+; CHECK:       [[LOOP_2_BACKEDGE]]:
+; CHECK-NEXT:    br label %[[LOOP_2]]
+; CHECK:       [[CASE0]]:
+; CHECK-NEXT:    br label %[[EXIT:.*]]
+; CHECK:       [[CASE1]]:
+; CHECK-NEXT:    br label %[[EXIT]]
+; CHECK:       [[INFLOOP_I]]:
+; CHECK-NEXT:    br label %[[INFLOOP_I]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    ret i32 0
+;
 entry:
   %cmp = icmp eq i32 %init, 0
   br label %loop.2
diff --git a/llvm/test/Transforms/EntryExitInstrumenter/mcount-with-frompc.ll b/llvm/test/Transforms/EntryExitInstrumenter/mcount-with-frompc.ll
new file mode 100644
index 0000000..0f8cf5c7
--- /dev/null
+++ b/llvm/test/Transforms/EntryExitInstrumenter/mcount-with-frompc.ll
@@ -0,0 +1,25 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -mtriple=riscv64 -passes="ee-instrument<post-inline>" -S < %s | FileCheck %s --check-prefixes=CHECK,RISCV64
+; RUN: opt -mtriple=riscv32 -passes="ee-instrument<post-inline>" -S < %s | FileCheck %s --check-prefixes=CHECK,RISCV32
+; RUN: opt -mtriple=loongarch64 -passes="ee-instrument<post-inline>" -S < %s | FileCheck %s --check-prefixes=CHECK,LOONGARCH64
+; RUN: opt -mtriple=loongarch32 -passes="ee-instrument<post-inline>" -S < %s | FileCheck %s --check-prefixes=CHECK,LOONGARCH32
+; RUN: opt -mtriple=aarch64 -passes="ee-instrument<post-inline>" -S < %s | FileCheck %s --check-prefixes=CHECK,AARCH64
+; RUN: opt -mtriple=aarch64_be -passes="ee-instrument<post-inline>" -S < %s | FileCheck %s --check-prefixes=CHECK,AARCH64_BE
+; RUN: opt -mtriple=aarch64_32 -passes="ee-instrument<post-inline>" -S < %s | FileCheck %s --check-prefixes=CHECK,AARCH64_32
+
+define void @f1() "instrument-function-entry-inlined"="_mcount" {
+; CHECK-LABEL: define void @f1() {
+; CHECK-NEXT:    [[TMP1:%.*]] = call ptr @llvm.returnaddress(i32 0)
+; CHECK-NEXT:    call void @_mcount(ptr [[TMP1]])
+; CHECK-NEXT:    ret void
+;
+  ret void
+}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; AARCH64: {{.*}}
+; AARCH64_32: {{.*}}
+; AARCH64_BE: {{.*}}
+; LOONGARCH32: {{.*}}
+; LOONGARCH64: {{.*}}
+; RISCV32: {{.*}}
+; RISCV64: {{.*}}
diff --git a/llvm/test/Transforms/EntryExitInstrumenter/mcount.ll b/llvm/test/Transforms/EntryExitInstrumenter/mcount.ll
index bd5f4c2..56ccfb9 100644
--- a/llvm/test/Transforms/EntryExitInstrumenter/mcount.ll
+++ b/llvm/test/Transforms/EntryExitInstrumenter/mcount.ll
@@ -129,6 +129,13 @@ define void @naked() naked {
   ret void
 }
 
+define available_externally void @always_inline() {
+; CHECK-LABEL: define available_externally void @always_inline() {
+; CHECK-NEXT:    ret void
+;
+  ret void
+}
+
 ; The attributes are "consumed" when the instrumentation is inserted.
 ; CHECK: attributes
 ; CHECK-NOT: instrument-function
diff --git a/llvm/test/Transforms/ExpandMemCmp/AArch64/memcmp.ll b/llvm/test/Transforms/ExpandMemCmp/AArch64/memcmp.ll
index 9243969..179b5b0 100644
--- a/llvm/test/Transforms/ExpandMemCmp/AArch64/memcmp.ll
+++ b/llvm/test/Transforms/ExpandMemCmp/AArch64/memcmp.ll
@@ -45,11 +45,7 @@ define i32 @cmp3(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
 ; CHECK-NEXT:    [[TMP4:%.*]] = zext i24 [[TMP2]] to i32
 ; CHECK-NEXT:    [[TMP5:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
 ; CHECK-NEXT:    [[TMP6:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
-; CHECK-NEXT:    [[TMP7:%.*]] = icmp ugt i32 [[TMP5]], [[TMP6]]
-; CHECK-NEXT:    [[TMP8:%.*]] = icmp ult i32 [[TMP5]], [[TMP6]]
-; CHECK-NEXT:    [[TMP9:%.*]] = zext i1 [[TMP7]] to i32
-; CHECK-NEXT:    [[TMP10:%.*]] = zext i1 [[TMP8]] to i32
-; CHECK-NEXT:    [[TMP11:%.*]] = sub i32 [[TMP9]], [[TMP10]]
+; CHECK-NEXT:    [[TMP11:%.*]] = call i32 @llvm.ucmp.i32.i32(i32 [[TMP5]], i32 [[TMP6]])
 ; CHECK-NEXT:    ret i32 [[TMP11]]
 ;
   %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 3)
@@ -63,11 +59,7 @@ define i32 @cmp4(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
 ; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
 ; CHECK-NEXT:    [[TMP3:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP1]])
 ; CHECK-NEXT:    [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]])
-; CHECK-NEXT:    [[TMP5:%.*]] = icmp ugt i32 [[TMP3]], [[TMP4]]
-; CHECK-NEXT:    [[TMP6:%.*]] = icmp ult i32 [[TMP3]], [[TMP4]]
-; CHECK-NEXT:    [[TMP7:%.*]] = zext i1 [[TMP5]] to i32
-; CHECK-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP6]] to i32
-; CHECK-NEXT:    [[TMP9:%.*]] = sub i32 [[TMP7]], [[TMP8]]
+; CHECK-NEXT:    [[TMP9:%.*]] = call i32 @llvm.ucmp.i32.i32(i32 [[TMP3]], i32 [[TMP4]])
 ; CHECK-NEXT:    ret i32 [[TMP9]]
 ;
   %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 4)
@@ -83,11 +75,7 @@ define i32 @cmp5(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
 ; CHECK-NEXT:    [[TMP4:%.*]] = zext i40 [[TMP2]] to i64
 ; CHECK-NEXT:    [[TMP5:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
 ; CHECK-NEXT:    [[TMP6:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
-; CHECK-NEXT:    [[TMP7:%.*]] = icmp ugt i64 [[TMP5]], [[TMP6]]
-; CHECK-NEXT:    [[TMP8:%.*]] = icmp ult i64 [[TMP5]], [[TMP6]]
-; CHECK-NEXT:    [[TMP9:%.*]] = zext i1 [[TMP7]] to i32
-; CHECK-NEXT:    [[TMP10:%.*]] = zext i1 [[TMP8]] to i32
-; CHECK-NEXT:    [[TMP11:%.*]] = sub i32 [[TMP9]], [[TMP10]]
+; CHECK-NEXT:    [[TMP11:%.*]] = call i32 @llvm.ucmp.i32.i64(i64 [[TMP5]], i64 [[TMP6]])
 ; CHECK-NEXT:    ret i32 [[TMP11]]
 ;
   %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 5)
@@ -103,11 +91,7 @@ define i32 @cmp6(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
 ; CHECK-NEXT:    [[TMP4:%.*]] = zext i48 [[TMP2]] to i64
 ; CHECK-NEXT:    [[TMP5:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
 ; CHECK-NEXT:    [[TMP6:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
-; CHECK-NEXT:    [[TMP7:%.*]] = icmp ugt i64 [[TMP5]], [[TMP6]]
-; CHECK-NEXT:    [[TMP8:%.*]] = icmp ult i64 [[TMP5]], [[TMP6]]
-; CHECK-NEXT:    [[TMP9:%.*]] = zext i1 [[TMP7]] to i32
-; CHECK-NEXT:    [[TMP10:%.*]] = zext i1 [[TMP8]] to i32
-; CHECK-NEXT:    [[TMP11:%.*]] = sub i32 [[TMP9]], [[TMP10]]
+; CHECK-NEXT:    [[TMP11:%.*]] = call i32 @llvm.ucmp.i32.i64(i64 [[TMP5]], i64 [[TMP6]])
 ; CHECK-NEXT:    ret i32 [[TMP11]]
 ;
   %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 6)
@@ -155,11 +139,7 @@ define i32 @cmp8(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
 ; CHECK-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
 ; CHECK-NEXT:    [[TMP3:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP1]])
 ; CHECK-NEXT:    [[TMP4:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP2]])
-; CHECK-NEXT:    [[TMP5:%.*]] = icmp ugt i64 [[TMP3]], [[TMP4]]
-; CHECK-NEXT:    [[TMP6:%.*]] = icmp ult i64 [[TMP3]], [[TMP4]]
-; CHECK-NEXT:    [[TMP7:%.*]] = zext i1 [[TMP5]] to i32
-; CHECK-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP6]] to i32
-; CHECK-NEXT:    [[TMP9:%.*]] = sub i32 [[TMP7]], [[TMP8]]
+; CHECK-NEXT:    [[TMP9:%.*]] = call i32 @llvm.ucmp.i32.i64(i64 [[TMP3]], i64 [[TMP4]])
 ; CHECK-NEXT:    ret i32 [[TMP9]]
 ;
   %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 8)
diff --git a/llvm/test/Transforms/ExpandMemCmp/X86/memcmp-x32.ll b/llvm/test/Transforms/ExpandMemCmp/X86/memcmp-x32.ll
index d71ae8b..0507ec9 100644
--- a/llvm/test/Transforms/ExpandMemCmp/X86/memcmp-x32.ll
+++ b/llvm/test/Transforms/ExpandMemCmp/X86/memcmp-x32.ll
@@ -71,11 +71,7 @@ define i32 @cmp4(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
 ; X32-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y:%.*]], align 1
 ; X32-NEXT:    [[TMP5:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
 ; X32-NEXT:    [[TMP6:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
-; X32-NEXT:    [[TMP7:%.*]] = icmp ugt i32 [[TMP5]], [[TMP6]]
-; X32-NEXT:    [[TMP8:%.*]] = icmp ult i32 [[TMP5]], [[TMP6]]
-; X32-NEXT:    [[TMP9:%.*]] = zext i1 [[TMP7]] to i32
-; X32-NEXT:    [[TMP10:%.*]] = zext i1 [[TMP8]] to i32
-; X32-NEXT:    [[TMP11:%.*]] = sub i32 [[TMP9]], [[TMP10]]
+; X32-NEXT:    [[TMP11:%.*]] = call i32 @llvm.ucmp.i32.i32(i32 [[TMP5]], i32 [[TMP6]])
 ; X32-NEXT:    ret i32 [[TMP11]]
 ;
   %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 4)
diff --git a/llvm/test/Transforms/ExpandMemCmp/X86/memcmp.ll b/llvm/test/Transforms/ExpandMemCmp/X86/memcmp.ll
index f686e29..86dc3e5 100644
--- a/llvm/test/Transforms/ExpandMemCmp/X86/memcmp.ll
+++ b/llvm/test/Transforms/ExpandMemCmp/X86/memcmp.ll
@@ -73,11 +73,7 @@ define i32 @cmp4(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
 ; X64-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y:%.*]], align 1
 ; X64-NEXT:    [[TMP5:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
 ; X64-NEXT:    [[TMP6:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
-; X64-NEXT:    [[TMP7:%.*]] = icmp ugt i32 [[TMP5]], [[TMP6]]
-; X64-NEXT:    [[TMP8:%.*]] = icmp ult i32 [[TMP5]], [[TMP6]]
-; X64-NEXT:    [[TMP9:%.*]] = zext i1 [[TMP7]] to i32
-; X64-NEXT:    [[TMP10:%.*]] = zext i1 [[TMP8]] to i32
-; X64-NEXT:    [[TMP11:%.*]] = sub i32 [[TMP9]], [[TMP10]]
+; X64-NEXT:    [[TMP11:%.*]] = call i32 @llvm.ucmp.i32.i32(i32 [[TMP5]], i32 [[TMP6]])
 ; X64-NEXT:    ret i32 [[TMP11]]
 ;
   %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 4)
@@ -189,11 +185,7 @@ define i32 @cmp8(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
 ; X64-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y:%.*]], align 1
 ; X64-NEXT:    [[TMP5:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
 ; X64-NEXT:    [[TMP6:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
-; X64-NEXT:    [[TMP7:%.*]] = icmp ugt i64 [[TMP5]], [[TMP6]]
-; X64-NEXT:    [[TMP8:%.*]] = icmp ult i64 [[TMP5]], [[TMP6]]
-; X64-NEXT:    [[TMP9:%.*]] = zext i1 [[TMP7]] to i32
-; X64-NEXT:    [[TMP10:%.*]] = zext i1 [[TMP8]] to i32
-; X64-NEXT:    [[TMP11:%.*]] = sub i32 [[TMP9]], [[TMP10]]
+; X64-NEXT:    [[TMP11:%.*]] = call i32 @llvm.ucmp.i32.i64(i64 [[TMP5]], i64 [[TMP6]])
 ; X64-NEXT:    ret i32 [[TMP11]]
 ;
   %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 8)
diff --git a/llvm/test/Transforms/GCOVProfiling/exit-block.ll b/llvm/test/Transforms/GCOVProfiling/exit-block.ll
index 50c4dc4..1840f04 100644
--- a/llvm/test/Transforms/GCOVProfiling/exit-block.ll
+++ b/llvm/test/Transforms/GCOVProfiling/exit-block.ll
@@ -3,13 +3,9 @@
 ; RUN: echo '!19 = !{!"%/t/exit-block.ll", !0}' > %t/1
 ; RUN: cat %s %t/1 > %t/2
 
-; By default, the exit block is the second.
+; The exit block is the second.
 ; RUN: opt -passes=insert-gcov-profiling -disable-output %t/2
-; RUN: llvm-cov gcov -n -dump %t/exit-block.gcno 2>&1 | FileCheck --check-prefixes=CHECK,EXIT-SECOND %s
-
-; But we can optionally emit it last, to match GCC<4.8 (r189778).
-; RUN: opt -passes=insert-gcov-profiling -default-gcov-version='407*' -disable-output %t/2
-; RUN: llvm-cov gcov -n -dump %t/exit-block.gcno 2>&1 | FileCheck --check-prefixes=CHECK,EXIT-LAST %s
+; RUN: llvm-cov gcov -n -dump %t/exit-block.gcno 2>&1 | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
@@ -66,10 +62,7 @@ attributes #2 = { nounwind }
 
 ; There should be no destination edges for the exit block.
 ; CHECK: Block : 1 Counter : 0
-; EXIT-LAST:       Destination Edges
-; EXIT-SECOND-NOT: Destination Edges
 ; CHECK: Block : 2 Counter : 0
 ; CHECK: Block : 4 Counter : 0
-; EXIT-LAST-NOT: Destination Edges
-; EXIT-SECOND:   Destination Edges
+; CHECK:         Destination Edges
 ; CHECK-NOT: Block :
diff --git a/llvm/test/Transforms/GCOVProfiling/version.ll b/llvm/test/Transforms/GCOVProfiling/version.ll
index bfac255..4751bc1 100644
--- a/llvm/test/Transforms/GCOVProfiling/version.ll
+++ b/llvm/test/Transforms/GCOVProfiling/version.ll
@@ -5,16 +5,16 @@
 ; RUN: cat %t/little.txt %s %t/version.txt > %t/2
 
 ; RUN: opt -passes=insert-gcov-profiling -disable-output < %t/2
-; RUN: head -c8 %t/version.gcno | grep '^oncg.804'
+; RUN: head -c8 %t/version.gcno | grep '^oncg.11B'
 ; RUN: rm %t/version.gcno
 ; RUN: not opt -passes=insert-gcov-profiling -default-gcov-version=asdfasdf -disable-output < %t/2
-; RUN: opt -passes=insert-gcov-profiling -default-gcov-version='402*' -disable-output < %t/2
-; RUN: head -c8 %t/version.gcno | grep '^oncg.204'
+; RUN: opt -passes=insert-gcov-profiling -default-gcov-version='B21*' -disable-output < %t/2
+; RUN: head -c8 %t/version.gcno | grep '^oncg.12B'
 ; RUN: rm %t/version.gcno
 
 ; RUN: cat %t/big.txt %s %t/version.txt > %t/big.ll
 ; RUN: opt -passes=insert-gcov-profiling -disable-output < %t/big.ll
-; RUN: head -c8 %t/version.gcno | grep '^gcno408.'
+; RUN: head -c8 %t/version.gcno | grep '^gcnoB11.'
 
 define void @test() !dbg !5 {
   ret void, !dbg !8
diff --git a/llvm/test/Transforms/Inline/AArch64/sme-pstateza-attrs.ll b/llvm/test/Transforms/Inline/AArch64/sme-pstateza-attrs.ll
index 7ffbd64..4cd1491 100644
--- a/llvm/test/Transforms/Inline/AArch64/sme-pstateza-attrs.ll
+++ b/llvm/test/Transforms/Inline/AArch64/sme-pstateza-attrs.ll
@@ -391,9 +391,33 @@ define void @nonzt0_callee() {
   ret void
 }
 
+define void @new_zt0_callee() "aarch64_new_zt0" {
+; CHECK-LABEL: define void @new_zt0_callee
+; CHECK-SAME: () #[[ATTR4:[0-9]+]] {
+; CHECK-NEXT:    call void asm sideeffect "
+; CHECK-NEXT:    call void @inlined_body()
+; CHECK-NEXT:    ret void
+;
+  call void asm sideeffect "; inlineasm", ""()
+  call void @inlined_body()
+  ret void
+}
+
+define void @nonzt0_caller_new_zt0_callee_dont_inline() {
+; CHECK-LABEL: define void @nonzt0_caller_new_zt0_callee_dont_inline
+; CHECK-SAME: () #[[ATTR0]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    call void @new_zt0_callee()
+; CHECK-NEXT:    ret void
+;
+entry:
+  call void @new_zt0_callee()
+  ret void
+}
+
 define void @shared_zt0_caller_nonzt0_callee_dont_inline() "aarch64_inout_zt0" {
 ; CHECK-LABEL: define void @shared_zt0_caller_nonzt0_callee_dont_inline
-; CHECK-SAME: () #[[ATTR4:[0-9]+]] {
+; CHECK-SAME: () #[[ATTR5:[0-9]+]] {
 ; CHECK-NEXT:    call void @nonzt0_callee()
 ; CHECK-NEXT:    ret void
 ;
@@ -403,7 +427,7 @@ define void @shared_zt0_caller_nonzt0_callee_dont_inline() "aarch64_inout_zt0" {
 
 define void @shared_zt0_callee() "aarch64_inout_zt0" {
 ; CHECK-LABEL: define void @shared_zt0_callee
-; CHECK-SAME: () #[[ATTR4]] {
+; CHECK-SAME: () #[[ATTR5]] {
 ; CHECK-NEXT:    call void asm sideeffect "
 ; CHECK-NEXT:    call void @inlined_body()
 ; CHECK-NEXT:    ret void
@@ -415,7 +439,7 @@ define void @shared_zt0_callee() "aarch64_inout_zt0" {
 
 define void @shared_zt0_caller_shared_zt0_callee_inline() "aarch64_inout_zt0" {
 ; CHECK-LABEL: define void @shared_zt0_caller_shared_zt0_callee_inline
-; CHECK-SAME: () #[[ATTR4]] {
+; CHECK-SAME: () #[[ATTR5]] {
 ; CHECK-NEXT:    call void asm sideeffect "
 ; CHECK-NEXT:    call void @inlined_body()
 ; CHECK-NEXT:    ret void
diff --git a/llvm/test/Transforms/InstCombine/add-shl-sdiv-to-srem.ll b/llvm/test/Transforms/InstCombine/add-shl-sdiv-to-srem.ll
index 84462f9..d4edf12e 100644
--- a/llvm/test/Transforms/InstCombine/add-shl-sdiv-to-srem.ll
+++ b/llvm/test/Transforms/InstCombine/add-shl-sdiv-to-srem.ll
@@ -12,6 +12,17 @@ define i8 @add-shl-sdiv-scalar0(i8 %x) {
   ret i8 %rz
 }
 
+define i8 @add-shl-sdiv-scalar0_commuted(i8 %x) {
+; CHECK-LABEL: @add-shl-sdiv-scalar0_commuted(
+; CHECK-NEXT:    [[RZ:%.*]] = srem i8 [[X:%.*]], 4
+; CHECK-NEXT:    ret i8 [[RZ]]
+;
+  %sd = sdiv i8 %x, -4
+  %sl = shl i8 %sd, 2
+  %rz = add i8 %x, %sl
+  ret i8 %rz
+}
+
 define i8 @add-shl-sdiv-scalar1(i8 %x) {
 ; CHECK-LABEL: @add-shl-sdiv-scalar1(
 ; CHECK-NEXT:    [[RZ:%.*]] = srem i8 [[X:%.*]], 64
diff --git a/llvm/test/Transforms/InstCombine/and-or-icmps.ll b/llvm/test/Transforms/InstCombine/and-or-icmps.ll
index fffe1f8..9651858 100644
--- a/llvm/test/Transforms/InstCombine/and-or-icmps.ll
+++ b/llvm/test/Transforms/InstCombine/and-or-icmps.ll
@@ -1445,8 +1445,7 @@ define i1 @bitwise_and_logical_and_icmps_comm2(i8 %x, i8 %y, i8 %z) {
 ; CHECK-LABEL: @bitwise_and_logical_and_icmps_comm2(
 ; CHECK-NEXT:    [[C1:%.*]] = icmp eq i8 [[Y:%.*]], 42
 ; CHECK-NEXT:    [[Z_SHIFT:%.*]] = shl nuw i8 1, [[Z:%.*]]
-; CHECK-NEXT:    [[TMP1:%.*]] = freeze i8 [[Z_SHIFT]]
-; CHECK-NEXT:    [[TMP2:%.*]] = or i8 [[TMP1]], 1
+; CHECK-NEXT:    [[TMP2:%.*]] = or i8 [[Z_SHIFT]], 1
 ; CHECK-NEXT:    [[TMP3:%.*]] = and i8 [[X:%.*]], [[TMP2]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq i8 [[TMP3]], [[TMP2]]
 ; CHECK-NEXT:    [[AND2:%.*]] = select i1 [[TMP4]], i1 [[C1]], i1 false
@@ -1796,8 +1795,7 @@ define i1 @bitwise_or_logical_or_icmps_comm2(i8 %x, i8 %y, i8 %z) {
 ; CHECK-LABEL: @bitwise_or_logical_or_icmps_comm2(
 ; CHECK-NEXT:    [[C1:%.*]] = icmp eq i8 [[Y:%.*]], 42
 ; CHECK-NEXT:    [[Z_SHIFT:%.*]] = shl nuw i8 1, [[Z:%.*]]
-; CHECK-NEXT:    [[TMP1:%.*]] = freeze i8 [[Z_SHIFT]]
-; CHECK-NEXT:    [[TMP2:%.*]] = or i8 [[TMP1]], 1
+; CHECK-NEXT:    [[TMP2:%.*]] = or i8 [[Z_SHIFT]], 1
 ; CHECK-NEXT:    [[TMP3:%.*]] = and i8 [[X:%.*]], [[TMP2]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = icmp ne i8 [[TMP3]], [[TMP2]]
 ; CHECK-NEXT:    [[OR2:%.*]] = select i1 [[TMP4]], i1 true, i1 [[C1]]
@@ -2068,12 +2066,10 @@ define i1 @bitwise_and_logical_and_masked_icmp_allzeros_poison1(i1 %c, i32 %x, i
 
 define i1 @bitwise_and_logical_and_masked_icmp_allzeros_poison2(i1 %c, i32 %x, i32 %y) {
 ; CHECK-LABEL: @bitwise_and_logical_and_masked_icmp_allzeros_poison2(
-; CHECK-NEXT:    [[X_M1:%.*]] = and i32 [[X:%.*]], 8
-; CHECK-NEXT:    [[C1:%.*]] = icmp eq i32 [[X_M1]], 0
-; CHECK-NEXT:    [[AND1:%.*]] = select i1 [[C1]], i1 [[C:%.*]], i1 false
-; CHECK-NEXT:    [[X_M2:%.*]] = and i32 [[X]], [[Y:%.*]]
+; CHECK-NEXT:    [[Y:%.*]] = or i32 [[Y1:%.*]], 8
+; CHECK-NEXT:    [[X_M2:%.*]] = and i32 [[X:%.*]], [[Y]]
 ; CHECK-NEXT:    [[C2:%.*]] = icmp eq i32 [[X_M2]], 0
-; CHECK-NEXT:    [[AND2:%.*]] = and i1 [[AND1]], [[C2]]
+; CHECK-NEXT:    [[AND2:%.*]] = select i1 [[C2]], i1 [[C:%.*]], i1 false
 ; CHECK-NEXT:    ret i1 [[AND2]]
 ;
   %x.m1 = and i32 %x, 8
@@ -2120,12 +2116,10 @@ define i1 @bitwise_and_logical_and_masked_icmp_allones_poison1(i1 %c, i32 %x, i3
 
 define i1 @bitwise_and_logical_and_masked_icmp_allones_poison2(i1 %c, i32 %x, i32 %y) {
 ; CHECK-LABEL: @bitwise_and_logical_and_masked_icmp_allones_poison2(
-; CHECK-NEXT:    [[X_M1:%.*]] = and i32 [[X:%.*]], 8
-; CHECK-NEXT:    [[C1:%.*]] = icmp ne i32 [[X_M1]], 0
-; CHECK-NEXT:    [[AND1:%.*]] = select i1 [[C1]], i1 [[C:%.*]], i1 false
-; CHECK-NEXT:    [[X_M2:%.*]] = and i32 [[X]], [[Y:%.*]]
+; CHECK-NEXT:    [[Y:%.*]] = or i32 [[Y1:%.*]], 8
+; CHECK-NEXT:    [[X_M2:%.*]] = and i32 [[X:%.*]], [[Y]]
 ; CHECK-NEXT:    [[C2:%.*]] = icmp eq i32 [[X_M2]], [[Y]]
-; CHECK-NEXT:    [[AND2:%.*]] = and i1 [[AND1]], [[C2]]
+; CHECK-NEXT:    [[AND2:%.*]] = select i1 [[C2]], i1 [[C:%.*]], i1 false
 ; CHECK-NEXT:    ret i1 [[AND2]]
 ;
   %x.m1 = and i32 %x, 8
diff --git a/llvm/test/Transforms/InstCombine/and-xor-or.ll b/llvm/test/Transforms/InstCombine/and-xor-or.ll
index 5a0890e..5a58995 100644
--- a/llvm/test/Transforms/InstCombine/and-xor-or.ll
+++ b/llvm/test/Transforms/InstCombine/and-xor-or.ll
@@ -388,10 +388,9 @@ define i8 @xor_shl(i8 %x, i8 %y, i8 %zarg, i8 %shamt) {
 ; CHECK-LABEL: define {{[^@]+}}@xor_shl
 ; CHECK-SAME: (i8 [[X:%.*]], i8 [[Y:%.*]], i8 [[ZARG:%.*]], i8 [[SHAMT:%.*]]) {
 ; CHECK-NEXT:    [[Z:%.*]] = sdiv i8 42, [[ZARG]]
-; CHECK-NEXT:    [[SX:%.*]] = shl i8 [[X]], [[SHAMT]]
-; CHECK-NEXT:    [[SY:%.*]] = shl i8 [[Y]], [[SHAMT]]
-; CHECK-NEXT:    [[A:%.*]] = xor i8 [[Z]], [[SX]]
-; CHECK-NEXT:    [[R:%.*]] = xor i8 [[A]], [[SY]]
+; CHECK-NEXT:    [[TMP1:%.*]] = xor i8 [[X]], [[Y]]
+; CHECK-NEXT:    [[TMP2:%.*]] = shl i8 [[TMP1]], [[SHAMT]]
+; CHECK-NEXT:    [[R:%.*]] = xor i8 [[TMP2]], [[Z]]
 ; CHECK-NEXT:    ret i8 [[R]]
 ;
   %z = sdiv i8 42, %zarg ; thwart complexity-based canonicalization
@@ -406,10 +405,9 @@ define i8 @and_lshr(i8 %x, i8 %y, i8 %zarg, i8 %shamt) {
 ; CHECK-LABEL: define {{[^@]+}}@and_lshr
 ; CHECK-SAME: (i8 [[X:%.*]], i8 [[Y:%.*]], i8 [[ZARG:%.*]], i8 [[SHAMT:%.*]]) {
 ; CHECK-NEXT:    [[Z:%.*]] = sdiv i8 42, [[ZARG]]
-; CHECK-NEXT:    [[SX:%.*]] = lshr i8 [[X]], [[SHAMT]]
-; CHECK-NEXT:    [[SY:%.*]] = lshr i8 [[Y]], [[SHAMT]]
-; CHECK-NEXT:    [[A:%.*]] = and i8 [[Z]], [[SX]]
-; CHECK-NEXT:    [[R:%.*]] = and i8 [[SY]], [[A]]
+; CHECK-NEXT:    [[TMP1:%.*]] = and i8 [[X]], [[Y]]
+; CHECK-NEXT:    [[TMP2:%.*]] = lshr i8 [[TMP1]], [[SHAMT]]
+; CHECK-NEXT:    [[R:%.*]] = and i8 [[TMP2]], [[Z]]
 ; CHECK-NEXT:    ret i8 [[R]]
 ;
   %z = sdiv i8 42, %zarg ; thwart complexity-based canonicalization
@@ -435,6 +433,51 @@ define i8 @or_lshr(i8 %x, i8 %y, i8 %z, i8 %shamt) {
   ret i8 %r
 }
 
+define i8 @or_lshr_commuted1(i8 %x, i8 %y, i8 %z, i8 %shamt) {
+; CHECK-LABEL: define {{[^@]+}}@or_lshr_commuted1
+; CHECK-SAME: (i8 [[X:%.*]], i8 [[Y:%.*]], i8 [[Z:%.*]], i8 [[SHAMT:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = or i8 [[X]], [[Y]]
+; CHECK-NEXT:    [[TMP2:%.*]] = lshr i8 [[TMP1]], [[SHAMT]]
+; CHECK-NEXT:    [[R:%.*]] = or i8 [[TMP2]], [[Z]]
+; CHECK-NEXT:    ret i8 [[R]]
+;
+  %sx = lshr i8 %x, %shamt
+  %sy = lshr i8 %y, %shamt
+  %a = or i8 %z, %sx
+  %r = or i8 %sy, %a
+  ret i8 %r
+}
+
+define i8 @or_lshr_commuted2(i8 %x, i8 %y, i8 %z, i8 %shamt) {
+; CHECK-LABEL: define {{[^@]+}}@or_lshr_commuted2
+; CHECK-SAME: (i8 [[X:%.*]], i8 [[Y:%.*]], i8 [[Z:%.*]], i8 [[SHAMT:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = or i8 [[X]], [[Y]]
+; CHECK-NEXT:    [[TMP2:%.*]] = lshr i8 [[TMP1]], [[SHAMT]]
+; CHECK-NEXT:    [[R:%.*]] = or i8 [[TMP2]], [[Z]]
+; CHECK-NEXT:    ret i8 [[R]]
+;
+  %sx = lshr i8 %x, %shamt
+  %sy = lshr i8 %y, %shamt
+  %a = or i8 %z, %sx
+  %r = or i8 %a, %sy
+  ret i8 %r
+}
+
+define i8 @or_lshr_commuted3(i8 %x, i8 %y, i8 %z, i8 %shamt) {
+; CHECK-LABEL: define {{[^@]+}}@or_lshr_commuted3
+; CHECK-SAME: (i8 [[X:%.*]], i8 [[Y:%.*]], i8 [[Z:%.*]], i8 [[SHAMT:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = or i8 [[X]], [[Y]]
+; CHECK-NEXT:    [[TMP2:%.*]] = lshr i8 [[TMP1]], [[SHAMT]]
+; CHECK-NEXT:    [[R:%.*]] = or i8 [[TMP2]], [[Z]]
+; CHECK-NEXT:    ret i8 [[R]]
+;
+  %sx = lshr i8 %x, %shamt
+  %sy = lshr i8 %y, %shamt
+  %a = or i8 %sx, %z
+  %r = or i8 %a, %sy
+  ret i8 %r
+}
+
 define i8 @xor_lshr(i8 %x, i8 %y, i8 %z, i8 %shamt) {
 ; CHECK-LABEL: define {{[^@]+}}@xor_lshr
 ; CHECK-SAME: (i8 [[X:%.*]], i8 [[Y:%.*]], i8 [[Z:%.*]], i8 [[SHAMT:%.*]]) {
diff --git a/llvm/test/Transforms/InstCombine/bit-checks.ll b/llvm/test/Transforms/InstCombine/bit-checks.ll
index 43cd6dd..936c02c 100644
--- a/llvm/test/Transforms/InstCombine/bit-checks.ll
+++ b/llvm/test/Transforms/InstCombine/bit-checks.ll
@@ -1335,6 +1335,22 @@ define i1 @no_masks_with_logical_or(i32 %a, i32 %b, i32 noundef %c) {
   ret i1 %or2
 }
 
+define i1 @no_masks_with_logical_or_commuted(i32 %a, i32 %b, i32 noundef %c) {
+; CHECK-LABEL: @no_masks_with_logical_or_commuted(
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp ne i32 [[B:%.*]], 63
+; CHECK-NEXT:    [[C:%.*]] = or i32 [[C1:%.*]], [[A:%.*]]
+; CHECK-NEXT:    [[CMP3:%.*]] = icmp ne i32 [[C]], 0
+; CHECK-NEXT:    [[OR2:%.*]] = select i1 [[CMP3]], i1 true, i1 [[CMP2]]
+; CHECK-NEXT:    ret i1 [[OR2]]
+;
+  %cmp1 = icmp ne i32 %a, 0
+  %cmp2 = icmp ne i32 %b, 63
+  %or1 = select i1 %cmp1, i1 true, i1 %cmp2
+  %cmp3 = icmp ne i32 %c, 0
+  %or2 = or i1 %cmp3, %or1
+  ret i1 %or2
+}
+
 define i1 @no_masks_with_logical_or2(i32 %a, i32 %b, i32 noundef %c) {
 ; CHECK-LABEL: @no_masks_with_logical_or2(
 ; CHECK-NEXT:    [[CMP2:%.*]] = icmp ne i32 [[B:%.*]], 63
diff --git a/llvm/test/Transforms/InstCombine/compare-signs.ll b/llvm/test/Transforms/InstCombine/compare-signs.ll
index 9703b47..59ec9ad 100644
--- a/llvm/test/Transforms/InstCombine/compare-signs.ll
+++ b/llvm/test/Transforms/InstCombine/compare-signs.ll
@@ -152,6 +152,19 @@ define i1 @test4a(i32 %a) {
   ret i1 %c
 }
 
+define i1 @test4a_commuted(i32 %a) {
+; CHECK-LABEL: @test4a_commuted(
+; CHECK-NEXT:    [[C:%.*]] = icmp slt i32 [[SIGNUM:%.*]], 1
+; CHECK-NEXT:    ret i1 [[C]]
+;
+  %l = ashr i32 %a, 31
+  %na = sub i32 0, %a
+  %r = lshr i32 %na, 31
+  %signum = or i32 %r, %l
+  %c = icmp slt i32 %signum, 1
+  ret i1 %c
+}
+
 define <2 x i1> @test4a_vec(<2 x i32> %a) {
 ; CHECK-LABEL: @test4a_vec(
 ; CHECK-NEXT:    [[C:%.*]] = icmp slt <2 x i32> [[A:%.*]], splat (i32 1)
diff --git a/llvm/test/Transforms/InstCombine/copysign.ll b/llvm/test/Transforms/InstCombine/copysign.ll
index abc707a..ee093a7 100644
--- a/llvm/test/Transforms/InstCombine/copysign.ll
+++ b/llvm/test/Transforms/InstCombine/copysign.ll
@@ -82,7 +82,7 @@ define float @not_known_positive_sign_arg(float %x, float %y) {
 
 define float @copysign_sign_arg(float %x, float %y, float %z) {
 ; CHECK-LABEL: @copysign_sign_arg(
-; CHECK-NEXT:    [[R:%.*]] = call ninf float @llvm.copysign.f32(float [[X:%.*]], float [[Z:%.*]])
+; CHECK-NEXT:    [[R:%.*]] = call float @llvm.copysign.f32(float [[X:%.*]], float [[Z:%.*]])
 ; CHECK-NEXT:    ret float [[R]]
 ;
   %s = call reassoc float @llvm.copysign.f32(float %y, float %z)
@@ -90,6 +90,26 @@ define float @copysign_sign_arg(float %x, float %y, float %z) {
   ret float %r
 }
 
+define float @copysign_sign_arg_nnan(float %x, float %y, float %z) {
+; CHECK-LABEL: @copysign_sign_arg_nnan(
+; CHECK-NEXT:    [[R:%.*]] = call nnan float @llvm.copysign.f32(float [[X:%.*]], float [[Z:%.*]])
+; CHECK-NEXT:    ret float [[R]]
+;
+  %s = call nnan float @llvm.copysign.f32(float %y, float %z)
+  %r = call nnan float @llvm.copysign.f32(float %x, float %s)
+  ret float %r
+}
+
+define float @copysign_sign_arg_mixed(float %x, float %y, float %z) {
+; CHECK-LABEL: @copysign_sign_arg_mixed(
+; CHECK-NEXT:    [[R:%.*]] = call nsz float @llvm.copysign.f32(float [[X:%.*]], float [[Z:%.*]])
+; CHECK-NEXT:    ret float [[R]]
+;
+  %s = call ninf nsz float @llvm.copysign.f32(float %y, float %z)
+  %r = call nnan nsz float @llvm.copysign.f32(float %x, float %s)
+  ret float %r
+}
+
 define float @fneg_mag(float %x, float %y) {
 ; CHECK-LABEL: @fneg_mag(
 ; CHECK-NEXT:    [[R:%.*]] = call float @llvm.copysign.f32(float [[X:%.*]], float [[Y:%.*]])
diff --git a/llvm/test/Transforms/InstCombine/fcmp-fadd-select.ll b/llvm/test/Transforms/InstCombine/fcmp-fadd-select.ll
index 0d0af91..15fad55 100644
--- a/llvm/test/Transforms/InstCombine/fcmp-fadd-select.ll
+++ b/llvm/test/Transforms/InstCombine/fcmp-fadd-select.ll
@@ -19,7 +19,7 @@ define float @test_fcmp_ogt_fadd_select_constant(float %in) {
 define float @test_fcmp_ogt_fadd_select_constant_swapped(float %in) {
 ; CHECK-LABEL: define float @test_fcmp_ogt_fadd_select_constant_swapped(
 ; CHECK-SAME: float [[IN:%.*]]) {
-; CHECK-NEXT:    [[SEL_NEW:%.*]] = call nsz float @llvm.maxnum.f32(float [[IN]], float 0.000000e+00)
+; CHECK-NEXT:    [[SEL_NEW:%.*]] = call nsz float @llvm.minnum.f32(float [[IN]], float 0.000000e+00)
 ; CHECK-NEXT:    [[ADD_NEW:%.*]] = fadd nnan nsz float [[SEL_NEW]], 1.000000e+00
 ; CHECK-NEXT:    ret float [[ADD_NEW]]
 ;
@@ -87,7 +87,7 @@ define float @test_fcmp_olt_fadd_select_constant(float %in) {
 define float @test_fcmp_olt_fadd_select_constant_swapped(float %in) {
 ; CHECK-LABEL: define float @test_fcmp_olt_fadd_select_constant_swapped(
 ; CHECK-SAME: float [[IN:%.*]]) {
-; CHECK-NEXT:    [[SEL_NEW:%.*]] = call nsz float @llvm.minnum.f32(float [[IN]], float 0.000000e+00)
+; CHECK-NEXT:    [[SEL_NEW:%.*]] = call nsz float @llvm.maxnum.f32(float [[IN]], float 0.000000e+00)
 ; CHECK-NEXT:    [[ADD_NEW:%.*]] = fadd nnan nsz float [[SEL_NEW]], 1.000000e+00
 ; CHECK-NEXT:    ret float [[ADD_NEW]]
 ;
@@ -155,7 +155,7 @@ define float @test_fcmp_oge_fadd_select_constant(float %in) {
 define float @test_fcmp_oge_fadd_select_constant_swapped(float %in) {
 ; CHECK-LABEL: define float @test_fcmp_oge_fadd_select_constant_swapped(
 ; CHECK-SAME: float [[IN:%.*]]) {
-; CHECK-NEXT:    [[SEL_NEW:%.*]] = call nsz float @llvm.maxnum.f32(float [[IN]], float 0.000000e+00)
+; CHECK-NEXT:    [[SEL_NEW:%.*]] = call nsz float @llvm.minnum.f32(float [[IN]], float 0.000000e+00)
 ; CHECK-NEXT:    [[ADD_NEW:%.*]] = fadd nnan nsz float [[SEL_NEW]], 1.000000e+00
 ; CHECK-NEXT:    ret float [[ADD_NEW]]
 ;
@@ -223,7 +223,7 @@ define float @test_fcmp_ole_fadd_select_constant(float %in) {
 define float @test_fcmp_ole_fadd_select_constant_swapped(float %in) {
 ; CHECK-LABEL: define float @test_fcmp_ole_fadd_select_constant_swapped(
 ; CHECK-SAME: float [[IN:%.*]]) {
-; CHECK-NEXT:    [[SEL_NEW:%.*]] = call nsz float @llvm.minnum.f32(float [[IN]], float 0.000000e+00)
+; CHECK-NEXT:    [[SEL_NEW:%.*]] = call nsz float @llvm.maxnum.f32(float [[IN]], float 0.000000e+00)
 ; CHECK-NEXT:    [[ADD_NEW:%.*]] = fadd nnan nsz float [[SEL_NEW]], 1.000000e+00
 ; CHECK-NEXT:    ret float [[ADD_NEW]]
 ;
@@ -293,7 +293,7 @@ define float @test_fcmp_ugt_fadd_select_constant_swapped(float %in) {
 ; CHECK-LABEL: define float @test_fcmp_ugt_fadd_select_constant_swapped(
 ; CHECK-SAME: float [[IN:%.*]]) {
 ; CHECK-NEXT:    [[CMP1_INV:%.*]] = fcmp ole float [[IN]], 0.000000e+00
-; CHECK-NEXT:    [[SEL_NEW:%.*]] = select i1 [[CMP1_INV]], float 0.000000e+00, float [[IN]]
+; CHECK-NEXT:    [[SEL_NEW:%.*]] = select i1 [[CMP1_INV]], float [[IN]], float 0.000000e+00
 ; CHECK-NEXT:    [[ADD_NEW:%.*]] = fadd nnan nsz float [[SEL_NEW]], 1.000000e+00
 ; CHECK-NEXT:    ret float [[ADD_NEW]]
 ;
@@ -366,7 +366,7 @@ define float @test_fcmp_uge_fadd_select_constant_swapped(float %in) {
 ; CHECK-LABEL: define float @test_fcmp_uge_fadd_select_constant_swapped(
 ; CHECK-SAME: float [[IN:%.*]]) {
 ; CHECK-NEXT:    [[CMP1_INV:%.*]] = fcmp olt float [[IN]], 0.000000e+00
-; CHECK-NEXT:    [[SEL_NEW:%.*]] = select i1 [[CMP1_INV]], float 0.000000e+00, float [[IN]]
+; CHECK-NEXT:    [[SEL_NEW:%.*]] = select i1 [[CMP1_INV]], float [[IN]], float 0.000000e+00
 ; CHECK-NEXT:    [[ADD_NEW:%.*]] = fadd nnan nsz float [[SEL_NEW]], 1.000000e+00
 ; CHECK-NEXT:    ret float [[ADD_NEW]]
 ;
@@ -439,7 +439,7 @@ define float @test_fcmp_ult_fadd_select_constant_swapped(float %in) {
 ; CHECK-LABEL: define float @test_fcmp_ult_fadd_select_constant_swapped(
 ; CHECK-SAME: float [[IN:%.*]]) {
 ; CHECK-NEXT:    [[CMP1_INV:%.*]] = fcmp oge float [[IN]], 0.000000e+00
-; CHECK-NEXT:    [[SEL_NEW:%.*]] = select i1 [[CMP1_INV]], float 0.000000e+00, float [[IN]]
+; CHECK-NEXT:    [[SEL_NEW:%.*]] = select i1 [[CMP1_INV]], float [[IN]], float 0.000000e+00
 ; CHECK-NEXT:    [[ADD_NEW:%.*]] = fadd nnan nsz float [[SEL_NEW]], 1.000000e+00
 ; CHECK-NEXT:    ret float [[ADD_NEW]]
 ;
@@ -512,7 +512,7 @@ define float @test_fcmp_ule_fadd_select_constant_swapped(float %in) {
 ; CHECK-LABEL: define float @test_fcmp_ule_fadd_select_constant_swapped(
 ; CHECK-SAME: float [[IN:%.*]]) {
 ; CHECK-NEXT:    [[CMP1_INV:%.*]] = fcmp ogt float [[IN]], 0.000000e+00
-; CHECK-NEXT:    [[SEL_NEW:%.*]] = select i1 [[CMP1_INV]], float 0.000000e+00, float [[IN]]
+; CHECK-NEXT:    [[SEL_NEW:%.*]] = select i1 [[CMP1_INV]], float [[IN]], float 0.000000e+00
 ; CHECK-NEXT:    [[ADD_NEW:%.*]] = fadd nnan nsz float [[SEL_NEW]], 1.000000e+00
 ; CHECK-NEXT:    ret float [[ADD_NEW]]
 ;
diff --git a/llvm/test/Transforms/InstCombine/icmp-add.ll b/llvm/test/Transforms/InstCombine/icmp-add.ll
index 579247a..a8cdf80 100644
--- a/llvm/test/Transforms/InstCombine/icmp-add.ll
+++ b/llvm/test/Transforms/InstCombine/icmp-add.ll
@@ -79,6 +79,19 @@ bb:
   ret i1 %i4
 }
 
+define i1 @cvt_icmp_0_zext_plus_zext_eq_i2(i1 %a, i1 %b) {
+; CHECK-LABEL: @cvt_icmp_0_zext_plus_zext_eq_i2(
+; CHECK-NEXT:    [[TMP1:%.*]] = or i1 [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT:    [[CMP:%.*]] = xor i1 [[TMP1]], true
+; CHECK-NEXT:    ret i1 [[CMP]]
+;
+  %a.ext = zext i1 %a to i2
+  %b.ext = zext i1 %b to i2
+  %add = add i2 %a.ext, %b.ext
+  %cmp = icmp eq i2 %add, 0
+  ret i1 %cmp
+}
+
 define i1 @cvt_icmp_1_zext_plus_zext_eq(i1 %arg, i1 %arg1) {
 ; CHECK-LABEL: @cvt_icmp_1_zext_plus_zext_eq(
 ; CHECK-NEXT:  bb:
diff --git a/llvm/test/Transforms/InstCombine/icmp-gep.ll b/llvm/test/Transforms/InstCombine/icmp-gep.ll
index f9b90c2..7f8f1ae 100644
--- a/llvm/test/Transforms/InstCombine/icmp-gep.ll
+++ b/llvm/test/Transforms/InstCombine/icmp-gep.ll
@@ -709,3 +709,51 @@ define i1 @pointer_icmp_aligned_with_offset_negative(ptr align 8 %a, ptr align 8
   %cmp = icmp eq ptr %gep, %a2
   ret i1 %cmp
 }
+
+define i1 @gep_diff_base_same_indices(ptr %x, ptr %y, i64 %z) {
+; CHECK-LABEL: @gep_diff_base_same_indices(
+; CHECK-NEXT:    [[X:%.*]] = getelementptr i8, ptr [[X1:%.*]], i64 [[Z:%.*]]
+; CHECK-NEXT:    [[Y:%.*]] = getelementptr i8, ptr [[Y1:%.*]], i64 [[Z]]
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ult ptr [[X]], [[Y]]
+; CHECK-NEXT:    ret i1 [[CMP]]
+;
+  %gep1 = getelementptr i8, ptr %x, i64 %z
+  %gep2 = getelementptr i8, ptr %y, i64 %z
+  %cmp = icmp ult ptr %gep1, %gep2
+  ret i1 %cmp
+}
+
+define i1 @gep_diff_base_same_indices_nuw(ptr %x, ptr %y, i64 %z) {
+; CHECK-LABEL: @gep_diff_base_same_indices_nuw(
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ult ptr [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT:    ret i1 [[CMP]]
+;
+  %gep1 = getelementptr nuw i8, ptr %x, i64 %z
+  %gep2 = getelementptr nuw i8, ptr %y, i64 %z
+  %cmp = icmp ult ptr %gep1, %gep2
+  ret i1 %cmp
+}
+
+define i1 @gep_diff_base_same_indices_nusw(ptr %x, ptr %y, i64 %z) {
+; CHECK-LABEL: @gep_diff_base_same_indices_nusw(
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ult ptr [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT:    ret i1 [[CMP]]
+;
+  %gep1 = getelementptr nusw i8, ptr %x, i64 %z
+  %gep2 = getelementptr nusw i8, ptr %y, i64 %z
+  %cmp = icmp ult ptr %gep1, %gep2
+  ret i1 %cmp
+}
+
+define i1 @gep_diff_base_same_indices_nuw_nusw(ptr %x, ptr %y, i64 %z) {
+; CHECK-LABEL: @gep_diff_base_same_indices_nuw_nusw(
+; CHECK-NEXT:    [[X:%.*]] = getelementptr nuw i8, ptr [[X1:%.*]], i64 [[Z:%.*]]
+; CHECK-NEXT:    [[Y:%.*]] = getelementptr nusw i8, ptr [[Y1:%.*]], i64 [[Z]]
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ult ptr [[X]], [[Y]]
+; CHECK-NEXT:    ret i1 [[CMP]]
+;
+  %gep1 = getelementptr nuw i8, ptr %x, i64 %z
+  %gep2 = getelementptr nusw i8, ptr %y, i64 %z
+  %cmp = icmp ult ptr %gep1, %gep2
+  ret i1 %cmp
+}
diff --git a/llvm/test/Transforms/InstCombine/onehot_merge.ll b/llvm/test/Transforms/InstCombine/onehot_merge.ll
index 2e57597..3b7314d 100644
--- a/llvm/test/Transforms/InstCombine/onehot_merge.ll
+++ b/llvm/test/Transforms/InstCombine/onehot_merge.ll
@@ -1143,3 +1143,19 @@ define i1 @foo1_and_signbit_lshr_without_shifting_signbit_not_pwr2_logical(i32 %
   %or = select i1 %t2, i1 true, i1 %t4
   ret i1 %or
 }
+
+define i1 @two_types_of_bittest(i8 %x, i8 %c) {
+; CHECK-LABEL: @two_types_of_bittest(
+; CHECK-NEXT:    [[T0:%.*]] = shl nuw i8 1, [[C:%.*]]
+; CHECK-NEXT:    [[TMP1:%.*]] = or i8 [[T0]], -128
+; CHECK-NEXT:    [[TMP2:%.*]] = and i8 [[X:%.*]], [[TMP1]]
+; CHECK-NEXT:    [[RET:%.*]] = icmp eq i8 [[TMP2]], [[TMP1]]
+; CHECK-NEXT:    ret i1 [[RET]]
+;
+  %t0 = shl i8 1, %c
+  %icmp1 = icmp slt i8 %x, 0
+  %and = and i8 %x, %t0
+  %icmp2 = icmp ne i8 %and, 0
+  %ret = and i1 %icmp1, %icmp2
+  ret i1 %ret
+}
diff --git a/llvm/test/Transforms/InstCombine/opaque-ptr.ll b/llvm/test/Transforms/InstCombine/opaque-ptr.ll
index bac51c8..b052746 100644
--- a/llvm/test/Transforms/InstCombine/opaque-ptr.ll
+++ b/llvm/test/Transforms/InstCombine/opaque-ptr.ll
@@ -654,6 +654,64 @@ join:
   ret ptr %gep
 }
 
+define ptr @gep_of_phi_of_gep_flags1(i1 %c, ptr %p) {
+; CHECK-LABEL: @gep_of_phi_of_gep_flags1(
+; CHECK-NEXT:    br i1 [[C:%.*]], label [[IF:%.*]], label [[ELSE:%.*]]
+; CHECK:       if:
+; CHECK-NEXT:    br label [[JOIN:%.*]]
+; CHECK:       else:
+; CHECK-NEXT:    br label [[JOIN]]
+; CHECK:       join:
+; CHECK-NEXT:    [[TMP1:%.*]] = phi i64 [ 4, [[IF]] ], [ 8, [[ELSE]] ]
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i8, ptr [[P:%.*]], i64 [[TMP1]]
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr i8, ptr [[TMP2]], i64 4
+; CHECK-NEXT:    ret ptr [[GEP]]
+;
+  br i1 %c, label %if, label %else
+
+if:
+  %gep1 = getelementptr inbounds i32, ptr %p, i64 1
+  br label %join
+
+else:
+  %gep2 = getelementptr i32, ptr %p, i64 2
+  br label %join
+
+join:
+  %phi = phi ptr [ %gep1, %if ], [ %gep2, %else ]
+  %gep = getelementptr i32, ptr %phi, i64 1
+  ret ptr %gep
+}
+
+define ptr @gep_of_phi_of_gep_flags2(i1 %c, ptr %p) {
+; CHECK-LABEL: @gep_of_phi_of_gep_flags2(
+; CHECK-NEXT:    br i1 [[C:%.*]], label [[IF:%.*]], label [[ELSE:%.*]]
+; CHECK:       if:
+; CHECK-NEXT:    br label [[JOIN:%.*]]
+; CHECK:       else:
+; CHECK-NEXT:    br label [[JOIN]]
+; CHECK:       join:
+; CHECK-NEXT:    [[TMP1:%.*]] = phi i64 [ 4, [[IF]] ], [ 8, [[ELSE]] ]
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr nuw i8, ptr [[P:%.*]], i64 [[TMP1]]
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr i8, ptr [[TMP2]], i64 4
+; CHECK-NEXT:    ret ptr [[GEP]]
+;
+  br i1 %c, label %if, label %else
+
+if:
+  %gep1 = getelementptr nuw i32, ptr %p, i64 1
+  br label %join
+
+else:
+  %gep2 = getelementptr nuw i32, ptr %p, i64 2
+  br label %join
+
+join:
+  %phi = phi ptr [ %gep1, %if ], [ %gep2, %else ]
+  %gep = getelementptr i32, ptr %phi, i64 1
+  ret ptr %gep
+}
+
 define ptr @gep_of_phi_of_gep_different_type(i1 %c, ptr %p) {
 ; CHECK-LABEL: @gep_of_phi_of_gep_different_type(
 ; CHECK-NEXT:    br i1 [[C:%.*]], label [[IF:%.*]], label [[ELSE:%.*]]
diff --git a/llvm/test/Transforms/InstCombine/overflow.ll b/llvm/test/Transforms/InstCombine/overflow.ll
index a8969a5..22e1631 100644
--- a/llvm/test/Transforms/InstCombine/overflow.ll
+++ b/llvm/test/Transforms/InstCombine/overflow.ll
@@ -11,7 +11,7 @@ define i32 @test1(i32 %a, i32 %b) nounwind ssp {
 ; CHECK-NEXT:    [[TMP0:%.*]] = extractvalue { i32, i1 } [[SADD]], 1
 ; CHECK-NEXT:    br i1 [[TMP0]], label [[IF_THEN:%.*]], label [[IF_END:%.*]]
 ; CHECK:       if.then:
-; CHECK-NEXT:    tail call void @throwAnExceptionOrWhatever() #[[ATTR2:[0-9]+]]
+; CHECK-NEXT:    tail call void @throwAnExceptionOrWhatever() #[[ATTR3:[0-9]+]]
 ; CHECK-NEXT:    br label [[IF_END]]
 ; CHECK:       if.end:
 ; CHECK-NEXT:    [[SADD_RESULT:%.*]] = extractvalue { i32, i1 } [[SADD]], 0
@@ -49,7 +49,7 @@ define i32 @test2(i32 %a, i32 %b, ptr %P) nounwind ssp {
 ; CHECK-NEXT:    [[TMP0:%.*]] = icmp ugt i64 [[ADD_OFF]], 4294967295
 ; CHECK-NEXT:    br i1 [[TMP0]], label [[IF_THEN:%.*]], label [[IF_END:%.*]]
 ; CHECK:       if.then:
-; CHECK-NEXT:    tail call void @throwAnExceptionOrWhatever() #[[ATTR2]]
+; CHECK-NEXT:    tail call void @throwAnExceptionOrWhatever() #[[ATTR3]]
 ; CHECK-NEXT:    br label [[IF_END]]
 ; CHECK:       if.end:
 ; CHECK-NEXT:    [[CONV9:%.*]] = trunc i64 [[ADD]] to i32
@@ -86,7 +86,7 @@ define i64 @test3(i32 %a, i32 %b) nounwind ssp {
 ; CHECK-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[TMP0]], -4294967296
 ; CHECK-NEXT:    br i1 [[TMP1]], label [[IF_THEN:%.*]], label [[IF_END:%.*]]
 ; CHECK:       if.then:
-; CHECK-NEXT:    tail call void @throwAnExceptionOrWhatever() #[[ATTR2]]
+; CHECK-NEXT:    tail call void @throwAnExceptionOrWhatever() #[[ATTR3]]
 ; CHECK-NEXT:    br label [[IF_END]]
 ; CHECK:       if.end:
 ; CHECK-NEXT:    ret i64 [[ADD]]
@@ -116,7 +116,7 @@ define zeroext i8 @test4(i8 signext %a, i8 signext %b) nounwind ssp {
 ; CHECK-NEXT:    [[CMP:%.*]] = extractvalue { i8, i1 } [[SADD]], 1
 ; CHECK-NEXT:    br i1 [[CMP]], label [[IF_THEN:%.*]], label [[IF_END:%.*]]
 ; CHECK:       if.then:
-; CHECK-NEXT:    tail call void @throwAnExceptionOrWhatever() #[[ATTR2]]
+; CHECK-NEXT:    tail call void @throwAnExceptionOrWhatever() #[[ATTR3]]
 ; CHECK-NEXT:    unreachable
 ; CHECK:       if.end:
 ; CHECK-NEXT:    [[SADD_RESULT:%.*]] = extractvalue { i8, i1 } [[SADD]], 0
@@ -150,7 +150,7 @@ define i32 @test8(i64 %a, i64 %b) nounwind ssp {
 ; CHECK-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[TMP0]], -4294967296
 ; CHECK-NEXT:    br i1 [[TMP1]], label [[IF_THEN:%.*]], label [[IF_END:%.*]]
 ; CHECK:       if.then:
-; CHECK-NEXT:    tail call void @throwAnExceptionOrWhatever() #[[ATTR2]]
+; CHECK-NEXT:    tail call void @throwAnExceptionOrWhatever() #[[ATTR3]]
 ; CHECK-NEXT:    br label [[IF_END]]
 ; CHECK:       if.end:
 ; CHECK-NEXT:    [[CONV9:%.*]] = trunc i64 [[ADD]] to i32
@@ -171,3 +171,91 @@ if.end:
   ret i32 %conv9
 }
 
+define i32 @uadd_no_overflow(i32 %a, i32 %b) {
+; CHECK-LABEL: @uadd_no_overflow(
+; CHECK-NEXT:    [[TMP1:%.*]] = add nuw i32 [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT:    ret i32 [[TMP1]]
+;
+  %val = tail call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %a, i32 %b)
+  %ov = extractvalue { i32, i1 } %val, 1
+  %nowrap = xor i1 %ov, true
+  tail call void @llvm.assume(i1 %nowrap)
+  %res = extractvalue { i32, i1 } %val, 0
+  ret i32 %res
+}
+
+define i32 @smul_no_overflow(i32 %a, i32 %b) {
+; CHECK-LABEL: @smul_no_overflow(
+; CHECK-NEXT:    [[TMP1:%.*]] = mul nsw i32 [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT:    ret i32 [[TMP1]]
+;
+  %val = tail call { i32, i1 } @llvm.smul.with.overflow.i32(i32 %a, i32 %b)
+  %ov = extractvalue { i32, i1 } %val, 1
+  %nowrap = xor i1 %ov, true
+  tail call void @llvm.assume(i1 %nowrap)
+  %res = extractvalue { i32, i1 } %val, 0
+  ret i32 %res
+}
+
+define i32 @smul_overflow(i32 %a, i32 %b) {
+; CHECK-LABEL: @smul_overflow(
+; CHECK-NEXT:    [[VAL:%.*]] = tail call { i32, i1 } @llvm.smul.with.overflow.i32(i32 [[A:%.*]], i32 [[B:%.*]])
+; CHECK-NEXT:    [[OV:%.*]] = extractvalue { i32, i1 } [[VAL]], 1
+; CHECK-NEXT:    tail call void @llvm.assume(i1 [[OV]])
+; CHECK-NEXT:    [[RES:%.*]] = extractvalue { i32, i1 } [[VAL]], 0
+; CHECK-NEXT:    ret i32 [[RES]]
+;
+  %val = tail call { i32, i1 } @llvm.smul.with.overflow.i32(i32 %a, i32 %b)
+  %ov = extractvalue { i32, i1 } %val, 1
+  tail call void @llvm.assume(i1 %ov)
+  %res = extractvalue { i32, i1 } %val, 0
+  ret i32 %res
+}
+
+define i32 @uadd_no_overflow_invalid1(i32 %a, i32 %b, i1 %cond) {
+; CHECK-LABEL: @uadd_no_overflow_invalid1(
+; CHECK-NEXT:    [[VAL:%.*]] = tail call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 [[A:%.*]], i32 [[B:%.*]])
+; CHECK-NEXT:    [[RES:%.*]] = extractvalue { i32, i1 } [[VAL]], 0
+; CHECK-NEXT:    call void @use(i32 [[RES]])
+; CHECK-NEXT:    br i1 [[COND:%.*]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]]
+; CHECK:       if.then:
+; CHECK-NEXT:    [[OV:%.*]] = extractvalue { i32, i1 } [[VAL]], 1
+; CHECK-NEXT:    [[NOWRAP:%.*]] = xor i1 [[OV]], true
+; CHECK-NEXT:    tail call void @llvm.assume(i1 [[NOWRAP]])
+; CHECK-NEXT:    ret i32 [[RES]]
+; CHECK:       if.else:
+; CHECK-NEXT:    ret i32 0
+;
+  %val = tail call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %a, i32 %b)
+  %res = extractvalue { i32, i1 } %val, 0
+  call void @use(i32 %res)
+  br i1 %cond, label %if.then, label %if.else
+if.then:
+  %ov = extractvalue { i32, i1 } %val, 1
+  %nowrap = xor i1 %ov, true
+  tail call void @llvm.assume(i1 %nowrap)
+  ret i32 %res
+if.else:
+  ret i32 0
+}
+
+define i32 @uadd_no_overflow_invalid2(i32 %a, i32 %b, i1 %cond) {
+; CHECK-LABEL: @uadd_no_overflow_invalid2(
+; CHECK-NEXT:    [[VAL:%.*]] = tail call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 [[A:%.*]], i32 [[B:%.*]])
+; CHECK-NEXT:    [[OV:%.*]] = extractvalue { i32, i1 } [[VAL]], 1
+; CHECK-NEXT:    [[NOWRAP:%.*]] = xor i1 [[OV]], true
+; CHECK-NEXT:    call void @use(i32 0)
+; CHECK-NEXT:    tail call void @llvm.assume(i1 [[NOWRAP]])
+; CHECK-NEXT:    [[RES:%.*]] = extractvalue { i32, i1 } [[VAL]], 0
+; CHECK-NEXT:    ret i32 [[RES]]
+;
+  %val = tail call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %a, i32 %b)
+  %ov = extractvalue { i32, i1 } %val, 1
+  %nowrap = xor i1 %ov, true
+  call void @use(i32 0) ; It is not guaranteed to transfer execution to its successors
+  tail call void @llvm.assume(i1 %nowrap)
+  %res = extractvalue { i32, i1 } %val, 0
+  ret i32 %res
+}
+
+declare void @use(i32)
diff --git a/llvm/test/Transforms/InstCombine/phi.ll b/llvm/test/Transforms/InstCombine/phi.ll
index e3bf5ef..4756b4f 100644
--- a/llvm/test/Transforms/InstCombine/phi.ll
+++ b/llvm/test/Transforms/InstCombine/phi.ll
@@ -2822,3 +2822,179 @@ for.cond:                                         ; preds = %for.cond, %entry
 exit:                                             ; preds = %for.cond
   ret i64 0
 }
+
+define i1 @test_zext_icmp_eq_0(i1 %a, i1 %b, i32 %c) {
+; CHECK-LABEL: @test_zext_icmp_eq_0(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br i1 [[A:%.*]], label [[IF:%.*]], label [[ELSE:%.*]]
+; CHECK:       if:
+; CHECK-NEXT:    [[TMP0:%.*]] = xor i1 [[B:%.*]], true
+; CHECK-NEXT:    br label [[JOIN:%.*]]
+; CHECK:       else:
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp eq i32 [[C:%.*]], 0
+; CHECK-NEXT:    br label [[JOIN]]
+; CHECK:       join:
+; CHECK-NEXT:    [[CMP:%.*]] = phi i1 [ [[TMP0]], [[IF]] ], [ [[TMP1]], [[ELSE]] ]
+; CHECK-NEXT:    ret i1 [[CMP]]
+;
+entry:
+  br i1 %a, label %if, label %else
+
+if:
+  %b.ext = zext i1 %b to i32
+  br label %join
+
+else:
+  br label %join
+
+join:
+  %phi = phi i32 [ %b.ext, %if ], [ %c, %else ]
+  %cmp = icmp eq i32 %phi, 0
+  ret i1 %cmp
+}
+
+define i1 @test_zext_icmp_ne_0(i1 %a, i1 %b, i32 %c) {
+; CHECK-LABEL: @test_zext_icmp_ne_0(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br i1 [[A:%.*]], label [[IF:%.*]], label [[ELSE:%.*]]
+; CHECK:       if:
+; CHECK-NEXT:    br label [[JOIN:%.*]]
+; CHECK:       else:
+; CHECK-NEXT:    [[TMP0:%.*]] = icmp ne i32 [[C:%.*]], 0
+; CHECK-NEXT:    br label [[JOIN]]
+; CHECK:       join:
+; CHECK-NEXT:    [[PHI:%.*]] = phi i1 [ [[B:%.*]], [[IF]] ], [ [[TMP0]], [[ELSE]] ]
+; CHECK-NEXT:    ret i1 [[PHI]]
+;
+entry:
+  br i1 %a, label %if, label %else
+
+if:
+  %b.ext = zext i1 %b to i32
+  br label %join
+
+else:
+  br label %join
+
+join:
+  %phi = phi i32 [ %b.ext, %if ], [ %c, %else ]
+  %cmp = icmp ne i32 %phi, 0
+  ret i1 %cmp
+}
+
+define i1 @test_zext_icmp_eq_1(i1 %a, i1 %b, i32 %c) {
+; CHECK-LABEL: @test_zext_icmp_eq_1(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br i1 [[A:%.*]], label [[IF:%.*]], label [[ELSE:%.*]]
+; CHECK:       if:
+; CHECK-NEXT:    br label [[JOIN:%.*]]
+; CHECK:       else:
+; CHECK-NEXT:    [[TMP0:%.*]] = icmp eq i32 [[C:%.*]], 1
+; CHECK-NEXT:    br label [[JOIN]]
+; CHECK:       join:
+; CHECK-NEXT:    [[PHI:%.*]] = phi i1 [ [[B:%.*]], [[IF]] ], [ [[TMP0]], [[ELSE]] ]
+; CHECK-NEXT:    ret i1 [[PHI]]
+;
+entry:
+  br i1 %a, label %if, label %else
+
+if:
+  %b.ext = zext i1 %b to i32
+  br label %join
+
+else:
+  br label %join
+
+join:
+  %phi = phi i32 [ %b.ext, %if ], [ %c, %else ]
+  %cmp = icmp eq i32 %phi, 1
+  ret i1 %cmp
+}
+
+define i1 @test_zext_icmp_eq_0_loop(i1 %c, i1 %b) {
+; CHECK-LABEL: @test_zext_icmp_eq_0_loop(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[LOOP:%.*]]
+; CHECK:       loop:
+; CHECK-NEXT:    [[X:%.*]] = phi i1 [ false, [[ENTRY:%.*]] ], [ [[TMP0:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[Y:%.*]] = and i1 [[X]], [[B:%.*]]
+; CHECK-NEXT:    [[TMP0]] = xor i1 [[Y]], true
+; CHECK-NEXT:    br i1 [[C:%.*]], label [[LOOP]], label [[EXIT:%.*]]
+; CHECK:       exit:
+; CHECK-NEXT:    ret i1 [[X]]
+;
+entry:
+  br label %loop
+
+loop:
+  %phi = phi i32 [ 1, %entry ], [ %ext, %loop ]
+  %x = icmp eq i32 %phi, 0
+  %y = and i1 %x, %b
+  %ext = zext i1 %y to i32
+  br i1 %c, label %loop, label %exit
+
+exit:
+  ret i1 %x
+}
+
+define i1 @test_zext_icmp_eq_0_multi_use(i1 %a, i1 %b, i32 %c) {
+; CHECK-LABEL: @test_zext_icmp_eq_0_multi_use(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br i1 [[A:%.*]], label [[IF:%.*]], label [[ELSE:%.*]]
+; CHECK:       if:
+; CHECK-NEXT:    [[B_EXT:%.*]] = zext i1 [[B:%.*]] to i32
+; CHECK-NEXT:    call void @use(i32 [[B_EXT]])
+; CHECK-NEXT:    br label [[JOIN:%.*]]
+; CHECK:       else:
+; CHECK-NEXT:    br label [[JOIN]]
+; CHECK:       join:
+; CHECK-NEXT:    [[PHI:%.*]] = phi i32 [ [[B_EXT]], [[IF]] ], [ [[C:%.*]], [[ELSE]] ]
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[PHI]], 0
+; CHECK-NEXT:    ret i1 [[CMP]]
+;
+entry:
+  br i1 %a, label %if, label %else
+
+if:
+  %b.ext = zext i1 %b to i32
+  call void @use(i32 %b.ext)
+  br label %join
+
+else:
+  br label %join
+
+join:
+  %phi = phi i32 [ %b.ext, %if ], [ %c, %else ]
+  %cmp = icmp eq i32 %phi, 0
+  ret i1 %cmp
+}
+
+define i1 @test_zext_icmp_eq_0_not_bool(i1 %a, i2 %b, i32 %c) {
+; CHECK-LABEL: @test_zext_icmp_eq_0_not_bool(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br i1 [[A:%.*]], label [[IF:%.*]], label [[ELSE:%.*]]
+; CHECK:       if:
+; CHECK-NEXT:    [[B_EXT:%.*]] = zext i2 [[B:%.*]] to i32
+; CHECK-NEXT:    br label [[JOIN:%.*]]
+; CHECK:       else:
+; CHECK-NEXT:    br label [[JOIN]]
+; CHECK:       join:
+; CHECK-NEXT:    [[PHI:%.*]] = phi i32 [ [[B_EXT]], [[IF]] ], [ [[C:%.*]], [[ELSE]] ]
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[PHI]], 0
+; CHECK-NEXT:    ret i1 [[CMP]]
+;
+entry:
+  br i1 %a, label %if, label %else
+
+if:
+  %b.ext = zext i2 %b to i32
+  br label %join
+
+else:
+  br label %join
+
+join:
+  %phi = phi i32 [ %b.ext, %if ], [ %c, %else ]
+  %cmp = icmp eq i32 %phi, 0
+  ret i1 %cmp
+}
diff --git a/llvm/test/Transforms/InstCombine/rem-mul-shl.ll b/llvm/test/Transforms/InstCombine/rem-mul-shl.ll
index e7d6cc7..920497c0 100644
--- a/llvm/test/Transforms/InstCombine/rem-mul-shl.ll
+++ b/llvm/test/Transforms/InstCombine/rem-mul-shl.ll
@@ -372,6 +372,32 @@ define <2 x i8> @srem_XY_XZ_with_CY_gt_CZ_no_nuw_out(<2 x i8> %X) {
   ret <2 x i8> %r
 }
 
+define i8 @srem_XY_XZ_with_CY_gt_CZ_drop_nsw(i8 noundef %X) {
+; CHECK-LABEL: @srem_XY_XZ_with_CY_gt_CZ_drop_nsw(
+; CHECK-NEXT:    [[BO0:%.*]] = mul nsw i8 [[X:%.*]], 127
+; CHECK-NEXT:    [[BO1:%.*]] = shl nsw i8 [[X]], 7
+; CHECK-NEXT:    [[R:%.*]] = srem i8 [[BO1]], [[BO0]]
+; CHECK-NEXT:    ret i8 [[R]]
+;
+  %BO0 = mul nsw i8 %X, 127
+  %BO1 = shl nsw i8 %X, 7
+  %r = srem i8 %BO1, %BO0
+  ret i8 %r
+}
+
+define i8 @srem_XY_XZ_with_CY_gt_CZ_drop_nsw_commuted(i8 noundef %X) {
+; CHECK-LABEL: @srem_XY_XZ_with_CY_gt_CZ_drop_nsw_commuted(
+; CHECK-NEXT:    [[BO0:%.*]] = mul nsw i8 [[X:%.*]], 127
+; CHECK-NEXT:    [[BO1:%.*]] = shl nsw i8 [[X]], 7
+; CHECK-NEXT:    [[R:%.*]] = srem i8 [[BO0]], [[BO1]]
+; CHECK-NEXT:    ret i8 [[R]]
+;
+  %BO0 = mul nsw i8 %X, 127
+  %BO1 = shl nsw i8 %X, 7
+  %r = srem i8 %BO0, %BO1
+  ret i8 %r
+}
+
 define i8 @srem_XY_XZ_with_CY_gt_CZ_fail_missing_flag1(i8 %X) {
 ; CHECK-LABEL: @srem_XY_XZ_with_CY_gt_CZ_fail_missing_flag1(
 ; CHECK-NEXT:    [[BO0:%.*]] = mul nuw nsw i8 [[X:%.*]], 10
diff --git a/llvm/test/Transforms/InstCombine/select-cmp-cttz-ctlz.ll b/llvm/test/Transforms/InstCombine/select-cmp-cttz-ctlz.ll
index 35b4087..2cb70e8 100644
--- a/llvm/test/Transforms/InstCombine/select-cmp-cttz-ctlz.ll
+++ b/llvm/test/Transforms/InstCombine/select-cmp-cttz-ctlz.ll
@@ -495,6 +495,19 @@ define i32 @test_cttz_not_bw(i32 %x) {
   ret i32 %res
 }
 
+define i32 @test_cttz_not_bw_noundef(i32 %x) {
+; CHECK-LABEL: @test_cttz_not_bw_noundef(
+; CHECK-NEXT:    [[CT:%.*]] = tail call range(i32 0, 33) i32 @llvm.cttz.i32(i32 [[X:%.*]], i1 true)
+; CHECK-NEXT:    [[CMP_NOT:%.*]] = icmp eq i32 [[X]], 0
+; CHECK-NEXT:    [[RES:%.*]] = select i1 [[CMP_NOT]], i32 123, i32 [[CT]]
+; CHECK-NEXT:    ret i32 [[RES]]
+;
+  %ct = tail call noundef i32 @llvm.cttz.i32(i32 %x, i1 false)
+  %cmp = icmp ne i32 %x, 0
+  %res = select i1 %cmp, i32 %ct, i32 123
+  ret i32 %res
+}
+
 define i32 @test_cttz_not_bw_multiuse(i32 %x) {
 ; CHECK-LABEL: @test_cttz_not_bw_multiuse(
 ; CHECK-NEXT:    [[CT:%.*]] = tail call range(i32 0, 33) i32 @llvm.cttz.i32(i32 [[X:%.*]], i1 false)
diff --git a/llvm/test/Transforms/InstCombine/select-divrem.ll b/llvm/test/Transforms/InstCombine/select-divrem.ll
index a674f9c..7dff78e 100644
--- a/llvm/test/Transforms/InstCombine/select-divrem.ll
+++ b/llvm/test/Transforms/InstCombine/select-divrem.ll
@@ -322,6 +322,21 @@ define i8 @rem_euclid_non_const_pow2(i8 %0, i8 %1) {
   ret i8 %sel
 }
 
+define i8 @rem_euclid_non_const_pow2_commuted(i8 %0, i8 %1) {
+; CHECK-LABEL: @rem_euclid_non_const_pow2_commuted(
+; CHECK-NEXT:    [[NOTMASK:%.*]] = shl nsw i8 -1, [[TMP0:%.*]]
+; CHECK-NEXT:    [[TMP3:%.*]] = xor i8 [[NOTMASK]], -1
+; CHECK-NEXT:    [[SEL:%.*]] = and i8 [[TMP1:%.*]], [[TMP3]]
+; CHECK-NEXT:    ret i8 [[SEL]]
+;
+  %pow2 = shl i8 1, %0
+  %rem = srem i8 %1, %pow2
+  %cond = icmp slt i8 %rem, 0
+  %add = add i8 %pow2, %rem
+  %sel = select i1 %cond, i8 %add, i8 %rem
+  ret i8 %sel
+}
+
 define i32 @rem_euclid_pow2_true_arm_folded(i32 %n) {
 ; CHECK-LABEL: @rem_euclid_pow2_true_arm_folded(
 ; CHECK-NEXT:    [[RES:%.*]] = and i32 [[N:%.*]], 1
diff --git a/llvm/test/Transforms/InstCombine/select.ll b/llvm/test/Transforms/InstCombine/select.ll
index 0168a80..0f15fa6 100644
--- a/llvm/test/Transforms/InstCombine/select.ll
+++ b/llvm/test/Transforms/InstCombine/select.ll
@@ -3937,11 +3937,8 @@ entry:
 define i32 @src_or_eq_0_and_xor(i32 %x, i32 %y) {
 ; CHECK-LABEL: @src_or_eq_0_and_xor(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[OR:%.*]] = or i32 [[Y:%.*]], [[X:%.*]]
-; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[OR]], 0
-; CHECK-NEXT:    [[XOR:%.*]] = xor i32 [[Y]], [[X]]
-; CHECK-NEXT:    [[COND:%.*]] = select i1 [[CMP]], i32 0, i32 [[XOR]]
-; CHECK-NEXT:    ret i32 [[COND]]
+; CHECK-NEXT:    [[XOR:%.*]] = xor i32 [[Y:%.*]], [[X:%.*]]
+; CHECK-NEXT:    ret i32 [[XOR]]
 ;
 entry:
   %or = or i32 %y, %x
@@ -3956,11 +3953,8 @@ entry:
 define i32 @src_or_eq_0_xor_and(i32 %x, i32 %y) {
 ; CHECK-LABEL: @src_or_eq_0_xor_and(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[OR:%.*]] = or i32 [[Y:%.*]], [[X:%.*]]
-; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[OR]], 0
-; CHECK-NEXT:    [[AND:%.*]] = and i32 [[Y]], [[X]]
-; CHECK-NEXT:    [[COND:%.*]] = select i1 [[CMP]], i32 0, i32 [[AND]]
-; CHECK-NEXT:    ret i32 [[COND]]
+; CHECK-NEXT:    [[AND:%.*]] = and i32 [[Y:%.*]], [[X:%.*]]
+; CHECK-NEXT:    ret i32 [[AND]]
 ;
 entry:
   %or = or i32 %y, %x
@@ -4438,11 +4432,8 @@ define i32 @src_no_trans_select_and_eq0_xor_and(i32 %x, i32 %y) {
 
 define i32 @src_no_trans_select_or_eq0_or_and(i32 %x, i32 %y) {
 ; CHECK-LABEL: @src_no_trans_select_or_eq0_or_and(
-; CHECK-NEXT:    [[OR:%.*]] = or i32 [[X:%.*]], [[Y:%.*]]
-; CHECK-NEXT:    [[OR0:%.*]] = icmp eq i32 [[OR]], 0
-; CHECK-NEXT:    [[AND:%.*]] = and i32 [[X]], [[Y]]
-; CHECK-NEXT:    [[COND:%.*]] = select i1 [[OR0]], i32 0, i32 [[AND]]
-; CHECK-NEXT:    ret i32 [[COND]]
+; CHECK-NEXT:    [[AND:%.*]] = and i32 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT:    ret i32 [[AND]]
 ;
   %or = or i32 %x, %y
   %or0 = icmp eq i32 %or, 0
@@ -4453,11 +4444,8 @@ define i32 @src_no_trans_select_or_eq0_or_and(i32 %x, i32 %y) {
 
 define i32 @src_no_trans_select_or_eq0_or_xor(i32 %x, i32 %y) {
 ; CHECK-LABEL: @src_no_trans_select_or_eq0_or_xor(
-; CHECK-NEXT:    [[OR:%.*]] = or i32 [[X:%.*]], [[Y:%.*]]
-; CHECK-NEXT:    [[OR0:%.*]] = icmp eq i32 [[OR]], 0
-; CHECK-NEXT:    [[XOR:%.*]] = xor i32 [[X]], [[Y]]
-; CHECK-NEXT:    [[COND:%.*]] = select i1 [[OR0]], i32 0, i32 [[XOR]]
-; CHECK-NEXT:    ret i32 [[COND]]
+; CHECK-NEXT:    [[XOR:%.*]] = xor i32 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT:    ret i32 [[XOR]]
 ;
   %or = or i32 %x, %y
   %or0 = icmp eq i32 %or, 0
@@ -4492,11 +4480,8 @@ define i32 @src_no_trans_select_or_eq0_xor_or(i32 %x, i32 %y) {
 
 define i32 @src_no_trans_select_and_ne0_xor_or(i32 %x, i32 %y) {
 ; CHECK-LABEL: @src_no_trans_select_and_ne0_xor_or(
-; CHECK-NEXT:    [[OR:%.*]] = or i32 [[X:%.*]], [[Y:%.*]]
-; CHECK-NEXT:    [[OR0_NOT:%.*]] = icmp eq i32 [[OR]], 0
-; CHECK-NEXT:    [[XOR:%.*]] = xor i32 [[X]], [[Y]]
-; CHECK-NEXT:    [[COND:%.*]] = select i1 [[OR0_NOT]], i32 0, i32 [[XOR]]
-; CHECK-NEXT:    ret i32 [[COND]]
+; CHECK-NEXT:    [[XOR:%.*]] = xor i32 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT:    ret i32 [[XOR]]
 ;
   %or = or i32 %x, %y
   %or0 = icmp ne i32 %or, 0
@@ -4843,3 +4828,16 @@ define i32 @replace_and_cond_multiuse2(i1 %cond1, i1 %cond2) {
   %mux = select i1 %cond1, i32 %sel, i32 1
   ret i32 %mux
 }
+
+define i32 @src_simplify_2x_at_once_and(i32 %x, i32 %y) {
+; CHECK-LABEL: @src_simplify_2x_at_once_and(
+; CHECK-NEXT:    [[XOR:%.*]] = xor i32 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT:    ret i32 [[XOR]]
+;
+  %and = and i32 %x, %y
+  %and0 = icmp eq i32 %and, -1
+  %sub = sub i32 %x, %y
+  %xor = xor i32 %x, %y
+  %cond = select i1 %and0, i32 %sub, i32 %xor
+  ret i32 %cond
+}
diff --git a/llvm/test/Transforms/InstCombine/xor-and-or.ll b/llvm/test/Transforms/InstCombine/xor-and-or.ll
index 47275ce..c380e27 100644
--- a/llvm/test/Transforms/InstCombine/xor-and-or.ll
+++ b/llvm/test/Transforms/InstCombine/xor-and-or.ll
@@ -25,6 +25,18 @@ define i1 @xor_logic_and_logic_or2(i1 %c, i1 %x, i1 %y) {
   ret i1 %r
 }
 
+define i1 @xor_logic_and_logic_or2_commuted(i1 %c, i1 %x, i1 %y) {
+; CHECK-LABEL: @xor_logic_and_logic_or2_commuted(
+; CHECK-NEXT:    [[TMP1:%.*]] = xor i1 [[X:%.*]], true
+; CHECK-NEXT:    [[R:%.*]] = select i1 [[C:%.*]], i1 [[TMP1]], i1 [[Y:%.*]]
+; CHECK-NEXT:    ret i1 [[R]]
+;
+  %o = select i1 %y, i1 true, i1 %c
+  %a = select i1 %c, i1 %x, i1 false
+  %r = xor i1 %o, %a
+  ret i1 %r
+}
+
 define i1 @xor_logic_and_logic_or3(i1 %c, i1 %x, i1 %y) {
 ; CHECK-LABEL: @xor_logic_and_logic_or3(
 ; CHECK-NEXT:    [[TMP1:%.*]] = freeze i1 [[C:%.*]]
diff --git a/llvm/test/Transforms/InstSimplify/const-fold-nvvm-f2i-d2i.ll b/llvm/test/Transforms/InstSimplify/const-fold-nvvm-f2i-d2i.ll
new file mode 100644
index 0000000..543c731
--- /dev/null
+++ b/llvm/test/Transforms/InstSimplify/const-fold-nvvm-f2i-d2i.ll
@@ -0,0 +1,1129 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt < %s -passes=instsimplify -march=nvptx64 -S | FileCheck %s
+
+; f2i/f2ui and d2i/d2ui - double/float to i32 tests
+
+;###############################################################
+;#               Tests with Positive 1.5                       #
+;###############################################################
+
+;+-------------------------------------------------------------+
+;|                        f2i                                  |
+;+-------------------------------------------------------------+
+define i32 @test_pos_1_5_f2i_rm() {
+; CHECK-LABEL: define i32 @test_pos_1_5_f2i_rm() {
+; CHECK-NEXT:    ret i32 1
+;
+  %res = call i32 @llvm.nvvm.f2i.rm(float 1.5)
+  ret i32 %res
+}
+
+define i32 @test_pos_1_5_f2i_rn() {
+; CHECK-LABEL: define i32 @test_pos_1_5_f2i_rn() {
+; CHECK-NEXT:    ret i32 2
+;
+  %res = call i32 @llvm.nvvm.f2i.rn(float 1.5)
+  ret i32 %res
+}
+
+
+define i32 @test_pos_1_5_f2i_rp() {
+; CHECK-LABEL: define i32 @test_pos_1_5_f2i_rp() {
+; CHECK-NEXT:    ret i32 2
+;
+  %res = call i32 @llvm.nvvm.f2i.rp(float 1.5)
+  ret i32 %res
+}
+
+define i32 @test_pos_1_5_f2i_rz() {
+; CHECK-LABEL: define i32 @test_pos_1_5_f2i_rz() {
+; CHECK-NEXT:    ret i32 1
+;
+  %res = call i32 @llvm.nvvm.f2i.rz(float 1.5)
+  ret i32 %res
+}
+
+;+-------------------------------------------------------------+
+;|                      f2i_ftz                                |
+;+-------------------------------------------------------------+
+define i32 @test_pos_1_5_f2i_rm_ftz() {
+; CHECK-LABEL: define i32 @test_pos_1_5_f2i_rm_ftz() {
+; CHECK-NEXT:    ret i32 1
+;
+  %res = call i32 @llvm.nvvm.f2i.rm.ftz(float 1.5)
+  ret i32 %res
+}
+
+define i32 @test_pos_1_5_f2i_rn_ftz() {
+; CHECK-LABEL: define i32 @test_pos_1_5_f2i_rn_ftz() {
+; CHECK-NEXT:    ret i32 2
+;
+  %res = call i32 @llvm.nvvm.f2i.rn.ftz(float 1.5)
+  ret i32 %res
+}
+
+define i32 @test_pos_1_5_f2i_rp_ftz() {
+; CHECK-LABEL: define i32 @test_pos_1_5_f2i_rp_ftz() {
+; CHECK-NEXT:    ret i32 2
+;
+  %res = call i32 @llvm.nvvm.f2i.rp.ftz(float 1.5)
+  ret i32 %res
+}
+
+define i32 @test_pos_1_5_f2i_rz_ftz() {
+; CHECK-LABEL: define i32 @test_pos_1_5_f2i_rz_ftz() {
+; CHECK-NEXT:    ret i32 1
+;
+  %res = call i32 @llvm.nvvm.f2i.rz.ftz(float 1.5)
+  ret i32 %res
+}
+;+-------------------------------------------------------------+
+;|                        d2i                                  |
+;+-------------------------------------------------------------+
+define i32 @test_pos_1_5_d2i_rm() {
+; CHECK-LABEL: define i32 @test_pos_1_5_d2i_rm() {
+; CHECK-NEXT:    ret i32 1
+;
+  %res = call i32 @llvm.nvvm.d2i.rm(double 1.5)
+  ret i32 %res
+}
+
+define i32 @test_pos_1_5_d2i_rn() {
+; CHECK-LABEL: define i32 @test_pos_1_5_d2i_rn() {
+; CHECK-NEXT:    ret i32 2
+;
+  %res = call i32 @llvm.nvvm.d2i.rn(double 1.5)
+  ret i32 %res
+}
+
+
+define i32 @test_pos_1_5_d2i_rp() {
+; CHECK-LABEL: define i32 @test_pos_1_5_d2i_rp() {
+; CHECK-NEXT:    ret i32 2
+;
+  %res = call i32 @llvm.nvvm.d2i.rp(double 1.5)
+  ret i32 %res
+}
+
+define i32 @test_pos_1_5_d2i_rz() {
+; CHECK-LABEL: define i32 @test_pos_1_5_d2i_rz() {
+; CHECK-NEXT:    ret i32 1
+;
+  %res = call i32 @llvm.nvvm.d2i.rz(double 1.5)
+  ret i32 %res
+}
+
+;+-------------------------------------------------------------+
+;|                        f2ui                                  |
+;+-------------------------------------------------------------+
+define i32 @test_pos_1_5_f2ui_rm() {
+; CHECK-LABEL: define i32 @test_pos_1_5_f2ui_rm() {
+; CHECK-NEXT:    ret i32 1
+;
+  %res = call i32 @llvm.nvvm.f2ui.rm(float 1.5)
+  ret i32 %res
+}
+
+define i32 @test_pos_1_5_f2ui_rn() {
+; CHECK-LABEL: define i32 @test_pos_1_5_f2ui_rn() {
+; CHECK-NEXT:    ret i32 2
+;
+  %res = call i32 @llvm.nvvm.f2ui.rn(float 1.5)
+  ret i32 %res
+}
+
+
+define i32 @test_pos_1_5_f2ui_rp() {
+; CHECK-LABEL: define i32 @test_pos_1_5_f2ui_rp() {
+; CHECK-NEXT:    ret i32 2
+;
+  %res = call i32 @llvm.nvvm.f2ui.rp(float 1.5)
+  ret i32 %res
+}
+
+define i32 @test_pos_1_5_f2ui_rz() {
+; CHECK-LABEL: define i32 @test_pos_1_5_f2ui_rz() {
+; CHECK-NEXT:    ret i32 1
+;
+  %res = call i32 @llvm.nvvm.f2ui.rz(float 1.5)
+  ret i32 %res
+}
+
+;+-------------------------------------------------------------+
+;|                      f2ui_ftz                                |
+;+-------------------------------------------------------------+
+define i32 @test_pos_1_5_f2ui_rm_ftz() {
+; CHECK-LABEL: define i32 @test_pos_1_5_f2ui_rm_ftz() {
+; CHECK-NEXT:    ret i32 1
+;
+  %res = call i32 @llvm.nvvm.f2ui.rm.ftz(float 1.5)
+  ret i32 %res
+}
+
+define i32 @test_pos_1_5_f2ui_rn_ftz() {
+; CHECK-LABEL: define i32 @test_pos_1_5_f2ui_rn_ftz() {
+; CHECK-NEXT:    ret i32 2
+;
+  %res = call i32 @llvm.nvvm.f2ui.rn.ftz(float 1.5)
+  ret i32 %res
+}
+
+define i32 @test_pos_1_5_f2ui_rp_ftz() {
+; CHECK-LABEL: define i32 @test_pos_1_5_f2ui_rp_ftz() {
+; CHECK-NEXT:    ret i32 2
+;
+  %res = call i32 @llvm.nvvm.f2ui.rp.ftz(float 1.5)
+  ret i32 %res
+}
+
+define i32 @test_pos_1_5_f2ui_rz_ftz() {
+; CHECK-LABEL: define i32 @test_pos_1_5_f2ui_rz_ftz() {
+; CHECK-NEXT:    ret i32 1
+;
+  %res = call i32 @llvm.nvvm.f2ui.rz.ftz(float 1.5)
+  ret i32 %res
+}
+;+-------------------------------------------------------------+
+;|                        d2ui                                  |
+;+-------------------------------------------------------------+
+define i32 @test_pos_1_5_d2ui_rm() {
+; CHECK-LABEL: define i32 @test_pos_1_5_d2ui_rm() {
+; CHECK-NEXT:    ret i32 1
+;
+  %res = call i32 @llvm.nvvm.d2ui.rm(double 1.5)
+  ret i32 %res
+}
+
+define i32 @test_pos_1_5_d2ui_rn() {
+; CHECK-LABEL: define i32 @test_pos_1_5_d2ui_rn() {
+; CHECK-NEXT:    ret i32 2
+;
+  %res = call i32 @llvm.nvvm.d2ui.rn(double 1.5)
+  ret i32 %res
+}
+
+
+define i32 @test_pos_1_5_d2ui_rp() {
+; CHECK-LABEL: define i32 @test_pos_1_5_d2ui_rp() {
+; CHECK-NEXT:    ret i32 2
+;
+  %res = call i32 @llvm.nvvm.d2ui.rp(double 1.5)
+  ret i32 %res
+}
+
+define i32 @test_pos_1_5_d2ui_rz() {
+; CHECK-LABEL: define i32 @test_pos_1_5_d2ui_rz() {
+; CHECK-NEXT:    ret i32 1
+;
+  %res = call i32 @llvm.nvvm.d2ui.rz(double 1.5)
+  ret i32 %res
+}
+
+;###############################################################
+;#               Tests with Negative 1.5                       #
+;###############################################################
+
+;+-------------------------------------------------------------+
+;|                        f2i                                  |
+;+-------------------------------------------------------------+
+define i32 @test_neg_1_5_f2i_rm() {
+; CHECK-LABEL: define i32 @test_neg_1_5_f2i_rm() {
+; CHECK-NEXT:    ret i32 -2
+;
+  %res = call i32 @llvm.nvvm.f2i.rm(float -1.5)
+  ret i32 %res
+}
+
+define i32 @test_neg_1_5_f2i_rn() {
+; CHECK-LABEL: define i32 @test_neg_1_5_f2i_rn() {
+; CHECK-NEXT:    ret i32 -2
+;
+  %res = call i32 @llvm.nvvm.f2i.rn(float -1.5)
+  ret i32 %res
+}
+
+
+define i32 @test_neg_1_5_f2i_rp() {
+; CHECK-LABEL: define i32 @test_neg_1_5_f2i_rp() {
+; CHECK-NEXT:    ret i32 -1
+;
+  %res = call i32 @llvm.nvvm.f2i.rp(float -1.5)
+  ret i32 %res
+}
+
+define i32 @test_neg_1_5_f2i_rz() {
+; CHECK-LABEL: define i32 @test_neg_1_5_f2i_rz() {
+; CHECK-NEXT:    ret i32 -1
+;
+  %res = call i32 @llvm.nvvm.f2i.rz(float -1.5)
+  ret i32 %res
+}
+
+;+-------------------------------------------------------------+
+;|                      f2i_ftz                                |
+;+-------------------------------------------------------------+
+define i32 @test_neg_1_5_f2i_rm_ftz() {
+; CHECK-LABEL: define i32 @test_neg_1_5_f2i_rm_ftz() {
+; CHECK-NEXT:    ret i32 -2
+;
+  %res = call i32 @llvm.nvvm.f2i.rm.ftz(float -1.5)
+  ret i32 %res
+}
+
+define i32 @test_neg_1_5_f2i_rn_ftz() {
+; CHECK-LABEL: define i32 @test_neg_1_5_f2i_rn_ftz() {
+; CHECK-NEXT:    ret i32 -2
+;
+  %res = call i32 @llvm.nvvm.f2i.rn.ftz(float -1.5)
+  ret i32 %res
+}
+
+define i32 @test_neg_1_5_f2i_rp_ftz() {
+; CHECK-LABEL: define i32 @test_neg_1_5_f2i_rp_ftz() {
+; CHECK-NEXT:    ret i32 -1
+;
+  %res = call i32 @llvm.nvvm.f2i.rp.ftz(float -1.5)
+  ret i32 %res
+}
+
+define i32 @test_neg_1_5_f2i_rz_ftz() {
+; CHECK-LABEL: define i32 @test_neg_1_5_f2i_rz_ftz() {
+; CHECK-NEXT:    ret i32 -1
+;
+  %res = call i32 @llvm.nvvm.f2i.rz.ftz(float -1.5)
+  ret i32 %res
+}
+;+-------------------------------------------------------------+
+;|                        d2i                                  |
+;+-------------------------------------------------------------+
+define i32 @test_neg_1_5_d2i_rm() {
+; CHECK-LABEL: define i32 @test_neg_1_5_d2i_rm() {
+; CHECK-NEXT:    ret i32 -2
+;
+  %res = call i32 @llvm.nvvm.d2i.rm(double -1.5)
+  ret i32 %res
+}
+
+define i32 @test_neg_1_5_d2i_rn() {
+; CHECK-LABEL: define i32 @test_neg_1_5_d2i_rn() {
+; CHECK-NEXT:    ret i32 -2
+;
+  %res = call i32 @llvm.nvvm.d2i.rn(double -1.5)
+  ret i32 %res
+}
+
+
+define i32 @test_neg_1_5_d2i_rp() {
+; CHECK-LABEL: define i32 @test_neg_1_5_d2i_rp() {
+; CHECK-NEXT:    ret i32 -1
+;
+  %res = call i32 @llvm.nvvm.d2i.rp(double -1.5)
+  ret i32 %res
+}
+
+define i32 @test_neg_1_5_d2i_rz() {
+; CHECK-LABEL: define i32 @test_neg_1_5_d2i_rz() {
+; CHECK-NEXT:    ret i32 -1
+;
+  %res = call i32 @llvm.nvvm.d2i.rz(double -1.5)
+  ret i32 %res
+}
+
+;+-------------------------------------------------------------+
+;|                        f2ui                                  |
+;+-------------------------------------------------------------+
+define i32 @test_neg_1_5_f2ui_rm() {
+; CHECK-LABEL: define i32 @test_neg_1_5_f2ui_rm() {
+; CHECK-NEXT:    [[RES:%.*]] = call i32 @llvm.nvvm.f2ui.rm(float -1.500000e+00)
+; CHECK-NEXT:    ret i32 [[RES]]
+;
+  %res = call i32 @llvm.nvvm.f2ui.rm(float -1.5)
+  ret i32 %res
+}
+
+define i32 @test_neg_1_5_f2ui_rn() {
+; CHECK-LABEL: define i32 @test_neg_1_5_f2ui_rn() {
+; CHECK-NEXT:    [[RES:%.*]] = call i32 @llvm.nvvm.f2ui.rn(float -1.500000e+00)
+; CHECK-NEXT:    ret i32 [[RES]]
+;
+  %res = call i32 @llvm.nvvm.f2ui.rn(float -1.5)
+  ret i32 %res
+}
+
+
+define i32 @test_neg_1_5_f2ui_rp() {
+; CHECK-LABEL: define i32 @test_neg_1_5_f2ui_rp() {
+; CHECK-NEXT:    [[RES:%.*]] = call i32 @llvm.nvvm.f2ui.rp(float -1.500000e+00)
+; CHECK-NEXT:    ret i32 [[RES]]
+;
+  %res = call i32 @llvm.nvvm.f2ui.rp(float -1.5)
+  ret i32 %res
+}
+
+define i32 @test_neg_1_5_f2ui_rz() {
+; CHECK-LABEL: define i32 @test_neg_1_5_f2ui_rz() {
+; CHECK-NEXT:    [[RES:%.*]] = call i32 @llvm.nvvm.f2ui.rz(float -1.500000e+00)
+; CHECK-NEXT:    ret i32 [[RES]]
+;
+  %res = call i32 @llvm.nvvm.f2ui.rz(float -1.5)
+  ret i32 %res
+}
+
+;+-------------------------------------------------------------+
+;|                      f2ui_ftz                                |
+;+-------------------------------------------------------------+
+define i32 @test_neg_1_5_f2ui_rm_ftz() {
+; CHECK-LABEL: define i32 @test_neg_1_5_f2ui_rm_ftz() {
+; CHECK-NEXT:    [[RES:%.*]] = call i32 @llvm.nvvm.f2ui.rm.ftz(float -1.500000e+00)
+; CHECK-NEXT:    ret i32 [[RES]]
+;
+  %res = call i32 @llvm.nvvm.f2ui.rm.ftz(float -1.5)
+  ret i32 %res
+}
+
+define i32 @test_neg_1_5_f2ui_rn_ftz() {
+; CHECK-LABEL: define i32 @test_neg_1_5_f2ui_rn_ftz() {
+; CHECK-NEXT:    [[RES:%.*]] = call i32 @llvm.nvvm.f2ui.rn.ftz(float -1.500000e+00)
+; CHECK-NEXT:    ret i32 [[RES]]
+;
+  %res = call i32 @llvm.nvvm.f2ui.rn.ftz(float -1.5)
+  ret i32 %res
+}
+
+define i32 @test_neg_1_5_f2ui_rp_ftz() {
+; CHECK-LABEL: define i32 @test_neg_1_5_f2ui_rp_ftz() {
+; CHECK-NEXT:    [[RES:%.*]] = call i32 @llvm.nvvm.f2ui.rp.ftz(float -1.500000e+00)
+; CHECK-NEXT:    ret i32 [[RES]]
+;
+  %res = call i32 @llvm.nvvm.f2ui.rp.ftz(float -1.5)
+  ret i32 %res
+}
+
+define i32 @test_neg_1_5_f2ui_rz_ftz() {
+; CHECK-LABEL: define i32 @test_neg_1_5_f2ui_rz_ftz() {
+; CHECK-NEXT:    [[RES:%.*]] = call i32 @llvm.nvvm.f2ui.rz.ftz(float -1.500000e+00)
+; CHECK-NEXT:    ret i32 [[RES]]
+;
+  %res = call i32 @llvm.nvvm.f2ui.rz.ftz(float -1.5)
+  ret i32 %res
+}
+;+-------------------------------------------------------------+
+;|                        d2ui                                  |
+;+-------------------------------------------------------------+
+define i32 @test_neg_1_5_d2ui_rm() {
+; CHECK-LABEL: define i32 @test_neg_1_5_d2ui_rm() {
+; CHECK-NEXT:    [[RES:%.*]] = call i32 @llvm.nvvm.d2ui.rm(double -1.500000e+00)
+; CHECK-NEXT:    ret i32 [[RES]]
+;
+  %res = call i32 @llvm.nvvm.d2ui.rm(double -1.5)
+  ret i32 %res
+}
+
+define i32 @test_neg_1_5_d2ui_rn() {
+; CHECK-LABEL: define i32 @test_neg_1_5_d2ui_rn() {
+; CHECK-NEXT:    [[RES:%.*]] = call i32 @llvm.nvvm.d2ui.rn(double -1.500000e+00)
+; CHECK-NEXT:    ret i32 [[RES]]
+;
+  %res = call i32 @llvm.nvvm.d2ui.rn(double -1.5)
+  ret i32 %res
+}
+
+
+define i32 @test_neg_1_5_d2ui_rp() {
+; CHECK-LABEL: define i32 @test_neg_1_5_d2ui_rp() {
+; CHECK-NEXT:    [[RES:%.*]] = call i32 @llvm.nvvm.d2ui.rp(double -1.500000e+00)
+; CHECK-NEXT:    ret i32 [[RES]]
+;
+  %res = call i32 @llvm.nvvm.d2ui.rp(double -1.5)
+  ret i32 %res
+}
+
+define i32 @test_neg_1_5_d2ui_rz() {
+; CHECK-LABEL: define i32 @test_neg_1_5_d2ui_rz() {
+; CHECK-NEXT:    [[RES:%.*]] = call i32 @llvm.nvvm.d2ui.rz(double -1.500000e+00)
+; CHECK-NEXT:    ret i32 [[RES]]
+;
+  %res = call i32 @llvm.nvvm.d2ui.rz(double -1.5)
+  ret i32 %res
+}
+
+;###############################################################
+;#                    Tests with NaN                           #
+;###############################################################
+
+;+-------------------------------------------------------------+
+;|                        f2i                                  |
+;+-------------------------------------------------------------+
+define i32 @test_nan_f2i_rm() {
+; CHECK-LABEL: define i32 @test_nan_f2i_rm() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.f2i.rm(float 0x7FFFFF0000000000)
+  ret i32 %res
+}
+
+define i32 @test_nan_f2i_rn() {
+; CHECK-LABEL: define i32 @test_nan_f2i_rn() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.f2i.rn(float 0x7FFFFF0000000000)
+  ret i32 %res
+}
+
+
+define i32 @test_nan_f2i_rp() {
+; CHECK-LABEL: define i32 @test_nan_f2i_rp() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.f2i.rp(float 0x7FFFFF0000000000)
+  ret i32 %res
+}
+
+define i32 @test_nan_f2i_rz() {
+; CHECK-LABEL: define i32 @test_nan_f2i_rz() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.f2i.rz(float 0x7FFFFF0000000000)
+  ret i32 %res
+}
+
+;+-------------------------------------------------------------+
+;|                      f2i_ftz                                |
+;+-------------------------------------------------------------+
+define i32 @test_nan_f2i_rm_ftz() {
+; CHECK-LABEL: define i32 @test_nan_f2i_rm_ftz() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.f2i.rm.ftz(float 0x7FFFFF0000000000)
+  ret i32 %res
+}
+
+define i32 @test_nan_f2i_rn_ftz() {
+; CHECK-LABEL: define i32 @test_nan_f2i_rn_ftz() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.f2i.rn.ftz(float 0x7FFFFF0000000000)
+  ret i32 %res
+}
+
+define i32 @test_nan_f2i_rp_ftz() {
+; CHECK-LABEL: define i32 @test_nan_f2i_rp_ftz() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.f2i.rp.ftz(float 0x7FFFFF0000000000)
+  ret i32 %res
+}
+
+define i32 @test_nan_f2i_rz_ftz() {
+; CHECK-LABEL: define i32 @test_nan_f2i_rz_ftz() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.f2i.rz.ftz(float 0x7FFFFF0000000000)
+  ret i32 %res
+}
+;+-------------------------------------------------------------+
+;|                        d2i                                  |
+;+-------------------------------------------------------------+
+define i32 @test_nan_d2i_rm() {
+; CHECK-LABEL: define i32 @test_nan_d2i_rm() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.d2i.rm(double 0xFFF8000000000000)
+  ret i32 %res
+}
+
+define i32 @test_nan_d2i_rn() {
+; CHECK-LABEL: define i32 @test_nan_d2i_rn() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.d2i.rn(double 0xFFF8000000000000)
+  ret i32 %res
+}
+
+
+define i32 @test_nan_d2i_rp() {
+; CHECK-LABEL: define i32 @test_nan_d2i_rp() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.d2i.rp(double 0xFFF8000000000000)
+  ret i32 %res
+}
+
+define i32 @test_nan_d2i_rz() {
+; CHECK-LABEL: define i32 @test_nan_d2i_rz() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.d2i.rz(double 0xFFF8000000000000)
+  ret i32 %res
+}
+
+;+-------------------------------------------------------------+
+;|                        f2ui                                  |
+;+-------------------------------------------------------------+
+define i32 @test_nan_f2ui_rm() {
+; CHECK-LABEL: define i32 @test_nan_f2ui_rm() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.f2ui.rm(float 0x7FFFFF0000000000)
+  ret i32 %res
+}
+
+define i32 @test_nan_f2ui_rn() {
+; CHECK-LABEL: define i32 @test_nan_f2ui_rn() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.f2ui.rn(float 0x7FFFFF0000000000)
+  ret i32 %res
+}
+
+
+define i32 @test_nan_f2ui_rp() {
+; CHECK-LABEL: define i32 @test_nan_f2ui_rp() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.f2ui.rp(float 0x7FFFFF0000000000)
+  ret i32 %res
+}
+
+define i32 @test_nan_f2ui_rz() {
+; CHECK-LABEL: define i32 @test_nan_f2ui_rz() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.f2ui.rz(float 0x7FFFFF0000000000)
+  ret i32 %res
+}
+
+;+-------------------------------------------------------------+
+;|                      f2ui_ftz                                |
+;+-------------------------------------------------------------+
+define i32 @test_nan_f2ui_rm_ftz() {
+; CHECK-LABEL: define i32 @test_nan_f2ui_rm_ftz() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.f2ui.rm.ftz(float 0x7FFFFF0000000000)
+  ret i32 %res
+}
+
+define i32 @test_nan_f2ui_rn_ftz() {
+; CHECK-LABEL: define i32 @test_nan_f2ui_rn_ftz() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.f2ui.rn.ftz(float 0x7FFFFF0000000000)
+  ret i32 %res
+}
+
+define i32 @test_nan_f2ui_rp_ftz() {
+; CHECK-LABEL: define i32 @test_nan_f2ui_rp_ftz() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.f2ui.rp.ftz(float 0x7FFFFF0000000000)
+  ret i32 %res
+}
+
+define i32 @test_nan_f2ui_rz_ftz() {
+; CHECK-LABEL: define i32 @test_nan_f2ui_rz_ftz() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.f2ui.rz.ftz(float 0x7FFFFF0000000000)
+  ret i32 %res
+}
+;+-------------------------------------------------------------+
+;|                        d2ui                                  |
+;+-------------------------------------------------------------+
+define i32 @test_nan_d2ui_rm() {
+; CHECK-LABEL: define i32 @test_nan_d2ui_rm() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.d2ui.rm(double 0xFFF8000000000000)
+  ret i32 %res
+}
+
+define i32 @test_nan_d2ui_rn() {
+; CHECK-LABEL: define i32 @test_nan_d2ui_rn() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.d2ui.rn(double 0xFFF8000000000000)
+  ret i32 %res
+}
+
+
+define i32 @test_nan_d2ui_rp() {
+; CHECK-LABEL: define i32 @test_nan_d2ui_rp() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.d2ui.rp(double 0xFFF8000000000000)
+  ret i32 %res
+}
+
+define i32 @test_nan_d2ui_rz() {
+; CHECK-LABEL: define i32 @test_nan_d2ui_rz() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.d2ui.rz(double 0xFFF8000000000000)
+  ret i32 %res
+}
+
+;###############################################################
+;#            Tests with Positive Subnormal                    #
+;###############################################################
+
+;+-------------------------------------------------------------+
+;|                        f2i                                  |
+;+-------------------------------------------------------------+
+define i32 @test_pos_subnormal_f2i_rm() {
+; CHECK-LABEL: define i32 @test_pos_subnormal_f2i_rm() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.f2i.rm(float 0x380FFFFFC0000000)
+  ret i32 %res
+}
+
+define i32 @test_pos_subnormal_f2i_rn() {
+; CHECK-LABEL: define i32 @test_pos_subnormal_f2i_rn() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.f2i.rn(float 0x380FFFFFC0000000)
+  ret i32 %res
+}
+
+
+define i32 @test_pos_subnormal_f2i_rp() {
+; CHECK-LABEL: define i32 @test_pos_subnormal_f2i_rp() {
+; CHECK-NEXT:    ret i32 1
+;
+  %res = call i32 @llvm.nvvm.f2i.rp(float 0x380FFFFFC0000000)
+  ret i32 %res
+}
+
+define i32 @test_pos_subnormal_f2i_rz() {
+; CHECK-LABEL: define i32 @test_pos_subnormal_f2i_rz() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.f2i.rz(float 0x380FFFFFC0000000)
+  ret i32 %res
+}
+
+;+-------------------------------------------------------------+
+;|                      f2i_ftz                                |
+;+-------------------------------------------------------------+
+define i32 @test_pos_subnormal_f2i_rm_ftz() {
+; CHECK-LABEL: define i32 @test_pos_subnormal_f2i_rm_ftz() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.f2i.rm.ftz(float 0x380FFFFFC0000000)
+  ret i32 %res
+}
+
+define i32 @test_pos_subnormal_f2i_rn_ftz() {
+; CHECK-LABEL: define i32 @test_pos_subnormal_f2i_rn_ftz() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.f2i.rn.ftz(float 0x380FFFFFC0000000)
+  ret i32 %res
+}
+
+define i32 @test_pos_subnormal_f2i_rp_ftz() {
+; CHECK-LABEL: define i32 @test_pos_subnormal_f2i_rp_ftz() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.f2i.rp.ftz(float 0x380FFFFFC0000000)
+  ret i32 %res
+}
+
+define i32 @test_pos_subnormal_f2i_rz_ftz() {
+; CHECK-LABEL: define i32 @test_pos_subnormal_f2i_rz_ftz() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.f2i.rz.ftz(float 0x380FFFFFC0000000)
+  ret i32 %res
+}
+;+-------------------------------------------------------------+
+;|                        d2i                                  |
+;+-------------------------------------------------------------+
+define i32 @test_pos_subnormal_d2i_rm() {
+; CHECK-LABEL: define i32 @test_pos_subnormal_d2i_rm() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.d2i.rm(double 0x000fffffffffffff)
+  ret i32 %res
+}
+
+define i32 @test_pos_subnormal_d2i_rn() {
+; CHECK-LABEL: define i32 @test_pos_subnormal_d2i_rn() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.d2i.rn(double 0x000fffffffffffff)
+  ret i32 %res
+}
+
+
+define i32 @test_pos_subnormal_d2i_rp() {
+; CHECK-LABEL: define i32 @test_pos_subnormal_d2i_rp() {
+; CHECK-NEXT:    ret i32 1
+;
+  %res = call i32 @llvm.nvvm.d2i.rp(double 0x000fffffffffffff)
+  ret i32 %res
+}
+
+define i32 @test_pos_subnormal_d2i_rz() {
+; CHECK-LABEL: define i32 @test_pos_subnormal_d2i_rz() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.d2i.rz(double 0x000fffffffffffff)
+  ret i32 %res
+}
+
+;+-------------------------------------------------------------+
+;|                        f2ui                                  |
+;+-------------------------------------------------------------+
+define i32 @test_pos_subnormal_f2ui_rm() {
+; CHECK-LABEL: define i32 @test_pos_subnormal_f2ui_rm() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.f2ui.rm(float 0x380FFFFFC0000000)
+  ret i32 %res
+}
+
+define i32 @test_pos_subnormal_f2ui_rn() {
+; CHECK-LABEL: define i32 @test_pos_subnormal_f2ui_rn() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.f2ui.rn(float 0x380FFFFFC0000000)
+  ret i32 %res
+}
+
+
+define i32 @test_pos_subnormal_f2ui_rp() {
+; CHECK-LABEL: define i32 @test_pos_subnormal_f2ui_rp() {
+; CHECK-NEXT:    ret i32 1
+;
+  %res = call i32 @llvm.nvvm.f2ui.rp(float 0x380FFFFFC0000000)
+  ret i32 %res
+}
+
+define i32 @test_pos_subnormal_f2ui_rz() {
+; CHECK-LABEL: define i32 @test_pos_subnormal_f2ui_rz() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.f2ui.rz(float 0x380FFFFFC0000000)
+  ret i32 %res
+}
+
+;+-------------------------------------------------------------+
+;|                      f2ui_ftz                                |
+;+-------------------------------------------------------------+
+define i32 @test_pos_subnormal_f2ui_rm_ftz() {
+; CHECK-LABEL: define i32 @test_pos_subnormal_f2ui_rm_ftz() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.f2ui.rm.ftz(float 0x380FFFFFC0000000)
+  ret i32 %res
+}
+
+define i32 @test_pos_subnormal_f2ui_rn_ftz() {
+; CHECK-LABEL: define i32 @test_pos_subnormal_f2ui_rn_ftz() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.f2ui.rn.ftz(float 0x380FFFFFC0000000)
+  ret i32 %res
+}
+
+define i32 @test_pos_subnormal_f2ui_rp_ftz() {
+; CHECK-LABEL: define i32 @test_pos_subnormal_f2ui_rp_ftz() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.f2ui.rp.ftz(float 0x380FFFFFC0000000)
+  ret i32 %res
+}
+
+define i32 @test_pos_subnormal_f2ui_rz_ftz() {
+; CHECK-LABEL: define i32 @test_pos_subnormal_f2ui_rz_ftz() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.f2ui.rz.ftz(float 0x380FFFFFC0000000)
+  ret i32 %res
+}
+;+-------------------------------------------------------------+
+;|                        d2ui                                  |
+;+-------------------------------------------------------------+
+define i32 @test_pos_subnormal_d2ui_rm() {
+; CHECK-LABEL: define i32 @test_pos_subnormal_d2ui_rm() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.d2ui.rm(double 0x000fffffffffffff)
+  ret i32 %res
+}
+
+define i32 @test_pos_subnormal_d2ui_rn() {
+; CHECK-LABEL: define i32 @test_pos_subnormal_d2ui_rn() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.d2ui.rn(double 0x000fffffffffffff)
+  ret i32 %res
+}
+
+
+define i32 @test_pos_subnormal_d2ui_rp() {
+; CHECK-LABEL: define i32 @test_pos_subnormal_d2ui_rp() {
+; CHECK-NEXT:    ret i32 1
+;
+  %res = call i32 @llvm.nvvm.d2ui.rp(double 0x000fffffffffffff)
+  ret i32 %res
+}
+
+define i32 @test_pos_subnormal_d2ui_rz() {
+; CHECK-LABEL: define i32 @test_pos_subnormal_d2ui_rz() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.d2ui.rz(double 0x000fffffffffffff)
+  ret i32 %res
+}
+
+;###############################################################
+;#            Tests with Negative Subnormal                    #
+;###############################################################
+
+;+-------------------------------------------------------------+
+;|                        f2i                                  |
+;+-------------------------------------------------------------+
+define i32 @test_neg_subnormal_f2i_rm() {
+; CHECK-LABEL: define i32 @test_neg_subnormal_f2i_rm() {
+; CHECK-NEXT:    ret i32 -1
+;
+  %res = call i32 @llvm.nvvm.f2i.rm(float 0xB80FFFFFC0000000)
+  ret i32 %res
+}
+
+define i32 @test_neg_subnormal_f2i_rn() {
+; CHECK-LABEL: define i32 @test_neg_subnormal_f2i_rn() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.f2i.rn(float 0xB80FFFFFC0000000)
+  ret i32 %res
+}
+
+
+define i32 @test_neg_subnormal_f2i_rp() {
+; CHECK-LABEL: define i32 @test_neg_subnormal_f2i_rp() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.f2i.rp(float 0xB80FFFFFC0000000)
+  ret i32 %res
+}
+
+define i32 @test_neg_subnormal_f2i_rz() {
+; CHECK-LABEL: define i32 @test_neg_subnormal_f2i_rz() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.f2i.rz(float 0xB80FFFFFC0000000)
+  ret i32 %res
+}
+
+;+-------------------------------------------------------------+
+;|                      f2i_ftz                                |
+;+-------------------------------------------------------------+
+define i32 @test_neg_subnormal_f2i_rm_ftz() {
+; CHECK-LABEL: define i32 @test_neg_subnormal_f2i_rm_ftz() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.f2i.rm.ftz(float 0xB80FFFFFC0000000)
+  ret i32 %res
+}
+
+define i32 @test_neg_subnormal_f2i_rn_ftz() {
+; CHECK-LABEL: define i32 @test_neg_subnormal_f2i_rn_ftz() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.f2i.rn.ftz(float 0xB80FFFFFC0000000)
+  ret i32 %res
+}
+
+define i32 @test_neg_subnormal_f2i_rp_ftz() {
+; CHECK-LABEL: define i32 @test_neg_subnormal_f2i_rp_ftz() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.f2i.rp.ftz(float 0xB80FFFFFC0000000)
+  ret i32 %res
+}
+
+define i32 @test_neg_subnormal_f2i_rz_ftz() {
+; CHECK-LABEL: define i32 @test_neg_subnormal_f2i_rz_ftz() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.f2i.rz.ftz(float 0xB80FFFFFC0000000)
+  ret i32 %res
+}
+;+-------------------------------------------------------------+
+;|                        d2i                                  |
+;+-------------------------------------------------------------+
+define i32 @test_neg_subnormal_d2i_rm() {
+; CHECK-LABEL: define i32 @test_neg_subnormal_d2i_rm() {
+; CHECK-NEXT:    ret i32 -1
+;
+  %res = call i32 @llvm.nvvm.d2i.rm(double 0x800fffffffffffff)
+  ret i32 %res
+}
+
+define i32 @test_neg_subnormal_d2i_rn() {
+; CHECK-LABEL: define i32 @test_neg_subnormal_d2i_rn() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.d2i.rn(double 0x800fffffffffffff)
+  ret i32 %res
+}
+
+
+define i32 @test_neg_subnormal_d2i_rp() {
+; CHECK-LABEL: define i32 @test_neg_subnormal_d2i_rp() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.d2i.rp(double 0x800fffffffffffff)
+  ret i32 %res
+}
+
+define i32 @test_neg_subnormal_d2i_rz() {
+; CHECK-LABEL: define i32 @test_neg_subnormal_d2i_rz() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.d2i.rz(double 0x800fffffffffffff)
+  ret i32 %res
+}
+
+;+-------------------------------------------------------------+
+;|                        f2ui                                  |
+;+-------------------------------------------------------------+
+define i32 @test_neg_subnormal_f2ui_rm() {
+; CHECK-LABEL: define i32 @test_neg_subnormal_f2ui_rm() {
+; CHECK-NEXT:    [[RES:%.*]] = call i32 @llvm.nvvm.f2ui.rm(float 0xB80FFFFFC0000000)
+; CHECK-NEXT:    ret i32 [[RES]]
+;
+  %res = call i32 @llvm.nvvm.f2ui.rm(float 0xB80FFFFFC0000000)
+  ret i32 %res
+}
+
+define i32 @test_neg_subnormal_f2ui_rn() {
+; CHECK-LABEL: define i32 @test_neg_subnormal_f2ui_rn() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.f2ui.rn(float 0xB80FFFFFC0000000)
+  ret i32 %res
+}
+
+
+define i32 @test_neg_subnormal_f2ui_rp() {
+; CHECK-LABEL: define i32 @test_neg_subnormal_f2ui_rp() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.f2ui.rp(float 0xB80FFFFFC0000000)
+  ret i32 %res
+}
+
+define i32 @test_neg_subnormal_f2ui_rz() {
+; CHECK-LABEL: define i32 @test_neg_subnormal_f2ui_rz() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.f2ui.rz(float 0xB80FFFFFC0000000)
+  ret i32 %res
+}
+
+;+-------------------------------------------------------------+
+;|                      f2ui_ftz                                |
+;+-------------------------------------------------------------+
+define i32 @test_neg_subnormal_f2ui_rm_ftz() {
+; CHECK-LABEL: define i32 @test_neg_subnormal_f2ui_rm_ftz() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.f2ui.rm.ftz(float 0xB80FFFFFC0000000)
+  ret i32 %res
+}
+
+define i32 @test_neg_subnormal_f2ui_rn_ftz() {
+; CHECK-LABEL: define i32 @test_neg_subnormal_f2ui_rn_ftz() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.f2ui.rn.ftz(float 0xB80FFFFFC0000000)
+  ret i32 %res
+}
+
+define i32 @test_neg_subnormal_f2ui_rp_ftz() {
+; CHECK-LABEL: define i32 @test_neg_subnormal_f2ui_rp_ftz() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.f2ui.rp.ftz(float 0xB80FFFFFC0000000)
+  ret i32 %res
+}
+
+define i32 @test_neg_subnormal_f2ui_rz_ftz() {
+; CHECK-LABEL: define i32 @test_neg_subnormal_f2ui_rz_ftz() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.f2ui.rz.ftz(float 0xB80FFFFFC0000000)
+  ret i32 %res
+}
+;+-------------------------------------------------------------+
+;|                        d2ui                                  |
+;+-------------------------------------------------------------+
+define i32 @test_neg_subnormal_d2ui_rm() {
+; CHECK-LABEL: define i32 @test_neg_subnormal_d2ui_rm() {
+; CHECK-NEXT:    [[RES:%.*]] = call i32 @llvm.nvvm.d2ui.rm(double 0x800FFFFFFFFFFFFF)
+; CHECK-NEXT:    ret i32 [[RES]]
+;
+  %res = call i32 @llvm.nvvm.d2ui.rm(double 0x800fffffffffffff)
+  ret i32 %res
+}
+
+define i32 @test_neg_subnormal_d2ui_rn() {
+; CHECK-LABEL: define i32 @test_neg_subnormal_d2ui_rn() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.d2ui.rn(double 0x800fffffffffffff)
+  ret i32 %res
+}
+
+
+define i32 @test_neg_subnormal_d2ui_rp() {
+; CHECK-LABEL: define i32 @test_neg_subnormal_d2ui_rp() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.d2ui.rp(double 0x800fffffffffffff)
+  ret i32 %res
+}
+
+define i32 @test_neg_subnormal_d2ui_rz() {
+; CHECK-LABEL: define i32 @test_neg_subnormal_d2ui_rz() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.d2ui.rz(double 0x800fffffffffffff)
+  ret i32 %res
+}
+
+declare i32 @llvm.nvvm.f2i.rm(float)
+declare i32 @llvm.nvvm.f2i.rn(float)
+declare i32 @llvm.nvvm.f2i.rp(float)
+declare i32 @llvm.nvvm.f2i.rz(float)
+
+declare i32 @llvm.nvvm.f2i.rm.ftz(float)
+declare i32 @llvm.nvvm.f2i.rn.ftz(float)
+declare i32 @llvm.nvvm.f2i.rp.ftz(float)
+declare i32 @llvm.nvvm.f2i.rz.ftz(float)
+
+declare i32 @llvm.nvvm.d2i.rm(double)
+declare i32 @llvm.nvvm.d2i.rn(double)
+declare i32 @llvm.nvvm.d2i.rp(double)
+declare i32 @llvm.nvvm.d2i.rz(double)
+
+
+declare i32 @llvm.nvvm.f2ui.rm(float)
+declare i32 @llvm.nvvm.f2ui.rn(float)
+declare i32 @llvm.nvvm.f2ui.rp(float)
+declare i32 @llvm.nvvm.f2ui.rz(float)
+
+declare i32 @llvm.nvvm.f2ui.rm.ftz(float)
+declare i32 @llvm.nvvm.f2ui.rn.ftz(float)
+declare i32 @llvm.nvvm.f2ui.rp.ftz(float)
+declare i32 @llvm.nvvm.f2ui.rz.ftz(float)
+
+declare i32 @llvm.nvvm.d2ui.rm(double)
+declare i32 @llvm.nvvm.d2ui.rn(double)
+declare i32 @llvm.nvvm.d2ui.rp(double)
+declare i32 @llvm.nvvm.d2ui.rz(double)
diff --git a/llvm/test/Transforms/InstSimplify/const-fold-nvvm-f2ll-d2ll.ll b/llvm/test/Transforms/InstSimplify/const-fold-nvvm-f2ll-d2ll.ll
new file mode 100644
index 0000000..be38177
--- /dev/null
+++ b/llvm/test/Transforms/InstSimplify/const-fold-nvvm-f2ll-d2ll.ll
@@ -0,0 +1,1129 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt < %s -passes=instsimplify -march=nvptx64 -S | FileCheck %s
+
+; f2ll/f2ull and d2ll/d2ull - double/float to i64 tests
+
+;###############################################################
+;#               Tests with Positive 1.5                       #
+;###############################################################
+
+;+-------------------------------------------------------------+
+;|                        f2ll                                  |
+;+-------------------------------------------------------------+
+define i64 @test_pos_1_5_f2ll_rm() {
+; CHECK-LABEL: define i64 @test_pos_1_5_f2ll_rm() {
+; CHECK-NEXT:    ret i64 1
+;
+  %res = call i64 @llvm.nvvm.f2ll.rm(float 1.5)
+  ret i64 %res
+}
+
+define i64 @test_pos_1_5_f2ll_rn() {
+; CHECK-LABEL: define i64 @test_pos_1_5_f2ll_rn() {
+; CHECK-NEXT:    ret i64 2
+;
+  %res = call i64 @llvm.nvvm.f2ll.rn(float 1.5)
+  ret i64 %res
+}
+
+
+define i64 @test_pos_1_5_f2ll_rp() {
+; CHECK-LABEL: define i64 @test_pos_1_5_f2ll_rp() {
+; CHECK-NEXT:    ret i64 2
+;
+  %res = call i64 @llvm.nvvm.f2ll.rp(float 1.5)
+  ret i64 %res
+}
+
+define i64 @test_pos_1_5_f2ll_rz() {
+; CHECK-LABEL: define i64 @test_pos_1_5_f2ll_rz() {
+; CHECK-NEXT:    ret i64 1
+;
+  %res = call i64 @llvm.nvvm.f2ll.rz(float 1.5)
+  ret i64 %res
+}
+
+;+-------------------------------------------------------------+
+;|                      f2ll_ftz                                |
+;+-------------------------------------------------------------+
+define i64 @test_pos_1_5_f2ll_rm_ftz() {
+; CHECK-LABEL: define i64 @test_pos_1_5_f2ll_rm_ftz() {
+; CHECK-NEXT:    ret i64 1
+;
+  %res = call i64 @llvm.nvvm.f2ll.rm.ftz(float 1.5)
+  ret i64 %res
+}
+
+define i64 @test_pos_1_5_f2ll_rn_ftz() {
+; CHECK-LABEL: define i64 @test_pos_1_5_f2ll_rn_ftz() {
+; CHECK-NEXT:    ret i64 2
+;
+  %res = call i64 @llvm.nvvm.f2ll.rn.ftz(float 1.5)
+  ret i64 %res
+}
+
+define i64 @test_pos_1_5_f2ll_rp_ftz() {
+; CHECK-LABEL: define i64 @test_pos_1_5_f2ll_rp_ftz() {
+; CHECK-NEXT:    ret i64 2
+;
+  %res = call i64 @llvm.nvvm.f2ll.rp.ftz(float 1.5)
+  ret i64 %res
+}
+
+define i64 @test_pos_1_5_f2ll_rz_ftz() {
+; CHECK-LABEL: define i64 @test_pos_1_5_f2ll_rz_ftz() {
+; CHECK-NEXT:    ret i64 1
+;
+  %res = call i64 @llvm.nvvm.f2ll.rz.ftz(float 1.5)
+  ret i64 %res
+}
+;+-------------------------------------------------------------+
+;|                        d2ll                                  |
+;+-------------------------------------------------------------+
+define i64 @test_pos_1_5_d2ll_rm() {
+; CHECK-LABEL: define i64 @test_pos_1_5_d2ll_rm() {
+; CHECK-NEXT:    ret i64 1
+;
+  %res = call i64 @llvm.nvvm.d2ll.rm(double 1.5)
+  ret i64 %res
+}
+
+define i64 @test_pos_1_5_d2ll_rn() {
+; CHECK-LABEL: define i64 @test_pos_1_5_d2ll_rn() {
+; CHECK-NEXT:    ret i64 2
+;
+  %res = call i64 @llvm.nvvm.d2ll.rn(double 1.5)
+  ret i64 %res
+}
+
+
+define i64 @test_pos_1_5_d2ll_rp() {
+; CHECK-LABEL: define i64 @test_pos_1_5_d2ll_rp() {
+; CHECK-NEXT:    ret i64 2
+;
+  %res = call i64 @llvm.nvvm.d2ll.rp(double 1.5)
+  ret i64 %res
+}
+
+define i64 @test_pos_1_5_d2ll_rz() {
+; CHECK-LABEL: define i64 @test_pos_1_5_d2ll_rz() {
+; CHECK-NEXT:    ret i64 1
+;
+  %res = call i64 @llvm.nvvm.d2ll.rz(double 1.5)
+  ret i64 %res
+}
+
+;+-------------------------------------------------------------+
+;|                        f2ull                                  |
+;+-------------------------------------------------------------+
+define i64 @test_pos_1_5_f2ull_rm() {
+; CHECK-LABEL: define i64 @test_pos_1_5_f2ull_rm() {
+; CHECK-NEXT:    ret i64 1
+;
+  %res = call i64 @llvm.nvvm.f2ull.rm(float 1.5)
+  ret i64 %res
+}
+
+define i64 @test_pos_1_5_f2ull_rn() {
+; CHECK-LABEL: define i64 @test_pos_1_5_f2ull_rn() {
+; CHECK-NEXT:    ret i64 2
+;
+  %res = call i64 @llvm.nvvm.f2ull.rn(float 1.5)
+  ret i64 %res
+}
+
+
+define i64 @test_pos_1_5_f2ull_rp() {
+; CHECK-LABEL: define i64 @test_pos_1_5_f2ull_rp() {
+; CHECK-NEXT:    ret i64 2
+;
+  %res = call i64 @llvm.nvvm.f2ull.rp(float 1.5)
+  ret i64 %res
+}
+
+define i64 @test_pos_1_5_f2ull_rz() {
+; CHECK-LABEL: define i64 @test_pos_1_5_f2ull_rz() {
+; CHECK-NEXT:    ret i64 1
+;
+  %res = call i64 @llvm.nvvm.f2ull.rz(float 1.5)
+  ret i64 %res
+}
+
+;+-------------------------------------------------------------+
+;|                      f2ull_ftz                                |
+;+-------------------------------------------------------------+
+define i64 @test_pos_1_5_f2ull_rm_ftz() {
+; CHECK-LABEL: define i64 @test_pos_1_5_f2ull_rm_ftz() {
+; CHECK-NEXT:    ret i64 1
+;
+  %res = call i64 @llvm.nvvm.f2ull.rm.ftz(float 1.5)
+  ret i64 %res
+}
+
+define i64 @test_pos_1_5_f2ull_rn_ftz() {
+; CHECK-LABEL: define i64 @test_pos_1_5_f2ull_rn_ftz() {
+; CHECK-NEXT:    ret i64 2
+;
+  %res = call i64 @llvm.nvvm.f2ull.rn.ftz(float 1.5)
+  ret i64 %res
+}
+
+define i64 @test_pos_1_5_f2ull_rp_ftz() {
+; CHECK-LABEL: define i64 @test_pos_1_5_f2ull_rp_ftz() {
+; CHECK-NEXT:    ret i64 2
+;
+  %res = call i64 @llvm.nvvm.f2ull.rp.ftz(float 1.5)
+  ret i64 %res
+}
+
+define i64 @test_pos_1_5_f2ull_rz_ftz() {
+; CHECK-LABEL: define i64 @test_pos_1_5_f2ull_rz_ftz() {
+; CHECK-NEXT:    ret i64 1
+;
+  %res = call i64 @llvm.nvvm.f2ull.rz.ftz(float 1.5)
+  ret i64 %res
+}
+;+-------------------------------------------------------------+
+;|                        d2ull                                  |
+;+-------------------------------------------------------------+
+define i64 @test_pos_1_5_d2ull_rm() {
+; CHECK-LABEL: define i64 @test_pos_1_5_d2ull_rm() {
+; CHECK-NEXT:    ret i64 1
+;
+  %res = call i64 @llvm.nvvm.d2ull.rm(double 1.5)
+  ret i64 %res
+}
+
+define i64 @test_pos_1_5_d2ull_rn() {
+; CHECK-LABEL: define i64 @test_pos_1_5_d2ull_rn() {
+; CHECK-NEXT:    ret i64 2
+;
+  %res = call i64 @llvm.nvvm.d2ull.rn(double 1.5)
+  ret i64 %res
+}
+
+
+define i64 @test_pos_1_5_d2ull_rp() {
+; CHECK-LABEL: define i64 @test_pos_1_5_d2ull_rp() {
+; CHECK-NEXT:    ret i64 2
+;
+  %res = call i64 @llvm.nvvm.d2ull.rp(double 1.5)
+  ret i64 %res
+}
+
+define i64 @test_pos_1_5_d2ull_rz() {
+; CHECK-LABEL: define i64 @test_pos_1_5_d2ull_rz() {
+; CHECK-NEXT:    ret i64 1
+;
+  %res = call i64 @llvm.nvvm.d2ull.rz(double 1.5)
+  ret i64 %res
+}
+
+;###############################################################
+;#               Tests with Negative 1.5                       #
+;###############################################################
+
+;+-------------------------------------------------------------+
+;|                        f2ll                                  |
+;+-------------------------------------------------------------+
+define i64 @test_neg_1_5_f2ll_rm() {
+; CHECK-LABEL: define i64 @test_neg_1_5_f2ll_rm() {
+; CHECK-NEXT:    ret i64 -2
+;
+  %res = call i64 @llvm.nvvm.f2ll.rm(float -1.5)
+  ret i64 %res
+}
+
+define i64 @test_neg_1_5_f2ll_rn() {
+; CHECK-LABEL: define i64 @test_neg_1_5_f2ll_rn() {
+; CHECK-NEXT:    ret i64 -2
+;
+  %res = call i64 @llvm.nvvm.f2ll.rn(float -1.5)
+  ret i64 %res
+}
+
+
+define i64 @test_neg_1_5_f2ll_rp() {
+; CHECK-LABEL: define i64 @test_neg_1_5_f2ll_rp() {
+; CHECK-NEXT:    ret i64 -1
+;
+  %res = call i64 @llvm.nvvm.f2ll.rp(float -1.5)
+  ret i64 %res
+}
+
+define i64 @test_neg_1_5_f2ll_rz() {
+; CHECK-LABEL: define i64 @test_neg_1_5_f2ll_rz() {
+; CHECK-NEXT:    ret i64 -1
+;
+  %res = call i64 @llvm.nvvm.f2ll.rz(float -1.5)
+  ret i64 %res
+}
+
+;+-------------------------------------------------------------+
+;|                      f2ll_ftz                                |
+;+-------------------------------------------------------------+
+define i64 @test_neg_1_5_f2ll_rm_ftz() {
+; CHECK-LABEL: define i64 @test_neg_1_5_f2ll_rm_ftz() {
+; CHECK-NEXT:    ret i64 -2
+;
+  %res = call i64 @llvm.nvvm.f2ll.rm.ftz(float -1.5)
+  ret i64 %res
+}
+
+define i64 @test_neg_1_5_f2ll_rn_ftz() {
+; CHECK-LABEL: define i64 @test_neg_1_5_f2ll_rn_ftz() {
+; CHECK-NEXT:    ret i64 -2
+;
+  %res = call i64 @llvm.nvvm.f2ll.rn.ftz(float -1.5)
+  ret i64 %res
+}
+
+define i64 @test_neg_1_5_f2ll_rp_ftz() {
+; CHECK-LABEL: define i64 @test_neg_1_5_f2ll_rp_ftz() {
+; CHECK-NEXT:    ret i64 -1
+;
+  %res = call i64 @llvm.nvvm.f2ll.rp.ftz(float -1.5)
+  ret i64 %res
+}
+
+define i64 @test_neg_1_5_f2ll_rz_ftz() {
+; CHECK-LABEL: define i64 @test_neg_1_5_f2ll_rz_ftz() {
+; CHECK-NEXT:    ret i64 -1
+;
+  %res = call i64 @llvm.nvvm.f2ll.rz.ftz(float -1.5)
+  ret i64 %res
+}
+;+-------------------------------------------------------------+
+;|                        d2ll                                  |
+;+-------------------------------------------------------------+
+define i64 @test_neg_1_5_d2ll_rm() {
+; CHECK-LABEL: define i64 @test_neg_1_5_d2ll_rm() {
+; CHECK-NEXT:    ret i64 -2
+;
+  %res = call i64 @llvm.nvvm.d2ll.rm(double -1.5)
+  ret i64 %res
+}
+
+define i64 @test_neg_1_5_d2ll_rn() {
+; CHECK-LABEL: define i64 @test_neg_1_5_d2ll_rn() {
+; CHECK-NEXT:    ret i64 -2
+;
+  %res = call i64 @llvm.nvvm.d2ll.rn(double -1.5)
+  ret i64 %res
+}
+
+
+define i64 @test_neg_1_5_d2ll_rp() {
+; CHECK-LABEL: define i64 @test_neg_1_5_d2ll_rp() {
+; CHECK-NEXT:    ret i64 -1
+;
+  %res = call i64 @llvm.nvvm.d2ll.rp(double -1.5)
+  ret i64 %res
+}
+
+define i64 @test_neg_1_5_d2ll_rz() {
+; CHECK-LABEL: define i64 @test_neg_1_5_d2ll_rz() {
+; CHECK-NEXT:    ret i64 -1
+;
+  %res = call i64 @llvm.nvvm.d2ll.rz(double -1.5)
+  ret i64 %res
+}
+
+;+-------------------------------------------------------------+
+;|                        f2ull                                  |
+;+-------------------------------------------------------------+
+define i64 @test_neg_1_5_f2ull_rm() {
+; CHECK-LABEL: define i64 @test_neg_1_5_f2ull_rm() {
+; CHECK-NEXT:    [[RES:%.*]] = call i64 @llvm.nvvm.f2ull.rm(float -1.500000e+00)
+; CHECK-NEXT:    ret i64 [[RES]]
+;
+  %res = call i64 @llvm.nvvm.f2ull.rm(float -1.5)
+  ret i64 %res
+}
+
+define i64 @test_neg_1_5_f2ull_rn() {
+; CHECK-LABEL: define i64 @test_neg_1_5_f2ull_rn() {
+; CHECK-NEXT:    [[RES:%.*]] = call i64 @llvm.nvvm.f2ull.rn(float -1.500000e+00)
+; CHECK-NEXT:    ret i64 [[RES]]
+;
+  %res = call i64 @llvm.nvvm.f2ull.rn(float -1.5)
+  ret i64 %res
+}
+
+
+define i64 @test_neg_1_5_f2ull_rp() {
+; CHECK-LABEL: define i64 @test_neg_1_5_f2ull_rp() {
+; CHECK-NEXT:    [[RES:%.*]] = call i64 @llvm.nvvm.f2ull.rp(float -1.500000e+00)
+; CHECK-NEXT:    ret i64 [[RES]]
+;
+  %res = call i64 @llvm.nvvm.f2ull.rp(float -1.5)
+  ret i64 %res
+}
+
+define i64 @test_neg_1_5_f2ull_rz() {
+; CHECK-LABEL: define i64 @test_neg_1_5_f2ull_rz() {
+; CHECK-NEXT:    [[RES:%.*]] = call i64 @llvm.nvvm.f2ull.rz(float -1.500000e+00)
+; CHECK-NEXT:    ret i64 [[RES]]
+;
+  %res = call i64 @llvm.nvvm.f2ull.rz(float -1.5)
+  ret i64 %res
+}
+
+;+-------------------------------------------------------------+
+;|                      f2ull_ftz                                |
+;+-------------------------------------------------------------+
+define i64 @test_neg_1_5_f2ull_rm_ftz() {
+; CHECK-LABEL: define i64 @test_neg_1_5_f2ull_rm_ftz() {
+; CHECK-NEXT:    [[RES:%.*]] = call i64 @llvm.nvvm.f2ull.rm.ftz(float -1.500000e+00)
+; CHECK-NEXT:    ret i64 [[RES]]
+;
+  %res = call i64 @llvm.nvvm.f2ull.rm.ftz(float -1.5)
+  ret i64 %res
+}
+
+define i64 @test_neg_1_5_f2ull_rn_ftz() {
+; CHECK-LABEL: define i64 @test_neg_1_5_f2ull_rn_ftz() {
+; CHECK-NEXT:    [[RES:%.*]] = call i64 @llvm.nvvm.f2ull.rn.ftz(float -1.500000e+00)
+; CHECK-NEXT:    ret i64 [[RES]]
+;
+  %res = call i64 @llvm.nvvm.f2ull.rn.ftz(float -1.5)
+  ret i64 %res
+}
+
+define i64 @test_neg_1_5_f2ull_rp_ftz() {
+; CHECK-LABEL: define i64 @test_neg_1_5_f2ull_rp_ftz() {
+; CHECK-NEXT:    [[RES:%.*]] = call i64 @llvm.nvvm.f2ull.rp.ftz(float -1.500000e+00)
+; CHECK-NEXT:    ret i64 [[RES]]
+;
+  %res = call i64 @llvm.nvvm.f2ull.rp.ftz(float -1.5)
+  ret i64 %res
+}
+
+define i64 @test_neg_1_5_f2ull_rz_ftz() {
+; CHECK-LABEL: define i64 @test_neg_1_5_f2ull_rz_ftz() {
+; CHECK-NEXT:    [[RES:%.*]] = call i64 @llvm.nvvm.f2ull.rz.ftz(float -1.500000e+00)
+; CHECK-NEXT:    ret i64 [[RES]]
+;
+  %res = call i64 @llvm.nvvm.f2ull.rz.ftz(float -1.5)
+  ret i64 %res
+}
+;+-------------------------------------------------------------+
+;|                        d2ull                                  |
+;+-------------------------------------------------------------+
+define i64 @test_neg_1_5_d2ull_rm() {
+; CHECK-LABEL: define i64 @test_neg_1_5_d2ull_rm() {
+; CHECK-NEXT:    [[RES:%.*]] = call i64 @llvm.nvvm.d2ull.rm(double -1.500000e+00)
+; CHECK-NEXT:    ret i64 [[RES]]
+;
+  %res = call i64 @llvm.nvvm.d2ull.rm(double -1.5)
+  ret i64 %res
+}
+
+define i64 @test_neg_1_5_d2ull_rn() {
+; CHECK-LABEL: define i64 @test_neg_1_5_d2ull_rn() {
+; CHECK-NEXT:    [[RES:%.*]] = call i64 @llvm.nvvm.d2ull.rn(double -1.500000e+00)
+; CHECK-NEXT:    ret i64 [[RES]]
+;
+  %res = call i64 @llvm.nvvm.d2ull.rn(double -1.5)
+  ret i64 %res
+}
+
+
+define i64 @test_neg_1_5_d2ull_rp() {
+; CHECK-LABEL: define i64 @test_neg_1_5_d2ull_rp() {
+; CHECK-NEXT:    [[RES:%.*]] = call i64 @llvm.nvvm.d2ull.rp(double -1.500000e+00)
+; CHECK-NEXT:    ret i64 [[RES]]
+;
+  %res = call i64 @llvm.nvvm.d2ull.rp(double -1.5)
+  ret i64 %res
+}
+
+define i64 @test_neg_1_5_d2ull_rz() {
+; CHECK-LABEL: define i64 @test_neg_1_5_d2ull_rz() {
+; CHECK-NEXT:    [[RES:%.*]] = call i64 @llvm.nvvm.d2ull.rz(double -1.500000e+00)
+; CHECK-NEXT:    ret i64 [[RES]]
+;
+  %res = call i64 @llvm.nvvm.d2ull.rz(double -1.5)
+  ret i64 %res
+}
+
+;###############################################################
+;#                    Tests with NaN                           #
+;###############################################################
+
+;+-------------------------------------------------------------+
+;|                        f2ll                                  |
+;+-------------------------------------------------------------+
+define i64 @test_nan_f2ll_rm() {
+; CHECK-LABEL: define i64 @test_nan_f2ll_rm() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.f2ll.rm(float 0x7FFFFF0000000000)
+  ret i64 %res
+}
+
+define i64 @test_nan_f2ll_rn() {
+; CHECK-LABEL: define i64 @test_nan_f2ll_rn() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.f2ll.rn(float 0x7FFFFF0000000000)
+  ret i64 %res
+}
+
+
+define i64 @test_nan_f2ll_rp() {
+; CHECK-LABEL: define i64 @test_nan_f2ll_rp() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.f2ll.rp(float 0x7FFFFF0000000000)
+  ret i64 %res
+}
+
+define i64 @test_nan_f2ll_rz() {
+; CHECK-LABEL: define i64 @test_nan_f2ll_rz() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.f2ll.rz(float 0x7FFFFF0000000000)
+  ret i64 %res
+}
+
+;+-------------------------------------------------------------+
+;|                      f2ll_ftz                                |
+;+-------------------------------------------------------------+
+define i64 @test_nan_f2ll_rm_ftz() {
+; CHECK-LABEL: define i64 @test_nan_f2ll_rm_ftz() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.f2ll.rm.ftz(float 0x7FFFFF0000000000)
+  ret i64 %res
+}
+
+define i64 @test_nan_f2ll_rn_ftz() {
+; CHECK-LABEL: define i64 @test_nan_f2ll_rn_ftz() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.f2ll.rn.ftz(float 0x7FFFFF0000000000)
+  ret i64 %res
+}
+
+define i64 @test_nan_f2ll_rp_ftz() {
+; CHECK-LABEL: define i64 @test_nan_f2ll_rp_ftz() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.f2ll.rp.ftz(float 0x7FFFFF0000000000)
+  ret i64 %res
+}
+
+define i64 @test_nan_f2ll_rz_ftz() {
+; CHECK-LABEL: define i64 @test_nan_f2ll_rz_ftz() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.f2ll.rz.ftz(float 0x7FFFFF0000000000)
+  ret i64 %res
+}
+;+-------------------------------------------------------------+
+;|                        d2ll                                  |
+;+-------------------------------------------------------------+
+define i64 @test_nan_d2ll_rm() {
+; CHECK-LABEL: define i64 @test_nan_d2ll_rm() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.d2ll.rm(double 0xFFF8000000000000)
+  ret i64 %res
+}
+
+define i64 @test_nan_d2ll_rn() {
+; CHECK-LABEL: define i64 @test_nan_d2ll_rn() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.d2ll.rn(double 0xFFF8000000000000)
+  ret i64 %res
+}
+
+
+define i64 @test_nan_d2ll_rp() {
+; CHECK-LABEL: define i64 @test_nan_d2ll_rp() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.d2ll.rp(double 0xFFF8000000000000)
+  ret i64 %res
+}
+
+define i64 @test_nan_d2ll_rz() {
+; CHECK-LABEL: define i64 @test_nan_d2ll_rz() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.d2ll.rz(double 0xFFF8000000000000)
+  ret i64 %res
+}
+
+;+-------------------------------------------------------------+
+;|                        f2ull                                  |
+;+-------------------------------------------------------------+
+define i64 @test_nan_f2ull_rm() {
+; CHECK-LABEL: define i64 @test_nan_f2ull_rm() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.f2ull.rm(float 0x7FFFFF0000000000)
+  ret i64 %res
+}
+
+define i64 @test_nan_f2ull_rn() {
+; CHECK-LABEL: define i64 @test_nan_f2ull_rn() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.f2ull.rn(float 0x7FFFFF0000000000)
+  ret i64 %res
+}
+
+
+define i64 @test_nan_f2ull_rp() {
+; CHECK-LABEL: define i64 @test_nan_f2ull_rp() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.f2ull.rp(float 0x7FFFFF0000000000)
+  ret i64 %res
+}
+
+define i64 @test_nan_f2ull_rz() {
+; CHECK-LABEL: define i64 @test_nan_f2ull_rz() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.f2ull.rz(float 0x7FFFFF0000000000)
+  ret i64 %res
+}
+
+;+-------------------------------------------------------------+
+;|                      f2ull_ftz                                |
+;+-------------------------------------------------------------+
+define i64 @test_nan_f2ull_rm_ftz() {
+; CHECK-LABEL: define i64 @test_nan_f2ull_rm_ftz() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.f2ull.rm.ftz(float 0x7FFFFF0000000000)
+  ret i64 %res
+}
+
+define i64 @test_nan_f2ull_rn_ftz() {
+; CHECK-LABEL: define i64 @test_nan_f2ull_rn_ftz() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.f2ull.rn.ftz(float 0x7FFFFF0000000000)
+  ret i64 %res
+}
+
+define i64 @test_nan_f2ull_rp_ftz() {
+; CHECK-LABEL: define i64 @test_nan_f2ull_rp_ftz() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.f2ull.rp.ftz(float 0x7FFFFF0000000000)
+  ret i64 %res
+}
+
+define i64 @test_nan_f2ull_rz_ftz() {
+; CHECK-LABEL: define i64 @test_nan_f2ull_rz_ftz() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.f2ull.rz.ftz(float 0x7FFFFF0000000000)
+  ret i64 %res
+}
+;+-------------------------------------------------------------+
+;|                        d2ull                                  |
+;+-------------------------------------------------------------+
+define i64 @test_nan_d2ull_rm() {
+; CHECK-LABEL: define i64 @test_nan_d2ull_rm() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.d2ull.rm(double 0xFFF8000000000000)
+  ret i64 %res
+}
+
+define i64 @test_nan_d2ull_rn() {
+; CHECK-LABEL: define i64 @test_nan_d2ull_rn() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.d2ull.rn(double 0xFFF8000000000000)
+  ret i64 %res
+}
+
+
+define i64 @test_nan_d2ull_rp() {
+; CHECK-LABEL: define i64 @test_nan_d2ull_rp() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.d2ull.rp(double 0xFFF8000000000000)
+  ret i64 %res
+}
+
+define i64 @test_nan_d2ull_rz() {
+; CHECK-LABEL: define i64 @test_nan_d2ull_rz() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.d2ull.rz(double 0xFFF8000000000000)
+  ret i64 %res
+}
+
+;###############################################################
+;#            Tests with Positive Subnormal                    #
+;###############################################################
+
+;+-------------------------------------------------------------+
+;|                        f2ll                                  |
+;+-------------------------------------------------------------+
+define i64 @test_pos_subnormal_f2ll_rm() {
+; CHECK-LABEL: define i64 @test_pos_subnormal_f2ll_rm() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.f2ll.rm(float 0x380FFFFFC0000000)
+  ret i64 %res
+}
+
+define i64 @test_pos_subnormal_f2ll_rn() {
+; CHECK-LABEL: define i64 @test_pos_subnormal_f2ll_rn() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.f2ll.rn(float 0x380FFFFFC0000000)
+  ret i64 %res
+}
+
+
+define i64 @test_pos_subnormal_f2ll_rp() {
+; CHECK-LABEL: define i64 @test_pos_subnormal_f2ll_rp() {
+; CHECK-NEXT:    ret i64 1
+;
+  %res = call i64 @llvm.nvvm.f2ll.rp(float 0x380FFFFFC0000000)
+  ret i64 %res
+}
+
+define i64 @test_pos_subnormal_f2ll_rz() {
+; CHECK-LABEL: define i64 @test_pos_subnormal_f2ll_rz() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.f2ll.rz(float 0x380FFFFFC0000000)
+  ret i64 %res
+}
+
+;+-------------------------------------------------------------+
+;|                      f2ll_ftz                                |
+;+-------------------------------------------------------------+
+define i64 @test_pos_subnormal_f2ll_rm_ftz() {
+; CHECK-LABEL: define i64 @test_pos_subnormal_f2ll_rm_ftz() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.f2ll.rm.ftz(float 0x380FFFFFC0000000)
+  ret i64 %res
+}
+
+define i64 @test_pos_subnormal_f2ll_rn_ftz() {
+; CHECK-LABEL: define i64 @test_pos_subnormal_f2ll_rn_ftz() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.f2ll.rn.ftz(float 0x380FFFFFC0000000)
+  ret i64 %res
+}
+
+define i64 @test_pos_subnormal_f2ll_rp_ftz() {
+; CHECK-LABEL: define i64 @test_pos_subnormal_f2ll_rp_ftz() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.f2ll.rp.ftz(float 0x380FFFFFC0000000)
+  ret i64 %res
+}
+
+define i64 @test_pos_subnormal_f2ll_rz_ftz() {
+; CHECK-LABEL: define i64 @test_pos_subnormal_f2ll_rz_ftz() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.f2ll.rz.ftz(float 0x380FFFFFC0000000)
+  ret i64 %res
+}
+;+-------------------------------------------------------------+
+;|                        d2ll                                  |
+;+-------------------------------------------------------------+
+define i64 @test_pos_subnormal_d2ll_rm() {
+; CHECK-LABEL: define i64 @test_pos_subnormal_d2ll_rm() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.d2ll.rm(double 0x000fffffffffffff)
+  ret i64 %res
+}
+
+define i64 @test_pos_subnormal_d2ll_rn() {
+; CHECK-LABEL: define i64 @test_pos_subnormal_d2ll_rn() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.d2ll.rn(double 0x000fffffffffffff)
+  ret i64 %res
+}
+
+
+define i64 @test_pos_subnormal_d2ll_rp() {
+; CHECK-LABEL: define i64 @test_pos_subnormal_d2ll_rp() {
+; CHECK-NEXT:    ret i64 1
+;
+  %res = call i64 @llvm.nvvm.d2ll.rp(double 0x000fffffffffffff)
+  ret i64 %res
+}
+
+define i64 @test_pos_subnormal_d2ll_rz() {
+; CHECK-LABEL: define i64 @test_pos_subnormal_d2ll_rz() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.d2ll.rz(double 0x000fffffffffffff)
+  ret i64 %res
+}
+
+;+-------------------------------------------------------------+
+;|                        f2ull                                  |
+;+-------------------------------------------------------------+
+define i64 @test_pos_subnormal_f2ull_rm() {
+; CHECK-LABEL: define i64 @test_pos_subnormal_f2ull_rm() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.f2ull.rm(float 0x380FFFFFC0000000)
+  ret i64 %res
+}
+
+define i64 @test_pos_subnormal_f2ull_rn() {
+; CHECK-LABEL: define i64 @test_pos_subnormal_f2ull_rn() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.f2ull.rn(float 0x380FFFFFC0000000)
+  ret i64 %res
+}
+
+
+define i64 @test_pos_subnormal_f2ull_rp() {
+; CHECK-LABEL: define i64 @test_pos_subnormal_f2ull_rp() {
+; CHECK-NEXT:    ret i64 1
+;
+  %res = call i64 @llvm.nvvm.f2ull.rp(float 0x380FFFFFC0000000)
+  ret i64 %res
+}
+
+define i64 @test_pos_subnormal_f2ull_rz() {
+; CHECK-LABEL: define i64 @test_pos_subnormal_f2ull_rz() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.f2ull.rz(float 0x380FFFFFC0000000)
+  ret i64 %res
+}
+
+;+-------------------------------------------------------------+
+;|                      f2ull_ftz                                |
+;+-------------------------------------------------------------+
+define i64 @test_pos_subnormal_f2ull_rm_ftz() {
+; CHECK-LABEL: define i64 @test_pos_subnormal_f2ull_rm_ftz() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.f2ull.rm.ftz(float 0x380FFFFFC0000000)
+  ret i64 %res
+}
+
+define i64 @test_pos_subnormal_f2ull_rn_ftz() {
+; CHECK-LABEL: define i64 @test_pos_subnormal_f2ull_rn_ftz() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.f2ull.rn.ftz(float 0x380FFFFFC0000000)
+  ret i64 %res
+}
+
+define i64 @test_pos_subnormal_f2ull_rp_ftz() {
+; CHECK-LABEL: define i64 @test_pos_subnormal_f2ull_rp_ftz() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.f2ull.rp.ftz(float 0x380FFFFFC0000000)
+  ret i64 %res
+}
+
+define i64 @test_pos_subnormal_f2ull_rz_ftz() {
+; CHECK-LABEL: define i64 @test_pos_subnormal_f2ull_rz_ftz() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.f2ull.rz.ftz(float 0x380FFFFFC0000000)
+  ret i64 %res
+}
+;+-------------------------------------------------------------+
+;|                        d2ull                                  |
+;+-------------------------------------------------------------+
+define i64 @test_pos_subnormal_d2ull_rm() {
+; CHECK-LABEL: define i64 @test_pos_subnormal_d2ull_rm() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.d2ull.rm(double 0x000fffffffffffff)
+  ret i64 %res
+}
+
+define i64 @test_pos_subnormal_d2ull_rn() {
+; CHECK-LABEL: define i64 @test_pos_subnormal_d2ull_rn() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.d2ull.rn(double 0x000fffffffffffff)
+  ret i64 %res
+}
+
+
+define i64 @test_pos_subnormal_d2ull_rp() {
+; CHECK-LABEL: define i64 @test_pos_subnormal_d2ull_rp() {
+; CHECK-NEXT:    ret i64 1
+;
+  %res = call i64 @llvm.nvvm.d2ull.rp(double 0x000fffffffffffff)
+  ret i64 %res
+}
+
+define i64 @test_pos_subnormal_d2ull_rz() {
+; CHECK-LABEL: define i64 @test_pos_subnormal_d2ull_rz() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.d2ull.rz(double 0x000fffffffffffff)
+  ret i64 %res
+}
+
+;###############################################################
+;#            Tests with Negative Subnormal                    #
+;###############################################################
+
+;+-------------------------------------------------------------+
+;|                        f2ll                                  |
+;+-------------------------------------------------------------+
+define i64 @test_neg_subnormal_f2ll_rm() {
+; CHECK-LABEL: define i64 @test_neg_subnormal_f2ll_rm() {
+; CHECK-NEXT:    ret i64 -1
+;
+  %res = call i64 @llvm.nvvm.f2ll.rm(float 0xB80FFFFFC0000000)
+  ret i64 %res
+}
+
+define i64 @test_neg_subnormal_f2ll_rn() {
+; CHECK-LABEL: define i64 @test_neg_subnormal_f2ll_rn() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.f2ll.rn(float 0xB80FFFFFC0000000)
+  ret i64 %res
+}
+
+
+define i64 @test_neg_subnormal_f2ll_rp() {
+; CHECK-LABEL: define i64 @test_neg_subnormal_f2ll_rp() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.f2ll.rp(float 0xB80FFFFFC0000000)
+  ret i64 %res
+}
+
+define i64 @test_neg_subnormal_f2ll_rz() {
+; CHECK-LABEL: define i64 @test_neg_subnormal_f2ll_rz() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.f2ll.rz(float 0xB80FFFFFC0000000)
+  ret i64 %res
+}
+
+;+-------------------------------------------------------------+
+;|                      f2ll_ftz                                |
+;+-------------------------------------------------------------+
+define i64 @test_neg_subnormal_f2ll_rm_ftz() {
+; CHECK-LABEL: define i64 @test_neg_subnormal_f2ll_rm_ftz() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.f2ll.rm.ftz(float 0xB80FFFFFC0000000)
+  ret i64 %res
+}
+
+define i64 @test_neg_subnormal_f2ll_rn_ftz() {
+; CHECK-LABEL: define i64 @test_neg_subnormal_f2ll_rn_ftz() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.f2ll.rn.ftz(float 0xB80FFFFFC0000000)
+  ret i64 %res
+}
+
+define i64 @test_neg_subnormal_f2ll_rp_ftz() {
+; CHECK-LABEL: define i64 @test_neg_subnormal_f2ll_rp_ftz() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.f2ll.rp.ftz(float 0xB80FFFFFC0000000)
+  ret i64 %res
+}
+
+define i64 @test_neg_subnormal_f2ll_rz_ftz() {
+; CHECK-LABEL: define i64 @test_neg_subnormal_f2ll_rz_ftz() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.f2ll.rz.ftz(float 0xB80FFFFFC0000000)
+  ret i64 %res
+}
+;+-------------------------------------------------------------+
+;|                        d2ll                                  |
+;+-------------------------------------------------------------+
+define i64 @test_neg_subnormal_d2ll_rm() {
+; CHECK-LABEL: define i64 @test_neg_subnormal_d2ll_rm() {
+; CHECK-NEXT:    ret i64 -1
+;
+  %res = call i64 @llvm.nvvm.d2ll.rm(double 0x800fffffffffffff)
+  ret i64 %res
+}
+
+define i64 @test_neg_subnormal_d2ll_rn() {
+; CHECK-LABEL: define i64 @test_neg_subnormal_d2ll_rn() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.d2ll.rn(double 0x800fffffffffffff)
+  ret i64 %res
+}
+
+
+define i64 @test_neg_subnormal_d2ll_rp() {
+; CHECK-LABEL: define i64 @test_neg_subnormal_d2ll_rp() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.d2ll.rp(double 0x800fffffffffffff)
+  ret i64 %res
+}
+
+define i64 @test_neg_subnormal_d2ll_rz() {
+; CHECK-LABEL: define i64 @test_neg_subnormal_d2ll_rz() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.d2ll.rz(double 0x800fffffffffffff)
+  ret i64 %res
+}
+
+;+-------------------------------------------------------------+
+;|                        f2ull                                  |
+;+-------------------------------------------------------------+
+define i64 @test_neg_subnormal_f2ull_rm() {
+; CHECK-LABEL: define i64 @test_neg_subnormal_f2ull_rm() {
+; CHECK-NEXT:    [[RES:%.*]] = call i64 @llvm.nvvm.f2ull.rm(float 0xB80FFFFFC0000000)
+; CHECK-NEXT:    ret i64 [[RES]]
+;
+  %res = call i64 @llvm.nvvm.f2ull.rm(float 0xB80FFFFFC0000000)
+  ret i64 %res
+}
+
+define i64 @test_neg_subnormal_f2ull_rn() {
+; CHECK-LABEL: define i64 @test_neg_subnormal_f2ull_rn() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.f2ull.rn(float 0xB80FFFFFC0000000)
+  ret i64 %res
+}
+
+
+define i64 @test_neg_subnormal_f2ull_rp() {
+; CHECK-LABEL: define i64 @test_neg_subnormal_f2ull_rp() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.f2ull.rp(float 0xB80FFFFFC0000000)
+  ret i64 %res
+}
+
+define i64 @test_neg_subnormal_f2ull_rz() {
+; CHECK-LABEL: define i64 @test_neg_subnormal_f2ull_rz() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.f2ull.rz(float 0xB80FFFFFC0000000)
+  ret i64 %res
+}
+
+;+-------------------------------------------------------------+
+;|                      f2ull_ftz                                |
+;+-------------------------------------------------------------+
+define i64 @test_neg_subnormal_f2ull_rm_ftz() {
+; CHECK-LABEL: define i64 @test_neg_subnormal_f2ull_rm_ftz() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.f2ull.rm.ftz(float 0xB80FFFFFC0000000)
+  ret i64 %res
+}
+
+define i64 @test_neg_subnormal_f2ull_rn_ftz() {
+; CHECK-LABEL: define i64 @test_neg_subnormal_f2ull_rn_ftz() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.f2ull.rn.ftz(float 0xB80FFFFFC0000000)
+  ret i64 %res
+}
+
+define i64 @test_neg_subnormal_f2ull_rp_ftz() {
+; CHECK-LABEL: define i64 @test_neg_subnormal_f2ull_rp_ftz() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.f2ull.rp.ftz(float 0xB80FFFFFC0000000)
+  ret i64 %res
+}
+
+define i64 @test_neg_subnormal_f2ull_rz_ftz() {
+; CHECK-LABEL: define i64 @test_neg_subnormal_f2ull_rz_ftz() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.f2ull.rz.ftz(float 0xB80FFFFFC0000000)
+  ret i64 %res
+}
+;+-------------------------------------------------------------+
+;|                        d2ull                                  |
+;+-------------------------------------------------------------+
+define i64 @test_neg_subnormal_d2ull_rm() {
+; CHECK-LABEL: define i64 @test_neg_subnormal_d2ull_rm() {
+; CHECK-NEXT:    [[RES:%.*]] = call i64 @llvm.nvvm.d2ull.rm(double 0x800FFFFFFFFFFFFF)
+; CHECK-NEXT:    ret i64 [[RES]]
+;
+  %res = call i64 @llvm.nvvm.d2ull.rm(double 0x800fffffffffffff)
+  ret i64 %res
+}
+
+define i64 @test_neg_subnormal_d2ull_rn() {
+; CHECK-LABEL: define i64 @test_neg_subnormal_d2ull_rn() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.d2ull.rn(double 0x800fffffffffffff)
+  ret i64 %res
+}
+
+
+define i64 @test_neg_subnormal_d2ull_rp() {
+; CHECK-LABEL: define i64 @test_neg_subnormal_d2ull_rp() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.d2ull.rp(double 0x800fffffffffffff)
+  ret i64 %res
+}
+
+define i64 @test_neg_subnormal_d2ull_rz() {
+; CHECK-LABEL: define i64 @test_neg_subnormal_d2ull_rz() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.d2ull.rz(double 0x800fffffffffffff)
+  ret i64 %res
+}
+
+declare i64 @llvm.nvvm.f2ll.rm(float)
+declare i64 @llvm.nvvm.f2ll.rn(float)
+declare i64 @llvm.nvvm.f2ll.rp(float)
+declare i64 @llvm.nvvm.f2ll.rz(float)
+
+declare i64 @llvm.nvvm.f2ll.rm.ftz(float)
+declare i64 @llvm.nvvm.f2ll.rn.ftz(float)
+declare i64 @llvm.nvvm.f2ll.rp.ftz(float)
+declare i64 @llvm.nvvm.f2ll.rz.ftz(float)
+
+declare i64 @llvm.nvvm.d2ll.rm(double)
+declare i64 @llvm.nvvm.d2ll.rn(double)
+declare i64 @llvm.nvvm.d2ll.rp(double)
+declare i64 @llvm.nvvm.d2ll.rz(double)
+
+
+declare i64 @llvm.nvvm.f2ull.rm(float)
+declare i64 @llvm.nvvm.f2ull.rn(float)
+declare i64 @llvm.nvvm.f2ull.rp(float)
+declare i64 @llvm.nvvm.f2ull.rz(float)
+
+declare i64 @llvm.nvvm.f2ull.rm.ftz(float)
+declare i64 @llvm.nvvm.f2ull.rn.ftz(float)
+declare i64 @llvm.nvvm.f2ull.rp.ftz(float)
+declare i64 @llvm.nvvm.f2ull.rz.ftz(float)
+
+declare i64 @llvm.nvvm.d2ull.rm(double)
+declare i64 @llvm.nvvm.d2ull.rn(double)
+declare i64 @llvm.nvvm.d2ull.rp(double)
+declare i64 @llvm.nvvm.d2ull.rz(double)
diff --git a/llvm/test/Transforms/LoadStoreVectorizer/X86/massive_indirection.ll b/llvm/test/Transforms/LoadStoreVectorizer/X86/massive_indirection.ll
new file mode 100644
index 0000000..fe8a7e5
--- /dev/null
+++ b/llvm/test/Transforms/LoadStoreVectorizer/X86/massive_indirection.ll
@@ -0,0 +1,180 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt %s -mtriple=x86_64-unknown-linux-gnu -passes=load-store-vectorizer -mcpu=skx -S -o - | FileCheck %s
+
+; This test verifies that the vectorizer can handle an extended sequence of
+; getelementptr instructions and generate longer vectors. With special handling,
+; some elements can still be vectorized even if they require looking up the
+; common underlying object deeper than 6 levels from the original pointer.
+
+; The test below is the simplified version of actual performance oriented
+; workload; the offsets in getelementptr instructions are similar or same for
+; the test simplicity.
+
+define void @v1_v2_v4_v1_to_v8_levels_6_7_8_8(i32 %arg0, ptr align 16 %arg1) {
+; CHECK-LABEL: define void @v1_v2_v4_v1_to_v8_levels_6_7_8_8(
+; CHECK-SAME: i32 [[ARG0:%.*]], ptr align 16 [[ARG1:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:    [[LEVEL1:%.*]] = getelementptr i8, ptr [[ARG1]], i32 917504
+; CHECK-NEXT:    [[LEVEL2:%.*]] = getelementptr i8, ptr [[LEVEL1]], i32 [[ARG0]]
+; CHECK-NEXT:    [[LEVEL3:%.*]] = getelementptr i8, ptr [[LEVEL2]], i32 32768
+; CHECK-NEXT:    [[LEVEL4:%.*]] = getelementptr i8, ptr [[LEVEL3]], i32 [[ARG0]]
+; CHECK-NEXT:    [[LEVEL5:%.*]] = getelementptr i8, ptr [[LEVEL4]], i32 [[ARG0]]
+; CHECK-NEXT:    [[A6:%.*]] = getelementptr i8, ptr [[LEVEL5]], i32 [[ARG0]]
+; CHECK-NEXT:    store <8 x half> zeroinitializer, ptr [[A6]], align 16
+; CHECK-NEXT:    ret void
+;
+
+  %level1 = getelementptr i8, ptr %arg1, i32 917504
+  %level2 = getelementptr i8, ptr %level1, i32 %arg0
+  %level3 = getelementptr i8, ptr %level2, i32 32768
+  %level4 = getelementptr i8, ptr %level3, i32 %arg0
+  %level5 = getelementptr i8, ptr %level4, i32 %arg0
+
+  %a6 = getelementptr i8, ptr %level5, i32 %arg0
+  %b7 = getelementptr i8, ptr %a6, i32 2
+  %c8 = getelementptr i8, ptr %b7, i32 8
+  %d8 = getelementptr i8, ptr %b7, i32 12
+
+  store half 0xH0000, ptr %a6, align 16
+  store <4 x half> zeroinitializer, ptr %b7, align 2
+  store <2 x half> zeroinitializer, ptr %c8, align 2
+  store half 0xH0000, ptr %d8, align 2
+  ret void
+}
+
+define void @v1x8_levels_6_7_8_9_10_11_12_13(i32 %arg0, ptr align 16 %arg1) {
+; CHECK-LABEL: define void @v1x8_levels_6_7_8_9_10_11_12_13(
+; CHECK-SAME: i32 [[ARG0:%.*]], ptr align 16 [[ARG1:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[LEVEL1:%.*]] = getelementptr i8, ptr [[ARG1]], i32 917504
+; CHECK-NEXT:    [[LEVEL2:%.*]] = getelementptr i8, ptr [[LEVEL1]], i32 [[ARG0]]
+; CHECK-NEXT:    [[LEVEL3:%.*]] = getelementptr i8, ptr [[LEVEL2]], i32 32768
+; CHECK-NEXT:    [[LEVEL4:%.*]] = getelementptr i8, ptr [[LEVEL3]], i32 [[ARG0]]
+; CHECK-NEXT:    [[LEVEL5:%.*]] = getelementptr i8, ptr [[LEVEL4]], i32 [[ARG0]]
+; CHECK-NEXT:    [[A6:%.*]] = getelementptr i8, ptr [[LEVEL5]], i32 [[ARG0]]
+; CHECK-NEXT:    store <8 x half> zeroinitializer, ptr [[A6]], align 16
+; CHECK-NEXT:    ret void
+;
+
+  %level1 = getelementptr i8, ptr %arg1, i32 917504
+  %level2 = getelementptr i8, ptr %level1, i32 %arg0
+  %level3 = getelementptr i8, ptr %level2, i32 32768
+  %level4 = getelementptr i8, ptr %level3, i32 %arg0
+  %level5 = getelementptr i8, ptr %level4, i32 %arg0
+
+  %a6 = getelementptr i8, ptr %level5, i32 %arg0
+  %b7 = getelementptr i8, ptr %a6, i32 2
+  %c8 = getelementptr i8, ptr %b7, i32 2
+  %d9 = getelementptr i8, ptr %c8, i32 2
+  %e10 = getelementptr i8, ptr %d9, i32 2
+  %f11 = getelementptr i8, ptr %e10, i32 2
+  %g12 = getelementptr i8, ptr %f11, i32 2
+  %h13 = getelementptr i8, ptr %g12, i32 2
+
+  store half 0xH0000, ptr %a6, align 16
+  store half 0xH0000, ptr %b7, align 2
+  store half 0xH0000, ptr %c8, align 2
+  store half 0xH0000, ptr %d9, align 2
+  store half 0xH0000, ptr %e10, align 8
+  store half 0xH0000, ptr %f11, align 2
+  store half 0xH0000, ptr %g12, align 2
+  store half 0xH0000, ptr %h13, align 2
+  ret void
+}
+
+define void @v1_4_4_4_2_1_to_v8_8_levels_6_7(i32 %arg0, ptr addrspace(3) align 16 %arg1_ptr, i32 %arg2, i32 %arg3, i32 %arg4, i32 %arg5, half %arg6_half, half %arg7_half, <2 x half> %arg8_2xhalf) {
+; CHECK-LABEL: define void @v1_4_4_4_2_1_to_v8_8_levels_6_7(
+; CHECK-SAME: i32 [[ARG0:%.*]], ptr addrspace(3) align 16 [[ARG1_PTR:%.*]], i32 [[ARG2:%.*]], i32 [[ARG3:%.*]], i32 [[ARG4:%.*]], i32 [[ARG5:%.*]], half [[ARG6_HALF:%.*]], half [[ARG7_HALF:%.*]], <2 x half> [[ARG8_2XHALF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[ARG1_PTR]], i32 458752
+; CHECK-NEXT:    br [[DOTPREHEADER11_PREHEADER:label %.*]]
+; CHECK:       [[_PREHEADER11_PREHEADER:.*:]]
+; CHECK-NEXT:    [[TMP2:%.*]] = shl nuw nsw i32 [[ARG0]], 6
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[TMP1]], i32 [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[TMP3]], i32 [[ARG2]]
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[TMP4]], i32 [[ARG3]]
+; CHECK-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[ARG0]], 2
+; CHECK-NEXT:    br i1 [[CMP]], [[DOTLR_PH:label %.*]], [[DOTEXIT_POINT:label %.*]]
+; CHECK:       [[_LR_PH:.*:]]
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[TMP5]], i32 [[ARG4]]
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[GEP]], i32 [[ARG5]]
+; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <8 x half> poison, half [[ARG6_HALF]], i32 0
+; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <8 x half> [[TMP7]], half 0xH0000, i32 1
+; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <8 x half> [[TMP8]], half 0xH0000, i32 2
+; CHECK-NEXT:    [[TMP10:%.*]] = insertelement <8 x half> [[TMP9]], half 0xH0000, i32 3
+; CHECK-NEXT:    [[TMP11:%.*]] = insertelement <8 x half> [[TMP10]], half 0xH0000, i32 4
+; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <2 x half> [[ARG8_2XHALF]], i32 0
+; CHECK-NEXT:    [[TMP13:%.*]] = insertelement <8 x half> [[TMP11]], half [[TMP12]], i32 5
+; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <2 x half> [[ARG8_2XHALF]], i32 1
+; CHECK-NEXT:    [[TMP15:%.*]] = insertelement <8 x half> [[TMP13]], half [[TMP14]], i32 6
+; CHECK-NEXT:    [[TMP16:%.*]] = insertelement <8 x half> [[TMP15]], half [[ARG7_HALF]], i32 7
+; CHECK-NEXT:    store <8 x half> [[TMP16]], ptr addrspace(3) [[TMP6]], align 2
+; CHECK-NEXT:    br [[DOTEXIT_POINT]]
+; CHECK:       [[_EXIT_POINT:.*:]]
+; CHECK-NEXT:    ret void
+;
+  %base1 = getelementptr inbounds i8, ptr addrspace(3) %arg1_ptr, i32 458752
+  br label %.preheader11.preheader
+
+.preheader11.preheader:
+  %base2 = shl nuw nsw i32 %arg0, 6
+  %base3 = getelementptr inbounds i8, ptr addrspace(3) %base1, i32 %base2
+
+  %base4 = getelementptr inbounds i8, ptr addrspace(3) %base3, i32 %arg2
+  %base5 = getelementptr inbounds i8, ptr addrspace(3) %base4, i32 %arg3
+
+  %cmp = icmp sgt i32 %arg0, 2
+  br i1 %cmp, label %.lr.ph, label %.exit_point
+
+.lr.ph:
+  %gep = getelementptr inbounds i8, ptr addrspace(3) %base5, i32 %arg4
+
+  %dst = getelementptr inbounds i8, ptr addrspace(3) %gep, i32 %arg5
+  %dst_off2 = getelementptr inbounds i8, ptr addrspace(3) %dst, i32 2
+  %dst_off10 = getelementptr inbounds i8, ptr addrspace(3) %dst, i32 10
+  %dst_off14 = getelementptr inbounds i8, ptr addrspace(3) %dst, i32 14
+
+  store half %arg6_half, ptr addrspace(3) %dst, align 2
+  store <4 x half> zeroinitializer, ptr addrspace(3) %dst_off2, align 2
+  store <2 x half> %arg8_2xhalf, ptr addrspace(3) %dst_off10, align 2
+  store half %arg7_half, ptr addrspace(3) %dst_off14, align 2
+  br label %.exit_point
+
+.exit_point:
+  ret void
+}
+
+; The regression test for merging equivalence classes. It is reduced and adapted
+; for LSV from llvm/test/CodeGen/NVPTX/variadics-backend.ll, which failed at
+; post-commit checks with memory sanitizer on the initial attempt to implement
+; the merging of the equivalence classes.
+define void @variadics1(ptr %vlist) {
+; CHECK-LABEL: define void @variadics1(
+; CHECK-SAME: ptr [[VLIST:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[ARGP_CUR7_ALIGNED2:%.*]] = call ptr @llvm.ptrmask.p0.i64(ptr [[VLIST]], i64 0)
+; CHECK-NEXT:    [[ARGP_NEXT8:%.*]] = getelementptr i8, ptr [[ARGP_CUR7_ALIGNED2]], i64 8
+; CHECK-NEXT:    [[X0:%.*]] = getelementptr i8, ptr [[ARGP_NEXT8]], i32 7
+; CHECK-NEXT:    [[ARGP_CUR11_ALIGNED:%.*]] = call ptr @llvm.ptrmask.p0.i64(ptr [[X0]], i64 0)
+; CHECK-NEXT:    [[ARGP_NEXT12:%.*]] = getelementptr i8, ptr [[ARGP_CUR11_ALIGNED]], i64 8
+; CHECK-NEXT:    [[X2:%.*]] = getelementptr i8, ptr [[ARGP_NEXT12]], i32 7
+; CHECK-NEXT:    [[ARGP_CUR16_ALIGNED:%.*]] = call ptr @llvm.ptrmask.p0.i64(ptr [[X2]], i64 0)
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[ARGP_CUR16_ALIGNED]], align 4294967296
+; CHECK-NEXT:    [[X31:%.*]] = extractelement <2 x double> [[TMP1]], i32 0
+; CHECK-NEXT:    [[X42:%.*]] = extractelement <2 x double> [[TMP1]], i32 1
+; CHECK-NEXT:    [[X5:%.*]] = fadd double [[X42]], [[X31]]
+; CHECK-NEXT:    store double [[X5]], ptr null, align 8
+; CHECK-NEXT:    ret void
+;
+  %argp.cur7.aligned2 = call ptr @llvm.ptrmask.p0.i64(ptr %vlist, i64 0)
+  %argp.next8 = getelementptr i8, ptr %argp.cur7.aligned2, i64 8
+  %x0 = getelementptr i8, ptr %argp.next8, i32 7
+  %argp.cur11.aligned = call ptr @llvm.ptrmask.p0.i64(ptr %x0, i64 0)
+  %argp.next12 = getelementptr i8, ptr %argp.cur11.aligned, i64 8
+  %x2 = getelementptr i8, ptr %argp.next12, i32 7
+  %argp.cur16.aligned = call ptr @llvm.ptrmask.p0.i64(ptr %x2, i64 0)
+  %x3 = load double, ptr %argp.cur16.aligned, align 8
+  %argp.cur16.aligned_off8 = getelementptr i8, ptr %argp.cur16.aligned, i32 8
+  %x4 = load double, ptr %argp.cur16.aligned_off8, align 8
+  %x5 = fadd double %x4, %x3
+  store double %x5, ptr null, align 8
+  ret void
+}
+
+declare ptr @llvm.ptrmask.p0.i64(ptr, i64)
diff --git a/llvm/test/Transforms/LoopStrengthReduce/AArch64/lsr-reuse.ll b/llvm/test/Transforms/LoopStrengthReduce/AArch64/lsr-reuse.ll
index 64e8a6b..7a3817d 100644
--- a/llvm/test/Transforms/LoopStrengthReduce/AArch64/lsr-reuse.ll
+++ b/llvm/test/Transforms/LoopStrengthReduce/AArch64/lsr-reuse.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=arm64-unknown-unknown -print-lsr-output < %s 2>&1 | FileCheck %s
+; RUN: llc -mtriple=aarch64 -stop-after=loop-reduce < %s | FileCheck %s
 
 declare void @foo(i64)
 
diff --git a/llvm/test/Transforms/LoopStrengthReduce/NVPTX/trunc.ll b/llvm/test/Transforms/LoopStrengthReduce/NVPTX/trunc.ll
index 8761122..e6b5991 100644
--- a/llvm/test/Transforms/LoopStrengthReduce/NVPTX/trunc.ll
+++ b/llvm/test/Transforms/LoopStrengthReduce/NVPTX/trunc.ll
@@ -13,7 +13,7 @@ target triple = "nvptx64-nvidia-cuda"
 ; That would be worthless, because "i" is simulated by two 32-bit registers and
 ; truncating it to 32-bit is as simple as directly using the register that
 ; contains the low bits.
-define void @trunc_is_free(i64 %begin, i64 %stride, i64 %end) {
+define ptx_kernel void @trunc_is_free(i64 %begin, i64 %stride, i64 %end) {
 ; CHECK-LABEL: @trunc_is_free(
 entry:
   %cmp.4 = icmp eq i64 %begin, %end
@@ -41,5 +41,3 @@ for.body:                                         ; preds = %for.body.preheader,
 
 declare void @_Z3usei(i32)
 
-!nvvm.annotations = !{!0}
-!0 = !{ptr @trunc_is_free, !"kernel", i32 1}
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/arith-fp-frem-costs.ll b/llvm/test/Transforms/LoopVectorize/AArch64/arith-fp-frem-costs.ll
index 63149ad..d9b8907 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/arith-fp-frem-costs.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/arith-fp-frem-costs.ll
@@ -1,67 +1,57 @@
-; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "estimated cost.*frem" --version 4
-
+; REQUIRES: asserts
 ; RUN: opt -mattr=+neon -passes=loop-vectorize -debug-only=loop-vectorize -disable-output -S < %s 2>&1 | FileCheck %s --check-prefix=NEON-NO-VECLIB
-
 ; RUN: opt -mattr=+sve -passes=loop-vectorize -debug-only=loop-vectorize -disable-output -S < %s 2>&1 | FileCheck %s --check-prefix=SVE-NO-VECLIB
-
 ; RUN: opt -mattr=+neon -vector-library=ArmPL -passes=loop-vectorize -debug-only=loop-vectorize -disable-output -S < %s 2>&1 | FileCheck %s --check-prefix=NEON-ARMPL
-
 ; RUN: opt -mattr=+neon -vector-library=sleefgnuabi -passes=loop-vectorize -debug-only=loop-vectorize -disable-output -S < %s 2>&1 | FileCheck %s --check-prefix=NEON-SLEEF
-
 ; RUN: opt -mattr=+sve -vector-library=ArmPL -passes=loop-vectorize -debug-only=loop-vectorize -disable-output -S < %s 2>&1 | FileCheck %s --check-prefix=SVE-ARMPL
-
 ; RUN: opt -mattr=+sve -vector-library=sleefgnuabi -passes=loop-vectorize -debug-only=loop-vectorize -disable-output -S < %s 2>&1 | FileCheck %s --check-prefix=SVE-SLEEF
-
 ; RUN: opt -mattr=+sve -vector-library=ArmPL -passes=loop-vectorize -prefer-predicate-over-epilogue=predicate-dont-vectorize -debug-only=loop-vectorize -disable-output -S < %s 2>&1 | FileCheck %s --check-prefix=SVE-ARMPL-TAILFOLD
-
 ; RUN: opt -mattr=+sve -vector-library=sleefgnuabi -passes=loop-vectorize -prefer-predicate-over-epilogue=predicate-dont-vectorize -debug-only=loop-vectorize -disable-output -S < %s 2>&1 | FileCheck %s --check-prefix=SVE-SLEEF-TAILFOLD
 
-; REQUIRES: asserts
-
 target triple = "aarch64-unknown-linux-gnu"
 
 define void @frem_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) {
 ; NEON-NO-VECLIB-LABEL: 'frem_f64'
 ; NEON-NO-VECLIB:  LV: Found an estimated cost of 10 for VF 1 For instruction: %res = frem double %in, %in
-; NEON-NO-VECLIB:  LV: Found an estimated cost of 24 for VF 2 For instruction: %res = frem double %in, %in
+; NEON-NO-VECLIB:  Cost of 24 for VF 2: WIDEN ir<%res> = frem ir<%in>, ir<%in>
 ;
 ; SVE-NO-VECLIB-LABEL: 'frem_f64'
 ; SVE-NO-VECLIB:  LV: Found an estimated cost of 10 for VF 1 For instruction: %res = frem double %in, %in
-; SVE-NO-VECLIB:  LV: Found an estimated cost of 24 for VF 2 For instruction: %res = frem double %in, %in
-; SVE-NO-VECLIB:  LV: Found an estimated cost of Invalid for VF vscale x 1 For instruction: %res = frem double %in, %in
-; SVE-NO-VECLIB:  LV: Found an estimated cost of Invalid for VF vscale x 2 For instruction: %res = frem double %in, %in
+; SVE-NO-VECLIB:  Cost of 24 for VF 2: WIDEN ir<%res> = frem ir<%in>, ir<%in>
+; SVE-NO-VECLIB:  Cost of Invalid for VF vscale x 1: WIDEN ir<%res> = frem ir<%in>, ir<%in>
+; SVE-NO-VECLIB:  Cost of Invalid for VF vscale x 2: WIDEN ir<%res> = frem ir<%in>, ir<%in>
 ;
 ; NEON-ARMPL-LABEL: 'frem_f64'
 ; NEON-ARMPL:  LV: Found an estimated cost of 10 for VF 1 For instruction: %res = frem double %in, %in
-; NEON-ARMPL:  LV: Found an estimated cost of 10 for VF 2 For instruction: %res = frem double %in, %in
+; NEON-ARMPL:  Cost of 10 for VF 2: WIDEN ir<%res> = frem ir<%in>, ir<%in>
 ;
 ; NEON-SLEEF-LABEL: 'frem_f64'
 ; NEON-SLEEF:  LV: Found an estimated cost of 10 for VF 1 For instruction: %res = frem double %in, %in
-; NEON-SLEEF:  LV: Found an estimated cost of 10 for VF 2 For instruction: %res = frem double %in, %in
+; NEON-SLEEF:  Cost of 10 for VF 2: WIDEN ir<%res> = frem ir<%in>, ir<%in>
 ;
 ; SVE-ARMPL-LABEL: 'frem_f64'
 ; SVE-ARMPL:  LV: Found an estimated cost of 10 for VF 1 For instruction: %res = frem double %in, %in
-; SVE-ARMPL:  LV: Found an estimated cost of 10 for VF 2 For instruction: %res = frem double %in, %in
-; SVE-ARMPL:  LV: Found an estimated cost of Invalid for VF vscale x 1 For instruction: %res = frem double %in, %in
-; SVE-ARMPL:  LV: Found an estimated cost of 10 for VF vscale x 2 For instruction: %res = frem double %in, %in
+; SVE-ARMPL:  Cost of 10 for VF 2: WIDEN ir<%res> = frem ir<%in>, ir<%in>
+; SVE-ARMPL:  Cost of Invalid for VF vscale x 1: WIDEN ir<%res> = frem ir<%in>, ir<%in>
+; SVE-ARMPL:  Cost of 10 for VF vscale x 2: WIDEN ir<%res> = frem ir<%in>, ir<%in>
 ;
 ; SVE-SLEEF-LABEL: 'frem_f64'
 ; SVE-SLEEF:  LV: Found an estimated cost of 10 for VF 1 For instruction: %res = frem double %in, %in
-; SVE-SLEEF:  LV: Found an estimated cost of 10 for VF 2 For instruction: %res = frem double %in, %in
-; SVE-SLEEF:  LV: Found an estimated cost of Invalid for VF vscale x 1 For instruction: %res = frem double %in, %in
-; SVE-SLEEF:  LV: Found an estimated cost of 10 for VF vscale x 2 For instruction: %res = frem double %in, %in
+; SVE-SLEEF:  Cost of 10 for VF 2: WIDEN ir<%res> = frem ir<%in>, ir<%in>
+; SVE-SLEEF:  Cost of Invalid for VF vscale x 1: WIDEN ir<%res> = frem ir<%in>, ir<%in>
+; SVE-SLEEF:  Cost of 10 for VF vscale x 2: WIDEN ir<%res> = frem ir<%in>, ir<%in>
 ;
 ; SVE-ARMPL-TAILFOLD-LABEL: 'frem_f64'
 ; SVE-ARMPL-TAILFOLD:  LV: Found an estimated cost of 10 for VF 1 For instruction: %res = frem double %in, %in
-; SVE-ARMPL-TAILFOLD:  LV: Found an estimated cost of 10 for VF 2 For instruction: %res = frem double %in, %in
-; SVE-ARMPL-TAILFOLD:  LV: Found an estimated cost of Invalid for VF vscale x 1 For instruction: %res = frem double %in, %in
-; SVE-ARMPL-TAILFOLD:  LV: Found an estimated cost of 10 for VF vscale x 2 For instruction: %res = frem double %in, %in
+; SVE-ARMPL-TAILFOLD:  Cost of 10 for VF 2: WIDEN ir<%res> = frem ir<%in>, ir<%in>
+; SVE-ARMPL-TAILFOLD:  Cost of Invalid for VF vscale x 1: WIDEN ir<%res> = frem ir<%in>, ir<%in>
+; SVE-ARMPL-TAILFOLD:  Cost of 10 for VF vscale x 2: WIDEN ir<%res> = frem ir<%in>, ir<%in>
 ;
 ; SVE-SLEEF-TAILFOLD-LABEL: 'frem_f64'
 ; SVE-SLEEF-TAILFOLD:  LV: Found an estimated cost of 10 for VF 1 For instruction: %res = frem double %in, %in
-; SVE-SLEEF-TAILFOLD:  LV: Found an estimated cost of 10 for VF 2 For instruction: %res = frem double %in, %in
-; SVE-SLEEF-TAILFOLD:  LV: Found an estimated cost of Invalid for VF vscale x 1 For instruction: %res = frem double %in, %in
-; SVE-SLEEF-TAILFOLD:  LV: Found an estimated cost of 10 for VF vscale x 2 For instruction: %res = frem double %in, %in
+; SVE-SLEEF-TAILFOLD:  Cost of 10 for VF 2: WIDEN ir<%res> = frem ir<%in>, ir<%in>
+; SVE-SLEEF-TAILFOLD:  Cost of Invalid for VF vscale x 1: WIDEN ir<%res> = frem ir<%in>, ir<%in>
+; SVE-SLEEF-TAILFOLD:  Cost of 10 for VF vscale x 2: WIDEN ir<%res> = frem ir<%in>, ir<%in>
 ;
   entry:
   br label %for.body
@@ -84,58 +74,58 @@ define void @frem_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) {
 define void @frem_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) {
 ; NEON-NO-VECLIB-LABEL: 'frem_f32'
 ; NEON-NO-VECLIB:  LV: Found an estimated cost of 10 for VF 1 For instruction: %res = frem float %in, %in
-; NEON-NO-VECLIB:  LV: Found an estimated cost of 24 for VF 2 For instruction: %res = frem float %in, %in
-; NEON-NO-VECLIB:  LV: Found an estimated cost of 52 for VF 4 For instruction: %res = frem float %in, %in
+; NEON-NO-VECLIB:  Cost of 24 for VF 2: WIDEN ir<%res> = frem ir<%in>, ir<%in>
+; NEON-NO-VECLIB:  Cost of 52 for VF 4: WIDEN ir<%res> = frem ir<%in>, ir<%in>
 ;
 ; SVE-NO-VECLIB-LABEL: 'frem_f32'
 ; SVE-NO-VECLIB:  LV: Found an estimated cost of 10 for VF 1 For instruction: %res = frem float %in, %in
-; SVE-NO-VECLIB:  LV: Found an estimated cost of 24 for VF 2 For instruction: %res = frem float %in, %in
-; SVE-NO-VECLIB:  LV: Found an estimated cost of 52 for VF 4 For instruction: %res = frem float %in, %in
-; SVE-NO-VECLIB:  LV: Found an estimated cost of Invalid for VF vscale x 1 For instruction: %res = frem float %in, %in
-; SVE-NO-VECLIB:  LV: Found an estimated cost of Invalid for VF vscale x 2 For instruction: %res = frem float %in, %in
-; SVE-NO-VECLIB:  LV: Found an estimated cost of Invalid for VF vscale x 4 For instruction: %res = frem float %in, %in
+; SVE-NO-VECLIB:  Cost of 24 for VF 2: WIDEN ir<%res> = frem ir<%in>, ir<%in>
+; SVE-NO-VECLIB:  Cost of 52 for VF 4: WIDEN ir<%res> = frem ir<%in>, ir<%in>
+; SVE-NO-VECLIB:  Cost of Invalid for VF vscale x 1: WIDEN ir<%res> = frem ir<%in>, ir<%in>
+; SVE-NO-VECLIB:  Cost of Invalid for VF vscale x 2: WIDEN ir<%res> = frem ir<%in>, ir<%in>
+; SVE-NO-VECLIB:  Cost of Invalid for VF vscale x 4: WIDEN ir<%res> = frem ir<%in>, ir<%in>
 ;
 ; NEON-ARMPL-LABEL: 'frem_f32'
 ; NEON-ARMPL:  LV: Found an estimated cost of 10 for VF 1 For instruction: %res = frem float %in, %in
-; NEON-ARMPL:  LV: Found an estimated cost of 24 for VF 2 For instruction: %res = frem float %in, %in
-; NEON-ARMPL:  LV: Found an estimated cost of 10 for VF 4 For instruction: %res = frem float %in, %in
+; NEON-ARMPL:  Cost of 24 for VF 2: WIDEN ir<%res> = frem ir<%in>, ir<%in>
+; NEON-ARMPL:  Cost of 10 for VF 4: WIDEN ir<%res> = frem ir<%in>, ir<%in>
 ;
 ; NEON-SLEEF-LABEL: 'frem_f32'
 ; NEON-SLEEF:  LV: Found an estimated cost of 10 for VF 1 For instruction: %res = frem float %in, %in
-; NEON-SLEEF:  LV: Found an estimated cost of 24 for VF 2 For instruction: %res = frem float %in, %in
-; NEON-SLEEF:  LV: Found an estimated cost of 10 for VF 4 For instruction: %res = frem float %in, %in
+; NEON-SLEEF:  Cost of 24 for VF 2: WIDEN ir<%res> = frem ir<%in>, ir<%in>
+; NEON-SLEEF:  Cost of 10 for VF 4: WIDEN ir<%res> = frem ir<%in>, ir<%in>
 ;
 ; SVE-ARMPL-LABEL: 'frem_f32'
 ; SVE-ARMPL:  LV: Found an estimated cost of 10 for VF 1 For instruction: %res = frem float %in, %in
-; SVE-ARMPL:  LV: Found an estimated cost of 24 for VF 2 For instruction: %res = frem float %in, %in
-; SVE-ARMPL:  LV: Found an estimated cost of 10 for VF 4 For instruction: %res = frem float %in, %in
-; SVE-ARMPL:  LV: Found an estimated cost of Invalid for VF vscale x 1 For instruction: %res = frem float %in, %in
-; SVE-ARMPL:  LV: Found an estimated cost of Invalid for VF vscale x 2 For instruction: %res = frem float %in, %in
-; SVE-ARMPL:  LV: Found an estimated cost of 10 for VF vscale x 4 For instruction: %res = frem float %in, %in
+; SVE-ARMPL:  Cost of 24 for VF 2: WIDEN ir<%res> = frem ir<%in>, ir<%in>
+; SVE-ARMPL:  Cost of 10 for VF 4: WIDEN ir<%res> = frem ir<%in>, ir<%in>
+; SVE-ARMPL:  Cost of Invalid for VF vscale x 1: WIDEN ir<%res> = frem ir<%in>, ir<%in>
+; SVE-ARMPL:  Cost of Invalid for VF vscale x 2: WIDEN ir<%res> = frem ir<%in>, ir<%in>
+; SVE-ARMPL:  Cost of 10 for VF vscale x 4: WIDEN ir<%res> = frem ir<%in>, ir<%in>
 ;
 ; SVE-SLEEF-LABEL: 'frem_f32'
 ; SVE-SLEEF:  LV: Found an estimated cost of 10 for VF 1 For instruction: %res = frem float %in, %in
-; SVE-SLEEF:  LV: Found an estimated cost of 24 for VF 2 For instruction: %res = frem float %in, %in
-; SVE-SLEEF:  LV: Found an estimated cost of 10 for VF 4 For instruction: %res = frem float %in, %in
-; SVE-SLEEF:  LV: Found an estimated cost of Invalid for VF vscale x 1 For instruction: %res = frem float %in, %in
-; SVE-SLEEF:  LV: Found an estimated cost of Invalid for VF vscale x 2 For instruction: %res = frem float %in, %in
-; SVE-SLEEF:  LV: Found an estimated cost of 10 for VF vscale x 4 For instruction: %res = frem float %in, %in
+; SVE-SLEEF:  Cost of 24 for VF 2: WIDEN ir<%res> = frem ir<%in>, ir<%in>
+; SVE-SLEEF:  Cost of 10 for VF 4: WIDEN ir<%res> = frem ir<%in>, ir<%in>
+; SVE-SLEEF:  Cost of Invalid for VF vscale x 1: WIDEN ir<%res> = frem ir<%in>, ir<%in>
+; SVE-SLEEF:  Cost of Invalid for VF vscale x 2: WIDEN ir<%res> = frem ir<%in>, ir<%in>
+; SVE-SLEEF:  Cost of 10 for VF vscale x 4: WIDEN ir<%res> = frem ir<%in>, ir<%in>
 ;
 ; SVE-ARMPL-TAILFOLD-LABEL: 'frem_f32'
 ; SVE-ARMPL-TAILFOLD:  LV: Found an estimated cost of 10 for VF 1 For instruction: %res = frem float %in, %in
-; SVE-ARMPL-TAILFOLD:  LV: Found an estimated cost of 24 for VF 2 For instruction: %res = frem float %in, %in
-; SVE-ARMPL-TAILFOLD:  LV: Found an estimated cost of 10 for VF 4 For instruction: %res = frem float %in, %in
-; SVE-ARMPL-TAILFOLD:  LV: Found an estimated cost of Invalid for VF vscale x 1 For instruction: %res = frem float %in, %in
-; SVE-ARMPL-TAILFOLD:  LV: Found an estimated cost of Invalid for VF vscale x 2 For instruction: %res = frem float %in, %in
-; SVE-ARMPL-TAILFOLD:  LV: Found an estimated cost of 10 for VF vscale x 4 For instruction: %res = frem float %in, %in
+; SVE-ARMPL-TAILFOLD:  Cost of 24 for VF 2: WIDEN ir<%res> = frem ir<%in>, ir<%in>
+; SVE-ARMPL-TAILFOLD:  Cost of 10 for VF 4: WIDEN ir<%res> = frem ir<%in>, ir<%in>
+; SVE-ARMPL-TAILFOLD:  Cost of Invalid for VF vscale x 1: WIDEN ir<%res> = frem ir<%in>, ir<%in>
+; SVE-ARMPL-TAILFOLD:  Cost of Invalid for VF vscale x 2: WIDEN ir<%res> = frem ir<%in>, ir<%in>
+; SVE-ARMPL-TAILFOLD:  Cost of 10 for VF vscale x 4: WIDEN ir<%res> = frem ir<%in>, ir<%in>
 ;
 ; SVE-SLEEF-TAILFOLD-LABEL: 'frem_f32'
 ; SVE-SLEEF-TAILFOLD:  LV: Found an estimated cost of 10 for VF 1 For instruction: %res = frem float %in, %in
-; SVE-SLEEF-TAILFOLD:  LV: Found an estimated cost of 24 for VF 2 For instruction: %res = frem float %in, %in
-; SVE-SLEEF-TAILFOLD:  LV: Found an estimated cost of 10 for VF 4 For instruction: %res = frem float %in, %in
-; SVE-SLEEF-TAILFOLD:  LV: Found an estimated cost of Invalid for VF vscale x 1 For instruction: %res = frem float %in, %in
-; SVE-SLEEF-TAILFOLD:  LV: Found an estimated cost of Invalid for VF vscale x 2 For instruction: %res = frem float %in, %in
-; SVE-SLEEF-TAILFOLD:  LV: Found an estimated cost of 10 for VF vscale x 4 For instruction: %res = frem float %in, %in
+; SVE-SLEEF-TAILFOLD:  Cost of 24 for VF 2: WIDEN ir<%res> = frem ir<%in>, ir<%in>
+; SVE-SLEEF-TAILFOLD:  Cost of 10 for VF 4: WIDEN ir<%res> = frem ir<%in>, ir<%in>
+; SVE-SLEEF-TAILFOLD:  Cost of Invalid for VF vscale x 1: WIDEN ir<%res> = frem ir<%in>, ir<%in>
+; SVE-SLEEF-TAILFOLD:  Cost of Invalid for VF vscale x 2: WIDEN ir<%res> = frem ir<%in>, ir<%in>
+; SVE-SLEEF-TAILFOLD:  Cost of 10 for VF vscale x 4: WIDEN ir<%res> = frem ir<%in>, ir<%in>
 ;
   entry:
   br label %for.body
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/blend-costs.ll b/llvm/test/Transforms/LoopVectorize/AArch64/blend-costs.ll
index ddf6c10..254cdf2 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/blend-costs.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/blend-costs.ll
@@ -209,6 +209,7 @@ define void @test_blend_feeding_replicated_store_2(ptr noalias %src, ptr %dst, i
 ; CHECK:       [[VECTOR_PH]]:
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i1> poison, i1 [[C_0]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i1> [[BROADCAST_SPLATINSERT]], <16 x i1> poison, <16 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP5:%.*]] = xor <16 x i1> [[BROADCAST_SPLAT]], splat (i1 true)
 ; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; CHECK:       [[VECTOR_BODY]]:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE30:.*]] ]
@@ -218,7 +219,6 @@ define void @test_blend_feeding_replicated_store_2(ptr noalias %src, ptr %dst, i
 ; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1
 ; CHECK-NEXT:    [[TMP3:%.*]] = icmp eq <16 x i8> [[WIDE_LOAD]], zeroinitializer
 ; CHECK-NEXT:    [[TMP4:%.*]] = xor <16 x i1> [[TMP3]], splat (i1 true)
-; CHECK-NEXT:    [[TMP5:%.*]] = xor <16 x i1> [[BROADCAST_SPLAT]], splat (i1 true)
 ; CHECK-NEXT:    [[TMP6:%.*]] = select <16 x i1> [[TMP4]], <16 x i1> [[TMP5]], <16 x i1> zeroinitializer
 ; CHECK-NEXT:    [[TMP7:%.*]] = or <16 x i1> [[TMP6]], [[TMP3]]
 ; CHECK-NEXT:    [[PREDPHI:%.*]] = select <16 x i1> [[TMP6]], <16 x i8> zeroinitializer, <16 x i8> splat (i8 1)
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/call-costs.ll b/llvm/test/Transforms/LoopVectorize/AArch64/call-costs.ll
index 4f05087..e63155b 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/call-costs.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/call-costs.ll
@@ -80,15 +80,13 @@ define void @powi_call(ptr %P) {
 ; CHECK:       [[VECTOR_PH]]:
 ; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; CHECK:       [[VECTOR_BODY]]:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds double, ptr [[P]], i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds double, ptr [[P]], i64 0
 ; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds double, ptr [[TMP1]], i32 0
 ; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x double>, ptr [[TMP2]], align 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = call <2 x double> @llvm.powi.v2f64.i32(<2 x double> [[WIDE_LOAD]], i32 3)
-; CHECK-NEXT:    store <2 x double> [[TMP3]], ptr [[TMP2]], align 8
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
-; CHECK-NEXT:    br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds double, ptr [[TMP1]], i32 0
+; CHECK-NEXT:    store <2 x double> [[TMP3]], ptr [[TMP4]], align 8
+; CHECK-NEXT:    br label %[[MIDDLE_BLOCK:.*]]
 ; CHECK:       [[MIDDLE_BLOCK]]:
 ; CHECK-NEXT:    br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
 ; CHECK:       [[SCALAR_PH]]:
@@ -102,7 +100,7 @@ define void @powi_call(ptr %P) {
 ; CHECK-NEXT:    store double [[POWI]], ptr [[GEP]], align 8
 ; CHECK-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1
 ; CHECK-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV]], 1
-; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP4:![0-9]+]]
 ; CHECK:       [[EXIT]]:
 ; CHECK-NEXT:    ret void
 ;
@@ -233,6 +231,5 @@ declare i64 @llvm.fshl.i64(i64, i64, i64)
 ; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
 ; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
 ; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
-; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]}
-; CHECK: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]}
+; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META2]], [[META1]]}
 ;.
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/clamped-trip-count.ll b/llvm/test/Transforms/LoopVectorize/AArch64/clamped-trip-count.ll
index ac7f147..5b77ced 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/clamped-trip-count.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/clamped-trip-count.ll
@@ -13,9 +13,9 @@ define void @clamped_tc_8(ptr nocapture %dst, i32 %n, i64 %val) vscale_range(1,1
 ; CHECK-NEXT:    [[N_RND_UP:%.*]] = add i64 8, [[TMP4]]
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
-; CHECK-NEXT:    [[IND_END:%.*]] = getelementptr i8, ptr [[DST]], i64 [[N_VEC]]
 ; CHECK-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 8
+; CHECK-NEXT:    [[IND_END:%.*]] = getelementptr i8, ptr [[DST]], i64 [[N_VEC]]
 ; CHECK-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 8)
 ; CHECK-NEXT:    [[TMP8:%.*]] = call <vscale x 8 x i64> @llvm.stepvector.nxv8i64()
 ; CHECK-NEXT:    [[TMP7:%.*]] = mul <vscale x 8 x i64> [[TMP8]], splat (i64 1)
@@ -99,9 +99,9 @@ define void @clamped_tc_max_8(ptr nocapture %dst, i32 %n, i64 %val) vscale_range
 ; CHECK-NEXT:    [[N_RND_UP:%.*]] = add i64 [[WIDE_TRIP_COUNT]], [[TMP4]]
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
-; CHECK-NEXT:    [[IND_END:%.*]] = getelementptr i8, ptr [[DST]], i64 [[N_VEC]]
 ; CHECK-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 8
+; CHECK-NEXT:    [[IND_END:%.*]] = getelementptr i8, ptr [[DST]], i64 [[N_VEC]]
 ; CHECK-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 [[WIDE_TRIP_COUNT]])
 ; CHECK-NEXT:    [[TMP8:%.*]] = call <vscale x 8 x i64> @llvm.stepvector.nxv8i64()
 ; CHECK-NEXT:    [[TMP7:%.*]] = mul <vscale x 8 x i64> [[TMP8]], splat (i64 1)
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll
index 8673559..caa98d7 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll
@@ -821,11 +821,11 @@ define void @multiple_exit_conditions(ptr %src, ptr noalias %dst) #1 {
 ; PRED-NEXT:    [[N_RND_UP:%.*]] = add i64 257, [[TMP2]]
 ; PRED-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
 ; PRED-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; PRED-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; PRED-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 2
 ; PRED-NEXT:    [[TMP3:%.*]] = mul i64 [[N_VEC]], 8
 ; PRED-NEXT:    [[IND_END:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP3]]
 ; PRED-NEXT:    [[IND_END1:%.*]] = mul i64 [[N_VEC]], 2
-; PRED-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; PRED-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 2
 ; PRED-NEXT:    [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
 ; PRED-NEXT:    [[TMP7:%.*]] = mul i64 [[TMP6]], 2
 ; PRED-NEXT:    [[TMP8:%.*]] = sub i64 257, [[TMP7]]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/epilog-vectorization-widen-inductions.ll b/llvm/test/Transforms/LoopVectorize/AArch64/epilog-vectorization-widen-inductions.ll
index 03de9ac..88b14b1 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/epilog-vectorization-widen-inductions.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/epilog-vectorization-widen-inductions.ll
@@ -233,8 +233,8 @@ define void @test_widen_induction_variable_start(ptr %A, i64 %N, i64 %start) {
 ; CHECK-NEXT:    [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 2
 ; CHECK-NEXT:    br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
 ; CHECK:       vec.epilog.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[START]], [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
 ; CHECK-NEXT:    [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[START]], [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
 ; CHECK-NEXT:    [[N_MOD_VF2:%.*]] = urem i64 [[TMP0]], 2
 ; CHECK-NEXT:    [[N_VEC3:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF2]]
 ; CHECK-NEXT:    [[IND_END4:%.*]] = add i64 [[START]], [[N_VEC3]]
@@ -409,8 +409,8 @@ define void @test_widen_extended_induction(ptr %dst) {
 ; CHECK:       vec.epilog.iter.check:
 ; CHECK-NEXT:    br i1 true, label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
 ; CHECK:       vec.epilog.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i8 [ 16, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
 ; CHECK-NEXT:    [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i32 [ 10000, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i8 [ 16, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
 ; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <2 x i8> poison, i8 [[BC_RESUME_VAL]], i64 0
 ; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <2 x i8> [[DOTSPLATINSERT]], <2 x i8> poison, <2 x i32> zeroinitializer
 ; CHECK-NEXT:    [[INDUCTION:%.*]] = add <2 x i8> [[DOTSPLAT]], <i8 0, i8 1>
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/fixed-order-recurrence.ll b/llvm/test/Transforms/LoopVectorize/AArch64/fixed-order-recurrence.ll
index f7a1eb4..a939969 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/fixed-order-recurrence.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/fixed-order-recurrence.ll
@@ -48,8 +48,8 @@ define void @firstorderrec(ptr nocapture noundef readonly %x, ptr noalias nocapt
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 1, [[FOR_BODY_PREHEADER]] ]
 ; CHECK-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i8 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ [[DOTPRE]], [[FOR_BODY_PREHEADER]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 1, [[FOR_BODY_PREHEADER]] ]
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.cond.cleanup.loopexit:
 ; CHECK-NEXT:    br label [[FOR_COND_CLEANUP]]
@@ -154,10 +154,10 @@ define void @thirdorderrec(ptr nocapture noundef readonly %x, ptr noalias nocapt
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 3, [[FOR_BODY_PREHEADER]] ]
 ; CHECK-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i8 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ [[DOTPRE45]], [[FOR_BODY_PREHEADER]] ]
 ; CHECK-NEXT:    [[SCALAR_RECUR_INIT7:%.*]] = phi i8 [ [[VECTOR_RECUR_EXTRACT6]], [[MIDDLE_BLOCK]] ], [ [[DOTPRE44]], [[FOR_BODY_PREHEADER]] ]
 ; CHECK-NEXT:    [[SCALAR_RECUR_INIT10:%.*]] = phi i8 [ [[VECTOR_RECUR_EXTRACT9]], [[MIDDLE_BLOCK]] ], [ [[DOTPRE]], [[FOR_BODY_PREHEADER]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 3, [[FOR_BODY_PREHEADER]] ]
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.cond.cleanup.loopexit:
 ; CHECK-NEXT:    br label [[FOR_COND_CLEANUP]]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/force-target-instruction-cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/force-target-instruction-cost.ll
index cbf9bf0..8c5d84e 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/force-target-instruction-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/force-target-instruction-cost.ll
@@ -91,8 +91,8 @@ define void @test_iv_cost(ptr %ptr.start, i8 %a, i64 %b) {
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[START]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label %[[EXIT_LOOPEXIT:.*]], label %[[VEC_EPILOG_ITER_CHECK:.*]]
 ; CHECK:       [[VEC_EPILOG_ITER_CHECK]]:
-; CHECK-NEXT:    [[IND_END6:%.*]] = getelementptr i8, ptr [[PTR_START]], i64 [[N_VEC]]
 ; CHECK-NEXT:    [[IND_END:%.*]] = sub i64 [[START]], [[N_VEC]]
+; CHECK-NEXT:    [[IND_END2:%.*]] = getelementptr i8, ptr [[PTR_START]], i64 [[N_VEC]]
 ; CHECK-NEXT:    [[N_VEC_REMAINING:%.*]] = sub i64 [[START]], [[N_VEC]]
 ; CHECK-NEXT:    [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 4
 ; CHECK-NEXT:    br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]]
@@ -117,11 +117,11 @@ define void @test_iv_cost(ptr %ptr.start, i8 %a, i64 %b) {
 ; CHECK-NEXT:    br i1 [[CMP_N11]], label %[[EXIT_LOOPEXIT]], label %[[VEC_EPILOG_SCALAR_PH]]
 ; CHECK:       [[VEC_EPILOG_SCALAR_PH]]:
 ; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END1]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[START]], %[[ITER_CHECK]] ], [ [[IND_END]], %[[VEC_EPILOG_ITER_CHECK]] ]
-; CHECK-NEXT:    [[BC_RESUME_VAL11:%.*]] = phi ptr [ [[IND_END5]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[PTR_START]], %[[ITER_CHECK]] ], [ [[IND_END6]], %[[VEC_EPILOG_ITER_CHECK]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL9:%.*]] = phi ptr [ [[IND_END5]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[PTR_START]], %[[ITER_CHECK]] ], [ [[IND_END2]], %[[VEC_EPILOG_ITER_CHECK]] ]
 ; CHECK-NEXT:    br label %[[LOOP:.*]]
 ; CHECK:       [[LOOP]]:
 ; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[LOOP]] ], [ [[BC_RESUME_VAL]], %[[VEC_EPILOG_SCALAR_PH]] ]
-; CHECK-NEXT:    [[PTR_IV:%.*]] = phi ptr [ [[PTR_IV_NEXT:%.*]], %[[LOOP]] ], [ [[BC_RESUME_VAL11]], %[[VEC_EPILOG_SCALAR_PH]] ]
+; CHECK-NEXT:    [[PTR_IV:%.*]] = phi ptr [ [[PTR_IV_NEXT:%.*]], %[[LOOP]] ], [ [[BC_RESUME_VAL9]], %[[VEC_EPILOG_SCALAR_PH]] ]
 ; CHECK-NEXT:    [[IV_NEXT]] = add i64 [[IV]], -1
 ; CHECK-NEXT:    [[PTR_IV_NEXT]] = getelementptr i8, ptr [[PTR_IV]], i64 1
 ; CHECK-NEXT:    store i8 0, ptr [[PTR_IV]], align 1
@@ -151,9 +151,9 @@ exit:
   ret void
 }
 
-define void @test_exit_branch_cost(ptr %dst, i64 %x, i32 %y, ptr %dst.1, i1 %c.4, ptr %src, ptr %dst.3, i1 %c.3, ptr %dst.2) {
+define void @test_exit_branch_cost(ptr %dst, ptr noalias %x.ptr, ptr noalias %y.ptr, ptr %dst.1, i1 %c.4, ptr %src, ptr %dst.3, i1 %c.3, ptr %dst.2) {
 ; CHECK-LABEL: define void @test_exit_branch_cost(
-; CHECK-SAME: ptr [[DST:%.*]], i64 [[X:%.*]], i32 [[Y:%.*]], ptr [[DST_1:%.*]], i1 [[C_4:%.*]], ptr [[SRC:%.*]], ptr [[DST_3:%.*]], i1 [[C_3:%.*]], ptr [[DST_2:%.*]]) {
+; CHECK-SAME: ptr [[DST:%.*]], ptr noalias [[X_PTR:%.*]], ptr noalias [[Y_PTR:%.*]], ptr [[DST_1:%.*]], i1 [[C_4:%.*]], ptr [[SRC:%.*]], ptr [[DST_3:%.*]], i1 [[C_3:%.*]], ptr [[DST_2:%.*]]) {
 ; CHECK-NEXT:  [[ENTRY:.*]]:
 ; CHECK-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
 ; CHECK:       [[VECTOR_MEMCHECK]]:
@@ -172,11 +172,11 @@ define void @test_exit_branch_cost(ptr %dst, i64 %x, i32 %y, ptr %dst.1, i1 %c.4
 ; CHECK-NEXT:    [[BOUND08:%.*]] = icmp ult ptr [[DST_1]], [[SCEVGEP3]]
 ; CHECK-NEXT:    [[BOUND19:%.*]] = icmp ult ptr [[DST]], [[SCEVGEP]]
 ; CHECK-NEXT:    [[FOUND_CONFLICT10:%.*]] = and i1 [[BOUND08]], [[BOUND19]]
-; CHECK-NEXT:    [[CONFLICT_RDX11:%.*]] = or i1 [[CONFLICT_RDX]], [[FOUND_CONFLICT10]]
+; CHECK-NEXT:    [[CONFLICT_RDX21:%.*]] = or i1 [[CONFLICT_RDX]], [[FOUND_CONFLICT10]]
 ; CHECK-NEXT:    [[BOUND012:%.*]] = icmp ult ptr [[DST_1]], [[SCEVGEP4]]
 ; CHECK-NEXT:    [[BOUND113:%.*]] = icmp ult ptr [[SRC]], [[SCEVGEP]]
 ; CHECK-NEXT:    [[FOUND_CONFLICT14:%.*]] = and i1 [[BOUND012]], [[BOUND113]]
-; CHECK-NEXT:    [[CONFLICT_RDX15:%.*]] = or i1 [[CONFLICT_RDX11]], [[FOUND_CONFLICT14]]
+; CHECK-NEXT:    [[CONFLICT_RDX15:%.*]] = or i1 [[CONFLICT_RDX21]], [[FOUND_CONFLICT14]]
 ; CHECK-NEXT:    [[BOUND016:%.*]] = icmp ult ptr [[DST_3]], [[SCEVGEP2]]
 ; CHECK-NEXT:    [[BOUND117:%.*]] = icmp ult ptr [[DST_2]], [[SCEVGEP1]]
 ; CHECK-NEXT:    [[FOUND_CONFLICT18:%.*]] = and i1 [[BOUND016]], [[BOUND117]]
@@ -184,161 +184,101 @@ define void @test_exit_branch_cost(ptr %dst, i64 %x, i32 %y, ptr %dst.1, i1 %c.4
 ; CHECK-NEXT:    [[BOUND020:%.*]] = icmp ult ptr [[DST_3]], [[SCEVGEP3]]
 ; CHECK-NEXT:    [[BOUND121:%.*]] = icmp ult ptr [[DST]], [[SCEVGEP1]]
 ; CHECK-NEXT:    [[FOUND_CONFLICT22:%.*]] = and i1 [[BOUND020]], [[BOUND121]]
-; CHECK-NEXT:    [[CONFLICT_RDX23:%.*]] = or i1 [[CONFLICT_RDX19]], [[FOUND_CONFLICT22]]
+; CHECK-NEXT:    [[CONFLICT_RDX41:%.*]] = or i1 [[CONFLICT_RDX19]], [[FOUND_CONFLICT22]]
 ; CHECK-NEXT:    [[BOUND024:%.*]] = icmp ult ptr [[DST_3]], [[SCEVGEP4]]
 ; CHECK-NEXT:    [[BOUND125:%.*]] = icmp ult ptr [[SRC]], [[SCEVGEP1]]
 ; CHECK-NEXT:    [[FOUND_CONFLICT26:%.*]] = and i1 [[BOUND024]], [[BOUND125]]
-; CHECK-NEXT:    [[CONFLICT_RDX27:%.*]] = or i1 [[CONFLICT_RDX23]], [[FOUND_CONFLICT26]]
+; CHECK-NEXT:    [[CONFLICT_RDX27:%.*]] = or i1 [[CONFLICT_RDX41]], [[FOUND_CONFLICT26]]
 ; CHECK-NEXT:    [[BOUND028:%.*]] = icmp ult ptr [[DST_2]], [[SCEVGEP3]]
 ; CHECK-NEXT:    [[BOUND129:%.*]] = icmp ult ptr [[DST]], [[SCEVGEP2]]
 ; CHECK-NEXT:    [[FOUND_CONFLICT30:%.*]] = and i1 [[BOUND028]], [[BOUND129]]
-; CHECK-NEXT:    [[CONFLICT_RDX31:%.*]] = or i1 [[CONFLICT_RDX27]], [[FOUND_CONFLICT30]]
+; CHECK-NEXT:    [[CONFLICT_RDX65:%.*]] = or i1 [[CONFLICT_RDX27]], [[FOUND_CONFLICT30]]
 ; CHECK-NEXT:    [[BOUND032:%.*]] = icmp ult ptr [[DST_2]], [[SCEVGEP4]]
 ; CHECK-NEXT:    [[BOUND133:%.*]] = icmp ult ptr [[SRC]], [[SCEVGEP2]]
-; CHECK-NEXT:    [[FOUND_CONFLICT34:%.*]] = and i1 [[BOUND032]], [[BOUND133]]
-; CHECK-NEXT:    [[CONFLICT_RDX35:%.*]] = or i1 [[CONFLICT_RDX31]], [[FOUND_CONFLICT34]]
+; CHECK-NEXT:    [[FOUND_CONFLICT68:%.*]] = and i1 [[BOUND032]], [[BOUND133]]
+; CHECK-NEXT:    [[CONFLICT_RDX35:%.*]] = or i1 [[CONFLICT_RDX65]], [[FOUND_CONFLICT68]]
 ; CHECK-NEXT:    [[BOUND036:%.*]] = icmp ult ptr [[DST]], [[SCEVGEP4]]
 ; CHECK-NEXT:    [[BOUND137:%.*]] = icmp ult ptr [[SRC]], [[SCEVGEP3]]
 ; CHECK-NEXT:    [[FOUND_CONFLICT38:%.*]] = and i1 [[BOUND036]], [[BOUND137]]
 ; CHECK-NEXT:    [[CONFLICT_RDX39:%.*]] = or i1 [[CONFLICT_RDX35]], [[FOUND_CONFLICT38]]
 ; CHECK-NEXT:    br i1 [[CONFLICT_RDX39]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
 ; CHECK:       [[VECTOR_PH]]:
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[X]], i64 0
-; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP47:%.*]] = icmp eq <2 x i64> [[BROADCAST_SPLAT]], zeroinitializer
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT40:%.*]] = insertelement <2 x i1> poison, i1 [[C_3]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT41:%.*]] = shufflevector <2 x i1> [[BROADCAST_SPLATINSERT40]], <2 x i1> poison, <2 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP2:%.*]] = select i1 [[C_4]], <2 x i1> [[BROADCAST_SPLAT41]], <2 x i1> zeroinitializer
+; CHECK-NEXT:    [[TMP11:%.*]] = xor <2 x i1> [[TMP2]], splat (i1 true)
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT56:%.*]] = insertelement <2 x i1> poison, i1 [[C_4]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT57:%.*]] = shufflevector <2 x i1> [[BROADCAST_SPLATINSERT56]], <2 x i1> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP33:%.*]] = xor <2 x i1> [[BROADCAST_SPLAT57]], splat (i1 true)
 ; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; CHECK:       [[VECTOR_BODY]]:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE74:.*]] ]
-; CHECK-NEXT:    [[TMP4:%.*]] = xor <2 x i1> [[TMP47]], splat (i1 true)
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE55:.*]] ]
+; CHECK-NEXT:    [[TMP3:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i64, ptr [[X_PTR]], i64 [[TMP3]]
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr i64, ptr [[TMP4]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP6]], align 8
+; CHECK-NEXT:    [[TMP47:%.*]] = icmp eq <2 x i64> [[WIDE_LOAD]], zeroinitializer
 ; CHECK-NEXT:    [[TMP5:%.*]] = xor <2 x i1> [[TMP47]], splat (i1 true)
-; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <2 x i1> [[TMP4]], i32 0
-; CHECK-NEXT:    br i1 [[TMP6]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]]
+; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <2 x i1> [[TMP5]], i32 0
+; CHECK-NEXT:    br i1 [[TMP8]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]]
 ; CHECK:       [[PRED_STORE_IF]]:
 ; CHECK-NEXT:    store i64 0, ptr [[DST_1]], align 8, !alias.scope [[META7:![0-9]+]], !noalias [[META10:![0-9]+]]
 ; CHECK-NEXT:    br label %[[PRED_STORE_CONTINUE]]
 ; CHECK:       [[PRED_STORE_CONTINUE]]:
-; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <2 x i1> [[TMP4]], i32 1
-; CHECK-NEXT:    br i1 [[TMP7]], label %[[PRED_STORE_IF42:.*]], label %[[PRED_STORE_CONTINUE43:.*]]
+; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <2 x i1> [[TMP5]], i32 1
+; CHECK-NEXT:    br i1 [[TMP9]], label %[[PRED_STORE_IF42:.*]], label %[[PRED_STORE_CONTINUE43:.*]]
 ; CHECK:       [[PRED_STORE_IF42]]:
 ; CHECK-NEXT:    store i64 0, ptr [[DST_1]], align 8, !alias.scope [[META7]], !noalias [[META10]]
 ; CHECK-NEXT:    br label %[[PRED_STORE_CONTINUE43]]
 ; CHECK:       [[PRED_STORE_CONTINUE43]]:
-; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <2 x i1> [[TMP5]], i32 0
-; CHECK-NEXT:    br i1 [[TMP8]], label %[[PRED_STORE_IF44:.*]], label %[[PRED_STORE_CONTINUE45:.*]]
+; CHECK-NEXT:    [[TMP13:%.*]] = select <2 x i1> [[TMP5]], <2 x i1> [[TMP11]], <2 x i1> zeroinitializer
+; CHECK-NEXT:    [[TMP16:%.*]] = extractelement <2 x i1> [[TMP13]], i32 0
+; CHECK-NEXT:    br i1 [[TMP16]], label %[[PRED_STORE_IF44:.*]], label %[[PRED_STORE_CONTINUE45:.*]]
 ; CHECK:       [[PRED_STORE_IF44]]:
-; CHECK-NEXT:    store i64 0, ptr [[DST_1]], align 8, !alias.scope [[META7]], !noalias [[META10]]
+; CHECK-NEXT:    store i64 0, ptr [[DST_3]], align 8, !alias.scope [[META15:![0-9]+]], !noalias [[META16:![0-9]+]]
 ; CHECK-NEXT:    br label %[[PRED_STORE_CONTINUE45]]
 ; CHECK:       [[PRED_STORE_CONTINUE45]]:
-; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <2 x i1> [[TMP5]], i32 1
-; CHECK-NEXT:    br i1 [[TMP9]], label %[[PRED_STORE_IF46:.*]], label %[[PRED_STORE_CONTINUE47:.*]]
+; CHECK-NEXT:    [[TMP17:%.*]] = extractelement <2 x i1> [[TMP13]], i32 1
+; CHECK-NEXT:    br i1 [[TMP17]], label %[[PRED_STORE_IF46:.*]], label %[[PRED_STORE_CONTINUE47:.*]]
 ; CHECK:       [[PRED_STORE_IF46]]:
-; CHECK-NEXT:    store i64 0, ptr [[DST_1]], align 8, !alias.scope [[META7]], !noalias [[META10]]
+; CHECK-NEXT:    store i64 0, ptr [[DST_3]], align 8, !alias.scope [[META15]], !noalias [[META16]]
 ; CHECK-NEXT:    br label %[[PRED_STORE_CONTINUE47]]
 ; CHECK:       [[PRED_STORE_CONTINUE47]]:
-; CHECK-NEXT:    [[TMP10:%.*]] = xor <2 x i1> [[TMP2]], splat (i1 true)
-; CHECK-NEXT:    [[TMP11:%.*]] = xor <2 x i1> [[TMP2]], splat (i1 true)
-; CHECK-NEXT:    [[TMP12:%.*]] = select <2 x i1> [[TMP4]], <2 x i1> [[TMP10]], <2 x i1> zeroinitializer
-; CHECK-NEXT:    [[TMP13:%.*]] = select <2 x i1> [[TMP5]], <2 x i1> [[TMP11]], <2 x i1> zeroinitializer
-; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <2 x i1> [[TMP12]], i32 0
-; CHECK-NEXT:    br i1 [[TMP14]], label %[[PRED_STORE_IF48:.*]], label %[[PRED_STORE_CONTINUE49:.*]]
-; CHECK:       [[PRED_STORE_IF48]]:
-; CHECK-NEXT:    store i64 0, ptr [[DST_3]], align 8, !alias.scope [[META15:![0-9]+]], !noalias [[META16:![0-9]+]]
-; CHECK-NEXT:    br label %[[PRED_STORE_CONTINUE49]]
-; CHECK:       [[PRED_STORE_CONTINUE49]]:
-; CHECK-NEXT:    [[TMP15:%.*]] = extractelement <2 x i1> [[TMP12]], i32 1
-; CHECK-NEXT:    br i1 [[TMP15]], label %[[PRED_STORE_IF50:.*]], label %[[PRED_STORE_CONTINUE51:.*]]
-; CHECK:       [[PRED_STORE_IF50]]:
-; CHECK-NEXT:    store i64 0, ptr [[DST_3]], align 8, !alias.scope [[META15]], !noalias [[META16]]
-; CHECK-NEXT:    br label %[[PRED_STORE_CONTINUE51]]
-; CHECK:       [[PRED_STORE_CONTINUE51]]:
-; CHECK-NEXT:    [[TMP16:%.*]] = extractelement <2 x i1> [[TMP13]], i32 0
-; CHECK-NEXT:    br i1 [[TMP16]], label %[[PRED_STORE_IF52:.*]], label %[[PRED_STORE_CONTINUE53:.*]]
-; CHECK:       [[PRED_STORE_IF52]]:
-; CHECK-NEXT:    store i64 0, ptr [[DST_3]], align 8, !alias.scope [[META15]], !noalias [[META16]]
-; CHECK-NEXT:    br label %[[PRED_STORE_CONTINUE53]]
-; CHECK:       [[PRED_STORE_CONTINUE53]]:
-; CHECK-NEXT:    [[TMP17:%.*]] = extractelement <2 x i1> [[TMP13]], i32 1
-; CHECK-NEXT:    br i1 [[TMP17]], label %[[PRED_STORE_IF54:.*]], label %[[PRED_STORE_CONTINUE55:.*]]
-; CHECK:       [[PRED_STORE_IF54]]:
-; CHECK-NEXT:    store i64 0, ptr [[DST_3]], align 8, !alias.scope [[META15]], !noalias [[META16]]
-; CHECK-NEXT:    br label %[[PRED_STORE_CONTINUE55]]
-; CHECK:       [[PRED_STORE_CONTINUE55]]:
-; CHECK-NEXT:    [[TMP18:%.*]] = select <2 x i1> [[TMP4]], <2 x i1> [[BROADCAST_SPLAT41]], <2 x i1> zeroinitializer
 ; CHECK-NEXT:    [[TMP19:%.*]] = select <2 x i1> [[TMP5]], <2 x i1> [[BROADCAST_SPLAT41]], <2 x i1> zeroinitializer
-; CHECK-NEXT:    [[TMP20:%.*]] = select <2 x i1> [[TMP18]], <2 x i1> [[BROADCAST_SPLAT57]], <2 x i1> zeroinitializer
 ; CHECK-NEXT:    [[TMP21:%.*]] = select <2 x i1> [[TMP19]], <2 x i1> [[BROADCAST_SPLAT57]], <2 x i1> zeroinitializer
-; CHECK-NEXT:    [[TMP22:%.*]] = or <2 x i1> [[TMP47]], [[TMP20]]
 ; CHECK-NEXT:    [[TMP23:%.*]] = or <2 x i1> [[TMP47]], [[TMP21]]
-; CHECK-NEXT:    [[PREDPHI:%.*]] = select <2 x i1> [[TMP20]], <2 x i64> zeroinitializer, <2 x i64> splat (i64 1)
 ; CHECK-NEXT:    [[PREDPHI58:%.*]] = select <2 x i1> [[TMP21]], <2 x i64> zeroinitializer, <2 x i64> splat (i64 1)
-; CHECK-NEXT:    [[TMP24:%.*]] = extractelement <2 x i1> [[TMP22]], i32 0
-; CHECK-NEXT:    br i1 [[TMP24]], label %[[PRED_STORE_IF59:.*]], label %[[PRED_STORE_CONTINUE60:.*]]
-; CHECK:       [[PRED_STORE_IF59]]:
-; CHECK-NEXT:    [[TMP25:%.*]] = extractelement <2 x i64> [[PREDPHI]], i32 0
-; CHECK-NEXT:    store i64 [[TMP25]], ptr [[DST_2]], align 8, !alias.scope [[META17:![0-9]+]], !noalias [[META18:![0-9]+]]
-; CHECK-NEXT:    br label %[[PRED_STORE_CONTINUE60]]
-; CHECK:       [[PRED_STORE_CONTINUE60]]:
-; CHECK-NEXT:    [[TMP26:%.*]] = extractelement <2 x i1> [[TMP22]], i32 1
-; CHECK-NEXT:    br i1 [[TMP26]], label %[[PRED_STORE_IF61:.*]], label %[[PRED_STORE_CONTINUE62:.*]]
-; CHECK:       [[PRED_STORE_IF61]]:
-; CHECK-NEXT:    [[TMP27:%.*]] = extractelement <2 x i64> [[PREDPHI]], i32 1
-; CHECK-NEXT:    store i64 [[TMP27]], ptr [[DST_2]], align 8, !alias.scope [[META17]], !noalias [[META18]]
-; CHECK-NEXT:    br label %[[PRED_STORE_CONTINUE62]]
-; CHECK:       [[PRED_STORE_CONTINUE62]]:
 ; CHECK-NEXT:    [[TMP28:%.*]] = extractelement <2 x i1> [[TMP23]], i32 0
-; CHECK-NEXT:    br i1 [[TMP28]], label %[[PRED_STORE_IF63:.*]], label %[[PRED_STORE_CONTINUE64:.*]]
-; CHECK:       [[PRED_STORE_IF63]]:
+; CHECK-NEXT:    br i1 [[TMP28]], label %[[PRED_STORE_IF48:.*]], label %[[PRED_STORE_CONTINUE49:.*]]
+; CHECK:       [[PRED_STORE_IF48]]:
 ; CHECK-NEXT:    [[TMP29:%.*]] = extractelement <2 x i64> [[PREDPHI58]], i32 0
-; CHECK-NEXT:    store i64 [[TMP29]], ptr [[DST_2]], align 8, !alias.scope [[META17]], !noalias [[META18]]
-; CHECK-NEXT:    br label %[[PRED_STORE_CONTINUE64]]
-; CHECK:       [[PRED_STORE_CONTINUE64]]:
+; CHECK-NEXT:    store i64 [[TMP29]], ptr [[DST_2]], align 8, !alias.scope [[META17:![0-9]+]], !noalias [[META18:![0-9]+]]
+; CHECK-NEXT:    br label %[[PRED_STORE_CONTINUE49]]
+; CHECK:       [[PRED_STORE_CONTINUE49]]:
 ; CHECK-NEXT:    [[TMP30:%.*]] = extractelement <2 x i1> [[TMP23]], i32 1
-; CHECK-NEXT:    br i1 [[TMP30]], label %[[PRED_STORE_IF65:.*]], label %[[PRED_STORE_CONTINUE66:.*]]
-; CHECK:       [[PRED_STORE_IF65]]:
+; CHECK-NEXT:    br i1 [[TMP30]], label %[[PRED_STORE_IF50:.*]], label %[[PRED_STORE_CONTINUE51:.*]]
+; CHECK:       [[PRED_STORE_IF50]]:
 ; CHECK-NEXT:    [[TMP31:%.*]] = extractelement <2 x i64> [[PREDPHI58]], i32 1
 ; CHECK-NEXT:    store i64 [[TMP31]], ptr [[DST_2]], align 8, !alias.scope [[META17]], !noalias [[META18]]
-; CHECK-NEXT:    br label %[[PRED_STORE_CONTINUE66]]
-; CHECK:       [[PRED_STORE_CONTINUE66]]:
-; CHECK-NEXT:    [[TMP32:%.*]] = xor <2 x i1> [[BROADCAST_SPLAT57]], splat (i1 true)
-; CHECK-NEXT:    [[TMP33:%.*]] = xor <2 x i1> [[BROADCAST_SPLAT57]], splat (i1 true)
-; CHECK-NEXT:    [[TMP34:%.*]] = select <2 x i1> [[TMP18]], <2 x i1> [[TMP32]], <2 x i1> zeroinitializer
+; CHECK-NEXT:    br label %[[PRED_STORE_CONTINUE51]]
+; CHECK:       [[PRED_STORE_CONTINUE51]]:
 ; CHECK-NEXT:    [[TMP35:%.*]] = select <2 x i1> [[TMP19]], <2 x i1> [[TMP33]], <2 x i1> zeroinitializer
-; CHECK-NEXT:    [[TMP36:%.*]] = or <2 x i1> [[TMP22]], [[TMP34]]
 ; CHECK-NEXT:    [[TMP37:%.*]] = or <2 x i1> [[TMP23]], [[TMP35]]
-; CHECK-NEXT:    [[TMP38:%.*]] = extractelement <2 x i1> [[TMP36]], i32 0
-; CHECK-NEXT:    br i1 [[TMP38]], label %[[PRED_STORE_IF67:.*]], label %[[PRED_STORE_CONTINUE68:.*]]
-; CHECK:       [[PRED_STORE_IF67]]:
-; CHECK-NEXT:    [[TMP45:%.*]] = load i64, ptr [[SRC]], align 8, !alias.scope [[META19:![0-9]+]]
-; CHECK-NEXT:    store i64 [[TMP45]], ptr [[DST]], align 8, !alias.scope [[META20:![0-9]+]], !noalias [[META19]]
-; CHECK-NEXT:    br label %[[PRED_STORE_CONTINUE68]]
-; CHECK:       [[PRED_STORE_CONTINUE68]]:
-; CHECK-NEXT:    [[TMP40:%.*]] = extractelement <2 x i1> [[TMP36]], i32 1
-; CHECK-NEXT:    br i1 [[TMP40]], label %[[PRED_STORE_IF69:.*]], label %[[PRED_STORE_CONTINUE70:.*]]
-; CHECK:       [[PRED_STORE_IF69]]:
-; CHECK-NEXT:    [[TMP39:%.*]] = load i64, ptr [[SRC]], align 8, !alias.scope [[META19]]
-; CHECK-NEXT:    store i64 [[TMP39]], ptr [[DST]], align 8, !alias.scope [[META20]], !noalias [[META19]]
-; CHECK-NEXT:    br label %[[PRED_STORE_CONTINUE70]]
-; CHECK:       [[PRED_STORE_CONTINUE70]]:
 ; CHECK-NEXT:    [[TMP42:%.*]] = extractelement <2 x i1> [[TMP37]], i32 0
-; CHECK-NEXT:    br i1 [[TMP42]], label %[[PRED_STORE_IF71:.*]], label %[[PRED_STORE_CONTINUE72:.*]]
-; CHECK:       [[PRED_STORE_IF71]]:
-; CHECK-NEXT:    [[TMP41:%.*]] = load i64, ptr [[SRC]], align 8, !alias.scope [[META19]]
-; CHECK-NEXT:    store i64 [[TMP41]], ptr [[DST]], align 8, !alias.scope [[META20]], !noalias [[META19]]
-; CHECK-NEXT:    br label %[[PRED_STORE_CONTINUE72]]
-; CHECK:       [[PRED_STORE_CONTINUE72]]:
+; CHECK-NEXT:    br i1 [[TMP42]], label %[[PRED_STORE_IF52:.*]], label %[[PRED_STORE_CONTINUE53:.*]]
+; CHECK:       [[PRED_STORE_IF52]]:
+; CHECK-NEXT:    [[TMP24:%.*]] = load i64, ptr [[SRC]], align 8, !alias.scope [[META19:![0-9]+]]
+; CHECK-NEXT:    store i64 [[TMP24]], ptr [[DST]], align 8, !alias.scope [[META20:![0-9]+]], !noalias [[META19]]
+; CHECK-NEXT:    br label %[[PRED_STORE_CONTINUE53]]
+; CHECK:       [[PRED_STORE_CONTINUE53]]:
 ; CHECK-NEXT:    [[TMP44:%.*]] = extractelement <2 x i1> [[TMP37]], i32 1
-; CHECK-NEXT:    br i1 [[TMP44]], label %[[PRED_STORE_IF73:.*]], label %[[PRED_STORE_CONTINUE74]]
-; CHECK:       [[PRED_STORE_IF73]]:
-; CHECK-NEXT:    [[TMP43:%.*]] = load i64, ptr [[SRC]], align 8, !alias.scope [[META19]]
-; CHECK-NEXT:    store i64 [[TMP43]], ptr [[DST]], align 8, !alias.scope [[META20]], !noalias [[META19]]
-; CHECK-NEXT:    br label %[[PRED_STORE_CONTINUE74]]
-; CHECK:       [[PRED_STORE_CONTINUE74]]:
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT:    br i1 [[TMP44]], label %[[PRED_STORE_IF54:.*]], label %[[PRED_STORE_CONTINUE55]]
+; CHECK:       [[PRED_STORE_IF54]]:
+; CHECK-NEXT:    [[TMP25:%.*]] = load i64, ptr [[SRC]], align 8, !alias.scope [[META19]]
+; CHECK-NEXT:    store i64 [[TMP25]], ptr [[DST]], align 8, !alias.scope [[META20]], !noalias [[META19]]
+; CHECK-NEXT:    br label %[[PRED_STORE_CONTINUE55]]
+; CHECK:       [[PRED_STORE_CONTINUE55]]:
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
 ; CHECK-NEXT:    [[TMP46:%.*]] = icmp eq i64 [[INDEX_NEXT]], 64
 ; CHECK-NEXT:    br i1 [[TMP46]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]]
 ; CHECK:       [[MIDDLE_BLOCK]]:
@@ -348,6 +288,10 @@ define void @test_exit_branch_cost(ptr %dst, i64 %x, i32 %y, ptr %dst.1, i1 %c.4
 ; CHECK-NEXT:    br label %[[LOOP_HEADER:.*]]
 ; CHECK:       [[LOOP_HEADER]]:
 ; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ]
+; CHECK-NEXT:    [[X_GEP:%.*]] = getelementptr i64, ptr [[X_PTR]], i64 [[IV]]
+; CHECK-NEXT:    [[X:%.*]] = load i64, ptr [[X_GEP]], align 8
+; CHECK-NEXT:    [[Y_GEP:%.*]] = getelementptr i32, ptr [[Y_PTR]], i64 [[IV]]
+; CHECK-NEXT:    [[Y:%.*]] = load i32, ptr [[Y_GEP]], align 4
 ; CHECK-NEXT:    [[C1:%.*]] = icmp eq i64 [[X]], 0
 ; CHECK-NEXT:    br i1 [[C1]], label %[[THEN_4:.*]], label %[[THEN_1:.*]]
 ; CHECK:       [[THEN_1]]:
@@ -386,6 +330,10 @@ entry:
 
 loop.header:
   %iv = phi i64 [ %iv.next, %loop.latch ], [ 0, %entry ]
+  %x.gep = getelementptr i64, ptr %x.ptr, i64 %iv
+  %x = load i64, ptr %x.gep
+  %y.gep = getelementptr i32, ptr %y.ptr, i64 %iv
+  %y = load i32, ptr %y.gep
   %c1 = icmp eq i64 %x, 0
   br i1 %c1, label %then.4, label %then.1
 
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll b/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll
index d42e6af..56a468e 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll
@@ -819,8 +819,8 @@ define void @exit_cond_zext_iv(ptr %dst, i64 %N) {
 ; PRED-NEXT:    [[N_RND_UP:%.*]] = add i64 [[UMAX1]], 1
 ; PRED-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 2
 ; PRED-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
-; PRED-NEXT:    [[IND_END:%.*]] = trunc i64 [[N_VEC]] to i32
 ; PRED-NEXT:    [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[UMAX1]], 1
+; PRED-NEXT:    [[IND_END:%.*]] = trunc i64 [[N_VEC]] to i32
 ; PRED-NEXT:    [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <2 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0
 ; PRED-NEXT:    [[BROADCAST_SPLAT3:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT2]], <2 x i64> poison, <2 x i32> zeroinitializer
 ; PRED-NEXT:    br label [[VECTOR_BODY:%.*]]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs.ll b/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs.ll
index bf27f9e..f9cc195 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs.ll
@@ -333,10 +333,10 @@ define i64 @test_ptr_ivs_and_widened_ivs(ptr %src, i32 %N) {
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi ptr [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[SRC]], [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i64 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi ptr [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[SRC]], [[ENTRY]] ]
 ; CHECK-NEXT:    [[BC_RESUME_VAL2:%.*]] = phi i32 [ [[IND_END1]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
 ; CHECK-NEXT:    [[BC_RESUME_VAL4:%.*]] = phi i32 [ [[IND_END3]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
-; CHECK-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i64 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
 ; CHECK-NEXT:    [[P:%.*]] = phi i64 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[SHL:%.*]], [[LOOP]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/interleaved_cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/interleaved_cost.ll
index dec124b..a550f1c 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/interleaved_cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/interleaved_cost.ll
@@ -170,8 +170,8 @@ entry:
 ; VF_2-LABEL: Checking a loop in 'i64_factor_8'
 ; VF_2:         Found an estimated cost of 8 for VF 2 For instruction: %tmp2 = load i64, ptr %tmp0, align 8
 ; VF_2-NEXT:    Found an estimated cost of 8 for VF 2 For instruction: %tmp3 = load i64, ptr %tmp1, align 8
-; VF_2-NEXT:    Found an estimated cost of 12 for VF 2 For instruction: store i64 %tmp2, ptr %tmp0, align 8
-; VF_2-NEXT:    Found an estimated cost of 12 for VF 2 For instruction: store i64 %tmp3, ptr %tmp1, align 8
+; VF_2-NEXT:    Found an estimated cost of 8 for VF 2 For instruction: store i64 %tmp2, ptr %tmp0, align 8
+; VF_2-NEXT:    Found an estimated cost of 8 for VF 2 For instruction: store i64 %tmp3, ptr %tmp1, align 8
 for.body:
   %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
   %tmp0 = getelementptr inbounds %i64.8, ptr %data, i64 %i, i32 2
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/intrinsiccost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/intrinsiccost.ll
index 393ee8d..9dceb01 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/intrinsiccost.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/intrinsiccost.ll
@@ -50,12 +50,12 @@ define void @saddsat(ptr nocapture readonly %pSrc, i16 signext %offset, ptr noca
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[TMP0]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[WHILE_END]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
 ; CHECK:       vec.epilog.iter.check:
+; CHECK-NEXT:    [[DOTCAST1:%.*]] = trunc nuw i64 [[N_VEC]] to i32
+; CHECK-NEXT:    [[IND_END8:%.*]] = sub i32 [[BLOCKSIZE]], [[DOTCAST1]]
 ; CHECK-NEXT:    [[TMP6:%.*]] = shl nuw nsw i64 [[N_VEC]], 1
-; CHECK-NEXT:    [[IND_END13:%.*]] = getelementptr i8, ptr [[PDST]], i64 [[TMP6]]
+; CHECK-NEXT:    [[IND_END10:%.*]] = getelementptr i8, ptr [[PSRC]], i64 [[TMP6]]
 ; CHECK-NEXT:    [[TMP7:%.*]] = shl nuw nsw i64 [[N_VEC]], 1
-; CHECK-NEXT:    [[IND_END10:%.*]] = getelementptr i8, ptr [[PSRC]], i64 [[TMP7]]
-; CHECK-NEXT:    [[DOTCAST7:%.*]] = trunc nuw i64 [[N_VEC]] to i32
-; CHECK-NEXT:    [[IND_END8:%.*]] = sub i32 [[BLOCKSIZE]], [[DOTCAST7]]
+; CHECK-NEXT:    [[IND_END13:%.*]] = getelementptr i8, ptr [[PDST]], i64 [[TMP7]]
 ; CHECK-NEXT:    [[N_VEC_REMAINING:%.*]] = and i64 [[TMP0]], 12
 ; CHECK-NEXT:    [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp eq i64 [[N_VEC_REMAINING]], 0
 ; CHECK-NEXT:    br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
@@ -171,10 +171,10 @@ define void @umin(ptr nocapture readonly %pSrc, i8 signext %offset, ptr nocaptur
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[TMP0]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[WHILE_END]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
 ; CHECK:       vec.epilog.iter.check:
-; CHECK-NEXT:    [[IND_END12:%.*]] = getelementptr i8, ptr [[PDST]], i64 [[N_VEC]]
-; CHECK-NEXT:    [[IND_END9:%.*]] = getelementptr i8, ptr [[PSRC]], i64 [[N_VEC]]
 ; CHECK-NEXT:    [[DOTCAST6:%.*]] = trunc nuw i64 [[N_VEC]] to i32
 ; CHECK-NEXT:    [[IND_END7:%.*]] = sub i32 [[BLOCKSIZE]], [[DOTCAST6]]
+; CHECK-NEXT:    [[IND_END9:%.*]] = getelementptr i8, ptr [[PSRC]], i64 [[N_VEC]]
+; CHECK-NEXT:    [[IND_END12:%.*]] = getelementptr i8, ptr [[PDST]], i64 [[N_VEC]]
 ; CHECK-NEXT:    [[N_VEC_REMAINING:%.*]] = and i64 [[TMP0]], 24
 ; CHECK-NEXT:    [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp eq i64 [[N_VEC_REMAINING]], 0
 ; CHECK-NEXT:    br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/low_trip_count_predicates.ll b/llvm/test/Transforms/LoopVectorize/AArch64/low_trip_count_predicates.ll
index 528e202..6d57f21 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/low_trip_count_predicates.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/low_trip_count_predicates.ll
@@ -105,9 +105,9 @@ define void @low_vf_ic_is_better(ptr nocapture noundef %p, i32 %tc, i16 noundef
 ; CHECK-VS1-NEXT:    [[TMP29:%.*]] = mul i64 [[TMP28]], 8
 ; CHECK-VS1-NEXT:    [[N_MOD_VF2:%.*]] = urem i64 [[TMP3]], [[TMP29]]
 ; CHECK-VS1-NEXT:    [[N_VEC3:%.*]] = sub i64 [[TMP3]], [[N_MOD_VF2]]
-; CHECK-VS1-NEXT:    [[IND_END:%.*]] = add i64 [[TMP0]], [[N_VEC3]]
 ; CHECK-VS1-NEXT:    [[TMP30:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-VS1-NEXT:    [[TMP31:%.*]] = mul i64 [[TMP30]], 8
+; CHECK-VS1-NEXT:    [[TMP39:%.*]] = add i64 [[TMP0]], [[N_VEC3]]
 ; CHECK-VS1-NEXT:    [[BROADCAST_SPLATINSERT7:%.*]] = insertelement <vscale x 8 x i8> poison, i8 [[CONV]], i64 0
 ; CHECK-VS1-NEXT:    [[BROADCAST_SPLAT8:%.*]] = shufflevector <vscale x 8 x i8> [[BROADCAST_SPLATINSERT7]], <vscale x 8 x i8> poison, <vscale x 8 x i32> zeroinitializer
 ; CHECK-VS1-NEXT:    br label %[[VEC_EPILOG_VECTOR_BODY:.*]]
@@ -127,7 +127,7 @@ define void @low_vf_ic_is_better(ptr nocapture noundef %p, i32 %tc, i16 noundef
 ; CHECK-VS1-NEXT:    [[CMP_N10:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC3]]
 ; CHECK-VS1-NEXT:    br i1 [[CMP_N10]], label %[[WHILE_END_LOOPEXIT]], label %[[VEC_EPILOG_SCALAR_PH]]
 ; CHECK-VS1:       [[VEC_EPILOG_SCALAR_PH]]:
-; CHECK-VS1-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[TMP0]], %[[VECTOR_SCEVCHECK]] ], [ [[TMP0]], %[[ITER_CHECK]] ], [ [[IND_END4]], %[[VEC_EPILOG_ITER_CHECK]] ]
+; CHECK-VS1-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP39]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[TMP0]], %[[VECTOR_SCEVCHECK]] ], [ [[TMP0]], %[[ITER_CHECK]] ], [ [[IND_END4]], %[[VEC_EPILOG_ITER_CHECK]] ]
 ; CHECK-VS1-NEXT:    br label %[[WHILE_BODY:.*]]
 ; CHECK-VS1:       [[WHILE_BODY]]:
 ; CHECK-VS1-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[WHILE_BODY]] ]
@@ -213,9 +213,9 @@ define void @low_vf_ic_is_better(ptr nocapture noundef %p, i32 %tc, i16 noundef
 ; CHECK-VS2-NEXT:    [[TMP29:%.*]] = mul i64 [[TMP28]], 4
 ; CHECK-VS2-NEXT:    [[N_MOD_VF2:%.*]] = urem i64 [[TMP3]], [[TMP29]]
 ; CHECK-VS2-NEXT:    [[N_VEC3:%.*]] = sub i64 [[TMP3]], [[N_MOD_VF2]]
-; CHECK-VS2-NEXT:    [[IND_END:%.*]] = add i64 [[TMP0]], [[N_VEC3]]
 ; CHECK-VS2-NEXT:    [[TMP30:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-VS2-NEXT:    [[TMP31:%.*]] = mul i64 [[TMP30]], 4
+; CHECK-VS2-NEXT:    [[TMP39:%.*]] = add i64 [[TMP0]], [[N_VEC3]]
 ; CHECK-VS2-NEXT:    [[BROADCAST_SPLATINSERT7:%.*]] = insertelement <vscale x 4 x i8> poison, i8 [[CONV]], i64 0
 ; CHECK-VS2-NEXT:    [[BROADCAST_SPLAT8:%.*]] = shufflevector <vscale x 4 x i8> [[BROADCAST_SPLATINSERT7]], <vscale x 4 x i8> poison, <vscale x 4 x i32> zeroinitializer
 ; CHECK-VS2-NEXT:    br label %[[VEC_EPILOG_VECTOR_BODY:.*]]
@@ -235,7 +235,7 @@ define void @low_vf_ic_is_better(ptr nocapture noundef %p, i32 %tc, i16 noundef
 ; CHECK-VS2-NEXT:    [[CMP_N10:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC3]]
 ; CHECK-VS2-NEXT:    br i1 [[CMP_N10]], label %[[WHILE_END_LOOPEXIT]], label %[[VEC_EPILOG_SCALAR_PH]]
 ; CHECK-VS2:       [[VEC_EPILOG_SCALAR_PH]]:
-; CHECK-VS2-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[TMP0]], %[[VECTOR_SCEVCHECK]] ], [ [[TMP0]], %[[ITER_CHECK]] ], [ [[IND_END4]], %[[VEC_EPILOG_ITER_CHECK]] ]
+; CHECK-VS2-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP39]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[TMP0]], %[[VECTOR_SCEVCHECK]] ], [ [[TMP0]], %[[ITER_CHECK]] ], [ [[IND_END4]], %[[VEC_EPILOG_ITER_CHECK]] ]
 ; CHECK-VS2-NEXT:    br label %[[WHILE_BODY:.*]]
 ; CHECK-VS2:       [[WHILE_BODY]]:
 ; CHECK-VS2-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[WHILE_BODY]] ]
@@ -428,9 +428,9 @@ define void @overflow_indvar_known_false(ptr nocapture noundef %p, i32 noundef %
 ; CHECK-NEXT:    [[N_RND_UP:%.*]] = add i64 [[TMP1]], [[TMP4]]
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP3]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
-; CHECK-NEXT:    [[IND_END:%.*]] = add i64 [[TMP0]], [[N_VEC]]
 ; CHECK-NEXT:    [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP8:%.*]] = mul i64 [[TMP7]], 16
+; CHECK-NEXT:    [[IND_END:%.*]] = add i64 [[TMP0]], [[N_VEC]]
 ; CHECK-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 [[TMP1]])
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 16 x i8> poison, i8 [[CONV]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 16 x i8> [[BROADCAST_SPLATINSERT]], <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/mul-simplification.ll b/llvm/test/Transforms/LoopVectorize/AArch64/mul-simplification.ll
index 771dd000..0ff98d2 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/mul-simplification.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/mul-simplification.ll
@@ -19,8 +19,8 @@ define i64 @mul_select_operand_known_1_via_scev() {
 ; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vector.reduce.mul.v2i64(<2 x i64> [[VEC_PHI]])
 ; CHECK-NEXT:    br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
 ; CHECK:       [[SCALAR_PH]]:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ 2, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
 ; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP0]], %[[MIDDLE_BLOCK]] ], [ 12, %[[ENTRY]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ 2, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
 ; CHECK-NEXT:    br label %[[LOOP:.*]]
 ; CHECK:       [[LOOP]]:
 ; CHECK-NEXT:    [[RED:%.*]] = phi i64 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[RED_NEXT:%.*]], %[[LOOP]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/reduction-recurrence-costs-sve.ll b/llvm/test/Transforms/LoopVectorize/AArch64/reduction-recurrence-costs-sve.ll
index 0cea16d..3d4f7e0 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/reduction-recurrence-costs-sve.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/reduction-recurrence-costs-sve.ll
@@ -137,9 +137,9 @@ define i32 @chained_recurrences(i32 %x, i64 %y, ptr %src.1, i32 %z, ptr %src.2)
 ; VSCALEFORTUNING2-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
 ; VSCALEFORTUNING2-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; VSCALEFORTUNING2:       scalar.ph:
-; VSCALEFORTUNING2-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; VSCALEFORTUNING2-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[TMP24]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; VSCALEFORTUNING2-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[TMP24]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
 ; VSCALEFORTUNING2-NEXT:    [[SCALAR_RECUR_INIT11:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; VSCALEFORTUNING2-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
 ; VSCALEFORTUNING2-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP50]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
 ; VSCALEFORTUNING2-NEXT:    br label [[LOOP:%.*]]
 ; VSCALEFORTUNING2:       loop:
@@ -260,9 +260,9 @@ define i32 @chained_recurrences(i32 %x, i64 %y, ptr %src.1, i32 %z, ptr %src.2)
 ; PRED-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <vscale x 4 x i32> [[TMP29]], i32 [[TMP47]]
 ; PRED-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; PRED:       scalar.ph:
-; PRED-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ]
-; PRED-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[TMP28]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1]] ]
+; PRED-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[TMP28]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ]
 ; PRED-NEXT:    [[SCALAR_RECUR_INIT8:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1]] ]
+; PRED-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1]] ]
 ; PRED-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP44]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1]] ]
 ; PRED-NEXT:    br label [[LOOP1:%.*]]
 ; PRED:       loop:
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-avoid-scalarization.ll b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-avoid-scalarization.ll
index a426cdf..a83c62b 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-avoid-scalarization.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-avoid-scalarization.ll
@@ -23,9 +23,9 @@ define void @test_no_scalarization(ptr %a, ptr noalias %b, i32 %idx, i32 %n) #0
 ; CHECK-NEXT:    [[TMP5:%.*]] = mul i32 [[TMP4]], 2
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i32 [[TMP1]], [[TMP5]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i32 [[TMP1]], [[N_MOD_VF]]
-; CHECK-NEXT:    [[IND_END:%.*]] = add i32 [[IDX]], [[N_VEC]]
 ; CHECK-NEXT:    [[TMP6:%.*]] = call i32 @llvm.vscale.i32()
 ; CHECK-NEXT:    [[TMP7:%.*]] = mul i32 [[TMP6]], 2
+; CHECK-NEXT:    [[IND_END:%.*]] = add i32 [[IDX]], [[N_VEC]]
 ; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 2 x i32> poison, i32 [[IDX]], i64 0
 ; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 2 x i32> [[DOTSPLATINSERT]], <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP8:%.*]] = call <vscale x 2 x i32> @llvm.stepvector.nxv2i32()
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll
index 6ecaff0..cb4fd04 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll
@@ -512,9 +512,9 @@ define void @fadd_strict_interleave(ptr noalias nocapture readonly %a, ptr noali
 ; CHECK-UNORDERED-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 4
 ; CHECK-UNORDERED-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP2]], [[TMP6]]
 ; CHECK-UNORDERED-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP2]], [[N_MOD_VF]]
-; CHECK-UNORDERED-NEXT:    [[IND_END:%.*]] = mul i64 [[N_VEC]], 2
 ; CHECK-UNORDERED-NEXT:    [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-UNORDERED-NEXT:    [[TMP8:%.*]] = mul i64 [[TMP7]], 4
+; CHECK-UNORDERED-NEXT:    [[IND_END:%.*]] = mul i64 [[N_VEC]], 2
 ; CHECK-UNORDERED-NEXT:    [[TMP9:%.*]] = insertelement <vscale x 4 x float> splat (float -0.000000e+00), float [[A2]], i32 0
 ; CHECK-UNORDERED-NEXT:    [[TMP10:%.*]] = insertelement <vscale x 4 x float> splat (float -0.000000e+00), float [[A1]], i32 0
 ; CHECK-UNORDERED-NEXT:    br label [[VECTOR_BODY:%.*]]
@@ -540,9 +540,9 @@ define void @fadd_strict_interleave(ptr noalias nocapture readonly %a, ptr noali
 ; CHECK-UNORDERED-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
 ; CHECK-UNORDERED-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
 ; CHECK-UNORDERED:       scalar.ph:
-; CHECK-UNORDERED-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
 ; CHECK-UNORDERED-NEXT:    [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP18]], [[MIDDLE_BLOCK]] ], [ [[A2]], [[ENTRY]] ]
 ; CHECK-UNORDERED-NEXT:    [[BC_MERGE_RDX2:%.*]] = phi float [ [[TMP19]], [[MIDDLE_BLOCK]] ], [ [[A1]], [[ENTRY]] ]
+; CHECK-UNORDERED-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
 ; CHECK-UNORDERED-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK-UNORDERED:       for.body:
 ; CHECK-UNORDERED-NEXT:    [[ADD_PHI1:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD2:%.*]], [[FOR_BODY]] ]
@@ -583,9 +583,9 @@ define void @fadd_strict_interleave(ptr noalias nocapture readonly %a, ptr noali
 ; CHECK-ORDERED-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 4
 ; CHECK-ORDERED-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP2]], [[TMP6]]
 ; CHECK-ORDERED-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP2]], [[N_MOD_VF]]
-; CHECK-ORDERED-NEXT:    [[IND_END:%.*]] = mul i64 [[N_VEC]], 2
 ; CHECK-ORDERED-NEXT:    [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-ORDERED-NEXT:    [[TMP8:%.*]] = mul i64 [[TMP7]], 4
+; CHECK-ORDERED-NEXT:    [[IND_END:%.*]] = mul i64 [[N_VEC]], 2
 ; CHECK-ORDERED-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK-ORDERED:       vector.body:
 ; CHECK-ORDERED-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -607,9 +607,9 @@ define void @fadd_strict_interleave(ptr noalias nocapture readonly %a, ptr noali
 ; CHECK-ORDERED-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
 ; CHECK-ORDERED-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
 ; CHECK-ORDERED:       scalar.ph:
-; CHECK-ORDERED-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
 ; CHECK-ORDERED-NEXT:    [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP13]], [[MIDDLE_BLOCK]] ], [ [[A2]], [[ENTRY]] ]
 ; CHECK-ORDERED-NEXT:    [[BC_MERGE_RDX2:%.*]] = phi float [ [[TMP14]], [[MIDDLE_BLOCK]] ], [ [[A1]], [[ENTRY]] ]
+; CHECK-ORDERED-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
 ; CHECK-ORDERED-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK-ORDERED:       for.body:
 ; CHECK-ORDERED-NEXT:    [[ADD_PHI1:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD2:%.*]], [[FOR_BODY]] ]
@@ -649,9 +649,9 @@ define void @fadd_strict_interleave(ptr noalias nocapture readonly %a, ptr noali
 ; CHECK-ORDERED-TF-NEXT:    [[N_RND_UP:%.*]] = add i64 [[TMP2]], [[TMP5]]
 ; CHECK-ORDERED-TF-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP4]]
 ; CHECK-ORDERED-TF-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
-; CHECK-ORDERED-TF-NEXT:    [[IND_END:%.*]] = mul i64 [[N_VEC]], 2
 ; CHECK-ORDERED-TF-NEXT:    [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-ORDERED-TF-NEXT:    [[TMP7:%.*]] = mul i64 [[TMP6]], 4
+; CHECK-ORDERED-TF-NEXT:    [[IND_END:%.*]] = mul i64 [[N_VEC]], 2
 ; CHECK-ORDERED-TF-NEXT:    [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-ORDERED-TF-NEXT:    [[TMP9:%.*]] = mul i64 [[TMP8]], 4
 ; CHECK-ORDERED-TF-NEXT:    [[TMP10:%.*]] = sub i64 [[TMP2]], [[TMP9]]
@@ -684,9 +684,9 @@ define void @fadd_strict_interleave(ptr noalias nocapture readonly %a, ptr noali
 ; CHECK-ORDERED-TF:       middle.block:
 ; CHECK-ORDERED-TF-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
 ; CHECK-ORDERED-TF:       scalar.ph:
-; CHECK-ORDERED-TF-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
 ; CHECK-ORDERED-TF-NEXT:    [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP18]], [[MIDDLE_BLOCK]] ], [ [[A2]], [[ENTRY]] ]
 ; CHECK-ORDERED-TF-NEXT:    [[BC_MERGE_RDX2:%.*]] = phi float [ [[TMP20]], [[MIDDLE_BLOCK]] ], [ [[A1]], [[ENTRY]] ]
+; CHECK-ORDERED-TF-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
 ; CHECK-ORDERED-TF-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK-ORDERED-TF:       for.body:
 ; CHECK-ORDERED-TF-NEXT:    [[ADD_PHI1:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD2:%.*]], [[FOR_BODY]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect.ll
index 7d058a6..295c065 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect.ll
@@ -380,9 +380,9 @@ define void @test_pr57912_pointer_induction(ptr %start) #0 {
 ; CHECK-NEXT:    [[TMP24:%.*]] = mul i64 [[TMP23]], 8
 ; CHECK-NEXT:    [[N_MOD_VF2:%.*]] = urem i64 10000, [[TMP24]]
 ; CHECK-NEXT:    [[N_VEC3:%.*]] = sub i64 10000, [[N_MOD_VF2]]
-; CHECK-NEXT:    [[IND_END:%.*]] = getelementptr i8, ptr [[START]], i64 [[N_VEC3]]
 ; CHECK-NEXT:    [[TMP25:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP26:%.*]] = mul i64 [[TMP25]], 8
+; CHECK-NEXT:    [[IND_END:%.*]] = getelementptr i8, ptr [[START]], i64 [[N_VEC3]]
 ; CHECK-NEXT:    br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
 ; CHECK:       vec.epilog.vector.body:
 ; CHECK-NEXT:    [[INDEX7:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT8:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-inductions-unusual-types.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-inductions-unusual-types.ll
index 5f09431..90ef2da 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-inductions-unusual-types.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-inductions-unusual-types.ll
@@ -16,10 +16,10 @@ define void @induction_i7(ptr %dst) #0 {
 ; CHECK-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 64, [[TMP3]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 64, [[N_MOD_VF]]
-; CHECK-NEXT:    [[IND_END:%.*]] = trunc i64 [[N_VEC]] to i7
 ; CHECK-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP40:%.*]] = mul i64 [[TMP4]], 2
 ; CHECK-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP40]], 2
+; CHECK-NEXT:    [[IND_END:%.*]] = trunc i64 [[N_VEC]] to i7
 ; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP40]], i64 0
 ; CHECK-NEXT:    [[DOTSPLAT_:%.*]] = shufflevector <vscale x 2 x i64> [[DOTSPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
 ; CHECK-NEXT:    [[DOTSPLAT:%.*]] = trunc <vscale x 2 x i64> [[DOTSPLAT_]] to <vscale x 2 x i7>
@@ -72,7 +72,7 @@ for.end:                                          ; preds = %for.body
 
 ; DEBUG: Cost of Invalid for VF vscale x 1: induction instruction   %indvars.iv.next1295 = add i3 %indvars.iv1294, 1
 ; DEBUG: Cost of Invalid for VF vscale x 1: induction instruction   %indvars.iv1294 = phi i3 [ %indvars.iv.next1295, %for.body ], [ 0, %entry ]
-; DEBUG: Cost of Invalid for VF vscale x 1: WIDEN-CAST ir<%zexti3> = zext  ir<%indvars.iv1294> to i64
+; DEBUG: Cost of Invalid for VF vscale x 1: WIDEN-CAST ir<%zexti3> = zext ir<%indvars.iv1294> to i64
 
 define void @induction_i3_zext(ptr %dst) #0 {
 ; CHECK-LABEL: define void @induction_i3_zext(
@@ -82,10 +82,10 @@ define void @induction_i3_zext(ptr %dst) #0 {
 ; CHECK-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 64, [[TMP3]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 64, [[N_MOD_VF]]
-; CHECK-NEXT:    [[IND_END:%.*]] = trunc i64 [[N_VEC]] to i3
 ; CHECK-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP40:%.*]] = mul i64 [[TMP4]], 2
 ; CHECK-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP40]], 2
+; CHECK-NEXT:    [[IND_END:%.*]] = trunc i64 [[N_VEC]] to i3
 ; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP40]], i64 0
 ; CHECK-NEXT:    [[DOTSPLAT_:%.*]] = shufflevector <vscale x 2 x i64> [[DOTSPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
 ; CHECK-NEXT:    [[DOTSPLAT:%.*]] = trunc <vscale x 2 x i64> [[DOTSPLAT_]] to <vscale x 2 x i3>
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll
index 0924992..bf95622 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll
@@ -467,9 +467,9 @@ define void @even_load_static_tc(ptr noalias nocapture readonly %A, ptr noalias
 ; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 2
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub nuw nsw i64 512, [[TMP1]]
-; CHECK-NEXT:    [[IND_END:%.*]] = shl nuw nsw i64 [[N_VEC]], 1
 ; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP3:%.*]] = shl nuw nsw i64 [[TMP2]], 2
+; CHECK-NEXT:    [[IND_END:%.*]] = shl nuw nsw i64 [[N_VEC]], 1
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -553,9 +553,9 @@ define void @even_load_dynamic_tc(ptr noalias nocapture readonly %A, ptr noalias
 ; CHECK-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
 ; CHECK-NEXT:    [[TMP9:%.*]] = select i1 [[TMP8]], i64 [[TMP6]], i64 [[N_MOD_VF]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP4]], [[TMP9]]
-; CHECK-NEXT:    [[IND_END:%.*]] = shl i64 [[N_VEC]], 1
 ; CHECK-NEXT:    [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP11:%.*]] = shl nuw nsw i64 [[TMP10]], 2
+; CHECK-NEXT:    [[IND_END:%.*]] = shl i64 [[N_VEC]], 1
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -1248,9 +1248,9 @@ define void @PR27626_4(ptr %a, i32 %x, i32 %y, i32 %z, i64 %n) #1 {
 ; CHECK-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[DOTNEG:%.*]] = mul nsw i64 [[TMP5]], -4
 ; CHECK-NEXT:    [[N_VEC:%.*]] = and i64 [[TMP2]], [[DOTNEG]]
-; CHECK-NEXT:    [[IND_END:%.*]] = shl nuw i64 [[N_VEC]], 1
 ; CHECK-NEXT:    [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP7:%.*]] = shl nuw nsw i64 [[TMP6]], 2
+; CHECK-NEXT:    [[IND_END:%.*]] = shl nuw i64 [[N_VEC]], 1
 ; CHECK-NEXT:    [[TMP8:%.*]] = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
 ; CHECK-NEXT:    [[TMP9:%.*]] = shl <vscale x 4 x i64> [[TMP8]], splat (i64 1)
 ; CHECK-NEXT:    [[TMP11:%.*]] = shl nuw nsw i64 [[TMP6]], 3
@@ -1339,10 +1339,10 @@ define void @PR27626_5(ptr %a, i32 %x, i32 %y, i32 %z, i64 %n) #1 {
 ; CHECK-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[DOTNEG:%.*]] = mul nsw i64 [[TMP5]], -4
 ; CHECK-NEXT:    [[N_VEC:%.*]] = and i64 [[TMP2]], [[DOTNEG]]
-; CHECK-NEXT:    [[TMP6:%.*]] = shl nuw i64 [[N_VEC]], 1
-; CHECK-NEXT:    [[IND_END:%.*]] = or disjoint i64 [[TMP6]], 3
 ; CHECK-NEXT:    [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP8:%.*]] = shl nuw nsw i64 [[TMP7]], 2
+; CHECK-NEXT:    [[TMP11:%.*]] = shl nuw i64 [[N_VEC]], 1
+; CHECK-NEXT:    [[IND_END:%.*]] = or disjoint i64 [[TMP11]], 3
 ; CHECK-NEXT:    [[TMP9:%.*]] = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
 ; CHECK-NEXT:    [[TMP10:%.*]] = shl <vscale x 4 x i64> [[TMP9]], splat (i64 1)
 ; CHECK-NEXT:    [[INDUCTION:%.*]] = add <vscale x 4 x i64> [[TMP10]], splat (i64 3)
@@ -1449,9 +1449,9 @@ define void @PR34743(ptr %a, ptr %b, i64 %n) #1 {
 ; CHECK-NEXT:    [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[DOTNEG:%.*]] = mul nsw i64 [[TMP8]], -4
 ; CHECK-NEXT:    [[N_VEC:%.*]] = and i64 [[TMP1]], [[DOTNEG]]
-; CHECK-NEXT:    [[IND_END:%.*]] = shl i64 [[N_VEC]], 1
 ; CHECK-NEXT:    [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP10:%.*]] = shl nuw nsw i64 [[TMP9]], 2
+; CHECK-NEXT:    [[IND_END:%.*]] = shl i64 [[N_VEC]], 1
 ; CHECK-NEXT:    [[TMP11:%.*]] = call i32 @llvm.vscale.i32()
 ; CHECK-NEXT:    [[TMP12:%.*]] = shl nuw nsw i32 [[TMP11]], 2
 ; CHECK-NEXT:    [[TMP13:%.*]] = add nsw i32 [[TMP12]], -1
@@ -1492,9 +1492,9 @@ define void @PR34743(ptr %a, ptr %b, i64 %n) #1 {
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP1]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[END:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ [[DOTPRE]], [[VECTOR_MEMCHECK]] ], [ [[DOTPRE]], [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ENTRY]] ]
 ; CHECK-NEXT:    [[BC_RESUME_VAL3:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ENTRY]] ]
-; CHECK-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ [[DOTPRE]], [[VECTOR_MEMCHECK]] ], [ [[DOTPRE]], [[ENTRY]] ]
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
 ; CHECK-NEXT:    [[TMP33:%.*]] = phi i16 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[LOAD2:%.*]], [[LOOP]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-live-out-pointer-induction.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-live-out-pointer-induction.ll
index 64b69be..322f96f 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-live-out-pointer-induction.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-live-out-pointer-induction.ll
@@ -19,12 +19,12 @@ define ptr @test(ptr %start.1, ptr %start.2, ptr %end) {
 ; CHECK-NEXT:    [[TMP7:%.*]] = mul i64 [[TMP6]], 4
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP3]], [[TMP7]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP3]], [[N_MOD_VF]]
+; CHECK-NEXT:    [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP11:%.*]] = mul i64 [[TMP10]], 4
 ; CHECK-NEXT:    [[TMP8:%.*]] = mul i64 [[N_VEC]], 8
 ; CHECK-NEXT:    [[IND_END:%.*]] = getelementptr i8, ptr [[START_1:%.*]], i64 [[TMP8]]
 ; CHECK-NEXT:    [[TMP9:%.*]] = mul i64 [[N_VEC]], 8
 ; CHECK-NEXT:    [[IND_END3:%.*]] = getelementptr i8, ptr [[START_2]], i64 [[TMP9]]
-; CHECK-NEXT:    [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP11:%.*]] = mul i64 [[TMP10]], 4
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-forced.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-forced.ll
index d81cfbf..1f7d0b7 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-forced.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-forced.ll
@@ -11,7 +11,7 @@ target triple = "aarch64-unknown-linux-gnu"
 ; VPLANS-LABEL: Checking a loop in 'simple_memset'
 ; VPLANS:      VPlan 'Initial VPlan for VF={vscale x 1,vscale x 2,vscale x 4},UF>=1' {
 ; VPLANS-NEXT: Live-in vp<[[VFxUF:%.+]]> = VF * UF
-; VPLANS-NEXT: vp<[[TC:%[0-9]+]]> = original trip-count
+; VPLANS:      vp<[[TC:%[0-9]+]]> = original trip-count
 ; VPLANS-EMPTY:
 ; VPLANS-NEXT: ir-bb<entry>:
 ; VPLANS-NEXT:  EMIT vp<[[TC]]> = EXPAND SCEV (1 umax %n)
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding.ll
index e5b9812..75b2df9 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding.ll
@@ -215,9 +215,9 @@ define void @copy_stride4(ptr noalias %dst, ptr noalias %src, i64 %n) #0 {
 ; CHECK-NEXT:    [[N_RND_UP:%.*]] = add i64 [[TMP2]], [[TMP5]]
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP4]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
-; CHECK-NEXT:    [[IND_END:%.*]] = mul i64 [[N_VEC]], 4
 ; CHECK-NEXT:    [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP7:%.*]] = mul i64 [[TMP6]], 4
+; CHECK-NEXT:    [[IND_END:%.*]] = mul i64 [[N_VEC]], 4
 ; CHECK-NEXT:    [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP9:%.*]] = mul i64 [[TMP8]], 4
 ; CHECK-NEXT:    [[TMP10:%.*]] = sub i64 [[TMP2]], [[TMP9]]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-gep.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-gep.ll
index de15004..603bd98 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-gep.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-gep.ll
@@ -17,6 +17,8 @@ target triple = "aarch64-unknown-linux-gnu"
 ; CHECK-NEXT: Successor(s): vector.ph
 ; CHECK-EMPTY:
 ; CHECK-NEXT: vector.ph:
+; CHECK-NEXT:  vp<[[END1:%.+]]> = DERIVED-IV ir<%start.1> + vp<[[VEC_TC]]> * ir<8>
+; CHECK-NEXT:  vp<[[END2:%.+]]> = DERIVED-IV ir<%start.2> + vp<[[VEC_TC]]> * ir<1>
 ; CHECK-NEXT: Successor(s): vector loop
 ; CHECK-EMPTY:
 ; CHECK-NEXT:   <x1> vector loop: {
@@ -55,11 +57,12 @@ define void @pointer_induction_used_as_vector(ptr noalias %start.1, ptr noalias
 ; CHECK-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 2
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; CHECK-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 2
 ; CHECK-NEXT:    [[TMP4:%.*]] = mul i64 [[N_VEC]], 8
 ; CHECK-NEXT:    [[IND_END:%.*]] = getelementptr i8, ptr [[START_1:%.*]], i64 [[TMP4]]
 ; CHECK-NEXT:    [[IND_END2:%.*]] = getelementptr i8, ptr [[START_2:%.*]], i64 [[N_VEC]]
-; CHECK-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 2
+
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[POINTER_PHI:%.*]] = phi ptr [ [[START_2]], [[VECTOR_PH]] ], [ [[PTR_IND:%.*]], [[VECTOR_BODY]] ]
@@ -150,9 +153,9 @@ define void @pointer_induction(ptr noalias %start, i64 %N) {
 ; CHECK-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 2
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], [[TMP4]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]]
-; CHECK-NEXT:    [[IND_END:%.*]] = getelementptr i8, ptr [[START:%.*]], i64 [[N_VEC]]
 ; CHECK-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 2
+; CHECK-NEXT:    [[IND_END:%.*]] = getelementptr i8, ptr [[START:%.*]], i64 [[N_VEC]]
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[POINTER_PHI:%.*]] = phi ptr [ [[START]], [[VECTOR_PH]] ], [ [[PTR_IND:%.*]], [[VECTOR_BODY]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-phi.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-phi.ll
index 4b096e1..881de8d 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-phi.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-phi.ll
@@ -25,10 +25,10 @@ define void @widen_ptr_phi_unrolled(ptr noalias nocapture %a, ptr noalias nocapt
 ; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[DOTNEG:%.*]] = mul nsw i64 [[TMP2]], -8
 ; CHECK-NEXT:    [[N_VEC:%.*]] = and i64 [[N]], [[DOTNEG]]
-; CHECK-NEXT:    [[TMP3:%.*]] = shl i64 [[N_VEC]], 3
-; CHECK-NEXT:    [[IND_END:%.*]] = getelementptr i8, ptr [[C:%.*]], i64 [[TMP3]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP5:%.*]] = shl nuw nsw i64 [[TMP4]], 3
+; CHECK-NEXT:    [[TMP26:%.*]] = shl i64 [[N_VEC]], 3
+; CHECK-NEXT:    [[IND_END:%.*]] = getelementptr i8, ptr [[C:%.*]], i64 [[TMP26]]
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -138,12 +138,12 @@ define void @widen_2ptrs_phi_unrolled(ptr noalias nocapture %dst, ptr noalias no
 ; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[DOTNEG:%.*]] = mul nsw i64 [[TMP2]], -8
 ; CHECK-NEXT:    [[N_VEC:%.*]] = and i64 [[N]], [[DOTNEG]]
+; CHECK-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP6:%.*]] = shl nuw nsw i64 [[TMP5]], 3
 ; CHECK-NEXT:    [[TMP3:%.*]] = shl i64 [[N_VEC]], 2
 ; CHECK-NEXT:    [[IND_END:%.*]] = getelementptr i8, ptr [[SRC:%.*]], i64 [[TMP3]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = shl i64 [[N_VEC]], 2
 ; CHECK-NEXT:    [[IND_END2:%.*]] = getelementptr i8, ptr [[DST:%.*]], i64 [[TMP4]]
-; CHECK-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP6:%.*]] = shl nuw nsw i64 [[TMP5]], 3
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -229,12 +229,12 @@ define i32 @pointer_iv_mixed(ptr noalias %a, ptr noalias %b, i64 %n) #0 {
 ; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[DOTNEG:%.*]] = mul nsw i64 [[TMP2]], -2
 ; CHECK-NEXT:    [[N_VEC:%.*]] = and i64 [[SMAX]], [[DOTNEG]]
+; CHECK-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP6:%.*]] = shl nuw nsw i64 [[TMP5]], 1
 ; CHECK-NEXT:    [[TMP3:%.*]] = shl i64 [[N_VEC]], 2
 ; CHECK-NEXT:    [[IND_END:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 [[TMP3]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = shl i64 [[N_VEC]], 3
 ; CHECK-NEXT:    [[IND_END2:%.*]] = getelementptr i8, ptr [[B:%.*]], i64 [[TMP4]]
-; CHECK-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP6:%.*]] = shl nuw nsw i64 [[TMP5]], 1
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[POINTER_PHI:%.*]] = phi ptr [ [[A]], [[VECTOR_PH]] ], [ [[PTR_IND:%.*]], [[VECTOR_BODY]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve2-histcnt-vplan.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve2-histcnt-vplan.ll
index 3ef99ff..c119248 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve2-histcnt-vplan.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve2-histcnt-vplan.ll
@@ -45,10 +45,11 @@ target triple = "aarch64-unknown-linux-gnu"
 ; CHECK-NEXT: Successor(s): ir-bb<for.exit>, scalar.ph
 ; CHECK-EMPTY:
 ; CHECK-NEXT: scalar.ph:
+; CHECK-NEXT:   EMIT vp<[[RESUME:%.+]]> = resume-phi [[VTC]], ir<0>
 ; CHECK-NEXT: Successor(s): ir-bb<for.body>
 ; CHECK-EMPTY:
 ; CHECK-NEXT: ir-bb<for.body>:
-; CHECK-NEXT:   IR   %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+; CHECK-NEXT:   IR   %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] (extra operand: vp<[[RESUME]]> from scalar.ph)
 ; CHECK:        IR   %exitcond = icmp eq i64 %iv.next, %N
 ; CHECK-NEXT: No successors
 ; CHECK-EMPTY:
@@ -75,7 +76,7 @@ target triple = "aarch64-unknown-linux-gnu"
 ; CHECK-NEXT:     CLONE [[GEP_IDX:.*]] = getelementptr inbounds ir<%indices>, [[STEPS]]
 ; CHECK-NEXT:     [[VECP_IDX:vp.*]] = vector-pointer [[GEP_IDX]]
 ; CHECK-NEXT:     WIDEN [[IDX:.*]] = load [[VECP_IDX]]
-; CHECK-NEXT:     WIDEN-CAST [[EXT_IDX:.*]] = zext  [[IDX]] to i64
+; CHECK-NEXT:     WIDEN-CAST [[EXT_IDX:.*]] = zext [[IDX]] to i64
 ; CHECK-NEXT:     WIDEN-GEP Inv[Var] [[GEP_BUCKET:.*]] = getelementptr inbounds ir<%buckets>, [[EXT_IDX]]
 ; CHECK-NEXT:     WIDEN-HISTOGRAM buckets: [[GEP_BUCKET]], inc: ir<1>
 ; CHECK-NEXT:     EMIT [[IV_NEXT]] = add nuw [[IV]], [[VFxUF]]
@@ -90,10 +91,11 @@ target triple = "aarch64-unknown-linux-gnu"
 ; CHECK-NEXT: Successor(s): ir-bb<for.exit>, scalar.ph
 ; CHECK-EMPTY:
 ; CHECK-NEXT: scalar.ph:
+; CHECK-NEXT:   EMIT vp<[[RESUME:%.+]]> = resume-phi [[VTC]], ir<0>
 ; CHECK-NEXT: Successor(s): ir-bb<for.body>
 ; CHECK-EMPTY:
 ; CHECK-NEXT: ir-bb<for.body>:
-; CHECK-NEXT:   IR   %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+; CHECK-NEXT:   IR   %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] (extra operand: vp<[[RESUME]]> from scalar.ph)
 ; CHECK:        IR   %exitcond = icmp eq i64 %iv.next, %N
 ; CHECK-NEXT: No successors
 ; CHECK-EMPTY:
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/synthesize-mask-for-call.ll b/llvm/test/Transforms/LoopVectorize/AArch64/synthesize-mask-for-call.ll
index 8ac46fe..0c246c6 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/synthesize-mask-for-call.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/synthesize-mask-for-call.ll
@@ -43,10 +43,11 @@ target triple = "aarch64-unknown-linux-gnu"
 ; CHECK-NEXT: Successor(s): ir-bb<for.cond.cleanup>, scalar.ph
 ; CHECK-EMPTY:
 ; CHECK-NEXT: scalar.ph:
+; CHECK-NEXT:  EMIT vp<[[RESUME:%.+]]> = resume-phi vp<[[VTC]]>, ir<0>
 ; CHECK-NEXT: Successor(s): ir-bb<for.body>
 ; CHECK-EMPTY:
 ; CHECK-NEXT: ir-bb<for.body>:
-; CHECK-NEXT:   IR   %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+; CHECK-NEXT:   IR   %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] (extra operand: vp<[[RESUME]]> from scalar.ph)
 ; CHECK:        IR   %exitcond = icmp eq i64 %indvars.iv.next, 1024
 ; CHECK-NEXT: No successors
 ; CHECK-EMPTY:
@@ -88,10 +89,11 @@ target triple = "aarch64-unknown-linux-gnu"
 ; CHECK-NEXT: Successor(s): ir-bb<for.cond.cleanup>, scalar.ph
 ; CHECK-EMPTY:
 ; CHECK-NEXT: scalar.ph:
+; CHECK-NEXT:  EMIT vp<[[RESUME:%.+]]> = resume-phi vp<[[VTC]]>, ir<0>
 ; CHECK-NEXT: Successor(s): ir-bb<for.body>
 ; CHECK-EMPTY:
 ; CHECK-NEXT: ir-bb<for.body>:
-; CHECK-NEXT:   IR   %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+; CHECK-NEXT:   IR   %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] (extra operand: vp<[[RESUME]]> from scalar.ph)
 ; CHECK:        IR   %exitcond = icmp eq i64 %indvars.iv.next, 1024
 ; CHECK-NEXT: No successors
 ; CHECK-EMPTY:
@@ -138,6 +140,7 @@ target triple = "aarch64-unknown-linux-gnu"
 ; CHECK-NEXT: Successor(s): ir-bb<for.cond.cleanup>, scalar.ph
 ; CHECK-EMPTY:
 ; CHECK-NEXT: scalar.ph:
+; CHECK-NEXT:  EMIT vp<[[RESUME:%.+]]> = resume-phi vp<[[VTC]]>, ir<0>
 ; CHECK-NEXT: Successor(s): ir-bb<for.body>
 ; CHECK-EMPTY:
 ; CHECK-NEXT: ir-bb<for.body>:
@@ -183,10 +186,11 @@ target triple = "aarch64-unknown-linux-gnu"
 ; CHECK-NEXT: Successor(s): ir-bb<for.cond.cleanup>, scalar.ph
 ; CHECK-EMPTY:
 ; CHECK-NEXT: scalar.ph:
+; CHECK-NEXT:  EMIT vp<[[RESUME:%.+]]> = resume-phi vp<[[VTC]]>, ir<0>
 ; CHECK-NEXT: Successor(s): ir-bb<for.body>
 ; CHECK-EMPTY:
 ; CHECK-NEXT: ir-bb<for.body>:
-; CHECK-NEXT:   IR   %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+; CHECK-NEXT:   IR   %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] (extra operand: vp<[[RESUME]]> from scalar.ph)
 ; CHECK:        IR   %exitcond = icmp eq i64 %indvars.iv.next, 1024
 ; CHECK-NEXT: No successors
 ; CHECK-EMPTY:
@@ -232,6 +236,7 @@ target triple = "aarch64-unknown-linux-gnu"
 ; CHECK-NEXT: Successor(s): ir-bb<for.cond.cleanup>, scalar.ph
 ; CHECK-EMPTY:
 ; CHECK-NEXT: scalar.ph:
+; CHECK-NEXT:  EMIT vp<[[RESUME:%.+]]> = resume-phi vp<[[VTC]]>, ir<0>
 ; CHECK-NEXT: Successor(s): ir-bb<for.body>
 ; CHECK-EMPTY:
 ; CHECK-NEXT: ir-bb<for.body>:
@@ -277,10 +282,11 @@ target triple = "aarch64-unknown-linux-gnu"
 ; CHECK-NEXT: Successor(s): ir-bb<for.cond.cleanup>, scalar.ph
 ; CHECK-EMPTY:
 ; CHECK-NEXT: scalar.ph:
+; CHECK-NEXT:  EMIT vp<[[RESUME:%.+]]> = resume-phi vp<[[VTC]]>, ir<0>
 ; CHECK-NEXT: Successor(s): ir-bb<for.body>
 ; CHECK-EMPTY:
 ; CHECK-NEXT: ir-bb<for.body>:
-; CHECK-NEXT:   IR   %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+; CHECK-NEXT:   IR   %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] (extra operand: vp<[[RESUME]]> from scalar.ph)
 ; CHECK:        IR   %exitcond = icmp eq i64 %indvars.iv.next, 1024
 ; CHECK-NEXT: No successors
 ; CHECK-EMPTY:
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/type-shrinkage-zext-costs.ll b/llvm/test/Transforms/LoopVectorize/AArch64/type-shrinkage-zext-costs.ll
index bd2e5dc..0bc3ea9 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/type-shrinkage-zext-costs.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/type-shrinkage-zext-costs.ll
@@ -8,14 +8,14 @@ target triple = "aarch64-unknown-linux-gnu"
 
 define void @zext_i8_i16(ptr noalias nocapture readonly %p, ptr noalias nocapture %q, i32 %len) #0 {
 ; CHECK-COST-LABEL: LV: Checking a loop in 'zext_i8_i16'
-; CHECK-COST: Cost of 1 for VF 2: WIDEN-CAST ir<%conv> = zext  ir<%0> to i16
-; CHECK-COST: Cost of 1 for VF 4: WIDEN-CAST ir<%conv> = zext  ir<%0> to i16
-; CHECK-COST: Cost of 1 for VF 8: WIDEN-CAST ir<%conv> = zext  ir<%0> to i16
-; CHECK-COST: Cost of 2 for VF 16: WIDEN-CAST ir<%conv> = zext  ir<%0> to i16
-; CHECK-COST: Cost of 1 for VF vscale x 1: WIDEN-CAST ir<%conv> = zext  ir<%0> to i16
-; CHECK-COST: Cost of 1 for VF vscale x 2: WIDEN-CAST ir<%conv> = zext  ir<%0> to i16
-; CHECK-COST: Cost of 1 for VF vscale x 4: WIDEN-CAST ir<%conv> = zext  ir<%0> to i16
-; CHECK-COST: Cost of 0 for VF vscale x 8: WIDEN-CAST ir<%conv> = zext  ir<%0> to i16
+; CHECK-COST: Cost of 1 for VF 2: WIDEN-CAST ir<%conv> = zext ir<%0> to i16
+; CHECK-COST: Cost of 1 for VF 4: WIDEN-CAST ir<%conv> = zext ir<%0> to i16
+; CHECK-COST: Cost of 1 for VF 8: WIDEN-CAST ir<%conv> = zext ir<%0> to i16
+; CHECK-COST: Cost of 2 for VF 16: WIDEN-CAST ir<%conv> = zext ir<%0> to i16
+; CHECK-COST: Cost of 1 for VF vscale x 1: WIDEN-CAST ir<%conv> = zext ir<%0> to i16
+; CHECK-COST: Cost of 1 for VF vscale x 2: WIDEN-CAST ir<%conv> = zext ir<%0> to i16
+; CHECK-COST: Cost of 1 for VF vscale x 4: WIDEN-CAST ir<%conv> = zext ir<%0> to i16
+; CHECK-COST: Cost of 0 for VF vscale x 8: WIDEN-CAST ir<%conv> = zext ir<%0> to i16
 ; CHECK-COST: LV: Found an estimated cost of 0 for VF 1 For instruction:   %conv = zext i8 %0 to i32
 ; CHECK-LABEL: define void @zext_i8_i16
 ; CHECK-SAME: (ptr noalias nocapture readonly [[P:%.*]], ptr noalias nocapture [[Q:%.*]], i32 [[LEN:%.*]]) #[[ATTR0:[0-9]+]] {
@@ -85,14 +85,14 @@ exit:                                 ; preds = %for.body
 
 define void @sext_i8_i16(ptr noalias nocapture readonly %p, ptr noalias nocapture %q, i32 %len) #0 {
 ; CHECK-COST-LABEL: LV: Checking a loop in 'sext_i8_i16'
-; CHECK-COST: Cost of 1 for VF 2: WIDEN-CAST ir<%conv> = sext  ir<%0> to i16
-; CHECK-COST: Cost of 1 for VF 4: WIDEN-CAST ir<%conv> = sext  ir<%0> to i16
-; CHECK-COST: Cost of 1 for VF 8: WIDEN-CAST ir<%conv> = sext  ir<%0> to i16
-; CHECK-COST: Cost of 2 for VF 16: WIDEN-CAST ir<%conv> = sext  ir<%0> to i16
-; CHECK-COST: Cost of 1 for VF vscale x 1: WIDEN-CAST ir<%conv> = sext  ir<%0> to i16
-; CHECK-COST: Cost of 1 for VF vscale x 2: WIDEN-CAST ir<%conv> = sext  ir<%0> to i16
-; CHECK-COST: Cost of 1 for VF vscale x 4: WIDEN-CAST ir<%conv> = sext  ir<%0> to i16
-; CHECK-COST: Cost of 0 for VF vscale x 8: WIDEN-CAST ir<%conv> = sext  ir<%0> to i16
+; CHECK-COST: Cost of 1 for VF 2: WIDEN-CAST ir<%conv> = sext ir<%0> to i16
+; CHECK-COST: Cost of 1 for VF 4: WIDEN-CAST ir<%conv> = sext ir<%0> to i16
+; CHECK-COST: Cost of 1 for VF 8: WIDEN-CAST ir<%conv> = sext ir<%0> to i16
+; CHECK-COST: Cost of 2 for VF 16: WIDEN-CAST ir<%conv> = sext ir<%0> to i16
+; CHECK-COST: Cost of 1 for VF vscale x 1: WIDEN-CAST ir<%conv> = sext ir<%0> to i16
+; CHECK-COST: Cost of 1 for VF vscale x 2: WIDEN-CAST ir<%conv> = sext ir<%0> to i16
+; CHECK-COST: Cost of 1 for VF vscale x 4: WIDEN-CAST ir<%conv> = sext ir<%0> to i16
+; CHECK-COST: Cost of 0 for VF vscale x 8: WIDEN-CAST ir<%conv> = sext ir<%0> to i16
 ; CHECK-LABEL: define void @sext_i8_i16
 ; CHECK-SAME: (ptr noalias nocapture readonly [[P:%.*]], ptr noalias nocapture [[Q:%.*]], i32 [[LEN:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:  entry:
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/widen-call-with-intrinsic-or-libfunc.ll b/llvm/test/Transforms/LoopVectorize/AArch64/widen-call-with-intrinsic-or-libfunc.ll
index 648f6e8..a119707 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/widen-call-with-intrinsic-or-libfunc.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/widen-call-with-intrinsic-or-libfunc.ll
@@ -41,10 +41,11 @@ target triple = "arm64-apple-ios"
 ; CHECK-NEXT: Successor(s): ir-bb<exit>, scalar.ph
 ; CHECK-EMPTY:
 ; CHECK-NEXT: scalar.ph:
+; CHECK-NEXT:   EMIT vp<[[RESUME:%.+]]> = resume-phi vp<[[VTC]]>, ir<0>
 ; CHECK-NEXT: Successor(s): ir-bb<loop>
 ; CHECK-EMPTY:
 ; CHECK-NEXT: ir-bb<loop>:
-; CHECK-NEXT:   IR   %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+; CHECK-NEXT:   IR   %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] (extra operand: vp<[[RESUME]]> from scalar.ph)
 ; CHECK:        IR   %cmp = icmp ne i64 %iv.next, 1024
 ; CHECK-NEXT: No successors
 ; CHECK-EMPTY:
@@ -86,10 +87,11 @@ target triple = "arm64-apple-ios"
 ; CHECK-NEXT: Successor(s): ir-bb<exit>, scalar.ph
 ; CHECK-EMPTY:
 ; CHECK-NEXT: scalar.ph:
+; CHECK-NEXT:   EMIT vp<[[RESUME:%.+]]> = resume-phi vp<[[VTC]]>, ir<0>
 ; CHECK-NEXT: Successor(s): ir-bb<loop>
 ; CHECK-EMPTY:
 ; CHECK-NEXT: ir-bb<loop>:
-; CHECK-NEXT:   IR   %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+; CHECK-NEXT:   IR   %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] (extra operand: vp<[[RESUME]]> from scalar.ph)
 ; CHECK:        IR   %cmp = icmp ne i64 %iv.next, 1024
 ; CHECK-NEXT: No successors
 ; CHECK-EMPTY:
diff --git a/llvm/test/Transforms/LoopVectorize/ARM/mve-icmpcost.ll b/llvm/test/Transforms/LoopVectorize/ARM/mve-icmpcost.ll
index 7b18e5c..b229103 100644
--- a/llvm/test/Transforms/LoopVectorize/ARM/mve-icmpcost.ll
+++ b/llvm/test/Transforms/LoopVectorize/ARM/mve-icmpcost.ll
@@ -27,7 +27,7 @@ target triple = "thumbv8.1m.main-arm-none-eabi"
 ; CHECK: Cost of 0 for VF 2: CLONE ir<%arrayidx> = getelementptr inbounds ir<%s>, vp<%3>
 ; CHECK: Cost of 0 for VF 2: vp<%4> = vector-pointer ir<%arrayidx>
 ; CHECK: Cost of 18 for VF 2: WIDEN ir<%1> = load vp<%4>
-; CHECK: Cost of 4 for VF 2: WIDEN-CAST ir<%conv> = sext  ir<%1> to i32
+; CHECK: Cost of 4 for VF 2: WIDEN-CAST ir<%conv> = sext ir<%1> to i32
 ; CHECK: Cost of 20 for VF 2: WIDEN ir<%cmp2> = icmp sgt ir<%conv>, ir<%conv1>
 ; CHECK: Cost of 26 for VF 2: WIDEN ir<%conv6> = add ir<%1>, ir<%0>
 ; CHECK: Cost of 0 for VF 2: CLONE ir<%arrayidx7> = getelementptr ir<%d>, vp<%3>
@@ -44,7 +44,7 @@ target triple = "thumbv8.1m.main-arm-none-eabi"
 ; CHECK: Cost of 0 for VF 4: CLONE ir<%arrayidx> = getelementptr inbounds ir<%s>, vp<%3>
 ; CHECK: Cost of 0 for VF 4: vp<%4> = vector-pointer ir<%arrayidx>
 ; CHECK: Cost of 2 for VF 4: WIDEN ir<%1> = load vp<%4>
-; CHECK: Cost of 0 for VF 4: WIDEN-CAST ir<%conv> = sext  ir<%1> to i32
+; CHECK: Cost of 0 for VF 4: WIDEN-CAST ir<%conv> = sext ir<%1> to i32
 ; CHECK: Cost of 2 for VF 4: WIDEN ir<%cmp2> = icmp sgt ir<%conv>, ir<%conv1>
 ; CHECK: Cost of 2 for VF 4: WIDEN ir<%conv6> = add ir<%1>, ir<%0>
 ; CHECK: Cost of 0 for VF 4: CLONE ir<%arrayidx7> = getelementptr ir<%d>, vp<%3>
@@ -61,7 +61,7 @@ target triple = "thumbv8.1m.main-arm-none-eabi"
 ; CHECK: Cost of 0 for VF 8: CLONE ir<%arrayidx> = getelementptr inbounds ir<%s>, vp<%3>
 ; CHECK: Cost of 0 for VF 8: vp<%4> = vector-pointer ir<%arrayidx>
 ; CHECK: Cost of 2 for VF 8: WIDEN ir<%1> = load vp<%4>
-; CHECK: Cost of 2 for VF 8: WIDEN-CAST ir<%conv> = sext  ir<%1> to i32
+; CHECK: Cost of 2 for VF 8: WIDEN-CAST ir<%conv> = sext ir<%1> to i32
 ; CHECK: Cost of 36 for VF 8: WIDEN ir<%cmp2> = icmp sgt ir<%conv>, ir<%conv1>
 ; CHECK: Cost of 2 for VF 8: WIDEN ir<%conv6> = add ir<%1>, ir<%0>
 ; CHECK: Cost of 0 for VF 8: CLONE ir<%arrayidx7> = getelementptr ir<%d>, vp<%3>
@@ -144,15 +144,15 @@ for.inc:                                          ; preds = %for.body, %if.then
 ; CHECK: Cost of 0 for VF 2: EMIT vp<%next.gep>.2 = ptradd ir<%pSrcB>, vp<[[STEPS3]]>
 ; CHECK: Cost of 0 for VF 2: vp<[[VEC_PTR:%.+]]> = vector-pointer vp<%next.gep>
 ; CHECK: Cost of 18 for VF 2: WIDEN ir<%0> = load vp<[[VEC_PTR]]>
-; CHECK: Cost of 4 for VF 2: WIDEN-CAST ir<%conv1> = sext  ir<%0> to i32
+; CHECK: Cost of 4 for VF 2: WIDEN-CAST ir<%conv1> = sext ir<%0> to i32
 ; CHECK: Cost of 0 for VF 2: vp<[[VEC_PTR2:%.+]]> = vector-pointer vp<%next.gep>.2
 ; CHECK: Cost of 18 for VF 2: WIDEN ir<%1> = load vp<[[VEC_PTR2]]>
-; CHECK: Cost of 4 for VF 2: WIDEN-CAST ir<%conv3> = sext  ir<%1> to i32
+; CHECK: Cost of 4 for VF 2: WIDEN-CAST ir<%conv3> = sext ir<%1> to i32
 ; CHECK: Cost of 26 for VF 2: WIDEN ir<%mul> = mul nsw ir<%conv3>, ir<%conv1>
 ; CHECK: Cost of 18 for VF 2: WIDEN ir<%shr> = ashr ir<%mul>, ir<7>
 ; CHECK: Cost of 0 for VF 2: WIDEN ir<%2> = icmp slt ir<%shr>, ir<127>
 ; CHECK: Cost of 22 for VF 2: WIDEN-SELECT ir<%spec.select.i> = select ir<%2>, ir<%shr>, ir<127>
-; CHECK: Cost of 0 for VF 2: WIDEN-CAST ir<%conv4> = trunc  ir<%spec.select.i> to i8
+; CHECK: Cost of 0 for VF 2: WIDEN-CAST ir<%conv4> = trunc ir<%spec.select.i> to i8
 ; CHECK: Cost of 0 for VF 2: vp<[[VEC_PTR3:%.+]]> = vector-pointer vp<%next.gep>.1
 ; CHECK: Cost of 18 for VF 2: WIDEN store vp<[[VEC_PTR3]]>, ir<%conv4>
 ; CHECK: Cost of 0 for VF 2: EMIT vp<%index.next> = add nuw vp<[[CAN_IV]]>, vp<%0>
@@ -176,15 +176,15 @@ for.inc:                                          ; preds = %for.body, %if.then
 ; CHECK: Cost of 0 for VF 4: EMIT vp<%next.gep>.2 = ptradd ir<%pSrcB>, vp<[[STEPS3]]>
 ; CHECK: Cost of 0 for VF 4: vp<[[VEC_PTR1:%.+]]> = vector-pointer vp<%next.gep>
 ; CHECK: Cost of 2 for VF 4: WIDEN ir<%0> = load vp<[[VEC_PTR1]]>
-; CHECK: Cost of 0 for VF 4: WIDEN-CAST ir<%conv1> = sext  ir<%0> to i32
+; CHECK: Cost of 0 for VF 4: WIDEN-CAST ir<%conv1> = sext ir<%0> to i32
 ; CHECK: Cost of 0 for VF 4: vp<[[VEC_PTR2:%.+]]> = vector-pointer vp<%next.gep>.2
 ; CHECK: Cost of 2 for VF 4: WIDEN ir<%1> = load vp<[[VEC_PTR2]]>
-; CHECK: Cost of 0 for VF 4: WIDEN-CAST ir<%conv3> = sext  ir<%1> to i32
+; CHECK: Cost of 0 for VF 4: WIDEN-CAST ir<%conv3> = sext ir<%1> to i32
 ; CHECK: Cost of 2 for VF 4: WIDEN ir<%mul> = mul nsw ir<%conv3>, ir<%conv1>
 ; CHECK: Cost of 2 for VF 4: WIDEN ir<%shr> = ashr ir<%mul>, ir<7>
 ; CHECK: Cost of 0 for VF 4: WIDEN ir<%2> = icmp slt ir<%shr>, ir<127>
 ; CHECK: Cost of 2 for VF 4: WIDEN-SELECT ir<%spec.select.i> = select ir<%2>, ir<%shr>, ir<127>
-; CHECK: Cost of 0 for VF 4: WIDEN-CAST ir<%conv4> = trunc  ir<%spec.select.i> to i8
+; CHECK: Cost of 0 for VF 4: WIDEN-CAST ir<%conv4> = trunc ir<%spec.select.i> to i8
 ; CHECK: Cost of 0 for VF 4: vp<[[VEC_PTR2:%.+]]> = vector-pointer vp<%next.gep>.1
 ; CHECK: Cost of 2 for VF 4: WIDEN store vp<[[VEC_PTR2]]>, ir<%conv4>
 ; CHECK: Cost of 0 for VF 4: EMIT vp<%index.next> = add nuw vp<[[CAN_IV]]>, vp<%0>
@@ -208,15 +208,15 @@ for.inc:                                          ; preds = %for.body, %if.then
 ; CHECK: Cost of 0 for VF 8: EMIT vp<%next.gep>.2 = ptradd ir<%pSrcB>, vp<[[STEPS3]]>
 ; CHECK: Cost of 0 for VF 8: vp<[[VEC_PTR1:%.+]]> = vector-pointer vp<%next.gep>
 ; CHECK: Cost of 2 for VF 8: WIDEN ir<%0> = load vp<[[VEC_PTR1]]>
-; CHECK: Cost of 2 for VF 8: WIDEN-CAST ir<%conv1> = sext  ir<%0> to i32
+; CHECK: Cost of 2 for VF 8: WIDEN-CAST ir<%conv1> = sext ir<%0> to i32
 ; CHECK: Cost of 0 for VF 8: vp<[[VEC_PTR2:%.+]]> = vector-pointer vp<%next.gep>.2
 ; CHECK: Cost of 2 for VF 8: WIDEN ir<%1> = load vp<[[VEC_PTR2]]>
-; CHECK: Cost of 2 for VF 8: WIDEN-CAST ir<%conv3> = sext  ir<%1> to i32
+; CHECK: Cost of 2 for VF 8: WIDEN-CAST ir<%conv3> = sext ir<%1> to i32
 ; CHECK: Cost of 4 for VF 8: WIDEN ir<%mul> = mul nsw ir<%conv3>, ir<%conv1>
 ; CHECK: Cost of 4 for VF 8: WIDEN ir<%shr> = ashr ir<%mul>, ir<7>
 ; CHECK: Cost of 0 for VF 8: WIDEN ir<%2> = icmp slt ir<%shr>, ir<127>
 ; CHECK: Cost of 4 for VF 8: WIDEN-SELECT ir<%spec.select.i> = select ir<%2>, ir<%shr>, ir<127>
-; CHECK: Cost of 2 for VF 8: WIDEN-CAST ir<%conv4> = trunc  ir<%spec.select.i> to i8
+; CHECK: Cost of 2 for VF 8: WIDEN-CAST ir<%conv4> = trunc ir<%spec.select.i> to i8
 ; CHECK: Cost of 0 for VF 8: vp<[[VEC_PTR3:%.+]]> = vector-pointer vp<%next.gep>.1
 ; CHECK: Cost of 2 for VF 8: WIDEN store vp<[[VEC_PTR3]]>, ir<%conv4>
 ; CHECK: Cost of 0 for VF 8: EMIT vp<%index.next> = add nuw vp<[[CAN_IV]]>, vp<{{.+}}
@@ -240,15 +240,15 @@ for.inc:                                          ; preds = %for.body, %if.then
 ; CHECK: Cost of 0 for VF 16: EMIT vp<%next.gep>.2 = ptradd ir<%pSrcB>, vp<[[STEPS3]]>
 ; CHECK: Cost of 0 for VF 16: vp<[[VEC_PTR:%.+]]> = vector-pointer vp<%next.gep>
 ; CHECK: Cost of 2 for VF 16: WIDEN ir<%0> = load vp<[[VEC_PTR]]>
-; CHECK: Cost of 6 for VF 16: WIDEN-CAST ir<%conv1> = sext  ir<%0> to i32
+; CHECK: Cost of 6 for VF 16: WIDEN-CAST ir<%conv1> = sext ir<%0> to i32
 ; CHECK: Cost of 0 for VF 16: vp<[[VEC_PTR1:%.+]]> = vector-pointer vp<%next.gep>.2
 ; CHECK: Cost of 2 for VF 16: WIDEN ir<%1> = load vp<[[VEC_PTR1]]>
-; CHECK: Cost of 6 for VF 16: WIDEN-CAST ir<%conv3> = sext  ir<%1> to i32
+; CHECK: Cost of 6 for VF 16: WIDEN-CAST ir<%conv3> = sext ir<%1> to i32
 ; CHECK: Cost of 8 for VF 16: WIDEN ir<%mul> = mul nsw ir<%conv3>, ir<%conv1>
 ; CHECK: Cost of 8 for VF 16: WIDEN ir<%shr> = ashr ir<%mul>, ir<7>
 ; CHECK: Cost of 0 for VF 16: WIDEN ir<%2> = icmp slt ir<%shr>, ir<127>
 ; CHECK: Cost of 8 for VF 16: WIDEN-SELECT ir<%spec.select.i> = select ir<%2>, ir<%shr>, ir<127>
-; CHECK: Cost of 6 for VF 16: WIDEN-CAST ir<%conv4> = trunc  ir<%spec.select.i> to i8
+; CHECK: Cost of 6 for VF 16: WIDEN-CAST ir<%conv4> = trunc ir<%spec.select.i> to i8
 ; CHECK: Cost of 0 for VF 16: vp<[[VEC_PTR2:%.+]]> = vector-pointer vp<%next.gep>.1
 ; CHECK: Cost of 2 for VF 16: WIDEN store vp<[[VEC_PTR2]]>, ir<%conv4>
 ; CHECK: Cost of 0 for VF 16: EMIT vp<%index.next> = add nuw vp<[[CAN_IV]]>, vp<{{.+}}>
diff --git a/llvm/test/Transforms/LoopVectorize/ARM/mve-interleaved-cost.ll b/llvm/test/Transforms/LoopVectorize/ARM/mve-interleaved-cost.ll
index 976c6a9..551b85b 100644
--- a/llvm/test/Transforms/LoopVectorize/ARM/mve-interleaved-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/ARM/mve-interleaved-cost.ll
@@ -17,8 +17,8 @@ entry:
 ; VF_2-LABEL:  Checking a loop in 'i8_factor_2'
 ; VF_2:          Found an estimated cost of 12 for VF 2 For instruction: %tmp2 = load i8, ptr %tmp0, align 1
 ; VF_2-NEXT:     Found an estimated cost of 12 for VF 2 For instruction: %tmp3 = load i8, ptr %tmp1, align 1
-; VF_2-NEXT:     Found an estimated cost of 12 for VF 2 For instruction: store i8 %tmp2, ptr %tmp0, align 1
-; VF_2-NEXT:     Found an estimated cost of 12 for VF 2 For instruction: store i8 %tmp3, ptr %tmp1, align 1
+; VF_2-NEXT:     Found an estimated cost of 4 for VF 2 For instruction: store i8 %tmp2, ptr %tmp0, align 1
+; VF_2-NEXT:     Found an estimated cost of 4 for VF 2 For instruction: store i8 %tmp3, ptr %tmp1, align 1
 ; VF_4-LABEL:  Checking a loop in 'i8_factor_2'
 ; VF_4:          Found an estimated cost of 4 for VF 4 For instruction: %tmp2 = load i8, ptr %tmp0, align 1
 ; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: %tmp3 = load i8, ptr %tmp1, align 1
@@ -58,8 +58,8 @@ entry:
 ; VF_2-LABEL:  Checking a loop in 'i16_factor_2'
 ; VF_2:          Found an estimated cost of 12 for VF 2 For instruction: %tmp2 = load i16, ptr %tmp0, align 2
 ; VF_2-NEXT:     Found an estimated cost of 12 for VF 2 For instruction: %tmp3 = load i16, ptr %tmp1, align 2
-; VF_2-NEXT:     Found an estimated cost of 12 for VF 2 For instruction: store i16 %tmp2, ptr %tmp0, align 2
-; VF_2-NEXT:     Found an estimated cost of 12 for VF 2 For instruction: store i16 %tmp3, ptr %tmp1, align 2
+; VF_2-NEXT:     Found an estimated cost of 4 for VF 2 For instruction: store i16 %tmp2, ptr %tmp0, align 2
+; VF_2-NEXT:     Found an estimated cost of 4 for VF 2 For instruction: store i16 %tmp3, ptr %tmp1, align 2
 ; VF_4-LABEL:  Checking a loop in 'i16_factor_2'
 ; VF_4:          Found an estimated cost of 4 for VF 4 For instruction: %tmp2 = load i16, ptr %tmp0, align 2
 ; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: %tmp3 = load i16, ptr %tmp1, align 2
@@ -99,8 +99,8 @@ entry:
 ; VF_2-LABEL:  Checking a loop in 'i32_factor_2'
 ; VF_2:          Found an estimated cost of 12 for VF 2 For instruction: %tmp2 = load i32, ptr %tmp0, align 4
 ; VF_2-NEXT:     Found an estimated cost of 12  for VF 2 For instruction: %tmp3 = load i32, ptr %tmp1, align 4
-; VF_2-NEXT:     Found an estimated cost of 12 for VF 2 For instruction: store i32 %tmp2, ptr %tmp0, align 4
-; VF_2-NEXT:     Found an estimated cost of 12 for VF 2 For instruction: store i32 %tmp3, ptr %tmp1, align 4
+; VF_2-NEXT:     Found an estimated cost of 4 for VF 2 For instruction: store i32 %tmp2, ptr %tmp0, align 4
+; VF_2-NEXT:     Found an estimated cost of 4 for VF 2 For instruction: store i32 %tmp3, ptr %tmp1, align 4
 ; VF_4-LABEL:  Checking a loop in 'i32_factor_2'
 ; VF_4:          Found an estimated cost of 4 for VF 4 For instruction: %tmp2 = load i32, ptr %tmp0, align 4
 ; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: %tmp3 = load i32, ptr %tmp1, align 4
@@ -140,23 +140,23 @@ entry:
 ; VF_2-LABEL:  Checking a loop in 'i64_factor_2'
 ; VF_2:          Found an estimated cost of 22 for VF 2 For instruction: %tmp2 = load i64, ptr %tmp0, align 8
 ; VF_2-NEXT:     Found an estimated cost of 22 for VF 2 For instruction: %tmp3 = load i64, ptr %tmp1, align 8
-; VF_2-NEXT:     Found an estimated cost of 22 for VF 2 For instruction: store i64 %tmp2, ptr %tmp0, align 8
-; VF_2-NEXT:     Found an estimated cost of 22 for VF 2 For instruction: store i64 %tmp3, ptr %tmp1, align 8
+; VF_2-NEXT:     Found an estimated cost of 6 for VF 2 For instruction: store i64 %tmp2, ptr %tmp0, align 8
+; VF_2-NEXT:     Found an estimated cost of 6 for VF 2 For instruction: store i64 %tmp3, ptr %tmp1, align 8
 ; VF_4-LABEL:  Checking a loop in 'i64_factor_2'
 ; VF_4:          Found an estimated cost of 44 for VF 4 For instruction: %tmp2 = load i64, ptr %tmp0, align 8
 ; VF_4-NEXT:     Found an estimated cost of 44 for VF 4 For instruction: %tmp3 = load i64, ptr %tmp1, align 8
-; VF_4-NEXT:     Found an estimated cost of 44 for VF 4 For instruction: store i64 %tmp2, ptr %tmp0, align 8
-; VF_4-NEXT:     Found an estimated cost of 44 for VF 4 For instruction: store i64 %tmp3, ptr %tmp1, align 8
+; VF_4-NEXT:     Found an estimated cost of 12 for VF 4 For instruction: store i64 %tmp2, ptr %tmp0, align 8
+; VF_4-NEXT:     Found an estimated cost of 12 for VF 4 For instruction: store i64 %tmp3, ptr %tmp1, align 8
 ; VF_8-LABEL:  Checking a loop in 'i64_factor_2'
 ; VF_8:          Found an estimated cost of 88 for VF 8 For instruction: %tmp2 = load i64, ptr %tmp0, align 8
 ; VF_8-NEXT:     Found an estimated cost of 88 for VF 8 For instruction: %tmp3 = load i64, ptr %tmp1, align 8
-; VF_8-NEXT:     Found an estimated cost of 88 for VF 8 For instruction: store i64 %tmp2, ptr %tmp0, align 8
-; VF_8-NEXT:     Found an estimated cost of 88 for VF 8 For instruction: store i64 %tmp3, ptr %tmp1, align 8
+; VF_8-NEXT:     Found an estimated cost of 24 for VF 8 For instruction: store i64 %tmp2, ptr %tmp0, align 8
+; VF_8-NEXT:     Found an estimated cost of 24 for VF 8 For instruction: store i64 %tmp3, ptr %tmp1, align 8
 ; VF_16-LABEL: Checking a loop in 'i64_factor_2'
 ; VF_16:         Found an estimated cost of 176 for VF 16 For instruction: %tmp2 = load i64, ptr %tmp0, align 8
 ; VF_16-NEXT:    Found an estimated cost of 176 for VF 16 For instruction: %tmp3 = load i64, ptr %tmp1, align 8
-; VF_16-NEXT:    Found an estimated cost of 176 for VF 16 For instruction: store i64 %tmp2, ptr %tmp0, align 8
-; VF_16-NEXT:    Found an estimated cost of 176 for VF 16 For instruction: store i64 %tmp3, ptr %tmp1, align 8
+; VF_16-NEXT:    Found an estimated cost of 48 for VF 16 For instruction: store i64 %tmp2, ptr %tmp0, align 8
+; VF_16-NEXT:    Found an estimated cost of 48 for VF 16 For instruction: store i64 %tmp3, ptr %tmp1, align 8
 for.body:
   %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
   %tmp0 = getelementptr inbounds %i64.2, ptr %data, i64 %i, i32 0
@@ -181,8 +181,8 @@ entry:
 ; VF_2-LABEL:  Checking a loop in 'f16_factor_2'
 ; VF_2:          Found an estimated cost of 6 for VF 2 For instruction: %tmp2 = load half, ptr %tmp0, align 2
 ; VF_2-NEXT:     Found an estimated cost of 6 for VF 2 For instruction: %tmp3 = load half, ptr %tmp1, align 2
-; VF_2-NEXT:     Found an estimated cost of 6 for VF 2 For instruction: store half %tmp2, ptr %tmp0, align 2
-; VF_2-NEXT:     Found an estimated cost of 6 for VF 2 For instruction: store half %tmp3, ptr %tmp1, align 2
+; VF_2-NEXT:     Found an estimated cost of 4 for VF 2 For instruction: store half %tmp2, ptr %tmp0, align 2
+; VF_2-NEXT:     Found an estimated cost of 4 for VF 2 For instruction: store half %tmp3, ptr %tmp1, align 2
 ; VF_4-LABEL:  Checking a loop in 'f16_factor_2'
 ; VF_4:          Found an estimated cost of 18 for VF 4 For instruction: %tmp2 = load half, ptr %tmp0, align 2
 ; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: %tmp3 = load half, ptr %tmp1, align 2
@@ -263,23 +263,23 @@ entry:
 ; VF_2-LABEL:  Checking a loop in 'f64_factor_2'
 ; VF_2:          Found an estimated cost of 6 for VF 2 For instruction: %tmp2 = load double, ptr %tmp0, align 8
 ; VF_2-NEXT:     Found an estimated cost of 6 for VF 2 For instruction: %tmp3 = load double, ptr %tmp1, align 8
-; VF_2-NEXT:     Found an estimated cost of 6 for VF 2 For instruction: store double %tmp2, ptr %tmp0, align 8
-; VF_2-NEXT:     Found an estimated cost of 6 for VF 2 For instruction: store double %tmp3, ptr %tmp1, align 8
+; VF_2-NEXT:     Found an estimated cost of 4 for VF 2 For instruction: store double %tmp2, ptr %tmp0, align 8
+; VF_2-NEXT:     Found an estimated cost of 4 for VF 2 For instruction: store double %tmp3, ptr %tmp1, align 8
 ; VF_4-LABEL:  Checking a loop in 'f64_factor_2'
 ; VF_4:          Found an estimated cost of 12 for VF 4 For instruction: %tmp2 = load double, ptr %tmp0, align 8
 ; VF_4-NEXT:     Found an estimated cost of 12 for VF 4 For instruction: %tmp3 = load double, ptr %tmp1, align 8
-; VF_4-NEXT:     Found an estimated cost of 12 for VF 4 For instruction: store double %tmp2, ptr %tmp0, align 8
-; VF_4-NEXT:     Found an estimated cost of 12 for VF 4 For instruction: store double %tmp3, ptr %tmp1, align 8
+; VF_4-NEXT:     Found an estimated cost of 8 for VF 4 For instruction: store double %tmp2, ptr %tmp0, align 8
+; VF_4-NEXT:     Found an estimated cost of 8 for VF 4 For instruction: store double %tmp3, ptr %tmp1, align 8
 ; VF_8-LABEL:  Checking a loop in 'f64_factor_2'
 ; VF_8:          Found an estimated cost of 24 for VF 8 For instruction: %tmp2 = load double, ptr %tmp0, align 8
 ; VF_8-NEXT:     Found an estimated cost of 24 for VF 8 For instruction: %tmp3 = load double, ptr %tmp1, align 8
-; VF_8-NEXT:     Found an estimated cost of 24 for VF 8 For instruction: store double %tmp2, ptr %tmp0, align 8
-; VF_8-NEXT:     Found an estimated cost of 24 for VF 8 For instruction: store double %tmp3, ptr %tmp1, align 8
+; VF_8-NEXT:     Found an estimated cost of 16 for VF 8 For instruction: store double %tmp2, ptr %tmp0, align 8
+; VF_8-NEXT:     Found an estimated cost of 16 for VF 8 For instruction: store double %tmp3, ptr %tmp1, align 8
 ; VF_16-LABEL: Checking a loop in 'f64_factor_2'
 ; VF_16:         Found an estimated cost of 48 for VF 16 For instruction: %tmp2 = load double, ptr %tmp0, align 8
 ; VF_16-NEXT:    Found an estimated cost of 48 for VF 16 For instruction: %tmp3 = load double, ptr %tmp1, align 8
-; VF_16-NEXT:    Found an estimated cost of 48 for VF 16 For instruction: store double %tmp2, ptr %tmp0, align 8
-; VF_16-NEXT:    Found an estimated cost of 48 for VF 16 For instruction: store double %tmp3, ptr %tmp1, align 8
+; VF_16-NEXT:    Found an estimated cost of 32 for VF 16 For instruction: store double %tmp2, ptr %tmp0, align 8
+; VF_16-NEXT:    Found an estimated cost of 32 for VF 16 For instruction: store double %tmp3, ptr %tmp1, align 8
 for.body:
   %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
   %tmp0 = getelementptr inbounds %f64.2, ptr %data, i64 %i, i32 0
@@ -309,30 +309,30 @@ entry:
 ; VF_2:          Found an estimated cost of 12 for VF 2 For instruction: %tmp3 = load i8, ptr %tmp0, align 1
 ; VF_2-NEXT:     Found an estimated cost of 12 for VF 2 For instruction: %tmp4 = load i8, ptr %tmp1, align 1
 ; VF_2-NEXT:     Found an estimated cost of 12 for VF 2 For instruction: %tmp5 = load i8, ptr %tmp2, align 1
-; VF_2-NEXT:     Found an estimated cost of 12 for VF 2 For instruction: store i8 %tmp3, ptr %tmp0, align 1
-; VF_2-NEXT:     Found an estimated cost of 12 for VF 2 For instruction: store i8 %tmp4, ptr %tmp1, align 1
-; VF_2-NEXT:     Found an estimated cost of 12 for VF 2 For instruction: store i8 %tmp5, ptr %tmp2, align 1
+; VF_2-NEXT:     Found an estimated cost of 4 for VF 2 For instruction: store i8 %tmp3, ptr %tmp0, align 1
+; VF_2-NEXT:     Found an estimated cost of 4 for VF 2 For instruction: store i8 %tmp4, ptr %tmp1, align 1
+; VF_2-NEXT:     Found an estimated cost of 4 for VF 2 For instruction: store i8 %tmp5, ptr %tmp2, align 1
 ; VF_4-LABEL:  Checking a loop in 'i8_factor_3'
 ; VF_4:          Found an estimated cost of 24 for VF 4 For instruction: %tmp3 = load i8, ptr %tmp0, align 1
 ; VF_4-NEXT:     Found an estimated cost of 24 for VF 4 For instruction: %tmp4 = load i8, ptr %tmp1, align 1
 ; VF_4-NEXT:     Found an estimated cost of 24 for VF 4 For instruction: %tmp5 = load i8, ptr %tmp2, align 1
-; VF_4-NEXT:     Found an estimated cost of 24 for VF 4 For instruction: store i8 %tmp3,  ptr %tmp0, align 1
-; VF_4-NEXT:     Found an estimated cost of 24 for VF 4 For instruction: store i8 %tmp4, ptr %tmp1, align 1
-; VF_4-NEXT:     Found an estimated cost of 24 for VF 4 For instruction: store i8 %tmp5, ptr %tmp2, align 1
+; VF_4-NEXT:     Found an estimated cost of 8 for VF 4 For instruction: store i8 %tmp3,  ptr %tmp0, align 1
+; VF_4-NEXT:     Found an estimated cost of 8 for VF 4 For instruction: store i8 %tmp4, ptr %tmp1, align 1
+; VF_4-NEXT:     Found an estimated cost of 8 for VF 4 For instruction: store i8 %tmp5, ptr %tmp2, align 1
 ; VF_8-LABEL:  Checking a loop in 'i8_factor_3'
 ; VF_8:          Found an estimated cost of 48 for VF 8 For instruction: %tmp3 = load i8, ptr %tmp0, align 1
 ; VF_8-NEXT:     Found an estimated cost of 48 for VF 8 For instruction: %tmp4 = load i8, ptr %tmp1, align 1
 ; VF_8-NEXT:     Found an estimated cost of 48 for VF 8 For instruction: %tmp5 = load i8, ptr %tmp2, align 1
-; VF_8-NEXT:     Found an estimated cost of 48 for VF 8 For instruction: store i8 %tmp3, ptr %tmp0, align 1
-; VF_8-NEXT:     Found an estimated cost of 48 for VF 8 For instruction: store i8 %tmp4, ptr %tmp1, align 1
-; VF_8-NEXT:     Found an estimated cost of 48 for VF 8 For instruction: store i8 %tmp5, ptr %tmp2, align 1
+; VF_8-NEXT:     Found an estimated cost of 16 for VF 8 For instruction: store i8 %tmp3, ptr %tmp0, align 1
+; VF_8-NEXT:     Found an estimated cost of 16 for VF 8 For instruction: store i8 %tmp4, ptr %tmp1, align 1
+; VF_8-NEXT:     Found an estimated cost of 16 for VF 8 For instruction: store i8 %tmp5, ptr %tmp2, align 1
 ; VF_16-LABEL: Checking a loop in 'i8_factor_3'
 ; VF_16:         Found an estimated cost of 96 for VF 16 For instruction: %tmp3 = load i8, ptr %tmp0, align 1
 ; VF_16-NEXT:    Found an estimated cost of 96 for VF 16 For instruction: %tmp4 = load i8, ptr %tmp1, align 1
 ; VF_16-NEXT:    Found an estimated cost of 96 for VF 16 For instruction: %tmp5 = load i8, ptr %tmp2, align 1
-; VF_16-NEXT:    Found an estimated cost of 96 for VF 16 For instruction: store i8 %tmp3, ptr %tmp0, align 1
-; VF_16-NEXT:    Found an estimated cost of 96 for VF 16 For instruction: store i8 %tmp4, ptr %tmp1, align 1
-; VF_16-NEXT:    Found an estimated cost of 96 for VF 16 For instruction: store i8 %tmp5, ptr %tmp2, align 1
+; VF_16-NEXT:    Found an estimated cost of 32 for VF 16 For instruction: store i8 %tmp3, ptr %tmp0, align 1
+; VF_16-NEXT:    Found an estimated cost of 32 for VF 16 For instruction: store i8 %tmp4, ptr %tmp1, align 1
+; VF_16-NEXT:    Found an estimated cost of 32 for VF 16 For instruction: store i8 %tmp5, ptr %tmp2, align 1
 for.body:
   %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
   %tmp0 = getelementptr inbounds %i8.3, ptr %data, i64 %i, i32 0
@@ -361,30 +361,30 @@ entry:
 ; VF_2:          Found an estimated cost of 12 for VF 2 For instruction: %tmp3 = load i16, ptr %tmp0, align 2
 ; VF_2-NEXT:     Found an estimated cost of 12 for VF 2 For instruction: %tmp4 = load i16, ptr %tmp1, align 2
 ; VF_2-NEXT:     Found an estimated cost of 12 for VF 2 For instruction: %tmp5 = load i16, ptr %tmp2, align 2
-; VF_2-NEXT:     Found an estimated cost of 12 for VF 2 For instruction: store i16 %tmp3, ptr %tmp0, align 2
-; VF_2-NEXT:     Found an estimated cost of 12 for VF 2 For instruction: store i16 %tmp4, ptr %tmp1, align 2
-; VF_2-NEXT:     Found an estimated cost of 12 for VF 2 For instruction: store i16 %tmp5, ptr %tmp2, align 2
+; VF_2-NEXT:     Found an estimated cost of 4 for VF 2 For instruction: store i16 %tmp3, ptr %tmp0, align 2
+; VF_2-NEXT:     Found an estimated cost of 4 for VF 2 For instruction: store i16 %tmp4, ptr %tmp1, align 2
+; VF_2-NEXT:     Found an estimated cost of 4 for VF 2 For instruction: store i16 %tmp5, ptr %tmp2, align 2
 ; VF_4-LABEL:  Checking a loop in 'i16_factor_3'
 ; VF_4:          Found an estimated cost of 24 for VF 4 For instruction: %tmp3 = load i16, ptr %tmp0, align 2
 ; VF_4-NEXT:     Found an estimated cost of 24 for VF 4 For instruction: %tmp4 = load i16, ptr %tmp1, align 2
 ; VF_4-NEXT:     Found an estimated cost of 24 for VF 4 For instruction: %tmp5 = load i16, ptr %tmp2, align 2
-; VF_4-NEXT:     Found an estimated cost of 24 for VF 4 For instruction: store i16 %tmp3, ptr %tmp0, align 2
-; VF_4-NEXT:     Found an estimated cost of 24 for VF 4 For instruction: store i16 %tmp4, ptr %tmp1, align 2
-; VF_4-NEXT:     Found an estimated cost of 24 for VF 4 For instruction: store i16 %tmp5, ptr %tmp2, align 2
+; VF_4-NEXT:     Found an estimated cost of 8 for VF 4 For instruction: store i16 %tmp3, ptr %tmp0, align 2
+; VF_4-NEXT:     Found an estimated cost of 8 for VF 4 For instruction: store i16 %tmp4, ptr %tmp1, align 2
+; VF_4-NEXT:     Found an estimated cost of 8 for VF 4 For instruction: store i16 %tmp5, ptr %tmp2, align 2
 ; VF_8-LABEL:  Checking a loop in 'i16_factor_3'
 ; VF_8:          Found an estimated cost of 48 for VF 8 For instruction: %tmp3 = load i16, ptr %tmp0, align 2
 ; VF_8-NEXT:     Found an estimated cost of 48 for VF 8 For instruction: %tmp4 = load i16, ptr %tmp1, align 2
 ; VF_8-NEXT:     Found an estimated cost of 48 for VF 8 For instruction: %tmp5 = load i16, ptr %tmp2, align 2
-; VF_8-NEXT:     Found an estimated cost of 48 for VF 8 For instruction: store i16 %tmp3, ptr %tmp0, align 2
-; VF_8-NEXT:     Found an estimated cost of 48 for VF 8 For instruction: store i16 %tmp4, ptr %tmp1, align 2
-; VF_8-NEXT:     Found an estimated cost of 48 for VF 8 For instruction: store i16 %tmp5, ptr %tmp2, align 2
+; VF_8-NEXT:     Found an estimated cost of 16 for VF 8 For instruction: store i16 %tmp3, ptr %tmp0, align 2
+; VF_8-NEXT:     Found an estimated cost of 16 for VF 8 For instruction: store i16 %tmp4, ptr %tmp1, align 2
+; VF_8-NEXT:     Found an estimated cost of 16 for VF 8 For instruction: store i16 %tmp5, ptr %tmp2, align 2
 ; VF_16-LABEL: Checking a loop in 'i16_factor_3'
 ; VF_16:         Found an estimated cost of 96 for VF 16 For instruction: %tmp3 = load i16, ptr %tmp0, align 2
 ; VF_16-NEXT:    Found an estimated cost of 96 for VF 16 For instruction: %tmp4 = load i16, ptr %tmp1, align 2
 ; VF_16-NEXT:    Found an estimated cost of 96 for VF 16 For instruction: %tmp5 = load i16, ptr %tmp2, align 2
-; VF_16-NEXT:    Found an estimated cost of 96 for VF 16 For instruction: store i16 %tmp3, ptr %tmp0, align 2
-; VF_16-NEXT:    Found an estimated cost of 96 for VF 16 For instruction: store i16 %tmp4, ptr %tmp1, align 2
-; VF_16-NEXT:    Found an estimated cost of 96 for VF 16 For instruction: store i16 %tmp5, ptr %tmp2, align 2
+; VF_16-NEXT:    Found an estimated cost of 32 for VF 16 For instruction: store i16 %tmp3, ptr %tmp0, align 2
+; VF_16-NEXT:    Found an estimated cost of 32 for VF 16 For instruction: store i16 %tmp4, ptr %tmp1, align 2
+; VF_16-NEXT:    Found an estimated cost of 32 for VF 16 For instruction: store i16 %tmp5, ptr %tmp2, align 2
 for.body:
   %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
   %tmp0 = getelementptr inbounds %i16.3, ptr %data, i64 %i, i32 0
@@ -413,9 +413,9 @@ entry:
 ; VF_2:          Found an estimated cost of 12 for VF 2 For instruction: %tmp3 = load i32, ptr %tmp0, align 4
 ; VF_2-NEXT:     Found an estimated cost of 12 for VF 2 For instruction: %tmp4 = load i32, ptr %tmp1, align 4
 ; VF_2-NEXT:     Found an estimated cost of 12 for VF 2 For instruction: %tmp5 = load i32, ptr %tmp2, align 4
-; VF_2-NEXT:     Found an estimated cost of 12 for VF 2 For instruction: store i32 %tmp3, ptr %tmp0, align 4
-; VF_2-NEXT:     Found an estimated cost of 12 for VF 2 For instruction: store i32 %tmp4, ptr %tmp1, align 4
-; VF_2-NEXT:     Found an estimated cost of 12 for VF 2 For instruction: store i32 %tmp5, ptr %tmp2, align 4
+; VF_2-NEXT:     Found an estimated cost of 4 for VF 2 For instruction: store i32 %tmp3, ptr %tmp0, align 4
+; VF_2-NEXT:     Found an estimated cost of 4 for VF 2 For instruction: store i32 %tmp4, ptr %tmp1, align 4
+; VF_2-NEXT:     Found an estimated cost of 4 for VF 2 For instruction: store i32 %tmp5, ptr %tmp2, align 4
 ; VF_4-LABEL:  Checking a loop in 'i32_factor_3'
 ; VF_4:          Found an estimated cost of 8 for VF 4 For instruction: %tmp3 = load i32, ptr %tmp0, align 4
 ; VF_4-NEXT:     Found an estimated cost of 8 for VF 4 For instruction: %tmp4 = load i32, ptr %tmp1, align 4
@@ -427,16 +427,16 @@ entry:
 ; VF_8:          Found an estimated cost of 48 for VF 8 For instruction: %tmp3 = load i32, ptr %tmp0, align 4
 ; VF_8-NEXT:     Found an estimated cost of 48 for VF 8 For instruction: %tmp4 = load i32, ptr %tmp1, align 4
 ; VF_8-NEXT:     Found an estimated cost of 48 for VF 8 For instruction: %tmp5 = load i32, ptr %tmp2, align 4
-; VF_8-NEXT:     Found an estimated cost of 48 for VF 8 For instruction: store i32 %tmp3, ptr %tmp0, align 4
-; VF_8-NEXT:     Found an estimated cost of 48 for VF 8 For instruction: store i32 %tmp4, ptr %tmp1, align 4
-; VF_8-NEXT:     Found an estimated cost of 48 for VF 8 For instruction: store i32 %tmp5, ptr %tmp2, align 4
+; VF_8-NEXT:     Found an estimated cost of 16 for VF 8 For instruction: store i32 %tmp3, ptr %tmp0, align 4
+; VF_8-NEXT:     Found an estimated cost of 16 for VF 8 For instruction: store i32 %tmp4, ptr %tmp1, align 4
+; VF_8-NEXT:     Found an estimated cost of 16 for VF 8 For instruction: store i32 %tmp5, ptr %tmp2, align 4
 ; VF_16-LABEL: Checking a loop in 'i32_factor_3'
 ; VF_16:         Found an estimated cost of 96 for VF 16 For instruction: %tmp3 = load i32, ptr %tmp0, align 4
 ; VF_16-NEXT:    Found an estimated cost of 96 for VF 16 For instruction: %tmp4 = load i32, ptr %tmp1, align 4
 ; VF_16-NEXT:    Found an estimated cost of 96 for VF 16 For instruction: %tmp5 = load i32, ptr %tmp2, align 4
-; VF_16-NEXT:    Found an estimated cost of 96 for VF 16 For instruction: store i32 %tmp3, ptr %tmp0, align 4
-; VF_16-NEXT:    Found an estimated cost of 96 for VF 16 For instruction: store i32 %tmp4, ptr %tmp1, align 4
-; VF_16-NEXT:    Found an estimated cost of 96 for VF 16 For instruction: store i32 %tmp5, ptr %tmp2, align 4
+; VF_16-NEXT:    Found an estimated cost of 32 for VF 16 For instruction: store i32 %tmp3, ptr %tmp0, align 4
+; VF_16-NEXT:    Found an estimated cost of 32 for VF 16 For instruction: store i32 %tmp4, ptr %tmp1, align 4
+; VF_16-NEXT:    Found an estimated cost of 32 for VF 16 For instruction: store i32 %tmp5, ptr %tmp2, align 4
 for.body:
   %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
   %tmp0 = getelementptr inbounds %i32.3, ptr %data, i64 %i, i32 0
@@ -465,30 +465,30 @@ entry:
 ; VF_2:          Found an estimated cost of 22 for VF 2 For instruction: %tmp3 = load i64, ptr %tmp0, align 8
 ; VF_2-NEXT:     Found an estimated cost of 22 for VF 2 For instruction: %tmp4 = load i64, ptr %tmp1, align 8
 ; VF_2-NEXT:     Found an estimated cost of 22 for VF 2 For instruction: %tmp5 = load i64, ptr %tmp2, align 8
-; VF_2-NEXT:     Found an estimated cost of 22 for VF 2 For instruction: store i64 %tmp3, ptr %tmp0, align 8
-; VF_2-NEXT:     Found an estimated cost of 22 for VF 2 For instruction: store i64 %tmp4, ptr %tmp1, align 8
-; VF_2-NEXT:     Found an estimated cost of 22 for VF 2 For instruction: store i64 %tmp5, ptr %tmp2, align 8
+; VF_2-NEXT:     Found an estimated cost of 6 for VF 2 For instruction: store i64 %tmp3, ptr %tmp0, align 8
+; VF_2-NEXT:     Found an estimated cost of 6 for VF 2 For instruction: store i64 %tmp4, ptr %tmp1, align 8
+; VF_2-NEXT:     Found an estimated cost of 6 for VF 2 For instruction: store i64 %tmp5, ptr %tmp2, align 8
 ; VF_4-LABEL:  Checking a loop in 'i64_factor_3'
 ; VF_4:          Found an estimated cost of 44 for VF 4 For instruction: %tmp3 = load i64, ptr %tmp0, align 8
 ; VF_4-NEXT:     Found an estimated cost of 44 for VF 4 For instruction: %tmp4 = load i64, ptr %tmp1, align 8
 ; VF_4-NEXT:     Found an estimated cost of 44 for VF 4 For instruction: %tmp5 = load i64, ptr %tmp2, align 8
-; VF_4-NEXT:     Found an estimated cost of 44 for VF 4 For instruction: store i64 %tmp3, ptr %tmp0, align 8
-; VF_4-NEXT:     Found an estimated cost of 44 for VF 4 For instruction: store i64 %tmp4, ptr %tmp1, align 8
-; VF_4-NEXT:     Found an estimated cost of 44 for VF 4 For instruction: store i64 %tmp5, ptr %tmp2, align 8
+; VF_4-NEXT:     Found an estimated cost of 12 for VF 4 For instruction: store i64 %tmp3, ptr %tmp0, align 8
+; VF_4-NEXT:     Found an estimated cost of 12 for VF 4 For instruction: store i64 %tmp4, ptr %tmp1, align 8
+; VF_4-NEXT:     Found an estimated cost of 12 for VF 4 For instruction: store i64 %tmp5, ptr %tmp2, align 8
 ; VF_8-LABEL:  Checking a loop in 'i64_factor_3'
 ; VF_8:          Found an estimated cost of 88 for VF 8 For instruction: %tmp3 = load i64, ptr %tmp0, align 8
 ; VF_8-NEXT:     Found an estimated cost of 88 for VF 8 For instruction: %tmp4 = load i64, ptr %tmp1, align 8
 ; VF_8-NEXT:     Found an estimated cost of 88 for VF 8 For instruction: %tmp5 = load i64, ptr %tmp2, align 8
-; VF_8-NEXT:     Found an estimated cost of 88 for VF 8 For instruction: store i64 %tmp3, ptr %tmp0, align 8
-; VF_8-NEXT:     Found an estimated cost of 88 for VF 8 For instruction: store i64 %tmp4, ptr %tmp1, align 8
-; VF_8-NEXT:     Found an estimated cost of 88 for VF 8 For instruction: store i64 %tmp5, ptr %tmp2, align 8
+; VF_8-NEXT:     Found an estimated cost of 24 for VF 8 For instruction: store i64 %tmp3, ptr %tmp0, align 8
+; VF_8-NEXT:     Found an estimated cost of 24 for VF 8 For instruction: store i64 %tmp4, ptr %tmp1, align 8
+; VF_8-NEXT:     Found an estimated cost of 24 for VF 8 For instruction: store i64 %tmp5, ptr %tmp2, align 8
 ; VF_16-LABEL: Checking a loop in 'i64_factor_3'
 ; VF_16:         Found an estimated cost of 176 for VF 16 For instruction: %tmp3 = load i64, ptr %tmp0, align 8
 ; VF_16-NEXT:    Found an estimated cost of 176 for VF 16 For instruction: %tmp4 = load i64, ptr %tmp1, align 8
 ; VF_16-NEXT:    Found an estimated cost of 176 for VF 16 For instruction: %tmp5 = load i64, ptr %tmp2, align 8
-; VF_16-NEXT:    Found an estimated cost of 176 for VF 16 For instruction: store i64 %tmp3, ptr %tmp0, align 8
-; VF_16-NEXT:    Found an estimated cost of 176 for VF 16 For instruction: store i64 %tmp4, ptr %tmp1, align 8
-; VF_16-NEXT:    Found an estimated cost of 176 for VF 16 For instruction: store i64 %tmp5, ptr %tmp2, align 8
+; VF_16-NEXT:    Found an estimated cost of 48 for VF 16 For instruction: store i64 %tmp3, ptr %tmp0, align 8
+; VF_16-NEXT:    Found an estimated cost of 48 for VF 16 For instruction: store i64 %tmp4, ptr %tmp1, align 8
+; VF_16-NEXT:    Found an estimated cost of 48 for VF 16 For instruction: store i64 %tmp5, ptr %tmp2, align 8
 for.body:
   %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
   %tmp0 = getelementptr inbounds %i64.3, ptr %data, i64 %i, i32 0
@@ -517,9 +517,9 @@ entry:
 ; VF_2:          Found an estimated cost of 6 for VF 2 For instruction: %tmp3 = load half, ptr %tmp0, align 2
 ; VF_2-NEXT:     Found an estimated cost of 6 for VF 2 For instruction: %tmp4 = load half, ptr %tmp1, align 2
 ; VF_2-NEXT:     Found an estimated cost of 6 for VF 2 For instruction: %tmp5 = load half, ptr %tmp2, align 2
-; VF_2-NEXT:     Found an estimated cost of 6 for VF 2 For instruction: store half %tmp3, ptr %tmp0, align 2
-; VF_2-NEXT:     Found an estimated cost of 6 for VF 2 For instruction: store half %tmp4, ptr %tmp1, align 2
-; VF_2-NEXT:     Found an estimated cost of 6 for VF 2 For instruction: store half %tmp5, ptr %tmp2, align 2
+; VF_2-NEXT:     Found an estimated cost of 4 for VF 2 For instruction: store half %tmp3, ptr %tmp0, align 2
+; VF_2-NEXT:     Found an estimated cost of 4 for VF 2 For instruction: store half %tmp4, ptr %tmp1, align 2
+; VF_2-NEXT:     Found an estimated cost of 4 for VF 2 For instruction: store half %tmp5, ptr %tmp2, align 2
 ; VF_4-LABEL:  Checking a loop in 'f16_factor_3'
 ; VF_4:          Found an estimated cost of 28 for VF 4 For instruction: %tmp3 = load half, ptr %tmp0, align 2
 ; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: %tmp4 = load half, ptr %tmp1, align 2
@@ -621,30 +621,30 @@ entry:
 ; VF_2:          Found an estimated cost of 6 for VF 2 For instruction: %tmp3 = load double, ptr %tmp0, align 8
 ; VF_2-NEXT:     Found an estimated cost of 6 for VF 2 For instruction: %tmp4 = load double, ptr %tmp1, align 8
 ; VF_2-NEXT:     Found an estimated cost of 6 for VF 2 For instruction: %tmp5 = load double, ptr %tmp2, align 8
-; VF_2-NEXT:     Found an estimated cost of 6 for VF 2 For instruction: store double %tmp3, ptr %tmp0, align 8
-; VF_2-NEXT:     Found an estimated cost of 6 for VF 2 For instruction: store double %tmp4, ptr %tmp1, align 8
-; VF_2-NEXT:     Found an estimated cost of 6 for VF 2 For instruction: store double %tmp5, ptr %tmp2, align 8
+; VF_2-NEXT:     Found an estimated cost of 4 for VF 2 For instruction: store double %tmp3, ptr %tmp0, align 8
+; VF_2-NEXT:     Found an estimated cost of 4 for VF 2 For instruction: store double %tmp4, ptr %tmp1, align 8
+; VF_2-NEXT:     Found an estimated cost of 4 for VF 2 For instruction: store double %tmp5, ptr %tmp2, align 8
 ; VF_4-LABEL:  Checking a loop in 'f64_factor_3'
 ; VF_4:          Found an estimated cost of 12 for VF 4 For instruction: %tmp3 = load double, ptr %tmp0, align 8
 ; VF_4-NEXT:     Found an estimated cost of 12 for VF 4 For instruction: %tmp4 = load double, ptr %tmp1, align 8
 ; VF_4-NEXT:     Found an estimated cost of 12 for VF 4 For instruction: %tmp5 = load double, ptr %tmp2, align 8
-; VF_4-NEXT:     Found an estimated cost of 12 for VF 4 For instruction: store double %tmp3, ptr %tmp0, align 8
-; VF_4-NEXT:     Found an estimated cost of 12 for VF 4 For instruction: store double %tmp4, ptr %tmp1, align 8
-; VF_4-NEXT:     Found an estimated cost of 12 for VF 4 For instruction: store double %tmp5, ptr %tmp2, align 8
+; VF_4-NEXT:     Found an estimated cost of 8 for VF 4 For instruction: store double %tmp3, ptr %tmp0, align 8
+; VF_4-NEXT:     Found an estimated cost of 8 for VF 4 For instruction: store double %tmp4, ptr %tmp1, align 8
+; VF_4-NEXT:     Found an estimated cost of 8 for VF 4 For instruction: store double %tmp5, ptr %tmp2, align 8
 ; VF_8-LABEL:  Checking a loop in 'f64_factor_3'
 ; VF_8:          Found an estimated cost of 24 for VF 8 For instruction: %tmp3 = load double, ptr %tmp0, align 8
 ; VF_8-NEXT:     Found an estimated cost of 24 for VF 8 For instruction: %tmp4 = load double, ptr %tmp1, align 8
 ; VF_8-NEXT:     Found an estimated cost of 24 for VF 8 For instruction: %tmp5 = load double, ptr %tmp2, align 8
-; VF_8-NEXT:     Found an estimated cost of 24 for VF 8 For instruction: store double %tmp3, ptr %tmp0, align 8
-; VF_8-NEXT:     Found an estimated cost of 24 for VF 8 For instruction: store double %tmp4, ptr %tmp1, align 8
-; VF_8-NEXT:     Found an estimated cost of 24 for VF 8 For instruction: store double %tmp5, ptr %tmp2, align 8
+; VF_8-NEXT:     Found an estimated cost of 16 for VF 8 For instruction: store double %tmp3, ptr %tmp0, align 8
+; VF_8-NEXT:     Found an estimated cost of 16 for VF 8 For instruction: store double %tmp4, ptr %tmp1, align 8
+; VF_8-NEXT:     Found an estimated cost of 16 for VF 8 For instruction: store double %tmp5, ptr %tmp2, align 8
 ; VF_16-LABEL: Checking a loop in 'f64_factor_3'
 ; VF_16:         Found an estimated cost of 48 for VF 16 For instruction: %tmp3 = load double, ptr %tmp0, align 8
 ; VF_16-NEXT:    Found an estimated cost of 48 for VF 16 For instruction: %tmp4 = load double, ptr %tmp1, align 8
 ; VF_16-NEXT:    Found an estimated cost of 48 for VF 16 For instruction: %tmp5 = load double, ptr %tmp2, align 8
-; VF_16-NEXT:    Found an estimated cost of 48 for VF 16 For instruction: store double %tmp3, ptr %tmp0, align 8
-; VF_16-NEXT:    Found an estimated cost of 48 for VF 16 For instruction: store double %tmp4, ptr %tmp1, align 8
-; VF_16-NEXT:    Found an estimated cost of 48 for VF 16 For instruction: store double %tmp5, ptr %tmp2, align 8
+; VF_16-NEXT:    Found an estimated cost of 32 for VF 16 For instruction: store double %tmp3, ptr %tmp0, align 8
+; VF_16-NEXT:    Found an estimated cost of 32 for VF 16 For instruction: store double %tmp4, ptr %tmp1, align 8
+; VF_16-NEXT:    Found an estimated cost of 32 for VF 16 For instruction: store double %tmp5, ptr %tmp2, align 8
 for.body:
   %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
   %tmp0 = getelementptr inbounds %f64.3, ptr %data, i64 %i, i32 0
@@ -677,37 +677,37 @@ entry:
 ; VF_2-NEXT:     Found an estimated cost of 12 for VF 2 For instruction: %tmp5 = load i8, ptr %tmp1, align 1
 ; VF_2-NEXT:     Found an estimated cost of 12 for VF 2 For instruction: %tmp6 = load i8, ptr %tmp2, align 1
 ; VF_2-NEXT:     Found an estimated cost of 12 for VF 2 For instruction: %tmp7 = load i8, ptr %tmp3, align 1
-; VF_2-NEXT:     Found an estimated cost of 12 for VF 2 For instruction: store i8 %tmp4, ptr %tmp0, align 1
-; VF_2-NEXT:     Found an estimated cost of 12 for VF 2 For instruction: store i8 %tmp5, ptr %tmp1, align 1
-; VF_2-NEXT:     Found an estimated cost of 12 for VF 2 For instruction: store i8 %tmp6, ptr %tmp2, align 1
-; VF_2-NEXT:     Found an estimated cost of 12 for VF 2 For instruction: store i8 %tmp7, ptr %tmp3, align 1
+; VF_2-NEXT:     Found an estimated cost of 4 for VF 2 For instruction: store i8 %tmp4, ptr %tmp0, align 1
+; VF_2-NEXT:     Found an estimated cost of 4 for VF 2 For instruction: store i8 %tmp5, ptr %tmp1, align 1
+; VF_2-NEXT:     Found an estimated cost of 4 for VF 2 For instruction: store i8 %tmp6, ptr %tmp2, align 1
+; VF_2-NEXT:     Found an estimated cost of 4 for VF 2 For instruction: store i8 %tmp7, ptr %tmp3, align 1
 ; VF_4-LABEL: Checking a loop in 'i8_factor_4'
 ; VF_4:         Found an estimated cost of 24 for VF 4 For instruction: %tmp4 = load i8, ptr %tmp0, align 1
 ; VF_4-NEXT:    Found an estimated cost of 24 for VF 4 For instruction: %tmp5 = load i8, ptr %tmp1, align 1
 ; VF_4-NEXT:    Found an estimated cost of 24 for VF 4 For instruction: %tmp6 = load i8, ptr %tmp2, align 1
 ; VF_4-NEXT:    Found an estimated cost of 24 for VF 4 For instruction: %tmp7 = load i8, ptr %tmp3, align 1
-; VF_4-NEXT:    Found an estimated cost of 24 for VF 4 For instruction: store i8 %tmp4, ptr %tmp0, align 1
-; VF_4-NEXT:    Found an estimated cost of 24 for VF 4 For instruction: store i8 %tmp5, ptr %tmp1, align 1
-; VF_4-NEXT:    Found an estimated cost of 24 for VF 4 For instruction: store i8 %tmp6, ptr %tmp2, align 1
-; VF_4-NEXT:    Found an estimated cost of 24 for VF 4 For instruction: store i8 %tmp7, ptr %tmp3, align 1
+; VF_4-NEXT:    Found an estimated cost of 8 for VF 4 For instruction: store i8 %tmp4, ptr %tmp0, align 1
+; VF_4-NEXT:    Found an estimated cost of 8 for VF 4 For instruction: store i8 %tmp5, ptr %tmp1, align 1
+; VF_4-NEXT:    Found an estimated cost of 8 for VF 4 For instruction: store i8 %tmp6, ptr %tmp2, align 1
+; VF_4-NEXT:    Found an estimated cost of 8 for VF 4 For instruction: store i8 %tmp7, ptr %tmp3, align 1
 ; VF_8-LABEL:  Checking a loop in 'i8_factor_4'
 ; VF_8:          Found an estimated cost of 48 for VF 8 For instruction: %tmp4 = load i8, ptr %tmp0, align 1
 ; VF_8-NEXT:     Found an estimated cost of 48 for VF 8 For instruction: %tmp5 = load i8, ptr %tmp1, align 1
 ; VF_8-NEXT:     Found an estimated cost of 48 for VF 8 For instruction: %tmp6 = load i8, ptr %tmp2, align 1
 ; VF_8-NEXT:     Found an estimated cost of 48 for VF 8 For instruction: %tmp7 = load i8, ptr %tmp3, align 1
-; VF_8-NEXT:     Found an estimated cost of 48 for VF 8 For instruction: store i8 %tmp4, ptr %tmp0, align 1
-; VF_8-NEXT:     Found an estimated cost of 48 for VF 8 For instruction: store i8 %tmp5, ptr %tmp1, align 1
-; VF_8-NEXT:     Found an estimated cost of 48 for VF 8 For instruction: store i8 %tmp6, ptr %tmp2, align 1
-; VF_8-NEXT:     Found an estimated cost of 48 for VF 8 For instruction: store i8 %tmp7, ptr %tmp3, align 1
+; VF_8-NEXT:     Found an estimated cost of 16 for VF 8 For instruction: store i8 %tmp4, ptr %tmp0, align 1
+; VF_8-NEXT:     Found an estimated cost of 16 for VF 8 For instruction: store i8 %tmp5, ptr %tmp1, align 1
+; VF_8-NEXT:     Found an estimated cost of 16 for VF 8 For instruction: store i8 %tmp6, ptr %tmp2, align 1
+; VF_8-NEXT:     Found an estimated cost of 16 for VF 8 For instruction: store i8 %tmp7, ptr %tmp3, align 1
 ; VF_16-LABEL: Checking a loop in 'i8_factor_4'
 ; VF_16:         Found an estimated cost of 96 for VF 16 For instruction: %tmp4 = load i8, ptr %tmp0, align 1
 ; VF_16-NEXT:    Found an estimated cost of 96 for VF 16 For instruction: %tmp5 = load i8, ptr %tmp1, align 1
 ; VF_16-NEXT:    Found an estimated cost of 96 for VF 16 For instruction: %tmp6 = load i8, ptr %tmp2, align 1
 ; VF_16-NEXT:    Found an estimated cost of 96 for VF 16 For instruction: %tmp7 = load i8, ptr %tmp3, align 1
-; VF_16-NEXT:    Found an estimated cost of 96 for VF 16 For instruction: store i8 %tmp4, ptr %tmp0, align 1
-; VF_16-NEXT:    Found an estimated cost of 96 for VF 16 For instruction: store i8 %tmp5, ptr %tmp1, align 1
-; VF_16-NEXT:    Found an estimated cost of 96 for VF 16 For instruction: store i8 %tmp6, ptr %tmp2, align 1
-; VF_16-NEXT:    Found an estimated cost of 96 for VF 16 For instruction: store i8 %tmp7, ptr %tmp3, align 1
+; VF_16-NEXT:    Found an estimated cost of 32 for VF 16 For instruction: store i8 %tmp4, ptr %tmp0, align 1
+; VF_16-NEXT:    Found an estimated cost of 32 for VF 16 For instruction: store i8 %tmp5, ptr %tmp1, align 1
+; VF_16-NEXT:    Found an estimated cost of 32 for VF 16 For instruction: store i8 %tmp6, ptr %tmp2, align 1
+; VF_16-NEXT:    Found an estimated cost of 32 for VF 16 For instruction: store i8 %tmp7, ptr %tmp3, align 1
 for.body:
   %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
   %tmp0 = getelementptr inbounds %i8.4, ptr %data, i64 %i, i32 0
@@ -740,37 +740,37 @@ entry:
 ; VF_2-NEXT:     Found an estimated cost of 12 for VF 2 For instruction: %tmp5 = load i16, ptr %tmp1, align 2
 ; VF_2-NEXT:     Found an estimated cost of 12 for VF 2 For instruction: %tmp6 = load i16, ptr %tmp2, align 2
 ; VF_2-NEXT:     Found an estimated cost of 12 for VF 2 For instruction: %tmp7 = load i16, ptr %tmp3, align 2
-; VF_2-NEXT:     Found an estimated cost of 12 for VF 2 For instruction: store i16 %tmp4, ptr %tmp0, align 2
-; VF_2-NEXT:     Found an estimated cost of 12 for VF 2 For instruction: store i16 %tmp5, ptr %tmp1, align 2
-; VF_2-NEXT:     Found an estimated cost of 12 for VF 2 For instruction: store i16 %tmp6, ptr %tmp2, align 2
-; VF_2-NEXT:     Found an estimated cost of 12 for VF 2 For instruction: store i16 %tmp7, ptr %tmp3, align 2
+; VF_2-NEXT:     Found an estimated cost of 4 for VF 2 For instruction: store i16 %tmp4, ptr %tmp0, align 2
+; VF_2-NEXT:     Found an estimated cost of 4 for VF 2 For instruction: store i16 %tmp5, ptr %tmp1, align 2
+; VF_2-NEXT:     Found an estimated cost of 4 for VF 2 For instruction: store i16 %tmp6, ptr %tmp2, align 2
+; VF_2-NEXT:     Found an estimated cost of 4 for VF 2 For instruction: store i16 %tmp7, ptr %tmp3, align 2
 ; VF_4-LABEL:  Checking a loop in 'i16_factor_4'
 ; VF_4:          Found an estimated cost of 24 for VF 4 For instruction: %tmp4 = load i16, ptr %tmp0, align 2
 ; VF_4-NEXT:     Found an estimated cost of 24 for VF 4 For instruction: %tmp5 = load i16, ptr %tmp1, align 2
 ; VF_4-NEXT:     Found an estimated cost of 24 for VF 4 For instruction: %tmp6 = load i16, ptr %tmp2, align 2
 ; VF_4-NEXT:     Found an estimated cost of 24 for VF 4 For instruction: %tmp7 = load i16, ptr %tmp3, align 2
-; VF_4-NEXT:     Found an estimated cost of 24 for VF 4 For instruction: store i16 %tmp4, ptr %tmp0, align 2
-; VF_4-NEXT:     Found an estimated cost of 24 for VF 4 For instruction: store i16 %tmp5, ptr %tmp1, align 2
-; VF_4-NEXT:     Found an estimated cost of 24 for VF 4 For instruction: store i16 %tmp6, ptr %tmp2, align 2
-; VF_4-NEXT:     Found an estimated cost of 24 for VF 4 For instruction: store i16 %tmp7, ptr %tmp3, align 2
+; VF_4-NEXT:     Found an estimated cost of 8 for VF 4 For instruction: store i16 %tmp4, ptr %tmp0, align 2
+; VF_4-NEXT:     Found an estimated cost of 8 for VF 4 For instruction: store i16 %tmp5, ptr %tmp1, align 2
+; VF_4-NEXT:     Found an estimated cost of 8 for VF 4 For instruction: store i16 %tmp6, ptr %tmp2, align 2
+; VF_4-NEXT:     Found an estimated cost of 8 for VF 4 For instruction: store i16 %tmp7, ptr %tmp3, align 2
 ; VF_8-LABEL:  Checking a loop in 'i16_factor_4'
 ; VF_8:          Found an estimated cost of 48 for VF 8 For instruction: %tmp4 = load i16, ptr %tmp0, align 2
 ; VF_8-NEXT:     Found an estimated cost of 48 for VF 8 For instruction: %tmp5 = load i16, ptr %tmp1, align 2
 ; VF_8-NEXT:     Found an estimated cost of 48 for VF 8 For instruction: %tmp6 = load i16, ptr %tmp2, align 2
 ; VF_8-NEXT:     Found an estimated cost of 48 for VF 8 For instruction: %tmp7 = load i16, ptr %tmp3, align 2
-; VF_8-NEXT:     Found an estimated cost of 48 for VF 8 For instruction: store i16 %tmp4, ptr %tmp0, align 2
-; VF_8-NEXT:     Found an estimated cost of 48 for VF 8 For instruction: store i16 %tmp5, ptr %tmp1, align 2
-; VF_8-NEXT:     Found an estimated cost of 48 for VF 8 For instruction: store i16 %tmp6, ptr %tmp2, align 2
-; VF_8-NEXT:     Found an estimated cost of 48 for VF 8 For instruction: store i16 %tmp7, ptr %tmp3, align 2
+; VF_8-NEXT:     Found an estimated cost of 16 for VF 8 For instruction: store i16 %tmp4, ptr %tmp0, align 2
+; VF_8-NEXT:     Found an estimated cost of 16 for VF 8 For instruction: store i16 %tmp5, ptr %tmp1, align 2
+; VF_8-NEXT:     Found an estimated cost of 16 for VF 8 For instruction: store i16 %tmp6, ptr %tmp2, align 2
+; VF_8-NEXT:     Found an estimated cost of 16 for VF 8 For instruction: store i16 %tmp7, ptr %tmp3, align 2
 ; VF_16-LABEL: Checking a loop in 'i16_factor_4'
 ; VF_16:         Found an estimated cost of 96 for VF 16 For instruction: %tmp4 = load i16, ptr %tmp0, align 2
 ; VF_16-NEXT:    Found an estimated cost of 96 for VF 16 For instruction: %tmp5 = load i16, ptr %tmp1, align 2
 ; VF_16-NEXT:    Found an estimated cost of 96 for VF 16 For instruction: %tmp6 = load i16, ptr %tmp2, align 2
 ; VF_16-NEXT:    Found an estimated cost of 96 for VF 16 For instruction: %tmp7 = load i16, ptr %tmp3, align 2
-; VF_16-NEXT:    Found an estimated cost of 96 for VF 16 For instruction: store i16 %tmp4, ptr %tmp0, align 2
-; VF_16-NEXT:    Found an estimated cost of 96 for VF 16 For instruction: store i16 %tmp5, ptr %tmp1, align 2
-; VF_16-NEXT:    Found an estimated cost of 96 for VF 16 For instruction: store i16 %tmp6, ptr %tmp2, align 2
-; VF_16-NEXT:    Found an estimated cost of 96 for VF 16 For instruction: store i16 %tmp7, ptr %tmp3, align 2
+; VF_16-NEXT:    Found an estimated cost of 32 for VF 16 For instruction: store i16 %tmp4, ptr %tmp0, align 2
+; VF_16-NEXT:    Found an estimated cost of 32 for VF 16 For instruction: store i16 %tmp5, ptr %tmp1, align 2
+; VF_16-NEXT:    Found an estimated cost of 32 for VF 16 For instruction: store i16 %tmp6, ptr %tmp2, align 2
+; VF_16-NEXT:    Found an estimated cost of 32 for VF 16 For instruction: store i16 %tmp7, ptr %tmp3, align 2
 for.body:
   %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
   %tmp0 = getelementptr inbounds %i16.4, ptr %data, i64 %i, i32 0
@@ -803,10 +803,10 @@ entry:
 ; VF_2-NEXT:     Found an estimated cost of 12 for VF 2 For instruction: %tmp5 = load i32, ptr %tmp1, align 4
 ; VF_2-NEXT:     Found an estimated cost of 12 for VF 2 For instruction: %tmp6 = load i32, ptr %tmp2, align 4
 ; VF_2-NEXT:     Found an estimated cost of 12 for VF 2 For instruction: %tmp7 = load i32, ptr %tmp3, align 4
-; VF_2-NEXT:     Found an estimated cost of 12 for VF 2 For instruction: store i32 %tmp4, ptr %tmp0, align 4
-; VF_2-NEXT:     Found an estimated cost of 12 for VF 2 For instruction: store i32 %tmp5, ptr %tmp1, align 4
-; VF_2-NEXT:     Found an estimated cost of 12 for VF 2 For instruction: store i32 %tmp6, ptr %tmp2, align 4
-; VF_2-NEXT:     Found an estimated cost of 12 for VF 2 For instruction: store i32 %tmp7, ptr %tmp3, align 4
+; VF_2-NEXT:     Found an estimated cost of 4 for VF 2 For instruction: store i32 %tmp4, ptr %tmp0, align 4
+; VF_2-NEXT:     Found an estimated cost of 4 for VF 2 For instruction: store i32 %tmp5, ptr %tmp1, align 4
+; VF_2-NEXT:     Found an estimated cost of 4 for VF 2 For instruction: store i32 %tmp6, ptr %tmp2, align 4
+; VF_2-NEXT:     Found an estimated cost of 4 for VF 2 For instruction: store i32 %tmp7, ptr %tmp3, align 4
 ; VF_4-LABEL:  Checking a loop in 'i32_factor_4'
 ; VF_4:          Found an estimated cost of 8 for VF 4 For instruction: %tmp4 = load i32, ptr %tmp0, align 4
 ; VF_4-NEXT:     Found an estimated cost of 8 for VF 4 For instruction: %tmp5 = load i32, ptr %tmp1, align 4
@@ -821,19 +821,19 @@ entry:
 ; VF_8-NEXT:     Found an estimated cost of 48 for VF 8 For instruction: %tmp5 = load i32, ptr %tmp1, align 4
 ; VF_8-NEXT:     Found an estimated cost of 48 for VF 8 For instruction: %tmp6 = load i32, ptr %tmp2, align 4
 ; VF_8-NEXT:     Found an estimated cost of 48 for VF 8 For instruction: %tmp7 = load i32, ptr %tmp3, align 4
-; VF_8-NEXT:     Found an estimated cost of 48 for VF 8 For instruction: store i32 %tmp4, ptr %tmp0, align 4
-; VF_8-NEXT:     Found an estimated cost of 48 for VF 8 For instruction: store i32 %tmp5, ptr %tmp1, align 4
-; VF_8-NEXT:     Found an estimated cost of 48 for VF 8 For instruction: store i32 %tmp6, ptr %tmp2, align 4
-; VF_8-NEXT:     Found an estimated cost of 48 for VF 8 For instruction: store i32 %tmp7, ptr %tmp3, align 4
+; VF_8-NEXT:     Found an estimated cost of 16 for VF 8 For instruction: store i32 %tmp4, ptr %tmp0, align 4
+; VF_8-NEXT:     Found an estimated cost of 16 for VF 8 For instruction: store i32 %tmp5, ptr %tmp1, align 4
+; VF_8-NEXT:     Found an estimated cost of 16 for VF 8 For instruction: store i32 %tmp6, ptr %tmp2, align 4
+; VF_8-NEXT:     Found an estimated cost of 16 for VF 8 For instruction: store i32 %tmp7, ptr %tmp3, align 4
 ; VF_16-LABEL: Checking a loop in 'i32_factor_4'
 ; VF_16:         Found an estimated cost of 96 for VF 16 For instruction: %tmp4 = load i32, ptr %tmp0, align 4
 ; VF_16-NEXT:    Found an estimated cost of 96 for VF 16 For instruction: %tmp5 = load i32, ptr %tmp1, align 4
 ; VF_16-NEXT:    Found an estimated cost of 96 for VF 16 For instruction: %tmp6 = load i32, ptr %tmp2, align 4
 ; VF_16-NEXT:    Found an estimated cost of 96 for VF 16 For instruction: %tmp7 = load i32, ptr %tmp3, align 4
-; VF_16-NEXT:    Found an estimated cost of 96 for VF 16 For instruction: store i32 %tmp4, ptr %tmp0, align 4
-; VF_16-NEXT:    Found an estimated cost of 96 for VF 16 For instruction: store i32 %tmp5, ptr %tmp1, align 4
-; VF_16-NEXT:    Found an estimated cost of 96 for VF 16 For instruction: store i32 %tmp6, ptr %tmp2, align 4
-; VF_16-NEXT:    Found an estimated cost of 96 for VF 16 For instruction: store i32 %tmp7, ptr %tmp3, align 4
+; VF_16-NEXT:    Found an estimated cost of 32 for VF 16 For instruction: store i32 %tmp4, ptr %tmp0, align 4
+; VF_16-NEXT:    Found an estimated cost of 32 for VF 16 For instruction: store i32 %tmp5, ptr %tmp1, align 4
+; VF_16-NEXT:    Found an estimated cost of 32 for VF 16 For instruction: store i32 %tmp6, ptr %tmp2, align 4
+; VF_16-NEXT:    Found an estimated cost of 32 for VF 16 For instruction: store i32 %tmp7, ptr %tmp3, align 4
 for.body:
   %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
   %tmp0 = getelementptr inbounds %i32.4, ptr %data, i64 %i, i32 0
@@ -866,37 +866,37 @@ entry:
 ; VF_2-NEXT:     Found an estimated cost of 22 for VF 2 For instruction: %tmp5 = load i64, ptr %tmp1, align 8
 ; VF_2-NEXT:     Found an estimated cost of 22 for VF 2 For instruction: %tmp6 = load i64, ptr %tmp2, align 8
 ; VF_2-NEXT:     Found an estimated cost of 22 for VF 2 For instruction: %tmp7 = load i64, ptr %tmp3, align 8
-; VF_2-NEXT:     Found an estimated cost of 22 for VF 2 For instruction: store i64 %tmp4, ptr %tmp0, align 8
-; VF_2-NEXT:     Found an estimated cost of 22 for VF 2 For instruction: store i64 %tmp5, ptr %tmp1, align 8
-; VF_2-NEXT:     Found an estimated cost of 22 for VF 2 For instruction: store i64 %tmp6, ptr %tmp2, align 8
-; VF_2-NEXT:     Found an estimated cost of 22 for VF 2 For instruction: store i64 %tmp7, ptr %tmp3, align 8
+; VF_2-NEXT:     Found an estimated cost of 6 for VF 2 For instruction: store i64 %tmp4, ptr %tmp0, align 8
+; VF_2-NEXT:     Found an estimated cost of 6 for VF 2 For instruction: store i64 %tmp5, ptr %tmp1, align 8
+; VF_2-NEXT:     Found an estimated cost of 6 for VF 2 For instruction: store i64 %tmp6, ptr %tmp2, align 8
+; VF_2-NEXT:     Found an estimated cost of 6 for VF 2 For instruction: store i64 %tmp7, ptr %tmp3, align 8
 ; VF_4-LABEL:  Checking a loop in 'i64_factor_4'
 ; VF_4:          Found an estimated cost of 44 for VF 4 For instruction: %tmp4 = load i64, ptr %tmp0, align 8
 ; VF_4-NEXT:     Found an estimated cost of 44 for VF 4 For instruction: %tmp5 = load i64, ptr %tmp1, align 8
 ; VF_4-NEXT:     Found an estimated cost of 44 for VF 4 For instruction: %tmp6 = load i64, ptr %tmp2, align 8
 ; VF_4-NEXT:     Found an estimated cost of 44 for VF 4 For instruction: %tmp7 = load i64, ptr %tmp3, align 8
-; VF_4-NEXT:     Found an estimated cost of 44 for VF 4 For instruction: store i64 %tmp4, ptr %tmp0, align 8
-; VF_4-NEXT:     Found an estimated cost of 44 for VF 4 For instruction: store i64 %tmp5, ptr %tmp1, align 8
-; VF_4-NEXT:     Found an estimated cost of 44 for VF 4 For instruction: store i64 %tmp6, ptr %tmp2, align 8
-; VF_4-NEXT:     Found an estimated cost of 44 for VF 4 For instruction: store i64 %tmp7, ptr %tmp3, align 8
+; VF_4-NEXT:     Found an estimated cost of 12 for VF 4 For instruction: store i64 %tmp4, ptr %tmp0, align 8
+; VF_4-NEXT:     Found an estimated cost of 12 for VF 4 For instruction: store i64 %tmp5, ptr %tmp1, align 8
+; VF_4-NEXT:     Found an estimated cost of 12 for VF 4 For instruction: store i64 %tmp6, ptr %tmp2, align 8
+; VF_4-NEXT:     Found an estimated cost of 12 for VF 4 For instruction: store i64 %tmp7, ptr %tmp3, align 8
 ; VF_8-LABEL:  Checking a loop in 'i64_factor_4'
 ; VF_8:          Found an estimated cost of 88 for VF 8 For instruction: %tmp4 = load i64, ptr %tmp0, align 8
 ; VF_8-NEXT:     Found an estimated cost of 88 for VF 8 For instruction: %tmp5 = load i64, ptr %tmp1, align 8
 ; VF_8-NEXT:     Found an estimated cost of 88 for VF 8 For instruction: %tmp6 = load i64, ptr %tmp2, align 8
 ; VF_8-NEXT:     Found an estimated cost of 88 for VF 8 For instruction: %tmp7 = load i64, ptr %tmp3, align 8
-; VF_8-NEXT:     Found an estimated cost of 88 for VF 8 For instruction: store i64 %tmp4, ptr %tmp0, align 8
-; VF_8-NEXT:     Found an estimated cost of 88 for VF 8 For instruction: store i64 %tmp5, ptr %tmp1, align 8
-; VF_8-NEXT:     Found an estimated cost of 88 for VF 8 For instruction: store i64 %tmp6, ptr %tmp2, align 8
-; VF_8-NEXT:     Found an estimated cost of 88 for VF 8 For instruction: store i64 %tmp7, ptr %tmp3, align 8
+; VF_8-NEXT:     Found an estimated cost of 24 for VF 8 For instruction: store i64 %tmp4, ptr %tmp0, align 8
+; VF_8-NEXT:     Found an estimated cost of 24 for VF 8 For instruction: store i64 %tmp5, ptr %tmp1, align 8
+; VF_8-NEXT:     Found an estimated cost of 24 for VF 8 For instruction: store i64 %tmp6, ptr %tmp2, align 8
+; VF_8-NEXT:     Found an estimated cost of 24 for VF 8 For instruction: store i64 %tmp7, ptr %tmp3, align 8
 ; VF_16-LABEL: Checking a loop in 'i64_factor_4'
 ; VF_16:         Found an estimated cost of 176 for VF 16 For instruction: %tmp4 = load i64, ptr %tmp0, align 8
 ; VF_16-NEXT:    Found an estimated cost of 176 for VF 16 For instruction: %tmp5 = load i64, ptr %tmp1, align 8
 ; VF_16-NEXT:    Found an estimated cost of 176 for VF 16 For instruction: %tmp6 = load i64, ptr %tmp2, align 8
 ; VF_16-NEXT:    Found an estimated cost of 176 for VF 16 For instruction: %tmp7 = load i64, ptr %tmp3, align 8
-; VF_16-NEXT:    Found an estimated cost of 176 for VF 16 For instruction: store i64 %tmp4, ptr %tmp0, align 8
-; VF_16-NEXT:    Found an estimated cost of 176 for VF 16 For instruction: store i64 %tmp5, ptr %tmp1, align 8
-; VF_16-NEXT:    Found an estimated cost of 176 for VF 16 For instruction: store i64 %tmp6, ptr %tmp2, align 8
-; VF_16-NEXT:    Found an estimated cost of 176 for VF 16 For instruction: store i64 %tmp7, ptr %tmp3, align 8
+; VF_16-NEXT:    Found an estimated cost of 48 for VF 16 For instruction: store i64 %tmp4, ptr %tmp0, align 8
+; VF_16-NEXT:    Found an estimated cost of 48 for VF 16 For instruction: store i64 %tmp5, ptr %tmp1, align 8
+; VF_16-NEXT:    Found an estimated cost of 48 for VF 16 For instruction: store i64 %tmp6, ptr %tmp2, align 8
+; VF_16-NEXT:    Found an estimated cost of 48 for VF 16 For instruction: store i64 %tmp7, ptr %tmp3, align 8
 for.body:
   %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
   %tmp0 = getelementptr inbounds %i64.4, ptr %data, i64 %i, i32 0
@@ -1055,37 +1055,37 @@ entry:
 ; VF_2-NEXT:     Found an estimated cost of 6 for VF 2 For instruction: %tmp5 = load double, ptr %tmp1, align 8
 ; VF_2-NEXT:     Found an estimated cost of 6 for VF 2 For instruction: %tmp6 = load double, ptr %tmp2, align 8
 ; VF_2-NEXT:     Found an estimated cost of 6 for VF 2 For instruction: %tmp7 = load double, ptr %tmp3, align 8
-; VF_2-NEXT:     Found an estimated cost of 6 for VF 2 For instruction: store double %tmp4, ptr %tmp0, align 8
-; VF_2-NEXT:     Found an estimated cost of 6 for VF 2 For instruction: store double %tmp5, ptr %tmp1, align 8
-; VF_2-NEXT:     Found an estimated cost of 6 for VF 2 For instruction: store double %tmp6, ptr %tmp2, align 8
-; VF_2-NEXT:     Found an estimated cost of 6 for VF 2 For instruction: store double %tmp7, ptr %tmp3, align 8
+; VF_2-NEXT:     Found an estimated cost of 4 for VF 2 For instruction: store double %tmp4, ptr %tmp0, align 8
+; VF_2-NEXT:     Found an estimated cost of 4 for VF 2 For instruction: store double %tmp5, ptr %tmp1, align 8
+; VF_2-NEXT:     Found an estimated cost of 4 for VF 2 For instruction: store double %tmp6, ptr %tmp2, align 8
+; VF_2-NEXT:     Found an estimated cost of 4 for VF 2 For instruction: store double %tmp7, ptr %tmp3, align 8
 ; VF_4-LABEL:  Checking a loop in 'f64_factor_4'
 ; VF_4:          Found an estimated cost of 12 for VF 4 For instruction: %tmp4 = load double, ptr %tmp0, align 8
 ; VF_4-NEXT:     Found an estimated cost of 12 for VF 4 For instruction: %tmp5 = load double, ptr %tmp1, align 8
 ; VF_4-NEXT:     Found an estimated cost of 12 for VF 4 For instruction: %tmp6 = load double, ptr %tmp2, align 8
 ; VF_4-NEXT:     Found an estimated cost of 12 for VF 4 For instruction: %tmp7 = load double, ptr %tmp3, align 8
-; VF_4-NEXT:     Found an estimated cost of 12 for VF 4 For instruction: store double %tmp4, ptr %tmp0, align 8
-; VF_4-NEXT:     Found an estimated cost of 12 for VF 4 For instruction: store double %tmp5, ptr %tmp1, align 8
-; VF_4-NEXT:     Found an estimated cost of 12 for VF 4 For instruction: store double %tmp6, ptr %tmp2, align 8
-; VF_4-NEXT:     Found an estimated cost of 12 for VF 4 For instruction: store double %tmp7, ptr %tmp3, align 8
+; VF_4-NEXT:     Found an estimated cost of 8 for VF 4 For instruction: store double %tmp4, ptr %tmp0, align 8
+; VF_4-NEXT:     Found an estimated cost of 8 for VF 4 For instruction: store double %tmp5, ptr %tmp1, align 8
+; VF_4-NEXT:     Found an estimated cost of 8 for VF 4 For instruction: store double %tmp6, ptr %tmp2, align 8
+; VF_4-NEXT:     Found an estimated cost of 8 for VF 4 For instruction: store double %tmp7, ptr %tmp3, align 8
 ; VF_8-LABEL:  Checking a loop in 'f64_factor_4'
 ; VF_8:          Found an estimated cost of 24 for VF 8 For instruction: %tmp4 = load double, ptr %tmp0, align 8
 ; VF_8-NEXT:     Found an estimated cost of 24 for VF 8 For instruction: %tmp5 = load double, ptr %tmp1, align 8
 ; VF_8-NEXT:     Found an estimated cost of 24 for VF 8 For instruction: %tmp6 = load double, ptr %tmp2, align 8
 ; VF_8-NEXT:     Found an estimated cost of 24 for VF 8 For instruction: %tmp7 = load double, ptr %tmp3, align 8
-; VF_8-NEXT:     Found an estimated cost of 24 for VF 8 For instruction: store double %tmp4, ptr %tmp0, align 8
-; VF_8-NEXT:     Found an estimated cost of 24 for VF 8 For instruction: store double %tmp5, ptr %tmp1, align 8
-; VF_8-NEXT:     Found an estimated cost of 24 for VF 8 For instruction: store double %tmp6, ptr %tmp2, align 8
-; VF_8-NEXT:     Found an estimated cost of 24 for VF 8 For instruction: store double %tmp7, ptr %tmp3, align 8
+; VF_8-NEXT:     Found an estimated cost of 16 for VF 8 For instruction: store double %tmp4, ptr %tmp0, align 8
+; VF_8-NEXT:     Found an estimated cost of 16 for VF 8 For instruction: store double %tmp5, ptr %tmp1, align 8
+; VF_8-NEXT:     Found an estimated cost of 16 for VF 8 For instruction: store double %tmp6, ptr %tmp2, align 8
+; VF_8-NEXT:     Found an estimated cost of 16 for VF 8 For instruction: store double %tmp7, ptr %tmp3, align 8
 ; VF_16-LABEL: Checking a loop in 'f64_factor_4'
 ; VF_16:         Found an estimated cost of 48 for VF 16 For instruction: %tmp4 = load double, ptr %tmp0, align 8
 ; VF_16-NEXT:    Found an estimated cost of 48 for VF 16 For instruction: %tmp5 = load double, ptr %tmp1, align 8
 ; VF_16-NEXT:    Found an estimated cost of 48 for VF 16 For instruction: %tmp6 = load double, ptr %tmp2, align 8
 ; VF_16-NEXT:    Found an estimated cost of 48 for VF 16 For instruction: %tmp7 = load double, ptr %tmp3, align 8
-; VF_16-NEXT:    Found an estimated cost of 48 for VF 16 For instruction: store double %tmp4, ptr %tmp0, align 8
-; VF_16-NEXT:    Found an estimated cost of 48 for VF 16 For instruction: store double %tmp5, ptr %tmp1, align 8
-; VF_16-NEXT:    Found an estimated cost of 48 for VF 16 For instruction: store double %tmp6, ptr %tmp2, align 8
-; VF_16-NEXT:    Found an estimated cost of 48 for VF 16 For instruction: store double %tmp7, ptr %tmp3, align 8
+; VF_16-NEXT:    Found an estimated cost of 32 for VF 16 For instruction: store double %tmp4, ptr %tmp0, align 8
+; VF_16-NEXT:    Found an estimated cost of 32 for VF 16 For instruction: store double %tmp5, ptr %tmp1, align 8
+; VF_16-NEXT:    Found an estimated cost of 32 for VF 16 For instruction: store double %tmp6, ptr %tmp2, align 8
+; VF_16-NEXT:    Found an estimated cost of 32 for VF 16 For instruction: store double %tmp7, ptr %tmp3, align 8
 for.body:
   %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
   %tmp0 = getelementptr inbounds %f64.4, ptr %data, i64 %i, i32 0
diff --git a/llvm/test/Transforms/LoopVectorize/PowerPC/exit-branch-cost.ll b/llvm/test/Transforms/LoopVectorize/PowerPC/exit-branch-cost.ll
index 79ced9a..2f1af79 100644
--- a/llvm/test/Transforms/LoopVectorize/PowerPC/exit-branch-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/PowerPC/exit-branch-cost.ll
@@ -108,7 +108,7 @@ define i1 @select_exit_cond(ptr %start, ptr %end, i64 %N) {
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[VEC_EPILOG_ITER_CHECK:.*]]
 ; CHECK:       [[VEC_EPILOG_ITER_CHECK]]:
-; CHECK-NEXT:    [[IND_END27:%.*]] = getelementptr i8, ptr [[START]], i64 [[N_VEC]]
+; CHECK-NEXT:    [[IND_END:%.*]] = getelementptr i8, ptr [[START]], i64 [[N_VEC]]
 ; CHECK-NEXT:    [[N_VEC_REMAINING:%.*]] = sub i64 [[TMP2]], [[N_VEC]]
 ; CHECK-NEXT:    [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 2
 ; CHECK-NEXT:    br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]]
@@ -117,40 +117,40 @@ define i1 @select_exit_cond(ptr %start, ptr %end, i64 %N) {
 ; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP52]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
 ; CHECK-NEXT:    [[N_MOD_VF24:%.*]] = urem i64 [[TMP2]], 2
 ; CHECK-NEXT:    [[N_VEC25:%.*]] = sub i64 [[TMP2]], [[N_MOD_VF24]]
-; CHECK-NEXT:    [[IND_END:%.*]] = getelementptr i8, ptr [[START]], i64 [[N_VEC25]]
+; CHECK-NEXT:    [[TMP56:%.*]] = getelementptr i8, ptr [[START]], i64 [[N_VEC25]]
 ; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[VEC_EPILOG_RESUME_VAL]], i64 0
 ; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <2 x i64> [[DOTSPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer
 ; CHECK-NEXT:    [[INDUCTION:%.*]] = add <2 x i64> [[DOTSPLAT]], <i64 0, i64 1>
-; CHECK-NEXT:    [[TMP55:%.*]] = insertelement <2 x i64> zeroinitializer, i64 [[BC_MERGE_RDX]], i32 0
+; CHECK-NEXT:    [[TMP57:%.*]] = insertelement <2 x i64> zeroinitializer, i64 [[BC_MERGE_RDX]], i32 0
 ; CHECK-NEXT:    br label %[[VEC_EPILOG_VECTOR_BODY:.*]]
 ; CHECK:       [[VEC_EPILOG_VECTOR_BODY]]:
-; CHECK-NEXT:    [[INDEX29:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], %[[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT35:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_IND30:%.*]] = phi <2 x i64> [ [[INDUCTION]], %[[VEC_EPILOG_PH]] ], [ [[VEC_IND_NEXT31:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI32:%.*]] = phi <2 x i64> [ [[TMP55]], %[[VEC_EPILOG_PH]] ], [ [[TMP56:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP57:%.*]] = add i64 [[INDEX29]], 0
-; CHECK-NEXT:    [[NEXT_GEP33:%.*]] = getelementptr i8, ptr [[START]], i64 [[TMP57]]
-; CHECK-NEXT:    [[TMP58:%.*]] = getelementptr i8, ptr [[NEXT_GEP33]], i32 0
-; CHECK-NEXT:    [[WIDE_LOAD34:%.*]] = load <2 x i8>, ptr [[TMP58]], align 1
-; CHECK-NEXT:    [[TMP59:%.*]] = zext <2 x i8> [[WIDE_LOAD34]] to <2 x i64>
-; CHECK-NEXT:    [[TMP60:%.*]] = shl <2 x i64> [[VEC_IND30]], splat (i64 1)
-; CHECK-NEXT:    [[TMP61:%.*]] = shl <2 x i64> [[TMP59]], [[TMP60]]
-; CHECK-NEXT:    [[TMP56]] = or <2 x i64> [[TMP61]], [[VEC_PHI32]]
-; CHECK-NEXT:    [[INDEX_NEXT35]] = add nuw i64 [[INDEX29]], 2
-; CHECK-NEXT:    [[VEC_IND_NEXT31]] = add <2 x i64> [[VEC_IND30]], splat (i64 2)
-; CHECK-NEXT:    [[TMP62:%.*]] = icmp eq i64 [[INDEX_NEXT35]], [[N_VEC25]]
-; CHECK-NEXT:    br i1 [[TMP62]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK-NEXT:    [[INDEX26:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], %[[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT32:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_IND27:%.*]] = phi <2 x i64> [ [[INDUCTION]], %[[VEC_EPILOG_PH]] ], [ [[VEC_IND_NEXT28:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI29:%.*]] = phi <2 x i64> [ [[TMP57]], %[[VEC_EPILOG_PH]] ], [ [[TMP58:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP59:%.*]] = add i64 [[INDEX26]], 0
+; CHECK-NEXT:    [[NEXT_GEP30:%.*]] = getelementptr i8, ptr [[START]], i64 [[TMP59]]
+; CHECK-NEXT:    [[TMP60:%.*]] = getelementptr i8, ptr [[NEXT_GEP30]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD32:%.*]] = load <2 x i8>, ptr [[TMP60]], align 1
+; CHECK-NEXT:    [[TMP61:%.*]] = zext <2 x i8> [[WIDE_LOAD32]] to <2 x i64>
+; CHECK-NEXT:    [[TMP62:%.*]] = shl <2 x i64> [[VEC_IND27]], splat (i64 1)
+; CHECK-NEXT:    [[TMP63:%.*]] = shl <2 x i64> [[TMP61]], [[TMP62]]
+; CHECK-NEXT:    [[TMP58]] = or <2 x i64> [[TMP63]], [[VEC_PHI29]]
+; CHECK-NEXT:    [[INDEX_NEXT32]] = add nuw i64 [[INDEX26]], 2
+; CHECK-NEXT:    [[VEC_IND_NEXT28]] = add <2 x i64> [[VEC_IND27]], splat (i64 2)
+; CHECK-NEXT:    [[TMP54:%.*]] = icmp eq i64 [[INDEX_NEXT32]], [[N_VEC25]]
+; CHECK-NEXT:    br i1 [[TMP54]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
 ; CHECK:       [[VEC_EPILOG_MIDDLE_BLOCK]]:
-; CHECK-NEXT:    [[TMP54:%.*]] = call i64 @llvm.vector.reduce.or.v2i64(<2 x i64> [[TMP56]])
-; CHECK-NEXT:    [[CMP_N36:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC25]]
-; CHECK-NEXT:    br i1 [[CMP_N36]], label %[[EXIT]], label %[[VEC_EPILOG_SCALAR_PH]]
+; CHECK-NEXT:    [[TMP55:%.*]] = call i64 @llvm.vector.reduce.or.v2i64(<2 x i64> [[TMP58]])
+; CHECK-NEXT:    [[CMP_N33:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC25]]
+; CHECK-NEXT:    br i1 [[CMP_N33]], label %[[EXIT]], label %[[VEC_EPILOG_SCALAR_PH]]
 ; CHECK:       [[VEC_EPILOG_SCALAR_PH]]:
-; CHECK-NEXT:    [[BC_RESUME_VAL35:%.*]] = phi i64 [ [[N_VEC25]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, %[[ITER_CHECK]] ], [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ]
-; CHECK-NEXT:    [[BC_RESUME_VAL36:%.*]] = phi ptr [ [[IND_END]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[START]], %[[ITER_CHECK]] ], [ [[IND_END27]], %[[VEC_EPILOG_ITER_CHECK]] ]
-; CHECK-NEXT:    [[BC_MERGE_RDX37:%.*]] = phi i64 [ [[TMP54]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, %[[ITER_CHECK]] ], [ [[TMP52]], %[[VEC_EPILOG_ITER_CHECK]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL34:%.*]] = phi i64 [ [[N_VEC25]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, %[[ITER_CHECK]] ], [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX35:%.*]] = phi i64 [ [[TMP55]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, %[[ITER_CHECK]] ], [ [[TMP52]], %[[VEC_EPILOG_ITER_CHECK]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL36:%.*]] = phi ptr [ [[TMP56]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[START]], %[[ITER_CHECK]] ], [ [[IND_END]], %[[VEC_EPILOG_ITER_CHECK]] ]
 ; CHECK-NEXT:    br label %[[LOOP:.*]]
 ; CHECK:       [[LOOP]]:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL35]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
-; CHECK-NEXT:    [[RED:%.*]] = phi i64 [ [[BC_MERGE_RDX37]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[RED_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL34]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[RED:%.*]] = phi i64 [ [[BC_MERGE_RDX35]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[RED_NEXT:%.*]], %[[LOOP]] ]
 ; CHECK-NEXT:    [[PTR_IV:%.*]] = phi ptr [ [[BC_RESUME_VAL36]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[PTR_IV_NEXT:%.*]], %[[LOOP]] ]
 ; CHECK-NEXT:    [[TMP53:%.*]] = load i8, ptr [[PTR_IV]], align 1
 ; CHECK-NEXT:    [[CONV3:%.*]] = zext i8 [[TMP53]] to i64
@@ -164,7 +164,7 @@ define i1 @select_exit_cond(ptr %start, ptr %end, i64 %N) {
 ; CHECK-NEXT:    [[AND:%.*]] = select i1 [[CMP_I166_I]], i1 [[CMP2]], i1 false
 ; CHECK-NEXT:    br i1 [[AND]], label %[[LOOP]], label %[[EXIT]], !llvm.loop [[LOOP4:![0-9]+]]
 ; CHECK:       [[EXIT]]:
-; CHECK-NEXT:    [[RED_NEXT_LCSSA:%.*]] = phi i64 [ [[RED_NEXT]], %[[LOOP]] ], [ [[TMP52]], %[[MIDDLE_BLOCK]] ], [ [[TMP54]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[RED_NEXT_LCSSA:%.*]] = phi i64 [ [[RED_NEXT]], %[[LOOP]] ], [ [[TMP52]], %[[MIDDLE_BLOCK]] ], [ [[TMP55]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    [[RES:%.*]] = icmp eq i64 [[RED_NEXT_LCSSA]], 0
 ; CHECK-NEXT:    ret i1 [[RES]]
 ;
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/blend-any-of-reduction-cost.ll b/llvm/test/Transforms/LoopVectorize/RISCV/blend-any-of-reduction-cost.ll
index 130ef75..3d00c22 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/blend-any-of-reduction-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/blend-any-of-reduction-cost.ll
@@ -75,17 +75,17 @@ define i32 @any_of_reduction_used_in_blend_with_mutliple_phis(ptr %src, i64 %N,
 ; CHECK-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 2
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i1> poison, i1 [[C_0]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x i1> [[BROADCAST_SPLATINSERT]], <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP6:%.*]] = xor <vscale x 2 x i1> [[BROADCAST_SPLAT]], splat (i1 true)
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 2 x i1> poison, i1 [[C_1]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 2 x i1> [[BROADCAST_SPLATINSERT1]], <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP7:%.*]] = xor <vscale x 2 x i1> [[BROADCAST_SPLAT2]], splat (i1 true)
+; CHECK-NEXT:    [[TMP8:%.*]] = select <vscale x 2 x i1> [[TMP6]], <vscale x 2 x i1> [[TMP7]], <vscale x 2 x i1> zeroinitializer
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <vscale x 2 x ptr> poison, ptr [[SRC]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT4:%.*]] = shufflevector <vscale x 2 x ptr> [[BROADCAST_SPLATINSERT3]], <vscale x 2 x ptr> poison, <vscale x 2 x i32> zeroinitializer
 ; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; CHECK:       [[VECTOR_BODY]]:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 2 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[PREDPHI:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP6:%.*]] = xor <vscale x 2 x i1> [[BROADCAST_SPLAT]], splat (i1 true)
-; CHECK-NEXT:    [[TMP7:%.*]] = xor <vscale x 2 x i1> [[BROADCAST_SPLAT2]], splat (i1 true)
-; CHECK-NEXT:    [[TMP8:%.*]] = select <vscale x 2 x i1> [[TMP6]], <vscale x 2 x i1> [[TMP7]], <vscale x 2 x i1> zeroinitializer
 ; CHECK-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 2 x ptr> @llvm.masked.gather.nxv2p0.nxv2p0(<vscale x 2 x ptr> [[BROADCAST_SPLAT4]], i32 8, <vscale x 2 x i1> [[TMP8]], <vscale x 2 x ptr> poison)
 ; CHECK-NEXT:    [[TMP9:%.*]] = icmp eq <vscale x 2 x ptr> [[WIDE_MASKED_GATHER]], zeroinitializer
 ; CHECK-NEXT:    [[TMP10:%.*]] = or <vscale x 2 x i1> [[VEC_PHI]], [[TMP9]]
@@ -100,8 +100,8 @@ define i32 @any_of_reduction_used_in_blend_with_mutliple_phis(ptr %src, i64 %N,
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
 ; CHECK:       [[SCALAR_PH]]:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
 ; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
 ; CHECK-NEXT:    br label %[[LOOP_HEADER:.*]]
 ; CHECK:       [[LOOP_HEADER]]:
 ; CHECK-NEXT:    [[ANY_OF_RED:%.*]] = phi i32 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[ANY_OF_RED_NEXT:%.*]], %[[LOOP_LATCH:.*]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/blocks-with-dead-instructions.ll b/llvm/test/Transforms/LoopVectorize/RISCV/blocks-with-dead-instructions.ll
index 11efac95..f2318d6 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/blocks-with-dead-instructions.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/blocks-with-dead-instructions.ll
@@ -22,9 +22,9 @@ define void @block_with_dead_inst_1(ptr %src, i64 %N) #0 {
 ; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
 ; CHECK-NEXT:    [[TMP8:%.*]] = select i1 [[TMP7]], i64 [[TMP6]], i64 [[N_MOD_VF]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP2]], [[TMP8]]
-; CHECK-NEXT:    [[IND_END:%.*]] = mul i64 [[N_VEC]], 3
 ; CHECK-NEXT:    [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP10:%.*]] = mul i64 [[TMP9]], 8
+; CHECK-NEXT:    [[IND_END:%.*]] = mul i64 [[N_VEC]], 3
 ; CHECK-NEXT:    [[TMP11:%.*]] = call <vscale x 8 x i64> @llvm.stepvector.nxv8i64()
 ; CHECK-NEXT:    [[TMP13:%.*]] = mul <vscale x 8 x i64> [[TMP11]], splat (i64 3)
 ; CHECK-NEXT:    [[INDUCTION:%.*]] = add <vscale x 8 x i64> zeroinitializer, [[TMP13]]
@@ -116,9 +116,9 @@ define void @block_with_dead_inst_2(ptr %src) #0 {
 ; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
 ; CHECK-NEXT:    [[TMP5:%.*]] = select i1 [[TMP4]], i64 [[TMP3]], i64 [[N_MOD_VF]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 333, [[TMP5]]
-; CHECK-NEXT:    [[IND_END:%.*]] = mul i64 [[N_VEC]], 3
 ; CHECK-NEXT:    [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP7:%.*]] = mul i64 [[TMP6]], 4
+; CHECK-NEXT:    [[IND_END:%.*]] = mul i64 [[N_VEC]], 3
 ; CHECK-NEXT:    [[TMP8:%.*]] = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
 ; CHECK-NEXT:    [[TMP10:%.*]] = mul <vscale x 4 x i64> [[TMP8]], splat (i64 3)
 ; CHECK-NEXT:    [[INDUCTION:%.*]] = add <vscale x 4 x i64> zeroinitializer, [[TMP10]]
@@ -210,9 +210,9 @@ define void @multiple_blocks_with_dead_insts_3(ptr %src) #0 {
 ; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
 ; CHECK-NEXT:    [[TMP5:%.*]] = select i1 [[TMP4]], i64 [[TMP3]], i64 [[N_MOD_VF]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 333, [[TMP5]]
-; CHECK-NEXT:    [[IND_END:%.*]] = mul i64 [[N_VEC]], 3
 ; CHECK-NEXT:    [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP7:%.*]] = mul i64 [[TMP6]], 4
+; CHECK-NEXT:    [[IND_END:%.*]] = mul i64 [[N_VEC]], 3
 ; CHECK-NEXT:    [[TMP8:%.*]] = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
 ; CHECK-NEXT:    [[TMP10:%.*]] = mul <vscale x 4 x i64> [[TMP8]], splat (i64 3)
 ; CHECK-NEXT:    [[INDUCTION:%.*]] = add <vscale x 4 x i64> zeroinitializer, [[TMP10]]
@@ -314,9 +314,9 @@ define void @multiple_blocks_with_dead_insts_4(ptr %src, i64 %N) #0 {
 ; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
 ; CHECK-NEXT:    [[TMP8:%.*]] = select i1 [[TMP7]], i64 [[TMP6]], i64 [[N_MOD_VF]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP2]], [[TMP8]]
-; CHECK-NEXT:    [[IND_END:%.*]] = mul i64 [[N_VEC]], 3
 ; CHECK-NEXT:    [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP10:%.*]] = mul i64 [[TMP9]], 8
+; CHECK-NEXT:    [[IND_END:%.*]] = mul i64 [[N_VEC]], 3
 ; CHECK-NEXT:    [[TMP11:%.*]] = call <vscale x 8 x i64> @llvm.stepvector.nxv8i64()
 ; CHECK-NEXT:    [[TMP13:%.*]] = mul <vscale x 8 x i64> [[TMP11]], splat (i64 3)
 ; CHECK-NEXT:    [[INDUCTION:%.*]] = add <vscale x 8 x i64> zeroinitializer, [[TMP13]]
@@ -420,9 +420,9 @@ define void @multiple_blocks_with_dead_inst_multiple_successors_5(ptr %src) #0 {
 ; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
 ; CHECK-NEXT:    [[TMP5:%.*]] = select i1 [[TMP4]], i64 [[TMP3]], i64 [[N_MOD_VF]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 333, [[TMP5]]
-; CHECK-NEXT:    [[IND_END:%.*]] = mul i64 [[N_VEC]], 3
 ; CHECK-NEXT:    [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP7:%.*]] = mul i64 [[TMP6]], 4
+; CHECK-NEXT:    [[IND_END:%.*]] = mul i64 [[N_VEC]], 3
 ; CHECK-NEXT:    [[TMP8:%.*]] = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
 ; CHECK-NEXT:    [[TMP10:%.*]] = mul <vscale x 4 x i64> [[TMP8]], splat (i64 3)
 ; CHECK-NEXT:    [[INDUCTION:%.*]] = add <vscale x 4 x i64> zeroinitializer, [[TMP10]]
@@ -534,9 +534,9 @@ define void @multiple_blocks_with_dead_inst_multiple_successors_6(ptr %src, i1 %
 ; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
 ; CHECK-NEXT:    [[TMP8:%.*]] = select i1 [[TMP7]], i64 [[TMP6]], i64 [[N_MOD_VF]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP2]], [[TMP8]]
-; CHECK-NEXT:    [[IND_END:%.*]] = mul i64 [[N_VEC]], 3
 ; CHECK-NEXT:    [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP10:%.*]] = mul i64 [[TMP9]], 8
+; CHECK-NEXT:    [[IND_END:%.*]] = mul i64 [[N_VEC]], 3
 ; CHECK-NEXT:    [[TMP11:%.*]] = call <vscale x 8 x i64> @llvm.stepvector.nxv8i64()
 ; CHECK-NEXT:    [[TMP13:%.*]] = mul <vscale x 8 x i64> [[TMP11]], splat (i64 3)
 ; CHECK-NEXT:    [[INDUCTION:%.*]] = add <vscale x 8 x i64> zeroinitializer, [[TMP13]]
@@ -851,9 +851,9 @@ define void @dead_load_in_block(ptr %dst, ptr %src, i8 %N, i64 %x) #0 {
 ; CHECK-NEXT:    [[TMP15:%.*]] = mul i64 [[TMP14]], 2
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP3]], [[TMP15]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP3]], [[N_MOD_VF]]
-; CHECK-NEXT:    [[IND_END:%.*]] = mul i64 [[N_VEC]], 3
 ; CHECK-NEXT:    [[TMP16:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP17:%.*]] = mul i64 [[TMP16]], 2
+; CHECK-NEXT:    [[IND_END:%.*]] = mul i64 [[N_VEC]], 3
 ; CHECK-NEXT:    [[TMP18:%.*]] = call <vscale x 2 x i64> @llvm.stepvector.nxv2i64()
 ; CHECK-NEXT:    [[TMP20:%.*]] = mul <vscale x 2 x i64> [[TMP18]], splat (i64 3)
 ; CHECK-NEXT:    [[INDUCTION:%.*]] = add <vscale x 2 x i64> zeroinitializer, [[TMP20]]
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/dead-ops-cost.ll b/llvm/test/Transforms/LoopVectorize/RISCV/dead-ops-cost.ll
index 16c23cd..450405f 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/dead-ops-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/dead-ops-cost.ll
@@ -29,10 +29,10 @@ define void @dead_load(ptr %p, i16 %start) {
 ; CHECK-NEXT:    [[TMP10:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
 ; CHECK-NEXT:    [[TMP11:%.*]] = select i1 [[TMP10]], i64 [[TMP9]], i64 [[N_MOD_VF]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP5]], [[TMP11]]
-; CHECK-NEXT:    [[TMP12:%.*]] = mul i64 [[N_VEC]], 3
-; CHECK-NEXT:    [[IND_END:%.*]] = add i64 [[START_EXT]], [[TMP12]]
 ; CHECK-NEXT:    [[TMP13:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP14:%.*]] = mul i64 [[TMP13]], 8
+; CHECK-NEXT:    [[TMP18:%.*]] = mul i64 [[N_VEC]], 3
+; CHECK-NEXT:    [[IND_END:%.*]] = add i64 [[START_EXT]], [[TMP18]]
 ; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 8 x i64> poison, i64 [[START_EXT]], i64 0
 ; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 8 x i64> [[DOTSPLATINSERT]], <vscale x 8 x i64> poison, <vscale x 8 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP15:%.*]] = call <vscale x 8 x i64> @llvm.stepvector.nxv8i64()
@@ -111,9 +111,9 @@ define i8 @dead_live_out_due_to_scalar_epilogue_required(ptr %src, ptr %dst) {
 ; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i32 [[N_MOD_VF]], 0
 ; CHECK-NEXT:    [[TMP6:%.*]] = select i1 [[TMP5]], i32 [[TMP4]], i32 [[N_MOD_VF]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i32 252, [[TMP6]]
-; CHECK-NEXT:    [[IND_END:%.*]] = mul i32 [[N_VEC]], 4
 ; CHECK-NEXT:    [[TMP7:%.*]] = call i32 @llvm.vscale.i32()
 ; CHECK-NEXT:    [[TMP8:%.*]] = mul i32 [[TMP7]], 4
+; CHECK-NEXT:    [[IND_END:%.*]] = mul i32 [[N_VEC]], 4
 ; CHECK-NEXT:    [[TMP9:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
 ; CHECK-NEXT:    [[TMP11:%.*]] = mul <vscale x 4 x i32> [[TMP9]], splat (i32 4)
 ; CHECK-NEXT:    [[INDUCTION:%.*]] = add <vscale x 4 x i32> zeroinitializer, [[TMP11]]
@@ -334,9 +334,9 @@ define void @test_phi_in_latch_redundant(ptr %dst, i32 %a) {
 ; CHECK-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 2
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 37, [[TMP3]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 37, [[N_MOD_VF]]
-; CHECK-NEXT:    [[IND_END:%.*]] = mul i64 [[N_VEC]], 9
 ; CHECK-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 2
+; CHECK-NEXT:    [[IND_END:%.*]] = mul i64 [[N_VEC]], 9
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i32> poison, i32 [[A]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP10:%.*]] = xor <vscale x 2 x i32> [[BROADCAST_SPLAT]], splat (i32 -1)
@@ -419,9 +419,9 @@ define void @gather_interleave_group_with_dead_insert_pos(i64 %N, ptr noalias %s
 ; CHECK-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 4
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP2]], [[TMP6]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP2]], [[N_MOD_VF]]
-; CHECK-NEXT:    [[IND_END:%.*]] = mul i64 [[N_VEC]], 2
 ; CHECK-NEXT:    [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP8:%.*]] = mul i64 [[TMP7]], 4
+; CHECK-NEXT:    [[TMP13:%.*]] = mul i64 [[N_VEC]], 2
 ; CHECK-NEXT:    [[TMP9:%.*]] = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
 ; CHECK-NEXT:    [[TMP11:%.*]] = mul <vscale x 4 x i64> [[TMP9]], splat (i64 2)
 ; CHECK-NEXT:    [[INDUCTION:%.*]] = add <vscale x 4 x i64> zeroinitializer, [[TMP11]]
@@ -433,24 +433,24 @@ define void @gather_interleave_group_with_dead_insert_pos(i64 %N, ptr noalias %s
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 4 x i64> [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 2
-; CHECK-NEXT:    [[TMP13:%.*]] = add i64 [[OFFSET_IDX]], 0
-; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP13]]
-; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <vscale x 8 x i8>, ptr [[TMP14]], align 1
+; CHECK-NEXT:    [[TMP14:%.*]] = add i64 [[OFFSET_IDX]], 0
+; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP14]]
+; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <vscale x 8 x i8>, ptr [[TMP15]], align 1
 ; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = call { <vscale x 4 x i8>, <vscale x 4 x i8> } @llvm.vector.deinterleave2.nxv8i8(<vscale x 8 x i8> [[WIDE_VEC]])
-; CHECK-NEXT:    [[TMP15:%.*]] = extractvalue { <vscale x 4 x i8>, <vscale x 4 x i8> } [[STRIDED_VEC]], 0
-; CHECK-NEXT:    [[TMP16:%.*]] = extractvalue { <vscale x 4 x i8>, <vscale x 4 x i8> } [[STRIDED_VEC]], 1
-; CHECK-NEXT:    [[TMP17:%.*]] = zext <vscale x 4 x i8> [[TMP16]] to <vscale x 4 x i32>
-; CHECK-NEXT:    [[TMP18:%.*]] = getelementptr i32, ptr [[DST]], <vscale x 4 x i64> [[VEC_IND]]
-; CHECK-NEXT:    call void @llvm.masked.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> [[TMP17]], <vscale x 4 x ptr> [[TMP18]], i32 4, <vscale x 4 x i1> splat (i1 true))
+; CHECK-NEXT:    [[TMP16:%.*]] = extractvalue { <vscale x 4 x i8>, <vscale x 4 x i8> } [[STRIDED_VEC]], 0
+; CHECK-NEXT:    [[TMP17:%.*]] = extractvalue { <vscale x 4 x i8>, <vscale x 4 x i8> } [[STRIDED_VEC]], 1
+; CHECK-NEXT:    [[TMP18:%.*]] = zext <vscale x 4 x i8> [[TMP17]] to <vscale x 4 x i32>
+; CHECK-NEXT:    [[TMP19:%.*]] = getelementptr i32, ptr [[DST]], <vscale x 4 x i64> [[VEC_IND]]
+; CHECK-NEXT:    call void @llvm.masked.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> [[TMP18]], <vscale x 4 x ptr> [[TMP19]], i32 4, <vscale x 4 x i1> splat (i1 true))
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP8]]
 ; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 4 x i64> [[VEC_IND]], [[DOTSPLAT]]
-; CHECK-NEXT:    [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP19]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
+; CHECK-NEXT:    [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP20]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
 ; CHECK:       [[MIDDLE_BLOCK]]:
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
 ; CHECK:       [[SCALAR_PH]]:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP13]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
 ; CHECK-NEXT:    br label %[[LOOP:.*]]
 ; CHECK:       [[LOOP]]:
 ; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[LOOP]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/divrem.ll b/llvm/test/Transforms/LoopVectorize/RISCV/divrem.ll
index a7765f4..038e726 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/divrem.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/divrem.ll
@@ -432,6 +432,7 @@ define void @predicated_udiv(ptr noalias nocapture %a, i64 %v, i64 %n) {
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[V:%.*]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP6:%.*]] = icmp ne <vscale x 2 x i64> [[BROADCAST_SPLAT]], zeroinitializer
+; CHECK-NEXT:    [[TMP10:%.*]] = select <vscale x 2 x i1> [[TMP6]], <vscale x 2 x i64> [[BROADCAST_SPLAT]], <vscale x 2 x i64> splat (i64 1)
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -439,7 +440,6 @@ define void @predicated_udiv(ptr noalias nocapture %a, i64 %v, i64 %n) {
 ; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP7]]
 ; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[TMP8]], i32 0
 ; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 2 x i64>, ptr [[TMP9]], align 8
-; CHECK-NEXT:    [[TMP10:%.*]] = select <vscale x 2 x i1> [[TMP6]], <vscale x 2 x i64> [[BROADCAST_SPLAT]], <vscale x 2 x i64> splat (i64 1)
 ; CHECK-NEXT:    [[TMP11:%.*]] = udiv <vscale x 2 x i64> [[WIDE_LOAD]], [[TMP10]]
 ; CHECK-NEXT:    [[PREDPHI:%.*]] = select <vscale x 2 x i1> [[TMP6]], <vscale x 2 x i64> [[TMP11]], <vscale x 2 x i64> [[WIDE_LOAD]]
 ; CHECK-NEXT:    store <vscale x 2 x i64> [[PREDPHI]], ptr [[TMP9]], align 8
@@ -477,6 +477,7 @@ define void @predicated_udiv(ptr noalias nocapture %a, i64 %v, i64 %n) {
 ; FIXED-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[V:%.*]], i64 0
 ; FIXED-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
 ; FIXED-NEXT:    [[TMP0:%.*]] = icmp ne <4 x i64> [[BROADCAST_SPLAT]], zeroinitializer
+; FIXED-NEXT:    [[TMP5:%.*]] = select <4 x i1> [[TMP0]], <4 x i64> [[BROADCAST_SPLAT]], <4 x i64> splat (i64 1)
 ; FIXED-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; FIXED:       vector.body:
 ; FIXED-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -486,10 +487,8 @@ define void @predicated_udiv(ptr noalias nocapture %a, i64 %v, i64 %n) {
 ; FIXED-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 4
 ; FIXED-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP3]], align 8
 ; FIXED-NEXT:    [[WIDE_LOAD1:%.*]] = load <4 x i64>, ptr [[TMP4]], align 8
-; FIXED-NEXT:    [[TMP5:%.*]] = select <4 x i1> [[TMP0]], <4 x i64> [[BROADCAST_SPLAT]], <4 x i64> splat (i64 1)
-; FIXED-NEXT:    [[TMP6:%.*]] = select <4 x i1> [[TMP0]], <4 x i64> [[BROADCAST_SPLAT]], <4 x i64> splat (i64 1)
 ; FIXED-NEXT:    [[TMP7:%.*]] = udiv <4 x i64> [[WIDE_LOAD]], [[TMP5]]
-; FIXED-NEXT:    [[TMP8:%.*]] = udiv <4 x i64> [[WIDE_LOAD1]], [[TMP6]]
+; FIXED-NEXT:    [[TMP8:%.*]] = udiv <4 x i64> [[WIDE_LOAD1]], [[TMP5]]
 ; FIXED-NEXT:    [[PREDPHI:%.*]] = select <4 x i1> [[TMP0]], <4 x i64> [[TMP7]], <4 x i64> [[WIDE_LOAD]]
 ; FIXED-NEXT:    [[PREDPHI2:%.*]] = select <4 x i1> [[TMP0]], <4 x i64> [[TMP8]], <4 x i64> [[WIDE_LOAD1]]
 ; FIXED-NEXT:    store <4 x i64> [[PREDPHI]], ptr [[TMP3]], align 8
@@ -560,6 +559,7 @@ define void @predicated_sdiv(ptr noalias nocapture %a, i64 %v, i64 %n) {
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[V:%.*]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP6:%.*]] = icmp ne <vscale x 2 x i64> [[BROADCAST_SPLAT]], zeroinitializer
+; CHECK-NEXT:    [[TMP10:%.*]] = select <vscale x 2 x i1> [[TMP6]], <vscale x 2 x i64> [[BROADCAST_SPLAT]], <vscale x 2 x i64> splat (i64 1)
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -567,7 +567,6 @@ define void @predicated_sdiv(ptr noalias nocapture %a, i64 %v, i64 %n) {
 ; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP7]]
 ; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[TMP8]], i32 0
 ; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 2 x i64>, ptr [[TMP9]], align 8
-; CHECK-NEXT:    [[TMP10:%.*]] = select <vscale x 2 x i1> [[TMP6]], <vscale x 2 x i64> [[BROADCAST_SPLAT]], <vscale x 2 x i64> splat (i64 1)
 ; CHECK-NEXT:    [[TMP11:%.*]] = sdiv <vscale x 2 x i64> [[WIDE_LOAD]], [[TMP10]]
 ; CHECK-NEXT:    [[PREDPHI:%.*]] = select <vscale x 2 x i1> [[TMP6]], <vscale x 2 x i64> [[TMP11]], <vscale x 2 x i64> [[WIDE_LOAD]]
 ; CHECK-NEXT:    store <vscale x 2 x i64> [[PREDPHI]], ptr [[TMP9]], align 8
@@ -605,6 +604,7 @@ define void @predicated_sdiv(ptr noalias nocapture %a, i64 %v, i64 %n) {
 ; FIXED-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[V:%.*]], i64 0
 ; FIXED-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
 ; FIXED-NEXT:    [[TMP0:%.*]] = icmp ne <4 x i64> [[BROADCAST_SPLAT]], zeroinitializer
+; FIXED-NEXT:    [[TMP5:%.*]] = select <4 x i1> [[TMP0]], <4 x i64> [[BROADCAST_SPLAT]], <4 x i64> splat (i64 1)
 ; FIXED-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; FIXED:       vector.body:
 ; FIXED-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -614,10 +614,8 @@ define void @predicated_sdiv(ptr noalias nocapture %a, i64 %v, i64 %n) {
 ; FIXED-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 4
 ; FIXED-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP3]], align 8
 ; FIXED-NEXT:    [[WIDE_LOAD1:%.*]] = load <4 x i64>, ptr [[TMP4]], align 8
-; FIXED-NEXT:    [[TMP5:%.*]] = select <4 x i1> [[TMP0]], <4 x i64> [[BROADCAST_SPLAT]], <4 x i64> splat (i64 1)
-; FIXED-NEXT:    [[TMP6:%.*]] = select <4 x i1> [[TMP0]], <4 x i64> [[BROADCAST_SPLAT]], <4 x i64> splat (i64 1)
 ; FIXED-NEXT:    [[TMP7:%.*]] = sdiv <4 x i64> [[WIDE_LOAD]], [[TMP5]]
-; FIXED-NEXT:    [[TMP8:%.*]] = sdiv <4 x i64> [[WIDE_LOAD1]], [[TMP6]]
+; FIXED-NEXT:    [[TMP8:%.*]] = sdiv <4 x i64> [[WIDE_LOAD1]], [[TMP5]]
 ; FIXED-NEXT:    [[PREDPHI:%.*]] = select <4 x i1> [[TMP0]], <4 x i64> [[TMP7]], <4 x i64> [[WIDE_LOAD]]
 ; FIXED-NEXT:    [[PREDPHI2:%.*]] = select <4 x i1> [[TMP0]], <4 x i64> [[TMP8]], <4 x i64> [[WIDE_LOAD1]]
 ; FIXED-NEXT:    store <4 x i64> [[PREDPHI]], ptr [[TMP3]], align 8
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/first-order-recurrence-scalable-vf1.ll b/llvm/test/Transforms/LoopVectorize/RISCV/first-order-recurrence-scalable-vf1.ll
index dd2e75f..58d6fd0 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/first-order-recurrence-scalable-vf1.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/first-order-recurrence-scalable-vf1.ll
@@ -28,8 +28,8 @@ define i64 @pr97452_scalable_vf1_for(ptr %src) #0 {
 ; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i64> [[WIDE_LOAD1]], i32 3
 ; CHECK-NEXT:    br i1 false, label %[[EXIT:.*]], label %[[SCALAR_PH]]
 ; CHECK:       [[SCALAR_PH]]:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 16, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
 ; CHECK-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i64 [ [[VECTOR_RECUR_EXTRACT]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 16, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
 ; CHECK-NEXT:    br label %[[LOOP:.*]]
 ; CHECK:       [[LOOP]]:
 ; CHECK-NEXT:    [[SCALAR_RECUR:%.*]] = phi i64 [ [[SCALAR_RECUR_INIT]], %[[SCALAR_PH]] ], [ [[L:%.*]], %[[LOOP]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/induction-costs.ll b/llvm/test/Transforms/LoopVectorize/RISCV/induction-costs.ll
index 8131c7b..e4425a9 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/induction-costs.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/induction-costs.ll
@@ -63,13 +63,13 @@ define void @skip_free_iv_truncate(i16 %x, ptr %A) #0 {
 ; CHECK-NEXT:    [[TMP47:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
 ; CHECK-NEXT:    [[TMP48:%.*]] = select i1 [[TMP47]], i64 [[TMP46]], i64 [[N_MOD_VF]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP5]], [[TMP48]]
+; CHECK-NEXT:    [[TMP51:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP52:%.*]] = mul i64 [[TMP51]], 8
 ; CHECK-NEXT:    [[TMP49:%.*]] = mul i64 [[N_VEC]], 3
 ; CHECK-NEXT:    [[IND_END:%.*]] = add i64 [[X_I64]], [[TMP49]]
 ; CHECK-NEXT:    [[DOTCAST:%.*]] = trunc i64 [[N_VEC]] to i32
 ; CHECK-NEXT:    [[TMP50:%.*]] = mul i32 [[DOTCAST]], 3
 ; CHECK-NEXT:    [[IND_END22:%.*]] = add i32 [[X_I32]], [[TMP50]]
-; CHECK-NEXT:    [[TMP51:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP52:%.*]] = mul i64 [[TMP51]], 8
 ; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 8 x i64> poison, i64 [[X_I64]], i64 0
 ; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 8 x i64> [[DOTSPLATINSERT]], <vscale x 8 x i64> poison, <vscale x 8 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP53:%.*]] = call <vscale x 8 x i64> @llvm.stepvector.nxv8i64()
@@ -92,11 +92,11 @@ define void @skip_free_iv_truncate(i16 %x, ptr %A) #0 {
 ; CHECK-NEXT:    br label %[[SCALAR_PH]]
 ; CHECK:       [[SCALAR_PH]]:
 ; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], %[[MIDDLE_BLOCK]] ], [ [[X_I64]], %[[VECTOR_MEMCHECK]] ], [ [[X_I64]], %[[ENTRY]] ]
-; CHECK-NEXT:    [[BC_RESUME_VAL14:%.*]] = phi i32 [ [[IND_END22]], %[[MIDDLE_BLOCK]] ], [ [[X_I32]], %[[VECTOR_MEMCHECK]] ], [ [[X_I32]], %[[ENTRY]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL13:%.*]] = phi i32 [ [[IND_END22]], %[[MIDDLE_BLOCK]] ], [ [[X_I32]], %[[VECTOR_MEMCHECK]] ], [ [[X_I32]], %[[ENTRY]] ]
 ; CHECK-NEXT:    br label %[[LOOP:.*]]
 ; CHECK:       [[LOOP]]:
 ; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
-; CHECK-NEXT:    [[IV_CONV:%.*]] = phi i32 [ [[BC_RESUME_VAL14]], %[[SCALAR_PH]] ], [ [[TMP64:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[IV_CONV:%.*]] = phi i32 [ [[BC_RESUME_VAL13]], %[[SCALAR_PH]] ], [ [[TMP64:%.*]], %[[LOOP]] ]
 ; CHECK-NEXT:    [[GEP_I64:%.*]] = getelementptr i64, ptr [[A]], i64 [[IV]]
 ; CHECK-NEXT:    [[TMP61:%.*]] = load i64, ptr [[GEP_I64]], align 8
 ; CHECK-NEXT:    [[TMP62:%.*]] = sext i32 [[IV_CONV]] to i64
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/low-trip-count.ll b/llvm/test/Transforms/LoopVectorize/RISCV/low-trip-count.ll
index 5c5600b..10ac870 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/low-trip-count.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/low-trip-count.ll
@@ -49,28 +49,26 @@ define void @trip3_i8(ptr noalias nocapture noundef %dst, ptr noalias nocapture
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 2
-; CHECK-NEXT:    [[TMP4:%.*]] = sub i64 [[TMP1]], 1
-; CHECK-NEXT:    [[N_RND_UP:%.*]] = add i64 3, [[TMP4]]
+; CHECK-NEXT:    [[TMP2:%.*]] = sub i64 [[TMP1]], 1
+; CHECK-NEXT:    [[N_RND_UP:%.*]] = add i64 3, [[TMP2]]
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
-; CHECK-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 2
+; CHECK-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 2
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP7:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[TMP7]], i64 3)
-; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[SRC:%.*]], i64 [[TMP7]]
+; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 3)
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[DST:%.*]], i64 0
 ; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[TMP8]], i32 0
 ; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 2 x i8> @llvm.masked.load.nxv2i8.p0(ptr [[TMP9]], i32 1, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x i8> poison)
 ; CHECK-NEXT:    [[TMP10:%.*]] = shl <vscale x 2 x i8> [[WIDE_MASKED_LOAD]], splat (i8 1)
-; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[DST:%.*]], i64 [[TMP7]]
+; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[DST1:%.*]], i64 0
 ; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 0
 ; CHECK-NEXT:    [[WIDE_MASKED_LOAD1:%.*]] = call <vscale x 2 x i8> @llvm.masked.load.nxv2i8.p0(ptr [[TMP12]], i32 1, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x i8> poison)
 ; CHECK-NEXT:    [[TMP13:%.*]] = add <vscale x 2 x i8> [[TMP10]], [[WIDE_MASKED_LOAD1]]
-; CHECK-NEXT:    call void @llvm.masked.store.nxv2i8.p0(<vscale x 2 x i8> [[TMP13]], ptr [[TMP12]], i32 1, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]])
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]]
-; CHECK-NEXT:    br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 0
+; CHECK-NEXT:    call void @llvm.masked.store.nxv2i8.p0(<vscale x 2 x i8> [[TMP13]], ptr [[TMP14]], i32 1, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]])
+; CHECK-NEXT:    br label [[MIDDLE_BLOCK:%.*]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
@@ -78,16 +76,16 @@ define void @trip3_i8(ptr noalias nocapture noundef %dst, ptr noalias nocapture
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[I_08:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[I_08]]
-; CHECK-NEXT:    [[TMP14:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
-; CHECK-NEXT:    [[MUL:%.*]] = shl i8 [[TMP14]], 1
-; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[I_08]]
-; CHECK-NEXT:    [[TMP15:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1
-; CHECK-NEXT:    [[ADD:%.*]] = add i8 [[MUL]], [[TMP15]]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[I_08]]
+; CHECK-NEXT:    [[TMP15:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT:    [[MUL:%.*]] = shl i8 [[TMP15]], 1
+; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[DST1]], i64 [[I_08]]
+; CHECK-NEXT:    [[TMP16:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1
+; CHECK-NEXT:    [[ADD:%.*]] = add i8 [[MUL]], [[TMP16]]
 ; CHECK-NEXT:    store i8 [[ADD]], ptr [[ARRAYIDX1]], align 1
 ; CHECK-NEXT:    [[INC]] = add nuw nsw i64 [[I_08]], 1
 ; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], 3
-; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK:       for.end:
 ; CHECK-NEXT:    ret void
 ;
@@ -118,28 +116,26 @@ define void @trip5_i8(ptr noalias nocapture noundef %dst, ptr noalias nocapture
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
-; CHECK-NEXT:    [[TMP4:%.*]] = sub i64 [[TMP1]], 1
-; CHECK-NEXT:    [[N_RND_UP:%.*]] = add i64 5, [[TMP4]]
+; CHECK-NEXT:    [[TMP2:%.*]] = sub i64 [[TMP1]], 1
+; CHECK-NEXT:    [[N_RND_UP:%.*]] = add i64 5, [[TMP2]]
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
-; CHECK-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 4
+; CHECK-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 4
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP7:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP7]], i64 5)
-; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[SRC:%.*]], i64 [[TMP7]]
+; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 5)
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[DST:%.*]], i64 0
 ; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[TMP8]], i32 0
 ; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i8> @llvm.masked.load.nxv4i8.p0(ptr [[TMP9]], i32 1, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i8> poison)
 ; CHECK-NEXT:    [[TMP10:%.*]] = shl <vscale x 4 x i8> [[WIDE_MASKED_LOAD]], splat (i8 1)
-; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[DST:%.*]], i64 [[TMP7]]
+; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[DST1:%.*]], i64 0
 ; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 0
 ; CHECK-NEXT:    [[WIDE_MASKED_LOAD1:%.*]] = call <vscale x 4 x i8> @llvm.masked.load.nxv4i8.p0(ptr [[TMP12]], i32 1, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i8> poison)
 ; CHECK-NEXT:    [[TMP13:%.*]] = add <vscale x 4 x i8> [[TMP10]], [[WIDE_MASKED_LOAD1]]
-; CHECK-NEXT:    call void @llvm.masked.store.nxv4i8.p0(<vscale x 4 x i8> [[TMP13]], ptr [[TMP12]], i32 1, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]]
-; CHECK-NEXT:    br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 0
+; CHECK-NEXT:    call void @llvm.masked.store.nxv4i8.p0(<vscale x 4 x i8> [[TMP13]], ptr [[TMP14]], i32 1, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
+; CHECK-NEXT:    br label [[MIDDLE_BLOCK:%.*]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
@@ -147,16 +143,16 @@ define void @trip5_i8(ptr noalias nocapture noundef %dst, ptr noalias nocapture
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[I_08:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[I_08]]
-; CHECK-NEXT:    [[TMP14:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
-; CHECK-NEXT:    [[MUL:%.*]] = shl i8 [[TMP14]], 1
-; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[I_08]]
-; CHECK-NEXT:    [[TMP15:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1
-; CHECK-NEXT:    [[ADD:%.*]] = add i8 [[MUL]], [[TMP15]]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[I_08]]
+; CHECK-NEXT:    [[TMP15:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT:    [[MUL:%.*]] = shl i8 [[TMP15]], 1
+; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[DST1]], i64 [[I_08]]
+; CHECK-NEXT:    [[TMP16:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1
+; CHECK-NEXT:    [[ADD:%.*]] = add i8 [[MUL]], [[TMP16]]
 ; CHECK-NEXT:    store i8 [[ADD]], ptr [[ARRAYIDX1]], align 1
 ; CHECK-NEXT:    [[INC]] = add nuw nsw i64 [[I_08]], 1
 ; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], 5
-; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
 ; CHECK:       for.end:
 ; CHECK-NEXT:    ret void
 ;
@@ -187,28 +183,26 @@ define void @trip8_i8(ptr noalias nocapture noundef %dst, ptr noalias nocapture
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
-; CHECK-NEXT:    [[TMP4:%.*]] = sub i64 [[TMP1]], 1
-; CHECK-NEXT:    [[N_RND_UP:%.*]] = add i64 8, [[TMP4]]
+; CHECK-NEXT:    [[TMP2:%.*]] = sub i64 [[TMP1]], 1
+; CHECK-NEXT:    [[N_RND_UP:%.*]] = add i64 8, [[TMP2]]
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
-; CHECK-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 4
+; CHECK-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 4
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP7:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP7]], i64 8)
-; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[SRC:%.*]], i64 [[TMP7]]
+; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 8)
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[DST:%.*]], i64 0
 ; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[TMP8]], i32 0
 ; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i8> @llvm.masked.load.nxv4i8.p0(ptr [[TMP9]], i32 1, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i8> poison)
 ; CHECK-NEXT:    [[TMP10:%.*]] = shl <vscale x 4 x i8> [[WIDE_MASKED_LOAD]], splat (i8 1)
-; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[DST:%.*]], i64 [[TMP7]]
+; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[DST1:%.*]], i64 0
 ; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 0
 ; CHECK-NEXT:    [[WIDE_MASKED_LOAD1:%.*]] = call <vscale x 4 x i8> @llvm.masked.load.nxv4i8.p0(ptr [[TMP12]], i32 1, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i8> poison)
 ; CHECK-NEXT:    [[TMP13:%.*]] = add <vscale x 4 x i8> [[TMP10]], [[WIDE_MASKED_LOAD1]]
-; CHECK-NEXT:    call void @llvm.masked.store.nxv4i8.p0(<vscale x 4 x i8> [[TMP13]], ptr [[TMP12]], i32 1, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]]
-; CHECK-NEXT:    br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 0
+; CHECK-NEXT:    call void @llvm.masked.store.nxv4i8.p0(<vscale x 4 x i8> [[TMP13]], ptr [[TMP14]], i32 1, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
+; CHECK-NEXT:    br label [[MIDDLE_BLOCK:%.*]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
@@ -216,16 +210,16 @@ define void @trip8_i8(ptr noalias nocapture noundef %dst, ptr noalias nocapture
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[I_08:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[I_08]]
-; CHECK-NEXT:    [[TMP14:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
-; CHECK-NEXT:    [[MUL:%.*]] = shl i8 [[TMP14]], 1
-; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[I_08]]
-; CHECK-NEXT:    [[TMP15:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1
-; CHECK-NEXT:    [[ADD:%.*]] = add i8 [[MUL]], [[TMP15]]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[I_08]]
+; CHECK-NEXT:    [[TMP15:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT:    [[MUL:%.*]] = shl i8 [[TMP15]], 1
+; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[DST1]], i64 [[I_08]]
+; CHECK-NEXT:    [[TMP16:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1
+; CHECK-NEXT:    [[ADD:%.*]] = add i8 [[MUL]], [[TMP16]]
 ; CHECK-NEXT:    store i8 [[ADD]], ptr [[ARRAYIDX1]], align 1
 ; CHECK-NEXT:    [[INC]] = add nuw nsw i64 [[I_08]], 1
 ; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], 8
-; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; CHECK:       for.end:
 ; CHECK-NEXT:    ret void
 ;
@@ -256,19 +250,17 @@ define void @trip16_i8(ptr noalias nocapture noundef %dst, ptr noalias nocapture
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[SRC:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1
-; CHECK-NEXT:    [[TMP3:%.*]] = shl <16 x i8> [[WIDE_LOAD]], splat (i8 1)
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[DST:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 0
-; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1
-; CHECK-NEXT:    [[TMP6:%.*]] = add <16 x i8> [[TMP3]], [[WIDE_LOAD1]]
-; CHECK-NEXT:    store <16 x i8> [[TMP6]], ptr [[TMP5]], align 1
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
-; CHECK-NEXT:    br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[SRC:%.*]], i64 0
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = shl <16 x i8> [[WIDE_LOAD]], splat (i8 1)
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[DST:%.*]], i64 0
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP4]], align 1
+; CHECK-NEXT:    [[TMP5:%.*]] = add <16 x i8> [[TMP2]], [[WIDE_LOAD1]]
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i32 0
+; CHECK-NEXT:    store <16 x i8> [[TMP5]], ptr [[TMP6]], align 1
+; CHECK-NEXT:    br label [[MIDDLE_BLOCK:%.*]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
@@ -285,7 +277,7 @@ define void @trip16_i8(ptr noalias nocapture noundef %dst, ptr noalias nocapture
 ; CHECK-NEXT:    store i8 [[ADD]], ptr [[ARRAYIDX1]], align 1
 ; CHECK-NEXT:    [[INC]] = add nuw nsw i64 [[I_08]], 1
 ; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], 16
-; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
 ; CHECK:       for.end:
 ; CHECK-NEXT:    ret void
 ;
@@ -317,19 +309,17 @@ define void @trip32_i8(ptr noalias nocapture noundef %dst, ptr noalias nocapture
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[SRC:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <32 x i8>, ptr [[TMP2]], align 1
-; CHECK-NEXT:    [[TMP3:%.*]] = shl <32 x i8> [[WIDE_LOAD]], splat (i8 1)
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[DST:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 0
-; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <32 x i8>, ptr [[TMP5]], align 1
-; CHECK-NEXT:    [[TMP6:%.*]] = add <32 x i8> [[TMP3]], [[WIDE_LOAD1]]
-; CHECK-NEXT:    store <32 x i8> [[TMP6]], ptr [[TMP5]], align 1
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
-; CHECK-NEXT:    br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[SRC:%.*]], i64 0
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <32 x i8>, ptr [[TMP1]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = shl <32 x i8> [[WIDE_LOAD]], splat (i8 1)
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[DST:%.*]], i64 0
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <32 x i8>, ptr [[TMP4]], align 1
+; CHECK-NEXT:    [[TMP5:%.*]] = add <32 x i8> [[TMP2]], [[WIDE_LOAD1]]
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i32 0
+; CHECK-NEXT:    store <32 x i8> [[TMP5]], ptr [[TMP6]], align 1
+; CHECK-NEXT:    br label [[MIDDLE_BLOCK:%.*]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
@@ -346,7 +336,7 @@ define void @trip32_i8(ptr noalias nocapture noundef %dst, ptr noalias nocapture
 ; CHECK-NEXT:    store i8 [[ADD]], ptr [[ARRAYIDX1]], align 1
 ; CHECK-NEXT:    [[INC]] = add nuw nsw i64 [[I_08]], 1
 ; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], 32
-; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
 ; CHECK:       for.end:
 ; CHECK-NEXT:    ret void
 ;
@@ -390,7 +380,7 @@ define void @trip24_i8(ptr noalias nocapture noundef %dst, ptr noalias nocapture
 ; CHECK-NEXT:    store <8 x i8> [[TMP6]], ptr [[TMP5]], align 1
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
 ; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 24
-; CHECK-NEXT:    br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
+; CHECK-NEXT:    br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
@@ -407,7 +397,7 @@ define void @trip24_i8(ptr noalias nocapture noundef %dst, ptr noalias nocapture
 ; CHECK-NEXT:    store i8 [[ADD]], ptr [[ARRAYIDX1]], align 1
 ; CHECK-NEXT:    [[INC]] = add nuw nsw i64 [[I_08]], 1
 ; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], 24
-; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
 ; CHECK:       for.end:
 ; CHECK-NEXT:    ret void
 ;
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/masked_gather_scatter.ll b/llvm/test/Transforms/LoopVectorize/RISCV/masked_gather_scatter.ll
index 2c19aab..dc63072 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/masked_gather_scatter.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/masked_gather_scatter.ll
@@ -38,9 +38,9 @@ define void @foo4(ptr nocapture %A, ptr nocapture readonly %B, ptr nocapture rea
 ; RV32-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 2
 ; RV32-NEXT:    [[N_MOD_VF:%.*]] = urem i64 625, [[TMP4]]
 ; RV32-NEXT:    [[N_VEC:%.*]] = sub i64 625, [[N_MOD_VF]]
-; RV32-NEXT:    [[IND_END:%.*]] = mul i64 [[N_VEC]], 16
 ; RV32-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
 ; RV32-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 2
+; RV32-NEXT:    [[IND_END:%.*]] = mul i64 [[N_VEC]], 16
 ; RV32-NEXT:    [[TMP7:%.*]] = call <vscale x 2 x i64> @llvm.stepvector.nxv2i64()
 ; RV32-NEXT:    [[TMP9:%.*]] = mul <vscale x 2 x i64> [[TMP7]], splat (i64 16)
 ; RV32-NEXT:    [[INDUCTION:%.*]] = add <vscale x 2 x i64> zeroinitializer, [[TMP9]]
@@ -117,9 +117,9 @@ define void @foo4(ptr nocapture %A, ptr nocapture readonly %B, ptr nocapture rea
 ; RV64-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 2
 ; RV64-NEXT:    [[N_MOD_VF:%.*]] = urem i64 625, [[TMP4]]
 ; RV64-NEXT:    [[N_VEC:%.*]] = sub i64 625, [[N_MOD_VF]]
-; RV64-NEXT:    [[IND_END:%.*]] = mul i64 [[N_VEC]], 16
 ; RV64-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
 ; RV64-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 2
+; RV64-NEXT:    [[IND_END:%.*]] = mul i64 [[N_VEC]], 16
 ; RV64-NEXT:    [[TMP7:%.*]] = call <vscale x 2 x i64> @llvm.stepvector.nxv2i64()
 ; RV64-NEXT:    [[TMP9:%.*]] = mul <vscale x 2 x i64> [[TMP7]], splat (i64 16)
 ; RV64-NEXT:    [[INDUCTION:%.*]] = add <vscale x 2 x i64> zeroinitializer, [[TMP9]]
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/preserve-dbg-loc.ll b/llvm/test/Transforms/LoopVectorize/RISCV/preserve-dbg-loc.ll
new file mode 100644
index 0000000..93bd44f
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/preserve-dbg-loc.ll
@@ -0,0 +1,39 @@
+; RUN: opt -passes=debugify,loop-vectorize \
+; RUN: -force-tail-folding-style=data-with-evl \
+; RUN: -prefer-predicate-over-epilogue=predicate-dont-vectorize \
+; RUN: -mtriple=riscv64 -mattr=+v -riscv-v-vector-bits-max=128 -S < %s 2>&1 | FileCheck --check-prefix=DEBUGLOC %s
+
+; Testing the debug locations of the generated vector intrinsic is same as
+; its scalar counterpart.
+
+define void @vp_select(ptr %a, ptr %b, ptr %c, i64 %N) {
+; DEBUGLOC-LABEL: define void @vp_select(
+; DEBUGLOC: vector.body:
+; DEBUGLOC:   = call <vscale x 4 x i32> @llvm.vp.select.nxv4i32(<vscale x 4 x i1> %{{.+}}, <vscale x 4 x i32> %{{.+}}, <vscale x 4 x i32> %{{.+}}, i32 %{{.+}}), !dbg ![[SELLOC:[0-9]+]]
+; DEBUGLOC: loop:
+; DEBUGLOC:   = select i1 %{{.+}}, i32 %{{.+}}, i32 %{{.+}}, !dbg ![[SELLOC]]
+;
+ entry:
+   br label %loop
+
+loop:
+   %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+   %gep.b = getelementptr inbounds i32, ptr %b, i64 %iv
+   %load.b = load i32, ptr %gep.b, align 4
+   %gep.c = getelementptr inbounds i32, ptr %c, i64 %iv
+   %load.c = load i32, ptr %gep.c, align 4
+   %cmp = icmp sgt i32 %load.b, %load.c
+   %neg.c = sub i32 0, %load.c
+   %sel = select i1 %cmp, i32 %load.c, i32 %neg.c
+   %add = add i32 %sel, %load.b
+   %gep.a = getelementptr inbounds i32, ptr %a, i64 %iv
+   store i32 %add, ptr %gep.a, align 4
+   %iv.next = add nuw nsw i64 %iv, 1
+   %exitcond = icmp eq i64 %iv.next, %N
+   br i1 %exitcond, label %exit, label %loop
+
+ exit:
+   ret void
+ }
+
+ ; DEBUGLOC: [[SELLOC]] = !DILocation(line: 9
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll
index eb60c24..951d833 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll
@@ -64,6 +64,8 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur
 ; CHECK-NEXT:  Successor(s): vector.ph
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  vector.ph:
+; CHECK-NEXT:    vp<[[END1:%.+]]> = DERIVED-IV ir<%0> + vp<[[VEC_TC]]> * ir<-1>
+; CHECK-NEXT:    vp<[[END2:%.+]]> = DERIVED-IV ir<%n> + vp<[[VEC_TC]]> * ir<-1>
 ; CHECK-NEXT:  Successor(s): vector loop
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  <x1> vector loop: {
@@ -92,11 +94,13 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur
 ; CHECK-NEXT:  Successor(s): ir-bb<for.cond.cleanup.loopexit>, scalar.ph
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  scalar.ph:
+; CHECK-NEXT:    EMIT vp<[[RESUME1:%.+]]> = resume-phi vp<[[END1]]>, ir<%0>
+; CHECK-NEXT:    EMIT vp<[[RESUME2:%.+]]>.1 = resume-phi vp<[[END2]]>, ir<%n>
 ; CHECK-NEXT:  Successor(s): ir-bb<for.body>
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  ir-bb<for.body>:
-; CHECK-NEXT:    IR   %indvars.iv = phi i64 [ %0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
-; CHECK-NEXT:    IR   %i.0.in8 = phi i32 [ %n, %for.body.preheader ], [ %i.0, %for.body ]
+; CHECK-NEXT:    IR   %indvars.iv = phi i64 [ %0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] (extra operand: vp<[[RESUME1]]> from scalar.ph)
+; CHECK-NEXT:    IR   %i.0.in8 = phi i32 [ %n, %for.body.preheader ], [ %i.0, %for.body ] (extra operand: vp<[[RESUME2]]>.1 from scalar.ph)
 ; CHECK:         IR   %indvars.iv.next = add nsw i64 %indvars.iv, -1
 ; CHECK-NEXT:  No successors
 ; CHECK-EMPTY:
@@ -181,11 +185,10 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur
 ; CHECK-NEXT:    IR   %16 = mul i64 %15, 4
 ; CHECK-NEXT:    IR   %n.mod.vf = urem i64 %0, %16
 ; CHECK-NEXT:    IR   %n.vec = sub i64 %0, %n.mod.vf
-; CHECK-NEXT:    IR   %ind.end = sub i64 %0, %n.vec
-; CHECK-NEXT:    IR   %.cast = trunc i64 %n.vec to i32
-; CHECK-NEXT:    IR   %ind.end3 = sub i32 %n, %.cast
 ; CHECK-NEXT:    IR   %17 = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    IR   %18 = mul i64 %17, 4
+; CHECK-NEXT:    vp<[[END1:%.+]]> = DERIVED-IV ir<%0> + ir<[[VEC_TC]]> * ir<-1>
+; CHECK-NEXT:    vp<[[END2:%.+]]> = DERIVED-IV ir<%n> + ir<[[VEC_TC]]> * ir<-1>
 ; CHECK-NEXT:  Successor(s): vector loop
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  <x1> vector loop: {
@@ -217,8 +220,8 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur
 ; CHECK-NEXT:  No successors
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  ir-bb<scalar.ph>:
-; CHECK-NEXT:    EMIT vp<[[RESUME_1:%.+]]> = resume-phi ir<%ind.end>, ir<%0>
-; CHECK-NEXT:    EMIT vp<[[RESUME_2:%.+]]>.1 = resume-phi ir<%ind.end3>, ir<%n>
+; CHECK-NEXT:    EMIT vp<[[RESUME_1:%.+]]> = resume-phi vp<[[END1]]>, ir<%0>
+; CHECK-NEXT:    EMIT vp<[[RESUME_2:%.+]]>.1 = resume-phi vp<[[END2]]>, ir<%n>
 ; CHECK-NEXT:  Successor(s): ir-bb<for.body>
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  ir-bb<for.body>:
@@ -311,6 +314,8 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur
 ; CHECK-NEXT:  Successor(s): vector.ph
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  vector.ph:
+; CHECK-NEXT:    vp<[[END1:%.+]]> = DERIVED-IV ir<%0> + vp<[[VEC_TC]]> * ir<-1>
+; CHECK-NEXT:    vp<[[END2:%.+]]> = DERIVED-IV ir<%n> + vp<[[VEC_TC]]> * ir<-1>
 ; CHECK-NEXT:  Successor(s): vector loop
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  <x1> vector loop: {
@@ -339,11 +344,13 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur
 ; CHECK-NEXT:  Successor(s): ir-bb<for.cond.cleanup.loopexit>, scalar.ph
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  scalar.ph:
+; CHECK-NEXT:    EMIT vp<[[RESUME1:%.+]]> = resume-phi vp<[[END1]]>, ir<%0>
+; CHECK-NEXT:    EMIT vp<[[RESUME2:%.+]]>.1 = resume-phi vp<[[END2]]>, ir<%n>
 ; CHECK-NEXT:  Successor(s): ir-bb<for.body>
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  ir-bb<for.body>:
-; CHECK-NEXT:    IR   %indvars.iv = phi i64 [ %0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
-; CHECK-NEXT:    IR   %i.0.in8 = phi i32 [ %n, %for.body.preheader ], [ %i.0, %for.body ]
+; CHECK-NEXT:    IR   %indvars.iv = phi i64 [ %0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] (extra operand: vp<[[RESUME1]]> from scalar.ph)
+; CHECK-NEXT:    IR   %i.0.in8 = phi i32 [ %n, %for.body.preheader ], [ %i.0, %for.body ] (extra operand: vp<[[RESUME2]]>.1 from scalar.ph)
 ; CHECK:         IR   %indvars.iv.next = add nsw i64 %indvars.iv, -1
 ; CHECK-NEXT:  No successors
 ; CHECK-EMPTY:
@@ -428,11 +435,10 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur
 ; CHECK-NEXT:    IR   %16 = mul i64 %15, 4
 ; CHECK-NEXT:    IR   %n.mod.vf = urem i64 %0, %16
 ; CHECK-NEXT:    IR   %n.vec = sub i64 %0, %n.mod.vf
-; CHECK-NEXT:    IR   %ind.end = sub i64 %0, %n.vec
-; CHECK-NEXT:    IR   %.cast = trunc i64 %n.vec to i32
-; CHECK-NEXT:    IR   %ind.end3 = sub i32 %n, %.cast
 ; CHECK-NEXT:    IR   %17 = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    IR   %18 = mul i64 %17, 4
+; CHECK-NEXT:    vp<[[END1:%.+]]> = DERIVED-IV ir<%0> + ir<[[VEC_TC]]> * ir<-1>
+; CHECK-NEXT:    vp<[[END2:%.+]]> = DERIVED-IV ir<%n> + ir<[[VEC_TC]]> * ir<-1>
 ; CHECK-NEXT:  Successor(s): vector loop
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  <x1> vector loop: {
@@ -464,8 +470,8 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur
 ; CHECK-NEXT:  No successors
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  ir-bb<scalar.ph>:
-; CHECK-NEXT:    EMIT vp<[[RESUME1:%.+]]> = resume-phi ir<%ind.end>, ir<%0>
-; CHECK-NEXT:    EMIT vp<[[RESUME2:%.+]]>.1 = resume-phi ir<%ind.end3>, ir<%n>
+; CHECK-NEXT:    EMIT vp<[[RESUME1:%.+]]> = resume-phi vp<[[END1]]>, ir<%0>
+; CHECK-NEXT:    EMIT vp<[[RESUME2:%.+]]>.1 = resume-phi vp<[[END2]]>, ir<%n>
 ; CHECK-NEXT:  Successor(s): ir-bb<for.body>
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  ir-bb<for.body>:
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/short-trip-count.ll b/llvm/test/Transforms/LoopVectorize/RISCV/short-trip-count.ll
index 375278e..3386a7d 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/short-trip-count.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/short-trip-count.ll
@@ -14,16 +14,14 @@ define void @small_trip_count_min_vlen_128(ptr nocapture %a) nounwind vscale_ran
 ; CHECK-NEXT:    [[TMP2:%.*]] = call i32 @llvm.vscale.i32()
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP3:%.*]] = add i32 [[INDEX]], 0
-; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 1 x i1> @llvm.get.active.lane.mask.nxv1i1.i32(i32 [[TMP3]], i32 4)
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i32 [[TMP3]]
-; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 0
-; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 1 x i32> @llvm.masked.load.nxv1i32.p0(ptr [[TMP5]], i32 4, <vscale x 1 x i1> [[ACTIVE_LANE_MASK]], <vscale x 1 x i32> poison)
+; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 1 x i1> @llvm.get.active.lane.mask.nxv1i1.i32(i32 0, i32 4)
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP4:%.*]], i32 0
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i32 0
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 1 x i32> @llvm.masked.load.nxv1i32.p0(ptr [[TMP7]], i32 4, <vscale x 1 x i1> [[ACTIVE_LANE_MASK]], <vscale x 1 x i32> poison)
 ; CHECK-NEXT:    [[TMP6:%.*]] = add nsw <vscale x 1 x i32> [[WIDE_MASKED_LOAD]], splat (i32 1)
-; CHECK-NEXT:    call void @llvm.masked.store.nxv1i32.p0(<vscale x 1 x i32> [[TMP6]], ptr [[TMP5]], i32 4, <vscale x 1 x i1> [[ACTIVE_LANE_MASK]])
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP2]]
-; CHECK-NEXT:    br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i32 0
+; CHECK-NEXT:    call void @llvm.masked.store.nxv1i32.p0(<vscale x 1 x i32> [[TMP6]], ptr [[TMP8]], i32 4, <vscale x 1 x i1> [[ACTIVE_LANE_MASK]])
+; CHECK-NEXT:    br label [[MIDDLE_BLOCK:%.*]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
@@ -31,13 +29,13 @@ define void @small_trip_count_min_vlen_128(ptr nocapture %a) nounwind vscale_ran
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
 ; CHECK-NEXT:    [[IV:%.*]] = phi i32 [ [[IV_NEXT:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[IV]]
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 [[IV]]
 ; CHECK-NEXT:    [[V:%.*]] = load i32, ptr [[GEP]], align 4
 ; CHECK-NEXT:    [[ADD:%.*]] = add nsw i32 [[V]], 1
 ; CHECK-NEXT:    store i32 [[ADD]], ptr [[GEP]], align 4
 ; CHECK-NEXT:    [[IV_NEXT]] = add i32 [[IV]], 1
 ; CHECK-NEXT:    [[COND:%.*]] = icmp eq i32 [[IV]], 3
-; CHECK-NEXT:    br i1 [[COND]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK-NEXT:    br i1 [[COND]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK:       exit:
 ; CHECK-NEXT:    ret void
 ;
@@ -73,16 +71,14 @@ define void @small_trip_count_min_vlen_32(ptr nocapture %a) nounwind vscale_rang
 ; CHECK-NEXT:    [[TMP4:%.*]] = mul i32 [[TMP3]], 4
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP5:%.*]] = add i32 [[INDEX]], 0
-; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 [[TMP5]], i32 4)
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i32 [[TMP5]]
-; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 0
-; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr [[TMP7]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> poison)
+; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 0, i32 4)
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP6:%.*]], i32 0
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 0
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr [[TMP9]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> poison)
 ; CHECK-NEXT:    [[TMP8:%.*]] = add nsw <vscale x 4 x i32> [[WIDE_MASKED_LOAD]], splat (i32 1)
-; CHECK-NEXT:    call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[TMP8]], ptr [[TMP7]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP4]]
-; CHECK-NEXT:    br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 0
+; CHECK-NEXT:    call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[TMP8]], ptr [[TMP10]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
+; CHECK-NEXT:    br label [[MIDDLE_BLOCK:%.*]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
@@ -90,13 +86,13 @@ define void @small_trip_count_min_vlen_32(ptr nocapture %a) nounwind vscale_rang
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
 ; CHECK-NEXT:    [[IV:%.*]] = phi i32 [ [[IV_NEXT:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[IV]]
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 [[IV]]
 ; CHECK-NEXT:    [[V:%.*]] = load i32, ptr [[GEP]], align 4
 ; CHECK-NEXT:    [[ADD:%.*]] = add nsw i32 [[V]], 1
 ; CHECK-NEXT:    store i32 [[ADD]], ptr [[GEP]], align 4
 ; CHECK-NEXT:    [[IV_NEXT]] = add i32 [[IV]], 1
 ; CHECK-NEXT:    [[COND:%.*]] = icmp eq i32 [[IV]], 3
-; CHECK-NEXT:    br i1 [[COND]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK-NEXT:    br i1 [[COND]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP3:![0-9]+]]
 ; CHECK:       exit:
 ; CHECK-NEXT:    ret void
 ;
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/strided-accesses.ll b/llvm/test/Transforms/LoopVectorize/RISCV/strided-accesses.ll
index 8395ffd..30cb33e 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/strided-accesses.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/strided-accesses.ll
@@ -86,9 +86,9 @@ define void @single_constant_stride_int_iv(ptr %p) {
 ; CHECK-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
-; CHECK-NEXT:    [[IND_END:%.*]] = mul i64 [[N_VEC]], 64
 ; CHECK-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; CHECK-NEXT:    [[IND_END:%.*]] = mul i64 [[N_VEC]], 64
 ; CHECK-NEXT:    [[TMP6:%.*]] = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
 ; CHECK-NEXT:    [[TMP8:%.*]] = mul <vscale x 4 x i64> [[TMP6]], splat (i64 64)
 ; CHECK-NEXT:    [[INDUCTION:%.*]] = add <vscale x 4 x i64> zeroinitializer, [[TMP8]]
@@ -162,10 +162,10 @@ define void @single_constant_stride_ptr_iv(ptr %p) {
 ; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
 ; CHECK-NEXT:    [[TMP5:%.*]] = select i1 [[TMP4]], i64 [[TMP3]], i64 [[N_MOD_VF]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[TMP5]]
-; CHECK-NEXT:    [[TMP6:%.*]] = mul i64 [[N_VEC]], 8
-; CHECK-NEXT:    [[IND_END:%.*]] = getelementptr i8, ptr [[P:%.*]], i64 [[TMP6]]
 ; CHECK-NEXT:    [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP8:%.*]] = mul i64 [[TMP7]], 4
+; CHECK-NEXT:    [[TMP18:%.*]] = mul i64 [[N_VEC]], 8
+; CHECK-NEXT:    [[IND_END:%.*]] = getelementptr i8, ptr [[P:%.*]], i64 [[TMP18]]
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[POINTER_PHI:%.*]] = phi ptr [ [[P]], [[VECTOR_PH]] ], [ [[PTR_IND:%.*]], [[VECTOR_BODY]] ]
@@ -328,7 +328,6 @@ define void @single_stride_int_iv(ptr %p, i64 %stride) {
 ; NOSTRIDED-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 4
 ; NOSTRIDED-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP4]]
 ; NOSTRIDED-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
-; NOSTRIDED-NEXT:    [[IND_END:%.*]] = mul i64 [[N_VEC]], [[STRIDE]]
 ; NOSTRIDED-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
 ; NOSTRIDED-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 4
 ; NOSTRIDED-NEXT:    br label [[VECTOR_BODY:%.*]]
@@ -348,7 +347,7 @@ define void @single_stride_int_iv(ptr %p, i64 %stride) {
 ; NOSTRIDED-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; NOSTRIDED:       scalar.ph:
 ; NOSTRIDED-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY:%.*]] ]
-; NOSTRIDED-NEXT:    [[BC_RESUME_VAL1:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY]] ]
+; NOSTRIDED-NEXT:    [[BC_RESUME_VAL1:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY]] ]
 ; NOSTRIDED-NEXT:    br label [[LOOP:%.*]]
 ; NOSTRIDED:       loop:
 ; NOSTRIDED-NEXT:    [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ]
@@ -606,7 +605,6 @@ define void @double_stride_int_iv(ptr %p, ptr %p2, i64 %stride) {
 ; NOSTRIDED-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 4
 ; NOSTRIDED-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP4]]
 ; NOSTRIDED-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
-; NOSTRIDED-NEXT:    [[IND_END:%.*]] = mul i64 [[N_VEC]], [[STRIDE]]
 ; NOSTRIDED-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
 ; NOSTRIDED-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 4
 ; NOSTRIDED-NEXT:    br label [[VECTOR_BODY:%.*]]
@@ -626,7 +624,7 @@ define void @double_stride_int_iv(ptr %p, ptr %p2, i64 %stride) {
 ; NOSTRIDED-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; NOSTRIDED:       scalar.ph:
 ; NOSTRIDED-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY:%.*]] ]
-; NOSTRIDED-NEXT:    [[BC_RESUME_VAL1:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY]] ]
+; NOSTRIDED-NEXT:    [[BC_RESUME_VAL1:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY]] ]
 ; NOSTRIDED-NEXT:    br label [[LOOP:%.*]]
 ; NOSTRIDED:       loop:
 ; NOSTRIDED-NEXT:    [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ]
@@ -731,12 +729,12 @@ define void @double_stride_ptr_iv(ptr %p, ptr %p2, i64 %stride) {
 ; STRIDED-NEXT:    [[TMP9:%.*]] = mul i64 [[TMP8]], 4
 ; STRIDED-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP9]]
 ; STRIDED-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
+; STRIDED-NEXT:    [[TMP12:%.*]] = call i64 @llvm.vscale.i64()
+; STRIDED-NEXT:    [[TMP13:%.*]] = mul i64 [[TMP12]], 4
 ; STRIDED-NEXT:    [[TMP10:%.*]] = mul i64 [[N_VEC]], [[STRIDE]]
 ; STRIDED-NEXT:    [[IND_END:%.*]] = getelementptr i8, ptr [[P]], i64 [[TMP10]]
 ; STRIDED-NEXT:    [[TMP11:%.*]] = mul i64 [[N_VEC]], [[STRIDE]]
 ; STRIDED-NEXT:    [[IND_END7:%.*]] = getelementptr i8, ptr [[P2]], i64 [[TMP11]]
-; STRIDED-NEXT:    [[TMP12:%.*]] = call i64 @llvm.vscale.i64()
-; STRIDED-NEXT:    [[TMP13:%.*]] = mul i64 [[TMP12]], 4
 ; STRIDED-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; STRIDED:       vector.body:
 ; STRIDED-NEXT:    [[POINTER_PHI:%.*]] = phi ptr [ [[P]], [[VECTOR_PH]] ], [ [[PTR_IND:%.*]], [[VECTOR_BODY]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/truncate-to-minimal-bitwidth-cost.ll b/llvm/test/Transforms/LoopVectorize/RISCV/truncate-to-minimal-bitwidth-cost.ll
index bfdcfbf..f38aa11 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/truncate-to-minimal-bitwidth-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/truncate-to-minimal-bitwidth-cost.ll
@@ -163,20 +163,14 @@ define void @truncate_to_i1_used_by_branch(i8 %x, ptr %dst) #0 {
 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i8> [[BROADCAST_SPLATINSERT]], <2 x i8> poison, <2 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP0:%.*]] = trunc <2 x i8> [[BROADCAST_SPLAT]] to <2 x i1>
 ; CHECK-NEXT:    [[TMP2:%.*]] = or <2 x i1> splat (i1 true), [[TMP0]]
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <2 x ptr> poison, ptr [[DST]], i64 0
-; CHECK-NEXT:    [[BROADCAST_SPLAT4:%.*]] = shufflevector <2 x ptr> [[BROADCAST_SPLATINSERT3]], <2 x ptr> poison, <2 x i32> zeroinitializer
 ; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; CHECK:       [[VECTOR_BODY]]:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <2 x i32> poison, i32 [[INDEX]], i64 0
-; CHECK-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <2 x i32> [[BROADCAST_SPLATINSERT1]], <2 x i32> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[VEC_IV:%.*]] = add <2 x i32> [[BROADCAST_SPLAT2]], <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i32> [[VEC_IV]], i32 0
-; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = call <2 x i1> @llvm.get.active.lane.mask.v2i1.i32(i32 [[TMP1]], i32 2)
+; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = call <2 x i1> @llvm.get.active.lane.mask.v2i1.i32(i32 0, i32 2)
 ; CHECK-NEXT:    [[TMP3:%.*]] = select <2 x i1> [[ACTIVE_LANE_MASK]], <2 x i1> [[TMP2]], <2 x i1> zeroinitializer
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <2 x ptr> poison, ptr [[DST]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT4:%.*]] = shufflevector <2 x ptr> [[BROADCAST_SPLATINSERT1]], <2 x ptr> poison, <2 x i32> zeroinitializer
 ; CHECK-NEXT:    call void @llvm.masked.scatter.v2i8.v2p0(<2 x i8> zeroinitializer, <2 x ptr> [[BROADCAST_SPLAT4]], i32 1, <2 x i1> [[TMP3]])
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
-; CHECK-NEXT:    br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK-NEXT:    br label %[[MIDDLE_BLOCK:.*]]
 ; CHECK:       [[MIDDLE_BLOCK]]:
 ; CHECK-NEXT:    br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
 ; CHECK:       [[SCALAR_PH]]:
@@ -194,7 +188,7 @@ define void @truncate_to_i1_used_by_branch(i8 %x, ptr %dst) #0 {
 ; CHECK-NEXT:    [[ADD]] = add i8 [[F_039]], 1
 ; CHECK-NEXT:    [[CONV:%.*]] = sext i8 [[F_039]] to i32
 ; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CONV]], 1
-; CHECK-NEXT:    br i1 [[CMP]], label %[[LOOP_HEADER]], label %[[EXIT]], !llvm.loop [[LOOP7:![0-9]+]]
+; CHECK-NEXT:    br i1 [[CMP]], label %[[LOOP_HEADER]], label %[[EXIT]], !llvm.loop [[LOOP6:![0-9]+]]
 ; CHECK:       [[EXIT]]:
 ; CHECK-NEXT:    ret void
 ;
@@ -293,7 +287,7 @@ define void @icmp_only_first_op_truncated(ptr noalias %dst, i32 %x, i64 %N, i64
 ; CHECK-NEXT:    call void @llvm.masked.scatter.nxv2f64.nxv2p0(<vscale x 2 x double> [[WIDE_MASKED_GATHER]], <vscale x 2 x ptr> [[BROADCAST_SPLAT6]], i32 8, <vscale x 2 x i1> [[TMP8]])
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]]
 ; CHECK-NEXT:    [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP11]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; CHECK-NEXT:    br i1 [[TMP11]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
 ; CHECK:       [[MIDDLE_BLOCK]]:
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
@@ -314,7 +308,7 @@ define void @icmp_only_first_op_truncated(ptr noalias %dst, i32 %x, i64 %N, i64
 ; CHECK:       [[LOOP_LATCH]]:
 ; CHECK-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1
 ; CHECK-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV]], [[V]]
-; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP9:![0-9]+]]
+; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP8:![0-9]+]]
 ; CHECK:       [[EXIT]]:
 ; CHECK-NEXT:    ret void
 ;
@@ -354,8 +348,7 @@ attributes #1 = { "target-features"="+64bit,+v" }
 ; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
 ; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]}
 ; CHECK: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]}
-; CHECK: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]}
-; CHECK: [[LOOP7]] = distinct !{[[LOOP7]], [[META2]], [[META1]]}
-; CHECK: [[LOOP8]] = distinct !{[[LOOP8]], [[META1]], [[META2]]}
-; CHECK: [[LOOP9]] = distinct !{[[LOOP9]], [[META2]], [[META1]]}
+; CHECK: [[LOOP6]] = distinct !{[[LOOP6]], [[META2]], [[META1]]}
+; CHECK: [[LOOP7]] = distinct !{[[LOOP7]], [[META1]], [[META2]]}
+; CHECK: [[LOOP8]] = distinct !{[[LOOP8]], [[META2]], [[META1]]}
 ;.
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-call-intrinsics.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-call-intrinsics.ll
index 11cf832..f07aaec 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-call-intrinsics.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-call-intrinsics.ll
@@ -989,6 +989,62 @@ exit:
   ret void
 }
 
+; There's no @llvm.vp.log10, so don't transform it.
+define void @log10(ptr %a, ptr %b, i64 %N) {
+; IF-EVL-LABEL: define void @log10(
+; IF-EVL-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
+; IF-EVL-NEXT:  [[ENTRY:.*]]:
+; IF-EVL-NEXT:    br label %[[LOOP:.*]]
+; IF-EVL:       [[LOOP]]:
+; IF-EVL-NEXT:    [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[LOOP]] ], [ 0, %[[ENTRY]] ]
+; IF-EVL-NEXT:    [[GEP:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[IV]]
+; IF-EVL-NEXT:    [[TMP0:%.*]] = load float, ptr [[GEP]], align 4
+; IF-EVL-NEXT:    [[COND:%.*]] = tail call float @llvm.log10.f32(float [[TMP0]])
+; IF-EVL-NEXT:    [[GEP9:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]]
+; IF-EVL-NEXT:    store float [[COND]], ptr [[GEP9]], align 4
+; IF-EVL-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; IF-EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; IF-EVL-NEXT:    br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[LOOP]]
+; IF-EVL:       [[EXIT]]:
+; IF-EVL-NEXT:    ret void
+;
+; NO-VP-LABEL: define void @log10(
+; NO-VP-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
+; NO-VP-NEXT:  [[ENTRY:.*]]:
+; NO-VP-NEXT:    br label %[[LOOP:.*]]
+; NO-VP:       [[LOOP]]:
+; NO-VP-NEXT:    [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[LOOP]] ], [ 0, %[[ENTRY]] ]
+; NO-VP-NEXT:    [[GEP:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[IV]]
+; NO-VP-NEXT:    [[TMP0:%.*]] = load float, ptr [[GEP]], align 4
+; NO-VP-NEXT:    [[COND:%.*]] = tail call float @llvm.log10.f32(float [[TMP0]])
+; NO-VP-NEXT:    [[GEP9:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]]
+; NO-VP-NEXT:    store float [[COND]], ptr [[GEP9]], align 4
+; NO-VP-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; NO-VP-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; NO-VP-NEXT:    br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[LOOP]]
+; NO-VP:       [[EXIT]]:
+; NO-VP-NEXT:    ret void
+;
+
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ %iv.next, %loop ], [ 0, %entry ]
+  %gep = getelementptr inbounds float, ptr %b, i64 %iv
+  %0 = load float, ptr %gep, align 4
+  %cond = tail call float @llvm.log10.f32(float %0)
+  %gep9 = getelementptr inbounds float, ptr %a, i64 %iv
+  store float %cond, ptr %gep9, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, %N
+  br i1 %exitcond.not, label %exit, label %loop
+
+exit:
+  ret void
+}
+
+
 declare i32 @llvm.smax.i32(i32, i32)
 declare i32 @llvm.smin.i32(i32, i32)
 declare i32 @llvm.umax.i32(i32, i32)
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reverse-load-store.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reverse-load-store.ll
index 209c251..f323231 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reverse-load-store.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reverse-load-store.ll
@@ -20,10 +20,10 @@ define void @reverse_load_store(i64 %startval, ptr noalias %ptr, ptr noalias %pt
 ; IF-EVL-NEXT:    [[N_RND_UP:%.*]] = add i64 1024, [[TMP2]]
 ; IF-EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
 ; IF-EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
-; IF-EVL-NEXT:    [[IND_END:%.*]] = sub i64 [[STARTVAL:%.*]], [[N_VEC]]
-; IF-EVL-NEXT:    [[IND_END1:%.*]] = trunc i64 [[N_VEC]] to i32
 ; IF-EVL-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
 ; IF-EVL-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 4
+; IF-EVL-NEXT:    [[IND_END:%.*]] = sub i64 [[STARTVAL:%.*]], [[N_VEC]]
+; IF-EVL-NEXT:    [[IND_END1:%.*]] = trunc i64 [[N_VEC]] to i32
 ; IF-EVL-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; IF-EVL:       vector.body:
 ; IF-EVL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -37,16 +37,16 @@ define void @reverse_load_store(i64 %startval, ptr noalias %ptr, ptr noalias %pt
 ; IF-EVL-NEXT:    [[TMP18:%.*]] = zext i32 [[TMP5]] to i64
 ; IF-EVL-NEXT:    [[TMP9:%.*]] = mul i64 0, [[TMP18]]
 ; IF-EVL-NEXT:    [[TMP10:%.*]] = sub i64 1, [[TMP18]]
-; IF-EVL-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i64 [[TMP9]]
-; IF-EVL-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP11]], i64 [[TMP10]]
+; IF-EVL-NEXT:    [[TMP16:%.*]] = getelementptr i32, ptr [[TMP8]], i64 [[TMP9]]
+; IF-EVL-NEXT:    [[TMP12:%.*]] = getelementptr i32, ptr [[TMP16]], i64 [[TMP10]]
 ; IF-EVL-NEXT:    [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP12]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP5]])
 ; IF-EVL-NEXT:    [[VP_REVERSE:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[VP_OP_LOAD]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP5]])
 ; IF-EVL-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[PTR2:%.*]], i64 [[TMP7]]
 ; IF-EVL-NEXT:    [[TMP19:%.*]] = zext i32 [[TMP5]] to i64
 ; IF-EVL-NEXT:    [[TMP14:%.*]] = mul i64 0, [[TMP19]]
 ; IF-EVL-NEXT:    [[TMP15:%.*]] = sub i64 1, [[TMP19]]
-; IF-EVL-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i64 [[TMP14]]
-; IF-EVL-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[TMP16]], i64 [[TMP15]]
+; IF-EVL-NEXT:    [[TMP22:%.*]] = getelementptr i32, ptr [[TMP13]], i64 [[TMP14]]
+; IF-EVL-NEXT:    [[TMP17:%.*]] = getelementptr i32, ptr [[TMP22]], i64 [[TMP15]]
 ; IF-EVL-NEXT:    [[VP_REVERSE3:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[VP_REVERSE]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP5]])
 ; IF-EVL-NEXT:    call void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32> [[VP_REVERSE3]], ptr align 4 [[TMP17]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP5]])
 ; IF-EVL-NEXT:    [[TMP20:%.*]] = zext i32 [[TMP5]] to i64
@@ -121,10 +121,10 @@ define void @reverse_load_store_masked(i64 %startval, ptr noalias %ptr, ptr noal
 ; IF-EVL-NEXT:    [[N_RND_UP:%.*]] = add i64 1024, [[TMP2]]
 ; IF-EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
 ; IF-EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
-; IF-EVL-NEXT:    [[IND_END:%.*]] = sub i64 [[STARTVAL:%.*]], [[N_VEC]]
-; IF-EVL-NEXT:    [[IND_END1:%.*]] = trunc i64 [[N_VEC]] to i32
 ; IF-EVL-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
 ; IF-EVL-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 4
+; IF-EVL-NEXT:    [[IND_END:%.*]] = sub i64 [[STARTVAL:%.*]], [[N_VEC]]
+; IF-EVL-NEXT:    [[IND_END1:%.*]] = trunc i64 [[N_VEC]] to i32
 ; IF-EVL-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; IF-EVL:       vector.body:
 ; IF-EVL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-uniform-store.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-uniform-store.ll
index 7efa65e..a2f85b9 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-uniform-store.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-uniform-store.ll
@@ -25,9 +25,9 @@ define void @lshift_significand(i32 %n, ptr nocapture writeonly %dst) {
 ; CHECK-NEXT:    [[N_RND_UP:%.*]] = add i64 [[TMP0]], [[TMP7]]
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP6]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
-; CHECK-NEXT:    [[IND_END:%.*]] = add i64 [[SPEC_SELECT]], [[N_VEC]]
 ; CHECK-NEXT:    [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP9:%.*]] = mul i64 [[TMP8]], 2
+; CHECK-NEXT:    [[IND_END:%.*]] = add i64 [[SPEC_SELECT]], [[N_VEC]]
 ; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; CHECK:       [[VECTOR_BODY]]:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-reduction.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-reduction.ll
index e7eb577..cd1d734 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-reduction.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-reduction.ll
@@ -63,11 +63,12 @@ define i32 @reduction(ptr %a, i64 %n, i32 %start) {
 ; IF-EVL-OUTLOOP-NEXT: Successor(s): ir-bb<for.end>, scalar.ph
 ; IF-EVL-OUTLOOP-EMPTY:
 ; IF-EVL-OUTLOOP-NEXT: scalar.ph:
+; IF-EVL-OUTLOOP-NEXT:   EMIT vp<[[IV_RESUME:%.+]]> = resume-phi vp<[[VTC]]>, ir<0>
 ; IF-EVL-OUTLOOP-NEXT:   EMIT vp<[[RED_RESUME:%.+]]> = resume-phi vp<[[RDX]]>, ir<%start>
 ; IF-EVL-OUTLOOP-NEXT: Successor(s): ir-bb<for.body>
 ; IF-EVL-OUTLOOP-EMPTY:
 ; IF-EVL-OUTLOOP-NEXT: ir-bb<for.body>:
-; IF-EVL-OUTLOOP-NEXT:   IR   %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+; IF-EVL-OUTLOOP-NEXT:   IR   %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] (extra operand: vp<[[IV_RESUME]]> from scalar.ph)
 ; IF-EVL-OUTLOOP-NEXT:   IR   %rdx = phi i32 [ %start, %entry ], [ %add, %for.body ]
 ; IF-EVL-OUTLOOP:        IR   %exitcond.not = icmp eq i64 %iv.next, %n
 ; IF-EVL-OUTLOOP-NEXT: No successors
@@ -113,11 +114,12 @@ define i32 @reduction(ptr %a, i64 %n, i32 %start) {
 ; IF-EVL-INLOOP-NEXT: Successor(s): ir-bb<for.end>, scalar.ph
 ; IF-EVL-INLOOP-EMPTY:
 ; IF-EVL-INLOOP-NEXT: scalar.ph:
+; IF-EVL-INLOOP-NEXT:   EMIT vp<[[IV_RESUME:%.+]]> = resume-phi vp<[[VTC]]>, ir<0>
 ; IF-EVL-INLOOP-NEXT:   EMIT vp<[[RED_RESUME:%.+]]> = resume-phi vp<[[RDX]]>, ir<%start>
 ; IF-EVL-INLOOP-NEXT: Successor(s): ir-bb<for.body>
 ; IF-EVL-INLOOP-EMPTY:
 ; IF-EVL-INLOOP-NEXT: ir-bb<for.body>:
-; IF-EVL-INLOOP-NEXT:   IR   %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+; IF-EVL-INLOOP-NEXT:   IR   %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] (extra operand: vp<[[IV_RESUME]]> from scalar.ph)
 ; IF-EVL-INLOOP-NEXT:   IR   %rdx = phi i32 [ %start, %entry ], [ %add, %for.body ]
 ; IF-EVL-INLOOP:        IR   %exitcond.not = icmp eq i64 %iv.next, %n
 ; IF-EVL-INLOOP-NEXT: No successors
@@ -159,11 +161,12 @@ define i32 @reduction(ptr %a, i64 %n, i32 %start) {
 ; NO-VP-OUTLOOP-NEXT: Successor(s): ir-bb<for.end>, scalar.ph
 ; NO-VP-OUTLOOP-EMPTY:
 ; NO-VP-OUTLOOP-NEXT: scalar.ph:
+; NO-VP-OUTLOOP-NEXT:   EMIT vp<[[IV_RESUME:%.+]]> = resume-phi vp<[[VTC]]>, ir<0>
 ; NO-VP-OUTLOOP-NEXT:   EMIT vp<[[RED_RESUME:%.+]]> = resume-phi vp<[[RDX]]>, ir<%start>
 ; NO-VP-OUTLOOP-NEXT: Successor(s): ir-bb<for.body>
 ; NO-VP-OUTLOOP-EMPTY:
 ; NO-VP-OUTLOOP-NEXT: ir-bb<for.body>:
-; NO-VP-OUTLOOP-NEXT:   IR   %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+; NO-VP-OUTLOOP-NEXT:   IR   %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] (extra operand: vp<[[IV_RESUME]]> from scalar.ph)
 ; NO-VP-OUTLOOP-NEXT:   IR   %rdx = phi i32 [ %start, %entry ], [ %add, %for.body ]
 ; NO-VP-OUTLOOP:        IR   %exitcond.not = icmp eq i64 %iv.next, %n
 ; NO-VP-OUTLOOP-NEXT: No successors
@@ -205,11 +208,12 @@ define i32 @reduction(ptr %a, i64 %n, i32 %start) {
 ; NO-VP-INLOOP-NEXT: Successor(s): ir-bb<for.end>, scalar.ph
 ; NO-VP-INLOOP-EMPTY:
 ; NO-VP-INLOOP-NEXT: scalar.ph:
+; NO-VP-INLOOP-NEXT:   EMIT vp<[[IV_RESUME:%.+]]> = resume-phi vp<[[VTC]]>, ir<0>
 ; NO-VP-INLOOP-NEXT:   EMIT vp<[[RED_RESUME:%.+]]> = resume-phi vp<[[RDX]]>, ir<%start>
 ; NO-VP-INLOOP-NEXT: Successor(s): ir-bb<for.body>
 ; NO-VP-INLOOP-EMPTY:
 ; NO-VP-INLOOP-NEXT: ir-bb<for.body>:
-; NO-VP-INLOOP-NEXT:   IR   %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+; NO-VP-INLOOP-NEXT:   IR   %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] (extra operand: vp<[[IV_RESUME]]> from scalar.ph)
 ; NO-VP-INLOOP-NEXT:   IR   %rdx = phi i32 [ %start, %entry ], [ %add, %for.body ]
 ; NO-VP-INLOOP:        IR   %exitcond.not = icmp eq i64 %iv.next, %n
 ; NO-VP-INLOOP-NEXT: No successors
diff --git a/llvm/test/Transforms/LoopVectorize/SystemZ/pr47665.ll b/llvm/test/Transforms/LoopVectorize/SystemZ/pr47665.ll
index 98245fc..2de0f7e 100644
--- a/llvm/test/Transforms/LoopVectorize/SystemZ/pr47665.ll
+++ b/llvm/test/Transforms/LoopVectorize/SystemZ/pr47665.ll
@@ -19,125 +19,103 @@ define void @test(ptr %p, i40 %a) {
 ; CHECK-NEXT:    [[TMP8:%.*]] = icmp sgt <16 x i1> [[TMP7]], zeroinitializer
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE32:%.*]] ]
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <16 x i32> poison, i32 [[INDEX]], i64 0
-; CHECK-NEXT:    [[BROADCAST_SPLAT3:%.*]] = shufflevector <16 x i32> [[BROADCAST_SPLATINSERT2]], <16 x i32> poison, <16 x i32> zeroinitializer
-; CHECK-NEXT:    [[VEC_IV:%.*]] = add <16 x i32> [[BROADCAST_SPLAT3]], <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; CHECK-NEXT:    [[TMP0:%.*]] = icmp ule <16 x i32> [[VEC_IV]], splat (i32 9)
-; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <16 x i1> [[TMP0]], i32 0
-; CHECK-NEXT:    br i1 [[TMP9]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
+; CHECK-NEXT:    br i1 true, label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
 ; CHECK:       pred.store.if:
 ; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <16 x i1> [[TMP8]], i32 0
 ; CHECK-NEXT:    store i1 [[TMP10]], ptr [[P]], align 1
 ; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE]]
 ; CHECK:       pred.store.continue:
-; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <16 x i1> [[TMP0]], i32 1
-; CHECK-NEXT:    br i1 [[TMP11]], label [[PRED_STORE_IF3:%.*]], label [[PRED_STORE_CONTINUE4:%.*]]
+; CHECK-NEXT:    br i1 true, label [[PRED_STORE_IF1:%.*]], label [[PRED_STORE_CONTINUE2:%.*]]
+; CHECK:       pred.store.if1:
+; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <16 x i1> [[TMP8]], i32 1
+; CHECK-NEXT:    store i1 [[TMP9]], ptr [[P]], align 1
+; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE2]]
+; CHECK:       pred.store.continue2:
+; CHECK-NEXT:    br i1 true, label [[PRED_STORE_IF3:%.*]], label [[PRED_STORE_CONTINUE4:%.*]]
 ; CHECK:       pred.store.if3:
-; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <16 x i1> [[TMP8]], i32 0
+; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <16 x i1> [[TMP8]], i32 2
 ; CHECK-NEXT:    store i1 [[TMP12]], ptr [[P]], align 1
 ; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE4]]
 ; CHECK:       pred.store.continue4:
-; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <16 x i1> [[TMP0]], i32 2
-; CHECK-NEXT:    br i1 [[TMP13]], label [[PRED_STORE_IF5:%.*]], label [[PRED_STORE_CONTINUE6:%.*]]
+; CHECK-NEXT:    br i1 true, label [[PRED_STORE_IF5:%.*]], label [[PRED_STORE_CONTINUE6:%.*]]
 ; CHECK:       pred.store.if5:
-; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <16 x i1> [[TMP8]], i32 0
+; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <16 x i1> [[TMP8]], i32 3
 ; CHECK-NEXT:    store i1 [[TMP14]], ptr [[P]], align 1
 ; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE6]]
 ; CHECK:       pred.store.continue6:
-; CHECK-NEXT:    [[TMP15:%.*]] = extractelement <16 x i1> [[TMP0]], i32 3
-; CHECK-NEXT:    br i1 [[TMP15]], label [[PRED_STORE_IF7:%.*]], label [[PRED_STORE_CONTINUE8:%.*]]
+; CHECK-NEXT:    br i1 true, label [[PRED_STORE_IF7:%.*]], label [[PRED_STORE_CONTINUE8:%.*]]
 ; CHECK:       pred.store.if7:
-; CHECK-NEXT:    [[TMP16:%.*]] = extractelement <16 x i1> [[TMP8]], i32 0
+; CHECK-NEXT:    [[TMP16:%.*]] = extractelement <16 x i1> [[TMP8]], i32 4
 ; CHECK-NEXT:    store i1 [[TMP16]], ptr [[P]], align 1
 ; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE8]]
 ; CHECK:       pred.store.continue8:
-; CHECK-NEXT:    [[TMP17:%.*]] = extractelement <16 x i1> [[TMP0]], i32 4
-; CHECK-NEXT:    br i1 [[TMP17]], label [[PRED_STORE_IF9:%.*]], label [[PRED_STORE_CONTINUE10:%.*]]
+; CHECK-NEXT:    br i1 true, label [[PRED_STORE_IF9:%.*]], label [[PRED_STORE_CONTINUE10:%.*]]
 ; CHECK:       pred.store.if9:
-; CHECK-NEXT:    [[TMP18:%.*]] = extractelement <16 x i1> [[TMP8]], i32 0
+; CHECK-NEXT:    [[TMP18:%.*]] = extractelement <16 x i1> [[TMP8]], i32 5
 ; CHECK-NEXT:    store i1 [[TMP18]], ptr [[P]], align 1
 ; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE10]]
 ; CHECK:       pred.store.continue10:
-; CHECK-NEXT:    [[TMP19:%.*]] = extractelement <16 x i1> [[TMP0]], i32 5
-; CHECK-NEXT:    br i1 [[TMP19]], label [[PRED_STORE_IF11:%.*]], label [[PRED_STORE_CONTINUE12:%.*]]
+; CHECK-NEXT:    br i1 true, label [[PRED_STORE_IF11:%.*]], label [[PRED_STORE_CONTINUE12:%.*]]
 ; CHECK:       pred.store.if11:
-; CHECK-NEXT:    [[TMP20:%.*]] = extractelement <16 x i1> [[TMP8]], i32 0
+; CHECK-NEXT:    [[TMP20:%.*]] = extractelement <16 x i1> [[TMP8]], i32 6
 ; CHECK-NEXT:    store i1 [[TMP20]], ptr [[P]], align 1
 ; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE12]]
 ; CHECK:       pred.store.continue12:
-; CHECK-NEXT:    [[TMP21:%.*]] = extractelement <16 x i1> [[TMP0]], i32 6
-; CHECK-NEXT:    br i1 [[TMP21]], label [[PRED_STORE_IF13:%.*]], label [[PRED_STORE_CONTINUE14:%.*]]
+; CHECK-NEXT:    br i1 true, label [[PRED_STORE_IF13:%.*]], label [[PRED_STORE_CONTINUE14:%.*]]
 ; CHECK:       pred.store.if13:
-; CHECK-NEXT:    [[TMP22:%.*]] = extractelement <16 x i1> [[TMP8]], i32 0
+; CHECK-NEXT:    [[TMP22:%.*]] = extractelement <16 x i1> [[TMP8]], i32 7
 ; CHECK-NEXT:    store i1 [[TMP22]], ptr [[P]], align 1
 ; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE14]]
 ; CHECK:       pred.store.continue14:
-; CHECK-NEXT:    [[TMP23:%.*]] = extractelement <16 x i1> [[TMP0]], i32 7
-; CHECK-NEXT:    br i1 [[TMP23]], label [[PRED_STORE_IF15:%.*]], label [[PRED_STORE_CONTINUE16:%.*]]
+; CHECK-NEXT:    br i1 true, label [[PRED_STORE_IF15:%.*]], label [[PRED_STORE_CONTINUE16:%.*]]
 ; CHECK:       pred.store.if15:
-; CHECK-NEXT:    [[TMP24:%.*]] = extractelement <16 x i1> [[TMP8]], i32 0
+; CHECK-NEXT:    [[TMP24:%.*]] = extractelement <16 x i1> [[TMP8]], i32 8
 ; CHECK-NEXT:    store i1 [[TMP24]], ptr [[P]], align 1
 ; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE16]]
 ; CHECK:       pred.store.continue16:
-; CHECK-NEXT:    [[TMP25:%.*]] = extractelement <16 x i1> [[TMP0]], i32 8
-; CHECK-NEXT:    br i1 [[TMP25]], label [[PRED_STORE_IF17:%.*]], label [[PRED_STORE_CONTINUE18:%.*]]
+; CHECK-NEXT:    br i1 true, label [[PRED_STORE_IF17:%.*]], label [[PRED_STORE_CONTINUE18:%.*]]
 ; CHECK:       pred.store.if17:
-; CHECK-NEXT:    [[TMP26:%.*]] = extractelement <16 x i1> [[TMP8]], i32 0
+; CHECK-NEXT:    [[TMP26:%.*]] = extractelement <16 x i1> [[TMP8]], i32 9
 ; CHECK-NEXT:    store i1 [[TMP26]], ptr [[P]], align 1
 ; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE18]]
 ; CHECK:       pred.store.continue18:
-; CHECK-NEXT:    [[TMP27:%.*]] = extractelement <16 x i1> [[TMP0]], i32 9
-; CHECK-NEXT:    br i1 [[TMP27]], label [[PRED_STORE_IF19:%.*]], label [[PRED_STORE_CONTINUE20:%.*]]
+; CHECK-NEXT:    br i1 false, label [[PRED_STORE_IF19:%.*]], label [[PRED_STORE_CONTINUE20:%.*]]
 ; CHECK:       pred.store.if19:
-; CHECK-NEXT:    [[TMP28:%.*]] = extractelement <16 x i1> [[TMP8]], i32 0
+; CHECK-NEXT:    [[TMP28:%.*]] = extractelement <16 x i1> [[TMP8]], i32 10
 ; CHECK-NEXT:    store i1 [[TMP28]], ptr [[P]], align 1
 ; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE20]]
 ; CHECK:       pred.store.continue20:
-; CHECK-NEXT:    [[TMP29:%.*]] = extractelement <16 x i1> [[TMP0]], i32 10
-; CHECK-NEXT:    br i1 [[TMP29]], label [[PRED_STORE_IF21:%.*]], label [[PRED_STORE_CONTINUE22:%.*]]
+; CHECK-NEXT:    br i1 false, label [[PRED_STORE_IF21:%.*]], label [[PRED_STORE_CONTINUE22:%.*]]
 ; CHECK:       pred.store.if21:
-; CHECK-NEXT:    [[TMP30:%.*]] = extractelement <16 x i1> [[TMP8]], i32 0
+; CHECK-NEXT:    [[TMP30:%.*]] = extractelement <16 x i1> [[TMP8]], i32 11
 ; CHECK-NEXT:    store i1 [[TMP30]], ptr [[P]], align 1
 ; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE22]]
 ; CHECK:       pred.store.continue22:
-; CHECK-NEXT:    [[TMP31:%.*]] = extractelement <16 x i1> [[TMP0]], i32 11
-; CHECK-NEXT:    br i1 [[TMP31]], label [[PRED_STORE_IF23:%.*]], label [[PRED_STORE_CONTINUE24:%.*]]
+; CHECK-NEXT:    br i1 false, label [[PRED_STORE_IF23:%.*]], label [[PRED_STORE_CONTINUE24:%.*]]
 ; CHECK:       pred.store.if23:
-; CHECK-NEXT:    [[TMP32:%.*]] = extractelement <16 x i1> [[TMP8]], i32 0
+; CHECK-NEXT:    [[TMP32:%.*]] = extractelement <16 x i1> [[TMP8]], i32 12
 ; CHECK-NEXT:    store i1 [[TMP32]], ptr [[P]], align 1
 ; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE24]]
 ; CHECK:       pred.store.continue24:
-; CHECK-NEXT:    [[TMP33:%.*]] = extractelement <16 x i1> [[TMP0]], i32 12
-; CHECK-NEXT:    br i1 [[TMP33]], label [[PRED_STORE_IF25:%.*]], label [[PRED_STORE_CONTINUE26:%.*]]
+; CHECK-NEXT:    br i1 false, label [[PRED_STORE_IF25:%.*]], label [[PRED_STORE_CONTINUE26:%.*]]
 ; CHECK:       pred.store.if25:
-; CHECK-NEXT:    [[TMP34:%.*]] = extractelement <16 x i1> [[TMP8]], i32 0
+; CHECK-NEXT:    [[TMP34:%.*]] = extractelement <16 x i1> [[TMP8]], i32 13
 ; CHECK-NEXT:    store i1 [[TMP34]], ptr [[P]], align 1
 ; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE26]]
 ; CHECK:       pred.store.continue26:
-; CHECK-NEXT:    [[TMP35:%.*]] = extractelement <16 x i1> [[TMP0]], i32 13
-; CHECK-NEXT:    br i1 [[TMP35]], label [[PRED_STORE_IF27:%.*]], label [[PRED_STORE_CONTINUE28:%.*]]
+; CHECK-NEXT:    br i1 false, label [[PRED_STORE_IF27:%.*]], label [[PRED_STORE_CONTINUE28:%.*]]
 ; CHECK:       pred.store.if27:
-; CHECK-NEXT:    [[TMP36:%.*]] = extractelement <16 x i1> [[TMP8]], i32 0
+; CHECK-NEXT:    [[TMP36:%.*]] = extractelement <16 x i1> [[TMP8]], i32 14
 ; CHECK-NEXT:    store i1 [[TMP36]], ptr [[P]], align 1
 ; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE28]]
 ; CHECK:       pred.store.continue28:
-; CHECK-NEXT:    [[TMP37:%.*]] = extractelement <16 x i1> [[TMP0]], i32 14
-; CHECK-NEXT:    br i1 [[TMP37]], label [[PRED_STORE_IF29:%.*]], label [[PRED_STORE_CONTINUE30:%.*]]
+; CHECK-NEXT:    br i1 false, label [[PRED_STORE_IF29:%.*]], label [[PRED_STORE_CONTINUE30:%.*]]
 ; CHECK:       pred.store.if29:
-; CHECK-NEXT:    [[TMP38:%.*]] = extractelement <16 x i1> [[TMP8]], i32 0
+; CHECK-NEXT:    [[TMP38:%.*]] = extractelement <16 x i1> [[TMP8]], i32 15
 ; CHECK-NEXT:    store i1 [[TMP38]], ptr [[P]], align 1
 ; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE30]]
 ; CHECK:       pred.store.continue30:
-; CHECK-NEXT:    [[TMP39:%.*]] = extractelement <16 x i1> [[TMP0]], i32 15
-; CHECK-NEXT:    br i1 [[TMP39]], label [[PRED_STORE_IF31:%.*]], label [[PRED_STORE_CONTINUE32]]
-; CHECK:       pred.store.if31:
-; CHECK-NEXT:    [[TMP40:%.*]] = extractelement <16 x i1> [[TMP8]], i32 0
-; CHECK-NEXT:    store i1 [[TMP40]], ptr [[P]], align 1
-; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE32]]
-; CHECK:       pred.store.continue32:
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 16
-; CHECK-NEXT:    br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT:    br label [[MIDDLE_BLOCK:%.*]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
@@ -156,7 +134,7 @@ define void @test(ptr %p, i40 %a) {
 ; CHECK-NEXT:    store i1 [[ICMP_SGT]], ptr [[P]], align 1
 ; CHECK-NEXT:    [[IV_NEXT]] = add i32 [[IV]], 1
 ; CHECK-NEXT:    [[COND:%.*]] = icmp ult i32 [[IV_NEXT]], 10
-; CHECK-NEXT:    br i1 [[COND]], label [[FOR_BODY]], label [[EXIT]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK-NEXT:    br i1 [[COND]], label [[FOR_BODY]], label [[EXIT]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK:       exit:
 ; CHECK-NEXT:    ret void
 ;
@@ -183,7 +161,6 @@ exit:                                             ; preds = %for.body
 }
 ;.
 ; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
-; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
-; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
-; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
+; CHECK: [[META1]] = !{!"llvm.loop.unroll.runtime.disable"}
+; CHECK: [[META2]] = !{!"llvm.loop.isvectorized", i32 1}
 ;.
diff --git a/llvm/test/Transforms/LoopVectorize/SystemZ/predicated-first-order-recurrence.ll b/llvm/test/Transforms/LoopVectorize/SystemZ/predicated-first-order-recurrence.ll
index d0754f1..7b0fa64 100644
--- a/llvm/test/Transforms/LoopVectorize/SystemZ/predicated-first-order-recurrence.ll
+++ b/llvm/test/Transforms/LoopVectorize/SystemZ/predicated-first-order-recurrence.ll
@@ -66,8 +66,8 @@ define void @func_21() {
 ; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <2 x i32> [[TMP12]], i32 1
 ; CHECK-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 6, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; CHECK-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 6, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
 ; CHECK-NEXT:    [[SCALAR_RECUR:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[LV:%.*]], [[LOOP]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/SystemZ/scalar-steps-with-users-demanding-all-lanes-and-first-lane-only.ll b/llvm/test/Transforms/LoopVectorize/SystemZ/scalar-steps-with-users-demanding-all-lanes-and-first-lane-only.ll
index fcf1ba0..61bcbaa 100644
--- a/llvm/test/Transforms/LoopVectorize/SystemZ/scalar-steps-with-users-demanding-all-lanes-and-first-lane-only.ll
+++ b/llvm/test/Transforms/LoopVectorize/SystemZ/scalar-steps-with-users-demanding-all-lanes-and-first-lane-only.ll
@@ -16,30 +16,24 @@ define void @test_scalar_iv_steps_used_by_replicate_and_first_lane_only_vpinst(p
 ; CHECK:       [[VECTOR_PH]]:
 ; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; CHECK:       [[VECTOR_BODY]]:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE6:.*]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 1
-; CHECK-NEXT:    [[TMP2:%.*]] = add i64 [[INDEX]], 2
-; CHECK-NEXT:    [[TMP3:%.*]] = add i64 [[INDEX]], 3
-; CHECK-NEXT:    [[TMP4:%.*]] = mul nsw i64 [[TMP0]], 4
-; CHECK-NEXT:    [[TMP5:%.*]] = mul nsw i64 [[TMP1]], 4
-; CHECK-NEXT:    [[TMP6:%.*]] = mul nsw i64 [[TMP2]], 4
-; CHECK-NEXT:    [[TMP7:%.*]] = mul nsw i64 [[TMP3]], 4
-; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[SRC_1]], i64 [[TMP4]]
-; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[SRC_1]], i64 [[TMP5]]
-; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[SRC_1]], i64 [[TMP6]]
-; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[SRC_1]], i64 [[TMP7]]
+; CHECK-NEXT:    [[TMP3:%.*]] = mul nsw i64 0, 4
+; CHECK-NEXT:    [[TMP4:%.*]] = mul nsw i64 1, 4
+; CHECK-NEXT:    [[TMP2:%.*]] = mul nsw i64 2, 4
+; CHECK-NEXT:    [[TMP15:%.*]] = mul nsw i64 3, 4
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[SRC_1]], i64 [[TMP3]]
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[SRC_1]], i64 [[TMP4]]
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[SRC_1]], i64 [[TMP2]]
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[SRC_1]], i64 [[TMP15]]
 ; CHECK-NEXT:    [[TMP12:%.*]] = load i8, ptr [[TMP8]], align 1
-; CHECK-NEXT:    [[TMP13:%.*]] = load i8, ptr [[TMP9]], align 1
-; CHECK-NEXT:    [[TMP14:%.*]] = load i8, ptr [[TMP10]], align 1
-; CHECK-NEXT:    [[TMP15:%.*]] = load i8, ptr [[TMP11]], align 1
+; CHECK-NEXT:    [[TMP9:%.*]] = load i8, ptr [[TMP5]], align 1
+; CHECK-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP6]], align 1
+; CHECK-NEXT:    [[TMP11:%.*]] = load i8, ptr [[TMP7]], align 1
 ; CHECK-NEXT:    [[TMP16:%.*]] = insertelement <4 x i8> poison, i8 [[TMP12]], i32 0
-; CHECK-NEXT:    [[TMP17:%.*]] = insertelement <4 x i8> [[TMP16]], i8 [[TMP13]], i32 1
-; CHECK-NEXT:    [[TMP18:%.*]] = insertelement <4 x i8> [[TMP17]], i8 [[TMP14]], i32 2
-; CHECK-NEXT:    [[TMP19:%.*]] = insertelement <4 x i8> [[TMP18]], i8 [[TMP15]], i32 3
+; CHECK-NEXT:    [[TMP13:%.*]] = insertelement <4 x i8> [[TMP16]], i8 [[TMP9]], i32 1
+; CHECK-NEXT:    [[TMP14:%.*]] = insertelement <4 x i8> [[TMP13]], i8 [[TMP10]], i32 2
+; CHECK-NEXT:    [[TMP19:%.*]] = insertelement <4 x i8> [[TMP14]], i8 [[TMP11]], i32 3
 ; CHECK-NEXT:    [[TMP20:%.*]] = icmp eq <4 x i8> [[TMP19]], zeroinitializer
-; CHECK-NEXT:    [[TMP21:%.*]] = add i64 [[TMP0]], 4
-; CHECK-NEXT:    [[TMP22:%.*]] = getelementptr [8 x i32], ptr @src, i64 0, i64 [[TMP21]]
+; CHECK-NEXT:    [[TMP22:%.*]] = getelementptr [8 x i32], ptr @src, i64 0, i64 4
 ; CHECK-NEXT:    [[TMP23:%.*]] = getelementptr i32, ptr [[TMP22]], i32 0
 ; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP23]], align 4
 ; CHECK-NEXT:    [[TMP24:%.*]] = extractelement <4 x i1> [[TMP20]], i32 0
@@ -64,14 +58,13 @@ define void @test_scalar_iv_steps_used_by_replicate_and_first_lane_only_vpinst(p
 ; CHECK-NEXT:    br label %[[PRED_STORE_CONTINUE4]]
 ; CHECK:       [[PRED_STORE_CONTINUE4]]:
 ; CHECK-NEXT:    [[TMP30:%.*]] = extractelement <4 x i1> [[TMP20]], i32 3
-; CHECK-NEXT:    br i1 [[TMP30]], label %[[PRED_STORE_IF5:.*]], label %[[PRED_STORE_CONTINUE6]]
+; CHECK-NEXT:    br i1 [[TMP30]], label %[[PRED_STORE_IF5:.*]], label %[[PRED_STORE_CONTINUE6:.*]]
 ; CHECK:       [[PRED_STORE_IF5]]:
 ; CHECK-NEXT:    [[TMP31:%.*]] = extractelement <4 x i32> [[WIDE_LOAD]], i32 3
 ; CHECK-NEXT:    store i32 [[TMP31]], ptr [[DST]], align 4
 ; CHECK-NEXT:    br label %[[PRED_STORE_CONTINUE6]]
 ; CHECK:       [[PRED_STORE_CONTINUE6]]:
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT:    br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT:    br label %[[MIDDLE_BLOCK:.*]]
 ; CHECK:       [[MIDDLE_BLOCK]]:
 ; CHECK-NEXT:    br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
 ; CHECK:       [[SCALAR_PH]]:
@@ -93,7 +86,7 @@ define void @test_scalar_iv_steps_used_by_replicate_and_first_lane_only_vpinst(p
 ; CHECK:       [[LOOP_LATCH]]:
 ; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
 ; CHECK-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 4
-; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK:       [[EXIT]]:
 ; CHECK-NEXT:    ret void
 ;
@@ -125,7 +118,6 @@ exit:
 }
 ;.
 ; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
-; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
-; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
-; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
+; CHECK: [[META1]] = !{!"llvm.loop.unroll.runtime.disable"}
+; CHECK: [[META2]] = !{!"llvm.loop.isvectorized", i32 1}
 ;.
diff --git a/llvm/test/Transforms/LoopVectorize/X86/constant-fold.ll b/llvm/test/Transforms/LoopVectorize/X86/constant-fold.ll
index 61cae9c..83e2f848 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/constant-fold.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/constant-fold.ll
@@ -17,15 +17,11 @@ define void @f1() {
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = trunc i32 [[INDEX]] to i16
-; CHECK-NEXT:    [[TMP0:%.*]] = add i16 [[OFFSET_IDX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = sext i16 [[TMP0]] to i64
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr [2 x ptr], ptr @b, i16 0, i64 [[TMP1]]
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr ptr, ptr [[TMP2]], i32 0
-; CHECK-NEXT:    store <2 x ptr> <ptr @a, ptr @a>, ptr [[TMP3]], align 8
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
-; CHECK-NEXT:    br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT:    [[TMP0:%.*]] = sext i16 0 to i64
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr [2 x ptr], ptr @b, i16 0, i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr ptr, ptr [[TMP1]], i32 0
+; CHECK-NEXT:    store <2 x ptr> <ptr @a, ptr @a>, ptr [[TMP2]], align 8
+; CHECK-NEXT:    br label [[MIDDLE_BLOCK:%.*]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br i1 true, label [[BB3:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
@@ -40,7 +36,7 @@ define void @f1() {
 ; CHECK-NEXT:    store ptr [[_TMP2]], ptr [[_TMP7]], align 8
 ; CHECK-NEXT:    [[_TMP9]] = add nsw i16 [[C_1_0]], 1
 ; CHECK-NEXT:    [[_TMP11:%.*]] = icmp slt i16 [[_TMP9]], 2
-; CHECK-NEXT:    br i1 [[_TMP11]], label [[BB2]], label [[BB3]], !llvm.loop [[LOOP2:![0-9]+]]
+; CHECK-NEXT:    br i1 [[_TMP11]], label [[BB2]], label [[BB3]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK:       bb3:
 ; CHECK-NEXT:    ret void
 ;
diff --git a/llvm/test/Transforms/LoopVectorize/X86/conversion-cost.ll b/llvm/test/Transforms/LoopVectorize/X86/conversion-cost.ll
index f319036..15bdbea 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/conversion-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/conversion-cost.ll
@@ -42,8 +42,8 @@ define i32 @conversion_cost1(i32 %n, ptr nocapture %A, ptr nocapture %B) nounwin
 ; CHECK-NEXT:    [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 4
 ; CHECK-NEXT:    br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
 ; CHECK:       vec.epilog.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[VEC_EPILOG_ITER_CHECK]] ], [ 3, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
 ; CHECK-NEXT:    [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[VEC_EPILOG_ITER_CHECK]] ], [ 3, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
 ; CHECK-NEXT:    [[N_MOD_VF2:%.*]] = urem i64 [[TMP3]], 4
 ; CHECK-NEXT:    [[N_VEC3:%.*]] = sub i64 [[TMP3]], [[N_MOD_VF2]]
 ; CHECK-NEXT:    [[IND_END4:%.*]] = add i64 3, [[N_VEC3]]
diff --git a/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll b/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll
index 6a12be7..5c0aeb5 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll
@@ -495,8 +495,8 @@ define i1 @any_of_cost(ptr %start, ptr %end) #0 {
 ; CHECK-NEXT:    [[RDX_SELECT:%.*]] = select i1 [[TMP30]], i1 false, i1 false
 ; CHECK-NEXT:    br label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi ptr [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY:%.*]] ]
-; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i1 [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ], [ false, [[ENTRY]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i1 [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ], [ false, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi ptr [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY]] ]
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
 ; CHECK-NEXT:    [[ANY_OF:%.*]] = phi i1 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ANY_OF_NEXT:%.*]], [[LOOP]] ]
@@ -986,8 +986,8 @@ define void @reduction_store(ptr noalias %src, ptr %dst, i1 %x) #2 {
 ; CHECK-NEXT:    store i32 [[TMP10]], ptr [[DST:%.*]], align 4
 ; CHECK-NEXT:    br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ 24, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP10]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP10]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ 24, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
 ; CHECK-NEXT:    [[RED:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[RED_NEXT:%.*]], [[LOOP]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/X86/drop-inbounds-flags-for-reverse-vector-pointer.ll b/llvm/test/Transforms/LoopVectorize/X86/drop-inbounds-flags-for-reverse-vector-pointer.ll
new file mode 100644
index 0000000..3d23090
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/X86/drop-inbounds-flags-for-reverse-vector-pointer.ll
@@ -0,0 +1,103 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3
+; RUN: opt -passes=loop-vectorize -force-vector-width=4 -S %s | FileCheck %s
+
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+define i1 @fn(ptr %nno) #0 {
+; CHECK-LABEL: define i1 @fn(
+; CHECK-SAME: ptr [[NNO:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 10, i64 9, i64 8, i64 7>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP11:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = sub i64 10, [[INDEX]]
+; CHECK-NEXT:    [[TMP22:%.*]] = add i64 [[OFFSET_IDX]], 0
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[INDEX]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[VEC_IV:%.*]] = add <4 x i64> [[BROADCAST_SPLAT]], <i64 0, i64 1, i64 2, i64 3>
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp ule <4 x i64> [[VEC_IV]], splat (i64 10)
+; CHECK-NEXT:    [[TMP2:%.*]] = and <4 x i64> [[VEC_IND]], splat (i64 1)
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp eq <4 x i64> [[TMP2]], zeroinitializer
+; CHECK-NEXT:    [[TMP23:%.*]] = getelementptr inbounds nuw i32, ptr [[NNO]], i64 [[TMP22]]
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i32, ptr [[TMP23]], i32 0
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr i32, ptr [[TMP5]], i32 -3
+; CHECK-NEXT:    [[REVERSE:%.*]] = shufflevector <4 x i1> [[TMP1]], <4 x i1> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP6]], i32 4, <4 x i1> [[REVERSE]], <4 x i32> poison)
+; CHECK-NEXT:    [[REVERSE1:%.*]] = shufflevector <4 x i32> [[WIDE_MASKED_LOAD]], <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:    [[TMP7:%.*]] = shl <4 x i32> [[REVERSE1]], splat (i32 1)
+; CHECK-NEXT:    [[TMP8:%.*]] = urem <4 x i32> [[TMP7]], splat (i32 10)
+; CHECK-NEXT:    [[TMP9:%.*]] = xor <4 x i1> [[TMP3]], splat (i1 true)
+; CHECK-NEXT:    [[TMP10:%.*]] = select <4 x i1> [[TMP1]], <4 x i1> [[TMP9]], <4 x i1> zeroinitializer
+; CHECK-NEXT:    [[PREDPHI:%.*]] = select <4 x i1> [[TMP10]], <4 x i32> [[REVERSE1]], <4 x i32> [[TMP8]]
+; CHECK-NEXT:    [[TMP11]] = or <4 x i32> [[PREDPHI]], [[VEC_PHI]]
+; CHECK-NEXT:    [[TMP12:%.*]] = select <4 x i1> [[TMP1]], <4 x i32> [[TMP11]], <4 x i32> [[VEC_PHI]]
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 -4)
+; CHECK-NEXT:    [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], 12
+; CHECK-NEXT:    br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[TMP14:%.*]] = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> [[TMP12]])
+; CHECK-NEXT:    br i1 true, label [[FOR_END36:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ -2, [[MIDDLE_BLOCK]] ], [ 10, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP14]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; CHECK-NEXT:    br label [[FOR_BODY20:%.*]]
+; CHECK:       loop.header:
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC35:%.*]] ]
+; CHECK-NEXT:    [[SUM_01:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[SUM_1:%.*]], [[FOR_INC35]] ]
+; CHECK-NEXT:    [[REM4:%.*]] = and i64 [[INDVARS_IV]], 1
+; CHECK-NEXT:    [[CMP21:%.*]] = icmp eq i64 [[REM4]], 0
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds nuw i32, ptr [[NNO]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[TMP15:%.*]] = load i32, ptr [[GEP]], align 4
+; CHECK-NEXT:    br i1 [[CMP21]], label [[IF_THEN22:%.*]], label [[FOR_INC35]]
+; CHECK:       if.then:
+; CHECK-NEXT:    [[MUL:%.*]] = shl i32 [[TMP15]], 1
+; CHECK-NEXT:    [[REM27:%.*]] = urem i32 [[MUL]], 10
+; CHECK-NEXT:    br label [[FOR_INC35]]
+; CHECK:       loop.latch:
+; CHECK-NEXT:    [[REM27_PN:%.*]] = phi i32 [ [[REM27]], [[IF_THEN22]] ], [ [[TMP15]], [[FOR_BODY20]] ]
+; CHECK-NEXT:    [[SUM_1]] = or i32 [[REM27_PN]], [[SUM_01]]
+; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nsw i64 [[INDVARS_IV]], -1
+; CHECK-NEXT:    [[CMP19_NOT:%.*]] = icmp eq i64 [[INDVARS_IV]], 0
+; CHECK-NEXT:    br i1 [[CMP19_NOT]], label [[FOR_END36]], label [[FOR_BODY20]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK:       exit:
+; CHECK-NEXT:    [[SUM_1_LCSSA:%.*]] = phi i32 [ [[SUM_1]], [[FOR_INC35]] ], [ [[TMP14]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[CMP41:%.*]] = icmp eq i32 [[SUM_1_LCSSA]], 0
+; CHECK-NEXT:    ret i1 [[CMP41]]
+;
+entry:
+  br label %loop.header
+
+loop.header:                                       ; preds = %entry, %loop.latch
+  %iv = phi i64 [ 10, %entry ], [ %iv.next, %loop.latch ]
+  %sum.01 = phi i32 [ 0, %entry ], [ %sum.1, %loop.latch ]
+  %rem4 = and i64 %iv, 1
+  %cmp21 = icmp eq i64 %rem4, 0
+  %gep = getelementptr inbounds nuw i32, ptr %nno, i64 %iv
+  %0 = load i32, ptr %gep, align 4
+  br i1 %cmp21, label %if.then, label %loop.latch
+
+if.then:                                        ; preds = %loop.header
+  %mul = shl i32 %0, 1
+  %rem27 = urem i32 %mul, 10
+  br label %loop.latch
+
+loop.latch:                                        ; preds = %loop.header, %if.then
+  %rem27.pn = phi i32 [ %rem27, %if.then ], [ %0, %loop.header ]
+  %sum.1 = or i32 %rem27.pn, %sum.01
+  %iv.next = add nsw i64 %iv, -1
+  %cmp19.not = icmp eq i64 %iv, 0
+  br i1 %cmp19.not, label %exit, label %loop.header
+
+exit:                                        ; preds = %loop.latch
+  %sum.1.lcssa = phi i32 [ %sum.1, %loop.latch ]
+  %cmp41 = icmp eq i32 %sum.1.lcssa, 0
+  ret i1 %cmp41
+}
+
+attributes #0 = { "target-features"="+avx" }
diff --git a/llvm/test/Transforms/LoopVectorize/X86/epilog-vectorization-inductions.ll b/llvm/test/Transforms/LoopVectorize/X86/epilog-vectorization-inductions.ll
index 5fb7df2..c14ddca 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/epilog-vectorization-inductions.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/epilog-vectorization-inductions.ll
@@ -60,8 +60,8 @@ define void @test_pr59459(i64 %iv.start, ptr %arr) {
 ; CHECK-NEXT:    [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 4
 ; CHECK-NEXT:    br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
 ; CHECK:       vec.epilog.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[IV_START]], [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
 ; CHECK-NEXT:    [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[IV_START]], [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
 ; CHECK-NEXT:    [[N_MOD_VF3:%.*]] = urem i64 [[TMP3]], 4
 ; CHECK-NEXT:    [[N_VEC4:%.*]] = sub i64 [[TMP3]], [[N_MOD_VF3]]
 ; CHECK-NEXT:    [[IND_END5:%.*]] = add i64 [[IV_START]], [[N_VEC4]]
@@ -171,11 +171,11 @@ define void @test_induction_step_needs_expansion(ptr noalias %j, ptr %k, i64 %l,
 ; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i16, ptr [[K:%.*]], i64 [[TMP3]]
 ; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i16, ptr [[TMP8]], i32 0
 ; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i16, ptr [[TMP8]], i32 16
-; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i16, ptr [[TMP8]], i32 32
+; CHECK-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i16, ptr [[TMP8]], i32 32
 ; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i16, ptr [[TMP8]], i32 48
 ; CHECK-NEXT:    store <16 x i16> [[TMP4]], ptr [[TMP9]], align 2
 ; CHECK-NEXT:    store <16 x i16> [[TMP5]], ptr [[TMP10]], align 2
-; CHECK-NEXT:    store <16 x i16> [[TMP6]], ptr [[TMP11]], align 2
+; CHECK-NEXT:    store <16 x i16> [[TMP6]], ptr [[TMP21]], align 2
 ; CHECK-NEXT:    store <16 x i16> [[TMP7]], ptr [[TMP12]], align 2
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 64
 ; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <16 x i16> [[STEP_ADD_3]], [[TMP1]]
@@ -191,8 +191,8 @@ define void @test_induction_step_needs_expansion(ptr noalias %j, ptr %k, i64 %l,
 ; CHECK-NEXT:    [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 8
 ; CHECK-NEXT:    br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
 ; CHECK:       vec.epilog.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i16 [ [[IND_END]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
 ; CHECK-NEXT:    [[BC_RESUME_VAL1:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i16 [ [[IND_END]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
 ; CHECK-NEXT:    [[N_MOD_VF4:%.*]] = urem i64 [[L]], 8
 ; CHECK-NEXT:    [[N_VEC5:%.*]] = sub i64 [[L]], [[N_MOD_VF4]]
 ; CHECK-NEXT:    [[DOTCAST7:%.*]] = trunc i64 [[N_VEC5]] to i16
diff --git a/llvm/test/Transforms/LoopVectorize/X86/fixed-order-recurrence.ll b/llvm/test/Transforms/LoopVectorize/X86/fixed-order-recurrence.ll
index 0e511cf..6fc7080 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/fixed-order-recurrence.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/fixed-order-recurrence.ll
@@ -48,8 +48,8 @@ define void @firstorderrec(ptr nocapture noundef readonly %x, ptr noalias nocapt
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 1, [[FOR_BODY_PREHEADER]] ]
 ; CHECK-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i8 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ [[DOTPRE]], [[FOR_BODY_PREHEADER]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 1, [[FOR_BODY_PREHEADER]] ]
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.cond.cleanup.loopexit:
 ; CHECK-NEXT:    br label [[FOR_COND_CLEANUP]]
@@ -154,10 +154,10 @@ define void @thirdorderrec(ptr nocapture noundef readonly %x, ptr noalias nocapt
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 3, [[FOR_BODY_PREHEADER]] ]
 ; CHECK-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i8 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ [[DOTPRE45]], [[FOR_BODY_PREHEADER]] ]
 ; CHECK-NEXT:    [[SCALAR_RECUR_INIT8:%.*]] = phi i8 [ [[VECTOR_RECUR_EXTRACT6]], [[MIDDLE_BLOCK]] ], [ [[DOTPRE44]], [[FOR_BODY_PREHEADER]] ]
 ; CHECK-NEXT:    [[SCALAR_RECUR_INIT9:%.*]] = phi i8 [ [[VECTOR_RECUR_EXTRACT7]], [[MIDDLE_BLOCK]] ], [ [[DOTPRE]], [[FOR_BODY_PREHEADER]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 3, [[FOR_BODY_PREHEADER]] ]
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.cond.cleanup.loopexit:
 ; CHECK-NEXT:    br label [[FOR_COND_CLEANUP]]
diff --git a/llvm/test/Transforms/LoopVectorize/X86/float-induction-x86.ll b/llvm/test/Transforms/LoopVectorize/X86/float-induction-x86.ll
index ab0b454..fc6059d 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/float-induction-x86.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/float-induction-x86.ll
@@ -61,8 +61,8 @@ define void @fp_iv_loop1(ptr noalias nocapture %A, i32 %N) #0 {
 ; AUTO_VEC-NEXT:    [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp eq i64 [[N_VEC_REMAINING]], 0
 ; AUTO_VEC-NEXT:    br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[FOR_BODY]], label [[VEC_EPILOG_PH]]
 ; AUTO_VEC:       vec.epilog.ph:
-; AUTO_VEC-NEXT:    [[BC_RESUME_VAL:%.*]] = phi float [ [[IND_END]], [[VEC_EPILOG_ITER_CHECK]] ], [ 1.000000e+00, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
 ; AUTO_VEC-NEXT:    [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
+; AUTO_VEC-NEXT:    [[BC_RESUME_VAL:%.*]] = phi float [ [[IND_END]], [[VEC_EPILOG_ITER_CHECK]] ], [ 1.000000e+00, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
 ; AUTO_VEC-NEXT:    [[N_VEC3:%.*]] = and i64 [[ZEXT]], 2147483644
 ; AUTO_VEC-NEXT:    [[DOTCAST5:%.*]] = uitofp nneg i64 [[N_VEC3]] to float
 ; AUTO_VEC-NEXT:    [[TMP7:%.*]] = fmul fast float [[DOTCAST5]], 5.000000e-01
@@ -441,8 +441,8 @@ define void @fadd_reassoc_FMF(ptr nocapture %p, i32 %N) {
 ; AUTO_VEC-NEXT:    [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp eq i64 [[N_VEC_REMAINING]], 0
 ; AUTO_VEC-NEXT:    br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[FOR_BODY]], label [[VEC_EPILOG_PH]]
 ; AUTO_VEC:       vec.epilog.ph:
-; AUTO_VEC-NEXT:    [[BC_RESUME_VAL:%.*]] = phi float [ [[IND_END]], [[VEC_EPILOG_ITER_CHECK]] ], [ 1.000000e+00, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
 ; AUTO_VEC-NEXT:    [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
+; AUTO_VEC-NEXT:    [[BC_RESUME_VAL:%.*]] = phi float [ [[IND_END]], [[VEC_EPILOG_ITER_CHECK]] ], [ 1.000000e+00, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
 ; AUTO_VEC-NEXT:    [[N_VEC6:%.*]] = and i64 [[TMP0]], 4294967292
 ; AUTO_VEC-NEXT:    [[DOTCAST8:%.*]] = uitofp nneg i64 [[N_VEC6]] to float
 ; AUTO_VEC-NEXT:    [[TMP12:%.*]] = fmul reassoc float [[DOTCAST8]], 4.200000e+01
diff --git a/llvm/test/Transforms/LoopVectorize/X86/gather_scatter.ll b/llvm/test/Transforms/LoopVectorize/X86/gather_scatter.ll
index b6bccab..8c338d6 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/gather_scatter.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/gather_scatter.ll
@@ -665,16 +665,16 @@ define void @test_gather_not_profitable_pr48429(i32 %d, ptr readonly noalias %pt
 ; AVX512-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC]]
 ; AVX512-NEXT:    br i1 [[CMP_N]], label [[FOR_END]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
 ; AVX512:       vec.epilog.iter.check:
-; AVX512-NEXT:    [[TMP22:%.*]] = mul i64 [[N_VEC]], 64
-; AVX512-NEXT:    [[IND_END15:%.*]] = getelementptr i8, ptr [[DEST]], i64 [[TMP22]]
 ; AVX512-NEXT:    [[TMP23:%.*]] = mul i64 [[N_VEC]], 4
 ; AVX512-NEXT:    [[IND_END12:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[TMP23]]
+; AVX512-NEXT:    [[TMP38:%.*]] = mul i64 [[N_VEC]], 64
+; AVX512-NEXT:    [[IND_END15:%.*]] = getelementptr i8, ptr [[DEST]], i64 [[TMP38]]
 ; AVX512-NEXT:    [[N_VEC_REMAINING:%.*]] = sub i64 [[TMP3]], [[N_VEC]]
 ; AVX512-NEXT:    [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 8
 ; AVX512-NEXT:    br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
 ; AVX512:       vec.epilog.ph:
-; AVX512-NEXT:    [[BC_RESUME_VAL:%.*]] = phi ptr [ [[IND_END]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[DEST]], [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
 ; AVX512-NEXT:    [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
+; AVX512-NEXT:    [[BC_RESUME_VAL:%.*]] = phi ptr [ [[IND_END]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[DEST]], [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
 ; AVX512-NEXT:    [[N_MOD_VF9:%.*]] = urem i64 [[TMP3]], 8
 ; AVX512-NEXT:    [[N_VEC10:%.*]] = sub i64 [[TMP3]], [[N_MOD_VF9]]
 ; AVX512-NEXT:    [[TMP24:%.*]] = mul i64 [[N_VEC10]], 4
@@ -691,12 +691,12 @@ define void @test_gather_not_profitable_pr48429(i32 %d, ptr readonly noalias %pt
 ; AVX512-NEXT:    [[TMP28:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[TMP27]]
 ; AVX512-NEXT:    [[TMP29:%.*]] = getelementptr inbounds float, ptr [[TMP28]], i64 [[IDXPROM]]
 ; AVX512-NEXT:    [[TMP30:%.*]] = getelementptr inbounds float, ptr [[TMP29]], i32 0
-; AVX512-NEXT:    [[WIDE_LOAD23:%.*]] = load <8 x float>, ptr [[TMP30]], align 4, !alias.scope [[META17:![0-9]+]]
-; AVX512-NEXT:    call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> [[WIDE_LOAD23]], <8 x ptr> [[TMP26]], i32 4, <8 x i1> splat (i1 true)), !alias.scope [[META20:![0-9]+]], !noalias [[META22:![0-9]+]]
+; AVX512-NEXT:    [[WIDE_LOAD17:%.*]] = load <8 x float>, ptr [[TMP30]], align 4, !alias.scope [[META17:![0-9]+]]
+; AVX512-NEXT:    call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> [[WIDE_LOAD17]], <8 x ptr> [[TMP26]], i32 4, <8 x i1> splat (i1 true)), !alias.scope [[META20:![0-9]+]], !noalias [[META22:![0-9]+]]
 ; AVX512-NEXT:    [[TMP31:%.*]] = getelementptr float, ptr [[TMP28]], i32 0
-; AVX512-NEXT:    [[WIDE_LOAD24:%.*]] = load <8 x float>, ptr [[TMP31]], align 4, !alias.scope [[META24:![0-9]+]]
+; AVX512-NEXT:    [[WIDE_LOAD18:%.*]] = load <8 x float>, ptr [[TMP31]], align 4, !alias.scope [[META24:![0-9]+]]
 ; AVX512-NEXT:    [[TMP32:%.*]] = getelementptr inbounds float, <8 x ptr> [[TMP26]], i64 1
-; AVX512-NEXT:    call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> [[WIDE_LOAD24]], <8 x ptr> [[TMP32]], i32 4, <8 x i1> splat (i1 true)), !alias.scope [[META20]], !noalias [[META22]]
+; AVX512-NEXT:    call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> [[WIDE_LOAD18]], <8 x ptr> [[TMP32]], i32 4, <8 x i1> splat (i1 true)), !alias.scope [[META20]], !noalias [[META22]]
 ; AVX512-NEXT:    [[INDEX_NEXT24]] = add nuw i64 [[INDEX18]], 8
 ; AVX512-NEXT:    [[PTR_IND20]] = getelementptr i8, ptr [[POINTER_PHI19]], i64 512
 ; AVX512-NEXT:    [[TMP33:%.*]] = icmp eq i64 [[INDEX_NEXT24]], [[N_VEC10]]
diff --git a/llvm/test/Transforms/LoopVectorize/X86/induction-costs.ll b/llvm/test/Transforms/LoopVectorize/X86/induction-costs.ll
index 3b55044..68cbfad 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/induction-costs.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/induction-costs.ll
@@ -348,10 +348,10 @@ define void @multiple_pointer_ivs_with_scalar_uses_only(ptr %A, ptr %B) #0 {
 ; CHECK-NEXT:    br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
 ; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ -12, [[MIDDLE_BLOCK]] ], [ 100, [[VECTOR_MEMCHECK]] ], [ 100, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ 2048, [[VECTOR_MEMCHECK]] ], [ 2048, [[ENTRY]] ]
 ; CHECK-NEXT:    [[BC_RESUME_VAL2:%.*]] = phi ptr [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[A]], [[VECTOR_MEMCHECK]] ], [ [[A]], [[ENTRY]] ]
 ; CHECK-NEXT:    [[BC_RESUME_VAL4:%.*]] = phi ptr [ [[IND_END3]], [[MIDDLE_BLOCK]] ], [ [[B]], [[VECTOR_MEMCHECK]] ], [ [[B]], [[ENTRY]] ]
 ; CHECK-NEXT:    [[BC_RESUME_VAL6:%.*]] = phi ptr [ [[IND_END5]], [[MIDDLE_BLOCK]] ], [ [[B]], [[VECTOR_MEMCHECK]] ], [ [[B]], [[ENTRY]] ]
-; CHECK-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ 2048, [[VECTOR_MEMCHECK]] ], [ 2048, [[ENTRY]] ]
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
 ; CHECK-NEXT:    [[IV_1:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[DEC:%.*]], [[LOOP]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/X86/induction-step.ll b/llvm/test/Transforms/LoopVectorize/X86/induction-step.ll
new file mode 100644
index 0000000..1dd2692
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/X86/induction-step.ll
@@ -0,0 +1,154 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -passes=loop-vectorize -force-vector-width=4 -S %s | FileCheck %s
+
+target triple = "x86_64-unknown-linux-gnu"
+
+define i16 @wide_add_induction_step_live_in(ptr %dst, i64 %N, i16 %off) {
+; CHECK-LABEL: @wide_add_induction_step_live_in(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[O_1:%.*]] = add i16 [[OFF:%.*]], 2
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 8
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], 8
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; CHECK-NEXT:    [[DOTCAST:%.*]] = trunc i64 [[N_VEC]] to i16
+; CHECK-NEXT:    [[TMP0:%.*]] = mul i16 [[DOTCAST]], [[O_1]]
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i16> poison, i16 [[O_1]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i16> [[BROADCAST_SPLATINSERT]], <4 x i16> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP1:%.*]] = mul <4 x i16> splat (i16 4), [[BROADCAST_SPLAT]]
+; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <4 x i16> poison, i16 [[O_1]], i64 0
+; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <4 x i16> [[DOTSPLATINSERT]], <4 x i16> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP2:%.*]] = mul <4 x i16> <i16 0, i16 1, i16 2, i16 3>, [[DOTSPLAT]]
+; CHECK-NEXT:    [[INDUCTION:%.*]] = add <4 x i16> zeroinitializer, [[TMP2]]
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i16> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[STEP_ADD:%.*]] = add <4 x i16> [[VEC_IND]], [[TMP1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT:    [[TMP4:%.*]] = add <4 x i16> [[VEC_IND]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT:    [[TMP9:%.*]] = add <4 x i16> [[STEP_ADD]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i16, ptr [[DST:%.*]], i64 [[TMP3]]
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i16, ptr [[TMP5]], i32 0
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i16, ptr [[TMP5]], i32 4
+; CHECK-NEXT:    store <4 x i16> [[TMP4]], ptr [[TMP6]], align 2
+; CHECK-NEXT:    store <4 x i16> [[TMP9]], ptr [[TMP8]], align 2
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
+; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i16> [[STEP_ADD]], [[TMP1]]
+; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL1:%.*]] = phi i16 [ [[TMP0]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; CHECK-NEXT:    br label [[LOOP:%.*]]
+; CHECK:       loop:
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[IV_2:%.*]] = phi i16 [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[ADD]] = add i16 [[IV_2]], [[O_1]]
+; CHECK-NEXT:    [[GEP_DST:%.*]] = getelementptr inbounds i16, ptr [[DST]], i64 [[IV]]
+; CHECK-NEXT:    store i16 [[ADD]], ptr [[GEP_DST]], align 2
+; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; CHECK-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; CHECK-NEXT:    br i1 [[EC]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK:       exit:
+; CHECK-NEXT:    [[ADD_LCSSA:%.*]] = phi i16 [ [[ADD]], [[LOOP]] ], [ [[TMP0]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    ret i16 [[ADD_LCSSA]]
+;
+entry:
+  %o.1 = add i16 %off, 2
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %iv.2 = phi i16 [ 0, %entry ], [ %add, %loop ]
+  %add = add i16 %iv.2, %o.1
+  %gep.dst = getelementptr inbounds i16, ptr %dst, i64 %iv
+  store i16 %add, ptr %gep.dst, align 2
+  %iv.next = add nuw nsw i64 %iv, 1
+  %ec = icmp eq i64 %iv.next, %N
+  br i1 %ec , label %exit, label %loop
+
+exit:
+  ret i16 %add
+}
+
+define i16 @wide_sub_induction_step_live_in(ptr %dst, i64 %N, i16 %off) {
+; CHECK-LABEL: @wide_sub_induction_step_live_in(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[O_1:%.*]] = add i16 [[OFF:%.*]], 2
+; CHECK-NEXT:    [[TMP0:%.*]] = sub i16 -2, [[OFF]]
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 8
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], 8
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; CHECK-NEXT:    [[DOTCAST:%.*]] = trunc i64 [[N_VEC]] to i16
+; CHECK-NEXT:    [[TMP1:%.*]] = mul i16 [[DOTCAST]], [[TMP0]]
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i16> poison, i16 [[TMP0]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i16> [[BROADCAST_SPLATINSERT]], <4 x i16> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP2:%.*]] = mul <4 x i16> splat (i16 4), [[BROADCAST_SPLAT]]
+; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <4 x i16> poison, i16 [[TMP0]], i64 0
+; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <4 x i16> [[DOTSPLATINSERT]], <4 x i16> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP3:%.*]] = mul <4 x i16> <i16 0, i16 1, i16 2, i16 3>, [[DOTSPLAT]]
+; CHECK-NEXT:    [[INDUCTION:%.*]] = add <4 x i16> zeroinitializer, [[TMP3]]
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i16> poison, i16 [[O_1]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i16> [[BROADCAST_SPLATINSERT1]], <4 x i16> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i16> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[STEP_ADD:%.*]] = add <4 x i16> [[VEC_IND]], [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = sub <4 x i16> [[VEC_IND]], [[BROADCAST_SPLAT2]]
+; CHECK-NEXT:    [[TMP10:%.*]] = sub <4 x i16> [[STEP_ADD]], [[BROADCAST_SPLAT2]]
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i16, ptr [[DST:%.*]], i64 [[TMP4]]
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i16, ptr [[TMP6]], i32 0
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i16, ptr [[TMP6]], i32 4
+; CHECK-NEXT:    store <4 x i16> [[TMP5]], ptr [[TMP7]], align 2
+; CHECK-NEXT:    store <4 x i16> [[TMP10]], ptr [[TMP9]], align 2
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
+; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i16> [[STEP_ADD]], [[TMP2]]
+; CHECK-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <4 x i16> [[TMP10]], i32 3
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL3:%.*]] = phi i16 [ [[TMP1]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; CHECK-NEXT:    br label [[LOOP:%.*]]
+; CHECK:       loop:
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[IV_2:%.*]] = phi i16 [ [[BC_RESUME_VAL3]], [[SCALAR_PH]] ], [ [[SUB:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[SUB]] = sub i16 [[IV_2]], [[O_1]]
+; CHECK-NEXT:    [[GEP_DST:%.*]] = getelementptr inbounds i16, ptr [[DST]], i64 [[IV]]
+; CHECK-NEXT:    store i16 [[SUB]], ptr [[GEP_DST]], align 2
+; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; CHECK-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; CHECK-NEXT:    br i1 [[EC]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK:       exit:
+; CHECK-NEXT:    [[SUB_LCSSA:%.*]] = phi i16 [ [[SUB]], [[LOOP]] ], [ [[TMP11]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    ret i16 [[SUB_LCSSA]]
+;
+entry:
+  %o.1 = add i16 %off, 2
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %iv.2 = phi i16 [ 0, %entry ], [ %sub, %loop ]
+  %sub = sub i16 %iv.2, %o.1
+  %gep.dst = getelementptr inbounds i16, ptr %dst, i64 %iv
+  store i16 %sub, ptr %gep.dst, align 2
+  %iv.next = add nuw nsw i64 %iv, 1
+  %ec = icmp eq i64 %iv.next, %N
+  br i1 %ec , label %exit, label %loop
+
+exit:
+  ret i16 %sub
+}
diff --git a/llvm/test/Transforms/LoopVectorize/X86/intrinsiccost.ll b/llvm/test/Transforms/LoopVectorize/X86/intrinsiccost.ll
index b277264..f50177e 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/intrinsiccost.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/intrinsiccost.ll
@@ -56,12 +56,12 @@ define void @uaddsat(ptr nocapture readonly %pSrc, i16 signext %offset, ptr noca
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[TMP0]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[WHILE_END]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
 ; CHECK:       vec.epilog.iter.check:
+; CHECK-NEXT:    [[DOTCAST1:%.*]] = trunc nuw i64 [[N_VEC]] to i32
+; CHECK-NEXT:    [[IND_END10:%.*]] = sub i32 [[BLOCKSIZE]], [[DOTCAST1]]
 ; CHECK-NEXT:    [[TMP12:%.*]] = shl nuw nsw i64 [[N_VEC]], 1
-; CHECK-NEXT:    [[IND_END15:%.*]] = getelementptr i8, ptr [[PDST]], i64 [[TMP12]]
+; CHECK-NEXT:    [[IND_END12:%.*]] = getelementptr i8, ptr [[PSRC]], i64 [[TMP12]]
 ; CHECK-NEXT:    [[TMP13:%.*]] = shl nuw nsw i64 [[N_VEC]], 1
-; CHECK-NEXT:    [[IND_END12:%.*]] = getelementptr i8, ptr [[PSRC]], i64 [[TMP13]]
-; CHECK-NEXT:    [[DOTCAST9:%.*]] = trunc nuw i64 [[N_VEC]] to i32
-; CHECK-NEXT:    [[IND_END10:%.*]] = sub i32 [[BLOCKSIZE]], [[DOTCAST9]]
+; CHECK-NEXT:    [[IND_END15:%.*]] = getelementptr i8, ptr [[PDST]], i64 [[TMP13]]
 ; CHECK-NEXT:    [[N_VEC_REMAINING:%.*]] = and i64 [[TMP0]], 56
 ; CHECK-NEXT:    [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp eq i64 [[N_VEC_REMAINING]], 0
 ; CHECK-NEXT:    br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
@@ -181,10 +181,10 @@ define void @fshl(ptr nocapture readonly %pSrc, i8 signext %offset, ptr nocaptur
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[TMP0]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[WHILE_END]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
 ; CHECK:       vec.epilog.iter.check:
-; CHECK-NEXT:    [[IND_END14:%.*]] = getelementptr i8, ptr [[PDST]], i64 [[N_VEC]]
-; CHECK-NEXT:    [[IND_END11:%.*]] = getelementptr i8, ptr [[PSRC]], i64 [[N_VEC]]
 ; CHECK-NEXT:    [[DOTCAST8:%.*]] = trunc nuw i64 [[N_VEC]] to i32
 ; CHECK-NEXT:    [[IND_END9:%.*]] = sub i32 [[BLOCKSIZE]], [[DOTCAST8]]
+; CHECK-NEXT:    [[IND_END11:%.*]] = getelementptr i8, ptr [[PSRC]], i64 [[N_VEC]]
+; CHECK-NEXT:    [[IND_END14:%.*]] = getelementptr i8, ptr [[PDST]], i64 [[N_VEC]]
 ; CHECK-NEXT:    [[N_VEC_REMAINING:%.*]] = and i64 [[TMP0]], 120
 ; CHECK-NEXT:    [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp eq i64 [[N_VEC_REMAINING]], 0
 ; CHECK-NEXT:    br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
diff --git a/llvm/test/Transforms/LoopVectorize/X86/optsize.ll b/llvm/test/Transforms/LoopVectorize/X86/optsize.ll
index 597be33..9e87cc2 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/optsize.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/optsize.ll
@@ -368,9 +368,9 @@ define void @tail_folded_store_avx512(ptr %start, ptr %end) #3 {
 ; CHECK-NEXT:    [[N_RND_UP:%.*]] = add i32 [[TMP3]], 63
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i32 [[N_RND_UP]], 64
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i32 [[N_RND_UP]], [[N_MOD_VF]]
+; CHECK-NEXT:    [[TRIP_COUNT_MINUS_1:%.*]] = sub i32 [[TMP3]], 1
 ; CHECK-NEXT:    [[TMP4:%.*]] = mul i32 [[N_VEC]], -72
 ; CHECK-NEXT:    [[IND_END:%.*]] = getelementptr i8, ptr [[START]], i32 [[TMP4]]
-; CHECK-NEXT:    [[TRIP_COUNT_MINUS_1:%.*]] = sub i32 [[TMP3]], 1
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <64 x i32> poison, i32 [[TRIP_COUNT_MINUS_1]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT4:%.*]] = shufflevector <64 x i32> [[BROADCAST_SPLATINSERT3]], <64 x i32> poison, <64 x i32> zeroinitializer
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
@@ -414,9 +414,9 @@ define void @tail_folded_store_avx512(ptr %start, ptr %end) #3 {
 ; AUTOVF-NEXT:    [[N_RND_UP:%.*]] = add i32 [[TMP3]], 7
 ; AUTOVF-NEXT:    [[N_MOD_VF:%.*]] = urem i32 [[N_RND_UP]], 8
 ; AUTOVF-NEXT:    [[N_VEC:%.*]] = sub i32 [[N_RND_UP]], [[N_MOD_VF]]
+; AUTOVF-NEXT:    [[TRIP_COUNT_MINUS_1:%.*]] = sub i32 [[TMP3]], 1
 ; AUTOVF-NEXT:    [[TMP4:%.*]] = mul i32 [[N_VEC]], -72
 ; AUTOVF-NEXT:    [[IND_END:%.*]] = getelementptr i8, ptr [[START]], i32 [[TMP4]]
-; AUTOVF-NEXT:    [[TRIP_COUNT_MINUS_1:%.*]] = sub i32 [[TMP3]], 1
 ; AUTOVF-NEXT:    [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <8 x i32> poison, i32 [[TRIP_COUNT_MINUS_1]], i64 0
 ; AUTOVF-NEXT:    [[BROADCAST_SPLAT4:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT3]], <8 x i32> poison, <8 x i32> zeroinitializer
 ; AUTOVF-NEXT:    br label [[VECTOR_BODY:%.*]]
diff --git a/llvm/test/Transforms/LoopVectorize/X86/pr109581-unused-blend.ll b/llvm/test/Transforms/LoopVectorize/X86/pr109581-unused-blend.ll
index bb7fe4d..1a9e7dd 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/pr109581-unused-blend.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/pr109581-unused-blend.ll
@@ -14,72 +14,70 @@ define i32 @unused_blend_after_unrolling(ptr %p, i32 %a, i1 %c.1, i16 %x, i16 %y
 ; CHECK:       [[VECTOR_PH]]:
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i1> poison, i1 [[C_1]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i1> [[BROADCAST_SPLATINSERT]], <4 x i1> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP0:%.*]] = xor <4 x i1> [[BROADCAST_SPLAT]], splat (i1 true)
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT16:%.*]] = insertelement <4 x i1> poison, i1 [[C]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT17:%.*]] = shufflevector <4 x i1> [[BROADCAST_SPLATINSERT16]], <4 x i1> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP22:%.*]] = xor <4 x i1> [[BROADCAST_SPLAT17]], splat (i1 true)
 ; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; CHECK:       [[VECTOR_BODY]]:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_SDIV_CONTINUE15:.*]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP24:%.*]], %[[PRED_SDIV_CONTINUE15]] ]
-; CHECK-NEXT:    [[VEC_PHI1:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP25:%.*]], %[[PRED_SDIV_CONTINUE15]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = xor <4 x i1> [[BROADCAST_SPLAT]], splat (i1 true)
-; CHECK-NEXT:    [[TMP1:%.*]] = xor <4 x i1> [[BROADCAST_SPLAT]], splat (i1 true)
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_SDIV_CONTINUE17:.*]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP24:%.*]], %[[PRED_SDIV_CONTINUE17]] ]
+; CHECK-NEXT:    [[VEC_PHI3:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP25:%.*]], %[[PRED_SDIV_CONTINUE17]] ]
 ; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x i1> [[TMP0]], i32 0
 ; CHECK-NEXT:    br i1 [[TMP2]], label %[[PRED_SDIV_IF:.*]], label %[[PRED_SDIV_CONTINUE:.*]]
 ; CHECK:       [[PRED_SDIV_IF]]:
 ; CHECK-NEXT:    br label %[[PRED_SDIV_CONTINUE]]
 ; CHECK:       [[PRED_SDIV_CONTINUE]]:
 ; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x i1> [[TMP0]], i32 1
-; CHECK-NEXT:    br i1 [[TMP3]], label %[[PRED_SDIV_IF2:.*]], label %[[PRED_SDIV_CONTINUE3:.*]]
-; CHECK:       [[PRED_SDIV_IF2]]:
-; CHECK-NEXT:    br label %[[PRED_SDIV_CONTINUE3]]
-; CHECK:       [[PRED_SDIV_CONTINUE3]]:
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x i1> [[TMP0]], i32 2
-; CHECK-NEXT:    br i1 [[TMP4]], label %[[PRED_SDIV_IF4:.*]], label %[[PRED_SDIV_CONTINUE5:.*]]
+; CHECK-NEXT:    br i1 [[TMP3]], label %[[PRED_SDIV_IF4:.*]], label %[[PRED_SDIV_CONTINUE5:.*]]
 ; CHECK:       [[PRED_SDIV_IF4]]:
 ; CHECK-NEXT:    br label %[[PRED_SDIV_CONTINUE5]]
 ; CHECK:       [[PRED_SDIV_CONTINUE5]]:
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x i1> [[TMP0]], i32 3
-; CHECK-NEXT:    br i1 [[TMP5]], label %[[PRED_SDIV_IF6:.*]], label %[[PRED_SDIV_CONTINUE7:.*]]
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x i1> [[TMP0]], i32 2
+; CHECK-NEXT:    br i1 [[TMP4]], label %[[PRED_SDIV_IF6:.*]], label %[[PRED_SDIV_CONTINUE7:.*]]
 ; CHECK:       [[PRED_SDIV_IF6]]:
 ; CHECK-NEXT:    br label %[[PRED_SDIV_CONTINUE7]]
 ; CHECK:       [[PRED_SDIV_CONTINUE7]]:
-; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <4 x i1> [[TMP1]], i32 0
-; CHECK-NEXT:    br i1 [[TMP6]], label %[[PRED_SDIV_IF8:.*]], label %[[PRED_SDIV_CONTINUE9:.*]]
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x i1> [[TMP0]], i32 3
+; CHECK-NEXT:    br i1 [[TMP5]], label %[[PRED_SDIV_IF8:.*]], label %[[PRED_SDIV_CONTINUE9:.*]]
 ; CHECK:       [[PRED_SDIV_IF8]]:
-; CHECK-NEXT:    [[TMP7:%.*]] = sdiv i16 [[X]], [[Y]]
-; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <4 x i16> poison, i16 [[TMP7]], i32 0
 ; CHECK-NEXT:    br label %[[PRED_SDIV_CONTINUE9]]
 ; CHECK:       [[PRED_SDIV_CONTINUE9]]:
-; CHECK-NEXT:    [[TMP9:%.*]] = phi <4 x i16> [ poison, %[[PRED_SDIV_CONTINUE7]] ], [ [[TMP8]], %[[PRED_SDIV_IF8]] ]
-; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <4 x i1> [[TMP1]], i32 1
-; CHECK-NEXT:    br i1 [[TMP10]], label %[[PRED_SDIV_IF10:.*]], label %[[PRED_SDIV_CONTINUE11:.*]]
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <4 x i1> [[TMP0]], i32 0
+; CHECK-NEXT:    br i1 [[TMP6]], label %[[PRED_SDIV_IF10:.*]], label %[[PRED_SDIV_CONTINUE11:.*]]
 ; CHECK:       [[PRED_SDIV_IF10]]:
-; CHECK-NEXT:    [[TMP11:%.*]] = sdiv i16 [[X]], [[Y]]
-; CHECK-NEXT:    [[TMP12:%.*]] = insertelement <4 x i16> [[TMP9]], i16 [[TMP11]], i32 1
+; CHECK-NEXT:    [[TMP7:%.*]] = sdiv i16 [[X]], [[Y]]
+; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <4 x i16> poison, i16 [[TMP7]], i32 0
 ; CHECK-NEXT:    br label %[[PRED_SDIV_CONTINUE11]]
 ; CHECK:       [[PRED_SDIV_CONTINUE11]]:
-; CHECK-NEXT:    [[TMP13:%.*]] = phi <4 x i16> [ [[TMP9]], %[[PRED_SDIV_CONTINUE9]] ], [ [[TMP12]], %[[PRED_SDIV_IF10]] ]
-; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <4 x i1> [[TMP1]], i32 2
-; CHECK-NEXT:    br i1 [[TMP14]], label %[[PRED_SDIV_IF12:.*]], label %[[PRED_SDIV_CONTINUE13:.*]]
+; CHECK-NEXT:    [[TMP9:%.*]] = phi <4 x i16> [ poison, %[[PRED_SDIV_CONTINUE9]] ], [ [[TMP8]], %[[PRED_SDIV_IF10]] ]
+; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <4 x i1> [[TMP0]], i32 1
+; CHECK-NEXT:    br i1 [[TMP10]], label %[[PRED_SDIV_IF12:.*]], label %[[PRED_SDIV_CONTINUE13:.*]]
 ; CHECK:       [[PRED_SDIV_IF12]]:
-; CHECK-NEXT:    [[TMP15:%.*]] = sdiv i16 [[X]], [[Y]]
-; CHECK-NEXT:    [[TMP16:%.*]] = insertelement <4 x i16> [[TMP13]], i16 [[TMP15]], i32 2
+; CHECK-NEXT:    [[TMP11:%.*]] = sdiv i16 [[X]], [[Y]]
+; CHECK-NEXT:    [[TMP12:%.*]] = insertelement <4 x i16> [[TMP9]], i16 [[TMP11]], i32 1
 ; CHECK-NEXT:    br label %[[PRED_SDIV_CONTINUE13]]
 ; CHECK:       [[PRED_SDIV_CONTINUE13]]:
-; CHECK-NEXT:    [[TMP17:%.*]] = phi <4 x i16> [ [[TMP13]], %[[PRED_SDIV_CONTINUE11]] ], [ [[TMP16]], %[[PRED_SDIV_IF12]] ]
-; CHECK-NEXT:    [[TMP18:%.*]] = extractelement <4 x i1> [[TMP1]], i32 3
-; CHECK-NEXT:    br i1 [[TMP18]], label %[[PRED_SDIV_IF14:.*]], label %[[PRED_SDIV_CONTINUE15]]
+; CHECK-NEXT:    [[TMP13:%.*]] = phi <4 x i16> [ [[TMP9]], %[[PRED_SDIV_CONTINUE11]] ], [ [[TMP12]], %[[PRED_SDIV_IF12]] ]
+; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <4 x i1> [[TMP0]], i32 2
+; CHECK-NEXT:    br i1 [[TMP14]], label %[[PRED_SDIV_IF14:.*]], label %[[PRED_SDIV_CONTINUE15:.*]]
 ; CHECK:       [[PRED_SDIV_IF14]]:
-; CHECK-NEXT:    [[TMP19:%.*]] = sdiv i16 [[X]], [[Y]]
-; CHECK-NEXT:    [[TMP20:%.*]] = insertelement <4 x i16> [[TMP17]], i16 [[TMP19]], i32 3
+; CHECK-NEXT:    [[TMP15:%.*]] = sdiv i16 [[X]], [[Y]]
+; CHECK-NEXT:    [[TMP16:%.*]] = insertelement <4 x i16> [[TMP13]], i16 [[TMP15]], i32 2
 ; CHECK-NEXT:    br label %[[PRED_SDIV_CONTINUE15]]
 ; CHECK:       [[PRED_SDIV_CONTINUE15]]:
-; CHECK-NEXT:    [[TMP21:%.*]] = phi <4 x i16> [ [[TMP17]], %[[PRED_SDIV_CONTINUE13]] ], [ [[TMP20]], %[[PRED_SDIV_IF14]] ]
+; CHECK-NEXT:    [[TMP17:%.*]] = phi <4 x i16> [ [[TMP13]], %[[PRED_SDIV_CONTINUE13]] ], [ [[TMP16]], %[[PRED_SDIV_IF14]] ]
+; CHECK-NEXT:    [[TMP18:%.*]] = extractelement <4 x i1> [[TMP0]], i32 3
+; CHECK-NEXT:    br i1 [[TMP18]], label %[[PRED_SDIV_IF16:.*]], label %[[PRED_SDIV_CONTINUE17]]
+; CHECK:       [[PRED_SDIV_IF16]]:
+; CHECK-NEXT:    [[TMP19:%.*]] = sdiv i16 [[X]], [[Y]]
+; CHECK-NEXT:    [[TMP20:%.*]] = insertelement <4 x i16> [[TMP17]], i16 [[TMP19]], i32 3
+; CHECK-NEXT:    br label %[[PRED_SDIV_CONTINUE17]]
+; CHECK:       [[PRED_SDIV_CONTINUE17]]:
+; CHECK-NEXT:    [[TMP21:%.*]] = phi <4 x i16> [ [[TMP17]], %[[PRED_SDIV_CONTINUE15]] ], [ [[TMP20]], %[[PRED_SDIV_IF16]] ]
 ; CHECK-NEXT:    [[PREDPHI:%.*]] = select <4 x i1> [[BROADCAST_SPLAT]], <4 x i16> zeroinitializer, <4 x i16> [[TMP21]]
-; CHECK-NEXT:    [[TMP22:%.*]] = xor <4 x i1> [[BROADCAST_SPLAT17]], splat (i1 true)
-; CHECK-NEXT:    [[TMP23:%.*]] = xor <4 x i1> [[BROADCAST_SPLAT17]], splat (i1 true)
 ; CHECK-NEXT:    [[TMP24]] = or <4 x i1> [[VEC_PHI]], [[TMP22]]
-; CHECK-NEXT:    [[TMP25]] = or <4 x i1> [[VEC_PHI1]], [[TMP23]]
+; CHECK-NEXT:    [[TMP25]] = or <4 x i1> [[VEC_PHI3]], [[TMP22]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8
 ; CHECK-NEXT:    [[TMP26:%.*]] = icmp eq i32 [[INDEX_NEXT]], 96
 ; CHECK-NEXT:    br i1 [[TMP26]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
@@ -91,8 +89,8 @@ define i32 @unused_blend_after_unrolling(ptr %p, i32 %a, i1 %c.1, i16 %x, i16 %y
 ; CHECK-NEXT:    [[TMP29:%.*]] = extractelement <4 x i16> [[PREDPHI]], i32 3
 ; CHECK-NEXT:    br i1 false, label %[[EXIT:.*]], label %[[SCALAR_PH]]
 ; CHECK:       [[SCALAR_PH]]:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ 97, %[[MIDDLE_BLOCK]] ], [ 1, %[[ENTRY]] ]
 ; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ 97, %[[MIDDLE_BLOCK]] ], [ 1, %[[ENTRY]] ]
 ; CHECK-NEXT:    br label %[[LOOP_HEADER:.*]]
 ; CHECK:       [[LOOP_HEADER]]:
 ; CHECK-NEXT:    [[B:%.*]] = phi i32 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[SEL:%.*]], %[[LOOP_LATCH:.*]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/X86/pr34438.ll b/llvm/test/Transforms/LoopVectorize/X86/pr34438.ll
index cc60359..7816c49 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/pr34438.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/pr34438.ll
@@ -16,18 +16,16 @@ define void @small_tc(ptr noalias nocapture %A, ptr noalias nocapture readonly %
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 0
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <8 x float>, ptr [[TMP2]], align 4, !llvm.access.group [[ACC_GRP0:![0-9]+]]
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds float, ptr [[TMP3]], i32 0
-; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <8 x float>, ptr [[TMP4]], align 4, !llvm.access.group [[ACC_GRP0]]
-; CHECK-NEXT:    [[TMP5:%.*]] = fadd fast <8 x float> [[WIDE_LOAD]], [[WIDE_LOAD1]]
-; CHECK-NEXT:    store <8 x float> [[TMP5]], ptr [[TMP4]], align 4, !llvm.access.group [[ACC_GRP0]]
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; CHECK-NEXT:    br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP1:![0-9]+]]
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i64 0
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds float, ptr [[TMP0]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <8 x float>, ptr [[TMP1]], align 4, !llvm.access.group [[ACC_GRP0:![0-9]+]]
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 0
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds float, ptr [[TMP2]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <8 x float>, ptr [[TMP3]], align 4, !llvm.access.group [[ACC_GRP0]]
+; CHECK-NEXT:    [[TMP4:%.*]] = fadd fast <8 x float> [[WIDE_LOAD]], [[WIDE_LOAD1]]
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds float, ptr [[TMP2]], i32 0
+; CHECK-NEXT:    store <8 x float> [[TMP4]], ptr [[TMP5]], align 4, !llvm.access.group [[ACC_GRP0]]
+; CHECK-NEXT:    br label [[MIDDLE_BLOCK:%.*]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
@@ -43,7 +41,7 @@ define void @small_tc(ptr noalias nocapture %A, ptr noalias nocapture readonly %
 ; CHECK-NEXT:    store float [[ADD]], ptr [[ARRAYIDX2]], align 4, !llvm.access.group [[ACC_GRP0]]
 ; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
 ; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 8
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP1:![0-9]+]]
 ; CHECK:       for.end:
 ; CHECK-NEXT:    ret void
 ;
diff --git a/llvm/test/Transforms/LoopVectorize/X86/pr72969.ll b/llvm/test/Transforms/LoopVectorize/X86/pr72969.ll
index 41868d6..d1c0201 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/pr72969.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/pr72969.ll
@@ -85,9 +85,9 @@ define void @test(ptr %p) {
 ; VEC-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP4]], [[N_VEC]]
 ; VEC-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; VEC:       scalar.ph:
-; VEC-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i16 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 1, [[VECTOR_SCEVCHECK]] ], [ 1, [[ENTRY:%.*]] ]
+; VEC-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i64 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ 1, [[VECTOR_SCEVCHECK]] ], [ 1, [[ENTRY:%.*]] ]
+; VEC-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i16 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 1, [[VECTOR_SCEVCHECK]] ], [ 1, [[ENTRY]] ]
 ; VEC-NEXT:    [[BC_RESUME_VAL3:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY]] ]
-; VEC-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i64 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ 1, [[VECTOR_SCEVCHECK]] ], [ 1, [[ENTRY]] ]
 ; VEC-NEXT:    br label [[FOR_BODY:%.*]]
 ; VEC:       for.body:
 ; VEC-NEXT:    [[SCALAR_RECUR:%.*]] = phi i64 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[IDX:%.*]], [[FOR_BODY]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/X86/reduction-small-size.ll b/llvm/test/Transforms/LoopVectorize/X86/reduction-small-size.ll
index 052a963..28f8988 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/reduction-small-size.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/reduction-small-size.ll
@@ -38,16 +38,16 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 ; CHECK: Cost of 0 for VF 2: CLONE ir<%arrayidx> = getelementptr inbounds ir<%a>, vp<%4>
 ; CHECK: Cost of 0 for VF 2: vp<%5> = vector-pointer ir<%arrayidx>
 ; CHECK: Cost of 1 for VF 2: WIDEN ir<%0> = load vp<%5>
-; CHECK: Cost of 0 for VF 2: WIDEN-CAST ir<%conv> = zext  ir<%0> to i32
+; CHECK: Cost of 0 for VF 2: WIDEN-CAST ir<%conv> = zext ir<%0> to i32
 ; CHECK: Cost of 0 for VF 2: CLONE ir<%arrayidx2> = getelementptr inbounds ir<%b>, vp<%4>
 ; CHECK: Cost of 0 for VF 2: vp<%6> = vector-pointer ir<%arrayidx2>
 ; CHECK: Cost of 1 for VF 2: WIDEN ir<%1> = load vp<%6>
-; CHECK: Cost of 0 for VF 2: WIDEN-CAST ir<%conv3> = zext  ir<%1> to i32
+; CHECK: Cost of 0 for VF 2: WIDEN-CAST ir<%conv3> = zext ir<%1> to i32
 ; CHECK: Cost of 0 for VF 2: WIDEN ir<%conv4> = and ir<%sum.013>, ir<255>
 ; CHECK: Cost of 1 for VF 2: WIDEN ir<%add> = add ir<%conv>, ir<%conv4>
 ; CHECK: Cost of 1 for VF 2: WIDEN ir<%add5> = add ir<%add>, ir<%conv3>
-; CHECK: Cost of 0 for VF 2: WIDEN-CAST vp<%7> = trunc  ir<%add5> to i8
-; CHECK: Cost of 0 for VF 2: WIDEN-CAST vp<%8> = zext  vp<%7> to i32
+; CHECK: Cost of 0 for VF 2: WIDEN-CAST vp<%7> = trunc ir<%add5> to i8
+; CHECK: Cost of 0 for VF 2: WIDEN-CAST vp<%8> = zext vp<%7> to i32
 ; CHECK: Cost of 0 for VF 2: EMIT vp<%index.next> = add nuw vp<%3>, vp<%0>
 ; CHECK: Cost of 0 for VF 2: EMIT branch-on-count vp<%index.next>, vp<%1>
 ;
diff --git a/llvm/test/Transforms/LoopVectorize/X86/replicate-uniform-call.ll b/llvm/test/Transforms/LoopVectorize/X86/replicate-uniform-call.ll
index 8d56c33..cfae26a 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/replicate-uniform-call.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/replicate-uniform-call.ll
@@ -15,19 +15,18 @@ define void @smax_call_uniform(ptr %dst, i64 %x) {
 ; CHECK:       [[VECTOR_PH]]:
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i1> poison, i1 [[C]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i1> [[BROADCAST_SPLATINSERT]], <2 x i1> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP1:%.*]] = xor <2 x i1> [[BROADCAST_SPLAT]], splat (i1 true)
 ; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; CHECK:       [[VECTOR_BODY]]:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_UREM_CONTINUE6:.*]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = xor <2 x i1> [[BROADCAST_SPLAT]], splat (i1 true)
-; CHECK-NEXT:    [[TMP1:%.*]] = xor <2 x i1> [[BROADCAST_SPLAT]], splat (i1 true)
-; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x i1> [[TMP0]], i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x i1> [[TMP1]], i32 0
 ; CHECK-NEXT:    br i1 [[TMP2]], label %[[PRED_UREM_IF:.*]], label %[[PRED_UREM_CONTINUE:.*]]
 ; CHECK:       [[PRED_UREM_IF]]:
 ; CHECK-NEXT:    [[REM:%.*]] = urem i64 [[MUL]], [[X]]
 ; CHECK-NEXT:    br label %[[PRED_UREM_CONTINUE]]
 ; CHECK:       [[PRED_UREM_CONTINUE]]:
 ; CHECK-NEXT:    [[TMP4:%.*]] = phi i64 [ poison, %[[VECTOR_BODY]] ], [ [[REM]], %[[PRED_UREM_IF]] ]
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x i1> [[TMP0]], i32 1
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x i1> [[TMP1]], i32 1
 ; CHECK-NEXT:    br i1 [[TMP5]], label %[[PRED_UREM_IF1:.*]], label %[[PRED_UREM_CONTINUE2:.*]]
 ; CHECK:       [[PRED_UREM_IF1]]:
 ; CHECK-NEXT:    [[TMP6:%.*]] = urem i64 [[MUL]], [[X]]
@@ -48,7 +47,7 @@ define void @smax_call_uniform(ptr %dst, i64 %x) {
 ; CHECK:       [[PRED_UREM_CONTINUE6]]:
 ; CHECK-NEXT:    [[TMP12:%.*]] = tail call i64 @llvm.smax.i64(i64 [[TMP4]], i64 0)
 ; CHECK-NEXT:    [[TMP13:%.*]] = tail call i64 @llvm.smax.i64(i64 [[TMP9]], i64 0)
-; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <2 x i1> [[TMP0]], i32 0
+; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <2 x i1> [[TMP1]], i32 0
 ; CHECK-NEXT:    [[P:%.*]] = select i1 [[TMP14]], i64 [[TMP12]], i64 1
 ; CHECK-NEXT:    [[TMP15:%.*]] = extractelement <2 x i1> [[TMP1]], i32 0
 ; CHECK-NEXT:    [[PREDPHI7:%.*]] = select i1 [[TMP15]], i64 [[TMP13]], i64 1
diff --git a/llvm/test/Transforms/LoopVectorize/X86/scatter_crash.ll b/llvm/test/Transforms/LoopVectorize/X86/scatter_crash.ll
index 236ed30..a0294f7 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/scatter_crash.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/scatter_crash.ll
@@ -65,16 +65,16 @@ define void @_Z3fn1v() #0 {
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP6]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT99:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
 ; CHECK:       vec.epilog.iter.check:
-; CHECK-NEXT:    [[IND_END12:%.*]] = mul i64 [[N_VEC]], 2
 ; CHECK-NEXT:    [[TMP64:%.*]] = mul i64 [[N_VEC]], 2
 ; CHECK-NEXT:    [[IND_END9:%.*]] = add i64 8, [[TMP64]]
+; CHECK-NEXT:    [[IND_END12:%.*]] = mul i64 [[N_VEC]], 2
 ; CHECK-NEXT:    [[N_VEC_REMAINING:%.*]] = sub i64 [[TMP6]], [[N_VEC]]
 ; CHECK-NEXT:    [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 8
 ; CHECK-NEXT:    br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
 ; CHECK:       vec.epilog.ph:
+; CHECK-NEXT:    [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
 ; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[VEC_EPILOG_ITER_CHECK]] ], [ 8, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
 ; CHECK-NEXT:    [[BC_RESUME_VAL5:%.*]] = phi i64 [ [[IND_END4]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
-; CHECK-NEXT:    [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
 ; CHECK-NEXT:    [[N_MOD_VF6:%.*]] = urem i64 [[TMP6]], 8
 ; CHECK-NEXT:    [[N_VEC7:%.*]] = sub i64 [[TMP6]], [[N_MOD_VF6]]
 ; CHECK-NEXT:    [[TMP17:%.*]] = mul i64 [[N_VEC7]], 2
@@ -112,16 +112,16 @@ define void @_Z3fn1v() #0 {
 ; CHECK-NEXT:    [[BC_RESUME_VAL10:%.*]] = phi i64 [ [[IND_END8]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 8, [[ITER_CHECK]] ], [ [[IND_END9]], [[VEC_EPILOG_ITER_CHECK]] ]
 ; CHECK-NEXT:    [[BC_RESUME_VAL13:%.*]] = phi i64 [ [[IND_END11]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, [[ITER_CHECK]] ], [ [[IND_END12]], [[VEC_EPILOG_ITER_CHECK]] ]
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       iter.check27:
+; CHECK:       iter.check23:
 ; CHECK-NEXT:    [[TMP26:%.*]] = add nsw i64 [[TMP3]], -9
 ; CHECK-NEXT:    [[TMP27:%.*]] = lshr i64 [[TMP26]], 1
 ; CHECK-NEXT:    [[TMP28:%.*]] = add nuw i64 [[TMP27]], 1
 ; CHECK-NEXT:    [[MIN_ITERS_CHECK25:%.*]] = icmp ult i64 [[TMP28]], 8
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK25]], label [[VEC_EPILOG_SCALAR_PH46:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK29:%.*]]
-; CHECK:       vector.main.loop.iter.check29:
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK25]], label [[VEC_EPILOG_SCALAR_PH41:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK25:%.*]]
+; CHECK:       vector.main.loop.iter.check25:
 ; CHECK-NEXT:    [[MIN_ITERS_CHECK28:%.*]] = icmp ult i64 [[TMP28]], 16
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK28]], label [[VEC_EPILOG_PH47:%.*]], label [[VECTOR_PH30:%.*]]
-; CHECK:       vector.ph30:
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK28]], label [[VEC_EPILOG_PH42:%.*]], label [[VECTOR_PH30:%.*]]
+; CHECK:       vector.ph26:
 ; CHECK-NEXT:    [[N_MOD_VF31:%.*]] = urem i64 [[TMP28]], 16
 ; CHECK-NEXT:    [[N_VEC32:%.*]] = sub i64 [[TMP28]], [[N_MOD_VF31]]
 ; CHECK-NEXT:    [[TMP29:%.*]] = mul i64 [[N_VEC32]], 2
@@ -129,16 +129,16 @@ define void @_Z3fn1v() #0 {
 ; CHECK-NEXT:    [[IND_END43:%.*]] = mul i64 [[N_VEC32]], 2
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i1> poison, i1 [[TOBOOL6]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i1> [[BROADCAST_SPLATINSERT]], <16 x i1> poison, <16 x i32> zeroinitializer
-; CHECK-NEXT:    br label [[VECTOR_BODY35:%.*]]
-; CHECK:       vector.body35:
-; CHECK-NEXT:    [[INDEX34:%.*]] = phi i64 [ 0, [[VECTOR_PH30]] ], [ [[INDEX_NEXT39:%.*]], [[VECTOR_BODY35]] ]
-; CHECK-NEXT:    [[VEC_IND35:%.*]] = phi <16 x i64> [ <i64 8, i64 10, i64 12, i64 14, i64 16, i64 18, i64 20, i64 22, i64 24, i64 26, i64 28, i64 30, i64 32, i64 34, i64 36, i64 38>, [[VECTOR_PH30]] ], [ [[VEC_IND_NEXT36:%.*]], [[VECTOR_BODY35]] ]
-; CHECK-NEXT:    [[VEC_IND37:%.*]] = phi <16 x i64> [ <i64 0, i64 2, i64 4, i64 6, i64 8, i64 10, i64 12, i64 14, i64 16, i64 18, i64 20, i64 22, i64 24, i64 26, i64 28, i64 30>, [[VECTOR_PH30]] ], [ [[VEC_IND_NEXT38:%.*]], [[VECTOR_BODY35]] ]
+; CHECK-NEXT:    [[TMP34:%.*]] = xor <16 x i1> [[BROADCAST_SPLAT]], splat (i1 true)
+; CHECK-NEXT:    br label [[VECTOR_BODY29:%.*]]
+; CHECK:       vector.body29:
+; CHECK-NEXT:    [[INDEX34:%.*]] = phi i64 [ 0, [[VECTOR_PH30]] ], [ [[INDEX_NEXT39:%.*]], [[VECTOR_BODY29]] ]
+; CHECK-NEXT:    [[VEC_IND35:%.*]] = phi <16 x i64> [ <i64 8, i64 10, i64 12, i64 14, i64 16, i64 18, i64 20, i64 22, i64 24, i64 26, i64 28, i64 30, i64 32, i64 34, i64 36, i64 38>, [[VECTOR_PH30]] ], [ [[VEC_IND_NEXT36:%.*]], [[VECTOR_BODY29]] ]
+; CHECK-NEXT:    [[VEC_IND37:%.*]] = phi <16 x i64> [ <i64 0, i64 2, i64 4, i64 6, i64 8, i64 10, i64 12, i64 14, i64 16, i64 18, i64 20, i64 22, i64 24, i64 26, i64 28, i64 30>, [[VECTOR_PH30]] ], [ [[VEC_IND_NEXT38:%.*]], [[VECTOR_BODY29]] ]
 ; CHECK-NEXT:    [[TMP30:%.*]] = sub nsw <16 x i64> splat (i64 8), [[VEC_IND35]]
 ; CHECK-NEXT:    [[TMP31:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr @d, i64 0, <16 x i64> [[VEC_IND35]]
 ; CHECK-NEXT:    [[TMP32:%.*]] = add nsw <16 x i64> [[TMP30]], [[VEC_IND37]]
 ; CHECK-NEXT:    [[TMP33:%.*]] = getelementptr inbounds [10 x i32], <16 x ptr> [[TMP31]], <16 x i64> [[TMP32]], i64 0
-; CHECK-NEXT:    [[TMP34:%.*]] = xor <16 x i1> [[BROADCAST_SPLAT]], splat (i1 true)
 ; CHECK-NEXT:    call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> splat (i32 8), <16 x ptr> [[TMP33]], i32 16, <16 x i1> [[TMP34]])
 ; CHECK-NEXT:    [[TMP35:%.*]] = or disjoint <16 x i64> [[VEC_IND37]], splat (i64 1)
 ; CHECK-NEXT:    [[TMP36:%.*]] = add nsw <16 x i64> [[TMP30]], [[TMP35]]
@@ -153,44 +153,44 @@ define void @_Z3fn1v() #0 {
 ; CHECK-NEXT:    [[VEC_IND_NEXT36]] = add <16 x i64> [[VEC_IND35]], splat (i64 32)
 ; CHECK-NEXT:    [[VEC_IND_NEXT38]] = add <16 x i64> [[VEC_IND37]], splat (i64 32)
 ; CHECK-NEXT:    [[TMP41:%.*]] = icmp eq i64 [[INDEX_NEXT39]], [[N_VEC32]]
-; CHECK-NEXT:    br i1 [[TMP41]], label [[MIDDLE_BLOCK24:%.*]], label [[VECTOR_BODY35]], !llvm.loop [[LOOP4:![0-9]+]]
-; CHECK:       middle.block24:
+; CHECK-NEXT:    br i1 [[TMP41]], label [[MIDDLE_BLOCK20:%.*]], label [[VECTOR_BODY29]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK:       middle.block20:
 ; CHECK-NEXT:    [[CMP_N40:%.*]] = icmp eq i64 [[TMP28]], [[N_VEC32]]
-; CHECK-NEXT:    br i1 [[CMP_N40]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK48:%.*]]
-; CHECK:       vec.epilog.iter.check49:
-; CHECK-NEXT:    [[IND_END58:%.*]] = mul i64 [[N_VEC32]], 2
+; CHECK-NEXT:    br i1 [[CMP_N40]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK43:%.*]]
+; CHECK:       vec.epilog.iter.check43:
 ; CHECK-NEXT:    [[TMP42:%.*]] = mul i64 [[N_VEC32]], 2
 ; CHECK-NEXT:    [[IND_END55:%.*]] = add i64 8, [[TMP42]]
+; CHECK-NEXT:    [[IND_END58:%.*]] = mul i64 [[N_VEC32]], 2
 ; CHECK-NEXT:    [[N_VEC_REMAINING49:%.*]] = sub i64 [[TMP28]], [[N_VEC32]]
 ; CHECK-NEXT:    [[MIN_EPILOG_ITERS_CHECK50:%.*]] = icmp ult i64 [[N_VEC_REMAINING49]], 8
-; CHECK-NEXT:    br i1 [[MIN_EPILOG_ITERS_CHECK50]], label [[VEC_EPILOG_SCALAR_PH46]], label [[VEC_EPILOG_PH47]]
-; CHECK:       vec.epilog.ph48:
-; CHECK-NEXT:    [[BC_RESUME_VAL42:%.*]] = phi i64 [ [[IND_END41]], [[VEC_EPILOG_ITER_CHECK48]] ], [ 8, [[VECTOR_MAIN_LOOP_ITER_CHECK29]] ]
-; CHECK-NEXT:    [[BC_RESUME_VAL44:%.*]] = phi i64 [ [[IND_END43]], [[VEC_EPILOG_ITER_CHECK48]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK29]] ]
-; CHECK-NEXT:    [[VEC_EPILOG_RESUME_VAL51:%.*]] = phi i64 [ [[N_VEC32]], [[VEC_EPILOG_ITER_CHECK48]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK29]] ]
+; CHECK-NEXT:    br i1 [[MIN_EPILOG_ITERS_CHECK50]], label [[VEC_EPILOG_SCALAR_PH41]], label [[VEC_EPILOG_PH42]]
+; CHECK:       vec.epilog.ph42:
+; CHECK-NEXT:    [[VEC_EPILOG_RESUME_VAL51:%.*]] = phi i64 [ [[N_VEC32]], [[VEC_EPILOG_ITER_CHECK43]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK25]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL42:%.*]] = phi i64 [ [[IND_END41]], [[VEC_EPILOG_ITER_CHECK43]] ], [ 8, [[VECTOR_MAIN_LOOP_ITER_CHECK25]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL44:%.*]] = phi i64 [ [[IND_END43]], [[VEC_EPILOG_ITER_CHECK43]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK25]] ]
 ; CHECK-NEXT:    [[N_MOD_VF52:%.*]] = urem i64 [[TMP28]], 8
 ; CHECK-NEXT:    [[N_VEC53:%.*]] = sub i64 [[TMP28]], [[N_MOD_VF52]]
 ; CHECK-NEXT:    [[TMP43:%.*]] = mul i64 [[N_VEC53]], 2
 ; CHECK-NEXT:    [[IND_END54:%.*]] = add i64 8, [[TMP43]]
 ; CHECK-NEXT:    [[IND_END57:%.*]] = mul i64 [[N_VEC53]], 2
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT50:%.*]] = insertelement <8 x i1> poison, i1 [[TOBOOL6]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT73:%.*]] = shufflevector <8 x i1> [[BROADCAST_SPLATINSERT50]], <8 x i1> poison, <8 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP48:%.*]] = xor <8 x i1> [[BROADCAST_SPLAT73]], splat (i1 true)
 ; CHECK-NEXT:    [[DOTSPLATINSERT62:%.*]] = insertelement <8 x i64> poison, i64 [[BC_RESUME_VAL42]], i64 0
 ; CHECK-NEXT:    [[DOTSPLAT63:%.*]] = shufflevector <8 x i64> [[DOTSPLATINSERT62]], <8 x i64> poison, <8 x i32> zeroinitializer
 ; CHECK-NEXT:    [[INDUCTION64:%.*]] = add <8 x i64> [[DOTSPLAT63]], <i64 0, i64 2, i64 4, i64 6, i64 8, i64 10, i64 12, i64 14>
 ; CHECK-NEXT:    [[DOTSPLATINSERT67:%.*]] = insertelement <8 x i64> poison, i64 [[BC_RESUME_VAL44]], i64 0
 ; CHECK-NEXT:    [[DOTSPLAT68:%.*]] = shufflevector <8 x i64> [[DOTSPLATINSERT67]], <8 x i64> poison, <8 x i32> zeroinitializer
 ; CHECK-NEXT:    [[INDUCTION69:%.*]] = add <8 x i64> [[DOTSPLAT68]], <i64 0, i64 2, i64 4, i64 6, i64 8, i64 10, i64 12, i64 14>
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT72:%.*]] = insertelement <8 x i1> poison, i1 [[TOBOOL6]], i64 0
-; CHECK-NEXT:    [[BROADCAST_SPLAT73:%.*]] = shufflevector <8 x i1> [[BROADCAST_SPLATINSERT72]], <8 x i1> poison, <8 x i32> zeroinitializer
-; CHECK-NEXT:    br label [[VEC_EPILOG_VECTOR_BODY58:%.*]]
-; CHECK:       vec.epilog.vector.body58:
-; CHECK-NEXT:    [[INDEX61:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL51]], [[VEC_EPILOG_PH47]] ], [ [[INDEX_NEXT74:%.*]], [[VEC_EPILOG_VECTOR_BODY58]] ]
-; CHECK-NEXT:    [[VEC_IND65:%.*]] = phi <8 x i64> [ [[INDUCTION64]], [[VEC_EPILOG_PH47]] ], [ [[VEC_IND_NEXT66:%.*]], [[VEC_EPILOG_VECTOR_BODY58]] ]
-; CHECK-NEXT:    [[VEC_IND70:%.*]] = phi <8 x i64> [ [[INDUCTION69]], [[VEC_EPILOG_PH47]] ], [ [[VEC_IND_NEXT71:%.*]], [[VEC_EPILOG_VECTOR_BODY58]] ]
+; CHECK-NEXT:    br label [[VEC_EPILOG_VECTOR_BODY50:%.*]]
+; CHECK:       vec.epilog.vector.body52:
+; CHECK-NEXT:    [[INDEX61:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL51]], [[VEC_EPILOG_PH42]] ], [ [[INDEX_NEXT74:%.*]], [[VEC_EPILOG_VECTOR_BODY50]] ]
+; CHECK-NEXT:    [[VEC_IND65:%.*]] = phi <8 x i64> [ [[INDUCTION64]], [[VEC_EPILOG_PH42]] ], [ [[VEC_IND_NEXT66:%.*]], [[VEC_EPILOG_VECTOR_BODY50]] ]
+; CHECK-NEXT:    [[VEC_IND70:%.*]] = phi <8 x i64> [ [[INDUCTION69]], [[VEC_EPILOG_PH42]] ], [ [[VEC_IND_NEXT71:%.*]], [[VEC_EPILOG_VECTOR_BODY50]] ]
 ; CHECK-NEXT:    [[TMP44:%.*]] = sub nsw <8 x i64> splat (i64 8), [[VEC_IND65]]
 ; CHECK-NEXT:    [[TMP45:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr @d, i64 0, <8 x i64> [[VEC_IND65]]
 ; CHECK-NEXT:    [[TMP46:%.*]] = add nsw <8 x i64> [[TMP44]], [[VEC_IND70]]
 ; CHECK-NEXT:    [[TMP47:%.*]] = getelementptr inbounds [10 x i32], <8 x ptr> [[TMP45]], <8 x i64> [[TMP46]], i64 0
-; CHECK-NEXT:    [[TMP48:%.*]] = xor <8 x i1> [[BROADCAST_SPLAT73]], splat (i1 true)
 ; CHECK-NEXT:    call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> splat (i32 8), <8 x ptr> [[TMP47]], i32 16, <8 x i1> [[TMP48]])
 ; CHECK-NEXT:    [[TMP49:%.*]] = or disjoint <8 x i64> [[VEC_IND70]], splat (i64 1)
 ; CHECK-NEXT:    [[TMP50:%.*]] = add nsw <8 x i64> [[TMP44]], [[TMP49]]
@@ -205,17 +205,17 @@ define void @_Z3fn1v() #0 {
 ; CHECK-NEXT:    [[VEC_IND_NEXT66]] = add <8 x i64> [[VEC_IND65]], splat (i64 16)
 ; CHECK-NEXT:    [[VEC_IND_NEXT71]] = add <8 x i64> [[VEC_IND70]], splat (i64 16)
 ; CHECK-NEXT:    [[TMP55:%.*]] = icmp eq i64 [[INDEX_NEXT74]], [[N_VEC53]]
-; CHECK-NEXT:    br i1 [[TMP55]], label [[VEC_EPILOG_MIDDLE_BLOCK46:%.*]], label [[VEC_EPILOG_VECTOR_BODY58]], !llvm.loop [[LOOP5:![0-9]+]]
-; CHECK:       vec.epilog.middle.block46:
-; CHECK-NEXT:    [[CMP_N75:%.*]] = icmp eq i64 [[TMP28]], [[N_VEC53]]
-; CHECK-NEXT:    br i1 [[CMP_N75]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[VEC_EPILOG_SCALAR_PH46]]
-; CHECK:       vec.epilog.scalar.ph47:
-; CHECK-NEXT:    [[BC_RESUME_VAL56:%.*]] = phi i64 [ [[IND_END54]], [[VEC_EPILOG_MIDDLE_BLOCK46]] ], [ 8, [[ITER_CHECK27]] ], [ [[IND_END55]], [[VEC_EPILOG_ITER_CHECK48]] ]
-; CHECK-NEXT:    [[BC_RESUME_VAL59:%.*]] = phi i64 [ [[IND_END57]], [[VEC_EPILOG_MIDDLE_BLOCK46]] ], [ 0, [[ITER_CHECK27]] ], [ [[IND_END58]], [[VEC_EPILOG_ITER_CHECK48]] ]
+; CHECK-NEXT:    br i1 [[TMP55]], label [[VEC_EPILOG_MIDDLE_BLOCK40:%.*]], label [[VEC_EPILOG_VECTOR_BODY50]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK:       vec.epilog.middle.block40:
+; CHECK-NEXT:    [[CMP_N65:%.*]] = icmp eq i64 [[TMP28]], [[N_VEC53]]
+; CHECK-NEXT:    br i1 [[CMP_N65]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[VEC_EPILOG_SCALAR_PH41]]
+; CHECK:       vec.epilog.scalar.ph41:
+; CHECK-NEXT:    [[BC_RESUME_VAL56:%.*]] = phi i64 [ [[IND_END54]], [[VEC_EPILOG_MIDDLE_BLOCK40]] ], [ 8, [[ITER_CHECK27]] ], [ [[IND_END55]], [[VEC_EPILOG_ITER_CHECK43]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL67:%.*]] = phi i64 [ [[IND_END57]], [[VEC_EPILOG_MIDDLE_BLOCK40]] ], [ 0, [[ITER_CHECK27]] ], [ [[IND_END58]], [[VEC_EPILOG_ITER_CHECK43]] ]
 ; CHECK-NEXT:    br label [[FOR_BODY_US:%.*]]
 ; CHECK:       for.body.us:
-; CHECK-NEXT:    [[INDVARS_IV78:%.*]] = phi i64 [ [[INDVARS_IV_NEXT79:%.*]], [[FOR_COND_CLEANUP4_US_LCSSA_US_US:%.*]] ], [ [[BC_RESUME_VAL56]], [[VEC_EPILOG_SCALAR_PH46]] ]
-; CHECK-NEXT:    [[INDVARS_IV70:%.*]] = phi i64 [ [[INDVARS_IV_NEXT71:%.*]], [[FOR_COND_CLEANUP4_US_LCSSA_US_US]] ], [ [[BC_RESUME_VAL59]], [[VEC_EPILOG_SCALAR_PH46]] ]
+; CHECK-NEXT:    [[INDVARS_IV78:%.*]] = phi i64 [ [[INDVARS_IV_NEXT79:%.*]], [[FOR_COND_CLEANUP4_US_LCSSA_US_US:%.*]] ], [ [[BC_RESUME_VAL56]], [[VEC_EPILOG_SCALAR_PH41]] ]
+; CHECK-NEXT:    [[INDVARS_IV70:%.*]] = phi i64 [ [[INDVARS_IV_NEXT71:%.*]], [[FOR_COND_CLEANUP4_US_LCSSA_US_US]] ], [ [[BC_RESUME_VAL67]], [[VEC_EPILOG_SCALAR_PH41]] ]
 ; CHECK-NEXT:    [[TMP56:%.*]] = sub nsw i64 8, [[INDVARS_IV78]]
 ; CHECK-NEXT:    [[ADD_PTR_US:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr @d, i64 0, i64 [[INDVARS_IV78]]
 ; CHECK-NEXT:    [[TMP57:%.*]] = add nsw i64 [[TMP56]], [[INDVARS_IV70]]
diff --git a/llvm/test/Transforms/LoopVectorize/X86/small-size.ll b/llvm/test/Transforms/LoopVectorize/X86/small-size.ll
index 55ff26c..c9132ba 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/small-size.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/small-size.ll
@@ -142,8 +142,8 @@ define void @example2(i32 %n, i32 %x) optsize {
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT19:%.*]] = insertelement <4 x i64> poison, i64 [[TRIP_COUNT_MINUS_114]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT20:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT19]], <4 x i64> poison, <4 x i32> zeroinitializer
 ; CHECK-NEXT:    br label [[VECTOR_BODY15:%.*]]
-; CHECK:       vector.body15:
-; CHECK-NEXT:    [[INDEX16:%.*]] = phi i64 [ 0, [[VECTOR_PH9]] ], [ [[INDEX_NEXT29:%.*]], [[PRED_STORE_CONTINUE28:%.*]] ]
+; CHECK:       vector.body14:
+; CHECK-NEXT:    [[INDEX16:%.*]] = phi i64 [ 0, [[VECTOR_PH9]] ], [ [[INDEX_NEXT29:%.*]], [[PRED_STORE_CONTINUE78:%.*]] ]
 ; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = add i64 [[I_0_LCSSA]], [[INDEX16]]
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT17:%.*]] = insertelement <4 x i64> poison, i64 [[INDEX16]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT18:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT17]], <4 x i64> poison, <4 x i32> zeroinitializer
@@ -151,7 +151,7 @@ define void @example2(i32 %n, i32 %x) optsize {
 ; CHECK-NEXT:    [[TMP18:%.*]] = icmp ule <4 x i64> [[VEC_IV]], [[BROADCAST_SPLAT20]]
 ; CHECK-NEXT:    [[TMP19:%.*]] = extractelement <4 x i1> [[TMP18]], i64 0
 ; CHECK-NEXT:    br i1 [[TMP19]], label [[PRED_STORE_IF21:%.*]], label [[PRED_STORE_CONTINUE22:%.*]]
-; CHECK:       pred.store.if21:
+; CHECK:       pred.store.if20:
 ; CHECK-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [2048 x i32], ptr @b, i64 0, i64 [[OFFSET_IDX]]
 ; CHECK-NEXT:    [[TMP21:%.*]] = load i32, ptr [[TMP20]], align 4
 ; CHECK-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [2048 x i32], ptr @c, i64 0, i64 [[OFFSET_IDX]]
@@ -160,10 +160,10 @@ define void @example2(i32 %n, i32 %x) optsize {
 ; CHECK-NEXT:    [[TMP25:%.*]] = and i32 [[TMP23]], [[TMP21]]
 ; CHECK-NEXT:    store i32 [[TMP25]], ptr [[TMP24]], align 4
 ; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE22]]
-; CHECK:       pred.store.continue22:
+; CHECK:       pred.store.continue21:
 ; CHECK-NEXT:    [[TMP26:%.*]] = extractelement <4 x i1> [[TMP18]], i64 1
 ; CHECK-NEXT:    br i1 [[TMP26]], label [[PRED_STORE_IF23:%.*]], label [[PRED_STORE_CONTINUE24:%.*]]
-; CHECK:       pred.store.if23:
+; CHECK:       pred.store.if22:
 ; CHECK-NEXT:    [[TMP27:%.*]] = add i64 [[OFFSET_IDX]], 1
 ; CHECK-NEXT:    [[TMP28:%.*]] = getelementptr inbounds [2048 x i32], ptr @b, i64 0, i64 [[TMP27]]
 ; CHECK-NEXT:    [[TMP29:%.*]] = load i32, ptr [[TMP28]], align 4
@@ -173,10 +173,10 @@ define void @example2(i32 %n, i32 %x) optsize {
 ; CHECK-NEXT:    [[TMP33:%.*]] = and i32 [[TMP31]], [[TMP29]]
 ; CHECK-NEXT:    store i32 [[TMP33]], ptr [[TMP32]], align 4
 ; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE24]]
-; CHECK:       pred.store.continue24:
+; CHECK:       pred.store.continue23:
 ; CHECK-NEXT:    [[TMP34:%.*]] = extractelement <4 x i1> [[TMP18]], i64 2
 ; CHECK-NEXT:    br i1 [[TMP34]], label [[PRED_STORE_IF25:%.*]], label [[PRED_STORE_CONTINUE26:%.*]]
-; CHECK:       pred.store.if25:
+; CHECK:       pred.store.if24:
 ; CHECK-NEXT:    [[TMP35:%.*]] = add i64 [[OFFSET_IDX]], 2
 ; CHECK-NEXT:    [[TMP36:%.*]] = getelementptr inbounds [2048 x i32], ptr @b, i64 0, i64 [[TMP35]]
 ; CHECK-NEXT:    [[TMP37:%.*]] = load i32, ptr [[TMP36]], align 4
@@ -186,10 +186,10 @@ define void @example2(i32 %n, i32 %x) optsize {
 ; CHECK-NEXT:    [[TMP41:%.*]] = and i32 [[TMP39]], [[TMP37]]
 ; CHECK-NEXT:    store i32 [[TMP41]], ptr [[TMP40]], align 4
 ; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE26]]
-; CHECK:       pred.store.continue26:
+; CHECK:       pred.store.continue25:
 ; CHECK-NEXT:    [[TMP42:%.*]] = extractelement <4 x i1> [[TMP18]], i64 3
-; CHECK-NEXT:    br i1 [[TMP42]], label [[PRED_STORE_IF27:%.*]], label [[PRED_STORE_CONTINUE28]]
-; CHECK:       pred.store.if27:
+; CHECK-NEXT:    br i1 [[TMP42]], label [[PRED_STORE_IF26:%.*]], label [[PRED_STORE_CONTINUE78]]
+; CHECK:       pred.store.if26:
 ; CHECK-NEXT:    [[TMP43:%.*]] = add i64 [[OFFSET_IDX]], 3
 ; CHECK-NEXT:    [[TMP44:%.*]] = getelementptr inbounds [2048 x i32], ptr @b, i64 0, i64 [[TMP43]]
 ; CHECK-NEXT:    [[TMP45:%.*]] = load i32, ptr [[TMP44]], align 4
@@ -198,8 +198,8 @@ define void @example2(i32 %n, i32 %x) optsize {
 ; CHECK-NEXT:    [[TMP48:%.*]] = getelementptr inbounds [2048 x i32], ptr @a, i64 0, i64 [[TMP43]]
 ; CHECK-NEXT:    [[TMP49:%.*]] = and i32 [[TMP47]], [[TMP45]]
 ; CHECK-NEXT:    store i32 [[TMP49]], ptr [[TMP48]], align 4
-; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE28]]
-; CHECK:       pred.store.continue28:
+; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE78]]
+; CHECK:       pred.store.continue27:
 ; CHECK-NEXT:    [[INDEX_NEXT29]] = add nuw i64 [[INDEX16]], 4
 ; CHECK-NEXT:    [[TMP50:%.*]] = icmp eq i64 [[INDEX_NEXT29]], [[N_VEC12]]
 ; CHECK-NEXT:    br i1 [[TMP50]], label [[MIDDLE_BLOCK7:%.*]], label [[VECTOR_BODY15]], !llvm.loop [[LOOP5:![0-9]+]]
@@ -273,7 +273,7 @@ define void @example3(i32 %n, ptr noalias nocapture %p, ptr noalias nocapture %q
 ; CHECK-NEXT:    [[BROADCAST_SPLAT12:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT11]], <4 x i64> poison, <4 x i32> zeroinitializer
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE18:%.*]] ]
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE16:%.*]] ]
 ; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = shl i64 [[INDEX]], 2
 ; CHECK-NEXT:    [[OFFSET_IDX6:%.*]] = shl i64 [[INDEX]], 2
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[INDEX]], i64 0
@@ -290,38 +290,38 @@ define void @example3(i32 %n, ptr noalias nocapture %p, ptr noalias nocapture %q
 ; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE]]
 ; CHECK:       pred.store.continue:
 ; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <4 x i1> [[TMP3]], i64 1
-; CHECK-NEXT:    br i1 [[TMP6]], label [[PRED_STORE_IF13:%.*]], label [[PRED_STORE_CONTINUE14:%.*]]
-; CHECK:       pred.store.if13:
+; CHECK-NEXT:    br i1 [[TMP6]], label [[PRED_STORE_IF11:%.*]], label [[PRED_STORE_CONTINUE12:%.*]]
+; CHECK:       pred.store.if11:
 ; CHECK-NEXT:    [[TMP7:%.*]] = or disjoint i64 [[OFFSET_IDX]], 4
 ; CHECK-NEXT:    [[NEXT_GEP3:%.*]] = getelementptr i8, ptr [[P]], i64 [[TMP7]]
 ; CHECK-NEXT:    [[TMP8:%.*]] = or disjoint i64 [[OFFSET_IDX6]], 4
 ; CHECK-NEXT:    [[NEXT_GEP8:%.*]] = getelementptr i8, ptr [[Q]], i64 [[TMP8]]
 ; CHECK-NEXT:    [[TMP9:%.*]] = load i32, ptr [[NEXT_GEP8]], align 16
 ; CHECK-NEXT:    store i32 [[TMP9]], ptr [[NEXT_GEP3]], align 16
-; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE14]]
-; CHECK:       pred.store.continue14:
+; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE12]]
+; CHECK:       pred.store.continue12:
 ; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <4 x i1> [[TMP3]], i64 2
-; CHECK-NEXT:    br i1 [[TMP10]], label [[PRED_STORE_IF15:%.*]], label [[PRED_STORE_CONTINUE16:%.*]]
-; CHECK:       pred.store.if15:
+; CHECK-NEXT:    br i1 [[TMP10]], label [[PRED_STORE_IF13:%.*]], label [[PRED_STORE_CONTINUE14:%.*]]
+; CHECK:       pred.store.if13:
 ; CHECK-NEXT:    [[TMP11:%.*]] = or disjoint i64 [[OFFSET_IDX]], 8
 ; CHECK-NEXT:    [[NEXT_GEP4:%.*]] = getelementptr i8, ptr [[P]], i64 [[TMP11]]
 ; CHECK-NEXT:    [[TMP12:%.*]] = or disjoint i64 [[OFFSET_IDX6]], 8
 ; CHECK-NEXT:    [[NEXT_GEP9:%.*]] = getelementptr i8, ptr [[Q]], i64 [[TMP12]]
 ; CHECK-NEXT:    [[TMP13:%.*]] = load i32, ptr [[NEXT_GEP9]], align 16
 ; CHECK-NEXT:    store i32 [[TMP13]], ptr [[NEXT_GEP4]], align 16
-; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE16]]
-; CHECK:       pred.store.continue16:
+; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE14]]
+; CHECK:       pred.store.continue14:
 ; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <4 x i1> [[TMP3]], i64 3
-; CHECK-NEXT:    br i1 [[TMP14]], label [[PRED_STORE_IF17:%.*]], label [[PRED_STORE_CONTINUE18]]
-; CHECK:       pred.store.if17:
+; CHECK-NEXT:    br i1 [[TMP14]], label [[PRED_STORE_IF15:%.*]], label [[PRED_STORE_CONTINUE16]]
+; CHECK:       pred.store.if15:
 ; CHECK-NEXT:    [[TMP15:%.*]] = or disjoint i64 [[OFFSET_IDX]], 12
 ; CHECK-NEXT:    [[NEXT_GEP5:%.*]] = getelementptr i8, ptr [[P]], i64 [[TMP15]]
 ; CHECK-NEXT:    [[TMP16:%.*]] = or disjoint i64 [[OFFSET_IDX6]], 12
 ; CHECK-NEXT:    [[NEXT_GEP10:%.*]] = getelementptr i8, ptr [[Q]], i64 [[TMP16]]
 ; CHECK-NEXT:    [[TMP17:%.*]] = load i32, ptr [[NEXT_GEP10]], align 16
 ; CHECK-NEXT:    store i32 [[TMP17]], ptr [[NEXT_GEP5]], align 16
-; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE18]]
-; CHECK:       pred.store.continue18:
+; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE16]]
+; CHECK:       pred.store.continue16:
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; CHECK-NEXT:    [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
@@ -470,8 +470,8 @@ define void @example23c(ptr noalias nocapture %src, ptr noalias nocapture %dst)
 ; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE]]
 ; CHECK:       pred.store.continue:
 ; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <4 x i1> [[TMP1]], i64 1
-; CHECK-NEXT:    br i1 [[TMP6]], label [[PRED_STORE_IF10:%.*]], label [[PRED_STORE_CONTINUE11:%.*]]
-; CHECK:       pred.store.if10:
+; CHECK-NEXT:    br i1 [[TMP6]], label [[PRED_STORE_IF9:%.*]], label [[PRED_STORE_CONTINUE10:%.*]]
+; CHECK:       pred.store.if9:
 ; CHECK-NEXT:    [[TMP7:%.*]] = or disjoint i64 [[OFFSET_IDX5]], 4
 ; CHECK-NEXT:    [[NEXT_GEP7:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP7]]
 ; CHECK-NEXT:    [[TMP8:%.*]] = or disjoint i64 [[OFFSET_IDX]], 2
@@ -480,11 +480,11 @@ define void @example23c(ptr noalias nocapture %src, ptr noalias nocapture %dst)
 ; CHECK-NEXT:    [[TMP10:%.*]] = zext i16 [[TMP9]] to i32
 ; CHECK-NEXT:    [[TMP11:%.*]] = shl nuw nsw i32 [[TMP10]], 7
 ; CHECK-NEXT:    store i32 [[TMP11]], ptr [[NEXT_GEP7]], align 4
-; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE11]]
-; CHECK:       pred.store.continue11:
+; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE10]]
+; CHECK:       pred.store.continue10:
 ; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <4 x i1> [[TMP1]], i64 2
-; CHECK-NEXT:    br i1 [[TMP12]], label [[PRED_STORE_IF12:%.*]], label [[PRED_STORE_CONTINUE13:%.*]]
-; CHECK:       pred.store.if12:
+; CHECK-NEXT:    br i1 [[TMP12]], label [[PRED_STORE_IF12:%.*]], label [[PRED_STORE_CONTINUE12:%.*]]
+; CHECK:       pred.store.if11:
 ; CHECK-NEXT:    [[TMP13:%.*]] = or disjoint i64 [[OFFSET_IDX5]], 8
 ; CHECK-NEXT:    [[NEXT_GEP8:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP13]]
 ; CHECK-NEXT:    [[TMP14:%.*]] = or disjoint i64 [[OFFSET_IDX]], 4
@@ -493,11 +493,11 @@ define void @example23c(ptr noalias nocapture %src, ptr noalias nocapture %dst)
 ; CHECK-NEXT:    [[TMP16:%.*]] = zext i16 [[TMP15]] to i32
 ; CHECK-NEXT:    [[TMP17:%.*]] = shl nuw nsw i32 [[TMP16]], 7
 ; CHECK-NEXT:    store i32 [[TMP17]], ptr [[NEXT_GEP8]], align 4
-; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE13]]
-; CHECK:       pred.store.continue13:
+; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE12]]
+; CHECK:       pred.store.continue12:
 ; CHECK-NEXT:    [[TMP18:%.*]] = extractelement <4 x i1> [[TMP1]], i64 3
-; CHECK-NEXT:    br i1 [[TMP18]], label [[PRED_STORE_IF14:%.*]], label [[PRED_STORE_CONTINUE15]]
-; CHECK:       pred.store.if14:
+; CHECK-NEXT:    br i1 [[TMP18]], label [[PRED_STORE_IF13:%.*]], label [[PRED_STORE_CONTINUE15]]
+; CHECK:       pred.store.if13:
 ; CHECK-NEXT:    [[TMP19:%.*]] = or disjoint i64 [[OFFSET_IDX5]], 12
 ; CHECK-NEXT:    [[NEXT_GEP9:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP19]]
 ; CHECK-NEXT:    [[TMP20:%.*]] = or disjoint i64 [[OFFSET_IDX]], 6
@@ -507,7 +507,7 @@ define void @example23c(ptr noalias nocapture %src, ptr noalias nocapture %dst)
 ; CHECK-NEXT:    [[TMP23:%.*]] = shl nuw nsw i32 [[TMP22]], 7
 ; CHECK-NEXT:    store i32 [[TMP23]], ptr [[NEXT_GEP9]], align 4
 ; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE15]]
-; CHECK:       pred.store.continue15:
+; CHECK:       pred.store.continue14:
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; CHECK-NEXT:    [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], 260
 ; CHECK-NEXT:    br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
diff --git a/llvm/test/Transforms/LoopVectorize/X86/uint64_to_fp64-cost-model.ll b/llvm/test/Transforms/LoopVectorize/X86/uint64_to_fp64-cost-model.ll
index b8dcfd3..8661d86f 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/uint64_to_fp64-cost-model.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/uint64_to_fp64-cost-model.ll
@@ -6,8 +6,8 @@ target triple = "x86_64-apple-macosx10.8.0"
 
 
 ; CHECK: cost of 4 for VF 1 For instruction:   %conv = uitofp i64 %tmp to double
-; CHECK: Cost of 5 for VF 2: WIDEN-CAST ir<%conv> = uitofp  ir<%tmp> to double
-; CHECK: Cost of 10 for VF 4: WIDEN-CAST ir<%conv> = uitofp  ir<%tmp> to double
+; CHECK: Cost of 5 for VF 2: WIDEN-CAST ir<%conv> = uitofp ir<%tmp> to double
+; CHECK: Cost of 10 for VF 4: WIDEN-CAST ir<%conv> = uitofp ir<%tmp> to double
 define void @uint64_to_double_cost(ptr noalias nocapture %a, ptr noalias nocapture readonly %b) nounwind {
 entry:
   br label %for.body
diff --git a/llvm/test/Transforms/LoopVectorize/blend-in-header.ll b/llvm/test/Transforms/LoopVectorize/blend-in-header.ll
index 4c95584..2fea016 100644
--- a/llvm/test/Transforms/LoopVectorize/blend-in-header.ll
+++ b/llvm/test/Transforms/LoopVectorize/blend-in-header.ll
@@ -171,11 +171,11 @@ define i64 @invar_cond_incoming_ops_reordered(i1 %c) {
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i1> poison, i1 [[C]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i1> [[BROADCAST_SPLATINSERT]], <4 x i1> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP1:%.*]] = xor <4 x i1> [[BROADCAST_SPLAT]], splat (i1 true)
+; CHECK-NEXT:    [[PREDPHI:%.*]] = select <4 x i1> [[TMP1]], <4 x i64> splat (i64 1), <4 x i64> zeroinitializer
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP1:%.*]] = xor <4 x i1> [[BROADCAST_SPLAT]], splat (i1 true)
-; CHECK-NEXT:    [[PREDPHI:%.*]] = select <4 x i1> [[TMP1]], <4 x i64> splat (i64 1), <4 x i64> zeroinitializer
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
 ; CHECK-NEXT:    [[TMP0:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1000
 ; CHECK-NEXT:    br i1 [[TMP0]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
diff --git a/llvm/test/Transforms/LoopVectorize/branch-weights.ll b/llvm/test/Transforms/LoopVectorize/branch-weights.ll
index 6e7efe0..e11f77d 100644
--- a/llvm/test/Transforms/LoopVectorize/branch-weights.ll
+++ b/llvm/test/Transforms/LoopVectorize/branch-weights.ll
@@ -33,7 +33,7 @@
 ; CHECK:   br i1 {{.+}}, label %vec.epilog.middle.block, label %vec.epilog.vector.body, !prof [[PROF_F0_VEC_EPILOG_VECTOR_BODY:![0-9]+]]
 ;
 ; CHECK: vec.epilog.middle.block:
-; CHECK:   br i1 %cmp.n9, label %exit.loopexit, label %vec.epilog.scalar.ph, !prof [[PROF_F0_MIDDLE_BLOCKS:![0-9]+]]
+; CHECK:   br i1 %cmp.n{{.+}}, label %exit.loopexit, label %vec.epilog.scalar.ph, !prof [[PROF_F0_MIDDLE_BLOCKS:![0-9]+]]
 ;
 ; CHECK: vec.epilog.scalar.ph:
 ; CHECK:   br label %loop
diff --git a/llvm/test/Transforms/LoopVectorize/debugloc-optimize-vfuf-term.ll b/llvm/test/Transforms/LoopVectorize/debugloc-optimize-vfuf-term.ll
index 04ce956..0f34f62 100644
--- a/llvm/test/Transforms/LoopVectorize/debugloc-optimize-vfuf-term.ll
+++ b/llvm/test/Transforms/LoopVectorize/debugloc-optimize-vfuf-term.ll
@@ -11,22 +11,20 @@ define i32 @foo(ptr %p) {
 ; CHECK:       [[VECTOR_PH]]:
 ; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; CHECK:       [[VECTOR_BODY]]:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ], !dbg [[DBG3:![0-9]+]]
-; CHECK-NEXT:    store i8 0, ptr [[P]], align 1, !dbg [[DBG7:![0-9]+]]
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2, !dbg [[DBG3]]
-; CHECK-NEXT:    br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !dbg [[DBG3]], !llvm.loop [[LOOP8:![0-9]+]]
+; CHECK-NEXT:    store i8 0, ptr [[P]], align 1, !dbg [[DBG3:![0-9]+]]
+; CHECK-NEXT:    br label %[[MIDDLE_BLOCK:.*]], !dbg [[DBG7:![0-9]+]]
 ; CHECK:       [[MIDDLE_BLOCK]]:
-; CHECK-NEXT:    br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]], !dbg [[DBG11:![0-9]+]]
+; CHECK-NEXT:    br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]], !dbg [[DBG8:![0-9]+]]
 ; CHECK:       [[SCALAR_PH]]:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 2, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ], !dbg [[DBG3]]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 2, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ], !dbg [[DBG9:![0-9]+]]
 ; CHECK-NEXT:    br label %[[LOOP:.*]]
 ; CHECK:       [[LOOP]]:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[LOOP]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], !dbg [[DBG3]]
-; CHECK-NEXT:    [[CONV:%.*]] = trunc i64 0 to i8, !dbg [[DBG12:![0-9]+]]
-; CHECK-NEXT:    store i8 [[CONV]], ptr [[P]], align 1, !dbg [[DBG7]]
-; CHECK-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1, !dbg [[DBG13:![0-9]+]]
-; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[IV]], 1, !dbg [[DBG14:![0-9]+]]
-; CHECK-NEXT:    br i1 [[EXITCOND]], label %[[EXIT]], label %[[LOOP]], !dbg [[DBG11]], !llvm.loop [[LOOP15:![0-9]+]]
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[LOOP]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], !dbg [[DBG9]]
+; CHECK-NEXT:    [[CONV:%.*]] = trunc i64 0 to i8, !dbg [[DBG7]]
+; CHECK-NEXT:    store i8 [[CONV]], ptr [[P]], align 1, !dbg [[DBG3]]
+; CHECK-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1, !dbg [[DBG10:![0-9]+]]
+; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[IV]], 1, !dbg [[DBG11:![0-9]+]]
+; CHECK-NEXT:    br i1 [[EXITCOND]], label %[[EXIT]], label %[[LOOP]], !dbg [[DBG8]], !llvm.loop [[LOOP12:![0-9]+]]
 ; CHECK:       [[EXIT]]:
 ; CHECK-NEXT:    ret i32 0
 ;
@@ -63,17 +61,16 @@ exit:                              ; preds = %loop
 ;.
 ; CHECK: [[META0:![0-9]+]] = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: [[META1:![0-9]+]], producer: "{{.*}}clang version {{.*}}", isOptimized: false, runtimeVersion: 0, emissionKind: NoDebug)
 ; CHECK: [[META1]] = !DIFile(filename: "test.cpp", directory: {{.*}})
-; CHECK: [[DBG3]] = !DILocation(line: 4, scope: [[META4:![0-9]+]])
+; CHECK: [[DBG3]] = !DILocation(line: 6, scope: [[META4:![0-9]+]])
 ; CHECK: [[META4]] = distinct !DISubprogram(name: "foo", scope: [[META1]], file: [[META1]], line: 11, type: [[META5:![0-9]+]], spFlags: DISPFlagDefinition, unit: [[META0]], retainedNodes: [[META6:![0-9]+]])
 ; CHECK: [[META5]] = distinct !DISubroutineType(types: [[META6]])
 ; CHECK: [[META6]] = !{}
-; CHECK: [[DBG7]] = !DILocation(line: 6, scope: [[META4]])
-; CHECK: [[LOOP8]] = distinct !{[[LOOP8]], [[META9:![0-9]+]], [[META10:![0-9]+]]}
-; CHECK: [[META9]] = !{!"llvm.loop.isvectorized", i32 1}
-; CHECK: [[META10]] = !{!"llvm.loop.unroll.runtime.disable"}
-; CHECK: [[DBG11]] = !DILocation(line: 9, scope: [[META4]])
-; CHECK: [[DBG12]] = !DILocation(line: 5, scope: [[META4]])
-; CHECK: [[DBG13]] = !DILocation(line: 7, scope: [[META4]])
-; CHECK: [[DBG14]] = !DILocation(line: 8, scope: [[META4]])
-; CHECK: [[LOOP15]] = distinct !{[[LOOP15]], [[META10]], [[META9]]}
+; CHECK: [[DBG7]] = !DILocation(line: 5, scope: [[META4]])
+; CHECK: [[DBG8]] = !DILocation(line: 9, scope: [[META4]])
+; CHECK: [[DBG9]] = !DILocation(line: 4, scope: [[META4]])
+; CHECK: [[DBG10]] = !DILocation(line: 7, scope: [[META4]])
+; CHECK: [[DBG11]] = !DILocation(line: 8, scope: [[META4]])
+; CHECK: [[LOOP12]] = distinct !{[[LOOP12]], [[META13:![0-9]+]], [[META14:![0-9]+]]}
+; CHECK: [[META13]] = !{!"llvm.loop.unroll.runtime.disable"}
+; CHECK: [[META14]] = !{!"llvm.loop.isvectorized", i32 1}
 ;.
diff --git a/llvm/test/Transforms/LoopVectorize/dereferenceable-info-from-assumption-constant-size.ll b/llvm/test/Transforms/LoopVectorize/dereferenceable-info-from-assumption-constant-size.ll
index 8baff7b..1333451 100644
--- a/llvm/test/Transforms/LoopVectorize/dereferenceable-info-from-assumption-constant-size.ll
+++ b/llvm/test/Transforms/LoopVectorize/dereferenceable-info-from-assumption-constant-size.ll
@@ -104,6 +104,83 @@ exit:
   ret void
 }
 
+define void @deref_assumption_in_header_constant_trip_count_loop_invariant_ptr(ptr noalias %a, ptr noalias %b, ptr noalias %c) {
+; CHECK-LABEL: define void @deref_assumption_in_header_constant_trip_count_loop_invariant_ptr(
+; CHECK-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    call void @llvm.assume(i1 true) [ "align"(ptr [[A]], i64 4), "dereferenceable"(ptr [[A]], i64 4) ]
+; CHECK-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK:       [[VECTOR_PH]]:
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP2]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp sge <2 x i32> [[WIDE_LOAD]], zeroinitializer
+; CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr [[A]], align 4
+; CHECK-NEXT:    [[TMP5:%.*]] = load i32, ptr [[A]], align 4
+; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x i32> poison, i32 [[TMP4]], i32 0
+; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <2 x i32> [[TMP6]], i32 [[TMP5]], i32 1
+; CHECK-NEXT:    [[PREDPHI:%.*]] = select <2 x i1> [[TMP3]], <2 x i32> [[WIDE_LOAD]], <2 x i32> [[TMP7]]
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i32 0
+; CHECK-NEXT:    store <2 x i32> [[PREDPHI]], ptr [[TMP9]], align 4
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; CHECK-NEXT:    [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
+; CHECK-NEXT:    br i1 [[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK:       [[MIDDLE_BLOCK]]:
+; CHECK-NEXT:    br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK:       [[SCALAR_PH]]:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT:    br label %[[LOOP_HEADER:.*]]
+; CHECK:       [[LOOP_HEADER]]:
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ]
+; CHECK-NEXT:    [[GEP_B:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV]]
+; CHECK-NEXT:    [[L_B:%.*]] = load i32, ptr [[GEP_B]], align 4
+; CHECK-NEXT:    [[C_1:%.*]] = icmp sge i32 [[L_B]], 0
+; CHECK-NEXT:    br i1 [[C_1]], label %[[LOOP_LATCH]], label %[[LOOP_THEN:.*]]
+; CHECK:       [[LOOP_THEN]]:
+; CHECK-NEXT:    [[L_A:%.*]] = load i32, ptr [[A]], align 4
+; CHECK-NEXT:    br label %[[LOOP_LATCH]]
+; CHECK:       [[LOOP_LATCH]]:
+; CHECK-NEXT:    [[MERGE:%.*]] = phi i32 [ [[L_A]], %[[LOOP_THEN]] ], [ [[L_B]], %[[LOOP_HEADER]] ]
+; CHECK-NEXT:    [[GEP_C:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[IV]]
+; CHECK-NEXT:    store i32 [[MERGE]], ptr [[GEP_C]], align 4
+; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; CHECK-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
+; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    ret void
+;
+entry:
+  call void @llvm.assume(i1 true) [ "align"(ptr %a, i64 4), "dereferenceable"(ptr %a, i64 4) ]
+  br label %loop.header
+
+loop.header:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop.latch ]
+  %gep.b = getelementptr inbounds i32, ptr %b, i64 %iv
+  %l.b = load i32, ptr %gep.b, align 4
+  %c.1 = icmp sge i32 %l.b, 0
+  br i1 %c.1, label %loop.latch, label %loop.then
+
+loop.then:
+  %l.a = load i32, ptr %a, align 4
+  br label %loop.latch
+
+loop.latch:
+  %merge = phi i32 [ %l.a, %loop.then ], [ %l.b, %loop.header ]
+  %gep.c = getelementptr inbounds i32, ptr %c, i64 %iv
+  store i32 %merge, ptr %gep.c, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %ec = icmp eq i64 %iv.next, 1000
+  br i1 %ec, label %exit, label %loop.header
+
+exit:
+  ret void
+}
+
 define void @deref_assumption_too_small_in_header_constant_trip_count(ptr noalias %a, ptr noalias %b, ptr noalias %c) {
 ; CHECK-LABEL: define void @deref_assumption_too_small_in_header_constant_trip_count(
 ; CHECK-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) {
@@ -150,7 +227,7 @@ define void @deref_assumption_too_small_in_header_constant_trip_count(ptr noalia
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
 ; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 2)
 ; CHECK-NEXT:    [[TMP32:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[TMP32]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK-NEXT:    br i1 [[TMP32]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
 ; CHECK:       [[MIDDLE_BLOCK]]:
 ; CHECK-NEXT:    br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
 ; CHECK:       [[SCALAR_PH]]:
@@ -173,7 +250,7 @@ define void @deref_assumption_too_small_in_header_constant_trip_count(ptr noalia
 ; CHECK-NEXT:    store i32 [[MERGE]], ptr [[GEP_C]], align 4
 ; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
 ; CHECK-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP7:![0-9]+]]
 ; CHECK:       [[EXIT]]:
 ; CHECK-NEXT:    ret void
 ;
@@ -251,7 +328,7 @@ define void @deref_assumption_in_header_constant_trip_count_align_1(ptr noalias
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
 ; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 2)
 ; CHECK-NEXT:    [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[TMP20]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK-NEXT:    br i1 [[TMP20]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
 ; CHECK:       [[MIDDLE_BLOCK]]:
 ; CHECK-NEXT:    br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
 ; CHECK:       [[SCALAR_PH]]:
@@ -274,7 +351,7 @@ define void @deref_assumption_in_header_constant_trip_count_align_1(ptr noalias
 ; CHECK-NEXT:    store i32 [[MERGE]], ptr [[GEP_C]], align 4
 ; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
 ; CHECK-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP7:![0-9]+]]
+; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP9:![0-9]+]]
 ; CHECK:       [[EXIT]]:
 ; CHECK-NEXT:    ret void
 ;
@@ -352,7 +429,7 @@ define void @deref_assumption_in_header_constant_trip_count_align_via_arg_attrib
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
 ; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 2)
 ; CHECK-NEXT:    [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[TMP20]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; CHECK-NEXT:    br i1 [[TMP20]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
 ; CHECK:       [[MIDDLE_BLOCK]]:
 ; CHECK-NEXT:    br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
 ; CHECK:       [[SCALAR_PH]]:
@@ -375,7 +452,7 @@ define void @deref_assumption_in_header_constant_trip_count_align_via_arg_attrib
 ; CHECK-NEXT:    store i32 [[MERGE]], ptr [[GEP_C]], align 4
 ; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
 ; CHECK-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP9:![0-9]+]]
+; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP11:![0-9]+]]
 ; CHECK:       [[EXIT]]:
 ; CHECK-NEXT:    ret void
 ;
@@ -453,7 +530,7 @@ define void @deref_assumption_in_header_constant_trip_count_align_not_known(ptr
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
 ; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 2)
 ; CHECK-NEXT:    [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[TMP20]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+; CHECK-NEXT:    br i1 [[TMP20]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
 ; CHECK:       [[MIDDLE_BLOCK]]:
 ; CHECK-NEXT:    br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
 ; CHECK:       [[SCALAR_PH]]:
@@ -476,7 +553,7 @@ define void @deref_assumption_in_header_constant_trip_count_align_not_known(ptr
 ; CHECK-NEXT:    store i32 [[MERGE]], ptr [[GEP_C]], align 4
 ; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
 ; CHECK-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP11:![0-9]+]]
+; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP13:![0-9]+]]
 ; CHECK:       [[EXIT]]:
 ; CHECK-NEXT:    ret void
 ;
@@ -550,7 +627,7 @@ define void @deref_assumption_in_then_constant_trip_count(ptr noalias %a, ptr no
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
 ; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 2)
 ; CHECK-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[TMP28]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
+; CHECK-NEXT:    br i1 [[TMP28]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
 ; CHECK:       [[MIDDLE_BLOCK]]:
 ; CHECK-NEXT:    br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
 ; CHECK:       [[SCALAR_PH]]:
@@ -573,7 +650,7 @@ define void @deref_assumption_in_then_constant_trip_count(ptr noalias %a, ptr no
 ; CHECK-NEXT:    store i32 [[MERGE]], ptr [[GEP_C]], align 4
 ; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
 ; CHECK-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP13:![0-9]+]]
+; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP15:![0-9]+]]
 ; CHECK:       [[EXIT]]:
 ; CHECK-NEXT:    ret void
 ;
@@ -653,7 +730,7 @@ define void @deref_assumption_in_latch_constant_trip_count(ptr noalias %a, ptr n
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
 ; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 2)
 ; CHECK-NEXT:    [[TMP32:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[TMP32]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
+; CHECK-NEXT:    br i1 [[TMP32]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
 ; CHECK:       [[MIDDLE_BLOCK]]:
 ; CHECK-NEXT:    br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
 ; CHECK:       [[SCALAR_PH]]:
@@ -676,7 +753,7 @@ define void @deref_assumption_in_latch_constant_trip_count(ptr noalias %a, ptr n
 ; CHECK-NEXT:    store i32 [[MERGE]], ptr [[GEP_C]], align 4
 ; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
 ; CHECK-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP15:![0-9]+]]
+; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP17:![0-9]+]]
 ; CHECK:       [[EXIT]]:
 ; CHECK-NEXT:    ret void
 ;
@@ -757,7 +834,7 @@ define void @deref_assumption_in_header_variable_trip_count(ptr noalias %a, ptr
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
 ; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 2)
 ; CHECK-NEXT:    [[TMP32:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP32]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
+; CHECK-NEXT:    br i1 [[TMP32]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
 ; CHECK:       [[MIDDLE_BLOCK]]:
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
@@ -781,7 +858,7 @@ define void @deref_assumption_in_header_variable_trip_count(ptr noalias %a, ptr
 ; CHECK-NEXT:    store i32 [[MERGE]], ptr [[GEP_C]], align 4
 ; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
 ; CHECK-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP17:![0-9]+]]
+; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP19:![0-9]+]]
 ; CHECK:       [[EXIT]]:
 ; CHECK-NEXT:    ret void
 ;
@@ -854,7 +931,7 @@ define void @deref_assumption_in_preheader_constant_trip_count_align_1(ptr noali
 ; CHECK-NEXT:    store <2 x i32> [[PREDPHI]], ptr [[TMP17]], align 4
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
 ; CHECK-NEXT:    [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[TMP18]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
+; CHECK-NEXT:    br i1 [[TMP18]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
 ; CHECK:       [[MIDDLE_BLOCK]]:
 ; CHECK-NEXT:    br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
 ; CHECK:       [[SCALAR_PH]]:
@@ -876,7 +953,7 @@ define void @deref_assumption_in_preheader_constant_trip_count_align_1(ptr noali
 ; CHECK-NEXT:    store i32 [[MERGE]], ptr [[GEP_C]], align 4
 ; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
 ; CHECK-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP19:![0-9]+]]
+; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP21:![0-9]+]]
 ; CHECK:       [[EXIT]]:
 ; CHECK-NEXT:    ret void
 ;
@@ -949,7 +1026,7 @@ define void @deref_assumption_too_small_in_preheader_constant_trip_count_align_1
 ; CHECK-NEXT:    store <2 x i32> [[PREDPHI]], ptr [[TMP17]], align 4
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
 ; CHECK-NEXT:    [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[TMP18]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
+; CHECK-NEXT:    br i1 [[TMP18]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
 ; CHECK:       [[MIDDLE_BLOCK]]:
 ; CHECK-NEXT:    br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
 ; CHECK:       [[SCALAR_PH]]:
@@ -971,7 +1048,7 @@ define void @deref_assumption_too_small_in_preheader_constant_trip_count_align_1
 ; CHECK-NEXT:    store i32 [[MERGE]], ptr [[GEP_C]], align 4
 ; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
 ; CHECK-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP21:![0-9]+]]
+; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP23:![0-9]+]]
 ; CHECK:       [[EXIT]]:
 ; CHECK-NEXT:    ret void
 ;
@@ -1027,7 +1104,7 @@ define void @align_and_deref_assumption_in_preheader_constant_trip_count_align_4
 ; CHECK-NEXT:    store <2 x i32> [[PREDPHI]], ptr [[TMP7]], align 4
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
 ; CHECK-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
+; CHECK-NEXT:    br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]]
 ; CHECK:       [[MIDDLE_BLOCK]]:
 ; CHECK-NEXT:    br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
 ; CHECK:       [[SCALAR_PH]]:
@@ -1049,7 +1126,7 @@ define void @align_and_deref_assumption_in_preheader_constant_trip_count_align_4
 ; CHECK-NEXT:    store i32 [[MERGE]], ptr [[GEP_C]], align 4
 ; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
 ; CHECK-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP23:![0-9]+]]
+; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP25:![0-9]+]]
 ; CHECK:       [[EXIT]]:
 ; CHECK-NEXT:    ret void
 ;
@@ -1123,7 +1200,7 @@ define void @deref_assumption_in_preheader_constant_trip_count_align_4_known_via
 ; CHECK-NEXT:    store <2 x i32> [[PREDPHI]], ptr [[TMP17]], align 4
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
 ; CHECK-NEXT:    [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[TMP18]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]]
+; CHECK-NEXT:    br i1 [[TMP18]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]]
 ; CHECK:       [[MIDDLE_BLOCK]]:
 ; CHECK-NEXT:    br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
 ; CHECK:       [[SCALAR_PH]]:
@@ -1145,7 +1222,7 @@ define void @deref_assumption_in_preheader_constant_trip_count_align_4_known_via
 ; CHECK-NEXT:    store i32 [[MERGE]], ptr [[GEP_C]], align 4
 ; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
 ; CHECK-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP25:![0-9]+]]
+; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP27:![0-9]+]]
 ; CHECK:       [[EXIT]]:
 ; CHECK-NEXT:    ret void
 ;
@@ -1218,7 +1295,7 @@ define void @deref_assumption_in_preheader_constant_trip_count_align_4_not_known
 ; CHECK-NEXT:    store <2 x i32> [[PREDPHI]], ptr [[TMP17]], align 4
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
 ; CHECK-NEXT:    [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[TMP18]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]]
+; CHECK-NEXT:    br i1 [[TMP18]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP28:![0-9]+]]
 ; CHECK:       [[MIDDLE_BLOCK]]:
 ; CHECK-NEXT:    br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
 ; CHECK:       [[SCALAR_PH]]:
@@ -1240,7 +1317,7 @@ define void @deref_assumption_in_preheader_constant_trip_count_align_4_not_known
 ; CHECK-NEXT:    store i32 [[MERGE]], ptr [[GEP_C]], align 4
 ; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
 ; CHECK-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP27:![0-9]+]]
+; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP29:![0-9]+]]
 ; CHECK:       [[EXIT]]:
 ; CHECK-NEXT:    ret void
 ;
@@ -1313,7 +1390,7 @@ define void @deref_assumption_too_small_in_preheader_constant_trip_count_align_4
 ; CHECK-NEXT:    store <2 x i32> [[PREDPHI]], ptr [[TMP17]], align 4
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
 ; CHECK-NEXT:    [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[TMP18]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP28:![0-9]+]]
+; CHECK-NEXT:    br i1 [[TMP18]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP30:![0-9]+]]
 ; CHECK:       [[MIDDLE_BLOCK]]:
 ; CHECK-NEXT:    br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
 ; CHECK:       [[SCALAR_PH]]:
@@ -1335,7 +1412,7 @@ define void @deref_assumption_too_small_in_preheader_constant_trip_count_align_4
 ; CHECK-NEXT:    store i32 [[MERGE]], ptr [[GEP_C]], align 4
 ; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
 ; CHECK-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP29:![0-9]+]]
+; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP31:![0-9]+]]
 ; CHECK:       [[EXIT]]:
 ; CHECK-NEXT:    ret void
 ;
@@ -1397,4 +1474,6 @@ exit:
 ; CHECK: [[LOOP27]] = distinct !{[[LOOP27]], [[META2]], [[META1]]}
 ; CHECK: [[LOOP28]] = distinct !{[[LOOP28]], [[META1]], [[META2]]}
 ; CHECK: [[LOOP29]] = distinct !{[[LOOP29]], [[META2]], [[META1]]}
+; CHECK: [[LOOP30]] = distinct !{[[LOOP30]], [[META1]], [[META2]]}
+; CHECK: [[LOOP31]] = distinct !{[[LOOP31]], [[META2]], [[META1]]}
 ;.
diff --git a/llvm/test/Transforms/LoopVectorize/epilog-vectorization-any-of-reductions.ll b/llvm/test/Transforms/LoopVectorize/epilog-vectorization-any-of-reductions.ll
index c159ec8..94593a7 100644
--- a/llvm/test/Transforms/LoopVectorize/epilog-vectorization-any-of-reductions.ll
+++ b/llvm/test/Transforms/LoopVectorize/epilog-vectorization-any-of-reductions.ll
@@ -241,9 +241,9 @@ define i1 @any_of_reduction_i1_epilog(i64 %N, i32 %a) {
 ; CHECK-NEXT:    [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 4
 ; CHECK-NEXT:    br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
 ; CHECK:       vec.epilog.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[IND_END]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
 ; CHECK-NEXT:    [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
 ; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i1 [ [[RDX_SELECT]], [[VEC_EPILOG_ITER_CHECK]] ], [ false, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[IND_END]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
 ; CHECK-NEXT:    [[TMP7:%.*]] = icmp ne i1 [[BC_MERGE_RDX]], false
 ; CHECK-NEXT:    [[N_MOD_VF2:%.*]] = urem i64 [[TMP0]], 4
 ; CHECK-NEXT:    [[N_VEC3:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF2]]
@@ -275,8 +275,8 @@ define i1 @any_of_reduction_i1_epilog(i64 %N, i32 %a) {
 ; CHECK-NEXT:    br i1 [[CMP_N8]], label [[EXIT]], label [[VEC_EPILOG_SCALAR_PH]]
 ; CHECK:       vec.epilog.scalar.ph:
 ; CHECK-NEXT:    [[BC_RESUME_VAL4:%.*]] = phi i64 [ [[N_VEC3]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, [[ITER_CHECK:%.*]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ]
-; CHECK-NEXT:    [[BC_RESUME_VAL7:%.*]] = phi i32 [ [[IND_END5]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, [[ITER_CHECK]] ], [ [[IND_END6]], [[VEC_EPILOG_ITER_CHECK]] ]
 ; CHECK-NEXT:    [[BC_MERGE_RDX17:%.*]] = phi i1 [ [[RDX_SELECT16]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ false, [[ITER_CHECK]] ], [ [[RDX_SELECT]], [[VEC_EPILOG_ITER_CHECK]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL7:%.*]] = phi i32 [ [[IND_END5]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, [[ITER_CHECK]] ], [ [[IND_END6]], [[VEC_EPILOG_ITER_CHECK]] ]
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
 ; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL4]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
@@ -426,8 +426,8 @@ define i1 @any_of_reduction_i1_epilog2(ptr %start, ptr %end, i64 %x) {
 ; CHECK-NEXT:    [[CMP_N10:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC8]]
 ; CHECK-NEXT:    br i1 [[CMP_N10]], label [[EXIT]], label [[VEC_EPILOG_SCALAR_PH]]
 ; CHECK:       vec.epilog.scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi ptr [ [[IND_END]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[START]], [[ITER_CHECK:%.*]] ], [ [[IND_END9]], [[VEC_EPILOG_ITER_CHECK]] ]
-; CHECK-NEXT:    [[BC_MERGE_RDX23:%.*]] = phi i1 [ [[RDX_SELECT22]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ true, [[ITER_CHECK]] ], [ [[RDX_SELECT]], [[VEC_EPILOG_ITER_CHECK]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX23:%.*]] = phi i1 [ [[RDX_SELECT22]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ true, [[ITER_CHECK:%.*]] ], [ [[RDX_SELECT]], [[VEC_EPILOG_ITER_CHECK]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi ptr [ [[IND_END]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[START]], [[ITER_CHECK]] ], [ [[IND_END9]], [[VEC_EPILOG_ITER_CHECK]] ]
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
 ; CHECK-NEXT:    [[RED:%.*]] = phi i1 [ [[BC_MERGE_RDX23]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[SELECT:%.*]], [[LOOP]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-chains-vplan.ll b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-chains-vplan.ll
index 0eab97b..32d32a6 100644
--- a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-chains-vplan.ll
+++ b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-chains-vplan.ll
@@ -45,12 +45,13 @@ define void @test_chained_first_order_recurrences_1(ptr %ptr) {
 ; CHECK-NEXT:  scalar.ph
 ; CHECK-NEXT:    EMIT vp<[[RESUME_1_P:%.*]]> = resume-phi vp<[[RESUME_1]]>, ir<22>
 ; CHECK-NEXT:    EMIT vp<[[RESUME_2_P:%.*]]>.1 = resume-phi vp<[[RESUME_2]]>.1, ir<33>
+; CHECK-NEXT:    EMIT vp<[[RESUME_IV:%.*]]> = resume-phi vp<[[VTC]]>, ir<0>
 ; CHECK-NEXT:  Successor(s): ir-bb<loop>
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  ir-bb<loop>:
 ; CHECK-NEXT:    IR   %for.1 = phi i16 [ 22, %entry ], [ %for.1.next, %loop ] (extra operand: vp<[[RESUME_1_P]]> from scalar.ph)
 ; CHECK-NEXT:    IR   %for.2 = phi i16 [ 33, %entry ], [ %for.1, %loop ] (extra operand: vp<[[RESUME_2_P]]>.1 from scalar.ph)
-; CHECK-NEXT:    IR   %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+; CHECK-NEXT:    IR   %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] (extra operand: vp<[[RESUME_IV]]> from scalar.ph)
 ; CHECK:         IR   %exitcond.not = icmp eq i64 %iv.next, 1000
 ; CHECK-NEXT:  No successors
 ; CHECK-EMPTY:
@@ -125,13 +126,14 @@ define void @test_chained_first_order_recurrences_3(ptr %ptr) {
 ; CHECK-NEXT:    EMIT vp<[[RESUME_1_P:%.*]]> = resume-phi vp<[[RESUME_1]]>, ir<22>
 ; CHECK-NEXT:    EMIT vp<[[RESUME_2_P:%.*]]>.1 = resume-phi vp<[[RESUME_2]]>.1, ir<33>
 ; CHECK-NEXT:    EMIT vp<[[RESUME_3_P:%.*]]>.2 = resume-phi vp<[[RESUME_3]]>.2, ir<33>
+; CHECK-NEXT:    EMIT vp<[[RESUME_IV:%.*]]> = resume-phi vp<[[VTC]]>, ir<0>
 ; CHECK-NEXT:  Successor(s): ir-bb<loop>
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  ir-bb<loop>:
 ; CHECK-NEXT:    IR   %for.1 = phi i16 [ 22, %entry ], [ %for.1.next, %loop ] (extra operand: vp<[[RESUME_1_P]]> from scalar.ph)
 ; CHECK-NEXT:    IR   %for.2 = phi i16 [ 33, %entry ], [ %for.1, %loop ] (extra operand: vp<[[RESUME_2_P]]>.1 from scalar.ph)
 ; CHECK-NEXT:    IR   %for.3 = phi i16 [ 33, %entry ], [ %for.2, %loop ] (extra operand: vp<[[RESUME_3_P]]>.2 from scalar.ph)
-; CHECK-NEXT:    IR   %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+; CHECK-NEXT:    IR   %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] (extra operand: vp<[[RESUME_IV]]> from scalar.ph)
 ; CHECK:         IR   %exitcond.not = icmp eq i64 %iv.next, 1000
 ; CHECK-NEXT: No successors
 ; CHECK-EMPTY:
@@ -186,9 +188,9 @@ define i32 @test_chained_first_order_recurrences_4(ptr %base, i64 %x) {
 ; CHECK-NEXT:     vp<[[SCALAR_STEPS:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1>
 ; CHECK-NEXT:     CLONE ir<%gep> = getelementptr ir<%base>, vp<[[SCALAR_STEPS]]>
 ; CHECK-NEXT:     EMIT vp<[[SPLICE_X:%.]]> = first-order splice ir<%for.x>, ir<%for.x.next>
-; CHECK-NEXT:     WIDEN-CAST ir<%for.x.prev> = trunc  vp<[[SPLICE_X]]> to i32
+; CHECK-NEXT:     WIDEN-CAST ir<%for.x.prev> = trunc vp<[[SPLICE_X]]> to i32
 ; CHECK-NEXT:     EMIT vp<[[SPLICE_Y:%.+]]> = first-order splice ir<%for.y>, ir<%for.x.prev>
-; CHECK-NEXT:     WIDEN-CAST ir<%for.y.i64> = sext  vp<[[SPLICE_Y]]> to i64
+; CHECK-NEXT:     WIDEN-CAST ir<%for.y.i64> = sext vp<[[SPLICE_Y]]> to i64
 ; CHECK-NEXT:     vp<[[VEC_PTR:%.+]]> = vector-pointer ir<%gep>
 ; CHECK-NEXT:     WIDEN store vp<[[VEC_PTR]]>, ir<%for.y.i64>
 ; CHECK-NEXT:     EMIT vp<[[CAN_IV_NEXT]]> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]>
@@ -205,12 +207,13 @@ define i32 @test_chained_first_order_recurrences_4(ptr %base, i64 %x) {
 ; CHECK-NEXT: Successor(s): ir-bb<ret>, scalar.ph
 ; CHECK-EMPTY:
 ; CHECK-NEXT: scalar.ph:
+; CHECK-NEXT:   EMIT vp<[[RESUME_IV:%.*]]> = resume-phi vp<[[VTC]]>, ir<0>
 ; CHECK-NEXT:   EMIT vp<[[RESUME_X:%.+]]> = resume-phi vp<[[EXT_X]]>, ir<0>
 ; CHECK-NEXT:   EMIT vp<[[RESUME_Y:%.+]]>.1 = resume-phi vp<[[EXT_Y]]>.1, ir<0>
 ; CHECK-NEXT: Successor(s): ir-bb<loop>
 ; CHECK-EMPTY:
 ; CHECK-NEXT: ir-bb<loop>:
-; CHECK-NEXT:   IR   %iv = phi i64 [ %iv.next, %loop ], [ 0, %entry ]
+; CHECK-NEXT:   IR   %iv = phi i64 [ %iv.next, %loop ], [ 0, %entry ] (extra operand: vp<[[RESUME_IV]]> from scalar.ph)
 ; CHECK-NEXT:   IR   %for.x = phi i64 [ %for.x.next, %loop ], [ 0, %entry ] (extra operand: vp<[[RESUME_X]]> from scalar.ph)
 ; CHECK-NEXT:   IR   %for.y = phi i32 [ %for.x.prev, %loop ], [ 0, %entry ] (extra operand: vp<[[RESUME_Y]]>.1 from scalar.ph)
 ; CHECK:     No successors
@@ -263,9 +266,9 @@ define i32 @test_chained_first_order_recurrences_5_hoist_to_load(ptr %base) {
 ; CHECK-NEXT:     WIDEN ir<%l> = load vp<[[VEC_PTR]]>
 ; CHECK-NEXT:     WIDEN ir<%for.x.next> = mul ir<%l>, ir<2>
 ; CHECK-NEXT:     EMIT vp<[[SPLICE_X:%.]]> = first-order splice ir<%for.x>, ir<%for.x.next>
-; CHECK-NEXT:     WIDEN-CAST ir<%for.x.prev> = trunc  vp<[[SPLICE_X]]> to i32
+; CHECK-NEXT:     WIDEN-CAST ir<%for.x.prev> = trunc vp<[[SPLICE_X]]> to i32
 ; CHECK-NEXT:     EMIT vp<[[SPLICE_Y:%.+]]> = first-order splice ir<%for.y>, ir<%for.x.prev>
-; CHECK-NEXT:     WIDEN-CAST ir<%for.y.i64> = sext  vp<[[SPLICE_Y]]> to i64
+; CHECK-NEXT:     WIDEN-CAST ir<%for.y.i64> = sext vp<[[SPLICE_Y]]> to i64
 ; CHECK-NEXT:     vp<[[VEC_PTR:%.+]]> = vector-pointer ir<%gep>
 ; CHECK-NEXT:     WIDEN store vp<[[VEC_PTR]]>, ir<%for.y.i64>
 ; CHECK-NEXT:     EMIT vp<[[CAN_IV_NEXT]]> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]>
@@ -282,12 +285,13 @@ define i32 @test_chained_first_order_recurrences_5_hoist_to_load(ptr %base) {
 ; CHECK-NEXT: Successor(s): ir-bb<ret>, scalar.ph
 ; CHECK-EMPTY:
 ; CHECK-NEXT: scalar.ph:
+; CHECK-NEXT:   EMIT vp<[[RESUME_IV:%.*]]> = resume-phi vp<[[VTC]]>, ir<0>
 ; CHECK-NEXT:   EMIT vp<[[RESUME_X:%.+]]> = resume-phi vp<[[EXT_X]]>, ir<0>
 ; CHECK-NEXT:   EMIT vp<[[RESUME_Y:%.+]]>.1 = resume-phi vp<[[EXT_Y]]>.1, ir<0>
 ; CHECK-NEXT: Successor(s): ir-bb<loop>
 ; CHECK-EMPTY:
 ; CHECK-NEXT: ir-bb<loop>:
-; CHECK-NEXT:   IR   %iv = phi i64 [ %iv.next, %loop ], [ 0, %entry ]
+; CHECK-NEXT:   IR   %iv = phi i64 [ %iv.next, %loop ], [ 0, %entry ] (extra operand: vp<[[RESUME_IV]]> from scalar.ph)
 ; CHECK-NEXT:   IR   %for.x = phi i64 [ %for.x.next, %loop ], [ 0, %entry ] (extra operand: vp<[[RESUME_X]]> from scalar.ph)
 ; CHECK-NEXT:   IR   %for.y = phi i32 [ %for.x.prev, %loop ], [ 0, %entry ] (extra operand: vp<[[RESUME_Y]]>.1 from scalar.ph)
 ; CHECK:     No successors
diff --git a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-chains.ll b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-chains.ll
index 7aedb21..fc71f8a 100644
--- a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-chains.ll
+++ b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-chains.ll
@@ -661,10 +661,10 @@ define double @test_resinking_required(ptr %p, ptr noalias %a, ptr noalias %b) {
 ; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT9:%.*]] = extractelement <4 x double> [[TMP4]], i32 3
 ; CHECK-NEXT:    br i1 true, label %End, label %scalar.ph
 ; CHECK:       scalar.ph:
-; CHECK-NEXT:    %bc.resume.val = phi i64 [ 0, %middle.block ], [ 0, %Entry ]
 ; CHECK-NEXT:    phi double [ [[TMP0]], %middle.block ], [ 0.000000e+00, %Entry ]
 ; CHECK-NEXT:    phi double [ [[TMP3]], %middle.block ], [ 0.000000e+00, %Entry ]
 ; CHECK-NEXT:    phi double [ [[VECTOR_RECUR_EXTRACT9]], %middle.block ], [ 0.000000e+00, %Entry ]
+; CHECK-NEXT:    %bc.resume.val = phi i64 [ 0, %middle.block ], [ 0, %Entry ]
 ; CHECK:      End:
 ; CHECK-NEXT:    = phi double [ {{.+}}, %Loop ], [ [[TMP0]], %middle.block ]
 ; CHECK-NEXT:    = phi double [ {{.+}}, %Loop ], [ [[TMP3]], %middle.block ]
diff --git a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-complex.ll b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-complex.ll
index fe16e8c..253ecac 100644
--- a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-complex.ll
+++ b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-complex.ll
@@ -41,8 +41,8 @@ define void @can_sink_after_store(i32 %x, ptr %ptr, i64 %tc) local_unnamed_addr
 ; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i32> [[WIDE_LOAD]], i32 3
 ; CHECK-NEXT:    br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1997, [[MIDDLE_BLOCK]] ], [ 1, [[PREHEADER]] ]
 ; CHECK-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ [[DOTPRE]], [[PREHEADER]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1997, [[MIDDLE_BLOCK]] ], [ 1, [[PREHEADER]] ]
 ; CHECK-NEXT:    br label [[FOR:%.*]]
 ; CHECK:       for:
 ; CHECK-NEXT:    [[SCALAR_RECUR:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[PRE_NEXT:%.*]], [[FOR]] ]
@@ -121,8 +121,8 @@ define void @sink_sdiv(i32 %x, ptr %ptr, i64 %tc) local_unnamed_addr #0 {
 ; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i32> [[WIDE_LOAD]], i32 3
 ; CHECK-NEXT:    br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1997, [[MIDDLE_BLOCK]] ], [ 1, [[PREHEADER]] ]
 ; CHECK-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ [[DOTPRE]], [[PREHEADER]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1997, [[MIDDLE_BLOCK]] ], [ 1, [[PREHEADER]] ]
 ; CHECK-NEXT:    br label [[FOR:%.*]]
 ; CHECK:       for:
 ; CHECK-NEXT:    [[SCALAR_RECUR:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[PRE_NEXT:%.*]], [[FOR]] ]
@@ -202,8 +202,8 @@ define void @can_sink_with_additional_user(i32 %x, ptr %ptr, i64 %tc) {
 ; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i32> [[WIDE_LOAD]], i32 3
 ; CHECK-NEXT:    br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1997, [[MIDDLE_BLOCK]] ], [ 1, [[PREHEADER]] ]
 ; CHECK-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ [[DOTPRE]], [[PREHEADER]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1997, [[MIDDLE_BLOCK]] ], [ 1, [[PREHEADER]] ]
 ; CHECK-NEXT:    br label [[FOR:%.*]]
 ; CHECK:       for:
 ; CHECK-NEXT:    [[SCALAR_RECUR:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[PRE_NEXT:%.*]], [[FOR]] ]
@@ -387,9 +387,9 @@ define void @instruction_with_2_FOR_operands(ptr noalias %A, ptr noalias %B, ptr
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br i1 false, label [[BB74:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[BB:%.*]] ]
-; CHECK-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi float [ [[TMP4]], [[MIDDLE_BLOCK]] ], [ 0.000000e+00, [[BB]] ]
+; CHECK-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi float [ [[TMP4]], [[MIDDLE_BLOCK]] ], [ 0.000000e+00, [[BB:%.*]] ]
 ; CHECK-NEXT:    [[SCALAR_RECUR_INIT4:%.*]] = phi float [ [[TMP2]], [[MIDDLE_BLOCK]] ], [ 1.000000e+00, [[BB]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[BB]] ]
 ; CHECK-NEXT:    br label [[BB13:%.*]]
 ; CHECK:       bb13:
 ; CHECK-NEXT:    [[SCALAR_RECUR:%.*]] = phi float [ [[TMP60:%.*]], [[BB13]] ], [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ]
@@ -463,9 +463,9 @@ define void @instruction_with_2_FOR_operands_and_multiple_other_uses(ptr noalias
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[BB:%.*]] ]
-; CHECK-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi float [ [[TMP3]], [[MIDDLE_BLOCK]] ], [ 0.000000e+00, [[BB]] ]
+; CHECK-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi float [ [[TMP3]], [[MIDDLE_BLOCK]] ], [ 0.000000e+00, [[BB:%.*]] ]
 ; CHECK-NEXT:    [[SCALAR_RECUR_INIT4:%.*]] = phi float [ [[TMP1]], [[MIDDLE_BLOCK]] ], [ 0.000000e+00, [[BB]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[BB]] ]
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
 ; CHECK-NEXT:    [[SCALAR_RECUR:%.*]] = phi float [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[FOR_1_NEXT:%.*]], [[LOOP]] ]
@@ -554,9 +554,9 @@ define void @instruction_with_2_FOR_operands_and_multiple_other_uses_chain(ptr n
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[BB:%.*]] ]
-; CHECK-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi float [ [[TMP3]], [[MIDDLE_BLOCK]] ], [ 0.000000e+00, [[BB]] ]
+; CHECK-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi float [ [[TMP3]], [[MIDDLE_BLOCK]] ], [ 0.000000e+00, [[BB:%.*]] ]
 ; CHECK-NEXT:    [[SCALAR_RECUR_INIT4:%.*]] = phi float [ [[TMP1]], [[MIDDLE_BLOCK]] ], [ 0.000000e+00, [[BB]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[BB]] ]
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
 ; CHECK-NEXT:    [[SCALAR_RECUR:%.*]] = phi float [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[FOR_1_NEXT:%.*]], [[LOOP]] ]
@@ -852,8 +852,8 @@ define void @sink_dominance(ptr %ptr, i32 %N) {
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i32 [[UMAX1]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i64 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY]] ]
+; CHECK-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i64 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY]] ]
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
 ; CHECK-NEXT:    [[SCALAR_RECUR:%.*]] = phi i64 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[FOR_NEXT:%.*]], [[LOOP]] ]
@@ -935,8 +935,8 @@ define void @sink_dominance_2(ptr %ptr, i32 %N) {
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i32 [[UMAX1]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i64 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY]] ]
+; CHECK-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i64 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY]] ]
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
 ; CHECK-NEXT:    [[SCALAR_RECUR:%.*]] = phi i64 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[FOR_NEXT:%.*]], [[LOOP]] ]
@@ -1057,9 +1057,9 @@ define void @test_for_sink_instruction_after_same_incoming_1(ptr %ptr) {
 ; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT2:%.*]] = extractelement <4 x double> [[WIDE_LOAD]], i32 3
 ; CHECK-NEXT:    br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 997, [[MIDDLE_BLOCK]] ], [ 1, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi double [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ 1.000000e+01, [[ENTRY]] ]
+; CHECK-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi double [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ 1.000000e+01, [[ENTRY:%.*]] ]
 ; CHECK-NEXT:    [[SCALAR_RECUR_INIT3:%.*]] = phi double [ [[VECTOR_RECUR_EXTRACT2]], [[MIDDLE_BLOCK]] ], [ 2.000000e+01, [[ENTRY]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 997, [[MIDDLE_BLOCK]] ], [ 1, [[ENTRY]] ]
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
 ; CHECK-NEXT:    [[SCALAR_RECUR:%.*]] = phi double [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[FOR_1_NEXT:%.*]], [[LOOP]] ]
@@ -1125,9 +1125,9 @@ define void @test_for_sink_instruction_after_same_incoming_2(ptr %ptr) {
 ; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT2:%.*]] = extractelement <4 x double> [[WIDE_LOAD]], i32 3
 ; CHECK-NEXT:    br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 997, [[MIDDLE_BLOCK]] ], [ 1, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi double [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ 2.000000e+01, [[ENTRY]] ]
+; CHECK-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi double [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ 2.000000e+01, [[ENTRY:%.*]] ]
 ; CHECK-NEXT:    [[SCALAR_RECUR_INIT3:%.*]] = phi double [ [[VECTOR_RECUR_EXTRACT2]], [[MIDDLE_BLOCK]] ], [ 1.000000e+01, [[ENTRY]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 997, [[MIDDLE_BLOCK]] ], [ 1, [[ENTRY]] ]
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
 ; CHECK-NEXT:    [[SCALAR_RECUR:%.*]] = phi double [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[FOR_1_NEXT:%.*]], [[LOOP]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-sink-replicate-region.ll b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-sink-replicate-region.ll
index a127890..0b2e7fe 100644
--- a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-sink-replicate-region.ll
+++ b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-sink-replicate-region.ll
@@ -82,11 +82,12 @@ define void @sink_replicate_region_1(i32 %x, ptr %ptr, ptr noalias %dst) optsize
 ; CHECK-EMPTY:
 ; CHECK-NEXT: scalar.ph
 ; CHECK-NEXT:   EMIT vp<[[RESUME_1_P:%.*]]> = resume-phi vp<[[RESUME_1]]>, ir<0>
+; CHECK-NEXT:   EMIT vp<[[RESUME_IV:%.*]]> = resume-phi vp<[[VEC_TC]]>, ir<0>
 ; CHECK-NEXT: Successor(s): ir-bb<loop>
 ; CHECK-EMPTY:
 ; CHECK-NEXT: ir-bb<loop>:
 ; CHECK-NEXT:   IR   %0 = phi i32 [ 0, %entry ], [ %conv, %loop ] (extra operand: vp<[[RESUME_1_P]]> from scalar.ph)
-; CHECK-NEXT:   IR   %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ]
+; CHECK-NEXT:   IR   %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ] (extra operand: vp<[[RESUME_IV]]> from scalar.ph)
 ; CHECK:        IR   %ec = icmp eq i32 %iv.next, 20001
 ; CHECK-NEXT: No successors
 ; CHECK-EMPTY:
@@ -172,11 +173,12 @@ define void @sink_replicate_region_2(i32 %x, i8 %y, ptr %ptr) optsize {
 ; CHECK-EMPTY:
 ; CHECK-NEXT: scalar.ph
 ; CHECK-NEXT:   EMIT vp<[[RESUME_1_P:%.*]]> = resume-phi vp<[[RESUME_1]]>, ir<0>
+; CHECK-NEXT:   EMIT vp<[[RESUME_IV:%.*]]> = resume-phi vp<[[VEC_TC]]>, ir<0>
 ; CHECK-NEXT: Successor(s): ir-bb<loop>
 ; CHECK-EMPTY:
 ; CHECK-NEXT: ir-bb<loop>:
 ; CHECK-NEXT:   IR   %recur = phi i32 [ 0, %entry ], [ %recur.next, %loop ] (extra operand: vp<[[RESUME_1_P]]> from scalar.ph)
-; CHECK-NEXT:   IR   %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ]
+; CHECK-NEXT:   IR   %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ] (extra operand: vp<[[RESUME_IV]]> from scalar.ph)
 ; CHECK:        IR   %ec = icmp eq i32 %iv.next, 20001
 ; CHECK-NEXT: No successors
 ; CHECK-EMPTY:
@@ -245,12 +247,13 @@ define i32 @sink_replicate_region_3_reduction(i32 %x, i8 %y, ptr %ptr) optsize {
 ; CHECK-EMPTY:
 ; CHECK-NEXT: scalar.ph
 ; CHECK-NEXT:   EMIT vp<[[RESUME_1_P:%.*]]> = resume-phi vp<[[RESUME_1]]>, ir<0>
+; CHECK-NEXT:   EMIT vp<[[RESUME_IV:%.*]]> = resume-phi vp<[[VEC_TC]]>, ir<0>
 ; CHECK-NEXT:   EMIT vp<[[RESUME_RED:%.+]]> = resume-phi vp<[[RED_RES]]>, ir<1234>
 ; CHECK-NEXT: Successor(s): ir-bb<loop>
 ; CHECK-EMPTY:
 ; CHECK-NEXT: ir-bb<loop>:
 ; CHECK-NEXT:   IR   %recur = phi i32 [ 0, %entry ], [ %recur.next, %loop ] (extra operand: vp<[[RESUME_1_P]]> from scalar.ph)
-; CHECK-NEXT:   IR   %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ]
+; CHECK-NEXT:   IR   %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ] (extra operand: vp<[[RESUME_IV]]> from scalar.ph)
 ; CHECK-NEXT:   IR   %and.red = phi i32 [ 1234, %entry ], [ %and.red.next, %loop ]
 ; CHECK:        IR   %ec = icmp eq i32 %iv.next, 20001
 ; CHECK-NEXT: No successors
@@ -361,11 +364,12 @@ define void @sink_replicate_region_4_requires_split_at_end_of_block(i32 %x, ptr
 ; CHECK-EMPTY:
 ; CHECK-NEXT: scalar.ph
 ; CHECK-NEXT:   EMIT vp<[[RESUME_1_P:%.*]]> = resume-phi vp<[[RESUME_1]]>, ir<0>
+; CHECK-NEXT:   EMIT vp<[[RESUME_IV:%.*]]> = resume-phi vp<[[VEC_TC]]>, ir<0>
 ; CHECK-NEXT: Successor(s): ir-bb<loop>
 ; CHECK-EMPTY:
 ; CHECK-NEXT: ir-bb<loop>:
 ; CHECK-NEXT:   IR   %0 = phi i32 [ 0, %entry ], [ %conv, %loop ] (extra operand: vp<[[RESUME_1_P]]> from scalar.ph)
-; CHECK-NEXT:   IR   %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ]
+; CHECK-NEXT:   IR   %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ] (extra operand: vp<[[RESUME_IV]]> from scalar.ph)
 ; CHECK:        IR   %ec = icmp eq i32 %iv.next, 20001
 ; CHECK-NEXT: No successors
 ; CHECK-EMPTY:
@@ -458,11 +462,12 @@ define void @sink_replicate_region_after_replicate_region(ptr %ptr, ptr noalias
 ; CHECK-EMPTY:
 ; CHECK-NEXT: scalar.ph
 ; CHECK-NEXT:   EMIT vp<[[RESUME_1_P:%.*]]> = resume-phi vp<[[RESUME_1]]>, ir<0>
+; CHECK-NEXT:   EMIT vp<[[RESUME_IV:%.*]]> = resume-phi vp<[[VEC_TC]]>, ir<0>
 ; CHECK-NEXT: Successor(s): ir-bb<loop>
 ; CHECK-EMPTY:
 ; CHECK-NEXT: ir-bb<loop>:
 ; CHECK-NEXT:   IR   %recur = phi i32 [ 0, %entry ], [ %recur.next, %loop ] (extra operand: vp<[[RESUME_1_P]]> from scalar.ph)
-; CHECK-NEXT:   IR   %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ]
+; CHECK-NEXT:   IR   %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ] (extra operand: vp<[[RESUME_IV]]> from scalar.ph)
 ; CHECK:        IR   %C = icmp sgt i32 %iv.next, %recur.next
 ; CHECK-NEXT: No successors
 ; CHECK-EMPTY:
@@ -503,6 +508,7 @@ define void @need_new_block_after_sinking_pr56146(i32 %x, ptr %src, ptr noalias
 ; CHECK-NEXT: Successor(s): vector.ph
 ; CHECK-EMPTY:
 ; CHECK-NEXT: vector.ph:
+; CHECK-NEXT:   vp<[[END:%.+]]> = DERIVED-IV ir<2> + vp<[[VEC_TC]]> * ir<1>
 ; CHECK-NEXT: Successor(s): vector loop
 ; CHECK-EMPTY:
 ; CHECK-NEXT: <x1> vector loop: {
@@ -546,11 +552,12 @@ define void @need_new_block_after_sinking_pr56146(i32 %x, ptr %src, ptr noalias
 ; CHECK-NEXT: Successor(s): ir-bb<exit>, scalar.ph
 ; CHECK-EMPTY:
 ; CHECK-NEXT: scalar.ph
+; CHECK-NEXT:   EMIT vp<[[RESUME_IV:%.*]]> = resume-phi vp<[[END]]>, ir<2>
 ; CHECK-NEXT:   EMIT vp<[[RESUME_1_P:%.*]]> = resume-phi vp<[[RESUME_1]]>, ir<0>
 ; CHECK-NEXT: Successor(s): ir-bb<loop>
 ; CHECK-EMPTY:
 ; CHECK-NEXT: ir-bb<loop>:
-; CHECK-NEXT:   IR   %iv = phi i64 [ 2, %entry ], [ %iv.next, %loop ]
+; CHECK-NEXT:   IR   %iv = phi i64 [ 2, %entry ], [ %iv.next, %loop ] (extra operand: vp<[[RESUME_IV]]> from scalar.ph)
 ; CHECK-NEXT:   IR   %.pn = phi i32 [ 0, %entry ], [ %l, %loop ] (extra operand: vp<[[RESUME_1_P]]> from scalar.ph)
 ; CHECK:        IR   %ec = icmp ugt i64 %iv, 3
 ; CHECK-NEXT: No successors
diff --git a/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll b/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll
index 2d50f82..509b8f9 100644
--- a/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll
+++ b/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll
@@ -55,8 +55,8 @@ define void @recurrence_1(ptr readonly noalias %a, ptr noalias %b, i32 %n) {
 ; UNROLL-NO-IC-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
 ; UNROLL-NO-IC-NEXT:    br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]
 ; UNROLL-NO-IC:       scalar.ph:
-; UNROLL-NO-IC-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_PREHEADER]] ]
 ; UNROLL-NO-IC-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ [[PRE_LOAD]], [[FOR_PREHEADER]] ]
+; UNROLL-NO-IC-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_PREHEADER]] ]
 ; UNROLL-NO-IC-NEXT:    br label [[SCALAR_BODY:%.*]]
 ; UNROLL-NO-IC:       scalar.body:
 ; UNROLL-NO-IC-NEXT:    [[TMP16:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[TMP17:%.*]], [[SCALAR_BODY]] ]
@@ -111,8 +111,8 @@ define void @recurrence_1(ptr readonly noalias %a, ptr noalias %b, i32 %n) {
 ; UNROLL-NO-VF-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
 ; UNROLL-NO-VF-NEXT:    br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]
 ; UNROLL-NO-VF:       scalar.ph:
-; UNROLL-NO-VF-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_PREHEADER]] ]
 ; UNROLL-NO-VF-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[TMP10]], [[MIDDLE_BLOCK]] ], [ [[PRE_LOAD]], [[FOR_PREHEADER]] ]
+; UNROLL-NO-VF-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_PREHEADER]] ]
 ; UNROLL-NO-VF-NEXT:    br label [[SCALAR_BODY:%.*]]
 ; UNROLL-NO-VF:       scalar.body:
 ; UNROLL-NO-VF-NEXT:    [[TMP16:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[TMP17:%.*]], [[SCALAR_BODY]] ]
@@ -165,8 +165,8 @@ define void @recurrence_1(ptr readonly noalias %a, ptr noalias %b, i32 %n) {
 ; SINK-AFTER-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
 ; SINK-AFTER-NEXT:    br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]
 ; SINK-AFTER:       scalar.ph:
-; SINK-AFTER-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_PREHEADER]] ]
 ; SINK-AFTER-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ [[PRE_LOAD]], [[FOR_PREHEADER]] ]
+; SINK-AFTER-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_PREHEADER]] ]
 ; SINK-AFTER-NEXT:    br label [[SCALAR_BODY:%.*]]
 ; SINK-AFTER:       scalar.body:
 ; SINK-AFTER-NEXT:    [[TMP12:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[TMP13:%.*]], [[SCALAR_BODY]] ]
@@ -265,8 +265,8 @@ define i32 @recurrence_2(ptr nocapture readonly %a, i32 %n) {
 ; UNROLL-NO-IC-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
 ; UNROLL-NO-IC-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
 ; UNROLL-NO-IC:       scalar.ph:
-; UNROLL-NO-IC-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_PREHEADER]] ]
 ; UNROLL-NO-IC-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ [[DOTPRE]], [[FOR_PREHEADER]] ]
+; UNROLL-NO-IC-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_PREHEADER]] ]
 ; UNROLL-NO-IC-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP18]], [[MIDDLE_BLOCK]] ], [ poison, [[FOR_PREHEADER]] ]
 ; UNROLL-NO-IC-NEXT:    br label [[SCALAR_BODY:%.*]]
 ; UNROLL-NO-IC:       for.cond.cleanup.loopexit:
@@ -334,8 +334,8 @@ define i32 @recurrence_2(ptr nocapture readonly %a, i32 %n) {
 ; UNROLL-NO-VF-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
 ; UNROLL-NO-VF-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
 ; UNROLL-NO-VF:       scalar.ph:
-; UNROLL-NO-VF-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_PREHEADER]] ]
 ; UNROLL-NO-VF-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[TMP6]], [[MIDDLE_BLOCK]] ], [ [[DOTPRE]], [[FOR_PREHEADER]] ]
+; UNROLL-NO-VF-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_PREHEADER]] ]
 ; UNROLL-NO-VF-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[RDX_MINMAX]], [[MIDDLE_BLOCK]] ], [ poison, [[FOR_PREHEADER]] ]
 ; UNROLL-NO-VF-NEXT:    br label [[SCALAR_BODY:%.*]]
 ; UNROLL-NO-VF:       for.cond.cleanup.loopexit:
@@ -398,8 +398,8 @@ define i32 @recurrence_2(ptr nocapture readonly %a, i32 %n) {
 ; SINK-AFTER-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
 ; SINK-AFTER-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
 ; SINK-AFTER:       scalar.ph:
-; SINK-AFTER-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_PREHEADER]] ]
 ; SINK-AFTER-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ [[DOTPRE]], [[FOR_PREHEADER]] ]
+; SINK-AFTER-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_PREHEADER]] ]
 ; SINK-AFTER-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP11]], [[MIDDLE_BLOCK]] ], [ poison, [[FOR_PREHEADER]] ]
 ; SINK-AFTER-NEXT:    br label [[SCALAR_BODY:%.*]]
 ; SINK-AFTER:       for.cond.cleanup.loopexit:
@@ -525,8 +525,8 @@ define void @recurrence_3(ptr readonly noalias %a, ptr noalias %b, i32 %n, float
 ; UNROLL-NO-IC-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
 ; UNROLL-NO-IC-NEXT:    br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
 ; UNROLL-NO-IC:       scalar.ph:
-; UNROLL-NO-IC-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 1, [[FOR_PREHEADER]] ]
 ; UNROLL-NO-IC-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ [[TMP0]], [[FOR_PREHEADER]] ]
+; UNROLL-NO-IC-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 1, [[FOR_PREHEADER]] ]
 ; UNROLL-NO-IC-NEXT:    br label [[SCALAR_BODY:%.*]]
 ; UNROLL-NO-IC:       scalar.body:
 ; UNROLL-NO-IC-NEXT:    [[TMP21:%.*]] = phi i16 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[TMP22:%.*]], [[SCALAR_BODY]] ]
@@ -598,8 +598,8 @@ define void @recurrence_3(ptr readonly noalias %a, ptr noalias %b, i32 %n, float
 ; UNROLL-NO-VF-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
 ; UNROLL-NO-VF-NEXT:    br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
 ; UNROLL-NO-VF:       scalar.ph:
-; UNROLL-NO-VF-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 1, [[FOR_PREHEADER]] ]
 ; UNROLL-NO-VF-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ [[TMP8]], [[MIDDLE_BLOCK]] ], [ [[TMP0]], [[FOR_PREHEADER]] ]
+; UNROLL-NO-VF-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 1, [[FOR_PREHEADER]] ]
 ; UNROLL-NO-VF-NEXT:    br label [[SCALAR_BODY:%.*]]
 ; UNROLL-NO-VF:       scalar.body:
 ; UNROLL-NO-VF-NEXT:    [[TMP20:%.*]] = phi i16 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[TMP21:%.*]], [[SCALAR_BODY]] ]
@@ -669,8 +669,8 @@ define void @recurrence_3(ptr readonly noalias %a, ptr noalias %b, i32 %n, float
 ; SINK-AFTER-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
 ; SINK-AFTER-NEXT:    br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
 ; SINK-AFTER:       scalar.ph:
-; SINK-AFTER-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 1, [[FOR_PREHEADER]] ]
 ; SINK-AFTER-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ [[TMP0]], [[FOR_PREHEADER]] ]
+; SINK-AFTER-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 1, [[FOR_PREHEADER]] ]
 ; SINK-AFTER-NEXT:    br label [[SCALAR_BODY:%.*]]
 ; SINK-AFTER:       scalar.body:
 ; SINK-AFTER-NEXT:    [[TMP14:%.*]] = phi i16 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[TMP15:%.*]], [[SCALAR_BODY]] ]
@@ -912,8 +912,8 @@ define i32 @PR27246() {
 ; UNROLL-NO-IC-NEXT:    [[CMP_N:%.*]] = icmp eq i32 [[I_016]], [[N_VEC]]
 ; UNROLL-NO-IC-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP3]], label [[SCALAR_PH]]
 ; UNROLL-NO-IC:       scalar.ph:
-; UNROLL-NO-IC-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[I_016]], [[FOR_COND1_PREHEADER]] ]
 ; UNROLL-NO-IC-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ [[E_015]], [[FOR_COND1_PREHEADER]] ]
+; UNROLL-NO-IC-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[I_016]], [[FOR_COND1_PREHEADER]] ]
 ; UNROLL-NO-IC-NEXT:    br label [[FOR_COND1:%.*]]
 ; UNROLL-NO-IC:       for.cond.cleanup:
 ; UNROLL-NO-IC-NEXT:    [[E_1_LCSSA_LCSSA:%.*]] = phi i32 [ [[E_1_LCSSA]], [[FOR_COND_CLEANUP3]] ]
@@ -956,8 +956,8 @@ define i32 @PR27246() {
 ; UNROLL-NO-VF-NEXT:    [[CMP_N:%.*]] = icmp eq i32 [[I_016]], [[N_VEC]]
 ; UNROLL-NO-VF-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP3]], label [[SCALAR_PH]]
 ; UNROLL-NO-VF:       scalar.ph:
-; UNROLL-NO-VF-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[I_016]], [[FOR_COND1_PREHEADER]] ]
 ; UNROLL-NO-VF-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[TMP1]], [[MIDDLE_BLOCK]] ], [ [[E_015]], [[FOR_COND1_PREHEADER]] ]
+; UNROLL-NO-VF-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[I_016]], [[FOR_COND1_PREHEADER]] ]
 ; UNROLL-NO-VF-NEXT:    br label [[FOR_COND1:%.*]]
 ; UNROLL-NO-VF:       for.cond.cleanup:
 ; UNROLL-NO-VF-NEXT:    [[E_1_LCSSA_LCSSA:%.*]] = phi i32 [ [[E_1_LCSSA]], [[FOR_COND_CLEANUP3]] ]
@@ -1005,8 +1005,8 @@ define i32 @PR27246() {
 ; SINK-AFTER-NEXT:    [[CMP_N:%.*]] = icmp eq i32 [[I_016]], [[N_VEC]]
 ; SINK-AFTER-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP3]], label [[SCALAR_PH]]
 ; SINK-AFTER:       scalar.ph:
-; SINK-AFTER-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[I_016]], [[FOR_COND1_PREHEADER]] ]
 ; SINK-AFTER-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ [[E_015]], [[FOR_COND1_PREHEADER]] ]
+; SINK-AFTER-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[I_016]], [[FOR_COND1_PREHEADER]] ]
 ; SINK-AFTER-NEXT:    br label [[FOR_COND1:%.*]]
 ; SINK-AFTER:       for.cond.cleanup:
 ; SINK-AFTER-NEXT:    [[E_1_LCSSA_LCSSA:%.*]] = phi i32 [ [[E_1_LCSSA]], [[FOR_COND_CLEANUP3]] ]
@@ -1780,8 +1780,8 @@ define void @sink_after(ptr noalias %a, ptr noalias %b, i64 %n) {
 ; UNROLL-NO-IC-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
 ; UNROLL-NO-IC-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
 ; UNROLL-NO-IC:       scalar.ph:
-; UNROLL-NO-IC-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; UNROLL-NO-IC-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ [[DOTPRE]], [[ENTRY]] ]
+; UNROLL-NO-IC-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ [[DOTPRE]], [[ENTRY:%.*]] ]
+; UNROLL-NO-IC-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
 ; UNROLL-NO-IC-NEXT:    br label [[FOR_BODY:%.*]]
 ; UNROLL-NO-IC:       for.body:
 ; UNROLL-NO-IC-NEXT:    [[TMP17:%.*]] = phi i16 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[TMP18:%.*]], [[FOR_BODY]] ]
@@ -1836,8 +1836,8 @@ define void @sink_after(ptr noalias %a, ptr noalias %b, i64 %n) {
 ; UNROLL-NO-VF-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
 ; UNROLL-NO-VF-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
 ; UNROLL-NO-VF:       scalar.ph:
-; UNROLL-NO-VF-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; UNROLL-NO-VF-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ [[TMP7]], [[MIDDLE_BLOCK]] ], [ [[DOTPRE]], [[ENTRY]] ]
+; UNROLL-NO-VF-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ [[TMP7]], [[MIDDLE_BLOCK]] ], [ [[DOTPRE]], [[ENTRY:%.*]] ]
+; UNROLL-NO-VF-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
 ; UNROLL-NO-VF-NEXT:    br label [[FOR_BODY:%.*]]
 ; UNROLL-NO-VF:       for.body:
 ; UNROLL-NO-VF-NEXT:    [[TMP17:%.*]] = phi i16 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[TMP18:%.*]], [[FOR_BODY]] ]
@@ -1888,8 +1888,8 @@ define void @sink_after(ptr noalias %a, ptr noalias %b, i64 %n) {
 ; SINK-AFTER-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
 ; SINK-AFTER-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
 ; SINK-AFTER:       scalar.ph:
-; SINK-AFTER-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; SINK-AFTER-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ [[DOTPRE]], [[ENTRY]] ]
+; SINK-AFTER-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ [[DOTPRE]], [[ENTRY:%.*]] ]
+; SINK-AFTER-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
 ; SINK-AFTER-NEXT:    br label [[FOR_BODY:%.*]]
 ; SINK-AFTER:       for.body:
 ; SINK-AFTER-NEXT:    [[TMP11:%.*]] = phi i16 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[TMP12:%.*]], [[FOR_BODY]] ]
@@ -2016,8 +2016,8 @@ define void @PR34711(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64 %n) {
 ; UNROLL-NO-IC-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
 ; UNROLL-NO-IC-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
 ; UNROLL-NO-IC:       scalar.ph:
-; UNROLL-NO-IC-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; UNROLL-NO-IC-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ [[TMP30]], [[MIDDLE_BLOCK]] ], [ [[DOTPRE]], [[ENTRY]] ]
+; UNROLL-NO-IC-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ [[TMP30]], [[MIDDLE_BLOCK]] ], [ [[DOTPRE]], [[ENTRY:%.*]] ]
+; UNROLL-NO-IC-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
 ; UNROLL-NO-IC-NEXT:    br label [[FOR_BODY:%.*]]
 ; UNROLL-NO-IC:       for.body:
 ; UNROLL-NO-IC-NEXT:    [[TMP47:%.*]] = phi i16 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[TMP48:%.*]], [[FOR_BODY]] ]
@@ -2076,8 +2076,8 @@ define void @PR34711(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64 %n) {
 ; UNROLL-NO-VF-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
 ; UNROLL-NO-VF-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
 ; UNROLL-NO-VF:       scalar.ph:
-; UNROLL-NO-VF-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; UNROLL-NO-VF-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ [[TMP7]], [[MIDDLE_BLOCK]] ], [ [[DOTPRE]], [[ENTRY]] ]
+; UNROLL-NO-VF-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ [[TMP7]], [[MIDDLE_BLOCK]] ], [ [[DOTPRE]], [[ENTRY:%.*]] ]
+; UNROLL-NO-VF-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
 ; UNROLL-NO-VF-NEXT:    br label [[FOR_BODY:%.*]]
 ; UNROLL-NO-VF:       for.body:
 ; UNROLL-NO-VF-NEXT:    [[TMP17:%.*]] = phi i16 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[TMP18:%.*]], [[FOR_BODY]] ]
@@ -2143,8 +2143,8 @@ define void @PR34711(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64 %n) {
 ; SINK-AFTER-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
 ; SINK-AFTER-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
 ; SINK-AFTER:       scalar.ph:
-; SINK-AFTER-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; SINK-AFTER-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ [[TMP13]], [[MIDDLE_BLOCK]] ], [ [[DOTPRE]], [[ENTRY]] ]
+; SINK-AFTER-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ [[TMP13]], [[MIDDLE_BLOCK]] ], [ [[DOTPRE]], [[ENTRY:%.*]] ]
+; SINK-AFTER-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
 ; SINK-AFTER-NEXT:    br label [[FOR_BODY:%.*]]
 ; SINK-AFTER:       for.body:
 ; SINK-AFTER-NEXT:    [[TMP25:%.*]] = phi i16 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[TMP26:%.*]], [[FOR_BODY]] ]
@@ -2240,8 +2240,8 @@ define void @sink_after_with_multiple_users(ptr noalias %a, ptr noalias %b, i64
 ; UNROLL-NO-IC-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
 ; UNROLL-NO-IC-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
 ; UNROLL-NO-IC:       scalar.ph:
-; UNROLL-NO-IC-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; UNROLL-NO-IC-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ [[DOTPRE]], [[ENTRY]] ]
+; UNROLL-NO-IC-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ [[DOTPRE]], [[ENTRY:%.*]] ]
+; UNROLL-NO-IC-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
 ; UNROLL-NO-IC-NEXT:    br label [[FOR_BODY:%.*]]
 ; UNROLL-NO-IC:       for.body:
 ; UNROLL-NO-IC-NEXT:    [[TMP19:%.*]] = phi i16 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[TMP20:%.*]], [[FOR_BODY]] ]
@@ -2299,8 +2299,8 @@ define void @sink_after_with_multiple_users(ptr noalias %a, ptr noalias %b, i64
 ; UNROLL-NO-VF-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
 ; UNROLL-NO-VF-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
 ; UNROLL-NO-VF:       scalar.ph:
-; UNROLL-NO-VF-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; UNROLL-NO-VF-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ [[TMP7]], [[MIDDLE_BLOCK]] ], [ [[DOTPRE]], [[ENTRY]] ]
+; UNROLL-NO-VF-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ [[TMP7]], [[MIDDLE_BLOCK]] ], [ [[DOTPRE]], [[ENTRY:%.*]] ]
+; UNROLL-NO-VF-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
 ; UNROLL-NO-VF-NEXT:    br label [[FOR_BODY:%.*]]
 ; UNROLL-NO-VF:       for.body:
 ; UNROLL-NO-VF-NEXT:    [[TMP19:%.*]] = phi i16 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[TMP20:%.*]], [[FOR_BODY]] ]
@@ -2353,8 +2353,8 @@ define void @sink_after_with_multiple_users(ptr noalias %a, ptr noalias %b, i64
 ; SINK-AFTER-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
 ; SINK-AFTER-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
 ; SINK-AFTER:       scalar.ph:
-; SINK-AFTER-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; SINK-AFTER-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ [[DOTPRE]], [[ENTRY]] ]
+; SINK-AFTER-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ [[DOTPRE]], [[ENTRY:%.*]] ]
+; SINK-AFTER-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
 ; SINK-AFTER-NEXT:    br label [[FOR_BODY:%.*]]
 ; SINK-AFTER:       for.body:
 ; SINK-AFTER-NEXT:    [[TMP12:%.*]] = phi i16 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[TMP13:%.*]], [[FOR_BODY]] ]
@@ -2685,8 +2685,8 @@ define i32 @sink_into_replication_region(i32 %y) {
 ; UNROLL-NO-IC-NEXT:    [[N_RND_UP:%.*]] = add i32 [[TMP1]], 7
 ; UNROLL-NO-IC-NEXT:    [[N_MOD_VF:%.*]] = urem i32 [[N_RND_UP]], 8
 ; UNROLL-NO-IC-NEXT:    [[N_VEC:%.*]] = sub i32 [[N_RND_UP]], [[N_MOD_VF]]
-; UNROLL-NO-IC-NEXT:    [[IND_END:%.*]] = sub i32 [[Y]], [[N_VEC]]
 ; UNROLL-NO-IC-NEXT:    [[TRIP_COUNT_MINUS_1:%.*]] = sub i32 [[TMP1]], 1
+; UNROLL-NO-IC-NEXT:    [[IND_END:%.*]] = sub i32 [[Y]], [[N_VEC]]
 ; UNROLL-NO-IC-NEXT:    [[BROADCAST_SPLATINSERT5:%.*]] = insertelement <4 x i32> poison, i32 [[TRIP_COUNT_MINUS_1]], i64 0
 ; UNROLL-NO-IC-NEXT:    [[BROADCAST_SPLAT6:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT5]], <4 x i32> poison, <4 x i32> zeroinitializer
 ; UNROLL-NO-IC-NEXT:    br label [[VECTOR_BODY:%.*]]
@@ -2816,8 +2816,8 @@ define i32 @sink_into_replication_region(i32 %y) {
 ; UNROLL-NO-VF-NEXT:    [[N_RND_UP:%.*]] = add i32 [[TMP1]], 1
 ; UNROLL-NO-VF-NEXT:    [[N_MOD_VF:%.*]] = urem i32 [[N_RND_UP]], 2
 ; UNROLL-NO-VF-NEXT:    [[N_VEC:%.*]] = sub i32 [[N_RND_UP]], [[N_MOD_VF]]
-; UNROLL-NO-VF-NEXT:    [[IND_END:%.*]] = sub i32 [[Y]], [[N_VEC]]
 ; UNROLL-NO-VF-NEXT:    [[TRIP_COUNT_MINUS_1:%.*]] = sub i32 [[TMP1]], 1
+; UNROLL-NO-VF-NEXT:    [[IND_END:%.*]] = sub i32 [[Y]], [[N_VEC]]
 ; UNROLL-NO-VF-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; UNROLL-NO-VF:       vector.body:
 ; UNROLL-NO-VF-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_UDIV_CONTINUE4:%.*]] ]
@@ -2881,8 +2881,8 @@ define i32 @sink_into_replication_region(i32 %y) {
 ; SINK-AFTER-NEXT:    [[N_RND_UP:%.*]] = add i32 [[TMP1]], 3
 ; SINK-AFTER-NEXT:    [[N_MOD_VF:%.*]] = urem i32 [[N_RND_UP]], 4
 ; SINK-AFTER-NEXT:    [[N_VEC:%.*]] = sub i32 [[N_RND_UP]], [[N_MOD_VF]]
-; SINK-AFTER-NEXT:    [[IND_END:%.*]] = sub i32 [[Y]], [[N_VEC]]
 ; SINK-AFTER-NEXT:    [[TRIP_COUNT_MINUS_1:%.*]] = sub i32 [[TMP1]], 1
+; SINK-AFTER-NEXT:    [[IND_END:%.*]] = sub i32 [[Y]], [[N_VEC]]
 ; SINK-AFTER-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i32> poison, i32 [[TRIP_COUNT_MINUS_1]], i64 0
 ; SINK-AFTER-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT1]], <4 x i32> poison, <4 x i32> zeroinitializer
 ; SINK-AFTER-NEXT:    br label [[VECTOR_BODY:%.*]]
@@ -2990,8 +2990,8 @@ define i32 @sink_into_replication_region_multiple(ptr %x, i32 %y) {
 ; UNROLL-NO-IC-NEXT:    [[N_RND_UP:%.*]] = add i32 [[TMP1]], 7
 ; UNROLL-NO-IC-NEXT:    [[N_MOD_VF:%.*]] = urem i32 [[N_RND_UP]], 8
 ; UNROLL-NO-IC-NEXT:    [[N_VEC:%.*]] = sub i32 [[N_RND_UP]], [[N_MOD_VF]]
-; UNROLL-NO-IC-NEXT:    [[IND_END:%.*]] = sub i32 [[Y]], [[N_VEC]]
 ; UNROLL-NO-IC-NEXT:    [[TRIP_COUNT_MINUS_1:%.*]] = sub i32 [[TMP1]], 1
+; UNROLL-NO-IC-NEXT:    [[IND_END:%.*]] = sub i32 [[Y]], [[N_VEC]]
 ; UNROLL-NO-IC-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[TRIP_COUNT_MINUS_1]], i64 0
 ; UNROLL-NO-IC-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
 ; UNROLL-NO-IC-NEXT:    br label [[VECTOR_BODY:%.*]]
@@ -3189,8 +3189,8 @@ define i32 @sink_into_replication_region_multiple(ptr %x, i32 %y) {
 ; UNROLL-NO-VF-NEXT:    [[N_RND_UP:%.*]] = add i32 [[TMP1]], 1
 ; UNROLL-NO-VF-NEXT:    [[N_MOD_VF:%.*]] = urem i32 [[N_RND_UP]], 2
 ; UNROLL-NO-VF-NEXT:    [[N_VEC:%.*]] = sub i32 [[N_RND_UP]], [[N_MOD_VF]]
-; UNROLL-NO-VF-NEXT:    [[IND_END:%.*]] = sub i32 [[Y]], [[N_VEC]]
 ; UNROLL-NO-VF-NEXT:    [[TRIP_COUNT_MINUS_1:%.*]] = sub i32 [[TMP1]], 1
+; UNROLL-NO-VF-NEXT:    [[IND_END:%.*]] = sub i32 [[Y]], [[N_VEC]]
 ; UNROLL-NO-VF-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; UNROLL-NO-VF:       vector.body:
 ; UNROLL-NO-VF-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE6:%.*]] ]
@@ -3273,8 +3273,8 @@ define i32 @sink_into_replication_region_multiple(ptr %x, i32 %y) {
 ; SINK-AFTER-NEXT:    [[N_RND_UP:%.*]] = add i32 [[TMP1]], 3
 ; SINK-AFTER-NEXT:    [[N_MOD_VF:%.*]] = urem i32 [[N_RND_UP]], 4
 ; SINK-AFTER-NEXT:    [[N_VEC:%.*]] = sub i32 [[N_RND_UP]], [[N_MOD_VF]]
-; SINK-AFTER-NEXT:    [[IND_END:%.*]] = sub i32 [[Y]], [[N_VEC]]
 ; SINK-AFTER-NEXT:    [[TRIP_COUNT_MINUS_1:%.*]] = sub i32 [[TMP1]], 1
+; SINK-AFTER-NEXT:    [[IND_END:%.*]] = sub i32 [[Y]], [[N_VEC]]
 ; SINK-AFTER-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[TRIP_COUNT_MINUS_1]], i64 0
 ; SINK-AFTER-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
 ; SINK-AFTER-NEXT:    br label [[VECTOR_BODY:%.*]]
@@ -3707,13 +3707,8 @@ define i32 @recurence_uniform_load(ptr %src, ptr noalias %dst) {
 ; UNROLL-NO-IC:       vector.ph:
 ; UNROLL-NO-IC-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; UNROLL-NO-IC:       vector.body:
-; UNROLL-NO-IC-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; UNROLL-NO-IC-NEXT:    [[VECTOR_RECUR:%.*]] = phi <4 x i32> [ <i32 poison, i32 poison, i32 poison, i32 0>, [[VECTOR_PH]] ], [ [[BROADCAST_SPLAT:%.*]], [[VECTOR_BODY]] ]
 ; UNROLL-NO-IC-NEXT:    [[TMP0:%.*]] = load i32, ptr [[SRC:%.*]], align 4
-; UNROLL-NO-IC-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[TMP0]], i64 0
-; UNROLL-NO-IC-NEXT:    [[BROADCAST_SPLAT]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
-; UNROLL-NO-IC-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; UNROLL-NO-IC-NEXT:    br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP38:![0-9]+]]
+; UNROLL-NO-IC-NEXT:    br label [[MIDDLE_BLOCK:%.*]]
 ; UNROLL-NO-IC:       middle.block:
 ; UNROLL-NO-IC-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; UNROLL-NO-IC:       scalar.ph:
@@ -3726,7 +3721,7 @@ define i32 @recurence_uniform_load(ptr %src, ptr noalias %dst) {
 ; UNROLL-NO-IC-NEXT:    [[ADD]] = add i64 [[PHI]], 1
 ; UNROLL-NO-IC-NEXT:    [[LOAD]] = load i32, ptr [[SRC]], align 4
 ; UNROLL-NO-IC-NEXT:    [[ICMP:%.*]] = icmp ult i64 [[PHI]], 1
-; UNROLL-NO-IC-NEXT:    br i1 [[ICMP]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP39:![0-9]+]]
+; UNROLL-NO-IC-NEXT:    br i1 [[ICMP]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP38:![0-9]+]]
 ; UNROLL-NO-IC:       exit:
 ; UNROLL-NO-IC-NEXT:    ret i32 0
 ;
@@ -3736,11 +3731,8 @@ define i32 @recurence_uniform_load(ptr %src, ptr noalias %dst) {
 ; UNROLL-NO-VF:       vector.ph:
 ; UNROLL-NO-VF-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; UNROLL-NO-VF:       vector.body:
-; UNROLL-NO-VF-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; UNROLL-NO-VF-NEXT:    [[VECTOR_RECUR:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP0:%.*]], [[VECTOR_BODY]] ]
-; UNROLL-NO-VF-NEXT:    [[TMP0]] = load i32, ptr [[SRC:%.*]], align 4
-; UNROLL-NO-VF-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
-; UNROLL-NO-VF-NEXT:    br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP38:![0-9]+]]
+; UNROLL-NO-VF-NEXT:    [[TMP0:%.*]] = load i32, ptr [[SRC:%.*]], align 4
+; UNROLL-NO-VF-NEXT:    br label [[MIDDLE_BLOCK:%.*]]
 ; UNROLL-NO-VF:       middle.block:
 ; UNROLL-NO-VF-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; UNROLL-NO-VF:       scalar.ph:
@@ -3753,7 +3745,7 @@ define i32 @recurence_uniform_load(ptr %src, ptr noalias %dst) {
 ; UNROLL-NO-VF-NEXT:    [[ADD]] = add i64 [[PHI]], 1
 ; UNROLL-NO-VF-NEXT:    [[LOAD]] = load i32, ptr [[SRC]], align 4
 ; UNROLL-NO-VF-NEXT:    [[ICMP:%.*]] = icmp ult i64 [[PHI]], 1
-; UNROLL-NO-VF-NEXT:    br i1 [[ICMP]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP39:![0-9]+]]
+; UNROLL-NO-VF-NEXT:    br i1 [[ICMP]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP38:![0-9]+]]
 ; UNROLL-NO-VF:       exit:
 ; UNROLL-NO-VF-NEXT:    ret i32 0
 ;
@@ -3763,13 +3755,8 @@ define i32 @recurence_uniform_load(ptr %src, ptr noalias %dst) {
 ; SINK-AFTER:       vector.ph:
 ; SINK-AFTER-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; SINK-AFTER:       vector.body:
-; SINK-AFTER-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; SINK-AFTER-NEXT:    [[VECTOR_RECUR:%.*]] = phi <4 x i32> [ <i32 poison, i32 poison, i32 poison, i32 0>, [[VECTOR_PH]] ], [ [[BROADCAST_SPLAT:%.*]], [[VECTOR_BODY]] ]
 ; SINK-AFTER-NEXT:    [[TMP0:%.*]] = load i32, ptr [[SRC:%.*]], align 4
-; SINK-AFTER-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[TMP0]], i64 0
-; SINK-AFTER-NEXT:    [[BROADCAST_SPLAT]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
-; SINK-AFTER-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; SINK-AFTER-NEXT:    br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP38:![0-9]+]]
+; SINK-AFTER-NEXT:    br label [[MIDDLE_BLOCK:%.*]]
 ; SINK-AFTER:       middle.block:
 ; SINK-AFTER-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; SINK-AFTER:       scalar.ph:
@@ -3782,7 +3769,7 @@ define i32 @recurence_uniform_load(ptr %src, ptr noalias %dst) {
 ; SINK-AFTER-NEXT:    [[ADD]] = add i64 [[PHI]], 1
 ; SINK-AFTER-NEXT:    [[LOAD]] = load i32, ptr [[SRC]], align 4
 ; SINK-AFTER-NEXT:    [[ICMP:%.*]] = icmp ult i64 [[PHI]], 1
-; SINK-AFTER-NEXT:    br i1 [[ICMP]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP39:![0-9]+]]
+; SINK-AFTER-NEXT:    br i1 [[ICMP]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP38:![0-9]+]]
 ; SINK-AFTER:       exit:
 ; SINK-AFTER-NEXT:    ret i32 0
 ;
diff --git a/llvm/test/Transforms/LoopVectorize/if-pred-stores.ll b/llvm/test/Transforms/LoopVectorize/if-pred-stores.ll
index c4509e4..7db53d8 100644
--- a/llvm/test/Transforms/LoopVectorize/if-pred-stores.ll
+++ b/llvm/test/Transforms/LoopVectorize/if-pred-stores.ll
@@ -172,6 +172,7 @@ define void @bug18724(i1 %cond, ptr %ptr, i1 %cond.2, i64 %v.1, i32 %v.2) {
 ; UNROLL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP4]], 2
 ; UNROLL-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP4]], [[N_MOD_VF]]
 ; UNROLL-NEXT:    [[IND_END:%.*]] = add i64 [[V_1]], [[N_VEC]]
+; UNROLL-NEXT:    [[TMP13:%.*]] = xor i1 [[COND_2:%.*]], true
 ; UNROLL-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; UNROLL:       vector.body:
 ; UNROLL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE3:%.*]] ]
@@ -184,7 +185,7 @@ define void @bug18724(i1 %cond, ptr %ptr, i1 %cond.2, i64 %v.1, i32 %v.2) {
 ; UNROLL-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [768 x i32], ptr [[PTR]], i64 0, i64 [[TMP6]]
 ; UNROLL-NEXT:    [[TMP9:%.*]] = load i32, ptr [[TMP7]], align 4
 ; UNROLL-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 4
-; UNROLL-NEXT:    br i1 [[COND_2:%.*]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE3]]
+; UNROLL-NEXT:    br i1 [[COND_2]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE3]]
 ; UNROLL:       pred.store.if:
 ; UNROLL-NEXT:    store i32 [[TMP9]], ptr [[TMP7]], align 4
 ; UNROLL-NEXT:    store i32 [[TMP10]], ptr [[TMP8]], align 4
@@ -192,10 +193,8 @@ define void @bug18724(i1 %cond, ptr %ptr, i1 %cond.2, i64 %v.1, i32 %v.2) {
 ; UNROLL:       pred.store.continue3:
 ; UNROLL-NEXT:    [[TMP11:%.*]] = add i32 [[VEC_PHI]], 1
 ; UNROLL-NEXT:    [[TMP12:%.*]] = add i32 [[VEC_PHI1]], 1
-; UNROLL-NEXT:    [[TMP13:%.*]] = xor i1 [[COND_2]], true
-; UNROLL-NEXT:    [[TMP14:%.*]] = xor i1 [[COND_2]], true
 ; UNROLL-NEXT:    [[PREDPHI]] = select i1 [[TMP13]], i32 [[VEC_PHI]], i32 [[TMP11]]
-; UNROLL-NEXT:    [[PREDPHI4]] = select i1 [[TMP14]], i32 [[VEC_PHI1]], i32 [[TMP12]]
+; UNROLL-NEXT:    [[PREDPHI4]] = select i1 [[TMP13]], i32 [[VEC_PHI1]], i32 [[TMP12]]
 ; UNROLL-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
 ; UNROLL-NEXT:    [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; UNROLL-NEXT:    br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
@@ -244,6 +243,7 @@ define void @bug18724(i1 %cond, ptr %ptr, i1 %cond.2, i64 %v.1, i32 %v.2) {
 ; UNROLL-NOSIMPLIFY-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP3]], 2
 ; UNROLL-NOSIMPLIFY-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP3]], [[N_MOD_VF]]
 ; UNROLL-NOSIMPLIFY-NEXT:    [[IND_END:%.*]] = add i64 [[V_1]], [[N_VEC]]
+; UNROLL-NOSIMPLIFY-NEXT:    [[TMP12:%.*]] = xor i1 [[COND_2:%.*]], true
 ; UNROLL-NOSIMPLIFY-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; UNROLL-NOSIMPLIFY:       vector.body:
 ; UNROLL-NOSIMPLIFY-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE3:%.*]] ]
@@ -256,7 +256,7 @@ define void @bug18724(i1 %cond, ptr %ptr, i1 %cond.2, i64 %v.1, i32 %v.2) {
 ; UNROLL-NOSIMPLIFY-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [768 x i32], ptr [[PTR]], i64 0, i64 [[TMP5]]
 ; UNROLL-NOSIMPLIFY-NEXT:    [[TMP8:%.*]] = load i32, ptr [[TMP6]], align 4
 ; UNROLL-NOSIMPLIFY-NEXT:    [[TMP9:%.*]] = load i32, ptr [[TMP7]], align 4
-; UNROLL-NOSIMPLIFY-NEXT:    br i1 [[COND_2:%.*]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
+; UNROLL-NOSIMPLIFY-NEXT:    br i1 [[COND_2]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
 ; UNROLL-NOSIMPLIFY:       pred.store.if:
 ; UNROLL-NOSIMPLIFY-NEXT:    store i32 [[TMP8]], ptr [[TMP6]], align 4
 ; UNROLL-NOSIMPLIFY-NEXT:    br label [[PRED_STORE_CONTINUE]]
@@ -268,10 +268,8 @@ define void @bug18724(i1 %cond, ptr %ptr, i1 %cond.2, i64 %v.1, i32 %v.2) {
 ; UNROLL-NOSIMPLIFY:       pred.store.continue3:
 ; UNROLL-NOSIMPLIFY-NEXT:    [[TMP10:%.*]] = add i32 [[VEC_PHI]], 1
 ; UNROLL-NOSIMPLIFY-NEXT:    [[TMP11:%.*]] = add i32 [[VEC_PHI1]], 1
-; UNROLL-NOSIMPLIFY-NEXT:    [[TMP12:%.*]] = xor i1 [[COND_2]], true
-; UNROLL-NOSIMPLIFY-NEXT:    [[TMP13:%.*]] = xor i1 [[COND_2]], true
 ; UNROLL-NOSIMPLIFY-NEXT:    [[PREDPHI]] = select i1 [[TMP12]], i32 [[VEC_PHI]], i32 [[TMP10]]
-; UNROLL-NOSIMPLIFY-NEXT:    [[PREDPHI4]] = select i1 [[TMP13]], i32 [[VEC_PHI1]], i32 [[TMP11]]
+; UNROLL-NOSIMPLIFY-NEXT:    [[PREDPHI4]] = select i1 [[TMP12]], i32 [[VEC_PHI1]], i32 [[TMP11]]
 ; UNROLL-NOSIMPLIFY-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
 ; UNROLL-NOSIMPLIFY-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; UNROLL-NOSIMPLIFY-NEXT:    br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
@@ -321,9 +319,10 @@ define void @bug18724(i1 %cond, ptr %ptr, i1 %cond.2, i64 %v.1, i32 %v.2) {
 ; VEC-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP4]], 2
 ; VEC-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP4]], [[N_MOD_VF]]
 ; VEC-NEXT:    [[IND_END:%.*]] = add i64 [[V_1]], [[N_VEC]]
-; VEC-NEXT:    [[TMP5:%.*]] = insertelement <2 x i32> zeroinitializer, i32 [[V_2:%.*]], i32 0
 ; VEC-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i1> poison, i1 [[COND_2:%.*]], i64 0
 ; VEC-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i1> [[BROADCAST_SPLATINSERT]], <2 x i1> poison, <2 x i32> zeroinitializer
+; VEC-NEXT:    [[TMP17:%.*]] = xor <2 x i1> [[BROADCAST_SPLAT]], splat (i1 true)
+; VEC-NEXT:    [[TMP5:%.*]] = insertelement <2 x i32> zeroinitializer, i32 [[V_2:%.*]], i32 0
 ; VEC-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; VEC:       vector.body:
 ; VEC-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE2:%.*]] ]
@@ -351,7 +350,6 @@ define void @bug18724(i1 %cond, ptr %ptr, i1 %cond.2, i64 %v.1, i32 %v.2) {
 ; VEC-NEXT:    br label [[PRED_STORE_CONTINUE2]]
 ; VEC:       pred.store.continue2:
 ; VEC-NEXT:    [[TMP16:%.*]] = add <2 x i32> [[VEC_PHI]], splat (i32 1)
-; VEC-NEXT:    [[TMP17:%.*]] = xor <2 x i1> [[BROADCAST_SPLAT]], splat (i1 true)
 ; VEC-NEXT:    [[PREDPHI]] = select <2 x i1> [[TMP17]], <2 x i32> [[VEC_PHI]], <2 x i32> [[TMP16]]
 ; VEC-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
 ; VEC-NEXT:    [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
diff --git a/llvm/test/Transforms/LoopVectorize/induction-step.ll b/llvm/test/Transforms/LoopVectorize/induction-step.ll
index ecb00d4..f553864 100644
--- a/llvm/test/Transforms/LoopVectorize/induction-step.ll
+++ b/llvm/test/Transforms/LoopVectorize/induction-step.ll
@@ -1,21 +1,39 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
 ; RUN: opt < %s -passes=loop-vectorize -force-vector-interleave=1 -force-vector-width=8 -S | FileCheck %s
 
 ; int int_inc;
 ;
-;int induction_with_global(int init, int *restrict A, int N) {
+;void induction_with_global(int init, int *restrict A, int N) {
 ;  int x = init;
 ;  for (int i=0;i<N;i++){
 ;    A[i] = x;
 ;    x += int_inc;
 ;  }
-;  return x;
 ;}
 
-; CHECK-LABEL: @induction_with_global(
-; CHECK:       for.body.lr.ph:
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+
+@int_inc = common global i32 0, align 4
+
+define void @induction_with_global(i32 %init, ptr noalias nocapture %A, i32 %N) {
+; CHECK-LABEL: define void @induction_with_global(
+; CHECK-SAME: i32 [[INIT:%.*]], ptr noalias nocapture [[A:%.*]], i32 [[N:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
 ; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr @int_inc, align 4
-; CHECK:       vector.ph:
-; CHECK:         [[DOTSPLATINSERT:%.*]] = insertelement <8 x i32> poison, i32 %init, i64 0
+; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[N]], -1
+; CHECK-NEXT:    [[TMP5:%.*]] = zext i32 [[TMP1]] to i64
+; CHECK-NEXT:    [[TMP2:%.*]] = add nuw nsw i64 [[TMP5]], 1
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP2]], 8
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK:       [[VECTOR_PH]]:
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP2]], 8
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP2]], [[N_MOD_VF]]
+; CHECK-NEXT:    [[DOTCAST:%.*]] = trunc i64 [[N_VEC]] to i32
+; CHECK-NEXT:    [[TMP3:%.*]] = mul i32 [[DOTCAST]], [[TMP0]]
+; CHECK-NEXT:    [[TMP4:%.*]] = add i32 [[INIT]], [[TMP3]]
+; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <8 x i32> poison, i32 [[INIT]], i64 0
 ; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <8 x i32> [[DOTSPLATINSERT]], <8 x i32> poison, <8 x i32> zeroinitializer
 ; CHECK-NEXT:    [[DOTSPLATINSERT2:%.*]] = insertelement <8 x i32> poison, i32 [[TMP0]], i64 0
 ; CHECK-NEXT:    [[DOTSPLAT3:%.*]] = shufflevector <8 x i32> [[DOTSPLATINSERT2]], <8 x i32> poison, <8 x i32> zeroinitializer
@@ -24,53 +42,56 @@
 ; CHECK-NEXT:    [[TMP7:%.*]] = mul i32 [[TMP0]], 8
 ; CHECK-NEXT:    [[DOTSPLATINSERT5:%.*]] = insertelement <8 x i32> poison, i32 [[TMP7]], i64 0
 ; CHECK-NEXT:    [[DOTSPLAT6:%.*]] = shufflevector <8 x i32> [[DOTSPLATINSERT5]], <8 x i32> poison, <8 x i32> zeroinitializer
-; CHECK-NEXT:    br label %vector.body
-; CHECK:       vector.body:
-; CHECK-NEXT:    %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
-; CHECK-NEXT:    %vec.ind = phi <8 x i32> [ [[INDUCTION4]], %vector.ph ], [ %vec.ind.next, %vector.body ]
-; CHECK:         [[TMP8:%.*]] = add i64 %index, 0
-; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP8]]
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <8 x i32> [ [[INDUCTION4]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP8:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP8]]
 ; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i32 0
-; CHECK-NEXT:    store <8 x i32> %vec.ind, ptr [[TMP10]], align 4
-; CHECK:         %index.next = add nuw i64 %index, 8
-; CHECK-NEXT:    %vec.ind.next = add <8 x i32> %vec.ind, [[DOTSPLAT6]]
-; CHECK:         br i1 {{.*}}, label %middle.block, label %vector.body
-
-target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
-
-
-@int_inc = common global i32 0, align 4
-
-define i32 @induction_with_global(i32 %init, ptr noalias nocapture %A, i32 %N) {
+; CHECK-NEXT:    store <8 x i32> [[VEC_IND]], ptr [[TMP10]], align 4
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
+; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <8 x i32> [[VEC_IND]], [[DOTSPLAT6]]
+; CHECK-NEXT:    [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK:       [[MIDDLE_BLOCK]]:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK:       [[SCALAR_PH]]:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL5:%.*]] = phi i32 [ [[TMP4]], %[[MIDDLE_BLOCK]] ], [ [[INIT]], %[[ENTRY]] ]
+; CHECK-NEXT:    br label %[[FOR_BODY:.*]]
+; CHECK:       [[FOR_BODY]]:
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ]
+; CHECK-NEXT:    [[X_05:%.*]] = phi i32 [ [[BC_RESUME_VAL5]], %[[SCALAR_PH]] ], [ [[ADD:%.*]], %[[FOR_BODY]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    store i32 [[X_05]], ptr [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[ADD]] = add nsw i32 [[TMP0]], [[X_05]]
+; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT:    [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
+; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], [[N]]
+; CHECK-NEXT:    br i1 [[EXITCOND]], label %[[EXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    ret void
+;
 entry:
-  %cmp4 = icmp sgt i32 %N, 0
-  br i1 %cmp4, label %for.body.lr.ph, label %for.end
-
-for.body.lr.ph:                                   ; preds = %entry
   %0 = load i32, ptr @int_inc, align 4
-  %1 = mul i32 %0, %N
   br label %for.body
 
-for.body:                                         ; preds = %for.body, %for.body.lr.ph
-  %indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %for.body ]
-  %x.05 = phi i32 [ %init, %for.body.lr.ph ], [ %add, %for.body ]
-  %arrayidx = getelementptr inbounds i32, ptr %A, i64 %indvars.iv
+for.body:                                         ; preds = %for.body, %entry
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %x.05 = phi i32 [ %init, %entry ], [ %add, %for.body ]
+  %arrayidx = getelementptr inbounds i32, ptr %A, i64 %iv
   store i32 %x.05, ptr %arrayidx, align 4
   %add = add nsw i32 %0, %x.05
-  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
-  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
-  %exitcond = icmp eq i32 %lftr.wideiv, %N
-  br i1 %exitcond, label %for.end.loopexit, label %for.body
-
-for.end.loopexit:                                 ; preds = %for.body
-  %2 = add i32 %1, %init
-  br label %for.end
-
-for.end:                                          ; preds = %for.end.loopexit, %entry
-  %x.0.lcssa = phi i32 [ %init, %entry ], [ %2, %for.end.loopexit ]
-  ret i32 %x.0.lcssa
-}
+  %iv.next = add nuw nsw i64 %iv, 1
+  %iv.next.trunc = trunc i64 %iv.next to i32
+  %exitcond = icmp eq i32 %iv.next.trunc, %N
+  br i1 %exitcond, label %exit, label %for.body
 
+exit:
+  ret void
+}
 
 ;int induction_with_loop_inv(int init, int *restrict A, int N, int M) {
 ;  int x = init;
@@ -83,82 +104,123 @@ for.end:                                          ; preds = %for.end.loopexit, %
 ;  return x;
 ;}
 
-; CHECK-LABEL: @induction_with_loop_inv(
-; CHECK:       vector.ph:
-; CHECK:         [[DOTSPLATINSERT:%.*]] = insertelement <8 x i32> poison, i32 %x.011, i64 0
+define i32 @induction_with_loop_inv(i32 %init, ptr noalias nocapture %A, i32 %N, i32 %M) {
+; CHECK-LABEL: define i32 @induction_with_loop_inv(
+; CHECK-SAME: i32 [[INIT:%.*]], ptr noalias nocapture [[A:%.*]], i32 [[N:%.*]], i32 [[M:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    [[TMP3:%.*]] = add i32 [[N]], -1
+; CHECK-NEXT:    [[TMP11:%.*]] = zext i32 [[TMP3]] to i64
+; CHECK-NEXT:    [[TMP0:%.*]] = add nuw nsw i64 [[TMP11]], 1
+; CHECK-NEXT:    br label %[[OUTER_HEADER:.*]]
+; CHECK:       [[OUTER_HEADER]]:
+; CHECK-NEXT:    [[INDVARS_IV15:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[INDVARS_IV_NEXT16:%.*]], %[[OUTER_LATCH:.*]] ]
+; CHECK-NEXT:    [[J_012:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[INC5:%.*]], %[[OUTER_LATCH]] ]
+; CHECK-NEXT:    [[X_011:%.*]] = phi i32 [ [[INIT]], %[[ENTRY]] ], [ [[X_0_LCSSA:%.*]], %[[OUTER_LATCH]] ]
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 8
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK:       [[VECTOR_PH]]:
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 8
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]]
+; CHECK-NEXT:    [[DOTCAST:%.*]] = trunc i64 [[N_VEC]] to i32
+; CHECK-NEXT:    [[TMP1:%.*]] = mul i32 [[DOTCAST]], [[J_012]]
+; CHECK-NEXT:    [[TMP2:%.*]] = add i32 [[X_011]], [[TMP1]]
+; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <8 x i32> poison, i32 [[X_011]], i64 0
 ; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <8 x i32> [[DOTSPLATINSERT]], <8 x i32> poison, <8 x i32> zeroinitializer
-; CHECK-NEXT:    [[DOTSPLATINSERT2:%.*]] = insertelement <8 x i32> poison, i32 %j.012, i64 0
+; CHECK-NEXT:    [[DOTSPLATINSERT2:%.*]] = insertelement <8 x i32> poison, i32 [[J_012]], i64 0
 ; CHECK-NEXT:    [[DOTSPLAT3:%.*]] = shufflevector <8 x i32> [[DOTSPLATINSERT2]], <8 x i32> poison, <8 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP4:%.*]] = mul <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>, [[DOTSPLAT3]]
 ; CHECK-NEXT:    [[INDUCTION4:%.*]] = add <8 x i32> [[DOTSPLAT]], [[TMP4]]
-; CHECK-NEXT:    [[TMP5:%.*]] = mul i32 %j.012, 8
+; CHECK-NEXT:    [[TMP5:%.*]] = mul i32 [[J_012]], 8
 ; CHECK-NEXT:    [[DOTSPLATINSERT5:%.*]] = insertelement <8 x i32> poison, i32 [[TMP5]], i64 0
 ; CHECK-NEXT:    [[DOTSPLAT6:%.*]] = shufflevector <8 x i32> [[DOTSPLATINSERT5]], <8 x i32> poison, <8 x i32> zeroinitializer
-; CHECK-NEXT:    br label %vector.body
-; CHECK:       vector.body:
-; CHECK-NEXT:    %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
-; CHECK-NEXT:    %vec.ind = phi <8 x i32> [ [[INDUCTION4]], %vector.ph ], [ %vec.ind.next, %vector.body ]
-; CHECK:         [[TMP6:%.*]] = add i64 %index, 0
-; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP6]]
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <8 x i32> [ [[INDUCTION4]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP6:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP6]]
 ; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 0
-; CHECK-NEXT:    store <8 x i32> %vec.ind, ptr [[TMP8]], align 4
-; CHECK:         %index.next = add nuw i64 %index, 8
-; CHECK-NEXT:    %vec.ind.next = add <8 x i32> %vec.ind, [[DOTSPLAT6]]
-; CHECK:         br i1 {{.*}}, label %middle.block, label %vector.body
-
-define i32 @induction_with_loop_inv(i32 %init, ptr noalias nocapture %A, i32 %N, i32 %M) {
+; CHECK-NEXT:    store <8 x i32> [[VEC_IND]], ptr [[TMP8]], align 4
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
+; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <8 x i32> [[VEC_IND]], [[DOTSPLAT6]]
+; CHECK-NEXT:    [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK:       [[MIDDLE_BLOCK]]:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label %[[INNER_EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK:       [[SCALAR_PH]]:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[OUTER_HEADER]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL5:%.*]] = phi i32 [ [[TMP2]], %[[MIDDLE_BLOCK]] ], [ [[X_011]], %[[OUTER_HEADER]] ]
+; CHECK-NEXT:    br label %[[INNER:.*]]
+; CHECK:       [[INNER]]:
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[INNER]] ]
+; CHECK-NEXT:    [[X_18:%.*]] = phi i32 [ [[BC_RESUME_VAL5]], %[[SCALAR_PH]] ], [ [[ADD:%.*]], %[[INNER]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    store i32 [[X_18]], ptr [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[ADD]] = add nsw i32 [[X_18]], [[J_012]]
+; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT:    [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
+; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], [[N]]
+; CHECK-NEXT:    br i1 [[EXITCOND]], label %[[INNER_EXIT]], label %[[INNER]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK:       [[INNER_EXIT]]:
+; CHECK-NEXT:    [[TMP9:%.*]] = add i32 [[X_011]], [[INDVARS_IV15]]
+; CHECK-NEXT:    br label %[[OUTER_LATCH]]
+; CHECK:       [[OUTER_LATCH]]:
+; CHECK-NEXT:    [[X_0_LCSSA]] = phi i32 [ [[TMP9]], %[[INNER_EXIT]] ]
+; CHECK-NEXT:    [[INC5]] = add nuw nsw i32 [[J_012]], 1
+; CHECK-NEXT:    [[INDVARS_IV_NEXT16]] = add i32 [[INDVARS_IV15]], [[N]]
+; CHECK-NEXT:    [[EXITCOND17:%.*]] = icmp eq i32 [[INC5]], [[M]]
+; CHECK-NEXT:    br i1 [[EXITCOND17]], label %[[EXIT:.*]], label %[[OUTER_HEADER]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    ret i32 [[X_0_LCSSA]]
+;
 entry:
-  %cmp10 = icmp sgt i32 %M, 0
-  br i1 %cmp10, label %for.cond1.preheader.lr.ph, label %for.end6
-
-for.cond1.preheader.lr.ph:                        ; preds = %entry
-  %cmp27 = icmp sgt i32 %N, 0
-  br label %for.cond1.preheader
-
-for.cond1.preheader:                              ; preds = %for.inc4, %for.cond1.preheader.lr.ph
-  %indvars.iv15 = phi i32 [ 0, %for.cond1.preheader.lr.ph ], [ %indvars.iv.next16, %for.inc4 ]
-  %j.012 = phi i32 [ 0, %for.cond1.preheader.lr.ph ], [ %inc5, %for.inc4 ]
-  %x.011 = phi i32 [ %init, %for.cond1.preheader.lr.ph ], [ %x.1.lcssa, %for.inc4 ]
-  br i1 %cmp27, label %for.body3.preheader, label %for.inc4
-
-for.body3.preheader:                              ; preds = %for.cond1.preheader
-  br label %for.body3
-
-for.body3:                                        ; preds = %for.body3.preheader, %for.body3
-  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body3 ], [ 0, %for.body3.preheader ]
-  %x.18 = phi i32 [ %add, %for.body3 ], [ %x.011, %for.body3.preheader ]
-  %arrayidx = getelementptr inbounds i32, ptr %A, i64 %indvars.iv
+  br label %outer.header
+
+outer.header:
+  %outer.iv = phi i32 [ 0, %entry ], [ %outer.iv.next, %outer.latch ]
+  %j.012 = phi i32 [ 0, %entry ], [ %inc5, %outer.latch ]
+  %x.011 = phi i32 [ %init, %entry ], [ %x.1.lcssa, %outer.latch ]
+  br label %inner
+
+inner:
+  %iv = phi i64 [ 0, %outer.header ], [ %iv.next, %inner ]
+  %x.18 = phi i32 [ %x.011, %outer.header ], [ %add, %inner ]
+  %arrayidx = getelementptr inbounds i32, ptr %A, i64 %iv
   store i32 %x.18, ptr %arrayidx, align 4
   %add = add nsw i32 %x.18, %j.012
-  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
-  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
-  %exitcond = icmp eq i32 %lftr.wideiv, %N
-  br i1 %exitcond, label %for.inc4.loopexit, label %for.body3
+  %iv.next = add nuw nsw i64 %iv, 1
+  %iv.next.trunc = trunc i64 %iv.next to i32
+  %inner.ec = icmp eq i32 %iv.next.trunc, %N
+  br i1 %inner.ec, label %inner.exit, label %inner
 
-for.inc4.loopexit:                                ; preds = %for.body3
-  %0 = add i32 %x.011, %indvars.iv15
-  br label %for.inc4
+inner.exit:
+  %add.ivs  = add i32 %x.011, %outer.iv
+  br label %outer.latch
 
-for.inc4:                                         ; preds = %for.inc4.loopexit, %for.cond1.preheader
-  %x.1.lcssa = phi i32 [ %x.011, %for.cond1.preheader ], [ %0, %for.inc4.loopexit ]
+outer.latch:
+  %x.1.lcssa = phi i32 [ %add.ivs, %inner.exit ]
   %inc5 = add nuw nsw i32 %j.012, 1
-  %indvars.iv.next16 = add i32 %indvars.iv15, %N
-  %exitcond17 = icmp eq i32 %inc5, %M
-  br i1 %exitcond17, label %for.end6.loopexit, label %for.cond1.preheader
-
-for.end6.loopexit:                                ; preds = %for.inc4
-  %x.1.lcssa.lcssa = phi i32 [ %x.1.lcssa, %for.inc4 ]
-  br label %for.end6
+  %outer.iv.next = add i32 %outer.iv, %N
+  %outer.ec = icmp eq i32 %inc5, %M
+  br i1 %outer.ec, label %exit, label %outer.header
 
-for.end6:                                         ; preds = %for.end6.loopexit, %entry
-  %x.0.lcssa = phi i32 [ %init, %entry ], [ %x.1.lcssa.lcssa, %for.end6.loopexit ]
-  ret i32 %x.0.lcssa
+exit:
+  ret i32 %x.1.lcssa
 }
 
-
-; CHECK-LABEL: @non_primary_iv_loop_inv_trunc(
-; CHECK:       vector.ph:
-; CHECK:         [[TMP3:%.*]] = trunc i64 %step to i32
+define void @non_primary_iv_loop_inv_trunc(ptr %a, i64 %n, i64 %step) {
+; CHECK-LABEL: define void @non_primary_iv_loop_inv_trunc(
+; CHECK-SAME: ptr [[A:%.*]], i64 [[N:%.*]], i64 [[STEP:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N]], i64 1)
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[SMAX]], 8
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK:       [[VECTOR_PH]]:
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[SMAX]], 8
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[SMAX]], [[N_MOD_VF]]
+; CHECK-NEXT:    [[TMP0:%.*]] = mul i64 [[N_VEC]], [[STEP]]
+; CHECK-NEXT:    [[TMP3:%.*]] = trunc i64 [[STEP]] to i32
 ; CHECK-NEXT:    [[DOTSPLATINSERT5:%.*]] = insertelement <8 x i32> poison, i32 [[TMP3]], i64 0
 ; CHECK-NEXT:    [[DOTSPLAT6:%.*]] = shufflevector <8 x i32> [[DOTSPLATINSERT5]], <8 x i32> poison, <8 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP4:%.*]] = mul <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>, [[DOTSPLAT6]]
@@ -166,19 +228,38 @@ for.end6:                                         ; preds = %for.end6.loopexit,
 ; CHECK-NEXT:    [[TMP5:%.*]] = mul i32 [[TMP3]], 8
 ; CHECK-NEXT:    [[DOTSPLATINSERT8:%.*]] = insertelement <8 x i32> poison, i32 [[TMP5]], i64 0
 ; CHECK-NEXT:    [[DOTSPLAT9:%.*]] = shufflevector <8 x i32> [[DOTSPLATINSERT8]], <8 x i32> poison, <8 x i32> zeroinitializer
-; CHECK-NEXT:    br label %vector.body
-; CHECK:       vector.body:
-; CHECK-NEXT:    %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
-; CHECK:         [[VEC_IND10:%.*]] = phi <8 x i32> [ [[INDUCTION7]], %vector.ph ], [ [[VEC_IND_NEXT11:%.*]], %vector.body ]
-; CHECK:         [[TMP6:%.*]] = add i64 %index, 0
-; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP6]]
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_IND10:%.*]] = phi <8 x i32> [ [[INDUCTION7]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT11:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP6:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP6]]
 ; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 0
 ; CHECK-NEXT:    store <8 x i32> [[VEC_IND10]], ptr [[TMP8]], align 4
-; CHECK-NEXT:    %index.next = add nuw i64 %index, 8
-; CHECK:         [[VEC_IND_NEXT11]] = add <8 x i32> [[VEC_IND10]], [[DOTSPLAT9]]
-; CHECK:         br i1 {{.*}}, label %middle.block, label %vector.body
-
-define void @non_primary_iv_loop_inv_trunc(ptr %a, i64 %n, i64 %step) {
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
+; CHECK-NEXT:    [[VEC_IND_NEXT11]] = add <8 x i32> [[VEC_IND10]], [[DOTSPLAT9]]
+; CHECK-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK:       [[MIDDLE_BLOCK]]:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[SMAX]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label %[[FOR_END:.*]], label %[[SCALAR_PH]]
+; CHECK:       [[SCALAR_PH]]:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL3:%.*]] = phi i64 [ [[TMP0]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT:    br label %[[FOR_BODY:.*]]
+; CHECK:       [[FOR_BODY]]:
+; CHECK-NEXT:    [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], %[[FOR_BODY]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ]
+; CHECK-NEXT:    [[J:%.*]] = phi i64 [ [[J_NEXT:%.*]], %[[FOR_BODY]] ], [ [[BC_RESUME_VAL3]], %[[SCALAR_PH]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[I]]
+; CHECK-NEXT:    [[TMP3:%.*]] = trunc i64 [[J]] to i32
+; CHECK-NEXT:    store i32 [[TMP3]], ptr [[TMP0]], align 4
+; CHECK-NEXT:    [[I_NEXT]] = add nuw nsw i64 [[I]], 1
+; CHECK-NEXT:    [[J_NEXT]] = add nuw nsw i64 [[J]], [[STEP]]
+; CHECK-NEXT:    [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]]
+; CHECK-NEXT:    br i1 [[COND]], label %[[FOR_BODY]], label %[[FOR_END]], !llvm.loop [[LOOP7:![0-9]+]]
+; CHECK:       [[FOR_END]]:
+; CHECK-NEXT:    ret void
+;
 entry:
   br label %for.body
 
@@ -197,22 +278,43 @@ for.end:
   ret void
 }
 
-; CHECK-LABEL: @iv_no_binary_op_in_descriptor(
-; CHECK:       vector.ph:
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <8 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>, %vector.ph ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+
+define void @iv_no_binary_op_in_descriptor(i1 %c, ptr %dst) {
+; CHECK-LABEL: define void @iv_no_binary_op_in_descriptor(
+; CHECK-SAME: i1 [[C:%.*]], ptr [[DST:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK:       [[VECTOR_PH]]:
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <8 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[DST:%.*]], i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[DST]], i64 [[TMP0]]
 ; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 0
 ; CHECK-NEXT:    store <8 x i64> [[VEC_IND]], ptr [[TMP2]], align 8
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
 ; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <8 x i64> [[VEC_IND]], splat (i64 8)
 ; CHECK-NEXT:    [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[TMP3]], label %middle.block, label [[VECTOR_BODY]]
-
-define void @iv_no_binary_op_in_descriptor(i1 %c, ptr %dst) {
+; CHECK-NEXT:    br i1 [[TMP3]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; CHECK:       [[MIDDLE_BLOCK]]:
+; CHECK-NEXT:    br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK:       [[SCALAR_PH]]:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT:    br label %[[LOOP_HEADER:.*]]
+; CHECK:       [[LOOP_HEADER]]:
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT_P:%.*]], %[[LOOP_LATCH:.*]] ]
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds i64, ptr [[DST]], i64 [[IV]]
+; CHECK-NEXT:    store i64 [[IV]], ptr [[GEP]], align 8
+; CHECK-NEXT:    [[IV_NEXT:%.*]] = add i64 [[IV]], 1
+; CHECK-NEXT:    br label %[[LOOP_LATCH]]
+; CHECK:       [[LOOP_LATCH]]:
+; CHECK-NEXT:    [[IV_NEXT_P]] = phi i64 [ [[IV_NEXT]], %[[LOOP_HEADER]] ]
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT_P]], 1000
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP9:![0-9]+]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    ret void
+;
 entry:
   br label %loop.header
 
@@ -231,3 +333,162 @@ loop.latch:
 exit:
   ret void
 }
+
+define void @wide_add_induction_step_live_in(ptr %dst, i64 %N, i16 %off) {
+; CHECK-LABEL: define void @wide_add_induction_step_live_in(
+; CHECK-SAME: ptr [[DST:%.*]], i64 [[N:%.*]], i16 [[OFF:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    [[O_1:%.*]] = add i16 [[OFF]], 2
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 8
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK:       [[VECTOR_PH]]:
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], 8
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; CHECK-NEXT:    [[DOTCAST:%.*]] = trunc i64 [[N_VEC]] to i16
+; CHECK-NEXT:    [[TMP0:%.*]] = mul i16 [[DOTCAST]], [[O_1]]
+; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <8 x i16> poison, i16 [[O_1]], i64 0
+; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <8 x i16> [[DOTSPLATINSERT]], <8 x i16> poison, <8 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP1:%.*]] = mul <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, [[DOTSPLAT]]
+; CHECK-NEXT:    [[INDUCTION:%.*]] = add <8 x i16> zeroinitializer, [[TMP1]]
+; CHECK-NEXT:    [[TMP2:%.*]] = mul i16 [[O_1]], 8
+; CHECK-NEXT:    [[DOTSPLATINSERT1:%.*]] = insertelement <8 x i16> poison, i16 [[TMP2]], i64 0
+; CHECK-NEXT:    [[DOTSPLAT2:%.*]] = shufflevector <8 x i16> [[DOTSPLATINSERT1]], <8 x i16> poison, <8 x i32> zeroinitializer
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i16> poison, i16 [[O_1]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i16> [[BROADCAST_SPLATINSERT]], <8 x i16> poison, <8 x i32> zeroinitializer
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <8 x i16> [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP3:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT:    [[TMP4:%.*]] = add <8 x i16> [[VEC_IND]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i16, ptr [[DST]], i64 [[TMP3]]
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i16, ptr [[TMP5]], i32 0
+; CHECK-NEXT:    store <8 x i16> [[TMP4]], ptr [[TMP6]], align 2
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
+; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <8 x i16> [[VEC_IND]], [[DOTSPLAT2]]
+; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+; CHECK:       [[MIDDLE_BLOCK]]:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK:       [[SCALAR_PH]]:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL3:%.*]] = phi i16 [ [[TMP0]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT:    br label %[[LOOP:.*]]
+; CHECK:       [[LOOP]]:
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[IV_2:%.*]] = phi i16 [ [[BC_RESUME_VAL3]], %[[SCALAR_PH]] ], [ [[ADD:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[ADD]] = add i16 [[IV_2]], [[O_1]]
+; CHECK-NEXT:    [[GEP_DST:%.*]] = getelementptr inbounds i16, ptr [[DST]], i64 [[IV]]
+; CHECK-NEXT:    store i16 [[ADD]], ptr [[GEP_DST]], align 2
+; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; CHECK-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP11:![0-9]+]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %o.1 = add i16 %off, 2
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %iv.2 = phi i16 [ 0, %entry ], [ %add, %loop ]
+  %add = add i16 %iv.2, %o.1
+  %gep.dst = getelementptr inbounds i16, ptr %dst, i64 %iv
+  store i16 %add, ptr %gep.dst, align 2
+  %iv.next = add nuw nsw i64 %iv, 1
+  %ec = icmp eq i64 %iv.next, %N
+  br i1 %ec , label %exit, label %loop
+
+exit:
+  ret void
+}
+
+define void @wide_sub_induction_step_live_in(ptr %dst, i64 %N, i16 %off) {
+; CHECK-LABEL: define void @wide_sub_induction_step_live_in(
+; CHECK-SAME: ptr [[DST:%.*]], i64 [[N:%.*]], i16 [[OFF:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    [[O_1:%.*]] = add i16 [[OFF]], 2
+; CHECK-NEXT:    [[TMP0:%.*]] = sub i16 -2, [[OFF]]
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 8
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK:       [[VECTOR_PH]]:
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], 8
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; CHECK-NEXT:    [[DOTCAST:%.*]] = trunc i64 [[N_VEC]] to i16
+; CHECK-NEXT:    [[TMP1:%.*]] = mul i16 [[DOTCAST]], [[TMP0]]
+; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <8 x i16> poison, i16 [[TMP0]], i64 0
+; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <8 x i16> [[DOTSPLATINSERT]], <8 x i16> poison, <8 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP2:%.*]] = mul <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, [[DOTSPLAT]]
+; CHECK-NEXT:    [[INDUCTION:%.*]] = add <8 x i16> zeroinitializer, [[TMP2]]
+; CHECK-NEXT:    [[TMP3:%.*]] = mul i16 [[TMP0]], 8
+; CHECK-NEXT:    [[DOTSPLATINSERT1:%.*]] = insertelement <8 x i16> poison, i16 [[TMP3]], i64 0
+; CHECK-NEXT:    [[DOTSPLAT2:%.*]] = shufflevector <8 x i16> [[DOTSPLATINSERT1]], <8 x i16> poison, <8 x i32> zeroinitializer
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i16> poison, i16 [[O_1]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i16> [[BROADCAST_SPLATINSERT]], <8 x i16> poison, <8 x i32> zeroinitializer
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <8 x i16> [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP4:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = sub <8 x i16> [[VEC_IND]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i16, ptr [[DST]], i64 [[TMP4]]
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i16, ptr [[TMP6]], i32 0
+; CHECK-NEXT:    store <8 x i16> [[TMP5]], ptr [[TMP7]], align 2
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
+; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <8 x i16> [[VEC_IND]], [[DOTSPLAT2]]
+; CHECK-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
+; CHECK:       [[MIDDLE_BLOCK]]:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK:       [[SCALAR_PH]]:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL3:%.*]] = phi i16 [ [[TMP1]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT:    br label %[[LOOP:.*]]
+; CHECK:       [[LOOP]]:
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[IV_2:%.*]] = phi i16 [ [[BC_RESUME_VAL3]], %[[SCALAR_PH]] ], [ [[SUB:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[SUB]] = sub i16 [[IV_2]], [[O_1]]
+; CHECK-NEXT:    [[GEP_DST:%.*]] = getelementptr inbounds i16, ptr [[DST]], i64 [[IV]]
+; CHECK-NEXT:    store i16 [[SUB]], ptr [[GEP_DST]], align 2
+; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; CHECK-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP13:![0-9]+]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %o.1 = add i16 %off, 2
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %iv.2 = phi i16 [ 0, %entry ], [ %sub, %loop ]
+  %sub = sub i16 %iv.2, %o.1
+  %gep.dst = getelementptr inbounds i16, ptr %dst, i64 %iv
+  store i16 %sub, ptr %gep.dst, align 2
+  %iv.next = add nuw nsw i64 %iv, 1
+  %ec = icmp eq i64 %iv.next, %N
+  br i1 %ec , label %exit, label %loop
+
+exit:
+  ret void
+}
+;.
+; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
+; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
+; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]}
+; CHECK: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]}
+; CHECK: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]}
+; CHECK: [[LOOP7]] = distinct !{[[LOOP7]], [[META2]], [[META1]]}
+; CHECK: [[LOOP8]] = distinct !{[[LOOP8]], [[META1]], [[META2]]}
+; CHECK: [[LOOP9]] = distinct !{[[LOOP9]], [[META2]], [[META1]]}
+; CHECK: [[LOOP10]] = distinct !{[[LOOP10]], [[META1]], [[META2]]}
+; CHECK: [[LOOP11]] = distinct !{[[LOOP11]], [[META2]], [[META1]]}
+; CHECK: [[LOOP12]] = distinct !{[[LOOP12]], [[META1]], [[META2]]}
+; CHECK: [[LOOP13]] = distinct !{[[LOOP13]], [[META2]], [[META1]]}
+;.
diff --git a/llvm/test/Transforms/LoopVectorize/induction.ll b/llvm/test/Transforms/LoopVectorize/induction.ll
index 28c1c2a..96311de 100644
--- a/llvm/test/Transforms/LoopVectorize/induction.ll
+++ b/llvm/test/Transforms/LoopVectorize/induction.ll
@@ -1962,6 +1962,7 @@ define i32 @scalarize_induction_variable_05(ptr %a, i32 %x, i1 %c, i32 %n) {
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i32 [[SMAX]], [[N_MOD_VF]]
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i1> poison, i1 [[C:%.*]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i1> [[BROADCAST_SPLATINSERT]], <2 x i1> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP14:%.*]] = xor <2 x i1> [[BROADCAST_SPLAT]], splat (i1 true)
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_UDIV_CONTINUE2:%.*]] ]
@@ -1989,7 +1990,6 @@ define i32 @scalarize_induction_variable_05(ptr %a, i32 %x, i1 %c, i32 %n) {
 ; CHECK-NEXT:    br label [[PRED_UDIV_CONTINUE2]]
 ; CHECK:       pred.udiv.continue2:
 ; CHECK-NEXT:    [[TMP13:%.*]] = phi <2 x i32> [ [[TMP7]], [[PRED_UDIV_CONTINUE]] ], [ [[TMP12]], [[PRED_UDIV_IF1]] ]
-; CHECK-NEXT:    [[TMP14:%.*]] = xor <2 x i1> [[BROADCAST_SPLAT]], splat (i1 true)
 ; CHECK-NEXT:    [[PREDPHI:%.*]] = select <2 x i1> [[TMP14]], <2 x i32> [[WIDE_LOAD]], <2 x i32> [[TMP13]]
 ; CHECK-NEXT:    [[TMP15]] = add <2 x i32> [[PREDPHI]], [[VEC_PHI]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
@@ -2030,6 +2030,8 @@ define i32 @scalarize_induction_variable_05(ptr %a, i32 %x, i1 %c, i32 %n) {
 ; IND:       vector.ph:
 ; IND-NEXT:    [[N_VEC:%.*]] = and i32 [[SMAX]], 2147483646
 ; IND-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i1> poison, i1 [[C:%.*]], i64 0
+; IND-NEXT:    [[TMP11:%.*]] = xor <2 x i1> [[BROADCAST_SPLATINSERT]], <i1 true, i1 poison>
+; IND-NEXT:    [[TMP12:%.*]] = shufflevector <2 x i1> [[TMP11]], <2 x i1> poison, <2 x i32> zeroinitializer
 ; IND-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; IND:       vector.body:
 ; IND-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_UDIV_CONTINUE2:%.*]] ]
@@ -2054,8 +2056,6 @@ define i32 @scalarize_induction_variable_05(ptr %a, i32 %x, i1 %c, i32 %n) {
 ; IND-NEXT:    br label [[PRED_UDIV_CONTINUE2]]
 ; IND:       pred.udiv.continue2:
 ; IND-NEXT:    [[TMP10:%.*]] = phi <2 x i32> [ [[TMP5]], [[PRED_UDIV_CONTINUE]] ], [ [[TMP9]], [[PRED_UDIV_IF1]] ]
-; IND-NEXT:    [[TMP11:%.*]] = xor <2 x i1> [[BROADCAST_SPLATINSERT]], <i1 true, i1 poison>
-; IND-NEXT:    [[TMP12:%.*]] = shufflevector <2 x i1> [[TMP11]], <2 x i1> poison, <2 x i32> zeroinitializer
 ; IND-NEXT:    [[PREDPHI:%.*]] = select <2 x i1> [[TMP12]], <2 x i32> [[WIDE_LOAD]], <2 x i32> [[TMP10]]
 ; IND-NEXT:    [[TMP13]] = add <2 x i32> [[PREDPHI]], [[VEC_PHI]]
 ; IND-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
@@ -2097,7 +2097,8 @@ define i32 @scalarize_induction_variable_05(ptr %a, i32 %x, i1 %c, i32 %n) {
 ; UNROLL:       vector.ph:
 ; UNROLL-NEXT:    [[N_VEC:%.*]] = and i32 [[SMAX]], 2147483644
 ; UNROLL-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i1> poison, i1 [[C:%.*]], i64 0
-; UNROLL-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i1> [[BROADCAST_SPLATINSERT]], <2 x i1> poison, <2 x i32> zeroinitializer
+; UNROLL-NEXT:    [[TMP27:%.*]] = xor <2 x i1> [[BROADCAST_SPLATINSERT]], <i1 true, i1 poison>
+; UNROLL-NEXT:    [[TMP28:%.*]] = shufflevector <2 x i1> [[TMP27]], <2 x i1> poison, <2 x i32> zeroinitializer
 ; UNROLL-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; UNROLL:       vector.body:
 ; UNROLL-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_UDIV_CONTINUE8:%.*]] ]
@@ -2143,8 +2144,8 @@ define i32 @scalarize_induction_variable_05(ptr %a, i32 %x, i1 %c, i32 %n) {
 ; UNROLL-NEXT:    br label [[PRED_UDIV_CONTINUE8]]
 ; UNROLL:       pred.udiv.continue8:
 ; UNROLL-NEXT:    [[TMP21:%.*]] = phi <2 x i32> [ [[TMP16]], [[PRED_UDIV_CONTINUE6]] ], [ [[TMP20]], [[PRED_UDIV_IF7]] ]
-; UNROLL-NEXT:    [[PREDPHI:%.*]] = select <2 x i1> [[BROADCAST_SPLAT]], <2 x i32> [[TMP11]], <2 x i32> [[WIDE_LOAD]]
-; UNROLL-NEXT:    [[PREDPHI9:%.*]] = select <2 x i1> [[BROADCAST_SPLAT]], <2 x i32> [[TMP21]], <2 x i32> [[WIDE_LOAD2]]
+; UNROLL-NEXT:    [[PREDPHI:%.*]] = select <2 x i1> [[TMP28]], <2 x i32> [[WIDE_LOAD]], <2 x i32> [[TMP11]]
+; UNROLL-NEXT:    [[PREDPHI9:%.*]] = select <2 x i1> [[TMP28]], <2 x i32> [[WIDE_LOAD2]], <2 x i32> [[TMP21]]
 ; UNROLL-NEXT:    [[TMP22]] = add <2 x i32> [[PREDPHI]], [[VEC_PHI]]
 ; UNROLL-NEXT:    [[TMP23]] = add <2 x i32> [[PREDPHI9]], [[VEC_PHI1]]
 ; UNROLL-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
@@ -2189,6 +2190,7 @@ define i32 @scalarize_induction_variable_05(ptr %a, i32 %x, i1 %c, i32 %n) {
 ; UNROLL-NO-IC-NEXT:    [[N_VEC:%.*]] = sub i32 [[SMAX]], [[N_MOD_VF]]
 ; UNROLL-NO-IC-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i1> poison, i1 [[C:%.*]], i64 0
 ; UNROLL-NO-IC-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i1> [[BROADCAST_SPLATINSERT]], <2 x i1> poison, <2 x i32> zeroinitializer
+; UNROLL-NO-IC-NEXT:    [[TMP27:%.*]] = xor <2 x i1> [[BROADCAST_SPLAT]], splat (i1 true)
 ; UNROLL-NO-IC-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; UNROLL-NO-IC:       vector.body:
 ; UNROLL-NO-IC-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_UDIV_CONTINUE8:%.*]] ]
@@ -2239,10 +2241,8 @@ define i32 @scalarize_induction_variable_05(ptr %a, i32 %x, i1 %c, i32 %n) {
 ; UNROLL-NO-IC-NEXT:    br label [[PRED_UDIV_CONTINUE8]]
 ; UNROLL-NO-IC:       pred.udiv.continue8:
 ; UNROLL-NO-IC-NEXT:    [[TMP26:%.*]] = phi <2 x i32> [ [[TMP20]], [[PRED_UDIV_CONTINUE6]] ], [ [[TMP25]], [[PRED_UDIV_IF7]] ]
-; UNROLL-NO-IC-NEXT:    [[TMP27:%.*]] = xor <2 x i1> [[BROADCAST_SPLAT]], splat (i1 true)
-; UNROLL-NO-IC-NEXT:    [[TMP28:%.*]] = xor <2 x i1> [[BROADCAST_SPLAT]], splat (i1 true)
 ; UNROLL-NO-IC-NEXT:    [[PREDPHI:%.*]] = select <2 x i1> [[TMP27]], <2 x i32> [[WIDE_LOAD]], <2 x i32> [[TMP14]]
-; UNROLL-NO-IC-NEXT:    [[PREDPHI9:%.*]] = select <2 x i1> [[TMP28]], <2 x i32> [[WIDE_LOAD2]], <2 x i32> [[TMP26]]
+; UNROLL-NO-IC-NEXT:    [[PREDPHI9:%.*]] = select <2 x i1> [[TMP27]], <2 x i32> [[WIDE_LOAD2]], <2 x i32> [[TMP26]]
 ; UNROLL-NO-IC-NEXT:    [[TMP29]] = add <2 x i32> [[PREDPHI]], [[VEC_PHI]]
 ; UNROLL-NO-IC-NEXT:    [[TMP30]] = add <2 x i32> [[PREDPHI9]], [[VEC_PHI1]]
 ; UNROLL-NO-IC-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
@@ -2284,7 +2284,8 @@ define i32 @scalarize_induction_variable_05(ptr %a, i32 %x, i1 %c, i32 %n) {
 ; INTERLEAVE:       vector.ph:
 ; INTERLEAVE-NEXT:    [[N_VEC:%.*]] = and i32 [[SMAX]], 2147483640
 ; INTERLEAVE-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i1> poison, i1 [[C:%.*]], i64 0
-; INTERLEAVE-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i1> [[BROADCAST_SPLATINSERT]], <4 x i1> poison, <4 x i32> zeroinitializer
+; INTERLEAVE-NEXT:    [[TMP47:%.*]] = xor <4 x i1> [[BROADCAST_SPLATINSERT]], <i1 true, i1 poison, i1 poison, i1 poison>
+; INTERLEAVE-NEXT:    [[TMP48:%.*]] = shufflevector <4 x i1> [[TMP47]], <4 x i1> poison, <4 x i32> zeroinitializer
 ; INTERLEAVE-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; INTERLEAVE:       vector.body:
 ; INTERLEAVE-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_UDIV_CONTINUE16:%.*]] ]
@@ -2366,8 +2367,8 @@ define i32 @scalarize_induction_variable_05(ptr %a, i32 %x, i1 %c, i32 %n) {
 ; INTERLEAVE-NEXT:    br label [[PRED_UDIV_CONTINUE16]]
 ; INTERLEAVE:       pred.udiv.continue16:
 ; INTERLEAVE-NEXT:    [[TMP41:%.*]] = phi <4 x i32> [ [[TMP36]], [[PRED_UDIV_CONTINUE14]] ], [ [[TMP40]], [[PRED_UDIV_IF15]] ]
-; INTERLEAVE-NEXT:    [[PREDPHI:%.*]] = select <4 x i1> [[BROADCAST_SPLAT]], <4 x i32> [[TMP21]], <4 x i32> [[WIDE_LOAD]]
-; INTERLEAVE-NEXT:    [[PREDPHI17:%.*]] = select <4 x i1> [[BROADCAST_SPLAT]], <4 x i32> [[TMP41]], <4 x i32> [[WIDE_LOAD2]]
+; INTERLEAVE-NEXT:    [[PREDPHI:%.*]] = select <4 x i1> [[TMP48]], <4 x i32> [[WIDE_LOAD]], <4 x i32> [[TMP21]]
+; INTERLEAVE-NEXT:    [[PREDPHI17:%.*]] = select <4 x i1> [[TMP48]], <4 x i32> [[WIDE_LOAD2]], <4 x i32> [[TMP41]]
 ; INTERLEAVE-NEXT:    [[TMP42]] = add <4 x i32> [[PREDPHI]], [[VEC_PHI]]
 ; INTERLEAVE-NEXT:    [[TMP43]] = add <4 x i32> [[PREDPHI17]], [[VEC_PHI1]]
 ; INTERLEAVE-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8
@@ -2773,8 +2774,8 @@ define i32 @i8_loop() nounwind readnone ssp uwtable {
 ; CHECK-NEXT:    [[TMP2:%.*]] = call i32 @llvm.vector.reduce.and.v2i32(<2 x i32> [[TMP0]])
 ; CHECK-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i8 [ 0, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP2]], [[MIDDLE_BLOCK]] ], [ 1, [[ENTRY]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP2]], [[MIDDLE_BLOCK]] ], [ 1, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i8 [ 0, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
 ; CHECK-NEXT:    [[A_0:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[A_0_AND:%.*]], [[LOOP]] ]
@@ -2844,8 +2845,8 @@ define i32 @i8_loop() nounwind readnone ssp uwtable {
 ; UNROLL-NO-IC-NEXT:    [[TMP3:%.*]] = call i32 @llvm.vector.reduce.and.v2i32(<2 x i32> [[BIN_RDX]])
 ; UNROLL-NO-IC-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; UNROLL-NO-IC:       scalar.ph:
-; UNROLL-NO-IC-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i8 [ 0, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; UNROLL-NO-IC-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP3]], [[MIDDLE_BLOCK]] ], [ 1, [[ENTRY]] ]
+; UNROLL-NO-IC-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP3]], [[MIDDLE_BLOCK]] ], [ 1, [[ENTRY:%.*]] ]
+; UNROLL-NO-IC-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i8 [ 0, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
 ; UNROLL-NO-IC-NEXT:    br label [[LOOP:%.*]]
 ; UNROLL-NO-IC:       loop:
 ; UNROLL-NO-IC-NEXT:    [[A_0:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[A_0_AND:%.*]], [[LOOP]] ]
@@ -2910,8 +2911,8 @@ define i32 @i16_loop() nounwind readnone ssp uwtable {
 ; CHECK-NEXT:    [[TMP2:%.*]] = call i32 @llvm.vector.reduce.and.v2i32(<2 x i32> [[TMP0]])
 ; CHECK-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i16 [ 0, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP2]], [[MIDDLE_BLOCK]] ], [ 1, [[ENTRY]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP2]], [[MIDDLE_BLOCK]] ], [ 1, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i16 [ 0, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
 ; CHECK-NEXT:    [[A_0:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[A_0_AND:%.*]], [[LOOP]] ]
@@ -2981,8 +2982,8 @@ define i32 @i16_loop() nounwind readnone ssp uwtable {
 ; UNROLL-NO-IC-NEXT:    [[TMP3:%.*]] = call i32 @llvm.vector.reduce.and.v2i32(<2 x i32> [[BIN_RDX]])
 ; UNROLL-NO-IC-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; UNROLL-NO-IC:       scalar.ph:
-; UNROLL-NO-IC-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i16 [ 0, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; UNROLL-NO-IC-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP3]], [[MIDDLE_BLOCK]] ], [ 1, [[ENTRY]] ]
+; UNROLL-NO-IC-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP3]], [[MIDDLE_BLOCK]] ], [ 1, [[ENTRY:%.*]] ]
+; UNROLL-NO-IC-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i16 [ 0, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
 ; UNROLL-NO-IC-NEXT:    br label [[LOOP:%.*]]
 ; UNROLL-NO-IC:       loop:
 ; UNROLL-NO-IC-NEXT:    [[A_0:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[A_0_AND:%.*]], [[LOOP]] ]
@@ -3051,8 +3052,8 @@ define i32 @max_i32_backedgetaken() nounwind readnone ssp uwtable {
 ; CHECK-NEXT:    [[TMP2:%.*]] = call i32 @llvm.vector.reduce.and.v2i32(<2 x i32> [[TMP0]])
 ; CHECK-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ 0, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP2]], [[MIDDLE_BLOCK]] ], [ 1, [[ENTRY]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP2]], [[MIDDLE_BLOCK]] ], [ 1, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ 0, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
 ; CHECK-NEXT:    [[A_0:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[A_0_AND:%.*]], [[LOOP]] ]
@@ -3122,8 +3123,8 @@ define i32 @max_i32_backedgetaken() nounwind readnone ssp uwtable {
 ; UNROLL-NO-IC-NEXT:    [[TMP3:%.*]] = call i32 @llvm.vector.reduce.and.v2i32(<2 x i32> [[BIN_RDX]])
 ; UNROLL-NO-IC-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; UNROLL-NO-IC:       scalar.ph:
-; UNROLL-NO-IC-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ 0, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; UNROLL-NO-IC-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP3]], [[MIDDLE_BLOCK]] ], [ 1, [[ENTRY]] ]
+; UNROLL-NO-IC-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP3]], [[MIDDLE_BLOCK]] ], [ 1, [[ENTRY:%.*]] ]
+; UNROLL-NO-IC-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ 0, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
 ; UNROLL-NO-IC-NEXT:    br label [[LOOP:%.*]]
 ; UNROLL-NO-IC:       loop:
 ; UNROLL-NO-IC-NEXT:    [[A_0:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[A_0_AND:%.*]], [[LOOP]] ]
@@ -5561,9 +5562,9 @@ define i64 @trunc_with_first_order_recurrence() {
 ; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <2 x i32> [[VEC_IND2]], i32 1
 ; CHECK-NEXT:    br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 113, [[MIDDLE_BLOCK]] ], [ 1, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP12]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 113, [[MIDDLE_BLOCK]] ], [ 1, [[ENTRY]] ]
 ; CHECK-NEXT:    [[BC_RESUME_VAL1:%.*]] = phi i32 [ 113, [[MIDDLE_BLOCK]] ], [ 1, [[ENTRY]] ]
-; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP12]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
 ; CHECK-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ 42, [[ENTRY]] ]
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       exit:
@@ -5781,9 +5782,9 @@ define i64 @trunc_with_first_order_recurrence() {
 ; UNROLL-NO-IC-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <2 x i32> [[STEP_ADD7]], i32 1
 ; UNROLL-NO-IC-NEXT:    br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; UNROLL-NO-IC:       scalar.ph:
-; UNROLL-NO-IC-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 113, [[MIDDLE_BLOCK]] ], [ 1, [[ENTRY:%.*]] ]
+; UNROLL-NO-IC-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP23]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; UNROLL-NO-IC-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 113, [[MIDDLE_BLOCK]] ], [ 1, [[ENTRY]] ]
 ; UNROLL-NO-IC-NEXT:    [[BC_RESUME_VAL1:%.*]] = phi i32 [ 113, [[MIDDLE_BLOCK]] ], [ 1, [[ENTRY]] ]
-; UNROLL-NO-IC-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP23]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
 ; UNROLL-NO-IC-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ 42, [[ENTRY]] ]
 ; UNROLL-NO-IC-NEXT:    br label [[LOOP:%.*]]
 ; UNROLL-NO-IC:       exit:
@@ -6236,9 +6237,9 @@ define void @test_optimized_cast_induction_feeding_first_order_recurrence(i64 %n
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY]] ]
 ; CHECK-NEXT:    [[BC_RESUME_VAL1:%.*]] = phi i32 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY]] ]
-; CHECK-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY]] ]
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
 ; CHECK-NEXT:    [[FOR:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[IV_2_CONV:%.*]], [[LOOP]] ]
@@ -6307,9 +6308,9 @@ define void @test_optimized_cast_induction_feeding_first_order_recurrence(i64 %n
 ; IND-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
 ; IND-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; IND:       scalar.ph:
-; IND-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY:%.*]] ]
+; IND-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY:%.*]] ]
+; IND-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY]] ]
 ; IND-NEXT:    [[BC_RESUME_VAL1:%.*]] = phi i32 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY]] ]
-; IND-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY]] ]
 ; IND-NEXT:    br label [[LOOP:%.*]]
 ; IND:       loop:
 ; IND-NEXT:    [[FOR:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[IV_2_CONV:%.*]], [[LOOP]] ]
@@ -6382,9 +6383,9 @@ define void @test_optimized_cast_induction_feeding_first_order_recurrence(i64 %n
 ; UNROLL-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
 ; UNROLL-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; UNROLL:       scalar.ph:
-; UNROLL-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY:%.*]] ]
+; UNROLL-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY:%.*]] ]
+; UNROLL-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY]] ]
 ; UNROLL-NEXT:    [[BC_RESUME_VAL1:%.*]] = phi i32 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY]] ]
-; UNROLL-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY]] ]
 ; UNROLL-NEXT:    br label [[LOOP:%.*]]
 ; UNROLL:       loop:
 ; UNROLL-NEXT:    [[FOR:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[IV_2_CONV:%.*]], [[LOOP]] ]
@@ -6463,9 +6464,9 @@ define void @test_optimized_cast_induction_feeding_first_order_recurrence(i64 %n
 ; UNROLL-NO-IC-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
 ; UNROLL-NO-IC-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; UNROLL-NO-IC:       scalar.ph:
-; UNROLL-NO-IC-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY:%.*]] ]
+; UNROLL-NO-IC-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY:%.*]] ]
+; UNROLL-NO-IC-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY]] ]
 ; UNROLL-NO-IC-NEXT:    [[BC_RESUME_VAL1:%.*]] = phi i32 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY]] ]
-; UNROLL-NO-IC-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY]] ]
 ; UNROLL-NO-IC-NEXT:    br label [[LOOP:%.*]]
 ; UNROLL-NO-IC:       loop:
 ; UNROLL-NO-IC-NEXT:    [[FOR:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[IV_2_CONV:%.*]], [[LOOP]] ]
@@ -6538,9 +6539,9 @@ define void @test_optimized_cast_induction_feeding_first_order_recurrence(i64 %n
 ; INTERLEAVE-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
 ; INTERLEAVE-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; INTERLEAVE:       scalar.ph:
-; INTERLEAVE-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY:%.*]] ]
+; INTERLEAVE-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY:%.*]] ]
+; INTERLEAVE-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY]] ]
 ; INTERLEAVE-NEXT:    [[BC_RESUME_VAL1:%.*]] = phi i32 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY]] ]
-; INTERLEAVE-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY]] ]
 ; INTERLEAVE-NEXT:    br label [[LOOP:%.*]]
 ; INTERLEAVE:       loop:
 ; INTERLEAVE-NEXT:    [[FOR:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[IV_2_CONV:%.*]], [[LOOP]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/interleave-and-scalarize-only.ll b/llvm/test/Transforms/LoopVectorize/interleave-and-scalarize-only.ll
index bb17580..5bc832f 100644
--- a/llvm/test/Transforms/LoopVectorize/interleave-and-scalarize-only.ll
+++ b/llvm/test/Transforms/LoopVectorize/interleave-and-scalarize-only.ll
@@ -14,6 +14,7 @@
 ; DBG-NEXT: Successor(s): vector.ph
 ; DBG-EMPTY:
 ; DBG-NEXT: vector.ph:
+; DBG-NEXT:   vp<[[END:%.+]]> = DERIVED-IV ir<%start> + vp<[[VEC_TC]]> * ir<1>
 ; DBG-NEXT: Successor(s): vector loop
 ; DBG-EMPTY:
 ; DBG-NEXT: <x1> vector loop: {
@@ -76,6 +77,7 @@ declare i32 @llvm.smin.i32(i32, i32)
 ; DBG-NEXT: Successor(s): vector.ph
 ; DBG-EMPTY:
 ; DBG-NEXT: vector.ph:
+; DBG-NEXT:   vp<[[END:%.+]]> = DERIVED-IV ir<false> + vp<[[VEC_TC]]> * ir<true>
 ; DBG-NEXT: Successor(s): vector loop
 ; DBG-EMPTY:
 ; DBG-NEXT: <x1> vector loop: {
@@ -116,11 +118,13 @@ declare i32 @llvm.smin.i32(i32, i32)
 ; DBG-NEXT: Successor(s): ir-bb<exit>, scalar.ph
 ; DBG-EMPTY:
 ; DBG-NEXT: scalar.ph:
+; DBG-NEXT:  EMIT vp<[[RESUME1:%.+]]> = resume-phi vp<[[VEC_TC]]>, ir<0>
+; DBG-NEXT:  EMIT vp<[[RESUME2:%.+]]>.1 = resume-phi vp<[[END]]>, ir<false>
 ; DBG-NEXT: Successor(s): ir-bb<loop.header>
 ; DBG-EMPTY:
 ; DBG-NEXT: ir-bb<loop.header>:
-; DBG-NEXT:   IR   %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop.latch ]
-; DBG-NEXT:   IR   %d = phi i1 [ false, %entry ], [ %d.next, %loop.latch ]
+; DBG-NEXT:   IR   %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop.latch ] (extra operand: vp<[[RESUME1]]> from scalar.ph)
+; DBG-NEXT:   IR   %d = phi i1 [ false, %entry ], [ %d.next, %loop.latch ] (extra operand: vp<[[RESUME2]]>.1 from scalar.ph)
 ; DBG-NEXT:   IR   %d.next = xor i1 %d, true
 ; DBG-NEXT: No successors
 ; DBG-EMPTY:
@@ -222,11 +226,12 @@ exit:
 ; DBG-NEXT: Successor(s): ir-bb<exit>, scalar.ph
 ; DBG-EMPTY:
 ; DBG-NEXT: scalar.ph:
+; DBG-NEXT:  EMIT vp<[[RESUME_IV:%.+]]> = resume-phi vp<[[VTC]]>, ir<0>
 ; DBG-NEXT:  EMIT vp<[[RESUME_P:%.*]]> = resume-phi vp<[[RESUME_1]]>, ir<0>
 ; DBG-NEXT: Successor(s): ir-bb<loop>
 ; DBG-EMPTY:
 ; DBG-NEXT: ir-bb<loop>:
-; DBG-NEXT:   IR   %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+; DBG-NEXT:   IR   %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] (extra operand: vp<[[RESUME_IV]]> from scalar.ph)
 ; DBG-NEXT:   IR   %for = phi i32 [ 0, %entry ], [ %iv.trunc, %loop ] (extra operand: vp<[[RESUME_P]]> from scalar.ph)
 ; DBG:        IR   %ec = icmp slt i32 %iv.next.trunc, %n
 ; DBG-NEXT: No successors
diff --git a/llvm/test/Transforms/LoopVectorize/interleaved-accesses.ll b/llvm/test/Transforms/LoopVectorize/interleaved-accesses.ll
index 0e7a68c..abd91d3 100644
--- a/llvm/test/Transforms/LoopVectorize/interleaved-accesses.ll
+++ b/llvm/test/Transforms/LoopVectorize/interleaved-accesses.ll
@@ -1508,9 +1508,9 @@ define void @PR34743(ptr %a, ptr %b, i64 %n) {
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP1]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[END:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ [[DOTPRE]], [[VECTOR_MEMCHECK]] ], [ [[DOTPRE]], [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ENTRY]] ]
 ; CHECK-NEXT:    [[BC_RESUME_VAL3:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ENTRY]] ]
-; CHECK-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ [[DOTPRE]], [[VECTOR_MEMCHECK]] ], [ [[DOTPRE]], [[ENTRY]] ]
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
 ; CHECK-NEXT:    [[TMP16:%.*]] = phi i16 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[LOAD2:%.*]], [[LOOP]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/invariant-store-vectorization-2.ll b/llvm/test/Transforms/LoopVectorize/invariant-store-vectorization-2.ll
index bc1c1bf..e8ad6a3 100644
--- a/llvm/test/Transforms/LoopVectorize/invariant-store-vectorization-2.ll
+++ b/llvm/test/Transforms/LoopVectorize/invariant-store-vectorization-2.ll
@@ -134,12 +134,12 @@ define void @inv_val_store_to_inv_address_conditional_inv(ptr %a, i64 %n, ptr %b
 ; CHECK-NEXT:    br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    [[N_VEC:%.*]] = and i64 [[SMAX2]], 9223372036854775804
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x i1> poison, i1 [[CMP]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[NTRUNC]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x i1> poison, i1 [[CMP]], i64 3
-; CHECK-NEXT:    [[BROADCAST_SPLAT6:%.*]] = insertelement <4 x i32> poison, i32 [[K]], i64 3
+; CHECK-NEXT:    [[BROADCAST_SPLAT6:%.*]] = insertelement <4 x i32> poison, i32 [[K]], i64 0
 ; CHECK-NEXT:    [[PREDPHI:%.*]] = select <4 x i1> [[TMP1]], <4 x i32> [[BROADCAST_SPLAT]], <4 x i32> [[BROADCAST_SPLAT6]]
-; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x i32> [[PREDPHI]], i64 3
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x i32> [[PREDPHI]], i64 0
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/invariant-store-vectorization.ll b/llvm/test/Transforms/LoopVectorize/invariant-store-vectorization.ll
index 88be9fa..aef25a0 100644
--- a/llvm/test/Transforms/LoopVectorize/invariant-store-vectorization.ll
+++ b/llvm/test/Transforms/LoopVectorize/invariant-store-vectorization.ll
@@ -401,8 +401,8 @@ define i32 @multiple_uniform_stores(ptr nocapture %var1, ptr nocapture readonly
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP8]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_INC8_LOOPEXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[TMP4]], [[VECTOR_MEMCHECK]] ], [ [[TMP4]], [[FOR_BODY3_LR_PH]] ]
 ; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP19]], [[MIDDLE_BLOCK]] ], [ [[ARRAYIDX5_PROMOTED]], [[VECTOR_MEMCHECK]] ], [ [[ARRAYIDX5_PROMOTED]], [[FOR_BODY3_LR_PH]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[TMP4]], [[VECTOR_MEMCHECK]] ], [ [[TMP4]], [[FOR_BODY3_LR_PH]] ]
 ; CHECK-NEXT:    br label [[FOR_BODY3:%.*]]
 ; CHECK:       for.body3:
 ; CHECK-NEXT:    [[TMP20:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[TMP22:%.*]], [[FOR_BODY3]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/iv-select-cmp-nested-loop.ll b/llvm/test/Transforms/LoopVectorize/iv-select-cmp-nested-loop.ll
index 07ee589..681ffe9 100644
--- a/llvm/test/Transforms/LoopVectorize/iv-select-cmp-nested-loop.ll
+++ b/llvm/test/Transforms/LoopVectorize/iv-select-cmp-nested-loop.ll
@@ -40,8 +40,8 @@ define i64 @select_iv_def_from_outer_loop(ptr %a, i64 %start, i64 %n) {
 ; CHECK-VF4IC1-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
 ; CHECK-VF4IC1-NEXT:    br i1 [[CMP_N]], label %[[OUTER_LOOP_EXIT]], label %[[SCALAR_PH]]
 ; CHECK-VF4IC1:       [[SCALAR_PH]]:
-; CHECK-VF4IC1-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[OUTER_LOOP]] ]
 ; CHECK-VF4IC1-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i64 [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ], [ [[RDX_OUTER]], %[[OUTER_LOOP]] ]
+; CHECK-VF4IC1-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[OUTER_LOOP]] ]
 ; CHECK-VF4IC1-NEXT:    br label %[[INNER_LOOP:.*]]
 ; CHECK-VF4IC1:       [[INNER_LOOP]]:
 ; CHECK-VF4IC1-NEXT:    [[RDX_INNER:%.*]] = phi i64 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[SELECT:%.*]], %[[INNER_LOOP]] ]
@@ -114,8 +114,8 @@ define i64 @select_iv_def_from_outer_loop(ptr %a, i64 %start, i64 %n) {
 ; CHECK-VF4IC4-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
 ; CHECK-VF4IC4-NEXT:    br i1 [[CMP_N]], label %[[OUTER_LOOP_EXIT]], label %[[SCALAR_PH]]
 ; CHECK-VF4IC4:       [[SCALAR_PH]]:
-; CHECK-VF4IC4-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[OUTER_LOOP]] ]
 ; CHECK-VF4IC4-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i64 [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ], [ [[RDX_OUTER]], %[[OUTER_LOOP]] ]
+; CHECK-VF4IC4-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[OUTER_LOOP]] ]
 ; CHECK-VF4IC4-NEXT:    br label %[[INNER_LOOP:.*]]
 ; CHECK-VF4IC4:       [[INNER_LOOP]]:
 ; CHECK-VF4IC4-NEXT:    [[RDX_INNER:%.*]] = phi i64 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[SELECT:%.*]], %[[INNER_LOOP]] ]
@@ -189,8 +189,8 @@ define i64 @select_iv_def_from_outer_loop(ptr %a, i64 %start, i64 %n) {
 ; CHECK-VF1IC4-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
 ; CHECK-VF1IC4-NEXT:    br i1 [[CMP_N]], label %[[OUTER_LOOP_EXIT]], label %[[SCALAR_PH]]
 ; CHECK-VF1IC4:       [[SCALAR_PH]]:
-; CHECK-VF1IC4-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[OUTER_LOOP]] ]
 ; CHECK-VF1IC4-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i64 [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ], [ [[RDX_OUTER]], %[[OUTER_LOOP]] ]
+; CHECK-VF1IC4-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[OUTER_LOOP]] ]
 ; CHECK-VF1IC4-NEXT:    br label %[[INNER_LOOP:.*]]
 ; CHECK-VF1IC4:       [[INNER_LOOP]]:
 ; CHECK-VF1IC4-NEXT:    [[RDX_INNER:%.*]] = phi i64 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[SELECT:%.*]], %[[INNER_LOOP]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/iv_outside_user.ll b/llvm/test/Transforms/LoopVectorize/iv_outside_user.ll
index 10b6d1f..482f731 100644
--- a/llvm/test/Transforms/LoopVectorize/iv_outside_user.ll
+++ b/llvm/test/Transforms/LoopVectorize/iv_outside_user.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --replace-value-regex "!llvm.loop ![0-9]+" --version 5
 ; RUN: opt -S -passes=loop-vectorize -force-vector-interleave=1 -force-vector-width=2 < %s | FileCheck --check-prefixes=CHECK,VEC %s
-; RUN: opt -S -passes=loop-vectorize -force-vector-interleave=2 -force-vector-width=1 < %s | FileCheck --check-prefixes=CHECK %s
+; RUN: opt -S -passes=loop-vectorize -force-vector-interleave=2 -force-vector-width=1 < %s | FileCheck --check-prefixes=CHECK,INTERLEAVE %s
 
 define i32 @postinc(i32 %k)  {
 ; CHECK-LABEL: define i32 @postinc(
@@ -430,6 +430,39 @@ define i64 @iv_scalar_steps_and_outside_users(ptr %ptr) {
 ; VEC-NEXT:    [[IV_LCSSA:%.*]] = phi i64 [ [[IV]], %[[LOOP]] ], [ 1001, %[[MIDDLE_BLOCK]] ]
 ; VEC-NEXT:    ret i64 [[IV_LCSSA]]
 ;
+; INTERLEAVE-LABEL: define i64 @iv_scalar_steps_and_outside_users(
+; INTERLEAVE-SAME: ptr [[PTR:%.*]]) {
+; INTERLEAVE-NEXT:  [[ENTRY:.*]]:
+; INTERLEAVE-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; INTERLEAVE:       [[VECTOR_PH]]:
+; INTERLEAVE-NEXT:    br label %[[VECTOR_BODY:.*]]
+; INTERLEAVE:       [[VECTOR_BODY]]:
+; INTERLEAVE-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; INTERLEAVE-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; INTERLEAVE-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 1
+; INTERLEAVE-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[PTR]], i64 [[TMP0]]
+; INTERLEAVE-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[PTR]], i64 [[TMP1]]
+; INTERLEAVE-NEXT:    store i64 [[TMP0]], ptr [[TMP2]], align 4
+; INTERLEAVE-NEXT:    store i64 [[TMP1]], ptr [[TMP3]], align 4
+; INTERLEAVE-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; INTERLEAVE-NEXT:    [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1002
+; INTERLEAVE-NEXT:    br i1 [[TMP4]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], {{!llvm.loop ![0-9]+}}
+; INTERLEAVE:       [[MIDDLE_BLOCK]]:
+; INTERLEAVE-NEXT:    br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; INTERLEAVE:       [[SCALAR_PH]]:
+; INTERLEAVE-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1002, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; INTERLEAVE-NEXT:    br label %[[LOOP:.*]]
+; INTERLEAVE:       [[LOOP]]:
+; INTERLEAVE-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; INTERLEAVE-NEXT:    [[IV_NEXT]] = add nuw i64 [[IV]], 1
+; INTERLEAVE-NEXT:    [[GEP_PTR:%.*]] = getelementptr inbounds i64, ptr [[PTR]], i64 [[IV]]
+; INTERLEAVE-NEXT:    store i64 [[IV]], ptr [[GEP_PTR]], align 4
+; INTERLEAVE-NEXT:    [[EXITCOND:%.*]] = icmp ugt i64 [[IV]], 1000
+; INTERLEAVE-NEXT:    br i1 [[EXITCOND]], label %[[EXIT]], label %[[LOOP]], {{!llvm.loop ![0-9]+}}
+; INTERLEAVE:       [[EXIT]]:
+; INTERLEAVE-NEXT:    [[IV_LCSSA:%.*]] = phi i64 [ [[IV]], %[[LOOP]] ], [ 1001, %[[MIDDLE_BLOCK]] ]
+; INTERLEAVE-NEXT:    ret i64 [[IV_LCSSA]]
+;
 entry:
   br label %loop
 
@@ -485,6 +518,42 @@ define i32 @iv_2_dead_in_loop_only_used_outside(ptr %ptr) {
 ; VEC-NEXT:    [[IV_2_LCSSA:%.*]] = phi i32 [ [[IV_2]], %[[LOOP]] ], [ 2002, %[[MIDDLE_BLOCK]] ]
 ; VEC-NEXT:    ret i32 [[IV_2_LCSSA]]
 ;
+; INTERLEAVE-LABEL: define i32 @iv_2_dead_in_loop_only_used_outside(
+; INTERLEAVE-SAME: ptr [[PTR:%.*]]) {
+; INTERLEAVE-NEXT:  [[ENTRY:.*]]:
+; INTERLEAVE-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; INTERLEAVE:       [[VECTOR_PH]]:
+; INTERLEAVE-NEXT:    br label %[[VECTOR_BODY:.*]]
+; INTERLEAVE:       [[VECTOR_BODY]]:
+; INTERLEAVE-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; INTERLEAVE-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; INTERLEAVE-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 1
+; INTERLEAVE-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[PTR]], i64 [[TMP0]]
+; INTERLEAVE-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[PTR]], i64 [[TMP1]]
+; INTERLEAVE-NEXT:    store i64 [[TMP0]], ptr [[TMP2]], align 4
+; INTERLEAVE-NEXT:    store i64 [[TMP1]], ptr [[TMP3]], align 4
+; INTERLEAVE-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; INTERLEAVE-NEXT:    [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1002
+; INTERLEAVE-NEXT:    br i1 [[TMP4]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], {{!llvm.loop ![0-9]+}}
+; INTERLEAVE:       [[MIDDLE_BLOCK]]:
+; INTERLEAVE-NEXT:    br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; INTERLEAVE:       [[SCALAR_PH]]:
+; INTERLEAVE-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1002, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; INTERLEAVE-NEXT:    [[BC_RESUME_VAL1:%.*]] = phi i32 [ 2004, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; INTERLEAVE-NEXT:    br label %[[LOOP:.*]]
+; INTERLEAVE:       [[LOOP]]:
+; INTERLEAVE-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; INTERLEAVE-NEXT:    [[IV_2:%.*]] = phi i32 [ [[BC_RESUME_VAL1]], %[[SCALAR_PH]] ], [ [[IV_2_NEXT:%.*]], %[[LOOP]] ]
+; INTERLEAVE-NEXT:    [[IV_NEXT]] = add nuw i64 [[IV]], 1
+; INTERLEAVE-NEXT:    [[IV_2_NEXT]] = add nuw i32 [[IV_2]], 2
+; INTERLEAVE-NEXT:    [[GEP_PTR:%.*]] = getelementptr inbounds i64, ptr [[PTR]], i64 [[IV]]
+; INTERLEAVE-NEXT:    store i64 [[IV]], ptr [[GEP_PTR]], align 4
+; INTERLEAVE-NEXT:    [[EXITCOND:%.*]] = icmp ugt i64 [[IV]], 1000
+; INTERLEAVE-NEXT:    br i1 [[EXITCOND]], label %[[EXIT]], label %[[LOOP]], {{!llvm.loop ![0-9]+}}
+; INTERLEAVE:       [[EXIT]]:
+; INTERLEAVE-NEXT:    [[IV_2_LCSSA:%.*]] = phi i32 [ [[IV_2]], %[[LOOP]] ], [ 2002, %[[MIDDLE_BLOCK]] ]
+; INTERLEAVE-NEXT:    ret i32 [[IV_2_LCSSA]]
+;
 entry:
   br label %loop
 
@@ -502,3 +571,606 @@ exit:
   %iv.2.lcssa = phi i32 [ %iv.2, %loop ]
   ret i32 %iv.2.lcssa
 }
+
+define i32 @postinc_sub(i32 %k)  {
+; CHECK-LABEL: define i32 @postinc_sub(
+; CHECK-SAME: i32 [[K:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[K]], 2
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK:       [[VECTOR_PH]]:
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i32 [[K]], 2
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i32 [[K]], [[N_MOD_VF]]
+; CHECK-NEXT:    [[TMP0:%.*]] = sub i32 [[K]], [[N_VEC]]
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP1]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], {{!llvm.loop ![0-9]+}}
+; CHECK:       [[MIDDLE_BLOCK]]:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i32 [[K]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label %[[FOR_END:.*]], label %[[SCALAR_PH]]
+; CHECK:       [[SCALAR_PH]]:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[TMP0]], %[[MIDDLE_BLOCK]] ], [ [[K]], %[[ENTRY]] ]
+; CHECK-NEXT:    br label %[[FOR_BODY:.*]]
+; CHECK:       [[FOR_BODY]]:
+; CHECK-NEXT:    [[INC_PHI:%.*]] = phi i32 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INC:%.*]], %[[FOR_BODY]] ]
+; CHECK-NEXT:    [[INC]] = sub nsw i32 [[INC_PHI]], 1
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[INC]], 0
+; CHECK-NEXT:    br i1 [[CMP]], label %[[FOR_END]], label %[[FOR_BODY]], {{!llvm.loop ![0-9]+}}
+; CHECK:       [[FOR_END]]:
+; CHECK-NEXT:    [[INC_LCSSA:%.*]] = phi i32 [ [[INC]], %[[FOR_BODY]] ], [ [[TMP0]], %[[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    ret i32 [[INC_LCSSA]]
+;
+entry:
+  br label %for.body
+
+for.body:
+  %inc.phi = phi i32 [ %k, %entry ], [ %inc, %for.body ]
+  %inc = sub nsw i32 %inc.phi, 1
+  %cmp = icmp eq i32 %inc, 0
+  br i1 %cmp, label %for.end, label %for.body
+
+for.end:
+  ret i32 %inc
+}
+
+define i32 @postinc_swapped_ops(i32 %k)  {
+; CHECK-LABEL: define i32 @postinc_swapped_ops(
+; CHECK-SAME: i32 [[K:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[K]], 2
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK:       [[VECTOR_PH]]:
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i32 [[K]], 2
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i32 [[K]], [[N_MOD_VF]]
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
+; CHECK-NEXT:    [[TMP0:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP0]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], {{!llvm.loop ![0-9]+}}
+; CHECK:       [[MIDDLE_BLOCK]]:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i32 [[K]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label %[[FOR_END:.*]], label %[[SCALAR_PH]]
+; CHECK:       [[SCALAR_PH]]:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT:    br label %[[FOR_BODY:.*]]
+; CHECK:       [[FOR_BODY]]:
+; CHECK-NEXT:    [[INC_PHI:%.*]] = phi i32 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INC:%.*]], %[[FOR_BODY]] ]
+; CHECK-NEXT:    [[INC]] = add nsw i32 1, [[INC_PHI]]
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[INC]], [[K]]
+; CHECK-NEXT:    br i1 [[CMP]], label %[[FOR_END]], label %[[FOR_BODY]], {{!llvm.loop ![0-9]+}}
+; CHECK:       [[FOR_END]]:
+; CHECK-NEXT:    [[INC_LCSSA:%.*]] = phi i32 [ [[INC]], %[[FOR_BODY]] ], [ [[N_VEC]], %[[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    ret i32 [[INC_LCSSA]]
+;
+entry:
+  br label %for.body
+
+for.body:
+  %inc.phi = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+  %inc = add nsw i32 1, %inc.phi
+  %cmp = icmp eq i32 %inc, %k
+  br i1 %cmp, label %for.end, label %for.body
+
+for.end:
+  ret i32 %inc
+}
+
+define i32 @postinc_not_iv_backedge_value(i32 %k)  {
+; VEC-LABEL: define i32 @postinc_not_iv_backedge_value(
+; VEC-SAME: i32 [[K:%.*]]) {
+; VEC-NEXT:  [[ENTRY:.*]]:
+; VEC-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[K]], 2
+; VEC-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; VEC:       [[VECTOR_PH]]:
+; VEC-NEXT:    [[N_MOD_VF:%.*]] = urem i32 [[K]], 2
+; VEC-NEXT:    [[N_VEC:%.*]] = sub i32 [[K]], [[N_MOD_VF]]
+; VEC-NEXT:    br label %[[VECTOR_BODY:.*]]
+; VEC:       [[VECTOR_BODY]]:
+; VEC-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; VEC-NEXT:    [[VEC_IND:%.*]] = phi <2 x i32> [ <i32 0, i32 1>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; VEC-NEXT:    [[TMP0:%.*]] = add <2 x i32> [[VEC_IND]], splat (i32 2)
+; VEC-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
+; VEC-NEXT:    [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], splat (i32 2)
+; VEC-NEXT:    [[TMP1:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; VEC-NEXT:    br i1 [[TMP1]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], {{!llvm.loop ![0-9]+}}
+; VEC:       [[MIDDLE_BLOCK]]:
+; VEC-NEXT:    [[TMP2:%.*]] = extractelement <2 x i32> [[TMP0]], i32 1
+; VEC-NEXT:    [[CMP_N:%.*]] = icmp eq i32 [[K]], [[N_VEC]]
+; VEC-NEXT:    br i1 [[CMP_N]], label %[[FOR_END:.*]], label %[[SCALAR_PH]]
+; VEC:       [[SCALAR_PH]]:
+; VEC-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; VEC-NEXT:    br label %[[FOR_BODY:.*]]
+; VEC:       [[FOR_BODY]]:
+; VEC-NEXT:    [[INC_PHI:%.*]] = phi i32 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INC:%.*]], %[[FOR_BODY]] ]
+; VEC-NEXT:    [[INC]] = add nsw i32 [[INC_PHI]], 1
+; VEC-NEXT:    [[INC_2:%.*]] = add i32 [[INC_PHI]], 2
+; VEC-NEXT:    [[CMP:%.*]] = icmp eq i32 [[INC]], [[K]]
+; VEC-NEXT:    br i1 [[CMP]], label %[[FOR_END]], label %[[FOR_BODY]], {{!llvm.loop ![0-9]+}}
+; VEC:       [[FOR_END]]:
+; VEC-NEXT:    [[INC_2_LCSSA:%.*]] = phi i32 [ [[INC_2]], %[[FOR_BODY]] ], [ [[TMP2]], %[[MIDDLE_BLOCK]] ]
+; VEC-NEXT:    ret i32 [[INC_2_LCSSA]]
+;
+; INTERLEAVE-LABEL: define i32 @postinc_not_iv_backedge_value(
+; INTERLEAVE-SAME: i32 [[K:%.*]]) {
+; INTERLEAVE-NEXT:  [[ENTRY:.*]]:
+; INTERLEAVE-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[K]], 2
+; INTERLEAVE-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; INTERLEAVE:       [[VECTOR_PH]]:
+; INTERLEAVE-NEXT:    [[N_MOD_VF:%.*]] = urem i32 [[K]], 2
+; INTERLEAVE-NEXT:    [[N_VEC:%.*]] = sub i32 [[K]], [[N_MOD_VF]]
+; INTERLEAVE-NEXT:    br label %[[VECTOR_BODY:.*]]
+; INTERLEAVE:       [[VECTOR_BODY]]:
+; INTERLEAVE-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; INTERLEAVE-NEXT:    [[TMP0:%.*]] = add i32 [[INDEX]], 1
+; INTERLEAVE-NEXT:    [[TMP1:%.*]] = add i32 [[TMP0]], 2
+; INTERLEAVE-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
+; INTERLEAVE-NEXT:    [[TMP2:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; INTERLEAVE-NEXT:    br i1 [[TMP2]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], {{!llvm.loop ![0-9]+}}
+; INTERLEAVE:       [[MIDDLE_BLOCK]]:
+; INTERLEAVE-NEXT:    [[CMP_N:%.*]] = icmp eq i32 [[K]], [[N_VEC]]
+; INTERLEAVE-NEXT:    br i1 [[CMP_N]], label %[[FOR_END:.*]], label %[[SCALAR_PH]]
+; INTERLEAVE:       [[SCALAR_PH]]:
+; INTERLEAVE-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; INTERLEAVE-NEXT:    br label %[[FOR_BODY:.*]]
+; INTERLEAVE:       [[FOR_BODY]]:
+; INTERLEAVE-NEXT:    [[INC_PHI:%.*]] = phi i32 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INC:%.*]], %[[FOR_BODY]] ]
+; INTERLEAVE-NEXT:    [[INC]] = add nsw i32 [[INC_PHI]], 1
+; INTERLEAVE-NEXT:    [[INC_2:%.*]] = add i32 [[INC_PHI]], 2
+; INTERLEAVE-NEXT:    [[CMP:%.*]] = icmp eq i32 [[INC]], [[K]]
+; INTERLEAVE-NEXT:    br i1 [[CMP]], label %[[FOR_END]], label %[[FOR_BODY]], {{!llvm.loop ![0-9]+}}
+; INTERLEAVE:       [[FOR_END]]:
+; INTERLEAVE-NEXT:    [[INC_2_LCSSA:%.*]] = phi i32 [ [[INC_2]], %[[FOR_BODY]] ], [ [[TMP1]], %[[MIDDLE_BLOCK]] ]
+; INTERLEAVE-NEXT:    ret i32 [[INC_2_LCSSA]]
+;
+entry:
+  br label %for.body
+
+for.body:
+  %inc.phi = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+  %inc = add nsw i32 %inc.phi, 1
+  %inc.2 = add i32 %inc.phi, 2
+  %cmp = icmp eq i32 %inc, %k
+  br i1 %cmp, label %for.end, label %for.body
+
+for.end:
+  ret i32 %inc.2
+}
+
+define float @fp_postinc_use_fadd(float %init, ptr noalias nocapture %A, i64 %N, float %fpinc) {
+; VEC-LABEL: define float @fp_postinc_use_fadd(
+; VEC-SAME: float [[INIT:%.*]], ptr noalias nocapture [[A:%.*]], i64 [[N:%.*]], float [[FPINC:%.*]]) {
+; VEC-NEXT:  [[ENTRY:.*]]:
+; VEC-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 2
+; VEC-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; VEC:       [[VECTOR_PH]]:
+; VEC-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], 2
+; VEC-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; VEC-NEXT:    [[DOTCAST:%.*]] = sitofp i64 [[N_VEC]] to float
+; VEC-NEXT:    [[TMP0:%.*]] = fmul fast float [[FPINC]], [[DOTCAST]]
+; VEC-NEXT:    [[TMP1:%.*]] = fadd fast float [[INIT]], [[TMP0]]
+; VEC-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <2 x float> poison, float [[INIT]], i64 0
+; VEC-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <2 x float> [[DOTSPLATINSERT]], <2 x float> poison, <2 x i32> zeroinitializer
+; VEC-NEXT:    [[DOTSPLATINSERT1:%.*]] = insertelement <2 x float> poison, float [[FPINC]], i64 0
+; VEC-NEXT:    [[DOTSPLAT2:%.*]] = shufflevector <2 x float> [[DOTSPLATINSERT1]], <2 x float> poison, <2 x i32> zeroinitializer
+; VEC-NEXT:    [[TMP2:%.*]] = fmul fast <2 x float> <float 0.000000e+00, float 1.000000e+00>, [[DOTSPLAT2]]
+; VEC-NEXT:    [[INDUCTION:%.*]] = fadd fast <2 x float> [[DOTSPLAT]], [[TMP2]]
+; VEC-NEXT:    [[TMP3:%.*]] = fmul fast float [[FPINC]], 2.000000e+00
+; VEC-NEXT:    [[DOTSPLATINSERT3:%.*]] = insertelement <2 x float> poison, float [[TMP3]], i64 0
+; VEC-NEXT:    [[DOTSPLAT4:%.*]] = shufflevector <2 x float> [[DOTSPLATINSERT3]], <2 x float> poison, <2 x i32> zeroinitializer
+; VEC-NEXT:    br label %[[VECTOR_BODY:.*]]
+; VEC:       [[VECTOR_BODY]]:
+; VEC-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; VEC-NEXT:    [[VEC_IND:%.*]] = phi <2 x float> [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; VEC-NEXT:    [[TMP4:%.*]] = add i64 [[INDEX]], 0
+; VEC-NEXT:    [[TMP5:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP4]]
+; VEC-NEXT:    [[TMP6:%.*]] = getelementptr inbounds float, ptr [[TMP5]], i32 0
+; VEC-NEXT:    store <2 x float> [[VEC_IND]], ptr [[TMP6]], align 4
+; VEC-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; VEC-NEXT:    [[VEC_IND_NEXT]] = fadd fast <2 x float> [[VEC_IND]], [[DOTSPLAT4]]
+; VEC-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; VEC-NEXT:    br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], {{!llvm.loop ![0-9]+}}
+; VEC:       [[MIDDLE_BLOCK]]:
+; VEC-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; VEC-NEXT:    br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; VEC:       [[SCALAR_PH]]:
+; VEC-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; VEC-NEXT:    [[BC_RESUME_VAL5:%.*]] = phi float [ [[TMP1]], %[[MIDDLE_BLOCK]] ], [ [[INIT]], %[[ENTRY]] ]
+; VEC-NEXT:    br label %[[LOOP:.*]]
+; VEC:       [[LOOP]]:
+; VEC-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; VEC-NEXT:    [[FP_IV:%.*]] = phi float [ [[BC_RESUME_VAL5]], %[[SCALAR_PH]] ], [ [[ADD:%.*]], %[[LOOP]] ]
+; VEC-NEXT:    [[GEP_A:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]]
+; VEC-NEXT:    store float [[FP_IV]], ptr [[GEP_A]], align 4
+; VEC-NEXT:    [[ADD]] = fadd fast float [[FP_IV]], [[FPINC]]
+; VEC-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; VEC-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; VEC-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], {{!llvm.loop ![0-9]+}}
+; VEC:       [[EXIT]]:
+; VEC-NEXT:    [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], %[[LOOP]] ], [ [[TMP1]], %[[MIDDLE_BLOCK]] ]
+; VEC-NEXT:    ret float [[ADD_LCSSA]]
+;
+; INTERLEAVE-LABEL: define float @fp_postinc_use_fadd(
+; INTERLEAVE-SAME: float [[INIT:%.*]], ptr noalias nocapture [[A:%.*]], i64 [[N:%.*]], float [[FPINC:%.*]]) {
+; INTERLEAVE-NEXT:  [[ENTRY:.*]]:
+; INTERLEAVE-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 2
+; INTERLEAVE-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; INTERLEAVE:       [[VECTOR_PH]]:
+; INTERLEAVE-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], 2
+; INTERLEAVE-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; INTERLEAVE-NEXT:    [[DOTCAST:%.*]] = sitofp i64 [[N_VEC]] to float
+; INTERLEAVE-NEXT:    [[TMP0:%.*]] = fmul fast float [[FPINC]], [[DOTCAST]]
+; INTERLEAVE-NEXT:    [[TMP1:%.*]] = fadd fast float [[INIT]], [[TMP0]]
+; INTERLEAVE-NEXT:    br label %[[VECTOR_BODY:.*]]
+; INTERLEAVE:       [[VECTOR_BODY]]:
+; INTERLEAVE-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; INTERLEAVE-NEXT:    [[TMP2:%.*]] = add i64 [[INDEX]], 0
+; INTERLEAVE-NEXT:    [[TMP3:%.*]] = add i64 [[INDEX]], 1
+; INTERLEAVE-NEXT:    [[DOTCAST1:%.*]] = sitofp i64 [[INDEX]] to float
+; INTERLEAVE-NEXT:    [[TMP4:%.*]] = fmul fast float [[FPINC]], [[DOTCAST1]]
+; INTERLEAVE-NEXT:    [[OFFSET_IDX:%.*]] = fadd fast float [[INIT]], [[TMP4]]
+; INTERLEAVE-NEXT:    [[TMP5:%.*]] = fmul fast float 0.000000e+00, [[FPINC]]
+; INTERLEAVE-NEXT:    [[TMP6:%.*]] = fadd fast float [[OFFSET_IDX]], [[TMP5]]
+; INTERLEAVE-NEXT:    [[TMP7:%.*]] = fmul fast float 1.000000e+00, [[FPINC]]
+; INTERLEAVE-NEXT:    [[TMP8:%.*]] = fadd fast float [[OFFSET_IDX]], [[TMP7]]
+; INTERLEAVE-NEXT:    [[TMP9:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP2]]
+; INTERLEAVE-NEXT:    [[TMP10:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP3]]
+; INTERLEAVE-NEXT:    store float [[TMP6]], ptr [[TMP9]], align 4
+; INTERLEAVE-NEXT:    store float [[TMP8]], ptr [[TMP10]], align 4
+; INTERLEAVE-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; INTERLEAVE-NEXT:    [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; INTERLEAVE-NEXT:    br i1 [[TMP11]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], {{!llvm.loop ![0-9]+}}
+; INTERLEAVE:       [[MIDDLE_BLOCK]]:
+; INTERLEAVE-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; INTERLEAVE-NEXT:    br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; INTERLEAVE:       [[SCALAR_PH]]:
+; INTERLEAVE-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; INTERLEAVE-NEXT:    [[BC_RESUME_VAL2:%.*]] = phi float [ [[TMP1]], %[[MIDDLE_BLOCK]] ], [ [[INIT]], %[[ENTRY]] ]
+; INTERLEAVE-NEXT:    br label %[[LOOP:.*]]
+; INTERLEAVE:       [[LOOP]]:
+; INTERLEAVE-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; INTERLEAVE-NEXT:    [[FP_IV:%.*]] = phi float [ [[BC_RESUME_VAL2]], %[[SCALAR_PH]] ], [ [[ADD:%.*]], %[[LOOP]] ]
+; INTERLEAVE-NEXT:    [[GEP_A:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]]
+; INTERLEAVE-NEXT:    store float [[FP_IV]], ptr [[GEP_A]], align 4
+; INTERLEAVE-NEXT:    [[ADD]] = fadd fast float [[FP_IV]], [[FPINC]]
+; INTERLEAVE-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; INTERLEAVE-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; INTERLEAVE-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], {{!llvm.loop ![0-9]+}}
+; INTERLEAVE:       [[EXIT]]:
+; INTERLEAVE-NEXT:    [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], %[[LOOP]] ], [ [[TMP1]], %[[MIDDLE_BLOCK]] ]
+; INTERLEAVE-NEXT:    ret float [[ADD_LCSSA]]
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %fp.iv = phi float [ %init, %entry ], [ %add, %loop ]
+  %gep.A = getelementptr inbounds float, ptr %A, i64 %iv
+  store float %fp.iv, ptr %gep.A, align 4
+  %add = fadd fast float %fp.iv, %fpinc
+  %iv.next = add nuw nsw i64 %iv, 1
+  %ec = icmp eq i64 %iv.next, %N
+  br i1 %ec, label %exit, label %loop
+
+exit:
+  ret float %add
+}
+
+define float @fp_postinc_use_fsub(float %init, ptr noalias nocapture %A, i64 %N, float %fpinc) {
+; VEC-LABEL: define float @fp_postinc_use_fsub(
+; VEC-SAME: float [[INIT:%.*]], ptr noalias nocapture [[A:%.*]], i64 [[N:%.*]], float [[FPINC:%.*]]) {
+; VEC-NEXT:  [[ENTRY:.*]]:
+; VEC-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 2
+; VEC-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; VEC:       [[VECTOR_PH]]:
+; VEC-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], 2
+; VEC-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; VEC-NEXT:    [[DOTCAST:%.*]] = sitofp i64 [[N_VEC]] to float
+; VEC-NEXT:    [[TMP0:%.*]] = fmul fast float [[FPINC]], [[DOTCAST]]
+; VEC-NEXT:    [[TMP1:%.*]] = fsub fast float [[INIT]], [[TMP0]]
+; VEC-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <2 x float> poison, float [[INIT]], i64 0
+; VEC-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <2 x float> [[DOTSPLATINSERT]], <2 x float> poison, <2 x i32> zeroinitializer
+; VEC-NEXT:    [[DOTSPLATINSERT1:%.*]] = insertelement <2 x float> poison, float [[FPINC]], i64 0
+; VEC-NEXT:    [[DOTSPLAT2:%.*]] = shufflevector <2 x float> [[DOTSPLATINSERT1]], <2 x float> poison, <2 x i32> zeroinitializer
+; VEC-NEXT:    [[TMP2:%.*]] = fmul fast <2 x float> <float 0.000000e+00, float 1.000000e+00>, [[DOTSPLAT2]]
+; VEC-NEXT:    [[INDUCTION:%.*]] = fsub fast <2 x float> [[DOTSPLAT]], [[TMP2]]
+; VEC-NEXT:    [[TMP3:%.*]] = fmul fast float [[FPINC]], 2.000000e+00
+; VEC-NEXT:    [[DOTSPLATINSERT3:%.*]] = insertelement <2 x float> poison, float [[TMP3]], i64 0
+; VEC-NEXT:    [[DOTSPLAT4:%.*]] = shufflevector <2 x float> [[DOTSPLATINSERT3]], <2 x float> poison, <2 x i32> zeroinitializer
+; VEC-NEXT:    br label %[[VECTOR_BODY:.*]]
+; VEC:       [[VECTOR_BODY]]:
+; VEC-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; VEC-NEXT:    [[VEC_IND:%.*]] = phi <2 x float> [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; VEC-NEXT:    [[TMP4:%.*]] = add i64 [[INDEX]], 0
+; VEC-NEXT:    [[TMP5:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP4]]
+; VEC-NEXT:    [[TMP6:%.*]] = getelementptr inbounds float, ptr [[TMP5]], i32 0
+; VEC-NEXT:    store <2 x float> [[VEC_IND]], ptr [[TMP6]], align 4
+; VEC-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; VEC-NEXT:    [[VEC_IND_NEXT]] = fsub fast <2 x float> [[VEC_IND]], [[DOTSPLAT4]]
+; VEC-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; VEC-NEXT:    br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], {{!llvm.loop ![0-9]+}}
+; VEC:       [[MIDDLE_BLOCK]]:
+; VEC-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; VEC-NEXT:    br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; VEC:       [[SCALAR_PH]]:
+; VEC-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; VEC-NEXT:    [[BC_RESUME_VAL5:%.*]] = phi float [ [[TMP1]], %[[MIDDLE_BLOCK]] ], [ [[INIT]], %[[ENTRY]] ]
+; VEC-NEXT:    br label %[[LOOP:.*]]
+; VEC:       [[LOOP]]:
+; VEC-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; VEC-NEXT:    [[FP_IV:%.*]] = phi float [ [[BC_RESUME_VAL5]], %[[SCALAR_PH]] ], [ [[ADD:%.*]], %[[LOOP]] ]
+; VEC-NEXT:    [[GEP_A:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]]
+; VEC-NEXT:    store float [[FP_IV]], ptr [[GEP_A]], align 4
+; VEC-NEXT:    [[ADD]] = fsub fast float [[FP_IV]], [[FPINC]]
+; VEC-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; VEC-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; VEC-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], {{!llvm.loop ![0-9]+}}
+; VEC:       [[EXIT]]:
+; VEC-NEXT:    [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], %[[LOOP]] ], [ [[TMP1]], %[[MIDDLE_BLOCK]] ]
+; VEC-NEXT:    ret float [[ADD_LCSSA]]
+;
+; INTERLEAVE-LABEL: define float @fp_postinc_use_fsub(
+; INTERLEAVE-SAME: float [[INIT:%.*]], ptr noalias nocapture [[A:%.*]], i64 [[N:%.*]], float [[FPINC:%.*]]) {
+; INTERLEAVE-NEXT:  [[ENTRY:.*]]:
+; INTERLEAVE-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 2
+; INTERLEAVE-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; INTERLEAVE:       [[VECTOR_PH]]:
+; INTERLEAVE-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], 2
+; INTERLEAVE-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; INTERLEAVE-NEXT:    [[DOTCAST:%.*]] = sitofp i64 [[N_VEC]] to float
+; INTERLEAVE-NEXT:    [[TMP0:%.*]] = fmul fast float [[FPINC]], [[DOTCAST]]
+; INTERLEAVE-NEXT:    [[TMP1:%.*]] = fsub fast float [[INIT]], [[TMP0]]
+; INTERLEAVE-NEXT:    br label %[[VECTOR_BODY:.*]]
+; INTERLEAVE:       [[VECTOR_BODY]]:
+; INTERLEAVE-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; INTERLEAVE-NEXT:    [[TMP2:%.*]] = add i64 [[INDEX]], 0
+; INTERLEAVE-NEXT:    [[TMP3:%.*]] = add i64 [[INDEX]], 1
+; INTERLEAVE-NEXT:    [[DOTCAST1:%.*]] = sitofp i64 [[INDEX]] to float
+; INTERLEAVE-NEXT:    [[TMP4:%.*]] = fmul fast float [[FPINC]], [[DOTCAST1]]
+; INTERLEAVE-NEXT:    [[OFFSET_IDX:%.*]] = fsub fast float [[INIT]], [[TMP4]]
+; INTERLEAVE-NEXT:    [[TMP5:%.*]] = fmul fast float 0.000000e+00, [[FPINC]]
+; INTERLEAVE-NEXT:    [[TMP6:%.*]] = fsub fast float [[OFFSET_IDX]], [[TMP5]]
+; INTERLEAVE-NEXT:    [[TMP7:%.*]] = fmul fast float 1.000000e+00, [[FPINC]]
+; INTERLEAVE-NEXT:    [[TMP8:%.*]] = fsub fast float [[OFFSET_IDX]], [[TMP7]]
+; INTERLEAVE-NEXT:    [[TMP9:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP2]]
+; INTERLEAVE-NEXT:    [[TMP10:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP3]]
+; INTERLEAVE-NEXT:    store float [[TMP6]], ptr [[TMP9]], align 4
+; INTERLEAVE-NEXT:    store float [[TMP8]], ptr [[TMP10]], align 4
+; INTERLEAVE-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; INTERLEAVE-NEXT:    [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; INTERLEAVE-NEXT:    br i1 [[TMP11]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], {{!llvm.loop ![0-9]+}}
+; INTERLEAVE:       [[MIDDLE_BLOCK]]:
+; INTERLEAVE-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; INTERLEAVE-NEXT:    br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; INTERLEAVE:       [[SCALAR_PH]]:
+; INTERLEAVE-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; INTERLEAVE-NEXT:    [[BC_RESUME_VAL2:%.*]] = phi float [ [[TMP1]], %[[MIDDLE_BLOCK]] ], [ [[INIT]], %[[ENTRY]] ]
+; INTERLEAVE-NEXT:    br label %[[LOOP:.*]]
+; INTERLEAVE:       [[LOOP]]:
+; INTERLEAVE-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; INTERLEAVE-NEXT:    [[FP_IV:%.*]] = phi float [ [[BC_RESUME_VAL2]], %[[SCALAR_PH]] ], [ [[ADD:%.*]], %[[LOOP]] ]
+; INTERLEAVE-NEXT:    [[GEP_A:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]]
+; INTERLEAVE-NEXT:    store float [[FP_IV]], ptr [[GEP_A]], align 4
+; INTERLEAVE-NEXT:    [[ADD]] = fsub fast float [[FP_IV]], [[FPINC]]
+; INTERLEAVE-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; INTERLEAVE-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; INTERLEAVE-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], {{!llvm.loop ![0-9]+}}
+; INTERLEAVE:       [[EXIT]]:
+; INTERLEAVE-NEXT:    [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], %[[LOOP]] ], [ [[TMP1]], %[[MIDDLE_BLOCK]] ]
+; INTERLEAVE-NEXT:    ret float [[ADD_LCSSA]]
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %fp.iv = phi float [ %init, %entry ], [ %add, %loop ]
+  %gep.A = getelementptr inbounds float, ptr %A, i64 %iv
+  store float %fp.iv, ptr %gep.A, align 4
+  %add = fsub fast float %fp.iv, %fpinc
+  %iv.next = add nuw nsw i64 %iv, 1
+  %ec = icmp eq i64 %iv.next, %N
+  br i1 %ec, label %exit, label %loop
+
+exit:
+  ret float %add
+}
+
+; Test case for https://github.com/llvm/llvm-project/issues/121745.
+define i32 @test_iv_uniform_with_outside_use_scev_simplification(ptr %dst) {
+; VEC-LABEL: define i32 @test_iv_uniform_with_outside_use_scev_simplification(
+; VEC-SAME: ptr [[DST:%.*]]) {
+; VEC-NEXT:  [[ENTRY:.*]]:
+; VEC-NEXT:    [[STEP_1:%.*]] = sext i8 0 to i32
+; VEC-NEXT:    [[STEP_2:%.*]] = add nsw i32 [[STEP_1]], 1
+; VEC-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; VEC:       [[VECTOR_PH]]:
+; VEC-NEXT:    br label %[[VECTOR_BODY:.*]]
+; VEC:       [[VECTOR_BODY]]:
+; VEC-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; VEC-NEXT:    [[TMP0:%.*]] = add i32 [[INDEX]], 0
+; VEC-NEXT:    [[TMP6:%.*]] = add i32 [[INDEX]], 1
+; VEC-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i16, ptr [[DST]], i32 [[TMP0]]
+; VEC-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i16, ptr [[TMP1]], i32 0
+; VEC-NEXT:    store <2 x i16> zeroinitializer, ptr [[TMP2]], align 2
+; VEC-NEXT:    [[TMP4:%.*]] = add i32 [[STEP_2]], [[TMP0]]
+; VEC-NEXT:    [[TMP5:%.*]] = add i32 [[STEP_2]], [[TMP6]]
+; VEC-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
+; VEC-NEXT:    [[TMP3:%.*]] = icmp eq i32 [[INDEX_NEXT]], 8
+; VEC-NEXT:    br i1 [[TMP3]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], {{!llvm.loop ![0-9]+}}
+; VEC:       [[MIDDLE_BLOCK]]:
+; VEC-NEXT:    br i1 true, label %[[E_EXIT:.*]], label %[[SCALAR_PH]]
+; VEC:       [[SCALAR_PH]]:
+; VEC-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ 8, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; VEC-NEXT:    br label %[[LOOP:.*]]
+; VEC:       [[LOOP]]:
+; VEC-NEXT:    [[IV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; VEC-NEXT:    [[GEP_DST:%.*]] = getelementptr inbounds i16, ptr [[DST]], i32 [[IV]]
+; VEC-NEXT:    store i16 0, ptr [[GEP_DST]], align 2
+; VEC-NEXT:    [[IV_NEXT]] = add i32 [[STEP_2]], [[IV]]
+; VEC-NEXT:    [[CMP_I:%.*]] = icmp slt i32 [[IV_NEXT]], 8
+; VEC-NEXT:    br i1 [[CMP_I]], label %[[LOOP]], label %[[E_EXIT]], {{!llvm.loop ![0-9]+}}
+; VEC:       [[E_EXIT]]:
+; VEC-NEXT:    [[RES:%.*]] = phi i32 [ [[IV_NEXT]], %[[LOOP]] ], [ [[TMP5]], %[[MIDDLE_BLOCK]] ]
+; VEC-NEXT:    ret i32 [[RES]]
+;
+; INTERLEAVE-LABEL: define i32 @test_iv_uniform_with_outside_use_scev_simplification(
+; INTERLEAVE-SAME: ptr [[DST:%.*]]) {
+; INTERLEAVE-NEXT:  [[ENTRY:.*]]:
+; INTERLEAVE-NEXT:    [[STEP_1:%.*]] = sext i8 0 to i32
+; INTERLEAVE-NEXT:    [[STEP_2:%.*]] = add nsw i32 [[STEP_1]], 1
+; INTERLEAVE-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; INTERLEAVE:       [[VECTOR_PH]]:
+; INTERLEAVE-NEXT:    br label %[[VECTOR_BODY:.*]]
+; INTERLEAVE:       [[VECTOR_BODY]]:
+; INTERLEAVE-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; INTERLEAVE-NEXT:    [[TMP0:%.*]] = add i32 [[INDEX]], 0
+; INTERLEAVE-NEXT:    [[TMP1:%.*]] = add i32 [[INDEX]], 1
+; INTERLEAVE-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i16, ptr [[DST]], i32 [[TMP0]]
+; INTERLEAVE-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i16, ptr [[DST]], i32 [[TMP1]]
+; INTERLEAVE-NEXT:    store i16 0, ptr [[TMP2]], align 2
+; INTERLEAVE-NEXT:    store i16 0, ptr [[TMP3]], align 2
+; INTERLEAVE-NEXT:    [[TMP5:%.*]] = add i32 [[STEP_2]], [[TMP1]]
+; INTERLEAVE-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
+; INTERLEAVE-NEXT:    [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], 8
+; INTERLEAVE-NEXT:    br i1 [[TMP4]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], {{!llvm.loop ![0-9]+}}
+; INTERLEAVE:       [[MIDDLE_BLOCK]]:
+; INTERLEAVE-NEXT:    br i1 true, label %[[E_EXIT:.*]], label %[[SCALAR_PH]]
+; INTERLEAVE:       [[SCALAR_PH]]:
+; INTERLEAVE-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ 8, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; INTERLEAVE-NEXT:    br label %[[LOOP:.*]]
+; INTERLEAVE:       [[LOOP]]:
+; INTERLEAVE-NEXT:    [[IV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; INTERLEAVE-NEXT:    [[GEP_DST:%.*]] = getelementptr inbounds i16, ptr [[DST]], i32 [[IV]]
+; INTERLEAVE-NEXT:    store i16 0, ptr [[GEP_DST]], align 2
+; INTERLEAVE-NEXT:    [[IV_NEXT]] = add i32 [[STEP_2]], [[IV]]
+; INTERLEAVE-NEXT:    [[CMP_I:%.*]] = icmp slt i32 [[IV_NEXT]], 8
+; INTERLEAVE-NEXT:    br i1 [[CMP_I]], label %[[LOOP]], label %[[E_EXIT]], {{!llvm.loop ![0-9]+}}
+; INTERLEAVE:       [[E_EXIT]]:
+; INTERLEAVE-NEXT:    [[RES:%.*]] = phi i32 [ [[IV_NEXT]], %[[LOOP]] ], [ [[TMP5]], %[[MIDDLE_BLOCK]] ]
+; INTERLEAVE-NEXT:    ret i32 [[RES]]
+;
+entry:
+  %step.1 = sext i8 0 to i32
+  %step.2 = add nsw i32 %step.1, 1
+  br label %loop
+
+loop:
+  %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ]
+  %gep.dst = getelementptr inbounds i16, ptr %dst, i32 %iv
+  store i16 0, ptr %gep.dst, align 2
+  %iv.next = add i32 %step.2, %iv
+  %cmp.i = icmp slt i32 %iv.next, 8
+  br i1 %cmp.i, label %loop, label %e.exit
+
+e.exit:
+  %res = phi i32 [ %iv.next, %loop ]
+  ret i32 %res
+}
+
+define i32 @test_iv_uniform_with_outside_use_scev_simplification_2(ptr %dst) {
+; VEC-LABEL: define i32 @test_iv_uniform_with_outside_use_scev_simplification_2(
+; VEC-SAME: ptr [[DST:%.*]]) {
+; VEC-NEXT:  [[ENTRY:.*]]:
+; VEC-NEXT:    [[STEP_1:%.*]] = sext i8 0 to i32
+; VEC-NEXT:    [[STEP_2:%.*]] = add nsw i32 [[STEP_1]], 1
+; VEC-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; VEC:       [[VECTOR_PH]]:
+; VEC-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[STEP_2]], i64 0
+; VEC-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i32> [[BROADCAST_SPLATINSERT]], <2 x i32> poison, <2 x i32> zeroinitializer
+; VEC-NEXT:    br label %[[VECTOR_BODY:.*]]
+; VEC:       [[VECTOR_BODY]]:
+; VEC-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; VEC-NEXT:    [[VEC_IND:%.*]] = phi <2 x i32> [ <i32 0, i32 2>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; VEC-NEXT:    [[OFFSET_IDX:%.*]] = mul i32 [[INDEX]], 2
+; VEC-NEXT:    [[TMP0:%.*]] = add i32 [[OFFSET_IDX]], 0
+; VEC-NEXT:    [[TMP1:%.*]] = add i32 [[OFFSET_IDX]], 2
+; VEC-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i16, ptr [[DST]], i32 [[TMP0]]
+; VEC-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i16, ptr [[DST]], i32 [[TMP1]]
+; VEC-NEXT:    store i16 0, ptr [[TMP2]], align 2
+; VEC-NEXT:    store i16 0, ptr [[TMP3]], align 2
+; VEC-NEXT:    [[TMP4:%.*]] = add <2 x i32> [[VEC_IND]], splat (i32 1)
+; VEC-NEXT:    [[TMP5:%.*]] = add <2 x i32> [[BROADCAST_SPLAT]], [[TMP4]]
+; VEC-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
+; VEC-NEXT:    [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], splat (i32 4)
+; VEC-NEXT:    [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], 4
+; VEC-NEXT:    br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], {{!llvm.loop ![0-9]+}}
+; VEC:       [[MIDDLE_BLOCK]]:
+; VEC-NEXT:    [[TMP7:%.*]] = extractelement <2 x i32> [[TMP5]], i32 1
+; VEC-NEXT:    br i1 true, label %[[E_EXIT:.*]], label %[[SCALAR_PH]]
+; VEC:       [[SCALAR_PH]]:
+; VEC-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ 8, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; VEC-NEXT:    br label %[[LOOP:.*]]
+; VEC:       [[LOOP]]:
+; VEC-NEXT:    [[IV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; VEC-NEXT:    [[GEP_DST:%.*]] = getelementptr inbounds i16, ptr [[DST]], i32 [[IV]]
+; VEC-NEXT:    store i16 0, ptr [[GEP_DST]], align 2
+; VEC-NEXT:    [[INC:%.*]] = add i32 [[IV]], 1
+; VEC-NEXT:    [[IV_NEXT]] = add i32 [[STEP_2]], [[INC]]
+; VEC-NEXT:    [[CMP_I:%.*]] = icmp slt i32 [[IV_NEXT]], 8
+; VEC-NEXT:    br i1 [[CMP_I]], label %[[LOOP]], label %[[E_EXIT]], {{!llvm.loop ![0-9]+}}
+; VEC:       [[E_EXIT]]:
+; VEC-NEXT:    [[RES:%.*]] = phi i32 [ [[IV_NEXT]], %[[LOOP]] ], [ [[TMP7]], %[[MIDDLE_BLOCK]] ]
+; VEC-NEXT:    ret i32 [[RES]]
+;
+; INTERLEAVE-LABEL: define i32 @test_iv_uniform_with_outside_use_scev_simplification_2(
+; INTERLEAVE-SAME: ptr [[DST:%.*]]) {
+; INTERLEAVE-NEXT:  [[ENTRY:.*]]:
+; INTERLEAVE-NEXT:    [[STEP_1:%.*]] = sext i8 0 to i32
+; INTERLEAVE-NEXT:    [[STEP_2:%.*]] = add nsw i32 [[STEP_1]], 1
+; INTERLEAVE-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; INTERLEAVE:       [[VECTOR_PH]]:
+; INTERLEAVE-NEXT:    br label %[[VECTOR_BODY:.*]]
+; INTERLEAVE:       [[VECTOR_BODY]]:
+; INTERLEAVE-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; INTERLEAVE-NEXT:    [[OFFSET_IDX:%.*]] = mul i32 [[INDEX]], 2
+; INTERLEAVE-NEXT:    [[TMP0:%.*]] = add i32 [[OFFSET_IDX]], 0
+; INTERLEAVE-NEXT:    [[TMP1:%.*]] = add i32 [[OFFSET_IDX]], 2
+; INTERLEAVE-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i16, ptr [[DST]], i32 [[TMP0]]
+; INTERLEAVE-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i16, ptr [[DST]], i32 [[TMP1]]
+; INTERLEAVE-NEXT:    store i16 0, ptr [[TMP2]], align 2
+; INTERLEAVE-NEXT:    store i16 0, ptr [[TMP3]], align 2
+; INTERLEAVE-NEXT:    [[TMP4:%.*]] = add i32 [[TMP1]], 1
+; INTERLEAVE-NEXT:    [[TMP5:%.*]] = add i32 [[STEP_2]], [[TMP4]]
+; INTERLEAVE-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
+; INTERLEAVE-NEXT:    [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], 4
+; INTERLEAVE-NEXT:    br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], {{!llvm.loop ![0-9]+}}
+; INTERLEAVE:       [[MIDDLE_BLOCK]]:
+; INTERLEAVE-NEXT:    br i1 true, label %[[E_EXIT:.*]], label %[[SCALAR_PH]]
+; INTERLEAVE:       [[SCALAR_PH]]:
+; INTERLEAVE-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ 8, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; INTERLEAVE-NEXT:    br label %[[LOOP:.*]]
+; INTERLEAVE:       [[LOOP]]:
+; INTERLEAVE-NEXT:    [[IV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; INTERLEAVE-NEXT:    [[GEP_DST:%.*]] = getelementptr inbounds i16, ptr [[DST]], i32 [[IV]]
+; INTERLEAVE-NEXT:    store i16 0, ptr [[GEP_DST]], align 2
+; INTERLEAVE-NEXT:    [[INC:%.*]] = add i32 [[IV]], 1
+; INTERLEAVE-NEXT:    [[IV_NEXT]] = add i32 [[STEP_2]], [[INC]]
+; INTERLEAVE-NEXT:    [[CMP_I:%.*]] = icmp slt i32 [[IV_NEXT]], 8
+; INTERLEAVE-NEXT:    br i1 [[CMP_I]], label %[[LOOP]], label %[[E_EXIT]], {{!llvm.loop ![0-9]+}}
+; INTERLEAVE:       [[E_EXIT]]:
+; INTERLEAVE-NEXT:    [[RES:%.*]] = phi i32 [ [[IV_NEXT]], %[[LOOP]] ], [ [[TMP5]], %[[MIDDLE_BLOCK]] ]
+; INTERLEAVE-NEXT:    ret i32 [[RES]]
+;
+entry:
+  %step.1 = sext i8 0 to i32
+  %step.2 = add nsw i32 %step.1, 1
+  br label %loop
+
+loop:
+  %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ]
+  %gep.dst = getelementptr inbounds i16, ptr %dst, i32 %iv
+  store i16 0, ptr %gep.dst, align 2
+  %inc = add i32 %iv, 1
+  %iv.next = add i32 %step.2, %inc
+  %cmp.i = icmp slt i32 %iv.next, 8
+  br i1 %cmp.i, label %loop, label %e.exit
+
+e.exit:
+  %res = phi i32 [ %iv.next, %loop ]
+  ret i32 %res
+}
diff --git a/llvm/test/Transforms/LoopVectorize/optimal-epilog-vectorization.ll b/llvm/test/Transforms/LoopVectorize/optimal-epilog-vectorization.ll
index 70199fa..1bfb341 100644
--- a/llvm/test/Transforms/LoopVectorize/optimal-epilog-vectorization.ll
+++ b/llvm/test/Transforms/LoopVectorize/optimal-epilog-vectorization.ll
@@ -500,8 +500,8 @@ define void @induction_resume_value_requires_non_trivial_scev_expansion(ptr %dst
 ; CHECK-NEXT:    [[IND_END5:%.*]] = mul i8 84, [[INDUCTION_IV]]
 ; CHECK-NEXT:    br i1 true, label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
 ; CHECK:       vec.epilog.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i8 [ [[IND_END]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
 ; CHECK-NEXT:    [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ 84, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i8 [ [[IND_END]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
 ; CHECK-NEXT:    [[IND_END4:%.*]] = mul i8 84, [[INDUCTION_IV]]
 ; CHECK-NEXT:    [[DOTSPLATINSERT8:%.*]] = insertelement <4 x i8> poison, i8 [[BC_RESUME_VAL]], i64 0
 ; CHECK-NEXT:    [[DOTSPLAT9:%.*]] = shufflevector <4 x i8> [[DOTSPLATINSERT8]], <4 x i8> poison, <4 x i32> zeroinitializer
@@ -590,8 +590,8 @@ define void @induction_resume_value_requires_non_trivial_scev_expansion(ptr %dst
 ; CHECK-PROFITABLE-BY-DEFAULT-NEXT:    [[IND_END5:%.*]] = mul i8 84, [[INDUCTION_IV]]
 ; CHECK-PROFITABLE-BY-DEFAULT-NEXT:    br i1 true, label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
 ; CHECK-PROFITABLE-BY-DEFAULT:       vec.epilog.ph:
-; CHECK-PROFITABLE-BY-DEFAULT-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i8 [ [[IND_END]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
 ; CHECK-PROFITABLE-BY-DEFAULT-NEXT:    [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ 84, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
+; CHECK-PROFITABLE-BY-DEFAULT-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i8 [ [[IND_END]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
 ; CHECK-PROFITABLE-BY-DEFAULT-NEXT:    [[IND_END4:%.*]] = mul i8 84, [[INDUCTION_IV]]
 ; CHECK-PROFITABLE-BY-DEFAULT-NEXT:    [[DOTSPLATINSERT8:%.*]] = insertelement <2 x i8> poison, i8 [[BC_RESUME_VAL]], i64 0
 ; CHECK-PROFITABLE-BY-DEFAULT-NEXT:    [[DOTSPLAT9:%.*]] = shufflevector <2 x i8> [[DOTSPLATINSERT8]], <2 x i8> poison, <2 x i32> zeroinitializer
diff --git a/llvm/test/Transforms/LoopVectorize/pr37248.ll b/llvm/test/Transforms/LoopVectorize/pr37248.ll
index ed7762f..fe660a8 100644
--- a/llvm/test/Transforms/LoopVectorize/pr37248.ll
+++ b/llvm/test/Transforms/LoopVectorize/pr37248.ll
@@ -41,26 +41,26 @@ define void @f1(ptr noalias %b, i1 %c, i32 %start) {
 ; CHECK-NEXT:    [[IND_END:%.*]] = sub i32 [[START]], [[N_VEC]]
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i1> poison, i1 [[C]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i1> [[BROADCAST_SPLATINSERT]], <2 x i1> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP10:%.*]] = xor <2 x i1> [[BROADCAST_SPLAT]], splat (i1 true)
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE3:%.*]] ]
 ; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = sub i32 [[START]], [[INDEX]]
-; CHECK-NEXT:    [[TMP10:%.*]] = trunc i32 [[OFFSET_IDX]] to i16
-; CHECK-NEXT:    [[TMP11:%.*]] = add i16 [[TMP10]], 0
-; CHECK-NEXT:    [[TMP12:%.*]] = xor <2 x i1> [[BROADCAST_SPLAT]], splat (i1 true)
-; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <2 x i1> [[TMP12]], i32 0
+; CHECK-NEXT:    [[TMP11:%.*]] = trunc i32 [[OFFSET_IDX]] to i16
+; CHECK-NEXT:    [[TMP12:%.*]] = add i16 [[TMP11]], 0
+; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <2 x i1> [[TMP10]], i32 0
 ; CHECK-NEXT:    br i1 [[TMP13]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
 ; CHECK:       pred.store.if:
 ; CHECK-NEXT:    store i32 10, ptr [[B]], align 1
 ; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE]]
 ; CHECK:       pred.store.continue:
-; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <2 x i1> [[TMP12]], i32 1
+; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <2 x i1> [[TMP10]], i32 1
 ; CHECK-NEXT:    br i1 [[TMP14]], label [[PRED_STORE_IF2:%.*]], label [[PRED_STORE_CONTINUE3]]
 ; CHECK:       pred.store.if2:
 ; CHECK-NEXT:    store i32 10, ptr [[B]], align 1
 ; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE3]]
 ; CHECK:       pred.store.continue3:
-; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [2 x i16], ptr @a, i16 0, i16 [[TMP11]]
+; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [2 x i16], ptr @a, i16 0, i16 [[TMP12]]
 ; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i16, ptr [[TMP15]], i32 0
 ; CHECK-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i16, ptr [[TMP16]], i32 -1
 ; CHECK-NEXT:    store <2 x i16> zeroinitializer, ptr [[TMP17]], align 1
diff --git a/llvm/test/Transforms/LoopVectorize/pr55167-fold-tail-live-out.ll b/llvm/test/Transforms/LoopVectorize/pr55167-fold-tail-live-out.ll
index 4f47e66..a129a4b 100644
--- a/llvm/test/Transforms/LoopVectorize/pr55167-fold-tail-live-out.ll
+++ b/llvm/test/Transforms/LoopVectorize/pr55167-fold-tail-live-out.ll
@@ -6,25 +6,25 @@ define i32 @test(i32 %a, i1 %c.1, i1 %c.2 ) #0 {
 ; CHECK-NEXT:  bb:
 ; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[A:%.*]], i64 0
-; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i32> [[BROADCAST_SPLATINSERT]], <2 x i32> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP2:%.*]] = xor <2 x i32> [[BROADCAST_SPLAT]], splat (i32 1)
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <2 x i1> poison, i1 [[C_1:%.*]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <2 x i1> [[BROADCAST_SPLATINSERT1]], <2 x i1> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP4:%.*]] = xor <2 x i1> [[BROADCAST_SPLAT2]], splat (i1 true)
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <2 x i1> poison, i1 [[C_2:%.*]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT4:%.*]] = shufflevector <2 x i1> [[BROADCAST_SPLATINSERT3]], <2 x i1> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP6:%.*]] = xor <2 x i1> [[BROADCAST_SPLAT4]], splat (i1 true)
+; CHECK-NEXT:    [[TMP7:%.*]] = select <2 x i1> [[TMP4]], <2 x i1> [[TMP6]], <2 x i1> zeroinitializer
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT4:%.*]] = insertelement <2 x i32> poison, i32 [[A:%.*]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT5:%.*]] = shufflevector <2 x i32> [[BROADCAST_SPLATINSERT4]], <2 x i32> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP2:%.*]] = xor <2 x i32> [[BROADCAST_SPLAT5]], splat (i32 1)
+; CHECK-NEXT:    [[TMP5:%.*]] = select <2 x i1> [[TMP4]], <2 x i1> [[BROADCAST_SPLAT4]], <2 x i1> zeroinitializer
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <2 x i32> [ <i32 6, i32 7>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <2 x i32> [ <i32 35902, i32 0>, [[VECTOR_PH]] ], [ [[PREDPHI7:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP4:%.*]] = xor <2 x i1> [[BROADCAST_SPLAT2]], splat (i1 true)
 ; CHECK-NEXT:    [[TMP0:%.*]] = add <2 x i32> [[VEC_PHI]], splat (i32 10)
-; CHECK-NEXT:    [[TMP6:%.*]] = xor <2 x i1> [[BROADCAST_SPLAT4]], splat (i1 true)
-; CHECK-NEXT:    [[TMP7:%.*]] = select <2 x i1> [[TMP4]], <2 x i1> [[TMP6]], <2 x i1> zeroinitializer
 ; CHECK-NEXT:    [[TMP1:%.*]] = add <2 x i32> [[TMP0]], splat (i32 20)
 ; CHECK-NEXT:    [[TMP3:%.*]] = add <2 x i32> [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    [[TMP5:%.*]] = select <2 x i1> [[TMP4]], <2 x i1> [[BROADCAST_SPLAT4]], <2 x i1> zeroinitializer
 ; CHECK-NEXT:    [[PREDPHI:%.*]] = select <2 x i1> [[TMP5]], <2 x i32> splat (i32 9), <2 x i32> [[VEC_IND]]
 ; CHECK-NEXT:    [[PREDPHI5:%.*]] = select <2 x i1> [[TMP7]], <2 x i32> splat (i32 9), <2 x i32> [[PREDPHI]]
 ; CHECK-NEXT:    [[PREDPHI6:%.*]] = select <2 x i1> [[TMP5]], <2 x i32> [[TMP0]], <2 x i32> [[VEC_PHI]]
diff --git a/llvm/test/Transforms/LoopVectorize/pr59319-loop-access-info-invalidation.ll b/llvm/test/Transforms/LoopVectorize/pr59319-loop-access-info-invalidation.ll
index 3dafe827..a4b229d 100644
--- a/llvm/test/Transforms/LoopVectorize/pr59319-loop-access-info-invalidation.ll
+++ b/llvm/test/Transforms/LoopVectorize/pr59319-loop-access-info-invalidation.ll
@@ -52,13 +52,13 @@ define void @reduced(ptr %0, ptr %1, i64 %iv, ptr %2, i64 %iv76, i64 %iv93) {
 ; CHECK:       vector.ph7:
 ; CHECK-NEXT:    [[N_MOD_VF8:%.*]] = urem i64 [[TMP3]], 4
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP3]], [[N_MOD_VF8]]
-; CHECK-NEXT:    br label [[VECTOR_BODY11:%.*]]
-; CHECK:       vector.body9:
-; CHECK-NEXT:    [[INDEX12:%.*]] = phi i64 [ 0, [[VECTOR_PH7]] ], [ [[INDEX_NEXT13:%.*]], [[VECTOR_BODY11]] ]
+; CHECK-NEXT:    br label [[VECTOR_BODY10:%.*]]
+; CHECK:       vector.body10:
+; CHECK-NEXT:    [[INDEX12:%.*]] = phi i64 [ 0, [[VECTOR_PH7]] ], [ [[INDEX_NEXT13:%.*]], [[VECTOR_BODY10]] ]
 ; CHECK-NEXT:    store i32 0, ptr [[TMP1]], align 4, !alias.scope !4, !noalias !7
 ; CHECK-NEXT:    [[INDEX_NEXT13]] = add nuw i64 [[INDEX12]], 4
 ; CHECK-NEXT:    [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT13]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP11]], label [[MIDDLE_BLOCK4:%.*]], label [[VECTOR_BODY11]], !llvm.loop [[LOOP9:![0-9]+]]
+; CHECK-NEXT:    br i1 [[TMP11]], label [[MIDDLE_BLOCK4:%.*]], label [[VECTOR_BODY10]], !llvm.loop [[LOOP9:![0-9]+]]
 ; CHECK:       middle.block4:
 ; CHECK-NEXT:    [[CMP_N10:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N10]], label [[LOOP_3_LR_PH:%.*]], label [[SCALAR_PH5]]
@@ -69,8 +69,8 @@ define void @reduced(ptr %0, ptr %1, i64 %iv, ptr %2, i64 %iv76, i64 %iv93) {
 ; CHECK-NEXT:    [[IDXPROM_I_I61:%.*]] = and i64 [[IV761_LCSSA]], 1
 ; CHECK-NEXT:    [[ARRAYIDX_I_I62:%.*]] = getelementptr i32, ptr [[TMP0]], i64 [[IDXPROM_I_I61]]
 ; CHECK-NEXT:    [[MIN_ITERS_CHECK22:%.*]] = icmp ult i64 [[TMP3]], 4
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK22]], label [[SCALAR_PH21:%.*]], label [[VECTOR_MEMCHECK14:%.*]]
-; CHECK:       vector.memcheck14:
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK22]], label [[SCALAR_PH22:%.*]], label [[VECTOR_MEMCHECK15:%.*]]
+; CHECK:       vector.memcheck15:
 ; CHECK-NEXT:    [[SCEVGEP15:%.*]] = getelementptr i8, ptr [[TMP1]], i64 4
 ; CHECK-NEXT:    [[TMP12:%.*]] = shl nuw nsw i64 [[IDXPROM_I_I61]], 2
 ; CHECK-NEXT:    [[TMP13:%.*]] = add nuw nsw i64 [[TMP12]], 4
@@ -78,22 +78,22 @@ define void @reduced(ptr %0, ptr %1, i64 %iv, ptr %2, i64 %iv76, i64 %iv93) {
 ; CHECK-NEXT:    [[BOUND017:%.*]] = icmp ult ptr [[TMP1]], [[SCEVGEP16]]
 ; CHECK-NEXT:    [[BOUND118:%.*]] = icmp ult ptr [[ARRAYIDX_I_I62]], [[SCEVGEP15]]
 ; CHECK-NEXT:    [[FOUND_CONFLICT19:%.*]] = and i1 [[BOUND017]], [[BOUND118]]
-; CHECK-NEXT:    br i1 [[FOUND_CONFLICT19]], label [[SCALAR_PH21]], label [[VECTOR_PH23:%.*]]
-; CHECK:       vector.ph23:
+; CHECK-NEXT:    br i1 [[FOUND_CONFLICT19]], label [[SCALAR_PH22]], label [[VECTOR_PH24:%.*]]
+; CHECK:       vector.ph24:
 ; CHECK-NEXT:    [[N_MOD_VF24:%.*]] = urem i64 [[TMP3]], 4
 ; CHECK-NEXT:    [[N_VEC25:%.*]] = sub i64 [[TMP3]], [[N_MOD_VF24]]
-; CHECK-NEXT:    br label [[VECTOR_BODY26:%.*]]
-; CHECK:       vector.body26:
-; CHECK-NEXT:    [[INDEX29:%.*]] = phi i64 [ 0, [[VECTOR_PH23]] ], [ [[INDEX_NEXT29:%.*]], [[VECTOR_BODY26]] ]
+; CHECK-NEXT:    br label [[VECTOR_BODY27:%.*]]
+; CHECK:       vector.body27:
+; CHECK-NEXT:    [[INDEX29:%.*]] = phi i64 [ 0, [[VECTOR_PH24]] ], [ [[INDEX_NEXT29:%.*]], [[VECTOR_BODY27]] ]
 ; CHECK-NEXT:    store i32 0, ptr [[TMP1]], align 4, !alias.scope !10, !noalias !13
 ; CHECK-NEXT:    [[INDEX_NEXT29]] = add nuw i64 [[INDEX29]], 4
 ; CHECK-NEXT:    [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT29]], [[N_VEC25]]
-; CHECK-NEXT:    br i1 [[TMP10]], label [[MIDDLE_BLOCK20:%.*]], label [[VECTOR_BODY26]], !llvm.loop [[LOOP15:![0-9]+]]
-; CHECK:       middle.block20:
+; CHECK-NEXT:    br i1 [[TMP10]], label [[MIDDLE_BLOCK21:%.*]], label [[VECTOR_BODY27]], !llvm.loop [[LOOP15:![0-9]+]]
+; CHECK:       middle.block21:
 ; CHECK-NEXT:    [[CMP_N27:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC25]]
-; CHECK-NEXT:    br i1 [[CMP_N27]], label [[LOOP_CLEANUP:%.*]], label [[SCALAR_PH21]]
-; CHECK:       scalar.ph21:
-; CHECK-NEXT:    [[BC_RESUME_VAL26:%.*]] = phi i64 [ [[N_VEC25]], [[MIDDLE_BLOCK20]] ], [ 0, [[VECTOR_MEMCHECK14]] ], [ 0, [[LOOP_3_LR_PH]] ]
+; CHECK-NEXT:    br i1 [[CMP_N27]], label [[LOOP_CLEANUP:%.*]], label [[SCALAR_PH22]]
+; CHECK:       scalar.ph22:
+; CHECK-NEXT:    [[BC_RESUME_VAL26:%.*]] = phi i64 [ [[N_VEC25]], [[MIDDLE_BLOCK21]] ], [ 0, [[VECTOR_MEMCHECK15]] ], [ 0, [[LOOP_3_LR_PH]] ]
 ; CHECK-NEXT:    br label [[LOOP_3:%.*]]
 ; CHECK:       loop.2:
 ; CHECK-NEXT:    [[IV846:%.*]] = phi i64 [ [[IV_NEXT85:%.*]], [[LOOP_2]] ], [ [[BC_RESUME_VAL13]], [[SCALAR_PH5]] ]
@@ -105,7 +105,7 @@ define void @reduced(ptr %0, ptr %1, i64 %iv, ptr %2, i64 %iv76, i64 %iv93) {
 ; CHECK-NEXT:    [[EXITCOND92_NOT:%.*]] = icmp eq i64 [[IV846]], [[IV]]
 ; CHECK-NEXT:    br i1 [[EXITCOND92_NOT]], label [[LOOP_3_LR_PH]], label [[LOOP_2]], !llvm.loop [[LOOP16:![0-9]+]]
 ; CHECK:       loop.3:
-; CHECK-NEXT:    [[IV932:%.*]] = phi i64 [ [[BC_RESUME_VAL26]], [[SCALAR_PH21]] ], [ [[IV_NEXT94:%.*]], [[LOOP_3]] ]
+; CHECK-NEXT:    [[IV932:%.*]] = phi i64 [ [[BC_RESUME_VAL26]], [[SCALAR_PH22]] ], [ [[IV_NEXT94:%.*]], [[LOOP_3]] ]
 ; CHECK-NEXT:    [[TMP16:%.*]] = load i32, ptr [[ARRAYIDX_I_I62]], align 4
 ; CHECK-NEXT:    [[ARRAYIDX_I_I653:%.*]] = getelementptr i32, ptr [[TMP2:%.*]], i64 [[IV93:%.*]]
 ; CHECK-NEXT:    store i32 0, ptr [[TMP1]], align 4
diff --git a/llvm/test/Transforms/LoopVectorize/pr66616.ll b/llvm/test/Transforms/LoopVectorize/pr66616.ll
index 50e1807..24b9441 100644
--- a/llvm/test/Transforms/LoopVectorize/pr66616.ll
+++ b/llvm/test/Transforms/LoopVectorize/pr66616.ll
@@ -46,7 +46,7 @@ define void @pr66616(ptr %ptr) {
 ; CHECK-NEXT:    [[IND_END:%.*]] = add i32 [[ADD3_LCSSA]], [[DOTCAST]]
 ; CHECK-NEXT:    [[IND_END5:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[N_VEC]]
 ; CHECK-NEXT:    br label [[VECTOR_BODY7:%.*]]
-; CHECK:       vector.body5:
+; CHECK:       vector.body4:
 ; CHECK-NEXT:    [[INDEX8:%.*]] = phi i64 [ 0, [[VECTOR_PH3]] ], [ [[INDEX_NEXT9:%.*]], [[VECTOR_BODY7]] ]
 ; CHECK-NEXT:    [[INDEX_NEXT9]] = add nuw i64 [[INDEX8]], 4
 ; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT9]], [[N_VEC]]
diff --git a/llvm/test/Transforms/LoopVectorize/preserve-dbg-loc-and-loop-metadata.ll b/llvm/test/Transforms/LoopVectorize/preserve-dbg-loc-and-loop-metadata.ll
index 5052ba8..7b57624 100644
--- a/llvm/test/Transforms/LoopVectorize/preserve-dbg-loc-and-loop-metadata.ll
+++ b/llvm/test/Transforms/LoopVectorize/preserve-dbg-loc-and-loop-metadata.ll
@@ -91,7 +91,7 @@ define void @scalar_cast_dbg(ptr nocapture %a, i32 %start, i64 %k) {
 ; DEBUGLOC:   = trunc i64 %index to i32, !dbg [[CASTLOC:![0-9]+]]
 ;
 ; DEBUGLOC: loop:
-; DEBUGLOC-NOT:   %trunc.iv = trunc i64 %iv to i32, !dbg [[CASTLOC]]
+; DEBUGLOC:   %trunc.iv = trunc i64 %iv to i32, !dbg [[CASTLOC]]
 ;
 entry:
   br label %loop
@@ -109,6 +109,31 @@ exit:
   ret void
 }
 
+define void @widen_intrinsic_dbg(i64 %n, ptr %y, ptr %x) {
+; DEBUGLOC-LABEL: define void @widen_intrinsic_dbg(
+; DEBUGLOC: vector.body:
+; DEBUGLOC:   = call <4 x float> @llvm.sqrt.v4f32(<4 x float> %{{.+}}), !dbg ![[INTRINSIC_LOC:[0-9]+]]
+; DEBUGLOC: loop:
+; DEBUGLOC:   = call float @llvm.sqrt.f32(float %{{.+}}), !dbg ![[INTRINSIC_LOC]]
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %gep.y = getelementptr inbounds float, ptr %y, i64 %iv
+  %load = load float, ptr %gep.y, align 4
+  %call = call float @llvm.sqrt.f32(float %load)
+  %gep.x = getelementptr inbounds float, ptr %x, i64 %iv
+  store float %call, ptr %gep.x, align 4
+  %iv.next = add i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, %n
+  br i1 %exitcond, label %exit, label %loop
+
+exit:
+  ret void
+}
+
 !0 = !{!0, !1}
 !1 = !{!"llvm.loop.vectorize.width", i32 4}
 ; CHECK-NOT: !{metadata !"llvm.loop.vectorize.width", i32 4}
@@ -116,3 +141,4 @@ exit:
 
 ; DEBUGLOC: ![[RESUMELOC]] = !DILocation(line: 2
 ; DEBUGLOC: ![[PTRIVLOC]] = !DILocation(line: 12
+; DEBUGLOC: ![[INTRINSIC_LOC]] = !DILocation(line: 44
diff --git a/llvm/test/Transforms/LoopVectorize/preserve-dbg-loc-reduction-inloop.ll b/llvm/test/Transforms/LoopVectorize/preserve-dbg-loc-reduction-inloop.ll
new file mode 100644
index 0000000..57f0dc2
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/preserve-dbg-loc-reduction-inloop.ll
@@ -0,0 +1,34 @@
+; RUN: opt < %s -passes=debugify,loop-vectorize -force-vector-width=4 -prefer-inloop-reductions -S | FileCheck %s -check-prefix DEBUGLOC
+
+; Testing the debug locations of the generated vector intstructions are same as
+; their scalar counterpart.
+
+define i32 @reduction_sum(ptr %A, ptr %B) {
+; DEBUGLOC-LABEL: define i32 @reduction_sum(
+; DEBUGLOC: vector.body:
+; DEBUGLOC:   = load <4 x i32>, ptr %{{.+}}, align 4, !dbg ![[LOADLOC:[0-9]+]]
+; DEBUGLOC:   = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %{{.+}}), !dbg ![[REDLOC:[0-9]+]]
+; DEBUGLOC: loop:
+; DEBUGLOC:   %[[LOAD:.+]] = load i32, ptr %{{.+}}, align 4, !dbg ![[LOADLOC]]
+; DEBUGLOC:   = add i32 %{{.+}}, %[[LOAD]], !dbg ![[REDLOC]]
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %red = phi i32 [ 0, %entry ], [ %red.next, %loop ]
+  %gep = getelementptr inbounds i32, ptr %A, i64 %iv
+  %load = load i32, ptr %gep, align 4
+  %red.next = add i32 %red, %load
+  %iv.next = add i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, 256
+  br i1 %exitcond, label %exit, label %loop
+
+exit:
+  %red.lcssa = phi i32 [ %red.next, %loop ]
+  ret i32 %red.lcssa
+}
+
+; DEBUGLOC: ![[LOADLOC]] = !DILocation(line: 5
+; DEBUGLOC: ![[REDLOC]] = !DILocation(line: 6
diff --git a/llvm/test/Transforms/LoopVectorize/reduction-align.ll b/llvm/test/Transforms/LoopVectorize/reduction-align.ll
index 69e3e07..3216c92 100644
--- a/llvm/test/Transforms/LoopVectorize/reduction-align.ll
+++ b/llvm/test/Transforms/LoopVectorize/reduction-align.ll
@@ -27,7 +27,7 @@ define void @fn(ptr %hbuf, ptr %ref, i32 %height) {
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i16> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP1:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = load i16, ptr [[REF]], align 1, !alias.scope !0
+; CHECK-NEXT:    [[TMP0:%.*]] = load i16, ptr [[REF]], align 1, !alias.scope [[META0:![0-9]+]]
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i16> poison, i16 [[TMP0]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i16> [[BROADCAST_SPLATINSERT]], <4 x i16> poison, <4 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP1]] = add <4 x i16> [[BROADCAST_SPLAT]], [[VEC_PHI]]
@@ -36,12 +36,12 @@ define void @fn(ptr %hbuf, ptr %ref, i32 %height) {
 ; CHECK-NEXT:    br i1 [[TMP2]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[TMP3:%.*]] = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> [[TMP1]])
-; CHECK-NEXT:    store i16 [[TMP3]], ptr [[HBUF]], align 1
+; CHECK-NEXT:    store i16 [[TMP3]], ptr [[HBUF]], align 1, !alias.scope [[META6:![0-9]+]], !noalias [[META0]]
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i32 [[HEIGHT]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
 ; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i16 [ [[TMP3]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[TMP4:%.*]] = phi i16 [ [[ADD:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
@@ -51,7 +51,7 @@ define void @fn(ptr %hbuf, ptr %ref, i32 %height) {
 ; CHECK-NEXT:    store i16 [[ADD]], ptr [[HBUF]], align 1
 ; CHECK-NEXT:    [[INC]] = add i32 [[I]], 1
 ; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INC]], [[HEIGHT]]
-; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
 ; CHECK:       for.end.loopexit:
 ; CHECK-NEXT:    br label [[FOR_END]]
 ; CHECK:       for.end:
diff --git a/llvm/test/Transforms/LoopVectorize/reduction-inloop-cond.ll b/llvm/test/Transforms/LoopVectorize/reduction-inloop-cond.ll
index 2404213..ad2f9c6 100644
--- a/llvm/test/Transforms/LoopVectorize/reduction-inloop-cond.ll
+++ b/llvm/test/Transforms/LoopVectorize/reduction-inloop-cond.ll
@@ -733,8 +733,8 @@ define i32 @cond-uncond(ptr noalias %src1, ptr noalias %src2, ptr noalias %cond,
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
 ; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP29]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[RDX1:%.*]] = phi i32 [ [[ADD2:%.*]], [[IF_END:%.*]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
@@ -897,8 +897,8 @@ define float @cond_cond(ptr noalias %src1, ptr noalias %src2, ptr noalias %cond,
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
 ; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP49]], [[MIDDLE_BLOCK]] ], [ 2.000000e+00, [[ENTRY]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[RDX1:%.*]] = phi float [ [[RES:%.*]], [[FOR_INC:%.*]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
@@ -1039,8 +1039,8 @@ define i32 @uncond_cond(ptr noalias %src1, ptr noalias %src2, ptr noalias %cond,
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
 ; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP29]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[RDX:%.*]] = phi i32 [ [[RES:%.*]], [[FOR_INC:%.*]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
@@ -1168,8 +1168,8 @@ define i32 @uncond_cond_uncond(ptr noalias %src1, ptr noalias %src2, ptr noalias
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
 ; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP30]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[RDX:%.*]] = phi i32 [ [[ADD3:%.*]], [[IF_END:%.*]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/reduction-small-size.ll b/llvm/test/Transforms/LoopVectorize/reduction-small-size.ll
index 8a8439f..ca971f1 100644
--- a/llvm/test/Transforms/LoopVectorize/reduction-small-size.ll
+++ b/llvm/test/Transforms/LoopVectorize/reduction-small-size.ll
@@ -89,13 +89,13 @@ define i8 @PR34687_no_undef(i1 %c, i32 %x, i32 %n) {
 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i1> [[BROADCAST_SPLATINSERT]], <4 x i1> poison, <4 x i32> zeroinitializer
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i32> poison, i32 [[X:%.*]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT1]], <4 x i32> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP0:%.*]] = select <4 x i1> [[BROADCAST_SPLAT]], <4 x i32> [[BROADCAST_SPLAT2]], <4 x i32> splat (i32 1)
+; CHECK-NEXT:    [[TMP1:%.*]] = sdiv <4 x i32> splat (i32 99), [[TMP0]]
+; CHECK-NEXT:    [[PREDPHI:%.*]] = select <4 x i1> [[BROADCAST_SPLAT]], <4 x i32> [[TMP1]], <4 x i32> zeroinitializer
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP6:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = select <4 x i1> [[BROADCAST_SPLAT]], <4 x i32> [[BROADCAST_SPLAT2]], <4 x i32> splat (i32 1)
-; CHECK-NEXT:    [[TMP1:%.*]] = sdiv <4 x i32> splat (i32 99), [[TMP0]]
-; CHECK-NEXT:    [[PREDPHI:%.*]] = select <4 x i1> [[BROADCAST_SPLAT]], <4 x i32> [[TMP1]], <4 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP3:%.*]] = and <4 x i32> [[VEC_PHI]], splat (i32 255)
 ; CHECK-NEXT:    [[TMP4:%.*]] = add <4 x i32> [[TMP3]], [[PREDPHI]]
 ; CHECK-NEXT:    [[TMP5:%.*]] = trunc <4 x i32> [[TMP4]] to <4 x i8>
diff --git a/llvm/test/Transforms/LoopVectorize/scalable-inductions.ll b/llvm/test/Transforms/LoopVectorize/scalable-inductions.ll
index 94fce86..f136b0e 100644
--- a/llvm/test/Transforms/LoopVectorize/scalable-inductions.ll
+++ b/llvm/test/Transforms/LoopVectorize/scalable-inductions.ll
@@ -185,10 +185,10 @@ define void @add_unique_ind32(ptr noalias nocapture %a, i64 %n) {
 ; CHECK-NEXT:    [[TMP3:%.*]] = shl i64 [[TMP2]], 2
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-NEXT:    [[DOTCAST:%.*]] = trunc i64 [[N_VEC]] to i32
-; CHECK-NEXT:    [[IND_END:%.*]] = shl i32 [[DOTCAST]], 1
 ; CHECK-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP5:%.*]] = shl i64 [[TMP4]], 2
+; CHECK-NEXT:    [[DOTCAST:%.*]] = trunc i64 [[N_VEC]] to i32
+; CHECK-NEXT:    [[IND_END:%.*]] = shl i32 [[DOTCAST]], 1
 ; CHECK-NEXT:    [[TMP6:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
 ; CHECK-NEXT:    [[TMP7:%.*]] = shl <vscale x 4 x i32> [[TMP6]], splat (i32 1)
 ; CHECK-NEXT:    [[TMP8:%.*]] = trunc i64 [[TMP5]] to i32
@@ -262,11 +262,11 @@ define void @add_unique_indf32(ptr noalias nocapture %a, i64 %n) {
 ; CHECK-NEXT:    [[TMP3:%.*]] = shl i64 [[TMP2]], 2
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; CHECK-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP6:%.*]] = shl i64 [[TMP5]], 2
 ; CHECK-NEXT:    [[DOTCAST:%.*]] = sitofp i64 [[N_VEC]] to float
 ; CHECK-NEXT:    [[TMP4:%.*]] = fmul float [[DOTCAST]], 2.000000e+00
 ; CHECK-NEXT:    [[IND_END:%.*]] = fadd float [[TMP4]], 0.000000e+00
-; CHECK-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP6:%.*]] = shl i64 [[TMP5]], 2
 ; CHECK-NEXT:    [[TMP7:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
 ; CHECK-NEXT:    [[TMP8:%.*]] = uitofp <vscale x 4 x i32> [[TMP7]] to <vscale x 4 x float>
 ; CHECK-NEXT:    [[TMP9:%.*]] = fmul <vscale x 4 x float> [[TMP8]], splat (float 2.000000e+00)
diff --git a/llvm/test/Transforms/LoopVectorize/select-cmp.ll b/llvm/test/Transforms/LoopVectorize/select-cmp.ll
index 301526c..550e52d 100644
--- a/llvm/test/Transforms/LoopVectorize/select-cmp.ll
+++ b/llvm/test/Transforms/LoopVectorize/select-cmp.ll
@@ -1006,11 +1006,11 @@ define i32 @select_i32_from_icmp_same_inputs(i32 %a, i32 %b, i64 %n) {
 ; CHECK-VF4IC1-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[A]], i64 0
 ; CHECK-VF4IC1-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
 ; CHECK-VF4IC1-NEXT:    [[TMP0:%.*]] = icmp eq <4 x i32> [[BROADCAST_SPLAT]], splat (i32 3)
+; CHECK-VF4IC1-NEXT:    [[TMP1:%.*]] = xor <4 x i1> [[TMP0]], splat (i1 true)
 ; CHECK-VF4IC1-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; CHECK-VF4IC1:       [[VECTOR_BODY]]:
 ; CHECK-VF4IC1-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
 ; CHECK-VF4IC1-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP2:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-VF4IC1-NEXT:    [[TMP1:%.*]] = xor <4 x i1> [[TMP0]], splat (i1 true)
 ; CHECK-VF4IC1-NEXT:    [[TMP2]] = or <4 x i1> [[VEC_PHI]], [[TMP1]]
 ; CHECK-VF4IC1-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; CHECK-VF4IC1-NEXT:    [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
@@ -1048,6 +1048,7 @@ define i32 @select_i32_from_icmp_same_inputs(i32 %a, i32 %b, i64 %n) {
 ; CHECK-VF4IC4-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[A]], i64 0
 ; CHECK-VF4IC4-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
 ; CHECK-VF4IC4-NEXT:    [[TMP0:%.*]] = icmp eq <4 x i32> [[BROADCAST_SPLAT]], splat (i32 3)
+; CHECK-VF4IC4-NEXT:    [[TMP4:%.*]] = xor <4 x i1> [[TMP0]], splat (i1 true)
 ; CHECK-VF4IC4-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; CHECK-VF4IC4:       [[VECTOR_BODY]]:
 ; CHECK-VF4IC4-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
@@ -1055,13 +1056,9 @@ define i32 @select_i32_from_icmp_same_inputs(i32 %a, i32 %b, i64 %n) {
 ; CHECK-VF4IC4-NEXT:    [[VEC_PHI1:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP6:%.*]], %[[VECTOR_BODY]] ]
 ; CHECK-VF4IC4-NEXT:    [[VEC_PHI2:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP7:%.*]], %[[VECTOR_BODY]] ]
 ; CHECK-VF4IC4-NEXT:    [[VEC_PHI3:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP8:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-VF4IC4-NEXT:    [[TMP1:%.*]] = xor <4 x i1> [[TMP0]], splat (i1 true)
-; CHECK-VF4IC4-NEXT:    [[TMP2:%.*]] = xor <4 x i1> [[TMP0]], splat (i1 true)
-; CHECK-VF4IC4-NEXT:    [[TMP3:%.*]] = xor <4 x i1> [[TMP0]], splat (i1 true)
-; CHECK-VF4IC4-NEXT:    [[TMP4:%.*]] = xor <4 x i1> [[TMP0]], splat (i1 true)
-; CHECK-VF4IC4-NEXT:    [[TMP5]] = or <4 x i1> [[VEC_PHI]], [[TMP1]]
-; CHECK-VF4IC4-NEXT:    [[TMP6]] = or <4 x i1> [[VEC_PHI1]], [[TMP2]]
-; CHECK-VF4IC4-NEXT:    [[TMP7]] = or <4 x i1> [[VEC_PHI2]], [[TMP3]]
+; CHECK-VF4IC4-NEXT:    [[TMP5]] = or <4 x i1> [[VEC_PHI]], [[TMP4]]
+; CHECK-VF4IC4-NEXT:    [[TMP6]] = or <4 x i1> [[VEC_PHI1]], [[TMP4]]
+; CHECK-VF4IC4-NEXT:    [[TMP7]] = or <4 x i1> [[VEC_PHI2]], [[TMP4]]
 ; CHECK-VF4IC4-NEXT:    [[TMP8]] = or <4 x i1> [[VEC_PHI3]], [[TMP4]]
 ; CHECK-VF4IC4-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
 ; CHECK-VF4IC4-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
@@ -1100,6 +1097,7 @@ define i32 @select_i32_from_icmp_same_inputs(i32 %a, i32 %b, i64 %n) {
 ; CHECK-VF1IC4-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
 ; CHECK-VF1IC4-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
 ; CHECK-VF1IC4-NEXT:    [[TMP0:%.*]] = icmp eq i32 [[A]], 3
+; CHECK-VF1IC4-NEXT:    [[TMP4:%.*]] = xor i1 [[TMP0]], true
 ; CHECK-VF1IC4-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; CHECK-VF1IC4:       [[VECTOR_BODY]]:
 ; CHECK-VF1IC4-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
@@ -1107,13 +1105,9 @@ define i32 @select_i32_from_icmp_same_inputs(i32 %a, i32 %b, i64 %n) {
 ; CHECK-VF1IC4-NEXT:    [[VEC_PHI1:%.*]] = phi i1 [ false, %[[VECTOR_PH]] ], [ [[TMP6:%.*]], %[[VECTOR_BODY]] ]
 ; CHECK-VF1IC4-NEXT:    [[VEC_PHI2:%.*]] = phi i1 [ false, %[[VECTOR_PH]] ], [ [[TMP7:%.*]], %[[VECTOR_BODY]] ]
 ; CHECK-VF1IC4-NEXT:    [[VEC_PHI3:%.*]] = phi i1 [ false, %[[VECTOR_PH]] ], [ [[TMP8:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-VF1IC4-NEXT:    [[TMP1:%.*]] = xor i1 [[TMP0]], true
-; CHECK-VF1IC4-NEXT:    [[TMP2:%.*]] = xor i1 [[TMP0]], true
-; CHECK-VF1IC4-NEXT:    [[TMP3:%.*]] = xor i1 [[TMP0]], true
-; CHECK-VF1IC4-NEXT:    [[TMP4:%.*]] = xor i1 [[TMP0]], true
-; CHECK-VF1IC4-NEXT:    [[TMP5]] = or i1 [[VEC_PHI]], [[TMP1]]
-; CHECK-VF1IC4-NEXT:    [[TMP6]] = or i1 [[VEC_PHI1]], [[TMP2]]
-; CHECK-VF1IC4-NEXT:    [[TMP7]] = or i1 [[VEC_PHI2]], [[TMP3]]
+; CHECK-VF1IC4-NEXT:    [[TMP5]] = or i1 [[VEC_PHI]], [[TMP4]]
+; CHECK-VF1IC4-NEXT:    [[TMP6]] = or i1 [[VEC_PHI1]], [[TMP4]]
+; CHECK-VF1IC4-NEXT:    [[TMP7]] = or i1 [[VEC_PHI2]], [[TMP4]]
 ; CHECK-VF1IC4-NEXT:    [[TMP8]] = or i1 [[VEC_PHI3]], [[TMP4]]
 ; CHECK-VF1IC4-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; CHECK-VF1IC4-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
diff --git a/llvm/test/Transforms/LoopVectorize/select-reduction.ll b/llvm/test/Transforms/LoopVectorize/select-reduction.ll
index 5e281929..836115f 100644
--- a/llvm/test/Transforms/LoopVectorize/select-reduction.ll
+++ b/llvm/test/Transforms/LoopVectorize/select-reduction.ll
@@ -17,8 +17,8 @@ define i32 @test(i64 %N, i32 %x) {
 ; CHECK-NEXT:    [[N_RND_UP:%.*]] = add i64 [[EXTRA_ITER]], 3
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 4
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
-; CHECK-NEXT:    [[IND_END:%.*]] = sub i64 [[EXTRA_ITER]], [[N_VEC]]
 ; CHECK-NEXT:    [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[EXTRA_ITER]], 1
+; CHECK-NEXT:    [[IND_END:%.*]] = sub i64 [[EXTRA_ITER]], [[N_VEC]]
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
@@ -39,8 +39,8 @@ define i32 @test(i64 %N, i32 %x) {
 ; CHECK-NEXT:    [[TMP5:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[TMP3]])
 ; CHECK-NEXT:    br i1 true, label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[EXTRA_ITER]], [[LOOP_PREHEADER]] ]
 ; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP5]], [[MIDDLE_BLOCK]] ], [ 0, [[LOOP_PREHEADER]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[EXTRA_ITER]], [[LOOP_PREHEADER]] ]
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
 ; CHECK-NEXT:    [[NEXT:%.*]] = phi i32 [ [[SEL:%.*]], [[LOOP]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/single_early_exit.ll b/llvm/test/Transforms/LoopVectorize/single_early_exit.ll
index 7590bb9..4ba9cc6 100644
--- a/llvm/test/Transforms/LoopVectorize/single_early_exit.ll
+++ b/llvm/test/Transforms/LoopVectorize/single_early_exit.ll
@@ -281,12 +281,12 @@ define i32 @diff_blocks_invariant_early_exit_cond(ptr %s) {
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i1> poison, i1 [[COND]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i1> [[BROADCAST_SPLATINSERT]], <4 x i1> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP0:%.*]] = xor <4 x i1> [[BROADCAST_SPLAT]], splat (i1 true)
+; CHECK-NEXT:    [[TMP1:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP0]])
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[FOR_BODY]] ]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP0:%.*]] = xor <4 x i1> [[BROADCAST_SPLAT]], splat (i1 true)
-; CHECK-NEXT:    [[TMP1:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP0]])
 ; CHECK-NEXT:    [[TMP2:%.*]] = icmp eq i32 [[INDEX_NEXT]], 276
 ; CHECK-NEXT:    [[TMP3:%.*]] = or i1 [[TMP1]], [[TMP2]]
 ; CHECK-NEXT:    br i1 [[TMP3]], label [[MIDDLE_SPLIT:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
diff --git a/llvm/test/Transforms/LoopVectorize/store-reduction-results-in-tail-folded-loop.ll b/llvm/test/Transforms/LoopVectorize/store-reduction-results-in-tail-folded-loop.ll
index 40b007e..57bc7b8 100644
--- a/llvm/test/Transforms/LoopVectorize/store-reduction-results-in-tail-folded-loop.ll
+++ b/llvm/test/Transforms/LoopVectorize/store-reduction-results-in-tail-folded-loop.ll
@@ -19,8 +19,8 @@ define void @pr75298_store_reduction_value_in_folded_loop(i64 %iv.start) optsize
 ; CHECK-NEXT:    [[N_RND_UP:%.*]] = add i64 [[TMP0]], 3
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 4
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
-; CHECK-NEXT:    [[IND_END:%.*]] = add i64 [[IV_START]], [[N_VEC]]
 ; CHECK-NEXT:    [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[TMP0]], 1
+; CHECK-NEXT:    [[IND_END:%.*]] = add i64 [[IV_START]], [[N_VEC]]
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
diff --git a/llvm/test/Transforms/LoopVectorize/tail-folding-switch.ll b/llvm/test/Transforms/LoopVectorize/tail-folding-switch.ll
index 31732f0..892ddcc 100644
--- a/llvm/test/Transforms/LoopVectorize/tail-folding-switch.ll
+++ b/llvm/test/Transforms/LoopVectorize/tail-folding-switch.ll
@@ -10,12 +10,12 @@ define void @tail_fold_switch(ptr %dst, i32 %0) {
 ; CHECK:       [[VECTOR_PH]]:
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[TMP0]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP2:%.*]] = icmp eq <4 x i32> [[BROADCAST_SPLAT]], splat (i32 1)
 ; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; CHECK:       [[VECTOR_BODY]]:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE6:.*]] ]
 ; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[PRED_STORE_CONTINUE6]] ]
 ; CHECK-NEXT:    [[TMP1:%.*]] = icmp ule <4 x i64> [[VEC_IND]], splat (i64 4)
-; CHECK-NEXT:    [[TMP2:%.*]] = icmp eq <4 x i32> [[BROADCAST_SPLAT]], splat (i32 1)
 ; CHECK-NEXT:    [[TMP3:%.*]] = select <4 x i1> [[TMP1]], <4 x i1> [[TMP2]], <4 x i1> zeroinitializer
 ; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x i1> [[TMP3]], i32 0
 ; CHECK-NEXT:    br i1 [[TMP4]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]]
diff --git a/llvm/test/Transforms/LoopVectorize/uncountable-early-exit-vplan.ll b/llvm/test/Transforms/LoopVectorize/uncountable-early-exit-vplan.ll
index 1a8f29e..b427b43 100644
--- a/llvm/test/Transforms/LoopVectorize/uncountable-early-exit-vplan.ll
+++ b/llvm/test/Transforms/LoopVectorize/uncountable-early-exit-vplan.ll
@@ -47,10 +47,11 @@ define i64 @multi_exiting_to_different_exits_live_in_exit_values() {
 ; CHECK-NEXT: Successor(s): ir-bb<e2>, scalar.ph
 ; CHECK-EMPTY:
 ; CHECK-NEXT: scalar.ph:
+; CHECK-NEXT:   EMIT vp<[[RESUME:%.+]]> = resume-phi vp<[[VTC]]>, ir<0>
 ; CHECK-NEXT: ir-bb<loop.header>
 ; CHECK-EMPTY:
 ; CHECK-NEXT: ir-bb<loop.header>:
-; CHECK-NEXT:   IR   %iv = phi i64 [ %inc, %loop.latch ], [ 0, %entry ]
+; CHECK-NEXT:   IR   %iv = phi i64 [ %inc, %loop.latch ], [ 0, %entry ] (extra operand: vp<[[RESUME]]> from scalar.ph)
 ; CHECK:      No successors
 ; CHECK-EMPTY:
 ; CHECK-NEXT: ir-bb<e2>:
@@ -129,10 +130,11 @@ define i64 @multi_exiting_to_same_exit_live_in_exit_values() {
 ; CHECK-NEXT: Successor(s): ir-bb<exit>, scalar.ph
 ; CHECK-EMPTY:
 ; CHECK-NEXT: scalar.ph:
+; CHECK-NEXT:   EMIT vp<[[RESUME:%.+]]> = resume-phi vp<[[VTC]]>, ir<0>
 ; CHECK-NEXT: ir-bb<loop.header>
 ; CHECK-EMPTY:
 ; CHECK-NEXT: ir-bb<loop.header>:
-; CHECK-NEXT:   IR   %iv = phi i64 [ %inc, %loop.latch ], [ 0, %entry ]
+; CHECK-NEXT:   IR   %iv = phi i64 [ %inc, %loop.latch ], [ 0, %entry ] (extra operand: vp<[[RESUME]]> from scalar.ph)
 ; CHECK:      No successors
 ; CHECK-EMPTY:
 ; CHECK-NEXT: ir-bb<exit>:
@@ -204,10 +206,11 @@ define i64 @multi_exiting_to_same_exit_live_in_exit_values_2() {
 ; CHECK-NEXT: Successor(s): ir-bb<exit>, scalar.ph
 ; CHECK-EMPTY:
 ; CHECK-NEXT: scalar.ph:
+; CHECK-NEXT:   EMIT vp<[[RESUME:%.+]]> = resume-phi vp<[[VTC]]>, ir<0>
 ; CHECK-NEXT: ir-bb<loop.header>
 ; CHECK-EMPTY:
 ; CHECK-NEXT: ir-bb<loop.header>:
-; CHECK-NEXT:   IR   %iv = phi i64 [ %inc, %loop.latch ], [ 0, %entry ]
+; CHECK-NEXT:   IR   %iv = phi i64 [ %inc, %loop.latch ], [ 0, %entry ] (extra operand: vp<[[RESUME]]> from scalar.ph)
 ; CHECK:      No successors
 ; CHECK-EMPTY:
 ; CHECK-NEXT: ir-bb<exit>:
diff --git a/llvm/test/Transforms/LoopVectorize/uncountable-single-exit-loops.ll b/llvm/test/Transforms/LoopVectorize/uncountable-single-exit-loops.ll
new file mode 100644
index 0000000..2520613
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/uncountable-single-exit-loops.ll
@@ -0,0 +1,52 @@
+; REQUIRES: asserts
+; RUN: opt -p loop-vectorize -debug %s 2>&1 | FileCheck %s
+
+
+; CHECK-LABEL: LV: Checking a loop in 'latch_exit_cannot_compute_btc_due_to_step'
+; CHECK: 	   LV: Did not find one integer induction var.
+; CHECK-NEXT:  LV: Not vectorizing: Early exit is not the latch predecessor.
+; CHECK-NEXT:  LV: Interleaving disabled by the pass manager
+; CHECK-NEXT:  LV: Not vectorizing: Cannot prove legality.
+
+; CHECK-LABEL: LV: Checking a loop in 'header_exit_cannot_compute_btc_due_to_step'
+; CHECK:       LV: Found an induction variable.
+; CHECK-NEXT:  LV: Did not find one integer induction var.
+; CHECK-NEXT:  LV: Not vectorizing: Cannot determine exact exit count for latch block.
+; CHECK-NEXT:  LV: Interleaving disabled by the pass manager
+; CHECK-NEXT:  LV: Not vectorizing: Cannot prove legality.
+
+; CHECK-NOT: vector.body
+define void @latch_exit_cannot_compute_btc_due_to_step(ptr %dst, i64 %step) {
+entry:
+  br label %loop
+
+loop:                                   ; preds = %loop, %for.cond.us
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %iv.next = add i64 %iv, %step
+  %gep = getelementptr i8, ptr %dst, i64 %iv
+  store i8 0, ptr %gep, align 1
+  %ec = icmp eq i64 %iv.next, 1000
+  br i1 %ec, label %loop, label %exit
+
+exit:
+  ret void
+}
+
+define void @header_exit_cannot_compute_btc_due_to_step(ptr %dst, i64 %step) {
+entry:
+  br label %loop.header
+
+loop.header:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop.latch ]
+  %iv.next = add i64 %iv, %step
+  %ec = icmp eq i64 %iv.next, 1000
+  br i1 %ec, label %loop.latch, label %exit
+
+loop.latch:
+  %gep = getelementptr i8, ptr %dst, i64 %iv
+  store i8 0, ptr %gep, align 1
+  br label %loop.header
+
+exit:
+  ret void
+}
diff --git a/llvm/test/Transforms/LoopVectorize/uniform-blend.ll b/llvm/test/Transforms/LoopVectorize/uniform-blend.ll
index 72011ca..7f5e0f3a 100644
--- a/llvm/test/Transforms/LoopVectorize/uniform-blend.ll
+++ b/llvm/test/Transforms/LoopVectorize/uniform-blend.ll
@@ -133,11 +133,11 @@ define void @blend_chain_iv(i1 %c) {
 ; CHECK:       [[VECTOR_PH]]:
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i1> poison, i1 [[C]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i1> [[BROADCAST_SPLATINSERT]], <4 x i1> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP0:%.*]] = select <4 x i1> [[BROADCAST_SPLAT]], <4 x i1> [[BROADCAST_SPLAT]], <4 x i1> zeroinitializer
 ; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; CHECK:       [[VECTOR_BODY]]:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = select <4 x i1> [[BROADCAST_SPLAT]], <4 x i1> [[BROADCAST_SPLAT]], <4 x i1> zeroinitializer
 ; CHECK-NEXT:    [[PREDPHI:%.*]] = select <4 x i1> [[TMP0]], <4 x i64> [[VEC_IND]], <4 x i64> undef
 ; CHECK-NEXT:    [[PREDPHI1:%.*]] = select <4 x i1> [[BROADCAST_SPLAT]], <4 x i64> [[PREDPHI]], <4 x i64> undef
 ; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x i64> [[PREDPHI1]], i32 0
diff --git a/llvm/test/Transforms/LoopVectorize/vector-loop-backedge-elimination.ll b/llvm/test/Transforms/LoopVectorize/vector-loop-backedge-elimination.ll
index fd75177..85b44a7 100644
--- a/llvm/test/Transforms/LoopVectorize/vector-loop-backedge-elimination.ll
+++ b/llvm/test/Transforms/LoopVectorize/vector-loop-backedge-elimination.ll
@@ -1,19 +1,138 @@
-; RUN: opt -passes=loop-vectorize -force-vector-width=8 -force-vector-interleave=1 -S %s | FileCheck --check-prefixes=CHECK,VF8UF1 %s
-; RUN: opt -passes=loop-vectorize -force-vector-width=8 -force-vector-interleave=2 -S %s | FileCheck --check-prefixes=CHECK,VF8UF2 %s
-; RUN: opt -passes=loop-vectorize -force-vector-width=16 -force-vector-interleave=1 -S %s | FileCheck --check-prefixes=CHECK,VF16UF1 %s
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -passes='loop-vectorize,verify<loops>' -force-vector-width=8 -force-vector-interleave=1 -S %s | FileCheck --check-prefixes=VF8UF1 %s
+; RUN: opt -passes='loop-vectorize,verify<loops>' -force-vector-width=8 -force-vector-interleave=2 -S %s | FileCheck --check-prefixes=VF8UF2 %s
+; RUN: opt -passes='loop-vectorize,verify<loops>' -force-vector-width=16 -force-vector-interleave=1 -S %s | FileCheck --check-prefixes=VF16UF1 %s
 
 target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
 
 ; Check if the vector loop condition can be simplified to true for a given
 ; VF/IC combination.
 define void @test_tc_less_than_16(ptr %A, i64 %N) {
-; CHECK-LABEL: define void @test_tc_less_than_16(
-; VF8UF1:       [[CMP:%.+]] = icmp eq i64 %index.next, %n.vec
-; VF8UF1-NEXT:  br i1 [[CMP]], label %middle.block, label %vector.body
+; VF8UF1-LABEL: define void @test_tc_less_than_16(
+; VF8UF1-SAME: ptr [[A:%.*]], i64 [[N:%.*]]) {
+; VF8UF1-NEXT:  [[ENTRY:.*]]:
+; VF8UF1-NEXT:    [[AND:%.*]] = and i64 [[N]], 15
+; VF8UF1-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[AND]], 8
+; VF8UF1-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; VF8UF1:       [[VECTOR_PH]]:
+; VF8UF1-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[AND]], 8
+; VF8UF1-NEXT:    [[N_VEC:%.*]] = sub i64 [[AND]], [[N_MOD_VF]]
+; VF8UF1-NEXT:    [[TMP0:%.*]] = sub i64 [[AND]], [[N_VEC]]
+; VF8UF1-NEXT:    [[TMP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[N_VEC]]
+; VF8UF1-NEXT:    br label %[[VECTOR_BODY:.*]]
+; VF8UF1:       [[VECTOR_BODY]]:
+; VF8UF1-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; VF8UF1-NEXT:    [[TMP2:%.*]] = add i64 [[INDEX]], 0
+; VF8UF1-NEXT:    [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP2]]
+; VF8UF1-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 0
+; VF8UF1-NEXT:    [[WIDE_LOAD:%.*]] = load <8 x i8>, ptr [[TMP3]], align 1
+; VF8UF1-NEXT:    [[TMP4:%.*]] = add nsw <8 x i8> [[WIDE_LOAD]], splat (i8 10)
+; VF8UF1-NEXT:    store <8 x i8> [[TMP4]], ptr [[TMP3]], align 1
+; VF8UF1-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
+; VF8UF1-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; VF8UF1-NEXT:    br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; VF8UF1:       [[MIDDLE_BLOCK]]:
+; VF8UF1-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[AND]], [[N_VEC]]
+; VF8UF1-NEXT:    br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; VF8UF1:       [[SCALAR_PH]]:
+; VF8UF1-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP0]], %[[MIDDLE_BLOCK]] ], [ [[AND]], %[[ENTRY]] ]
+; VF8UF1-NEXT:    [[BC_RESUME_VAL1:%.*]] = phi ptr [ [[TMP1]], %[[MIDDLE_BLOCK]] ], [ [[A]], %[[ENTRY]] ]
+; VF8UF1-NEXT:    br label %[[LOOP:.*]]
+; VF8UF1:       [[LOOP]]:
+; VF8UF1-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; VF8UF1-NEXT:    [[P_SRC:%.*]] = phi ptr [ [[BC_RESUME_VAL1]], %[[SCALAR_PH]] ], [ [[P_SRC_NEXT:%.*]], %[[LOOP]] ]
+; VF8UF1-NEXT:    [[P_SRC_NEXT]] = getelementptr inbounds i8, ptr [[P_SRC]], i64 1
+; VF8UF1-NEXT:    [[L:%.*]] = load i8, ptr [[P_SRC]], align 1
+; VF8UF1-NEXT:    [[ADD:%.*]] = add nsw i8 [[L]], 10
+; VF8UF1-NEXT:    store i8 [[ADD]], ptr [[P_SRC]], align 1
+; VF8UF1-NEXT:    [[IV_NEXT]] = add nsw i64 [[IV]], -1
+; VF8UF1-NEXT:    [[CMP:%.*]] = icmp eq i64 [[IV_NEXT]], 0
+; VF8UF1-NEXT:    br i1 [[CMP]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP3:![0-9]+]]
+; VF8UF1:       [[EXIT]]:
+; VF8UF1-NEXT:    ret void
 ;
-; VF8UF2:       br i1 true, label %middle.block, label %vector.body
+; VF8UF2-LABEL: define void @test_tc_less_than_16(
+; VF8UF2-SAME: ptr [[A:%.*]], i64 [[N:%.*]]) {
+; VF8UF2-NEXT:  [[ENTRY:.*]]:
+; VF8UF2-NEXT:    [[AND:%.*]] = and i64 [[N]], 15
+; VF8UF2-NEXT:    br i1 true, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; VF8UF2:       [[VECTOR_PH]]:
+; VF8UF2-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[AND]], 16
+; VF8UF2-NEXT:    [[N_VEC:%.*]] = sub i64 [[AND]], [[N_MOD_VF]]
+; VF8UF2-NEXT:    [[TMP0:%.*]] = sub i64 [[AND]], [[N_VEC]]
+; VF8UF2-NEXT:    [[TMP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[N_VEC]]
+; VF8UF2-NEXT:    br label %[[VECTOR_BODY:.*]]
+; VF8UF2:       [[VECTOR_BODY]]:
+; VF8UF2-NEXT:    [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[A]], i64 0
+; VF8UF2-NEXT:    [[TMP2:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 0
+; VF8UF2-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 8
+; VF8UF2-NEXT:    [[WIDE_LOAD:%.*]] = load <8 x i8>, ptr [[TMP2]], align 1
+; VF8UF2-NEXT:    [[WIDE_LOAD1:%.*]] = load <8 x i8>, ptr [[TMP3]], align 1
+; VF8UF2-NEXT:    [[TMP4:%.*]] = add nsw <8 x i8> [[WIDE_LOAD]], splat (i8 10)
+; VF8UF2-NEXT:    [[TMP5:%.*]] = add nsw <8 x i8> [[WIDE_LOAD1]], splat (i8 10)
+; VF8UF2-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 0
+; VF8UF2-NEXT:    [[TMP7:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 8
+; VF8UF2-NEXT:    store <8 x i8> [[TMP4]], ptr [[TMP6]], align 1
+; VF8UF2-NEXT:    store <8 x i8> [[TMP5]], ptr [[TMP7]], align 1
+; VF8UF2-NEXT:    br label %[[MIDDLE_BLOCK:.*]]
+; VF8UF2:       [[MIDDLE_BLOCK]]:
+; VF8UF2-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[AND]], [[N_VEC]]
+; VF8UF2-NEXT:    br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; VF8UF2:       [[SCALAR_PH]]:
+; VF8UF2-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP0]], %[[MIDDLE_BLOCK]] ], [ [[AND]], %[[ENTRY]] ]
+; VF8UF2-NEXT:    [[BC_RESUME_VAL2:%.*]] = phi ptr [ [[TMP1]], %[[MIDDLE_BLOCK]] ], [ [[A]], %[[ENTRY]] ]
+; VF8UF2-NEXT:    br label %[[LOOP:.*]]
+; VF8UF2:       [[LOOP]]:
+; VF8UF2-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; VF8UF2-NEXT:    [[P_SRC:%.*]] = phi ptr [ [[BC_RESUME_VAL2]], %[[SCALAR_PH]] ], [ [[P_SRC_NEXT:%.*]], %[[LOOP]] ]
+; VF8UF2-NEXT:    [[P_SRC_NEXT]] = getelementptr inbounds i8, ptr [[P_SRC]], i64 1
+; VF8UF2-NEXT:    [[L:%.*]] = load i8, ptr [[P_SRC]], align 1
+; VF8UF2-NEXT:    [[ADD:%.*]] = add nsw i8 [[L]], 10
+; VF8UF2-NEXT:    store i8 [[ADD]], ptr [[P_SRC]], align 1
+; VF8UF2-NEXT:    [[IV_NEXT]] = add nsw i64 [[IV]], -1
+; VF8UF2-NEXT:    [[CMP:%.*]] = icmp eq i64 [[IV_NEXT]], 0
+; VF8UF2-NEXT:    br i1 [[CMP]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP0:![0-9]+]]
+; VF8UF2:       [[EXIT]]:
+; VF8UF2-NEXT:    ret void
 ;
-; VF16UF1:      br i1 true, label %middle.block, label %vector.body
+; VF16UF1-LABEL: define void @test_tc_less_than_16(
+; VF16UF1-SAME: ptr [[A:%.*]], i64 [[N:%.*]]) {
+; VF16UF1-NEXT:  [[ENTRY:.*]]:
+; VF16UF1-NEXT:    [[AND:%.*]] = and i64 [[N]], 15
+; VF16UF1-NEXT:    br i1 true, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; VF16UF1:       [[VECTOR_PH]]:
+; VF16UF1-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[AND]], 16
+; VF16UF1-NEXT:    [[N_VEC:%.*]] = sub i64 [[AND]], [[N_MOD_VF]]
+; VF16UF1-NEXT:    [[TMP0:%.*]] = sub i64 [[AND]], [[N_VEC]]
+; VF16UF1-NEXT:    [[TMP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[N_VEC]]
+; VF16UF1-NEXT:    br label %[[VECTOR_BODY:.*]]
+; VF16UF1:       [[VECTOR_BODY]]:
+; VF16UF1-NEXT:    [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[A]], i64 0
+; VF16UF1-NEXT:    [[TMP2:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 0
+; VF16UF1-NEXT:    [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1
+; VF16UF1-NEXT:    [[TMP3:%.*]] = add nsw <16 x i8> [[WIDE_LOAD]], splat (i8 10)
+; VF16UF1-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 0
+; VF16UF1-NEXT:    store <16 x i8> [[TMP3]], ptr [[TMP4]], align 1
+; VF16UF1-NEXT:    br label %[[MIDDLE_BLOCK:.*]]
+; VF16UF1:       [[MIDDLE_BLOCK]]:
+; VF16UF1-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[AND]], [[N_VEC]]
+; VF16UF1-NEXT:    br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; VF16UF1:       [[SCALAR_PH]]:
+; VF16UF1-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP0]], %[[MIDDLE_BLOCK]] ], [ [[AND]], %[[ENTRY]] ]
+; VF16UF1-NEXT:    [[BC_RESUME_VAL1:%.*]] = phi ptr [ [[TMP1]], %[[MIDDLE_BLOCK]] ], [ [[A]], %[[ENTRY]] ]
+; VF16UF1-NEXT:    br label %[[LOOP:.*]]
+; VF16UF1:       [[LOOP]]:
+; VF16UF1-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; VF16UF1-NEXT:    [[P_SRC:%.*]] = phi ptr [ [[BC_RESUME_VAL1]], %[[SCALAR_PH]] ], [ [[P_SRC_NEXT:%.*]], %[[LOOP]] ]
+; VF16UF1-NEXT:    [[P_SRC_NEXT]] = getelementptr inbounds i8, ptr [[P_SRC]], i64 1
+; VF16UF1-NEXT:    [[L:%.*]] = load i8, ptr [[P_SRC]], align 1
+; VF16UF1-NEXT:    [[ADD:%.*]] = add nsw i8 [[L]], 10
+; VF16UF1-NEXT:    store i8 [[ADD]], ptr [[P_SRC]], align 1
+; VF16UF1-NEXT:    [[IV_NEXT]] = add nsw i64 [[IV]], -1
+; VF16UF1-NEXT:    [[CMP:%.*]] = icmp eq i64 [[IV_NEXT]], 0
+; VF16UF1-NEXT:    br i1 [[CMP]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP0:![0-9]+]]
+; VF16UF1:       [[EXIT]]:
+; VF16UF1-NEXT:    ret void
 ;
 entry:
   %and = and i64 %N, 15
@@ -33,3 +152,1118 @@ loop:
 exit:
   ret void
 }
+
+define void @remove_loop_region_with_replicate_recipe(ptr %dst, i64 range(i64 5, 10) %N) {
+; VF8UF1-LABEL: define void @remove_loop_region_with_replicate_recipe(
+; VF8UF1-SAME: ptr [[DST:%.*]], i64 range(i64 5, 10) [[N:%.*]]) {
+; VF8UF1-NEXT:  [[ENTRY:.*]]:
+; VF8UF1-NEXT:    [[TMP0:%.*]] = add nsw i64 [[N]], -2
+; VF8UF1-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; VF8UF1:       [[VECTOR_PH]]:
+; VF8UF1-NEXT:    [[N_RND_UP:%.*]] = add i64 [[TMP0]], 7
+; VF8UF1-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 8
+; VF8UF1-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; VF8UF1-NEXT:    [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[TMP0]], 1
+; VF8UF1-NEXT:    [[TMP1:%.*]] = add i64 2, [[N_VEC]]
+; VF8UF1-NEXT:    br label %[[VECTOR_BODY:.*]]
+; VF8UF1:       [[VECTOR_BODY]]:
+; VF8UF1-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <8 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0
+; VF8UF1-NEXT:    [[BROADCAST_SPLAT1:%.*]] = shufflevector <8 x i64> [[BROADCAST_SPLATINSERT1]], <8 x i64> poison, <8 x i32> zeroinitializer
+; VF8UF1-NEXT:    [[TMP2:%.*]] = icmp ule <8 x i64> <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>, [[BROADCAST_SPLAT1]]
+; VF8UF1-NEXT:    [[TMP3:%.*]] = extractelement <8 x i1> [[TMP2]], i32 0
+; VF8UF1-NEXT:    br i1 [[TMP3]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]]
+; VF8UF1:       [[PRED_STORE_IF]]:
+; VF8UF1-NEXT:    [[TMP4:%.*]] = getelementptr i16, ptr [[DST]], i64 2
+; VF8UF1-NEXT:    store i16 0, ptr [[TMP4]], align 2
+; VF8UF1-NEXT:    br label %[[PRED_STORE_CONTINUE]]
+; VF8UF1:       [[PRED_STORE_CONTINUE]]:
+; VF8UF1-NEXT:    [[TMP5:%.*]] = extractelement <8 x i1> [[TMP2]], i32 1
+; VF8UF1-NEXT:    br i1 [[TMP5]], label %[[PRED_STORE_IF1:.*]], label %[[PRED_STORE_CONTINUE2:.*]]
+; VF8UF1:       [[PRED_STORE_IF1]]:
+; VF8UF1-NEXT:    [[TMP6:%.*]] = getelementptr i16, ptr [[DST]], i64 3
+; VF8UF1-NEXT:    store i16 0, ptr [[TMP6]], align 2
+; VF8UF1-NEXT:    br label %[[PRED_STORE_CONTINUE2]]
+; VF8UF1:       [[PRED_STORE_CONTINUE2]]:
+; VF8UF1-NEXT:    [[TMP7:%.*]] = extractelement <8 x i1> [[TMP2]], i32 2
+; VF8UF1-NEXT:    br i1 [[TMP7]], label %[[PRED_STORE_IF3:.*]], label %[[PRED_STORE_CONTINUE4:.*]]
+; VF8UF1:       [[PRED_STORE_IF3]]:
+; VF8UF1-NEXT:    [[TMP8:%.*]] = getelementptr i16, ptr [[DST]], i64 4
+; VF8UF1-NEXT:    store i16 0, ptr [[TMP8]], align 2
+; VF8UF1-NEXT:    br label %[[PRED_STORE_CONTINUE4]]
+; VF8UF1:       [[PRED_STORE_CONTINUE4]]:
+; VF8UF1-NEXT:    [[TMP9:%.*]] = extractelement <8 x i1> [[TMP2]], i32 3
+; VF8UF1-NEXT:    br i1 [[TMP9]], label %[[PRED_STORE_IF5:.*]], label %[[PRED_STORE_CONTINUE6:.*]]
+; VF8UF1:       [[PRED_STORE_IF5]]:
+; VF8UF1-NEXT:    [[TMP10:%.*]] = getelementptr i16, ptr [[DST]], i64 5
+; VF8UF1-NEXT:    store i16 0, ptr [[TMP10]], align 2
+; VF8UF1-NEXT:    br label %[[PRED_STORE_CONTINUE6]]
+; VF8UF1:       [[PRED_STORE_CONTINUE6]]:
+; VF8UF1-NEXT:    [[TMP11:%.*]] = extractelement <8 x i1> [[TMP2]], i32 4
+; VF8UF1-NEXT:    br i1 [[TMP11]], label %[[PRED_STORE_IF7:.*]], label %[[PRED_STORE_CONTINUE8:.*]]
+; VF8UF1:       [[PRED_STORE_IF7]]:
+; VF8UF1-NEXT:    [[TMP12:%.*]] = getelementptr i16, ptr [[DST]], i64 6
+; VF8UF1-NEXT:    store i16 0, ptr [[TMP12]], align 2
+; VF8UF1-NEXT:    br label %[[PRED_STORE_CONTINUE8]]
+; VF8UF1:       [[PRED_STORE_CONTINUE8]]:
+; VF8UF1-NEXT:    [[TMP13:%.*]] = extractelement <8 x i1> [[TMP2]], i32 5
+; VF8UF1-NEXT:    br i1 [[TMP13]], label %[[PRED_STORE_IF9:.*]], label %[[PRED_STORE_CONTINUE10:.*]]
+; VF8UF1:       [[PRED_STORE_IF9]]:
+; VF8UF1-NEXT:    [[TMP14:%.*]] = getelementptr i16, ptr [[DST]], i64 7
+; VF8UF1-NEXT:    store i16 0, ptr [[TMP14]], align 2
+; VF8UF1-NEXT:    br label %[[PRED_STORE_CONTINUE10]]
+; VF8UF1:       [[PRED_STORE_CONTINUE10]]:
+; VF8UF1-NEXT:    [[TMP15:%.*]] = extractelement <8 x i1> [[TMP2]], i32 6
+; VF8UF1-NEXT:    br i1 [[TMP15]], label %[[PRED_STORE_IF11:.*]], label %[[PRED_STORE_CONTINUE12:.*]]
+; VF8UF1:       [[PRED_STORE_IF11]]:
+; VF8UF1-NEXT:    [[TMP16:%.*]] = getelementptr i16, ptr [[DST]], i64 8
+; VF8UF1-NEXT:    store i16 0, ptr [[TMP16]], align 2
+; VF8UF1-NEXT:    br label %[[PRED_STORE_CONTINUE12]]
+; VF8UF1:       [[PRED_STORE_CONTINUE12]]:
+; VF8UF1-NEXT:    [[TMP17:%.*]] = extractelement <8 x i1> [[TMP2]], i32 7
+; VF8UF1-NEXT:    br i1 [[TMP17]], label %[[PRED_STORE_IF13:.*]], label %[[PRED_STORE_CONTINUE14:.*]]
+; VF8UF1:       [[PRED_STORE_IF13]]:
+; VF8UF1-NEXT:    [[TMP18:%.*]] = getelementptr i16, ptr [[DST]], i64 9
+; VF8UF1-NEXT:    store i16 0, ptr [[TMP18]], align 2
+; VF8UF1-NEXT:    br label %[[PRED_STORE_CONTINUE14]]
+; VF8UF1:       [[PRED_STORE_CONTINUE14]]:
+; VF8UF1-NEXT:    br label %[[MIDDLE_BLOCK:.*]]
+; VF8UF1:       [[MIDDLE_BLOCK]]:
+; VF8UF1-NEXT:    br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; VF8UF1:       [[SCALAR_PH]]:
+; VF8UF1-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP1]], %[[MIDDLE_BLOCK]] ], [ 2, %[[ENTRY]] ]
+; VF8UF1-NEXT:    br label %[[LOOP:.*]]
+; VF8UF1:       [[LOOP]]:
+; VF8UF1-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; VF8UF1-NEXT:    [[GEP_DST:%.*]] = getelementptr i16, ptr [[DST]], i64 [[IV]]
+; VF8UF1-NEXT:    store i16 0, ptr [[GEP_DST]], align 2
+; VF8UF1-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1
+; VF8UF1-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; VF8UF1-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP4:![0-9]+]]
+; VF8UF1:       [[EXIT]]:
+; VF8UF1-NEXT:    ret void
+;
+; VF8UF2-LABEL: define void @remove_loop_region_with_replicate_recipe(
+; VF8UF2-SAME: ptr [[DST:%.*]], i64 range(i64 5, 10) [[N:%.*]]) {
+; VF8UF2-NEXT:  [[ENTRY:.*]]:
+; VF8UF2-NEXT:    [[TMP0:%.*]] = add nsw i64 [[N]], -2
+; VF8UF2-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; VF8UF2:       [[VECTOR_PH]]:
+; VF8UF2-NEXT:    [[N_RND_UP:%.*]] = add i64 [[TMP0]], 15
+; VF8UF2-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 16
+; VF8UF2-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; VF8UF2-NEXT:    [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[TMP0]], 1
+; VF8UF2-NEXT:    [[TMP1:%.*]] = add i64 2, [[N_VEC]]
+; VF8UF2-NEXT:    br label %[[VECTOR_BODY:.*]]
+; VF8UF2:       [[VECTOR_BODY]]:
+; VF8UF2-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <8 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0
+; VF8UF2-NEXT:    [[BROADCAST_SPLAT1:%.*]] = shufflevector <8 x i64> [[BROADCAST_SPLATINSERT1]], <8 x i64> poison, <8 x i32> zeroinitializer
+; VF8UF2-NEXT:    [[TMP2:%.*]] = icmp ule <8 x i64> <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>, [[BROADCAST_SPLAT1]]
+; VF8UF2-NEXT:    [[TMP3:%.*]] = icmp ule <8 x i64> <i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>, [[BROADCAST_SPLAT1]]
+; VF8UF2-NEXT:    [[TMP4:%.*]] = extractelement <8 x i1> [[TMP2]], i32 0
+; VF8UF2-NEXT:    br i1 [[TMP4]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]]
+; VF8UF2:       [[PRED_STORE_IF]]:
+; VF8UF2-NEXT:    [[TMP5:%.*]] = getelementptr i16, ptr [[DST]], i64 2
+; VF8UF2-NEXT:    store i16 0, ptr [[TMP5]], align 2
+; VF8UF2-NEXT:    br label %[[PRED_STORE_CONTINUE]]
+; VF8UF2:       [[PRED_STORE_CONTINUE]]:
+; VF8UF2-NEXT:    [[TMP6:%.*]] = extractelement <8 x i1> [[TMP2]], i32 1
+; VF8UF2-NEXT:    br i1 [[TMP6]], label %[[PRED_STORE_IF1:.*]], label %[[PRED_STORE_CONTINUE2:.*]]
+; VF8UF2:       [[PRED_STORE_IF1]]:
+; VF8UF2-NEXT:    [[TMP7:%.*]] = getelementptr i16, ptr [[DST]], i64 3
+; VF8UF2-NEXT:    store i16 0, ptr [[TMP7]], align 2
+; VF8UF2-NEXT:    br label %[[PRED_STORE_CONTINUE2]]
+; VF8UF2:       [[PRED_STORE_CONTINUE2]]:
+; VF8UF2-NEXT:    [[TMP8:%.*]] = extractelement <8 x i1> [[TMP2]], i32 2
+; VF8UF2-NEXT:    br i1 [[TMP8]], label %[[PRED_STORE_IF3:.*]], label %[[PRED_STORE_CONTINUE4:.*]]
+; VF8UF2:       [[PRED_STORE_IF3]]:
+; VF8UF2-NEXT:    [[TMP9:%.*]] = getelementptr i16, ptr [[DST]], i64 4
+; VF8UF2-NEXT:    store i16 0, ptr [[TMP9]], align 2
+; VF8UF2-NEXT:    br label %[[PRED_STORE_CONTINUE4]]
+; VF8UF2:       [[PRED_STORE_CONTINUE4]]:
+; VF8UF2-NEXT:    [[TMP10:%.*]] = extractelement <8 x i1> [[TMP2]], i32 3
+; VF8UF2-NEXT:    br i1 [[TMP10]], label %[[PRED_STORE_IF5:.*]], label %[[PRED_STORE_CONTINUE6:.*]]
+; VF8UF2:       [[PRED_STORE_IF5]]:
+; VF8UF2-NEXT:    [[TMP11:%.*]] = getelementptr i16, ptr [[DST]], i64 5
+; VF8UF2-NEXT:    store i16 0, ptr [[TMP11]], align 2
+; VF8UF2-NEXT:    br label %[[PRED_STORE_CONTINUE6]]
+; VF8UF2:       [[PRED_STORE_CONTINUE6]]:
+; VF8UF2-NEXT:    [[TMP12:%.*]] = extractelement <8 x i1> [[TMP2]], i32 4
+; VF8UF2-NEXT:    br i1 [[TMP12]], label %[[PRED_STORE_IF7:.*]], label %[[PRED_STORE_CONTINUE8:.*]]
+; VF8UF2:       [[PRED_STORE_IF7]]:
+; VF8UF2-NEXT:    [[TMP13:%.*]] = getelementptr i16, ptr [[DST]], i64 6
+; VF8UF2-NEXT:    store i16 0, ptr [[TMP13]], align 2
+; VF8UF2-NEXT:    br label %[[PRED_STORE_CONTINUE8]]
+; VF8UF2:       [[PRED_STORE_CONTINUE8]]:
+; VF8UF2-NEXT:    [[TMP14:%.*]] = extractelement <8 x i1> [[TMP2]], i32 5
+; VF8UF2-NEXT:    br i1 [[TMP14]], label %[[PRED_STORE_IF9:.*]], label %[[PRED_STORE_CONTINUE10:.*]]
+; VF8UF2:       [[PRED_STORE_IF9]]:
+; VF8UF2-NEXT:    [[TMP15:%.*]] = getelementptr i16, ptr [[DST]], i64 7
+; VF8UF2-NEXT:    store i16 0, ptr [[TMP15]], align 2
+; VF8UF2-NEXT:    br label %[[PRED_STORE_CONTINUE10]]
+; VF8UF2:       [[PRED_STORE_CONTINUE10]]:
+; VF8UF2-NEXT:    [[TMP16:%.*]] = extractelement <8 x i1> [[TMP2]], i32 6
+; VF8UF2-NEXT:    br i1 [[TMP16]], label %[[PRED_STORE_IF11:.*]], label %[[PRED_STORE_CONTINUE12:.*]]
+; VF8UF2:       [[PRED_STORE_IF11]]:
+; VF8UF2-NEXT:    [[TMP17:%.*]] = getelementptr i16, ptr [[DST]], i64 8
+; VF8UF2-NEXT:    store i16 0, ptr [[TMP17]], align 2
+; VF8UF2-NEXT:    br label %[[PRED_STORE_CONTINUE12]]
+; VF8UF2:       [[PRED_STORE_CONTINUE12]]:
+; VF8UF2-NEXT:    [[TMP18:%.*]] = extractelement <8 x i1> [[TMP2]], i32 7
+; VF8UF2-NEXT:    br i1 [[TMP18]], label %[[PRED_STORE_IF13:.*]], label %[[PRED_STORE_CONTINUE14:.*]]
+; VF8UF2:       [[PRED_STORE_IF13]]:
+; VF8UF2-NEXT:    [[TMP19:%.*]] = getelementptr i16, ptr [[DST]], i64 9
+; VF8UF2-NEXT:    store i16 0, ptr [[TMP19]], align 2
+; VF8UF2-NEXT:    br label %[[PRED_STORE_CONTINUE14]]
+; VF8UF2:       [[PRED_STORE_CONTINUE14]]:
+; VF8UF2-NEXT:    [[TMP20:%.*]] = extractelement <8 x i1> [[TMP3]], i32 0
+; VF8UF2-NEXT:    br i1 [[TMP20]], label %[[PRED_STORE_IF15:.*]], label %[[PRED_STORE_CONTINUE16:.*]]
+; VF8UF2:       [[PRED_STORE_IF15]]:
+; VF8UF2-NEXT:    [[TMP21:%.*]] = getelementptr i16, ptr [[DST]], i64 10
+; VF8UF2-NEXT:    store i16 0, ptr [[TMP21]], align 2
+; VF8UF2-NEXT:    br label %[[PRED_STORE_CONTINUE16]]
+; VF8UF2:       [[PRED_STORE_CONTINUE16]]:
+; VF8UF2-NEXT:    [[TMP22:%.*]] = extractelement <8 x i1> [[TMP3]], i32 1
+; VF8UF2-NEXT:    br i1 [[TMP22]], label %[[PRED_STORE_IF17:.*]], label %[[PRED_STORE_CONTINUE18:.*]]
+; VF8UF2:       [[PRED_STORE_IF17]]:
+; VF8UF2-NEXT:    [[TMP23:%.*]] = getelementptr i16, ptr [[DST]], i64 11
+; VF8UF2-NEXT:    store i16 0, ptr [[TMP23]], align 2
+; VF8UF2-NEXT:    br label %[[PRED_STORE_CONTINUE18]]
+; VF8UF2:       [[PRED_STORE_CONTINUE18]]:
+; VF8UF2-NEXT:    [[TMP24:%.*]] = extractelement <8 x i1> [[TMP3]], i32 2
+; VF8UF2-NEXT:    br i1 [[TMP24]], label %[[PRED_STORE_IF19:.*]], label %[[PRED_STORE_CONTINUE20:.*]]
+; VF8UF2:       [[PRED_STORE_IF19]]:
+; VF8UF2-NEXT:    [[TMP25:%.*]] = getelementptr i16, ptr [[DST]], i64 12
+; VF8UF2-NEXT:    store i16 0, ptr [[TMP25]], align 2
+; VF8UF2-NEXT:    br label %[[PRED_STORE_CONTINUE20]]
+; VF8UF2:       [[PRED_STORE_CONTINUE20]]:
+; VF8UF2-NEXT:    [[TMP26:%.*]] = extractelement <8 x i1> [[TMP3]], i32 3
+; VF8UF2-NEXT:    br i1 [[TMP26]], label %[[PRED_STORE_IF21:.*]], label %[[PRED_STORE_CONTINUE22:.*]]
+; VF8UF2:       [[PRED_STORE_IF21]]:
+; VF8UF2-NEXT:    [[TMP27:%.*]] = getelementptr i16, ptr [[DST]], i64 13
+; VF8UF2-NEXT:    store i16 0, ptr [[TMP27]], align 2
+; VF8UF2-NEXT:    br label %[[PRED_STORE_CONTINUE22]]
+; VF8UF2:       [[PRED_STORE_CONTINUE22]]:
+; VF8UF2-NEXT:    [[TMP28:%.*]] = extractelement <8 x i1> [[TMP3]], i32 4
+; VF8UF2-NEXT:    br i1 [[TMP28]], label %[[PRED_STORE_IF23:.*]], label %[[PRED_STORE_CONTINUE24:.*]]
+; VF8UF2:       [[PRED_STORE_IF23]]:
+; VF8UF2-NEXT:    [[TMP29:%.*]] = getelementptr i16, ptr [[DST]], i64 14
+; VF8UF2-NEXT:    store i16 0, ptr [[TMP29]], align 2
+; VF8UF2-NEXT:    br label %[[PRED_STORE_CONTINUE24]]
+; VF8UF2:       [[PRED_STORE_CONTINUE24]]:
+; VF8UF2-NEXT:    [[TMP30:%.*]] = extractelement <8 x i1> [[TMP3]], i32 5
+; VF8UF2-NEXT:    br i1 [[TMP30]], label %[[PRED_STORE_IF25:.*]], label %[[PRED_STORE_CONTINUE26:.*]]
+; VF8UF2:       [[PRED_STORE_IF25]]:
+; VF8UF2-NEXT:    [[TMP31:%.*]] = getelementptr i16, ptr [[DST]], i64 15
+; VF8UF2-NEXT:    store i16 0, ptr [[TMP31]], align 2
+; VF8UF2-NEXT:    br label %[[PRED_STORE_CONTINUE26]]
+; VF8UF2:       [[PRED_STORE_CONTINUE26]]:
+; VF8UF2-NEXT:    [[TMP32:%.*]] = extractelement <8 x i1> [[TMP3]], i32 6
+; VF8UF2-NEXT:    br i1 [[TMP32]], label %[[PRED_STORE_IF27:.*]], label %[[PRED_STORE_CONTINUE28:.*]]
+; VF8UF2:       [[PRED_STORE_IF27]]:
+; VF8UF2-NEXT:    [[TMP33:%.*]] = getelementptr i16, ptr [[DST]], i64 16
+; VF8UF2-NEXT:    store i16 0, ptr [[TMP33]], align 2
+; VF8UF2-NEXT:    br label %[[PRED_STORE_CONTINUE28]]
+; VF8UF2:       [[PRED_STORE_CONTINUE28]]:
+; VF8UF2-NEXT:    [[TMP34:%.*]] = extractelement <8 x i1> [[TMP3]], i32 7
+; VF8UF2-NEXT:    br i1 [[TMP34]], label %[[PRED_STORE_IF29:.*]], label %[[PRED_STORE_CONTINUE30:.*]]
+; VF8UF2:       [[PRED_STORE_IF29]]:
+; VF8UF2-NEXT:    [[TMP35:%.*]] = getelementptr i16, ptr [[DST]], i64 17
+; VF8UF2-NEXT:    store i16 0, ptr [[TMP35]], align 2
+; VF8UF2-NEXT:    br label %[[PRED_STORE_CONTINUE30]]
+; VF8UF2:       [[PRED_STORE_CONTINUE30]]:
+; VF8UF2-NEXT:    br label %[[MIDDLE_BLOCK:.*]]
+; VF8UF2:       [[MIDDLE_BLOCK]]:
+; VF8UF2-NEXT:    br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; VF8UF2:       [[SCALAR_PH]]:
+; VF8UF2-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP1]], %[[MIDDLE_BLOCK]] ], [ 2, %[[ENTRY]] ]
+; VF8UF2-NEXT:    br label %[[LOOP:.*]]
+; VF8UF2:       [[LOOP]]:
+; VF8UF2-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; VF8UF2-NEXT:    [[GEP_DST:%.*]] = getelementptr i16, ptr [[DST]], i64 [[IV]]
+; VF8UF2-NEXT:    store i16 0, ptr [[GEP_DST]], align 2
+; VF8UF2-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1
+; VF8UF2-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; VF8UF2-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP3:![0-9]+]]
+; VF8UF2:       [[EXIT]]:
+; VF8UF2-NEXT:    ret void
+;
+; VF16UF1-LABEL: define void @remove_loop_region_with_replicate_recipe(
+; VF16UF1-SAME: ptr [[DST:%.*]], i64 range(i64 5, 10) [[N:%.*]]) {
+; VF16UF1-NEXT:  [[ENTRY:.*]]:
+; VF16UF1-NEXT:    [[TMP0:%.*]] = add nsw i64 [[N]], -2
+; VF16UF1-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; VF16UF1:       [[VECTOR_PH]]:
+; VF16UF1-NEXT:    [[N_RND_UP:%.*]] = add i64 [[TMP0]], 15
+; VF16UF1-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 16
+; VF16UF1-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; VF16UF1-NEXT:    [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[TMP0]], 1
+; VF16UF1-NEXT:    [[TMP1:%.*]] = add i64 2, [[N_VEC]]
+; VF16UF1-NEXT:    br label %[[VECTOR_BODY:.*]]
+; VF16UF1:       [[VECTOR_BODY]]:
+; VF16UF1-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <16 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0
+; VF16UF1-NEXT:    [[BROADCAST_SPLAT1:%.*]] = shufflevector <16 x i64> [[BROADCAST_SPLATINSERT1]], <16 x i64> poison, <16 x i32> zeroinitializer
+; VF16UF1-NEXT:    [[TMP2:%.*]] = icmp ule <16 x i64> <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>, [[BROADCAST_SPLAT1]]
+; VF16UF1-NEXT:    [[TMP3:%.*]] = extractelement <16 x i1> [[TMP2]], i32 0
+; VF16UF1-NEXT:    br i1 [[TMP3]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]]
+; VF16UF1:       [[PRED_STORE_IF]]:
+; VF16UF1-NEXT:    [[TMP4:%.*]] = getelementptr i16, ptr [[DST]], i64 2
+; VF16UF1-NEXT:    store i16 0, ptr [[TMP4]], align 2
+; VF16UF1-NEXT:    br label %[[PRED_STORE_CONTINUE]]
+; VF16UF1:       [[PRED_STORE_CONTINUE]]:
+; VF16UF1-NEXT:    [[TMP5:%.*]] = extractelement <16 x i1> [[TMP2]], i32 1
+; VF16UF1-NEXT:    br i1 [[TMP5]], label %[[PRED_STORE_IF1:.*]], label %[[PRED_STORE_CONTINUE2:.*]]
+; VF16UF1:       [[PRED_STORE_IF1]]:
+; VF16UF1-NEXT:    [[TMP6:%.*]] = getelementptr i16, ptr [[DST]], i64 3
+; VF16UF1-NEXT:    store i16 0, ptr [[TMP6]], align 2
+; VF16UF1-NEXT:    br label %[[PRED_STORE_CONTINUE2]]
+; VF16UF1:       [[PRED_STORE_CONTINUE2]]:
+; VF16UF1-NEXT:    [[TMP7:%.*]] = extractelement <16 x i1> [[TMP2]], i32 2
+; VF16UF1-NEXT:    br i1 [[TMP7]], label %[[PRED_STORE_IF3:.*]], label %[[PRED_STORE_CONTINUE4:.*]]
+; VF16UF1:       [[PRED_STORE_IF3]]:
+; VF16UF1-NEXT:    [[TMP8:%.*]] = getelementptr i16, ptr [[DST]], i64 4
+; VF16UF1-NEXT:    store i16 0, ptr [[TMP8]], align 2
+; VF16UF1-NEXT:    br label %[[PRED_STORE_CONTINUE4]]
+; VF16UF1:       [[PRED_STORE_CONTINUE4]]:
+; VF16UF1-NEXT:    [[TMP9:%.*]] = extractelement <16 x i1> [[TMP2]], i32 3
+; VF16UF1-NEXT:    br i1 [[TMP9]], label %[[PRED_STORE_IF5:.*]], label %[[PRED_STORE_CONTINUE6:.*]]
+; VF16UF1:       [[PRED_STORE_IF5]]:
+; VF16UF1-NEXT:    [[TMP10:%.*]] = getelementptr i16, ptr [[DST]], i64 5
+; VF16UF1-NEXT:    store i16 0, ptr [[TMP10]], align 2
+; VF16UF1-NEXT:    br label %[[PRED_STORE_CONTINUE6]]
+; VF16UF1:       [[PRED_STORE_CONTINUE6]]:
+; VF16UF1-NEXT:    [[TMP11:%.*]] = extractelement <16 x i1> [[TMP2]], i32 4
+; VF16UF1-NEXT:    br i1 [[TMP11]], label %[[PRED_STORE_IF7:.*]], label %[[PRED_STORE_CONTINUE8:.*]]
+; VF16UF1:       [[PRED_STORE_IF7]]:
+; VF16UF1-NEXT:    [[TMP12:%.*]] = getelementptr i16, ptr [[DST]], i64 6
+; VF16UF1-NEXT:    store i16 0, ptr [[TMP12]], align 2
+; VF16UF1-NEXT:    br label %[[PRED_STORE_CONTINUE8]]
+; VF16UF1:       [[PRED_STORE_CONTINUE8]]:
+; VF16UF1-NEXT:    [[TMP13:%.*]] = extractelement <16 x i1> [[TMP2]], i32 5
+; VF16UF1-NEXT:    br i1 [[TMP13]], label %[[PRED_STORE_IF9:.*]], label %[[PRED_STORE_CONTINUE10:.*]]
+; VF16UF1:       [[PRED_STORE_IF9]]:
+; VF16UF1-NEXT:    [[TMP14:%.*]] = getelementptr i16, ptr [[DST]], i64 7
+; VF16UF1-NEXT:    store i16 0, ptr [[TMP14]], align 2
+; VF16UF1-NEXT:    br label %[[PRED_STORE_CONTINUE10]]
+; VF16UF1:       [[PRED_STORE_CONTINUE10]]:
+; VF16UF1-NEXT:    [[TMP15:%.*]] = extractelement <16 x i1> [[TMP2]], i32 6
+; VF16UF1-NEXT:    br i1 [[TMP15]], label %[[PRED_STORE_IF11:.*]], label %[[PRED_STORE_CONTINUE12:.*]]
+; VF16UF1:       [[PRED_STORE_IF11]]:
+; VF16UF1-NEXT:    [[TMP16:%.*]] = getelementptr i16, ptr [[DST]], i64 8
+; VF16UF1-NEXT:    store i16 0, ptr [[TMP16]], align 2
+; VF16UF1-NEXT:    br label %[[PRED_STORE_CONTINUE12]]
+; VF16UF1:       [[PRED_STORE_CONTINUE12]]:
+; VF16UF1-NEXT:    [[TMP17:%.*]] = extractelement <16 x i1> [[TMP2]], i32 7
+; VF16UF1-NEXT:    br i1 [[TMP17]], label %[[PRED_STORE_IF13:.*]], label %[[PRED_STORE_CONTINUE14:.*]]
+; VF16UF1:       [[PRED_STORE_IF13]]:
+; VF16UF1-NEXT:    [[TMP18:%.*]] = getelementptr i16, ptr [[DST]], i64 9
+; VF16UF1-NEXT:    store i16 0, ptr [[TMP18]], align 2
+; VF16UF1-NEXT:    br label %[[PRED_STORE_CONTINUE14]]
+; VF16UF1:       [[PRED_STORE_CONTINUE14]]:
+; VF16UF1-NEXT:    [[TMP19:%.*]] = extractelement <16 x i1> [[TMP2]], i32 8
+; VF16UF1-NEXT:    br i1 [[TMP19]], label %[[PRED_STORE_IF15:.*]], label %[[PRED_STORE_CONTINUE16:.*]]
+; VF16UF1:       [[PRED_STORE_IF15]]:
+; VF16UF1-NEXT:    [[TMP20:%.*]] = getelementptr i16, ptr [[DST]], i64 10
+; VF16UF1-NEXT:    store i16 0, ptr [[TMP20]], align 2
+; VF16UF1-NEXT:    br label %[[PRED_STORE_CONTINUE16]]
+; VF16UF1:       [[PRED_STORE_CONTINUE16]]:
+; VF16UF1-NEXT:    [[TMP21:%.*]] = extractelement <16 x i1> [[TMP2]], i32 9
+; VF16UF1-NEXT:    br i1 [[TMP21]], label %[[PRED_STORE_IF17:.*]], label %[[PRED_STORE_CONTINUE18:.*]]
+; VF16UF1:       [[PRED_STORE_IF17]]:
+; VF16UF1-NEXT:    [[TMP22:%.*]] = getelementptr i16, ptr [[DST]], i64 11
+; VF16UF1-NEXT:    store i16 0, ptr [[TMP22]], align 2
+; VF16UF1-NEXT:    br label %[[PRED_STORE_CONTINUE18]]
+; VF16UF1:       [[PRED_STORE_CONTINUE18]]:
+; VF16UF1-NEXT:    [[TMP23:%.*]] = extractelement <16 x i1> [[TMP2]], i32 10
+; VF16UF1-NEXT:    br i1 [[TMP23]], label %[[PRED_STORE_IF19:.*]], label %[[PRED_STORE_CONTINUE20:.*]]
+; VF16UF1:       [[PRED_STORE_IF19]]:
+; VF16UF1-NEXT:    [[TMP24:%.*]] = getelementptr i16, ptr [[DST]], i64 12
+; VF16UF1-NEXT:    store i16 0, ptr [[TMP24]], align 2
+; VF16UF1-NEXT:    br label %[[PRED_STORE_CONTINUE20]]
+; VF16UF1:       [[PRED_STORE_CONTINUE20]]:
+; VF16UF1-NEXT:    [[TMP25:%.*]] = extractelement <16 x i1> [[TMP2]], i32 11
+; VF16UF1-NEXT:    br i1 [[TMP25]], label %[[PRED_STORE_IF21:.*]], label %[[PRED_STORE_CONTINUE22:.*]]
+; VF16UF1:       [[PRED_STORE_IF21]]:
+; VF16UF1-NEXT:    [[TMP26:%.*]] = getelementptr i16, ptr [[DST]], i64 13
+; VF16UF1-NEXT:    store i16 0, ptr [[TMP26]], align 2
+; VF16UF1-NEXT:    br label %[[PRED_STORE_CONTINUE22]]
+; VF16UF1:       [[PRED_STORE_CONTINUE22]]:
+; VF16UF1-NEXT:    [[TMP27:%.*]] = extractelement <16 x i1> [[TMP2]], i32 12
+; VF16UF1-NEXT:    br i1 [[TMP27]], label %[[PRED_STORE_IF23:.*]], label %[[PRED_STORE_CONTINUE24:.*]]
+; VF16UF1:       [[PRED_STORE_IF23]]:
+; VF16UF1-NEXT:    [[TMP28:%.*]] = getelementptr i16, ptr [[DST]], i64 14
+; VF16UF1-NEXT:    store i16 0, ptr [[TMP28]], align 2
+; VF16UF1-NEXT:    br label %[[PRED_STORE_CONTINUE24]]
+; VF16UF1:       [[PRED_STORE_CONTINUE24]]:
+; VF16UF1-NEXT:    [[TMP29:%.*]] = extractelement <16 x i1> [[TMP2]], i32 13
+; VF16UF1-NEXT:    br i1 [[TMP29]], label %[[PRED_STORE_IF25:.*]], label %[[PRED_STORE_CONTINUE26:.*]]
+; VF16UF1:       [[PRED_STORE_IF25]]:
+; VF16UF1-NEXT:    [[TMP30:%.*]] = getelementptr i16, ptr [[DST]], i64 15
+; VF16UF1-NEXT:    store i16 0, ptr [[TMP30]], align 2
+; VF16UF1-NEXT:    br label %[[PRED_STORE_CONTINUE26]]
+; VF16UF1:       [[PRED_STORE_CONTINUE26]]:
+; VF16UF1-NEXT:    [[TMP31:%.*]] = extractelement <16 x i1> [[TMP2]], i32 14
+; VF16UF1-NEXT:    br i1 [[TMP31]], label %[[PRED_STORE_IF27:.*]], label %[[PRED_STORE_CONTINUE28:.*]]
+; VF16UF1:       [[PRED_STORE_IF27]]:
+; VF16UF1-NEXT:    [[TMP32:%.*]] = getelementptr i16, ptr [[DST]], i64 16
+; VF16UF1-NEXT:    store i16 0, ptr [[TMP32]], align 2
+; VF16UF1-NEXT:    br label %[[PRED_STORE_CONTINUE28]]
+; VF16UF1:       [[PRED_STORE_CONTINUE28]]:
+; VF16UF1-NEXT:    [[TMP33:%.*]] = extractelement <16 x i1> [[TMP2]], i32 15
+; VF16UF1-NEXT:    br i1 [[TMP33]], label %[[PRED_STORE_IF29:.*]], label %[[PRED_STORE_CONTINUE30:.*]]
+; VF16UF1:       [[PRED_STORE_IF29]]:
+; VF16UF1-NEXT:    [[TMP34:%.*]] = getelementptr i16, ptr [[DST]], i64 17
+; VF16UF1-NEXT:    store i16 0, ptr [[TMP34]], align 2
+; VF16UF1-NEXT:    br label %[[PRED_STORE_CONTINUE30]]
+; VF16UF1:       [[PRED_STORE_CONTINUE30]]:
+; VF16UF1-NEXT:    br label %[[MIDDLE_BLOCK:.*]]
+; VF16UF1:       [[MIDDLE_BLOCK]]:
+; VF16UF1-NEXT:    br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; VF16UF1:       [[SCALAR_PH]]:
+; VF16UF1-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP1]], %[[MIDDLE_BLOCK]] ], [ 2, %[[ENTRY]] ]
+; VF16UF1-NEXT:    br label %[[LOOP:.*]]
+; VF16UF1:       [[LOOP]]:
+; VF16UF1-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; VF16UF1-NEXT:    [[GEP_DST:%.*]] = getelementptr i16, ptr [[DST]], i64 [[IV]]
+; VF16UF1-NEXT:    store i16 0, ptr [[GEP_DST]], align 2
+; VF16UF1-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1
+; VF16UF1-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; VF16UF1-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP3:![0-9]+]]
+; VF16UF1:       [[EXIT]]:
+; VF16UF1-NEXT:    ret void
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 2, %entry ], [ %iv.next, %loop ]
+  %gep.dst = getelementptr i16, ptr %dst, i64 %iv
+  store i16 0, ptr %gep.dst, align 2
+  %iv.next = add i64 %iv, 1
+  %ec = icmp eq i64 %iv.next, %N
+  br i1 %ec, label %exit, label %loop
+
+exit:
+  ret void
+}
+
+declare i1 @cond()
+
+define void @remove_loop_region_outer_loop(i64 range(i64 8, 17) %N, ptr noalias %src, ptr %dst) {
+; VF8UF1-LABEL: define void @remove_loop_region_outer_loop(
+; VF8UF1-SAME: i64 range(i64 8, 17) [[N:%.*]], ptr noalias [[SRC:%.*]], ptr [[DST:%.*]]) {
+; VF8UF1-NEXT:  [[ENTRY:.*]]:
+; VF8UF1-NEXT:    br label %[[OUTER_HEADER:.*]]
+; VF8UF1:       [[OUTER_HEADER]]:
+; VF8UF1-NEXT:    [[OUTER_IV:%.*]] = phi ptr [ [[SRC]], %[[ENTRY]] ], [ [[OUTER_IV_NEXT:%.*]], %[[OUTER_LATCH:.*]] ]
+; VF8UF1-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; VF8UF1:       [[VECTOR_PH]]:
+; VF8UF1-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], 8
+; VF8UF1-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; VF8UF1-NEXT:    br label %[[VECTOR_BODY:.*]]
+; VF8UF1:       [[VECTOR_BODY]]:
+; VF8UF1-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; VF8UF1-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; VF8UF1-NEXT:    [[TMP1:%.*]] = getelementptr i8, ptr [[OUTER_IV]], i64 [[TMP0]]
+; VF8UF1-NEXT:    [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i32 0
+; VF8UF1-NEXT:    [[WIDE_LOAD:%.*]] = load <8 x i8>, ptr [[TMP2]], align 1
+; VF8UF1-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP0]]
+; VF8UF1-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[TMP3]], i32 0
+; VF8UF1-NEXT:    store <8 x i8> [[WIDE_LOAD]], ptr [[TMP4]], align 1
+; VF8UF1-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
+; VF8UF1-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; VF8UF1-NEXT:    br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; VF8UF1:       [[MIDDLE_BLOCK]]:
+; VF8UF1-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; VF8UF1-NEXT:    br i1 [[CMP_N]], label %[[OUTER_LATCH]], label %[[SCALAR_PH]]
+; VF8UF1:       [[SCALAR_PH]]:
+; VF8UF1-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[OUTER_HEADER]] ]
+; VF8UF1-NEXT:    br label %[[INNER:.*]]
+; VF8UF1:       [[INNER]]:
+; VF8UF1-NEXT:    [[INNER_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[INNER]] ]
+; VF8UF1-NEXT:    [[GEP_SRC:%.*]] = getelementptr i8, ptr [[OUTER_IV]], i64 [[INNER_IV]]
+; VF8UF1-NEXT:    [[L:%.*]] = load i8, ptr [[GEP_SRC]], align 1
+; VF8UF1-NEXT:    [[GEP_DST:%.*]] = getelementptr i8, ptr [[DST]], i64 [[INNER_IV]]
+; VF8UF1-NEXT:    store i8 [[L]], ptr [[GEP_DST]], align 1
+; VF8UF1-NEXT:    [[IV_NEXT]] = add i64 [[INNER_IV]], 1
+; VF8UF1-NEXT:    [[C_1:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; VF8UF1-NEXT:    br i1 [[C_1]], label %[[OUTER_LATCH]], label %[[INNER]], !llvm.loop [[LOOP6:![0-9]+]]
+; VF8UF1:       [[OUTER_LATCH]]:
+; VF8UF1-NEXT:    [[OUTER_IV_NEXT]] = getelementptr i8, ptr [[OUTER_IV]], i64 1
+; VF8UF1-NEXT:    [[C_2:%.*]] = call i1 @cond()
+; VF8UF1-NEXT:    br i1 [[C_2]], label %[[OUTER_HEADER]], label %[[EXIT:.*]]
+; VF8UF1:       [[EXIT]]:
+; VF8UF1-NEXT:    ret void
+;
+; VF8UF2-LABEL: define void @remove_loop_region_outer_loop(
+; VF8UF2-SAME: i64 range(i64 8, 17) [[N:%.*]], ptr noalias [[SRC:%.*]], ptr [[DST:%.*]]) {
+; VF8UF2-NEXT:  [[ENTRY:.*]]:
+; VF8UF2-NEXT:    br label %[[OUTER_HEADER:.*]]
+; VF8UF2:       [[OUTER_HEADER]]:
+; VF8UF2-NEXT:    [[OUTER_IV:%.*]] = phi ptr [ [[SRC]], %[[ENTRY]] ], [ [[OUTER_IV_NEXT:%.*]], %[[OUTER_LATCH:.*]] ]
+; VF8UF2-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 16
+; VF8UF2-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; VF8UF2:       [[VECTOR_PH]]:
+; VF8UF2-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], 16
+; VF8UF2-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; VF8UF2-NEXT:    br label %[[VECTOR_BODY:.*]]
+; VF8UF2:       [[VECTOR_BODY]]:
+; VF8UF2-NEXT:    [[TMP0:%.*]] = getelementptr i8, ptr [[OUTER_IV]], i64 0
+; VF8UF2-NEXT:    [[TMP1:%.*]] = getelementptr i8, ptr [[TMP0]], i32 0
+; VF8UF2-NEXT:    [[TMP2:%.*]] = getelementptr i8, ptr [[TMP0]], i32 8
+; VF8UF2-NEXT:    [[WIDE_LOAD:%.*]] = load <8 x i8>, ptr [[TMP1]], align 1
+; VF8UF2-NEXT:    [[WIDE_LOAD1:%.*]] = load <8 x i8>, ptr [[TMP2]], align 1
+; VF8UF2-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[DST]], i64 0
+; VF8UF2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[TMP3]], i32 0
+; VF8UF2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[TMP3]], i32 8
+; VF8UF2-NEXT:    store <8 x i8> [[WIDE_LOAD]], ptr [[TMP4]], align 1
+; VF8UF2-NEXT:    store <8 x i8> [[WIDE_LOAD1]], ptr [[TMP5]], align 1
+; VF8UF2-NEXT:    br label %[[MIDDLE_BLOCK:.*]]
+; VF8UF2:       [[MIDDLE_BLOCK]]:
+; VF8UF2-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; VF8UF2-NEXT:    br i1 [[CMP_N]], label %[[OUTER_LATCH]], label %[[SCALAR_PH]]
+; VF8UF2:       [[SCALAR_PH]]:
+; VF8UF2-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[OUTER_HEADER]] ]
+; VF8UF2-NEXT:    br label %[[INNER:.*]]
+; VF8UF2:       [[INNER]]:
+; VF8UF2-NEXT:    [[INNER_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[INNER]] ]
+; VF8UF2-NEXT:    [[GEP_SRC:%.*]] = getelementptr i8, ptr [[OUTER_IV]], i64 [[INNER_IV]]
+; VF8UF2-NEXT:    [[L:%.*]] = load i8, ptr [[GEP_SRC]], align 1
+; VF8UF2-NEXT:    [[GEP_DST:%.*]] = getelementptr i8, ptr [[DST]], i64 [[INNER_IV]]
+; VF8UF2-NEXT:    store i8 [[L]], ptr [[GEP_DST]], align 1
+; VF8UF2-NEXT:    [[IV_NEXT]] = add i64 [[INNER_IV]], 1
+; VF8UF2-NEXT:    [[C_1:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; VF8UF2-NEXT:    br i1 [[C_1]], label %[[OUTER_LATCH]], label %[[INNER]], !llvm.loop [[LOOP4:![0-9]+]]
+; VF8UF2:       [[OUTER_LATCH]]:
+; VF8UF2-NEXT:    [[OUTER_IV_NEXT]] = getelementptr i8, ptr [[OUTER_IV]], i64 1
+; VF8UF2-NEXT:    [[C_2:%.*]] = call i1 @cond()
+; VF8UF2-NEXT:    br i1 [[C_2]], label %[[OUTER_HEADER]], label %[[EXIT:.*]]
+; VF8UF2:       [[EXIT]]:
+; VF8UF2-NEXT:    ret void
+;
+; VF16UF1-LABEL: define void @remove_loop_region_outer_loop(
+; VF16UF1-SAME: i64 range(i64 8, 17) [[N:%.*]], ptr noalias [[SRC:%.*]], ptr [[DST:%.*]]) {
+; VF16UF1-NEXT:  [[ENTRY:.*]]:
+; VF16UF1-NEXT:    br label %[[OUTER_HEADER:.*]]
+; VF16UF1:       [[OUTER_HEADER]]:
+; VF16UF1-NEXT:    [[OUTER_IV:%.*]] = phi ptr [ [[SRC]], %[[ENTRY]] ], [ [[OUTER_IV_NEXT:%.*]], %[[OUTER_LATCH:.*]] ]
+; VF16UF1-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 16
+; VF16UF1-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; VF16UF1:       [[VECTOR_PH]]:
+; VF16UF1-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], 16
+; VF16UF1-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; VF16UF1-NEXT:    br label %[[VECTOR_BODY:.*]]
+; VF16UF1:       [[VECTOR_BODY]]:
+; VF16UF1-NEXT:    [[TMP0:%.*]] = getelementptr i8, ptr [[OUTER_IV]], i64 0
+; VF16UF1-NEXT:    [[TMP1:%.*]] = getelementptr i8, ptr [[TMP0]], i32 0
+; VF16UF1-NEXT:    [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1
+; VF16UF1-NEXT:    [[TMP2:%.*]] = getelementptr i8, ptr [[DST]], i64 0
+; VF16UF1-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[TMP2]], i32 0
+; VF16UF1-NEXT:    store <16 x i8> [[WIDE_LOAD]], ptr [[TMP3]], align 1
+; VF16UF1-NEXT:    br label %[[MIDDLE_BLOCK:.*]]
+; VF16UF1:       [[MIDDLE_BLOCK]]:
+; VF16UF1-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; VF16UF1-NEXT:    br i1 [[CMP_N]], label %[[OUTER_LATCH]], label %[[SCALAR_PH]]
+; VF16UF1:       [[SCALAR_PH]]:
+; VF16UF1-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[OUTER_HEADER]] ]
+; VF16UF1-NEXT:    br label %[[INNER:.*]]
+; VF16UF1:       [[INNER]]:
+; VF16UF1-NEXT:    [[INNER_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[INNER]] ]
+; VF16UF1-NEXT:    [[GEP_SRC:%.*]] = getelementptr i8, ptr [[OUTER_IV]], i64 [[INNER_IV]]
+; VF16UF1-NEXT:    [[L:%.*]] = load i8, ptr [[GEP_SRC]], align 1
+; VF16UF1-NEXT:    [[GEP_DST:%.*]] = getelementptr i8, ptr [[DST]], i64 [[INNER_IV]]
+; VF16UF1-NEXT:    store i8 [[L]], ptr [[GEP_DST]], align 1
+; VF16UF1-NEXT:    [[IV_NEXT]] = add i64 [[INNER_IV]], 1
+; VF16UF1-NEXT:    [[C_1:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; VF16UF1-NEXT:    br i1 [[C_1]], label %[[OUTER_LATCH]], label %[[INNER]], !llvm.loop [[LOOP4:![0-9]+]]
+; VF16UF1:       [[OUTER_LATCH]]:
+; VF16UF1-NEXT:    [[OUTER_IV_NEXT]] = getelementptr i8, ptr [[OUTER_IV]], i64 1
+; VF16UF1-NEXT:    [[C_2:%.*]] = call i1 @cond()
+; VF16UF1-NEXT:    br i1 [[C_2]], label %[[OUTER_HEADER]], label %[[EXIT:.*]]
+; VF16UF1:       [[EXIT]]:
+; VF16UF1-NEXT:    ret void
+;
+entry:
+  br label %outer.header
+
+outer.header:
+  %outer.iv = phi ptr [ %src, %entry ], [ %outer.iv.next, %outer.latch ]
+  br label %inner
+
+inner:
+  %inner.iv = phi i64 [ 0, %outer.header ], [ %iv.next, %inner ]
+  %gep.src = getelementptr i8, ptr %outer.iv, i64 %inner.iv
+  %l = load i8, ptr %gep.src, align 1
+  %gep.dst = getelementptr i8, ptr %dst, i64 %inner.iv
+  store i8 %l, ptr %gep.dst, align 1
+  %iv.next = add i64 %inner.iv, 1
+  %c.1 = icmp eq i64 %iv.next, %N
+  br i1 %c.1, label %outer.latch, label %inner
+
+outer.latch:
+  %outer.iv.next = getelementptr i8, ptr %outer.iv, i64 1
+  %c.2 = call i1 @cond()
+  br i1 %c.2, label %outer.header, label %exit
+
+exit:
+  ret void
+}
+
+declare void @llvm.assume(i1)
+
+; Test case for https://github.com/llvm/llvm-project/issues/121897.
+define void @scev_expand_step(i64 %x, ptr %dst) {
+; VF8UF1-LABEL: define void @scev_expand_step(
+; VF8UF1-SAME: i64 [[X:%.*]], ptr [[DST:%.*]]) {
+; VF8UF1-NEXT:  [[ENTRY:.*]]:
+; VF8UF1-NEXT:    [[C:%.*]] = icmp eq i64 [[X]], 65536
+; VF8UF1-NEXT:    call void @llvm.assume(i1 [[C]])
+; VF8UF1-NEXT:    [[FR:%.*]] = freeze i64 [[X]]
+; VF8UF1-NEXT:    [[STEP:%.*]] = add i64 [[FR]], -65534
+; VF8UF1-NEXT:    [[TMP0:%.*]] = udiv i64 15, [[STEP]]
+; VF8UF1-NEXT:    [[TMP1:%.*]] = add nuw nsw i64 [[TMP0]], 1
+; VF8UF1-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; VF8UF1:       [[VECTOR_PH]]:
+; VF8UF1-NEXT:    [[N_RND_UP:%.*]] = add i64 [[TMP1]], 7
+; VF8UF1-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 8
+; VF8UF1-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; VF8UF1-NEXT:    [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[TMP1]], 1
+; VF8UF1-NEXT:    [[TMP2:%.*]] = mul i64 [[N_VEC]], [[STEP]]
+; VF8UF1-NEXT:    br label %[[VECTOR_BODY:.*]]
+; VF8UF1:       [[VECTOR_BODY]]:
+; VF8UF1-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0
+; VF8UF1-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i64> [[BROADCAST_SPLATINSERT]], <8 x i64> poison, <8 x i32> zeroinitializer
+; VF8UF1-NEXT:    [[TMP3:%.*]] = icmp ule <8 x i64> <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>, [[BROADCAST_SPLAT]]
+; VF8UF1-NEXT:    [[TMP4:%.*]] = extractelement <8 x i1> [[TMP3]], i32 0
+; VF8UF1-NEXT:    br i1 [[TMP4]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]]
+; VF8UF1:       [[PRED_STORE_IF]]:
+; VF8UF1-NEXT:    [[TMP5:%.*]] = mul i64 0, [[STEP]]
+; VF8UF1-NEXT:    [[TMP6:%.*]] = add i64 0, [[TMP5]]
+; VF8UF1-NEXT:    [[TMP7:%.*]] = add i64 [[TMP6]], [[STEP]]
+; VF8UF1-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP7]]
+; VF8UF1-NEXT:    store i8 0, ptr [[TMP8]], align 1
+; VF8UF1-NEXT:    br label %[[PRED_STORE_CONTINUE]]
+; VF8UF1:       [[PRED_STORE_CONTINUE]]:
+; VF8UF1-NEXT:    [[TMP9:%.*]] = extractelement <8 x i1> [[TMP3]], i32 1
+; VF8UF1-NEXT:    br i1 [[TMP9]], label %[[PRED_STORE_IF1:.*]], label %[[PRED_STORE_CONTINUE2:.*]]
+; VF8UF1:       [[PRED_STORE_IF1]]:
+; VF8UF1-NEXT:    [[TMP10:%.*]] = mul i64 1, [[STEP]]
+; VF8UF1-NEXT:    [[TMP11:%.*]] = add i64 0, [[TMP10]]
+; VF8UF1-NEXT:    [[TMP12:%.*]] = add i64 [[TMP11]], [[STEP]]
+; VF8UF1-NEXT:    [[TMP13:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP12]]
+; VF8UF1-NEXT:    store i8 0, ptr [[TMP13]], align 1
+; VF8UF1-NEXT:    br label %[[PRED_STORE_CONTINUE2]]
+; VF8UF1:       [[PRED_STORE_CONTINUE2]]:
+; VF8UF1-NEXT:    [[TMP14:%.*]] = extractelement <8 x i1> [[TMP3]], i32 2
+; VF8UF1-NEXT:    br i1 [[TMP14]], label %[[PRED_STORE_IF3:.*]], label %[[PRED_STORE_CONTINUE4:.*]]
+; VF8UF1:       [[PRED_STORE_IF3]]:
+; VF8UF1-NEXT:    [[TMP15:%.*]] = mul i64 2, [[STEP]]
+; VF8UF1-NEXT:    [[TMP16:%.*]] = add i64 0, [[TMP15]]
+; VF8UF1-NEXT:    [[TMP17:%.*]] = add i64 [[TMP16]], [[STEP]]
+; VF8UF1-NEXT:    [[TMP18:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP17]]
+; VF8UF1-NEXT:    store i8 0, ptr [[TMP18]], align 1
+; VF8UF1-NEXT:    br label %[[PRED_STORE_CONTINUE4]]
+; VF8UF1:       [[PRED_STORE_CONTINUE4]]:
+; VF8UF1-NEXT:    [[TMP19:%.*]] = extractelement <8 x i1> [[TMP3]], i32 3
+; VF8UF1-NEXT:    br i1 [[TMP19]], label %[[PRED_STORE_IF5:.*]], label %[[PRED_STORE_CONTINUE6:.*]]
+; VF8UF1:       [[PRED_STORE_IF5]]:
+; VF8UF1-NEXT:    [[TMP20:%.*]] = mul i64 3, [[STEP]]
+; VF8UF1-NEXT:    [[TMP21:%.*]] = add i64 0, [[TMP20]]
+; VF8UF1-NEXT:    [[TMP22:%.*]] = add i64 [[TMP21]], [[STEP]]
+; VF8UF1-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP22]]
+; VF8UF1-NEXT:    store i8 0, ptr [[TMP23]], align 1
+; VF8UF1-NEXT:    br label %[[PRED_STORE_CONTINUE6]]
+; VF8UF1:       [[PRED_STORE_CONTINUE6]]:
+; VF8UF1-NEXT:    [[TMP24:%.*]] = extractelement <8 x i1> [[TMP3]], i32 4
+; VF8UF1-NEXT:    br i1 [[TMP24]], label %[[PRED_STORE_IF7:.*]], label %[[PRED_STORE_CONTINUE8:.*]]
+; VF8UF1:       [[PRED_STORE_IF7]]:
+; VF8UF1-NEXT:    [[TMP25:%.*]] = mul i64 4, [[STEP]]
+; VF8UF1-NEXT:    [[TMP26:%.*]] = add i64 0, [[TMP25]]
+; VF8UF1-NEXT:    [[TMP27:%.*]] = add i64 [[TMP26]], [[STEP]]
+; VF8UF1-NEXT:    [[TMP28:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP27]]
+; VF8UF1-NEXT:    store i8 0, ptr [[TMP28]], align 1
+; VF8UF1-NEXT:    br label %[[PRED_STORE_CONTINUE8]]
+; VF8UF1:       [[PRED_STORE_CONTINUE8]]:
+; VF8UF1-NEXT:    [[TMP29:%.*]] = extractelement <8 x i1> [[TMP3]], i32 5
+; VF8UF1-NEXT:    br i1 [[TMP29]], label %[[PRED_STORE_IF9:.*]], label %[[PRED_STORE_CONTINUE10:.*]]
+; VF8UF1:       [[PRED_STORE_IF9]]:
+; VF8UF1-NEXT:    [[TMP30:%.*]] = mul i64 5, [[STEP]]
+; VF8UF1-NEXT:    [[TMP31:%.*]] = add i64 0, [[TMP30]]
+; VF8UF1-NEXT:    [[TMP32:%.*]] = add i64 [[TMP31]], [[STEP]]
+; VF8UF1-NEXT:    [[TMP33:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP32]]
+; VF8UF1-NEXT:    store i8 0, ptr [[TMP33]], align 1
+; VF8UF1-NEXT:    br label %[[PRED_STORE_CONTINUE10]]
+; VF8UF1:       [[PRED_STORE_CONTINUE10]]:
+; VF8UF1-NEXT:    [[TMP34:%.*]] = extractelement <8 x i1> [[TMP3]], i32 6
+; VF8UF1-NEXT:    br i1 [[TMP34]], label %[[PRED_STORE_IF11:.*]], label %[[PRED_STORE_CONTINUE12:.*]]
+; VF8UF1:       [[PRED_STORE_IF11]]:
+; VF8UF1-NEXT:    [[TMP35:%.*]] = mul i64 6, [[STEP]]
+; VF8UF1-NEXT:    [[TMP36:%.*]] = add i64 0, [[TMP35]]
+; VF8UF1-NEXT:    [[TMP37:%.*]] = add i64 [[TMP36]], [[STEP]]
+; VF8UF1-NEXT:    [[TMP38:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP37]]
+; VF8UF1-NEXT:    store i8 0, ptr [[TMP38]], align 1
+; VF8UF1-NEXT:    br label %[[PRED_STORE_CONTINUE12]]
+; VF8UF1:       [[PRED_STORE_CONTINUE12]]:
+; VF8UF1-NEXT:    [[TMP39:%.*]] = extractelement <8 x i1> [[TMP3]], i32 7
+; VF8UF1-NEXT:    br i1 [[TMP39]], label %[[PRED_STORE_IF13:.*]], label %[[PRED_STORE_CONTINUE14:.*]]
+; VF8UF1:       [[PRED_STORE_IF13]]:
+; VF8UF1-NEXT:    [[TMP40:%.*]] = mul i64 7, [[STEP]]
+; VF8UF1-NEXT:    [[TMP41:%.*]] = add i64 0, [[TMP40]]
+; VF8UF1-NEXT:    [[TMP42:%.*]] = add i64 [[TMP41]], [[STEP]]
+; VF8UF1-NEXT:    [[TMP43:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP42]]
+; VF8UF1-NEXT:    store i8 0, ptr [[TMP43]], align 1
+; VF8UF1-NEXT:    br label %[[PRED_STORE_CONTINUE14]]
+; VF8UF1:       [[PRED_STORE_CONTINUE14]]:
+; VF8UF1-NEXT:    br label %[[MIDDLE_BLOCK:.*]]
+; VF8UF1:       [[MIDDLE_BLOCK]]:
+; VF8UF1-NEXT:    br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; VF8UF1:       [[SCALAR_PH]]:
+; VF8UF1-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP2]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; VF8UF1-NEXT:    br label %[[LOOP:.*]]
+; VF8UF1:       [[LOOP]]:
+; VF8UF1-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; VF8UF1-NEXT:    [[IV_NEXT]] = add i64 [[IV]], [[STEP]]
+; VF8UF1-NEXT:    [[GEP_DST:%.*]] = getelementptr i8, ptr [[DST]], i64 [[IV_NEXT]]
+; VF8UF1-NEXT:    store i8 0, ptr [[GEP_DST]], align 1
+; VF8UF1-NEXT:    [[EC:%.*]] = icmp slt i64 [[IV_NEXT]], 16
+; VF8UF1-NEXT:    br i1 [[EC]], label %[[LOOP]], label %[[EXIT]], !llvm.loop [[LOOP7:![0-9]+]]
+; VF8UF1:       [[EXIT]]:
+; VF8UF1-NEXT:    ret void
+;
+; VF8UF2-LABEL: define void @scev_expand_step(
+; VF8UF2-SAME: i64 [[X:%.*]], ptr [[DST:%.*]]) {
+; VF8UF2-NEXT:  [[ENTRY:.*]]:
+; VF8UF2-NEXT:    [[C:%.*]] = icmp eq i64 [[X]], 65536
+; VF8UF2-NEXT:    call void @llvm.assume(i1 [[C]])
+; VF8UF2-NEXT:    [[FR:%.*]] = freeze i64 [[X]]
+; VF8UF2-NEXT:    [[STEP:%.*]] = add i64 [[FR]], -65534
+; VF8UF2-NEXT:    [[TMP0:%.*]] = udiv i64 15, [[STEP]]
+; VF8UF2-NEXT:    [[TMP1:%.*]] = add nuw nsw i64 [[TMP0]], 1
+; VF8UF2-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; VF8UF2:       [[VECTOR_PH]]:
+; VF8UF2-NEXT:    [[N_RND_UP:%.*]] = add i64 [[TMP1]], 15
+; VF8UF2-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 16
+; VF8UF2-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; VF8UF2-NEXT:    [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[TMP1]], 1
+; VF8UF2-NEXT:    [[TMP2:%.*]] = mul i64 [[N_VEC]], [[STEP]]
+; VF8UF2-NEXT:    br label %[[VECTOR_BODY:.*]]
+; VF8UF2:       [[VECTOR_BODY]]:
+; VF8UF2-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0
+; VF8UF2-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i64> [[BROADCAST_SPLATINSERT]], <8 x i64> poison, <8 x i32> zeroinitializer
+; VF8UF2-NEXT:    [[TMP3:%.*]] = icmp ule <8 x i64> <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>, [[BROADCAST_SPLAT]]
+; VF8UF2-NEXT:    [[TMP4:%.*]] = icmp ule <8 x i64> <i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>, [[BROADCAST_SPLAT]]
+; VF8UF2-NEXT:    [[TMP5:%.*]] = extractelement <8 x i1> [[TMP3]], i32 0
+; VF8UF2-NEXT:    br i1 [[TMP5]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]]
+; VF8UF2:       [[PRED_STORE_IF]]:
+; VF8UF2-NEXT:    [[TMP6:%.*]] = mul i64 0, [[STEP]]
+; VF8UF2-NEXT:    [[TMP7:%.*]] = add i64 0, [[TMP6]]
+; VF8UF2-NEXT:    [[TMP8:%.*]] = add i64 [[TMP7]], [[STEP]]
+; VF8UF2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP8]]
+; VF8UF2-NEXT:    store i8 0, ptr [[TMP9]], align 1
+; VF8UF2-NEXT:    br label %[[PRED_STORE_CONTINUE]]
+; VF8UF2:       [[PRED_STORE_CONTINUE]]:
+; VF8UF2-NEXT:    [[TMP10:%.*]] = extractelement <8 x i1> [[TMP3]], i32 1
+; VF8UF2-NEXT:    br i1 [[TMP10]], label %[[PRED_STORE_IF1:.*]], label %[[PRED_STORE_CONTINUE2:.*]]
+; VF8UF2:       [[PRED_STORE_IF1]]:
+; VF8UF2-NEXT:    [[TMP11:%.*]] = mul i64 1, [[STEP]]
+; VF8UF2-NEXT:    [[TMP12:%.*]] = add i64 0, [[TMP11]]
+; VF8UF2-NEXT:    [[TMP13:%.*]] = add i64 [[TMP12]], [[STEP]]
+; VF8UF2-NEXT:    [[TMP14:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP13]]
+; VF8UF2-NEXT:    store i8 0, ptr [[TMP14]], align 1
+; VF8UF2-NEXT:    br label %[[PRED_STORE_CONTINUE2]]
+; VF8UF2:       [[PRED_STORE_CONTINUE2]]:
+; VF8UF2-NEXT:    [[TMP15:%.*]] = extractelement <8 x i1> [[TMP3]], i32 2
+; VF8UF2-NEXT:    br i1 [[TMP15]], label %[[PRED_STORE_IF3:.*]], label %[[PRED_STORE_CONTINUE4:.*]]
+; VF8UF2:       [[PRED_STORE_IF3]]:
+; VF8UF2-NEXT:    [[TMP16:%.*]] = mul i64 2, [[STEP]]
+; VF8UF2-NEXT:    [[TMP17:%.*]] = add i64 0, [[TMP16]]
+; VF8UF2-NEXT:    [[TMP18:%.*]] = add i64 [[TMP17]], [[STEP]]
+; VF8UF2-NEXT:    [[TMP19:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP18]]
+; VF8UF2-NEXT:    store i8 0, ptr [[TMP19]], align 1
+; VF8UF2-NEXT:    br label %[[PRED_STORE_CONTINUE4]]
+; VF8UF2:       [[PRED_STORE_CONTINUE4]]:
+; VF8UF2-NEXT:    [[TMP20:%.*]] = extractelement <8 x i1> [[TMP3]], i32 3
+; VF8UF2-NEXT:    br i1 [[TMP20]], label %[[PRED_STORE_IF5:.*]], label %[[PRED_STORE_CONTINUE6:.*]]
+; VF8UF2:       [[PRED_STORE_IF5]]:
+; VF8UF2-NEXT:    [[TMP21:%.*]] = mul i64 3, [[STEP]]
+; VF8UF2-NEXT:    [[TMP22:%.*]] = add i64 0, [[TMP21]]
+; VF8UF2-NEXT:    [[TMP23:%.*]] = add i64 [[TMP22]], [[STEP]]
+; VF8UF2-NEXT:    [[TMP24:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP23]]
+; VF8UF2-NEXT:    store i8 0, ptr [[TMP24]], align 1
+; VF8UF2-NEXT:    br label %[[PRED_STORE_CONTINUE6]]
+; VF8UF2:       [[PRED_STORE_CONTINUE6]]:
+; VF8UF2-NEXT:    [[TMP25:%.*]] = extractelement <8 x i1> [[TMP3]], i32 4
+; VF8UF2-NEXT:    br i1 [[TMP25]], label %[[PRED_STORE_IF7:.*]], label %[[PRED_STORE_CONTINUE8:.*]]
+; VF8UF2:       [[PRED_STORE_IF7]]:
+; VF8UF2-NEXT:    [[TMP26:%.*]] = mul i64 4, [[STEP]]
+; VF8UF2-NEXT:    [[TMP27:%.*]] = add i64 0, [[TMP26]]
+; VF8UF2-NEXT:    [[TMP28:%.*]] = add i64 [[TMP27]], [[STEP]]
+; VF8UF2-NEXT:    [[TMP29:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP28]]
+; VF8UF2-NEXT:    store i8 0, ptr [[TMP29]], align 1
+; VF8UF2-NEXT:    br label %[[PRED_STORE_CONTINUE8]]
+; VF8UF2:       [[PRED_STORE_CONTINUE8]]:
+; VF8UF2-NEXT:    [[TMP30:%.*]] = extractelement <8 x i1> [[TMP3]], i32 5
+; VF8UF2-NEXT:    br i1 [[TMP30]], label %[[PRED_STORE_IF9:.*]], label %[[PRED_STORE_CONTINUE10:.*]]
+; VF8UF2:       [[PRED_STORE_IF9]]:
+; VF8UF2-NEXT:    [[TMP31:%.*]] = mul i64 5, [[STEP]]
+; VF8UF2-NEXT:    [[TMP32:%.*]] = add i64 0, [[TMP31]]
+; VF8UF2-NEXT:    [[TMP33:%.*]] = add i64 [[TMP32]], [[STEP]]
+; VF8UF2-NEXT:    [[TMP34:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP33]]
+; VF8UF2-NEXT:    store i8 0, ptr [[TMP34]], align 1
+; VF8UF2-NEXT:    br label %[[PRED_STORE_CONTINUE10]]
+; VF8UF2:       [[PRED_STORE_CONTINUE10]]:
+; VF8UF2-NEXT:    [[TMP35:%.*]] = extractelement <8 x i1> [[TMP3]], i32 6
+; VF8UF2-NEXT:    br i1 [[TMP35]], label %[[PRED_STORE_IF11:.*]], label %[[PRED_STORE_CONTINUE12:.*]]
+; VF8UF2:       [[PRED_STORE_IF11]]:
+; VF8UF2-NEXT:    [[TMP36:%.*]] = mul i64 6, [[STEP]]
+; VF8UF2-NEXT:    [[TMP37:%.*]] = add i64 0, [[TMP36]]
+; VF8UF2-NEXT:    [[TMP38:%.*]] = add i64 [[TMP37]], [[STEP]]
+; VF8UF2-NEXT:    [[TMP39:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP38]]
+; VF8UF2-NEXT:    store i8 0, ptr [[TMP39]], align 1
+; VF8UF2-NEXT:    br label %[[PRED_STORE_CONTINUE12]]
+; VF8UF2:       [[PRED_STORE_CONTINUE12]]:
+; VF8UF2-NEXT:    [[TMP40:%.*]] = extractelement <8 x i1> [[TMP3]], i32 7
+; VF8UF2-NEXT:    br i1 [[TMP40]], label %[[PRED_STORE_IF13:.*]], label %[[PRED_STORE_CONTINUE14:.*]]
+; VF8UF2:       [[PRED_STORE_IF13]]:
+; VF8UF2-NEXT:    [[TMP41:%.*]] = mul i64 7, [[STEP]]
+; VF8UF2-NEXT:    [[TMP42:%.*]] = add i64 0, [[TMP41]]
+; VF8UF2-NEXT:    [[TMP43:%.*]] = add i64 [[TMP42]], [[STEP]]
+; VF8UF2-NEXT:    [[TMP44:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP43]]
+; VF8UF2-NEXT:    store i8 0, ptr [[TMP44]], align 1
+; VF8UF2-NEXT:    br label %[[PRED_STORE_CONTINUE14]]
+; VF8UF2:       [[PRED_STORE_CONTINUE14]]:
+; VF8UF2-NEXT:    [[TMP45:%.*]] = extractelement <8 x i1> [[TMP4]], i32 0
+; VF8UF2-NEXT:    br i1 [[TMP45]], label %[[PRED_STORE_IF15:.*]], label %[[PRED_STORE_CONTINUE16:.*]]
+; VF8UF2:       [[PRED_STORE_IF15]]:
+; VF8UF2-NEXT:    [[TMP46:%.*]] = mul i64 8, [[STEP]]
+; VF8UF2-NEXT:    [[TMP47:%.*]] = add i64 0, [[TMP46]]
+; VF8UF2-NEXT:    [[TMP48:%.*]] = add i64 [[TMP47]], [[STEP]]
+; VF8UF2-NEXT:    [[TMP49:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP48]]
+; VF8UF2-NEXT:    store i8 0, ptr [[TMP49]], align 1
+; VF8UF2-NEXT:    br label %[[PRED_STORE_CONTINUE16]]
+; VF8UF2:       [[PRED_STORE_CONTINUE16]]:
+; VF8UF2-NEXT:    [[TMP50:%.*]] = extractelement <8 x i1> [[TMP4]], i32 1
+; VF8UF2-NEXT:    br i1 [[TMP50]], label %[[PRED_STORE_IF17:.*]], label %[[PRED_STORE_CONTINUE18:.*]]
+; VF8UF2:       [[PRED_STORE_IF17]]:
+; VF8UF2-NEXT:    [[TMP51:%.*]] = mul i64 9, [[STEP]]
+; VF8UF2-NEXT:    [[TMP52:%.*]] = add i64 0, [[TMP51]]
+; VF8UF2-NEXT:    [[TMP53:%.*]] = add i64 [[TMP52]], [[STEP]]
+; VF8UF2-NEXT:    [[TMP54:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP53]]
+; VF8UF2-NEXT:    store i8 0, ptr [[TMP54]], align 1
+; VF8UF2-NEXT:    br label %[[PRED_STORE_CONTINUE18]]
+; VF8UF2:       [[PRED_STORE_CONTINUE18]]:
+; VF8UF2-NEXT:    [[TMP55:%.*]] = extractelement <8 x i1> [[TMP4]], i32 2
+; VF8UF2-NEXT:    br i1 [[TMP55]], label %[[PRED_STORE_IF19:.*]], label %[[PRED_STORE_CONTINUE20:.*]]
+; VF8UF2:       [[PRED_STORE_IF19]]:
+; VF8UF2-NEXT:    [[TMP56:%.*]] = mul i64 10, [[STEP]]
+; VF8UF2-NEXT:    [[TMP57:%.*]] = add i64 0, [[TMP56]]
+; VF8UF2-NEXT:    [[TMP58:%.*]] = add i64 [[TMP57]], [[STEP]]
+; VF8UF2-NEXT:    [[TMP59:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP58]]
+; VF8UF2-NEXT:    store i8 0, ptr [[TMP59]], align 1
+; VF8UF2-NEXT:    br label %[[PRED_STORE_CONTINUE20]]
+; VF8UF2:       [[PRED_STORE_CONTINUE20]]:
+; VF8UF2-NEXT:    [[TMP60:%.*]] = extractelement <8 x i1> [[TMP4]], i32 3
+; VF8UF2-NEXT:    br i1 [[TMP60]], label %[[PRED_STORE_IF21:.*]], label %[[PRED_STORE_CONTINUE22:.*]]
+; VF8UF2:       [[PRED_STORE_IF21]]:
+; VF8UF2-NEXT:    [[TMP61:%.*]] = mul i64 11, [[STEP]]
+; VF8UF2-NEXT:    [[TMP62:%.*]] = add i64 0, [[TMP61]]
+; VF8UF2-NEXT:    [[TMP63:%.*]] = add i64 [[TMP62]], [[STEP]]
+; VF8UF2-NEXT:    [[TMP64:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP63]]
+; VF8UF2-NEXT:    store i8 0, ptr [[TMP64]], align 1
+; VF8UF2-NEXT:    br label %[[PRED_STORE_CONTINUE22]]
+; VF8UF2:       [[PRED_STORE_CONTINUE22]]:
+; VF8UF2-NEXT:    [[TMP65:%.*]] = extractelement <8 x i1> [[TMP4]], i32 4
+; VF8UF2-NEXT:    br i1 [[TMP65]], label %[[PRED_STORE_IF23:.*]], label %[[PRED_STORE_CONTINUE24:.*]]
+; VF8UF2:       [[PRED_STORE_IF23]]:
+; VF8UF2-NEXT:    [[TMP66:%.*]] = mul i64 12, [[STEP]]
+; VF8UF2-NEXT:    [[TMP67:%.*]] = add i64 0, [[TMP66]]
+; VF8UF2-NEXT:    [[TMP68:%.*]] = add i64 [[TMP67]], [[STEP]]
+; VF8UF2-NEXT:    [[TMP69:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP68]]
+; VF8UF2-NEXT:    store i8 0, ptr [[TMP69]], align 1
+; VF8UF2-NEXT:    br label %[[PRED_STORE_CONTINUE24]]
+; VF8UF2:       [[PRED_STORE_CONTINUE24]]:
+; VF8UF2-NEXT:    [[TMP70:%.*]] = extractelement <8 x i1> [[TMP4]], i32 5
+; VF8UF2-NEXT:    br i1 [[TMP70]], label %[[PRED_STORE_IF25:.*]], label %[[PRED_STORE_CONTINUE26:.*]]
+; VF8UF2:       [[PRED_STORE_IF25]]:
+; VF8UF2-NEXT:    [[TMP71:%.*]] = mul i64 13, [[STEP]]
+; VF8UF2-NEXT:    [[TMP72:%.*]] = add i64 0, [[TMP71]]
+; VF8UF2-NEXT:    [[TMP73:%.*]] = add i64 [[TMP72]], [[STEP]]
+; VF8UF2-NEXT:    [[TMP74:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP73]]
+; VF8UF2-NEXT:    store i8 0, ptr [[TMP74]], align 1
+; VF8UF2-NEXT:    br label %[[PRED_STORE_CONTINUE26]]
+; VF8UF2:       [[PRED_STORE_CONTINUE26]]:
+; VF8UF2-NEXT:    [[TMP75:%.*]] = extractelement <8 x i1> [[TMP4]], i32 6
+; VF8UF2-NEXT:    br i1 [[TMP75]], label %[[PRED_STORE_IF27:.*]], label %[[PRED_STORE_CONTINUE28:.*]]
+; VF8UF2:       [[PRED_STORE_IF27]]:
+; VF8UF2-NEXT:    [[TMP76:%.*]] = mul i64 14, [[STEP]]
+; VF8UF2-NEXT:    [[TMP77:%.*]] = add i64 0, [[TMP76]]
+; VF8UF2-NEXT:    [[TMP78:%.*]] = add i64 [[TMP77]], [[STEP]]
+; VF8UF2-NEXT:    [[TMP79:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP78]]
+; VF8UF2-NEXT:    store i8 0, ptr [[TMP79]], align 1
+; VF8UF2-NEXT:    br label %[[PRED_STORE_CONTINUE28]]
+; VF8UF2:       [[PRED_STORE_CONTINUE28]]:
+; VF8UF2-NEXT:    [[TMP80:%.*]] = extractelement <8 x i1> [[TMP4]], i32 7
+; VF8UF2-NEXT:    br i1 [[TMP80]], label %[[PRED_STORE_IF29:.*]], label %[[PRED_STORE_CONTINUE30:.*]]
+; VF8UF2:       [[PRED_STORE_IF29]]:
+; VF8UF2-NEXT:    [[TMP81:%.*]] = mul i64 15, [[STEP]]
+; VF8UF2-NEXT:    [[TMP82:%.*]] = add i64 0, [[TMP81]]
+; VF8UF2-NEXT:    [[TMP83:%.*]] = add i64 [[TMP82]], [[STEP]]
+; VF8UF2-NEXT:    [[TMP84:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP83]]
+; VF8UF2-NEXT:    store i8 0, ptr [[TMP84]], align 1
+; VF8UF2-NEXT:    br label %[[PRED_STORE_CONTINUE30]]
+; VF8UF2:       [[PRED_STORE_CONTINUE30]]:
+; VF8UF2-NEXT:    br label %[[MIDDLE_BLOCK:.*]]
+; VF8UF2:       [[MIDDLE_BLOCK]]:
+; VF8UF2-NEXT:    br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; VF8UF2:       [[SCALAR_PH]]:
+; VF8UF2-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP2]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; VF8UF2-NEXT:    br label %[[LOOP:.*]]
+; VF8UF2:       [[LOOP]]:
+; VF8UF2-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; VF8UF2-NEXT:    [[IV_NEXT]] = add i64 [[IV]], [[STEP]]
+; VF8UF2-NEXT:    [[GEP_DST:%.*]] = getelementptr i8, ptr [[DST]], i64 [[IV_NEXT]]
+; VF8UF2-NEXT:    store i8 0, ptr [[GEP_DST]], align 1
+; VF8UF2-NEXT:    [[EC:%.*]] = icmp slt i64 [[IV_NEXT]], 16
+; VF8UF2-NEXT:    br i1 [[EC]], label %[[LOOP]], label %[[EXIT]], !llvm.loop [[LOOP5:![0-9]+]]
+; VF8UF2:       [[EXIT]]:
+; VF8UF2-NEXT:    ret void
+;
+; VF16UF1-LABEL: define void @scev_expand_step(
+; VF16UF1-SAME: i64 [[X:%.*]], ptr [[DST:%.*]]) {
+; VF16UF1-NEXT:  [[ENTRY:.*]]:
+; VF16UF1-NEXT:    [[C:%.*]] = icmp eq i64 [[X]], 65536
+; VF16UF1-NEXT:    call void @llvm.assume(i1 [[C]])
+; VF16UF1-NEXT:    [[FR:%.*]] = freeze i64 [[X]]
+; VF16UF1-NEXT:    [[STEP:%.*]] = add i64 [[FR]], -65534
+; VF16UF1-NEXT:    [[TMP0:%.*]] = udiv i64 15, [[STEP]]
+; VF16UF1-NEXT:    [[TMP1:%.*]] = add nuw nsw i64 [[TMP0]], 1
+; VF16UF1-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; VF16UF1:       [[VECTOR_PH]]:
+; VF16UF1-NEXT:    [[N_RND_UP:%.*]] = add i64 [[TMP1]], 15
+; VF16UF1-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 16
+; VF16UF1-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; VF16UF1-NEXT:    [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[TMP1]], 1
+; VF16UF1-NEXT:    [[TMP2:%.*]] = mul i64 [[N_VEC]], [[STEP]]
+; VF16UF1-NEXT:    br label %[[VECTOR_BODY:.*]]
+; VF16UF1:       [[VECTOR_BODY]]:
+; VF16UF1-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0
+; VF16UF1-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i64> [[BROADCAST_SPLATINSERT]], <16 x i64> poison, <16 x i32> zeroinitializer
+; VF16UF1-NEXT:    [[TMP3:%.*]] = icmp ule <16 x i64> <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>, [[BROADCAST_SPLAT]]
+; VF16UF1-NEXT:    [[TMP4:%.*]] = extractelement <16 x i1> [[TMP3]], i32 0
+; VF16UF1-NEXT:    br i1 [[TMP4]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]]
+; VF16UF1:       [[PRED_STORE_IF]]:
+; VF16UF1-NEXT:    [[TMP5:%.*]] = mul i64 0, [[STEP]]
+; VF16UF1-NEXT:    [[TMP6:%.*]] = add i64 0, [[TMP5]]
+; VF16UF1-NEXT:    [[TMP7:%.*]] = add i64 [[TMP6]], [[STEP]]
+; VF16UF1-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP7]]
+; VF16UF1-NEXT:    store i8 0, ptr [[TMP8]], align 1
+; VF16UF1-NEXT:    br label %[[PRED_STORE_CONTINUE]]
+; VF16UF1:       [[PRED_STORE_CONTINUE]]:
+; VF16UF1-NEXT:    [[TMP9:%.*]] = extractelement <16 x i1> [[TMP3]], i32 1
+; VF16UF1-NEXT:    br i1 [[TMP9]], label %[[PRED_STORE_IF1:.*]], label %[[PRED_STORE_CONTINUE2:.*]]
+; VF16UF1:       [[PRED_STORE_IF1]]:
+; VF16UF1-NEXT:    [[TMP10:%.*]] = mul i64 1, [[STEP]]
+; VF16UF1-NEXT:    [[TMP11:%.*]] = add i64 0, [[TMP10]]
+; VF16UF1-NEXT:    [[TMP12:%.*]] = add i64 [[TMP11]], [[STEP]]
+; VF16UF1-NEXT:    [[TMP13:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP12]]
+; VF16UF1-NEXT:    store i8 0, ptr [[TMP13]], align 1
+; VF16UF1-NEXT:    br label %[[PRED_STORE_CONTINUE2]]
+; VF16UF1:       [[PRED_STORE_CONTINUE2]]:
+; VF16UF1-NEXT:    [[TMP14:%.*]] = extractelement <16 x i1> [[TMP3]], i32 2
+; VF16UF1-NEXT:    br i1 [[TMP14]], label %[[PRED_STORE_IF3:.*]], label %[[PRED_STORE_CONTINUE4:.*]]
+; VF16UF1:       [[PRED_STORE_IF3]]:
+; VF16UF1-NEXT:    [[TMP15:%.*]] = mul i64 2, [[STEP]]
+; VF16UF1-NEXT:    [[TMP16:%.*]] = add i64 0, [[TMP15]]
+; VF16UF1-NEXT:    [[TMP17:%.*]] = add i64 [[TMP16]], [[STEP]]
+; VF16UF1-NEXT:    [[TMP18:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP17]]
+; VF16UF1-NEXT:    store i8 0, ptr [[TMP18]], align 1
+; VF16UF1-NEXT:    br label %[[PRED_STORE_CONTINUE4]]
+; VF16UF1:       [[PRED_STORE_CONTINUE4]]:
+; VF16UF1-NEXT:    [[TMP19:%.*]] = extractelement <16 x i1> [[TMP3]], i32 3
+; VF16UF1-NEXT:    br i1 [[TMP19]], label %[[PRED_STORE_IF5:.*]], label %[[PRED_STORE_CONTINUE6:.*]]
+; VF16UF1:       [[PRED_STORE_IF5]]:
+; VF16UF1-NEXT:    [[TMP20:%.*]] = mul i64 3, [[STEP]]
+; VF16UF1-NEXT:    [[TMP21:%.*]] = add i64 0, [[TMP20]]
+; VF16UF1-NEXT:    [[TMP22:%.*]] = add i64 [[TMP21]], [[STEP]]
+; VF16UF1-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP22]]
+; VF16UF1-NEXT:    store i8 0, ptr [[TMP23]], align 1
+; VF16UF1-NEXT:    br label %[[PRED_STORE_CONTINUE6]]
+; VF16UF1:       [[PRED_STORE_CONTINUE6]]:
+; VF16UF1-NEXT:    [[TMP24:%.*]] = extractelement <16 x i1> [[TMP3]], i32 4
+; VF16UF1-NEXT:    br i1 [[TMP24]], label %[[PRED_STORE_IF7:.*]], label %[[PRED_STORE_CONTINUE8:.*]]
+; VF16UF1:       [[PRED_STORE_IF7]]:
+; VF16UF1-NEXT:    [[TMP25:%.*]] = mul i64 4, [[STEP]]
+; VF16UF1-NEXT:    [[TMP26:%.*]] = add i64 0, [[TMP25]]
+; VF16UF1-NEXT:    [[TMP27:%.*]] = add i64 [[TMP26]], [[STEP]]
+; VF16UF1-NEXT:    [[TMP28:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP27]]
+; VF16UF1-NEXT:    store i8 0, ptr [[TMP28]], align 1
+; VF16UF1-NEXT:    br label %[[PRED_STORE_CONTINUE8]]
+; VF16UF1:       [[PRED_STORE_CONTINUE8]]:
+; VF16UF1-NEXT:    [[TMP29:%.*]] = extractelement <16 x i1> [[TMP3]], i32 5
+; VF16UF1-NEXT:    br i1 [[TMP29]], label %[[PRED_STORE_IF9:.*]], label %[[PRED_STORE_CONTINUE10:.*]]
+; VF16UF1:       [[PRED_STORE_IF9]]:
+; VF16UF1-NEXT:    [[TMP30:%.*]] = mul i64 5, [[STEP]]
+; VF16UF1-NEXT:    [[TMP31:%.*]] = add i64 0, [[TMP30]]
+; VF16UF1-NEXT:    [[TMP32:%.*]] = add i64 [[TMP31]], [[STEP]]
+; VF16UF1-NEXT:    [[TMP33:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP32]]
+; VF16UF1-NEXT:    store i8 0, ptr [[TMP33]], align 1
+; VF16UF1-NEXT:    br label %[[PRED_STORE_CONTINUE10]]
+; VF16UF1:       [[PRED_STORE_CONTINUE10]]:
+; VF16UF1-NEXT:    [[TMP34:%.*]] = extractelement <16 x i1> [[TMP3]], i32 6
+; VF16UF1-NEXT:    br i1 [[TMP34]], label %[[PRED_STORE_IF11:.*]], label %[[PRED_STORE_CONTINUE12:.*]]
+; VF16UF1:       [[PRED_STORE_IF11]]:
+; VF16UF1-NEXT:    [[TMP35:%.*]] = mul i64 6, [[STEP]]
+; VF16UF1-NEXT:    [[TMP36:%.*]] = add i64 0, [[TMP35]]
+; VF16UF1-NEXT:    [[TMP37:%.*]] = add i64 [[TMP36]], [[STEP]]
+; VF16UF1-NEXT:    [[TMP38:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP37]]
+; VF16UF1-NEXT:    store i8 0, ptr [[TMP38]], align 1
+; VF16UF1-NEXT:    br label %[[PRED_STORE_CONTINUE12]]
+; VF16UF1:       [[PRED_STORE_CONTINUE12]]:
+; VF16UF1-NEXT:    [[TMP39:%.*]] = extractelement <16 x i1> [[TMP3]], i32 7
+; VF16UF1-NEXT:    br i1 [[TMP39]], label %[[PRED_STORE_IF13:.*]], label %[[PRED_STORE_CONTINUE14:.*]]
+; VF16UF1:       [[PRED_STORE_IF13]]:
+; VF16UF1-NEXT:    [[TMP40:%.*]] = mul i64 7, [[STEP]]
+; VF16UF1-NEXT:    [[TMP41:%.*]] = add i64 0, [[TMP40]]
+; VF16UF1-NEXT:    [[TMP42:%.*]] = add i64 [[TMP41]], [[STEP]]
+; VF16UF1-NEXT:    [[TMP43:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP42]]
+; VF16UF1-NEXT:    store i8 0, ptr [[TMP43]], align 1
+; VF16UF1-NEXT:    br label %[[PRED_STORE_CONTINUE14]]
+; VF16UF1:       [[PRED_STORE_CONTINUE14]]:
+; VF16UF1-NEXT:    [[TMP44:%.*]] = extractelement <16 x i1> [[TMP3]], i32 8
+; VF16UF1-NEXT:    br i1 [[TMP44]], label %[[PRED_STORE_IF15:.*]], label %[[PRED_STORE_CONTINUE16:.*]]
+; VF16UF1:       [[PRED_STORE_IF15]]:
+; VF16UF1-NEXT:    [[TMP45:%.*]] = mul i64 8, [[STEP]]
+; VF16UF1-NEXT:    [[TMP46:%.*]] = add i64 0, [[TMP45]]
+; VF16UF1-NEXT:    [[TMP47:%.*]] = add i64 [[TMP46]], [[STEP]]
+; VF16UF1-NEXT:    [[TMP48:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP47]]
+; VF16UF1-NEXT:    store i8 0, ptr [[TMP48]], align 1
+; VF16UF1-NEXT:    br label %[[PRED_STORE_CONTINUE16]]
+; VF16UF1:       [[PRED_STORE_CONTINUE16]]:
+; VF16UF1-NEXT:    [[TMP49:%.*]] = extractelement <16 x i1> [[TMP3]], i32 9
+; VF16UF1-NEXT:    br i1 [[TMP49]], label %[[PRED_STORE_IF17:.*]], label %[[PRED_STORE_CONTINUE18:.*]]
+; VF16UF1:       [[PRED_STORE_IF17]]:
+; VF16UF1-NEXT:    [[TMP50:%.*]] = mul i64 9, [[STEP]]
+; VF16UF1-NEXT:    [[TMP51:%.*]] = add i64 0, [[TMP50]]
+; VF16UF1-NEXT:    [[TMP52:%.*]] = add i64 [[TMP51]], [[STEP]]
+; VF16UF1-NEXT:    [[TMP53:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP52]]
+; VF16UF1-NEXT:    store i8 0, ptr [[TMP53]], align 1
+; VF16UF1-NEXT:    br label %[[PRED_STORE_CONTINUE18]]
+; VF16UF1:       [[PRED_STORE_CONTINUE18]]:
+; VF16UF1-NEXT:    [[TMP54:%.*]] = extractelement <16 x i1> [[TMP3]], i32 10
+; VF16UF1-NEXT:    br i1 [[TMP54]], label %[[PRED_STORE_IF19:.*]], label %[[PRED_STORE_CONTINUE20:.*]]
+; VF16UF1:       [[PRED_STORE_IF19]]:
+; VF16UF1-NEXT:    [[TMP55:%.*]] = mul i64 10, [[STEP]]
+; VF16UF1-NEXT:    [[TMP56:%.*]] = add i64 0, [[TMP55]]
+; VF16UF1-NEXT:    [[TMP57:%.*]] = add i64 [[TMP56]], [[STEP]]
+; VF16UF1-NEXT:    [[TMP58:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP57]]
+; VF16UF1-NEXT:    store i8 0, ptr [[TMP58]], align 1
+; VF16UF1-NEXT:    br label %[[PRED_STORE_CONTINUE20]]
+; VF16UF1:       [[PRED_STORE_CONTINUE20]]:
+; VF16UF1-NEXT:    [[TMP59:%.*]] = extractelement <16 x i1> [[TMP3]], i32 11
+; VF16UF1-NEXT:    br i1 [[TMP59]], label %[[PRED_STORE_IF21:.*]], label %[[PRED_STORE_CONTINUE22:.*]]
+; VF16UF1:       [[PRED_STORE_IF21]]:
+; VF16UF1-NEXT:    [[TMP60:%.*]] = mul i64 11, [[STEP]]
+; VF16UF1-NEXT:    [[TMP61:%.*]] = add i64 0, [[TMP60]]
+; VF16UF1-NEXT:    [[TMP62:%.*]] = add i64 [[TMP61]], [[STEP]]
+; VF16UF1-NEXT:    [[TMP63:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP62]]
+; VF16UF1-NEXT:    store i8 0, ptr [[TMP63]], align 1
+; VF16UF1-NEXT:    br label %[[PRED_STORE_CONTINUE22]]
+; VF16UF1:       [[PRED_STORE_CONTINUE22]]:
+; VF16UF1-NEXT:    [[TMP64:%.*]] = extractelement <16 x i1> [[TMP3]], i32 12
+; VF16UF1-NEXT:    br i1 [[TMP64]], label %[[PRED_STORE_IF23:.*]], label %[[PRED_STORE_CONTINUE24:.*]]
+; VF16UF1:       [[PRED_STORE_IF23]]:
+; VF16UF1-NEXT:    [[TMP65:%.*]] = mul i64 12, [[STEP]]
+; VF16UF1-NEXT:    [[TMP66:%.*]] = add i64 0, [[TMP65]]
+; VF16UF1-NEXT:    [[TMP67:%.*]] = add i64 [[TMP66]], [[STEP]]
+; VF16UF1-NEXT:    [[TMP68:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP67]]
+; VF16UF1-NEXT:    store i8 0, ptr [[TMP68]], align 1
+; VF16UF1-NEXT:    br label %[[PRED_STORE_CONTINUE24]]
+; VF16UF1:       [[PRED_STORE_CONTINUE24]]:
+; VF16UF1-NEXT:    [[TMP69:%.*]] = extractelement <16 x i1> [[TMP3]], i32 13
+; VF16UF1-NEXT:    br i1 [[TMP69]], label %[[PRED_STORE_IF25:.*]], label %[[PRED_STORE_CONTINUE26:.*]]
+; VF16UF1:       [[PRED_STORE_IF25]]:
+; VF16UF1-NEXT:    [[TMP70:%.*]] = mul i64 13, [[STEP]]
+; VF16UF1-NEXT:    [[TMP71:%.*]] = add i64 0, [[TMP70]]
+; VF16UF1-NEXT:    [[TMP72:%.*]] = add i64 [[TMP71]], [[STEP]]
+; VF16UF1-NEXT:    [[TMP73:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP72]]
+; VF16UF1-NEXT:    store i8 0, ptr [[TMP73]], align 1
+; VF16UF1-NEXT:    br label %[[PRED_STORE_CONTINUE26]]
+; VF16UF1:       [[PRED_STORE_CONTINUE26]]:
+; VF16UF1-NEXT:    [[TMP74:%.*]] = extractelement <16 x i1> [[TMP3]], i32 14
+; VF16UF1-NEXT:    br i1 [[TMP74]], label %[[PRED_STORE_IF27:.*]], label %[[PRED_STORE_CONTINUE28:.*]]
+; VF16UF1:       [[PRED_STORE_IF27]]:
+; VF16UF1-NEXT:    [[TMP75:%.*]] = mul i64 14, [[STEP]]
+; VF16UF1-NEXT:    [[TMP76:%.*]] = add i64 0, [[TMP75]]
+; VF16UF1-NEXT:    [[TMP77:%.*]] = add i64 [[TMP76]], [[STEP]]
+; VF16UF1-NEXT:    [[TMP78:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP77]]
+; VF16UF1-NEXT:    store i8 0, ptr [[TMP78]], align 1
+; VF16UF1-NEXT:    br label %[[PRED_STORE_CONTINUE28]]
+; VF16UF1:       [[PRED_STORE_CONTINUE28]]:
+; VF16UF1-NEXT:    [[TMP79:%.*]] = extractelement <16 x i1> [[TMP3]], i32 15
+; VF16UF1-NEXT:    br i1 [[TMP79]], label %[[PRED_STORE_IF29:.*]], label %[[PRED_STORE_CONTINUE30:.*]]
+; VF16UF1:       [[PRED_STORE_IF29]]:
+; VF16UF1-NEXT:    [[TMP80:%.*]] = mul i64 15, [[STEP]]
+; VF16UF1-NEXT:    [[TMP81:%.*]] = add i64 0, [[TMP80]]
+; VF16UF1-NEXT:    [[TMP82:%.*]] = add i64 [[TMP81]], [[STEP]]
+; VF16UF1-NEXT:    [[TMP83:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP82]]
+; VF16UF1-NEXT:    store i8 0, ptr [[TMP83]], align 1
+; VF16UF1-NEXT:    br label %[[PRED_STORE_CONTINUE30]]
+; VF16UF1:       [[PRED_STORE_CONTINUE30]]:
+; VF16UF1-NEXT:    br label %[[MIDDLE_BLOCK:.*]]
+; VF16UF1:       [[MIDDLE_BLOCK]]:
+; VF16UF1-NEXT:    br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; VF16UF1:       [[SCALAR_PH]]:
+; VF16UF1-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP2]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; VF16UF1-NEXT:    br label %[[LOOP:.*]]
+; VF16UF1:       [[LOOP]]:
+; VF16UF1-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; VF16UF1-NEXT:    [[IV_NEXT]] = add i64 [[IV]], [[STEP]]
+; VF16UF1-NEXT:    [[GEP_DST:%.*]] = getelementptr i8, ptr [[DST]], i64 [[IV_NEXT]]
+; VF16UF1-NEXT:    store i8 0, ptr [[GEP_DST]], align 1
+; VF16UF1-NEXT:    [[EC:%.*]] = icmp slt i64 [[IV_NEXT]], 16
+; VF16UF1-NEXT:    br i1 [[EC]], label %[[LOOP]], label %[[EXIT]], !llvm.loop [[LOOP5:![0-9]+]]
+; VF16UF1:       [[EXIT]]:
+; VF16UF1-NEXT:    ret void
+;
+entry:
+  %c = icmp eq i64 %x, 65536
+  call void @llvm.assume(i1 %c)
+  %fr = freeze i64 %x
+  %step = add i64 %fr, -65534
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %iv.next = add i64 %iv, %step
+  %gep.dst = getelementptr i8, ptr %dst, i64 %iv.next
+  store i8 0, ptr %gep.dst, align 1
+  %ec = icmp slt i64 %iv.next, 16
+  br i1 %ec, label %loop, label %exit
+
+exit:
+  ret void
+}
+
+;.
+; VF8UF1: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; VF8UF1: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+; VF8UF1: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
+; VF8UF1: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
+; VF8UF1: [[LOOP4]] = distinct !{[[LOOP4]], [[META2]], [[META1]]}
+; VF8UF1: [[LOOP5]] = distinct !{[[LOOP5]], [[META1]], [[META2]]}
+; VF8UF1: [[LOOP6]] = distinct !{[[LOOP6]], [[META2]], [[META1]]}
+; VF8UF1: [[LOOP7]] = distinct !{[[LOOP7]], [[META2]], [[META1]]}
+;.
+; VF8UF2: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; VF8UF2: [[META1]] = !{!"llvm.loop.unroll.runtime.disable"}
+; VF8UF2: [[META2]] = !{!"llvm.loop.isvectorized", i32 1}
+; VF8UF2: [[LOOP3]] = distinct !{[[LOOP3]], [[META1]], [[META2]]}
+; VF8UF2: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]}
+; VF8UF2: [[LOOP5]] = distinct !{[[LOOP5]], [[META1]], [[META2]]}
+;.
+; VF16UF1: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; VF16UF1: [[META1]] = !{!"llvm.loop.unroll.runtime.disable"}
+; VF16UF1: [[META2]] = !{!"llvm.loop.isvectorized", i32 1}
+; VF16UF1: [[LOOP3]] = distinct !{[[LOOP3]], [[META1]], [[META2]]}
+; VF16UF1: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]}
+; VF16UF1: [[LOOP5]] = distinct !{[[LOOP5]], [[META1]], [[META2]]}
+;.
diff --git a/llvm/test/Transforms/LoopVectorize/version-stride-with-integer-casts.ll b/llvm/test/Transforms/LoopVectorize/version-stride-with-integer-casts.ll
index 930d3cd..791c995 100644
--- a/llvm/test/Transforms/LoopVectorize/version-stride-with-integer-casts.ll
+++ b/llvm/test/Transforms/LoopVectorize/version-stride-with-integer-casts.ll
@@ -499,16 +499,13 @@ define void @sext_of_i1_stride(i1 %g, ptr %dst) mustprogress {
 ; CHECK-NEXT:    [[IND_END:%.*]] = mul i64 [[N_VEC]], [[G_64]]
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], [[G_64]]
-; CHECK-NEXT:    [[TMP3:%.*]] = mul i64 0, [[G_64]]
-; CHECK-NEXT:    [[TMP4:%.*]] = add i64 [[OFFSET_IDX]], [[TMP3]]
+; CHECK-NEXT:    [[TMP8:%.*]] = mul i64 0, [[G_64]]
+; CHECK-NEXT:    [[TMP4:%.*]] = add i64 0, [[TMP8]]
 ; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i16, ptr [[DST]], i64 [[TMP4]]
 ; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i16, ptr [[TMP5]], i32 0
 ; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i16, ptr [[TMP6]], i32 -3
 ; CHECK-NEXT:    store <4 x i16> splat (i16 -1), ptr [[TMP7]], align 2
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT:    br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
+; CHECK-NEXT:    br label [[MIDDLE_BLOCK:%.*]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
@@ -521,7 +518,7 @@ define void @sext_of_i1_stride(i1 %g, ptr %dst) mustprogress {
 ; CHECK-NEXT:    store i16 [[G_16]], ptr [[GEP]], align 2
 ; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], [[G_64]]
 ; CHECK-NEXT:    [[CMP:%.*]] = icmp ult i64 [[IV_NEXT]], 16
-; CHECK-NEXT:    br i1 [[CMP]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP15:![0-9]+]]
+; CHECK-NEXT:    br i1 [[CMP]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP14:![0-9]+]]
 ; CHECK:       exit:
 ; CHECK-NEXT:    ret void
 ;
@@ -558,6 +555,5 @@ exit:
 ; CHECK: [[LOOP11]] = distinct !{[[LOOP11]], [[META1]]}
 ; CHECK: [[LOOP12]] = distinct !{[[LOOP12]], [[META1]], [[META2]]}
 ; CHECK: [[LOOP13]] = distinct !{[[LOOP13]], [[META1]]}
-; CHECK: [[LOOP14]] = distinct !{[[LOOP14]], [[META1]], [[META2]]}
-; CHECK: [[LOOP15]] = distinct !{[[LOOP15]], [[META1]]}
+; CHECK: [[LOOP14]] = distinct !{[[LOOP14]], [[META1]]}
 ;.
diff --git a/llvm/test/Transforms/LoopVectorize/vplan-iv-transforms.ll b/llvm/test/Transforms/LoopVectorize/vplan-iv-transforms.ll
index 95855e8..be1eb78 100644
--- a/llvm/test/Transforms/LoopVectorize/vplan-iv-transforms.ll
+++ b/llvm/test/Transforms/LoopVectorize/vplan-iv-transforms.ll
@@ -35,10 +35,11 @@ define void @iv_no_binary_op_in_descriptor(i1 %c, ptr %dst) {
 ; CHECK-NEXT:  Successor(s): ir-bb<exit>, scalar.ph
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  scalar.ph:
+; CHECK-NEXT:    EMIT vp<[[RESUME:%.+]]> = resume-phi vp<[[VEC_TC]]>, ir<0>
 ; CHECK-NEXT:  Successor(s): ir-bb<loop.header>
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  ir-bb<loop.header>:
-; CHECK-NEXT:    IR   %iv = phi i64 [ 0, %entry ], [ %iv.next.p, %loop.latch ]
+; CHECK-NEXT:    IR   %iv = phi i64 [ 0, %entry ], [ %iv.next.p, %loop.latch ] (extra operand: vp<[[RESUME]]> from scalar.ph)
 ; CHECK:         IR   %iv.next = add i64 %iv, 1
 ; CHECK-NEXT:  No successors
 ; CHECK-EMPTY:
diff --git a/llvm/test/Transforms/LoopVectorize/vplan-predicate-switch.ll b/llvm/test/Transforms/LoopVectorize/vplan-predicate-switch.ll
index 484e1ea..dd3b50b 100644
--- a/llvm/test/Transforms/LoopVectorize/vplan-predicate-switch.ll
+++ b/llvm/test/Transforms/LoopVectorize/vplan-predicate-switch.ll
@@ -14,7 +14,7 @@ define void @switch4_default_common_dest_with_case(ptr %start, ptr %end) {
 ; CHECK-NEXT: ir-bb<vector.ph>:
 ; CHECK-NEXT:   IR %n.mod.vf = urem i64 %0, 2
 ; CHECK-NEXT:   IR %n.vec = sub i64 %0, %n.mod.vf
-; CHECK-NEXT:   IR %ind.end = getelementptr i8, ptr %start, i64 %n.vec
+; CHECK-NEXT:   vp<[[END:%.+]]> = DERIVED-IV ir<%start> + ir<%n.vec> * ir<1>
 ; CHECK-NEXT: Successor(s): vector loop
 ; CHECK-EMPTY:
 ; CHECK-NEXT: <x1> vector loop: {
@@ -94,7 +94,7 @@ define void @switch4_default_common_dest_with_case(ptr %start, ptr %end) {
 ; CHECK-NEXT: No successors
 ; CHECK-EMPTY:
 ; CHECK-NEXT: ir-bb<scalar.ph>:
-; CHECK-NEXT:   EMIT vp<[[RESUME:%.+]]> = resume-phi ir<%ind.end>, ir<%start>
+; CHECK-NEXT:   EMIT vp<[[RESUME:%.+]]> = resume-phi vp<[[END]]>, ir<%start>
 ; CHECK-NEXT: Successor(s): ir-bb<loop.header>
 ; CHECK-EMPTY:
 ; CHECK-NEXT: ir-bb<loop.header>:
diff --git a/llvm/test/Transforms/LoopVectorize/vplan-printing-before-execute.ll b/llvm/test/Transforms/LoopVectorize/vplan-printing-before-execute.ll
index f07d1af..beb305f 100644
--- a/llvm/test/Transforms/LoopVectorize/vplan-printing-before-execute.ll
+++ b/llvm/test/Transforms/LoopVectorize/vplan-printing-before-execute.ll
@@ -19,6 +19,8 @@ define void @test_tc_less_than_16(ptr %A, i64 %N) {
 ; CHECK-NEXT: Successor(s): vector.ph
 ; CHECK-EMPTY:
 ; CHECK-NEXT: vector.ph:
+; CHECK-NEXT:   vp<[[END1:%.+]]> = DERIVED-IV ir<%and> + vp<[[VTC]]> * ir<-1>
+; CHECK-NEXT:   vp<[[END2:%.+]]> = DERIVED-IV ir<%A> + vp<[[VTC]]> * ir<1>
 ; CHECK-NEXT: Successor(s): vector loop
 ; CHECK-EMPTY:
 ; CHECK-NEXT: <x1> vector loop: {
@@ -43,11 +45,13 @@ define void @test_tc_less_than_16(ptr %A, i64 %N) {
 ; CHECK-NEXT: Successor(s): ir-bb<exit>, scalar.ph
 ; CHECK-EMPTY:
 ; CHECK-NEXT: scalar.ph:
+; CHECK-NEXT:   EMIT vp<[[RESUME1:%.+]]> = resume-phi vp<[[END1]]>, ir<%and>
+; CHECK-NEXT:   EMIT vp<[[RESUME2:%.+]]>.1 = resume-phi vp<[[END2]]>, ir<%A>
 ; CHECK-NEXT: Successor(s): ir-bb<loop>
 ; CHECK-EMPTY:
 ; CHECK-NEXT: ir-bb<loop>:
-; CHECK-NEXT:   IR   %iv = phi i64 [ %and, %entry ], [ %iv.next, %loop ]
-; CHECK-NEXT:   IR   %p.src = phi ptr [ %A, %entry ], [ %p.src.next, %loop ]
+; CHECK-NEXT:   IR   %iv = phi i64 [ %and, %entry ], [ %iv.next, %loop ] (extra operand: vp<[[RESUME1]]> from scalar.ph)
+; CHECK-NEXT:   IR   %p.src = phi ptr [ %A, %entry ], [ %p.src.next, %loop ] (extra operand: vp<[[RESUME2]]>.1 from scalar.ph)
 ; CHECK:        IR   %cmp = icmp eq i64 %iv.next, 0
 ; CHECK-NEXT: No successors
 ; CHECK-EMPTY:
@@ -57,7 +61,6 @@ define void @test_tc_less_than_16(ptr %A, i64 %N) {
 ;
 ; CHECK: Executing best plan with VF=8, UF=2
 ; CHECK-NEXT: VPlan 'Final VPlan for VF={8},UF={2}' {
-; CHECK-NEXT: Live-in ir<[[VFxUF:.+]]> = VF * UF
 ; CHECK-NEXT: Live-in ir<[[VTC:%.+]]> = vector-trip-count
 ; CHECK-NEXT: vp<[[TC:%.+]]> = original trip-count
 ; CHECK-EMPTY:
@@ -69,29 +72,23 @@ define void @test_tc_less_than_16(ptr %A, i64 %N) {
 ; CHECK-NEXT: ir-bb<vector.ph>:
 ; CHECK-NEXT:  IR   %n.mod.vf = urem i64 %and, 16
 ; CHECK-NEXT:  IR   %n.vec = sub i64 %and, %n.mod.vf
-; CHECK-NEXT:  IR   %ind.end = sub i64 %and, %n.vec
-; CHECK-NEXT:  IR   %ind.end1 = getelementptr i8, ptr %A, i64 %n.vec
-; CHECK-NEXT: Successor(s): vector loop
+; CHECK-NEXT:  vp<[[END1:%.+]]> = DERIVED-IV ir<%and> + ir<[[VTC]]> * ir<-1>
+; CHECK-NEXT:  vp<[[END2:%.+]]> = DERIVED-IV ir<%A> + ir<[[VTC]]> * ir<1>
+; CHECK-NEXT: Successor(s): vector.body
 ; CHECK-EMPTY:
-; CHECK-NEXT: <x1> vector loop: {
-; CHECK-NEXT:   vector.body:
-; CHECK-NEXT:     SCALAR-PHI vp<[[CAN_IV:%.+]]> = phi ir<0>, vp<[[CAN_IV_NEXT:%.+]]>
-; CHECK-NEXT:     vp<[[STEPS1:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1>
-; CHECK-NEXT:     EMIT vp<[[PADD1:%.+]]> = ptradd ir<%A>, vp<[[STEPS1]]>
-; CHECK-NEXT:     vp<[[VPTR1:%.]]> = vector-pointer vp<[[PADD1]]>
-; CHECK-NEXT:     vp<[[VPTR2:%.]]> = vector-pointer vp<[[PADD1]]>, ir<1>
-; CHECK-NEXT:     WIDEN ir<%l> = load vp<[[VPTR1]]>
-; CHECK-NEXT:     WIDEN ir<%l>.1 = load vp<[[VPTR2]]>
-; CHECK-NEXT:     WIDEN ir<%add> = add nsw ir<%l>, ir<10>
-; CHECK-NEXT:     WIDEN ir<%add>.1 = add nsw ir<%l>.1, ir<10>
-; CHECK-NEXT:     vp<[[VPTR3:%.+]]> = vector-pointer vp<[[PADD1]]>
-; CHECK-NEXT:     vp<[[VPTR4:%.+]]> = vector-pointer vp<[[PADD1]]>, ir<1>
-; CHECK-NEXT:     WIDEN store vp<[[VPTR3]]>, ir<%add>
-; CHECK-NEXT:     WIDEN store vp<[[VPTR4]]>, ir<%add>.1
-; CHECK-NEXT:     EMIT vp<[[CAN_IV_NEXT]]> = add nuw vp<[[CAN_IV:%.+]]>, ir<[[VFxUF]]>
-; CHECK-NEXT:     EMIT branch-on-cond ir<true>
-; CHECK-NEXT:   No successors
-; CHECK-NEXT: }
+; CHECK-NEXT: vector.body:
+; CHECK-NEXT:   vp<[[STEPS1:%.+]]> = SCALAR-STEPS ir<0>, ir<1>
+; CHECK-NEXT:   EMIT vp<[[PADD1:%.+]]> = ptradd ir<%A>, vp<[[STEPS1]]>
+; CHECK-NEXT:   vp<[[VPTR1:%.]]> = vector-pointer vp<[[PADD1]]>
+; CHECK-NEXT:   vp<[[VPTR2:%.]]> = vector-pointer vp<[[PADD1]]>, ir<1>
+; CHECK-NEXT:   WIDEN ir<%l> = load vp<[[VPTR1]]>
+; CHECK-NEXT:   WIDEN ir<%l>.1 = load vp<[[VPTR2]]>
+; CHECK-NEXT:   WIDEN ir<%add> = add nsw ir<%l>, ir<10>
+; CHECK-NEXT:   WIDEN ir<%add>.1 = add nsw ir<%l>.1, ir<10>
+; CHECK-NEXT:   vp<[[VPTR3:%.+]]> = vector-pointer vp<[[PADD1]]>
+; CHECK-NEXT:   vp<[[VPTR4:%.+]]> = vector-pointer vp<[[PADD1]]>, ir<1>
+; CHECK-NEXT:   WIDEN store vp<[[VPTR3]]>, ir<%add>
+; CHECK-NEXT:   WIDEN store vp<[[VPTR4]]>, ir<%add>.1
 ; CHECK-NEXT: Successor(s): ir-bb<middle.block>
 ; CHECK-EMPTY:
 ; CHECK-NEXT: ir-bb<middle.block>:
@@ -103,8 +100,8 @@ define void @test_tc_less_than_16(ptr %A, i64 %N) {
 ; CHECK-NEXT: No successors
 ; CHECK-EMPTY:
 ; CHECK-NEXT: ir-bb<scalar.ph>:
-; CHECK-NEXT:   EMIT vp<[[RESUME1:%.+]]> = resume-phi ir<%ind.end>, ir<%and>
-; CHECK-NEXT:   EMIT vp<[[RESUME2:%.+]]>.1 = resume-phi ir<%ind.end1>, ir<%A>
+; CHECK-NEXT:   EMIT vp<[[RESUME1:%.+]]> = resume-phi vp<[[END1]]>, ir<%and>
+; CHECK-NEXT:   EMIT vp<[[RESUME2:%.+]]>.1 = resume-phi vp<[[END2]]>, ir<%A>
 ; CHECK-NEXT: Successor(s): ir-bb<loop>
 ; CHECK-EMPTY:
 ; CHECK-NEXT: ir-bb<loop>:
diff --git a/llvm/test/Transforms/LoopVectorize/vplan-printing.ll b/llvm/test/Transforms/LoopVectorize/vplan-printing.ll
index cc2bd4e..5c09ce2 100644
--- a/llvm/test/Transforms/LoopVectorize/vplan-printing.ll
+++ b/llvm/test/Transforms/LoopVectorize/vplan-printing.ll
@@ -42,10 +42,11 @@ define void @print_call_and_memory(i64 %n, ptr noalias %y, ptr noalias %x) nounw
 ; CHECK-NEXT:  Successor(s): ir-bb<for.end.loopexit>, scalar.ph
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  scalar.ph
+; CHECK-NEXT:    EMIT vp<[[RESUME_IV:%.+]]> = resume-phi vp<[[VTC]]>, ir<0>
 ; CHECK-NEXT:  Successor(s): ir-bb<for.body>
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  ir-bb<for.body>:
-; CHECK-NEXT:    IR   %iv = phi i64 [ %iv.next, %for.body ], [ 0, %for.body.preheader ]
+; CHECK-NEXT:    IR   %iv = phi i64 [ %iv.next, %for.body ], [ 0, %for.body.preheader ] (extra operand: vp<[[RESUME_IV]]> from scalar.ph)
 ; CHECK:         IR   %exitcond = icmp eq i64 %iv.next, %n
 ; CHECK-NEXT:  No successors
 ; CHECK-EMPTY:
@@ -112,10 +113,11 @@ define void @print_widen_gep_and_select(i64 %n, ptr noalias %y, ptr noalias %x,
 ; CHECK-NEXT:  Successor(s): ir-bb<for.end.loopexit>, scalar.ph
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  scalar.ph
+; CHECK-NEXT:    EMIT vp<[[RESUME_IV:%.+]]> = resume-phi vp<[[VTC]]>, ir<0>
 ; CHECK-NEXT:  Successor(s): ir-bb<for.body>
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  ir-bb<for.body>:
-; CHECK-NEXT:    IR   %iv = phi i64 [ %iv.next, %for.body ], [ 0, %for.body.preheader ]
+; CHECK-NEXT:    IR   %iv = phi i64 [ %iv.next, %for.body ], [ 0, %for.body.preheader ] (extra operand: vp<[[RESUME_IV]]> from scalar.ph)
 ; CHECK:         IR   %exitcond = icmp eq i64 %iv.next, %n
 ; CHECK-NEXT:  No successors
 ; CHECK-EMPTY:
@@ -180,11 +182,12 @@ define float @print_reduction(i64 %n, ptr noalias %y) {
 ; CHECK-NEXT: Successor(s): ir-bb<for.end>, scalar.ph
 ; CHECK-EMPTY:
 ; CHECK-NEXT: scalar.ph
+; CHECK-NEXT:   EMIT vp<[[RESUME_IV:%.+]]> = resume-phi vp<[[VTC]]>, ir<0>
 ; CHECK-NEXT:   EMIT vp<[[RED_RESUME:%.+]]> = resume-phi vp<[[RED_RES]]>, ir<0.000000e+00>
 ; CHECK-NEXT:  Successor(s): ir-bb<for.body>
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  ir-bb<for.body>:
-; CHECK-NEXT:    IR   %iv = phi i64 [ %iv.next, %for.body ], [ 0, %entry ]
+; CHECK-NEXT:    IR   %iv = phi i64 [ %iv.next, %for.body ], [ 0, %entry ] (extra operand: vp<[[RESUME_IV]]> from scalar.ph)
 ; CHECK:         IR   %exitcond = icmp eq i64 %iv.next, %n
 ; CHECK-NEXT: No successors
 ; CHECK-EMPTY:
@@ -246,11 +249,12 @@ define void @print_reduction_with_invariant_store(i64 %n, ptr noalias %y, ptr no
 ; CHECK-NEXT: Successor(s): ir-bb<for.end>, scalar.ph
 ; CHECK-EMPTY:
 ; CHECK-NEXT: scalar.ph
+; CHECK-NEXT:   EMIT vp<[[RESUME_IV:%.+]]> = resume-phi vp<[[VTC]]>, ir<0>
 ; CHECK-NEXT:   EMIT vp<[[RED_RESUME:%.+]]> = resume-phi vp<[[RED_RES]]>, ir<0.000000e+00>
 ; CHECK-NEXT:  Successor(s): ir-bb<for.body>
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  ir-bb<for.body>:
-; CHECK-NEXT:    IR   %iv = phi i64 [ %iv.next, %for.body ], [ 0, %entry ]
+; CHECK-NEXT:    IR   %iv = phi i64 [ %iv.next, %for.body ], [ 0, %entry ] (extra operand: vp<[[RESUME_IV]]> from scalar.ph)
 ; CHECK-NEXT:    IR   %red = phi float [ %red.next, %for.body ], [ 0.000000e+00, %entry ]
 ; CHECK:         IR   %exitcond = icmp eq i64 %iv.next, %n
 ; CHECK-NEXT: No successors
@@ -332,10 +336,11 @@ define void @print_replicate_predicated_phi(i64 %n, ptr %x) {
 ; CHECK-NEXT: Successor(s): ir-bb<for.end>, scalar.ph
 ; CHECK-EMPTY:
 ; CHECK-NEXT: scalar.ph
+; CHECK-NEXT:   EMIT vp<[[RESUME_IV:%.+]]> = resume-phi vp<[[VTC]]>, ir<0>
 ; CHECK-NEXT:  Successor(s): ir-bb<for.body>
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  ir-bb<for.body>:
-; CHECK-NEXT:    IR   %i = phi i64 [ 0, %entry ], [ %i.next, %for.inc ]
+; CHECK-NEXT:    IR   %i = phi i64 [ 0, %entry ], [ %i.next, %for.inc ] (extra operand: vp<[[RESUME_IV]]> from scalar.ph)
 ; CHECK-NEXT:    IR   %cmp = icmp ult i64 %i, 5
 ; CHECK-NEXT: No successors
 ; CHECK-EMPTY:
@@ -381,6 +386,7 @@ define void @print_interleave_groups(i32 %C, i32 %D) {
 ; CHECK-NEXT: Successor(s): vector.ph
 ; CHECK-EMPTY:
 ; CHECK-NEXT: vector.ph:
+; CHECK-NEXT:   vp<[[IV_END:%.+]]> = DERIVED-IV ir<0> + vp<[[VTC]]> * ir<4>
 ; CHECK-NEXT: Successor(s): vector loop
 ; CHECK-EMPTY:
 ; CHECK-NEXT: <x1> vector loop: {
@@ -412,10 +418,11 @@ define void @print_interleave_groups(i32 %C, i32 %D) {
 ; CHECK-NEXT: Successor(s): ir-bb<for.end>, scalar.ph
 ; CHECK-EMPTY:
 ; CHECK-NEXT: scalar.ph
+; CHECK-NEXT:   EMIT vp<[[RESUME_IV:%.+]]> = resume-phi vp<[[IV_END]]>, ir<0>
 ; CHECK-NEXT:  Successor(s): ir-bb<for.body>
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  ir-bb<for.body>:
-; CHECK-NEXT:    IR   %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+; CHECK-NEXT:    IR   %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] (extra operand: vp<[[RESUME_IV]]> from scalar.ph)
 ; CHECK:         IR   %cmp = icmp slt i64 %iv.next, 1024
 ; CHECK-NEXT: No successors
 ; CHECK-EMPTY:
@@ -494,12 +501,13 @@ define float @print_fmuladd_strict(ptr %a, ptr %b, i64 %n) {
 ; CHECK-NEXT: Successor(s): ir-bb<for.end>, scalar.ph
 ; CHECK-EMPTY:
 ; CHECK-NEXT: scalar.ph
+; CHECK-NEXT:   EMIT vp<[[RESUME_IV:%.+]]> = resume-phi vp<[[VTC]]>, ir<0>
 ; CHECK-NEXT:   EMIT vp<[[RED_RESUME:%.+]]> = resume-phi vp<[[RED_RES]]>, ir<0.000000e+00>
 ; CHECK-NEXT:  Successor(s): ir-bb<for.body>
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  ir-bb<for.body>:
-; CHECK-NEXT:    IR   %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
-; CHECK-NEXT:    IR   %sum.07 = phi float [ 0.000000e+00, %entry ], [ %muladd, %for.body ]
+; CHECK-NEXT:    IR   %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] (extra operand: vp<[[RESUME_IV]]> from scalar.ph)
+; CHECK-NEXT:    IR   %sum.07 = phi float [ 0.000000e+00, %entry ], [ %muladd, %for.body ] (extra operand: vp<[[RED_RESUME]]> from scalar.ph)
 ; CHECK:         IR   %exitcond.not = icmp eq i64 %iv.next, %n
 ; CHECK-NEXT: No successors
 ; CHECK-EMPTY:
@@ -588,10 +596,11 @@ define void @debug_loc_vpinstruction(ptr nocapture %asd, ptr nocapture %bsd) !db
 ; CHECK-NEXT: Successor(s): ir-bb<exit>, scalar.ph
 ; CHECK-EMPTY:
 ; CHECK-NEXT: scalar.ph:
+; CHECK-NEXT:   EMIT vp<[[RESUME_IV:%.+]]> = resume-phi vp<[[VTC]]>, ir<0>
 ; CHECK-NEXT:  Successor(s): ir-bb<loop>
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  ir-bb<loop>:
-; CHECK-NEXT:    IR   %iv = phi i64 [ 0, %entry ], [ %iv.next, %if.end ]
+; CHECK-NEXT:    IR   %iv = phi i64 [ 0, %entry ], [ %iv.next, %if.end ] (extra operand: vp<[[RESUME_IV]]> from scalar.ph)
 ; CHECK:         IR   %cmp1 = icmp slt i32 %lsd, 100
 ; CHECK-NEXT: No successors
 ; CHECK-EMPTY:
@@ -648,6 +657,7 @@ define void @print_expand_scev(i64 %y, ptr %ptr) {
 ; CHECK-NEXT: Successor(s): vector.ph
 ; CHECK-EMPTY:
 ; CHECK-NEXT: vector.ph:
+; CHECK-NEXT:   vp<[[IV_END:%.+]]> = DERIVED-IV ir<0> + vp<[[VTC]]> * vp<[[EXP_SCEV]]>
 ; CHECK-NEXT: Successor(s): vector loop
 ; CHECK-EMPTY:
 ; CHECK-NEXT: <x1> vector loop: {
@@ -671,10 +681,11 @@ define void @print_expand_scev(i64 %y, ptr %ptr) {
 ; CHECK-NEXT: Successor(s): ir-bb<loop.exit>, scalar.ph
 ; CHECK-EMPTY:
 ; CHECK-NEXT: scalar.ph
+; CHECK-NEXT:   EMIT vp<[[RESUME_IV:%.+]]> = resume-phi vp<[[IV_END]]>, ir<0>
 ; CHECK-NEXT:  Successor(s): ir-bb<loop>
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  ir-bb<loop>:
-; CHECK-NEXT:    IR   %iv = phi i64 [ %iv.next, %loop ], [ 0, %entry ]
+; CHECK-NEXT:    IR   %iv = phi i64 [ %iv.next, %loop ], [ 0, %entry ] (extra operand: vp<[[RESUME_IV]]> from scalar.ph)
 ; CHECK:         IR   %iv.next = add i64 %iv, %inc
 ; CHECK-NEXT: No successors
 ; CHECK-EMPTY:
@@ -738,10 +749,11 @@ define i32 @print_exit_value(ptr %ptr, i32 %off) {
 ; CHECK-NEXT: Successor(s): ir-bb<exit>, scalar.ph
 ; CHECK-EMPTY:
 ; CHECK-NEXT: scalar.ph
+; CHECK-NEXT:   EMIT vp<[[RESUME_IV:%.+]]> = resume-phi vp<[[VTC]]>, ir<0>
 ; CHECK-NEXT:  Successor(s): ir-bb<loop>
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  ir-bb<loop>:
-; CHECK-NEXT:    IR   %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ]
+; CHECK-NEXT:    IR   %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ] (extra operand: vp<[[RESUME_IV]]> from scalar.ph)
 ; CHECK:         IR   %ec = icmp eq i32 %iv.next, 1000
 ; CHECK-NEXT: No successors
 ; CHECK-EMPTY:
@@ -805,6 +817,7 @@ define void @print_fast_math_flags(i64 %n, ptr noalias %y, ptr noalias %x, ptr %
 ; CHECK-NEXT: Successor(s): ir-bb<exit>, scalar.ph
 ; CHECK-EMPTY:
 ; CHECK-NEXT: scalar.ph
+; CHECK-NEXT:   EMIT vp<[[RESUME_IV:%.+]]> = resume-phi vp<[[VTC]]>, ir<0>
 ; CHECK-NEXT:  Successor(s): ir-bb<loop>
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  ir-bb<loop>:
@@ -873,10 +886,11 @@ define void @print_exact_flags(i64 %n, ptr noalias %x) {
 ; CHECK-NEXT: Successor(s): ir-bb<exit>, scalar.ph
 ; CHECK-EMPTY:
 ; CHECK-NEXT: scalar.ph
+; CHECK-NEXT:   EMIT vp<[[RESUME_IV:%.+]]> = resume-phi vp<[[VTC]]>, ir<0>
 ; CHECK-NEXT:  Successor(s): ir-bb<loop>
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  ir-bb<loop>:
-; CHECK-NEXT:    IR   %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+; CHECK-NEXT:    IR   %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] (extra operand: vp<[[RESUME_IV]]> from scalar.ph)
 ; CHECK:         IR   %exitcond = icmp eq i64 %iv.next, %n
 ; CHECK-NEXT: No successors
 ; CHECK-EMPTY:
@@ -961,10 +975,11 @@ define void @print_call_flags(ptr readonly %src, ptr noalias %dest, i64 %n) {
 ; CHECK-NEXT: Successor(s): ir-bb<end>, scalar.ph
 ; CHECK-EMPTY:
 ; CHECK-NEXT: scalar.ph
+; CHECK-NEXT:   EMIT vp<[[RESUME_IV:%.+]]> = resume-phi vp<[[VTC]]>, ir<0>
 ; CHECK-NEXT:  Successor(s): ir-bb<for.body>
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  ir-bb<for.body>:
-; CHECK-NEXT:    IR   %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.loop ]
+; CHECK-NEXT:    IR   %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.loop ] (extra operand: vp<[[RESUME_IV]]> from scalar.ph)
 ; CHECK:         IR   %ifcond = fcmp oeq float %ld.value, 5.0
 ; CHECK-NEXT: No successors
 ; CHECK-EMPTY:
@@ -1038,10 +1053,11 @@ define void @print_disjoint_flags(i64 %n, ptr noalias %x) {
 ; CHECK-NEXT: Successor(s): ir-bb<exit>, scalar.ph
 ; CHECK-EMPTY:
 ; CHECK-NEXT: scalar.ph
+; CHECK-NEXT:   EMIT vp<[[RESUME_IV:%.+]]> = resume-phi vp<[[VTC]]>, ir<0>
 ; CHECK-NEXT:  Successor(s): ir-bb<loop>
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  ir-bb<loop>:
-; CHECK-NEXT:    IR   %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+; CHECK-NEXT:    IR   %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] (extra operand: vp<[[RESUME_IV]]> from scalar.ph)
 ; CHECK:         IR   %exitcond = icmp eq i64 %iv.next, %n
 ; CHECK-NEXT: No successors
 ; CHECK-EMPTY:
@@ -1152,11 +1168,12 @@ define i16 @print_first_order_recurrence_and_result(ptr %ptr) {
 ; CHECK-EMPTY:
 ; CHECK-NEXT: scalar.ph
 ; CHECK-NEXT:   EMIT vp<[[RESUME_P:%.*]]> = resume-phi vp<[[RESUME_1]]>, ir<22>
+; CHECK-NEXT:   EMIT vp<[[RESUME_IV:%.+]]> = resume-phi vp<[[VTC]]>, ir<0>
 ; CHECK-NEXT:  Successor(s): ir-bb<loop>
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  ir-bb<loop>:
 ; CHECK-NEXT:    IR   %for.1 = phi i16 [ 22, %entry ], [ %for.1.next, %loop ] (extra operand: vp<[[RESUME_P]]> from scalar.ph)
-; CHECK-NEXT:    IR   %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+; CHECK-NEXT:    IR   %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] (extra operand: vp<[[RESUME_IV]]> from scalar.ph)
 ; CHECK:         IR   %exitcond.not = icmp eq i64 %iv.next, 1000
 ; CHECK-NEXT: No successors
 ; CHECK-EMPTY:
diff --git a/llvm/test/Transforms/LoopVectorize/vplan-sink-scalars-and-merge-vf1.ll b/llvm/test/Transforms/LoopVectorize/vplan-sink-scalars-and-merge-vf1.ll
index a939b1e..b6391e0 100644
--- a/llvm/test/Transforms/LoopVectorize/vplan-sink-scalars-and-merge-vf1.ll
+++ b/llvm/test/Transforms/LoopVectorize/vplan-sink-scalars-and-merge-vf1.ll
@@ -16,6 +16,7 @@ define void @sink_with_sideeffects(i1 %c, ptr %ptr) {
 ; CHECK-NEXT: Successor(s): vector.ph
 ; CHECK-EMPTY:
 ; CHECK-NEXT: vector.ph:
+; CHECK-NEXT:   vp<[[END:%.+]]> = DERIVED-IV ir<0> + vp<[[VEC_TC]]> * ir<-1>
 ; CHECK-NEXT: Successor(s): vector loop
 ; CHECK-EMPTY:
 ; CHECK-NEXT: <x1> vector loop: {
@@ -53,11 +54,13 @@ define void @sink_with_sideeffects(i1 %c, ptr %ptr) {
 ; CHECK-NEXT:  Successor(s): ir-bb<for.end>, scalar.ph
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  scalar.ph:
+; CHECK-NEXT:    EMIT vp<[[RESUME1:%.+]]> = resume-phi vp<[[VEC_TC]]>, ir<0>
+; CHECK-NEXT:    EMIT vp<[[RESUME2:%.+]]>.1 = resume-phi vp<[[END]]>, ir<0>
 ; CHECK-NEXT:  Successor(s): ir-bb<for.body>
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  ir-bb<for.body>:
-; CHECK-NEXT:    IR   %tmp0 = phi i64 [ %tmp6, %for.inc ], [ 0, %entry ]
-; CHECK-NEXT:    IR   %tmp1 = phi i64 [ %tmp7, %for.inc ], [ 0, %entry ]
+; CHECK-NEXT:    IR   %tmp0 = phi i64 [ %tmp6, %for.inc ], [ 0, %entry ] (extra operand: vp<[[RESUME1]]> from scalar.ph)
+; CHECK-NEXT:    IR   %tmp1 = phi i64 [ %tmp7, %for.inc ], [ 0, %entry ] (extra operand: vp<[[RESUME2]]>.1 from scalar.ph)
 ; CHECK:         IR   %tmp5 = trunc i32 %tmp4 to i8
 ; CHECK-NEXT:  No successors
 ; CHECK-EMPTY:
diff --git a/llvm/test/Transforms/LoopVectorize/vplan-sink-scalars-and-merge.ll b/llvm/test/Transforms/LoopVectorize/vplan-sink-scalars-and-merge.ll
index 5343451..aa05bb1 100644
--- a/llvm/test/Transforms/LoopVectorize/vplan-sink-scalars-and-merge.ll
+++ b/llvm/test/Transforms/LoopVectorize/vplan-sink-scalars-and-merge.ll
@@ -262,6 +262,7 @@ define void @uniform_gep(i64 %k, ptr noalias %A, ptr noalias %B) {
 ; CHECK-NEXT: Successor(s): vector.ph
 ; CHECK-EMPTY:
 ; CHECK-NEXT: vector.ph:
+; CHECK-NEXT:   vp<[[END:%.+]]> = DERIVED-IV ir<21> + vp<[[VEC_TC]]> * ir<1>
 ; CHECK-NEXT:   CLONE ir<%gep.A.uniform> = getelementptr inbounds ir<%A>, ir<0>
 ; CHECK-NEXT: Successor(s): vector loop
 ; CHECK-EMPTY:
@@ -1046,6 +1047,7 @@ define void @merge_with_dead_gep_between_regions(i32 %n, ptr noalias %src, ptr n
 ; CHECK-NEXT: Successor(s): vector.ph
 ; CHECK-EMPTY:
 ; CHECK-NEXT: vector.ph:
+; CHECK-NEXT:   vp<[[END:%.+]]> = DERIVED-IV ir<%n> + vp<[[VEC_TC]]> * ir<-1>
 ; CHECK-NEXT: Successor(s): vector loop
 ; CHECK-EMPTY:
 ; CHECK-NEXT: <x1> vector loop: {
@@ -1086,10 +1088,11 @@ define void @merge_with_dead_gep_between_regions(i32 %n, ptr noalias %src, ptr n
 ; CHECK-NEXT: Successor(s): ir-bb<exit>, scalar.ph
 ; CHECK-EMPTY:
 ; CHECK-NEXT: scalar.ph
+; CHECK-NEXT:   EMIT vp<[[RESUME:%.+]]> = resume-phi vp<[[END]]>, ir<%n>
 ; CHECK-NEXT: Successor(s): ir-bb<loop>
 ; CHECK-EMPTY:
 ; CHECK-NEXT: ir-bb<loop>:
-; CHECK-NEXT:   IR   %iv = phi i32 [ %n, %entry ], [ %iv.next, %loop ]
+; CHECK-NEXT:   IR   %iv = phi i32 [ %n, %entry ], [ %iv.next, %loop ] (extra operand: vp<[[RESUME]]> from scalar.ph)
 ; CHECK-NEXT:   IR   %iv.next = add nsw i32 %iv, -1
 ; CHECK-NEXT:   IR   %gep.src = getelementptr inbounds i32, ptr %src, i32 %iv
 ; CHECK-NEXT:   IR   %l = load i32, ptr %gep.src, align 16
@@ -1134,6 +1137,7 @@ define void @ptr_induction_remove_dead_recipe(ptr %start, ptr %end) {
 ; CHECK-NEXT: Successor(s): vector.ph
 ; CHECK-EMPTY:
 ; CHECK-NEXT: vector.ph:
+; CHECK-NEXT:   vp<[[END:%.+]]> = DERIVED-IV ir<%start> + vp<[[VEC_TC]]> * ir<-1>
 ; CHECK-NEXT: Successor(s): vector loop
 ; CHECK-EMPTY:
 ; CHECK-NEXT: <x1> vector loop: {
@@ -1177,10 +1181,11 @@ define void @ptr_induction_remove_dead_recipe(ptr %start, ptr %end) {
 ; CHECK-NEXT: Successor(s): ir-bb<exit>, scalar.ph
 ; CHECK-EMPTY:
 ; CHECK-NEXT: scalar.ph:
+; CHECK-NEXT:   EMIT vp<[[RESUME:%.+]]> = resume-phi vp<[[END]]>, ir<%start>
 ; CHECK-NEXT: Successor(s): ir-bb<loop.header>
 ; CHECK-EMPTY:
 ; CHECK-NEXT: ir-bb<loop.header>:
-; CHECK-NEXT:   IR   %ptr.iv = phi ptr [ %start, %entry ], [ %ptr.iv.next, %loop.latch ]
+; CHECK-NEXT:   IR   %ptr.iv = phi ptr [ %start, %entry ], [ %ptr.iv.next, %loop.latch ] (extra operand: vp<[[RESUME]]> from scalar.ph)
 ; CHECK-NEXT:   IR   %ptr.iv.next = getelementptr inbounds i8, ptr %ptr.iv, i64 -1
 ; CHECK-NEXT:   IR   %l = load i8, ptr %ptr.iv.next, align 1
 ; CHECK-NEXT:   IR   %c.1 = icmp eq i8 %l, 0
diff --git a/llvm/test/Transforms/MemCpyOpt/memcpy.ll b/llvm/test/Transforms/MemCpyOpt/memcpy.ll
index 39b90ad..65d78f4 100644
--- a/llvm/test/Transforms/MemCpyOpt/memcpy.ll
+++ b/llvm/test/Transforms/MemCpyOpt/memcpy.ll
@@ -803,6 +803,19 @@ define void @byval_param_noalias_metadata(ptr align 4 byval(i32) %ptr) {
   ret void
 }
 
+define void @byval_param_profile_metadata(ptr align 4 byval(i32) %ptr) {
+; CHECK-LABEL: @byval_param_profile_metadata(
+; CHECK-NEXT:    store i32 1, ptr [[PTR2:%.*]], align 4
+; CHECK-NEXT:    call void @f_byval(ptr byval(i32) align 4 [[PTR2]]), !prof [[PROF3:![0-9]+]], !memprof [[META4:![0-9]+]], !callsite [[META7:![0-9]+]]
+; CHECK-NEXT:    ret void
+;
+  %tmp = alloca i32, align 4
+  store i32 1, ptr %ptr
+  call void @llvm.memcpy.p0.p0.i64(ptr align 4 %tmp, ptr align 4 %ptr, i64 4, i1 false)
+  call void @f_byval(ptr align 4 byval(i32) %tmp), !memprof !3, !callsite !6, !prof !7
+  ret void
+}
+
 define void @memcpy_memory_none(ptr %p, ptr %p2, i64 %size) {
 ; CHECK-LABEL: @memcpy_memory_none(
 ; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr [[P:%.*]], ptr [[P2:%.*]], i64 [[SIZE:%.*]], i1 false) #[[ATTR7:[0-9]+]]
@@ -897,3 +910,8 @@ define void @memcpy_immut_escape_after(ptr align 4 noalias %val) {
 !0 = !{!0}
 !1 = !{!1, !0}
 !2 = !{!1}
+!3 = !{!4}
+!4 = !{!5, !"cold"}
+!5 = !{i64 123, i64 456}
+!6 = !{i64 123}
+!7 = !{!"branch_weights", i32 10}
diff --git a/llvm/test/Transforms/MemProfContextDisambiguation/recursive.ll b/llvm/test/Transforms/MemProfContextDisambiguation/recursive.ll
new file mode 100644
index 0000000..759d511
--- /dev/null
+++ b/llvm/test/Transforms/MemProfContextDisambiguation/recursive.ll
@@ -0,0 +1,159 @@
+;; Test recursion handling during cloning.
+;;
+;; Original code looks like:
+;;
+;; #include <stdlib.h>
+;; #include <string.h>
+;; #include <unistd.h>
+;; __attribute((noinline)) char *D() {
+;;   return new char[10];
+;; }
+;; __attribute((noinline)) char *B(int n);
+;; __attribute((noinline)) char *C(int n) {
+;;   if (!n) {
+;;     return D();
+;;   }
+;;   return B(n-1);
+;; }
+;; __attribute((noinline)) char *B(int n) {
+;;   return C(n);
+;; }
+;; int main(int argc, char **argv) {
+;;   char *x = B(1);
+;;   char *y = B(1);
+;;   char *z = B(0);
+;;   memset(x, 0, 10);
+;;   memset(y, 0, 10);
+;;   memset(z, 0, 10);
+;;   free(x);
+;;   sleep(200);
+;;   free(y);
+;;   free(z);
+;;   return 0;
+;; }
+;;
+;; The IR was then reduced using llvm-reduce with the expected FileCheck input.
+
+;; By default we should enable cloning of contexts involved with recursive
+;; cycles, but not through the cycle itself. I.e. until full support for
+;; recursion is added, the cloned recursive call from C back to B (line 12) will
+;; not be updated to call a clone.
+; RUN: opt -passes=memprof-context-disambiguation -supports-hot-cold-new \
+; RUN:  -memprof-verify-ccg -memprof-verify-nodes \
+; RUN:  -pass-remarks=memprof-context-disambiguation \
+; RUN:  %s -S 2>&1 | FileCheck %s \
+; RUN:  --implicit-check-not "memprof_recursive3.cc:12:10: call in clone _Z1Ci.memprof.1 assigned" \
+; RUN:  --check-prefix=ALL --check-prefix=ALLOW-RECUR-CALLSITES --check-prefix=ALLOW-RECUR-CONTEXTS
+
+;; Skipping recursive callsites should result in no cloning.
+; RUN: opt -passes=memprof-context-disambiguation -supports-hot-cold-new \
+; RUN:  -memprof-verify-ccg -memprof-verify-nodes \
+; RUN:  -pass-remarks=memprof-context-disambiguation \
+; RUN:	-memprof-allow-recursive-callsites=false \
+; RUN:  %s -S 2>&1 | FileCheck %s \
+; RUN:  --implicit-check-not "memprof_recursive3.cc:12:10: call in clone _Z1Ci.memprof.1 assigned" \
+; RUN:  --implicit-check-not="created clone" \
+; RUN:	--implicit-check-not="marked with memprof allocation attribute cold" \
+; RUN:  --check-prefix=ALL
+
+;; Skipping recursive contexts should prevent spurious call to cloned version of
+;; B from the context starting at memprof_recursive.cc:19:13, which is actually
+;; recursive (until that support is added).
+; RUN: opt -passes=memprof-context-disambiguation -supports-hot-cold-new \
+; RUN:  -memprof-verify-ccg -memprof-verify-nodes \
+; RUN:  -pass-remarks=memprof-context-disambiguation \
+; RUN:	-memprof-allow-recursive-contexts=false \
+; RUN:  %s -S 2>&1 | FileCheck %s \
+; RUN:  --implicit-check-not "memprof_recursive3.cc:12:10: call in clone _Z1Ci.memprof.1 assigned" \
+; RUN:  --check-prefix=ALL --check-prefix=ALLOW-RECUR-CALLSITES --check-prefix=SKIP-RECUR-CONTEXTS
+
+; ALLOW-RECUR-CALLSITES: memprof_recursive.cc:4:0: created clone _Z1Dv.memprof.1
+; ALLOW-RECUR-CALLSITES: memprof_recursive.cc:8:0: created clone _Z1Ci.memprof.1
+; ALLOW-RECUR-CALLSITES: memprof_recursive.cc:14:0: created clone _Z1Bi.memprof.1
+; ALLOW-RECUR-CALLSITES: memprof_recursive.cc:20:13: call in clone main assigned to call function clone _Z1Bi.memprof.1
+;; We should only call the cold clone for the recursive context if we enabled
+;; recursive contexts via -memprof-allow-recursive-contexts=true (default).
+; ALLOW-RECUR-CONTEXTS: memprof_recursive.cc:19:13: call in clone main assigned to call function clone _Z1Bi.memprof.1
+; ALLOW-RECUR-CALLSITES: memprof_recursive.cc:15:10: call in clone _Z1Bi.memprof.1 assigned to call function clone _Z1Ci.memprof.1
+; ALLOW-RECUR-CALLSITES: memprof_recursive.cc:10:12: call in clone _Z1Ci.memprof.1 assigned to call function clone _Z1Dv.memprof.1
+; ALLOW-RECUR-CALLSITES: memprof_recursive.cc:5:10: call in clone _Z1Dv.memprof.1 marked with memprof allocation attribute cold
+;; We should call the original B for the recursive context if we have
+;; disabled recursive contexts via -memprof-allow-recursive-contexts=false.
+; SKIP-RECUR-CONTEXTS: memprof_recursive.cc:19:13: call in clone main assigned to call function clone _Z1Bi
+; ALLOW-RECUR-CALLSITES: memprof_recursive.cc:12:10: call in clone _Z1Ci assigned to call function clone _Z1Bi
+; ALLOW-RECUR-CALLSITES: memprof_recursive.cc:18:13: call in clone main assigned to call function clone _Z1Bi
+; ALLOW-RECUR-CALLSITES: memprof_recursive.cc:15:10: call in clone _Z1Bi assigned to call function clone _Z1Ci
+; ALLOW-RECUR-CALLSITES: memprof_recursive.cc:10:12: call in clone _Z1Ci assigned to call function clone _Z1Dv
+; ALL: memprof_recursive.cc:5:10: call in clone _Z1Dv marked with memprof allocation attribute notcold
+
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+define ptr @_Z1Dv() !dbg !3 {
+entry:
+  %call = tail call ptr @_Znam(i64 10), !dbg !6, !memprof !7, !callsite !14
+  ret ptr null
+}
+
+define ptr @_Z1Ci(i32 %n) !dbg !15 {
+entry:
+  %call = tail call ptr @_Z1Dv(), !dbg !16, !callsite !17
+  br label %return
+
+if.end:                                           ; No predecessors!
+  %call1 = tail call ptr @_Z1Bi(i32 0), !dbg !18, !callsite !19
+  br label %return
+
+return:                                           ; preds = %if.end, %entry
+  ret ptr null
+}
+
+define ptr @_Z1Bi(i32 %n) !dbg !20 {
+entry:
+  %call = tail call ptr @_Z1Ci(i32 0), !dbg !21, !callsite !22
+  ret ptr null
+}
+
+define i32 @main() {
+entry:
+  %call = tail call ptr @_Z1Bi(i32 0), !dbg !23, !callsite !25
+  %call1 = tail call ptr @_Z1Bi(i32 0), !dbg !26, !callsite !27
+  %call2 = tail call ptr @_Z1Bi(i32 0), !dbg !28, !callsite !29
+  ret i32 0
+}
+
+declare ptr @_Znam(i64)
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!2}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: !1, producer: "clang version 20.0.0git (https://github.com/llvm/llvm-project.git 7aec6dc477f8148ed066d10dfc7a012a51b6599c)", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly, splitDebugInlining: false, debugInfoForProfiling: true, nameTableKind: None)
+!1 = !DIFile(filename: "memprof_recursive.cc", directory: ".", checksumkind: CSK_MD5, checksum: "2f15f63b187a0e0d40e7fdd18b10576a")
+!2 = !{i32 2, !"Debug Info Version", i32 3}
+!3 = distinct !DISubprogram(name: "D", linkageName: "_Z1Dv", scope: !1, file: !1, line: 4, type: !4, scopeLine: 4, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0)
+!4 = !DISubroutineType(types: !5)
+!5 = !{}
+!6 = !DILocation(line: 5, column: 10, scope: !3)
+!7 = !{!8, !10, !12}
+!8 = !{!9, !"cold"}
+!9 = !{i64 6541423618768552252, i64 -200552803509692312, i64 -2954124005641725917, i64 6307901912192269588}
+!10 = !{!11, !"notcold"}
+!11 = !{i64 6541423618768552252, i64 -200552803509692312, i64 -2954124005641725917, i64 -7155190423157709404, i64 -2954124005641725917, i64 8632435727821051414}
+!12 = !{!13, !"cold"}
+!13 = !{i64 6541423618768552252, i64 -200552803509692312, i64 -2954124005641725917, i64 -7155190423157709404, i64 -2954124005641725917, i64 -3421689549917153178}
+!14 = !{i64 6541423618768552252}
+!15 = distinct !DISubprogram(name: "C", linkageName: "_Z1Ci", scope: !1, file: !1, line: 8, type: !4, scopeLine: 8, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0)
+!16 = !DILocation(line: 10, column: 12, scope: !15)
+!17 = !{i64 -200552803509692312}
+!18 = !DILocation(line: 12, column: 10, scope: !15)
+!19 = !{i64 -7155190423157709404}
+!20 = distinct !DISubprogram(name: "B", linkageName: "_Z1Bi", scope: !1, file: !1, line: 14, type: !4, scopeLine: 14, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0)
+!21 = !DILocation(line: 15, column: 10, scope: !20)
+!22 = !{i64 -2954124005641725917}
+!23 = !DILocation(line: 18, column: 13, scope: !24)
+!24 = distinct !DISubprogram(name: "main", scope: !1, file: !1, line: 17, type: !4, scopeLine: 17, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0)
+!25 = !{i64 8632435727821051414}
+!26 = !DILocation(line: 19, column: 13, scope: !24)
+!27 = !{i64 -3421689549917153178}
+!28 = !DILocation(line: 20, column: 13, scope: !24)
+!29 = !{i64 6307901912192269588}
diff --git a/llvm/test/Transforms/PhaseOrdering/AArch64/block_scaling_decompr_8bit.ll b/llvm/test/Transforms/PhaseOrdering/AArch64/block_scaling_decompr_8bit.ll
new file mode 100644
index 0000000..9f3e09d
--- /dev/null
+++ b/llvm/test/Transforms/PhaseOrdering/AArch64/block_scaling_decompr_8bit.ll
@@ -0,0 +1,806 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -passes="default<O3>" -S %s | FileCheck %s
+
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32"
+target triple = "aarch64"
+
+%struct.cmplx_int16_t = type { i16, i16 }
+%struct.compressed_data_8bit = type { i8, [24 x i8] }
+
+define dso_local noundef i32 @_Z33block_scaling_decompr_8bitjPK27compressed_data_8bitP20cmplx_int16_tPKS2_(i32 noundef %n_prb, ptr noundef %src, ptr noundef %dst, ptr noundef %scale) #0 {
+; CHECK-LABEL: define dso_local noundef i32 @_Z33block_scaling_decompr_8bitjPK27compressed_data_8bitP20cmplx_int16_tPKS2_(
+; CHECK-SAME: i32 noundef [[N_PRB:%.*]], ptr nocapture noundef readonly [[SRC:%.*]], ptr nocapture noundef writeonly [[DST:%.*]], ptr noundef readonly [[SCALE:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[CMP47_NOT:%.*]] = icmp eq i32 [[N_PRB]], 0
+; CHECK-NEXT:    br i1 [[CMP47_NOT]], label %[[FOR_END:.*]], label %[[FOR_BODY_LR_PH:.*]]
+; CHECK:       [[FOR_BODY_LR_PH]]:
+; CHECK-NEXT:    [[CMP31_NOT:%.*]] = icmp eq ptr [[SCALE]], null
+; CHECK-NEXT:    [[WIDE_TRIP_COUNT58:%.*]] = zext i32 [[N_PRB]] to i64
+; CHECK-NEXT:    br i1 [[CMP31_NOT]], label %[[FOR_BODY_US:.*]], label %[[FOR_BODY:.*]]
+; CHECK:       [[FOR_BODY_US]]:
+; CHECK-NEXT:    [[INDVARS_IV55:%.*]] = phi i64 [ [[INDVARS_IV_NEXT56:%.*]], %[[FOR_BODY_US]] ], [ 0, %[[FOR_BODY_LR_PH]] ]
+; CHECK-NEXT:    [[DST_ADDR_052_US:%.*]] = phi ptr [ [[DST_ADDR_1_US:%.*]], %[[FOR_BODY_US]] ], [ [[DST]], %[[FOR_BODY_LR_PH]] ]
+; CHECK-NEXT:    [[ARRAYIDX_US:%.*]] = getelementptr inbounds nuw [[STRUCT_COMPRESSED_DATA_8BIT:%.*]], ptr [[SRC]], i64 [[INDVARS_IV55]]
+; CHECK-NEXT:    [[MANTISSA_US:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX_US]], i64 1
+; CHECK-NEXT:    [[TMP0:%.*]] = load <8 x i8>, ptr [[MANTISSA_US]], align 1
+; CHECK-NEXT:    [[VMOVL_I59_US:%.*]] = sext <8 x i8> [[TMP0]] to <8 x i16>
+; CHECK-NEXT:    [[ARRAYIDX7_US:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX_US]], i64 9
+; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i8>, ptr [[ARRAYIDX7_US]], align 1
+; CHECK-NEXT:    [[VMOVL_I56_US:%.*]] = sext <8 x i8> [[TMP1]] to <8 x i16>
+; CHECK-NEXT:    [[ARRAYIDX15_US:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX_US]], i64 17
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i8>, ptr [[ARRAYIDX15_US]], align 1
+; CHECK-NEXT:    [[VMOVL_I_US:%.*]] = sext <8 x i8> [[TMP2]] to <8 x i16>
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr [[ARRAYIDX_US]], align 1
+; CHECK-NEXT:    [[CONV_US:%.*]] = sext i8 [[TMP3]] to i16
+; CHECK-NEXT:    [[MUL_US:%.*]] = shl nsw i16 [[CONV_US]], 1
+; CHECK-NEXT:    [[VECINIT_I79_US:%.*]] = insertelement <8 x i16> poison, i16 [[MUL_US]], i64 0
+; CHECK-NEXT:    [[VECINIT7_I86_US:%.*]] = shufflevector <8 x i16> [[VECINIT_I79_US]], <8 x i16> poison, <8 x i32> zeroinitializer
+; CHECK-NEXT:    [[MUL_I87_US:%.*]] = mul <8 x i16> [[VECINIT7_I86_US]], [[VMOVL_I59_US]]
+; CHECK-NEXT:    [[MUL_I74_US:%.*]] = mul <8 x i16> [[VECINIT7_I86_US]], [[VMOVL_I56_US]]
+; CHECK-NEXT:    [[MUL_I_US:%.*]] = mul <8 x i16> [[VECINIT7_I86_US]], [[VMOVL_I_US]]
+; CHECK-NEXT:    store <8 x i16> [[MUL_I87_US]], ptr [[DST_ADDR_052_US]], align 2
+; CHECK-NEXT:    [[ADD_PTR47_US:%.*]] = getelementptr inbounds nuw i8, ptr [[DST_ADDR_052_US]], i64 16
+; CHECK-NEXT:    store <8 x i16> [[MUL_I74_US]], ptr [[ADD_PTR47_US]], align 2
+; CHECK-NEXT:    [[ADD_PTR50_US:%.*]] = getelementptr inbounds nuw i8, ptr [[DST_ADDR_052_US]], i64 32
+; CHECK-NEXT:    store <8 x i16> [[MUL_I_US]], ptr [[ADD_PTR50_US]], align 2
+; CHECK-NEXT:    [[DST_ADDR_1_US]] = getelementptr inbounds nuw i8, ptr [[DST_ADDR_052_US]], i64 48
+; CHECK-NEXT:    [[INDVARS_IV_NEXT56]] = add nuw nsw i64 [[INDVARS_IV55]], 1
+; CHECK-NEXT:    [[EXITCOND59_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT56]], [[WIDE_TRIP_COUNT58]]
+; CHECK-NEXT:    br i1 [[EXITCOND59_NOT]], label %[[FOR_END]], label %[[FOR_BODY_US]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK:       [[FOR_BODY]]:
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ], [ 0, %[[FOR_BODY_LR_PH]] ]
+; CHECK-NEXT:    [[DST_ADDR_052:%.*]] = phi ptr [ [[DST_ADDR_1:%.*]], %[[FOR_BODY]] ], [ [[DST]], %[[FOR_BODY_LR_PH]] ]
+; CHECK-NEXT:    [[AGG_TMP_COERCE_050:%.*]] = phi i64 [ [[AGG_TMP_COERCE_0_INSERT_INSERT:%.*]], %[[FOR_BODY]] ], [ undef, %[[FOR_BODY_LR_PH]] ]
+; CHECK-NEXT:    [[AGG_TMP42_COERCE_049:%.*]] = phi i64 [ [[AGG_TMP42_COERCE_0_INSERT_INSERT:%.*]], %[[FOR_BODY]] ], [ undef, %[[FOR_BODY_LR_PH]] ]
+; CHECK-NEXT:    [[AGG_TMP37_COERCE_048:%.*]] = phi i64 [ [[AGG_TMP37_COERCE_0_INSERT_INSERT:%.*]], %[[FOR_BODY]] ], [ undef, %[[FOR_BODY_LR_PH]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [[STRUCT_COMPRESSED_DATA_8BIT]], ptr [[SRC]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[MANTISSA:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX]], i64 1
+; CHECK-NEXT:    [[TMP4:%.*]] = load <8 x i8>, ptr [[MANTISSA]], align 1
+; CHECK-NEXT:    [[VMOVL_I59:%.*]] = sext <8 x i8> [[TMP4]] to <8 x i16>
+; CHECK-NEXT:    [[ARRAYIDX7:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX]], i64 9
+; CHECK-NEXT:    [[TMP5:%.*]] = load <8 x i8>, ptr [[ARRAYIDX7]], align 1
+; CHECK-NEXT:    [[VMOVL_I56:%.*]] = sext <8 x i8> [[TMP5]] to <8 x i16>
+; CHECK-NEXT:    [[ARRAYIDX15:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX]], i64 17
+; CHECK-NEXT:    [[TMP6:%.*]] = load <8 x i8>, ptr [[ARRAYIDX15]], align 1
+; CHECK-NEXT:    [[VMOVL_I:%.*]] = sext <8 x i8> [[TMP6]] to <8 x i16>
+; CHECK-NEXT:    [[TMP7:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT:    [[CONV:%.*]] = sext i8 [[TMP7]] to i16
+; CHECK-NEXT:    [[MUL:%.*]] = shl nsw i16 [[CONV]], 1
+; CHECK-NEXT:    [[VECINIT_I79:%.*]] = insertelement <8 x i16> poison, i16 [[MUL]], i64 0
+; CHECK-NEXT:    [[VECINIT7_I86:%.*]] = shufflevector <8 x i16> [[VECINIT_I79]], <8 x i16> poison, <8 x i32> zeroinitializer
+; CHECK-NEXT:    [[MUL_I87:%.*]] = mul <8 x i16> [[VECINIT7_I86]], [[VMOVL_I59]]
+; CHECK-NEXT:    [[MUL_I74:%.*]] = mul <8 x i16> [[VECINIT7_I86]], [[VMOVL_I56]]
+; CHECK-NEXT:    [[MUL_I:%.*]] = mul <8 x i16> [[VECINIT7_I86]], [[VMOVL_I]]
+; CHECK-NEXT:    [[AGG_TMP_SROA_0_0_COPYLOAD:%.*]] = load i32, ptr [[SCALE]], align 2
+; CHECK-NEXT:    [[AGG_TMP_COERCE_0_INSERT_EXT:%.*]] = zext i32 [[AGG_TMP_SROA_0_0_COPYLOAD]] to i64
+; CHECK-NEXT:    [[AGG_TMP_COERCE_0_INSERT_MASK:%.*]] = and i64 [[AGG_TMP_COERCE_050]], -4294967296
+; CHECK-NEXT:    [[AGG_TMP_COERCE_0_INSERT_INSERT]] = or disjoint i64 [[AGG_TMP_COERCE_0_INSERT_MASK]], [[AGG_TMP_COERCE_0_INSERT_EXT]]
+; CHECK-NEXT:    [[CALL33:%.*]] = tail call fastcc noundef <8 x i16> @_ZL24cmplx_mul_combined_re_im11__Int16x8_t20cmplx_int16_t(<8 x i16> noundef [[MUL_I87]], i64 [[AGG_TMP_COERCE_0_INSERT_INSERT]])
+; CHECK-NEXT:    store <8 x i16> [[CALL33]], ptr [[DST_ADDR_052]], align 2
+; CHECK-NEXT:    [[AGG_TMP37_SROA_0_0_COPYLOAD:%.*]] = load i32, ptr [[SCALE]], align 2
+; CHECK-NEXT:    [[AGG_TMP37_COERCE_0_INSERT_EXT:%.*]] = zext i32 [[AGG_TMP37_SROA_0_0_COPYLOAD]] to i64
+; CHECK-NEXT:    [[AGG_TMP37_COERCE_0_INSERT_MASK:%.*]] = and i64 [[AGG_TMP37_COERCE_048]], -4294967296
+; CHECK-NEXT:    [[AGG_TMP37_COERCE_0_INSERT_INSERT]] = or disjoint i64 [[AGG_TMP37_COERCE_0_INSERT_MASK]], [[AGG_TMP37_COERCE_0_INSERT_EXT]]
+; CHECK-NEXT:    [[CALL38:%.*]] = tail call fastcc noundef <8 x i16> @_ZL24cmplx_mul_combined_re_im11__Int16x8_t20cmplx_int16_t(<8 x i16> noundef [[MUL_I74]], i64 [[AGG_TMP37_COERCE_0_INSERT_INSERT]])
+; CHECK-NEXT:    [[ARRAYIDX39:%.*]] = getelementptr inbounds nuw i8, ptr [[DST_ADDR_052]], i64 16
+; CHECK-NEXT:    store <8 x i16> [[CALL38]], ptr [[ARRAYIDX39]], align 2
+; CHECK-NEXT:    [[AGG_TMP42_SROA_0_0_COPYLOAD:%.*]] = load i32, ptr [[SCALE]], align 2
+; CHECK-NEXT:    [[AGG_TMP42_COERCE_0_INSERT_EXT:%.*]] = zext i32 [[AGG_TMP42_SROA_0_0_COPYLOAD]] to i64
+; CHECK-NEXT:    [[AGG_TMP42_COERCE_0_INSERT_MASK:%.*]] = and i64 [[AGG_TMP42_COERCE_049]], -4294967296
+; CHECK-NEXT:    [[AGG_TMP42_COERCE_0_INSERT_INSERT]] = or disjoint i64 [[AGG_TMP42_COERCE_0_INSERT_MASK]], [[AGG_TMP42_COERCE_0_INSERT_EXT]]
+; CHECK-NEXT:    [[CALL43:%.*]] = tail call fastcc noundef <8 x i16> @_ZL24cmplx_mul_combined_re_im11__Int16x8_t20cmplx_int16_t(<8 x i16> noundef [[MUL_I]], i64 [[AGG_TMP42_COERCE_0_INSERT_INSERT]])
+; CHECK-NEXT:    [[ARRAYIDX44:%.*]] = getelementptr inbounds nuw i8, ptr [[DST_ADDR_052]], i64 32
+; CHECK-NEXT:    store <8 x i16> [[CALL43]], ptr [[ARRAYIDX44]], align 2
+; CHECK-NEXT:    [[DST_ADDR_1]] = getelementptr inbounds nuw i8, ptr [[DST_ADDR_052]], i64 48
+; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT58]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP4]]
+; CHECK:       [[FOR_END]]:
+; CHECK-NEXT:    ret i32 0
+;
+entry:
+  %__p0.addr.i75 = alloca <8 x i16>, align 16
+  %__p1.addr.i76 = alloca i16, align 2
+  %__ret.i77 = alloca <8 x i16>, align 16
+  %.compoundliteral.i78 = alloca <8 x i16>, align 16
+  %__p0.addr.i62 = alloca <8 x i16>, align 16
+  %__p1.addr.i63 = alloca i16, align 2
+  %__ret.i64 = alloca <8 x i16>, align 16
+  %.compoundliteral.i65 = alloca <8 x i16>, align 16
+  %__p0.addr.i60 = alloca <8 x i16>, align 16
+  %__p1.addr.i = alloca i16, align 2
+  %__ret.i61 = alloca <8 x i16>, align 16
+  %.compoundliteral.i = alloca <8 x i16>, align 16
+  %__p0.addr.i57 = alloca <8 x i8>, align 8
+  %__ret.i58 = alloca <8 x i16>, align 16
+  %__p0.addr.i54 = alloca <8 x i8>, align 8
+  %__ret.i55 = alloca <8 x i16>, align 16
+  %__p0.addr.i = alloca <8 x i8>, align 8
+  %__ret.i = alloca <8 x i16>, align 16
+  %n_prb.addr = alloca i32, align 4
+  %src.addr = alloca ptr, align 8
+  %dst.addr = alloca ptr, align 8
+  %scale.addr = alloca ptr, align 8
+  %i = alloca i32, align 4
+  %prb_comp_in = alloca [3 x <8 x i16>], align 16
+  %__ret = alloca <8 x i8>, align 8
+  %tmp = alloca <8 x i8>, align 8
+  %__ret3 = alloca <8 x i8>, align 8
+  %tmp8 = alloca <8 x i8>, align 8
+  %__ret11 = alloca <8 x i8>, align 8
+  %tmp16 = alloca <8 x i8>, align 8
+  %prb_decomp = alloca [3 x <8 x i16>], align 16
+  %scaling_factor = alloca i16, align 2
+  %__s1 = alloca <8 x i16>, align 16
+  %agg.tmp = alloca %struct.cmplx_int16_t, align 2
+  %agg.tmp.coerce = alloca i64, align 8
+  %__s135 = alloca <8 x i16>, align 16
+  %agg.tmp37 = alloca %struct.cmplx_int16_t, align 2
+  %agg.tmp37.coerce = alloca i64, align 8
+  %__s140 = alloca <8 x i16>, align 16
+  %agg.tmp42 = alloca %struct.cmplx_int16_t, align 2
+  %agg.tmp42.coerce = alloca i64, align 8
+  %__s145 = alloca <8 x i16>, align 16
+  %__s148 = alloca <8 x i16>, align 16
+  %__s151 = alloca <8 x i16>, align 16
+  store i32 %n_prb, ptr %n_prb.addr, align 4
+  store ptr %src, ptr %src.addr, align 8
+  store ptr %dst, ptr %dst.addr, align 8
+  store ptr %scale, ptr %scale.addr, align 8
+  store i32 0, ptr %i, align 4
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc, %entry
+  %0 = load i32, ptr %i, align 4
+  %1 = load i32, ptr %n_prb.addr, align 4
+  %cmp = icmp ult i32 %0, %1
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  %2 = load ptr, ptr %src.addr, align 8
+  %3 = load i32, ptr %i, align 4
+  %idxprom = zext i32 %3 to i64
+  %arrayidx = getelementptr inbounds nuw %struct.compressed_data_8bit, ptr %2, i64 %idxprom
+  %mantissa = getelementptr inbounds nuw %struct.compressed_data_8bit, ptr %arrayidx, i32 0, i32 1
+  %arrayidx1 = getelementptr inbounds [24 x i8], ptr %mantissa, i64 0, i64 0
+  %4 = load <8 x i8>, ptr %arrayidx1, align 1
+  store <8 x i8> %4, ptr %__ret, align 8
+  %5 = load <8 x i8>, ptr %__ret, align 8
+  store <8 x i8> %5, ptr %tmp, align 8
+  %6 = load <8 x i8>, ptr %tmp, align 8
+  store <8 x i8> %6, ptr %__p0.addr.i57, align 8
+  %7 = load <8 x i8>, ptr %__p0.addr.i57, align 8
+  %vmovl.i59 = sext <8 x i8> %7 to <8 x i16>
+  store <8 x i16> %vmovl.i59, ptr %__ret.i58, align 16
+  %8 = load <8 x i16>, ptr %__ret.i58, align 16
+  %arrayidx2 = getelementptr inbounds [3 x <8 x i16>], ptr %prb_comp_in, i64 0, i64 0
+  store <8 x i16> %8, ptr %arrayidx2, align 16
+  %9 = load ptr, ptr %src.addr, align 8
+  %10 = load i32, ptr %i, align 4
+  %idxprom4 = zext i32 %10 to i64
+  %arrayidx5 = getelementptr inbounds nuw %struct.compressed_data_8bit, ptr %9, i64 %idxprom4
+  %mantissa6 = getelementptr inbounds nuw %struct.compressed_data_8bit, ptr %arrayidx5, i32 0, i32 1
+  %arrayidx7 = getelementptr inbounds [24 x i8], ptr %mantissa6, i64 0, i64 8
+  %11 = load <8 x i8>, ptr %arrayidx7, align 1
+  store <8 x i8> %11, ptr %__ret3, align 8
+  %12 = load <8 x i8>, ptr %__ret3, align 8
+  store <8 x i8> %12, ptr %tmp8, align 8
+  %13 = load <8 x i8>, ptr %tmp8, align 8
+  store <8 x i8> %13, ptr %__p0.addr.i54, align 8
+  %14 = load <8 x i8>, ptr %__p0.addr.i54, align 8
+  %vmovl.i56 = sext <8 x i8> %14 to <8 x i16>
+  store <8 x i16> %vmovl.i56, ptr %__ret.i55, align 16
+  %15 = load <8 x i16>, ptr %__ret.i55, align 16
+  %arrayidx10 = getelementptr inbounds [3 x <8 x i16>], ptr %prb_comp_in, i64 0, i64 1
+  store <8 x i16> %15, ptr %arrayidx10, align 16
+  %16 = load ptr, ptr %src.addr, align 8
+  %17 = load i32, ptr %i, align 4
+  %idxprom12 = zext i32 %17 to i64
+  %arrayidx13 = getelementptr inbounds nuw %struct.compressed_data_8bit, ptr %16, i64 %idxprom12
+  %mantissa14 = getelementptr inbounds nuw %struct.compressed_data_8bit, ptr %arrayidx13, i32 0, i32 1
+  %arrayidx15 = getelementptr inbounds [24 x i8], ptr %mantissa14, i64 0, i64 16
+  %18 = load <8 x i8>, ptr %arrayidx15, align 1
+  store <8 x i8> %18, ptr %__ret11, align 8
+  %19 = load <8 x i8>, ptr %__ret11, align 8
+  store <8 x i8> %19, ptr %tmp16, align 8
+  %20 = load <8 x i8>, ptr %tmp16, align 8
+  store <8 x i8> %20, ptr %__p0.addr.i, align 8
+  %21 = load <8 x i8>, ptr %__p0.addr.i, align 8
+  %vmovl.i = sext <8 x i8> %21 to <8 x i16>
+  store <8 x i16> %vmovl.i, ptr %__ret.i, align 16
+  %22 = load <8 x i16>, ptr %__ret.i, align 16
+  %arrayidx18 = getelementptr inbounds [3 x <8 x i16>], ptr %prb_comp_in, i64 0, i64 2
+  store <8 x i16> %22, ptr %arrayidx18, align 16
+  %23 = load ptr, ptr %src.addr, align 8
+  %24 = load i32, ptr %i, align 4
+  %idxprom19 = zext i32 %24 to i64
+  %arrayidx20 = getelementptr inbounds nuw %struct.compressed_data_8bit, ptr %23, i64 %idxprom19
+  %exp = getelementptr inbounds nuw %struct.compressed_data_8bit, ptr %arrayidx20, i32 0, i32 0
+  %25 = load i8, ptr %exp, align 1
+  %conv = sext i8 %25 to i32
+  %mul = mul nsw i32 %conv, 2
+  %conv21 = trunc i32 %mul to i16
+  store i16 %conv21, ptr %scaling_factor, align 2
+  %arrayidx22 = getelementptr inbounds [3 x <8 x i16>], ptr %prb_comp_in, i64 0, i64 0
+  %26 = load <8 x i16>, ptr %arrayidx22, align 16
+  %27 = load i16, ptr %scaling_factor, align 2
+  store <8 x i16> %26, ptr %__p0.addr.i75, align 16
+  store i16 %27, ptr %__p1.addr.i76, align 2
+  %28 = load <8 x i16>, ptr %__p0.addr.i75, align 16
+  %29 = load i16, ptr %__p1.addr.i76, align 2
+  %vecinit.i79 = insertelement <8 x i16> poison, i16 %29, i32 0
+  %30 = load i16, ptr %__p1.addr.i76, align 2
+  %vecinit1.i80 = insertelement <8 x i16> %vecinit.i79, i16 %30, i32 1
+  %31 = load i16, ptr %__p1.addr.i76, align 2
+  %vecinit2.i81 = insertelement <8 x i16> %vecinit1.i80, i16 %31, i32 2
+  %32 = load i16, ptr %__p1.addr.i76, align 2
+  %vecinit3.i82 = insertelement <8 x i16> %vecinit2.i81, i16 %32, i32 3
+  %33 = load i16, ptr %__p1.addr.i76, align 2
+  %vecinit4.i83 = insertelement <8 x i16> %vecinit3.i82, i16 %33, i32 4
+  %34 = load i16, ptr %__p1.addr.i76, align 2
+  %vecinit5.i84 = insertelement <8 x i16> %vecinit4.i83, i16 %34, i32 5
+  %35 = load i16, ptr %__p1.addr.i76, align 2
+  %vecinit6.i85 = insertelement <8 x i16> %vecinit5.i84, i16 %35, i32 6
+  %36 = load i16, ptr %__p1.addr.i76, align 2
+  %vecinit7.i86 = insertelement <8 x i16> %vecinit6.i85, i16 %36, i32 7
+  store <8 x i16> %vecinit7.i86, ptr %.compoundliteral.i78, align 16
+  %37 = load <8 x i16>, ptr %.compoundliteral.i78, align 16
+  %mul.i87 = mul <8 x i16> %28, %37
+  store <8 x i16> %mul.i87, ptr %__ret.i77, align 16
+  %38 = load <8 x i16>, ptr %__ret.i77, align 16
+  %arrayidx24 = getelementptr inbounds [3 x <8 x i16>], ptr %prb_decomp, i64 0, i64 0
+  store <8 x i16> %38, ptr %arrayidx24, align 16
+  %arrayidx25 = getelementptr inbounds [3 x <8 x i16>], ptr %prb_comp_in, i64 0, i64 1
+  %39 = load <8 x i16>, ptr %arrayidx25, align 16
+  %40 = load i16, ptr %scaling_factor, align 2
+  store <8 x i16> %39, ptr %__p0.addr.i62, align 16
+  store i16 %40, ptr %__p1.addr.i63, align 2
+  %41 = load <8 x i16>, ptr %__p0.addr.i62, align 16
+  %42 = load i16, ptr %__p1.addr.i63, align 2
+  %vecinit.i66 = insertelement <8 x i16> poison, i16 %42, i32 0
+  %43 = load i16, ptr %__p1.addr.i63, align 2
+  %vecinit1.i67 = insertelement <8 x i16> %vecinit.i66, i16 %43, i32 1
+  %44 = load i16, ptr %__p1.addr.i63, align 2
+  %vecinit2.i68 = insertelement <8 x i16> %vecinit1.i67, i16 %44, i32 2
+  %45 = load i16, ptr %__p1.addr.i63, align 2
+  %vecinit3.i69 = insertelement <8 x i16> %vecinit2.i68, i16 %45, i32 3
+  %46 = load i16, ptr %__p1.addr.i63, align 2
+  %vecinit4.i70 = insertelement <8 x i16> %vecinit3.i69, i16 %46, i32 4
+  %47 = load i16, ptr %__p1.addr.i63, align 2
+  %vecinit5.i71 = insertelement <8 x i16> %vecinit4.i70, i16 %47, i32 5
+  %48 = load i16, ptr %__p1.addr.i63, align 2
+  %vecinit6.i72 = insertelement <8 x i16> %vecinit5.i71, i16 %48, i32 6
+  %49 = load i16, ptr %__p1.addr.i63, align 2
+  %vecinit7.i73 = insertelement <8 x i16> %vecinit6.i72, i16 %49, i32 7
+  store <8 x i16> %vecinit7.i73, ptr %.compoundliteral.i65, align 16
+  %50 = load <8 x i16>, ptr %.compoundliteral.i65, align 16
+  %mul.i74 = mul <8 x i16> %41, %50
+  store <8 x i16> %mul.i74, ptr %__ret.i64, align 16
+  %51 = load <8 x i16>, ptr %__ret.i64, align 16
+  %arrayidx27 = getelementptr inbounds [3 x <8 x i16>], ptr %prb_decomp, i64 0, i64 1
+  store <8 x i16> %51, ptr %arrayidx27, align 16
+  %arrayidx28 = getelementptr inbounds [3 x <8 x i16>], ptr %prb_comp_in, i64 0, i64 2
+  %52 = load <8 x i16>, ptr %arrayidx28, align 16
+  %53 = load i16, ptr %scaling_factor, align 2
+  store <8 x i16> %52, ptr %__p0.addr.i60, align 16
+  store i16 %53, ptr %__p1.addr.i, align 2
+  %54 = load <8 x i16>, ptr %__p0.addr.i60, align 16
+  %55 = load i16, ptr %__p1.addr.i, align 2
+  %vecinit.i = insertelement <8 x i16> poison, i16 %55, i32 0
+  %56 = load i16, ptr %__p1.addr.i, align 2
+  %vecinit1.i = insertelement <8 x i16> %vecinit.i, i16 %56, i32 1
+  %57 = load i16, ptr %__p1.addr.i, align 2
+  %vecinit2.i = insertelement <8 x i16> %vecinit1.i, i16 %57, i32 2
+  %58 = load i16, ptr %__p1.addr.i, align 2
+  %vecinit3.i = insertelement <8 x i16> %vecinit2.i, i16 %58, i32 3
+  %59 = load i16, ptr %__p1.addr.i, align 2
+  %vecinit4.i = insertelement <8 x i16> %vecinit3.i, i16 %59, i32 4
+  %60 = load i16, ptr %__p1.addr.i, align 2
+  %vecinit5.i = insertelement <8 x i16> %vecinit4.i, i16 %60, i32 5
+  %61 = load i16, ptr %__p1.addr.i, align 2
+  %vecinit6.i = insertelement <8 x i16> %vecinit5.i, i16 %61, i32 6
+  %62 = load i16, ptr %__p1.addr.i, align 2
+  %vecinit7.i = insertelement <8 x i16> %vecinit6.i, i16 %62, i32 7
+  store <8 x i16> %vecinit7.i, ptr %.compoundliteral.i, align 16
+  %63 = load <8 x i16>, ptr %.compoundliteral.i, align 16
+  %mul.i = mul <8 x i16> %54, %63
+  store <8 x i16> %mul.i, ptr %__ret.i61, align 16
+  %64 = load <8 x i16>, ptr %__ret.i61, align 16
+  %arrayidx30 = getelementptr inbounds [3 x <8 x i16>], ptr %prb_decomp, i64 0, i64 2
+  store <8 x i16> %64, ptr %arrayidx30, align 16
+  %65 = load ptr, ptr %scale.addr, align 8
+  %cmp31 = icmp ne ptr %65, null
+  br i1 %cmp31, label %if.then, label %if.else
+
+if.then:                                          ; preds = %for.body
+  %arrayidx32 = getelementptr inbounds [3 x <8 x i16>], ptr %prb_decomp, i64 0, i64 0
+  %66 = load <8 x i16>, ptr %arrayidx32, align 16
+  %67 = load ptr, ptr %scale.addr, align 8
+  call void @llvm.memcpy.p0.p0.i64(ptr align 2 %agg.tmp, ptr align 2 %67, i64 4, i1 false)
+  call void @llvm.memcpy.p0.p0.i64(ptr align 8 %agg.tmp.coerce, ptr align 2 %agg.tmp, i64 4, i1 false)
+  %68 = load i64, ptr %agg.tmp.coerce, align 8
+  %call33 = call noundef <8 x i16> @_ZL24cmplx_mul_combined_re_im11__Int16x8_t20cmplx_int16_t(<8 x i16> noundef %66, i64 %68)
+  store <8 x i16> %call33, ptr %__s1, align 16
+  %69 = load ptr, ptr %dst.addr, align 8
+  %arrayidx34 = getelementptr inbounds %struct.cmplx_int16_t, ptr %69, i64 0
+  %70 = load <8 x i16>, ptr %__s1, align 16
+  %71 = bitcast <8 x i16> %70 to <16 x i8>
+  %72 = bitcast <16 x i8> %71 to <8 x i16>
+  store <8 x i16> %72, ptr %arrayidx34, align 2
+  %arrayidx36 = getelementptr inbounds [3 x <8 x i16>], ptr %prb_decomp, i64 0, i64 1
+  %73 = load <8 x i16>, ptr %arrayidx36, align 16
+  %74 = load ptr, ptr %scale.addr, align 8
+  call void @llvm.memcpy.p0.p0.i64(ptr align 2 %agg.tmp37, ptr align 2 %74, i64 4, i1 false)
+  call void @llvm.memcpy.p0.p0.i64(ptr align 8 %agg.tmp37.coerce, ptr align 2 %agg.tmp37, i64 4, i1 false)
+  %75 = load i64, ptr %agg.tmp37.coerce, align 8
+  %call38 = call noundef <8 x i16> @_ZL24cmplx_mul_combined_re_im11__Int16x8_t20cmplx_int16_t(<8 x i16> noundef %73, i64 %75)
+  store <8 x i16> %call38, ptr %__s135, align 16
+  %76 = load ptr, ptr %dst.addr, align 8
+  %arrayidx39 = getelementptr inbounds %struct.cmplx_int16_t, ptr %76, i64 4
+  %77 = load <8 x i16>, ptr %__s135, align 16
+  %78 = bitcast <8 x i16> %77 to <16 x i8>
+  %79 = bitcast <16 x i8> %78 to <8 x i16>
+  store <8 x i16> %79, ptr %arrayidx39, align 2
+  %arrayidx41 = getelementptr inbounds [3 x <8 x i16>], ptr %prb_decomp, i64 0, i64 2
+  %80 = load <8 x i16>, ptr %arrayidx41, align 16
+  %81 = load ptr, ptr %scale.addr, align 8
+  call void @llvm.memcpy.p0.p0.i64(ptr align 2 %agg.tmp42, ptr align 2 %81, i64 4, i1 false)
+  call void @llvm.memcpy.p0.p0.i64(ptr align 8 %agg.tmp42.coerce, ptr align 2 %agg.tmp42, i64 4, i1 false)
+  %82 = load i64, ptr %agg.tmp42.coerce, align 8
+  %call43 = call noundef <8 x i16> @_ZL24cmplx_mul_combined_re_im11__Int16x8_t20cmplx_int16_t(<8 x i16> noundef %80, i64 %82)
+  store <8 x i16> %call43, ptr %__s140, align 16
+  %83 = load ptr, ptr %dst.addr, align 8
+  %arrayidx44 = getelementptr inbounds %struct.cmplx_int16_t, ptr %83, i64 8
+  %84 = load <8 x i16>, ptr %__s140, align 16
+  %85 = bitcast <8 x i16> %84 to <16 x i8>
+  %86 = bitcast <16 x i8> %85 to <8 x i16>
+  store <8 x i16> %86, ptr %arrayidx44, align 2
+  %87 = load ptr, ptr %dst.addr, align 8
+  %add.ptr = getelementptr inbounds %struct.cmplx_int16_t, ptr %87, i64 12
+  store ptr %add.ptr, ptr %dst.addr, align 8
+  br label %if.end
+
+if.else:                                          ; preds = %for.body
+  %arrayidx46 = getelementptr inbounds [3 x <8 x i16>], ptr %prb_decomp, i64 0, i64 0
+  %88 = load <8 x i16>, ptr %arrayidx46, align 16
+  store <8 x i16> %88, ptr %__s145, align 16
+  %89 = load ptr, ptr %dst.addr, align 8
+  %90 = load <8 x i16>, ptr %__s145, align 16
+  %91 = bitcast <8 x i16> %90 to <16 x i8>
+  %92 = bitcast <16 x i8> %91 to <8 x i16>
+  store <8 x i16> %92, ptr %89, align 2
+  %93 = load ptr, ptr %dst.addr, align 8
+  %add.ptr47 = getelementptr inbounds %struct.cmplx_int16_t, ptr %93, i64 4
+  store ptr %add.ptr47, ptr %dst.addr, align 8
+  %arrayidx49 = getelementptr inbounds [3 x <8 x i16>], ptr %prb_decomp, i64 0, i64 1
+  %94 = load <8 x i16>, ptr %arrayidx49, align 16
+  store <8 x i16> %94, ptr %__s148, align 16
+  %95 = load ptr, ptr %dst.addr, align 8
+  %96 = load <8 x i16>, ptr %__s148, align 16
+  %97 = bitcast <8 x i16> %96 to <16 x i8>
+  %98 = bitcast <16 x i8> %97 to <8 x i16>
+  store <8 x i16> %98, ptr %95, align 2
+  %99 = load ptr, ptr %dst.addr, align 8
+  %add.ptr50 = getelementptr inbounds %struct.cmplx_int16_t, ptr %99, i64 4
+  store ptr %add.ptr50, ptr %dst.addr, align 8
+  %arrayidx52 = getelementptr inbounds [3 x <8 x i16>], ptr %prb_decomp, i64 0, i64 2
+  %100 = load <8 x i16>, ptr %arrayidx52, align 16
+  store <8 x i16> %100, ptr %__s151, align 16
+  %101 = load ptr, ptr %dst.addr, align 8
+  %102 = load <8 x i16>, ptr %__s151, align 16
+  %103 = bitcast <8 x i16> %102 to <16 x i8>
+  %104 = bitcast <16 x i8> %103 to <8 x i16>
+  store <8 x i16> %104, ptr %101, align 2
+  %105 = load ptr, ptr %dst.addr, align 8
+  %add.ptr53 = getelementptr inbounds %struct.cmplx_int16_t, ptr %105, i64 4
+  store ptr %add.ptr53, ptr %dst.addr, align 8
+  br label %if.end
+
+if.end:                                           ; preds = %if.else, %if.then
+  br label %for.inc
+
+for.inc:                                          ; preds = %if.end
+  %106 = load i32, ptr %i, align 4
+  %inc = add i32 %106, 1
+  store i32 %inc, ptr %i, align 4
+  br label %for.cond, !llvm.loop !4
+
+for.end:                                          ; preds = %for.cond
+  ret i32 0
+}
+
+define internal noundef <8 x i16> @_ZL24cmplx_mul_combined_re_im11__Int16x8_t20cmplx_int16_t(<8 x i16> noundef %a, i64 %scale.coerce) #0 {
+; CHECK-LABEL: define internal fastcc noundef <8 x i16> @_ZL24cmplx_mul_combined_re_im11__Int16x8_t20cmplx_int16_t(
+; CHECK-SAME: <8 x i16> noundef [[A:%.*]], i64 [[SCALE_COERCE:%.*]]) unnamed_addr #[[ATTR1:[0-9]+]] {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[SCALE_SROA_0_0_EXTRACT_TRUNC:%.*]] = trunc i64 [[SCALE_COERCE]] to i16
+; CHECK-NEXT:    [[SCALE_SROA_2_0_EXTRACT_SHIFT36:%.*]] = lshr i64 [[SCALE_COERCE]], 16
+; CHECK-NEXT:    [[SCALE_SROA_2_0_EXTRACT_TRUNC:%.*]] = trunc i64 [[SCALE_SROA_2_0_EXTRACT_SHIFT36]] to i16
+; CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> poison, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
+; CHECK-NEXT:    [[VECINIT_I19:%.*]] = insertelement <8 x i16> poison, i16 [[SCALE_SROA_0_0_EXTRACT_TRUNC]], i64 0
+; CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <8 x i16> poison, i16 [[SCALE_SROA_2_0_EXTRACT_TRUNC]], i64 0
+; CHECK-NEXT:    [[VECINIT7_I:%.*]] = shufflevector <8 x i16> [[VECINIT_I]], <8 x i16> poison, <8 x i32> zeroinitializer
+; CHECK-NEXT:    [[VQNEGQ_V1_I:%.*]] = tail call <8 x i16> @llvm.aarch64.neon.sqneg.v8i16(<8 x i16> [[VECINIT7_I]])
+; CHECK-NEXT:    [[VBSL5_I:%.*]] = shufflevector <8 x i16> [[VQNEGQ_V1_I]], <8 x i16> [[VECINIT_I]], <8 x i32> <i32 0, i32 8, i32 2, i32 8, i32 4, i32 8, i32 6, i32 8>
+; CHECK-NEXT:    [[SHUFFLE_I85:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[SHUFFLE_I82:%.*]] = shufflevector <8 x i16> [[VECINIT_I19]], <8 x i16> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[VQDMULL_V2_I72:%.*]] = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I85]], <4 x i16> [[SHUFFLE_I82]])
+; CHECK-NEXT:    [[SHUFFLE_I97:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[VQDMULL_V2_I:%.*]] = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I97]], <4 x i16> [[SHUFFLE_I82]])
+; CHECK-NEXT:    [[SHUFFLE_I79:%.*]] = shufflevector <8 x i16> [[SHUFFLE_I]], <8 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[SHUFFLE_I76:%.*]] = shufflevector <8 x i16> [[VBSL5_I]], <8 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[VQDMLAL2_I106:%.*]] = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I79]], <4 x i16> [[SHUFFLE_I76]])
+; CHECK-NEXT:    [[VQDMLAL_V3_I107:%.*]] = tail call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[VQDMULL_V2_I72]], <4 x i32> [[VQDMLAL2_I106]])
+; CHECK-NEXT:    [[SHUFFLE_I91:%.*]] = shufflevector <8 x i16> [[SHUFFLE_I]], <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[SHUFFLE_I88:%.*]] = shufflevector <8 x i16> [[VBSL5_I]], <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[VQDMLAL2_I:%.*]] = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I91]], <4 x i16> [[SHUFFLE_I88]])
+; CHECK-NEXT:    [[VQDMLAL_V3_I:%.*]] = tail call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[VQDMULL_V2_I]], <4 x i32> [[VQDMLAL2_I]])
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[VQDMLAL_V3_I107]] to <8 x i16>
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i32> [[VQDMLAL_V3_I]] to <8 x i16>
+; CHECK-NEXT:    [[SHUFFLE_I61:%.*]] = shufflevector <8 x i16> [[TMP0]], <8 x i16> [[TMP1]], <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+; CHECK-NEXT:    ret <8 x i16> [[SHUFFLE_I61]]
+;
+entry:
+  %__p0.addr.i102 = alloca <4 x i32>, align 16
+  %__p1.addr.i103 = alloca <4 x i16>, align 8
+  %__p2.addr.i104 = alloca <4 x i16>, align 8
+  %__ret.i105 = alloca <4 x i32>, align 16
+  %__p0.addr.i98 = alloca <4 x i32>, align 16
+  %__p1.addr.i99 = alloca <4 x i16>, align 8
+  %__p2.addr.i100 = alloca <4 x i16>, align 8
+  %__ret.i101 = alloca <4 x i32>, align 16
+  %__p0.addr.i95 = alloca <8 x i16>, align 16
+  %__ret.i96 = alloca <4 x i16>, align 8
+  %__p0.addr.i92 = alloca <8 x i16>, align 16
+  %__ret.i93 = alloca <4 x i16>, align 8
+  %__p0.addr.i89 = alloca <8 x i16>, align 16
+  %__ret.i90 = alloca <4 x i16>, align 8
+  %__p0.addr.i86 = alloca <8 x i16>, align 16
+  %__ret.i87 = alloca <4 x i16>, align 8
+  %__p0.addr.i83 = alloca <8 x i16>, align 16
+  %__ret.i84 = alloca <4 x i16>, align 8
+  %__p0.addr.i80 = alloca <8 x i16>, align 16
+  %__ret.i81 = alloca <4 x i16>, align 8
+  %__p0.addr.i77 = alloca <8 x i16>, align 16
+  %__ret.i78 = alloca <4 x i16>, align 8
+  %__p0.addr.i74 = alloca <8 x i16>, align 16
+  %__ret.i75 = alloca <4 x i16>, align 8
+  %__p0.addr.i69 = alloca <4 x i16>, align 8
+  %__p1.addr.i70 = alloca <4 x i16>, align 8
+  %__ret.i71 = alloca <4 x i32>, align 16
+  %__p0.addr.i66 = alloca <4 x i16>, align 8
+  %__p1.addr.i67 = alloca <4 x i16>, align 8
+  %__ret.i68 = alloca <4 x i32>, align 16
+  %__p0.addr.i64 = alloca <4 x i32>, align 16
+  %__ret.i65 = alloca <8 x i16>, align 16
+  %__p0.addr.i62 = alloca <4 x i32>, align 16
+  %__ret.i63 = alloca <8 x i16>, align 16
+  %__p0.addr.i58 = alloca <8 x i16>, align 16
+  %__p1.addr.i59 = alloca <8 x i16>, align 16
+  %__ret.i60 = alloca <8 x i16>, align 16
+  %__p0.addr.i51 = alloca <4 x i32>, align 16
+  %__p1.addr.i52 = alloca <8 x i16>, align 16
+  %__p2.addr.i53 = alloca <8 x i16>, align 16
+  %__ret.i54 = alloca <4 x i32>, align 16
+  %a.addr.i46 = alloca <4 x i32>, align 16
+  %b.addr.i47 = alloca <8 x i16>, align 16
+  %c.addr.i = alloca <8 x i16>, align 16
+  %__p0.addr.i40 = alloca <8 x i16>, align 16
+  %__p1.addr.i41 = alloca <8 x i16>, align 16
+  %__ret.i42 = alloca <4 x i32>, align 16
+  %a.addr.i = alloca <8 x i16>, align 16
+  %b.addr.i = alloca <8 x i16>, align 16
+  %__p0.addr.i38 = alloca <8 x i16>, align 16
+  %__ret.i39 = alloca <8 x i16>, align 16
+  %__p0.addr.i36 = alloca <8 x i16>, align 16
+  %__p1.addr.i = alloca <8 x i16>, align 16
+  %__p2.addr.i = alloca <8 x i16>, align 16
+  %__ret.i37 = alloca <8 x i16>, align 16
+  %__p0.addr.i29 = alloca i32, align 4
+  %__ret.i30 = alloca <4 x i32>, align 16
+  %.compoundliteral.i31 = alloca <4 x i32>, align 16
+  %__p0.addr.i27 = alloca <4 x i32>, align 16
+  %__ret.i28 = alloca <8 x i16>, align 16
+  %__p0.addr.i16 = alloca i16, align 2
+  %__ret.i17 = alloca <8 x i16>, align 16
+  %.compoundliteral.i18 = alloca <8 x i16>, align 16
+  %__p0.addr.i14 = alloca i16, align 2
+  %__ret.i15 = alloca <8 x i16>, align 16
+  %.compoundliteral.i = alloca <8 x i16>, align 16
+  %__p0.addr.i = alloca <8 x i16>, align 16
+  %__ret.i = alloca <8 x i16>, align 16
+  %scale = alloca %struct.cmplx_int16_t, align 2
+  %a.addr = alloca <8 x i16>, align 16
+  %a_rev = alloca <8 x i16>, align 16
+  %cc = alloca <8 x i16>, align 16
+  %dd = alloca <8 x i16>, align 16
+  %mult_mask = alloca <8 x i16>, align 16
+  %lo32 = alloca <4 x i32>, align 16
+  %hi32 = alloca <4 x i32>, align 16
+  %coerce.val.ii = trunc i64 %scale.coerce to i32
+  store i32 %coerce.val.ii, ptr %scale, align 2
+  store <8 x i16> %a, ptr %a.addr, align 16
+  %0 = load <8 x i16>, ptr %a.addr, align 16
+  store <8 x i16> %0, ptr %__p0.addr.i, align 16
+  %1 = load <8 x i16>, ptr %__p0.addr.i, align 16
+  %2 = load <8 x i16>, ptr %__p0.addr.i, align 16
+  %shuffle.i = shufflevector <8 x i16> %1, <8 x i16> %2, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
+  store <8 x i16> %shuffle.i, ptr %__ret.i, align 16
+  %3 = load <8 x i16>, ptr %__ret.i, align 16
+  store <8 x i16> %3, ptr %a_rev, align 16
+  %re = getelementptr inbounds nuw %struct.cmplx_int16_t, ptr %scale, i32 0, i32 0
+  %4 = load i16, ptr %re, align 2
+  store i16 %4, ptr %__p0.addr.i16, align 2
+  %5 = load i16, ptr %__p0.addr.i16, align 2
+  %vecinit.i19 = insertelement <8 x i16> poison, i16 %5, i32 0
+  %6 = load i16, ptr %__p0.addr.i16, align 2
+  %vecinit1.i20 = insertelement <8 x i16> %vecinit.i19, i16 %6, i32 1
+  %7 = load i16, ptr %__p0.addr.i16, align 2
+  %vecinit2.i21 = insertelement <8 x i16> %vecinit1.i20, i16 %7, i32 2
+  %8 = load i16, ptr %__p0.addr.i16, align 2
+  %vecinit3.i22 = insertelement <8 x i16> %vecinit2.i21, i16 %8, i32 3
+  %9 = load i16, ptr %__p0.addr.i16, align 2
+  %vecinit4.i23 = insertelement <8 x i16> %vecinit3.i22, i16 %9, i32 4
+  %10 = load i16, ptr %__p0.addr.i16, align 2
+  %vecinit5.i24 = insertelement <8 x i16> %vecinit4.i23, i16 %10, i32 5
+  %11 = load i16, ptr %__p0.addr.i16, align 2
+  %vecinit6.i25 = insertelement <8 x i16> %vecinit5.i24, i16 %11, i32 6
+  %12 = load i16, ptr %__p0.addr.i16, align 2
+  %vecinit7.i26 = insertelement <8 x i16> %vecinit6.i25, i16 %12, i32 7
+  store <8 x i16> %vecinit7.i26, ptr %.compoundliteral.i18, align 16
+  %13 = load <8 x i16>, ptr %.compoundliteral.i18, align 16
+  store <8 x i16> %13, ptr %__ret.i17, align 16
+  %14 = load <8 x i16>, ptr %__ret.i17, align 16
+  store <8 x i16> %14, ptr %cc, align 16
+  %im = getelementptr inbounds nuw %struct.cmplx_int16_t, ptr %scale, i32 0, i32 1
+  %15 = load i16, ptr %im, align 2
+  store i16 %15, ptr %__p0.addr.i14, align 2
+  %16 = load i16, ptr %__p0.addr.i14, align 2
+  %vecinit.i = insertelement <8 x i16> poison, i16 %16, i32 0
+  %17 = load i16, ptr %__p0.addr.i14, align 2
+  %vecinit1.i = insertelement <8 x i16> %vecinit.i, i16 %17, i32 1
+  %18 = load i16, ptr %__p0.addr.i14, align 2
+  %vecinit2.i = insertelement <8 x i16> %vecinit1.i, i16 %18, i32 2
+  %19 = load i16, ptr %__p0.addr.i14, align 2
+  %vecinit3.i = insertelement <8 x i16> %vecinit2.i, i16 %19, i32 3
+  %20 = load i16, ptr %__p0.addr.i14, align 2
+  %vecinit4.i = insertelement <8 x i16> %vecinit3.i, i16 %20, i32 4
+  %21 = load i16, ptr %__p0.addr.i14, align 2
+  %vecinit5.i = insertelement <8 x i16> %vecinit4.i, i16 %21, i32 5
+  %22 = load i16, ptr %__p0.addr.i14, align 2
+  %vecinit6.i = insertelement <8 x i16> %vecinit5.i, i16 %22, i32 6
+  %23 = load i16, ptr %__p0.addr.i14, align 2
+  %vecinit7.i = insertelement <8 x i16> %vecinit6.i, i16 %23, i32 7
+  store <8 x i16> %vecinit7.i, ptr %.compoundliteral.i, align 16
+  %24 = load <8 x i16>, ptr %.compoundliteral.i, align 16
+  store <8 x i16> %24, ptr %__ret.i15, align 16
+  %25 = load <8 x i16>, ptr %__ret.i15, align 16
+  store <8 x i16> %25, ptr %dd, align 16
+  store i32 65535, ptr %__p0.addr.i29, align 4
+  %26 = load i32, ptr %__p0.addr.i29, align 4
+  %vecinit.i32 = insertelement <4 x i32> poison, i32 %26, i32 0
+  %27 = load i32, ptr %__p0.addr.i29, align 4
+  %vecinit1.i33 = insertelement <4 x i32> %vecinit.i32, i32 %27, i32 1
+  %28 = load i32, ptr %__p0.addr.i29, align 4
+  %vecinit2.i34 = insertelement <4 x i32> %vecinit1.i33, i32 %28, i32 2
+  %29 = load i32, ptr %__p0.addr.i29, align 4
+  %vecinit3.i35 = insertelement <4 x i32> %vecinit2.i34, i32 %29, i32 3
+  store <4 x i32> %vecinit3.i35, ptr %.compoundliteral.i31, align 16
+  %30 = load <4 x i32>, ptr %.compoundliteral.i31, align 16
+  store <4 x i32> %30, ptr %__ret.i30, align 16
+  %31 = load <4 x i32>, ptr %__ret.i30, align 16
+  store <4 x i32> %31, ptr %__p0.addr.i27, align 16
+  %32 = load <4 x i32>, ptr %__p0.addr.i27, align 16
+  %33 = bitcast <4 x i32> %32 to <8 x i16>
+  store <8 x i16> %33, ptr %__ret.i28, align 16
+  %34 = load <8 x i16>, ptr %__ret.i28, align 16
+  store <8 x i16> %34, ptr %mult_mask, align 16
+  %35 = load <8 x i16>, ptr %mult_mask, align 16
+  %36 = load <8 x i16>, ptr %dd, align 16
+  store <8 x i16> %36, ptr %__p0.addr.i38, align 16
+  %37 = load <8 x i16>, ptr %__p0.addr.i38, align 16
+  %38 = bitcast <8 x i16> %37 to <16 x i8>
+  %vqnegq_v1.i = call <8 x i16> @llvm.aarch64.neon.sqneg.v8i16(<8 x i16> %37)
+  %vqnegq_v2.i = bitcast <8 x i16> %vqnegq_v1.i to <16 x i8>
+  store <8 x i16> %vqnegq_v1.i, ptr %__ret.i39, align 16
+  %39 = load <8 x i16>, ptr %__ret.i39, align 16
+  %40 = load <8 x i16>, ptr %dd, align 16
+  store <8 x i16> %35, ptr %__p0.addr.i36, align 16
+  store <8 x i16> %39, ptr %__p1.addr.i, align 16
+  store <8 x i16> %40, ptr %__p2.addr.i, align 16
+  %41 = load <8 x i16>, ptr %__p0.addr.i36, align 16
+  %42 = bitcast <8 x i16> %41 to <16 x i8>
+  %43 = load <8 x i16>, ptr %__p1.addr.i, align 16
+  %44 = bitcast <8 x i16> %43 to <16 x i8>
+  %45 = load <8 x i16>, ptr %__p2.addr.i, align 16
+  %46 = bitcast <8 x i16> %45 to <16 x i8>
+  %vbsl3.i = and <8 x i16> %41, %43
+  %47 = xor <8 x i16> %41, splat (i16 -1)
+  %vbsl4.i = and <8 x i16> %47, %45
+  %vbsl5.i = or <8 x i16> %vbsl3.i, %vbsl4.i
+  store <8 x i16> %vbsl5.i, ptr %__ret.i37, align 16
+  %48 = load <8 x i16>, ptr %__ret.i37, align 16
+  store <8 x i16> %48, ptr %dd, align 16
+  %49 = load <8 x i16>, ptr %a.addr, align 16
+  %50 = load <8 x i16>, ptr %cc, align 16
+  store <8 x i16> %49, ptr %a.addr.i, align 16
+  store <8 x i16> %50, ptr %b.addr.i, align 16
+  %51 = load <8 x i16>, ptr %a.addr.i, align 16
+  store <8 x i16> %51, ptr %__p0.addr.i83, align 16
+  %52 = load <8 x i16>, ptr %__p0.addr.i83, align 16
+  %53 = load <8 x i16>, ptr %__p0.addr.i83, align 16
+  %shuffle.i85 = shufflevector <8 x i16> %52, <8 x i16> %53, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  store <4 x i16> %shuffle.i85, ptr %__ret.i84, align 8
+  %54 = load <4 x i16>, ptr %__ret.i84, align 8
+  %55 = load <8 x i16>, ptr %b.addr.i, align 16
+  store <8 x i16> %55, ptr %__p0.addr.i80, align 16
+  %56 = load <8 x i16>, ptr %__p0.addr.i80, align 16
+  %57 = load <8 x i16>, ptr %__p0.addr.i80, align 16
+  %shuffle.i82 = shufflevector <8 x i16> %56, <8 x i16> %57, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  store <4 x i16> %shuffle.i82, ptr %__ret.i81, align 8
+  %58 = load <4 x i16>, ptr %__ret.i81, align 8
+  store <4 x i16> %54, ptr %__p0.addr.i69, align 8
+  store <4 x i16> %58, ptr %__p1.addr.i70, align 8
+  %59 = load <4 x i16>, ptr %__p0.addr.i69, align 8
+  %60 = bitcast <4 x i16> %59 to <8 x i8>
+  %61 = load <4 x i16>, ptr %__p1.addr.i70, align 8
+  %62 = bitcast <4 x i16> %61 to <8 x i8>
+  %vqdmull_v2.i72 = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %59, <4 x i16> %61)
+  %vqdmull_v3.i73 = bitcast <4 x i32> %vqdmull_v2.i72 to <16 x i8>
+  store <4 x i32> %vqdmull_v2.i72, ptr %__ret.i71, align 16
+  %63 = load <4 x i32>, ptr %__ret.i71, align 16
+  store <4 x i32> %63, ptr %lo32, align 16
+  %64 = load <8 x i16>, ptr %a.addr, align 16
+  %65 = load <8 x i16>, ptr %cc, align 16
+  store <8 x i16> %64, ptr %__p0.addr.i40, align 16
+  store <8 x i16> %65, ptr %__p1.addr.i41, align 16
+  %66 = load <8 x i16>, ptr %__p0.addr.i40, align 16
+  store <8 x i16> %66, ptr %__p0.addr.i95, align 16
+  %67 = load <8 x i16>, ptr %__p0.addr.i95, align 16
+  %68 = load <8 x i16>, ptr %__p0.addr.i95, align 16
+  %shuffle.i97 = shufflevector <8 x i16> %67, <8 x i16> %68, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  store <4 x i16> %shuffle.i97, ptr %__ret.i96, align 8
+  %69 = load <4 x i16>, ptr %__ret.i96, align 8
+  %70 = load <8 x i16>, ptr %__p1.addr.i41, align 16
+  store <8 x i16> %70, ptr %__p0.addr.i92, align 16
+  %71 = load <8 x i16>, ptr %__p0.addr.i92, align 16
+  %72 = load <8 x i16>, ptr %__p0.addr.i92, align 16
+  %shuffle.i94 = shufflevector <8 x i16> %71, <8 x i16> %72, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  store <4 x i16> %shuffle.i94, ptr %__ret.i93, align 8
+  %73 = load <4 x i16>, ptr %__ret.i93, align 8
+  store <4 x i16> %69, ptr %__p0.addr.i66, align 8
+  store <4 x i16> %73, ptr %__p1.addr.i67, align 8
+  %74 = load <4 x i16>, ptr %__p0.addr.i66, align 8
+  %75 = bitcast <4 x i16> %74 to <8 x i8>
+  %76 = load <4 x i16>, ptr %__p1.addr.i67, align 8
+  %77 = bitcast <4 x i16> %76 to <8 x i8>
+  %vqdmull_v2.i = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %74, <4 x i16> %76)
+  %vqdmull_v3.i = bitcast <4 x i32> %vqdmull_v2.i to <16 x i8>
+  store <4 x i32> %vqdmull_v2.i, ptr %__ret.i68, align 16
+  %78 = load <4 x i32>, ptr %__ret.i68, align 16
+  store <4 x i32> %78, ptr %__ret.i42, align 16
+  %79 = load <4 x i32>, ptr %__ret.i42, align 16
+  store <4 x i32> %79, ptr %hi32, align 16
+  %80 = load <4 x i32>, ptr %lo32, align 16
+  %81 = load <8 x i16>, ptr %a_rev, align 16
+  %82 = load <8 x i16>, ptr %dd, align 16
+  store <4 x i32> %80, ptr %a.addr.i46, align 16
+  store <8 x i16> %81, ptr %b.addr.i47, align 16
+  store <8 x i16> %82, ptr %c.addr.i, align 16
+  %83 = load <4 x i32>, ptr %a.addr.i46, align 16
+  %84 = load <8 x i16>, ptr %b.addr.i47, align 16
+  store <8 x i16> %84, ptr %__p0.addr.i77, align 16
+  %85 = load <8 x i16>, ptr %__p0.addr.i77, align 16
+  %86 = load <8 x i16>, ptr %__p0.addr.i77, align 16
+  %shuffle.i79 = shufflevector <8 x i16> %85, <8 x i16> %86, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  store <4 x i16> %shuffle.i79, ptr %__ret.i78, align 8
+  %87 = load <4 x i16>, ptr %__ret.i78, align 8
+  %88 = load <8 x i16>, ptr %c.addr.i, align 16
+  store <8 x i16> %88, ptr %__p0.addr.i74, align 16
+  %89 = load <8 x i16>, ptr %__p0.addr.i74, align 16
+  %90 = load <8 x i16>, ptr %__p0.addr.i74, align 16
+  %shuffle.i76 = shufflevector <8 x i16> %89, <8 x i16> %90, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  store <4 x i16> %shuffle.i76, ptr %__ret.i75, align 8
+  %91 = load <4 x i16>, ptr %__ret.i75, align 8
+  store <4 x i32> %83, ptr %__p0.addr.i102, align 16
+  store <4 x i16> %87, ptr %__p1.addr.i103, align 8
+  store <4 x i16> %91, ptr %__p2.addr.i104, align 8
+  %92 = load <4 x i32>, ptr %__p0.addr.i102, align 16
+  %93 = bitcast <4 x i32> %92 to <16 x i8>
+  %94 = load <4 x i16>, ptr %__p1.addr.i103, align 8
+  %95 = bitcast <4 x i16> %94 to <8 x i8>
+  %96 = load <4 x i16>, ptr %__p2.addr.i104, align 8
+  %97 = bitcast <4 x i16> %96 to <8 x i8>
+  %vqdmlal2.i106 = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %94, <4 x i16> %96)
+  %vqdmlal_v3.i107 = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %92, <4 x i32> %vqdmlal2.i106)
+  store <4 x i32> %vqdmlal_v3.i107, ptr %__ret.i105, align 16
+  %98 = load <4 x i32>, ptr %__ret.i105, align 16
+  store <4 x i32> %98, ptr %lo32, align 16
+  %99 = load <4 x i32>, ptr %hi32, align 16
+  %100 = load <8 x i16>, ptr %a_rev, align 16
+  %101 = load <8 x i16>, ptr %dd, align 16
+  store <4 x i32> %99, ptr %__p0.addr.i51, align 16
+  store <8 x i16> %100, ptr %__p1.addr.i52, align 16
+  store <8 x i16> %101, ptr %__p2.addr.i53, align 16
+  %102 = load <4 x i32>, ptr %__p0.addr.i51, align 16
+  %103 = load <8 x i16>, ptr %__p1.addr.i52, align 16
+  store <8 x i16> %103, ptr %__p0.addr.i89, align 16
+  %104 = load <8 x i16>, ptr %__p0.addr.i89, align 16
+  %105 = load <8 x i16>, ptr %__p0.addr.i89, align 16
+  %shuffle.i91 = shufflevector <8 x i16> %104, <8 x i16> %105, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  store <4 x i16> %shuffle.i91, ptr %__ret.i90, align 8
+  %106 = load <4 x i16>, ptr %__ret.i90, align 8
+  %107 = load <8 x i16>, ptr %__p2.addr.i53, align 16
+  store <8 x i16> %107, ptr %__p0.addr.i86, align 16
+  %108 = load <8 x i16>, ptr %__p0.addr.i86, align 16
+  %109 = load <8 x i16>, ptr %__p0.addr.i86, align 16
+  %shuffle.i88 = shufflevector <8 x i16> %108, <8 x i16> %109, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  store <4 x i16> %shuffle.i88, ptr %__ret.i87, align 8
+  %110 = load <4 x i16>, ptr %__ret.i87, align 8
+  store <4 x i32> %102, ptr %__p0.addr.i98, align 16
+  store <4 x i16> %106, ptr %__p1.addr.i99, align 8
+  store <4 x i16> %110, ptr %__p2.addr.i100, align 8
+  %111 = load <4 x i32>, ptr %__p0.addr.i98, align 16
+  %112 = bitcast <4 x i32> %111 to <16 x i8>
+  %113 = load <4 x i16>, ptr %__p1.addr.i99, align 8
+  %114 = bitcast <4 x i16> %113 to <8 x i8>
+  %115 = load <4 x i16>, ptr %__p2.addr.i100, align 8
+  %116 = bitcast <4 x i16> %115 to <8 x i8>
+  %vqdmlal2.i = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %113, <4 x i16> %115)
+  %vqdmlal_v3.i = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %111, <4 x i32> %vqdmlal2.i)
+  store <4 x i32> %vqdmlal_v3.i, ptr %__ret.i101, align 16
+  %117 = load <4 x i32>, ptr %__ret.i101, align 16
+  store <4 x i32> %117, ptr %__ret.i54, align 16
+  %118 = load <4 x i32>, ptr %__ret.i54, align 16
+  store <4 x i32> %118, ptr %hi32, align 16
+  %119 = load <4 x i32>, ptr %lo32, align 16
+  store <4 x i32> %119, ptr %__p0.addr.i64, align 16
+  %120 = load <4 x i32>, ptr %__p0.addr.i64, align 16
+  %121 = bitcast <4 x i32> %120 to <8 x i16>
+  store <8 x i16> %121, ptr %__ret.i65, align 16
+  %122 = load <8 x i16>, ptr %__ret.i65, align 16
+  %123 = load <4 x i32>, ptr %hi32, align 16
+  store <4 x i32> %123, ptr %__p0.addr.i62, align 16
+  %124 = load <4 x i32>, ptr %__p0.addr.i62, align 16
+  %125 = bitcast <4 x i32> %124 to <8 x i16>
+  store <8 x i16> %125, ptr %__ret.i63, align 16
+  %126 = load <8 x i16>, ptr %__ret.i63, align 16
+  store <8 x i16> %122, ptr %__p0.addr.i58, align 16
+  store <8 x i16> %126, ptr %__p1.addr.i59, align 16
+  %127 = load <8 x i16>, ptr %__p0.addr.i58, align 16
+  %128 = load <8 x i16>, ptr %__p1.addr.i59, align 16
+  %shuffle.i61 = shufflevector <8 x i16> %127, <8 x i16> %128, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+  store <8 x i16> %shuffle.i61, ptr %__ret.i60, align 16
+  %129 = load <8 x i16>, ptr %__ret.i60, align 16
+  ret <8 x i16> %129
+}
+
+; Function Attrs: nocallback nofree nounwind willreturn memory(argmem: readwrite)
+declare void @llvm.memcpy.p0.p0.i64(ptr noalias nocapture writeonly, ptr noalias nocapture readonly, i64, i1 immarg) #1
+
+; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none)
+declare <8 x i16> @llvm.aarch64.neon.sqneg.v8i16(<8 x i16>) #2
+
+; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none)
+declare <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16>, <4 x i16>) #2
+
+; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none)
+declare <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32>, <4 x i32>) #2
+
+attributes #0 = { mustprogress noinline uwtable vscale_range(1,16) "frame-pointer"="non-leaf" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+bf16,+bti,+ccidx,+complxnum,+crc,+dit,+dotprod,+flagm,+fp-armv8,+fullfp16,+jsconv,+lse,+neon,+pauth,+predres,+ras,+rcpc,+rdm,+sb,+ssbs,+sve,+sve2,+v8.1a,+v8.2a,+v8.3a,+v8.4a,+v8.5a,+v8a,+v9a,-fmv" }
+attributes #1 = { nocallback nofree nounwind willreturn memory(argmem: readwrite) }
+attributes #2 = { nocallback nofree nosync nounwind willreturn memory(none) }
+
+!llvm.module.flags = !{!0, !1, !2}
+!llvm.ident = !{!3}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{i32 7, !"uwtable", i32 2}
+!2 = !{i32 7, !"frame-pointer", i32 1}
+!3 = !{!"clang version 20.0.0git"}
+!4 = distinct !{!4, !5}
+!5 = !{!"llvm.loop.mustprogress"}
+;.
+; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META5:![0-9]+]]}
+; CHECK: [[META5]] = !{!"llvm.loop.mustprogress"}
+;.
diff --git a/llvm/test/Transforms/PhaseOrdering/AArch64/extra-unroll-simplifications.ll b/llvm/test/Transforms/PhaseOrdering/AArch64/extra-unroll-simplifications.ll
index 6c45442..13ea35a 100644
--- a/llvm/test/Transforms/PhaseOrdering/AArch64/extra-unroll-simplifications.ll
+++ b/llvm/test/Transforms/PhaseOrdering/AArch64/extra-unroll-simplifications.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
-; RUN: opt -passes='default<O3>' -S %s | FileCheck %s
+; RUN: opt -passes="default<O3>" -S %s | FileCheck %s
 
 target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
 target triple = "arm64-apple-macosx11.0.0"
diff --git a/llvm/test/Transforms/PhaseOrdering/AArch64/hoist-runtime-checks.ll b/llvm/test/Transforms/PhaseOrdering/AArch64/hoist-runtime-checks.ll
index b2d6455..a38413f 100644
--- a/llvm/test/Transforms/PhaseOrdering/AArch64/hoist-runtime-checks.ll
+++ b/llvm/test/Transforms/PhaseOrdering/AArch64/hoist-runtime-checks.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
-; RUN: opt -passes='default<O3>' -S %s | FileCheck %s
+; RUN: opt -passes="default<O3>" -S %s | FileCheck %s
 
 target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
 target triple = "arm64-apple-macosx11.0.0"
diff --git a/llvm/test/Transforms/PhaseOrdering/AArch64/hoisting-sinking-required-for-vectorization.ll b/llvm/test/Transforms/PhaseOrdering/AArch64/hoisting-sinking-required-for-vectorization.ll
index 82b1cf9..f583a61 100644
--- a/llvm/test/Transforms/PhaseOrdering/AArch64/hoisting-sinking-required-for-vectorization.ll
+++ b/llvm/test/Transforms/PhaseOrdering/AArch64/hoisting-sinking-required-for-vectorization.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -passes='default<O3>' -S %s | FileCheck %s
+; RUN: opt -passes="default<O3>" -S %s | FileCheck %s
 
 target triple = "arm64-apple-darwin"
 
diff --git a/llvm/test/Transforms/PhaseOrdering/AArch64/indvars-vectorization.ll b/llvm/test/Transforms/PhaseOrdering/AArch64/indvars-vectorization.ll
index 2f61c89..801a8a0 100644
--- a/llvm/test/Transforms/PhaseOrdering/AArch64/indvars-vectorization.ll
+++ b/llvm/test/Transforms/PhaseOrdering/AArch64/indvars-vectorization.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
-; RUN: opt -passes='default<O3>' -S -o - %s | FileCheck %s
+; RUN: opt -passes="default<O3>" -S -o - %s | FileCheck %s
 
 target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
 target triple = "arm64-apple-macosx14.0.0"
diff --git a/llvm/test/Transforms/PhaseOrdering/AArch64/loopflatten.ll b/llvm/test/Transforms/PhaseOrdering/AArch64/loopflatten.ll
index b14a36c..2703d23 100644
--- a/llvm/test/Transforms/PhaseOrdering/AArch64/loopflatten.ll
+++ b/llvm/test/Transforms/PhaseOrdering/AArch64/loopflatten.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -passes='default<O3>' -enable-loop-flatten -loop-flatten-cost-threshold=3 -S %s | FileCheck %s
+; RUN: opt -passes="default<O3>" -enable-loop-flatten -loop-flatten-cost-threshold=3 -S %s | FileCheck %s
 
 target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
 target triple = "aarch64"
diff --git a/llvm/test/Transforms/PhaseOrdering/AArch64/matrix-extract-insert.ll b/llvm/test/Transforms/PhaseOrdering/AArch64/matrix-extract-insert.ll
index 7fccfee..886e7a7 100644
--- a/llvm/test/Transforms/PhaseOrdering/AArch64/matrix-extract-insert.ll
+++ b/llvm/test/Transforms/PhaseOrdering/AArch64/matrix-extract-insert.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -passes='default<O3>' -enable-matrix -S %s | FileCheck %s
+; RUN: opt -passes="default<O3>" -enable-matrix -S %s | FileCheck %s
 
 target triple = "arm64-apple-ios"
 
diff --git a/llvm/test/Transforms/PhaseOrdering/AArch64/memcpy-constant-size.ll b/llvm/test/Transforms/PhaseOrdering/AArch64/memcpy-constant-size.ll
index 10b07ad..d340638 100644
--- a/llvm/test/Transforms/PhaseOrdering/AArch64/memcpy-constant-size.ll
+++ b/llvm/test/Transforms/PhaseOrdering/AArch64/memcpy-constant-size.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2
-; RUN: opt %s -mtriple=arm64-apple-macosx -passes='default<O3>' -inline-threshold=2 -inline-call-penalty=5 -S | FileCheck %s
+; RUN: opt %s -mtriple=arm64-apple-macosx -passes="default<O3>" -inline-threshold=2 -inline-call-penalty=5 -S | FileCheck %s
 
 declare i64 @llvm.objectsize.i64.p0(ptr, i1, i1, i1)
 declare ptr @__memcpy_chk(ptr, ptr, i64, i64)
diff --git a/llvm/test/Transforms/PhaseOrdering/ARM/arm_add_q7.ll b/llvm/test/Transforms/PhaseOrdering/ARM/arm_add_q7.ll
index b1d0c70..76d9d14 100644
--- a/llvm/test/Transforms/PhaseOrdering/ARM/arm_add_q7.ll
+++ b/llvm/test/Transforms/PhaseOrdering/ARM/arm_add_q7.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -passes='default<O3>' -S | FileCheck %s
+; RUN: opt < %s -passes="default<O3>" -S | FileCheck %s
 
 target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64"
 target triple = "thumbv8.1m.main-arm-none-eabi"
diff --git a/llvm/test/Transforms/PhaseOrdering/ARM/arm_fill_q7.ll b/llvm/test/Transforms/PhaseOrdering/ARM/arm_fill_q7.ll
index 5b7622b..2ab6f2b 100644
--- a/llvm/test/Transforms/PhaseOrdering/ARM/arm_fill_q7.ll
+++ b/llvm/test/Transforms/PhaseOrdering/ARM/arm_fill_q7.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt < %s -O3 -S                                        | FileCheck --check-prefix=OLDPM %s
-; RUN: opt < %s -passes='default<O3>' -S | FileCheck --check-prefix=NEWPM %s
+; RUN: opt < %s -passes="default<O3>" -S | FileCheck --check-prefix=NEWPM %s
 
 target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64"
 target triple = "thumbv6m-none-none-eabi"
diff --git a/llvm/test/Transforms/PhaseOrdering/ARM/arm_mean_q7.ll b/llvm/test/Transforms/PhaseOrdering/ARM/arm_mean_q7.ll
index aab787b..778f25f 100644
--- a/llvm/test/Transforms/PhaseOrdering/ARM/arm_mean_q7.ll
+++ b/llvm/test/Transforms/PhaseOrdering/ARM/arm_mean_q7.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -passes='default<O3>' -S | FileCheck %s
+; RUN: opt < %s -passes="default<O3>" -S | FileCheck %s
 
 target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64"
 target triple = "thumbv6m-none-none-eabi"
diff --git a/llvm/test/Transforms/PhaseOrdering/ARM/arm_mult_q15.ll b/llvm/test/Transforms/PhaseOrdering/ARM/arm_mult_q15.ll
index ef65101..9032c36 100644
--- a/llvm/test/Transforms/PhaseOrdering/ARM/arm_mult_q15.ll
+++ b/llvm/test/Transforms/PhaseOrdering/ARM/arm_mult_q15.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -passes='default<O3>' -S | FileCheck %s
+; RUN: opt < %s -passes="default<O3>" -S | FileCheck %s
 
 ; This test after a lot of cleanup should produce pick a tail-predicated 8x
 ; vector loop. The 8x will be more profitable, to pick a VQDMULH.s16 instruction.
@@ -15,7 +15,7 @@ define void @arm_mult_q15(ptr %pSrcA, ptr %pSrcB, ptr noalias %pDst, i32 %blockS
 ; CHECK-NEXT:    br i1 [[CMP_NOT2]], label [[WHILE_END:%.*]], label [[WHILE_BODY_PREHEADER:%.*]]
 ; CHECK:       while.body.preheader:
 ; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[BLOCKSIZE]], 8
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[WHILE_BODY_PREHEADER18:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[WHILE_BODY_PREHEADER15:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    [[N_VEC:%.*]] = and i32 [[BLOCKSIZE]], -8
 ; CHECK-NEXT:    [[IND_END:%.*]] = and i32 [[BLOCKSIZE]], 7
@@ -48,18 +48,18 @@ define void @arm_mult_q15(ptr %pSrcA, ptr %pSrcB, ptr noalias %pDst, i32 %blockS
 ; CHECK-NEXT:    br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i32 [[BLOCKSIZE]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[WHILE_END]], label [[WHILE_BODY_PREHEADER18]]
-; CHECK:       while.body.preheader18:
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[WHILE_END]], label [[WHILE_BODY_PREHEADER15]]
+; CHECK:       while.body.preheader15:
 ; CHECK-NEXT:    [[BLKCNT_06_PH:%.*]] = phi i32 [ [[BLOCKSIZE]], [[WHILE_BODY_PREHEADER]] ], [ [[IND_END]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    [[PSRCA_ADDR_05_PH:%.*]] = phi ptr [ [[PSRCA]], [[WHILE_BODY_PREHEADER]] ], [ [[IND_END7]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    [[PDST_ADDR_04_PH:%.*]] = phi ptr [ [[PDST]], [[WHILE_BODY_PREHEADER]] ], [ [[IND_END9]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    [[PSRCB_ADDR_03_PH:%.*]] = phi ptr [ [[PSRCB]], [[WHILE_BODY_PREHEADER]] ], [ [[IND_END11]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    br label [[WHILE_BODY:%.*]]
 ; CHECK:       while.body:
-; CHECK-NEXT:    [[BLKCNT_06:%.*]] = phi i32 [ [[DEC:%.*]], [[WHILE_BODY]] ], [ [[BLKCNT_06_PH]], [[WHILE_BODY_PREHEADER18]] ]
-; CHECK-NEXT:    [[PSRCA_ADDR_05:%.*]] = phi ptr [ [[INCDEC_PTR:%.*]], [[WHILE_BODY]] ], [ [[PSRCA_ADDR_05_PH]], [[WHILE_BODY_PREHEADER18]] ]
-; CHECK-NEXT:    [[PDST_ADDR_04:%.*]] = phi ptr [ [[INCDEC_PTR4:%.*]], [[WHILE_BODY]] ], [ [[PDST_ADDR_04_PH]], [[WHILE_BODY_PREHEADER18]] ]
-; CHECK-NEXT:    [[PSRCB_ADDR_03:%.*]] = phi ptr [ [[INCDEC_PTR1:%.*]], [[WHILE_BODY]] ], [ [[PSRCB_ADDR_03_PH]], [[WHILE_BODY_PREHEADER18]] ]
+; CHECK-NEXT:    [[BLKCNT_06:%.*]] = phi i32 [ [[DEC:%.*]], [[WHILE_BODY]] ], [ [[BLKCNT_06_PH]], [[WHILE_BODY_PREHEADER15]] ]
+; CHECK-NEXT:    [[PSRCA_ADDR_05:%.*]] = phi ptr [ [[INCDEC_PTR:%.*]], [[WHILE_BODY]] ], [ [[PSRCA_ADDR_05_PH]], [[WHILE_BODY_PREHEADER15]] ]
+; CHECK-NEXT:    [[PDST_ADDR_04:%.*]] = phi ptr [ [[INCDEC_PTR4:%.*]], [[WHILE_BODY]] ], [ [[PDST_ADDR_04_PH]], [[WHILE_BODY_PREHEADER15]] ]
+; CHECK-NEXT:    [[PSRCB_ADDR_03:%.*]] = phi ptr [ [[INCDEC_PTR1:%.*]], [[WHILE_BODY]] ], [ [[PSRCB_ADDR_03_PH]], [[WHILE_BODY_PREHEADER15]] ]
 ; CHECK-NEXT:    [[INCDEC_PTR]] = getelementptr inbounds nuw i8, ptr [[PSRCA_ADDR_05]], i32 2
 ; CHECK-NEXT:    [[TMP10:%.*]] = load i16, ptr [[PSRCA_ADDR_05]], align 2
 ; CHECK-NEXT:    [[CONV:%.*]] = sext i16 [[TMP10]] to i32
diff --git a/llvm/test/Transforms/PhaseOrdering/ARM/mve-floatreduce.ll b/llvm/test/Transforms/PhaseOrdering/ARM/mve-floatreduce.ll
index 6cbba5c..664953a 100644
--- a/llvm/test/Transforms/PhaseOrdering/ARM/mve-floatreduce.ll
+++ b/llvm/test/Transforms/PhaseOrdering/ARM/mve-floatreduce.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt < %s -O3 -S                   | FileCheck %s
-; RUN: opt < %s -passes='default<O3>' -S | FileCheck %s
+; RUN: opt < %s -passes="default<O3>" -S | FileCheck %s
 
 target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64"
 target triple = "thumbv8.1m.main-none-none-eabi"
diff --git a/llvm/test/Transforms/PhaseOrdering/X86/excessive-unrolling.ll b/llvm/test/Transforms/PhaseOrdering/X86/excessive-unrolling.ll
index 57a3d81..5a3742c 100644
--- a/llvm/test/Transforms/PhaseOrdering/X86/excessive-unrolling.ll
+++ b/llvm/test/Transforms/PhaseOrdering/X86/excessive-unrolling.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -passes='default<O3>' -unroll-runtime -S %s | FileCheck %s
+; RUN: opt -passes="default<O3>" -unroll-runtime -S %s | FileCheck %s
 
 target datalayout = "e-m:o-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-apple-macosx"
diff --git a/llvm/test/Transforms/PhaseOrdering/X86/hadd.ll b/llvm/test/Transforms/PhaseOrdering/X86/hadd.ll
index 798824b..a4aea02 100644
--- a/llvm/test/Transforms/PhaseOrdering/X86/hadd.ll
+++ b/llvm/test/Transforms/PhaseOrdering/X86/hadd.ll
@@ -1,10 +1,10 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -O3 -S -mtriple=x86_64-- -mcpu=x86-64    | FileCheck %s --check-prefixes=CHECK,SSE,SSE2
-; RUN: opt < %s -O3 -S -mtriple=x86_64-- -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE4
+; RUN: opt < %s -O3 -S -mtriple=x86_64-- -mcpu=x86-64    | FileCheck %s --check-prefixes=CHECK,SSE2
+; RUN: opt < %s -O3 -S -mtriple=x86_64-- -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=CHECK,SSE4
 ; RUN: opt < %s -O3 -S -mtriple=x86_64-- -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=CHECK,AVX,AVX2
 ; RUN: opt < %s -O3 -S -mtriple=x86_64-- -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=CHECK,AVX,AVX512
-; RUN: opt < %s -passes="default<O3>" -S -mtriple=x86_64-- -mcpu=x86-64    | FileCheck %s --check-prefixes=CHECK,SSE,SSE2
-; RUN: opt < %s -passes="default<O3>" -S -mtriple=x86_64-- -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE4
+; RUN: opt < %s -passes="default<O3>" -S -mtriple=x86_64-- -mcpu=x86-64    | FileCheck %s --check-prefixes=CHECK,SSE2
+; RUN: opt < %s -passes="default<O3>" -S -mtriple=x86_64-- -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=CHECK,SSE4
 ; RUN: opt < %s -passes="default<O3>" -S -mtriple=x86_64-- -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=CHECK,AVX,AVX2
 ; RUN: opt < %s -passes="default<O3>" -S -mtriple=x86_64-- -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=CHECK,AVX,AVX512
 
@@ -78,30 +78,16 @@ define <8 x i16> @add_v8i16_u1234567(<8 x i16> %a, <8 x i16> %b) {
 ; SSE2-NEXT:    ret <8 x i16> [[RESULT]]
 ;
 ; SSE4-LABEL: @add_v8i16_u1234567(
-; SSE4-NEXT:    [[SHIFT:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> poison, <8 x i32> <i32 poison, i32 poison, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE4-NEXT:    [[TMP1:%.*]] = add <8 x i16> [[A]], [[SHIFT]]
-; SSE4-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> poison, <8 x i32> <i32 5, i32 6, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE4-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> poison, <8 x i32> <i32 4, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE4-NEXT:    [[TMP4:%.*]] = add <8 x i16> [[TMP2]], [[TMP3]]
-; SSE4-NEXT:    [[HADD32:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP4]], <8 x i32> <i32 poison, i32 2, i32 8, i32 9, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE4-NEXT:    [[TMP5:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE4-NEXT:    [[TMP6:%.*]] = shufflevector <8 x i16> [[B]], <8 x i16> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE4-NEXT:    [[TMP5:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i32> <i32 poison, i32 2, i32 5, i32 6, i32 8, i32 10, i32 12, i32 14>
+; SSE4-NEXT:    [[TMP6:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32> <i32 poison, i32 3, i32 4, i32 7, i32 9, i32 11, i32 13, i32 15>
 ; SSE4-NEXT:    [[TMP7:%.*]] = add <8 x i16> [[TMP5]], [[TMP6]]
-; SSE4-NEXT:    [[RESULT:%.*]] = shufflevector <8 x i16> [[HADD32]], <8 x i16> [[TMP7]], <8 x i32> <i32 poison, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
-; SSE4-NEXT:    ret <8 x i16> [[RESULT]]
+; SSE4-NEXT:    ret <8 x i16> [[TMP7]]
 ;
 ; AVX-LABEL: @add_v8i16_u1234567(
-; AVX-NEXT:    [[SHIFT:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> poison, <8 x i32> <i32 poison, i32 poison, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX-NEXT:    [[TMP1:%.*]] = add <8 x i16> [[A]], [[SHIFT]]
-; AVX-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> poison, <8 x i32> <i32 5, i32 6, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> poison, <8 x i32> <i32 4, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX-NEXT:    [[TMP4:%.*]] = add <8 x i16> [[TMP2]], [[TMP3]]
-; AVX-NEXT:    [[HADD32:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP4]], <8 x i32> <i32 poison, i32 2, i32 8, i32 9, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX-NEXT:    [[TMP5:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX-NEXT:    [[TMP6:%.*]] = shufflevector <8 x i16> [[B]], <8 x i16> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX-NEXT:    [[TMP5:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i32> <i32 poison, i32 2, i32 5, i32 6, i32 8, i32 10, i32 12, i32 14>
+; AVX-NEXT:    [[TMP6:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32> <i32 poison, i32 3, i32 4, i32 7, i32 9, i32 11, i32 13, i32 15>
 ; AVX-NEXT:    [[TMP7:%.*]] = add <8 x i16> [[TMP5]], [[TMP6]]
-; AVX-NEXT:    [[RESULT:%.*]] = shufflevector <8 x i16> [[HADD32]], <8 x i16> [[TMP7]], <8 x i32> <i32 poison, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
-; AVX-NEXT:    ret <8 x i16> [[RESULT]]
+; AVX-NEXT:    ret <8 x i16> [[TMP7]]
 ;
   %a0 = extractelement <8 x i16> %a, i32 0
   %a1 = extractelement <8 x i16> %a, i32 1
@@ -172,13 +158,10 @@ define <4 x i32> @add_v4i32_0123(<4 x i32> %a, <4 x i32> %b) {
 
 define <4 x i32> @add_v4i32_u123(<4 x i32> %a, <4 x i32> %b) {
 ; CHECK-LABEL: @add_v4i32_u123(
-; CHECK-NEXT:    [[SHIFT:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> poison, <4 x i32> <i32 poison, i32 poison, i32 3, i32 poison>
-; CHECK-NEXT:    [[TMP1:%.*]] = add <4 x i32> [[A]], [[SHIFT]]
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> poison, <4 x i32> <i32 1, i32 2, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i32> [[B]], <4 x i32> poison, <4 x i32> <i32 0, i32 3, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> <i32 poison, i32 2, i32 5, i32 6>
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> <i32 poison, i32 3, i32 4, i32 7>
 ; CHECK-NEXT:    [[TMP4:%.*]] = add <4 x i32> [[TMP2]], [[TMP3]]
-; CHECK-NEXT:    [[RESULT1:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP4]], <4 x i32> <i32 poison, i32 2, i32 4, i32 5>
-; CHECK-NEXT:    ret <4 x i32> [[RESULT1]]
+; CHECK-NEXT:    ret <4 x i32> [[TMP4]]
 ;
   %a0 = extractelement <4 x i32> %a, i32 0
   %a1 = extractelement <4 x i32> %a, i32 1
@@ -202,13 +185,10 @@ define <4 x i32> @add_v4i32_u123(<4 x i32> %a, <4 x i32> %b) {
 
 define <4 x i32> @add_v4i32_0u23(<4 x i32> %a, <4 x i32> %b) {
 ; CHECK-LABEL: @add_v4i32_0u23(
-; CHECK-NEXT:    [[SHIFT:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> poison, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP1:%.*]] = add <4 x i32> [[A]], [[SHIFT]]
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> poison, <4 x i32> <i32 1, i32 2, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i32> [[B]], <4 x i32> poison, <4 x i32> <i32 0, i32 3, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> <i32 0, i32 poison, i32 5, i32 6>
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> <i32 1, i32 poison, i32 4, i32 7>
 ; CHECK-NEXT:    [[TMP4:%.*]] = add <4 x i32> [[TMP2]], [[TMP3]]
-; CHECK-NEXT:    [[RESULT1:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP4]], <4 x i32> <i32 0, i32 poison, i32 4, i32 5>
-; CHECK-NEXT:    ret <4 x i32> [[RESULT1]]
+; CHECK-NEXT:    ret <4 x i32> [[TMP4]]
 ;
   %a0 = extractelement <4 x i32> %a, i32 0
   %a1 = extractelement <4 x i32> %a, i32 1
@@ -232,40 +212,28 @@ define <4 x i32> @add_v4i32_0u23(<4 x i32> %a, <4 x i32> %b) {
 
 define <4 x i32> @add_v4i32_01u3(<4 x i32> %a, <4 x i32> %b) {
 ; SSE2-LABEL: @add_v4i32_01u3(
-; SSE2-NEXT:    [[SHIFT:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> poison, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
-; SSE2-NEXT:    [[TMP1:%.*]] = add <4 x i32> [[A]], [[SHIFT]]
-; SSE2-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B:%.*]], <4 x i32> <i32 2, i32 poison, i32 6, i32 poison>
-; SSE2-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> <i32 3, i32 poison, i32 7, i32 poison>
+; SSE2-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> <i32 0, i32 2, i32 poison, i32 6>
+; SSE2-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> <i32 1, i32 3, i32 poison, i32 7>
 ; SSE2-NEXT:    [[TMP4:%.*]] = add <4 x i32> [[TMP2]], [[TMP3]]
-; SSE2-NEXT:    [[RESULT1:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP4]], <4 x i32> <i32 0, i32 4, i32 poison, i32 6>
-; SSE2-NEXT:    ret <4 x i32> [[RESULT1]]
+; SSE2-NEXT:    ret <4 x i32> [[TMP4]]
 ;
 ; SSE4-LABEL: @add_v4i32_01u3(
-; SSE4-NEXT:    [[SHIFT:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> poison, <4 x i32> <i32 poison, i32 poison, i32 poison, i32 2>
-; SSE4-NEXT:    [[TMP1:%.*]] = add <4 x i32> [[SHIFT]], [[B]]
-; SSE4-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> poison, <4 x i32> <i32 1, i32 2, i32 poison, i32 poison>
-; SSE4-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> poison, <4 x i32> <i32 0, i32 3, i32 poison, i32 poison>
+; SSE4-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> <i32 1, i32 2, i32 poison, i32 6>
+; SSE4-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> <i32 0, i32 3, i32 poison, i32 7>
 ; SSE4-NEXT:    [[TMP4:%.*]] = add <4 x i32> [[TMP2]], [[TMP3]]
-; SSE4-NEXT:    [[RESULT:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP1]], <4 x i32> <i32 0, i32 1, i32 poison, i32 7>
-; SSE4-NEXT:    ret <4 x i32> [[RESULT]]
+; SSE4-NEXT:    ret <4 x i32> [[TMP4]]
 ;
 ; AVX2-LABEL: @add_v4i32_01u3(
-; AVX2-NEXT:    [[SHIFT:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> poison, <4 x i32> <i32 poison, i32 poison, i32 poison, i32 2>
-; AVX2-NEXT:    [[TMP1:%.*]] = add <4 x i32> [[SHIFT]], [[B]]
-; AVX2-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> poison, <4 x i32> <i32 1, i32 2, i32 poison, i32 poison>
-; AVX2-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> poison, <4 x i32> <i32 0, i32 3, i32 poison, i32 poison>
+; AVX2-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> <i32 1, i32 2, i32 poison, i32 6>
+; AVX2-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> <i32 0, i32 3, i32 poison, i32 7>
 ; AVX2-NEXT:    [[TMP4:%.*]] = add <4 x i32> [[TMP2]], [[TMP3]]
-; AVX2-NEXT:    [[RESULT:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP1]], <4 x i32> <i32 0, i32 1, i32 poison, i32 7>
-; AVX2-NEXT:    ret <4 x i32> [[RESULT]]
+; AVX2-NEXT:    ret <4 x i32> [[TMP4]]
 ;
 ; AVX512-LABEL: @add_v4i32_01u3(
-; AVX512-NEXT:    [[SHIFT:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> poison, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
-; AVX512-NEXT:    [[TMP1:%.*]] = add <4 x i32> [[A]], [[SHIFT]]
-; AVX512-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B:%.*]], <4 x i32> <i32 2, i32 poison, i32 6, i32 poison>
-; AVX512-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> <i32 3, i32 poison, i32 7, i32 poison>
+; AVX512-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> <i32 0, i32 2, i32 poison, i32 6>
+; AVX512-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> <i32 1, i32 3, i32 poison, i32 7>
 ; AVX512-NEXT:    [[TMP4:%.*]] = add <4 x i32> [[TMP2]], [[TMP3]]
-; AVX512-NEXT:    [[RESULT1:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP4]], <4 x i32> <i32 0, i32 4, i32 poison, i32 6>
-; AVX512-NEXT:    ret <4 x i32> [[RESULT1]]
+; AVX512-NEXT:    ret <4 x i32> [[TMP4]]
 ;
   %a0 = extractelement <4 x i32> %a, i32 0
   %a1 = extractelement <4 x i32> %a, i32 1
@@ -289,13 +257,10 @@ define <4 x i32> @add_v4i32_01u3(<4 x i32> %a, <4 x i32> %b) {
 
 define <4 x i32> @add_v4i32_012u(<4 x i32> %a, <4 x i32> %b) {
 ; CHECK-LABEL: @add_v4i32_012u(
-; CHECK-NEXT:    [[SHIFT:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> poison, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP1:%.*]] = add <4 x i32> [[A]], [[SHIFT]]
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B:%.*]], <4 x i32> <i32 2, i32 4, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> <i32 3, i32 5, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> <i32 0, i32 2, i32 4, i32 poison>
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> <i32 1, i32 3, i32 5, i32 poison>
 ; CHECK-NEXT:    [[TMP4:%.*]] = add <4 x i32> [[TMP2]], [[TMP3]]
-; CHECK-NEXT:    [[RESULT1:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP4]], <4 x i32> <i32 0, i32 4, i32 5, i32 poison>
-; CHECK-NEXT:    ret <4 x i32> [[RESULT1]]
+; CHECK-NEXT:    ret <4 x i32> [[TMP4]]
 ;
   %a0 = extractelement <4 x i32> %a, i32 0
   %a1 = extractelement <4 x i32> %a, i32 1
@@ -420,46 +385,30 @@ define <8 x i32> @add_v8i32_01234567(<8 x i32> %a, <8 x i32> %b) {
 
 define <8 x i32> @add_v8i32_01234u67(<8 x i32> %a, <8 x i32> %b) {
 ; SSE2-LABEL: @add_v8i32_01234u67(
-; SSE2-NEXT:    [[SHIFT:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> poison, <8 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 5, i32 poison, i32 poison, i32 poison>
-; SSE2-NEXT:    [[TMP1:%.*]] = add <8 x i32> [[A]], [[SHIFT]]
-; SSE2-NEXT:    [[TMP5:%.*]] = shufflevector <8 x i32> [[B:%.*]], <8 x i32> poison, <2 x i32> <i32 5, i32 6>
-; SSE2-NEXT:    [[TMP6:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> poison, <2 x i32> <i32 4, i32 7>
-; SSE2-NEXT:    [[TMP8:%.*]] = add <2 x i32> [[TMP5]], [[TMP6]]
-; SSE2-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE2-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <8 x i32> <i32 1, i32 3, i32 9, i32 11, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE2-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 4, i32 poison, i32 13, i32 14>
+; SSE2-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <8 x i32> <i32 1, i32 3, i32 9, i32 11, i32 5, i32 poison, i32 12, i32 15>
 ; SSE2-NEXT:    [[TMP4:%.*]] = add <8 x i32> [[TMP2]], [[TMP3]]
-; SSE2-NEXT:    [[HADD4:%.*]] = shufflevector <8 x i32> [[TMP4]], <8 x i32> [[TMP1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 poison, i32 poison, i32 poison>
-; SSE2-NEXT:    [[TMP7:%.*]] = shufflevector <2 x i32> [[TMP8]], <2 x i32> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE2-NEXT:    [[RESULT:%.*]] = shufflevector <8 x i32> [[HADD4]], <8 x i32> [[TMP7]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 poison, i32 8, i32 9>
-; SSE2-NEXT:    ret <8 x i32> [[RESULT]]
+; SSE2-NEXT:    ret <8 x i32> [[TMP4]]
 ;
 ; SSE4-LABEL: @add_v8i32_01234u67(
 ; SSE4-NEXT:    [[A4:%.*]] = extractelement <8 x i32> [[A:%.*]], i64 4
 ; SSE4-NEXT:    [[A5:%.*]] = extractelement <8 x i32> [[A]], i64 5
 ; SSE4-NEXT:    [[A45:%.*]] = add i32 [[A4]], [[A5]]
-; SSE4-NEXT:    [[TMP4:%.*]] = shufflevector <8 x i32> [[B:%.*]], <8 x i32> poison, <2 x i32> <i32 5, i32 6>
-; SSE4-NEXT:    [[TMP5:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> poison, <2 x i32> <i32 4, i32 7>
-; SSE4-NEXT:    [[TMP7:%.*]] = add <2 x i32> [[TMP4]], [[TMP5]]
-; SSE4-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE4-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 poison, i32 poison, i32 poison, i32 poison>
 ; SSE4-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <8 x i32> <i32 1, i32 3, i32 9, i32 11, i32 poison, i32 poison, i32 poison, i32 poison>
 ; SSE4-NEXT:    [[TMP3:%.*]] = add <8 x i32> [[TMP1]], [[TMP2]]
 ; SSE4-NEXT:    [[HADD4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[A45]], i64 4
-; SSE4-NEXT:    [[TMP6:%.*]] = shufflevector <2 x i32> [[TMP7]], <2 x i32> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE4-NEXT:    [[TMP4:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> poison, <8 x i32> <i32 5, i32 6, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE4-NEXT:    [[TMP5:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> poison, <8 x i32> <i32 4, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE4-NEXT:    [[TMP6:%.*]] = add <8 x i32> [[TMP4]], [[TMP5]]
 ; SSE4-NEXT:    [[RESULT:%.*]] = shufflevector <8 x i32> [[HADD4]], <8 x i32> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 poison, i32 8, i32 9>
 ; SSE4-NEXT:    ret <8 x i32> [[RESULT]]
 ;
 ; AVX-LABEL: @add_v8i32_01234u67(
-; AVX-NEXT:    [[SHIFT:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> poison, <8 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 5, i32 poison, i32 poison, i32 poison>
-; AVX-NEXT:    [[TMP1:%.*]] = add <8 x i32> [[A]], [[SHIFT]]
-; AVX-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <8 x i32> <i32 1, i32 3, i32 9, i32 11, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX-NEXT:    [[TMP4:%.*]] = add <8 x i32> [[TMP2]], [[TMP3]]
-; AVX-NEXT:    [[HADD4:%.*]] = shufflevector <8 x i32> [[TMP4]], <8 x i32> [[TMP1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 poison, i32 poison, i32 poison>
-; AVX-NEXT:    [[TMP5:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> poison, <8 x i32> <i32 5, i32 6, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX-NEXT:    [[TMP6:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> poison, <8 x i32> <i32 4, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX-NEXT:    [[TMP5:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 4, i32 poison, i32 13, i32 14>
+; AVX-NEXT:    [[TMP6:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <8 x i32> <i32 1, i32 3, i32 9, i32 11, i32 5, i32 poison, i32 12, i32 15>
 ; AVX-NEXT:    [[TMP7:%.*]] = add <8 x i32> [[TMP5]], [[TMP6]]
-; AVX-NEXT:    [[RESULT:%.*]] = shufflevector <8 x i32> [[HADD4]], <8 x i32> [[TMP7]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 poison, i32 8, i32 9>
-; AVX-NEXT:    ret <8 x i32> [[RESULT]]
+; AVX-NEXT:    ret <8 x i32> [[TMP7]]
 ;
   %a0 = extractelement <8 x i32> %a, i32 0
   %a1 = extractelement <8 x i32> %a, i32 1
@@ -530,13 +479,10 @@ define <4 x float> @add_v4f32_0123(<4 x float> %a, <4 x float> %b) {
 
 define <4 x float> @add_v4f32_u123(<4 x float> %a, <4 x float> %b) {
 ; CHECK-LABEL: @add_v4f32_u123(
-; CHECK-NEXT:    [[SHIFT:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> poison, <4 x i32> <i32 poison, i32 poison, i32 3, i32 poison>
-; CHECK-NEXT:    [[TMP1:%.*]] = fadd <4 x float> [[A]], [[SHIFT]]
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x float> [[B:%.*]], <4 x float> poison, <4 x i32> <i32 1, i32 2, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[B]], <4 x float> poison, <4 x i32> <i32 0, i32 3, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x i32> <i32 poison, i32 2, i32 5, i32 6>
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B]], <4 x i32> <i32 poison, i32 3, i32 4, i32 7>
 ; CHECK-NEXT:    [[TMP4:%.*]] = fadd <4 x float> [[TMP2]], [[TMP3]]
-; CHECK-NEXT:    [[RESULT1:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP4]], <4 x i32> <i32 poison, i32 2, i32 4, i32 5>
-; CHECK-NEXT:    ret <4 x float> [[RESULT1]]
+; CHECK-NEXT:    ret <4 x float> [[TMP4]]
 ;
   %a0 = extractelement <4 x float> %a, i32 0
   %a1 = extractelement <4 x float> %a, i32 1
@@ -560,13 +506,10 @@ define <4 x float> @add_v4f32_u123(<4 x float> %a, <4 x float> %b) {
 
 define <4 x float> @add_v4f32_0u23(<4 x float> %a, <4 x float> %b) {
 ; CHECK-LABEL: @add_v4f32_0u23(
-; CHECK-NEXT:    [[SHIFT:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> poison, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP1:%.*]] = fadd <4 x float> [[A]], [[SHIFT]]
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x float> [[B:%.*]], <4 x float> poison, <4 x i32> <i32 1, i32 2, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[B]], <4 x float> poison, <4 x i32> <i32 0, i32 3, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x i32> <i32 0, i32 poison, i32 5, i32 6>
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B]], <4 x i32> <i32 1, i32 poison, i32 4, i32 7>
 ; CHECK-NEXT:    [[TMP4:%.*]] = fadd <4 x float> [[TMP2]], [[TMP3]]
-; CHECK-NEXT:    [[RESULT1:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP4]], <4 x i32> <i32 0, i32 poison, i32 4, i32 5>
-; CHECK-NEXT:    ret <4 x float> [[RESULT1]]
+; CHECK-NEXT:    ret <4 x float> [[TMP4]]
 ;
   %a0 = extractelement <4 x float> %a, i32 0
   %a1 = extractelement <4 x float> %a, i32 1
@@ -589,41 +532,11 @@ define <4 x float> @add_v4f32_0u23(<4 x float> %a, <4 x float> %b) {
 }
 
 define <4 x float> @add_v4f32_01u3(<4 x float> %a, <4 x float> %b) {
-; SSE2-LABEL: @add_v4f32_01u3(
-; SSE2-NEXT:    [[SHIFT:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> poison, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
-; SSE2-NEXT:    [[TMP1:%.*]] = fadd <4 x float> [[A]], [[SHIFT]]
-; SSE2-NEXT:    [[TMP2:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B:%.*]], <4 x i32> <i32 2, i32 poison, i32 6, i32 poison>
-; SSE2-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B]], <4 x i32> <i32 3, i32 poison, i32 7, i32 poison>
-; SSE2-NEXT:    [[TMP4:%.*]] = fadd <4 x float> [[TMP2]], [[TMP3]]
-; SSE2-NEXT:    [[RESULT1:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP4]], <4 x i32> <i32 0, i32 4, i32 poison, i32 6>
-; SSE2-NEXT:    ret <4 x float> [[RESULT1]]
-;
-; SSE4-LABEL: @add_v4f32_01u3(
-; SSE4-NEXT:    [[SHIFT:%.*]] = shufflevector <4 x float> [[B:%.*]], <4 x float> poison, <4 x i32> <i32 poison, i32 poison, i32 poison, i32 2>
-; SSE4-NEXT:    [[TMP1:%.*]] = fadd <4 x float> [[SHIFT]], [[B]]
-; SSE4-NEXT:    [[TMP2:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> poison, <4 x i32> <i32 1, i32 2, i32 poison, i32 poison>
-; SSE4-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[A]], <4 x float> poison, <4 x i32> <i32 0, i32 3, i32 poison, i32 poison>
-; SSE4-NEXT:    [[TMP4:%.*]] = fadd <4 x float> [[TMP2]], [[TMP3]]
-; SSE4-NEXT:    [[RESULT:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP1]], <4 x i32> <i32 0, i32 1, i32 poison, i32 7>
-; SSE4-NEXT:    ret <4 x float> [[RESULT]]
-;
-; AVX2-LABEL: @add_v4f32_01u3(
-; AVX2-NEXT:    [[SHIFT:%.*]] = shufflevector <4 x float> [[B:%.*]], <4 x float> poison, <4 x i32> <i32 poison, i32 poison, i32 poison, i32 2>
-; AVX2-NEXT:    [[TMP1:%.*]] = fadd <4 x float> [[SHIFT]], [[B]]
-; AVX2-NEXT:    [[TMP2:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> poison, <4 x i32> <i32 1, i32 2, i32 poison, i32 poison>
-; AVX2-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[A]], <4 x float> poison, <4 x i32> <i32 0, i32 3, i32 poison, i32 poison>
-; AVX2-NEXT:    [[TMP4:%.*]] = fadd <4 x float> [[TMP2]], [[TMP3]]
-; AVX2-NEXT:    [[RESULT:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP1]], <4 x i32> <i32 0, i32 1, i32 poison, i32 7>
-; AVX2-NEXT:    ret <4 x float> [[RESULT]]
-;
-; AVX512-LABEL: @add_v4f32_01u3(
-; AVX512-NEXT:    [[SHIFT:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> poison, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
-; AVX512-NEXT:    [[TMP1:%.*]] = fadd <4 x float> [[A]], [[SHIFT]]
-; AVX512-NEXT:    [[TMP2:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B:%.*]], <4 x i32> <i32 2, i32 poison, i32 6, i32 poison>
-; AVX512-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B]], <4 x i32> <i32 3, i32 poison, i32 7, i32 poison>
-; AVX512-NEXT:    [[TMP4:%.*]] = fadd <4 x float> [[TMP2]], [[TMP3]]
-; AVX512-NEXT:    [[RESULT1:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP4]], <4 x i32> <i32 0, i32 4, i32 poison, i32 6>
-; AVX512-NEXT:    ret <4 x float> [[RESULT1]]
+; CHECK-LABEL: @add_v4f32_01u3(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x i32> <i32 0, i32 2, i32 poison, i32 6>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B]], <4 x i32> <i32 1, i32 3, i32 poison, i32 7>
+; CHECK-NEXT:    [[RESULT1:%.*]] = fadd <4 x float> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    ret <4 x float> [[RESULT1]]
 ;
   %a0 = extractelement <4 x float> %a, i32 0
   %a1 = extractelement <4 x float> %a, i32 1
@@ -647,39 +560,27 @@ define <4 x float> @add_v4f32_01u3(<4 x float> %a, <4 x float> %b) {
 
 define <4 x float> @add_v4f32_012u(<4 x float> %a, <4 x float> %b) {
 ; SSE2-LABEL: @add_v4f32_012u(
-; SSE2-NEXT:    [[SHIFT:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> poison, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
-; SSE2-NEXT:    [[TMP1:%.*]] = fadd <4 x float> [[A]], [[SHIFT]]
-; SSE2-NEXT:    [[TMP2:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B:%.*]], <4 x i32> <i32 2, i32 4, i32 poison, i32 poison>
-; SSE2-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B]], <4 x i32> <i32 3, i32 5, i32 poison, i32 poison>
-; SSE2-NEXT:    [[TMP4:%.*]] = fadd <4 x float> [[TMP2]], [[TMP3]]
-; SSE2-NEXT:    [[RESULT1:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP4]], <4 x i32> <i32 0, i32 4, i32 5, i32 poison>
+; SSE2-NEXT:    [[TMP1:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x i32> <i32 0, i32 2, i32 4, i32 poison>
+; SSE2-NEXT:    [[TMP2:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B]], <4 x i32> <i32 1, i32 3, i32 5, i32 poison>
+; SSE2-NEXT:    [[RESULT1:%.*]] = fadd <4 x float> [[TMP1]], [[TMP2]]
 ; SSE2-NEXT:    ret <4 x float> [[RESULT1]]
 ;
 ; SSE4-LABEL: @add_v4f32_012u(
-; SSE4-NEXT:    [[SHIFT:%.*]] = shufflevector <4 x float> [[B:%.*]], <4 x float> poison, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
-; SSE4-NEXT:    [[TMP1:%.*]] = fadd <4 x float> [[B]], [[SHIFT]]
-; SSE4-NEXT:    [[TMP2:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> poison, <4 x i32> <i32 1, i32 2, i32 poison, i32 poison>
-; SSE4-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[A]], <4 x float> poison, <4 x i32> <i32 0, i32 3, i32 poison, i32 poison>
+; SSE4-NEXT:    [[TMP2:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x i32> <i32 1, i32 2, i32 4, i32 poison>
+; SSE4-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B]], <4 x i32> <i32 0, i32 3, i32 5, i32 poison>
 ; SSE4-NEXT:    [[TMP4:%.*]] = fadd <4 x float> [[TMP2]], [[TMP3]]
-; SSE4-NEXT:    [[RESULT:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP1]], <4 x i32> <i32 0, i32 1, i32 4, i32 poison>
-; SSE4-NEXT:    ret <4 x float> [[RESULT]]
+; SSE4-NEXT:    ret <4 x float> [[TMP4]]
 ;
 ; AVX2-LABEL: @add_v4f32_012u(
-; AVX2-NEXT:    [[SHIFT:%.*]] = shufflevector <4 x float> [[B:%.*]], <4 x float> poison, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
-; AVX2-NEXT:    [[TMP1:%.*]] = fadd <4 x float> [[B]], [[SHIFT]]
-; AVX2-NEXT:    [[TMP2:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> poison, <4 x i32> <i32 1, i32 2, i32 poison, i32 poison>
-; AVX2-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[A]], <4 x float> poison, <4 x i32> <i32 0, i32 3, i32 poison, i32 poison>
+; AVX2-NEXT:    [[TMP2:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x i32> <i32 1, i32 2, i32 4, i32 poison>
+; AVX2-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B]], <4 x i32> <i32 0, i32 3, i32 5, i32 poison>
 ; AVX2-NEXT:    [[TMP4:%.*]] = fadd <4 x float> [[TMP2]], [[TMP3]]
-; AVX2-NEXT:    [[RESULT:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP1]], <4 x i32> <i32 0, i32 1, i32 4, i32 poison>
-; AVX2-NEXT:    ret <4 x float> [[RESULT]]
+; AVX2-NEXT:    ret <4 x float> [[TMP4]]
 ;
 ; AVX512-LABEL: @add_v4f32_012u(
-; AVX512-NEXT:    [[SHIFT:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> poison, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
-; AVX512-NEXT:    [[TMP1:%.*]] = fadd <4 x float> [[A]], [[SHIFT]]
-; AVX512-NEXT:    [[TMP2:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B:%.*]], <4 x i32> <i32 2, i32 4, i32 poison, i32 poison>
-; AVX512-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B]], <4 x i32> <i32 3, i32 5, i32 poison, i32 poison>
-; AVX512-NEXT:    [[TMP4:%.*]] = fadd <4 x float> [[TMP2]], [[TMP3]]
-; AVX512-NEXT:    [[RESULT1:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP4]], <4 x i32> <i32 0, i32 4, i32 5, i32 poison>
+; AVX512-NEXT:    [[TMP1:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x i32> <i32 0, i32 2, i32 4, i32 poison>
+; AVX512-NEXT:    [[TMP2:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B]], <4 x i32> <i32 1, i32 3, i32 5, i32 poison>
+; AVX512-NEXT:    [[RESULT1:%.*]] = fadd <4 x float> [[TMP1]], [[TMP2]]
 ; AVX512-NEXT:    ret <4 x float> [[RESULT1]]
 ;
   %a0 = extractelement <4 x float> %a, i32 0
@@ -804,33 +705,40 @@ define <8 x float> @add_v8f32_01234567(<8 x float> %a, <8 x float> %b) {
 }
 
 define <8 x float> @add_v8f32_012u4567(<8 x float> %a, <8 x float> %b) {
-; SSE-LABEL: @add_v8f32_012u4567(
-; SSE-NEXT:    [[A6:%.*]] = extractelement <8 x float> [[A:%.*]], i64 6
-; SSE-NEXT:    [[A7:%.*]] = extractelement <8 x float> [[A]], i64 7
-; SSE-NEXT:    [[A67:%.*]] = fadd float [[A6]], [[A7]]
-; SSE-NEXT:    [[TMP4:%.*]] = shufflevector <8 x float> [[B:%.*]], <8 x float> poison, <2 x i32> <i32 5, i32 6>
-; SSE-NEXT:    [[TMP5:%.*]] = shufflevector <8 x float> [[B]], <8 x float> poison, <2 x i32> <i32 4, i32 7>
-; SSE-NEXT:    [[TMP7:%.*]] = fadd <2 x float> [[TMP4]], [[TMP5]]
-; SSE-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <8 x i32> <i32 0, i32 2, i32 8, i32 poison, i32 4, i32 poison, i32 poison, i32 poison>
-; SSE-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <8 x i32> <i32 1, i32 3, i32 9, i32 poison, i32 5, i32 poison, i32 poison, i32 poison>
-; SSE-NEXT:    [[TMP3:%.*]] = fadd <8 x float> [[TMP1]], [[TMP2]]
-; SSE-NEXT:    [[HADD5:%.*]] = insertelement <8 x float> [[TMP3]], float [[A67]], i64 5
-; SSE-NEXT:    [[TMP6:%.*]] = shufflevector <2 x float> [[TMP7]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE-NEXT:    [[RESULT:%.*]] = shufflevector <8 x float> [[HADD5]], <8 x float> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 poison, i32 4, i32 5, i32 8, i32 9>
-; SSE-NEXT:    ret <8 x float> [[RESULT]]
+; SSE2-LABEL: @add_v8f32_012u4567(
+; SSE2-NEXT:    [[A6:%.*]] = extractelement <8 x float> [[A:%.*]], i64 6
+; SSE2-NEXT:    [[A7:%.*]] = extractelement <8 x float> [[A]], i64 7
+; SSE2-NEXT:    [[A67:%.*]] = fadd float [[A6]], [[A7]]
+; SSE2-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[B:%.*]], <8 x float> poison, <2 x i32> <i32 5, i32 6>
+; SSE2-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[B]], <8 x float> poison, <2 x i32> <i32 4, i32 7>
+; SSE2-NEXT:    [[TMP3:%.*]] = fadd <2 x float> [[TMP1]], [[TMP2]]
+; SSE2-NEXT:    [[TMP4:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <8 x i32> <i32 0, i32 2, i32 8, i32 poison, i32 4, i32 poison, i32 poison, i32 poison>
+; SSE2-NEXT:    [[TMP5:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <8 x i32> <i32 1, i32 3, i32 9, i32 poison, i32 5, i32 poison, i32 poison, i32 poison>
+; SSE2-NEXT:    [[TMP6:%.*]] = fadd <8 x float> [[TMP4]], [[TMP5]]
+; SSE2-NEXT:    [[HADD5:%.*]] = insertelement <8 x float> [[TMP6]], float [[A67]], i64 5
+; SSE2-NEXT:    [[TMP7:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE2-NEXT:    [[RESULT:%.*]] = shufflevector <8 x float> [[HADD5]], <8 x float> [[TMP7]], <8 x i32> <i32 0, i32 1, i32 2, i32 poison, i32 4, i32 5, i32 8, i32 9>
+; SSE2-NEXT:    ret <8 x float> [[RESULT]]
+;
+; SSE4-LABEL: @add_v8f32_012u4567(
+; SSE4-NEXT:    [[A6:%.*]] = extractelement <8 x float> [[A:%.*]], i64 6
+; SSE4-NEXT:    [[A7:%.*]] = extractelement <8 x float> [[A]], i64 7
+; SSE4-NEXT:    [[A67:%.*]] = fadd float [[A6]], [[A7]]
+; SSE4-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 8, i32 poison, i32 4, i32 poison, i32 poison, i32 poison>
+; SSE4-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <8 x i32> <i32 1, i32 3, i32 9, i32 poison, i32 5, i32 poison, i32 poison, i32 poison>
+; SSE4-NEXT:    [[TMP3:%.*]] = fadd <8 x float> [[TMP1]], [[TMP2]]
+; SSE4-NEXT:    [[HADD5:%.*]] = insertelement <8 x float> [[TMP3]], float [[A67]], i64 5
+; SSE4-NEXT:    [[TMP4:%.*]] = shufflevector <8 x float> [[B]], <8 x float> poison, <8 x i32> <i32 5, i32 6, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE4-NEXT:    [[TMP5:%.*]] = shufflevector <8 x float> [[B]], <8 x float> poison, <8 x i32> <i32 4, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE4-NEXT:    [[TMP6:%.*]] = fadd <8 x float> [[TMP4]], [[TMP5]]
+; SSE4-NEXT:    [[RESULT:%.*]] = shufflevector <8 x float> [[HADD5]], <8 x float> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 poison, i32 4, i32 5, i32 8, i32 9>
+; SSE4-NEXT:    ret <8 x float> [[RESULT]]
 ;
 ; AVX-LABEL: @add_v8f32_012u4567(
-; AVX-NEXT:    [[SHIFT:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> poison, <8 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 7, i32 poison>
-; AVX-NEXT:    [[TMP1:%.*]] = fadd <8 x float> [[A]], [[SHIFT]]
-; AVX-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 8, i32 poison, i32 4, i32 poison, i32 poison, i32 poison>
-; AVX-NEXT:    [[TMP3:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <8 x i32> <i32 1, i32 3, i32 9, i32 poison, i32 5, i32 poison, i32 poison, i32 poison>
-; AVX-NEXT:    [[TMP4:%.*]] = fadd <8 x float> [[TMP2]], [[TMP3]]
-; AVX-NEXT:    [[HADD5:%.*]] = shufflevector <8 x float> [[TMP4]], <8 x float> [[TMP1]], <8 x i32> <i32 0, i32 1, i32 2, i32 poison, i32 4, i32 14, i32 poison, i32 poison>
-; AVX-NEXT:    [[TMP5:%.*]] = shufflevector <8 x float> [[B]], <8 x float> poison, <8 x i32> <i32 5, i32 6, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX-NEXT:    [[TMP6:%.*]] = shufflevector <8 x float> [[B]], <8 x float> poison, <8 x i32> <i32 4, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX-NEXT:    [[TMP5:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 8, i32 poison, i32 4, i32 6, i32 13, i32 14>
+; AVX-NEXT:    [[TMP6:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <8 x i32> <i32 1, i32 3, i32 9, i32 poison, i32 5, i32 7, i32 12, i32 15>
 ; AVX-NEXT:    [[TMP7:%.*]] = fadd <8 x float> [[TMP5]], [[TMP6]]
-; AVX-NEXT:    [[RESULT:%.*]] = shufflevector <8 x float> [[HADD5]], <8 x float> [[TMP7]], <8 x i32> <i32 0, i32 1, i32 2, i32 poison, i32 4, i32 5, i32 8, i32 9>
-; AVX-NEXT:    ret <8 x float> [[RESULT]]
+; AVX-NEXT:    ret <8 x float> [[TMP7]]
 ;
   %a0 = extractelement <8 x float> %a, i32 0
   %a1 = extractelement <8 x float> %a, i32 1
@@ -983,13 +891,10 @@ define <4 x double> @add_v4f64_u123(<4 x double> %a, <4 x double> %b) {
 ; SSE4-NEXT:    ret <4 x double> [[RESULT]]
 ;
 ; AVX-LABEL: @add_v4f64_u123(
-; AVX-NEXT:    [[SHIFT:%.*]] = shufflevector <4 x double> [[B:%.*]], <4 x double> poison, <4 x i32> <i32 poison, i32 poison, i32 3, i32 poison>
-; AVX-NEXT:    [[TMP1:%.*]] = fadd <4 x double> [[B]], [[SHIFT]]
-; AVX-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[B]], <4 x double> [[A:%.*]], <4 x i32> <i32 poison, i32 0, i32 6, i32 poison>
-; AVX-NEXT:    [[TMP3:%.*]] = shufflevector <4 x double> [[B]], <4 x double> [[A]], <4 x i32> <i32 poison, i32 1, i32 7, i32 poison>
+; AVX-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[B:%.*]], <4 x double> [[A:%.*]], <4 x i32> <i32 poison, i32 0, i32 6, i32 2>
+; AVX-NEXT:    [[TMP3:%.*]] = shufflevector <4 x double> [[B]], <4 x double> [[A]], <4 x i32> <i32 poison, i32 1, i32 7, i32 3>
 ; AVX-NEXT:    [[TMP4:%.*]] = fadd <4 x double> [[TMP2]], [[TMP3]]
-; AVX-NEXT:    [[RESULT:%.*]] = shufflevector <4 x double> [[TMP4]], <4 x double> [[TMP1]], <4 x i32> <i32 poison, i32 1, i32 2, i32 6>
-; AVX-NEXT:    ret <4 x double> [[RESULT]]
+; AVX-NEXT:    ret <4 x double> [[TMP4]]
 ;
   %a0 = extractelement <4 x double> %a, i32 0
   %a1 = extractelement <4 x double> %a, i32 1
@@ -1034,13 +939,10 @@ define <4 x double> @add_v4f64_0u23(<4 x double> %a, <4 x double> %b) {
 ; SSE4-NEXT:    ret <4 x double> [[RESULT]]
 ;
 ; AVX-LABEL: @add_v4f64_0u23(
-; AVX-NEXT:    [[SHIFT:%.*]] = shufflevector <4 x double> [[B:%.*]], <4 x double> poison, <4 x i32> <i32 poison, i32 poison, i32 3, i32 poison>
-; AVX-NEXT:    [[TMP1:%.*]] = fadd <4 x double> [[B]], [[SHIFT]]
-; AVX-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> poison, <4 x i32> <i32 1, i32 poison, i32 2, i32 poison>
-; AVX-NEXT:    [[TMP3:%.*]] = shufflevector <4 x double> [[A]], <4 x double> poison, <4 x i32> <i32 0, i32 poison, i32 3, i32 poison>
+; AVX-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32> <i32 1, i32 poison, i32 2, i32 6>
+; AVX-NEXT:    [[TMP3:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> <i32 0, i32 poison, i32 3, i32 7>
 ; AVX-NEXT:    [[TMP4:%.*]] = fadd <4 x double> [[TMP2]], [[TMP3]]
-; AVX-NEXT:    [[RESULT:%.*]] = shufflevector <4 x double> [[TMP4]], <4 x double> [[TMP1]], <4 x i32> <i32 0, i32 poison, i32 2, i32 6>
-; AVX-NEXT:    ret <4 x double> [[RESULT]]
+; AVX-NEXT:    ret <4 x double> [[TMP4]]
 ;
   %a0 = extractelement <4 x double> %a, i32 0
   %a1 = extractelement <4 x double> %a, i32 1
@@ -1085,13 +987,10 @@ define <4 x double> @add_v4f64_01u3(<4 x double> %a, <4 x double> %b) {
 ; SSE4-NEXT:    ret <4 x double> [[RESULT]]
 ;
 ; AVX-LABEL: @add_v4f64_01u3(
-; AVX-NEXT:    [[SHIFT:%.*]] = shufflevector <4 x double> [[B:%.*]], <4 x double> poison, <4 x i32> <i32 poison, i32 poison, i32 3, i32 poison>
-; AVX-NEXT:    [[TMP1:%.*]] = fadd <4 x double> [[B]], [[SHIFT]]
-; AVX-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B]], <4 x i32> <i32 0, i32 4, i32 poison, i32 poison>
-; AVX-NEXT:    [[TMP3:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> <i32 1, i32 5, i32 poison, i32 poison>
+; AVX-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32> <i32 0, i32 4, i32 poison, i32 6>
+; AVX-NEXT:    [[TMP3:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> <i32 1, i32 5, i32 poison, i32 7>
 ; AVX-NEXT:    [[TMP4:%.*]] = fadd <4 x double> [[TMP2]], [[TMP3]]
-; AVX-NEXT:    [[RESULT:%.*]] = shufflevector <4 x double> [[TMP4]], <4 x double> [[TMP1]], <4 x i32> <i32 0, i32 1, i32 poison, i32 6>
-; AVX-NEXT:    ret <4 x double> [[RESULT]]
+; AVX-NEXT:    ret <4 x double> [[TMP4]]
 ;
   %a0 = extractelement <4 x double> %a, i32 0
   %a1 = extractelement <4 x double> %a, i32 1
@@ -1136,13 +1035,10 @@ define <4 x double> @add_v4f64_012u(<4 x double> %a, <4 x double> %b) {
 ; SSE4-NEXT:    ret <4 x double> [[RESULT]]
 ;
 ; AVX-LABEL: @add_v4f64_012u(
-; AVX-NEXT:    [[SHIFT:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> poison, <4 x i32> <i32 poison, i32 poison, i32 3, i32 poison>
-; AVX-NEXT:    [[TMP1:%.*]] = fadd <4 x double> [[A]], [[SHIFT]]
-; AVX-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B:%.*]], <4 x i32> <i32 0, i32 4, i32 poison, i32 poison>
-; AVX-NEXT:    [[TMP3:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> <i32 1, i32 5, i32 poison, i32 poison>
+; AVX-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32> <i32 0, i32 4, i32 2, i32 poison>
+; AVX-NEXT:    [[TMP3:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> <i32 1, i32 5, i32 3, i32 poison>
 ; AVX-NEXT:    [[TMP4:%.*]] = fadd <4 x double> [[TMP2]], [[TMP3]]
-; AVX-NEXT:    [[RESULT:%.*]] = shufflevector <4 x double> [[TMP4]], <4 x double> [[TMP1]], <4 x i32> <i32 0, i32 1, i32 6, i32 poison>
-; AVX-NEXT:    ret <4 x double> [[RESULT]]
+; AVX-NEXT:    ret <4 x double> [[TMP4]]
 ;
   %a0 = extractelement <4 x double> %a, i32 0
   %a1 = extractelement <4 x double> %a, i32 1
diff --git a/llvm/test/Transforms/PhaseOrdering/X86/hsub.ll b/llvm/test/Transforms/PhaseOrdering/X86/hsub.ll
new file mode 100644
index 0000000..bcb316a
--- /dev/null
+++ b/llvm/test/Transforms/PhaseOrdering/X86/hsub.ll
@@ -0,0 +1,1141 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -O3 -S -mtriple=x86_64-- -mcpu=x86-64    | FileCheck %s --check-prefixes=CHECK,SSE2
+; RUN: opt < %s -O3 -S -mtriple=x86_64-- -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=CHECK,SSE4
+; RUN: opt < %s -O3 -S -mtriple=x86_64-- -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=CHECK,AVX,AVX2
+; RUN: opt < %s -O3 -S -mtriple=x86_64-- -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=CHECK,AVX,AVX512
+; RUN: opt < %s -passes="default<O3>" -S -mtriple=x86_64-- -mcpu=x86-64    | FileCheck %s --check-prefixes=CHECK,SSE2
+; RUN: opt < %s -passes="default<O3>" -S -mtriple=x86_64-- -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=CHECK,SSE4
+; RUN: opt < %s -passes="default<O3>" -S -mtriple=x86_64-- -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=CHECK,AVX,AVX2
+; RUN: opt < %s -passes="default<O3>" -S -mtriple=x86_64-- -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=CHECK,AVX,AVX512
+
+; PR34072 - failure to canonicalize to (sub (shuffle a, b),(shuffle a, b)) for optimal horizontal sub patterns (with undemanded elements)
+
+;
+; v8i16
+;
+
+define <8 x i16> @sub_v8i16_01234567(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-LABEL: @sub_v8i16_01234567(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+; CHECK-NEXT:    [[TMP3:%.*]] = sub <8 x i16> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    ret <8 x i16> [[TMP3]]
+;
+  %a0 = extractelement <8 x i16> %a, i32 0
+  %a1 = extractelement <8 x i16> %a, i32 1
+  %a2 = extractelement <8 x i16> %a, i32 2
+  %a3 = extractelement <8 x i16> %a, i32 3
+  %a4 = extractelement <8 x i16> %a, i32 4
+  %a5 = extractelement <8 x i16> %a, i32 5
+  %a6 = extractelement <8 x i16> %a, i32 6
+  %a7 = extractelement <8 x i16> %a, i32 7
+  %a01 = sub i16 %a0, %a1
+  %a23 = sub i16 %a2, %a3
+  %a45 = sub i16 %a4, %a5
+  %a67 = sub i16 %a6, %a7
+  %b0 = extractelement <8 x i16> %b, i32 0
+  %b1 = extractelement <8 x i16> %b, i32 1
+  %b2 = extractelement <8 x i16> %b, i32 2
+  %b3 = extractelement <8 x i16> %b, i32 3
+  %b4 = extractelement <8 x i16> %b, i32 4
+  %b5 = extractelement <8 x i16> %b, i32 5
+  %b6 = extractelement <8 x i16> %b, i32 6
+  %b7 = extractelement <8 x i16> %b, i32 7
+  %b01 = sub i16 %b0, %b1
+  %b23 = sub i16 %b2, %b3
+  %b45 = sub i16 %b4, %b5
+  %b67 = sub i16 %b6, %b7
+  %hsub0 = insertelement <8 x i16> poison, i16 %a01, i32 0
+  %hsub1 = insertelement <8 x i16> %hsub0, i16 %a23, i32 1
+  %hsub2 = insertelement <8 x i16> %hsub1, i16 %a45, i32 2
+  %hsub3 = insertelement <8 x i16> %hsub2, i16 %a67, i32 3
+  %hsub4 = insertelement <8 x i16> %hsub3, i16 %b01, i32 4
+  %hsub5 = insertelement <8 x i16> %hsub4, i16 %b23, i32 5
+  %hsub6 = insertelement <8 x i16> %hsub5, i16 %b45, i32 6
+  %hsub7 = insertelement <8 x i16> %hsub6, i16 %b67, i32 7
+  %result = shufflevector <8 x i16> %hsub7, <8 x i16> %a, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  ret <8 x i16> %result
+}
+
+define <8 x i16> @sub_v8i16_u1234567(<8 x i16> %a, <8 x i16> %b) {
+; SSE2-LABEL: @sub_v8i16_u1234567(
+; SSE2-NEXT:    [[A2:%.*]] = extractelement <8 x i16> [[A:%.*]], i64 2
+; SSE2-NEXT:    [[A3:%.*]] = extractelement <8 x i16> [[A]], i64 3
+; SSE2-NEXT:    [[A4:%.*]] = extractelement <8 x i16> [[A]], i64 4
+; SSE2-NEXT:    [[A5:%.*]] = extractelement <8 x i16> [[A]], i64 5
+; SSE2-NEXT:    [[A6:%.*]] = extractelement <8 x i16> [[A]], i64 6
+; SSE2-NEXT:    [[A7:%.*]] = extractelement <8 x i16> [[A]], i64 7
+; SSE2-NEXT:    [[A23:%.*]] = sub i16 [[A2]], [[A3]]
+; SSE2-NEXT:    [[A45:%.*]] = sub i16 [[A4]], [[A5]]
+; SSE2-NEXT:    [[A67:%.*]] = sub i16 [[A6]], [[A7]]
+; SSE2-NEXT:    [[HSUB1:%.*]] = insertelement <8 x i16> poison, i16 [[A23]], i64 1
+; SSE2-NEXT:    [[HSUB2:%.*]] = insertelement <8 x i16> [[HSUB1]], i16 [[A45]], i64 2
+; SSE2-NEXT:    [[HSUB3:%.*]] = insertelement <8 x i16> [[HSUB2]], i16 [[A67]], i64 3
+; SSE2-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE2-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i16> [[B]], <8 x i16> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE2-NEXT:    [[TMP3:%.*]] = sub <8 x i16> [[TMP1]], [[TMP2]]
+; SSE2-NEXT:    [[RESULT:%.*]] = shufflevector <8 x i16> [[HSUB3]], <8 x i16> [[TMP3]], <8 x i32> <i32 poison, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
+; SSE2-NEXT:    ret <8 x i16> [[RESULT]]
+;
+; SSE4-LABEL: @sub_v8i16_u1234567(
+; SSE4-NEXT:    [[TMP5:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i32> <i32 poison, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+; SSE4-NEXT:    [[TMP6:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32> <i32 poison, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+; SSE4-NEXT:    [[TMP7:%.*]] = sub <8 x i16> [[TMP5]], [[TMP6]]
+; SSE4-NEXT:    ret <8 x i16> [[TMP7]]
+;
+; AVX-LABEL: @sub_v8i16_u1234567(
+; AVX-NEXT:    [[TMP5:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i32> <i32 poison, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+; AVX-NEXT:    [[TMP6:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32> <i32 poison, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+; AVX-NEXT:    [[TMP7:%.*]] = sub <8 x i16> [[TMP5]], [[TMP6]]
+; AVX-NEXT:    ret <8 x i16> [[TMP7]]
+;
+  %a0 = extractelement <8 x i16> %a, i32 0
+  %a1 = extractelement <8 x i16> %a, i32 1
+  %a2 = extractelement <8 x i16> %a, i32 2
+  %a3 = extractelement <8 x i16> %a, i32 3
+  %a4 = extractelement <8 x i16> %a, i32 4
+  %a5 = extractelement <8 x i16> %a, i32 5
+  %a6 = extractelement <8 x i16> %a, i32 6
+  %a7 = extractelement <8 x i16> %a, i32 7
+  %a01 = sub i16 %a0, %a1
+  %a23 = sub i16 %a2, %a3
+  %a45 = sub i16 %a4, %a5
+  %a67 = sub i16 %a6, %a7
+  %b0 = extractelement <8 x i16> %b, i32 0
+  %b1 = extractelement <8 x i16> %b, i32 1
+  %b2 = extractelement <8 x i16> %b, i32 2
+  %b3 = extractelement <8 x i16> %b, i32 3
+  %b4 = extractelement <8 x i16> %b, i32 4
+  %b5 = extractelement <8 x i16> %b, i32 5
+  %b6 = extractelement <8 x i16> %b, i32 6
+  %b7 = extractelement <8 x i16> %b, i32 7
+  %b01 = sub i16 %b0, %b1
+  %b23 = sub i16 %b2, %b3
+  %b45 = sub i16 %b4, %b5
+  %b67 = sub i16 %b6, %b7
+  %hsub0 = insertelement <8 x i16> poison, i16 %a01, i32 0
+  %hsub1 = insertelement <8 x i16> %hsub0, i16 %a23, i32 1
+  %hsub2 = insertelement <8 x i16> %hsub1, i16 %a45, i32 2
+  %hsub3 = insertelement <8 x i16> %hsub2, i16 %a67, i32 3
+  %hsub4 = insertelement <8 x i16> %hsub3, i16 %b01, i32 4
+  %hsub5 = insertelement <8 x i16> %hsub4, i16 %b23, i32 5
+  %hsub6 = insertelement <8 x i16> %hsub5, i16 %b45, i32 6
+  %hsub7 = insertelement <8 x i16> %hsub6, i16 %b67, i32 7
+  %result = shufflevector <8 x i16> %hsub7, <8 x i16> %a, <8 x i32> <i32 poison, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  ret <8 x i16> %result
+}
+
+;
+; v4i32
+;
+
+define <4 x i32> @sub_v4i32_0123(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: @sub_v4i32_0123(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; CHECK-NEXT:    [[TMP3:%.*]] = sub <4 x i32> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    ret <4 x i32> [[TMP3]]
+;
+  %a0 = extractelement <4 x i32> %a, i32 0
+  %a1 = extractelement <4 x i32> %a, i32 1
+  %a2 = extractelement <4 x i32> %a, i32 2
+  %a3 = extractelement <4 x i32> %a, i32 3
+  %a01 = sub i32 %a0, %a1
+  %a23 = sub i32 %a2, %a3
+  %b0 = extractelement <4 x i32> %b, i32 0
+  %b1 = extractelement <4 x i32> %b, i32 1
+  %b2 = extractelement <4 x i32> %b, i32 2
+  %b3 = extractelement <4 x i32> %b, i32 3
+  %b01 = sub i32 %b0, %b1
+  %b23 = sub i32 %b2, %b3
+  %hsub0 = insertelement <4 x i32> poison, i32 %a01, i32 0
+  %hsub1 = insertelement <4 x i32> %hsub0, i32 %a23, i32 1
+  %hsub2 = insertelement <4 x i32> %hsub1, i32 %b01, i32 2
+  %hsub3 = insertelement <4 x i32> %hsub2, i32 %b23, i32 3
+  %result = shufflevector <4 x i32> %hsub3, <4 x i32> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  ret <4 x i32> %result
+}
+
+define <4 x i32> @sub_v4i32_u123(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: @sub_v4i32_u123(
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> <i32 poison, i32 2, i32 4, i32 6>
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> <i32 poison, i32 3, i32 5, i32 7>
+; CHECK-NEXT:    [[TMP4:%.*]] = sub <4 x i32> [[TMP2]], [[TMP3]]
+; CHECK-NEXT:    ret <4 x i32> [[TMP4]]
+;
+  %a0 = extractelement <4 x i32> %a, i32 0
+  %a1 = extractelement <4 x i32> %a, i32 1
+  %a2 = extractelement <4 x i32> %a, i32 2
+  %a3 = extractelement <4 x i32> %a, i32 3
+  %a01 = sub i32 %a0, %a1
+  %a23 = sub i32 %a2, %a3
+  %b0 = extractelement <4 x i32> %b, i32 0
+  %b1 = extractelement <4 x i32> %b, i32 1
+  %b2 = extractelement <4 x i32> %b, i32 2
+  %b3 = extractelement <4 x i32> %b, i32 3
+  %b01 = sub i32 %b0, %b1
+  %b23 = sub i32 %b2, %b3
+  %hsub0 = insertelement <4 x i32> poison, i32 %a01, i32 0
+  %hsub1 = insertelement <4 x i32> %hsub0, i32 %a23, i32 1
+  %hsub2 = insertelement <4 x i32> %hsub1, i32 %b01, i32 2
+  %hsub3 = insertelement <4 x i32> %hsub2, i32 %b23, i32 3
+  %result = shufflevector <4 x i32> %hsub3, <4 x i32> %a, <4 x i32> <i32 poison, i32 1, i32 2, i32 3>
+  ret <4 x i32> %result
+}
+
+define <4 x i32> @sub_v4i32_0u23(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: @sub_v4i32_0u23(
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> <i32 0, i32 poison, i32 4, i32 6>
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> <i32 1, i32 poison, i32 5, i32 7>
+; CHECK-NEXT:    [[TMP4:%.*]] = sub <4 x i32> [[TMP2]], [[TMP3]]
+; CHECK-NEXT:    ret <4 x i32> [[TMP4]]
+;
+  %a0 = extractelement <4 x i32> %a, i32 0
+  %a1 = extractelement <4 x i32> %a, i32 1
+  %a2 = extractelement <4 x i32> %a, i32 2
+  %a3 = extractelement <4 x i32> %a, i32 3
+  %a01 = sub i32 %a0, %a1
+  %a23 = sub i32 %a2, %a3
+  %b0 = extractelement <4 x i32> %b, i32 0
+  %b1 = extractelement <4 x i32> %b, i32 1
+  %b2 = extractelement <4 x i32> %b, i32 2
+  %b3 = extractelement <4 x i32> %b, i32 3
+  %b01 = sub i32 %b0, %b1
+  %b23 = sub i32 %b2, %b3
+  %hsub0 = insertelement <4 x i32> poison, i32 %a01, i32 0
+  %hsub1 = insertelement <4 x i32> %hsub0, i32 %a23, i32 1
+  %hsub2 = insertelement <4 x i32> %hsub1, i32 %b01, i32 2
+  %hsub3 = insertelement <4 x i32> %hsub2, i32 %b23, i32 3
+  %result = shufflevector <4 x i32> %hsub3, <4 x i32> %a, <4 x i32> <i32 0, i32 poison, i32 2, i32 3>
+  ret <4 x i32> %result
+}
+
+define <4 x i32> @sub_v4i32_01u3(<4 x i32> %a, <4 x i32> %b) {
+; SSE2-LABEL: @sub_v4i32_01u3(
+; SSE2-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> <i32 0, i32 2, i32 poison, i32 6>
+; SSE2-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> <i32 1, i32 3, i32 poison, i32 7>
+; SSE2-NEXT:    [[TMP4:%.*]] = sub <4 x i32> [[TMP2]], [[TMP3]]
+; SSE2-NEXT:    ret <4 x i32> [[TMP4]]
+;
+; SSE4-LABEL: @sub_v4i32_01u3(
+; SSE4-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> <i32 0, i32 2, i32 poison, i32 6>
+; SSE4-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> <i32 1, i32 3, i32 poison, i32 7>
+; SSE4-NEXT:    [[TMP4:%.*]] = sub <4 x i32> [[TMP2]], [[TMP3]]
+; SSE4-NEXT:    ret <4 x i32> [[TMP4]]
+;
+; AVX2-LABEL: @sub_v4i32_01u3(
+; AVX2-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> <i32 0, i32 2, i32 poison, i32 6>
+; AVX2-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> <i32 1, i32 3, i32 poison, i32 7>
+; AVX2-NEXT:    [[TMP4:%.*]] = sub <4 x i32> [[TMP2]], [[TMP3]]
+; AVX2-NEXT:    ret <4 x i32> [[TMP4]]
+;
+; AVX512-LABEL: @sub_v4i32_01u3(
+; AVX512-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> <i32 0, i32 2, i32 poison, i32 6>
+; AVX512-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> <i32 1, i32 3, i32 poison, i32 7>
+; AVX512-NEXT:    [[TMP4:%.*]] = sub <4 x i32> [[TMP2]], [[TMP3]]
+; AVX512-NEXT:    ret <4 x i32> [[TMP4]]
+;
+  %a0 = extractelement <4 x i32> %a, i32 0
+  %a1 = extractelement <4 x i32> %a, i32 1
+  %a2 = extractelement <4 x i32> %a, i32 2
+  %a3 = extractelement <4 x i32> %a, i32 3
+  %a01 = sub i32 %a0, %a1
+  %a23 = sub i32 %a2, %a3
+  %b0 = extractelement <4 x i32> %b, i32 0
+  %b1 = extractelement <4 x i32> %b, i32 1
+  %b2 = extractelement <4 x i32> %b, i32 2
+  %b3 = extractelement <4 x i32> %b, i32 3
+  %b01 = sub i32 %b0, %b1
+  %b23 = sub i32 %b2, %b3
+  %hsub0 = insertelement <4 x i32> poison, i32 %a01, i32 0
+  %hsub1 = insertelement <4 x i32> %hsub0, i32 %a23, i32 1
+  %hsub2 = insertelement <4 x i32> %hsub1, i32 %b01, i32 2
+  %hsub3 = insertelement <4 x i32> %hsub2, i32 %b23, i32 3
+  %result = shufflevector <4 x i32> %hsub3, <4 x i32> %a, <4 x i32> <i32 0, i32 1, i32 poison, i32 3>
+  ret <4 x i32> %result
+}
+
+define <4 x i32> @sub_v4i32_012u(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: @sub_v4i32_012u(
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> <i32 0, i32 2, i32 4, i32 poison>
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> <i32 1, i32 3, i32 5, i32 poison>
+; CHECK-NEXT:    [[TMP4:%.*]] = sub <4 x i32> [[TMP2]], [[TMP3]]
+; CHECK-NEXT:    ret <4 x i32> [[TMP4]]
+;
+  %a0 = extractelement <4 x i32> %a, i32 0
+  %a1 = extractelement <4 x i32> %a, i32 1
+  %a2 = extractelement <4 x i32> %a, i32 2
+  %a3 = extractelement <4 x i32> %a, i32 3
+  %a01 = sub i32 %a0, %a1
+  %a23 = sub i32 %a2, %a3
+  %b0 = extractelement <4 x i32> %b, i32 0
+  %b1 = extractelement <4 x i32> %b, i32 1
+  %b2 = extractelement <4 x i32> %b, i32 2
+  %b3 = extractelement <4 x i32> %b, i32 3
+  %b01 = sub i32 %b0, %b1
+  %b23 = sub i32 %b2, %b3
+  %hsub0 = insertelement <4 x i32> poison, i32 %a01, i32 0
+  %hsub1 = insertelement <4 x i32> %hsub0, i32 %a23, i32 1
+  %hsub2 = insertelement <4 x i32> %hsub1, i32 %b01, i32 2
+  %hsub3 = insertelement <4 x i32> %hsub2, i32 %b23, i32 3
+  %result = shufflevector <4 x i32> %hsub3, <4 x i32> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
+  ret <4 x i32> %result
+}
+
+define <4 x i32> @sub_v4i32_uu23(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: @sub_v4i32_uu23(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> poison, <4 x i32> <i32 poison, i32 poison, i32 0, i32 2>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[B]], <4 x i32> poison, <4 x i32> <i32 poison, i32 poison, i32 1, i32 3>
+; CHECK-NEXT:    [[RESULT1:%.*]] = sub <4 x i32> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    ret <4 x i32> [[RESULT1]]
+;
+  %a0 = extractelement <4 x i32> %a, i32 0
+  %a1 = extractelement <4 x i32> %a, i32 1
+  %a2 = extractelement <4 x i32> %a, i32 2
+  %a3 = extractelement <4 x i32> %a, i32 3
+  %a01 = sub i32 %a0, %a1
+  %a23 = sub i32 %a2, %a3
+  %b0 = extractelement <4 x i32> %b, i32 0
+  %b1 = extractelement <4 x i32> %b, i32 1
+  %b2 = extractelement <4 x i32> %b, i32 2
+  %b3 = extractelement <4 x i32> %b, i32 3
+  %b01 = sub i32 %b0, %b1
+  %b23 = sub i32 %b2, %b3
+  %hsub0 = insertelement <4 x i32> poison, i32 %a01, i32 0
+  %hsub1 = insertelement <4 x i32> %hsub0, i32 %a23, i32 1
+  %hsub2 = insertelement <4 x i32> %hsub1, i32 %b01, i32 2
+  %hsub3 = insertelement <4 x i32> %hsub2, i32 %b23, i32 3
+  %result = shufflevector <4 x i32> %hsub3, <4 x i32> %a, <4 x i32> <i32 poison, i32 poison, i32 2, i32 3>
+  ret <4 x i32> %result
+}
+
+define <4 x i32> @sub_v4i32_01uu(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: @sub_v4i32_01uu(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> poison, <4 x i32> <i32 0, i32 2, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> poison, <4 x i32> <i32 1, i32 3, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP3:%.*]] = sub <4 x i32> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    ret <4 x i32> [[TMP3]]
+;
+  %a0 = extractelement <4 x i32> %a, i32 0
+  %a1 = extractelement <4 x i32> %a, i32 1
+  %a2 = extractelement <4 x i32> %a, i32 2
+  %a3 = extractelement <4 x i32> %a, i32 3
+  %a01 = sub i32 %a0, %a1
+  %a23 = sub i32 %a2, %a3
+  %b0 = extractelement <4 x i32> %b, i32 0
+  %b1 = extractelement <4 x i32> %b, i32 1
+  %b2 = extractelement <4 x i32> %b, i32 2
+  %b3 = extractelement <4 x i32> %b, i32 3
+  %b01 = sub i32 %b0, %b1
+  %b23 = sub i32 %b2, %b3
+  %hsub0 = insertelement <4 x i32> poison, i32 %a01, i32 0
+  %hsub1 = insertelement <4 x i32> %hsub0, i32 %a23, i32 1
+  %hsub2 = insertelement <4 x i32> %hsub1, i32 %b01, i32 2
+  %hsub3 = insertelement <4 x i32> %hsub2, i32 %b23, i32 3
+  %result = shufflevector <4 x i32> %hsub3, <4 x i32> %a, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+  ret <4 x i32> %result
+}
+
+;
+; v8i32
+;
+
+define <8 x i32> @sub_v8i32_01234567(<8 x i32> %a, <8 x i32> %b) {
+; CHECK-LABEL: @sub_v8i32_01234567(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 4, i32 6, i32 12, i32 14>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <8 x i32> <i32 1, i32 3, i32 9, i32 11, i32 5, i32 7, i32 13, i32 15>
+; CHECK-NEXT:    [[TMP3:%.*]] = sub <8 x i32> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    ret <8 x i32> [[TMP3]]
+;
+  %a0 = extractelement <8 x i32> %a, i32 0
+  %a1 = extractelement <8 x i32> %a, i32 1
+  %a2 = extractelement <8 x i32> %a, i32 2
+  %a3 = extractelement <8 x i32> %a, i32 3
+  %a4 = extractelement <8 x i32> %a, i32 4
+  %a5 = extractelement <8 x i32> %a, i32 5
+  %a6 = extractelement <8 x i32> %a, i32 6
+  %a7 = extractelement <8 x i32> %a, i32 7
+  %a01 = sub i32 %a0, %a1
+  %a23 = sub i32 %a2, %a3
+  %a45 = sub i32 %a4, %a5
+  %a67 = sub i32 %a6, %a7
+  %b0 = extractelement <8 x i32> %b, i32 0
+  %b1 = extractelement <8 x i32> %b, i32 1
+  %b2 = extractelement <8 x i32> %b, i32 2
+  %b3 = extractelement <8 x i32> %b, i32 3
+  %b4 = extractelement <8 x i32> %b, i32 4
+  %b5 = extractelement <8 x i32> %b, i32 5
+  %b6 = extractelement <8 x i32> %b, i32 6
+  %b7 = extractelement <8 x i32> %b, i32 7
+  %b01 = sub i32 %b0, %b1
+  %b23 = sub i32 %b2, %b3
+  %b45 = sub i32 %b4, %b5
+  %b67 = sub i32 %b6, %b7
+  %hsub0 = insertelement <8 x i32> poison, i32 %a01, i32 0
+  %hsub1 = insertelement <8 x i32> %hsub0, i32 %a23, i32 1
+  %hsub2 = insertelement <8 x i32> %hsub1, i32 %b01, i32 2
+  %hsub3 = insertelement <8 x i32> %hsub2, i32 %b23, i32 3
+  %hsub4 = insertelement <8 x i32> %hsub3, i32 %a45, i32 4
+  %hsub5 = insertelement <8 x i32> %hsub4, i32 %a67, i32 5
+  %hsub6 = insertelement <8 x i32> %hsub5, i32 %b45, i32 6
+  %hsub7 = insertelement <8 x i32> %hsub6, i32 %b67, i32 7
+  %result = shufflevector <8 x i32> %hsub7, <8 x i32> %a, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  ret <8 x i32> %result
+}
+
+define <8 x i32> @sub_v8i32_01234u67(<8 x i32> %a, <8 x i32> %b) {
+; SSE2-LABEL: @sub_v8i32_01234u67(
+; SSE2-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 4, i32 poison, i32 12, i32 14>
+; SSE2-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <8 x i32> <i32 1, i32 3, i32 9, i32 11, i32 5, i32 poison, i32 13, i32 15>
+; SSE2-NEXT:    [[TMP4:%.*]] = sub <8 x i32> [[TMP2]], [[TMP3]]
+; SSE2-NEXT:    ret <8 x i32> [[TMP4]]
+;
+; SSE4-LABEL: @sub_v8i32_01234u67(
+; SSE4-NEXT:    [[A4:%.*]] = extractelement <8 x i32> [[A:%.*]], i64 4
+; SSE4-NEXT:    [[A5:%.*]] = extractelement <8 x i32> [[A]], i64 5
+; SSE4-NEXT:    [[A45:%.*]] = sub i32 [[A4]], [[A5]]
+; SSE4-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE4-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <8 x i32> <i32 1, i32 3, i32 9, i32 11, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE4-NEXT:    [[TMP3:%.*]] = sub <8 x i32> [[TMP1]], [[TMP2]]
+; SSE4-NEXT:    [[HSUB4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[A45]], i64 4
+; SSE4-NEXT:    [[TMP4:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> poison, <8 x i32> <i32 4, i32 6, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE4-NEXT:    [[TMP5:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> poison, <8 x i32> <i32 5, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE4-NEXT:    [[TMP6:%.*]] = sub <8 x i32> [[TMP4]], [[TMP5]]
+; SSE4-NEXT:    [[RESULT:%.*]] = shufflevector <8 x i32> [[HSUB4]], <8 x i32> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 poison, i32 8, i32 9>
+; SSE4-NEXT:    ret <8 x i32> [[RESULT]]
+;
+; AVX-LABEL: @sub_v8i32_01234u67(
+; AVX-NEXT:    [[TMP5:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 4, i32 poison, i32 12, i32 14>
+; AVX-NEXT:    [[TMP6:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <8 x i32> <i32 1, i32 3, i32 9, i32 11, i32 5, i32 poison, i32 13, i32 15>
+; AVX-NEXT:    [[TMP7:%.*]] = sub <8 x i32> [[TMP5]], [[TMP6]]
+; AVX-NEXT:    ret <8 x i32> [[TMP7]]
+;
+  %a0 = extractelement <8 x i32> %a, i32 0
+  %a1 = extractelement <8 x i32> %a, i32 1
+  %a2 = extractelement <8 x i32> %a, i32 2
+  %a3 = extractelement <8 x i32> %a, i32 3
+  %a4 = extractelement <8 x i32> %a, i32 4
+  %a5 = extractelement <8 x i32> %a, i32 5
+  %a6 = extractelement <8 x i32> %a, i32 6
+  %a7 = extractelement <8 x i32> %a, i32 7
+  %a01 = sub i32 %a0, %a1
+  %a23 = sub i32 %a2, %a3
+  %a45 = sub i32 %a4, %a5
+  %a67 = sub i32 %a6, %a7
+  %b0 = extractelement <8 x i32> %b, i32 0
+  %b1 = extractelement <8 x i32> %b, i32 1
+  %b2 = extractelement <8 x i32> %b, i32 2
+  %b3 = extractelement <8 x i32> %b, i32 3
+  %b4 = extractelement <8 x i32> %b, i32 4
+  %b5 = extractelement <8 x i32> %b, i32 5
+  %b6 = extractelement <8 x i32> %b, i32 6
+  %b7 = extractelement <8 x i32> %b, i32 7
+  %b01 = sub i32 %b0, %b1
+  %b23 = sub i32 %b2, %b3
+  %b45 = sub i32 %b4, %b5
+  %b67 = sub i32 %b6, %b7
+  %hsub0 = insertelement <8 x i32> poison, i32 %a01, i32 0
+  %hsub1 = insertelement <8 x i32> %hsub0, i32 %a23, i32 1
+  %hsub2 = insertelement <8 x i32> %hsub1, i32 %b01, i32 2
+  %hsub3 = insertelement <8 x i32> %hsub2, i32 %b23, i32 3
+  %hsub4 = insertelement <8 x i32> %hsub3, i32 %a45, i32 4
+  %hsub5 = insertelement <8 x i32> %hsub4, i32 %a67, i32 5
+  %hsub6 = insertelement <8 x i32> %hsub5, i32 %b45, i32 6
+  %hsub7 = insertelement <8 x i32> %hsub6, i32 %b67, i32 7
+  %result = shufflevector <8 x i32> %hsub7, <8 x i32> %a, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 poison, i32 6, i32 7>
+  ret <8 x i32> %result
+}
+
+;
+; v4f32
+;
+
+define <4 x float> @sub_v4f32_0123(<4 x float> %a, <4 x float> %b) {
+; CHECK-LABEL: @sub_v4f32_0123(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B]], <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; CHECK-NEXT:    [[TMP3:%.*]] = fsub <4 x float> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    ret <4 x float> [[TMP3]]
+;
+  %a0 = extractelement <4 x float> %a, i32 0
+  %a1 = extractelement <4 x float> %a, i32 1
+  %a2 = extractelement <4 x float> %a, i32 2
+  %a3 = extractelement <4 x float> %a, i32 3
+  %a01 = fsub float %a0, %a1
+  %a23 = fsub float %a2, %a3
+  %b0 = extractelement <4 x float> %b, i32 0
+  %b1 = extractelement <4 x float> %b, i32 1
+  %b2 = extractelement <4 x float> %b, i32 2
+  %b3 = extractelement <4 x float> %b, i32 3
+  %b01 = fsub float %b0, %b1
+  %b23 = fsub float %b2, %b3
+  %hsub0 = insertelement <4 x float> poison, float %a01, i32 0
+  %hsub1 = insertelement <4 x float> %hsub0, float %a23, i32 1
+  %hsub2 = insertelement <4 x float> %hsub1, float %b01, i32 2
+  %hsub3 = insertelement <4 x float> %hsub2, float %b23, i32 3
+  %result = shufflevector <4 x float> %hsub3, <4 x float> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  ret <4 x float> %result
+}
+
+define <4 x float> @sub_v4f32_u123(<4 x float> %a, <4 x float> %b) {
+; CHECK-LABEL: @sub_v4f32_u123(
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x i32> <i32 poison, i32 2, i32 4, i32 6>
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B]], <4 x i32> <i32 poison, i32 3, i32 5, i32 7>
+; CHECK-NEXT:    [[TMP4:%.*]] = fsub <4 x float> [[TMP2]], [[TMP3]]
+; CHECK-NEXT:    ret <4 x float> [[TMP4]]
+;
+  %a0 = extractelement <4 x float> %a, i32 0
+  %a1 = extractelement <4 x float> %a, i32 1
+  %a2 = extractelement <4 x float> %a, i32 2
+  %a3 = extractelement <4 x float> %a, i32 3
+  %a01 = fsub float %a0, %a1
+  %a23 = fsub float %a2, %a3
+  %b0 = extractelement <4 x float> %b, i32 0
+  %b1 = extractelement <4 x float> %b, i32 1
+  %b2 = extractelement <4 x float> %b, i32 2
+  %b3 = extractelement <4 x float> %b, i32 3
+  %b01 = fsub float %b0, %b1
+  %b23 = fsub float %b2, %b3
+  %hsub0 = insertelement <4 x float> poison, float %a01, i32 0
+  %hsub1 = insertelement <4 x float> %hsub0, float %a23, i32 1
+  %hsub2 = insertelement <4 x float> %hsub1, float %b01, i32 2
+  %hsub3 = insertelement <4 x float> %hsub2, float %b23, i32 3
+  %result = shufflevector <4 x float> %hsub3, <4 x float> %a, <4 x i32> <i32 poison, i32 1, i32 2, i32 3>
+  ret <4 x float> %result
+}
+
+define <4 x float> @sub_v4f32_0u23(<4 x float> %a, <4 x float> %b) {
+; CHECK-LABEL: @sub_v4f32_0u23(
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x i32> <i32 0, i32 poison, i32 4, i32 6>
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B]], <4 x i32> <i32 1, i32 poison, i32 5, i32 7>
+; CHECK-NEXT:    [[TMP4:%.*]] = fsub <4 x float> [[TMP2]], [[TMP3]]
+; CHECK-NEXT:    ret <4 x float> [[TMP4]]
+;
+  %a0 = extractelement <4 x float> %a, i32 0
+  %a1 = extractelement <4 x float> %a, i32 1
+  %a2 = extractelement <4 x float> %a, i32 2
+  %a3 = extractelement <4 x float> %a, i32 3
+  %a01 = fsub float %a0, %a1
+  %a23 = fsub float %a2, %a3
+  %b0 = extractelement <4 x float> %b, i32 0
+  %b1 = extractelement <4 x float> %b, i32 1
+  %b2 = extractelement <4 x float> %b, i32 2
+  %b3 = extractelement <4 x float> %b, i32 3
+  %b01 = fsub float %b0, %b1
+  %b23 = fsub float %b2, %b3
+  %hsub0 = insertelement <4 x float> poison, float %a01, i32 0
+  %hsub1 = insertelement <4 x float> %hsub0, float %a23, i32 1
+  %hsub2 = insertelement <4 x float> %hsub1, float %b01, i32 2
+  %hsub3 = insertelement <4 x float> %hsub2, float %b23, i32 3
+  %result = shufflevector <4 x float> %hsub3, <4 x float> %a, <4 x i32> <i32 0, i32 poison, i32 2, i32 3>
+  ret <4 x float> %result
+}
+
+define <4 x float> @sub_v4f32_01u3(<4 x float> %a, <4 x float> %b) {
+; CHECK-LABEL: @sub_v4f32_01u3(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x i32> <i32 0, i32 2, i32 poison, i32 6>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B]], <4 x i32> <i32 1, i32 3, i32 poison, i32 7>
+; CHECK-NEXT:    [[RESULT1:%.*]] = fsub <4 x float> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    ret <4 x float> [[RESULT1]]
+;
+  %a0 = extractelement <4 x float> %a, i32 0
+  %a1 = extractelement <4 x float> %a, i32 1
+  %a2 = extractelement <4 x float> %a, i32 2
+  %a3 = extractelement <4 x float> %a, i32 3
+  %a01 = fsub float %a0, %a1
+  %a23 = fsub float %a2, %a3
+  %b0 = extractelement <4 x float> %b, i32 0
+  %b1 = extractelement <4 x float> %b, i32 1
+  %b2 = extractelement <4 x float> %b, i32 2
+  %b3 = extractelement <4 x float> %b, i32 3
+  %b01 = fsub float %b0, %b1
+  %b23 = fsub float %b2, %b3
+  %hsub0 = insertelement <4 x float> poison, float %a01, i32 0
+  %hsub1 = insertelement <4 x float> %hsub0, float %a23, i32 1
+  %hsub2 = insertelement <4 x float> %hsub1, float %b01, i32 2
+  %hsub3 = insertelement <4 x float> %hsub2, float %b23, i32 3
+  %result = shufflevector <4 x float> %hsub3, <4 x float> %a, <4 x i32> <i32 0, i32 1, i32 poison, i32 3>
+  ret <4 x float> %result
+}
+
+define <4 x float> @sub_v4f32_012u(<4 x float> %a, <4 x float> %b) {
+; SSE2-LABEL: @sub_v4f32_012u(
+; SSE2-NEXT:    [[TMP1:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x i32> <i32 0, i32 2, i32 4, i32 poison>
+; SSE2-NEXT:    [[TMP2:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B]], <4 x i32> <i32 1, i32 3, i32 5, i32 poison>
+; SSE2-NEXT:    [[RESULT1:%.*]] = fsub <4 x float> [[TMP1]], [[TMP2]]
+; SSE2-NEXT:    ret <4 x float> [[RESULT1]]
+;
+; SSE4-LABEL: @sub_v4f32_012u(
+; SSE4-NEXT:    [[TMP2:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x i32> <i32 0, i32 2, i32 4, i32 poison>
+; SSE4-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B]], <4 x i32> <i32 1, i32 3, i32 5, i32 poison>
+; SSE4-NEXT:    [[TMP4:%.*]] = fsub <4 x float> [[TMP2]], [[TMP3]]
+; SSE4-NEXT:    ret <4 x float> [[TMP4]]
+;
+; AVX2-LABEL: @sub_v4f32_012u(
+; AVX2-NEXT:    [[TMP2:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x i32> <i32 0, i32 2, i32 4, i32 poison>
+; AVX2-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B]], <4 x i32> <i32 1, i32 3, i32 5, i32 poison>
+; AVX2-NEXT:    [[TMP4:%.*]] = fsub <4 x float> [[TMP2]], [[TMP3]]
+; AVX2-NEXT:    ret <4 x float> [[TMP4]]
+;
+; AVX512-LABEL: @sub_v4f32_012u(
+; AVX512-NEXT:    [[TMP1:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x i32> <i32 0, i32 2, i32 4, i32 poison>
+; AVX512-NEXT:    [[TMP2:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B]], <4 x i32> <i32 1, i32 3, i32 5, i32 poison>
+; AVX512-NEXT:    [[RESULT1:%.*]] = fsub <4 x float> [[TMP1]], [[TMP2]]
+; AVX512-NEXT:    ret <4 x float> [[RESULT1]]
+;
+  %a0 = extractelement <4 x float> %a, i32 0
+  %a1 = extractelement <4 x float> %a, i32 1
+  %a2 = extractelement <4 x float> %a, i32 2
+  %a3 = extractelement <4 x float> %a, i32 3
+  %a01 = fsub float %a0, %a1
+  %a23 = fsub float %a2, %a3
+  %b0 = extractelement <4 x float> %b, i32 0
+  %b1 = extractelement <4 x float> %b, i32 1
+  %b2 = extractelement <4 x float> %b, i32 2
+  %b3 = extractelement <4 x float> %b, i32 3
+  %b01 = fsub float %b0, %b1
+  %b23 = fsub float %b2, %b3
+  %hsub0 = insertelement <4 x float> poison, float %a01, i32 0
+  %hsub1 = insertelement <4 x float> %hsub0, float %a23, i32 1
+  %hsub2 = insertelement <4 x float> %hsub1, float %b01, i32 2
+  %hsub3 = insertelement <4 x float> %hsub2, float %b23, i32 3
+  %result = shufflevector <4 x float> %hsub3, <4 x float> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
+  ret <4 x float> %result
+}
+
+define <4 x float> @sub_v4f32_uu23(<4 x float> %a, <4 x float> %b) {
+; CHECK-LABEL: @sub_v4f32_uu23(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x float> [[B:%.*]], <4 x float> poison, <4 x i32> <i32 poison, i32 poison, i32 0, i32 2>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x float> [[B]], <4 x float> poison, <4 x i32> <i32 poison, i32 poison, i32 1, i32 3>
+; CHECK-NEXT:    [[RESULT1:%.*]] = fsub <4 x float> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    ret <4 x float> [[RESULT1]]
+;
+  %a0 = extractelement <4 x float> %a, i32 0
+  %a1 = extractelement <4 x float> %a, i32 1
+  %a2 = extractelement <4 x float> %a, i32 2
+  %a3 = extractelement <4 x float> %a, i32 3
+  %a01 = fsub float %a0, %a1
+  %a23 = fsub float %a2, %a3
+  %b0 = extractelement <4 x float> %b, i32 0
+  %b1 = extractelement <4 x float> %b, i32 1
+  %b2 = extractelement <4 x float> %b, i32 2
+  %b3 = extractelement <4 x float> %b, i32 3
+  %b01 = fsub float %b0, %b1
+  %b23 = fsub float %b2, %b3
+  %hsub0 = insertelement <4 x float> poison, float %a01, i32 0
+  %hsub1 = insertelement <4 x float> %hsub0, float %a23, i32 1
+  %hsub2 = insertelement <4 x float> %hsub1, float %b01, i32 2
+  %hsub3 = insertelement <4 x float> %hsub2, float %b23, i32 3
+  %result = shufflevector <4 x float> %hsub3, <4 x float> %a, <4 x i32> <i32 poison, i32 poison, i32 2, i32 3>
+  ret <4 x float> %result
+}
+
+define <4 x float> @sub_v4f32_01uu(<4 x float> %a, <4 x float> %b) {
+; CHECK-LABEL: @sub_v4f32_01uu(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> poison, <4 x i32> <i32 0, i32 2, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x float> [[A]], <4 x float> poison, <4 x i32> <i32 1, i32 3, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP3:%.*]] = fsub <4 x float> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    ret <4 x float> [[TMP3]]
+;
+  %a0 = extractelement <4 x float> %a, i32 0
+  %a1 = extractelement <4 x float> %a, i32 1
+  %a2 = extractelement <4 x float> %a, i32 2
+  %a3 = extractelement <4 x float> %a, i32 3
+  %a01 = fsub float %a0, %a1
+  %a23 = fsub float %a2, %a3
+  %b0 = extractelement <4 x float> %b, i32 0
+  %b1 = extractelement <4 x float> %b, i32 1
+  %b2 = extractelement <4 x float> %b, i32 2
+  %b3 = extractelement <4 x float> %b, i32 3
+  %b01 = fsub float %b0, %b1
+  %b23 = fsub float %b2, %b3
+  %hsub0 = insertelement <4 x float> poison, float %a01, i32 0
+  %hsub1 = insertelement <4 x float> %hsub0, float %a23, i32 1
+  %hsub2 = insertelement <4 x float> %hsub1, float %b01, i32 2
+  %hsub3 = insertelement <4 x float> %hsub2, float %b23, i32 3
+  %result = shufflevector <4 x float> %hsub3, <4 x float> %a, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+  ret <4 x float> %result
+}
+
+;
+; v8f32
+;
+
+define <8 x float> @sub_v8f32_01234567(<8 x float> %a, <8 x float> %b) {
+; CHECK-LABEL: @sub_v8f32_01234567(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 4, i32 6, i32 12, i32 14>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <8 x i32> <i32 1, i32 3, i32 9, i32 11, i32 5, i32 7, i32 13, i32 15>
+; CHECK-NEXT:    [[TMP3:%.*]] = fsub <8 x float> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    ret <8 x float> [[TMP3]]
+;
+  %a0 = extractelement <8 x float> %a, i32 0
+  %a1 = extractelement <8 x float> %a, i32 1
+  %a2 = extractelement <8 x float> %a, i32 2
+  %a3 = extractelement <8 x float> %a, i32 3
+  %a4 = extractelement <8 x float> %a, i32 4
+  %a5 = extractelement <8 x float> %a, i32 5
+  %a6 = extractelement <8 x float> %a, i32 6
+  %a7 = extractelement <8 x float> %a, i32 7
+  %a01 = fsub float %a0, %a1
+  %a23 = fsub float %a2, %a3
+  %a45 = fsub float %a4, %a5
+  %a67 = fsub float %a6, %a7
+  %b0 = extractelement <8 x float> %b, i32 0
+  %b1 = extractelement <8 x float> %b, i32 1
+  %b2 = extractelement <8 x float> %b, i32 2
+  %b3 = extractelement <8 x float> %b, i32 3
+  %b4 = extractelement <8 x float> %b, i32 4
+  %b5 = extractelement <8 x float> %b, i32 5
+  %b6 = extractelement <8 x float> %b, i32 6
+  %b7 = extractelement <8 x float> %b, i32 7
+  %b01 = fsub float %b0, %b1
+  %b23 = fsub float %b2, %b3
+  %b45 = fsub float %b4, %b5
+  %b67 = fsub float %b6, %b7
+  %hsub0 = insertelement <8 x float> poison, float %a01, i32 0
+  %hsub1 = insertelement <8 x float> %hsub0, float %a23, i32 1
+  %hsub2 = insertelement <8 x float> %hsub1, float %b01, i32 2
+  %hsub3 = insertelement <8 x float> %hsub2, float %b23, i32 3
+  %hsub4 = insertelement <8 x float> %hsub3, float %a45, i32 4
+  %hsub5 = insertelement <8 x float> %hsub4, float %a67, i32 5
+  %hsub6 = insertelement <8 x float> %hsub5, float %b45, i32 6
+  %hsub7 = insertelement <8 x float> %hsub6, float %b67, i32 7
+  %result = shufflevector <8 x float> %hsub7, <8 x float> %a, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  ret <8 x float> %result
+}
+
+define <8 x float> @sub_v8f32_012u4567(<8 x float> %a, <8 x float> %b) {
+; SSE2-LABEL: @sub_v8f32_012u4567(
+; SSE2-NEXT:    [[A6:%.*]] = extractelement <8 x float> [[A:%.*]], i64 6
+; SSE2-NEXT:    [[A7:%.*]] = extractelement <8 x float> [[A]], i64 7
+; SSE2-NEXT:    [[A67:%.*]] = fsub float [[A6]], [[A7]]
+; SSE2-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[B:%.*]], <8 x float> poison, <2 x i32> <i32 4, i32 6>
+; SSE2-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[B]], <8 x float> poison, <2 x i32> <i32 5, i32 7>
+; SSE2-NEXT:    [[TMP3:%.*]] = fsub <2 x float> [[TMP1]], [[TMP2]]
+; SSE2-NEXT:    [[TMP4:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <8 x i32> <i32 0, i32 2, i32 8, i32 poison, i32 4, i32 poison, i32 poison, i32 poison>
+; SSE2-NEXT:    [[TMP5:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <8 x i32> <i32 1, i32 3, i32 9, i32 poison, i32 5, i32 poison, i32 poison, i32 poison>
+; SSE2-NEXT:    [[TMP6:%.*]] = fsub <8 x float> [[TMP4]], [[TMP5]]
+; SSE2-NEXT:    [[HSUB5:%.*]] = insertelement <8 x float> [[TMP6]], float [[A67]], i64 5
+; SSE2-NEXT:    [[TMP7:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE2-NEXT:    [[RESULT:%.*]] = shufflevector <8 x float> [[HSUB5]], <8 x float> [[TMP7]], <8 x i32> <i32 0, i32 1, i32 2, i32 poison, i32 4, i32 5, i32 8, i32 9>
+; SSE2-NEXT:    ret <8 x float> [[RESULT]]
+;
+; SSE4-LABEL: @sub_v8f32_012u4567(
+; SSE4-NEXT:    [[A6:%.*]] = extractelement <8 x float> [[A:%.*]], i64 6
+; SSE4-NEXT:    [[A7:%.*]] = extractelement <8 x float> [[A]], i64 7
+; SSE4-NEXT:    [[A67:%.*]] = fsub float [[A6]], [[A7]]
+; SSE4-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 8, i32 poison, i32 4, i32 poison, i32 poison, i32 poison>
+; SSE4-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <8 x i32> <i32 1, i32 3, i32 9, i32 poison, i32 5, i32 poison, i32 poison, i32 poison>
+; SSE4-NEXT:    [[TMP3:%.*]] = fsub <8 x float> [[TMP1]], [[TMP2]]
+; SSE4-NEXT:    [[HSUB5:%.*]] = insertelement <8 x float> [[TMP3]], float [[A67]], i64 5
+; SSE4-NEXT:    [[TMP4:%.*]] = shufflevector <8 x float> [[B]], <8 x float> poison, <8 x i32> <i32 4, i32 6, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE4-NEXT:    [[TMP5:%.*]] = shufflevector <8 x float> [[B]], <8 x float> poison, <8 x i32> <i32 5, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE4-NEXT:    [[TMP6:%.*]] = fsub <8 x float> [[TMP4]], [[TMP5]]
+; SSE4-NEXT:    [[RESULT:%.*]] = shufflevector <8 x float> [[HSUB5]], <8 x float> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 poison, i32 4, i32 5, i32 8, i32 9>
+; SSE4-NEXT:    ret <8 x float> [[RESULT]]
+;
+; AVX-LABEL: @sub_v8f32_012u4567(
+; AVX-NEXT:    [[TMP5:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 8, i32 poison, i32 4, i32 6, i32 12, i32 14>
+; AVX-NEXT:    [[TMP6:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <8 x i32> <i32 1, i32 3, i32 9, i32 poison, i32 5, i32 7, i32 13, i32 15>
+; AVX-NEXT:    [[TMP7:%.*]] = fsub <8 x float> [[TMP5]], [[TMP6]]
+; AVX-NEXT:    ret <8 x float> [[TMP7]]
+;
+  %a0 = extractelement <8 x float> %a, i32 0
+  %a1 = extractelement <8 x float> %a, i32 1
+  %a2 = extractelement <8 x float> %a, i32 2
+  %a3 = extractelement <8 x float> %a, i32 3
+  %a4 = extractelement <8 x float> %a, i32 4
+  %a5 = extractelement <8 x float> %a, i32 5
+  %a6 = extractelement <8 x float> %a, i32 6
+  %a7 = extractelement <8 x float> %a, i32 7
+  %a01 = fsub float %a0, %a1
+  %a23 = fsub float %a2, %a3
+  %a45 = fsub float %a4, %a5
+  %a67 = fsub float %a6, %a7
+  %b0 = extractelement <8 x float> %b, i32 0
+  %b1 = extractelement <8 x float> %b, i32 1
+  %b2 = extractelement <8 x float> %b, i32 2
+  %b3 = extractelement <8 x float> %b, i32 3
+  %b4 = extractelement <8 x float> %b, i32 4
+  %b5 = extractelement <8 x float> %b, i32 5
+  %b6 = extractelement <8 x float> %b, i32 6
+  %b7 = extractelement <8 x float> %b, i32 7
+  %b01 = fsub float %b0, %b1
+  %b23 = fsub float %b2, %b3
+  %b45 = fsub float %b4, %b5
+  %b67 = fsub float %b6, %b7
+  %hsub0 = insertelement <8 x float> poison, float %a01, i32 0
+  %hsub1 = insertelement <8 x float> %hsub0, float %a23, i32 1
+  %hsub2 = insertelement <8 x float> %hsub1, float %b01, i32 2
+  %hsub3 = insertelement <8 x float> %hsub2, float %b23, i32 3
+  %hsub4 = insertelement <8 x float> %hsub3, float %a45, i32 4
+  %hsub5 = insertelement <8 x float> %hsub4, float %a67, i32 5
+  %hsub6 = insertelement <8 x float> %hsub5, float %b45, i32 6
+  %hsub7 = insertelement <8 x float> %hsub6, float %b67, i32 7
+  %result = shufflevector <8 x float> %hsub7, <8 x float> %a, <8 x i32> <i32 0, i32 1, i32 2, i32 poison, i32 4, i32 5, i32 6, i32 7>
+  ret <8 x float> %result
+}
+
+;
+; v2f64
+;
+
+define <2 x double> @sub_v2f64_01(<2 x double> %a, <2 x double> %b) {
+; CHECK-LABEL: @sub_v2f64_01(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <2 x double> [[A:%.*]], <2 x double> [[B:%.*]], <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <2 x double> [[A]], <2 x double> [[B]], <2 x i32> <i32 1, i32 3>
+; CHECK-NEXT:    [[TMP3:%.*]] = fsub <2 x double> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    ret <2 x double> [[TMP3]]
+;
+  %a0 = extractelement <2 x double> %a, i32 0
+  %a1 = extractelement <2 x double> %a, i32 1
+  %a01 = fsub double %a0, %a1
+  %b0 = extractelement <2 x double> %b, i32 0
+  %b1 = extractelement <2 x double> %b, i32 1
+  %b01 = fsub double %b0, %b1
+  %hsub0 = insertelement <2 x double> poison, double %a01, i32 0
+  %hsub1 = insertelement <2 x double> %hsub0, double %b01, i32 1
+  %result = shufflevector <2 x double> %hsub1, <2 x double> %a, <2 x i32> <i32 0, i32 1>
+  ret <2 x double> %result
+}
+
+define <2 x double> @sub_v2f64_u1(<2 x double> %a, <2 x double> %b) {
+; CHECK-LABEL: @sub_v2f64_u1(
+; CHECK-NEXT:    [[SHIFT:%.*]] = shufflevector <2 x double> [[B:%.*]], <2 x double> poison, <2 x i32> <i32 1, i32 poison>
+; CHECK-NEXT:    [[TMP1:%.*]] = fsub <2 x double> [[B]], [[SHIFT]]
+; CHECK-NEXT:    [[RESULT:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> poison, <2 x i32> <i32 poison, i32 0>
+; CHECK-NEXT:    ret <2 x double> [[RESULT]]
+;
+  %a0 = extractelement <2 x double> %a, i32 0
+  %a1 = extractelement <2 x double> %a, i32 1
+  %a01 = fsub double %a0, %a1
+  %b0 = extractelement <2 x double> %b, i32 0
+  %b1 = extractelement <2 x double> %b, i32 1
+  %b01 = fsub double %b0, %b1
+  %hsub0 = insertelement <2 x double> poison, double %a01, i32 0
+  %hsub1 = insertelement <2 x double> %hsub0, double %b01, i32 1
+  %result = shufflevector <2 x double> %hsub1, <2 x double> %a, <2 x i32> <i32 poison, i32 1>
+  ret <2 x double> %result
+}
+
+define <2 x double> @sub_v2f64_0u(<2 x double> %a, <2 x double> %b) {
+; CHECK-LABEL: @sub_v2f64_0u(
+; CHECK-NEXT:    [[SHIFT:%.*]] = shufflevector <2 x double> [[A:%.*]], <2 x double> poison, <2 x i32> <i32 1, i32 poison>
+; CHECK-NEXT:    [[TMP1:%.*]] = fsub <2 x double> [[A]], [[SHIFT]]
+; CHECK-NEXT:    [[RESULT:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> poison, <2 x i32> <i32 0, i32 poison>
+; CHECK-NEXT:    ret <2 x double> [[RESULT]]
+;
+  %a0 = extractelement <2 x double> %a, i32 0
+  %a1 = extractelement <2 x double> %a, i32 1
+  %a01 = fsub double %a0, %a1
+  %b0 = extractelement <2 x double> %b, i32 0
+  %b1 = extractelement <2 x double> %b, i32 1
+  %b01 = fsub double %b0, %b1
+  %hsub0 = insertelement <2 x double> poison, double %a01, i32 0
+  %hsub1 = insertelement <2 x double> %hsub0, double %b01, i32 1
+  %result = shufflevector <2 x double> %hsub1, <2 x double> %a, <2 x i32> <i32 0, i32 poison>
+  ret <2 x double> %result
+}
+
+;
+; v4f64
+;
+
+define <4 x double> @sub_v4f64_0123(<4 x double> %a, <4 x double> %b) {
+; CHECK-LABEL: @sub_v4f64_0123(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+; CHECK-NEXT:    [[TMP3:%.*]] = fsub <4 x double> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    ret <4 x double> [[TMP3]]
+;
+  %a0 = extractelement <4 x double> %a, i32 0
+  %a1 = extractelement <4 x double> %a, i32 1
+  %a2 = extractelement <4 x double> %a, i32 2
+  %a3 = extractelement <4 x double> %a, i32 3
+  %a01 = fsub double %a0, %a1
+  %a23 = fsub double %a2, %a3
+  %b0 = extractelement <4 x double> %b, i32 0
+  %b1 = extractelement <4 x double> %b, i32 1
+  %b2 = extractelement <4 x double> %b, i32 2
+  %b3 = extractelement <4 x double> %b, i32 3
+  %b01 = fsub double %b0, %b1
+  %b23 = fsub double %b2, %b3
+  %hsub0 = insertelement <4 x double> poison, double %a01, i32 0
+  %hsub1 = insertelement <4 x double> %hsub0, double %b01, i32 1
+  %hsub2 = insertelement <4 x double> %hsub1, double %a23, i32 2
+  %hsub3 = insertelement <4 x double> %hsub2, double %b23, i32 3
+  %result = shufflevector <4 x double> %hsub3, <4 x double> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  ret <4 x double> %result
+}
+
+define <4 x double> @sub_v4f64_u123(<4 x double> %a, <4 x double> %b) {
+; SSE2-LABEL: @sub_v4f64_u123(
+; SSE2-NEXT:    [[B2:%.*]] = extractelement <4 x double> [[B:%.*]], i64 2
+; SSE2-NEXT:    [[B3:%.*]] = extractelement <4 x double> [[B]], i64 3
+; SSE2-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[B]], <4 x double> [[A:%.*]], <2 x i32> <i32 0, i32 6>
+; SSE2-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[B]], <4 x double> [[A]], <2 x i32> <i32 1, i32 7>
+; SSE2-NEXT:    [[TMP3:%.*]] = fsub <2 x double> [[TMP1]], [[TMP2]]
+; SSE2-NEXT:    [[B23:%.*]] = fsub double [[B2]], [[B3]]
+; SSE2-NEXT:    [[TMP4:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <4 x i32> <i32 poison, i32 0, i32 1, i32 poison>
+; SSE2-NEXT:    [[RESULT:%.*]] = insertelement <4 x double> [[TMP4]], double [[B23]], i64 3
+; SSE2-NEXT:    ret <4 x double> [[RESULT]]
+;
+; SSE4-LABEL: @sub_v4f64_u123(
+; SSE4-NEXT:    [[B2:%.*]] = extractelement <4 x double> [[B:%.*]], i64 2
+; SSE4-NEXT:    [[B3:%.*]] = extractelement <4 x double> [[B]], i64 3
+; SSE4-NEXT:    [[B23:%.*]] = fsub double [[B2]], [[B3]]
+; SSE4-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[B]], <4 x double> [[A:%.*]], <4 x i32> <i32 poison, i32 0, i32 6, i32 poison>
+; SSE4-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[B]], <4 x double> [[A]], <4 x i32> <i32 poison, i32 1, i32 7, i32 poison>
+; SSE4-NEXT:    [[TMP3:%.*]] = fsub <4 x double> [[TMP1]], [[TMP2]]
+; SSE4-NEXT:    [[RESULT:%.*]] = insertelement <4 x double> [[TMP3]], double [[B23]], i64 3
+; SSE4-NEXT:    ret <4 x double> [[RESULT]]
+;
+; AVX-LABEL: @sub_v4f64_u123(
+; AVX-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[B:%.*]], <4 x double> [[A:%.*]], <4 x i32> <i32 poison, i32 0, i32 6, i32 2>
+; AVX-NEXT:    [[TMP3:%.*]] = shufflevector <4 x double> [[B]], <4 x double> [[A]], <4 x i32> <i32 poison, i32 1, i32 7, i32 3>
+; AVX-NEXT:    [[TMP4:%.*]] = fsub <4 x double> [[TMP2]], [[TMP3]]
+; AVX-NEXT:    ret <4 x double> [[TMP4]]
+;
+  %a0 = extractelement <4 x double> %a, i32 0
+  %a1 = extractelement <4 x double> %a, i32 1
+  %a2 = extractelement <4 x double> %a, i32 2
+  %a3 = extractelement <4 x double> %a, i32 3
+  %a01 = fsub double %a0, %a1
+  %a23 = fsub double %a2, %a3
+  %b0 = extractelement <4 x double> %b, i32 0
+  %b1 = extractelement <4 x double> %b, i32 1
+  %b2 = extractelement <4 x double> %b, i32 2
+  %b3 = extractelement <4 x double> %b, i32 3
+  %b01 = fsub double %b0, %b1
+  %b23 = fsub double %b2, %b3
+  %hsub0 = insertelement <4 x double> poison, double %a01, i32 0
+  %hsub1 = insertelement <4 x double> %hsub0, double %b01, i32 1
+  %hsub2 = insertelement <4 x double> %hsub1, double %a23, i32 2
+  %hsub3 = insertelement <4 x double> %hsub2, double %b23, i32 3
+  %result = shufflevector <4 x double> %hsub3, <4 x double> %a, <4 x i32> <i32 poison, i32 1, i32 2, i32 3>
+  ret <4 x double> %result
+}
+
+define <4 x double> @sub_v4f64_0u23(<4 x double> %a, <4 x double> %b) {
+; SSE2-LABEL: @sub_v4f64_0u23(
+; SSE2-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> poison, <2 x i32> <i32 0, i32 2>
+; SSE2-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> poison, <2 x i32> <i32 1, i32 3>
+; SSE2-NEXT:    [[TMP3:%.*]] = fsub <2 x double> [[TMP1]], [[TMP2]]
+; SSE2-NEXT:    [[B2:%.*]] = extractelement <4 x double> [[B:%.*]], i64 2
+; SSE2-NEXT:    [[B3:%.*]] = extractelement <4 x double> [[B]], i64 3
+; SSE2-NEXT:    [[B23:%.*]] = fsub double [[B2]], [[B3]]
+; SSE2-NEXT:    [[TMP4:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <4 x i32> <i32 0, i32 poison, i32 1, i32 poison>
+; SSE2-NEXT:    [[RESULT:%.*]] = insertelement <4 x double> [[TMP4]], double [[B23]], i64 3
+; SSE2-NEXT:    ret <4 x double> [[RESULT]]
+;
+; SSE4-LABEL: @sub_v4f64_0u23(
+; SSE4-NEXT:    [[B2:%.*]] = extractelement <4 x double> [[B:%.*]], i64 2
+; SSE4-NEXT:    [[B3:%.*]] = extractelement <4 x double> [[B]], i64 3
+; SSE4-NEXT:    [[B23:%.*]] = fsub double [[B2]], [[B3]]
+; SSE4-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> poison, <4 x i32> <i32 0, i32 poison, i32 2, i32 poison>
+; SSE4-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> poison, <4 x i32> <i32 1, i32 poison, i32 3, i32 poison>
+; SSE4-NEXT:    [[TMP3:%.*]] = fsub <4 x double> [[TMP1]], [[TMP2]]
+; SSE4-NEXT:    [[RESULT:%.*]] = insertelement <4 x double> [[TMP3]], double [[B23]], i64 3
+; SSE4-NEXT:    ret <4 x double> [[RESULT]]
+;
+; AVX-LABEL: @sub_v4f64_0u23(
+; AVX-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32> <i32 0, i32 poison, i32 2, i32 6>
+; AVX-NEXT:    [[TMP3:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> <i32 1, i32 poison, i32 3, i32 7>
+; AVX-NEXT:    [[TMP4:%.*]] = fsub <4 x double> [[TMP2]], [[TMP3]]
+; AVX-NEXT:    ret <4 x double> [[TMP4]]
+;
+  %a0 = extractelement <4 x double> %a, i32 0
+  %a1 = extractelement <4 x double> %a, i32 1
+  %a2 = extractelement <4 x double> %a, i32 2
+  %a3 = extractelement <4 x double> %a, i32 3
+  %a01 = fsub double %a0, %a1
+  %a23 = fsub double %a2, %a3
+  %b0 = extractelement <4 x double> %b, i32 0
+  %b1 = extractelement <4 x double> %b, i32 1
+  %b2 = extractelement <4 x double> %b, i32 2
+  %b3 = extractelement <4 x double> %b, i32 3
+  %b01 = fsub double %b0, %b1
+  %b23 = fsub double %b2, %b3
+  %hsub0 = insertelement <4 x double> poison, double %a01, i32 0
+  %hsub1 = insertelement <4 x double> %hsub0, double %b01, i32 1
+  %hsub2 = insertelement <4 x double> %hsub1, double %a23, i32 2
+  %hsub3 = insertelement <4 x double> %hsub2, double %b23, i32 3
+  %result = shufflevector <4 x double> %hsub3, <4 x double> %a, <4 x i32> <i32 0, i32 poison, i32 2, i32 3>
+  ret <4 x double> %result
+}
+
+define <4 x double> @sub_v4f64_01u3(<4 x double> %a, <4 x double> %b) {
+; SSE2-LABEL: @sub_v4f64_01u3(
+; SSE2-NEXT:    [[B2:%.*]] = extractelement <4 x double> [[B:%.*]], i64 2
+; SSE2-NEXT:    [[B3:%.*]] = extractelement <4 x double> [[B]], i64 3
+; SSE2-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B]], <2 x i32> <i32 0, i32 4>
+; SSE2-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 1, i32 5>
+; SSE2-NEXT:    [[TMP3:%.*]] = fsub <2 x double> [[TMP1]], [[TMP2]]
+; SSE2-NEXT:    [[B23:%.*]] = fsub double [[B2]], [[B3]]
+; SSE2-NEXT:    [[TMP4:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; SSE2-NEXT:    [[RESULT:%.*]] = insertelement <4 x double> [[TMP4]], double [[B23]], i64 3
+; SSE2-NEXT:    ret <4 x double> [[RESULT]]
+;
+; SSE4-LABEL: @sub_v4f64_01u3(
+; SSE4-NEXT:    [[B2:%.*]] = extractelement <4 x double> [[B:%.*]], i64 2
+; SSE4-NEXT:    [[B3:%.*]] = extractelement <4 x double> [[B]], i64 3
+; SSE4-NEXT:    [[B23:%.*]] = fsub double [[B2]], [[B3]]
+; SSE4-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B]], <4 x i32> <i32 0, i32 4, i32 poison, i32 poison>
+; SSE4-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> <i32 1, i32 5, i32 poison, i32 poison>
+; SSE4-NEXT:    [[TMP3:%.*]] = fsub <4 x double> [[TMP1]], [[TMP2]]
+; SSE4-NEXT:    [[RESULT:%.*]] = insertelement <4 x double> [[TMP3]], double [[B23]], i64 3
+; SSE4-NEXT:    ret <4 x double> [[RESULT]]
+;
+; AVX-LABEL: @sub_v4f64_01u3(
+; AVX-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32> <i32 0, i32 4, i32 poison, i32 6>
+; AVX-NEXT:    [[TMP3:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> <i32 1, i32 5, i32 poison, i32 7>
+; AVX-NEXT:    [[TMP4:%.*]] = fsub <4 x double> [[TMP2]], [[TMP3]]
+; AVX-NEXT:    ret <4 x double> [[TMP4]]
+;
+  %a0 = extractelement <4 x double> %a, i32 0
+  %a1 = extractelement <4 x double> %a, i32 1
+  %a2 = extractelement <4 x double> %a, i32 2
+  %a3 = extractelement <4 x double> %a, i32 3
+  %a01 = fsub double %a0, %a1
+  %a23 = fsub double %a2, %a3
+  %b0 = extractelement <4 x double> %b, i32 0
+  %b1 = extractelement <4 x double> %b, i32 1
+  %b2 = extractelement <4 x double> %b, i32 2
+  %b3 = extractelement <4 x double> %b, i32 3
+  %b01 = fsub double %b0, %b1
+  %b23 = fsub double %b2, %b3
+  %hsub0 = insertelement <4 x double> poison, double %a01, i32 0
+  %hsub1 = insertelement <4 x double> %hsub0, double %b01, i32 1
+  %hsub2 = insertelement <4 x double> %hsub1, double %a23, i32 2
+  %hsub3 = insertelement <4 x double> %hsub2, double %b23, i32 3
+  %result = shufflevector <4 x double> %hsub3, <4 x double> %a, <4 x i32> <i32 0, i32 1, i32 poison, i32 3>
+  ret <4 x double> %result
+}
+
+define <4 x double> @sub_v4f64_012u(<4 x double> %a, <4 x double> %b) {
+; SSE2-LABEL: @sub_v4f64_012u(
+; SSE2-NEXT:    [[A2:%.*]] = extractelement <4 x double> [[A:%.*]], i64 2
+; SSE2-NEXT:    [[A3:%.*]] = extractelement <4 x double> [[A]], i64 3
+; SSE2-NEXT:    [[A23:%.*]] = fsub double [[A2]], [[A3]]
+; SSE2-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B:%.*]], <2 x i32> <i32 0, i32 4>
+; SSE2-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 1, i32 5>
+; SSE2-NEXT:    [[TMP3:%.*]] = fsub <2 x double> [[TMP1]], [[TMP2]]
+; SSE2-NEXT:    [[TMP4:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; SSE2-NEXT:    [[RESULT:%.*]] = insertelement <4 x double> [[TMP4]], double [[A23]], i64 2
+; SSE2-NEXT:    ret <4 x double> [[RESULT]]
+;
+; SSE4-LABEL: @sub_v4f64_012u(
+; SSE4-NEXT:    [[A2:%.*]] = extractelement <4 x double> [[A:%.*]], i64 2
+; SSE4-NEXT:    [[A3:%.*]] = extractelement <4 x double> [[A]], i64 3
+; SSE4-NEXT:    [[A23:%.*]] = fsub double [[A2]], [[A3]]
+; SSE4-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B:%.*]], <4 x i32> <i32 0, i32 4, i32 poison, i32 poison>
+; SSE4-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> <i32 1, i32 5, i32 poison, i32 poison>
+; SSE4-NEXT:    [[TMP3:%.*]] = fsub <4 x double> [[TMP1]], [[TMP2]]
+; SSE4-NEXT:    [[RESULT:%.*]] = insertelement <4 x double> [[TMP3]], double [[A23]], i64 2
+; SSE4-NEXT:    ret <4 x double> [[RESULT]]
+;
+; AVX-LABEL: @sub_v4f64_012u(
+; AVX-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32> <i32 0, i32 4, i32 2, i32 poison>
+; AVX-NEXT:    [[TMP3:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> <i32 1, i32 5, i32 3, i32 poison>
+; AVX-NEXT:    [[TMP4:%.*]] = fsub <4 x double> [[TMP2]], [[TMP3]]
+; AVX-NEXT:    ret <4 x double> [[TMP4]]
+;
+  %a0 = extractelement <4 x double> %a, i32 0
+  %a1 = extractelement <4 x double> %a, i32 1
+  %a2 = extractelement <4 x double> %a, i32 2
+  %a3 = extractelement <4 x double> %a, i32 3
+  %a01 = fsub double %a0, %a1
+  %a23 = fsub double %a2, %a3
+  %b0 = extractelement <4 x double> %b, i32 0
+  %b1 = extractelement <4 x double> %b, i32 1
+  %b2 = extractelement <4 x double> %b, i32 2
+  %b3 = extractelement <4 x double> %b, i32 3
+  %b01 = fsub double %b0, %b1
+  %b23 = fsub double %b2, %b3
+  %hsub0 = insertelement <4 x double> poison, double %a01, i32 0
+  %hsub1 = insertelement <4 x double> %hsub0, double %b01, i32 1
+  %hsub2 = insertelement <4 x double> %hsub1, double %a23, i32 2
+  %hsub3 = insertelement <4 x double> %hsub2, double %b23, i32 3
+  %result = shufflevector <4 x double> %hsub3, <4 x double> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
+  ret <4 x double> %result
+}
+
+define <4 x double> @sub_v4f64_uu23(<4 x double> %a, <4 x double> %b) {
+; SSE2-LABEL: @sub_v4f64_uu23(
+; SSE2-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <2 x i32> <i32 2, i32 6>
+; SSE2-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 3, i32 7>
+; SSE2-NEXT:    [[TMP3:%.*]] = fsub <2 x double> [[TMP1]], [[TMP2]]
+; SSE2-NEXT:    [[RESULT1:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <4 x i32> <i32 poison, i32 poison, i32 0, i32 1>
+; SSE2-NEXT:    ret <4 x double> [[RESULT1]]
+;
+; SSE4-LABEL: @sub_v4f64_uu23(
+; SSE4-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32> <i32 poison, i32 poison, i32 2, i32 6>
+; SSE4-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> <i32 poison, i32 poison, i32 3, i32 7>
+; SSE4-NEXT:    [[RESULT1:%.*]] = fsub <4 x double> [[TMP1]], [[TMP2]]
+; SSE4-NEXT:    ret <4 x double> [[RESULT1]]
+;
+; AVX-LABEL: @sub_v4f64_uu23(
+; AVX-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32> <i32 poison, i32 poison, i32 2, i32 6>
+; AVX-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> <i32 poison, i32 poison, i32 3, i32 7>
+; AVX-NEXT:    [[RESULT1:%.*]] = fsub <4 x double> [[TMP1]], [[TMP2]]
+; AVX-NEXT:    ret <4 x double> [[RESULT1]]
+;
+  %a0 = extractelement <4 x double> %a, i32 0
+  %a1 = extractelement <4 x double> %a, i32 1
+  %a2 = extractelement <4 x double> %a, i32 2
+  %a3 = extractelement <4 x double> %a, i32 3
+  %a01 = fsub double %a0, %a1
+  %a23 = fsub double %a2, %a3
+  %b0 = extractelement <4 x double> %b, i32 0
+  %b1 = extractelement <4 x double> %b, i32 1
+  %b2 = extractelement <4 x double> %b, i32 2
+  %b3 = extractelement <4 x double> %b, i32 3
+  %b01 = fsub double %b0, %b1
+  %b23 = fsub double %b2, %b3
+  %hsub0 = insertelement <4 x double> poison, double %a01, i32 0
+  %hsub1 = insertelement <4 x double> %hsub0, double %b01, i32 1
+  %hsub2 = insertelement <4 x double> %hsub1, double %a23, i32 2
+  %hsub3 = insertelement <4 x double> %hsub2, double %b23, i32 3
+  %result = shufflevector <4 x double> %hsub3, <4 x double> %a, <4 x i32> <i32 poison, i32 poison, i32 2, i32 3>
+  ret <4 x double> %result
+}
+
+define <4 x double> @sub_v4f64_01uu(<4 x double> %a, <4 x double> %b) {
+; SSE2-LABEL: @sub_v4f64_01uu(
+; SSE2-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <2 x i32> <i32 0, i32 4>
+; SSE2-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 1, i32 5>
+; SSE2-NEXT:    [[TMP3:%.*]] = fsub <2 x double> [[TMP1]], [[TMP2]]
+; SSE2-NEXT:    [[TMP4:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; SSE2-NEXT:    ret <4 x double> [[TMP4]]
+;
+; SSE4-LABEL: @sub_v4f64_01uu(
+; SSE4-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32> <i32 0, i32 4, i32 poison, i32 poison>
+; SSE4-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> <i32 1, i32 5, i32 poison, i32 poison>
+; SSE4-NEXT:    [[TMP3:%.*]] = fsub <4 x double> [[TMP1]], [[TMP2]]
+; SSE4-NEXT:    ret <4 x double> [[TMP3]]
+;
+; AVX-LABEL: @sub_v4f64_01uu(
+; AVX-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32> <i32 0, i32 4, i32 poison, i32 poison>
+; AVX-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> <i32 1, i32 5, i32 poison, i32 poison>
+; AVX-NEXT:    [[TMP3:%.*]] = fsub <4 x double> [[TMP1]], [[TMP2]]
+; AVX-NEXT:    ret <4 x double> [[TMP3]]
+;
+  %a0 = extractelement <4 x double> %a, i32 0
+  %a1 = extractelement <4 x double> %a, i32 1
+  %a2 = extractelement <4 x double> %a, i32 2
+  %a3 = extractelement <4 x double> %a, i32 3
+  %a01 = fsub double %a0, %a1
+  %a23 = fsub double %a2, %a3
+  %b0 = extractelement <4 x double> %b, i32 0
+  %b1 = extractelement <4 x double> %b, i32 1
+  %b2 = extractelement <4 x double> %b, i32 2
+  %b3 = extractelement <4 x double> %b, i32 3
+  %b01 = fsub double %b0, %b1
+  %b23 = fsub double %b2, %b3
+  %hsub0 = insertelement <4 x double> poison, double %a01, i32 0
+  %hsub1 = insertelement <4 x double> %hsub0, double %b01, i32 1
+  %hsub2 = insertelement <4 x double> %hsub1, double %a23, i32 2
+  %hsub3 = insertelement <4 x double> %hsub2, double %b23, i32 3
+  %result = shufflevector <4 x double> %hsub3, <4 x double> %a, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+  ret <4 x double> %result
+}
diff --git a/llvm/test/Transforms/PhaseOrdering/X86/pr50392.ll b/llvm/test/Transforms/PhaseOrdering/X86/pr50392.ll
index 4e1051d..d92df9741 100644
--- a/llvm/test/Transforms/PhaseOrdering/X86/pr50392.ll
+++ b/llvm/test/Transforms/PhaseOrdering/X86/pr50392.ll
@@ -31,13 +31,10 @@ define <4 x double> @PR50392(<4 x double> %a, <4 x double> %b) {
 ; SSE4-NEXT:    ret <4 x double> [[SHUFFLE]]
 ;
 ; AVX-LABEL: @PR50392(
-; AVX-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32> <i32 0, i32 poison, i32 4, i32 poison>
-; AVX-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> <i32 1, i32 poison, i32 5, i32 poison>
-; AVX-NEXT:    [[TMP3:%.*]] = fadd <4 x double> [[TMP1]], [[TMP2]]
-; AVX-NEXT:    [[SHIFT:%.*]] = shufflevector <4 x double> [[B]], <4 x double> poison, <4 x i32> <i32 poison, i32 poison, i32 3, i32 poison>
+; AVX-NEXT:    [[B:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B1:%.*]], <4 x i32> <i32 0, i32 poison, i32 4, i32 6>
+; AVX-NEXT:    [[SHIFT:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B1]], <4 x i32> <i32 1, i32 poison, i32 5, i32 7>
 ; AVX-NEXT:    [[TMP4:%.*]] = fadd <4 x double> [[B]], [[SHIFT]]
-; AVX-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x double> [[TMP3]], <4 x double> [[TMP4]], <4 x i32> <i32 0, i32 poison, i32 2, i32 6>
-; AVX-NEXT:    ret <4 x double> [[SHUFFLE]]
+; AVX-NEXT:    ret <4 x double> [[TMP4]]
 ;
   %vecext = extractelement <4 x double> %a, i32 0
   %vecext1 = extractelement <4 x double> %a, i32 1
diff --git a/llvm/test/Transforms/PhaseOrdering/X86/preserve-access-group.ll b/llvm/test/Transforms/PhaseOrdering/X86/preserve-access-group.ll
index fe5bba1a..7bb22e2 100644
--- a/llvm/test/Transforms/PhaseOrdering/X86/preserve-access-group.ll
+++ b/llvm/test/Transforms/PhaseOrdering/X86/preserve-access-group.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
-; RUN: opt -passes='default<O3>' -S %s | FileCheck %s
+; RUN: opt -passes="default<O3>" -S %s | FileCheck %s
 
 target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
diff --git a/llvm/test/Transforms/PhaseOrdering/bitfield-bittests.ll b/llvm/test/Transforms/PhaseOrdering/bitfield-bittests.ll
index 2843a7e..126be02 100644
--- a/llvm/test/Transforms/PhaseOrdering/bitfield-bittests.ll
+++ b/llvm/test/Transforms/PhaseOrdering/bitfield-bittests.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -O3 -S < %s                    | FileCheck %s
-; RUN: opt -passes='default<O3>' -S < %s  | FileCheck %s
+; RUN: opt -passes="default<O3>" -S < %s  | FileCheck %s
 
 ; These are tests that check for set/clear bits in a bitfield based on PR37098:
 ; https://bugs.llvm.org/show_bug.cgi?id=37098
diff --git a/llvm/test/Transforms/PhaseOrdering/dae-dce.ll b/llvm/test/Transforms/PhaseOrdering/dae-dce.ll
index 7ff3c5d..7cdddd1 100644
--- a/llvm/test/Transforms/PhaseOrdering/dae-dce.ll
+++ b/llvm/test/Transforms/PhaseOrdering/dae-dce.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -S -passes='default<O3>' < %s | FileCheck %s --check-prefixes=CHECK,DEFAULT
+; RUN: opt -S -passes="default<O3>" < %s | FileCheck %s --check-prefixes=CHECK,DEFAULT
 ; RUN: opt -S -passes='lto<O3>' < %s | FileCheck %s --check-prefixes=CHECK,LTO
 
 declare void @llvm.trap()
diff --git a/llvm/test/Transforms/PhaseOrdering/deletion-of-loops-that-became-side-effect-free.ll b/llvm/test/Transforms/PhaseOrdering/deletion-of-loops-that-became-side-effect-free.ll
index 689f4a9..641f216 100644
--- a/llvm/test/Transforms/PhaseOrdering/deletion-of-loops-that-became-side-effect-free.ll
+++ b/llvm/test/Transforms/PhaseOrdering/deletion-of-loops-that-became-side-effect-free.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -passes='default<O3>' -S < %s  | FileCheck %s --check-prefixes=ALL,O3
+; RUN: opt -passes="default<O3>" -S < %s  | FileCheck %s --check-prefixes=ALL,O3
 ; RUN: opt -passes='default<O2>' -S < %s  | FileCheck %s --check-prefixes=ALL,O2
 ; RUN: opt -passes='default<O1>' -S < %s  | FileCheck %s --check-prefixes=ALL,O1
 
diff --git a/llvm/test/Transforms/PhaseOrdering/globalaa-retained.ll b/llvm/test/Transforms/PhaseOrdering/globalaa-retained.ll
index 2139542..82a453d 100644
--- a/llvm/test/Transforms/PhaseOrdering/globalaa-retained.ll
+++ b/llvm/test/Transforms/PhaseOrdering/globalaa-retained.ll
@@ -1,4 +1,4 @@
-; RUN: opt -passes='default<O3>' -S < %s | FileCheck %s
+; RUN: opt -passes="default<O3>" -S < %s | FileCheck %s
 target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
 target triple = "aarch64"
 
diff --git a/llvm/test/Transforms/PhaseOrdering/instcombine-sroa-inttoptr.ll b/llvm/test/Transforms/PhaseOrdering/instcombine-sroa-inttoptr.ll
index ba6c36a..cc20233a 100644
--- a/llvm/test/Transforms/PhaseOrdering/instcombine-sroa-inttoptr.ll
+++ b/llvm/test/Transforms/PhaseOrdering/instcombine-sroa-inttoptr.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt < %s -O3 -S                                        | FileCheck %s
-; RUN: opt < %s -passes='default<O3>' -S | FileCheck %s
+; RUN: opt < %s -passes="default<O3>" -S | FileCheck %s
 
 ; This is based on the following most basic C++ code:
 ;
diff --git a/llvm/test/Transforms/PhaseOrdering/lifetime-sanitizer.ll b/llvm/test/Transforms/PhaseOrdering/lifetime-sanitizer.ll
index 21fa234..1239b18 100644
--- a/llvm/test/Transforms/PhaseOrdering/lifetime-sanitizer.ll
+++ b/llvm/test/Transforms/PhaseOrdering/lifetime-sanitizer.ll
@@ -5,7 +5,7 @@
 ; RUN: opt < %s -passes='default<O0>' -S | FileCheck %s --check-prefixes=CHECK,NOOPT
 ; RUN: opt < %s -passes='default<O1>' -S | FileCheck %s --check-prefixes=CHECK,OPT
 ; RUN: opt < %s -passes='default<O2>' -S | FileCheck %s --check-prefixes=CHECK,OPT
-; RUN: opt < %s -passes='default<O3>' -S | FileCheck %s --check-prefixes=CHECK,OPT
+; RUN: opt < %s -passes="default<O3>" -S | FileCheck %s --check-prefixes=CHECK,OPT
 
 declare void @llvm.lifetime.start.p0(i64, ptr nocapture)
 declare void @llvm.lifetime.end.p0(i64, ptr nocapture)
diff --git a/llvm/test/Transforms/PhaseOrdering/loop-rotation-vs-common-code-hoisting.ll b/llvm/test/Transforms/PhaseOrdering/loop-rotation-vs-common-code-hoisting.ll
index bae3e26..c6b5e5f 100644
--- a/llvm/test/Transforms/PhaseOrdering/loop-rotation-vs-common-code-hoisting.ll
+++ b/llvm/test/Transforms/PhaseOrdering/loop-rotation-vs-common-code-hoisting.ll
@@ -1,8 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -passes='default<O3>' -rotation-max-header-size=0 -S < %s  | FileCheck %s --check-prefix=HOIST
-; RUN: opt -passes='default<O3>' -rotation-max-header-size=1 -S < %s  | FileCheck %s --check-prefix=HOIST
-; RUN: opt -passes='default<O3>' -rotation-max-header-size=2 -S < %s  | FileCheck %s --check-prefix=ROTATE
-; RUN: opt -passes='default<O3>' -rotation-max-header-size=3 -S < %s  | FileCheck %s --check-prefix=ROTATE
+; RUN: opt -passes="default<O3>" -rotation-max-header-size=0 -S < %s  | FileCheck %s --check-prefix=HOIST
+; RUN: opt -passes="default<O3>" -rotation-max-header-size=1 -S < %s  | FileCheck %s --check-prefix=HOIST
+; RUN: opt -passes="default<O3>" -rotation-max-header-size=2 -S < %s  | FileCheck %s --check-prefix=ROTATE
+; RUN: opt -passes="default<O3>" -rotation-max-header-size=3 -S < %s  | FileCheck %s --check-prefix=ROTATE
 
 ; This example is produced from a very basic C code:
 ;
diff --git a/llvm/test/Transforms/PhaseOrdering/pr32544.ll b/llvm/test/Transforms/PhaseOrdering/pr32544.ll
index 421260b..135084d 100644
--- a/llvm/test/Transforms/PhaseOrdering/pr32544.ll
+++ b/llvm/test/Transforms/PhaseOrdering/pr32544.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -O3 -S < %s                    | FileCheck %s
-; RUN: opt -passes='default<O3>' -S < %s  | FileCheck %s
+; RUN: opt -passes="default<O3>" -S < %s  | FileCheck %s
 
 define void @foo(i1 %which, i32 %a, i32 %b, ptr %result) {
 ; CHECK-LABEL: @foo(
diff --git a/llvm/test/Transforms/PhaseOrdering/pr45682.ll b/llvm/test/Transforms/PhaseOrdering/pr45682.ll
index 2230580..46ee191 100644
--- a/llvm/test/Transforms/PhaseOrdering/pr45682.ll
+++ b/llvm/test/Transforms/PhaseOrdering/pr45682.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -O3 -S < %s                    | FileCheck %s
-; RUN: opt -passes='default<O3>' -S < %s  | FileCheck %s
+; RUN: opt -passes="default<O3>" -S < %s  | FileCheck %s
 
 define void @PR45682(i32 %x, i32 %y) {
 ; CHECK-LABEL: @PR45682(
diff --git a/llvm/test/Transforms/PhaseOrdering/pr62311.ll b/llvm/test/Transforms/PhaseOrdering/pr62311.ll
index 03276d8..027df7d 100644
--- a/llvm/test/Transforms/PhaseOrdering/pr62311.ll
+++ b/llvm/test/Transforms/PhaseOrdering/pr62311.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -passes='default<O3>' -S | FileCheck %s
+; RUN: opt < %s -passes="default<O3>" -S | FileCheck %s
 
 ; C++ version of test case
 ; #include <x86intrin.h>
diff --git a/llvm/test/Transforms/PhaseOrdering/pr95152.ll b/llvm/test/Transforms/PhaseOrdering/pr95152.ll
index 016460f..6941ea2 100644
--- a/llvm/test/Transforms/PhaseOrdering/pr95152.ll
+++ b/llvm/test/Transforms/PhaseOrdering/pr95152.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
-; RUN: opt -S -passes='default<O3>' < %s | FileCheck %s
+; RUN: opt -S -passes="default<O3>" < %s | FileCheck %s
 
 ; Make sure that interaction of "writable" with various passes does not
 ; result in the elimination of the store prior to @j().
diff --git a/llvm/test/Transforms/PhaseOrdering/rotate.ll b/llvm/test/Transforms/PhaseOrdering/rotate.ll
index 9ce1969..9179edc 100644
--- a/llvm/test/Transforms/PhaseOrdering/rotate.ll
+++ b/llvm/test/Transforms/PhaseOrdering/rotate.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -O3 -S < %s                    | FileCheck %s
-; RUN: opt -passes='default<O3>' -S < %s  | FileCheck %s
+; RUN: opt -passes="default<O3>" -S < %s  | FileCheck %s
 
 ; This should become a single funnel shift through a combination
 ; of aggressive-instcombine, simplifycfg, and instcombine.
diff --git a/llvm/test/Transforms/PhaseOrdering/simplifycfg-switch-lowering-vs-correlatedpropagation.ll b/llvm/test/Transforms/PhaseOrdering/simplifycfg-switch-lowering-vs-correlatedpropagation.ll
index 03df138..9da46bd 100644
--- a/llvm/test/Transforms/PhaseOrdering/simplifycfg-switch-lowering-vs-correlatedpropagation.ll
+++ b/llvm/test/Transforms/PhaseOrdering/simplifycfg-switch-lowering-vs-correlatedpropagation.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -passes='default<O1>' -S < %s | FileCheck %s
 ; RUN: opt -passes='default<O2>' -S < %s | FileCheck %s
-; RUN: opt -passes='default<O3>' -S < %s | FileCheck %s
+; RUN: opt -passes="default<O3>" -S < %s | FileCheck %s
 
 ; We are worse at propagating correlation facts when in select form
 ; as compared to the PHI form, so if we lower switches to early,
diff --git a/llvm/test/Transforms/PhaseOrdering/switch-sext.ll b/llvm/test/Transforms/PhaseOrdering/switch-sext.ll
index 0e352ba..3fbb02d 100644
--- a/llvm/test/Transforms/PhaseOrdering/switch-sext.ll
+++ b/llvm/test/Transforms/PhaseOrdering/switch-sext.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
-; RUN: opt -S -passes='default<O3>' < %s | FileCheck %s
+; RUN: opt -S -passes="default<O3>" < %s | FileCheck %s
 
 define i8 @test_switch_with_sext_phi(i8 %code) {
 ; CHECK-LABEL: define noundef i8 @test_switch_with_sext_phi(
diff --git a/llvm/test/Transforms/PhaseOrdering/switch_with_geps.ll b/llvm/test/Transforms/PhaseOrdering/switch_with_geps.ll
index d2f33f9..33266ca 100644
--- a/llvm/test/Transforms/PhaseOrdering/switch_with_geps.ll
+++ b/llvm/test/Transforms/PhaseOrdering/switch_with_geps.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3
 ; RUN: opt -S -passes='default<O1>' < %s | FileCheck %s
 ; RUN: opt -S -passes='default<O2>' < %s | FileCheck %s
-; RUN: opt -S -passes='default<O3>' < %s | FileCheck %s
+; RUN: opt -S -passes="default<O3>" < %s | FileCheck %s
 
 target datalayout = "n64"
 
diff --git a/llvm/test/Transforms/SCCP/range-and-or-bit-masked.ll b/llvm/test/Transforms/SCCP/range-and-or-bit-masked.ll
new file mode 100644
index 0000000..e81c5d7
--- /dev/null
+++ b/llvm/test/Transforms/SCCP/range-and-or-bit-masked.ll
@@ -0,0 +1,88 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -passes=ipsccp %s | FileCheck %s
+
+declare void @use(i1)
+
+define i1 @test1(i64 %x) {
+; CHECK-LABEL: @test1(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[COND:%.*]] = icmp ugt i64 [[X:%.*]], 65535
+; CHECK-NEXT:    call void @llvm.assume(i1 [[COND]])
+; CHECK-NEXT:    [[MASK:%.*]] = and i64 [[X]], -65521
+; CHECK-NEXT:    ret i1 false
+;
+entry:
+  %cond = icmp ugt i64 %x, 65535
+  call void @llvm.assume(i1 %cond)
+  %mask = and i64 %x, -65521
+  %cmp = icmp eq i64 %mask, 0
+  ret i1 %cmp
+}
+
+define void @test.and(i64 %x, i64 %y) {
+; CHECK-LABEL: @test.and(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[C0:%.*]] = icmp uge i64 [[X:%.*]], 138
+; CHECK-NEXT:    [[C1:%.*]] = icmp ule i64 [[X]], 161
+; CHECK-NEXT:    call void @llvm.assume(i1 [[C0]])
+; CHECK-NEXT:    call void @llvm.assume(i1 [[C1]])
+; CHECK-NEXT:    [[C2:%.*]] = icmp uge i64 [[Y:%.*]], 186
+; CHECK-NEXT:    [[C3:%.*]] = icmp ule i64 [[Y]], 188
+; CHECK-NEXT:    call void @llvm.assume(i1 [[C2]])
+; CHECK-NEXT:    call void @llvm.assume(i1 [[C3]])
+; CHECK-NEXT:    [[AND:%.*]] = and i64 [[X]], [[Y]]
+; CHECK-NEXT:    call void @use(i1 false)
+; CHECK-NEXT:    [[R1:%.*]] = icmp ult i64 [[AND]], 137
+; CHECK-NEXT:    call void @use(i1 [[R1]])
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c0 = icmp uge i64 %x, 138 ; 0b10001010
+  %c1 = icmp ule i64 %x, 161 ; 0b10100000
+  call void @llvm.assume(i1 %c0)
+  call void @llvm.assume(i1 %c1)
+  %c2 = icmp uge i64 %y, 186 ; 0b10111010
+  %c3 = icmp ule i64 %y, 188 ; 0b10111110
+  call void @llvm.assume(i1 %c2)
+  call void @llvm.assume(i1 %c3)
+  %and = and i64 %x, %y
+  %r0 = icmp ult i64 %and, 136 ; 0b10001000
+  call void @use(i1 %r0) ; false
+  %r1 = icmp ult i64 %and, 137
+  call void @use(i1 %r1) ; unknown
+  ret void
+}
+
+define void @test.or(i64 %x, i64 %y) {
+; CHECK-LABEL: @test.or(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[C0:%.*]] = icmp ule i64 [[X:%.*]], 117
+; CHECK-NEXT:    [[C1:%.*]] = icmp uge i64 [[X]], 95
+; CHECK-NEXT:    call void @llvm.assume(i1 [[C0]])
+; CHECK-NEXT:    call void @llvm.assume(i1 [[C1]])
+; CHECK-NEXT:    [[C2:%.*]] = icmp ule i64 [[Y:%.*]], 69
+; CHECK-NEXT:    [[C3:%.*]] = icmp uge i64 [[Y]], 67
+; CHECK-NEXT:    call void @llvm.assume(i1 [[C2]])
+; CHECK-NEXT:    call void @llvm.assume(i1 [[C3]])
+; CHECK-NEXT:    [[OR:%.*]] = or i64 [[X]], [[Y]]
+; CHECK-NEXT:    call void @use(i1 false)
+; CHECK-NEXT:    [[R1:%.*]] = icmp ugt i64 [[OR]], 118
+; CHECK-NEXT:    call void @use(i1 [[R1]])
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c0 = icmp ule i64 %x, 117 ; 0b01110101
+  %c1 = icmp uge i64 %x, 95  ; 0b01011111
+  call void @llvm.assume(i1 %c0)
+  call void @llvm.assume(i1 %c1)
+  %c2 = icmp ule i64 %y, 69  ; 0b01000101
+  %c3 = icmp uge i64 %y, 67  ; 0b01000011
+  call void @llvm.assume(i1 %c2)
+  call void @llvm.assume(i1 %c3)
+  %or = or i64 %x, %y
+  %r0 = icmp ugt i64 %or, 119 ; 0b01110111
+  call void @use(i1 %r0) ; false
+  %r1 = icmp ugt i64 %or, 118
+  call void @use(i1 %r1) ; unknown
+  ret void
+}
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/reduce-fadd.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/reduce-fadd.ll
index 6dceabe..00a4417 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/reduce-fadd.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/reduce-fadd.ll
@@ -80,16 +80,11 @@ define half @reduce_fast_half8(<8 x half> %vec8) {
 ; NOFP16-LABEL: define half @reduce_fast_half8(
 ; NOFP16-SAME: <8 x half> [[VEC8:%.*]]) #[[ATTR0]] {
 ; NOFP16-NEXT:  [[ENTRY:.*:]]
-; NOFP16-NEXT:    [[ELT4:%.*]] = extractelement <8 x half> [[VEC8]], i64 4
-; NOFP16-NEXT:    [[ELT5:%.*]] = extractelement <8 x half> [[VEC8]], i64 5
-; NOFP16-NEXT:    [[ELT6:%.*]] = extractelement <8 x half> [[VEC8]], i64 6
-; NOFP16-NEXT:    [[ELT7:%.*]] = extractelement <8 x half> [[VEC8]], i64 7
 ; NOFP16-NEXT:    [[TMP0:%.*]] = shufflevector <8 x half> [[VEC8]], <8 x half> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 ; NOFP16-NEXT:    [[TMP1:%.*]] = call fast half @llvm.vector.reduce.fadd.v4f16(half 0xH0000, <4 x half> [[TMP0]])
-; NOFP16-NEXT:    [[OP_RDX:%.*]] = fadd fast half [[TMP1]], [[ELT4]]
-; NOFP16-NEXT:    [[OP_RDX1:%.*]] = fadd fast half [[ELT5]], [[ELT6]]
-; NOFP16-NEXT:    [[OP_RDX2:%.*]] = fadd fast half [[OP_RDX]], [[OP_RDX1]]
-; NOFP16-NEXT:    [[OP_RDX3:%.*]] = fadd fast half [[OP_RDX2]], [[ELT7]]
+; NOFP16-NEXT:    [[TMP2:%.*]] = shufflevector <8 x half> [[VEC8]], <8 x half> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; NOFP16-NEXT:    [[TMP3:%.*]] = call fast half @llvm.vector.reduce.fadd.v4f16(half 0xH0000, <4 x half> [[TMP2]])
+; NOFP16-NEXT:    [[OP_RDX3:%.*]] = fadd fast half [[TMP1]], [[TMP3]]
 ; NOFP16-NEXT:    ret half [[OP_RDX3]]
 ;
 ; FULLFP16-LABEL: define half @reduce_fast_half8(
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/reused-scalar-repeated-in-node.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/reused-scalar-repeated-in-node.ll
index 289807a..3cab4a4 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/reused-scalar-repeated-in-node.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/reused-scalar-repeated-in-node.ll
@@ -48,12 +48,12 @@ define void @test() {
 ; CHECK-NEXT:    [[TMP15:%.*]] = phi <8 x float> [ [[TMP17]], %[[BB77]] ], [ [[TMP36:%.*]], %[[BB78]] ]
 ; CHECK-NEXT:    [[TMP16:%.*]] = phi <2 x float> [ [[TMP31]], %[[BB77]] ], [ [[TMP37:%.*]], %[[BB78]] ]
 ; CHECK-NEXT:    [[TMP38:%.*]] = shufflevector <8 x float> [[TMP15]], <8 x float> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 1, i32 0, i32 3, i32 1, i32 3, i32 5, i32 3, i32 1, i32 0, i32 4, i32 5, i32 5>
-; CHECK-NEXT:    [[TMP19:%.*]] = shufflevector <8 x float> [[TMP15]], <8 x float> poison, <16 x i32> <i32 2, i32 poison, i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 2, i32 5, i32 4, i32 4>
+; CHECK-NEXT:    [[TMP21:%.*]] = shufflevector <8 x float> [[TMP15]], <8 x float> poison, <16 x i32> <i32 2, i32 poison, i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; CHECK-NEXT:    [[TMP20:%.*]] = shufflevector <2 x float> [[TMP16]], <2 x float> poison, <16 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP21:%.*]] = shufflevector <16 x float> [[TMP19]], <16 x float> [[TMP20]], <16 x i32> <i32 0, i32 17, i32 2, i32 16, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT:    [[TMP23:%.*]] = shufflevector <16 x float> [[TMP21]], <16 x float> [[TMP20]], <16 x i32> <i32 0, i32 17, i32 2, i32 16, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; CHECK-NEXT:    [[TMP22:%.*]] = shufflevector <8 x float> [[TMP15]], <8 x float> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP23:%.*]] = shufflevector <16 x float> [[TMP21]], <16 x float> [[TMP22]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 18, i32 6, i32 7, i32 8, i32 20, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; CHECK-NEXT:    [[TMP24:%.*]] = shufflevector <16 x float> [[TMP23]], <16 x float> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 1, i32 5, i32 3, i32 1, i32 3, i32 9, i32 3, i32 1, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT:    [[TMP40:%.*]] = shufflevector <16 x float> [[TMP23]], <16 x float> [[TMP22]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 18, i32 6, i32 7, i32 8, i32 20, i32 10, i32 11, i32 12, i32 21, i32 14, i32 15>
+; CHECK-NEXT:    [[TMP24:%.*]] = shufflevector <16 x float> [[TMP40]], <16 x float> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 1, i32 5, i32 3, i32 1, i32 3, i32 9, i32 3, i32 1, i32 5, i32 13, i32 9, i32 9>
 ; CHECK-NEXT:    [[TMP18:%.*]] = fmul fast <16 x float> [[TMP24]], [[TMP13]]
 ; CHECK-NEXT:    [[TMP26:%.*]] = fmul fast <16 x float> [[TMP38]], [[TMP25]]
 ; CHECK-NEXT:    [[TMP27:%.*]] = fadd fast <16 x float> [[TMP26]], [[TMP18]]
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/scalarization-overhead.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/scalarization-overhead.ll
index 8093285..a504f3e 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/scalarization-overhead.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/scalarization-overhead.ll
@@ -8,56 +8,34 @@
 define fastcc i64 @zot(float %arg, float %arg1, float %arg2, float %arg3, float %arg4, ptr %arg5, i1 %arg6, i1 %arg7, i1 %arg8) {
 ; CHECK-LABEL: @zot(
 ; CHECK-NEXT:  bb:
-; CHECK-NEXT:    [[VAL:%.*]] = fmul fast float 0.000000e+00, 0.000000e+00
 ; CHECK-NEXT:    [[VAL9:%.*]] = fmul fast float 0.000000e+00, [[ARG:%.*]]
-; CHECK-NEXT:    [[VAL10:%.*]] = fmul fast float [[ARG3:%.*]], 1.000000e+00
-; CHECK-NEXT:    [[VAL11:%.*]] = fmul fast float [[ARG3]], 1.000000e+00
-; CHECK-NEXT:    [[VAL12:%.*]] = fadd fast float [[ARG3]], 1.000000e+00
-; CHECK-NEXT:    [[VAL13:%.*]] = fadd fast float [[VAL12]], 2.000000e+00
-; CHECK-NEXT:    [[VAL14:%.*]] = fadd fast float 0.000000e+00, 0.000000e+00
-; CHECK-NEXT:    [[VAL15:%.*]] = fadd fast float [[VAL14]], 1.000000e+00
-; CHECK-NEXT:    [[VAL16:%.*]] = fadd fast float [[ARG3]], 1.000000e+00
-; CHECK-NEXT:    [[VAL17:%.*]] = fadd fast float [[ARG3]], 1.000000e+00
+; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <4 x float> <float 0.000000e+00, float poison, float poison, float poison>, float [[ARG]], i32 1
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x float> [[TMP0]], float [[ARG3:%.*]], i32 2
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
+; CHECK-NEXT:    [[TMP3:%.*]] = fmul fast <4 x float> <float 0.000000e+00, float 0.000000e+00, float 1.000000e+00, float 1.000000e+00>, [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x float> <float poison, float 0.000000e+00>, float [[ARG3]], i32 0
+; CHECK-NEXT:    [[TMP5:%.*]] = fadd fast <2 x float> [[TMP4]], <float 1.000000e+00, float 0.000000e+00>
+; CHECK-NEXT:    [[TMP6:%.*]] = call <4 x float> @llvm.vector.insert.v4f32.v2f32(<4 x float> [[TMP2]], <2 x float> [[TMP5]], i64 0)
+; CHECK-NEXT:    [[TMP7:%.*]] = fadd fast <4 x float> [[TMP6]], <float 2.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
 ; CHECK-NEXT:    br i1 [[ARG6:%.*]], label [[BB18:%.*]], label [[BB57:%.*]]
 ; CHECK:       bb18:
-; CHECK-NEXT:    [[VAL19:%.*]] = phi float [ [[VAL13]], [[BB:%.*]] ]
-; CHECK-NEXT:    [[VAL20:%.*]] = phi float [ [[VAL15]], [[BB]] ]
-; CHECK-NEXT:    [[VAL21:%.*]] = phi float [ [[VAL16]], [[BB]] ]
-; CHECK-NEXT:    [[VAL22:%.*]] = phi float [ [[VAL17]], [[BB]] ]
+; CHECK-NEXT:    [[TMP8:%.*]] = phi <4 x float> [ [[TMP7]], [[BB:%.*]] ]
+; CHECK-NEXT:    [[VAL16:%.*]] = extractelement <4 x float> [[TMP7]], i32 2
 ; CHECK-NEXT:    [[VAL23:%.*]] = fmul fast float [[VAL16]], 2.000000e+00
+; CHECK-NEXT:    [[VAL17:%.*]] = extractelement <4 x float> [[TMP7]], i32 3
 ; CHECK-NEXT:    [[VAL24:%.*]] = fmul fast float [[VAL17]], 3.000000e+00
 ; CHECK-NEXT:    br i1 [[ARG7:%.*]], label [[BB25:%.*]], label [[BB57]]
 ; CHECK:       bb25:
-; CHECK-NEXT:    [[VAL26:%.*]] = phi float [ [[VAL19]], [[BB18]] ]
-; CHECK-NEXT:    [[VAL27:%.*]] = phi float [ [[VAL20]], [[BB18]] ]
-; CHECK-NEXT:    [[VAL28:%.*]] = phi float [ [[VAL21]], [[BB18]] ]
-; CHECK-NEXT:    [[VAL29:%.*]] = phi float [ [[VAL22]], [[BB18]] ]
+; CHECK-NEXT:    [[TMP11:%.*]] = phi <4 x float> [ [[TMP8]], [[BB18]] ]
 ; CHECK-NEXT:    br label [[BB30:%.*]]
 ; CHECK:       bb30:
 ; CHECK-NEXT:    [[VAL31:%.*]] = phi float [ [[VAL55:%.*]], [[BB30]] ], [ 0.000000e+00, [[BB25]] ]
 ; CHECK-NEXT:    [[VAL32:%.*]] = phi float [ [[VAL9]], [[BB30]] ], [ 0.000000e+00, [[BB25]] ]
-; CHECK-NEXT:    [[VAL33:%.*]] = load i8, ptr [[ARG5:%.*]], align 1
-; CHECK-NEXT:    [[VAL34:%.*]] = uitofp i8 [[VAL33]] to float
-; CHECK-NEXT:    [[VAL35:%.*]] = getelementptr inbounds i8, ptr [[ARG5]], i64 1
-; CHECK-NEXT:    [[VAL36:%.*]] = load i8, ptr [[VAL35]], align 1
-; CHECK-NEXT:    [[VAL37:%.*]] = uitofp i8 [[VAL36]] to float
-; CHECK-NEXT:    [[VAL38:%.*]] = getelementptr inbounds i8, ptr [[ARG5]], i64 2
-; CHECK-NEXT:    [[VAL39:%.*]] = load i8, ptr [[VAL38]], align 1
-; CHECK-NEXT:    [[VAL40:%.*]] = uitofp i8 [[VAL39]] to float
-; CHECK-NEXT:    [[VAL41:%.*]] = getelementptr inbounds i8, ptr [[ARG5]], i64 3
-; CHECK-NEXT:    [[VAL42:%.*]] = load i8, ptr [[VAL41]], align 1
-; CHECK-NEXT:    [[VAL43:%.*]] = uitofp i8 [[VAL42]] to float
-; CHECK-NEXT:    [[VAL44:%.*]] = fsub fast float [[VAL34]], [[VAL]]
-; CHECK-NEXT:    [[VAL45:%.*]] = fsub fast float [[VAL37]], [[VAL9]]
-; CHECK-NEXT:    [[VAL46:%.*]] = fsub fast float [[VAL40]], [[VAL10]]
-; CHECK-NEXT:    [[VAL47:%.*]] = fsub fast float [[VAL43]], [[VAL11]]
-; CHECK-NEXT:    [[VAL48:%.*]] = fmul fast float [[VAL44]], [[VAL26]]
-; CHECK-NEXT:    [[VAL49:%.*]] = fmul fast float [[VAL45]], [[VAL27]]
-; CHECK-NEXT:    [[VAL50:%.*]] = fadd fast float [[VAL49]], [[VAL48]]
-; CHECK-NEXT:    [[VAL51:%.*]] = fmul fast float [[VAL46]], [[VAL28]]
-; CHECK-NEXT:    [[VAL52:%.*]] = fadd fast float [[VAL50]], [[VAL51]]
-; CHECK-NEXT:    [[VAL53:%.*]] = fmul fast float [[VAL47]], [[VAL29]]
-; CHECK-NEXT:    [[VAL54:%.*]] = fadd fast float [[VAL52]], [[VAL53]]
+; CHECK-NEXT:    [[TMP12:%.*]] = load <4 x i8>, ptr [[ARG5:%.*]], align 1
+; CHECK-NEXT:    [[TMP13:%.*]] = uitofp <4 x i8> [[TMP12]] to <4 x float>
+; CHECK-NEXT:    [[TMP14:%.*]] = fsub fast <4 x float> [[TMP13]], [[TMP3]]
+; CHECK-NEXT:    [[TMP15:%.*]] = fmul fast <4 x float> [[TMP14]], [[TMP11]]
+; CHECK-NEXT:    [[VAL54:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[TMP15]])
 ; CHECK-NEXT:    [[VAL55]] = tail call fast float @llvm.minnum.f32(float [[VAL31]], float [[ARG1:%.*]])
 ; CHECK-NEXT:    [[VAL56:%.*]] = tail call fast float @llvm.maxnum.f32(float [[ARG2:%.*]], float [[VAL54]])
 ; CHECK-NEXT:    call void @ham(float [[VAL55]], float [[VAL56]])
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll
index 912d60d..257e466 100644
--- a/llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll
@@ -30,11 +30,11 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
 ; CHECK-NEXT:    [[ARRAYIDX5_2:%.*]] = getelementptr i8, ptr [[ADD_PTR64_1]], i64 4
 ; CHECK-NEXT:    [[ARRAYIDX8_2:%.*]] = getelementptr i8, ptr [[ADD_PTR_1]], i64 1
 ; CHECK-NEXT:    [[TMP4:%.*]] = load <4 x i8>, ptr [[ADD_PTR_1]], align 1
-; CHECK-NEXT:    [[TMP33:%.*]] = load i8, ptr [[ARRAYIDX8_2]], align 1
-; CHECK-NEXT:    [[TMP29:%.*]] = load i8, ptr [[ADD_PTR_1]], align 1
+; CHECK-NEXT:    [[TMP7:%.*]] = load i8, ptr [[ARRAYIDX8_2]], align 1
+; CHECK-NEXT:    [[TMP6:%.*]] = load i8, ptr [[ADD_PTR_1]], align 1
 ; CHECK-NEXT:    [[TMP19:%.*]] = shufflevector <4 x i8> [[TMP4]], <4 x i8> poison, <2 x i32> <i32 0, i32 2>
 ; CHECK-NEXT:    [[TMP21:%.*]] = zext <2 x i8> [[TMP19]] to <2 x i32>
-; CHECK-NEXT:    [[TMP84:%.*]] = zext i8 [[TMP29]] to i32
+; CHECK-NEXT:    [[CONV_2:%.*]] = zext i8 [[TMP6]] to i32
 ; CHECK-NEXT:    [[TMP9:%.*]] = load <4 x i8>, ptr [[ADD_PTR64_1]], align 1
 ; CHECK-NEXT:    [[TMP22:%.*]] = shufflevector <4 x i8> [[TMP9]], <4 x i8> poison, <2 x i32> <i32 0, i32 2>
 ; CHECK-NEXT:    [[TMP31:%.*]] = zext <2 x i8> [[TMP22]] to <2 x i32>
@@ -50,7 +50,7 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
 ; CHECK-NEXT:    [[TMP30:%.*]] = add <2 x i32> [[TMP25]], [[TMP23]]
 ; CHECK-NEXT:    [[TMP32:%.*]] = shufflevector <4 x i8> [[TMP4]], <4 x i8> poison, <2 x i32> <i32 1, i32 3>
 ; CHECK-NEXT:    [[TMP51:%.*]] = zext <2 x i8> [[TMP32]] to <2 x i32>
-; CHECK-NEXT:    [[TMP83:%.*]] = zext i8 [[TMP33]] to i32
+; CHECK-NEXT:    [[CONV9_2:%.*]] = zext i8 [[TMP7]] to i32
 ; CHECK-NEXT:    [[TMP56:%.*]] = shufflevector <4 x i8> [[TMP9]], <4 x i8> poison, <2 x i32> <i32 1, i32 3>
 ; CHECK-NEXT:    [[TMP57:%.*]] = zext <2 x i8> [[TMP56]] to <2 x i32>
 ; CHECK-NEXT:    [[TMP35:%.*]] = sub <2 x i32> [[TMP51]], [[TMP57]]
@@ -61,14 +61,14 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
 ; CHECK-NEXT:    [[TMP36:%.*]] = sub <2 x i32> [[TMP39]], [[TMP61]]
 ; CHECK-NEXT:    [[TMP37:%.*]] = shl <2 x i32> [[TMP36]], splat (i32 16)
 ; CHECK-NEXT:    [[TMP42:%.*]] = add <2 x i32> [[TMP37]], [[TMP35]]
-; CHECK-NEXT:    [[TMP43:%.*]] = add <2 x i32> [[TMP42]], [[TMP30]]
+; CHECK-NEXT:    [[TMP34:%.*]] = add <2 x i32> [[TMP42]], [[TMP30]]
 ; CHECK-NEXT:    [[TMP44:%.*]] = sub <2 x i32> [[TMP30]], [[TMP42]]
-; CHECK-NEXT:    [[TMP73:%.*]] = extractelement <2 x i32> [[TMP43]], i32 0
-; CHECK-NEXT:    [[TMP34:%.*]] = extractelement <2 x i32> [[TMP43]], i32 1
-; CHECK-NEXT:    [[ADD48_2:%.*]] = add i32 [[TMP34]], [[TMP73]]
-; CHECK-NEXT:    [[TMP47:%.*]] = extractelement <2 x i32> [[TMP44]], i32 0
-; CHECK-NEXT:    [[TMP48:%.*]] = extractelement <2 x i32> [[TMP44]], i32 1
-; CHECK-NEXT:    [[ADD55_2:%.*]] = add i32 [[TMP48]], [[TMP47]]
+; CHECK-NEXT:    [[TMP43:%.*]] = extractelement <2 x i32> [[TMP34]], i32 0
+; CHECK-NEXT:    [[TMP45:%.*]] = extractelement <2 x i32> [[TMP34]], i32 1
+; CHECK-NEXT:    [[ADD48_2:%.*]] = add i32 [[TMP45]], [[TMP43]]
+; CHECK-NEXT:    [[TMP46:%.*]] = extractelement <2 x i32> [[TMP44]], i32 0
+; CHECK-NEXT:    [[TMP47:%.*]] = extractelement <2 x i32> [[TMP44]], i32 1
+; CHECK-NEXT:    [[ADD55_2:%.*]] = add i32 [[TMP47]], [[TMP46]]
 ; CHECK-NEXT:    [[ARRAYIDX5_3:%.*]] = getelementptr i8, ptr null, i64 4
 ; CHECK-NEXT:    [[TMP53:%.*]] = load <2 x i8>, ptr null, align 1
 ; CHECK-NEXT:    [[TMP52:%.*]] = load i8, ptr null, align 1
@@ -79,12 +79,12 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
 ; CHECK-NEXT:    [[TMP59:%.*]] = sub <2 x i32> [[TMP62]], [[TMP55]]
 ; CHECK-NEXT:    [[TMP41:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 null, i64 4, <2 x i1> splat (i1 true), i32 2)
 ; CHECK-NEXT:    [[TMP58:%.*]] = zext <2 x i8> [[TMP41]] to <2 x i32>
-; CHECK-NEXT:    [[TMP60:%.*]] = shufflevector <2 x i32> [[TMP58]], <2 x i32> poison, <2 x i32> <i32 1, i32 0>
+; CHECK-NEXT:    [[TMP48:%.*]] = shufflevector <2 x i32> [[TMP58]], <2 x i32> poison, <2 x i32> <i32 1, i32 0>
 ; CHECK-NEXT:    [[TMP63:%.*]] = load <2 x i8>, ptr [[ARRAYIDX5_3]], align 1
 ; CHECK-NEXT:    [[TMP76:%.*]] = zext <2 x i8> [[TMP63]] to <2 x i32>
-; CHECK-NEXT:    [[TMP45:%.*]] = sub <2 x i32> [[TMP60]], [[TMP76]]
-; CHECK-NEXT:    [[TMP46:%.*]] = shl <2 x i32> [[TMP45]], splat (i32 16)
-; CHECK-NEXT:    [[TMP90:%.*]] = add <2 x i32> [[TMP46]], [[TMP59]]
+; CHECK-NEXT:    [[TMP81:%.*]] = sub <2 x i32> [[TMP48]], [[TMP76]]
+; CHECK-NEXT:    [[TMP167:%.*]] = shl <2 x i32> [[TMP81]], splat (i32 16)
+; CHECK-NEXT:    [[TMP75:%.*]] = add <2 x i32> [[TMP167]], [[TMP59]]
 ; CHECK-NEXT:    [[ARRAYIDX20_3:%.*]] = getelementptr i8, ptr null, i64 2
 ; CHECK-NEXT:    [[ARRAYIDX22_3:%.*]] = getelementptr i8, ptr null, i64 2
 ; CHECK-NEXT:    [[ARRAYIDX27_3:%.*]] = getelementptr i8, ptr null, i64 6
@@ -93,236 +93,236 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
 ; CHECK-NEXT:    [[TMP82:%.*]] = load <2 x i8>, ptr [[ARRAYIDX22_3]], align 1
 ; CHECK-NEXT:    [[TMP91:%.*]] = zext <2 x i8> [[TMP82]] to <2 x i32>
 ; CHECK-NEXT:    [[TMP65:%.*]] = sub <2 x i32> [[TMP79]], [[TMP91]]
-; CHECK-NEXT:    [[TMP75:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> zeroinitializer, i32 1, <2 x i1> splat (i1 true), <2 x i8> poison)
-; CHECK-NEXT:    [[TMP98:%.*]] = zext <2 x i8> [[TMP75]] to <2 x i32>
-; CHECK-NEXT:    [[TMP100:%.*]] = load <2 x i8>, ptr [[ARRAYIDX27_3]], align 1
-; CHECK-NEXT:    [[TMP103:%.*]] = zext <2 x i8> [[TMP100]] to <2 x i32>
-; CHECK-NEXT:    [[TMP69:%.*]] = sub <2 x i32> [[TMP98]], [[TMP103]]
-; CHECK-NEXT:    [[TMP70:%.*]] = shl <2 x i32> [[TMP69]], splat (i32 16)
-; CHECK-NEXT:    [[TMP74:%.*]] = add <2 x i32> [[TMP70]], [[TMP65]]
-; CHECK-NEXT:    [[TMP78:%.*]] = extractelement <2 x i32> [[TMP90]], i32 0
-; CHECK-NEXT:    [[TMP71:%.*]] = extractelement <2 x i32> [[TMP90]], i32 1
-; CHECK-NEXT:    [[ADD48_3:%.*]] = add i32 [[TMP71]], [[TMP78]]
-; CHECK-NEXT:    [[SUB51_3:%.*]] = sub i32 [[TMP78]], [[TMP71]]
-; CHECK-NEXT:    [[TMP80:%.*]] = extractelement <2 x i32> [[TMP74]], i32 0
-; CHECK-NEXT:    [[TMP81:%.*]] = extractelement <2 x i32> [[TMP74]], i32 1
-; CHECK-NEXT:    [[ADD55_3:%.*]] = add i32 [[TMP81]], [[TMP80]]
-; CHECK-NEXT:    [[TMP107:%.*]] = sub i32 [[TMP80]], [[TMP81]]
-; CHECK-NEXT:    [[ADD48_4:%.*]] = add i32 [[ADD55_3]], [[ADD48_3]]
-; CHECK-NEXT:    [[TMP113:%.*]] = shufflevector <2 x i32> [[TMP43]], <2 x i32> poison, <2 x i32> <i32 poison, i32 0>
-; CHECK-NEXT:    [[TMP122:%.*]] = insertelement <2 x i32> [[TMP113]], i32 [[ADD48_3]], i32 0
-; CHECK-NEXT:    [[TMP72:%.*]] = insertelement <2 x i32> [[TMP43]], i32 [[ADD55_3]], i32 0
-; CHECK-NEXT:    [[TMP123:%.*]] = sub <2 x i32> [[TMP122]], [[TMP72]]
-; CHECK-NEXT:    [[ADD55_4:%.*]] = add i32 [[TMP107]], [[SUB51_3]]
-; CHECK-NEXT:    [[TMP126:%.*]] = shufflevector <2 x i32> [[TMP44]], <2 x i32> poison, <2 x i32> <i32 poison, i32 0>
-; CHECK-NEXT:    [[TMP129:%.*]] = insertelement <2 x i32> [[TMP126]], i32 [[SUB51_3]], i32 0
-; CHECK-NEXT:    [[TMP130:%.*]] = insertelement <2 x i32> [[TMP44]], i32 [[TMP107]], i32 0
-; CHECK-NEXT:    [[TMP143:%.*]] = sub <2 x i32> [[TMP129]], [[TMP130]]
-; CHECK-NEXT:    [[ADD94:%.*]] = add i32 [[ADD48_4]], [[ADD48_2]]
-; CHECK-NEXT:    [[SUB102:%.*]] = sub i32 [[ADD48_2]], [[ADD48_4]]
-; CHECK-NEXT:    [[SHR_I49_3:%.*]] = lshr i32 [[TMP77]], 15
-; CHECK-NEXT:    [[AND_I50_3:%.*]] = and i32 [[SHR_I49_3]], 65537
-; CHECK-NEXT:    [[MUL_I51_3:%.*]] = mul i32 [[AND_I50_3]], 65535
-; CHECK-NEXT:    [[SHR_I_1:%.*]] = lshr i32 [[TMP34]], 15
+; CHECK-NEXT:    [[TMP170:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> zeroinitializer, i32 1, <2 x i1> splat (i1 true), <2 x i8> poison)
+; CHECK-NEXT:    [[TMP171:%.*]] = zext <2 x i8> [[TMP170]] to <2 x i32>
+; CHECK-NEXT:    [[TMP172:%.*]] = load <2 x i8>, ptr [[ARRAYIDX27_3]], align 1
+; CHECK-NEXT:    [[TMP173:%.*]] = zext <2 x i8> [[TMP172]] to <2 x i32>
+; CHECK-NEXT:    [[TMP66:%.*]] = sub <2 x i32> [[TMP171]], [[TMP173]]
+; CHECK-NEXT:    [[TMP67:%.*]] = shl <2 x i32> [[TMP66]], splat (i32 16)
+; CHECK-NEXT:    [[TMP69:%.*]] = add <2 x i32> [[TMP67]], [[TMP65]]
+; CHECK-NEXT:    [[TMP176:%.*]] = extractelement <2 x i32> [[TMP75]], i32 0
+; CHECK-NEXT:    [[TMP197:%.*]] = extractelement <2 x i32> [[TMP75]], i32 1
+; CHECK-NEXT:    [[SUB59:%.*]] = add i32 [[TMP197]], [[TMP176]]
+; CHECK-NEXT:    [[SUB45_3:%.*]] = sub i32 [[TMP176]], [[TMP197]]
+; CHECK-NEXT:    [[ADD112_2:%.*]] = extractelement <2 x i32> [[TMP69]], i32 0
+; CHECK-NEXT:    [[XOR_I63_2:%.*]] = extractelement <2 x i32> [[TMP69]], i32 1
+; CHECK-NEXT:    [[SUB59_1:%.*]] = add i32 [[XOR_I63_2]], [[ADD112_2]]
+; CHECK-NEXT:    [[SUB47_3:%.*]] = sub i32 [[ADD112_2]], [[XOR_I63_2]]
+; CHECK-NEXT:    [[ADD94:%.*]] = add i32 [[SUB59_1]], [[SUB59]]
+; CHECK-NEXT:    [[TMP70:%.*]] = shufflevector <2 x i32> [[TMP34]], <2 x i32> poison, <2 x i32> <i32 poison, i32 0>
+; CHECK-NEXT:    [[TMP71:%.*]] = insertelement <2 x i32> [[TMP70]], i32 [[SUB59]], i32 0
+; CHECK-NEXT:    [[TMP72:%.*]] = insertelement <2 x i32> [[TMP34]], i32 [[SUB59_1]], i32 0
+; CHECK-NEXT:    [[TMP222:%.*]] = sub <2 x i32> [[TMP71]], [[TMP72]]
+; CHECK-NEXT:    [[ADD55_3:%.*]] = add i32 [[SUB47_3]], [[SUB45_3]]
+; CHECK-NEXT:    [[TMP74:%.*]] = shufflevector <2 x i32> [[TMP44]], <2 x i32> poison, <2 x i32> <i32 poison, i32 0>
+; CHECK-NEXT:    [[TMP78:%.*]] = insertelement <2 x i32> [[TMP74]], i32 [[SUB45_3]], i32 0
+; CHECK-NEXT:    [[TMP80:%.*]] = insertelement <2 x i32> [[TMP44]], i32 [[SUB47_3]], i32 0
+; CHECK-NEXT:    [[TMP85:%.*]] = sub <2 x i32> [[TMP78]], [[TMP80]]
+; CHECK-NEXT:    [[ADD95:%.*]] = add i32 [[ADD94]], [[ADD48_2]]
+; CHECK-NEXT:    [[SUB86_3:%.*]] = sub i32 [[ADD48_2]], [[ADD94]]
+; CHECK-NEXT:    [[SHR_I:%.*]] = lshr i32 [[TMP77]], 15
+; CHECK-NEXT:    [[AND_I:%.*]] = and i32 [[SHR_I]], 65537
+; CHECK-NEXT:    [[MUL_I:%.*]] = mul i32 [[AND_I]], 65535
+; CHECK-NEXT:    [[SHR_I49:%.*]] = lshr i32 [[TMP45]], 15
+; CHECK-NEXT:    [[AND_I50:%.*]] = and i32 [[SHR_I49]], 65537
+; CHECK-NEXT:    [[MUL_I51:%.*]] = mul i32 [[AND_I50]], 65535
+; CHECK-NEXT:    [[ADD94_1:%.*]] = add i32 [[ADD55_3]], [[ADD55_2]]
+; CHECK-NEXT:    [[SUB102_1:%.*]] = sub i32 [[ADD55_2]], [[ADD55_3]]
+; CHECK-NEXT:    [[SHR_I_1:%.*]] = lshr i32 [[CONV9_2]], 15
 ; CHECK-NEXT:    [[AND_I_1:%.*]] = and i32 [[SHR_I_1]], 65537
 ; CHECK-NEXT:    [[MUL_I_1:%.*]] = mul i32 [[AND_I_1]], 65535
-; CHECK-NEXT:    [[ADD94_5:%.*]] = add i32 [[ADD55_4]], [[ADD55_2]]
-; CHECK-NEXT:    [[SUB102_1:%.*]] = sub i32 [[ADD55_2]], [[ADD55_4]]
-; CHECK-NEXT:    [[SHR_I_2:%.*]] = lshr i32 [[TMP83]], 15
-; CHECK-NEXT:    [[AND_I_2:%.*]] = and i32 [[SHR_I_2]], 65537
-; CHECK-NEXT:    [[MUL_I_2:%.*]] = mul i32 [[AND_I_2]], 65535
-; CHECK-NEXT:    [[SHR_I49_1:%.*]] = lshr i32 [[TMP84]], 15
+; CHECK-NEXT:    [[SHR_I49_1:%.*]] = lshr i32 [[CONV_2]], 15
 ; CHECK-NEXT:    [[AND_I50_1:%.*]] = and i32 [[SHR_I49_1]], 65537
-; CHECK-NEXT:    [[ADD94_2:%.*]] = mul i32 [[AND_I50_1]], 65535
-; CHECK-NEXT:    [[TMP144:%.*]] = extractelement <2 x i32> [[TMP123]], i32 0
-; CHECK-NEXT:    [[TMP145:%.*]] = extractelement <2 x i32> [[TMP123]], i32 1
-; CHECK-NEXT:    [[ADD94_4:%.*]] = add i32 [[TMP144]], [[TMP145]]
-; CHECK-NEXT:    [[TMP169:%.*]] = sub i32 [[TMP145]], [[TMP144]]
+; CHECK-NEXT:    [[MUL_I51_1:%.*]] = mul i32 [[AND_I50_1]], 65535
+; CHECK-NEXT:    [[TMP86:%.*]] = extractelement <2 x i32> [[TMP222]], i32 0
+; CHECK-NEXT:    [[TMP87:%.*]] = extractelement <2 x i32> [[TMP222]], i32 1
+; CHECK-NEXT:    [[ADD94_3:%.*]] = add i32 [[TMP86]], [[TMP87]]
+; CHECK-NEXT:    [[ADD112_1:%.*]] = sub i32 [[TMP87]], [[TMP86]]
 ; CHECK-NEXT:    [[SHR_I49_2:%.*]] = lshr i32 [[CONV_1]], 15
 ; CHECK-NEXT:    [[AND_I50_2:%.*]] = and i32 [[SHR_I49_2]], 65537
 ; CHECK-NEXT:    [[MUL_I51_2:%.*]] = mul i32 [[AND_I50_2]], 65535
-; CHECK-NEXT:    [[TMP146:%.*]] = extractelement <2 x i32> [[TMP143]], i32 0
-; CHECK-NEXT:    [[TMP147:%.*]] = extractelement <2 x i32> [[TMP143]], i32 1
-; CHECK-NEXT:    [[ADD94_3:%.*]] = add i32 [[TMP146]], [[TMP147]]
-; CHECK-NEXT:    [[SUB102_3:%.*]] = sub i32 [[TMP147]], [[TMP146]]
-; CHECK-NEXT:    [[SHR_I49_4:%.*]] = lshr i32 [[CONV1]], 15
-; CHECK-NEXT:    [[AND_I50_4:%.*]] = and i32 [[SHR_I49_4]], 65537
-; CHECK-NEXT:    [[MUL_I51_4:%.*]] = mul i32 [[AND_I50_4]], 65535
-; CHECK-NEXT:    [[TMP66:%.*]] = load <2 x i8>, ptr [[ARRAYIDX8]], align 1
-; CHECK-NEXT:    [[TMP102:%.*]] = zext <2 x i8> [[TMP66]] to <2 x i32>
-; CHECK-NEXT:    [[TMP148:%.*]] = load <4 x i8>, ptr [[PIX2]], align 1
-; CHECK-NEXT:    [[TMP67:%.*]] = shufflevector <4 x i8> [[TMP148]], <4 x i8> poison, <2 x i32> <i32 0, i32 2>
-; CHECK-NEXT:    [[TMP85:%.*]] = zext <2 x i8> [[TMP67]] to <2 x i32>
-; CHECK-NEXT:    [[TMP149:%.*]] = load <4 x i8>, ptr [[TMP1]], align 1
-; CHECK-NEXT:    [[TMP106:%.*]] = shufflevector <4 x i8> [[TMP149]], <4 x i8> poison, <2 x i32> <i32 0, i32 2>
-; CHECK-NEXT:    [[TMP108:%.*]] = zext <2 x i8> [[TMP106]] to <2 x i32>
-; CHECK-NEXT:    [[TMP150:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5]], align 1
-; CHECK-NEXT:    [[TMP109:%.*]] = shufflevector <4 x i8> [[TMP150]], <4 x i8> poison, <2 x i32> <i32 0, i32 2>
-; CHECK-NEXT:    [[TMP89:%.*]] = zext <2 x i8> [[TMP109]] to <2 x i32>
-; CHECK-NEXT:    [[TMP87:%.*]] = sub <2 x i32> [[TMP108]], [[TMP89]]
-; CHECK-NEXT:    [[TMP88:%.*]] = shl <2 x i32> [[TMP87]], splat (i32 16)
-; CHECK-NEXT:    [[TMP112:%.*]] = shufflevector <4 x i8> [[TMP148]], <4 x i8> poison, <2 x i32> <i32 1, i32 3>
-; CHECK-NEXT:    [[TMP120:%.*]] = zext <2 x i8> [[TMP112]] to <2 x i32>
-; CHECK-NEXT:    [[TMP94:%.*]] = shufflevector <4 x i8> [[TMP149]], <4 x i8> poison, <2 x i32> <i32 1, i32 3>
-; CHECK-NEXT:    [[TMP128:%.*]] = zext <2 x i8> [[TMP94]] to <2 x i32>
-; CHECK-NEXT:    [[TMP131:%.*]] = shufflevector <4 x i8> [[TMP150]], <4 x i8> poison, <2 x i32> <i32 1, i32 3>
-; CHECK-NEXT:    [[TMP132:%.*]] = zext <2 x i8> [[TMP131]] to <2 x i32>
-; CHECK-NEXT:    [[TMP95:%.*]] = sub <2 x i32> [[TMP128]], [[TMP132]]
-; CHECK-NEXT:    [[TMP96:%.*]] = shl <2 x i32> [[TMP95]], splat (i32 16)
-; CHECK-NEXT:    [[TMP97:%.*]] = insertelement <2 x i32> [[TMP102]], i32 [[CONV33]], i32 1
-; CHECK-NEXT:    [[TMP117:%.*]] = sub <2 x i32> [[TMP97]], [[TMP120]]
-; CHECK-NEXT:    [[TMP105:%.*]] = add <2 x i32> [[TMP96]], [[TMP117]]
-; CHECK-NEXT:    [[TMP86:%.*]] = insertelement <2 x i32> [[TMP102]], i32 [[CONV1]], i32 0
-; CHECK-NEXT:    [[TMP119:%.*]] = sub <2 x i32> [[TMP86]], [[TMP85]]
-; CHECK-NEXT:    [[TMP92:%.*]] = add <2 x i32> [[TMP88]], [[TMP119]]
-; CHECK-NEXT:    [[TMP93:%.*]] = shufflevector <2 x i32> [[TMP105]], <2 x i32> [[TMP92]], <2 x i32> <i32 0, i32 2>
-; CHECK-NEXT:    [[TMP101:%.*]] = add <2 x i32> [[TMP105]], [[TMP92]]
-; CHECK-NEXT:    [[TMP151:%.*]] = sub <2 x i32> [[TMP92]], [[TMP105]]
-; CHECK-NEXT:    [[TMP111:%.*]] = extractelement <2 x i32> [[TMP101]], i32 0
-; CHECK-NEXT:    [[TMP99:%.*]] = extractelement <2 x i32> [[TMP101]], i32 1
-; CHECK-NEXT:    [[ADD55:%.*]] = add i32 [[TMP99]], [[TMP111]]
-; CHECK-NEXT:    [[SUB51:%.*]] = sub i32 [[TMP111]], [[TMP99]]
-; CHECK-NEXT:    [[TMP153:%.*]] = extractelement <2 x i32> [[TMP151]], i32 0
-; CHECK-NEXT:    [[TMP157:%.*]] = extractelement <2 x i32> [[TMP151]], i32 1
-; CHECK-NEXT:    [[ADD78_1:%.*]] = add i32 [[TMP157]], [[TMP153]]
-; CHECK-NEXT:    [[SUB59:%.*]] = sub i32 [[TMP153]], [[TMP157]]
-; CHECK-NEXT:    [[SHR_I59_1:%.*]] = lshr i32 [[TMP99]], 15
+; CHECK-NEXT:    [[TMP88:%.*]] = extractelement <2 x i32> [[TMP85]], i32 0
+; CHECK-NEXT:    [[TMP89:%.*]] = extractelement <2 x i32> [[TMP85]], i32 1
+; CHECK-NEXT:    [[ADD94_4:%.*]] = add i32 [[TMP88]], [[TMP89]]
+; CHECK-NEXT:    [[SUB102_3:%.*]] = sub i32 [[TMP89]], [[TMP88]]
+; CHECK-NEXT:    [[SHR_I49_3:%.*]] = lshr i32 [[CONV1]], 15
+; CHECK-NEXT:    [[AND_I50_3:%.*]] = and i32 [[SHR_I49_3]], 65537
+; CHECK-NEXT:    [[MUL_I51_3:%.*]] = mul i32 [[AND_I50_3]], 65535
+; CHECK-NEXT:    [[TMP90:%.*]] = load <2 x i8>, ptr [[ARRAYIDX8]], align 1
+; CHECK-NEXT:    [[TMP102:%.*]] = zext <2 x i8> [[TMP90]] to <2 x i32>
+; CHECK-NEXT:    [[TMP92:%.*]] = load <4 x i8>, ptr [[PIX2]], align 1
+; CHECK-NEXT:    [[TMP93:%.*]] = shufflevector <4 x i8> [[TMP92]], <4 x i8> poison, <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT:    [[TMP94:%.*]] = zext <2 x i8> [[TMP93]] to <2 x i32>
+; CHECK-NEXT:    [[TMP95:%.*]] = load <4 x i8>, ptr [[TMP1]], align 1
+; CHECK-NEXT:    [[TMP96:%.*]] = shufflevector <4 x i8> [[TMP95]], <4 x i8> poison, <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT:    [[TMP97:%.*]] = zext <2 x i8> [[TMP96]] to <2 x i32>
+; CHECK-NEXT:    [[TMP98:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5]], align 1
+; CHECK-NEXT:    [[TMP99:%.*]] = shufflevector <4 x i8> [[TMP98]], <4 x i8> poison, <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT:    [[TMP100:%.*]] = zext <2 x i8> [[TMP99]] to <2 x i32>
+; CHECK-NEXT:    [[TMP101:%.*]] = sub <2 x i32> [[TMP97]], [[TMP100]]
+; CHECK-NEXT:    [[TMP224:%.*]] = shl <2 x i32> [[TMP101]], splat (i32 16)
+; CHECK-NEXT:    [[TMP103:%.*]] = shufflevector <4 x i8> [[TMP92]], <4 x i8> poison, <2 x i32> <i32 1, i32 3>
+; CHECK-NEXT:    [[TMP104:%.*]] = zext <2 x i8> [[TMP103]] to <2 x i32>
+; CHECK-NEXT:    [[TMP105:%.*]] = shufflevector <4 x i8> [[TMP95]], <4 x i8> poison, <2 x i32> <i32 1, i32 3>
+; CHECK-NEXT:    [[TMP106:%.*]] = zext <2 x i8> [[TMP105]] to <2 x i32>
+; CHECK-NEXT:    [[TMP107:%.*]] = shufflevector <4 x i8> [[TMP98]], <4 x i8> poison, <2 x i32> <i32 1, i32 3>
+; CHECK-NEXT:    [[TMP108:%.*]] = zext <2 x i8> [[TMP107]] to <2 x i32>
+; CHECK-NEXT:    [[TMP109:%.*]] = sub <2 x i32> [[TMP106]], [[TMP108]]
+; CHECK-NEXT:    [[TMP110:%.*]] = shl <2 x i32> [[TMP109]], splat (i32 16)
+; CHECK-NEXT:    [[TMP111:%.*]] = insertelement <2 x i32> [[TMP102]], i32 [[CONV33]], i32 1
+; CHECK-NEXT:    [[TMP112:%.*]] = sub <2 x i32> [[TMP111]], [[TMP104]]
+; CHECK-NEXT:    [[TMP113:%.*]] = add <2 x i32> [[TMP110]], [[TMP112]]
+; CHECK-NEXT:    [[TMP114:%.*]] = insertelement <2 x i32> [[TMP102]], i32 [[CONV1]], i32 0
+; CHECK-NEXT:    [[TMP115:%.*]] = sub <2 x i32> [[TMP114]], [[TMP94]]
+; CHECK-NEXT:    [[TMP116:%.*]] = add <2 x i32> [[TMP224]], [[TMP115]]
+; CHECK-NEXT:    [[TMP117:%.*]] = shufflevector <2 x i32> [[TMP113]], <2 x i32> [[TMP116]], <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT:    [[TMP126:%.*]] = add <2 x i32> [[TMP113]], [[TMP116]]
+; CHECK-NEXT:    [[TMP119:%.*]] = sub <2 x i32> [[TMP116]], [[TMP113]]
+; CHECK-NEXT:    [[TMP120:%.*]] = extractelement <2 x i32> [[TMP126]], i32 0
+; CHECK-NEXT:    [[TMP127:%.*]] = extractelement <2 x i32> [[TMP126]], i32 1
+; CHECK-NEXT:    [[ADD48:%.*]] = add i32 [[TMP127]], [[TMP120]]
+; CHECK-NEXT:    [[TMP166:%.*]] = sub i32 [[TMP120]], [[TMP127]]
+; CHECK-NEXT:    [[TMP128:%.*]] = extractelement <2 x i32> [[TMP119]], i32 0
+; CHECK-NEXT:    [[TMP129:%.*]] = extractelement <2 x i32> [[TMP119]], i32 1
+; CHECK-NEXT:    [[ADD55:%.*]] = add i32 [[TMP129]], [[TMP128]]
+; CHECK-NEXT:    [[SUB60:%.*]] = sub i32 [[TMP128]], [[TMP129]]
+; CHECK-NEXT:    [[SHR_I59:%.*]] = lshr i32 [[TMP127]], 15
+; CHECK-NEXT:    [[AND_I60:%.*]] = and i32 [[SHR_I59]], 65537
+; CHECK-NEXT:    [[MUL_I61:%.*]] = mul i32 [[AND_I60]], 65535
+; CHECK-NEXT:    [[SHR_I59_1:%.*]] = lshr i32 [[TMP129]], 15
 ; CHECK-NEXT:    [[AND_I60_1:%.*]] = and i32 [[SHR_I59_1]], 65537
 ; CHECK-NEXT:    [[MUL_I61_1:%.*]] = mul i32 [[AND_I60_1]], 65535
-; CHECK-NEXT:    [[SHR_I59_4:%.*]] = lshr i32 [[TMP157]], 15
-; CHECK-NEXT:    [[AND_I60_4:%.*]] = and i32 [[SHR_I59_4]], 65537
-; CHECK-NEXT:    [[MUL_I61_4:%.*]] = mul i32 [[AND_I60_4]], 65535
-; CHECK-NEXT:    [[TMP104:%.*]] = load <2 x i8>, ptr [[ARRAYIDX8_1]], align 1
-; CHECK-NEXT:    [[TMP110:%.*]] = zext <2 x i8> [[TMP104]] to <2 x i32>
-; CHECK-NEXT:    [[TMP158:%.*]] = load <4 x i8>, ptr [[ADD_PTR644]], align 1
-; CHECK-NEXT:    [[TMP114:%.*]] = shufflevector <4 x i8> [[TMP158]], <4 x i8> poison, <2 x i32> <i32 0, i32 2>
-; CHECK-NEXT:    [[TMP133:%.*]] = zext <2 x i8> [[TMP114]] to <2 x i32>
-; CHECK-NEXT:    [[TMP121:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3_1]], align 1
-; CHECK-NEXT:    [[TMP116:%.*]] = shufflevector <4 x i8> [[TMP121]], <4 x i8> poison, <2 x i32> <i32 0, i32 2>
-; CHECK-NEXT:    [[TMP115:%.*]] = zext <2 x i8> [[TMP116]] to <2 x i32>
-; CHECK-NEXT:    [[TMP159:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_1]], align 1
-; CHECK-NEXT:    [[TMP118:%.*]] = shufflevector <4 x i8> [[TMP159]], <4 x i8> poison, <2 x i32> <i32 0, i32 2>
-; CHECK-NEXT:    [[TMP134:%.*]] = zext <2 x i8> [[TMP118]] to <2 x i32>
-; CHECK-NEXT:    [[TMP124:%.*]] = sub <2 x i32> [[TMP115]], [[TMP134]]
-; CHECK-NEXT:    [[TMP125:%.*]] = shl <2 x i32> [[TMP124]], splat (i32 16)
-; CHECK-NEXT:    [[TMP127:%.*]] = shufflevector <4 x i8> [[TMP158]], <4 x i8> poison, <2 x i32> <i32 1, i32 3>
-; CHECK-NEXT:    [[TMP191:%.*]] = zext <2 x i8> [[TMP127]] to <2 x i32>
-; CHECK-NEXT:    [[TMP160:%.*]] = shufflevector <4 x i8> [[TMP121]], <4 x i8> poison, <2 x i32> <i32 1, i32 3>
-; CHECK-NEXT:    [[TMP161:%.*]] = zext <2 x i8> [[TMP160]] to <2 x i32>
-; CHECK-NEXT:    [[TMP171:%.*]] = shufflevector <4 x i8> [[TMP159]], <4 x i8> poison, <2 x i32> <i32 1, i32 3>
-; CHECK-NEXT:    [[TMP172:%.*]] = zext <2 x i8> [[TMP171]] to <2 x i32>
-; CHECK-NEXT:    [[TMP135:%.*]] = sub <2 x i32> [[TMP161]], [[TMP172]]
-; CHECK-NEXT:    [[TMP136:%.*]] = shl <2 x i32> [[TMP135]], splat (i32 16)
-; CHECK-NEXT:    [[TMP137:%.*]] = insertelement <2 x i32> [[TMP110]], i32 [[CONV33_1]], i32 1
-; CHECK-NEXT:    [[TMP173:%.*]] = sub <2 x i32> [[TMP137]], [[TMP191]]
-; CHECK-NEXT:    [[TMP174:%.*]] = add <2 x i32> [[TMP136]], [[TMP173]]
-; CHECK-NEXT:    [[TMP140:%.*]] = insertelement <2 x i32> [[TMP110]], i32 [[CONV_1]], i32 0
-; CHECK-NEXT:    [[TMP141:%.*]] = sub <2 x i32> [[TMP140]], [[TMP133]]
-; CHECK-NEXT:    [[TMP192:%.*]] = add <2 x i32> [[TMP125]], [[TMP141]]
-; CHECK-NEXT:    [[TMP156:%.*]] = add <2 x i32> [[TMP174]], [[TMP192]]
-; CHECK-NEXT:    [[TMP155:%.*]] = sub <2 x i32> [[TMP192]], [[TMP174]]
-; CHECK-NEXT:    [[TMP139:%.*]] = extractelement <2 x i32> [[TMP156]], i32 0
-; CHECK-NEXT:    [[TMP142:%.*]] = extractelement <2 x i32> [[TMP156]], i32 1
-; CHECK-NEXT:    [[ADD48_1:%.*]] = add i32 [[TMP142]], [[TMP139]]
-; CHECK-NEXT:    [[SUB45_1:%.*]] = sub i32 [[TMP139]], [[TMP142]]
-; CHECK-NEXT:    [[TMP138:%.*]] = extractelement <2 x i32> [[TMP155]], i32 0
-; CHECK-NEXT:    [[SUB47_1:%.*]] = extractelement <2 x i32> [[TMP155]], i32 1
-; CHECK-NEXT:    [[ADD94_1:%.*]] = add i32 [[SUB47_1]], [[TMP138]]
-; CHECK-NEXT:    [[SUB59_1:%.*]] = sub i32 [[TMP138]], [[SUB47_1]]
-; CHECK-NEXT:    [[SHR_I54:%.*]] = lshr i32 [[TMP142]], 15
+; CHECK-NEXT:    [[TMP130:%.*]] = load <2 x i8>, ptr [[ARRAYIDX8_1]], align 1
+; CHECK-NEXT:    [[TMP131:%.*]] = zext <2 x i8> [[TMP130]] to <2 x i32>
+; CHECK-NEXT:    [[TMP132:%.*]] = load <4 x i8>, ptr [[ADD_PTR644]], align 1
+; CHECK-NEXT:    [[TMP133:%.*]] = shufflevector <4 x i8> [[TMP132]], <4 x i8> poison, <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT:    [[TMP134:%.*]] = zext <2 x i8> [[TMP133]] to <2 x i32>
+; CHECK-NEXT:    [[TMP135:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3_1]], align 1
+; CHECK-NEXT:    [[TMP136:%.*]] = shufflevector <4 x i8> [[TMP135]], <4 x i8> poison, <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT:    [[TMP137:%.*]] = zext <2 x i8> [[TMP136]] to <2 x i32>
+; CHECK-NEXT:    [[TMP138:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_1]], align 1
+; CHECK-NEXT:    [[TMP139:%.*]] = shufflevector <4 x i8> [[TMP138]], <4 x i8> poison, <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT:    [[TMP140:%.*]] = zext <2 x i8> [[TMP139]] to <2 x i32>
+; CHECK-NEXT:    [[TMP141:%.*]] = sub <2 x i32> [[TMP137]], [[TMP140]]
+; CHECK-NEXT:    [[TMP142:%.*]] = shl <2 x i32> [[TMP141]], splat (i32 16)
+; CHECK-NEXT:    [[TMP143:%.*]] = shufflevector <4 x i8> [[TMP132]], <4 x i8> poison, <2 x i32> <i32 1, i32 3>
+; CHECK-NEXT:    [[TMP144:%.*]] = zext <2 x i8> [[TMP143]] to <2 x i32>
+; CHECK-NEXT:    [[TMP145:%.*]] = shufflevector <4 x i8> [[TMP135]], <4 x i8> poison, <2 x i32> <i32 1, i32 3>
+; CHECK-NEXT:    [[TMP146:%.*]] = zext <2 x i8> [[TMP145]] to <2 x i32>
+; CHECK-NEXT:    [[TMP147:%.*]] = shufflevector <4 x i8> [[TMP138]], <4 x i8> poison, <2 x i32> <i32 1, i32 3>
+; CHECK-NEXT:    [[TMP148:%.*]] = zext <2 x i8> [[TMP147]] to <2 x i32>
+; CHECK-NEXT:    [[TMP149:%.*]] = sub <2 x i32> [[TMP146]], [[TMP148]]
+; CHECK-NEXT:    [[TMP150:%.*]] = shl <2 x i32> [[TMP149]], splat (i32 16)
+; CHECK-NEXT:    [[TMP151:%.*]] = insertelement <2 x i32> [[TMP131]], i32 [[CONV33_1]], i32 1
+; CHECK-NEXT:    [[TMP225:%.*]] = sub <2 x i32> [[TMP151]], [[TMP144]]
+; CHECK-NEXT:    [[TMP153:%.*]] = add <2 x i32> [[TMP150]], [[TMP225]]
+; CHECK-NEXT:    [[TMP154:%.*]] = insertelement <2 x i32> [[TMP131]], i32 [[CONV_1]], i32 0
+; CHECK-NEXT:    [[TMP155:%.*]] = sub <2 x i32> [[TMP154]], [[TMP134]]
+; CHECK-NEXT:    [[TMP156:%.*]] = add <2 x i32> [[TMP142]], [[TMP155]]
+; CHECK-NEXT:    [[TMP157:%.*]] = add <2 x i32> [[TMP153]], [[TMP156]]
+; CHECK-NEXT:    [[TMP158:%.*]] = sub <2 x i32> [[TMP156]], [[TMP153]]
+; CHECK-NEXT:    [[TMP159:%.*]] = extractelement <2 x i32> [[TMP157]], i32 0
+; CHECK-NEXT:    [[TMP160:%.*]] = extractelement <2 x i32> [[TMP157]], i32 1
+; CHECK-NEXT:    [[ADD48_1:%.*]] = add i32 [[TMP160]], [[TMP159]]
+; CHECK-NEXT:    [[SUB51_1:%.*]] = sub i32 [[TMP159]], [[TMP160]]
+; CHECK-NEXT:    [[TMP161:%.*]] = extractelement <2 x i32> [[TMP158]], i32 0
+; CHECK-NEXT:    [[TMP162:%.*]] = extractelement <2 x i32> [[TMP158]], i32 1
+; CHECK-NEXT:    [[ADD55_1:%.*]] = add i32 [[TMP162]], [[TMP161]]
+; CHECK-NEXT:    [[SUB59_2:%.*]] = sub i32 [[TMP161]], [[TMP162]]
+; CHECK-NEXT:    [[SHR_I54:%.*]] = lshr i32 [[TMP160]], 15
 ; CHECK-NEXT:    [[AND_I55:%.*]] = and i32 [[SHR_I54]], 65537
 ; CHECK-NEXT:    [[MUL_I56:%.*]] = mul i32 [[AND_I55]], 65535
-; CHECK-NEXT:    [[SHR_I54_1:%.*]] = lshr i32 [[SUB47_1]], 15
+; CHECK-NEXT:    [[SHR_I54_1:%.*]] = lshr i32 [[TMP162]], 15
 ; CHECK-NEXT:    [[AND_I55_1:%.*]] = and i32 [[SHR_I54_1]], 65537
 ; CHECK-NEXT:    [[MUL_I56_1:%.*]] = mul i32 [[AND_I55_1]], 65535
-; CHECK-NEXT:    [[TMP154:%.*]] = lshr <2 x i32> [[TMP110]], splat (i32 15)
-; CHECK-NEXT:    [[TMP184:%.*]] = and <2 x i32> [[TMP154]], splat (i32 65537)
-; CHECK-NEXT:    [[TMP195:%.*]] = mul <2 x i32> [[TMP184]], splat (i32 65535)
-; CHECK-NEXT:    [[ADD78:%.*]] = add i32 [[ADD48_1]], [[ADD55]]
-; CHECK-NEXT:    [[SUB86:%.*]] = sub i32 [[ADD55]], [[ADD48_1]]
-; CHECK-NEXT:    [[ADD103:%.*]] = add i32 [[ADD94]], [[ADD78]]
-; CHECK-NEXT:    [[SUB104:%.*]] = sub i32 [[ADD78]], [[ADD94]]
-; CHECK-NEXT:    [[ADD105:%.*]] = add i32 [[SUB102]], [[SUB86]]
-; CHECK-NEXT:    [[SUB106:%.*]] = sub i32 [[SUB86]], [[SUB102]]
-; CHECK-NEXT:    [[ADD_I:%.*]] = add i32 [[MUL_I51_3]], [[ADD103]]
+; CHECK-NEXT:    [[TMP163:%.*]] = lshr <2 x i32> [[TMP131]], splat (i32 15)
+; CHECK-NEXT:    [[TMP164:%.*]] = and <2 x i32> [[TMP163]], splat (i32 65537)
+; CHECK-NEXT:    [[TMP165:%.*]] = mul <2 x i32> [[TMP164]], splat (i32 65535)
+; CHECK-NEXT:    [[ADD78:%.*]] = add i32 [[ADD48_1]], [[ADD48]]
+; CHECK-NEXT:    [[SUB86:%.*]] = sub i32 [[ADD48]], [[ADD48_1]]
+; CHECK-NEXT:    [[ADD103:%.*]] = add i32 [[ADD95]], [[ADD78]]
+; CHECK-NEXT:    [[SUB104:%.*]] = sub i32 [[ADD78]], [[ADD95]]
+; CHECK-NEXT:    [[ADD105:%.*]] = add i32 [[SUB86_3]], [[SUB86]]
+; CHECK-NEXT:    [[SUB106:%.*]] = sub i32 [[SUB86]], [[SUB86_3]]
+; CHECK-NEXT:    [[ADD_I:%.*]] = add i32 [[MUL_I]], [[ADD103]]
 ; CHECK-NEXT:    [[XOR_I:%.*]] = xor i32 [[ADD_I]], [[TMP77]]
-; CHECK-NEXT:    [[ADD_I52:%.*]] = add i32 [[MUL_I_1]], [[ADD105]]
-; CHECK-NEXT:    [[XOR_I53:%.*]] = xor i32 [[ADD_I52]], [[TMP34]]
+; CHECK-NEXT:    [[ADD_I52:%.*]] = add i32 [[MUL_I51]], [[ADD105]]
+; CHECK-NEXT:    [[XOR_I53:%.*]] = xor i32 [[ADD_I52]], [[TMP45]]
 ; CHECK-NEXT:    [[ADD_I57:%.*]] = add i32 [[MUL_I56]], [[SUB104]]
-; CHECK-NEXT:    [[XOR_I58:%.*]] = xor i32 [[ADD_I57]], [[TMP142]]
-; CHECK-NEXT:    [[ADD_I62:%.*]] = add i32 [[MUL_I61_1]], [[SUB106]]
-; CHECK-NEXT:    [[XOR_I63:%.*]] = xor i32 [[ADD_I62]], [[TMP99]]
+; CHECK-NEXT:    [[XOR_I58:%.*]] = xor i32 [[ADD_I57]], [[TMP160]]
+; CHECK-NEXT:    [[ADD_I62:%.*]] = add i32 [[MUL_I61]], [[SUB106]]
+; CHECK-NEXT:    [[XOR_I63:%.*]] = xor i32 [[ADD_I62]], [[TMP127]]
 ; CHECK-NEXT:    [[ADD110:%.*]] = add i32 [[XOR_I53]], [[XOR_I]]
 ; CHECK-NEXT:    [[ADD112:%.*]] = add i32 [[ADD110]], [[XOR_I58]]
-; CHECK-NEXT:    [[ADD113:%.*]] = add i32 [[ADD112]], [[XOR_I63]]
+; CHECK-NEXT:    [[ADD105_3:%.*]] = add i32 [[ADD112]], [[XOR_I63]]
+; CHECK-NEXT:    [[ADD78_1:%.*]] = add i32 [[ADD55_1]], [[ADD55]]
+; CHECK-NEXT:    [[SUB86_1:%.*]] = sub i32 [[ADD55]], [[ADD55_1]]
 ; CHECK-NEXT:    [[ADD103_1:%.*]] = add i32 [[ADD94_1]], [[ADD78_1]]
 ; CHECK-NEXT:    [[SUB104_1:%.*]] = sub i32 [[ADD78_1]], [[ADD94_1]]
-; CHECK-NEXT:    [[ADD103_2:%.*]] = add i32 [[ADD94_5]], [[ADD103_1]]
-; CHECK-NEXT:    [[SUB104_2:%.*]] = sub i32 [[ADD103_1]], [[ADD94_5]]
-; CHECK-NEXT:    [[ADD105_1:%.*]] = add i32 [[SUB102_1]], [[SUB104_1]]
-; CHECK-NEXT:    [[SUB106_1:%.*]] = sub i32 [[SUB104_1]], [[SUB102_1]]
-; CHECK-NEXT:    [[ADD_I_1:%.*]] = add i32 [[MUL_I_2]], [[ADD103_2]]
-; CHECK-NEXT:    [[XOR_I_1:%.*]] = xor i32 [[ADD_I_1]], [[TMP83]]
-; CHECK-NEXT:    [[ADD_I52_1:%.*]] = add i32 [[ADD94_2]], [[ADD105_1]]
-; CHECK-NEXT:    [[XOR_I53_1:%.*]] = xor i32 [[ADD_I52_1]], [[TMP84]]
-; CHECK-NEXT:    [[ADD_I57_1:%.*]] = add i32 [[MUL_I56_1]], [[SUB104_2]]
-; CHECK-NEXT:    [[XOR_I58_1:%.*]] = xor i32 [[ADD_I57_1]], [[SUB47_1]]
-; CHECK-NEXT:    [[ADD_I62_1:%.*]] = add i32 [[MUL_I61_4]], [[SUB106_1]]
-; CHECK-NEXT:    [[XOR_I63_1:%.*]] = xor i32 [[ADD_I62_1]], [[TMP157]]
-; CHECK-NEXT:    [[ADD108_1:%.*]] = add i32 [[XOR_I53_1]], [[ADD113]]
+; CHECK-NEXT:    [[ADD105_1:%.*]] = add i32 [[SUB102_1]], [[SUB86_1]]
+; CHECK-NEXT:    [[SUB106_1:%.*]] = sub i32 [[SUB86_1]], [[SUB102_1]]
+; CHECK-NEXT:    [[ADD_I_1:%.*]] = add i32 [[MUL_I_1]], [[ADD103_1]]
+; CHECK-NEXT:    [[XOR_I_1:%.*]] = xor i32 [[ADD_I_1]], [[CONV9_2]]
+; CHECK-NEXT:    [[ADD_I52_1:%.*]] = add i32 [[MUL_I51_1]], [[ADD105_1]]
+; CHECK-NEXT:    [[XOR_I53_1:%.*]] = xor i32 [[ADD_I52_1]], [[CONV_2]]
+; CHECK-NEXT:    [[ADD_I57_1:%.*]] = add i32 [[MUL_I56_1]], [[SUB104_1]]
+; CHECK-NEXT:    [[XOR_I58_1:%.*]] = xor i32 [[ADD_I57_1]], [[TMP162]]
+; CHECK-NEXT:    [[ADD_I62_1:%.*]] = add i32 [[MUL_I61_1]], [[SUB106_1]]
+; CHECK-NEXT:    [[XOR_I63_1:%.*]] = xor i32 [[ADD_I62_1]], [[TMP129]]
+; CHECK-NEXT:    [[ADD108_1:%.*]] = add i32 [[XOR_I53_1]], [[ADD105_3]]
 ; CHECK-NEXT:    [[ADD110_1:%.*]] = add i32 [[ADD108_1]], [[XOR_I_1]]
-; CHECK-NEXT:    [[ADD112_1:%.*]] = add i32 [[ADD110_1]], [[XOR_I58_1]]
-; CHECK-NEXT:    [[ADD113_1:%.*]] = add i32 [[ADD112_1]], [[XOR_I63_1]]
-; CHECK-NEXT:    [[ADD78_2:%.*]] = add i32 [[SUB45_1]], [[SUB51]]
-; CHECK-NEXT:    [[TMP170:%.*]] = sub i32 [[SUB51]], [[SUB45_1]]
-; CHECK-NEXT:    [[TMP162:%.*]] = insertelement <2 x i32> poison, i32 [[ADD78_2]], i32 0
-; CHECK-NEXT:    [[TMP163:%.*]] = shufflevector <2 x i32> [[TMP162]], <2 x i32> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP164:%.*]] = insertelement <2 x i32> poison, i32 [[ADD94_4]], i32 0
-; CHECK-NEXT:    [[TMP165:%.*]] = shufflevector <2 x i32> [[TMP164]], <2 x i32> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP166:%.*]] = add <2 x i32> [[TMP163]], [[TMP165]]
-; CHECK-NEXT:    [[TMP167:%.*]] = sub <2 x i32> [[TMP163]], [[TMP165]]
-; CHECK-NEXT:    [[TMP168:%.*]] = shufflevector <2 x i32> [[TMP166]], <2 x i32> [[TMP167]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT:    [[ADD105_2:%.*]] = add i32 [[TMP169]], [[TMP170]]
-; CHECK-NEXT:    [[SUB106_2:%.*]] = sub i32 [[TMP170]], [[TMP169]]
-; CHECK-NEXT:    [[ADD_I52_2:%.*]] = add i32 [[MUL_I51_2]], [[ADD105_2]]
-; CHECK-NEXT:    [[XOR_I53_2:%.*]] = xor i32 [[ADD_I52_2]], [[CONV_1]]
-; CHECK-NEXT:    [[TMP197:%.*]] = add <2 x i32> [[TMP195]], [[TMP168]]
-; CHECK-NEXT:    [[TMP152:%.*]] = xor <2 x i32> [[TMP197]], [[TMP110]]
-; CHECK-NEXT:    [[SHR_I59_2:%.*]] = lshr i32 [[TMP111]], 15
-; CHECK-NEXT:    [[AND_I60_2:%.*]] = and i32 [[SHR_I59_2]], 65537
-; CHECK-NEXT:    [[MUL_I61_2:%.*]] = mul i32 [[AND_I60_2]], 65535
-; CHECK-NEXT:    [[ADD_I62_2:%.*]] = add i32 [[MUL_I61_2]], [[SUB106_2]]
-; CHECK-NEXT:    [[XOR_I63_2:%.*]] = xor i32 [[ADD_I62_2]], [[TMP111]]
-; CHECK-NEXT:    [[ADD108_2:%.*]] = add i32 [[XOR_I53_2]], [[ADD113_1]]
-; CHECK-NEXT:    [[TMP175:%.*]] = extractelement <2 x i32> [[TMP152]], i32 0
-; CHECK-NEXT:    [[ADD110_2:%.*]] = add i32 [[ADD108_2]], [[TMP175]]
-; CHECK-NEXT:    [[TMP176:%.*]] = extractelement <2 x i32> [[TMP152]], i32 1
-; CHECK-NEXT:    [[ADD112_2:%.*]] = add i32 [[ADD110_2]], [[TMP176]]
-; CHECK-NEXT:    [[ADD113_2:%.*]] = add i32 [[ADD112_2]], [[XOR_I63_2]]
-; CHECK-NEXT:    [[ADD78_3:%.*]] = add i32 [[SUB59_1]], [[SUB59]]
-; CHECK-NEXT:    [[SUB86_3:%.*]] = sub i32 [[SUB59]], [[SUB59_1]]
+; CHECK-NEXT:    [[ADD112_5:%.*]] = add i32 [[ADD110_1]], [[XOR_I58_1]]
+; CHECK-NEXT:    [[ADD113_2:%.*]] = add i32 [[ADD112_5]], [[XOR_I63_1]]
+; CHECK-NEXT:    [[ADD78_3:%.*]] = add i32 [[SUB51_1]], [[TMP166]]
+; CHECK-NEXT:    [[TMP204:%.*]] = sub i32 [[TMP166]], [[SUB51_1]]
 ; CHECK-NEXT:    [[TMP177:%.*]] = insertelement <2 x i32> poison, i32 [[ADD78_3]], i32 0
 ; CHECK-NEXT:    [[TMP178:%.*]] = shufflevector <2 x i32> [[TMP177]], <2 x i32> poison, <2 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP179:%.*]] = insertelement <2 x i32> poison, i32 [[ADD94_3]], i32 0
 ; CHECK-NEXT:    [[TMP180:%.*]] = shufflevector <2 x i32> [[TMP179]], <2 x i32> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP181:%.*]] = add <2 x i32> [[TMP178]], [[TMP180]]
-; CHECK-NEXT:    [[TMP182:%.*]] = sub <2 x i32> [[TMP178]], [[TMP180]]
-; CHECK-NEXT:    [[TMP183:%.*]] = shufflevector <2 x i32> [[TMP181]], <2 x i32> [[TMP182]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT:    [[ADD105_3:%.*]] = add i32 [[SUB102_3]], [[SUB86_3]]
-; CHECK-NEXT:    [[SUB106_3:%.*]] = sub i32 [[SUB86_3]], [[SUB102_3]]
-; CHECK-NEXT:    [[ADD_I52_3:%.*]] = add i32 [[MUL_I51_4]], [[ADD105_3]]
-; CHECK-NEXT:    [[XOR_I53_3:%.*]] = xor i32 [[ADD_I52_3]], [[CONV1]]
+; CHECK-NEXT:    [[TMP199:%.*]] = add <2 x i32> [[TMP178]], [[TMP180]]
+; CHECK-NEXT:    [[TMP200:%.*]] = sub <2 x i32> [[TMP178]], [[TMP180]]
+; CHECK-NEXT:    [[TMP201:%.*]] = shufflevector <2 x i32> [[TMP199]], <2 x i32> [[TMP200]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT:    [[ADD113_1:%.*]] = add i32 [[ADD112_1]], [[TMP204]]
+; CHECK-NEXT:    [[SUB106_2:%.*]] = sub i32 [[TMP204]], [[ADD112_1]]
+; CHECK-NEXT:    [[ADD_I52_2:%.*]] = add i32 [[MUL_I51_2]], [[ADD113_1]]
+; CHECK-NEXT:    [[XOR_I53_2:%.*]] = xor i32 [[ADD_I52_2]], [[CONV_1]]
+; CHECK-NEXT:    [[TMP208:%.*]] = add <2 x i32> [[TMP165]], [[TMP201]]
+; CHECK-NEXT:    [[TMP209:%.*]] = xor <2 x i32> [[TMP208]], [[TMP131]]
+; CHECK-NEXT:    [[SHR_I59_2:%.*]] = lshr i32 [[TMP120]], 15
+; CHECK-NEXT:    [[AND_I60_2:%.*]] = and i32 [[SHR_I59_2]], 65537
+; CHECK-NEXT:    [[MUL_I61_2:%.*]] = mul i32 [[AND_I60_2]], 65535
+; CHECK-NEXT:    [[ADD_I62_2:%.*]] = add i32 [[MUL_I61_2]], [[SUB106_2]]
+; CHECK-NEXT:    [[XOR_I63_4:%.*]] = xor i32 [[ADD_I62_2]], [[TMP120]]
+; CHECK-NEXT:    [[ADD108_2:%.*]] = add i32 [[XOR_I53_2]], [[ADD113_2]]
+; CHECK-NEXT:    [[TMP211:%.*]] = extractelement <2 x i32> [[TMP209]], i32 0
+; CHECK-NEXT:    [[ADD110_2:%.*]] = add i32 [[ADD108_2]], [[TMP211]]
+; CHECK-NEXT:    [[TMP212:%.*]] = extractelement <2 x i32> [[TMP209]], i32 1
+; CHECK-NEXT:    [[ADD112_4:%.*]] = add i32 [[ADD110_2]], [[TMP212]]
+; CHECK-NEXT:    [[ADD113_4:%.*]] = add i32 [[ADD112_4]], [[XOR_I63_4]]
+; CHECK-NEXT:    [[ADD78_4:%.*]] = add i32 [[SUB59_2]], [[SUB60]]
+; CHECK-NEXT:    [[SUB86_4:%.*]] = sub i32 [[SUB60]], [[SUB59_2]]
+; CHECK-NEXT:    [[TMP213:%.*]] = insertelement <2 x i32> poison, i32 [[ADD78_4]], i32 0
+; CHECK-NEXT:    [[TMP214:%.*]] = shufflevector <2 x i32> [[TMP213]], <2 x i32> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP215:%.*]] = insertelement <2 x i32> poison, i32 [[ADD94_4]], i32 0
+; CHECK-NEXT:    [[TMP216:%.*]] = shufflevector <2 x i32> [[TMP215]], <2 x i32> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP217:%.*]] = add <2 x i32> [[TMP214]], [[TMP216]]
+; CHECK-NEXT:    [[TMP218:%.*]] = sub <2 x i32> [[TMP214]], [[TMP216]]
+; CHECK-NEXT:    [[TMP219:%.*]] = shufflevector <2 x i32> [[TMP217]], <2 x i32> [[TMP218]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT:    [[ADD105_4:%.*]] = add i32 [[SUB102_3]], [[SUB86_4]]
+; CHECK-NEXT:    [[SUB106_3:%.*]] = sub i32 [[SUB86_4]], [[SUB102_3]]
+; CHECK-NEXT:    [[ADD_I52_4:%.*]] = add i32 [[MUL_I51_3]], [[ADD105_4]]
+; CHECK-NEXT:    [[XOR_I53_3:%.*]] = xor i32 [[ADD_I52_4]], [[CONV1]]
 ; CHECK-NEXT:    [[TMP185:%.*]] = lshr <2 x i32> [[TMP102]], splat (i32 15)
 ; CHECK-NEXT:    [[TMP193:%.*]] = and <2 x i32> [[TMP185]], splat (i32 65537)
 ; CHECK-NEXT:    [[TMP186:%.*]] = mul <2 x i32> [[TMP193]], splat (i32 65535)
-; CHECK-NEXT:    [[TMP187:%.*]] = add <2 x i32> [[TMP186]], [[TMP183]]
+; CHECK-NEXT:    [[TMP187:%.*]] = add <2 x i32> [[TMP186]], [[TMP219]]
 ; CHECK-NEXT:    [[TMP188:%.*]] = xor <2 x i32> [[TMP187]], [[TMP102]]
 ; CHECK-NEXT:    [[SHR_I59_3:%.*]] = lshr i32 [[CONV33]], 15
 ; CHECK-NEXT:    [[AND_I60_3:%.*]] = and i32 [[SHR_I59_3]], 65537
 ; CHECK-NEXT:    [[MUL_I61_3:%.*]] = mul i32 [[AND_I60_3]], 65535
 ; CHECK-NEXT:    [[ADD_I62_3:%.*]] = add i32 [[MUL_I61_3]], [[SUB106_3]]
 ; CHECK-NEXT:    [[XOR_I63_3:%.*]] = xor i32 [[ADD_I62_3]], [[CONV33]]
-; CHECK-NEXT:    [[ADD108_3:%.*]] = add i32 [[XOR_I53_3]], [[ADD113_2]]
+; CHECK-NEXT:    [[ADD108_3:%.*]] = add i32 [[XOR_I53_3]], [[ADD113_4]]
 ; CHECK-NEXT:    [[TMP189:%.*]] = extractelement <2 x i32> [[TMP188]], i32 0
 ; CHECK-NEXT:    [[ADD110_3:%.*]] = add i32 [[ADD108_3]], [[TMP189]]
 ; CHECK-NEXT:    [[TMP190:%.*]] = extractelement <2 x i32> [[TMP188]], i32 1
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/long-gep-chains.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/long-gep-chains.ll
new file mode 100644
index 0000000..cf1ed54
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/long-gep-chains.ll
@@ -0,0 +1,76 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -S -passes=slp-vectorizer -mtriple=riscv64-unknown-linux -mattr=+v < %s |  FileCheck %s
+
+define i64 @test(ptr %arg, i32 %arg1, i64 %i) {
+; CHECK-LABEL: define i64 @test(
+; CHECK-SAME: ptr [[ARG:%.*]], i32 [[ARG1:%.*]], i64 [[I:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:  [[BB:.*:]]
+; CHECK-NEXT:    [[I2:%.*]] = getelementptr i8, ptr [[ARG]], i64 [[I]]
+; CHECK-NEXT:    [[I3:%.*]] = getelementptr i8, ptr [[I2]], i64 [[I]]
+; CHECK-NEXT:    [[I4:%.*]] = getelementptr i8, ptr [[I3]], i64 [[I]]
+; CHECK-NEXT:    [[I5:%.*]] = getelementptr i8, ptr [[I4]], i64 [[I]]
+; CHECK-NEXT:    [[I6:%.*]] = getelementptr i8, ptr [[I5]], i64 [[I]]
+; CHECK-NEXT:    [[I7:%.*]] = getelementptr i8, ptr [[I6]], i64 [[I]]
+; CHECK-NEXT:    [[I8:%.*]] = getelementptr i8, ptr [[I7]], i64 [[I]]
+; CHECK-NEXT:    [[I9:%.*]] = getelementptr i8, ptr [[I8]], i64 [[I]]
+; CHECK-NEXT:    [[I10:%.*]] = getelementptr i8, ptr [[I9]], i64 [[I]]
+; CHECK-NEXT:    [[I11:%.*]] = getelementptr i8, ptr [[I10]], i64 [[I]]
+; CHECK-NEXT:    [[I12:%.*]] = getelementptr i8, ptr [[I11]], i64 [[I]]
+; CHECK-NEXT:    [[I13:%.*]] = getelementptr i8, ptr [[I12]], i64 [[I]]
+; CHECK-NEXT:    [[I14:%.*]] = getelementptr i8, ptr [[I13]], i64 [[I]]
+; CHECK-NEXT:    [[I140:%.*]] = load i8, ptr [[I14]], align 1
+; CHECK-NEXT:    [[I1412:%.*]] = zext i8 [[I140]] to i32
+; CHECK-NEXT:    [[I142:%.*]] = mul i32 [[ARG1]], [[I1412]]
+; CHECK-NEXT:    [[I143:%.*]] = getelementptr i8, ptr [[I13]], i64 15
+; CHECK-NEXT:    [[I144:%.*]] = load i8, ptr [[I143]], align 1
+; CHECK-NEXT:    [[I1453:%.*]] = zext i8 [[I144]] to i32
+; CHECK-NEXT:    [[I146:%.*]] = mul i32 [[ARG1]], [[I1453]]
+; CHECK-NEXT:    [[I147:%.*]] = getelementptr i8, ptr [[I13]], i64 14
+; CHECK-NEXT:    [[I148:%.*]] = load i8, ptr [[I147]], align 1
+; CHECK-NEXT:    [[I1494:%.*]] = zext i8 [[I148]] to i32
+; CHECK-NEXT:    [[I150:%.*]] = mul i32 [[ARG1]], [[I1494]]
+; CHECK-NEXT:    [[I151:%.*]] = getelementptr i8, ptr [[I13]], i64 13
+; CHECK-NEXT:    [[I152:%.*]] = load i8, ptr [[I151]], align 1
+; CHECK-NEXT:    [[I1535:%.*]] = zext i8 [[I152]] to i32
+; CHECK-NEXT:    [[I154:%.*]] = mul i32 [[ARG1]], [[I1535]]
+; CHECK-NEXT:    [[I1311:%.*]] = or i32 [[I142]], [[I146]]
+; CHECK-NEXT:    [[I1312:%.*]] = or i32 [[I1311]], [[I150]]
+; CHECK-NEXT:    [[I1313:%.*]] = or i32 [[I1312]], [[I154]]
+; CHECK-NEXT:    [[I1536:%.*]] = zext i32 [[I1313]] to i64
+; CHECK-NEXT:    ret i64 [[I1536]]
+;
+bb:
+  %i2 = getelementptr i8, ptr %arg, i64 %i
+  %i3 = getelementptr i8, ptr %i2, i64 %i
+  %i4 = getelementptr i8, ptr %i3, i64 %i
+  %i5 = getelementptr i8, ptr %i4, i64 %i
+  %i6 = getelementptr i8, ptr %i5, i64 %i
+  %i7 = getelementptr i8, ptr %i6, i64 %i
+  %i8 = getelementptr i8, ptr %i7, i64 %i
+  %i9 = getelementptr i8, ptr %i8, i64 %i
+  %i10 = getelementptr i8, ptr %i9, i64 %i
+  %i11 = getelementptr i8, ptr %i10, i64 %i
+  %i12 = getelementptr i8, ptr %i11, i64 %i
+  %i13 = getelementptr i8, ptr %i12, i64 %i
+  %i14 = getelementptr i8, ptr %i13, i64 %i
+  %i140 = load i8, ptr %i14, align 1
+  %i1412 = zext i8 %i140 to i32
+  %i142 = mul i32 %arg1, %i1412
+  %i143 = getelementptr i8, ptr %i13, i64 15
+  %i144 = load i8, ptr %i143, align 1
+  %i1453 = zext i8 %i144 to i32
+  %i146 = mul i32 %arg1, %i1453
+  %i147 = getelementptr i8, ptr %i13, i64 14
+  %i148 = load i8, ptr %i147, align 1
+  %i1494 = zext i8 %i148 to i32
+  %i150 = mul i32 %arg1, %i1494
+  %i151 = getelementptr i8, ptr %i13, i64 13
+  %i152 = load i8, ptr %i151, align 1
+  %i1535 = zext i8 %i152 to i32
+  %i154 = mul i32 %arg1, %i1535
+  %i1311 = or i32 %i142, %i146
+  %i1312 = or i32 %i1311, %i150
+  %i1313 = or i32 %i1312, %i154
+  %i1536 = zext i32 %i1313 to i64
+  ret i64 %i1536
+}
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reduction-logical.ll b/llvm/test/Transforms/SLPVectorizer/X86/reduction-logical.ll
index 0771fab..e0b3ff7 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/reduction-logical.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/reduction-logical.ll
@@ -428,7 +428,7 @@ define i1 @logical_and_icmp_extra_op(<4 x i32> %x, <4 x i32> %y, i1 %c) {
 ; CHECK-NEXT:    [[TMP1:%.*]] = icmp slt <4 x i32> [[X:%.*]], [[Y:%.*]]
 ; CHECK-NEXT:    [[TMP2:%.*]] = freeze <4 x i1> [[TMP1]]
 ; CHECK-NEXT:    [[TMP3:%.*]] = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> [[TMP2]])
-; CHECK-NEXT:    [[OP_RDX:%.*]] = select i1 [[TMP3]], i1 [[C:%.*]], i1 false
+; CHECK-NEXT:    [[OP_RDX:%.*]] = select i1 [[C:%.*]], i1 [[TMP3]], i1 false
 ; CHECK-NEXT:    ret i1 [[OP_RDX]]
 ;
   %x0 = extractelement <4 x i32> %x, i32 0
@@ -456,7 +456,7 @@ define i1 @logical_or_icmp_extra_op(<4 x i32> %x, <4 x i32> %y, i1 %c) {
 ; CHECK-NEXT:    [[TMP1:%.*]] = icmp slt <4 x i32> [[X:%.*]], [[Y:%.*]]
 ; CHECK-NEXT:    [[TMP2:%.*]] = freeze <4 x i1> [[TMP1]]
 ; CHECK-NEXT:    [[TMP3:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP2]])
-; CHECK-NEXT:    [[OP_RDX:%.*]] = select i1 [[TMP3]], i1 true, i1 [[C:%.*]]
+; CHECK-NEXT:    [[OP_RDX:%.*]] = select i1 [[C:%.*]], i1 true, i1 [[TMP3]]
 ; CHECK-NEXT:    ret i1 [[OP_RDX]]
 ;
   %x0 = extractelement <4 x i32> %x, i32 0
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/scatter-vectorize-reorder.ll b/llvm/test/Transforms/SLPVectorizer/X86/scatter-vectorize-reorder.ll
index 360b258..f875d45 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/scatter-vectorize-reorder.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/scatter-vectorize-reorder.ll
@@ -14,7 +14,7 @@ define void @test() {
 ; CHECK-NEXT:    [[TMP3:%.*]] = load float, ptr undef, align 4
 ; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x float> <float 0.000000e+00, float poison>, float [[TMP2]], i32 1
 ; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <2 x float> [[TMP0]], float [[TMP3]], i32 0
-; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> <float poison, float 0.000000e+00>, <2 x i32> <i32 1, i32 3>
+; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x float> <float poison, float 0.000000e+00>, float [[TMP2]], i32 0
 ; CHECK-NEXT:    [[TMP7:%.*]] = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> [[TMP4]], <2 x float> [[TMP5]], <2 x float> [[TMP6]])
 ; CHECK-NEXT:    br i1 false, label [[BB2:%.*]], label [[BB3:%.*]]
 ; CHECK:       bb2:
diff --git a/llvm/test/Transforms/SLPVectorizer/alternate-cmp-swapped-pred-parent.ll b/llvm/test/Transforms/SLPVectorizer/alternate-cmp-swapped-pred-parent.ll
index 371b230..afca39a 100644
--- a/llvm/test/Transforms/SLPVectorizer/alternate-cmp-swapped-pred-parent.ll
+++ b/llvm/test/Transforms/SLPVectorizer/alternate-cmp-swapped-pred-parent.ll
@@ -12,7 +12,8 @@ define void @test() {
 ; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <8 x i16> <i16 0, i16 0, i16 0, i16 poison, i16 poison, i16 poison, i16 0, i16 poison>, i16 [[CALL37]], i32 3
 ; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <8 x i16> [[TMP0]], i16 [[CALL]], i32 5
 ; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 3, i32 5, i32 6, i32 3>
-; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 poison, i16 poison>, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 3, i32 5>
+; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 poison, i16 poison>, i16 [[CALL37]], i32 6
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <8 x i16> [[TMP5]], i16 [[CALL]], i32 7
 ; CHECK-NEXT:    [[TMP4:%.*]] = icmp slt <8 x i16> [[TMP2]], [[TMP3]]
 ; CHECK-NEXT:    ret void
 ;
@@ -43,7 +44,8 @@ define void @test1() {
 ; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <8 x i16> <i16 0, i16 0, i16 0, i16 poison, i16 poison, i16 poison, i16 poison, i16 0>, i16 [[CALL]], i32 3
 ; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <8 x i16> [[TMP0]], i16 [[CALL37]], i32 4
 ; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 4, i32 7>
-; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 poison, i16 poison>, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 3, i32 4>
+; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 poison, i16 poison>, i16 [[CALL]], i32 6
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <8 x i16> [[TMP5]], i16 [[CALL37]], i32 7
 ; CHECK-NEXT:    [[TMP4:%.*]] = icmp slt <8 x i16> [[TMP2]], [[TMP3]]
 ; CHECK-NEXT:    ret void
 ;
diff --git a/llvm/test/Transforms/SLPVectorizer/extract-many-users-buildvector.ll b/llvm/test/Transforms/SLPVectorizer/extract-many-users-buildvector.ll
index 261ec2b..40568f9 100644
--- a/llvm/test/Transforms/SLPVectorizer/extract-many-users-buildvector.ll
+++ b/llvm/test/Transforms/SLPVectorizer/extract-many-users-buildvector.ll
@@ -1,31 +1,56 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2
-; RUN: %if x86-registered-target %{ opt -S -passes=slp-vectorizer -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s %}
-; RUN: %if aarch64-registered-target %{ opt -S -passes=slp-vectorizer -mtriple=aarch64-unknown-linux-gnu < %s | FileCheck %s %}
+; RUN: %if x86-registered-target %{ opt -S -passes=slp-vectorizer -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s --check-prefix X86 %}
+; RUN: %if aarch64-registered-target %{ opt -S -passes=slp-vectorizer -mtriple=aarch64-unknown-linux-gnu < %s | FileCheck %s --check-prefix AARCH64 %}
 
 define i1 @test(float %0, double %1) {
-; CHECK-LABEL: define i1 @test
-; CHECK-SAME: (float [[TMP0:%.*]], double [[TMP1:%.*]]) {
-; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <4 x float> <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float poison>, float [[TMP0]], i32 3
-; CHECK-NEXT:    [[TMP4:%.*]] = fpext <4 x float> [[TMP3]] to <4 x double>
-; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <2 x double> <double poison, double 0.000000e+00>, double [[TMP1]], i32 0
-; CHECK-NEXT:    [[TMP6:%.*]] = fmul <2 x double> zeroinitializer, [[TMP5]]
-; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> [[TMP6]], <4 x i32> <i32 poison, i32 0, i32 3, i32 3>
-; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <4 x double> [[TMP7]], <4 x double> <double 0.000000e+00, double poison, double poison, double poison>, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
-; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <4 x double> [[TMP4]], <4 x double> <double poison, double poison, double poison, double 0.000000e+00>, <4 x i32> <i32 2, i32 0, i32 1, i32 7>
-; CHECK-NEXT:    [[TMP10:%.*]] = fmul <4 x double> [[TMP8]], [[TMP9]]
-; CHECK-NEXT:    [[TMP11:%.*]] = fmul <4 x double> zeroinitializer, [[TMP4]]
-; CHECK-NEXT:    [[TMP12:%.*]] = call <8 x double> @llvm.vector.insert.v8f64.v4f64(<8 x double> <double poison, double poison, double poison, double poison, double 0.000000e+00, double 0.000000e+00, double 0.000000e+00, double 0.000000e+00>, <4 x double> [[TMP10]], i64 0)
-; CHECK-NEXT:    [[TMP13:%.*]] = call <8 x double> @llvm.vector.insert.v8f64.v4f64(<8 x double> <double poison, double poison, double poison, double poison, double poison, double poison, double 0.000000e+00, double 0.000000e+00>, <4 x double> [[TMP11]], i64 0)
-; CHECK-NEXT:    [[TMP14:%.*]] = call <8 x double> @llvm.vector.insert.v8f64.v2f64(<8 x double> [[TMP13]], <2 x double> [[TMP6]], i64 4)
-; CHECK-NEXT:    [[TMP15:%.*]] = fsub <8 x double> [[TMP12]], [[TMP14]]
-; CHECK-NEXT:    [[TMP16:%.*]] = fmul <8 x double> [[TMP12]], [[TMP14]]
-; CHECK-NEXT:    [[TMP17:%.*]] = shufflevector <8 x double> [[TMP15]], <8 x double> [[TMP16]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 13, i32 14, i32 15>
-; CHECK-NEXT:    [[TMP18:%.*]] = fptrunc <8 x double> [[TMP17]] to <8 x float>
-; CHECK-NEXT:    [[TMP19:%.*]] = fmul <8 x float> [[TMP18]], zeroinitializer
-; CHECK-NEXT:    [[TMP20:%.*]] = fcmp oeq <8 x float> [[TMP19]], zeroinitializer
-; CHECK-NEXT:    [[TMP21:%.*]] = freeze <8 x i1> [[TMP20]]
-; CHECK-NEXT:    [[TMP22:%.*]] = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> [[TMP21]])
-; CHECK-NEXT:    ret i1 [[TMP22]]
+; X86-LABEL: define i1 @test
+; X86-SAME: (float [[TMP0:%.*]], double [[TMP1:%.*]]) {
+; X86-NEXT:    [[TMP3:%.*]] = insertelement <4 x float> <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float poison>, float [[TMP0]], i32 3
+; X86-NEXT:    [[TMP4:%.*]] = fpext <4 x float> [[TMP3]] to <4 x double>
+; X86-NEXT:    [[TMP5:%.*]] = insertelement <2 x double> <double poison, double 0.000000e+00>, double [[TMP1]], i32 0
+; X86-NEXT:    [[TMP6:%.*]] = fmul <2 x double> zeroinitializer, [[TMP5]]
+; X86-NEXT:    [[TMP7:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> [[TMP6]], <4 x i32> <i32 poison, i32 0, i32 3, i32 3>
+; X86-NEXT:    [[TMP8:%.*]] = shufflevector <4 x double> [[TMP7]], <4 x double> <double 0.000000e+00, double poison, double poison, double poison>, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
+; X86-NEXT:    [[TMP9:%.*]] = shufflevector <4 x double> [[TMP4]], <4 x double> <double poison, double poison, double poison, double 0.000000e+00>, <4 x i32> <i32 2, i32 0, i32 1, i32 7>
+; X86-NEXT:    [[TMP10:%.*]] = fmul <4 x double> [[TMP8]], [[TMP9]]
+; X86-NEXT:    [[TMP11:%.*]] = fmul <4 x double> zeroinitializer, [[TMP4]]
+; X86-NEXT:    [[TMP12:%.*]] = call <8 x double> @llvm.vector.insert.v8f64.v4f64(<8 x double> <double poison, double poison, double poison, double poison, double 0.000000e+00, double 0.000000e+00, double 0.000000e+00, double 0.000000e+00>, <4 x double> [[TMP10]], i64 0)
+; X86-NEXT:    [[TMP13:%.*]] = call <8 x double> @llvm.vector.insert.v8f64.v4f64(<8 x double> <double poison, double poison, double poison, double poison, double poison, double poison, double 0.000000e+00, double 0.000000e+00>, <4 x double> [[TMP11]], i64 0)
+; X86-NEXT:    [[TMP14:%.*]] = call <8 x double> @llvm.vector.insert.v8f64.v2f64(<8 x double> [[TMP13]], <2 x double> [[TMP6]], i64 4)
+; X86-NEXT:    [[TMP15:%.*]] = fsub <8 x double> [[TMP12]], [[TMP14]]
+; X86-NEXT:    [[TMP16:%.*]] = fmul <8 x double> [[TMP12]], [[TMP14]]
+; X86-NEXT:    [[TMP17:%.*]] = shufflevector <8 x double> [[TMP15]], <8 x double> [[TMP16]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 13, i32 14, i32 15>
+; X86-NEXT:    [[TMP18:%.*]] = fptrunc <8 x double> [[TMP17]] to <8 x float>
+; X86-NEXT:    [[TMP19:%.*]] = fmul <8 x float> [[TMP18]], zeroinitializer
+; X86-NEXT:    [[TMP20:%.*]] = fcmp oeq <8 x float> [[TMP19]], zeroinitializer
+; X86-NEXT:    [[TMP21:%.*]] = freeze <8 x i1> [[TMP20]]
+; X86-NEXT:    [[TMP22:%.*]] = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> [[TMP21]])
+; X86-NEXT:    ret i1 [[TMP22]]
+;
+; AARCH64-LABEL: define i1 @test
+; AARCH64-SAME: (float [[TMP0:%.*]], double [[TMP1:%.*]]) {
+; AARCH64-NEXT:    [[TMP3:%.*]] = insertelement <4 x float> <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float poison>, float [[TMP0]], i32 3
+; AARCH64-NEXT:    [[TMP4:%.*]] = fpext <4 x float> [[TMP3]] to <4 x double>
+; AARCH64-NEXT:    [[TMP5:%.*]] = insertelement <2 x double> <double poison, double 0.000000e+00>, double [[TMP1]], i32 0
+; AARCH64-NEXT:    [[TMP6:%.*]] = fmul <2 x double> zeroinitializer, [[TMP5]]
+; AARCH64-NEXT:    [[TMP7:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> [[TMP6]], <4 x i32> <i32 poison, i32 0, i32 3, i32 3>
+; AARCH64-NEXT:    [[TMP8:%.*]] = shufflevector <4 x double> [[TMP7]], <4 x double> <double 0.000000e+00, double poison, double poison, double poison>, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
+; AARCH64-NEXT:    [[TMP9:%.*]] = shufflevector <4 x double> [[TMP4]], <4 x double> <double poison, double poison, double poison, double 0.000000e+00>, <4 x i32> <i32 2, i32 0, i32 poison, i32 7>
+; AARCH64-NEXT:    [[TMP10:%.*]] = shufflevector <4 x double> [[TMP9]], <4 x double> [[TMP4]], <4 x i32> <i32 0, i32 1, i32 5, i32 3>
+; AARCH64-NEXT:    [[TMP11:%.*]] = fmul <4 x double> [[TMP8]], [[TMP10]]
+; AARCH64-NEXT:    [[TMP12:%.*]] = fmul <4 x double> zeroinitializer, [[TMP4]]
+; AARCH64-NEXT:    [[TMP13:%.*]] = call <8 x double> @llvm.vector.insert.v8f64.v4f64(<8 x double> <double poison, double poison, double poison, double poison, double 0.000000e+00, double 0.000000e+00, double 0.000000e+00, double 0.000000e+00>, <4 x double> [[TMP11]], i64 0)
+; AARCH64-NEXT:    [[TMP14:%.*]] = call <8 x double> @llvm.vector.insert.v8f64.v4f64(<8 x double> <double poison, double poison, double poison, double poison, double poison, double poison, double 0.000000e+00, double 0.000000e+00>, <4 x double> [[TMP12]], i64 0)
+; AARCH64-NEXT:    [[TMP15:%.*]] = call <8 x double> @llvm.vector.insert.v8f64.v2f64(<8 x double> [[TMP14]], <2 x double> [[TMP6]], i64 4)
+; AARCH64-NEXT:    [[TMP16:%.*]] = fsub <8 x double> [[TMP13]], [[TMP15]]
+; AARCH64-NEXT:    [[TMP17:%.*]] = fmul <8 x double> [[TMP13]], [[TMP15]]
+; AARCH64-NEXT:    [[TMP18:%.*]] = shufflevector <8 x double> [[TMP16]], <8 x double> [[TMP17]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 13, i32 14, i32 15>
+; AARCH64-NEXT:    [[TMP19:%.*]] = fptrunc <8 x double> [[TMP18]] to <8 x float>
+; AARCH64-NEXT:    [[TMP20:%.*]] = fmul <8 x float> [[TMP19]], zeroinitializer
+; AARCH64-NEXT:    [[TMP21:%.*]] = fcmp oeq <8 x float> [[TMP20]], zeroinitializer
+; AARCH64-NEXT:    [[TMP22:%.*]] = freeze <8 x i1> [[TMP21]]
+; AARCH64-NEXT:    [[TMP23:%.*]] = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> [[TMP22]])
+; AARCH64-NEXT:    ret i1 [[TMP23]]
 ;
   %3 = fpext float %0 to double
   %4 = fpext float 0.000000e+00 to double
diff --git a/llvm/test/Transforms/SLPVectorizer/full-overlap-non-schedulable.ll b/llvm/test/Transforms/SLPVectorizer/full-overlap-non-schedulable.ll
index dbd9119..c704baaa 100644
--- a/llvm/test/Transforms/SLPVectorizer/full-overlap-non-schedulable.ll
+++ b/llvm/test/Transforms/SLPVectorizer/full-overlap-non-schedulable.ll
@@ -24,7 +24,9 @@ define void @test(ptr %p1, ptr %0, i32 %1, i1 %c1, ptr %p2) {
 ; CHECK:       [[L47]]:
 ; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <4 x ptr> [[TMP5]], i32 1
 ; CHECK-NEXT:    [[TMP13:%.*]] = load i32, ptr [[TMP12]], align 4
-; CHECK-NEXT:    [[TMP14:%.*]] = shufflevector <4 x ptr> [[TMP5]], <4 x ptr> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT:    [[TMP25:%.*]] = insertelement <2 x ptr> poison, ptr [[TMP6]], i32 0
+; CHECK-NEXT:    [[TMP26:%.*]] = shufflevector <4 x ptr> [[TMP5]], <4 x ptr> poison, <2 x i32> <i32 poison, i32 3>
+; CHECK-NEXT:    [[TMP14:%.*]] = shufflevector <2 x ptr> [[TMP25]], <2 x ptr> [[TMP26]], <2 x i32> <i32 0, i32 3>
 ; CHECK-NEXT:    [[TMP15:%.*]] = icmp eq <2 x ptr> [[TMP14]], zeroinitializer
 ; CHECK-NEXT:    [[TMP16:%.*]] = load <2 x i32>, ptr [[TMP6]], align 4
 ; CHECK-NEXT:    [[TMP17:%.*]] = select <2 x i1> [[TMP15]], <2 x i32> zeroinitializer, <2 x i32> [[TMP16]]
diff --git a/llvm/test/Transforms/SLPVectorizer/gathered-consecutive-loads-different-types.ll b/llvm/test/Transforms/SLPVectorizer/gathered-consecutive-loads-different-types.ll
index a854c61..a42c8f2 100644
--- a/llvm/test/Transforms/SLPVectorizer/gathered-consecutive-loads-different-types.ll
+++ b/llvm/test/Transforms/SLPVectorizer/gathered-consecutive-loads-different-types.ll
@@ -11,8 +11,8 @@ define i32 @test(i8 %0) {
 ; CHECK-NEXT:    [[TMP3:%.*]] = icmp eq <2 x i8> zeroinitializer, [[TMP2]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = load volatile i8, ptr null, align 8
 ; CHECK-NEXT:    [[TMP5:%.*]] = load <2 x i8>, ptr addrspace(21) getelementptr inbounds (i8, ptr addrspace(21) null, i64 8), align 8
-; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <2 x i8> [[TMP5]], <2 x i8> poison, <8 x i32> <i32 0, i32 1, i32 0, i32 poison, i32 poison, i32 1, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <8 x i8> [[TMP6]], <8 x i8> <i8 0, i8 0, i8 poison, i8 0, i8 0, i8 poison, i8 0, i8 0>, <8 x i32> <i32 8, i32 9, i32 0, i32 11, i32 12, i32 1, i32 14, i32 15>
+; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <2 x i8> [[TMP5]], <2 x i8> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <8 x i8> <i8 0, i8 0, i8 poison, i8 0, i8 0, i8 poison, i8 0, i8 0>, <8 x i8> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 8, i32 3, i32 4, i32 9, i32 6, i32 7>
 ; CHECK-NEXT:    [[TMP8:%.*]] = icmp eq <8 x i8> zeroinitializer, [[TMP7]]
 ; CHECK-NEXT:    [[TEST_STRUCTCOPY_14_S14_CM_COERCE_SROA_2_0_COPYLOAD:%.*]] = load i48, ptr addrspace(21) getelementptr inbounds (i8, ptr addrspace(21) null, i64 8), align 8
 ; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <4 x i48> <i48 poison, i48 0, i48 0, i48 0>, i48 [[TEST_STRUCTCOPY_14_S14_CM_COERCE_SROA_2_0_COPYLOAD]], i32 0
@@ -21,9 +21,9 @@ define i32 @test(i8 %0) {
 ; CHECK-NEXT:    [[TMP12:%.*]] = load i8, ptr addrspace(21) null, align 2
 ; CHECK-NEXT:    [[TMP13:%.*]] = load volatile i8, ptr null, align 2
 ; CHECK-NEXT:    [[TMP14:%.*]] = load <2 x i8>, ptr addrspace(21) getelementptr inbounds (i8, ptr addrspace(21) null, i64 8), align 8
-; CHECK-NEXT:    [[TMP15:%.*]] = shufflevector <2 x i8> [[TMP14]], <2 x i8> poison, <8 x i32> <i32 poison, i32 poison, i32 poison, i32 0, i32 1, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP16:%.*]] = shufflevector <8 x i8> [[TMP15]], <8 x i8> <i8 0, i8 poison, i8 0, i8 poison, i8 poison, i8 0, i8 0, i8 0>, <8 x i32> <i32 8, i32 poison, i32 10, i32 3, i32 4, i32 13, i32 14, i32 15>
-; CHECK-NEXT:    [[TMP17:%.*]] = insertelement <8 x i8> [[TMP16]], i8 [[TMP12]], i32 1
+; CHECK-NEXT:    [[TMP15:%.*]] = insertelement <8 x i8> <i8 0, i8 poison, i8 0, i8 poison, i8 poison, i8 0, i8 0, i8 0>, i8 [[TMP12]], i32 1
+; CHECK-NEXT:    [[TMP16:%.*]] = shufflevector <2 x i8> [[TMP14]], <2 x i8> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP17:%.*]] = shufflevector <8 x i8> [[TMP15]], <8 x i8> [[TMP16]], <8 x i32> <i32 0, i32 1, i32 2, i32 8, i32 9, i32 5, i32 6, i32 7>
 ; CHECK-NEXT:    [[TMP18:%.*]] = insertelement <8 x i8> <i8 0, i8 poison, i8 0, i8 poison, i8 0, i8 0, i8 0, i8 0>, i8 [[TMP0]], i32 3
 ; CHECK-NEXT:    [[TMP19:%.*]] = insertelement <8 x i8> [[TMP18]], i8 [[TMP13]], i32 1
 ; CHECK-NEXT:    [[TMP20:%.*]] = icmp eq <8 x i8> [[TMP17]], [[TMP19]]
diff --git a/llvm/test/Transforms/SLPVectorizer/logical-ops-poisonous-repeated.ll b/llvm/test/Transforms/SLPVectorizer/logical-ops-poisonous-repeated.ll
new file mode 100644
index 0000000..f0cfd99
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/logical-ops-poisonous-repeated.ll
@@ -0,0 +1,33 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -S --passes=slp-vectorizer < %s | FileCheck %s
+
+define i1 @test(<4 x i32> %x) {
+; CHECK-LABEL: define i1 @test(
+; CHECK-SAME: <4 x i32> [[X:%.*]]) {
+; CHECK-NEXT:    [[X0:%.*]] = extractelement <4 x i32> [[X]], i32 0
+; CHECK-NEXT:    [[X1:%.*]] = extractelement <4 x i32> [[X]], i32 -1
+; CHECK-NEXT:    [[X2:%.*]] = extractelement <4 x i32> [[X]], i32 2
+; CHECK-NEXT:    [[X3:%.*]] = extractelement <4 x i32> [[X]], i32 3
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp ugt i32 [[X0]], 0
+; CHECK-NEXT:    [[C1:%.*]] = icmp slt i32 [[X1]], 0
+; CHECK-NEXT:    [[C2:%.*]] = icmp sgt i32 [[X2]], 0
+; CHECK-NEXT:    [[C3:%.*]] = icmp slt i32 [[X3]], 0
+; CHECK-NEXT:    [[TMP2:%.*]] = freeze i1 [[C3]]
+; CHECK-NEXT:    [[OP_RDX:%.*]] = select i1 [[TMP2]], i1 [[C1]], i1 false
+; CHECK-NEXT:    [[OP_RDX1:%.*]] = select i1 [[TMP1]], i1 [[OP_RDX]], i1 false
+; CHECK-NEXT:    ret i1 [[OP_RDX1]]
+;
+  %x0 = extractelement <4 x i32> %x, i32 0
+  %x1 = extractelement <4 x i32> %x, i32 -1
+  %x2 = extractelement <4 x i32> %x, i32 2
+  %x3 = extractelement <4 x i32> %x, i32 3
+  %2 = icmp ugt i32 %x0, 0
+  %c1 = icmp slt i32 %x1, 0
+  %c2 = icmp sgt i32 %x2, 0
+  %c3 = icmp slt i32 %x3, 0
+  %s1 = select i1 %2, i1 %c1, i1 false
+  %s2 = select i1 %s1, i1 %c3, i1 false
+  %s3 = select i1 %s2, i1 %c3, i1 false
+  ret i1 %s3
+}
+
diff --git a/llvm/test/Transforms/SLPVectorizer/reorder-clustered-node.ll b/llvm/test/Transforms/SLPVectorizer/reorder-clustered-node.ll
index 561182d..940ee5b 100644
--- a/llvm/test/Transforms/SLPVectorizer/reorder-clustered-node.ll
+++ b/llvm/test/Transforms/SLPVectorizer/reorder-clustered-node.ll
@@ -1,30 +1,54 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: %if x86-registered-target %{ opt -passes=slp-vectorizer -S < %s -mtriple=x86_64 -slp-threshold=-150 | FileCheck %s %}
-; RUN: %if aarch64-registered-target %{ opt -passes=slp-vectorizer -S < %s -mtriple=aarch64-unknown-linux-gnu -slp-threshold=-150 | FileCheck %s %}
+; RUN: %if x86-registered-target %{ opt -passes=slp-vectorizer -S < %s -mtriple=x86_64 -slp-threshold=-150 | FileCheck %s --check-prefix X86 %}
+; RUN: %if aarch64-registered-target %{ opt -passes=slp-vectorizer -S < %s -mtriple=aarch64-unknown-linux-gnu -slp-threshold=-150 | FileCheck %s --check-prefix AARCH64 %}
 
 define i1 @test(ptr %arg, ptr %i233, i64 %i241, ptr %i235, ptr %i237, ptr %i227) {
-; CHECK-LABEL: @test(
-; CHECK-NEXT:  bb:
-; CHECK-NEXT:    [[I226:%.*]] = getelementptr ptr, ptr [[ARG:%.*]], i32 7
-; CHECK-NEXT:    [[I242:%.*]] = getelementptr double, ptr [[I233:%.*]], i64 [[I241:%.*]]
-; CHECK-NEXT:    [[I245:%.*]] = getelementptr double, ptr [[I235:%.*]], i64 [[I241]]
-; CHECK-NEXT:    [[I248:%.*]] = getelementptr double, ptr [[I237:%.*]], i64 [[I241]]
-; CHECK-NEXT:    [[I250:%.*]] = getelementptr double, ptr [[I227:%.*]], i64 [[I241]]
-; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x ptr>, ptr [[I226]], align 8
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x ptr> [[TMP0]], <4 x ptr> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <8 x ptr> <ptr poison, ptr null, ptr poison, ptr null, ptr null, ptr null, ptr null, ptr null>, ptr [[I242]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <8 x ptr> [[TMP2]], ptr [[I250]], i32 2
-; CHECK-NEXT:    [[TMP4:%.*]] = icmp ult <8 x ptr> [[TMP3]], [[TMP1]]
-; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <8 x ptr> [[TMP3]], <8 x ptr> poison, <4 x i32> <i32 2, i32 0, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <4 x ptr> [[TMP5]], ptr [[I245]], i32 2
-; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <4 x ptr> [[TMP6]], ptr [[I248]], i32 3
-; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <4 x ptr> [[TMP7]], <4 x ptr> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <8 x ptr> [[TMP1]], <8 x ptr> <ptr poison, ptr null, ptr poison, ptr null, ptr null, ptr null, ptr null, ptr null>, <8 x i32> <i32 1, i32 9, i32 0, i32 11, i32 12, i32 13, i32 14, i32 15>
-; CHECK-NEXT:    [[TMP10:%.*]] = icmp ult <8 x ptr> [[TMP8]], [[TMP9]]
-; CHECK-NEXT:    [[TMP11:%.*]] = or <8 x i1> [[TMP4]], [[TMP10]]
-; CHECK-NEXT:    [[TMP12:%.*]] = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> [[TMP11]])
-; CHECK-NEXT:    [[OP_RDX:%.*]] = and i1 [[TMP12]], false
-; CHECK-NEXT:    ret i1 [[OP_RDX]]
+; X86-LABEL: @test(
+; X86-NEXT:  bb:
+; X86-NEXT:    [[I226:%.*]] = getelementptr ptr, ptr [[ARG:%.*]], i32 7
+; X86-NEXT:    [[I242:%.*]] = getelementptr double, ptr [[I233:%.*]], i64 [[I241:%.*]]
+; X86-NEXT:    [[I245:%.*]] = getelementptr double, ptr [[I235:%.*]], i64 [[I241]]
+; X86-NEXT:    [[I248:%.*]] = getelementptr double, ptr [[I237:%.*]], i64 [[I241]]
+; X86-NEXT:    [[I250:%.*]] = getelementptr double, ptr [[I227:%.*]], i64 [[I241]]
+; X86-NEXT:    [[TMP0:%.*]] = load <4 x ptr>, ptr [[I226]], align 8
+; X86-NEXT:    [[TMP1:%.*]] = shufflevector <4 x ptr> [[TMP0]], <4 x ptr> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
+; X86-NEXT:    [[TMP2:%.*]] = insertelement <8 x ptr> <ptr poison, ptr null, ptr poison, ptr null, ptr null, ptr null, ptr null, ptr null>, ptr [[I242]], i32 0
+; X86-NEXT:    [[TMP3:%.*]] = insertelement <8 x ptr> [[TMP2]], ptr [[I250]], i32 2
+; X86-NEXT:    [[TMP4:%.*]] = icmp ult <8 x ptr> [[TMP3]], [[TMP1]]
+; X86-NEXT:    [[TMP5:%.*]] = insertelement <8 x ptr> poison, ptr [[I250]], i32 0
+; X86-NEXT:    [[TMP6:%.*]] = insertelement <8 x ptr> [[TMP5]], ptr [[I242]], i32 1
+; X86-NEXT:    [[TMP7:%.*]] = insertelement <8 x ptr> [[TMP6]], ptr [[I245]], i32 2
+; X86-NEXT:    [[TMP8:%.*]] = insertelement <8 x ptr> [[TMP7]], ptr [[I248]], i32 3
+; X86-NEXT:    [[TMP9:%.*]] = shufflevector <8 x ptr> [[TMP8]], <8 x ptr> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
+; X86-NEXT:    [[TMP10:%.*]] = shufflevector <8 x ptr> [[TMP1]], <8 x ptr> <ptr poison, ptr null, ptr poison, ptr null, ptr null, ptr null, ptr null, ptr null>, <8 x i32> <i32 1, i32 9, i32 0, i32 11, i32 12, i32 13, i32 14, i32 15>
+; X86-NEXT:    [[TMP11:%.*]] = icmp ult <8 x ptr> [[TMP9]], [[TMP10]]
+; X86-NEXT:    [[TMP12:%.*]] = or <8 x i1> [[TMP4]], [[TMP11]]
+; X86-NEXT:    [[TMP13:%.*]] = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> [[TMP12]])
+; X86-NEXT:    [[OP_RDX:%.*]] = and i1 [[TMP13]], false
+; X86-NEXT:    ret i1 [[OP_RDX]]
+;
+; AARCH64-LABEL: @test(
+; AARCH64-NEXT:  bb:
+; AARCH64-NEXT:    [[I226:%.*]] = getelementptr ptr, ptr [[ARG:%.*]], i32 7
+; AARCH64-NEXT:    [[I242:%.*]] = getelementptr double, ptr [[I233:%.*]], i64 [[I241:%.*]]
+; AARCH64-NEXT:    [[I245:%.*]] = getelementptr double, ptr [[I235:%.*]], i64 [[I241]]
+; AARCH64-NEXT:    [[I248:%.*]] = getelementptr double, ptr [[I237:%.*]], i64 [[I241]]
+; AARCH64-NEXT:    [[I250:%.*]] = getelementptr double, ptr [[I227:%.*]], i64 [[I241]]
+; AARCH64-NEXT:    [[TMP0:%.*]] = load <4 x ptr>, ptr [[I226]], align 8
+; AARCH64-NEXT:    [[TMP1:%.*]] = shufflevector <4 x ptr> [[TMP0]], <4 x ptr> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
+; AARCH64-NEXT:    [[TMP2:%.*]] = insertelement <8 x ptr> <ptr poison, ptr null, ptr poison, ptr null, ptr null, ptr null, ptr null, ptr null>, ptr [[I242]], i32 0
+; AARCH64-NEXT:    [[TMP3:%.*]] = insertelement <8 x ptr> [[TMP2]], ptr [[I250]], i32 2
+; AARCH64-NEXT:    [[TMP4:%.*]] = icmp ult <8 x ptr> [[TMP3]], [[TMP1]]
+; AARCH64-NEXT:    [[TMP5:%.*]] = shufflevector <8 x ptr> [[TMP3]], <8 x ptr> poison, <4 x i32> <i32 2, i32 0, i32 poison, i32 poison>
+; AARCH64-NEXT:    [[TMP6:%.*]] = insertelement <4 x ptr> [[TMP5]], ptr [[I245]], i32 2
+; AARCH64-NEXT:    [[TMP7:%.*]] = insertelement <4 x ptr> [[TMP6]], ptr [[I248]], i32 3
+; AARCH64-NEXT:    [[TMP8:%.*]] = shufflevector <4 x ptr> [[TMP7]], <4 x ptr> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
+; AARCH64-NEXT:    [[TMP9:%.*]] = shufflevector <8 x ptr> [[TMP1]], <8 x ptr> <ptr poison, ptr null, ptr poison, ptr null, ptr null, ptr null, ptr null, ptr null>, <8 x i32> <i32 1, i32 9, i32 0, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AARCH64-NEXT:    [[TMP10:%.*]] = icmp ult <8 x ptr> [[TMP8]], [[TMP9]]
+; AARCH64-NEXT:    [[TMP11:%.*]] = or <8 x i1> [[TMP4]], [[TMP10]]
+; AARCH64-NEXT:    [[TMP12:%.*]] = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> [[TMP11]])
+; AARCH64-NEXT:    [[OP_RDX:%.*]] = and i1 [[TMP12]], false
+; AARCH64-NEXT:    ret i1 [[OP_RDX]]
 ;
 bb:
   %i226 = getelementptr ptr, ptr %arg, i32 7
diff --git a/llvm/test/Transforms/SLPVectorizer/resized-alt-shuffle-after-minbw.ll b/llvm/test/Transforms/SLPVectorizer/resized-alt-shuffle-after-minbw.ll
index 61a84a6..056b622 100644
--- a/llvm/test/Transforms/SLPVectorizer/resized-alt-shuffle-after-minbw.ll
+++ b/llvm/test/Transforms/SLPVectorizer/resized-alt-shuffle-after-minbw.ll
@@ -13,9 +13,9 @@ define void @func(i32 %0) {
 ; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> poison, <32 x i32> <i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 3>
 ; CHECK-NEXT:    [[TMP9:%.*]] = sext i32 [[TMP6]] to i64
 ; CHECK-NEXT:    [[TMP10:%.*]] = or i64 [[TMP9]], 0
-; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <32 x i32> [[TMP11]], <32 x i32> <i32 poison, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>, <32 x i32> <i32 poison, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 1, i32 1>
-; CHECK-NEXT:    [[TMP13:%.*]] = insertelement <32 x i32> [[TMP12]], i32 0, i32 0
+; CHECK-NEXT:    [[TMP11:%.*]] = trunc i64 [[TMP9]] to i32
+; CHECK-NEXT:    [[TMP12:%.*]] = insertelement <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>, i32 [[TMP11]], i32 30
+; CHECK-NEXT:    [[TMP13:%.*]] = shufflevector <32 x i32> [[TMP12]], <32 x i32> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 30, i32 30>
 ; CHECK-NEXT:    [[TMP14:%.*]] = call <32 x i32> @llvm.vector.insert.v32i32.v8i32(<32 x i32> [[TMP13]], <8 x i32> zeroinitializer, i64 16)
 ; CHECK-NEXT:    [[TMP15:%.*]] = call <32 x i32> @llvm.vector.insert.v32i32.v4i32(<32 x i32> [[TMP14]], <4 x i32> zeroinitializer, i64 24)
 ; CHECK-NEXT:    [[TMP16:%.*]] = call <32 x i32> @llvm.vector.insert.v32i32.v2i32(<32 x i32> [[TMP15]], <2 x i32> zeroinitializer, i64 14)
diff --git a/llvm/test/Transforms/SimplifyCFG/X86/switch_to_lookup_table.ll b/llvm/test/Transforms/SimplifyCFG/X86/switch_to_lookup_table.ll
index 7f484e2..ffbacc1 100644
--- a/llvm/test/Transforms/SimplifyCFG/X86/switch_to_lookup_table.ll
+++ b/llvm/test/Transforms/SimplifyCFG/X86/switch_to_lookup_table.ll
@@ -34,11 +34,14 @@ target triple = "x86_64-unknown-linux-gnu"
 ; CHECK: @switch.table.unreachable_case = private unnamed_addr constant [9 x i32] [i32 0, i32 0, i32 0, i32 2, i32 -1, i32 1, i32 1, i32 1, i32 1], align 4
 ; CHECK: @switch.table.unreachable_default = private unnamed_addr constant [4 x i32] [i32 42, i32 52, i32 1, i32 2], align 4
 ; CHECK: @switch.table.nodefaultnoholes = private unnamed_addr constant [4 x i32] [i32 55, i32 123, i32 0, i32 -1], align 4
-; CHECK: @switch.table.nodefaultwithholes = private unnamed_addr constant [6 x i32] [i32 55, i32 123, i32 0, i32 -1, i32 55, i32 -1], align 4
+; CHECK: @switch.table.nodefaultwithholes = private unnamed_addr constant [6 x i32] [i32 55, i32 123, i32 0, i32 -1, i32 poison, i32 -1], align 4
 ; CHECK: @switch.table.threecases = private unnamed_addr constant [3 x i32] [i32 10, i32 7, i32 5], align 4
-; CHECK: @switch.table.covered_switch_with_bit_tests = private unnamed_addr constant [8 x i32] [i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 1, i32 1], align 4
+; CHECK: @switch.table.covered_switch_with_bit_tests = private unnamed_addr constant [8 x i32] [i32 2, i32 2, i32 poison, i32 poison, i32 poison, i32 poison, i32 1, i32 1], align 4
 ; CHECK: @switch.table.signed_overflow1 = private unnamed_addr constant [4 x i32] [i32 3333, i32 4444, i32 1111, i32 2222], align 4
-; CHECK: @switch.table.signed_overflow2 = private unnamed_addr constant [4 x i32] [i32 3333, i32 4444, i32 2222, i32 2222], align 4
+; CHECK: @switch.table.signed_overflow2 = private unnamed_addr constant [4 x i32] [i32 3333, i32 4444, i32 poison, i32 2222], align 4
+; CHECK: @switch.table.constant_hole_unreachable_default_firstundef = private unnamed_addr constant [5 x i32] [i32 undef, i32 poison, i32 1, i32 1, i32 1], align 4
+; CHECK: @switch.table.constant_hole_unreachable_default_lastundef = private unnamed_addr constant [5 x i32] [i32 1, i32 poison, i32 1, i32 1, i32 undef], align 4
+; CHECK: @switch.table.linearmap_hole_unreachable_default = private unnamed_addr constant [5 x i32] [i32 1, i32 poison, i32 5, i32 7, i32 9], align 4
 ;.
 define i32 @f(i32 %c) {
 ; CHECK-LABEL: @f(
@@ -2184,3 +2187,226 @@ return:                                           ; preds = %sw.default, %entry,
   %retval.0 = phi { i8, i8 } [ undef, %entry ], [ undef, %entry ], [ undef, %entry ], [ %1, %sw.default ]
   ret { i8, i8 } %retval.0
 }
+
+; The switch has a hole which falls through to an unreachable default case, but it can still be optimized into a constant load because
+; the poison value used for the hole is ignored.
+define i32 @constant_hole_unreachable_default(i32 %x) {
+; CHECK-LABEL: @constant_hole_unreachable_default(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    ret i32 1
+;
+entry:
+  switch i32 %x, label %sw.default [
+  i32 0, label %bb0
+  i32 2, label %bb0
+  i32 3, label %bb0
+  i32 4, label %bb0
+  ]
+
+sw.default: unreachable
+bb0: br label %return
+
+return:
+  %res = phi i32 [ 1, %bb0 ]
+  ret i32 %res
+}
+
+; The switch has a hole which falls through to an unreachable default case and the first case explicitly returns undef, yet it cannot be optimized into a simple
+; constant because we actually treat undef as a unique value rather than ignoring it.
+define i32 @constant_hole_unreachable_default_firstundef(i32 %x) {
+; CHECK-LABEL: @constant_hole_unreachable_default_firstundef(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[SWITCH_GEP:%.*]] = getelementptr inbounds [5 x i32], ptr @switch.table.constant_hole_unreachable_default_firstundef, i32 0, i32 [[X:%.*]]
+; CHECK-NEXT:    [[SWITCH_LOAD:%.*]] = load i32, ptr [[SWITCH_GEP]], align 4
+; CHECK-NEXT:    ret i32 [[SWITCH_LOAD]]
+;
+entry:
+  switch i32 %x, label %sw.default [
+  i32 0, label %bb.undef
+  i32 2, label %bb0
+  i32 3, label %bb0
+  i32 4, label %bb0
+  ]
+
+sw.default: unreachable
+bb.undef: br label %return
+bb0: br label %return
+
+return:
+  %res = phi i32 [ undef, %bb.undef ], [ 1, %bb0 ]
+  ret i32 %res
+}
+
+; The switch has a hole which falls through to an unreachable default case and the last case explicitly returns undef, yet it cannot be optimized into a simple
+; constant because we actually treat undef as a unique value rather than ignoring it.
+define i32 @constant_hole_unreachable_default_lastundef(i32 %x) {
+; CHECK-LABEL: @constant_hole_unreachable_default_lastundef(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[SWITCH_GEP:%.*]] = getelementptr inbounds [5 x i32], ptr @switch.table.constant_hole_unreachable_default_lastundef, i32 0, i32 [[X:%.*]]
+; CHECK-NEXT:    [[SWITCH_LOAD:%.*]] = load i32, ptr [[SWITCH_GEP]], align 4
+; CHECK-NEXT:    ret i32 [[SWITCH_LOAD]]
+;
+entry:
+  switch i32 %x, label %sw.default [
+  i32 0, label %bb0
+  i32 2, label %bb0
+  i32 3, label %bb0
+  i32 4, label %bb.undef
+  ]
+
+sw.default: unreachable
+bb.undef: br label %return
+bb0: br label %return
+
+return:
+  %res = phi i32 [ undef, %bb.undef ], [ 1, %bb0 ]
+  ret i32 %res
+}
+
+; The switch has a hole which falls through to an unreachable default case and the first case explicitly returns poison, but it can still
+; be optimized into a constant load because the poison values are ignored.
+define i32 @constant_hole_unreachable_default_firstpoison(i32 %x) {
+; CHECK-LABEL: @constant_hole_unreachable_default_firstpoison(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    ret i32 1
+;
+entry:
+  switch i32 %x, label %sw.default [
+  i32 0, label %bb.poison
+  i32 2, label %bb0
+  i32 3, label %bb0
+  i32 4, label %bb0
+  ]
+
+sw.default: unreachable
+bb.poison: br label %return
+bb0: br label %return
+
+return:
+  %res = phi i32 [ poison, %bb.poison ], [ 1, %bb0 ]
+  ret i32 %res
+}
+
+; The switch has a hole which falls through to an unreachable default case and the first case explicitly returns poison, but it can still
+; be optimized into a constant load because the poison values are ignored.
+define i32 @constant_hole_unreachable_default_lastpoison(i32 %x) {
+; CHECK-LABEL: @constant_hole_unreachable_default_lastpoison(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    ret i32 1
+;
+entry:
+  switch i32 %x, label %sw.default [
+  i32 0, label %bb0
+  i32 2, label %bb0
+  i32 3, label %bb0
+  i32 4, label %bb.poison
+  ]
+
+sw.default: unreachable
+bb.poison: br label %return
+bb0: br label %return
+
+return:
+  %res = phi i32 [ poison, %bb.poison ], [ 1, %bb0 ]
+  ret i32 %res
+}
+
+define i32 @constant_hole_unreachable_default_undef_poison(i32 %x) {
+; CHECK-LABEL: @constant_hole_unreachable_default_undef_poison(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    ret i32 undef
+;
+entry:
+  switch i32 %x, label %sw.default [
+  i32 0, label %bb.undef
+  i32 2, label %bb.poison
+  i32 3, label %bb.poison
+  i32 4, label %bb.poison
+  ]
+
+sw.default: unreachable
+bb.undef: br label %return
+bb.poison: br label %return
+
+return:
+  %res = phi i32 [ undef, %bb.undef ], [ poison, %bb.poison ]
+  ret i32 %res
+}
+
+define i32 @constant_hole_unreachable_default_poison_undef(i32 %x) {
+; CHECK-LABEL: @constant_hole_unreachable_default_poison_undef(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    ret i32 undef
+;
+entry:
+  switch i32 %x, label %sw.default [
+  i32 0, label %bb.poison
+  i32 2, label %bb.poison
+  i32 3, label %bb.poison
+  i32 4, label %bb.undef
+  ]
+
+sw.default: unreachable
+bb.undef: br label %return
+bb.poison: br label %return
+
+return:
+  %res = phi i32 [ undef, %bb.undef ], [ poison, %bb.poison ]
+  ret i32 %res
+}
+
+; The switch has a hole which falls through to an unreachable default case, which prevents it from being optimized into a linear mapping 2*x+1.
+; TODO: We should add support for this, at least in certain cases.
+define i32 @linearmap_hole_unreachable_default(i32 %x) {
+; CHECK-LABEL: @linearmap_hole_unreachable_default(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[SWITCH_GEP:%.*]] = getelementptr inbounds [5 x i32], ptr @switch.table.linearmap_hole_unreachable_default, i32 0, i32 [[X:%.*]]
+; CHECK-NEXT:    [[SWITCH_LOAD:%.*]] = load i32, ptr [[SWITCH_GEP]], align 4
+; CHECK-NEXT:    ret i32 [[SWITCH_LOAD]]
+;
+entry:
+  switch i32 %x, label %sw.default [
+  i32 0, label %bb0
+  i32 2, label %bb2
+  i32 3, label %bb3
+  i32 4, label %bb4
+  ]
+
+sw.default: unreachable
+bb0: br label %return
+bb2: br label %return
+bb3: br label %return
+bb4: br label %return
+
+return:
+  %res = phi i32 [ 1, %bb0 ], [ 5, %bb2 ], [ 7, %bb3 ], [ 9, %bb4 ]
+  ret i32 %res
+}
+
+; The switch has a hole which falls through to an unreachable default case, but it can still be optimized into a bitmask extraction because
+; the poison value used for the hole is simply replaced with zero.
+define i1 @bitset_hole_unreachable_default(i32 %x) {
+; CHECK-LABEL: @bitset_hole_unreachable_default(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[SWITCH_CAST:%.*]] = trunc i32 [[X:%.*]] to i5
+; CHECK-NEXT:    [[SWITCH_SHIFTAMT:%.*]] = mul nuw nsw i5 [[SWITCH_CAST]], 1
+; CHECK-NEXT:    [[SWITCH_DOWNSHIFT:%.*]] = lshr i5 8, [[SWITCH_SHIFTAMT]]
+; CHECK-NEXT:    [[SWITCH_MASKED:%.*]] = trunc i5 [[SWITCH_DOWNSHIFT]] to i1
+; CHECK-NEXT:    ret i1 [[SWITCH_MASKED]]
+;
+entry:
+  switch i32 %x, label %sw.default [
+  i32 0, label %bb0
+  i32 2, label %bb0
+  i32 3, label %bb1
+  i32 4, label %bb0
+  ]
+
+sw.default: unreachable
+bb0: br label %return
+bb1: br label %return
+
+return:
+  %res = phi i1 [ 0, %bb0 ], [ 1, %bb1 ]
+  ret i1 %res
+}
diff --git a/llvm/test/Transforms/SimplifyCFG/X86/switch_to_lookup_table_big.ll b/llvm/test/Transforms/SimplifyCFG/X86/switch_to_lookup_table_big.ll
index 7988e30..4ebf09a 100644
--- a/llvm/test/Transforms/SimplifyCFG/X86/switch_to_lookup_table_big.ll
+++ b/llvm/test/Transforms/SimplifyCFG/X86/switch_to_lookup_table_big.ll
@@ -7,11 +7,11 @@ target triple = "i386-pc-linux-gnu"
 ;.
 ; CHECK: @switch.table.reachable_default_dense_0to31 = private unnamed_addr constant [32 x i32] [i32 0, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1], align 4
 ; CHECK: @switch.table.unreachable_default_dense_0to31 = private unnamed_addr constant [32 x i32] [i32 0, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1], align 4
-; CHECK: @switch.table.reachable_default_holes_0to31 = private unnamed_addr constant [32 x i32] [i32 0, i32 7, i32 6, i32 0, i32 4, i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 0, i32 2, i32 1, i32 0, i32 7, i32 0, i32 5, i32 4, i32 3, i32 2, i32 0, i32 0, i32 7, i32 6, i32 5, i32 0, i32 3, i32 2, i32 1], align 4
-; CHECK: @switch.table.unreachable_default_holes_0to31 = private unnamed_addr constant [32 x i32] [i32 0, i32 7, i32 6, i32 0, i32 4, i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 0, i32 2, i32 1, i32 0, i32 7, i32 0, i32 5, i32 4, i32 3, i32 2, i32 0, i32 0, i32 7, i32 6, i32 5, i32 0, i32 3, i32 2, i32 1], align 4
+; CHECK: @switch.table.reachable_default_holes_0to31 = private unnamed_addr constant [32 x i32] [i32 0, i32 7, i32 6, i32 poison, i32 4, i32 3, i32 2, i32 1, i32 poison, i32 7, i32 6, i32 5, i32 4, i32 poison, i32 2, i32 1, i32 0, i32 7, i32 poison, i32 5, i32 4, i32 3, i32 2, i32 poison, i32 0, i32 7, i32 6, i32 5, i32 poison, i32 3, i32 2, i32 1], align 4
+; CHECK: @switch.table.unreachable_default_holes_0to31 = private unnamed_addr constant [32 x i32] [i32 0, i32 7, i32 6, i32 poison, i32 4, i32 3, i32 2, i32 1, i32 poison, i32 7, i32 6, i32 5, i32 4, i32 poison, i32 2, i32 1, i32 0, i32 7, i32 poison, i32 5, i32 4, i32 3, i32 2, i32 poison, i32 0, i32 7, i32 6, i32 5, i32 poison, i32 3, i32 2, i32 1], align 4
 ; CHECK: @switch.table.reachable_default_dense_0to32 = private unnamed_addr constant [33 x i32] [i32 0, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0], align 4
 ; CHECK: @switch.table.unreachable_default_dense_0to32 = private unnamed_addr constant [33 x i32] [i32 0, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0], align 4
-; CHECK: @switch.table.unreachable_default_holes_0to32 = private unnamed_addr constant [33 x i32] [i32 0, i32 7, i32 6, i32 0, i32 4, i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 0, i32 2, i32 1, i32 0, i32 7, i32 0, i32 5, i32 4, i32 3, i32 2, i32 0, i32 0, i32 7, i32 6, i32 5, i32 0, i32 3, i32 2, i32 1, i32 0], align 4
+; CHECK: @switch.table.unreachable_default_holes_0to32 = private unnamed_addr constant [33 x i32] [i32 0, i32 7, i32 6, i32 poison, i32 4, i32 3, i32 2, i32 1, i32 poison, i32 7, i32 6, i32 5, i32 4, i32 poison, i32 2, i32 1, i32 0, i32 7, i32 poison, i32 5, i32 4, i32 3, i32 2, i32 poison, i32 0, i32 7, i32 6, i32 5, i32 poison, i32 3, i32 2, i32 1, i32 0], align 4
 ;.
 define i32 @reachable_default_dense_0to31(i32 %x, i32 %y) {
 ; CHECK-LABEL: @reachable_default_dense_0to31(
diff --git a/llvm/test/Transforms/SimplifyCFG/merge-calls-memprof.ll b/llvm/test/Transforms/SimplifyCFG/merge-calls-memprof.ll
new file mode 100644
index 0000000..10c6aeb
--- /dev/null
+++ b/llvm/test/Transforms/SimplifyCFG/merge-calls-memprof.ll
@@ -0,0 +1,51 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+
+;; Test to ensure that memprof related metadata is not dropped when
+;; instructions are combined. Currently the metadata from the first instruction
+;; is kept, which prevents full loss of profile context information.
+
+; RUN: opt < %s -passes=simplifycfg -S | FileCheck %s
+
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+define dso_local noundef nonnull ptr @_Z4testb(i1 noundef zeroext %b) local_unnamed_addr #0 {
+; CHECK-LABEL: define dso_local noundef nonnull ptr @_Z4testb(
+; CHECK-SAME: i1 noundef zeroext [[B:%.*]]) local_unnamed_addr {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[CALL:%.*]] = call noalias noundef nonnull dereferenceable(4) ptr @_Znwm(i64 noundef 4), !memprof [[META0:![0-9]+]], !callsite [[META3:![0-9]+]]
+; CHECK-NEXT:    ret ptr [[CALL]]
+;
+entry:
+  br i1 %b, label %if.then, label %if.else
+
+if.then:                                          ; preds = %entry
+  %call = call noalias noundef nonnull dereferenceable(4) ptr @_Znwm(i64 noundef 4), !memprof !0, !callsite !3
+  br label %if.end
+
+if.else:                                          ; preds = %entry
+  %call1 = call noalias noundef nonnull dereferenceable(4) ptr @_Znwm(i64 noundef 4), !memprof !4, !callsite !7
+  br label %if.end
+
+if.end:                                           ; preds = %if.else, %if.then
+  %x.0 = phi ptr [ %call, %if.then ], [ %call1, %if.else ]
+  ret ptr %x.0
+}
+
+
+declare ptr @_Znwm(i64) nounwind readonly
+
+!0 = !{!1}
+!1 = !{!2, !"notcold"}
+!2 = !{i64 -852997907418798798, i64 -2101080423462424381, i64 5188446645037944434}
+!3 = !{i64 -852997907418798798}
+!4 = !{!5}
+!5 = !{!6, !"cold"}
+!6 = !{i64 123, i64 -2101080423462424381, i64 5188446645037944434}
+!7 = !{i64 123}
+;.
+; CHECK: [[META0]] = !{[[META1:![0-9]+]]}
+; CHECK: [[META1]] = !{[[META2:![0-9]+]], !"notcold"}
+; CHECK: [[META2]] = !{i64 -852997907418798798, i64 -2101080423462424381, i64 5188446645037944434}
+; CHECK: [[META3]] = !{i64 -852997907418798798}
+;.
diff --git a/llvm/test/Transforms/StraightLineStrengthReduce/NVPTX/speculative-slsr.ll b/llvm/test/Transforms/StraightLineStrengthReduce/NVPTX/speculative-slsr.ll
index 92766d5..420e844 100644
--- a/llvm/test/Transforms/StraightLineStrengthReduce/NVPTX/speculative-slsr.ll
+++ b/llvm/test/Transforms/StraightLineStrengthReduce/NVPTX/speculative-slsr.ll
@@ -11,7 +11,7 @@ target triple = "nvptx64-nvidia-cuda"
 ;       use((b + i) * s);
 ;   }
 ; }
-define void @foo(i32 %b, i32 %s) {
+define ptx_kernel void @foo(i32 %b, i32 %s) {
 ; CHECK-LABEL: .visible .entry foo(
 entry:
 ; CHECK: ld.param.u32 [[s:%r[0-9]+]], [foo_param_1];
@@ -65,7 +65,3 @@ for.inc.3:                                        ; preds = %if.then.3, %for.inc
 declare zeroext i1 @cond(i32)
 
 declare void @use(i32)
-
-!nvvm.annotations = !{!0}
-
-!0 = !{ptr @foo, !"kernel", i32 1}
diff --git a/llvm/test/Transforms/VectorCombine/X86/concat-boolmasks.ll b/llvm/test/Transforms/VectorCombine/X86/concat-boolmasks.ll
index 057d9af..c3639ba 100644
--- a/llvm/test/Transforms/VectorCombine/X86/concat-boolmasks.ll
+++ b/llvm/test/Transforms/VectorCombine/X86/concat-boolmasks.ll
@@ -80,13 +80,29 @@ define i64 @movmsk_i64_v8i32_v4i32(<4 x i32> %v0, <4 x i32> %v1) {
 }
 
 define i64 @movmsk_i64_v64i8_v16i8(<16 x i8> %v0, <16 x i8> %v1, <16 x i8> %v2, <16 x i8> %v3) {
-; CHECK-LABEL: @movmsk_i64_v64i8_v16i8(
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i8> [[V1:%.*]], <16 x i8> [[V0:%.*]], <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i8> [[V3:%.*]], <16 x i8> [[V2:%.*]], <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
-; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <32 x i8> [[TMP2]], <32 x i8> [[TMP1]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
-; CHECK-NEXT:    [[TMP4:%.*]] = icmp slt <64 x i8> [[TMP3]], zeroinitializer
-; CHECK-NEXT:    [[OR:%.*]] = bitcast <64 x i1> [[TMP4]] to i64
-; CHECK-NEXT:    ret i64 [[OR]]
+; SSE-LABEL: @movmsk_i64_v64i8_v16i8(
+; SSE-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i8> [[V3:%.*]], <16 x i8> [[V2:%.*]], <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; SSE-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i8> [[V1:%.*]], <16 x i8> [[V0:%.*]], <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; SSE-NEXT:    [[TMP3:%.*]] = shufflevector <32 x i8> [[TMP1]], <32 x i8> [[TMP2]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; SSE-NEXT:    [[TMP4:%.*]] = icmp slt <64 x i8> [[TMP3]], zeroinitializer
+; SSE-NEXT:    [[OR:%.*]] = bitcast <64 x i1> [[TMP4]] to i64
+; SSE-NEXT:    ret i64 [[OR]]
+;
+; AVX2-LABEL: @movmsk_i64_v64i8_v16i8(
+; AVX2-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i8> [[V1:%.*]], <16 x i8> [[V0:%.*]], <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; AVX2-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i8> [[V3:%.*]], <16 x i8> [[V2:%.*]], <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; AVX2-NEXT:    [[TMP3:%.*]] = shufflevector <32 x i8> [[TMP2]], <32 x i8> [[TMP1]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; AVX2-NEXT:    [[TMP4:%.*]] = icmp slt <64 x i8> [[TMP3]], zeroinitializer
+; AVX2-NEXT:    [[OR:%.*]] = bitcast <64 x i1> [[TMP4]] to i64
+; AVX2-NEXT:    ret i64 [[OR]]
+;
+; AVX512-LABEL: @movmsk_i64_v64i8_v16i8(
+; AVX512-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i8> [[V1:%.*]], <16 x i8> [[V0:%.*]], <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; AVX512-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i8> [[V3:%.*]], <16 x i8> [[V2:%.*]], <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; AVX512-NEXT:    [[TMP3:%.*]] = shufflevector <32 x i8> [[TMP2]], <32 x i8> [[TMP1]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; AVX512-NEXT:    [[TMP4:%.*]] = icmp slt <64 x i8> [[TMP3]], zeroinitializer
+; AVX512-NEXT:    [[OR:%.*]] = bitcast <64 x i1> [[TMP4]] to i64
+; AVX512-NEXT:    ret i64 [[OR]]
 ;
   %c0 = icmp slt <16 x i8> %v0, zeroinitializer
   %c1 = icmp slt <16 x i8> %v1, zeroinitializer
@@ -110,14 +126,32 @@ define i64 @movmsk_i64_v64i8_v16i8(<16 x i8> %v0, <16 x i8> %v1, <16 x i8> %v2,
 }
 
 define i64 @movmsk_i64_v32i32_v4i32(<4 x i32> %v0, <4 x i32> %v1, <4 x i32> %v2, <4 x i32> %v3) {
-; CHECK-LABEL: @movmsk_i64_v32i32_v4i32(
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[V1:%.*]], <4 x i32> [[V0:%.*]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[V3:%.*]], <4 x i32> [[V2:%.*]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP2]], <8 x i32> [[TMP1]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; CHECK-NEXT:    [[TMP4:%.*]] = icmp slt <16 x i32> [[TMP3]], zeroinitializer
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <16 x i1> [[TMP4]] to i16
-; CHECK-NEXT:    [[OR:%.*]] = zext i16 [[TMP5]] to i64
-; CHECK-NEXT:    ret i64 [[OR]]
+; SSE-LABEL: @movmsk_i64_v32i32_v4i32(
+; SSE-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[V3:%.*]], <4 x i32> [[V2:%.*]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SSE-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[V1:%.*]], <4 x i32> [[V0:%.*]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SSE-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE-NEXT:    [[TMP4:%.*]] = icmp slt <16 x i32> [[TMP3]], zeroinitializer
+; SSE-NEXT:    [[TMP5:%.*]] = bitcast <16 x i1> [[TMP4]] to i16
+; SSE-NEXT:    [[OR:%.*]] = zext i16 [[TMP5]] to i64
+; SSE-NEXT:    ret i64 [[OR]]
+;
+; AVX2-LABEL: @movmsk_i64_v32i32_v4i32(
+; AVX2-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[V1:%.*]], <4 x i32> [[V0:%.*]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; AVX2-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[V3:%.*]], <4 x i32> [[V2:%.*]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; AVX2-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP2]], <8 x i32> [[TMP1]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX2-NEXT:    [[TMP4:%.*]] = icmp slt <16 x i32> [[TMP3]], zeroinitializer
+; AVX2-NEXT:    [[TMP5:%.*]] = bitcast <16 x i1> [[TMP4]] to i16
+; AVX2-NEXT:    [[OR:%.*]] = zext i16 [[TMP5]] to i64
+; AVX2-NEXT:    ret i64 [[OR]]
+;
+; AVX512-LABEL: @movmsk_i64_v32i32_v4i32(
+; AVX512-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[V1:%.*]], <4 x i32> [[V0:%.*]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; AVX512-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[V3:%.*]], <4 x i32> [[V2:%.*]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; AVX512-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP2]], <8 x i32> [[TMP1]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX512-NEXT:    [[TMP4:%.*]] = icmp slt <16 x i32> [[TMP3]], zeroinitializer
+; AVX512-NEXT:    [[TMP5:%.*]] = bitcast <16 x i1> [[TMP4]] to i16
+; AVX512-NEXT:    [[OR:%.*]] = zext i16 [[TMP5]] to i64
+; AVX512-NEXT:    ret i64 [[OR]]
 ;
   %c0 = icmp slt <4 x i32> %v0, zeroinitializer
   %c1 = icmp slt <4 x i32> %v1, zeroinitializer
diff --git a/llvm/test/Transforms/VectorCombine/X86/extract-binop-inseltpoison.ll b/llvm/test/Transforms/VectorCombine/X86/extract-binop-inseltpoison.ll
index 800f576..f3b7f7b 100644
--- a/llvm/test/Transforms/VectorCombine/X86/extract-binop-inseltpoison.ll
+++ b/llvm/test/Transforms/VectorCombine/X86/extract-binop-inseltpoison.ll
@@ -465,33 +465,13 @@ define <4 x float> @ins_bo_ext_ext_uses(<4 x float> %a, <4 x float> %b) {
 }
 
 define <4 x float> @PR34724(<4 x float> %a, <4 x float> %b) {
-; SSE-LABEL: @PR34724(
-; SSE-NEXT:    [[A0:%.*]] = extractelement <4 x float> [[A:%.*]], i32 0
-; SSE-NEXT:    [[A1:%.*]] = extractelement <4 x float> [[A]], i32 1
-; SSE-NEXT:    [[SHIFT:%.*]] = shufflevector <4 x float> [[A]], <4 x float> poison, <4 x i32> <i32 poison, i32 poison, i32 3, i32 poison>
-; SSE-NEXT:    [[TMP1:%.*]] = fadd <4 x float> [[A]], [[SHIFT]]
-; SSE-NEXT:    [[SHIFT1:%.*]] = shufflevector <4 x float> [[B:%.*]], <4 x float> poison, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
-; SSE-NEXT:    [[TMP2:%.*]] = fadd <4 x float> [[B]], [[SHIFT1]]
-; SSE-NEXT:    [[SHIFT2:%.*]] = shufflevector <4 x float> [[B]], <4 x float> poison, <4 x i32> <i32 poison, i32 poison, i32 poison, i32 2>
-; SSE-NEXT:    [[TMP3:%.*]] = fadd <4 x float> [[SHIFT2]], [[B]]
-; SSE-NEXT:    [[V2:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP2]], <4 x i32> <i32 poison, i32 2, i32 4, i32 poison>
-; SSE-NEXT:    [[V3:%.*]] = shufflevector <4 x float> [[V2]], <4 x float> [[TMP3]], <4 x i32> <i32 0, i32 1, i32 2, i32 7>
-; SSE-NEXT:    ret <4 x float> [[V3]]
-;
-; AVX-LABEL: @PR34724(
-; AVX-NEXT:    [[A0:%.*]] = extractelement <4 x float> [[A:%.*]], i32 0
-; AVX-NEXT:    [[A1:%.*]] = extractelement <4 x float> [[A]], i32 1
-; AVX-NEXT:    [[SHIFT:%.*]] = shufflevector <4 x float> [[A]], <4 x float> poison, <4 x i32> <i32 poison, i32 poison, i32 3, i32 poison>
-; AVX-NEXT:    [[TMP1:%.*]] = fadd <4 x float> [[A]], [[SHIFT]]
-; AVX-NEXT:    [[SHIFT1:%.*]] = shufflevector <4 x float> [[B:%.*]], <4 x float> poison, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
-; AVX-NEXT:    [[TMP2:%.*]] = fadd <4 x float> [[B]], [[SHIFT1]]
-; AVX-NEXT:    [[B01:%.*]] = extractelement <4 x float> [[TMP2]], i32 0
-; AVX-NEXT:    [[SHIFT2:%.*]] = shufflevector <4 x float> [[B]], <4 x float> poison, <4 x i32> <i32 poison, i32 poison, i32 poison, i32 2>
-; AVX-NEXT:    [[TMP3:%.*]] = fadd <4 x float> [[SHIFT2]], [[B]]
-; AVX-NEXT:    [[V1:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> poison, <4 x i32> <i32 poison, i32 2, i32 poison, i32 poison>
-; AVX-NEXT:    [[V2:%.*]] = insertelement <4 x float> [[V1]], float [[B01]], i32 2
-; AVX-NEXT:    [[V3:%.*]] = shufflevector <4 x float> [[V2]], <4 x float> [[TMP3]], <4 x i32> <i32 0, i32 1, i32 2, i32 7>
-; AVX-NEXT:    ret <4 x float> [[V3]]
+; CHECK-LABEL: @PR34724(
+; CHECK-NEXT:    [[A0:%.*]] = extractelement <4 x float> [[A:%.*]], i32 0
+; CHECK-NEXT:    [[A1:%.*]] = extractelement <4 x float> [[A]], i32 1
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B:%.*]], <4 x i32> <i32 poison, i32 2, i32 4, i32 6>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B]], <4 x i32> <i32 poison, i32 3, i32 5, i32 7>
+; CHECK-NEXT:    [[V3:%.*]] = fadd <4 x float> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    ret <4 x float> [[V3]]
 ;
   %a0 = extractelement <4 x float> %a, i32 0
   %a1 = extractelement <4 x float> %a, i32 1
diff --git a/llvm/test/Transforms/VectorCombine/X86/extract-binop.ll b/llvm/test/Transforms/VectorCombine/X86/extract-binop.ll
index 307fbf7..c125b73 100644
--- a/llvm/test/Transforms/VectorCombine/X86/extract-binop.ll
+++ b/llvm/test/Transforms/VectorCombine/X86/extract-binop.ll
@@ -465,34 +465,19 @@ define <4 x float> @ins_bo_ext_ext_uses(<4 x float> %a, <4 x float> %b) {
 }
 
 define <4 x float> @PR34724(<4 x float> %a, <4 x float> %b) {
-; SSE-LABEL: @PR34724(
-; SSE-NEXT:    [[A0:%.*]] = extractelement <4 x float> [[A:%.*]], i32 0
-; SSE-NEXT:    [[A1:%.*]] = extractelement <4 x float> [[A]], i32 1
-; SSE-NEXT:    [[SHIFT:%.*]] = shufflevector <4 x float> [[A]], <4 x float> poison, <4 x i32> <i32 poison, i32 poison, i32 3, i32 poison>
-; SSE-NEXT:    [[TMP1:%.*]] = fadd <4 x float> [[A]], [[SHIFT]]
-; SSE-NEXT:    [[SHIFT1:%.*]] = shufflevector <4 x float> [[B:%.*]], <4 x float> poison, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
-; SSE-NEXT:    [[TMP2:%.*]] = fadd <4 x float> [[B]], [[SHIFT1]]
-; SSE-NEXT:    [[SHIFT2:%.*]] = shufflevector <4 x float> [[B]], <4 x float> poison, <4 x i32> <i32 poison, i32 poison, i32 poison, i32 2>
-; SSE-NEXT:    [[TMP3:%.*]] = fadd <4 x float> [[SHIFT2]], [[B]]
-; SSE-NEXT:    [[V1:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> undef, <4 x i32> <i32 4, i32 2, i32 6, i32 7>
-; SSE-NEXT:    [[V2:%.*]] = shufflevector <4 x float> [[V1]], <4 x float> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 4, i32 3>
-; SSE-NEXT:    [[V3:%.*]] = shufflevector <4 x float> [[V2]], <4 x float> [[TMP3]], <4 x i32> <i32 0, i32 1, i32 2, i32 7>
-; SSE-NEXT:    ret <4 x float> [[V3]]
-;
-; AVX-LABEL: @PR34724(
-; AVX-NEXT:    [[A0:%.*]] = extractelement <4 x float> [[A:%.*]], i32 0
-; AVX-NEXT:    [[A1:%.*]] = extractelement <4 x float> [[A]], i32 1
-; AVX-NEXT:    [[SHIFT:%.*]] = shufflevector <4 x float> [[A]], <4 x float> poison, <4 x i32> <i32 poison, i32 poison, i32 3, i32 poison>
-; AVX-NEXT:    [[TMP1:%.*]] = fadd <4 x float> [[A]], [[SHIFT]]
-; AVX-NEXT:    [[SHIFT1:%.*]] = shufflevector <4 x float> [[B:%.*]], <4 x float> poison, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
-; AVX-NEXT:    [[TMP2:%.*]] = fadd <4 x float> [[B]], [[SHIFT1]]
-; AVX-NEXT:    [[B01:%.*]] = extractelement <4 x float> [[TMP2]], i32 0
-; AVX-NEXT:    [[SHIFT2:%.*]] = shufflevector <4 x float> [[B]], <4 x float> poison, <4 x i32> <i32 poison, i32 poison, i32 poison, i32 2>
-; AVX-NEXT:    [[TMP3:%.*]] = fadd <4 x float> [[SHIFT2]], [[B]]
-; AVX-NEXT:    [[V1:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> undef, <4 x i32> <i32 4, i32 2, i32 6, i32 7>
-; AVX-NEXT:    [[V2:%.*]] = insertelement <4 x float> [[V1]], float [[B01]], i32 2
-; AVX-NEXT:    [[V3:%.*]] = shufflevector <4 x float> [[V2]], <4 x float> [[TMP3]], <4 x i32> <i32 0, i32 1, i32 2, i32 7>
-; AVX-NEXT:    ret <4 x float> [[V3]]
+; CHECK-LABEL: @PR34724(
+; CHECK-NEXT:    [[A0:%.*]] = extractelement <4 x float> [[A:%.*]], i32 0
+; CHECK-NEXT:    [[A1:%.*]] = extractelement <4 x float> [[A]], i32 1
+; CHECK-NEXT:    [[SHIFT:%.*]] = shufflevector <4 x float> [[A]], <4 x float> poison, <4 x i32> <i32 poison, i32 poison, i32 3, i32 poison>
+; CHECK-NEXT:    [[TMP1:%.*]] = fadd <4 x float> [[A]], [[SHIFT]]
+; CHECK-NEXT:    [[SHIFT1:%.*]] = shufflevector <4 x float> [[B:%.*]], <4 x float> poison, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP2:%.*]] = fadd <4 x float> [[B]], [[SHIFT1]]
+; CHECK-NEXT:    [[SHIFT2:%.*]] = shufflevector <4 x float> [[B]], <4 x float> poison, <4 x i32> <i32 poison, i32 poison, i32 poison, i32 2>
+; CHECK-NEXT:    [[TMP3:%.*]] = fadd <4 x float> [[SHIFT2]], [[B]]
+; CHECK-NEXT:    [[V1:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> undef, <4 x i32> <i32 4, i32 2, i32 6, i32 7>
+; CHECK-NEXT:    [[V2:%.*]] = shufflevector <4 x float> [[V1]], <4 x float> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 4, i32 3>
+; CHECK-NEXT:    [[V3:%.*]] = shufflevector <4 x float> [[V2]], <4 x float> [[TMP3]], <4 x i32> <i32 0, i32 1, i32 2, i32 7>
+; CHECK-NEXT:    ret <4 x float> [[V3]]
 ;
   %a0 = extractelement <4 x float> %a, i32 0
   %a1 = extractelement <4 x float> %a, i32 1
diff --git a/llvm/test/Transforms/VectorCombine/X86/extract-fneg-insert.ll b/llvm/test/Transforms/VectorCombine/X86/extract-fneg-insert.ll
index 5c856ce..cd2bc75 100644
--- a/llvm/test/Transforms/VectorCombine/X86/extract-fneg-insert.ll
+++ b/llvm/test/Transforms/VectorCombine/X86/extract-fneg-insert.ll
@@ -91,20 +91,11 @@ define <4 x double> @ext1_v2f64v4f64(<2 x double> %x, <4 x double> %y) {
   ret <4 x double> %r
 }
 
-; The vector fneg would cost twice as much as the scalar op with SSE,
-; so we don't transform there (the shuffle would also be more expensive).
-
 define <8 x float> @ext7_v8f32(<8 x float> %x, <8 x float> %y) {
-; SSE-LABEL: @ext7_v8f32(
-; SSE-NEXT:    [[E:%.*]] = extractelement <8 x float> [[X:%.*]], i32 7
-; SSE-NEXT:    [[N:%.*]] = fneg float [[E]]
-; SSE-NEXT:    [[R:%.*]] = insertelement <8 x float> [[Y:%.*]], float [[N]], i32 7
-; SSE-NEXT:    ret <8 x float> [[R]]
-;
-; AVX-LABEL: @ext7_v8f32(
-; AVX-NEXT:    [[TMP1:%.*]] = fneg <8 x float> [[X:%.*]]
-; AVX-NEXT:    [[R:%.*]] = shufflevector <8 x float> [[Y:%.*]], <8 x float> [[TMP1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 15>
-; AVX-NEXT:    ret <8 x float> [[R]]
+; CHECK-LABEL: @ext7_v8f32(
+; CHECK-NEXT:    [[TMP1:%.*]] = fneg <8 x float> [[X:%.*]]
+; CHECK-NEXT:    [[R:%.*]] = shufflevector <8 x float> [[Y:%.*]], <8 x float> [[TMP1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 15>
+; CHECK-NEXT:    ret <8 x float> [[R]]
 ;
   %e = extractelement <8 x float> %x, i32 7
   %n = fneg float %e
diff --git a/llvm/test/Transforms/VectorCombine/X86/load-extractelement-scalarization.ll b/llvm/test/Transforms/VectorCombine/X86/load-extractelement-scalarization.ll
new file mode 100644
index 0000000..0acfecc
--- /dev/null
+++ b/llvm/test/Transforms/VectorCombine/X86/load-extractelement-scalarization.ll
@@ -0,0 +1,26 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -passes=vector-combine -S -mtriple=x86_64-- -mattr=sse2 | FileCheck %s
+; RUN: opt < %s -passes=vector-combine -S -mtriple=x86_64-- -mattr=avx2 | FileCheck %s
+
+; infinite loop if we add the erased instructions to the work list in the wrong order.
+define void @multiple_extract(ptr %p) {
+; CHECK-LABEL: @multiple_extract(
+; CHECK-NEXT:    [[VP:%.*]] = load ptr, ptr [[P:%.*]], align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds <2 x i32>, ptr [[VP]], i32 0, i64 0
+; CHECK-NEXT:    [[E0:%.*]] = load i32, ptr [[TMP1]], align 16
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds <2 x i32>, ptr [[VP]], i32 0, i64 1
+; CHECK-NEXT:    [[E1:%.*]] = load i32, ptr [[TMP2]], align 4
+; CHECK-NEXT:    store i32 [[E0]], ptr [[P]], align 4
+; CHECK-NEXT:    [[P1:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 4
+; CHECK-NEXT:    store i32 [[E1]], ptr [[P1]], align 4
+; CHECK-NEXT:    ret void
+;
+  %vp = load ptr, ptr %p, align 8
+  %v = load <2 x i32>, ptr %vp, align 16
+  %e0 = extractelement <2 x i32> %v, i64 0
+  %e1 = extractelement <2 x i32> %v, i64 1
+  store i32 %e0, ptr %p, align 4
+  %p1 = getelementptr inbounds nuw i8, ptr %p, i64 4
+  store i32 %e1, ptr %p1, align 4
+  ret void
+}
diff --git a/llvm/test/Transforms/VectorCombine/X86/load-inseltpoison.ll b/llvm/test/Transforms/VectorCombine/X86/load-inseltpoison.ll
index 937d404..2db1e21 100644
--- a/llvm/test/Transforms/VectorCombine/X86/load-inseltpoison.ll
+++ b/llvm/test/Transforms/VectorCombine/X86/load-inseltpoison.ll
@@ -544,10 +544,7 @@ define void @PR47558_multiple_use_load(ptr nocapture nonnull %resultptr, ptr noc
 ; CHECK-NEXT:    [[T1:%.*]] = insertelement <2 x float> poison, float [[SCALE]], i32 0
 ; CHECK-NEXT:    [[T2:%.*]] = insertelement <2 x float> [[T1]], float [[SCALE]], i32 1
 ; CHECK-NEXT:    [[T3:%.*]] = fmul <2 x float> [[OP]], [[T2]]
-; CHECK-NEXT:    [[T4:%.*]] = extractelement <2 x float> [[T3]], i32 0
-; CHECK-NEXT:    [[RESULT0:%.*]] = insertelement <2 x float> poison, float [[T4]], i32 0
-; CHECK-NEXT:    [[RESULT1:%.*]] = shufflevector <2 x float> [[RESULT0]], <2 x float> [[T3]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT:    store <2 x float> [[RESULT1]], ptr [[RESULTPTR:%.*]], align 8
+; CHECK-NEXT:    store <2 x float> [[T3]], ptr [[RESULTPTR:%.*]], align 8
 ; CHECK-NEXT:    ret void
 ;
   %scaleptr = tail call nonnull align 16 dereferenceable(64) ptr @getscaleptr()
diff --git a/llvm/test/Transforms/VectorCombine/X86/shuffle-of-cmps.ll b/llvm/test/Transforms/VectorCombine/X86/shuffle-of-cmps.ll
index b3360b6..f9108ef 100644
--- a/llvm/test/Transforms/VectorCombine/X86/shuffle-of-cmps.ll
+++ b/llvm/test/Transforms/VectorCombine/X86/shuffle-of-cmps.ll
@@ -1,8 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
 ; RUN: opt < %s -passes=vector-combine -S -mtriple=x86_64-- -mattr=sse2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE2
 ; RUN: opt < %s -passes=vector-combine -S -mtriple=x86_64-- -mattr=sse4.2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE4
-; RUN: opt < %s -passes=vector-combine -S -mtriple=x86_64-- -mattr=avx2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX2
-; RUN: opt < %s -passes=vector-combine -S -mtriple=x86_64-- -mattr=avx512vl | FileCheck %s --check-prefixes=CHECK,AVX,AVX512
+; RUN: opt < %s -passes=vector-combine -S -mtriple=x86_64-- -mattr=avx2 | FileCheck %s --check-prefixes=CHECK,AVX2
+; RUN: opt < %s -passes=vector-combine -S -mtriple=x86_64-- -mattr=avx512vl | FileCheck %s --check-prefixes=CHECK,AVX512
 
 declare void @use(<4 x i1>)
 
@@ -105,8 +105,8 @@ define <4 x i32> @shuf_icmp_ugt_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z,
 define <4 x i32> @shuf_fcmp_oeq_v4i32(<4 x float> %x, <4 x float> %y, <4 x float> %z) {
 ; SSE2-LABEL: define <4 x i32> @shuf_fcmp_oeq_v4i32(
 ; SSE2-SAME: <4 x float> [[X:%.*]], <4 x float> [[Y:%.*]], <4 x float> [[Z:%.*]]) #[[ATTR0]] {
-; SSE2-NEXT:    [[TMP1:%.*]] = shufflevector <4 x float> [[X]], <4 x float> poison, <4 x i32> <i32 poison, i32 poison, i32 2, i32 0>
-; SSE2-NEXT:    [[TMP2:%.*]] = shufflevector <4 x float> [[Y]], <4 x float> [[Z]], <4 x i32> <i32 poison, i32 poison, i32 6, i32 0>
+; SSE2-NEXT:    [[TMP1:%.*]] = shufflevector <4 x float> [[X]], <4 x float> poison, <4 x i32> <i32 poison, i32 poison, i32 0, i32 0>
+; SSE2-NEXT:    [[TMP2:%.*]] = shufflevector <4 x float> [[Y]], <4 x float> [[Z]], <4 x i32> <i32 poison, i32 poison, i32 4, i32 0>
 ; SSE2-NEXT:    [[S:%.*]] = fcmp oeq <4 x float> [[TMP1]], [[TMP2]]
 ; SSE2-NEXT:    [[R:%.*]] = sext <4 x i1> [[S]] to <4 x i32>
 ; SSE2-NEXT:    ret <4 x i32> [[R]]
@@ -115,21 +115,29 @@ define <4 x i32> @shuf_fcmp_oeq_v4i32(<4 x float> %x, <4 x float> %y, <4 x float
 ; SSE4-SAME: <4 x float> [[X:%.*]], <4 x float> [[Y:%.*]], <4 x float> [[Z:%.*]]) #[[ATTR0]] {
 ; SSE4-NEXT:    [[B0:%.*]] = fcmp oeq <4 x float> [[X]], [[Y]]
 ; SSE4-NEXT:    [[B1:%.*]] = fcmp oeq <4 x float> [[X]], [[Z]]
-; SSE4-NEXT:    [[S:%.*]] = shufflevector <4 x i1> [[B0]], <4 x i1> [[B1]], <4 x i32> <i32 poison, i32 poison, i32 6, i32 0>
+; SSE4-NEXT:    [[S:%.*]] = shufflevector <4 x i1> [[B0]], <4 x i1> [[B1]], <4 x i32> <i32 poison, i32 poison, i32 4, i32 0>
 ; SSE4-NEXT:    [[R:%.*]] = sext <4 x i1> [[S]] to <4 x i32>
 ; SSE4-NEXT:    ret <4 x i32> [[R]]
 ;
-; AVX-LABEL: define <4 x i32> @shuf_fcmp_oeq_v4i32(
-; AVX-SAME: <4 x float> [[X:%.*]], <4 x float> [[Y:%.*]], <4 x float> [[Z:%.*]]) #[[ATTR0]] {
-; AVX-NEXT:    [[B0:%.*]] = fcmp oeq <4 x float> [[X]], [[Y]]
-; AVX-NEXT:    [[B1:%.*]] = fcmp oeq <4 x float> [[X]], [[Z]]
-; AVX-NEXT:    [[S:%.*]] = shufflevector <4 x i1> [[B0]], <4 x i1> [[B1]], <4 x i32> <i32 poison, i32 poison, i32 6, i32 0>
-; AVX-NEXT:    [[R:%.*]] = sext <4 x i1> [[S]] to <4 x i32>
-; AVX-NEXT:    ret <4 x i32> [[R]]
+; AVX2-LABEL: define <4 x i32> @shuf_fcmp_oeq_v4i32(
+; AVX2-SAME: <4 x float> [[X:%.*]], <4 x float> [[Y:%.*]], <4 x float> [[Z:%.*]]) #[[ATTR0]] {
+; AVX2-NEXT:    [[B0:%.*]] = fcmp oeq <4 x float> [[X]], [[Y]]
+; AVX2-NEXT:    [[B1:%.*]] = fcmp oeq <4 x float> [[X]], [[Z]]
+; AVX2-NEXT:    [[S:%.*]] = shufflevector <4 x i1> [[B0]], <4 x i1> [[B1]], <4 x i32> <i32 poison, i32 poison, i32 4, i32 0>
+; AVX2-NEXT:    [[R:%.*]] = sext <4 x i1> [[S]] to <4 x i32>
+; AVX2-NEXT:    ret <4 x i32> [[R]]
+;
+; AVX512-LABEL: define <4 x i32> @shuf_fcmp_oeq_v4i32(
+; AVX512-SAME: <4 x float> [[X:%.*]], <4 x float> [[Y:%.*]], <4 x float> [[Z:%.*]]) #[[ATTR0]] {
+; AVX512-NEXT:    [[TMP1:%.*]] = shufflevector <4 x float> [[X]], <4 x float> poison, <4 x i32> <i32 poison, i32 poison, i32 0, i32 0>
+; AVX512-NEXT:    [[TMP2:%.*]] = shufflevector <4 x float> [[Y]], <4 x float> [[Z]], <4 x i32> <i32 poison, i32 poison, i32 4, i32 0>
+; AVX512-NEXT:    [[S:%.*]] = fcmp oeq <4 x float> [[TMP1]], [[TMP2]]
+; AVX512-NEXT:    [[R:%.*]] = sext <4 x i1> [[S]] to <4 x i32>
+; AVX512-NEXT:    ret <4 x i32> [[R]]
 ;
   %b0 = fcmp oeq <4 x float> %x, %y
   %b1 = fcmp oeq <4 x float> %x, %z
-  %s = shufflevector <4 x i1> %b0, <4 x i1> %b1, <4 x i32> <i32 poison, i32 poison, i32 6, i32 0>
+  %s = shufflevector <4 x i1> %b0, <4 x i1> %b1, <4 x i32> <i32 poison, i32 poison, i32 4, i32 0>
   %r = sext <4 x i1> %s to <4 x i32>
   ret <4 x i32> %r
 }
@@ -137,29 +145,13 @@ define <4 x i32> @shuf_fcmp_oeq_v4i32(<4 x float> %x, <4 x float> %y, <4 x float
 ; For commutative instructions, common operand may be swapped
 
 define <4 x i32> @shuf_fcmp_one_v4f32_swap(<4 x float> %x, <4 x float> %y, <4 x float> %z) {
-; SSE-LABEL: define <4 x i32> @shuf_fcmp_one_v4f32_swap(
-; SSE-SAME: <4 x float> [[X:%.*]], <4 x float> [[Y:%.*]], <4 x float> [[Z:%.*]]) #[[ATTR0]] {
-; SSE-NEXT:    [[TMP1:%.*]] = shufflevector <4 x float> [[Y]], <4 x float> [[Z]], <4 x i32> <i32 0, i32 3, i32 4, i32 7>
-; SSE-NEXT:    [[TMP2:%.*]] = shufflevector <4 x float> [[X]], <4 x float> poison, <4 x i32> <i32 0, i32 3, i32 0, i32 3>
-; SSE-NEXT:    [[S:%.*]] = fcmp one <4 x float> [[TMP1]], [[TMP2]]
-; SSE-NEXT:    [[R:%.*]] = sext <4 x i1> [[S]] to <4 x i32>
-; SSE-NEXT:    ret <4 x i32> [[R]]
-;
-; AVX2-LABEL: define <4 x i32> @shuf_fcmp_one_v4f32_swap(
-; AVX2-SAME: <4 x float> [[X:%.*]], <4 x float> [[Y:%.*]], <4 x float> [[Z:%.*]]) #[[ATTR0]] {
-; AVX2-NEXT:    [[B0:%.*]] = fcmp one <4 x float> [[X]], [[Y]]
-; AVX2-NEXT:    [[B1:%.*]] = fcmp one <4 x float> [[Z]], [[X]]
-; AVX2-NEXT:    [[S:%.*]] = shufflevector <4 x i1> [[B0]], <4 x i1> [[B1]], <4 x i32> <i32 0, i32 3, i32 4, i32 7>
-; AVX2-NEXT:    [[R:%.*]] = sext <4 x i1> [[S]] to <4 x i32>
-; AVX2-NEXT:    ret <4 x i32> [[R]]
-;
-; AVX512-LABEL: define <4 x i32> @shuf_fcmp_one_v4f32_swap(
-; AVX512-SAME: <4 x float> [[X:%.*]], <4 x float> [[Y:%.*]], <4 x float> [[Z:%.*]]) #[[ATTR0]] {
-; AVX512-NEXT:    [[TMP1:%.*]] = shufflevector <4 x float> [[Y]], <4 x float> [[Z]], <4 x i32> <i32 0, i32 3, i32 4, i32 7>
-; AVX512-NEXT:    [[TMP2:%.*]] = shufflevector <4 x float> [[X]], <4 x float> poison, <4 x i32> <i32 0, i32 3, i32 0, i32 3>
-; AVX512-NEXT:    [[S:%.*]] = fcmp one <4 x float> [[TMP1]], [[TMP2]]
-; AVX512-NEXT:    [[R:%.*]] = sext <4 x i1> [[S]] to <4 x i32>
-; AVX512-NEXT:    ret <4 x i32> [[R]]
+; CHECK-LABEL: define <4 x i32> @shuf_fcmp_one_v4f32_swap(
+; CHECK-SAME: <4 x float> [[X:%.*]], <4 x float> [[Y:%.*]], <4 x float> [[Z:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x float> [[Y]], <4 x float> [[Z]], <4 x i32> <i32 0, i32 3, i32 4, i32 7>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x float> [[X]], <4 x float> poison, <4 x i32> <i32 0, i32 3, i32 0, i32 3>
+; CHECK-NEXT:    [[S:%.*]] = fcmp one <4 x float> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[R:%.*]] = sext <4 x i1> [[S]] to <4 x i32>
+; CHECK-NEXT:    ret <4 x i32> [[R]]
 ;
   %b0 = fcmp one <4 x float> %x, %y
   %b1 = fcmp one <4 x float> %z, %x
@@ -275,3 +267,33 @@ define <4 x i32> @shuf_icmp_ugt_v4i32_use(<4 x i32> %x, <4 x i32> %y, <4 x i32>
   %r = sext <4 x i1> %s to <4 x i32>
   ret <4 x i32> %r
 }
+
+; PR121110 - don't merge equivalent (but not matching) predicates
+
+define <2 x i1> @PR121110() {
+; CHECK-LABEL: define <2 x i1> @PR121110(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT:    [[UGT:%.*]] = icmp samesign ugt <2 x i32> zeroinitializer, zeroinitializer
+; CHECK-NEXT:    [[SGT:%.*]] = icmp sgt <2 x i32> zeroinitializer, <i32 6, i32 -4>
+; CHECK-NEXT:    [[RES:%.*]] = shufflevector <2 x i1> [[UGT]], <2 x i1> [[SGT]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT:    ret <2 x i1> [[RES]]
+;
+  %ugt = icmp samesign ugt <2 x i32> < i32 0, i32 0 >, < i32 0, i32 0 >
+  %sgt = icmp sgt <2 x i32> < i32 0, i32 0 >, < i32 6, i32 4294967292 >
+  %res = shufflevector <2 x i1> %ugt, <2 x i1> %sgt, <2 x i32> <i32 0, i32 3>
+  ret <2 x i1> %res
+}
+
+define <2 x i1> @PR121110_commute() {
+; CHECK-LABEL: define <2 x i1> @PR121110_commute(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT:    [[SGT:%.*]] = icmp sgt <2 x i32> zeroinitializer, <i32 6, i32 -4>
+; CHECK-NEXT:    [[UGT:%.*]] = icmp samesign ugt <2 x i32> zeroinitializer, zeroinitializer
+; CHECK-NEXT:    [[RES:%.*]] = shufflevector <2 x i1> [[SGT]], <2 x i1> [[UGT]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT:    ret <2 x i1> [[RES]]
+;
+  %sgt = icmp sgt <2 x i32> < i32 0, i32 0 >, < i32 6, i32 4294967292 >
+  %ugt = icmp samesign ugt <2 x i32> < i32 0, i32 0 >, < i32 0, i32 0 >
+  %res = shufflevector <2 x i1> %sgt, <2 x i1> %ugt, <2 x i32> <i32 0, i32 3>
+  ret <2 x i1> %res
+}
diff --git a/llvm/test/tools/UpdateTestChecks/lit.local.cfg b/llvm/test/tools/UpdateTestChecks/lit.local.cfg
index a954eb7..7147769 100644
--- a/llvm/test/tools/UpdateTestChecks/lit.local.cfg
+++ b/llvm/test/tools/UpdateTestChecks/lit.local.cfg
@@ -10,7 +10,7 @@ except ImportError:
     from pipes import quote as shell_quote
 
 
-def add_update_script_substition(
+def add_update_script_substitution(
     name, python_exe=config.python_executable, extra_args=""
 ):
     assert name.startswith("%")
@@ -33,26 +33,26 @@ llc_path = os.path.join(config.llvm_tools_dir, "llc")
 if os.path.isfile(llc_path):
     config.available_features.add("llc-binary")
     llc_arg = "--llc-binary " + shell_quote(llc_path)
-    add_update_script_substition("%update_llc_test_checks", extra_args=llc_arg)
-    add_update_script_substition("%update_mir_test_checks", extra_args=llc_arg)
+    add_update_script_substitution("%update_llc_test_checks", extra_args=llc_arg)
+    add_update_script_substitution("%update_mir_test_checks", extra_args=llc_arg)
 
 opt_path = os.path.join(config.llvm_tools_dir, "opt")
 if os.path.isfile(opt_path):
     config.available_features.add("opt-binary")
     opt_arg = "--opt-binary " + shell_quote(opt_path)
-    add_update_script_substition("%update_test_checks", extra_args=opt_arg)
-    add_update_script_substition("%update_analyze_test_checks", extra_args=opt_arg)
+    add_update_script_substitution("%update_test_checks", extra_args=opt_arg)
+    add_update_script_substitution("%update_analyze_test_checks", extra_args=opt_arg)
 
 llvm_mca_path = os.path.join(config.llvm_tools_dir, "llvm-mca")
 if os.path.isfile(llvm_mca_path):
     config.available_features.add("llvm-mca-binary")
     mca_arg = "--llvm-mca-binary " + shell_quote(llvm_mca_path)
-    add_update_script_substition("%update_test_checks", extra_args=mca_arg)
+    add_update_script_substitution("%update_test_checks", extra_args=mca_arg)
 
 split_file_path = os.path.join(config.llvm_tools_dir, "split-file")
 if os.path.isfile(split_file_path):
-    add_update_script_substition("%update_test_body")
+    add_update_script_substitution("%update_test_body")
 
 llvm_mc_path = os.path.join(config.llvm_tools_dir, "llvm-mc")
 if os.path.isfile(llvm_mc_path):
-    add_update_script_substition("%update_mc_test_checks")
+    add_update_script_substitution("%update_mc_test_checks")
diff --git a/llvm/test/tools/dxil-dis/fastmath.ll b/llvm/test/tools/dxil-dis/fastmath.ll
new file mode 100644
index 0000000..7f4ba5b
--- /dev/null
+++ b/llvm/test/tools/dxil-dis/fastmath.ll
@@ -0,0 +1,23 @@
+; RUN: llc %s --filetype=obj -o - | dxil-dis -o - | FileCheck %s
+target triple = "dxil-unknown-shadermodel6.7-library"
+
+define float @fma(float %0, float %1, float %2) #0 {
+  ; verify reassoc and contract are converted to fast
+  ; CHECK: %4 = fmul fast float %0, %1
+  %4 = fmul reassoc float %0, %1
+  ; CHECK-NEXT: %5 = fadd fast float %4, %2
+  %5 = fadd contract float %4, %2
+  ; verify these are converted to a single fast flag
+  ; CHECK-NEXT: %6 = fmul fast float %0, %1
+  %6 = fmul reassoc contract float %0, %1
+  ; verify these flags are maintained
+  ; CHECK-NEXT: %7 = fadd nnan ninf nsz arcp float %0, %1
+  %7 = fadd nnan ninf nsz arcp float %0, %1
+  ; verify that afn is removed
+  ; CHECK-NEXT: %8 = fmul float %0, %1
+  %8 = fmul afn float %0, %1
+  ret float %5
+}
+
+attributes #0 = { norecurse nounwind readnone willreturn "disable-tail-calls"="false" "waveops-include-helper-lanes" "fp32-denorm-mode"="any" "hlsl.export" }
+
diff --git a/llvm/test/tools/llvm-cov/branch-macros.test b/llvm/test/tools/llvm-cov/branch-macros.test
index e4bd14e..b16ef9d 100644
--- a/llvm/test/tools/llvm-cov/branch-macros.test
+++ b/llvm/test/tools/llvm-cov/branch-macros.test
@@ -1,5 +1,6 @@
 // RUN: llvm-profdata merge %S/Inputs/branch-macros.proftext -o %t.profdata
 // RUN: llvm-cov show --show-expansions --show-branches=count %S/Inputs/branch-macros.o32l -instr-profile %t.profdata -path-equivalence=/tmp,%S/Inputs | FileCheck %S/Inputs/branch-macros.cpp -check-prefixes=CHECK,BRCOV -D#C=999
+// RUN: llvm-cov show --binary-counters --show-expansions --show-branches=count %S/Inputs/branch-macros.o32l -instr-profile %t.profdata -path-equivalence=/tmp,%S/Inputs | FileCheck %S/Inputs/branch-macros.cpp -check-prefixes=CHECK,BRCOV -D#C=1
 // RUN: llvm-cov report --show-branch-summary %S/Inputs/branch-macros.o32l -instr-profile %t.profdata -show-functions -path-equivalence=/tmp,%S/Inputs %S/Inputs/branch-macros.cpp | FileCheck %s -check-prefix=REPORT
 
 // RUN: yaml2obj %S/Inputs/branch-macros-single.yaml -o %t.o
diff --git a/llvm/test/tools/llvm-cov/branch-noShowBranch.test b/llvm/test/tools/llvm-cov/branch-noShowBranch.test
index cabeeb0..9f3cfd5 100644
--- a/llvm/test/tools/llvm-cov/branch-noShowBranch.test
+++ b/llvm/test/tools/llvm-cov/branch-noShowBranch.test
@@ -12,7 +12,7 @@
 // REPORT-NOT: conditionals                     24       0 100.00%        15       0 100.00%        16       2  87.50%
 // REPORT-NOT: early_exits                      20       4  80.00%        25       2  92.00%        16       6  62.50%
 // REPORT-NOT: jumps                            39      12  69.23%        48       2  95.83%        26       9  65.38%
-// REPORT-NOT: switches                         28       5  82.14%        38       4  89.47%        30       9  70.00%
+// REPORT-NOT: switches                         28       5  82.14%        38       4  89.47%        28       7  75.00%
 // REPORT-NOT: big_switch                       25       1  96.00%        32       0 100.00%        30       6  80.00%
 // REPORT-NOT: boolean_operators                16       0 100.00%        13       0 100.00%        22       2  90.91%
 // REPORT-NOT: boolop_loops                     19       0 100.00%        14       0 100.00%        16       2  87.50%
@@ -21,5 +21,4 @@
 // REPORT-NOT: main                              1       0 100.00%        16       0 100.00%         0       0   0.00%
 // REPORT-NOT: c-general.c:static_func           4       0 100.00%         4       0 100.00%         2       0 100.00%
 // REPORT:     TOTAL                           197      24  87.82%       234       8  96.58%
-// REPORT-NOT: TOTAL                           197      24  87.82%       234      13  94.44%       174      38  78.16%
-
+// REPORT-NOT: TOTAL                           197      24  87.82%       234       8  96.58%       172      36  79.07%
diff --git a/llvm/test/tools/llvm-cov/showLineExecutionCounts.test b/llvm/test/tools/llvm-cov/showLineExecutionCounts.test
index 4f505f96..a165d8d 100644
--- a/llvm/test/tools/llvm-cov/showLineExecutionCounts.test
+++ b/llvm/test/tools/llvm-cov/showLineExecutionCounts.test
@@ -3,6 +3,7 @@
 // RUN: llvm-profdata merge %S/Inputs/lineExecutionCounts.proftext -o %t.profdata
 
 // RUN: llvm-cov show %S/Inputs/lineExecutionCounts.covmapping -instr-profile %t.profdata -path-equivalence=/tmp,%S/Inputs | FileCheck -check-prefixes=TEXT,WHOLE-FILE -D#C=999 -DC16K2=16.2k -DC16K1=16.1k %S/Inputs/showLineExecutionCounts.cpp
+// RUN: llvm-cov show %S/Inputs/lineExecutionCounts.covmapping -binary-counters -instr-profile %t.profdata -path-equivalence=/tmp,%S/Inputs | FileCheck -check-prefixes=TEXT,WHOLE-FILE -D#C=1 -DC16K2=1 -DC16K1=1 %S/Inputs/showLineExecutionCounts.cpp
 // RUN: llvm-cov show %S/Inputs/lineExecutionCounts.covmapping -instr-profile %t.profdata -path-equivalence=/tmp,%S/Inputs -name=main | FileCheck -check-prefixes=TEXT,FILTER -D#C=999 -DC16K2=16.2k -DC16K1=16.1k %S/Inputs/showLineExecutionCounts.cpp
 
 // Test -output-dir.
@@ -16,8 +17,10 @@
 //
 // Test html output.
 // RUN: llvm-cov show %S/Inputs/lineExecutionCounts.covmapping -format html -o %t.dir/html -instr-profile %t.profdata -path-equivalence=/tmp,%S/Inputs
+// RUN: llvm-cov show %S/Inputs/lineExecutionCounts.covmapping -format html -o %t.dir/html.binary -binary-counters -instr-profile %t.profdata -path-equivalence=/tmp,%S/Inputs
 // RUN: llvm-cov show %S/Inputs/lineExecutionCounts.covmapping -format html -o %t.dir/html.filtered -instr-profile %t.profdata -path-equivalence=/tmp,%S/Inputs -name=main
 // RUN: FileCheck -check-prefixes=HTML,HTML-WHOLE-FILE -input-file %t.dir/html/coverage/tmp/showLineExecutionCounts.cpp.html %S/Inputs/showLineExecutionCounts.cpp
+// RUN: FileCheck -check-prefixes=HTML-BINARY,HTML-WHOLE-FILE -input-file %t.dir/html.binary/coverage/tmp/showLineExecutionCounts.cpp.html %S/Inputs/showLineExecutionCounts.cpp
 // RUN: FileCheck -check-prefixes=HTML,HTML-FILTER -input-file %t.dir/html.filtered/coverage/tmp/showLineExecutionCounts.cpp.html %S/Inputs/showLineExecutionCounts.cpp
 //
 // Test index creation.
diff --git a/llvm/test/tools/llvm-dlltool/machine-opt.def b/llvm/test/tools/llvm-dlltool/machine-opt.def
index 6dce825..fcb8529 100644
--- a/llvm/test/tools/llvm-dlltool/machine-opt.def
+++ b/llvm/test/tools/llvm-dlltool/machine-opt.def
@@ -6,6 +6,8 @@
 ; RUN: llvm-readobj %t.a | FileCheck --check-prefix=ARM %s
 ; RUN: llvm-dlltool -m arm64 -d %s -l %t.a
 ; RUN: llvm-readobj %t.a | FileCheck --check-prefix=ARM64 %s
+; RUN: llvm-dlltool -m r4000 -d %s -l %t.a
+; RUN: llvm-readobj %t.a | FileCheck --check-prefix=MIPS %s
 
 LIBRARY test.dll
 EXPORTS
@@ -15,3 +17,4 @@ TestFunction
 ; X86_64: Format: COFF-x86-64
 ; ARM: Format: COFF-ARM{{$}}
 ; ARM64: Format: COFF-ARM64
+; MIPS: Format: COFF-MIPS
diff --git a/llvm/test/tools/llvm-exegesis/X86/latency/cpu-pinning.s b/llvm/test/tools/llvm-exegesis/X86/latency/cpu-pinning.s
index 0ea3752..e7430e4 100644
--- a/llvm/test/tools/llvm-exegesis/X86/latency/cpu-pinning.s
+++ b/llvm/test/tools/llvm-exegesis/X86/latency/cpu-pinning.s
@@ -1,5 +1,5 @@
 # REQUIRES: exegesis-can-measure-latency, x86_64-linux
 
-# RUN: llvm-exegesis -mtriple=x86_64-unknown-unknown -mode=latency -opcode-name=ADD64rr -execution-mode=subprocess | FileCheck %s
+# RUN: llvm-exegesis -mtriple=x86_64-unknown-unknown -mode=latency -opcode-name=ADD64rr -execution-mode=subprocess --benchmark-process-cpu=0 | FileCheck %s
 
 # CHECK: - { key: latency, value: {{[0-9.]*}}, per_snippet_value: {{[0-9.]*}}
diff --git a/llvm/test/tools/llvm-gsymutil/ARM_AArch64/macho-merged-funcs-dwarf.yaml b/llvm/test/tools/llvm-gsymutil/ARM_AArch64/macho-merged-funcs-dwarf.yaml
index 94a162c..522c576 100644
--- a/llvm/test/tools/llvm-gsymutil/ARM_AArch64/macho-merged-funcs-dwarf.yaml
+++ b/llvm/test/tools/llvm-gsymutil/ARM_AArch64/macho-merged-funcs-dwarf.yaml
@@ -64,6 +64,18 @@
 # CHECK-GSYM-KEEP-NEXT:       0x{{[0-9a-fA-F]+}} /tmp/test_gsym_yaml{{[/\\]}}out/file_0{{[1-3]}}.cpp:10
 # CHECK-GSYM-KEEP-NEXT:       0x{{[0-9a-fA-F]+}} /tmp/test_gsym_yaml{{[/\\]}}out/file_0{{[1-3]}}.cpp:6
 
+## Test the lookup functionality for merged functions:
+# RUN: llvm-gsymutil --verify %t.keep.gSYM --address 0x248 --merged-functions | FileCheck --check-prefix=CHECK-MERGED-LOOKUP %s
+# RUN: llvm-gsymutil --verify %t.keep.gSYM --address 0x248 | FileCheck --check-prefix=CHECK-NORMAL-LOOKUP %s
+
+#### TODO: Fix non-determinism leading that is currently worked around with `{{[1-3]}}` below.
+
+# CHECK-MERGED-LOOKUP: Found 3 functions at address 0x0000000000000248:
+# CHECK-MERGED-LOOKUP-NEXT:       0x0000000000000248: my_func_0{{[1-3]}} @ /tmp/test_gsym_yaml{{[/\\]}}out/file_0{{[1-3]}}.cpp:5
+# CHECK-MERGED-LOOKUP-NEXT-NEXT:  0x0000000000000248: my_func_0{{[1-3]}} @ /tmp/test_gsym_yaml{{[/\\]}}out/file_0{{[1-3]}}.cpp:5
+# CHECK-MERGED-LOOKUP-NEXT-NEXT:  0x0000000000000248: my_func_0{{[1-3]}} @ /tmp/test_gsym_yaml{{[/\\]}}out/file_0{{[1-3]}}.cpp:5
+ 
+# CHECK-NORMAL-LOOKUP: 0x0000000000000248: my_func_0{{[1-3]}} @ /tmp/test_gsym_yaml{{[/\\]}}out/file_0{{[1-3]}}.cpp:5
 
 
 --- !mach-o
diff --git a/llvm/test/tools/llvm-lib/Inputs/mips.ll b/llvm/test/tools/llvm-lib/Inputs/mips.ll
new file mode 100644
index 0000000..dd0f8338
--- /dev/null
+++ b/llvm/test/tools/llvm-lib/Inputs/mips.ll
@@ -0,0 +1,7 @@
+target triple = "mipsel-windows-coff"
+
+; Function Attrs: noinline nounwind optnone
+define dso_local void @"?f@@YAXXZ"() #0 {
+entry:
+  ret void
+}
diff --git a/llvm/test/tools/llvm-lib/infer-machine.test b/llvm/test/tools/llvm-lib/infer-machine.test
new file mode 100644
index 0000000..23ecf25
--- /dev/null
+++ b/llvm/test/tools/llvm-lib/infer-machine.test
@@ -0,0 +1,21 @@
+REQUIRES: mips-registered-target
+
+RUN: rm -rf %t && mkdir -p %t
+
+RUN: llc -mtriple=i386-windows-coff -filetype=obj -o %t/i386.obj %S/Inputs/i386.ll
+RUN: llvm-as %S/Inputs/i386.ll -o %t/i386.bc
+RUN: llvm-lib %t/i386.obj %t/i386.bc /out:%t/i386.lib
+RUN: llvm-objdump -h %t/i386.lib | FileCheck %s --check-prefix=I386
+I386: file format coff-i386
+
+RUN: llc -mtriple=x86_64-windows-coff -filetype=obj -o %t/x86_64.obj %S/Inputs/x86_64.ll
+RUN: llvm-as %S/Inputs/x86_64.ll -o %t/x86_64.bc
+RUN: llvm-lib %t/x86_64.obj %t/x86_64.bc /out:%t/x86_64.lib
+RUN: llvm-objdump -h %t/x86_64.lib | FileCheck %s --check-prefix=X86_64
+X86_64: file format coff-x86-64
+
+RUN: llc -mtriple=mipsel-windows-coff -filetype=obj -o %t/mips.obj %S/Inputs/mips.ll
+RUN: llvm-as %S/Inputs/mips.ll -o %t/mips.bc
+RUN: llvm-lib %t/mips.obj %t/mips.bc /out:%t/mips.lib
+RUN: llvm-objdump -h %t/mips.lib | FileCheck %s --check-prefix=MIPS
+MIPS: file format coff-mips
diff --git a/llvm/test/tools/llvm-lib/machine-opt.test b/llvm/test/tools/llvm-lib/machine-opt.test
new file mode 100644
index 0000000..e5ade82
--- /dev/null
+++ b/llvm/test/tools/llvm-lib/machine-opt.test
@@ -0,0 +1,13 @@
+RUN: rm -f %t.lib
+
+RUN: llvm-lib /out:%t.lib /machine:i386 2>&1 | FileCheck --check-prefix=EMPTYWARN %s
+RUN: llvm-lib /out:%t.lib /machine:amd64 2>&1 | FileCheck --check-prefix=EMPTYWARN %s
+
+RUN: llvm-lib /out:%t.lib /machine:mips 2>&1 | FileCheck --check-prefix=EMPTYWARN %s
+
+RUN: llvm-lib /out:%t.lib /machine:arm 2>&1 | FileCheck --check-prefix=EMPTYWARN %s
+RUN: llvm-lib /out:%t.lib /machine:arm64 2>&1 | FileCheck --check-prefix=EMPTYWARN %s
+RUN: llvm-lib /out:%t.lib /machine:arm64x 2>&1 | FileCheck --check-prefix=EMPTYWARN %s
+
+EMPTYWARN: warning: no input files, not writing output file
+
diff --git a/llvm/test/tools/llvm-mca/AArch64/Neoverse/N2-basic-instructions.s b/llvm/test/tools/llvm-mca/AArch64/Neoverse/N2-basic-instructions.s
index f4c4a20..cf1cf0e 100644
--- a/llvm/test/tools/llvm-mca/AArch64/Neoverse/N2-basic-instructions.s
+++ b/llvm/test/tools/llvm-mca/AArch64/Neoverse/N2-basic-instructions.s
@@ -1891,7 +1891,7 @@ drps
 # CHECK-NEXT:  1      2     0.50                        fmov	s0, s1
 # CHECK-NEXT:  1      2     0.50                        fabs	s2, s3
 # CHECK-NEXT:  1      2     0.50                        fneg	s4, s5
-# CHECK-NEXT:  1      9     1.00                        fsqrt	s6, s7
+# CHECK-NEXT:  9      9     9.00                        fsqrt	s6, s7
 # CHECK-NEXT:  1      3     1.00                        fcvt	d8, s9
 # CHECK-NEXT:  1      3     1.00                        fcvt	h10, s11
 # CHECK-NEXT:  1      3     1.00                        frintn	s12, s13
@@ -1904,7 +1904,7 @@ drps
 # CHECK-NEXT:  1      2     0.50                        fmov	d0, d1
 # CHECK-NEXT:  1      2     0.50                        fabs	d2, d3
 # CHECK-NEXT:  1      2     0.50                        fneg	d4, d5
-# CHECK-NEXT:  1      16    1.00                        fsqrt	d6, d7
+# CHECK-NEXT:  16     16    16.00                       fsqrt	d6, d7
 # CHECK-NEXT:  1      3     1.00                        fcvt	s8, d9
 # CHECK-NEXT:  1      3     1.00                        fcvt	h10, d11
 # CHECK-NEXT:  1      3     1.00                        frintn	d12, d13
@@ -1917,7 +1917,7 @@ drps
 # CHECK-NEXT:  1      3     1.00                        fcvt	s26, h27
 # CHECK-NEXT:  1      3     1.00                        fcvt	d28, h29
 # CHECK-NEXT:  1      3     0.50                        fmul	s20, s19, s17
-# CHECK-NEXT:  1      10    1.00                        fdiv	s1, s2, s3
+# CHECK-NEXT:  10     10    10.00                       fdiv	s1, s2, s3
 # CHECK-NEXT:  1      2     0.50                        fadd	s4, s5, s6
 # CHECK-NEXT:  1      2     0.50                        fsub	s7, s8, s9
 # CHECK-NEXT:  1      2     0.50                        fmax	s10, s11, s12
@@ -1926,7 +1926,7 @@ drps
 # CHECK-NEXT:  1      2     0.50                        fminnm	s19, s20, s21
 # CHECK-NEXT:  1      3     0.50                        fnmul	s22, s23, s2
 # CHECK-NEXT:  1      3     0.50                        fmul	d20, d19, d17
-# CHECK-NEXT:  1      15    1.00                        fdiv	d1, d2, d3
+# CHECK-NEXT:  15     15    15.00                       fdiv	d1, d2, d3
 # CHECK-NEXT:  1      2     0.50                        fadd	d4, d5, d6
 # CHECK-NEXT:  1      2     0.50                        fsub	d7, d8, d9
 # CHECK-NEXT:  1      2     0.50                        fmax	d10, d11, d12
@@ -2557,7 +2557,7 @@ drps
 
 # CHECK:      Resource pressure per iteration:
 # CHECK-NEXT: [0.0]  [0.1]  [1.0]  [1.1]  [2]    [3.0]  [3.1]  [4]    [5]    [6.0]  [6.1]  [7]    [8]
-# CHECK-NEXT: 11.00  11.00  33.00  33.00  87.33  151.33 151.33 517.00 251.00 162.50 162.50 169.50 85.50
+# CHECK-NEXT: 11.00  11.00  33.00  33.00  87.33  151.33 151.33 517.00 251.00 162.50 162.50 215.50 85.50
 
 # CHECK:      Resource pressure by instruction:
 # CHECK-NEXT: [0.0]  [0.1]  [1.0]  [1.1]  [2]    [3.0]  [3.1]  [4]    [5]    [6.0]  [6.1]  [7]    [8]    Instructions:
@@ -3075,7 +3075,7 @@ drps
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     0.50   0.50   fmov	s0, s1
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     0.50   0.50   fabs	s2, s3
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     0.50   0.50   fneg	s4, s5
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     1.00    -     fsqrt	s6, s7
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     9.00    -     fsqrt	s6, s7
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     1.00    -     fcvt	d8, s9
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     1.00    -     fcvt	h10, s11
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     1.00    -     frintn	s12, s13
@@ -3088,7 +3088,7 @@ drps
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     0.50   0.50   fmov	d0, d1
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     0.50   0.50   fabs	d2, d3
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     0.50   0.50   fneg	d4, d5
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     1.00    -     fsqrt	d6, d7
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     16.00   -     fsqrt	d6, d7
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     1.00    -     fcvt	s8, d9
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     1.00    -     fcvt	h10, d11
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     1.00    -     frintn	d12, d13
@@ -3101,7 +3101,7 @@ drps
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     1.00    -     fcvt	s26, h27
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     1.00    -     fcvt	d28, h29
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     0.50   0.50   fmul	s20, s19, s17
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     1.00    -     fdiv	s1, s2, s3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     10.00   -     fdiv	s1, s2, s3
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     0.50   0.50   fadd	s4, s5, s6
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     0.50   0.50   fsub	s7, s8, s9
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     0.50   0.50   fmax	s10, s11, s12
@@ -3110,7 +3110,7 @@ drps
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     0.50   0.50   fminnm	s19, s20, s21
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     0.50   0.50   fnmul	s22, s23, s2
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     0.50   0.50   fmul	d20, d19, d17
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     1.00    -     fdiv	d1, d2, d3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     15.00   -     fdiv	d1, d2, d3
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     0.50   0.50   fadd	d4, d5, d6
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     0.50   0.50   fsub	d7, d8, d9
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     0.50   0.50   fmax	d10, d11, d12
diff --git a/llvm/test/tools/llvm-mca/ARM/cortex-a57-memory-instructions.s b/llvm/test/tools/llvm-mca/ARM/cortex-a57-memory-instructions.s
index 04c95f62f..36a2f04 100644
--- a/llvm/test/tools/llvm-mca/ARM/cortex-a57-memory-instructions.s
+++ b/llvm/test/tools/llvm-mca/ARM/cortex-a57-memory-instructions.s
@@ -325,11 +325,11 @@
 # CHECK-NEXT:  2      1     1.00           *            strd	r4, r5, [r12], -r10
 # CHECK-NEXT:  1      1     1.00           *            strh	r3, [r4]
 # CHECK-NEXT:  1      1     1.00           *            strh	r2, [r7, #4]
-# CHECK-NEXT:  2      1     1.00                  U     strh	r1, [r8, #64]!
+# CHECK-NEXT:  2      1     1.00           *            strh	r1, [r8, #64]!
 # CHECK-NEXT:  2      1     1.00           *            strh	r12, [sp], #4
 # CHECK-NEXT:  1      1     1.00           *            strh	r6, [r5, r4]
-# CHECK-NEXT:  2      1     1.00                  U     strh	r3, [r8, r11]!
-# CHECK-NEXT:  2      1     1.00                  U     strh	r1, [r2, -r1]!
+# CHECK-NEXT:  2      1     1.00           *            strh	r3, [r8, r11]!
+# CHECK-NEXT:  2      1     1.00           *            strh	r1, [r2, -r1]!
 # CHECK-NEXT:  2      1     1.00           *            strh	r9, [r7], r2
 # CHECK-NEXT:  2      1     1.00           *            strh	r4, [r3], -r2
 # CHECK-NEXT:  2      1     1.00                  U     strht	r2, [r5], #76
diff --git a/llvm/test/tools/llvm-mca/RISCV/SiFiveP400/div.s b/llvm/test/tools/llvm-mca/RISCV/SiFiveP400/div.s
new file mode 100644
index 0000000..c42b4a9
--- /dev/null
+++ b/llvm/test/tools/llvm-mca/RISCV/SiFiveP400/div.s
@@ -0,0 +1,1009 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=riscv64 -mcpu=sifive-p470 -iterations=1 < %s | FileCheck %s
+
+vsetvli zero, zero, e8, mf8, tu, mu
+vdiv.vv v8, v16, v24
+vsetvli zero, zero, e8, mf4, tu, mu
+vdiv.vv v8, v16, v24
+vsetvli zero, zero, e8, mf2, tu, mu
+vdiv.vv v8, v16, v24
+vsetvli zero, zero, e8, m1, tu, mu
+vdiv.vv v8, v16, v24
+vsetvli zero, zero, e8, m1, tu, mu
+vdiv.vv v8, v16, v24
+vsetvli zero, zero, e8, m2, tu, mu
+vdiv.vv v8, v16, v24
+vsetvli zero, zero, e8, m4, tu, mu
+vdiv.vv v8, v16, v24
+vsetvli zero, zero, e8, m8, tu, mu
+vdiv.vv v8, v16, v24
+vsetvli zero, zero, e16, mf8, tu, mu
+vdiv.vv v8, v16, v24
+vsetvli zero, zero, e16, mf4, tu, mu
+vdiv.vv v8, v16, v24
+vsetvli zero, zero, e16, mf2, tu, mu
+vdiv.vv v8, v16, v24
+vsetvli zero, zero, e16, m1, tu, mu
+vdiv.vv v8, v16, v24
+vsetvli zero, zero, e16, m1, tu, mu
+vdiv.vv v8, v16, v24
+vsetvli zero, zero, e16, m2, tu, mu
+vdiv.vv v8, v16, v24
+vsetvli zero, zero, e16, m4, tu, mu
+vdiv.vv v8, v16, v24
+vsetvli zero, zero, e16, m8, tu, mu
+vdiv.vv v8, v16, v24
+vsetvli zero, zero, e32, mf8, tu, mu
+vdiv.vv v8, v16, v24
+vsetvli zero, zero, e32, mf4, tu, mu
+vdiv.vv v8, v16, v24
+vsetvli zero, zero, e32, mf2, tu, mu
+vdiv.vv v8, v16, v24
+vsetvli zero, zero, e32, m1, tu, mu
+vdiv.vv v8, v16, v24
+vsetvli zero, zero, e32, m1, tu, mu
+vdiv.vv v8, v16, v24
+vsetvli zero, zero, e32, m2, tu, mu
+vdiv.vv v8, v16, v24
+vsetvli zero, zero, e32, m4, tu, mu
+vdiv.vv v8, v16, v24
+vsetvli zero, zero, e32, m8, tu, mu
+vdiv.vv v8, v16, v24
+vsetvli zero, zero, e64, mf8, tu, mu
+vdiv.vv v8, v16, v24
+vsetvli zero, zero, e64, mf4, tu, mu
+vdiv.vv v8, v16, v24
+vsetvli zero, zero, e64, mf2, tu, mu
+vdiv.vv v8, v16, v24
+vsetvli zero, zero, e64, m1, tu, mu
+vdiv.vv v8, v16, v24
+vsetvli zero, zero, e64, m1, tu, mu
+vdiv.vv v8, v16, v24
+vsetvli zero, zero, e64, m2, tu, mu
+vdiv.vv v8, v16, v24
+vsetvli zero, zero, e64, m4, tu, mu
+vdiv.vv v8, v16, v24
+vsetvli zero, zero, e64, m8, tu, mu
+vdiv.vv v8, v16, v24
+
+vsetvli zero, zero, e8, mf8, tu, mu
+vdiv.vx v8, v16, a0
+vsetvli zero, zero, e8, mf4, tu, mu
+vdiv.vx v8, v16, a0
+vsetvli zero, zero, e8, mf2, tu, mu
+vdiv.vx v8, v16, a0
+vsetvli zero, zero, e8, m1, tu, mu
+vdiv.vx v8, v16, a0
+vsetvli zero, zero, e8, m1, tu, mu
+vdiv.vx v8, v16, a0
+vsetvli zero, zero, e8, m2, tu, mu
+vdiv.vx v8, v16, a0
+vsetvli zero, zero, e8, m4, tu, mu
+vdiv.vx v8, v16, a0
+vsetvli zero, zero, e8, m8, tu, mu
+vdiv.vx v8, v16, a0
+vsetvli zero, zero, e16, mf8, tu, mu
+vdiv.vx v8, v16, a0
+vsetvli zero, zero, e16, mf4, tu, mu
+vdiv.vx v8, v16, a0
+vsetvli zero, zero, e16, mf2, tu, mu
+vdiv.vx v8, v16, a0
+vsetvli zero, zero, e16, m1, tu, mu
+vdiv.vx v8, v16, a0
+vsetvli zero, zero, e16, m1, tu, mu
+vdiv.vx v8, v16, a0
+vsetvli zero, zero, e16, m2, tu, mu
+vdiv.vx v8, v16, a0
+vsetvli zero, zero, e16, m4, tu, mu
+vdiv.vx v8, v16, a0
+vsetvli zero, zero, e16, m8, tu, mu
+vdiv.vx v8, v16, a0
+vsetvli zero, zero, e32, mf8, tu, mu
+vdiv.vx v8, v16, a0
+vsetvli zero, zero, e32, mf4, tu, mu
+vdiv.vx v8, v16, a0
+vsetvli zero, zero, e32, mf2, tu, mu
+vdiv.vx v8, v16, a0
+vsetvli zero, zero, e32, m1, tu, mu
+vdiv.vx v8, v16, a0
+vsetvli zero, zero, e32, m1, tu, mu
+vdiv.vx v8, v16, a0
+vsetvli zero, zero, e32, m2, tu, mu
+vdiv.vx v8, v16, a0
+vsetvli zero, zero, e32, m4, tu, mu
+vdiv.vx v8, v16, a0
+vsetvli zero, zero, e32, m8, tu, mu
+vdiv.vx v8, v16, a0
+vsetvli zero, zero, e64, mf8, tu, mu
+vdiv.vx v8, v16, a0
+vsetvli zero, zero, e64, mf4, tu, mu
+vdiv.vx v8, v16, a0
+vsetvli zero, zero, e64, mf2, tu, mu
+vdiv.vx v8, v16, a0
+vsetvli zero, zero, e64, m1, tu, mu
+vdiv.vx v8, v16, a0
+vsetvli zero, zero, e64, m1, tu, mu
+vdiv.vx v8, v16, a0
+vsetvli zero, zero, e64, m2, tu, mu
+vdiv.vx v8, v16, a0
+vsetvli zero, zero, e64, m4, tu, mu
+vdiv.vx v8, v16, a0
+vsetvli zero, zero, e64, m8, tu, mu
+vdiv.vx v8, v16, a0
+
+vsetvli zero, zero, e8, mf8, tu, mu
+vfdiv.vv v8, v16, v24
+vsetvli zero, zero, e8, mf4, tu, mu
+vfdiv.vv v8, v16, v24
+vsetvli zero, zero, e8, mf2, tu, mu
+vfdiv.vv v8, v16, v24
+vsetvli zero, zero, e8, m1, tu, mu
+vfdiv.vv v8, v16, v24
+vsetvli zero, zero, e8, m1, tu, mu
+vfdiv.vv v8, v16, v24
+vsetvli zero, zero, e8, m2, tu, mu
+vfdiv.vv v8, v16, v24
+vsetvli zero, zero, e8, m4, tu, mu
+vfdiv.vv v8, v16, v24
+vsetvli zero, zero, e8, m8, tu, mu
+vfdiv.vv v8, v16, v24
+vsetvli zero, zero, e16, mf8, tu, mu
+vfdiv.vv v8, v16, v24
+vsetvli zero, zero, e16, mf4, tu, mu
+vfdiv.vv v8, v16, v24
+vsetvli zero, zero, e16, mf2, tu, mu
+vfdiv.vv v8, v16, v24
+vsetvli zero, zero, e16, m1, tu, mu
+vfdiv.vv v8, v16, v24
+vsetvli zero, zero, e16, m1, tu, mu
+vfdiv.vv v8, v16, v24
+vsetvli zero, zero, e16, m2, tu, mu
+vfdiv.vv v8, v16, v24
+vsetvli zero, zero, e16, m4, tu, mu
+vfdiv.vv v8, v16, v24
+vsetvli zero, zero, e16, m8, tu, mu
+vfdiv.vv v8, v16, v24
+vsetvli zero, zero, e32, mf8, tu, mu
+vfdiv.vv v8, v16, v24
+vsetvli zero, zero, e32, mf4, tu, mu
+vfdiv.vv v8, v16, v24
+vsetvli zero, zero, e32, mf2, tu, mu
+vfdiv.vv v8, v16, v24
+vsetvli zero, zero, e32, m1, tu, mu
+vfdiv.vv v8, v16, v24
+vsetvli zero, zero, e32, m1, tu, mu
+vfdiv.vv v8, v16, v24
+vsetvli zero, zero, e32, m2, tu, mu
+vfdiv.vv v8, v16, v24
+vsetvli zero, zero, e32, m4, tu, mu
+vfdiv.vv v8, v16, v24
+vsetvli zero, zero, e32, m8, tu, mu
+vfdiv.vv v8, v16, v24
+vsetvli zero, zero, e64, mf8, tu, mu
+vfdiv.vv v8, v16, v24
+vsetvli zero, zero, e64, mf4, tu, mu
+vfdiv.vv v8, v16, v24
+vsetvli zero, zero, e64, mf2, tu, mu
+vfdiv.vv v8, v16, v24
+vsetvli zero, zero, e64, m1, tu, mu
+vfdiv.vv v8, v16, v24
+vsetvli zero, zero, e64, m1, tu, mu
+vfdiv.vv v8, v16, v24
+vsetvli zero, zero, e64, m2, tu, mu
+vfdiv.vv v8, v16, v24
+vsetvli zero, zero, e64, m4, tu, mu
+vfdiv.vv v8, v16, v24
+vsetvli zero, zero, e64, m8, tu, mu
+vfdiv.vv v8, v16, v24
+
+vsetvli zero, zero, e8, mf8, tu, mu
+vfdiv.vf v8, v16, fa0
+vsetvli zero, zero, e8, mf4, tu, mu
+vfdiv.vf v8, v16, fa0
+vsetvli zero, zero, e8, mf2, tu, mu
+vfdiv.vf v8, v16, fa0
+vsetvli zero, zero, e8, m1, tu, mu
+vfdiv.vf v8, v16, fa0
+vsetvli zero, zero, e8, m1, tu, mu
+vfdiv.vf v8, v16, fa0
+vsetvli zero, zero, e8, m2, tu, mu
+vfdiv.vf v8, v16, fa0
+vsetvli zero, zero, e8, m4, tu, mu
+vfdiv.vf v8, v16, fa0
+vsetvli zero, zero, e8, m8, tu, mu
+vfdiv.vf v8, v16, fa0
+vsetvli zero, zero, e16, mf8, tu, mu
+vfdiv.vf v8, v16, fa0
+vsetvli zero, zero, e16, mf4, tu, mu
+vfdiv.vf v8, v16, fa0
+vsetvli zero, zero, e16, mf2, tu, mu
+vfdiv.vf v8, v16, fa0
+vsetvli zero, zero, e16, m1, tu, mu
+vfdiv.vf v8, v16, fa0
+vsetvli zero, zero, e16, m1, tu, mu
+vfdiv.vf v8, v16, fa0
+vsetvli zero, zero, e16, m2, tu, mu
+vfdiv.vf v8, v16, fa0
+vsetvli zero, zero, e16, m4, tu, mu
+vfdiv.vf v8, v16, fa0
+vsetvli zero, zero, e16, m8, tu, mu
+vfdiv.vf v8, v16, fa0
+vsetvli zero, zero, e32, mf8, tu, mu
+vfdiv.vf v8, v16, fa0
+vsetvli zero, zero, e32, mf4, tu, mu
+vfdiv.vf v8, v16, fa0
+vsetvli zero, zero, e32, mf2, tu, mu
+vfdiv.vf v8, v16, fa0
+vsetvli zero, zero, e32, m1, tu, mu
+vfdiv.vf v8, v16, fa0
+vsetvli zero, zero, e32, m1, tu, mu
+vfdiv.vf v8, v16, fa0
+vsetvli zero, zero, e32, m2, tu, mu
+vfdiv.vf v8, v16, fa0
+vsetvli zero, zero, e32, m4, tu, mu
+vfdiv.vf v8, v16, fa0
+vsetvli zero, zero, e32, m8, tu, mu
+vfdiv.vf v8, v16, fa0
+vsetvli zero, zero, e64, mf8, tu, mu
+vfdiv.vf v8, v16, fa0
+vsetvli zero, zero, e64, mf4, tu, mu
+vfdiv.vf v8, v16, fa0
+vsetvli zero, zero, e64, mf2, tu, mu
+vfdiv.vf v8, v16, fa0
+vsetvli zero, zero, e64, m1, tu, mu
+vfdiv.vf v8, v16, fa0
+vsetvli zero, zero, e64, m1, tu, mu
+vfdiv.vf v8, v16, fa0
+vsetvli zero, zero, e64, m2, tu, mu
+vfdiv.vf v8, v16, fa0
+vsetvli zero, zero, e64, m4, tu, mu
+vfdiv.vf v8, v16, fa0
+vsetvli zero, zero, e64, m8, tu, mu
+vfdiv.vf v8, v16, fa0
+
+vsetvli zero, zero, e8, mf8, tu, mu
+vfsqrt.v v8, v16
+vsetvli zero, zero, e8, mf4, tu, mu
+vfsqrt.v v8, v16
+vsetvli zero, zero, e8, mf2, tu, mu
+vfsqrt.v v8, v16
+vsetvli zero, zero, e8, m1, tu, mu
+vfsqrt.v v8, v16
+vsetvli zero, zero, e8, m1, tu, mu
+vfsqrt.v v8, v16
+vsetvli zero, zero, e8, m2, tu, mu
+vfsqrt.v v8, v16
+vsetvli zero, zero, e8, m4, tu, mu
+vfsqrt.v v8, v16
+vsetvli zero, zero, e8, m8, tu, mu
+vfsqrt.v v8, v16
+vsetvli zero, zero, e16, mf8, tu, mu
+vfsqrt.v v8, v16
+vsetvli zero, zero, e16, mf4, tu, mu
+vfsqrt.v v8, v16
+vsetvli zero, zero, e16, mf2, tu, mu
+vfsqrt.v v8, v16
+vsetvli zero, zero, e16, m1, tu, mu
+vfsqrt.v v8, v16
+vsetvli zero, zero, e16, m1, tu, mu
+vfsqrt.v v8, v16
+vsetvli zero, zero, e16, m2, tu, mu
+vfsqrt.v v8, v16
+vsetvli zero, zero, e16, m4, tu, mu
+vfsqrt.v v8, v16
+vsetvli zero, zero, e16, m8, tu, mu
+vfsqrt.v v8, v16
+vsetvli zero, zero, e32, mf8, tu, mu
+vfsqrt.v v8, v16
+vsetvli zero, zero, e32, mf4, tu, mu
+vfsqrt.v v8, v16
+vsetvli zero, zero, e32, mf2, tu, mu
+vfsqrt.v v8, v16
+vsetvli zero, zero, e32, m1, tu, mu
+vfsqrt.v v8, v16
+vsetvli zero, zero, e32, m1, tu, mu
+vfsqrt.v v8, v16
+vsetvli zero, zero, e32, m2, tu, mu
+vfsqrt.v v8, v16
+vsetvli zero, zero, e32, m4, tu, mu
+vfsqrt.v v8, v16
+vsetvli zero, zero, e32, m8, tu, mu
+vfsqrt.v v8, v16
+vsetvli zero, zero, e64, mf8, tu, mu
+vfsqrt.v v8, v16
+vsetvli zero, zero, e64, mf4, tu, mu
+vfsqrt.v v8, v16
+vsetvli zero, zero, e64, mf2, tu, mu
+vfsqrt.v v8, v16
+vsetvli zero, zero, e64, m1, tu, mu
+vfsqrt.v v8, v16
+vsetvli zero, zero, e64, m1, tu, mu
+vfsqrt.v v8, v16
+vsetvli zero, zero, e64, m2, tu, mu
+vfsqrt.v v8, v16
+vsetvli zero, zero, e64, m4, tu, mu
+vfsqrt.v v8, v16
+vsetvli zero, zero, e64, m8, tu, mu
+vfsqrt.v v8, v16
+
+# CHECK:      Iterations:        1
+# CHECK-NEXT: Instructions:      320
+# CHECK-NEXT: Total Cycles:      22358
+# CHECK-NEXT: Total uOps:        320
+
+# CHECK:      Dispatch Width:    3
+# CHECK-NEXT: uOps Per Cycle:    0.01
+# CHECK-NEXT: IPC:               0.01
+# CHECK-NEXT: Block RThroughput: 14361.0
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e8, mf8, tu, mu
+# CHECK-NEXT:  1      51    51.00                       vdiv.vv	v8, v16, v24
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e8, mf4, tu, mu
+# CHECK-NEXT:  1      51    51.00                       vdiv.vv	v8, v16, v24
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e8, mf2, tu, mu
+# CHECK-NEXT:  1      51    51.00                       vdiv.vv	v8, v16, v24
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e8, m1, tu, mu
+# CHECK-NEXT:  1      51    51.00                       vdiv.vv	v8, v16, v24
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e8, m1, tu, mu
+# CHECK-NEXT:  1      51    51.00                       vdiv.vv	v8, v16, v24
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e8, m2, tu, mu
+# CHECK-NEXT:  1      102   102.00                      vdiv.vv	v8, v16, v24
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e8, m4, tu, mu
+# CHECK-NEXT:  1      204   204.00                      vdiv.vv	v8, v16, v24
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e8, m8, tu, mu
+# CHECK-NEXT:  1      408   408.00                      vdiv.vv	v8, v16, v24
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e16, mf8, tu, mu
+# CHECK-NEXT:  1      408   408.00                      vdiv.vv	v8, v16, v24
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e16, mf4, tu, mu
+# CHECK-NEXT:  1      45    45.00                       vdiv.vv	v8, v16, v24
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e16, mf2, tu, mu
+# CHECK-NEXT:  1      45    45.00                       vdiv.vv	v8, v16, v24
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e16, m1, tu, mu
+# CHECK-NEXT:  1      45    45.00                       vdiv.vv	v8, v16, v24
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e16, m1, tu, mu
+# CHECK-NEXT:  1      45    45.00                       vdiv.vv	v8, v16, v24
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e16, m2, tu, mu
+# CHECK-NEXT:  1      90    90.00                       vdiv.vv	v8, v16, v24
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e16, m4, tu, mu
+# CHECK-NEXT:  1      180   180.00                      vdiv.vv	v8, v16, v24
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e16, m8, tu, mu
+# CHECK-NEXT:  1      360   360.00                      vdiv.vv	v8, v16, v24
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e32, mf8, tu, mu
+# CHECK-NEXT:  1      408   408.00                      vdiv.vv	v8, v16, v24
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e32, mf4, tu, mu
+# CHECK-NEXT:  1      408   408.00                      vdiv.vv	v8, v16, v24
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e32, mf2, tu, mu
+# CHECK-NEXT:  1      42    42.00                       vdiv.vv	v8, v16, v24
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e32, m1, tu, mu
+# CHECK-NEXT:  1      42    42.00                       vdiv.vv	v8, v16, v24
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e32, m1, tu, mu
+# CHECK-NEXT:  1      42    42.00                       vdiv.vv	v8, v16, v24
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e32, m2, tu, mu
+# CHECK-NEXT:  1      84    84.00                       vdiv.vv	v8, v16, v24
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e32, m4, tu, mu
+# CHECK-NEXT:  1      168   168.00                      vdiv.vv	v8, v16, v24
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e32, m8, tu, mu
+# CHECK-NEXT:  1      336   336.00                      vdiv.vv	v8, v16, v24
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e64, mf8, tu, mu
+# CHECK-NEXT:  1      408   408.00                      vdiv.vv	v8, v16, v24
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e64, mf4, tu, mu
+# CHECK-NEXT:  1      408   408.00                      vdiv.vv	v8, v16, v24
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e64, mf2, tu, mu
+# CHECK-NEXT:  1      408   408.00                      vdiv.vv	v8, v16, v24
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e64, m1, tu, mu
+# CHECK-NEXT:  1      72    72.00                       vdiv.vv	v8, v16, v24
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e64, m1, tu, mu
+# CHECK-NEXT:  1      72    72.00                       vdiv.vv	v8, v16, v24
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e64, m2, tu, mu
+# CHECK-NEXT:  1      144   144.00                      vdiv.vv	v8, v16, v24
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e64, m4, tu, mu
+# CHECK-NEXT:  1      288   288.00                      vdiv.vv	v8, v16, v24
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e64, m8, tu, mu
+# CHECK-NEXT:  1      576   576.00                      vdiv.vv	v8, v16, v24
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e8, mf8, tu, mu
+# CHECK-NEXT:  1      51    51.00                       vdiv.vx	v8, v16, a0
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e8, mf4, tu, mu
+# CHECK-NEXT:  1      51    51.00                       vdiv.vx	v8, v16, a0
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e8, mf2, tu, mu
+# CHECK-NEXT:  1      51    51.00                       vdiv.vx	v8, v16, a0
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e8, m1, tu, mu
+# CHECK-NEXT:  1      51    51.00                       vdiv.vx	v8, v16, a0
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e8, m1, tu, mu
+# CHECK-NEXT:  1      51    51.00                       vdiv.vx	v8, v16, a0
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e8, m2, tu, mu
+# CHECK-NEXT:  1      102   102.00                      vdiv.vx	v8, v16, a0
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e8, m4, tu, mu
+# CHECK-NEXT:  1      204   204.00                      vdiv.vx	v8, v16, a0
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e8, m8, tu, mu
+# CHECK-NEXT:  1      408   408.00                      vdiv.vx	v8, v16, a0
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e16, mf8, tu, mu
+# CHECK-NEXT:  1      408   408.00                      vdiv.vx	v8, v16, a0
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e16, mf4, tu, mu
+# CHECK-NEXT:  1      45    45.00                       vdiv.vx	v8, v16, a0
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e16, mf2, tu, mu
+# CHECK-NEXT:  1      45    45.00                       vdiv.vx	v8, v16, a0
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e16, m1, tu, mu
+# CHECK-NEXT:  1      45    45.00                       vdiv.vx	v8, v16, a0
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e16, m1, tu, mu
+# CHECK-NEXT:  1      45    45.00                       vdiv.vx	v8, v16, a0
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e16, m2, tu, mu
+# CHECK-NEXT:  1      90    90.00                       vdiv.vx	v8, v16, a0
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e16, m4, tu, mu
+# CHECK-NEXT:  1      180   180.00                      vdiv.vx	v8, v16, a0
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e16, m8, tu, mu
+# CHECK-NEXT:  1      360   360.00                      vdiv.vx	v8, v16, a0
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e32, mf8, tu, mu
+# CHECK-NEXT:  1      408   408.00                      vdiv.vx	v8, v16, a0
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e32, mf4, tu, mu
+# CHECK-NEXT:  1      408   408.00                      vdiv.vx	v8, v16, a0
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e32, mf2, tu, mu
+# CHECK-NEXT:  1      42    42.00                       vdiv.vx	v8, v16, a0
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e32, m1, tu, mu
+# CHECK-NEXT:  1      42    42.00                       vdiv.vx	v8, v16, a0
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e32, m1, tu, mu
+# CHECK-NEXT:  1      42    42.00                       vdiv.vx	v8, v16, a0
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e32, m2, tu, mu
+# CHECK-NEXT:  1      84    84.00                       vdiv.vx	v8, v16, a0
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e32, m4, tu, mu
+# CHECK-NEXT:  1      168   168.00                      vdiv.vx	v8, v16, a0
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e32, m8, tu, mu
+# CHECK-NEXT:  1      336   336.00                      vdiv.vx	v8, v16, a0
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e64, mf8, tu, mu
+# CHECK-NEXT:  1      408   408.00                      vdiv.vx	v8, v16, a0
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e64, mf4, tu, mu
+# CHECK-NEXT:  1      408   408.00                      vdiv.vx	v8, v16, a0
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e64, mf2, tu, mu
+# CHECK-NEXT:  1      408   408.00                      vdiv.vx	v8, v16, a0
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e64, m1, tu, mu
+# CHECK-NEXT:  1      72    72.00                       vdiv.vx	v8, v16, a0
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e64, m1, tu, mu
+# CHECK-NEXT:  1      72    72.00                       vdiv.vx	v8, v16, a0
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e64, m2, tu, mu
+# CHECK-NEXT:  1      144   144.00                      vdiv.vx	v8, v16, a0
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e64, m4, tu, mu
+# CHECK-NEXT:  1      288   288.00                      vdiv.vx	v8, v16, a0
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e64, m8, tu, mu
+# CHECK-NEXT:  1      576   576.00                      vdiv.vx	v8, v16, a0
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e8, mf8, tu, mu
+# CHECK-NEXT:  1      232   232.00                      vfdiv.vv	v8, v16, v24
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e8, mf4, tu, mu
+# CHECK-NEXT:  1      232   232.00                      vfdiv.vv	v8, v16, v24
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e8, mf2, tu, mu
+# CHECK-NEXT:  1      232   232.00                      vfdiv.vv	v8, v16, v24
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e8, m1, tu, mu
+# CHECK-NEXT:  1      232   232.00                      vfdiv.vv	v8, v16, v24
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e8, m1, tu, mu
+# CHECK-NEXT:  1      232   232.00                      vfdiv.vv	v8, v16, v24
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e8, m2, tu, mu
+# CHECK-NEXT:  1      232   232.00                      vfdiv.vv	v8, v16, v24
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e8, m4, tu, mu
+# CHECK-NEXT:  1      232   232.00                      vfdiv.vv	v8, v16, v24
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e8, m8, tu, mu
+# CHECK-NEXT:  1      232   232.00                      vfdiv.vv	v8, v16, v24
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e16, mf8, tu, mu
+# CHECK-NEXT:  1      232   232.00                      vfdiv.vv	v8, v16, v24
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e16, mf4, tu, mu
+# CHECK-NEXT:  1      29    29.00                       vfdiv.vv	v8, v16, v24
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e16, mf2, tu, mu
+# CHECK-NEXT:  1      29    29.00                       vfdiv.vv	v8, v16, v24
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e16, m1, tu, mu
+# CHECK-NEXT:  1      29    29.00                       vfdiv.vv	v8, v16, v24
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e16, m1, tu, mu
+# CHECK-NEXT:  1      29    29.00                       vfdiv.vv	v8, v16, v24
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e16, m2, tu, mu
+# CHECK-NEXT:  1      58    58.00                       vfdiv.vv	v8, v16, v24
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e16, m4, tu, mu
+# CHECK-NEXT:  1      116   116.00                      vfdiv.vv	v8, v16, v24
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e16, m8, tu, mu
+# CHECK-NEXT:  1      232   232.00                      vfdiv.vv	v8, v16, v24
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e32, mf8, tu, mu
+# CHECK-NEXT:  1      232   232.00                      vfdiv.vv	v8, v16, v24
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e32, mf4, tu, mu
+# CHECK-NEXT:  1      232   232.00                      vfdiv.vv	v8, v16, v24
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e32, mf2, tu, mu
+# CHECK-NEXT:  1      25    25.00                       vfdiv.vv	v8, v16, v24
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e32, m1, tu, mu
+# CHECK-NEXT:  1      25    25.00                       vfdiv.vv	v8, v16, v24
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e32, m1, tu, mu
+# CHECK-NEXT:  1      25    25.00                       vfdiv.vv	v8, v16, v24
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e32, m2, tu, mu
+# CHECK-NEXT:  1      50    50.00                       vfdiv.vv	v8, v16, v24
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e32, m4, tu, mu
+# CHECK-NEXT:  1      100   100.00                      vfdiv.vv	v8, v16, v24
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e32, m8, tu, mu
+# CHECK-NEXT:  1      200   200.00                      vfdiv.vv	v8, v16, v24
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e64, mf8, tu, mu
+# CHECK-NEXT:  1      232   232.00                      vfdiv.vv	v8, v16, v24
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e64, mf4, tu, mu
+# CHECK-NEXT:  1      232   232.00                      vfdiv.vv	v8, v16, v24
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e64, mf2, tu, mu
+# CHECK-NEXT:  1      232   232.00                      vfdiv.vv	v8, v16, v24
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e64, m1, tu, mu
+# CHECK-NEXT:  1      37    37.00                       vfdiv.vv	v8, v16, v24
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e64, m1, tu, mu
+# CHECK-NEXT:  1      37    37.00                       vfdiv.vv	v8, v16, v24
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e64, m2, tu, mu
+# CHECK-NEXT:  1      74    74.00                       vfdiv.vv	v8, v16, v24
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e64, m4, tu, mu
+# CHECK-NEXT:  1      148   148.00                      vfdiv.vv	v8, v16, v24
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e64, m8, tu, mu
+# CHECK-NEXT:  1      296   296.00                      vfdiv.vv	v8, v16, v24
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e8, mf8, tu, mu
+# CHECK-NEXT:  1      232   232.00                      vfdiv.vf	v8, v16, fa0
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e8, mf4, tu, mu
+# CHECK-NEXT:  1      232   232.00                      vfdiv.vf	v8, v16, fa0
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e8, mf2, tu, mu
+# CHECK-NEXT:  1      232   232.00                      vfdiv.vf	v8, v16, fa0
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e8, m1, tu, mu
+# CHECK-NEXT:  1      232   232.00                      vfdiv.vf	v8, v16, fa0
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e8, m1, tu, mu
+# CHECK-NEXT:  1      232   232.00                      vfdiv.vf	v8, v16, fa0
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e8, m2, tu, mu
+# CHECK-NEXT:  1      232   232.00                      vfdiv.vf	v8, v16, fa0
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e8, m4, tu, mu
+# CHECK-NEXT:  1      232   232.00                      vfdiv.vf	v8, v16, fa0
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e8, m8, tu, mu
+# CHECK-NEXT:  1      232   232.00                      vfdiv.vf	v8, v16, fa0
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e16, mf8, tu, mu
+# CHECK-NEXT:  1      232   232.00                      vfdiv.vf	v8, v16, fa0
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e16, mf4, tu, mu
+# CHECK-NEXT:  1      29    29.00                       vfdiv.vf	v8, v16, fa0
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e16, mf2, tu, mu
+# CHECK-NEXT:  1      29    29.00                       vfdiv.vf	v8, v16, fa0
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e16, m1, tu, mu
+# CHECK-NEXT:  1      29    29.00                       vfdiv.vf	v8, v16, fa0
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e16, m1, tu, mu
+# CHECK-NEXT:  1      29    29.00                       vfdiv.vf	v8, v16, fa0
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e16, m2, tu, mu
+# CHECK-NEXT:  1      58    58.00                       vfdiv.vf	v8, v16, fa0
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e16, m4, tu, mu
+# CHECK-NEXT:  1      116   116.00                      vfdiv.vf	v8, v16, fa0
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e16, m8, tu, mu
+# CHECK-NEXT:  1      232   232.00                      vfdiv.vf	v8, v16, fa0
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e32, mf8, tu, mu
+# CHECK-NEXT:  1      232   232.00                      vfdiv.vf	v8, v16, fa0
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e32, mf4, tu, mu
+# CHECK-NEXT:  1      232   232.00                      vfdiv.vf	v8, v16, fa0
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e32, mf2, tu, mu
+# CHECK-NEXT:  1      25    25.00                       vfdiv.vf	v8, v16, fa0
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e32, m1, tu, mu
+# CHECK-NEXT:  1      25    25.00                       vfdiv.vf	v8, v16, fa0
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e32, m1, tu, mu
+# CHECK-NEXT:  1      25    25.00                       vfdiv.vf	v8, v16, fa0
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e32, m2, tu, mu
+# CHECK-NEXT:  1      50    50.00                       vfdiv.vf	v8, v16, fa0
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e32, m4, tu, mu
+# CHECK-NEXT:  1      100   100.00                      vfdiv.vf	v8, v16, fa0
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e32, m8, tu, mu
+# CHECK-NEXT:  1      200   200.00                      vfdiv.vf	v8, v16, fa0
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e64, mf8, tu, mu
+# CHECK-NEXT:  1      232   232.00                      vfdiv.vf	v8, v16, fa0
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e64, mf4, tu, mu
+# CHECK-NEXT:  1      232   232.00                      vfdiv.vf	v8, v16, fa0
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e64, mf2, tu, mu
+# CHECK-NEXT:  1      232   232.00                      vfdiv.vf	v8, v16, fa0
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e64, m1, tu, mu
+# CHECK-NEXT:  1      37    37.00                       vfdiv.vf	v8, v16, fa0
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e64, m1, tu, mu
+# CHECK-NEXT:  1      37    37.00                       vfdiv.vf	v8, v16, fa0
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e64, m2, tu, mu
+# CHECK-NEXT:  1      74    74.00                       vfdiv.vf	v8, v16, fa0
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e64, m4, tu, mu
+# CHECK-NEXT:  1      148   148.00                      vfdiv.vf	v8, v16, fa0
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e64, m8, tu, mu
+# CHECK-NEXT:  1      296   296.00                      vfdiv.vf	v8, v16, fa0
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e8, mf8, tu, mu
+# CHECK-NEXT:  1      232   232.00                      vfsqrt.v	v8, v16
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e8, mf4, tu, mu
+# CHECK-NEXT:  1      232   232.00                      vfsqrt.v	v8, v16
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e8, mf2, tu, mu
+# CHECK-NEXT:  1      232   232.00                      vfsqrt.v	v8, v16
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e8, m1, tu, mu
+# CHECK-NEXT:  1      232   232.00                      vfsqrt.v	v8, v16
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e8, m1, tu, mu
+# CHECK-NEXT:  1      232   232.00                      vfsqrt.v	v8, v16
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e8, m2, tu, mu
+# CHECK-NEXT:  1      232   232.00                      vfsqrt.v	v8, v16
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e8, m4, tu, mu
+# CHECK-NEXT:  1      232   232.00                      vfsqrt.v	v8, v16
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e8, m8, tu, mu
+# CHECK-NEXT:  1      232   232.00                      vfsqrt.v	v8, v16
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e16, mf8, tu, mu
+# CHECK-NEXT:  1      232   232.00                      vfsqrt.v	v8, v16
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e16, mf4, tu, mu
+# CHECK-NEXT:  1      29    29.00                       vfsqrt.v	v8, v16
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e16, mf2, tu, mu
+# CHECK-NEXT:  1      29    29.00                       vfsqrt.v	v8, v16
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e16, m1, tu, mu
+# CHECK-NEXT:  1      29    29.00                       vfsqrt.v	v8, v16
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e16, m1, tu, mu
+# CHECK-NEXT:  1      29    29.00                       vfsqrt.v	v8, v16
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e16, m2, tu, mu
+# CHECK-NEXT:  1      58    58.00                       vfsqrt.v	v8, v16
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e16, m4, tu, mu
+# CHECK-NEXT:  1      116   116.00                      vfsqrt.v	v8, v16
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e16, m8, tu, mu
+# CHECK-NEXT:  1      232   232.00                      vfsqrt.v	v8, v16
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e32, mf8, tu, mu
+# CHECK-NEXT:  1      232   232.00                      vfsqrt.v	v8, v16
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e32, mf4, tu, mu
+# CHECK-NEXT:  1      232   232.00                      vfsqrt.v	v8, v16
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e32, mf2, tu, mu
+# CHECK-NEXT:  1      25    25.00                       vfsqrt.v	v8, v16
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e32, m1, tu, mu
+# CHECK-NEXT:  1      25    25.00                       vfsqrt.v	v8, v16
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e32, m1, tu, mu
+# CHECK-NEXT:  1      25    25.00                       vfsqrt.v	v8, v16
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e32, m2, tu, mu
+# CHECK-NEXT:  1      50    50.00                       vfsqrt.v	v8, v16
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e32, m4, tu, mu
+# CHECK-NEXT:  1      100   100.00                      vfsqrt.v	v8, v16
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e32, m8, tu, mu
+# CHECK-NEXT:  1      200   200.00                      vfsqrt.v	v8, v16
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e64, mf8, tu, mu
+# CHECK-NEXT:  1      232   232.00                      vfsqrt.v	v8, v16
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e64, mf4, tu, mu
+# CHECK-NEXT:  1      232   232.00                      vfsqrt.v	v8, v16
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e64, mf2, tu, mu
+# CHECK-NEXT:  1      232   232.00                      vfsqrt.v	v8, v16
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e64, m1, tu, mu
+# CHECK-NEXT:  1      37    37.00                       vfsqrt.v	v8, v16
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e64, m1, tu, mu
+# CHECK-NEXT:  1      37    37.00                       vfsqrt.v	v8, v16
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e64, m2, tu, mu
+# CHECK-NEXT:  1      74    74.00                       vfsqrt.v	v8, v16
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e64, m4, tu, mu
+# CHECK-NEXT:  1      148   148.00                      vfsqrt.v	v8, v16
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e64, m8, tu, mu
+# CHECK-NEXT:  1      296   296.00                      vfsqrt.v	v8, v16
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0]   - SiFiveP400Div
+# CHECK-NEXT: [1]   - SiFiveP400FEXQ0
+# CHECK-NEXT: [2]   - SiFiveP400FloatDiv
+# CHECK-NEXT: [3]   - SiFiveP400IEXQ0
+# CHECK-NEXT: [4]   - SiFiveP400IEXQ1
+# CHECK-NEXT: [5]   - SiFiveP400IEXQ2
+# CHECK-NEXT: [6]   - SiFiveP400Load
+# CHECK-NEXT: [7]   - SiFiveP400Store
+# CHECK-NEXT: [8]   - SiFiveP400VDiv
+# CHECK-NEXT: [9]   - SiFiveP400VEXQ0
+# CHECK-NEXT: [10]  - SiFiveP400VFloatDiv
+# CHECK-NEXT: [11]  - SiFiveP400VLD
+# CHECK-NEXT: [12]  - SiFiveP400VST
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]    [10]   [11]   [12]
+# CHECK-NEXT:  -      -      -      -     160.00  -      -      -     12186.00 725.00 14361.00  -   -
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]    [10]   [11]   [12]   Instructions:
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e8, mf8, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     51.00  1.00    -      -      -     vdiv.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e8, mf4, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     51.00  1.00    -      -      -     vdiv.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e8, mf2, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     51.00  1.00    -      -      -     vdiv.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e8, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     51.00  1.00    -      -      -     vdiv.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e8, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     51.00  1.00    -      -      -     vdiv.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e8, m2, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     102.00 2.00    -      -      -     vdiv.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e8, m4, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     204.00 4.00    -      -      -     vdiv.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e8, m8, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     408.00 8.00    -      -      -     vdiv.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e16, mf8, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     408.00 8.00    -      -      -     vdiv.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e16, mf4, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     45.00  1.00    -      -      -     vdiv.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e16, mf2, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     45.00  1.00    -      -      -     vdiv.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e16, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     45.00  1.00    -      -      -     vdiv.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e16, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     45.00  1.00    -      -      -     vdiv.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e16, m2, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     90.00  2.00    -      -      -     vdiv.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e16, m4, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     180.00 4.00    -      -      -     vdiv.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e16, m8, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     360.00 8.00    -      -      -     vdiv.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e32, mf8, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     408.00 8.00    -      -      -     vdiv.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e32, mf4, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     408.00 8.00    -      -      -     vdiv.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e32, mf2, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     42.00  1.00    -      -      -     vdiv.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e32, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     42.00  1.00    -      -      -     vdiv.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e32, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     42.00  1.00    -      -      -     vdiv.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e32, m2, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     84.00  2.00    -      -      -     vdiv.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e32, m4, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     168.00 4.00    -      -      -     vdiv.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e32, m8, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     336.00 8.00    -      -      -     vdiv.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e64, mf8, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     408.00 8.00    -      -      -     vdiv.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e64, mf4, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     408.00 8.00    -      -      -     vdiv.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e64, mf2, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     408.00 8.00    -      -      -     vdiv.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e64, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     72.00  1.00    -      -      -     vdiv.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e64, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     72.00  1.00    -      -      -     vdiv.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e64, m2, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     144.00 2.00    -      -      -     vdiv.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e64, m4, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     288.00 4.00    -      -      -     vdiv.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e64, m8, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     576.00 8.00    -      -      -     vdiv.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e8, mf8, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     51.00  1.00    -      -      -     vdiv.vx	v8, v16, a0
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e8, mf4, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     51.00  1.00    -      -      -     vdiv.vx	v8, v16, a0
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e8, mf2, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     51.00  1.00    -      -      -     vdiv.vx	v8, v16, a0
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e8, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     51.00  1.00    -      -      -     vdiv.vx	v8, v16, a0
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e8, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     51.00  1.00    -      -      -     vdiv.vx	v8, v16, a0
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e8, m2, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     102.00 2.00    -      -      -     vdiv.vx	v8, v16, a0
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e8, m4, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     204.00 4.00    -      -      -     vdiv.vx	v8, v16, a0
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e8, m8, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     408.00 8.00    -      -      -     vdiv.vx	v8, v16, a0
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e16, mf8, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     408.00 8.00    -      -      -     vdiv.vx	v8, v16, a0
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e16, mf4, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     45.00  1.00    -      -      -     vdiv.vx	v8, v16, a0
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e16, mf2, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     45.00  1.00    -      -      -     vdiv.vx	v8, v16, a0
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e16, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     45.00  1.00    -      -      -     vdiv.vx	v8, v16, a0
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e16, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     45.00  1.00    -      -      -     vdiv.vx	v8, v16, a0
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e16, m2, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     90.00  2.00    -      -      -     vdiv.vx	v8, v16, a0
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e16, m4, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     180.00 4.00    -      -      -     vdiv.vx	v8, v16, a0
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e16, m8, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     360.00 8.00    -      -      -     vdiv.vx	v8, v16, a0
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e32, mf8, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     408.00 8.00    -      -      -     vdiv.vx	v8, v16, a0
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e32, mf4, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     408.00 8.00    -      -      -     vdiv.vx	v8, v16, a0
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e32, mf2, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     42.00  1.00    -      -      -     vdiv.vx	v8, v16, a0
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e32, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     42.00  1.00    -      -      -     vdiv.vx	v8, v16, a0
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e32, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     42.00  1.00    -      -      -     vdiv.vx	v8, v16, a0
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e32, m2, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     84.00  2.00    -      -      -     vdiv.vx	v8, v16, a0
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e32, m4, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     168.00 4.00    -      -      -     vdiv.vx	v8, v16, a0
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e32, m8, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     336.00 8.00    -      -      -     vdiv.vx	v8, v16, a0
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e64, mf8, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     408.00 8.00    -      -      -     vdiv.vx	v8, v16, a0
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e64, mf4, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     408.00 8.00    -      -      -     vdiv.vx	v8, v16, a0
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e64, mf2, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     408.00 8.00    -      -      -     vdiv.vx	v8, v16, a0
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e64, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     72.00  1.00    -      -      -     vdiv.vx	v8, v16, a0
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e64, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     72.00  1.00    -      -      -     vdiv.vx	v8, v16, a0
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e64, m2, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     144.00 2.00    -      -      -     vdiv.vx	v8, v16, a0
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e64, m4, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     288.00 4.00    -      -      -     vdiv.vx	v8, v16, a0
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e64, m8, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     576.00 8.00    -      -      -     vdiv.vx	v8, v16, a0
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e8, mf8, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     8.00   232.00  -      -     vfdiv.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e8, mf4, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     8.00   232.00  -      -     vfdiv.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e8, mf2, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     8.00   232.00  -      -     vfdiv.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e8, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     8.00   232.00  -      -     vfdiv.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e8, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     8.00   232.00  -      -     vfdiv.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e8, m2, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     8.00   232.00  -      -     vfdiv.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e8, m4, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     8.00   232.00  -      -     vfdiv.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e8, m8, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     8.00   232.00  -      -     vfdiv.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e16, mf8, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     8.00   232.00  -      -     vfdiv.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e16, mf4, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.00   29.00   -      -     vfdiv.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e16, mf2, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.00   29.00   -      -     vfdiv.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e16, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.00   29.00   -      -     vfdiv.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e16, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.00   29.00   -      -     vfdiv.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e16, m2, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     2.00   58.00   -      -     vfdiv.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e16, m4, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     4.00   116.00  -      -     vfdiv.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e16, m8, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     8.00   232.00  -      -     vfdiv.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e32, mf8, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     8.00   232.00  -      -     vfdiv.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e32, mf4, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     8.00   232.00  -      -     vfdiv.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e32, mf2, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.00   25.00   -      -     vfdiv.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e32, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.00   25.00   -      -     vfdiv.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e32, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.00   25.00   -      -     vfdiv.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e32, m2, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     2.00   50.00   -      -     vfdiv.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e32, m4, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     4.00   100.00  -      -     vfdiv.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e32, m8, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     8.00   200.00  -      -     vfdiv.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e64, mf8, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     8.00   232.00  -      -     vfdiv.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e64, mf4, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     8.00   232.00  -      -     vfdiv.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e64, mf2, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     8.00   232.00  -      -     vfdiv.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e64, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.00   37.00   -      -     vfdiv.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e64, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.00   37.00   -      -     vfdiv.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e64, m2, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     2.00   74.00   -      -     vfdiv.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e64, m4, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     4.00   148.00  -      -     vfdiv.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e64, m8, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     8.00   296.00  -      -     vfdiv.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e8, mf8, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     8.00   232.00  -      -     vfdiv.vf	v8, v16, fa0
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e8, mf4, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     8.00   232.00  -      -     vfdiv.vf	v8, v16, fa0
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e8, mf2, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     8.00   232.00  -      -     vfdiv.vf	v8, v16, fa0
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e8, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     8.00   232.00  -      -     vfdiv.vf	v8, v16, fa0
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e8, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     8.00   232.00  -      -     vfdiv.vf	v8, v16, fa0
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e8, m2, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     8.00   232.00  -      -     vfdiv.vf	v8, v16, fa0
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e8, m4, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     8.00   232.00  -      -     vfdiv.vf	v8, v16, fa0
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e8, m8, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     8.00   232.00  -      -     vfdiv.vf	v8, v16, fa0
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e16, mf8, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     8.00   232.00  -      -     vfdiv.vf	v8, v16, fa0
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e16, mf4, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.00   29.00   -      -     vfdiv.vf	v8, v16, fa0
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e16, mf2, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.00   29.00   -      -     vfdiv.vf	v8, v16, fa0
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e16, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.00   29.00   -      -     vfdiv.vf	v8, v16, fa0
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e16, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.00   29.00   -      -     vfdiv.vf	v8, v16, fa0
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e16, m2, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     2.00   58.00   -      -     vfdiv.vf	v8, v16, fa0
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e16, m4, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     4.00   116.00  -      -     vfdiv.vf	v8, v16, fa0
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e16, m8, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     8.00   232.00  -      -     vfdiv.vf	v8, v16, fa0
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e32, mf8, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     8.00   232.00  -      -     vfdiv.vf	v8, v16, fa0
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e32, mf4, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     8.00   232.00  -      -     vfdiv.vf	v8, v16, fa0
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e32, mf2, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.00   25.00   -      -     vfdiv.vf	v8, v16, fa0
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e32, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.00   25.00   -      -     vfdiv.vf	v8, v16, fa0
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e32, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.00   25.00   -      -     vfdiv.vf	v8, v16, fa0
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e32, m2, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     2.00   50.00   -      -     vfdiv.vf	v8, v16, fa0
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e32, m4, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     4.00   100.00  -      -     vfdiv.vf	v8, v16, fa0
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e32, m8, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     8.00   200.00  -      -     vfdiv.vf	v8, v16, fa0
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e64, mf8, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     8.00   232.00  -      -     vfdiv.vf	v8, v16, fa0
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e64, mf4, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     8.00   232.00  -      -     vfdiv.vf	v8, v16, fa0
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e64, mf2, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     8.00   232.00  -      -     vfdiv.vf	v8, v16, fa0
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e64, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.00   37.00   -      -     vfdiv.vf	v8, v16, fa0
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e64, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.00   37.00   -      -     vfdiv.vf	v8, v16, fa0
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e64, m2, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     2.00   74.00   -      -     vfdiv.vf	v8, v16, fa0
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e64, m4, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     4.00   148.00  -      -     vfdiv.vf	v8, v16, fa0
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e64, m8, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     8.00   296.00  -      -     vfdiv.vf	v8, v16, fa0
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e8, mf8, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     8.00   232.00  -      -     vfsqrt.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e8, mf4, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     8.00   232.00  -      -     vfsqrt.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e8, mf2, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     8.00   232.00  -      -     vfsqrt.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e8, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     8.00   232.00  -      -     vfsqrt.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e8, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     8.00   232.00  -      -     vfsqrt.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e8, m2, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     8.00   232.00  -      -     vfsqrt.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e8, m4, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     8.00   232.00  -      -     vfsqrt.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e8, m8, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     8.00   232.00  -      -     vfsqrt.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e16, mf8, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     8.00   232.00  -      -     vfsqrt.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e16, mf4, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.00   29.00   -      -     vfsqrt.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e16, mf2, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.00   29.00   -      -     vfsqrt.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e16, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.00   29.00   -      -     vfsqrt.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e16, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.00   29.00   -      -     vfsqrt.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e16, m2, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     2.00   58.00   -      -     vfsqrt.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e16, m4, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     4.00   116.00  -      -     vfsqrt.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e16, m8, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     8.00   232.00  -      -     vfsqrt.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e32, mf8, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     8.00   232.00  -      -     vfsqrt.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e32, mf4, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     8.00   232.00  -      -     vfsqrt.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e32, mf2, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.00   25.00   -      -     vfsqrt.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e32, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.00   25.00   -      -     vfsqrt.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e32, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.00   25.00   -      -     vfsqrt.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e32, m2, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     2.00   50.00   -      -     vfsqrt.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e32, m4, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     4.00   100.00  -      -     vfsqrt.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e32, m8, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     8.00   200.00  -      -     vfsqrt.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e64, mf8, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     8.00   232.00  -      -     vfsqrt.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e64, mf4, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     8.00   232.00  -      -     vfsqrt.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e64, mf2, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     8.00   232.00  -      -     vfsqrt.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e64, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.00   37.00   -      -     vfsqrt.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e64, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.00   37.00   -      -     vfsqrt.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e64, m2, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     2.00   74.00   -      -     vfsqrt.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e64, m4, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     4.00   148.00  -      -     vfsqrt.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e64, m8, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     8.00   296.00  -      -     vfsqrt.v	v8, v16
diff --git a/llvm/test/tools/llvm-mca/RISCV/SiFiveP400/mul-cpop.s b/llvm/test/tools/llvm-mca/RISCV/SiFiveP400/mul-cpop.s
new file mode 100644
index 0000000..5f7a1d1
--- /dev/null
+++ b/llvm/test/tools/llvm-mca/RISCV/SiFiveP400/mul-cpop.s
@@ -0,0 +1,60 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=riscv64 -mcpu=sifive-p470 -iterations=1 < %s | FileCheck %s
+
+mul s6, s6, s7
+
+mulw s4, s4, a2
+
+cpop t1, t1
+
+cpopw t2, t2
+
+# CHECK:      Iterations:        1
+# CHECK-NEXT: Instructions:      4
+# CHECK-NEXT: Total Cycles:      8
+# CHECK-NEXT: Total uOps:        4
+
+# CHECK:      Dispatch Width:    3
+# CHECK-NEXT: uOps Per Cycle:    0.50
+# CHECK-NEXT: IPC:               0.50
+# CHECK-NEXT: Block RThroughput: 4.0
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  1      2     1.00                        mul	s6, s6, s7
+# CHECK-NEXT:  1      2     1.00                        mulw	s4, s4, a2
+# CHECK-NEXT:  1      2     1.00                        cpop	t1, t1
+# CHECK-NEXT:  1      2     1.00                        cpopw	t2, t2
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0]   - SiFiveP400Div
+# CHECK-NEXT: [1]   - SiFiveP400FEXQ0
+# CHECK-NEXT: [2]   - SiFiveP400FloatDiv
+# CHECK-NEXT: [3]   - SiFiveP400IEXQ0
+# CHECK-NEXT: [4]   - SiFiveP400IEXQ1
+# CHECK-NEXT: [5]   - SiFiveP400IEXQ2
+# CHECK-NEXT: [6]   - SiFiveP400Load
+# CHECK-NEXT: [7]   - SiFiveP400Store
+# CHECK-NEXT: [8]   - SiFiveP400VDiv
+# CHECK-NEXT: [9]   - SiFiveP400VEXQ0
+# CHECK-NEXT: [10]  - SiFiveP400VFloatDiv
+# CHECK-NEXT: [11]  - SiFiveP400VLD
+# CHECK-NEXT: [12]  - SiFiveP400VST
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]    [10]   [11]   [12]
+# CHECK-NEXT:  -      -      -      -      -     4.00    -      -      -      -      -      -      -
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]    [10]   [11]   [12]   Instructions:
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -      -      -      -      -      -     mul	s6, s6, s7
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -      -      -      -      -      -     mulw	s4, s4, a2
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -      -      -      -      -      -     cpop	t1, t1
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -      -      -      -      -      -     cpopw	t2, t2
diff --git a/llvm/test/tools/llvm-objcopy/COFF/Inputs/i386-debug-rdata.yaml b/llvm/test/tools/llvm-objcopy/COFF/Inputs/i386-debug-rdata.yaml
new file mode 100644
index 0000000..02a6e9d
--- /dev/null
+++ b/llvm/test/tools/llvm-objcopy/COFF/Inputs/i386-debug-rdata.yaml
@@ -0,0 +1,63 @@
+--- !COFF
+OptionalHeader:
+  AddressOfEntryPoint: 4096
+  ImageBase:       268435456
+  SectionAlignment: 4096
+  FileAlignment:   512
+  MajorOperatingSystemVersion: 6
+  MinorOperatingSystemVersion: 0
+  MajorImageVersion: 0
+  MinorImageVersion: 0
+  MajorSubsystemVersion: 6
+  MinorSubsystemVersion: 0
+  Subsystem:       IMAGE_SUBSYSTEM_WINDOWS_CUI
+  DLLCharacteristics: [  ]
+  SizeOfStackReserve: 1048576
+  SizeOfStackCommit: 4096
+  SizeOfHeapReserve: 1048576
+  SizeOfHeapCommit: 4096
+  Debug:
+    RelativeVirtualAddress: 8192
+    Size:            28
+header:
+  Machine:         IMAGE_FILE_MACHINE_I386
+  Characteristics: [ IMAGE_FILE_EXECUTABLE_IMAGE, IMAGE_FILE_32BIT_MACHINE, IMAGE_FILE_DLL ]
+sections:
+  - Name:            .text
+    Characteristics: [ IMAGE_SCN_CNT_CODE, IMAGE_SCN_MEM_EXECUTE, IMAGE_SCN_MEM_READ ]
+    VirtualAddress:  4096
+    VirtualSize:     18
+    SectionData:     5589E58B45108B450C8B450831C05DC20C00
+    SizeOfRawData:   512
+  - Name:            .rdata
+    Characteristics: [ IMAGE_SCN_CNT_INITIALIZED_DATA, IMAGE_SCN_MEM_READ ]
+    VirtualAddress:  8192
+    VirtualSize:     109
+    SectionData:     000000008D6978670000000002000000510000001C2000001C060000525344538B301061671ED0994C4C44205044422E010000002F686F6D652F6D652F446F63756D656E74732F6C6C766D2D6D696E67772F6C6C766D2D70726F6A6563742F6C6C766D2F746573742E70646200
+    SizeOfRawData:   512
+  - Name:            .debug_abbrev
+    Characteristics: [ IMAGE_SCN_CNT_INITIALIZED_DATA, IMAGE_SCN_MEM_DISCARDABLE, IMAGE_SCN_MEM_READ ]
+    VirtualAddress:  12288
+    VirtualSize:     78
+    SectionData:     011101250E1305030E10171B0E110112060000022E011101120640186E0E030E3A0B3B0B2719360B49133F1900000305000218030E3A0B3B0B49130000042400030E3E0B0B0B0000050F00000000
+    SizeOfRawData:   512
+  - Name:            .debug_info
+    Characteristics: [ IMAGE_SCN_CNT_INITIALIZED_DATA, IMAGE_SCN_MEM_DISCARDABLE, IMAGE_SCN_MEM_READ ]
+    VirtualAddress:  16384
+    VirtualSize:     116
+    SectionData:     700000000400000000000401000000001D006E000000000000007500000000100010120000000200100010120000000155A5000000BC0000000101B16B00000003029108D70000000101720000000302910CD500000001016B00000003029110D30000000101720000000004CF00000005040500
+    SizeOfRawData:   512
+  - Name:            .debug_line
+    Characteristics: [ IMAGE_SCN_CNT_INITIALIZED_DATA, IMAGE_SCN_MEM_DISCARDABLE, IMAGE_SCN_MEM_READ ]
+    VirtualAddress:  20480
+    VirtualSize:     60
+    SectionData:     3800000004001E000000010101FB0E0D00010101010000000100000100746573742E6300000000000005020010001001053D0ABA060B2E0204000101
+    SizeOfRawData:   512
+  - Name:            .debug_str
+    Characteristics: [ IMAGE_SCN_CNT_INITIALIZED_DATA, IMAGE_SCN_MEM_DISCARDABLE, IMAGE_SCN_MEM_READ ]
+    VirtualAddress:  24576
+    VirtualSize:     217
+    SectionData:     636C616E672076657273696F6E2032302E302E30676974202868747470733A2F2F6769746875622E636F6D2F62796C6177732F6C6C766D2D70726F6A6563742E67697420393963353263306236613662396366303765383365656265393364323831333635656165383732332900746573742E63002F686F6D652F6D652F446F63756D656E74732F6C6C766D2D6D696E67772F6C6C766D2D70726F6A6563742F6C6C766D005F5F446C6C4D61696E43525453746172747570403132005F446C6C4D61696E4352545374617274757000696E7400630062006100
+    SizeOfRawData:   512
+symbols:         []
+...
diff --git a/llvm/test/tools/llvm-objcopy/COFF/only-keep-debug-rdata.test b/llvm/test/tools/llvm-objcopy/COFF/only-keep-debug-rdata.test
new file mode 100644
index 0000000..affd4b6
--- /dev/null
+++ b/llvm/test/tools/llvm-objcopy/COFF/only-keep-debug-rdata.test
@@ -0,0 +1,45 @@
+RUN: yaml2obj %p/Inputs/i386-debug-rdata.yaml -o %t.in.exe
+
+RUN: llvm-objcopy --only-keep-debug %t.in.exe %t.out.exe
+RUN: llvm-readobj --sections %t.out.exe | FileCheck %s
+
+Check that all non-debug/rodata (which contains the debug directory in this case)
+sections with IMAGE_SCN_CNT_CODE or IMAGE_SCN_CNT_INITIALIZED_DATA are truncated,
+and no others.
+
+CHECK:       Section {
+CHECK-NEXT:   Number: 1
+CHECK-NEXT:   Name: .text (2E 74 65 78 74 00 00 00)
+CHECK-NEXT:   VirtualSize: 0x12
+CHECK-NEXT:   VirtualAddress: 0x1000
+CHECK-NEXT:   RawDataSize: 0
+CHECK:       Section {
+CHECK-NEXT:   Number: 2
+CHECK-NEXT:   Name: .rdata (2E 72 64 61 74 61 00 00)
+CHECK-NEXT:   VirtualSize: 0x6D
+CHECK-NEXT:   VirtualAddress: 0x2000
+CHECK-NEXT:   RawDataSize: 512
+CHECK:       Section {
+CHECK-NEXT:   Number: 3
+CHECK-NEXT:   Name: .debug_abbrev (2F 34 00 00 00 00 00 00)
+CHECK-NEXT:   VirtualSize: 0x4E
+CHECK-NEXT:   VirtualAddress: 0x3000
+CHECK-NEXT:   RawDataSize: 512
+CHECK:       Section {
+CHECK-NEXT:   Number: 4
+CHECK-NEXT:   Name: .debug_info (2F 32 39 00 00 00 00 00)
+CHECK-NEXT:   VirtualSize: 0x74
+CHECK-NEXT:   VirtualAddress: 0x4000
+CHECK-NEXT:   RawDataSize: 512
+CHECK:       Section {
+CHECK-NEXT:   Number: 5
+CHECK-NEXT:   Name: .debug_line (2F 34 31 00 00 00 00 00)
+CHECK-NEXT:   VirtualSize: 0x3C
+CHECK-NEXT:   VirtualAddress: 0x5000
+CHECK-NEXT:   RawDataSize: 512
+CHECK:       Section {
+CHECK-NEXT:   Number: 6
+CHECK-NEXT:   Name: .debug_str (2F 31 38 00 00 00 00 00)
+CHECK-NEXT:   VirtualSize: 0xD9
+CHECK-NEXT:   VirtualAddress: 0x6000
+CHECK-NEXT:   RawDataSize: 512
diff --git a/llvm/test/tools/llvm-objcopy/MachO/globalize-symbol.test b/llvm/test/tools/llvm-objcopy/MachO/globalize-symbol.test
new file mode 100644
index 0000000..ea47d44
--- /dev/null
+++ b/llvm/test/tools/llvm-objcopy/MachO/globalize-symbol.test
@@ -0,0 +1,134 @@
+# RUN: yaml2obj %s -o %t
+# RUN: llvm-objcopy --wildcard --globalize-symbol="*" %t %t.copy
+# RUN: llvm-readobj --symbols %t.copy | FileCheck %s
+
+# RUN: echo "*" > %t-star.txt
+# RUN: llvm-objcopy --wildcard --globalize-symbols="%t-star.txt" %t %t.copy
+# RUN: llvm-readobj --symbols %t.copy | FileCheck %s
+
+# CHECK:      Symbols [
+# CHECK-NEXT:   Symbol {
+# CHECK-NEXT:     Name: _PrivateSymbol
+# CHECK-NEXT:     Extern
+# CHECK-NEXT:     Type: Section (0xE)
+# CHECK-NEXT:     Section: __text (0x1)
+# CHECK-NEXT:     RefType: UndefinedNonLazy (0x0)
+# CHECK-NEXT:     Flags [ (0x0)
+# CHECK-NEXT:     ]
+# CHECK-NEXT:     Value: 0x1
+# CHECK-NEXT:   }
+# CHECK-NEXT:   Symbol {
+# CHECK-NEXT:     Name: _PrivateExternalSymbol
+# CHECK-NEXT:     PrivateExtern
+# CHECK-NEXT:     Extern
+# CHECK-NEXT:     Type: Section (0xE)
+# CHECK-NEXT:     Section: __text (0x1)
+# CHECK-NEXT:     RefType: UndefinedNonLazy (0x0)
+# CHECK-NEXT:     Flags [ (0x0)
+# CHECK-NEXT:     ]
+# CHECK-NEXT:     Value: 0x2
+# CHECK-NEXT:   }
+# CHECK-NEXT:   Symbol {
+# CHECK-NEXT:     Name: _CommonSymbol
+# CHECK-NEXT:     Extern
+# CHECK-NEXT:     Type: Section (0xE)
+# CHECK-NEXT:     Section: __text (0x1)
+# CHECK-NEXT:     RefType: UndefinedNonLazy (0x0)
+# CHECK-NEXT:     Flags [ (0x0)
+# CHECK-NEXT:     ]
+# CHECK-NEXT:     Value: 0x3
+# CHECK-NEXT:   }
+# CHECK-NEXT:   Symbol {
+# CHECK-NEXT:     Name: _UndefinedExternalSymbol
+# CHECK-NEXT:     Extern
+# CHECK-NEXT:     Type: Undef (0x0)
+# CHECK-NEXT:     Section:  (0x0)
+# CHECK-NEXT:     RefType: UndefinedNonLazy (0x0)
+# CHECK-NEXT:     Flags [ (0x0)
+# CHECK-NEXT:     ]
+# CHECK-NEXT:     Value: 0x0
+# CHECK-NEXT:   }
+# CHECK-NEXT: ]
+
+--- !mach-o
+FileHeader:
+  magic:           0xFEEDFACF
+  cputype:         0x100000C
+  cpusubtype:      0x0
+  filetype:        0x2
+  ncmds:           3
+  sizeofcmds:      328
+  flags:           0x200085
+  reserved:        0x0
+LoadCommands:
+  - cmd:             LC_SEGMENT_64
+    cmdsize:         152
+    segname:         __TEXT
+    vmaddr:          4294967296
+    vmsize:          4096
+    fileoff:         0
+    filesize:        4096
+    maxprot:         5
+    initprot:        5
+    nsects:          1
+    flags:           0
+    Sections:
+      - sectname:        __text
+        segname:         __TEXT
+        addr:            0x100000FF8
+        size:            8
+        offset:          0xFF8
+        align:           2
+        reloff:          0x0
+        nreloc:          0
+        flags:           0x80000400
+        reserved1:       0x0
+        reserved2:       0x0
+        reserved3:       0x0
+        content:         00008052C0035FD6
+  - cmd:             LC_SEGMENT_64
+    cmdsize:         72
+    segname:         __LINKEDIT
+    vmaddr:          4294971392
+    vmsize:          4096
+    fileoff:         4096
+    filesize:        67
+    maxprot:         1
+    initprot:        1
+    nsects:          0
+    flags:           0
+  - cmd:             LC_SYMTAB
+    cmdsize:         24
+    symoff:          4096
+    nsyms:           4
+    stroff:          4164
+    strsize:         79
+LinkEditData:
+  NameList:
+    - n_strx:          2
+      n_type:          0x0E
+      n_sect:          1
+      n_desc:          0
+      n_value:         1
+    - n_strx:          17
+      n_type:          0x1E
+      n_sect:          1
+      n_desc:          0
+      n_value:         2
+    - n_strx:          40
+      n_type:          0x0F
+      n_sect:          1
+      n_desc:          0
+      n_value:         3
+    - n_strx:          54
+      n_type:          0x01
+      n_sect:          0
+      n_desc:          0
+      n_value:         0
+  StringTable:
+    - ' '
+    - _PrivateSymbol
+    - _PrivateExternalSymbol
+    - _CommonSymbol
+    - _UndefinedExternalSymbol
+...
diff --git a/llvm/test/tools/llvm-objcopy/MachO/keep-global-symbol.test b/llvm/test/tools/llvm-objcopy/MachO/keep-global-symbol.test
new file mode 100644
index 0000000..009a732
--- /dev/null
+++ b/llvm/test/tools/llvm-objcopy/MachO/keep-global-symbol.test
@@ -0,0 +1,147 @@
+# RUN: yaml2obj %s -o %t
+# RUN: llvm-objcopy --keep-global-symbol _CommonSymbol %t %t.copy
+# RUN: llvm-readobj --symbols %t.copy | FileCheck %s
+
+# RUN: echo _CommonSymbol > %t-sym-list.txt
+# RUN: llvm-objcopy --wildcard --keep-global-symbols="%t-sym-list.txt" %t %t.copy
+# RUN: llvm-readobj --symbols %t.copy | FileCheck %s
+
+# CHECK:      Symbols [
+# CHECK-NEXT:   Symbol {
+# CHECK-NEXT:     Name: _PrivateSymbol
+# CHECK-NEXT:     Type: Section (0xE)
+# CHECK-NEXT:     Section: __text (0x1)
+# CHECK-NEXT:     RefType: UndefinedNonLazy (0x0)
+# CHECK-NEXT:     Flags [ (0x0)
+# CHECK-NEXT:     ]
+# CHECK-NEXT:     Value: 0x1
+# CHECK-NEXT:   }
+# CHECK-NEXT:   Symbol {
+# CHECK-NEXT:     Name: _PrivateExternalSymbol
+# CHECK-NEXT:     PrivateExtern
+# CHECK-NEXT:     Type: Section (0xE)
+# CHECK-NEXT:     Section: __text (0x1)
+# CHECK-NEXT:     RefType: UndefinedNonLazy (0x0)
+# CHECK-NEXT:     Flags [ (0x0)
+# CHECK-NEXT:     ]
+# CHECK-NEXT:     Value: 0x2
+# CHECK-NEXT:   }
+# CHECK-NEXT:   Symbol {
+# CHECK-NEXT:     Name: _CommonSymbol2
+# CHECK-NEXT:     Type: Section (0xE)
+# CHECK-NEXT:     Section: __text (0x1)
+# CHECK-NEXT:     RefType: UndefinedNonLazy (0x0)
+# CHECK-NEXT:     Flags [ (0x0)
+# CHECK-NEXT:     ]
+# CHECK-NEXT:     Value: 0x4
+# CHECK-NEXT:   }
+# CHECK-NEXT:   Symbol {
+# CHECK-NEXT:     Name: _CommonSymbol
+# CHECK-NEXT:     Extern
+# CHECK-NEXT:     Type: Section (0xE)
+# CHECK-NEXT:     Section: __text (0x1)
+# CHECK-NEXT:     RefType: UndefinedNonLazy (0x0)
+# CHECK-NEXT:     Flags [ (0x0)
+# CHECK-NEXT:     ]
+# CHECK-NEXT:     Value: 0x3
+# CHECK-NEXT:   }
+# CHECK-NEXT:   Symbol {
+# CHECK-NEXT:     Name: _UndefinedExternalSymbol
+# CHECK-NEXT:     Extern
+# CHECK-NEXT:     Type: Undef (0x0)
+# CHECK-NEXT:     Section:  (0x0)
+# CHECK-NEXT:     RefType: UndefinedNonLazy (0x0)
+# CHECK-NEXT:     Flags [ (0x0)
+# CHECK-NEXT:     ]
+# CHECK-NEXT:     Value: 0x0
+# CHECK-NEXT:   }
+# CHECK-NEXT: ]
+
+--- !mach-o
+FileHeader:
+  magic:           0xFEEDFACF
+  cputype:         0x100000C
+  cpusubtype:      0x0
+  filetype:        0x2
+  ncmds:           3
+  sizeofcmds:      328
+  flags:           0x200085
+  reserved:        0x0
+LoadCommands:
+  - cmd:             LC_SEGMENT_64
+    cmdsize:         152
+    segname:         __TEXT
+    vmaddr:          4294967296
+    vmsize:          4096
+    fileoff:         0
+    filesize:        4096
+    maxprot:         5
+    initprot:        5
+    nsects:          1
+    flags:           0
+    Sections:
+      - sectname:        __text
+        segname:         __TEXT
+        addr:            0x100000FF8
+        size:            8
+        offset:          0xFF8
+        align:           2
+        reloff:          0x0
+        nreloc:          0
+        flags:           0x80000400
+        reserved1:       0x0
+        reserved2:       0x0
+        reserved3:       0x0
+        content:         00008052C0035FD6
+  - cmd:             LC_SEGMENT_64
+    cmdsize:         72
+    segname:         __LINKEDIT
+    vmaddr:          4294971392
+    vmsize:          4096
+    fileoff:         4096
+    filesize:        94
+    maxprot:         1
+    initprot:        1
+    nsects:          0
+    flags:           0
+  - cmd:             LC_SYMTAB
+    cmdsize:         24
+    symoff:          4096
+    nsyms:           5
+    stroff:          4176
+    strsize:         94
+LinkEditData:
+  NameList:
+    - n_strx:          2
+      n_type:          0x0E
+      n_sect:          1
+      n_desc:          0
+      n_value:         1
+    - n_strx:          17
+      n_type:          0x1E
+      n_sect:          1
+      n_desc:          0
+      n_value:         2
+    - n_strx:          40
+      n_type:          0x0F
+      n_sect:          1
+      n_desc:          0
+      n_value:         3
+    - n_strx:          54
+      n_type:          0x0F
+      n_sect:          1
+      n_desc:          0
+      n_value:         4
+    - n_strx:          69
+      n_type:          0x01
+      n_sect:          0
+      n_desc:          0
+      n_value:         0
+  StringTable:
+    - ' '
+    - _PrivateSymbol
+    - _PrivateExternalSymbol
+    - _CommonSymbol
+    - _CommonSymbol2
+    - _UndefinedExternalSymbol
+...
diff --git a/llvm/test/tools/llvm-objcopy/MachO/localize-symbol.test b/llvm/test/tools/llvm-objcopy/MachO/localize-symbol.test
new file mode 100644
index 0000000..131d3bf
--- /dev/null
+++ b/llvm/test/tools/llvm-objcopy/MachO/localize-symbol.test
@@ -0,0 +1,131 @@
+# RUN: yaml2obj %s -o %t
+# RUN: llvm-objcopy --wildcard --localize-symbol="*" %t %t.copy
+# RUN: llvm-readobj --symbols %t.copy | FileCheck %s
+
+# RUN: echo "*" > %t-star.txt
+# RUN: llvm-objcopy --wildcard --localize-symbols="%t-star.txt" %t %t.copy
+# RUN: llvm-readobj --symbols %t.copy | FileCheck %s
+
+# CHECK:      Symbols [
+# CHECK-NEXT:   Symbol {
+# CHECK-NEXT:     Name: _PrivateSymbol
+# CHECK-NEXT:     Type: Section (0xE)
+# CHECK-NEXT:     Section: __text (0x1)
+# CHECK-NEXT:     RefType: UndefinedNonLazy (0x0)
+# CHECK-NEXT:     Flags [ (0x0)
+# CHECK-NEXT:     ]
+# CHECK-NEXT:     Value: 0x1
+# CHECK-NEXT:   }
+# CHECK-NEXT:   Symbol {
+# CHECK-NEXT:     Name: _PrivateExternalSymbol
+# CHECK-NEXT:     PrivateExtern
+# CHECK-NEXT:     Type: Section (0xE)
+# CHECK-NEXT:     Section: __text (0x1)
+# CHECK-NEXT:     RefType: UndefinedNonLazy (0x0)
+# CHECK-NEXT:     Flags [ (0x0)
+# CHECK-NEXT:     ]
+# CHECK-NEXT:     Value: 0x2
+# CHECK-NEXT:   }
+# CHECK-NEXT:   Symbol {
+# CHECK-NEXT:     Name: _CommonSymbol
+# CHECK-NEXT:     Type: Section (0xE)
+# CHECK-NEXT:     Section: __text (0x1)
+# CHECK-NEXT:     RefType: UndefinedNonLazy (0x0)
+# CHECK-NEXT:     Flags [ (0x0)
+# CHECK-NEXT:     ]
+# CHECK-NEXT:     Value: 0x3
+# CHECK-NEXT:   }
+# CHECK-NEXT:   Symbol {
+# CHECK-NEXT:     Name: _UndefinedExternalSymbol
+# CHECK-NEXT:     Extern
+# CHECK-NEXT:     Type: Undef (0x0)
+# CHECK-NEXT:     Section:  (0x0)
+# CHECK-NEXT:     RefType: UndefinedNonLazy (0x0)
+# CHECK-NEXT:     Flags [ (0x0)
+# CHECK-NEXT:     ]
+# CHECK-NEXT:     Value: 0x0
+# CHECK-NEXT:   }
+# CHECK-NEXT: ]
+
+--- !mach-o
+FileHeader:
+  magic:           0xFEEDFACF
+  cputype:         0x100000C
+  cpusubtype:      0x0
+  filetype:        0x2
+  ncmds:           3
+  sizeofcmds:      328
+  flags:           0x200085
+  reserved:        0x0
+LoadCommands:
+  - cmd:             LC_SEGMENT_64
+    cmdsize:         152
+    segname:         __TEXT
+    vmaddr:          4294967296
+    vmsize:          4096
+    fileoff:         0
+    filesize:        4096
+    maxprot:         5
+    initprot:        5
+    nsects:          1
+    flags:           0
+    Sections:
+      - sectname:        __text
+        segname:         __TEXT
+        addr:            0x100000FF8
+        size:            8
+        offset:          0xFF8
+        align:           2
+        reloff:          0x0
+        nreloc:          0
+        flags:           0x80000400
+        reserved1:       0x0
+        reserved2:       0x0
+        reserved3:       0x0
+        content:         00008052C0035FD6
+  - cmd:             LC_SEGMENT_64
+    cmdsize:         72
+    segname:         __LINKEDIT
+    vmaddr:          4294971392
+    vmsize:          4096
+    fileoff:         4096
+    filesize:        67
+    maxprot:         1
+    initprot:        1
+    nsects:          0
+    flags:           0
+  - cmd:             LC_SYMTAB
+    cmdsize:         24
+    symoff:          4096
+    nsyms:           4
+    stroff:          4164
+    strsize:         79
+LinkEditData:
+  NameList:
+    - n_strx:          2
+      n_type:          0x0E
+      n_sect:          1
+      n_desc:          0
+      n_value:         1
+    - n_strx:          17
+      n_type:          0x1E
+      n_sect:          1
+      n_desc:          0
+      n_value:         2
+    - n_strx:          40
+      n_type:          0x0F
+      n_sect:          1
+      n_desc:          0
+      n_value:         3
+    - n_strx:          54
+      n_type:          0x01
+      n_sect:          0
+      n_desc:          0
+      n_value:         0
+  StringTable:
+    - ' '
+    - _PrivateSymbol
+    - _PrivateExternalSymbol
+    - _CommonSymbol
+    - _UndefinedExternalSymbol
+...
diff --git a/llvm/test/tools/llvm-objcopy/MachO/skip-symbol.test b/llvm/test/tools/llvm-objcopy/MachO/skip-symbol.test
new file mode 100644
index 0000000..0991fb3
--- /dev/null
+++ b/llvm/test/tools/llvm-objcopy/MachO/skip-symbol.test
@@ -0,0 +1,148 @@
+# RUN: yaml2obj %s -o %t
+# RUN: llvm-objcopy --wildcard --localize-symbol="*" --skip-symbol _CommonSymbol %t %t.copy
+# RUN: llvm-readobj --symbols %t.copy | FileCheck %s
+
+# RUN: echo "*" > %t-star.txt
+# RUN: echo _CommonSymbol > %t-sym-list.txt
+# RUN: llvm-objcopy --wildcard --localize-symbols="%t-star.txt" --skip-symbols="%t-sym-list.txt" %t %t.copy
+# RUN: llvm-readobj --symbols %t.copy | FileCheck %s
+
+# CHECK:      Symbols [
+# CHECK-NEXT:   Symbol {
+# CHECK-NEXT:     Name: _PrivateSymbol
+# CHECK-NEXT:     Type: Section (0xE)
+# CHECK-NEXT:     Section: __text (0x1)
+# CHECK-NEXT:     RefType: UndefinedNonLazy (0x0)
+# CHECK-NEXT:     Flags [ (0x0)
+# CHECK-NEXT:     ]
+# CHECK-NEXT:     Value: 0x1
+# CHECK-NEXT:   }
+# CHECK-NEXT:   Symbol {
+# CHECK-NEXT:     Name: _PrivateExternalSymbol
+# CHECK-NEXT:     PrivateExtern
+# CHECK-NEXT:     Type: Section (0xE)
+# CHECK-NEXT:     Section: __text (0x1)
+# CHECK-NEXT:     RefType: UndefinedNonLazy (0x0)
+# CHECK-NEXT:     Flags [ (0x0)
+# CHECK-NEXT:     ]
+# CHECK-NEXT:     Value: 0x2
+# CHECK-NEXT:   }
+# CHECK-NEXT:   Symbol {
+# CHECK-NEXT:     Name: _CommonSymbol2
+# CHECK-NEXT:     Type: Section (0xE)
+# CHECK-NEXT:     Section: __text (0x1)
+# CHECK-NEXT:     RefType: UndefinedNonLazy (0x0)
+# CHECK-NEXT:     Flags [ (0x0)
+# CHECK-NEXT:     ]
+# CHECK-NEXT:     Value: 0x4
+# CHECK-NEXT:   }
+# CHECK-NEXT:   Symbol {
+# CHECK-NEXT:     Name: _CommonSymbol
+# CHECK-NEXT:     Extern
+# CHECK-NEXT:     Type: Section (0xE)
+# CHECK-NEXT:     Section: __text (0x1)
+# CHECK-NEXT:     RefType: UndefinedNonLazy (0x0)
+# CHECK-NEXT:     Flags [ (0x0)
+# CHECK-NEXT:     ]
+# CHECK-NEXT:     Value: 0x3
+# CHECK-NEXT:   }
+# CHECK-NEXT:   Symbol {
+# CHECK-NEXT:     Name: _UndefinedExternalSymbol
+# CHECK-NEXT:     Extern
+# CHECK-NEXT:     Type: Undef (0x0)
+# CHECK-NEXT:     Section:  (0x0)
+# CHECK-NEXT:     RefType: UndefinedNonLazy (0x0)
+# CHECK-NEXT:     Flags [ (0x0)
+# CHECK-NEXT:     ]
+# CHECK-NEXT:     Value: 0x0
+# CHECK-NEXT:   }
+# CHECK-NEXT: ]
+
+--- !mach-o
+FileHeader:
+  magic:           0xFEEDFACF
+  cputype:         0x100000C
+  cpusubtype:      0x0
+  filetype:        0x2
+  ncmds:           3
+  sizeofcmds:      328
+  flags:           0x200085
+  reserved:        0x0
+LoadCommands:
+  - cmd:             LC_SEGMENT_64
+    cmdsize:         152
+    segname:         __TEXT
+    vmaddr:          4294967296
+    vmsize:          4096
+    fileoff:         0
+    filesize:        4096
+    maxprot:         5
+    initprot:        5
+    nsects:          1
+    flags:           0
+    Sections:
+      - sectname:        __text
+        segname:         __TEXT
+        addr:            0x100000FF8
+        size:            8
+        offset:          0xFF8
+        align:           2
+        reloff:          0x0
+        nreloc:          0
+        flags:           0x80000400
+        reserved1:       0x0
+        reserved2:       0x0
+        reserved3:       0x0
+        content:         00008052C0035FD6
+  - cmd:             LC_SEGMENT_64
+    cmdsize:         72
+    segname:         __LINKEDIT
+    vmaddr:          4294971392
+    vmsize:          4096
+    fileoff:         4096
+    filesize:        94
+    maxprot:         1
+    initprot:        1
+    nsects:          0
+    flags:           0
+  - cmd:             LC_SYMTAB
+    cmdsize:         24
+    symoff:          4096
+    nsyms:           5
+    stroff:          4176
+    strsize:         94
+LinkEditData:
+  NameList:
+    - n_strx:          2
+      n_type:          0x0E
+      n_sect:          1
+      n_desc:          0
+      n_value:         1
+    - n_strx:          17
+      n_type:          0x1E
+      n_sect:          1
+      n_desc:          0
+      n_value:         2
+    - n_strx:          40
+      n_type:          0x0F
+      n_sect:          1
+      n_desc:          0
+      n_value:         3
+    - n_strx:          54
+      n_type:          0x0F
+      n_sect:          1
+      n_desc:          0
+      n_value:         4
+    - n_strx:          69
+      n_type:          0x01
+      n_sect:          0
+      n_desc:          0
+      n_value:         0
+  StringTable:
+    - ' '
+    - _PrivateSymbol
+    - _PrivateExternalSymbol
+    - _CommonSymbol
+    - _CommonSymbol2
+    - _UndefinedExternalSymbol
+...
diff --git a/llvm/test/tools/llvm-objcopy/MachO/strip-with-encryption-info.test b/llvm/test/tools/llvm-objcopy/MachO/strip-with-encryption-info.test
new file mode 100644
index 0000000..19b06b1
--- /dev/null
+++ b/llvm/test/tools/llvm-objcopy/MachO/strip-with-encryption-info.test
@@ -0,0 +1,217 @@
+# RUN: rm -rf %t && mkdir %t
+# RUN: yaml2obj %s -o %t/original
+# RUN: llvm-strip --strip-all %t/original -o %t/stripped
+# RUN: llvm-readobj --macho-segment %t/stripped | FileCheck %s
+
+# CHECK-LABEL: Name: __PAGEZERO
+# CHECK:       fileoff: 16384
+
+# CHECK-LABEL: Name: __TEXT
+# CHECK:       fileoff: 16384
+
+# The YAML below is the following code
+# int main(int argc, char **argv) { return 0; }
+# Compiled on macOS against the macOS SDK and passing `-Wl,-encryptable`
+# Contents are removed, since they are not important for the test. We need a
+# small text segment (smaller than a page).
+--- !mach-o
+FileHeader:
+  magic:           0xFEEDFACF
+  cputype:         0x100000C
+  cpusubtype:      0x0
+  filetype:        0x2
+  ncmds:           15
+  sizeofcmds:      696
+  flags:           0x200085
+  reserved:        0x0
+LoadCommands:
+  - cmd:             LC_SEGMENT_64
+    cmdsize:         72
+    segname:         __PAGEZERO
+    vmaddr:          0
+    vmsize:          4294967296
+    fileoff:         0
+    filesize:        0
+    maxprot:         0
+    initprot:        0
+    nsects:          0
+    flags:           0
+  - cmd:             LC_SEGMENT_64
+    cmdsize:         232
+    segname:         __TEXT
+    vmaddr:          4294967296
+    vmsize:          32768
+    fileoff:         0
+    filesize:        32768
+    maxprot:         5
+    initprot:        5
+    nsects:          2
+    flags:           0
+    Sections:
+      - sectname:        __text
+        segname:         __TEXT
+        addr:            0x100004000
+        size:            32
+        offset:          0x4000
+        align:           2
+        reloff:          0x0
+        nreloc:          0
+        flags:           0x80000400
+        reserved1:       0x0
+        reserved2:       0x0
+        reserved3:       0x0
+      - sectname:        __unwind_info
+        segname:         __TEXT
+        addr:            0x100004020
+        size:            4152
+        offset:          0x4020
+        align:           2
+        reloff:          0x0
+        nreloc:          0
+        flags:           0x0
+        reserved1:       0x0
+        reserved2:       0x0
+        reserved3:       0x0
+  - cmd:             LC_SEGMENT_64
+    cmdsize:         72
+    segname:         __LINKEDIT
+    vmaddr:          4295000064
+    vmsize:          592
+    fileoff:         32768
+    filesize:        592
+    maxprot:         1
+    initprot:        1
+    nsects:          0
+    flags:           0
+  - cmd:             LC_DYLD_CHAINED_FIXUPS
+    cmdsize:         16
+    dataoff:         32768
+    datasize:        48
+  - cmd:             LC_DYLD_EXPORTS_TRIE
+    cmdsize:         16
+    dataoff:         32816
+    datasize:        48
+  - cmd:             LC_SYMTAB
+    cmdsize:         24
+    symoff:          32872
+    nsyms:           2
+    stroff:          32904
+    strsize:         32
+  - cmd:             LC_DYSYMTAB
+    cmdsize:         80
+    ilocalsym:       0
+    nlocalsym:       0
+    iextdefsym:      0
+    nextdefsym:      2
+    iundefsym:       2
+    nundefsym:       0
+    tocoff:          0
+    ntoc:            0
+    modtaboff:       0
+    nmodtab:         0
+    extrefsymoff:    0
+    nextrefsyms:     0
+    indirectsymoff:  0
+    nindirectsyms:   0
+    extreloff:       0
+    nextrel:         0
+    locreloff:       0
+    nlocrel:         0
+  - cmd:             LC_ENCRYPTION_INFO_64
+    cmdsize:         24
+    cryptoff:        16384
+    cryptsize:       16384
+    cryptid:         0
+    pad:             0
+  - cmd:             LC_LOAD_DYLINKER
+    cmdsize:         32
+    name:            12
+    Content:         '/usr/lib/dyld'
+    ZeroPadBytes:    7
+  - cmd:             LC_UUID
+    cmdsize:         24
+    uuid:            4C4C4447-5555-3144-A18A-01E9EB7E7D92
+  - cmd:             LC_BUILD_VERSION
+    cmdsize:         32
+    platform:        1
+    minos:           983040
+    sdk:             983552
+    ntools:          1
+    Tools:
+      - tool:            4
+        version:         1310720
+  - cmd:             LC_MAIN
+    cmdsize:         24
+    entryoff:        16384
+    stacksize:       0
+  - cmd:             LC_FUNCTION_STARTS
+    cmdsize:         16
+    dataoff:         32864
+    datasize:        8
+  - cmd:             LC_DATA_IN_CODE
+    cmdsize:         16
+    dataoff:         32872
+    datasize:        0
+  - cmd:             LC_CODE_SIGNATURE
+    cmdsize:         16
+    dataoff:         32944
+    datasize:        416
+LinkEditData:
+  ExportTrie:
+    TerminalSize:    0
+    NodeOffset:      0
+    Name:            ''
+    Flags:           0x0
+    Address:         0x0
+    Other:           0x0
+    ImportName:      ''
+    Children:
+      - TerminalSize:    0
+        NodeOffset:      5
+        Name:            _
+        Flags:           0x0
+        Address:         0x0
+        Other:           0x0
+        ImportName:      ''
+        Children:
+          - TerminalSize:    4
+            NodeOffset:      33
+            Name:            main
+            Flags:           0x0
+            Address:         0x4000
+            Other:           0x0
+            ImportName:      ''
+          - TerminalSize:    2
+            NodeOffset:      39
+            Name:            _mh_execute_header
+            Flags:           0x0
+            Address:         0x0
+            Other:           0x0
+            ImportName:      ''
+  NameList:
+    - n_strx:          2
+      n_type:          0xF
+      n_sect:          1
+      n_desc:          0
+      n_value:         4294983680
+    - n_strx:          8
+      n_type:          0xF
+      n_sect:          1
+      n_desc:          16
+      n_value:         4294967296
+  StringTable:
+    - ' '
+    - _main
+    - __mh_execute_header
+    - ''
+    - ''
+    - ''
+    - ''
+  FunctionStarts:  [ 0x4000 ]
+  ChainedFixups:   [ 0x0, 0x0, 0x0, 0x0, 0x20, 0x0, 0x0, 0x0, 0x30, 0x0, 
+                     0x0, 0x0, 0x30, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 
+                     0x1, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 
+                     0x0, 0x0, 0x3, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 
+                     0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0 ]
+...
+
diff --git a/llvm/test/tools/llvm-profgen/context-depth.test b/llvm/test/tools/llvm-profgen/context-depth.test
new file mode 100644
index 0000000..4eaa5fa
--- /dev/null
+++ b/llvm/test/tools/llvm-profgen/context-depth.test
@@ -0,0 +1,125 @@
+; Test --csprof-max-context-depth and --csprof-max-unsymbolized-context-depth
+
+; RUN: llvm-profgen --format=text --perfscript=%S/Inputs/recursion-compression-pseudoprobe.perfscript --binary=%S/Inputs/recursion-compression-pseudoprobe.perfbin --output=%t --compress-recursion=0 --profile-summary-hot-count=0 --csprof-max-context-depth=0 --csspgo-preinliner=0 --gen-cs-nested-profile=0
+; RUN: FileCheck %s --input-file %t -check-prefix=CHECK-MAX-CTX-DEPTH
+; RUN: llvm-profgen --format=text --perfscript=%S/Inputs/recursion-compression-pseudoprobe.perfscript --binary=%S/Inputs/recursion-compression-pseudoprobe.perfbin --output=%t --compress-recursion=0 --profile-summary-hot-count=0 --csprof-max-unsymbolized-context-depth=2 --csspgo-preinliner=0 --gen-cs-nested-profile=0 --skip-symbolization
+; RUN: FileCheck %s --input-file %t -check-prefix=CHECK-MAX-UNSYM-CTX-DEPTH
+; RUN: llvm-profgen --format=text --perfscript=%S/Inputs/recursion-compression-pseudoprobe.perfscript --binary=%S/Inputs/recursion-compression-pseudoprobe.perfbin --output=%t --compress-recursion=0 --profile-summary-hot-count=0 --csprof-max-unsymbolized-context-depth=2 --csspgo-preinliner=0 --gen-cs-nested-profile=0
+; RUN: FileCheck %s --input-file %t -check-prefix=CHECK-MAX-UNSYM-CTX-DEPTH-PROF
+; RUN: llvm-profgen --format=text --perfscript=%S/Inputs/recursion-compression-pseudoprobe.perfscript --binary=%S/Inputs/recursion-compression-pseudoprobe.perfbin --output=%t --compress-recursion=0 --profile-summary-hot-count=0 --csprof-max-unsymbolized-context-depth=2 --csprof-max-context-depth=0  --csspgo-preinliner=0 --gen-cs-nested-profile=0
+; RUN: FileCheck %s --input-file %t -check-prefix=CHECK-MAX-CTX-DEPTH
+
+
+; CHECK-MAX-CTX-DEPTH: [fb]:19:6
+; CHECK-MAX-CTX-DEPTH:  1: 6
+; CHECK-MAX-CTX-DEPTH:  2: 3
+; CHECK-MAX-CTX-DEPTH:  3: 3
+; CHECK-MAX-CTX-DEPTH:  4: 0
+; CHECK-MAX-CTX-DEPTH:  5: 4 fb:4
+; CHECK-MAX-CTX-DEPTH:  6: 3 fa:3
+; CHECK-MAX-CTX-DEPTH:  !CFGChecksum: 563022570642068
+; CHECK-MAX-CTX-DEPTH: [fa]:14:4
+; CHECK-MAX-CTX-DEPTH:  1: 4
+; CHECK-MAX-CTX-DEPTH:  3: 4
+; CHECK-MAX-CTX-DEPTH:  4: 2
+; CHECK-MAX-CTX-DEPTH:  5: 1
+; CHECK-MAX-CTX-DEPTH:  6: 0
+; CHECK-MAX-CTX-DEPTH:  7: 2 fb:2
+; CHECK-MAX-CTX-DEPTH:  8: 1 fa:1
+; CHECK-MAX-CTX-DEPTH:  !CFGChecksum: 563070469352221
+
+
+; CHECK-MAX-UNSYM-CTX-DEPTH: [0x7ab @ 0x7ab]
+; CHECK-MAX-UNSYM-CTX-DEPTH:   3
+; CHECK-MAX-UNSYM-CTX-DEPTH:   7a0-7a7:1
+; CHECK-MAX-UNSYM-CTX-DEPTH:   7a0-7ab:3
+; CHECK-MAX-UNSYM-CTX-DEPTH:   7b2-7b5:1
+; CHECK-MAX-UNSYM-CTX-DEPTH:   3
+; CHECK-MAX-UNSYM-CTX-DEPTH:   7a7->7b2:1
+; CHECK-MAX-UNSYM-CTX-DEPTH:   7ab->7a0:4
+; CHECK-MAX-UNSYM-CTX-DEPTH:   7b5->7c0:1
+; CHECK-MAX-UNSYM-CTX-DEPTH: [0x7ab @ 0x7b5]
+; CHECK-MAX-UNSYM-CTX-DEPTH:   1
+; CHECK-MAX-UNSYM-CTX-DEPTH:   7c0-7d4:1
+; CHECK-MAX-UNSYM-CTX-DEPTH:   1
+; CHECK-MAX-UNSYM-CTX-DEPTH:   7d4->7c0:1
+; CHECK-MAX-UNSYM-CTX-DEPTH: [0x7b5 @ 0x7d4]
+; CHECK-MAX-UNSYM-CTX-DEPTH:   2
+; CHECK-MAX-UNSYM-CTX-DEPTH:   7c0-7cd:1
+; CHECK-MAX-UNSYM-CTX-DEPTH:   7db-7e0:1
+; CHECK-MAX-UNSYM-CTX-DEPTH:   2
+; CHECK-MAX-UNSYM-CTX-DEPTH:   7cd->7db:1
+; CHECK-MAX-UNSYM-CTX-DEPTH:   7e0->7a0:1
+; CHECK-MAX-UNSYM-CTX-DEPTH: [0x7b5 @ 0x7e0]
+; CHECK-MAX-UNSYM-CTX-DEPTH:   2
+; CHECK-MAX-UNSYM-CTX-DEPTH:   7a0-7a7:1
+; CHECK-MAX-UNSYM-CTX-DEPTH:   7b2-7b5:1
+; CHECK-MAX-UNSYM-CTX-DEPTH:   2
+; CHECK-MAX-UNSYM-CTX-DEPTH:   7a7->7b2:1
+; CHECK-MAX-UNSYM-CTX-DEPTH:   7b5->7c0:1
+; CHECK-MAX-UNSYM-CTX-DEPTH: [0x7d4 @ 0x7e0]
+; CHECK-MAX-UNSYM-CTX-DEPTH:   2
+; CHECK-MAX-UNSYM-CTX-DEPTH:   7a0-7a7:1
+; CHECK-MAX-UNSYM-CTX-DEPTH:   7b2-7b5:1
+; CHECK-MAX-UNSYM-CTX-DEPTH:   2
+; CHECK-MAX-UNSYM-CTX-DEPTH:   7a7->7b2:1
+; CHECK-MAX-UNSYM-CTX-DEPTH:   7b5->7c0:1
+; CHECK-MAX-UNSYM-CTX-DEPTH: [0x7e0 @ 0x7b5]
+; CHECK-MAX-UNSYM-CTX-DEPTH:   2
+; CHECK-MAX-UNSYM-CTX-DEPTH:   7c0-7cd:2
+; CHECK-MAX-UNSYM-CTX-DEPTH:   7db-7e0:1
+; CHECK-MAX-UNSYM-CTX-DEPTH:   2
+; CHECK-MAX-UNSYM-CTX-DEPTH:   7cd->7db:2
+; CHECK-MAX-UNSYM-CTX-DEPTH:   7e0->7a0:1
+
+; CHECK-MAX-UNSYM-CTX-DEPTH-PROF: [fb:5 @ fb:5 @ fb]:13:4
+; CHECK-MAX-UNSYM-CTX-DEPTH-PROF:  1: 4
+; CHECK-MAX-UNSYM-CTX-DEPTH-PROF:  2: 3
+; CHECK-MAX-UNSYM-CTX-DEPTH-PROF:  3: 1
+; CHECK-MAX-UNSYM-CTX-DEPTH-PROF:  4: 0
+; CHECK-MAX-UNSYM-CTX-DEPTH-PROF:  5: 4 fb:4
+; CHECK-MAX-UNSYM-CTX-DEPTH-PROF:  6: 1 fa:1
+; CHECK-MAX-UNSYM-CTX-DEPTH-PROF:  !CFGChecksum: 563022570642068
+; CHECK-MAX-UNSYM-CTX-DEPTH-PROF: [fa:7 @ fb:6 @ fa]:6:2
+; CHECK-MAX-UNSYM-CTX-DEPTH-PROF:  1: 2
+; CHECK-MAX-UNSYM-CTX-DEPTH-PROF:  3: 2
+; CHECK-MAX-UNSYM-CTX-DEPTH-PROF:  4: 1
+; CHECK-MAX-UNSYM-CTX-DEPTH-PROF:  5: 0
+; CHECK-MAX-UNSYM-CTX-DEPTH-PROF:  6: 0
+; CHECK-MAX-UNSYM-CTX-DEPTH-PROF:  7: 1 fb:1
+; CHECK-MAX-UNSYM-CTX-DEPTH-PROF:  8: 0
+; CHECK-MAX-UNSYM-CTX-DEPTH-PROF:  !CFGChecksum: 563070469352221
+; CHECK-MAX-UNSYM-CTX-DEPTH-PROF: [fb:5 @ fb:6 @ fa]:4:1
+; CHECK-MAX-UNSYM-CTX-DEPTH-PROF:  1: 1
+; CHECK-MAX-UNSYM-CTX-DEPTH-PROF:  3: 1
+; CHECK-MAX-UNSYM-CTX-DEPTH-PROF:  4: 0
+; CHECK-MAX-UNSYM-CTX-DEPTH-PROF:  5: 1
+; CHECK-MAX-UNSYM-CTX-DEPTH-PROF:  6: 0
+; CHECK-MAX-UNSYM-CTX-DEPTH-PROF:  7: 0
+; CHECK-MAX-UNSYM-CTX-DEPTH-PROF:  8: 1 fa:1
+; CHECK-MAX-UNSYM-CTX-DEPTH-PROF:  !CFGChecksum: 563070469352221
+; CHECK-MAX-UNSYM-CTX-DEPTH-PROF: [fb:6 @ fa:8 @ fa]:4:1
+; CHECK-MAX-UNSYM-CTX-DEPTH-PROF:  1: 1
+; CHECK-MAX-UNSYM-CTX-DEPTH-PROF:  3: 1
+; CHECK-MAX-UNSYM-CTX-DEPTH-PROF:  4: 1
+; CHECK-MAX-UNSYM-CTX-DEPTH-PROF:  5: 0
+; CHECK-MAX-UNSYM-CTX-DEPTH-PROF:  6: 0
+; CHECK-MAX-UNSYM-CTX-DEPTH-PROF:  7: 1 fb:1
+; CHECK-MAX-UNSYM-CTX-DEPTH-PROF:  8: 0
+; CHECK-MAX-UNSYM-CTX-DEPTH-PROF:  !CFGChecksum: 563070469352221
+; CHECK-MAX-UNSYM-CTX-DEPTH-PROF: [fa:8 @ fa:7 @ fb]:3:1
+; CHECK-MAX-UNSYM-CTX-DEPTH-PROF:  1: 1
+; CHECK-MAX-UNSYM-CTX-DEPTH-PROF:  2: 0
+; CHECK-MAX-UNSYM-CTX-DEPTH-PROF:  3: 1
+; CHECK-MAX-UNSYM-CTX-DEPTH-PROF:  4: 0
+; CHECK-MAX-UNSYM-CTX-DEPTH-PROF:  5: 0
+; CHECK-MAX-UNSYM-CTX-DEPTH-PROF:  6: 1 fa:1
+; CHECK-MAX-UNSYM-CTX-DEPTH-PROF:  !CFGChecksum: 563022570642068
+; CHECK-MAX-UNSYM-CTX-DEPTH-PROF: [fb:6 @ fa:7 @ fb]:3:1
+; CHECK-MAX-UNSYM-CTX-DEPTH-PROF:  1: 1
+; CHECK-MAX-UNSYM-CTX-DEPTH-PROF:  2: 0
+; CHECK-MAX-UNSYM-CTX-DEPTH-PROF:  3: 1
+; CHECK-MAX-UNSYM-CTX-DEPTH-PROF:  4: 0
+; CHECK-MAX-UNSYM-CTX-DEPTH-PROF:  5: 0
+; CHECK-MAX-UNSYM-CTX-DEPTH-PROF:  6: 1 fa:1
+; CHECK-MAX-UNSYM-CTX-DEPTH-PROF:  !CFGChecksum: 563022570642068
diff --git a/llvm/test/tools/llvm-profgen/recursion-compression-pseudoprobe.test b/llvm/test/tools/llvm-profgen/recursion-compression-pseudoprobe.test
index c673028..b8e3e24 100644
--- a/llvm/test/tools/llvm-profgen/recursion-compression-pseudoprobe.test
+++ b/llvm/test/tools/llvm-profgen/recursion-compression-pseudoprobe.test
@@ -9,9 +9,6 @@
 ; RUN: FileCheck %s --input-file %t --check-prefix=CHECK-UNWINDER
 ; RUN: llvm-profgen --format=text --perfscript=%S/Inputs/recursion-compression-pseudoprobe-nommap.perfscript --binary=%S/Inputs/recursion-compression-pseudoprobe.perfbin --output=%t --profile-summary-hot-count=0 --csspgo-preinliner=0 --gen-cs-nested-profile=0
 ; RUN: FileCheck %s --input-file %t
-; RUN: llvm-profgen --format=text --perfscript=%S/Inputs/recursion-compression-pseudoprobe.perfscript --binary=%S/Inputs/recursion-compression-pseudoprobe.perfbin --output=%t --compress-recursion=0 --profile-summary-hot-count=0 --csprof-max-context-depth=0 --csspgo-preinliner=0 --gen-cs-nested-profile=0
-; RUN: FileCheck %s --input-file %t -check-prefix=CHECK-MAX-CTX-DEPTH
-
 
 ; CHECK-UNCOMPRESS: [main:2 @ foo:5 @ fa:8 @ fa:7 @ fb:5 @ fb:5 @ fb:5 @ fb:5 @ fb:5 @ fb:5 @ fb:5 @ fb:5 @ fb:6 @ fa]:4:1
 ; CHECK-UNCOMPRESS:  1: 1
@@ -68,23 +65,6 @@
 ; CHECK-UNCOMPRESS: [main:2 @ foo:5 @ fa:8 @ fa:7 @ fb:5 @ fb:5 @ fb:5 @ fb:5 @ fb]:1:0
 ; CHECK-UNCOMPRESS:  5: 1 fb:1
 ; CHECK-UNCOMPRESS:  !CFGChecksum: 563022570642068
-; CHECK-MAX-CTX-DEPTH: [fb]:19:6
-; CHECK-MAX-CTX-DEPTH:  1: 6
-; CHECK-MAX-CTX-DEPTH:  2: 3
-; CHECK-MAX-CTX-DEPTH:  3: 3
-; CHECK-MAX-CTX-DEPTH:  4: 0
-; CHECK-MAX-CTX-DEPTH:  5: 4 fb:4
-; CHECK-MAX-CTX-DEPTH:  6: 3 fa:3
-; CHECK-MAX-CTX-DEPTH:  !CFGChecksum: 563022570642068
-; CHECK-MAX-CTX-DEPTH: [fa]:14:4
-; CHECK-MAX-CTX-DEPTH:  1: 4
-; CHECK-MAX-CTX-DEPTH:  3: 4
-; CHECK-MAX-CTX-DEPTH:  4: 2
-; CHECK-MAX-CTX-DEPTH:  5: 1
-; CHECK-MAX-CTX-DEPTH:  6: 0
-; CHECK-MAX-CTX-DEPTH:  7: 2 fb:2
-; CHECK-MAX-CTX-DEPTH:  8: 1 fa:1
-; CHECK-MAX-CTX-DEPTH:  !CFGChecksum: 563070469352221
 
 
 ; CHECK: [main:2 @ foo:5 @ fa:8 @ fa:7 @ fb:5 @ fb]:13:4
diff --git a/llvm/test/tools/llvm-xray/X86/account-empty-stack-error.yaml b/llvm/test/tools/llvm-xray/X86/account-exit-mismatch-empty-stack-error.yaml
index d02d070a..d02d070a 100644
--- a/llvm/test/tools/llvm-xray/X86/account-empty-stack-error.yaml
+++ b/llvm/test/tools/llvm-xray/X86/account-exit-mismatch-empty-stack-error.yaml
diff --git a/llvm/test/tools/llvm-xray/X86/account-exit-mismatch-non-empty-stack-error.yaml b/llvm/test/tools/llvm-xray/X86/account-exit-mismatch-non-empty-stack-error.yaml
new file mode 100644
index 0000000..7233110
--- /dev/null
+++ b/llvm/test/tools/llvm-xray/X86/account-exit-mismatch-non-empty-stack-error.yaml
@@ -0,0 +1,31 @@
+#RUN: not llvm-xray account %s -o - -m %S/Inputs/simple-instrmap.yaml -d 2>&1 | FileCheck %s
+#RUN: llvm-xray account %s -k -o - -m %S/Inputs/simple-instrmap.yaml -d 2>&1 | FileCheck %s --check-prefix=KEEPGOING
+
+---
+header:
+  version: 1
+  type: 0
+  constant-tsc: true
+  nonstop-tsc: true
+  cycle-frequency: 0
+records:
+# An exit record doesn't match an entry record on a non empty stack with sibling call deduction. 
+# This can happen for example when an instrumented function does a 'fork()', 
+# where the child process will not see
+# the entry record but see the exit record. This is completely valid data,
+# which should be handled with grace (i.e. we treat it as an error, but since
+# the llvm-xray account tool has an option to keep going, gives the user a
+# chance to retry).
+  - { type: 0, func-id: 1, cpu: 1, thread: 1, kind: function-enter, tsc: 10000 }
+  - { type: 0, func-id: 4, cpu: 1, thread: 1, kind: function-exit,  tsc: 10001 }
+  - { type: 0, func-id: 1, cpu: 1, thread: 1, kind: function-exit,  tsc: 10002 }
+...
+
+#CHECK:      Error processing record: {{.*}}
+#CHECK-NEXT: Thread ID: 1
+#CHECK-NEXT:   #1 @(1)
+#CHECK-NEXT: llvm-xray: Failed accounting function calls in file '{{.*}}'.
+
+#KEEPGOING:      Error processing record: {{.*}}
+#KEEPGOING-NEXT: Thread ID: 1
+#KEEPGOING-NEXT:   #1 @(1)
diff --git a/llvm/test/tools/llvm-xray/X86/account-keep-going.yaml b/llvm/test/tools/llvm-xray/X86/account-keep-going.yaml
index 76011ee..fb1a8f4 100644
--- a/llvm/test/tools/llvm-xray/X86/account-keep-going.yaml
+++ b/llvm/test/tools/llvm-xray/X86/account-keep-going.yaml
@@ -7,8 +7,8 @@ header:
   nonstop-tsc: true
   cycle-frequency: 0
 records:
-# We want to test the case for when we see spurious exits, but keep going
-# anyway ignoring the records in the process.
+# We want to test the case for when we see spurious exits without sibling call deduction, 
+# but keep going anyway ignoring the records in the process.
   - { type: 0, func-id: 1, cpu: 1, thread: 111, kind: function-enter, tsc: 10000 }
   - { type: 0, func-id: 2, cpu: 1, thread: 111, kind: function-enter, tsc: 10001 }
   - { type: 0, func-id: 3, cpu: 1, thread: 111, kind: function-enter, tsc: 10002 }
diff --git a/llvm/tools/llvm-cov/CodeCoverage.cpp b/llvm/tools/llvm-cov/CodeCoverage.cpp
index 5db5c2e..921f283 100644
--- a/llvm/tools/llvm-cov/CodeCoverage.cpp
+++ b/llvm/tools/llvm-cov/CodeCoverage.cpp
@@ -1023,6 +1023,12 @@ int CodeCoverageTool::doShow(int argc, const char **argv,
   cl::alias ShowOutputDirectoryA("o", cl::desc("Alias for --output-dir"),
                                  cl::aliasopt(ShowOutputDirectory));
 
+  cl::opt<bool> BinaryCounters(
+      "binary-counters", cl::Optional,
+      cl::desc("Show binary counters (1/0) in lines and branches instead of "
+               "integer execution counts"),
+      cl::cat(ViewCategory));
+
   cl::opt<uint32_t> TabSize(
       "tab-size", cl::init(2),
       cl::desc(
@@ -1100,6 +1106,7 @@ int CodeCoverageTool::doShow(int argc, const char **argv,
   ViewOpts.ShowFunctionInstantiations = ShowInstantiations;
   ViewOpts.ShowDirectoryCoverage = ShowDirectoryCoverage;
   ViewOpts.ShowOutputDirectory = ShowOutputDirectory;
+  ViewOpts.BinaryCounters = BinaryCounters;
   ViewOpts.TabSize = TabSize;
   ViewOpts.ProjectTitle = ProjectTitle;
 
diff --git a/llvm/tools/llvm-cov/CoverageSummaryInfo.cpp b/llvm/tools/llvm-cov/CoverageSummaryInfo.cpp
index 58e7918..5c002a6 100644
--- a/llvm/tools/llvm-cov/CoverageSummaryInfo.cpp
+++ b/llvm/tools/llvm-cov/CoverageSummaryInfo.cpp
@@ -16,8 +16,9 @@
 using namespace llvm;
 using namespace coverage;
 
-static void sumBranches(size_t &NumBranches, size_t &CoveredBranches,
-                        const ArrayRef<CountedRegion> &Branches) {
+static auto sumBranches(const ArrayRef<CountedRegion> &Branches) {
+  size_t NumBranches = 0;
+  size_t CoveredBranches = 0;
   for (const auto &BR : Branches) {
     if (!BR.TrueFolded) {
       // "True" Condition Branches.
@@ -32,20 +33,22 @@ static void sumBranches(size_t &NumBranches, size_t &CoveredBranches,
         ++CoveredBranches;
     }
   }
+  return BranchCoverageInfo(CoveredBranches, NumBranches);
 }
 
-static void sumBranchExpansions(size_t &NumBranches, size_t &CoveredBranches,
-                                const CoverageMapping &CM,
-                                ArrayRef<ExpansionRecord> Expansions) {
+static BranchCoverageInfo
+sumBranchExpansions(const CoverageMapping &CM,
+                    ArrayRef<ExpansionRecord> Expansions) {
+  BranchCoverageInfo BranchCoverage;
   for (const auto &Expansion : Expansions) {
     auto CE = CM.getCoverageForExpansion(Expansion);
-    sumBranches(NumBranches, CoveredBranches, CE.getBranches());
-    sumBranchExpansions(NumBranches, CoveredBranches, CM, CE.getExpansions());
+    BranchCoverage += sumBranches(CE.getBranches());
+    BranchCoverage += sumBranchExpansions(CM, CE.getExpansions());
   }
+  return BranchCoverage;
 }
 
-static std::pair<size_t, size_t>
-sumMCDCPairs(const ArrayRef<MCDCRecord> &Records) {
+auto sumMCDCPairs(const ArrayRef<MCDCRecord> &Records) {
   size_t NumPairs = 0, CoveredPairs = 0;
   for (const auto &Record : Records) {
     const auto NumConditions = Record.getNumConditions();
@@ -56,15 +59,14 @@ sumMCDCPairs(const ArrayRef<MCDCRecord> &Records) {
         ++CoveredPairs;
     }
   }
-  return {NumPairs, CoveredPairs};
+  return MCDCCoverageInfo(CoveredPairs, NumPairs);
 }
 
-FunctionCoverageSummary
-FunctionCoverageSummary::get(const CoverageMapping &CM,
-                             const coverage::FunctionRecord &Function) {
+static std::pair<RegionCoverageInfo, LineCoverageInfo>
+sumRegions(ArrayRef<CountedRegion> CodeRegions, const CoverageData &CD) {
   // Compute the region coverage.
   size_t NumCodeRegions = 0, CoveredRegions = 0;
-  for (auto &CR : Function.CountedRegions) {
+  for (auto &CR : CodeRegions) {
     if (CR.Kind != CounterMappingRegion::CodeRegion)
       continue;
     ++NumCodeRegions;
@@ -74,7 +76,6 @@ FunctionCoverageSummary::get(const CoverageMapping &CM,
 
   // Compute the line coverage
   size_t NumLines = 0, CoveredLines = 0;
-  CoverageData CD = CM.getCoverageForFunction(Function);
   for (const auto &LCS : getLineCoverageStats(CD)) {
     if (!LCS.isMapped())
       continue;
@@ -83,20 +84,31 @@ FunctionCoverageSummary::get(const CoverageMapping &CM,
       ++CoveredLines;
   }
 
+  return {RegionCoverageInfo(CoveredRegions, NumCodeRegions),
+          LineCoverageInfo(CoveredLines, NumLines)};
+}
+
+CoverageDataSummary::CoverageDataSummary(const CoverageData &CD,
+                                         ArrayRef<CountedRegion> CodeRegions) {
+  std::tie(RegionCoverage, LineCoverage) = sumRegions(CodeRegions, CD);
+  BranchCoverage = sumBranches(CD.getBranches());
+  MCDCCoverage = sumMCDCPairs(CD.getMCDCRecords());
+}
+
+FunctionCoverageSummary
+FunctionCoverageSummary::get(const CoverageMapping &CM,
+                             const coverage::FunctionRecord &Function) {
+  CoverageData CD = CM.getCoverageForFunction(Function);
+
+  auto Summary =
+      FunctionCoverageSummary(Function.Name, Function.ExecutionCount);
+
+  Summary += CoverageDataSummary(CD, Function.CountedRegions);
+
   // Compute the branch coverage, including branches from expansions.
-  size_t NumBranches = 0, CoveredBranches = 0;
-  sumBranches(NumBranches, CoveredBranches, CD.getBranches());
-  sumBranchExpansions(NumBranches, CoveredBranches, CM, CD.getExpansions());
+  Summary.BranchCoverage += sumBranchExpansions(CM, CD.getExpansions());
 
-  size_t NumPairs = 0, CoveredPairs = 0;
-  std::tie(NumPairs, CoveredPairs) = sumMCDCPairs(CD.getMCDCRecords());
-
-  return FunctionCoverageSummary(
-      Function.Name, Function.ExecutionCount,
-      RegionCoverageInfo(CoveredRegions, NumCodeRegions),
-      LineCoverageInfo(CoveredLines, NumLines),
-      BranchCoverageInfo(CoveredBranches, NumBranches),
-      MCDCCoverageInfo(CoveredPairs, NumPairs));
+  return Summary;
 }
 
 FunctionCoverageSummary
@@ -111,8 +123,7 @@ FunctionCoverageSummary::get(const InstantiationGroup &Group,
        << Group.getColumn();
   }
 
-  FunctionCoverageSummary Summary(Name);
-  Summary.ExecutionCount = Group.getTotalExecutionCount();
+  FunctionCoverageSummary Summary(Name, Group.getTotalExecutionCount());
   Summary.RegionCoverage = Summaries[0].RegionCoverage;
   Summary.LineCoverage = Summaries[0].LineCoverage;
   Summary.BranchCoverage = Summaries[0].BranchCoverage;
diff --git a/llvm/tools/llvm-cov/CoverageSummaryInfo.h b/llvm/tools/llvm-cov/CoverageSummaryInfo.h
index 64c2c84..d921067 100644
--- a/llvm/tools/llvm-cov/CoverageSummaryInfo.h
+++ b/llvm/tools/llvm-cov/CoverageSummaryInfo.h
@@ -223,26 +223,32 @@ public:
   }
 };
 
-/// A summary of function's code coverage.
-struct FunctionCoverageSummary {
-  std::string Name;
-  uint64_t ExecutionCount;
+struct CoverageDataSummary {
   RegionCoverageInfo RegionCoverage;
   LineCoverageInfo LineCoverage;
   BranchCoverageInfo BranchCoverage;
   MCDCCoverageInfo MCDCCoverage;
 
-  FunctionCoverageSummary(const std::string &Name)
-      : Name(Name), ExecutionCount(0) {}
+  CoverageDataSummary() = default;
+  CoverageDataSummary(const coverage::CoverageData &CD,
+                      ArrayRef<coverage::CountedRegion> CodeRegions);
 
-  FunctionCoverageSummary(const std::string &Name, uint64_t ExecutionCount,
-                          const RegionCoverageInfo &RegionCoverage,
-                          const LineCoverageInfo &LineCoverage,
-                          const BranchCoverageInfo &BranchCoverage,
-                          const MCDCCoverageInfo &MCDCCoverage)
-      : Name(Name), ExecutionCount(ExecutionCount),
-        RegionCoverage(RegionCoverage), LineCoverage(LineCoverage),
-        BranchCoverage(BranchCoverage), MCDCCoverage(MCDCCoverage) {}
+  auto &operator+=(const CoverageDataSummary &RHS) {
+    RegionCoverage += RHS.RegionCoverage;
+    LineCoverage += RHS.LineCoverage;
+    BranchCoverage += RHS.BranchCoverage;
+    MCDCCoverage += RHS.MCDCCoverage;
+    return *this;
+  }
+};
+
+/// A summary of function's code coverage.
+struct FunctionCoverageSummary : CoverageDataSummary {
+  std::string Name;
+  uint64_t ExecutionCount;
+
+  FunctionCoverageSummary(const std::string &Name, uint64_t ExecutionCount = 0)
+      : Name(Name), ExecutionCount(ExecutionCount) {}
 
   /// Compute the code coverage summary for the given function coverage
   /// mapping record.
@@ -257,12 +263,8 @@ struct FunctionCoverageSummary {
 };
 
 /// A summary of file's code coverage.
-struct FileCoverageSummary {
+struct FileCoverageSummary : CoverageDataSummary {
   StringRef Name;
-  RegionCoverageInfo RegionCoverage;
-  LineCoverageInfo LineCoverage;
-  BranchCoverageInfo BranchCoverage;
-  MCDCCoverageInfo MCDCCoverage;
   FunctionCoverageInfo FunctionCoverage;
   FunctionCoverageInfo InstantiationCoverage;
 
@@ -270,11 +272,8 @@ struct FileCoverageSummary {
   FileCoverageSummary(StringRef Name) : Name(Name) {}
 
   FileCoverageSummary &operator+=(const FileCoverageSummary &RHS) {
-    RegionCoverage += RHS.RegionCoverage;
-    LineCoverage += RHS.LineCoverage;
+    *static_cast<CoverageDataSummary *>(this) += RHS;
     FunctionCoverage += RHS.FunctionCoverage;
-    BranchCoverage += RHS.BranchCoverage;
-    MCDCCoverage += RHS.MCDCCoverage;
     InstantiationCoverage += RHS.InstantiationCoverage;
     return *this;
   }
diff --git a/llvm/tools/llvm-cov/CoverageViewOptions.h b/llvm/tools/llvm-cov/CoverageViewOptions.h
index 015c92a1..81e69c3 100644
--- a/llvm/tools/llvm-cov/CoverageViewOptions.h
+++ b/llvm/tools/llvm-cov/CoverageViewOptions.h
@@ -45,6 +45,7 @@ struct CoverageViewOptions {
   bool SkipExpansions;
   bool SkipFunctions;
   bool SkipBranches;
+  bool BinaryCounters;
   OutputFormat Format;
   BranchOutputType ShowBranches;
   std::string ShowOutputDirectory;
diff --git a/llvm/tools/llvm-cov/SourceCoverageView.h b/llvm/tools/llvm-cov/SourceCoverageView.h
index 2b1570d..cff32b7 100644
--- a/llvm/tools/llvm-cov/SourceCoverageView.h
+++ b/llvm/tools/llvm-cov/SourceCoverageView.h
@@ -180,6 +180,8 @@ class SourceCoverageView {
   /// on display.
   std::vector<InstantiationView> InstantiationSubViews;
 
+  bool BinaryCounters;
+
   /// Get the first uncovered line number for the source file.
   unsigned getFirstUncoveredLineNo();
 
@@ -266,6 +268,14 @@ protected:
   /// digits.
   static std::string formatCount(uint64_t N);
 
+  uint64_t BinaryCount(uint64_t N) const {
+    return (N && BinaryCounters ? 1 : N);
+  }
+
+  std::string formatBinaryCount(uint64_t N) const {
+    return formatCount(BinaryCount(N));
+  }
+
   /// Check if region marker output is expected for a line.
   bool shouldRenderRegionMarkers(const LineCoverageStats &LCS) const;
 
@@ -276,7 +286,9 @@ protected:
                      const CoverageViewOptions &Options,
                      CoverageData &&CoverageInfo)
       : SourceName(SourceName), File(File), Options(Options),
-        CoverageInfo(std::move(CoverageInfo)) {}
+        CoverageInfo(std::move(CoverageInfo)),
+        BinaryCounters(Options.BinaryCounters ||
+                       CoverageInfo.getSingleByteCoverage()) {}
 
 public:
   static std::unique_ptr<SourceCoverageView>
diff --git a/llvm/tools/llvm-cov/SourceCoverageViewHTML.cpp b/llvm/tools/llvm-cov/SourceCoverageViewHTML.cpp
index e2be576..c94d385 100644
--- a/llvm/tools/llvm-cov/SourceCoverageViewHTML.cpp
+++ b/llvm/tools/llvm-cov/SourceCoverageViewHTML.cpp
@@ -1019,19 +1019,22 @@ void SourceCoverageViewHTML::renderLine(raw_ostream &OS, LineRef L,
     // Just consider the segments which start *and* end on this line.
     for (unsigned I = 0, E = Segments.size() - 1; I < E; ++I) {
       const auto *CurSeg = Segments[I];
+      auto CurSegCount = BinaryCount(CurSeg->Count);
+      auto LCSCount = BinaryCount(LCS.getExecutionCount());
       if (!CurSeg->IsRegionEntry)
         continue;
-      if (CurSeg->Count == LCS.getExecutionCount())
+      if (CurSegCount == LCSCount)
         continue;
 
       Snippets[I + 1] =
-          tag("div", Snippets[I + 1] + tag("span", formatCount(CurSeg->Count),
-                                           "tooltip-content"),
+          tag("div",
+              Snippets[I + 1] +
+                  tag("span", formatCount(CurSegCount), "tooltip-content"),
               "tooltip");
 
       if (getOptions().Debug)
         errs() << "Marker at " << CurSeg->Line << ":" << CurSeg->Col << " = "
-               << formatCount(CurSeg->Count) << "\n";
+               << formatCount(CurSegCount) << "\n";
     }
   }
 
@@ -1051,7 +1054,7 @@ void SourceCoverageViewHTML::renderLineCoverageColumn(
     raw_ostream &OS, const LineCoverageStats &Line) {
   std::string Count;
   if (Line.isMapped())
-    Count = tag("pre", formatCount(Line.getExecutionCount()));
+    Count = tag("pre", formatBinaryCount(Line.getExecutionCount()));
   std::string CoverageClass =
       (Line.getExecutionCount() > 0)
           ? "covered-line"
@@ -1106,7 +1109,7 @@ void SourceCoverageViewHTML::renderBranchView(raw_ostream &OS, BranchView &BRV,
 
     OS << tag("span", Label, (Count ? "None" : "red branch")) << ": ";
     if (getOptions().ShowBranchCounts)
-      OS << tag("span", formatCount(Count),
+      OS << tag("span", formatBinaryCount(Count),
                 (Count ? "covered-line" : "uncovered-line"));
     else
       OS << format("%0.2f", (Total != 0 ? 100.0 * Count / Total : 0.0)) << "%";
diff --git a/llvm/tools/llvm-cov/SourceCoverageViewText.cpp b/llvm/tools/llvm-cov/SourceCoverageViewText.cpp
index 63f8248..765f8bb 100644
--- a/llvm/tools/llvm-cov/SourceCoverageViewText.cpp
+++ b/llvm/tools/llvm-cov/SourceCoverageViewText.cpp
@@ -216,7 +216,7 @@ void SourceCoverageViewText::renderLineCoverageColumn(
     OS.indent(LineCoverageColumnWidth) << '|';
     return;
   }
-  std::string C = formatCount(Line.getExecutionCount());
+  std::string C = formatBinaryCount(Line.getExecutionCount());
   OS.indent(LineCoverageColumnWidth - C.size());
   colored_ostream(OS, raw_ostream::MAGENTA,
                   Line.hasMultipleRegions() && getOptions().Colors)
@@ -263,7 +263,7 @@ void SourceCoverageViewText::renderRegionMarkers(raw_ostream &OS,
 
     if (getOptions().Debug)
       errs() << "Marker at " << S->Line << ":" << S->Col << " = "
-            << formatCount(S->Count) << "\n";
+             << formatBinaryCount(S->Count) << "\n";
   }
   OS << '\n';
 }
@@ -307,7 +307,7 @@ void SourceCoverageViewText::renderBranchView(raw_ostream &OS, BranchView &BRV,
         << Label;
 
     if (getOptions().ShowBranchCounts)
-      OS << ": " << formatCount(Count);
+      OS << ": " << formatBinaryCount(Count);
     else
       OS << ": " << format("%0.2f", (Total != 0 ? 100.0 * Count / Total : 0.0))
          << "%";
diff --git a/llvm/tools/llvm-exegesis/lib/BenchmarkRunner.cpp b/llvm/tools/llvm-exegesis/lib/BenchmarkRunner.cpp
index 9116b5c..a7771b9 100644
--- a/llvm/tools/llvm-exegesis/lib/BenchmarkRunner.cpp
+++ b/llvm/tools/llvm-exegesis/lib/BenchmarkRunner.cpp
@@ -422,8 +422,9 @@ private:
            "Expected getcpu call to succeed.");
     assert(static_cast<int>(CurrentCPU) == CPUToUse &&
            "Expected current CPU to equal the CPU requested by the user");
-#endif // defined(__x86_64__) && defined(SYS_getcpu)
+#else
     exit(ChildProcessExitCodeE::SetCPUAffinityFailed);
+#endif // defined(__x86_64__) && defined(SYS_getcpu)
   }
 
   Error createSubProcessAndRunBenchmark(
diff --git a/llvm/tools/llvm-exegesis/lib/RISCV/Target.cpp b/llvm/tools/llvm-exegesis/lib/RISCV/Target.cpp
index 41d3615..5636782 100644
--- a/llvm/tools/llvm-exegesis/lib/RISCV/Target.cpp
+++ b/llvm/tools/llvm-exegesis/lib/RISCV/Target.cpp
@@ -24,6 +24,8 @@
 namespace llvm {
 namespace exegesis {
 
+#include "RISCVGenExegesis.inc"
+
 namespace {
 
 // Stores constant value to a general-purpose (integer) register.
@@ -132,8 +134,7 @@ public:
 };
 
 ExegesisRISCVTarget::ExegesisRISCVTarget()
-    : ExegesisTarget(ArrayRef<CpuAndPfmCounters>{},
-                     RISCV_MC::isOpcodeAvailable) {}
+    : ExegesisTarget(RISCVCpuPfmCounters, RISCV_MC::isOpcodeAvailable) {}
 
 bool ExegesisRISCVTarget::matchesArch(Triple::ArchType Arch) const {
   return Arch == Triple::riscv32 || Arch == Triple::riscv64;
diff --git a/llvm/tools/llvm-gsymutil/Opts.td b/llvm/tools/llvm-gsymutil/Opts.td
index d61b418..89cd3ce 100644
--- a/llvm/tools/llvm-gsymutil/Opts.td
+++ b/llvm/tools/llvm-gsymutil/Opts.td
@@ -17,7 +17,10 @@ defm convert :
   Eq<"convert",
      "Convert the specified file to the GSYM format.\nSupported files include ELF and mach-o files that will have their debug info (DWARF) and symbol table converted">;
 def merged_functions :
-  FF<"merged-functions", "Encode merged function information for functions in debug info that have matching address ranges.\nWithout this option one function per unique address range will be emitted.">;
+  FF<"merged-functions", "When used with --convert, encodes merged function information for functions in debug info that have matching address ranges.\n"
+                         "Without this option one function per unique address range will be emitted.\n"
+                         "When used with --address/--addresses-from-stdin, all merged functions for a particular address will be displayed.\n"
+                         "Without this option only one function will be displayed.">;
 def dwarf_callsites : FF<"dwarf-callsites", "Load call site info from DWARF, if available">;
 defm callsites_yaml_file :
   Eq<"callsites-yaml-file", "Load call site info from YAML file. Useful for testing.">, Flags<[HelpHidden]>;
diff --git a/llvm/tools/llvm-gsymutil/llvm-gsymutil.cpp b/llvm/tools/llvm-gsymutil/llvm-gsymutil.cpp
index aed4ae7..654da68 100644
--- a/llvm/tools/llvm-gsymutil/llvm-gsymutil.cpp
+++ b/llvm/tools/llvm-gsymutil/llvm-gsymutil.cpp
@@ -98,7 +98,7 @@ static uint64_t SegmentSize;
 static bool Quiet;
 static std::vector<uint64_t> LookupAddresses;
 static bool LookupAddressesFromStdin;
-static bool StoreMergedFunctionInfo = false;
+static bool UseMergedFunctions = false;
 static bool LoadDwarfCallSites = false;
 static std::string CallSiteYamlPath;
 
@@ -181,7 +181,7 @@ static void parseArgs(int argc, char **argv) {
   }
 
   LookupAddressesFromStdin = Args.hasArg(OPT_addresses_from_stdin);
-  StoreMergedFunctionInfo = Args.hasArg(OPT_merged_functions);
+  UseMergedFunctions = Args.hasArg(OPT_merged_functions);
 
   if (Args.hasArg(OPT_callsites_yaml_file_EQ)) {
     CallSiteYamlPath = Args.getLastArgValue(OPT_callsites_yaml_file_EQ);
@@ -380,7 +380,7 @@ static llvm::Error handleObjectFile(ObjectFile &Obj, const std::string &OutFile,
   // functions in the first FunctionInfo with that address range. Do this right
   // after loading the DWARF data so we don't have to deal with functions from
   // the symbol table.
-  if (StoreMergedFunctionInfo)
+  if (UseMergedFunctions)
     Gsym.prepareMergedFunctions(Out);
 
   // Get the UUID and convert symbol table to GSYM.
@@ -508,24 +508,37 @@ static llvm::Error convertFileToGSYM(OutputAggregator &Out) {
 }
 
 static void doLookup(GsymReader &Gsym, uint64_t Addr, raw_ostream &OS) {
-  if (auto Result = Gsym.lookup(Addr)) {
-    // If verbose is enabled dump the full function info for the address.
-    if (Verbose) {
-      if (auto FI = Gsym.getFunctionInfo(Addr)) {
-        OS << "FunctionInfo for " << HEX64(Addr) << ":\n";
-        Gsym.dump(OS, *FI);
-        OS << "\nLookupResult for " << HEX64(Addr) << ":\n";
+  if (UseMergedFunctions) {
+    if (auto Results = Gsym.lookupAll(Addr)) {
+      OS << "Found " << Results->size() << " functions at address "
+         << HEX64(Addr) << ":\n";
+      for (size_t i = 0; i < Results->size(); ++i) {
+        OS << "   " << Results->at(i);
+
+        if (i != Results->size() - 1)
+          OS << "\n";
       }
     }
-    OS << Result.get();
-  } else {
+  } else { /* UseMergedFunctions == false */
+    if (auto Result = Gsym.lookup(Addr)) {
+      // If verbose is enabled dump the full function info for the address.
+      if (Verbose) {
+        if (auto FI = Gsym.getFunctionInfo(Addr)) {
+          OS << "FunctionInfo for " << HEX64(Addr) << ":\n";
+          Gsym.dump(OS, *FI);
+          OS << "\nLookupResult for " << HEX64(Addr) << ":\n";
+        }
+      }
+      OS << Result.get();
+    } else {
+      if (Verbose)
+        OS << "\nLookupResult for " << HEX64(Addr) << ":\n";
+      OS << HEX64(Addr) << ": ";
+      logAllUnhandledErrors(Result.takeError(), OS, "error: ");
+    }
     if (Verbose)
-      OS << "\nLookupResult for " << HEX64(Addr) << ":\n";
-    OS << HEX64(Addr) << ": ";
-    logAllUnhandledErrors(Result.takeError(), OS, "error: ");
+      OS << "\n";
   }
-  if (Verbose)
-    OS << "\n";
 }
 
 int llvm_gsymutil_main(int argc, char **argv, const llvm::ToolContext &) {
diff --git a/llvm/tools/llvm-jitlink/llvm-jitlink-coff.cpp b/llvm/tools/llvm-jitlink/llvm-jitlink-coff.cpp
index 5271fdb..6db78926 100644
--- a/llvm/tools/llvm-jitlink/llvm-jitlink-coff.cpp
+++ b/llvm/tools/llvm-jitlink/llvm-jitlink-coff.cpp
@@ -66,6 +66,8 @@ static Expected<Symbol &> getCOFFStubTarget(LinkGraph &G, Block &B) {
 
 namespace llvm {
 Error registerCOFFGraphInfo(Session &S, LinkGraph &G) {
+  std::lock_guard<std::mutex> Lock(S.M);
+
   auto FileName = sys::path::filename(G.getName());
   if (S.FileInfos.count(FileName)) {
     return make_error<StringError>("When -check is passed, file names must be "
diff --git a/llvm/tools/llvm-jitlink/llvm-jitlink-elf.cpp b/llvm/tools/llvm-jitlink/llvm-jitlink-elf.cpp
index a8c804a..6aa8941 100644
--- a/llvm/tools/llvm-jitlink/llvm-jitlink-elf.cpp
+++ b/llvm/tools/llvm-jitlink/llvm-jitlink-elf.cpp
@@ -101,6 +101,8 @@ static Error registerSymbol(LinkGraph &G, Symbol &Sym, Session::FileInfo &FI,
 namespace llvm {
 
 Error registerELFGraphInfo(Session &S, LinkGraph &G) {
+  std::lock_guard<std::mutex> Lock(S.M);
+
   auto FileName = sys::path::filename(G.getName());
   if (S.FileInfos.count(FileName)) {
     return make_error<StringError>("When -check is passed, file names must be "
diff --git a/llvm/tools/llvm-jitlink/llvm-jitlink-macho.cpp b/llvm/tools/llvm-jitlink/llvm-jitlink-macho.cpp
index 2c60c80..2fc56c9 100644
--- a/llvm/tools/llvm-jitlink/llvm-jitlink-macho.cpp
+++ b/llvm/tools/llvm-jitlink/llvm-jitlink-macho.cpp
@@ -69,6 +69,8 @@ static Expected<Symbol &> getMachOStubTarget(LinkGraph &G, Block &B) {
 namespace llvm {
 
 Error registerMachOGraphInfo(Session &S, LinkGraph &G) {
+  std::lock_guard<std::mutex> Lock(S.M);
+
   auto FileName = sys::path::filename(G.getName());
   if (S.FileInfos.count(FileName)) {
     return make_error<StringError>("When -check is passed, file names must be "
diff --git a/llvm/tools/llvm-jitlink/llvm-jitlink.cpp b/llvm/tools/llvm-jitlink/llvm-jitlink.cpp
index 96a3e5b..963c363 100644
--- a/llvm/tools/llvm-jitlink/llvm-jitlink.cpp
+++ b/llvm/tools/llvm-jitlink/llvm-jitlink.cpp
@@ -91,6 +91,10 @@ static cl::list<std::string> InputFiles(cl::Positional, cl::OneOrMore,
                                         cl::desc("input files"),
                                         cl::cat(JITLinkCategory));
 
+static cl::opt<size_t> MaterializationThreads(
+    "num-threads", cl::desc("Number of materialization threads to use"),
+    cl::init(std::numeric_limits<size_t>::max()), cl::cat(JITLinkCategory));
+
 static cl::list<std::string>
     LibrarySearchPaths("L",
                        cl::desc("Add dir to the list of library search paths"),
@@ -400,6 +404,7 @@ bool lazyLinkingRequested() {
 }
 
 static Error applyHarnessPromotions(Session &S, LinkGraph &G) {
+  std::lock_guard<std::mutex> Lock(S.M);
 
   // If this graph is part of the test harness there's nothing to do.
   if (S.HarnessFiles.empty() || S.HarnessFiles.count(G.getName()))
@@ -450,7 +455,11 @@ static Error applyHarnessPromotions(Session &S, LinkGraph &G) {
   return Error::success();
 }
 
-static void dumpSectionContents(raw_ostream &OS, LinkGraph &G) {
+static void dumpSectionContents(raw_ostream &OS, Session &S, LinkGraph &G) {
+  std::lock_guard<std::mutex> Lock(S.M);
+
+  outs() << "Relocated section contents for " << G.getName() << ":\n";
+
   constexpr orc::ExecutorAddrDiff DumpWidth = 16;
   static_assert(isPowerOf2_64(DumpWidth), "DumpWidth must be a power of two");
 
@@ -842,7 +851,7 @@ static Expected<std::unique_ptr<ExecutorProcessControl>> launchExecutor() {
     S.CreateMemoryManager = createSharedMemoryManager;
 
   return SimpleRemoteEPC::Create<FDSimpleRemoteEPCTransport>(
-      std::make_unique<DynamicThreadPoolTaskDispatcher>(std::nullopt),
+      std::make_unique<DynamicThreadPoolTaskDispatcher>(MaterializationThreads),
       std::move(S), FromExecutor[ReadEnd], ToExecutor[WriteEnd]);
 #endif
 }
@@ -984,10 +993,21 @@ Expected<std::unique_ptr<Session>> Session::Create(Triple TT,
     auto PageSize = sys::Process::getPageSize();
     if (!PageSize)
       return PageSize.takeError();
+    std::unique_ptr<TaskDispatcher> Dispatcher;
+    if (MaterializationThreads == 0)
+      Dispatcher = std::make_unique<InPlaceTaskDispatcher>();
+    else {
+#if LLVM_ENABLE_THREADS
+      Dispatcher = std::make_unique<DynamicThreadPoolTaskDispatcher>(
+          MaterializationThreads);
+#else
+      llvm_unreachable("MaterializationThreads should be 0");
+#endif
+    }
+
     EPC = std::make_unique<SelfExecutorProcessControl>(
-        std::make_shared<SymbolStringPool>(),
-        std::make_unique<InPlaceTaskDispatcher>(), std::move(TT), *PageSize,
-        createInProcessMemoryManager());
+        std::make_shared<SymbolStringPool>(), std::move(Dispatcher),
+        std::move(TT), *PageSize, createInProcessMemoryManager());
   }
 
   Error Err = Error::success();
@@ -1221,6 +1241,7 @@ void Session::modifyPassConfig(LinkGraph &G, PassConfiguration &PassConfig) {
 
   if (ShowGraphsRegex)
     PassConfig.PostFixupPasses.push_back([this](LinkGraph &G) -> Error {
+      std::lock_guard<std::mutex> Lock(M);
       // Print graph if ShowLinkGraphs is specified-but-empty, or if
       // it contains the given graph.
       if (ShowGraphsRegex->match(G.getName())) {
@@ -1239,9 +1260,8 @@ void Session::modifyPassConfig(LinkGraph &G, PassConfiguration &PassConfig) {
       [this](LinkGraph &G) { return applyHarnessPromotions(*this, G); });
 
   if (ShowRelocatedSectionContents)
-    PassConfig.PostFixupPasses.push_back([](LinkGraph &G) -> Error {
-      outs() << "Relocated section contents for " << G.getName() << ":\n";
-      dumpSectionContents(outs(), G);
+    PassConfig.PostFixupPasses.push_back([this](LinkGraph &G) -> Error {
+      dumpSectionContents(outs(), *this, G);
       return Error::success();
     });
 
@@ -1613,6 +1633,46 @@ static Error sanitizeArguments(const Triple &TT, const char *ArgV0) {
     }
   }
 
+#if LLVM_ENABLE_THREADS
+  if (MaterializationThreads == std::numeric_limits<size_t>::max()) {
+    if (auto HC = std::thread::hardware_concurrency())
+      MaterializationThreads = HC;
+    else {
+      errs() << "Warning: std::thread::hardware_concurrency() returned 0, "
+                "defaulting to -num-threads=1.\n";
+      MaterializationThreads = 1;
+    }
+  }
+#else
+  if (MaterializationThreads.getNumOccurrences() &&
+      MaterializationThreads != 0) {
+    errs() << "Warning: -num-threads was set, but LLVM was built with threads "
+              "disabled. Resetting to -num-threads=0\n";
+  }
+  MaterializationThreads = 0;
+#endif
+
+  if (!!OutOfProcessExecutor.getNumOccurrences() ||
+      !!OutOfProcessExecutorConnect.getNumOccurrences()) {
+    if (NoExec)
+      return make_error<StringError>("-noexec cannot be used with " +
+                                         OutOfProcessExecutor.ArgStr + " or " +
+                                         OutOfProcessExecutorConnect.ArgStr,
+                                     inconvertibleErrorCode());
+
+    if (MaterializationThreads == 0)
+      return make_error<StringError>("-threads=0 cannot be used with " +
+                                         OutOfProcessExecutor.ArgStr + " or " +
+                                         OutOfProcessExecutorConnect.ArgStr,
+                                     inconvertibleErrorCode());
+  }
+
+#ifndef NDEBUG
+  if (DebugFlag && MaterializationThreads != 0)
+    errs() << "Warning: debugging output is not thread safe. "
+              "Use -num-threads=0 to stabilize output.\n";
+#endif // NDEBUG
+
   // Only one of -oop-executor and -oop-executor-connect can be used.
   if (!!OutOfProcessExecutor.getNumOccurrences() &&
       !!OutOfProcessExecutorConnect.getNumOccurrences())
@@ -2502,6 +2562,7 @@ int main(int argc, char *argv[]) {
     if (Timers)
       Timers->JITLinkTG.printAll(errs());
     reportLLVMJITLinkError(EntryPoint.takeError());
+    ExitOnErr(S->ES.endSession());
     exit(1);
   }
 
diff --git a/llvm/tools/llvm-link/llvm-link.cpp b/llvm/tools/llvm-link/llvm-link.cpp
index 34bb6ce..0f4a4d5 100644
--- a/llvm/tools/llvm-link/llvm-link.cpp
+++ b/llvm/tools/llvm-link/llvm-link.cpp
@@ -449,9 +449,8 @@ static bool linkFiles(const char *argv0, LLVMContext &Context, Linker &L,
       }
 
       // Promotion
-      if (renameModuleForThinLTO(*M, *Index,
-                                 /*ClearDSOLocalOnDeclarations=*/false))
-        return true;
+      renameModuleForThinLTO(*M, *Index,
+                             /*ClearDSOLocalOnDeclarations=*/false);
     }
 
     if (Verbose)
diff --git a/llvm/tools/llvm-profgen/PerfReader.cpp b/llvm/tools/llvm-profgen/PerfReader.cpp
index 111c546..ad113ed 100644
--- a/llvm/tools/llvm-profgen/PerfReader.cpp
+++ b/llvm/tools/llvm-profgen/PerfReader.cpp
@@ -42,6 +42,11 @@ static cl::opt<bool>
 cl::opt<bool> ShowDetailedWarning("show-detailed-warning",
                                   cl::desc("Show detailed warning message."));
 
+static cl::opt<int> CSProfMaxUnsymbolizedCtxDepth(
+    "csprof-max-unsymbolized-context-depth", cl::init(-1),
+    cl::desc("Keep the last K contexts while merging unsymbolized profile. -1 "
+             "means no depth limit."));
+
 extern cl::opt<std::string> PerfTraceFilename;
 extern cl::opt<bool> ShowDisassemblyOnly;
 extern cl::opt<bool> ShowSourceLocations;
@@ -172,7 +177,19 @@ std::shared_ptr<AddrBasedCtxKey> AddressStack::getContextKey() {
   std::shared_ptr<AddrBasedCtxKey> KeyStr = std::make_shared<AddrBasedCtxKey>();
   KeyStr->Context = Stack;
   CSProfileGenerator::compressRecursionContext<uint64_t>(KeyStr->Context);
-  CSProfileGenerator::trimContext<uint64_t>(KeyStr->Context);
+  // MaxContextDepth(--csprof-max-context-depth) is used to trim both symbolized
+  // and unsymbolized profile context. Sometimes we want to at least preserve
+  // the inlinings for the leaf frame(the profiled binary inlining),
+  // --csprof-max-context-depth may not be flexible enough, in this case,
+  // --csprof-max-unsymbolized-context-depth is used to limit the context for
+  // unsymbolized profile. If both are set, use the minimum of them.
+  int Depth = CSProfileGenerator::MaxContextDepth != -1
+                  ? CSProfileGenerator::MaxContextDepth
+                  : KeyStr->Context.size();
+  Depth = CSProfMaxUnsymbolizedCtxDepth != -1
+              ? std::min(static_cast<int>(CSProfMaxUnsymbolizedCtxDepth), Depth)
+              : Depth;
+  CSProfileGenerator::trimContext<uint64_t>(KeyStr->Context, Depth);
   return KeyStr;
 }
 
diff --git a/llvm/unittests/CodeGen/GlobalISel/GISelUtilsTest.cpp b/llvm/unittests/CodeGen/GlobalISel/GISelUtilsTest.cpp
index 1ff7fd9..9163663 100644
--- a/llvm/unittests/CodeGen/GlobalISel/GISelUtilsTest.cpp
+++ b/llvm/unittests/CodeGen/GlobalISel/GISelUtilsTest.cpp
@@ -77,6 +77,15 @@ static const LLT NXV3P0 = LLT::scalable_vector(3, P0);
 static const LLT NXV4P0 = LLT::scalable_vector(4, P0);
 static const LLT NXV12P0 = LLT::scalable_vector(12, P0);
 
+static void collectNonCopyMI(SmallVectorImpl<MachineInstr *> &MIList,
+                             MachineFunction *MF) {
+  for (auto &MBB : *MF)
+    for (MachineInstr &MI : MBB) {
+      if (MI.getOpcode() != TargetOpcode::COPY)
+        MIList.push_back(&MI);
+    }
+}
+
 TEST(GISelUtilsTest, getGCDType) {
   EXPECT_EQ(S1, getGCDType(S1, S1));
   EXPECT_EQ(S32, getGCDType(S32, S32));
@@ -408,4 +417,90 @@ TEST_F(AArch64GISelMITest, ConstFalseTest) {
     }
   }
 }
+
+TEST_F(AMDGPUGISelMITest, isConstantOrConstantSplatVectorFP) {
+  StringRef MIRString =
+      "  %cst0:_(s32) = G_FCONSTANT float 2.000000e+00\n"
+      "  %cst1:_(s32) = G_FCONSTANT float 0.0\n"
+      "  %cst2:_(s64) = G_FCONSTANT double 3.000000e-02\n"
+      "  %cst3:_(s32) = G_CONSTANT i32 2\n"
+      "  %cst4:_(<2 x s32>) = G_BUILD_VECTOR %cst0(s32), %cst0(s32)\n"
+      "  %cst5:_(<2 x s32>) = G_BUILD_VECTOR %cst1(s32), %cst0(s32)\n"
+      "  %cst6:_(<2 x s64>) = G_BUILD_VECTOR %cst2(s64), %cst2(s64)\n"
+      "  %cst7:_(<2 x s32>) = G_BUILD_VECTOR %cst3(s32), %cst3:_(s32)\n"
+      "  %cst8:_(<4 x s32>) = G_CONCAT_VECTORS %cst4:_(<2 x s32>), %cst4:_(<2 "
+      "x s32>)\n"
+      "  %cst9:_(<4 x s64>) = G_CONCAT_VECTORS %cst6:_(<2 x s64>), %cst6:_(<2 "
+      "x s64>)\n"
+      "  %cst10:_(<4 x s32>) = G_CONCAT_VECTORS %cst4:_(<2 x s32>), %cst5:_(<2 "
+      "x s32>)\n"
+      "  %cst11:_(<4 x s32>) = G_CONCAT_VECTORS %cst7:_(<2 x s32>), %cst7:_(<2 "
+      "x s32>)\n"
+      "  %cst12:_(s32) = G_IMPLICIT_DEF \n"
+      "  %cst13:_(<2 x s32>) = G_BUILD_VECTOR %cst12(s32), %cst12(s32)\n"
+      "  %cst14:_(<2 x s32>) = G_BUILD_VECTOR %cst0(s32), %cst12(s32)\n"
+      "  %cst15:_(<4 x s32>) = G_CONCAT_VECTORS %cst4:_(<2 x s32>), "
+      "%cst14:_(<2 "
+      "x s32>)\n";
+
+  SmallVector<MachineInstr *, 16> MIList;
+
+  setUp(MIRString);
+  if (!TM)
+    GTEST_SKIP();
+
+  collectNonCopyMI(MIList, MF);
+
+  EXPECT_TRUE(isConstantOrConstantSplatVectorFP(*MIList[0], *MRI).has_value());
+  auto val = isConstantOrConstantSplatVectorFP(*MIList[0], *MRI).value();
+  EXPECT_EQ(2.0, val.convertToFloat());
+
+  EXPECT_TRUE(isConstantOrConstantSplatVectorFP(*MIList[1], *MRI).has_value());
+  val = isConstantOrConstantSplatVectorFP(*MIList[1], *MRI).value();
+  EXPECT_EQ(0.0, val.convertToFloat());
+
+  EXPECT_TRUE(isConstantOrConstantSplatVectorFP(*MIList[2], *MRI).has_value());
+  val = isConstantOrConstantSplatVectorFP(*MIList[2], *MRI).value();
+  EXPECT_EQ(0.03, val.convertToDouble());
+
+  EXPECT_FALSE(isConstantOrConstantSplatVectorFP(*MIList[3], *MRI).has_value());
+
+  EXPECT_TRUE(isConstantOrConstantSplatVectorFP(*MIList[4], *MRI).has_value());
+  val = isConstantOrConstantSplatVectorFP(*MIList[4], *MRI).value();
+  EXPECT_EQ(2.0, val.convertToFloat());
+
+  EXPECT_FALSE(isConstantOrConstantSplatVectorFP(*MIList[5], *MRI).has_value());
+
+  EXPECT_TRUE(isConstantOrConstantSplatVectorFP(*MIList[6], *MRI).has_value());
+  val = isConstantOrConstantSplatVectorFP(*MIList[6], *MRI).value();
+  EXPECT_EQ(0.03, val.convertToDouble());
+
+  EXPECT_FALSE(isConstantOrConstantSplatVectorFP(*MIList[7], *MRI).has_value());
+
+  EXPECT_TRUE(isConstantOrConstantSplatVectorFP(*MIList[8], *MRI).has_value());
+  val = isConstantOrConstantSplatVectorFP(*MIList[8], *MRI).value();
+  EXPECT_EQ(2.0, val.convertToFloat());
+
+  EXPECT_TRUE(isConstantOrConstantSplatVectorFP(*MIList[9], *MRI).has_value());
+  val = isConstantOrConstantSplatVectorFP(*MIList[9], *MRI).value();
+  EXPECT_EQ(0.03, val.convertToDouble());
+
+  EXPECT_FALSE(
+      isConstantOrConstantSplatVectorFP(*MIList[10], *MRI).has_value());
+
+  EXPECT_FALSE(
+      isConstantOrConstantSplatVectorFP(*MIList[11], *MRI).has_value());
+
+  EXPECT_FALSE(
+      isConstantOrConstantSplatVectorFP(*MIList[12], *MRI).has_value());
+
+  EXPECT_FALSE(
+      isConstantOrConstantSplatVectorFP(*MIList[13], *MRI).has_value());
+
+  EXPECT_FALSE(
+      isConstantOrConstantSplatVectorFP(*MIList[14], *MRI).has_value());
+
+  EXPECT_FALSE(
+      isConstantOrConstantSplatVectorFP(*MIList[15], *MRI).has_value());
+}
 }
diff --git a/llvm/unittests/CodeGen/GlobalISel/PatternMatchTest.cpp b/llvm/unittests/CodeGen/GlobalISel/PatternMatchTest.cpp
index 59a86fa..40cd055 100644
--- a/llvm/unittests/CodeGen/GlobalISel/PatternMatchTest.cpp
+++ b/llvm/unittests/CodeGen/GlobalISel/PatternMatchTest.cpp
@@ -224,6 +224,32 @@ TEST_F(AArch64GISelMITest, MatchBinaryOp) {
   auto MIBAddCst = B.buildAdd(s64, MIBCst, Copies[0]);
   auto MIBUnmerge = B.buildUnmerge({s32, s32}, B.buildConstant(s64, 42));
 
+  // Match min/max, and make sure they're commutative.
+  auto SMin = B.buildSMin(s64, Copies[2], MIBAdd);
+  EXPECT_TRUE(mi_match(SMin.getReg(0), *MRI,
+                       m_GSMin(m_GAdd(m_Reg(Src1), m_Reg(Src2)), m_Reg(Src0))));
+  EXPECT_EQ(Src0, Copies[2]);
+  EXPECT_EQ(Src1, Copies[0]);
+  EXPECT_EQ(Src2, Copies[1]);
+  auto SMax = B.buildSMax(s64, Copies[2], MIBAdd);
+  EXPECT_TRUE(mi_match(SMax.getReg(0), *MRI,
+                       m_GSMax(m_GAdd(m_Reg(Src1), m_Reg(Src2)), m_Reg(Src0))));
+  EXPECT_EQ(Src0, Copies[2]);
+  EXPECT_EQ(Src1, Copies[0]);
+  EXPECT_EQ(Src2, Copies[1]);
+  auto UMin = B.buildUMin(s64, Copies[2], MIBAdd);
+  EXPECT_TRUE(mi_match(UMin.getReg(0), *MRI,
+                       m_GUMin(m_GAdd(m_Reg(Src1), m_Reg(Src2)), m_Reg(Src0))));
+  EXPECT_EQ(Src0, Copies[2]);
+  EXPECT_EQ(Src1, Copies[0]);
+  EXPECT_EQ(Src2, Copies[1]);
+  auto UMax = B.buildUMax(s64, Copies[2], MIBAdd);
+  EXPECT_TRUE(mi_match(UMax.getReg(0), *MRI,
+                       m_GUMax(m_GAdd(m_Reg(Src1), m_Reg(Src2)), m_Reg(Src0))));
+  EXPECT_EQ(Src0, Copies[2]);
+  EXPECT_EQ(Src1, Copies[0]);
+  EXPECT_EQ(Src2, Copies[1]);
+
   // m_BinOp with opcode.
   // Match binary instruction, opcode and its non-commutative operands.
   match = mi_match(MIBAddCst, *MRI,
@@ -576,6 +602,11 @@ TEST_F(AArch64GISelMITest, MatchMiscellaneous) {
   auto MIBAdd = B.buildAdd(s64, Copies[0], Copies[1]);
   Register Reg = MIBAdd.getReg(0);
 
+  // Extract the type.
+  LLT Ty;
+  EXPECT_TRUE(mi_match(Reg, *MRI, m_GAdd(m_Type(Ty), m_Reg())));
+  EXPECT_EQ(Ty, s64);
+
   // Only one use of Reg.
   B.buildCast(LLT::pointer(0, 32), MIBAdd);
   EXPECT_TRUE(mi_match(Reg, *MRI, m_OneUse(m_GAdd(m_Reg(), m_Reg()))));
@@ -889,6 +920,36 @@ TEST_F(AArch64GISelMITest, MatchSpecificReg) {
   EXPECT_TRUE(mi_match(Add.getReg(0), *MRI, m_GAdd(m_SpecificReg(Reg), m_Reg())));
 }
 
+TEST_F(AArch64GISelMITest, DeferredMatching) {
+  setUp();
+  if (!TM)
+    GTEST_SKIP();
+  auto s64 = LLT::scalar(64);
+  auto s32 = LLT::scalar(32);
+
+  auto Cst1 = B.buildConstant(s64, 42);
+  auto Cst2 = B.buildConstant(s64, 314);
+  auto Add = B.buildAdd(s64, Cst1, Cst2);
+  auto Sub = B.buildSub(s64, Add, Cst1);
+
+  auto TruncAdd = B.buildTrunc(s32, Add);
+  auto TruncSub = B.buildTrunc(s32, Sub);
+  auto NarrowAdd = B.buildAdd(s32, TruncAdd, TruncSub);
+
+  Register X;
+  EXPECT_TRUE(mi_match(Sub.getReg(0), *MRI,
+                       m_GSub(m_GAdd(m_Reg(X), m_Reg()), m_DeferredReg(X))));
+  LLT Ty;
+  EXPECT_TRUE(
+      mi_match(NarrowAdd.getReg(0), *MRI,
+               m_GAdd(m_GTrunc(m_Type(Ty)), m_GTrunc(m_DeferredType(Ty)))));
+
+  // Test commutative.
+  auto Add2 = B.buildAdd(s64, Sub, Cst1);
+  EXPECT_TRUE(mi_match(Add2.getReg(0), *MRI,
+                       m_GAdd(m_Reg(X), m_GSub(m_Reg(), m_DeferredReg(X)))));
+}
+
 } // namespace
 
 int main(int argc, char **argv) {
diff --git a/llvm/unittests/CodeGen/SelectionDAGPatternMatchTest.cpp b/llvm/unittests/CodeGen/SelectionDAGPatternMatchTest.cpp
index 259bdad..a2e1e58 100644
--- a/llvm/unittests/CodeGen/SelectionDAGPatternMatchTest.cpp
+++ b/llvm/unittests/CodeGen/SelectionDAGPatternMatchTest.cpp
@@ -119,6 +119,33 @@ TEST_F(SelectionDAGPatternMatchTest, matchValueType) {
   EXPECT_FALSE(sd_match(Op2, m_ScalableVectorVT()));
 }
 
+TEST_F(SelectionDAGPatternMatchTest, matchVecShuffle) {
+  SDLoc DL;
+  auto Int32VT = EVT::getIntegerVT(Context, 32);
+  auto VInt32VT = EVT::getVectorVT(Context, Int32VT, 4);
+  const std::array<int, 4> MaskData = {2, 0, 3, 1};
+  const std::array<int, 4> OtherMaskData = {1, 2, 3, 4};
+  ArrayRef<int> Mask;
+
+  SDValue V0 = DAG->getCopyFromReg(DAG->getEntryNode(), DL, 1, VInt32VT);
+  SDValue V1 = DAG->getCopyFromReg(DAG->getEntryNode(), DL, 2, VInt32VT);
+  SDValue VecShuffleWithMask =
+      DAG->getVectorShuffle(VInt32VT, DL, V0, V1, MaskData);
+
+  using namespace SDPatternMatch;
+  EXPECT_TRUE(sd_match(VecShuffleWithMask, m_Shuffle(m_Value(), m_Value())));
+  EXPECT_TRUE(sd_match(VecShuffleWithMask,
+                       m_Shuffle(m_Value(), m_Value(), m_Mask(Mask))));
+  EXPECT_TRUE(
+      sd_match(VecShuffleWithMask,
+               m_Shuffle(m_Value(), m_Value(), m_SpecificMask(MaskData))));
+  EXPECT_FALSE(
+      sd_match(VecShuffleWithMask,
+               m_Shuffle(m_Value(), m_Value(), m_SpecificMask(OtherMaskData))));
+  EXPECT_TRUE(
+      std::equal(MaskData.begin(), MaskData.end(), Mask.begin(), Mask.end()));
+}
+
 TEST_F(SelectionDAGPatternMatchTest, matchTernaryOp) {
   SDLoc DL;
   auto Int32VT = EVT::getIntegerVT(Context, 32);
diff --git a/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp b/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp
index d7ac108..9faae88 100644
--- a/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp
+++ b/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp
@@ -1993,6 +1993,7 @@ TEST_F(OpenMPIRBuilderTest, ApplySimdCustomAligned) {
   OpenMPIRBuilder OMPBuilder(*M);
   IRBuilder<> Builder(BB);
   const int AlignmentValue = 32;
+  llvm::BasicBlock *sourceBlock = Builder.GetInsertBlock();
   AllocaInst *Alloc1 =
       Builder.CreateAlloca(Builder.getPtrTy(), Builder.getInt64(1));
   LoadInst *Load1 = Builder.CreateLoad(Alloc1->getAllocatedType(), Alloc1);
@@ -2031,13 +2032,12 @@ TEST_F(OpenMPIRBuilderTest, ApplySimdCustomAligned) {
 
   // Check if number of assumption instructions is equal to number of aligned
   // variables
-  BasicBlock *LoopPreheader = CLI->getPreheader();
-  size_t NumAssummptionCallsInPreheader = count_if(
-      *LoopPreheader, [](Instruction &I) { return isa<AssumeInst>(I); });
+  size_t NumAssummptionCallsInPreheader =
+      count_if(*sourceBlock, [](Instruction &I) { return isa<AssumeInst>(I); });
   EXPECT_EQ(NumAssummptionCallsInPreheader, AlignedVars.size());
 
   // Check if variables are correctly aligned
-  for (Instruction &Instr : *LoopPreheader) {
+  for (Instruction &Instr : *sourceBlock) {
     if (!isa<AssumeInst>(Instr))
       continue;
     AssumeInst *AssumeInstruction = cast<AssumeInst>(&Instr);
diff --git a/llvm/unittests/IR/ConstantRangeTest.cpp b/llvm/unittests/IR/ConstantRangeTest.cpp
index e1d9b3e..c390ffe 100644
--- a/llvm/unittests/IR/ConstantRangeTest.cpp
+++ b/llvm/unittests/IR/ConstantRangeTest.cpp
@@ -2720,6 +2720,37 @@ TEST_F(ConstantRangeTest, binaryAnd) {
   EXPECT_EQ(R16_32.binaryAnd(R0_99), R0_32);
   EXPECT_EQ(R0_99.binaryAnd(R16_32), R0_32);
 
+  // 'And' with leading bits are masked (with common leading bits stripped)
+  ConstantRange RMaskedL(APInt(8, 0b10'00101'1), APInt(8, 0b10'10000'0 + 1));
+  ConstantRange RMaskedR(APInt(8, 0b10'11111'0), APInt(8, 0b10'11111'1 + 1));
+  EXPECT_EQ(RMaskedL.binaryAnd(RMaskedR).getLower(), APInt(8, 0b10'00101'0));
+  EXPECT_EQ(RMaskedR.binaryAnd(RMaskedL).getLower(), APInt(8, 0b10'00101'0));
+
+  ConstantRange RMaskedL1(APInt(8, 0b00'011'010), APInt(8, 0b00'100'100 + 1));
+  ConstantRange RMaskedR1(APInt(8, 0b00'111'010), APInt(8, 0b00'111'110 + 1));
+  EXPECT_EQ(RMaskedL1.binaryAnd(RMaskedR1).getLower(), APInt(8, 0b00'011'000));
+  EXPECT_EQ(RMaskedR1.binaryAnd(RMaskedL1).getLower(), APInt(8, 0b00'011'000));
+
+  ConstantRange RMaskedL2(APInt(8, 0b0000'0111u), APInt(8, 0b0000'1101u + 1u));
+  ConstantRange RMaskedR2(APInt(8, 0xff), APInt(8, 0));
+  EXPECT_EQ(RMaskedL2.binaryAnd(RMaskedR2), RMaskedL2);
+  EXPECT_EQ(RMaskedR2.binaryAnd(RMaskedL2), RMaskedL2);
+
+  ConstantRange RMaskedL3(APInt(4, 0b0011u), APInt(4, 0));
+  ConstantRange RMaskedR3(APInt(4, 0b1011u), APInt(4, 0));
+  APInt Zero_4(4, 0);
+  EXPECT_EQ(RMaskedL3.binaryAnd(RMaskedR3).getLower().uge(Zero_4), true);
+  EXPECT_EQ(RMaskedR3.binaryAnd(RMaskedL3).getLower().uge(Zero_4), true);
+
+  // wrapped set
+  APInt NegSeven(4, 9); // Also -7
+  ConstantRange RMaskedL4(NegSeven, APInt(4, 1));
+  ConstantRange RMaskedR4(NegSeven, APInt(4, 0));
+  EXPECT_EQ(RMaskedL4.binaryAnd(RMaskedR4).contains(Zero_4), true);
+  EXPECT_EQ(RMaskedR4.binaryAnd(RMaskedL4).contains(Zero_4), true);
+  EXPECT_EQ(RMaskedL4.binaryAnd(RMaskedR4).contains(NegSeven), true);
+  EXPECT_EQ(RMaskedR4.binaryAnd(RMaskedL4).contains(NegSeven), true);
+
   TestBinaryOpExhaustive(
       [](const ConstantRange &CR1, const ConstantRange &CR2) {
         return CR1.binaryAnd(CR2);
diff --git a/llvm/unittests/MC/AMDGPU/DwarfRegMappings.cpp b/llvm/unittests/MC/AMDGPU/DwarfRegMappings.cpp
index a0c8437..259d68e 100644
--- a/llvm/unittests/MC/AMDGPU/DwarfRegMappings.cpp
+++ b/llvm/unittests/MC/AMDGPU/DwarfRegMappings.cpp
@@ -51,10 +51,11 @@ TEST(AMDGPUDwarfRegMappingTests, TestWave64DwarfRegMapping) {
       // PC_64 => 16, EXEC_MASK_64 => 17, S0 => 32, S63 => 95,
       // S64 => 1088, S105 => 1129, V0 => 2560, V255 => 2815,
       // A0 => 3072, A255 => 3327
-      for (int llvmReg : {16, 17, 32, 95, 1088, 1129, 2560, 2815, 3072, 3327}) {
-        MCRegister PCReg(*MRI->getLLVMRegNum(llvmReg, false));
-        EXPECT_EQ(llvmReg, MRI->getDwarfRegNum(PCReg, false));
-        EXPECT_EQ(llvmReg, MRI->getDwarfRegNum(PCReg, true));
+      for (int DwarfEncoding :
+           {16, 17, 32, 95, 1088, 1129, 2560, 2815, 3072, 3327}) {
+        MCRegister Reg = *MRI->getLLVMRegNum(DwarfEncoding, false);
+        EXPECT_EQ(DwarfEncoding, MRI->getDwarfRegNum(Reg, false));
+        EXPECT_EQ(DwarfEncoding, MRI->getDwarfRegNum(Reg, true));
       }
     }
   }
@@ -70,10 +71,11 @@ TEST(AMDGPUDwarfRegMappingTests, TestWave32DwarfRegMapping) {
       // PC_64 => 16, EXEC_MASK_32 => 1, S0 => 32, S63 => 95,
       // S64 => 1088, S105 => 1129, V0 => 1536, V255 => 1791,
       // A0 => 2048, A255 => 2303
-      for (int llvmReg : {16, 1, 32, 95, 1088, 1129, 1536, 1791, 2048, 2303}) {
-        MCRegister PCReg(*MRI->getLLVMRegNum(llvmReg, false));
-        EXPECT_EQ(llvmReg, MRI->getDwarfRegNum(PCReg, false));
-        EXPECT_EQ(llvmReg, MRI->getDwarfRegNum(PCReg, true));
+      for (int DwarfEncoding :
+           {16, 1, 32, 95, 1088, 1129, 1536, 1791, 2048, 2303}) {
+        MCRegister Reg = *MRI->getLLVMRegNum(DwarfEncoding, false);
+        EXPECT_EQ(DwarfEncoding, MRI->getDwarfRegNum(Reg, false));
+        EXPECT_EQ(DwarfEncoding, MRI->getDwarfRegNum(Reg, true));
       }
     }
   }
diff --git a/llvm/unittests/Support/CMakeLists.txt b/llvm/unittests/Support/CMakeLists.txt
index d64f898..6de8165 100644
--- a/llvm/unittests/Support/CMakeLists.txt
+++ b/llvm/unittests/Support/CMakeLists.txt
@@ -69,6 +69,7 @@ add_llvm_unittest(SupportTests
   PerThreadBumpPtrAllocatorTest.cpp
   ProcessTest.cpp
   ProgramTest.cpp
+  RecyclerTest.cpp
   RegexTest.cpp
   ReverseIterationTest.cpp
   ReplaceFileTest.cpp
diff --git a/llvm/unittests/Support/Path.cpp b/llvm/unittests/Support/Path.cpp
index 8dde2fb..187f47d 100644
--- a/llvm/unittests/Support/Path.cpp
+++ b/llvm/unittests/Support/Path.cpp
@@ -1326,6 +1326,9 @@ TEST_F(FileSystemTest, Remove) {
 
   ASSERT_NO_ERROR(fs::remove_directories("D:/footest"));
 
+  ASSERT_NO_ERROR(fs::remove_directories(Twine(BaseDir) + "/foo/bar/baz"));
+  ASSERT_FALSE(fs::exists(Twine(BaseDir) + "/foo/bar/baz"));
+
   ASSERT_NO_ERROR(fs::remove_directories(BaseDir));
   ASSERT_FALSE(fs::exists(BaseDir));
 }
diff --git a/llvm/unittests/Support/RecyclerTest.cpp b/llvm/unittests/Support/RecyclerTest.cpp
new file mode 100644
index 0000000..a33506b
--- /dev/null
+++ b/llvm/unittests/Support/RecyclerTest.cpp
@@ -0,0 +1,47 @@
+//===--- unittest/Support/RecyclerTest.cpp --------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/Recycler.h"
+#include "llvm/Support/AllocatorBase.h"
+#include "gtest/gtest.h"
+
+using namespace llvm;
+
+namespace {
+
+struct Object8 {
+  char Data[8];
+};
+
+class DecoratedMallocAllocator : public MallocAllocator {
+public:
+  int DeallocCount = 0;
+
+  template <typename T> void Deallocate(T *Ptr) {
+    DeallocCount++;
+    MallocAllocator::Deallocate(Ptr);
+  }
+};
+
+TEST(RecyclerTest, MoveConstructor) {
+  DecoratedMallocAllocator Allocator;
+  Recycler<Object8> R;
+  Object8 *A1 = R.Allocate(Allocator);
+  Object8 *A2 = R.Allocate(Allocator);
+  R.Deallocate(Allocator, A1);
+  R.Deallocate(Allocator, A2);
+  Recycler<Object8> R2(std::move(R));
+  Object8 *A3 = R2.Allocate(Allocator);
+  R2.Deallocate(Allocator, A3);
+  R.clear(Allocator); // Should not deallocate anything as it was moved from.
+  EXPECT_EQ(Allocator.DeallocCount, 0);
+  R2.clear(Allocator);
+  EXPECT_EQ(Allocator.DeallocCount, 2);
+}
+
+} // end anonymous namespace
diff --git a/llvm/unittests/Target/AMDGPU/DwarfRegMappings.cpp b/llvm/unittests/Target/AMDGPU/DwarfRegMappings.cpp
index 56da4ce..00c9593 100644
--- a/llvm/unittests/Target/AMDGPU/DwarfRegMappings.cpp
+++ b/llvm/unittests/Target/AMDGPU/DwarfRegMappings.cpp
@@ -25,11 +25,24 @@ TEST(AMDGPU, TestWave64DwarfRegMapping) {
         // PC_64 => 16, EXEC_MASK_64 => 17, S0 => 32, S63 => 95,
         // S64 => 1088, S105 => 1129, V0 => 2560, V255 => 2815,
         // A0 => 3072, A255 => 3327
-        for (int llvmReg :
+        for (int DwarfEncoding :
              {16, 17, 32, 95, 1088, 1129, 2560, 2815, 3072, 3327}) {
-          MCRegister PCReg(*MRI->getLLVMRegNum(llvmReg, false));
-          EXPECT_EQ(llvmReg, MRI->getDwarfRegNum(PCReg, false));
-          EXPECT_EQ(llvmReg, MRI->getDwarfRegNum(PCReg, true));
+          MCRegister Reg = *MRI->getLLVMRegNum(DwarfEncoding, false);
+          EXPECT_EQ(DwarfEncoding, MRI->getDwarfRegNum(Reg, false));
+          EXPECT_EQ(DwarfEncoding, MRI->getDwarfRegNum(Reg, true));
+        }
+
+        // We should get the correct LLVM register when round tripping through
+        // the dwarf encoding.
+        for (MCRegister LLReg : {AMDGPU::VGPR1, AMDGPU::AGPR2, AMDGPU::SGPR3}) {
+          int DwarfEncoding = MRI->getDwarfRegNum(LLReg, false);
+          EXPECT_EQ(LLReg, MRI->getLLVMRegNum(DwarfEncoding, false));
+        }
+
+        // Verify that subregisters have no dwarf encoding.
+        for (MCRegister LLSubReg :
+             {AMDGPU::VGPR1_LO16, AMDGPU::AGPR1_HI16, AMDGPU::SGPR1_HI16}) {
+          EXPECT_EQ(MRI->getDwarfRegNum(LLSubReg, false), -1);
         }
       }
     }
@@ -49,11 +62,24 @@ TEST(AMDGPU, TestWave32DwarfRegMapping) {
         // PC_64 => 16, EXEC_MASK_32 => 1, S0 => 32, S63 => 95,
         // S64 => 1088, S105 => 1129, V0 => 1536, V255 => 1791,
         // A0 => 2048, A255 => 2303
-        for (int llvmReg :
+        for (int DwarfEncoding :
              {16, 1, 32, 95, 1088, 1129, 1536, 1791, 2048, 2303}) {
-          MCRegister PCReg(*MRI->getLLVMRegNum(llvmReg, false));
-          EXPECT_EQ(llvmReg, MRI->getDwarfRegNum(PCReg, false));
-          EXPECT_EQ(llvmReg, MRI->getDwarfRegNum(PCReg, true));
+          MCRegister Reg = *MRI->getLLVMRegNum(DwarfEncoding, false);
+          EXPECT_EQ(DwarfEncoding, MRI->getDwarfRegNum(Reg, false));
+          EXPECT_EQ(DwarfEncoding, MRI->getDwarfRegNum(Reg, true));
+        }
+
+        // We should get the correct LLVM register when round tripping through
+        // the dwarf encoding.
+        for (MCRegister LLReg : {AMDGPU::VGPR1, AMDGPU::AGPR2, AMDGPU::SGPR3}) {
+          int DwarfEncoding = MRI->getDwarfRegNum(LLReg, false);
+          EXPECT_EQ(LLReg, MRI->getLLVMRegNum(DwarfEncoding, false));
+        }
+
+        // Verify that subregisters have no dwarf encoding.
+        for (MCRegister LLSubReg :
+             {AMDGPU::VGPR1_LO16, AMDGPU::AGPR1_HI16, AMDGPU::SGPR1_HI16}) {
+          EXPECT_EQ(MRI->getDwarfRegNum(LLSubReg, false), -1);
         }
       }
     }
diff --git a/llvm/unittests/TargetParser/RISCVISAInfoTest.cpp b/llvm/unittests/TargetParser/RISCVISAInfoTest.cpp
index ed334f0..3955d36 100644
--- a/llvm/unittests/TargetParser/RISCVISAInfoTest.cpp
+++ b/llvm/unittests/TargetParser/RISCVISAInfoTest.cpp
@@ -654,8 +654,9 @@ TEST(ParseArchString, RejectsConflictingExtensions) {
   }
 
   for (StringRef Input :
-       {"rv64i_xqcisls0p2", "rv64i_xqcia0p2", "rv64i_xqcicsr0p2",
-        "rv64i_xqcilsm0p2", "rv64i_xqcics0p2"}) {
+       {"rv64i_xqcisls0p2", "rv64i_xqcia0p2", "rv64i_xqciac0p2",
+        "rv64i_xqcicsr0p2", "rv64i_xqcilsm0p2", "rv64i_xqcicm0p2",
+        "rv64i_xqcics0p2", "rv64i_xqcicli0p2"}) {
     EXPECT_THAT(
         toString(RISCVISAInfo::parseArchString(Input, true).takeError()),
         ::testing::EndsWith(" is only supported for 'rv32'"));
@@ -1109,10 +1110,15 @@ Experimental extensions
     zalasr               0.1
     zvbc32e              0.7
     zvkgs                0.7
+    sdext                1.0
+    sdtrig               1.0
     smctr                1.0
     ssctr                1.0
     svukte               0.3
     xqcia                0.2
+    xqciac               0.2
+    xqcicli              0.2
+    xqcicm               0.2
     xqcics               0.2
     xqcicsr              0.2
     xqcilsm              0.2
diff --git a/llvm/unittests/Transforms/Vectorize/VPDomTreeTest.cpp b/llvm/unittests/Transforms/Vectorize/VPDomTreeTest.cpp
index 847cca7..55b68f5 100644
--- a/llvm/unittests/Transforms/Vectorize/VPDomTreeTest.cpp
+++ b/llvm/unittests/Transforms/Vectorize/VPDomTreeTest.cpp
@@ -9,12 +9,15 @@
 
 #include "../lib/Transforms/Vectorize/VPlan.h"
 #include "../lib/Transforms/Vectorize/VPlanDominatorTree.h"
+#include "VPlanTestBase.h"
 #include "gtest/gtest.h"
 
 namespace llvm {
 namespace {
 
-TEST(VPDominatorTreeTest, DominanceNoRegionsTest) {
+using VPDominatorTreeTest = VPlanTestBase;
+
+TEST_F(VPDominatorTreeTest, DominanceNoRegionsTest) {
   //   VPBB0
   //    |
   //   R1 {
@@ -24,13 +27,13 @@ TEST(VPDominatorTreeTest, DominanceNoRegionsTest) {
   //    \    /
   //    VPBB4
   //  }
-  VPBasicBlock *VPPH = new VPBasicBlock("ph");
-  VPBasicBlock *VPBB0 = new VPBasicBlock("VPBB0");
-  VPBasicBlock *VPBB1 = new VPBasicBlock("VPBB1");
-  VPBasicBlock *VPBB2 = new VPBasicBlock("VPBB2");
-  VPBasicBlock *VPBB3 = new VPBasicBlock("VPBB3");
-  VPBasicBlock *VPBB4 = new VPBasicBlock("VPBB4");
-  VPRegionBlock *R1 = new VPRegionBlock(VPBB1, VPBB4);
+  VPlan &Plan = getPlan();
+  VPBasicBlock *VPBB0 = Plan.getEntry();
+  VPBasicBlock *VPBB1 = Plan.createVPBasicBlock("VPBB1");
+  VPBasicBlock *VPBB2 = Plan.createVPBasicBlock("VPBB2");
+  VPBasicBlock *VPBB3 = Plan.createVPBasicBlock("VPBB3");
+  VPBasicBlock *VPBB4 = Plan.createVPBasicBlock("VPBB4");
+  VPRegionBlock *R1 = Plan.createVPRegionBlock(VPBB1, VPBB4);
   VPBB2->setParent(R1);
   VPBB3->setParent(R1);
 
@@ -40,12 +43,7 @@ TEST(VPDominatorTreeTest, DominanceNoRegionsTest) {
   VPBlockUtils::connectBlocks(VPBB2, VPBB4);
   VPBlockUtils::connectBlocks(VPBB3, VPBB4);
 
-  LLVMContext C;
-  auto *ScalarHeader = BasicBlock::Create(C, "");
-  VPIRBasicBlock *ScalarHeaderVPBB = new VPIRBasicBlock(ScalarHeader);
-  VPBlockUtils::connectBlocks(R1, ScalarHeaderVPBB);
-  VPBlockUtils::connectBlocks(VPPH, VPBB0);
-  VPlan Plan(VPPH, ScalarHeaderVPBB);
+  VPBlockUtils::connectBlocks(R1, Plan.getScalarHeader());
 
   VPDominatorTree VPDT;
   VPDT.recalculate(Plan);
@@ -62,7 +60,6 @@ TEST(VPDominatorTreeTest, DominanceNoRegionsTest) {
   EXPECT_EQ(VPDT.findNearestCommonDominator(VPBB2, VPBB3), VPBB1);
   EXPECT_EQ(VPDT.findNearestCommonDominator(VPBB2, VPBB4), VPBB1);
   EXPECT_EQ(VPDT.findNearestCommonDominator(VPBB4, VPBB4), VPBB4);
-  delete ScalarHeader;
 }
 
 static void
@@ -76,9 +73,7 @@ checkDomChildren(VPDominatorTree &VPDT, VPBlockBase *Src,
   EXPECT_EQ(Children, ExpectedNodes);
 }
 
-TEST(VPDominatorTreeTest, DominanceRegionsTest) {
-  LLVMContext C;
-  auto *ScalarHeader = BasicBlock::Create(C, "");
+TEST_F(VPDominatorTreeTest, DominanceRegionsTest) {
   {
     // 2 consecutive regions.
     // VPBB0
@@ -99,13 +94,13 @@ TEST(VPDominatorTreeTest, DominanceRegionsTest) {
     //    R2BB2
     // }
     //
-    VPBasicBlock *VPPH = new VPBasicBlock("ph");
-    VPBasicBlock *VPBB0 = new VPBasicBlock("VPBB0");
-    VPBasicBlock *R1BB1 = new VPBasicBlock();
-    VPBasicBlock *R1BB2 = new VPBasicBlock();
-    VPBasicBlock *R1BB3 = new VPBasicBlock();
-    VPBasicBlock *R1BB4 = new VPBasicBlock();
-    VPRegionBlock *R1 = new VPRegionBlock(R1BB1, R1BB4, "R1");
+    VPlan &Plan = getPlan();
+    VPBasicBlock *VPBB0 = Plan.getEntry();
+    VPBasicBlock *R1BB1 = Plan.createVPBasicBlock("");
+    VPBasicBlock *R1BB2 = Plan.createVPBasicBlock("");
+    VPBasicBlock *R1BB3 = Plan.createVPBasicBlock("");
+    VPBasicBlock *R1BB4 = Plan.createVPBasicBlock("");
+    VPRegionBlock *R1 = Plan.createVPRegionBlock(R1BB1, R1BB4, "R1");
     R1BB2->setParent(R1);
     R1BB3->setParent(R1);
     VPBlockUtils::connectBlocks(VPBB0, R1);
@@ -116,16 +111,13 @@ TEST(VPDominatorTreeTest, DominanceRegionsTest) {
     // Cycle.
     VPBlockUtils::connectBlocks(R1BB3, R1BB3);
 
-    VPBasicBlock *R2BB1 = new VPBasicBlock();
-    VPBasicBlock *R2BB2 = new VPBasicBlock();
-    VPRegionBlock *R2 = new VPRegionBlock(R2BB1, R2BB2, "R2");
+    VPBasicBlock *R2BB1 = Plan.createVPBasicBlock("");
+    VPBasicBlock *R2BB2 = Plan.createVPBasicBlock("");
+    VPRegionBlock *R2 = Plan.createVPRegionBlock(R2BB1, R2BB2, "R2");
     VPBlockUtils::connectBlocks(R2BB1, R2BB2);
     VPBlockUtils::connectBlocks(R1, R2);
 
-    VPIRBasicBlock *ScalarHeaderVPBB = new VPIRBasicBlock(ScalarHeader);
-    VPBlockUtils::connectBlocks(R2, ScalarHeaderVPBB);
-    VPBlockUtils::connectBlocks(VPPH, VPBB0);
-    VPlan Plan(VPPH, ScalarHeaderVPBB);
+    VPBlockUtils::connectBlocks(R2, Plan.getScalarHeader());
     VPDominatorTree VPDT;
     VPDT.recalculate(Plan);
 
@@ -177,16 +169,16 @@ TEST(VPDominatorTreeTest, DominanceRegionsTest) {
     //   |
     //  VPBB2
     //
-    VPBasicBlock *VPPH = new VPBasicBlock("ph");
-    VPBasicBlock *R1BB1 = new VPBasicBlock("R1BB1");
-    VPBasicBlock *R1BB2 = new VPBasicBlock("R1BB2");
-    VPBasicBlock *R1BB3 = new VPBasicBlock("R1BB3");
-    VPRegionBlock *R1 = new VPRegionBlock(R1BB1, R1BB3, "R1");
-
-    VPBasicBlock *R2BB1 = new VPBasicBlock("R2BB1");
-    VPBasicBlock *R2BB2 = new VPBasicBlock("R2BB2");
-    VPBasicBlock *R2BB3 = new VPBasicBlock("R2BB3");
-    VPRegionBlock *R2 = new VPRegionBlock(R2BB1, R2BB3, "R2");
+    VPlan &Plan = getPlan();
+    VPBasicBlock *R1BB1 = Plan.createVPBasicBlock("R1BB1");
+    VPBasicBlock *R1BB2 = Plan.createVPBasicBlock("R1BB2");
+    VPBasicBlock *R1BB3 = Plan.createVPBasicBlock("R1BB3");
+    VPRegionBlock *R1 = Plan.createVPRegionBlock(R1BB1, R1BB3, "R1");
+
+    VPBasicBlock *R2BB1 = Plan.createVPBasicBlock("R2BB1");
+    VPBasicBlock *R2BB2 = Plan.createVPBasicBlock("R2BB2");
+    VPBasicBlock *R2BB3 = Plan.createVPBasicBlock("R2BB#");
+    VPRegionBlock *R2 = Plan.createVPRegionBlock(R2BB1, R2BB3, "R2");
     R2BB2->setParent(R2);
     VPBlockUtils::connectBlocks(R2BB1, R2BB2);
     VPBlockUtils::connectBlocks(R2BB2, R2BB1);
@@ -199,15 +191,12 @@ TEST(VPDominatorTreeTest, DominanceRegionsTest) {
     VPBlockUtils::connectBlocks(R1BB2, R1BB3);
     VPBlockUtils::connectBlocks(R2, R1BB3);
 
-    VPBasicBlock *VPBB1 = new VPBasicBlock("VPBB1");
+    VPBasicBlock *VPBB1 = Plan.getEntry();
     VPBlockUtils::connectBlocks(VPBB1, R1);
-    VPBasicBlock *VPBB2 = new VPBasicBlock("VPBB2");
+    VPBasicBlock *VPBB2 = Plan.createVPBasicBlock("VPBB2");
     VPBlockUtils::connectBlocks(R1, VPBB2);
 
-    VPIRBasicBlock *ScalarHeaderVPBB = new VPIRBasicBlock(ScalarHeader);
-    VPBlockUtils::connectBlocks(VPBB2, ScalarHeaderVPBB);
-    VPBlockUtils::connectBlocks(VPPH, VPBB1);
-    VPlan Plan(VPPH, ScalarHeaderVPBB);
+    VPBlockUtils::connectBlocks(VPBB2, Plan.getScalarHeader());
     VPDominatorTree VPDT;
     VPDT.recalculate(Plan);
 
@@ -220,9 +209,8 @@ TEST(VPDominatorTreeTest, DominanceRegionsTest) {
     checkDomChildren(VPDT, R2BB2, {R2BB3});
     checkDomChildren(VPDT, R2BB3, {});
     checkDomChildren(VPDT, R1BB3, {VPBB2});
-    checkDomChildren(VPDT, VPBB2, {ScalarHeaderVPBB});
+    checkDomChildren(VPDT, VPBB2, {Plan.getScalarHeader()});
   }
-  delete ScalarHeader;
 }
 
 } // namespace
diff --git a/llvm/unittests/Transforms/Vectorize/VPlanHCFGTest.cpp b/llvm/unittests/Transforms/Vectorize/VPlanHCFGTest.cpp
index 1b362d1..19c2483 100644
--- a/llvm/unittests/Transforms/Vectorize/VPlanHCFGTest.cpp
+++ b/llvm/unittests/Transforms/Vectorize/VPlanHCFGTest.cpp
@@ -17,7 +17,7 @@
 namespace llvm {
 namespace {
 
-class VPlanHCFGTest : public VPlanTestBase {};
+class VPlanHCFGTest : public VPlanTestIRBase {};
 
 TEST_F(VPlanHCFGTest, testBuildHCFGInnerLoop) {
   const char *ModuleString =
diff --git a/llvm/unittests/Transforms/Vectorize/VPlanSlpTest.cpp b/llvm/unittests/Transforms/Vectorize/VPlanSlpTest.cpp
index 1b993b6..e3c542e 100644
--- a/llvm/unittests/Transforms/Vectorize/VPlanSlpTest.cpp
+++ b/llvm/unittests/Transforms/Vectorize/VPlanSlpTest.cpp
@@ -16,7 +16,7 @@
 namespace llvm {
 namespace {
 
-class VPlanSlpTest : public VPlanTestBase {
+class VPlanSlpTest : public VPlanTestIRBase {
 protected:
   TargetLibraryInfoImpl TLII;
   TargetLibraryInfo TLI;
diff --git a/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp b/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp
index f3a1bba..1ac499f 100644
--- a/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp
+++ b/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp
@@ -9,6 +9,7 @@
 
 #include "../lib/Transforms/Vectorize/VPlan.h"
 #include "../lib/Transforms/Vectorize/VPlanCFG.h"
+#include "VPlanTestBase.h"
 #include "llvm/ADT/DepthFirstIterator.h"
 #include "llvm/ADT/PostOrderIterator.h"
 #include "llvm/Analysis/VectorUtils.h"
@@ -237,15 +238,16 @@ TEST(VPInstructionTest, releaseOperandsAtDeletion) {
   delete VPV1;
   delete VPV2;
 }
-TEST(VPBasicBlockTest, getPlan) {
-  LLVMContext C;
-  auto *ScalarHeader = BasicBlock::Create(C, "");
+
+using VPBasicBlockTest = VPlanTestBase;
+
+TEST_F(VPBasicBlockTest, getPlan) {
   {
-    VPBasicBlock *VPPH = new VPBasicBlock("ph");
-    VPBasicBlock *VPBB1 = new VPBasicBlock();
-    VPBasicBlock *VPBB2 = new VPBasicBlock();
-    VPBasicBlock *VPBB3 = new VPBasicBlock();
-    VPBasicBlock *VPBB4 = new VPBasicBlock();
+    VPlan &Plan = getPlan();
+    VPBasicBlock *VPBB1 = Plan.getEntry();
+    VPBasicBlock *VPBB2 = Plan.createVPBasicBlock("");
+    VPBasicBlock *VPBB3 = Plan.createVPBasicBlock("");
+    VPBasicBlock *VPBB4 = Plan.createVPBasicBlock("");
 
     //     VPBB1
     //     /   \
@@ -256,11 +258,7 @@ TEST(VPBasicBlockTest, getPlan) {
     VPBlockUtils::connectBlocks(VPBB1, VPBB3);
     VPBlockUtils::connectBlocks(VPBB2, VPBB4);
     VPBlockUtils::connectBlocks(VPBB3, VPBB4);
-
-    VPIRBasicBlock *ScalarHeaderVPBB = new VPIRBasicBlock(ScalarHeader);
-    VPBlockUtils::connectBlocks(VPBB4, ScalarHeaderVPBB);
-    VPBlockUtils::connectBlocks(VPPH, VPBB1);
-    VPlan Plan(VPPH, ScalarHeaderVPBB);
+    VPBlockUtils::connectBlocks(VPBB4, Plan.getScalarHeader());
 
     EXPECT_EQ(&Plan, VPBB1->getPlan());
     EXPECT_EQ(&Plan, VPBB2->getPlan());
@@ -269,20 +267,17 @@ TEST(VPBasicBlockTest, getPlan) {
   }
 
   {
-    VPBasicBlock *VPPH = new VPBasicBlock("ph");
+    VPlan &Plan = getPlan();
+    VPBasicBlock *VPBB1 = Plan.getEntry();
     // VPBasicBlock is the entry into the VPlan, followed by a region.
-    VPBasicBlock *R1BB1 = new VPBasicBlock();
-    VPBasicBlock *R1BB2 = new VPBasicBlock();
-    VPRegionBlock *R1 = new VPRegionBlock(R1BB1, R1BB2, "R1");
+    VPBasicBlock *R1BB1 = Plan.createVPBasicBlock("");
+    VPBasicBlock *R1BB2 = Plan.createVPBasicBlock("");
+    VPRegionBlock *R1 = Plan.createVPRegionBlock(R1BB1, R1BB2, "R1");
     VPBlockUtils::connectBlocks(R1BB1, R1BB2);
 
-    VPBasicBlock *VPBB1 = new VPBasicBlock();
     VPBlockUtils::connectBlocks(VPBB1, R1);
 
-    VPIRBasicBlock *ScalarHeaderVPBB = new VPIRBasicBlock(ScalarHeader);
-    VPBlockUtils::connectBlocks(R1, ScalarHeaderVPBB);
-    VPBlockUtils::connectBlocks(VPPH, VPBB1);
-    VPlan Plan(VPPH, ScalarHeaderVPBB);
+    VPBlockUtils::connectBlocks(R1, Plan.getScalarHeader());
 
     EXPECT_EQ(&Plan, VPBB1->getPlan());
     EXPECT_EQ(&Plan, R1->getPlan());
@@ -291,30 +286,26 @@ TEST(VPBasicBlockTest, getPlan) {
   }
 
   {
-    VPBasicBlock *VPPH = new VPBasicBlock("ph");
-
-    VPBasicBlock *R1BB1 = new VPBasicBlock();
-    VPBasicBlock *R1BB2 = new VPBasicBlock();
-    VPRegionBlock *R1 = new VPRegionBlock(R1BB1, R1BB2, "R1");
+    VPlan &Plan = getPlan();
+    VPBasicBlock *R1BB1 = Plan.createVPBasicBlock("");
+    VPBasicBlock *R1BB2 = Plan.createVPBasicBlock("");
+    VPRegionBlock *R1 = Plan.createVPRegionBlock(R1BB1, R1BB2, "R1");
     VPBlockUtils::connectBlocks(R1BB1, R1BB2);
 
-    VPBasicBlock *R2BB1 = new VPBasicBlock();
-    VPBasicBlock *R2BB2 = new VPBasicBlock();
-    VPRegionBlock *R2 = new VPRegionBlock(R2BB1, R2BB2, "R2");
+    VPBasicBlock *R2BB1 = Plan.createVPBasicBlock("");
+    VPBasicBlock *R2BB2 = Plan.createVPBasicBlock("");
+    VPRegionBlock *R2 = Plan.createVPRegionBlock(R2BB1, R2BB2, "R2");
     VPBlockUtils::connectBlocks(R2BB1, R2BB2);
 
-    VPBasicBlock *VPBB1 = new VPBasicBlock();
+    VPBasicBlock *VPBB1 = Plan.getEntry();
     VPBlockUtils::connectBlocks(VPBB1, R1);
     VPBlockUtils::connectBlocks(VPBB1, R2);
 
-    VPBasicBlock *VPBB2 = new VPBasicBlock();
+    VPBasicBlock *VPBB2 = Plan.createVPBasicBlock("");
     VPBlockUtils::connectBlocks(R1, VPBB2);
     VPBlockUtils::connectBlocks(R2, VPBB2);
 
-    VPIRBasicBlock *ScalarHeaderVPBB = new VPIRBasicBlock(ScalarHeader);
-    VPBlockUtils::connectBlocks(R2, ScalarHeaderVPBB);
-    VPBlockUtils::connectBlocks(VPPH, VPBB1);
-    VPlan Plan(VPPH, ScalarHeaderVPBB);
+    VPBlockUtils::connectBlocks(R2, Plan.getScalarHeader());
 
     EXPECT_EQ(&Plan, VPBB1->getPlan());
     EXPECT_EQ(&Plan, R1->getPlan());
@@ -325,12 +316,9 @@ TEST(VPBasicBlockTest, getPlan) {
     EXPECT_EQ(&Plan, R2BB2->getPlan());
     EXPECT_EQ(&Plan, VPBB2->getPlan());
   }
-  delete ScalarHeader;
 }
 
-TEST(VPBasicBlockTest, TraversingIteratorTest) {
-  LLVMContext C;
-  auto *ScalarHeader = BasicBlock::Create(C, "");
+TEST_F(VPBasicBlockTest, TraversingIteratorTest) {
   {
     // VPBasicBlocks only
     //     VPBB1
@@ -339,11 +327,11 @@ TEST(VPBasicBlockTest, TraversingIteratorTest) {
     //    \    /
     //    VPBB4
     //
-    VPBasicBlock *VPPH = new VPBasicBlock("ph");
-    VPBasicBlock *VPBB1 = new VPBasicBlock();
-    VPBasicBlock *VPBB2 = new VPBasicBlock();
-    VPBasicBlock *VPBB3 = new VPBasicBlock();
-    VPBasicBlock *VPBB4 = new VPBasicBlock();
+    VPlan &Plan = getPlan();
+    VPBasicBlock *VPBB1 = Plan.getEntry();
+    VPBasicBlock *VPBB2 = Plan.createVPBasicBlock("");
+    VPBasicBlock *VPBB3 = Plan.createVPBasicBlock("");
+    VPBasicBlock *VPBB4 = Plan.createVPBasicBlock("");
 
     VPBlockUtils::connectBlocks(VPBB1, VPBB2);
     VPBlockUtils::connectBlocks(VPBB1, VPBB3);
@@ -356,11 +344,7 @@ TEST(VPBasicBlockTest, TraversingIteratorTest) {
     EXPECT_EQ(VPBB1, FromIterator[0]);
     EXPECT_EQ(VPBB2, FromIterator[1]);
 
-    // Use Plan to properly clean up created blocks.
-    VPIRBasicBlock *ScalarHeaderVPBB = new VPIRBasicBlock(ScalarHeader);
-    VPBlockUtils::connectBlocks(VPBB4, ScalarHeaderVPBB);
-    VPBlockUtils::connectBlocks(VPPH, VPBB1);
-    VPlan Plan(VPPH, ScalarHeaderVPBB);
+    VPBlockUtils::connectBlocks(VPBB4, Plan.getScalarHeader());
   }
 
   {
@@ -382,13 +366,13 @@ TEST(VPBasicBlockTest, TraversingIteratorTest) {
     //      |
     //    R2BB2
     //
-    VPBasicBlock *VPPH = new VPBasicBlock("ph");
-    VPBasicBlock *VPBB0 = new VPBasicBlock("VPBB0");
-    VPBasicBlock *R1BB1 = new VPBasicBlock();
-    VPBasicBlock *R1BB2 = new VPBasicBlock();
-    VPBasicBlock *R1BB3 = new VPBasicBlock();
-    VPBasicBlock *R1BB4 = new VPBasicBlock();
-    VPRegionBlock *R1 = new VPRegionBlock(R1BB1, R1BB4, "R1");
+    VPlan &Plan = getPlan();
+    VPBasicBlock *VPBB0 = Plan.getEntry();
+    VPBasicBlock *R1BB1 = Plan.createVPBasicBlock("");
+    VPBasicBlock *R1BB2 = Plan.createVPBasicBlock("");
+    VPBasicBlock *R1BB3 = Plan.createVPBasicBlock("");
+    VPBasicBlock *R1BB4 = Plan.createVPBasicBlock("");
+    VPRegionBlock *R1 = Plan.createVPRegionBlock(R1BB1, R1BB4, "R1");
     R1BB2->setParent(R1);
     R1BB3->setParent(R1);
     VPBlockUtils::connectBlocks(VPBB0, R1);
@@ -399,9 +383,9 @@ TEST(VPBasicBlockTest, TraversingIteratorTest) {
     // Cycle.
     VPBlockUtils::connectBlocks(R1BB3, R1BB3);
 
-    VPBasicBlock *R2BB1 = new VPBasicBlock();
-    VPBasicBlock *R2BB2 = new VPBasicBlock();
-    VPRegionBlock *R2 = new VPRegionBlock(R2BB1, R2BB2, "R2");
+    VPBasicBlock *R2BB1 = Plan.createVPBasicBlock("");
+    VPBasicBlock *R2BB2 = Plan.createVPBasicBlock("");
+    VPRegionBlock *R2 = Plan.createVPRegionBlock(R2BB1, R2BB2, "R2");
     VPBlockUtils::connectBlocks(R2BB1, R2BB2);
     VPBlockUtils::connectBlocks(R1, R2);
 
@@ -458,11 +442,7 @@ TEST(VPBasicBlockTest, TraversingIteratorTest) {
     EXPECT_EQ(R1BB1, FromIterator[6]);
     EXPECT_EQ(R1, FromIterator[7]);
 
-    // Use Plan to properly clean up created blocks.
-    VPIRBasicBlock *ScalarHeaderVPBB = new VPIRBasicBlock(ScalarHeader);
-    VPBlockUtils::connectBlocks(R2, ScalarHeaderVPBB);
-    VPBlockUtils::connectBlocks(VPPH, VPBB0);
-    VPlan Plan(VPPH, ScalarHeaderVPBB);
+    VPBlockUtils::connectBlocks(R2, Plan.getScalarHeader());
   }
 
   {
@@ -486,16 +466,16 @@ TEST(VPBasicBlockTest, TraversingIteratorTest) {
     //   |
     //  VPBB2
     //
-    VPBasicBlock *VPPH = new VPBasicBlock("ph");
-    VPBasicBlock *R1BB1 = new VPBasicBlock("R1BB1");
-    VPBasicBlock *R1BB2 = new VPBasicBlock("R1BB2");
-    VPBasicBlock *R1BB3 = new VPBasicBlock("R1BB3");
-    VPRegionBlock *R1 = new VPRegionBlock(R1BB1, R1BB3, "R1");
-
-    VPBasicBlock *R2BB1 = new VPBasicBlock("R2BB1");
-    VPBasicBlock *R2BB2 = new VPBasicBlock("R2BB2");
-    VPBasicBlock *R2BB3 = new VPBasicBlock("R2BB3");
-    VPRegionBlock *R2 = new VPRegionBlock(R2BB1, R2BB3, "R2");
+    VPlan &Plan = getPlan();
+    VPBasicBlock *R1BB1 = Plan.createVPBasicBlock("R1BB1");
+    VPBasicBlock *R1BB2 = Plan.createVPBasicBlock("R1BB2");
+    VPBasicBlock *R1BB3 = Plan.createVPBasicBlock("R1BB3");
+    VPRegionBlock *R1 = Plan.createVPRegionBlock(R1BB1, R1BB3, "R1");
+
+    VPBasicBlock *R2BB1 = Plan.createVPBasicBlock("R2BB1");
+    VPBasicBlock *R2BB2 = Plan.createVPBasicBlock("R2BB2");
+    VPBasicBlock *R2BB3 = Plan.createVPBasicBlock("R2BB3");
+    VPRegionBlock *R2 = Plan.createVPRegionBlock(R2BB1, R2BB3, "R2");
     R2BB2->setParent(R2);
     VPBlockUtils::connectBlocks(R2BB1, R2BB2);
     VPBlockUtils::connectBlocks(R2BB2, R2BB1);
@@ -508,9 +488,9 @@ TEST(VPBasicBlockTest, TraversingIteratorTest) {
     VPBlockUtils::connectBlocks(R1BB2, R1BB3);
     VPBlockUtils::connectBlocks(R2, R1BB3);
 
-    VPBasicBlock *VPBB1 = new VPBasicBlock("VPBB1");
+    VPBasicBlock *VPBB1 = Plan.getEntry();
     VPBlockUtils::connectBlocks(VPBB1, R1);
-    VPBasicBlock *VPBB2 = new VPBasicBlock("VPBB2");
+    VPBasicBlock *VPBB2 = Plan.createVPBasicBlock("VPBB2");
     VPBlockUtils::connectBlocks(R1, VPBB2);
 
     // Depth-first.
@@ -543,11 +523,7 @@ TEST(VPBasicBlockTest, TraversingIteratorTest) {
     EXPECT_EQ(R1, FromIterator[8]);
     EXPECT_EQ(VPBB1, FromIterator[9]);
 
-    // Use Plan to properly clean up created blocks.
-    VPIRBasicBlock *ScalarHeaderVPBB = new VPIRBasicBlock(ScalarHeader);
-    VPBlockUtils::connectBlocks(VPBB2, ScalarHeaderVPBB);
-    VPBlockUtils::connectBlocks(VPPH, VPBB1);
-    VPlan Plan(VPPH, ScalarHeaderVPBB);
+    VPBlockUtils::connectBlocks(VPBB2, Plan.getScalarHeader());
   }
 
   {
@@ -561,16 +537,16 @@ TEST(VPBasicBlockTest, TraversingIteratorTest) {
     //      R2BB2
     //   }
     //
-    VPBasicBlock *VPPH = new VPBasicBlock("ph");
-    VPBasicBlock *R2BB1 = new VPBasicBlock("R2BB1");
-    VPBasicBlock *R2BB2 = new VPBasicBlock("R2BB2");
-    VPRegionBlock *R2 = new VPRegionBlock(R2BB1, R2BB2, "R2");
+    VPlan &Plan = getPlan();
+    VPBasicBlock *R2BB1 = Plan.createVPBasicBlock("R2BB1");
+    VPBasicBlock *R2BB2 = Plan.createVPBasicBlock("R2BB2");
+    VPRegionBlock *R2 = Plan.createVPRegionBlock(R2BB1, R2BB2, "R2");
     VPBlockUtils::connectBlocks(R2BB1, R2BB2);
 
-    VPRegionBlock *R1 = new VPRegionBlock(R2, R2, "R1");
+    VPRegionBlock *R1 = Plan.createVPRegionBlock(R2, R2, "R1");
     R2->setParent(R1);
 
-    VPBasicBlock *VPBB1 = new VPBasicBlock("VPBB1");
+    VPBasicBlock *VPBB1 = Plan.getEntry();
     VPBlockUtils::connectBlocks(VPBB1, R1);
 
     // Depth-first.
@@ -593,11 +569,7 @@ TEST(VPBasicBlockTest, TraversingIteratorTest) {
     EXPECT_EQ(R1, FromIterator[3]);
     EXPECT_EQ(VPBB1, FromIterator[4]);
 
-    // Use Plan to properly clean up created blocks.
-    VPIRBasicBlock *ScalarHeaderVPBB = new VPIRBasicBlock(ScalarHeader);
-    VPBlockUtils::connectBlocks(R1, ScalarHeaderVPBB);
-    VPBlockUtils::connectBlocks(VPPH, VPBB1);
-    VPlan Plan(VPPH, ScalarHeaderVPBB);
+    VPBlockUtils::connectBlocks(R1, Plan.getScalarHeader());
   }
 
   {
@@ -619,20 +591,20 @@ TEST(VPBasicBlockTest, TraversingIteratorTest) {
     //   |
     //  VPBB2
     //
-    VPBasicBlock *VPPH = new VPBasicBlock("ph");
-    VPBasicBlock *R3BB1 = new VPBasicBlock("R3BB1");
-    VPRegionBlock *R3 = new VPRegionBlock(R3BB1, R3BB1, "R3");
+    VPlan &Plan = getPlan();
+    VPBasicBlock *R3BB1 = Plan.createVPBasicBlock("R3BB1");
+    VPRegionBlock *R3 = Plan.createVPRegionBlock(R3BB1, R3BB1, "R3");
 
-    VPBasicBlock *R2BB1 = new VPBasicBlock("R2BB1");
-    VPRegionBlock *R2 = new VPRegionBlock(R2BB1, R3, "R2");
+    VPBasicBlock *R2BB1 = Plan.createVPBasicBlock("R2BB1");
+    VPRegionBlock *R2 = Plan.createVPRegionBlock(R2BB1, R3, "R2");
     R3->setParent(R2);
     VPBlockUtils::connectBlocks(R2BB1, R3);
 
-    VPRegionBlock *R1 = new VPRegionBlock(R2, R2, "R1");
+    VPRegionBlock *R1 = Plan.createVPRegionBlock(R2, R2, "R1");
     R2->setParent(R1);
 
-    VPBasicBlock *VPBB1 = new VPBasicBlock("VPBB1");
-    VPBasicBlock *VPBB2 = new VPBasicBlock("VPBB2");
+    VPBasicBlock *VPBB1 = Plan.getEntry();
+    VPBasicBlock *VPBB2 = Plan.createVPBasicBlock("VPBB2");
     VPBlockUtils::connectBlocks(VPBB1, R1);
     VPBlockUtils::connectBlocks(R1, VPBB2);
 
@@ -687,26 +659,22 @@ TEST(VPBasicBlockTest, TraversingIteratorTest) {
     EXPECT_EQ(R2BB1, FromIterator[2]);
     EXPECT_EQ(VPBB1, FromIterator[3]);
 
-    // Use Plan to properly clean up created blocks.
-    VPIRBasicBlock *ScalarHeaderVPBB = new VPIRBasicBlock(ScalarHeader);
-    VPBlockUtils::connectBlocks(VPBB2, ScalarHeaderVPBB);
-    VPBlockUtils::connectBlocks(VPPH, VPBB1);
-    VPlan Plan(VPPH, ScalarHeaderVPBB);
+    VPBlockUtils::connectBlocks(VPBB2, Plan.getScalarHeader());
   }
-  delete ScalarHeader;
 }
 
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
-TEST(VPBasicBlockTest, print) {
+TEST_F(VPBasicBlockTest, print) {
   VPInstruction *TC = new VPInstruction(Instruction::Add, {});
-  VPBasicBlock *VPBB0 = new VPBasicBlock("preheader");
+  VPlan &Plan = getPlan(TC);
+  VPBasicBlock *VPBB0 = Plan.getEntry();
   VPBB0->appendRecipe(TC);
 
   VPInstruction *I1 = new VPInstruction(Instruction::Add, {});
   VPInstruction *I2 = new VPInstruction(Instruction::Sub, {I1});
   VPInstruction *I3 = new VPInstruction(Instruction::Br, {I1, I2});
 
-  VPBasicBlock *VPBB1 = new VPBasicBlock();
+  VPBasicBlock *VPBB1 = Plan.createVPBasicBlock("");
   VPBB1->appendRecipe(I1);
   VPBB1->appendRecipe(I2);
   VPBB1->appendRecipe(I3);
@@ -714,7 +682,7 @@ TEST(VPBasicBlockTest, print) {
 
   VPInstruction *I4 = new VPInstruction(Instruction::Mul, {I2, I1});
   VPInstruction *I5 = new VPInstruction(Instruction::Ret, {I4});
-  VPBasicBlock *VPBB2 = new VPBasicBlock();
+  VPBasicBlock *VPBB2 = Plan.createVPBasicBlock("");
   VPBB2->appendRecipe(I4);
   VPBB2->appendRecipe(I5);
   VPBB2->setName("bb2");
@@ -730,12 +698,8 @@ TEST(VPBasicBlockTest, print) {
     EXPECT_EQ("EMIT br <badref>, <badref>", I3Dump);
   }
 
-  LLVMContext C;
-  auto *ScalarHeader = BasicBlock::Create(C, "scalar.header");
-  auto * ScalarHeaderVPBB = new VPIRBasicBlock(ScalarHeader);
-  VPBlockUtils::connectBlocks(VPBB2, ScalarHeaderVPBB);
+  VPBlockUtils::connectBlocks(VPBB2, Plan.getScalarHeader());
   VPBlockUtils::connectBlocks(VPBB0, VPBB1);
-  VPlan Plan(VPBB0, TC, ScalarHeaderVPBB);
   std::string FullDump;
   raw_string_ostream OS(FullDump);
   Plan.printDOT(OS);
@@ -810,26 +774,21 @@ Successor(s): ir-bb<scalar.header>
     OS << *I4;
     EXPECT_EQ("EMIT vp<%5> = mul vp<%3>, vp<%2>", I4Dump);
   }
-  delete ScalarHeader;
 }
 
-TEST(VPBasicBlockTest, printPlanWithVFsAndUFs) {
-
+TEST_F(VPBasicBlockTest, printPlanWithVFsAndUFs) {
   VPInstruction *TC = new VPInstruction(Instruction::Sub, {});
-  VPBasicBlock *VPBB0 = new VPBasicBlock("preheader");
+  VPlan &Plan = getPlan(TC);
+  VPBasicBlock *VPBB0 = Plan.getEntry();
   VPBB0->appendRecipe(TC);
 
   VPInstruction *I1 = new VPInstruction(Instruction::Add, {});
-  VPBasicBlock *VPBB1 = new VPBasicBlock();
+  VPBasicBlock *VPBB1 = Plan.createVPBasicBlock("");
   VPBB1->appendRecipe(I1);
   VPBB1->setName("bb1");
 
-  LLVMContext C;
-  auto *ScalarHeader = BasicBlock::Create(C, "");
-  VPIRBasicBlock *ScalarHeaderVPBB = new VPIRBasicBlock(ScalarHeader);
-  VPBlockUtils::connectBlocks(VPBB1, ScalarHeaderVPBB);
+  VPBlockUtils::connectBlocks(VPBB1, Plan.getScalarHeader());
   VPBlockUtils::connectBlocks(VPBB0, VPBB1);
-  VPlan Plan(VPBB0, TC, ScalarHeaderVPBB);
   Plan.setName("TestPlan");
   Plan.addVF(ElementCount::getFixed(4));
 
@@ -847,9 +806,9 @@ Successor(s): bb1
 
 bb1:
   EMIT vp<%2> = add
-Successor(s): ir-bb<>
+Successor(s): ir-bb<scalar.header>
 
-ir-bb<>:
+ir-bb<scalar.header>:
 No successors
 }
 )";
@@ -871,9 +830,9 @@ Successor(s): bb1
 
 bb1:
   EMIT vp<%2> = add
-Successor(s): ir-bb<>
+Successor(s): ir-bb<scalar.header>
 
-ir-bb<>:
+ir-bb<scalar.header>:
 No successors
 }
 )";
@@ -895,19 +854,19 @@ Successor(s): bb1
 
 bb1:
   EMIT vp<%2> = add
-Successor(s): ir-bb<>
+Successor(s): ir-bb<scalar.header>
 
-ir-bb<>:
+ir-bb<scalar.header>:
 No successors
 }
 )";
     EXPECT_EQ(ExpectedStr, FullDump);
   }
-  delete ScalarHeader;
 }
 #endif
 
-TEST(VPRecipeTest, CastVPInstructionToVPUser) {
+using VPRecipeTest = VPlanTestBase;
+TEST_F(VPRecipeTest, CastVPInstructionToVPUser) {
   VPValue Op1;
   VPValue Op2;
   VPInstruction Recipe(Instruction::Add, {&Op1, &Op2});
@@ -917,9 +876,7 @@ TEST(VPRecipeTest, CastVPInstructionToVPUser) {
   EXPECT_EQ(&Recipe, BaseR);
 }
 
-TEST(VPRecipeTest, CastVPWidenRecipeToVPUser) {
-  LLVMContext C;
-
+TEST_F(VPRecipeTest, CastVPWidenRecipeToVPUser) {
   IntegerType *Int32 = IntegerType::get(C, 32);
   auto *AI = BinaryOperator::CreateAdd(PoisonValue::get(Int32),
                                        PoisonValue::get(Int32));
@@ -936,9 +893,7 @@ TEST(VPRecipeTest, CastVPWidenRecipeToVPUser) {
   delete AI;
 }
 
-TEST(VPRecipeTest, CastVPWidenCallRecipeToVPUserAndVPDef) {
-  LLVMContext C;
-
+TEST_F(VPRecipeTest, CastVPWidenCallRecipeToVPUserAndVPDef) {
   IntegerType *Int32 = IntegerType::get(C, 32);
   FunctionType *FTy = FunctionType::get(Int32, false);
   Function *Fn = Function::Create(FTy, GlobalValue::ExternalLinkage, 0);
@@ -964,9 +919,7 @@ TEST(VPRecipeTest, CastVPWidenCallRecipeToVPUserAndVPDef) {
   delete Fn;
 }
 
-TEST(VPRecipeTest, CastVPWidenSelectRecipeToVPUserAndVPDef) {
-  LLVMContext C;
-
+TEST_F(VPRecipeTest, CastVPWidenSelectRecipeToVPUserAndVPDef) {
   IntegerType *Int1 = IntegerType::get(C, 1);
   IntegerType *Int32 = IntegerType::get(C, 32);
   auto *SelectI = SelectInst::Create(
@@ -992,9 +945,7 @@ TEST(VPRecipeTest, CastVPWidenSelectRecipeToVPUserAndVPDef) {
   delete SelectI;
 }
 
-TEST(VPRecipeTest, CastVPWidenGEPRecipeToVPUserAndVPDef) {
-  LLVMContext C;
-
+TEST_F(VPRecipeTest, CastVPWidenGEPRecipeToVPUserAndVPDef) {
   IntegerType *Int32 = IntegerType::get(C, 32);
   PointerType *Int32Ptr = PointerType::get(Int32, 0);
   auto *GEP = GetElementPtrInst::Create(Int32, PoisonValue::get(Int32Ptr),
@@ -1017,9 +968,7 @@ TEST(VPRecipeTest, CastVPWidenGEPRecipeToVPUserAndVPDef) {
   delete GEP;
 }
 
-TEST(VPRecipeTest, CastVPBlendRecipeToVPUser) {
-  LLVMContext C;
-
+TEST_F(VPRecipeTest, CastVPBlendRecipeToVPUser) {
   IntegerType *Int32 = IntegerType::get(C, 32);
   auto *Phi = PHINode::Create(Int32, 1);
   VPValue I1;
@@ -1036,9 +985,7 @@ TEST(VPRecipeTest, CastVPBlendRecipeToVPUser) {
   delete Phi;
 }
 
-TEST(VPRecipeTest, CastVPInterleaveRecipeToVPUser) {
-  LLVMContext C;
-
+TEST_F(VPRecipeTest, CastVPInterleaveRecipeToVPUser) {
   VPValue Addr;
   VPValue Mask;
   InterleaveGroup<Instruction> IG(4, false, Align(4));
@@ -1049,9 +996,7 @@ TEST(VPRecipeTest, CastVPInterleaveRecipeToVPUser) {
   EXPECT_EQ(&Recipe, BaseR);
 }
 
-TEST(VPRecipeTest, CastVPReplicateRecipeToVPUser) {
-  LLVMContext C;
-
+TEST_F(VPRecipeTest, CastVPReplicateRecipeToVPUser) {
   VPValue Op1;
   VPValue Op2;
   SmallVector<VPValue *, 4> Args;
@@ -1068,9 +1013,7 @@ TEST(VPRecipeTest, CastVPReplicateRecipeToVPUser) {
   delete Call;
 }
 
-TEST(VPRecipeTest, CastVPBranchOnMaskRecipeToVPUser) {
-  LLVMContext C;
-
+TEST_F(VPRecipeTest, CastVPBranchOnMaskRecipeToVPUser) {
   VPValue Mask;
   VPBranchOnMaskRecipe Recipe(&Mask);
   EXPECT_TRUE(isa<VPUser>(&Recipe));
@@ -1079,9 +1022,7 @@ TEST(VPRecipeTest, CastVPBranchOnMaskRecipeToVPUser) {
   EXPECT_EQ(&Recipe, BaseR);
 }
 
-TEST(VPRecipeTest, CastVPWidenMemoryRecipeToVPUserAndVPDef) {
-  LLVMContext C;
-
+TEST_F(VPRecipeTest, CastVPWidenMemoryRecipeToVPUserAndVPDef) {
   IntegerType *Int32 = IntegerType::get(C, 32);
   PointerType *Int32Ptr = PointerType::get(Int32, 0);
   auto *Load =
@@ -1101,8 +1042,7 @@ TEST(VPRecipeTest, CastVPWidenMemoryRecipeToVPUserAndVPDef) {
   delete Load;
 }
 
-TEST(VPRecipeTest, MayHaveSideEffectsAndMayReadWriteMemory) {
-  LLVMContext C;
+TEST_F(VPRecipeTest, MayHaveSideEffectsAndMayReadWriteMemory) {
   IntegerType *Int1 = IntegerType::get(C, 1);
   IntegerType *Int32 = IntegerType::get(C, 32);
   PointerType *Int32Ptr = PointerType::get(Int32, 0);
@@ -1242,7 +1182,6 @@ TEST(VPRecipeTest, MayHaveSideEffectsAndMayReadWriteMemory) {
 
   {
     // Test for a call to a function without side-effects.
-    LLVMContext C;
     Module M("", C);
     Function *TheFn =
         Intrinsic::getOrInsertDeclaration(&M, Intrinsic::thread_pointer);
@@ -1281,9 +1220,9 @@ TEST(VPRecipeTest, MayHaveSideEffectsAndMayReadWriteMemory) {
     VPInstruction VPInst(Instruction::Add, {&Op1, &Op2});
     VPRecipeBase &Recipe = VPInst;
     EXPECT_FALSE(Recipe.mayHaveSideEffects());
-    EXPECT_TRUE(Recipe.mayReadFromMemory());
+    EXPECT_FALSE(Recipe.mayReadFromMemory());
     EXPECT_FALSE(Recipe.mayWriteToMemory());
-    EXPECT_TRUE(Recipe.mayReadOrWriteMemory());
+    EXPECT_FALSE(Recipe.mayReadOrWriteMemory());
   }
   {
     VPValue Op1;
@@ -1296,15 +1235,12 @@ TEST(VPRecipeTest, MayHaveSideEffectsAndMayReadWriteMemory) {
 }
 
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
-TEST(VPRecipeTest, dumpRecipeInPlan) {
-  VPBasicBlock *VPBB0 = new VPBasicBlock("preheader");
-  VPBasicBlock *VPBB1 = new VPBasicBlock();
-  LLVMContext C;
-  auto *ScalarHeader = BasicBlock::Create(C, "");
-  VPIRBasicBlock *ScalarHeaderVPBB = new VPIRBasicBlock(ScalarHeader);
-  VPBlockUtils::connectBlocks(VPBB1, ScalarHeaderVPBB);
+TEST_F(VPRecipeTest, dumpRecipeInPlan) {
+  VPlan &Plan = getPlan();
+  VPBasicBlock *VPBB0 = Plan.getEntry();
+  VPBasicBlock *VPBB1 = Plan.createVPBasicBlock("");
+  VPBlockUtils::connectBlocks(VPBB1, Plan.getScalarHeader());
   VPBlockUtils::connectBlocks(VPBB0, VPBB1);
-  VPlan Plan(VPBB0, ScalarHeaderVPBB);
 
   IntegerType *Int32 = IntegerType::get(C, 32);
   auto *AI = BinaryOperator::CreateAdd(PoisonValue::get(Int32),
@@ -1366,18 +1302,14 @@ TEST(VPRecipeTest, dumpRecipeInPlan) {
   }
 
   delete AI;
-  delete ScalarHeader;
 }
 
-TEST(VPRecipeTest, dumpRecipeUnnamedVPValuesInPlan) {
-  VPBasicBlock *VPBB0 = new VPBasicBlock("preheader");
-  VPBasicBlock *VPBB1 = new VPBasicBlock();
-  LLVMContext C;
-  auto *ScalarHeader = BasicBlock::Create(C, "");
-  VPIRBasicBlock *ScalarHeaderVPBB = new VPIRBasicBlock(ScalarHeader);
-  VPBlockUtils::connectBlocks(VPBB1, ScalarHeaderVPBB);
+TEST_F(VPRecipeTest, dumpRecipeUnnamedVPValuesInPlan) {
+  VPlan &Plan = getPlan();
+  VPBasicBlock *VPBB0 = Plan.getEntry();
+  VPBasicBlock *VPBB1 = Plan.createVPBasicBlock("");
+  VPBlockUtils::connectBlocks(VPBB1, Plan.getScalarHeader());
   VPBlockUtils::connectBlocks(VPBB0, VPBB1);
-  VPlan Plan(VPBB0, ScalarHeaderVPBB);
 
   IntegerType *Int32 = IntegerType::get(C, 32);
   auto *AI = BinaryOperator::CreateAdd(PoisonValue::get(Int32),
@@ -1456,11 +1388,9 @@ TEST(VPRecipeTest, dumpRecipeUnnamedVPValuesInPlan) {
         testing::ExitedWithCode(0), "EMIT vp<%2> = mul vp<%1>, vp<%1>");
   }
   delete AI;
-  delete ScalarHeader;
 }
 
-TEST(VPRecipeTest, dumpRecipeUnnamedVPValuesNotInPlanOrBlock) {
-  LLVMContext C;
+TEST_F(VPRecipeTest, dumpRecipeUnnamedVPValuesNotInPlanOrBlock) {
   IntegerType *Int32 = IntegerType::get(C, 32);
   auto *AI = BinaryOperator::CreateAdd(PoisonValue::get(Int32),
                                        PoisonValue::get(Int32));
@@ -1543,9 +1473,7 @@ TEST(VPRecipeTest, dumpRecipeUnnamedVPValuesNotInPlanOrBlock) {
 
 #endif
 
-TEST(VPRecipeTest, CastVPReductionRecipeToVPUser) {
-  LLVMContext C;
-
+TEST_F(VPRecipeTest, CastVPReductionRecipeToVPUser) {
   VPValue ChainOp;
   VPValue VecOp;
   VPValue CondOp;
@@ -1556,9 +1484,7 @@ TEST(VPRecipeTest, CastVPReductionRecipeToVPUser) {
   EXPECT_TRUE(isa<VPUser>(BaseR));
 }
 
-TEST(VPRecipeTest, CastVPReductionEVLRecipeToVPUser) {
-  LLVMContext C;
-
+TEST_F(VPRecipeTest, CastVPReductionEVLRecipeToVPUser) {
   VPValue ChainOp;
   VPValue VecOp;
   VPValue CondOp;
@@ -1630,7 +1556,7 @@ TEST(VPDoubleValueDefTest, traverseUseLists) {
   EXPECT_EQ(&DoubleValueDef, I3.getOperand(0)->getDefiningRecipe());
 }
 
-TEST(VPRecipeTest, CastToVPSingleDefRecipe) {
+TEST_F(VPRecipeTest, CastToVPSingleDefRecipe) {
   VPValue Start;
   VPEVLBasedIVPHIRecipe R(&Start, {});
   VPRecipeBase *B = &R;
diff --git a/llvm/unittests/Transforms/Vectorize/VPlanTestBase.h b/llvm/unittests/Transforms/Vectorize/VPlanTestBase.h
index 06e091d..1836a5e 100644
--- a/llvm/unittests/Transforms/Vectorize/VPlanTestBase.h
+++ b/llvm/unittests/Transforms/Vectorize/VPlanTestBase.h
@@ -28,7 +28,7 @@ namespace llvm {
 
 /// Helper class to create a module from an assembly string and VPlans for a
 /// given loop entry block.
-class VPlanTestBase : public testing::Test {
+class VPlanTestIRBase : public testing::Test {
 protected:
   TargetLibraryInfoImpl TLII;
   TargetLibraryInfo TLI;
@@ -41,7 +41,7 @@ protected:
   std::unique_ptr<AssumptionCache> AC;
   std::unique_ptr<ScalarEvolution> SE;
 
-  VPlanTestBase()
+  VPlanTestIRBase()
       : TLII(), TLI(TLII),
         DL("e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-"
            "f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:"
@@ -92,6 +92,22 @@ protected:
   }
 };
 
+class VPlanTestBase : public testing::Test {
+protected:
+  LLVMContext C;
+  std::unique_ptr<BasicBlock> ScalarHeader;
+  SmallVector<std::unique_ptr<VPlan>> Plans;
+
+  VPlanTestBase() : ScalarHeader(BasicBlock::Create(C, "scalar.header")) {
+    BranchInst::Create(&*ScalarHeader, &*ScalarHeader);
+  }
+
+  VPlan &getPlan(VPValue *TC = nullptr) {
+    Plans.push_back(std::make_unique<VPlan>(&*ScalarHeader, TC));
+    return *Plans.back();
+  }
+};
+
 } // namespace llvm
 
 #endif // LLVM_UNITTESTS_TRANSFORMS_VECTORIZE_VPLANTESTBASE_H
diff --git a/llvm/unittests/Transforms/Vectorize/VPlanVerifierTest.cpp b/llvm/unittests/Transforms/Vectorize/VPlanVerifierTest.cpp
index 6448153..f098ba0 100644
--- a/llvm/unittests/Transforms/Vectorize/VPlanVerifierTest.cpp
+++ b/llvm/unittests/Transforms/Vectorize/VPlanVerifierTest.cpp
@@ -8,32 +8,29 @@
 
 #include "../lib/Transforms/Vectorize/VPlanVerifier.h"
 #include "../lib/Transforms/Vectorize/VPlan.h"
+#include "VPlanTestBase.h"
 #include "llvm/IR/Instruction.h"
 #include "llvm/IR/Instructions.h"
 #include "gtest/gtest.h"
 
 using namespace llvm;
 
+using VPVerifierTest = VPlanTestBase;
+
 namespace {
-TEST(VPVerifierTest, VPInstructionUseBeforeDefSameBB) {
+TEST_F(VPVerifierTest, VPInstructionUseBeforeDefSameBB) {
+  VPlan &Plan = getPlan();
   VPInstruction *DefI = new VPInstruction(Instruction::Add, {});
   VPInstruction *UseI = new VPInstruction(Instruction::Sub, {DefI});
 
-  VPBasicBlock *VPPH = new VPBasicBlock("ph");
-  VPBasicBlock *VPBB1 = new VPBasicBlock();
+  VPBasicBlock *VPBB1 = Plan.getEntry();
   VPBB1->appendRecipe(UseI);
   VPBB1->appendRecipe(DefI);
 
-  VPBasicBlock *VPBB2 = new VPBasicBlock();
-  VPRegionBlock *R1 = new VPRegionBlock(VPBB2, VPBB2, "R1");
+  VPBasicBlock *VPBB2 = Plan.createVPBasicBlock("");
+  VPRegionBlock *R1 = Plan.createVPRegionBlock(VPBB2, VPBB2, "R1");
   VPBlockUtils::connectBlocks(VPBB1, R1);
-
-  LLVMContext C;
-  auto *ScalarHeader = BasicBlock::Create(C, "");
-  VPIRBasicBlock *ScalarHeaderVPBB = new VPIRBasicBlock(ScalarHeader);
-  VPBlockUtils::connectBlocks(R1, ScalarHeaderVPBB);
-  VPBlockUtils::connectBlocks(VPPH, VPBB1);
-  VPlan Plan(VPPH, ScalarHeaderVPBB);
+  VPBlockUtils::connectBlocks(R1, Plan.getScalarHeader());
 
 #if GTEST_HAS_STREAM_REDIRECTION
   ::testing::internal::CaptureStderr();
@@ -43,34 +40,27 @@ TEST(VPVerifierTest, VPInstructionUseBeforeDefSameBB) {
   EXPECT_STREQ("Use before def!\n",
                ::testing::internal::GetCapturedStderr().c_str());
 #endif
-  delete ScalarHeader;
 }
 
-TEST(VPVerifierTest, VPInstructionUseBeforeDefDifferentBB) {
+TEST_F(VPVerifierTest, VPInstructionUseBeforeDefDifferentBB) {
+  VPlan &Plan = getPlan();
   VPInstruction *DefI = new VPInstruction(Instruction::Add, {});
   VPInstruction *UseI = new VPInstruction(Instruction::Sub, {DefI});
   auto *CanIV = new VPCanonicalIVPHIRecipe(UseI, {});
   VPInstruction *BranchOnCond =
       new VPInstruction(VPInstruction::BranchOnCond, {CanIV});
 
-  VPBasicBlock *VPPH = new VPBasicBlock("ph");
-  VPBasicBlock *VPBB1 = new VPBasicBlock();
-  VPBasicBlock *VPBB2 = new VPBasicBlock();
+  VPBasicBlock *VPBB1 = Plan.getEntry();
+  VPBasicBlock *VPBB2 = Plan.createVPBasicBlock("");
 
   VPBB1->appendRecipe(UseI);
   VPBB2->appendRecipe(CanIV);
   VPBB2->appendRecipe(DefI);
   VPBB2->appendRecipe(BranchOnCond);
 
-  VPRegionBlock *R1 = new VPRegionBlock(VPBB2, VPBB2, "R1");
+  VPRegionBlock *R1 = Plan.createVPRegionBlock(VPBB2, VPBB2, "R1");
   VPBlockUtils::connectBlocks(VPBB1, R1);
-
-  LLVMContext C;
-  auto *ScalarHeader = BasicBlock::Create(C, "");
-  VPIRBasicBlock *ScalarHeaderVPBB = new VPIRBasicBlock(ScalarHeader);
-  VPBlockUtils::connectBlocks(R1, ScalarHeaderVPBB);
-  VPBlockUtils::connectBlocks(VPPH, VPBB1);
-  VPlan Plan(VPPH, ScalarHeaderVPBB);
+  VPBlockUtils::connectBlocks(R1, Plan.getScalarHeader());
 
 #if GTEST_HAS_STREAM_REDIRECTION
   ::testing::internal::CaptureStderr();
@@ -80,11 +70,9 @@ TEST(VPVerifierTest, VPInstructionUseBeforeDefDifferentBB) {
   EXPECT_STREQ("Use before def!\n",
                ::testing::internal::GetCapturedStderr().c_str());
 #endif
-  delete ScalarHeader;
 }
 
-TEST(VPVerifierTest, VPBlendUseBeforeDefDifferentBB) {
-  LLVMContext C;
+TEST_F(VPVerifierTest, VPBlendUseBeforeDefDifferentBB) {
   IntegerType *Int32 = IntegerType::get(C, 32);
   auto *Phi = PHINode::Create(Int32, 1);
 
@@ -95,11 +83,11 @@ TEST(VPVerifierTest, VPBlendUseBeforeDefDifferentBB) {
       new VPInstruction(VPInstruction::BranchOnCond, {CanIV});
   auto *Blend = new VPBlendRecipe(Phi, {DefI});
 
-  VPBasicBlock *VPPH = new VPBasicBlock("ph");
-  VPBasicBlock *VPBB1 = new VPBasicBlock();
-  VPBasicBlock *VPBB2 = new VPBasicBlock();
-  VPBasicBlock *VPBB3 = new VPBasicBlock();
-  VPBasicBlock *VPBB4 = new VPBasicBlock();
+  VPlan &Plan = getPlan();
+  VPBasicBlock *VPBB1 = Plan.getEntry();
+  VPBasicBlock *VPBB2 = Plan.createVPBasicBlock("");
+  VPBasicBlock *VPBB3 = Plan.createVPBasicBlock("");
+  VPBasicBlock *VPBB4 = Plan.createVPBasicBlock("");
 
   VPBB1->appendRecipe(I1);
   VPBB2->appendRecipe(CanIV);
@@ -109,15 +97,11 @@ TEST(VPVerifierTest, VPBlendUseBeforeDefDifferentBB) {
 
   VPBlockUtils::connectBlocks(VPBB2, VPBB3);
   VPBlockUtils::connectBlocks(VPBB3, VPBB4);
-  VPRegionBlock *R1 = new VPRegionBlock(VPBB2, VPBB4, "R1");
+  VPRegionBlock *R1 = Plan.createVPRegionBlock(VPBB2, VPBB4, "R1");
   VPBlockUtils::connectBlocks(VPBB1, R1);
   VPBB3->setParent(R1);
 
-  auto *ScalarHeader = BasicBlock::Create(C, "");
-  VPIRBasicBlock *ScalarHeaderVPBB = new VPIRBasicBlock(ScalarHeader);
-  VPBlockUtils::connectBlocks(R1, ScalarHeaderVPBB);
-  VPBlockUtils::connectBlocks(VPPH, VPBB1);
-  VPlan Plan(VPPH, ScalarHeaderVPBB);
+  VPBlockUtils::connectBlocks(R1, Plan.getScalarHeader());
 
 #if GTEST_HAS_STREAM_REDIRECTION
   ::testing::internal::CaptureStderr();
@@ -129,10 +113,9 @@ TEST(VPVerifierTest, VPBlendUseBeforeDefDifferentBB) {
 #endif
 
   delete Phi;
-  delete ScalarHeader;
 }
 
-TEST(VPVerifierTest, DuplicateSuccessorsOutsideRegion) {
+TEST_F(VPVerifierTest, DuplicateSuccessorsOutsideRegion) {
   VPInstruction *I1 = new VPInstruction(Instruction::Add, {});
   auto *CanIV = new VPCanonicalIVPHIRecipe(I1, {});
   VPInstruction *BranchOnCond =
@@ -140,25 +123,20 @@ TEST(VPVerifierTest, DuplicateSuccessorsOutsideRegion) {
   VPInstruction *BranchOnCond2 =
       new VPInstruction(VPInstruction::BranchOnCond, {I1});
 
-  VPBasicBlock *VPPH = new VPBasicBlock("ph");
-  VPBasicBlock *VPBB1 = new VPBasicBlock();
-  VPBasicBlock *VPBB2 = new VPBasicBlock();
+  VPlan &Plan = getPlan();
+  VPBasicBlock *VPBB1 = Plan.getEntry();
+  VPBasicBlock *VPBB2 = Plan.createVPBasicBlock("");
 
   VPBB1->appendRecipe(I1);
   VPBB1->appendRecipe(BranchOnCond2);
   VPBB2->appendRecipe(CanIV);
   VPBB2->appendRecipe(BranchOnCond);
 
-  VPRegionBlock *R1 = new VPRegionBlock(VPBB2, VPBB2, "R1");
+  VPRegionBlock *R1 = Plan.createVPRegionBlock(VPBB2, VPBB2, "R1");
   VPBlockUtils::connectBlocks(VPBB1, R1);
   VPBlockUtils::connectBlocks(VPBB1, R1);
 
-  LLVMContext C;
-  auto *ScalarHeader = BasicBlock::Create(C, "");
-  VPIRBasicBlock *ScalarHeaderVPBB = new VPIRBasicBlock(ScalarHeader);
-  VPBlockUtils::connectBlocks(R1, ScalarHeaderVPBB);
-  VPBlockUtils::connectBlocks(VPPH, VPBB1);
-  VPlan Plan(VPPH, ScalarHeaderVPBB);
+  VPBlockUtils::connectBlocks(R1, Plan.getScalarHeader());
 
 #if GTEST_HAS_STREAM_REDIRECTION
   ::testing::internal::CaptureStderr();
@@ -168,10 +146,9 @@ TEST(VPVerifierTest, DuplicateSuccessorsOutsideRegion) {
   EXPECT_STREQ("Multiple instances of the same successor.\n",
                ::testing::internal::GetCapturedStderr().c_str());
 #endif
-  delete ScalarHeader;
 }
 
-TEST(VPVerifierTest, DuplicateSuccessorsInsideRegion) {
+TEST_F(VPVerifierTest, DuplicateSuccessorsInsideRegion) {
   VPInstruction *I1 = new VPInstruction(Instruction::Add, {});
   auto *CanIV = new VPCanonicalIVPHIRecipe(I1, {});
   VPInstruction *BranchOnCond =
@@ -179,10 +156,10 @@ TEST(VPVerifierTest, DuplicateSuccessorsInsideRegion) {
   VPInstruction *BranchOnCond2 =
       new VPInstruction(VPInstruction::BranchOnCond, {I1});
 
-  VPBasicBlock *VPPH = new VPBasicBlock("ph");
-  VPBasicBlock *VPBB1 = new VPBasicBlock();
-  VPBasicBlock *VPBB2 = new VPBasicBlock();
-  VPBasicBlock *VPBB3 = new VPBasicBlock();
+  VPlan &Plan = getPlan();
+  VPBasicBlock *VPBB1 = Plan.getEntry();
+  VPBasicBlock *VPBB2 = Plan.createVPBasicBlock("");
+  VPBasicBlock *VPBB3 = Plan.createVPBasicBlock("");
 
   VPBB1->appendRecipe(I1);
   VPBB2->appendRecipe(CanIV);
@@ -191,16 +168,11 @@ TEST(VPVerifierTest, DuplicateSuccessorsInsideRegion) {
 
   VPBlockUtils::connectBlocks(VPBB2, VPBB3);
   VPBlockUtils::connectBlocks(VPBB2, VPBB3);
-  VPRegionBlock *R1 = new VPRegionBlock(VPBB2, VPBB3, "R1");
+  VPRegionBlock *R1 = Plan.createVPRegionBlock(VPBB2, VPBB3, "R1");
   VPBlockUtils::connectBlocks(VPBB1, R1);
   VPBB3->setParent(R1);
 
-  LLVMContext C;
-  auto *ScalarHeader = BasicBlock::Create(C, "");
-  VPIRBasicBlock *ScalarHeaderVPBB = new VPIRBasicBlock(ScalarHeader);
-  VPBlockUtils::connectBlocks(R1, ScalarHeaderVPBB);
-  VPBlockUtils::connectBlocks(VPPH, VPBB1);
-  VPlan Plan(VPPH, ScalarHeaderVPBB);
+  VPBlockUtils::connectBlocks(R1, Plan.getScalarHeader());
 
 #if GTEST_HAS_STREAM_REDIRECTION
   ::testing::internal::CaptureStderr();
@@ -210,13 +182,12 @@ TEST(VPVerifierTest, DuplicateSuccessorsInsideRegion) {
   EXPECT_STREQ("Multiple instances of the same successor.\n",
                ::testing::internal::GetCapturedStderr().c_str());
 #endif
-  delete ScalarHeader;
 }
 
-TEST(VPVerifierTest, BlockOutsideRegionWithParent) {
-  VPBasicBlock *VPPH = new VPBasicBlock("ph");
-  VPBasicBlock *VPBB1 = new VPBasicBlock();
-  VPBasicBlock *VPBB2 = new VPBasicBlock();
+TEST_F(VPVerifierTest, BlockOutsideRegionWithParent) {
+  VPlan &Plan = getPlan();
+  VPBasicBlock *VPBB1 = Plan.getEntry();
+  VPBasicBlock *VPBB2 = Plan.createVPBasicBlock("");
 
   VPInstruction *DefI = new VPInstruction(Instruction::Add, {});
   VPInstruction *BranchOnCond =
@@ -225,15 +196,10 @@ TEST(VPVerifierTest, BlockOutsideRegionWithParent) {
   VPBB1->appendRecipe(DefI);
   VPBB2->appendRecipe(BranchOnCond);
 
-  VPRegionBlock *R1 = new VPRegionBlock(VPBB2, VPBB2, "R1");
+  VPRegionBlock *R1 = Plan.createVPRegionBlock(VPBB2, VPBB2, "R1");
   VPBlockUtils::connectBlocks(VPBB1, R1);
 
-  LLVMContext C;
-  auto *ScalarHeader = BasicBlock::Create(C, "");
-  VPIRBasicBlock *ScalarHeaderVPBB = new VPIRBasicBlock(ScalarHeader);
-  VPBlockUtils::connectBlocks(R1, ScalarHeaderVPBB);
-  VPBlockUtils::connectBlocks(VPPH, VPBB1);
-  VPlan Plan(VPPH, ScalarHeaderVPBB);
+  VPBlockUtils::connectBlocks(R1, Plan.getScalarHeader());
   VPBB1->setParent(R1);
 
 #if GTEST_HAS_STREAM_REDIRECTION
@@ -244,7 +210,6 @@ TEST(VPVerifierTest, BlockOutsideRegionWithParent) {
   EXPECT_STREQ("Predecessor is not in the same region.\n",
                ::testing::internal::GetCapturedStderr().c_str());
 #endif
-  delete ScalarHeader;
 }
 
 } // namespace
diff --git a/llvm/unittests/tools/llvm-exegesis/CMakeLists.txt b/llvm/unittests/tools/llvm-exegesis/CMakeLists.txt
index 3ee3a0d..735f17a 100644
--- a/llvm/unittests/tools/llvm-exegesis/CMakeLists.txt
+++ b/llvm/unittests/tools/llvm-exegesis/CMakeLists.txt
@@ -53,6 +53,9 @@ endif()
 if(LLVM_TARGETS_TO_BUILD MATCHES "Mips")
   include(Mips/CMakeLists.txt)
 endif()
+if(LLVM_TARGETS_TO_BUILD MATCHES "RISCV")
+  include(RISCV/CMakeLists.txt)
+endif()
 
 include_directories(${exegesis_includes})
 
diff --git a/llvm/unittests/tools/llvm-exegesis/RISCV/CMakeLists.txt b/llvm/unittests/tools/llvm-exegesis/RISCV/CMakeLists.txt
new file mode 100644
index 0000000..1984819
--- /dev/null
+++ b/llvm/unittests/tools/llvm-exegesis/RISCV/CMakeLists.txt
@@ -0,0 +1,21 @@
+add_llvm_exegesis_unittest_includes(
+  ${LLVM_MAIN_SRC_DIR}/lib/Target/RISCV
+  ${LLVM_BINARY_DIR}/lib/Target/RISCV
+  ${LLVM_MAIN_SRC_DIR}/tools/llvm-exegesis/lib
+  )
+
+add_llvm_exegesis_unittest_link_components(
+  MC
+  MCParser
+  Object
+  Support
+  Symbolize
+  RISCV
+  )
+
+add_llvm_exegesis_unittest_sources(
+  SnippetGeneratorTest.cpp
+  TargetTest.cpp
+  )
+add_llvm_exegesis_unittest_link_libraries(
+  LLVMExegesisRISCV)
diff --git a/llvm/unittests/tools/llvm-exegesis/RISCV/SnippetGeneratorTest.cpp b/llvm/unittests/tools/llvm-exegesis/RISCV/SnippetGeneratorTest.cpp
new file mode 100644
index 0000000..5920b79
--- /dev/null
+++ b/llvm/unittests/tools/llvm-exegesis/RISCV/SnippetGeneratorTest.cpp
@@ -0,0 +1,122 @@
+//===-- SnippetGeneratorTest.cpp --------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "../Common/AssemblerUtils.h"
+#include "LlvmState.h"
+#include "MCInstrDescView.h"
+#include "ParallelSnippetGenerator.h"
+#include "RISCVInstrInfo.h"
+#include "RegisterAliasing.h"
+#include "SerialSnippetGenerator.h"
+#include "TestBase.h"
+
+namespace llvm {
+namespace exegesis {
+namespace {
+
+using testing::AnyOf;
+using testing::ElementsAre;
+using testing::HasSubstr;
+using testing::SizeIs;
+
+MATCHER(IsInvalid, "") { return !arg.isValid(); }
+MATCHER(IsReg, "") { return arg.isReg(); }
+
+template <typename SnippetGeneratorT>
+class RISCVSnippetGeneratorTest : public RISCVTestBase {
+protected:
+  RISCVSnippetGeneratorTest() : Generator(State, SnippetGenerator::Options()) {}
+
+  std::vector<CodeTemplate> checkAndGetCodeTemplates(unsigned Opcode) {
+    randomGenerator().seed(0); // Initialize seed.
+    const Instruction &Instr = State.getIC().getInstr(Opcode);
+    auto CodeTemplateOrError = Generator.generateCodeTemplates(
+        &Instr, State.getRATC().emptyRegisters());
+    EXPECT_FALSE(CodeTemplateOrError.takeError()); // Valid configuration.
+    return std::move(CodeTemplateOrError.get());
+  }
+
+  SnippetGeneratorT Generator;
+};
+
+using RISCVSerialSnippetGeneratorTest =
+    RISCVSnippetGeneratorTest<SerialSnippetGenerator>;
+
+using RISCVParallelSnippetGeneratorTest =
+    RISCVSnippetGeneratorTest<ParallelSnippetGenerator>;
+
+TEST_F(RISCVSerialSnippetGeneratorTest,
+       ImplicitSelfDependencyThroughExplicitRegs) {
+  // - ADD
+  // - Op0 Explicit Def RegClass(GPR)
+  // - Op1 Explicit Use RegClass(GPR)
+  // - Op2 Explicit Use RegClass(GPR)
+  // - Var0 [Op0]
+  // - Var1 [Op1]
+  // - Var2 [Op2]
+  // - hasAliasingRegisters
+  const unsigned Opcode = RISCV::ADD;
+  const auto CodeTemplates = checkAndGetCodeTemplates(Opcode);
+  ASSERT_THAT(CodeTemplates, SizeIs(1));
+  const auto &CT = CodeTemplates[0];
+  EXPECT_THAT(CT.Execution, ExecutionMode::SERIAL_VIA_EXPLICIT_REGS);
+  ASSERT_THAT(CT.Instructions, SizeIs(1));
+  const InstructionTemplate &IT = CT.Instructions[0];
+  EXPECT_THAT(IT.getOpcode(), Opcode);
+  ASSERT_THAT(IT.getVariableValues(), SizeIs(3));
+  EXPECT_THAT(IT.getVariableValues(),
+              AnyOf(ElementsAre(IsReg(), IsInvalid(), IsReg()),
+                    ElementsAre(IsReg(), IsReg(), IsInvalid())))
+      << "Op0 is either set to Op1 or to Op2";
+}
+
+TEST_F(RISCVSerialSnippetGeneratorTest,
+       ImplicitSelfDependencyThroughExplicitRegsForbidAll) {
+  // - XOR
+  // - Op0 Explicit Def RegClass(GPR)
+  // - Op1 Explicit Use RegClass(GPR)
+  // - Op2 Explicit Use RegClass(GPR)
+  // - Var0 [Op0]
+  // - Var1 [Op1]
+  // - Var2 [Op2]
+  // - hasAliasingRegisters
+  randomGenerator().seed(0); // Initialize seed.
+  const Instruction &Instr = State.getIC().getInstr(RISCV::XOR);
+  auto AllRegisters = State.getRATC().emptyRegisters();
+  AllRegisters.flip();
+  EXPECT_TRUE(errorToBool(
+      Generator.generateCodeTemplates(&Instr, AllRegisters).takeError()));
+}
+
+TEST_F(RISCVParallelSnippetGeneratorTest, MemoryUse) {
+  // LB reads from memory.
+  // - LB
+  // - Op0 Explicit Def RegClass(GPR)
+  // - Op1 Explicit Use Memory RegClass(GPR)
+  // - Op2 Explicit Use Memory
+  // - Var0 [Op0]
+  // - Var1 [Op1]
+  // - Var2 [Op2]
+  // - hasMemoryOperands
+  const unsigned Opcode = RISCV::LB;
+  const auto CodeTemplates = checkAndGetCodeTemplates(Opcode);
+  ASSERT_THAT(CodeTemplates, SizeIs(1));
+  const auto &CT = CodeTemplates[0];
+  EXPECT_THAT(CT.Info, HasSubstr("instruction has no tied variables"));
+  EXPECT_THAT(CT.Execution, ExecutionMode::UNKNOWN);
+  ASSERT_THAT(CT.Instructions,
+              SizeIs(ParallelSnippetGenerator::kMinNumDifferentAddresses));
+  const InstructionTemplate &IT = CT.Instructions[0];
+  EXPECT_THAT(IT.getOpcode(), Opcode);
+  ASSERT_THAT(IT.getVariableValues(), SizeIs(3));
+  EXPECT_EQ(IT.getVariableValues()[1].getReg(), RISCV::X10);
+}
+
+} // namespace
+} // namespace exegesis
+} // namespace llvm
diff --git a/llvm/unittests/tools/llvm-exegesis/RISCV/TargetTest.cpp b/llvm/unittests/tools/llvm-exegesis/RISCV/TargetTest.cpp
new file mode 100644
index 0000000..12d3ce7
--- /dev/null
+++ b/llvm/unittests/tools/llvm-exegesis/RISCV/TargetTest.cpp
@@ -0,0 +1,56 @@
+//===-- TargetTest.cpp ---------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "Target.h"
+
+#include <cassert>
+#include <memory>
+
+#include "MCTargetDesc/RISCVMCTargetDesc.h"
+#include "TestBase.h"
+#include "llvm/MC/TargetRegistry.h"
+#include "llvm/Support/TargetSelect.h"
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+
+namespace llvm {
+namespace exegesis {
+
+void InitializeRISCVExegesisTarget();
+
+namespace {
+
+using testing::IsEmpty;
+using testing::Not;
+using testing::NotNull;
+
+class RISCVTargetTest : public RISCVTestBase {
+protected:
+  std::vector<MCInst> setRegTo(unsigned Reg, const APInt &Value) {
+    return State.getExegesisTarget().setRegTo(State.getSubtargetInfo(), Reg,
+                                              Value);
+  }
+};
+
+TEST_F(RISCVTargetTest, SetRegToConstant) {
+  const auto Insts = setRegTo(RISCV::X10, APInt());
+  EXPECT_THAT(Insts, Not(IsEmpty()));
+}
+
+TEST_F(RISCVTargetTest, DefaultPfmCounters) {
+  const std::string Expected = "CYCLES";
+  EXPECT_EQ(State.getExegesisTarget().getPfmCounters("").CycleCounter,
+            Expected);
+  EXPECT_EQ(
+      State.getExegesisTarget().getPfmCounters("unknown_cpu").CycleCounter,
+      Expected);
+}
+
+} // namespace
+} // namespace exegesis
+} // namespace llvm
diff --git a/llvm/unittests/tools/llvm-exegesis/RISCV/TestBase.h b/llvm/unittests/tools/llvm-exegesis/RISCV/TestBase.h
new file mode 100644
index 0000000..66748fb
--- /dev/null
+++ b/llvm/unittests/tools/llvm-exegesis/RISCV/TestBase.h
@@ -0,0 +1,44 @@
+//===-- TestBase.h ----------------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+// Test fixture common to all RISC-V tests.
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_UNITTESTS_TOOLS_LLVMEXEGESIS_RISCV_TESTBASE_H
+#define LLVM_UNITTESTS_TOOLS_LLVMEXEGESIS_RISCV_TESTBASE_H
+
+#include "LlvmState.h"
+#include "llvm/MC/TargetRegistry.h"
+#include "llvm/Support/TargetSelect.h"
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+
+namespace llvm {
+namespace exegesis {
+
+void InitializeRISCVExegesisTarget();
+
+class RISCVTestBase : public ::testing::Test {
+protected:
+  RISCVTestBase()
+      : State(cantFail(
+            LLVMState::Create("riscv64-unknown-linux", "generic-rv64"))) {}
+
+  static void SetUpTestCase() {
+    LLVMInitializeRISCVTargetInfo();
+    LLVMInitializeRISCVTargetMC();
+    LLVMInitializeRISCVTarget();
+    InitializeRISCVExegesisTarget();
+  }
+
+  const LLVMState State;
+};
+
+} // namespace exegesis
+} // namespace llvm
+
+#endif
diff --git a/llvm/utils/TableGen/ARMTargetDefEmitter.cpp b/llvm/utils/TableGen/Basic/ARMTargetDefEmitter.cpp
index 3b02f63..4dea89e 100644
--- a/llvm/utils/TableGen/ARMTargetDefEmitter.cpp
+++ b/llvm/utils/TableGen/Basic/ARMTargetDefEmitter.cpp
@@ -162,14 +162,14 @@ static void emitARMTargetDef(const RecordKeeper &RK, raw_ostream &OS) {
   for (const Record *Rec : FMVExts) {
     OS << "  I.emplace_back(";
     OS << "\"" << Rec->getValueAsString("Name") << "\"";
-    OS << ", " << Rec->getValueAsString("Bit");
+    OS << ", " << Rec->getValueAsString("FeatureBit");
+    OS << ", " << Rec->getValueAsString("PriorityBit");
     auto FeatName = Rec->getValueAsString("BackendFeature");
     const Record *FeatRec = ExtensionMap[FeatName];
     if (FeatRec)
       OS << ", " << FeatRec->getValueAsString("ArchExtKindSpelling").upper();
     else
       OS << ", std::nullopt";
-    OS << ", " << (uint64_t)Rec->getValueAsInt("Priority");
     OS << ");\n";
   };
   OS << "  return I;\n"
diff --git a/llvm/utils/TableGen/Attributes.cpp b/llvm/utils/TableGen/Basic/Attributes.cpp
index 66ba25c..66ba25c 100644
--- a/llvm/utils/TableGen/Attributes.cpp
+++ b/llvm/utils/TableGen/Basic/Attributes.cpp
diff --git a/llvm/utils/TableGen/Basic/CMakeLists.txt b/llvm/utils/TableGen/Basic/CMakeLists.txt
index 41d737e..b058fba 100644
--- a/llvm/utils/TableGen/Basic/CMakeLists.txt
+++ b/llvm/utils/TableGen/Basic/CMakeLists.txt
@@ -9,8 +9,15 @@ set(LLVM_LINK_COMPONENTS
   )
 
 add_llvm_library(LLVMTableGenBasic OBJECT EXCLUDE_FROM_ALL DISABLE_LLVM_LINK_LLVM_DYLIB
+  ARMTargetDefEmitter.cpp
+  Attributes.cpp
   CodeGenIntrinsics.cpp
+  DirectiveEmitter.cpp
+  IntrinsicEmitter.cpp
+  RISCVTargetDefEmitter.cpp
   SDNodeProperties.cpp
+  TableGen.cpp
+  VTEmitter.cpp
 )
 
 # Users may include its headers as "Basic/*.h"
diff --git a/llvm/utils/TableGen/DirectiveEmitter.cpp b/llvm/utils/TableGen/Basic/DirectiveEmitter.cpp
index fd815f4..fd815f4 100644
--- a/llvm/utils/TableGen/DirectiveEmitter.cpp
+++ b/llvm/utils/TableGen/Basic/DirectiveEmitter.cpp
diff --git a/llvm/utils/TableGen/IntrinsicEmitter.cpp b/llvm/utils/TableGen/Basic/IntrinsicEmitter.cpp
index 093602c3..fc2b890 100644
--- a/llvm/utils/TableGen/IntrinsicEmitter.cpp
+++ b/llvm/utils/TableGen/Basic/IntrinsicEmitter.cpp
@@ -10,8 +10,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "Basic/CodeGenIntrinsics.h"
-#include "Basic/SequenceToOffsetTable.h"
+#include "CodeGenIntrinsics.h"
+#include "SequenceToOffsetTable.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/StringRef.h"
diff --git a/llvm/utils/TableGen/RISCVTargetDefEmitter.cpp b/llvm/utils/TableGen/Basic/RISCVTargetDefEmitter.cpp
index 723f1d7..723f1d7 100644
--- a/llvm/utils/TableGen/RISCVTargetDefEmitter.cpp
+++ b/llvm/utils/TableGen/Basic/RISCVTargetDefEmitter.cpp
diff --git a/llvm/utils/TableGen/TableGen.cpp b/llvm/utils/TableGen/Basic/TableGen.cpp
index bea2a2e..80ac93f 100644
--- a/llvm/utils/TableGen/TableGen.cpp
+++ b/llvm/utils/TableGen/Basic/TableGen.cpp
@@ -6,10 +6,12 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// This file contains the main function for LLVM's TableGen.
+// This file contains the global defintions (mostly command line parameters)
+// shared between llvm-tblgen and llvm-min-tblgen.
 //
 //===----------------------------------------------------------------------===//
 
+#include "TableGen.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/InitLLVM.h"
@@ -74,7 +76,7 @@ static TableGen::Emitter::Opt X[] = {
     {"print-sets", printSets, "Print expanded sets for testing DAG exprs"},
 };
 
-int main(int argc, char **argv) {
+int tblgen_main(int argc, char **argv) {
   InitLLVM X(argc, argv);
   cl::ParseCommandLineOptions(argc, argv);
 
diff --git a/llvm/utils/TableGen/Basic/TableGen.h b/llvm/utils/TableGen/Basic/TableGen.h
new file mode 100644
index 0000000..630aea6
--- /dev/null
+++ b/llvm/utils/TableGen/Basic/TableGen.h
@@ -0,0 +1,13 @@
+//===- TableGen.h ---------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Shared entry point for llvm-tblgen and llvm-min-tblgen.
+//
+//===----------------------------------------------------------------------===//
+
+int tblgen_main(int argc, char **argv);
diff --git a/llvm/utils/TableGen/VTEmitter.cpp b/llvm/utils/TableGen/Basic/VTEmitter.cpp
index d02932d..d02932d 100644
--- a/llvm/utils/TableGen/VTEmitter.cpp
+++ b/llvm/utils/TableGen/Basic/VTEmitter.cpp
diff --git a/llvm/utils/TableGen/CMakeLists.txt b/llvm/utils/TableGen/CMakeLists.txt
index ba1e4aa0..96a74c6 100644
--- a/llvm/utils/TableGen/CMakeLists.txt
+++ b/llvm/utils/TableGen/CMakeLists.txt
@@ -11,14 +11,13 @@ set(LLVM_LINK_COMPONENTS Support)
 # build llvm/include. It must not depend on TableGenCommon, as
 # TableGenCommon depends on this already to generate things such as
 # ValueType definitions.
+# Sources included in both, llvm-min-tblgen and llvm-tblgen, must be included
+# into LLVMTableGenBasic to avoid redundant compilation and problems with build
+# caches.
+# At least one source file must be included directly to avoid CMake problems.
+# E.g. CMake derives which linker to use from the types of sources added.
 add_tablegen(llvm-min-tblgen LLVM_HEADERS
-  TableGen.cpp
-  ARMTargetDefEmitter.cpp
-  Attributes.cpp
-  DirectiveEmitter.cpp
-  IntrinsicEmitter.cpp
-  RISCVTargetDefEmitter.cpp
-  VTEmitter.cpp
+  llvm-min-tblgen.cpp
   $<TARGET_OBJECTS:obj.LLVMTableGenBasic>
 
   PARTIAL_SOURCES_INTENDED
@@ -32,10 +31,8 @@ set(LLVM_LINK_COMPONENTS
 add_tablegen(llvm-tblgen LLVM
   DESTINATION "${LLVM_TOOLS_INSTALL_DIR}"
   EXPORT LLVM
-  ARMTargetDefEmitter.cpp
   AsmMatcherEmitter.cpp
   AsmWriterEmitter.cpp
-  Attributes.cpp
   CallingConvEmitter.cpp
   CodeEmitterGen.cpp
   CodeGenMapTable.cpp
@@ -48,7 +45,6 @@ add_tablegen(llvm-tblgen LLVM
   DecoderEmitter.cpp
   DFAEmitter.cpp
   DFAPacketizerEmitter.cpp
-  DirectiveEmitter.cpp
   DisassemblerEmitter.cpp
   DXILEmitter.cpp
   ExegesisEmitter.cpp
@@ -57,18 +53,15 @@ add_tablegen(llvm-tblgen LLVM
   GlobalISelEmitter.cpp
   InstrDocsEmitter.cpp
   InstrInfoEmitter.cpp
-  IntrinsicEmitter.cpp
+  llvm-tblgen.cpp
   MacroFusionPredicatorEmitter.cpp
   OptionParserEmitter.cpp
   OptionRSTEmitter.cpp
   PseudoLoweringEmitter.cpp
   RegisterBankEmitter.cpp
   RegisterInfoEmitter.cpp
-  RISCVTargetDefEmitter.cpp
   SearchableTableEmitter.cpp
   SubtargetEmitter.cpp
-  TableGen.cpp
-  VTEmitter.cpp
   WebAssemblyDisassemblerEmitter.cpp
   X86InstrMappingEmitter.cpp
   X86DisassemblerTables.cpp
@@ -79,6 +72,8 @@ add_tablegen(llvm-tblgen LLVM
   $<TARGET_OBJECTS:obj.LLVMTableGenBasic>
   $<TARGET_OBJECTS:obj.LLVMTableGenCommon>
 
+  PARTIAL_SOURCES_INTENDED
+
   DEPENDS
   intrinsics_gen # via llvm-min-tablegen
   )
diff --git a/llvm/utils/TableGen/Common/GlobalISel/GlobalISelMatchTable.cpp b/llvm/utils/TableGen/Common/GlobalISel/GlobalISelMatchTable.cpp
index 619e7a4..a81f2b5 100644
--- a/llvm/utils/TableGen/Common/GlobalISel/GlobalISelMatchTable.cpp
+++ b/llvm/utils/TableGen/Common/GlobalISel/GlobalISelMatchTable.cpp
@@ -1723,7 +1723,6 @@ OperandMatcher &InstructionMatcher::addPhysRegInput(const Record *Reg,
   OperandMatcher *OM = new OperandMatcher(*this, OpIdx, "", TempOpIdx);
   Operands.emplace_back(OM);
   Rule.definePhysRegOperand(Reg, *OM);
-  PhysRegInputs.emplace_back(Reg, OpIdx);
   return *OM;
 }
 
diff --git a/llvm/utils/TableGen/Common/GlobalISel/GlobalISelMatchTable.h b/llvm/utils/TableGen/Common/GlobalISel/GlobalISelMatchTable.h
index 48ce71b..8e6de80 100644
--- a/llvm/utils/TableGen/Common/GlobalISel/GlobalISelMatchTable.h
+++ b/llvm/utils/TableGen/Common/GlobalISel/GlobalISelMatchTable.h
@@ -19,6 +19,7 @@
 #include "Common/CodeGenDAGPatterns.h"
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/MapVector.h"
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/StringMap.h"
 #include "llvm/ADT/StringRef.h"
@@ -492,9 +493,11 @@ protected:
   /// the renderers.
   StringMap<OperandMatcher *> DefinedOperands;
 
+  using PhysRegOperandsTy = SmallMapVector<const Record *, OperandMatcher *, 1>;
+
   /// A map of anonymous physical register operands defined by the matchers that
   /// may be referenced by the renderers.
-  DenseMap<const Record *, OperandMatcher *> PhysRegOperands;
+  PhysRegOperandsTy PhysRegOperands;
 
   /// ID for the next instruction variable defined with
   /// implicitlyDefineInsnVar()
@@ -695,6 +698,10 @@ public:
   unsigned allocateOutputInsnID() { return NextOutputInsnID++; }
   unsigned allocateTempRegID() { return NextTempRegID++; }
 
+  iterator_range<PhysRegOperandsTy::const_iterator> physoperands() const {
+    return make_range(PhysRegOperands.begin(), PhysRegOperands.end());
+  }
+
   iterator_range<MatchersTy::iterator> insnmatchers() {
     return make_range(Matchers.begin(), Matchers.end());
   }
@@ -1756,11 +1763,6 @@ protected:
   unsigned InsnVarID;
   bool AllowNumOpsCheck;
 
-  /// PhysRegInputs - List list has an entry for each explicitly specified
-  /// physreg input to the pattern.  The first elt is the Register node, the
-  /// second is the recorded slot number the input pattern match saved it in.
-  SmallVector<std::pair<const Record *, unsigned>, 2> PhysRegInputs;
-
   bool canAddNumOperandsCheck() const {
     // Add if it's allowed, and:
     //    - We don't have a variadic operand
@@ -1802,10 +1804,6 @@ public:
   OperandMatcher &addPhysRegInput(const Record *Reg, unsigned OpIdx,
                                   unsigned TempOpIdx);
 
-  ArrayRef<std::pair<const Record *, unsigned>> getPhysRegInputs() const {
-    return PhysRegInputs;
-  }
-
   StringRef getSymbolicName() const { return SymbolicName; }
 
   unsigned getNumOperandMatchers() const { return Operands.size(); }
diff --git a/llvm/utils/TableGen/DXILEmitter.cpp b/llvm/utils/TableGen/DXILEmitter.cpp
index a0c93be..7488c8d 100644
--- a/llvm/utils/TableGen/DXILEmitter.cpp
+++ b/llvm/utils/TableGen/DXILEmitter.cpp
@@ -218,8 +218,10 @@ static StringRef getOverloadKindStr(const Record *R) {
       .Case("Int64Ty", "OverloadKind::I64")
       .Case("ResRetHalfTy", "OverloadKind::HALF")
       .Case("ResRetFloatTy", "OverloadKind::FLOAT")
+      .Case("ResRetDoubleTy", "OverloadKind::DOUBLE")
       .Case("ResRetInt16Ty", "OverloadKind::I16")
-      .Case("ResRetInt32Ty", "OverloadKind::I32");
+      .Case("ResRetInt32Ty", "OverloadKind::I32")
+      .Case("ResRetInt64Ty", "OverloadKind::I64");
 }
 
 /// Return a string representation of valid overload information denoted
diff --git a/llvm/utils/TableGen/GlobalISelEmitter.cpp b/llvm/utils/TableGen/GlobalISelEmitter.cpp
index 4250b57..3b334ea 100644
--- a/llvm/utils/TableGen/GlobalISelEmitter.cpp
+++ b/llvm/utils/TableGen/GlobalISelEmitter.cpp
@@ -404,21 +404,34 @@ private:
   createInstructionRenderer(action_iterator InsertPt, RuleMatcher &M,
                             const TreePatternNode &Dst) const;
 
-  Expected<action_iterator> importExplicitDefRenderers(
-      action_iterator InsertPt, RuleMatcher &M, BuildMIAction &DstMIBuilder,
-      const TreePatternNode &Dst, unsigned Start = 0) const;
+  Expected<action_iterator>
+  importExplicitDefRenderers(action_iterator InsertPt, RuleMatcher &M,
+                             BuildMIAction &DstMIBuilder,
+                             const TreePatternNode &Dst, bool IsRoot) const;
 
   Expected<action_iterator>
   importExplicitUseRenderers(action_iterator InsertPt, RuleMatcher &M,
                              BuildMIAction &DstMIBuilder,
                              const TreePatternNode &Dst) const;
-  Expected<action_iterator>
-  importExplicitUseRenderer(action_iterator InsertPt, RuleMatcher &Rule,
-                            BuildMIAction &DstMIBuilder,
-                            const TreePatternNode &Dst) const;
-  Error importDefaultOperandRenderers(action_iterator InsertPt, RuleMatcher &M,
-                                      BuildMIAction &DstMIBuilder,
-                                      const DAGDefaultOperand &DefaultOp) const;
+
+  Error importNamedNodeRenderer(RuleMatcher &M, BuildMIAction &MIBuilder,
+                                const TreePatternNode &N) const;
+
+  Error importLeafNodeRenderer(RuleMatcher &M, BuildMIAction &MIBuilder,
+                               const TreePatternNode &N,
+                               action_iterator InsertPt) const;
+
+  Error importXFormNodeRenderer(RuleMatcher &M, BuildMIAction &MIBuilder,
+                                const TreePatternNode &N) const;
+
+  Error importInstructionNodeRenderer(RuleMatcher &M, BuildMIAction &MIBuilder,
+                                      const TreePatternNode &N,
+                                      action_iterator &InsertPt) const;
+
+  Error importNodeRenderer(RuleMatcher &M, BuildMIAction &MIBuilder,
+                           const TreePatternNode &N,
+                           action_iterator &InsertPt) const;
+
   Error importImplicitDefRenderers(BuildMIAction &DstMIBuilder,
                                    ArrayRef<const Record *> ImplicitDefs) const;
 
@@ -992,27 +1005,24 @@ Error GlobalISelEmitter::importChildMatcher(
 
   // Check MBB's before the type check since they are not a known type.
   if (!SrcChild.isLeaf()) {
-    if (SrcChild.getOperator()->isSubClassOf("SDNode")) {
-      auto &ChildSDNI = CGP.getSDNodeInfo(SrcChild.getOperator());
-      if (ChildSDNI.getSDClassName() == "BasicBlockSDNode") {
-        OM.addPredicate<MBBOperandMatcher>();
-        return Error::success();
-      }
-      if (SrcChild.getOperator()->getName() == "timm") {
-        OM.addPredicate<ImmOperandMatcher>();
+    if (SrcChild.getOperator()->getName() == "bb") {
+      OM.addPredicate<MBBOperandMatcher>();
+      return Error::success();
+    }
+    if (SrcChild.getOperator()->getName() == "timm") {
+      OM.addPredicate<ImmOperandMatcher>();
 
-        // Add predicates, if any
-        for (const TreePredicateCall &Call : SrcChild.getPredicateCalls()) {
-          const TreePredicateFn &Predicate = Call.Fn;
+      // Add predicates, if any
+      for (const TreePredicateCall &Call : SrcChild.getPredicateCalls()) {
+        const TreePredicateFn &Predicate = Call.Fn;
 
-          // Only handle immediate patterns for now
-          if (Predicate.isImmediatePattern()) {
-            OM.addPredicate<OperandImmPredicateMatcher>(Predicate);
-          }
+        // Only handle immediate patterns for now
+        if (Predicate.isImmediatePattern()) {
+          OM.addPredicate<OperandImmPredicateMatcher>(Predicate);
         }
-
-        return Error::success();
       }
+
+      return Error::success();
     }
   } else if (auto *ChildDefInit = dyn_cast<DefInit>(SrcChild.getLeafValue())) {
     auto *ChildRec = ChildDefInit->getDef();
@@ -1192,162 +1202,217 @@ Error GlobalISelEmitter::importChildMatcher(
   return failedImport("Src pattern child is an unsupported kind");
 }
 
-Expected<action_iterator> GlobalISelEmitter::importExplicitUseRenderer(
-    action_iterator InsertPt, RuleMatcher &Rule, BuildMIAction &DstMIBuilder,
-    const TreePatternNode &Dst) const {
+// Equivalent of MatcherGen::EmitResultOfNamedOperand.
+Error GlobalISelEmitter::importNamedNodeRenderer(
+    RuleMatcher &M, BuildMIAction &MIBuilder, const TreePatternNode &N) const {
+  StringRef NodeName = N.getName();
 
-  const auto &SubOperand = Rule.getComplexSubOperand(Dst.getName());
-  if (SubOperand) {
-    DstMIBuilder.addRenderer<RenderComplexPatternOperand>(
-        *std::get<0>(*SubOperand), Dst.getName(), std::get<1>(*SubOperand),
-        std::get<2>(*SubOperand));
-    return InsertPt;
+  if (auto SubOperand = M.getComplexSubOperand(NodeName)) {
+    auto [ComplexPatternRec, RendererID, SubOperandIdx] = *SubOperand;
+    MIBuilder.addRenderer<RenderComplexPatternOperand>(
+        *ComplexPatternRec, NodeName, RendererID, SubOperandIdx);
+    return Error::success();
   }
 
-  if (!Dst.isLeaf()) {
-    if (Dst.getOperator()->isSubClassOf("SDNodeXForm")) {
-      auto &Child = Dst.getChild(0);
-      auto I = SDNodeXFormEquivs.find(Dst.getOperator());
-      if (I != SDNodeXFormEquivs.end()) {
-        const Record *XFormOpc = Dst.getOperator()->getValueAsDef("Opcode");
-        if (XFormOpc->getName() == "timm") {
-          // If this is a TargetConstant, there won't be a corresponding
-          // instruction to transform. Instead, this will refer directly to an
-          // operand in an instruction's operand list.
-          DstMIBuilder.addRenderer<CustomOperandRenderer>(*I->second,
-                                                          Child.getName());
-        } else {
-          DstMIBuilder.addRenderer<CustomRenderer>(*I->second, Child.getName());
-        }
+  if (!N.isLeaf()) {
+    StringRef OperatorName = N.getOperator()->getName();
 
-        return InsertPt;
-      }
-      return failedImport("SDNodeXForm " + Child.getName() +
-                          " has no custom renderer");
+    if (OperatorName == "imm") {
+      MIBuilder.addRenderer<CopyConstantAsImmRenderer>(NodeName);
+      return Error::success();
     }
 
-    // We accept 'bb' here. It's an operator because BasicBlockSDNode isn't
-    // inline, but in MI it's just another operand.
-    if (Dst.getOperator()->isSubClassOf("SDNode")) {
-      auto &ChildSDNI = CGP.getSDNodeInfo(Dst.getOperator());
-      if (ChildSDNI.getSDClassName() == "BasicBlockSDNode") {
-        DstMIBuilder.addRenderer<CopyRenderer>(Dst.getName());
-        return InsertPt;
-      }
+    if (OperatorName == "fpimm") {
+      MIBuilder.addRenderer<CopyFConstantAsFPImmRenderer>(NodeName);
+      return Error::success();
     }
 
-    // Similarly, imm is an operator in TreePatternNode's view but must be
-    // rendered as operands.
-    // FIXME: The target should be able to choose sign-extended when appropriate
-    //        (e.g. on Mips).
-    if (Dst.getOperator()->getName() == "timm") {
-      DstMIBuilder.addRenderer<CopyRenderer>(Dst.getName());
-      return InsertPt;
-    }
-    if (Dst.getOperator()->getName() == "tframeindex") {
-      DstMIBuilder.addRenderer<CopyRenderer>(Dst.getName());
-      return InsertPt;
-    }
-    if (Dst.getOperator()->getName() == "imm") {
-      DstMIBuilder.addRenderer<CopyConstantAsImmRenderer>(Dst.getName());
-      return InsertPt;
-    }
-    if (Dst.getOperator()->getName() == "fpimm") {
-      DstMIBuilder.addRenderer<CopyFConstantAsFPImmRenderer>(Dst.getName());
-      return InsertPt;
+    // TODO: 'imm' and 'fpimm' are the only nodes that need special treatment.
+    //   Remove this check and add CopyRenderer unconditionally for other nodes.
+    if (OperatorName == "bb" || OperatorName == "timm" ||
+        OperatorName == "tframeindex") {
+      MIBuilder.addRenderer<CopyRenderer>(NodeName);
+      return Error::success();
     }
 
-    if (Dst.getOperator()->isSubClassOf("Instruction")) {
-      auto OpTy = getInstResultType(Dst, Target);
-      if (!OpTy)
-        return OpTy.takeError();
+    return failedImport("node has unsupported operator " + to_string(N));
+  }
+
+  if (const auto *DI = dyn_cast<DefInit>(N.getLeafValue())) {
+    const Record *R = DI->getDef();
 
-      unsigned TempRegID = Rule.allocateTempRegID();
-      InsertPt =
-          Rule.insertAction<MakeTempRegisterAction>(InsertPt, *OpTy, TempRegID);
-      DstMIBuilder.addRenderer<TempRegRenderer>(TempRegID);
+    if (N.getNumResults() != 1)
+      return failedImport("node does not have one result " + to_string(N));
 
-      auto InsertPtOrError = createAndImportSubInstructionRenderer(
-          ++InsertPt, Rule, Dst, TempRegID);
-      if (auto Error = InsertPtOrError.takeError())
-        return std::move(Error);
-      return InsertPtOrError.get();
+    if (R->isSubClassOf("ComplexPattern")) {
+      auto I = ComplexPatternEquivs.find(R);
+      if (I == ComplexPatternEquivs.end())
+        return failedImport("ComplexPattern " + R->getName() +
+                            " does not have GISel equivalent");
+
+      const OperandMatcher &OM = M.getOperandMatcher(NodeName);
+      MIBuilder.addRenderer<RenderComplexPatternOperand>(
+          *I->second, NodeName, OM.getAllocatedTemporariesBaseID());
+      return Error::success();
+    }
+
+    if (R->isSubClassOf("RegisterOperand") &&
+        !R->isValueUnset("GIZeroRegister")) {
+      MIBuilder.addRenderer<CopyOrAddZeroRegRenderer>(
+          NodeName, R->getValueAsDef("GIZeroRegister"));
+      return Error::success();
     }
 
-    return failedImport("Dst pattern child isn't a leaf node or an MBB" +
-                        llvm::to_string(Dst));
+    // TODO: All special cases are handled above. Remove this check and add
+    //   CopyRenderer unconditionally.
+    if (R->isSubClassOf("RegisterClass") ||
+        R->isSubClassOf("RegisterOperand") || R->isSubClassOf("ValueType")) {
+      MIBuilder.addRenderer<CopyRenderer>(NodeName);
+      return Error::success();
+    }
   }
 
-  // It could be a specific immediate in which case we should just check for
-  // that immediate.
-  if (const IntInit *ChildIntInit = dyn_cast<IntInit>(Dst.getLeafValue())) {
-    DstMIBuilder.addRenderer<ImmRenderer>(ChildIntInit->getValue());
-    return InsertPt;
+  // TODO: Change this to assert and move to the beginning of the function.
+  if (!M.hasOperand(NodeName))
+    return failedImport("could not find node $" + NodeName +
+                        " in the source DAG");
+
+  // TODO: Remove this check and add CopyRenderer unconditionally.
+  // TODO: Handle nodes with multiple results (provided they can reach here).
+  if (isa<UnsetInit>(N.getLeafValue())) {
+    MIBuilder.addRenderer<CopyRenderer>(NodeName);
+    return Error::success();
   }
 
-  // Otherwise, we're looking for a bog-standard RegisterClass operand.
-  if (auto *ChildDefInit = dyn_cast<DefInit>(Dst.getLeafValue())) {
-    auto *ChildRec = ChildDefInit->getDef();
+  return failedImport("unsupported node " + to_string(N));
+}
 
-    ArrayRef<TypeSetByHwMode> ChildTypes = Dst.getExtTypes();
-    if (ChildTypes.size() != 1)
-      return failedImport("Dst pattern child has multiple results");
+// Equivalent of MatcherGen::EmitResultLeafAsOperand.
+Error GlobalISelEmitter::importLeafNodeRenderer(
+    RuleMatcher &M, BuildMIAction &MIBuilder, const TreePatternNode &N,
+    action_iterator InsertPt) const {
+  if (const auto *II = dyn_cast<IntInit>(N.getLeafValue())) {
+    MIBuilder.addRenderer<ImmRenderer>(II->getValue());
+    return Error::success();
+  }
 
-    std::optional<LLTCodeGen> OpTyOrNone;
-    if (ChildTypes.front().isMachineValueType())
-      OpTyOrNone = MVTToLLT(ChildTypes.front().getMachineValueType().SimpleTy);
-    if (!OpTyOrNone)
-      return failedImport("Dst operand has an unsupported type");
+  if (const auto *DI = dyn_cast<DefInit>(N.getLeafValue())) {
+    const Record *R = DI->getDef();
 
-    if (ChildRec->isSubClassOf("Register")) {
-      DstMIBuilder.addRenderer<AddRegisterRenderer>(Target, ChildRec);
-      return InsertPt;
+    if (R->isSubClassOf("Register") || R->getName() == "zero_reg") {
+      MIBuilder.addRenderer<AddRegisterRenderer>(Target, R);
+      return Error::success();
     }
 
-    if (ChildRec->isSubClassOf("RegisterClass") ||
-        ChildRec->isSubClassOf("RegisterOperand") ||
-        ChildRec->isSubClassOf("ValueType")) {
-      if (ChildRec->isSubClassOf("RegisterOperand") &&
-          !ChildRec->isValueUnset("GIZeroRegister")) {
-        DstMIBuilder.addRenderer<CopyOrAddZeroRegRenderer>(
-            Dst.getName(), ChildRec->getValueAsDef("GIZeroRegister"));
-        return InsertPt;
-      }
+    if (R->getName() == "undef_tied_input") {
+      std::optional<LLTCodeGen> OpTyOrNone = MVTToLLT(N.getSimpleType(0));
+      if (!OpTyOrNone)
+        return failedImport("unsupported type");
 
-      DstMIBuilder.addRenderer<CopyRenderer>(Dst.getName());
-      return InsertPt;
-    }
+      unsigned TempRegID = M.allocateTempRegID();
+      M.insertAction<MakeTempRegisterAction>(InsertPt, *OpTyOrNone, TempRegID);
 
-    if (ChildRec->isSubClassOf("SubRegIndex")) {
-      CodeGenSubRegIndex *SubIdx = CGRegs.getSubRegIdx(ChildRec);
-      DstMIBuilder.addRenderer<ImmRenderer>(SubIdx->EnumValue);
-      return InsertPt;
-    }
+      auto I = M.insertAction<BuildMIAction>(
+          InsertPt, M.allocateOutputInsnID(),
+          &Target.getInstruction(RK.getDef("IMPLICIT_DEF")));
+      auto &ImpDefBuilder = static_cast<BuildMIAction &>(**I);
+      ImpDefBuilder.addRenderer<TempRegRenderer>(TempRegID, /*IsDef=*/true);
 
-    if (ChildRec->isSubClassOf("ComplexPattern")) {
-      const auto &ComplexPattern = ComplexPatternEquivs.find(ChildRec);
-      if (ComplexPattern == ComplexPatternEquivs.end())
-        return failedImport(
-            "SelectionDAG ComplexPattern not mapped to GlobalISel");
+      MIBuilder.addRenderer<TempRegRenderer>(TempRegID);
+      return Error::success();
+    }
 
-      const OperandMatcher &OM = Rule.getOperandMatcher(Dst.getName());
-      DstMIBuilder.addRenderer<RenderComplexPatternOperand>(
-          *ComplexPattern->second, Dst.getName(),
-          OM.getAllocatedTemporariesBaseID());
-      return InsertPt;
+    if (R->isSubClassOf("SubRegIndex")) {
+      const CodeGenSubRegIndex *SubRegIndex = CGRegs.getSubRegIdx(R);
+      MIBuilder.addRenderer<ImmRenderer>(SubRegIndex->EnumValue);
+      return Error::success();
     }
 
-    return failedImport(
-        "Dst pattern child def is an unsupported tablegen class");
+    // There are also RegisterClass / RegisterOperand operands of REG_SEQUENCE /
+    // COPY_TO_REGCLASS, but these instructions are currently handled elsewhere.
   }
 
-  // Handle the case where the MVT/register class is omitted in the dest pattern
-  // but MVT exists in the source pattern.
-  if (isa<UnsetInit>(Dst.getLeafValue()) && Rule.hasOperand(Dst.getName())) {
-    DstMIBuilder.addRenderer<CopyRenderer>(Dst.getName());
-    return InsertPt;
+  return failedImport("unrecognized node " + to_string(N));
+}
+
+// Equivalent of MatcherGen::EmitResultSDNodeXFormAsOperand.
+Error GlobalISelEmitter::importXFormNodeRenderer(
+    RuleMatcher &M, BuildMIAction &MIBuilder, const TreePatternNode &N) const {
+  const Record *XFormRec = N.getOperator();
+  auto I = SDNodeXFormEquivs.find(XFormRec);
+  if (I == SDNodeXFormEquivs.end())
+    return failedImport("SDNodeXForm " + XFormRec->getName() +
+                        " does not have GISel equivalent");
+
+  // TODO: Fail to import if GISDNodeXForm does not have RendererFn.
+  //   This currently results in a fatal error in emitRenderOpcodes.
+  const Record *XFormEquivRec = I->second;
+
+  // The node to apply the transformation function to.
+  // FIXME: The node may not have a name and may be a leaf. It should be
+  //   rendered first, like any other nodes. This may or may not require
+  //   introducing a temporary register, and we can't tell that without
+  //   inspecting the node (possibly recursively). This is a general drawback
+  //   of appending renderers directly to BuildMIAction.
+  const TreePatternNode &Node = N.getChild(0);
+  StringRef NodeName = Node.getName();
+
+  const Record *XFormOpc = CGP.getSDNodeTransform(XFormRec).first;
+  if (XFormOpc->getName() == "timm") {
+    // If this is a TargetConstant, there won't be a corresponding
+    // instruction to transform. Instead, this will refer directly to an
+    // operand in an instruction's operand list.
+    MIBuilder.addRenderer<CustomOperandRenderer>(*XFormEquivRec, NodeName);
+  } else {
+    MIBuilder.addRenderer<CustomRenderer>(*XFormEquivRec, NodeName);
   }
-  return failedImport("Dst pattern child is an unsupported kind");
+
+  return Error::success();
+}
+
+// Equivalent of MatcherGen::EmitResultInstructionAsOperand.
+Error GlobalISelEmitter::importInstructionNodeRenderer(
+    RuleMatcher &M, BuildMIAction &MIBuilder, const TreePatternNode &N,
+    action_iterator &InsertPt) const {
+  Expected<LLTCodeGen> OpTy = getInstResultType(N, Target);
+  if (!OpTy)
+    return OpTy.takeError();
+
+  // TODO: See the comment in importXFormNodeRenderer. We rely on the node
+  //   requiring a temporary register, which prevents us from using this
+  //   function on the root of the destination DAG.
+  unsigned TempRegID = M.allocateTempRegID();
+  InsertPt = M.insertAction<MakeTempRegisterAction>(InsertPt, *OpTy, TempRegID);
+  MIBuilder.addRenderer<TempRegRenderer>(TempRegID);
+
+  auto InsertPtOrError =
+      createAndImportSubInstructionRenderer(++InsertPt, M, N, TempRegID);
+  if (!InsertPtOrError)
+    return InsertPtOrError.takeError();
+
+  InsertPt = *InsertPtOrError;
+  return Error::success();
+}
+
+// Equivalent of MatcherGen::EmitResultOperand.
+Error GlobalISelEmitter::importNodeRenderer(RuleMatcher &M,
+                                            BuildMIAction &MIBuilder,
+                                            const TreePatternNode &N,
+                                            action_iterator &InsertPt) const {
+  if (N.hasName())
+    return importNamedNodeRenderer(M, MIBuilder, N);
+
+  if (N.isLeaf())
+    return importLeafNodeRenderer(M, MIBuilder, N, InsertPt);
+
+  if (N.getOperator()->isSubClassOf("SDNodeXForm"))
+    return importXFormNodeRenderer(M, MIBuilder, N);
+
+  if (N.getOperator()->isSubClassOf("Instruction"))
+    return importInstructionNodeRenderer(M, MIBuilder, N, InsertPt);
+
+  // Should not reach here.
+  return failedImport("unrecognized node " + llvm::to_string(N));
 }
 
 /// Generates code that builds the resulting instruction(s) from the destination
@@ -1364,18 +1429,19 @@ Expected<BuildMIAction &> GlobalISelEmitter::createAndImportInstructionRenderer(
   action_iterator InsertPt = InsertPtOrError.get();
   BuildMIAction &DstMIBuilder = *static_cast<BuildMIAction *>(InsertPt->get());
 
-  for (auto PhysInput : InsnMatcher.getPhysRegInputs()) {
+  for (auto PhysOp : M.physoperands()) {
     InsertPt = M.insertAction<BuildMIAction>(
         InsertPt, M.allocateOutputInsnID(),
         &Target.getInstruction(RK.getDef("COPY")));
     BuildMIAction &CopyToPhysRegMIBuilder =
         *static_cast<BuildMIAction *>(InsertPt->get());
-    CopyToPhysRegMIBuilder.addRenderer<AddRegisterRenderer>(
-        Target, PhysInput.first, true);
-    CopyToPhysRegMIBuilder.addRenderer<CopyPhysRegRenderer>(PhysInput.first);
+    CopyToPhysRegMIBuilder.addRenderer<AddRegisterRenderer>(Target,
+                                                            PhysOp.first, true);
+    CopyToPhysRegMIBuilder.addRenderer<CopyPhysRegRenderer>(PhysOp.first);
   }
 
-  if (auto Error = importExplicitDefRenderers(InsertPt, M, DstMIBuilder, Dst)
+  if (auto Error = importExplicitDefRenderers(InsertPt, M, DstMIBuilder, Dst,
+                                              /*IsRoot=*/true)
                        .takeError())
     return std::move(Error);
 
@@ -1404,8 +1470,8 @@ GlobalISelEmitter::createAndImportSubInstructionRenderer(
   DstMIBuilder.addRenderer<TempRegRenderer>(TempRegID, true);
 
   // Handle additional (ignored) results.
-  InsertPtOrError = importExplicitDefRenderers(std::prev(*InsertPtOrError), M,
-                                               DstMIBuilder, Dst, /*Start=*/1);
+  InsertPtOrError = importExplicitDefRenderers(
+      std::prev(*InsertPtOrError), M, DstMIBuilder, Dst, /*IsRoot=*/false);
   if (auto Error = InsertPtOrError.takeError())
     return std::move(Error);
 
@@ -1446,16 +1512,16 @@ GlobalISelEmitter::createInstructionRenderer(action_iterator InsertPt,
 
 Expected<action_iterator> GlobalISelEmitter::importExplicitDefRenderers(
     action_iterator InsertPt, RuleMatcher &M, BuildMIAction &DstMIBuilder,
-    const TreePatternNode &Dst, unsigned Start) const {
+    const TreePatternNode &Dst, bool IsRoot) const {
   const CodeGenInstruction *DstI = DstMIBuilder.getCGI();
 
   // Process explicit defs. The caller may have already handled the first def.
-  for (unsigned I = Start, E = DstI->Operands.NumDefs; I != E; ++I) {
+  for (unsigned I = IsRoot ? 0 : 1, E = DstI->Operands.NumDefs; I != E; ++I) {
     const CGIOperandList::OperandInfo &OpInfo = DstI->Operands[I];
     std::string OpName = getMangledRootDefName(OpInfo.Name);
 
     // If the def is used in the source DAG, forward it.
-    if (M.hasOperand(OpName)) {
+    if (IsRoot && M.hasOperand(OpName)) {
       // CopyRenderer saves a StringRef, so cannot pass OpName itself -
       // let's use a string with an appropriate lifetime.
       StringRef PermanentRef = M.getOperandMatcher(OpName).getSymbolicName();
@@ -1601,11 +1667,9 @@ Expected<action_iterator> GlobalISelEmitter::importExplicitUseRenderers(
               dyn_cast<DefInit>(SubRegChild.getLeafValue())) {
         CodeGenSubRegIndex *SubIdx = CGRegs.getSubRegIdx(SubRegInit->getDef());
 
-        auto InsertPtOrError =
-            importExplicitUseRenderer(InsertPt, M, DstMIBuilder, ValChild);
-        if (auto Error = InsertPtOrError.takeError())
-          return std::move(Error);
-        InsertPt = InsertPtOrError.get();
+        if (Error Err = importNodeRenderer(M, DstMIBuilder, ValChild, InsertPt))
+          return Err;
+
         DstMIBuilder.addRenderer<SubRegIndexRenderer>(SubIdx);
       }
     }
@@ -1660,21 +1724,20 @@ Expected<action_iterator> GlobalISelEmitter::importExplicitUseRenderers(
       // This is a predicate or optional def operand which the pattern has not
       // overridden, or which we aren't letting it override; emit the 'default
       // ops' operands.
-
-      const Record *OperandNode = DstI->Operands[InstOpNo].Rec;
-      if (auto Error = importDefaultOperandRenderers(
-              InsertPt, M, DstMIBuilder, CGP.getDefaultOperand(OperandNode)))
-        return std::move(Error);
+      for (const TreePatternNode &OpNode :
+           make_pointee_range(CGP.getDefaultOperand(OperandNode).DefaultOps)) {
+        if (Error Err = importNodeRenderer(M, DstMIBuilder, OpNode, InsertPt))
+          return Err;
+      }
 
       ++NumDefaultOps;
       continue;
     }
 
-    auto InsertPtOrError = importExplicitUseRenderer(InsertPt, M, DstMIBuilder,
-                                                     Dst.getChild(Child));
-    if (auto Error = InsertPtOrError.takeError())
-      return std::move(Error);
-    InsertPt = InsertPtOrError.get();
+    if (Error Err =
+            importNodeRenderer(M, DstMIBuilder, Dst.getChild(Child), InsertPt))
+      return Err;
+
     ++Child;
   }
 
@@ -1688,47 +1751,6 @@ Expected<action_iterator> GlobalISelEmitter::importExplicitUseRenderers(
   return InsertPt;
 }
 
-Error GlobalISelEmitter::importDefaultOperandRenderers(
-    action_iterator InsertPt, RuleMatcher &M, BuildMIAction &DstMIBuilder,
-    const DAGDefaultOperand &DefaultOp) const {
-  for (const auto &Op : DefaultOp.DefaultOps) {
-    const auto &N = *Op;
-    if (!N.isLeaf())
-      return failedImport("Could not add default op");
-
-    const auto *DefaultOp = N.getLeafValue();
-
-    if (const DefInit *DefaultDefOp = dyn_cast<DefInit>(DefaultOp)) {
-      std::optional<LLTCodeGen> OpTyOrNone = MVTToLLT(N.getSimpleType(0));
-      auto *Def = DefaultDefOp->getDef();
-      if (Def->getName() == "undef_tied_input") {
-        unsigned TempRegID = M.allocateTempRegID();
-        M.insertAction<MakeTempRegisterAction>(InsertPt, *OpTyOrNone,
-                                               TempRegID);
-        InsertPt = M.insertAction<BuildMIAction>(
-            InsertPt, M.allocateOutputInsnID(),
-            &Target.getInstruction(RK.getDef("IMPLICIT_DEF")));
-        BuildMIAction &IDMIBuilder =
-            *static_cast<BuildMIAction *>(InsertPt->get());
-        IDMIBuilder.addRenderer<TempRegRenderer>(TempRegID);
-        DstMIBuilder.addRenderer<TempRegRenderer>(TempRegID);
-      } else {
-        DstMIBuilder.addRenderer<AddRegisterRenderer>(Target, Def);
-      }
-      continue;
-    }
-
-    if (const IntInit *DefaultIntOp = dyn_cast<IntInit>(DefaultOp)) {
-      DstMIBuilder.addRenderer<ImmRenderer>(DefaultIntOp->getValue());
-      continue;
-    }
-
-    return failedImport("Could not add default op");
-  }
-
-  return Error::success();
-}
-
 Error GlobalISelEmitter::importImplicitDefRenderers(
     BuildMIAction &DstMIBuilder, ArrayRef<const Record *> ImplicitDefs) const {
   if (!ImplicitDefs.empty())
diff --git a/llvm/utils/TableGen/llvm-min-tblgen.cpp b/llvm/utils/TableGen/llvm-min-tblgen.cpp
new file mode 100644
index 0000000..79fce5c
--- /dev/null
+++ b/llvm/utils/TableGen/llvm-min-tblgen.cpp
@@ -0,0 +1,18 @@
+//===- llvm-min-tblgen.cpp ------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the main function for LLVM's TableGen.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Basic/TableGen.h"
+
+/// Command line parameters are shared between llvm-tblgen and llvm-min-tblgen.
+/// The indirection to tblgen_main exists to ensure that the static variables
+/// for the llvm::cl:: mechanism are linked into both executables.
+int main(int argc, char **argv) { return tblgen_main(argc, argv); }
diff --git a/llvm/utils/TableGen/llvm-tblgen.cpp b/llvm/utils/TableGen/llvm-tblgen.cpp
new file mode 100644
index 0000000..a383824
--- /dev/null
+++ b/llvm/utils/TableGen/llvm-tblgen.cpp
@@ -0,0 +1,18 @@
+//===- llvm-tblgen.cpp ----------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the main function for LLVM's TableGen.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Basic/TableGen.h"
+
+/// Command line parameters are shared between llvm-tblgen and llvm-min-tblgen.
+/// The indirection to tblgen_main exists to ensure that the static variables
+/// for the llvm::cl:: mechanism are linked into both executables.
+int main(int argc, char **argv) { return tblgen_main(argc, argv); }
diff --git a/llvm/utils/UpdateTestChecks/common.py b/llvm/utils/UpdateTestChecks/common.py
index b108a21..e1cc02e 100644
--- a/llvm/utils/UpdateTestChecks/common.py
+++ b/llvm/utils/UpdateTestChecks/common.py
@@ -1396,7 +1396,7 @@ def find_diff_matching(lhs: List[str], rhs: List[str]) -> List[tuple]:
             backlinks.append(None)
 
     # Commit to names in the matching by walking the backlinks. Recursively
-    # attempt to fill in more matches in-betweem.
+    # attempt to fill in more matches in-between.
     match_idx = table_candidate_idx[-1]
     while match_idx is not None:
         current = candidates[match_idx]
diff --git a/llvm/utils/emacs/llvm-mode.el b/llvm/utils/emacs/llvm-mode.el
index dab3783..660d071 100644
--- a/llvm/utils/emacs/llvm-mode.el
+++ b/llvm/utils/emacs/llvm-mode.el
@@ -32,7 +32,7 @@
    `(,(regexp-opt
        '("alwaysinline" "argmemonly" "allocsize" "builtin" "cold" "convergent" "dereferenceable" "dereferenceable_or_null" "hot" "immarg" "inaccessiblememonly"
          "inaccessiblemem_or_argmemonly" "inalloca" "inlinehint" "jumptable" "minsize" "mustprogress" "naked" "nobuiltin" "nonnull" "nocapture"
-         "nocallback" "nocf_check" "noduplicate" "nofree" "noimplicitfloat" "noinline" "nomerge" "nonlazybind" "noprofile" "noredzone" "noreturn"
+         "nocallback" "nocf_check" "noduplicate" "noext" "nofree" "noimplicitfloat" "noinline" "nomerge" "nonlazybind" "noprofile" "noredzone" "noreturn"
          "norecurse" "nosync" "noundef" "nounwind" "nosanitize_bounds" "nosanitize_coverage" "null_pointer_is_valid" "optdebug" "optforfuzzing" "optnone" "optsize" "preallocated" "readnone" "readonly" "returned" "returns_twice"
          "shadowcallstack" "signext" "speculatable" "speculative_load_hardening" "ssp" "sspreq" "sspstrong" "safestack" "sanitize_address" "sanitize_hwaddress" "sanitize_memtag"
          "sanitize_thread" "sanitize_memory" "strictfp" "swifterror" "uwtable" "vscale_range" "willreturn" "writeonly" "zeroext") 'symbols) . font-lock-constant-face)
diff --git a/llvm/utils/git/code-format-helper.py b/llvm/utils/git/code-format-helper.py
index 36fc5ee..48a338a 100755
--- a/llvm/utils/git/code-format-helper.py
+++ b/llvm/utils/git/code-format-helper.py
@@ -379,6 +379,10 @@ You can test this locally with the following command:
         # Each file is prefixed like:
         # diff --git a/file b/file
         for file in re.split("^diff --git ", stdout, 0, re.MULTILINE):
+            # We skip checking in MIR files as undef is a valid token and not
+            # going away.
+            if file.endswith(".mir"):
+                continue
             # search for additions of undef
             if re.search(r"^[+](?!\s*#\s*).*(\bundef\b|UndefValue::get)", file, re.MULTILINE):
                 files.append(re.match("a/([^ ]+)", file.splitlines()[0])[1])
diff --git a/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/bugprone/BUILD.gn b/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/bugprone/BUILD.gn
index 61e4f8d..670f24c 100644
--- a/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/bugprone/BUILD.gn
+++ b/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/bugprone/BUILD.gn
@@ -54,6 +54,7 @@ static_library("bugprone") {
     "MultiLevelImplicitPointerConversionCheck.cpp",
     "MultipleNewInOneExpressionCheck.cpp",
     "MultipleStatementMacroCheck.cpp",
+    "NarrowingConversionsCheck.cpp",
     "NoEscapeCheck.cpp",
     "NonZeroEnumToBoolConversionCheck.cpp",
     "NondeterministicPointerIterationOrderCheck.cpp",
diff --git a/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/cppcoreguidelines/BUILD.gn b/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/cppcoreguidelines/BUILD.gn
index be444d4..a06b2f1 100644
--- a/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/cppcoreguidelines/BUILD.gn
+++ b/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/cppcoreguidelines/BUILD.gn
@@ -28,7 +28,6 @@ static_library("cppcoreguidelines") {
     "MacroUsageCheck.cpp",
     "MisleadingCaptureDefaultByValueCheck.cpp",
     "MissingStdForwardCheck.cpp",
-    "NarrowingConversionsCheck.cpp",
     "NoMallocCheck.cpp",
     "NoSuspendWithLockCheck.cpp",
     "OwningMemoryCheck.cpp",
diff --git a/llvm/utils/gn/secondary/clang-tools-extra/clangd/unittests/BUILD.gn b/llvm/utils/gn/secondary/clang-tools-extra/clangd/unittests/BUILD.gn
index 7deefe9..c79d5ad 100644
--- a/llvm/utils/gn/secondary/clang-tools-extra/clangd/unittests/BUILD.gn
+++ b/llvm/utils/gn/secondary/clang-tools-extra/clangd/unittests/BUILD.gn
@@ -80,6 +80,7 @@ unittest("ClangdTests") {
     "GlobalCompilationDatabaseTests.cpp",
     "HeaderSourceSwitchTests.cpp",
     "HeadersTests.cpp",
+    "HeuristicResolverTests.cpp",
     "HoverTests.cpp",
     "IncludeCleanerTests.cpp",
     "IndexActionTests.cpp",
diff --git a/llvm/utils/gn/secondary/clang/include/clang/Basic/BUILD.gn b/llvm/utils/gn/secondary/clang/include/clang/Basic/BUILD.gn
index c945c8a..d8c4d8a 100644
--- a/llvm/utils/gn/secondary/clang/include/clang/Basic/BUILD.gn
+++ b/llvm/utils/gn/secondary/clang/include/clang/Basic/BUILD.gn
@@ -95,10 +95,18 @@ clang_tablegen("BuiltinsRISCV") {
   args = [ "-gen-clang-builtins" ]
 }
 
+clang_tablegen("BuiltinsSPIRV") {
+  args = [ "-gen-clang-builtins" ]
+}
+
 clang_tablegen("BuiltinsX86") {
   args = [ "-gen-clang-builtins" ]
 }
 
+clang_tablegen("BuiltinsX86_64") {
+  args = [ "-gen-clang-builtins" ]
+}
+
 # ARM CDE, MVE, and NEON.
 
 clang_tablegen("arm_neon") {
diff --git a/llvm/utils/gn/secondary/clang/lib/Basic/BUILD.gn b/llvm/utils/gn/secondary/clang/lib/Basic/BUILD.gn
index 31b4ba6..d759ff4 100644
--- a/llvm/utils/gn/secondary/clang/lib/Basic/BUILD.gn
+++ b/llvm/utils/gn/secondary/clang/lib/Basic/BUILD.gn
@@ -26,7 +26,9 @@ static_library("Basic") {
     "//clang/include/clang/Basic:Builtins",
     "//clang/include/clang/Basic:BuiltinsBPF",
     "//clang/include/clang/Basic:BuiltinsRISCV",
+    "//clang/include/clang/Basic:BuiltinsSPIRV",
     "//clang/include/clang/Basic:BuiltinsX86",
+    "//clang/include/clang/Basic:BuiltinsX86_64",
     "//clang/include/clang/Basic:DiagnosticGroups",
     "//clang/include/clang/Basic:RegularKeywordAttrInfo",
     "//clang/include/clang/Basic:arm_cde_builtins",
@@ -128,6 +130,7 @@ static_library("Basic") {
     "Targets/WebAssembly.cpp",
     "Targets/X86.cpp",
     "Targets/XCore.cpp",
+    "Targets/Xtensa.cpp",
     "TokenKinds.cpp",
     "TypeTraits.cpp",
     "Version.cpp",
diff --git a/llvm/utils/gn/secondary/clang/lib/Driver/BUILD.gn b/llvm/utils/gn/secondary/clang/lib/Driver/BUILD.gn
index 615c11b..5b0b680 100644
--- a/llvm/utils/gn/secondary/clang/lib/Driver/BUILD.gn
+++ b/llvm/utils/gn/secondary/clang/lib/Driver/BUILD.gn
@@ -94,6 +94,8 @@ static_library("Driver") {
     "ToolChains/PS4CPU.cpp",
     "ToolChains/RISCVToolchain.cpp",
     "ToolChains/SPIRV.cpp",
+    "ToolChains/SPIRVOpenMP.cpp",
+    "ToolChains/SYCL.cpp",
     "ToolChains/Solaris.cpp",
     "ToolChains/TCE.cpp",
     "ToolChains/UEFI.cpp",
diff --git a/llvm/utils/gn/secondary/clang/lib/Sema/BUILD.gn b/llvm/utils/gn/secondary/clang/lib/Sema/BUILD.gn
index bcd3677..fd2ac58 100644
--- a/llvm/utils/gn/secondary/clang/lib/Sema/BUILD.gn
+++ b/llvm/utils/gn/secondary/clang/lib/Sema/BUILD.gn
@@ -100,6 +100,7 @@ static_library("Sema") {
     "SemaPPC.cpp",
     "SemaPseudoObject.cpp",
     "SemaRISCV.cpp",
+    "SemaSPIRV.cpp",
     "SemaSYCL.cpp",
     "SemaStmt.cpp",
     "SemaStmtAsm.cpp",
diff --git a/llvm/utils/gn/secondary/compiler-rt/lib/builtins/BUILD.gn b/llvm/utils/gn/secondary/compiler-rt/lib/builtins/BUILD.gn
index 6734329..d104825 100644
--- a/llvm/utils/gn/secondary/compiler-rt/lib/builtins/BUILD.gn
+++ b/llvm/utils/gn/secondary/compiler-rt/lib/builtins/BUILD.gn
@@ -299,6 +299,7 @@ static_library("builtins") {
         "mulxc3.c",
         "powixf2.c",
         "trunctfxf2.c",
+        "truncxfhf2.c",
       ]
     }
   }
diff --git a/llvm/utils/gn/secondary/compiler-rt/lib/lsan/BUILD.gn b/llvm/utils/gn/secondary/compiler-rt/lib/lsan/BUILD.gn
index 77b2510..2ac06c4 100644
--- a/llvm/utils/gn/secondary/compiler-rt/lib/lsan/BUILD.gn
+++ b/llvm/utils/gn/secondary/compiler-rt/lib/lsan/BUILD.gn
@@ -1,7 +1,6 @@
 source_set("common_sources") {
   configs -= [ "//llvm/utils/gn/build:llvm_code" ]
   configs += [ "//llvm/utils/gn/build:crt_code" ]
-  defines = [ "UBSAN_CAN_USE_CXXABI" ]
   deps = [
     "//compiler-rt/lib/interception:sources",
     "//compiler-rt/lib/sanitizer_common:sources",
@@ -18,7 +17,6 @@ source_set("common_sources") {
 source_set("sources") {
   configs -= [ "//llvm/utils/gn/build:llvm_code" ]
   configs += [ "//llvm/utils/gn/build:crt_code" ]
-  defines = [ "UBSAN_CAN_USE_CXXABI" ]
   deps = [
     "//compiler-rt/lib/interception:sources",
     "//compiler-rt/lib/sanitizer_common:sources",
diff --git a/llvm/utils/gn/secondary/compiler-rt/lib/ubsan/BUILD.gn b/llvm/utils/gn/secondary/compiler-rt/lib/ubsan/BUILD.gn
index d3b4a40..c331193 100644
--- a/llvm/utils/gn/secondary/compiler-rt/lib/ubsan/BUILD.gn
+++ b/llvm/utils/gn/secondary/compiler-rt/lib/ubsan/BUILD.gn
@@ -27,7 +27,6 @@ gen_version_script("version_script") {
 source_set("sources") {
   configs -= [ "//llvm/utils/gn/build:llvm_code" ]
   configs += [ "//llvm/utils/gn/build:crt_code" ]
-  defines = [ "UBSAN_CAN_USE_CXXABI" ]
   deps = [
     "//compiler-rt/lib/interception:sources",
     "//compiler-rt/lib/sanitizer_common:sources",
@@ -65,7 +64,6 @@ source_set("standalone_sources") {
   configs -= [ "//llvm/utils/gn/build:llvm_code" ]
   configs -= [ "//llvm/utils/gn/build:no_rtti" ]
   configs += [ "//llvm/utils/gn/build:crt_code" ]
-  defines = [ "UBSAN_CAN_USE_CXXABI" ]
   sources = [
     "ubsan_diag_standalone.cpp",
     "ubsan_init_standalone.cpp",
@@ -77,7 +75,6 @@ source_set("cxx_sources") {
   configs -= [ "//llvm/utils/gn/build:llvm_code" ]
   configs -= [ "//llvm/utils/gn/build:no_rtti" ]
   configs += [ "//llvm/utils/gn/build:crt_code" ]
-  defines = [ "UBSAN_CAN_USE_CXXABI" ]
   sources = [
     "ubsan_handlers_cxx.cpp",
     "ubsan_handlers_cxx.h",
diff --git a/llvm/utils/gn/secondary/lldb/source/Host/BUILD.gn b/llvm/utils/gn/secondary/lldb/source/Host/BUILD.gn
index 7c9edfb..d74de40 100644
--- a/llvm/utils/gn/secondary/lldb/source/Host/BUILD.gn
+++ b/llvm/utils/gn/secondary/lldb/source/Host/BUILD.gn
@@ -16,6 +16,7 @@ static_library("Host") {
   ]
   public_deps = [ "//llvm/utils/gn/build/libs/xml" ]
   sources = [
+    "aix/HostInfoAIX.cpp",
     "common/Alarm.cpp",
     "common/File.cpp",
     "common/FileAction.cpp",
diff --git a/llvm/utils/gn/secondary/llvm/lib/Telemetry/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/Telemetry/BUILD.gn
new file mode 100644
index 0000000..82c1152
--- /dev/null
+++ b/llvm/utils/gn/secondary/llvm/lib/Telemetry/BUILD.gn
@@ -0,0 +1,5 @@
+static_library("Telemetry") {
+  output_name = "LLVMTelemetry"
+  deps = [ "//llvm/lib/Support" ]
+  sources = [ "Telemetry.cpp" ]
+}
diff --git a/llvm/utils/gn/secondary/llvm/tools/llvm-exegesis/lib/RISCV/BUILD.gn b/llvm/utils/gn/secondary/llvm/tools/llvm-exegesis/lib/RISCV/BUILD.gn
index c334b54..7c6cdd0 100644
--- a/llvm/utils/gn/secondary/llvm/tools/llvm-exegesis/lib/RISCV/BUILD.gn
+++ b/llvm/utils/gn/secondary/llvm/tools/llvm-exegesis/lib/RISCV/BUILD.gn
@@ -1,6 +1,14 @@
+import("//llvm/utils/TableGen/tablegen.gni")
+
+tablegen("RISCVGenExegesis") {
+  args = [ "-gen-exegesis" ]
+  td_file = "//llvm/lib/Target/RISCV/RISCV.td"
+}
+
 static_library("RISCV") {
   output_name = "LLVMExegesisRISCV"
   deps = [
+    ":RISCVGenExegesis",
     "//llvm/lib/CodeGen",
     "//llvm/lib/IR",
     "//llvm/lib/Support",
diff --git a/llvm/utils/gn/secondary/llvm/unittests/BUILD.gn b/llvm/utils/gn/secondary/llvm/unittests/BUILD.gn
index 9527909..0d01bfa 100644
--- a/llvm/utils/gn/secondary/llvm/unittests/BUILD.gn
+++ b/llvm/utils/gn/secondary/llvm/unittests/BUILD.gn
@@ -50,6 +50,7 @@ group("unittests") {
     "TableGen:TableGenTests",
     "Target:TargetMachineCTests",
     "TargetParser:TargetParserTests",
+    "Telemetry:TelemetryTests",
     "Testing/ADT:TestingADTTests",
     "Testing/Support:TestingSupportTests",
     "TextAPI:TextAPITests",
@@ -101,7 +102,10 @@ group("unittests") {
     ]
   }
   if (llvm_build_RISCV) {
-    deps += [ "Target/RISCV:RISCVTests" ]
+    deps += [
+      "Target/RISCV:RISCVTests",
+      "tools/llvm-exegesis/RISCV:LLVMExegesisRISCVTests",
+    ]
   }
   if (llvm_build_SystemZ) {
     deps += [ "MC/SystemZ:SystemZAsmLexerTests" ]
diff --git a/llvm/utils/gn/secondary/llvm/unittests/Support/BUILD.gn b/llvm/utils/gn/secondary/llvm/unittests/Support/BUILD.gn
index 47b03b4..bf6a0b7 100644
--- a/llvm/utils/gn/secondary/llvm/unittests/Support/BUILD.gn
+++ b/llvm/utils/gn/secondary/llvm/unittests/Support/BUILD.gn
@@ -73,6 +73,7 @@ unittest("SupportTests") {
     "ProcessTest.cpp",
     "ProgramTest.cpp",
     "RISCVAttributeParserTest.cpp",
+    "RecyclerTest.cpp",
     "RegexTest.cpp",
     "ReplaceFileTest.cpp",
     "ReverseIterationTest.cpp",
diff --git a/llvm/utils/gn/secondary/llvm/unittests/Telemetry/BUILD.gn b/llvm/utils/gn/secondary/llvm/unittests/Telemetry/BUILD.gn
new file mode 100644
index 0000000..47ecf35
--- /dev/null
+++ b/llvm/utils/gn/secondary/llvm/unittests/Telemetry/BUILD.gn
@@ -0,0 +1,10 @@
+import("//third-party/unittest/unittest.gni")
+
+unittest("TelemetryTests") {
+  deps = [
+    "//llvm/lib/IR",
+    "//llvm/lib/Support",
+    "//llvm/lib/Telemetry",
+  ]
+  sources = [ "TelemetryTest.cpp" ]
+}
diff --git a/llvm/utils/gn/secondary/llvm/unittests/tools/llvm-exegesis/RISCV/BUILD.gn b/llvm/utils/gn/secondary/llvm/unittests/tools/llvm-exegesis/RISCV/BUILD.gn
new file mode 100644
index 0000000..d1db867
--- /dev/null
+++ b/llvm/utils/gn/secondary/llvm/unittests/tools/llvm-exegesis/RISCV/BUILD.gn
@@ -0,0 +1,26 @@
+import("//third-party/unittest/unittest.gni")
+
+unittest("LLVMExegesisRISCVTests") {
+  deps = [
+    "//llvm/lib/DebugInfo/Symbolize",
+    "//llvm/lib/MC",
+    "//llvm/lib/MC/MCParser",
+    "//llvm/lib/Object",
+    "//llvm/lib/Support",
+    "//llvm/lib/Target/RISCV",
+
+    # Exegesis reaches inside the Target/RISCV tablegen internals and must
+    # depend on these Target/RISCV-internal build targets.
+    "//llvm/lib/Target/RISCV/MCTargetDesc",
+    "//llvm/tools/llvm-exegesis/lib",
+    "//llvm/tools/llvm-exegesis/lib/RISCV",
+  ]
+  include_dirs = [
+    "//llvm/lib/Target/RISCV",
+    "//llvm/tools/llvm-exegesis/lib",
+  ]
+  sources = [
+    "SnippetGeneratorTest.cpp",
+    "TargetTest.cpp",
+  ]
+}
diff --git a/llvm/utils/gn/secondary/llvm/utils/TableGen/BUILD.gn b/llvm/utils/gn/secondary/llvm/utils/TableGen/BUILD.gn
index ba52a97..e2daa1e 100644
--- a/llvm/utils/gn/secondary/llvm/utils/TableGen/BUILD.gn
+++ b/llvm/utils/gn/secondary/llvm/utils/TableGen/BUILD.gn
@@ -1,29 +1,13 @@
-source_set("llvm-min-tblgen-sources") {
-  sources = [
-    "ARMTargetDefEmitter.cpp",
-    "Attributes.cpp",
-    "DirectiveEmitter.cpp",
-    "IntrinsicEmitter.cpp",
-    "RISCVTargetDefEmitter.cpp",
-    "TableGen.cpp",
-    "VTEmitter.cpp",
-  ]
-  deps = [
-    "Basic",
-    "//llvm/lib/Support",
-  ]
-}
-
 executable("llvm-min-tblgen") {
+  sources = [ "llvm-min-tblgen.cpp" ]
   deps = [
-    ":llvm-min-tblgen-sources",
     "Basic",
+    "//llvm/lib/Support",
   ]
 }
 
 executable("llvm-tblgen") {
   deps = [
-    ":llvm-min-tblgen-sources",
     "Basic",
     "Common",
     "//llvm/include/llvm/Config:llvm-config",
@@ -55,6 +39,7 @@ executable("llvm-tblgen") {
     "GlobalISelEmitter.cpp",
     "InstrDocsEmitter.cpp",
     "InstrInfoEmitter.cpp",
+    "llvm-tblgen.cpp",
     "MacroFusionPredicatorEmitter.cpp",
     "OptionParserEmitter.cpp",
     "OptionRSTEmitter.cpp",
diff --git a/llvm/utils/gn/secondary/llvm/utils/TableGen/Basic/BUILD.gn b/llvm/utils/gn/secondary/llvm/utils/TableGen/Basic/BUILD.gn
index 2ebe393..ef6d6e4 100644
--- a/llvm/utils/gn/secondary/llvm/utils/TableGen/Basic/BUILD.gn
+++ b/llvm/utils/gn/secondary/llvm/utils/TableGen/Basic/BUILD.gn
@@ -1,10 +1,17 @@
-static_library("Basic") {
+source_set("Basic") {
   deps = [
     "//llvm/lib/Support",
     "//llvm/lib/TableGen",
   ]
   sources = [
+    "ARMTargetDefEmitter.cpp",
+    "Attributes.cpp",
     "CodeGenIntrinsics.cpp",
+    "DirectiveEmitter.cpp",
+    "IntrinsicEmitter.cpp",
+    "RISCVTargetDefEmitter.cpp",
     "SDNodeProperties.cpp",
+    "TableGen.cpp",
+    "VTEmitter.cpp",
   ]
 }
diff --git a/llvm/utils/gn/secondary/llvm/utils/TableGen/Common/BUILD.gn b/llvm/utils/gn/secondary/llvm/utils/TableGen/Common/BUILD.gn
index c46e7cb..db11e56 100644
--- a/llvm/utils/gn/secondary/llvm/utils/TableGen/Common/BUILD.gn
+++ b/llvm/utils/gn/secondary/llvm/utils/TableGen/Common/BUILD.gn
@@ -1,4 +1,5 @@
 static_library("Common") {
+  output_name = "LLVMTableGenCommon"
   deps = [
     "//llvm/include/llvm/CodeGen:GenVT",
     "//llvm/lib/CodeGenTypes",
diff --git a/llvm/utils/lit/tests/shtest-format.py b/llvm/utils/lit/tests/shtest-format.py
index 3a19595..fda3ef5 100644
--- a/llvm/utils/lit/tests/shtest-format.py
+++ b/llvm/utils/lit/tests/shtest-format.py
@@ -18,7 +18,7 @@
 # CHECK: Command Output (stderr):
 # CHECK-NEXT: --
 # CHECK-NOT: --
-# CHECK: cat{{(_64)?(\.exe)?}}: {{cannot open does-not-exist|does-not-exist: No such file or directory}}
+# CHECK: cat{{(_64)?(\.exe)?}}: {{(cannot open does-not-exist|.*does-not-exist.*: No such file or directory)}}
 # CHECK: --
 
 # CHECK: FAIL: shtest-format :: external_shell/fail_with_bad_encoding.txt
diff --git a/llvm/utils/mlgo-utils/CMakeLists.txt b/llvm/utils/mlgo-utils/CMakeLists.txt
index 2f39206..d9b2bdc 100644
--- a/llvm/utils/mlgo-utils/CMakeLists.txt
+++ b/llvm/utils/mlgo-utils/CMakeLists.txt
@@ -1,9 +1,11 @@
-configure_lit_site_cfg(
-  "${CMAKE_CURRENT_SOURCE_DIR}/tests/lit.site.cfg.in"
-  "${CMAKE_CURRENT_BINARY_DIR}/lit.site.cfg"
-)
+if(LLVM_INCLUDE_TESTS)
+  configure_lit_site_cfg(
+    "${CMAKE_CURRENT_SOURCE_DIR}/tests/lit.site.cfg.in"
+    "${CMAKE_CURRENT_BINARY_DIR}/lit.site.cfg"
+  )
 
-add_lit_testsuite(check-mlgo-utils "Running mlgo-utils tests"
-  ${CMAKE_CURRENT_BINARY_DIR}
-  DEPENDS "FileCheck" "not" "count" "split-file" "yaml2obj" "llvm-objcopy"
-)
+  add_lit_testsuite(check-mlgo-utils "Running mlgo-utils tests"
+    ${CMAKE_CURRENT_BINARY_DIR}
+    DEPENDS "FileCheck" "not" "count" "split-file" "yaml2obj" "llvm-objcopy"
+  )
+endif()
diff --git a/mlir/CMakeLists.txt b/mlir/CMakeLists.txt
index 0608eef..a888ac2 100644
--- a/mlir/CMakeLists.txt
+++ b/mlir/CMakeLists.txt
@@ -27,15 +27,19 @@ if(MLIR_STANDALONE_BUILD)
 
   include_directories(${LLVM_INCLUDE_DIRS})
 
-  set(UNITTEST_DIR ${LLVM_THIRD_PARTY_DIR}/unittest)
-  if(EXISTS ${UNITTEST_DIR}/googletest/include/gtest/gtest.h)
-    add_subdirectory(${UNITTEST_DIR} third-party/unittest)
-  endif()
-
   set(CMAKE_LIBRARY_OUTPUT_DIRECTORY
     "${CMAKE_CURRENT_BINARY_DIR}/lib${LLVM_LIBDIR_SUFFIX}")
   set(CMAKE_RUNTIME_OUTPUT_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/bin")
 
+  # These definitions are needed to fill SHLIBDIR in tests.
+  set(LLVM_RUNTIME_OUTPUT_INTDIR ${CMAKE_BINARY_DIR}/${CMAKE_CFG_INTDIR}/bin)
+  set(LLVM_LIBRARY_OUTPUT_INTDIR ${CMAKE_BINARY_DIR}/${CMAKE_CFG_INTDIR}/lib${LLVM_LIBDIR_SUFFIX})
+  if(WIN32 OR CYGWIN)
+    # DLL platform -- put DLLs into bin.
+    set(LLVM_SHLIB_OUTPUT_INTDIR ${LLVM_RUNTIME_OUTPUT_INTDIR})
+  else()
+    set(LLVM_SHLIB_OUTPUT_INTDIR ${LLVM_LIBRARY_OUTPUT_INTDIR})
+  endif()
   set(LLVM_LIT_ARGS "-sv" CACHE STRING "Default options for lit")
 endif()
 
@@ -166,7 +170,7 @@ configure_file(
 #   The pybind11 library can be found (set with -DPYBIND_DIR=...)
 #   The python executable is correct (set with -DPython3_EXECUTABLE=...)
 # By default, find_package and probing for installed pybind11 is performed.
-# Super projects can set MLIR_DISABLE_CONFIGURE_PYTHON_DEV_PACKAGES=ON to 
+# Super projects can set MLIR_DISABLE_CONFIGURE_PYTHON_DEV_PACKAGES=ON to
 # disable all package setup and control it themselves.
 #-------------------------------------------------------------------------------
 
@@ -192,8 +196,10 @@ endif()
 
 set(CMAKE_INCLUDE_CURRENT_DIR ON)
 
-include_directories( "include")
-include_directories( ${MLIR_INCLUDE_DIR})
+include_directories(BEFORE
+  "include"
+  ${MLIR_INCLUDE_DIR}
+  )
 
 # Adding tools/mlir-tblgen here as calling add_tablegen sets some variables like
 # MLIR_TABLEGEN_EXE in PARENT_SCOPE which gets lost if that folder is included
@@ -218,7 +224,7 @@ if (MLIR_INCLUDE_TESTS)
   add_definitions(-DMLIR_INCLUDE_TESTS)
   add_custom_target(MLIRUnitTests)
   set_target_properties(MLIRUnitTests PROPERTIES FOLDER "MLIR/Tests")
-  if (EXISTS ${LLVM_THIRD_PARTY_DIR}/unittest/googletest/include/gtest/gtest.h)
+  if (TARGET llvm_gtest)
     add_subdirectory(unittests)
   else()
     message(WARNING "gtest not found, unittests will not be available")
diff --git a/mlir/cmake/modules/AddMLIR.cmake b/mlir/cmake/modules/AddMLIR.cmake
index e1e7959..9c7b00b 100644
--- a/mlir/cmake/modules/AddMLIR.cmake
+++ b/mlir/cmake/modules/AddMLIR.cmake
@@ -584,7 +584,7 @@ function(add_mlir_aggregate name)
   # TODO: Should be transitive.
   set_target_properties(${name} PROPERTIES
     MLIR_AGGREGATE_EXCLUDE_LIBS "${_embed_libs}")
-  if(MSVC)
+  if(WIN32)
     set_property(TARGET ${name} PROPERTY WINDOWS_EXPORT_ALL_SYMBOLS ON)
   endif()
 
diff --git a/mlir/cmake/modules/AddMLIRPython.cmake b/mlir/cmake/modules/AddMLIRPython.cmake
index 53a7013..717a503 100644
--- a/mlir/cmake/modules/AddMLIRPython.cmake
+++ b/mlir/cmake/modules/AddMLIRPython.cmake
@@ -512,7 +512,7 @@ function(add_mlir_python_common_capi_library name)
   )
   add_dependencies(${name} ${_header_sources_target})
 
-  if(MSVC)
+  if(WIN32)
     set_property(TARGET ${name} PROPERTY WINDOWS_EXPORT_ALL_SYMBOLS ON)
   endif()
   set_target_properties(${name} PROPERTIES
@@ -649,6 +649,15 @@ function(add_mlir_python_extension libname extname)
     message(FATAL_ERROR "Unhandled arguments to add_mlir_python_extension(${libname}, ... : ${ARG_UNPARSED_ARGUMENTS}")
   endif()
 
+  # The extension itself must be compiled with RTTI and exceptions enabled.
+  # Also, some warning classes triggered by pybind11 are disabled.
+  set(eh_rtti_enable)
+  if (MSVC)
+    set(eh_rtti_enable /EHsc /GR)
+  elseif(LLVM_COMPILER_IS_GCC_COMPATIBLE OR CLANG_CL)
+    set(eh_rtti_enable -frtti -fexceptions)
+  endif ()
+
   # The actual extension library produces a shared-object or DLL and has
   # sources that must be compiled in accordance with pybind11 needs (RTTI and
   # exceptions).
@@ -671,18 +680,18 @@ function(add_mlir_python_extension libname extname)
           -Wno-nested-anon-types
           -Wno-c++98-compat-extra-semi
           -Wno-covered-switch-default
+          ${eh_rtti_enable}
       )
     endif()
+    
+    if(APPLE)
+      # NanobindAdaptors.h uses PyClassMethod_New to build `pure_subclass`es but nanobind
+      # doesn't declare this API as undefined in its linker flags. So we need to declare it as such
+      # for downstream users that do not do something like `-undefined dynamic_lookup`.
+      set(CMAKE_MODULE_LINKER_FLAGS "${CMAKE_MODULE_LINKER_FLAGS} -Wl,-U -Wl,_PyClassMethod_New")
+    endif()
   endif()
 
-  # The extension itself must be compiled with RTTI and exceptions enabled.
-  # Also, some warning classes triggered by pybind11 are disabled.
-  set(eh_rtti_enable)
-  if (MSVC)
-    set(eh_rtti_enable /EHsc /GR)
-  elseif(LLVM_COMPILER_IS_GCC_COMPATIBLE)
-    set(eh_rtti_enable -frtti -fexceptions)
-  endif ()
   target_compile_options(${libname} PRIVATE ${eh_rtti_enable})
 
   # Configure the output to match python expectations.
diff --git a/mlir/docs/Bindings/Python.md b/mlir/docs/Bindings/Python.md
index a0bd1ca..32df331 100644
--- a/mlir/docs/Bindings/Python.md
+++ b/mlir/docs/Bindings/Python.md
@@ -1035,7 +1035,7 @@ class ConstantOp(_ods_ir.OpView):
     ...
 ```
 
-expects `value` to be a `TypedAttr` (e.g., `IntegerAttr` or `FloatAttr`). 
+expects `value` to be a `TypedAttr` (e.g., `IntegerAttr` or `FloatAttr`).
 Thus, a natural extension is a builder that accepts a MLIR type and a Python value and instantiates the appropriate `TypedAttr`:
 
 ```python
@@ -1181,9 +1181,9 @@ make the passes available along with the dialect.
 Dialect functionality other than IR objects or passes, such as helper functions,
 can be exposed to Python similarly to attributes and types. C API is expected to
 exist for this functionality, which can then be wrapped using pybind11 and
-`[include/mlir/Bindings/Python/PybindAdaptors.h](https://github.com/llvm/llvm-project/blob/main/mlir/include/mlir/Bindings/Python/PybindAdaptors.h)`,
+[`include/mlir/Bindings/Python/PybindAdaptors.h`](https://github.com/llvm/llvm-project/blob/main/mlir/include/mlir/Bindings/Python/PybindAdaptors.h),
 or nanobind and
-`[include/mlir/Bindings/Python/NanobindAdaptors.h](https://github.com/llvm/llvm-project/blob/main/mlir/include/mlir/Bindings/Python/NanobindAdaptors.h)`
+[`include/mlir/Bindings/Python/NanobindAdaptors.h`](https://github.com/llvm/llvm-project/blob/main/mlir/include/mlir/Bindings/Python/NanobindAdaptors.h)
 utilities to connect to the rest of Python API. The bindings can be located in a
 separate module or in the same module as attributes and types, and
 loaded along with the dialect.
diff --git a/mlir/docs/DialectConversion.md b/mlir/docs/DialectConversion.md
index 3168f5e..abacd5a 100644
--- a/mlir/docs/DialectConversion.md
+++ b/mlir/docs/DialectConversion.md
@@ -242,19 +242,6 @@ cannot. These materializations are used by the conversion framework to ensure
 type safety during the conversion process. There are several types of
 materializations depending on the situation.
 
-*   Argument Materialization
-
-    -   An argument materialization is used when converting the type of a block
-        argument during a [signature conversion](#region-signature-conversion).
-        The new block argument types are specified in a `SignatureConversion`
-        object. An original block argument can be converted into multiple
-        block arguments, which is not supported everywhere in the dialect
-        conversion. (E.g., adaptors support only a single replacement value for
-        each original value.) Therefore, an argument materialization is used to
-        convert potentially multiple new block arguments back into a single SSA
-        value. An argument materialization is also used when replacing an op
-        result with multiple values.
-
 *   Source Materialization
 
     -   A source materialization is used when a value was replaced with a value
@@ -344,17 +331,6 @@ class TypeConverter {
   /// persist after the conversion has finished.
 
   /// This method registers a materialization that will be called when
-  /// converting (potentially multiple) block arguments that were the result of
-  /// a signature conversion of a single block argument, to a single SSA value
-  /// with the old argument type.
-  template <typename FnT,
-            typename T = typename llvm::function_traits<FnT>::template arg_t<1>>
-  void addArgumentMaterialization(FnT &&callback) {
-    argumentMaterializations.emplace_back(
-        wrapMaterialization<T>(std::forward<FnT>(callback)));
-  }
-
-  /// This method registers a materialization that will be called when
   /// converting a replacement value back to its original source type.
   /// This is used when some uses of the original value persist beyond the main
   /// conversion.
@@ -406,12 +382,11 @@ done explicitly via a conversion pattern.
 To convert the types of block arguments within a Region, a custom hook on the
 `ConversionPatternRewriter` must be invoked; `convertRegionTypes`. This hook
 uses a provided type converter to apply type conversions to all blocks of a
-given region. As noted above, the conversions performed by this method use the
-argument materialization hook on the `TypeConverter`. This hook also takes an
-optional `TypeConverter::SignatureConversion` parameter that applies a custom
-conversion to the entry block of the region. The types of the entry block
-arguments are often tied semantically to the operation, e.g.,
-`func::FuncOp`, `AffineForOp`, etc.
+given region. This hook also takes an optional
+`TypeConverter::SignatureConversion` parameter that applies a custom conversion
+to the entry block of the region. The types of the entry block arguments are
+often tied semantically to the operation, e.g., `func::FuncOp`, `AffineForOp`,
+etc.
 
 To convert the signature of just one given block, the
 `applySignatureConversion` hook can be used.
diff --git a/mlir/docs/TargetLLVMIR.md b/mlir/docs/TargetLLVMIR.md
index 96a4589..3a2f44f 100644
--- a/mlir/docs/TargetLLVMIR.md
+++ b/mlir/docs/TargetLLVMIR.md
@@ -646,7 +646,7 @@ Examples:
 
 ```mlir
 
-func.func @qux(%arg0: memref<?x?xf32>)
+func.func @qux(%arg0: memref<?x?xf32>) attributes {llvm.emit_c_interface}
 
 // Gets converted into the following
 // (using type alias for brevity):
@@ -683,8 +683,18 @@ llvm.func @qux(%arg0: !llvm.ptr, %arg1: !llvm.ptr,
 llvm.func @_mlir_ciface_qux(!llvm.ptr)
 ```
 
+
+```cpp
+// The C function implementation for the interface function.
+extern "C" {
+void _mlir_ciface_qux(MemRefDescriptor<float, 2> *input) {
+  // detailed impl
+}
+}
+```
+
 ```mlir
-func.func @foo(%arg0: memref<?x?xf32>) {
+func.func @foo(%arg0: memref<?x?xf32>) attributes {llvm.emit_c_interface} {
   return
 }
 
@@ -719,8 +729,15 @@ llvm.func @_mlir_ciface_foo(%arg0: !llvm.ptr) {
 }
 ```
 
+```cpp
+// The C function signature for the interface function.
+extern "C" {
+void _mlir_ciface_foo(MemRefDescriptor<float, 2> *input);
+}
+```
+
 ```mlir
-func.func @foo(%arg0: memref<?x?xf32>) -> memref<?x?xf32> {
+func.func @foo(%arg0: memref<?x?xf32>) -> memref<?x?xf32> attributes {llvm.emit_c_interface} {
   return %arg0 : memref<?x?xf32>
 }
 
@@ -744,6 +761,7 @@ llvm.func @foo(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64,
 }
 
 // Interface function callable from C.
+// NOTE: the returned memref becomes the first argument
 llvm.func @_mlir_ciface_foo(%arg0: !llvm.ptr, %arg1: !llvm.ptr) {
   %0 = llvm.load %arg1 : !llvm.ptr
   %1 = llvm.extractvalue %0[0] : !llvm.memref_2d
@@ -760,6 +778,14 @@ llvm.func @_mlir_ciface_foo(%arg0: !llvm.ptr, %arg1: !llvm.ptr) {
 }
 ```
 
+```cpp
+// The C function signature for the interface function.
+extern "C" {
+void _mlir_ciface_foo(MemRefDescriptor<float, 2> *output,
+                      MemRefDescriptor<float, 2> *input);
+}
+```
+
 Rationale: Introducing auxiliary functions for C-compatible interfaces is
 preferred to modifying the calling convention since it will minimize the effect
 of C compatibility on intra-module calls or calls between MLIR-generated
diff --git a/mlir/docs/Tutorials/Toy/Ch-2.md b/mlir/docs/Tutorials/Toy/Ch-2.md
index b807ee3..039417c 100644
--- a/mlir/docs/Tutorials/Toy/Ch-2.md
+++ b/mlir/docs/Tutorials/Toy/Ch-2.md
@@ -262,7 +262,7 @@ class ConstantOp : public mlir::Op<
                      mlir::OpTrait::OneResult,
                      /// We also provide a utility `getType` accessor that
                      /// returns the TensorType of the single result.
-                     mlir::OpTraits::OneTypedResult<TensorType>::Impl> {
+                     mlir::OpTrait::OneTypedResult<TensorType>::Impl> {
 
  public:
   /// Inherit the constructors from the base Op class.
diff --git a/mlir/examples/toy/Ch7/mlir/LowerToLLVM.cpp b/mlir/examples/toy/Ch7/mlir/LowerToLLVM.cpp
index 3ad70e7..123d114 100644
--- a/mlir/examples/toy/Ch7/mlir/LowerToLLVM.cpp
+++ b/mlir/examples/toy/Ch7/mlir/LowerToLLVM.cpp
@@ -220,6 +220,7 @@ void ToyToLLVMLoweringPass::runOnOperation() {
   mlir::arith::populateArithToLLVMConversionPatterns(typeConverter, patterns);
   populateFinalizeMemRefToLLVMConversionPatterns(typeConverter, patterns);
   cf::populateControlFlowToLLVMConversionPatterns(typeConverter, patterns);
+  cf::populateAssertToLLVMConversionPattern(typeConverter, patterns);
   populateFuncToLLVMConversionPatterns(typeConverter, patterns);
 
   // The only remaining operation to lower from the `toy` dialect, is the
diff --git a/mlir/include/mlir-c/Dialect/LLVM.h b/mlir/include/mlir-c/Dialect/LLVM.h
index 0992285..26c4140 100644
--- a/mlir/include/mlir-c/Dialect/LLVM.h
+++ b/mlir/include/mlir-c/Dialect/LLVM.h
@@ -45,6 +45,13 @@ MLIR_CAPI_EXPORTED MlirType
 mlirLLVMFunctionTypeGet(MlirType resultType, intptr_t nArgumentTypes,
                         MlirType const *argumentTypes, bool isVarArg);
 
+/// Returns the number of input types.
+MLIR_CAPI_EXPORTED intptr_t mlirLLVMFunctionTypeGetNumInputs(MlirType type);
+
+/// Returns the pos-th input type.
+MLIR_CAPI_EXPORTED MlirType mlirLLVMFunctionTypeGetInput(MlirType type,
+                                                         intptr_t pos);
+
 /// Returns `true` if the type is an LLVM dialect struct type.
 MLIR_CAPI_EXPORTED bool mlirTypeIsALLVMStructType(MlirType type);
 
diff --git a/mlir/include/mlir/Analysis/DataFlowFramework.h b/mlir/include/mlir/Analysis/DataFlowFramework.h
index dfd358e..b6d10ba 100644
--- a/mlir/include/mlir/Analysis/DataFlowFramework.h
+++ b/mlir/include/mlir/Analysis/DataFlowFramework.h
@@ -332,9 +332,11 @@ public:
   /// does not exist.
   template <typename StateT, typename AnchorT>
   const StateT *lookupState(AnchorT anchor) const {
-    auto it =
-        analysisStates.find({LatticeAnchor(anchor), TypeID::get<StateT>()});
-    if (it == analysisStates.end())
+    const auto &mapIt = analysisStates.find(LatticeAnchor(anchor));
+    if (mapIt == analysisStates.end())
+      return nullptr;
+    auto it = mapIt->second.find(TypeID::get<StateT>());
+    if (it == mapIt->second.end())
       return nullptr;
     return static_cast<const StateT *>(it->second.get());
   }
@@ -343,11 +345,7 @@ public:
   template <typename AnchorT>
   void eraseState(AnchorT anchor) {
     LatticeAnchor la(anchor);
-
-    for (auto it = analysisStates.begin(); it != analysisStates.end(); ++it) {
-      if (it->first.first == la)
-        analysisStates.erase(it);
-    }
+    analysisStates.erase(LatticeAnchor(anchor));
   }
 
   // Erase all analysis states
@@ -426,7 +424,8 @@ private:
 
   /// A type-erased map of lattice anchors to associated analysis states for
   /// first-class lattice anchors.
-  DenseMap<std::pair<LatticeAnchor, TypeID>, std::unique_ptr<AnalysisState>>
+  DenseMap<LatticeAnchor, DenseMap<TypeID, std::unique_ptr<AnalysisState>>,
+           DenseMapInfo<LatticeAnchor::ParentTy>>
       analysisStates;
 
   /// Allow the base child analysis class to access the internals of the solver.
@@ -643,7 +642,7 @@ AnalysisT *DataFlowSolver::load(Args &&...args) {
 template <typename StateT, typename AnchorT>
 StateT *DataFlowSolver::getOrCreateState(AnchorT anchor) {
   std::unique_ptr<AnalysisState> &state =
-      analysisStates[{LatticeAnchor(anchor), TypeID::get<StateT>()}];
+      analysisStates[LatticeAnchor(anchor)][TypeID::get<StateT>()];
   if (!state) {
     state = std::unique_ptr<StateT>(new StateT(anchor));
 #if LLVM_ENABLE_ABI_BREAKING_CHECKS
@@ -689,10 +688,6 @@ struct DenseMapInfo<mlir::ProgramPoint> {
   }
 };
 
-template <>
-struct DenseMapInfo<mlir::LatticeAnchor>
-    : public DenseMapInfo<mlir::LatticeAnchor::ParentTy> {};
-
 // Allow llvm::cast style functions.
 template <typename To>
 struct CastInfo<To, mlir::LatticeAnchor>
diff --git a/mlir/include/mlir/Conversion/ControlFlowToLLVM/ControlFlowToLLVM.h b/mlir/include/mlir/Conversion/ControlFlowToLLVM/ControlFlowToLLVM.h
index b88c1e8..88f1802 100644
--- a/mlir/include/mlir/Conversion/ControlFlowToLLVM/ControlFlowToLLVM.h
+++ b/mlir/include/mlir/Conversion/ControlFlowToLLVM/ControlFlowToLLVM.h
@@ -29,6 +29,10 @@ namespace cf {
 /// Collect the patterns to convert from the ControlFlow dialect to LLVM. The
 /// conversion patterns capture the LLVMTypeConverter by reference meaning the
 /// references have to remain alive during the entire pattern lifetime.
+///
+/// Note: This function does not populate the default cf.assert lowering. That
+/// is because some platforms have a custom cf.assert lowering. The default
+/// lowering can be populated with `populateAssertToLLVMConversionPattern`.
 void populateControlFlowToLLVMConversionPatterns(
     const LLVMTypeConverter &converter, RewritePatternSet &patterns);
 
diff --git a/mlir/include/mlir/Conversion/SCFToEmitC/SCFToEmitC.h b/mlir/include/mlir/Conversion/SCFToEmitC/SCFToEmitC.h
index 22df7f1..acc39e6 100644
--- a/mlir/include/mlir/Conversion/SCFToEmitC/SCFToEmitC.h
+++ b/mlir/include/mlir/Conversion/SCFToEmitC/SCFToEmitC.h
@@ -9,6 +9,7 @@
 #ifndef MLIR_CONVERSION_SCFTOEMITC_SCFTOEMITC_H
 #define MLIR_CONVERSION_SCFTOEMITC_SCFTOEMITC_H
 
+#include "mlir/Transforms/DialectConversion.h"
 #include <memory>
 
 namespace mlir {
@@ -19,7 +20,8 @@ class RewritePatternSet;
 #include "mlir/Conversion/Passes.h.inc"
 
 /// Collect a set of patterns to convert SCF operations to the EmitC dialect.
-void populateSCFToEmitCConversionPatterns(RewritePatternSet &patterns);
+void populateSCFToEmitCConversionPatterns(RewritePatternSet &patterns,
+                                          TypeConverter &typeConverter);
 } // namespace mlir
 
 #endif // MLIR_CONVERSION_SCFTOEMITC_SCFTOEMITC_H
diff --git a/mlir/include/mlir/Dialect/Affine/IR/AffineOps.td b/mlir/include/mlir/Dialect/Affine/IR/AffineOps.td
index f5ca243..e2eab1f 100644
--- a/mlir/include/mlir/Dialect/Affine/IR/AffineOps.td
+++ b/mlir/include/mlir/Dialect/Affine/IR/AffineOps.td
@@ -1083,6 +1083,9 @@ def AffineDelinearizeIndexOp : Affine_Op<"delinearize_index", [Pure]> {
     %indices_2 = affine.apply #map2()[%linear_index]
     ```
 
+    In other words, `%0:3 = affine.delinearize_index %x into (B, C)` produces
+    `%0 = {%x / (B * C), (%x mod (B * C)) / C, %x mod C}`.
+
     The basis may either contain `N` or `N-1` elements, where `N` is the number of results.
     If there are N basis elements, the first one will not be used during computations,
     but may be used during analysis and canonicalization to eliminate terms from
@@ -1098,7 +1101,12 @@ def AffineDelinearizeIndexOp : Affine_Op<"delinearize_index", [Pure]> {
     %0:3 = affine.delinearize_index %linear_index into (244, 244) : index, index
     ```
 
-    Note that, due to the constraints of affine maps, all the basis elements must
+    Note that, for symmetry with `getPaddedBasis()`, if `hasOuterBound` is `true`
+    when one of the `OpFoldResult` builders is called but the first element of the
+    basis is `nullptr`, that first element is ignored and the builder proceeds as if
+    there was no outer bound.
+
+    Due to the constraints of affine maps, all the basis elements must
     be strictly positive. A dynamic basis element being 0 or negative causes
     undefined behavior.
   }];
@@ -1136,6 +1144,11 @@ def AffineDelinearizeIndexOp : Affine_Op<"delinearize_index", [Pure]> {
     /// Return a vector that contains the basis of the operation, removing
     /// the outer bound if one is present.
     SmallVector<OpFoldResult> getEffectiveBasis();
+
+    /// Return the vector with one basis element per result of the operation. If
+    /// there is no outer bound specified, the leading entry of this result will be
+    /// nullptr.
+    SmallVector<OpFoldResult> getPaddedBasis();
   }];
 
   let hasVerifier = 1;
@@ -1160,6 +1173,9 @@ def AffineLinearizeIndexOp : Affine_Op<"linearize_index",
     sum(i = 0 to N-1) %idx_i * product(j = i + 1 to N-1) B_j
     ```
 
+    In other words, `%0 = affine.linearize_index [%z, %y, %x] by (Z, Y, X)`
+    gives `%0 = %x + %y * X + %z * X * Y`, or `%0 = %x + X * (%y + Y * (%z))`.
+
     The basis may either have `N` or `N-1` elements, where `N` is the number of
     inputs to linearize_index. If `N` inputs are provided, the first one is not used
     in computation, but may be used during analysis or canonicalization as a bound
@@ -1168,6 +1184,10 @@ def AffineLinearizeIndexOp : Affine_Op<"linearize_index",
     If all `N` basis elements are provided, the linearize_index operation is said to
     "have an outer bound".
 
+    As a convenience, and for symmetry with `getPaddedBasis()`, ifg the first
+    element of a set of `OpFoldResult`s passed to the builders of this operation is
+    `nullptr`, that element is ignored.
+
     If the `disjoint` property is present, this is an optimization hint that,
     for all `i`, `0 <= %idx_i < B_i` - that is, no index affects any other index,
     except that `%idx_0` may be negative to make the index as a whole negative.
@@ -1224,6 +1244,11 @@ def AffineLinearizeIndexOp : Affine_Op<"linearize_index",
     /// Return a vector that contains the basis of the operation, removing
     /// the outer bound if one is present.
     SmallVector<OpFoldResult> getEffectiveBasis();
+
+    /// Return the vector with one basis element per index operand of the operation.
+    /// If there is no outer bound specified, the leading entry of this basis will be
+    /// nullptr.
+    SmallVector<OpFoldResult> getPaddedBasis();
   }];
 
   let hasVerifier = 1;
diff --git a/mlir/include/mlir/Dialect/Arith/IR/ArithOps.td b/mlir/include/mlir/Dialect/Arith/IR/ArithOps.td
index 2f71caa..0722ff6 100644
--- a/mlir/include/mlir/Dialect/Arith/IR/ArithOps.td
+++ b/mlir/include/mlir/Dialect/Arith/IR/ArithOps.td
@@ -1499,10 +1499,6 @@ def Arith_CmpIOp
                        SignlessIntegerLikeOfAnyRank:$lhs,
                        SignlessIntegerLikeOfAnyRank:$rhs);
 
-  let extraClassDeclaration = [{
-    static arith::CmpIPredicate getPredicateByName(StringRef name);
-  }];
-
   let hasFolder = 1;
   let hasCanonicalizer = 1;
 }
@@ -1546,10 +1542,6 @@ def Arith_CmpFOp : Arith_CompareOp<"cmpf",
                        DefaultValuedAttr<
                          Arith_FastMathAttr, "::mlir::arith::FastMathFlags::none">:$fastmath);
 
-  let extraClassDeclaration = [{
-    static arith::CmpFPredicate getPredicateByName(StringRef name);
-  }];
-
   let hasFolder = 1;
   let hasCanonicalizer = 1;
   let assemblyFormat = [{ $predicate `,` $lhs `,` $rhs (`fastmath` `` $fastmath^)?
diff --git a/mlir/include/mlir/Dialect/Bufferization/IR/BufferizableOpInterface.h b/mlir/include/mlir/Dialect/Bufferization/IR/BufferizableOpInterface.h
index 983f7a2..d1a102e 100644
--- a/mlir/include/mlir/Dialect/Bufferization/IR/BufferizableOpInterface.h
+++ b/mlir/include/mlir/Dialect/Bufferization/IR/BufferizableOpInterface.h
@@ -456,7 +456,7 @@ public:
   /// read by themselves (e.g., ExtractSliceOp).
   bool isValueRead(Value value) const;
 
-  /// Starting from `value`, follow the use-def chain in reverse, always
+  /// Starting from `opOperand`, follow the use-def chain in reverse, always
   /// selecting the aliasing OpOperands. Find and return Values for which
   /// `condition` evaluates to true. OpOperands of such matching Values are not
   /// traversed any further, the visited aliasing opOperands will be preserved
@@ -484,7 +484,7 @@ public:
   /// Additional stopping conditions for the traversal can be specified in
   /// `config`.
   SetVector<Value> findValueInReverseUseDefChain(
-      Value value, llvm::function_ref<bool(Value)> condition,
+      OpOperand *opOperand, llvm::function_ref<bool(Value)> condition,
       TraversalConfig config = TraversalConfig(),
       llvm::DenseSet<OpOperand *> *visitedOpOperands = nullptr) const;
 
@@ -520,7 +520,7 @@ public:
   ///
   /// Note: OpResults of unknown ops are handled conservatively and assumed to
   /// be definitions.
-  SetVector<Value> findDefinitions(Value value) const;
+  SetVector<Value> findDefinitions(OpOperand *opOperand) const;
 
   /// Return `true` if the given OpResult has been decided to bufferize inplace.
   virtual bool isInPlace(OpOperand &opOperand) const;
diff --git a/mlir/include/mlir/Dialect/Bufferization/Transforms/OneShotAnalysis.h b/mlir/include/mlir/Dialect/Bufferization/Transforms/OneShotAnalysis.h
index d50a304..bd23a19 100644
--- a/mlir/include/mlir/Dialect/Bufferization/Transforms/OneShotAnalysis.h
+++ b/mlir/include/mlir/Dialect/Bufferization/Transforms/OneShotAnalysis.h
@@ -127,9 +127,9 @@ public:
   /// Return true if the buffer of the given tensor value is writable.
   bool isWritable(Value value) const;
 
-  /// Find the definitions of the given tensor value or retrieve them from the
-  /// cache.
-  const SetVector<Value> &findDefinitionsCached(Value value);
+  /// Find the definitions of the given operand's value or
+  /// retrieve them from the cache.
+  const SetVector<Value> &findDefinitionsCached(OpOperand *opOperand);
 
   /// Reset cached data structures.
   void resetCache() override;
diff --git a/mlir/include/mlir/Dialect/Bufferization/Transforms/Passes.h b/mlir/include/mlir/Dialect/Bufferization/Transforms/Passes.h
index fe43a05..c8e456a 100644
--- a/mlir/include/mlir/Dialect/Bufferization/Transforms/Passes.h
+++ b/mlir/include/mlir/Dialect/Bufferization/Transforms/Passes.h
@@ -2,10 +2,12 @@
 #define MLIR_DIALECT_BUFFERIZATION_TRANSFORMS_PASSES_H
 
 #include "mlir/Dialect/Bufferization/IR/BufferDeallocationOpInterface.h"
+#include "mlir/Dialect/MemRef/IR/MemRef.h"
 #include "mlir/Pass/Pass.h"
 
 namespace mlir {
 class FunctionOpInterface;
+class MemRefType;
 class ModuleOp;
 class RewritePatternSet;
 class OpBuilder;
@@ -38,7 +40,7 @@ std::unique_ptr<Pass> createOwnershipBasedBufferDeallocationPass(
     DeallocationOptions options = DeallocationOptions());
 
 /// Creates a pass that finds all temporary allocations
-/// and attempts to move the deallocation after the last user/dependency 
+/// and attempts to move the deallocation after the last user/dependency
 /// of the allocation, thereby optimizing allocation liveness.
 std::unique_ptr<Pass> createOptimizeAllocationLivenessPass();
 
@@ -157,6 +159,12 @@ std::unique_ptr<Pass> createBufferLoopHoistingPass();
 // Options struct for BufferResultsToOutParams pass.
 // Note: defined only here, not in tablegen.
 struct BufferResultsToOutParamsOpts {
+  /// Allocator function: Generate a memref allocation with the given type.
+  /// Since `promoteBufferResultsToOutParams` doesn't allow dynamically shaped
+  /// results, we don't allow passing a range of values for dynamic dims.
+  using AllocationFn =
+      std::function<FailureOr<Value>(OpBuilder &, Location, MemRefType)>;
+
   /// Memcpy function: Generate a memcpy between two memrefs.
   using MemCpyFn =
       std::function<LogicalResult(OpBuilder &, Location, Value, Value)>;
@@ -167,9 +175,20 @@ struct BufferResultsToOutParamsOpts {
     return true;
   };
 
+  /// Allocation function; used to allocate a memref.
+  /// Default memref.alloc is used
+  AllocationFn allocationFn = [](OpBuilder &builder, Location loc,
+                                 MemRefType type) {
+    return builder.create<memref::AllocOp>(loc, type).getResult();
+  };
+
   /// Memcpy function; used to create a copy between two memrefs.
-  /// If this is empty, memref.copy is used.
-  std::optional<MemCpyFn> memCpyFn;
+  /// Default memref.copy is used.
+  MemCpyFn memCpyFn = [](OpBuilder &builder, Location loc, Value from,
+                         Value to) {
+    builder.create<memref::CopyOp>(loc, from, to);
+    return success();
+  };
 
   /// If true, the pass adds a "bufferize.result" attribute to each output
   /// parameter.
diff --git a/mlir/include/mlir/Dialect/Bufferization/Transforms/Transforms.h b/mlir/include/mlir/Dialect/Bufferization/Transforms/Transforms.h
index 8926759..a4ee893 100644
--- a/mlir/include/mlir/Dialect/Bufferization/Transforms/Transforms.h
+++ b/mlir/include/mlir/Dialect/Bufferization/Transforms/Transforms.h
@@ -10,7 +10,9 @@
 #define MLIR_DIALECT_BUFFERIZATION_TRANSFORMS_TRANSFORMS_H
 
 #include "mlir/Dialect/Bufferization/Transforms/OneShotAnalysis.h"
+#include "mlir/Dialect/Tensor/IR/Tensor.h"
 #include "mlir/IR/Operation.h"
+#include "mlir/Interfaces/SubsetOpInterface.h"
 
 namespace mlir {
 namespace bufferization {
@@ -34,13 +36,35 @@ struct OneShotBufferizationOptions;
 /// "tensor.empty" op.
 LogicalResult eliminateEmptyTensors(RewriterBase &rewriter, Operation *op);
 
+/// A function type that defines a callback to control the construction
+/// of the subset extraction of the `SubsetInsertionOpInterface`.
+/// The subset extraction value can be used as a replacement for the
+/// `emptyTensorOp` value which is being consumed by `user`, failing
+/// of building such a value should be indicated with an empty value.
+/// This function should guarantee the legality of the replacement,
+/// i.e. the replacement should dominate the user of the `emptyTensorOp`
+/// being eliminated.
+using ControlBuildSubsetExtractionFn =
+    std::function<Value(RewriterBase &, SubsetInsertionOpInterface,
+                        tensor::EmptyOp emptyTensorOp, Operation *user)>;
+
+/// This method builds and returns a subset extraction value for the
+/// destination tensor that the given `op` inserts into.
+/// It returns a value which should replace the `emptyTensorOp` use
+/// that is being consumed by `user`.
+/// If no such a value found it will return an empty Value.
+Value buildSubsetExtraction(RewriterBase &rewriter,
+                            SubsetInsertionOpInterface op,
+                            tensor::EmptyOp emptyTensorOp, Operation *user);
+
 /// Try to eliminate "tensor.empty" ops inside `op`.
 ///
 /// This function overload accepts an existing `OneShotAnalysisState`, which
 /// contains in-place bufferization decisions. This overload is useful if an
 /// existing analysis should be reused for empty tensor elimination.
-LogicalResult eliminateEmptyTensors(RewriterBase &rewriter, Operation *op,
-                                    OneShotAnalysisState &state);
+LogicalResult eliminateEmptyTensors(
+    RewriterBase &rewriter, Operation *op, OneShotAnalysisState &state,
+    ControlBuildSubsetExtractionFn subsetsExtractionFn = buildSubsetExtraction);
 
 /// Within the given operation, hoist buffers from loops where possible. See
 /// "BufferLoopHoistingPass" for more information.
diff --git a/mlir/include/mlir/Dialect/EmitC/IR/EmitC.td b/mlir/include/mlir/Dialect/EmitC/IR/EmitC.td
index fc5a335..744a0dc 100644
--- a/mlir/include/mlir/Dialect/EmitC/IR/EmitC.td
+++ b/mlir/include/mlir/Dialect/EmitC/IR/EmitC.td
@@ -727,7 +727,7 @@ def EmitC_ReturnOp : EmitC_Op<"return", [Pure, HasParent<"FuncOp">,
     Example:
 
     ```mlir
-    emitc.func @foo() : (i32) {
+    emitc.func @foo() -> (i32) {
       ...
       emitc.return %0 : i32
     }
@@ -1305,8 +1305,6 @@ def EmitC_IfOp : EmitC_Op<"if",
       Block* body = getBody(1);
       return OpBuilder::atBlockEnd(body, listener);
     }
-    Block* thenBlock();
-    Block* elseBlock();
   }];
   let hasCustomAssemblyFormat = 1;
 }
diff --git a/mlir/include/mlir/Dialect/Func/IR/FuncOps.td b/mlir/include/mlir/Dialect/Func/IR/FuncOps.td
index 237a825..2112018 100644
--- a/mlir/include/mlir/Dialect/Func/IR/FuncOps.td
+++ b/mlir/include/mlir/Dialect/Func/IR/FuncOps.td
@@ -352,7 +352,7 @@ def ReturnOp : Func_Op<"return", [Pure, HasParent<"FuncOp">,
     Example:
 
     ```mlir
-    func.func @foo() : (i32, f8) {
+    func.func @foo() -> (i32, f8) {
       ...
       return %0, %1 : i32, f8
     }
diff --git a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
index 42a017d..3adfd5f 100644
--- a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
+++ b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
@@ -1055,7 +1055,7 @@ def GPU_PrintfOp : GPU_Op<"printf", [MemoryEffects<[MemWrite]>]>,
     imposed by one's target platform.
   }];
   let assemblyFormat = [{
-    $format attr-dict ($args^ `:` type($args))?
+    $format attr-dict (`,` $args^ `:` type($args))?
   }];
 }
 
diff --git a/mlir/include/mlir/Dialect/GPU/TransformOps/GPUTransformOps.td b/mlir/include/mlir/Dialect/GPU/TransformOps/GPUTransformOps.td
index 80b4547..61d4ccec 100644
--- a/mlir/include/mlir/Dialect/GPU/TransformOps/GPUTransformOps.td
+++ b/mlir/include/mlir/Dialect/GPU/TransformOps/GPUTransformOps.td
@@ -168,13 +168,13 @@ def MapNestedForallToThreads :
 
       #### Return modes:
 
-      This operation ignores non-gpu_launch ops and drops them in the return.
+      This operation ignores non-`gpu_launch` ops and drops them in the return.
 
       If any scf.forall with tensors is found, the transform definitely
       fails.
 
-      If all the scf.forall operations with gpu.thread mapping contained
-      within the LaunchOp referred to by the `target` PDLOperation lower to GPU
+      If all the `scf.forall` operations with gpu.thread mapping contained
+      within the `LaunchOp` referred to by the `target` handle lower to GPU
       properly, the transform succeeds. Otherwise the transform definitely
       fails.
 
@@ -277,8 +277,8 @@ def MapForallToBlocks :
     If any scf.forall with tensors is found, the transform definitely
     fails.
 
-    If all the scf.forall operations contained within the LaunchOp
-    referred to by the `target` PDLOperation lower to GPU properly, the
+    If all the `scf.forall` operations contained within the LaunchOp
+    referred to by the `target` handle lower to GPU properly, the
     transform succeeds. Otherwise the transform definitely fails.
 
     The returned handle points to the same LaunchOp operand, consuming it and
diff --git a/mlir/include/mlir/Dialect/LLVMIR/LLVMAttrDefs.td b/mlir/include/mlir/Dialect/LLVMIR/LLVMAttrDefs.td
index e8eeafd..2673897 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/LLVMAttrDefs.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/LLVMAttrDefs.td
@@ -825,7 +825,7 @@ def LLVM_MemoryEffectsAttr : LLVM_Attr<"MemoryEffects", "memory_effects"> {
 def LLVM_AliasScopeDomainAttr : LLVM_Attr<"AliasScopeDomain",
                                           "alias_scope_domain"> {
   let parameters = (ins
-    "DistinctAttr":$id,
+    "Attribute":$id,
     OptionalParameter<"StringAttr">:$description
   );
 
@@ -853,7 +853,7 @@ def LLVM_AliasScopeDomainAttr : LLVM_Attr<"AliasScopeDomain",
 
 def LLVM_AliasScopeAttr : LLVM_Attr<"AliasScope", "alias_scope"> {
   let parameters = (ins
-    "DistinctAttr":$id,
+    "Attribute":$id,
     "AliasScopeDomainAttr":$domain,
     OptionalParameter<"StringAttr">:$description
   );
@@ -891,6 +891,8 @@ def LLVM_AliasScopeAttr : LLVM_Attr<"AliasScope", "alias_scope"> {
     }
     ```
 
+    The first attribute can either be a DistinctAttr or a StringAttr.
+
     See the following link for more details:
     https://llvm.org/docs/LangRef.html#noalias-and-alias-scope-metadata
   }];
@@ -898,6 +900,8 @@ def LLVM_AliasScopeAttr : LLVM_Attr<"AliasScope", "alias_scope"> {
   let summary = "LLVM dialect alias scope";
 
   let assemblyFormat = "`<` struct(params) `>`";
+
+  let genVerifyDecl = 1;
 }
 
 def LLVM_AliasScopeArrayAttr
diff --git a/mlir/include/mlir/Dialect/Linalg/IR/LinalgStructuredOps.td b/mlir/include/mlir/Dialect/Linalg/IR/LinalgStructuredOps.td
index 37eec6e..fff4048 100644
--- a/mlir/include/mlir/Dialect/Linalg/IR/LinalgStructuredOps.td
+++ b/mlir/include/mlir/Dialect/Linalg/IR/LinalgStructuredOps.td
@@ -472,9 +472,6 @@ def TransposeOp : LinalgStructuredBase_Op<"transpose", [
       getRegionBuilder() {
       return regionBuilder;
     }
-
-    static void createRegion(::mlir::OpBuilder &opBuilder,
-                             ::mlir::OperationState & odsState);
   }];
 
   let hasFolder = 1;
diff --git a/mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.td b/mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.td
index 2e713bc..081bf9b 100644
--- a/mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.td
+++ b/mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.td
@@ -1765,8 +1765,8 @@ def TileReductionUsingForOp : Op<Transform_Dialect, "structured.tile_reduction_u
   let arguments = (ins TransformHandleTypeInterface:$target,
                    DefaultValuedAttr<DenseI64ArrayAttr, "{}">:$tile_sizes);
   let results = (outs Variadic<TransformHandleTypeInterface>:$fill_op,
-                      TransformHandleTypeInterface:$split_linalg_op,
-                      TransformHandleTypeInterface:$combining_linalg_op,
+                      TransformHandleTypeInterface:$split_op,
+                      TransformHandleTypeInterface:$combining_op,
                       TransformHandleTypeInterface:$for_op);
 
   let builders = [
@@ -1784,7 +1784,7 @@ def TileReductionUsingForOp : Op<Transform_Dialect, "structured.tile_reduction_u
   let extraClassDeclaration = [{
     ::mlir::DiagnosedSilenceableFailure applyToOne(
         ::mlir::transform::TransformRewriter &rewriter,
-        ::mlir::linalg::LinalgOp target,
+        Operation *target,
         ::mlir::transform::ApplyToEachResultList &results,
         ::mlir::transform::TransformState &state);
   }];
diff --git a/mlir/include/mlir/Dialect/OpenMP/OpenMPEnums.td b/mlir/include/mlir/Dialect/OpenMP/OpenMPEnums.td
index b1a9e33..2091c0c 100644
--- a/mlir/include/mlir/Dialect/OpenMP/OpenMPEnums.td
+++ b/mlir/include/mlir/Dialect/OpenMP/OpenMPEnums.td
@@ -108,14 +108,15 @@ def ClauseRequiresAttr : OpenMP_EnumAttr<ClauseRequires, "clause_requires">;
 def ClauseTaskDependIn : I32EnumAttrCase<"taskdependin", 0>;
 def ClauseTaskDependOut : I32EnumAttrCase<"taskdependout", 1>;
 def ClauseTaskDependInOut : I32EnumAttrCase<"taskdependinout", 2>;
-
-def ClauseTaskDepend : OpenMP_I32EnumAttr<
-    "ClauseTaskDepend",
-    "depend clause in a target or task construct", [
-      ClauseTaskDependIn,
-      ClauseTaskDependOut,
-      ClauseTaskDependInOut
-    ]>;
+def ClauseTaskDependMutexInOutSet
+    : I32EnumAttrCase<"taskdependmutexinoutset", 3>;
+def ClauseTaskDependInOutSet : I32EnumAttrCase<"taskdependinoutset", 4>;
+
+def ClauseTaskDepend
+    : OpenMP_I32EnumAttr<
+          "ClauseTaskDepend", "depend clause in a target or task construct",
+          [ClauseTaskDependIn, ClauseTaskDependOut, ClauseTaskDependInOut,
+           ClauseTaskDependMutexInOutSet, ClauseTaskDependInOutSet]>;
 
 def ClauseTaskDependAttr : OpenMP_EnumAttr<ClauseTaskDepend,
                                            "clause_task_depend"> {
diff --git a/mlir/include/mlir/Dialect/SCF/IR/SCF.h b/mlir/include/mlir/Dialect/SCF/IR/SCF.h
index b62c941..ba64818 100644
--- a/mlir/include/mlir/Dialect/SCF/IR/SCF.h
+++ b/mlir/include/mlir/Dialect/SCF/IR/SCF.h
@@ -40,12 +40,6 @@ void buildTerminatedBody(OpBuilder &builder, Location loc);
 namespace mlir {
 namespace scf {
 
-// Insert `loop.yield` at the end of the only region's only block if it
-// does not have a terminator already.  If a new `loop.yield` is inserted,
-// the location is specified by `loc`. If the region is empty, insert a new
-// block first.
-void ensureLoopTerminator(Region &region, Builder &builder, Location loc);
-
 /// Returns the loop parent of an induction variable. If the provided value is
 /// not an induction variable, then return nullptr.
 ForOp getForInductionVarOwner(Value val);
diff --git a/mlir/include/mlir/Dialect/SCF/IR/SCFOps.td b/mlir/include/mlir/Dialect/SCF/IR/SCFOps.td
index 23c597a..6f408b3 100644
--- a/mlir/include/mlir/Dialect/SCF/IR/SCFOps.td
+++ b/mlir/include/mlir/Dialect/SCF/IR/SCFOps.td
@@ -302,7 +302,7 @@ def ForallOp : SCF_Op<"forall", [
        AttrSizedOperandSegments,
        AutomaticAllocationScope,
        DeclareOpInterfaceMethods<LoopLikeOpInterface,
-          ["getInitsMutable", "getRegionIterArgs", "getLoopInductionVars", 
+          ["getInitsMutable", "getRegionIterArgs", "getLoopInductionVars",
            "getLoopLowerBounds", "getLoopUpperBounds", "getLoopSteps",
            "promoteIfSingleIteration", "yieldTiledValuesAndReplace"]>,
        RecursiveMemoryEffects,
@@ -671,7 +671,7 @@ def IfOp : SCF_Op<"if", [DeclareOpInterfaceMethods<RegionBranchOpInterface, [
     "getNumRegionInvocations", "getRegionInvocationBounds",
     "getEntrySuccessorRegions"]>,
     InferTypeOpAdaptor, SingleBlockImplicitTerminator<"scf::YieldOp">,
-    RecursiveMemoryEffects, NoRegionArguments]> {
+    RecursiveMemoryEffects, RecursivelySpeculatable, NoRegionArguments]> {
   let summary = "if-then-else operation";
   let description = [{
     The `scf.if` operation represents an if-then-else construct for
diff --git a/mlir/include/mlir/Dialect/SCF/Transforms/Patterns.h b/mlir/include/mlir/Dialect/SCF/Transforms/Patterns.h
index b87407d..18c9dfd 100644
--- a/mlir/include/mlir/Dialect/SCF/Transforms/Patterns.h
+++ b/mlir/include/mlir/Dialect/SCF/Transforms/Patterns.h
@@ -66,6 +66,9 @@ void populateSCFStructuralTypeConversionTarget(
 /// Populates the provided pattern set with patterns that do 1:N type
 /// conversions on (some) SCF ops. This is intended to be used with
 /// applyPartialOneToNConversion.
+/// FIXME: The 1:N dialect conversion is deprecated and will be removed soon.
+/// 1:N support has been added to the regular dialect conversion driver.
+/// Use populateSCFStructuralTypeConversions() instead.
 void populateSCFStructuralOneToNTypeConversions(
     const TypeConverter &typeConverter, RewritePatternSet &patterns);
 
diff --git a/mlir/include/mlir/Dialect/Shape/IR/ShapeOps.td b/mlir/include/mlir/Dialect/Shape/IR/ShapeOps.td
index 08a0398..8bccba4 100644
--- a/mlir/include/mlir/Dialect/Shape/IR/ShapeOps.td
+++ b/mlir/include/mlir/Dialect/Shape/IR/ShapeOps.td
@@ -321,11 +321,6 @@ def Shape_DimOp : Shape_Op<"dim",
   let assemblyFormat = "$value `,` $index attr-dict `:` type($value) `,`"
                        "type($index) `->` type($extent)";
 
-  let builders = [
-    // Builder that allows passing a constant dimension as a simple integer.
-    OpBuilder<(ins "Value":$value, "int64_t":$index)>
-  ];
-
   let extraClassDeclaration = [{
     /// Get the `index` value as integer if it is constant.
     std::optional<int64_t> getConstantIndex();
diff --git a/mlir/include/mlir/Dialect/Tosa/IR/TosaOpBase.td b/mlir/include/mlir/Dialect/Tosa/IR/TosaOpBase.td
index f553692..d3f12c3 100644
--- a/mlir/include/mlir/Dialect/Tosa/IR/TosaOpBase.td
+++ b/mlir/include/mlir/Dialect/Tosa/IR/TosaOpBase.td
@@ -126,11 +126,12 @@ def Tosa_ConvOpQuantInfoBuilder : OpBuilder<
   (ins "::mlir::Type":$outputType, "::mlir::Value":$input,
        "::mlir::Value":$weight, "::mlir::Value":$bias,
        "::mlir::DenseI64ArrayAttr":$pad, "::mlir::DenseI64ArrayAttr":$stride,
-       "::mlir::DenseI64ArrayAttr":$dilation),
+       "::mlir::DenseI64ArrayAttr":$dilation,
+       "::mlir::TypeAttr":$acc_type),
   [{
     buildConvOpWithQuantInfo($_builder, $_state, outputType,
                              input, weight, bias,
-                             pad, stride, dilation);
+                             pad, stride, dilation, acc_type);
   }]>;
 
 // Handles tosa.transpose_conv2d which has an outpad and output shape attribute.
@@ -139,12 +140,13 @@ def Tosa_TransConvOpQuantInfoBuilder : OpBuilder<
        "::mlir::Value":$weight, "mlir::Value":$bias,
        "::mlir::DenseI64ArrayAttr":$outpad,
        "::mlir::DenseI64ArrayAttr":$stride,
-       "::mlir::DenseI64ArrayAttr":$outputShape),
+       "::mlir::DenseI64ArrayAttr":$outputShape,
+       "::mlir::TypeAttr":$acc_type),
   [{
     buildTransConvOpWithQuantInfo($_builder, $_state, outputType,
                                   input, weight, bias,
                                   outpad, stride,
-                                  outputShape);
+                                  outputShape, acc_type);
   }]>;
 
 // The tosa.fully_connected op has its own builder as it does not have
diff --git a/mlir/include/mlir/Dialect/Tosa/IR/TosaOps.td b/mlir/include/mlir/Dialect/Tosa/IR/TosaOps.td
index e3c7258..6b43c9a 100644
--- a/mlir/include/mlir/Dialect/Tosa/IR/TosaOps.td
+++ b/mlir/include/mlir/Dialect/Tosa/IR/TosaOps.td
@@ -57,7 +57,7 @@ def Tosa_ArgMaxOp : Tosa_InferShapedTypeOp<"argmax"> {
 // Accumulator types.
 //===----------------------------------------------------------------------===//
 
-def Tosa_AccType : AnyTypeOf<[I<32>, SI<32>, F16, F32]>;
+def Tosa_AccType : AnyTypeOf<[I<32>, I<48>, F16, F32]>;
 
 //===----------------------------------------------------------------------===//
 // Operator: avg_pool2d
@@ -106,6 +106,7 @@ def Tosa_Conv2DOp : Tosa_InferShapedTypeOp<"conv2d"> {
     Tosa_IntArrayAttr4:$pad,
     Tosa_IntArrayAttr2:$stride,
     Tosa_IntArrayAttr2:$dilation,
+    TypeAttrOf<Tosa_AccType>:$acc_type,
     OptionalAttr<Tosa_ConvOpQuantizationAttr>:$quantization_info,
     DefaultValuedOptionalAttr<BoolAttr, "false">:$local_bound
   );
@@ -135,6 +136,7 @@ def Tosa_Conv3DOp : Tosa_InferShapedTypeOp<"conv3d"> {
     Tosa_IntArrayAttr6:$pad,
     Tosa_IntArrayAttr3:$stride,
     Tosa_IntArrayAttr3:$dilation,
+    TypeAttrOf<Tosa_AccType>:$acc_type,
     OptionalAttr<Tosa_ConvOpQuantizationAttr>:$quantization_info,
     DefaultValuedOptionalAttr<BoolAttr, "false">:$local_bound
   );
@@ -165,6 +167,7 @@ def Tosa_DepthwiseConv2DOp : Tosa_InferShapedTypeOp<"depthwise_conv2d"> {
     Tosa_IntArrayAttr4:$pad,
     Tosa_IntArrayAttr2:$stride,
     Tosa_IntArrayAttr2:$dilation,
+    TypeAttrOf<Tosa_AccType>:$acc_type,
     OptionalAttr<Tosa_ConvOpQuantizationAttr>:$quantization_info,
     DefaultValuedOptionalAttr<BoolAttr, "false">:$local_bound
   );
@@ -348,6 +351,7 @@ def Tosa_TransposeConv2DOp : Tosa_InferShapedTypeOp<"transpose_conv2d"> {
     Tosa_IntArrayAttr4:$out_pad,
     Tosa_IntArrayAttr2:$stride,
     Tosa_IntArrayAttr4:$out_shape,
+    TypeAttrOf<Tosa_AccType>:$acc_type,
     OptionalAttr<Tosa_ConvOpQuantizationAttr>:$quantization_info,
     DefaultValuedOptionalAttr<BoolAttr, "false">:$local_bound
   );
@@ -357,6 +361,7 @@ def Tosa_TransposeConv2DOp : Tosa_InferShapedTypeOp<"transpose_conv2d"> {
   );
 
   let builders = [Tosa_TransConvOpQuantInfoBuilder];
+  let hasVerifier = 1;
 }
 
 //===----------------------------------------------------------------------===//
@@ -1552,21 +1557,21 @@ def Tosa_PadOp : Tosa_InferShapedTypeOp<"pad"> {
     Example:
 
     ```mlir
-    %0 = arith.constant dense<[[1, 2], [3, 4]]> : tensor<2x2xi32>
-    tosa.pad %arg0, %0 : (tensor<1x2xf32>, tensor<2x2xi32>)  -> (tensor<4x9xf32>)
+    %0 = arith.constant dense<[1, 2, 3, 4]> : tensor<4xi32>
+    tosa.pad %arg0, %0 : (tensor<1x2xf32>, tensor<4xi32>)  -> (tensor<4x9xf32>)
     ```
 
     Example 2:
 
     ```mlir
-    %0 = arith.constant dense<[[-1, 2], [3, 4]]> : tensor<2x2xi32>
-    tosa.pad %arg0, %0 : (tensor<1x2xf32>, tensor<2x2xi32>)  -> (tensor<?x9xf32>)
+    %0 = arith.constant dense<[-1, 2, 3, 4]> : tensor<4xi32>
+    tosa.pad %arg0, %0 : (tensor<1x2xf32>, tensor<4xi32>)  -> (tensor<?x9xf32>)
     ```
   }];
 
   let arguments = (ins
     Tosa_RankedTensor:$input1,
-    Tosa_Int32Or64Tensor:$padding,
+    TosaTensorRankOf<[Tosa_Int32Or64], [1]>:$padding,
     Optional<Tosa_ScalarTensor>:$pad_const,
     OptionalAttr<Tosa_PadOpQuantizationAttr>:$quantization_info
   );
@@ -1698,7 +1703,8 @@ def Tosa_TileOp : Tosa_InferShapedTypeOp<"tile"> {
 // Operator: transpose
 //===----------------------------------------------------------------------===//
 def Tosa_TransposeOp : Tosa_InferShapedTypeOp<"transpose",
-                [DeclareOpInterfaceMethods<ReifyRankedShapedTypeOpInterface>]> {
+                [DeclareOpInterfaceMethods<ReifyRankedShapedTypeOpInterface>,
+                 AllElementTypesMatch<["input1", "output"]>]> {
   let summary = "Transpose operator";
 
   let description = [{
diff --git a/mlir/include/mlir/Dialect/Tosa/IR/TosaTypesBase.td b/mlir/include/mlir/Dialect/Tosa/IR/TosaTypesBase.td
index a6d3163..d3cc6e9 100644
--- a/mlir/include/mlir/Dialect/Tosa/IR/TosaTypesBase.td
+++ b/mlir/include/mlir/Dialect/Tosa/IR/TosaTypesBase.td
@@ -65,17 +65,17 @@ def Tosa_Int32Or64 : AnyTypeOf<[Tosa_Int32,
 // int8  : symmetric  per tensor/per channel, signed
 // int16 : symmetric  per tensor,             signed
 //===----------------------------------------------------------------------===//
-def Tosa_QuantizedInt	: AnyTypeOf<[ Tosa_QuantizedType<"uint8", [8], 0>,
-                                     Tosa_QuantizedType<"int4", [4, 0], 1>,
-                                     Tosa_QuantizedType<"int8", [8, 0], 1>,
-                                     Tosa_QuantizedType<"int16", [16, 0], 1>,
-                                     Tosa_QuantizedType<"int32", [32, 0], 1>]>;
+def Tosa_QuantizedInt : AnyTypeOf<[Tosa_QuantizedType<"uint8", [8], 0>,
+                                   Tosa_QuantizedType<"int4", [4, 0], 1>,
+                                   Tosa_QuantizedType<"int8", [8, 0], 1>,
+                                   Tosa_QuantizedType<"int16", [16, 0], 1>,
+                                   Tosa_QuantizedType<"int32", [32, 0], 1>]>;
 
 //===----------------------------------------------------------------------===//
 // Multi-category types.
 //===----------------------------------------------------------------------===//
 def Tosa_AnyNumber : AnyTypeOf<[Tosa_Int, Tosa_QuantizedInt, AnyFloat],
-                               "number">;
+                                "number">;
 
 // For weight tensors from tosa::Conv2DOp, tosa::Conv3DOp,
 // tosa::DepthwiseConv2DOp, tosa::TransposeConv2DOp, tosa::FullyConnectedOp
@@ -112,7 +112,7 @@ class TosaTensorRankOf<list<Type> allowedTypes, list<int> ranks>
 
 def Tosa_I1Tensor : TosaTensorOf<[I1]>;
 def Tosa_Int32Tensor : TosaTensorOf<[Tosa_Int32]>;
-def Tosa_Int32Or64Tensor :TosaTensorOf<[Tosa_Int32Or64]>;
+def Tosa_Int32Or64Tensor : TosaTensorOf<[Tosa_Int32Or64]>;
 
 def Tosa_FloatTensor : TosaTensorOf<[AnyFloat]>;
 
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
index 2aaa7fd..4841f94 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
@@ -164,11 +164,9 @@ def XeGPU_SGMapAttr : XeGPUAttr<"SGMap", "sg_map"> {
   }];
   let parameters = (ins
     ArrayRefParameter<"uint32_t">:$wi_layout,
-    ArrayRefParameter<"uint32_t">:$wi_data);
+    ArrayRefParameter<"uint32_t">:$wi_data
+  );
 
-  let builders = [
-    AttrBuilder<(ins)>
-  ];
 
   let hasCustomAssemblyFormat = 1;
   let genVerifyDecl = 1;
diff --git a/mlir/include/mlir/IR/Dialect.h b/mlir/include/mlir/IR/Dialect.h
index f3e5f6d..fb24a68 100644
--- a/mlir/include/mlir/IR/Dialect.h
+++ b/mlir/include/mlir/IR/Dialect.h
@@ -368,7 +368,6 @@ private:
   DenseSet<std::pair<TypeID, TypeID>> unresolvedPromisedInterfaces;
 
   friend class DialectRegistry;
-  friend void registerDialect();
   friend class MLIRContext;
 };
 
diff --git a/mlir/include/mlir/IR/Dominance.h b/mlir/include/mlir/IR/Dominance.h
index 16d17b9..9e1254c 100644
--- a/mlir/include/mlir/IR/Dominance.h
+++ b/mlir/include/mlir/IR/Dominance.h
@@ -113,12 +113,12 @@ protected:
   llvm::PointerIntPair<DomTree *, 1, bool>
   getDominanceInfo(Region *region, bool needsDomTree) const;
 
-  /// Return "true" if the specified block A properly (post)dominates block B.
-  bool properlyDominatesImpl(Block *a, Block *b) const;
-
-  /// Return "true" if the specified op A properly (post)dominates op B.
-  bool properlyDominatesImpl(Operation *a, Operation *b,
-                             bool enclosingOpOk = true) const;
+  /// Return "true" if block iterator A properly (post)dominates block iterator
+  /// B. If `enclosingOk` is set, A is considered to (post)dominate B if A
+  /// encloses B.
+  bool properlyDominatesImpl(Block *aBlock, Block::iterator aIt, Block *bBlock,
+                             Block::iterator bIt,
+                             bool enclosingOk = true) const;
 
   /// A mapping of regions to their base dominator tree and a cached
   /// "hasSSADominance" bit. This map does not contain dominator trees for
@@ -151,9 +151,7 @@ public:
   /// The `enclosingOpOk` flag says whether we should return true if the B op
   /// is enclosed by a region on A.
   bool properlyDominates(Operation *a, Operation *b,
-                         bool enclosingOpOk = true) const {
-    return super::properlyDominatesImpl(a, b, enclosingOpOk);
-  }
+                         bool enclosingOpOk = true) const;
 
   /// Return true if operation A dominates operation B, i.e. if A and B are the
   /// same operation or A properly dominates B.
@@ -188,8 +186,17 @@ public:
   /// Graph regions have only a single block. To be consistent with "proper
   /// dominance" of ops, the single block is considered to properly dominate
   /// itself in a graph region.
-  bool properlyDominates(Block *a, Block *b) const {
-    return super::properlyDominatesImpl(a, b);
+  bool properlyDominates(Block *a, Block *b) const;
+
+  bool properlyDominates(Block *aBlock, Block::iterator aIt, Block *bBlock,
+                         Block::iterator bIt, bool enclosingOk = true) const {
+    return super::properlyDominatesImpl(aBlock, aIt, bBlock, bIt, enclosingOk);
+  }
+
+  bool dominates(Block *aBlock, Block::iterator aIt, Block *bBlock,
+                 Block::iterator bIt, bool enclosingOk = true) const {
+    return (aBlock == bBlock && aIt == bIt) ||
+           super::properlyDominatesImpl(aBlock, aIt, bBlock, bIt, enclosingOk);
   }
 };
 
@@ -200,9 +207,7 @@ public:
 
   /// Return true if operation A properly postdominates operation B.
   bool properlyPostDominates(Operation *a, Operation *b,
-                             bool enclosingOpOk = true) const {
-    return super::properlyDominatesImpl(a, b, enclosingOpOk);
-  }
+                             bool enclosingOpOk = true) const;
 
   /// Return true if operation A postdominates operation B.
   bool postDominates(Operation *a, Operation *b) const {
@@ -210,14 +215,24 @@ public:
   }
 
   /// Return true if the specified block A properly postdominates block B.
-  bool properlyPostDominates(Block *a, Block *b) const {
-    return super::properlyDominatesImpl(a, b);
-  }
+  bool properlyPostDominates(Block *a, Block *b) const;
 
   /// Return true if the specified block A postdominates block B.
   bool postDominates(Block *a, Block *b) const {
     return a == b || properlyPostDominates(a, b);
   }
+
+  bool properlyPostDominates(Block *aBlock, Block::iterator aIt, Block *bBlock,
+                             Block::iterator bIt,
+                             bool enclosingOk = true) const {
+    return super::properlyDominatesImpl(aBlock, aIt, bBlock, bIt, enclosingOk);
+  }
+
+  bool postDominates(Block *aBlock, Block::iterator aIt, Block *bBlock,
+                     Block::iterator bIt, bool enclosingOk = true) const {
+    return (aBlock == bBlock && aIt == bIt) ||
+           super::properlyDominatesImpl(aBlock, aIt, bBlock, bIt, enclosingOk);
+  }
 };
 
 } // namespace mlir
diff --git a/mlir/include/mlir/IR/OperationSupport.h b/mlir/include/mlir/IR/OperationSupport.h
index ef5b8b1..5eb2d69 100644
--- a/mlir/include/mlir/IR/OperationSupport.h
+++ b/mlir/include/mlir/IR/OperationSupport.h
@@ -693,9 +693,6 @@ public:
   /// Return the dialect this operation is registered to.
   Dialect &getDialect() const { return *getImpl()->getDialect(); }
 
-  /// Use the specified object to parse this ops custom assembly format.
-  ParseResult parseAssembly(OpAsmParser &parser, OperationState &result) const;
-
   /// Represent the operation name as an opaque pointer. (Used to support
   /// PointerLikeTypeTraits).
   static RegisteredOperationName getFromOpaquePointer(const void *pointer) {
@@ -1169,16 +1166,20 @@ public:
   OpPrintingFlags &skipRegions(bool skip = true);
 
   /// Do not verify the operation when using custom operation printers.
-  OpPrintingFlags &assumeVerified();
+  OpPrintingFlags &assumeVerified(bool enable = true);
 
   /// Use local scope when printing the operation. This allows for using the
   /// printer in a more localized and thread-safe setting, but may not
   /// necessarily be identical to what the IR will look like when dumping
   /// the full module.
-  OpPrintingFlags &useLocalScope();
+  OpPrintingFlags &useLocalScope(bool enable = true);
 
   /// Print users of values as comments.
-  OpPrintingFlags &printValueUsers();
+  OpPrintingFlags &printValueUsers(bool enable = true);
+
+  /// Print unique SSA ID numbers for values, block arguments and naming
+  /// conflicts across all regions
+  OpPrintingFlags &printUniqueSSAIDs(bool enable = true);
 
   /// Return if the given ElementsAttr should be elided.
   bool shouldElideElementsAttr(ElementsAttr attr) const;
diff --git a/mlir/include/mlir/Interfaces/TilingInterface.td b/mlir/include/mlir/Interfaces/TilingInterface.td
index b75fc5e..50b69b8 100644
--- a/mlir/include/mlir/Interfaces/TilingInterface.td
+++ b/mlir/include/mlir/Interfaces/TilingInterface.td
@@ -427,6 +427,28 @@ def PartialReductionOpInterface : OpInterface<"PartialReductionOpInterface"> {
         /*defaultImplementation=*/[{
           return failure();
         }]
+      >,
+      InterfaceMethod<
+        /*desc=*/[{
+          Method to return the position of the partial result tile computed by
+          the tiled operation. This is same as
+          TilingInterface:::getResultTilePosition, but determines the result
+          tile position for partial reduction.
+        }],
+        /*retType=*/"::llvm::LogicalResult",
+        /*methodName=*/"getPartialResultTilePosition",
+        /*args=*/(ins
+            "::mlir::OpBuilder &":$b,
+            "unsigned":$resultNumber,
+            "::mlir::ArrayRef<::mlir::OpFoldResult> ":$offsets,
+            "::mlir::ArrayRef<::mlir::OpFoldResult> ":$sizes,
+            "::mlir::SmallVector<::mlir::OpFoldResult> &":$resultOffsets,
+            "::mlir::SmallVector<::mlir::OpFoldResult> &":$resultSizes,
+            "::mlir::ArrayRef<int>":$reductionDims),
+        /*methodBody=*/"",
+        /*defaultImplementation=*/[{
+          return failure();
+        }]
       >
   ];
 }
diff --git a/mlir/include/mlir/Target/LLVMIR/ModuleImport.h b/mlir/include/mlir/Target/LLVMIR/ModuleImport.h
index eea0647..33c9af7 100644
--- a/mlir/include/mlir/Target/LLVMIR/ModuleImport.h
+++ b/mlir/include/mlir/Target/LLVMIR/ModuleImport.h
@@ -319,9 +319,13 @@ private:
   /// Appends the converted result type and operands of `callInst` to the
   /// `types` and `operands` arrays. For indirect calls, the method additionally
   /// inserts the called function at the beginning of the `operands` array.
+  /// If `allowInlineAsm` is set to false (the default), it will return failure
+  /// if the called operand is an inline asm which isn't convertible to MLIR as
+  /// a value.
   LogicalResult convertCallTypeAndOperands(llvm::CallBase *callInst,
                                            SmallVectorImpl<Type> &types,
-                                           SmallVectorImpl<Value> &operands);
+                                           SmallVectorImpl<Value> &operands,
+                                           bool allowInlineAsm = false);
   /// Converts the parameter attributes attached to `func` and adds them to the
   /// `funcOp`.
   void convertParameterAttributes(llvm::Function *func, LLVMFuncOp funcOp,
diff --git a/mlir/include/mlir/Transforms/DialectConversion.h b/mlir/include/mlir/Transforms/DialectConversion.h
index 28150e8..9a6975d 100644
--- a/mlir/include/mlir/Transforms/DialectConversion.h
+++ b/mlir/include/mlir/Transforms/DialectConversion.h
@@ -181,6 +181,10 @@ public:
   /// converting (potentially multiple) block arguments that were the result of
   /// a signature conversion of a single block argument, to a single SSA value
   /// with the old block argument type.
+  ///
+  /// Note: Argument materializations are used only with the 1:N dialect
+  /// conversion driver. The 1:N dialect conversion driver will be removed soon
+  /// and so will be argument materializations.
   template <typename FnT, typename T = typename llvm::function_traits<
                               std::decay_t<FnT>>::template arg_t<1>>
   void addArgumentMaterialization(FnT &&callback) {
@@ -880,15 +884,7 @@ public:
   void replaceOp(Operation *op, Operation *newOp) override;
 
   /// Replace the given operation with the new value ranges. The number of op
-  /// results and value ranges must match. If an original SSA value is replaced
-  /// by multiple SSA values (i.e., a value range has more than 1 element), the
-  /// conversion driver will insert an argument materialization to convert the
-  /// N SSA values back into 1 SSA value of the original type. The given
-  /// operation is erased.
-  ///
-  /// Note: The argument materialization is a workaround until we have full 1:N
-  /// support in the dialect conversion. (It is going to disappear from both
-  /// `replaceOpWithMultiple` and `applySignatureConversion`.)
+  /// results and value ranges must match. The given  operation is erased.
   void replaceOpWithMultiple(Operation *op, ArrayRef<ValueRange> newValues);
 
   /// PatternRewriter hook for erasing a dead operation. The uses of this
@@ -1285,8 +1281,8 @@ struct ConversionConfig {
   // represented at the moment.
   RewriterBase::Listener *listener = nullptr;
 
-  /// If set to "true", the dialect conversion attempts to build source/target/
-  /// argument materializations through the type converter API in lieu of
+  /// If set to "true", the dialect conversion attempts to build source/target
+  /// materializations through the type converter API in lieu of
   /// "builtin.unrealized_conversion_cast ops". The conversion process fails if
   /// at least one materialization could not be built.
   ///
diff --git a/mlir/include/mlir/Transforms/LocationSnapshot.h b/mlir/include/mlir/Transforms/LocationSnapshot.h
index ccfdbac..cefe005 100644
--- a/mlir/include/mlir/Transforms/LocationSnapshot.h
+++ b/mlir/include/mlir/Transforms/LocationSnapshot.h
@@ -51,18 +51,6 @@ void generateLocationsFromIR(raw_ostream &os, StringRef fileName, StringRef tag,
 LogicalResult generateLocationsFromIR(StringRef fileName, StringRef tag,
                                       Operation *op, OpPrintingFlags flags);
 
-/// Create a pass to generate new locations by snapshotting the IR to the given
-/// file, and using the printed locations within that file. If `filename` is
-/// empty, a temporary file is generated instead. If a 'tag' is non-empty, the
-/// generated locations are represented as a NameLoc with the given tag as the
-/// name, and then fused with the existing locations. Otherwise, the existing
-/// locations are replaced.
-std::unique_ptr<Pass> createLocationSnapshotPass(OpPrintingFlags flags,
-                                                 StringRef fileName = "",
-                                                 StringRef tag = "");
-/// Overload utilizing pass options for initialization.
-std::unique_ptr<Pass> createLocationSnapshotPass();
-
 } // namespace mlir
 
 #endif // MLIR_TRANSFORMS_LOCATIONSNAPSHOT_H
diff --git a/mlir/include/mlir/Transforms/OneToNTypeConversion.h b/mlir/include/mlir/Transforms/OneToNTypeConversion.h
index 7b4dd65..37a3268 100644
--- a/mlir/include/mlir/Transforms/OneToNTypeConversion.h
+++ b/mlir/include/mlir/Transforms/OneToNTypeConversion.h
@@ -6,6 +6,9 @@
 //
 //===----------------------------------------------------------------------===//
 //
+// Note: The 1:N dialect conversion is deprecated and will be removed soon.
+// 1:N support has been added to the regular dialect conversion driver.
+//
 // This file provides utils for implementing (poor-man's) dialect conversion
 // passes with 1:N type conversions.
 //
@@ -119,6 +122,8 @@ public:
   /// types must be the same as the result types of the op) and the new values
   /// (i.e., the converted types must be the same as the types of the new
   /// values).
+  /// FIXME: The 1:N dialect conversion is deprecated and will be removed soon.
+  /// Use replaceOpWithMultiple() instead.
   void replaceOp(Operation *op, ValueRange newValues,
                  const OneToNTypeMapping &resultMapping);
   using PatternRewriter::replaceOp;
@@ -251,6 +256,9 @@ public:
 /// or illegal types; the function simply applies the given patterns and does
 /// not fail if some ops or types remain unconverted (i.e., the conversion is
 /// only "partial").
+/// FIXME: The 1:N dialect conversion is deprecated and will be removed soon.
+/// 1:N support has been added to the regular dialect conversion driver.
+/// Use applyPartialConversion() instead.
 LogicalResult
 applyPartialOneToNConversion(Operation *op, TypeConverter &typeConverter,
                              const FrozenRewritePatternSet &patterns);
@@ -259,6 +267,9 @@ applyPartialOneToNConversion(Operation *op, TypeConverter &typeConverter,
 /// FunctionOpInterface op with the given type converter. This only supports
 /// ops which use FunctionType to represent their type. This is intended to be
 /// used with the 1:N dialect conversion.
+/// FIXME: The 1:N dialect conversion is deprecated and will be removed soon.
+/// 1:N support has been added to the regular dialect conversion driver.
+/// Use populateFunctionOpInterfaceTypeConversionPattern() instead.
 void populateOneToNFunctionOpInterfaceTypeConversionPattern(
     StringRef functionLikeOpName, const TypeConverter &converter,
     RewritePatternSet &patterns);
diff --git a/mlir/include/mlir/Transforms/Passes.td b/mlir/include/mlir/Transforms/Passes.td
index 000d9f6..c4a8e7a 100644
--- a/mlir/include/mlir/Transforms/Passes.td
+++ b/mlir/include/mlir/Transforms/Passes.td
@@ -331,13 +331,21 @@ def LocationSnapshot : Pass<"snapshot-op-locations"> {
     ... loc(fused["original_source.cpp":1:1, "snapshot"("snapshot_source.mlir":10:10)])
     ```
   }];
-  let constructor = "mlir::createLocationSnapshotPass()";
   let options = [
     Option<"fileName", "filename", "std::string", /*default=*/"",
            "The filename to print the generated IR">,
     Option<"tag", "tag", "std::string", /*default=*/"",
            "A tag to use when fusing the new locations with the "
            "original. If unset, the locations are replaced.">,
+    Option<"enableDebugInfo", "print-debuginfo", "bool", /*default=*/"false",
+           "Print debug info in MLIR output">,
+    Option<"printGenericOpForm", "print-op-generic", "bool", /*default=*/"false",
+           "Print the generic op form">,
+    Option<"useLocalScope", "print-local-scope", "bool", /*default=*/"false",
+           "Print with local scope and inline information (eliding "
+           "aliases for attributes, types, and locations">,
+    Option<"printPrettyDebugInfo", "pretty-debuginfo", "bool", /*default=*/"false",
+           "Print pretty debug info in MLIR output">,
   ];
 }
 
diff --git a/mlir/lib/Bindings/Python/IRCore.cpp b/mlir/lib/Bindings/Python/IRCore.cpp
index 86afa95..453d4f7 100644
--- a/mlir/lib/Bindings/Python/IRCore.cpp
+++ b/mlir/lib/Bindings/Python/IRCore.cpp
@@ -272,13 +272,13 @@ struct PyAttrBuilderMap {
   static bool dunderContains(const std::string &attributeKind) {
     return PyGlobals::get().lookupAttributeBuilder(attributeKind).has_value();
   }
-  static nb::callable dundeGetItemNamed(const std::string &attributeKind) {
+  static nb::callable dunderGetItemNamed(const std::string &attributeKind) {
     auto builder = PyGlobals::get().lookupAttributeBuilder(attributeKind);
     if (!builder)
       throw nb::key_error(attributeKind.c_str());
     return *builder;
   }
-  static void dundeSetItemNamed(const std::string &attributeKind,
+  static void dunderSetItemNamed(const std::string &attributeKind,
                                 nb::callable func, bool replace) {
     PyGlobals::get().registerAttributeBuilder(attributeKind, std::move(func),
                                               replace);
@@ -287,8 +287,8 @@ struct PyAttrBuilderMap {
   static void bind(nb::module_ &m) {
     nb::class_<PyAttrBuilderMap>(m, "AttrBuilder")
         .def_static("contains", &PyAttrBuilderMap::dunderContains)
-        .def_static("get", &PyAttrBuilderMap::dundeGetItemNamed)
-        .def_static("insert", &PyAttrBuilderMap::dundeSetItemNamed,
+        .def_static("get", &PyAttrBuilderMap::dunderGetItemNamed)
+        .def_static("insert", &PyAttrBuilderMap::dunderSetItemNamed,
                     "attribute_kind"_a, "attr_builder"_a, "replace"_a = false,
                     "Register an attribute builder for building MLIR "
                     "attributes from python values.");
@@ -2587,6 +2587,8 @@ private:
 //------------------------------------------------------------------------------
 
 void mlir::python::populateIRCore(nb::module_ &m) {
+  // disable leak warnings which tend to be false positives.
+  nb::set_leak_warnings(false);
   //----------------------------------------------------------------------------
   // Enums.
   //----------------------------------------------------------------------------
diff --git a/mlir/lib/CAPI/Dialect/LLVM.cpp b/mlir/lib/CAPI/Dialect/LLVM.cpp
index 6ed82ba..da450dd 100644
--- a/mlir/lib/CAPI/Dialect/LLVM.cpp
+++ b/mlir/lib/CAPI/Dialect/LLVM.cpp
@@ -55,6 +55,16 @@ MlirType mlirLLVMFunctionTypeGet(MlirType resultType, intptr_t nArgumentTypes,
       unwrapList(nArgumentTypes, argumentTypes, argumentStorage), isVarArg));
 }
 
+intptr_t mlirLLVMFunctionTypeGetNumInputs(MlirType type) {
+  return llvm::cast<LLVM::LLVMFunctionType>(unwrap(type)).getNumParams();
+}
+
+MlirType mlirLLVMFunctionTypeGetInput(MlirType type, intptr_t pos) {
+  assert(pos >= 0 && "pos in array must be positive");
+  return wrap(llvm::cast<LLVM::LLVMFunctionType>(unwrap(type))
+                  .getParamType(static_cast<unsigned>(pos)));
+}
+
 bool mlirTypeIsALLVMStructType(MlirType type) {
   return isa<LLVM::LLVMStructType>(unwrap(type));
 }
diff --git a/mlir/lib/Conversion/ControlFlowToLLVM/ControlFlowToLLVM.cpp b/mlir/lib/Conversion/ControlFlowToLLVM/ControlFlowToLLVM.cpp
index 8672e7b..d0ffb94 100644
--- a/mlir/lib/Conversion/ControlFlowToLLVM/ControlFlowToLLVM.cpp
+++ b/mlir/lib/Conversion/ControlFlowToLLVM/ControlFlowToLLVM.cpp
@@ -215,7 +215,6 @@ void mlir::cf::populateControlFlowToLLVMConversionPatterns(
     const LLVMTypeConverter &converter, RewritePatternSet &patterns) {
   // clang-format off
   patterns.add<
-      AssertOpLowering,
       BranchOpLowering,
       CondBranchOpLowering,
       SwitchOpLowering>(converter);
@@ -258,6 +257,7 @@ struct ConvertControlFlowToLLVM
     LLVMTypeConverter converter(ctx, options);
     RewritePatternSet patterns(ctx);
     mlir::cf::populateControlFlowToLLVMConversionPatterns(converter, patterns);
+    mlir::cf::populateAssertToLLVMConversionPattern(converter, patterns);
 
     if (failed(applyPartialConversion(getOperation(), target,
                                       std::move(patterns))))
@@ -286,6 +286,7 @@ struct ControlFlowToLLVMDialectInterface
       RewritePatternSet &patterns) const final {
     mlir::cf::populateControlFlowToLLVMConversionPatterns(typeConverter,
                                                           patterns);
+    mlir::cf::populateAssertToLLVMConversionPattern(typeConverter, patterns);
   }
 };
 } // namespace
diff --git a/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.cpp b/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.cpp
index b3c3fd4..544fc57 100644
--- a/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.cpp
+++ b/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.cpp
@@ -19,6 +19,59 @@
 
 using namespace mlir;
 
+LLVM::LLVMFuncOp mlir::getOrDefineFunction(gpu::GPUModuleOp moduleOp,
+                                           Location loc, OpBuilder &b,
+                                           StringRef name,
+                                           LLVM::LLVMFunctionType type) {
+  LLVM::LLVMFuncOp ret;
+  if (!(ret = moduleOp.template lookupSymbol<LLVM::LLVMFuncOp>(name))) {
+    OpBuilder::InsertionGuard guard(b);
+    b.setInsertionPointToStart(moduleOp.getBody());
+    ret = b.create<LLVM::LLVMFuncOp>(loc, name, type, LLVM::Linkage::External);
+  }
+  return ret;
+}
+
+static SmallString<16> getUniqueSymbolName(gpu::GPUModuleOp moduleOp,
+                                           StringRef prefix) {
+  // Get a unique global name.
+  unsigned stringNumber = 0;
+  SmallString<16> stringConstName;
+  do {
+    stringConstName.clear();
+    (prefix + Twine(stringNumber++)).toStringRef(stringConstName);
+  } while (moduleOp.lookupSymbol(stringConstName));
+  return stringConstName;
+}
+
+LLVM::GlobalOp
+mlir::getOrCreateStringConstant(OpBuilder &b, Location loc,
+                                gpu::GPUModuleOp moduleOp, Type llvmI8,
+                                StringRef namePrefix, StringRef str,
+                                uint64_t alignment, unsigned addrSpace) {
+  llvm::SmallString<20> nullTermStr(str);
+  nullTermStr.push_back('\0'); // Null terminate for C
+  auto globalType =
+      LLVM::LLVMArrayType::get(llvmI8, nullTermStr.size_in_bytes());
+  StringAttr attr = b.getStringAttr(nullTermStr);
+
+  // Try to find existing global.
+  for (auto globalOp : moduleOp.getOps<LLVM::GlobalOp>())
+    if (globalOp.getGlobalType() == globalType && globalOp.getConstant() &&
+        globalOp.getValueAttr() == attr &&
+        globalOp.getAlignment().value_or(0) == alignment &&
+        globalOp.getAddrSpace() == addrSpace)
+      return globalOp;
+
+  // Not found: create new global.
+  OpBuilder::InsertionGuard guard(b);
+  b.setInsertionPointToStart(moduleOp.getBody());
+  SmallString<16> name = getUniqueSymbolName(moduleOp, namePrefix);
+  return b.create<LLVM::GlobalOp>(loc, globalType,
+                                  /*isConstant=*/true, LLVM::Linkage::Internal,
+                                  name, attr, alignment, addrSpace);
+}
+
 LogicalResult
 GPUFuncOpLowering::matchAndRewrite(gpu::GPUFuncOp gpuFuncOp, OpAdaptor adaptor,
                                    ConversionPatternRewriter &rewriter) const {
@@ -328,61 +381,6 @@ GPUFuncOpLowering::matchAndRewrite(gpu::GPUFuncOp gpuFuncOp, OpAdaptor adaptor,
   return success();
 }
 
-static SmallString<16> getUniqueFormatGlobalName(gpu::GPUModuleOp moduleOp) {
-  const char formatStringPrefix[] = "printfFormat_";
-  // Get a unique global name.
-  unsigned stringNumber = 0;
-  SmallString<16> stringConstName;
-  do {
-    stringConstName.clear();
-    (formatStringPrefix + Twine(stringNumber++)).toStringRef(stringConstName);
-  } while (moduleOp.lookupSymbol(stringConstName));
-  return stringConstName;
-}
-
-/// Create an global that contains the given format string. If a global with
-/// the same format string exists already in the module, return that global.
-static LLVM::GlobalOp getOrCreateFormatStringConstant(
-    OpBuilder &b, Location loc, gpu::GPUModuleOp moduleOp, Type llvmI8,
-    StringRef str, uint64_t alignment = 0, unsigned addrSpace = 0) {
-  llvm::SmallString<20> formatString(str);
-  formatString.push_back('\0'); // Null terminate for C
-  auto globalType =
-      LLVM::LLVMArrayType::get(llvmI8, formatString.size_in_bytes());
-  StringAttr attr = b.getStringAttr(formatString);
-
-  // Try to find existing global.
-  for (auto globalOp : moduleOp.getOps<LLVM::GlobalOp>())
-    if (globalOp.getGlobalType() == globalType && globalOp.getConstant() &&
-        globalOp.getValueAttr() == attr &&
-        globalOp.getAlignment().value_or(0) == alignment &&
-        globalOp.getAddrSpace() == addrSpace)
-      return globalOp;
-
-  // Not found: create new global.
-  OpBuilder::InsertionGuard guard(b);
-  b.setInsertionPointToStart(moduleOp.getBody());
-  SmallString<16> name = getUniqueFormatGlobalName(moduleOp);
-  return b.create<LLVM::GlobalOp>(loc, globalType,
-                                  /*isConstant=*/true, LLVM::Linkage::Internal,
-                                  name, attr, alignment, addrSpace);
-}
-
-template <typename T>
-static LLVM::LLVMFuncOp getOrDefineFunction(T &moduleOp, const Location loc,
-                                            ConversionPatternRewriter &rewriter,
-                                            StringRef name,
-                                            LLVM::LLVMFunctionType type) {
-  LLVM::LLVMFuncOp ret;
-  if (!(ret = moduleOp.template lookupSymbol<LLVM::LLVMFuncOp>(name))) {
-    ConversionPatternRewriter::InsertionGuard guard(rewriter);
-    rewriter.setInsertionPointToStart(moduleOp.getBody());
-    ret = rewriter.create<LLVM::LLVMFuncOp>(loc, name, type,
-                                            LLVM::Linkage::External);
-  }
-  return ret;
-}
-
 LogicalResult GPUPrintfOpToHIPLowering::matchAndRewrite(
     gpu::PrintfOp gpuPrintfOp, gpu::PrintfOpAdaptor adaptor,
     ConversionPatternRewriter &rewriter) const {
@@ -420,8 +418,8 @@ LogicalResult GPUPrintfOpToHIPLowering::matchAndRewrite(
   Value printfDesc = printfBeginCall.getResult();
 
   // Create the global op or find an existing one.
-  LLVM::GlobalOp global = getOrCreateFormatStringConstant(
-      rewriter, loc, moduleOp, llvmI8, adaptor.getFormat());
+  LLVM::GlobalOp global = getOrCreateStringConstant(
+      rewriter, loc, moduleOp, llvmI8, "printfFormat_", adaptor.getFormat());
 
   // Get a pointer to the format string's first element and pass it to printf()
   Value globalPtr = rewriter.create<LLVM::AddressOfOp>(
@@ -502,9 +500,9 @@ LogicalResult GPUPrintfOpToLLVMCallLowering::matchAndRewrite(
       getOrDefineFunction(moduleOp, loc, rewriter, "printf", printfType);
 
   // Create the global op or find an existing one.
-  LLVM::GlobalOp global = getOrCreateFormatStringConstant(
-      rewriter, loc, moduleOp, llvmI8, adaptor.getFormat(), /*alignment=*/0,
-      addressSpace);
+  LLVM::GlobalOp global = getOrCreateStringConstant(
+      rewriter, loc, moduleOp, llvmI8, "printfFormat_", adaptor.getFormat(),
+      /*alignment=*/0, addressSpace);
 
   // Get a pointer to the format string's first element
   Value globalPtr = rewriter.create<LLVM::AddressOfOp>(
@@ -546,8 +544,8 @@ LogicalResult GPUPrintfOpToVPrintfLowering::matchAndRewrite(
       getOrDefineFunction(moduleOp, loc, rewriter, "vprintf", vprintfType);
 
   // Create the global op or find an existing one.
-  LLVM::GlobalOp global = getOrCreateFormatStringConstant(
-      rewriter, loc, moduleOp, llvmI8, adaptor.getFormat());
+  LLVM::GlobalOp global = getOrCreateStringConstant(
+      rewriter, loc, moduleOp, llvmI8, "printfFormat_", adaptor.getFormat());
 
   // Get a pointer to the format string's first element
   Value globalPtr = rewriter.create<LLVM::AddressOfOp>(loc, global);
diff --git a/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.h b/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.h
index 444a07a..e73a748 100644
--- a/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.h
+++ b/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.h
@@ -14,6 +14,27 @@
 
 namespace mlir {
 
+//===----------------------------------------------------------------------===//
+// Helper Functions
+//===----------------------------------------------------------------------===//
+
+/// Find or create an external function declaration in the given module.
+LLVM::LLVMFuncOp getOrDefineFunction(gpu::GPUModuleOp moduleOp, Location loc,
+                                     OpBuilder &b, StringRef name,
+                                     LLVM::LLVMFunctionType type);
+
+/// Create a global that contains the given string. If a global with the same
+/// string already exists in the module, return that global.
+LLVM::GlobalOp getOrCreateStringConstant(OpBuilder &b, Location loc,
+                                         gpu::GPUModuleOp moduleOp, Type llvmI8,
+                                         StringRef namePrefix, StringRef str,
+                                         uint64_t alignment = 0,
+                                         unsigned addrSpace = 0);
+
+//===----------------------------------------------------------------------===//
+// Lowering Patterns
+//===----------------------------------------------------------------------===//
+
 /// Lowering for gpu.dynamic.shared.memory to LLVM dialect. The pattern first
 /// create a 0-sized global array symbol similar as LLVM expects. It constructs
 /// a memref descriptor with these values and return it.
diff --git a/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp b/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp
index e022d3ce..2768929 100644
--- a/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp
+++ b/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp
@@ -25,6 +25,7 @@
 #include "mlir/Conversion/MemRefToLLVM/MemRefToLLVM.h"
 #include "mlir/Conversion/VectorToLLVM/ConvertVectorToLLVM.h"
 #include "mlir/Dialect/ControlFlow/IR/ControlFlow.h"
+#include "mlir/Dialect/ControlFlow/IR/ControlFlowOps.h"
 #include "mlir/Dialect/Func/IR/FuncOps.h"
 #include "mlir/Dialect/GPU/IR/GPUDialect.h"
 #include "mlir/Dialect/GPU/Transforms/Passes.h"
@@ -236,6 +237,103 @@ struct GPULaneIdOpToNVVM : ConvertOpToLLVMPattern<gpu::LaneIdOp> {
   }
 };
 
+/// Lowering of cf.assert into a conditional __assertfail.
+struct AssertOpToAssertfailLowering
+    : public ConvertOpToLLVMPattern<cf::AssertOp> {
+  using ConvertOpToLLVMPattern<cf::AssertOp>::ConvertOpToLLVMPattern;
+
+  LogicalResult
+  matchAndRewrite(cf::AssertOp assertOp, cf::AssertOpAdaptor adaptor,
+                  ConversionPatternRewriter &rewriter) const override {
+    MLIRContext *ctx = rewriter.getContext();
+    Location loc = assertOp.getLoc();
+    Type i8Type = typeConverter->convertType(rewriter.getIntegerType(8));
+    Type i32Type = typeConverter->convertType(rewriter.getIntegerType(32));
+    Type i64Type = typeConverter->convertType(rewriter.getIntegerType(64));
+    Type ptrType = LLVM::LLVMPointerType::get(ctx);
+    Type voidType = LLVM::LLVMVoidType::get(ctx);
+
+    // Find or create __assertfail function declaration.
+    auto moduleOp = assertOp->getParentOfType<gpu::GPUModuleOp>();
+    auto assertfailType = LLVM::LLVMFunctionType::get(
+        voidType, {ptrType, ptrType, i32Type, ptrType, i64Type});
+    LLVM::LLVMFuncOp assertfailDecl = getOrDefineFunction(
+        moduleOp, loc, rewriter, "__assertfail", assertfailType);
+    assertfailDecl.setPassthroughAttr(
+        ArrayAttr::get(ctx, StringAttr::get(ctx, "noreturn")));
+
+    // Split blocks and insert conditional branch.
+    // ^before:
+    //   ...
+    //   cf.cond_br %condition, ^after, ^assert
+    // ^assert:
+    //   cf.assert
+    //   cf.br ^after
+    // ^after:
+    //   ...
+    Block *beforeBlock = assertOp->getBlock();
+    Block *assertBlock =
+        rewriter.splitBlock(beforeBlock, assertOp->getIterator());
+    Block *afterBlock =
+        rewriter.splitBlock(assertBlock, ++assertOp->getIterator());
+    rewriter.setInsertionPointToEnd(beforeBlock);
+    rewriter.create<cf::CondBranchOp>(loc, adaptor.getArg(), afterBlock,
+                                      assertBlock);
+    rewriter.setInsertionPointToEnd(assertBlock);
+    rewriter.create<cf::BranchOp>(loc, afterBlock);
+
+    // Continue cf.assert lowering.
+    rewriter.setInsertionPoint(assertOp);
+
+    // Populate file name, file number and function name from the location of
+    // the AssertOp.
+    StringRef fileName = "(unknown)";
+    StringRef funcName = "(unknown)";
+    int32_t fileLine = 0;
+    while (auto callSiteLoc = dyn_cast<CallSiteLoc>(loc))
+      loc = callSiteLoc.getCallee();
+    if (auto fileLineColLoc = dyn_cast<FileLineColRange>(loc)) {
+      fileName = fileLineColLoc.getFilename().strref();
+      fileLine = fileLineColLoc.getStartLine();
+    } else if (auto nameLoc = dyn_cast<NameLoc>(loc)) {
+      funcName = nameLoc.getName().strref();
+      if (auto fileLineColLoc =
+              dyn_cast<FileLineColRange>(nameLoc.getChildLoc())) {
+        fileName = fileLineColLoc.getFilename().strref();
+        fileLine = fileLineColLoc.getStartLine();
+      }
+    }
+
+    // Create constants.
+    auto getGlobal = [&](LLVM::GlobalOp global) {
+      // Get a pointer to the format string's first element.
+      Value globalPtr = rewriter.create<LLVM::AddressOfOp>(
+          loc, LLVM::LLVMPointerType::get(ctx, global.getAddrSpace()),
+          global.getSymNameAttr());
+      Value start =
+          rewriter.create<LLVM::GEPOp>(loc, ptrType, global.getGlobalType(),
+                                       globalPtr, ArrayRef<LLVM::GEPArg>{0, 0});
+      return start;
+    };
+    Value assertMessage = getGlobal(getOrCreateStringConstant(
+        rewriter, loc, moduleOp, i8Type, "assert_message_", assertOp.getMsg()));
+    Value assertFile = getGlobal(getOrCreateStringConstant(
+        rewriter, loc, moduleOp, i8Type, "assert_file_", fileName));
+    Value assertFunc = getGlobal(getOrCreateStringConstant(
+        rewriter, loc, moduleOp, i8Type, "assert_func_", funcName));
+    Value assertLine =
+        rewriter.create<LLVM::ConstantOp>(loc, i32Type, fileLine);
+    Value c1 = rewriter.create<LLVM::ConstantOp>(loc, i64Type, 1);
+
+    // Insert function call to __assertfail.
+    SmallVector<Value> arguments{assertMessage, assertFile, assertLine,
+                                 assertFunc, c1};
+    rewriter.replaceOpWithNewOp<LLVM::CallOp>(assertOp, assertfailDecl,
+                                              arguments);
+    return success();
+  }
+};
+
 /// Import the GPU Ops to NVVM Patterns.
 #include "GPUToNVVM.cpp.inc"
 
@@ -358,7 +456,8 @@ void mlir::populateGpuToNVVMConversionPatterns(
   using gpu::index_lowering::IndexKind;
   using gpu::index_lowering::IntrType;
   populateWithGenerated(patterns);
-  patterns.add<GPUPrintfOpToVPrintfLowering>(converter);
+  patterns.add<GPUPrintfOpToVPrintfLowering, AssertOpToAssertfailLowering>(
+      converter);
   patterns.add<
       gpu::index_lowering::OpLowering<gpu::ThreadIdOp, NVVM::ThreadIdXOp,
                                       NVVM::ThreadIdYOp, NVVM::ThreadIdZOp>>(
diff --git a/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp b/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp
index d52a869..afebded 100644
--- a/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp
+++ b/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp
@@ -47,7 +47,6 @@
 
 #include "../GPUCommon/GPUOpsLowering.h"
 #include "../GPUCommon/IndexIntrinsicsOpLowering.h"
-#include "../GPUCommon/OpToFuncCallLowering.h"
 
 namespace mlir {
 #define GEN_PASS_DEF_CONVERTGPUOPSTOROCDLOPS
@@ -297,6 +296,7 @@ struct LowerGpuOpsToROCDLOpsPass
     populateVectorToLLVMConversionPatterns(converter, llvmPatterns);
     populateMathToLLVMConversionPatterns(converter, llvmPatterns);
     cf::populateControlFlowToLLVMConversionPatterns(converter, llvmPatterns);
+    cf::populateAssertToLLVMConversionPattern(converter, llvmPatterns);
     populateFuncToLLVMConversionPatterns(converter, llvmPatterns);
     populateFinalizeMemRefToLLVMConversionPatterns(converter, llvmPatterns);
     populateGpuToROCDLConversionPatterns(converter, llvmPatterns, runtime);
@@ -346,16 +346,6 @@ void mlir::configureGpuToROCDLConversionLegality(ConversionTarget &target) {
   target.addLegalOp<gpu::YieldOp, gpu::GPUModuleOp>();
 }
 
-template <typename OpTy>
-static void populateOpPatterns(const LLVMTypeConverter &converter,
-                               RewritePatternSet &patterns, StringRef f32Func,
-                               StringRef f64Func, StringRef f32ApproxFunc,
-                               StringRef f16Func) {
-  patterns.add<ScalarizeVectorOpLowering<OpTy>>(converter);
-  patterns.add<OpToFuncCallLowering<OpTy>>(converter, f32Func, f32ApproxFunc,
-                                           f16Func);
-}
-
 void mlir::populateGpuToROCDLConversionPatterns(
     const LLVMTypeConverter &converter, RewritePatternSet &patterns,
     mlir::gpu::amd::Runtime runtime) {
diff --git a/mlir/lib/Conversion/LLVMCommon/TypeConverter.cpp b/mlir/lib/Conversion/LLVMCommon/TypeConverter.cpp
index 49e2d94..72799e4 100644
--- a/mlir/lib/Conversion/LLVMCommon/TypeConverter.cpp
+++ b/mlir/lib/Conversion/LLVMCommon/TypeConverter.cpp
@@ -85,7 +85,7 @@ static Value unrankedMemRefMaterialization(OpBuilder &builder,
                                            UnrankedMemRefType resultType,
                                            ValueRange inputs, Location loc,
                                            const LLVMTypeConverter &converter) {
-  // An argument materialization must return a value of type
+  // A source materialization must return a value of type
   // `resultType`, so insert a cast from the memref descriptor type
   // (!llvm.struct) to the original memref type.
   Value packed =
@@ -101,7 +101,7 @@ static Value rankedMemRefMaterialization(OpBuilder &builder,
                                          MemRefType resultType,
                                          ValueRange inputs, Location loc,
                                          const LLVMTypeConverter &converter) {
-  // An argument materialization must return a value of type `resultType`,
+  // A source materialization must return a value of type `resultType`,
   // so insert a cast from the memref descriptor type (!llvm.struct) to the
   // original memref type.
   Value packed =
@@ -234,19 +234,9 @@ LLVMTypeConverter::LLVMTypeConverter(MLIRContext *ctx,
         .getResult(0);
   });
 
-  // Argument materializations convert from the new block argument types
+  // Source materializations convert from the new block argument types
   // (multiple SSA values that make up a memref descriptor) back to the
   // original block argument type.
-  addArgumentMaterialization([&](OpBuilder &builder,
-                                 UnrankedMemRefType resultType,
-                                 ValueRange inputs, Location loc) {
-    return unrankedMemRefMaterialization(builder, resultType, inputs, loc,
-                                         *this);
-  });
-  addArgumentMaterialization([&](OpBuilder &builder, MemRefType resultType,
-                                 ValueRange inputs, Location loc) {
-    return rankedMemRefMaterialization(builder, resultType, inputs, loc, *this);
-  });
   addSourceMaterialization([&](OpBuilder &builder,
                                UnrankedMemRefType resultType, ValueRange inputs,
                                Location loc) {
diff --git a/mlir/lib/Conversion/OpenMPToLLVM/OpenMPToLLVM.cpp b/mlir/lib/Conversion/OpenMPToLLVM/OpenMPToLLVM.cpp
index 58fd3d5..5d00039 100644
--- a/mlir/lib/Conversion/OpenMPToLLVM/OpenMPToLLVM.cpp
+++ b/mlir/lib/Conversion/OpenMPToLLVM/OpenMPToLLVM.cpp
@@ -304,6 +304,7 @@ void ConvertOpenMPToLLVMPass::runOnOperation() {
   LLVMTypeConverter converter(&getContext());
   arith::populateArithToLLVMConversionPatterns(converter, patterns);
   cf::populateControlFlowToLLVMConversionPatterns(converter, patterns);
+  cf::populateAssertToLLVMConversionPattern(converter, patterns);
   populateFinalizeMemRefToLLVMConversionPatterns(converter, patterns);
   populateFuncToLLVMConversionPatterns(converter, patterns);
   populateOpenMPToLLVMConversionPatterns(converter, patterns);
diff --git a/mlir/lib/Conversion/SCFToEmitC/CMakeLists.txt b/mlir/lib/Conversion/SCFToEmitC/CMakeLists.txt
index 79119d3..af5493b 100644
--- a/mlir/lib/Conversion/SCFToEmitC/CMakeLists.txt
+++ b/mlir/lib/Conversion/SCFToEmitC/CMakeLists.txt
@@ -13,6 +13,7 @@ add_mlir_conversion_library(MLIRSCFToEmitC
   LINK_LIBS PUBLIC
   MLIRArithDialect
   MLIREmitCDialect
+  MLIREmitCTransforms
   MLIRSCFDialect
   MLIRTransforms
   )
diff --git a/mlir/lib/Conversion/SCFToEmitC/SCFToEmitC.cpp b/mlir/lib/Conversion/SCFToEmitC/SCFToEmitC.cpp
index 67a43c4..92523ca 100644
--- a/mlir/lib/Conversion/SCFToEmitC/SCFToEmitC.cpp
+++ b/mlir/lib/Conversion/SCFToEmitC/SCFToEmitC.cpp
@@ -14,6 +14,7 @@
 
 #include "mlir/Dialect/Arith/IR/Arith.h"
 #include "mlir/Dialect/EmitC/IR/EmitC.h"
+#include "mlir/Dialect/EmitC/Transforms/TypeConversions.h"
 #include "mlir/Dialect/SCF/IR/SCF.h"
 #include "mlir/IR/Builders.h"
 #include "mlir/IR/BuiltinOps.h"
@@ -39,21 +40,22 @@ struct SCFToEmitCPass : public impl::SCFToEmitCBase<SCFToEmitCPass> {
 
 // Lower scf::for to emitc::for, implementing result values using
 // emitc::variable's updated within the loop body.
-struct ForLowering : public OpRewritePattern<ForOp> {
-  using OpRewritePattern<ForOp>::OpRewritePattern;
+struct ForLowering : public OpConversionPattern<ForOp> {
+  using OpConversionPattern<ForOp>::OpConversionPattern;
 
-  LogicalResult matchAndRewrite(ForOp forOp,
-                                PatternRewriter &rewriter) const override;
+  LogicalResult
+  matchAndRewrite(ForOp forOp, OpAdaptor adaptor,
+                  ConversionPatternRewriter &rewriter) const override;
 };
 
 // Create an uninitialized emitc::variable op for each result of the given op.
 template <typename T>
-static SmallVector<Value> createVariablesForResults(T op,
-                                                    PatternRewriter &rewriter) {
-  SmallVector<Value> resultVariables;
-
+static LogicalResult
+createVariablesForResults(T op, const TypeConverter *typeConverter,
+                          ConversionPatternRewriter &rewriter,
+                          SmallVector<Value> &resultVariables) {
   if (!op.getNumResults())
-    return resultVariables;
+    return success();
 
   Location loc = op->getLoc();
   MLIRContext *context = op.getContext();
@@ -62,7 +64,9 @@ static SmallVector<Value> createVariablesForResults(T op,
   rewriter.setInsertionPoint(op);
 
   for (OpResult result : op.getResults()) {
-    Type resultType = result.getType();
+    Type resultType = typeConverter->convertType(result.getType());
+    if (!resultType)
+      return rewriter.notifyMatchFailure(op, "result type conversion failed");
     Type varType = emitc::LValueType::get(resultType);
     emitc::OpaqueAttr noInit = emitc::OpaqueAttr::get(context, "");
     emitc::VariableOp var =
@@ -70,13 +74,13 @@ static SmallVector<Value> createVariablesForResults(T op,
     resultVariables.push_back(var);
   }
 
-  return resultVariables;
+  return success();
 }
 
 // Create a series of assign ops assigning given values to given variables at
 // the current insertion point of given rewriter.
-static void assignValues(ValueRange values, SmallVector<Value> &variables,
-                         PatternRewriter &rewriter, Location loc) {
+static void assignValues(ValueRange values, ValueRange variables,
+                         ConversionPatternRewriter &rewriter, Location loc) {
   for (auto [value, var] : llvm::zip(values, variables))
     rewriter.create<emitc::AssignOp>(loc, var, value);
 }
@@ -89,18 +93,25 @@ SmallVector<Value> loadValues(const SmallVector<Value> &variables,
   });
 }
 
-static void lowerYield(SmallVector<Value> &resultVariables,
-                       PatternRewriter &rewriter, scf::YieldOp yield) {
+static LogicalResult lowerYield(Operation *op, ValueRange resultVariables,
+                                ConversionPatternRewriter &rewriter,
+                                scf::YieldOp yield) {
   Location loc = yield.getLoc();
-  ValueRange operands = yield.getOperands();
 
   OpBuilder::InsertionGuard guard(rewriter);
   rewriter.setInsertionPoint(yield);
 
-  assignValues(operands, resultVariables, rewriter, loc);
+  SmallVector<Value> yieldOperands;
+  if (failed(rewriter.getRemappedValues(yield.getOperands(), yieldOperands))) {
+    return rewriter.notifyMatchFailure(op, "failed to lower yield operands");
+  }
+
+  assignValues(yieldOperands, resultVariables, rewriter, loc);
 
   rewriter.create<emitc::YieldOp>(loc);
   rewriter.eraseOp(yield);
+
+  return success();
 }
 
 // Lower the contents of an scf::if/scf::index_switch regions to an
@@ -108,27 +119,32 @@ static void lowerYield(SmallVector<Value> &resultVariables,
 // moved into the respective lowered region, but the scf::yield is replaced not
 // only with an emitc::yield, but also with a sequence of emitc::assign ops that
 // set the yielded values into the result variables.
-static void lowerRegion(SmallVector<Value> &resultVariables,
-                        PatternRewriter &rewriter, Region &region,
-                        Region &loweredRegion) {
+static LogicalResult lowerRegion(Operation *op, ValueRange resultVariables,
+                                 ConversionPatternRewriter &rewriter,
+                                 Region &region, Region &loweredRegion) {
   rewriter.inlineRegionBefore(region, loweredRegion, loweredRegion.end());
   Operation *terminator = loweredRegion.back().getTerminator();
-  lowerYield(resultVariables, rewriter, cast<scf::YieldOp>(terminator));
+  return lowerYield(op, resultVariables, rewriter,
+                    cast<scf::YieldOp>(terminator));
 }
 
-LogicalResult ForLowering::matchAndRewrite(ForOp forOp,
-                                           PatternRewriter &rewriter) const {
+LogicalResult
+ForLowering::matchAndRewrite(ForOp forOp, OpAdaptor adaptor,
+                             ConversionPatternRewriter &rewriter) const {
   Location loc = forOp.getLoc();
 
   // Create an emitc::variable op for each result. These variables will be
   // assigned to by emitc::assign ops within the loop body.
-  SmallVector<Value> resultVariables =
-      createVariablesForResults(forOp, rewriter);
+  SmallVector<Value> resultVariables;
+  if (failed(createVariablesForResults(forOp, getTypeConverter(), rewriter,
+                                       resultVariables)))
+    return rewriter.notifyMatchFailure(forOp,
+                                       "create variables for results failed");
 
-  assignValues(forOp.getInits(), resultVariables, rewriter, loc);
+  assignValues(adaptor.getInitArgs(), resultVariables, rewriter, loc);
 
   emitc::ForOp loweredFor = rewriter.create<emitc::ForOp>(
-      loc, forOp.getLowerBound(), forOp.getUpperBound(), forOp.getStep());
+      loc, adaptor.getLowerBound(), adaptor.getUpperBound(), adaptor.getStep());
 
   Block *loweredBody = loweredFor.getBody();
 
@@ -143,13 +159,27 @@ LogicalResult ForLowering::matchAndRewrite(ForOp forOp,
 
   rewriter.restoreInsertionPoint(ip);
 
+  // Convert the original region types into the new types by adding unrealized
+  // casts in the beginning of the loop. This performs the conversion in place.
+  if (failed(rewriter.convertRegionTypes(&forOp.getRegion(),
+                                         *getTypeConverter(), nullptr))) {
+    return rewriter.notifyMatchFailure(forOp, "region types conversion failed");
+  }
+
+  // Register the replacements for the block arguments and inline the body of
+  // the scf.for loop into the body of the emitc::for loop.
+  Block *scfBody = &(forOp.getRegion().front());
   SmallVector<Value> replacingValues;
   replacingValues.push_back(loweredFor.getInductionVar());
   replacingValues.append(iterArgsValues.begin(), iterArgsValues.end());
+  rewriter.mergeBlocks(scfBody, loweredBody, replacingValues);
 
-  rewriter.mergeBlocks(forOp.getBody(), loweredBody, replacingValues);
-  lowerYield(resultVariables, rewriter,
-             cast<scf::YieldOp>(loweredBody->getTerminator()));
+  auto result = lowerYield(forOp, resultVariables, rewriter,
+                           cast<scf::YieldOp>(loweredBody->getTerminator()));
+
+  if (failed(result)) {
+    return result;
+  }
 
   // Load variables into SSA values after the for loop.
   SmallVector<Value> resultValues = loadValues(resultVariables, rewriter, loc);
@@ -160,38 +190,66 @@ LogicalResult ForLowering::matchAndRewrite(ForOp forOp,
 
 // Lower scf::if to emitc::if, implementing result values as emitc::variable's
 // updated within the then and else regions.
-struct IfLowering : public OpRewritePattern<IfOp> {
-  using OpRewritePattern<IfOp>::OpRewritePattern;
+struct IfLowering : public OpConversionPattern<IfOp> {
+  using OpConversionPattern<IfOp>::OpConversionPattern;
 
-  LogicalResult matchAndRewrite(IfOp ifOp,
-                                PatternRewriter &rewriter) const override;
+  LogicalResult
+  matchAndRewrite(IfOp ifOp, OpAdaptor adaptor,
+                  ConversionPatternRewriter &rewriter) const override;
 };
 
 } // namespace
 
-LogicalResult IfLowering::matchAndRewrite(IfOp ifOp,
-                                          PatternRewriter &rewriter) const {
+LogicalResult
+IfLowering::matchAndRewrite(IfOp ifOp, OpAdaptor adaptor,
+                            ConversionPatternRewriter &rewriter) const {
   Location loc = ifOp.getLoc();
 
   // Create an emitc::variable op for each result. These variables will be
   // assigned to by emitc::assign ops within the then & else regions.
-  SmallVector<Value> resultVariables =
-      createVariablesForResults(ifOp, rewriter);
-
-  Region &thenRegion = ifOp.getThenRegion();
-  Region &elseRegion = ifOp.getElseRegion();
+  SmallVector<Value> resultVariables;
+  if (failed(createVariablesForResults(ifOp, getTypeConverter(), rewriter,
+                                       resultVariables)))
+    return rewriter.notifyMatchFailure(ifOp,
+                                       "create variables for results failed");
+
+  // Utility function to lower the contents of an scf::if region to an emitc::if
+  // region. The contents of the scf::if regions is moved into the respective
+  // emitc::if regions, but the scf::yield is replaced not only with an
+  // emitc::yield, but also with a sequence of emitc::assign ops that set the
+  // yielded values into the result variables.
+  auto lowerRegion = [&resultVariables, &rewriter,
+                      &ifOp](Region &region, Region &loweredRegion) {
+    rewriter.inlineRegionBefore(region, loweredRegion, loweredRegion.end());
+    Operation *terminator = loweredRegion.back().getTerminator();
+    auto result = lowerYield(ifOp, resultVariables, rewriter,
+                             cast<scf::YieldOp>(terminator));
+    if (failed(result)) {
+      return result;
+    }
+    return success();
+  };
+
+  Region &thenRegion = adaptor.getThenRegion();
+  Region &elseRegion = adaptor.getElseRegion();
 
   bool hasElseBlock = !elseRegion.empty();
 
   auto loweredIf =
-      rewriter.create<emitc::IfOp>(loc, ifOp.getCondition(), false, false);
+      rewriter.create<emitc::IfOp>(loc, adaptor.getCondition(), false, false);
 
   Region &loweredThenRegion = loweredIf.getThenRegion();
-  lowerRegion(resultVariables, rewriter, thenRegion, loweredThenRegion);
+  auto result = lowerRegion(thenRegion, loweredThenRegion);
+  if (failed(result)) {
+    return result;
+  }
 
   if (hasElseBlock) {
     Region &loweredElseRegion = loweredIf.getElseRegion();
-    lowerRegion(resultVariables, rewriter, elseRegion, loweredElseRegion);
+    auto result = lowerRegion(elseRegion, loweredElseRegion);
+    if (failed(result)) {
+      return result;
+    }
   }
 
   rewriter.setInsertionPointAfter(ifOp);
@@ -203,37 +261,46 @@ LogicalResult IfLowering::matchAndRewrite(IfOp ifOp,
 
 // Lower scf::index_switch to emitc::switch, implementing result values as
 // emitc::variable's updated within the case and default regions.
-struct IndexSwitchOpLowering : public OpRewritePattern<IndexSwitchOp> {
-  using OpRewritePattern<IndexSwitchOp>::OpRewritePattern;
+struct IndexSwitchOpLowering : public OpConversionPattern<IndexSwitchOp> {
+  using OpConversionPattern::OpConversionPattern;
 
-  LogicalResult matchAndRewrite(IndexSwitchOp indexSwitchOp,
-                                PatternRewriter &rewriter) const override;
+  LogicalResult
+  matchAndRewrite(IndexSwitchOp indexSwitchOp, OpAdaptor adaptor,
+                  ConversionPatternRewriter &rewriter) const override;
 };
 
-LogicalResult
-IndexSwitchOpLowering::matchAndRewrite(IndexSwitchOp indexSwitchOp,
-                                       PatternRewriter &rewriter) const {
+LogicalResult IndexSwitchOpLowering::matchAndRewrite(
+    IndexSwitchOp indexSwitchOp, OpAdaptor adaptor,
+    ConversionPatternRewriter &rewriter) const {
   Location loc = indexSwitchOp.getLoc();
 
   // Create an emitc::variable op for each result. These variables will be
   // assigned to by emitc::assign ops within the case and default regions.
-  SmallVector<Value> resultVariables =
-      createVariablesForResults(indexSwitchOp, rewriter);
+  SmallVector<Value> resultVariables;
+  if (failed(createVariablesForResults(indexSwitchOp, getTypeConverter(),
+                                       rewriter, resultVariables))) {
+    return rewriter.notifyMatchFailure(indexSwitchOp,
+                                       "create variables for results failed");
+  }
 
   auto loweredSwitch = rewriter.create<emitc::SwitchOp>(
-      loc, indexSwitchOp.getArg(), indexSwitchOp.getCases(),
-      indexSwitchOp.getNumCases());
+      loc, adaptor.getArg(), adaptor.getCases(), indexSwitchOp.getNumCases());
 
   // Lowering all case regions.
-  for (auto pair : llvm::zip(indexSwitchOp.getCaseRegions(),
-                             loweredSwitch.getCaseRegions())) {
-    lowerRegion(resultVariables, rewriter, std::get<0>(pair),
-                std::get<1>(pair));
+  for (auto pair :
+       llvm::zip(adaptor.getCaseRegions(), loweredSwitch.getCaseRegions())) {
+    if (failed(lowerRegion(indexSwitchOp, resultVariables, rewriter,
+                           *std::get<0>(pair), std::get<1>(pair)))) {
+      return failure();
+    }
   }
 
   // Lowering default region.
-  lowerRegion(resultVariables, rewriter, indexSwitchOp.getDefaultRegion(),
-              loweredSwitch.getDefaultRegion());
+  if (failed(lowerRegion(indexSwitchOp, resultVariables, rewriter,
+                         adaptor.getDefaultRegion(),
+                         loweredSwitch.getDefaultRegion()))) {
+    return failure();
+  }
 
   rewriter.setInsertionPointAfter(indexSwitchOp);
   SmallVector<Value> results = loadValues(resultVariables, rewriter, loc);
@@ -242,15 +309,22 @@ IndexSwitchOpLowering::matchAndRewrite(IndexSwitchOp indexSwitchOp,
   return success();
 }
 
-void mlir::populateSCFToEmitCConversionPatterns(RewritePatternSet &patterns) {
-  patterns.add<ForLowering>(patterns.getContext());
-  patterns.add<IfLowering>(patterns.getContext());
-  patterns.add<IndexSwitchOpLowering>(patterns.getContext());
+void mlir::populateSCFToEmitCConversionPatterns(RewritePatternSet &patterns,
+                                                TypeConverter &typeConverter) {
+  patterns.add<ForLowering>(typeConverter, patterns.getContext());
+  patterns.add<IfLowering>(typeConverter, patterns.getContext());
+  patterns.add<IndexSwitchOpLowering>(typeConverter, patterns.getContext());
 }
 
 void SCFToEmitCPass::runOnOperation() {
   RewritePatternSet patterns(&getContext());
-  populateSCFToEmitCConversionPatterns(patterns);
+  TypeConverter typeConverter;
+  // Fallback converter
+  // See note https://mlir.llvm.org/docs/DialectConversion/#type-converter
+  // Type converters are called most to least recently inserted
+  typeConverter.addConversion([](Type t) { return t; });
+  populateEmitCSizeTTypeConversions(typeConverter);
+  populateSCFToEmitCConversionPatterns(patterns, typeConverter);
 
   // Configure conversion to lower out SCF operations.
   ConversionTarget target(getContext());
diff --git a/mlir/lib/Conversion/TosaToTensor/TosaToTensor.cpp b/mlir/lib/Conversion/TosaToTensor/TosaToTensor.cpp
index 6f085cb..b5a0da1 100644
--- a/mlir/lib/Conversion/TosaToTensor/TosaToTensor.cpp
+++ b/mlir/lib/Conversion/TosaToTensor/TosaToTensor.cpp
@@ -338,11 +338,6 @@ public:
           padOp, "tosa.pad was unable to determine the pad constant value.");
     }
 
-    Value lowIndex =
-        rewriter.create<arith::ConstantOp>(loc, rewriter.getIndexAttr(0));
-    Value highIndex =
-        rewriter.create<arith::ConstantOp>(loc, rewriter.getIndexAttr(1));
-
     SmallVector<OpFoldResult, 3> lowValues;
     SmallVector<OpFoldResult, 3> highValues;
 
@@ -350,11 +345,12 @@ public:
     highValues.reserve(rank);
 
     for (int i = 0; i < rank; i++) {
-      Value inputIndex = rewriter.create<arith::ConstantIndexOp>(loc, i);
+      Value lowIndex = rewriter.create<arith::ConstantIndexOp>(loc, 2 * i);
+      Value highIndex = rewriter.create<arith::ConstantIndexOp>(loc, 2 * i + 1);
       Value lowVal = rewriter.createOrFold<tensor::ExtractOp>(
-          loc, padding, ValueRange({inputIndex, lowIndex}));
+          loc, padding, ValueRange({lowIndex}));
       Value highVal = rewriter.createOrFold<tensor::ExtractOp>(
-          loc, padding, ValueRange({inputIndex, highIndex}));
+          loc, padding, ValueRange({highIndex}));
 
       lowVal = rewriter.createOrFold<arith::IndexCastOp>(
           loc, rewriter.getIndexType(), lowVal);
diff --git a/mlir/lib/Dialect/Affine/IR/AffineOps.cpp b/mlir/lib/Dialect/Affine/IR/AffineOps.cpp
index dceebbf..b45829b 100644
--- a/mlir/lib/Dialect/Affine/IR/AffineOps.cpp
+++ b/mlir/lib/Dialect/Affine/IR/AffineOps.cpp
@@ -4520,6 +4520,10 @@ void AffineDelinearizeIndexOp::build(OpBuilder &odsBuilder,
                                      OperationState &odsState,
                                      Value linearIndex, ValueRange basis,
                                      bool hasOuterBound) {
+  if (hasOuterBound && !basis.empty() && basis.front() == nullptr) {
+    hasOuterBound = false;
+    basis = basis.drop_front();
+  }
   SmallVector<Value> dynamicBasis;
   SmallVector<int64_t> staticBasis;
   dispatchIndexOpFoldResults(getAsOpFoldResult(basis), dynamicBasis,
@@ -4533,6 +4537,10 @@ void AffineDelinearizeIndexOp::build(OpBuilder &odsBuilder,
                                      Value linearIndex,
                                      ArrayRef<OpFoldResult> basis,
                                      bool hasOuterBound) {
+  if (hasOuterBound && !basis.empty() && basis.front() == OpFoldResult()) {
+    hasOuterBound = false;
+    basis = basis.drop_front();
+  }
   SmallVector<Value> dynamicBasis;
   SmallVector<int64_t> staticBasis;
   dispatchIndexOpFoldResults(basis, dynamicBasis, staticBasis);
@@ -4654,6 +4662,13 @@ SmallVector<OpFoldResult> AffineDelinearizeIndexOp::getEffectiveBasis() {
   return getMixedValues(getStaticBasis(), getDynamicBasis(), builder);
 }
 
+SmallVector<OpFoldResult> AffineDelinearizeIndexOp::getPaddedBasis() {
+  SmallVector<OpFoldResult> ret = getMixedBasis();
+  if (!hasOuterBound())
+    ret.insert(ret.begin(), OpFoldResult());
+  return ret;
+}
+
 namespace {
 
 // Drops delinearization indices that correspond to unit-extent basis
@@ -4672,25 +4687,27 @@ struct DropUnitExtentBasis
       return zero.value();
     };
 
-    bool hasOuterBound = delinearizeOp.hasOuterBound();
     // Replace all indices corresponding to unit-extent basis with 0.
     // Remaining basis can be used to get a new `affine.delinearize_index` op.
     SmallVector<OpFoldResult> newBasis;
-    for (auto [index, basis] : llvm::enumerate(delinearizeOp.getMixedBasis())) {
-      std::optional<int64_t> basisVal = getConstantIntValue(basis);
+    for (auto [index, basis] :
+         llvm::enumerate(delinearizeOp.getPaddedBasis())) {
+      std::optional<int64_t> basisVal =
+          basis ? getConstantIntValue(basis) : std::nullopt;
       if (basisVal && *basisVal == 1)
-        replacements[index + (hasOuterBound ? 0 : 1)] = getZero();
+        replacements[index] = getZero();
       else
         newBasis.push_back(basis);
     }
 
-    if (newBasis.size() == delinearizeOp.getStaticBasis().size())
+    if (newBasis.size() == delinearizeOp.getNumResults())
       return rewriter.notifyMatchFailure(delinearizeOp,
                                          "no unit basis elements");
 
-    if (!newBasis.empty() || !hasOuterBound) {
+    if (!newBasis.empty()) {
+      // Will drop the leading nullptr from `basis` if there was no outer bound.
       auto newDelinearizeOp = rewriter.create<affine::AffineDelinearizeIndexOp>(
-          loc, delinearizeOp.getLinearIndex(), newBasis, hasOuterBound);
+          loc, delinearizeOp.getLinearIndex(), newBasis);
       int newIndex = 0;
       // Map back the new delinearized indices to the values they replace.
       for (auto &replacement : replacements) {
@@ -4871,6 +4888,8 @@ void AffineLinearizeIndexOp::build(OpBuilder &odsBuilder,
                                    OperationState &odsState,
                                    ValueRange multiIndex, ValueRange basis,
                                    bool disjoint) {
+  if (!basis.empty() && basis.front() == Value())
+    basis = basis.drop_front();
   SmallVector<Value> dynamicBasis;
   SmallVector<int64_t> staticBasis;
   dispatchIndexOpFoldResults(getAsOpFoldResult(basis), dynamicBasis,
@@ -4883,6 +4902,8 @@ void AffineLinearizeIndexOp::build(OpBuilder &odsBuilder,
                                    ValueRange multiIndex,
                                    ArrayRef<OpFoldResult> basis,
                                    bool disjoint) {
+  if (!basis.empty() && basis.front() == OpFoldResult())
+    basis = basis.drop_front();
   SmallVector<Value> dynamicBasis;
   SmallVector<int64_t> staticBasis;
   dispatchIndexOpFoldResults(basis, dynamicBasis, staticBasis);
@@ -4965,7 +4986,14 @@ SmallVector<OpFoldResult> AffineLinearizeIndexOp::getEffectiveBasis() {
                           builder);
   }
 
-  return ::mlir::getMixedValues(getStaticBasis(), getDynamicBasis(), builder);
+  return getMixedValues(getStaticBasis(), getDynamicBasis(), builder);
+}
+
+SmallVector<OpFoldResult> AffineLinearizeIndexOp::getPaddedBasis() {
+  SmallVector<OpFoldResult> ret = getMixedBasis();
+  if (!hasOuterBound())
+    ret.insert(ret.begin(), OpFoldResult());
+  return ret;
 }
 
 namespace {
@@ -5027,38 +5055,228 @@ struct DropLinearizeUnitComponentsIfDisjointOrZero final
   }
 };
 
-/// Cancel out linearize_index(delinearize_index(x, B), B).
+/// Return the product of `terms`, creating an `affine.apply` if any of them are
+/// non-constant values. If any of `terms` is `nullptr`, return `nullptr`.
+static OpFoldResult computeProduct(Location loc, OpBuilder &builder,
+                                   ArrayRef<OpFoldResult> terms) {
+  int64_t nDynamic = 0;
+  SmallVector<Value> dynamicPart;
+  AffineExpr result = builder.getAffineConstantExpr(1);
+  for (OpFoldResult term : terms) {
+    if (!term)
+      return term;
+    std::optional<int64_t> maybeConst = getConstantIntValue(term);
+    if (maybeConst) {
+      result = result * builder.getAffineConstantExpr(*maybeConst);
+    } else {
+      dynamicPart.push_back(term.get<Value>());
+      result = result * builder.getAffineSymbolExpr(nDynamic++);
+    }
+  }
+  if (auto constant = dyn_cast<AffineConstantExpr>(result))
+    return getAsIndexOpFoldResult(builder.getContext(), constant.getValue());
+  return builder.create<AffineApplyOp>(loc, result, dynamicPart).getResult();
+}
+
+/// If conseceutive outputs of a delinearize_index are linearized with the same
+/// bounds, canonicalize away the redundant arithmetic.
+///
+/// That is, if we have
+/// ```
+/// %s:N = affine.delinearize_index %x into (...a, B1, B2, ... BK, ...b)
+/// %t = affine.linearize_index [...c, %s#I, %s#(I + 1), ... %s#(I+K-1), ...d]
+///   by (...e, B1, B2, ..., BK, ...f)
+/// ```
 ///
-/// That is, rewrite
+/// We can rewrite this to
 /// ```
-/// %0:N = affine.delinearize_index %x by (%b1, %b2, ... %bN)
-/// %y = affine.linearize_index [%0#0, %0#1, ... %0#(N-1)] by (%b1, %b2, ...
-/// %bN)
+/// B = B1 * B2 ... BK
+/// %sMerged:(N-K+1) affine.delinearize_index %x into (...a, B, ...b)
+/// %t = affine.linearize_index [...c, %s#I, ...d] by (...e, B, ...f)
 /// ```
-/// to replacing `%y` with `%x`.
-struct CancelLinearizeOfDelinearizeExact final
+/// where we replace all results of %s unaffected by the change with results
+/// from %sMerged.
+///
+/// As a special case, if all results of the delinearize are merged in this way
+/// we can replace those usages with %x, thus cancelling the delinearization
+/// entirely, as in
+/// ```
+/// %s:3 = affine.delinearize_index %x into (2, 4, 8)
+/// %t = affine.linearize_index [%s#0, %s#1, %s#2, %c0] by (2, 4, 8, 16)
+/// ```
+/// becoming `%t = affine.linearize_index [%x, %c0] by (64, 16)`
+struct CancelLinearizeOfDelinearizePortion final
     : OpRewritePattern<affine::AffineLinearizeIndexOp> {
   using OpRewritePattern::OpRewritePattern;
 
+private:
+  // Struct representing a case where the cancellation pattern
+  // applies. A `Match` means that `length` inputs to the linearize operation
+  // starting at `linStart` can be cancelled with `length` outputs of
+  // `delinearize`, starting from `delinStart`.
+  struct Match {
+    AffineDelinearizeIndexOp delinearize;
+    unsigned linStart = 0;
+    unsigned delinStart = 0;
+    unsigned length = 0;
+  };
+
+public:
   LogicalResult matchAndRewrite(affine::AffineLinearizeIndexOp linearizeOp,
                                 PatternRewriter &rewriter) const override {
-    auto delinearizeOp = linearizeOp.getMultiIndex()
-                             .front()
-                             .getDefiningOp<affine::AffineDelinearizeIndexOp>();
-    if (!delinearizeOp)
-      return rewriter.notifyMatchFailure(
-          linearizeOp, "last entry doesn't come from a delinearize");
+    SmallVector<Match> matches;
+
+    const SmallVector<OpFoldResult> linBasis = linearizeOp.getPaddedBasis();
+    ArrayRef<OpFoldResult> linBasisRef = linBasis;
+
+    ValueRange multiIndex = linearizeOp.getMultiIndex();
+    unsigned numLinArgs = multiIndex.size();
+    unsigned linArgIdx = 0;
+    // We only want to replace one run from the same delinearize op per
+    // pattern invocation lest we run into invalidation issues.
+    llvm::SmallPtrSet<Operation *, 2> alreadyMatchedDelinearize;
+    while (linArgIdx < numLinArgs) {
+      auto asResult = dyn_cast<OpResult>(multiIndex[linArgIdx]);
+      if (!asResult) {
+        linArgIdx++;
+        continue;
+      }
 
-    if (linearizeOp.getEffectiveBasis() != delinearizeOp.getEffectiveBasis())
-      return rewriter.notifyMatchFailure(
-          linearizeOp, "basis of linearize and delinearize don't match exactly "
-                       "(excluding outer bounds)");
+      auto delinearizeOp =
+          dyn_cast<AffineDelinearizeIndexOp>(asResult.getOwner());
+      if (!delinearizeOp) {
+        linArgIdx++;
+        continue;
+      }
+
+      /// Result 0 of the delinearize and argument 0 of the linearize can
+      /// leave their maximum value unspecified. However, even if this happens
+      /// we can still sometimes start the match process. Specifically, if
+      /// - The argument we're matching is result 0 and argument 0 (so the
+      /// bounds don't matter). For example,
+      ///
+      ///     %0:2 = affine.delinearize_index %x into (8) : index, index
+      ///     %1 = affine.linearize_index [%s#0, %s#1, ...] (8, ...)
+      /// allows cancellation
+      /// - The delinearization doesn't specify a bound, but the linearization
+      ///  is `disjoint`, which asserts that the bound on the linearization is
+      ///  correct.
+      unsigned delinArgIdx = asResult.getResultNumber();
+      SmallVector<OpFoldResult> delinBasis = delinearizeOp.getPaddedBasis();
+      OpFoldResult firstDelinBound = delinBasis[delinArgIdx];
+      OpFoldResult firstLinBound = linBasis[linArgIdx];
+      bool boundsMatch = firstDelinBound == firstLinBound;
+      bool bothAtFront = linArgIdx == 0 && delinArgIdx == 0;
+      bool knownByDisjoint =
+          linearizeOp.getDisjoint() && delinArgIdx == 0 && !firstDelinBound;
+      if (!boundsMatch && !bothAtFront && !knownByDisjoint) {
+        linArgIdx++;
+        continue;
+      }
+
+      unsigned j = 1;
+      unsigned numDelinOuts = delinearizeOp.getNumResults();
+      for (; j + linArgIdx < numLinArgs && j + delinArgIdx < numDelinOuts;
+           ++j) {
+        if (multiIndex[linArgIdx + j] !=
+            delinearizeOp.getResult(delinArgIdx + j))
+          break;
+        if (linBasis[linArgIdx + j] != delinBasis[delinArgIdx + j])
+          break;
+      }
+      // If there're multiple matches against the same delinearize_index,
+      // only rewrite the first one we find to prevent invalidations. The next
+      // ones will be taken care of by subsequent pattern invocations.
+      if (j <= 1 || !alreadyMatchedDelinearize.insert(delinearizeOp).second) {
+        linArgIdx++;
+        continue;
+      }
+      matches.push_back(Match{delinearizeOp, linArgIdx, delinArgIdx, j});
+      linArgIdx += j;
+    }
 
-    if (delinearizeOp.getResults() != linearizeOp.getMultiIndex())
+    if (matches.empty())
       return rewriter.notifyMatchFailure(
-          linearizeOp, "not all indices come from delinearize");
+          linearizeOp, "no run of delinearize outputs to deal with");
+
+    // Record all the delinearize replacements so we can do them after creating
+    // the new linearization operation, since the new operation might use
+    // outputs of something we're replacing.
+    SmallVector<SmallVector<Value>> delinearizeReplacements;
+
+    SmallVector<Value> newIndex;
+    newIndex.reserve(numLinArgs);
+    SmallVector<OpFoldResult> newBasis;
+    newBasis.reserve(numLinArgs);
+    unsigned prevMatchEnd = 0;
+    for (Match m : matches) {
+      unsigned gap = m.linStart - prevMatchEnd;
+      llvm::append_range(newIndex, multiIndex.slice(prevMatchEnd, gap));
+      llvm::append_range(newBasis, linBasisRef.slice(prevMatchEnd, gap));
+      // Update here so we don't forget this during early continues
+      prevMatchEnd = m.linStart + m.length;
+
+      PatternRewriter::InsertionGuard g(rewriter);
+      rewriter.setInsertionPoint(m.delinearize);
+
+      ArrayRef<OpFoldResult> basisToMerge =
+          linBasisRef.slice(m.linStart, m.length);
+      // We use the slice from the linearize's basis above because of the
+      // "bounds inferred from `disjoint`" case above.
+      OpFoldResult newSize =
+          computeProduct(linearizeOp.getLoc(), rewriter, basisToMerge);
+
+      // Trivial case where we can just skip past the delinearize all together
+      if (m.length == m.delinearize.getNumResults()) {
+        newIndex.push_back(m.delinearize.getLinearIndex());
+        newBasis.push_back(newSize);
+        // Pad out set of replacements so we don't do anything with this one.
+        delinearizeReplacements.push_back(SmallVector<Value>());
+        continue;
+      }
+
+      SmallVector<Value> newDelinResults;
+      SmallVector<OpFoldResult> newDelinBasis = m.delinearize.getPaddedBasis();
+      newDelinBasis.erase(newDelinBasis.begin() + m.delinStart,
+                          newDelinBasis.begin() + m.delinStart + m.length);
+      newDelinBasis.insert(newDelinBasis.begin() + m.delinStart, newSize);
+      auto newDelinearize = rewriter.create<AffineDelinearizeIndexOp>(
+          m.delinearize.getLoc(), m.delinearize.getLinearIndex(),
+          newDelinBasis);
+
+      // Since there may be other uses of the indices we just merged together,
+      // create a residual affine.delinearize_index that delinearizes the
+      // merged output into its component parts.
+      Value combinedElem = newDelinearize.getResult(m.delinStart);
+      auto residualDelinearize = rewriter.create<AffineDelinearizeIndexOp>(
+          m.delinearize.getLoc(), combinedElem, basisToMerge);
+
+      // Swap all the uses of the unaffected delinearize outputs to the new
+      // delinearization so that the old code can be removed if this
+      // linearize_index is the only user of the merged results.
+      llvm::append_range(newDelinResults,
+                         newDelinearize.getResults().take_front(m.delinStart));
+      llvm::append_range(newDelinResults, residualDelinearize.getResults());
+      llvm::append_range(
+          newDelinResults,
+          newDelinearize.getResults().drop_front(m.delinStart + 1));
+
+      delinearizeReplacements.push_back(newDelinResults);
+      newIndex.push_back(combinedElem);
+      newBasis.push_back(newSize);
+    }
+    llvm::append_range(newIndex, multiIndex.drop_front(prevMatchEnd));
+    llvm::append_range(newBasis, linBasisRef.drop_front(prevMatchEnd));
+    rewriter.replaceOpWithNewOp<AffineLinearizeIndexOp>(
+        linearizeOp, newIndex, newBasis, linearizeOp.getDisjoint());
+
+    for (auto [m, newResults] :
+         llvm::zip_equal(matches, delinearizeReplacements)) {
+      if (newResults.empty())
+        continue;
+      rewriter.replaceOp(m.delinearize, newResults);
+    }
 
-    rewriter.replaceOp(linearizeOp, delinearizeOp.getLinearIndex());
     return success();
   }
 };
@@ -5096,7 +5314,7 @@ struct DropLinearizeLeadingZero final
 
 void affine::AffineLinearizeIndexOp::getCanonicalizationPatterns(
     RewritePatternSet &patterns, MLIRContext *context) {
-  patterns.add<CancelLinearizeOfDelinearizeExact, DropLinearizeLeadingZero,
+  patterns.add<CancelLinearizeOfDelinearizePortion, DropLinearizeLeadingZero,
                DropLinearizeUnitComponentsIfDisjointOrZero>(context);
 }
 
diff --git a/mlir/lib/Dialect/Affine/IR/ValueBoundsOpInterfaceImpl.cpp b/mlir/lib/Dialect/Affine/IR/ValueBoundsOpInterfaceImpl.cpp
index 82a9fb0..e93b99b 100644
--- a/mlir/lib/Dialect/Affine/IR/ValueBoundsOpInterfaceImpl.cpp
+++ b/mlir/lib/Dialect/Affine/IR/ValueBoundsOpInterfaceImpl.cpp
@@ -91,6 +91,64 @@ struct AffineMaxOpInterface
   };
 };
 
+struct AffineDelinearizeIndexOpInterface
+    : public ValueBoundsOpInterface::ExternalModel<
+          AffineDelinearizeIndexOpInterface, AffineDelinearizeIndexOp> {
+  void populateBoundsForIndexValue(Operation *rawOp, Value value,
+                                   ValueBoundsConstraintSet &cstr) const {
+    auto op = cast<AffineDelinearizeIndexOp>(rawOp);
+    auto result = cast<OpResult>(value);
+    assert(result.getOwner() == rawOp &&
+           "bounded value isn't a result of this delinearize_index");
+    unsigned resIdx = result.getResultNumber();
+
+    AffineExpr linearIdx = cstr.getExpr(op.getLinearIndex());
+
+    SmallVector<OpFoldResult> basis = op.getPaddedBasis();
+    AffineExpr divisor = cstr.getExpr(1);
+    for (OpFoldResult basisElem : llvm::drop_begin(basis, resIdx + 1))
+      divisor = divisor * cstr.getExpr(basisElem);
+
+    if (resIdx == 0) {
+      cstr.bound(value) == linearIdx.floorDiv(divisor);
+      if (!basis.front().isNull())
+        cstr.bound(value) < cstr.getExpr(basis.front());
+      return;
+    }
+    AffineExpr thisBasis = cstr.getExpr(basis[resIdx]);
+    cstr.bound(value) == (linearIdx % (thisBasis * divisor)).floorDiv(divisor);
+  }
+};
+
+struct AffineLinearizeIndexOpInterface
+    : public ValueBoundsOpInterface::ExternalModel<
+          AffineLinearizeIndexOpInterface, AffineLinearizeIndexOp> {
+  void populateBoundsForIndexValue(Operation *rawOp, Value value,
+                                   ValueBoundsConstraintSet &cstr) const {
+    auto op = cast<AffineLinearizeIndexOp>(rawOp);
+    assert(value == op.getResult() &&
+           "value isn't the result of this linearize");
+
+    AffineExpr bound = cstr.getExpr(0);
+    AffineExpr stride = cstr.getExpr(1);
+    SmallVector<OpFoldResult> basis = op.getPaddedBasis();
+    OperandRange multiIndex = op.getMultiIndex();
+    unsigned numArgs = multiIndex.size();
+    for (auto [revArgNum, length] : llvm::enumerate(llvm::reverse(basis))) {
+      unsigned argNum = numArgs - (revArgNum + 1);
+      if (argNum == 0)
+        break;
+      OpFoldResult indexAsFoldRes = getAsOpFoldResult(multiIndex[argNum]);
+      bound = bound + cstr.getExpr(indexAsFoldRes) * stride;
+      stride = stride * cstr.getExpr(length);
+    }
+    bound = bound + cstr.getExpr(op.getMultiIndex().front()) * stride;
+    cstr.bound(value) == bound;
+    if (op.getDisjoint() && !basis.front().isNull()) {
+      cstr.bound(value) < stride *cstr.getExpr(basis.front());
+    }
+  }
+};
 } // namespace
 } // namespace mlir
 
@@ -100,6 +158,10 @@ void mlir::affine::registerValueBoundsOpInterfaceExternalModels(
     AffineApplyOp::attachInterface<AffineApplyOpInterface>(*ctx);
     AffineMaxOp::attachInterface<AffineMaxOpInterface>(*ctx);
     AffineMinOp::attachInterface<AffineMinOpInterface>(*ctx);
+    AffineDelinearizeIndexOp::attachInterface<
+        AffineDelinearizeIndexOpInterface>(*ctx);
+    AffineLinearizeIndexOp::attachInterface<AffineLinearizeIndexOpInterface>(
+        *ctx);
   });
 }
 
diff --git a/mlir/lib/Dialect/Affine/Utils/LoopUtils.cpp b/mlir/lib/Dialect/Affine/Utils/LoopUtils.cpp
index 0f2c889..4e02559 100644
--- a/mlir/lib/Dialect/Affine/Utils/LoopUtils.cpp
+++ b/mlir/lib/Dialect/Affine/Utils/LoopUtils.cpp
@@ -919,8 +919,9 @@ static void generateUnrolledLoop(
   // 'forOp'.
   auto builder = OpBuilder::atBlockTerminator(loopBodyBlock);
 
+  constexpr auto defaultAnnotateFn = [](unsigned, Operation *, OpBuilder) {};
   if (!annotateFn)
-    annotateFn = [](unsigned, Operation *, OpBuilder) {};
+    annotateFn = defaultAnnotateFn;
 
   // Keep a pointer to the last non-terminator operation in the original block
   // so that we know what to clone (since we are doing this in-place).
diff --git a/mlir/lib/Dialect/Affine/Utils/Utils.cpp b/mlir/lib/Dialect/Affine/Utils/Utils.cpp
index 4d3ead20..9e3257a 100644
--- a/mlir/lib/Dialect/Affine/Utils/Utils.cpp
+++ b/mlir/lib/Dialect/Affine/Utils/Utils.cpp
@@ -51,12 +51,14 @@ public:
         loc(loc) {}
 
   template <typename OpTy>
-  Value buildBinaryExpr(AffineBinaryOpExpr expr) {
+  Value buildBinaryExpr(AffineBinaryOpExpr expr,
+                        arith::IntegerOverflowFlags overflowFlags =
+                            arith::IntegerOverflowFlags::none) {
     auto lhs = visit(expr.getLHS());
     auto rhs = visit(expr.getRHS());
     if (!lhs || !rhs)
       return nullptr;
-    auto op = builder.create<OpTy>(loc, lhs, rhs);
+    auto op = builder.create<OpTy>(loc, lhs, rhs, overflowFlags);
     return op.getResult();
   }
 
@@ -65,7 +67,8 @@ public:
   }
 
   Value visitMulExpr(AffineBinaryOpExpr expr) {
-    return buildBinaryExpr<arith::MulIOp>(expr);
+    return buildBinaryExpr<arith::MulIOp>(expr,
+                                          arith::IntegerOverflowFlags::nsw);
   }
 
   /// Euclidean modulo operation: negative RHS is not allowed.
diff --git a/mlir/lib/Dialect/Arith/IR/ArithOps.cpp b/mlir/lib/Dialect/Arith/IR/ArithOps.cpp
index d8b314a..e016a6e 100644
--- a/mlir/lib/Dialect/Arith/IR/ArithOps.cpp
+++ b/mlir/lib/Dialect/Arith/IR/ArithOps.cpp
@@ -580,11 +580,31 @@ void arith::MulUIExtendedOp::getCanonicalizationPatterns(
 // DivUIOp
 //===----------------------------------------------------------------------===//
 
+/// Fold `(a * b) / b -> a`
+static Value foldDivMul(Value lhs, Value rhs,
+                        arith::IntegerOverflowFlags ovfFlags) {
+  auto mul = lhs.getDefiningOp<mlir::arith::MulIOp>();
+  if (!mul || !bitEnumContainsAll(mul.getOverflowFlags(), ovfFlags))
+    return {};
+
+  if (mul.getLhs() == rhs)
+    return mul.getRhs();
+
+  if (mul.getRhs() == rhs)
+    return mul.getLhs();
+
+  return {};
+}
+
 OpFoldResult arith::DivUIOp::fold(FoldAdaptor adaptor) {
   // divui (x, 1) -> x.
   if (matchPattern(adaptor.getRhs(), m_One()))
     return getLhs();
 
+  // (a * b) / b -> a
+  if (Value val = foldDivMul(getLhs(), getRhs(), IntegerOverflowFlags::nuw))
+    return val;
+
   // Don't fold if it would require a division by zero.
   bool div0 = false;
   auto result = constFoldBinaryOp<IntegerAttr>(adaptor.getOperands(),
@@ -621,6 +641,10 @@ OpFoldResult arith::DivSIOp::fold(FoldAdaptor adaptor) {
   if (matchPattern(adaptor.getRhs(), m_One()))
     return getLhs();
 
+  // (a * b) / b -> a
+  if (Value val = foldDivMul(getLhs(), getRhs(), IntegerOverflowFlags::nsw))
+    return val;
+
   // Don't fold if it would overflow or if it requires a division by zero.
   bool overflowOrDiv0 = false;
   auto result = constFoldBinaryOp<IntegerAttr>(
diff --git a/mlir/lib/Dialect/ArmSME/Transforms/VectorLegalization.cpp b/mlir/lib/Dialect/ArmSME/Transforms/VectorLegalization.cpp
index 61767f3..12c65a7 100644
--- a/mlir/lib/Dialect/ArmSME/Transforms/VectorLegalization.cpp
+++ b/mlir/lib/Dialect/ArmSME/Transforms/VectorLegalization.cpp
@@ -17,7 +17,7 @@
 #include "mlir/Dialect/ArmSME/Transforms/Passes.h"
 #include "mlir/Dialect/ArmSME/Utils/Utils.h"
 #include "mlir/Dialect/Func/IR/FuncOps.h"
-#include "mlir/Dialect/Func/Transforms/OneToNFuncConversions.h"
+#include "mlir/Dialect/Func/Transforms/FuncConversions.h"
 #include "mlir/Dialect/Index/IR/IndexDialect.h"
 #include "mlir/Dialect/Index/IR/IndexOps.h"
 #include "mlir/Dialect/MemRef/IR/MemRef.h"
@@ -25,7 +25,8 @@
 #include "mlir/Dialect/SCF/Transforms/Patterns.h"
 #include "mlir/Dialect/Utils/IndexingUtils.h"
 #include "mlir/Dialect/Vector/Utils/VectorUtils.h"
-#include "mlir/Transforms/OneToNTypeConversion.h"
+#include "mlir/Transforms/DialectConversion.h"
+#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
 
 #define DEBUG_TYPE "arm-sme-vector-legalization"
 
@@ -172,12 +173,12 @@ int getNumberOfSMETilesForVectorType(VectorType type) {
 /// Legalize `arith.constant dense<value>` splat operations to fit within SME
 /// tiles by decomposing them into tile-sized operations.
 struct LegalizeArithConstantOpsByDecomposition
-    : public OneToNOpConversionPattern<arith::ConstantOp> {
-  using OneToNOpConversionPattern::OneToNOpConversionPattern;
+    : public OpConversionPattern<arith::ConstantOp> {
+  using OpConversionPattern::OpConversionPattern;
 
   LogicalResult
   matchAndRewrite(arith::ConstantOp constantOp, OpAdaptor adaptor,
-                  OneToNPatternRewriter &rewriter) const override {
+                  ConversionPatternRewriter &rewriter) const override {
     auto vectorType = dyn_cast<VectorType>(constantOp.getType());
     auto denseAttr = dyn_cast<DenseElementsAttr>(constantOp.getValueAttr());
     if (!vectorType || !denseAttr || !denseAttr.isSplat())
@@ -191,8 +192,8 @@ struct LegalizeArithConstantOpsByDecomposition
     auto tileCount = getNumberOfSMETilesForVectorType(vectorType);
     auto tileSplat = rewriter.create<arith::ConstantOp>(
         constantOp.getLoc(), denseAttr.resizeSplat(smeTileType));
-    rewriter.replaceOp(constantOp, SmallVector<Value>(tileCount, tileSplat),
-                       adaptor.getResultMapping());
+    SmallVector<Value> repl(tileCount, tileSplat);
+    rewriter.replaceOpWithMultiple(constantOp, {repl});
 
     return success();
   }
@@ -201,12 +202,13 @@ struct LegalizeArithConstantOpsByDecomposition
 /// Legalize `vector.outerproduct` operations to fit within SME tiles by
 /// decomposing them into tile-sized operations.
 struct LegalizeVectorOuterProductOpsByDecomposition
-    : public OneToNOpConversionPattern<vector::OuterProductOp> {
-  using OneToNOpConversionPattern::OneToNOpConversionPattern;
+    : public OpConversionPattern<vector::OuterProductOp> {
+  using OpConversionPattern::OpConversionPattern;
 
   LogicalResult
-  matchAndRewrite(vector::OuterProductOp outerProductOp, OpAdaptor adaptor,
-                  OneToNPatternRewriter &rewriter) const override {
+  matchAndRewrite(vector::OuterProductOp outerProductOp,
+                  OneToNOpAdaptor adaptor,
+                  ConversionPatternRewriter &rewriter) const override {
     auto vectorType = outerProductOp.getResultVectorType();
     if (!isMultipleOfSMETileVectorType(vectorType))
       return rewriter.notifyMatchFailure(outerProductOp,
@@ -219,6 +221,7 @@ struct LegalizeVectorOuterProductOpsByDecomposition
       auto maskOp = outerProductOp.getMaskingOp();
       mask = maskOp.getMask();
       rootOp = maskOp;
+      rewriter.setInsertionPoint(rootOp);
     }
 
     if (!isSupportedMaskOp(mask))
@@ -248,7 +251,7 @@ struct LegalizeVectorOuterProductOpsByDecomposition
       resultSMETiles.push_back(maskedOuterProduct->getResult(0));
     }
 
-    rewriter.replaceOp(rootOp, resultSMETiles, adaptor.getResultMapping());
+    rewriter.replaceOpWithMultiple(rootOp, {resultSMETiles});
     return success();
   }
 };
@@ -259,12 +262,12 @@ struct LegalizeVectorOuterProductOpsByDecomposition
 // (invalid). This pattern matches on `vector.mask` then calls into the
 // `vector.outerproduct` pattern to work around this issue.
 struct LegalizeMaskedVectorOuterProductOpsByDecomposition
-    : public OneToNOpConversionPattern<vector::MaskOp> {
-  using OneToNOpConversionPattern::OneToNOpConversionPattern;
+    : public OpConversionPattern<vector::MaskOp> {
+  using OpConversionPattern::OpConversionPattern;
 
   LogicalResult
-  matchAndRewrite(vector::MaskOp maskOp, OpAdaptor adaptor,
-                  OneToNPatternRewriter &rewriter) const override {
+  matchAndRewrite(vector::MaskOp maskOp, OneToNOpAdaptor adaptor,
+                  ConversionPatternRewriter &rewriter) const override {
     if (auto outerProductOp = llvm::dyn_cast_or_null<vector::OuterProductOp>(
             maskOp.getMaskableOp())) {
       LegalizeVectorOuterProductOpsByDecomposition pattern(*getTypeConverter(),
@@ -279,12 +282,12 @@ struct LegalizeMaskedVectorOuterProductOpsByDecomposition
 /// Legalize `vector.transfer_read` operations to fit within SME tiles by
 /// decomposing them into tile-sized operations.
 struct LegalizeTransferReadOpsByDecomposition
-    : public OneToNOpConversionPattern<vector::TransferReadOp> {
-  using OneToNOpConversionPattern::OneToNOpConversionPattern;
+    : public OpConversionPattern<vector::TransferReadOp> {
+  using OpConversionPattern::OpConversionPattern;
 
   LogicalResult
-  matchAndRewrite(vector::TransferReadOp readOp, OpAdaptor adaptor,
-                  OneToNPatternRewriter &rewriter) const override {
+  matchAndRewrite(vector::TransferReadOp readOp, OneToNOpAdaptor adaptor,
+                  ConversionPatternRewriter &rewriter) const override {
     auto vectorType = readOp.getVectorType();
     if (!isMultipleOfSMETileVectorType(vectorType))
       return rewriter.notifyMatchFailure(readOp,
@@ -319,7 +322,7 @@ struct LegalizeTransferReadOpsByDecomposition
       resultSMETiles.push_back(smeRead);
     }
 
-    rewriter.replaceOp(readOp, resultSMETiles, adaptor.getResultMapping());
+    rewriter.replaceOpWithMultiple(readOp, {resultSMETiles});
     return success();
   }
 };
@@ -327,12 +330,12 @@ struct LegalizeTransferReadOpsByDecomposition
 /// Legalize `vector.transfer_write` operations to fit within SME tiles by
 /// decomposing them into tile-sized operations.
 struct LegalizeTransferWriteOpsByDecomposition
-    : public OneToNOpConversionPattern<vector::TransferWriteOp> {
-  using OneToNOpConversionPattern::OneToNOpConversionPattern;
+    : public OpConversionPattern<vector::TransferWriteOp> {
+  using OpConversionPattern::OpConversionPattern;
 
   LogicalResult
-  matchAndRewrite(vector::TransferWriteOp writeOp, OpAdaptor adaptor,
-                  OneToNPatternRewriter &rewriter) const override {
+  matchAndRewrite(vector::TransferWriteOp writeOp, OneToNOpAdaptor adaptor,
+                  ConversionPatternRewriter &rewriter) const override {
     auto vectorType = writeOp.getVectorType();
     if (!isMultipleOfSMETileVectorType(vectorType))
       return rewriter.notifyMatchFailure(writeOp,
@@ -409,12 +412,12 @@ struct LegalizeTransferWriteOpsByDecomposition
 /// }
 /// ```
 struct LegalizeMultiTileTransferWriteAsStoreLoop
-    : public OneToNOpConversionPattern<vector::TransferWriteOp> {
-  using OneToNOpConversionPattern::OneToNOpConversionPattern;
+    : public OpConversionPattern<vector::TransferWriteOp> {
+  using OpConversionPattern::OpConversionPattern;
 
   LogicalResult
-  matchAndRewrite(vector::TransferWriteOp writeOp, OpAdaptor adaptor,
-                  OneToNPatternRewriter &rewriter) const override {
+  matchAndRewrite(vector::TransferWriteOp writeOp, OneToNOpAdaptor adaptor,
+                  ConversionPatternRewriter &rewriter) const override {
     if (writeOp.hasPureTensorSemantics())
       return rewriter.notifyMatchFailure(
           writeOp, "TODO: tensor semantics are unsupported");
@@ -936,10 +939,16 @@ struct VectorLegalizationPass
           return success();
         });
 
-    patterns.add<FoldExtractFromVectorOfSMELikeCreateMasks,
-                 LiftIllegalVectorTransposeToMemory,
-                 ConvertIllegalShapeCastOpsToTransposes,
-                 LowerIllegalTransposeStoreViaZA>(context);
+    // Apply preprocessing patterns.
+    RewritePatternSet rewritePatterns(context);
+    rewritePatterns.add<FoldExtractFromVectorOfSMELikeCreateMasks,
+                        LiftIllegalVectorTransposeToMemory,
+                        ConvertIllegalShapeCastOpsToTransposes,
+                        LowerIllegalTransposeStoreViaZA>(context);
+    if (failed(
+            applyPatternsGreedily(getOperation(), std::move(rewritePatterns))))
+      return signalPassFailure();
+
     // Note: These two patterns are added with a high benefit to ensure:
     //  - Masked outer products are handled before unmasked ones
     //  - Multi-tile writes are lowered as a store loop (if possible)
@@ -950,11 +959,20 @@ struct VectorLegalizationPass
                  LegalizeVectorOuterProductOpsByDecomposition,
                  LegalizeTransferReadOpsByDecomposition,
                  LegalizeTransferWriteOpsByDecomposition>(converter, context);
-    populateFuncTypeConversionPatterns(converter, patterns);
-    scf::populateSCFStructuralOneToNTypeConversions(converter, patterns);
-
-    if (failed(applyPartialOneToNConversion(getOperation(), converter,
-                                            std::move(patterns))))
+    populateFunctionOpInterfaceTypeConversionPattern<func::FuncOp>(patterns,
+                                                                   converter);
+    populateCallOpTypeConversionPattern(patterns, converter);
+    populateReturnOpTypeConversionPattern(patterns, converter);
+    scf::populateSCFStructuralTypeConversions(converter, patterns);
+
+    ConversionTarget target(getContext());
+    target.markUnknownOpDynamicallyLegal(
+        [&](Operation *op) { return converter.isLegal(op); });
+    target.addDynamicallyLegalOp<func::FuncOp>([&](func::FuncOp op) {
+      return converter.isSignatureLegal(op.getFunctionType());
+    });
+    if (failed(applyPartialConversion(getOperation(), target,
+                                      std::move(patterns))))
       return signalPassFailure();
   }
 };
diff --git a/mlir/lib/Dialect/ArmSVE/Transforms/LegalizeForLLVMExport.cpp b/mlir/lib/Dialect/ArmSVE/Transforms/LegalizeForLLVMExport.cpp
index 845a32c..2bdb640 100644
--- a/mlir/lib/Dialect/ArmSVE/Transforms/LegalizeForLLVMExport.cpp
+++ b/mlir/lib/Dialect/ArmSVE/Transforms/LegalizeForLLVMExport.cpp
@@ -20,22 +20,6 @@
 using namespace mlir;
 using namespace mlir::arm_sve;
 
-template <typename OpTy>
-class ForwardOperands : public OpConversionPattern<OpTy> {
-  using OpConversionPattern<OpTy>::OpConversionPattern;
-
-  LogicalResult
-  matchAndRewrite(OpTy op, typename OpTy::Adaptor adaptor,
-                  ConversionPatternRewriter &rewriter) const final {
-    if (adaptor.getOperands().getTypes() == op->getOperands().getTypes())
-      return rewriter.notifyMatchFailure(op, "operand types already match");
-
-    rewriter.modifyOpInPlace(op,
-                             [&]() { op->setOperands(adaptor.getOperands()); });
-    return success();
-  }
-};
-
 using SdotOpLowering = OneToOneConvertToLLVMPattern<SdotOp, SdotIntrOp>;
 using SmmlaOpLowering = OneToOneConvertToLLVMPattern<SmmlaOp, SmmlaIntrOp>;
 using UdotOpLowering = OneToOneConvertToLLVMPattern<UdotOp, UdotIntrOp>;
@@ -204,10 +188,6 @@ void mlir::populateArmSVELegalizeForLLVMExportPatterns(
   // Populate conversion patterns
 
   // clang-format off
-  patterns.add<ForwardOperands<func::CallOp>,
-               ForwardOperands<func::CallIndirectOp>,
-               ForwardOperands<func::ReturnOp>>(converter,
-                                          &converter.getContext());
   patterns.add<SdotOpLowering,
                SmmlaOpLowering,
                UdotOpLowering,
diff --git a/mlir/lib/Dialect/Bufferization/IR/BufferizableOpInterface.cpp b/mlir/lib/Dialect/Bufferization/IR/BufferizableOpInterface.cpp
index 349841f..1eb27e4 100644
--- a/mlir/lib/Dialect/Bufferization/IR/BufferizableOpInterface.cpp
+++ b/mlir/lib/Dialect/Bufferization/IR/BufferizableOpInterface.cpp
@@ -480,18 +480,21 @@ bool AnalysisState::isValueRead(Value value) const {
   return false;
 }
 
-// Starting from `value`, follow the use-def chain in reverse, always selecting
-// the aliasing OpOperands. Find and return Values for which `condition`
-// evaluates to true. OpOperands of such matching Values are not traversed any
-// further, the visited aliasing opOperands will be preserved through
-// `visitedOpOperands`.
+// Starting from `opOperand`, follow the use-def chain in reverse, always
+// selecting the aliasing OpOperands. Find and return Values for which
+// `condition` evaluates to true. Uses of such matching Values are not
+// traversed any further, the visited aliasing opOperands will be preserved
+// through `visitedOpOperands`.
 llvm::SetVector<Value> AnalysisState::findValueInReverseUseDefChain(
-    Value value, llvm::function_ref<bool(Value)> condition,
+    OpOperand *opOperand, llvm::function_ref<bool(Value)> condition,
     TraversalConfig config,
     llvm::DenseSet<OpOperand *> *visitedOpOperands) const {
   llvm::DenseSet<Value> visited;
   llvm::SetVector<Value> result, workingSet;
-  workingSet.insert(value);
+  workingSet.insert(opOperand->get());
+
+  if (visitedOpOperands)
+    visitedOpOperands->insert(opOperand);
 
   while (!workingSet.empty()) {
     Value value = workingSet.pop_back_val();
@@ -563,12 +566,14 @@ llvm::SetVector<Value> AnalysisState::findValueInReverseUseDefChain(
   return result;
 }
 
-// Find the values that define the contents of the given value.
-llvm::SetVector<Value> AnalysisState::findDefinitions(Value value) const {
+// Find the values that define the contents of the given operand's value.
+llvm::SetVector<Value>
+AnalysisState::findDefinitions(OpOperand *opOperand) const {
   TraversalConfig config;
   config.alwaysIncludeLeaves = false;
   return findValueInReverseUseDefChain(
-      value, [&](Value v) { return this->bufferizesToMemoryWrite(v); }, config);
+      opOperand, [&](Value v) { return this->bufferizesToMemoryWrite(v); },
+      config);
 }
 
 AnalysisState::AnalysisState(const BufferizationOptions &options)
@@ -892,7 +897,7 @@ bool bufferization::detail::defaultResultBufferizesToMemoryWrite(
   config.alwaysIncludeLeaves = false;
   for (AliasingOpOperand alias : opOperands) {
     if (!state
-             .findValueInReverseUseDefChain(alias.opOperand->get(),
+             .findValueInReverseUseDefChain(alias.opOperand,
                                             isMemoryWriteInsideOp, config)
              .empty())
       return true;
diff --git a/mlir/lib/Dialect/Bufferization/Transforms/BufferResultsToOutParams.cpp b/mlir/lib/Dialect/Bufferization/Transforms/BufferResultsToOutParams.cpp
index b7755b2..2502744 100644
--- a/mlir/lib/Dialect/Bufferization/Transforms/BufferResultsToOutParams.cpp
+++ b/mlir/lib/Dialect/Bufferization/Transforms/BufferResultsToOutParams.cpp
@@ -6,6 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "mlir/Dialect/Bufferization/IR/AllocationOpInterface.h"
 #include "mlir/Dialect/Bufferization/Transforms/Passes.h"
 
 #include "mlir/Dialect/Func/IR/FuncOps.h"
@@ -21,6 +22,7 @@ namespace bufferization {
 } // namespace mlir
 
 using namespace mlir;
+using AllocationFn = bufferization::BufferResultsToOutParamsOpts::AllocationFn;
 using MemCpyFn = bufferization::BufferResultsToOutParamsOpts::MemCpyFn;
 
 /// Return `true` if the given MemRef type has a fully dynamic layout.
@@ -105,10 +107,9 @@ updateFuncOp(func::FuncOp func,
 // Updates all ReturnOps in the scope of the given func::FuncOp by either
 // keeping them as return values or copying the associated buffer contents into
 // the given out-params.
-static LogicalResult updateReturnOps(func::FuncOp func,
-                                     ArrayRef<BlockArgument> appendedEntryArgs,
-                                     MemCpyFn memCpyFn,
-                                     bool hoistStaticAllocs) {
+static LogicalResult
+updateReturnOps(func::FuncOp func, ArrayRef<BlockArgument> appendedEntryArgs,
+                const bufferization::BufferResultsToOutParamsOpts &options) {
   auto res = func.walk([&](func::ReturnOp op) {
     SmallVector<Value, 6> copyIntoOutParams;
     SmallVector<Value, 6> keepAsReturnOperands;
@@ -120,13 +121,14 @@ static LogicalResult updateReturnOps(func::FuncOp func,
     }
     OpBuilder builder(op);
     for (auto [orig, arg] : llvm::zip(copyIntoOutParams, appendedEntryArgs)) {
-      if (hoistStaticAllocs &&
-          isa_and_nonnull<memref::AllocOp>(orig.getDefiningOp()) &&
+      if (options.hoistStaticAllocs &&
+          isa_and_nonnull<bufferization::AllocationOpInterface>(
+              orig.getDefiningOp()) &&
           mlir::cast<MemRefType>(orig.getType()).hasStaticShape()) {
         orig.replaceAllUsesWith(arg);
         orig.getDefiningOp()->erase();
       } else {
-        if (failed(memCpyFn(builder, op.getLoc(), orig, arg)))
+        if (failed(options.memCpyFn(builder, op.getLoc(), orig, arg)))
           return WalkResult::interrupt();
       }
     }
@@ -175,7 +177,14 @@ updateCalls(ModuleOp module,
       auto allocType =
           MemRefType::get(memrefType.getShape(), memrefType.getElementType(),
                           AffineMap(), memrefType.getMemorySpace());
-      Value outParam = builder.create<memref::AllocOp>(op.getLoc(), allocType);
+      auto maybeOutParam =
+          options.allocationFn(builder, op.getLoc(), allocType);
+      if (failed(maybeOutParam)) {
+        op.emitError() << "failed to create allocation op";
+        didFail = true;
+        return;
+      }
+      Value outParam = maybeOutParam.value();
       if (!hasStaticIdentityLayout(memrefType)) {
         // Layout maps are already checked in `updateFuncOp`.
         assert(hasFullyDynamicLayoutMap(memrefType) &&
@@ -213,14 +222,7 @@ LogicalResult mlir::bufferization::promoteBufferResultsToOutParams(
       return failure();
     if (func.isExternal())
       continue;
-    auto defaultMemCpyFn = [](OpBuilder &builder, Location loc, Value from,
-                              Value to) {
-      builder.create<memref::CopyOp>(loc, from, to);
-      return success();
-    };
-    if (failed(updateReturnOps(func, appendedEntryArgs,
-                               options.memCpyFn.value_or(defaultMemCpyFn),
-                               options.hoistStaticAllocs))) {
+    if (failed(updateReturnOps(func, appendedEntryArgs, options))) {
       return failure();
     }
   }
diff --git a/mlir/lib/Dialect/Bufferization/Transforms/EmptyTensorElimination.cpp b/mlir/lib/Dialect/Bufferization/Transforms/EmptyTensorElimination.cpp
index abc0635..2c4e362 100644
--- a/mlir/lib/Dialect/Bufferization/Transforms/EmptyTensorElimination.cpp
+++ b/mlir/lib/Dialect/Bufferization/Transforms/EmptyTensorElimination.cpp
@@ -93,8 +93,31 @@ findValidInsertionPoint(Operation *emptyTensorOp, Operation *user,
   return nullptr;
 }
 
+Value mlir::bufferization::buildSubsetExtraction(RewriterBase &rewriter,
+                                                 SubsetInsertionOpInterface op,
+                                                 tensor::EmptyOp emptyTensorOp,
+                                                 Operation *user) {
+
+  mlir::OpBuilder::InsertionGuard guard(rewriter);
+  // All values that are needed to create the replacement op.
+  SmallVector<Value> neededValues = op.getValuesNeededToBuildSubsetExtraction();
+  // Find a suitable insertion point. If no suitable insertion point
+  // for the replacement can be found, return an empty value to skip
+  // this replacement.
+  Operation *insertionPoint =
+      findValidInsertionPoint(emptyTensorOp, user, neededValues);
+  if (!insertionPoint)
+    return {};
+
+  rewriter.setInsertionPoint(insertionPoint);
+  Value replacement =
+      op.buildSubsetExtraction(rewriter, emptyTensorOp->getLoc());
+  return replacement;
+}
+
 LogicalResult mlir::bufferization::eliminateEmptyTensors(
-    RewriterBase &rewriter, Operation *op, OneShotAnalysisState &state) {
+    RewriterBase &rewriter, Operation *op, OneShotAnalysisState &state,
+    ControlBuildSubsetExtractionFn subsetsExtractionFn) {
   OpBuilder::InsertionGuard g(rewriter);
   llvm::DenseSet<OpOperand *> visitedOpOperands;
   op->walk([&](SubsetInsertionOpInterface op) {
@@ -105,10 +128,6 @@ LogicalResult mlir::bufferization::eliminateEmptyTensors(
     if (!state.isInPlace(source))
       return WalkResult::skip();
 
-    // All values that are needed to create the replacement op.
-    SmallVector<Value> neededValues =
-        op.getValuesNeededToBuildSubsetExtraction();
-
     // Find tensor.empty ops on the reverse SSA use-def chain. Only follow
     // equivalent tensors. I.e., stop when there are ops such as extract_slice
     // on the path.
@@ -124,35 +143,23 @@ LogicalResult mlir::bufferization::eliminateEmptyTensors(
     // %3 = tensor.insert_slice %2 into ...
     config.followSameTypeOrCastsOnly = true;
     SetVector<Value> emptyTensors = state.findValueInReverseUseDefChain(
-        source.get(), /*condition=*/
+        &source, /*condition=*/
         [&](Value val) { return val.getDefiningOp<tensor::EmptyOp>(); }, config,
         &visitedOpOperands);
 
     for (Value v : emptyTensors) {
-      Operation *emptyTensorOp = v.getDefiningOp();
-
+      auto emptyTensorOp = v.getDefiningOp<tensor::EmptyOp>();
+      assert(emptyTensorOp && "expected tensor.empty op");
       // Find the use to be replaced from the use-def chain.
       auto iter = llvm::find_if(
           visitedOpOperands, [&emptyTensorOp](OpOperand *opOperand) {
             return llvm::count(emptyTensorOp->getUses(), *opOperand);
           });
-      // This could be achieved when a use of `emptyTensorOp` is being
-      // consumed by `SubsetInsertionOpInterface`'s source directly.
-      if (iter == visitedOpOperands.end())
-        continue;
+
+      assert(iter != visitedOpOperands.end() && "could not find use");
       OpOperand *useToBeReplaced = *iter;
       Operation *user = useToBeReplaced->getOwner();
-
-      // Find a suitable insertion point. If no suitable insertion point for
-      // the replacement can be found, skip this replacement.
-      Operation *insertionPoint =
-          findValidInsertionPoint(emptyTensorOp, user, neededValues);
-      if (!insertionPoint)
-        continue;
-
-      rewriter.setInsertionPoint(insertionPoint);
-      Value replacement =
-          op.buildSubsetExtraction(rewriter, emptyTensorOp->getLoc());
+      auto replacement = subsetsExtractionFn(rewriter, op, emptyTensorOp, user);
       if (!replacement)
         continue;
       if (emptyTensorOp == replacement.getDefiningOp())
diff --git a/mlir/lib/Dialect/Bufferization/Transforms/OneShotAnalysis.cpp b/mlir/lib/Dialect/Bufferization/Transforms/OneShotAnalysis.cpp
index d1e6ace..fc1b221 100644
--- a/mlir/lib/Dialect/Bufferization/Transforms/OneShotAnalysis.cpp
+++ b/mlir/lib/Dialect/Bufferization/Transforms/OneShotAnalysis.cpp
@@ -196,7 +196,12 @@ void OneShotAnalysisState::gatherUndefinedTensorUses(Operation *op) {
 
       // If there is no preceding definition, the tensor contents are
       // undefined.
-      if (findDefinitionsCached(opResult).empty())
+      if (opResult.getUses().empty())
+        continue;
+      // It does not really matter which use to take to search about
+      // the value's definitions.
+      OpOperand *opOperand = &(*opResult.getUses().begin());
+      if (findDefinitionsCached(opOperand).empty())
         for (OpOperand &use : opResult.getUses())
           undefinedTensorUses.insert(&use);
     }
@@ -464,7 +469,8 @@ static void annotateConflict(OpOperand *uRead, OpOperand *uConflictingWrite,
 /// indexing. I.e., the tensor types do not change along the use-def chain,
 /// apart from static <-> dynamic dim casts.
 static bool hasEquivalentValueInReverseUseDefChain(AnalysisState &state,
-                                                   Value start, Value other) {
+                                                   OpOperand *start,
+                                                   Value other) {
   TraversalConfig config;
   config.followEquivalentOnly = true;
   config.alwaysIncludeLeaves = false;
@@ -475,9 +481,10 @@ static bool hasEquivalentValueInReverseUseDefChain(AnalysisState &state,
               .empty();
 }
 
-/// Return "true" if `value` is originating from a subset that is equivalent to
-/// the subset that `subsetOp` inserts into.
-static bool matchesInsertDestination(const AnalysisState &state, Value value,
+/// Return "true" if the given operand's value is originating from a subset
+/// that is equivalent to the subset that `subsetOp` inserts into.
+static bool matchesInsertDestination(const AnalysisState &state,
+                                     OpOperand *opOperand,
                                      SubsetInsertionOpInterface subsetOp) {
   auto matchingSubset = [&](Value val) {
     if (auto opResult = dyn_cast<OpResult>(val))
@@ -490,7 +497,7 @@ static bool matchesInsertDestination(const AnalysisState &state, Value value,
   // There may be multiple leaves at which the reverse SSA use-def chain lookup
   // terminates. All of them must be equivalent subsets.
   SetVector<Value> backwardSlice =
-      state.findValueInReverseUseDefChain(value, matchingSubset);
+      state.findValueInReverseUseDefChain(opOperand, matchingSubset);
   return static_cast<bool>(llvm::all_of(backwardSlice, matchingSubset));
 }
 
@@ -516,7 +523,7 @@ static bool areNonConflictingSubsets(OpOperand *uRead,
     //     {inplace= [true] }
 
     if (uRead == &subsetOp.getDestinationOperand() &&
-        matchesInsertDestination(state, uConflictingWrite->get(), subsetOp))
+        matchesInsertDestination(state, uConflictingWrite, subsetOp))
       // Case 1: The main insight is that InsertSliceOp reads only part of
       // the destination tensor. The overwritten area is not read. If
       // uConflictingWrite writes into exactly the memory location that is
@@ -533,7 +540,7 @@ static bool areNonConflictingSubsets(OpOperand *uRead,
 
     if (uRead == &subsetOp.getSourceOperand() &&
         uConflictingWrite == &subsetOp.getDestinationOperand() &&
-        matchesInsertDestination(state, uRead->get(), subsetOp))
+        matchesInsertDestination(state, uRead, subsetOp))
       // Case 2: The read of the source tensor and the write to the dest
       // tensor via an InsertSliceOp is not a conflict if the read is
       // reading exactly that part of an equivalent tensor that the
@@ -567,8 +574,7 @@ static bool areNonConflictingSubsets(OpOperand *uRead,
     if (uConflictingWrite == &subsetOp.getDestinationOperand() &&
         state.areEquivalentBufferizedValues(
             uRead->get(), subsetOp.getSourceOperand().get()) &&
-        matchesInsertDestination(state, subsetOp.getSourceOperand().get(),
-                                 subsetOp))
+        matchesInsertDestination(state, &subsetOp.getSourceOperand(), subsetOp))
       return true;
 
   return false;
@@ -600,9 +606,9 @@ hasReadAfterWriteInterference(const DenseSet<OpOperand *> &usesRead,
       // even though that op just bufferizes to an allocation but does define
       // the contents of the buffer.
       SetVector<Value> definitionsOrLeaves =
-          state.findValueInReverseUseDefChain(
-              uConflictingWrite->get(),
-              [&](Value v) { return state.bufferizesToMemoryWrite(v); });
+          state.findValueInReverseUseDefChain(uConflictingWrite, [&](Value v) {
+            return state.bufferizesToMemoryWrite(v);
+          });
       assert(!definitionsOrLeaves.empty() &&
              "expected at least one definition or leaf");
 
@@ -641,8 +647,7 @@ hasReadAfterWriteInterference(const DenseSet<OpOperand *> &usesRead,
     // In the above example, if uRead is the OpOperand of reading_op, the
     // definition is %0. Note that operations that create an alias but do not
     // bufferize to a memory write (such as ExtractSliceOp) are skipped.
-    const SetVector<Value> &definitions =
-        state.findDefinitionsCached(uRead->get());
+    const SetVector<Value> &definitions = state.findDefinitionsCached(uRead);
     if (definitions.empty()) {
       // Fast path: No conflict if there are no definitions.
       LLVM_DEBUG(llvm::dbgs()
@@ -714,9 +719,9 @@ hasReadAfterWriteInterference(const DenseSet<OpOperand *> &usesRead,
             if (bufferizableOp.bufferizesToElementwiseAccess(
                     state, {uRead, uConflictingWrite})) {
               if (hasEquivalentValueInReverseUseDefChain(
-                      state, uRead->get(), uConflictingWrite->get()) ||
+                      state, uRead, uConflictingWrite->get()) ||
                   hasEquivalentValueInReverseUseDefChain(
-                      state, uConflictingWrite->get(), uRead->get())) {
+                      state, uConflictingWrite, uRead->get())) {
                 LLVM_DEBUG(
                     llvm::dbgs()
                     << "  no conflict: op bufferizes to element-wise access\n");
@@ -965,11 +970,12 @@ wouldCreateWriteToNonWritableBuffer(OpOperand &operand,
 // Bufferization analyses.
 //===----------------------------------------------------------------------===//
 
-// Find the values that define the contents of the given value.
+// Find the values that define the contents of the given operand's value.
 const llvm::SetVector<Value> &
-OneShotAnalysisState::findDefinitionsCached(Value value) {
+OneShotAnalysisState::findDefinitionsCached(OpOperand *opOperand) {
+  Value value = opOperand->get();
   if (!cachedDefinitions.count(value))
-    cachedDefinitions[value] = findDefinitions(value);
+    cachedDefinitions[value] = findDefinitions(opOperand);
   return cachedDefinitions[value];
 }
 
diff --git a/mlir/lib/Dialect/EmitC/Transforms/TypeConversions.cpp b/mlir/lib/Dialect/EmitC/Transforms/TypeConversions.cpp
index 0b3a494..72c8fd0 100644
--- a/mlir/lib/Dialect/EmitC/Transforms/TypeConversions.cpp
+++ b/mlir/lib/Dialect/EmitC/Transforms/TypeConversions.cpp
@@ -33,7 +33,6 @@ void mlir::populateEmitCSizeTTypeConversions(TypeConverter &converter) {
 
   converter.addSourceMaterialization(materializeAsUnrealizedCast);
   converter.addTargetMaterialization(materializeAsUnrealizedCast);
-  converter.addArgumentMaterialization(materializeAsUnrealizedCast);
 }
 
 /// Get an unsigned integer or size data type corresponding to \p ty.
diff --git a/mlir/lib/Dialect/LLVMIR/IR/LLVMAttrs.cpp b/mlir/lib/Dialect/LLVMIR/IR/LLVMAttrs.cpp
index c7ddc1b..ff1636bc 100644
--- a/mlir/lib/Dialect/LLVMIR/IR/LLVMAttrs.cpp
+++ b/mlir/lib/Dialect/LLVMIR/IR/LLVMAttrs.cpp
@@ -48,10 +48,28 @@ void LLVMDialect::registerAttributes() {
   addAttributes<
 #define GET_ATTRDEF_LIST
 #include "mlir/Dialect/LLVMIR/LLVMOpsAttrDefs.cpp.inc"
+
       >();
 }
 
 //===----------------------------------------------------------------------===//
+// AliasScopeAttr
+//===----------------------------------------------------------------------===//
+
+LogicalResult
+AliasScopeAttr::verify(function_ref<InFlightDiagnostic()> emitError,
+                       Attribute id, AliasScopeDomainAttr domain,
+                       StringAttr description) {
+  (void)domain;
+  (void)description;
+  if (!llvm::isa<StringAttr, DistinctAttr>(id))
+    return emitError()
+           << "id of an alias scope must be a StringAttr or a DistrinctAttr";
+
+  return success();
+}
+
+//===----------------------------------------------------------------------===//
 // DINodeAttr
 //===----------------------------------------------------------------------===//
 
@@ -232,7 +250,7 @@ DIRecursiveTypeAttrInterface DISubprogramAttr::withRecId(DistinctAttr recId) {
 
 DIRecursiveTypeAttrInterface DISubprogramAttr::getRecSelf(DistinctAttr recId) {
   return DISubprogramAttr::get(recId.getContext(), recId, /*isRecSelf=*/true,
-                               {}, {}, {}, {}, {}, 0, 0, {}, {}, {}, {}, {});
+                               {}, {}, {}, {}, {}, {}, 0, 0, {}, {}, {}, {});
 }
 
 //===----------------------------------------------------------------------===//
@@ -288,6 +306,16 @@ TargetFeaturesAttr TargetFeaturesAttr::get(MLIRContext *context,
                    }));
 }
 
+TargetFeaturesAttr
+TargetFeaturesAttr::getChecked(function_ref<InFlightDiagnostic()> emitError,
+                               MLIRContext *context,
+                               llvm::ArrayRef<StringRef> features) {
+  return Base::getChecked(emitError, context,
+                          llvm::map_to_vector(features, [&](StringRef feature) {
+                            return StringAttr::get(context, feature);
+                          }));
+}
+
 TargetFeaturesAttr TargetFeaturesAttr::get(MLIRContext *context,
                                            StringRef targetFeatures) {
   SmallVector<StringRef> features;
@@ -296,6 +324,16 @@ TargetFeaturesAttr TargetFeaturesAttr::get(MLIRContext *context,
   return get(context, features);
 }
 
+TargetFeaturesAttr
+TargetFeaturesAttr::getChecked(function_ref<InFlightDiagnostic()> emitError,
+                               MLIRContext *context, StringRef targetFeatures) {
+  SmallVector<StringRef> features;
+  targetFeatures.split(features, ',', /*MaxSplit=*/-1,
+                       /*KeepEmpty=*/false);
+  ArrayRef featuresRef(features);
+  return getChecked(emitError, context, featuresRef);
+}
+
 LogicalResult
 TargetFeaturesAttr::verify(function_ref<InFlightDiagnostic()> emitError,
                            llvm::ArrayRef<StringAttr> features) {
diff --git a/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp b/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp
index 221ca27..a1d619c 100644
--- a/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp
+++ b/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp
@@ -2626,12 +2626,19 @@ void transform::TileReductionUsingForOp::build(
 }
 
 DiagnosedSilenceableFailure transform::TileReductionUsingForOp::applyToOne(
-    transform::TransformRewriter &rewriter, LinalgOp target,
+    transform::TransformRewriter &rewriter, Operation *target,
     transform::ApplyToEachResultList &results,
     transform::TransformState &state) {
   rewriter.setInsertionPoint(target);
+
+  auto partialReductionOp = dyn_cast<PartialReductionOpInterface>(target);
+  if (!partialReductionOp) {
+    return emitSilenceableFailure(
+        target->getLoc(),
+        "Operation should implement PartialReductionOpInterface");
+  }
   FailureOr<scf::SCFTilingResult> result = scf::tileReductionUsingScf(
-      rewriter, cast<PartialReductionOpInterface>(target.getOperation()),
+      rewriter, partialReductionOp,
       getAsOpFoldResult(rewriter.getI64ArrayAttr(getTileSizes())));
 
   if (failed(result))
diff --git a/mlir/lib/Dialect/Linalg/Transforms/ConvertToDestinationStyle.cpp b/mlir/lib/Dialect/Linalg/Transforms/ConvertToDestinationStyle.cpp
index 6801b68a..6c10877 100644
--- a/mlir/lib/Dialect/Linalg/Transforms/ConvertToDestinationStyle.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/ConvertToDestinationStyle.cpp
@@ -553,7 +553,7 @@ Value linalg::bufferizeToAllocation(
     Value alloc = createAllocationForTensor(
         rewriter, op->getLoc(), operand->get(), options, memorySpace);
     allocs.push_back(alloc);
-    if (!state.findDefinitions(operand->get()).empty()) {
+    if (!state.findDefinitions(operand).empty()) {
       // Initialize buffer with a copy of the operand data. Not needed if the
       // tensor is uninitialized.
       createMemcpy(rewriter, op->getLoc(), operand->get(), alloc, options);
diff --git a/mlir/lib/Dialect/Linalg/Transforms/Detensorize.cpp b/mlir/lib/Dialect/Linalg/Transforms/Detensorize.cpp
index 0e651f4..fc6671e 100644
--- a/mlir/lib/Dialect/Linalg/Transforms/Detensorize.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/Detensorize.cpp
@@ -154,7 +154,6 @@ public:
     });
 
     addSourceMaterialization(sourceMaterializationCallback);
-    addArgumentMaterialization(sourceMaterializationCallback);
   }
 };
 
diff --git a/mlir/lib/Dialect/Linalg/Transforms/EliminateEmptyTensors.cpp b/mlir/lib/Dialect/Linalg/Transforms/EliminateEmptyTensors.cpp
index 4776883..b710bde 100644
--- a/mlir/lib/Dialect/Linalg/Transforms/EliminateEmptyTensors.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/EliminateEmptyTensors.cpp
@@ -59,7 +59,7 @@ LogicalResult linalg::linalgOpAnchoredEmptyTensorEliminationStep(
       config.followEquivalentOnly = true;
       config.alwaysIncludeLeaves = false;
       SetVector<Value> emptyTensors = state.findValueInReverseUseDefChain(
-          in->get(), /*condition=*/
+          in, /*condition=*/
           [&](Value val) {
             return val.getDefiningOp<tensor::EmptyOp>() &&
                    val.getType() == in->get().getType();
diff --git a/mlir/lib/Dialect/Linalg/Transforms/MeshShardingInterfaceImpl.cpp b/mlir/lib/Dialect/Linalg/Transforms/MeshShardingInterfaceImpl.cpp
index 5bf2f91..92cfba2 100644
--- a/mlir/lib/Dialect/Linalg/Transforms/MeshShardingInterfaceImpl.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/MeshShardingInterfaceImpl.cpp
@@ -105,13 +105,13 @@ static ReductionKind getReductionKindOfLinalgOp(LinalgOp op) {
 static MeshOp getMesh(Operation *op, ArrayRef<MeshSharding> operandShardings,
                       ArrayRef<MeshSharding> resultShardings,
                       SymbolTableCollection &symbolTable) {
-  for (const MeshSharding& sharding : operandShardings) {
+  for (const MeshSharding &sharding : operandShardings) {
     if (sharding) {
       return mesh::getMesh(op, sharding.getMeshAttr(), symbolTable);
     }
   }
 
-  for (const MeshSharding& sharding : resultShardings) {
+  for (const MeshSharding &sharding : resultShardings) {
     if (sharding) {
       return mesh::getMesh(op, sharding.getMeshAttr(), symbolTable);
     }
@@ -129,8 +129,9 @@ static MeshOp getMesh(Operation *op, ArrayRef<MeshSharding> operandShardings,
 // the original operand.
 // The other processes would use the reduction operation neutral tensor.
 static Value createDestinationPassingStyleInitOperand(
-    LinalgOp op, Value spmdizedOperand, ArrayRef<MeshAxis> reductionMeshAxes,
-    MeshOp meshOp, ImplicitLocOpBuilder &builder) {
+    LinalgOp op, int operandNumber, Value spmdizedOperand,
+    ArrayRef<MeshAxis> reductionMeshAxes, MeshOp meshOp,
+    ImplicitLocOpBuilder &builder) {
   Value processLinearIndexInReductionGroup = mesh::createProcessLinearIndex(
       meshOp.getSymName(), reductionMeshAxes, builder);
   Value zero = builder.create<arith::ConstantIndexOp>(0);
@@ -152,14 +153,21 @@ static Value createDestinationPassingStyleInitOperand(
     builder.setInsertionPointToEnd(&ifOp.getElseRegion().front());
     SmallVector<OpFoldResult> shape =
         tensor::getMixedSizes(builder, builder.getLoc(), spmdizedOperand);
-    PartialReductionOpInterface partialReductionIface =
-        llvm::cast<PartialReductionOpInterface>(op.getOperation());
-    assert(op->getNumResults() == 1 && "Multiple results not supported.");
-    FailureOr<SmallVector<Value>> reductionNeutralTensor =
-        partialReductionIface.generateInitialTensorForPartialReduction(
-            builder, builder.getLoc(), shape, {});
-    assert(succeeded(reductionNeutralTensor));
-    builder.create<scf::YieldOp>(reductionNeutralTensor.value());
+
+    SmallVector<Operation *> combinerOps;
+    matchReduction(op.getRegionOutputArgs(), operandNumber, combinerOps);
+    assert(combinerOps.size() == 1);
+    std::optional<TypedAttr> neutralEl =
+        arith::getNeutralElement(combinerOps[0]);
+
+    Value init = builder.create<tensor::EmptyOp>(op.getLoc(), shape,
+                                                 neutralEl.value().getType());
+    Value constant =
+        builder.create<arith::ConstantOp>(op.getLoc(), neutralEl.value());
+    Value fill = builder.create<linalg::FillOp>(op.getLoc(), constant, init)
+                     .getResult(0);
+
+    builder.create<scf::YieldOp>(fill);
   }
   return ifOp.getResult(0);
 }
@@ -178,7 +186,7 @@ static SmallVector<Value> createDestinationPassingStyleInitOperands(
   Value spmdizedInitOperand =
       spmdizationMap.lookup(op->getOperands()[operandIdx]);
   newOperands[operandIdx] = createDestinationPassingStyleInitOperand(
-      op, spmdizedInitOperand, reductionMeshAxes, meshOp, builder);
+      op, 0, spmdizedInitOperand, reductionMeshAxes, meshOp, builder);
   return newOperands;
 }
 
diff --git a/mlir/lib/Dialect/Linalg/Transforms/TilingInterfaceImpl.cpp b/mlir/lib/Dialect/Linalg/Transforms/TilingInterfaceImpl.cpp
index f86715a..b7764da 100644
--- a/mlir/lib/Dialect/Linalg/Transforms/TilingInterfaceImpl.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/TilingInterfaceImpl.cpp
@@ -324,7 +324,27 @@ struct LinalgOpTilingInterface
 // External Model for implementing `PartialReductionInterface` for `LinalgOp`s.
 //===----------------------------------------------------------------------===//
 
-/// External model implementation of PartialReductionInterface for LinalgOps.
+/// Return an AffineMap for a partial result for the given result number,
+/// assuming the partial tiling strategy is outer-reduction loop +
+/// inner-parallel tile. The returned AffineMap can be used as the replacement
+/// AffineMap for the inner-parallel tile linalg op for the given result number.
+///
+/// The new AffineMap is the old AffineMap with reduction dimensions appended
+/// at end.
+static AffineMap getPartialResultAffineMap(LinalgOp linalgOp,
+                                           ArrayRef<int> reductionDims,
+                                           unsigned resultNumber) {
+  AffineMap map =
+      linalgOp.getMatchingIndexingMap(linalgOp.getDpsInitOperand(resultNumber));
+  for (int redPos : reductionDims) {
+    map = map.insertResult(getAffineDimExpr(redPos, linalgOp.getContext()),
+                           map.getNumResults());
+  }
+  return map;
+}
+
+/// External model implementation of PartialReductionInterface for
+/// LinalgOps.
 template <typename LinalgOpTy>
 struct LinalgOpPartialReductionInterface
     : public PartialReductionOpInterface::ExternalModel<
@@ -338,11 +358,24 @@ struct LinalgOpPartialReductionInterface
     if (linalgOp.hasPureBufferSemantics())
       return op->emitOpError("expected operation to have tensor semantics");
 
+    // LinalgOp implements TilingInterface.
+    auto tilingInterfaceOp = cast<TilingInterface>(linalgOp.getOperation());
+    SmallVector<OpFoldResult> shape =
+        llvm::map_to_vector(tilingInterfaceOp.getIterationDomain(b),
+                            [](Range x) { return x.size; });
+
+    SmallVector<OpFoldResult> tiledShape;
+    for (auto [tileSize, dimSize] : llvm::zip_equal(sizes, shape)) {
+      if (isZeroIndex(tileSize)) {
+        tiledShape.push_back(dimSize);
+      } else {
+        tiledShape.push_back(tileSize);
+      }
+    }
+
     SmallVector<Value> inits;
     for (int initIdx = 0, e = linalgOp.getNumDpsInits(); initIdx < e;
          ++initIdx) {
-      // Insert the new parallel dimension based on the index of the reduction
-      // loops. This could be controlled by user for more flexibility.
       SmallVector<Operation *, 4> combinerOps;
       if (!matchReduction(linalgOp.getRegionOutputArgs(), initIdx,
                           combinerOps) ||
@@ -355,33 +388,19 @@ struct LinalgOpPartialReductionInterface
         return op->emitOpError(
             "Failed to get an identity value for the reduction operation.");
 
-      ArrayRef<int64_t> oldShape =
-          linalgOp.getShape(linalgOp.getDpsInitOperand(initIdx));
-
-      // Calculate the new shape, we insert the new dimensions based on the
-      // index of the reduction dimensions.
-      SmallVector<int64_t> newOutputShape;
-      SmallVector<Value> dynamicDims;
-      int64_t currReductionDims = 0;
-      DenseSet<int> reductionDimsSet(reductionDims.begin(),
-                                     reductionDims.end());
-      for (int64_t idx :
-           llvm::seq<int64_t>(0, oldShape.size() + reductionDims.size())) {
-        if (reductionDimsSet.contains(idx)) {
-          dispatchIndexOpFoldResults(sizes[idx], dynamicDims, newOutputShape);
-          currReductionDims++;
-          continue;
-        }
-        int64_t oldIdx = idx - currReductionDims;
-        int64_t dim = oldShape[oldIdx];
-        newOutputShape.push_back(dim);
-        if (ShapedType::isDynamic(dim))
-          dynamicDims.push_back(b.create<tensor::DimOp>(
-              loc, linalgOp.getDpsInitOperand(initIdx)->get(), oldIdx));
+      // Append the new partial result dimensions.
+      AffineMap partialMap =
+          getPartialResultAffineMap(linalgOp, reductionDims, initIdx);
+      SmallVector<OpFoldResult> partialResultShape;
+      for (AffineExpr dimExpr : partialMap.getResults()) {
+        auto dim = cast<AffineDimExpr>(dimExpr);
+        partialResultShape.push_back(tiledShape[dim.getPosition()]);
       }
-      Value emptyTensor = b.create<tensor::EmptyOp>(
-          loc, newOutputShape,
-          linalgOp.getRegionOutputArgs()[initIdx].getType(), dynamicDims);
+
+      Type elType =
+          getElementTypeOrSelf(linalgOp->getResult(initIdx).getType());
+      Value emptyTensor =
+          b.create<tensor::EmptyOp>(loc, partialResultShape, elType);
       Value constantOp = b.create<arith::ConstantOp>(loc, *identity);
       auto identityTensor =
           b.create<linalg::FillOp>(loc, constantOp, emptyTensor);
@@ -407,11 +426,7 @@ struct LinalgOpPartialReductionInterface
       // TODO: linalg::Generic doesn't have getDpsInitOperands. Can replace
       // this with a for range loop when we have it.
       AffineMap newMap =
-          linalgOp.getMatchingIndexingMap(linalgOp.getDpsInitOperand(idx));
-      for (int redPos : reductionDims) {
-        newMap = newMap.insertResult(b.getAffineDimExpr(redPos),
-                                     newMap.getNumResults());
-      }
+          getPartialResultAffineMap(linalgOp, reductionDims, idx);
       newInitMaps.push_back(newMap);
     }
 
@@ -476,29 +491,75 @@ struct LinalgOpPartialReductionInterface
                                          Location loc, ValueRange partialReduce,
                                          ArrayRef<int> reductionDims) const {
     auto linalgOp = cast<LinalgOp>(op);
-    SmallVector<int64_t> reductionDimsInt64(reductionDims);
-    auto reduction = b.create<linalg::ReduceOp>(
-        loc, partialReduce, linalgOp.getDpsInits(), reductionDimsInt64,
-        [&linalgOp](OpBuilder &b, Location loc, ValueRange inputs) {
-          int64_t numInits = linalgOp.getNumDpsInits();
-          SmallVector<Value> yieldedValues;
-          for (int idx : llvm::seq<int>(0, numInits)) {
+
+    // Permute the reduction dims as permuted by the partial result map.
+
+    int64_t numInits = linalgOp.getNumDpsInits();
+    SmallVector<Operation *> mergeOperations;
+    SmallVector<Value> replacements;
+    for (int idx : llvm::seq(numInits)) {
+      // linalg.reduce's iteration space is the tiled result's iteration space
+      // (and not the tiled operation's iteration space). To account for this,
+      // permute the reduction dimensions based on the partial result map of the
+      // tiled result.
+      AffineMap partialMap =
+          getPartialResultAffineMap(linalgOp, reductionDims, idx);
+      SmallVector<int64_t> partialReductionDims;
+      for (auto [resultNum, dimExpr] :
+           llvm::enumerate(partialMap.getResults())) {
+        unsigned dim = cast<AffineDimExpr>(dimExpr).getPosition();
+        if (llvm::find(reductionDims, dim) != reductionDims.end()) {
+          partialReductionDims.push_back(resultNum);
+        }
+      }
+
+      Value partialResult = partialReduce[idx];
+      Value init = linalgOp.getDpsInits()[idx];
+
+      auto reduction = b.create<linalg::ReduceOp>(
+          loc, partialResult, init, partialReductionDims,
+          [&linalgOp, &idx](OpBuilder &b, Location loc, ValueRange inputs) {
             // Get the combiner op.
             SmallVector<Operation *, 4> combinerOps;
             matchReduction(linalgOp.getRegionOutputArgs(), idx, combinerOps);
             Operation *clonedReductionOp = b.clone(*combinerOps[0]);
             // Combine the input at idx and output at numInits + idx.
-            clonedReductionOp->setOperand(0, inputs[idx]);
-            clonedReductionOp->setOperand(1, inputs[numInits + idx]);
-            // Yield.
-            yieldedValues.push_back(clonedReductionOp->getResult(0));
-          }
-          b.create<linalg::YieldOp>(loc, yieldedValues);
-        });
-    return MergeResult{
-        {reduction.getOperation()},
-        llvm::map_to_vector(reduction->getResults(),
-                            [](OpResult r) -> Value { return r; })};
+            clonedReductionOp->setOperand(0, inputs[0]);
+            clonedReductionOp->setOperand(1, inputs[1]);
+            b.create<linalg::YieldOp>(loc, clonedReductionOp->getResult(0));
+          });
+
+      mergeOperations.push_back(reduction);
+      replacements.push_back(reduction->getResult(0));
+    }
+
+    return MergeResult{mergeOperations, replacements};
+  }
+
+  LogicalResult getPartialResultTilePosition(
+      Operation *op, OpBuilder &b, unsigned resultNumber,
+      ArrayRef<OpFoldResult> offsets, ArrayRef<OpFoldResult> sizes,
+      SmallVector<OpFoldResult> &resultOffsets,
+      SmallVector<OpFoldResult> &resultSizes,
+      ArrayRef<int> reductionDims) const {
+    auto linalgOp = cast<LinalgOp>(op);
+
+    AffineMap partialMap =
+        getPartialResultAffineMap(linalgOp, reductionDims, resultNumber);
+    for (AffineExpr dimExpr : partialMap.getResults()) {
+      unsigned dim = cast<AffineDimExpr>(dimExpr).getPosition();
+      resultSizes.push_back(sizes[dim]);
+
+      if (llvm::find(reductionDims, dim) != reductionDims.end()) {
+        // Reduction dims are reduced, and are always outputed in the same
+        // place. So use offset 0 for them.
+        resultOffsets.push_back(b.getIndexAttr(0));
+      } else {
+        resultOffsets.push_back(offsets[dim]);
+      }
+    }
+
+    return success();
   }
 };
 
diff --git a/mlir/lib/Dialect/Linalg/Transforms/Transforms.cpp b/mlir/lib/Dialect/Linalg/Transforms/Transforms.cpp
index 60cf897..50593b0 100644
--- a/mlir/lib/Dialect/Linalg/Transforms/Transforms.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/Transforms.cpp
@@ -1656,8 +1656,8 @@ void linalg::populateDecomposeConvolutionPatterns(RewritePatternSet &patterns,
 }
 
 void linalg::populateDecomposePackUnpackPatterns(RewritePatternSet &patterns) {
-  // TODO: Add and test patterns for tensor.unpack
   patterns.add<DecomposeOuterUnitDimsPackOpPattern>(patterns.getContext());
+  patterns.add<DecomposeOuterUnitDimsUnPackOpPattern>(patterns.getContext());
 }
 
 void linalg::populateDecomposePadPatterns(RewritePatternSet &patterns) {
diff --git a/mlir/lib/Dialect/Quant/Transforms/StripFuncQuantTypes.cpp b/mlir/lib/Dialect/Quant/Transforms/StripFuncQuantTypes.cpp
index 6191272..71b88d1 100644
--- a/mlir/lib/Dialect/Quant/Transforms/StripFuncQuantTypes.cpp
+++ b/mlir/lib/Dialect/Quant/Transforms/StripFuncQuantTypes.cpp
@@ -56,7 +56,6 @@ public:
     addConversion(convertQuantizedType);
     addConversion(convertTensorType);
 
-    addArgumentMaterialization(materializeConversion);
     addSourceMaterialization(materializeConversion);
     addTargetMaterialization(materializeConversion);
   }
diff --git a/mlir/lib/Dialect/SCF/IR/SCF.cpp b/mlir/lib/Dialect/SCF/IR/SCF.cpp
index eded1c3..83ae79c 100644
--- a/mlir/lib/Dialect/SCF/IR/SCF.cpp
+++ b/mlir/lib/Dialect/SCF/IR/SCF.cpp
@@ -839,8 +839,7 @@ mlir::scf::replaceAndCastForOpIterArg(RewriterBase &rewriter, scf::ForOp forOp,
 namespace {
 // Fold away ForOp iter arguments when:
 // 1) The op yields the iter arguments.
-// 2) The iter arguments have no use and the corresponding outer region
-// iterators (inputs) are yielded.
+// 2) The argument's corresponding outer region iterators (inputs) are yielded.
 // 3) The iter arguments have no use and the corresponding (operation) results
 // have no use.
 //
@@ -872,30 +871,28 @@ struct ForOpIterArgsFolder : public OpRewritePattern<scf::ForOp> {
     newIterArgs.reserve(forOp.getInitArgs().size());
     newYieldValues.reserve(numResults);
     newResultValues.reserve(numResults);
-    for (auto it : llvm::zip(forOp.getInitArgs(),       // iter from outside
-                             forOp.getRegionIterArgs(), // iter inside region
-                             forOp.getResults(),        // op results
-                             forOp.getYieldedValues()   // iter yield
-                             )) {
+    for (auto [init, arg, result, yielded] :
+         llvm::zip(forOp.getInitArgs(),       // iter from outside
+                   forOp.getRegionIterArgs(), // iter inside region
+                   forOp.getResults(),        // op results
+                   forOp.getYieldedValues()   // iter yield
+                   )) {
       // Forwarded is `true` when:
       // 1) The region `iter` argument is yielded.
-      // 2) The region `iter` argument has no use, and the corresponding iter
-      // operand (input) is yielded.
+      // 2) The region `iter` argument the corresponding input is yielded.
       // 3) The region `iter` argument has no use, and the corresponding op
       // result has no use.
-      bool forwarded = ((std::get<1>(it) == std::get<3>(it)) ||
-                        (std::get<1>(it).use_empty() &&
-                         (std::get<0>(it) == std::get<3>(it) ||
-                          std::get<2>(it).use_empty())));
+      bool forwarded = (arg == yielded) || (init == yielded) ||
+                       (arg.use_empty() && result.use_empty());
       keepMask.push_back(!forwarded);
       canonicalize |= forwarded;
       if (forwarded) {
-        newBlockTransferArgs.push_back(std::get<0>(it));
-        newResultValues.push_back(std::get<0>(it));
+        newBlockTransferArgs.push_back(init);
+        newResultValues.push_back(init);
         continue;
       }
-      newIterArgs.push_back(std::get<0>(it));
-      newYieldValues.push_back(std::get<3>(it));
+      newIterArgs.push_back(init);
+      newYieldValues.push_back(yielded);
       newBlockTransferArgs.push_back(Value()); // placeholder with null value
       newResultValues.push_back(Value());      // placeholder with null value
     }
diff --git a/mlir/lib/Dialect/SCF/Transforms/TileUsingInterface.cpp b/mlir/lib/Dialect/SCF/Transforms/TileUsingInterface.cpp
index 90db42d..b548f8c 100644
--- a/mlir/lib/Dialect/SCF/Transforms/TileUsingInterface.cpp
+++ b/mlir/lib/Dialect/SCF/Transforms/TileUsingInterface.cpp
@@ -28,6 +28,7 @@
 #include "mlir/Interfaces/TilingInterface.h"
 #include "mlir/Rewrite/FrozenRewritePatternSet.h"
 #include "mlir/Transforms/GreedyPatternRewriteDriver.h"
+#include "llvm/ADT/ScopeExit.h"
 #include "llvm/ADT/TypeSwitch.h"
 #include "llvm/Support/Debug.h"
 #include <optional>
@@ -656,21 +657,29 @@ getResultTilePosition(RewriterBase &rewriter, int64_t index, Value tiledResult,
                                     resultOffset, resultSize);
   case scf::SCFTilingOptions::ReductionTilingStrategy::
       PartialReductionOuterReduction: {
-    // TODO: This does not work for non identity accesses to the result tile.
-    // The proper fix is to add a getPartialResultTilePosition method to
-    // PartialReductionOpInterface.
-    resultOffset =
-        SmallVector<OpFoldResult>(offsets.size(), rewriter.getIndexAttr(0));
-    for (size_t i = 0; i < offsets.size(); i++) {
-      resultSize.push_back(
-          tensor::getMixedSize(rewriter, op.getLoc(), tiledResult, i));
+    auto redOp = dyn_cast<PartialReductionOpInterface>(op.getOperation());
+    if (!redOp) {
+      return rewriter.notifyMatchFailure(
+          op, "PartialReductionOuterReduction tiling strategy is only supported"
+              "for operations implementing PartialReductionOpInterface");
     }
-    return success();
+    // Get reduction dimensions.
+    // TODO: PartialReductionOpInterface should really query TilingInterface
+    // itself and find reduction dimensions.
+    SmallVector<int> reductionDims;
+    for (auto [idx, iteratorType] :
+         llvm::enumerate(op.getLoopIteratorTypes())) {
+      if (iteratorType == utils::IteratorType::reduction)
+        reductionDims.push_back(idx);
+    }
+    return redOp.getPartialResultTilePosition(rewriter, index, offsets, sizes,
+                                              resultOffset, resultSize,
+                                              reductionDims);
+  }
   default:
     return rewriter.notifyMatchFailure(op,
                                        "unhandled reduction tiling strategy");
   }
-  }
 }
 
 static FailureOr<MergeResult>
@@ -1467,6 +1476,47 @@ void SliceTrackingListener::notifyOperationReplaced(Operation *op,
                                                     ValueRange replacement) {
   removeOp(op);
 }
+
+//===----------------------------------------------------------------------===//
+// ReplacementListener
+//===----------------------------------------------------------------------===//
+
+/// Listener that tracks updates replacements for values which can be mutated.
+/// This listener runs on top of the existing listener for the rewriter,
+/// to make sure external users can still run listeners.
+class ReplacementListener : public RewriterBase::ForwardingListener {
+public:
+  ReplacementListener(DenseMap<Value, Value> &replacements,
+                      OpBuilder::Listener *listener)
+      : ForwardingListener(listener), replacements(replacements) {}
+
+  void updateReplacementValues(ValueRange origValues,
+                               ValueRange replaceValues) {
+    // This can probably be written better, but just iterates over the map
+    // and the new replacements for now.
+    for (auto &[key, val] : replacements) {
+      for (auto [orig, replace] : llvm::zip_equal(origValues, replaceValues)) {
+        if (val == orig) {
+          val = replace;
+        }
+      }
+    }
+  }
+
+  void notifyOperationReplaced(Operation *op, Operation *newOp) override {
+    ForwardingListener::notifyOperationReplaced(op, newOp);
+    updateReplacementValues(op->getResults(), newOp->getResults());
+  }
+
+  void notifyOperationReplaced(Operation *op, ValueRange values) override {
+    ForwardingListener::notifyOperationReplaced(op, values);
+    updateReplacementValues(op->getResults(), values);
+  }
+
+private:
+  DenseMap<Value, Value> &replacements;
+};
+
 } // namespace
 
 /// Implementation of tile consumer and fuse producer greedily.
@@ -1493,26 +1543,27 @@ mlir::scf::tileConsumerAndFuseProducersUsingSCF(
   for (auto *tiledOp : tilingResult->tiledOps)
     tiledAndFusedOps.insert(tiledOp);
 
+  DenseMap<Value, Value> replacements;
+  for (auto [origVal, replacement] : llvm::zip_equal(
+           consumer->getResults(), tilingResult->mergeResult.replacements)) {
+    replacements[origVal] = replacement;
+  }
+
   // If there are no loops generated, fusion is immaterial.
   auto &loops = tilingResult->loops;
   if (loops.empty()) {
-    DenseMap<Value, Value> replacements;
-    for (auto [origVal, replacement] : llvm::zip_equal(
-             consumer->getResults(), tilingResult->mergeResult.replacements)) {
-      replacements[origVal] = replacement;
-    }
     return scf::SCFTileAndFuseResult{fusedProducers, tiledAndFusedOps, loops,
                                      replacements};
   }
 
-  // To keep track of replacements for now just record the map from the
-  // original untiled value to the result number of the for loop. Since the
-  // loop gets potentially replaced during fusion, keeping the value directly
-  // wont work.
-  DenseMap<Value, size_t> origValToResultNumber;
-  for (auto [index, result] : llvm::enumerate(consumer->getResults())) {
-    origValToResultNumber[result] = index;
-  }
+  // Since the loop gets potentially replaced during fusion, we need to track
+  // the mutation of replacement values. To do this, we attach a listener to
+  // update the replacements as they happen.
+  OpBuilder::Listener *previousListener = rewriter.getListener();
+  auto resetListener =
+      llvm::make_scope_exit([&]() { rewriter.setListener(previousListener); });
+  ReplacementListener replaceListener(replacements, previousListener);
+  rewriter.setListener(&replaceListener);
 
   // 2. Typically, the operands of the tiled operation are slices of the
   //    operands of the untiled operation. These are expressed in IR using
@@ -1581,9 +1632,9 @@ mlir::scf::tileConsumerAndFuseProducersUsingSCF(
       worklistCandidates.append(newSlices.value());
       for (auto [index, result] :
            llvm::enumerate(fusableProducerOp->getResults())) {
-        origValToResultNumber[result] = loops.front()->getNumResults() -
-                                        fusableProducerOp->getNumResults() +
-                                        index;
+        replacements[result] = loops.front()->getResult(
+            loops.front()->getNumResults() -
+            fusableProducerOp->getNumResults() + index);
       }
     }
     if (Operation *tiledAndFusedOp =
@@ -1597,11 +1648,6 @@ mlir::scf::tileConsumerAndFuseProducersUsingSCF(
     }
   }
 
-  DenseMap<Value, Value> replacements;
-  for (auto [origVal, resultNumber] : origValToResultNumber) {
-    replacements[origVal] = loops.front()->getResult(resultNumber);
-  }
-
   return scf::SCFTileAndFuseResult{fusedProducers, tiledAndFusedOps, loops,
                                    replacements};
 }
diff --git a/mlir/lib/Dialect/SCF/Utils/Utils.cpp b/mlir/lib/Dialect/SCF/Utils/Utils.cpp
index 41410a0..6cda710 100644
--- a/mlir/lib/Dialect/SCF/Utils/Utils.cpp
+++ b/mlir/lib/Dialect/SCF/Utils/Utils.cpp
@@ -329,8 +329,9 @@ static void generateUnrolledLoop(
   // 'forOp'.
   auto builder = OpBuilder::atBlockTerminator(loopBodyBlock);
 
+  constexpr auto defaultAnnotateFn = [](unsigned, Operation *, OpBuilder) {};
   if (!annotateFn)
-    annotateFn = [](unsigned, Operation *, OpBuilder) {};
+    annotateFn = defaultAnnotateFn;
 
   // Keep a pointer to the last non-terminator operation in the original block
   // so that we know what to clone (since we are doing this in-place).
diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/SparseIterationToScf.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/SparseIterationToScf.cpp
index e8a40b1..9e9fea7 100644
--- a/mlir/lib/Dialect/SparseTensor/Transforms/SparseIterationToScf.cpp
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/SparseIterationToScf.cpp
@@ -7,11 +7,17 @@
 #include "mlir/Dialect/SCF/IR/SCF.h"
 #include "mlir/Dialect/SparseTensor/IR/SparseTensor.h"
 #include "mlir/Dialect/SparseTensor/Transforms/Passes.h"
-#include "mlir/Transforms/OneToNTypeConversion.h"
+#include "mlir/Transforms/DialectConversion.h"
 
 using namespace mlir;
 using namespace mlir::sparse_tensor;
 
+/// Assert that the given value range contains a single value and return it.
+static Value getSingleValue(ValueRange values) {
+  assert(values.size() == 1 && "expected single value");
+  return values.front();
+}
+
 static void convertLevelType(SparseTensorEncodingAttr enc, Level lvl,
                              SmallVectorImpl<Type> &fields) {
   // Position and coordinate buffer in the sparse structure.
@@ -54,14 +60,17 @@ static ValueRange
 genCoIterateBranchNest(PatternRewriter &rewriter, Location loc, CoIterateOp op,
                        Value loopCrd,
                        ArrayRef<std::unique_ptr<SparseIterator>> iters,
-                       ArrayRef<Region *> subCases, ArrayRef<Value> userReduc) {
-  if (subCases.empty())
+                       ArrayRef<Block *> newBlocks, ArrayRef<Block *> oldBlocks,
+                       ArrayRef<Value> userReduc) {
+  if (newBlocks.empty())
     return userReduc;
 
   // The current branch that we are handling.
-  Region *b = subCases.front();
+  Block *newBlock = newBlocks.front();
+  Block *oldBlock = oldBlocks.front();
   Value casePred = constantI1(rewriter, loc, true);
-  I64BitSet caseBits = op.getRegionDefinedSpace(b->getRegionNumber());
+  I64BitSet caseBits =
+      op.getRegionDefinedSpace(newBlock->getParent()->getRegionNumber());
   for (unsigned i : caseBits.bits()) {
     SparseIterator *it = iters[i].get();
     Value pred = rewriter.create<arith::CmpIOp>(loc, arith::CmpIPredicate::eq,
@@ -80,16 +89,20 @@ genCoIterateBranchNest(PatternRewriter &rewriter, Location loc, CoIterateOp op,
   for (unsigned idx : caseBits.bits())
     llvm::append_range(blockArgs, iters[idx]->getCursor());
 
+  // Map the old block arguments, because the dialect conversion driver does
+  // not immediately perform SSA value replacements. This function is still
+  // seeing the old uses.
   IRMapping mapping;
-  for (auto [from, to] :
-       llvm::zip_equal(b->front().getArguments(), blockArgs)) {
+  for (auto [from, to] : llvm::zip_equal(oldBlock->getArguments(), blockArgs)) {
     mapping.map(from, to);
   }
 
   // Clone the region, we can not erase the region now because the same region
   // might be a subcase for multiple lattice point.
-  rewriter.cloneRegionBefore(*b, ifOp.getThenRegion(),
+  rewriter.cloneRegionBefore(*newBlock->getParent(), ifOp.getThenRegion(),
                              ifOp.getThenRegion().begin(), mapping);
+  // Remove the block arguments, they were already replaced via `mapping`.
+  ifOp.getThenRegion().front().eraseArguments(0, blockArgs.size());
 
   // replace sparse_tensor::YieldOp -> scf::YieldOp
   auto spY = cast<sparse_tensor::YieldOp>(&ifOp.getThenRegion().front().back());
@@ -101,7 +114,8 @@ genCoIterateBranchNest(PatternRewriter &rewriter, Location loc, CoIterateOp op,
   // Generates remaining case recursively.
   rewriter.setInsertionPointToStart(&ifOp.getElseRegion().front());
   ValueRange res = genCoIterateBranchNest(rewriter, loc, op, loopCrd, iters,
-                                          subCases.drop_front(), userReduc);
+                                          newBlocks.drop_front(),
+                                          oldBlocks.drop_front(), userReduc);
   if (!res.empty())
     rewriter.create<scf::YieldOp>(loc, res);
 
@@ -119,15 +133,13 @@ static ValueRange genLoopWithIterator(
   if (it->iteratableByFor()) {
     auto [lo, hi] = it->genForCond(rewriter, loc);
     Value step = constantIndex(rewriter, loc, 1);
-    scf::ForOp forOp = rewriter.create<scf::ForOp>(loc, lo, hi, step, reduc);
+    scf::ForOp forOp = rewriter.create<scf::ForOp>(
+        loc, lo, hi, step, reduc,
+        [&](OpBuilder &b, Location loc, Value iv, ValueRange iterArgs) {
+          // Empty builder function to ensure that no terminator is created.
+        });
     {
       OpBuilder::InsertionGuard guard(rewriter);
-      // Erase the implicit yield operation created by ForOp when there is no
-      // yielding values.
-      if (!forOp.getBody()->empty())
-        rewriter.eraseOp(&forOp.getBody()->front());
-      assert(forOp.getBody()->empty());
-
       it->linkNewScope(forOp.getInductionVar());
       rewriter.setInsertionPointToStart(forOp.getBody());
       SmallVector<Value> ret = bodyBuilder(rewriter, loc, forOp.getBodyRegion(),
@@ -178,46 +190,47 @@ namespace {
 
 /// Sparse codegen rule for number of entries operator.
 class ExtractIterSpaceConverter
-    : public OneToNOpConversionPattern<ExtractIterSpaceOp> {
+    : public OpConversionPattern<ExtractIterSpaceOp> {
 public:
-  using OneToNOpConversionPattern::OneToNOpConversionPattern;
+  using OpConversionPattern::OpConversionPattern;
   LogicalResult
-  matchAndRewrite(ExtractIterSpaceOp op, OpAdaptor adaptor,
-                  OneToNPatternRewriter &rewriter) const override {
+  matchAndRewrite(ExtractIterSpaceOp op, OneToNOpAdaptor adaptor,
+                  ConversionPatternRewriter &rewriter) const override {
     Location loc = op.getLoc();
-    const OneToNTypeMapping &resultMapping = adaptor.getResultMapping();
 
     // Construct the iteration space.
-    SparseIterationSpace space(loc, rewriter, op.getTensor(), 0,
+    SparseIterationSpace space(loc, rewriter,
+                               getSingleValue(adaptor.getTensor()), 0,
                                op.getLvlRange(), adaptor.getParentIter());
 
     SmallVector<Value> result = space.toValues();
-    rewriter.replaceOp(op, result, resultMapping);
+    rewriter.replaceOpWithMultiple(op, {result});
     return success();
   }
 };
 
 /// Sparse codegen rule for number of entries operator.
-class ExtractValOpConverter : public OneToNOpConversionPattern<ExtractValOp> {
+class ExtractValOpConverter : public OpConversionPattern<ExtractValOp> {
 public:
-  using OneToNOpConversionPattern::OneToNOpConversionPattern;
+  using OpConversionPattern::OpConversionPattern;
   LogicalResult
-  matchAndRewrite(ExtractValOp op, OpAdaptor adaptor,
-                  OneToNPatternRewriter &rewriter) const override {
+  matchAndRewrite(ExtractValOp op, OneToNOpAdaptor adaptor,
+                  ConversionPatternRewriter &rewriter) const override {
     Location loc = op.getLoc();
     Value pos = adaptor.getIterator().back();
-    Value valBuf = rewriter.create<ToValuesOp>(loc, op.getTensor());
+    Value valBuf =
+        rewriter.create<ToValuesOp>(loc, getSingleValue(adaptor.getTensor()));
     rewriter.replaceOpWithNewOp<memref::LoadOp>(op, valBuf, pos);
     return success();
   }
 };
 
-class SparseIterateOpConverter : public OneToNOpConversionPattern<IterateOp> {
+class SparseIterateOpConverter : public OpConversionPattern<IterateOp> {
 public:
-  using OneToNOpConversionPattern::OneToNOpConversionPattern;
+  using OpConversionPattern::OpConversionPattern;
   LogicalResult
-  matchAndRewrite(IterateOp op, OpAdaptor adaptor,
-                  OneToNPatternRewriter &rewriter) const override {
+  matchAndRewrite(IterateOp op, OneToNOpAdaptor adaptor,
+                  ConversionPatternRewriter &rewriter) const override {
     if (!op.getCrdUsedLvls().empty())
       return rewriter.notifyMatchFailure(
           op, "non-empty coordinates list not implemented.");
@@ -235,14 +248,15 @@ public:
       llvm::append_range(ivs, inits);
 
     // Type conversion on iterate op block.
-    OneToNTypeMapping blockTypeMapping(op.getBody()->getArgumentTypes());
+    unsigned numOrigArgs = op.getBody()->getArgumentTypes().size();
+    TypeConverter::SignatureConversion signatureConversion(numOrigArgs);
     if (failed(typeConverter->convertSignatureArgs(
-            op.getBody()->getArgumentTypes(), blockTypeMapping)))
+            op.getBody()->getArgumentTypes(), signatureConversion)))
       return rewriter.notifyMatchFailure(
           op, "failed to convert iterate region argurment types");
-    rewriter.applySignatureConversion(op.getBody(), blockTypeMapping);
 
-    Block *block = op.getBody();
+    Block *block = rewriter.applySignatureConversion(
+        op.getBody(), signatureConversion, getTypeConverter());
     ValueRange ret = genLoopWithIterator(
         rewriter, loc, it.get(), ivs,
         [block](PatternRewriter &rewriter, Location loc, Region &loopBody,
@@ -263,19 +277,17 @@ public:
           return result;
         });
 
-    const OneToNTypeMapping &resultMapping = adaptor.getResultMapping();
-    rewriter.replaceOp(op, ret, resultMapping);
+    rewriter.replaceOp(op, ret);
     return success();
   }
 };
 
-class SparseCoIterateOpConverter
-    : public OneToNOpConversionPattern<CoIterateOp> {
-  using OneToNOpConversionPattern::OneToNOpConversionPattern;
+class SparseCoIterateOpConverter : public OpConversionPattern<CoIterateOp> {
+  using OpConversionPattern::OpConversionPattern;
 
   LogicalResult
-  matchAndRewrite(CoIterateOp op, OpAdaptor adaptor,
-                  OneToNPatternRewriter &rewriter) const override {
+  matchAndRewrite(CoIterateOp op, OneToNOpAdaptor adaptor,
+                  ConversionPatternRewriter &rewriter) const override {
     assert(op.getSpaceDim() == 1 && "Not implemented");
     Location loc = op.getLoc();
 
@@ -299,18 +311,23 @@ class SparseCoIterateOpConverter
     assert(!needUniv && "Not implemented");
     (void)needUniv;
 
+    SmallVector<Block *> newBlocks;
+    DenseMap<Block *, Block *> newToOldBlockMap;
     for (Region &region : op.getCaseRegions()) {
       // Do a one-shot type conversion on all region blocks, since the same
       // region might be used multiple time.
       Block *block = &region.getBlocks().front();
-      OneToNTypeMapping blockTypeMapping(block->getArgumentTypes());
+      TypeConverter::SignatureConversion blockTypeMapping(
+          block->getArgumentTypes().size());
       if (failed(typeConverter->convertSignatureArgs(block->getArgumentTypes(),
                                                      blockTypeMapping))) {
         return rewriter.notifyMatchFailure(
             op, "failed to convert coiterate region argurment types");
       }
 
-      rewriter.applySignatureConversion(block, blockTypeMapping);
+      newBlocks.push_back(rewriter.applySignatureConversion(
+          block, blockTypeMapping, getTypeConverter()));
+      newToOldBlockMap[newBlocks.back()] = block;
     }
 
     SmallVector<SparseIterationSpace> spaces;
@@ -343,7 +360,7 @@ class SparseCoIterateOpConverter
 
     // Generates a loop sequence, one loop per case.
     for (auto [r, caseBits] :
-         llvm::zip_equal(op.getCaseRegions(), op.getRegionDefinedSpaces())) {
+         llvm::zip_equal(newBlocks, op.getRegionDefinedSpaces())) {
       assert(caseBits.count() > 0 && "Complement space not implemented");
 
       // Retrives a vector of pointers to the iterators used in the case.
@@ -359,11 +376,17 @@ class SparseCoIterateOpConverter
         // The subcases are never empty, it must contains at least the current
         // region itself.
         // TODO: these cases should be sorted.
-        SmallVector<Region *> subCases = op.getSubCasesOf(r.getRegionNumber());
+        SmallVector<Region *> subCases =
+            op.getSubCasesOf(r->getParent()->getRegionNumber());
+        SmallVector<Block *> newBlocks, oldBlocks;
+        for (Region *r : subCases) {
+          newBlocks.push_back(&r->front());
+          oldBlocks.push_back(newToOldBlockMap[newBlocks.back()]);
+        }
         assert(!subCases.empty());
 
-        ValueRange res = genCoIterateBranchNest(rewriter, loc, op, loopCrd,
-                                                iters, subCases, userReduc);
+        ValueRange res = genCoIterateBranchNest(
+            rewriter, loc, op, loopCrd, iters, newBlocks, oldBlocks, userReduc);
 
         SmallVector<Value> nextIterYields(res);
         // 2nd. foward the loop.
@@ -388,7 +411,7 @@ class SparseCoIterateOpConverter
         // This is a simple iteration loop.
         assert(caseBits.count() == 1);
 
-        Block *block = &r.getBlocks().front();
+        Block *block = r;
         ValueRange curResult = genLoopWithIterator(
             rewriter, loc, validIters.front(), userReduc,
             /*bodyBuilder=*/
diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorPasses.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorPasses.cpp
index 1cac949..153b9b1 100644
--- a/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorPasses.cpp
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorPasses.cpp
@@ -172,11 +172,16 @@ struct LowerSparseIterationToSCFPass
     ConversionTarget target(*ctx);
 
     // The actual conversion.
-    target.addIllegalOp<ExtractIterSpaceOp, IterateOp>();
+    target.addLegalDialect<arith::ArithDialect, linalg::LinalgDialect,
+                           memref::MemRefDialect, scf::SCFDialect,
+                           sparse_tensor::SparseTensorDialect>();
+    target.addIllegalOp<CoIterateOp, ExtractIterSpaceOp, ExtractValOp,
+                        IterateOp>();
+    target.addLegalOp<UnrealizedConversionCastOp>();
     populateLowerSparseIterationToSCFPatterns(converter, patterns);
 
-    if (failed(applyPartialOneToNConversion(getOperation(), converter,
-                                            std::move(patterns))))
+    if (failed(applyPartialConversion(getOperation(), target,
+                                      std::move(patterns))))
       signalPassFailure();
   }
 };
diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/Utils/SparseTensorDescriptor.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/Utils/SparseTensorDescriptor.cpp
index 834e363..8bbb2cac 100644
--- a/mlir/lib/Dialect/SparseTensor/Transforms/Utils/SparseTensorDescriptor.cpp
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/Utils/SparseTensorDescriptor.cpp
@@ -69,9 +69,6 @@ SparseTensorTypeToBufferConverter::SparseTensorTypeToBufferConverter() {
 
   // Required by scf.for 1:N type conversion.
   addSourceMaterialization(materializeTuple);
-
-  // Required as a workaround until we have full 1:N support.
-  addArgumentMaterialization(materializeTuple);
 }
 
 //===----------------------------------------------------------------------===//
diff --git a/mlir/lib/Dialect/Tensor/IR/TensorOps.cpp b/mlir/lib/Dialect/Tensor/IR/TensorOps.cpp
index f79c774..24a1d55 100644
--- a/mlir/lib/Dialect/Tensor/IR/TensorOps.cpp
+++ b/mlir/lib/Dialect/Tensor/IR/TensorOps.cpp
@@ -4795,6 +4795,44 @@ static SmallVector<Value> getNewOperands(DestinationStyleOpInterface op,
   return newOperands;
 }
 
+// Given the (potentially) updated packed type, `newPackedTy`, generates an
+// updated mixed-tile-sizes attribute. A tile size is updated only
+// when:
+//  * a dim from newPackedTy is static, and
+//  * the corresponding size from mixedTiles is still dynamic.
+// Otherwise, the original tile size is preserved.
+// Note - packed-type-dim and mixed-tile-size should always match!
+static SmallVector<OpFoldResult>
+getNewMixedTileSizes(PatternRewriter &rewriter, Type newPackedTy,
+                     SmallVector<OpFoldResult> mixedTiles) {
+  SmallVector<OpFoldResult> newMixedTileSizes;
+  for (auto it : llvm::zip(cast<ShapedType>(newPackedTy)
+                               .getShape()
+                               .take_back(mixedTiles.size()),
+                           mixedTiles)) {
+    int64_t shape = std::get<0>(it);
+    if (shape == ShapedType::kDynamic) {
+      newMixedTileSizes.push_back(std::get<1>(it));
+      continue;
+    }
+
+    // If the current result dim is static, update the dynamic mixed-size
+    // (provided the original value is dynamic).
+    OpFoldResult tile = std::get<1>(it);
+    if (Attribute attr = llvm::dyn_cast_if_present<Attribute>(tile)) {
+      // Already a constant
+      newMixedTileSizes.push_back(tile);
+    } else {
+      assert(getConstantIntValue(tile).value() == shape &&
+             "tile size and dim size don't match!");
+      newMixedTileSizes.push_back(
+          (rewriter.getIntegerAttr(rewriter.getIndexType(), shape)));
+    }
+  }
+
+  return newMixedTileSizes;
+}
+
 /// Folds a tensor.cast op into a consuming tensor::PackOp op if the
 /// `tensor.cast` has source that is more static than the consuming op.
 ///
@@ -4821,31 +4859,13 @@ struct FoldTensorCastPackOp : public OpRewritePattern<PackOp> {
     SmallVector<Value> newOperands = getNewOperands(op, newResultTypes);
 
     // Get the updated mixed-tile-sizes attribute.
-    SmallVector<OpFoldResult> newMixedTileSizes;
-    for (auto it : llvm::zip(cast<ShapedType>(newResultTypes[0])
-                                 .getShape()
-                                 .take_back(op.getMixedTiles().size()),
-                             op.getMixedTiles())) {
-      int64_t shape = std::get<0>(it);
-      if (shape == ShapedType::kDynamic) {
-        newMixedTileSizes.push_back(std::get<1>(it));
-        continue;
-      }
-
-      if (Attribute attr =
-              llvm::dyn_cast_if_present<Attribute>(std::get<1>(it))) {
-        // Already a constant
-        newMixedTileSizes.push_back(std::get<1>(it));
-      } else {
-        int64_t tileSize = getConstantIntValue(std::get<1>(it)).value();
-        assert(tileSize == shape && "tile size and dim size don't match!");
-        (void)tileSize;
-        newMixedTileSizes.push_back(
-            (rewriter.getIntegerAttr(rewriter.getIndexType(), shape)));
-      }
-    }
+    SmallVector<OpFoldResult> newMixedTileSizes =
+        getNewMixedTileSizes(rewriter, newResultTypes[0], op.getMixedTiles());
 
     // Clone op.
+    // TODO: Strictly speaking, discardable attributes should be _discarded_ at
+    // this point. However, in practice, we use them for things that we'd like
+    // to preserve. Implement a better abstraction.
     PackOp newOp = rewriter.create<PackOp>(
         op.getLoc(), newOperands[0], newOperands[1], op.getInnerDimsPos(),
         newMixedTileSizes, op.getPaddingValue(), op.getOuterDimsPerm());
@@ -4865,6 +4885,59 @@ struct FoldTensorCastPackOp : public OpRewritePattern<PackOp> {
   }
 };
 
+/// Folds a tensor.cast op into a consuming tensor::UnPackOp op if the
+/// `tensor.cast` has source that is more static than the consuming op.
+///
+/// Example:
+/// ```mlir
+///   %1 = tensor.cast %0 : tensor<1x1x8x1xi32> to tensor<1x1x?x1xi32>
+///   %2 = tensor.unpack %1 ... : tensor<1x1x?x1xi32> -> tensor<7x?xi32>
+/// ```
+///
+/// folds into:
+///
+/// ```mlir
+///   %2 = tensor.unpack %0  ... tensor<1x1x8x1xi32> -> tensor<7x?xi32>
+/// ```
+struct FoldTensorCastUnPackOp : public OpRewritePattern<UnPackOp> {
+  using OpRewritePattern<UnPackOp>::OpRewritePattern;
+
+  LogicalResult matchAndRewrite(UnPackOp op,
+                                PatternRewriter &rewriter) const override {
+    if (!foldTensorCastPrecondition(op))
+      return failure();
+
+    SmallVector<Type> newResultTypes(op->getResultTypes());
+    SmallVector<Value> newOperands = getNewOperands(op, newResultTypes);
+    Value sourceTensor = newOperands[0];
+
+    // Get the updated mixed-tile-sizes attribute.
+    SmallVector<OpFoldResult> newMixedTileSizes = getNewMixedTileSizes(
+        rewriter, sourceTensor.getType(), op.getMixedTiles());
+
+    // Clone op.
+    // TODO: Strictly speaking, discardable attributes should be _discarded_ at
+    // this point. However, in practice, we use them for things that we'd like
+    // to preserve. Implement a better abstraction.
+    UnPackOp newOp = rewriter.create<UnPackOp>(
+        op.getLoc(), sourceTensor, newOperands[1], op.getInnerDimsPos(),
+        newMixedTileSizes, op.getOuterDimsPerm());
+    newOp->setDiscardableAttrs(op->getDiscardableAttrDictionary());
+
+    // Replace op.
+    Value oldResult = op.getResult();
+    Value newResult = newOp.getResult();
+    Value replacement = (newResult.getType() != oldResult.getType())
+                            ? rewriter.create<tensor::CastOp>(
+                                  op->getLoc(), oldResult.getType(), newResult)
+                            : newResult;
+
+    rewriter.replaceOp(op, {replacement});
+
+    return success();
+  }
+};
+
 /// Folds a tensor.cast op into a consuming DestinationStyleOpInterface op if
 /// the `tensor.cast` has source that is more static than the consuming op.
 ///
@@ -4890,7 +4963,8 @@ struct FoldTensorCastProducerOp
                                 PatternRewriter &rewriter) const override {
 
     // Reject tensor::PackOp - there's dedicated pattern for that instead.
-    if (!foldTensorCastPrecondition(op) || dyn_cast<tensor::PackOp>(*op))
+    if (!foldTensorCastPrecondition(op) ||
+        isa<tensor::PackOp, tensor::UnPackOp>(*op))
       return failure();
 
     SmallVector<Type> newResultTypes(op->getResultTypes());
@@ -4923,6 +4997,7 @@ struct FoldTensorCastProducerOp
 void TensorDialect::getCanonicalizationPatterns(
     RewritePatternSet &results) const {
   results.add<FoldTensorCastPackOp>(getContext());
+  results.add<FoldTensorCastUnPackOp>(getContext());
   results.add<FoldTensorCastProducerOp>(getContext());
 }
 
diff --git a/mlir/lib/Dialect/Tosa/IR/TosaCanonicalizations.cpp b/mlir/lib/Dialect/Tosa/IR/TosaCanonicalizations.cpp
index 39d0ee1..f51c3db 100644
--- a/mlir/lib/Dialect/Tosa/IR/TosaCanonicalizations.cpp
+++ b/mlir/lib/Dialect/Tosa/IR/TosaCanonicalizations.cpp
@@ -1002,10 +1002,6 @@ OpFoldResult TransposeOp::fold(FoldAdaptor adaptor) {
       return input.reshape(resultTy);
   }
 
-  // Transpose does not change the input type.
-  if (getInput1().getType() != getType())
-    return {};
-
   // Transpose is not the identity transpose.
   SmallVector<int32_t> perms;
   if (getConstantPerms(perms).failed())
diff --git a/mlir/lib/Dialect/Tosa/IR/TosaOps.cpp b/mlir/lib/Dialect/Tosa/IR/TosaOps.cpp
index 631d3c48..764a5db 100644
--- a/mlir/lib/Dialect/Tosa/IR/TosaOps.cpp
+++ b/mlir/lib/Dialect/Tosa/IR/TosaOps.cpp
@@ -210,7 +210,12 @@ template <typename T>
 static LogicalResult verifyConvOp(T op) {
   // All TOSA conv ops have an input() and weight().
   auto inputType = llvm::dyn_cast<RankedTensorType>(op.getInput().getType());
-  auto weightType = llvm::dyn_cast<RankedTensorType>(op.getWeight().getType());
+
+  RankedTensorType weightType;
+  if constexpr (std::is_same_v<T, tosa::TransposeConv2DOp>)
+    weightType = llvm::dyn_cast<RankedTensorType>(op.getFilter().getType());
+  else
+    weightType = llvm::dyn_cast<RankedTensorType>(op.getWeight().getType());
 
   // Must be ranked tensor types
   if (!inputType) {
@@ -218,7 +223,13 @@ static LogicalResult verifyConvOp(T op) {
     return failure();
   }
   if (!weightType) {
-    op.emitOpError("expect a ranked tensor for weight, got ") << op.getWeight();
+    if constexpr (std::is_same_v<T, tosa::TransposeConv2DOp>) {
+      op.emitOpError("expect a ranked tensor for filter, got ")
+          << op.getFilter();
+    } else {
+      op.emitOpError("expect a ranked tensor for weight, got ")
+          << op.getWeight();
+    }
     return failure();
   }
 
@@ -271,6 +282,38 @@ LogicalResult tosa::ConstOp::verify() {
   return success();
 }
 
+template <typename T>
+static LogicalResult verifyConvOpModes(T op) {
+  auto inputEType =
+      llvm::cast<ShapedType>(op.getInput().getType()).getElementType();
+
+  if (auto quantType =
+          llvm::dyn_cast<mlir::quant::UniformQuantizedType>(inputEType))
+    inputEType = quantType.getStorageType();
+
+  auto accType = op.getAccType();
+  if (inputEType.isInteger(8) && !accType.isInteger(32))
+    return op.emitOpError("accumulator type for i8 tensor is not i32");
+
+  if (inputEType.isInteger(16) && !accType.isInteger(48))
+    return op.emitOpError("accumulator type for i16 tensor is not i48");
+
+  if ((inputEType.isFloat8E5M2() || inputEType.isFloat8E4M3()) &&
+      !accType.isF16())
+    return op.emitOpError("accumulator type for f8 tensor is not f16");
+
+  if (inputEType.isF16() && !(accType.isF16() || accType.isF32()))
+    return op.emitOpError("accumulator type for f16 tensor is not f16/f32");
+
+  if (inputEType.isBF16() && !accType.isF32())
+    return op.emitOpError("accumulator type for bf16 tensor is not f32");
+
+  if (inputEType.isF32() && !accType.isF32())
+    return op.emitOpError("accumulator type for f32 tensor is not f32");
+
+  return success();
+}
+
 LogicalResult tosa::ArgMaxOp::verify() {
   // Ensure output is of 32-bit integer
   const auto resultETy = llvm::cast<ShapedType>(getType()).getElementType();
@@ -368,12 +411,14 @@ static void buildConvOpWithQuantInfo(OpBuilder &builder, OperationState &result,
                                      Type outputType, Value input, Value weight,
                                      Value bias, DenseI64ArrayAttr pad,
                                      DenseI64ArrayAttr stride,
-                                     DenseI64ArrayAttr dilation) {
+                                     DenseI64ArrayAttr dilation,
+                                     TypeAttr accType) {
 
   result.addOperands({input, weight, bias});
   result.addAttribute("pad", pad);
   result.addAttribute("stride", stride);
   result.addAttribute("dilation", dilation);
+  result.addAttribute("acc_type", accType);
 
   auto quantAttr = buildConvOpQuantizationAttr(builder, input, weight);
   if (quantAttr) {
@@ -390,11 +435,12 @@ static void buildConvOpWithQuantInfo(OpBuilder &builder, OperationState &result,
 static void buildTransConvOpWithQuantInfo(
     OpBuilder &builder, OperationState &result, Type outputType, Value input,
     Value weight, Value bias, DenseI64ArrayAttr outpad,
-    DenseI64ArrayAttr stride, DenseI64ArrayAttr outputShape) {
+    DenseI64ArrayAttr stride, DenseI64ArrayAttr outputShape, TypeAttr accType) {
   result.addOperands({input, weight, bias});
   result.addAttribute("out_pad", outpad);
   result.addAttribute("stride", stride);
   result.addAttribute("out_shape", outputShape);
+  result.addAttribute("acc_type", accType);
   auto quantAttr = ::buildConvOpQuantizationAttr(builder, input, weight);
 
   if (quantAttr) {
@@ -787,7 +833,7 @@ LogicalResult tosa::PadOp::inferReturnTypeComponents(
       return success();
     }
 
-    outputShape.resize(paddingShape.getDimSize(0), ShapedType::kDynamic);
+    outputShape.resize(paddingShape.getDimSize(0) / 2, ShapedType::kDynamic);
     inferredReturnShapes.push_back(ShapedTypeComponents(outputShape));
     return success();
   }
@@ -823,13 +869,17 @@ LogicalResult tosa::PadOp::inferReturnTypeComponents(
 LogicalResult tosa::PadOp::verify() {
   RankedTensorType inputType = getInput1().getType();
   RankedTensorType outputType = getOutput().getType();
-  TensorType paddingType = getPadding().getType();
+  RankedTensorType paddingType = getPadding().getType();
 
   if (inputType.getRank() != outputType.getRank())
     return emitOpError() << "expect same input and output tensor rank.";
 
-  if (paddingType.hasRank() && paddingType.getRank() != 2)
-    return emitOpError() << "expect 'padding' tensor rank equal to 2.";
+  if (!paddingType.isDynamicDim(0) &&
+      paddingType.getDimSize(0) != inputType.getRank() * 2)
+    return emitOpError() << "expected padding tensor dim 0 to have size "
+                         << inputType.getRank() * 2
+                         << " (2*rank(shape1)) but got size "
+                         << paddingType.getDimSize(0);
 
   return success();
 }
@@ -1595,7 +1645,11 @@ LogicalResult Conv2DOp::inferReturnTypeComponents(
   return success();
 }
 
-LogicalResult Conv2DOp::verify() { return verifyConvOp(*this); }
+LogicalResult Conv2DOp::verify() {
+  if (verifyConvOp(*this).failed() || verifyConvOpModes(*this).failed())
+    return failure();
+  return success();
+}
 
 LogicalResult Conv3DOp::inferReturnTypeComponents(
     MLIRContext *context, ::std::optional<Location> location,
@@ -1667,7 +1721,11 @@ LogicalResult Conv3DOp::inferReturnTypeComponents(
   return success();
 }
 
-LogicalResult Conv3DOp::verify() { return verifyConvOp(*this); }
+LogicalResult Conv3DOp::verify() {
+  if (verifyConvOp(*this).failed() || verifyConvOpModes(*this).failed())
+    return failure();
+  return success();
+}
 
 LogicalResult AvgPool2dOp::inferReturnTypeComponents(
     MLIRContext *context, ::std::optional<Location> location,
@@ -1762,7 +1820,11 @@ LogicalResult DepthwiseConv2DOp::inferReturnTypeComponents(
   return success();
 }
 
-LogicalResult DepthwiseConv2DOp::verify() { return verifyConvOp(*this); }
+LogicalResult DepthwiseConv2DOp::verify() {
+  if (verifyConvOp(*this).failed() || verifyConvOpModes(*this).failed())
+    return failure();
+  return success();
+}
 
 LogicalResult TransposeConv2DOp::inferReturnTypeComponents(
     MLIRContext *context, ::std::optional<Location> location,
@@ -1828,6 +1890,12 @@ LogicalResult TransposeConv2DOp::inferReturnTypeComponents(
   return success();
 }
 
+LogicalResult TransposeConv2DOp::verify() {
+  if (verifyConvOp(*this).failed() || verifyConvOpModes(*this).failed())
+    return failure();
+  return success();
+}
+
 LogicalResult IfOp::inferReturnTypeComponents(
     MLIRContext *context, ::std::optional<Location> location,
     IfOp::Adaptor adaptor,
diff --git a/mlir/lib/Dialect/Tosa/Transforms/TosaDecomposeConv2D.cpp b/mlir/lib/Dialect/Tosa/Transforms/TosaDecomposeConv2D.cpp
index 44f64f7..04a709c 100644
--- a/mlir/lib/Dialect/Tosa/Transforms/TosaDecomposeConv2D.cpp
+++ b/mlir/lib/Dialect/Tosa/Transforms/TosaDecomposeConv2D.cpp
@@ -81,7 +81,7 @@ struct Conv2DIsFullyConnected : public OpRewritePattern<tosa::Conv2DOp> {
         }
       }
 
-      auto padSizeTy = RankedTensorType::get({4, 2}, rewriter.getI64Type());
+      auto padSizeTy = RankedTensorType::get({8}, rewriter.getI64Type());
       auto padSize =
           DenseIntElementsAttr::get(padSizeTy, ArrayRef<int64_t>(pad));
       Value padSizeVal =
diff --git a/mlir/lib/Dialect/Tosa/Transforms/TosaDecomposeDepthwise.cpp b/mlir/lib/Dialect/Tosa/Transforms/TosaDecomposeDepthwise.cpp
index e6fba21..14f392a 100644
--- a/mlir/lib/Dialect/Tosa/Transforms/TosaDecomposeDepthwise.cpp
+++ b/mlir/lib/Dialect/Tosa/Transforms/TosaDecomposeDepthwise.cpp
@@ -108,7 +108,7 @@ struct DepthwiseConv2DIsMul : public OpRewritePattern<tosa::DepthwiseConv2DOp> {
         }
       }
 
-      auto padSizeTy = RankedTensorType::get({5, 2}, rewriter.getI64Type());
+      auto padSizeTy = RankedTensorType::get({10}, rewriter.getI64Type());
       auto padSize =
           DenseIntElementsAttr::get(padSizeTy, ArrayRef<int64_t>(pad));
       Value padSizeVal =
diff --git a/mlir/lib/Dialect/Tosa/Transforms/TosaDecomposeTransposeConv.cpp b/mlir/lib/Dialect/Tosa/Transforms/TosaDecomposeTransposeConv.cpp
index 0779cdb..db1e219 100644
--- a/mlir/lib/Dialect/Tosa/Transforms/TosaDecomposeTransposeConv.cpp
+++ b/mlir/lib/Dialect/Tosa/Transforms/TosaDecomposeTransposeConv.cpp
@@ -75,13 +75,15 @@ public:
           loc, resultTy, input, reverse2, bias,
           rewriter.getDenseI64ArrayAttr(convPad),
           rewriter.getDenseI64ArrayAttr(stride),
-          rewriter.getDenseI64ArrayAttr({1, 1}), *op.getQuantizationInfo());
+          rewriter.getDenseI64ArrayAttr({1, 1}),
+          /* acc_type = */ op.getAccType(), *op.getQuantizationInfo());
     } else {
       conv2d = rewriter.create<tosa::Conv2DOp>(
           loc, resultTy, input, reverse2, bias,
           rewriter.getDenseI64ArrayAttr(convPad),
           rewriter.getDenseI64ArrayAttr(stride),
-          rewriter.getDenseI64ArrayAttr({1, 1}));
+          rewriter.getDenseI64ArrayAttr({1, 1}),
+          /* acc_type = */ op.getAccTypeAttr());
     }
 
     rewriter.replaceOp(op, conv2d);
@@ -139,7 +141,7 @@ public:
     weightPadding[5] =
         (weightWidth % stride[1]) ? (stride[1] - weightWidth % stride[1]) : 0;
     DenseElementsAttr weightPaddingAttr = DenseIntElementsAttr::get(
-        RankedTensorType::get({4, 2}, rewriter.getI32Type()), weightPadding);
+        RankedTensorType::get({8}, rewriter.getI32Type()), weightPadding);
     Value weightPaddingVal = CreateOpAndInferShape<tosa::ConstOp>(
         rewriter, loc, weightPaddingAttr.getType(), weightPaddingAttr);
 
@@ -202,7 +204,7 @@ public:
     inputPadding[5] += restridedWeightTy.getDimSize(2) - 1;
 
     DenseElementsAttr inputPaddingAttr = DenseIntElementsAttr::get(
-        RankedTensorType::get({4, 2}, rewriter.getI32Type()), inputPadding);
+        RankedTensorType::get({8}, rewriter.getI32Type()), inputPadding);
 
     Value inputPaddingVal = CreateOpAndInferShape<tosa::ConstOp>(
         rewriter, loc, inputPaddingAttr.getType(), inputPaddingAttr);
@@ -238,7 +240,7 @@ public:
                    /*pad=*/rewriter.getDenseI64ArrayAttr({0, 0, 0, 0}),
                    /*stride=*/rewriter.getDenseI64ArrayAttr({1, 1}),
                    /*dilation=*/rewriter.getDenseI64ArrayAttr({1, 1}),
-                   *op.getQuantizationInfo())
+                   /* acc_type = */ op.getAccType(), *op.getQuantizationInfo())
                    .getResult();
     } else {
       conv2d = CreateOpAndInferShape<tosa::Conv2DOp>(
@@ -246,7 +248,8 @@ public:
                    weight, zeroBias,
                    /*pad=*/rewriter.getDenseI64ArrayAttr({0, 0, 0, 0}),
                    /*stride=*/rewriter.getDenseI64ArrayAttr({1, 1}),
-                   /*dilation=*/rewriter.getDenseI64ArrayAttr({1, 1}))
+                   /*dilation=*/rewriter.getDenseI64ArrayAttr({1, 1}),
+                   /* acc_type = */ op.getAccTypeAttr())
                    .getResult();
     }
 
@@ -314,7 +317,7 @@ public:
     resultPadding[5] = resultTy.getDimSize(2) - resultPadLeft - sliceSize[2];
 
     DenseElementsAttr resultPaddingAttr = DenseIntElementsAttr::get(
-        RankedTensorType::get({4, 2}, rewriter.getI32Type()), resultPadding);
+        RankedTensorType::get({8}, rewriter.getI32Type()), resultPadding);
 
     Value resultPaddingVal = CreateOpAndInferShape<tosa::ConstOp>(
         rewriter, loc, resultPaddingAttr.getType(), resultPaddingAttr);
diff --git a/mlir/lib/Dialect/Tosa/Transforms/TosaValidation.cpp b/mlir/lib/Dialect/Tosa/Transforms/TosaValidation.cpp
index 6fd6710..8588c87 100644
--- a/mlir/lib/Dialect/Tosa/Transforms/TosaValidation.cpp
+++ b/mlir/lib/Dialect/Tosa/Transforms/TosaValidation.cpp
@@ -542,9 +542,13 @@ bool TosaValidation::isValidElementType(Type type) {
 
 void TosaValidation::runOnOperation() {
   configLevelAndProfile();
+
+  TosaDialect *tosaDialect = getContext().getLoadedDialect<TosaDialect>();
+  if (!tosaDialect)
+    return;
+
   getOperation().walk([&](Operation *op) {
-    if (!op->getDialect() ||
-        op->getDialect()->getNamespace() != TosaDialect::getDialectNamespace())
+    if (op->getDialect() != tosaDialect)
       return;
 
     for (Value operand : op->getOperands()) {
diff --git a/mlir/lib/Dialect/Transform/IR/TransformOps.cpp b/mlir/lib/Dialect/Transform/IR/TransformOps.cpp
index 106a794..798853a 100644
--- a/mlir/lib/Dialect/Transform/IR/TransformOps.cpp
+++ b/mlir/lib/Dialect/Transform/IR/TransformOps.cpp
@@ -2840,6 +2840,7 @@ transform::PrintOp::apply(transform::TransformRewriter &rewriter,
     llvm::outs() << "top-level ]]]\n";
     state.getTopLevel()->print(llvm::outs(), printFlags);
     llvm::outs() << "\n";
+    llvm::outs().flush();
     return DiagnosedSilenceableFailure::success();
   }
 
@@ -2849,6 +2850,7 @@ transform::PrintOp::apply(transform::TransformRewriter &rewriter,
     llvm::outs() << "\n";
   }
 
+  llvm::outs().flush();
   return DiagnosedSilenceableFailure::success();
 }
 
diff --git a/mlir/lib/Dialect/Vector/IR/VectorOps.cpp b/mlir/lib/Dialect/Vector/IR/VectorOps.cpp
index 491b5f4..ae1cf95 100644
--- a/mlir/lib/Dialect/Vector/IR/VectorOps.cpp
+++ b/mlir/lib/Dialect/Vector/IR/VectorOps.cpp
@@ -2523,8 +2523,16 @@ OpFoldResult BroadcastOp::fold(FoldAdaptor adaptor) {
   if (!adaptor.getSource())
     return {};
   auto vectorType = getResultVectorType();
-  if (llvm::isa<IntegerAttr, FloatAttr>(adaptor.getSource()))
-    return DenseElementsAttr::get(vectorType, adaptor.getSource());
+  if (auto attr = llvm::dyn_cast<IntegerAttr>(adaptor.getSource())) {
+    if (vectorType.getElementType() != attr.getType())
+      return {};
+    return DenseElementsAttr::get(vectorType, attr);
+  }
+  if (auto attr = llvm::dyn_cast<FloatAttr>(adaptor.getSource())) {
+    if (vectorType.getElementType() != attr.getType())
+      return {};
+    return DenseElementsAttr::get(vectorType, attr);
+  }
   if (auto attr = llvm::dyn_cast<SplatElementsAttr>(adaptor.getSource()))
     return DenseElementsAttr::get(vectorType, attr.getSplatValue<Attribute>());
   return {};
diff --git a/mlir/lib/Dialect/Vector/Transforms/VectorLinearize.cpp b/mlir/lib/Dialect/Vector/Transforms/VectorLinearize.cpp
index 7576319..68535ae 100644
--- a/mlir/lib/Dialect/Vector/Transforms/VectorLinearize.cpp
+++ b/mlir/lib/Dialect/Vector/Transforms/VectorLinearize.cpp
@@ -481,7 +481,6 @@ void mlir::vector::populateVectorLinearizeTypeConversionsAndLegality(
 
     return builder.create<vector::ShapeCastOp>(loc, type, inputs.front());
   };
-  typeConverter.addArgumentMaterialization(materializeCast);
   typeConverter.addSourceMaterialization(materializeCast);
   typeConverter.addTargetMaterialization(materializeCast);
   target.markUnknownOpDynamicallyLegal(
diff --git a/mlir/lib/IR/AsmPrinter.cpp b/mlir/lib/IR/AsmPrinter.cpp
index 6fe9650..c603db4 100644
--- a/mlir/lib/IR/AsmPrinter.cpp
+++ b/mlir/lib/IR/AsmPrinter.cpp
@@ -284,22 +284,29 @@ OpPrintingFlags &OpPrintingFlags::skipRegions(bool skip) {
 }
 
 /// Do not verify the operation when using custom operation printers.
-OpPrintingFlags &OpPrintingFlags::assumeVerified() {
-  assumeVerifiedFlag = true;
+OpPrintingFlags &OpPrintingFlags::assumeVerified(bool enable) {
+  assumeVerifiedFlag = enable;
   return *this;
 }
 
 /// Use local scope when printing the operation. This allows for using the
 /// printer in a more localized and thread-safe setting, but may not necessarily
 /// be identical of what the IR will look like when dumping the full module.
-OpPrintingFlags &OpPrintingFlags::useLocalScope() {
-  printLocalScope = true;
+OpPrintingFlags &OpPrintingFlags::useLocalScope(bool enable) {
+  printLocalScope = enable;
   return *this;
 }
 
 /// Print users of values as comments.
-OpPrintingFlags &OpPrintingFlags::printValueUsers() {
-  printValueUsersFlag = true;
+OpPrintingFlags &OpPrintingFlags::printValueUsers(bool enable) {
+  printValueUsersFlag = enable;
+  return *this;
+}
+
+/// Print unique SSA ID numbers for values, block arguments and naming conflicts
+/// across all regions
+OpPrintingFlags &OpPrintingFlags::printUniqueSSAIDs(bool enable) {
+  printUniqueSSAIDsFlag = enable;
   return *this;
 }
 
diff --git a/mlir/lib/IR/Dominance.cpp b/mlir/lib/IR/Dominance.cpp
index 406e0f2..1c54e09d 100644
--- a/mlir/lib/IR/Dominance.cpp
+++ b/mlir/lib/IR/Dominance.cpp
@@ -213,61 +213,73 @@ DominanceInfoBase<IsPostDom>::findNearestCommonDominator(Block *a,
   return getDomTree(a->getParent()).findNearestCommonDominator(a, b);
 }
 
-/// Return true if the specified block A properly dominates block B.
-template <bool IsPostDom>
-bool DominanceInfoBase<IsPostDom>::properlyDominatesImpl(Block *a,
-                                                         Block *b) const {
-  assert(a && b && "null blocks not allowed");
+/// Returns the given block iterator if it lies within the region region.
+/// Otherwise, otherwise finds the ancestor of the given block iterator that
+/// lies within the given region. Returns and "empty" iterator if the latter
+/// fails.
+///
+/// Note: This is a variant of Region::findAncestorOpInRegion that operates on
+/// block iterators instead of ops.
+static std::pair<Block *, Block::iterator>
+findAncestorIteratorInRegion(Region *r, Block *b, Block::iterator it) {
+  // Case 1: The iterator lies within the region region.
+  if (b->getParent() == r)
+    return std::make_pair(b, it);
+
+  // Otherwise: Find ancestor iterator. Bail if we run out of parent ops.
+  Operation *parentOp = b->getParentOp();
+  if (!parentOp)
+    return std::make_pair(static_cast<Block *>(nullptr), Block::iterator());
+  Operation *op = r->findAncestorOpInRegion(*parentOp);
+  if (!op)
+    return std::make_pair(static_cast<Block *>(nullptr), Block::iterator());
+  return std::make_pair(op->getBlock(), op->getIterator());
+}
 
-  // A block dominates, but does not properly dominate, itself unless this
-  // is a graph region.
+/// Given two iterators into the same block, return "true" if `a` is before `b.
+/// Note: This is a variant of Operation::isBeforeInBlock that operates on
+/// block iterators instead of ops.
+static bool isBeforeInBlock(Block *block, Block::iterator a,
+                            Block::iterator b) {
   if (a == b)
-    return !hasSSADominance(a);
-
-  // If both blocks are not in the same region, `a` properly dominates `b` if
-  // `b` is defined in an operation region that (recursively) ends up being
-  // dominated by `a`. Walk up the list of containers enclosing B.
-  Region *regionA = a->getParent();
-  if (regionA != b->getParent()) {
-    b = regionA ? regionA->findAncestorBlockInRegion(*b) : nullptr;
-    // If we could not find a valid block b then it is a not a dominator.
-    if (!b)
-      return false;
-
-    // Check to see if the ancestor of `b` is the same block as `a`.  A properly
-    // dominates B if it contains an op that contains the B block.
-    if (a == b)
-      return true;
-  }
-
-  // Otherwise, they are two different blocks in the same region, use DomTree.
-  return getDomTree(regionA).properlyDominates(a, b);
+    return false;
+  if (a == block->end())
+    return false;
+  if (b == block->end())
+    return true;
+  return a->isBeforeInBlock(&*b);
 }
 
 template <bool IsPostDom>
 bool DominanceInfoBase<IsPostDom>::properlyDominatesImpl(
-    Operation *a, Operation *b, bool enclosingOpOk) const {
-  Block *aBlock = a->getBlock(), *bBlock = b->getBlock();
-  assert(aBlock && bBlock && "operations must be in a block");
+    Block *aBlock, Block::iterator aIt, Block *bBlock, Block::iterator bIt,
+    bool enclosingOk) const {
+  assert(aBlock && bBlock && "expected non-null blocks");
 
-  // An operation (pos)dominates, but does not properly (pos)dominate, itself
-  // unless this is a graph region.
-  if (a == b)
+  // A block iterator (post)dominates, but does not properly (post)dominate,
+  // itself unless this is a graph region.
+  if (aBlock == bBlock && aIt == bIt)
     return !hasSSADominance(aBlock);
 
-  // If these ops are in different regions, then normalize one into the other.
+  // If the iterators are in different regions, then normalize one into the
+  // other.
   Region *aRegion = aBlock->getParent();
   if (aRegion != bBlock->getParent()) {
-    // Scoot up b's region tree until we find an operation in A's region that
+    // Scoot up b's region tree until we find a location in A's region that
     // encloses it.  If this fails, then we know there is no (post)dom relation.
-    b = aRegion ? aRegion->findAncestorOpInRegion(*b) : nullptr;
-    if (!b)
+    if (!aRegion) {
+      bBlock = nullptr;
+      bIt = Block::iterator();
+    } else {
+      std::tie(bBlock, bIt) =
+          findAncestorIteratorInRegion(aRegion, bBlock, bIt);
+    }
+    if (!bBlock)
       return false;
-    bBlock = b->getBlock();
-    assert(bBlock->getParent() == aRegion);
+    assert(bBlock->getParent() == aRegion && "expected block in regionA");
 
     // If 'a' encloses 'b', then we consider it to (post)dominate.
-    if (a == b && enclosingOpOk)
+    if (aBlock == bBlock && aIt == bIt && enclosingOk)
       return true;
   }
 
@@ -279,9 +291,9 @@ bool DominanceInfoBase<IsPostDom>::properlyDominatesImpl(
     if (!hasSSADominance(aBlock))
       return true;
     if constexpr (IsPostDom) {
-      return b->isBeforeInBlock(a);
+      return isBeforeInBlock(aBlock, bIt, aIt);
     } else {
-      return a->isBeforeInBlock(b);
+      return isBeforeInBlock(aBlock, aIt, bIt);
     }
   }
 
@@ -309,6 +321,18 @@ template class detail::DominanceInfoBase</*IsPostDom=*/false>;
 // DominanceInfo
 //===----------------------------------------------------------------------===//
 
+bool DominanceInfo::properlyDominates(Operation *a, Operation *b,
+                                      bool enclosingOpOk) const {
+  return super::properlyDominatesImpl(a->getBlock(), a->getIterator(),
+                                      b->getBlock(), b->getIterator(),
+                                      enclosingOpOk);
+}
+
+bool DominanceInfo::properlyDominates(Block *a, Block *b) const {
+  return super::properlyDominatesImpl(a, a->begin(), b, b->begin(),
+                                      /*enclosingOk=*/true);
+}
+
 /// Return true if the `a` value properly dominates operation `b`, i.e if the
 /// operation that defines `a` properlyDominates `b` and the operation that
 /// defines `a` does not contain `b`.
@@ -322,3 +346,19 @@ bool DominanceInfo::properlyDominates(Value a, Operation *b) const {
   // `b`, but `a` does not itself enclose `b` in one of its regions.
   return properlyDominates(a.getDefiningOp(), b, /*enclosingOpOk=*/false);
 }
+
+//===----------------------------------------------------------------------===//
+// PostDominanceInfo
+//===----------------------------------------------------------------------===//
+
+bool PostDominanceInfo::properlyPostDominates(Operation *a, Operation *b,
+                                              bool enclosingOpOk) const {
+  return super::properlyDominatesImpl(a->getBlock(), a->getIterator(),
+                                      b->getBlock(), b->getIterator(),
+                                      enclosingOpOk);
+}
+
+bool PostDominanceInfo::properlyPostDominates(Block *a, Block *b) const {
+  return super::properlyDominatesImpl(a, a->end(), b, b->end(),
+                                      /*enclosingOk=*/true);
+}
diff --git a/mlir/lib/Interfaces/Utils/InferIntRangeCommon.cpp b/mlir/lib/Interfaces/Utils/InferIntRangeCommon.cpp
index 7a73a94..1eab413 100644
--- a/mlir/lib/Interfaces/Utils/InferIntRangeCommon.cpp
+++ b/mlir/lib/Interfaces/Utils/InferIntRangeCommon.cpp
@@ -386,7 +386,15 @@ mlir::intrange::inferCeilDivS(ArrayRef<ConstantIntRanges> argRanges) {
     }
     return result;
   };
-  return inferDivSRange(lhs, rhs, ceilDivSIFix);
+  ConstantIntRanges result = inferDivSRange(lhs, rhs, ceilDivSIFix);
+  if (lhs.smin().isMinSignedValue() && lhs.smax().sgt(lhs.smin())) {
+    // If lhs range includes INT_MIN and lhs is not a single value, we can
+    // suddenly wrap to positive val, skipping entire negative range, add
+    // [INT_MIN + 1, smax()] range to the result to handle this.
+    auto newLhs = ConstantIntRanges::fromSigned(lhs.smin() + 1, lhs.smax());
+    result = result.rangeUnion(inferDivSRange(newLhs, rhs, ceilDivSIFix));
+  }
+  return result;
 }
 
 ConstantIntRanges
diff --git a/mlir/lib/Target/LLVMIR/Dialect/NVVM/NVVMToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/NVVM/NVVMToLLVMIRTranslation.cpp
index cf58bc5..659ab12 100644
--- a/mlir/lib/Target/LLVMIR/Dialect/NVVM/NVVMToLLVMIRTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/Dialect/NVVM/NVVMToLLVMIRTranslation.cpp
@@ -237,15 +237,7 @@ public:
       generateMetadata(value.getInt(), "maxnreg");
     } else if (attribute.getName() ==
                NVVM::NVVMDialect::getKernelFuncAttrName()) {
-      llvm::Metadata *llvmMetadataKernel[] = {
-          llvm::ValueAsMetadata::get(llvmFunc),
-          llvm::MDString::get(llvmContext, "kernel"),
-          llvm::ValueAsMetadata::get(
-              llvm::ConstantInt::get(llvm::Type::getInt32Ty(llvmContext), 1))};
-      llvm::MDNode *llvmMetadataNode =
-          llvm::MDNode::get(llvmContext, llvmMetadataKernel);
-      moduleTranslation.getOrInsertNamedModuleMetadata("nvvm.annotations")
-          ->addOperand(llvmMetadataNode);
+      llvmFunc->setCallingConv(llvm::CallingConv::PTX_Kernel);
     }
     return success();
   }
diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
index 060113c..87cb7f0 100644
--- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
@@ -150,10 +150,6 @@ static LogicalResult checkImplementationStatus(Operation &op) {
                           << " operation";
   };
 
-  auto checkAligned = [&todo](auto op, LogicalResult &result) {
-    if (!op.getAlignedVars().empty() || op.getAlignments())
-      result = todo("aligned");
-  };
   auto checkAllocate = [&todo](auto op, LogicalResult &result) {
     if (!op.getAllocateVars().empty() || !op.getAllocatorVars().empty())
       result = todo("allocate");
@@ -275,7 +271,6 @@ static LogicalResult checkImplementationStatus(Operation &op) {
       })
       .Case([&](omp::ParallelOp op) { checkAllocate(op, result); })
       .Case([&](omp::SimdOp op) {
-        checkAligned(op, result);
         checkLinear(op, result);
         checkNontemporal(op, result);
         checkPrivate(op, result);
@@ -1701,6 +1696,12 @@ buildDependData(std::optional<ArrayAttr> dependKinds, OperandRange dependVars,
     case mlir::omp::ClauseTaskDepend::taskdependinout:
       type = llvm::omp::RTLDependenceKindTy::DepInOut;
       break;
+    case mlir::omp::ClauseTaskDepend::taskdependmutexinoutset:
+      type = llvm::omp::RTLDependenceKindTy::DepMutexInOutSet;
+      break;
+    case mlir::omp::ClauseTaskDepend::taskdependinoutset:
+      type = llvm::omp::RTLDependenceKindTy::DepInOutSet;
+      break;
     };
     llvm::Value *depVal = moduleTranslation.lookupValue(std::get<0>(dep));
     llvm::OpenMPIRBuilder::DependData dd(type, depVal->getType(), depVal);
@@ -2296,6 +2297,24 @@ convertOmpSimd(Operation &opInst, llvm::IRBuilderBase &builder,
 
   llvm::MapVector<llvm::Value *, llvm::Value *> alignedVars;
   llvm::omp::OrderKind order = convertOrderKind(simdOp.getOrder());
+  llvm::BasicBlock *sourceBlock = builder.GetInsertBlock();
+  std::optional<ArrayAttr> alignmentValues = simdOp.getAlignments();
+  mlir::OperandRange operands = simdOp.getAlignedVars();
+  for (size_t i = 0; i < operands.size(); ++i) {
+    llvm::Value *alignment = nullptr;
+    llvm::Value *llvmVal = moduleTranslation.lookupValue(operands[i]);
+    llvm::Type *ty = llvmVal->getType();
+    if (auto intAttr = llvm::dyn_cast<IntegerAttr>((*alignmentValues)[i])) {
+      alignment = builder.getInt64(intAttr.getInt());
+      assert(ty->isPointerTy() && "Invalid type for aligned variable");
+      assert(alignment && "Invalid alignment value");
+      auto curInsert = builder.saveIP();
+      builder.SetInsertPoint(sourceBlock->getTerminator());
+      llvmVal = builder.CreateLoad(ty, llvmVal);
+      builder.restoreIP(curInsert);
+      alignedVars[llvmVal] = alignment;
+    }
+  }
   ompBuilder->applySimd(loopInfo, alignedVars,
                         simdOp.getIfExpr()
                             ? moduleTranslation.lookupValue(simdOp.getIfExpr())
@@ -2569,6 +2588,7 @@ static LogicalResult
 convertOmpThreadprivate(Operation &opInst, llvm::IRBuilderBase &builder,
                         LLVM::ModuleTranslation &moduleTranslation) {
   llvm::OpenMPIRBuilder::LocationDescription ompLoc(builder);
+  llvm::OpenMPIRBuilder *ompBuilder = moduleTranslation.getOpenMPBuilder();
   auto threadprivateOp = cast<omp::ThreadprivateOp>(opInst);
 
   if (failed(checkImplementationStatus(opInst)))
@@ -2576,6 +2596,10 @@ convertOmpThreadprivate(Operation &opInst, llvm::IRBuilderBase &builder,
 
   Value symAddr = threadprivateOp.getSymAddr();
   auto *symOp = symAddr.getDefiningOp();
+
+  if (auto asCast = dyn_cast<LLVM::AddrSpaceCastOp>(symOp))
+    symOp = asCast.getOperand().getDefiningOp();
+
   if (!isa<LLVM::AddressOfOp>(symOp))
     return opInst.emitError("Addressing symbol not found");
   LLVM::AddressOfOp addressOfOp = dyn_cast<LLVM::AddressOfOp>(symOp);
@@ -2583,17 +2607,20 @@ convertOmpThreadprivate(Operation &opInst, llvm::IRBuilderBase &builder,
   LLVM::GlobalOp global =
       addressOfOp.getGlobal(moduleTranslation.symbolTable());
   llvm::GlobalValue *globalValue = moduleTranslation.lookupGlobal(global);
-  llvm::Type *type = globalValue->getValueType();
-  llvm::TypeSize typeSize =
-      builder.GetInsertBlock()->getModule()->getDataLayout().getTypeStoreSize(
-          type);
-  llvm::ConstantInt *size = builder.getInt64(typeSize.getFixedValue());
-  llvm::StringRef suffix = llvm::StringRef(".cache", 6);
-  std::string cacheName = (Twine(global.getSymName()).concat(suffix)).str();
-  llvm::Value *callInst =
-      moduleTranslation.getOpenMPBuilder()->createCachedThreadPrivate(
-          ompLoc, globalValue, size, cacheName);
-  moduleTranslation.mapValue(opInst.getResult(0), callInst);
+
+  if (!ompBuilder->Config.isTargetDevice()) {
+    llvm::Type *type = globalValue->getValueType();
+    llvm::TypeSize typeSize =
+        builder.GetInsertBlock()->getModule()->getDataLayout().getTypeStoreSize(
+            type);
+    llvm::ConstantInt *size = builder.getInt64(typeSize.getFixedValue());
+    llvm::Value *callInst = ompBuilder->createCachedThreadPrivate(
+        ompLoc, globalValue, size, global.getSymName() + ".cache");
+    moduleTranslation.mapValue(opInst.getResult(0), callInst);
+  } else {
+    moduleTranslation.mapValue(opInst.getResult(0), globalValue);
+  }
+
   return success();
 }
 
@@ -4193,6 +4220,14 @@ static bool isTargetDeviceOp(Operation *op) {
   if (op->getParentOfType<omp::TargetOp>())
     return true;
 
+  // Certain operations return results, and whether utilised in host or
+  // target there is a chance an LLVM Dialect operation depends on it
+  // by taking it in as an operand, so we must always lower these in
+  // some manner or result in an ICE (whether they end up in a no-op
+  // or otherwise).
+  if (mlir::isa<omp::ThreadprivateOp>(op))
+    return true;
+
   if (auto parentFn = op->getParentOfType<LLVM::LLVMFuncOp>())
     if (auto declareTargetIface =
             llvm::dyn_cast<mlir::omp::DeclareTargetInterface>(
diff --git a/mlir/lib/Target/LLVMIR/ModuleImport.cpp b/mlir/lib/Target/LLVMIR/ModuleImport.cpp
index b0d5e63..2d8d774 100644
--- a/mlir/lib/Target/LLVMIR/ModuleImport.cpp
+++ b/mlir/lib/Target/LLVMIR/ModuleImport.cpp
@@ -427,19 +427,33 @@ ModuleImport::processAliasScopeMetadata(const llvm::MDNode *node) {
     return node->getNumOperands() != 0 &&
            node == dyn_cast<llvm::MDNode>(node->getOperand(0));
   };
+  auto verifySelfRefOrString = [](const llvm::MDNode *node) {
+    return node->getNumOperands() != 0 &&
+           (node == dyn_cast<llvm::MDNode>(node->getOperand(0)) ||
+            isa<llvm::MDString>(node->getOperand(0)));
+  };
   // Helper that verifies the given operand is a string or does not exist.
   auto verifyDescription = [](const llvm::MDNode *node, unsigned idx) {
     return idx >= node->getNumOperands() ||
            isa<llvm::MDString>(node->getOperand(idx));
   };
+
+  auto getIdAttr = [&](const llvm::MDNode *node) -> Attribute {
+    if (verifySelfRef(node))
+      return DistinctAttr::create(builder.getUnitAttr());
+
+    auto name = cast<llvm::MDString>(node->getOperand(0));
+    return builder.getStringAttr(name->getString());
+  };
+
   // Helper that creates an alias scope domain attribute.
   auto createAliasScopeDomainOp = [&](const llvm::MDNode *aliasDomain) {
     StringAttr description = nullptr;
     if (aliasDomain->getNumOperands() >= 2)
       if (auto *operand = dyn_cast<llvm::MDString>(aliasDomain->getOperand(1)))
         description = builder.getStringAttr(operand->getString());
-    return builder.getAttr<AliasScopeDomainAttr>(
-        DistinctAttr::create(builder.getUnitAttr()), description);
+    Attribute idAttr = getIdAttr(aliasDomain);
+    return builder.getAttr<AliasScopeDomainAttr>(idAttr, description);
   };
 
   // Collect the alias scopes and domains to translate them.
@@ -452,10 +466,11 @@ ModuleImport::processAliasScopeMetadata(const llvm::MDNode *node) {
       // verifying its domain. Perform the verification before looking it up in
       // the alias scope mapping since it could have been inserted as a domain
       // node before.
-      if (!verifySelfRef(scope) || !domain || !verifyDescription(scope, 2))
+      if (!verifySelfRefOrString(scope) || !domain ||
+          !verifyDescription(scope, 2))
         return emitError(loc) << "unsupported alias scope node: "
                               << diagMD(scope, llvmModule.get());
-      if (!verifySelfRef(domain) || !verifyDescription(domain, 1))
+      if (!verifySelfRefOrString(domain) || !verifyDescription(domain, 1))
         return emitError(loc) << "unsupported alias domain node: "
                               << diagMD(domain, llvmModule.get());
 
@@ -473,9 +488,10 @@ ModuleImport::processAliasScopeMetadata(const llvm::MDNode *node) {
       StringAttr description = nullptr;
       if (!aliasScope.getName().empty())
         description = builder.getStringAttr(aliasScope.getName());
+      Attribute idAttr = getIdAttr(scope);
       auto aliasScopeOp = builder.getAttr<AliasScopeAttr>(
-          DistinctAttr::create(builder.getUnitAttr()),
-          cast<AliasScopeDomainAttr>(it->second), description);
+          idAttr, cast<AliasScopeDomainAttr>(it->second), description);
+
       aliasScopeMapping.try_emplace(aliasScope.getNode(), aliasScopeOp);
     }
   }
@@ -1473,18 +1489,20 @@ ModuleImport::convertBranchArgs(llvm::Instruction *branch,
   return success();
 }
 
-LogicalResult
-ModuleImport::convertCallTypeAndOperands(llvm::CallBase *callInst,
-                                         SmallVectorImpl<Type> &types,
-                                         SmallVectorImpl<Value> &operands) {
+LogicalResult ModuleImport::convertCallTypeAndOperands(
+    llvm::CallBase *callInst, SmallVectorImpl<Type> &types,
+    SmallVectorImpl<Value> &operands, bool allowInlineAsm) {
   if (!callInst->getType()->isVoidTy())
     types.push_back(convertType(callInst->getType()));
 
   if (!callInst->getCalledFunction()) {
-    FailureOr<Value> called = convertValue(callInst->getCalledOperand());
-    if (failed(called))
-      return failure();
-    operands.push_back(*called);
+    if (!allowInlineAsm ||
+        !isa<llvm::InlineAsm>(callInst->getCalledOperand())) {
+      FailureOr<Value> called = convertValue(callInst->getCalledOperand());
+      if (failed(called))
+        return failure();
+      operands.push_back(*called);
+    }
   }
   SmallVector<llvm::Value *> args(callInst->args());
   FailureOr<SmallVector<Value>> arguments = convertValues(args);
@@ -1579,7 +1597,8 @@ LogicalResult ModuleImport::convertInstruction(llvm::Instruction *inst) {
 
     SmallVector<Type> types;
     SmallVector<Value> operands;
-    if (failed(convertCallTypeAndOperands(callInst, types, operands)))
+    if (failed(convertCallTypeAndOperands(callInst, types, operands,
+                                          /*allowInlineAsm=*/true)))
       return failure();
 
     auto funcTy =
@@ -1587,45 +1606,59 @@ LogicalResult ModuleImport::convertInstruction(llvm::Instruction *inst) {
     if (!funcTy)
       return failure();
 
-    CallOp callOp;
-
-    if (llvm::Function *callee = callInst->getCalledFunction()) {
-      callOp = builder.create<CallOp>(
-          loc, funcTy, SymbolRefAttr::get(context, callee->getName()),
-          operands);
+    if (auto asmI = dyn_cast<llvm::InlineAsm>(callInst->getCalledOperand())) {
+      auto callOp = builder.create<InlineAsmOp>(
+          loc, funcTy.getReturnType(), operands,
+          builder.getStringAttr(asmI->getAsmString()),
+          builder.getStringAttr(asmI->getConstraintString()),
+          /*has_side_effects=*/true,
+          /*is_align_stack=*/false, /*asm_dialect=*/nullptr,
+          /*operand_attrs=*/nullptr);
+      if (!callInst->getType()->isVoidTy())
+        mapValue(inst, callOp.getResult(0));
+      else
+        mapNoResultOp(inst, callOp);
     } else {
-      callOp = builder.create<CallOp>(loc, funcTy, operands);
+      CallOp callOp;
+
+      if (llvm::Function *callee = callInst->getCalledFunction()) {
+        callOp = builder.create<CallOp>(
+            loc, funcTy, SymbolRefAttr::get(context, callee->getName()),
+            operands);
+      } else {
+        callOp = builder.create<CallOp>(loc, funcTy, operands);
+      }
+      callOp.setCConv(convertCConvFromLLVM(callInst->getCallingConv()));
+      callOp.setTailCallKind(
+          convertTailCallKindFromLLVM(callInst->getTailCallKind()));
+      setFastmathFlagsAttr(inst, callOp);
+
+      // Handle function attributes.
+      if (callInst->hasFnAttr(llvm::Attribute::Convergent))
+        callOp.setConvergent(true);
+      if (callInst->hasFnAttr(llvm::Attribute::NoUnwind))
+        callOp.setNoUnwind(true);
+      if (callInst->hasFnAttr(llvm::Attribute::WillReturn))
+        callOp.setWillReturn(true);
+
+      llvm::MemoryEffects memEffects = callInst->getMemoryEffects();
+      ModRefInfo othermem = convertModRefInfoFromLLVM(
+          memEffects.getModRef(llvm::MemoryEffects::Location::Other));
+      ModRefInfo argMem = convertModRefInfoFromLLVM(
+          memEffects.getModRef(llvm::MemoryEffects::Location::ArgMem));
+      ModRefInfo inaccessibleMem = convertModRefInfoFromLLVM(
+          memEffects.getModRef(llvm::MemoryEffects::Location::InaccessibleMem));
+      auto memAttr = MemoryEffectsAttr::get(callOp.getContext(), othermem,
+                                            argMem, inaccessibleMem);
+      // Only set the attribute when it does not match the default value.
+      if (!memAttr.isReadWrite())
+        callOp.setMemoryEffectsAttr(memAttr);
+
+      if (!callInst->getType()->isVoidTy())
+        mapValue(inst, callOp.getResult());
+      else
+        mapNoResultOp(inst, callOp);
     }
-    callOp.setCConv(convertCConvFromLLVM(callInst->getCallingConv()));
-    callOp.setTailCallKind(
-        convertTailCallKindFromLLVM(callInst->getTailCallKind()));
-    setFastmathFlagsAttr(inst, callOp);
-
-    // Handle function attributes.
-    if (callInst->hasFnAttr(llvm::Attribute::Convergent))
-      callOp.setConvergent(true);
-    if (callInst->hasFnAttr(llvm::Attribute::NoUnwind))
-      callOp.setNoUnwind(true);
-    if (callInst->hasFnAttr(llvm::Attribute::WillReturn))
-      callOp.setWillReturn(true);
-
-    llvm::MemoryEffects memEffects = callInst->getMemoryEffects();
-    ModRefInfo othermem = convertModRefInfoFromLLVM(
-        memEffects.getModRef(llvm::MemoryEffects::Location::Other));
-    ModRefInfo argMem = convertModRefInfoFromLLVM(
-        memEffects.getModRef(llvm::MemoryEffects::Location::ArgMem));
-    ModRefInfo inaccessibleMem = convertModRefInfoFromLLVM(
-        memEffects.getModRef(llvm::MemoryEffects::Location::InaccessibleMem));
-    auto memAttr = MemoryEffectsAttr::get(callOp.getContext(), othermem, argMem,
-                                          inaccessibleMem);
-    // Only set the attribute when it does not match the default value.
-    if (!memAttr.isReadWrite())
-      callOp.setMemoryEffectsAttr(memAttr);
-
-    if (!callInst->getType()->isVoidTy())
-      mapValue(inst, callOp.getResult());
-    else
-      mapNoResultOp(inst, callOp);
     return success();
   }
   if (inst->getOpcode() == llvm::Instruction::LandingPad) {
diff --git a/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp b/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp
index ad62ae0..4367100 100644
--- a/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp
@@ -1724,25 +1724,36 @@ ModuleTranslation::getOrCreateAliasScope(AliasScopeAttr aliasScopeAttr) {
       aliasScopeAttr.getDomain(), nullptr);
   if (insertedDomain) {
     llvm::SmallVector<llvm::Metadata *, 2> operands;
-    // Placeholder for self-reference.
+    // Placeholder for potential self-reference.
     operands.push_back(dummy.get());
     if (StringAttr description = aliasScopeAttr.getDomain().getDescription())
       operands.push_back(llvm::MDString::get(ctx, description));
     domainIt->second = llvm::MDNode::get(ctx, operands);
     // Self-reference for uniqueness.
-    domainIt->second->replaceOperandWith(0, domainIt->second);
+    llvm::Metadata *replacement;
+    if (auto stringAttr =
+            dyn_cast<StringAttr>(aliasScopeAttr.getDomain().getId()))
+      replacement = llvm::MDString::get(ctx, stringAttr.getValue());
+    else
+      replacement = domainIt->second;
+    domainIt->second->replaceOperandWith(0, replacement);
   }
   // Convert the scope metadata node.
   assert(domainIt->second && "Scope's domain should already be valid");
   llvm::SmallVector<llvm::Metadata *, 3> operands;
-  // Placeholder for self-reference.
+  // Placeholder for potential self-reference.
   operands.push_back(dummy.get());
   operands.push_back(domainIt->second);
   if (StringAttr description = aliasScopeAttr.getDescription())
     operands.push_back(llvm::MDString::get(ctx, description));
   scopeIt->second = llvm::MDNode::get(ctx, operands);
   // Self-reference for uniqueness.
-  scopeIt->second->replaceOperandWith(0, scopeIt->second);
+  llvm::Metadata *replacement;
+  if (auto stringAttr = dyn_cast<StringAttr>(aliasScopeAttr.getId()))
+    replacement = llvm::MDString::get(ctx, stringAttr.getValue());
+  else
+    replacement = scopeIt->second;
+  scopeIt->second->replaceOperandWith(0, replacement);
   return scopeIt->second;
 }
 
diff --git a/mlir/lib/Transforms/LocationSnapshot.cpp b/mlir/lib/Transforms/LocationSnapshot.cpp
index b85850a..f701c8b 100644
--- a/mlir/lib/Transforms/LocationSnapshot.cpp
+++ b/mlir/lib/Transforms/LocationSnapshot.cpp
@@ -10,6 +10,7 @@
 
 #include "mlir/IR/AsmState.h"
 #include "mlir/IR/Builders.h"
+#include "mlir/IR/OperationSupport.h"
 #include "mlir/Pass/Pass.h"
 #include "mlir/Support/FileUtilities.h"
 #include "llvm/Support/FileSystem.h"
@@ -131,29 +132,23 @@ LogicalResult mlir::generateLocationsFromIR(StringRef fileName, StringRef tag,
 namespace {
 struct LocationSnapshotPass
     : public impl::LocationSnapshotBase<LocationSnapshotPass> {
-  LocationSnapshotPass() = default;
-  LocationSnapshotPass(OpPrintingFlags flags, StringRef fileName, StringRef tag)
-      : flags(flags) {
-    this->fileName = fileName.str();
-    this->tag = tag.str();
-  }
+  using impl::LocationSnapshotBase<LocationSnapshotPass>::LocationSnapshotBase;
 
   void runOnOperation() override {
     Operation *op = getOperation();
-    if (failed(generateLocationsFromIR(fileName, op, OpPrintingFlags(), tag)))
+    if (failed(generateLocationsFromIR(fileName, op, getFlags(), tag)))
       return signalPassFailure();
   }
 
-  /// The printing flags to use when creating the snapshot.
-  OpPrintingFlags flags;
+private:
+  /// build the flags from the command line arguments to the pass
+  OpPrintingFlags getFlags() {
+    OpPrintingFlags flags;
+    flags.enableDebugInfo(enableDebugInfo, printPrettyDebugInfo);
+    flags.printGenericOpForm(printGenericOpForm);
+    if (useLocalScope)
+      flags.useLocalScope();
+    return flags;
+  }
 };
 } // namespace
-
-std::unique_ptr<Pass> mlir::createLocationSnapshotPass(OpPrintingFlags flags,
-                                                       StringRef fileName,
-                                                       StringRef tag) {
-  return std::make_unique<LocationSnapshotPass>(flags, fileName, tag);
-}
-std::unique_ptr<Pass> mlir::createLocationSnapshotPass() {
-  return std::make_unique<LocationSnapshotPass>();
-}
diff --git a/mlir/lib/Transforms/Utils/DialectConversion.cpp b/mlir/lib/Transforms/Utils/DialectConversion.cpp
index 255b0ba..403321d 100644
--- a/mlir/lib/Transforms/Utils/DialectConversion.cpp
+++ b/mlir/lib/Transforms/Utils/DialectConversion.cpp
@@ -11,6 +11,7 @@
 #include "mlir/IR/Block.h"
 #include "mlir/IR/Builders.h"
 #include "mlir/IR/BuiltinOps.h"
+#include "mlir/IR/Dominance.h"
 #include "mlir/IR/IRMapping.h"
 #include "mlir/IR/Iterators.h"
 #include "mlir/Interfaces/FunctionInterfaces.h"
@@ -63,11 +64,55 @@ static OpBuilder::InsertPoint computeInsertPoint(Value value) {
   return OpBuilder::InsertPoint(insertBlock, insertPt);
 }
 
+/// Helper function that computes an insertion point where the given values are
+/// defined and can be used without a dominance violation.
+static OpBuilder::InsertPoint computeInsertPoint(ArrayRef<Value> vals) {
+  assert(!vals.empty() && "expected at least one value");
+  DominanceInfo domInfo;
+  OpBuilder::InsertPoint pt = computeInsertPoint(vals.front());
+  for (Value v : vals.drop_front()) {
+    // Choose the "later" insertion point.
+    OpBuilder::InsertPoint nextPt = computeInsertPoint(v);
+    if (domInfo.dominates(pt.getBlock(), pt.getPoint(), nextPt.getBlock(),
+                          nextPt.getPoint())) {
+      // pt is before nextPt => choose nextPt.
+      pt = nextPt;
+    } else {
+#ifndef NDEBUG
+      // nextPt should be before pt => choose pt.
+      // If pt, nextPt are no dominance relationship, then there is no valid
+      // insertion point at which all given values are defined.
+      bool dom = domInfo.dominates(nextPt.getBlock(), nextPt.getPoint(),
+                                   pt.getBlock(), pt.getPoint());
+      assert(dom && "unable to find valid insertion point");
+#endif // NDEBUG
+    }
+  }
+  return pt;
+}
+
 //===----------------------------------------------------------------------===//
 // ConversionValueMapping
 //===----------------------------------------------------------------------===//
 
+/// A vector of SSA values, optimized for the most common case of a single
+/// value.
+using ValueVector = SmallVector<Value, 1>;
+
 namespace {
+
+/// Helper class to make it possible to use `ValueVector` as a key in DenseMap.
+struct ValueVectorMapInfo {
+  static ValueVector getEmptyKey() { return ValueVector{Value()}; }
+  static ValueVector getTombstoneKey() { return ValueVector{Value(), Value()}; }
+  static ::llvm::hash_code getHashValue(const ValueVector &val) {
+    return ::llvm::hash_combine_range(val.begin(), val.end());
+  }
+  static bool isEqual(const ValueVector &LHS, const ValueVector &RHS) {
+    return LHS == RHS;
+  }
+};
+
 /// This class wraps a IRMapping to provide recursive lookup
 /// functionality, i.e. we will traverse if the mapped value also has a mapping.
 struct ConversionValueMapping {
@@ -75,68 +120,128 @@ struct ConversionValueMapping {
   /// false positives.
   bool isMappedTo(Value value) const { return mappedTo.contains(value); }
 
-  /// Lookup the most recently mapped value with the desired type in the
+  /// Lookup the most recently mapped values with the desired types in the
   /// mapping.
   ///
   /// Special cases:
-  /// - If the desired type is "null", simply return the most recently mapped
-  ///   value.
-  /// - If there is no mapping to the desired type, also return the most
-  ///   recently mapped value.
-  /// - If there is no mapping for the given value at all, return the given
+  /// - If the desired type range is empty, simply return the most recently
+  ///   mapped values.
+  /// - If there is no mapping to the desired types, also return the most
+  ///   recently mapped values.
+  /// - If there is no mapping for the given values at all, return the given
   ///   value.
-  Value lookupOrDefault(Value from, Type desiredType = nullptr) const;
+  ValueVector lookupOrDefault(Value from, TypeRange desiredTypes = {}) const;
 
-  /// Lookup a mapped value within the map, or return null if a mapping does not
-  /// exist. If a mapping exists, this follows the same behavior of
-  /// `lookupOrDefault`.
-  Value lookupOrNull(Value from, Type desiredType = nullptr) const;
+  /// Lookup the given value within the map, or return an empty vector if the
+  /// value is not mapped. If it is mapped, this follows the same behavior
+  /// as `lookupOrDefault`.
+  ValueVector lookupOrNull(Value from, TypeRange desiredTypes = {}) const;
 
-  /// Map a value to the one provided.
-  void map(Value oldVal, Value newVal) {
+  template <typename T>
+  struct IsValueVector : std::is_same<std::decay_t<T>, ValueVector> {};
+
+  /// Map a value vector to the one provided.
+  template <typename OldVal, typename NewVal>
+  std::enable_if_t<IsValueVector<OldVal>::value && IsValueVector<NewVal>::value>
+  map(OldVal &&oldVal, NewVal &&newVal) {
     LLVM_DEBUG({
-      for (Value it = newVal; it; it = mapping.lookupOrNull(it))
-        assert(it != oldVal && "inserting cyclic mapping");
+      ValueVector next(newVal);
+      while (true) {
+        assert(next != oldVal && "inserting cyclic mapping");
+        auto it = mapping.find(next);
+        if (it == mapping.end())
+          break;
+        next = it->second;
+      }
     });
-    mapping.map(oldVal, newVal);
-    mappedTo.insert(newVal);
+    for (Value v : newVal)
+      mappedTo.insert(v);
+
+    mapping[std::forward<OldVal>(oldVal)] = std::forward<NewVal>(newVal);
   }
 
-  /// Drop the last mapping for the given value.
-  void erase(Value value) { mapping.erase(value); }
+  /// Map a value vector or single value to the one provided.
+  template <typename OldVal, typename NewVal>
+  std::enable_if_t<!IsValueVector<OldVal>::value ||
+                   !IsValueVector<NewVal>::value>
+  map(OldVal &&oldVal, NewVal &&newVal) {
+    if constexpr (IsValueVector<OldVal>{}) {
+      map(std::forward<OldVal>(oldVal), ValueVector{newVal});
+    } else if constexpr (IsValueVector<NewVal>{}) {
+      map(ValueVector{oldVal}, std::forward<NewVal>(newVal));
+    } else {
+      map(ValueVector{oldVal}, ValueVector{newVal});
+    }
+  }
+
+  /// Drop the last mapping for the given values.
+  void erase(const ValueVector &value) { mapping.erase(value); }
 
 private:
   /// Current value mappings.
-  IRMapping mapping;
+  DenseMap<ValueVector, ValueVector, ValueVectorMapInfo> mapping;
 
   /// All SSA values that are mapped to. May contain false positives.
   DenseSet<Value> mappedTo;
 };
 } // namespace
 
-Value ConversionValueMapping::lookupOrDefault(Value from,
-                                              Type desiredType) const {
-  // Try to find the deepest value that has the desired type. If there is no
-  // such value, simply return the deepest value.
-  Value desiredValue;
+ValueVector
+ConversionValueMapping::lookupOrDefault(Value from,
+                                        TypeRange desiredTypes) const {
+  // Try to find the deepest values that have the desired types. If there is no
+  // such mapping, simply return the deepest values.
+  ValueVector desiredValue;
+  ValueVector current{from};
   do {
-    if (!desiredType || from.getType() == desiredType)
-      desiredValue = from;
+    // Store the current value if the types match.
+    if (TypeRange(ValueRange(current)) == desiredTypes)
+      desiredValue = current;
+
+    // If possible, Replace each value with (one or multiple) mapped values.
+    ValueVector next;
+    for (Value v : current) {
+      auto it = mapping.find({v});
+      if (it != mapping.end()) {
+        llvm::append_range(next, it->second);
+      } else {
+        next.push_back(v);
+      }
+    }
+    if (next != current) {
+      // If at least one value was replaced, continue the lookup from there.
+      current = std::move(next);
+      continue;
+    }
 
-    Value mappedValue = mapping.lookupOrNull(from);
-    if (!mappedValue)
+    // Otherwise: Check if there is a mapping for the entire vector. Such
+    // mappings are materializations. (N:M mapping are not supported for value
+    // replacements.)
+    //
+    // Note: From a correctness point of view, materializations do not have to
+    // be stored (and looked up) in the mapping. But for performance reasons,
+    // we choose to reuse existing IR (when possible) instead of creating it
+    // multiple times.
+    auto it = mapping.find(current);
+    if (it == mapping.end()) {
+      // No mapping found: The lookup stops here.
       break;
-    from = mappedValue;
+    }
+    current = it->second;
   } while (true);
 
-  // If the desired value was found use it, otherwise default to the leaf value.
-  return desiredValue ? desiredValue : from;
+  // If the desired values were found use them, otherwise default to the leaf
+  // values.
+  // Note: If `desiredTypes` is empty, this function always returns `current`.
+  return !desiredValue.empty() ? std::move(desiredValue) : std::move(current);
 }
 
-Value ConversionValueMapping::lookupOrNull(Value from, Type desiredType) const {
-  Value result = lookupOrDefault(from, desiredType);
-  if (result == from || (desiredType && result.getType() != desiredType))
-    return nullptr;
+ValueVector ConversionValueMapping::lookupOrNull(Value from,
+                                                 TypeRange desiredTypes) const {
+  ValueVector result = lookupOrDefault(from, desiredTypes);
+  if (result == ValueVector{from} ||
+      (!desiredTypes.empty() && TypeRange(ValueRange(result)) != desiredTypes))
+    return {};
   return result;
 }
 
@@ -651,10 +756,6 @@ public:
 
 /// The type of materialization.
 enum MaterializationKind {
-  /// This materialization materializes a conversion for an illegal block
-  /// argument type, to the original one.
-  Argument,
-
   /// This materialization materializes a conversion from an illegal type to a
   /// legal one.
   Target,
@@ -673,7 +774,7 @@ public:
                                    UnrealizedConversionCastOp op,
                                    const TypeConverter *converter,
                                    MaterializationKind kind, Type originalType,
-                                   Value mappedValue);
+                                   ValueVector mappedValues);
 
   static bool classof(const IRRewrite *rewrite) {
     return rewrite->getKind() == Kind::UnresolvedMaterialization;
@@ -708,9 +809,9 @@ private:
   /// materializations.
   Type originalType;
 
-  /// The value in the conversion value mapping that is being replaced by the
+  /// The values in the conversion value mapping that are being replaced by the
   /// results of this unresolved materialization.
-  Value mappedValue;
+  ValueVector mappedValues;
 };
 } // namespace
 
@@ -779,7 +880,7 @@ struct ConversionPatternRewriterImpl : public RewriterBase::Listener {
   LogicalResult remapValues(StringRef valueDiagTag,
                             std::optional<Location> inputLoc,
                             PatternRewriter &rewriter, ValueRange values,
-                            SmallVector<SmallVector<Value>> &remapped);
+                            SmallVector<ValueVector> &remapped);
 
   /// Return "true" if the given operation is ignored, and does not need to be
   /// converted.
@@ -820,39 +921,14 @@ struct ConversionPatternRewriterImpl : public RewriterBase::Listener {
   /// If a cast op was built, it can optionally be returned with the `castOp`
   /// output argument.
   ///
-  /// If `valueToMap` is set to a non-null Value, then that value is mapped to
+  /// If `valuesToMap` is set to a non-null Value, then that value is mapped to
   /// the results of the unresolved materialization in the conversion value
   /// mapping.
   ValueRange buildUnresolvedMaterialization(
       MaterializationKind kind, OpBuilder::InsertPoint ip, Location loc,
-      Value valueToMap, ValueRange inputs, TypeRange outputTypes,
+      ValueVector valuesToMap, ValueRange inputs, TypeRange outputTypes,
       Type originalType, const TypeConverter *converter,
       UnrealizedConversionCastOp *castOp = nullptr);
-  Value buildUnresolvedMaterialization(
-      MaterializationKind kind, OpBuilder::InsertPoint ip, Location loc,
-      Value valueToMap, ValueRange inputs, Type outputType, Type originalType,
-      const TypeConverter *converter,
-      UnrealizedConversionCastOp *castOp = nullptr) {
-    return buildUnresolvedMaterialization(kind, ip, loc, valueToMap, inputs,
-                                          TypeRange(outputType), originalType,
-                                          converter, castOp)
-        .front();
-  }
-
-  /// Build an N:1 materialization for the given original value that was
-  /// replaced with the given replacement values.
-  ///
-  /// This is a workaround around incomplete 1:N support in the dialect
-  /// conversion driver. The conversion mapping can store only 1:1 replacements
-  /// and the conversion patterns only support single Value replacements in the
-  /// adaptor, so N values must be converted back to a single value. This
-  /// function will be deleted when full 1:N support has been added.
-  ///
-  /// This function inserts an argument materialization back to the original
-  /// type.
-  void insertNTo1Materialization(OpBuilder::InsertPoint ip, Location loc,
-                                 ValueRange replacements, Value originalValue,
-                                 const TypeConverter *converter);
 
   /// Find a replacement value for the given SSA value in the conversion value
   /// mapping. The replacement value must have the same type as the given SSA
@@ -862,16 +938,6 @@ struct ConversionPatternRewriterImpl : public RewriterBase::Listener {
   Value findOrBuildReplacementValue(Value value,
                                     const TypeConverter *converter);
 
-  /// Unpack an N:1 materialization and return the inputs of the
-  /// materialization. This function unpacks only those materializations that
-  /// were built with `insertNTo1Materialization`.
-  ///
-  /// This is a workaround around incomplete 1:N support in the dialect
-  /// conversion driver. It allows us to write 1:N conversion patterns while
-  /// 1:N support is still missing in the conversion value mapping. This
-  /// function will be deleted when full 1:N support has been added.
-  SmallVector<Value> unpackNTo1Materialization(Value value);
-
   //===--------------------------------------------------------------------===//
   // Rewriter Notification Hooks
   //===--------------------------------------------------------------------===//
@@ -974,10 +1040,6 @@ struct ConversionPatternRewriterImpl : public RewriterBase::Listener {
   DenseMap<UnrealizedConversionCastOp, UnresolvedMaterializationRewrite *>
       unresolvedMaterializations;
 
-  /// A set of all N:1 materializations that were added to work around
-  /// incomplete 1:N support in the dialect conversion driver.
-  DenseSet<UnrealizedConversionCastOp> nTo1TempMaterializations;
-
   /// The current type converter, or nullptr if no type converter is currently
   /// active.
   const TypeConverter *currentTypeConverter = nullptr;
@@ -1041,7 +1103,7 @@ void ReplaceBlockArgRewrite::commit(RewriterBase &rewriter) {
   });
 }
 
-void ReplaceBlockArgRewrite::rollback() { rewriterImpl.mapping.erase(arg); }
+void ReplaceBlockArgRewrite::rollback() { rewriterImpl.mapping.erase({arg}); }
 
 void ReplaceOperationRewrite::commit(RewriterBase &rewriter) {
   auto *listener =
@@ -1082,7 +1144,7 @@ void ReplaceOperationRewrite::commit(RewriterBase &rewriter) {
 
 void ReplaceOperationRewrite::rollback() {
   for (auto result : op->getResults())
-    rewriterImpl.mapping.erase(result);
+    rewriterImpl.mapping.erase({result});
 }
 
 void ReplaceOperationRewrite::cleanup(RewriterBase &rewriter) {
@@ -1101,20 +1163,19 @@ void CreateOperationRewrite::rollback() {
 UnresolvedMaterializationRewrite::UnresolvedMaterializationRewrite(
     ConversionPatternRewriterImpl &rewriterImpl, UnrealizedConversionCastOp op,
     const TypeConverter *converter, MaterializationKind kind, Type originalType,
-    Value mappedValue)
+    ValueVector mappedValues)
     : OperationRewrite(Kind::UnresolvedMaterialization, rewriterImpl, op),
       converterAndKind(converter, kind), originalType(originalType),
-      mappedValue(mappedValue) {
+      mappedValues(std::move(mappedValues)) {
   assert((!originalType || kind == MaterializationKind::Target) &&
          "original type is valid only for target materializations");
   rewriterImpl.unresolvedMaterializations[op] = this;
 }
 
 void UnresolvedMaterializationRewrite::rollback() {
-  if (mappedValue)
-    rewriterImpl.mapping.erase(mappedValue);
+  if (!mappedValues.empty())
+    rewriterImpl.mapping.erase(mappedValues);
   rewriterImpl.unresolvedMaterializations.erase(getOperation());
-  rewriterImpl.nTo1TempMaterializations.erase(getOperation());
   op->erase();
 }
 
@@ -1160,7 +1221,7 @@ void ConversionPatternRewriterImpl::undoRewrites(unsigned numRewritesToKeep) {
 LogicalResult ConversionPatternRewriterImpl::remapValues(
     StringRef valueDiagTag, std::optional<Location> inputLoc,
     PatternRewriter &rewriter, ValueRange values,
-    SmallVector<SmallVector<Value>> &remapped) {
+    SmallVector<ValueVector> &remapped) {
   remapped.reserve(llvm::size(values));
 
   for (const auto &it : llvm::enumerate(values)) {
@@ -1168,18 +1229,11 @@ LogicalResult ConversionPatternRewriterImpl::remapValues(
     Type origType = operand.getType();
     Location operandLoc = inputLoc ? *inputLoc : operand.getLoc();
 
-    // Find the most recently mapped value. Unpack all temporary N:1
-    // materializations. Such conversions are a workaround around missing
-    // 1:N support in the ConversionValueMapping. (The conversion patterns
-    // already support 1:N replacements.)
-    Value repl = mapping.lookupOrDefault(operand);
-    SmallVector<Value> unpacked = unpackNTo1Materialization(repl);
-
     if (!currentTypeConverter) {
       // The current pattern does not have a type converter. I.e., it does not
       // distinguish between legal and illegal types. For each operand, simply
-      // pass through the most recently mapped value.
-      remapped.push_back(std::move(unpacked));
+      // pass through the most recently mapped values.
+      remapped.push_back(mapping.lookupOrDefault(operand));
       continue;
     }
 
@@ -1192,51 +1246,28 @@ LogicalResult ConversionPatternRewriterImpl::remapValues(
       });
       return failure();
     }
-
     // If a type is converted to 0 types, there is nothing to do.
     if (legalTypes.empty()) {
       remapped.push_back({});
       continue;
     }
 
-    if (legalTypes.size() != 1) {
-      // TODO: This is a 1:N conversion. The conversion value mapping does not
-      // store such materializations yet. If the types of the most recently
-      // mapped values do not match, build a target materialization.
-      ValueRange unpackedRange(unpacked);
-      if (TypeRange(unpackedRange) == legalTypes) {
-        remapped.push_back(std::move(unpacked));
-        continue;
-      }
-
-      // Insert a target materialization if the current pattern expects
-      // different legalized types.
-      ValueRange targetMat = buildUnresolvedMaterialization(
-          MaterializationKind::Target, computeInsertPoint(repl), operandLoc,
-          /*valueToMap=*/Value(), /*inputs=*/unpacked,
-          /*outputType=*/legalTypes, /*originalType=*/origType,
-          currentTypeConverter);
-      remapped.push_back(targetMat);
+    ValueVector repl = mapping.lookupOrDefault(operand, legalTypes);
+    if (!repl.empty() && TypeRange(ValueRange(repl)) == legalTypes) {
+      // Mapped values have the correct type or there is an existing
+      // materialization. Or the operand is not mapped at all and has the
+      // correct type.
+      remapped.push_back(std::move(repl));
       continue;
     }
 
-    // Handle 1->1 type conversions.
-    Type desiredType = legalTypes.front();
-    // Try to find a mapped value with the desired type. (Or the operand itself
-    // if the value is not mapped at all.)
-    Value newOperand = mapping.lookupOrDefault(operand, desiredType);
-    if (newOperand.getType() != desiredType) {
-      // If the looked up value's type does not have the desired type, it means
-      // that the value was replaced with a value of different type and no
-      // target materialization was created yet.
-      Value castValue = buildUnresolvedMaterialization(
-          MaterializationKind::Target, computeInsertPoint(newOperand),
-          operandLoc, /*valueToMap=*/newOperand, /*inputs=*/unpacked,
-          /*outputType=*/desiredType, /*originalType=*/origType,
-          currentTypeConverter);
-      newOperand = castValue;
-    }
-    remapped.push_back({newOperand});
+    // Create a materialization for the most recently mapped values.
+    repl = mapping.lookupOrDefault(operand);
+    ValueRange castValues = buildUnresolvedMaterialization(
+        MaterializationKind::Target, computeInsertPoint(repl), operandLoc,
+        /*valuesToMap=*/repl, /*inputs=*/repl, /*outputTypes=*/legalTypes,
+        /*originalType=*/origType, currentTypeConverter);
+    remapped.push_back(castValues);
   }
   return success();
 }
@@ -1353,7 +1384,7 @@ Block *ConversionPatternRewriterImpl::applySignatureConversion(
       buildUnresolvedMaterialization(
           MaterializationKind::Source,
           OpBuilder::InsertPoint(newBlock, newBlock->begin()), origArg.getLoc(),
-          /*valueToMap=*/origArg, /*inputs=*/ValueRange(),
+          /*valuesToMap=*/{origArg}, /*inputs=*/ValueRange(),
           /*outputType=*/origArgType, /*originalType=*/Type(), converter);
       appendRewrite<ReplaceBlockArgRewrite>(block, origArg, converter);
       continue;
@@ -1369,19 +1400,11 @@ Block *ConversionPatternRewriterImpl::applySignatureConversion(
       continue;
     }
 
-    // This is a 1->1+ mapping. 1->N mappings are not fully supported in the
-    // dialect conversion. Therefore, we need an argument materialization to
-    // turn the replacement block arguments into a single SSA value that can be
-    // used as a replacement.
+    // This is a 1->1+ mapping.
     auto replArgs =
         newBlock->getArguments().slice(inputMap->inputNo, inputMap->size);
-    if (replArgs.size() == 1) {
-      mapping.map(origArg, replArgs.front());
-    } else {
-      insertNTo1Materialization(
-          OpBuilder::InsertPoint(newBlock, newBlock->begin()), origArg.getLoc(),
-          /*replacements=*/replArgs, /*outputValue=*/origArg, converter);
-    }
+    ValueVector replArgVals = llvm::to_vector_of<Value, 1>(replArgs);
+    mapping.map(origArg, std::move(replArgVals));
     appendRewrite<ReplaceBlockArgRewrite>(block, origArg, converter);
   }
 
@@ -1402,20 +1425,13 @@ Block *ConversionPatternRewriterImpl::applySignatureConversion(
 /// of input operands.
 ValueRange ConversionPatternRewriterImpl::buildUnresolvedMaterialization(
     MaterializationKind kind, OpBuilder::InsertPoint ip, Location loc,
-    Value valueToMap, ValueRange inputs, TypeRange outputTypes,
+    ValueVector valuesToMap, ValueRange inputs, TypeRange outputTypes,
     Type originalType, const TypeConverter *converter,
     UnrealizedConversionCastOp *castOp) {
   assert((!originalType || kind == MaterializationKind::Target) &&
          "original type is valid only for target materializations");
-
-  // Avoid materializing an unnecessary cast.
-  if (TypeRange(inputs) == outputTypes) {
-    if (valueToMap) {
-      assert(inputs.size() == 1 && "1:N mapping is not supported");
-      mapping.map(valueToMap, inputs.front());
-    }
-    return inputs;
-  }
+  assert(TypeRange(inputs) != outputTypes &&
+         "materialization is not necessary");
 
   // Create an unresolved materialization. We use a new OpBuilder to avoid
   // tracking the materialization like we do for other operations.
@@ -1423,37 +1439,23 @@ ValueRange ConversionPatternRewriterImpl::buildUnresolvedMaterialization(
   builder.setInsertionPoint(ip.getBlock(), ip.getPoint());
   auto convertOp =
       builder.create<UnrealizedConversionCastOp>(loc, outputTypes, inputs);
-  if (valueToMap) {
-    assert(outputTypes.size() == 1 && "1:N mapping is not supported");
-    mapping.map(valueToMap, convertOp.getResult(0));
-  }
+  if (!valuesToMap.empty())
+    mapping.map(valuesToMap, convertOp.getResults());
   if (castOp)
     *castOp = convertOp;
-  appendRewrite<UnresolvedMaterializationRewrite>(convertOp, converter, kind,
-                                                  originalType, valueToMap);
+  appendRewrite<UnresolvedMaterializationRewrite>(
+      convertOp, converter, kind, originalType, std::move(valuesToMap));
   return convertOp.getResults();
 }
 
-void ConversionPatternRewriterImpl::insertNTo1Materialization(
-    OpBuilder::InsertPoint ip, Location loc, ValueRange replacements,
-    Value originalValue, const TypeConverter *converter) {
-  // Insert argument materialization back to the original type.
-  Type originalType = originalValue.getType();
-  UnrealizedConversionCastOp argCastOp;
-  buildUnresolvedMaterialization(
-      MaterializationKind::Argument, ip, loc, /*valueToMap=*/originalValue,
-      /*inputs=*/replacements, originalType,
-      /*originalType=*/Type(), converter, &argCastOp);
-  if (argCastOp)
-    nTo1TempMaterializations.insert(argCastOp);
-}
-
 Value ConversionPatternRewriterImpl::findOrBuildReplacementValue(
     Value value, const TypeConverter *converter) {
-  // Find a replacement value with the same type.
-  Value repl = mapping.lookupOrNull(value, value.getType());
-  if (repl)
-    return repl;
+  // Try to find a replacement value with the same type in the conversion value
+  // mapping. This includes cached materializations. We try to reuse those
+  // instead of generating duplicate IR.
+  ValueVector repl = mapping.lookupOrNull(value, value.getType());
+  if (!repl.empty())
+    return repl.front();
 
   // Check if the value is dead. No replacement value is needed in that case.
   // This is an approximate check that may have false negatives but does not
@@ -1468,7 +1470,7 @@ Value ConversionPatternRewriterImpl::findOrBuildReplacementValue(
   // (regardless of the type) and build a source materialization to the
   // original type.
   repl = mapping.lookupOrNull(value);
-  if (!repl) {
+  if (repl.empty()) {
     // No replacement value is registered in the mapping. This means that the
     // value is dropped and no longer needed. (If the value were still needed,
     // a source materialization producing a replacement value "out of thin air"
@@ -1476,34 +1478,22 @@ Value ConversionPatternRewriterImpl::findOrBuildReplacementValue(
     // `applySignatureConversion`.)
     return Value();
   }
-  Value castValue = buildUnresolvedMaterialization(
-      MaterializationKind::Source, computeInsertPoint(repl), value.getLoc(),
-      /*valueToMap=*/value, /*inputs=*/repl, /*outputType=*/value.getType(),
-      /*originalType=*/Type(), converter);
-  mapping.map(value, castValue);
-  return castValue;
-}
 
-SmallVector<Value>
-ConversionPatternRewriterImpl::unpackNTo1Materialization(Value value) {
-  // Unpack unrealized_conversion_cast ops that were inserted as a N:1
-  // workaround.
-  auto castOp = value.getDefiningOp<UnrealizedConversionCastOp>();
-  if (!castOp)
-    return {value};
-  if (!nTo1TempMaterializations.contains(castOp))
-    return {value};
-  assert(castOp->getNumResults() == 1 && "expected single result");
-
-  SmallVector<Value> result;
-  for (Value v : castOp.getOperands()) {
-    // Keep unpacking if possible. This is needed because during block
-    // signature conversions and 1:N op replacements, the driver may have
-    // inserted two materializations back-to-back: first an argument
-    // materialization, then a target materialization.
-    llvm::append_range(result, unpackNTo1Materialization(v));
-  }
-  return result;
+  // Note: `computeInsertPoint` computes the "earliest" insertion point at
+  // which all values in `repl` are defined. It is important to emit the
+  // materialization at that location because the same materialization may be
+  // reused in a different context. (That's because materializations are cached
+  // in the conversion value mapping.) The insertion point of the
+  // materialization must be valid for all future users that may be created
+  // later in the conversion process.
+  Value castValue =
+      buildUnresolvedMaterialization(MaterializationKind::Source,
+                                     computeInsertPoint(repl), value.getLoc(),
+                                     /*valuesToMap=*/repl, /*inputs=*/repl,
+                                     /*outputType=*/value.getType(),
+                                     /*originalType=*/Type(), converter)
+          .front();
+  return castValue;
 }
 
 //===----------------------------------------------------------------------===//
@@ -1554,7 +1544,7 @@ void ConversionPatternRewriterImpl::notifyOpReplaced(
       // Materialize a replacement value "out of thin air".
       buildUnresolvedMaterialization(
           MaterializationKind::Source, computeInsertPoint(result),
-          result.getLoc(), /*valueToMap=*/result, /*inputs=*/ValueRange(),
+          result.getLoc(), /*valuesToMap=*/{result}, /*inputs=*/ValueRange(),
           /*outputType=*/result.getType(), /*originalType=*/Type(),
           currentTypeConverter);
       continue;
@@ -1572,16 +1562,7 @@ void ConversionPatternRewriterImpl::notifyOpReplaced(
     // Remap result to replacement value.
     if (repl.empty())
       continue;
-
-    if (repl.size() == 1) {
-      // Single replacement value: replace directly.
-      mapping.map(result, repl.front());
-    } else {
-      // Multiple replacement values: insert N:1 materialization.
-      insertNTo1Materialization(computeInsertPoint(result), result.getLoc(),
-                                /*replacements=*/repl, /*outputValue=*/result,
-                                currentTypeConverter);
-    }
+    mapping.map(result, repl);
   }
 
   appendRewrite<ReplaceOperationRewrite>(op, currentTypeConverter);
@@ -1660,8 +1641,13 @@ void ConversionPatternRewriter::replaceOp(Operation *op, ValueRange newValues) {
         << "** Replace : '" << op->getName() << "'(" << op << ")\n";
   });
   SmallVector<ValueRange> newVals;
-  for (size_t i = 0; i < newValues.size(); ++i)
-    newVals.push_back(newValues.slice(i, 1));
+  for (size_t i = 0; i < newValues.size(); ++i) {
+    if (newValues[i]) {
+      newVals.push_back(newValues.slice(i, 1));
+    } else {
+      newVals.push_back(ValueRange());
+    }
+  }
   impl->notifyOpReplaced(op, newVals);
 }
 
@@ -1733,7 +1719,7 @@ void ConversionPatternRewriter::replaceUsesOfBlockArgument(BlockArgument from,
 }
 
 Value ConversionPatternRewriter::getRemappedValue(Value key) {
-  SmallVector<SmallVector<Value>> remappedValues;
+  SmallVector<ValueVector> remappedValues;
   if (failed(impl->remapValues("value", /*inputLoc=*/std::nullopt, *this, key,
                                remappedValues)))
     return nullptr;
@@ -1746,7 +1732,7 @@ ConversionPatternRewriter::getRemappedValues(ValueRange keys,
                                              SmallVectorImpl<Value> &results) {
   if (keys.empty())
     return success();
-  SmallVector<SmallVector<Value>> remapped;
+  SmallVector<ValueVector> remapped;
   if (failed(impl->remapValues("value", /*inputLoc=*/std::nullopt, *this, keys,
                                remapped)))
     return failure();
@@ -1872,7 +1858,7 @@ ConversionPattern::matchAndRewrite(Operation *op,
                                              getTypeConverter());
 
   // Remap the operands of the operation.
-  SmallVector<SmallVector<Value>> remapped;
+  SmallVector<ValueVector> remapped;
   if (failed(rewriterImpl.remapValues("operand", op->getLoc(), rewriter,
                                       op->getOperands(), remapped))) {
     return failure();
@@ -2625,19 +2611,6 @@ legalizeUnresolvedMaterialization(RewriterBase &rewriter,
     rewriter.setInsertionPoint(op);
     SmallVector<Value> newMaterialization;
     switch (rewrite->getMaterializationKind()) {
-    case MaterializationKind::Argument: {
-      // Try to materialize an argument conversion.
-      assert(op->getNumResults() == 1 && "expected single result");
-      Value argMat = converter->materializeArgumentConversion(
-          rewriter, op->getLoc(), op.getResultTypes().front(), inputOperands);
-      if (argMat) {
-        newMaterialization.push_back(argMat);
-        break;
-      }
-    }
-      // If an argument materialization failed, fallback to trying a target
-      // materialization.
-      [[fallthrough]];
     case MaterializationKind::Target:
       newMaterialization = converter->materializeTargetConversion(
           rewriter, op->getLoc(), op.getResultTypes(), inputOperands,
diff --git a/mlir/python/mlir/_mlir_libs/__init__.py b/mlir/python/mlir/_mlir_libs/__init__.py
index c5cb22c..d021dde0 100644
--- a/mlir/python/mlir/_mlir_libs/__init__.py
+++ b/mlir/python/mlir/_mlir_libs/__init__.py
@@ -58,6 +58,7 @@ def get_include_dirs() -> Sequence[str]:
 # needs.
 
 _dialect_registry = None
+_load_on_create_dialects = None
 
 
 def get_dialect_registry():
@@ -71,6 +72,21 @@ def get_dialect_registry():
     return _dialect_registry
 
 
+def append_load_on_create_dialect(dialect: str):
+    global _load_on_create_dialects
+    if _load_on_create_dialects is None:
+        _load_on_create_dialects = [dialect]
+    else:
+        _load_on_create_dialects.append(dialect)
+
+
+def get_load_on_create_dialects():
+    global _load_on_create_dialects
+    if _load_on_create_dialects is None:
+        _load_on_create_dialects = []
+    return _load_on_create_dialects
+
+
 def _site_initialize():
     import importlib
     import itertools
@@ -132,15 +148,35 @@ def _site_initialize():
             break
 
     class Context(ir._BaseContext):
-        def __init__(self, *args, **kwargs):
+        def __init__(self, load_on_create_dialects=None, *args, **kwargs):
             super().__init__(*args, **kwargs)
             self.append_dialect_registry(get_dialect_registry())
             for hook in post_init_hooks:
                 hook(self)
             if not disable_multithreading:
                 self.enable_multithreading(True)
-            if not disable_load_all_available_dialects:
-                self.load_all_available_dialects()
+            if load_on_create_dialects is not None:
+                logger.debug(
+                    "Loading all dialects from load_on_create_dialects arg %r",
+                    load_on_create_dialects,
+                )
+                for dialect in load_on_create_dialects:
+                    # This triggers loading the dialect into the context.
+                    _ = self.dialects[dialect]
+            else:
+                if disable_load_all_available_dialects:
+                    dialects = get_load_on_create_dialects()
+                    if dialects:
+                        logger.debug(
+                            "Loading all dialects from global load_on_create_dialects %r",
+                            dialects,
+                        )
+                        for dialect in dialects:
+                            # This triggers loading the dialect into the context.
+                            _ = self.dialects[dialect]
+                else:
+                    logger.debug("Loading all available dialects")
+                    self.load_all_available_dialects()
             if init_module:
                 logger.debug(
                     "Registering translations from initializer %r", init_module
diff --git a/mlir/python/mlir/dialects/transform/structured.py b/mlir/python/mlir/dialects/transform/structured.py
index 9121aa8..bf40cc5 100644
--- a/mlir/python/mlir/dialects/transform/structured.py
+++ b/mlir/python/mlir/dialects/transform/structured.py
@@ -141,6 +141,77 @@ class FuseIntoContainingOp(FuseIntoContainingOp):
 
 
 @_ods_cext.register_operation(_Dialect, replace=True)
+class FuseOp(FuseOp):
+    """Specialization for FuseOp class."""
+
+    @overload
+    def __init__(
+        self,
+        loop_types: Union[Type, Sequence[Type]],
+        target: Union[Operation, Value, OpView],
+        *,
+        tile_sizes: Optional[Union[DynamicIndexList, ArrayAttr]] = None,
+        tile_interchange: OptionalIntList = None,
+        apply_cleanup: Optional[bool] = False,
+        loc=None,
+        ip=None,
+    ):
+        ...
+
+    @overload
+    def __init__(
+        self,
+        target: Union[Operation, Value, OpView],
+        *,
+        tile_sizes: Optional[Union[DynamicIndexList, ArrayAttr]] = None,
+        tile_interchange: OptionalIntList = None,
+        apply_cleanup: Optional[bool] = False,
+        loc=None,
+        ip=None,
+    ):
+        ...
+
+    def __init__(
+        self,
+        loop_types_or_target: Union[Type, Sequence[Type], Operation, OpView, Value],
+        target_or_none: Optional[Union[Operation, Value, OpView]] = None,
+        *,
+        tile_sizes: Optional[Union[DynamicIndexList, ArrayAttr]] = None,
+        tile_interchange: OptionalIntList = None,
+        apply_cleanup: Optional[bool] = False,
+        loc=None,
+        ip=None,
+    ):
+        tile_sizes = tile_sizes if tile_sizes else []
+        tile_interchange = tile_interchange if tile_interchange else []
+        _, tile_sizes, _ = _dispatch_dynamic_index_list(tile_sizes)
+        _, tile_interchange, _ = _dispatch_dynamic_index_list(tile_interchange)
+        num_loops = sum(0 if v == 0 else 1 for v in tile_sizes)
+
+        if isinstance(loop_types_or_target, (Operation, Value, OpView)):
+            loop_types = [transform.AnyOpType.get()] * num_loops
+            target = loop_types_or_target
+            assert target_or_none is None, "Cannot construct FuseOp with two targets."
+        else:
+            loop_types = (
+                ([loop_types_or_target] * num_loops)
+                if isinstance(loop_types_or_target, Type)
+                else loop_types_or_target
+            )
+            target = target_or_none
+        super().__init__(
+            target.type,
+            loop_types,
+            target,
+            tile_sizes=tile_sizes,
+            tile_interchange=tile_interchange,
+            apply_cleanup=apply_cleanup,
+            loc=loc,
+            ip=ip,
+        )
+
+
+@_ods_cext.register_operation(_Dialect, replace=True)
 class GeneralizeOp(GeneralizeOp):
     """Specialization for GeneralizeOp class."""
 
diff --git a/mlir/python/mlir/ir.py b/mlir/python/mlir/ir.py
index 9a6ce462..6f37266 100644
--- a/mlir/python/mlir/ir.py
+++ b/mlir/python/mlir/ir.py
@@ -5,7 +5,11 @@
 from ._mlir_libs._mlir.ir import *
 from ._mlir_libs._mlir.ir import _GlobalDebug
 from ._mlir_libs._mlir import register_type_caster, register_value_caster
-from ._mlir_libs import get_dialect_registry
+from ._mlir_libs import (
+    get_dialect_registry,
+    append_load_on_create_dialect,
+    get_load_on_create_dialects,
+)
 
 
 # Convenience decorator for registering user-friendly Attribute builders.
diff --git a/mlir/test/CMakeLists.txt b/mlir/test/CMakeLists.txt
index f181a91..58d16a6 100644
--- a/mlir/test/CMakeLists.txt
+++ b/mlir/test/CMakeLists.txt
@@ -99,7 +99,6 @@ configure_lit_site_cfg(
   )
 
 set(MLIR_TEST_DEPENDS
-  FileCheck count not split-file
   mlir-capi-ir-test
   mlir-capi-irdl-test
   mlir-capi-llvm-test
@@ -121,6 +120,9 @@ set(MLIR_TEST_DEPENDS
   tblgen-lsp-server
   tblgen-to-irdl
   )
+if(NOT MLIR_STANDALONE_BUILD)
+  list(APPEND MLIR_TEST_DEPENDS FileCheck count not split-file)
+endif()
 
 set(MLIR_TEST_DEPENDS ${MLIR_TEST_DEPENDS}
   mlir-capi-pdl-test
diff --git a/mlir/test/Conversion/AffineToStandard/lower-affine-to-vector.mlir b/mlir/test/Conversion/AffineToStandard/lower-affine-to-vector.mlir
index 58580a1..f2e0306 100644
--- a/mlir/test/Conversion/AffineToStandard/lower-affine-to-vector.mlir
+++ b/mlir/test/Conversion/AffineToStandard/lower-affine-to-vector.mlir
@@ -26,7 +26,7 @@ func.func @affine_vector_store(%arg0 : index) {
 // CHECK:       %[[buf:.*]] = memref.alloc
 // CHECK:       %[[val:.*]] = arith.constant dense
 // CHECK:       %[[c_1:.*]] = arith.constant -1 : index
-// CHECK-NEXT:  %[[a:.*]] = arith.muli %arg0, %[[c_1]] : index
+// CHECK-NEXT:  %[[a:.*]] = arith.muli %arg0, %[[c_1]] overflow<nsw> : index
 // CHECK-NEXT:  %[[b:.*]] = arith.addi %{{.*}}, %[[a]] : index
 // CHECK-NEXT:  %[[c7:.*]] = arith.constant 7 : index
 // CHECK-NEXT:  %[[c:.*]] = arith.addi %[[b]], %[[c7]] : index
diff --git a/mlir/test/Conversion/AffineToStandard/lower-affine.mlir b/mlir/test/Conversion/AffineToStandard/lower-affine.mlir
index 00d7b6b..550ea71 100644
--- a/mlir/test/Conversion/AffineToStandard/lower-affine.mlir
+++ b/mlir/test/Conversion/AffineToStandard/lower-affine.mlir
@@ -156,7 +156,7 @@ func.func private @get_idx() -> (index)
 // CHECK-NEXT:   %[[v0:.*]] = call @get_idx() : () -> index
 // CHECK-NEXT:   %[[c0:.*]] = arith.constant 0 : index
 // CHECK-NEXT:   %[[cm1:.*]] = arith.constant -1 : index
-// CHECK-NEXT:   %[[v1:.*]] = arith.muli %[[v0]], %[[cm1]] : index
+// CHECK-NEXT:   %[[v1:.*]] = arith.muli %[[v0]], %[[cm1]] overflow<nsw> : index
 // CHECK-NEXT:   %[[c20:.*]] = arith.constant 20 : index
 // CHECK-NEXT:   %[[v2:.*]] = arith.addi %[[v1]], %[[c20]] : index
 // CHECK-NEXT:   %[[v3:.*]] = arith.cmpi sge, %[[v2]], %[[c0]] : index
@@ -177,7 +177,7 @@ func.func @if_only() {
 // CHECK-NEXT:   %[[v0:.*]] = call @get_idx() : () -> index
 // CHECK-NEXT:   %[[c0:.*]] = arith.constant 0 : index
 // CHECK-NEXT:   %[[cm1:.*]] = arith.constant -1 : index
-// CHECK-NEXT:   %[[v1:.*]] = arith.muli %[[v0]], %[[cm1]] : index
+// CHECK-NEXT:   %[[v1:.*]] = arith.muli %[[v0]], %[[cm1]] overflow<nsw> : index
 // CHECK-NEXT:   %[[c20:.*]] = arith.constant 20 : index
 // CHECK-NEXT:   %[[v2:.*]] = arith.addi %[[v1]], %[[c20]] : index
 // CHECK-NEXT:   %[[v3:.*]] = arith.cmpi sge, %[[v2]], %[[c0]] : index
@@ -202,7 +202,7 @@ func.func @if_else() {
 // CHECK-NEXT:   %[[v0:.*]] = call @get_idx() : () -> index
 // CHECK-NEXT:   %[[c0:.*]] = arith.constant 0 : index
 // CHECK-NEXT:   %[[cm1:.*]] = arith.constant -1 : index
-// CHECK-NEXT:   %[[v1:.*]] = arith.muli %[[v0]], %[[cm1]] : index
+// CHECK-NEXT:   %[[v1:.*]] = arith.muli %[[v0]], %[[cm1]] overflow<nsw> : index
 // CHECK-NEXT:   %[[c20:.*]] = arith.constant 20 : index
 // CHECK-NEXT:   %[[v2:.*]] = arith.addi %[[v1]], %[[c20]] : index
 // CHECK-NEXT:   %[[v3:.*]] = arith.cmpi sge, %[[v2]], %[[c0]] : index
@@ -272,7 +272,7 @@ func.func @if_with_yield() -> (i64) {
 // CHECK-NEXT:   %[[v0:.*]] = call @get_idx() : () -> index
 // CHECK-NEXT:   %[[c0:.*]] = arith.constant 0 : index
 // CHECK-NEXT:   %[[cm1:.*]] = arith.constant -1 : index
-// CHECK-NEXT:   %[[v1:.*]] = arith.muli %[[v0]], %[[cm1]] : index
+// CHECK-NEXT:   %[[v1:.*]] = arith.muli %[[v0]], %[[cm1]] overflow<nsw> : index
 // CHECK-NEXT:   %[[v2:.*]] = arith.addi %[[v1]], %{{.*}} : index
 // CHECK-NEXT:   %[[c1:.*]] = arith.constant 1 : index
 // CHECK-NEXT:   %[[v3:.*]] = arith.addi %[[v2]], %[[c1]] : index
@@ -316,7 +316,7 @@ func.func @if_for() {
   %i = call @get_idx() : () -> (index)
 // CHECK-NEXT:   %[[c0:.*]] = arith.constant 0 : index
 // CHECK-NEXT:   %[[cm1:.*]] = arith.constant -1 : index
-// CHECK-NEXT:   %[[v1:.*]] = arith.muli %[[v0]], %[[cm1]] : index
+// CHECK-NEXT:   %[[v1:.*]] = arith.muli %[[v0]], %[[cm1]] overflow<nsw> : index
 // CHECK-NEXT:   %[[c20:.*]] = arith.constant 20 : index
 // CHECK-NEXT:   %[[v2:.*]] = arith.addi %[[v1]], %[[c20]] : index
 // CHECK-NEXT:   %[[v3:.*]] = arith.cmpi sge, %[[v2]], %[[c0]] : index
@@ -371,7 +371,7 @@ func.func @if_for() {
 // CHECK-NEXT:   %[[c1:.*]] = arith.constant 1 : index
 // CHECK-NEXT:   for %{{.*}} = %[[c0]] to %[[c42]] step %[[c1]] {
 // CHECK-NEXT:     %[[cm1:.*]] = arith.constant -1 : index
-// CHECK-NEXT:     %[[mul0:.*]] = arith.muli %{{.*}}, %[[cm1]] : index
+// CHECK-NEXT:     %[[mul0:.*]] = arith.muli %{{.*}}, %[[cm1]] overflow<nsw> : index
 // CHECK-NEXT:     %[[add0:.*]] = arith.addi %[[mul0]], %{{.*}} : index
 // CHECK-NEXT:     %[[max:.*]] = arith.maxsi %{{.*}}, %[[add0]] : index
 // CHECK-NEXT:     %[[c10:.*]] = arith.constant 10 : index
@@ -448,22 +448,22 @@ func.func @affine_applies(%arg0 : index) {
   %one = affine.apply #map3(%symbZero)[%zero]
 
 // CHECK-NEXT: %[[c2:.*]] = arith.constant 2 : index
-// CHECK-NEXT: %[[v2:.*]] = arith.muli %arg0, %[[c2]] : index
+// CHECK-NEXT: %[[v2:.*]] = arith.muli %arg0, %[[c2]] overflow<nsw> : index
 // CHECK-NEXT: %[[v3:.*]] = arith.addi %arg0, %[[v2]] : index
 // CHECK-NEXT: %[[c3:.*]] = arith.constant 3 : index
-// CHECK-NEXT: %[[v4:.*]] = arith.muli %arg0, %[[c3]] : index
+// CHECK-NEXT: %[[v4:.*]] = arith.muli %arg0, %[[c3]] overflow<nsw> : index
 // CHECK-NEXT: %[[v5:.*]] = arith.addi %[[v3]], %[[v4]] : index
 // CHECK-NEXT: %[[c4:.*]] = arith.constant 4 : index
-// CHECK-NEXT: %[[v6:.*]] = arith.muli %arg0, %[[c4]] : index
+// CHECK-NEXT: %[[v6:.*]] = arith.muli %arg0, %[[c4]] overflow<nsw> : index
 // CHECK-NEXT: %[[v7:.*]] = arith.addi %[[v5]], %[[v6]] : index
 // CHECK-NEXT: %[[c5:.*]] = arith.constant 5 : index
-// CHECK-NEXT: %[[v8:.*]] = arith.muli %arg0, %[[c5]] : index
+// CHECK-NEXT: %[[v8:.*]] = arith.muli %arg0, %[[c5]] overflow<nsw> : index
 // CHECK-NEXT: %[[v9:.*]] = arith.addi %[[v7]], %[[v8]] : index
 // CHECK-NEXT: %[[c6:.*]] = arith.constant 6 : index
-// CHECK-NEXT: %[[v10:.*]] = arith.muli %arg0, %[[c6]] : index
+// CHECK-NEXT: %[[v10:.*]] = arith.muli %arg0, %[[c6]] overflow<nsw> : index
 // CHECK-NEXT: %[[v11:.*]] = arith.addi %[[v9]], %[[v10]] : index
 // CHECK-NEXT: %[[c7:.*]] = arith.constant 7 : index
-// CHECK-NEXT: %[[v12:.*]] = arith.muli %arg0, %[[c7]] : index
+// CHECK-NEXT: %[[v12:.*]] = arith.muli %arg0, %[[c7]] overflow<nsw> : index
 // CHECK-NEXT: %[[v13:.*]] = arith.addi %[[v11]], %[[v12]] : index
   %four = affine.apply #map4(%arg0, %arg0, %arg0, %arg0)[%arg0, %arg0, %arg0]
   return
@@ -610,7 +610,7 @@ func.func @affine_store(%arg0 : index) {
     affine.store %1, %0[%i0 - symbol(%arg0) + 7] : memref<10xf32>
   }
 // CHECK:       %[[cm1:.*]] = arith.constant -1 : index
-// CHECK-NEXT:  %[[a:.*]] = arith.muli %{{.*}}, %[[cm1]] : index
+// CHECK-NEXT:  %[[a:.*]] = arith.muli %{{.*}}, %[[cm1]] overflow<nsw> : index
 // CHECK-NEXT:  %[[b:.*]] = arith.addi %{{.*}}, %[[a]] : index
 // CHECK-NEXT:  %[[c7:.*]] = arith.constant 7 : index
 // CHECK-NEXT:  %[[c:.*]] = arith.addi %[[b]], %[[c7]] : index
diff --git a/mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm.mlir b/mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm.mlir
index 748dfe8..f52dd6c 100644
--- a/mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm.mlir
+++ b/mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm.mlir
@@ -633,7 +633,7 @@ gpu.module @test_module_29 {
     // CHECK-NEXT: %[[EL1:.*]] = llvm.getelementptr %[[ALLOC]][0, 1] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(i32, f64)>
     // CHECK-NEXT: llvm.store %[[EXT]], %[[EL1]] : f64, !llvm.ptr
     // CHECK-NEXT: llvm.call @vprintf(%[[FORMATSTART]], %[[ALLOC]]) : (!llvm.ptr, !llvm.ptr) -> i32
-    gpu.printf "Hello: %d\n" %arg0, %arg1 : i32, f32
+    gpu.printf "Hello: %d\n", %arg0, %arg1 : i32, f32
     gpu.return
   }
 }
@@ -969,6 +969,35 @@ gpu.module @test_module_50 {
   }
 }
 
+// CHECK-LABEL: gpu.module @test_module_51
+//       CHECK:   llvm.mlir.global internal constant @[[func_name:.*]]("(unknown)\00") {addr_space = 0 : i32}
+//       CHECK:   llvm.mlir.global internal constant @[[file_name:.*]]("{{.*}}gpu-to-nvvm.mlir{{.*}}") {addr_space = 0 : i32}
+//       CHECK:   llvm.mlir.global internal constant @[[message:.*]]("assert message\00") {addr_space = 0 : i32}
+//       CHECK:   llvm.func @__assertfail(!llvm.ptr, !llvm.ptr, i32, !llvm.ptr, i64) attributes {passthrough = ["noreturn"]}
+//       CHECK:   llvm.func @test_assert(%[[cond:.*]]: i1) attributes {gpu.kernel, nvvm.kernel} {
+//       CHECK:     llvm.cond_br %[[cond]], ^[[after_block:.*]], ^[[assert_block:.*]]
+//       CHECK:   ^[[assert_block]]:
+//       CHECK:     %[[message_ptr:.*]] = llvm.mlir.addressof @[[message]] : !llvm.ptr
+//       CHECK:     %[[message_start:.*]] = llvm.getelementptr %[[message_ptr]][0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.array<15 x i8>
+//       CHECK:     %[[file_ptr:.*]] = llvm.mlir.addressof @[[file_name]] : !llvm.ptr
+//       CHECK:     %[[file_start:.*]] = llvm.getelementptr %[[file_ptr]][0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.array<{{.*}} x i8>
+//       CHECK:     %[[func_ptr:.*]] = llvm.mlir.addressof @[[func_name]] : !llvm.ptr
+//       CHECK:     %[[func_start:.*]] = llvm.getelementptr %[[func_ptr]][0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.array<{{.*}} x i8>
+//       CHECK:     %[[line_num:.*]] = llvm.mlir.constant({{.*}} : i32) : i32
+//       CHECK:     %[[ptr:.*]] = llvm.mlir.constant(1 : i64) : i64
+//       CHECK:     llvm.call @__assertfail(%[[message_start]], %[[file_start]], %[[line_num]], %[[func_start]], %[[ptr]]) : (!llvm.ptr, !llvm.ptr, i32, !llvm.ptr, i64) -> ()
+//       CHECK:     llvm.br ^[[after_block]]
+//       CHECK:   ^[[after_block]]:
+//       CHECK:     llvm.return
+//       CHECK:   }
+
+gpu.module @test_module_51 {
+  gpu.func @test_assert(%arg0: i1) kernel {
+    cf.assert %arg0, "assert message"
+    gpu.return
+  }
+}
+
 module attributes {transform.with_named_sequence} {
   transform.named_sequence @__transform_main(%toplevel_module: !transform.any_op {transform.readonly}) {
     %gpu_module = transform.structured.match ops{["gpu.module"]} in %toplevel_module
diff --git a/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl-hip.mlir b/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl-hip.mlir
index 1b904fa..2dc6a5a 100644
--- a/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl-hip.mlir
+++ b/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl-hip.mlir
@@ -36,7 +36,7 @@ gpu.module @test_module {
     // CHECK-NEXT: %[[NARGS1:.*]] = llvm.mlir.constant(1 : i32) : i32
     // CHECK-NEXT: %[[ARG0_64:.*]] = llvm.zext %[[ARG0]] : i32 to i64
     // CHECK-NEXT: %{{.*}} = llvm.call @__ockl_printf_append_args(%[[DESC1]], %[[NARGS1]], %[[ARG0_64]], %[[CST0]], %[[CST0]], %[[CST0]], %[[CST0]], %[[CST0]], %[[CST0]], %[[ISLAST]]) : (i64, i32, i64, i64, i64, i64, i64, i64, i64, i32) -> i64
-    gpu.printf "Hello: %d\n" %arg0 : i32
+    gpu.printf "Hello: %d\n", %arg0 : i32
     gpu.return
   }
 }
diff --git a/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl-opencl.mlir b/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl-opencl.mlir
index 870f5c5..00d1d7d 100644
--- a/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl-opencl.mlir
+++ b/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl-opencl.mlir
@@ -9,7 +9,7 @@ gpu.module @test_module {
     // CHECK: %[[IMM0:.*]] = llvm.mlir.addressof @[[$PRINT_GLOBAL]] : !llvm.ptr<4>
     // CHECK-NEXT: %[[IMM2:.*]] = llvm.getelementptr %[[IMM0]][0, 0] : (!llvm.ptr<4>) -> !llvm.ptr<4>, !llvm.array<11 x i8>
     // CHECK-NEXT: %{{.*}} = llvm.call @printf(%[[IMM2]], %[[ARG0]]) vararg(!llvm.func<i32 (ptr<4>, ...)>) : (!llvm.ptr<4>, i32) -> i32
-    gpu.printf "Hello: %d\n" %arg0 : i32
+    gpu.printf "Hello: %d\n", %arg0 : i32
     gpu.return
   }
 }
diff --git a/mlir/test/Conversion/GPUToSPIRV/printf.mlir b/mlir/test/Conversion/GPUToSPIRV/printf.mlir
index bc09112..7fe9752 100644
--- a/mlir/test/Conversion/GPUToSPIRV/printf.mlir
+++ b/mlir/test/Conversion/GPUToSPIRV/printf.mlir
@@ -62,7 +62,7 @@ module attributes {
         // CHECK: [[FMTSTR_ADDR:%.*]] = spirv.mlir.addressof [[PRINTMSG]] : !spirv.ptr<!spirv.array<[[ARRAYSIZE]] x i8>, UniformConstant>
         // CHECK-NEXT: [[FMTSTR_PTR1:%.*]] = spirv.Bitcast [[FMTSTR_ADDR]] : !spirv.ptr<!spirv.array<[[ARRAYSIZE]] x i8>, UniformConstant> to !spirv.ptr<i8, UniformConstant>
         // CHECK-NEXT:  {{%.*}} = spirv.CL.printf [[FMTSTR_PTR1]] {{%.*}}, {{%.*}}, {{%.*}} : !spirv.ptr<i8, UniformConstant>, i32, f32, i32 -> i32
-        gpu.printf "\nHello, world : %d %f \n Thread id: %d\n" %arg0, %arg1, %2: i32, f32, index
+        gpu.printf "\nHello, world : %d %f \n Thread id: %d\n", %arg0, %arg1, %2: i32, f32, index
 
         // CHECK: spirv.Return
         gpu.return
diff --git a/mlir/test/Conversion/MemRefToLLVM/expand-then-convert-to-llvm.mlir b/mlir/test/Conversion/MemRefToLLVM/expand-then-convert-to-llvm.mlir
index a78db97..1fe4217 100644
--- a/mlir/test/Conversion/MemRefToLLVM/expand-then-convert-to-llvm.mlir
+++ b/mlir/test/Conversion/MemRefToLLVM/expand-then-convert-to-llvm.mlir
@@ -59,7 +59,7 @@ func.func @subview(%0 : memref<64x4xf32, strided<[4, 1], offset: 0>>, %arg0 : in
   // CHECK: %[[BASE:.*]] = llvm.extractvalue %[[MEMREF]][0] : !llvm.struct<(ptr, ptr, i64
   // CHECK: %[[BASE_ALIGNED:.*]] = llvm.extractvalue %[[MEMREF]][1] : !llvm.struct<(ptr, ptr, i64
   // CHECK: %[[STRIDE0:.*]] = llvm.mlir.constant(4 : index) : i64
-  // CHECK: %[[DESCSTRIDE0:.*]] = llvm.mul %[[ARG0]], %[[STRIDE0]] : i64
+  // CHECK: %[[DESCSTRIDE0:.*]] = llvm.mul %[[ARG0]], %[[STRIDE0]] overflow<nsw> : i64
   // CHECK: %[[TMP:.*]] = builtin.unrealized_conversion_cast %[[DESCSTRIDE0]] : i64 to index
   // CHECK: %[[DESCSTRIDE0_V2:.*]] = builtin.unrealized_conversion_cast %[[TMP]] : index to i64
   // CHECK: %[[OFF2:.*]] = llvm.add %[[DESCSTRIDE0]], %[[ARG1]] : i64
@@ -95,10 +95,10 @@ func.func @subview_non_zero_addrspace(%0 : memref<64x4xf32, strided<[4, 1], offs
   // CHECK: %[[BASE:.*]] = llvm.extractvalue %[[MEMREF]][0] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<2 x i64>, array<2 x i64>)>
   // CHECK: %[[BASE_ALIGNED:.*]] = llvm.extractvalue %[[MEMREF]][1] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<2 x i64>, array<2 x i64>)>
   // CHECK: %[[STRIDE0:.*]] = llvm.mlir.constant(4 : index) : i64
-  // CHECK: %[[DESCSTRIDE0:.*]] = llvm.mul %[[ARG0]], %[[STRIDE0]]  : i64
+  // CHECK: %[[DESCSTRIDE0:.*]] = llvm.mul %[[ARG0]], %[[STRIDE0]] overflow<nsw> : i64
   // CHECK: %[[TMP:.*]] = builtin.unrealized_conversion_cast %[[DESCSTRIDE0]] : i64 to index
   // CHECK: %[[DESCSTRIDE0_V2:.*]] = builtin.unrealized_conversion_cast %[[TMP]] : index to i64
-  // CHECK: %[[OFF2:.*]] = llvm.add %[[DESCSTRIDE0]], %[[ARG1]]  : i64
+  // CHECK: %[[OFF2:.*]] = llvm.add %[[DESCSTRIDE0]], %[[ARG1]] : i64
   // CHECK: %[[TMP:.*]] = builtin.unrealized_conversion_cast %[[OFF2]] : i64 to index
   // CHECK: %[[OFF2:.*]] = builtin.unrealized_conversion_cast %[[TMP]] : index to i64
   // CHECK: %[[DESC:.*]] = llvm.mlir.undef : !llvm.struct<(ptr<3>, ptr<3>, i64, array<2 x i64>, array<2 x i64>)>
@@ -131,10 +131,10 @@ func.func @subview_const_size(%0 : memref<64x4xf32, strided<[4, 1], offset: 0>>,
   // CHECK: %[[BASE:.*]] = llvm.extractvalue %[[MEMREF]][0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)>
   // CHECK: %[[BASE_ALIGNED:.*]] = llvm.extractvalue %[[MEMREF]][1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)>
   // CHECK: %[[C4:.*]] = llvm.mlir.constant(4 : index) : i64
-  // CHECK: %[[DESCSTRIDE0:.*]] = llvm.mul %[[ARG0]], %[[C4]]  : i64
+  // CHECK: %[[DESCSTRIDE0:.*]] = llvm.mul %[[ARG0]], %[[C4]] overflow<nsw> : i64
   // CHECK: %[[TMP:.*]] = builtin.unrealized_conversion_cast %[[DESCSTRIDE0]] : i64 to index
   // CHECK: %[[DESCSTRIDE0_V2:.*]] = builtin.unrealized_conversion_cast %[[TMP]] : index to i64
-  // CHECK: %[[OFF2:.*]] = llvm.add %[[DESCSTRIDE0]], %[[ARG1]]  : i64
+  // CHECK: %[[OFF2:.*]] = llvm.add %[[DESCSTRIDE0]], %[[ARG1]] : i64
   // CHECK: %[[TMP:.*]] = builtin.unrealized_conversion_cast %[[OFF2]] : i64 to index
   // CHECK: %[[OFF2:.*]] = builtin.unrealized_conversion_cast %[[TMP]] : index to i64
   // CHECK: %[[DESC:.*]] = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)>
@@ -168,8 +168,8 @@ func.func @subview_const_stride(%0 : memref<64x4xf32, strided<[4, 1], offset: 0>
   // CHECK: %[[BASE:.*]] = llvm.extractvalue %[[MEMREF]][0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)>
   // CHECK: %[[BASE_ALIGNED:.*]] = llvm.extractvalue %[[MEMREF]][1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)>
   // CHECK: %[[C4:.*]] = llvm.mlir.constant(4 : index) : i64
-  // CHECK: %[[OFF0:.*]] = llvm.mul %[[ARG0]], %[[C4]]  : i64
-  // CHECK: %[[OFF2:.*]] = llvm.add %[[OFF0]], %[[ARG1]]  : i64
+  // CHECK: %[[OFF0:.*]] = llvm.mul %[[ARG0]], %[[C4]] overflow<nsw> : i64
+  // CHECK: %[[OFF2:.*]] = llvm.add %[[OFF0]], %[[ARG1]] : i64
   // CHECK: %[[TMP:.*]] = builtin.unrealized_conversion_cast %[[OFF2]] : i64 to index
   // CHECK: %[[OFF2:.*]] = builtin.unrealized_conversion_cast %[[TMP]] : index to i64
   // CHECK: %[[DESC:.*]] = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)>
@@ -234,12 +234,12 @@ func.func @subview_mixed_static_dynamic(%0 : memref<64x4xf32, strided<[4, 1], of
   // CHECK: %[[BASE:.*]] = llvm.extractvalue %[[MEMREF]][0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)>
   // CHECK: %[[BASE_ALIGNED:.*]] = llvm.extractvalue %[[MEMREF]][1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)>
   // CHECK: %[[STRIDE0:.*]] = llvm.mlir.constant(4 : index) : i64
-  // CHECK: %[[DESCSTRIDE0:.*]] = llvm.mul %[[ARG0]], %[[STRIDE0]]  : i64
+  // CHECK: %[[DESCSTRIDE0:.*]] = llvm.mul %[[ARG0]], %[[STRIDE0]] overflow<nsw> : i64
   // CHECK: %[[TMP:.*]] = builtin.unrealized_conversion_cast %[[DESCSTRIDE0]] : i64 to index
   // CHECK: %[[DESCSTRIDE0_V2:.*]] = builtin.unrealized_conversion_cast %[[TMP]] : index to i64
-  // CHECK: %[[OFF0:.*]] = llvm.mul %[[ARG1]], %[[STRIDE0]]  : i64
+  // CHECK: %[[OFF0:.*]] = llvm.mul %[[ARG1]], %[[STRIDE0]] overflow<nsw> : i64
   // CHECK: %[[BASE_OFF:.*]] = llvm.mlir.constant(8 : index)  : i64
-  // CHECK: %[[OFF2:.*]] = llvm.add %[[OFF0]], %[[BASE_OFF]]  : i64
+  // CHECK: %[[OFF2:.*]] = llvm.add %[[OFF0]], %[[BASE_OFF]] : i64
   // CHECK: %[[TMP:.*]] = builtin.unrealized_conversion_cast %[[OFF2]] : i64 to index
   // CHECK: %[[OFF2:.*]] = builtin.unrealized_conversion_cast %[[TMP]] : index to i64
   // CHECK: %[[DESC:.*]] = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)>
@@ -301,7 +301,7 @@ func.func @subview_leading_operands_dynamic(%0 : memref<5x?xf32>) -> memref<3x?x
   // CHECK: %[[STRIDE0:.*]] = llvm.extractvalue %[[MEMREF]][4, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)>
   // Compute and insert offset from 2 + dynamic value.
   // CHECK: %[[CST_OFF0:.*]] = llvm.mlir.constant(2 : index) : i64
-  // CHECK: %[[OFF0:.*]] = llvm.mul %[[STRIDE0]], %[[CST_OFF0]] : i64
+  // CHECK: %[[OFF0:.*]] = llvm.mul %[[STRIDE0]], %[[CST_OFF0]] overflow<nsw> : i64
   // CHECK: %[[TMP:.*]] = builtin.unrealized_conversion_cast %[[OFF0]] : i64 to index
   // CHECK: %[[OFF0:.*]] = builtin.unrealized_conversion_cast %[[TMP]] : index to i64
   // CHECK: %[[DESC:.*]] = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)>
@@ -425,7 +425,7 @@ func.func @collapse_shape_dynamic_with_non_identity_layout(
 // CHECK:           %[[SIZE1:.*]] = llvm.extractvalue %[[MEM]][3, 1] : !llvm.struct<(ptr, ptr, i64, array<3 x i64>, array<3 x i64>)>
 // CHECK:           %[[SIZE2:.*]] = llvm.extractvalue %[[MEM]][3, 2] : !llvm.struct<(ptr, ptr, i64, array<3 x i64>, array<3 x i64>)>
 // CHECK:           %[[STRIDE0:.*]] = llvm.extractvalue %[[MEM]][4, 0] : !llvm.struct<(ptr, ptr, i64, array<3 x i64>, array<3 x i64>)>
-// CHECK:           %[[FINAL_SIZE1:.*]] = llvm.mul %[[SIZE1]], %[[SIZE2]]  : i64
+// CHECK:           %[[FINAL_SIZE1:.*]] = llvm.mul %[[SIZE1]], %[[SIZE2]] overflow<nsw> : i64
 // CHECK:           %[[SIZE1_TO_IDX:.*]] = builtin.unrealized_conversion_cast %[[FINAL_SIZE1]] : i64 to index
 // CHECK:           %[[FINAL_SIZE1:.*]] = builtin.unrealized_conversion_cast %[[SIZE1_TO_IDX]] : index to i64
 // CHECK:           %[[DESC:.*]] = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)>
@@ -547,7 +547,7 @@ func.func @collapse_shape_dynamic(%arg0 : memref<1x2x?xf32>) -> memref<1x?xf32>
 // CHECK:           %[[SIZE2:.*]] = llvm.extractvalue %[[MEM]][3, 2] : !llvm.struct<(ptr, ptr, i64, array<3 x i64>, array<3 x i64>)>
 // CHECK:           %[[STRIDE0:.*]] = llvm.extractvalue %[[MEM]][4, 0] : !llvm.struct<(ptr, ptr, i64, array<3 x i64>, array<3 x i64>)>
 // CHECK:           %[[C2:.*]] = llvm.mlir.constant(2 : index) : i64
-// CHECK:           %[[FINAL_SIZE1:.*]] = llvm.mul %[[SIZE2]], %[[C2]]  : i64
+// CHECK:           %[[FINAL_SIZE1:.*]] = llvm.mul %[[SIZE2]], %[[C2]] overflow<nsw> : i64
 // CHECK:           %[[SIZE1_TO_IDX:.*]] = builtin.unrealized_conversion_cast %[[FINAL_SIZE1]] : i64 to index
 // CHECK:           %[[FINAL_SIZE1:.*]] = builtin.unrealized_conversion_cast %[[SIZE1_TO_IDX]] : index to i64
 // CHECK:           %[[DESC:.*]] = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)>
diff --git a/mlir/test/Conversion/SCFToEmitC/for.mlir b/mlir/test/Conversion/SCFToEmitC/for.mlir
index 8359218..7f41e63 100644
--- a/mlir/test/Conversion/SCFToEmitC/for.mlir
+++ b/mlir/test/Conversion/SCFToEmitC/for.mlir
@@ -7,8 +7,11 @@ func.func @simple_std_for_loop(%arg0 : index, %arg1 : index, %arg2 : index) {
   return
 }
 // CHECK-LABEL: func.func @simple_std_for_loop(
-// CHECK-SAME:      %[[VAL_0:.*]]: index, %[[VAL_1:.*]]: index, %[[VAL_2:.*]]: index) {
-// CHECK-NEXT:    emitc.for %[[VAL_3:.*]] = %[[VAL_0]] to %[[VAL_1]] step %[[VAL_2]] {
+// CHECK-SAME:      %[[ARG_0:.*]]: index, %[[ARG_1:.*]]: index, %[[ARG_2:.*]]: index) {
+// CHECK-NEXT:    %[[VAL_2:.*]] = builtin.unrealized_conversion_cast %[[ARG_2]] : index to !emitc.size_t
+// CHECK-NEXT:    %[[VAL_1:.*]] = builtin.unrealized_conversion_cast %[[ARG_1]] : index to !emitc.size_t
+// CHECK-NEXT:    %[[VAL_0:.*]] = builtin.unrealized_conversion_cast %[[ARG_0]] : index to !emitc.size_t
+// CHECK-NEXT:    emitc.for %[[VAL_3:.*]] = %[[VAL_0]] to %[[VAL_1]] step %[[VAL_2]] : !emitc.size_t {
 // CHECK-NEXT:      %[[VAL_4:.*]] = arith.constant 1 : index
 // CHECK-NEXT:    }
 // CHECK-NEXT:    return
@@ -24,10 +27,13 @@ func.func @simple_std_2_for_loops(%arg0 : index, %arg1 : index, %arg2 : index) {
   return
 }
 // CHECK-LABEL: func.func @simple_std_2_for_loops(
-// CHECK-SAME:      %[[VAL_0:.*]]: index, %[[VAL_1:.*]]: index, %[[VAL_2:.*]]: index) {
-// CHECK-NEXT:    emitc.for %[[VAL_3:.*]] = %[[VAL_0]] to %[[VAL_1]] step %[[VAL_2]] {
+// CHECK-SAME:      %[[ARG_0:.*]]: index, %[[ARG_1:.*]]: index, %[[ARG_2:.*]]: index) {
+// CHECK-NEXT:    %[[VAL_2:.*]] = builtin.unrealized_conversion_cast %[[ARG_2]] : index to !emitc.size_t
+// CHECK-NEXT:    %[[VAL_1:.*]] = builtin.unrealized_conversion_cast %[[ARG_1]] : index to !emitc.size_t
+// CHECK-NEXT:    %[[VAL_0:.*]] = builtin.unrealized_conversion_cast %[[ARG_0]] : index to !emitc.size_t
+// CHECK-NEXT:    emitc.for %[[VAL_3:.*]] = %[[VAL_0]] to %[[VAL_1]] step %[[VAL_2]] : !emitc.size_t {
 // CHECK-NEXT:      %[[VAL_4:.*]] = arith.constant 1 : index
-// CHECK-NEXT:      emitc.for %[[VAL_5:.*]] = %[[VAL_0]] to %[[VAL_1]] step %[[VAL_2]] {
+// CHECK-NEXT:      emitc.for %[[VAL_5:.*]] = %[[VAL_0]] to %[[VAL_1]] step %[[VAL_2]] : !emitc.size_t {
 // CHECK-NEXT:        %[[VAL_6:.*]] = arith.constant 1 : index
 // CHECK-NEXT:      }
 // CHECK-NEXT:    }
@@ -44,14 +50,17 @@ func.func @for_yield(%arg0 : index, %arg1 : index, %arg2 : index) -> (f32, f32)
   return %result#0, %result#1 : f32, f32
 }
 // CHECK-LABEL: func.func @for_yield(
-// CHECK-SAME:      %[[VAL_0:.*]]: index, %[[VAL_1:.*]]: index, %[[VAL_2:.*]]: index) -> (f32, f32) {
+// CHECK-SAME:      %[[ARG_0:.*]]: index, %[[ARG_1:.*]]: index, %[[ARG_2:.*]]: index) -> (f32, f32) {
+// CHECK-NEXT:    %[[VAL_2:.*]] = builtin.unrealized_conversion_cast %[[ARG_2]] : index to !emitc.size_t
+// CHECK-NEXT:    %[[VAL_1:.*]] = builtin.unrealized_conversion_cast %[[ARG_1]] : index to !emitc.size_t
+// CHECK-NEXT:    %[[VAL_0:.*]] = builtin.unrealized_conversion_cast %[[ARG_0]] : index to !emitc.size_t
 // CHECK-NEXT:    %[[VAL_3:.*]] = arith.constant 0.000000e+00 : f32
 // CHECK-NEXT:    %[[VAL_4:.*]] = arith.constant 1.000000e+00 : f32
 // CHECK-NEXT:    %[[VAL_5:.*]] = "emitc.variable"() <{value = #emitc.opaque<"">}> : () -> !emitc.lvalue<f32>
 // CHECK-NEXT:    %[[VAL_6:.*]] = "emitc.variable"() <{value = #emitc.opaque<"">}> : () -> !emitc.lvalue<f32>
 // CHECK-NEXT:    emitc.assign %[[VAL_3]] : f32 to %[[VAL_5]] : <f32>
 // CHECK-NEXT:    emitc.assign %[[VAL_4]] : f32 to %[[VAL_6]] : <f32>
-// CHECK-NEXT:    emitc.for %[[VAL_7:.*]] = %[[VAL_0]] to %[[VAL_1]] step %[[VAL_2]] {
+// CHECK-NEXT:    emitc.for %[[VAL_7:.*]] = %[[VAL_0]] to %[[VAL_1]] step %[[VAL_2]] : !emitc.size_t {
 // CHECK-NEXT:      %[[VAL_8:.*]] = emitc.load %[[VAL_5]] : <f32>
 // CHECK-NEXT:      %[[VAL_9:.*]] = emitc.load %[[VAL_6]] : <f32>
 // CHECK-NEXT:      %[[VAL_10:.*]] = arith.addf %[[VAL_8]], %[[VAL_9]] : f32
@@ -75,15 +84,18 @@ func.func @nested_for_yield(%arg0 : index, %arg1 : index, %arg2 : index) -> f32
   return %r : f32
 }
 // CHECK-LABEL: func.func @nested_for_yield(
-// CHECK-SAME:      %[[VAL_0:.*]]: index, %[[VAL_1:.*]]: index, %[[VAL_2:.*]]: index) -> f32 {
+// CHECK-SAME:      %[[ARG_0:.*]]: index, %[[ARG_1:.*]]: index, %[[ARG_2:.*]]: index) -> f32 {
+// CHECK-NEXT:    %[[VAL_2:.*]] = builtin.unrealized_conversion_cast %[[ARG_2]] : index to !emitc.size_t
+// CHECK-NEXT:    %[[VAL_1:.*]] = builtin.unrealized_conversion_cast %[[ARG_1]] : index to !emitc.size_t
+// CHECK-NEXT:    %[[VAL_0:.*]] = builtin.unrealized_conversion_cast %[[ARG_0]] : index to !emitc.size_t
 // CHECK-NEXT:    %[[VAL_3:.*]] = arith.constant 1.000000e+00 : f32
 // CHECK-NEXT:    %[[VAL_4:.*]] = "emitc.variable"() <{value = #emitc.opaque<"">}> : () -> !emitc.lvalue<f32>
 // CHECK-NEXT:    emitc.assign %[[VAL_3]] : f32 to %[[VAL_4]] : <f32>
-// CHECK-NEXT:    emitc.for %[[VAL_5:.*]] = %[[VAL_0]] to %[[VAL_1]] step %[[VAL_2]] {
+// CHECK-NEXT:    emitc.for %[[VAL_5:.*]] = %[[VAL_0]] to %[[VAL_1]] step %[[VAL_2]] : !emitc.size_t {
 // CHECK-NEXT:      %[[VAL_6:.*]] = emitc.load %[[VAL_4]] : <f32>
 // CHECK-NEXT:      %[[VAL_7:.*]] = "emitc.variable"() <{value = #emitc.opaque<"">}> : () -> !emitc.lvalue<f32>
 // CHECK-NEXT:      emitc.assign %[[VAL_6]] : f32 to %[[VAL_7]] : <f32>
-// CHECK-NEXT:      emitc.for %[[VAL_8:.*]] = %[[VAL_0]] to %[[VAL_1]] step %[[VAL_2]] {
+// CHECK-NEXT:      emitc.for %[[VAL_8:.*]] = %[[VAL_0]] to %[[VAL_1]] step %[[VAL_2]] : !emitc.size_t {
 // CHECK-NEXT:        %[[VAL_9:.*]] = emitc.load %[[VAL_7]] : <f32>  
 // CHECK-NEXT:        %[[VAL_10:.*]] = arith.addf %[[VAL_9]], %[[VAL_9]] : f32
 // CHECK-NEXT:        emitc.assign %[[VAL_10]] : f32 to %[[VAL_7]] : <f32>
@@ -94,3 +106,60 @@ func.func @nested_for_yield(%arg0 : index, %arg1 : index, %arg2 : index) -> f32
 // CHECK-NEXT:    %[[VAL_12:.*]] = emitc.load %[[VAL_4]] : <f32>  
 // CHECK-NEXT:    return %[[VAL_12]] : f32
 // CHECK-NEXT:  }
+
+func.func @for_yield_index(%arg0 : index, %arg1 : index, %arg2 : index) -> index {
+  %zero = arith.constant 0 : index
+  %r = scf.for %i0 = %arg0 to %arg1 step %arg2 iter_args(%acc = %zero) -> index {
+    scf.yield %acc : index
+  }
+  return %r : index
+}
+
+// CHECK-LABEL: func.func @for_yield_index(
+// CHECK-SAME: %[[ARG_0:.*]]: index, %[[ARG_1:.*]]: index, %[[ARG_2:.*]]: index) -> index {
+// CHECK:     %[[VAL_0:.*]] = builtin.unrealized_conversion_cast %[[ARG_2]] : index to !emitc.size_t
+// CHECK:     %[[VAL_1:.*]] = builtin.unrealized_conversion_cast %[[ARG_1]] : index to !emitc.size_t
+// CHECK:     %[[VAL_2:.*]] = builtin.unrealized_conversion_cast %[[ARG_0]] : index to !emitc.size_t
+// CHECK:     %[[C0:.*]] = arith.constant 0 : index
+// CHECK:     %[[VAL_3:.*]] = builtin.unrealized_conversion_cast %[[C0]] : index to !emitc.size_t
+// CHECK:     %[[VAL_4:.*]] = "emitc.variable"() <{value = #emitc.opaque<"">}> : () -> !emitc.lvalue<!emitc.size_t>
+// CHECK:     emitc.assign %[[VAL_3]] : !emitc.size_t to %[[VAL_4]] : <!emitc.size_t>
+// CHECK:     emitc.for %[[VAL_5:.*]] = %[[VAL_2]] to %[[VAL_1]] step %[[VAL_0]] : !emitc.size_t {
+// CHECK:       %[[V:.*]] = emitc.load %[[VAL_4]] : <!emitc.size_t>
+// CHECK:       emitc.assign %[[V]] : !emitc.size_t to %[[VAL_4]] : <!emitc.size_t>
+// CHECK:     }
+// CHECK:     %[[V2:.*]] = emitc.load %[[VAL_4]] : <!emitc.size_t>
+// CHECK:     %[[VAL_8:.*]] = builtin.unrealized_conversion_cast %[[V2]] : !emitc.size_t to index
+// CHECK:     return %[[VAL_8]] : index
+// CHECK:   }
+
+
+func.func @for_yield_update_loop_carried_var(%arg0 : index, %arg1 : index, %arg2 : index) -> index {
+   %zero = arith.constant 0 : index
+   %r = scf.for %i0 = %arg0 to %arg1 step %arg2 iter_args(%acc = %zero) -> index {
+     %sn = arith.addi %acc, %acc : index
+     scf.yield %sn: index
+   }
+   return %r : index
+ }
+
+// CHECK-LABEL: func.func @for_yield_update_loop_carried_var(
+// CHECK-SAME:  %[[ARG_0:.*]]: index, %[[ARG_1:.*]]: index, %[[ARG_2:.*]]: index) -> index {
+// CHECK:   %[[VAL_0:.*]] = builtin.unrealized_conversion_cast %[[ARG_2]] : index to !emitc.size_t
+// CHECK:   %[[VAL_1:.*]] = builtin.unrealized_conversion_cast %[[ARG_1]] : index to !emitc.size_t
+// CHECK:   %[[VAL_2:.*]] = builtin.unrealized_conversion_cast %[[ARG_0]] : index to !emitc.size_t
+// CHECK:   %[[C0:.*]] = arith.constant 0 : index
+// CHECK:   %[[VAL_3:.*]] = builtin.unrealized_conversion_cast %[[C0]] : index to !emitc.size_t
+// CHECK:   %[[VAL_4:.*]] = "emitc.variable"() <{value = #emitc.opaque<"">}> : () -> !emitc.lvalue<!emitc.size_t>
+// CHECK:   emitc.assign %[[VAL_3]] : !emitc.size_t to %[[VAL_4]] : <!emitc.size_t>
+// CHECK:   emitc.for %[[ARG_3:.*]] = %[[VAL_2]] to %[[VAL_1]] step %[[VAL_0]] : !emitc.size_t {
+// CHECK:     %[[V:.*]] = emitc.load %[[VAL_4]] : <!emitc.size_t>
+// CHECK:     %[[VAL_5:.*]] = builtin.unrealized_conversion_cast %[[V]] : !emitc.size_t to index
+// CHECK:     %[[VAL_6:.*]] = arith.addi %[[VAL_5]], %[[VAL_5]] : index
+// CHECK:     %[[VAL_8:.*]] = builtin.unrealized_conversion_cast %[[VAL_6]] : index to !emitc.size_t
+// CHECK:     emitc.assign %[[VAL_8]] : !emitc.size_t to %[[VAL_4]] : <!emitc.size_t>
+// CHECK:   }
+// CHECK:   %[[V2:.*]] = emitc.load %[[VAL_4]] : <!emitc.size_t>
+// CHECK:   %[[VAL_9:.*]] = builtin.unrealized_conversion_cast %[[V2]] : !emitc.size_t to index
+// CHECK:   return %[[VAL_9]] : index
+// CHECK: }
diff --git a/mlir/test/Conversion/SCFToEmitC/switch.mlir b/mlir/test/Conversion/SCFToEmitC/switch.mlir
index 86d96ed..61015b0 100644
--- a/mlir/test/Conversion/SCFToEmitC/switch.mlir
+++ b/mlir/test/Conversion/SCFToEmitC/switch.mlir
@@ -1,7 +1,8 @@
 // RUN: mlir-opt -allow-unregistered-dialect -convert-scf-to-emitc %s | FileCheck %s
 
 // CHECK-LABEL:   func.func @switch_no_result(
-// CHECK-SAME:                                %[[VAL_0:.*]]: index) {
+// CHECK-SAME:                                %[[ARG_0:.*]]: index) {
+// CHECK:           %[[VAL_0:.*]] = builtin.unrealized_conversion_cast %[[ARG_0]] : index to !emitc.size_t
 // CHECK:           emitc.switch %[[VAL_0]]
 // CHECK:           case 2 {
 // CHECK:             %[[VAL_1:.*]] = arith.constant 10 : i32
@@ -33,7 +34,8 @@ func.func @switch_no_result(%arg0 : index) {
 }
 
 // CHECK-LABEL:   func.func @switch_one_result(
-// CHECK-SAME:                                 %[[VAL_0:.*]]: index) {
+// CHECK-SAME:                                 %[[ARG_0:.*]]: index) {
+// CHECK:           %[[VAL_0:.*]] = builtin.unrealized_conversion_cast %[[ARG_0]] : index to !emitc.size_t
 // CHECK:           %[[VAL_1:.*]] = "emitc.variable"() <{value = #emitc.opaque<"">}> : () -> !emitc.lvalue<i32>
 // CHECK:           emitc.switch %[[VAL_0]]
 // CHECK:           case 2 {
@@ -70,7 +72,8 @@ func.func @switch_one_result(%arg0 : index) {
 }
 
 // CHECK-LABEL:   func.func @switch_two_results(
-// CHECK-SAME:                                  %[[VAL_0:.*]]: index) -> (i32, f32) {
+// CHECK-SAME:                                  %[[ARG_0:.*]]: index) -> (i32, f32) {
+// CHECK:           %[[VAL_0:.*]] = builtin.unrealized_conversion_cast %[[ARG_0]] : index to !emitc.size_t
 // CHECK:           %[[VAL_1:.*]] = "emitc.variable"() <{value = #emitc.opaque<"">}> : () -> !emitc.lvalue<i32>
 // CHECK:           %[[VAL_2:.*]] = "emitc.variable"() <{value = #emitc.opaque<"">}> : () -> !emitc.lvalue<f32>
 // CHECK:           emitc.switch %[[VAL_0]]
diff --git a/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg-named.mlir b/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg-named.mlir
index bfdc72e..453a861 100644
--- a/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg-named.mlir
+++ b/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg-named.mlir
@@ -510,7 +510,7 @@ func.func @avg_pool_dyn(%arg0: tensor<?x6x34x62xf32>) -> (tensor<?x5x33x62xf32>)
 func.func @conv2d_scalar_bias_f32(%input: tensor<1x49x42x27xf32>, %weights: tensor<28x3x3x27xf32>, %bias: tensor<1xf32>) -> () {
   // CHECK: %[[INIT:.+]] = tensor.empty() : tensor<1x45x40x28xf32>
   // CHECK: %[[BROADCAST:.+]] = linalg.generic {indexing_maps = [#[[$MAP1]], #[[$MAP2]]], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg2 : tensor<1xf32>) outs(%[[INIT]] : tensor<1x45x40x28xf32>) {
-  %0 = tosa.conv2d %input, %weights, %bias {pad = array<i64: 0, 0, 0, 0>, stride = array<i64: 1, 1>, dilation = array<i64: 2, 1>} : (tensor<1x49x42x27xf32>, tensor<28x3x3x27xf32>, tensor<1xf32>) -> tensor<1x45x40x28xf32>
+  %0 = tosa.conv2d %input, %weights, %bias {acc_type = f32, pad = array<i64: 0, 0, 0, 0>, stride = array<i64: 1, 1>, dilation = array<i64: 2, 1>} : (tensor<1x49x42x27xf32>, tensor<28x3x3x27xf32>, tensor<1xf32>) -> tensor<1x45x40x28xf32>
   return
 }
 
@@ -531,7 +531,7 @@ func.func @conv2d_i8(%input: tensor<1x49x42x27xi8>, %weights: tensor<28x1x1x27xi
   // CHECK: linalg.conv_2d_nhwc_fhwc_q {dilations = dense<[2, 1]> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} ins(%arg0, %arg1, %c0_i32, %c0_i32_0 : tensor<1x49x42x27xi8>, tensor<28x1x1x27xi8>, i32, i32) outs(%[[BROADCAST]] : tensor<1x45x40x28xi32>) -> tensor<1x45x40x28xi32>
   // HWCF: linalg.conv_2d_nhwc_hwcf_q {dilations = dense<[2, 1]> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} ins(%arg0, %[[TRANSPOSE]], %c0_i32, %c0_i32_0 : tensor<1x49x42x27xi8>, tensor<1x1x27x28xi8>, i32, i32) outs(%{{[a-zA-Z0-9_]*}} : tensor<1x45x40x28xi32>) -> tensor<1x45x40x28xi32>
 
-  %0 = tosa.conv2d %input, %weights, %bias {dilation = array<i64: 2, 1>, pad = array<i64: 0, 0, 0, 0>, quantization_info = #tosa.conv_quant<input_zp = 0, weight_zp = 0>, stride = array<i64: 1, 1>} : (tensor<1x49x42x27xi8>, tensor<28x1x1x27xi8>, tensor<28xi8>) -> tensor<1x45x40x28xi32>
+  %0 = tosa.conv2d %input, %weights, %bias {acc_type = i32, dilation = array<i64: 2, 1>, pad = array<i64: 0, 0, 0, 0>, quantization_info = #tosa.conv_quant<input_zp = 0, weight_zp = 0>, stride = array<i64: 1, 1>} : (tensor<1x49x42x27xi8>, tensor<28x1x1x27xi8>, tensor<28xi8>) -> tensor<1x45x40x28xi32>
   return
 }
 
@@ -552,7 +552,7 @@ func.func @conv2d_f32(%input: tensor<1x49x42x27xf32>, %weights: tensor<28x3x3x27
   // CHECK: linalg.conv_2d_nhwc_fhwc {dilations = dense<[2, 1]> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} ins(%arg0, %arg1 : tensor<1x49x42x27xf32>, tensor<28x3x3x27xf32>) outs(%1 : tensor<1x45x40x28xf32>) -> tensor<1x45x40x28xf32>
 
   // HWCF: linalg.conv_2d_nhwc_hwcf {dilations = dense<[2, 1]> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} ins(%arg0, %[[TRANSPOSE]] : tensor<1x49x42x27xf32>, tensor<3x3x27x28xf32>) outs(%{{[a-zA-Z0-9_]*}} : tensor<1x45x40x28xf32>
-  %0 = tosa.conv2d %input, %weights, %bias {pad = array<i64: 0, 0, 0, 0>, stride = array<i64: 1, 1>, dilation = array<i64: 2, 1>} : (tensor<1x49x42x27xf32>, tensor<28x3x3x27xf32>, tensor<28xf32>) -> tensor<1x45x40x28xf32>
+  %0 = tosa.conv2d %input, %weights, %bias {acc_type = f32, pad = array<i64: 0, 0, 0, 0>, stride = array<i64: 1, 1>, dilation = array<i64: 2, 1>} : (tensor<1x49x42x27xf32>, tensor<28x3x3x27xf32>, tensor<28xf32>) -> tensor<1x45x40x28xf32>
   return
 }
 
@@ -571,7 +571,7 @@ func.func @conv2d_dyn(%input: tensor<?x49x42x27xf32>, %weights: tensor<28x3x3x27
   // CHECK:   linalg.yield %[[IN]] : f32
   // CHECK: } -> tensor<?x45x40x28xf32>
   // CHECK: %2 = linalg.conv_2d_nhwc_fhwc {dilations = dense<[2, 1]> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} ins(%arg0, %arg1 : tensor<?x49x42x27xf32>, tensor<28x3x3x27xf32>) outs(%[[BROADCAST]] : tensor<?x45x40x28xf32>) -> tensor<?x45x40x28xf32>
-  %0 = tosa.conv2d %input, %weights, %bias {pad = array<i64: 0, 0, 0, 0>, stride = array<i64: 1, 1>, dilation = array<i64: 2, 1>} : (tensor<?x49x42x27xf32>, tensor<28x3x3x27xf32>, tensor<28xf32>) -> tensor<?x45x40x28xf32>
+  %0 = tosa.conv2d %input, %weights, %bias {acc_type = f32, pad = array<i64: 0, 0, 0, 0>, stride = array<i64: 1, 1>, dilation = array<i64: 2, 1>} : (tensor<?x49x42x27xf32>, tensor<28x3x3x27xf32>, tensor<28xf32>) -> tensor<?x45x40x28xf32>
   return
 }
 
@@ -627,7 +627,7 @@ func.func @conv2d_dyn_w_h(%input: tensor<1x?x?x27xf32>, %weights: tensor<28x3x3x
   // CHECK: } -> tensor<1x?x?x28xf32>
   // CHECK: linalg.conv_2d_nhwc_fhwc {dilations = dense<[2, 1]> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} ins(%arg0, %arg1 : tensor<1x?x?x27xf32>, tensor<28x3x3x27xf32>) outs(%17 : tensor<1x?x?x28xf32>) -> tensor<1x?x?x28xf32>
 
-  %0 = tosa.conv2d %input, %weights, %bias {pad = array<i64: 0, 0, 0, 0>, stride = array<i64: 1, 1>, dilation = array<i64: 2, 1>} : (tensor<1x?x?x27xf32>, tensor<28x3x3x27xf32>, tensor<28xf32>) -> tensor<1x?x?x28xf32>
+  %0 = tosa.conv2d %input, %weights, %bias {acc_type = f32, pad = array<i64: 0, 0, 0, 0>, stride = array<i64: 1, 1>, dilation = array<i64: 2, 1>} : (tensor<1x?x?x27xf32>, tensor<28x3x3x27xf32>, tensor<28xf32>) -> tensor<1x?x?x28xf32>
   return
 }
 
@@ -650,7 +650,7 @@ func.func @conv2d_dyn_output(%input: tensor<2x6x5x4xf32>, %weights: tensor<4x3x3
   //   linalg.yield %[[ADD]] : f32
   // } -> tensor<?x4x3x4xf32>
 
-  %0 = tosa.conv2d %input, %weights, %bias {dilation = array<i64: 1, 1>, pad = array<i64: 0, 0, 0, 0>, stride = array<i64: 1, 1>} : (tensor<2x6x5x4xf32    >, tensor<4x3x3x4xf32>, tensor<4xf32>) -> tensor<?x4x3x4xf32>
+  %0 = tosa.conv2d %input, %weights, %bias {acc_type = f32, dilation = array<i64: 1, 1>, pad = array<i64: 0, 0, 0, 0>, stride = array<i64: 1, 1>} : (tensor<2x6x5x4xf32    >, tensor<4x3x3x4xf32>, tensor<4xf32>) -> tensor<?x4x3x4xf32>
   return
 }
 
@@ -662,7 +662,7 @@ func.func @conv2d_padded_f32(%input: tensor<1x47x40x28xf32>, %weights: tensor<28
   // CHECK: tensor.pad %arg0 low[0, 1, 1, 0] high[0, 1, 1, 0]
   // CHECK:   tensor.yield %[[C0]]
   // CHECK: linalg.conv_2d_nhwc_fhwc
-  %0 = tosa.conv2d %input, %weights, %bias {pad = array<i64: 1, 1, 1, 1>, stride = array<i64: 1, 1>, dilation = array<i64: 2, 1>} : (tensor<1x47x40x28xf32>, tensor<28x3x3x28xf32>, tensor<28xf32>) -> tensor<1x45x40x28xf32>
+  %0 = tosa.conv2d %input, %weights, %bias {acc_type = f32, pad = array<i64: 1, 1, 1, 1>, stride = array<i64: 1, 1>, dilation = array<i64: 2, 1>} : (tensor<1x47x40x28xf32>, tensor<28x3x3x28xf32>, tensor<28xf32>) -> tensor<1x45x40x28xf32>
   return
 }
 
@@ -674,7 +674,7 @@ func.func @conv2d_quant(%arg0 : tensor<1x12x12x1xi8>, %arg1 : tensor<1024x3x3x1x
   // CHECK: tensor.pad %arg0 low[0, 1, 1, 0] high[0, 1, 1, 0]
   // CHECK:   tensor.yield %[[C22]]
   // CHECK: linalg.conv_2d_nhwc_fhwc_q
-  %0 = tosa.conv2d %arg0, %arg1, %arg2 {dilation = array<i64: 1, 1>, pad = array<i64: 1, 1, 1, 1>, quantization_info = #tosa.conv_quant<input_zp = -22, weight_zp = 42>, stride = array<i64: 1, 1>} : (tensor<1x12x12x1xi8>, tensor<1024x3x3x1xi8>, tensor<1024xi32>) -> tensor<1x12x12x1024xi32>
+  %0 = tosa.conv2d %arg0, %arg1, %arg2 {acc_type = i32, dilation = array<i64: 1, 1>, pad = array<i64: 1, 1, 1, 1>, quantization_info = #tosa.conv_quant<input_zp = -22, weight_zp = 42>, stride = array<i64: 1, 1>} : (tensor<1x12x12x1xi8>, tensor<1024x3x3x1xi8>, tensor<1024xi32>) -> tensor<1x12x12x1024xi32>
   return
 }
 
@@ -696,7 +696,7 @@ func.func @depthwise_conv(%arg0 : tensor<1x7x5x3xf32>, %arg1 : tensor<3x1x3x11xf
   // CHECK:   [[ADD:%.+]] = arith.addf %[[ARG3]], %[[ARG4]] : f32
   // CHECK:   linalg.yield [[ADD]] : f32
   // CHECK: } -> tensor<1x5x5x33xf32>
-  %2 = tosa.depthwise_conv2d %arg0, %arg1, %arg2 { pad = array<i64: 0, 0, 0, 0>, stride = array<i64: 1, 1>, dilation = array<i64: 1, 1> } : (tensor<1x7x5x3xf32>, tensor<3x1x3x11xf32>, tensor<33xf32>)  -> tensor<1x5x5x33xf32>
+  %2 = tosa.depthwise_conv2d %arg0, %arg1, %arg2 { acc_type = f32, pad = array<i64: 0, 0, 0, 0>, stride = array<i64: 1, 1>, dilation = array<i64: 1, 1> } : (tensor<1x7x5x3xf32>, tensor<3x1x3x11xf32>, tensor<33xf32>)  -> tensor<1x5x5x33xf32>
   return
 }
 
@@ -712,7 +712,7 @@ func.func @depthwise_conv_scalar_bias(%arg0 : tensor<1x7x5x3xf32>, %arg1 : tenso
   // CHECK:   [[ADD:%.+]] = arith.addf %[[ARG3]], %[[ARG4]] : f32
   // CHECK:   linalg.yield [[ADD]] : f32
   // CHECK: } -> tensor<1x5x5x33xf32>
-  %2 = tosa.depthwise_conv2d %arg0, %arg1, %arg2 { pad = array<i64: 0, 0, 0, 0>, stride = array<i64: 1, 1>, dilation = array<i64: 1, 1> } : (tensor<1x7x5x3xf32>, tensor<3x1x3x11xf32>, tensor<1xf32>)  -> tensor<1x5x5x33xf32>
+  %2 = tosa.depthwise_conv2d %arg0, %arg1, %arg2 {acc_type = f32, pad = array<i64: 0, 0, 0, 0>, stride = array<i64: 1, 1>, dilation = array<i64: 1, 1> } : (tensor<1x7x5x3xf32>, tensor<3x1x3x11xf32>, tensor<1xf32>)  -> tensor<1x5x5x33xf32>
   return
 }
 
@@ -736,7 +736,7 @@ func.func @depthwise_conv_dyn(%arg0 : tensor<?x7x5x3xf32>, %arg1 : tensor<3x1x3x
   // CHECK:   %[[ADD:.+]] = arith.addf %[[ARG3]], %[[ARG4]] : f32
   // CHECK:   linalg.yield %[[ADD]] : f32
   // CHECK: } -> tensor<?x5x5x33xf32>
-  %2 = tosa.depthwise_conv2d %arg0, %arg1, %arg2 { pad = array<i64: 0, 0, 0, 0>, stride = array<i64: 1, 1>, dilation = array<i64: 1, 1> } : (tensor<?x7x5x3xf32>, tensor<3x1x3x11xf32>, tensor<33xf32>)  -> tensor<?x5x5x33xf32>
+  %2 = tosa.depthwise_conv2d %arg0, %arg1, %arg2 { acc_type = f32, pad = array<i64: 0, 0, 0, 0>, stride = array<i64: 1, 1>, dilation = array<i64: 1, 1> } : (tensor<?x7x5x3xf32>, tensor<3x1x3x11xf32>, tensor<33xf32>)  -> tensor<?x5x5x33xf32>
   return
 }
 
@@ -758,7 +758,7 @@ func.func @depthwise_conv_strides(%arg0 : tensor<1x11x9x3xf32>, %arg1 : tensor<3
   // CHECK:   [[ADD:%.+]] = arith.addf %[[ARG3]], %[[ARG4]] : f32
   // CHECK:   linalg.yield [[ADD]] : f32
   // CHECK: } -> tensor<1x5x5x33xf32>
-  %2 = tosa.depthwise_conv2d %arg0, %arg1, %arg2 { pad = array<i64: 0, 0, 0, 0>, stride = array<i64: 2, 2>, dilation = array<i64: 1, 1> } : (tensor<1x11x9x3xf32>, tensor<3x1x3x11xf32>, tensor<33xf32>)  -> tensor<1x5x5x33xf32>
+  %2 = tosa.depthwise_conv2d %arg0, %arg1, %arg2 { acc_type = f32, pad = array<i64: 0, 0, 0, 0>, stride = array<i64: 2, 2>, dilation = array<i64: 1, 1> } : (tensor<1x11x9x3xf32>, tensor<3x1x3x11xf32>, tensor<33xf32>)  -> tensor<1x5x5x33xf32>
   return
 }
 
@@ -786,7 +786,7 @@ func.func @depthwise_conv_quant(%arg0 : tensor<1x12x12x4xi8>, %arg1 : tensor<3x3
   // CHECK:   [[ADD:%.+]] = arith.addi %[[ARG3]], %[[ARG4]] : i32
   // CHECK:   linalg.yield [[ADD]] : i32
   // CHECK: } -> tensor<1x12x12x512xi32>
-  %0 = tosa.depthwise_conv2d %arg0, %arg1, %arg2 {pad = array<i64: 1, 1, 1, 1>, quantization_info = #tosa.conv_quant<input_zp = -128, weight_zp = 42>, stride = array<i64: 1, 1>, dilation = array<i64: 1, 1> } : (tensor<1x12x12x4xi8>, tensor<3x3x4x128xi8>, tensor<512xi32>) -> tensor<1x12x12x512xi32>
+  %0 = tosa.depthwise_conv2d %arg0, %arg1, %arg2 {acc_type = i32, pad = array<i64: 1, 1, 1, 1>, quantization_info = #tosa.conv_quant<input_zp = -128, weight_zp = 42>, stride = array<i64: 1, 1>, dilation = array<i64: 1, 1> } : (tensor<1x12x12x4xi8>, tensor<3x3x4x128xi8>, tensor<512xi32>) -> tensor<1x12x12x512xi32>
   return
 }
 
@@ -810,7 +810,7 @@ func.func @depthwise_conv_quant_dilations(%arg0 : tensor<1x14x14x4xi8>, %arg1 :
   // CHECK:   [[ADD:%.+]] = arith.addi %[[ARG3]], %[[ARG4]] : i32
   // CHECK:   linalg.yield [[ADD]] : i32
   // CHECK: } -> tensor<1x10x10x512xi32>
-  %0 = tosa.depthwise_conv2d %arg0, %arg1, %arg2 {pad = array<i64: 0, 0, 0, 0>, quantization_info = #tosa.conv_quant<input_zp = -128, weight_zp = 42>, stride = array<i64: 1, 1>, dilation = array<i64: 2, 2> } : (tensor<1x14x14x4xi8>, tensor<3x3x4x128xi8>, tensor<512xi32>)  -> tensor<1x10x10x512xi32>
+  %0 = tosa.depthwise_conv2d %arg0, %arg1, %arg2 {acc_type = i32, pad = array<i64: 0, 0, 0, 0>, quantization_info = #tosa.conv_quant<input_zp = -128, weight_zp = 42>, stride = array<i64: 1, 1>, dilation = array<i64: 2, 2> } : (tensor<1x14x14x4xi8>, tensor<3x3x4x128xi8>, tensor<512xi32>)  -> tensor<1x10x10x512xi32>
   return
 }
 
@@ -826,7 +826,7 @@ func.func @depthwise_conv2d_dyn_w_h(%arg0: tensor<2x?x?x3xf32>, %arg1: tensor<3x
   // CHECK:  } : tensor<2x?x?x3xf32> to tensor<2x?x?x3xf32>
   // CHECK: %[[CONV:.+]] = linalg.depthwise_conv_2d_nhwc_hwcm {dilations = dense<[2, 1]> : tensor<2xi64>, strides = dense<[1, 2]> : tensor<2xi64>} ins(%[[PADDED]], %arg1 : tensor<2x?x?x3xf32>, tensor<3x6x3x5xf32>) outs(%{{.*}} : tensor<2x?x?x3x5xf32>) -> tensor<2x?x?x3x5xf32>
   // CHECK: %[[COLLAPSED:.+]] = tensor.collapse_shape %[[CONV]] {{\[}}[0], [1], [2], [3, 4]]
-  %0 = tosa.depthwise_conv2d %arg0, %arg1, %arg2 {pad = array<i64: 1, 2, 3, 4>, dilation = array<i64: 2, 1>, stride = array<i64: 1, 2>} : (tensor<2x?x?x3xf32>, tensor<3x6x3x5xf32>, tensor<15xf32>) -> tensor<2x?x?x15xf32>
+  %0 = tosa.depthwise_conv2d %arg0, %arg1, %arg2 {acc_type = f32, pad = array<i64: 1, 2, 3, 4>, dilation = array<i64: 2, 1>, stride = array<i64: 1, 2>} : (tensor<2x?x?x3xf32>, tensor<3x6x3x5xf32>, tensor<15xf32>) -> tensor<2x?x?x15xf32>
   return
 }
 
@@ -850,7 +850,7 @@ func.func @conv3d_f32(%input: tensor<1x49x48x47x27xf32>, %weights: tensor<28x3x4
   // CHECK-SAME: {dilations = dense<1> : tensor<3xi64>, strides = dense<1> : tensor<3xi64>}
   // CHECK-SAME: ins(%arg0, %[[TRANSPOSE]] : tensor<1x49x48x47x27xf32>, tensor<3x4x5x27x28xf32>)
   // CHECK-SAME: outs(%[[BROADCAST]] : tensor<1x47x45x43x28xf32>) -> tensor<1x47x45x43x28xf32>
-  %0 = tosa.conv3d %input, %weights, %bias {pad = array<i64: 0, 0, 0, 0, 0, 0>, stride = array<i64: 1, 1, 1>, dilation = array<i64: 1, 1, 1>} : (tensor<1x49x48x47x27xf32>, tensor<28x3x4x5x27xf32>, tensor<28xf32>)  -> tensor<1x47x45x43x28xf32>
+  %0 = tosa.conv3d %input, %weights, %bias {acc_type = f32, pad = array<i64: 0, 0, 0, 0, 0, 0>, stride = array<i64: 1, 1, 1>, dilation = array<i64: 1, 1, 1>} : (tensor<1x49x48x47x27xf32>, tensor<28x3x4x5x27xf32>, tensor<28xf32>)  -> tensor<1x47x45x43x28xf32>
   return
 }
 
@@ -864,7 +864,7 @@ func.func @conv3d_scalar_bias_f32(%input: tensor<1x49x48x47x27xf32>, %weights: t
   // CHECK:  %[[INIT:.+]] = tensor.empty() : tensor<1x47x45x43x28xf32>
   // CHECK:      %[[BROADCAST:.+]] = linalg.generic
   // CHECK-SAME: {indexing_maps = [#[[$MAP1]], #[[$MAP2]]], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel"]}
-  %0 = tosa.conv3d %input, %weights, %bias {pad = array<i64: 0, 0, 0, 0, 0, 0>, stride = array<i64: 1, 1, 1>, dilation = array<i64: 1, 1, 1>} : (tensor<1x49x48x47x27xf32>, tensor<28x3x4x5x27xf32>, tensor<1xf32>)  -> tensor<1x47x45x43x28xf32>
+  %0 = tosa.conv3d %input, %weights, %bias {acc_type = f32, pad = array<i64: 0, 0, 0, 0, 0, 0>, stride = array<i64: 1, 1, 1>, dilation = array<i64: 1, 1, 1>} : (tensor<1x49x48x47x27xf32>, tensor<28x3x4x5x27xf32>, tensor<1xf32>)  -> tensor<1x47x45x43x28xf32>
   return
 }
 
@@ -892,7 +892,7 @@ func.func @conv3d_i8(%input: tensor<1x49x48x47x27xi8>, %weights: tensor<28x3x4x5
   // CHECK-SAME: ins(%arg0, %[[TRANSPOSE]], %[[IZP]], %[[FZP]] : tensor<1x49x48x47x27xi8>, tensor<3x4x5x27x28xi8>, i32, i32)
   // CHECK-SAME: outs(%[[BROADCAST]] : tensor<1x47x45x43x28xi32>) -> tensor<1x47x45x43x28xi32>
 
-  %0 = tosa.conv3d %input, %weights, %bias {pad = array<i64: 0, 0, 0, 0, 0, 0>, quantization_info = #tosa.conv_quant<input_zp = -128, weight_zp = 42>, stride = array<i64: 1, 1, 1>, dilation = array<i64: 1, 1, 1>} : (tensor<1x49x48x47x27xi8>, tensor<28x3x4x5x27xi8>, tensor<28xi32>)  -> tensor<1x47x45x43x28xi32>
+  %0 = tosa.conv3d %input, %weights, %bias {acc_type = i32, pad = array<i64: 0, 0, 0, 0, 0, 0>, quantization_info = #tosa.conv_quant<input_zp = -128, weight_zp = 42>, stride = array<i64: 1, 1, 1>, dilation = array<i64: 1, 1, 1>} : (tensor<1x49x48x47x27xi8>, tensor<28x3x4x5x27xi8>, tensor<28xi32>)  -> tensor<1x47x45x43x28xi32>
   return
 }
 
diff --git a/mlir/test/Conversion/TosaToTensor/tosa-to-tensor.mlir b/mlir/test/Conversion/TosaToTensor/tosa-to-tensor.mlir
index 1e62e25..0b9a644 100644
--- a/mlir/test/Conversion/TosaToTensor/tosa-to-tensor.mlir
+++ b/mlir/test/Conversion/TosaToTensor/tosa-to-tensor.mlir
@@ -459,85 +459,65 @@ func.func @slice_dyn(%arg0: tensor<?xf32>) -> (tensor<?xf32>) {
 // CHECK-LABEL: @pad_float
 // CHECK-SAME: (%[[ARG0:[0-9a-zA-Z_]*]]:
 func.func @pad_float(%arg0 : tensor<1x2xf32>) -> (tensor<4x9xf32>) {
-  %0 = arith.constant dense<[[1, 2], [3, 4]]> : tensor<2x2xi32>
-  // TODO: Output contains multiple "arith.constant 1 : index".
-  // CHECK-DAG: [[INDEX1:%.+]] = arith.constant 1 : index
-  // CHECK-DAG: [[INDEX2:%.+]] = arith.constant 2 : index
-  // CHECK-DAG: [[INDEX3:%.+]] = arith.constant 3 : index
-  // CHECK-DAG: [[INDEX4:%.+]] = arith.constant 4 : index
+  %0 = arith.constant dense<[1, 2, 3, 4]> : tensor<4xi32>
   // CHECK-DAG: [[CST:%.+]] = arith.constant 0.000000e+00 : f32
-  // CHECK: tensor.pad %[[ARG0]] low{{\[}}%{{.*}}, [[INDEX3]]] high{{\[}}[[INDEX2]], [[INDEX4]]]  {
+  // CHECK: tensor.pad %[[ARG0]] low{{\[}}%{{.*}}, %{{.*}}] high{{\[}}%{{.*}}, %{{.*}}] {
   // CHECK:   tensor.yield [[CST]]
   // CHECK: } : tensor<1x2xf32> to tensor<4x9xf32>
-  %1 = "tosa.pad"(%arg0, %0)  : (tensor<1x2xf32>, tensor<2x2xi32>)  -> (tensor<4x9xf32>)
+  %1 = "tosa.pad"(%arg0, %0)  : (tensor<1x2xf32>, tensor<4xi32>)  -> (tensor<4x9xf32>)
   return %1 : tensor<4x9xf32>
 }
 
 func.func @pad_int(%arg0 : tensor<1x2xi32>) -> (tensor<4x9xi32>) {
-  %0 = arith.constant dense<[[1, 2], [3, 4]]> : tensor<2x2xi32>
+  %0 = arith.constant dense<[1, 2, 3, 4]> : tensor<4xi32>
   // CHECK: [[CST:%.+]] = arith.constant 0 : i32
   // CHECK: tensor.pad
   // CHECK:   tensor.yield [[CST]]
-  %1 = "tosa.pad"(%arg0, %0)  : (tensor<1x2xi32>, tensor<2x2xi32>)  -> (tensor<4x9xi32>)
+  %1 = "tosa.pad"(%arg0, %0)  : (tensor<1x2xi32>, tensor<4xi32>)  -> (tensor<4x9xi32>)
   return %1 : tensor<4x9xi32>
 }
 
 func.func @pad_quant(%arg0 : tensor<1x2xi32>) -> (tensor<4x9xi32>) {
-  %0 = arith.constant dense<[[1, 2], [3, 4]]> : tensor<2x2xi32>
+  %0 = arith.constant dense<[1, 2, 3, 4]> : tensor<4xi32>
   // CHECK: [[CST:%.+]] = arith.constant 42 : i32
   // CHECK: tensor.pad
   // CHECK:   tensor.yield [[CST]]
-  %1 = "tosa.pad"(%arg0, %0) {quantization_info = #tosa.pad_quant<input_zp = 42>} : (tensor<1x2xi32>, tensor<2x2xi32>)  -> (tensor<4x9xi32>)
+  %1 = "tosa.pad"(%arg0, %0) {quantization_info = #tosa.pad_quant<input_zp = 42>} : (tensor<1x2xi32>, tensor<4xi32>)  -> (tensor<4x9xi32>)
   return %1 : tensor<4x9xi32>
 }
 
 // -----
 
 func.func @pad_float_explicit(%arg0 : tensor<1x2xf32>) -> (tensor<4x9xf32>) {
-  %0 = arith.constant dense<[[1, 2], [3, 4]]> : tensor<2x2xi32>
-  // TODO: Output contains multiple "arith.constant 1 : index".
-  // CHECK-DAG: [[INDEX1:%.+]] = arith.constant 1 : index
-  // CHECK-DAG: [[INDEX2:%.+]] = arith.constant 2 : index
-  // CHECK-DAG: [[INDEX3:%.+]] = arith.constant 3 : index
-  // CHECK-DAG: [[INDEX4:%.+]] = arith.constant 4 : index
+  %0 = arith.constant dense<[1, 2, 3, 4]> : tensor<4xi32>
   // CHECK-DAG: [[CST:%.+]] = arith.constant 4.200000e+01 : f32
-  // CHECK: tensor.pad %[[ARG0]] low{{\[}}%{{.*}}, [[INDEX3]]] high{{\[}}[[INDEX2]], [[INDEX4]]]  {
+  // CHECK: tensor.pad %[[ARG0]] low{{\[}}%{{.*}}, %{{.*}}] high{{\[}}%{{.*}}, %{{.*}}] {
   // CHECK:   tensor.yield [[CST]]
   // CHECK: } : tensor<1x2xf32> to tensor<4x9xf32>
   %1 = arith.constant dense<42.0> : tensor<f32>
-  %2 = "tosa.pad"(%arg0, %0, %1)  : (tensor<1x2xf32>, tensor<2x2xi32>, tensor<f32>)  -> (tensor<4x9xf32>)
+  %2 = "tosa.pad"(%arg0, %0, %1)  : (tensor<1x2xf32>, tensor<4xi32>, tensor<f32>)  -> (tensor<4x9xf32>)
   return %2 : tensor<4x9xf32>
 }
 
 // -----
 
 func.func @pad_dyn_input(%arg0 : tensor<?x2xf32>) -> (tensor<?x9xf32>) {
-  %0 = arith.constant dense<[[1, 2], [3, 4]]> : tensor<2x2xi32>
-  // TODO: Output contains multiple "arith.constant 1 : index".
-  // CHECK-DAG: [[INDEX1:%.+]] = arith.constant 1 : index
-  // CHECK-DAG: [[INDEX2:%.+]] = arith.constant 2 : index
-  // CHECK-DAG: [[INDEX3:%.+]] = arith.constant 3 : index
-  // CHECK-DAG: [[INDEX4:%.+]] = arith.constant 4 : index
+  %0 = arith.constant dense<[1, 2, 3, 4]> : tensor<4xi32>
   // CHECK-DAG: [[CST:%.+]] = arith.constant 0.000000e+00 : f32
-  // CHECK: tensor.pad %[[ARG0]] low{{\[}}%{{.*}}, [[INDEX3]]] high{{\[}}[[INDEX2]], [[INDEX4]]]  {
+  // CHECK: tensor.pad %[[ARG0]] low{{\[}}%{{.*}}, %{{.*}}] high{{\[}}%{{.*}}, %{{.*}}] {
   // CHECK:   tensor.yield [[CST]]
   // CHECK: } : tensor<?x2xf32> to tensor<?x9xf32>
-  %1 = "tosa.pad"(%arg0, %0)  : (tensor<?x2xf32>, tensor<2x2xi32>)  -> (tensor<?x9xf32>)
+  %1 = "tosa.pad"(%arg0, %0)  : (tensor<?x2xf32>, tensor<4xi32>)  -> (tensor<?x9xf32>)
   return %1 : tensor<?x9xf32>
 }
 
 func.func @pad_dyn_padding(%arg0 : tensor<1x2xf32>) -> (tensor<?x9xf32>) {
-  %0 = arith.constant dense<[[-1, 2], [3, 4]]> : tensor<2x2xi32>
-  // TODO: Output contains multiple "arith.constant 1 : index".
-  // CHECK-DAG: [[INDEX1:%.+]] = arith.constant 1 : index
-  // CHECK-DAG: [[INDEX2:%.+]] = arith.constant 2 : index
-  // CHECK-DAG: [[INDEX3:%.+]] = arith.constant 3 : index
-  // CHECK-DAG: [[INDEX4:%.+]] = arith.constant 4 : index
+  %0 = arith.constant dense<[-1, 2, 3, 4]> : tensor<4xi32>
   // CHECK-DAG: [[CST:%.+]] = arith.constant 0.000000e+00 : f32
-  // CHECK: tensor.pad %[[ARG0]] low{{\[}}%{{.*}}, [[INDEX3]]] high{{\[}}[[INDEX2]], [[INDEX4]]]  {
+  // CHECK: tensor.pad %[[ARG0]] low{{\[}}%{{.*}}, %{{.*}}] high{{\[}}%{{.*}}, %{{.*}}] {
   // CHECK:   tensor.yield [[CST]]
   // CHECK: } : tensor<1x2xf32> to tensor<?x9xf32>
-  %1 = "tosa.pad"(%arg0, %0)  : (tensor<1x2xf32>, tensor<2x2xi32>)  -> (tensor<?x9xf32>)
+  %1 = "tosa.pad"(%arg0, %0)  : (tensor<1x2xf32>, tensor<4xi32>)  -> (tensor<?x9xf32>)
   return %1 : tensor<?x9xf32>
 }
 
diff --git a/mlir/test/Dialect/Affine/canonicalize.mlir b/mlir/test/Dialect/Affine/canonicalize.mlir
index 717004e..a9ac13a 100644
--- a/mlir/test/Dialect/Affine/canonicalize.mlir
+++ b/mlir/test/Dialect/Affine/canonicalize.mlir
@@ -1917,12 +1917,12 @@ func.func @linearize_one_element_basis(%arg0: index, %arg1: index) -> index {
 
 // -----
 
-// CHECK-LABEL: func @cancel_linearize_denearize_exact(
+// CHECK-LABEL: func @cancel_linearize_delinearize_exact(
 //  CHECK-SAME:     %[[ARG0:[a-zA-Z0-9]+]]: index,
 //  CHECK-SAME:     %[[ARG1:[a-zA-Z0-9]+]]: index,
 //  CHECK-SAME:     %[[ARG2:[a-zA-Z0-9]+]]: index)
 //       CHECK:     return %[[ARG0]]
-func.func @cancel_linearize_denearize_exact(%arg0: index, %arg1: index, %arg2: index) -> index {
+func.func @cancel_linearize_delinearize_exact(%arg0: index, %arg1: index, %arg2: index) -> index {
   %0:3 = affine.delinearize_index %arg0 into (%arg1, 4, %arg2) : index, index, index
   %1 = affine.linearize_index [%0#0, %0#1, %0#2] by (%arg1, 4, %arg2) : index
   return %1 : index
@@ -1930,12 +1930,12 @@ func.func @cancel_linearize_denearize_exact(%arg0: index, %arg1: index, %arg2: i
 
 // -----
 
-// CHECK-LABEL: func @cancel_linearize_denearize_linearize_extra_bound(
+// CHECK-LABEL: func @cancel_linearize_delinearize_linearize_extra_bound(
 //  CHECK-SAME:     %[[ARG0:[a-zA-Z0-9]+]]: index,
 //  CHECK-SAME:     %[[ARG1:[a-zA-Z0-9]+]]: index,
 //  CHECK-SAME:     %[[ARG2:[a-zA-Z0-9]+]]: index)
 //       CHECK:     return %[[ARG0]]
-func.func @cancel_linearize_denearize_linearize_extra_bound(%arg0: index, %arg1: index, %arg2: index) -> index {
+func.func @cancel_linearize_delinearize_linearize_extra_bound(%arg0: index, %arg1: index, %arg2: index) -> index {
   %0:3 = affine.delinearize_index %arg0 into (4, %arg2) : index, index, index
   %1 = affine.linearize_index [%0#0, %0#1, %0#2] by (%arg1, 4, %arg2) : index
   return %1 : index
@@ -1943,12 +1943,12 @@ func.func @cancel_linearize_denearize_linearize_extra_bound(%arg0: index, %arg1:
 
 // -----
 
-// CHECK-LABEL: func @cancel_linearize_denearize_delinearize_extra_bound(
+// CHECK-LABEL: func @cancel_linearize_delinearize_delinearize_extra_bound(
 //  CHECK-SAME:     %[[ARG0:[a-zA-Z0-9]+]]: index,
 //  CHECK-SAME:     %[[ARG1:[a-zA-Z0-9]+]]: index,
 //  CHECK-SAME:     %[[ARG2:[a-zA-Z0-9]+]]: index)
 //       CHECK:     return %[[ARG0]]
-func.func @cancel_linearize_denearize_delinearize_extra_bound(%arg0: index, %arg1: index, %arg2: index) -> index {
+func.func @cancel_linearize_delinearize_delinearize_extra_bound(%arg0: index, %arg1: index, %arg2: index) -> index {
   %0:3 = affine.delinearize_index %arg0 into (%arg1, 4, %arg2) : index, index, index
   %1 = affine.linearize_index [%0#0, %0#1, %0#2] by (4, %arg2) : index
   return %1 : index
@@ -1956,31 +1956,252 @@ func.func @cancel_linearize_denearize_delinearize_extra_bound(%arg0: index, %arg
 
 // -----
 
+// CHECK-LABEL: func @cancel_linearize_delinearize_head(
+//  CHECK-SAME:     %[[ARG0:[a-zA-Z0-9]+]]: index,
+//  CHECK-SAME:     %[[ARG1:[a-zA-Z0-9]+]]: index)
+//       CHECK:     %[[DELIN:.+]]:2 = affine.delinearize_index %[[ARG0]] into (12, 8)
+//       CHECK:     %[[LIN:.+]] = affine.linearize_index [%[[DELIN]]#0, %[[ARG1]]] by (12, 16)
+//       CHECK:     return %[[LIN]]
+func.func @cancel_linearize_delinearize_head(%arg0: index, %arg1: index) -> index {
+  %0:3 = affine.delinearize_index %arg0 into (3, 4, 8) : index, index, index
+  %1 = affine.linearize_index [%0#0, %0#1, %arg1] by (3, 4, 16) : index
+  return %1 : index
+}
+
+// -----
+
+// CHECK-LABEL: func @cancel_linearize_delinearize_head_delinearize_unbounded(
+//  CHECK-SAME:     %[[ARG0:[a-zA-Z0-9]+]]: index,
+//  CHECK-SAME:     %[[ARG1:[a-zA-Z0-9]+]]: index)
+//       CHECK:     %[[DELIN:.+]]:2 = affine.delinearize_index %[[ARG0]] into (12, 8)
+//       CHECK:     %[[LIN:.+]] = affine.linearize_index [%[[DELIN]]#0, %[[ARG1]]] by (12, 16)
+//       CHECK:     return %[[LIN]]
+func.func @cancel_linearize_delinearize_head_delinearize_unbounded(%arg0: index, %arg1: index) -> index {
+  %0:3 = affine.delinearize_index %arg0 into (4, 8) : index, index, index
+  %1 = affine.linearize_index [%0#0, %0#1, %arg1] by (3, 4, 16) : index
+  return %1 : index
+}
+
+// -----
+
+// CHECK-LABEL: func @cancel_linearize_delinearize_head_linearize_unbounded(
+//  CHECK-SAME:     %[[ARG0:[a-zA-Z0-9]+]]: index,
+//  CHECK-SAME:     %[[ARG1:[a-zA-Z0-9]+]]: index)
+//       CHECK:     %[[DELIN:.+]]:2 = affine.delinearize_index %[[ARG0]] into (8)
+//       CHECK:     %[[LIN:.+]] = affine.linearize_index [%[[DELIN]]#0, %[[ARG1]]] by (16)
+//       CHECK:     return %[[LIN]]
+func.func @cancel_linearize_delinearize_head_linearize_unbounded(%arg0: index, %arg1: index) -> index {
+  %0:3 = affine.delinearize_index %arg0 into (3, 4, 8) : index, index, index
+  %1 = affine.linearize_index [%0#0, %0#1, %arg1] by (4, 16) : index
+  return %1 : index
+}
+
+// -----
+
+// CHECK-LABEL: func @cancel_linearize_delinearize_head_both_unbounded(
+//  CHECK-SAME:     %[[ARG0:[a-zA-Z0-9]+]]: index,
+//  CHECK-SAME:     %[[ARG1:[a-zA-Z0-9]+]]: index)
+//       CHECK:     %[[DELIN:.+]]:2 = affine.delinearize_index %[[ARG0]] into (8)
+//       CHECK:     %[[LIN:.+]] = affine.linearize_index [%[[DELIN]]#0, %[[ARG1]]] by (16)
+//       CHECK:     return %[[LIN]]
+func.func @cancel_linearize_delinearize_head_both_unbounded(%arg0: index, %arg1: index) -> index {
+  %0:3 = affine.delinearize_index %arg0 into (4, 8) : index, index, index
+  %1 = affine.linearize_index [%0#0, %0#1, %arg1] by (4, 16) : index
+  return %1 : index
+}
+
+// -----
+
+// CHECK-LABEL: func @cancel_linearize_delinearize_tail(
+//  CHECK-SAME:     %[[ARG0:[a-zA-Z0-9]+]]: index,
+//  CHECK-SAME:     %[[ARG1:[a-zA-Z0-9]+]]: index)
+//       CHECK:     %[[DELIN:.+]]:2 = affine.delinearize_index %[[ARG0]] into (3, 32)
+//       CHECK:     %[[LIN:.+]] = affine.linearize_index [%[[ARG1]], %[[DELIN]]#1] by (5, 32)
+//       CHECK:     return %[[LIN]]
+func.func @cancel_linearize_delinearize_tail(%arg0: index, %arg1: index) -> index {
+  %0:3 = affine.delinearize_index %arg0 into (3, 4, 8) : index, index, index
+  %1 = affine.linearize_index [%arg1, %0#1, %0#2] by (5, 4, 8) : index
+  return %1 : index
+}
+
+// -----
+
+// CHECK-LABEL: func @cancel_linearize_delinearize_middle_exact(
+//  CHECK-SAME:     %[[ARG0:[a-zA-Z0-9]+]]: index,
+//  CHECK-SAME:     %[[ARG1:[a-zA-Z0-9]+]]: index,
+//  CHECK-SAME:     %[[ARG2:[a-zA-z0-9]+]]: index)
+//       CHECK:     %[[LIN:.+]] = affine.linearize_index [%[[ARG1]], %[[ARG0]], %[[ARG2]]] by (9, 30, 7)
+//       CHECK:     return %[[LIN]]
+func.func @cancel_linearize_delinearize_middle_exact(%arg0: index, %arg1: index, %arg2: index) -> index {
+  %0:3 = affine.delinearize_index %arg0 into (2, 3, 5) : index, index, index
+  %1 = affine.linearize_index [%arg1, %0#0, %0#1, %0#2, %arg2] by (9, 2, 3, 5, 7) : index
+  return %1 : index
+}
+
+// -----
+
+// CHECK: #[[$MAP:.+]] = affine_map<()[s0, s1] -> ((s0 * s1) * 16)>
+
+// CHECK-LABEL: func @cancel_linearize_delinearize_middle_exact_dynamic_basis(
+//  CHECK-SAME:     %[[ARG0:[a-zA-Z0-9]+]]: index,
+//  CHECK-SAME:     %[[ARG1:[a-zA-Z0-9]+]]: index,
+//  CHECK-SAME:     %[[ARG2:[a-zA-z0-9]+]]: index)
+//       CHECK:     %[[C1:.+]] = arith.constant 1 : index
+//       CHECK:     %[[SIZEPROD:.+]] = affine.apply #[[$MAP]]()[%[[ARG1]], %[[ARG2]]]
+//       CHECK:     %[[LIN:.+]] = affine.linearize_index [%[[C1]], %[[ARG0]], %[[C1]]] by (3, %[[SIZEPROD]], 4)
+//       CHECK:     return %[[LIN]]
+func.func @cancel_linearize_delinearize_middle_exact_dynamic_basis(%arg0: index, %arg1: index, %arg2: index) -> index {
+  %c1 = arith.constant 1 : index
+  %0:4 = affine.delinearize_index %arg0 into (2, %arg1, %arg2, 8) : index, index, index, index
+  %1 = affine.linearize_index [%c1, %0#0, %0#1, %0#2, %0#3, %c1] by (3, 2, %arg1, %arg2, 8, 4) : index
+  return %1 : index
+}
+
+// -----
+
+// CHECK-LABEL: func @cancel_linearize_delinearize_middle_exact_delinearize_unbounded_disjoint(
+//  CHECK-SAME:     %[[ARG0:[a-zA-Z0-9]+]]: index,
+//  CHECK-SAME:     %[[ARG1:[a-zA-Z0-9]+]]: index,
+//  CHECK-SAME:     %[[ARG2:[a-zA-z0-9]+]]: index)
+//       CHECK:     %[[LIN:.+]] = affine.linearize_index disjoint [%[[ARG1]], %[[ARG0]], %[[ARG2]]] by (9, 30, 7)
+//       CHECK:     return %[[LIN]]
+func.func @cancel_linearize_delinearize_middle_exact_delinearize_unbounded_disjoint(%arg0: index, %arg1: index, %arg2: index) -> index {
+  %0:3 = affine.delinearize_index %arg0 into (3, 5) : index, index, index
+  %1 = affine.linearize_index disjoint [%arg1, %0#0, %0#1, %0#2, %arg2] by (9, 2, 3, 5, 7) : index
+  return %1 : index
+}
+
+// -----
+
+// Unlike in the test above, the linerize indices aren't asserted to be disjoint, so
+// we can't know if the `2` from the basis is a correct bound.
+// CHECK-LABEL: func @dont_cancel_linearize_delinearize_middle_exact_delinearize_unbounded(
+//  CHECK-SAME:     %[[ARG0:[a-zA-Z0-9]+]]: index,
+//  CHECK-SAME:     %[[ARG1:[a-zA-Z0-9]+]]: index,
+//  CHECK-SAME:     %[[ARG2:[a-zA-z0-9]+]]: index)
+//       CHECK:     %[[DELIN:.+]]:2 = affine.delinearize_index %[[ARG0]] into (3)
+//       CHECK:     %[[LIN:.+]] = affine.linearize_index [%[[ARG1]], %[[DELIN]]#0, %[[DELIN]]#1, %[[ARG2]]] by (9, 2, 3, 7)
+//       CHECK:     return %[[LIN]]
+
+func.func @dont_cancel_linearize_delinearize_middle_exact_delinearize_unbounded(%arg0: index, %arg1: index, %arg2: index) -> index {
+  %0:2 = affine.delinearize_index %arg0 into (3) : index, index
+  %1 = affine.linearize_index [%arg1, %0#0, %0#1, %arg2] by (9, 2, 3, 7) : index
+  return %1 : index
+}
+
+// -----
+
+// The presence of a `disjoint` here tells us that the "unbounded" term on the
+// delinearization can't have been above 2.
+// CHECK-LABEL: func @cancel_linearize_delinearize_middle_delinearize_unbounded_disjoint_implied_bound(
+//  CHECK-SAME:     %[[ARG0:[a-zA-Z0-9]+]]: index,
+//  CHECK-SAME:     %[[ARG1:[a-zA-Z0-9]+]]: index,
+//  CHECK-SAME:     %[[ARG2:[a-zA-z0-9]+]]: index)
+//       CHECK:     %[[DELIN:.+]]:2 = affine.delinearize_index %[[ARG0]] into (6, 5)
+//       CHECK:     %[[LIN:.+]] = affine.linearize_index disjoint [%[[ARG1]], %[[DELIN]]#0, %[[ARG2]]] by (9, 6, 7)
+//       CHECK:     return %[[LIN]]
+func.func @cancel_linearize_delinearize_middle_delinearize_unbounded_disjoint_implied_bound(%arg0: index, %arg1: index, %arg2: index) -> index {
+  %0:3 = affine.delinearize_index %arg0 into (3, 5) : index, index, index
+  %1 = affine.linearize_index disjoint [%arg1, %0#0, %0#1, %arg2] by (9, 2, 3, 7) : index
+  return %1 : index
+}
+
+// -----
+
+// CHECK-LABEL: func @cancel_linearize_delinearize_multiple_matches(
+//  CHECK-SAME:     %[[ARG0:[a-zA-Z0-9]+]]: index,
+//  CHECK-SAME:     %[[ARG1:[a-zA-Z0-9]+]]: index)
+//       CHECK:     %[[C0:.+]] = arith.constant 0
+//       CHECK:     %[[DELIN:.+]]:4 = affine.delinearize_index %[[ARG0]] into (4, 16, 4, 64)
+//       CHECK:     %[[LIN:.+]] = affine.linearize_index [%[[ARG1]], %[[DELIN]]#1, %[[C0]], %[[DELIN]]#3] by (4, 16, 4, 64)
+//       CHECK:     return %[[LIN]]
+func.func @cancel_linearize_delinearize_multiple_matches(%arg0: index, %arg1: index) -> index {
+  %c0 = arith.constant 0 : index
+  %0:7 = affine.delinearize_index %arg0 into (4, 4, 4, 4, 4, 4, 4) : index, index, index, index, index, index, index
+  %1 = affine.linearize_index [%arg1, %0#1, %0#2, %c0, %0#4, %0#5, %0#6] by (4, 4, 4, 4, 4, 4, 4) : index
+  return %1 : index
+}
+
+// -----
+
+// CHECK-LABEL: func @cancel_linearize_delinearize_multiple_delinearizes(
+//  CHECK-SAME:     %[[ARG0:[a-zA-Z0-9]+]]: index,
+//  CHECK-SAME:     %[[ARG1:[a-zA-Z0-9]+]]: index)
+//       CHECK:     %[[LIN:.+]] = affine.linearize_index [%[[ARG0]], %[[ARG1]]] by (32, 32)
+//       CHECK:     return %[[LIN]]
+func.func @cancel_linearize_delinearize_multiple_delinearizes(%arg0: index, %arg1: index) -> index {
+  %0:2 = affine.delinearize_index %arg0 into (4, 8) : index, index
+  %1:2 = affine.delinearize_index %arg1 into (2, 16) : index, index
+  %2 = affine.linearize_index [%0#0, %0#1, %1#0, %1#1] by (4, 8, 2, 16) : index
+  return %2 : index
+}
+
+// -----
+
 // Don't cancel because the values from the delinearize aren't used in order
-// CHECK-LABEL: func @no_cancel_linearize_denearize_permuted(
+// CHECK-LABEL: func @no_cancel_linearize_delinearize_permuted(
 //  CHECK-SAME:     %[[ARG0:[a-zA-Z0-9]+]]: index,
 //  CHECK-SAME:     %[[ARG1:[a-zA-Z0-9]+]]: index,
 //  CHECK-SAME:     %[[ARG2:[a-zA-Z0-9]+]]: index)
 //       CHECK:     %[[DELIN:.+]]:3 = affine.delinearize_index %[[ARG0]] into (%[[ARG1]], 4, %[[ARG2]])
-//       CHECK:     %[[LIN:.+]] = affine.linearize_index [%[[DELIN]]#0, %[[DELIN]]#2, %[[DELIN]]#1] by (%[[ARG1]], 4, %[[ARG2]])
+//       CHECK:     %[[LIN:.+]] = affine.linearize_index [%[[DELIN]]#0, %[[DELIN]]#2, %[[DELIN]]#1] by (%[[ARG1]], %[[ARG2]], 4)
 //       CHECK:     return %[[LIN]]
-func.func @no_cancel_linearize_denearize_permuted(%arg0: index, %arg1: index, %arg2: index) -> index {
+func.func @no_cancel_linearize_delinearize_permuted(%arg0: index, %arg1: index, %arg2: index) -> index {
   %0:3 = affine.delinearize_index %arg0 into (%arg1, 4, %arg2) : index, index, index
-  %1 = affine.linearize_index [%0#0, %0#2, %0#1] by (%arg1, 4, %arg2) : index
+  %1 = affine.linearize_index [%0#0, %0#2, %0#1] by (%arg1, %arg2, 4) : index
+  return %1 : index
+}
+
+// -----
+
+// CHECK: #[[$MAP:.+]] = affine_map<()[s0] -> (s0 * 3)>
+// But these cancel because they're a contiguous segment
+// CHECK-LABEL: func @partial_cancel_linearize_delinearize_not_fully_permuted(
+//  CHECK-SAME:     %[[ARG0:[a-zA-Z0-9]+]]: index,
+//  CHECK-SAME:     %[[ARG1:[a-zA-Z0-9]+]]: index,
+//  CHECK-SAME:     %[[ARG2:[a-zA-Z0-9]+]]: index)
+//       CHECK:     %[[SIZEPROD:.+]] = affine.apply #[[$MAP]]()[%[[ARG2]]]
+//       CHECK:     %[[DELIN:.+]]:3 = affine.delinearize_index %[[ARG0]] into (%[[ARG1]], 4, %[[SIZEPROD]])
+//       CHECK:     %[[LIN:.+]] = affine.linearize_index [%[[DELIN]]#0, %[[DELIN]]#2, %[[DELIN]]#1] by (%[[ARG1]], %[[SIZEPROD]], 4)
+//       CHECK:     return %[[LIN]]
+func.func @partial_cancel_linearize_delinearize_not_fully_permuted(%arg0: index, %arg1: index, %arg2: index) -> index {
+  %0:4 = affine.delinearize_index %arg0 into (%arg1, 4, %arg2, 3) : index, index, index, index
+  %1 = affine.linearize_index [%0#0, %0#2, %0#3, %0#1] by (%arg1, %arg2, 3, 4) : index
   return %1 : index
 }
 
 // -----
 
+// Ensure we don't get SSA errors when creating new `affine.delinearize` operations.
+// CHECK-LABEL: func @cancel_linearize_delinearize_placement
+// CHECK-SAME: (%[[ARG0:.+]]: index)
+// CHECK: %[[C0:.+]] = arith.constant 0 : index
+// CHECK: %[[NEW_DELIN:.+]]:2 = affine.delinearize_index %[[ARG0]] into (8, 32) : index, index
+// CHECK-NEXT: %[[DELIN_PART:.+]]:2 = affine.delinearize_index %[[NEW_DELIN]]#1 into (8, 4) : index, index
+// CHECK-NEXT: %[[L1:.+]] = affine.linearize_index disjoint [%[[DELIN_PART]]#1, %[[NEW_DELIN]]#0, %[[C0]], %[[C0]]] by (4, 8, 4, 8)
+// CHECK-NEXT: %[[L2:.+]] = affine.linearize_index disjoint [%[[NEW_DELIN]]#1, %[[C0]], %[[C0]]] by (32, 8, 4)
+// CHECK-NEXT: %[[L3:.+]] = affine.linearize_index disjoint [%[[DELIN_PART]]#0, %[[NEW_DELIN]]#0, %[[C0]], %[[C0]]] by (8, 8, 4, 4)
+// CHECK-NEXT: return %[[L1]], %[[L2]], %[[L3]]
+func.func @cancel_linearize_delinearize_placement(%arg0: index) -> (index, index, index) {
+  %c0 = arith.constant 0 : index
+  %0:3 = affine.delinearize_index %arg0 into (8, 8, 4) : index, index, index
+  %1 = affine.linearize_index disjoint [%0#2, %0#0, %c0, %c0] by (4, 8, 4, 8) : index
+  %2 = affine.linearize_index disjoint [%0#1, %0#2, %c0, %c0] by (8, 4, 8, 4) : index
+  %3 = affine.linearize_index disjoint [%0#1, %0#0, %c0, %c0] by (8, 8, 4, 4) : index
+  return %1, %2, %3 : index, index, index
+}
+
+// -----
+
 // Won't cancel because the linearize and delinearize are using a different basis
-// CHECK-LABEL: func @no_cancel_linearize_denearize_different_basis(
+// CHECK-LABEL: func @no_cancel_linearize_delinearize_different_basis(
 //  CHECK-SAME:     %[[ARG0:[a-zA-Z0-9]+]]: index,
 //  CHECK-SAME:     %[[ARG1:[a-zA-Z0-9]+]]: index,
 //  CHECK-SAME:     %[[ARG2:[a-zA-Z0-9]+]]: index)
 //       CHECK:     %[[DELIN:.+]]:3 = affine.delinearize_index %[[ARG0]] into (%[[ARG1]], 4, %[[ARG2]])
 //       CHECK:     %[[LIN:.+]] = affine.linearize_index [%[[DELIN]]#0, %[[DELIN]]#1, %[[DELIN]]#2] by (%[[ARG1]], 8, %[[ARG2]])
 //       CHECK:     return %[[LIN]]
-func.func @no_cancel_linearize_denearize_different_basis(%arg0: index, %arg1: index, %arg2: index) -> index {
+func.func @no_cancel_linearize_delinearize_different_basis(%arg0: index, %arg1: index, %arg2: index) -> index {
   %0:3 = affine.delinearize_index %arg0 into (%arg1, 4, %arg2) : index, index, index
   %1 = affine.linearize_index [%0#0, %0#1, %0#2] by (%arg1, 8, %arg2) : index
   return %1 : index
diff --git a/mlir/test/Dialect/Affine/value-bounds-op-interface-impl.mlir b/mlir/test/Dialect/Affine/value-bounds-op-interface-impl.mlir
index 935c08a..5354eb3 100644
--- a/mlir/test/Dialect/Affine/value-bounds-op-interface-impl.mlir
+++ b/mlir/test/Dialect/Affine/value-bounds-op-interface-impl.mlir
@@ -155,3 +155,84 @@ func.func @compare_maps(%a: index, %b: index) {
       : (index, index, index, index) -> ()
   return
 }
+
+// -----
+
+// CHECK-DAG: #[[$map1:.+]] = affine_map<()[s0] -> (s0 floordiv 15)>
+// CHECK-DAG: #[[$map2:.+]] = affine_map<()[s0] -> ((s0 mod 15) floordiv 5)>
+// CHECK-DAG: #[[$map3:.+]] = affine_map<()[s0] -> (s0 mod 5)>
+// CHECK-LABEL: func.func @delinearize_static
+// CHECK-SAME: (%[[arg0:.+]]: index)
+// CHECK-DAG: %[[v1:.+]] = affine.apply #[[$map1]]()[%[[arg0]]]
+// CHECK-DAG: %[[v2:.+]] = affine.apply #[[$map2]]()[%[[arg0]]]
+// CHECK-DAG: %[[v3:.+]] = affine.apply #[[$map3]]()[%[[arg0]]]
+// CHECK: return %[[v1]], %[[v2]], %[[v3]]
+func.func @delinearize_static(%arg0: index) -> (index, index, index) {
+  %c2 = arith.constant 2 : index
+  %c3 = arith.constant 3 : index
+  %0:3 = affine.delinearize_index %arg0 into (2, 3, 5) : index, index, index
+  %1 = "test.reify_bound"(%0#0) {type = "EQ"} : (index) -> (index)
+  %2 = "test.reify_bound"(%0#1) {type = "EQ"} : (index) -> (index)
+  %3 = "test.reify_bound"(%0#2) {type = "EQ"} : (index) -> (index)
+  // expected-remark @below{{true}}
+  "test.compare"(%0#0, %c2) {cmp = "LT"} : (index, index) -> ()
+  // expected-remark @below{{true}}
+  "test.compare"(%0#1, %c3) {cmp = "LT"} : (index, index) -> ()
+  return %1, %2, %3 : index, index, index
+}
+
+// -----
+
+// CHECK-DAG: #[[$map1:.+]] = affine_map<()[s0] -> (s0 floordiv 15)>
+// CHECK-DAG: #[[$map2:.+]] = affine_map<()[s0] -> ((s0 mod 15) floordiv 5)>
+// CHECK-DAG: #[[$map3:.+]] = affine_map<()[s0] -> (s0 mod 5)>
+// CHECK-LABEL: func.func @delinearize_static_no_outer_bound
+// CHECK-SAME: (%[[arg0:.+]]: index)
+// CHECK-DAG: %[[v1:.+]] = affine.apply #[[$map1]]()[%[[arg0]]]
+// CHECK-DAG: %[[v2:.+]] = affine.apply #[[$map2]]()[%[[arg0]]]
+// CHECK-DAG: %[[v3:.+]] = affine.apply #[[$map3]]()[%[[arg0]]]
+// CHECK: return %[[v1]], %[[v2]], %[[v3]]
+func.func @delinearize_static_no_outer_bound(%arg0: index) -> (index, index, index) {
+  %c2 = arith.constant 2 : index
+  %c3 = arith.constant 3 : index
+  %0:3 = affine.delinearize_index %arg0 into (3, 5) : index, index, index
+  %1 = "test.reify_bound"(%0#0) {type = "EQ"} : (index) -> (index)
+  %2 = "test.reify_bound"(%0#1) {type = "EQ"} : (index) -> (index)
+  %3 = "test.reify_bound"(%0#2) {type = "EQ"} : (index) -> (index)
+  "test.compaare"(%0#0, %c2) {cmp = "LT"} : (index, index) -> ()
+  // expected-remark @below{{true}}
+  "test.compare"(%0#1, %c3) {cmp = "LT"} : (index, index) -> ()
+  return %1, %2, %3 : index, index, index
+}
+
+// -----
+
+// CHECK: #[[$map:.+]] = affine_map<()[s0, s1] -> (s0 + s1 * 3)>
+// CHECK-LABEL: func.func @linearize_static
+// CHECK-SAME: (%[[arg0:.+]]: index, %[[arg1:.+]]: index)
+// CHECK: %[[v1:.+]] = affine.apply #[[$map]]()[%[[arg1]], %[[arg0]]]
+// CHECK: return %[[v1]]
+func.func @linearize_static(%arg0: index, %arg1: index)  -> index {
+  %c6 = arith.constant 6 : index
+  %0 = affine.linearize_index disjoint [%arg0, %arg1] by (2, 3) : index
+  %1 = "test.reify_bound"(%0) {type = "EQ"} : (index) -> (index)
+  // expected-remark @below{{true}}
+  "test.compare"(%0, %c6) {cmp = "LT"} : (index, index) -> ()
+  return %1 : index
+}
+
+// -----
+
+// CHECK: #[[$map:.+]] = affine_map<()[s0, s1] -> (s0 + s1 * 3)>
+// CHECK-LABEL: func.func @linearize_static_no_outer_bound
+// CHECK-SAME: (%[[arg0:.+]]: index, %[[arg1:.+]]: index)
+// CHECK: %[[v1:.+]] = affine.apply #[[$map]]()[%[[arg1]], %[[arg0]]]
+// CHECK: return %[[v1]]
+func.func @linearize_static_no_outer_bound(%arg0: index, %arg1: index)  -> index {
+  %c6 = arith.constant 6 : index
+  %0 = affine.linearize_index disjoint [%arg0, %arg1] by (3) : index
+  %1 = "test.reify_bound"(%0) {type = "EQ"} : (index) -> (index)
+  // expected-error @below{{unknown}}
+  "test.compare"(%0, %c6) {cmp = "LT"} : (index, index) -> ()
+  return %1 : index
+}
diff --git a/mlir/test/Dialect/Arith/canonicalize.mlir b/mlir/test/Dialect/Arith/canonicalize.mlir
index 6a186a0..522711b 100644
--- a/mlir/test/Dialect/Arith/canonicalize.mlir
+++ b/mlir/test/Dialect/Arith/canonicalize.mlir
@@ -2060,6 +2060,70 @@ func.func @test_divf1(%arg0 : f32, %arg1 : f32) -> (f32) {
 
 // -----
 
+func.func @fold_divui_of_muli_0(%arg0 : index, %arg1 : index) -> index {
+  %0 = arith.muli %arg0, %arg1 overflow<nuw> : index
+  %1 = arith.divui %0, %arg0 : index
+  return %1 : index
+}
+// CHECK-LABEL: func @fold_divui_of_muli_0(
+//  CHECK-SAME:     %[[ARG0:.+]]: index,
+//  CHECK-SAME:     %[[ARG1:.+]]: index)
+//       CHECK:   return %[[ARG1]]
+
+func.func @fold_divui_of_muli_1(%arg0 : index, %arg1 : index) -> index {
+  %0 = arith.muli %arg0, %arg1 overflow<nuw> : index
+  %1 = arith.divui %0, %arg1 : index
+  return %1 : index
+}
+// CHECK-LABEL: func @fold_divui_of_muli_1(
+//  CHECK-SAME:     %[[ARG0:.+]]: index,
+//  CHECK-SAME:     %[[ARG1:.+]]: index)
+//       CHECK:   return %[[ARG0]]
+
+func.func @fold_divsi_of_muli_0(%arg0 : index, %arg1 : index) -> index {
+  %0 = arith.muli %arg0, %arg1 overflow<nsw> : index
+  %1 = arith.divsi %0, %arg0 : index
+  return %1 : index
+}
+// CHECK-LABEL: func @fold_divsi_of_muli_0(
+//  CHECK-SAME:     %[[ARG0:.+]]: index,
+//  CHECK-SAME:     %[[ARG1:.+]]: index)
+//       CHECK:   return %[[ARG1]]
+
+func.func @fold_divsi_of_muli_1(%arg0 : index, %arg1 : index) -> index {
+  %0 = arith.muli %arg0, %arg1 overflow<nsw> : index
+  %1 = arith.divsi %0, %arg1 : index
+  return %1 : index
+}
+// CHECK-LABEL: func @fold_divsi_of_muli_1(
+//  CHECK-SAME:     %[[ARG0:.+]]: index,
+//  CHECK-SAME:     %[[ARG1:.+]]: index)
+//       CHECK:   return %[[ARG0]]
+
+// Do not fold divui(mul(a, v), v) -> a with nuw attribute.
+func.func @no_fold_divui_of_muli(%arg0 : index, %arg1 : index) -> index {
+  %0 = arith.muli %arg0, %arg1 : index
+  %1 = arith.divui %0, %arg0 : index
+  return %1 : index
+}
+// CHECK-LABEL: func @no_fold_divui_of_muli
+//       CHECK:   %[[T0:.+]] = arith.muli
+//       CHECK:   %[[T1:.+]] = arith.divui %[[T0]],
+//       CHECK:   return %[[T1]]
+
+// Do not fold divsi(mul(a, v), v) -> a with nuw attribute.
+func.func @no_fold_divsi_of_muli(%arg0 : index, %arg1 : index) -> index {
+  %0 = arith.muli %arg0, %arg1 : index
+  %1 = arith.divsi %0, %arg0 : index
+  return %1 : index
+}
+// CHECK-LABEL: func @no_fold_divsi_of_muli
+//       CHECK:   %[[T0:.+]] = arith.muli
+//       CHECK:   %[[T1:.+]] = arith.divsi %[[T0]],
+//       CHECK:   return %[[T1]]
+
+// -----
+
 // CHECK-LABEL: @test_cmpf(
 func.func @test_cmpf(%arg0 : f32) -> (i1, i1, i1, i1) {
 //   CHECK-DAG:   %[[T:.*]] = arith.constant true
diff --git a/mlir/test/Dialect/Arith/int-range-interface.mlir b/mlir/test/Dialect/Arith/int-range-interface.mlir
index 48a3eb2..090af3e 100644
--- a/mlir/test/Dialect/Arith/int-range-interface.mlir
+++ b/mlir/test/Dialect/Arith/int-range-interface.mlir
@@ -249,6 +249,18 @@ func.func @ceil_divsi(%arg0 : index) -> i1 {
     func.return %10 : i1
 }
 
+// There was a bug, which was causing this expr errorneously fold to constant
+// CHECK-LABEL: func @ceil_divsi_full_range
+// CHECK-SAME: (%[[arg:.*]]: index)
+// CHECK: %[[c64:.*]] = arith.constant 64 : index
+// CHECK: %[[ret:.*]] = arith.ceildivsi %[[arg]], %[[c64]] : index
+// CHECK: return %[[ret]]
+func.func @ceil_divsi_full_range(%6: index) -> index {
+  %c64 = arith.constant 64 : index
+  %55 = arith.ceildivsi %6, %c64 : index
+  return %55 : index
+}
+
 // CHECK-LABEL: func @ceil_divsi_intmin_bug_115293
 // CHECK: %[[ret:.*]] = arith.constant true
 // CHECK: return %[[ret]]
diff --git a/mlir/test/Dialect/Bufferization/Transforms/one-shot-bufferize-empty-tensor-elimination.mlir b/mlir/test/Dialect/Bufferization/Transforms/one-shot-bufferize-empty-tensor-elimination.mlir
index 2643477..820fb3d 100644
--- a/mlir/test/Dialect/Bufferization/Transforms/one-shot-bufferize-empty-tensor-elimination.mlir
+++ b/mlir/test/Dialect/Bufferization/Transforms/one-shot-bufferize-empty-tensor-elimination.mlir
@@ -465,3 +465,14 @@ func.func @mutli_use_of_the_same_tensor_empty_creates_non_existent_read(%arg1: t
       : tensor<5x6x64xf32> into tensor<5x6x128xf32>
   return %inserted_slice_1, %res_2 : tensor<5x6x128xf32>, tensor<5x6x64xf32>
 }
+
+// -----
+
+// CHECK-LABEL:   func.func @direct_use_of_tensor_empty
+func.func @direct_use_of_tensor_empty(%arg0: tensor<5x6x128xf32>) -> tensor<5x6x128xf32> {
+  // CHECK-NOT: memref.alloc
+  %empty_1 = tensor.empty() : tensor<5x6x64xf32>
+  %inserted_slice_1 = tensor.insert_slice %empty_1 into %arg0[0, 0, 0][5, 6, 64][1, 1, 1]
+      : tensor<5x6x64xf32> into tensor<5x6x128xf32>
+  return %inserted_slice_1 : tensor<5x6x128xf32>
+}
diff --git a/mlir/test/Dialect/GPU/indirect-device-func-call.mlir b/mlir/test/Dialect/GPU/indirect-device-func-call.mlir
index 91d7f1c..85805da 100644
--- a/mlir/test/Dialect/GPU/indirect-device-func-call.mlir
+++ b/mlir/test/Dialect/GPU/indirect-device-func-call.mlir
@@ -6,7 +6,7 @@ gpu.module @kernels {
     func.func @hello(%arg0 : f32) {
         %tid_x = gpu.thread_id x
         %csti8 = arith.constant 2 : i8
-        gpu.printf "Hello from %lld, %d, %f\n" %tid_x, %csti8, %arg0  : index, i8, f32
+        gpu.printf "Hello from %lld, %d, %f\n", %tid_x, %csti8, %arg0  : index, i8, f32
         return
     }
     // CHECK-LABEL: @hello_indirect
diff --git a/mlir/test/Dialect/GPU/ops.mlir b/mlir/test/Dialect/GPU/ops.mlir
index c0ff204..99915c4 100644
--- a/mlir/test/Dialect/GPU/ops.mlir
+++ b/mlir/test/Dialect/GPU/ops.mlir
@@ -229,9 +229,22 @@ module attributes {gpu.container_module} {
 
     // CHECK-LABEL: gpu.func @printf_test
     // CHECK: (%[[ARG0:.*]]: i32)
-    // CHECK: gpu.printf "Value: %d" %[[ARG0]] : i32
+    // CHECK: gpu.printf "Value: %d", %[[ARG0]] : i32
     gpu.func @printf_test(%arg0 : i32) {
-      gpu.printf "Value: %d" %arg0 : i32
+      gpu.printf "Value: %d", %arg0 : i32
+      gpu.return
+    }
+
+    // CHECK-LABEL: gpu.func @printf_empty
+    // CHECK: gpu.printf  "]"
+    // CHECK: scf.if
+    // CHECK: gpu.printf ", "
+    gpu.func @printf_empty(%arg0 : i32) {
+      gpu.printf "]"
+      %1 = arith.cmpi slt, %arg0, %arg0 : i32
+      scf.if %1 {
+        gpu.printf ", "
+      } 
       gpu.return
     }
 
diff --git a/mlir/test/Dialect/GPU/test-nvvm-pipeline.mlir b/mlir/test/Dialect/GPU/test-nvvm-pipeline.mlir
index 732f40c..f02b26d 100644
--- a/mlir/test/Dialect/GPU/test-nvvm-pipeline.mlir
+++ b/mlir/test/Dialect/GPU/test-nvvm-pipeline.mlir
@@ -23,7 +23,7 @@ func.func @test_math(%arg0 : f32) {
         threads(%6, %7, %8) in (%9 = %c2, %10 = %c1, %11 = %c1) { 
         // CHECK-NVVM: __nv_expf 
         %s1 = math.exp %arg0 : f32
-        gpu.printf "%f" %s1 : f32
+        gpu.printf "%f", %s1 : f32
         gpu.terminator
     }
     return
diff --git a/mlir/test/Dialect/LLVM/lower-to-llvm-e2e-with-target-tag.mlir b/mlir/test/Dialect/LLVM/lower-to-llvm-e2e-with-target-tag.mlir
index f6d3387..2785b50 100644
--- a/mlir/test/Dialect/LLVM/lower-to-llvm-e2e-with-target-tag.mlir
+++ b/mlir/test/Dialect/LLVM/lower-to-llvm-e2e-with-target-tag.mlir
@@ -28,7 +28,7 @@ func.func @subview(%0 : memref<64x4xf32, strided<[4, 1], offset: 0>>, %arg0 : in
   // CHECK-SAME: -> !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>
 
   // CHECK-DAG: %[[STRIDE0:.*]] = llvm.mlir.constant(4 : index) : i64
-  // CHECK-DAG: %[[DESCSTRIDE0:.*]] = llvm.mul %[[ARG0]], %[[STRIDE0]] : i64
+  // CHECK-DAG: %[[DESCSTRIDE0:.*]] = llvm.mul %[[ARG0]], %[[STRIDE0]] overflow<nsw> : i64
   // CHECK-DAG: %[[OFF2:.*]] = llvm.add %[[DESCSTRIDE0]], %[[ARG1]] : i64
   // CHECK-DAG: %[[DESC:.*]] = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)>
 
diff --git a/mlir/test/Dialect/LLVM/lower-to-llvm-e2e-with-top-level-named-sequence.mlir b/mlir/test/Dialect/LLVM/lower-to-llvm-e2e-with-top-level-named-sequence.mlir
index a74553c..c1f30c7 100644
--- a/mlir/test/Dialect/LLVM/lower-to-llvm-e2e-with-top-level-named-sequence.mlir
+++ b/mlir/test/Dialect/LLVM/lower-to-llvm-e2e-with-top-level-named-sequence.mlir
@@ -27,7 +27,7 @@ func.func @subview(%0 : memref<64x4xf32, strided<[4, 1], offset: 0>>, %arg0 : in
   // CHECK-SAME: -> !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>
 
   // CHECK-DAG: %[[STRIDE0:.*]] = llvm.mlir.constant(4 : index) : i64
-  // CHECK-DAG: %[[DESCSTRIDE0:.*]] = llvm.mul %[[ARG0]], %[[STRIDE0]] : i64
+  // CHECK-DAG: %[[DESCSTRIDE0:.*]] = llvm.mul %[[ARG0]], %[[STRIDE0]] overflow<nsw> : i64
   // CHECK-DAG: %[[OFF2:.*]] = llvm.add %[[DESCSTRIDE0]], %[[ARG1]] : i64
   // CHECK-DAG: %[[DESC:.*]] = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)>
 
diff --git a/mlir/test/Dialect/LLVMIR/roundtrip.mlir b/mlir/test/Dialect/LLVMIR/roundtrip.mlir
index aebfd74..88660ce 100644
--- a/mlir/test/Dialect/LLVMIR/roundtrip.mlir
+++ b/mlir/test/Dialect/LLVMIR/roundtrip.mlir
@@ -750,6 +750,16 @@ llvm.func @experimental_noalias_scope_decl() {
   llvm.return
 }
 
+#alias_scope_domain2 = #llvm.alias_scope_domain<id = "domainid", description = "The domain">
+#alias_scope2 = #llvm.alias_scope<id = "stringid", domain = #alias_scope_domain2, description = "The domain">
+
+// CHECK-LABEL: @experimental_noalias_scope_with_string_id
+llvm.func @experimental_noalias_scope_with_string_id() {
+  // CHECK: llvm.intr.experimental.noalias.scope.decl #{{.*}}
+  llvm.intr.experimental.noalias.scope.decl #alias_scope2
+  llvm.return
+}
+
 // CHECK-LABEL: @experimental_constrained_fptrunc
 llvm.func @experimental_constrained_fptrunc(%in: f64) {
   // CHECK: llvm.intr.experimental.constrained.fptrunc %{{.*}} towardzero ignore : f64 to f32
diff --git a/mlir/test/Dialect/Linalg/decompose-tensor-unpack-tile.mlir b/mlir/test/Dialect/Linalg/decompose-tensor-unpack-tile.mlir
index 6d9709c..0dbdf47 100644
--- a/mlir/test/Dialect/Linalg/decompose-tensor-unpack-tile.mlir
+++ b/mlir/test/Dialect/Linalg/decompose-tensor-unpack-tile.mlir
@@ -1,4 +1,7 @@
-// RUN: mlir-opt -split-input-file --transform-interpreter --canonicalize --test-linalg-transform-patterns="test-decompose-tensor-unpack"  %s | FileCheck %s
+// RUN: mlir-opt -split-input-file -transform-interpreter --canonicalize \
+// RUN: -transform-preload-library='transform-library-paths=%p/td/decompose-unpack.mlir' \
+// RUN: -transform-interpreter=entry-point=decompose_unpack \
+// RUN: -transform-interpreter  %s | FileCheck %s
 
 func.func @KCRSsr_to_KCRS(%arg0: tensor<1x1x4x8x8x32xf32>, %arg1: tensor<1x1x128x64xf32>) -> tensor<1x1x128x64xf32> {
   %0 = tensor.unpack %arg0 inner_dims_pos = [3, 2] inner_tiles = [8, 32] into %arg1 : tensor<1x1x4x8x8x32xf32> -> tensor<1x1x128x64xf32>
diff --git a/mlir/test/Dialect/Linalg/decompose-tensor-unpack.mlir b/mlir/test/Dialect/Linalg/decompose-tensor-unpack.mlir
index bd60504..ba1f214 100644
--- a/mlir/test/Dialect/Linalg/decompose-tensor-unpack.mlir
+++ b/mlir/test/Dialect/Linalg/decompose-tensor-unpack.mlir
@@ -1,4 +1,6 @@
-// RUN: mlir-opt -split-input-file --test-linalg-transform-patterns="test-decompose-tensor-unpack"  %s | FileCheck %s
+// RUN: mlir-opt -split-input-file \
+// RUN: -transform-preload-library='transform-library-paths=%p/td/decompose-unpack.mlir' \
+// RUN: -transform-interpreter=entry-point=decompose_unpack %s | FileCheck %s
 
 func.func @simple_KCRSsr_to_KCRS(%arg0: tensor<1x1x1x1x8x32xf32>, %arg1: tensor<1x1x32x8xf32>) -> tensor<1x1x32x8xf32> {
   %0 = tensor.unpack %arg0 inner_dims_pos = [3, 2] inner_tiles = [8, 32] into %arg1 : tensor<1x1x1x1x8x32xf32> -> tensor<1x1x32x8xf32>
diff --git a/mlir/test/Dialect/Linalg/td/decompose-unpack.mlir b/mlir/test/Dialect/Linalg/td/decompose-unpack.mlir
new file mode 100644
index 0000000..1124363
--- /dev/null
+++ b/mlir/test/Dialect/Linalg/td/decompose-unpack.mlir
@@ -0,0 +1,12 @@
+module @transforms attributes { transform.with_named_sequence } {
+  transform.named_sequence @decompose_unpack(%module: !transform.any_op {transform.readonly}) {
+    %pack = transform.structured.match ops{["tensor.unpack"]} in %module : (!transform.any_op) -> !transform.any_op
+
+    %1 = transform.get_parent_op %pack {isolated_from_above} : (!transform.any_op) -> !transform.any_op
+    transform.apply_patterns to %1 {
+      transform.apply_patterns.linalg.decompose_pack_unpack
+    } : !transform.any_op
+
+    transform.yield
+  }
+}
diff --git a/mlir/test/Dialect/Linalg/transform-tile-reduction.mlir b/mlir/test/Dialect/Linalg/transform-tile-reduction.mlir
index cce4b4e..9d34c80 100644
--- a/mlir/test/Dialect/Linalg/transform-tile-reduction.mlir
+++ b/mlir/test/Dialect/Linalg/transform-tile-reduction.mlir
@@ -32,8 +32,7 @@ module attributes {transform.with_named_sequence} {
 // CHECK-DAG:   %[[C0:.*]] = arith.constant 0 : index
 // CHECK-DAG:   %[[D0:.*]] = tensor.dim %[[ARG0]], %[[C0]] : tensor<?x?xf32>
 // CHECK-DAG:   %[[D1:.*]] = tensor.dim %[[ARG0]], %[[C1]] : tensor<?x?xf32>
-// CHECK-DAG:   %[[D2:.*]] = tensor.dim %[[ARG1]], %[[C0]] : tensor<?xf32>
-//     CHECK:   %[[E:.*]] = tensor.empty(%[[D2]]) : tensor<?x5xf32>
+//     CHECK:   %[[E:.*]] = tensor.empty(%[[D0]]) : tensor<?x5xf32>
 //     CHECK:   %[[F:.*]] = linalg.fill ins(%[[I]] : f32) outs(%[[E]] : tensor<?x5xf32>) -> tensor<?x5xf32>
 //     CHECK:   %[[L:.*]] = scf.for %[[K:.*]] = %[[C0]] to %[[D1]] step %[[C5]] iter_args(%[[ARG3:.*]] = %[[F]]) -> (tensor<?x5xf32>) {
 //     CHECK:     %[[PS:.*]] = affine.min #[[MAP0]](%[[K]])[%[[D1]]]
@@ -81,13 +80,13 @@ module attributes {transform.with_named_sequence} {
 // CHECK-DAG: #[[MAP1:.*]] = affine_map<(d0, d1) -> (d0, d1)>
 // CHECK-DAG: #[[MAP2:.*]] = affine_map<(d0, d1) -> (d1, d0)>
 //     CHECK: func @reduction_tile_transpose
-//     CHECK:   tensor.empty(%{{.*}}) : tensor<5x?xf32>
-//     CHECK:   linalg.fill {{.*}} : tensor<5x?xf32>) -> tensor<5x?xf32>
+//     CHECK:   tensor.empty(%{{.*}}) : tensor<?x5xf32>
+//     CHECK:   linalg.fill {{.*}} : tensor<?x5xf32>) -> tensor<?x5xf32>
 //     CHECK:   scf.for
-//     CHECK:     %[[EXT:.*]] = tensor.extract_slice %[[ARG3:.*]][0, 0] [%[[D0:.*]], %[[D1:.*]]] [1, 1] : tensor<5x?xf32> to tensor<?x?xf32>
+//     CHECK:     %[[EXT:.*]] = tensor.extract_slice %[[ARG3:.*]][0, 0] [%[[D0:.*]], %[[D1:.*]]] [1, 1] : tensor<?x5xf32> to tensor<?x?xf32>
 //     CHECK:     %[[R:.*]] = linalg.generic {indexing_maps = [#[[MAP1]], #[[MAP2]]], iterator_types = ["parallel", "parallel"]} ins(%[[L:.*]] : tensor<?x?xf32>) outs(%[[EXT]] : tensor<?x?xf32>)
-//     CHECK:     %[[INS:.*]] = tensor.insert_slice %[[R]] into %[[ARG3]][0, 0] [%[[D0]], %[[D1]]] [1, 1] : tensor<?x?xf32> into tensor<5x?xf32>
-//     CHECK:     scf.yield {{.*}} : tensor<5x?xf32>
+//     CHECK:     %[[INS:.*]] = tensor.insert_slice %[[R]] into %[[ARG3]][0, 0] [%[[D0]], %[[D1]]] [1, 1] : tensor<?x?xf32> into tensor<?x5xf32>
+//     CHECK:     scf.yield {{.*}} : tensor<?x5xf32>
 //     CHECK:   }
 //     CHECK:   linalg.reduce
 //     CHECK:   return
@@ -129,8 +128,7 @@ module attributes {transform.with_named_sequence} {
 // CHECK-DAG:   %[[C1:.*]] = arith.constant 1 : index
 // CHECK-DAG:   %[[D0:.*]] = tensor.dim %[[ARG0]], %[[C0]] : tensor<?x?xf32>
 // CHECK-DAG:   %[[D1:.*]] = tensor.dim %[[ARG0]], %[[C1]] : tensor<?x?xf32>
-// CHECK-DAG:   %[[D2:.*]] = tensor.dim %[[ARG1]], %[[C0]] : tensor<?xf32>
-//     CHECK:   %[[E:.*]] = tensor.empty(%[[D2]]) : tensor<?x5xf32>
+//     CHECK:   %[[E:.*]] = tensor.empty(%[[D0]]) : tensor<?x5xf32>
 //     CHECK:   %[[F:.*]] = linalg.fill ins(%[[I]] : f32) outs(%[[E]] : tensor<?x5xf32>) -> tensor<?x5xf32>
 //     CHECK:   %[[L:.*]] = scf.forall (%[[IV:.+]]) in (5) shared_outs(%[[ARG3:.+]] = %[[F]]) -> (tensor<?x5xf32>) {
 // CHECK-DAG:     %[[TS0:.+]] = affine.min #[[MAP0]](%[[IV]])[%[[D1]]]
@@ -183,9 +181,7 @@ module attributes {transform.with_named_sequence} {
 // CHECK-DAG:   %[[D0:.*]] = tensor.dim %[[ARG0]], %[[C0]] : tensor<?x?xf32>
 // CHECK-DAG:   %[[D1:.*]] = tensor.dim %[[ARG0]], %[[C1]] : tensor<?x?xf32>
 // CHECK-DAG:   %[[D2:.*]] = tensor.dim %[[ARG1]], %[[C1]] : tensor<?x?xf32>
-// CHECK-DAG:   %[[D3:.*]] = tensor.dim %[[ARG2]], %[[C0]] : tensor<?x?xf32>
-// CHECK-DAG:   %[[D4:.*]] = tensor.dim %[[ARG2]], %[[C1]] : tensor<?x?xf32>
-//     CHECK:   %[[E:.*]] = tensor.empty(%[[D3]], %[[D4]]) : tensor<?x?x5xf32>
+//     CHECK:   %[[E:.*]] = tensor.empty(%[[D0]], %[[D2]]) : tensor<?x?x5xf32>
 //     CHECK:   %[[F:.*]] = linalg.fill ins(%[[I]] : f32) outs(%[[E]] : tensor<?x?x5xf32>) -> tensor<?x?x5xf32>
 //     CHECK:   %[[L:.*]] = scf.forall (%[[IV:.+]]) in (5) shared_outs(%[[ARG3:.+]] = %[[F]]) -> (tensor<?x?x5xf32>) {
 // CHECK-DAG:     %[[TS0:.+]] = affine.min #[[MAP0]](%[[IV]])[%[[D1]]]
@@ -243,8 +239,7 @@ module attributes {transform.with_named_sequence} {
 // CHECK-DAG:   %[[C1:.*]] = arith.constant 1 : index
 // CHECK-DAG:   %[[C15:.*]] = arith.constant 15 : index
 // CHECK-DAG:   %[[D0:.*]] = tensor.dim %[[ARG0]], %[[C0]] : tensor<?x?xf32>
-// CHECK-DAG:   %[[D2:.*]] = tensor.dim %[[ARG1]], %[[C0]] : tensor<?xf32>
-//     CHECK:   %[[E:.*]] = tensor.empty(%[[D2]]) : tensor<?x5xf32>
+//     CHECK:   %[[E:.*]] = tensor.empty(%[[D0]]) : tensor<?x5xf32>
 //     CHECK:   %[[F:.*]] = linalg.fill ins(%[[I]] : f32) outs(%[[E]] : tensor<?x5xf32>) -> tensor<?x5xf32>
 //     CHECK:   %[[L:.*]] = scf.forall (%[[IV:.+]]) in (5) shared_outs(%[[ARG3:.+]] = %[[F]]) -> (tensor<?x5xf32>) {
 //     CHECK:     %[[ET:.+]] = tensor.extract_slice %[[ARG3:.+]][0, %[[IV]]] [%[[D0]], 1] [1, 1] : tensor<?x5xf32> to tensor<?xf32>
@@ -422,8 +417,8 @@ func.func @reduction_tile_multiple_results(%arg0: tensor<?x?xf32>, %out: tensor<
 module attributes {transform.with_named_sequence} {
   transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
     %0 = transform.structured.match ops{["linalg.generic"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-    %1, %12, %2, %3, %loop = transform.structured.tile_reduction_using_for %0
-      by tile_sizes = [0, 5] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op)
+    %1, %12, %2, %3, %4, %loop = transform.structured.tile_reduction_using_for %0
+      by tile_sizes = [0, 5] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op)
       transform.yield
   }
 }
@@ -444,4 +439,44 @@ module attributes {transform.with_named_sequence} {
 // CHECK:       scf.yield %[[INSERT1]], %[[INSERT1]]
 // CHECK:       linalg.reduce
 // CHECK:         arith.addf
+// CHECK:       linalg.reduce
 // CHECK:         arith.maximumf
+
+// -----
+
+func.func @reduction_tile_multi_dim_transpose(%arg0: tensor<?x?x?xf32>, %out: tensor<?x?xf32>) -> tensor<?x?xf32> {
+  %red = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>,
+                                          affine_map<(d0, d1, d2) -> (d2, d0)>],
+   iterator_types = ["parallel", "reduction", "parallel"]}
+   ins(%arg0 : tensor<?x?x?xf32>)
+   outs(%out : tensor<?x?xf32>) {
+    ^bb0(%arg7: f32, %arg9: f32):
+      %42 = arith.addf %arg7, %arg9 : f32
+      linalg.yield %42 : f32
+    } -> tensor<?x?xf32>
+  return %red : tensor<?x?xf32>
+}
+
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
+    %0 = transform.structured.match ops{["linalg.generic"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+    %1, %2, %3, %loop = transform.structured.tile_reduction_using_for %0
+      by tile_sizes = [0, 5, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op)
+      transform.yield
+  }
+}
+
+// CHECK-DAG: #[[MAP1:.*]] = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
+// CHECK-DAG: #[[MAP2:.*]] = affine_map<(d0, d1, d2) -> (d2, d0, d1)>
+//     CHECK: func @reduction_tile_multi_dim_transpose
+//     CHECK:   tensor.empty(%{{.*}}) : tensor<?x?x5xf32>
+//     CHECK:   linalg.fill {{.*}} : tensor<?x?x5xf32>) -> tensor<?x?x5xf32>
+//     CHECK:   scf.for
+//     CHECK:     %[[K:.*]] = affine.min
+//     CHECK:     %[[EXT:.*]] = tensor.extract_slice %[[ARG3:.*]][0, 0, 0] [%[[D2:.*]], %[[D0:.*]], %[[K]]] [1, 1, 1] : tensor<?x?x5xf32> to tensor<?x?x?xf32>
+//     CHECK:     %[[R:.*]] = linalg.generic {indexing_maps = [#[[MAP1]], #[[MAP2]]], iterator_types = ["parallel", "parallel", "parallel"]} ins(%[[L:.*]] : tensor<?x?x?xf32>) outs(%[[EXT]] : tensor<?x?x?xf32>)
+//     CHECK:     %[[INS:.*]] = tensor.insert_slice %[[R]] into %[[ARG3]][0, 0, 0] [%[[D2]], %[[D0]], %[[K]]] [1, 1, 1] : tensor<?x?x?xf32> into tensor<?x?x5xf32>
+//     CHECK:     scf.yield {{.*}} : tensor<?x?x5xf32>
+//     CHECK:   }
+//     CHECK:   linalg.reduce
+//     CHECK:   return
diff --git a/mlir/test/Dialect/SCF/canonicalize.mlir b/mlir/test/Dialect/SCF/canonicalize.mlir
index 8c4e7a4..828758d 100644
--- a/mlir/test/Dialect/SCF/canonicalize.mlir
+++ b/mlir/test/Dialect/SCF/canonicalize.mlir
@@ -408,6 +408,20 @@ func.func @for_yields_4() -> i32 {
 
 // -----
 
+// CHECK-LABEL: @constant_iter_arg
+func.func @constant_iter_arg(%arg0: index, %arg1: index, %arg2: index) {
+  %c0_i32 = arith.constant 0 : i32
+  // CHECK: scf.for %arg3 = %arg0 to %arg1 step %arg2 {
+  %0 = scf.for %i = %arg0 to %arg1 step %arg2 iter_args(%arg3 = %c0_i32) -> i32 {
+    // CHECK-NEXT: "test.use"(%c0_i32)
+    "test.use"(%arg3) : (i32) -> ()
+    scf.yield %c0_i32 : i32
+  }
+  return
+}
+
+// -----
+
 // CHECK-LABEL: @replace_true_if
 func.func @replace_true_if() {
   %true = arith.constant true
@@ -1789,7 +1803,7 @@ module {
 }
 // CHECK-LABEL: @fold_iter_args_not_being_modified_within_scfforall
 //  CHECK-SAME:   (%{{.*}}: index, %[[ARG1:.*]]: tensor<?xf32>, %[[ARG2:.*]]: tensor<?xf32>) -> (tensor<?xf32>, tensor<?xf32>) {
-//       CHECK:    %[[RESULT:.*]] = scf.forall 
+//       CHECK:    %[[RESULT:.*]] = scf.forall
 //  CHECK-SAME:                       shared_outs(%[[ITER_ARG_5:.*]] = %[[ARG2]]) -> (tensor<?xf32>) {
 //       CHECK:      %[[OPERAND0:.*]] = tensor.extract_slice %[[ARG1]]
 //       CHECK:      %[[OPERAND1:.*]] = tensor.extract_slice %[[ITER_ARG_5]]
@@ -1832,7 +1846,7 @@ module {
 }
 // CHECK-LABEL: @fold_iter_args_with_no_use_of_result_scfforall
 //  CHECK-SAME:   (%{{.*}}: index, %[[ARG1:.*]]: tensor<?xf32>, %[[ARG2:.*]]: tensor<?xf32>, %[[ARG3:.*]]: tensor<?xf32>) -> tensor<?xf32> {
-//       CHECK:    %[[RESULT:.*]] = scf.forall 
+//       CHECK:    %[[RESULT:.*]] = scf.forall
 //  CHECK-SAME:                       shared_outs(%[[ITER_ARG_6:.*]] = %[[ARG2]]) -> (tensor<?xf32>) {
 //       CHECK:      %[[OPERAND0:.*]] = tensor.extract_slice %[[ARG1]]
 //       CHECK:      %[[OPERAND1:.*]] = tensor.extract_slice %[[ARG3]]
@@ -1856,7 +1870,7 @@ func.func @index_switch_fold() -> (f32, f32) {
     %y = arith.constant 42.0 : f32
     scf.yield %y : f32
   }
-  
+
   %switch_cst_2 = arith.constant 2: index
   %1 = scf.index_switch %switch_cst_2 -> f32
   case 0 {
@@ -1867,7 +1881,7 @@ func.func @index_switch_fold() -> (f32, f32) {
     %y = arith.constant 42.0 : f32
     scf.yield %y : f32
   }
-  
+
   return %0, %1 : f32, f32
 }
 
diff --git a/mlir/test/Dialect/Tensor/canonicalize.mlir b/mlir/test/Dialect/Tensor/canonicalize.mlir
index e8fc4ce..01d1487 100644
--- a/mlir/test/Dialect/Tensor/canonicalize.mlir
+++ b/mlir/test/Dialect/Tensor/canonicalize.mlir
@@ -2786,6 +2786,7 @@ func.func @fold_cast_multiple_results(%arg0: tensor<2x2xf32>, %arg1: tensor<2x2x
   %0:2 = test.destination_style_op ins(%cast : tensor<?x2xf32>) outs(%cast_0 : tensor<?x2xf32>) -> tensor<?x2xf32>, index
   return %0#1 : index
 }
+
 // -----
 
 // CHECK-LABEL:   func.func @fold_cast_pack_dynamic_tile_size
@@ -2794,7 +2795,7 @@ func.func @fold_cast_multiple_results(%arg0: tensor<2x2xf32>, %arg1: tensor<2x2x
 // CHECK-SAME:      %[[PAD:.*]]: i32) -> tensor<1x1x8x1xi32> {
 // CHECK:           %[[PACK:.*]] = tensor.pack %[[SRC]] padding_value(%[[PAD]] : i32)
 // CHECK-SAME:        inner_dims_pos = [0, 1] inner_tiles = [8, 1] into %[[DEST]]
-// CHECK-SAME:        some_attr
+// CHECK-SAME:        test_attr
 // CHECK-SAME:        : tensor<7x?xi32> -> tensor<1x1x8x1xi32>
 // CHECK:           return %[[PACK]] : tensor<1x1x8x1xi32>
 func.func @fold_cast_pack_dynamic_tile_size(
@@ -2807,13 +2808,33 @@ func.func @fold_cast_pack_dynamic_tile_size(
     %pack = tensor.pack %src padding_value(%pad : i32)
       inner_dims_pos = [0, 1]
       inner_tiles = [%c8, 1]
-      into %cast {some_attr} : tensor<7x?xi32> -> tensor<1x1x?x1xi32>
+      into %cast {test_attr} : tensor<7x?xi32> -> tensor<1x1x?x1xi32>
     %res = tensor.cast %pack : tensor<1x1x?x1xi32> to tensor<1x1x8x1xi32>
     return %res : tensor<1x1x8x1xi32>
 }
 
 // -----
 
+// CHECK-LABEL:   func.func @fold_cast_unpack_dynamic_tile_size(
+// CHECK-SAME:      %[[SRC:.*]]: tensor<1x1x8x1xi32>,
+// CHECK-SAME:      %[[DEST:.*]]: tensor<7x?xi32>) -> tensor<7x?xi32> {
+// CHECK:           %[[RES:.*]] = tensor.unpack %[[SRC]] inner_dims_pos = [0, 1] inner_tiles = [8, 1] into %[[DEST]] {test_attr} : tensor<1x1x8x1xi32> -> tensor<7x?xi32>
+// CHECK:           return %[[RES]] : tensor<7x?xi32>
+func.func @fold_cast_unpack_dynamic_tile_size(
+  %src: tensor<1x1x8x1xi32>,
+  %res: tensor<7x?xi32>) -> tensor<7x?xi32> {
+
+    %cast = tensor.cast %src : tensor<1x1x8x1xi32> to tensor<1x1x?x1xi32>
+    %c8 = arith.constant 8 : index
+    %unpack = tensor.unpack %cast
+      inner_dims_pos = [0, 1]
+      inner_tiles = [%c8, 1]
+      into %res {test_attr} : tensor<1x1x?x1xi32> -> tensor<7x?xi32>
+    return %unpack : tensor<7x?xi32>
+}
+
+// -----
+
 // CHECK-LABEL:   func.func @pack_dont_drop_attributes(
 // CHECK: tensor.pack {{.*}}  {test_attr}
 func.func @pack_dont_drop_attributes(%arg0: tensor<?x?x?xf16>, %arg1: tensor<128x?x100x16x1xf16>) -> tensor<128x?x100x16x1xf16> {
diff --git a/mlir/test/Dialect/Tensor/invalid.mlir b/mlir/test/Dialect/Tensor/invalid.mlir
index 83cb4b9..1de3e28 100644
--- a/mlir/test/Dialect/Tensor/invalid.mlir
+++ b/mlir/test/Dialect/Tensor/invalid.mlir
@@ -699,7 +699,7 @@ func.func @pack_invalid_output_rank(%input: tensor<256x128xf32>, %output: tensor
 
 // -----
 
-func.func @pack_invalid_output_rank(%input: tensor<256x128xf32>, %output: tensor<64x32x16xf32>) -> tensor<256x128xf32> {
+func.func @unpack_invalid_output_rank(%input: tensor<256x128xf32>, %output: tensor<64x32x16xf32>) -> tensor<256x128xf32> {
   // expected-error@+1 {{packed rank != (unpacked rank + num tiling factors), got 3 != 4}}
   %0 = tensor.unpack %output inner_dims_pos = [0, 1] inner_tiles = [32, 16] into %input : tensor<64x32x16xf32> -> tensor<256x128xf32>
   return %0 : tensor<256x128xf32>
diff --git a/mlir/test/Dialect/Tosa/canonicalize.mlir b/mlir/test/Dialect/Tosa/canonicalize.mlir
index 67cd01f..60121bb 100644
--- a/mlir/test/Dialect/Tosa/canonicalize.mlir
+++ b/mlir/test/Dialect/Tosa/canonicalize.mlir
@@ -162,7 +162,7 @@ func.func @conv2d_stride_2(%arg0: tensor<4x10x10x2xf32>) -> tensor<4x10x10x3xf32
   // CHECK: tosa.conv2d
   %weight = "tosa.const"() {value = dense<[[[[1.0, 1.0]]], [[[1.0, 1.0]]], [[[1.0, 1.0]]]]> : tensor<3x1x1x2xf32>} : ()-> tensor<3x1x1x2xf32>
   %bias = "tosa.const"() {value = dense<0.0> : tensor<3xf32>} : ()-> tensor<3xf32>
-  %0 = tosa.conv2d %arg0, %weight, %bias {pad = array<i64: 0, 0, 0, 0>, stride = array<i64: 2, 2>, dilation = array<i64: 1, 1>} : (tensor<4x10x10x2xf32>, tensor<3x1x1x2xf32>, tensor<3xf32>) -> tensor<4x10x10x3xf32>
+  %0 = tosa.conv2d %arg0, %weight, %bias {acc_type = f32, pad = array<i64: 0, 0, 0, 0>, stride = array<i64: 2, 2>, dilation = array<i64: 1, 1>} : (tensor<4x10x10x2xf32>, tensor<3x1x1x2xf32>, tensor<3xf32>) -> tensor<4x10x10x3xf32>
   return %0 : tensor<4x10x10x3xf32>
 }
 
@@ -173,7 +173,7 @@ func.func @conv2d_weight_2x2(%arg0: tensor<4x10x10x1xf32>) -> tensor<4x10x10x1xf
   // CHECK: tosa.conv2d
   %weight = "tosa.const"() {value = dense<[[[[1.0], [1.0]], [[1.0], [1.0]]]]> : tensor<1x2x2x1xf32>} : ()-> tensor<1x2x2x1xf32>
   %bias = "tosa.const"() {value = dense<0.0> : tensor<1xf32>} : ()-> tensor<1xf32>
-  %0 = tosa.conv2d %arg0, %weight, %bias {pad = array<i64: 0, 0, 0, 0>, stride = array<i64: 1, 1>, dilation = array<i64: 1, 1>} : (tensor<4x10x10x1xf32>, tensor<1x2x2x1xf32>, tensor<1xf32>) -> tensor<4x10x10x1xf32>
+  %0 = tosa.conv2d %arg0, %weight, %bias {acc_type = f32, pad = array<i64: 0, 0, 0, 0>, stride = array<i64: 1, 1>, dilation = array<i64: 1, 1>} : (tensor<4x10x10x1xf32>, tensor<1x2x2x1xf32>, tensor<1xf32>) -> tensor<4x10x10x1xf32>
   return %0 : tensor<4x10x10x1xf32>
 }
 
@@ -182,7 +182,7 @@ func.func @conv2d_weight_2x2(%arg0: tensor<4x10x10x1xf32>) -> tensor<4x10x10x1xf
 // CHECK-LABEL: @depthwise_conv2d_stride_2
 func.func @depthwise_conv2d_stride_2(%arg0: tensor<4x10x10x2xf32>, %arg1: tensor<1x1x2x3xf32>, %arg2: tensor<6xf32>) -> tensor<4x10x10x6xf32> {
   // CHECK: tosa.depthwise_conv2d
-  %0 = tosa.depthwise_conv2d %arg0, %arg1, %arg2 {pad = array<i64: 0, 0, 0, 0>, stride = array<i64: 2, 2>, dilation = array<i64: 1, 1>} : (tensor<4x10x10x2xf32>, tensor<1x1x2x3xf32>, tensor<6xf32>) -> tensor<4x10x10x6xf32>
+  %0 = tosa.depthwise_conv2d %arg0, %arg1, %arg2 {acc_type = f32, pad = array<i64: 0, 0, 0, 0>, stride = array<i64: 2, 2>, dilation = array<i64: 1, 1>} : (tensor<4x10x10x2xf32>, tensor<1x1x2x3xf32>, tensor<6xf32>) -> tensor<4x10x10x6xf32>
   return %0 : tensor<4x10x10x6xf32>
 }
 
@@ -191,7 +191,7 @@ func.func @depthwise_conv2d_stride_2(%arg0: tensor<4x10x10x2xf32>, %arg1: tensor
 // CHECK-LABEL: @depthwise_conv2d_weight_2x2
 func.func @depthwise_conv2d_weight_2x2(%arg0: tensor<4x10x10x2xf32>, %arg1: tensor<2x2x2x3xf32>, %arg2: tensor<6xf32>) -> tensor<4x10x10x6xf32> {
   // CHECK: tosa.depthwise_conv2d
-  %0 = tosa.depthwise_conv2d %arg0, %arg1, %arg2 {pad = array<i64: 0, 0, 0, 0>, stride = array<i64: 1, 1>, dilation = array<i64: 1, 1>} : (tensor<4x10x10x2xf32>, tensor<2x2x2x3xf32>, tensor<6xf32>) -> tensor<4x10x10x6xf32>
+  %0 = tosa.depthwise_conv2d %arg0, %arg1, %arg2 {acc_type = f32, pad = array<i64: 0, 0, 0, 0>, stride = array<i64: 1, 1>, dilation = array<i64: 1, 1>} : (tensor<4x10x10x2xf32>, tensor<2x2x2x3xf32>, tensor<6xf32>) -> tensor<4x10x10x6xf32>
   return %0 : tensor<4x10x10x6xf32>
 }
 
@@ -210,8 +210,8 @@ func.func @max_pool2d_is_noop(%arg0: tensor<10x1x1x3xf32>) -> tensor<10x1x1x3xf3
 // CHECK-LABEL: @pad_noop
 func.func @pad_noop(%arg0: tensor<?x?xf32>) -> tensor<?x?xf32> {
   // CHECK: return %arg0
-  %0 = "tosa.const"() { value = dense<0> : tensor<2x2xi32>} : () -> tensor<2x2xi32>
-  %1 = tosa.pad %arg0, %0 : (tensor<?x?xf32>, tensor<2x2xi32>) -> tensor<?x?xf32>
+  %0 = "tosa.const"() { value = dense<0> : tensor<4xi32>} : () -> tensor<4xi32>
+  %1 = tosa.pad %arg0, %0 : (tensor<?x?xf32>, tensor<4xi32>) -> tensor<?x?xf32>
   return %1 : tensor<?x?xf32>
 }
 
@@ -221,8 +221,8 @@ func.func @pad_noop(%arg0: tensor<?x?xf32>) -> tensor<?x?xf32> {
 func.func @pad_noop_padding_mismatch_nofold(%arg0: tensor<?x?xf32>) -> tensor<?x?xf32> {
   // CHECK: %[[PAD:.+]] = tosa.pad
   // CHECK: return %[[PAD]]
-  %0 = "tosa.const"() { value = dense_resource<__elided__> : tensor<2x2xi32>} : () -> tensor<2x2xi32>
-  %1 = tosa.pad %arg0, %0 : (tensor<?x?xf32>, tensor<2x2xi32>) -> tensor<?x?xf32>
+  %0 = "tosa.const"() { value = dense_resource<__elided__> : tensor<4xi32>} : () -> tensor<4xi32>
+  %1 = tosa.pad %arg0, %0 : (tensor<?x?xf32>, tensor<4xi32>) -> tensor<?x?xf32>
   return %1 : tensor<?x?xf32>
 }
 
@@ -234,42 +234,39 @@ func.func @pad_noop_type_mismatch_nofold(%arg0: tensor<10xf32>) -> tensor<?xf32>
   // CHECK: return %[[PAD]]
 
   %c0_i32 = arith.constant 0 : i32
-  %shape = tensor.from_elements %c0_i32, %c0_i32 : tensor<1x2xi32>
+  %shape = tensor.from_elements %c0_i32, %c0_i32 : tensor<2xi32>
 
-  %0 = tosa.pad %arg0, %shape : (tensor<10xf32>, tensor<1x2xi32>) -> tensor<?xf32>
+  %0 = tosa.pad %arg0, %shape : (tensor<10xf32>, tensor<2xi32>) -> tensor<?xf32>
   return %0 : tensor<?xf32>
 }
 
 // -----
 
 // CHECK-LABEL: @pad_determine_val_i32
-func.func @pad_determine_val_i32(%arg0: tensor<?x?xi32>, %arg1 : tensor<2x2xi32>) -> tensor<?x?xi32> {
+func.func @pad_determine_val_i32(%arg0: tensor<?x?xi32>, %arg1 : tensor<4xi32>) -> tensor<?x?xi32> {
   // CHECK: %[[ZERO:.+]] = "tosa.const"() <{value = dense<0> : tensor<i32>}
   // CHECK: tosa.pad %arg0, %arg1, %[[ZERO]]
-  %0 = "tosa.const"() { value = dense<[[1, 0], [0, 1]]> : tensor<2x2xi32>} : () -> tensor<2x2xi32>
-  %1 = tosa.pad %arg0, %arg1 : (tensor<?x?xi32>, tensor<2x2xi32>) -> tensor<?x?xi32>
+  %1 = tosa.pad %arg0, %arg1 : (tensor<?x?xi32>, tensor<4xi32>) -> tensor<?x?xi32>
   return %1 : tensor<?x?xi32>
 }
 
 // -----
 
 // CHECK-LABEL: @pad_determine_val_f32
-func.func @pad_determine_val_f32(%arg0: tensor<?x?xf32>, %arg1 : tensor<2x2xi32>) -> tensor<?x?xf32> {
+func.func @pad_determine_val_f32(%arg0: tensor<?x?xf32>, %arg1 : tensor<4xi32>) -> tensor<?x?xf32> {
   // CHECK: %[[ZERO:.+]] = "tosa.const"() <{value = dense<0.000000e+00> : tensor<f32>}
   // CHECK: tosa.pad %arg0, %arg1, %[[ZERO]]
-  %0 = "tosa.const"() { value = dense<[[1, 0], [0, 1]]> : tensor<2x2xi32>} : () -> tensor<2x2xi32>
-  %1 = tosa.pad %arg0, %arg1 : (tensor<?x?xf32>, tensor<2x2xi32>) -> tensor<?x?xf32>
+  %1 = tosa.pad %arg0, %arg1 : (tensor<?x?xf32>, tensor<4xi32>) -> tensor<?x?xf32>
   return %1 : tensor<?x?xf32>
 }
 
 // -----
 
 // CHECK-LABEL: @pad_determine_val_quant
-func.func @pad_determine_val_quant(%arg0: tensor<?x?xi32>, %arg1 : tensor<2x2xi32>) -> tensor<?x?xi32> {
+func.func @pad_determine_val_quant(%arg0: tensor<?x?xi32>, %arg1 : tensor<4xi32>) -> tensor<?x?xi32> {
   // CHECK: %[[ZERO:.+]] = "tosa.const"() <{value = dense<42> : tensor<i32>}
   // CHECK: tosa.pad %arg0, %arg1, %[[ZERO]]
-  %0 = "tosa.const"() { value = dense<[[1, 0], [0, 1]]> : tensor<2x2xi32>} : () -> tensor<2x2xi32>
-  %1 = tosa.pad %arg0, %arg1 {quantization_info = #tosa.pad_quant<input_zp = 42>} : (tensor<?x?xi32>, tensor<2x2xi32>) -> tensor<?x?xi32>
+  %1 = tosa.pad %arg0, %arg1 {quantization_info = #tosa.pad_quant<input_zp = 42>} : (tensor<?x?xi32>, tensor<4xi32>) -> tensor<?x?xi32>
   return %1 : tensor<?x?xi32>
 }
 
diff --git a/mlir/test/Dialect/Tosa/constant-op-fold.mlir b/mlir/test/Dialect/Tosa/constant-op-fold.mlir
index 2902c4a..8198903 100644
--- a/mlir/test/Dialect/Tosa/constant-op-fold.mlir
+++ b/mlir/test/Dialect/Tosa/constant-op-fold.mlir
@@ -117,15 +117,6 @@ func.func @transpose_nofold_multi_users() -> (tensor<3x2xf32>, tensor<2x3xf32>)
   return %1, %input : tensor<3x2xf32>, tensor<2x3xf32>
 }
 
-// CHECK-LABEL: @transpose_nofold_quantized_types
-func.func @transpose_nofold_quantized_types() -> tensor<1x1x2x2x!quant.uniform<i8<-127:127>:f32:3, {1.000000e-01,1.000000e-01}>> {
-  %perms = "tosa.const"() {value = dense<[1, 2, 3, 0]> : tensor<4xi32>} : () -> tensor<4xi32>
-  %input = "tosa.const"() {value = dense<-127> : tensor<2x1x1x2xi8>} : () -> tensor<2x1x1x2xi8>
-  // CHECK: tosa.transpose
-  %0 = tosa.transpose %input, %perms : (tensor<2x1x1x2xi8>, tensor<4xi32>) -> tensor<1x1x2x2x!quant.uniform<i8<-127:127>:f32:3, {1.000000e-01,1.000000e-01}>>
-  return %0: tensor<1x1x2x2x!quant.uniform<i8<-127:127>:f32:3, {1.000000e-01,1.000000e-01}>>
-}
-
 // CHECK-LABEL: @transpose_nofold_dense_resource
 func.func @transpose_nofold_dense_resource() -> tensor<2x2xf32> {
   %0 = "tosa.const"() <{value = dense_resource<resource> : tensor<2x2xf32>}> : () -> tensor<2x2xf32>
diff --git a/mlir/test/Dialect/Tosa/invalid.mlir b/mlir/test/Dialect/Tosa/invalid.mlir
index cca50b2..a6d57f8 100644
--- a/mlir/test/Dialect/Tosa/invalid.mlir
+++ b/mlir/test/Dialect/Tosa/invalid.mlir
@@ -25,7 +25,7 @@ func.func @test_const_non_tensor_attr() {
 
 func.func @test_conv2d(%arg0: tensor<1x29x29x4xf32>, %arg1: tensor<16x3x3x4xi8>, %arg2: tensor<16xi8>) -> tensor<1x27x27x16xi8> {
   // expected-error@+1 {{expect both input and weight to be float or not together, got 'f32' and 'i8'}}
-  %0 = tosa.conv2d %arg0, %arg1, %arg2 {dilation = array<i64: 1, 1>, pad = array<i64: 0, 0, 0, 0>, stride = array<i64: 1, 1>}
+  %0 = tosa.conv2d %arg0, %arg1, %arg2 {acc_type = i32, dilation = array<i64: 1, 1>, pad = array<i64: 0, 0, 0, 0>, stride = array<i64: 1, 1>}
            : (tensor<1x29x29x4xf32>, tensor<16x3x3x4xi8>, tensor<16xi8>) -> tensor<1x27x27x16xi8>
   return %0 : tensor<1x27x27x16xi8>
 }
@@ -34,7 +34,7 @@ func.func @test_conv2d(%arg0: tensor<1x29x29x4xf32>, %arg1: tensor<16x3x3x4xi8>,
 
 func.func @test_conv2d(%arg0: tensor<*xi8>, %arg1: tensor<16x3x3x4xi8>, %arg2: tensor<16xi8>) -> tensor<1x27x27x16xi8> {
   // expected-error@+1 {{expect a ranked tensor for input, got <block argument> of type 'tensor<*xi8>' at index: 0}}
-  %0 = tosa.conv2d %arg0, %arg1, %arg2 {dilation = array<i64: 1, 1>, pad = array<i64: 0, 0, 0, 0>, stride = array<i64: 1, 1>}
+  %0 = tosa.conv2d %arg0, %arg1, %arg2 {acc_type = i32, dilation = array<i64: 1, 1>, pad = array<i64: 0, 0, 0, 0>, stride = array<i64: 1, 1>}
            : (tensor<*xi8>, tensor<16x3x3x4xi8>, tensor<16xi8>) -> tensor<1x27x27x16xi8>
   return %0 : tensor<1x27x27x16xi8>
 }
@@ -43,7 +43,7 @@ func.func @test_conv2d(%arg0: tensor<*xi8>, %arg1: tensor<16x3x3x4xi8>, %arg2: t
 
 func.func @test_conv2d(%arg0: tensor<1x29x29x4xi8>, %arg1: tensor<*xi8>, %arg2: tensor<16xi8>) -> tensor<1x27x27x16xi8> {
   // expected-error@+1 {{'tosa.conv2d' op operand #1 must be 4D tensor of 4-bit signless integer or 8-bit signless integer or Quint8 type or Qint4 type or Qint8 type or Qint16 type or Qint32 type or floating-point values, but got 'tensor<*xi8>'}}
-  %0 = tosa.conv2d %arg0, %arg1, %arg2 {dilation = array<i64: 1, 1>, pad = array<i64: 0, 0, 0, 0>, stride = array<i64: 1, 1>}
+  %0 = tosa.conv2d %arg0, %arg1, %arg2 {acc_type = i32, dilation = array<i64: 1, 1>, pad = array<i64: 0, 0, 0, 0>, stride = array<i64: 1, 1>}
            : (tensor<1x29x29x4xi8>, tensor<*xi8>, tensor<16xi8>) -> tensor<1x27x27x16xi8>
   return %0 : tensor<1x27x27x16xi8>
 }
@@ -52,13 +52,101 @@ func.func @test_conv2d(%arg0: tensor<1x29x29x4xi8>, %arg1: tensor<*xi8>, %arg2:
 
 func.func @test_conv2d(%arg0: tensor<1x29x29x4xi8>, %arg1: tensor<16x3x3x4xi8>, %arg2: tensor<16xi8>) -> tensor<1x27x27x16xi8> {
   // expected-error@+1 {{'tosa.conv2d' op quantizationattr is required for quantized type, and not allowed for float type}}
-  %0 = tosa.conv2d %arg0, %arg1, %arg2 {dilation = array<i64: 1, 1>, pad = array<i64: 0, 0, 0, 0>, stride = array<i64: 1, 1>}
+  %0 = tosa.conv2d %arg0, %arg1, %arg2 {acc_type = f16, dilation = array<i64: 1, 1>, pad = array<i64: 0, 0, 0, 0>, stride = array<i64: 1, 1>}
            : (tensor<1x29x29x4xi8>, tensor<16x3x3x4xi8>, tensor<16xi8>) -> tensor<1x27x27x16xi8>
   return %0 : tensor<1x27x27x16xi8>
 }
 
 // -----
 
+func.func @test_conv2d_acc_type(%arg0: tensor<1x29x29x4xi8>, %arg1: tensor<16x3x3x4xi8>, %arg2: tensor<16xi8>) -> tensor<1x27x27x16xi8> {
+  // expected-error@+1 {{'tosa.conv2d' op accumulator type for i8 tensor is not i32}}
+  %0 = tosa.conv2d %arg0, %arg1, %arg2 {acc_type = f16, dilation = array<i64: 1, 1>, pad = array<i64: 0, 0, 0, 0>, stride = array<i64: 1, 1>, quantization_info = #tosa.conv_quant<input_zp = 0, weight_zp = 0>}
+           : (tensor<1x29x29x4xi8>, tensor<16x3x3x4xi8>, tensor<16xi8>) -> tensor<1x27x27x16xi8>
+  return %0 : tensor<1x27x27x16xi8>
+}
+
+// -----
+
+func.func @test_conv2d_acc_type(%arg0: tensor<1x29x29x4xi16>, %arg1: tensor<16x3x3x4xi8>, %arg2: tensor<16xi16>) -> tensor<1x27x27x16xi16> {
+  // expected-error@+1 {{'tosa.conv2d' op accumulator type for i16 tensor is not i48}}
+  %0 = tosa.conv2d %arg0, %arg1, %arg2 {acc_type = f16, dilation = array<i64: 1, 1>, pad = array<i64: 0, 0, 0, 0>, stride = array<i64: 1, 1>, quantization_info = #tosa.conv_quant<input_zp = 0, weight_zp = 0>}
+           : (tensor<1x29x29x4xi16>, tensor<16x3x3x4xi8>, tensor<16xi16>) -> tensor<1x27x27x16xi16>
+  return %0 : tensor<1x27x27x16xi16>
+}
+
+// -----
+
+func.func @test_conv2d_acc_type(%arg0: tensor<1x29x29x4xf8E5M2>, %arg1: tensor<16x3x3x4xf8E5M2>, %arg2: tensor<16xf16>) -> tensor<1x27x27x16xf16> {
+  // expected-error@+1 {{'tosa.conv2d' op accumulator type for f8 tensor is not f16}}
+  %0 = tosa.conv2d %arg0, %arg1, %arg2 {acc_type = i32, dilation = array<i64: 1, 1>, pad = array<i64: 0, 0, 0, 0>, stride = array<i64: 1, 1>}
+           : (tensor<1x29x29x4xf8E5M2>, tensor<16x3x3x4xf8E5M2>, tensor<16xf16>) -> tensor<1x27x27x16xf16>
+  return %0 : tensor<1x27x27x16xf16>
+}
+
+// -----
+
+func.func @test_conv2d_acc_type(%arg0: tensor<1x29x29x4xf8E4M3>, %arg1: tensor<16x3x3x4xf8E4M3>, %arg2: tensor<16xf16>) -> tensor<1x27x27x16xf16> {
+  // expected-error@+1 {{'tosa.conv2d' op accumulator type for f8 tensor is not f16}}
+  %0 = tosa.conv2d %arg0, %arg1, %arg2 {acc_type = i32, dilation = array<i64: 1, 1>, pad = array<i64: 0, 0, 0, 0>, stride = array<i64: 1, 1>}
+           : (tensor<1x29x29x4xf8E4M3>, tensor<16x3x3x4xf8E4M3>, tensor<16xf16>) -> tensor<1x27x27x16xf16>
+  return %0 : tensor<1x27x27x16xf16>
+}
+
+// -----
+
+func.func @test_conv2d_acc_type(%arg0: tensor<1x29x29x4xf16>, %arg1: tensor<16x3x3x4xf16>, %arg2: tensor<16xf16>) -> tensor<1x27x27x16xf16> {
+  // expected-error@+1 {{'tosa.conv2d' op accumulator type for f16 tensor is not f16/f32}}
+  %0 = tosa.conv2d %arg0, %arg1, %arg2 {acc_type = i32, dilation = array<i64: 1, 1>, pad = array<i64: 0, 0, 0, 0>, stride = array<i64: 1, 1>}
+           : (tensor<1x29x29x4xf16>, tensor<16x3x3x4xf16>, tensor<16xf16>) -> tensor<1x27x27x16xf16>
+  return %0 : tensor<1x27x27x16xf16>
+}
+
+// -----
+
+func.func @test_conv2d_acc_type(%arg0: tensor<1x29x29x4xbf16>, %arg1: tensor<16x3x3x4xbf16>, %arg2: tensor<16xbf16>) -> tensor<1x27x27x16xbf16> {
+  // expected-error@+1 {{'tosa.conv2d' op accumulator type for bf16 tensor is not f32}}
+  %0 = tosa.conv2d %arg0, %arg1, %arg2 {acc_type = i32, dilation = array<i64: 1, 1>, pad = array<i64: 0, 0, 0, 0>, stride = array<i64: 1, 1>}
+           : (tensor<1x29x29x4xbf16>, tensor<16x3x3x4xbf16>, tensor<16xbf16>) -> tensor<1x27x27x16xbf16>
+  return %0 : tensor<1x27x27x16xbf16>
+}
+
+// -----
+
+func.func @test_conv2d_acc_type(%arg0: tensor<1x29x29x4xf32>, %arg1: tensor<16x3x3x4xf32>, %arg2: tensor<16xf32>) -> tensor<1x27x27x16xf32> {
+  // expected-error@+1 {{'tosa.conv2d' op accumulator type for f32 tensor is not f32}}
+  %0 = tosa.conv2d %arg0, %arg1, %arg2 {acc_type = i32, dilation = array<i64: 1, 1>, pad = array<i64: 0, 0, 0, 0>, stride = array<i64: 1, 1>}
+           : (tensor<1x29x29x4xf32>, tensor<16x3x3x4xf32>, tensor<16xf32>) -> tensor<1x27x27x16xf32>
+  return %0 : tensor<1x27x27x16xf32>
+}
+
+// -----
+
+func.func @test_conv3d_acc_type(%arg0: tensor<1x4x8x21x17xi8>, %arg1: tensor<34x1x1x1x17xi8>, %arg2: tensor<34xi8>) -> tensor<1x4x8x21x34xi8> {
+  // expected-error@+1 {{'tosa.conv3d' op accumulator type for i8 tensor is not i32}}
+  %0 = tosa.conv3d %arg0, %arg1, %arg2 {acc_type = f16, dilation = array<i64: 1, 1, 1>, pad = array<i64: 0, 0, 0, 0, 0, 0>, stride = array<i64: 1, 1, 1>, quantization_info = #tosa.conv_quant<input_zp = 0, weight_zp = 0>}
+           : (tensor<1x4x8x21x17xi8>, tensor<34x1x1x1x17xi8>, tensor<34xi8>) -> tensor<1x4x8x21x34xi8>
+  return %0 : tensor<1x4x8x21x34xi8>
+}
+
+// -----
+
+func.func @test_depthwise_conv2d_acc_type(%arg0: tensor<1x4x4x4xi8>, %arg1: tensor<1x1x4x2xi8>, %arg2: tensor<8xi8>) -> tensor<1x4x4x8xi8> {
+  // expected-error@+1 {{'tosa.depthwise_conv2d' op accumulator type for i8 tensor is not i32}}
+  %0 = tosa.depthwise_conv2d %arg0, %arg1, %arg2 {acc_type = f16, dilation = array<i64: 1, 1>, pad = array<i64: 0, 0, 0, 0>, stride = array<i64: 1, 1>, quantization_info = #tosa.conv_quant<input_zp = 0, weight_zp = 0>} : (tensor<1x4x4x4xi8>, tensor<1x1x4x2xi8>, tensor<8xi8>) -> tensor<1x4x4x8xi8>
+  return %0 : tensor<1x4x4x8xi8>
+}
+
+// -----
+
+func.func @test_transpose_conv2d(%arg0: tensor<1x32x32x8xi8>, %arg1: tensor<16x1x1x8xi8>, %arg2: tensor<16xi8>) -> tensor<1x32x32x16xi8> {
+  // expected-error@+1 {{'tosa.transpose_conv2d' op accumulator type for i8 tensor is not i32}}
+  %0 = tosa.transpose_conv2d %arg0, %arg1, %arg2 {acc_type = f16, out_pad = array<i64: 0, 0, 0, 0>, out_shape = array<i64: 1, 32, 32, 16>, stride = array<i64: 1, 1>, quantization_info = #tosa.conv_quant<input_zp = 0, weight_zp = 0>} : (tensor<1x32x32x8xi8>, tensor<16x1x1x8xi8>, tensor<16xi8>) -> tensor<1x32x32x16xi8>
+  return %0 : tensor<1x32x32x16xi8>
+}
+
+// -----
+
 func.func @test_concat(%arg0 : tensor<2x1xf32>, %arg1 : tensor<2x2xf32>) -> tensor<?x?xf32> {
   // expected-error@+2 {{failed to infer returned types}}
   // expected-error@+1 {{Cannot concat tensors with different sizes on the non-axis dimension 1}}
@@ -77,48 +165,56 @@ func.func @test_concat_element_type_mismatch(%arg0 : tensor<1x2xf32>, %arg1 : te
 
 // -----
 
-func.func @test_pad_non_const(%arg0: tensor<13x21x3xf32>, %arg1: tensor<3x2xi32>) -> tensor<13x21x3xf32> {
+func.func @test_pad_non_const(%arg0: tensor<13x21x3xf32>, %arg1: tensor<6xi32>) -> tensor<13x21x3xf32> {
   // expected-error@+1 {{'tosa.pad' op padding of pad is not constant}}
-  %0 = tosa.pad %arg0, %arg1 : (tensor<13x21x3xf32>, tensor<3x2xi32>) -> tensor<13x21x3xf32>
+  %0 = tosa.pad %arg0, %arg1 : (tensor<13x21x3xf32>, tensor<6xi32>) -> tensor<13x21x3xf32>
   return %0 : tensor<13x21x3xf32>
 }
 
 // -----
 
 func.func @test_pad_non_const(%arg0: tensor<13x21x3xi8>, %arg1: tensor<i8>) -> tensor<13x21x3xi8> {
-  %0 = "tosa.const"() {value = dense<[[0, 0], [0, 1], [0, 1]]> : tensor<3x2xi32>} : () -> tensor<3x2xi32>
+  %0 = "tosa.const"() {value = dense<[0, 0, 0, 1, 0, 1]> : tensor<6xi32>} : () -> tensor<6xi32>
   // expected-error@+1 {{'tosa.pad' op pad_const of pad is not constant}}
-  %1 = tosa.pad %arg0, %0, %arg1 : (tensor<13x21x3xi8>, tensor<3x2xi32>, tensor<i8>) -> tensor<13x21x3xi8>
+  %1 = tosa.pad %arg0, %0, %arg1 : (tensor<13x21x3xi8>, tensor<6xi32>, tensor<i8>) -> tensor<13x21x3xi8>
   return %1 : tensor<13x21x3xi8>
 }
 
 // -----
 
-func.func @test_pad_io_rank_mismatch(%arg0: tensor<13x21xf32>, %arg1: tensor<2x2xi32>) {
+func.func @test_pad_io_rank_mismatch(%arg0: tensor<13x21xf32>, %arg1: tensor<4xi32>) {
   // expected-error@+1 {{'tosa.pad' op expect same input and output tensor rank.}}
-  %1 = tosa.pad %arg0, %arg1 : (tensor<13x21xf32>, tensor<2x2xi32>) -> tensor<13x21x3xf32>
+  %1 = tosa.pad %arg0, %arg1 : (tensor<13x21xf32>, tensor<4xi32>) -> tensor<13x21x3xf32>
   return
 }
 
 // -----
 
-func.func @test_pad_invalid_padding_rank(%arg0: tensor<13x21xf32>, %arg1: tensor<2xi32>) {
-  // expected-error@+1 {{'tosa.pad' op expect 'padding' tensor rank equal to 2.}}
-  %1 = tosa.pad %arg0, %arg1 : (tensor<13x21xf32>, tensor<2xi32>) -> tensor<13x21xf32>
+func.func @test_pad_invalid_padding_rank(%arg0: tensor<13x21xf32>, %arg1: tensor<2x2xi32>) {
+  // expected-error@+1 {{'tosa.pad' op operand #1 must be 1D tensor of 32-bit signless integer or 64-bit signless integer values, but got 'tensor<2x2xi32>'}}
+  %1 = tosa.pad %arg0, %arg1 : (tensor<13x21xf32>, tensor<2x2xi32>) -> tensor<13x21xf32>
   return
 }
 
 // -----
 
-func.func @test_pad_invalid_padConst_rank(%arg0: tensor<13x21xf32>, %arg1: tensor<2x2xi32>) {
+func.func @test_pad_invalid_padConst_rank(%arg0: tensor<13x21xf32>, %arg1: tensor<4xi32>) {
   %0 = "tosa.const"() {value = dense<3.14> : tensor<1xf32>} : () -> tensor<1xf32>
   // expected-error@+1 {{'tosa.pad' op operand #2 must be 0D tensor of number values, but got 'tensor<1xf32>'}}
-  %1 = tosa.pad %arg0, %arg1, %0 : (tensor<13x21xf32>, tensor<2x2xi32>, tensor<1xf32>) -> tensor<13x21xf32>
+  %1 = tosa.pad %arg0, %arg1, %0 : (tensor<13x21xf32>, tensor<4xi32>, tensor<1xf32>) -> tensor<13x21xf32>
   return
 }
 
 // -----
 
+func.func @test_pad_padding_shape_mismatch(%arg0: tensor<13x21x3xf32>, %arg1: tensor<4xi32>) -> tensor<13x21x3xf32> {
+  // expected-error@+1 {{'tosa.pad' op expected padding tensor dim 0 to have size 6 (2*rank(shape1)) but got size 4}}
+  %0 = tosa.pad %arg0, %arg1 : (tensor<13x21x3xf32>, tensor<4xi32>) -> tensor<13x21x3xf32>
+  return %0 : tensor<13x21x3xf32>
+}
+
+// -----
+
 func.func @test_transpose_non_const(%arg0: tensor<13x21x3xf32>, %arg1: tensor<3xi32>) -> tensor<3x13x21xf32> {
   // expected-error@+1 {{'tosa.transpose' op perms of transpose is not constant}}
   %0 = tosa.transpose %arg0, %arg1 : (tensor<13x21x3xf32>, tensor<3xi32>) -> tensor<3x13x21xf32>
@@ -206,6 +302,15 @@ func.func @test_transpose_invalid_permutation_types_dynamic_dim_ok(%arg0: tensor
 
 // -----
 
+func.func @test_transpose_element_type_mismatch(%arg0: tensor<2x3xi32>) -> tensor<3x2xf32> {
+  %perms = "tosa.const"() {value = dense<[1, 0]> : tensor<2xi32>} : () -> tensor<2xi32>
+  // expected-error@+1 {{'tosa.transpose' op failed to verify that all of {input1, output} have same element type}}
+  %1 = tosa.transpose %arg0, %perms : (tensor<2x3xi32>, tensor<2xi32>) -> tensor<3x2xf32>
+  return %1 : tensor<3x2xf32>
+}
+
+// -----
+
 func.func @test_fully_connected_non_const(%arg0: tensor<13x21x3xf32>, %arg1: tensor<2x3xf32>) -> tensor<273x2xf32> {
   %0 = "tosa.const"() {value = dense<0.000000e+00> : tensor<2xf32>} : () -> tensor<2xf32>
   %1 = tosa.reshape %arg0 {new_shape = array<i64: 273, 3>} : (tensor<13x21x3xf32>) -> tensor<273x3xf32>
@@ -416,7 +521,7 @@ func.func @test_const_attribute_type_mismatch() -> tensor<100x100xf32> {
 
 func.func @test_conv2d_static_zero_dim_input(%arg0: tensor<1x29x0x4xf32>, %arg1: tensor<16x3x3x4xf32>, %arg2: tensor<16xf32>) -> tensor<1x27x27x16xf32> {
   // expected-error@+1 {{'tosa.conv2d' op operand #0 must be 4-d tosa-conformant tensor, but got 'tensor<1x29x0x4xf32>'}}
-  %0 = "tosa.conv2d"(%arg0, %arg1, %arg2) {dilation = array<i64: 1, 1>, pad = array<i64: 0, 0, 0, 0>, stride = array<i64: 1, 1>}
+  %0 = "tosa.conv2d"(%arg0, %arg1, %arg2) {acc_type = f32, dilation = array<i64: 1, 1>, pad = array<i64: 0, 0, 0, 0>, stride = array<i64: 1, 1>}
            : (tensor<1x29x0x4xf32>, tensor<16x3x3x4xf32>, tensor<16xf32>) -> tensor<1x27x27x16xf32>
   return %0 : tensor<1x27x27x16xf32>
 }
diff --git a/mlir/test/Dialect/Tosa/level_check.mlir b/mlir/test/Dialect/Tosa/level_check.mlir
index 529a16c..ba8ed8a1 100644
--- a/mlir/test/Dialect/Tosa/level_check.mlir
+++ b/mlir/test/Dialect/Tosa/level_check.mlir
@@ -226,7 +226,7 @@ func.func @test_avgpool2d_pad_right(%arg0: tensor<1x32x32x8xf32>) -> tensor<1x32
 
 func.func @test_conv2d_dilation_y(%arg0: tensor<1x32x32x8xf32>, %arg1: tensor<16x2x2x8xf32>, %arg2: tensor<16xf32>) -> tensor<1x32x32x16xf32> {
   // expected-error@+1 {{'tosa.conv2d' op failed level check: dilation_y * KH <= MAX_KERNEL}}
-  %0 = "tosa.conv2d"(%arg0, %arg1, %arg2) {dilation = array<i64: 4097, 1>, pad = array<i64: 0, 1, 0, 1>, stride = array<i64: 1, 1>} :
+  %0 = "tosa.conv2d"(%arg0, %arg1, %arg2) {acc_type = f32, dilation = array<i64: 4097, 1>, pad = array<i64: 0, 1, 0, 1>, stride = array<i64: 1, 1>} :
             (tensor<1x32x32x8xf32>, tensor<16x2x2x8xf32>, tensor<16xf32>) -> tensor<1x32x32x16xf32>
   return %0 : tensor<1x32x32x16xf32>
 }
@@ -235,7 +235,7 @@ func.func @test_conv2d_dilation_y(%arg0: tensor<1x32x32x8xf32>, %arg1: tensor<16
 
 func.func @test_conv2d_dilation_x(%arg0: tensor<1x32x32x8xf32>, %arg1: tensor<16x2x2x8xf32>, %arg2: tensor<16xf32>) -> tensor<1x32x32x16xf32> {
   // expected-error@+1 {{'tosa.conv2d' op failed level check: dilation_x * KW <= MAX_KERNEL}}
-  %0 = "tosa.conv2d"(%arg0, %arg1, %arg2) {dilation = array<i64: 1, 4097>, pad = array<i64: 0, 1, 0, 1>, stride = array<i64: 1, 1>} :
+  %0 = "tosa.conv2d"(%arg0, %arg1, %arg2) {acc_type = f32, dilation = array<i64: 1, 4097>, pad = array<i64: 0, 1, 0, 1>, stride = array<i64: 1, 1>} :
             (tensor<1x32x32x8xf32>, tensor<16x2x2x8xf32>, tensor<16xf32>) -> tensor<1x32x32x16xf32>
   return %0 : tensor<1x32x32x16xf32>
 }
@@ -244,7 +244,7 @@ func.func @test_conv2d_dilation_x(%arg0: tensor<1x32x32x8xf32>, %arg1: tensor<16
 
 func.func @test_conv2d_pad_top(%arg0: tensor<1x32x32x8xf32>, %arg1: tensor<16x2x2x8xf32>, %arg2: tensor<16xf32>) -> tensor<1x32x32x16xf32> {
   // expected-error@+1 {{'tosa.conv2d' op failed level check: pad <= MAX_KERNEL}}
-  %0 = "tosa.conv2d"(%arg0, %arg1, %arg2) {dilation = array<i64: 1, 1>, pad = array<i64: 8193, 1, 0, 1>, stride = array<i64: 1, 1>} :
+  %0 = "tosa.conv2d"(%arg0, %arg1, %arg2) {acc_type = f32, dilation = array<i64: 1, 1>, pad = array<i64: 8193, 1, 0, 1>, stride = array<i64: 1, 1>} :
             (tensor<1x32x32x8xf32>, tensor<16x2x2x8xf32>, tensor<16xf32>) -> tensor<1x32x32x16xf32>
   return %0 : tensor<1x32x32x16xf32>
 }
@@ -253,7 +253,7 @@ func.func @test_conv2d_pad_top(%arg0: tensor<1x32x32x8xf32>, %arg1: tensor<16x2x
 
 func.func @test_conv2d_pad_bottom(%arg0: tensor<1x32x32x8xf32>, %arg1: tensor<16x2x2x8xf32>, %arg2: tensor<16xf32>) -> tensor<1x32x32x16xf32> {
   // expected-error@+1 {{'tosa.conv2d' op failed level check: pad <= MAX_KERNEL}}
-  %0 = "tosa.conv2d"(%arg0, %arg1, %arg2) {dilation = array<i64: 1, 1>, pad = array<i64: 0, 8193, 0, 1>, stride = array<i64: 1, 1>} :
+  %0 = "tosa.conv2d"(%arg0, %arg1, %arg2) {acc_type = f32, dilation = array<i64: 1, 1>, pad = array<i64: 0, 8193, 0, 1>, stride = array<i64: 1, 1>} :
             (tensor<1x32x32x8xf32>, tensor<16x2x2x8xf32>, tensor<16xf32>) -> tensor<1x32x32x16xf32>
   return %0 : tensor<1x32x32x16xf32>
 }
@@ -262,7 +262,7 @@ func.func @test_conv2d_pad_bottom(%arg0: tensor<1x32x32x8xf32>, %arg1: tensor<16
 
 func.func @test_conv2d_pad_left(%arg0: tensor<1x32x32x8xf32>, %arg1: tensor<16x2x2x8xf32>, %arg2: tensor<16xf32>) -> tensor<1x32x32x16xf32> {
   // expected-error@+1 {{'tosa.conv2d' op failed level check: pad <= MAX_KERNEL}}
-  %0 = "tosa.conv2d"(%arg0, %arg1, %arg2) {dilation = array<i64: 1, 1>, pad = array<i64: 0, 1, 8193, 1>, stride = array<i64: 1, 1>} :
+  %0 = "tosa.conv2d"(%arg0, %arg1, %arg2) {acc_type = f32, dilation = array<i64: 1, 1>, pad = array<i64: 0, 1, 8193, 1>, stride = array<i64: 1, 1>} :
             (tensor<1x32x32x8xf32>, tensor<16x2x2x8xf32>, tensor<16xf32>) -> tensor<1x32x32x16xf32>
   return %0 : tensor<1x32x32x16xf32>
 }
@@ -271,7 +271,7 @@ func.func @test_conv2d_pad_left(%arg0: tensor<1x32x32x8xf32>, %arg1: tensor<16x2
 
 func.func @test_conv2d_pad_right(%arg0: tensor<1x32x32x8xf32>, %arg1: tensor<16x2x2x8xf32>, %arg2: tensor<16xf32>) -> tensor<1x32x32x16xf32> {
   // expected-error@+1 {{'tosa.conv2d' op failed level check: pad <= MAX_KERNEL}}
-  %0 = "tosa.conv2d"(%arg0, %arg1, %arg2) {dilation = array<i64: 1, 1>, pad = array<i64: 0, 1, 0, 8193>, stride = array<i64: 1, 1>} :
+  %0 = "tosa.conv2d"(%arg0, %arg1, %arg2) {acc_type = f32, dilation = array<i64: 1, 1>, pad = array<i64: 0, 1, 0, 8193>, stride = array<i64: 1, 1>} :
             (tensor<1x32x32x8xf32>, tensor<16x2x2x8xf32>, tensor<16xf32>) -> tensor<1x32x32x16xf32>
   return %0 : tensor<1x32x32x16xf32>
 }
@@ -280,7 +280,7 @@ func.func @test_conv2d_pad_right(%arg0: tensor<1x32x32x8xf32>, %arg1: tensor<16x
 
 func.func @test_conv2d_stride_y(%arg0: tensor<1x32x32x8xf32>, %arg1: tensor<16x2x2x8xf32>, %arg2: tensor<16xf32>) -> tensor<1x32x32x16xf32> {
   // expected-error@+1 {{'tosa.conv2d' op failed level check: stride <= MAX_STRIDE}}
-  %0 = "tosa.conv2d"(%arg0, %arg1, %arg2) {dilation = array<i64: 1, 1>, pad = array<i64: 0, 1, 0, 1>, stride = array<i64: 8193, 1>} :
+  %0 = "tosa.conv2d"(%arg0, %arg1, %arg2) {acc_type = f32, dilation = array<i64: 1, 1>, pad = array<i64: 0, 1, 0, 1>, stride = array<i64: 8193, 1>} :
             (tensor<1x32x32x8xf32>, tensor<16x2x2x8xf32>, tensor<16xf32>) -> tensor<1x32x32x16xf32>
   return %0 : tensor<1x32x32x16xf32>
 }
@@ -289,7 +289,7 @@ func.func @test_conv2d_stride_y(%arg0: tensor<1x32x32x8xf32>, %arg1: tensor<16x2
 
 func.func @test_conv2d_stride_x(%arg0: tensor<1x32x32x8xf32>, %arg1: tensor<16x2x2x8xf32>, %arg2: tensor<16xf32>) -> tensor<1x32x32x16xf32> {
   // expected-error@+1 {{'tosa.conv2d' op failed level check: stride <= MAX_STRIDE}}
-  %0 = "tosa.conv2d"(%arg0, %arg1, %arg2) {dilation = array<i64: 1, 1>, pad = array<i64: 0, 1, 0, 1>, stride = array<i64: 1, 8193>} :
+  %0 = "tosa.conv2d"(%arg0, %arg1, %arg2) {acc_type = f32, dilation = array<i64: 1, 1>, pad = array<i64: 0, 1, 0, 1>, stride = array<i64: 1, 8193>} :
             (tensor<1x32x32x8xf32>, tensor<16x2x2x8xf32>, tensor<16xf32>) -> tensor<1x32x32x16xf32>
   return %0 : tensor<1x32x32x16xf32>
 }
@@ -298,7 +298,7 @@ func.func @test_conv2d_stride_x(%arg0: tensor<1x32x32x8xf32>, %arg1: tensor<16x2
 
 func.func @test_conv3d_dilation_d(%arg0: tensor<1x1x32x32x8xf32>, %arg1: tensor<16x2x2x2x8xf32>, %arg2: tensor<16xf32>) -> tensor<1x1x32x32x16xf32> {
   // expected-error@+1 {{'tosa.conv3d' op failed level check: dilation_d * KD <= MAX_KERNEL}}
-  %0 = "tosa.conv3d"(%arg0, %arg1, %arg2) {dilation = array<i64: 4097, 1, 1>, pad = array<i64: 0, 1, 0, 1, 0, 1>, stride = array<i64: 1, 1, 1>} :
+  %0 = "tosa.conv3d"(%arg0, %arg1, %arg2) {acc_type = f32, dilation = array<i64: 4097, 1, 1>, pad = array<i64: 0, 1, 0, 1, 0, 1>, stride = array<i64: 1, 1, 1>} :
             (tensor<1x1x32x32x8xf32>, tensor<16x2x2x2x8xf32>, tensor<16xf32>) -> tensor<1x1x32x32x16xf32>
   return %0 : tensor<1x1x32x32x16xf32>
 }
@@ -307,7 +307,7 @@ func.func @test_conv3d_dilation_d(%arg0: tensor<1x1x32x32x8xf32>, %arg1: tensor<
 
 func.func @test_conv3d_dilation_y(%arg0: tensor<1x1x32x32x8xf32>, %arg1: tensor<16x2x2x2x8xf32>, %arg2: tensor<16xf32>) -> tensor<1x1x32x32x16xf32> {
   // expected-error@+1 {{'tosa.conv3d' op failed level check: dilation_y * KH <= MAX_KERNEL}}
-  %0 = "tosa.conv3d"(%arg0, %arg1, %arg2) {dilation = array<i64: 1, 4097, 1>, pad = array<i64: 0, 1, 0, 1, 0, 1>, stride = array<i64: 1, 1, 1>} :
+  %0 = "tosa.conv3d"(%arg0, %arg1, %arg2) {acc_type = f32, dilation = array<i64: 1, 4097, 1>, pad = array<i64: 0, 1, 0, 1, 0, 1>, stride = array<i64: 1, 1, 1>} :
             (tensor<1x1x32x32x8xf32>, tensor<16x2x2x2x8xf32>, tensor<16xf32>) -> tensor<1x1x32x32x16xf32>
   return %0 : tensor<1x1x32x32x16xf32>
 }
@@ -316,7 +316,7 @@ func.func @test_conv3d_dilation_y(%arg0: tensor<1x1x32x32x8xf32>, %arg1: tensor<
 
 func.func @test_conv3d_dilation_x(%arg0: tensor<1x1x32x32x8xf32>, %arg1: tensor<16x2x2x2x8xf32>, %arg2: tensor<16xf32>) -> tensor<1x1x32x32x16xf32> {
   // expected-error@+1 {{'tosa.conv3d' op failed level check: dilation_x * KW <= MAX_KERNEL}}
-  %0 = "tosa.conv3d"(%arg0, %arg1, %arg2) {dilation = array<i64: 1, 1, 4097>, pad = array<i64: 0, 1, 0, 1, 0, 1>, stride = array<i64: 1, 1, 1>} :
+  %0 = "tosa.conv3d"(%arg0, %arg1, %arg2) {acc_type = f32, dilation = array<i64: 1, 1, 4097>, pad = array<i64: 0, 1, 0, 1, 0, 1>, stride = array<i64: 1, 1, 1>} :
             (tensor<1x1x32x32x8xf32>, tensor<16x2x2x2x8xf32>, tensor<16xf32>) -> tensor<1x1x32x32x16xf32>
   return %0 : tensor<1x1x32x32x16xf32>
 }
@@ -325,7 +325,7 @@ func.func @test_conv3d_dilation_x(%arg0: tensor<1x1x32x32x8xf32>, %arg1: tensor<
 
 func.func @test_conv3d_pad_d0(%arg0: tensor<1x1x32x32x8xf32>, %arg1: tensor<16x2x2x2x8xf32>, %arg2: tensor<16xf32>) -> tensor<1x1x32x32x16xf32> {
   // expected-error@+1 {{'tosa.conv3d' op failed level check: pad <= MAX_KERNEL}}
-  %0 = "tosa.conv3d"(%arg0, %arg1, %arg2) {dilation = array<i64: 1, 1, 1>, pad = array<i64: 8193, 1, 0, 1, 0, 1>, stride = array<i64: 1, 1, 1>} :
+  %0 = "tosa.conv3d"(%arg0, %arg1, %arg2) {acc_type = f32, dilation = array<i64: 1, 1, 1>, pad = array<i64: 8193, 1, 0, 1, 0, 1>, stride = array<i64: 1, 1, 1>} :
             (tensor<1x1x32x32x8xf32>, tensor<16x2x2x2x8xf32>, tensor<16xf32>) -> tensor<1x1x32x32x16xf32>
   return %0 : tensor<1x1x32x32x16xf32>
 }
@@ -334,7 +334,7 @@ func.func @test_conv3d_pad_d0(%arg0: tensor<1x1x32x32x8xf32>, %arg1: tensor<16x2
 
 func.func @test_conv3d_pad_d1(%arg0: tensor<1x1x32x32x8xf32>, %arg1: tensor<16x2x2x2x8xf32>, %arg2: tensor<16xf32>) -> tensor<1x1x32x32x16xf32> {
   // expected-error@+1 {{'tosa.conv3d' op failed level check: pad <= MAX_KERNEL}}
-  %0 = "tosa.conv3d"(%arg0, %arg1, %arg2) {dilation = array<i64: 1, 1, 1>, pad = array<i64: 1, 8193, 0, 1, 0, 1>, stride = array<i64: 1, 1, 1>} :
+  %0 = "tosa.conv3d"(%arg0, %arg1, %arg2) {acc_type = f32, dilation = array<i64: 1, 1, 1>, pad = array<i64: 1, 8193, 0, 1, 0, 1>, stride = array<i64: 1, 1, 1>} :
             (tensor<1x1x32x32x8xf32>, tensor<16x2x2x2x8xf32>, tensor<16xf32>) -> tensor<1x1x32x32x16xf32>
   return %0 : tensor<1x1x32x32x16xf32>
 }
@@ -343,7 +343,7 @@ func.func @test_conv3d_pad_d1(%arg0: tensor<1x1x32x32x8xf32>, %arg1: tensor<16x2
 
 func.func @test_conv3d_pad_top(%arg0: tensor<1x1x32x32x8xf32>, %arg1: tensor<16x2x2x2x8xf32>, %arg2: tensor<16xf32>) -> tensor<1x1x32x32x16xf32> {
   // expected-error@+1 {{'tosa.conv3d' op failed level check: pad <= MAX_KERNEL}}
-  %0 = "tosa.conv3d"(%arg0, %arg1, %arg2) {dilation = array<i64: 1, 1, 1>, pad = array<i64: 0, 1, 8193, 1, 0, 1>, stride = array<i64: 1, 1, 1>} :
+  %0 = "tosa.conv3d"(%arg0, %arg1, %arg2) {acc_type = f32, dilation = array<i64: 1, 1, 1>, pad = array<i64: 0, 1, 8193, 1, 0, 1>, stride = array<i64: 1, 1, 1>} :
             (tensor<1x1x32x32x8xf32>, tensor<16x2x2x2x8xf32>, tensor<16xf32>) -> tensor<1x1x32x32x16xf32>
   return %0 : tensor<1x1x32x32x16xf32>
 }
@@ -352,7 +352,7 @@ func.func @test_conv3d_pad_top(%arg0: tensor<1x1x32x32x8xf32>, %arg1: tensor<16x
 
 func.func @test_conv3d_pad_bottom(%arg0: tensor<1x1x32x32x8xf32>, %arg1: tensor<16x2x2x2x8xf32>, %arg2: tensor<16xf32>) -> tensor<1x1x32x32x16xf32> {
   // expected-error@+1 {{'tosa.conv3d' op failed level check: pad <= MAX_KERNEL}}
-  %0 = "tosa.conv3d"(%arg0, %arg1, %arg2) {dilation = array<i64: 1, 1, 1>, pad = array<i64: 0, 1, 0, 8193, 0, 1>, stride = array<i64: 1, 1, 1>} :
+  %0 = "tosa.conv3d"(%arg0, %arg1, %arg2) {acc_type = f32, dilation = array<i64: 1, 1, 1>, pad = array<i64: 0, 1, 0, 8193, 0, 1>, stride = array<i64: 1, 1, 1>} :
             (tensor<1x1x32x32x8xf32>, tensor<16x2x2x2x8xf32>, tensor<16xf32>) -> tensor<1x1x32x32x16xf32>
   return %0 : tensor<1x1x32x32x16xf32>
 }
@@ -361,7 +361,7 @@ func.func @test_conv3d_pad_bottom(%arg0: tensor<1x1x32x32x8xf32>, %arg1: tensor<
 
 func.func @test_conv3d_pad_left(%arg0: tensor<1x1x32x32x8xf32>, %arg1: tensor<16x2x2x2x8xf32>, %arg2: tensor<16xf32>) -> tensor<1x1x32x32x16xf32> {
   // expected-error@+1 {{'tosa.conv3d' op failed level check: pad <= MAX_KERNEL}}
-  %0 = "tosa.conv3d"(%arg0, %arg1, %arg2) {dilation = array<i64: 1, 1, 1>, pad = array<i64: 0, 1, 0, 1, 8193, 1>, stride = array<i64: 1, 1, 1>} :
+  %0 = "tosa.conv3d"(%arg0, %arg1, %arg2) {acc_type = f32, dilation = array<i64: 1, 1, 1>, pad = array<i64: 0, 1, 0, 1, 8193, 1>, stride = array<i64: 1, 1, 1>} :
             (tensor<1x1x32x32x8xf32>, tensor<16x2x2x2x8xf32>, tensor<16xf32>) -> tensor<1x1x32x32x16xf32>
   return %0 : tensor<1x1x32x32x16xf32>
 }
@@ -370,7 +370,7 @@ func.func @test_conv3d_pad_left(%arg0: tensor<1x1x32x32x8xf32>, %arg1: tensor<16
 
 func.func @test_conv3d_pad_right(%arg0: tensor<1x1x32x32x8xf32>, %arg1: tensor<16x2x2x2x8xf32>, %arg2: tensor<16xf32>) -> tensor<1x1x32x32x16xf32> {
   // expected-error@+1 {{'tosa.conv3d' op failed level check: pad <= MAX_KERNEL}}
-  %0 = "tosa.conv3d"(%arg0, %arg1, %arg2) {dilation = array<i64: 1, 1, 1>, pad = array<i64: 0, 1, 0, 1, 0, 8193>, stride = array<i64: 1, 1, 1>} :
+  %0 = "tosa.conv3d"(%arg0, %arg1, %arg2) {acc_type = f32, dilation = array<i64: 1, 1, 1>, pad = array<i64: 0, 1, 0, 1, 0, 8193>, stride = array<i64: 1, 1, 1>} :
             (tensor<1x1x32x32x8xf32>, tensor<16x2x2x2x8xf32>, tensor<16xf32>) -> tensor<1x1x32x32x16xf32>
   return %0 : tensor<1x1x32x32x16xf32>
 }
@@ -379,7 +379,7 @@ func.func @test_conv3d_pad_right(%arg0: tensor<1x1x32x32x8xf32>, %arg1: tensor<1
 
 func.func @test_conv3d_stride_d(%arg0: tensor<1x1x32x32x8xf32>, %arg1: tensor<16x2x2x2x8xf32>, %arg2: tensor<16xf32>) -> tensor<1x1x32x32x16xf32> {
   // expected-error@+1 {{'tosa.conv3d' op failed level check: stride <= MAX_STRIDE}}
-  %0 = "tosa.conv3d"(%arg0, %arg1, %arg2) {dilation = array<i64: 1, 1, 1>, pad = array<i64: 0, 1, 0, 1, 0, 1>, stride = array<i64: 8193, 1, 1>} :
+  %0 = "tosa.conv3d"(%arg0, %arg1, %arg2) {acc_type = f32, dilation = array<i64: 1, 1, 1>, pad = array<i64: 0, 1, 0, 1, 0, 1>, stride = array<i64: 8193, 1, 1>} :
             (tensor<1x1x32x32x8xf32>, tensor<16x2x2x2x8xf32>, tensor<16xf32>) -> tensor<1x1x32x32x16xf32>
   return %0 : tensor<1x1x32x32x16xf32>
 }
@@ -388,7 +388,7 @@ func.func @test_conv3d_stride_d(%arg0: tensor<1x1x32x32x8xf32>, %arg1: tensor<16
 
 func.func @test_conv3d_stride_y(%arg0: tensor<1x1x32x32x8xf32>, %arg1: tensor<16x2x2x2x8xf32>, %arg2: tensor<16xf32>) -> tensor<1x1x32x32x16xf32> {
   // expected-error@+1 {{'tosa.conv3d' op failed level check: stride <= MAX_STRIDE}}
-  %0 = "tosa.conv3d"(%arg0, %arg1, %arg2) {dilation = array<i64: 1, 1, 1>, pad = array<i64: 0, 1, 0, 1, 0, 1>, stride = array<i64: 1, 8193, 1>} :
+  %0 = "tosa.conv3d"(%arg0, %arg1, %arg2) {acc_type = f32, dilation = array<i64: 1, 1, 1>, pad = array<i64: 0, 1, 0, 1, 0, 1>, stride = array<i64: 1, 8193, 1>} :
             (tensor<1x1x32x32x8xf32>, tensor<16x2x2x2x8xf32>, tensor<16xf32>) -> tensor<1x1x32x32x16xf32>
   return %0 : tensor<1x1x32x32x16xf32>
 }
@@ -397,7 +397,7 @@ func.func @test_conv3d_stride_y(%arg0: tensor<1x1x32x32x8xf32>, %arg1: tensor<16
 
 func.func @test_conv3d_stride_x(%arg0: tensor<1x1x32x32x8xf32>, %arg1: tensor<16x2x2x2x8xf32>, %arg2: tensor<16xf32>) -> tensor<1x1x32x32x16xf32> {
   // expected-error@+1 {{'tosa.conv3d' op failed level check: stride <= MAX_STRIDE}}
-  %0 = "tosa.conv3d"(%arg0, %arg1, %arg2) {dilation = array<i64: 1, 1, 1>, pad = array<i64: 0, 1, 0, 1, 0, 1>, stride = array<i64: 1, 1, 8193>} :
+  %0 = "tosa.conv3d"(%arg0, %arg1, %arg2) {acc_type = f32, dilation = array<i64: 1, 1, 1>, pad = array<i64: 0, 1, 0, 1, 0, 1>, stride = array<i64: 1, 1, 8193>} :
             (tensor<1x1x32x32x8xf32>, tensor<16x2x2x2x8xf32>, tensor<16xf32>) -> tensor<1x1x32x32x16xf32>
   return %0 : tensor<1x1x32x32x16xf32>
 }
@@ -406,7 +406,7 @@ func.func @test_conv3d_stride_x(%arg0: tensor<1x1x32x32x8xf32>, %arg1: tensor<16
 
 func.func @test_depthwise_conv2d_dilation_y(%arg0: tensor<1x32x32x8xf32>, %arg1: tensor<2x2x8x8xf32>, %arg2: tensor<64xf32>) -> tensor<1x32x32x64xf32> {
   // expected-error@+1 {{'tosa.depthwise_conv2d' op failed level check: dilation_y * KH <= MAX_KERNEL}}
-  %0 = "tosa.depthwise_conv2d"(%arg0, %arg1, %arg2) {dilation = array<i64: 4097, 1>, pad = array<i64: 0, 1, 0, 1>, stride = array<i64: 1, 1>} :
+  %0 = "tosa.depthwise_conv2d"(%arg0, %arg1, %arg2) {acc_type = f32, dilation = array<i64: 4097, 1>, pad = array<i64: 0, 1, 0, 1>, stride = array<i64: 1, 1>} :
             (tensor<1x32x32x8xf32>, tensor<2x2x8x8xf32>, tensor<64xf32>) -> tensor<1x32x32x64xf32>
   return %0 : tensor<1x32x32x64xf32>
 }
@@ -415,7 +415,7 @@ func.func @test_depthwise_conv2d_dilation_y(%arg0: tensor<1x32x32x8xf32>, %arg1:
 
 func.func @test_depthwise_conv2d_dilation_x(%arg0: tensor<1x32x32x8xf32>, %arg1: tensor<2x2x8x8xf32>, %arg2: tensor<64xf32>) -> tensor<1x32x32x64xf32> {
   // expected-error@+1 {{'tosa.depthwise_conv2d' op failed level check: dilation_x * KW <= MAX_KERNEL}}
-  %0 = "tosa.depthwise_conv2d"(%arg0, %arg1, %arg2) {dilation = array<i64: 1, 4097>, pad = array<i64: 0, 1, 0, 1>, stride = array<i64: 1, 1>} :
+  %0 = "tosa.depthwise_conv2d"(%arg0, %arg1, %arg2) {acc_type = f32, dilation = array<i64: 1, 4097>, pad = array<i64: 0, 1, 0, 1>, stride = array<i64: 1, 1>} :
             (tensor<1x32x32x8xf32>, tensor<2x2x8x8xf32>, tensor<64xf32>) -> tensor<1x32x32x64xf32>
   return %0 : tensor<1x32x32x64xf32>
 }
@@ -424,7 +424,7 @@ func.func @test_depthwise_conv2d_dilation_x(%arg0: tensor<1x32x32x8xf32>, %arg1:
 
 func.func @test_depthwise_conv2d_pad_top(%arg0: tensor<1x32x32x8xf32>, %arg1: tensor<2x2x8x8xf32>, %arg2: tensor<64xf32>) -> tensor<1x32x32x64xf32> {
   // expected-error@+1 {{'tosa.depthwise_conv2d' op failed level check: pad <= MAX_KERNEL}}
-  %0 = "tosa.depthwise_conv2d"(%arg0, %arg1, %arg2) {dilation = array<i64: 1, 1>, pad = array<i64: 8193, 1, 0, 1>, stride = array<i64: 1, 1>} :
+  %0 = "tosa.depthwise_conv2d"(%arg0, %arg1, %arg2) {acc_type = f32, dilation = array<i64: 1, 1>, pad = array<i64: 8193, 1, 0, 1>, stride = array<i64: 1, 1>} :
             (tensor<1x32x32x8xf32>, tensor<2x2x8x8xf32>, tensor<64xf32>) -> tensor<1x32x32x64xf32>
   return %0 : tensor<1x32x32x64xf32>
 }
@@ -433,7 +433,7 @@ func.func @test_depthwise_conv2d_pad_top(%arg0: tensor<1x32x32x8xf32>, %arg1: te
 
 func.func @test_depthwise_conv2d_pad_bottom(%arg0: tensor<1x32x32x8xf32>, %arg1: tensor<2x2x8x8xf32>, %arg2: tensor<64xf32>) -> tensor<1x32x32x64xf32> {
   // expected-error@+1 {{'tosa.depthwise_conv2d' op failed level check: pad <= MAX_KERNEL}}
-  %0 = "tosa.depthwise_conv2d"(%arg0, %arg1, %arg2) {dilation = array<i64: 1, 1>, pad = array<i64: 0, 8193, 0, 1>, stride = array<i64: 1, 1>} :
+  %0 = "tosa.depthwise_conv2d"(%arg0, %arg1, %arg2) {acc_type = f32, dilation = array<i64: 1, 1>, pad = array<i64: 0, 8193, 0, 1>, stride = array<i64: 1, 1>} :
             (tensor<1x32x32x8xf32>, tensor<2x2x8x8xf32>, tensor<64xf32>) -> tensor<1x32x32x64xf32>
   return %0 : tensor<1x32x32x64xf32>
 }
@@ -442,7 +442,7 @@ func.func @test_depthwise_conv2d_pad_bottom(%arg0: tensor<1x32x32x8xf32>, %arg1:
 
 func.func @test_depthwise_conv2d_pad_left(%arg0: tensor<1x32x32x8xf32>, %arg1: tensor<2x2x8x8xf32>, %arg2: tensor<64xf32>) -> tensor<1x32x32x64xf32> {
   // expected-error@+1 {{'tosa.depthwise_conv2d' op failed level check: pad <= MAX_KERNEL}}
-  %0 = "tosa.depthwise_conv2d"(%arg0, %arg1, %arg2) {dilation = array<i64: 1, 1>, pad = array<i64: 0, 1, 8193, 1>, stride = array<i64: 1, 1>} :
+  %0 = "tosa.depthwise_conv2d"(%arg0, %arg1, %arg2) {acc_type = f32, dilation = array<i64: 1, 1>, pad = array<i64: 0, 1, 8193, 1>, stride = array<i64: 1, 1>} :
             (tensor<1x32x32x8xf32>, tensor<2x2x8x8xf32>, tensor<64xf32>) -> tensor<1x32x32x64xf32>
   return %0 : tensor<1x32x32x64xf32>
 }
@@ -451,7 +451,7 @@ func.func @test_depthwise_conv2d_pad_left(%arg0: tensor<1x32x32x8xf32>, %arg1: t
 
 func.func @test_depthwise_conv2d_pad_right(%arg0: tensor<1x32x32x8xf32>, %arg1: tensor<2x2x8x8xf32>, %arg2: tensor<64xf32>) -> tensor<1x32x32x64xf32> {
   // expected-error@+1 {{'tosa.depthwise_conv2d' op failed level check: pad <= MAX_KERNEL}}
-  %0 = "tosa.depthwise_conv2d"(%arg0, %arg1, %arg2) {dilation = array<i64: 1, 1>, pad = array<i64: 0, 1, 0, 8193>, stride = array<i64: 1, 1>} :
+  %0 = "tosa.depthwise_conv2d"(%arg0, %arg1, %arg2) {acc_type = f32, dilation = array<i64: 1, 1>, pad = array<i64: 0, 1, 0, 8193>, stride = array<i64: 1, 1>} :
             (tensor<1x32x32x8xf32>, tensor<2x2x8x8xf32>, tensor<64xf32>) -> tensor<1x32x32x64xf32>
   return %0 : tensor<1x32x32x64xf32>
 }
@@ -460,7 +460,7 @@ func.func @test_depthwise_conv2d_pad_right(%arg0: tensor<1x32x32x8xf32>, %arg1:
 
 func.func @test_depthwise_conv2d_stride_y(%arg0: tensor<1x32x32x8xf32>, %arg1: tensor<2x2x8x8xf32>, %arg2: tensor<64xf32>) -> tensor<1x32x32x64xf32> {
   // expected-error@+1 {{'tosa.depthwise_conv2d' op failed level check: stride <= MAX_STRIDE}}
-  %0 = "tosa.depthwise_conv2d"(%arg0, %arg1, %arg2) {dilation = array<i64: 1, 1>, pad = array<i64: 0, 1, 0, 1>, stride = array<i64: 8193, 1>} :
+  %0 = "tosa.depthwise_conv2d"(%arg0, %arg1, %arg2) {acc_type = f32, dilation = array<i64: 1, 1>, pad = array<i64: 0, 1, 0, 1>, stride = array<i64: 8193, 1>} :
             (tensor<1x32x32x8xf32>, tensor<2x2x8x8xf32>, tensor<64xf32>) -> tensor<1x32x32x64xf32>
   return %0 : tensor<1x32x32x64xf32>
 }
@@ -469,7 +469,7 @@ func.func @test_depthwise_conv2d_stride_y(%arg0: tensor<1x32x32x8xf32>, %arg1: t
 
 func.func @test_depthwise_conv2d_stride_x(%arg0: tensor<1x32x32x8xf32>, %arg1: tensor<2x2x8x8xf32>, %arg2: tensor<64xf32>) -> tensor<1x32x32x64xf32> {
   // expected-error@+1 {{'tosa.depthwise_conv2d' op failed level check: stride <= MAX_STRIDE}}
-  %0 = "tosa.depthwise_conv2d"(%arg0, %arg1, %arg2) {dilation = array<i64: 1, 1>, pad = array<i64: 0, 1, 0, 1>, stride = array<i64: 1, 8193>} :
+  %0 = "tosa.depthwise_conv2d"(%arg0, %arg1, %arg2) {acc_type = f32, dilation = array<i64: 1, 1>, pad = array<i64: 0, 1, 0, 1>, stride = array<i64: 1, 8193>} :
             (tensor<1x32x32x8xf32>, tensor<2x2x8x8xf32>, tensor<64xf32>) -> tensor<1x32x32x64xf32>
   return %0 : tensor<1x32x32x64xf32>
 }
@@ -603,7 +603,7 @@ func.func @test_rfft2d_input_w(%arg0: tensor<13x8x8193xf32>) -> (tensor<13x8x9xf
 
 func.func @test_transpose_conv2d_weight_h(%arg0: tensor<1x32x32x8xf32>, %arg1: tensor<16x8193x1x8xf32>, %arg2: tensor<16xf32>) -> tensor<1x32x32x16xf32> {
   // expected-error@+1 {{'tosa.transpose_conv2d' op failed level check: KH <= MAX_KERNEL}}
-  %0 = "tosa.transpose_conv2d"(%arg0, %arg1, %arg2) {out_pad = array<i64: 0, 0, 0, 0>, out_shape = array<i64: 1, 32, 32, 16>, stride = array<i64: 1, 1>} :
+  %0 = "tosa.transpose_conv2d"(%arg0, %arg1, %arg2) {acc_type = f32, out_pad = array<i64: 0, 0, 0, 0>, out_shape = array<i64: 1, 32, 32, 16>, stride = array<i64: 1, 1>} :
               (tensor<1x32x32x8xf32>, tensor<16x8193x1x8xf32>, tensor<16xf32>) -> tensor<1x32x32x16xf32>
   return %0 : tensor<1x32x32x16xf32>
 }
@@ -612,7 +612,7 @@ func.func @test_transpose_conv2d_weight_h(%arg0: tensor<1x32x32x8xf32>, %arg1: t
 
 func.func @test_transpose_conv2d_weight_w(%arg0: tensor<1x32x32x8xf32>, %arg1: tensor<16x1x8193x8xf32>, %arg2: tensor<16xf32>) -> tensor<1x32x32x16xf32> {
   // expected-error@+1 {{'tosa.transpose_conv2d' op failed level check: KW <= MAX_KERNEL}}
-  %0 = "tosa.transpose_conv2d"(%arg0, %arg1, %arg2) {out_pad = array<i64: 0, 0, 0, 0>, out_shape = array<i64: 1, 32, 32, 16>, stride = array<i64: 1, 1>} :
+  %0 = "tosa.transpose_conv2d"(%arg0, %arg1, %arg2) {acc_type = f32, out_pad = array<i64: 0, 0, 0, 0>, out_shape = array<i64: 1, 32, 32, 16>, stride = array<i64: 1, 1>} :
               (tensor<1x32x32x8xf32>, tensor<16x1x8193x8xf32>, tensor<16xf32>) -> tensor<1x32x32x16xf32>
   return %0 : tensor<1x32x32x16xf32>
 }
@@ -621,7 +621,7 @@ func.func @test_transpose_conv2d_weight_w(%arg0: tensor<1x32x32x8xf32>, %arg1: t
 
 func.func @test_transpose_conv2d_pad_top(%arg0: tensor<1x32x32x8xf32>, %arg1: tensor<16x1x1x8xf32>, %arg2: tensor<16xf32>) -> tensor<1x32x32x16xf32> {
   // expected-error@+1 {{'tosa.transpose_conv2d' op failed level check: pad <= MAX_KERNEL}}
-  %0 = "tosa.transpose_conv2d"(%arg0, %arg1, %arg2) {out_pad = array<i64: 8193, 0, 0, 0>, out_shape = array<i64: 1, 32, 32, 16>, stride = array<i64: 1, 1>} :
+  %0 = "tosa.transpose_conv2d"(%arg0, %arg1, %arg2) {acc_type = f32, out_pad = array<i64: 8193, 0, 0, 0>, out_shape = array<i64: 1, 32, 32, 16>, stride = array<i64: 1, 1>} :
               (tensor<1x32x32x8xf32>, tensor<16x1x1x8xf32>, tensor<16xf32>) -> tensor<1x32x32x16xf32>
   return %0 : tensor<1x32x32x16xf32>
 }
@@ -630,7 +630,7 @@ func.func @test_transpose_conv2d_pad_top(%arg0: tensor<1x32x32x8xf32>, %arg1: te
 
 func.func @test_transpose_conv2d_pad_bottom(%arg0: tensor<1x32x32x8xf32>, %arg1: tensor<16x1x1x8xf32>, %arg2: tensor<16xf32>) -> tensor<1x32x32x16xf32> {
   // expected-error@+1 {{'tosa.transpose_conv2d' op failed level check: pad <= MAX_KERNEL}}
-  %0 = "tosa.transpose_conv2d"(%arg0, %arg1, %arg2) {out_pad = array<i64: 0, 8193, 0, 0>, out_shape = array<i64: 1, 32, 32, 16>, stride = array<i64: 1, 1>} :
+  %0 = "tosa.transpose_conv2d"(%arg0, %arg1, %arg2) {acc_type = f32, out_pad = array<i64: 0, 8193, 0, 0>, out_shape = array<i64: 1, 32, 32, 16>, stride = array<i64: 1, 1>} :
               (tensor<1x32x32x8xf32>, tensor<16x1x1x8xf32>, tensor<16xf32>) -> tensor<1x32x32x16xf32>
   return %0 : tensor<1x32x32x16xf32>
 }
@@ -639,7 +639,7 @@ func.func @test_transpose_conv2d_pad_bottom(%arg0: tensor<1x32x32x8xf32>, %arg1:
 
 func.func @test_transpose_conv2d_pad_left(%arg0: tensor<1x32x32x8xf32>, %arg1: tensor<16x1x1x8xf32>, %arg2: tensor<16xf32>) -> tensor<1x32x32x16xf32> {
   // expected-error@+1 {{'tosa.transpose_conv2d' op failed level check: pad <= MAX_KERNEL}}
-  %0 = "tosa.transpose_conv2d"(%arg0, %arg1, %arg2) {out_pad = array<i64: 0, 0, 8193, 0>, out_shape = array<i64: 1, 32, 32, 16>, stride = array<i64: 1, 1>} :
+  %0 = "tosa.transpose_conv2d"(%arg0, %arg1, %arg2) {acc_type = f32, out_pad = array<i64: 0, 0, 8193, 0>, out_shape = array<i64: 1, 32, 32, 16>, stride = array<i64: 1, 1>} :
               (tensor<1x32x32x8xf32>, tensor<16x1x1x8xf32>, tensor<16xf32>) -> tensor<1x32x32x16xf32>
   return %0 : tensor<1x32x32x16xf32>
 }
@@ -648,7 +648,7 @@ func.func @test_transpose_conv2d_pad_left(%arg0: tensor<1x32x32x8xf32>, %arg1: t
 
 func.func @test_transpose_conv2d_pad_right(%arg0: tensor<1x32x32x8xf32>, %arg1: tensor<16x1x1x8xf32>, %arg2: tensor<16xf32>) -> tensor<1x32x32x16xf32> {
   // expected-error@+1 {{'tosa.transpose_conv2d' op failed level check: pad <= MAX_KERNEL}}
-  %0 = "tosa.transpose_conv2d"(%arg0, %arg1, %arg2) {out_pad = array<i64: 0, 0, 0, 8193>, out_shape = array<i64: 1, 32, 32, 16>, stride = array<i64: 1, 1>} :
+  %0 = "tosa.transpose_conv2d"(%arg0, %arg1, %arg2) {acc_type = f32, out_pad = array<i64: 0, 0, 0, 8193>, out_shape = array<i64: 1, 32, 32, 16>, stride = array<i64: 1, 1>} :
               (tensor<1x32x32x8xf32>, tensor<16x1x1x8xf32>, tensor<16xf32>) -> tensor<1x32x32x16xf32>
   return %0 : tensor<1x32x32x16xf32>
 }
@@ -657,7 +657,7 @@ func.func @test_transpose_conv2d_pad_right(%arg0: tensor<1x32x32x8xf32>, %arg1:
 
 func.func @test_transpose_conv2d_stride_y(%arg0: tensor<1x32x32x8xf32>, %arg1: tensor<16x1x1x8xf32>, %arg2: tensor<16xf32>) -> tensor<1x32x32x16xf32> {
   // expected-error@+1 {{'tosa.transpose_conv2d' op failed level check: stride <= MAX_STRIDE}}
-  %0 = "tosa.transpose_conv2d"(%arg0, %arg1, %arg2) {out_pad = array<i64: 0, 0, 0, 0>, out_shape = array<i64: 1, 32, 32, 16>, stride = array<i64: 8193, 1>} :
+  %0 = "tosa.transpose_conv2d"(%arg0, %arg1, %arg2) {acc_type = f32, out_pad = array<i64: 0, 0, 0, 0>, out_shape = array<i64: 1, 32, 32, 16>, stride = array<i64: 8193, 1>} :
               (tensor<1x32x32x8xf32>, tensor<16x1x1x8xf32>, tensor<16xf32>) -> tensor<1x32x32x16xf32>
   return %0 : tensor<1x32x32x16xf32>
 }
@@ -666,7 +666,7 @@ func.func @test_transpose_conv2d_stride_y(%arg0: tensor<1x32x32x8xf32>, %arg1: t
 
 func.func @test_transpose_conv2d_stride_x(%arg0: tensor<1x32x32x8xf32>, %arg1: tensor<16x1x1x8xf32>, %arg2: tensor<16xf32>) -> tensor<1x32x32x16xf32> {
   // expected-error@+1 {{'tosa.transpose_conv2d' op failed level check: stride <= MAX_STRIDE}}
-  %0 = "tosa.transpose_conv2d"(%arg0, %arg1, %arg2) {out_pad = array<i64: 0, 0, 0, 0>, out_shape = array<i64: 1, 32, 32, 16>, stride = array<i64: 1, 8193>} :
+  %0 = "tosa.transpose_conv2d"(%arg0, %arg1, %arg2) {acc_type = f32, out_pad = array<i64: 0, 0, 0, 0>, out_shape = array<i64: 1, 32, 32, 16>, stride = array<i64: 1, 8193>} :
               (tensor<1x32x32x8xf32>, tensor<16x1x1x8xf32>, tensor<16xf32>) -> tensor<1x32x32x16xf32>
   return %0 : tensor<1x32x32x16xf32>
 }
diff --git a/mlir/test/Dialect/Tosa/ops.mlir b/mlir/test/Dialect/Tosa/ops.mlir
index 88fa94a..f2e1cff 100644
--- a/mlir/test/Dialect/Tosa/ops.mlir
+++ b/mlir/test/Dialect/Tosa/ops.mlir
@@ -54,7 +54,7 @@ func.func @test_avg_pool2d_q8(%arg0: tensor<1x7x7x9x!quant.uniform<i8:f32, 0.01>
 // -----
 // CHECK-LABEL: conv2d
 func.func @test_conv2d(%arg0: tensor<1x4x4x4xf32>, %arg1: tensor<8x1x1x4xf32>, %arg2: tensor<8xf32>) -> tensor<1x4x4x8xf32> {
-  %0 = tosa.conv2d %arg0, %arg1, %arg2 {dilation = array<i64: 1, 1>, pad = array<i64: 0, 0, 0, 0>, stride = array<i64: 1, 1>, local_bound = true} : (tensor<1x4x4x4xf32>, tensor<8x1x1x4xf32>, tensor<8xf32>) -> tensor<1x4x4x8xf32>
+  %0 = tosa.conv2d %arg0, %arg1, %arg2 {acc_type = f32, dilation = array<i64: 1, 1>, pad = array<i64: 0, 0, 0, 0>, stride = array<i64: 1, 1>, local_bound = true} : (tensor<1x4x4x4xf32>, tensor<8x1x1x4xf32>, tensor<8xf32>) -> tensor<1x4x4x8xf32>
   return %0 : tensor<1x4x4x8xf32>
 }
 
@@ -63,7 +63,7 @@ func.func @test_conv2d(%arg0: tensor<1x4x4x4xf32>, %arg1: tensor<8x1x1x4xf32>, %
 func.func @test_conv2d_q8xi4(%arg0: tensor<1x11x11x3xi8>) -> tensor<1x1x1x3xi8> {
   %0 = "tosa.const"() {value = dense<0> : tensor<3x11x11x3xi4>} : () -> tensor<3x11x11x3xi4>
   %1 = "tosa.const"() {value = dense<[12, 23, 55]> : tensor<3xi32>} : () -> tensor<3xi32>
-  %2 = "tosa.conv2d"(%arg0, %0, %1) {dilation = array<i64: 1, 1>, pad = array<i64: 0, 0, 0, 0>, quantization_info = #tosa.conv_quant<input_zp = 0, weight_zp = 0>, stride = array<i64: 1, 1>} : (tensor<1x11x11x3xi8>, tensor<3x11x11x3xi4>, tensor<3xi32>) -> tensor<1x1x1x3xi32>
+  %2 = "tosa.conv2d"(%arg0, %0, %1) {acc_type = i32, dilation = array<i64: 1, 1>, pad = array<i64: 0, 0, 0, 0>, quantization_info = #tosa.conv_quant<input_zp = 0, weight_zp = 0>, stride = array<i64: 1, 1>} : (tensor<1x11x11x3xi8>, tensor<3x11x11x3xi4>, tensor<3xi32>) -> tensor<1x1x1x3xi32>
   %3 = "tosa.rescale"(%2) {double_round = true, input_zp = 0 : i32, multiplier = array<i32: 2026291432, 1079222024, 1693132724>, output_zp = 27 : i32, per_channel = true, scale32 = true, shift = array<i8: 37, 36, 37>} : (tensor<1x1x1x3xi32>) -> tensor<1x1x1x3xi8>
   return %3 : tensor<1x1x1x3xi8>
 }
@@ -71,28 +71,28 @@ func.func @test_conv2d_q8xi4(%arg0: tensor<1x11x11x3xi8>) -> tensor<1x1x1x3xi8>
 // -----
 // CHECK-LABEL: conv3d
 func.func @test_conv3d(%arg0: tensor<1x4x8x21x17xf32>, %arg1: tensor<34x1x1x1x17xf32>, %arg2: tensor<34xf32>) -> tensor<1x4x8x21x34xf32> {
-  %0 = tosa.conv3d %arg0, %arg1, %arg2 {dilation = array<i64: 1, 1, 1>, pad = array<i64: 0, 0, 0, 0, 0, 0>, stride = array<i64: 1, 1, 1>} : (tensor<1x4x8x21x17xf32>, tensor<34x1x1x1x17xf32>, tensor<34xf32>) -> tensor<1x4x8x21x34xf32>
+  %0 = tosa.conv3d %arg0, %arg1, %arg2 {acc_type = f32, dilation = array<i64: 1, 1, 1>, pad = array<i64: 0, 0, 0, 0, 0, 0>, stride = array<i64: 1, 1, 1>} : (tensor<1x4x8x21x17xf32>, tensor<34x1x1x1x17xf32>, tensor<34xf32>) -> tensor<1x4x8x21x34xf32>
   return %0 : tensor<1x4x8x21x34xf32>
 }
 
 // -----
 // CHECK-LABEL: conv3d_with_local_bound
 func.func @test_conv3d_with_local_bound(%arg0: tensor<1x4x8x21x17xf32>, %arg1: tensor<34x1x1x1x17xf32>, %arg2: tensor<34xf32>) -> tensor<1x4x8x21x34xf32> {
-  %0 = tosa.conv3d %arg0, %arg1, %arg2 {dilation = array<i64: 1, 1, 1>, pad = array<i64: 0, 0, 0, 0, 0, 0>, stride = array<i64: 1, 1, 1>, local_bound = true} : (tensor<1x4x8x21x17xf32>, tensor<34x1x1x1x17xf32>, tensor<34xf32>) -> tensor<1x4x8x21x34xf32>
+  %0 = tosa.conv3d %arg0, %arg1, %arg2 {acc_type = f32, dilation = array<i64: 1, 1, 1>, pad = array<i64: 0, 0, 0, 0, 0, 0>, stride = array<i64: 1, 1, 1>, local_bound = true} : (tensor<1x4x8x21x17xf32>, tensor<34x1x1x1x17xf32>, tensor<34xf32>) -> tensor<1x4x8x21x34xf32>
   return %0 : tensor<1x4x8x21x34xf32>
 }
 
 // -----
 // CHECK-LABEL: depthwise_conv2d
 func.func @test_depthwise_conv2d(%arg0: tensor<1x4x4x4xf32>, %arg1: tensor<1x1x4x2xf32>, %arg2: tensor<8xf32>) -> tensor<1x4x4x8xf32> {
-  %0 = tosa.depthwise_conv2d %arg0, %arg1, %arg2 {dilation = array<i64: 1, 1>, pad = array<i64: 0, 0, 0, 0>, stride = array<i64: 1, 1>} : (tensor<1x4x4x4xf32>, tensor<1x1x4x2xf32>, tensor<8xf32>) -> tensor<1x4x4x8xf32>
+  %0 = tosa.depthwise_conv2d %arg0, %arg1, %arg2 {acc_type = f32, dilation = array<i64: 1, 1>, pad = array<i64: 0, 0, 0, 0>, stride = array<i64: 1, 1>} : (tensor<1x4x4x4xf32>, tensor<1x1x4x2xf32>, tensor<8xf32>) -> tensor<1x4x4x8xf32>
   return %0 : tensor<1x4x4x8xf32>
 }
 
 // -----
 // CHECK-LABEL: depthwise_conv2d_with_local_bound
 func.func @test_depthwise_conv2d_with_local_bound(%arg0: tensor<1x4x4x4xf32>, %arg1: tensor<1x1x4x2xf32>, %arg2: tensor<8xf32>) -> tensor<1x4x4x8xf32> {
-  %0 = tosa.depthwise_conv2d %arg0, %arg1, %arg2 {dilation = array<i64: 1, 1>, pad = array<i64: 0, 0, 0, 0>, stride = array<i64: 1, 1>, local_bound = true} : (tensor<1x4x4x4xf32>, tensor<1x1x4x2xf32>, tensor<8xf32>) -> tensor<1x4x4x8xf32>
+  %0 = tosa.depthwise_conv2d %arg0, %arg1, %arg2 {acc_type = f32, dilation = array<i64: 1, 1>, pad = array<i64: 0, 0, 0, 0>, stride = array<i64: 1, 1>, local_bound = true} : (tensor<1x4x4x4xf32>, tensor<1x1x4x2xf32>, tensor<8xf32>) -> tensor<1x4x4x8xf32>
   return %0 : tensor<1x4x4x8xf32>
 }
 
@@ -162,14 +162,14 @@ func.func @test_rfft2d_with_local_bound(%arg0: tensor<13x8x16xf32>) -> (tensor<1
 // -----
 // CHECK-LABEL: transpose_conv2d
 func.func @test_transpose_conv2d(%arg0: tensor<1x32x32x8xf32>, %arg1: tensor<16x1x1x8xf32>, %arg2: tensor<16xf32>) -> tensor<1x32x32x16xf32> {
-  %0 = tosa.transpose_conv2d %arg0, %arg1, %arg2 {out_pad = array<i64: 0, 0, 0, 0>, out_shape = array<i64: 1, 32, 32, 16>, stride = array<i64: 1, 1>} : (tensor<1x32x32x8xf32>, tensor<16x1x1x8xf32>, tensor<16xf32>) -> tensor<1x32x32x16xf32>
+  %0 = tosa.transpose_conv2d %arg0, %arg1, %arg2 {acc_type = f32, out_pad = array<i64: 0, 0, 0, 0>, out_shape = array<i64: 1, 32, 32, 16>, stride = array<i64: 1, 1>} : (tensor<1x32x32x8xf32>, tensor<16x1x1x8xf32>, tensor<16xf32>) -> tensor<1x32x32x16xf32>
   return %0 : tensor<1x32x32x16xf32>
 }
 
 // -----
 // CHECK-LABEL: transpose_conv2d_with_local_bound
 func.func @test_transpose_conv2d_with_local_bound(%arg0: tensor<1x32x32x8xf32>, %arg1: tensor<16x1x1x8xf32>, %arg2: tensor<16xf32>) -> tensor<1x32x32x16xf32> {
-  %0 = tosa.transpose_conv2d %arg0, %arg1, %arg2 {out_pad = array<i64: 0, 0, 0, 0>, out_shape = array<i64: 1, 32, 32, 16>, stride = array<i64: 1, 1>, local_bound = false} : (tensor<1x32x32x8xf32>, tensor<16x1x1x8xf32>, tensor<16xf32>) -> tensor<1x32x32x16xf32>
+  %0 = tosa.transpose_conv2d %arg0, %arg1, %arg2 {acc_type = f32, out_pad = array<i64: 0, 0, 0, 0>, out_shape = array<i64: 1, 32, 32, 16>, stride = array<i64: 1, 1>, local_bound = false} : (tensor<1x32x32x8xf32>, tensor<16x1x1x8xf32>, tensor<16xf32>) -> tensor<1x32x32x16xf32>
   return %0 : tensor<1x32x32x16xf32>
 }
 
@@ -525,16 +525,16 @@ func.func @test_concat(%arg0: tensor<13x21x3xf32>, %arg1: tensor<13x21x3xf32>) -
 
 // -----
 // CHECK-LABEL: pad
-func.func @test_pad(%arg0: tensor<13x21x3xf32>, %arg1: tensor<3x2xi32>) -> tensor<13x21x3xf32> {
-  %0 = tosa.pad %arg0, %arg1 : (tensor<13x21x3xf32>, tensor<3x2xi32>) -> tensor<13x21x3xf32>
+func.func @test_pad(%arg0: tensor<13x21x3xf32>, %arg1: tensor<6xi32>) -> tensor<13x21x3xf32> {
+  %0 = tosa.pad %arg0, %arg1 : (tensor<13x21x3xf32>, tensor<6xi32>) -> tensor<13x21x3xf32>
   return %0 : tensor<13x21x3xf32>
 }
 
 // -----
 // CHECK-LABEL: pad_explicit_value
-func.func @test_pad_explicit_value(%arg0: tensor<13x21x3xf32>, %arg1: tensor<3x2xi32>) -> tensor<13x21x3xf32> {
+func.func @test_pad_explicit_value(%arg0: tensor<13x21x3xf32>, %arg1: tensor<6xi32>) -> tensor<13x21x3xf32> {
   %0 = "tosa.const"() {value = dense<3.14> : tensor<f32>} : () -> tensor<f32>
-  %1 = tosa.pad %arg0, %arg1, %0 : (tensor<13x21x3xf32>, tensor<3x2xi32>, tensor<f32>) -> tensor<13x21x3xf32>
+  %1 = tosa.pad %arg0, %arg1, %0 : (tensor<13x21x3xf32>, tensor<6xi32>, tensor<f32>) -> tensor<13x21x3xf32>
   return %1 : tensor<13x21x3xf32>
 }
 
diff --git a/mlir/test/Dialect/Tosa/quant-test.mlir b/mlir/test/Dialect/Tosa/quant-test.mlir
index 82a87db..6437f12 100644
--- a/mlir/test/Dialect/Tosa/quant-test.mlir
+++ b/mlir/test/Dialect/Tosa/quant-test.mlir
@@ -10,9 +10,9 @@ func.func @test_build_qtype(%arg0 : tensor<16x1x1x8x!quant.uniform<u8<1:255>:f32
 
 // -----
 // CHECK-LABEL: test_build_mult_and_shift
-func.func @test_build_mult_and_shift(%arg0: tensor<1x32x32x8x!quant.uniform<i8:f32, 0.015684768557548523>>, %arg1 : tensor<16x1x1x8x!quant.uniform<i8<-127:127>:f32, 0.015680249780416489>>, %arg2 : tensor<16xi32>) -> tensor<1x32x32x16x!quant.uniform<i8:f32, 0.078431375324726104>> {
+func.func @test_build_mult_and_shift(%arg0: tensor<1x32x32x8x!quant.uniform<i8:f32, 0.015684768557548523>>, %arg1 : tensor<16x1x1x8x!quant.uniform<i8<-127:127>:f32, 0.015680249780416489>>, %arg2 : tensor<16xi32>) -> tensor<1x32x32x16x!quant.uniform<i32:f32, 0.078431375324726104>> {
   // CHECK: tosa.conv2d
-  %0 = "tosa.conv2d"(%arg0, %arg1, %arg2) {pad = array<i64: 1, 1, 2, 2>, dilation = array<i64: 2, 1>, stride = array<i64: 1, 1>, quantization_info = #tosa.conv_quant<input_zp = -1, weight_zp = 0>} : (tensor<1x32x32x8x!quant.uniform<i8:f32, 0.015684768557548523>>, tensor<16x1x1x8x!quant.uniform<i8<-127:127>:f32, 0.015680249780416489>>, tensor<16xi32>) -> tensor<1x32x32x16x!quant.uniform<i8:f32, 0.078431375324726104>>
-  return %0 : tensor<1x32x32x16x!quant.uniform<i8:f32, 0.078431375324726104>>
+  %0 = "tosa.conv2d"(%arg0, %arg1, %arg2) {acc_type = i32, pad = array<i64: 1, 1, 2, 2>, dilation = array<i64: 2, 1>, stride = array<i64: 1, 1>, quantization_info = #tosa.conv_quant<input_zp = -1, weight_zp = 0>} : (tensor<1x32x32x8x!quant.uniform<i8:f32, 0.015684768557548523>>, tensor<16x1x1x8x!quant.uniform<i8<-127:127>:f32, 0.015680249780416489>>, tensor<16xi32>) -> tensor<1x32x32x16x!quant.uniform<i32:f32, 0.078431375324726104>>
+  return %0 : tensor<1x32x32x16x!quant.uniform<i32:f32, 0.078431375324726104>>
 
 }
diff --git a/mlir/test/Dialect/Tosa/tosa-decompose-conv2d.mlir b/mlir/test/Dialect/Tosa/tosa-decompose-conv2d.mlir
index d876ccfb..8df4630 100644
--- a/mlir/test/Dialect/Tosa/tosa-decompose-conv2d.mlir
+++ b/mlir/test/Dialect/Tosa/tosa-decompose-conv2d.mlir
@@ -14,7 +14,7 @@ func.func @conv2d_as_fully_connected(%arg0: tensor<4x10x10x2xf32>, %arg1: tensor
   // CHECK: %[[VAR3:.*]] = tosa.reshape %[[VAR2]] {new_shape = array<i64: 4, 10, 10, 3>}
   // CHECK-SAME: -> tensor<4x10x10x3xf32>
   // CHECK: return %[[VAR3]]
-  %0 = tosa.conv2d %arg0, %arg1, %arg2 {pad = array<i64: 0, 0, 0, 0>, stride = array<i64: 1, 1>, dilation = array<i64: 1, 1>} : (tensor<4x10x10x2xf32>, tensor<3x1x1x2xf32>, tensor<3xf32>) -> tensor<4x10x10x3xf32>
+  %0 = tosa.conv2d %arg0, %arg1, %arg2 {acc_type = f32, pad = array<i64: 0, 0, 0, 0>, stride = array<i64: 1, 1>, dilation = array<i64: 1, 1>} : (tensor<4x10x10x2xf32>, tensor<3x1x1x2xf32>, tensor<3xf32>) -> tensor<4x10x10x3xf32>
   return %0 : tensor<4x10x10x3xf32>
 }
 
@@ -33,7 +33,7 @@ func.func @conv2d_as_fully_connected_quant(%arg0: tensor<4x10x10x2xi8>, %arg1: t
   // CHECK: %[[VAR3:.*]] = tosa.reshape %[[VAR2]] {new_shape = array<i64: 4, 10, 10, 3>}
   // CHECK-SAME: -> tensor<4x10x10x3xi32>
   // CHECK: return %[[VAR3]]
-  %0 = tosa.conv2d %arg0, %arg1, %arg2 {pad = array<i64: 0, 0, 0, 0>, stride = array<i64: 1, 1>, dilation = array<i64: 1, 1>, quantization_info = #tosa.conv_quant<input_zp = 42, weight_zp = 24>} : (tensor<4x10x10x2xi8>, tensor<3x1x1x2xi8>, tensor<3xi32>) -> tensor<4x10x10x3xi32>
+  %0 = tosa.conv2d %arg0, %arg1, %arg2 {acc_type = i32, pad = array<i64: 0, 0, 0, 0>, stride = array<i64: 1, 1>, dilation = array<i64: 1, 1>, quantization_info = #tosa.conv_quant<input_zp = 42, weight_zp = 24>} : (tensor<4x10x10x2xi8>, tensor<3x1x1x2xi8>, tensor<3xi32>) -> tensor<4x10x10x3xi32>
   return %0 : tensor<4x10x10x3xi32>
 }
 
@@ -50,7 +50,7 @@ func.func @conv_with_dynamic_dim(%arg0: tensor<?x14x14x64xi8>, %arg1: tensor<384
 // CHECK:           %[[VAL_6:.*]] = tosa.reshape %[[VAL_5]] {new_shape = array<i64: -1, 14, 14, 384>} : (tensor<?x384xi32>) -> tensor<?x14x14x384xi32>
 // CHECK:           return %[[VAL_6]] : tensor<?x14x14x384xi32>
 // CHECK:         }
-  %0 = tosa.conv2d %arg0, %arg1, %arg2 {dilation = array<i64: 1, 1>, pad = array<i64: 0, 0, 0, 0>, quantization_info = #tosa.conv_quant<input_zp = -6, weight_zp = 11>, stride = array<i64: 1, 1>} : (tensor<?x14x14x64xi8>, tensor<384x1x1x64xi8>, tensor<384xi32>) -> tensor<?x14x14x384xi32>
+  %0 = tosa.conv2d %arg0, %arg1, %arg2 {acc_type = i32, dilation = array<i64: 1, 1>, pad = array<i64: 0, 0, 0, 0>, quantization_info = #tosa.conv_quant<input_zp = -6, weight_zp = 11>, stride = array<i64: 1, 1>} : (tensor<?x14x14x64xi8>, tensor<384x1x1x64xi8>, tensor<384xi32>) -> tensor<?x14x14x384xi32>
   return %0 : tensor<?x14x14x384xi32>
 }
 
@@ -58,13 +58,13 @@ func.func @conv_with_dynamic_dim(%arg0: tensor<?x14x14x64xi8>, %arg1: tensor<384
 
 // CHECK-LABEL: @conv2d_as_fully_connected_padded
 func.func @conv2d_as_fully_connected_padded(%arg0: tensor<4x10x10x2xi8>, %arg1: tensor<3x1x1x2xi8>, %arg2: tensor<3xi32>) -> tensor<4x12x12x3xi32> {
-  // CHECK-DAG: %[[PAD_SHAPE:.+]] = "tosa.const"() <{value = dense<{{\[\[}}0, 0], [1, 1], [1, 1], [0, 0]]> : tensor<4x2xi64>}
+  // CHECK-DAG: %[[PAD_SHAPE:.+]] = "tosa.const"() <{value = dense<{{\[}}0, 0, 1, 1, 1, 1, 0, 0]> : tensor<8xi64>}
   // CHECK-DAG: %[[PAD_VAL:.+]] = "tosa.const"() <{value = dense<42> : tensor<i8>}
-  // CHECK-DAG: %[[PAD:.+]] = tosa.pad %arg0, %[[PAD_SHAPE]], %[[PAD_VAL]] : (tensor<4x10x10x2xi8>, tensor<4x2xi64>, tensor<i8>) -> tensor<4x12x12x2xi8>
+  // CHECK-DAG: %[[PAD:.+]] = tosa.pad %arg0, %[[PAD_SHAPE]], %[[PAD_VAL]] : (tensor<4x10x10x2xi8>, tensor<8xi64>, tensor<i8>) -> tensor<4x12x12x2xi8>
   // CHECK-DAG: %[[RESHAPE_INPUT:.+]] = tosa.reshape %[[PAD]] {new_shape = array<i64: 576, 2>}
   // CHECK-DAG: %[[RESHAPE_FILTER:.+]] = tosa.reshape %arg1 {new_shape = array<i64: 3, 2>}
   // CHECK-DAG: %[[FULLY:.+]] = tosa.fully_connected %[[RESHAPE_INPUT]], %[[RESHAPE_FILTER]], %arg2 {quantization_info = #tosa.conv_quant<input_zp = 42, weight_zp = 24>}
   // CHECK: %[[RESHAPE:.+]] = tosa.reshape %[[FULLY]] {new_shape = array<i64: 4, 12, 12, 3>}
-  %0 = tosa.conv2d %arg0, %arg1, %arg2 {pad = array<i64: 1, 1, 1, 1>, stride = array<i64: 1, 1>, dilation = array<i64: 1, 1>, quantization_info = #tosa.conv_quant<input_zp = 42, weight_zp = 24>} : (tensor<4x10x10x2xi8>, tensor<3x1x1x2xi8>, tensor<3xi32>) -> tensor<4x12x12x3xi32>
+  %0 = tosa.conv2d %arg0, %arg1, %arg2 {acc_type = i32, pad = array<i64: 1, 1, 1, 1>, stride = array<i64: 1, 1>, dilation = array<i64: 1, 1>, quantization_info = #tosa.conv_quant<input_zp = 42, weight_zp = 24>} : (tensor<4x10x10x2xi8>, tensor<3x1x1x2xi8>, tensor<3xi32>) -> tensor<4x12x12x3xi32>
   return %0 : tensor<4x12x12x3xi32>
 }
diff --git a/mlir/test/Dialect/Tosa/tosa-decompose-depthwise.mlir b/mlir/test/Dialect/Tosa/tosa-decompose-depthwise.mlir
index 2224bf3..cfff639 100644
--- a/mlir/test/Dialect/Tosa/tosa-decompose-depthwise.mlir
+++ b/mlir/test/Dialect/Tosa/tosa-decompose-depthwise.mlir
@@ -18,7 +18,7 @@ func.func @depthwise_conv2d_as_mul(%arg0: tensor<4x10x10x2xf32>, %arg1: tensor<1
   // CHECK: %[[VAR5:.*]] = tosa.add %[[VAR3]], %[[VAR4]]
   // CHECK-SAME: -> tensor<4x10x10x6xf32>
   // CHECK: return %[[VAR5]]
-  %0 = tosa.depthwise_conv2d %arg0, %arg1, %arg2 {pad = array<i64: 0, 0, 0, 0>, stride = array<i64: 1, 1>, dilation = array<i64: 1, 1>} : (tensor<4x10x10x2xf32>, tensor<1x1x2x3xf32>, tensor<6xf32>) -> tensor<4x10x10x6xf32>
+  %0 = tosa.depthwise_conv2d %arg0, %arg1, %arg2 {acc_type = f32, pad = array<i64: 0, 0, 0, 0>, stride = array<i64: 1, 1>, dilation = array<i64: 1, 1>} : (tensor<4x10x10x2xf32>, tensor<1x1x2x3xf32>, tensor<6xf32>) -> tensor<4x10x10x6xf32>
   return %0 : tensor<4x10x10x6xf32>
 }
 
@@ -38,7 +38,7 @@ func.func @depthwise_conv2d_as_mul_q(%arg0: tensor<4x10x10x2xi8>, %arg1: tensor<
   // CHECK: %[[reO:.+]] = tosa.reshape %[[mul]] {new_shape = array<i64: 4, 10, 10, 6>}
   // CHECK: %[[reArg2:.+]] = tosa.reshape %arg2 {new_shape = array<i64: 1, 1, 1, 6>}
   // CHECK: %[[add:.+]] = tosa.add %[[reO]], %[[reArg2]]
-  %0 = tosa.depthwise_conv2d %arg0, %arg1, %arg2 {pad = array<i64: 0, 0, 0, 0>, stride = array<i64: 1, 1>, dilation = array<i64: 1, 1>, quantization_info = #tosa.conv_quant<input_zp = 7, weight_zp = 11>} : (tensor<4x10x10x2xi8>, tensor<1x1x2x3xi8>, tensor<6xi32>) -> tensor<4x10x10x6xi32>
+  %0 = tosa.depthwise_conv2d %arg0, %arg1, %arg2 {acc_type = i32, pad = array<i64: 0, 0, 0, 0>, stride = array<i64: 1, 1>, dilation = array<i64: 1, 1>, quantization_info = #tosa.conv_quant<input_zp = 7, weight_zp = 11>} : (tensor<4x10x10x2xi8>, tensor<1x1x2x3xi8>, tensor<6xi32>) -> tensor<4x10x10x6xi32>
   return %0 : tensor<4x10x10x6xi32>
 }
 
@@ -46,15 +46,15 @@ func.func @depthwise_conv2d_as_mul_q(%arg0: tensor<4x10x10x2xi8>, %arg1: tensor<
 
 // CHECK-LABEL: @depthwise_conv2d_as_mul_padded
 func.func @depthwise_conv2d_as_mul_padded(%arg0: tensor<4x10x10x2xf32>, %arg1: tensor<1x1x2x3xf32>, %arg2: tensor<6xf32>) -> tensor<4x12x12x6xf32> {
-  // CHECK-DAG: %[[pad:.+]] = "tosa.const"() <{value = dense<{{\[\[}}0, 0], [1, 1], [1, 1], [0, 0], [0, 0]]> : tensor<5x2xi64>}
+  // CHECK-DAG: %[[pad:.+]] = "tosa.const"() <{value = dense<{{\[}}0, 0, 1, 1, 1, 1, 0, 0, 0, 0]> : tensor<10xi64>}
   // CHECK-DAG: %[[zero:.+]] = "tosa.const"() <{value = dense<0.000000e+00> : tensor<f32>}
   // CHECK: %[[reIn:.+]] = tosa.reshape %arg0 {new_shape = array<i64: 4, 10, 10, 2, 1>}
-  // CHECK: %[[padded:.+]] = tosa.pad %[[reIn]], %[[pad]], %[[zero]] : (tensor<4x10x10x2x1xf32>, tensor<5x2xi64>, tensor<f32>) -> tensor<4x12x12x2x1xf32>
+  // CHECK: %[[padded:.+]] = tosa.pad %[[reIn]], %[[pad]], %[[zero]] : (tensor<4x10x10x2x1xf32>, tensor<10xi64>, tensor<f32>) -> tensor<4x12x12x2x1xf32>
   // CHECK: %[[reArg1:.+]] = tosa.reshape %arg1 {new_shape = array<i64: 1, 1, 1, 2, 3>}
   // CHECK: %[[mul:.+]] = tosa.mul %3, %[[reArg1]] {shift = 0 : i8}
   // CHECK: %[[reOut:.+]] = tosa.reshape %[[mul]] {new_shape = array<i64: 4, 12, 12, 6>}
   // CHECK: %[[reArg2:.+]] = tosa.reshape %arg2 {new_shape = array<i64: 1, 1, 1, 6>}
   // CHECK: %[[add:.+]] = tosa.add %[[reOut]], %[[reArg2]]
-  %0 = tosa.depthwise_conv2d %arg0, %arg1, %arg2 {pad = array<i64: 1, 1, 1, 1>, stride = array<i64: 1, 1>, dilation = array<i64: 1, 1>} : (tensor<4x10x10x2xf32>, tensor<1x1x2x3xf32>, tensor<6xf32>) -> tensor<4x12x12x6xf32>
+  %0 = tosa.depthwise_conv2d %arg0, %arg1, %arg2 {acc_type = f32, pad = array<i64: 1, 1, 1, 1>, stride = array<i64: 1, 1>, dilation = array<i64: 1, 1>} : (tensor<4x10x10x2xf32>, tensor<1x1x2x3xf32>, tensor<6xf32>) -> tensor<4x12x12x6xf32>
   return %0 : tensor<4x12x12x6xf32>
 }
diff --git a/mlir/test/Dialect/Tosa/tosa-decompose-transpose-conv.mlir b/mlir/test/Dialect/Tosa/tosa-decompose-transpose-conv.mlir
index 1f2bb3f..c361c7c 100644
--- a/mlir/test/Dialect/Tosa/tosa-decompose-transpose-conv.mlir
+++ b/mlir/test/Dialect/Tosa/tosa-decompose-transpose-conv.mlir
@@ -6,7 +6,7 @@ func.func @transpose_conv2d(%arg0: tensor<2x16x14x3xf32>, %arg1: tensor<5x3x6x3x
   // CHECK: %[[REV2:.+]] = tosa.reverse %[[REV1]] {axis = 2 : i32}
   // CHECK: tosa.conv2d %arg0, %[[REV2]], %arg2
   // CHECK-SAME: dilation = array<i64: 1, 1>, pad = array<i64: 2, 2, 5, 5>, stride = array<i64: 1, 1>
-  %0 = tosa.transpose_conv2d %arg0, %arg1, %arg2 {out_pad = array<i64: 0, 0, 0, 0>, out_shape = array<i64: -1, -1, -1, -1>, stride = array<i64: 1, 1>} : (tensor<2x16x14x3xf32>, tensor<5x3x6x3xf32>, tensor<5xf32>) -> tensor<2x18x19x5xf32>
+  %0 = tosa.transpose_conv2d %arg0, %arg1, %arg2 {acc_type = f32, out_pad = array<i64: 0, 0, 0, 0>, out_shape = array<i64: -1, -1, -1, -1>, stride = array<i64: 1, 1>} : (tensor<2x16x14x3xf32>, tensor<5x3x6x3xf32>, tensor<5xf32>) -> tensor<2x18x19x5xf32>
   return %0 : tensor<2x18x19x5xf32>
 }
 
@@ -17,8 +17,8 @@ func.func @transpose_conv2d(%arg0: tensor<2x16x14x3xf32>, %arg1: tensor<5x3x6x3x
 func.func @transpose_conv2d_quantized(%arg0: tensor<2x16x14x3xi8>, %arg1: tensor<5x3x6x3xi8>, %arg2: tensor<5xi32>) -> (tensor<2x18x19x5xi32>) {
   // CHECK: %[[REV1:.+]] = tosa.reverse %arg1 {axis = 1 : i32}
   // CHECK: %[[REV2:.+]] = tosa.reverse %[[REV1]] {axis = 2 : i32}
-  // CHECK: tosa.conv2d %arg0, %[[REV2]], %arg2 {dilation = array<i64: 1, 1>, pad = array<i64: 2, 2, 5, 5>, quantization_info = #tosa.conv_quant<input_zp = -22, weight_zp = 42>, stride = array<i64: 1, 1>}
-  %0 = tosa.transpose_conv2d %arg0, %arg1, %arg2 {out_pad = array<i64: 0, 0, 0, 0>, quantization_info = #tosa.conv_quant<input_zp = -22, weight_zp = 42>, out_shape = array<i64: -1, -1, -1, -1>, stride = array<i64: 1, 1>} : (tensor<2x16x14x3xi8>, tensor<5x3x6x3xi8>, tensor<5xi32>) -> tensor<2x18x19x5xi32>
+  // CHECK: tosa.conv2d %arg0, %[[REV2]], %arg2 {acc_type = i32, dilation = array<i64: 1, 1>, pad = array<i64: 2, 2, 5, 5>, quantization_info = #tosa.conv_quant<input_zp = -22, weight_zp = 42>, stride = array<i64: 1, 1>}
+  %0 = tosa.transpose_conv2d %arg0, %arg1, %arg2 {acc_type = i32, out_pad = array<i64: 0, 0, 0, 0>, quantization_info = #tosa.conv_quant<input_zp = -22, weight_zp = 42>, out_shape = array<i64: -1, -1, -1, -1>, stride = array<i64: 1, 1>} : (tensor<2x16x14x3xi8>, tensor<5x3x6x3xi8>, tensor<5xi32>) -> tensor<2x18x19x5xi32>
   return %0 : tensor<2x18x19x5xi32>
 }
 
@@ -32,6 +32,7 @@ func.func @transpose_conv2d_quantized_padded(%arg0: tensor<2x16x14x3xi8>, %arg1:
   // CHECK-SAME: dilation = array<i64: 1, 1>, pad = array<i64: 3, 4, 8, 9>,
   // CHECK-SAME: quantization_info = #tosa.conv_quant<input_zp = -22, weight_zp = 42>, stride = array<i64: 1, 1>}
   %0 = tosa.transpose_conv2d %arg0, %arg1, %arg2 {
+    acc_type = i32,
     out_pad = array<i64: 1, 2, 3, 4>,
     quantization_info = #tosa.conv_quant<input_zp = -22, weight_zp = 42>,
     out_shape = array<i64: -1, -1, -1, -1>,
@@ -44,7 +45,7 @@ func.func @transpose_conv2d_quantized_padded(%arg0: tensor<2x16x14x3xi8>, %arg1:
 // CHECK-LABEL: @transpose_conv2d_strided
 func.func @transpose_conv2d_strided(%arg0: tensor<2x17x15x3xf32>, %arg1: tensor<5x3x5x3xf32>, %arg2: tensor<5xf32>) -> tensor<2x?x?x5xf32> {
   // Manipulate the weight matrix to handle striding.
-  // CHECK-DAG: %[[PADV:.+]]  = "tosa.const"() <{value = dense<{{\[\[}}0, 0], [0, 1], [0, 1], [0, 0]]> : tensor<4x2xi32>}
+  // CHECK-DAG: %[[PADV:.+]]  = "tosa.const"() <{value = dense<{{\[}}0, 0, 0, 1, 0, 1, 0, 0]> : tensor<8xi32>}
   // CHECK-DAG: %[[TRANSV:.+]]  = "tosa.const"() <{value = dense<[2, 4, 0, 1, 3, 5]> : tensor<6xi32>}
   // CHECK-DAG: %[[PADW:.+]]  = tosa.pad %arg1, %[[PADV]]
   // CHECK-DAG: %[[RESW1:.+]]  = tosa.reshape %[[PADW]] {new_shape = array<i64: 5, 2, 2, 2, 3, 3>}
@@ -54,20 +55,20 @@ func.func @transpose_conv2d_strided(%arg0: tensor<2x17x15x3xf32>, %arg1: tensor<
   // CHECK-DAG: %[[NEWWEIGHT:.+]] = tosa.reverse %[[REV1]] {axis = 2 : i32}
 
   // Pad out the input matrix to handle the transpose conv.
-  // CHECK-DAG: %[[PAD:.+]]  = "tosa.const"() <{value = dense<{{\[\[}}0, 0], [1, 1], [1, 1], [0, 0]]> : tensor<4x2xi32>}
+  // CHECK-DAG: %[[PAD:.+]]  = "tosa.const"() <{value = dense<{{\[}}0, 0, 1, 1, 1, 1, 0, 0]> : tensor<8xi32>}
   // CHECK-DAG: %[[TRANS2:.+]]  = "tosa.const"() <{value = dense<[0, 1, 3, 2, 4, 5]> : tensor<6xi32>}
   // CHECK-DAG: %[[NEWINPUT:.+]] = tosa.pad %arg0, %[[PAD]]
 
   // Manipulate the final shape.
   // CHECK-DAG: %[[BIAS:.+]]  = "tosa.const"() <{value = dense<0.000000e+00> : tensor<30xf32>}
-  // CHECK-DAG: %[[CONV:.+]] = tosa.conv2d %[[NEWINPUT]], %[[NEWWEIGHT]], %[[BIAS]] {dilation = array<i64: 1, 1>, pad = array<i64: 0, 0, 0, 0>, stride = array<i64: 1, 1>}
+  // CHECK-DAG: %[[CONV:.+]] = tosa.conv2d %[[NEWINPUT]], %[[NEWWEIGHT]], %[[BIAS]] {acc_type = f32, dilation = array<i64: 1, 1>, pad = array<i64: 0, 0, 0, 0>, stride = array<i64: 1, 1>}
   // CHECK-DAG: %[[RESHAPE_OUT_1:.+]] = tosa.reshape %[[CONV]] {new_shape = array<i64: 2, 18, 16, 2, 3, 5>}
   // CHECK-DAG: %[[TRANS_OUT:.+]] = tosa.transpose %[[RESHAPE_OUT_1]], %[[TRANS2]]
   // CHECK-DAG: %[[RESHAPE_OUT_2:.+]] = tosa.reshape %[[TRANS_OUT]] {new_shape = array<i64: 2, 36, 48, 5>}
   // CHECK-DAG: %[[SLICE:.+]] = tosa.slice %[[RESHAPE_OUT_2]] {size = array<i64: 2, 35, 47, 5>, start = array<i64: 0, 0, 0, 0>}
   // CHECK-DAG: %[[RESHAPE_ARG2:.+]] = tosa.reshape %arg2 {new_shape = array<i64: 1, 1, 1, 5>}
   // CHECK: %[[ADD:.+]] = tosa.add %[[SLICE]], %[[RESHAPE_ARG2]]
-  %0 = tosa.transpose_conv2d %arg0, %arg1, %arg2 {out_pad = array<i64: 0, 0, 0, 0>, out_shape = array<i64: -1, -1, -1, -1>, stride = array<i64: 2, 3>} : (tensor<2x17x15x3xf32>, tensor<5x3x5x3xf32>, tensor<5xf32>) -> tensor<2x35x47x5xf32>
+  %0 = tosa.transpose_conv2d %arg0, %arg1, %arg2 {acc_type = f32, out_pad = array<i64: 0, 0, 0, 0>, out_shape = array<i64: -1, -1, -1, -1>, stride = array<i64: 2, 3>} : (tensor<2x17x15x3xf32>, tensor<5x3x5x3xf32>, tensor<5xf32>) -> tensor<2x35x47x5xf32>
   %1 = tensor.cast %0 : tensor<2x35x47x5xf32> to tensor<2x?x?x5xf32>
   return %1 : tensor<2x?x?x5xf32>
 }
@@ -77,7 +78,7 @@ func.func @transpose_conv2d_strided(%arg0: tensor<2x17x15x3xf32>, %arg1: tensor<
 // CHECK-LABEL: @transpose_conv2d_strided_quantized
 func.func @transpose_conv2d_strided_quantized(%arg0: tensor<2x17x15x3xi8>, %arg1: tensor<5x3x5x3xi8>, %arg2: tensor<5xi32>) -> (tensor<2x35x47x5xi32>) {
   // Manipulate the weight matrix to handle striding.
-  // CHECK-DAG: %[[PADV:.+]]  = "tosa.const"() <{value = dense<{{\[\[}}0, 0], [0, 1], [0, 1], [0, 0]]> : tensor<4x2xi32>}
+  // CHECK-DAG: %[[PADV:.+]]  = "tosa.const"() <{value = dense<{{\[}}0, 0, 0, 1, 0, 1, 0, 0]> : tensor<8xi32>}
   // CHECK-DAG: %[[TRANSV:.+]]  = "tosa.const"() <{value = dense<[2, 4, 0, 1, 3, 5]> : tensor<6xi32>}
   // CHECK-DAG: %[[PADW:.+]]  = tosa.pad %arg1, %[[PADV]] {quantization_info = #tosa.pad_quant<input_zp = 42>}
   // CHECK-DAG: %[[RESW1:.+]]  = tosa.reshape %[[PADW]] {new_shape = array<i64: 5, 2, 2, 2, 3, 3>}
@@ -87,20 +88,20 @@ func.func @transpose_conv2d_strided_quantized(%arg0: tensor<2x17x15x3xi8>, %arg1
   // CHECK-DAG: %[[NEWWEIGHT:.+]] = tosa.reverse %[[REV1]] {axis = 2 : i32}
 
   // Pad out the input matrix to handle the transpose conv.
-  // CHECK-DAG: %[[PAD:.+]]  = "tosa.const"() <{value = dense<{{\[\[}}0, 0], [1, 1], [1, 1], [0, 0]]> : tensor<4x2xi32>}
+  // CHECK-DAG: %[[PAD:.+]]  = "tosa.const"() <{value = dense<{{\[}}0, 0, 1, 1, 1, 1, 0, 0]> : tensor<8xi32>}
   // CHECK-DAG: %[[TRANS2:.+]]  = "tosa.const"() <{value = dense<[0, 1, 3, 2, 4, 5]> : tensor<6xi32>}
   // CHECK-DAG: %[[NEWINPUT:.+]] = tosa.pad %arg0, %[[PAD]] {quantization_info = #tosa.pad_quant<input_zp = -22>}
 
   // Manipulate the final shape.
   // CHECK-DAG: %[[BIAS:.+]]  = "tosa.const"() <{value = dense<0> : tensor<30xi32>}
-  // CHECK-DAG: %[[CONV:.+]] = tosa.conv2d %[[NEWINPUT]], %[[NEWWEIGHT]], %[[BIAS]] {dilation = array<i64: 1, 1>, pad = array<i64: 0, 0, 0, 0>, quantization_info = #tosa.conv_quant<input_zp = -22, weight_zp = 42>, stride = array<i64: 1, 1>}
+  // CHECK-DAG: %[[CONV:.+]] = tosa.conv2d %[[NEWINPUT]], %[[NEWWEIGHT]], %[[BIAS]] {acc_type = i32, dilation = array<i64: 1, 1>, pad = array<i64: 0, 0, 0, 0>, quantization_info = #tosa.conv_quant<input_zp = -22, weight_zp = 42>, stride = array<i64: 1, 1>}
   // CHECK-DAG: %[[RESHAPE_OUT_1:.+]] = tosa.reshape %[[CONV]] {new_shape = array<i64: 2, 18, 16, 2, 3, 5>}
   // CHECK-DAG: %[[TRANS_OUT:.+]] = tosa.transpose %[[RESHAPE_OUT_1]], %[[TRANS2]]
   // CHECK-DAG: %[[RESHAPE_OUT_2:.+]] = tosa.reshape %[[TRANS_OUT]] {new_shape = array<i64: 2, 36, 48, 5>}
   // CHECK-DAG: %[[SLICE:.+]] = tosa.slice %[[RESHAPE_OUT_2]] {size = array<i64: 2, 35, 47, 5>, start = array<i64: 0, 0, 0, 0>}
   // CHECK-DAG: %[[RESHAPE_ARG2:.+]] = tosa.reshape %arg2 {new_shape = array<i64: 1, 1, 1, 5>}
   // CHECK: %[[ADD:.+]] = tosa.add %[[SLICE]], %[[RESHAPE_ARG2]]
-  %0 = tosa.transpose_conv2d %arg0, %arg1, %arg2 {out_pad = array<i64: 0, 0, 0, 0>, quantization_info = #tosa.conv_quant<input_zp = -22, weight_zp = 42>, out_shape = array<i64: -1, -1, -1, -1>, stride = array<i64: 2, 3>} : (tensor<2x17x15x3xi8>, tensor<5x3x5x3xi8>, tensor<5xi32>) -> tensor<2x35x47x5xi32>
+  %0 = tosa.transpose_conv2d %arg0, %arg1, %arg2 {acc_type = i32, out_pad = array<i64: 0, 0, 0, 0>, quantization_info = #tosa.conv_quant<input_zp = -22, weight_zp = 42>, out_shape = array<i64: -1, -1, -1, -1>, stride = array<i64: 2, 3>} : (tensor<2x17x15x3xi8>, tensor<5x3x5x3xi8>, tensor<5xi32>) -> tensor<2x35x47x5xi32>
   return %0 : tensor<2x35x47x5xi32>
 }
 
@@ -108,12 +109,12 @@ func.func @transpose_conv2d_strided_quantized(%arg0: tensor<2x17x15x3xi8>, %arg1
 
 // CHECK-LABEL: @transpose_conv2d_strided_overpad
 func.func @transpose_conv2d_strided_overpad(%arg0 : tensor<1x16x1x1xi8>, %arg1 : tensor<1x2x1x1xi8>, %arg2 : tensor<1xi32>) -> (tensor<1x19x2x1xi32>) {
-  // CHECK-DAG: %[[WEIGHT_PAD:.+]] = "tosa.const"() <{value = dense<{{\[}}[0, 0], [0, 0], [0, 1], [0, 0]]> : tensor<4x2xi32>
+  // CHECK-DAG: %[[WEIGHT_PAD:.+]] = "tosa.const"() <{value = dense<{{\[}}0, 0, 0, 0, 0, 1, 0, 0]> : tensor<8xi32>
   // CHECK-DAG: %[[WEIGHT_PERMS:.+]] = "tosa.const"() <{value = dense<[2, 4, 0, 1, 3, 5]> : tensor<6xi32>}
-  // CHECK-DAG: %[[INPUT_PAD:.+]] = "tosa.const"() <{value = dense<{{\[}}[0, 0], [1, 1], [0, 0], [0, 0]]> : tensor<4x2xi32>}
+  // CHECK-DAG: %[[INPUT_PAD:.+]] = "tosa.const"() <{value = dense<{{\[}}0, 0, 1, 1, 0, 0, 0, 0]> : tensor<8xi32>}
   // CHECK-DAG: %[[ZERO:.+]] = "tosa.const"() <{value = dense<0> : tensor<2xi32>}
   // CHECK-DAG: %[[RESULT_PERMS:.+]] = "tosa.const"() <{value = dense<[0, 1, 3, 2, 4, 5]> : tensor<6xi32>}
-  // CHECK-DAG: %[[RESULT_PAD:.+]] = "tosa.const"() <{value = dense<{{\[}}[0, 0], [2, 0], [0, 0], [0, 0]]> : tensor<4x2xi32>}
+  // CHECK-DAG: %[[RESULT_PAD:.+]] = "tosa.const"() <{value = dense<{{\[}}0, 0, 2, 0, 0, 0, 0, 0]> : tensor<8xi32>}
   // CHECK: %[[PAD_WEIGHT:.+]] = tosa.pad %arg1, %[[WEIGHT_PAD]] {quantization_info = #tosa.pad_quant<input_zp = 93>}
   // CHECK: %[[RESHAPE_WEIGHT_0:.+]] = tosa.reshape %[[PAD_WEIGHT]] {new_shape = array<i64: 1, 2, 1, 1, 2, 1>}
   // CHECK: %[[TRANSPOSE_WEIGHT:.+]] = tosa.transpose %[[RESHAPE_WEIGHT_0]], %[[WEIGHT_PERMS]]
@@ -129,6 +130,7 @@ func.func @transpose_conv2d_strided_overpad(%arg0 : tensor<1x16x1x1xi8>, %arg1 :
   // CHECK: %[[RESHAPE_ARG2:.+]] = tosa.reshape %arg2 {new_shape = array<i64: 1, 1, 1, 1>}
   // CHECK: %[[ADD:.+]] = tosa.add %[[PAD_RESULT]], %[[RESHAPE_ARG2]]
   %2 =  tosa.transpose_conv2d %arg0, %arg1, %arg2 {
+    acc_type = i32,
     out_pad = array<i64: 2, 0, 0, 1>,
     out_shape = array<i64: 1, -1, -1, 1>,
     stride = array<i64: 1, 2>,
diff --git a/mlir/test/Dialect/Tosa/tosa-infer-shapes.mlir b/mlir/test/Dialect/Tosa/tosa-infer-shapes.mlir
index d46de74..82f3e22 100644
--- a/mlir/test/Dialect/Tosa/tosa-infer-shapes.mlir
+++ b/mlir/test/Dialect/Tosa/tosa-infer-shapes.mlir
@@ -495,9 +495,9 @@ func.func @test_concat_axis_1(%arg0 : tensor<2x1xf32>, %arg1 : tensor<2x2xf32>)
 // -----
 
 // CHECK-LABEL: @test_padding_no_const
-func.func @test_padding_no_const(%arg0 : tensor<1x2xf32>, %arg1 : tensor<2x2xi32>) -> () {
-  // CHECK: tosa.pad %arg0, %arg1 : (tensor<1x2xf32>, tensor<2x2xi32>) -> tensor<?x?xf32>
-  %0 = tosa.pad %arg0, %arg1  : (tensor<1x2xf32>, tensor<2x2xi32>) -> tensor<?x?xf32>
+func.func @test_padding_no_const(%arg0 : tensor<1x2xf32>, %arg1 : tensor<4xi32>) -> () {
+  // CHECK: tosa.pad %arg0, %arg1 : (tensor<1x2xf32>, tensor<4xi32>) -> tensor<?x?xf32>
+  %0 = tosa.pad %arg0, %arg1  : (tensor<1x2xf32>, tensor<4xi32>) -> tensor<?x?xf32>
   return
 }
 
@@ -505,9 +505,9 @@ func.func @test_padding_no_const(%arg0 : tensor<1x2xf32>, %arg1 : tensor<2x2xi32
 
 // CHECK-LABEL:@test_padding_dynamic_input
 func.func @test_padding_dynamic_input(%arg0 : tensor<1x?xf32>) -> () {
-  %0 = arith.constant dense<[[1, 2], [3, 4]]> : tensor<2x2xi32>
-  // CHECK: tosa.pad %arg0, %cst : (tensor<1x?xf32>, tensor<2x2xi32>) -> tensor<4x?xf32>
-  %1 = tosa.pad %arg0, %0  : (tensor<1x?xf32>, tensor<2x2xi32>) -> tensor<?x?xf32>
+  %0 = arith.constant dense<[1, 2, 3, 4]> : tensor<4xi32>
+  // CHECK: tosa.pad %arg0, %cst : (tensor<1x?xf32>, tensor<4xi32>) -> tensor<4x?xf32>
+  %1 = tosa.pad %arg0, %0  : (tensor<1x?xf32>, tensor<4xi32>) -> tensor<?x?xf32>
   return
 }
 
@@ -515,9 +515,9 @@ func.func @test_padding_dynamic_input(%arg0 : tensor<1x?xf32>) -> () {
 
 // CHECK-LABEL: @test_padding_simple
 func.func @test_padding_simple(%arg0 : tensor<1x2xf32>) -> () {
-  %0 = arith.constant dense<[[1, 2], [3, 4]]> : tensor<2x2xi32>
-  // CHECK: tosa.pad %arg0, %cst : (tensor<1x2xf32>, tensor<2x2xi32>) -> tensor<4x9xf32>
-  %1 = tosa.pad %arg0, %0  : (tensor<1x2xf32>, tensor<2x2xi32>) -> tensor<?x?xf32>
+  %0 = arith.constant dense<[1, 2, 3, 4]> : tensor<4xi32>
+  // CHECK: tosa.pad %arg0, %cst : (tensor<1x2xf32>, tensor<4xi32>) -> tensor<4x9xf32>
+  %1 = tosa.pad %arg0, %0  : (tensor<1x2xf32>, tensor<4xi32>) -> tensor<?x?xf32>
   return
 }
 
@@ -674,7 +674,7 @@ func.func @test_pool_static(%arg0: tensor<3x5x6x7xf32>) {
 // CHECK-LABEL: @conv2d_static
 func.func @conv2d_static(%input: tensor<2x8x9x3xf32>, %weights: tensor<5x3x6x3xf32>, %bias: tensor<5xf32>) -> () {
   // CHECK: -> tensor<2x6x4x5xf32>
-  %0 = tosa.conv2d %input, %weights, %bias {pad = array<i64: 0, 0, 0, 0>, stride = array<i64: 1, 1>, dilation = array<i64: 1, 1>} : (tensor<2x8x9x3xf32>, tensor<5x3x6x3xf32>, tensor<5xf32>) -> tensor<?x?x?x?xf32>
+  %0 = tosa.conv2d %input, %weights, %bias {acc_type = f32, pad = array<i64: 0, 0, 0, 0>, stride = array<i64: 1, 1>, dilation = array<i64: 1, 1>} : (tensor<2x8x9x3xf32>, tensor<5x3x6x3xf32>, tensor<5xf32>) -> tensor<?x?x?x?xf32>
   return
 }
 
@@ -683,7 +683,7 @@ func.func @conv2d_static(%input: tensor<2x8x9x3xf32>, %weights: tensor<5x3x6x3xf
 // CHECK-LABEL: @conv2d_dynamic_input
 func.func @conv2d_dynamic_input(%input: tensor<?x?x?x?xf32>, %weights: tensor<5x3x6x3xf32>, %bias: tensor<5xf32>) -> () {
   // CHECK: -> tensor<?x?x?x5xf32>
-  %0 = tosa.conv2d %input, %weights, %bias {pad = array<i64: 0, 0, 0, 0>, stride = array<i64: 1, 1>, dilation = array<i64: 1, 1>} : (tensor<?x?x?x?xf32>, tensor<5x3x6x3xf32>, tensor<5xf32>) -> tensor<?x?x?x?xf32>
+  %0 = tosa.conv2d %input, %weights, %bias {acc_type = f32, pad = array<i64: 0, 0, 0, 0>, stride = array<i64: 1, 1>, dilation = array<i64: 1, 1>} : (tensor<?x?x?x?xf32>, tensor<5x3x6x3xf32>, tensor<5xf32>) -> tensor<?x?x?x?xf32>
   return
 }
 
@@ -716,7 +716,7 @@ func.func @test_pool_padded(%arg0: tensor<3x5x6x7xf32>) {
 // CHECK-LABEL: @conv2d_dynamic_weight
 func.func @conv2d_dynamic_weight(%input: tensor<2x8x9x3xf32>, %weights: tensor<?x?x?x?xf32>, %bias: tensor<5xf32>) -> () {
   // CHECK: -> tensor<2x?x?x5xf32>
-  %0 = tosa.conv2d %input, %weights, %bias {pad = array<i64: 0, 0, 0, 0>, stride = array<i64: 1, 1>, dilation = array<i64: 1, 1>} : (tensor<2x8x9x3xf32>, tensor<?x?x?x?xf32>, tensor<5xf32>) -> tensor<?x?x?x?xf32>
+  %0 = tosa.conv2d %input, %weights, %bias {acc_type = f32, pad = array<i64: 0, 0, 0, 0>, stride = array<i64: 1, 1>, dilation = array<i64: 1, 1>} : (tensor<2x8x9x3xf32>, tensor<?x?x?x?xf32>, tensor<5xf32>) -> tensor<?x?x?x?xf32>
   return
 }
 
@@ -725,7 +725,7 @@ func.func @conv2d_dynamic_weight(%input: tensor<2x8x9x3xf32>, %weights: tensor<?
 // CHECK-LABEL: @conv2d_dynamic_bias
 func.func @conv2d_dynamic_bias(%input: tensor<2x8x9x3xf32>, %weights: tensor<5x3x6x3xf32>, %bias: tensor<?xf32>) -> () {
   // CHECK: -> tensor<2x6x4x5xf32>
-  %0 = tosa.conv2d %input, %weights, %bias {pad = array<i64: 0, 0, 0, 0>, stride = array<i64: 1, 1>, dilation = array<i64: 1, 1>} : (tensor<2x8x9x3xf32>, tensor<5x3x6x3xf32>, tensor<?xf32>) -> tensor<?x?x?x?xf32>
+  %0 = tosa.conv2d %input, %weights, %bias {acc_type = f32, pad = array<i64: 0, 0, 0, 0>, stride = array<i64: 1, 1>, dilation = array<i64: 1, 1>} : (tensor<2x8x9x3xf32>, tensor<5x3x6x3xf32>, tensor<?xf32>) -> tensor<?x?x?x?xf32>
   return
 }
 
@@ -746,7 +746,7 @@ func.func @test_pool_stride(%arg0: tensor<3x11x12x7xf32>) {
 // CHECK-LABEL: @conv2d_padded
 func.func @conv2d_padded(%input: tensor<2x8x9x3xf32>, %weights: tensor<5x3x6x3xf32>, %bias: tensor<5xf32>) -> () {
   // CHECK: -> tensor<2x9x11x5xf32>
-  %0 = tosa.conv2d %input, %weights, %bias {pad = array<i64: 1, 2, 3, 4>, stride = array<i64: 1, 1>, dilation = array<i64: 1, 1>} : (tensor<2x8x9x3xf32>, tensor<5x3x6x3xf32>, tensor<5xf32>) -> tensor<?x?x?x?xf32>
+  %0 = tosa.conv2d %input, %weights, %bias {acc_type = f32, pad = array<i64: 1, 2, 3, 4>, stride = array<i64: 1, 1>, dilation = array<i64: 1, 1>} : (tensor<2x8x9x3xf32>, tensor<5x3x6x3xf32>, tensor<5xf32>) -> tensor<?x?x?x?xf32>
   return
 }
 
@@ -755,7 +755,7 @@ func.func @conv2d_padded(%input: tensor<2x8x9x3xf32>, %weights: tensor<5x3x6x3xf
 // CHECK-LABEL: @conv2d_dilated
 func.func @conv2d_dilated(%input: tensor<2x12x14x3xf32>, %weights: tensor<5x3x6x3xf32>, %bias: tensor<5xf32>) -> () {
   // CHECK: -> tensor<2x6x4x5xf32>
-  %0 = tosa.conv2d %input, %weights, %bias {pad = array<i64: 0, 0, 0, 0>, stride = array<i64: 1, 1>, dilation = array<i64: 3, 2>} : (tensor<2x12x14x3xf32>, tensor<5x3x6x3xf32>, tensor<5xf32>) -> tensor<?x?x?x?xf32>
+  %0 = tosa.conv2d %input, %weights, %bias {acc_type = f32, pad = array<i64: 0, 0, 0, 0>, stride = array<i64: 1, 1>, dilation = array<i64: 3, 2>} : (tensor<2x12x14x3xf32>, tensor<5x3x6x3xf32>, tensor<5xf32>) -> tensor<?x?x?x?xf32>
   return
 }
 
@@ -764,7 +764,7 @@ func.func @conv2d_dilated(%input: tensor<2x12x14x3xf32>, %weights: tensor<5x3x6x
 // CHECK-LABEL: @conv2d_strided
 func.func @conv2d_strided(%input: tensor<1x13x14x1xf32>, %weights: tensor<1x1x1x1xf32>, %bias: tensor<1xf32>) -> () {
   // CHECK: -> tensor<1x5x7x1xf32>
-  %0 = tosa.conv2d %input, %weights, %bias {pad = array<i64: 0, 0, 0, 0>, stride = array<i64: 3, 2>, dilation = array<i64: 1, 1>} : (tensor<1x13x14x1xf32>, tensor<1x1x1x1xf32>, tensor<1xf32>) -> tensor<?x?x?x?xf32>
+  %0 = tosa.conv2d %input, %weights, %bias {acc_type = f32, pad = array<i64: 0, 0, 0, 0>, stride = array<i64: 3, 2>, dilation = array<i64: 1, 1>} : (tensor<1x13x14x1xf32>, tensor<1x1x1x1xf32>, tensor<1xf32>) -> tensor<?x?x?x?xf32>
   return
 }
 
@@ -773,7 +773,7 @@ func.func @conv2d_strided(%input: tensor<1x13x14x1xf32>, %weights: tensor<1x1x1x
 // CHECK-LABEL: @conv3d_static
 func.func @conv3d_static(%input: tensor<2x8x9x10x3xf32>, %weights: tensor<5x3x6x4x3xf32>, %bias: tensor<5xf32>) -> () {
   // CHECK: -> tensor<2x6x4x7x5xf32>
-  %0 = tosa.conv3d %input, %weights, %bias {dilation = array<i64: 1, 1, 1>, pad = array<i64: 0, 0, 0, 0, 0, 0>, stride = array<i64: 1, 1, 1>} : (tensor<2x8x9x10x3xf32>, tensor<5x3x6x4x3xf32>, tensor<5xf32>) -> tensor<?x?x?x?x?xf32>
+  %0 = tosa.conv3d %input, %weights, %bias {acc_type = f32, dilation = array<i64: 1, 1, 1>, pad = array<i64: 0, 0, 0, 0, 0, 0>, stride = array<i64: 1, 1, 1>} : (tensor<2x8x9x10x3xf32>, tensor<5x3x6x4x3xf32>, tensor<5xf32>) -> tensor<?x?x?x?x?xf32>
   return
 }
 
@@ -782,7 +782,7 @@ func.func @conv3d_static(%input: tensor<2x8x9x10x3xf32>, %weights: tensor<5x3x6x
 // CHECK-LABEL: @conv3d_dynamic_input
 func.func @conv3d_dynamic_input(%arg0: tensor<?x?x?x?x?xf32>, %arg1: tensor<5x3x6x4x3xf32>, %arg2: tensor<5xf32>) {
   // CHECK: -> tensor<?x?x?x?x5xf32>
-  %0 = tosa.conv3d %arg0, %arg1, %arg2 {dilation = array<i64: 1, 1, 1>, pad = array<i64: 0, 0, 0, 0, 0, 0>, stride = array<i64: 1, 1, 1>} : (tensor<?x?x?x?x?xf32>, tensor<5x3x6x4x3xf32>, tensor<5xf32>) -> tensor<?x?x?x?x?xf32>
+  %0 = tosa.conv3d %arg0, %arg1, %arg2 {acc_type = f32, dilation = array<i64: 1, 1, 1>, pad = array<i64: 0, 0, 0, 0, 0, 0>, stride = array<i64: 1, 1, 1>} : (tensor<?x?x?x?x?xf32>, tensor<5x3x6x4x3xf32>, tensor<5xf32>) -> tensor<?x?x?x?x?xf32>
   return
 }
 
@@ -791,7 +791,7 @@ func.func @conv3d_dynamic_input(%arg0: tensor<?x?x?x?x?xf32>, %arg1: tensor<5x3x
 // CHECK-LABEL: @conv3d_dynamic_weight
 func.func @conv3d_dynamic_weight(%arg0: tensor<2x8x9x10x3xf32>, %arg1: tensor<?x?x?x?x?xf32>, %arg2: tensor<5xf32>) {
   // CHECK: -> tensor<2x?x?x?x5xf32>
-  %0 = tosa.conv3d %arg0, %arg1, %arg2 {dilation = array<i64: 1, 1, 1>, pad = array<i64: 0, 0, 0, 0, 0, 0>, stride = array<i64: 1, 1, 1>} : (tensor<2x8x9x10x3xf32>, tensor<?x?x?x?x?xf32>, tensor<5xf32>) -> tensor<?x?x?x?x?xf32>
+  %0 = tosa.conv3d %arg0, %arg1, %arg2 {acc_type = f32, dilation = array<i64: 1, 1, 1>, pad = array<i64: 0, 0, 0, 0, 0, 0>, stride = array<i64: 1, 1, 1>} : (tensor<2x8x9x10x3xf32>, tensor<?x?x?x?x?xf32>, tensor<5xf32>) -> tensor<?x?x?x?x?xf32>
   return
 }
 
@@ -800,7 +800,7 @@ func.func @conv3d_dynamic_weight(%arg0: tensor<2x8x9x10x3xf32>, %arg1: tensor<?x
 // CHECK-LABEL: @conv3d_dynamic_bias
 func.func @conv3d_dynamic_bias(%arg0: tensor<2x8x9x10x3xf32>, %arg1: tensor<5x3x6x4x3xf32>, %arg2: tensor<?xf32>) {
   // CHECK: -> tensor<2x6x4x7x5xf32>
-  %0 = tosa.conv3d %arg0, %arg1, %arg2 {dilation = array<i64: 1, 1, 1>, pad = array<i64: 0, 0, 0, 0, 0, 0>, stride = array<i64: 1, 1, 1>} : (tensor<2x8x9x10x3xf32>, tensor<5x3x6x4x3xf32>, tensor<?xf32>) -> tensor<?x?x?x?x?xf32>
+  %0 = tosa.conv3d %arg0, %arg1, %arg2 {acc_type = f32, dilation = array<i64: 1, 1, 1>, pad = array<i64: 0, 0, 0, 0, 0, 0>, stride = array<i64: 1, 1, 1>} : (tensor<2x8x9x10x3xf32>, tensor<5x3x6x4x3xf32>, tensor<?xf32>) -> tensor<?x?x?x?x?xf32>
   return
 }
 
@@ -809,7 +809,7 @@ func.func @conv3d_dynamic_bias(%arg0: tensor<2x8x9x10x3xf32>, %arg1: tensor<5x3x
 // CHECK-LABEL: @conv3d_padded
 func.func @conv3d_padded(%arg0: tensor<2x8x9x10x3xf32>, %arg1: tensor<5x3x6x4x3xf32>, %arg2: tensor<5xf32>) {
   // CHECK: -> tensor<2x9x11x18x5xf32>
-  %0 = tosa.conv3d %arg0, %arg1, %arg2 {dilation = array<i64: 1, 1, 1>, pad = array<i64: 1, 2, 3, 4, 5, 6>, stride = array<i64: 1, 1, 1>} : (tensor<2x8x9x10x3xf32>, tensor<5x3x6x4x3xf32>, tensor<5xf32>) -> tensor<?x?x?x?x?xf32>
+  %0 = tosa.conv3d %arg0, %arg1, %arg2 {acc_type = f32, dilation = array<i64: 1, 1, 1>, pad = array<i64: 1, 2, 3, 4, 5, 6>, stride = array<i64: 1, 1, 1>} : (tensor<2x8x9x10x3xf32>, tensor<5x3x6x4x3xf32>, tensor<5xf32>) -> tensor<?x?x?x?x?xf32>
   return
 }
 
@@ -818,7 +818,7 @@ func.func @conv3d_padded(%arg0: tensor<2x8x9x10x3xf32>, %arg1: tensor<5x3x6x4x3x
 // CHECK-LABEL: @conv3d_dilated
 func.func @conv3d_dilated(%arg0: tensor<2x12x14x16x3xf32>, %arg1: tensor<5x3x6x2x3xf32>, %arg2: tensor<5xf32>) {
   // CHECK: -> tensor<2x6x4x12x5xf32>
-  %0 = tosa.conv3d %arg0, %arg1, %arg2 {dilation = array<i64: 3, 2, 4>, pad = array<i64: 0, 0, 0, 0, 0, 0>, stride = array<i64: 1, 1, 1>} : (tensor<2x12x14x16x3xf32>, tensor<5x3x6x2x3xf32>, tensor<5xf32>) -> tensor<?x?x?x?x?xf32>
+  %0 = tosa.conv3d %arg0, %arg1, %arg2 {acc_type = f32, dilation = array<i64: 3, 2, 4>, pad = array<i64: 0, 0, 0, 0, 0, 0>, stride = array<i64: 1, 1, 1>} : (tensor<2x12x14x16x3xf32>, tensor<5x3x6x2x3xf32>, tensor<5xf32>) -> tensor<?x?x?x?x?xf32>
   return
 }
 
@@ -827,7 +827,7 @@ func.func @conv3d_dilated(%arg0: tensor<2x12x14x16x3xf32>, %arg1: tensor<5x3x6x2
 // CHECK-LABEL: @conv3d_strided
 func.func @conv3d_strided(%arg0: tensor<1x13x14x15x1xf32>, %arg1: tensor<1x1x1x1x1xf32>, %arg2: tensor<1xf32>) {
   // CHECK: -> tensor<1x5x7x4x1xf32>
-  %0 = tosa.conv3d %arg0, %arg1, %arg2 {dilation = array<i64: 1, 1, 1>, pad = array<i64: 0, 0, 0, 0, 0, 0>, stride = array<i64: 3, 2, 4>} : (tensor<1x13x14x15x1xf32>, tensor<1x1x1x1x1xf32>, tensor<1xf32>) -> tensor<?x?x?x?x?xf32>
+  %0 = tosa.conv3d %arg0, %arg1, %arg2 {acc_type = f32, dilation = array<i64: 1, 1, 1>, pad = array<i64: 0, 0, 0, 0, 0, 0>, stride = array<i64: 3, 2, 4>} : (tensor<1x13x14x15x1xf32>, tensor<1x1x1x1x1xf32>, tensor<1xf32>) -> tensor<?x?x?x?x?xf32>
   return
 }
 
@@ -836,7 +836,7 @@ func.func @conv3d_strided(%arg0: tensor<1x13x14x15x1xf32>, %arg1: tensor<1x1x1x1
 // CHECK-LABEL: @depthwise_conv2d_static
 func.func @depthwise_conv2d_static(%arg0: tensor<2x8x9x3xf32>, %arg1: tensor<3x6x3x5xf32>, %arg2: tensor<15xf32>) {
   // CHECK: -> tensor<2x6x4x15xf32>
-  %0 = tosa.depthwise_conv2d %arg0, %arg1, %arg2 {dilation = array<i64: 1, 1>, pad = array<i64: 0, 0, 0, 0>, stride = array<i64: 1, 1>} : (tensor<2x8x9x3xf32>, tensor<3x6x3x5xf32>, tensor<15xf32>) -> tensor<2x6x4x15xf32>
+  %0 = tosa.depthwise_conv2d %arg0, %arg1, %arg2 {acc_type = f32, dilation = array<i64: 1, 1>, pad = array<i64: 0, 0, 0, 0>, stride = array<i64: 1, 1>} : (tensor<2x8x9x3xf32>, tensor<3x6x3x5xf32>, tensor<15xf32>) -> tensor<2x6x4x15xf32>
   return
 }
 
@@ -845,7 +845,7 @@ func.func @depthwise_conv2d_static(%arg0: tensor<2x8x9x3xf32>, %arg1: tensor<3x6
 // CHECK-LABEL: @depthwise_conv2d_dynamic_input
 func.func @depthwise_conv2d_dynamic_input(%arg0: tensor<?x?x?x?xf32>, %arg1: tensor<3x6x3x5xf32>, %arg2: tensor<15xf32>) {
   // CHECK: -> tensor<?x?x?x15xf32>
-  %0 = tosa.depthwise_conv2d %arg0, %arg1, %arg2 {dilation = array<i64: 1, 1>, pad = array<i64: 0, 0, 0, 0>, stride = array<i64: 1, 1>} : (tensor<?x?x?x?xf32>, tensor<3x6x3x5xf32>, tensor<15xf32>) -> tensor<?x?x?x15xf32>
+  %0 = tosa.depthwise_conv2d %arg0, %arg1, %arg2 {acc_type = f32, dilation = array<i64: 1, 1>, pad = array<i64: 0, 0, 0, 0>, stride = array<i64: 1, 1>} : (tensor<?x?x?x?xf32>, tensor<3x6x3x5xf32>, tensor<15xf32>) -> tensor<?x?x?x15xf32>
   return
 }
 
@@ -854,7 +854,7 @@ func.func @depthwise_conv2d_dynamic_input(%arg0: tensor<?x?x?x?xf32>, %arg1: ten
 // CHECK-LABEL: @depthwise_conv2d_dynamic_weight
 func.func @depthwise_conv2d_dynamic_weight(%arg0: tensor<2x8x9x3xf32>, %arg1: tensor<?x?x?x?xf32>, %arg2: tensor<15xf32>) {
   // CHECK: -> tensor<2x?x?x15xf32>
-  %0 = tosa.depthwise_conv2d %arg0, %arg1, %arg2 {dilation = array<i64: 1, 1>, pad = array<i64: 0, 0, 0, 0>, stride = array<i64: 1, 1>} : (tensor<2x8x9x3xf32>, tensor<?x?x?x?xf32>, tensor<15xf32>) -> tensor<2x?x?x15xf32>
+  %0 = tosa.depthwise_conv2d %arg0, %arg1, %arg2 {acc_type = f32, dilation = array<i64: 1, 1>, pad = array<i64: 0, 0, 0, 0>, stride = array<i64: 1, 1>} : (tensor<2x8x9x3xf32>, tensor<?x?x?x?xf32>, tensor<15xf32>) -> tensor<2x?x?x15xf32>
   return
 }
 
@@ -863,7 +863,7 @@ func.func @depthwise_conv2d_dynamic_weight(%arg0: tensor<2x8x9x3xf32>, %arg1: te
 // CHECK-LABEL: @depthwise_conv2d_dynamic_bias
 func.func @depthwise_conv2d_dynamic_bias(%arg0: tensor<2x8x9x3xf32>, %arg1: tensor<3x6x3x5xf32>, %arg2: tensor<?xf32>) {
   // CHECK: -> tensor<2x6x4x15xf32>
-  %0 = tosa.depthwise_conv2d %arg0, %arg1, %arg2 {dilation = array<i64: 1, 1>, pad = array<i64: 0, 0, 0, 0>, stride = array<i64: 1, 1>} : (tensor<2x8x9x3xf32>, tensor<3x6x3x5xf32>, tensor<?xf32>) -> tensor<2x6x4x15xf32>
+  %0 = tosa.depthwise_conv2d %arg0, %arg1, %arg2 {acc_type = f32, dilation = array<i64: 1, 1>, pad = array<i64: 0, 0, 0, 0>, stride = array<i64: 1, 1>} : (tensor<2x8x9x3xf32>, tensor<3x6x3x5xf32>, tensor<?xf32>) -> tensor<2x6x4x15xf32>
   return
 }
 
@@ -872,7 +872,7 @@ func.func @depthwise_conv2d_dynamic_bias(%arg0: tensor<2x8x9x3xf32>, %arg1: tens
 // CHECK-LABEL: @depthwise_conv2d_padded
 func.func @depthwise_conv2d_padded(%arg0: tensor<2x8x9x3xf32>, %arg1: tensor<3x6x3x5xf32>, %arg2: tensor<15xf32>) {
   // CHECK: -> tensor<2x9x11x15xf32>
-  %0 = tosa.depthwise_conv2d %arg0, %arg1, %arg2 {dilation = array<i64: 1, 1>, pad = array<i64: 1, 2, 3, 4>, stride = array<i64: 1, 1>} : (tensor<2x8x9x3xf32>, tensor<3x6x3x5xf32>, tensor<15xf32>) -> tensor<2x9x11x15xf32>
+  %0 = tosa.depthwise_conv2d %arg0, %arg1, %arg2 {acc_type = f32, dilation = array<i64: 1, 1>, pad = array<i64: 1, 2, 3, 4>, stride = array<i64: 1, 1>} : (tensor<2x8x9x3xf32>, tensor<3x6x3x5xf32>, tensor<15xf32>) -> tensor<2x9x11x15xf32>
   return
 }
 
@@ -881,7 +881,7 @@ func.func @depthwise_conv2d_padded(%arg0: tensor<2x8x9x3xf32>, %arg1: tensor<3x6
 // CHECK-LABEL: @depthwise_conv2d_dilated
 func.func @depthwise_conv2d_dilated(%arg0: tensor<2x12x14x3xf32>, %arg1: tensor<3x6x3x5xf32>, %arg2: tensor<15xf32>) {
   // CHECK: -> tensor<2x6x4x15xf32>
-  %0 = tosa.depthwise_conv2d %arg0, %arg1, %arg2 {dilation = array<i64: 3, 2>, pad = array<i64: 0, 0, 0, 0>, stride = array<i64: 1, 1>} : (tensor<2x12x14x3xf32>, tensor<3x6x3x5xf32>, tensor<15xf32>) -> tensor<2x6x4x15xf32>
+  %0 = tosa.depthwise_conv2d %arg0, %arg1, %arg2 {acc_type = f32, dilation = array<i64: 3, 2>, pad = array<i64: 0, 0, 0, 0>, stride = array<i64: 1, 1>} : (tensor<2x12x14x3xf32>, tensor<3x6x3x5xf32>, tensor<15xf32>) -> tensor<2x6x4x15xf32>
   return
 }
 
@@ -890,7 +890,7 @@ func.func @depthwise_conv2d_dilated(%arg0: tensor<2x12x14x3xf32>, %arg1: tensor<
 // CHECK-LABEL: @depthwise_conv2d_strided
 func.func @depthwise_conv2d_strided(%arg0: tensor<1x13x14x1xf32>, %arg1: tensor<1x1x1x1xf32>, %arg2: tensor<1xf32>) {
   // CHECK: -> tensor<1x5x7x1xf32>
-  %0 = tosa.depthwise_conv2d %arg0, %arg1, %arg2 {dilation = array<i64: 1, 1>, pad = array<i64: 0, 0, 0, 0>, stride = array<i64: 3, 2>} : (tensor<1x13x14x1xf32>, tensor<1x1x1x1xf32>, tensor<1xf32>) -> tensor<1x5x7x1xf32>
+  %0 = tosa.depthwise_conv2d %arg0, %arg1, %arg2 {acc_type = f32, dilation = array<i64: 1, 1>, pad = array<i64: 0, 0, 0, 0>, stride = array<i64: 3, 2>} : (tensor<1x13x14x1xf32>, tensor<1x1x1x1xf32>, tensor<1xf32>) -> tensor<1x5x7x1xf32>
   return
 }
 
@@ -899,7 +899,7 @@ func.func @depthwise_conv2d_strided(%arg0: tensor<1x13x14x1xf32>, %arg1: tensor<
 // CHECK-LABEL: @transpose_conv2d_out_shape
 func.func @transpose_conv2d_out_shape(%arg0: tensor<2x?x?x3xf32>, %arg1: tensor<5x3x6x3xf32>, %arg2: tensor<5xf32>) {
   // CHECK: -> tensor<2x8x9x5xf32>
-  %0 = tosa.transpose_conv2d %arg0, %arg1, %arg2 {out_pad = array<i64: 0, 0, 0, 0>, out_shape = array<i64: -1, 8, 9, -1>, stride = array<i64: 1, 1>} : (tensor<2x?x?x3xf32>, tensor<5x3x6x3xf32>, tensor<5xf32>) -> tensor<2x8x9x5xf32>
+  %0 = tosa.transpose_conv2d %arg0, %arg1, %arg2 {acc_type = f32, out_pad = array<i64: 0, 0, 0, 0>, out_shape = array<i64: -1, 8, 9, -1>, stride = array<i64: 1, 1>} : (tensor<2x?x?x3xf32>, tensor<5x3x6x3xf32>, tensor<5xf32>) -> tensor<2x8x9x5xf32>
   return
 }
 
@@ -908,7 +908,7 @@ func.func @transpose_conv2d_out_shape(%arg0: tensor<2x?x?x3xf32>, %arg1: tensor<
 // CHECK-LABEL: @transpose_conv2d_static
 func.func @transpose_conv2d_static(%arg0: tensor<2x16x14x3xf32>, %arg1: tensor<5x3x6x3xf32>, %arg2: tensor<5xf32>) {
   // CHECK: -> tensor<2x18x19x5xf32>
-  %0 = tosa.transpose_conv2d %arg0, %arg1, %arg2 {out_pad = array<i64: 0, 0, 0, 0>, out_shape = array<i64: -1, -1, -1, -1>, stride = array<i64: 1, 1>} : (tensor<2x16x14x3xf32>, tensor<5x3x6x3xf32>, tensor<5xf32>) -> tensor<2x?x?x5xf32>
+  %0 = tosa.transpose_conv2d %arg0, %arg1, %arg2 {acc_type = f32, out_pad = array<i64: 0, 0, 0, 0>, out_shape = array<i64: -1, -1, -1, -1>, stride = array<i64: 1, 1>} : (tensor<2x16x14x3xf32>, tensor<5x3x6x3xf32>, tensor<5xf32>) -> tensor<2x?x?x5xf32>
   return
 }
 
@@ -917,7 +917,7 @@ func.func @transpose_conv2d_static(%arg0: tensor<2x16x14x3xf32>, %arg1: tensor<5
 // CHECK-LABEL: @transpose_conv2d_static_strided
 func.func @transpose_conv2d_static_strided(%arg0: tensor<2x16x14x3xf32>, %arg1: tensor<5x3x6x3xf32>, %arg2: tensor<5xf32>) {
   // CHECK: -> tensor<2x33x45x5xf32>
-  %0 = tosa.transpose_conv2d %arg0, %arg1, %arg2 {out_pad = array<i64: 0, 0, 0, 0>, out_shape = array<i64: -1, -1, -1, -1>, stride = array<i64: 2, 3>} : (tensor<2x16x14x3xf32>, tensor<5x3x6x3xf32>, tensor<5xf32>) -> tensor<2x?x?x5xf32>
+  %0 = tosa.transpose_conv2d %arg0, %arg1, %arg2 {acc_type = f32, out_pad = array<i64: 0, 0, 0, 0>, out_shape = array<i64: -1, -1, -1, -1>, stride = array<i64: 2, 3>} : (tensor<2x16x14x3xf32>, tensor<5x3x6x3xf32>, tensor<5xf32>) -> tensor<2x?x?x5xf32>
   return
 }
 
@@ -926,7 +926,7 @@ func.func @transpose_conv2d_static_strided(%arg0: tensor<2x16x14x3xf32>, %arg1:
 // CHECK-LABEL: @transpose_conv2d_dynamic_input
 func.func @transpose_conv2d_dynamic_input(%arg0: tensor<?x?x?x?xf32>, %arg1: tensor<5x3x6x3xf32>, %arg2: tensor<5xf32>) {
   // CHECK: -> tensor<?x?x?x5xf32>
-  %0 = tosa.transpose_conv2d %arg0, %arg1, %arg2 {out_pad = array<i64: 0, 0, 0, 0>, out_shape = array<i64: -1, -1, -1, -1>, stride = array<i64: 1, 1>} : (tensor<?x?x?x?xf32>, tensor<5x3x6x3xf32>, tensor<5xf32>) -> tensor<?x?x?x5xf32>
+  %0 = tosa.transpose_conv2d %arg0, %arg1, %arg2 {acc_type = f32, out_pad = array<i64: 0, 0, 0, 0>, out_shape = array<i64: -1, -1, -1, -1>, stride = array<i64: 1, 1>} : (tensor<?x?x?x?xf32>, tensor<5x3x6x3xf32>, tensor<5xf32>) -> tensor<?x?x?x5xf32>
   return
 }
 
@@ -935,7 +935,7 @@ func.func @transpose_conv2d_dynamic_input(%arg0: tensor<?x?x?x?xf32>, %arg1: ten
 // CHECK-LABEL: @transpose_conv2d_dynamic_weights
 func.func @transpose_conv2d_dynamic_weights(%arg0: tensor<2x6x4x3xf32>, %arg1: tensor<?x?x?x?xf32>, %arg2: tensor<5xf32>) {
   // CHECK: -> tensor<2x?x?x5xf32>
-  %0 = tosa.transpose_conv2d %arg0, %arg1, %arg2 {out_pad = array<i64: 0, 0, 0, 0>, out_shape = array<i64: -1, -1, -1, -1>, stride = array<i64: 1, 1>} : (tensor<2x6x4x3xf32>, tensor<?x?x?x?xf32>, tensor<5xf32>) -> tensor<2x?x?x5xf32>
+  %0 = tosa.transpose_conv2d %arg0, %arg1, %arg2 {acc_type = f32, out_pad = array<i64: 0, 0, 0, 0>, out_shape = array<i64: -1, -1, -1, -1>, stride = array<i64: 1, 1>} : (tensor<2x6x4x3xf32>, tensor<?x?x?x?xf32>, tensor<5xf32>) -> tensor<2x?x?x5xf32>
   return
 }
 
@@ -944,7 +944,7 @@ func.func @transpose_conv2d_dynamic_weights(%arg0: tensor<2x6x4x3xf32>, %arg1: t
 // CHECK-LABEL: @transpose_conv2d_dynamic_bias
 func.func @transpose_conv2d_dynamic_bias(%arg0: tensor<2x6x4x3xf32>, %arg1: tensor<5x3x6x3xf32>, %arg2: tensor<?xf32>) {
   // CHECK: -> tensor<2x8x9x5xf32>
-  %0 = tosa.transpose_conv2d %arg0, %arg1, %arg2 {out_pad = array<i64: 0, 0, 0, 0>, out_shape = array<i64: -1, -1, -1, -1>, stride = array<i64: 1, 1>} : (tensor<2x6x4x3xf32>, tensor<5x3x6x3xf32>, tensor<?xf32>) -> tensor<2x8x9x5xf32>
+  %0 = tosa.transpose_conv2d %arg0, %arg1, %arg2 {acc_type = f32, out_pad = array<i64: 0, 0, 0, 0>, out_shape = array<i64: -1, -1, -1, -1>, stride = array<i64: 1, 1>} : (tensor<2x6x4x3xf32>, tensor<5x3x6x3xf32>, tensor<?xf32>) -> tensor<2x8x9x5xf32>
   return
 }
 
@@ -953,14 +953,14 @@ func.func @transpose_conv2d_dynamic_bias(%arg0: tensor<2x6x4x3xf32>, %arg1: tens
 // CHECK-LABEL: @transpose_conv2d_padded
 func.func @transpose_conv2d_padded(%arg0: tensor<2x9x11x3xf32>, %arg1: tensor<5x3x6x3xf32>, %arg2: tensor<5xf32>) {
   // CHECK: -> tensor<2x10x13x5xf32>
-  %0 = tosa.transpose_conv2d %arg0, %arg1, %arg2 {out_pad = array<i64: 1, 0, 3, 0>, out_shape = array<i64: -1, -1, -1, -1>, stride = array<i64: 1, 1>} : (tensor<2x9x11x3xf32>, tensor<5x3x6x3xf32>, tensor<5xf32>) -> tensor<2x10x13x5xf32>
+  %0 = tosa.transpose_conv2d %arg0, %arg1, %arg2 {acc_type = f32, out_pad = array<i64: 1, 0, 3, 0>, out_shape = array<i64: -1, -1, -1, -1>, stride = array<i64: 1, 1>} : (tensor<2x9x11x3xf32>, tensor<5x3x6x3xf32>, tensor<5xf32>) -> tensor<2x10x13x5xf32>
   return
 }
 
 // CHECK-LABEL: @transpose_conv2d_strided
 func.func @transpose_conv2d_strided(%arg0: tensor<1x5x7x1xf32>, %arg1: tensor<1x1x1x1xf32>, %arg2: tensor<1xf32>) {
   // CHECK: -> tensor<1x13x13x1xf32>
-  %0 = tosa.transpose_conv2d %arg0, %arg1, %arg2 {out_pad = array<i64: 0, 0, 0, 0>, out_shape = array<i64: -1, -1, -1, -1>, stride = array<i64: 3, 2>} : (tensor<1x5x7x1xf32>, tensor<1x1x1x1xf32>, tensor<1xf32>) -> tensor<1x13x13x1xf32>
+  %0 = tosa.transpose_conv2d %arg0, %arg1, %arg2 {acc_type = f32, out_pad = array<i64: 0, 0, 0, 0>, out_shape = array<i64: -1, -1, -1, -1>, stride = array<i64: 3, 2>} : (tensor<1x5x7x1xf32>, tensor<1x1x1x1xf32>, tensor<1xf32>) -> tensor<1x13x13x1xf32>
   return
 }
 
@@ -1368,7 +1368,7 @@ func.func @test_non_tosa_consumer_still_propagates(%arg0: tensor<1x1x8xf32>, %ar
 func.func @test_tosa_use_def_chain(%arg0: tensor<1x32x32x3xf32>, %arg1: tensor<16x3x3x3xf32>, %arg2: tensor<16xf32>) -> tensor<?x16x16x16xf32> {
   // CHECK: [[CONV:%.+]] = tosa.conv2d %arg0, %arg1, %arg2
   // CHECK: (tensor<1x32x32x3xf32>, tensor<16x3x3x3xf32>, tensor<16xf32>) -> tensor<1x32x32x16xf32>
-  %0 = tosa.conv2d %arg0, %arg1, %arg2 {dilation = array<i64: 1, 1>, pad = array<i64: 1, 1, 1, 1>, stride = array<i64: 1, 1>} : (tensor<1x32x32x3xf32>, tensor<16x3x3x3xf32>, tensor<16xf32>) -> tensor<?x32x32x16xf32>
+  %0 = tosa.conv2d %arg0, %arg1, %arg2 {acc_type = f32, dilation = array<i64: 1, 1>, pad = array<i64: 1, 1, 1, 1>, stride = array<i64: 1, 1>} : (tensor<1x32x32x3xf32>, tensor<16x3x3x3xf32>, tensor<16xf32>) -> tensor<?x32x32x16xf32>
   // CHECK: tosa.max_pool2d [[CONV]]
   // CHECK: (tensor<1x32x32x16xf32>) -> tensor<1x16x16x16xf32>
   %1 = tosa.max_pool2d %0 {kernel = array<i64: 2, 2>, pad = array<i64: 0, 0, 0, 0>, stride = array<i64: 2, 2>} : (tensor<?x32x32x16xf32>) -> tensor<?x16x16x16xf32>
diff --git a/mlir/test/Integration/GPU/CUDA/assert.mlir b/mlir/test/Integration/GPU/CUDA/assert.mlir
new file mode 100644
index 0000000..3d6527f
--- /dev/null
+++ b/mlir/test/Integration/GPU/CUDA/assert.mlir
@@ -0,0 +1,38 @@
+// RUN: mlir-opt %s -gpu-lower-to-nvvm-pipeline="cubin-format=%gpu_compilation_format" \
+// RUN: | mlir-cpu-runner \
+// RUN:   --shared-libs=%mlir_cuda_runtime \
+// RUN:   --shared-libs=%mlir_runner_utils \
+// RUN:   --entry-point-result=void 2>&1 \
+// RUN: | FileCheck %s
+
+// CHECK-DAG: thread 0: print after passing assertion
+// CHECK-DAG: thread 1: print after passing assertion
+// CHECK-DAG: callee_file.cc:7: callee_func_name: block: [0,0,0], thread: [0,0,0] Assertion `failing assertion` failed.
+// CHECK-DAG: callee_file.cc:7: callee_func_name: block: [0,0,0], thread: [1,0,0] Assertion `failing assertion` failed.
+// CHECK-NOT: print after failing assertion
+
+module attributes {gpu.container_module} {
+gpu.module @kernels {
+gpu.func @test_assert(%c0: i1, %c1: i1) kernel {
+  %0 = gpu.thread_id x
+  cf.assert %c1, "passing assertion"
+  gpu.printf "thread %lld: print after passing assertion\n", %0 : index
+  // Test callsite(callsite(name)) location.
+  cf.assert %c0, "failing assertion" loc(callsite(callsite("callee_func_name"("callee_file.cc":7:9) at "caller_file.cc":10:8) at "caller2_file.cc":11:12))
+  gpu.printf "thread %lld: print after failing assertion\n", %0 : index
+  gpu.return
+}
+}
+
+func.func @main() {
+  %c2 = arith.constant 2 : index
+  %c1 = arith.constant 1 : index
+  %c0_i1 = arith.constant 0 : i1
+  %c1_i1 = arith.constant 1 : i1
+  gpu.launch_func @kernels::@test_assert
+      blocks in (%c1, %c1, %c1)
+      threads in (%c2, %c1, %c1)
+      args(%c0_i1 : i1, %c1_i1 : i1)
+  return
+}
+}
diff --git a/mlir/test/Integration/GPU/CUDA/printf.mlir b/mlir/test/Integration/GPU/CUDA/printf.mlir
index 99ea120..15b0bf0 100644
--- a/mlir/test/Integration/GPU/CUDA/printf.mlir
+++ b/mlir/test/Integration/GPU/CUDA/printf.mlir
@@ -14,7 +14,7 @@ module attributes {gpu.container_module} {
             %0 = gpu.thread_id x
             %csti8 = arith.constant 2 : i8
             %cstf32 = arith.constant 3.0 : f32
-            gpu.printf "Hello from %lld, %d, %f\n" %0, %csti8, %cstf32  : index, i8, f32
+            gpu.printf "Hello from %lld, %d, %f\n", %0, %csti8, %cstf32  : index, i8, f32
             gpu.return
         }
     }
diff --git a/mlir/test/Integration/GPU/CUDA/sm90/cga_cluster.mlir b/mlir/test/Integration/GPU/CUDA/sm90/cga_cluster.mlir
index c70c940..a22a34b 100644
--- a/mlir/test/Integration/GPU/CUDA/sm90/cga_cluster.mlir
+++ b/mlir/test/Integration/GPU/CUDA/sm90/cga_cluster.mlir
@@ -43,7 +43,7 @@ module attributes {gpu.container_module} {
       %cnd2 =  arith.cmpi eq, %bidY, %c3 : index
       scf.if %cnd1 {
         scf.if %cnd2 {
-          gpu.printf "clusterIdx: (%d, %d, %d) in Cluster Dimension: (%d, %d, %d) blockIdx: (%d, %d, %d) \n" 
+          gpu.printf "clusterIdx: (%d, %d, %d) in Cluster Dimension: (%d, %d, %d) blockIdx: (%d, %d, %d) \n",
             %cidX_i32,
             %cidY_i32,
             %cidZ_i32,
diff --git a/mlir/test/Integration/GPU/CUDA/sm90/tma_load_128x64_swizzle128b.mlir b/mlir/test/Integration/GPU/CUDA/sm90/tma_load_128x64_swizzle128b.mlir
index b50772f..95bde40 100644
--- a/mlir/test/Integration/GPU/CUDA/sm90/tma_load_128x64_swizzle128b.mlir
+++ b/mlir/test/Integration/GPU/CUDA/sm90/tma_load_128x64_swizzle128b.mlir
@@ -85,7 +85,7 @@ module @mymod {
       
       // Step 7. First thread does TMA load
       scf.if %10 {
-        gpu.printf "[GPU] TMA SIZE %d\0A" %c8192 : index
+        gpu.printf "[GPU] TMA SIZE %d\0A", %c8192 : index
         nvgpu.tma.async.load %3[%c0, %c0], %9[%c0] to %7 : !lhsTensorMap, !barrierType -> !shmemlhs
         nvgpu.mbarrier.arrive.expect_tx %9[%c0], %c8192 : !barrierType
       } else {
@@ -98,16 +98,16 @@ module @mymod {
 
       // Step 9. Print loaded data in 128b swizzled
       scf.if %10 {        
-        gpu.printf "===--- Matrix A ---=== %d \0A" %c-1_i32 : i32
+        gpu.printf "===--- Matrix A ---=== %d \0A", %c-1_i32 : i32
         scf.for %arg12 = %c0 to %c128 step %c1 {
           scf.for %arg13 = %c0 to %c64 step %c1 {
             %15 = memref.load %7[%arg12, %arg13] : !shmemlhs
             %16 = arith.extf %15 : f16 to f32
-            gpu.printf "%.0f,   " %16 : f32
+            gpu.printf "%.0f,   ", %16 : f32
           }
-          gpu.printf "%d\0A" %c-1_i32 : i32
+          gpu.printf "%d\0A", %c-1_i32 : i32
         }
-        gpu.printf "===----------------=== %d \0A" %c-1_i32 : i32
+        gpu.printf "===----------------=== %d \0A", %c-1_i32 : i32
       }
       gpu.terminator
     }
diff --git a/mlir/test/Integration/GPU/CUDA/sm90/tma_load_64x64_swizzle128b.mlir b/mlir/test/Integration/GPU/CUDA/sm90/tma_load_64x64_swizzle128b.mlir
index 65e5fc0..e76fa039 100644
--- a/mlir/test/Integration/GPU/CUDA/sm90/tma_load_64x64_swizzle128b.mlir
+++ b/mlir/test/Integration/GPU/CUDA/sm90/tma_load_64x64_swizzle128b.mlir
@@ -109,7 +109,7 @@ module @mymod {
       
       // Step 6. First thread does TMA load
       scf.if %10 {
-        gpu.printf "[GPU] TMA SIZE %d\0A" %c32768 : index
+        gpu.printf "[GPU] TMA SIZE %d\0A", %c32768 : index
         nvgpu.tma.async.load %d_lhsTensorMap[%c0, %c0], %9[%c0] to %lhsShmem : !lhsTensorMap, !barrierType -> !shmemlhs
         nvgpu.tma.async.load %d_rhsTensorMap[%c0, %c0], %9[%c0] to %rhsShmem1 : !rhsTensorMap, !barrierType -> memref<64x64xf16, strided<[128, 1]>, 3>
         nvgpu.tma.async.load %d_rhsTensorMap[%c64, %c0], %9[%c0] to %rhsShmem2 : !rhsTensorMap, !barrierType -> memref<64x64xf16, strided<[128, 1], offset: 4096>, 3>
@@ -124,16 +124,16 @@ module @mymod {
 
       // Step 8. Print loaded data in 128b swizzled
       scf.if %10 {        
-        gpu.printf "===--- Matrix B ---=== %d \n" %c-1_i32 : i32
+        gpu.printf "===--- Matrix B ---=== %d \n", %c-1_i32 : i32
         scf.for %ii = %c0 to %c64 step %c1 {
           scf.for %j = %c0 to %c128 step %c1 {
             %lhs0 = memref.load %rhsShmem[%ii, %j] : !shmemrhs
             %lhs032 = arith.extf %lhs0: f16 to f32
-            gpu.printf "%.0f,   " %lhs032 : f32
+            gpu.printf "%.0f,   ", %lhs032 : f32
           }
-          gpu.printf "%d\n" %c-1_i32 : i32
+          gpu.printf "%d\n", %c-1_i32 : i32
         }
-        gpu.printf "===----------------=== %d \n" %c-1_i32 : i32
+        gpu.printf "===----------------=== %d \n", %c-1_i32 : i32
       }
       gpu.barrier
       gpu.terminator
diff --git a/mlir/test/Integration/GPU/CUDA/sm90/tma_load_64x8_8x128_noswizzle.mlir b/mlir/test/Integration/GPU/CUDA/sm90/tma_load_64x8_8x128_noswizzle.mlir
index 391fda8..acca981 100644
--- a/mlir/test/Integration/GPU/CUDA/sm90/tma_load_64x8_8x128_noswizzle.mlir
+++ b/mlir/test/Integration/GPU/CUDA/sm90/tma_load_64x8_8x128_noswizzle.mlir
@@ -80,8 +80,8 @@ module @mymod {
         nvgpu.mbarrier.arrive.expect_tx %9[%c0], %c6144 : <memorySpace = #gpu.address_space<workgroup>>
         %11 = memref.load %7[%c0, %c0] : memref<64x8xf32, 3>
         %12 = memref.load %8[%c0, %c0] : memref<8x128xf32, 3>
-        gpu.printf "[GPU] TMA BEFORE lhs[45][7] %f\0A" %11 : f32
-        gpu.printf "[GPU] TMA BEFORE rhs[7][0] %f\0A" %12 : f32
+        gpu.printf "[GPU] TMA BEFORE lhs[45][7] %f\0A", %11 : f32
+        gpu.printf "[GPU] TMA BEFORE rhs[7][0] %f\0A", %12 : f32
         nvgpu.tma.async.load %3[%c0, %c0], %9[%c0] to %7 : <tensor = memref<64x8xf32, 3>, swizzle = none, l2promo = none, oob = zero, interleave = none>, <memorySpace = #gpu.address_space<workgroup>> -> memref<64x8xf32, 3>
         nvgpu.tma.async.load %4[%c0, %c0], %9[%c0] to %8 : <tensor = memref<8x128xf32, 3>, swizzle = none, l2promo = none, oob = zero, interleave = none>, <memorySpace = #gpu.address_space<workgroup>> -> memref<8x128xf32, 3>
       } else {
@@ -92,8 +92,8 @@ module @mymod {
       scf.if %10 {
         %11 = memref.load %7[%c45, %c7] : memref<64x8xf32, 3>
         %12 = memref.load %8[%c7, %c0] : memref<8x128xf32, 3>
-        gpu.printf "[GPU] TMA LOADED lhs[45][7] %f\0A" %11 : f32
-        gpu.printf "[GPU] TMA LOADED rhs[7][0] %f\0A" %12 : f32
+        gpu.printf "[GPU] TMA LOADED lhs[45][7] %f\0A", %11 : f32
+        gpu.printf "[GPU] TMA LOADED rhs[7][0] %f\0A", %12 : f32
       }
       gpu.terminator
     }
diff --git a/mlir/test/Integration/GPU/CUDA/sm90/transform-dialect/tma_load_64x8_8x128_noswizzle-transform.mlir b/mlir/test/Integration/GPU/CUDA/sm90/transform-dialect/tma_load_64x8_8x128_noswizzle-transform.mlir
index f83f65b..fe6c645 100644
--- a/mlir/test/Integration/GPU/CUDA/sm90/transform-dialect/tma_load_64x8_8x128_noswizzle-transform.mlir
+++ b/mlir/test/Integration/GPU/CUDA/sm90/transform-dialect/tma_load_64x8_8x128_noswizzle-transform.mlir
@@ -96,8 +96,8 @@ func.func @main() {
     scf.if %10 {
       %11 = memref.load %out[%c45, %c7] : memref<64x8xf32, 3>
       %12 = memref.load %out_1[%c7, %c0] : memref<8x128xf32, 3>
-      gpu.printf "[GPU] TMA LOADED lhs[45][7] %f\0A" %11 : f32
-      gpu.printf "[GPU] TMA LOADED rhs[7][0] %f\0A" %12 : f32
+      gpu.printf "[GPU] TMA LOADED lhs[45][7] %f\0A", %11 : f32
+      gpu.printf "[GPU] TMA LOADED rhs[7][0] %f\0A", %12 : f32
     }
     gpu.terminator
   }
diff --git a/mlir/test/Integration/GPU/ROCM/printf.mlir b/mlir/test/Integration/GPU/ROCM/printf.mlir
index d5e6e37..4a0e4d3 100644
--- a/mlir/test/Integration/GPU/ROCM/printf.mlir
+++ b/mlir/test/Integration/GPU/ROCM/printf.mlir
@@ -13,7 +13,7 @@ module attributes {gpu.container_module} {
     gpu.module @kernels {
         gpu.func @hello() kernel {
             %0 = gpu.thread_id x
-            gpu.printf "Hello from %d\n" %0 : index
+            gpu.printf "Hello from %d\n", %0 : index
             gpu.return
         }
     }
diff --git a/mlir/test/Target/LLVMIR/Import/import-failure.ll b/mlir/test/Target/LLVMIR/Import/import-failure.ll
index 6bde174..b616cb8 100644
--- a/mlir/test/Target/LLVMIR/Import/import-failure.ll
+++ b/mlir/test/Target/LLVMIR/Import/import-failure.ll
@@ -13,15 +13,6 @@ bb2:
 ; // -----
 
 ; CHECK:      <unknown>
-; CHECK-SAME: error: unhandled value: ptr asm "bswap $0", "=r,r"
-define i32 @unhandled_value(i32 %arg1) {
-  %1 = call i32 asm "bswap $0", "=r,r"(i32 %arg1)
-  ret i32 %1
-}
-
-; // -----
-
-; CHECK:      <unknown>
 ; CHECK-SAME: unhandled constant: ptr blockaddress(@unhandled_constant, %bb1) since blockaddress(...) is unsupported
 ; CHECK:      <unknown>
 ; CHECK-SAME: error: unhandled instruction: ret ptr blockaddress(@unhandled_constant, %bb1)
diff --git a/mlir/test/Target/LLVMIR/Import/instructions.ll b/mlir/test/Target/LLVMIR/Import/instructions.ll
index fff48bb..7377e25 100644
--- a/mlir/test/Target/LLVMIR/Import/instructions.ll
+++ b/mlir/test/Target/LLVMIR/Import/instructions.ll
@@ -535,6 +535,17 @@ define void @indirect_vararg_call(ptr addrspace(42) %fn) {
 
 ; // -----
 
+; CHECK-LABEL: @inlineasm
+; CHECK-SAME:  %[[ARG1:[a-zA-Z0-9]+]]
+define i32 @inlineasm(i32 %arg1) {
+  ; CHECK:  %[[RES:.+]] = llvm.inline_asm has_side_effects "bswap $0", "=r,r" %[[ARG1]] : (i32) -> i32
+  %1 = call i32 asm "bswap $0", "=r,r"(i32 %arg1)
+  ; CHECK: return %[[RES]]
+  ret i32 %1
+}
+
+; // -----
+
 ; CHECK-LABEL: @gep_static_idx
 ; CHECK-SAME:  %[[PTR:[a-zA-Z0-9]+]]
 define void @gep_static_idx(ptr %ptr) {
diff --git a/mlir/test/Target/LLVMIR/Import/metadata-alias-scopes.ll b/mlir/test/Target/LLVMIR/Import/metadata-alias-scopes.ll
index f5128ff..bf4c857 100644
--- a/mlir/test/Target/LLVMIR/Import/metadata-alias-scopes.ll
+++ b/mlir/test/Target/LLVMIR/Import/metadata-alias-scopes.ll
@@ -92,3 +92,38 @@ declare void @foo(ptr %arg1)
 !0 = distinct !{!0, !"The domain"}
 !1 = !{!1, !0}
 !2 = !{!1}
+
+; // -----
+
+; CHECK: #[[DOMAIN:.*]] = #llvm.alias_scope_domain<id = "domain1">
+; CHECK: #[[$SCOPE0:.*]] = #llvm.alias_scope<id = "scopeid1", domain = #[[DOMAIN]], description = "The first scope">
+; CHECK: #[[$SCOPE1:.*]] = #llvm.alias_scope<id = "scopeid2", domain = #[[DOMAIN]]>
+; CHECK: #[[$SCOPE2:.*]] = #llvm.alias_scope<id = "scopeid3", domain = #[[DOMAIN]]>
+
+; CHECK-LABEL: llvm.func @alias_scope
+define void @alias_scope(ptr %arg1) {
+  ; CHECK: llvm.load
+  ; CHECK-SAME:  alias_scopes = [#[[$SCOPE0]]]
+  ; CHECK-SAME:  noalias_scopes = [#[[$SCOPE1]], #[[$SCOPE2]]]
+  %1 = load i32, ptr %arg1, !alias.scope !4, !noalias !7
+  ; CHECK: llvm.load
+  ; CHECK-SAME:  alias_scopes = [#[[$SCOPE1]]]
+  ; CHECK-SAME:  noalias_scopes = [#[[$SCOPE0]], #[[$SCOPE2]]]
+  %2 = load i32, ptr %arg1, !alias.scope !5, !noalias !8
+  ; CHECK: llvm.load
+  ; CHECK-SAME:  alias_scopes = [#[[$SCOPE2]]]
+  ; CHECK-SAME:  noalias_scopes = [#[[$SCOPE0]], #[[$SCOPE1]]]
+  %3 = load i32, ptr %arg1, !alias.scope !6, !noalias !9
+  ret void
+}
+
+!0 = !{!"domain1"}
+!1 = !{!"scopeid1", !0, !"The first scope"}
+!2 = !{!"scopeid2", !0}
+!3 = !{!"scopeid3", !0}
+!4 = !{!1}
+!5 = !{!2}
+!6 = !{!3}
+!7 = !{!2, !3}
+!8 = !{!1, !3}
+!9 = !{!1, !2}
diff --git a/mlir/test/Target/LLVMIR/attribute-alias-scopes.mlir b/mlir/test/Target/LLVMIR/attribute-alias-scopes.mlir
index fa33955..fb71a51 100644
--- a/mlir/test/Target/LLVMIR/attribute-alias-scopes.mlir
+++ b/mlir/test/Target/LLVMIR/attribute-alias-scopes.mlir
@@ -104,3 +104,54 @@ llvm.func @self_reference() {
 // CHECK-DAG: ![[SCOPES]] = !{![[SCOPE]]}
 // CHECK-DAG: = !DISubroutineType(types: ![[TYPES:[0-9]+]])
 // CHECK-DAG: ![[TYPES]] = !{null}
+
+// -----
+
+llvm.func @foo(%arg0: !llvm.ptr)
+
+#alias_scope_domain = #llvm.alias_scope_domain<id = "domain1", description = "The domain">
+#alias_scope1 = #llvm.alias_scope<id = "scope1", domain = #alias_scope_domain, description = "The first scope">
+#alias_scope2 = #llvm.alias_scope<id = "scope2", domain = #alias_scope_domain>
+#alias_scope3 = #llvm.alias_scope<id = "scope3", domain = #alias_scope_domain>
+
+// CHECK-LABEL: @alias_scopes
+llvm.func @alias_scopes(%arg1 : !llvm.ptr) {
+  %0 = llvm.mlir.constant(0 : i32) : i32
+  // CHECK:  call void @llvm.experimental.noalias.scope.decl(metadata ![[SCOPES1:[0-9]+]])
+  llvm.intr.experimental.noalias.scope.decl #alias_scope1
+  // CHECK:  store {{.*}}, !alias.scope ![[SCOPES1]], !noalias ![[SCOPES23:[0-9]+]]
+  llvm.store %0, %arg1 {alias_scopes = [#alias_scope1], noalias_scopes = [#alias_scope2, #alias_scope3]} : i32, !llvm.ptr
+  // CHECK:  load {{.*}}, !alias.scope ![[SCOPES2:[0-9]+]], !noalias ![[SCOPES13:[0-9]+]]
+  %1 = llvm.load %arg1 {alias_scopes = [#alias_scope2], noalias_scopes = [#alias_scope1, #alias_scope3]} : !llvm.ptr -> i32
+  // CHECK:  atomicrmw {{.*}}, !alias.scope ![[SCOPES3:[0-9]+]], !noalias ![[SCOPES12:[0-9]+]]
+  %2 = llvm.atomicrmw add %arg1, %0 monotonic {alias_scopes = [#alias_scope3], noalias_scopes = [#alias_scope1, #alias_scope2]} : !llvm.ptr, i32
+  // CHECK:  cmpxchg {{.*}}, !alias.scope ![[SCOPES3]]
+  %3 = llvm.cmpxchg %arg1, %1, %2 acq_rel monotonic {alias_scopes = [#alias_scope3]} : !llvm.ptr, i32
+  %5 = llvm.mlir.constant(42 : i8) : i8
+  // CHECK:  llvm.memcpy{{.*}}, !alias.scope ![[SCOPES3]]
+  "llvm.intr.memcpy"(%arg1, %arg1, %0) <{isVolatile = false}> {alias_scopes = [#alias_scope3]} : (!llvm.ptr, !llvm.ptr, i32) -> ()
+  // CHECK:  llvm.memset{{.*}}, !noalias ![[SCOPES3]]
+  "llvm.intr.memset"(%arg1, %5, %0) <{isVolatile = false}> {noalias_scopes = [#alias_scope3]} : (!llvm.ptr, i8, i32) -> ()
+  // CHECK: call void @foo({{.*}} !alias.scope ![[SCOPES3]]
+  llvm.call @foo(%arg1) {alias_scopes = [#alias_scope3]} : (!llvm.ptr) -> ()
+  // CHECK: call void @foo({{.*}} !noalias ![[SCOPES3]]
+  llvm.call @foo(%arg1) {noalias_scopes = [#alias_scope3]} : (!llvm.ptr) -> ()
+  llvm.return
+}
+
+// Check the intrinsic declarations.
+// CHECK-DAG: declare void @llvm.experimental.noalias.scope.decl(metadata)
+// CHECK-DAG: declare void @llvm.memcpy.p0.p0.i32(ptr noalias nocapture writeonly, ptr noalias nocapture readonly, i32, i1 immarg)
+// CHECK-DAG: declare void @llvm.memset.p0.i32(ptr nocapture writeonly, i8, i32, i1 immarg)
+
+// Check the translated metadata.
+// CHECK-DAG: ![[DOMAIN:[0-9]+]] = !{!"domain1", !"The domain"}
+// CHECK-DAG: ![[SCOPE1:[0-9]+]] = !{!"scope1", ![[DOMAIN]], !"The first scope"}
+// CHECK-DAG: ![[SCOPE2:[0-9]+]] = !{!"scope2", ![[DOMAIN]]}
+// CHECK-DAG: ![[SCOPE3:[0-9]+]] = !{!"scope3", ![[DOMAIN]]}
+// CHECK-DAG: ![[SCOPES1]] = !{![[SCOPE1]]}
+// CHECK-DAG: ![[SCOPES2]] = !{![[SCOPE2]]}
+// CHECK-DAG: ![[SCOPES3]] = !{![[SCOPE3]]}
+// CHECK-DAG: ![[SCOPES12]] = !{![[SCOPE1]], ![[SCOPE2]]}
+// CHECK-DAG: ![[SCOPES13]] = !{![[SCOPE1]], ![[SCOPE3]]}
+// CHECK-DAG: ![[SCOPES23]] = !{![[SCOPE2]], ![[SCOPE3]]}
diff --git a/mlir/test/Target/LLVMIR/nvvmir.mlir b/mlir/test/Target/LLVMIR/nvvmir.mlir
index b69d7749..2d7710e 100644
--- a/mlir/test/Target/LLVMIR/nvvmir.mlir
+++ b/mlir/test/Target/LLVMIR/nvvmir.mlir
@@ -556,9 +556,7 @@ llvm.func @kernel_func() attributes {nvvm.kernel} {
   llvm.return
 }
 
-// CHECK:     !nvvm.annotations =
-// CHECK-NOT: {ptr @nvvm_special_regs, !"kernel", i32 1}
-// CHECK:     {ptr @kernel_func, !"kernel", i32 1}
+// CHECK: ptx_kernel void @kernel_func
 
 // -----
 
@@ -566,9 +564,8 @@ llvm.func @kernel_func() attributes {nvvm.kernel, nvvm.maxntid = array<i32: 1, 2
   llvm.return
 }
 
+// CHECK: define ptx_kernel void @kernel_func
 // CHECK:     !nvvm.annotations =
-// CHECK-NOT: {ptr @nvvm_special_regs, !"kernel", i32 1}
-// CHECK:     {ptr @kernel_func, !"kernel", i32 1}
 // CHECK:     {ptr @kernel_func, !"maxntidx", i32 1}
 // CHECK:     {ptr @kernel_func, !"maxntidy", i32 23}
 // CHECK:     {ptr @kernel_func, !"maxntidz", i32 32}
@@ -578,9 +575,8 @@ llvm.func @kernel_func() attributes {nvvm.kernel, nvvm.reqntid = array<i32: 1, 2
   llvm.return
 }
 
+// CHECK: define ptx_kernel void @kernel_func
 // CHECK:     !nvvm.annotations =
-// CHECK-NOT: {ptr @nvvm_special_regs, !"kernel", i32 1}
-// CHECK:     {ptr @kernel_func, !"kernel", i32 1}
 // CHECK:     {ptr @kernel_func, !"reqntidx", i32 1}
 // CHECK:     {ptr @kernel_func, !"reqntidy", i32 23}
 // CHECK:     {ptr @kernel_func, !"reqntidz", i32 32}
@@ -590,31 +586,28 @@ llvm.func @kernel_func() attributes {nvvm.kernel, nvvm.cluster_dim = array<i32:
   llvm.return
 }
 
+// CHECK: define ptx_kernel void @kernel_func
 // CHECK:     !nvvm.annotations =
-// CHECK-NOT: {ptr @nvvm_special_regs, !"kernel", i32 1}
 // CHECK:     {ptr @kernel_func, !"cluster_dim_x", i32 3}
 // CHECK:     {ptr @kernel_func, !"cluster_dim_y", i32 5}
 // CHECK:     {ptr @kernel_func, !"cluster_dim_z", i32 7}
-// CHECK:     {ptr @kernel_func, !"kernel", i32 1}
 // -----
 
 llvm.func @kernel_func() attributes {nvvm.kernel, nvvm.cluster_max_blocks = 8} {
   llvm.return
 }
 
+// CHECK: define ptx_kernel void @kernel_func
 // CHECK:     !nvvm.annotations =
-// CHECK-NOT: {ptr @nvvm_special_regs, !"kernel", i32 1}
 // CHECK:     {ptr @kernel_func, !"cluster_max_blocks", i32 8}
-// CHECK:     {ptr @kernel_func, !"kernel", i32 1}
 // -----
 
 llvm.func @kernel_func() attributes {nvvm.kernel, nvvm.minctasm = 16} {
   llvm.return
 }
 
+// CHECK: define ptx_kernel void @kernel_func
 // CHECK:     !nvvm.annotations =
-// CHECK-NOT: {ptr @nvvm_special_regs, !"kernel", i32 1}
-// CHECK:     {ptr @kernel_func, !"kernel", i32 1}
 // CHECK:     {ptr @kernel_func, !"minctasm", i32 16}
 // -----
 
@@ -622,9 +615,8 @@ llvm.func @kernel_func() attributes {nvvm.kernel, nvvm.maxnreg = 16} {
   llvm.return
 }
 
+// CHECK: define ptx_kernel void @kernel_func
 // CHECK:     !nvvm.annotations =
-// CHECK-NOT: {ptr @nvvm_special_regs, !"kernel", i32 1}
-// CHECK:     {ptr @kernel_func, !"kernel", i32 1}
 // CHECK:     {ptr @kernel_func, !"maxnreg", i32 16}
 // -----
 
@@ -633,9 +625,8 @@ llvm.func @kernel_func() attributes {nvvm.kernel, nvvm.maxntid = array<i32: 1, 2
   llvm.return
 }
 
+// CHECK: define ptx_kernel void @kernel_func
 // CHECK:     !nvvm.annotations =
-// CHECK-NOT: {ptr @nvvm_special_regs, !"kernel", i32 1}
-// CHECK:     {ptr @kernel_func, !"kernel", i32 1}
 // CHECK:     {ptr @kernel_func, !"maxnreg", i32 32}
 // CHECK:     {ptr @kernel_func, !"maxntidx", i32 1}
 // CHECK:     {ptr @kernel_func, !"maxntidy", i32 23}
@@ -643,19 +634,19 @@ llvm.func @kernel_func() attributes {nvvm.kernel, nvvm.maxntid = array<i32: 1, 2
 // CHECK:     {ptr @kernel_func, !"minctasm", i32 16}
 
 // -----
+// CHECK: define ptx_kernel void @kernel_func
 // CHECK: !nvvm.annotations =
 // CHECK: !1 = !{ptr @kernel_func, !"grid_constant", !2}
 // CHECK: !2 = !{i32 1}
-// CHECK: !3 = !{ptr @kernel_func, !"kernel", i32 1}
 llvm.func @kernel_func(%arg0: !llvm.ptr {llvm.byval = i32, nvvm.grid_constant}) attributes {nvvm.kernel} {
   llvm.return
 }
 
 // -----
+// CHECK: define ptx_kernel void @kernel_func
 // CHECK: !nvvm.annotations =
 // CHECK: !1 = !{ptr @kernel_func, !"grid_constant", !2}
 // CHECK: !2 = !{i32 1, i32 3}
-// CHECK: !3 = !{ptr @kernel_func, !"kernel", i32 1}
 llvm.func @kernel_func(%arg0: !llvm.ptr {llvm.byval = i32, nvvm.grid_constant}, %arg1: f32, %arg2: !llvm.ptr {llvm.byval = f32, nvvm.grid_constant}) attributes {nvvm.kernel} {
   llvm.return
 }
diff --git a/mlir/test/Target/LLVMIR/omptarget-threadprivate-device-lowering.mlir b/mlir/test/Target/LLVMIR/omptarget-threadprivate-device-lowering.mlir
new file mode 100644
index 0000000..279ecb3
--- /dev/null
+++ b/mlir/test/Target/LLVMIR/omptarget-threadprivate-device-lowering.mlir
@@ -0,0 +1,30 @@
+// RUN: mlir-translate -mlir-to-llvmir %s | FileCheck %s
+
+// Not intended to be a functional example, the aim of this test is to verify
+// omp.threadprivate does not crash on lowering during the OpenMP target device
+// pass when used in conjunction with target code in the same module.
+
+module attributes {omp.is_target_device = true } {
+  llvm.func @func() attributes {omp.declare_target = #omp.declaretarget<device_type = (host), capture_clause = (to)>} {
+    %0 = llvm.mlir.addressof @_QFEpointer2 : !llvm.ptr
+    %1 = omp.threadprivate %0 : !llvm.ptr -> !llvm.ptr
+    %2 = omp.map.info var_ptr(%1 : !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>) map_clauses(implicit, to) capture(ByRef) -> !llvm.ptr
+    omp.target map_entries(%2 -> %arg0 : !llvm.ptr) {
+      %3 = llvm.mlir.constant(1 : i32) : i32
+      %4 = llvm.getelementptr %arg0[0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>
+      llvm.store %3, %4 : i32, !llvm.ptr
+      omp.terminator
+    }
+    llvm.return
+  }
+   llvm.mlir.global internal @_QFEpointer2() {addr_space = 0 : i32} : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)> {
+    %0 = llvm.mlir.undef : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>
+    llvm.return %0 : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>
+  }
+}
+
+// CHECK: define weak_odr protected void @{{.*}}(ptr %{{.*}}, ptr %[[ARG1:.*]]) {
+// CHECK:  %[[ALLOCA:.*]] = alloca ptr, align 8
+// CHECK:  store ptr %[[ARG1]], ptr %[[ALLOCA]], align 8
+// CHECK:  %[[LOAD_ALLOCA:.*]] = load ptr, ptr %[[ALLOCA]], align 8
+// CHECK:  store i32 1, ptr %[[LOAD_ALLOCA]], align 4
diff --git a/mlir/test/Target/LLVMIR/openmp-llvm.mlir b/mlir/test/Target/LLVMIR/openmp-llvm.mlir
index 5f8bdf8..44e32c3 100644
--- a/mlir/test/Target/LLVMIR/openmp-llvm.mlir
+++ b/mlir/test/Target/LLVMIR/openmp-llvm.mlir
@@ -2590,6 +2590,34 @@ llvm.func @omp_task_attrs() -> () attributes {
 // CHECK:  store i64 8, ptr %[[dep_arr_addr_0_size]], align 4
 // CHECK:  %[[dep_arr_addr_0_kind:.+]] = getelementptr inbounds nuw %struct.kmp_dep_info, ptr %[[dep_arr_addr_0]], i32 0, i32 2
 // CHECK: store i8 1, ptr %[[dep_arr_addr_0_kind]], align 1
+// -----
+// dependence_type: Out
+// CHECK:  %[[DEP_ARR_ADDR1:.+]] = alloca [1 x %struct.kmp_dep_info], align 8
+// CHECK:  %[[DEP_ARR_ADDR_1:.+]] = getelementptr inbounds [1 x %struct.kmp_dep_info], ptr %[[DEP_ARR_ADDR1]], i64 0, i64 0
+//         [...]
+// CHECK:  %[[DEP_TYPE_1:.+]] = getelementptr inbounds nuw %struct.kmp_dep_info, ptr %[[DEP_ARR_ADDR_1]], i32 0, i32 2
+// CHECK:  store i8 3, ptr %[[DEP_TYPE_1]], align 1
+// -----
+// dependence_type: Inout
+// CHECK:  %[[DEP_ARR_ADDR2:.+]] = alloca [1 x %struct.kmp_dep_info], align 8
+// CHECK:  %[[DEP_ARR_ADDR_2:.+]] = getelementptr inbounds [1 x %struct.kmp_dep_info], ptr %[[DEP_ARR_ADDR2]], i64 0, i64 0
+//         [...]
+// CHECK:  %[[DEP_TYPE_2:.+]] = getelementptr inbounds nuw %struct.kmp_dep_info, ptr %[[DEP_ARR_ADDR_2]], i32 0, i32 2
+// CHECK:  store i8 3, ptr %[[DEP_TYPE_2]], align 1
+// -----
+// dependence_type: Mutexinoutset
+// CHECK:  %[[DEP_ARR_ADDR3:.+]] = alloca [1 x %struct.kmp_dep_info], align 8
+// CHECK:  %[[DEP_ARR_ADDR_3:.+]] = getelementptr inbounds [1 x %struct.kmp_dep_info], ptr %[[DEP_ARR_ADDR3]], i64 0, i64 0
+//         [...]
+// CHECK:  %[[DEP_TYPE_3:.+]] = getelementptr inbounds nuw %struct.kmp_dep_info, ptr %[[DEP_ARR_ADDR_3]], i32 0, i32 2
+// CHECK:  store i8 4, ptr %[[DEP_TYPE_3]], align 1
+// -----
+// dependence_type: Inoutset
+// CHECK:  %[[DEP_ARR_ADDR4:.+]] = alloca [1 x %struct.kmp_dep_info], align 8
+// CHECK:  %[[DEP_ARR_ADDR_4:.+]] = getelementptr inbounds [1 x %struct.kmp_dep_info], ptr %[[DEP_ARR_ADDR4]], i64 0, i64 0
+//         [...]
+// CHECK:  %[[DEP_TYPE_4:.+]] = getelementptr inbounds nuw %struct.kmp_dep_info, ptr %[[DEP_ARR_ADDR_4]], i32 0, i32 2
+// CHECK:  store i8 8, ptr %[[DEP_TYPE_4]], align 1
 llvm.func @omp_task_with_deps(%zaddr: !llvm.ptr) {
   // CHECK: %[[omp_global_thread_num:.+]] = call i32 @__kmpc_global_thread_num({{.+}})
   // CHECK: %[[task_data:.+]] = call ptr @__kmpc_omp_task_alloc
@@ -2604,6 +2632,18 @@ llvm.func @omp_task_with_deps(%zaddr: !llvm.ptr) {
     llvm.store %double, %valaddr : i32, !llvm.ptr
     omp.terminator
   }
+  omp.task depend(taskdependout -> %zaddr : !llvm.ptr) {
+    omp.terminator
+  }
+  omp.task depend(taskdependinout -> %zaddr : !llvm.ptr) {
+    omp.terminator
+  }
+  omp.task depend(taskdependmutexinoutset -> %zaddr : !llvm.ptr) {
+    omp.terminator
+  }
+  omp.task depend(taskdependinoutset -> %zaddr : !llvm.ptr) {
+    omp.terminator
+  }
   llvm.return
 }
 
diff --git a/mlir/test/Target/LLVMIR/openmp-simd-aligned.mlir b/mlir/test/Target/LLVMIR/openmp-simd-aligned.mlir
new file mode 100644
index 0000000..234604e
--- /dev/null
+++ b/mlir/test/Target/LLVMIR/openmp-simd-aligned.mlir
@@ -0,0 +1,60 @@
+// RUN: mlir-translate -mlir-to-llvmir -split-input-file %s | FileCheck %s
+
+//CHECK-LABEL: define void @_QPsimd_aligned_pointer() {
+//CHECK:   %[[A_PTR:.*]] = alloca { ptr, i64, i32, i8, i8, i8, i8 }, i64 1, align 8
+//CHECK:   %[[A_VAL:.*]] = load ptr, ptr %[[A_PTR]], align 8
+//CHECK:   call void @llvm.assume(i1 true) [ "align"(ptr %[[A_VAL]], i64 256) ]
+llvm.func @_QPsimd_aligned_pointer() {
+  %1 = llvm.mlir.constant(1 : i64) : i64
+  %2 = llvm.alloca %1 x !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> {bindc_name = "x"} : (i64) -> !llvm.ptr
+  %3 = llvm.alloca %1 x i32 {bindc_name = "i", pinned} : (i64) -> !llvm.ptr
+  %4 = llvm.mlir.constant(1 : i32) : i32
+  %5 = llvm.mlir.constant(10 : i32) : i32
+  %6 = llvm.mlir.constant(1 : i32) : i32
+  omp.simd aligned(%2 : !llvm.ptr -> 256 : i64) {
+    omp.loop_nest (%arg0) : i32 = (%4) to (%5) inclusive step (%6) {
+      llvm.store %arg0, %3 : i32, !llvm.ptr
+      omp.yield
+    }
+  }
+  llvm.return
+}
+
+//CHECK-LABEL: define void @_QPsimd_aligned_cptr() {
+//CHECK:   %[[A_CPTR:.*]] = alloca %_QM__fortran_builtinsT__builtin_c_ptr, i64 1, align 8
+//CHECK:   %[[A_VAL:.*]] = load ptr, ptr %[[A_CPTR]], align 8
+//CHECK:   call void @llvm.assume(i1 true) [ "align"(ptr %[[A_VAL]], i64 256) ]
+llvm.func @_QPsimd_aligned_cptr() {
+  %0 = llvm.mlir.constant(1 : i64) : i64
+  %1 = llvm.alloca %0 x !llvm.struct<"_QM__fortran_builtinsT__builtin_c_ptr", (i64)> {bindc_name = "a"} : (i64) -> !llvm.ptr
+  %2 = llvm.mlir.constant(1 : i64) : i64
+  %3 = llvm.alloca %2 x i32 {bindc_name = "i", pinned} : (i64) -> !llvm.ptr
+  %4 = llvm.mlir.constant(1 : i32) : i32
+  %5 = llvm.mlir.constant(10 : i32) : i32
+  %6 = llvm.mlir.constant(1 : i32) : i32
+  omp.simd aligned(%1 : !llvm.ptr -> 256 : i64) {
+    omp.loop_nest (%arg0) : i32 = (%4) to (%5) inclusive step (%6) {
+      llvm.store %arg0, %3 : i32, !llvm.ptr
+      omp.yield
+    }
+  }
+  llvm.return
+}
+
+//CHECK-LABEL: define void @_QPsimd_aligned_allocatable() {
+//CHECK:   %[[A_ADDR:.*]] = alloca { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] }, i64 1, align 8
+//CHECK:   %[[A_VAL:.*]] = load ptr, ptr %[[A_ADDR]], align 8
+//CHECK:   call void @llvm.assume(i1 true) [ "align"(ptr %[[A_VAL]], i64 256) ]
+llvm.func @_QPsimd_aligned_allocatable() {
+  %0 = llvm.mlir.constant(1 : i64) : i64
+  %1 = llvm.alloca %0 x !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)> {bindc_name = "a"} : (i64) -> !llvm.ptr
+  %2 = llvm.mlir.constant(1 : i32) : i32
+  %3 = llvm.mlir.constant(10 : i32) : i32
+  %4 = llvm.mlir.constant(1 : i32) : i32
+  omp.simd aligned(%1 : !llvm.ptr -> 256 : i64) {
+    omp.loop_nest (%arg0) : i32 = (%2) to (%3) inclusive step (%4) {
+      omp.yield
+    }
+  }
+  llvm.return
+}
diff --git a/mlir/test/Target/LLVMIR/openmp-todo.mlir b/mlir/test/Target/LLVMIR/openmp-todo.mlir
index 8f3e466..83a0990 100644
--- a/mlir/test/Target/LLVMIR/openmp-todo.mlir
+++ b/mlir/test/Target/LLVMIR/openmp-todo.mlir
@@ -127,18 +127,6 @@ llvm.func @sections_private(%x : !llvm.ptr) {
   llvm.return
 }
 
-// -----
-
-llvm.func @simd_aligned(%lb : i32, %ub : i32, %step : i32, %x : !llvm.ptr) {
-  // expected-error@below {{not yet implemented: Unhandled clause aligned in omp.simd operation}}
-  // expected-error@below {{LLVM Translation failed for operation: omp.simd}}
-  omp.simd aligned(%x : !llvm.ptr -> 32) {
-    omp.loop_nest (%iv) : i32 = (%lb) to (%ub) step (%step) {
-      omp.yield
-    }
-  }
-  llvm.return
-}
 
 // -----
 
diff --git a/mlir/test/Transforms/location-snapshot.mlir b/mlir/test/Transforms/location-snapshot.mlir
index 9f48cb6..aeddfed 100644
--- a/mlir/test/Transforms/location-snapshot.mlir
+++ b/mlir/test/Transforms/location-snapshot.mlir
@@ -1,5 +1,6 @@
 // RUN: mlir-opt -allow-unregistered-dialect -snapshot-op-locations='filename=%/t' -mlir-print-local-scope -mlir-print-debuginfo %s | FileCheck %s -DFILE=%/t
 // RUN: mlir-opt -allow-unregistered-dialect -snapshot-op-locations='filename=%/t tag='tagged'' -mlir-print-local-scope -mlir-print-debuginfo %s | FileCheck %s --check-prefix=TAG -DFILE=%/t
+// RUN: mlir-opt -allow-unregistered-dialect -snapshot-op-locations='filename=%/t print-debuginfo' -mlir-print-local-scope -mlir-print-debuginfo %s | FileCheck %s --check-prefix=DBG -DFILE=%/t && cat %/t | FileCheck %s --check-prefix=DBGFILE
 
 // CHECK: func @function(
 // CHECK-NEXT: loc("[[FILE]]":{{[0-9]+}}:{{[0-9]+}})
@@ -15,3 +16,18 @@ func.func @function() -> i32 {
   %1 = "foo"() : () -> i32 loc("original")
   return %1 : i32 loc("original")
 } loc("original")
+
+// DBG: func @function2(
+// DBG-NEXT: loc("[[FILE]]":{{[0-9]+}}:{{[0-9]+}})
+// DBG-NEXT: loc("[[FILE]]":{{[0-9]+}}:{{[0-9]+}})
+// DBG-NEXT: } loc("[[FILE]]":{{[0-9]+}}:{{[0-9]+}})
+
+// DBGFILE: func @function2(
+// DBGFILE-NEXT: loc("{{.*}}location-snapshot.mlir":{{[0-9]+}}:{{[0-9]+}})
+// DBGFILE-NEXT: loc("{{.*}}location-snapshot.mlir":{{[0-9]+}}:{{[0-9]+}})
+// DBGFILE-NEXT: } loc("{{.*}}location-snapshot.mlir":{{[0-9]+}}:{{[0-9]+}})
+
+func.func @function2() -> i32 {
+  %1 = "foo"() : () -> i32
+  return %1 : i32
+}
+\ No newline at end of file
diff --git a/mlir/test/Transforms/loop-invariant-code-motion.mlir b/mlir/test/Transforms/loop-invariant-code-motion.mlir
index e4c423c..5133c14 100644
--- a/mlir/test/Transforms/loop-invariant-code-motion.mlir
+++ b/mlir/test/Transforms/loop-invariant-code-motion.mlir
@@ -124,6 +124,64 @@ func.func @invariant_affine_if() {
 
 // -----
 
+func.func @hoist_invariant_affine_if_success(%lb: index, %ub: index, %step: index) -> i32 {
+  %cst_0 = arith.constant 0 : i32
+  %cst_42 = arith.constant 42 : i32
+  %sum_result = affine.for %i = %lb to %ub iter_args(%acc = %cst_0) -> i32 {
+    %conditional_add = affine.if affine_set<() : ()> () -> (i32) {
+      %add = arith.addi %cst_42, %cst_42 : i32
+      affine.yield %add : i32
+    } else {
+      %poison = ub.poison : i32
+      affine.yield %poison : i32
+    }
+    %sum = arith.addi %acc, %conditional_add : i32
+    affine.yield %sum : i32
+  }
+
+  // CHECK-LABEL: hoist_invariant_affine_if_success
+  // CHECK-NEXT: arith.constant 0 : i32
+  // CHECK-NEXT: %[[CST:.*]] = arith.constant 42 : i32
+  // CHECK-NEXT: %[[IF:.*]] = affine.if
+  // CHECK-NEXT: arith.addi %[[CST]], %[[CST]] : i32
+  // CHECK: affine.for
+  // CHECK-NOT: affine.if
+  // CHECK-NEXT: arith.addi %{{.*}}, %[[IF]]
+
+  return %sum_result : i32
+}
+
+// -----
+
+func.func @hoist_variant_affine_if_failure(%lb: index, %ub: index, %step: index) -> i32 {
+  %cst_0 = arith.constant 0 : i32
+  %cst_42 = arith.constant 42 : i32
+  %ind_7 = arith.constant 7 : index
+  %sum_result = affine.for %i = %lb to %ub iter_args(%acc = %cst_0) -> i32 {
+    %conditional_add = affine.if affine_set<(d0, d1) : (d1 - d0 >= 0)> (%i, %ind_7) -> (i32) {
+      %add = arith.addi %cst_42, %cst_42 : i32
+      affine.yield %add : i32
+    } else {
+      %poison = ub.poison : i32
+      affine.yield %poison : i32
+    }
+    %sum = arith.addi %acc, %conditional_add : i32
+    affine.yield %sum : i32
+  }
+
+  // CHECK-LABEL: hoist_variant_affine_if_failure
+  // CHECK-NEXT: arith.constant 0 : i32
+  // CHECK-NEXT: %[[CST:.*]] = arith.constant 42 : i32
+  // CHECK-NEXT: arith.constant 7 : index
+  // CHECK-NEXT: affine.for
+  // CHECK-NEXT: %[[IF:.*]] = affine.if
+  // CHECK: arith.addi %{{.*}}, %[[IF]]
+
+  return %sum_result : i32
+}
+
+// -----
+
 func.func @hoist_affine_for_with_unknown_trip_count(%lb: index, %ub: index) {
   affine.for %arg0 = 0 to 10 {
     affine.for %arg1 = %lb to %ub {
@@ -383,6 +441,69 @@ func.func @parallel_loop_with_invariant() {
 
 // -----
 
+func.func @hoist_invariant_scf_if_success(%lb: index, %ub: index, %step: index) -> i32 {
+  %cst_0 = arith.constant 0 : i32
+  %cst_42 = arith.constant 42 : i32
+  %true = arith.constant true
+  %sum_result = scf.for %i = %lb to %ub step %step iter_args(%acc = %cst_0) -> i32 {
+    %conditional_add = scf.if %true -> (i32) {
+      %add = arith.addi %cst_42, %cst_42 : i32
+      scf.yield %add : i32
+    } else {
+      %poison = ub.poison : i32
+      scf.yield %poison : i32
+    }
+    %sum = arith.addi %acc, %conditional_add : i32
+    scf.yield %sum : i32
+  }
+
+  // CHECK-LABEL: hoist_invariant_scf_if_success
+  // CHECK-NEXT: arith.constant 0 : i32
+  // CHECK-NEXT: %[[CST:.*]] = arith.constant 42 : i32
+  // CHECK-NEXT: %[[TRUE:.*]] = arith.constant true
+  // CHECK-NEXT: %[[IF:.*]] = scf.if %[[TRUE]]
+  // CHECK-NEXT: arith.addi %[[CST]], %[[CST]] : i32
+  // CHECK: scf.for
+  // CHECK-NOT: scf.if
+  // CHECK-NEXT: arith.addi %{{.*}}, %[[IF]]
+
+  return %sum_result : i32
+}
+
+// -----
+
+func.func @hoist_variant_scf_if_failure(%lb: index, %ub: index, %step: index) -> i32 {
+  %cst_0 = arith.constant 0 : i32
+  %cst_42 = arith.constant 42 : i32
+  %ind_7 = arith.constant 7 : index
+  %sum_result = scf.for %i = %lb to %ub step %step iter_args(%acc = %cst_0) -> i32 {
+    %cond = arith.cmpi ult, %i, %ind_7 : index
+    %conditional_add = scf.if %cond -> (i32) {
+      %add = arith.addi %cst_42, %cst_42 : i32
+      scf.yield %add : i32
+    } else {
+      %poison = ub.poison : i32
+      scf.yield %poison : i32
+    }
+    %sum = arith.addi %acc, %conditional_add : i32
+    scf.yield %sum : i32
+  }
+
+  // CHECK-LABEL: hoist_variant_scf_if_failure
+  // CHECK-NEXT: arith.constant 0 : i32
+  // CHECK-NEXT: %[[CST_42:.*]] = arith.constant 42 : i32
+  // CHECK-NEXT: %[[CST_7:.*]] = arith.constant 7 : index
+  // CHECK-NEXT: scf.for %[[IV:.*]] = %{{.*}} to %{{.*}}
+  // CHECK-NEXT: %[[CMP:.*]] = arith.cmpi ult, %[[IV]], %[[CST_7]]
+  // CHECK-NEXT: %[[IF:.*]] = scf.if %[[CMP]]
+  // CHECK-NEXT: arith.addi %[[CST_42]], %[[CST_42]] : i32
+  // CHECK: arith.addi %{{.*}}, %[[IF]]
+
+  return %sum_result : i32
+}
+
+// -----
+
 func.func private @make_val() -> (index)
 
 // CHECK-LABEL: func @nested_uses_inside
diff --git a/mlir/test/Transforms/sccp.mlir b/mlir/test/Transforms/sccp.mlir
index dcae052..c78c859 100644
--- a/mlir/test/Transforms/sccp.mlir
+++ b/mlir/test/Transforms/sccp.mlir
@@ -246,3 +246,12 @@ func.func @op_with_region() -> (i32) {
 ^b:
   return %1 : i32
 }
+
+// CHECK-LABEL: no_crash_with_different_source_type
+func.func @no_crash_with_different_source_type() {
+  // CHECK: llvm.mlir.constant(0 : index) : i64
+  %0 = llvm.mlir.constant(0 : index) : i64
+  // CHECK: vector.broadcast %[[CST:.*]] : i64 to vector<128xi64>
+  %1 = vector.broadcast %0 : i64 to vector<128xi64>
+  llvm.return
+}
diff --git a/mlir/test/Transforms/test-legalizer.mlir b/mlir/test/Transforms/test-legalizer.mlir
index 2ca5f49..ae7d344 100644
--- a/mlir/test/Transforms/test-legalizer.mlir
+++ b/mlir/test/Transforms/test-legalizer.mlir
@@ -64,9 +64,6 @@ func.func @remap_call_1_to_1(%arg0: i64) {
 // Contents of the old block are moved to the new block.
 // CHECK-NEXT: notifyOperationInserted: test.return, was linked, exact position unknown
 
-// The new block arguments are used in "test.return".
-// CHECK-NEXT: notifyOperationModified: test.return
-
 // The old block is erased.
 // CHECK-NEXT: notifyBlockErased
 
@@ -390,8 +387,8 @@ func.func @caller() {
   // CHECK: %[[call:.*]]:2 = call @callee() : () -> (f16, f16)
   %0:2 = func.call @callee() : () -> (f32, i24)
 
-  // CHECK: %[[cast1:.*]] = "test.cast"() : () -> i24
-  // CHECK: %[[cast0:.*]] = "test.cast"(%[[call]]#0, %[[call]]#1) : (f16, f16) -> f32
+  // CHECK-DAG: %[[cast1:.*]] = "test.cast"() : () -> i24
+  // CHECK-DAG: %[[cast0:.*]] = "test.cast"(%[[call]]#0, %[[call]]#1) : (f16, f16) -> f32
   // CHECK: "test.some_user"(%[[cast0]], %[[cast1]]) : (f32, i24) -> ()
   // expected-remark @below{{'test.some_user' is not legalizable}}
   "test.some_user"(%0#0, %0#1) : (f32, i24) -> ()
@@ -450,7 +447,7 @@ func.func @fold_legalization() -> i32 {
 // -----
 
 // CHECK-LABEL: func @convert_detached_signature()
-//       CHECK:   "test.legal_op_with_region"() ({
+//       CHECK:   "test.legal_op"() ({
 //       CHECK:   ^bb0(%arg0: f64):
 //       CHECK:     "test.return"() : () -> ()
 //       CHECK:   }) : () -> ()
@@ -483,3 +480,20 @@ func.func @test_1_to_n_block_signature_conversion() {
   "test.return"() : () -> ()
 }
 
+// -----
+
+// CHECK: notifyOperationInserted: test.step_1
+// CHECK: notifyOperationReplaced: test.multiple_1_to_n_replacement
+// CHECK: notifyOperationErased: test.multiple_1_to_n_replacement
+// CHECK: notifyOperationInserted: test.legal_op
+// CHECK: notifyOperationReplaced: test.step_1
+// CHECK: notifyOperationErased: test.step_1
+
+// CHECK-LABEL: func @test_multiple_1_to_n_replacement()
+//       CHECK:   %[[legal_op:.*]]:4 = "test.legal_op"() : () -> (f16, f16, f16, f16)
+//       CHECK:   %[[cast:.*]] = "test.cast"(%[[legal_op]]#0, %[[legal_op]]#1, %[[legal_op]]#2, %[[legal_op]]#3) : (f16, f16, f16, f16) -> f16
+//       CHECK:   "test.valid"(%[[cast]]) : (f16) -> ()
+func.func @test_multiple_1_to_n_replacement() {
+  %0 = "test.multiple_1_to_n_replacement"() : () -> (f16)
+  "test.invalid"(%0) : (f16) -> ()
+}
diff --git a/mlir/test/lib/Dialect/Func/TestDecomposeCallGraphTypes.cpp b/mlir/test/lib/Dialect/Func/TestDecomposeCallGraphTypes.cpp
index 09c5b4b..d0b62e7 100644
--- a/mlir/test/lib/Dialect/Func/TestDecomposeCallGraphTypes.cpp
+++ b/mlir/test/lib/Dialect/Func/TestDecomposeCallGraphTypes.cpp
@@ -139,7 +139,7 @@ struct TestDecomposeCallGraphTypes
           tupleType.getFlattenedTypes(types);
           return success();
         });
-    typeConverter.addArgumentMaterialization(buildMakeTupleOp);
+    typeConverter.addSourceMaterialization(buildMakeTupleOp);
     typeConverter.addTargetMaterialization(buildDecomposeTuple);
 
     populateFunctionOpInterfaceTypeConversionPattern<func::FuncOp>(
diff --git a/mlir/test/lib/Dialect/Test/TestPatterns.cpp b/mlir/test/lib/Dialect/Test/TestPatterns.cpp
index a470497..5b7c36c 100644
--- a/mlir/test/lib/Dialect/Test/TestPatterns.cpp
+++ b/mlir/test/lib/Dialect/Test/TestPatterns.cpp
@@ -785,7 +785,7 @@ struct TestDetachedSignatureConversion : public ConversionPattern {
                   ConversionPatternRewriter &rewriter) const final {
     if (op->getNumRegions() != 1)
       return failure();
-    OperationState state(op->getLoc(), "test.legal_op_with_region", operands,
+    OperationState state(op->getLoc(), "test.legal_op", operands,
                          op->getResultTypes(), {}, BlockRange());
     Region *newRegion = state.addRegion();
     rewriter.inlineRegionBefore(op->getRegion(0), *newRegion,
@@ -1234,6 +1234,41 @@ public:
   }
 };
 
+/// A pattern that tests two back-to-back 1 -> 2 op replacements.
+class TestMultiple1ToNReplacement : public ConversionPattern {
+public:
+  TestMultiple1ToNReplacement(MLIRContext *ctx, const TypeConverter &converter)
+      : ConversionPattern(converter, "test.multiple_1_to_n_replacement", 1,
+                          ctx) {}
+  LogicalResult
+  matchAndRewrite(Operation *op, ArrayRef<ValueRange> operands,
+                  ConversionPatternRewriter &rewriter) const final {
+    // Helper function that replaces the given op with a new op of the given
+    // name and doubles each result (1 -> 2 replacement of each result).
+    auto replaceWithDoubleResults = [&](Operation *op, StringRef name) {
+      SmallVector<Type> types;
+      for (Type t : op->getResultTypes()) {
+        types.push_back(t);
+        types.push_back(t);
+      }
+      OperationState state(op->getLoc(), name,
+                           /*operands=*/{}, types, op->getAttrs());
+      auto *newOp = rewriter.create(state);
+      SmallVector<ValueRange> repls;
+      for (size_t i = 0, e = op->getNumResults(); i < e; ++i)
+        repls.push_back(newOp->getResults().slice(2 * i, 2));
+      rewriter.replaceOpWithMultiple(op, repls);
+      return newOp;
+    };
+
+    // Replace test.multiple_1_to_n_replacement with test.step_1.
+    Operation *repl1 = replaceWithDoubleResults(op, "test.step_1");
+    // Now replace test.step_1 with test.legal_op.
+    replaceWithDoubleResults(repl1, "test.legal_op");
+    return success();
+  }
+};
+
 } // namespace
 
 namespace {
@@ -1241,7 +1276,6 @@ struct TestTypeConverter : public TypeConverter {
   using TypeConverter::TypeConverter;
   TestTypeConverter() {
     addConversion(convertType);
-    addArgumentMaterialization(materializeCast);
     addSourceMaterialization(materializeCast);
   }
 
@@ -1319,7 +1353,8 @@ struct TestLegalizePatternDriver
              TestUndoPropertiesModification, TestEraseOp,
              TestRepetitive1ToNConsumer>(&getContext());
     patterns.add<TestDropOpSignatureConversion, TestDropAndReplaceInvalidOp,
-                 TestPassthroughInvalidOp>(&getContext(), converter);
+                 TestPassthroughInvalidOp, TestMultiple1ToNReplacement>(
+        &getContext(), converter);
     patterns.add<TestDuplicateBlockArgs>(converter, &getContext());
     mlir::populateAnyFunctionOpInterfaceTypeConversionPattern(patterns,
                                                               converter);
@@ -1330,8 +1365,7 @@ struct TestLegalizePatternDriver
     target.addLegalOp<ModuleOp>();
     target.addLegalOp<LegalOpA, LegalOpB, LegalOpC, TestCastOp, TestValidOp,
                       TerminatorOp, OneRegionOp>();
-    target.addLegalOp(
-        OperationName("test.legal_op_with_region", &getContext()));
+    target.addLegalOp(OperationName("test.legal_op", &getContext()));
     target
         .addIllegalOp<ILLegalOpF, TestRegionBuilderOp, TestOpWithRegionFold>();
     target.addDynamicallyLegalOp<TestReturnOp>([](TestReturnOp op) {
diff --git a/mlir/test/lib/Dialect/Tosa/TosaTestPasses.cpp b/mlir/test/lib/Dialect/Tosa/TosaTestPasses.cpp
index ac904c3..83db118 100644
--- a/mlir/test/lib/Dialect/Tosa/TosaTestPasses.cpp
+++ b/mlir/test/lib/Dialect/Tosa/TosaTestPasses.cpp
@@ -149,7 +149,7 @@ ConvertTosaConv2DOp::matchAndRewrite(Operation *op,
       op->getLoc(), newTosaConv2DOpType, tosaConv2DOp.getInput(),
       tosaConv2DOp.getWeight(), tosaConv2DOp.getBias(),
       tosaConv2DOp.getPadAttr(), tosaConv2DOp.getStrideAttr(),
-      tosaConv2DOp.getDilationAttr());
+      tosaConv2DOp.getDilationAttr(), tosaConv2DOp.getAccTypeAttr());
 
   // Create rescale to quantized type
   double inputScale = inputQType.getScale();
diff --git a/mlir/test/lib/Transforms/TestDialectConversion.cpp b/mlir/test/lib/Transforms/TestDialectConversion.cpp
index 2cc1fb5..a03bf0a 100644
--- a/mlir/test/lib/Transforms/TestDialectConversion.cpp
+++ b/mlir/test/lib/Transforms/TestDialectConversion.cpp
@@ -28,7 +28,6 @@ namespace {
 struct PDLLTypeConverter : public TypeConverter {
   PDLLTypeConverter() {
     addConversion(convertType);
-    addArgumentMaterialization(materializeCast);
     addSourceMaterialization(materializeCast);
   }
 
diff --git a/mlir/test/python/dialects/transform_structured_ext.py b/mlir/test/python/dialects/transform_structured_ext.py
index fb4c75b..8785d6d 100644
--- a/mlir/test/python/dialects/transform_structured_ext.py
+++ b/mlir/test/python/dialects/transform_structured_ext.py
@@ -103,6 +103,42 @@ def testFuseIntoContainingOpCompact(target):
 
 @run
 @create_sequence
+def testFuseOpCompact(target):
+    structured.FuseOp(
+        target, tile_sizes=[4, 8], tile_interchange=[0, 1], apply_cleanup=True
+    )
+    # CHECK-LABEL: TEST: testFuseOpCompact
+    # CHECK: transform.sequence
+    # CHECK: %{{.+}}, %{{.+}}:2 = transform.structured.fuse %{{.*}}[4, 8]
+    # CHECK-SAME: interchange [0, 1] apply_cleanup = true
+    # CHECK-SAME: (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op)
+
+
+@run
+@create_sequence
+def testFuseOpNoArg(target):
+    structured.FuseOp(target)
+    # CHECK-LABEL: TEST: testFuseOpNoArg
+    # CHECK: transform.sequence
+    # CHECK: %{{.+}} = transform.structured.fuse %{{.*}} :
+    # CHECK-SAME: (!transform.any_op) -> !transform.any_op
+
+
+@run
+@create_sequence
+def testFuseOpAttributes(target):
+    attr = DenseI64ArrayAttr.get([4, 8])
+    ichange = DenseI64ArrayAttr.get([0, 1])
+    structured.FuseOp(target, tile_sizes=attr, tile_interchange=ichange)
+    # CHECK-LABEL: TEST: testFuseOpAttributes
+    # CHECK: transform.sequence
+    # CHECK: %{{.+}}, %{{.+}}:2 = transform.structured.fuse %{{.*}}[4, 8]
+    # CHECK-SAME: interchange [0, 1]
+    # CHECK-SAME: (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op)
+
+
+@run
+@create_sequence
 def testGeneralize(target):
     structured.GeneralizeOp(target)
     # CHECK-LABEL: TEST: testGeneralize
diff --git a/mlir/test/python/execution_engine.py b/mlir/test/python/execution_engine.py
index 6d3a8db..0d12c35 100644
--- a/mlir/test/python/execution_engine.py
+++ b/mlir/test/python/execution_engine.py
@@ -306,7 +306,7 @@ def testUnrankedMemRefWithOffsetCallback():
         log(arr)
 
     with Context():
-        # The module takes a subview of the argument memref, casts it to an unranked memref and 
+        # The module takes a subview of the argument memref, casts it to an unranked memref and
         # calls the callback with it.
         module = Module.parse(
             r"""
diff --git a/mlir/test/python/ir/dialects.py b/mlir/test/python/ir/dialects.py
index d59c6a6..5a2ed68 100644
--- a/mlir/test/python/ir/dialects.py
+++ b/mlir/test/python/ir/dialects.py
@@ -121,3 +121,39 @@ def testAppendPrefixSearchPath():
         sys.path.append(".")
         _cext.globals.append_dialect_search_prefix("custom_dialect")
         assert _cext.globals._check_dialect_module_loaded("custom")
+
+
+# CHECK-LABEL: TEST: testDialectLoadOnCreate
+@run
+def testDialectLoadOnCreate():
+    with Context(load_on_create_dialects=[]) as ctx:
+        ctx.emit_error_diagnostics = True
+        ctx.allow_unregistered_dialects = True
+
+        def callback(d):
+            # CHECK: DIAGNOSTIC
+            # CHECK-SAME: op created with unregistered dialect
+            print(f"DIAGNOSTIC={d.message}")
+            return True
+
+        handler = ctx.attach_diagnostic_handler(callback)
+        loc = Location.unknown(ctx)
+        try:
+            op = Operation.create("arith.addi", loc=loc)
+            ctx.allow_unregistered_dialects = False
+            op.verify()
+        except MLIRError as e:
+            pass
+
+    with Context(load_on_create_dialects=["func"]) as ctx:
+        loc = Location.unknown(ctx)
+        fn = Operation.create("func.func", loc=loc)
+
+    # TODO: This may require an update if a site wide policy is set.
+    # CHECK: Load on create: []
+    print(f"Load on create: {get_load_on_create_dialects()}")
+    append_load_on_create_dialect("func")
+    # CHECK: Load on create:
+    # CHECK-SAME: func
+    print(f"Load on create: {get_load_on_create_dialects()}")
+    print(get_load_on_create_dialects())
diff --git a/mlir/test/tblgen-lsp-server/templ-arg-check.test b/mlir/test/tblgen-lsp-server/templ-arg-check.test
new file mode 100644
index 0000000..cda9b79
--- /dev/null
+++ b/mlir/test/tblgen-lsp-server/templ-arg-check.test
@@ -0,0 +1,15 @@
+// RUN: tblgen-lsp-server -lit-test < %s | FileCheck -strict-whitespace %s
+{"jsonrpc":"2.0","id":0,"method":"initialize","params":{"processId":123,"rootPath":"tablegen","capabilities":{},"trace":"off"}}
+// -----
+{"jsonrpc":"2.0","method":"textDocument/didOpen","params":{"textDocument":{
+  "uri":"test:///foo.td",
+  "languageId":"tablegen",
+  "version":1,
+  "text":"class Foo<int i>;\ndef : Foo<\"\">;"
+}}}
+// CHECK: "method": "textDocument/publishDiagnostics",
+// CHECK: "message": "Value specified for template argument 'Foo:i' is of type string; expected type int: \"\"",
+// -----
+{"jsonrpc":"2.0","id":3,"method":"shutdown"}
+// -----
+{"jsonrpc":"2.0","method":"exit"}
diff --git a/mlir/utils/pygments/README.md b/mlir/utils/pygments/README.md
new file mode 100644
index 0000000..838face
--- /dev/null
+++ b/mlir/utils/pygments/README.md
@@ -0,0 +1,45 @@
+## Pygments Lexer for MLIR
+
+This file contains a simple Pygments lexer configuration for MLIR, derived from
+the version used in the original CGO paper. Pygments allows for advanced
+configurable syntax highlighting of any code. This lexer is known to be
+incomplete and support mostly core IR with a subset of built-in types.
+Additions and customizations are welcome.
+
+### Standalone Usage
+
+Install Pygments, e.g., by running `pip install Pygments` or a Python package
+manager of your choosing. Use the standalone `pygmentize` command by
+instructing it to load the custom lexer:
+
+```
+pygmentize -l /path/to/mlir_lexer.py:MlirLexer -x myfile.mlir
+```
+
+This will produce highlighted output in the terminal. Other output formats are
+available, see Pygments [documentation](https://pygments.org/docs/) for more
+information.
+
+### LaTeX Usage
+
+First, make sure your distribution includes the `minted` package and list in
+the preamble.
+
+```latex
+\usepackage{minted}
+```
+
+Place the `mlir_lexer.py` in a place where the `latex` binary can find it,
+typically in the working directory next to the main `.tex` file. Note that you
+will have to invoke `latex` with the `-shell-escape` flag. See the `minted` 
+package [documentation](https://ctan.org/pkg/minted?lang=en) for more
+information.
+
+Leverage the custom lexer facility of `minted` to use this lexer in your
+document as:
+
+```latex
+\begin{minted}{mlir_lexer.py:MlirLexer -x}
+   ... your code here ...
+\end{minted}
+```
diff --git a/mlir/utils/pygments/mlir_lexer.py b/mlir/utils/pygments/mlir_lexer.py
new file mode 100644
index 0000000..179a058
--- /dev/null
+++ b/mlir/utils/pygments/mlir_lexer.py
@@ -0,0 +1,38 @@
+# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+from pygments.lexer import RegexLexer
+from pygments.token import *
+
+
+class MlirLexer(RegexLexer):
+    name = "MLIR"
+    aliases = ["mlir"]
+    filenames = ["*.mlir"]
+
+    tokens = {
+        "root": [
+            (r"%[a-zA-Z0-9_]+", Name.Variable),
+            (r"@[a-zA-Z_][a-zA-Z0-9_]+", Name.Function),
+            (r"\^[a-zA-Z0-9_]+", Name.Label),
+            (r"#[a-zA-Z0-9_]+", Name.Constant),
+            (r"![a-zA-Z0-9_]+", Keyword.Type),
+            (r"[a-zA-Z_][a-zA-Z0-9_]*\.", Name.Entity),
+            (r"memref[^.]", Keyword.Type),
+            (r"index", Keyword.Type),
+            (r"i[0-9]+", Keyword.Type),
+            (r"f[0-9]+", Keyword.Type),
+            (r"[0-9]+", Number.Integer),
+            (r"[0-9]*\.[0-9]*", Number.Float),
+            (r'"[^"]*"', String.Double),
+            (r"affine_map", Keyword.Reserved),
+            # TODO: this should be within affine maps only
+            (r"\+-\*\/", Operator),
+            (r"floordiv", Operator.Word),
+            (r"ceildiv", Operator.Word),
+            (r"mod", Operator.Word),
+            (r"()\[\]<>,{}", Punctuation),
+            (r"\/\/.*\n", Comment.Single),
+        ]
+    }
diff --git a/offload/DeviceRTL/CMakeLists.txt b/offload/DeviceRTL/CMakeLists.txt
index 11176a5..2294026 100644
--- a/offload/DeviceRTL/CMakeLists.txt
+++ b/offload/DeviceRTL/CMakeLists.txt
@@ -42,43 +42,6 @@ set(devicertl_base_directory ${CMAKE_CURRENT_SOURCE_DIR})
 set(include_directory ${devicertl_base_directory}/include)
 set(source_directory ${devicertl_base_directory}/src)
 
-set(all_amdgpu_architectures "gfx700;gfx701;gfx801;gfx803"
-                             "gfx9-generic;gfx900;gfx902;gfx906;gfx908"
-                             "gfx90a;gfx90c"
-                             "gfx9-4-generic;gfx940;gfx941;gfx942;gfx950"
-                             "gfx10-1-generic;gfx1010;gfx1012"
-                             "gfx10-3-generic;gfx1030;gfx1031;gfx1032;gfx1033"
-                             "gfx1034;gfx1035;gfx1036"
-                             "gfx11-generic;gfx1100;gfx1101;gfx1102;gfx1103"
-                             "gfx1150;gfx1151;gfx1152;gfx1153"
-                             "gfx12-generic")
-set(all_nvptx_architectures "sm_35;sm_37;sm_50;sm_52;sm_53;sm_60;sm_61;sm_62"
-                            "sm_70;sm_72;sm_75;sm_80;sm_86;sm_87;sm_89;sm_90")
-set(all_gpu_architectures
-    "${all_amdgpu_architectures};${all_nvptx_architectures}")
-
-set(LIBOMPTARGET_DEVICE_ARCHITECTURES "all" CACHE STRING
-    "List of device architectures to be used to compile the OpenMP DeviceRTL.")
-
-if(LIBOMPTARGET_DEVICE_ARCHITECTURES STREQUAL "all")
-  set(LIBOMPTARGET_DEVICE_ARCHITECTURES ${all_gpu_architectures})
-elseif(LIBOMPTARGET_DEVICE_ARCHITECTURES STREQUAL "amdgpu")
-  set(LIBOMPTARGET_DEVICE_ARCHITECTURES ${all_amdgpu_architectures})
-elseif(LIBOMPTARGET_DEVICE_ARCHITECTURES STREQUAL "nvptx")
-  set(LIBOMPTARGET_DEVICE_ARCHITECTURES ${all_nvptx_architectures})
-elseif(LIBOMPTARGET_DEVICE_ARCHITECTURES STREQUAL "auto" OR
-       LIBOMPTARGET_DEVICE_ARCHITECTURES STREQUAL "native")
-  if(NOT LIBOMPTARGET_NVPTX_ARCH AND NOT LIBOMPTARGET_AMDGPU_ARCH)
-    message(FATAL_ERROR
-      "Could not find 'amdgpu-arch' and 'nvptx-arch' tools required for 'auto'")
-  elseif(NOT LIBOMPTARGET_FOUND_NVIDIA_GPU AND NOT LIBOMPTARGET_FOUND_AMDGPU_GPU)
-    message(FATAL_ERROR "No AMD or NVIDIA GPU found on the system when using 'auto'")
-  endif()
-  set(LIBOMPTARGET_DEVICE_ARCHITECTURES
-      "${LIBOMPTARGET_NVPTX_DETECTED_ARCH_LIST};${LIBOMPTARGET_AMDGPU_DETECTED_ARCH_LIST}")
-endif()
-list(REMOVE_DUPLICATES LIBOMPTARGET_DEVICE_ARCHITECTURES)
-
 set(include_files
   ${include_directory}/Allocator.h
   ${include_directory}/Configuration.h
@@ -146,20 +109,22 @@ set(bc_flags -c -foffload-lto -std=c++17 -fvisibility=hidden
 
 # first create an object target
 add_library(omptarget.devicertl.all_objs OBJECT IMPORTED)
-function(compileDeviceRTLLibrary target_cpu target_name target_triple)
+function(compileDeviceRTLLibrary target_name target_triple)
   set(target_bc_flags ${ARGN})
 
   set(bc_files "")
   foreach(src ${src_files})
     get_filename_component(infile ${src} ABSOLUTE)
     get_filename_component(outfile ${src} NAME)
-    set(outfile "${outfile}-${target_cpu}.bc")
+    set(outfile "${outfile}-${target_name}.bc")
     set(depfile "${outfile}.d")
 
+    # Passing an empty CPU to -march= suppressed target specific metadata.
     add_custom_command(OUTPUT ${outfile}
       COMMAND ${CLANG_TOOL}
       ${bc_flags}
-      --offload-arch=${target_cpu}
+      -fopenmp-targets=${target_triple}
+      -Xopenmp-target=${target_triple} -march=
       ${target_bc_flags}
       -MD -MF ${depfile}
       ${infile} -o ${outfile}
@@ -182,7 +147,7 @@ function(compileDeviceRTLLibrary target_cpu target_name target_triple)
     list(APPEND bc_files ${outfile})
   endforeach()
 
-  set(bclib_name "libomptarget-${target_name}-${target_cpu}.bc")
+  set(bclib_name "libomptarget-${target_name}.bc")
 
   # Link to a bitcode library.
   add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/linked_${bclib_name}
@@ -222,7 +187,7 @@ function(compileDeviceRTLLibrary target_cpu target_name target_triple)
       APPEND)
   endif()
 
-  set(bclib_target_name "omptarget-${target_name}-${target_cpu}-bc")
+  set(bclib_target_name "omptarget-${target_name}-bc")
   add_custom_target(${bclib_target_name} DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/${bclib_name})
 
   # Copy library to destination.
@@ -244,7 +209,7 @@ function(compileDeviceRTLLibrary target_cpu target_name target_triple)
   # Package the bitcode in the bitcode and embed it in an ELF for the static library
   add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/packaged_${bclib_name}
       COMMAND ${PACKAGER_TOOL} -o ${CMAKE_CURRENT_BINARY_DIR}/packaged_${bclib_name}
-        "--image=file=${CMAKE_CURRENT_BINARY_DIR}/${bclib_name},${target_feature},triple=${target_triple},arch=${target_cpu},kind=openmp"
+        "--image=file=${CMAKE_CURRENT_BINARY_DIR}/${bclib_name},${target_feature},triple=${target_triple},arch=generic,kind=openmp"
       DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/${bclib_name}
       COMMENT "Packaging LLVM offloading binary ${bclib_name}.out"
   )
@@ -254,14 +219,14 @@ function(compileDeviceRTLLibrary target_cpu target_name target_triple)
       APPEND)
   endif()
 
-  set(output_name "${CMAKE_CURRENT_BINARY_DIR}/devicertl-${target_name}-${target_cpu}.o")
+  set(output_name "${CMAKE_CURRENT_BINARY_DIR}/devicertl-${target_name}.o")
   add_custom_command(OUTPUT ${output_name}
     COMMAND ${CLANG_TOOL} --std=c++17 -c -nostdlib
             -Xclang -fembed-offload-object=${CMAKE_CURRENT_BINARY_DIR}/packaged_${bclib_name}
             -o ${output_name}
             ${source_directory}/Stub.cpp
     DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/packaged_${bclib_name} ${source_directory}/Stub.cpp
-    COMMENT "Embedding LLVM offloading binary in devicertl-${target_name}-${target_cpu}.o"
+    COMMENT "Embedding LLVM offloading binary in devicertl-${target_name}.o"
     VERBATIM
   )
   if(TARGET clang)
@@ -274,11 +239,11 @@ function(compileDeviceRTLLibrary target_cpu target_name target_triple)
   set_property(TARGET omptarget.devicertl.all_objs APPEND PROPERTY IMPORTED_OBJECTS ${output_name})
 
   if (CMAKE_EXPORT_COMPILE_COMMANDS)
-    set(ide_target_name omptarget-ide-${target_name}-${target_cpu})
+    set(ide_target_name omptarget-ide-${target_name})
     add_library(${ide_target_name} STATIC EXCLUDE_FROM_ALL ${src_files})
     target_compile_options(${ide_target_name} PRIVATE
-      -fopenmp --offload-arch=${target_cpu} -fopenmp-cuda-mode
-      -mllvm -openmp-opt-disable
+      -fopenmp-targets=${target_triple} -Xopenmp-target=${target_triple} -march=
+      -fopenmp -fopenmp-cuda-mode -mllvm -openmp-opt-disable
       -foffload-lto -fvisibility=hidden --offload-device-only
       -nocudalib -nogpulib -nogpuinc -nostdlibinc -Wno-unknown-cuda-version
     )
@@ -293,18 +258,11 @@ function(compileDeviceRTLLibrary target_cpu target_name target_triple)
   endif()
 endfunction()
 
-# Generate a Bitcode library for all the gpu architectures the user requested.
-add_custom_target(omptarget.devicertl.nvptx)
 add_custom_target(omptarget.devicertl.amdgpu)
-foreach(gpu_arch ${LIBOMPTARGET_DEVICE_ARCHITECTURES})
-  if("${gpu_arch}" IN_LIST all_amdgpu_architectures)
-    compileDeviceRTLLibrary(${gpu_arch} amdgpu amdgcn-amd-amdhsa -Xclang -mcode-object-version=none)
-  elseif("${gpu_arch}" IN_LIST all_nvptx_architectures)
-    compileDeviceRTLLibrary(${gpu_arch} nvptx nvptx64-nvidia-cuda --cuda-feature=+ptx63)
-  else()
-    message(FATAL_ERROR "Unknown GPU architecture '${gpu_arch}'")
-  endif()
-endforeach()
+compileDeviceRTLLibrary(amdgpu amdgcn-amd-amdhsa -Xclang -mcode-object-version=none)
+
+add_custom_target(omptarget.devicertl.nvptx)
+compileDeviceRTLLibrary(nvptx nvptx64-nvidia-cuda --cuda-feature=+ptx63)
 
 # Archive all the object files generated above into a static library
 add_library(omptarget.devicertl STATIC)
diff --git a/offload/DeviceRTL/src/Misc.cpp b/offload/DeviceRTL/src/Misc.cpp
index 00935cc..ba6fbf5 100644
--- a/offload/DeviceRTL/src/Misc.cpp
+++ b/offload/DeviceRTL/src/Misc.cpp
@@ -39,15 +39,7 @@ double getWTick() {
 }
 
 double getWTime() {
-  uint64_t NumTicks = 0;
-  if constexpr (__has_builtin(__builtin_amdgcn_s_sendmsg_rtnl))
-    NumTicks = __builtin_amdgcn_s_sendmsg_rtnl(0x83);
-  else if constexpr (__has_builtin(__builtin_amdgcn_s_memrealtime))
-    NumTicks = __builtin_amdgcn_s_memrealtime();
-  else if constexpr (__has_builtin(__builtin_amdgcn_s_memtime))
-    NumTicks = __builtin_amdgcn_s_memtime();
-
-  return static_cast<double>(NumTicks) * getWTick();
+  return static_cast<double>(__builtin_readsteadycounter()) * getWTick();
 }
 
 #pragma omp end declare variant
diff --git a/offload/DeviceRTL/src/Reduction.cpp b/offload/DeviceRTL/src/Reduction.cpp
index 57df159d..d3b4528 100644
--- a/offload/DeviceRTL/src/Reduction.cpp
+++ b/offload/DeviceRTL/src/Reduction.cpp
@@ -44,7 +44,6 @@ void gpu_irregular_warp_reduce(void *reduce_data, ShuffleReductFnTy shflFct,
   }
 }
 
-#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ < 700
 static uint32_t gpu_irregular_simd_reduce(void *reduce_data,
                                           ShuffleReductFnTy shflFct) {
   uint32_t size, remote_id, physical_lane_id;
@@ -63,7 +62,6 @@ static uint32_t gpu_irregular_simd_reduce(void *reduce_data,
   } while (logical_lane_id % 2 == 0 && size > 1);
   return (logical_lane_id == 0);
 }
-#endif
 
 static int32_t nvptx_parallel_reduce_nowait(void *reduce_data,
                                             ShuffleReductFnTy shflFct,
@@ -74,49 +72,53 @@ static int32_t nvptx_parallel_reduce_nowait(void *reduce_data,
   uint32_t NumThreads = omp_get_num_threads();
   if (NumThreads == 1)
     return 1;
-    /*
-     * This reduce function handles reduction within a team. It handles
-     * parallel regions in both L1 and L2 parallelism levels. It also
-     * supports Generic, SPMD, and NoOMP modes.
-     *
-     * 1. Reduce within a warp.
-     * 2. Warp master copies value to warp 0 via shared memory.
-     * 3. Warp 0 reduces to a single value.
-     * 4. The reduced value is available in the thread that returns 1.
-     */
-
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
-  uint32_t WarpsNeeded =
-      (NumThreads + mapping::getWarpSize() - 1) / mapping::getWarpSize();
-  uint32_t WarpId = mapping::getWarpIdInBlock();
-
-  // Volta execution model:
-  // For the Generic execution mode a parallel region either has 1 thread and
-  // beyond that, always a multiple of 32. For the SPMD execution mode we may
-  // have any number of threads.
-  if ((NumThreads % mapping::getWarpSize() == 0) || (WarpId < WarpsNeeded - 1))
-    gpu_regular_warp_reduce(reduce_data, shflFct);
-  else if (NumThreads > 1) // Only SPMD execution mode comes thru this case.
-    gpu_irregular_warp_reduce(reduce_data, shflFct,
-                              /*LaneCount=*/NumThreads % mapping::getWarpSize(),
-                              /*LaneId=*/mapping::getThreadIdInBlock() %
-                                  mapping::getWarpSize());
 
-  // When we have more than [mapping::getWarpSize()] number of threads
-  // a block reduction is performed here.
-  //
-  // Only L1 parallel region can enter this if condition.
-  if (NumThreads > mapping::getWarpSize()) {
-    // Gather all the reduced values from each warp
-    // to the first warp.
-    cpyFct(reduce_data, WarpsNeeded);
+    //
+    // This reduce function handles reduction within a team. It handles
+    // parallel regions in both L1 and L2 parallelism levels. It also
+    // supports Generic, SPMD, and NoOMP modes.
+    //
+    // 1. Reduce within a warp.
+    // 2. Warp master copies value to warp 0 via shared memory.
+    // 3. Warp 0 reduces to a single value.
+    // 4. The reduced value is available in the thread that returns 1.
+    //
 
-    if (WarpId == 0)
-      gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded,
-                                BlockThreadId);
+#if __has_builtin(__nvvm_reflect)
+  if (__nvvm_reflect("__CUDA_ARCH") >= 700) {
+    uint32_t WarpsNeeded =
+        (NumThreads + mapping::getWarpSize() - 1) / mapping::getWarpSize();
+    uint32_t WarpId = mapping::getWarpIdInBlock();
+
+    // Volta execution model:
+    // For the Generic execution mode a parallel region either has 1 thread and
+    // beyond that, always a multiple of 32. For the SPMD execution mode we may
+    // have any number of threads.
+    if ((NumThreads % mapping::getWarpSize() == 0) ||
+        (WarpId < WarpsNeeded - 1))
+      gpu_regular_warp_reduce(reduce_data, shflFct);
+    else if (NumThreads > 1) // Only SPMD execution mode comes thru this case.
+      gpu_irregular_warp_reduce(
+          reduce_data, shflFct,
+          /*LaneCount=*/NumThreads % mapping::getWarpSize(),
+          /*LaneId=*/mapping::getThreadIdInBlock() % mapping::getWarpSize());
+
+    // When we have more than [mapping::getWarpSize()] number of threads
+    // a block reduction is performed here.
+    //
+    // Only L1 parallel region can enter this if condition.
+    if (NumThreads > mapping::getWarpSize()) {
+      // Gather all the reduced values from each warp
+      // to the first warp.
+      cpyFct(reduce_data, WarpsNeeded);
+
+      if (WarpId == 0)
+        gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded,
+                                  BlockThreadId);
+    }
+    return BlockThreadId == 0;
   }
-  return BlockThreadId == 0;
-#else
+#endif
   __kmpc_impl_lanemask_t Liveness = mapping::activemask();
   if (Liveness == lanes::All) // Full warp
     gpu_regular_warp_reduce(reduce_data, shflFct);
@@ -150,10 +152,9 @@ static int32_t nvptx_parallel_reduce_nowait(void *reduce_data,
     return BlockThreadId == 0;
   }
 
-  // Get the OMP thread Id. This is different from BlockThreadId in the case of
-  // an L2 parallel region.
+  // Get the OMP thread Id. This is different from BlockThreadId in the case
+  // of an L2 parallel region.
   return BlockThreadId == 0;
-#endif // __CUDA_ARCH__ >= 700
 }
 
 uint32_t roundToWarpsize(uint32_t s) {
diff --git a/offload/test/offloading/fortran/target-map-local-intrinisc-sized-param.f90 b/offload/test/offloading/fortran/target-map-local-intrinisc-sized-param.f90
new file mode 100644
index 0000000..b4fded7
--- /dev/null
+++ b/offload/test/offloading/fortran/target-map-local-intrinisc-sized-param.f90
@@ -0,0 +1,39 @@
+! Offloading test checking interaction of an local array
+! sized utilising an input parameter and the size intrinsic
+! when being mapped to device.
+! REQUIRES: flang, amdgpu
+
+! RUN: %libomptarget-compile-fortran-run-and-check-generic
+module mod
+    use iso_fortran_env, only: real64
+    implicit none
+contains
+    subroutine test(a)
+        implicit none
+        integer :: i
+        real(kind=real64), dimension(:) :: a
+        real(kind=real64), dimension(size(a, 1)) :: b
+
+!$omp target map(tofrom: b)
+        do i = 1, 10
+            b(i) = i
+        end do
+!$omp end target
+
+        print *, b
+    end subroutine
+end module mod
+
+program main
+    use mod
+    real(kind=real64), allocatable :: a(:)
+    allocate(a(10))
+
+    do i = 1, 10
+        a(i) = i
+    end do
+
+    call test(a)
+end program main
+
+!CHECK: 1. 2. 3. 4. 5. 6. 7. 8. 9. 10.
diff --git a/offload/test/offloading/fortran/target-with-threadprivate.f90 b/offload/test/offloading/fortran/target-with-threadprivate.f90
new file mode 100644
index 0000000..10c7cec
--- /dev/null
+++ b/offload/test/offloading/fortran/target-with-threadprivate.f90
@@ -0,0 +1,37 @@
+! Basic offloading test that makes sure we can use the predominantly host
+! pragma threadprivate in the same program as target code
+! REQUIRES: flang, amdgpu
+
+! RUN: %libomptarget-compile-fortran-run-and-check-generic
+program main
+    implicit none
+
+    type dtype
+       integer :: val(10)
+    end type dtype
+
+    integer :: i
+    type(dtype), pointer :: pointer1
+    type(dtype), pointer :: pointer2=>null()
+    integer, dimension(:), pointer :: data_pointer
+
+!$omp threadprivate(pointer2)
+
+nullify(pointer1)
+allocate(pointer1)
+
+pointer2=>pointer1
+pointer2%val(:)=1
+data_pointer=>pointer2%val
+
+!$omp target
+   do i = 1, 10
+     data_pointer(i) = i
+   end do
+!$omp end target
+
+print *, data_pointer
+
+end program main
+
+! CHECK: 1 2 3 4 5 6 7 8 9 10
diff --git a/openmp/docs/ReleaseNotes.rst b/openmp/docs/ReleaseNotes.rst
index d4a4b1a..0089f1a 100644
--- a/openmp/docs/ReleaseNotes.rst
+++ b/openmp/docs/ReleaseNotes.rst
@@ -19,3 +19,9 @@ from the `LLVM releases web site <https://llvm.org/releases/>`_.
 
 Non-comprehensive list of changes in this release
 =================================================
+
+Device Runtime
+--------------
+- Changed the OpenMP DeviceRTL to use 'generic' IR. The
+  ``LIBOMPTARGET_DEVICE_ARCHITECTURES`` CMake argument is now unused and will
+  always build support for AMDGPU and NVPTX targets.
diff --git a/polly/CMakeLists.txt b/polly/CMakeLists.txt
index b4cfc77..955c171 100644
--- a/polly/CMakeLists.txt
+++ b/polly/CMakeLists.txt
@@ -29,11 +29,7 @@ if(POLLY_STANDALONE_BUILD)
 
   # Enable unit tests if available.
   set(POLLY_GTEST_AVAIL 0)
-  set(UNITTEST_DIR ${LLVM_THIRD_PARTY_DIR}/unittest)
-  if(EXISTS ${UNITTEST_DIR}/googletest/include/gtest/gtest.h)
-    if (NOT TARGET gtest)
-      add_subdirectory(${UNITTEST_DIR} third-party/unittest)
-    endif()
+  if(TARGET llvm_gtest)
     set(POLLY_GTEST_AVAIL 1)
   endif()
 
diff --git a/polly/docs/UsingPollyWithClang.rst b/polly/docs/UsingPollyWithClang.rst
index 08fdcbc..2aa35a0 100644
--- a/polly/docs/UsingPollyWithClang.rst
+++ b/polly/docs/UsingPollyWithClang.rst
@@ -101,7 +101,7 @@ polly, after SSA transformation, loop canonicalization, inlining and
 other passes.
 
 Thereafter, any Polly pass can be run over 'before-polly.ll' using the
-'opt' tool.  To found out which Polly passes are active in the standard
+'opt' tool.  To find out which Polly passes are active in the standard
 pipeline, see the output of
 
 .. code-block:: console
diff --git a/polly/include/polly/CodeGen/BlockGenerators.h b/polly/include/polly/CodeGen/BlockGenerators.h
index 4e26454..401e80e 100644
--- a/polly/include/polly/CodeGen/BlockGenerators.h
+++ b/polly/include/polly/CodeGen/BlockGenerators.h
@@ -632,7 +632,7 @@ public:
 };
 
 /// Generator for new versions of polyhedral region statements.
-class RegionGenerator final : BlockGenerator {
+class RegionGenerator final : public BlockGenerator {
 public:
   /// Create a generator for regions.
   ///
diff --git a/polly/lib/CodeGen/BlockGenerators.cpp b/polly/lib/CodeGen/BlockGenerators.cpp
index b76d8f4..6d723d6 100644
--- a/polly/lib/CodeGen/BlockGenerators.cpp
+++ b/polly/lib/CodeGen/BlockGenerators.cpp
@@ -1000,7 +1000,7 @@ BasicBlock *RegionGenerator::repairDominance(BasicBlock *BB,
   BasicBlock *BBCopyIDom = EndBlockMap.lookup(BBIDom);
 
   if (BBCopyIDom)
-    DT.changeImmediateDominator(BBCopy, BBCopyIDom);
+    GenDT->changeImmediateDominator(BBCopy, BBCopyIDom);
 
   return StartBlockMap.lookup(BBIDom);
 }
@@ -1069,8 +1069,8 @@ void RegionGenerator::copyStmt(ScopStmt &Stmt, LoopToScevMapT &LTS,
   // Create a dedicated entry for the region where we can reload all demoted
   // inputs.
   BasicBlock *EntryBB = R->getEntry();
-  BasicBlock *EntryBBCopy = SplitBlock(Builder.GetInsertBlock(),
-                                       &*Builder.GetInsertPoint(), &DT, &LI);
+  BasicBlock *EntryBBCopy = SplitBlock(
+      Builder.GetInsertBlock(), &*Builder.GetInsertPoint(), GenDT, GenLI);
   EntryBBCopy->setName("polly.stmt." + EntryBB->getName() + ".entry");
   Builder.SetInsertPoint(&EntryBBCopy->front());
 
@@ -1136,7 +1136,7 @@ void RegionGenerator::copyStmt(ScopStmt &Stmt, LoopToScevMapT &LTS,
 
   // Now create a new dedicated region exit block and add it to the region map.
   BasicBlock *ExitBBCopy = SplitBlock(Builder.GetInsertBlock(),
-                                      &*Builder.GetInsertPoint(), &DT, &LI);
+                                      &*Builder.GetInsertPoint(), GenDT, GenLI);
   ExitBBCopy->setName("polly.stmt." + R->getExit()->getName() + ".exit");
   StartBlockMap[R->getExit()] = ExitBBCopy;
   EndBlockMap[R->getExit()] = ExitBBCopy;
@@ -1145,7 +1145,7 @@ void RegionGenerator::copyStmt(ScopStmt &Stmt, LoopToScevMapT &LTS,
   assert(ExitDomBBCopy &&
          "Common exit dominator must be within region; at least the entry node "
          "must match");
-  DT.changeImmediateDominator(ExitBBCopy, ExitDomBBCopy);
+  GenDT->changeImmediateDominator(ExitBBCopy, ExitDomBBCopy);
 
   // As the block generator doesn't handle control flow we need to add the
   // region control flow by hand after all blocks have been copied.
diff --git a/polly/lib/CodeGen/IslNodeBuilder.cpp b/polly/lib/CodeGen/IslNodeBuilder.cpp
index d76f625..739bd63 100644
--- a/polly/lib/CodeGen/IslNodeBuilder.cpp
+++ b/polly/lib/CodeGen/IslNodeBuilder.cpp
@@ -612,6 +612,7 @@ void IslNodeBuilder::createForParallel(__isl_take isl_ast_node *For) {
   GenLI = SubLI;
   GenSE = SubSE.get();
   BlockGen.switchGeneratedFunc(SubFn, GenDT, GenLI, GenSE);
+  RegionGen.switchGeneratedFunc(SubFn, GenDT, GenLI, GenSE);
   ExprBuilder.switchGeneratedFunc(SubFn, GenDT, GenLI, GenSE);
   Builder.SetInsertPoint(&*LoopBody);
 
@@ -681,6 +682,7 @@ void IslNodeBuilder::createForParallel(__isl_take isl_ast_node *For) {
   IDToValue = std::move(IDToValueCopy);
   ValueMap = std::move(CallerGlobals);
   ExprBuilder.switchGeneratedFunc(CallerFn, CallerDT, CallerLI, CallerSE);
+  RegionGen.switchGeneratedFunc(CallerFn, CallerDT, CallerLI, CallerSE);
   BlockGen.switchGeneratedFunc(CallerFn, CallerDT, CallerLI, CallerSE);
   Builder.SetInsertPoint(&*AfterLoop);
 
diff --git a/polly/lib/Support/ScopHelper.cpp b/polly/lib/Support/ScopHelper.cpp
index 6d50e297..d0e305a 100644
--- a/polly/lib/Support/ScopHelper.cpp
+++ b/polly/lib/Support/ScopHelper.cpp
@@ -604,7 +604,8 @@ bool polly::isHoistableLoad(LoadInst *LInst, Region &R, LoopInfo &LI,
 
   for (auto *User : Ptr->users()) {
     auto *UserI = dyn_cast<Instruction>(User);
-    if (!UserI || !R.contains(UserI))
+    if (!UserI || UserI->getFunction() != LInst->getFunction() ||
+        !R.contains(UserI))
       continue;
     if (!UserI->mayWriteToMemory())
       continue;
diff --git a/polly/test/CodeGen/reggen_domtree_crash.ll b/polly/test/CodeGen/reggen_domtree_crash.ll
new file mode 100644
index 0000000..58c2709
--- /dev/null
+++ b/polly/test/CodeGen/reggen_domtree_crash.ll
@@ -0,0 +1,41 @@
+; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-parallel -S < %s | FileCheck %s
+
+; CHECK: define ptr @ham(ptr %arg, i64 %arg1, i1 %arg2)
+
+; This test is added to verify if the following IR does not crash on using different Dominator Tree when using polly parallel flag.
+
+; ModuleID = '<stdin>'
+source_filename = "<stdin>"
+
+define ptr @ham(ptr %arg, i64 %arg1, i1 %arg2) {
+bb:
+  br label %bb3
+
+bb3:                                              ; preds = %bb8, %bb
+  %phi = phi i64 [ 0, %bb ], [ %add9, %bb8 ]
+  %getelementptr = getelementptr [64 x i16], ptr %arg, i64 %phi
+  br label %bb4
+
+bb4:                                              ; preds = %bb7, %bb3
+  %phi5 = phi i64 [ %add, %bb7 ], [ 0, %bb3 ]
+  %load = load i16, ptr null, align 2
+  br i1 %arg2, label %bb7, label %bb6
+
+bb6:                                              ; preds = %bb4
+  store i16 0, ptr %getelementptr, align 2
+  br label %bb7
+
+bb7:                                              ; preds = %bb6, %bb4
+  %add = add i64 %phi5, 1
+  %icmp = icmp ne i64 %phi5, 64
+  br i1 %icmp, label %bb4, label %bb8
+
+bb8:                                              ; preds = %bb7
+  %add9 = add i64 %phi, 1
+  %icmp10 = icmp ult i64 %phi, %arg1
+  br i1 %icmp10, label %bb3, label %bb11
+
+bb11:                                             ; preds = %bb8
+  ret ptr null
+}
+
diff --git a/polly/test/ScopDetect/dom-tree-crash.ll b/polly/test/ScopDetect/dom-tree-crash.ll
new file mode 100644
index 0000000..efc732c
--- /dev/null
+++ b/polly/test/ScopDetect/dom-tree-crash.ll
@@ -0,0 +1,31 @@
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s
+
+; CHECK: Detected Scops in Function foo
+
+; This unit test case is to check if the following IR does not crash in isHoistableLoad function during Scop Detection.
+
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32"
+target triple = "aarch64-unknown-linux-gnueabi"
+
+define void @foo(ptr %block) {
+entry:
+  br label %for.body
+
+for.cond1.preheader:                              ; preds = %for.body
+  %0 = load ptr, ptr null, align 8
+  %1 = load i16, ptr %block, align 2
+  %2 = load i16, ptr %0, align 2
+  br label %foo.exit
+
+for.body:                                         ; preds = %for.body, %entry
+  br i1 false, label %for.cond1.preheader, label %for.body
+
+foo.exit:                                     ; preds = %for.cond1.preheader
+  ret void
+}
+
+define void @init_foo() {
+entry:
+  store ptr null, ptr null, align 8
+  ret void
+}
diff --git a/utils/bazel/llvm-project-overlay/clang-tools-extra/clang-tidy/BUILD.bazel b/utils/bazel/llvm-project-overlay/clang-tools-extra/clang-tidy/BUILD.bazel
index db53df0..fa77152 100644
--- a/utils/bazel/llvm-project-overlay/clang-tools-extra/clang-tidy/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/clang-tools-extra/clang-tidy/BUILD.bazel
@@ -265,6 +265,7 @@ clang_tidy_library(
 clang_tidy_library(
     name = "cppcoreguidelines",
     deps = [
+        ":bugprone",
         ":lib",
         ":misc",
         ":modernize",
@@ -278,7 +279,6 @@ clang_tidy_library(
 clang_tidy_library(
     name = "bugprone",
     deps = [
-        ":cppcoreguidelines",
         ":lib",
         ":utils",
         "//clang:analysis",
@@ -368,6 +368,7 @@ cc_library(
         ":utils",
         "//clang:tooling",
         "//llvm:Support",
+        "//llvm:TargetParser",
     ],
 )
 
diff --git a/utils/bazel/llvm-project-overlay/clang/BUILD.bazel b/utils/bazel/llvm-project-overlay/clang/BUILD.bazel
index e2babad..8624028 100644
--- a/utils/bazel/llvm-project-overlay/clang/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/clang/BUILD.bazel
@@ -342,6 +342,21 @@ gentbl(
 )
 
 gentbl(
+    name = "basic_builtins_spirv_gen",
+    tbl_outs = [(
+        "-gen-clang-builtins",
+        "include/clang/Basic/BuiltinsSPIRV.inc",
+    )],
+    tblgen = ":clang-tblgen",
+    td_file = "include/clang/Basic/BuiltinsSPIRV.td",
+    td_srcs = [
+        "include/clang/Basic/Builtins.td",
+        "include/clang/Basic/BuiltinsBase.td",
+        "include/clang/Basic/BuiltinsSPIRV.td",
+    ],
+)
+
+gentbl(
     name = "basic_builtins_riscv_gen",
     tbl_outs = [(
         "-gen-clang-builtins",
@@ -366,6 +381,22 @@ gentbl(
     td_file = "include/clang/Basic/BuiltinsX86.td",
     td_srcs = [
         "include/clang/Basic/BuiltinsX86.td",
+        "include/clang/Basic/BuiltinsX86Base.td",
+        "include/clang/Basic/BuiltinsBase.td",
+    ],
+)
+
+gentbl(
+    name = "basic_builtins_x86_64_gen",
+    tbl_outs = [(
+        "-gen-clang-builtins",
+        "include/clang/Basic/BuiltinsX86_64.inc",
+    )],
+    tblgen = ":clang-tblgen",
+    td_file = "include/clang/Basic/BuiltinsX86_64.td",
+    td_srcs = [
+        "include/clang/Basic/BuiltinsX86_64.td",
+        "include/clang/Basic/BuiltinsX86Base.td",
         "include/clang/Basic/BuiltinsBase.td",
     ],
 )
@@ -707,7 +738,9 @@ cc_library(
         ":basic_builtins_bpf_gen",
         ":basic_builtins_gen",
         ":basic_builtins_riscv_gen",
+        ":basic_builtins_spirv_gen",
         ":basic_builtins_x86_gen",
+        ":basic_builtins_x86_64_gen",
         ":basic_internal_headers",
         ":basic_riscv_sifive_vector_builtins_gen",
         ":basic_riscv_vector_builtin_cg_gen",
diff --git a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
index 91c7db9..ac3f503 100644
--- a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
@@ -670,6 +670,8 @@ libc_support_library(
     deps = [
         ":__support_cpp_array",
         ":__support_cpp_iterator",
+        ":__support_libc_assert",
+        ":string_memory_utils",
     ],
 )
 
@@ -1395,10 +1397,6 @@ libc_support_library(
     hdrs = [
         "src/__support/threads/linux/raw_mutex.h",
     ],
-    defines = [
-        "LIBC_COPT_TIMEOUT_ENSURE_MONOTONICITY",
-        "LIBC_COPT_RAW_MUTEX_DEFAULT_SPIN_COUNT",
-    ],
     target_compatible_with = select({
         "@platforms//os:linux": [],
         "//conditions:default": ["@platforms//:incompatible"],
@@ -1797,7 +1795,6 @@ libc_support_library(
     hdrs = ["src/math/generic/sincosf16_utils.h"],
     deps = [
         ":__support_common",
-        ":__support_fputil_fp_bits",
         ":__support_fputil_nearest_integer",
         ":__support_fputil_polyeval",
     ],
@@ -1846,11 +1843,8 @@ libc_support_library(
     name = "exp10f_impl",
     hdrs = ["src/math/generic/exp10f_impl.h"],
     deps = [
-        ":__support_fputil_basic_operations",
         ":__support_fputil_fma",
         ":__support_fputil_multiply_add",
-        ":__support_fputil_nearest_integer",
-        ":__support_fputil_polyeval",
         ":__support_fputil_rounding_mode",
         ":__support_macros_optimization",
         ":common_constants",
@@ -3485,11 +3479,13 @@ libc_support_library(
     hdrs = [
         "src/stdlib/heap_sort.h",
         "src/stdlib/qsort_data.h",
+        "src/stdlib/qsort_pivot.h",
         "src/stdlib/qsort_util.h",
         "src/stdlib/quick_sort.h",
     ],
     deps = [
         ":__support_common",
+        ":__support_cpp_bit",
         ":__support_cpp_cstddef",
         ":__support_macros_attributes",
     ],
@@ -3596,13 +3592,6 @@ libc_function(
 
 ############################### string targets ###############################
 
-no_sanitize_features = [
-    "-asan",
-    "-msan",
-    "-tsan",
-    "-ubsan",
-]
-
 libc_support_library(
     name = "string_memory_utils",
     hdrs = [
@@ -3680,7 +3669,6 @@ libc_function(
     name = "memcpy",
     srcs = ["src/string/memcpy.cpp"],
     hdrs = ["src/string/memcpy.h"],
-    features = no_sanitize_features,
     weak = True,
     deps = [
         ":__support_common",
@@ -3692,7 +3680,6 @@ libc_function(
     name = "memset",
     srcs = ["src/string/memset.cpp"],
     hdrs = ["src/string/memset.h"],
-    features = no_sanitize_features,
     weak = True,
     deps = [
         ":__support_common",
@@ -3704,7 +3691,6 @@ libc_function(
     name = "memmove",
     srcs = ["src/string/memmove.cpp"],
     hdrs = ["src/string/memmove.h"],
-    features = no_sanitize_features,
     weak = True,
     deps = [
         ":__support_common",
@@ -3716,7 +3702,6 @@ libc_function(
     name = "mempcpy",
     srcs = ["src/string/mempcpy.cpp"],
     hdrs = ["src/string/mempcpy.h"],
-    features = no_sanitize_features,
     weak = True,
     deps = [
         ":__support_common",
@@ -3728,7 +3713,6 @@ libc_function(
     name = "bcopy",
     srcs = ["src/strings/bcopy.cpp"],
     hdrs = ["src/strings/bcopy.h"],
-    features = no_sanitize_features,
     deps = [
         ":__support_common",
         ":string_memory_utils",
@@ -3739,7 +3723,6 @@ libc_function(
     name = "memcmp",
     srcs = ["src/string/memcmp.cpp"],
     hdrs = ["src/string/memcmp.h"],
-    features = no_sanitize_features,
     weak = True,
     deps = [
         ":__support_common",
@@ -3752,7 +3735,6 @@ libc_function(
     name = "bcmp",
     srcs = ["src/strings/bcmp.cpp"],
     hdrs = ["src/strings/bcmp.h"],
-    features = no_sanitize_features,
     weak = True,
     deps = [
         ":__support_common",
@@ -3764,7 +3746,6 @@ libc_function(
     name = "bzero",
     srcs = ["src/strings/bzero.cpp"],
     hdrs = ["src/strings/bzero.h"],
-    features = no_sanitize_features,
     weak = True,
     deps = [
         ":__support_common",
@@ -3786,7 +3767,6 @@ libc_function(
     name = "strlen",
     srcs = ["src/string/strlen.cpp"],
     hdrs = ["src/string/strlen.h"],
-    features = no_sanitize_features,
     deps = [
         ":__support_common",
         ":string_utils",
@@ -3797,21 +3777,6 @@ libc_function(
     name = "strcpy",
     srcs = ["src/string/strcpy.cpp"],
     hdrs = ["src/string/strcpy.h"],
-    features = no_sanitize_features,
-    deps = [
-        ":__support_common",
-        ":memcpy",
-        ":string_memory_utils",
-        ":string_utils",
-    ],
-)
-
-# A sanitizer instrumented flavor of strcpy to be used with unittests.
-libc_function(
-    name = "strcpy_sanitized",
-    testonly = 1,
-    srcs = ["src/string/strcpy.cpp"],
-    hdrs = ["src/string/strcpy.h"],
     deps = [
         ":__support_common",
         ":memcpy",
@@ -4467,26 +4432,6 @@ libc_support_library(
     ],
 )
 
-# Only used for testing.
-libc_support_library(
-    name = "printf_mock_parser",
-    hdrs = ["src/stdio/printf_core/parser.h"],
-    local_defines = ["LIBC_COPT_MOCK_ARG_LIST"],
-    deps = [
-        ":__support_arg_list",
-        ":__support_common",
-        ":__support_cpp_bit",
-        ":__support_cpp_optional",
-        ":__support_cpp_string_view",
-        ":__support_cpp_type_traits",
-        ":__support_ctype_utils",
-        ":__support_fputil_fp_bits",
-        ":__support_str_to_integer",
-        ":printf_config",
-        ":printf_core_structs",
-    ],
-)
-
 libc_support_library(
     name = "printf_writer",
     srcs = ["src/stdio/printf_core/writer.cpp"],
diff --git a/utils/bazel/llvm-project-overlay/libc/libc_build_rules.bzl b/utils/bazel/llvm-project-overlay/libc/libc_build_rules.bzl
index 82e65a7..9c9fd50 100644
--- a/utils/bazel/llvm-project-overlay/libc/libc_build_rules.bzl
+++ b/utils/bazel/llvm-project-overlay/libc/libc_build_rules.bzl
@@ -22,7 +22,29 @@ def libc_common_copts():
         "-DLIBC_NAMESPACE=" + LIBC_NAMESPACE,
     ]
 
-def _libc_library(name, hidden, copts = [], deps = [], local_defines = [], **kwargs):
+def libc_release_copts():
+    copts = [
+        "-DLIBC_COPT_PUBLIC_PACKAGING",
+        # This is used to explicitly give public symbols "default" visibility.
+        # See src/__support/common.h for more information.
+        "-DLLVM_LIBC_FUNCTION_ATTR='[[gnu::visibility(\"default\")]]'",
+        # All other libc sources need to be compiled with "hidden" visibility.
+        "-fvisibility=hidden",
+        "-O3",
+        "-fno-builtin",
+        "-fno-lax-vector-conversions",
+        "-ftrivial-auto-var-init=pattern",
+        "-fno-omit-frame-pointer",
+        "-fstack-protector-strong",
+    ]
+
+    platform_copts = selects.with_or({
+        PLATFORM_CPU_X86_64: ["-mno-omit-leaf-frame-pointer"],
+        "//conditions:default": [],
+    })
+    return copts + platform_copts
+
+def _libc_library(name, copts = [], deps = [], local_defines = [], **kwargs):
     """Internal macro to serve as a base for all other libc library rules.
 
     Args:
@@ -30,15 +52,9 @@ def _libc_library(name, hidden, copts = [], deps = [], local_defines = [], **kwa
       copts: The special compiler options for the target.
       deps: The list of target dependencies if any.
       local_defines: The list of target local_defines if any.
-      hidden: Whether the symbols should be explicitly hidden or not.
       **kwargs: All other attributes relevant for the cc_library rule.
     """
 
-    # We want all libc sources to be compiled with "hidden" visibility.
-    # The public symbols will be given "default" visibility explicitly.
-    # See src/__support/common.h for more information.
-    if hidden:
-        copts = copts + ["-fvisibility=hidden"]
     native.cc_library(
         name = name,
         copts = copts + libc_common_copts(),
@@ -52,13 +68,13 @@ def _libc_library(name, hidden, copts = [], deps = [], local_defines = [], **kwa
 # Any library which does not define a public function should be listed with
 # libc_support_library.
 def libc_support_library(name, **kwargs):
-    _libc_library(name = name, hidden = False, **kwargs)
+    _libc_library(name = name, **kwargs)
 
 def libc_function(
         name,
         srcs,
         weak = False,
-        copts = None,
+        copts = [],
         local_defines = [],
         **kwargs):
     """Add target for a libc function.
@@ -81,25 +97,6 @@ def libc_function(
       **kwargs: Other attributes relevant for a cc_library. For example, deps.
     """
 
-    # We use the explicit equals pattern here because append and += mutate the
-    # original list, where this creates a new list and stores it in deps.
-    copts = copts or []
-    copts = copts + [
-        "-O3",
-        "-fno-builtin",
-        "-fno-lax-vector-conversions",
-        "-ftrivial-auto-var-init=pattern",
-        "-fno-omit-frame-pointer",
-        "-fstack-protector-strong",
-    ]
-
-    # x86 targets have -mno-omit-leaf-frame-pointer.
-    platform_copts = selects.with_or({
-        PLATFORM_CPU_X86_64: ["-mno-omit-leaf-frame-pointer"],
-        "//conditions:default": [],
-    })
-    copts = copts + platform_copts
-
     # We compile the code twice, the first target is suffixed with ".__internal__" and contains the
     # C++ functions in the "LIBC_NAMESPACE" namespace. This allows us to test the function in the
     # presence of another libc.
@@ -111,22 +108,16 @@ def libc_function(
         **kwargs
     )
 
-    # This second target is the llvm libc C function with either a default or hidden visibility.
-    # All other functions are hidden.
+    # This second target is the llvm libc C function with default visibility.
     func_attrs = [
         "LLVM_LIBC_FUNCTION_ATTR_" + name + "='LLVM_LIBC_EMPTY, [[gnu::weak]]'",
     ] if weak else []
 
-    local_defines = (local_defines +
-                     ["LIBC_COPT_PUBLIC_PACKAGING"] +
-                     ["LLVM_LIBC_FUNCTION_ATTR='[[gnu::visibility(\"default\")]]'"] +
-                     func_attrs)
     _libc_library(
         name = name,
-        hidden = True,
         srcs = srcs,
-        copts = copts,
-        local_defines = local_defines,
+        copts = copts + libc_release_copts(),
+        local_defines = local_defines + func_attrs,
         **kwargs
     )
 
diff --git a/utils/bazel/llvm-project-overlay/libc/test/src/stdlib/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/test/src/stdlib/BUILD.bazel
index e4b4b07..a8b37c5 100644
--- a/utils/bazel/llvm-project-overlay/libc/test/src/stdlib/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/libc/test/src/stdlib/BUILD.bazel
@@ -115,36 +115,30 @@ libc_support_library(
     hdrs = ["SortingTest.h"],
     deps = [
         "//libc:__support_macros_config",
-        "//libc:qsort_util",
+        "//libc:qsort",
         "//libc/test/UnitTest:LibcUnitTest",
     ],
 )
 
 libc_test(
-    name = "qsort_test",
-    srcs = ["qsort_test.cpp"],
-    libc_function_deps = ["//libc:qsort"],
-    deps = [
-        ":qsort_test_helper",
-        "//libc:types_size_t",
-    ],
-)
-
-libc_test(
     name = "quick_sort_test",
     srcs = ["quick_sort_test.cpp"],
+    libc_function_deps = ["//libc:qsort"],
     deps = [
         ":qsort_test_helper",
         "//libc:qsort_util",
+        "//libc:types_size_t",
     ],
 )
 
 libc_test(
     name = "heap_sort_test",
     srcs = ["heap_sort_test.cpp"],
+    libc_function_deps = ["//libc:qsort"],
     deps = [
         ":qsort_test_helper",
         "//libc:qsort_util",
+        "//libc:types_size_t",
     ],
 )
 
diff --git a/utils/bazel/llvm-project-overlay/libc/test/src/string/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/test/src/string/BUILD.bazel
index a31c67c..a36cf2e 100644
--- a/utils/bazel/llvm-project-overlay/libc/test/src/string/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/libc/test/src/string/BUILD.bazel
@@ -23,7 +23,7 @@ libc_test(
     name = "strcpy_test",
     srcs = ["strcpy_test.cpp"],
     libc_function_deps = [
-        "//libc:strcpy_sanitized",
+        "//libc:strcpy",
     ],
 )
 
diff --git a/utils/bazel/llvm-project-overlay/lldb/BUILD.bazel b/utils/bazel/llvm-project-overlay/lldb/BUILD.bazel
index db6577c..181fe3b 100644
--- a/utils/bazel/llvm-project-overlay/lldb/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/lldb/BUILD.bazel
@@ -1071,6 +1071,7 @@ cc_binary(
         "//lldb:APIHeaders",
         "//lldb:Headers",
         "//lldb:Host",
+        "//lldb:Utility",
         "//lldb:liblldb.wrapper",
         "//llvm:Option",
         "//llvm:Support",
diff --git a/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel b/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel
index 36e266d..bfcb53e 100644
--- a/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel
@@ -645,18 +645,20 @@ cc_binary(
 cc_binary(
     name = "llvm-min-tblgen",
     srcs = [
-        "utils/TableGen/ARMTargetDefEmitter.cpp",
-        "utils/TableGen/Attributes.cpp",
+        "utils/TableGen/Basic/ARMTargetDefEmitter.cpp",
+        "utils/TableGen/Basic/Attributes.cpp",
         "utils/TableGen/Basic/CodeGenIntrinsics.cpp",
         "utils/TableGen/Basic/CodeGenIntrinsics.h",
         "utils/TableGen/Basic/SDNodeProperties.cpp",
         "utils/TableGen/Basic/SDNodeProperties.h",
+        "utils/TableGen/Basic/TableGen.h",
+        "utils/TableGen/Basic/TableGen.cpp",
         "utils/TableGen/Basic/SequenceToOffsetTable.h",
-        "utils/TableGen/DirectiveEmitter.cpp",
-        "utils/TableGen/IntrinsicEmitter.cpp",
-        "utils/TableGen/RISCVTargetDefEmitter.cpp",
-        "utils/TableGen/TableGen.cpp",
-        "utils/TableGen/VTEmitter.cpp",
+        "utils/TableGen/Basic/DirectiveEmitter.cpp",
+        "utils/TableGen/Basic/IntrinsicEmitter.cpp",
+        "utils/TableGen/Basic/RISCVTargetDefEmitter.cpp",
+        "utils/TableGen/Basic/VTEmitter.cpp",
+        "utils/TableGen/llvm-min-tblgen.cpp",
     ],
     copts = llvm_copts,
     stamp = 0,
@@ -715,7 +717,10 @@ cc_binary(
             # regular dependency.
             "include/llvm/MC/*.h",
         ],
-        exclude = ["utils/TableGen/Common/GlobalISel/CodeExpander.cpp"],
+        exclude = [
+            "utils/TableGen/Common/GlobalISel/CodeExpander.cpp",
+            "utils/TableGen/llvm-min-tblgen.cpp",
+        ],
     ) + [
         "include/llvm/TargetParser/SubtargetFeature.h",
     ],
@@ -1317,6 +1322,7 @@ cc_library(
     includes = ["include"],
     textual_hdrs = [
         "include/llvm/TargetParser/AArch64CPUFeatures.inc",
+        "include/llvm/TargetParser/AArch64FeatPriorities.inc",
         "include/llvm/TargetParser/AArch64TargetParserDef.inc",
         "include/llvm/TargetParser/ARMTargetParserDef.inc",
         "include/llvm/TargetParser/RISCVTargetParserDef.inc",
@@ -2265,6 +2271,7 @@ llvm_target_lib_list = [lib for lib in [
             ("-gen-register-info", "lib/Target/RISCV/RISCVGenRegisterInfo.inc"),
             ("-gen-subtarget", "lib/Target/RISCV/RISCVGenSubtargetInfo.inc"),
             ("-gen-searchable-tables", "lib/Target/RISCV/RISCVGenSearchableTables.inc"),
+            ("-gen-exegesis", "lib/Target/RISCV/RISCVGenExegesis.inc"),
         ],
         "tbl_deps": [
             ":riscv_isel_target_gen",
diff --git a/utils/bazel/llvm-project-overlay/llvm/unittests/BUILD.bazel b/utils/bazel/llvm-project-overlay/llvm/unittests/BUILD.bazel
index 8a6950f..d576a91 100644
--- a/utils/bazel/llvm-project-overlay/llvm/unittests/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/llvm/unittests/BUILD.bazel
@@ -697,7 +697,7 @@ cc_test(
     ],
     linkstatic = 1,
     tags = [
-        "local",  # Not compatible with the sandbox on MacOS
+        "no-sandbox",  # FileSystemTest.permissions not compatible with the sandbox on MacOS
     ],
     deps = [
         "//llvm:AllTargetsCodeGens",
diff --git a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
index f1192d0..e823af2 100644
--- a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
@@ -8391,6 +8391,7 @@ cc_library(
         ":ArithDialect",
         ":ConversionPassIncGen",
         ":EmitCDialect",
+        ":EmitCTransforms",
         ":IR",
         ":SCFDialect",
         ":TransformUtils",
author	NAKAMURA Takumi <geek4civic@gmail.com>	2025-01-09 17:50:40 +0900
committer	NAKAMURA Takumi <geek4civic@gmail.com>	2025-01-09 17:50:40 +0900
commit	fea7da1b00cc97d742faede2df96c7d327950f49 (patch)
tree	4de1d6b4ddc69f4f32daabb11ad5c71ab0cf895e
parent	9b99dde0d47102625d93c5d1cbbc04951025a6c9 (diff)
parent	0aa930a41f2d1ebf1fa90ec42da8f96d15a4dcbb (diff)
download	llvm-users/chapuni/cov/single/nextcount.zip llvm-users/chapuni/cov/single/nextcount.tar.gz llvm-users/chapuni/cov/single/nextcount.tar.bz2